本质
python第三方库 selenium 控制 浏览器驱动
浏览器驱动控制浏览器
- 推荐 edge 浏览器驱动(不容易遇到版本或者兼容性的问题)
- 驱动下载网址:https://developer.microsoft.com/zh-cn/microsoft-edge/tools/webdriver/ (下载与本机 Edge 版本一致的驱动)
1、实战1
(1)安装 selenium 库
python
pip install selenium
(2)将驱动文件exe放在py文件同级目录下

(3)初步体验驱动器控制浏览器
python
# Demo 1: open Baidu in Edge, search for a keyword, then shut the browser down.
from selenium import webdriver
# By names the locator strategy passed to find_element().
from selenium.webdriver.common.by import By
# Keys holds special key codes usable with send_keys().
from selenium.webdriver.common.keys import Keys
# Selenium 4 receives the driver executable through a Service object;
# passing the path positionally to webdriver.Edge() is no longer accepted.
from selenium.webdriver.edge.service import Service
import time

# msedgedriver.exe is expected next to this script.
driver = webdriver.Edge(service=Service("./msedgedriver.exe"))
try:
    # get() returns once the page load has finished.
    driver.get("http://www.baidu.com/")
    # id="kw" is the Baidu search input; type the query into it.
    driver.find_element(By.CSS_SELECTOR, "#kw").send_keys("长城")
    # id="su" is the Baidu search button; click() simulates a mouse click.
    driver.find_element(By.CSS_SELECTOR, "#su").click()
    # Pause for 5 seconds so the results can be inspected by eye.
    time.sleep(5)
finally:
    # Always end the session, even if a locator above failed, so no
    # browser/driver process is left behind.
    driver.quit()
(4)元素定位
1.获取单个元素------元素不存在会报错
python
# Locate a single element by its id attribute.
element_by_id = driver.find_element(By.ID, "inputOriginal")
# Locate a single element by CSS selector ("#" selects by id).
element_by_css = driver.find_element(By.CSS_SELECTOR, "#inputOriginal")
# Locate a single element by tag name.
element_by_tag = driver.find_element(By.TAG_NAME, "div")
# Locate a single element by its name attribute.
element_by_name = driver.find_element(By.NAME, "username")
# Locate a single link by its exact visible text.
element_by_link_text = driver.find_element(By.LINK_TEXT, "下一页")
2.获取多个元素------返回列表(元素不存在返回空)
python
# find_elements() returns a list of every match (empty when nothing matches).
# Match by id attribute.
elements_by_id = driver.find_elements(By.ID, "inputOriginal")
# Match by CSS selector.
elements_by_css = driver.find_elements(By.CSS_SELECTOR, "#inputOriginal")
# Match by tag name.
elements_by_tag = driver.find_elements(By.TAG_NAME, "div")
# Match by name attribute.
elements_by_name = driver.find_elements(By.NAME, "username")
# Match links by their exact visible text.
elements_by_link_text = driver.find_elements(By.LINK_TEXT, "下一页")
# The returned list can be processed like any other list, e.g. iterated
# to print each matched element's text.
for matched in elements_by_id:
    print(matched.text)
2、实战2:访问有道翻译,获取翻译后的内容
python
# Demo 2: type a word into Youdao Translate and print the translated text.
from selenium import webdriver
# By names the locator strategy passed to find_element().
from selenium.webdriver.common.by import By
# Keys holds special key codes usable with send_keys().
from selenium.webdriver.common.keys import Keys
# Selenium 4 receives the driver executable through a Service object;
# passing the path positionally to webdriver.Edge() is no longer accepted.
from selenium.webdriver.edge.service import Service
import time

# msedgedriver.exe is expected next to this script.
driver = webdriver.Edge(service=Service("./msedgedriver.exe"))
try:
    # Load the Youdao text-translation page.
    driver.get("https://fanyi.youdao.com/#/TextTranslate")
    # Give the page time to finish rendering.
    time.sleep(2)
    # Find the source-text input box and type into it.
    input_box = driver.find_element(By.ID, "js_fanyi_input")
    input_box.send_keys("hello")
    # Give the page time to produce the translation.
    time.sleep(2)
    # Read the translated text from the output element.
    transTarget = driver.find_element(By.ID, "js_fanyi_output_resultOutput")
    print(transTarget.text)
    # Pause for 5 seconds so the result can be inspected by eye.
    time.sleep(5)
finally:
    # Always end the session, even if a locator above failed.
    driver.quit()
3、实战3:爬取当当网站商品信息
(1)内容获取

(2)窗口操作

(3)实战
python
# Demo 3: search dangdang.com and print title/price for up to 5 result pages.
from selenium import webdriver
# Base class of Selenium's own errors (missing element, stale element, ...).
from selenium.common.exceptions import WebDriverException
# By names the locator strategy passed to find_element().
from selenium.webdriver.common.by import By
# Keys holds special key codes usable with send_keys().
from selenium.webdriver.common.keys import Keys
# Selenium 4 receives the driver executable through a Service object;
# passing the path positionally to webdriver.Edge() is no longer accepted.
from selenium.webdriver.edge.service import Service
import time

# msedgedriver.exe is expected next to this script.
driver = webdriver.Edge(service=Service("./msedgedriver.exe"))
try:
    # Load the dangdang home page and give it time to render.
    driver.get("https://www.dangdang.com/")
    time.sleep(2)
    # Type the keyword into the search input.
    key = driver.find_element(By.ID, "key_S")
    key.send_keys("科幻")
    # Find the search button and click it.
    search = driver.find_element(By.CSS_SELECTOR, "#form_search_new .button")
    search.click()
    # Give the result page time to load.
    time.sleep(3)
    # Print title and price of every product, for at most 5 pages.
    for _page in range(5):
        # Every <li> under .bigimg is one product entry.
        shoplist = driver.find_elements(By.CSS_SELECTOR, ".bigimg li")
        for li in shoplist:
            # Title and price are read independently so that one missing
            # field does not suppress the other.
            try:
                title = li.find_element(By.CSS_SELECTOR, "a").get_attribute("title")
                print(title)
            except WebDriverException as e:
                print(f"获取商品标题时出错: {e}")
            try:
                price = li.find_element(By.CSS_SELECTOR, ".search_now_price").text
                print(price)
            except WebDriverException as e:
                print(f"获取商品价格时出错: {e}")
        # Go to the next page; stop paging when the link cannot be used.
        try:
            next_page = driver.find_element(By.LINK_TEXT, "下一页")
            next_page.click()
            # Give the next page time to load.
            time.sleep(3)
        except WebDriverException as e:
            print(f"点击下一页时出错: {e}")
            break
finally:
    # quit() ends the whole session; close() would only close the current
    # window and leave the driver process running.
    driver.quit()
(4)css选择器基本规则

(5)等待------显式/隐式
1.隐式:全局,只要找元素,没出来就等max_time(自定义)
python
# Implicit wait: a session-wide setting applied to every element lookup.
from selenium import webdriver
# Selenium 4 receives the driver executable through a Service object;
# passing the path positionally to webdriver.Edge() is no longer accepted.
from selenium.webdriver.edge.service import Service

driver = webdriver.Edge(service=Service("./msedgedriver.exe"))
# The argument is in seconds: lookups keep retrying for up to 30 s before failing.
driver.implicitly_wait(30)
2.显式:特定条件下的等待:WebDriverWait(driver, 最长等待时间, 轮询间隔).until(判断条件)
python
# Explicit waits: block until one specific condition holds, or time out.
# Requires an existing `driver` session.
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

# Poll every 0.5 s, for at most 15 s, until the page title contains
# "百度一下"; until() raises TimeoutException when the deadline passes.
try:
    WebDriverWait(driver, 15, 0.5).until(EC.title_contains("百度一下"))
    print("页面标题包含 '百度一下'")
except TimeoutException as e:
    print(f"等待页面标题时出现异常: {e}")

# Placeholder CSS selector for the element to wait for; adapt it to the page.
element_selector = ".example-element"
# Poll every 0.5 s, for at most 15 s, until the element is visible.
try:
    WebDriverWait(driver, 15, 0.5).until(
        EC.visibility_of_element_located((By.CSS_SELECTOR, element_selector))
    )
    print(f"元素 {element_selector} 已可见")
except TimeoutException as e:
    print(f"等待元素可见时出现异常: {e}")

# End the browser session.
driver.quit()
4、实战4:鼠标及键盘操作(动作链)
python
# Demo 4: chain mouse actions — hover a menu entry, then click a link in it.
# Requires an existing `driver` session.
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By

driver.get("https://www.baidu.com/")
# The "更多" (more) link that the pointer hovers over first.
more = driver.find_element(By.LINK_TEXT, "更多")
# The link named "tj_fanyi" that is clicked afterwards.
link_element = driver.find_element(By.CSS_SELECTOR, 'a[name="tj_fanyi"]')
# Target URL of that link. NOTE(review): despite the name `baike`, this is the
# href of the tj_fanyi link, and it is not used by the action chain below.
baike = link_element.get_attribute('href')
# Queue: move to "更多", move on to the link, click; perform() runs the queue.
ActionChains(driver).move_to_element(more).move_to_element(link_element).click().perform()
其他:滚动条,窗口截图
5、实战5:爬取知乎数据(应对反爬、滑动验证)

(1)方法一------opencv轮廓检测,由面积和周长确定起始和终止位置
python
# Demo 5, method 1 (part 1): log in to Zhihu and save the captcha background.
import os
from selenium import webdriver
# By names the locator strategy passed to find_element().
from selenium.webdriver.common.by import By
# ActionChains queues low-level mouse/keyboard actions.
from selenium.webdriver import ActionChains
# Selenium 4 receives the driver executable through a Service object;
# passing the path positionally to webdriver.Edge() is no longer accepted.
from selenium.webdriver.edge.service import Service
# Explicit-wait helper.
from selenium.webdriver.support.wait import WebDriverWait
# Ready-made conditions for explicit waits.
from selenium.webdriver.support import expected_conditions as EC
# Used to download the captcha image.
from urllib import request
# OpenCV, used for the image analysis.
import cv2
# Random step sizes for the drag (anti-bot measure).
import random
# Pauses (anti-bot measure).
import time

# ------------------------------------------
# Credentials are read from the environment instead of being committed to the
# source; a missing variable raises KeyError naming it.
ZHIHU_USERNAME = os.environ["ZHIHU_USERNAME"]
ZHIHU_PASSWORD = os.environ["ZHIHU_PASSWORD"]

# Common prefixes of the long CSS paths used below.
LOGIN_FORM = "#root > div > main > div > div > div > div > div.signQr-rightContainer > div > div.SignContainer-content > div > div:nth-child(1) > form"
CAPTCHA_BG = "body > div.yidun_popup--light.yidun_popup.yidun_popup--size-small > div.yidun_modal__wrap > div > div > div.yidun_modal__body > div > div.yidun_panel > div > div.yidun_bgimg"

# 1. Create the driver and open Zhihu in a maximized window.
driver = webdriver.Edge(service=Service("./msedgedriver.exe"))
driver.get("https://www.zhihu.com/")
driver.maximize_window()

# 2. Click the second login tab, fill in account and password, then click the
#    submit button (a sequence of mouse actions).
dl = driver.find_element(By.CSS_SELECTOR, LOGIN_FORM + " > div.SignFlow-tabs > div:nth-child(2)")
ActionChains(driver).move_to_element(dl).click().perform()
dh = driver.find_element(By.CSS_SELECTOR, LOGIN_FORM + " > div.SignFlow-account > div > label > input")
dh.send_keys(ZHIHU_USERNAME)
mm = driver.find_element(By.CSS_SELECTOR, LOGIN_FORM + " > div.SignFlow-password > div > label > input")
mm.send_keys(ZHIHU_PASSWORD)
login = driver.find_element(By.CSS_SELECTOR, LOGIN_FORM + " > button")
ActionChains(driver).move_to_element(login).click().perform()

# 3. Wait (up to 10 s) until the slider-captcha background becomes visible.
WebDriverWait(driver, 10).until(EC.visibility_of_element_located((By.CSS_SELECTOR, CAPTCHA_BG)))
pic = driver.find_element(By.CSS_SELECTOR, CAPTCHA_BG + " > img.yidun_bg-img")
imgsrc = pic.get_attribute("src")

# 4. Download the background image and save it as img.png.
request.urlretrieve(imgsrc, 'img.png')
# 5、定义函数,获取轮廓位置
def get_pos(imageSrc):
    """Find the x position of the jigsaw gap in a captcha background image.

    The image is blurred, reduced to Canny edges, and its contours are
    scanned for one whose area and perimeter fall inside the expected range.
    The first such contour is outlined in red, the annotated image is saved
    as ``111.jpg``, and the left edge of its bounding box is returned.

    :param imageSrc: path of the image file to analyse
    :return: x coordinate (in image pixels) of the matching contour's
        bounding box, or 0 when the file cannot be read or nothing matches
    """
    image = cv2.imread(imageSrc)
    if image is None:
        # cv2.imread returns None for a missing or unreadable file; bail out
        # here instead of failing inside GaussianBlur.
        print(f"无法读取图片: {imageSrc}")
        return 0
    blurred = cv2.GaussianBlur(image, (5, 5), 0, 0)
    canny = cv2.Canny(blurred, 0, 100)
    contours, _hierarchy = cv2.findContours(canny, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
    print(len(contours))
    for contour in contours:
        area = cv2.contourArea(contour)
        zhouchang = cv2.arcLength(contour, True)  # perimeter of the closed contour
        # Accept only contours whose area and perimeter are in the expected
        # range for the gap (thresholds tuned for this captcha's image size).
        if 5025 < area < 7225 and 300 < zhouchang < 380:
            x, y, w, h = cv2.boundingRect(contour)
            # Outline the match in red and save the image for inspection.
            cv2.rectangle(image, (x, y), (x + w, y + h), (0, 0, 255), 2)
            cv2.imwrite("111.jpg", image)
            return x
    return 0
# Demo 5, method 1 (part 2): drag the slider to the detected gap.
dis = get_pos('img.png')
smallImage = driver.find_element(By.CSS_SELECTOR,'body > div.yidun_popup--light.yidun_popup.yidun_popup--size-small > div.yidun_modal__wrap > div > div > div.yidun_modal__body > div > div.yidun_panel > div > div.yidun_bgimg > img.yidun_jigsaw')
# Scale the gap position by 340/672 and subtract the slider's current x.
# NOTE(review): 340/672 is presumably displayed width / downloaded image
# width — confirm against the actual captcha.
dis = int(dis * 340 / 672 - smallImage.location['x'])
# implicitly_wait() takes seconds; the former value of 2000 would make every
# failed element lookup block for over half an hour.
driver.implicitly_wait(2)
# Press and hold the mouse button on the slider.
ActionChains(driver).click_and_hold(smallImage).perform()
i = 0
moved = 0
while moved < dis:
    # Random 3-10 px steps make the drag less uniform; the last step is capped
    # so the total never exceeds the target distance.
    x = min(random.randint(3, 10), dis - moved)
    moved += x
    ActionChains(driver).move_by_offset(xoffset=x, yoffset=0).perform()
    print("第{}次移动后, 位置为{}".format(i, smallImage.location['x']))
    i += 1
# Release the mouse button to drop the slider.
ActionChains(driver).release().perform()
# Keep the window open briefly to observe the result. time.sleep() takes
# seconds; the former value of 20000 would block for more than five hours.
time.sleep(20)
# quit() ends the whole session; close() would leave the driver process running.
driver.quit()
说明:canny = cv2.Canny(blurred, 低阈值, 高阈值) —— 梯度高于高阈值的像素判定为边缘,低于低阈值的像素被丢弃,介于两者之间的像素只有与强边缘相连时才保留。

(2)方法二------opencv灰度化 + 边缘检测 + 模板匹配,确定起始和终止位置
python
def calculate_slide_distance(full_image, slider_image):
    """Estimate how far the slider must move to reach the gap.

    Both images are converted to grayscale and reduced to Canny edge maps;
    the slider's edge map is then template-matched against the background's
    and the x coordinate of the best match is returned.

    :param full_image: background image as a BGR array (e.g. from cv2.imread)
    :param slider_image: slider image as a BGR array
    :return: x coordinate (in pixels of ``full_image``) of the best match, or
        0 when an image is missing or the slider does not fit in the background
    """
    if full_image is None or slider_image is None:
        print("图片数据为空,无法计算滑动距离")
        return 0
    # matchTemplate requires the template to fit inside the searched image.
    full_h, full_w = full_image.shape[:2]
    slider_h, slider_w = slider_image.shape[:2]
    if slider_h > full_h or slider_w > full_w:
        print("滑块图尺寸大于背景图,无法计算滑动距离")
        return 0
    # Grayscale conversion.
    gray_full = cv2.cvtColor(full_image, cv2.COLOR_BGR2GRAY)
    gray_slider = cv2.cvtColor(slider_image, cv2.COLOR_BGR2GRAY)
    # Edge detection.
    edges_full = cv2.Canny(gray_full, 50, 150)
    edges_slider = cv2.Canny(gray_slider, 50, 150)
    # Template matching with TM_CCOEFF_NORMED: the best match is the maximum.
    result = cv2.matchTemplate(edges_full, edges_slider, cv2.TM_CCOEFF_NORMED)
    _, max_val, _, max_loc = cv2.minMaxLoc(result)
    print(f"匹配的相似度值: {max_val}")
    # max_loc is (x, y) of the best match's top-left corner.
    distance = max_loc[0]
    print(f"计算得到的滑块滑动距离: {distance}")
    return distance
# Load the saved background (img.png) and slider (img2.png) images, then
# compute the slide distance; cv2.imread yields None for an unreadable file,
# which calculate_slide_distance reports and maps to 0.
full_image = cv2.imread('img.png')
slider_image = cv2.imread('img2.png')
distance = calculate_slide_distance(full_image, slider_image)
(3)最终结果
python
# Final script (part 1): log in to Zhihu and download both captcha images.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver import ActionChains
# Selenium 4 receives the driver executable through a Service object;
# passing the path positionally to webdriver.Edge() is no longer accepted.
from selenium.webdriver.edge.service import Service
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from urllib import request
import cv2
import random
import time
import csv
import os

# Credentials are read from the environment instead of being committed to the
# source; a missing variable raises KeyError naming it.
ZHIHU_USERNAME = os.environ["ZHIHU_USERNAME"]
ZHIHU_PASSWORD = os.environ["ZHIHU_PASSWORD"]

# Common prefixes of the long CSS paths used below.
LOGIN_FORM = "#root > div > main > div > div > div > div > div.signQr-rightContainer > div > div.SignContainer-content > div > div:nth-child(1) > form"
CAPTCHA_BG = "body > div.yidun_popup--light.yidun_popup.yidun_popup--size-small > div.yidun_modal__wrap > div > div > div.yidun_modal__body > div > div.yidun_panel > div > div.yidun_bgimg"

# 1. Create the driver and open Zhihu in a maximized window.
driver = webdriver.Edge(service=Service("./msedgedriver.exe"))
driver.get("https://www.zhihu.com/")
driver.maximize_window()

# 2. Click the second login tab, fill in account and password, then click the
#    submit button (a sequence of mouse actions).
dl = driver.find_element(By.CSS_SELECTOR, LOGIN_FORM + " > div.SignFlow-tabs > div:nth-child(2)")
ActionChains(driver).move_to_element(dl).click().perform()
dh = driver.find_element(By.CSS_SELECTOR, LOGIN_FORM + " > div.SignFlow-account > div > label > input")
dh.send_keys(ZHIHU_USERNAME)
mm = driver.find_element(By.CSS_SELECTOR, LOGIN_FORM + " > div.SignFlow-password > div > label > input")
mm.send_keys(ZHIHU_PASSWORD)
login = driver.find_element(By.CSS_SELECTOR, LOGIN_FORM + " > button")
ActionChains(driver).move_to_element(login).click().perform()

# 3. Wait (up to 10 s) until the slider-captcha background becomes visible.
WebDriverWait(driver, 10).until(EC.visibility_of_element_located((By.CSS_SELECTOR, CAPTCHA_BG)))
# Background image: read its URL and save it as img1.png.
pic = driver.find_element(By.CSS_SELECTOR, CAPTCHA_BG + " > img.yidun_bg-img")
imgsrc = pic.get_attribute("src")
request.urlretrieve(imgsrc, 'img1.png')
# Slider (jigsaw) image: read its URL and save it as img2.png.
pic2 = driver.find_element(By.CSS_SELECTOR, CAPTCHA_BG + " > img.yidun_jigsaw")
imgsrc2 = pic2.get_attribute("src")
request.urlretrieve(imgsrc2, 'img2.png')
# ----------------------------------------------------------
# 4.1、法一:灰度检测
def calculate_slide_distance(full_image, slider_image):
    """Estimate how far the slider must move to reach the gap.

    Both images are converted to grayscale and reduced to Canny edge maps;
    the slider's edge map is then template-matched against the background's
    and the x coordinate of the best match is returned.

    :param full_image: background image as a BGR array (e.g. from cv2.imread)
    :param slider_image: slider image as a BGR array
    :return: x coordinate (in pixels of ``full_image``) of the best match, or
        0 when an image is missing or the slider does not fit in the background
    """
    if full_image is None or slider_image is None:
        print("图片数据为空,无法计算滑动距离")
        return 0
    # matchTemplate requires the template to fit inside the searched image.
    full_h, full_w = full_image.shape[:2]
    slider_h, slider_w = slider_image.shape[:2]
    if slider_h > full_h or slider_w > full_w:
        print("滑块图尺寸大于背景图,无法计算滑动距离")
        return 0
    # Grayscale conversion.
    gray_full = cv2.cvtColor(full_image, cv2.COLOR_BGR2GRAY)
    gray_slider = cv2.cvtColor(slider_image, cv2.COLOR_BGR2GRAY)
    # Edge detection.
    edges_full = cv2.Canny(gray_full, 50, 150)
    edges_slider = cv2.Canny(gray_slider, 50, 150)
    # Template matching with TM_CCOEFF_NORMED: the best match is the maximum.
    result = cv2.matchTemplate(edges_full, edges_slider, cv2.TM_CCOEFF_NORMED)
    _, max_val, _, max_loc = cv2.minMaxLoc(result)
    print(f"匹配的相似度值: {max_val}")
    # max_loc is (x, y) of the best match's top-left corner.
    distance = max_loc[0]
    print(f"计算得到的滑块滑动距离: {distance}")
    return distance
# 4.1.1 Compute the slide distance from the two downloaded captcha images.
full_image = cv2.imread('img1.png')
slider_image = cv2.imread('img2.png')
# NOTE(review): `distance` is not referenced again in the visible script; the
# drag further down is driven by `dis` from the contour-based method.
distance = calculate_slide_distance(full_image, slider_image)
# --------------------------------------
# 4.2、法2:轮廓边界
def get_pos(imageSrc):
    """Find the x position of the jigsaw gap in a captcha background image.

    The image is blurred, reduced to Canny edges, and its contours are
    scanned for one whose area and perimeter fall inside the expected range.
    The first such contour is outlined in red, the annotated image is saved
    as ``111.jpg``, and the left edge of its bounding box is returned.

    :param imageSrc: path of the image file to analyse
    :return: x coordinate (in image pixels) of the matching contour's
        bounding box, or 0 when the file cannot be read or nothing matches
    """
    image = cv2.imread(imageSrc)
    if image is None:
        # cv2.imread returns None for a missing or unreadable file; bail out
        # here instead of failing inside GaussianBlur.
        print(f"无法读取图片: {imageSrc}")
        return 0
    blurred = cv2.GaussianBlur(image, (5, 5), 0, 0)
    canny = cv2.Canny(blurred, 0, 100)
    contours, _hierarchy = cv2.findContours(canny, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
    print(len(contours))
    for contour in contours:
        area = cv2.contourArea(contour)
        zhouchang = cv2.arcLength(contour, True)  # perimeter of the closed contour
        # Accept only contours whose area and perimeter are in the expected
        # range for the gap (thresholds tuned for this captcha's image size).
        if 5025 < area < 7225 and 300 < zhouchang < 380:
            x, y, w, h = cv2.boundingRect(contour)
            # Outline the match in red and save the image for inspection.
            cv2.rectangle(image, (x, y), (x + w, y + h), (0, 0, 255), 2)
            cv2.imwrite("111.jpg", image)
            return x
    return 0
# 4.2.1 Method 2: derive the drag distance from the detected gap position.
# The background was saved as img1.png by this script, so that is the file
# to analyse (img.png belongs to the earlier stand-alone example).
dis = get_pos('img1.png')
# Scale the gap position by 340/672 and subtract the slider's current x.
# NOTE(review): 340/672 is presumably displayed width / downloaded image
# width — confirm against the actual captcha.
dis = int(dis * 340 / 672 - pic2.location['x'])
# implicitly_wait() takes seconds; the former value of 2000 would make every
# failed element lookup block for over half an hour.
driver.implicitly_wait(2)
# Press and hold the mouse button on the slider.
ActionChains(driver).click_and_hold(pic2).perform()
# 4.2.2 Anti-bot measure: drag in small random steps.
i = 0
moved = 0
while moved < dis:
    # The last step is capped so the total never exceeds the target distance.
    x = min(random.randint(3, 10), dis - moved)
    moved += x
    ActionChains(driver).move_by_offset(xoffset=x, yoffset=0).perform()
    print("第{}次移动后, 位置为{}".format(i, pic2.location['x']))
    i += 1
# Release the mouse button to drop the slider.
ActionChains(driver).release().perform()
# 5. Wait (up to 20 s) for the logged-in page header to appear. The two class
#    names are joined without a space so they match ONE element carrying both
#    classes; with a space the second token is parsed as a descendant tag name
#    and never matches.
WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.CSS_SELECTOR, ".AppHeader-inner.css-11p8nt5")))
# 6. Path of the CSV output file.
csv_file_path = "zhihu_data3.csv"


# 7. Append one record to the CSV file.
def write_to_csv(data):
    """Append ``data`` as one row to the file at ``csv_file_path``.

    :param data: sequence of field values for the row
    """
    # newline="" lets the csv module control line endings itself.
    with open(csv_file_path, mode="a", newline="", encoding="utf-8") as file:
        csv.writer(file).writerow(data)


# 8. Write the header row when the file is missing OR exists but is empty
#    (e.g. left over from an interrupted run); otherwise the rows appended
#    later would end up in a file without column names.
if not os.path.exists(csv_file_path) or os.path.getsize(csv_file_path) == 0:
    with open(csv_file_path, mode="w", newline="", encoding="utf-8") as file:
        csv.writer(file).writerow(["author_name", "title", "item_id", "has_image", "upvote_num"])
# 9. Scroll down repeatedly so the page loads more content.
def scroll_to_load_more(max_scrolls=10):
    """Scroll to the page bottom until the page stops growing.

    After each scroll the function sleeps 2 seconds and re-reads
    ``document.body.scrollHeight``; it stops as soon as the height is
    unchanged, or after ``max_scrolls`` scrolls that did grow the page.

    :param max_scrolls: upper bound on the number of height-increasing scrolls
    """
    read_height_js = "return document.body.scrollHeight"
    previous_height = driver.execute_script(read_height_js)
    rounds_done = 0
    while rounds_done < max_scrolls:
        # Jump to the bottom of the document.
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        # Give newly requested content time to arrive.
        time.sleep(2)
        current_height = driver.execute_script(read_height_js)
        if current_height == previous_height:
            # Nothing new was loaded; stop scrolling.
            return
        previous_height = current_height
        rounds_done += 1


scroll_to_load_more(max_scrolls=30)
# 10. Extract one record per feed item, print it and append it to the CSV.
import json

from selenium.common.exceptions import WebDriverException

try:
    articles = driver.find_elements(By.CSS_SELECTOR, ".ContentItem.ArticleItem, .ContentItem.AnswerItem")
    for article in articles:
        try:
            # The data-zop attribute carries the item's metadata. It is parsed
            # once, as JSON: eval() would execute whatever text the page
            # supplies and also fails on JSON literals such as true/false/null.
            zop_raw = article.get_attribute("data-zop")
            zop = json.loads(zop_raw) if zop_raw else {}
            author_name = zop.get("authorName", "未知作者")
            item_id = zop.get("itemId", "未知ID")

            # Title text; a missing title skips the item via the outer handler.
            title_element = article.find_element(By.CSS_SELECTOR, "h2.ContentItem-title a")
            title = title_element.text

            # find_elements() returns an empty list when no image exists.
            has_image = bool(article.find_elements(By.CSS_SELECTOR, ".RichContent-inner img"))

            # Upvote count: best effort, stays 0 when the button is missing or
            # its text is not a plain integer after stripping "赞同".
            upvote_num = 0
            try:
                upvote_element = article.find_element(By.CSS_SELECTOR, ".VoteButton--up")
                upvote_num = int(upvote_element.text.replace("赞同", "").strip())
            except (WebDriverException, ValueError):
                pass

            # Print the extracted record.
            print(f"作者: {author_name}")
            print(f"标题: {title}")
            print(f"文章ID: {item_id}")
            print(f"是否有图片: {has_image}")
            print(f"点赞数: {upvote_num}")
            print("-" * 50)

            # Append the record to the CSV file.
            data = [author_name, title, item_id, has_image, upvote_num]
            write_to_csv(data)
        except Exception as e:
            # Per-item boundary: report the failure and continue with the next item.
            print(f"提取数据时出错: {e}")
    # Keep the window open briefly to observe the result. time.sleep() takes
    # seconds; the former value of 2000 would block for over half an hour.
    time.sleep(5)
finally:
    # Always end the browser session, even if extraction failed.
    driver.quit()
