做常规的浏览器自动化操作时,使用 selenium 的场景比较多。它的语法虽然比较啰嗦,但相对来说比较成熟;不过由于需要依赖 chromedriver,即使在启动参数里添加如下各种参数,仍然容易被网站的各种反爬虫机制检测到。
python
# NOTE(review): `option` is presumably a selenium ChromeOptions object created earlier (not shown here).
# Exclude the 'enable-automation' switch from the switches Chrome is launched with.
option.add_experimental_option('excludeSwitches', ['enable-automation'])
# Set the 'useAutomationExtension' experimental option to False.
option.add_experimental_option('useAutomationExtension', False)
# Launch flag: disable GPU usage.
option.add_argument('--disable-gpu')
# Launch flag: disable the Blink "AutomationControlled" feature.
option.add_argument("--disable-blink-features=AutomationControlled")
# Launch flag: run the browser without its sandbox.
option.add_argument('--no-sandbox')
Selenium 需要借助 ChromeDriver 这个“中间传话人”才能与浏览器沟通;而 DrissionPage 则绕过了中间人,直接与浏览器对话,因此既不需要下载驱动,运行速度也更快,并且没有 WebDriver 特征,更难被网站的反爬虫机制检测到。
起因
最近做一个自动化操作的项目时,在登录过程中遇到了图形验证码处理的问题。使用 selenium 时,虽然已经通过各种识别计算精确得到了滑动距离,并且用 selenium 的 ActionChains 把滑块拖到了指定位置,但后台校验始终提示验证失败。后来又尝试用 playwright(playwright 也不需要 chromedriver)来处理,结果仍然提示校验失败。最后抱着试一试的心态换成了 DrissionPage,竟然奇迹般地发现图形验证码部分轻松通过了。于是在这里简单记录一下 DrissionPage 的使用过程及示例。
安装
使用前记得先 pip 安装 DrissionPage:
```bash
pip install DrissionPage
```
文档
使用语法可参考 https://drissionpage.cn/browser_control/intro
文档由 DrissionPage 的作者本人编写,内容已经相当全面,这里就不再赘述了。
示例
使用示例(示例仅保留了主流程部分供自己后续参考):
python
import time
import re
import traceback
import requests
import os
import util
import base64
import cv2
import config
import pandas as pd
from config import logger
from DrissionPage import ChromiumPage
from DrissionPage.common import Actions
class FeeBillClass:
    """Drive the fee-bill workflow: split the daily Excel file, log in, upload.

    The browser is controlled through DrissionPage (``ChromiumPage`` and
    ``Actions``). Settings (credentials, URLs, directories, wait times) are
    read from the project ``config`` module; captcha recognition, directory
    housekeeping and the file-upload dialog are delegated to the project
    ``util`` module.
    """

    # Locator of the captcha popup container inside the login iframe.
    _CAPTCHA_POPUP = 'xpath://div[@class="detail-page captcha"]'
    # Per-request timeout in seconds so an unreachable image URL cannot hang the run.
    _DOWNLOAD_TIMEOUT = 10
    # Maximum number of data rows written to each split file.
    _BATCH_ROWS = 5000

    def __init__(self):
        self.name = "fee_bill"
        self.username = config.username
        self.password = config.password
        self.page = None  # ChromiumPage, created in handle_login()
        self.frame = None  # login iframe, resolved in handle_login()
        self.split_filepaths = []  # files produced by handle_file_split()

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _wait_for_src(element, interval, attempts=30):
        """Poll ``element``'s ``src`` attribute until it is non-empty.

        Args:
            element: DrissionPage element exposing ``attr()``.
            interval: Seconds to sleep between two polls.
            attempts: Maximum number of polls before giving up.

        Returns:
            The non-empty ``src`` value.

        Raises:
            ValueError: If ``src`` is still empty after all attempts.
        """
        for _ in range(attempts):
            src = element.attr("src")
            if src:
                return src
            time.sleep(interval)
        # One last read after the final sleep, mirroring the original flow.
        src = element.attr("src")
        if not src:
            raise ValueError("背景图地址加载超时")
        return src

    @classmethod
    def _fetch_bytes(cls, url):
        """Download ``url`` and return the response body as bytes.

        Raises:
            requests.RequestException: On connection errors, timeouts or a
                non-2xx status (so an error page is never saved as an image).
        """
        response = requests.get(url, timeout=cls._DOWNLOAD_TIMEOUT)
        response.raise_for_status()
        return response.content

    def _download_pair(self, bg_src, slider_src, retry_wait):
        """Download both captcha images, retrying once after ``retry_wait`` seconds.

        Returns:
            ``(bg_content, slider_content)`` as bytes.

        Raises:
            requests.RequestException: If the second attempt fails as well.
        """
        try:
            return self._fetch_bytes(bg_src), self._fetch_bytes(slider_src)
        except requests.RequestException:
            # The image links are occasionally unreachable; retry exactly once.
            time.sleep(retry_wait)
            logger.info("图片重试下载中...")
            return self._fetch_bytes(bg_src), self._fetch_bytes(slider_src)

    @staticmethod
    def _save_images(bg_content, slider_content):
        """Write both images into ``config.img_dir``.

        Returns:
            ``(bg_path, slider_path)``.
        """
        bg_path = os.path.join(config.img_dir, "bg.png")
        slider_path = os.path.join(config.img_dir, "slider.png")
        with open(bg_path, "wb") as f:
            f.write(bg_content)
        with open(slider_path, "wb") as f:
            f.write(slider_content)
        return bg_path, slider_path

    @staticmethod
    def _extract_slider_url(style):
        """Extract the slider image URL from an inline ``style`` value.

        The URL is the text starting at ``http`` up to (not including) the
        last ``/origin`` segment.

        Raises:
            ValueError: If ``style`` is empty or contains no such URL.
        """
        match = re.search(r"(http.*)/origin", style or "")
        if match is None:
            raise ValueError(f"无法从 style 中解析滑块图地址:{style!r}")
        return match.group(1)

    @staticmethod
    def _display_scale(bg_element, bg_path):
        """Return rendered width of ``bg_element`` divided by the image's pixel width.

        Raises:
            ValueError: If the saved image cannot be decoded by OpenCV.
        """
        displayed_width = bg_element.rect.size[0]
        image = cv2.imread(bg_path)
        if image is None:
            # cv2.imread signals an unreadable file by returning None.
            raise ValueError(f"无法读取图片:{bg_path}")
        return displayed_width / image.shape[1]

    @staticmethod
    def _scale_points(tokens, scale):
        """Convert ``"x,y"`` tokens into integer points multiplied by ``scale``.

        Args:
            tokens: Iterable of strings such as ``"120,45"``.
            scale: Ratio between on-page size and real image size.

        Returns:
            A list of ``(x, y)`` integer tuples.
        """
        points = []
        for token in tokens:
            x, y = token.split(",")
            points.append((int(int(x) * scale), int(int(y) * scale)))
        return points

    @staticmethod
    def _iter_batches(df, batch_size):
        """Yield ``(batch_no, start, end, chunk)`` for consecutive row batches.

        ``batch_no`` starts at 1; ``end`` is exclusive and clamped to the
        number of rows. The row count is ``len(df)``; ``df.size`` would be
        rows × columns and would yield trailing empty batches.
        """
        total_rows = len(df)
        for batch_no, start in enumerate(range(0, total_rows, batch_size), start=1):
            end = min(start + batch_size, total_rows)
            yield batch_no, start, end, df.iloc[start:end]

    def _captcha_popup_present(self):
        """Wait ``config.login_waiting_time`` seconds, then report whether the popup exists.

        A lookup that raises and a lookup that returns a falsy "not found"
        object are both treated as "popup absent", so the check works
        whichever way the browser library reports a missing element.
        """
        time.sleep(config.login_waiting_time)
        try:
            popup = self.frame.ele(self._CAPTCHA_POPUP)
        except Exception:
            return False
        return bool(popup)

    def _ensure_checked(self, locator):
        """Click the element at ``locator`` unless its class already contains ``checked``."""
        element = self.page.ele(locator)
        # attr() may return None when the element has no class attribute.
        if "checked" not in (element.attr("class") or ""):
            element.click()
        time.sleep(config.page_interval)

    def _click_then_uncheck(self, locator):
        """Click the element at ``locator``; click again if it ends up ``checked``."""
        element = self.page.ele(locator)
        element.click()
        time.sleep(config.page_interval)
        if "checked" in (element.attr("class") or ""):
            element.click()
        time.sleep(config.page_interval)

    # ------------------------------------------------------------------
    # Captcha image download
    # ------------------------------------------------------------------
    def get_images(self, bg_element, slider_element):
        """Download the images of the click-text captcha.

        Both elements are ``<img>`` elements whose ``src`` holds the image URL.

        Returns:
            ``(bg_content, slider_content, bg_path)``: raw bytes of both
            images plus the path the background image was saved to.

        Raises:
            ValueError: If the background ``src`` never becomes available.
            requests.RequestException: If the download fails twice.
        """
        logger.info("get_images1 下载背景图和滑块图")
        # Wait up to 30 polls (1 s apart) for the background URL to load.
        bg_src = self._wait_for_src(bg_element, 1)
        slider_src = slider_element.attr("src")
        bg_content, slider_content = self._download_pair(bg_src, slider_src, 1)
        bg_path, _ = self._save_images(bg_content, slider_content)
        return bg_content, slider_content, bg_path

    def get_images2(self, bg_element, slider_element):
        """Download the images of the slider captcha.

        The background URL is read from ``bg_element``'s ``src``; the slider
        URL is embedded in ``slider_element``'s inline ``style``.

        Returns:
            ``(bg_path, slider_path)`` of the saved images.

        Raises:
            ValueError: If a URL cannot be obtained.
            requests.RequestException: If the download fails twice.
        """
        logger.info("get_images2 下载背景图和滑块图")
        # Use attr() like get_images(): the elements passed in come from
        # frame.ele(), and get_attribute() is the selenium/playwright spelling.
        bg_src = self._wait_for_src(bg_element, config.page_interval)
        slider_src = self._extract_slider_url(slider_element.attr("style"))
        logger.info(bg_src)
        logger.info(slider_src)
        bg_content, slider_content = self._download_pair(
            bg_src, slider_src, config.page_interval
        )
        return self._save_images(bg_content, slider_content)

    # ------------------------------------------------------------------
    # Captcha solving
    # ------------------------------------------------------------------
    def check_captcha1(self):
        """Try to solve the click-text captcha (best effort, errors are logged).

        The recognised coordinates are expressed in real image pixels, so
        they are rescaled to the size the image is rendered at on the page
        before clicking.
        """
        try:
            slider_element = self.frame.ele('xpath://div[@id="click_quiz"]/img')
            bg_element = self.frame.ele('xpath://div[@id="c_click_wrapper_view"]/img')
            bg_content, slider_content, bg_path = self.get_images(bg_element, slider_element)
            bg_base64 = base64.b64encode(bg_content).decode('utf-8')
            slider_base64 = base64.b64encode(slider_content).decode('utf-8')
            scale = self._display_scale(bg_element, bg_path)
            logger.info(f"缩放比:{scale}")
            data = util.captcha_recognize3(bg_base64, slider_base64)
            # The result is split on "|" into "x,y" tokens.
            tokens = str(data).split("|")
            logger.info(tokens)
            points = self._scale_points(tokens, scale)
            logger.info(points)
            ac = Actions(self.frame)
            for x, y in points:
                ac.move_to(bg_element, x, y).click()
                time.sleep(1)
        except Exception as e:
            logger.error(f"文字识别验证码处理报错:{str(e)}")
            logger.error(traceback.format_exc())

    def check_captcha2(self):
        """Try to solve the slider captcha (best effort, errors are logged)."""
        try:
            slider_element = self.frame.ele('xpath://div[@id="puzzle_slot"]')
            bg_element = self.frame.ele('xpath://div/img[@id="puzzle_backimg"]')
            slider_button = self.frame.ele('xpath://div[@class="dv_handler baseslider"]')
            bg_path, slider_path = self.get_images2(bg_element, slider_element)
            scale = self._display_scale(bg_element, bg_path)
            logger.info(f"缩放比:{scale}")
            distance = util.captcha_recognize(bg_path, slider_path)
            # Fixed correction subtracted from the scaled distance.
            delta = 3
            real_distance = distance * scale - delta
            logger.info(f"真实滑动距离:{real_distance}")
            ac = Actions(self.frame)
            ac.hold(slider_button)
            ac.move(real_distance, 0, duration=config.page_interval)
            ac.release()
        except Exception as e:
            logger.error(f"滑块验证码处理报错:{str(e)}")
            logger.error(traceback.format_exc())

    def solve_captcha(self):
        """Handle the captcha popup that may appear after submitting the login form.

        The click-text solver runs first, then the slider solver if the
        popup is still there.

        Returns:
            True if no popup is (or remains) present, False if the popup is
            still shown after both solvers ran.
        """
        if not self._captcha_popup_present():
            logger.info("无图形验证码弹窗,继续后续流程")
            return True
        self.check_captcha1()
        if not self._captcha_popup_present():
            logger.info("图形验证码弹窗处理完毕,继续后续流程")
            return True
        self.check_captcha2()
        if not self._captcha_popup_present():
            logger.info("图形验证码弹窗处理完毕,继续后续流程")
            return True
        return False

    # ------------------------------------------------------------------
    # Workflow steps
    # ------------------------------------------------------------------
    def handle_file_split(self):
        """Split today's Excel file into files of at most ``_BATCH_ROWS`` rows.

        The paths of the generated files are stored in ``self.split_filepaths``.

        Returns:
            True on success, False if today's file is missing or on any error.
        """
        try:
            logger.info("开始催记文件切分")
            self.split_filepaths = []
            # Drop temporary directories older than the retention window.
            util.remove_dir(config.tmp_dir, days_to_keep=90)
            today = util.get_current_time(fmt="%Y%m%d")
            filename = f"网商银行催记拉取{today}.xlsx"
            file_path = os.path.join(config.file_dir, filename)
            if not os.path.exists(file_path):
                logger.info(f"{file_path} 文件不存在")
                return False
            # Create today's working directory, emptying it if it already exists.
            tmp_today_dir = os.path.join(config.tmp_dir, today)
            os.makedirs(tmp_today_dir, exist_ok=True)
            util.clear_dir_by_name(tmp_today_dir)
            df = pd.read_excel(file_path)
            row_num = len(df)
            logger.info(f"total_rows: {row_num}")
            for index_num, start_index, end_index, tmp_df in self._iter_batches(
                df, self._BATCH_ROWS
            ):
                logger.info(f"开始文件切分:{start_index}~{end_index}")
                tmp_filename = f"网商银行催记拉取{today}-{index_num}.xlsx"
                logger.info(f"{tmp_filename}")
                tmp_filepath = os.path.join(tmp_today_dir, tmp_filename)
                # Do not write the automatic index column.
                tmp_df.to_excel(tmp_filepath, index=False)
                self.split_filepaths.append(tmp_filepath)
            return True
        except Exception as e:
            logger.error(f"文件切分失败:{str(e)}")
            logger.error(traceback.format_exc())
            return False

    def handle_login(self):
        """Open the login page, submit the credentials and solve the captcha.

        Returns:
            True when login (including captcha handling) succeeded, else False.
        """
        try:
            logger.info("开始登录")
            self.page = ChromiumPage()
            self.page.set.user_agent(
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36')
            self.page.get(config.login_url)
            time.sleep(config.login_waiting_time)
            # Look for the login iframe up to 10 times and stop at the first hit.
            self.frame = None
            for _ in range(10):
                try:
                    frame = self.page.get_frame('xpath://*[@id="inst-lgoin-iframe"]')
                except Exception:
                    frame = None
                if frame:
                    self.frame = frame
                    break
                time.sleep(config.login_waiting_time)
            if self.frame is None:
                logger.error("未找到登录 iframe")
                return False
            # Fill in the credentials (adjust the selectors to the real page).
            self.frame.ele('xpath://input[@name="logonId"]').input(self.username)
            self.frame.ele('xpath://input[@name="password"]').input(self.password)
            # Submitting the form is what triggers the captcha.
            self.frame.ele('xpath://button[@id="submitBtn"]').click()
            if not self.solve_captcha():
                logger.info("图形验证码处理失败")
                return False
            logger.info("登录成功")
            return True
        except Exception as e:
            logger.error(f"登录失败:{str(e)}")
            logger.error(traceback.format_exc())
            return False

    def handle_work(self):
        """Set the page options and upload every split file.

        Returns:
            True on success, False on any error.
        """
        try:
            logger.info("开始任务处理")
            time.sleep(config.page_interval)
            # "ID card exists" and "ID card valid" must be ticked.
            self._ensure_checked('xpath://span[input[@id="idcardExists"]]')
            self._ensure_checked('xpath://span[input[@id="idcardValid"]]')
            # In-loan / post-loan material: click, then undo if it became checked.
            self._click_then_uncheck(
                'xpath://label/span[text()="贷中/贷后资料"]/preceding-sibling::span')
            # Collection records must be ticked.
            self._ensure_checked(
                'xpath://label/span[text()="催收记录"]/preceding-sibling::span')
            # Pre-loan material: click, then undo if it became checked.
            self._click_then_uncheck(
                'xpath://label/span[text()="贷前资料"]/preceding-sibling::span')
            # Upload the split files one by one.
            for filepath in self.split_filepaths:
                self.page.ele('xpath://div/span[@class="ant-upload ant-upload-btn"]').click()
                logger.info(f"开始 {filepath} 文件上传")
                util.file_upload(filepath)
                time.sleep(config.page_interval)
                # NOTE: the download click is intentionally disabled for testing:
                # self.page.ele('xpath://div/span/button/span[text()="下 载"]').click()
                time.sleep(config.page_interval)
            time.sleep(config.login_waiting_time)
            return True
        except Exception as e:
            logger.error(f"任务处理失败:{str(e)}")
            logger.error(traceback.format_exc())
            return False

    def close_browser(self):
        """Close the browser page if one was opened; safe to call repeatedly."""
        logger.info("关闭浏览器窗口")
        if self.page is None:
            return
        self.page.close()
        self.page = None
        self.frame = None

    def run(self):
        """Run the whole workflow: split files, log in, upload, close the browser.

        The browser is closed in a ``finally`` clause so a failed login no
        longer leaves a browser window behind.
        """
        if not self.handle_file_split():
            logger.info("文件切分失败")
            return
        try:
            if not self.handle_login():
                logger.info("登录失败")
                return
            self.handle_work()
        finally:
            self.close_browser()
# Usage example: build the job object and run the full workflow.
if __name__ == "__main__":
    fee_bill_job = FeeBillClass()
    fee_bill_job.run()
    # To exercise only the file-splitting step, call
    # fee_bill_job.handle_file_split() instead of run().