Python 流程自动化之 DrissionPage 使用示例

做流程自动化时,使用 selenium 的情况比较多。它的语法虽然比较啰嗦,但相对来说比较成熟;不过由于需要依赖 chromedriver,即使在启动参数里添加如下各种参数,仍然容易被网站的各种反爬虫机制检测到。

python 复制代码
            # Selenium Chrome start-up options; per the surrounding article, the
            # browser is still easily detected by anti-bot checks even with these set.
            # NOTE(review): `option` is not defined in this fragment — presumably a
            # selenium ChromeOptions instance; confirm against the full script.
            option.add_experimental_option('excludeSwitches', ['enable-automation'])
            option.add_experimental_option('useAutomationExtension', False)
            option.add_argument('--disable-gpu')
            option.add_argument("--disable-blink-features=AutomationControlled")
            option.add_argument('--no-sandbox')

Selenium 的角色像一个中间传话人,需要依赖 ChromeDriver 驱动才能与浏览器沟通;而 DrissionPage 则绕过了中间人,直接与浏览器对话,因此既不需要下载驱动,运行速度也更快,而且没有 WebDriver 特征,更难被网站的反爬虫机制检测到。

起因

最近做一个自动化操作的项目时,在登录过程中遇到了图形验证码处理的问题。使用 selenium 时虽然已经通过各种识别计算,精确得到了滑动距离且使用 selenium 的 ActionChains 把滑块拉到了指定位置,但是后台校验还是一直提示验证失败。后面甚至尝试了使用 playwright(playwright 也不需要 chromedriver)来进行处理,但是仍然一样提示校验失败。再后面就抱着试一试的心态试了一下 DrissionPage,然后竟奇迹般发现图形验证码处理部分轻松通过。于是乎简单记录一下使用 DrissionPage 的过程及示例。

安装

使用前记得先 pip 安装 DrissionPage:

python 复制代码
pip install DrissionPage

文档

使用语法可参考 https://drissionpage.cn/browser_control/intro

文档也是作者自己写的,基本上已经很全面了,我就不多啰嗦了。

示例

使用示例(示例仅保留了主流程部分供自己后续参考):

python 复制代码
import time
import re
import traceback

import requests
import os
import util
import base64
import cv2
import config
import pandas as pd

from config import logger

from DrissionPage import ChromiumPage
from DrissionPage.common import Actions


class FeeBillClass:

    def __init__(self):
        """Initialise the job name, credentials and empty browser state."""
        self.name = "fee_bill"
        # Credentials are taken from the project config module.
        self.username, self.password = config.username, config.password
        # Page and login frame are assigned later by handle_login().
        self.page = self.frame = None
        # Filled by handle_file_split() with the paths of the split workbooks.
        self.split_filepaths = list()

    def get_images(self, bg_element, slider_element):
        """Download the captcha background and slider images and save them.

        Polls ``bg_element`` up to 30 times, one second apart, until its
        ``src`` attribute is populated. Both images are then downloaded (with
        one retry) and written to ``bg.png`` / ``slider.png`` under
        ``config.img_dir``.

        Args:
            bg_element: Element whose ``src`` attribute holds the background
                image URL.
            slider_element: Element whose ``src`` attribute holds the slider
                image URL.

        Returns:
            Tuple ``(bg_content, slider_content, bg_path)``: the raw bytes of
            both images and the path the background image was saved to.

        Raises:
            RuntimeError: If either image URL is still empty after waiting.
            requests.RequestException: If a download fails on both attempts.
        """
        logger.info("get_images1 下载背景图和滑块图")

        # Wait for the background image URL to be populated (at most ~30 s).
        bg_src = None
        for _ in range(30):
            bg_src = bg_element.attr("src")
            if bg_src:
                break
            time.sleep(1)
        slider_src = slider_element.attr("src")
        if not bg_src or not slider_src:
            # Fail with a clear message instead of passing None to requests.
            raise RuntimeError(
                f"captcha image url missing: bg={bg_src!r}, slider={slider_src!r}"
            )

        def _fetch(url):
            # A timeout keeps a stalled connection from hanging the whole job;
            # raise_for_status stops an HTTP error page being saved as an image.
            response = requests.get(url, timeout=30)
            response.raise_for_status()
            return response.content

        # The image links are occasionally unreachable, so retry once.
        try:
            bg_content = _fetch(bg_src)
            slider_content = _fetch(slider_src)
        except requests.RequestException:
            time.sleep(1)
            bg_content = _fetch(bg_src)
            slider_content = _fetch(slider_src)

        # Persist both images; make sure the target directory exists first.
        os.makedirs(config.img_dir, exist_ok=True)
        bg_path = os.path.join(config.img_dir, "bg.png")
        slider_path = os.path.join(config.img_dir, "slider.png")
        with open(bg_path, "wb") as f:
            f.write(bg_content)
        with open(slider_path, "wb") as f:
            f.write(slider_content)
        return bg_content, slider_content, bg_path

    def get_images2(self, bg_element, slider_element):
        """Download the slider-captcha background and puzzle-piece images.

        The background URL is read from the ``src`` attribute of
        ``bg_element``. The puzzle-piece URL is extracted from the ``style``
        attribute of ``slider_element`` (the text captured before
        ``/origin``). Both images are downloaded (with one retry) and written
        to ``bg.png`` / ``slider.png`` under ``config.img_dir``.

        Args:
            bg_element: Element whose ``src`` attribute holds the background
                image URL.
            slider_element: Element whose ``style`` attribute embeds the
                puzzle-piece image URL.

        Returns:
            Tuple ``(bg_path, slider_path)`` of the saved image files.

        Raises:
            RuntimeError: If the background URL never appears or no URL can be
                extracted from the slider's style attribute.
            requests.RequestException: If a download fails on both attempts.
        """
        logger.info("get_images2 下载背景图和滑块图")

        # Wait for the background image URL to be populated (at most 30 polls).
        # Attributes are read with ``attr()``, the accessor used everywhere
        # else in this class, instead of ``get_attribute``.
        bg_src = None
        for _ in range(30):
            bg_src = bg_element.attr("src")
            if bg_src:
                break
            time.sleep(config.page_interval)
        if not bg_src:
            raise RuntimeError("captcha background image url is empty")

        # The puzzle-piece URL is embedded in the inline style.
        slider_style = slider_element.attr("style") or ""
        matched = re.search("(http.*)/origin", slider_style)
        if matched is None:
            raise RuntimeError(
                f"cannot extract slider image url from style: {slider_style!r}"
            )
        slider_src = matched.group(1)
        logger.info(bg_src)
        logger.info(slider_src)

        def _fetch(url):
            # A timeout keeps a stalled connection from hanging the whole job;
            # raise_for_status stops an HTTP error page being saved as an image.
            response = requests.get(url, timeout=30)
            response.raise_for_status()
            return response.content

        # The image links are occasionally unreachable, so retry once.
        try:
            bg_content = _fetch(bg_src)
            slider_content = _fetch(slider_src)
        except requests.RequestException:
            time.sleep(config.page_interval)
            logger.info("图片重试下载中...")
            bg_content = _fetch(bg_src)
            slider_content = _fetch(slider_src)

        # Persist both images; make sure the target directory exists first.
        os.makedirs(config.img_dir, exist_ok=True)
        bg_path = os.path.join(config.img_dir, "bg.png")
        slider_path = os.path.join(config.img_dir, "slider.png")
        with open(bg_path, "wb") as f:
            f.write(bg_content)
        with open(slider_path, "wb") as f:
            f.write(slider_content)
        return bg_path, slider_path

    # 文字识别验证码
    def check_captcha1(self):
        """Try to solve the click-type captcha shown in the login frame.

        Downloads the background and prompt images, sends them base64-encoded
        to ``util.captcha_recognize3`` and parses the result as ``x,y`` pairs
        separated by ``|``. Each coordinate is scaled by the ratio of the
        on-page width to the downloaded image width, then clicked on the
        background element with a one-second pause between clicks.

        Any exception is logged and swallowed; nothing is returned.
        """
        try:
            prompt_img = self.frame.ele('xpath://div[@id="click_quiz"]/img')
            background_img = self.frame.ele('xpath://div[@id="c_click_wrapper_view"]/img')
            bg_bytes, prompt_bytes, saved_bg = self.get_images(background_img, prompt_img)
            bg_b64 = base64.b64encode(bg_bytes).decode('utf-8')
            prompt_b64 = base64.b64encode(prompt_bytes).decode('utf-8')

            # Ratio between the width rendered on the page and the width of the
            # downloaded file; recognised coordinates refer to the latter.
            shown_width = background_img.rect.size[0]
            natural_width = cv2.imread(saved_bg).shape[1]
            scale = shown_width / natural_width
            logger.info(f"缩放比:{scale}")

            recognised = util.captcha_recognize3(bg_b64, prompt_b64)
            chunks = str(recognised).split("|")
            logger.info(chunks)

            # Re-compute every offset with the scale factor.
            pairs = (chunk.split(",") for chunk in chunks)
            points = [
                (int(int(raw_x) * scale), int(int(raw_y) * scale))
                for raw_x, raw_y in pairs
            ]
            logger.info(points)

            # Click each point as an offset on the background element.
            mouse = Actions(self.frame)
            for px, py in points:
                mouse.move_to(background_img, px, py).click()
                time.sleep(1)
        except Exception as exc:
            logger.error(f"文字识别验证码处理报错:{str(exc)}")
            logger.error(traceback.format_exc())

    # 滑块拖动验证码
    def check_captcha2(self):
        """Try to solve the slide-to-fit puzzle captcha in the login frame.

        Downloads the background and puzzle-piece images, asks
        ``util.captcha_recognize`` for the gap distance, converts it with the
        ratio of the on-page width to the downloaded image width, and drags
        the slider handle horizontally by that amount.

        Any exception is logged and swallowed; nothing is returned.
        """
        try:
            puzzle_piece = self.frame.ele('xpath://div[@id="puzzle_slot"]')
            background_img = self.frame.ele('xpath://div/img[@id="puzzle_backimg"]')
            handle = self.frame.ele('xpath://div[@class="dv_handler baseslider"]')

            # Fetch both images to disk for the recogniser.
            saved_bg, saved_piece = self.get_images2(background_img, puzzle_piece)

            # Ratio between the rendered width and the real image width.
            scale = background_img.rect.size[0] / cv2.imread(saved_bg).shape[1]
            logger.info(f"缩放比:{scale}")

            gap = util.captcha_recognize(saved_bg, saved_piece)
            # Fixed correction of 3 subtracted from the scaled distance
            # (the reason is not documented in this file).
            correction = 3
            real_distance = gap * scale - correction
            logger.info(f"真实滑动距离:{real_distance}")

            # Press the handle, drag it horizontally, then let go.
            drag = Actions(self.frame)
            drag.hold(handle)
            drag.move(real_distance, 0, duration=config.page_interval)
            drag.release()
        except Exception as exc:
            logger.error(f"滑块验证码处理报错:{str(exc)}")
            logger.error(traceback.format_exc())

    def solve_captcha(self):
        """Handle the captcha dialog that may appear after submitting login.

        If the dialog is present, the click-type solver is tried first. If the
        dialog is still present afterwards, the slider solver is tried.

        Returns:
            bool: True when no captcha dialog is (or remains) on the page,
            False when the dialog is still present after both solvers ran.
        """
        if not self._captcha_dialog_present():
            logger.info("无图形验证码弹窗,继续后续流程")
            return True
        # First attempt: click-type captcha.
        self.check_captcha1()
        if not self._captcha_dialog_present():
            logger.info("图形验证码弹窗处理完毕,继续后续流程")
            return True
        # Second attempt: slider captcha.
        self.check_captcha2()
        if not self._captcha_dialog_present():
            logger.info("图形验证码弹窗处理完毕,继续后续流程")
            return True
        return False

    def _captcha_dialog_present(self):
        """Return True if the captcha dialog is found after the configured wait.

        Depending on library settings, a failed lookup either raises or
        returns a falsy placeholder object. Both are treated as "not present",
        so the result does not depend on that setting.
        """
        time.sleep(config.login_waiting_time)
        try:
            dialog = self.frame.ele('xpath://div[@class="detail-page captcha"]')
        except Exception:
            return False
        return bool(dialog)

    def handle_file_split(self):
        """Split today's source workbook into files of at most 5000 rows.

        Reads ``网商银行催记拉取<YYYYMMDD>.xlsx`` from ``config.file_dir`` and
        writes the pieces into ``config.tmp_dir/<YYYYMMDD>/`` (created if
        needed, then emptied). The path of every piece is recorded in
        ``self.split_filepaths``.

        Returns:
            bool: True on success (including an empty workbook, which yields
            no pieces); False if the source file is missing or any step
            raises.
        """
        try:
            logger.info("开始催记文件切分")
            self.split_filepaths = []
            # Drop expired temp directories.
            util.remove_dir(config.tmp_dir, days_to_keep=90)
            today = util.get_current_time(fmt="%Y%m%d")
            filename = f"网商银行催记拉取{today}.xlsx"
            file_path = os.path.join(config.file_dir, filename)
            if not os.path.exists(file_path):
                logger.info(f"{file_path} 文件不存在")
                return False
            # Create today's directory; if it already exists, empty it.
            tmp_today_dir = os.path.join(config.tmp_dir, today)
            os.makedirs(tmp_today_dir, exist_ok=True)
            util.clear_dir_by_name(tmp_today_dir)

            df = pd.read_excel(file_path)
            # Row count, not ``df.size``: ``size`` is rows x columns and would
            # make the loop below emit many empty trailing workbooks.
            row_num = len(df)
            logger.info(f"total_rows: {row_num}")

            # One output file per 5000 rows; the last batch takes the rest.
            batch_num = 5000
            starts = range(0, row_num, batch_num)
            for index_num, start_index in enumerate(starts, start=1):
                end_index = min(start_index + batch_num, row_num)
                logger.info(f"开始文件切分:{start_index}~{end_index}")
                tmp_filename = f"网商银行催记拉取{today}-{index_num}.xlsx"
                logger.info(f"{tmp_filename}")
                tmp_filepath = os.path.join(tmp_today_dir, tmp_filename)
                # index=False: do not write the automatic index column.
                df.iloc[start_index:end_index].to_excel(tmp_filepath, index=False)
                self.split_filepaths.append(tmp_filepath)
            return True
        except Exception as e:
            logger.error(f"文件切分失败:{str(e)}")
            logger.error(traceback.format_exc())
            return False

    def handle_login(self):
        """Open the login page, submit the credentials and handle the captcha.

        Creates the browser page, sets a desktop Chrome user agent, loads
        ``config.login_url``, locates the login iframe (up to 10 attempts),
        fills in the account and password from ``config`` and clicks submit.
        The captcha, if any, is then delegated to ``solve_captcha()``.

        Returns:
            bool: True when the login flow completed and the captcha step
            reported success; False when the iframe was not found, the
            captcha step failed, or any exception was raised.
        """
        try:
            logger.info("开始登录")
            self.page = ChromiumPage()
            self.page.set.user_agent(
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36')
            # Open the login page.
            self.page.get(config.login_url)
            time.sleep(config.login_waiting_time)

            # Locate the login iframe, stopping at the first success. A failed
            # lookup may raise or return a falsy placeholder depending on
            # library settings; both cases wait and retry.
            self.frame = None
            for _ in range(10):
                try:
                    frame = self.page.get_frame('xpath://*[@id="inst-lgoin-iframe"]')
                except Exception:
                    frame = None
                if frame:
                    self.frame = frame
                    break
                time.sleep(config.login_waiting_time)
            if self.frame is None:
                logger.error("未找到登录 iframe")
                return False

            # Fill in the credentials (adjust selectors to the actual page).
            self.frame.ele('xpath://input[@name="logonId"]').input(config.username)
            self.frame.ele('xpath://input[@name="password"]').input(config.password)
            # Clicking submit is what triggers the captcha.
            self.frame.ele('xpath://button[@id="submitBtn"]').click()
            if not self.solve_captcha():
                logger.info("图形验证码处理失败")
                return False
            logger.info("登录成功")
            return True
        except Exception as e:
            logger.error(f"登录失败:{str(e)}")
            logger.error(traceback.format_exc())
            return False

    def handle_work(self):
        """Set the page's option checkboxes, then upload every split workbook.

        The two ID-card options and "催收记录" are clicked only when their
        class does not contain ``checked``. "贷中/贷后资料" and "贷前资料" are
        clicked once and, if their class then contains ``checked``, clicked
        again. Each path in ``self.split_filepaths`` is then uploaded through
        the page's upload button and ``util.file_upload``.

        Returns:
            bool: True when every step ran without raising, otherwise False
            (the error is logged).
        """

        def tick_if_unticked(xpath):
            # Click the box only when its class lacks "checked".
            box = self.page.ele(xpath)
            if "checked" not in box.attr("class"):
                box.click()
                time.sleep(config.page_interval)

        def click_then_untick(xpath):
            # Click once; if the box ends up checked, click again to clear it.
            box = self.page.ele(xpath)
            box.click()
            time.sleep(config.page_interval)
            if "checked" in box.attr("class"):
                box.click()
                time.sleep(config.page_interval)

        try:
            logger.info(f"开始任务处理")
            time.sleep(config.page_interval)

            # "ID card exists" and "ID card valid" options.
            tick_if_unticked('xpath://span[input[@id="idcardExists"]]')
            tick_if_unticked('xpath://span[input[@id="idcardValid"]]')
            # In-loan / post-loan material.
            click_then_untick('xpath://label/span[text()="贷中/贷后资料"]/preceding-sibling::span')
            # Collection records.
            tick_if_unticked('xpath://label/span[text()="催收记录"]/preceding-sibling::span')
            # Pre-loan material.
            click_then_untick('xpath://label/span[text()="贷前资料"]/preceding-sibling::span')

            # Upload the split files one batch at a time.
            for split_path in self.split_filepaths:
                upload_button = self.page.ele('xpath://div/span[@class="ant-upload ant-upload-btn"]')
                upload_button.click()
                logger.info(f"开始 {split_path} 文件上传")
                # NOTE(review): util.file_upload presumably drives the native
                # file chooser opened by the click above — confirm in util.
                util.file_upload(split_path)
                time.sleep(config.page_interval)
                # The download button is intentionally not clicked here.
                time.sleep(config.page_interval)
                time.sleep(config.login_waiting_time)
            return True
        except Exception as exc:
            logger.error(f"任务处理失败:{str(exc)}")
            logger.error(traceback.format_exc())
            return False

    def close_browser(self):
        """Close the browser page opened by handle_login(), if there is one.

        Does nothing except log when no page was ever created, so it is safe
        to call from cleanup code regardless of how far the run got.
        """
        if self.page is None:
            logger.info("浏览器未启动,无需关闭")
            return
        logger.info("关闭浏览器窗口")
        self.page.close()

    def run(self):
        """Run the whole job: split the file, log in, process, close browser.

        The job stops early (after logging) when the file split or the login
        fails. Once the login step has started, the browser is closed on
        every exit path so a failed login does not leave it running.
        """
        if not self.handle_file_split():
            logger.info("文件切分失败")
            return
        try:
            if not self.handle_login():
                logger.info("登录失败")
                return
            self.handle_work()
        finally:
            # handle_login() may fail before the page object exists.
            if self.page is not None:
                self.close_browser()


# 使用示例
if __name__ == "__main__":
    # Build the job and run the full flow. To exercise only the file-splitting
    # step, call handle_file_split() on the instance instead of run().
    job = FeeBillClass()
    job.run()
相关推荐
牛奶咖啡132 小时前
企鹅龙+再生龙服务器版实现自动化备份与还原系统实践
运维·自动化·企鹅龙·再生龙服务器版·系统批量自动化备份·系统批量自动化还原
阿贵---2 小时前
定时任务专家:Python Schedule库使用指南
jvm·数据库·python
TsukasaNZ2 小时前
如何为开源Python项目做贡献?
jvm·数据库·python
SBFE2 小时前
使用minimax自动化本地部署openclaw操作历程
自动化
云晓-2 小时前
从零入门智能体:核心概念与发展脉络全解析
python
nananaij2 小时前
【LeetCode-05 好数对的数目 python解法】
python·算法·leetcode
請你喝杯Java2 小时前
Python 后端开发:从虚拟环境、pip、requirements.txt 到项目启动
开发语言·python·pip
YFLICKERH2 小时前
【Python-Web后端开发框架】Flask | Django | FastAPI | Tornado 选型与 使用 | 特性
前端·python·flask
2401_831920742 小时前
Python生成器(Generator)与Yield关键字:惰性求值之美
jvm·数据库·python