Python 流程自动化之 DrissionPage 使用示例

做流程自动化时,使用 selenium 的情况比较多:它的语法虽然比较啰嗦,但相对来说比较成熟。不过由于需要依赖 chromedriver,即使在启动参数里添加如下各种参数,仍然容易被网站的各种反爬虫机制检测到。

python 复制代码
            option.add_experimental_option('excludeSwitches', ['enable-automation'])
            option.add_experimental_option('useAutomationExtension', False)
            option.add_argument('--disable-gpu')
            option.add_argument("--disable-blink-features=AutomationControlled")
            option.add_argument('--no-sandbox')

Selenium 的角色像一个中间传话人,需要依赖 ChromeDriver 驱动才能与浏览器沟通;而 DrissionPage 则绕过了中间人,直接与浏览器对话,因此既不需要下载驱动,运行速度也更快,并且没有 WebDriver 特征,更难被网站的反爬虫机制检测到。

起因

最近做一个自动化操作的项目时,在登录过程中遇到了图形验证码处理的问题。使用 selenium 时虽然已经通过各种识别计算,精确得到了滑动距离且使用 selenium 的 ActionChains 把滑块拉到了指定位置,但是后台校验还是一直提示验证失败。后面甚至尝试了使用 playwright(playwright 也不需要 chromedriver)来进行处理,但是仍然一样提示校验失败。再后面就抱着试一试的心态试了一下 DrissionPage,然后竟奇迹般发现图形验证码处理部分轻松通过。于是乎简单记录一下使用 DrissionPage 的过程及示例。

安装

使用前记得先 pip 安装 DrissionPage:

python 复制代码
pip install DrissionPage

文档

使用语法可参考 https://drissionpage.cn/browser_control/intro

文档也是作者自己写的,基本上已经很全面了,我就不多啰嗦了。

示例

使用示例(示例仅保留了主流程部分供自己后续参考):

python 复制代码
import time
import re
import traceback

import requests
import os
import util
import base64
import cv2
import config
import pandas as pd

from config import logger

from DrissionPage import ChromiumPage
from DrissionPage.common import Actions


class FeeBillClass:

    def __init__(self):
        """Set the job name, copy the credentials from config and reset state."""
        self.name = "fee_bill"
        self.username, self.password = config.username, config.password
        # Browser page and login frame; both stay None until assigned later.
        self.page = None
        self.frame = None
        # Paths of the split Excel files, filled in by handle_file_split.
        self.split_filepaths = []

    def get_images(self, bg_element, slider_element, timeout=30):
        """Download the captcha background and slider images and save them.

        Polls ``bg_element`` for its ``src`` attribute (up to 30 times, one
        second apart), downloads both images with a single retry, and writes
        them to ``bg.png`` and ``slider.png`` under ``config.img_dir``.

        Args:
            bg_element: Element whose ``src`` attribute is the background URL.
            slider_element: Element whose ``src`` attribute is the slider URL.
            timeout: Seconds each HTTP request may take before it is aborted.

        Returns:
            Tuple ``(bg_content, slider_content, bg_path)``: the raw bytes of
            both images and the path the background image was written to.

        Raises:
            ValueError: If either image URL is still empty after polling.
            requests.RequestException: If the download fails twice in a row.
        """
        logger.info("get_images1 下载背景图和滑块图")
        # Wait for the background URL to be populated, at most about 30s.
        bg_src = None
        for _ in range(30):
            bg_src = bg_element.attr("src")
            if bg_src:
                break
            time.sleep(1)
        if not bg_src:
            # One last read after the final sleep, as the original code did.
            bg_src = bg_element.attr("src")
        slider_src = slider_element.attr("src")
        if not bg_src or not slider_src:
            # Fail with a clear message instead of handing None to requests.
            raise ValueError(
                f"captcha image url missing: bg={bg_src!r}, slider={slider_src!r}")

        def download():
            # A timeout keeps a stalled connection from hanging the whole job.
            return (
                requests.get(bg_src, timeout=timeout).content,
                requests.get(slider_src, timeout=timeout).content,
            )

        # The image links are occasionally unreachable, so retry once.
        try:
            bg_content, slider_content = download()
        except requests.RequestException:
            time.sleep(1)
            bg_content, slider_content = download()

        # Persist both images so later steps can read them from disk.
        bg_path = os.path.join(config.img_dir, "bg.png")
        slider_path = os.path.join(config.img_dir, "slider.png")
        with open(bg_path, "wb") as f:
            f.write(bg_content)
        with open(slider_path, "wb") as f:
            f.write(slider_content)
        return bg_content, slider_content, bg_path

    def get_images2(self, bg_element, slider_element, timeout=30):
        """Download the puzzle captcha background and slider images to disk.

        The background URL is the ``src`` attribute of ``bg_element``. The
        slider URL is cut out of the ``style`` attribute of ``slider_element``:
        the text starting at ``http`` and ending before ``/origin``.

        Args:
            bg_element: Element whose ``src`` attribute is the background URL.
            slider_element: Element whose ``style`` attribute embeds the
                slider image URL.
            timeout: Seconds each HTTP request may take before it is aborted.

        Returns:
            Tuple ``(bg_path, slider_path)`` of the files written under
            ``config.img_dir``.

        Raises:
            ValueError: If the background URL is still empty after polling or
                no slider URL can be extracted from the style attribute.
            requests.RequestException: If the download fails twice in a row.
        """
        def read_attr(element, name):
            # Keep the original ``get_attribute`` call when the element offers
            # it; otherwise fall back to ``attr``, which get_images uses on
            # elements obtained the same way (``self.frame.ele``).
            getter = getattr(element, "get_attribute", None)
            if getter is None:
                getter = element.attr
            return getter(name)

        logger.info("get_images2 下载背景图和滑块图")
        # Wait for the background URL to be populated, at most 30 polls.
        bg_src = None
        for _ in range(30):
            bg_src = read_attr(bg_element, "src")
            if bg_src:
                break
            time.sleep(config.page_interval)
        if not bg_src:
            # One last read after the final sleep, as the original code did.
            bg_src = read_attr(bg_element, "src")
        if not bg_src:
            raise ValueError("captcha background image url is empty")

        # The slider image URL is embedded in the element's inline style.
        slider_style = read_attr(slider_element, "style")
        matched = re.search(r"(http.*)/origin", slider_style or "")
        if matched is None:
            raise ValueError(
                f"no slider image url found in style: {slider_style!r}")
        slider_src = matched.group(1)
        logger.info(bg_src)
        logger.info(slider_src)

        def download():
            # A timeout keeps a stalled connection from hanging the whole job.
            return (
                requests.get(bg_src, timeout=timeout).content,
                requests.get(slider_src, timeout=timeout).content,
            )

        # The image links are occasionally unreachable, so retry once.
        try:
            bg_content, slider_content = download()
        except requests.RequestException:
            time.sleep(config.page_interval)
            logger.info("图片重试下载中...")
            bg_content, slider_content = download()

        # Persist both images so later steps can read them from disk.
        bg_path = os.path.join(config.img_dir, "bg.png")
        slider_path = os.path.join(config.img_dir, "slider.png")
        with open(bg_path, "wb") as f:
            f.write(bg_content)
        with open(slider_path, "wb") as f:
            f.write(slider_content)
        return bg_path, slider_path

    # 文字识别验证码
    def check_captcha1(self):
        """Handle the text-recognition captcha by clicking recognised points.

        Downloads the captcha images, sends them base64-encoded to
        ``util.captcha_recognize3``, rescales the returned ``x,y`` pairs from
        image pixels to on-page pixels and clicks each point on the background
        element. Any exception is logged and swallowed; nothing is returned.
        """
        try:
            quiz_img = self.frame.ele('xpath://div[@id="click_quiz"]/img')
            background_img = self.frame.ele('xpath://div[@id="c_click_wrapper_view"]/img')
            bg_bytes, quiz_bytes, bg_file = self.get_images(background_img, quiz_img)
            bg_b64 = base64.b64encode(bg_bytes).decode('utf-8')
            quiz_b64 = base64.b64encode(quiz_bytes).decode('utf-8')
            # Ratio between the width rendered on the page and the real width
            # of the downloaded image file.
            shown_width = background_img.rect.size[0]
            file_width = cv2.imread(bg_file).shape[1]
            scale = shown_width / file_width
            logger.info(f"缩放比:{scale}")
            # The recogniser result is split on "|" into "x,y" items.
            raw_items = str(util.captcha_recognize3(bg_b64, quiz_b64)).split("|")
            logger.info(raw_items)
            # Rescale every coordinate pair by the ratio computed above.
            points = [
                (int(int(raw_x) * scale), int(int(raw_y) * scale))
                for raw_x, raw_y in (item.split(",") for item in raw_items)
            ]
            logger.info(points)
            # Click each point as an offset on the background element,
            # pausing one second after every click.
            actions = Actions(self.frame)
            for offset_x, offset_y in points:
                actions.move_to(background_img, offset_x, offset_y).click()
                time.sleep(1)
        except Exception as e:
            logger.error(f"文字识别验证码处理报错:{str(e)}")
            logger.error(traceback.format_exc())

    # 滑块拖动验证码
    def check_captcha2(self):
        """Handle the slider captcha by dragging the handle.

        Downloads the background and slider images, asks
        ``util.captcha_recognize`` for the distance, rescales it to on-page
        pixels, subtracts a fixed 3px correction and drags the slider handle
        horizontally by that amount. Any exception is logged and swallowed;
        nothing is returned.
        """
        try:
            slot = self.frame.ele('xpath://div[@id="puzzle_slot"]')
            background_img = self.frame.ele('xpath://div/img[@id="puzzle_backimg"]')
            handle = self.frame.ele('xpath://div[@class="dv_handler baseslider"]')
            # Save both images to disk for the recogniser.
            bg_file, slider_file = self.get_images2(background_img, slot)
            # Ratio between the width rendered on the page and the real width
            # of the downloaded image file.
            shown_width = background_img.rect.size[0]
            file_width = cv2.imread(bg_file).shape[1]
            scale = shown_width / file_width
            logger.info(f"缩放比:{scale}")
            distance = util.captcha_recognize(bg_file, slider_file)
            delta = 3
            real_distance = distance * scale - delta
            logger.info(f"真实滑动距离:{real_distance}")
            # Press the handle, move it horizontally, then let go.
            drag = Actions(self.frame)
            drag.hold(handle)
            drag.move(real_distance, 0, duration=config.page_interval)
            drag.release()
        except Exception as e:
            logger.error(f"滑块验证码处理报错:{str(e)}")
            logger.error(traceback.format_exc())

    def solve_captcha(self):
        """Run the captcha solvers until the captcha popup is gone.

        Checks for the popup, runs ``check_captcha1``, checks again, runs
        ``check_captcha2`` and checks a final time.

        Returns:
            True when the popup is absent at any of the three checks, False
            when it is still present after both solvers have run.
        """
        if not self._captcha_popup_present():
            logger.info("无图形验证码弹窗,继续后续流程")
            return True
        # First attempt: text-recognition captcha.
        self.check_captcha1()
        if not self._captcha_popup_present():
            logger.info("图形验证码弹窗处理完毕,继续后续流程")
            return True
        # Second attempt: slider captcha.
        self.check_captcha2()
        if self._captcha_popup_present():
            return False
        logger.info("图形验证码弹窗处理完毕,继续后续流程")
        return True

    def _captcha_popup_present(self):
        """Wait ``config.login_waiting_time`` and report whether the popup exists.

        A failed lookup can surface in two ways: as an exception, or as a
        falsy placeholder object returned by ``ele``. Both are treated as
        "popup not present", so the result does not depend on how the lookup
        is configured to report a missing element.
        """
        time.sleep(config.login_waiting_time)
        try:
            popup = self.frame.ele('xpath://div[@class="detail-page captcha"]')
        except Exception:
            return False
        return bool(popup)

    def handle_file_split(self, batch_num=5000):
        """Split today's source workbook into files of at most ``batch_num`` rows.

        Reads the workbook for today's date from ``config.file_dir``, writes
        consecutive row batches to numbered files in a per-day directory under
        ``config.tmp_dir`` and records their paths in ``self.split_filepaths``.

        Args:
            batch_num: Maximum number of data rows per split file; must be
                positive.

        Returns:
            True when splitting finished; False when today's source file does
            not exist or any step raised (the error is logged).
        """
        try:
            logger.info("开始催记文件切分")
            self.split_filepaths = []
            if batch_num <= 0:
                raise ValueError(f"batch_num must be positive, got {batch_num}")
            # Remove expired temp directories.
            util.remove_dir(config.tmp_dir, days_to_keep=90)
            today = util.get_current_time(fmt="%Y%m%d")
            filename = f"网商银行催记拉取{today}.xlsx"
            file_path = os.path.join(config.file_dir, filename)
            if not os.path.exists(file_path):
                logger.info(f"{file_path} 文件不存在")
                return False
            # Create today's directory; if it already exists, empty it.
            tmp_today_dir = os.path.join(config.tmp_dir, today)
            os.makedirs(tmp_today_dir, exist_ok=True)
            util.clear_dir_by_name(tmp_today_dir)
            df = pd.read_excel(file_path)
            # len(df) is the row count. df.size is rows * columns, which made
            # the loop run past the data and write empty split files.
            row_num = len(df)
            logger.info(f"total_rows: {row_num}")
            batch_starts = range(0, row_num, batch_num)
            for index_num, start_index in enumerate(batch_starts, start=1):
                # The last batch simply takes whatever rows remain.
                end_index = min(start_index + batch_num, row_num)
                logger.info(f"开始文件切分:{start_index}~{end_index}")
                tmp_filename = f"网商银行催记拉取{today}-{index_num}.xlsx"
                logger.info(f"{tmp_filename}")
                tmp_filepath = os.path.join(tmp_today_dir, tmp_filename)
                tmp_df = df.iloc[start_index:end_index]
                # Do not write the automatic index column.
                tmp_df.to_excel(tmp_filepath, index=False)
                self.split_filepaths.append(tmp_filepath)
            return True
        except Exception as e:
            logger.error(f"文件切分失败:{str(e)}")
            logger.error(traceback.format_exc())
            return False

    def handle_login(self):
        """Open the login page, submit the credentials and clear the captcha.

        Returns:
            True when the form was submitted and ``solve_captcha`` succeeded;
            False when the login iframe was not found after 10 attempts, the
            captcha could not be cleared, or any step raised (the error is
            logged).
        """
        try:
            logger.info("开始登录")
            self.page = ChromiumPage()
            self.page.set.user_agent(
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36')
            # Open the login page.
            self.page.get(config.login_url)
            time.sleep(config.login_waiting_time)
            # Locate the login iframe, trying up to 10 times. Stop at the
            # first hit instead of repeating the lookup on every iteration.
            self.frame = None
            for _ in range(10):
                try:
                    frame = self.page.get_frame('xpath://*[@id="inst-lgoin-iframe"]')
                except Exception:
                    frame = None
                # A failed lookup may also come back as a falsy placeholder
                # rather than an exception, so test the result itself.
                if frame:
                    self.frame = frame
                    break
                time.sleep(config.login_waiting_time)
            if self.frame is None:
                logger.error("login iframe not found after 10 attempts")
                return False
            # Fill in the credentials held on the instance (copied from config
            # in __init__) and submit; submitting triggers the captcha.
            self.frame.ele('xpath://input[@name="logonId"]').input(self.username)
            self.frame.ele('xpath://input[@name="password"]').input(self.password)
            self.frame.ele('xpath://button[@id="submitBtn"]').click()
            if not self.solve_captcha():
                logger.info("图形验证码处理失败")
                return False
            logger.info("登录成功")
            return True
        except Exception as e:
            logger.error(f"登录失败:{str(e)}")
            logger.error(traceback.format_exc())
            return False

    def handle_work(self):
        """Set the page's option checkboxes and upload every split file.

        Returns:
            True when all steps ran; False when any step raised (the error is
            logged).
        """
        try:
            pause = config.page_interval

            def click_if_not_checked(locator):
                # Click the box only when its class lacks "checked".
                box = self.page.ele(locator)
                if "checked" not in box.attr("class"):
                    box.click()
                    time.sleep(pause)

            def click_then_undo_if_checked(locator):
                # Click once; if the class then contains "checked", click
                # again. Presumably this leaves the box unchecked either way.
                box = self.page.ele(locator)
                box.click()
                time.sleep(pause)
                if "checked" in box.attr("class"):
                    box.click()
                    time.sleep(pause)

            logger.info(f"开始任务处理")
            time.sleep(pause)
            # "ID card exists" and "ID card valid" options.
            click_if_not_checked('xpath://span[input[@id="idcardExists"]]')
            click_if_not_checked('xpath://span[input[@id="idcardValid"]]')
            # In-loan / post-loan materials.
            click_then_undo_if_checked(
                'xpath://label/span[text()="贷中/贷后资料"]/preceding-sibling::span')
            # Collection records.
            click_if_not_checked(
                'xpath://label/span[text()="催收记录"]/preceding-sibling::span')
            # Pre-loan materials.
            click_then_undo_if_checked(
                'xpath://label/span[text()="贷前资料"]/preceding-sibling::span')
            # Upload the split files one at a time.
            for filepath in self.split_filepaths:
                self.page.ele('xpath://div/span[@class="ant-upload ant-upload-btn"]').click()
                logger.info(f"开始 {filepath} 文件上传")
                util.file_upload(filepath)
                time.sleep(pause)
                # The download click is disabled while testing:
                # self.page.ele('xpath://div/span/button/span[text()="下 载"]').click()
                time.sleep(pause)
                time.sleep(config.login_waiting_time)
            return True
        except Exception as e:
            logger.error(f"任务处理失败:{str(e)}")
            logger.error(traceback.format_exc())
            return False

    def close_browser(self):
        """Close the browser page, doing nothing if none was ever opened."""
        if self.page is None:
            # Without this guard the call fails with AttributeError on None.
            logger.info("browser was never opened, nothing to close")
            return
        logger.info("关闭浏览器窗口")
        self.page.close()

    def run(self):
        """Run the whole job: split the file, log in, upload, close the browser.

        Stops early (returning None) when file splitting or login fails. Once
        a browser page has been opened it is closed on every exit path,
        including a failed login or an exception during the upload step.
        """
        if not self.handle_file_split():
            logger.info("文件切分失败")
            return
        try:
            if not self.handle_login():
                logger.info("登录失败")
                return
            self.handle_work()
        finally:
            # handle_login may fail before a page was assigned; only close a
            # page that actually exists.
            if self.page is not None:
                self.close_browser()


# 使用示例
if __name__ == "__main__":
    # Run the complete job once when executed as a script.
    job = FeeBillClass()
    job.run()
    # To run only the split step instead: job.handle_file_split()
相关推荐
曾阿伦5 小时前
Python 获取本机所有网卡 IP/MAC 地址
python·tcp/ip
qq_283720055 小时前
Python 操作 MySQL 数据库全解:增删改查、事务、连接池与性能优化
数据库·python·mysql
Leinwin5 小时前
实战教程:3步接入Azure OpenAI调用GPT-5,国内IP直连
后端·python·flask
爱码小白5 小时前
MySQL 系统函数专项练习题
数据库·python·mysql
傻啦嘿哟5 小时前
Python 实现 Excel 数据可视化:柱状图制作教程
开发语言·python
ZC跨境爬虫5 小时前
海南大学交友平台登录页开发实战day3(解决python传输并读取登录信息的问题)
前端·数据库·python·html
2601_954434555 小时前
2026年电钢琴品牌专业深度测评:排名前五权威榜单发布
大数据·人工智能·python
威联通网络存储5 小时前
非结构化数据治理:底层全文检索与自动化归档解析
运维·python·自动化·全文检索
满满和米兜5 小时前
【Java基础】- 集合 - ArrayList与LinkedList
java·python·算法
沪漂阿龙5 小时前
PyTorch 张量与自动微分完全指南:从核心概念到实战训练
人工智能·pytorch·python