Python 流程自动化之 DrissionPage 使用示例

做流程自动化时,使用 selenium 的情况比较多:它的语法虽然比较啰嗦,但相对成熟。不过 selenium 需要借助 chromedriver 来驱动浏览器,即使在启动参数里添加如下各种参数,仍然容易被网站的各种反爬虫机制检测到。

python 复制代码
            # Launch options commonly added to a selenium-driven Chrome in an attempt to hide automation.
            # NOTE(review): `option` is presumably a selenium ChromeOptions instance; its creation is not shown here.
            # Exclude the 'enable-automation' switch from the browser launch switches.
            option.add_experimental_option('excludeSwitches', ['enable-automation'])
            # Turn off the 'useAutomationExtension' experimental option.
            option.add_experimental_option('useAutomationExtension', False)
            # Extra command-line flags passed to the browser.
            option.add_argument('--disable-gpu')
            option.add_argument("--disable-blink-features=AutomationControlled")
            option.add_argument('--no-sandbox')

Selenium 的角色像一个中间传话人,需要依赖 ChromeDriver 驱动才能与浏览器沟通;而 DrissionPage 则绕过了中间人,直接与浏览器对话,因此既不需要下载驱动,运行速度也更快,并且没有 WebDriver 特征,更难被网站的反爬虫机制检测到。

起因

最近做一个自动化操作的项目时,在登录过程中遇到了图形验证码处理的问题。使用 selenium 时虽然已经通过各种识别计算,精确得到了滑动距离且使用 selenium 的 ActionChains 把滑块拉到了指定位置,但是后台校验还是一直提示验证失败。后面甚至尝试了使用 playwright(playwright 也不需要 chromedriver)来进行处理,但是仍然一样提示校验失败。再后面就抱着试一试的心态试了一下 DrissionPage,然后竟奇迹般发现图形验证码处理部分轻松通过。于是乎简单记录一下使用 DrissionPage 的过程及示例。

安装

使用前记得先 pip 安装 DrissionPage:

python 复制代码
pip install DrissionPage

文档

使用语法可参考 https://drissionpage.cn/browser_control/intro

文档也是作者自己写的,基本上已经很全面了,我就不多啰嗦了。

示例

使用示例(示例仅保留了主流程部分供自己后续参考):

python 复制代码
import time
import re
import traceback

import requests
import os
import util
import base64
import cv2
import config
import pandas as pd

from config import logger

from DrissionPage import ChromiumPage
from DrissionPage.common import Actions


class FeeBillClass:

    def __init__(self) -> None:
        """Initialise the task name, the credentials and the empty runtime state."""
        self.name = "fee_bill"
        # Credentials are copied from the project config module at construction time.
        self.username, self.password = config.username, config.password
        # Browser page and login iframe handles; both are assigned in handle_login().
        self.page, self.frame = None, None
        # Paths of the split files; populated by handle_file_split().
        self.split_filepaths = list()

    def get_images(self, bg_element, slider_element):
        """Download the captcha background and slider images and save them to disk.

        Both image URLs are read from the elements' ``src`` attribute. The files
        are written as ``bg.png`` and ``slider.png`` under ``config.img_dir``.

        Args:
            bg_element: Element whose ``src`` attribute is the background image URL.
            slider_element: Element whose ``src`` attribute is the slider image URL.

        Returns:
            tuple: ``(bg_content, slider_content, bg_path)`` - the raw bytes of both
            images plus the path the background image was saved to.

        Raises:
            ValueError: The background ``src`` is still empty after about 30 seconds.
            requests.RequestException: The download failed on the retry as well.
        """
        logger.info("get_images1 下载背景图和滑块图")
        # Poll until the background image url is populated, for at most ~30s.
        bg_src = None
        for _ in range(30):
            bg_src = bg_element.attr("src")
            if bg_src:
                break
            time.sleep(1)
        if not bg_src:
            # Fail with a clear message instead of handing an empty url to requests.
            raise ValueError("背景图 url 加载超时")
        slider_src = slider_element.attr("src")

        def download(url):
            # A timeout keeps a stalled connection from blocking the whole flow;
            # raise_for_status() prevents an HTTP error page being saved as an image.
            response = requests.get(url, timeout=30)
            response.raise_for_status()
            return response.content

        # The image links are occasionally unreachable, so retry once.
        try:
            bg_content = download(bg_src)
            slider_content = download(slider_src)
        except requests.RequestException:
            time.sleep(1)
            bg_content = download(bg_src)
            slider_content = download(slider_src)

        # Persist both images; later steps read the background back from disk.
        os.makedirs(config.img_dir, exist_ok=True)
        bg_path = os.path.join(config.img_dir, "bg.png")
        slider_path = os.path.join(config.img_dir, "slider.png")
        with open(bg_path, "wb") as f:
            f.write(bg_content)
        with open(slider_path, "wb") as f:
            f.write(slider_content)
        return bg_content, slider_content, bg_path

    def get_images2(self, bg_element, slider_element):
        """Download the slider-captcha background and puzzle-piece images.

        The background URL comes from the ``src`` attribute of ``bg_element``; the
        puzzle-piece URL is extracted from the ``style`` attribute of
        ``slider_element`` (the part starting at ``http`` and ending before
        ``/origin``). The files are written as ``bg.png`` and ``slider.png``
        under ``config.img_dir``.

        Args:
            bg_element: Element whose ``src`` attribute is the background image URL.
            slider_element: Element whose ``style`` attribute embeds the piece URL.

        Returns:
            tuple: ``(bg_path, slider_path)`` - paths of the two saved images.

        Raises:
            ValueError: A URL is missing or cannot be extracted from ``style``.
            requests.RequestException: The download failed on the retry as well.
        """
        logger.info("get_images2 下载背景图和滑块图")
        # Elements are read with attr(), the same accessor the rest of this class
        # uses; the previous get_attribute() call was inconsistent with that.
        bg_src = None
        for _ in range(30):
            bg_src = bg_element.attr("src")
            if bg_src:
                break
            time.sleep(config.page_interval)
        if not bg_src:
            raise ValueError("背景图 url 加载超时")

        slider_style = slider_element.attr("style") or ""
        matched = re.search(r"(http.*)/origin", slider_style)
        if matched is None:
            # Report the offending style text instead of failing on None.group().
            raise ValueError(f"无法从 style 中解析滑块图 url:{slider_style}")
        slider_src = matched.group(1)
        logger.info(bg_src)
        logger.info(slider_src)

        def download(url):
            # A timeout keeps a stalled connection from blocking the whole flow;
            # raise_for_status() prevents an HTTP error page being saved as an image.
            response = requests.get(url, timeout=30)
            response.raise_for_status()
            return response.content

        # The image links are occasionally unreachable, so retry once.
        try:
            bg_content = download(bg_src)
            slider_content = download(slider_src)
        except requests.RequestException:
            time.sleep(config.page_interval)
            logger.info("图片重试下载中...")
            bg_content = download(bg_src)
            slider_content = download(slider_src)

        # Persist both images for the recognition step.
        os.makedirs(config.img_dir, exist_ok=True)
        bg_path = os.path.join(config.img_dir, "bg.png")
        slider_path = os.path.join(config.img_dir, "slider.png")
        with open(bg_path, "wb") as f:
            f.write(bg_content)
        with open(slider_path, "wb") as f:
            f.write(slider_content)
        return bg_path, slider_path

    # 文字识别验证码
    def check_captcha1(self):
        """Solve the click-the-characters captcha inside the login iframe.

        Downloads the captcha images, sends them base64-encoded to
        ``util.captcha_recognize3`` and clicks each returned point on the
        background image. The recogniser's answer is parsed as ``"x,y|x,y|..."``
        and every coordinate is multiplied by the ratio of the on-page width to
        the downloaded image width. Any exception is logged and swallowed.
        """
        try:
            quiz_img = self.frame.ele('xpath://div[@id="click_quiz"]/img')
            bg_element = self.frame.ele('xpath://div[@id="c_click_wrapper_view"]/img')
            bg_bytes, quiz_bytes, bg_file = self.get_images(bg_element, quiz_img)
            bg_b64 = base64.b64encode(bg_bytes).decode('utf-8')
            quiz_b64 = base64.b64encode(quiz_bytes).decode('utf-8')
            # Ratio between the width rendered on the page and the real image width.
            shown_width = bg_element.rect.size[0]
            scale = shown_width / cv2.imread(bg_file).shape[1]
            logger.info(f"缩放比:{scale}")
            raw_points = str(util.captcha_recognize3(bg_b64, quiz_b64)).split("|")
            logger.info(raw_points)
            # Rescale every recognised coordinate to page pixels.
            points = [
                (int(int(raw_x) * scale), int(int(raw_y) * scale))
                for raw_x, raw_y in (entry.split(",") for entry in raw_points)
            ]
            logger.info(points)
            mouse = Actions(self.frame)
            for offset_x, offset_y in points:
                # Offsets are passed to move_to() together with the background element.
                mouse.move_to(bg_element, offset_x, offset_y).click()
                time.sleep(1)
        except Exception as e:
            logger.error(f"文字识别验证码处理报错:{str(e)}")
            logger.error(traceback.format_exc())

    # 滑块拖动验证码
    def check_captcha2(self):
        """Solve the drag-the-slider captcha inside the login iframe.

        Downloads the background and puzzle-piece images, asks
        ``util.captcha_recognize`` for the gap distance, rescales it to the
        on-page width, subtracts a fixed 3 px correction and drags the slider
        handle by that amount. Any exception is logged and swallowed.
        """
        try:
            piece = self.frame.ele('xpath://div[@id="puzzle_slot"]')
            bg_element = self.frame.ele('xpath://div/img[@id="puzzle_backimg"]')
            handle = self.frame.ele('xpath://div[@class="dv_handler baseslider"]')
            # Fetch both images and store them on disk.
            bg_file, piece_file = self.get_images2(bg_element, piece)
            # Ratio between the width rendered on the page and the real image width.
            shown_width = bg_element.rect.size[0]
            scale = shown_width / cv2.imread(bg_file).shape[1]
            logger.info(f"缩放比:{scale}")
            gap = util.captcha_recognize(bg_file, piece_file)
            correction = 3
            real_distance = gap * scale - correction
            logger.info(f"真实滑动距离:{real_distance}")
            mouse = Actions(self.frame)
            # Press the handle, drag it horizontally, then let go.
            mouse.hold(handle)
            mouse.move(real_distance, 0, duration=config.page_interval)
            mouse.release()
        except Exception as e:
            logger.error(f"滑块验证码处理报错:{str(e)}")
            logger.error(traceback.format_exc())

    def _captcha_popup_present(self):
        """Wait ``config.login_waiting_time`` seconds, then report whether the captcha popup exists."""
        time.sleep(config.login_waiting_time)
        try:
            popup = self.frame.ele('xpath://div[@class="detail-page captcha"]')
        except Exception:
            # Lookup raised (e.g. the library is configured to raise on a miss).
            return False
        # By default DrissionPage returns a falsy NoneElement instead of raising,
        # so the result has to be checked for truthiness as well.
        return bool(popup)

    def solve_captcha(self):
        """Handle the captcha popup that may appear after submitting the login form.

        The click-type solver is tried first. If the popup is still shown
        afterwards, the slider-type solver is tried.

        Returns:
            bool: True when no popup is shown (never appeared, or gone after a
            solver ran); False when it is still present after both solvers.
        """
        if not self._captcha_popup_present():
            logger.info("无图形验证码弹窗,继续后续流程")
            return True
        # First attempt: click-type captcha.
        self.check_captcha1()
        if not self._captcha_popup_present():
            logger.info("图形验证码弹窗处理完毕,继续后续流程")
            return True
        # Second attempt: slider-type captcha.
        self.check_captcha2()
        if not self._captcha_popup_present():
            logger.info("图形验证码弹窗处理完毕,继续后续流程")
            return True
        return False

    def handle_file_split(self):
        """Split today's Excel file into chunks of at most 5000 rows each.

        The input is ``网商银行催记拉取<YYYYMMDD>.xlsx`` under ``config.file_dir``.
        Chunks are written to ``config.tmp_dir/<YYYYMMDD>/`` and their paths are
        stored in ``self.split_filepaths`` in order.

        Returns:
            bool: True when splitting finished; False when the input file is
            missing or any step raised (the error is logged).
        """
        try:
            logger.info("开始催记文件切分")
            self.split_filepaths = []
            # NOTE(review): presumably prunes tmp sub-directories older than 90 days;
            # the helper lives in util and is not visible here.
            util.remove_dir(config.tmp_dir, days_to_keep=90)
            today = util.get_current_time(fmt="%Y%m%d")
            filename = f"网商银行催记拉取{today}.xlsx"
            file_path = os.path.join(config.file_dir, filename)
            if not os.path.exists(file_path):
                logger.info(f"{file_path} 文件不存在")
                return False
            # Create today's working directory, then clear whatever it already holds.
            tmp_today_dir = os.path.join(config.tmp_dir, today)
            os.makedirs(tmp_today_dir, exist_ok=True)
            util.clear_dir_by_name(tmp_today_dir)
            df = pd.read_excel(file_path)
            # len(df) is the row count. df.size is rows * columns, which made the
            # loop below run past the data and write extra, empty chunk files.
            row_num = len(df)
            logger.info(f"total_rows: {row_num}")
            batch_num = 5000
            for index_num, start_index in enumerate(range(0, row_num, batch_num), start=1):
                # The last chunk simply takes whatever rows remain.
                end_index = min(start_index + batch_num, row_num)
                logger.info(f"开始文件切分:{start_index}~{end_index}")
                tmp_filename = f"网商银行催记拉取{today}-{index_num}.xlsx"
                logger.info(f"{tmp_filename}")
                tmp_filepath = os.path.join(tmp_today_dir, tmp_filename)
                tmp_df = df.iloc[start_index:end_index]
                # index=False: do not write the automatic index column.
                tmp_df.to_excel(tmp_filepath, index=False)
                self.split_filepaths.append(tmp_filepath)
            return True
        except Exception as e:
            logger.error(f"文件切分失败:{str(e)}")
            logger.error(traceback.format_exc())
            return False

    def handle_login(self):
        """Open the login page, submit the credentials and handle the captcha.

        Stores the browser page in ``self.page`` and the login iframe in
        ``self.frame``.

        Returns:
            bool: True when the captcha step reports success; False when the login
            iframe cannot be located, the captcha is not solved, or any step
            raised (the error is logged).
        """
        try:
            logger.info("开始登录")
            self.page = ChromiumPage()
            self.page.set.user_agent(
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36')
            # Open the login page.
            self.page.get(config.login_url)
            time.sleep(config.login_waiting_time)
            # Locate the login iframe, retrying up to 10 times and stopping as soon
            # as it is found (the loop used to keep looking after a success).
            # NOTE(review): "lgoin" looks like a typo but is kept as-is because it
            # has to match the id used by the target page; confirm against the site.
            self.frame = None
            for _ in range(10):
                try:
                    frame = self.page.get_frame('xpath://*[@id="inst-lgoin-iframe"]')
                except Exception:
                    frame = None
                if frame:
                    self.frame = frame
                    break
                time.sleep(config.login_waiting_time)
            if self.frame is None:
                logger.error("未找到登录 iframe")
                return False
            # Fill in the credentials captured in __init__ (adjust the selectors
            # to the real page if needed).
            self.frame.ele('xpath://input[@name="logonId"]').input(self.username)
            self.frame.ele('xpath://input[@name="password"]').input(self.password)
            # Submitting the form is what triggers the captcha.
            self.frame.ele('xpath://button[@id="submitBtn"]').click()
            if not self.solve_captcha():
                logger.info("图形验证码处理失败")
                return False
            logger.info("登录成功")
            return True
        except Exception as e:
            logger.error(f"登录失败:{str(e)}")
            logger.error(traceback.format_exc())
            return False

    def handle_work(self):
        """Set the option checkboxes on the page, then upload every split file.

        Returns:
            bool: True when all steps completed; False when any step raised
            (the error is logged).
        """

        def pause():
            # Standard wait between page interactions.
            time.sleep(config.page_interval)

        def click_if_not_checked(locator):
            # Click the element only when its class attribute lacks "checked".
            box = self.page.ele(locator)
            if "checked" not in box.attr("class"):
                box.click()
                pause()

        def click_then_clear(locator):
            # Click once unconditionally; click again if the class attribute
            # contains "checked" afterwards.
            box = self.page.ele(locator)
            box.click()
            pause()
            if "checked" in box.attr("class"):
                box.click()
                pause()

        try:
            logger.info(f"开始任务处理")
            pause()
            # "ID card exists" and "ID card valid" options.
            click_if_not_checked('xpath://span[input[@id="idcardExists"]]')
            click_if_not_checked('xpath://span[input[@id="idcardValid"]]')
            # In-loan / post-loan material.
            click_then_clear('xpath://label/span[text()="贷中/贷后资料"]/preceding-sibling::span')
            # Collection records.
            click_if_not_checked('xpath://label/span[text()="催收记录"]/preceding-sibling::span')
            # Pre-loan material.
            click_then_clear('xpath://label/span[text()="贷前资料"]/preceding-sibling::span')
            # Upload the split files one by one.
            for filepath in self.split_filepaths:
                self.page.ele('xpath://div/span[@class="ant-upload ant-upload-btn"]').click()
                logger.info(f"开始 {filepath} 文件上传")
                util.file_upload(filepath)
                pause()
                # The click on the download button is left disabled, as in the
                # original flow; only the waits around it remain.
                pause()
                time.sleep(config.login_waiting_time)
            return True
        except Exception as e:
            logger.error(f"任务处理失败:{str(e)}")
            logger.error(traceback.format_exc())
            return False

    def close_browser(self):
        logger.info("关闭浏览器窗口")
        self.page.close()

    def run(self):
        """Run the whole pipeline: split the file, log in, process, close the browser.

        The pipeline stops early when splitting or login fails. The browser page
        is closed whenever one was opened, including when login fails or
        handle_work() raises, so no browser is left running.
        """
        try:
            if not self.handle_file_split():
                logger.info("文件切分失败")
                return
            if not self.handle_login():
                logger.info("登录失败")
                return
            self.handle_work()
        finally:
            # self.page stays None until handle_login() has created the browser page.
            if self.page is not None:
                self.close_browser()


# Usage example: build the task object and run the full pipeline.
if __name__ == "__main__":
    # To debug only the file-splitting step, call handle_file_split() on the
    # instance instead of run().
    FeeBillClass().run()
相关推荐
xxie12379412 小时前
return与print
开发语言·python
秋912 小时前
从 Python 后端工程师转型 AI Engineer(AI 工程化)的完整补课清单(2026实战版)
开发语言·人工智能·python
慕木沐13 小时前
Google ADK Java 1.0版本 核心机制与实战 Demo
java·开发语言·python
Tbisnic13 小时前
AI大模型学习第十一天:技术选型、安全防护与金融实战
python·学习·ai·大模型·提示词工程
hboot14 小时前
AI工程师第一课 - Python
前端·后端·python
许彰午15 小时前
30_Java Stream流操作全解
java·windows·python
云登指纹浏览器15 小时前
WebDriver反检测技术详解:如何让自动化脚本看起来像真实浏览器
运维·自动化·跨境电商
秋915 小时前
3年经验Python后端转AI Engineer:3个月实战转型计划(2026版)
开发语言·人工智能·python
2601_9563198816 小时前
期货夜盘无人值守监控什么:断线、无成交与拒单信号
python·区块链
CTA终结者16 小时前
期货量化目标仓和净持仓对不齐:天勤 TargetPosTask 与 pos 偏差排查
python·区块链