Python 自动化下载夸克网盘分享文件:基于 Playwright 的完整实现(含登录态持久化与提取码处理)

简介

通过 python 脚本下载已知提取码的夸克网盘分享的文件

原理介绍

使用 playwright 模拟浏览器输入提取码、点击下载按钮进行下载。文件的保存位置可以通过拦截下载事件另存至指定位置。 备注:由于夸克网盘下载时需要先登陆,所以使用 playwright 打开夸克网盘进行登陆,扫码登陆后将登陆状态存储在指定文件内,以备后续脚本下载时使用。

代码介绍

1.下载脚本是写在 class 中的,用来共享变量,文后会给出完整代码,想看完整代码的朋友移步文章末尾。 2.代码中"设置监听所有请求和响应"、"模拟人类行为"的功能可删除或自行编写 备注:playwright 不可重复创建浏览器实例,会报异常。

创建浏览器

创建 playwright 实例 --> 创建浏览器实例 --> 创建上下文(用来加载登陆状态) --> 创建页面

python 复制代码
def get_browser(self, headless: bool = None):
    """
    Start Playwright, launch Chromium and create the browser context.

    Sets ``self.playwright``, ``self.browser`` and ``self.browser_context``.
    When ``self.storage_state_path`` names an existing file, its JSON content
    is handed to the new context as ``storage_state``.

    :param headless: overrides ``self.headless`` when it is not ``None``
    :return: None
    """
    # Create the playwright instance.
    print("创建 playwright 实例...")
    self.playwright = sync_playwright().start()
    # Build a fresh argument list for this launch. ``chromium_args`` is a
    # class-level list, so appending to it in place would add one more
    # ``--window-size`` flag for every instance that is created.
    launch_args = [*self.chromium_args, f"--window-size={self.width},{self.height}"]
    headless = self.headless if headless is None else headless
    # Launch the browser.
    print("启动浏览器...")
    self.browser = self.playwright.chromium.launch(
        headless=headless,
        args=launch_args,
    )
    # Load the persisted login state, if there is one.
    storage_state = None
    if self.storage_state_path and os.path.exists(self.storage_state_path):
        print("从文件加载存储状态...")
        # The state file is written as UTF-8 with ensure_ascii=False, so it
        # must be read back as UTF-8 regardless of the platform default.
        with open(self.storage_state_path, "r", encoding="utf-8") as f:
            storage_state = json.load(f)
    # Create the browser context. Further context options (user_agent,
    # viewport, locale, timezone_id) can be passed here when needed.
    print("创建浏览器上下文...")
    self.browser_context = self.browser.new_context(
        storage_state=storage_state
    )

def get_page(self, headers: dict[str, str] = None) -> "Page":
    """
    Open a new page in the current browser context.

    :param headers: optional extra HTTP headers; applied only when non-empty
    :return: the newly created page, with request/response listeners attached
    """
    print("创建页面...")
    fresh_page = self.browser_context.new_page()

    # Extra headers are optional; an empty or missing mapping is skipped.
    extra_headers = headers or None
    if extra_headers is not None:
        print("设置额外的HTTP头...")
        fresh_page.set_extra_http_headers(extra_headers)

    # Attach the request/response listeners before handing the page out.
    print("监听所有请求和响应...")
    self.set_request_response_log(fresh_page)
    return fresh_page

登陆夸克并保存状态

创建状态存储文件的目录 --> 打开夸克网盘登陆页 --> 等待扫码登陆(等待行为是通过账户名称来实现的,请填入正确的账户名称),登陆后会自动保存浏览器状态到文件内。 备注:登陆时需要 headless=False(显示浏览器页面)

python 复制代码
def login_quark_and_save_state(self):
    """
    Log in to Quark interactively and persist the browser state to disk.

    Opens ``self.quark_url`` on ``self.page``, waits until a ``span`` showing
    ``self.username`` appears, then writes the context's storage state as
    JSON to ``self.storage_state_path``. A visible (non-headless) browser is
    needed so the user can complete the login.
    """
    state_file = self.storage_state_path

    # Create the parent directory of the state file when it is missing.
    parent_dir = os.path.dirname(state_file)
    if parent_dir and not os.path.exists(parent_dir):
        print(f"创建目录: {parent_dir}")
        os.makedirs(parent_dir)

    # Open the site and block until the account name becomes visible.
    login_page = self.page
    print("开始登录...")
    login_page.goto(self.quark_url)
    print("等待登录完成...")
    account_selector = f"span:text('{self.username}')"
    login_page.wait_for_selector(account_selector)

    # Dump cookies / local storage of the context to the state file.
    print("保存浏览器状态...")
    snapshot = self.browser_context.storage_state()
    with open(state_file, "w", encoding="utf-8") as handle:
        json.dump(snapshot, handle, indent=2, ensure_ascii=False)
    login_page.wait_for_timeout(20000)

下载夸克分享链接

创建文件存储目录 --> 访问分享链接 --> 等待页面加载完成 --> 输入提取码 --> 监听下载事件 --> 点击下载按钮 --> 文件另存为 --> 等待文件下载结束

python 复制代码
def quark_download(self, share_url: str, extract_code: str, download_path: str, file_name: str,
                   page: "Page" = None):
    """
    Download the file behind a Quark share link into ``download_path``.

    Flow: open the share page, type the extraction code, click the first
    ``.share-download`` button, capture the resulting download and save it
    as ``<file_name>.<extension of the suggested name>``. When the suggested
    name has no extension, the suggested name itself is used.

    :param share_url: share link to open
    :param extract_code: extraction code typed into the share page
    :param download_path: directory the file is saved into (created if missing)
    :param file_name: base name (without extension) for the saved file
    :param page: page to drive; defaults to ``self.page``
    :return: path of the saved file, or ``None`` when any step raised
    """
    # Make sure the target directory exists.
    target_dir = Path(download_path)
    target_dir.mkdir(parents=True, exist_ok=True)
    page = page or self.page
    try:
        # Open the share page.
        print(f"访问目标页面:{share_url}")
        response = page.goto(share_url, timeout=300000)
        # goto() is documented to return None for some navigations, so the
        # status is read defensively instead of assuming a response object.
        print("目标页面 status:", response.status if response is not None else None)
        # Wait until the page has fully loaded.
        print("等待页面完全加载...")
        page.wait_for_load_state("networkidle")
        # Scroll around a little before interacting.
        print("模拟人类行为...")
        self.simulation_operation(page)
        # Type the extraction code (XPath selector on the placeholder text).
        print("输入提取码...")
        input_element = page.locator("//input[@placeholder='请输入提取码,不区分大小写']")
        input_element.fill(extract_code)
        print("提取码已输入:" + extract_code)
        page.wait_for_load_state("networkidle")
        # Only the click that triggers the download belongs inside the
        # expect_download block; the event value is read after it exits.
        print("正在等待下载...")
        with page.expect_download(timeout=30000) as download_info:
            page.locator('.share-download').nth(0).click()
            print("点击下载按钮...")
        download = download_info.value
        suggested = download.suggested_filename
        print(f"开始下载: {suggested}")
        # Keep the extension of the suggested name. No nested quotes inside
        # the f-string, so this also parses on Python versions before 3.12.
        if '.' in suggested:
            extension = suggested.rsplit('.', 1)[-1]
            file_name = f"{file_name}.{extension}"
        else:
            file_name = suggested
        full_save_path = target_dir / file_name
        if full_save_path.exists():
            print(f"文件已存在, 删除: {full_save_path}")
            os.remove(full_save_path)
        # Copy the download to its final location.
        download.save_as(full_save_path)
        # Defensive check: poll once per second until the file is visible on
        # disk, giving up once the counter exceeds self.download_timeout.
        print("等待文件保存完成...")
        count = 0
        while not os.path.exists(full_save_path):
            page.wait_for_timeout(1000)
            print("等待文件保存完成...")
            count = count + 1
            if count > self.download_timeout:
                print(f"下载超时, url:{share_url}, code:{extract_code}, full_save_path:{full_save_path}")
                break
        print(f"文件已下载到: {full_save_path}")
        print("下载结束")
        page.wait_for_load_state("networkidle")
        return full_save_path
    except Exception as e:
        # Best-effort API: report the failure and return None to the caller.
        print(f"访问页面时发生错误: {e}, 错误信息:{e.__dict__}")
        traceback.print_exc()
        return None
    finally:
        print(f'下载结束:{share_url}, code:{extract_code}')

夸克网盘登陆调用方法

python 复制代码
if __name__ == '__main__':
    # Quark account display name. The login step waits for this text to show
    # up on the page, so a wrong name ends in a wait timeout.
    username = ''
    # Full or relative path of the file the browser state is saved to.
    storage_state_path = "xxxxxx/login_state.json"

    # Log in to Quark and persist the state.
    print("================================= 登陆夸克网盘 =================================")
    # headless=False: the login needs a visible browser window.
    quark_download_util = QuarkDownloadUtil(username=username, storage_state_path=storage_state_path, headless=False)
    try:
        quark_download_util.login_quark_and_save_state()
    finally:
        # Always release the browser, even when the login wait fails.
        quark_download_util.close()
    print("================================= 登陆完成 =================================")

夸克网盘分享文件下载调用方法

python 复制代码
if __name__ == '__main__':
    # Quark account display name (used by the login step).
    username = ''
    # Browser state file; must be the same file the login step wrote.
    storage_state_path = "xxxxxx/login_state.json"
    # Directory the downloaded file is stored in (full or relative path).
    download_path = 'xxxxxx/Downloads'

    # Download a Quark share link.
    print("================================= 开始下载 =================================")
    share_url = "https://pan.quark.cn/s/3f1946688409"
    extract_code = "Dllg"
    file_name = "星-张远"
    quark_download_util = QuarkDownloadUtil(username=username, storage_state_path=storage_state_path, headless=True)
    try:
        quark_download_util.quark_download(share_url=share_url, extract_code=extract_code,
                                           download_path=download_path, file_name=file_name)
    finally:
        # Always release the browser, even when the download is interrupted.
        quark_download_util.close()
    print("================================= 下载完成 =================================")

完整源码

python 复制代码
import json
import os
import traceback
from pathlib import Path

from patchright.sync_api import Browser, Page, BrowserContext
from playwright.sync_api import sync_playwright


class QuarkDownloadUtil:
    """
    Downloader for Quark net-disk share links, driven through Playwright.

    Creating an instance starts Playwright, launches Chromium, creates a
    browser context (restoring a saved login state when the state file
    exists) and opens one page. Call :meth:`close` when done.
    """
    default_headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
    }
    # Base Chromium command-line switches shared by every instance. This list
    # is never modified at runtime; per-launch switches are added to a copy.
    chromium_args = [
        # Switches taken from crawl4ai's internal defaults.
        "--disable-gpu",
        "--disable-gpu-compositing",
        "--disable-software-rasterizer",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--no-first-run",
        "--no-default-browser-check",
        "--disable-infobars",
        "--window-position=0,0",
        "--ignore-certificate-errors",
        "--ignore-certificate-errors-spki-list",
        "--disable-blink-features=AutomationControlled",
        "--window-position=400,0",
        "--disable-renderer-backgrounding",
        "--disable-ipc-flooding-protection",
        "--force-color-profile=srgb",
        "--mute-audio",
        "--disable-background-timer-throttling",
        f"--window-size={1080},{600}",
    ]
    playwright: "sync_playwright" = None
    browser: "Browser" = None
    browser_context: "BrowserContext" = None
    quark_url = "https://pan.quark.cn"
    # Window size passed to Chromium via --window-size.
    width: int = 1080
    height: int = 600
    # Upper bound for the post-save polling loop in quark_download.
    download_timeout: int

    def __init__(self, username, storage_state_path: str, download_timeout: int = None, headless: bool = True):
        """
        Start the browser and open the first page.

        :param username: account display name the login step waits for
        :param storage_state_path: file the browser state is saved to / loaded from
        :param download_timeout: polling limit used after saving a download;
            falls back to 20 when omitted or falsy
        :param headless: whether the browser runs without a visible window
        """
        self.username = username
        self.storage_state_path = storage_state_path
        self.headless = headless
        self.download_timeout = download_timeout if download_timeout else 20
        self.get_browser()
        self.page = self.get_page()

    def _launch_args(self) -> list:
        """Return a new argument list: the shared switches plus this instance's window size."""
        return [*self.chromium_args, f"--window-size={self.width},{self.height}"]

    def get_browser(self, headless: bool = None):
        """
        Start Playwright, launch Chromium and create the browser context.

        Sets ``self.playwright``, ``self.browser`` and ``self.browser_context``.

        :param headless: overrides ``self.headless`` when it is not ``None``
        :return: None
        """
        # Create the playwright instance.
        print("创建 playwright 实例...")
        self.playwright = sync_playwright().start()
        headless = self.headless if headless is None else headless
        # Launch the browser with a per-launch copy of the switches, so the
        # class-level ``chromium_args`` list does not grow with every instance.
        print("启动浏览器...")
        self.browser = self.playwright.chromium.launch(
            headless=headless,
            args=self._launch_args(),
        )
        # Load the persisted login state, if there is one.
        storage_state = None
        if self.storage_state_path and os.path.exists(self.storage_state_path):
            print("从文件加载存储状态...")
            # The file is written as UTF-8 (see login_quark_and_save_state),
            # so read it as UTF-8 instead of the platform default encoding.
            with open(self.storage_state_path, "r", encoding="utf-8") as f:
                storage_state = json.load(f)
        # Create the browser context. Further context options (user_agent,
        # viewport, locale, timezone_id) can be passed here when needed.
        print("创建浏览器上下文...")
        self.browser_context = self.browser.new_context(
            storage_state=storage_state
        )

    def get_page(self, headers: dict[str, str] = None) -> "Page":
        """
        Open a new page in the current browser context.

        :param headers: optional extra HTTP headers; applied only when non-empty
        :return: the newly created page
        """
        print("创建页面...")
        page = self.browser_context.new_page()
        if headers:
            print("设置额外的HTTP头...")
            page.set_extra_http_headers(headers)
        # Attach the request/response listeners.
        print("监听所有请求和响应...")
        self.set_request_response_log(page)
        return page

    def set_request_response_log(self, page: "Page" = None):
        """
        Register request/response listeners on a page.

        Both handlers are intentionally no-ops; they are hooks where logging
        can be added while debugging.

        :param page: page to attach to; defaults to ``self.page``
        :return: None
        """
        page = page or self.page

        def log_request(request):
            # Hook for debugging, e.g. print the request method and URL.
            pass

        def log_response(response):
            # Hook for debugging, e.g. print status/URL and dump the headers
            # and body of responses with status >= 400.
            pass

        page.on("request", log_request)
        page.on("response", log_response)

    def simulation_operation(self, page: "Page" = None):
        """
        Imitate a human visitor: scroll down half a viewport three times,
        pausing 500 ms after each scroll.

        :param page: page to scroll; defaults to ``self.page``
        :return: None
        """
        page = page or self.page
        for _ in range(3):
            page.evaluate("window.scrollBy(0, window.innerHeight / 2)")
            page.wait_for_timeout(500)

    def login_quark_and_save_state(self):
        """
        Log in to Quark interactively and persist the browser state to disk.

        Opens ``self.quark_url``, waits until a ``span`` showing
        ``self.username`` appears, then writes the context's storage state
        as JSON to ``self.storage_state_path``. Requires a visible browser
        so the user can complete the login.
        """
        # Make sure the directory of the state file exists.
        directory = os.path.dirname(self.storage_state_path)
        if directory and not os.path.exists(directory):
            print(f"创建目录: {directory}")
            os.makedirs(directory, exist_ok=True)

        # Open the site and wait for the login to complete.
        print("开始登录...")
        self.page.goto(self.quark_url)
        print("等待登录完成...")
        self.page.wait_for_selector(f"span:text('{self.username}')")

        # Save the browser state to the file.
        print("保存浏览器状态...")
        storage_state = self.browser_context.storage_state()
        with open(self.storage_state_path, "w", encoding="utf-8") as file:
            json.dump(storage_state, file, indent=2, ensure_ascii=False)
        self.page.wait_for_timeout(20000)

    def quark_download(self, share_url: str, extract_code: str, download_path: str, file_name: str,
                       page: "Page" = None):
        """
        Download the file behind a Quark share link into ``download_path``.

        Flow: open the share page, type the extraction code, click the first
        ``.share-download`` button, capture the resulting download and save
        it as ``<file_name>.<extension of the suggested name>``. When the
        suggested name has no extension, the suggested name itself is used.

        :param share_url: share link to open
        :param extract_code: extraction code typed into the share page
        :param download_path: directory the file is saved into (created if missing)
        :param file_name: base name (without extension) for the saved file
        :param page: page to drive; defaults to ``self.page``
        :return: path of the saved file, or ``None`` when any step raised
        """
        # Make sure the target directory exists.
        target_dir = Path(download_path)
        target_dir.mkdir(parents=True, exist_ok=True)
        page = page or self.page
        try:
            # Open the share page.
            print(f"访问目标页面:{share_url}")
            response = page.goto(share_url, timeout=300000)
            # goto() is documented to return None for some navigations, so
            # the status is read defensively.
            print("目标页面 status:", response.status if response is not None else None)
            # Wait until the page has fully loaded.
            print("等待页面完全加载...")
            page.wait_for_load_state("networkidle")
            # Scroll around a little before interacting.
            print("模拟人类行为...")
            self.simulation_operation(page)
            # Type the extraction code (XPath selector on the placeholder text).
            print("输入提取码...")
            input_element = page.locator("//input[@placeholder='请输入提取码,不区分大小写']")
            input_element.fill(extract_code)
            print("提取码已输入:" + extract_code)
            page.wait_for_load_state("networkidle")
            # Only the click that triggers the download belongs inside the
            # expect_download block; the event value is read after it exits.
            print("正在等待下载...")
            with page.expect_download(timeout=30000) as download_info:
                page.locator('.share-download').nth(0).click()
                print("点击下载按钮...")
            download = download_info.value
            suggested = download.suggested_filename
            print(f"开始下载: {suggested}")
            # Keep the extension of the suggested name. No nested quotes in
            # the f-string, so this also parses on Python versions before 3.12.
            if '.' in suggested:
                extension = suggested.rsplit('.', 1)[-1]
                file_name = f"{file_name}.{extension}"
            else:
                file_name = suggested
            full_save_path = target_dir / file_name
            if full_save_path.exists():
                print(f"文件已存在, 删除: {full_save_path}")
                os.remove(full_save_path)
            # Copy the download to its final location.
            download.save_as(full_save_path)
            # Defensive check: poll once per second until the file is visible
            # on disk, giving up once the counter exceeds download_timeout.
            print("等待文件保存完成...")
            count = 0
            while not os.path.exists(full_save_path):
                page.wait_for_timeout(1000)
                print("等待文件保存完成...")
                count = count + 1
                if count > self.download_timeout:
                    print(f"下载超时, url:{share_url}, code:{extract_code}, full_save_path:{full_save_path}")
                    break
            print(f"文件已下载到: {full_save_path}")
            print("下载结束")
            page.wait_for_load_state("networkidle")
            return full_save_path
        except Exception as e:
            # Best-effort API: report the failure and return None.
            print(f"访问页面时发生错误: {e}, 错误信息:{e.__dict__}")
            traceback.print_exc()
            return None
        finally:
            print(f'下载结束:{share_url}, code:{extract_code}')

    def close(self):
        """
        Release all browser resources.

        Objects are closed from the innermost outwards: page, context,
        browser. Playwright itself is stopped even if one of those fails.

        :return: None
        """
        try:
            self.page.close()
            self.browser_context.close()
            self.browser.close()
        finally:
            self.playwright.stop()
        
        
        
if __name__ == '__main__':
    # Quark account display name. The login step waits for this text to show
    # up on the page, so a wrong name ends in a wait timeout.
    username = ''
    # Browser state file; the download phase reads what the login phase wrote.
    storage_state_path = "xxxxxx/login_state.json"
    # Directory the downloaded file is stored in (full or relative path).
    download_path = 'xxxxxx/Downloads'

    # Phase 1: log in to Quark and persist the state (visible browser).
    print("================================= 登陆夸克网盘 =================================")
    quark_download_util = QuarkDownloadUtil(username=username, storage_state_path=storage_state_path, headless=False)
    try:
        quark_download_util.login_quark_and_save_state()
    finally:
        # Always release the browser, even when the login wait fails.
        quark_download_util.close()
    print("================================= 登陆完成 =================================")

    # Phase 2: download the share link with a headless browser.
    print("================================= 开始下载 =================================")
    share_url = "https://pan.quark.cn/s/3f1946688409"
    extract_code = "Dllg"
    file_name = "星-张远"
    quark_download_util = QuarkDownloadUtil(username=username, storage_state_path=storage_state_path, headless=True)
    try:
        quark_download_util.quark_download(share_url=share_url, extract_code=extract_code,
                                           download_path=download_path, file_name=file_name)
    finally:
        # Always release the browser, even when the download is interrupted.
        quark_download_util.close()
    print("================================= 下载完成 =================================")
相关推荐
吴佳浩8 小时前
Python入门指南(六) - 搭建你的第一个YOLO检测API
人工智能·后端·python
踏浪无痕8 小时前
JobFlow已开源:面向业务中台的轻量级分布式调度引擎 — 支持动态分片与延时队列
后端·架构·开源
superman超哥8 小时前
仓颉语言中基本数据类型的深度剖析与工程实践
c语言·开发语言·python·算法·仓颉
Pitayafruit8 小时前
Spring AI 进阶之路05:集成 MCP 协议实现工具调用
spring boot·后端·llm
ss2739 小时前
线程池:任务队列、工作线程与生命周期管理
java·后端
不像程序员的程序媛9 小时前
Spring的cacheEvict
java·后端·spring
Learner__Q9 小时前
每天五分钟:滑动窗口-LeetCode高频题解析_day3
python·算法·leetcode
————A9 小时前
强化学习----->轨迹、回报、折扣因子和回合
人工智能·python
踏浪无痕9 小时前
JobFlow 实战:无锁调度是怎么做到的
后端·面试·架构
shoubepatien9 小时前
JAVA -- 11
java·后端·intellij-idea