简介
通过 python 脚本下载已知提取码的夸克网盘分享的文件
原理介绍
使用 playwright 模拟浏览器输入提取码、点击下载按钮进行下载。文件的保存位置可以通过拦截下载事件另存至指定位置。 备注:由于夸克网盘下载时需要先登陆,所以使用 playwright 打开夸克网盘进行登陆,扫码登陆后将登陆状态存储在指定文件内,以备后续脚本下载时使用。
代码介绍
1.下载脚本是写在 class 中的,用来共享变量,文后会给出完整代码,想看完整代码的朋友移步文章末尾。 2.代码中"设置监听所有请求和响应"、"模拟人类行为"的功能可删除或自行编写 备注:playwright 不可重复创建浏览器实例,会报异常。
创建浏览器
创建 playwright 实例 --> 创建浏览器实例 --> 创建上下文(用来加载登陆状态) --> 创建页面
python
def get_browser(self, headless: bool = None):
    """
    Start Playwright, launch Chromium and create the browser context.

    Sets ``self.playwright``, ``self.browser`` and ``self.browser_context``.
    If ``self.storage_state_path`` names an existing file, its JSON content
    is handed to the new context as ``storage_state``.

    :param headless: overrides ``self.headless`` when it is not None
    :return: None
    """
    # Create the playwright instance
    print("创建 playwright 实例...")
    self.playwright = sync_playwright().start()
    # Build a per-call copy instead of appending to self.chromium_args:
    # that list is shared, so appending made it grow on every call.
    launch_args = [*self.chromium_args, f"--window-size={self.width},{self.height}"]
    headless = self.headless if headless is None else headless
    # Launch the browser
    print("启动浏览器...")
    self.browser = self.playwright.chromium.launch(
        headless=headless,
        # channel='chrome',
        # channel='chromium',
        args=launch_args,
    )
    # Load the stored state from file, if one exists
    storage_state = None
    if self.storage_state_path and os.path.exists(self.storage_state_path):
        print("从文件加载存储状态...")
        # The state file is written as UTF-8 with ensure_ascii=False, so it
        # must be read back as UTF-8 regardless of the platform default.
        with open(self.storage_state_path, "r", encoding="utf-8") as f:
            storage_state = json.load(f)
    # Create the browser context
    print("创建浏览器上下文...")
    self.browser_context = self.browser.new_context(
        storage_state=storage_state
        # user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36",
        # viewport={"width": 1920, "height": 1080},
        # locale='zh-CN',
        # # time zone
        # timezone_id="Asia/Shanghai",
    )
def get_page(self, headers: dict[str, str] = None) -> "Page":
    """
    Open a new page in the current browser context.

    :param headers: optional extra HTTP headers to apply to the page
    :return: the newly created page
    """
    print("创建页面...")
    new_page = self.browser_context.new_page()

    # Extra HTTP headers are only applied when the caller supplied some
    if headers:
        print("设置额外的HTTP头...")
        new_page.set_extra_http_headers(headers)

    # Attach the request/response listeners
    print("监听所有请求和响应...")
    self.set_request_response_log(new_page)
    return new_page
登陆夸克并保存状态
创建状态存储文件的目录 --> 打开夸克网盘登陆页 --> 等待扫码登陆(等待行为是通过账户名称来实现的,请填入正确的账户名称),登陆后会自动保存浏览器状态到文件内。 备注:登陆时需要 headless=False(显示浏览器页面)
python
def login_quark_and_save_state(self):
    """
    Log in to Quark and persist the browser state to ``self.storage_state_path``.

    Opens ``self.quark_url``, waits until a ``span`` containing
    ``self.username`` is present, then dumps the context's storage state as
    JSON. A visible (non-headless) browser is needed so the user can log in.
    """
    state_file = self.storage_state_path

    # Create the parent directory of the state file when it is missing
    parent_dir = os.path.dirname(state_file)
    needs_dir = bool(parent_dir) and not os.path.exists(parent_dir)
    if needs_dir:
        print(f"创建目录: {parent_dir}")
        os.makedirs(parent_dir)

    # Open the login page
    print("开始登录...")
    self.page.goto(self.quark_url)

    # The account name showing up on the page marks a finished login
    print("等待登录完成...")
    login_marker = f"span:text('{self.username}')"
    self.page.wait_for_selector(login_marker)

    # Write the browser state to the file
    print("保存浏览器状态...")
    snapshot = self.browser_context.storage_state()
    with open(state_file, "w", encoding="utf-8") as out:
        json.dump(snapshot, out, indent=2, ensure_ascii=False)

    self.page.wait_for_timeout(20000)
下载夸克分享链接
创建文件存储目录 --> 访问分享链接 --> 等待页面加载完成 --> 输入提取码 --> 监听下载事件 --> 点击下载按钮 --> 文件另存为 --> 等待文件下载结束
python
def quark_download(self, share_url: str, extract_code: str, download_path: str, file_name: str,
                   page: "Page" = None):
    """
    Download the file behind a Quark share link.

    :param share_url: share link
    :param extract_code: extraction code typed into the share page
    :param download_path: directory the file is saved into (created if missing)
    :param file_name: base name for the saved file; the extension of the
        browser-suggested name is appended. If the suggested name has no
        ".", the suggested name is used unchanged.
    :param page: page to drive; defaults to ``self.page``
    :return: path of the saved file, or None when an exception was caught
    """
    # Make sure the target directory exists
    Path(download_path).mkdir(parents=True, exist_ok=True)
    page = page or self.page
    try:
        # Open the share page
        print(f"访问目标页面:{share_url}")
        response = page.goto(share_url, timeout=300000)
        # goto() may return None, so the status is read defensively
        print("目标页面 status:", response.status if response is not None else None)
        # Wait until the page is fully loaded
        print("等待页面完全加载...")
        page.wait_for_load_state("networkidle")
        # Simulate human behaviour
        print("模拟人类行为...")
        self.simulation_operation(page)
        # Fill in the extraction code (XPath selector)
        print("输入提取码...")
        input_element = page.locator("//input[@placeholder='请输入提取码,不区分大小写']")
        # Equivalent standard CSS selector:
        # input_element = page.locator("input[placeholder='请输入提取码,不区分大小写']")
        input_element.fill(extract_code)
        print("提取码已输入:" + extract_code)
        # Wait for the page to settle again
        page.wait_for_load_state("networkidle")
        # Listen for the download event, then trigger it
        print("正在等待下载...")
        with page.expect_download(timeout=30000) as download_info:
            # Click the download button
            share_download_btn = page.locator('.share-download').nth(0)
            share_download_btn.click()
            print("点击下载按钮...")
        # The download has started once the context manager exits
        download = download_info.value
        suggested_name = download.suggested_filename
        print(f"开始下载: {suggested_name}")
        # Keep the suggested extension. rpartition avoids nesting the same
        # quote type inside an f-string, which only parses on Python 3.12+.
        _, dot, extension = suggested_name.rpartition(".")
        file_name = f"{file_name}.{extension}" if dot else suggested_name
        # Full destination path
        full_save_path = Path(download_path) / file_name
        if full_save_path.exists():
            print(f"文件已存在, 删除: {full_save_path}")
            os.remove(full_save_path)
        print(f"文件已下载到: {full_save_path}")
        # Save the file to the destination
        download.save_as(full_save_path)
        # Poll once per second until the file shows up or the limit is hit
        print("等待文件保存完成...")
        count = 0
        while not os.path.exists(full_save_path):
            page.wait_for_timeout(1000)
            print("等待文件保存完成...")
            count = count + 1
            if count > self.download_timeout:
                print(f"下载超时, url:{share_url}, code:{extract_code}, full_save_path:{full_save_path}")
                break
        print("下载结束")
        page.wait_for_load_state("networkidle")
        return full_save_path
    except Exception as e:
        # Best-effort: report the failure and fall through (returns None)
        print(f"访问页面时发生错误: {e}, 错误信息:{e.__dict__}")
        # print_exc() works on every Python 3 version; the one-argument
        # form of print_exception() requires 3.10+.
        traceback.print_exc()
    finally:
        print(f'下载结束:{share_url}, code:{extract_code}')
夸克网盘登陆调用方法
python
if __name__ == '__main__':
    # Quark account name; it must be correct, otherwise waiting for the
    # login to finish times out with an error
    username = ''
    # Full or relative path of the file the browser state is saved to
    storage_state_path = "xxxxxx/login_state.json"
    # Log in to Quark and save the state
    print("================================= 登陆夸克网盘 =================================")
    # Create the instance (visible browser so the user can log in)
    quark_download_util = QuarkDownloadUtil(username=username, storage_state_path=storage_state_path, headless=False)
    try:
        # Log in
        quark_download_util.login_quark_and_save_state()
    finally:
        # Always release the browser, even when the login fails
        quark_download_util.close()
    print("================================= 登陆完成 =================================")
夸克网盘分享文件下载调用方法
python
if __name__ == '__main__':
    # Quark account name; it must be correct, otherwise waiting for the
    # login to finish times out with an error
    username = ''
    # Full or relative path of the saved browser state (must be the same
    # file that was written during login)
    storage_state_path = "xxxxxx/login_state.json"
    # Full or relative path of the download directory
    download_path = 'xxxxxx/Downloads'
    # Download the Quark share link
    print("================================= 开始下载 =================================")
    share_url = "https://pan.quark.cn/s/3f1946688409"
    extract_code = "Dllg"
    file_name = "星-张远"
    quark_download_util = QuarkDownloadUtil(username=username, storage_state_path=storage_state_path, headless=True)
    try:
        quark_download_util.quark_download(share_url=share_url, extract_code=extract_code,
                                           download_path=download_path, file_name=file_name)
    finally:
        # Always release the browser, even when the download fails
        quark_download_util.close()
    print("================================= 下载完成 =================================")
完整源码
python
import json
import os
import traceback
from pathlib import Path
from patchright.sync_api import Browser, Page, BrowserContext
from playwright.sync_api import sync_playwright
class QuarkDownloadUtil:
    """
    Downloader for Quark netdisk share links, driven through Playwright.

    Creating an instance starts Playwright, launches Chromium, creates a
    browser context (loading a saved storage state when the state file
    exists) and opens one page. Call ``close()`` when done.
    """
    default_headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
    }
    # Base Chromium command-line switches. This list is shared by all
    # instances and must not be mutated; see _launch_args().
    chromium_args = [
        # crawl4ai internal arguments
        "--disable-gpu",
        "--disable-gpu-compositing",
        "--disable-software-rasterizer",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--no-first-run",
        "--no-default-browser-check",
        "--disable-infobars",
        "--window-position=0,0",
        "--ignore-certificate-errors",
        "--ignore-certificate-errors-spki-list",
        "--disable-blink-features=AutomationControlled",
        "--window-position=400,0",
        "--disable-renderer-backgrounding",
        "--disable-ipc-flooding-protection",
        "--force-color-profile=srgb",
        "--mute-audio",
        "--disable-background-timer-throttling",
        # "--single-process",
        f"--window-size={1080},{600}",
        # f"--user-data-dir=/path/to/your/user/data/dir"
    ]
    playwright: "sync_playwright" = None
    browser: "Browser" = None
    browser_context: "BrowserContext" = None
    quark_url = "https://pan.quark.cn"
    # Window size passed to Chromium via --window-size
    width: int = 1080
    height: int = 600
    # Maximum number of one-second polls while waiting for the saved file
    download_timeout: int

    def __init__(self, username, storage_state_path: str, download_timeout: int = None, headless: bool = True):
        """
        Store the settings, then start the browser and open a page.

        :param username: account name; the login step waits for it to appear
        :param storage_state_path: file the browser state is saved to / loaded from
        :param download_timeout: number of one-second polls to wait for the
            saved file; falsy values fall back to 20
        :param headless: whether to run the browser headless
        """
        self.username = username
        self.storage_state_path = storage_state_path
        self.headless = headless
        self.download_timeout = download_timeout if download_timeout else 20
        self.get_browser()
        self.page = self.get_page()

    def _launch_args(self) -> list:
        """Return a fresh launch-argument list: the base switches plus the window size."""
        # A copy is built on purpose: appending to self.chromium_args would
        # mutate the class-level list and make it grow with every instance.
        return [*self.chromium_args, f"--window-size={self.width},{self.height}"]

    @staticmethod
    def _build_file_name(file_name: str, suggested_name: str) -> str:
        """Return ``file_name`` plus the extension of ``suggested_name``.

        When ``suggested_name`` contains no ".", it is returned unchanged.
        """
        _, dot, extension = suggested_name.rpartition(".")
        return f"{file_name}.{extension}" if dot else suggested_name

    def get_browser(self, headless: bool = None):
        """
        Start Playwright, launch Chromium and create the browser context.

        Sets ``self.playwright``, ``self.browser`` and ``self.browser_context``.

        :param headless: overrides ``self.headless`` when it is not None
        :return: None
        """
        # Create the playwright instance
        print("创建 playwright 实例...")
        self.playwright = sync_playwright().start()
        headless = self.headless if headless is None else headless
        # Launch the browser
        print("启动浏览器...")
        self.browser = self.playwright.chromium.launch(
            headless=headless,
            # channel='chrome',
            # channel='chromium',
            args=self._launch_args(),
        )
        # Load the stored state from file, if one exists
        storage_state = None
        if self.storage_state_path and os.path.exists(self.storage_state_path):
            print("从文件加载存储状态...")
            # Written as UTF-8 with ensure_ascii=False by the login step,
            # so it must be read back as UTF-8 on every platform.
            with open(self.storage_state_path, "r", encoding="utf-8") as f:
                storage_state = json.load(f)
        # Create the browser context
        print("创建浏览器上下文...")
        self.browser_context = self.browser.new_context(
            storage_state=storage_state
            # user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36",
            # viewport={"width": 1920, "height": 1080},
            # locale='zh-CN',
            # # time zone
            # timezone_id="Asia/Shanghai",
        )

    def get_page(self, headers: dict[str, str] = None) -> "Page":
        """
        Open a new page in the current browser context.

        :param headers: optional extra HTTP headers to apply to the page
        :return: the newly created page
        """
        # Create the page
        print("创建页面...")
        page = self.browser_context.new_page()
        # Apply extra HTTP headers
        if headers:
            print("设置额外的HTTP头...")
            page.set_extra_http_headers(headers)
        # Attach the request/response listeners
        print("监听所有请求和响应...")
        self.set_request_response_log(page)
        return page

    def set_request_response_log(self, page: "Page" = None):
        """
        Register listeners for every request and response of a page.

        The handlers are currently no-ops; the commented lines show the
        logging that can be enabled.

        :param page: page to attach to; defaults to ``self.page``
        :return: None
        """
        page = page or self.page

        def log_request(request):
            # print(f"请求: {request.method} {request.url}")
            pass

        def log_response(response):
            pass
            # print(f"响应: {response.status} {response.url}")
            # if response.status >= 400:
            #     print(f"错误响应头: {response.headers}")
            #     try:
            #         print(f"错误响应内容: {response.text()}")
            #     except:
            #         pass

        page.on("request", log_request)
        page.on("response", log_response)

    def simulation_operation(self, page: "Page" = None):
        """
        Simulate human behaviour: scroll down half a viewport three times,
        pausing 500 ms after each scroll.

        :param page: page to act on; defaults to ``self.page``
        :return: None
        """
        page = page or self.page
        for _ in range(3):
            page.evaluate("window.scrollBy(0, window.innerHeight / 2)")
            page.wait_for_timeout(500)

    def login_quark_and_save_state(self):
        """
        Log in to Quark and persist the browser state to ``self.storage_state_path``.

        A visible (non-headless) browser is needed so the user can log in.
        The login counts as finished once a ``span`` containing
        ``self.username`` is present on the page.
        """
        # Make sure the directory of the state file exists
        directory = os.path.dirname(self.storage_state_path)
        if directory and not os.path.exists(directory):
            print(f"创建目录: {directory}")
            os.makedirs(directory, exist_ok=True)
        # Open the login page
        print("开始登录...")
        self.page.goto(self.quark_url)
        # Wait for the login to finish
        print("等待登录完成...")
        self.page.wait_for_selector(f"span:text('{self.username}')")
        # Write the browser state to the file
        print("保存浏览器状态...")
        storage_state = self.browser_context.storage_state()
        with open(self.storage_state_path, "w", encoding="utf-8") as file:
            json.dump(storage_state, file, indent=2, ensure_ascii=False)
        self.page.wait_for_timeout(20000)

    def quark_download(self, share_url: str, extract_code: str, download_path: str, file_name: str,
                       page: "Page" = None):
        """
        Download the file behind a Quark share link.

        :param share_url: share link
        :param extract_code: extraction code typed into the share page
        :param download_path: directory the file is saved into (created if missing)
        :param file_name: base name for the saved file; the extension of the
            browser-suggested name is appended
        :param page: page to drive; defaults to ``self.page``
        :return: path of the saved file, or None when an exception was caught
        """
        # Make sure the target directory exists
        Path(download_path).mkdir(parents=True, exist_ok=True)
        page = page or self.page
        try:
            # Open the share page
            print(f"访问目标页面:{share_url}")
            response = page.goto(share_url, timeout=300000)
            # goto() may return None, so the status is read defensively
            print("目标页面 status:", response.status if response is not None else None)
            # Wait until the page is fully loaded
            print("等待页面完全加载...")
            page.wait_for_load_state("networkidle")
            # Simulate human behaviour
            print("模拟人类行为...")
            self.simulation_operation(page)
            # Fill in the extraction code (XPath selector)
            print("输入提取码...")
            input_element = page.locator("//input[@placeholder='请输入提取码,不区分大小写']")
            # Equivalent standard CSS selector:
            # input_element = page.locator("input[placeholder='请输入提取码,不区分大小写']")
            input_element.fill(extract_code)
            print("提取码已输入:" + extract_code)
            # Wait for the page to settle again
            page.wait_for_load_state("networkidle")
            # Listen for the download event, then trigger it
            print("正在等待下载...")
            with page.expect_download(timeout=30000) as download_info:
                # Click the download button
                share_download_btn = page.locator('.share-download').nth(0)
                share_download_btn.click()
                print("点击下载按钮...")
            # The download has started once the context manager exits
            download = download_info.value
            print(f"开始下载: {download.suggested_filename}")
            # Keep the extension of the suggested name
            file_name = self._build_file_name(file_name, download.suggested_filename)
            # Full destination path
            full_save_path = Path(download_path) / file_name
            if full_save_path.exists():
                print(f"文件已存在, 删除: {full_save_path}")
                os.remove(full_save_path)
            print(f"文件已下载到: {full_save_path}")
            # Save the file to the destination
            download.save_as(full_save_path)
            # Poll once per second until the file shows up or the limit is hit
            print("等待文件保存完成...")
            count = 0
            while not os.path.exists(full_save_path):
                page.wait_for_timeout(1000)
                print("等待文件保存完成...")
                count = count + 1
                if count > self.download_timeout:
                    print(f"下载超时, url:{share_url}, code:{extract_code}, full_save_path:{full_save_path}")
                    break
            print("下载结束")
            page.wait_for_load_state("networkidle")
            return full_save_path
        except Exception as e:
            # Best-effort: report the failure and fall through (returns None)
            print(f"访问页面时发生错误: {e}, 错误信息:{e.__dict__}")
            # print_exc() works on every Python 3 version; the one-argument
            # form of print_exception() requires 3.10+.
            traceback.print_exc()
        finally:
            print(f'下载结束:{share_url}, code:{extract_code}')

    def close(self):
        """
        Close the page, the context and the browser, then stop Playwright.

        Resources are released innermost first (the context before the
        browser that owns it). Playwright is stopped even if a close fails.

        :return: None
        """
        try:
            self.page.close()
            self.browser_context.close()
            self.browser.close()
        finally:
            self.playwright.stop()
if __name__ == '__main__':
    # Quark account name; it must be correct, otherwise waiting for the
    # login to finish times out with an error
    username = ''
    # Full or relative path of the browser state file (login writes it,
    # the download run reads it)
    storage_state_path = "xxxxxx/login_state.json"
    # Full or relative path of the download directory
    download_path = 'xxxxxx/Downloads'
    # Log in to Quark and save the state
    print("================================= 登陆夸克网盘 =================================")
    quark_download_util = QuarkDownloadUtil(username=username, storage_state_path=storage_state_path, headless=False)
    try:
        quark_download_util.login_quark_and_save_state()
    finally:
        # Release the browser before the second instance is created
        quark_download_util.close()
    print("================================= 登陆完成 =================================")
    # Download the Quark share link
    print("================================= 开始下载 =================================")
    share_url = "https://pan.quark.cn/s/3f1946688409"
    extract_code = "Dllg"
    file_name = "星-张远"
    quark_download_util = QuarkDownloadUtil(username=username, storage_state_path=storage_state_path, headless=True)
    try:
        quark_download_util.quark_download(share_url=share_url, extract_code=extract_code,
                                           download_path=download_path, file_name=file_name)
    finally:
        # Always release the browser, even when the download fails
        quark_download_util.close()
    print("================================= 下载完成 =================================")