When requesting videos or images in large batches, avoid funneling everything through a single long-lived session; give each download its own client instead.
aiohttp is lighter and faster than httpx, but its low-level compatibility and stability are poor and it has problems with HTTPS proxies, so httpx is the better choice here.
```python
import asyncio
import math
import os
from loguru import logger
import httpx
import aiofiles
from typing import Optional
media_headers = {
"accept": "*/*",
"accept-language": "zh-CN,zh;q=0.9",
"priority": "u=0, i",
"sec-ch-ua": "\"Chromium\";v=\"128\", \"Not;A=Brand\";v=\"24\", \"Google Chrome\";v=\"128\"",
"sec-ch-ua-mobile": "?0",
"sec-ch-ua-platform": "\"Windows\"",
"sec-fetch-dest": "document",
"sec-fetch-mode": "navigate",
"sec-fetch-site": "none",
"sec-fetch-user": "?1",
"upgrade-insecure-requests": "1",
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36"
}
async def create_client(timeout: int = 300, proxy: Optional[str] = None) -> httpx.AsyncClient:
"""创建独立的httpx客户端 - 禁用SSL验证"""
return httpx.AsyncClient(
headers=media_headers,
timeout=timeout,
proxy=proxy,
        verify=False,  # disable SSL certificate verification
follow_redirects=True,
)
async def download_media(url: str, save_path: str, file_name: str,
timeout: int = 300, ext_type: str = 'mp4',
max_workers: int = 8, max_retries: int = 2,
proxy: Optional[str] = None) -> bool:
"""下载媒体文件的主函数 - 使用独立客户端"""
os.makedirs(save_path, exist_ok=True)
file_path = os.path.join(save_path, f"{file_name}.{ext_type}")
    # If the file already exists, remove it first
if os.path.exists(file_path):
os.remove(file_path)
    # Create a standalone client
async with await create_client(timeout, proxy) as client:
try:
            # HEAD request to fetch file metadata
content_length = None
accept_ranges = False
head_resp = await client.head(url)
if head_resp.status_code == 200:
content_length = head_resp.headers.get("Content-Length")
accept_ranges = head_resp.headers.get("Accept-Ranges") == "bytes"
                # logger.info(f"✅ HEAD request OK: {file_name}, size: {content_length} bytes")
            else:
                logger.error(f"❌ {file_name} HEAD request failed, status: {head_resp.status_code}")
                return False
            # Bail out if the file size could not be determined
            if not content_length:
                logger.error(f"❌ {file_name} could not determine the file size")
                return False
file_size = int(content_length)
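            # Added guard (not in the original code): the chunked download below relies on
            # Range requests, so warn when the server did not advertise byte-range support.
            if not accept_ranges:
                logger.warning(f"⚠️ {file_name} server did not return Accept-Ranges: bytes; chunked download may fail")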
            # Chunked concurrent download
            logger.info(f"🚀 Starting chunked concurrent download: {file_name}, size: {file_size / 1024 / 1024:.2f}MB")
            return await download_multipart(url, file_path, client, file_size, max_workers, max_retries)
        except Exception as e:
            logger.error(f"❌ {file_name} download failed: {repr(e)}")
            return False
async def download_multipart(url: str, file_path: str, client: httpx.AsyncClient,
total_size: int, workers: int = 8,
max_retries: int = 2) -> bool:
"""分块并发下载,失败自动重试 - httpx版本"""
# 计算分块
min_chunk_size = 5 * 1024 * 1024
chunk_size = max(math.ceil(total_size / workers), min_chunk_size)
actual_workers = math.ceil(total_size / chunk_size)
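    # e.g. a 100 MB file with 8 workers gives chunk_size ≈ 12.5 MB and 8 parts,
    # while a 12 MB file is capped by the 5 MB minimum, giving chunk_size = 5 MB and 3 parts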
part_files = [f"{file_path}.part{i}" for i in range(actual_workers)]
    # Track parts that have already failed, to avoid logging the same error repeatedly
failed_part_history = set()
async def download_part(part_index: int) -> bool:
"""下载单个分块(带重试)"""
nonlocal failed_part_history
start = part_index * chunk_size
end = min(start + chunk_size - 1, total_size - 1)
part_file = part_files[part_index]
expected_size = end - start + 1
        # Skip this part if it has already been downloaded completely
        if os.path.exists(part_file):
            if os.path.getsize(part_file) == expected_size:
                logger.debug(f"Part {part_index} already exists, skipping")
                return True
            else:
                # Incomplete, delete it and download again
                os.remove(part_file)
headers = media_headers.copy()
headers["Range"] = f"bytes={start}-{end}"
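        # HTTP byte ranges are inclusive at both ends, so this requests exactly expected_size bytes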
        # Retry loop
for attempt in range(1, max_retries + 1):
try:
async with client.stream("GET", url, headers=headers) as resp:
if resp.status_code not in (200, 206):
                        error_msg = f"Part {part_index} unexpected status code: {resp.status_code}"
if part_index not in failed_part_history:
logger.error(error_msg)
failed_part_history.add(part_index)
continue
async with aiofiles.open(part_file, "wb") as f:
async for chunk in resp.aiter_bytes(2 * 1024 * 1024):
await f.write(chunk)
                # Validate the part after the download finishes
                if os.path.getsize(part_file) != expected_size:
                    error_msg = f"Part {part_index} size mismatch"
                    if part_index not in failed_part_history:
                        logger.error(error_msg)
                        failed_part_history.add(part_index)
                    continue
                # On success, clear the part from the failure record
                if part_index in failed_part_history:
                    failed_part_history.remove(part_index)
                # logger.debug(f"Part {part_index} downloaded")
                return True
except Exception as e:
                error_msg = f"Part {part_index} download failed (attempt {attempt}): {repr(e)}"
if part_index not in failed_part_history:
logger.warning(error_msg)
failed_part_history.add(part_index)
if attempt < max_retries:
await asyncio.sleep(1)
continue
        logger.error(f"Part {part_index} failed repeatedly, giving up")
return False
    # Download all parts
    # logger.info(f"⚡ {actual_workers} parts in total, starting download...")
    for round_i in range(max_retries):
        if round_i > 0:
            logger.info(f"📌 Part download attempt round {round_i + 1}/{max_retries}...")
tasks = [asyncio.create_task(download_part(i)) for i in range(actual_workers)]
results = await asyncio.gather(*tasks)
failed_parts = [i for i, ok in enumerate(results) if not ok]
if not failed_parts:
            # if round_i > 0:
            #     logger.info("🎉 Retry succeeded, all parts downloaded")
            # else:
            #     logger.info("🎉 All parts downloaded successfully")
            break
        if round_i < max_retries - 1:
            logger.warning(f"⚠️ Round {round_i + 1} finished with {len(failed_parts)} failed parts: {failed_parts}")
            # Wait a little before retrying
            await asyncio.sleep(2)
        else:
            logger.error(f"❌ {len(failed_parts)} parts still failed after all retry rounds: {failed_parts}")
            return False
    # Merge the parts
    # logger.info("🔄 Merging parts...")
try:
async with aiofiles.open(file_path, "wb") as final_file:
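            # Parts are appended in index order; each part (at most chunk_size bytes)
            # is read fully into memory before being written to the final file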
for i, part_file in enumerate(part_files):
if os.path.exists(part_file):
async with aiofiles.open(part_file, "rb") as pf:
await final_file.write(await pf.read())
                    try:
                        os.remove(part_file)
                    except Exception as e:
                        logger.warning(f"Failed to delete part file: {part_file}, {e}")
                else:
                    logger.error(f"Part file missing: {part_file}")
                    return False
        # Final size check
if os.path.getsize(file_path) != total_size:
            logger.error(f"❌ Merged file size mismatch: expected {total_size}, got {os.path.getsize(file_path)}")
return False
        # logger.success(f"🎉 File download complete: {file_path}")
return True
except Exception as e:
        logger.error(f"❌ Failed to merge the parts: {e}")
        return False
```
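
For reference, a minimal way to drive this downloader could look like the sketch below. The URL, save directory, and proxy value are placeholders, not part of the code above.

```python
import asyncio

async def main():
    ok = await download_media(
        url="https://example.com/sample.mp4",  # placeholder URL
        save_path="./downloads",               # placeholder directory
        file_name="sample",
        ext_type="mp4",
        max_workers=8,
        max_retries=2,
        proxy=None,  # e.g. "http://127.0.0.1:7890" if a proxy is needed
    )
    print("download ok" if ok else "download failed")

if __name__ == "__main__":
    asyncio.run(main())
```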