Batch Downloading of Video Data or High-Quality Image Data

When requesting videos or images in large batches, avoid routing every request through one shared session; give each download its own independent client instead.

aiohttp is lighter and faster than httpx, but its lower-level compatibility is poor and it is less stable; in particular, HTTPS proxies are problematic with it, so httpx is the better choice here.
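As a quick illustration of that pattern, here is a minimal sketch of a batch driver that fans out many independent downloads while capping concurrency with a semaphore. It assumes the `download_media` coroutine from the full listing below; the `download_batch` name, the URL list, and the concurrency limit are placeholders of my own.

```python
import asyncio
from typing import List, Optional

async def download_batch(urls: List[str], save_path: str,
                         concurrency: int = 4,
                         proxy: Optional[str] = None) -> List[bool]:
    """Fan out many downloads, each with its own client, capped by a semaphore."""
    sem = asyncio.Semaphore(concurrency)

    async def bounded(idx: int, url: str) -> bool:
        async with sem:
            # download_media (defined in the listing below) builds its own
            # httpx client, so no session is shared between downloads
            return await download_media(url, save_path, f"media_{idx}", proxy=proxy)

    return await asyncio.gather(*(bounded(i, u) for i, u in enumerate(urls)))
```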

```python
import asyncio
import math
import os
from loguru import logger
import httpx
import aiofiles
from typing import Optional

media_headers = {
    "accept": "*/*",
    "accept-language": "zh-CN,zh;q=0.9",
    "priority": "u=0, i",
    "sec-ch-ua": "\"Chromium\";v=\"128\", \"Not;A=Brand\";v=\"24\", \"Google Chrome\";v=\"128\"",
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": "\"Windows\"",
    "sec-fetch-dest": "document",
    "sec-fetch-mode": "navigate",
    "sec-fetch-site": "none",
    "sec-fetch-user": "?1",
    "upgrade-insecure-requests": "1",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36"
}


def create_client(timeout: int = 300, proxy: Optional[str] = None) -> httpx.AsyncClient:
    """Create a standalone httpx client with SSL verification disabled."""

    return httpx.AsyncClient(
        headers=media_headers,
        timeout=timeout,
        proxy=proxy,
        verify=False,  # disable SSL verification
        follow_redirects=True,
    )


async def download_media(url: str, save_path: str, file_name: str,
                         timeout: int = 300, ext_type: str = 'mp4',
                         max_workers: int = 8, max_retries: int = 2,
                         proxy: Optional[str] = None) -> bool:
    """下载媒体文件的主函数 - 使用独立客户端"""

    os.makedirs(save_path, exist_ok=True)
    file_path = os.path.join(save_path, f"{file_name}.{ext_type}")

    # If the file already exists, remove it first
    if os.path.exists(file_path):
        os.remove(file_path)

    # Create a standalone client for this download
    async with create_client(timeout, proxy) as client:
        try:
            # HEAD request to learn the file size and whether ranges are supported
            content_length = None
            accept_ranges = False

            head_resp = await client.head(url)
            if head_resp.status_code == 200:
                content_length = head_resp.headers.get("Content-Length")
                accept_ranges = head_resp.headers.get("Accept-Ranges") == "bytes"
                # logger.info(f"✅ HEAD ok: {file_name}, size: {content_length} bytes")
            else:
                logger.error(f"❌ {file_name} HEAD request failed, status: {head_resp.status_code}")
                return False

            # Without a known file size, chunked download is impossible
            if not content_length:
                logger.error(f"❌ {file_name} could not determine file size")
                return False

            # Warn if the server did not advertise range support, since ranged
            # part requests may then return the whole body and fail validation
            if not accept_ranges:
                logger.warning(f"⚠️ {file_name}: server did not advertise Accept-Ranges")

            file_size = int(content_length)

            # Concurrent multipart download
            logger.info(f"🚀 starting concurrent multipart download: {file_name}, size: {file_size / 1024 / 1024:.2f}MB")
            return await download_multipart(url, file_path, client, file_size, max_workers, max_retries)

        except Exception as e:
            logger.error(f"❌ {file_name} 下载失败: {repr(e)}")
            return False


async def download_multipart(url: str, file_path: str, client: httpx.AsyncClient,
                             total_size: int, workers: int = 8,
                             max_retries: int = 2) -> bool:
    """分块并发下载,失败自动重试 - httpx版本"""

    # Compute the chunk layout: parts of at least 5 MB, at most `workers` of them
    min_chunk_size = 5 * 1024 * 1024
    chunk_size = max(math.ceil(total_size / workers), min_chunk_size)
    actual_workers = math.ceil(total_size / chunk_size)

    part_files = [f"{file_path}.part{i}" for i in range(actual_workers)]

    # Track parts that have already failed so the same error is not logged twice
    failed_part_history = set()

    async def download_part(part_index: int) -> bool:
        """Download a single part, with retries."""

        start = part_index * chunk_size
        end = min(start + chunk_size - 1, total_size - 1)
        part_file = part_files[part_index]
        expected_size = end - start + 1

        # Skip parts that are already fully downloaded
        if os.path.exists(part_file):
            if os.path.getsize(part_file) == expected_size:
                logger.debug(f"part {part_index} already present, skipping")
                return True
            else:
                # Incomplete part: delete it and download again
                os.remove(part_file)

        headers = media_headers.copy()
        headers["Range"] = f"bytes={start}-{end}"

        # Retry loop
        for attempt in range(1, max_retries + 1):
            try:
                async with client.stream("GET", url, headers=headers) as resp:
                    if resp.status_code not in (200, 206):
                        error_msg = f"分块 {part_index} 状态码异常: {resp.status_code}"
                        if part_index not in failed_part_history:
                            logger.error(error_msg)
                            failed_part_history.add(part_index)
                        continue

                    async with aiofiles.open(part_file, "wb") as f:
                        async for chunk in resp.aiter_bytes(2 * 1024 * 1024):
                            await f.write(chunk)

                # Verify the size once the part has finished
                if os.path.getsize(part_file) != expected_size:
                    error_msg = f"part {part_index} size mismatch"
                    if part_index not in failed_part_history:
                        logger.error(error_msg)
                        failed_part_history.add(part_index)
                    continue

                # On success, clear this part from the failure record
                if part_index in failed_part_history:
                    failed_part_history.remove(part_index)

                # logger.debug(f"分块 {part_index} 下载完成")
                return True

            except Exception as e:
                error_msg = f"分块 {part_index} 下载失败(第{attempt}次): {repr(e)}"
                if part_index not in failed_part_history:
                    logger.warning(error_msg)
                    failed_part_history.add(part_index)

                if attempt < max_retries:
                    await asyncio.sleep(1)
                continue

        logger.error(f"分块 {part_index} 多次失败,放弃")
        return False

    # Download all parts
    # logger.info(f"⚡ {actual_workers} parts in total, starting download...")

    for round_i in range(max_retries):
        if round_i > 0:
            logger.info(f"📌 第 {round_i + 1}/{max_retries} 轮分块下载尝试...")

        tasks = [asyncio.create_task(download_part(i)) for i in range(actual_workers)]
        results = await asyncio.gather(*tasks)

        failed_parts = [i for i, ok in enumerate(results) if not ok]

        if not failed_parts:
            # if round_i > 0:
            #     logger.info("🎉 retry succeeded, all parts downloaded")
            # else:
            #     logger.info("🎉 all parts downloaded successfully")
            break

        if round_i < max_retries - 1:
            logger.warning(f"⚠️ round {round_i + 1} finished with {len(failed_parts)} failed parts: {failed_parts}")
            # Back off briefly before the next round
            await asyncio.sleep(2)
        else:
            logger.error(f"❌ {len(failed_parts)} parts still failing after all retry rounds: {failed_parts}")
            return False

    # Merge the parts into the final file
    # logger.info("🔄 merging parts...")
    try:
        async with aiofiles.open(file_path, "wb") as final_file:
            for part_file in part_files:
                if os.path.exists(part_file):
                    async with aiofiles.open(part_file, "rb") as pf:
                        await final_file.write(await pf.read())
                    try:
                        os.remove(part_file)
                    except Exception as e:
                        logger.warning(f"failed to delete part file: {part_file}, {e}")
                else:
                    logger.error(f"part file missing: {part_file}")
                    return False

        # Final size check
        if os.path.getsize(file_path) != total_size:
            logger.error(f"❌ merged file size mismatch: expected {total_size}, actual {os.path.getsize(file_path)}")
            return False

        # logger.success(f"🎉 file downloaded: {file_path}")
        return True

    except Exception as e:
        logger.error(f"❌ failed to merge parts: {e}")
        return False
```
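For reference, a minimal way to drive a single download end to end; the URL and output directory below are placeholders for illustration only.

```python
if __name__ == "__main__":
    # Placeholder URL and output directory, for illustration only
    ok = asyncio.run(download_media(
        url="https://example.com/video.mp4",
        save_path="./downloads",
        file_name="sample",
        max_workers=8,
        proxy=None,  # or e.g. "http://127.0.0.1:7890"
    ))
    print("download succeeded" if ok else "download failed")
```

Since each call creates and closes its own AsyncClient, one failed download cannot poison a shared connection pool, which is the point made at the top of this post.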