📚 教程目标
学习构建一个健壮、可扩展的网络内容下载框架,涵盖从基础到高级的技能。
🎯 学习路线图
阶段1:基础知识准备
HTTP协议基础详解
🌐 HTTP协议概述
HTTP(HyperText Transfer Protocol)是互联网上应用最广泛的协议,用于客户端和服务器之间的通信。
🔍 HTTP请求方法
GET 请求
python
import requests

# Example: simple GET request
response = requests.get('https://api.example.com/data')
print(f"状态码: {response.status_code}")
print(f"响应内容: {response.text[:100]}...")
GET请求特点:
- 从服务器获取数据
- 参数通过URL传递(查询字符串)
- 可以被缓存
- URL有长度限制(历史上IE限制约2048个字符;现代浏览器和服务器的上限各不相同)
- 不应包含敏感信息
- 幂等操作(多次执行结果相同)
python
# GET request with query parameters (encoded into the URL's query string)
params = {
    'page': 1,
    'limit': 10,
    'search': 'python'
}
response = requests.get('https://api.example.com/items', params=params)
print(f"完整URL: {response.url}")
POST 请求
python
# Example: POST request sending form-encoded data in the request body
data = {
    'username': 'john_doe',
    'password': 'secure_pass123'
}
response = requests.post('https://api.example.com/login', data=data)
POST请求特点:
- 向服务器提交数据
- 数据放在请求体中
- 理论上无长度限制(实际受服务器配置的请求体大小限制)
- 不会被缓存
- 可以包含敏感信息
- 非幂等操作(可能改变服务器状态)
python
# POST request sending a JSON body
import json

json_data = {
    'title': 'New Post',
    'content': 'This is the content',
    'tags': ['python', 'web']
}
response = requests.post(
    'https://api.example.com/posts',
    json=json_data,  # automatically sets Content-Type to application/json
    headers={'Authorization': 'Bearer token123'}
)
其他HTTP方法
python
# PUT - replace/update a resource
update_data = {'name': 'Updated Name'}
response = requests.put('https://api.example.com/users/123', json=update_data)

# DELETE - remove a resource
response = requests.delete('https://api.example.com/users/123')

# PATCH - partially update a resource
patch_data = {'age': 30}
response = requests.patch('https://api.example.com/users/123', json=patch_data)

# HEAD - fetch only the response headers (no body is transferred)
response = requests.head('https://api.example.com/page')
print(f"内容类型: {response.headers.get('Content-Type')}")
print(f"内容长度: {response.headers.get('Content-Length')}")
📊 HTTP状态码详解
1xx 信息性状态码
python
# 100 Continue - 继续发送请求体
# 101 Switching Protocols - 切换协议(如WebSocket)
2xx 成功状态码
python
# 200 OK - 请求成功
def check_success(response):
    """Return True (and report it) when the response is 200 OK."""
    ok = response.status_code == 200
    if ok:
        print("请求成功!")
    return ok
# 201 Created - 资源创建成功
def create_resource(data):
response = requests.post('https://api.example.com/resources', json=data)
if response.status_code == 201:
print(f"资源创建成功,位置: {response.headers.get('Location')}")
return response.json()
return None
# 204 No Content - 成功但无返回内容
def delete_item(item_id):
    """Issue a DELETE for the given item; True on 204 No Content."""
    resp = requests.delete(f'https://api.example.com/items/{item_id}')
    if resp.status_code != 204:
        return False
    print("删除成功")
    return True
3xx 重定向状态码
python
import requests


def follow_redirects(url, max_redirects=5):
    """Follow a redirect chain by hand, up to *max_redirects* hops.

    Args:
        url: Starting URL.
        max_redirects: Maximum number of hops to follow.

    Returns:
        A list of (url, status_code) tuples, one per hop; the final entry
        is the first non-redirect response reached (or the last hop).
    """
    from urllib.parse import urljoin

    history = []
    current_url = url
    for i in range(max_redirects):
        response = requests.get(current_url, allow_redirects=False)
        if response.status_code in (301, 302, 303, 307, 308):
            location = response.headers.get('Location')
            if not location:
                # Malformed redirect without a Location header: stop here.
                history.append((current_url, response.status_code))
                break
            # Location may be relative (RFC 7231 allows it); resolve it
            # against the URL we just requested.
            redirect_url = urljoin(current_url, location)
            print(f"重定向 {i+1}: {current_url} -> {redirect_url}")
            history.append((current_url, response.status_code))
            current_url = redirect_url
        else:
            history.append((current_url, response.status_code))
            break
    return history
# 301 Moved Permanently - permanent redirect
# (browsers and search engines remember/cache this redirect)
# 302 Found - temporary redirect
# (browsers do not cache this redirect)

# Automatic redirect handling: requests follows redirects by default
response = requests.get('https://httpbin.org/redirect/2', allow_redirects=True)
print(f"最终URL: {response.url}")
print(f"重定向历史: {response.history}")
4xx 客户端错误
python
def handle_client_errors(response):
    """Dispatch a 4xx response to its dedicated handler.

    Returns the handler's result, or False when the status code has no
    registered handler.
    """
    dispatch = {
        400: handle_bad_request,
        401: handle_unauthorized,
        403: handle_forbidden,
        404: handle_not_found,
        429: handle_rate_limit,
    }
    handler = dispatch.get(response.status_code)
    if handler is None:
        return False
    return handler(response)
def handle_bad_request(response):
    """Handle 400 Bad Request: report the malformed request and give up."""
    detail = response.text
    print("错误的请求格式")
    print(f"错误详情: {detail}")
    return False
def handle_unauthorized(response):
    """Handle 401 Unauthorized: attempt a token refresh.

    NOTE(review): depends on a refresh_auth_token() helper that must be
    defined elsewhere in the project.
    """
    print("需要登录或身份验证")
    new_token = refresh_auth_token()
    return new_token is not None
def handle_forbidden(response):
    """Handle 403 Forbidden: the server understood but refuses to serve."""
    print("没有访问权限")
    return False
def handle_not_found(response):
    """Handle 404 Not Found: the requested resource does not exist."""
    print("请求的资源不存在")
    return False
def handle_rate_limit(response):
    """Handle 429 Too Many Requests by waiting out the Retry-After hint.

    Returns:
        True, signalling to the caller that the request may be retried.
    """
    import time

    raw = response.headers.get('Retry-After', 60)
    try:
        retry_after = int(raw)
    except (TypeError, ValueError):
        # RFC 7231 allows Retry-After to be an HTTP-date rather than a
        # delay in seconds; fall back to a fixed 60s wait in that case.
        retry_after = 60
    print(f"请求过于频繁,{retry_after}秒后重试")
    time.sleep(retry_after)
    return True
5xx 服务器错误
python
def handle_server_errors(response, max_retries=3):
    """Retry a request whose response is a 5xx server error.

    Uses exponential backoff (1s, 2s, 4s, ...) between attempts.

    Args:
        response: The failed response (its .url is re-fetched on retry).
        max_retries: Maximum number of retry attempts.

    Returns:
        The original response unchanged when it is not a 5xx error, or
        the first retried response with a status below 500.

    Raises:
        Exception: when every retry still fails with a server error.
    """
    import time  # local import: this snippet does not import time at top level

    if response.status_code not in (500, 502, 503, 504):
        return response
    for attempt in range(max_retries):
        print(f"服务器错误 ({response.status_code}),尝试 {attempt + 1}/{max_retries}")
        time.sleep(2 ** attempt)  # exponential backoff
        try:
            retried = requests.get(response.url)
            if retried.status_code < 500:
                return retried
        except Exception as e:
            print(f"重试失败: {e}")
    raise Exception(f"服务器错误,重试{max_retries}次后仍失败")
📝 HTTP请求头详解
常用请求头示例
python
import requests
from datetime import datetime
def create_comprehensive_headers():
    """Build a realistic, fully-populated set of HTTP request headers."""
    identity = {
        # Client identification - tells the server who is calling
        'User-Agent': 'MyPythonDownloader/1.0 (https://github.com/username/downloader)',
    }
    negotiation = {
        # Content negotiation: formats, compression, languages we accept
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
    }
    control = {
        # Connection and cache control
        'Connection': 'keep-alive',
        'Cache-Control': 'no-cache',
        # Referer is commonly inspected by anti-scraping checks
        'Referer': 'https://www.google.com/',
        # Body encoding declaration (needed for form POSTs)
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    }
    custom = {
        # Non-standard / custom headers
        'X-Requested-With': 'XMLHttpRequest',
        'X-Custom-Header': 'MyValue',
        # Authentication example (disabled):
        # 'Authorization': 'Bearer eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9...',
    }
    return {**identity, **negotiation, **control, **custom}
# Demonstrate the custom headers: httpbin echoes back what the server saw
headers = create_comprehensive_headers()
response = requests.get('https://httpbin.org/headers', headers=headers)
print("服务器看到的请求头:")
print(json.dumps(response.json()['headers'], indent=2))
处理内容协商
python
def download_with_accept_headers(url):
    """Fetch *url* once per format, steering the server via the Accept header.

    Each variant is written to its own file.  Returns a dict keyed by
    format name with content type, size and success flag.
    """
    formats = {
        'json': {'Accept': 'application/json', 'filename': 'data.json'},
        'xml': {'Accept': 'application/xml', 'filename': 'data.xml'},
        'html': {'Accept': 'text/html', 'filename': 'page.html'},
    }
    results = {}
    for name, cfg in formats.items():
        resp = requests.get(url, headers={'Accept': cfg['Accept']})
        ctype = resp.headers.get('Content-Type', '')
        if cfg['filename']:
            with open(cfg['filename'], 'w', encoding='utf-8') as fh:
                fh.write(resp.text)
        results[name] = {
            'content_type': ctype,
            'size': len(resp.text),
            'success': resp.status_code == 200,
        }
    return results
📦 HTTP响应头详解
解析响应头
python
def analyze_response_headers(response):
    """Summarize a response's headers into basic/content/cache/security groups.

    Missing headers are reported as None so callers can filter on truthiness.
    """
    hdrs = response.headers

    def pick(names):
        # Sub-dict of just the named headers (absent ones map to None).
        return {name: hdrs.get(name) for name in names}

    return {
        'basic_info': {
            'status_code': response.status_code,
            'reason': response.reason,
            'url': response.url,
        },
        'content_info': pick(['Content-Type', 'Content-Length',
                              'Content-Encoding', 'Content-Disposition']),
        'cache_info': pick(['Cache-Control', 'Expires', 'ETag',
                            'Last-Modified']),
        'security_info': pick(['Set-Cookie', 'Strict-Transport-Security',
                               'X-Frame-Options', 'X-Content-Type-Options']),
    }
# Usage example: analyze the headers of a live response
response = requests.get('https://httpbin.org/get')
analysis = analyze_response_headers(response)
print("响应头分析:")
for category, info in analysis.items():
    print(f"\n{category.upper()}:")
    for key, value in info.items():
        # Only print headers that were actually present
        if value:
            print(f"  {key}: {value}")
处理不同类型的响应
python
class ResponseHandler:
    """Static helpers for common response-handling tasks."""

    @staticmethod
    def handle_json_response(response):
        """Parse a response body as JSON.

        Falls back to json.loads() on the raw text when the server did not
        declare application/json; returns None when the body is not valid
        JSON at all.
        """
        try:
            content_type = response.headers.get('Content-Type', '')
            if 'application/json' in content_type:
                return response.json()
            else:
                # Try to parse JSON served with a wrong/missing MIME type
                return json.loads(response.text)
        except json.JSONDecodeError:
            print("响应不是有效的JSON格式")
            return None

    @staticmethod
    def handle_file_download(response, filename=None):
        """Derive a local filename for a download response.

        Preference order: caller-supplied name, the name advertised in
        Content-Disposition, then a timestamp-based fallback.  A file
        extension matching the Content-Type is appended when missing.
        """
        if not filename:
            # Extract the filename from the Content-Disposition header
            content_disposition = response.headers.get('Content-Disposition', '')
            if 'filename=' in content_disposition:
                filename = content_disposition.split('filename=')[-1].strip('"\'')
            else:
                filename = f"download_{int(time.time())}"
        # Map the Content-Type onto a file extension
        # NOTE(review): original indentation was lost; as reconstructed this
        # also appends an extension to caller-supplied filenames - confirm
        # that is the intended behavior.
        content_type = response.headers.get('Content-Type', '')
        extension_map = {
            'image/jpeg': '.jpg',
            'image/png': '.png',
            'application/pdf': '.pdf',
            'text/html': '.html',
            'application/json': '.json',
        }
        for mime_type, ext in extension_map.items():
            if mime_type in content_type:
                if not filename.endswith(ext):
                    filename += ext
                break
        return filename

    @staticmethod
    def handle_chunked_response(response, chunk_size=8192):
        """Reassemble a chunked (Transfer-Encoding: chunked) response body."""
        if response.headers.get('Transfer-Encoding') == 'chunked':
            content = b''
            for chunk in response.iter_content(chunk_size=chunk_size):
                content += chunk
            return content.decode('utf-8')
        else:
            return response.text
🎯 实战练习:构建HTTP调试工具
python
class HTTPDebugger:
    """Interactive HTTP debugging tool.

    Sends requests through one persistent Session and prints full
    request/response details for inspection.
    """

    def __init__(self):
        # A Session reuses TCP connections and carries cookies across calls.
        self.session = requests.Session()

    def send_request(self, method, url, **kwargs):
        """Send one request and print its request/response details.

        Returns the Response, or None when the request itself failed.
        """
        print(f"\n{'='*50}")
        print(f"发送 {method.upper()} 请求到: {url}")
        print(f"{'='*50}")
        # Echo the outgoing request details
        if 'headers' in kwargs:
            print("\n请求头:")
            for key, value in kwargs['headers'].items():
                print(f"  {key}: {value}")
        if 'params' in kwargs:
            print(f"\n查询参数: {kwargs['params']}")
        if 'data' in kwargs:
            print(f"\n请求体 (form): {kwargs['data']}")
        if 'json' in kwargs:
            print(f"\n请求体 (JSON): {json.dumps(kwargs['json'], indent=2)}")
        # Send the request, timing it
        start_time = time.time()
        try:
            response = self.session.request(method, url, **kwargs)
            elapsed_time = time.time() - start_time
            # Dump the response details
            print(f"\n响应时间: {elapsed_time:.2f}秒")
            print(f"状态码: {response.status_code} {response.reason}")
            print("\n响应头:")
            for key, value in response.headers.items():
                print(f"  {key}: {value}")
            print(f"\n响应大小: {len(response.content)} 字节")
            # Preview the body when it is textual
            content_type = response.headers.get('Content-Type', '')
            if 'application/json' in content_type:
                print("\n响应内容 (JSON):")
                try:
                    print(json.dumps(response.json(), indent=2))
                except:  # fall back to raw text when the body is not valid JSON
                    print(response.text[:500])
            elif 'text/' in content_type:
                print(f"\n响应内容 (预览):")
                print(response.text[:500] + "..." if len(response.text) > 500 else response.text)
            return response
        except requests.exceptions.RequestException as e:
            print(f"\n请求失败: {type(e).__name__}: {e}")
            return None

    def test_methods(self, url):
        """Exercise the common HTTP verbs against *url*."""
        print(f"\n测试不同的HTTP方法: {url}")
        methods = ['GET', 'POST', 'PUT', 'DELETE', 'HEAD', 'OPTIONS']
        for method in methods:
            print(f"\n{'─'*30}")
            print(f"测试 {method} 方法")
            print(f"{'─'*30}")
            if method in ['POST', 'PUT']:
                # Verbs that carry a body get a small JSON payload
                response = self.send_request(method, url,
                                             json={'test': 'data'},
                                             headers={'Content-Type': 'application/json'})
            else:
                response = self.send_request(method, url)

    def analyze_endpoint(self, url):
        """Probe *url* for supported HTTP methods and content negotiation."""
        print(f"\n分析端点: {url}")
        # OPTIONS reveals the allowed methods via the Allow header
        print("\n1. 测试OPTIONS方法:")
        response = self.send_request('OPTIONS', url)
        if response:
            print(f"允许的方法: {response.headers.get('Allow', '未知')}")
        # Request several MIME types and report what the server returns
        print("\n2. 测试内容协商:")
        accept_types = [
            ('application/json', 'JSON'),
            ('application/xml', 'XML'),
            ('text/html', 'HTML'),
            ('text/plain', '纯文本')
        ]
        for mime_type, name in accept_types:
            response = self.send_request('GET', url,
                                         headers={'Accept': mime_type})
            if response:
                content_type = response.headers.get('Content-Type', '')
                print(f"  {name}: 返回类型 - {content_type}")
# Usage example
if __name__ == "__main__":
    debugger = HTTPDebugger()
    # Exercise every HTTP method against an echo endpoint
    debugger.test_methods('https://httpbin.org/anything')
    # Probe an endpoint's capabilities
    debugger.analyze_endpoint('https://httpbin.org/headers')
    # Send a request combining JSON body, custom header and query params
    debugger.send_request(
        'POST',
        'https://httpbin.org/post',
        json={'name': 'John', 'age': 30},
        headers={'X-Custom-Header': 'TestValue'},
        params={'key': 'value'}
    )
这个详解应该让你对HTTP协议有了全面的理解。记住,实践是最好的学习方式,尝试编写代码来测试每个概念,使用在线工具如 httpbin.org 来测试你的理解。
阶段2:构建基础下载框架
python
# 2.1 基础下载器类
import requests
import os
from pathlib import Path
from urllib.parse import urlparse
import time
import hashlib
from typing import Optional, Dict, Any, Tuple
import logging
class BaseDownloader:
    """Shared plumbing for all downloaders.

    Provides the save directory, logging, default request headers,
    filename generation and response validation used by subclasses.
    """

    def __init__(self, save_dir: str = "./downloads"):
        """
        Initialize the downloader.

        Args:
            save_dir: Directory downloaded files are saved to.  Created
                (including missing parents) when it does not exist.
        """
        self.save_dir = Path(save_dir)
        # parents=True: nested paths like ./downloads/images (used later
        # in this tutorial) would otherwise raise FileNotFoundError when
        # the parent directory is missing.
        self.save_dir.mkdir(parents=True, exist_ok=True)
        # Configure logging
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
        )
        self.logger = logging.getLogger(__name__)
        # Default request headers; a browser-like User-Agent avoids
        # trivial bot blocking.
        self.default_headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
        }

    def _generate_filename(self, url: str, content_type: str = None) -> str:
        """
        Generate a safe filename from the URL and content type.

        Args:
            url: Download URL.
            content_type: Content-Type header value; an optional
                '; charset=...' parameter is ignored.

        Returns:
            The generated filename.
        """
        # Use the URL's own filename when it has one
        parsed_url = urlparse(url)
        url_path = Path(parsed_url.path)
        if url_path.name and '.' in url_path.name:
            return url_path.name
        # Otherwise derive a stable name from a hash of the URL
        url_hash = hashlib.md5(url.encode()).hexdigest()[:8]
        # Strip any ';charset=...' parameter before the MIME lookup, so
        # 'text/html; charset=utf-8' still maps to '.html'.
        mime = (content_type or '').split(';')[0].strip()
        extension_map = {
            'text/html': '.html',
            'application/json': '.json',
            'image/jpeg': '.jpg',
            'image/png': '.png',
            'application/pdf': '.pdf',
            'text/plain': '.txt',
        }
        extension = extension_map.get(mime, '.bin')
        return f"download_{url_hash}{extension}"

    def _validate_response(self, response) -> bool:
        """
        Check whether a response represents a successful (200) fetch.

        Accepts both requests-style responses (.status_code) and
        urllib-style responses (.getcode()).

        Args:
            response: Response object, or None.

        Returns:
            True only when the response exists and its status is 200.
        """
        if not response:
            return False
        if hasattr(response, 'status_code'):
            return response.status_code == 200
        elif hasattr(response, 'getcode'):
            return response.getcode() == 200
        return False
阶段3:实现不同类型的下载器
python
# 3.1 文本内容下载器
class TextDownloader(BaseDownloader):
    """Downloader for textual content (HTML, JSON, XML, ...)."""

    def download_text(self, url: str,
                      encoding: str = 'utf-8',
                      params: Dict = None,
                      headers: Dict = None) -> Optional[str]:
        """
        Fetch a URL and save its body as a text file.

        Args:
            url: Target URL.
            encoding: Encoding used when writing the file; 'auto' uses
                the encoding detected from the response.
            params: Optional query parameters.
            headers: Extra request headers merged over the defaults.

        Returns:
            The downloaded text, or None on failure.
        """
        try:
            merged = dict(self.default_headers)
            merged.update(headers or {})
            response = requests.get(
                url,
                headers=merged,
                params=params,
                timeout=30
            )
            if not self._validate_response(response):
                self.logger.error(f"下载失败: {response.status_code}")
                return None
            if encoding == 'auto':
                # Fall back to the server-declared/detected encoding
                encoding = response.encoding or 'utf-8'
            target = self.save_dir / self._generate_filename(
                url, response.headers.get('Content-Type'))
            with open(target, 'w', encoding=encoding) as fh:
                fh.write(response.text)
            self.logger.info(f"文本已保存到: {target}")
            return response.text
        except Exception as e:
            self.logger.error(f"下载出错: {str(e)}")
            return None
# 3.2 文件下载器
class FileDownloader(BaseDownloader):
"""二进制文件下载器(图片、PDF、ZIP等)"""
def download_file(self, url: str,
filename: str = None,
chunk_size: int = 8192,
headers: Dict = None,
verify_ssl: bool = True) -> Optional[Path]:
"""
下载二进制文件
Args:
url: 目标URL
filename: 自定义文件名(None则自动生成)
chunk_size: 分块大小
headers: 自定义请求头
verify_ssl: 是否验证SSL证书
Returns:
保存的文件路径
"""
try:
# 合并请求头
request_headers = {**self.default_headers, **(headers or {})}
# 发送流式请求
response = requests.get(
url,
headers=request_headers,
stream=True,
timeout=60,
verify=verify_ssl
)
# 验证响应
if not self._validate_response(response):
self.logger.error(f"下载失败: {response.status_code}")
return None
# 生成或使用提供的文件名
content_type = response.headers.get('Content-Type')
if not filename:
filename = self._generate_filename(url, content_type)
save_path = self.save_dir / filename
# 获取文件大小
total_size = int(response.headers.get('content-length', 0))
# 分块下载并显示进度
downloaded = 0
with open(save_path, 'wb') as f:
for chunk in response.iter_content(chunk_size=chunk_size):
if chunk:
f.write(chunk)
downloaded += len(chunk)
# 显示进度
if total_size:
percent = (downloaded / total_size) * 100
self.logger.info(f"下载进度: {percent:.1f}% ({downloaded}/{total_size} bytes)")
self.logger.info(f"文件已保存到: {save_path} ({downloaded} bytes)")
return save_path
except Exception as e:
self.logger.error(f"下载出错: {str(e)}")
return None
# 3.3 批量下载器
class BatchDownloader(FileDownloader):
    """Concurrent batch download manager built on FileDownloader."""

    def __init__(self, save_dir: str = "./downloads", max_workers: int = 4):
        super().__init__(save_dir)
        # Upper bound on concurrently running downloads
        self.max_workers = max_workers
        # Reserved for tracking completed batches
        self.download_history = []

    def download_multiple(self, urls: list,
                          filenames: list = None,
                          delay: float = 1.0) -> Dict[str, Any]:
        """
        Download several files concurrently.

        Args:
            urls: URLs to fetch.
            filenames: Optional filenames matched to urls by index.
            delay: Pause between task submissions (politeness throttle).

        Returns:
            Summary dict with success/failure counts and per-file details.
        """
        from concurrent.futures import ThreadPoolExecutor, as_completed

        summary = {
            'success': 0,
            'failed': 0,
            'total': len(urls),
            'success_files': [],
            'failed_files': []
        }

        def fetch_one(target_url, target_name=None):
            # Workers never raise: they report (url, payload, ok) instead.
            try:
                saved = self.download_file(target_url, target_name)
            except Exception as exc:
                return target_url, str(exc), False
            if saved:
                return target_url, saved, True
            return target_url, None, False

        with ThreadPoolExecutor(max_workers=self.max_workers) as pool:
            pending = []
            for index, target_url in enumerate(urls):
                target_name = None
                if filenames and index < len(filenames):
                    target_name = filenames[index]
                pending.append(pool.submit(fetch_one, target_url, target_name))
                # Stagger submissions so requests do not burst at once
                time.sleep(delay)
            # Collect results as they complete
            for finished in as_completed(pending):
                target_url, payload, ok = finished.result()
                if ok:
                    summary['success'] += 1
                    summary['success_files'].append({
                        'url': target_url,
                        'path': str(payload)
                    })
                    self.logger.info(f"成功下载: {target_url}")
                else:
                    summary['failed'] += 1
                    summary['failed_files'].append({
                        'url': target_url,
                        'error': payload
                    })
                    self.logger.error(f"下载失败: {target_url} - {payload}")
        return summary
阶段4:高级功能实现
python
# 4.1 支持断点续传的下载器
class ResumableDownloader(FileDownloader):
    """Downloader that resumes interrupted downloads via HTTP Range requests."""

    def download_with_resume(self, url: str,
                             filename: str = None,
                             max_retries: int = 3) -> Optional[Path]:
        """
        Download a file, resuming from any partial data already on disk.

        Args:
            url: Target URL.
            filename: Filename (auto-generated when None).
            max_retries: Maximum number of attempts.

        Returns:
            Path of the completed file, or None when all retries fail.
        """
        if not filename:
            filename = self._generate_filename(url)
        save_path = self.save_dir / filename

        for attempt in range(max_retries):
            # Re-read the partial size on every attempt: an earlier attempt
            # may have written more bytes before failing, and resuming with
            # a stale offset would append at the wrong position and corrupt
            # the file.
            downloaded_size = 0
            if save_path.exists():
                downloaded_size = save_path.stat().st_size
                if downloaded_size:
                    self.logger.info(f"发现已下载部分: {downloaded_size} bytes")
            range_headers = {'Range': f'bytes={downloaded_size}-'} if downloaded_size > 0 else {}
            try:
                response = requests.get(
                    url,
                    headers={**self.default_headers, **range_headers},
                    stream=True,
                    timeout=60
                )
                # 206 Partial Content confirms the server honored our Range
                # request; anything else means we must restart from scratch.
                if downloaded_size > 0 and response.status_code != 206:
                    self.logger.warning("服务器不支持断点续传,重新下载")
                    save_path.unlink(missing_ok=True)
                    continue  # next attempt recomputes offset 0 from disk
                # Append to the partial file, or start fresh
                mode = 'ab' if downloaded_size > 0 else 'wb'
                with open(save_path, mode) as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        if chunk:
                            f.write(chunk)
                            downloaded_size += len(chunk)
                self.logger.info(f"下载完成: {save_path} ({downloaded_size} bytes)")
                return save_path
            except Exception as e:
                self.logger.error(f"尝试 {attempt + 1}/{max_retries} 失败: {str(e)}")
                if attempt < max_retries - 1:
                    time.sleep(2 ** attempt)  # exponential backoff
                else:
                    self.logger.error("所有重试均失败")
        return None
# 4.2 异步下载器(Python 3.7+)
import asyncio
import aiohttp
from aiofiles import open as aio_open
class AsyncDownloader(BaseDownloader):
    """High-performance asynchronous downloader built on aiohttp."""

    async def download_async(self, url: str,
                             session: aiohttp.ClientSession,
                             filename: str = None) -> Optional[Path]:
        """
        Download one file asynchronously.

        Args:
            url: Target URL.
            session: Shared aiohttp client session.
            filename: Filename (auto-generated when None).

        Returns:
            Path of the saved file, or None on failure.
        """
        try:
            async with session.get(url, headers=self.default_headers) as response:
                if response.status != 200:
                    self.logger.error(f"下载失败: {response.status}")
                    return None
                if not filename:
                    content_type = response.headers.get('Content-Type')
                    filename = self._generate_filename(url, content_type)
                save_path = self.save_dir / filename
                # Stream chunks straight to disk without blocking the loop
                async with aio_open(save_path, 'wb') as f:
                    async for chunk in response.content.iter_chunked(8192):
                        await f.write(chunk)
                self.logger.info(f"异步下载完成: {save_path}")
                return save_path
        except Exception as e:
            self.logger.error(f"异步下载出错: {str(e)}")
            return None

    async def download_multiple_async(self, urls: list,
                                      max_concurrent: int = 10) -> list:
        """
        Download many URLs concurrently.

        Args:
            urls: URLs to fetch.
            max_concurrent: Connection-pool limit (caps effective concurrency).

        Returns:
            A list with one entry per URL: a Path, None, or an Exception
            (gather() is called with return_exceptions=True, so failures
            are returned rather than raised).
        """
        connector = aiohttp.TCPConnector(limit=max_concurrent)
        async with aiohttp.ClientSession(connector=connector,
                                         headers=self.default_headers) as session:
            tasks = [self.download_async(url, session) for url in urls]
            results = await asyncio.gather(*tasks, return_exceptions=True)
            return results
阶段5:实战案例
python
# 5.1 完整的下载管理器
class DownloadManager:
    """Facade that owns one downloader of each kind and picks between them."""

    def __init__(self, config: Dict = None):
        """
        Initialize the manager.

        Args:
            config: Optional configuration dict.
        """
        self.config = config or {}
        self.downloaders = {
            'text': TextDownloader(),
            'file': FileDownloader(),
            'batch': BatchDownloader(),
            'resume': ResumableDownloader(),
        }

    def smart_download(self, url: str, **kwargs) -> Any:
        """
        Pick a downloader based on the URL's apparent file type.

        Args:
            url: Target URL.
            **kwargs: Forwarded to the chosen downloader.

        Returns:
            Whatever the chosen downloader returns.
        """
        lowered = url.lower()
        binary_exts = ('.jpg', '.png', '.pdf', '.zip', '.exe')
        text_exts = ('.html', '.json', '.xml', '.txt')
        if any(ext in lowered for ext in binary_exts):
            # Looks binary: stream it to disk
            return self.downloaders['file'].download_file(url, **kwargs)
        if any(ext in lowered for ext in text_exts):
            return self.downloaders['text'].download_text(url, **kwargs)
        # Unknown type: text download is the safest default
        return self.downloaders['text'].download_text(url, **kwargs)

    def run_example(self):
        """Demonstrate batch, smart and resumable downloads."""
        sample_urls = [
            # Text resources
            "https://httpbin.org/html",
            "https://jsonplaceholder.typicode.com/posts/1",
            # Images
            "https://httpbin.org/image/jpeg",
            "https://httpbin.org/image/png",
        ]
        print("=" * 50)
        print("下载管理器示例")
        print("=" * 50)
        # 1. batch download
        print("\n1. 批量下载示例:")
        batch_result = self.downloaders['batch'].download_multiple(sample_urls, delay=0.5)
        print(f"批量下载结果: {batch_result['success']}/{batch_result['total']} 成功")
        # 2. smart (type-dispatched) download
        print("\n2. 智能下载示例:")
        test_url = "https://jsonplaceholder.typicode.com/posts/1"
        result = self.smart_download(test_url)
        if result:
            print(f"智能下载成功,获取字符数: {len(result)}")
        # 3. resumable download
        print("\n3. 断点续传示例:")
        large_file_url = "https://speed.hetzner.de/100MB.bin"
        result = self.downloaders['resume'].download_with_resume(large_file_url)
        if result:
            print(f"断点续传成功: {result}")
# 5.2 使用示例
def main():
    """Demonstrate the download framework end to end."""
    # Create the top-level manager (mirrors typical application wiring)
    manager = DownloadManager()

    # Example 1: single file download
    print("示例1:下载单个文件")
    image_downloader = FileDownloader(save_dir="./downloads/images")
    file_path = image_downloader.download_file(
        url="https://picsum.photos/800/600",
        filename="sample_image.jpg"
    )

    # Example 2: batch download
    print("\n示例2:批量下载")
    targets = [
        "https://httpbin.org/html",
        "https://httpbin.org/json",
        "https://httpbin.org/image/jpeg",
        "https://httpbin.org/image/png",
    ]
    batch = BatchDownloader(save_dir="./downloads/batch", max_workers=3)
    outcome = batch.download_multiple(targets, delay=0.5)
    print(f"批量下载完成: {outcome['success']} 成功, {outcome['failed']} 失败")

    # Example 3: resumable download (simulates continuing after interruption)
    print("\n示例3:断点续传演示")
    resumer = ResumableDownloader(save_dir="./downloads/resume")
    resumer.download_with_resume(
        url="https://speed.hetzner.de/10MB.bin",
        filename="large_file.bin"
    )


if __name__ == "__main__":
    main()
阶段6:扩展练习和最佳实践
python
# 6.1 扩展练习任务清单
"""
练习1:添加代理支持
练习2:实现下载速度限制
练习3:添加文件完整性验证(MD5/SHA256)
练习4:实现下载队列优先级
练习5:添加浏览器自动化下载(selenium)
练习6:实现分布式下载(多服务器)
练习7:添加Web界面(Flask/FastAPI)
练习8:实现云存储上传(AWS S3, Google Cloud)
练习9:添加下载任务调度
练习10:实现插件系统
"""
# 6.2 最佳实践检查清单
"""
✅ 错误处理:try-except块覆盖所有网络操作
✅ 日志记录:详细的下载日志
✅ 进度显示:大文件下载进度反馈
✅ 重试机制:网络失败自动重试
✅ 资源管理:使用with语句确保资源释放
✅ 并发控制:限制并发连接数
✅ 用户代理:设置合理的User-Agent
✅ 超时设置:避免无限等待
✅ 路径安全:防止路径遍历攻击
✅ 内存管理:流式下载大文件
✅ 断点续传:支持中断后继续
✅ 编码处理:正确处理不同字符集
"""
# 6.3 配置文件示例
"""
config.yaml:
download:
save_dir: "./downloads"
max_workers: 5
timeout: 30
user_agent: "MyDownloader/1.0"
retry:
max_attempts: 3
backoff_factor: 2
proxies:
http: "http://proxy.example.com:8080"
https: "https://proxy.example.com:8080"
limits:
max_download_size: 104857600 # 100MB
rate_limit: 102400 # 100KB/s
categories:
images:
extensions: [".jpg", ".png", ".gif", ".webp"]
save_dir: "./downloads/images"
documents:
extensions: [".pdf", ".doc", ".docx", ".txt"]
save_dir: "./downloads/documents"
"""
📦 项目创意
- 图片收集器:从网站批量下载特定主题图片
- 文档备份工具:自动备份网站内容到本地
- 视频下载器:支持YouTube/B站视频下载
- 学术论文收集器:自动下载研究论文
- 音乐下载器:支持多种音频格式
- 社交媒体内容备份:备份Instagram/Twitter内容
这个框架设计从简单到复杂,涵盖了网络下载的各个方面。通过逐步实现这些组件,你将掌握:
- HTTP协议和网络编程
- 文件系统操作
- 异常处理和日志记录
- 并发和异步编程
- 设计模式应用
- 项目架构设计