Python Web Scraping Guide
I. Basic Tools
1. Request Libraries
```python
# requests - the most commonly used HTTP library
import requests

# Basic GET request
response = requests.get('https://api.example.com/data')
print(response.text)

# POST request
data = {'key': 'value'}
response = requests.post('https://api.example.com/post', json=data)
```
2. Parsing Libraries
```python
# BeautifulSoup4 - HTML parsing
from bs4 import BeautifulSoup

# Parse the HTML (html_content is the page source fetched earlier)
soup = BeautifulSoup(html_content, 'html.parser')
title = soup.find('title').text

# lxml - a fast XML/HTML parser
from lxml import etree

tree = etree.HTML(html_content)
```
II. Advanced Techniques
1. Selenium Automation
```python
from selenium import webdriver
from selenium.webdriver.common.by import By

# Start the browser
driver = webdriver.Chrome()

# Open the page
driver.get('https://example.com')

# Find an element and type into it
element = driver.find_element(By.ID, 'search')
element.send_keys('python')

# Close the browser when finished
driver.quit()
```
2. Asynchronous Crawling
```python
import aiohttp
import asyncio

async def fetch(url):
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as response:
            return await response.text()

# Run the async tasks
async def main():
    urls = ['url1', 'url2', 'url3']
    tasks = [fetch(url) for url in urls]
    results = await asyncio.gather(*tasks)
    return results

asyncio.run(main())
```
III. Anti-Scraping Countermeasures
1. Request Headers
```python
headers = {
    'User-Agent': 'Mozilla/5.0 ...',
    'Accept': 'text/html,application/xhtml+xml...',
    'Accept-Language': 'en-US,en;q=0.9',
    'Referer': 'https://example.com'
}
response = requests.get(url, headers=headers)
```
2. IP Proxy Pools
```python
proxies = {
    'http': 'http://10.10.10.1:8000',
    'https': 'http://10.10.10.1:8000'
}
response = requests.get(url, proxies=proxies)
```
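The snippet above routes traffic through a single fixed proxy. An actual pool keeps several addresses and picks one per request. A minimal sketch, assuming a hand-maintained list of placeholder proxy URLs (the addresses and the `get_with_random_proxy` helper are illustrative, not part of the original guide):

```python
import random
import requests

# Placeholder proxy endpoints; replace with your own pool
PROXY_POOL = [
    'http://10.10.10.1:8000',
    'http://10.10.10.2:8000',
    'http://10.10.10.3:8000',
]

def get_with_random_proxy(url):
    # Pick a proxy at random for each request
    proxy = random.choice(PROXY_POOL)
    proxies = {'http': proxy, 'https': proxy}
    return requests.get(url, proxies=proxies, timeout=10)
```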
3. Cookie Management
```python
import requests

# A Session persists cookies across requests
session = requests.Session()
cookies = {'session_id': '123456'}
session.cookies.update(cookies)

# Subsequent requests through the session send these cookies
response = session.get(url)
```
IV. Data Storage
1. File Storage
```python
# CSV storage (data is a list of rows)
import csv

with open('data.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerows(data)

# JSON storage
import json

with open('data.json', 'w') as f:
    json.dump(data, f)
```
2. Database Storage
```python
# SQLite example
import sqlite3

conn = sqlite3.connect('database.db')
cursor = conn.cursor()

# MongoDB example
from pymongo import MongoClient

client = MongoClient('mongodb://localhost:27017/')
db = client['database_name']
```
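Both snippets above only open a connection. As a minimal sketch of actually persisting scraped records (the `pages` table/collection and its fields are made-up names for illustration):

```python
# SQLite: create a table and insert one scraped record
cursor.execute('CREATE TABLE IF NOT EXISTS pages (url TEXT, title TEXT)')
cursor.execute(
    'INSERT INTO pages (url, title) VALUES (?, ?)',
    ('https://example.com', 'Example Domain'),
)
conn.commit()

# MongoDB: insert one document into a collection
db['pages'].insert_one({'url': 'https://example.com', 'title': 'Example Domain'})
```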
V. Concurrency
1. Multithreading
```python
from concurrent.futures import ThreadPoolExecutor

def crawl(url):
    # Crawling logic goes here
    pass

with ThreadPoolExecutor(max_workers=5) as executor:
    executor.map(crawl, urls)
```
2. Multiprocessing
```python
from multiprocessing import Pool

def crawl(url):
    # Crawling logic goes here
    pass

if __name__ == '__main__':
    with Pool(4) as p:
        p.map(crawl, urls)
```
VI. Advanced Features
1. CAPTCHA Handling
```python
# OCR-based recognition
import pytesseract
from PIL import Image

def recognize_captcha(image_path):
    image = Image.open(image_path)
    text = pytesseract.image_to_string(image)
    return text
```
2. JavaScript Rendering
```python
# Render JavaScript-heavy pages with Splash
import requests

splash_url = 'http://localhost:8050/render.html'
params = {'url': target_url, 'wait': 2}
response = requests.get(splash_url, params=params)
```
VII. Best Practices
1. Error Handling
```python
import requests

def safe_request(url, retries=3):
    for i in range(retries):
        try:
            response = requests.get(url, timeout=10)
            return response
        except requests.RequestException as e:
            print(f"Retry {i+1}, Error: {e}")
            if i == retries - 1:
                raise
```
2. Rate Limiting
```python
from ratelimit import limits, sleep_and_retry
import requests

@sleep_and_retry
@limits(calls=1, period=1)  # at most 1 request per second
def rate_limited_request(url):
    return requests.get(url)
```
VIII. Monitoring and Logging
1. Logging
```python
import logging

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    filename='crawler.log'
)
logging.info('Starting crawler...')
```
2. Performance Monitoring
```python
from memory_profiler import profile

@profile
def memory_intensive_crawl():
    # Crawling logic goes here
    pass
```
IX. Project Structure
```
crawler/
├── config/
│   └── settings.py
├── spiders/
│   ├── __init__.py
│   └── spider.py
├── utils/
│   ├── proxy.py
│   └── parser.py
├── storage/
│   └── database.py
└── main.py
```
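A hedged sketch of what config/settings.py in the layout above might hold; every name and value here is an illustrative assumption, not part of the original guide:

```python
# config/settings.py - central crawler configuration (illustrative values)
USER_AGENT = 'MyCrawler/1.0 (+https://example.com/bot)'
DOWNLOAD_DELAY = 1.0       # seconds to wait between requests
MAX_RETRIES = 3            # attempts per URL before giving up
TIMEOUT = 10               # per-request timeout in seconds
PROXY_POOL = []            # fill in proxy URLs if a pool is used
DATABASE_PATH = 'database.db'
```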
This guide covers the main aspects of Python web scraping, from the basics through advanced features; combine the tools and techniques that fit your specific needs. Remember to follow each site's robots.txt rules and keep your crawl rate reasonable.
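On the robots.txt point, the standard library's urllib.robotparser can check whether a given URL may be fetched before you crawl it; a minimal sketch with placeholder URLs and a placeholder user agent:

```python
import requests
from urllib.robotparser import RobotFileParser

rp = RobotFileParser()
rp.set_url('https://example.com/robots.txt')
rp.read()

if rp.can_fetch('MyCrawler/1.0', 'https://example.com/some/page'):
    # Allowed by robots.txt; safe to request this page
    response = requests.get('https://example.com/some/page')
```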