# Python web-crawler example that can also handle sites with anti-crawling measures.
import csv
import json
import logging
import queue
import random
import threading
import time
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
from requests.exceptions import RequestException, Timeout
# Logging setup: mirror every message to spider.log and the console.
LOG_FORMAT = '%(asctime)s - %(levelname)s - %(message)s'
logging.basicConfig(
    level=logging.INFO,
    format=LOG_FORMAT,
    handlers=[logging.FileHandler('spider.log'), logging.StreamHandler()],
)
logger = logging.getLogger(__name__)
class AntiCrawlerSpider:
    """Multi-threaded web crawler with basic anti-anti-crawling measures.

    Countermeasures used: rotating User-Agent headers, random delays
    between requests, optional proxy rotation, and automatic retries.
    """

    def __init__(self, config=None):
        """Initialize queues, locks, the User-Agent pool, and statistics.

        Args:
            config: Optional dict overriding the defaults below. Expected
                keys: max_threads, timeout, max_retries, delay_range,
                proxy_enabled, proxies.
        """
        self.config = config or {
            'max_threads': 5,
            'timeout': 30,              # per-request timeout, seconds
            'max_retries': 3,
            'delay_range': (1, 3),      # (min, max) seconds between requests
            'proxy_enabled': False,
            'proxies': []               # e.g. ['http://ip:port', ...]
        }
        # Rotating User-Agent pool.
        self.ua = UserAgent()
        # Pending URLs and parsed results, shared between worker threads.
        self.task_queue = queue.Queue()
        self.result_queue = queue.Queue()
        # Protects self.stats and the crawl-size cap check.
        self.lock = threading.Lock()
        self.stats = {
            'total_urls': 0,
            'success_urls': 0,
            'failed_urls': 0,
            'start_time': time.time()
        }

    def get_random_user_agent(self):
        """Return a random User-Agent string from the pool."""
        return self.ua.random

    def get_random_proxy(self):
        """Return a random proxy URL, or None when proxying is disabled or
        no proxies are configured."""
        if not self.config['proxy_enabled'] or not self.config['proxies']:
            return None
        return random.choice(self.config['proxies'])

    def random_delay(self):
        """Sleep for a random interval within config['delay_range']."""
        time.sleep(random.uniform(*self.config['delay_range']))

    def make_request(self, url, headers=None, proxies=None):
        """GET *url* with browser-like headers, retrying on failure.

        Args:
            url: Target URL.
            headers: Optional extra headers merged over the defaults.
            proxies: Optional requests-style proxy mapping; when omitted and
                proxying is enabled, a random proxy is chosen on the first
                attempt and reused for retries.

        Returns:
            The requests.Response on success, or None after exhausting
            config['max_retries'] attempts.
        """
        max_retries = self.config['max_retries']
        for attempt in range(1, max_retries + 1):
            try:
                # Browser-like defaults with a rotating User-Agent.
                request_headers = {
                    'User-Agent': self.get_random_user_agent(),
                    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                    'Accept-Language': 'en-US,en;q=0.5',
                    'Accept-Encoding': 'gzip, deflate',
                    'Connection': 'keep-alive',
                    'Upgrade-Insecure-Requests': '1'
                }
                if headers:
                    request_headers.update(headers)
                # Pick a proxy when the caller did not supply one.
                if not proxies and self.config['proxy_enabled']:
                    proxy = self.get_random_proxy()
                    proxies = {'http': proxy, 'https': proxy}
                response = requests.get(
                    url,
                    headers=request_headers,
                    proxies=proxies,
                    timeout=self.config['timeout'],
                    allow_redirects=True
                )
                # Treat HTTP error statuses (4xx/5xx) as failures.
                response.raise_for_status()
                logger.info(f"Successfully fetched: {url}")
                return response
            except Timeout:
                logger.warning(f"Timeout for {url}, retry {attempt}/{max_retries}")
            except RequestException as e:
                logger.warning(f"Request failed for {url}: {str(e)}, retry {attempt}/{max_retries}")
            # Back off before the next attempt; no point sleeping after the last one.
            if attempt < max_retries:
                self.random_delay()
        logger.error(f"Failed to fetch {url} after {max_retries} retries")
        return None

    def parse_page(self, response, url):
        """Extract the title and outbound links from a fetched page.

        Args:
            response: requests.Response holding HTML content.
            url: The page URL, used to resolve site-relative links.

        Returns:
            Dict with 'url', 'title', 'links' and 'content_length',
            or None when parsing fails.
        """
        try:
            soup = BeautifulSoup(response.text, 'html.parser')
            title = soup.title.string if soup.title else 'No title'
            links = []
            for anchor in soup.find_all('a', href=True):
                href = anchor['href']
                if href.startswith('http'):
                    links.append(href)
                elif href.startswith('/'):
                    # Resolve site-relative paths against the page URL.
                    links.append(urljoin(url, href))
            return {
                'url': url,
                'title': title,
                'links': links,
                'content_length': len(response.text)
            }
        except Exception as e:
            logger.error(f"Failed to parse {url}: {str(e)}")
            return None

    def worker(self):
        """Worker thread body: drain the task queue until it is empty.

        Fetches each URL, parses it, stores the result, and feeds newly
        discovered links back into the queue (at most 10 links per page,
        100 URLs overall).
        """
        while True:
            try:
                url = self.task_queue.get(block=False)
            except queue.Empty:
                # Queue drained: exit WITHOUT calling task_done(); it must
                # only be called once per successful get(), otherwise
                # Queue raises ValueError.
                break
            try:
                response = self.make_request(url)
                result = self.parse_page(response, url) if response else None
                if result:
                    self.result_queue.put(result)
                    # Enqueue follow-up links, capped at 100 URLs total.
                    for link in result['links'][:10]:
                        with self.lock:
                            if self.stats['total_urls'] < 100:
                                self.task_queue.put(link)
                                self.stats['total_urls'] += 1
                    with self.lock:
                        self.stats['success_urls'] += 1
                else:
                    with self.lock:
                        self.stats['failed_urls'] += 1
                # Pause between pages to look less like a bot.
                self.random_delay()
            except Exception as e:
                logger.error(f"Worker error: {str(e)}")
            finally:
                # Balance the successful get() above so task_queue.join()
                # in run() can return.
                self.task_queue.task_done()

    def save_results(self, output_file='results.json'):
        """Drain the result queue and persist results as JSON and CSV.

        Args:
            output_file: JSON output path; the CSV path is derived by
                swapping the .json suffix.
        """
        results = []
        while not self.result_queue.empty():
            results.append(self.result_queue.get())
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(results, f, ensure_ascii=False, indent=2)
        csv_file = output_file.replace('.json', '.csv')
        with open(csv_file, 'w', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            writer.writerow(['URL', 'Title', 'Content Length', 'Links Count'])
            for result in results:
                writer.writerow([result['url'], result['title'],
                                 result['content_length'], len(result['links'])])
        logger.info(f"Saved {len(results)} results to {output_file} and {csv_file}")

    def run(self, start_urls):
        """Crawl starting from *start_urls*, save results, log statistics.

        Args:
            start_urls: Iterable of seed URLs.

        Returns:
            The stats dict (total/success/failed counts and start time).
        """
        logger.info(f"Starting spider with {self.config['max_threads']} threads")
        for url in start_urls:
            self.task_queue.put(url)
            self.stats['total_urls'] += 1
        threads = []
        for _ in range(self.config['max_threads']):
            t = threading.Thread(target=self.worker)
            t.daemon = True  # don't block interpreter exit
            t.start()
            threads.append(t)
        # Block until every queued URL has been processed.
        self.task_queue.join()
        for t in threads:
            t.join()
        self.save_results()
        elapsed_time = time.time() - self.stats['start_time']
        logger.info("=== Spider Statistics ===")
        logger.info(f"Total URLs: {self.stats['total_urls']}")
        logger.info(f"Successful URLs: {self.stats['success_urls']}")
        logger.info(f"Failed URLs: {self.stats['failed_urls']}")
        logger.info(f"Elapsed time: {elapsed_time:.2f} seconds")
        # Guard against division by zero when no URLs were queued.
        if self.stats['total_urls']:
            logger.info(f"Average time per URL: {elapsed_time / self.stats['total_urls']:.2f} seconds")
        return self.stats
def main():
    """Interactive entry point: prompt for seed URLs and run the crawler."""
    print("=== 反反爬爬虫工具 ===")
    print("此工具可以爬取具有反爬机制的网站,包含以下功能:")
    print("- 用户代理轮换")
    print("- 随机延迟")
    print("- 自动重试")
    print("- 多线程爬取")
    print("- 数据存储")
    # Read comma-separated seed URLs, dropping blank entries.
    start_urls = input("请输入起始URL(多个URL用逗号分隔): ").split(',')
    start_urls = [url.strip() for url in start_urls if url.strip()]
    if not start_urls:
        print("错误:请输入至少一个URL")
        return
    # Crawler configuration; proxies may be added as 'http://ip:port' strings.
    config = {
        'max_threads': 5,
        'timeout': 30,
        'max_retries': 3,
        'delay_range': (1, 3),
        'proxy_enabled': False,
        'proxies': []
    }
    spider = AntiCrawlerSpider(config)
    stats = spider.run(start_urls)
    print("\n=== 爬取完成 ===")
    print(f"总URL数: {stats['total_urls']}")
    print(f"成功URL数: {stats['success_urls']}")
    print(f"失败URL数: {stats['failed_urls']}")
    print("结果已保存到 results.json 和 results.csv")


if __name__ == "__main__":
    main()
# Install the crawler's required third-party packages before running:
#   pip install requests beautifulsoup4 fake_useragent