A Python crawler example that can also scrape websites with anti-crawling protection

Python code:
import requests
import random
import time
import logging
import threading
import queue
import json
import csv
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
from requests.exceptions import RequestException, Timeout
from urllib.parse import urljoin

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('spider.log'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)

class AntiCrawlerSpider:
    def __init__(self, config=None):
        self.config = config or {
            'max_threads': 5,
            'timeout': 30,
            'max_retries': 3,
            'delay_range': (1, 3),
            'proxy_enabled': False,
            'proxies': []
        }
        
        # Initialize the user-agent pool
        self.ua = UserAgent()
        
        # Initialize the task queue and the result queue
        self.task_queue = queue.Queue()
        self.result_queue = queue.Queue()
        
        # Lock protecting shared statistics
        self.lock = threading.Lock()
        
        # Crawl statistics
        self.stats = {
            'total_urls': 0,
            'success_urls': 0,
            'failed_urls': 0,
            'start_time': time.time()
        }
    
    def get_random_user_agent(self):
        """Return a random User-Agent string."""
        return self.ua.random
    
    def get_random_proxy(self):
        """Return a random proxy from the pool, or None if proxies are disabled."""
        if not self.config['proxy_enabled'] or not self.config['proxies']:
            return None
        return random.choice(self.config['proxies'])
    
    def random_delay(self):
        """Sleep for a random interval to mimic human browsing."""
        delay = random.uniform(*self.config['delay_range'])
        time.sleep(delay)
    
    def make_request(self, url, headers=None, proxies=None):
        """Send an HTTP GET request with retries, rotating User-Agent and proxy."""
        for retry in range(self.config['max_retries']):
            try:
                # Build request headers with a random User-Agent
                default_headers = {
                    'User-Agent': self.get_random_user_agent(),
                    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                    'Accept-Language': 'en-US,en;q=0.5',
                    'Accept-Encoding': 'gzip, deflate',
                    'Connection': 'keep-alive',
                    'Upgrade-Insecure-Requests': '1'
                }
                
                if headers:
                    default_headers.update(headers)
                
                # Pick a random proxy if proxies are enabled and none was supplied
                if not proxies and self.config['proxy_enabled']:
                    proxy = self.get_random_proxy()
                    if proxy:
                        proxies = {'http': proxy, 'https': proxy}
                
                # Send the request
                response = requests.get(
                    url,
                    headers=default_headers,
                    proxies=proxies,
                    timeout=self.config['timeout'],
                    allow_redirects=True
                )
                
                # Raise for non-2xx status codes (e.g. a 403 from an anti-bot block) so they are retried
                response.raise_for_status()
                
                logger.info(f"Successfully fetched: {url}")
                return response
                
            except Timeout:
                logger.warning(f"Timeout for {url}, retry {retry+1}/{self.config['max_retries']}")
            except RequestException as e:
                logger.warning(f"Request failed for {url}: {str(e)}, retry {retry+1}/{self.config['max_retries']}")
            
            # Wait a random interval before retrying
            self.random_delay()
        
        logger.error(f"Failed to fetch {url} after {self.config['max_retries']} retries")
        return None
    
    def parse_page(self, response, url):
        """Parse a page: extract the title and all absolute links."""
        try:
            soup = BeautifulSoup(response.text, 'html.parser')
            
            # Extract the page title
            title = soup.title.string if soup.title else 'No title'
            
            # Collect all links, resolving relative ones against the page URL
            links = []
            for a in soup.find_all('a', href=True):
                href = a['href']
                if href.startswith('http'):
                    links.append(href)
                elif href.startswith('/'):
                    links.append(urljoin(url, href))
            
            return {
                'url': url,
                'title': title,
                'links': links,
                'content_length': len(response.text)
            }
            
        except Exception as e:
            logger.error(f"Failed to parse {url}: {str(e)}")
            return None
    
    def worker(self):
        """Worker thread: fetch and parse URLs until the task queue is empty."""
        while True:
            try:
                url = self.task_queue.get(block=False)
            except queue.Empty:
                break
            
            try:
                # Fetch the page
                response = self.make_request(url)
                
                if response:
                    # Parse the page
                    result = self.parse_page(response, url)
                    
                    if result:
                        self.result_queue.put(result)
                        
                        # Queue newly discovered links (capped per page and overall)
                        for link in result['links'][:10]:
                            with self.lock:
                                if self.stats['total_urls'] < 100:
                                    self.task_queue.put(link)
                                    self.stats['total_urls'] += 1
                        
                        with self.lock:
                            self.stats['success_urls'] += 1
                    else:
                        with self.lock:
                            self.stats['failed_urls'] += 1
                else:
                    with self.lock:
                        self.stats['failed_urls'] += 1
                
                # Random delay between requests
                self.random_delay()
                
            except Exception as e:
                logger.error(f"Worker error: {str(e)}")
            finally:
                # task_done() is only reached after a URL was actually dequeued
                self.task_queue.task_done()
    
    def save_results(self, output_file='results.json'):
        """Save crawl results to JSON and CSV files."""
        results = []
        while not self.result_queue.empty():
            results.append(self.result_queue.get())
        
        # Save as JSON
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(results, f, ensure_ascii=False, indent=2)
        
        # Save as CSV
        csv_file = output_file.replace('.json', '.csv')
        with open(csv_file, 'w', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            writer.writerow(['URL', 'Title', 'Content Length', 'Links Count'])
            for result in results:
                writer.writerow([result['url'], result['title'], result['content_length'], len(result['links'])])
        
        logger.info(f"Saved {len(results)} results to {output_file} and {csv_file}")
    
    def run(self, start_urls):
        """Run the spider starting from the given URLs."""
        logger.info(f"Starting spider with {self.config['max_threads']} threads")
        
        # Seed the task queue with the start URLs
        for url in start_urls:
            self.task_queue.put(url)
            self.stats['total_urls'] += 1
        
        # Create and start worker threads
        threads = []
        for _ in range(self.config['max_threads']):
            t = threading.Thread(target=self.worker)
            t.daemon = True
            t.start()
            threads.append(t)
        
        # Wait until every queued task has been processed
        self.task_queue.join()
        
        # Wait for all worker threads to exit
        for t in threads:
            t.join()
        
        # Save the results
        self.save_results()
        
        # Print statistics
        end_time = time.time()
        elapsed_time = end_time - self.stats['start_time']
        
        logger.info("=== Spider Statistics ===")
        logger.info(f"Total URLs: {self.stats['total_urls']}")
        logger.info(f"Successful URLs: {self.stats['success_urls']}")
        logger.info(f"Failed URLs: {self.stats['failed_urls']}")
        logger.info(f"Elapsed time: {elapsed_time:.2f} seconds")
        logger.info(f"Average time per URL: {elapsed_time / self.stats['total_urls']:.2f} seconds")
        
        return self.stats

def main():
    print("=== Anti-Anti-Crawling Spider ===")
    print("This tool can crawl websites with anti-crawling mechanisms and supports:")
    print("- User-Agent rotation")
    print("- Random delays")
    print("- Automatic retries")
    print("- Multi-threaded crawling")
    print("- Data persistence")
    
    # Get user input
    start_urls = input("Enter start URL(s), separated by commas: ").split(',')
    start_urls = [url.strip() for url in start_urls if url.strip()]
    
    if not start_urls:
        print("Error: please enter at least one URL")
        return
    
    # Configure the spider
    config = {
        'max_threads': 5,
        'timeout': 30,
        'max_retries': 3,
        'delay_range': (1, 3),
        'proxy_enabled': False,
        'proxies': []  # Proxies can be added here, e.g. 'http://ip:port'
    }
    
    # Initialize and run the spider
    spider = AntiCrawlerSpider(config)
    stats = spider.run(start_urls)
    
    print("\n=== Crawl finished ===")
    print(f"Total URLs: {stats['total_urls']}")
    print(f"Successful URLs: {stats['success_urls']}")
    print(f"Failed URLs: {stats['failed_urls']}")
    print("Results saved to results.json and results.csv")

if __name__ == "__main__":
    main()

Before running the spider, install the required dependency packages: requests, beautifulsoup4, and fake_useragent.

pip install requests beautifulsoup4 fake_useragent
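
If you prefer to drive the spider from code instead of the interactive prompt (for example, to enable the proxy pool), a minimal sketch might look like the following. It assumes the code above is saved as anti_crawler_spider.py; the proxy addresses and the start URL are placeholders, not real servers:

from anti_crawler_spider import AntiCrawlerSpider  # assumes the listing above is saved under this name

config = {
    'max_threads': 3,
    'timeout': 30,
    'max_retries': 3,
    'delay_range': (2, 5),        # slightly longer delays for stricter sites
    'proxy_enabled': True,
    'proxies': [
        'http://127.0.0.1:8080',  # placeholder proxies, replace with real ones
        'http://127.0.0.1:8081',
    ],
}

spider = AntiCrawlerSpider(config)
stats = spider.run(['https://example.com'])
print(stats)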
