1. Crawler Overview
What is a web crawler?
A web crawler is a program that automatically browses the internet and collects information. It visits pages according to a set of rules and extracts the data it needs, and it is widely used for search engines, data analysis, price monitoring, and similar tasks.
How a crawler works
- Send an HTTP request to fetch the page content
- Parse the page and extract the useful information
- Store the extracted data
- Follow the page's links to crawl further pages (see the sketch after this list)
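To make these four steps concrete, here is a minimal sketch of that loop using the requests and BeautifulSoup libraries introduced in the sections below. The function name `crawl`, the `max_pages` limit, and the link-filtering logic are illustrative placeholders, not a production design; a real crawler also needs robots.txt checks and error handling (see later sections).

```python
import time
import requests
from bs4 import BeautifulSoup

def crawl(start_url, max_pages=3):
    """Tiny illustration of the fetch -> parse -> store -> follow-links loop."""
    to_visit, seen, records = [start_url], set(), []
    while to_visit and len(seen) < max_pages:
        url = to_visit.pop()
        if url in seen:
            continue
        seen.add(url)
        response = requests.get(url, timeout=10)      # 1. fetch the page
        soup = BeautifulSoup(response.text, 'lxml')   # 2. parse it
        records.append({                               # 3. store the extracted data
            'url': url,
            'title': soup.title.text if soup.title else ''
        })
        for a in soup.find_all('a', href=True):        # 4. follow links
            if a['href'].startswith('http'):
                to_visit.append(a['href'])
        time.sleep(1)  # be polite between requests
    return records
```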
2. Environment Setup
```python
# Install the required libraries first:
# pip install requests beautifulsoup4 lxml selenium scrapy
import requests
from bs4 import BeautifulSoup
import json
import csv
import time
import random
```
3. HTTP Basics
HTTP request methods
```python
import requests

# GET request
response = requests.get('https://httpbin.org/get')
print(f"Status code: {response.status_code}")
print(f"Response body: {response.text}")

# POST request
data = {'key': 'value'}
response = requests.post('https://httpbin.org/post', data=data)
print(f"POST response: {response.json()}")

# GET request with query parameters
params = {'key1': 'value1', 'key2': 'value2'}
response = requests.get('https://httpbin.org/get', params=params)
print(f"Response with parameters: {response.json()}")
```
HTTP headers
```python
# Set custom request headers
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
    'Accept-Encoding': 'gzip, deflate',
    'Connection': 'keep-alive',
}

response = requests.get('https://httpbin.org/headers', headers=headers)
print(f"Request headers as seen by the server: {response.json()}")
```
4. Sending Requests with the Requests Library
Basic requests
```python
import requests

# Simple GET request with basic error handling
def simple_get(url):
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()  # raise an exception for 4xx/5xx status codes
        return response.text
    except requests.exceptions.RequestException as e:
        print(f"Request error: {e}")
        return None

# Example usage
html_content = simple_get('https://httpbin.org/html')
if html_content:
    print("Page fetched successfully!")
    print(html_content)
```
Sessions and cookies
```python
# Use a session to persist cookies across requests
session = requests.Session()

# Login payload (placeholder credentials)
login_data = {
    'username': 'your_username',
    'password': 'your_password'
}

# Log in first
login_response = session.post('https://httpbin.org/post', data=login_data)
print(f"Login response: {login_response.status_code}")

# Reuse the same session to access a page that requires login
profile_response = session.get('https://httpbin.org/get')
print(f"Profile page: {profile_response.status_code}")
```
5. Parsing HTML - BeautifulSoup
Basic usage
```python
from bs4 import BeautifulSoup

def parse_html_demo():
    html_doc = """
    <html>
    <head><title>Test Page</title></head>
    <body>
        <div class="content">
            <h1 id="title">Main Title</h1>
            <p class="text">First paragraph</p>
            <p class="text special">Special text</p>
            <a href="https://example.com">A link</a>
            <ul>
                <li>Item 1</li>
                <li>Item 2</li>
                <li>Item 3</li>
            </ul>
        </div>
    </body>
    </html>
    """
    soup = BeautifulSoup(html_doc, 'lxml')

    # Find by tag name
    title = soup.title
    print(f"Title: {title.text}")

    # Find by ID
    main_title = soup.find('h1', id='title')
    print(f"Main title: {main_title.text}")

    # Find by class
    texts = soup.find_all('p', class_='text')
    for i, text in enumerate(texts, 1):
        print(f"Paragraph {i}: {text.text}")

    # Read an attribute
    link = soup.find('a')
    print(f"Link URL: {link['href']}")

    # CSS selectors
    special_text = soup.select_one('.special')
    print(f"Special text: {special_text.text}")

    list_items = soup.select('ul li')
    for item in list_items:
        print(f"List item: {item.text}")

parse_html_demo()
```
Hands-on: scraping news headlines
```python
def crawl_news_demo():
    # Simplified demo using an inline HTML snippet
    sample_html = """
    <div class="news-list">
        <div class="news-item">
            <h2><a href="/news/1">Python 3.11 released with major performance gains</a></h2>
            <span class="date">2023-10-01</span>
        </div>
        <div class="news-item">
            <h2><a href="/news/2">New breakthrough in artificial intelligence</a></h2>
            <span class="date">2023-10-02</span>
        </div>
    </div>
    """
    soup = BeautifulSoup(sample_html, 'lxml')
    news_items = soup.find_all('div', class_='news-item')

    news_list = []
    for item in news_items:
        title_link = item.find('h2').find('a')
        title = title_link.text
        url = title_link['href']
        date = item.find('span', class_='date').text
        news_list.append({
            'title': title,
            'url': url,
            'date': date
        })
    return news_list

news = crawl_news_demo()
for item in news:
    print(f"Title: {item['title']}, Date: {item['date']}, URL: {item['url']}")
```
6. Data Storage
Saving to a CSV file
```python
import csv

def save_to_csv(data, filename):
    if not data:
        print("No data to save")
        return
    # Use the keys of the first record as the column names
    fieldnames = data[0].keys()
    with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(data)
    print(f"Data saved to {filename}")

# Example usage
sample_data = [
    {'name': '张三', 'age': 25, 'city': '北京'},
    {'name': '李四', 'age': 30, 'city': '上海'},
    {'name': '王五', 'age': 28, 'city': '广州'}
]
save_to_csv(sample_data, 'people.csv')
```
Saving to a JSON file
```python
import json

def save_to_json(data, filename):
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
    print(f"Data saved to {filename}")

# Example usage
save_to_json(sample_data, 'people.json')
```
Saving to a text file
```python
def save_to_txt(data, filename):
    with open(filename, 'w', encoding='utf-8') as f:
        for item in data:
            line = f"Name: {item['name']}, Age: {item['age']}, City: {item['city']}\n"
            f.write(line)
    print(f"Data saved to {filename}")

save_to_txt(sample_data, 'people.txt')
```
7. Handling Dynamic Content - Selenium
Selenium basics
```python
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options

def setup_driver():
    """Configure the Chrome driver."""
    chrome_options = Options()
    chrome_options.add_argument('--headless')  # headless mode
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    driver = webdriver.Chrome(options=chrome_options)
    return driver

def dynamic_crawl_example():
    """Dynamic crawling example."""
    driver = setup_driver()
    try:
        driver.get("https://httpbin.org/html")

        # Wait until the element is present
        wait = WebDriverWait(driver, 10)
        element = wait.until(
            EC.presence_of_element_located((By.TAG_NAME, "h1"))
        )

        # Get the page title
        print(f"Page title: {driver.title}")

        # Find an element
        h1_element = driver.find_element(By.TAG_NAME, "h1")
        print(f"H1 content: {h1_element.text}")

        # Execute JavaScript
        script_result = driver.execute_script("return document.title;")
        print(f"Title via JS: {script_result}")
    finally:
        driver.quit()

dynamic_crawl_example()
```
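When a page is rendered by JavaScript, a common pattern is to let Selenium render it and then hand the final HTML to BeautifulSoup for parsing. Below is a minimal sketch of that combination, reusing the `setup_driver()` helper defined above; `render_and_parse` is an illustrative name, not part of any library.

```python
from bs4 import BeautifulSoup

def render_and_parse(url):
    """Render a page with Selenium, then parse the resulting HTML with BeautifulSoup."""
    driver = setup_driver()  # helper defined above
    try:
        driver.get(url)
        # driver.page_source contains the DOM after JavaScript has run
        soup = BeautifulSoup(driver.page_source, 'lxml')
        return [a.get('href') for a in soup.find_all('a') if a.get('href')]
    finally:
        driver.quit()

# Example usage
links = render_and_parse('https://httpbin.org/html')
print(f"Links found: {links}")
```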
8. Crawler Etiquette and Anti-Scraping Measures
Adding delays and randomizing the User-Agent
```python
import time
import random
import requests
from fake_useragent import UserAgent  # pip install fake-useragent

class PoliteCrawler:
    def __init__(self):
        self.ua = UserAgent()
        self.session = requests.Session()

    def get_with_delay(self, url, delay=1):
        """Send a request after a randomized delay."""
        headers = {
            'User-Agent': self.ua.random
        }
        time.sleep(delay + random.uniform(0, 1))  # base delay plus random jitter
        try:
            response = self.session.get(url, headers=headers)
            response.raise_for_status()
            return response
        except requests.exceptions.RequestException as e:
            print(f"Request failed: {e}")
            return None

# Example usage
crawler = PoliteCrawler()
response = crawler.get_with_delay('https://httpbin.org/user-agent', delay=2)
if response:
    print(f"User-Agent used: {response.json()}")
```