1. Crawler Overview
What is a web crawler?
A web crawler is a program that automatically browses the internet and collects information. It visits pages according to a set of rules and extracts the data it needs, and it is widely used for search engines, data analysis, price monitoring, and similar tasks.
How a crawler works
- Send an HTTP request to fetch the page content
- Parse the page and extract the useful information
- Store the extracted data
- Follow the page's links to crawl further pages (see the sketch after this list)
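To make these four steps concrete, here is a minimal sketch of that loop using the requests and BeautifulSoup libraries introduced in the sections below. The function name `crawl`, the `max_pages` limit, and the link-filtering logic are illustrative placeholders, not a production design; a real crawler also needs robots.txt checks and error handling (see later sections).

```python
import time
import requests
from bs4 import BeautifulSoup

def crawl(start_url, max_pages=3):
    """Tiny illustration of the fetch -> parse -> store -> follow-links loop."""
    to_visit, seen, records = [start_url], set(), []
    while to_visit and len(seen) < max_pages:
        url = to_visit.pop()
        if url in seen:
            continue
        seen.add(url)
        response = requests.get(url, timeout=10)      # 1. fetch the page
        soup = BeautifulSoup(response.text, 'lxml')   # 2. parse it
        records.append({                               # 3. store the extracted data
            'url': url,
            'title': soup.title.text if soup.title else ''
        })
        for a in soup.find_all('a', href=True):        # 4. follow links
            if a['href'].startswith('http'):
                to_visit.append(a['href'])
        time.sleep(1)  # be polite between requests
    return records
```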
2. Environment Setup
```python
# Install the required libraries first:
# pip install requests beautifulsoup4 lxml selenium scrapy
import requests
from bs4 import BeautifulSoup
import json
import csv
import time
import random
```
3. HTTP Basics
HTTP request methods
```python
import requests

# GET request
response = requests.get('https://httpbin.org/get')
print(f"Status code: {response.status_code}")
print(f"Response body: {response.text}")

# POST request
data = {'key': 'value'}
response = requests.post('https://httpbin.org/post', data=data)
print(f"POST response: {response.json()}")

# GET request with query parameters
params = {'key1': 'value1', 'key2': 'value2'}
response = requests.get('https://httpbin.org/get', params=params)
print(f"Response with parameters: {response.json()}")
```
HTTP headers
```python
# Set custom request headers
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
    'Accept-Encoding': 'gzip, deflate',
    'Connection': 'keep-alive',
}

response = requests.get('https://httpbin.org/headers', headers=headers)
print(f"Request headers as seen by the server: {response.json()}")
```
4. Sending Requests with the Requests Library
Basic requests
```python
import requests

# Simple GET request with basic error handling
def simple_get(url):
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()  # raise an exception for 4xx/5xx status codes
        return response.text
    except requests.exceptions.RequestException as e:
        print(f"Request error: {e}")
        return None

# Example usage
html_content = simple_get('https://httpbin.org/html')
if html_content:
    print("Page fetched successfully!")
    print(html_content)
```
Sessions and cookies
```python
# Use a session to persist cookies across requests
session = requests.Session()

# Login payload (placeholder credentials)
login_data = {
    'username': 'your_username',
    'password': 'your_password'
}

# Log in first
login_response = session.post('https://httpbin.org/post', data=login_data)
print(f"Login response: {login_response.status_code}")

# Reuse the same session to access a page that requires login
profile_response = session.get('https://httpbin.org/get')
print(f"Profile page: {profile_response.status_code}")
```
5. Parsing HTML - BeautifulSoup
Basic usage
```python
from bs4 import BeautifulSoup

def parse_html_demo():
    html_doc = """
    <html>
    <head><title>Test Page</title></head>
    <body>
        <div class="content">
            <h1 id="title">Main Title</h1>
            <p class="text">First paragraph</p>
            <p class="text special">Special text</p>
            <a href="https://example.com">A link</a>
            <ul>
                <li>Item 1</li>
                <li>Item 2</li>
                <li>Item 3</li>
            </ul>
        </div>
    </body>
    </html>
    """
    soup = BeautifulSoup(html_doc, 'lxml')

    # Find by tag name
    title = soup.title
    print(f"Title: {title.text}")

    # Find by ID
    main_title = soup.find('h1', id='title')
    print(f"Main title: {main_title.text}")

    # Find by class
    texts = soup.find_all('p', class_='text')
    for i, text in enumerate(texts, 1):
        print(f"Paragraph {i}: {text.text}")

    # Read an attribute
    link = soup.find('a')
    print(f"Link URL: {link['href']}")

    # CSS selectors
    special_text = soup.select_one('.special')
    print(f"Special text: {special_text.text}")

    list_items = soup.select('ul li')
    for item in list_items:
        print(f"List item: {item.text}")

parse_html_demo()
```
Hands-on: scraping news headlines
```python
def crawl_news_demo():
    # Simplified demo using an inline HTML snippet
    sample_html = """
    <div class="news-list">
        <div class="news-item">
            <h2><a href="/news/1">Python 3.11 released with major performance gains</a></h2>
            <span class="date">2023-10-01</span>
        </div>
        <div class="news-item">
            <h2><a href="/news/2">New breakthrough in artificial intelligence</a></h2>
            <span class="date">2023-10-02</span>
        </div>
    </div>
    """
    soup = BeautifulSoup(sample_html, 'lxml')
    news_items = soup.find_all('div', class_='news-item')

    news_list = []
    for item in news_items:
        title_link = item.find('h2').find('a')
        title = title_link.text
        url = title_link['href']
        date = item.find('span', class_='date').text
        news_list.append({
            'title': title,
            'url': url,
            'date': date
        })
    return news_list

news = crawl_news_demo()
for item in news:
    print(f"Title: {item['title']}, Date: {item['date']}, URL: {item['url']}")
```
6. Data Storage
Saving to a CSV file
```python
import csv

def save_to_csv(data, filename):
    if not data:
        print("No data to save")
        return
    # Use the keys of the first record as the column names
    fieldnames = data[0].keys()
    with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(data)
    print(f"Data saved to {filename}")

# Example usage
sample_data = [
    {'name': '张三', 'age': 25, 'city': '北京'},
    {'name': '李四', 'age': 30, 'city': '上海'},
    {'name': '王五', 'age': 28, 'city': '广州'}
]
save_to_csv(sample_data, 'people.csv')
```
Saving to a JSON file
```python
import json

def save_to_json(data, filename):
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
    print(f"Data saved to {filename}")

# Example usage
save_to_json(sample_data, 'people.json')
```
Saving to a text file
```python
def save_to_txt(data, filename):
    with open(filename, 'w', encoding='utf-8') as f:
        for item in data:
            line = f"Name: {item['name']}, Age: {item['age']}, City: {item['city']}\n"
            f.write(line)
    print(f"Data saved to {filename}")

save_to_txt(sample_data, 'people.txt')
```
7. Handling Dynamic Content - Selenium
Selenium basics
```python
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options

def setup_driver():
    """Configure the Chrome driver."""
    chrome_options = Options()
    chrome_options.add_argument('--headless')  # headless mode
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    driver = webdriver.Chrome(options=chrome_options)
    return driver

def dynamic_crawl_example():
    """Dynamic crawling example."""
    driver = setup_driver()
    try:
        driver.get("https://httpbin.org/html")

        # Wait until the element is present
        wait = WebDriverWait(driver, 10)
        element = wait.until(
            EC.presence_of_element_located((By.TAG_NAME, "h1"))
        )

        # Get the page title
        print(f"Page title: {driver.title}")

        # Find an element
        h1_element = driver.find_element(By.TAG_NAME, "h1")
        print(f"H1 content: {h1_element.text}")

        # Execute JavaScript
        script_result = driver.execute_script("return document.title;")
        print(f"Title via JS: {script_result}")
    finally:
        driver.quit()

dynamic_crawl_example()
```
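When a page is rendered by JavaScript, a common pattern is to let Selenium render it and then hand the final HTML to BeautifulSoup for parsing. Below is a minimal sketch of that combination, reusing the `setup_driver()` helper defined above; `render_and_parse` is an illustrative name, not part of any library.

```python
from bs4 import BeautifulSoup

def render_and_parse(url):
    """Render a page with Selenium, then parse the resulting HTML with BeautifulSoup."""
    driver = setup_driver()  # helper defined above
    try:
        driver.get(url)
        # driver.page_source contains the DOM after JavaScript has run
        soup = BeautifulSoup(driver.page_source, 'lxml')
        return [a.get('href') for a in soup.find_all('a') if a.get('href')]
    finally:
        driver.quit()

# Example usage
links = render_and_parse('https://httpbin.org/html')
print(f"Links found: {links}")
```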
8. Crawler Etiquette and Anti-Scraping Measures
Adding delays and randomizing the User-Agent
```python
import time
import random
import requests
from fake_useragent import UserAgent  # pip install fake-useragent

class PoliteCrawler:
    def __init__(self):
        self.ua = UserAgent()
        self.session = requests.Session()

    def get_with_delay(self, url, delay=1):
        """Send a request after a randomized delay."""
        headers = {
            'User-Agent': self.ua.random
        }
        time.sleep(delay + random.uniform(0, 1))  # base delay plus random jitter
        try:
            response = self.session.get(url, headers=headers)
            response.raise_for_status()
            return response
        except requests.exceptions.RequestException as e:
            print(f"Request failed: {e}")
            return None

# Example usage
crawler = PoliteCrawler()
response = crawler.get_with_delay('https://httpbin.org/user-agent', delay=2)
if response:
    print(f"User-Agent used: {response.json()}")
```