一、什么是网络爬虫
网络爬虫(Web Crawler)是一种按照一定规则自动抓取互联网信息的程序。简单来说,它就像一只在互联网上"爬行"的蜘蛛,沿着链接不断获取页面内容,提取我们需要的数据。
Python凭借其简洁的语法和丰富的第三方库,成为了实现网络爬虫的首选语言。
二、环境准备
2.1 安装必要的库
bash
pip install requests beautifulsoup4 lxml selenium scrapy
- requests:发送HTTP请求
- beautifulsoup4:解析HTML/XML文档
- lxml:高效的解析引擎
- selenium:模拟浏览器操作
- scrapy:强大的爬虫框架
三、基础爬虫实现
3.1 最简单的爬虫
python
import requests
from bs4 import BeautifulSoup

# Target page plus a browser-like User-Agent so the request is not
# rejected as an obvious bot.
url = 'https://example.com'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
}

# Fetch the page.
response = requests.get(url, headers=headers)

# Parse only on a successful (200) response.
if response.status_code == 200:
    soup = BeautifulSoup(response.text, 'lxml')
    # <title> text of the document.
    title = soup.title.text
    print(f'标题: {title}')
3.2 处理请求参数
python
# GET request carrying query-string parameters.
params = {'keyword': 'python', 'page': 1}
response = requests.get('https://example.com/search', params=params, headers=headers)

# POST request carrying form data.
data = {'username': 'user', 'password': 'pass'}
response = requests.post('https://example.com/login', data=data, headers=headers)
3.3 处理Cookies和Session
python
# A Session reuses one connection and carries cookies between calls.
session = requests.Session()
session.headers.update(headers)

# Log in once; the session stores whatever cookies the server sets.
session.post('https://example.com/login', data=data)

# Subsequent requests send those cookies automatically.
response = session.get('https://example.com/profile')
四、数据解析
4.1 BeautifulSoup解析
python
from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'lxml')

# 1. Lookup by tag name.
title = soup.find('h1')
all_links = soup.find_all('a')

# 2. Lookup by attribute (`class_` avoids clashing with the `class` keyword).
div = soup.find('div', class_='content')
links = soup.find_all('a', {'class': 'title'})

# 3. CSS selectors: select() returns a list, select_one() a single node.
items = soup.select('.item-list > li')
title = soup.select_one('#main-title')
4.2 正则表达式提取
python
import re

# E-mail addresses: local part, "@", domain, dot, TLD of 2+ letters.
email_pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
emails = re.findall(email_pattern, text)

# Mainland-China mobile numbers: leading "1", second digit 3-9, nine more digits.
phone_pattern = r'1[3-9]\d{9}'
phones = re.findall(phone_pattern, text)
4.3 XPath解析
python
from lxml import etree

html = etree.HTML(response.text)

# Every href attribute in the document.
links = html.xpath('//a/@href')

# Text of the first <div class="title"> node.
title = html.xpath('//div[@class="title"]/text()')[0]
五、应对反爬策略
5.1 设置请求头
python
# Browser-like request headers; many sites reject requests without them.
headers = {}
headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
headers['Referer'] = 'https://www.google.com'  # pretend we arrived via a search result
headers['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
headers['Accept-Language'] = 'zh-CN,zh;q=0.9'
headers['Accept-Encoding'] = 'gzip, deflate'
5.2 使用代理IP
python
# Route traffic through a local proxy (e.g. a capture tool on port 8080).
# NOTE(review): requests usually expects an http:// URL for the 'https'
# key as well (CONNECT tunnelling) — confirm this proxy really speaks TLS.
proxies = {
    'http': 'http://127.0.0.1:8080',
    'https': 'https://127.0.0.1:8080'
}
response = requests.get(url, headers=headers, proxies=proxies)
5.3 添加延迟
python
import time
import random

# Pause a random 1-3 seconds between requests so the traffic pattern
# looks less robotic and puts less strain on the server.
time.sleep(random.uniform(1, 3))
5.4 使用Selenium处理动态页面
python
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Launch a real Chrome instance and load the page.
driver = webdriver.Chrome()
driver.get('https://example.com')

# Block (up to 10 s) until the element with id="content" is present in
# the DOM — i.e. until the JavaScript-rendered part of the page exists.
element = WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.ID, "content"))
)

# The source now includes the dynamically generated markup.
html = driver.page_source
driver.quit()
六、完整实战案例
爬取豆瓣电影Top250
python
import requests
from bs4 import BeautifulSoup
import time
import random
import csv
class DoubanSpider:
    """Scrape the Douban Top-250 movie chart and dump it to a CSV file."""

    def __init__(self):
        self.base_url = 'https://movie.douban.com/top250'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
        # Accumulated rows; one dict per movie.
        self.movies = []

    def get_page(self, start):
        """Fetch one chart page.

        start: 0-based offset of the first movie on the page (0, 25, ...).
        Returns the HTML text, or None if the request fails.
        """
        params = {'start': start}
        try:
            response = requests.get(self.base_url, params=params,
                                    headers=self.headers, timeout=10)
            response.raise_for_status()
            return response.text
        except Exception as e:
            print(f"请求失败: {e}")
            return None

    def parse_page(self, html):
        """Extract rank/title/rating/vote-count/quote rows from one page."""
        soup = BeautifulSoup(html, 'lxml')
        items = soup.select('.grid_view .item')
        for item in items:
            rank = item.select_one('.pic em').text
            title = item.select_one('.title').text
            rating = item.select_one('.rating_num').text
            # Vote count renders like "123456人评价"; drop the 3-char suffix.
            people_num = item.select_one('.star span:last-child').text[:-3]
            # The one-line quote is absent for some movies.
            quote_tag = item.select_one('.inq')
            quote = quote_tag.text if quote_tag else '暂无简介'
            self.movies.append({
                'rank': rank,
                'title': title,
                'rating': rating,
                'people_num': people_num,
                'quote': quote
            })

    def save_to_csv(self, filename='douban_top250.csv'):
        """Write the collected movies to `filename` (UTF-8 with BOM for Excel)."""
        with open(filename, 'w', newline='', encoding='utf-8-sig') as f:
            writer = csv.DictWriter(f, fieldnames=['rank', 'title', 'rating', 'people_num', 'quote'])
            writer.writeheader()
            writer.writerows(self.movies)
        # BUG FIX: the message previously printed a literal placeholder
        # instead of the actual output file name.
        print(f"已保存 {len(self.movies)} 条数据到 {filename}")

    def run(self):
        """Crawl all 10 pages (25 movies each), then save the CSV."""
        for start in range(0, 250, 25):
            print(f"正在爬取第 {start//25 + 1} 页...")
            html = self.get_page(start)
            if html:
                self.parse_page(html)
            # Random pause between pages to be polite to the server.
            time.sleep(random.uniform(1, 2))
        self.save_to_csv()
        print("爬取完成!")
if __name__ == '__main__':
    # Entry point: build the spider and crawl the full chart.
    DoubanSpider().run()
七、高级技巧
7.1 多线程爬虫
python
from concurrent.futures import ThreadPoolExecutor, as_completed
import threading
class MultiThreadSpider:
    """Fetch many URLs concurrently with a thread pool."""

    def __init__(self, max_workers=5):
        # Worker-thread count; the lock is available for callers that need
        # to guard shared state touched from worker threads.
        self.max_workers = max_workers
        self.lock = threading.Lock()

    def fetch_url(self, url):
        """Download one URL and return the response body as text."""
        response = requests.get(url, headers=headers)
        return response.text

    def run(self, urls):
        """Fetch every URL in `urls`; return bodies of the successful fetches."""
        results = []
        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            # Map each future back to its URL so progress can be reported.
            future_to_url = {executor.submit(self.fetch_url, u): u for u in urls}
            for future in as_completed(future_to_url):
                url = future_to_url[future]
                try:
                    data = future.result()
                    results.append(data)
                    print(f"成功: {url}")
                except Exception as e:
                    print(f"失败: {url}, 错误: {e}")
        return results
7.2 使用Scrapy框架
python
# scrapy项目结构
# myproject/
# spiders/
# __init__.py
# movie_spider.py
# items.py
# pipelines.py
# settings.py
# items.py
import scrapy


class MovieItem(scrapy.Item):
    """Container for one scraped movie record."""
    title = scrapy.Field()   # movie name
    rating = scrapy.Field()  # score as shown on the page
    url = scrapy.Field()     # detail-page link
# movie_spider.py
import scrapy
from myproject.items import MovieItem


class MovieSpider(scrapy.Spider):
    """Crawl the movie listing page and yield one MovieItem per entry."""
    name = 'movie'
    start_urls = ['https://example.com/movies']

    def parse(self, response):
        # Each .movie-item node holds one movie card.
        for movie in response.css('.movie-item'):
            item = MovieItem()
            item['title'] = movie.css('.title::text').get()
            item['rating'] = movie.css('.rating::text').get()
            item['url'] = movie.css('a::attr(href)').get()
            yield item
八、注意事项与法律风险
8.1 遵守robots协议
python
from urllib.robotparser import RobotFileParser

# Download and parse the site's robots.txt, then ask whether our bot
# is allowed to fetch a given URL before crawling it.
rp = RobotFileParser()
rp.set_url('https://example.com/robots.txt')
rp.read()

if rp.can_fetch('MyBot', 'https://example.com/page'):
    pass  # allowed to crawl this URL
8.2 注意事项
- 尊重网站规则:查看robots.txt,遵守爬取限制
- 控制爬取频率:不要对服务器造成压力
- 数据使用合规:获取的数据不要用于商业目的或侵犯隐私
- 添加身份标识:在User-Agent中标识自己的爬虫
- 保留版权信息:尊重原创内容的版权
8.3 法律红线
- 不要爬取涉及个人隐私的数据
- 不要破解反爬措施(如验证码、加密)
- 不要对网站造成DDoS攻击般的压力
- 不要将爬取的数据用于违法活动
九、总结
Python爬虫开发是一个循序渐进的过程:
- 初级阶段:掌握requests + BeautifulSoup,能够爬取静态页面
- 中级阶段:学习Selenium处理动态页面,应对反爬策略
- 高级阶段:掌握Scrapy框架、分布式爬虫、大规模数据抓取
爬虫技术是把双刃剑,合理使用能提高工作效率,滥用则会带来法律风险。建议在实际开发中:
- 优先使用官方API
- 爬取前阅读网站的robots.txt
- 控制请求频率,添加延迟
- 数据仅用于学习和研究
希望这篇文章能帮助你快速上手Python爬虫开发!