结合 Selenium 浏览器自动化与 AI 大模型能力,构建能够自动识别反爬机制、智能解析页面的新一代爬虫系统。
1. 系统架构
验证码
登录墙
正常页面
种子 URL 队列
调度器
Selenium WebDriver
反检测模块
页面渲染
AI 反爬识别
AI 验证码破解
自动登录
AI 数据提取
数据清洗管道
存储
MongoDB / CSV
数据看板
2. 反爬机制分布
35% 25% 20% 10% 7% 3% 常见反爬机制占比(Top 500 网站统计) JS 动态渲染 请求频率限制 验证码(图形/滑块) User-Agent 检测 IP 封禁 其他(指纹等)
3. 项目结构
ai-spider/
├── spider/
│ ├── __init__.py
│ ├── browser.py # 浏览器管理
│ ├── anti_detect.py # 反检测策略
│ ├── ai_analyzer.py # AI 页面分析
│ ├── captcha_solver.py # 验证码破解
│ ├── extractor.py # 智能数据提取
│ ├── pipeline.py # 数据管道
│ └── config.py # 配置(API Key、目标 URL、提取字段)
├── main.py
└── requirements.txt
4. 核心代码实现
4.1 浏览器管理与反检测
python
# spider/browser.py
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from fake_useragent import UserAgent
import random
import time
class StealthBrowser:
"""隐身浏览器 ------ 绕过常见指纹检测"""
def __init__(self, headless: bool = True):
self.options = Options()
self.ua = UserAgent()
if headless:
self.options.add_argument("--headless=new")
# 基础反检测参数
self.options.add_argument(f"--user-agent={self.ua.random}")
self.options.add_argument("--disable-blink-features=AutomationControlled")
self.options.add_argument("--no-sandbox")
self.options.add_argument("--disable-dev-shm-usage")
self.options.add_argument("--window-size=1920,1080")
# 排除自动化开关
self.options.add_experimental_option("excludeSwitches",
["enable-automation"])
self.options.add_experimental_option("useAutomationExtension", False)
def create_driver(self) -> webdriver.Chrome:
driver = webdriver.Chrome(options=self.options)
# 注入反检测 JS ------ 覆盖 navigator.webdriver
driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
"source": """
Object.defineProperty(navigator, 'webdriver', {
get: () => undefined
});
// 覆盖 chrome 对象
window.chrome = {
runtime: {},
loadTimes: function() {},
csi: function() {},
app: {}
};
// 覆盖 permissions
const originalQuery = window.navigator.permissions.query;
window.navigator.permissions.query = (parameters) =>
parameters.name === 'notifications'
? Promise.resolve({ state: Notification.permission })
: originalQuery(parameters);
"""
})
return driver
@staticmethod
def human_like_delay(min_sec: float = 0.5, max_sec: float = 2.5):
"""模拟人类操作间隔"""
time.sleep(random.uniform(min_sec, max_sec))
4.2 AI 反爬识别
python
# spider/ai_analyzer.py
from openai import OpenAI
from selenium.webdriver.remote.webelement import WebElement
import json
class AIAnalyzer:
    """Use an LLM to classify page state (anti-crawl) and extract structured data."""

    def __init__(self, api_key: str):
        self.client = OpenAI(api_key=api_key)

    @staticmethod
    def _parse_json(text: str, fallback: dict) -> dict:
        """Parse a model reply as JSON, returning *fallback* on failure.

        Models frequently wrap JSON in ```json fences despite being told
        not to; strip a leading/trailing fence before parsing so such
        replies don't silently fall back to the default.
        """
        cleaned = text.strip()
        if cleaned.startswith("```"):
            # Drop the opening fence line (which may carry a language tag)...
            cleaned = cleaned.split("\n", 1)[-1]
            if cleaned.rstrip().endswith("```"):
                # ...and the closing fence.
                cleaned = cleaned.rstrip()[:-3]
        try:
            return json.loads(cleaned)
        except json.JSONDecodeError:
            return fallback

    def detect_anti_crawl(self, page_source: str, current_url: str) -> dict:
        """Ask the model whether the current page triggered an anti-crawl wall.

        Returns a dict with keys: is_blocked, block_type, confidence,
        suggested_action, description. Falls back to a "not blocked"
        default when the reply is not parseable JSON.
        """
        prompt = f"""
分析以下网页内容,判断是否触发了反爬机制。请返回 JSON 格式结果。
当前 URL: {current_url}
页面内容(前 3000 字符):
{page_source[:3000]}
请返回以下格式:
{{
"is_blocked": true/false,
"block_type": "none/captcha/login/paywall/rate_limit/ip_ban",
"confidence": 0.0-1.0,
"suggested_action": "none/solve_captcha/login/change_ip/wait",
"description": "简要描述"
}}
只返回 JSON,不要其他内容。
"""
        response = self.client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.1,
        )
        return self._parse_json(response.choices[0].message.content, {
            "is_blocked": False,
            "block_type": "none",
            "confidence": 0,
            "suggested_action": "none",
            "description": "解析失败,默认正常",
        })

    def extract_data(self, page_source: str, fields: list[dict]) -> dict:
        """Extract the requested fields from the page via the LLM.

        Args:
            page_source: page HTML
            fields: field specs, e.g. [{"name": "title", "desc": "文章标题"}]

        Returns:
            dict keyed by field name; empty dict when the reply is unparseable.
        """
        field_desc = "\n".join(
            f'- "{f["name"]}": {f["desc"]}' for f in fields
        )
        prompt = f"""
从以下网页内容中提取数据。
需要提取的字段:
{field_desc}
网页内容:
{page_source[:6000]}
请返回 JSON,键为字段名,值为提取的内容。如无法提取则填 null。
只返回 JSON。
"""
        response = self.client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{"role": "user", "content": prompt}],
            temperature=0,
        )
        return self._parse_json(response.choices[0].message.content, {})
4.3 验证码破解模块
python
# spider/captcha_solver.py
import base64
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from openai import OpenAI
class CaptchaSolver:
    """AI-driven captcha solver for image and slider captchas."""

    def __init__(self, api_key: str):
        self.client = OpenAI(api_key=api_key)

    def solve_image_captcha(self, driver, captcha_element) -> str:
        """Recognize an image captcha with the vision model, fill it in, submit.

        Returns the recognized captcha text.
        """
        # NOTE: screenshot_as_png is a property, not a method — calling it
        # as a function would raise "TypeError: 'bytes' object is not callable".
        screenshot = captcha_element.screenshot_as_png
        b64_image = base64.b64encode(screenshot).decode()
        response = self.client.chat.completions.create(
            model="gpt-4o",
            messages=[{
                "role": "user",
                "content": [
                    {"type": "text", "text": "请识别图片中的验证码文字,只返回验证码内容"},
                    {"type": "image_url",
                     "image_url": {"url": f"data:image/png;base64,{b64_image}"}},
                ],
            }],
            max_tokens=20,
        )
        captcha_text = response.choices[0].message.content.strip()
        # Fill in the captcha.
        input_box = driver.find_element(
            By.CSS_SELECTOR, "input[name*='captcha'], input[name*='verify']"
        )
        input_box.clear()
        input_box.send_keys(captcha_text)
        # Click submit.
        submit_btn = driver.find_element(
            By.CSS_SELECTOR, "button[type='submit'], input[type='submit']"
        )
        submit_btn.click()
        time.sleep(2)
        return captcha_text

    def handle_slider_captcha(self, driver) -> bool:
        """Handle a slider captcha (simplified); True if the drag was performed."""
        try:
            slider = WebDriverWait(driver, 5).until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, ".slider-btn, .slide-block")
                )
            )
            from selenium.webdriver.common.action_chains import ActionChains
            import random
            # Queue the whole human-like drag as ONE chain and perform() it
            # once at the end: calling perform() repeatedly on the same
            # ActionChains replays every previously queued action each time,
            # producing an erratic (and easily detected) drag.
            actions = ActionChains(driver)
            actions.click_and_hold(slider)
            total_distance = 260  # typical slider travel distance
            steps = random.randint(15, 25)
            for _ in range(steps):
                offset = total_distance / steps + random.uniform(-3, 3)
                actions.move_by_offset(offset, random.uniform(-2, 2))
                # pause() inside the chain replaces per-step time.sleep().
                actions.pause(random.uniform(0.02, 0.08))
            actions.release()
            actions.perform()
            time.sleep(2)
            return True
        except Exception:
            # Best-effort: no slider found or the drag failed — report failure.
            return False
4.4 智能数据提取管道
python
# spider/extractor.py
import csv
import json
from pathlib import Path
from datetime import datetime
class DataPipeline:
"""数据清洗与存储管道"""
def __init__(self, output_dir: str = "./output"):
self.output_dir = Path(output_dir)
self.output_dir.mkdir(exist_ok=True)
self.results: list[dict] = []
def process(self, raw_data: dict) -> dict:
"""数据清洗"""
cleaned = {}
for key, value in raw_data.items():
if value is None:
continue
# 去除多余空白
if isinstance(value, str):
value = " ".join(value.split())
cleaned[key] = value
cleaned["_crawled_at"] = datetime.now().isoformat()
self.results.append(cleaned)
return cleaned
def save_csv(self, filename: str = None):
"""导出 CSV"""
if not self.results:
return
filename = filename or f"data_{datetime.now().strftime('%Y%m%d_%H%M')}.csv"
filepath = self.output_dir / filename
with open(filepath, "w", newline="", encoding="utf-8-sig") as f:
writer = csv.DictWriter(f, fieldnames=self.results[0].keys())
writer.writeheader()
writer.writerows(self.results)
print(f"已保存 {len(self.results)} 条数据到 {filepath}")
def save_json(self, filename: str = None):
"""导出 JSON"""
filename = filename or f"data_{datetime.now().strftime('%Y%m%d_%H%M')}.json"
filepath = self.output_dir / filename
with open(filepath, "w", encoding="utf-8") as f:
json.dump(self.results, f, ensure_ascii=False, indent=2)
print(f"已保存 {len(self.results)} 条数据到 {filepath}")
4.5 爬虫主控流程
python
# main.py
from spider.browser import StealthBrowser
from spider.ai_analyzer import AIAnalyzer
from spider.captcha_solver import CaptchaSolver
from spider.extractor import DataPipeline
from spider.config import OPENAI_API_KEY, TARGET_URLS, FIELDS_TO_EXTRACT
import time
class AISpider:
    """Main controller: orchestrates the stealth browser, AI analysis,
    captcha solving and the data pipeline over a list of target URLs."""

    def __init__(self):
        self.browser_manager = StealthBrowser(headless=False)
        self.analyzer = AIAnalyzer(api_key=OPENAI_API_KEY)
        self.captcha = CaptchaSolver(api_key=OPENAI_API_KEY)
        self.pipeline = DataPipeline()

    def crawl_page(self, url: str) -> dict | None:
        """Crawl one page; returns the cleaned record or None on failure.

        A fresh driver is created per page and always quit in finally.
        """
        driver = self.browser_manager.create_driver()
        try:
            print(f"正在访问: {url}")
            driver.get(url)
            StealthBrowser.human_like_delay(2, 4)
            # AI anti-crawl detection.
            analysis = self.analyzer.detect_anti_crawl(
                driver.page_source, driver.current_url
            )
            print(f"反爬分析: {analysis}")
            # The analysis dict comes from an LLM and may be missing keys —
            # use .get() instead of [] so a partial reply can't KeyError.
            if analysis.get("is_blocked"):
                action = analysis.get("suggested_action", "none")
                if action == "solve_captcha":
                    print("检测到验证码,尝试破解...")
                    self.captcha.handle_slider_captcha(driver)
                    time.sleep(3)
                elif action == "wait":
                    print("触发频率限制,等待后重试...")
                    time.sleep(30)
                    driver.get(url)
                elif action in ("change_ip", "ip_ban"):
                    print("IP 被封,需要切换代理")
                    return None
            # AI structured-data extraction.
            raw_data = self.analyzer.extract_data(
                driver.page_source, FIELDS_TO_EXTRACT
            )
            print(f"提取到数据: {raw_data}")
            # Clean and accumulate for later export.
            cleaned = self.pipeline.process(raw_data)
            return cleaned
        except Exception as e:
            print(f"爬取失败 [{url}]: {e}")
            return None
        finally:
            driver.quit()

    def run(self):
        """Batch-crawl TARGET_URLS with randomized human-like gaps, then export."""
        print("=" * 50)
        print("AI 智能爬虫启动")
        print("=" * 50)
        for i, url in enumerate(TARGET_URLS):
            self.crawl_page(url)
            # Random inter-page gap to mimic human browsing.
            if i < len(TARGET_URLS) - 1:
                StealthBrowser.human_like_delay(3, 8)
        self.pipeline.save_csv()
        self.pipeline.save_json()
        print(f"\n爬取完成,共获取 {len(self.pipeline.results)} 条数据")
if __name__ == "__main__":
    # Script entry point: build the spider and run the batch crawl.
    AISpider().run()
5. 爬取流程详解
Data Pipeline Captcha Solver AI Analyzer Selenium 主控 Data Pipeline Captcha Solver AI Analyzer Selenium 主控 alt [需要破解验证码] 循环处理下一个 URL 创建隐身浏览器 访问目标 URL 返回页面内容 检测反爬机制 {blocked: true, type: "captcha"} 调用验证码破解 模拟人类操作 破解成功 继续 智能提取数据 {title: "...", price: "..."} 清洗 & 存储 保存完成
6. 配置文件
python
# spider/config.py
import os
from dotenv import load_dotenv
# Read variables from a local .env file into the process environment,
# so the API key never has to be hard-coded in source control.
load_dotenv()
# OpenAI API key; None when the OPENAI_API_KEY env var is unset.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
# Target URL list (seed URLs for the crawler)
TARGET_URLS = [
    "https://example.com/products/1",
    "https://example.com/products/2",
    "https://example.com/products/3",
]
# Fields to extract: "name" becomes the output key, "desc" is the
# natural-language hint handed to the LLM extractor.
FIELDS_TO_EXTRACT = [
    {"name": "title", "desc": "商品标题"},
    {"name": "price", "desc": "价格"},
    {"name": "description", "desc": "商品描述"},
    {"name": "rating", "desc": "评分"},
    {"name": "reviews_count", "desc": "评论数量"},
]
7. 性能优化建议
40% 25% 20% 10% 5% 单页面处理耗时分布 页面加载渲染 AI 反爬分析 AI 数据提取 数据清洗存储 其他开销
| 优化方向 | 具体手段 |
|---|---|
| 降低 AI 调用成本 | 使用 gpt-4o-mini 替代 gpt-4o,prompt 精简 |
| 提升并发 | 多浏览器实例并行爬取 |
| 减少渲染时间 | 复用浏览器 Tab,避免反复启动 |
| 缓存策略 | 相似页面结构只做一次 AI 分析 |
| 代理池 | 接入付费代理,轮换 IP |
8. 总结
本文构建的 AI 智能爬虫具备以下核心能力:
- 反检测浏览器 —— 绕过 WebDriver 指纹检测
- AI 反爬识别 —— 自动判断页面状态并选择应对策略
- AI 数据提取 —— 无需编写 CSS 选择器,大模型直接从 HTML 提取结构化数据
- 验证码破解 —— 支持图形验证码和滑块验证码
- 数据管道 —— 自动清洗并导出为 CSV / JSON
注意:爬虫技术请合法合规使用,遵守目标网站的
robots.txt 和相关法律法规。
