import tkinter as tk
from tkinter import ttk, messagebox, scrolledtext
import requests
from bs4 import BeautifulSoup
import csv
import os
import threading
from urllib.parse import urljoin
import time
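
# Third-party dependencies: requests and beautifulsoup4
# (install with: pip install requests beautifulsoup4)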

class SpiderGUI:
    def __init__(self, root):
        self.root = root
        self.root.title("Professional Data Scraper v1.0")
        self.root.geometry("600x650")
        # --- UI layout ---
        self.create_widgets()

    def create_widgets(self):
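        """Build the input frames, the start button, and the log output area."""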
        # 1. URL and pagination settings
        frame1 = ttk.LabelFrame(self.root, text="1. Target settings (use {page} in the URL as the page-number placeholder)")
        frame1.pack(padx=10, pady=5, fill="x")

        ttk.Label(frame1, text="URL template:").grid(row=0, column=0, padx=5, pady=5, sticky="e")
        self.url_entry = ttk.Entry(frame1, width=50)
        self.url_entry.grid(row=0, column=1, columnspan=3, padx=5, pady=5)
        self.url_entry.insert(0, "https://example.com/page={page}")

        ttk.Label(frame1, text="Start page:").grid(row=1, column=0, padx=5, pady=5, sticky="e")
        self.start_page = ttk.Entry(frame1, width=10)
        self.start_page.grid(row=1, column=1, sticky="w")
        self.start_page.insert(0, "1")

        ttk.Label(frame1, text="End page:").grid(row=1, column=2, padx=5, pady=5, sticky="e")
        self.end_page = ttk.Entry(frame1, width=10)
        self.end_page.grid(row=1, column=3, sticky="w")
        self.end_page.insert(0, "3")

        # 2. Scrape mode and extraction parameters
        frame2 = ttk.LabelFrame(self.root, text="2. Extraction rules")
        frame2.pack(padx=10, pady=5, fill="x")

        ttk.Label(frame2, text="Mode:").grid(row=0, column=0, padx=5, pady=5, sticky="e")
        self.mode_var = tk.StringVar(value="text")
        ttk.Radiobutton(frame2, text="Extract text (save as CSV)", variable=self.mode_var, value="text").grid(row=0, column=1, sticky="w")
        ttk.Radiobutton(frame2, text="Download images (save to folder)", variable=self.mode_var, value="image").grid(row=0, column=2, sticky="w")

        ttk.Label(frame2, text="HTML tag:").grid(row=1, column=0, padx=5, pady=5, sticky="e")
        self.tag_entry = ttk.Entry(frame2, width=15)
        self.tag_entry.grid(row=1, column=1, sticky="w")
        self.tag_entry.insert(0, "p")

        ttk.Label(frame2, text="Class attribute:").grid(row=1, column=2, padx=5, pady=5, sticky="e")
        self.class_entry = ttk.Entry(frame2, width=15)
        self.class_entry.grid(row=1, column=3, sticky="w")

        # 3. Advanced settings (proxy)
        frame3 = ttk.LabelFrame(self.root, text="3. Advanced settings (proxy IP)")
        frame3.pack(padx=10, pady=5, fill="x")

        ttk.Label(frame3, text="HTTP proxy (e.g. 127.0.0.1:8080):").grid(row=0, column=0, padx=5, pady=5, sticky="e")
        self.proxy_entry = ttk.Entry(frame3, width=35)
        self.proxy_entry.grid(row=0, column=1, padx=5, pady=5, sticky="w")

        # 4. Control button
        self.start_btn = ttk.Button(self.root, text="🚀 Start scraping", command=self.start_scraping_thread)
        self.start_btn.pack(pady=10)

        # 5. Log output
        self.log_area = scrolledtext.ScrolledText(self.root, width=70, height=15, state='disabled')
        self.log_area.pack(padx=10, pady=5)

    def log(self, message):
        """Print a log message in the GUI (scheduled on the Tk main loop so the worker thread can call it safely)."""
        def _append():
            self.log_area.config(state='normal')
            self.log_area.insert(tk.END, message + "\n")
            self.log_area.see(tk.END)
            self.log_area.config(state='disabled')
        self.root.after(0, _append)

    def start_scraping_thread(self):
        """Run the scraper in a background thread so the GUI does not freeze."""
        self.start_btn.config(state='disabled')
        self.log("--- Job started ---")
        # Launch a background thread to run the scraping logic
        threading.Thread(target=self.run_spider, daemon=True).start()

    def run_spider(self):
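        """Worker-thread body: fetch each page, then extract text or download images according to the selected mode."""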
        try:
            url_template = self.url_entry.get()
            start = int(self.start_page.get())
            end = int(self.end_page.get())
            mode = self.mode_var.get()
            tag = self.tag_entry.get()
            class_name = self.class_entry.get()
            proxy_str = self.proxy_entry.get().strip()

            # Set up the proxy, if one was given
            proxies = None
            if proxy_str:
                proxies = {'http': f'http://{proxy_str}', 'https': f'http://{proxy_str}'}
                self.log(f"Proxy enabled: {proxy_str}")

            headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/91.0.4472.124'}
            all_data = []

            # Automatic pagination loop
            for page in range(start, end + 1):
                # Replace the {page} placeholder in the URL
                current_url = url_template.replace("{page}", str(page))
                self.log(f"Fetching page {page}: {current_url}")
                try:
                    response = requests.get(current_url, headers=headers, proxies=proxies, timeout=10)
                    response.raise_for_status()
                    soup = BeautifulSoup(response.text, 'html.parser')

                    if mode == "text":
                        # Text extraction
                        elements = soup.find_all(tag, class_=class_name if class_name else None)
                        for el in elements:
                            text = el.get_text(strip=True)
                            if text:
                                all_data.append([page, text])

                    elif mode == "image":
                        # Image download
                        img_dir = "downloaded_images"
                        os.makedirs(img_dir, exist_ok=True)
                        # If no tag was specified, default to <img>
                        img_tag = tag if tag else "img"
                        images = soup.find_all(img_tag, class_=class_name if class_name else None)
                        for idx, img in enumerate(images):
                            img_url = img.get('src')
                            if not img_url:
                                continue
                            # Convert relative paths to absolute URLs
                            img_url = urljoin(current_url, img_url)
                            img_name = f"page{page}_{idx}.jpg"
                            img_path = os.path.join(img_dir, img_name)
                            self.log(f"  ⬇️ Downloading image: {img_url[:50]}...")
                            img_res = requests.get(img_url, headers=headers, proxies=proxies, timeout=10)
                            with open(img_path, 'wb') as f:
                                f.write(img_res.content)

                except Exception as e:
                    self.log(f"❌ Failed to fetch page {page}: {e}")

                # Polite delay to avoid getting the IP banned
                time.sleep(1)

            # Save the extracted text as CSV
            if mode == "text" and all_data:
                csv_path = "output_data.csv"
                with open(csv_path, 'w', newline='', encoding='utf-8-sig') as f:
                    writer = csv.writer(f)
                    writer.writerow(["Page", "Content"])  # write the header row
                    writer.writerows(all_data)
                self.log(f"✅ Text extraction finished: {len(all_data)} rows saved to {csv_path} (you can open it in Excel)")
            elif mode == "image":
                self.log("✅ All images downloaded. Check the downloaded_images folder.")

        except Exception as e:
            self.log(f"❌ Fatal error: {e}")
        finally:
            # Widget updates must run on the Tk main loop, not the worker thread
            self.root.after(0, lambda: self.start_btn.config(state='normal'))
            self.log("--- Job finished ---")


if __name__ == "__main__":
    root = tk.Tk()
    app = SpiderGUI(root)
    root.mainloop()