Anyone who does data analysis knows that data collection is the most time-consuming step. A few years ago I worked on a public-opinion monitoring project that had to pull data from dozens of news sites every day. Now I use Trae prompts to generate an integrated collection-and-cleaning pipeline, and the work goes roughly 10x faster. Whether the target is a news site, an e-commerce site, or social media, as long as the requirements are stated clearly, Trae can generate complete crawler code, with cleaning, deduplication, and storage handled in one pass. Work that used to take a whole morning now takes a few minutes.

Today I want to share how I use Trae to build web crawlers. These methods all came out of real projects, and I hope they help.
## Tip 1: Generate requests + BeautifulSoup crawler code with Trae prompts

### Real-world scenario

To crawl a news site, you first have to read the page source, find where the news list lives, extract each item's title, link, and timestamp, and then open the detail page to get the article body. I used to stare at page source for ages, and the code I wrote still broke regularly. Once, while crawling a tech media site, the page layout was redesigned and my whole crawler stopped working; re-analyzing the structure cost me two days. With a Trae prompt, I only need to describe the page structure clearly and it generates the complete crawler, exception handling included, in about half an hour.

### Trae prompt technique

When I first started with Trae, all I said was "write me a crawler", and the generated code was either too simplistic or not what I needed. Later I realized that if I spell out the page structure, the extraction rules, and the storage format, the generated code is usually usable as-is.

Before optimization:

```text
Write me a crawler.
```

After optimization:
```text
Using requests and BeautifulSoup, write an enterprise-grade web crawler with the following requirements:
1. Core features:
   - Multi-site crawling (configuration-driven)
   - Automatic page parsing (CSS selectors / XPath)
   - Pagination support
   - Deep crawling (list pages + detail pages)
   - Image downloading
2. Data extraction:
   - Extract text content (title, body, time, author, etc.)
   - Extract links (convert relative links to absolute)
   - Extract images (download them automatically)
   - Extract structured data (tables, lists)
3. Data storage:
   - Save to Excel/CSV/JSON
   - Save to a database (MySQL/SQLite)
   - Incremental updates (avoid re-crawling)
   - Data backup
4. Anti-scraping countermeasures:
   - Set the User-Agent
   - Add request delays (random intervals)
   - Proxy IP support
   - Cookie management
5. Code requirements:
   - Object-oriented design
   - Detailed Chinese comments and logging
   - Exception handling and error recovery
   - Configuration file support
   - Respect robots.txt
6. Output requirements:
   - A runnable crawler script
   - A crawl log file
   - Data files (Excel/CSV/JSON)
   - Optional: a crawl report
```
A prompt written this way states the requirements clearly, and the code Trae generates usually works with little or no editing.
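One item in this prompt, respecting robots.txt, does not always make it into the generated code, so I usually verify it separately. Below is a minimal sketch using the standard-library urllib.robotparser; the helper name and usage are my own, not Trae output.

```python
# Sketch only: check robots.txt before calling fetch_page (urllib.robotparser is standard library)
from urllib.parse import urljoin
from urllib.robotparser import RobotFileParser

def is_allowed(url: str, user_agent: str = '*') -> bool:
    """Return True if the target site's robots.txt permits crawling this URL."""
    parser = RobotFileParser()
    parser.set_url(urljoin(url, '/robots.txt'))
    try:
        parser.read()  # download and parse robots.txt
    except Exception:
        return True  # robots.txt unreachable: treat as allowed (tighten this if you prefer)
    return parser.can_fetch(user_agent, url)

# Usage sketch:
# if is_allowed('http://www.example.com/news/1.html'):
#     html = crawler.fetch_page('http://www.example.com/news/1.html')
```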
### Generated code
```python
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random
import logging
from datetime import datetime
from typing import Dict, List, Any, Optional
import json
import os
from urllib.parse import urljoin, urlparse
import hashlib
class WebCrawler:
"""企业级网络爬虫"""
def __init__(self, config: Dict[str, Any] = None):
"""初始化爬虫"""
self.config = config or self._default_config()
self.setup_logging()
self.session = requests.Session()
self.crawled_urls = set()
self.results = []
def _default_config(self) -> Dict[str, Any]:
"""默认配置"""
return {
'user_agents': [
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0'
],
'request_delay': (1, 3),
'max_retries': 3,
'timeout': 30,
'output_format': 'excel',
'output_dir': 'crawl_results'
}
def setup_logging(self):
"""设置日志"""
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
handlers=[
logging.FileHandler(f"crawler_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log"),
logging.StreamHandler()
]
)
self.logger = logging.getLogger(__name__)
def get_random_user_agent(self) -> str:
"""获取随机User-Agent"""
return random.choice(self.config['user_agents'])
def fetch_page(self, url: str) -> Optional[str]:
"""获取网页内容"""
try:
headers = {
'User-Agent': self.get_random_user_agent(),
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
'Accept-Encoding': 'gzip, deflate',
'Connection': 'keep-alive'
}
for attempt in range(self.config['max_retries']):
try:
response = self.session.get(
url,
headers=headers,
timeout=self.config['timeout']
)
response.raise_for_status()
response.encoding = response.apparent_encoding
self.logger.info(f"成功获取页面: {url}")
return response.text
except requests.RequestException as e:
self.logger.warning(f"第{attempt + 1}次尝试失败: {str(e)}")
if attempt < self.config['max_retries'] - 1:
time.sleep(2 ** attempt)
else:
raise
except Exception as e:
self.logger.error(f"获取页面失败 {url}: {str(e)}")
return None
def parse_news_list(self, html: str, base_url: str, selectors: Dict[str, str]) -> List[Dict[str, Any]]:
"""解析新闻列表"""
try:
soup = BeautifulSoup(html, 'html.parser')
news_list = []
news_items = soup.select(selectors['list'])
for item in news_items:
try:
news = {}
# 提取标题
title_elem = item.select_one(selectors['title'])
news['title'] = title_elem.get_text(strip=True) if title_elem else ''
# 提取链接
link_elem = item.select_one(selectors['link'])
if link_elem:
href = link_elem.get('href', '')
news['url'] = urljoin(base_url, href)
else:
news['url'] = ''
                    # 提取时间(选择器为空时跳过,避免 select_one('') 抛出异常)
                    time_selector = selectors.get('time')
                    time_elem = item.select_one(time_selector) if time_selector else None
                    news['publish_time'] = time_elem.get_text(strip=True) if time_elem else ''
                    # 提取摘要(同样先判断选择器是否已配置)
                    summary_selector = selectors.get('summary')
                    summary_elem = item.select_one(summary_selector) if summary_selector else None
                    news['summary'] = summary_elem.get_text(strip=True) if summary_elem else ''
if news['title'] and news['url']:
news_list.append(news)
except Exception as e:
self.logger.warning(f"解析新闻项失败: {str(e)}")
continue
self.logger.info(f"成功解析 {len(news_list)} 条新闻")
return news_list
except Exception as e:
self.logger.error(f"解析新闻列表失败: {str(e)}")
return []
def parse_news_detail(self, html: str, selectors: Dict[str, str]) -> Dict[str, Any]:
"""解析新闻详情"""
try:
soup = BeautifulSoup(html, 'html.parser')
detail = {}
# 提取正文
content_elem = soup.select_one(selectors['content'])
detail['content'] = content_elem.get_text(strip=True) if content_elem else ''
# 提取作者
author_elem = soup.select_one(selectors.get('author', ''))
detail['author'] = author_elem.get_text(strip=True) if author_elem else ''
# 提取来源
source_elem = soup.select_one(selectors.get('source', ''))
detail['source'] = source_elem.get_text(strip=True) if source_elem else ''
# 提取图片
images = []
img_elems = soup.select(selectors.get('images', 'img'))
for img in img_elems:
src = img.get('src', '')
if src:
images.append(src)
detail['images'] = images
return detail
except Exception as e:
self.logger.error(f"解析新闻详情失败: {str(e)}")
return {}
def crawl_news_site(self, site_config: Dict[str, Any]) -> List[Dict[str, Any]]:
"""爬取新闻网站"""
try:
self.logger.info(f"开始爬取网站: {site_config['name']}")
base_url = site_config['url']
selectors = site_config['selectors']
# 获取列表页
html = self.fetch_page(base_url)
if not html:
return []
# 解析新闻列表
news_list = self.parse_news_list(html, base_url, selectors)
# 爬取详情页
for news in news_list:
if news['url'] in self.crawled_urls:
continue
self.crawled_urls.add(news['url'])
# 随机延迟
delay = random.uniform(*self.config['request_delay'])
time.sleep(delay)
# 获取详情页
detail_html = self.fetch_page(news['url'])
if detail_html:
detail = self.parse_news_detail(detail_html, selectors)
news.update(detail)
self.results.append(news)
self.logger.info(f"已爬取: {news['title']}")
self.logger.info(f"网站 {site_config['name']} 爬取完成,共 {len(news_list)} 条")
return news_list
except Exception as e:
self.logger.error(f"爬取网站失败 {site_config['name']}: {str(e)}")
return []
def save_results(self, filename: str = None) -> str:
"""保存爬取结果"""
try:
os.makedirs(self.config['output_dir'], exist_ok=True)
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
if not filename:
filename = f'news_{timestamp}'
output_format = self.config['output_format']
if output_format == 'excel':
output_path = os.path.join(self.config['output_dir'], f'{filename}.xlsx')
df = pd.DataFrame(self.results)
df.to_excel(output_path, index=False)
elif output_format == 'csv':
output_path = os.path.join(self.config['output_dir'], f'{filename}.csv')
df = pd.DataFrame(self.results)
df.to_csv(output_path, index=False, encoding='utf-8-sig')
elif output_format == 'json':
output_path = os.path.join(self.config['output_dir'], f'{filename}.json')
with open(output_path, 'w', encoding='utf-8') as f:
json.dump(self.results, f, ensure_ascii=False, indent=2)
else:
raise ValueError(f"不支持的输出格式: {output_format}")
self.logger.info(f"结果已保存: {output_path}")
return output_path
except Exception as e:
self.logger.error(f"保存结果失败: {str(e)}")
raise
def main():
"""主函数"""
# 创建爬虫实例
crawler = WebCrawler()
# 配置xx新闻网站爬取
xinwen_config = {
'name': 'xx新闻网站',
'url': 'http://www.example.com/',
'selectors': {
'list': 'a[href*="/news"]',
'title': 'a',
'link': 'a',
'time': '',
'summary': '',
'content': '.main-aticle',
'author': '.source',
'source': '.source',
'images': 'img'
}
}
# 爬取新闻
results = crawler.crawl_news_site(xinwen_config)
# 保存结果
output_path = crawler.save_results('xinwen_news')
print(f"爬取完成,共 {len(results)} 条新闻")
print(f"结果已保存: {output_path}")
if __name__ == '__main__':
main()
```

### Usage

This crawler framework is quite convenient to use; I usually follow these steps:

1. Install the dependencies:

   ```bash
   pip install requests beautifulsoup4 pandas lxml
   ```

2. Configure the target site:

   ```python
   site_config = {
       'name': 'site name',
       'url': 'site URL',
       'selectors': {
           'list': 'news list selector',
           'title': 'title selector',
           'link': 'link selector',
           'content': 'body selector'
       }
   }
   ```

3. Run the crawler:

   ```bash
   python crawler.py
   ```
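The prompt asks for configuration-driven, multi-site crawling, while the generated main() hard-codes a single site. Here is a sketch of how I drive several sites from one JSON file; the sites.json name and layout are my own convention, not something Trae produced.

```python
import json

# sites.json: a JSON array of site_config dicts in the same shape as the example above
with open('sites.json', 'r', encoding='utf-8') as f:
    site_configs = json.load(f)

crawler = WebCrawler()
for site_config in site_configs:
    # Results from every site accumulate in crawler.results
    crawler.crawl_news_site(site_config)

crawler.save_results('all_sites')
```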
### Results

With this crawler framework, data collection is noticeably faster:

- Before: opening sites by hand, copying content, and tidying the data took 2-3 hours
- Now: generating the code with Trae takes about 10 minutes (prompt writing included)
- Running the crawler takes about 5 minutes

That works out to roughly a 10-20x speedup, and the data quality improved as well.
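One gap worth noting before moving on: the Tip 1 prompt also asks for database storage (MySQL/SQLite) and incremental updates, but the generated save_results only writes files. Below is a minimal SQLite sketch under my own assumptions (the news.db file, table name, and columns are my choices); the URL primary key is what makes re-runs incremental.

```python
import sqlite3

def save_to_sqlite(results, db_path='news.db'):
    """Incrementally store crawl results; the URL primary key skips already-stored items."""
    conn = sqlite3.connect(db_path)
    conn.execute(
        'CREATE TABLE IF NOT EXISTS news ('
        'url TEXT PRIMARY KEY, title TEXT, publish_time TEXT, content TEXT)'
    )
    for item in results:
        # INSERT OR IGNORE leaves existing rows untouched -> incremental update
        conn.execute(
            'INSERT OR IGNORE INTO news (url, title, publish_time, content) VALUES (?, ?, ?, ?)',
            (item.get('url'), item.get('title'), item.get('publish_time'), item.get('content'))
        )
    conn.commit()
    conn.close()

# Usage sketch:
# save_to_sqlite(crawler.results)
```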
## Tip 2: Anti-scraping strategies and countermeasures with Trae prompts

### Real-world scenario

Many sites have anti-scraping mechanisms. The common ones are:

- IP limits: too many requests from the same IP and it gets banned
- User-Agent checks: requests that don't look like a browser are rejected
- Rate limiting: requests that arrive too fast get blocked
- CAPTCHAs: require human verification
- Cookie checks: content is only served to logged-in sessions

My crawlers used to get banned all the time; with the countermeasures below they run stably.
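Before piling on strategies, it helps to recognize a block when it happens. The generated code later in this section retries on any request exception; here is a small sketch of what I sometimes add on top, treating HTTP 429/403 as a block signal and honoring the Retry-After header. The helper name is mine, not Trae's.

```python
import time
from typing import Optional

import requests

def fetch_with_block_handling(session: requests.Session, url: str,
                              headers: dict, max_retries: int = 3) -> Optional[str]:
    """Back off when the site signals a block (HTTP 429/403), honoring Retry-After."""
    for attempt in range(max_retries):
        response = session.get(url, headers=headers, timeout=30)
        if response.status_code in (403, 429):
            retry_after = response.headers.get('Retry-After', '')
            # Use the server's hint when it is a number of seconds, otherwise back off exponentially
            wait = int(retry_after) if retry_after.isdigit() else 2 ** (attempt + 2)
            time.sleep(wait)
            continue
        response.raise_for_status()
        return response.text
    return None
```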
### Trae prompt technique

Before optimization:

```text
Write me some anti-scraping code.
```

After optimization:
```text
Write an anti-scraping countermeasure system with the following requirements:
1. Core features:
   - User-Agent pool management
   - Proxy IP pool management
   - Cookie management
   - Request rate control
   - CAPTCHA recognition
2. Anti-scraping strategies:
   - Random User-Agent
   - Random request delays
   - IP rotation
   - Session persistence
   - Request retry mechanism
3. Advanced features:
   - Distributed crawling
   - Asynchronous requests
   - Request queue management
   - Retry of failed tasks
   - Crawl status monitoring
4. Code requirements:
   - Object-oriented design
   - Detailed Chinese comments and logging
   - Exception handling and error recovery
   - Configuration file support
   - Respect robots.txt
5. Output requirements:
   - A runnable crawler script
   - A crawl log file
   - A proxy pool status report
   - Optional: integration with a scheduler
```
### Generated code
```python
import requests
from bs4 import BeautifulSoup
import time
import random
import logging
from datetime import datetime
from typing import Dict, List, Any, Optional
import json
import os
from concurrent.futures import ThreadPoolExecutor, as_completed
import threading
class AntiSpiderCrawler:
"""反爬虫应对爬虫"""
def __init__(self, config: Dict[str, Any] = None):
"""初始化爬虫"""
self.config = config or self._default_config()
self.setup_logging()
self.session = requests.Session()
self.lock = threading.Lock()
self.request_count = 0
self.last_request_time = 0
self.proxy_pool = self._init_proxy_pool()
self.user_agent_pool = self._init_user_agent_pool()
def _default_config(self) -> Dict[str, Any]:
"""默认配置"""
return {
'max_requests_per_minute': 30,
'request_delay': (1, 3),
'max_retries': 3,
'timeout': 30,
'use_proxy': False,
'proxy_list': [],
'enable_async': False,
'max_workers': 5
}
def setup_logging(self):
"""设置日志"""
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
handlers=[
logging.FileHandler(f"anti_spider_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log"),
logging.StreamHandler()
]
)
self.logger = logging.getLogger(__name__)
def _init_user_agent_pool(self) -> List[str]:
"""初始化User-Agent池"""
return [
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15',
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
]
def _init_proxy_pool(self) -> List[str]:
"""初始化代理池"""
if self.config['use_proxy']:
return self.config['proxy_list']
return []
def get_random_user_agent(self) -> str:
"""获取随机User-Agent"""
return random.choice(self.user_agent_pool)
def get_random_proxy(self) -> Optional[Dict[str, str]]:
"""获取随机代理"""
if not self.proxy_pool:
return None
proxy_str = random.choice(self.proxy_pool)
return {
'http': proxy_str,
'https': proxy_str
}
def check_rate_limit(self):
"""检查请求频率限制"""
with self.lock:
current_time = time.time()
self.request_count += 1
# 每分钟重置计数器
if current_time - self.last_request_time > 60:
self.request_count = 1
self.last_request_time = current_time
# 检查是否超过限制
if self.request_count > self.config['max_requests_per_minute']:
wait_time = 60 - (current_time - self.last_request_time)
self.logger.warning(f"请求频率过高,等待 {wait_time:.2f} 秒")
time.sleep(wait_time)
self.request_count = 1
self.last_request_time = time.time()
def fetch_page(self, url: str) -> Optional[str]:
"""获取网页内容(带反爬虫策略)"""
try:
# 检查请求频率
self.check_rate_limit()
# 随机延迟
delay = random.uniform(*self.config['request_delay'])
time.sleep(delay)
headers = {
'User-Agent': self.get_random_user_agent(),
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
'Accept-Encoding': 'gzip, deflate',
'Connection': 'keep-alive',
'Referer': url
}
proxies = self.get_random_proxy()
for attempt in range(self.config['max_retries']):
try:
response = self.session.get(
url,
headers=headers,
proxies=proxies,
timeout=self.config['timeout']
)
response.raise_for_status()
response.encoding = response.apparent_encoding
self.logger.info(f"成功获取页面: {url}")
return response.text
except requests.RequestException as e:
self.logger.warning(f"第{attempt + 1}次尝试失败: {str(e)}")
if attempt < self.config['max_retries'] - 1:
wait_time = (2 ** attempt) + random.uniform(0, 1)
time.sleep(wait_time)
else:
raise
except Exception as e:
self.logger.error(f"获取页面失败 {url}: {str(e)}")
return None
def crawl_urls(self, urls: List[str]) -> List[Dict[str, Any]]:
"""爬取多个URL"""
results = []
if self.config['enable_async']:
# 异步爬取
with ThreadPoolExecutor(max_workers=self.config['max_workers']) as executor:
future_to_url = {
executor.submit(self._crawl_single_url, url): url
for url in urls
}
for future in as_completed(future_to_url):
url = future_to_url[future]
try:
result = future.result()
if result:
results.append(result)
except Exception as e:
self.logger.error(f"爬取失败 {url}: {str(e)}")
else:
# 同步爬取
for url in urls:
result = self._crawl_single_url(url)
if result:
results.append(result)
return results
def _crawl_single_url(self, url: str) -> Optional[Dict[str, Any]]:
"""爬取单个URL"""
html = self.fetch_page(url)
if html:
return {
'url': url,
'html': html,
'crawl_time': datetime.now().isoformat()
}
return None
def main():
"""主函数"""
# 创建反爬虫爬虫实例
crawler = AntiSpiderCrawler()
# 测试爬取
test_urls = [
'http://www.example.com/',
]
results = crawler.crawl_urls(test_urls)
print(f"爬取完成,共 {len(results)} 个页面")
# 保存结果
if results:
output_path = f"crawl_results_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
with open(output_path, 'w', encoding='utf-8') as f:
json.dump(results, f, ensure_ascii=False, indent=2)
print(f"结果已保存: {output_path}")
if __name__ == '__main__':
main()
```

### Usage

1. Configure the anti-scraping strategy:

   ```python
   config = {
       'max_requests_per_minute': 30,
       'request_delay': (1, 3),
       'use_proxy': True,
       'proxy_list': ['http://proxy1:port', 'http://proxy2:port'],
       'enable_async': True,
       'max_workers': 5
   }
   crawler = AntiSpiderCrawler(config)
   ```

2. Add a proxy:

   ```python
   crawler.proxy_pool.append('http://new_proxy:port')
   ```

3. Run the crawler:

   ```bash
   python anti_spider_crawler.py
   ```
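The prompt lists cookie management, but the generated class only keeps cookies implicitly inside requests.Session for the lifetime of one run. Here is a sketch for persisting them between runs; the cookies.json file name is my own choice.

```python
import json
import os

import requests

def save_cookies(session: requests.Session, path: str = 'cookies.json') -> None:
    """Dump the session's cookies so a logged-in state survives restarts."""
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(requests.utils.dict_from_cookiejar(session.cookies), f)

def load_cookies(session: requests.Session, path: str = 'cookies.json') -> None:
    """Restore previously saved cookies into the session."""
    if os.path.exists(path):
        with open(path, 'r', encoding='utf-8') as f:
            session.cookies.update(json.load(f))

# Usage sketch:
# load_cookies(crawler.session)   # before crawling
# save_cookies(crawler.session)   # after crawling
```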
### Results

- Without anti-scraping measures: frequent bans, unstable crawls
- With anti-scraping measures: stable runs with a success rate above 90%
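Proxy quality matters more than proxy quantity: free or rotated proxies die constantly, so I usually validate the pool before a run. A sketch, assuming the pool holds plain 'http://host:port' strings as in proxy_list; the test URL is just one public echo endpoint and can be swapped for anything lightweight.

```python
import requests

def filter_live_proxies(proxies, test_url='https://httpbin.org/ip', timeout=5):
    """Keep only proxies that can complete a simple request within the timeout."""
    live = []
    for proxy in proxies:
        try:
            requests.get(test_url, proxies={'http': proxy, 'https': proxy}, timeout=timeout)
            live.append(proxy)
        except requests.RequestException:
            pass  # Dead or too slow: drop it from the pool
    return live

# Usage sketch:
# crawler.proxy_pool = filter_live_proxies(crawler.proxy_pool)
```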
## Tip 3: Generate automatic data cleaning and storage code with Trae

### Real-world scenario

Crawled data usually has all kinds of problems:

- Inconsistent formats (dates, numbers)
- Lots of junk content (ads, navigation bars)
- Duplicates (the same article crawled more than once)
- Missing data (some fields are empty)

I used to clean all of this by hand; now an automated script cleans and stores the data as soon as the crawl finishes.
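For a sense of scale before the full generated pipeline below: each of those fixes often comes down to a line or two of pandas. A minimal sketch with hypothetical file and column names, covering only deduplication, missing titles, and date normalization.

```python
import pandas as pd

df = pd.read_excel('crawl_results.xlsx')                 # hypothetical raw crawl output
df = df.drop_duplicates(subset=['url'])                  # duplicates: same article crawled twice
df = df[df['title'].notna() & (df['title'] != '')]       # missing data: drop rows without a title
df['publish_time'] = pd.to_datetime(df['publish_time'], errors='coerce')  # unify date formats
```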
### Trae prompt technique

Before optimization:

```text
Write me some data-cleaning code.
```

After optimization:
```text
Write an automatic cleaning and storage system for crawled data with the following requirements:
1. Core features:
   - Data normalization (dates, numbers, text)
   - Deduplication (by URL and by content)
   - Cleaning (strip HTML tags, ads, junk content)
   - Validation (required-field checks, format checks)
   - Storage (Excel/CSV/JSON/database)
2. Cleaning rules:
   - Strip HTML tags
   - Strip extra whitespace
   - Normalize date formats
   - Normalize number formats
   - Extract keywords
3. Data storage:
   - Multiple output formats
   - Incremental updates
   - Data backup
   - Data indexing
4. Code requirements:
   - Object-oriented design
   - Detailed Chinese comments and logging
   - Exception handling and error recovery
   - Configuration file support
   - Follow PEP 8
5. Output requirements:
   - Cleaned data files
   - A cleaning report
   - A storage log
   - Optional: send a notification
```
### Generated code
```python
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import re
import logging
from datetime import datetime
from typing import Dict, List, Any, Optional
import json
import os
import hashlib
from urllib.parse import urlparse
class CrawlDataCleaner:
"""爬虫数据清洗器"""
def __init__(self, config: Dict[str, Any] = None):
"""初始化清洗器"""
self.config = config or self._default_config()
self.setup_logging()
self.cleaning_report = {
'original_records': 0,
'cleaned_records': 0,
'duplicates_removed': 0,
'invalid_records': 0,
'cleaning_steps': []
}
def _default_config(self) -> Dict[str, Any]:
"""默认配置"""
return {
'remove_html_tags': True,
'remove_duplicates': True,
'validate_required_fields': True,
'required_fields': ['title', 'url'],
'standardize_date_format': True,
'date_format': '%Y-%m-%d %H:%M:%S',
'output_format': 'excel',
'output_dir': 'cleaned_data'
}
def setup_logging(self):
"""设置日志"""
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(levelname)s - %(message)s',
handlers=[
logging.FileHandler(f"cleaner_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log"),
logging.StreamHandler()
]
)
self.logger = logging.getLogger(__name__)
def load_data(self, file_path: str) -> pd.DataFrame:
"""加载数据"""
try:
self.logger.info(f"开始加载数据: {file_path}")
if file_path.endswith('.xlsx'):
df = pd.read_excel(file_path)
elif file_path.endswith('.csv'):
df = pd.read_csv(file_path)
elif file_path.endswith('.json'):
with open(file_path, 'r', encoding='utf-8') as f:
data = json.load(f)
df = pd.DataFrame(data)
else:
raise ValueError(f"不支持的文件格式: {file_path}")
self.cleaning_report['original_records'] = len(df)
self.logger.info(f"成功加载 {len(df)} 条记录")
return df
except Exception as e:
self.logger.error(f"加载数据失败: {str(e)}")
raise
def remove_html_tags(self, text: str) -> str:
"""去除HTML标签"""
if pd.isna(text) or not isinstance(text, str):
return text
soup = BeautifulSoup(text, 'html.parser')
return soup.get_text(strip=True)
def clean_text(self, text: str) -> str:
"""清洗文本"""
if pd.isna(text) or not isinstance(text, str):
return text
# 去除多余空格
text = re.sub(r'\s+', ' ', text)
# 去除特殊字符
text = re.sub(r'[^\w\s\u4e00-\u9fff.,!?;:()""\'-]', '', text)
return text.strip()
def standardize_date(self, date_str: str) -> Optional[str]:
"""标准化日期格式"""
if pd.isna(date_str):
return None
# 尝试解析各种日期格式
date_patterns = [
r'(\d{4})-(\d{1,2})-(\d{1,2})',
r'(\d{4})/(\d{1,2})/(\d{1,2})',
r'(\d{4})年(\d{1,2})月(\d{1,2})日',
r'(\d{1,2})-(\d{1,2})-(\d{4})',
r'(\d{1,2})/(\d{1,2})/(\d{4})'
]
for pattern in date_patterns:
match = re.search(pattern, date_str)
if match:
try:
if pattern == date_patterns[0] or pattern == date_patterns[1]:
year, month, day = match.groups()
elif pattern == date_patterns[2]:
year, month, day = match.groups()
else:
day, month, year = match.groups()
return datetime(int(year), int(month), int(day)).strftime(
self.config['date_format']
)
except:
continue
return date_str
def remove_duplicates(self, df: pd.DataFrame) -> pd.DataFrame:
"""去除重复数据"""
if not self.config['remove_duplicates']:
return df
original_count = len(df)
# 按URL去重
if 'url' in df.columns:
df = df.drop_duplicates(subset=['url'], keep='first')
# 按标题去重
if 'title' in df.columns:
df = df.drop_duplicates(subset=['title'], keep='first')
duplicates_removed = original_count - len(df)
self.cleaning_report['duplicates_removed'] = duplicates_removed
return df
def validate_data(self, df: pd.DataFrame) -> pd.DataFrame:
"""验证数据"""
if not self.config['validate_required_fields']:
return df
original_count = len(df)
# 检查必填字段
for field in self.config['required_fields']:
if field in df.columns:
df = df[df[field].notna() & (df[field] != '')]
invalid_records = original_count - len(df)
self.cleaning_report['invalid_records'] = invalid_records
return df
def clean_data(self, df: pd.DataFrame) -> pd.DataFrame:
"""清洗数据"""
self.logger.info("开始清洗数据")
# 去除HTML标签
if self.config['remove_html_tags']:
for col in df.columns:
if df[col].dtype == 'object':
df[col] = df[col].apply(self.remove_html_tags)
self.cleaning_report['cleaning_steps'].append('去除HTML标签')
# 清洗文本
for col in df.columns:
if df[col].dtype == 'object':
df[col] = df[col].apply(self.clean_text)
self.cleaning_report['cleaning_steps'].append('清洗文本')
# 标准化日期
if self.config['standardize_date_format']:
for col in df.columns:
if 'date' in col.lower() or 'time' in col.lower():
df[col] = df[col].apply(self.standardize_date)
self.cleaning_report['cleaning_steps'].append('标准化日期格式')
# 去除重复
df = self.remove_duplicates(df)
# 验证数据
df = self.validate_data(df)
self.cleaning_report['cleaned_records'] = len(df)
self.logger.info(f"数据清洗完成: 原始 {self.cleaning_report['original_records']} 条,清洗后 {len(df)} 条")
return df
def save_data(self, df: pd.DataFrame, filename: str = None) -> str:
"""保存数据"""
try:
os.makedirs(self.config['output_dir'], exist_ok=True)
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
if not filename:
filename = f'cleaned_data_{timestamp}'
output_format = self.config['output_format']
if output_format == 'excel':
output_path = os.path.join(self.config['output_dir'], f'{filename}.xlsx')
df.to_excel(output_path, index=False)
elif output_format == 'csv':
output_path = os.path.join(self.config['output_dir'], f'{filename}.csv')
df.to_csv(output_path, index=False, encoding='utf-8-sig')
elif output_format == 'json':
output_path = os.path.join(self.config['output_dir'], f'{filename}.json')
df.to_json(output_path, orient='records', force_ascii=False, indent=2)
else:
raise ValueError(f"不支持的输出格式: {output_format}")
self.logger.info(f"数据已保存: {output_path}")
return output_path
except Exception as e:
self.logger.error(f"保存数据失败: {str(e)}")
raise
def generate_report(self) -> Dict[str, Any]:
"""生成清洗报告"""
return self.cleaning_report
def main():
"""主函数"""
# 创建清洗器实例
cleaner = CrawlDataCleaner()
# 加载数据
df = cleaner.load_data('crawl_results.xlsx')
# 清洗数据
cleaned_df = cleaner.clean_data(df)
# 保存数据
output_path = cleaner.save_data(cleaned_df)
print(f"清洗完成,结果已保存: {output_path}")
# 生成报告
report = cleaner.generate_report()
print("\n清洗报告:")
print(f"原始记录数: {report['original_records']}")
print(f"清洗后记录数: {report['cleaned_records']}")
print(f"删除重复记录: {report['duplicates_removed']}")
print(f"无效记录数: {report['invalid_records']}")
print(f"清洗步骤: {', '.join(report['cleaning_steps'])}")
if __name__ == '__main__':
main()
```

### Usage

1. Configure the cleaning rules:

   ```python
   config = {
       'remove_html_tags': True,
       'remove_duplicates': True,
       'validate_required_fields': True,
       'required_fields': ['title', 'url'],
       'standardize_date_format': True,
       'output_format': 'excel'
   }
   cleaner = CrawlDataCleaner(config)
   ```

2. Clean the data:

   ```python
   df = cleaner.load_data('crawl_results.xlsx')
   cleaned_df = cleaner.clean_data(df)
   cleaner.save_data(cleaned_df)
   ```

3. Check the report:

   ```python
   report = cleaner.generate_report()
   print(report)
   ```
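The prompt distinguishes URL deduplication from content deduplication, but the generated remove_duplicates only looks at URL and title. Here is a content-hash sketch (hashlib is already imported in the generated code but unused); the function name and column default are my own.

```python
import hashlib

import pandas as pd

def drop_duplicate_content(df: pd.DataFrame, column: str = 'content') -> pd.DataFrame:
    """Drop rows whose body text hashes to the same MD5, i.e. identical articles under different URLs."""
    hashes = df[column].fillna('').apply(
        lambda text: hashlib.md5(str(text).encode('utf-8')).hexdigest()
    )
    return df[~hashes.duplicated()]

# Usage sketch:
# cleaned_df = drop_duplicate_content(cleaned_df)
```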
### Results

- Manual cleaning: 1-2 hours
- Generating the code with Trae: about 5 minutes (prompt writing included)
- Running the code: about 10 seconds

A 10-20x improvement overall.
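One last gap from the cleaning prompt: keyword extraction, which the generated cleaner does not implement. A sketch assuming the third-party jieba library is installed (pip install jieba); the helper name is my own.

```python
import jieba.analyse

def extract_keywords(text: str, top_k: int = 10) -> list:
    """TF-IDF based keyword extraction for Chinese text via jieba."""
    if not text:
        return []
    return jieba.analyse.extract_tags(text, topK=top_k)

# Usage sketch: add a keywords column to the cleaned DataFrame
# cleaned_df['keywords'] = cleaned_df['content'].apply(
#     lambda t: ', '.join(extract_keywords(str(t))))
```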
## Summary

With these three tips, I used Trae to generate a complete, automated web-crawling system:

- requests + BeautifulSoup crawler: multi-site crawling, data extraction, data storage
- Anti-scraping strategies: User-Agent pool, proxy IP pool, request rate control, concurrent crawling
- Automatic data cleaning and storage: cleaning, deduplication, validation, storage

The whole pipeline went from 2-3 hours down to a few minutes, a speedup of more than 10x. The operations team can now focus on analysis instead of repetitive manual work.

If you also want to speed up data collection, give a Trae-generated crawler automation system a try; you may be surprised by the results. The content of this article is intended for research and learning purposes only.
