每天花大量时间浏览行业资讯?本文将带你用 Python 爬虫自动采集多源信息,结合 AI 大模型智能总结,打造一套全自动的行业日报生成系统。每天早上 8 点,日报自动推送到你的邮箱/飞书/钉钉。
一、系统设计概览
整体流程
┌──────────────────────────────────────────────────────────────┐
│ 定时触发 (Cron) │
│ 每天 08:00 │
└───────────────────────────┬──────────────────────────────────┘
│
▼
┌──────────────────────────────────────────────────────────────┐
│ Step 1: 多源数据采集 │
│ │
│ ┌──────────┐ ┌──────────┐ ┌──────────┐ ┌──────────┐ │
│ │ 技术博客 │ │ 新闻网站 │ │ RSS 订阅 │ │ 社交媒体 │ │
│ │ (Scrapy) │ │(Requests)│ │(Feedparser)│ │(API) │ │
│ └────┬─────┘ └────┬─────┘ └────┬─────┘ └────┬─────┘ │
│ │ │ │ │ │
│ └──────────────┴──────┬───────┴──────────────┘ │
│ ▼ │
│ 统一数据格式 (Article) │
└───────────────────────────┬──────────────────────────────────┘
│
▼
┌──────────────────────────────────────────────────────────────┐
│ Step 2: 内容清洗与去重 │
│ │
│ 去除HTML标签 → 正文提取 → 相似度去重 → 日期过滤 │
└───────────────────────────┬──────────────────────────────────┘
│
▼
┌──────────────────────────────────────────────────────────────┐
│ Step 3: AI 智能总结 │
│ │
│ 分类打标签 → 提取摘要 → 生成点评 → 按主题聚合 │
└───────────────────────────┬──────────────────────────────────┘
│
▼
┌──────────────────────────────────────────────────────────────┐
│ Step 4: 日报生成与推送 │
│ │
│ Markdown 模板渲染 → HTML 转换 → 邮件/飞书/钉钉推送 │
└──────────────────────────────────────────────────────────────┘
项目结构
ai-daily-report/
├── config/
│ ├── settings.py # 全局配置
│ └── sources.yaml # 数据源配置
├── crawlers/
│ ├── base.py # 爬虫基类
│ ├── rss_crawler.py # RSS 爬虫
│ ├── web_crawler.py # 网页爬虫
│ ├── api_crawler.py # API 爬虫
│ └── scheduler.py # 爬虫调度器
├── processors/
│ ├── cleaner.py # 内容清洗
│ ├── deduplicator.py # 去重
│ └── ai_summarizer.py # AI 总结
├── generators/
│ ├── template.py # 模板引擎
│ └── report.py # 日报生成
├── notifiers/
│ ├── email_sender.py # 邮件推送
│ └── webhook_sender.py # 飞书/钉钉
├── scheduler.py # 定时任务
├── main.py # 主入口
└── requirements.txt
二、多源数据采集
2.1 数据模型定义
python
# models.py
from dataclasses import dataclass, field
from datetime import datetime
@dataclass
class Article:
    """Unified article record shared by all crawlers and processors."""
    title: str
    url: str
    content: str
    source: str  # name of the originating site
    author: str = ""
    published_at: datetime = field(default_factory=datetime.now)  # falls back to crawl time
    summary: str = ""  # AI-generated summary, filled in by AISummarizer
    category: str = ""  # AI-assigned category label
    importance: int = 0  # AI-estimated importance score, 1-5
2.2 数据源配置
yaml
# config/sources.yaml
# Each entry defines one data source. "type" selects the crawler class
# (web -> WebCrawler with CSS selectors, rss -> RSSCrawler); "category"
# is the default label attached to every article from that source.
sources:
  - name: "36氪"
    type: "web"
    url: "https://36kr.com/newsflashes"
    parser: "css"
    selector:
      title: ".article-item-title"
      link: ".article-item-title a[href]"
      content: ".article-item-description"
    category: "科技资讯"
  - name: "Hacker News"
    type: "rss"
    url: "https://hnrss.org/frontpage"
    category: "技术前沿"
  - name: "GitHub Trending"
    type: "web"
    url: "https://github.com/trending"
    parser: "css"
    selector:
      title: "h2.Box-sc-g0xbh4-0 a[href]"
      link: "h2.Box-sc-g0xbh4-0 a[href]"
      content: "p.col-9"
    category: "开源项目"
  - name: "InfoQ"
    type: "rss"
    url: "https://www.infoq.cn/feed"
    category: "技术资讯"
2.3 爬虫基类与实现
python
# crawlers/base.py
from abc import ABC, abstractmethod
from typing import List
from models import Article
class BaseCrawler(ABC):
    """Abstract base class for all crawlers.

    Holds the common per-source state (name, url, category) plus a
    browser-like User-Agent header; subclasses implement crawl().
    """

    def __init__(self, source_config: dict):
        # BUG FIX: keep the full raw config on the instance — subclasses
        # (e.g. WebCrawler) read source-specific keys such as "selector"
        # via self.source_config, which was never stored before.
        self.source_config = source_config
        self.name = source_config["name"]
        self.url = source_config["url"]
        self.category = source_config.get("category", "")
        self.headers = {
            "User-Agent": (
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/120.0.0.0 Safari/537.36"
            )
        }

    @abstractmethod
    def crawl(self) -> List[Article]:
        """Fetch the source and return a list of Article objects."""
python
# crawlers/rss_crawler.py
import feedparser
from typing import List
from models import Article
from crawlers.base import BaseCrawler
class RSSCrawler(BaseCrawler):
    """RSS feed crawler built on feedparser."""

    MAX_ENTRIES = 20  # cap the number of entries taken from each feed

    def crawl(self) -> List[Article]:
        """Parse the feed at self.url and map each entry to an Article."""
        from datetime import datetime
        import time

        feed = feedparser.parse(self.url)
        articles = []
        for entry in feed.entries[:self.MAX_ENTRIES]:
            # Prefer the plain summary; fall back to the full content body.
            content = ""
            if hasattr(entry, "summary"):
                content = entry.summary
            elif hasattr(entry, "content"):
                content = entry.content[0].value

            # BUG FIX: published_parsed is a time.struct_time (or absent),
            # not a datetime, and the old default "" violated the declared
            # Article.published_at type. Convert, falling back to now().
            parsed = entry.get("published_parsed")
            if parsed:
                published_at = datetime.fromtimestamp(time.mktime(parsed))
            else:
                published_at = datetime.now()

            articles.append(Article(
                title=entry.get("title", ""),
                url=entry.get("link", ""),
                content=content,
                source=self.name,
                author=entry.get("author", ""),
                published_at=published_at,
                category=self.category,
            ))
        print(f"[{self.name}] RSS 采集到 {len(articles)} 篇文章")
        return articles
python
# crawlers/web_crawler.py
import requests
from bs4 import BeautifulSoup
from typing import List
from models import Article
from crawlers.base import BaseCrawler
class WebCrawler(BaseCrawler):
    """Generic web-page crawler driven by per-source CSS selectors."""

    MAX_ITEMS = 15  # cap the number of items extracted per page

    def __init__(self, source_config: dict):
        super().__init__(source_config)
        # BUG FIX: the original crawl() read self.source_config, which the
        # base class never stored; keep the selector mapping locally so
        # this class works regardless of the base implementation.
        self.selector = source_config.get("selector", {})

    def crawl(self) -> List[Article]:
        """Fetch self.url and extract title/link/content via CSS selectors."""
        from urllib.parse import urljoin

        resp = requests.get(self.url, headers=self.headers, timeout=30)
        resp.raise_for_status()
        soup = BeautifulSoup(resp.text, "html.parser")

        def _select(key: str) -> list:
            # An empty selector string would make soup.select() raise;
            # treat a missing selector as "no matches" instead.
            css = self.selector.get(key, "")
            return soup.select(css) if css else []

        titles = _select("title")
        links = _select("link")
        contents = _select("content")

        articles = []
        for i in range(min(len(titles), self.MAX_ITEMS)):
            title = titles[i].get_text(strip=True)
            link = links[i].get("href", "") if i < len(links) else ""
            content = contents[i].get_text(strip=True) if i < len(contents) else ""
            # GENERALIZED: resolve relative links against the source URL
            # instead of a hard-coded github.com prefix, so any web source
            # with relative hrefs works. Absolute links pass through as-is.
            if link:
                link = urljoin(self.url, link)
            articles.append(Article(
                title=title,
                url=link,
                content=content[:500],  # keep at most 500 chars of body text
                source=self.name,
                category=self.category,
            ))
        print(f"[{self.name}] Web 采集到 {len(articles)} 篇文章")
        return articles
2.4 爬虫调度器
python
# crawlers/scheduler.py
import yaml
from crawlers.rss_crawler import RSSCrawler
from crawlers.web_crawler import WebCrawler
from models import Article
from typing import List
# Maps the "type" field in sources.yaml to a crawler implementation.
CRAWLER_MAP = {
    "rss": RSSCrawler,
    "web": WebCrawler,
}


def crawl_all(sources_path: str = "config/sources.yaml") -> List[Article]:
    """Run every configured crawler and return the combined article list.

    Unknown source types are skipped; a failing source is logged and
    does not abort the remaining crawls.
    """
    with open(sources_path, "r", encoding="utf-8") as f:
        config = yaml.safe_load(f)

    collected: List[Article] = []
    for source in config["sources"]:
        crawler_cls = CRAWLER_MAP.get(source["type"])
        if crawler_cls is None:
            continue
        try:
            collected.extend(crawler_cls(source).crawl())
        except Exception as e:
            print(f"[{source['name']}] 采集失败: {e}")
    print(f"\n总计采集 {len(collected)} 篇文章")
    return collected
三、内容处理
3.1 内容清洗与去重
python
# processors/cleaner.py
import re
from models import Article
def clean_article(article: Article) -> Article:
    """Strip HTML markup and normalize whitespace in *article*, in place.

    If the cleaned body is too short to carry real information, the
    (cleaned) title is used as the content instead.
    """
    tag_pattern = r"<[^>]+>"
    article.title = re.sub(tag_pattern, "", article.title)
    article.content = re.sub(tag_pattern, "", article.content)

    article.content = re.sub(r"\s+", " ", article.content).strip()
    article.title = article.title.strip()

    if len(article.content) < 50:
        # Too little body text to be useful — fall back to the title.
        article.content = article.title
    return article
python
# processors/deduplicator.py
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from models import Article
from typing import List
def deduplicate(articles: List[Article], threshold: float = 0.85) -> List[Article]:
    """Drop near-duplicate articles via TF-IDF cosine similarity.

    Two articles whose pairwise similarity exceeds *threshold* are
    considered duplicates; the one appearing earlier in the list wins.
    """
    if len(articles) <= 1:
        return articles

    # Title plus a content prefix is enough signal for duplicate detection.
    corpus = [f"{a.title} {a.content[:200]}" for a in articles]
    matrix = TfidfVectorizer(max_features=5000).fit_transform(corpus)
    similarity = cosine_similarity(matrix)

    unique: List[Article] = []
    duplicates = set()
    for i, article in enumerate(articles):
        if i in duplicates:
            continue
        unique.append(article)
        for j in range(i + 1, len(articles)):
            if j not in duplicates and similarity[i][j] > threshold:
                duplicates.add(j)
    print(f"[去重] {len(articles)} → {len(unique)} 篇 (移除 {len(duplicates)} 篇重复)")
    return unique
四、AI 智能总结
4.1 AI 总结器
python
# processors/ai_summarizer.py
from openai import OpenAI
from models import Article
from typing import List
class AISummarizer:
    """LLM-backed summarizer: per-article analysis plus a daily digest."""

    def __init__(self, api_key: str, base_url: str = None, model: str = "gpt-4o-mini"):
        """Create the OpenAI client.

        base_url allows pointing at any OpenAI-compatible endpoint;
        model defaults to the low-cost gpt-4o-mini.
        """
        kwargs = {"api_key": api_key}
        if base_url:
            kwargs["base_url"] = base_url
        self.client = OpenAI(**kwargs)
        self.model = model

    def summarize_article(self, article: Article) -> dict:
        """Analyze one article.

        Returns the model's JSON object with keys summary / category /
        importance / tags. response_format forces valid JSON output.
        """
        import json

        prompt = f"""请分析以下技术资讯文章,返回 JSON 格式的分析结果:
标题: {article.title}
内容: {article.content[:800]}
请返回如下 JSON:
{{
"summary": "50字以内的中文摘要",
"category": "从以下选择一个: [AI/大模型, 前端, 后端, 云原生, 开源项目, 行业动态, 安全, 其他]",
"importance": "重要性评分 1-5 (5最重要)",
"tags": ["标签1", "标签2"]
}}"""
        response = self.client.chat.completions.create(
            model=self.model,
            messages=[{"role": "user", "content": prompt}],
            temperature=0.3,  # low temperature for stable, factual analysis
            response_format={"type": "json_object"},
        )
        return json.loads(response.choices[0].message.content)

    def generate_daily_summary(self, articles: List[Article]) -> str:
        """Generate the Markdown daily digest from the (at most 30) top articles."""
        article_texts = []
        for a in articles[:30]:  # keep the prompt within a sane token budget
            article_texts.append(f"- [{a.category}] {a.title}: {a.summary or a.content[:100]}")
        articles_block = "\n".join(article_texts)
        prompt = f"""你是一位资深技术分析师。请基于以下今日行业资讯,生成一份专业的行业日报总结。
今日资讯列表:
{articles_block}
请生成日报总结,包含:
1. 今日热点概览(3-5 个核心趋势,每个 1-2 句话)
2. 重点事件分析(选取 2-3 个最重要事件深入分析)
3. 趋势洞察(对近期技术趋势的个人判断)
4. 推荐阅读(从列表中推荐 5 篇最值得深入阅读的文章)
请用 Markdown 格式输出。"""
        response = self.client.chat.completions.create(
            model=self.model,
            messages=[{"role": "user", "content": prompt}],
            temperature=0.5,
        )
        return response.choices[0].message.content

    def process_all(self, articles: List[Article]) -> tuple:
        """Summarize every article, sort by importance, build the digest.

        Returns (articles, daily_summary). A failure on one article is
        logged and skipped so a single bad response cannot abort the run.
        """
        for i, article in enumerate(articles):
            try:
                result = self.summarize_article(article)
                article.summary = result.get("summary", "")
                article.category = result.get("category", article.category)
                # BUG FIX: the prompt asks for importance as a string
                # ("1"-"5"), so the raw value may be str; coerce to int so
                # the sort below cannot compare str with int and raise.
                try:
                    article.importance = int(result.get("importance", 3))
                except (TypeError, ValueError):
                    article.importance = 3
            except Exception as e:
                print(f"[AI总结] 文章处理失败 ({article.title}): {e}")
            if (i + 1) % 5 == 0:
                print(f"[AI总结] 已处理 {i+1}/{len(articles)}")
        # Most important first.
        articles.sort(key=lambda a: a.importance, reverse=True)
        daily_summary = self.generate_daily_summary(articles)
        return articles, daily_summary
五、日报生成与推送
5.1 Markdown 模板渲染
python
# generators/report.py
from datetime import datetime, timedelta
from typing import List
from models import Article
class ReportGenerator:
    """Renders processed articles plus the AI digest into a Markdown report."""

    def generate(self, articles: List[Article], daily_summary: str) -> str:
        """Build the full Markdown daily report.

        Articles are grouped by category (first-seen order); each entry
        shows its source, linked title, optional summary and a five-star
        importance bar.
        """
        today = datetime.now().strftime("%Y年%m月%d日")
        # (removed an unused "yesterday" computation from the original)
        md = f"""# AI 行业日报 | {today}
> 由 Python 爬虫 + AI 自动生成,每日 08:00 推送
---
## 今日总结
{daily_summary}
---
## 资讯列表
"""
        # Group by category, falling back to a catch-all bucket.
        categories = {}
        for article in articles:
            categories.setdefault(article.category or "其他", []).append(article)

        for category, cat_articles in categories.items():
            md += f"### {category}\n\n"
            for article in cat_articles:
                # BUG FIX: clamp importance to 0-5 so an out-of-range value
                # cannot produce a negative repeat count / broken star bar.
                stars = max(0, min(5, int(article.importance or 0)))
                importance_stars = "★" * stars + "☆" * (5 - stars)
                md += f"- **[{article.source}]** [{article.title}]({article.url})\n"
                if article.summary:
                    md += f"  > {article.summary}\n"
                md += f"  重要性: {importance_stars}\n\n"
            md += "---\n\n"
        md += f"""
---
*数据来源: 多源爬虫聚合 | AI 分析引擎: GPT-4o-mini*
*生成时间: {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}*
"""
        return md
5.2 推送通知
python
# notifiers/email_sender.py
import smtplib
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
import markdown
class EmailSender:
    """SMTP email pusher: sends the Markdown report as a styled HTML email."""

    def __init__(self, smtp_host: str, smtp_port: int, sender: str, password: str):
        # SMTP connection settings; "password" is typically an app-specific
        # authorization code rather than the account password.
        self.smtp_host = smtp_host
        self.smtp_port = smtp_port
        self.sender = sender
        self.password = password

    def send(self, to: str, subject: str, markdown_content: str):
        """Send *markdown_content* to *to*, converted to HTML with inline CSS."""
        html_content = markdown.markdown(markdown_content)
        # Wrap the converted body in a minimal stylesheet so the report
        # renders reasonably in common mail clients.
        styled_html = f"""
<html>
<head><style>
body {{ font-family: -apple-system, sans-serif; max-width: 800px; margin: 0 auto; padding: 20px; }}
h1 {{ color: #1a1a1a; border-bottom: 2px solid #4f46e5; padding-bottom: 10px; }}
h3 {{ color: #4f46e5; }}
a {{ color: #2563eb; text-decoration: none; }}
blockquote {{ border-left: 3px solid #e5e7eb; padding-left: 12px; color: #6b7280; }}
</style></head>
<body>{html_content}</body>
</html>"""
        # multipart/alternative: plain-text Markdown first, HTML second,
        # so capable clients prefer the HTML part.
        msg = MIMEMultipart("alternative")
        msg["Subject"] = subject
        msg["From"] = self.sender
        msg["To"] = to
        msg.attach(MIMEText(markdown_content, "plain", "utf-8"))
        msg.attach(MIMEText(styled_html, "html", "utf-8"))
        with smtplib.SMTP(self.smtp_host, self.smtp_port) as server:
            server.starttls()  # upgrade the connection to TLS before login
            server.login(self.sender, self.password)
            server.send_message(msg)
        print(f"[邮件] 已发送至 {to}")
python
# notifiers/webhook_sender.py
import requests
class WebhookSender:
    """Webhook pusher for Feishu (Lark) and DingTalk group bots."""

    # ROBUSTNESS: a webhook endpoint that never answers would otherwise
    # hang the whole daily run; cap every request.
    REQUEST_TIMEOUT = 10  # seconds

    def __init__(self, webhook_url: str, platform: str = "feishu"):
        self.webhook_url = webhook_url
        self.platform = platform  # "feishu" -> card message, anything else -> DingTalk

    def send_feishu(self, title: str, content: str):
        """Push an interactive card message to a Feishu group bot."""
        payload = {
            "msg_type": "interactive",
            "card": {
                "header": {
                    "title": {"content": title, "tag": "plain_text"},
                    "template": "blue",
                },
                "elements": [
                    {
                        "tag": "markdown",
                        "content": content[:4000],  # Feishu card size limit
                    }
                ],
            },
        }
        requests.post(self.webhook_url, json=payload, timeout=self.REQUEST_TIMEOUT)
        print("[飞书] 日报已推送")

    def send_dingtalk(self, title: str, content: str):
        """Push a markdown message to a DingTalk group bot."""
        payload = {
            "msgtype": "markdown",
            "markdown": {
                "title": title,
                "text": content,
            },
        }
        requests.post(self.webhook_url, json=payload, timeout=self.REQUEST_TIMEOUT)
        print("[钉钉] 日报已推送")

    def send(self, title: str, content: str):
        """Dispatch to the platform-specific sender."""
        if self.platform == "feishu":
            self.send_feishu(title, content)
        else:
            self.send_dingtalk(title, content)
六、主程序与定时调度
6.1 主流程
python
# main.py
import os
from datetime import datetime
from dotenv import load_dotenv
from crawlers.scheduler import crawl_all
from processors.cleaner import clean_article
from processors.deduplicator import deduplicate
from processors.ai_summarizer import AISummarizer
from generators.report import ReportGenerator
from notifiers.email_sender import EmailSender
from notifiers.webhook_sender import WebhookSender
load_dotenv()
def run_daily_report():
    """End-to-end pipeline: crawl -> clean -> dedupe -> AI analysis -> render -> push."""
    print(f"{'='*60}")
    print(f"AI 行业日报生成 - {datetime.now().strftime('%Y-%m-%d %H:%M')}")
    print(f"{'='*60}\n")
    # Step 1: gather articles from every configured source.
    print("[Step 1] 数据采集中...")
    articles = crawl_all("config/sources.yaml")
    # Step 2: strip markup and normalize whitespace.
    print("\n[Step 2] 内容清洗中...")
    articles = [clean_article(a) for a in articles]
    # Step 3: drop near-duplicates across sources.
    print("\n[Step 3] 智能去重中...")
    articles = deduplicate(articles)
    # Step 4: per-article summaries plus the daily digest.
    print("\n[Step 4] AI 智能分析中...")
    summarizer = AISummarizer(
        api_key=os.getenv("OPENAI_API_KEY"),
        base_url=os.getenv("OPENAI_BASE_URL"),
        model=os.getenv("LLM_MODEL", "gpt-4o-mini"),
    )
    articles, daily_summary = summarizer.process_all(articles)
    # Step 5: render the Markdown report and keep a local copy.
    print("\n[Step 5] 生成日报中...")
    generator = ReportGenerator()
    report_md = generator.generate(articles, daily_summary)
    date_str = datetime.now().strftime("%Y-%m-%d")
    # BUG FIX: create the output directory on first run instead of
    # crashing with FileNotFoundError.
    os.makedirs("reports", exist_ok=True)
    with open(f"reports/daily_report_{date_str}.md", "w", encoding="utf-8") as f:
        f.write(report_md)
    # Step 6: push through every enabled channel.
    print("\n[Step 6] 推送日报...")
    subject = f"AI 行业日报 | {datetime.now().strftime('%Y年%m月%d日')}"
    # Email push (opt-in via EMAIL_ENABLED=true).
    if os.getenv("EMAIL_ENABLED") == "true":
        sender = EmailSender(
            smtp_host=os.getenv("SMTP_HOST"),
            smtp_port=int(os.getenv("SMTP_PORT", "587")),
            sender=os.getenv("EMAIL_SENDER"),
            password=os.getenv("EMAIL_PASSWORD"),
        )
        sender.send(os.getenv("EMAIL_TO"), subject, report_md)
    # Feishu / DingTalk push (enabled by presence of WEBHOOK_URL).
    webhook_url = os.getenv("WEBHOOK_URL")
    if webhook_url:
        notifier = WebhookSender(webhook_url, os.getenv("WEBHOOK_PLATFORM", "feishu"))
        notifier.send(subject, report_md)
    print(f"\n{'='*60}")
    print(f"日报生成完成! 共处理 {len(articles)} 篇文章")
    print(f"{'='*60}")


if __name__ == "__main__":
    run_daily_report()
6.2 定时调度
python
# scheduler.py
from apscheduler.schedulers.blocking import BlockingScheduler
from main import run_daily_report

# BlockingScheduler runs in the foreground and keeps the process alive.
scheduler = BlockingScheduler()
# Fire once a day at 08:00 local time.
scheduler.add_job(run_daily_report, "cron", hour=8, minute=0)
print("定时任务已启动,每天 08:00 生成日报...")
scheduler.start()  # blocks forever; stop with Ctrl-C / SIGTERM
或者使用系统 Crontab:
bash
# crontab -e
0 8 * * * cd /path/to/ai-daily-report && python main.py >> logs/cron.log 2>&1
七、运行效果示例
生成的日报 Markdown 效果:
markdown
# AI 行业日报 | 2026年04月13日
## 今日总结
### 热点概览
1. **大模型开源竞争加剧** --- DeepSeek 发布 V4 模型,多项基准超越 GPT-4
2. **MCP 协议生态爆发** --- 超过 2000 个 MCP Server 上线,开发者生态快速形成
### 重点事件分析
> DeepSeek V4 的发布意味着开源模型已全面逼近闭源模型水平...
## 资讯列表
### AI/大模型
- **[Hacker News]** [DeepSeek V4 Released](https://...)
> DeepSeek 发布最新开源模型,在代码生成和推理任务上表现优异
重要性: ★★★★★
### 开源项目
- **[GitHub Trending]** [mcp-framework](https://github.com/...)
> 新的 MCP Server 快速开发框架,支持 Python 和 TypeScript
重要性: ★★★★☆
八、成本与优化
| 项目 | 说明 |
|---|---|
| API 调用成本 | 30 篇文章总结 ≈ $0.05/天(GPT-4o-mini) |
| 运行环境 | 任意支持 Python 3.10+ 的服务器/本地 |
| 数据存储 | 每日约 100KB Markdown 文件 |
| 扩展性 | 新增数据源只需在 sources.yaml 中添加配置 |
进一步优化方向
- 增量爬取:记录已爬 URL,避免重复处理
- 多语言支持:自动检测并翻译外文资讯
- 个性化推荐:基于用户阅读偏好调整日报内容权重
- 图表生成:自动生成关键词云、趋势折线图
总结
本文构建了一套完整的 Python 爬虫 + AI 日报系统,涵盖:
- 多源数据采集 --- RSS + Web 爬虫统一调度
- 智能去重 --- TF-IDF + 余弦相似度
- AI 总结 --- 分类、摘要、重要性评估、趋势洞察
- 自动推送 --- 邮件 + 飞书/钉钉多渠道
- 定时调度 --- 每天自动运行,零人工干预
整个系统代码量约 500 行,维护成本低,可根据个人需求灵活定制数据源和推送渠道。
