author: 专注Python实战,分享爬虫与数据分析干货
title: Python爬虫实战㉘|综合实战3,新闻热点追踪与舆情分析系统
update: 2026-04-26
tags: Python,爬虫实战,新闻,舆情分析,情感分析,关键词提取,词云
作者:专注Python实战,分享爬虫与数据分析干货
更新时间:2026年4月
适合人群:已学完全部基础、想做完整项目的开发者
前言:舆情监控 = 爬虫 + NLP + 可视化
企业想知道用户怎么评价自己?个人想追踪行业热点?
自动采集新闻 → 提取关键词 → 情感分析 → 热度追踪,一套搞定。
一、项目架构
news_tracker/
├── config.py # 配置(关键词、来源等)
├── crawler.py # 新闻爬虫
├── analyzer.py # 舆情分析(关键词+情感)
├── visualizer.py # 可视化
├── reporter.py # 报告生成
├── main.py # 入口
└── data/
    └── news.db # SQLite
二、新闻爬虫
2.1 多源爬取
python
# crawler.py
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random
import re
import hashlib
class NewsCrawler:
    """Collect news articles from RSS feeds and HTML list pages.

    Each collected article is a dict appended to ``self.articles``; its keys
    are the names listed in ``COLUMNS``.
    """

    # Column order of the frame built by to_dataframe(). Also used to build a
    # well-formed empty frame when no article was collected.
    COLUMNS = ["标题", "链接", "摘要", "发布时间", "来源", "内容", "ID"]

    def __init__(self):
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/120.0.0.0"
        }
        self.articles = []

    @staticmethod
    def _make_id(*candidates):
        """Return a 12-hex-char MD5 digest of the first non-empty candidate.

        Falling back to a later candidate keeps articles that lack the
        preferred field from all sharing the digest of the empty string,
        which would make de-duplication collapse them into a single row.
        """
        key = next((text for text in candidates if text), "")
        return hashlib.md5(key.encode()).hexdigest()[:12]

    def crawl_rss(self, rss_url, source_name):
        """Fetch an RSS feed and append one article per ``<item>``.

        Args:
            rss_url: URL of the feed.
            source_name: Label stored in the article's "来源" field.

        Failures are reported on stdout and swallowed on purpose so that one
        broken source does not abort the whole crawl.
        """
        try:
            resp = requests.get(rss_url, headers=self.headers, timeout=15)
            resp.raise_for_status()
            soup = BeautifulSoup(resp.content, "xml")
            items = soup.find_all("item")
            for item in items:
                article = {
                    "标题": self._get_text(item, "title"),
                    "链接": self._get_text(item, "link"),
                    "摘要": self._get_text(item, "description"),
                    "发布时间": self._get_text(item, "pubDate"),
                    "来源": source_name,
                    "内容": "",
                }
                # The link identifies the article; use the title when the
                # item carries no link.
                article["ID"] = self._make_id(article["链接"], article["标题"])
                self.articles.append(article)
            print(f" {source_name}: 采集{len(items)}条")
        except Exception as e:
            print(f" {source_name}: 爬取失败 - {e}")

    def crawl_page(self, url, source_name, title_sel, summary_sel, time_sel,
                   link_sel=None):
        """Scrape an HTML list page and append one article per ``.news-item``.

        Args:
            url: Page URL.
            source_name: Label stored in the article's "来源" field.
            title_sel: CSS selector (relative to the item) for the title.
            summary_sel: CSS selector (relative to the item) for the summary.
            time_sel: CSS selector (relative to the item) for the publish time.
            link_sel: Optional CSS selector for the element whose ``href``
                is the article link. When omitted the link stays empty.

        Failures are reported on stdout and swallowed; the method always
        sleeps 1-2 seconds before returning to throttle successive requests.
        """
        from urllib.parse import urljoin

        try:
            resp = requests.get(url, headers=self.headers, timeout=15)
            resp.raise_for_status()
            resp.encoding = resp.apparent_encoding
            soup = BeautifulSoup(resp.text, "html.parser")
            for item in soup.select(".news-item"):  # generic container selector
                link = ""
                if link_sel:
                    anchor = item.select_one(link_sel)
                    href = anchor.get("href", "") if anchor else ""
                    # Resolve relative hrefs against the page URL.
                    link = urljoin(url, href) if href else ""
                article = {
                    "标题": self._safe_text(item.select_one(title_sel)),
                    "链接": link,
                    "摘要": self._safe_text(item.select_one(summary_sel)),
                    "发布时间": self._safe_text(item.select_one(time_sel)),
                    "来源": source_name,
                    "内容": "",
                }
                # Page items are keyed by title; the link is only a fallback.
                article["ID"] = self._make_id(article["标题"], article["链接"])
                self.articles.append(article)
        except Exception as e:
            print(f" {source_name}: 爬取失败 - {e}")
        time.sleep(random.uniform(1, 2))

    def _get_text(self, item, tag):
        """Return the stripped text of the first ``tag`` child, or ""."""
        elem = item.find(tag)
        return elem.get_text(strip=True) if elem else ""

    def _safe_text(self, element):
        """Return the stripped text of ``element``, or "" when it is missing."""
        return element.get_text(strip=True) if element else ""

    def to_dataframe(self):
        """Return the collected articles as a DataFrame de-duplicated by ID.

        An empty crawl yields an empty frame that still has every expected
        column, so callers can test emptiness instead of hitting a KeyError.
        """
        if not self.articles:
            return pd.DataFrame(columns=self.COLUMNS)
        df = pd.DataFrame(self.articles)
        return df.drop_duplicates(subset=["ID"]).reset_index(drop=True)
三、舆情分析
3.1 关键词提取
python
# analyzer.py
from collections import Counter

import jieba
import jieba.analyse
import pandas as pd
class NewsAnalyzer:
    """Keyword extraction over a DataFrame of news articles.

    The frame is expected to provide the columns "标题", "摘要" and
    "发布时间" that the methods below read.
    """

    def __init__(self, df):
        self.df = df
        self.stop_words = self._load_stop_words()

    def _load_stop_words(self):
        """Return the built-in set of stop words."""
        common_stop = {"的", "了", "和", "是", "在", "与", "及", "等", "中",
                       "为", "对", "从", "到", "被", "把", "让", "用", "也",
                       "就", "都", "而", "之", "这", "那", "有", "将", "其",
                       "可", "此", "以", "上", "下", "不", "没", "要", "会"}
        return common_stop

    def _filter_stop_words(self, keywords, top_k):
        """Drop (word, weight) pairs whose word is a stop word, keep ``top_k``.

        A falsy ``top_k`` means "no limit", mirroring jieba's ``topK``.
        """
        kept = [pair for pair in keywords if pair[0] not in self.stop_words]
        return kept[:top_k] if top_k else kept

    def _candidate_budget(self, top_k):
        """Return how many candidates to request so filtering can still fill ``top_k``."""
        return top_k + len(self.stop_words) if top_k else top_k

    def extract_keywords_tfidf(self, text, top_k=10):
        """Return up to ``top_k`` (word, weight) pairs ranked by TF-IDF."""
        candidates = jieba.analyse.extract_tags(
            text, topK=self._candidate_budget(top_k), withWeight=True
        )
        return self._filter_stop_words(candidates, top_k)

    def extract_keywords_textrank(self, text, top_k=10):
        """Return up to ``top_k`` (word, weight) pairs ranked by TextRank."""
        candidates = jieba.analyse.textrank(
            text, topK=self._candidate_budget(top_k), withWeight=True
        )
        return self._filter_stop_words(candidates, top_k)

    def get_hot_keywords(self, top_k=30):
        """Return the top keywords over all titles and summaries combined."""
        all_text = " ".join(self.df["标题"].fillna("") + " " + self.df["摘要"].fillna(""))
        return self.extract_keywords_tfidf(all_text, top_k=top_k)

    @staticmethod
    def _parse_date(value):
        """Return the calendar date of one publish-time value, or NaT.

        Each value is parsed on its own, so values that differ in UTC offset
        or format from their neighbours are still handled; the date is taken
        in the value's own offset.
        """
        stamp = pd.to_datetime(value, errors="coerce")
        return pd.NaT if pd.isna(stamp) else stamp.date()

    def _add_date_column(self):
        """Add (or refresh) the "日期" column derived from "发布时间"."""
        self.df["日期"] = pd.Series(
            [self._parse_date(value) for value in self.df["发布时间"]],
            index=self.df.index,
            dtype=object,
        )

    def get_daily_keywords(self, top_k=10):
        """Return ``{date: keywords}`` extracted from each day's titles.

        Side effect: writes the "日期" column into ``self.df``. Rows whose
        publish time cannot be parsed get NaT and are left out of the groups.
        """
        self._add_date_column()
        daily = {}
        for date, group in self.df.groupby("日期"):
            text = " ".join(group["标题"].fillna(""))
            daily[date] = self.extract_keywords_tfidf(text, top_k=top_k)
        return daily
3.2 简易情感分析(以下两个方法属于 NewsAnalyzer 类,接在 3.1 的类定义之后)
python
def sentiment_analysis(self, text):
    """Classify ``text`` with a small positive/negative word lexicon.

    The text is tokenised with jieba and reduced to its set of distinct
    tokens. The score is (distinct positive hits - distinct negative hits)
    divided by the number of distinct tokens.

    Returns:
        A ``(label, score)`` tuple: "正面" with a positive score, "负面" with
        a negative score, or "中性" with 0.0 when the hit counts are equal.
    """
    positive_words = {"好", "优", "强", "增长", "突破", "创新", "领先",
                      "提升", "进步", "成功", "发展", "繁荣", "利好", "上涨"}
    negative_words = {"差", "弱", "下降", "亏损", "风险", "危机", "衰退",
                      "失败", "问题", "困难", "损失", "下跌", "违规", "处罚"}

    tokens = set(jieba.cut(text))
    # Guard against division by zero for empty input.
    denominator = max(len(tokens), 1)
    balance = len(tokens & positive_words) - len(tokens & negative_words)

    if balance == 0:
        return "中性", 0.0
    label = "正面" if balance > 0 else "负面"
    return label, balance / denominator
def analyze_all_sentiment(self):
    """Score every article and store the result in ``self.df``.

    Each row's title and summary are joined with a space and passed to
    ``self.sentiment_analysis``. The resulting label and score are written to
    the "情感" and "情感分数" columns, row by row in the frame's own order, so
    the frame's index does not have to be 0..n-1. Calling the method again
    overwrites those two columns instead of duplicating them.

    Returns:
        The updated frame (a new DataFrame object bound to ``self.df``).
    """
    def _as_text(value):
        # Missing cells (None/NaN) must not leak the literal "nan" into the text.
        return "" if pd.isna(value) else str(value)

    titles = self.df["标题"]
    if "摘要" in self.df.columns:
        summaries = self.df["摘要"]
    else:
        summaries = [""] * len(self.df)

    labels = []
    scores = []
    for title, summary in zip(titles, summaries):
        label, score = self.sentiment_analysis(f"{_as_text(title)} {_as_text(summary)}")
        labels.append(label)
        scores.append(score)

    # Build the columns on the frame's own index and with explicit dtypes so
    # an empty frame still gets well-typed columns.
    self.df = self.df.assign(**{
        "情感": pd.Series(labels, index=self.df.index, dtype=object),
        "情感分数": pd.Series(scores, index=self.df.index, dtype=float),
    })

    # Summary statistics
    print("\n=== 情感分布 ===")
    print(self.df["情感"].value_counts())
    print(f"\n平均情感分数: {self.df['情感分数'].mean():.3f}")
    return self.df
四、可视化
python
# visualizer.py
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from wordcloud import WordCloud
# Ordered list of CJK-capable font families: matplotlib uses the first one it
# can find. SimHei stays first so machines that have it render as before; the
# remaining names are fallbacks for systems without SimHei.
plt.rcParams["font.sans-serif"] = [
    "SimHei",
    "Microsoft YaHei",
    "PingFang SC",
    "Heiti TC",
    "WenQuanYi Micro Hei",
    "Noto Sans CJK SC",
    "Arial Unicode MS",
    "DejaVu Sans",
]
# Draw the minus sign as an ASCII hyphen; CJK fonts often lack U+2212.
plt.rcParams["axes.unicode_minus"] = False
class NewsVisualizer:
    """Charts for an analysed news DataFrame.

    Every plot method saves a PNG into the current working directory and then
    calls ``plt.show()``.
    """

    # Colors are tied to the sentiment label, not to the slice position:
    # value_counts() orders labels by frequency, so a positional color list
    # would paint whichever label is most common green.
    SENTIMENT_COLORS = {"正面": "#4CAF50", "中性": "#FFC107", "负面": "#F44336"}
    FALLBACK_COLOR = "#9E9E9E"  # used for any label not in the mapping

    def __init__(self, df):
        self.df = df

    @classmethod
    def _sentiment_colors(cls, labels):
        """Return one color per label, in the order the labels are given."""
        return [cls.SENTIMENT_COLORS.get(label, cls.FALLBACK_COLOR) for label in labels]

    def plot_keyword_frequency(self, keywords, top_k=20):
        """Draw a horizontal bar chart of the first ``top_k`` keywords.

        Args:
            keywords: Sequence of (word, weight) pairs, highest weight first.
            top_k: Maximum number of bars.

        Prints a notice and returns without plotting when there are no
        keywords. Saves ``news_keywords.png``.
        """
        top = list(keywords[:top_k])
        if not top:
            print("无关键词数据,跳过关键词频率图")
            return
        words, weights = zip(*top)
        positions = range(len(words))
        plt.figure(figsize=(10, 6))
        plt.barh(positions, weights,
                 color=plt.cm.viridis(np.linspace(0.3, 0.9, len(words))))
        plt.yticks(positions, words)
        plt.xlabel("TF-IDF权重")
        # The title reports the number of bars actually drawn.
        plt.title(f"新闻热门关键词TOP{len(words)}", fontsize=16, fontweight="bold")
        plt.gca().invert_yaxis()  # highest-weighted keyword at the top
        plt.tight_layout()
        plt.savefig("news_keywords.png", dpi=150, bbox_inches="tight")
        plt.show()

    def plot_sentiment_distribution(self):
        """Draw the sentiment pie chart and, when dates exist, the daily trend.

        Reads the "情感" column, plus "日期" and "情感分数" for the trend
        panel; the right-hand panel stays empty when "日期" is absent.
        Saves ``sentiment_analysis.png``.
        """
        fig, axes = plt.subplots(1, 2, figsize=(14, 6))
        # Pie chart
        counts = self.df["情感"].value_counts()
        axes[0].pie(counts.values, labels=counts.index, autopct="%1.1f%%",
                    colors=self._sentiment_colors(counts.index), startangle=90)
        axes[0].set_title("情感分布")
        # Timeline
        if "日期" in self.df.columns:
            daily_sentiment = self.df.groupby("日期")["情感分数"].mean()
            axes[1].plot(daily_sentiment.index, daily_sentiment.values, "o-",
                         color="#4472C4", linewidth=2)
            axes[1].axhline(0, color="gray", linestyle="--", alpha=0.5)
            axes[1].set_title("每日情感趋势")
            axes[1].set_ylabel("情感分数")
        plt.tight_layout()
        plt.savefig("sentiment_analysis.png", dpi=150, bbox_inches="tight")
        plt.show()

    def plot_wordcloud(self, keywords, font_path="C:/Windows/Fonts/simhei.ttf"):
        """Render a word cloud from (word, weight) pairs.

        Args:
            keywords: Iterable of (word, weight) pairs.
            font_path: Path to a font file able to render Chinese. The
                default is the Windows SimHei location; pass another path on
                other systems.

        Prints a notice and returns without plotting when there are no
        keywords. Saves ``news_wordcloud.png``.
        """
        frequencies = dict(keywords)
        if not frequencies:
            print("无关键词数据,跳过词云")
            return
        wc = WordCloud(
            font_path=font_path,
            width=1200, height=600,
            background_color="white",
            max_words=50,
        )
        wc.generate_from_frequencies(frequencies)
        plt.figure(figsize=(14, 7))
        plt.imshow(wc, interpolation="bilinear")
        plt.axis("off")
        plt.title("新闻关键词词云", fontsize=16, fontweight="bold")
        plt.tight_layout()
        plt.savefig("news_wordcloud.png", dpi=150, bbox_inches="tight")
        plt.show()

    def plot_source_distribution(self):
        """Draw a bar chart of article counts per source ("来源" column).

        Saves ``source_dist.png``.
        """
        source_counts = self.df["来源"].value_counts()
        plt.figure(figsize=(8, 5))
        source_counts.plot(kind="bar", color="#4472C4")
        plt.title("各来源新闻数量", fontsize=16, fontweight="bold")
        plt.xlabel("来源")
        plt.ylabel("数量")
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.savefig("source_dist.png", dpi=150, bbox_inches="tight")
        plt.show()
五、主程序
python
# main.py
from crawler import NewsCrawler
from analyzer import NewsAnalyzer
from visualizer import NewsVisualizer
def main():
    """Run the pipeline: crawl -> keyword and sentiment analysis -> charts.

    Exits early with a message when no article was collected.
    """
    print("=" * 50)
    print(" 新闻热点追踪与舆情分析系统")
    print("=" * 50)

    # 1. Crawl news
    print("\n📡 采集新闻数据...")
    crawler = NewsCrawler()
    # Example RSS sources
    rss_sources = [
        ("https://feeds.feedburner.com/tech_news", "科技资讯"),
        ("https://rss.example.com/finance", "财经新闻"),
    ]
    for url, name in rss_sources:
        crawler.crawl_rss(url, name)

    # Check the raw list first: de-duplicating by "ID" needs at least one
    # article to provide that column.
    if not crawler.articles:
        print("采集完成: 0条新闻")
        print("无数据,退出")
        return
    df = crawler.to_dataframe()
    print(f"采集完成: {len(df)}条新闻")
    if df.empty:
        print("无数据,退出")
        return

    # 2. Analysis
    print("\n📊 舆情分析...")
    analyzer = NewsAnalyzer(df)
    # Hot keywords
    hot_keywords = analyzer.get_hot_keywords(top_k=30)
    print("\n热门关键词TOP10:")
    for word, weight in hot_keywords[:10]:
        print(f" {word}: {weight:.4f}")
    # Daily keywords. This call also creates the "日期" column that the daily
    # sentiment trend chart groups by; without it that panel stays empty.
    daily_keywords = analyzer.get_daily_keywords(top_k=5)
    print("\n每日热门关键词:")
    for day, day_keywords in daily_keywords.items():
        print(f" {day}: " + "、".join(word for word, _ in day_keywords))
    # Sentiment
    df = analyzer.analyze_all_sentiment()

    # 3. Visualisation
    print("\n📈 生成可视化图表...")
    viz = NewsVisualizer(df)
    viz.plot_keyword_frequency(hot_keywords)
    viz.plot_sentiment_distribution()
    viz.plot_wordcloud(hot_keywords)
    viz.plot_source_distribution()
    print("\n✅ 分析完成")


if __name__ == "__main__":
    main()
六、知识卡
| 模块 | 说明 |
|---|---|
| NewsCrawler | 多源新闻爬虫(RSS+网页) |
| jieba.analyse | TF-IDF/TextRank关键词提取 |
| 情感词典 | 正面/负面词汇统计 |
| WordCloud | 词云可视化 |
| hashlib.md5 | 文章去重ID |
| groupby("日期") | 时间维度分析 |
七、课后作业
必做题:
- 搭建多源新闻爬虫
- 实现关键词提取和情感分析
- 生成关键词频率图
选做题:
- 使用snownlp做更精准的情感分析
- 添加定时采集和变化检测
有问题欢迎评论区留言,大家一起讨论!
标签:Python | 爬虫实战 | 新闻 | 舆情分析 | 情感分析 | 关键词提取 | 词云