
Key optimizations:

Scheduling:
- The job now runs every 10 minutes instead of once an hour
- Request delay reduced from 1-3 seconds to 0.5-1.5 seconds
- Request timeout shortened from 10 seconds to 8 seconds

Performance:
- Ad entries are no longer processed, cutting unnecessary work
- Data structures were streamlined to reduce memory usage
- A cleanup routine automatically deletes data older than 7 days

User experience:
- The HTML page auto-refreshes every 5 minutes
- A manual refresh button was added
- Summary statistics are shown (total hot searches, number of time slots, etc.)
- Display on mobile devices was improved

Robustness:
- Exception handling was added
- Data storage and loading logic was tightened
- A cleanup mechanism keeps the dataset from growing without bound

Visual polish:
- A favicon was added
- The color scheme and layout were refined
- The responsive design was improved for mobile devices

This optimized version fetches the Weibo hot search list every 10 minutes and appends any new entries to that day's HTML file, while keeping deduplication. The page also auto-refreshes every 5 minutes, so readers always see the latest data.
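The core of the script is the `schedule` library polled inside a `while` loop. Below is a minimal runnable sketch of that scheduling skeleton; the real `job` and `cleanup_old_data` are defined in the full listing that follows, and both `requests` and `schedule` are third-party packages you need to install with pip first.

```python
import time
import schedule

def job():
    # placeholder for the full fetch-deduplicate-render step implemented below
    print("fetching Weibo hot searches...")

def cleanup_old_data():
    # placeholder for the 7-day cleanup implemented below
    print("cleaning up data older than 7 days...")

schedule.every(10).minutes.do(job)                     # data refresh every 10 minutes
schedule.every().day.at("00:00").do(cleanup_old_data)  # daily purge of old data

while True:
    schedule.run_pending()
    time.sleep(30)
```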
```python
import requests
import time
import os
import re
import json
import random
import shutil
import schedule
from datetime import datetime, timedelta
from urllib.parse import quote
def fetch_weibo_hot():
    """Fetch Weibo hot search data from the JSON API (avoids HTML structure changes)."""
    api_url = "https://weibo.com/ajax/side/hotSearch"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36',
        'Referer': 'https://weibo.com/',
        'Cookie': 'XSRF-TOKEN=RKnNBEaQBrp=='  # Replace with your own valid Cookie
    }
    try:
        # Random delay to reduce the risk of being rate-limited or blocked
        time.sleep(random.uniform(0.5, 1.5))
        response = requests.get(api_url, headers=headers, timeout=8)
        response.raise_for_status()
        # Parse the JSON payload
        data = response.json()
        # Extract the hot search entries
        hot_items = []
        for entry in data['data']['realtime']:
            # Keep only regular hot search entries (those carrying a 'word' field)
            if 'word' in entry:
                item = {
                    'rank': entry.get('rank', ''),
                    'title': entry['word'],
                    'hot': entry.get('num', '0'),
                    # URL-encode the keyword so titles with spaces or symbols still link correctly
                    'link': f"https://s.weibo.com/weibo?q={quote(entry['word'])}",
                    'timestamp': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                    'source': 'weibo'
                }
                hot_items.append(item)
        # Keep only the top 50 entries
        return hot_items[:50]
    except Exception as e:
        print(f"🚨 Failed to fetch hot search data: {e}")
        return []
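# Each item returned by fetch_weibo_hot() is a plain dict with these keys
# (values shown are illustrative):
# {'rank': 1, 'title': '...', 'hot': '1234567',
#  'link': 'https://s.weibo.com/weibo?q=...',
#  'timestamp': '2024-05-01 12:00:00', 'source': 'weibo'}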
def load_existing_hot_searches(date_str):
    """Load the hot search entries already stored for the given day."""
    json_file = os.path.join("weibo_hot", date_str, "hot_searches.json")
    if os.path.exists(json_file):
        try:
            with open(json_file, 'r', encoding='utf-8') as f:
                return json.load(f)
        except Exception as e:
            print(f"Failed to read existing hot search data: {e}")
            return []
    return []
def save_hot_data_json(hot_data, date_str):
    """Append new hot search entries to the day's JSON file and return the merged list."""
    daily_dir = os.path.join("weibo_hot", date_str)
    os.makedirs(daily_dir, exist_ok=True)
    json_file = os.path.join(daily_dir, "hot_searches.json")
    # Load what has already been stored today
    existing_data = load_existing_hot_searches(date_str)
    existing_titles = {item['title'] for item in existing_data}
    # Deduplicate: drop entries whose title was already seen today
    new_data = [item for item in hot_data if item['title'] not in existing_titles]
    # Merge and persist
    all_data = existing_data + new_data
    with open(json_file, 'w', encoding='utf-8') as f:
        json.dump(all_data, f, ensure_ascii=False, indent=2)
    return all_data, len(new_data)
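# Merge behavior (illustrative): if today's file already holds an entry titled "A"
# and a new fetch returns entries titled "A" and "B", only "B" is appended, and the
# function returns the merged two-entry list together with new_count == 1.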
def generate_html(hot_data, date_str):
    """Render the day's hot search data as a standalone HTML page."""
    if not hot_data:
        return "<html><body><h1>未获取到热搜数据</h1></body></html>"
    # Group entries by the hour they were captured in
    time_groups = {}
    for item in hot_data:
        time_key = item['timestamp'][:13]  # "YYYY-MM-DD HH"
        if time_key not in time_groups:
            time_groups[time_key] = []
        time_groups[time_key].append(item)
    # Newest hour first
    sorted_times = sorted(time_groups.keys(), reverse=True)
    # Summary statistics shown in the page header
    total_count = len(hot_data)
    time_count = len(time_groups)
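    # Note: the page template below is a Python f-string, so literal CSS/JS braces
    # are written doubled ({{ and }}) while single braces interpolate Python values.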
html_content = f"""
<!DOCTYPE html>
<html lang="zh-CN">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>微博热搜榜 {date_str}</title>
<link rel="icon" href="data:image/svg+xml,<svg xmlns=%22http://www.w3.org/2000/svg%22 viewBox=%220 0 100 100%22><text y=%22.9em%22 font-size=%2290%22>🔥</text></svg>">
<style>
* {{
box-sizing: border-box;
margin: 0;
padding: 0;
}}
body {{
font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, "PingFang SC", "Microsoft YaHei", sans-serif;
background-color: #f5f8fa;
color: #14171a;
line-height: 1.5;
padding: 20px;
max-width: 1200px;
margin: 0 auto;
}}
.container {{
background: white;
border-radius: 16px;
box-shadow: 0 3px 10px rgba(0, 0, 0, 0.08);
overflow: hidden;
margin-bottom: 30px;
}}
.header {{
background: linear-gradient(135deg, #ff6b6b, #5e72eb);
color: white;
padding: 25px 30px;
position: relative;
}}
.title {{
font-size: 28px;
font-weight: 700;
margin-bottom: 5px;
}}
.subtitle {{
font-size: 16px;
opacity: 0.9;
}}
.stats {{
margin-top: 15px;
display: flex;
gap: 15px;
flex-wrap: wrap;
}}
.stat-item {{
background: rgba(255, 255, 255, 0.2);
padding: 8px 15px;
border-radius: 20px;
font-size: 14px;
}}
.update-time {{
position: absolute;
top: 25px;
right: 30px;
background: rgba(0, 0, 0, 0.15);
padding: 5px 12px;
border-radius: 20px;
font-size: 14px;
}}
.time-section {{
margin: 20px 0;
padding: 15px;
background: #f8f9fa;
border-radius: 8px;
}}
.time-header {{
font-size: 18px;
font-weight: 600;
margin-bottom: 15px;
color: #495057;
display: flex;
align-items: center;
justify-content: space-between;
}}
.time-info {{
display: flex;
align-items: center;
}}
.time-count {{
background: #e9ecef;
padding: 2px 8px;
border-radius: 10px;
font-size: 14px;
margin-left: 10px;
}}
.hot-list {{
padding: 0;
}}
.hot-item {{
display: flex;
align-items: center;
padding: 16px 20px;
border-bottom: 1px solid #e6ecf0;
transition: background 0.2s;
}}
.hot-item:hover {{
background-color: #f7f9fa;
}}
.rank {{
width: 36px;
height: 36px;
line-height: 36px;
text-align: center;
font-weight: bold;
font-size: 16px;
background: #f0f2f5;
border-radius: 8px;
margin-right: 15px;
flex-shrink: 0;
}}
.top1 {{
background: linear-gradient(135deg, #ff9a9e, #fad0c4);
color: #d63031;
}}
.top2 {{
background: linear-gradient(135deg, #a1c4fd, #c2e9fb);
color: #0984e3;
}}
.top3 {{
background: linear-gradient(135deg, #ffecd2, #fcb69f);
color: #e17055;
}}
.hot-content {{
flex: 1;
min-width: 0;
}}
.hot-title {{
font-size: 17px;
font-weight: 500;
margin-bottom: 6px;
white-space: nowrap;
overflow: hidden;
text-overflow: ellipsis;
}}
.hot-stats {{
display: flex;
align-items: center;
color: #657786;
font-size: 14px;
gap: 15px;
flex-wrap: wrap;
}}
.hot-value {{
color: #ff6b6b;
font-weight: 700;
}}
.hot-time {{
font-size: 12px;
color: #868e96;
}}
.link {{
color: #1da1f2;
text-decoration: none;
transition: color 0.2s;
}}
.link:hover {{
color: #0d8bda;
text-decoration: underline;
}}
.footer {{
text-align: center;
padding: 20px;
color: #657786;
font-size: 13px;
border-top: 1px solid #e6ecf0;
}}
.no-data {{
text-align: center;
padding: 40px;
color: #657786;
}}
.collapse-btn {{
background: #6c757d;
color: white;
border: none;
padding: 5px 10px;
border-radius: 4px;
cursor: pointer;
font-size: 12px;
}}
.collapse-btn:hover {{
background: #5a6268;
}}
.collapsed .hot-list {{
display: none;
}}
.auto-refresh {{
text-align: center;
margin: 20px 0;
}}
.refresh-btn {{
background: #28a745;
color: white;
border: none;
padding: 10px 20px;
border-radius: 5px;
cursor: pointer;
font-size: 14px;
}}
.refresh-btn:hover {{
background: #218838;
}}
@media (max-width: 768px) {{
body {{
padding: 10px;
}}
.header {{
padding: 20px 15px;
}}
.title {{
font-size: 22px;
}}
.update-time {{
position: static;
margin-top: 10px;
}}
.hot-item {{
padding: 14px 15px;
}}
.hot-title {{
font-size: 16px;
}}
.hot-stats {{
flex-direction: column;
align-items: flex-start;
gap: 5px;
}}
}}
</style>
</head>
<body>
<div class="container">
<div class="header">
<h1 class="title">微博热搜榜</h1>
<div class="subtitle">全天热点汇总 · 每10分钟更新</div>
<div class="stats">
<div class="stat-item">📊 总热搜数: {total_count}</div>
<div class="stat-item">🕒 时间段: {time_count}</div>
<div class="stat-item">⏰ 更新频率: 每10分钟</div>
</div>
<div class="update-time">最后更新: {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}</div>
</div>
"""
    # Append one section per captured hour, newest first
for time_key in sorted_times:
time_display = f"{time_key}:00"
time_data = time_groups[time_key]
html_content += f"""
<div class="time-section">
<div class="time-header">
<div class="time-info">
🕒 {time_display}
<span class="time-count">{len(time_data)} 条热搜</span>
</div>
<button class="collapse-btn" onclick="this.parentElement.parentElement.classList.toggle('collapsed')">折叠/展开</button>
</div>
<div class="hot-list">
"""
for item in time_data:
rank_class = ""
if item['rank'] == 1:
rank_class = "top1"
elif item['rank'] == 2:
rank_class = "top2"
elif item['rank'] == 3:
rank_class = "top3"
hot_value = f"<span class='hot-value'>🔥 {item['hot']}</span>" if item['hot'] else ""
html_content += f"""
<div class="hot-item">
<div class="rank {rank_class}">{item['rank']}</div>
<div class="hot-content">
<div class="hot-title">
<a href="{item['link']}" class="link" target="_blank">{item['title']}</a>
</div>
<div class="hot-stats">
{hot_value}
<span class="hot-time">📅 {item['timestamp']}</span>
</div>
</div>
</div>
"""
html_content += """
</div>
</div>
"""
html_content += f"""
<div class="auto-refresh">
<button class="refresh-btn" onclick="location.reload()">🔄 刷新页面</button>
<p>页面每5分钟自动刷新,也可手动刷新</p>
</div>
<div class="footer">
数据来源: 微博热搜 • 每10分钟自动更新 • 最后更新: {datetime.now().strftime("%Y-%m-%d %H:%M:%S")} • 仅供学习参考
</div>
</div>
<script>
// By default collapse every time section except the most recent one
document.addEventListener('DOMContentLoaded', function() {{
    var sections = document.querySelectorAll('.time-section');
    sections.forEach(function(section, index) {{
        if (index > 0) {{ // keep the newest time section expanded
            section.classList.add('collapsed');
        }}
    }});
    // Auto-refresh the page every 5 minutes
    setTimeout(function() {{
        location.reload();
    }}, 5 * 60 * 1000);
}});
</script>
</body>
</html>
"""
return html_content
def cleanup_old_data():
    """Delete stored data that is more than 7 days old."""
    try:
        now = datetime.now()
        cutoff_date = (now - timedelta(days=7)).strftime("%Y-%m-%d")
        weibo_hot_dir = "weibo_hot"
        if os.path.exists(weibo_hot_dir):
            for date_dir in os.listdir(weibo_hot_dir):
                # Only touch directories named like a date, e.g. "2024-05-01"
                if re.fullmatch(r"\d{4}-\d{2}-\d{2}", date_dir) and date_dir < cutoff_date:
                    dir_path = os.path.join(weibo_hot_dir, date_dir)
                    shutil.rmtree(dir_path)
                    print(f"🗑️ Removed expired data: {date_dir}")
    except Exception as e:
        print(f"Error while cleaning up old data: {e}")
def save_hot_data():
    """Fetch the latest hot searches and update the day's JSON and HTML files."""
    try:
        # Make sure the storage root exists
        os.makedirs("weibo_hot", exist_ok=True)
        # The current date determines which daily folder we write to
        now = datetime.now()
        date_str = now.strftime("%Y-%m-%d")
        print(f"🕒 Fetching hot searches at {now.strftime('%Y-%m-%d %H:%M:%S')}...")
        hot_data = fetch_weibo_hot()
        if hot_data:
            print(f"✅ Fetched {len(hot_data)} hot search entries")
            # Append to the day's JSON file and get back the merged list
            all_data, new_count = save_hot_data_json(hot_data, date_str)
            print(f"📊 {len(all_data)} entries stored for today, {new_count} of them new")
            # Regenerate the HTML page from the full day's data
            html_content = generate_html(all_data, date_str)
            html_file = os.path.join("weibo_hot", date_str, "index.html")
            with open(html_file, "w", encoding="utf-8") as f:
                f.write(html_content)
            print(f"💾 Saved to: {html_file}")
            # Old-data cleanup is scheduled separately in __main__ (daily at 00:00)
        else:
            print("⚠️ No hot search data fetched, skipping save")
    except Exception as e:
        print(f"❌ Error while saving data: {e}")
def job():
    """Scheduled task: fetch and persist the current hot searches."""
    print("\n" + "=" * 60)
    print(f"⏰ Running scheduled task: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    save_hot_data()
    print("=" * 60 + "\n")
if __name__ == "__main__":
print("🔥 微博热搜爬虫已启动 🔥")
print("⏳ 首次执行将立即运行,之后每10分钟执行一次")
print("💡 提示: 请确保已更新有效的Cookie")
print("="*60)
# 立即执行一次
job()
# 设置定时任务(每10分钟执行一次)
schedule.every(10).minutes.do(job)
# 每天凌晨清理一次旧数据
schedule.every().day.at("00:00").do(cleanup_old_data)
print("⏳ 程序运行中,按Ctrl+C退出...")
try:
# 保持程序运行
while True:
schedule.run_pending()
time.sleep(30) # 每30秒检查一次
except KeyboardInterrupt:
print("\n👋 程序已手动停止")