别再手动查热点了!200行Python代码搞定微博知乎头条等全网焦点,小白也能快速上手

以下是将所有平台整合后的代码文件,包含必要的依赖和配置说明:

python 复制代码
import requests
from bs4 import BeautifulSoup
import json
import re
import time
import random
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from urllib.parse import quote

# ================== Global configuration ==================
# Default HTTP request headers passed to the requests-based fetchers below:
# a desktop Chrome User-Agent string and a Chinese Accept-Language preference.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Accept-Language': 'zh-CN,zh;q=0.9'
}

# ================== 平台抓取函数 ==================

def fetch_weibo_hot():
    """Fetch the Weibo hot-search list.

    Returns:
        A list of ``{'title', 'url'}`` dicts, one per entry of the
        ``data.realtime`` array in the JSON response. ``url`` is a Weibo
        search link for the hot word. On any failure the error is printed
        and an empty list is returned.
    """
    try:
        url = 'https://weibo.com/ajax/side/hotSearch'
        response = requests.get(url, headers=HEADERS, timeout=10)
        data = response.json()['data']['realtime']
        # Percent-encode the keyword before putting it in the query string:
        # hot words are non-ASCII and may contain '#', '&' or spaces, and a
        # raw '#' would start a URL fragment and cut the query short.
        return [{
            'title': item['word'],
            'url': f"https://s.weibo.com/weibo?q={quote(item['word'])}"
        } for item in data]
    except Exception as e:
        print(f"微博热搜抓取失败: {str(e)}")
        return []

def fetch_zhihu_hot():
    """Fetch the Zhihu hot list from the billboard page.

    The page embeds its state as JSON inside the ``<script>`` tag whose id
    is ``js-initialData``; the hot list is read from
    ``initialState.topstory.hotList``.

    Returns:
        A list of ``{'title', 'url'}`` dicts. On any failure the error is
        printed and an empty list is returned.
    """
    try:
        page = requests.get('https://www.zhihu.com/billboard', headers=HEADERS, timeout=10)
        document = BeautifulSoup(page.text, 'html.parser')
        state_tag = document.find('script', id='js-initialData')
        state = json.loads(state_tag.string)

        entries = []
        for entry in state['initialState']['topstory']['hotList']:
            target = entry['target']
            entries.append({
                'title': target['titleArea']['text'],
                'url': target['link']['url'],
            })
        return entries
    except Exception as e:
        print(f"知乎热榜抓取失败: {str(e)}")
        return []

def fetch_baidu_hot():
    """Fetch the Baidu real-time hot-search board.

    Each ``div.category-wrap_iQLoo`` element on the board page is treated
    as one entry; its title comes from the nested
    ``div.c-single-text-ellipsis`` and its link from the first ``<a>``.

    Returns:
        A list of ``{'title', 'url'}`` dicts. On any failure the error is
        printed and an empty list is returned.
    """
    try:
        page = requests.get('https://top.baidu.com/board?tab=realtime', headers=HEADERS, timeout=10)
        document = BeautifulSoup(page.text, 'html.parser')

        entries = []
        for card in document.find_all('div', class_='category-wrap_iQLoo'):
            title_text = card.find('div', class_='c-single-text-ellipsis').text.strip()
            link = card.find('a')['href']
            entries.append({'title': title_text, 'url': link})
        return entries
    except Exception as e:
        print(f"百度热搜抓取失败: {str(e)}")
        return []

def fetch_weixin_hot():
    """Fetch WeChat hot articles via the Sogou WeChat search home page.

    Scans the ``#hotBox li`` items and keeps those containing a link whose
    href starts with ``http://mp.weixin.qq.com`` or
    ``https://mp.weixin.qq.com``.

    Returns:
        At most 10 ``{'title', 'url'}`` dicts. On any failure the error is
        printed and an empty list is returned.
    """
    try:
        url = 'https://weixin.sogou.com/'
        response = requests.get(url, headers=HEADERS, timeout=15)
        soup = BeautifulSoup(response.text, 'html.parser')
        # Accept both schemes and escape the dots so that only the literal
        # mp.weixin.qq.com host matches (an unescaped '.' matches any char).
        article_href = re.compile(r'^https?://mp\.weixin\.qq\.com')
        articles = []
        for item in soup.select('#hotBox li'):
            title_tag = item.find('a', href=article_href)
            if title_tag:
                articles.append({
                    'title': title_tag.text.strip(),
                    'url': title_tag['href']
                })
        return articles[:10]
    except Exception as e:
        print(f"微信热点抓取失败: {str(e)}")
        return []

def fetch_juejin_hot():
    """Fetch the Juejin hot article ranking.

    Sends a POST to the ``article_rank`` API and reads the first 20 items
    of the ``data`` array in the JSON response.

    Returns:
        A list of ``{'title', 'url'}`` dicts, where ``url`` is built from
        each item's ``content_id``. On any failure the error is printed and
        an empty list is returned.
    """
    try:
        api = 'https://api.juejin.cn/content_api/v1/content/article_rank?category_id=1&type=hot'
        reply = requests.post(api, headers=HEADERS, timeout=10)
        ranked = reply.json()['data'][:20]

        entries = []
        for record in ranked:
            entries.append({
                'title': record['content']['title'],
                'url': f"https://juejin.cn/post/{record['content']['content_id']}",
            })
        return entries
    except Exception as e:
        print(f"掘金热榜抓取失败: {str(e)}")
        return []

def fetch_csdn_hot():
    """Fetch the CSDN hot-rank list.

    Calls the hot-rank endpoint with the shared headers plus a CSDN
    ``Referer`` and reads the first 15 items of the ``data`` array.

    Returns:
        A list of ``{'title', 'url', 'heat'}`` dicts, where ``heat`` is the
        item's ``hotRankScore``. On any failure the error is printed and an
        empty list is returned.
    """
    try:
        endpoint = 'https://blog.csdn.net/phoenix/web/blog/hot-rank?page=0&pageSize=25'
        # Build a per-call header dict so the shared HEADERS is not mutated.
        request_headers = {**HEADERS, 'Referer': 'https://blog.csdn.net/'}
        reply = requests.get(endpoint, headers=request_headers, timeout=10)

        entries = []
        for record in reply.json()['data'][:15]:
            entries.append({
                'title': record['articleTitle'],
                'url': record['articleDetailUrl'],
                'heat': record['hotRankScore'],
            })
        return entries
    except Exception as e:
        print(f"CSDN热榜抓取失败: {str(e)}")
        return []

def fetch_toutiao_hot():
    """Fetch the Toutiao hot list with a headless Chrome session (Selenium).

    Opens the Toutiao home page, waits a fixed 5 seconds, then reads up to
    20 elements whose class contains ``hot-list-item``.

    Returns:
        A list of ``{'title', 'url'}`` dicts. On any failure the error is
        printed and an empty list is returned.
    """
    try:
        options = Options()
        options.add_argument("--headless")
        options.add_argument(f"user-agent={HEADERS['User-Agent']}")

        driver = webdriver.Chrome(options=options)
        try:
            driver.get("https://www.toutiao.com/")
            time.sleep(5)

            hot_items = driver.find_elements(By.XPATH, '//div[contains(@class,"hot-list-item")]')
            results = []
            for item in hot_items[:20]:
                results.append({
                    'title': item.find_element(By.CLASS_NAME, 'title').text,
                    'url': item.find_element(By.TAG_NAME, 'a').get_attribute('href')
                })
            return results
        finally:
            # Always shut the browser down, even when page loading or element
            # lookup raises; otherwise the headless Chrome process is leaked.
            driver.quit()
    except Exception as e:
        print(f"今日头条抓取失败: {str(e)}")
        return []

# ================== 核心逻辑 ==================

def save_to_json(data, filename='hot_data.json'):
    """Write ``data`` to ``filename`` as UTF-8 JSON.

    Non-ASCII characters are written as-is (not ``\\u`` escaped) and the
    output is indented by two spaces. An existing file is overwritten.
    """
    with open(filename, 'w', encoding='utf-8') as out:
        out.write(json.dumps(data, ensure_ascii=False, indent=2))

def main():
    """Run every platform fetcher in turn, save the results, print a summary.

    For each platform the fetcher is timed and its result stored under the
    platform name; a random 2-5 second pause follows each fetch. The
    collected data is written with ``save_to_json`` and the first three
    entries of every platform are printed.
    """
    fetchers = (
        ('微博热搜', fetch_weibo_hot),
        ('知乎热榜', fetch_zhihu_hot),
        ('百度热搜', fetch_baidu_hot),
        ('微信热点', fetch_weixin_hot),
        ('掘金热榜', fetch_juejin_hot),
        ('CSDN热榜', fetch_csdn_hot),
        ('今日头条', fetch_toutiao_hot),
    )

    all_data = {}
    for label, fetch in fetchers:
        print(f'\n====== 正在抓取 {label} ======')
        began = time.time()
        fetched = fetch()
        all_data[label] = fetched
        elapsed = time.time() - began
        print(f"完成!耗时{elapsed:.2f}秒,获取到{len(fetched)}条数据")
        # Random pause between platforms.
        time.sleep(random.randint(2, 5))

    # Persist everything that was collected.
    save_to_json(all_data)
    print('\n===== 数据已保存到 hot_data.json =====')

    # Print the top three entries per platform.
    print('\n===== 最新热榜摘要 =====')
    for label, entries in all_data.items():
        print(f'\n▶ {label}(共{len(entries)}条):')
        for rank, entry in enumerate(entries[:3], 1):
            print(f"  {rank}. {entry['title'][:30]}...")
            if 'heat' not in entry:
                print(f"     链接:{entry['url'][:50]}...")
            else:
                print(f"     热度:{entry['heat']} 链接:{entry['url'][:50]}...")

# Run the crawler only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()

使用说明

  1. 环境准备

    bash 复制代码
    # 安装依赖库(webdriver-manager 为可选项,上面的代码并未使用;定时任务示例还需额外安装 apscheduler)
    pip install requests beautifulsoup4 selenium webdriver-manager
  2. 浏览器驱动配置

    • 安装Chrome浏览器
    • 根据Chrome版本下载对应chromedriver(Selenium 4.6 及以上版本自带 Selenium Manager,可自动下载匹配的驱动,此时无需手动下载)
    • 将驱动文件放在系统PATH路径或代码同级目录
  3. 运行程序

    bash 复制代码
    python hotlist_crawler.py

功能扩展

添加新平台

python 复制代码
def fetch_新平台_hot():
    # 实现抓取逻辑
    return [{'title':..., 'url':...}]

# 更新platforms字典
platforms = {
    ...
    '新平台名称': fetch_新平台_hot
}

输出示例

控制台输出:

perl 复制代码
====== 正在抓取 微博热搜 ======
完成!耗时1.23秒,获取到50条数据

====== 正在抓取 知乎热榜 ======
完成!耗时2.15秒,获取到50条数据

...

===== 数据已保存到 hot_data.json =====

===== 最新热榜摘要 =====

▶ 微博热搜(共50条):
  1. #神舟十八号发射成功#...
     链接:https://s.weibo.com/weibo?q=%23%E7%A5%9E%E...
  2. #五一假期出行预测#...
     链接:https://s.weibo.com/weibo?q=%23%E4%BA%94%...

▶ 今日头条(共20条):
  1. 国际油价创年内新高...
     链接:https://www.toutiao.com/trending/723456...

注意事项

  1. 反爬策略

    • HEADERS中添加有效的Cookie
    • 使用代理IP池(推荐使用付费代理服务)
    python 复制代码
    # Send the request through a proxy. requests picks the proxy by the scheme
    # of the target URL, so an 'https' entry is needed for https:// URLs
    # (every platform URL in this script is https).
    response = requests.get(url, proxies={'http': 'http://ip:port', 'https': 'http://ip:port'})
  2. 性能优化

    python 复制代码
    # Run the fetchers concurrently (example).
    # as_completed must be imported alongside ThreadPoolExecutor; it yields
    # each future as soon as that future finishes.
    from concurrent.futures import ThreadPoolExecutor, as_completed

    with ThreadPoolExecutor(max_workers=3) as executor:
        # Map each submitted future back to its platform name.
        futures = {executor.submit(func): name for name, func in platforms.items()}
        for future in as_completed(futures):
            name = futures[future]
            all_data[name] = future.result()
  3. 定时任务

    python 复制代码
    # Run main() on a fixed interval using APScheduler.
    from apscheduler.schedulers.blocking import BlockingScheduler
    
    scheduler = BlockingScheduler()
    # Register the job with an 'interval' trigger of one hour.
    @scheduler.scheduled_job('interval', hours=1)
    def scheduled_job():
        main()
    scheduler.start()

建议根据实际需求调整请求频率和反爬策略,完整代码已包含主流平台的热榜抓取功能,可直接运行或二次开发。

相关推荐
T - mars5 小时前
爬虫:IP代理
爬虫
酱酱们的每日掘金6 小时前
一键连接 6000 + 应用dify MCP 插件指南、谷歌 AI 编程产品一网打尽、MCP玩出花了丨AI Coding 周刊第 4 期
前端·后端·ai编程·mcp
whoisi22227 小时前
用Trae做一个Roguelike爬塔游戏
人工智能·ai编程·trae
whoisi22227 小时前
用Cursor 做一个ARPG游戏
人工智能·ai编程·cursor
昊昊该干饭了8 小时前
数据采集爬虫三要素:User-Agent、随机延迟、代理ip
爬虫·网络协议·tcp/ip·网络爬虫
q567315238 小时前
利用Ruby的Typhoeus编写爬虫程序
开发语言·爬虫·scrapy·ruby
用户Taobaoapi201410 小时前
深入研究:微店商品列表API详解
大数据·爬虫·数据挖掘
opentrending11 小时前
Github 热点项目 Krillin AI一键横转竖+AI配音+AI精准字幕,短视频创作者必备神器,效率翻倍
人工智能·git·爬虫·github·邮箱
随行就市11 小时前
python爬虫
运维·服务器·爬虫
用户40993225021211 小时前
FastAPI依赖注入性能优化策略
后端·ai编程·trae