微博一级评论爬虫

运行前需要在 get_cookies() 的 cookies_list 中填入自己的微博 cookies(列表为空时程序无法运行)。

python 复制代码
import requests
import requests
from lxml import etree
import openpyxl
from concurrent.futures.thread import ThreadPoolExecutor
import re
from datetime import datetime, timedelta
from urllib import parse
from jsonpath import jsonpath
from datetime import datetime
import os
import csv
import time
import random
import logging
import colorlog





def log_init():
    """Set up the root logger with one colourised console handler.

    The root logger is switched to DEBUG level, every handler currently
    attached to it is detached, and a single ``StreamHandler`` using a
    ``colorlog.ColoredFormatter`` is installed.

    Returns:
        logging.Logger: The configured root logger.
    """
    root = logging.getLogger()
    root.setLevel(logging.DEBUG)

    # Detach whatever was attached before so only our handler remains.
    for existing in list(root.handlers):
        root.removeHandler(existing)

    # Per-level colours consumed by the %(log_color)s placeholder below.
    level_colors = {
        'DEBUG': 'red',
        'INFO': 'yellow',
        'WARNING': 'green',
        'ERROR': 'cyan',
        'CRITICAL': 'red,bg_white',
    }
    layout = '%(log_color)s%(asctime)s %(levelname)s [%(name)s] [%(filename)s:%(lineno)d] - %(message)s%(reset)s'

    console = logging.StreamHandler()
    console.setFormatter(
        colorlog.ColoredFormatter(layout, log_colors=level_colors, style='%')
    )
    root.addHandler(console)

    return root


def get_cookies():
    """Return one cookie set chosen at random from the configured pool.

    ``cookies_list`` must be filled in by the user with their own Weibo
    cookies; the chosen entry is passed as ``cookies=`` to ``requests.get``
    by the callers in this file.

    Returns:
        One element of ``cookies_list``.

    Raises:
        IndexError: If ``cookies_list`` is still empty.
    """
    cookies_list = [

    ]
    if not cookies_list:
        # random.choice would raise IndexError too, but with a message that
        # does not tell the user what to fix.
        raise IndexError(
            'cookies_list is empty: add at least one of your own Weibo '
            'cookie sets in get_cookies() before running the crawler'
        )
    return random.choice(cookies_list)


def crawl(response):
    """Parse one search-result page, print each post and fetch its comments.

    For every post node the author name, date text, repost / comment / like
    counts and post text are extracted. When the post reports a non-zero
    comment count, ``get_comments`` is called to download its comments.
    A failure on one post is logged and does not stop the remaining posts.

    Args:
        response: HTML text of a search-result page.
    """
    html = etree.HTML(response)
    nodes = html.xpath('//div[@action-type="feed_list_item"]')

    # Page-wide id lists, looked up by post position below.
    # NOTE(review): this assumes both lists are in the same order and of the
    # same length as ``nodes`` -- confirm against the real page markup.
    mid_list = html.xpath("//div[@class='card-wrap']/@mid")
    hrefs = ",".join(html.xpath("//div[@class='avator']/a[@target='_blank']/@href"))
    uid_list = re.findall(r'//weibo\.com/(\d+)', hrefs)

    # enumerate keeps the index aligned with the node even when an earlier
    # post failed to parse (a manually bumped counter would fall behind).
    for index, node in enumerate(nodes):
        try:
            name = node.xpath('.//a[@class="name"]/text()')[0]

            # Prefer the expanded text; fall back to the collapsed one.
            content = (
                node.xpath('.//p[@node-type="feed_list_content_full"]//text()')
                or node.xpath('.//p[@node-type="feed_list_content"]//text()')
            )
            text = ''.join(content).strip().replace('\u200b', '')

            date_str = node.xpath('.//div[@class="from"]/a[1]/text()')[0].strip()

            forwards = node.xpath('.//div[@class="card-act"]/ul/li[1]/a//text()')
            comments_counts = node.xpath('.//div[@class="card-act"]/ul/li[2]/a//text()')
            likes = node.xpath('.//div[@class="card-act"]/ul/li[3]/a//text()')

            # A button showing only its label (or nothing) means a zero count.
            forward_count = forwards[-1].strip()
            if forward_count in ('', '转发'):
                forward_count = '0'
            comment_count = comments_counts[-1].strip()
            if comment_count in ('', '评论'):
                comment_count = '0'

            # Only hit the comment API when there is something to download.
            if comment_count != '0':
                # max_id is deliberately left at its default so the first
                # request asks for the first page of comments.
                get_comments(mid_list[index], uid_list[index], text, name)

            like_count = likes[2].strip()
            if like_count in ('', '赞'):
                like_count = '0'

            result = [name, date_str, forward_count, comment_count, like_count, text]
            print(result)

        except Exception:
            # Best effort per post: record the failure and move on.
            logging.exception('failed to process post at position %d', index)


def get_comments(mid, uid, content, name_au, max_id=None):
    """Download the first-level comments of one post and append them to CSV.

    Pages through ``https://weibo.com/ajax/statuses/buildComments`` using the
    ``max_id`` cursor returned by each response, writes one row per comment
    via ``save_data_to_csv`` and bumps the module-level ``num`` counter.
    Paging stops when a page has no comments or the returned cursor is 0 or
    missing.

    Args:
        mid: Post id, sent as the ``id`` query parameter.
        uid: Author id, sent as the ``uid`` query parameter.
        content: Post text, stored in the first column of every row.
        name_au: Post author name, stored in the second column of every row.
        max_id: Cursor to resume from; ``None`` requests the first page.

    Raises:
        requests.RequestException: On network failure or timeout.
        ValueError: If a response body is not valid JSON.
    """
    global num
    url = "https://weibo.com/ajax/statuses/buildComments"

    # Iterate instead of recursing once per page, so long comment threads
    # cannot exhaust the recursion limit.
    while True:
        if max_id is None:
            params = {
                'is_reload': '1',
                'id': mid,
                'is_show_bulletin': '2',
                'is_mix': '0',
                'count': '10',
                'uid': uid,
                'fetch_level': '0',
                'locale': 'zh-CN',
            }
        else:
            params = {
                'flow': '0',
                'is_reload': '1',
                'id': mid,
                'is_show_bulletin': '2',
                'is_mix': '0',
                'max_id': max_id,
                'count': '20',
                'uid': uid,
                'fetch_level': '0',
                'locale': 'zh-CN',
            }

        # A timeout keeps a stalled connection from hanging the crawl.
        response = requests.get(url, headers=headers, cookies=get_cookies(),
                                params=params, timeout=10).json()
        comments = response.get('data') or []

        # Read every field from the comment object itself so that values of
        # nested objects can never shift one column against another.
        for item in comments:
            user = item.get('user') or {}

            gender = "男" if user.get('gender') == "m" else '女'

            try:
                dt = datetime.strptime(item.get('created_at'), "%a %b %d %H:%M:%S %z %Y")
                time_str = dt.strftime("%Y-%m-%d %H:%M:%S")
            except (TypeError, ValueError):
                # Missing or unexpectedly formatted timestamp.
                time_str = ''

            # Column order follows the header row written by save_data_to_csv.
            # NOTE(review): ``source`` is written to the 'IP' column, which
            # previously had no value (12 values for 13 headers) -- confirm
            # that this is the intended field.
            data_list = [
                content,
                name_au,
                str(item.get('id')),
                str(item.get('rootid')),
                item.get('text_raw'),
                user.get('screen_name'),
                gender,
                time_str,
                user.get('followers_count'),
                item.get('total_number'),
                item.get('like_counts'),
                item.get('source'),
                user.get('location'),
            ]

            save_data_to_csv(data_list)
            num += 1
            logging.info(f"{YELLOW}评论数 :{num} " + f"一级评论{data_list}")

        max_id = response.get('max_id')
        if not comments or not max_id:
            break




def save_data_to_xlsx(data):
    """Append one row to the per-keyword Excel workbook.

    The workbook lives at ``李佳琪/<key_word>.xlsx``. It is created together
    with its directory and a header row on first use; afterwards rows are
    appended to the active sheet.

    Args:
        data: Sequence of cell values forming one row.
    """
    filename = f'李佳琪/{key_word}.xlsx'
    name_headers = ['标题', '发布者', 'id', 'rootid', '内容', '评论者名称' , ' 性别' , '时间', '粉丝数','评论数','点赞数', 'IP', '居住地',]

    # workbook.save() does not create missing directories.
    os.makedirs(os.path.dirname(filename), exist_ok=True)

    if os.path.exists(filename):
        workbook = openpyxl.load_workbook(filename)
        sheet = workbook.active
    else:
        workbook = openpyxl.Workbook()
        sheet = workbook.active
        # A fresh workbook starts with the header row.
        sheet.append(name_headers)

    sheet.append(data)
    workbook.save(filename)


def save_data_to_csv(data_list, filename=None):
    """Append one comment row to a CSV file, writing the header when needed.

    Args:
        data_list: Sequence of values forming one row.
        filename: Target CSV path. Defaults to ``<key_word>.csv`` built from
            the module-level ``key_word``.
    """
    if filename is None:
        filename = f'{key_word}.csv'

    name_headers = ['标题', '发布者', 'id', 'rootid', '内容', '评论者名称', ' 性别', '时间', '粉丝数', '评论数', '点赞数', 'IP', '居住地', ]

    needs_header = not os.path.isfile(filename) or os.path.getsize(filename) == 0
    # The BOM emitted by utf-8-sig belongs only at the very start of the
    # file, so later appends use plain utf-8.
    encoding = 'utf-8-sig' if needs_header else 'utf-8'

    with open(filename, 'a', encoding=encoding, newline='') as f:
        writer = csv.writer(f)
        if needs_header:
            writer.writerow(name_headers)
        # The row is written on every call, including the one that creates
        # the file.
        writer.writerow(data_list)


def run():
    """Walk the search-result pages for ``key_word`` and crawl each one.

    Pages 1 through ``page`` (inclusive) of ``https://s.weibo.com/weibo`` are
    requested with the module-level ``key_word``, ``start_time`` and
    ``end_time``. The loop ends early when a page carries the "no results"
    notice; a page that fails to download is logged and skipped.
    """
    url = "https://s.weibo.com/weibo"
    # ``page`` is the number of pages wanted, so the upper bound is inclusive.
    for i in range(1, page + 1):
        params = {
            "q": f'{key_word}',
            "page": f"{i}",
            "xsort": "hot",
            "suball": "1",
            "timescope": f"custom:'{start_time}':'{end_time}'",
            "Refer": "g",
        }
        try:
            # A timeout keeps a stalled connection from hanging the crawl.
            resp = requests.get(url, headers=headers, cookies=get_cookies(),
                                params=params, timeout=10)
        except requests.RequestException:
            logging.exception('failed to download search page %d', i)
            continue

        page_html = resp.text
        xml = etree.HTML(page_html)
        err_msg = ",".join(xml.xpath("//div[@class='card card-no-result s-pt20b40']/p/text()"))
        if '抱歉,未找到相关结果。' in err_msg:
            break
        # The final URL is taken from the response already in hand rather
        # than by issuing the same request a second time.
        print(resp.url)
        crawl(page_html)


if __name__ == '__main__':
    # Search settings read by run().
    key_word = '#邯郸初中生被害案3人被刑事追诉#'
    start_time, end_time = '2024-4-1-1', '2024-4-20-1'
    page = 10  # number of pages

    # Running count of saved comments, incremented by get_comments().
    num = 0

    # ANSI colour escape sequences.
    RED, WHITE, YELLOW = '\033[31m', '\033[37m', '\033[33m'  # red, white, yellow

    # HTTP request headers sent with every request.
    headers = {
        "accept": (
            "text/html,application/xhtml+xml,application/xml;q=0.9,"
            "image/avif,image/webp,image/apng,*/*;q=0.8,"
            "application/signed-exchange;v=b3;q=0.7"
        ),
        "accept-language": "zh-CN,zh;q=0.9",
        "cache-control": "no-cache",
        "pragma": "no-cache",
        "priority": "u=0, i",
        "referer": "https://weibo.com/",
        "sec-ch-ua": '"Chromium";v="124", "Google Chrome";v="124", "Not-A.Brand";v="99"',
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": '"Windows"',
        "sec-fetch-dest": "document",
        "sec-fetch-mode": "navigate",
        "sec-fetch-site": "same-site",
        "sec-fetch-user": "?1",
        "upgrade-insecure-requests": "1",
        "user-agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/124.0.0.0 Safari/537.36"
        ),
    }

    log_init()
    run()
相关推荐
NPE~4 小时前
自动化工具 DrissionPage 保姆级教程(含 XPath 语法)
运维·后端·爬虫·自动化·网络爬虫·xpath·浏览器自动化
lucky67077 小时前
Windows 上彻底卸载 Node.js
windows·node.js
编程小白20267 小时前
从 C++ 基础到效率翻倍:Qt 开发环境搭建与Windows 神级快捷键指南
开发语言·c++·windows·qt·学习
凯子坚持 c9 小时前
CANN 性能剖析实战:从原始事件到交互式火焰图
windows·microsoft
开开心心就好9 小时前
发票合并打印工具,多页布局设置实时预览
linux·运维·服务器·windows·pdf·harmonyos·1024程序员节
獨枭9 小时前
PyCharm 跑通 SAM 全流程实战
windows
仙剑魔尊重楼10 小时前
音乐制作电子软件FL Studio2025.2.4.5242中文版新功能介绍
windows·音频·录屏·音乐·fl studio
喵手11 小时前
Python爬虫实战:电商价格监控系统 - 从定时任务到历史趋势分析的完整实战(附CSV导出 + SQLite持久化存储)!
爬虫·python·爬虫实战·零基础python爬虫教学·电商价格监控系统·从定时任务到历史趋势分析·采集结果sqlite存储
摘星|11 小时前
正则匹配与爬虫爬取图片路径综合练习
爬虫
PHP小志11 小时前
Windows 服务器怎么修改密码和用户名?账户被系统锁定如何解锁
windows