微博一级评论爬虫

cookies需要替换成自己的

python 复制代码
import requests
import requests
from lxml import etree
import openpyxl
from concurrent.futures.thread import ThreadPoolExecutor
import re
from datetime import datetime, timedelta
from urllib import parse
from jsonpath import jsonpath
from datetime import datetime
import os
import csv
import time
import random
import logging
import colorlog





def log_init():
    # 创建日志器
    logger = logging.getLogger()
    logger.setLevel(logging.DEBUG)

    # 创建控制台输出器
    sh = logging.StreamHandler()

    # 创建格式化器,使用colorlog设置颜色
    fmt = '%(log_color)s%(asctime)s %(levelname)s [%(name)s] [%(filename)s:%(lineno)d] - %(message)s%(reset)s'
    formatter = colorlog.ColoredFormatter(fmt,
                                          log_colors={
                                              'DEBUG': 'red',
                                              'INFO': 'yellow',
                                              'WARNING': 'green',
                                              'ERROR': 'cyan',
                                              'CRITICAL': 'red,bg_white',
                                          },
                                          style='%')

    # 把格式化器加入输出器
    sh.setFormatter(formatter)

    # 把处理器加入日志器
    logger.addHandler(sh)

    # 移除所有之前的处理器(如果有的话)
    for handler in logger.handlers[:]:
        logger.removeHandler(handler)

        # 添加新的处理器
    logger.addHandler(sh)

    return logger  # 返回配置好的logger实例


def get_cookies():
    cookies_list = [

    ]
    return random.choice(cookies_list)


def crawl(response):

    html = etree.HTML(response)
    nodes = html.xpath('//div[@action-type="feed_list_item"]')


    num = 0
    mid_list = html.xpath("//div[@class='card-wrap']/@mid")



    uid = ",".join(html.xpath("//div[@class='avator']/a[@target='_blank']/@href"))
    uid_list = re.findall(r'//weibo\.com/(\d+)' , uid)


    for node in nodes:
        try:
            name = node.xpath('.//a[@class="name"]/text()')[0]

            content = node.xpath('.//p[@node-type="feed_list_content_full"]//text()')

            if content == []:
                content = node.xpath('.//p[@node-type="feed_list_content"]//text()')
            # print(content)
            date_str = node.xpath('.//div[@class="from"]/a[1]/text()')[0].strip()

            forwards = node.xpath('.//div[@class="card-act"]/ul/li[1]/a//text()')
            comments_counts = node.xpath('.//div[@class="card-act"]/ul/li[2]/a//text()')
            # print(comments_counts[-1].strip())
            likes = node.xpath('.//div[@class="card-act"]/ul/li[3]/a//text()')
            if forwards[-1].strip() in ' 转发':
                forwards[-1] = '0'
            if comments_counts[-1].strip() in ' 评论':
                comments_counts[-1] = '0'

            if comments_counts[-1].strip():
                mid = mid_list[num]
                uid = uid_list[num]

                get_comments(mid, uid,''.join(content).strip().replace(
                          '\u200b', ''),name,key_word)


            num += 1

            if likes[2].strip() in '赞':
                likes[2] = '0'

            result = [name, date_str, forwards[-1].strip(), comments_counts[-1].strip(), likes[2].strip(),
                      ''.join(content).strip().replace(
                          '\u200b', '')]
            print(result)

        except Exception as f:
            print(f)


def get_comments(mid, uid,content,name_au,max_id=None):



    global num,headers
    url = "https://weibo.com/ajax/statuses/buildComments"

    # print(1)
    if max_id == None:
        params = {
            'is_reload': '1',
            'id': mid,
            'is_show_bulletin': '2',
            'is_mix': '0',
            'count': '10',
            'uid': uid,
            'fetch_level': '0',
            'locale': 'zh-CN',
        }
    else:
        params = {
            'flow': '0',
            'is_reload': '1',
            'id': mid,
            'is_show_bulletin': '2',
            'is_mix': '0',
            'max_id': max_id,
            'count': '20',
            'uid': uid,
            'fetch_level': '0',
            'locale': 'zh-CN',
        }


    response = requests.get(url, headers=headers, cookies=get_cookies(), params=params).json()

    if len(response['data']):

        comment_list = jsonpath(response, '$..data[*].text_raw')
        name_list = jsonpath(response, '$..data[*]..screen_name')
        time_str_list = jsonpath(response, '$..data[*].created_at')
        disable_reply_list = jsonpath(response, '$..data[*].disable_reply')  # 转发数

        idstr_list = jsonpath(response, '$..data[*].idstr')
        like_counts_list = jsonpath(response, '$..data[*].like_counts')  # 点赞数
        source_list = jsonpath(response, '$..data[*].source')  # 来源

        id_list = jsonpath(response, '$..data[*].id')
        rootid_list = jsonpath(response, '$..data[*].rootid')

        for i in range(0, len(comment_list)):
            comment = comment_list[i]
            name = name_list[i]

            followers_count = response['data'][i]['user']['followers_count']
            location = response['data'][i]['user']['location']
            total_number = response['data'][i]['total_number']

            gender = response['data'][i]['user']['gender']

            if gender == "m":
                gender = "男"
            else:
                gender = '女'

            try:
                time_str = time_str_list[i]
                dt = datetime.strptime(time_str, "%a %b %d %H:%M:%S %z %Y")
                # 格式化 datetime 对象
                time_str = dt.strftime("%Y-%m-%d %H:%M:%S")
            except:
                time_str = ''

            disable_reply = disable_reply_list[i]

            idstr = idstr_list[i]
            like_counts = like_counts_list[i]

            id = str(id_list[i])
            rootid = str(rootid_list[i])
            # ['标题', '发布者', 'id', 'rootid', '内容', 评论者名称' , '时间', '粉丝数','评论数','点赞数', 'IP', '居住地',]
            data_list = [content, name_au, id, rootid, comment, name, gender, time_str, followers_count, total_number,like_counts,location]

            save_data_to_csv(data_list)
            num += 1
            logging.info(f"{YELLOW}评论数 :{num} " + f"一级评论{data_list}")



    max_id = jsonpath(response, '$.max_id')[0]

    if max_id != 0 and response['data'] != '':
        get_comments(mid, uid, content, name_au, max_id)




def save_data_to_xlsx(data):
    filename = f'李佳琪/{key_word}.xlsx'
    name_headers = ['标题', '发布者', 'id', 'rootid', '内容', '评论者名称' , ' 性别' , '时间', '粉丝数','评论数','点赞数', 'IP', '居住地',]

    if os.path.exists(filename):
        workbook = openpyxl.load_workbook(filename)
        sheet = workbook.active
        sheet.append(data)
    else:
        workbook = openpyxl.Workbook()
        sheet = workbook.active
        # 添加表头
        sheet.append(name_headers)
        sheet.append(data)
    # 保存 Excel 文件
    workbook.save(filename)


def save_data_to_csv(data_list):
    global key_word
    filename = f'{key_word}.csv'

    name_headers = ['标题', '发布者', 'id', 'rootid', '内容', '评论者名称', ' 性别', '时间', '粉丝数', '评论数', '点赞数', 'IP', '居住地', ]

    if not os.path.isfile(filename):
        with open(f'{filename}', 'a', encoding='utf-8-sig', newline='')as f:
            csv_write = csv.DictWriter(f, fieldnames=name_headers)
            csv_write.writeheader()
    else:
        with open(f'{filename}', 'a', encoding='utf-8', newline='')as f:
            csv_write = csv.DictWriter(f, fieldnames=data_list)
            csv_write.writeheader()


def run():
    url = "https://s.weibo.com/weibo"
    for i in range(1, page):
        params = {
            "q": f'{key_word}',
            "page": f"{i}",
            "xsort": "hot",
            "suball": "1",
            "timescope": f"custom:'{start_time}':'{end_time}'",
            "Refer": "g",
        }
        response = requests.get(url, headers=headers, cookies=get_cookies(), params=params).text
        xml = etree.HTML(response)
        err_msg = ",".join(xml.xpath("//div[@class='card card-no-result s-pt20b40']/p/text()"))
        if '抱歉,未找到相关结果。' in err_msg:
            break
        print(requests.get(url, headers=headers, cookies=get_cookies(), params=params).url)
        crawl(response)


if __name__ == '__main__':
    RED = '\033[31m'  # 红色
    WHITE = '\033[37m'  # 白色
    YELLOW = '\033[33m'  # 黄色
    num = 0
    headers = {
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
        "accept-language": "zh-CN,zh;q=0.9",
        "cache-control": "no-cache",
        "pragma": "no-cache",
        "priority": "u=0, i",
        "referer": "https://weibo.com/",
        "sec-ch-ua": "\"Chromium\";v=\"124\", \"Google Chrome\";v=\"124\", \"Not-A.Brand\";v=\"99\"",
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": "\"Windows\"",
        "sec-fetch-dest": "document",
        "sec-fetch-mode": "navigate",
        "sec-fetch-site": "same-site",
        "sec-fetch-user": "?1",
        "upgrade-insecure-requests": "1",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
    }


    key_word = '#邯郸初中生被害案3人被刑事追诉#'
    start_time = '2024-4-1-1'
    end_time = '2024-4-20-1'
    page = 10  # 页数

    log_init()
    run()
相关推荐
周胡杰4 小时前
鸿蒙接入flutter环境变量配置windows-命令行或者手动配置-到项目的创建-运行demo项目
javascript·windows·flutter·华为·harmonyos·鸿蒙·鸿蒙系统
奋斗者1号8 小时前
Docker 部署 - Crawl4AI 文档 (v0.5.x)
人工智能·爬虫·机器学习
几道之旅8 小时前
分别在windows和linux上使用curl,有啥区别?
linux·运维·windows
一直奔跑在路上9 小时前
【Ansible】基于windows主机,采用NTLM+HTTPS 认证部署
windows·https·ansible
郭逍遥9 小时前
[工具]B站缓存工具箱 (By 郭逍遥)
windows·python·缓存·工具
x-cmd11 小时前
[250512] Node.js 24 发布:ClangCL 构建,升级 V8 引擎、集成 npm 11
前端·javascript·windows·npm·node.js
IT空门:门主11 小时前
本地的ip实现https访问-OpenSSL安装+ssl正式的生成(Windows 系统)
windows·https·ssl
安装虚拟机的老师傅11 小时前
【2025最新】Windows系统装VSCode搭建C/C++开发环境(附带所有安装包)
c语言·windows·vscode·其他
越甲八千15 小时前
windowsC++操作ADB
c++·windows·adb
九班长15 小时前
Mirror的多人连接管理及房间系统
windows