微博一级评论爬虫

cookies需要替换成自己的

python 复制代码
import requests
import requests
from lxml import etree
import openpyxl
from concurrent.futures.thread import ThreadPoolExecutor
import re
from datetime import datetime, timedelta
from urllib import parse
from jsonpath import jsonpath
from datetime import datetime
import os
import csv
import time
import random
import logging
import colorlog





def log_init():
    # 创建日志器
    logger = logging.getLogger()
    logger.setLevel(logging.DEBUG)

    # 创建控制台输出器
    sh = logging.StreamHandler()

    # 创建格式化器,使用colorlog设置颜色
    fmt = '%(log_color)s%(asctime)s %(levelname)s [%(name)s] [%(filename)s:%(lineno)d] - %(message)s%(reset)s'
    formatter = colorlog.ColoredFormatter(fmt,
                                          log_colors={
                                              'DEBUG': 'red',
                                              'INFO': 'yellow',
                                              'WARNING': 'green',
                                              'ERROR': 'cyan',
                                              'CRITICAL': 'red,bg_white',
                                          },
                                          style='%')

    # 把格式化器加入输出器
    sh.setFormatter(formatter)

    # 把处理器加入日志器
    logger.addHandler(sh)

    # 移除所有之前的处理器(如果有的话)
    for handler in logger.handlers[:]:
        logger.removeHandler(handler)

        # 添加新的处理器
    logger.addHandler(sh)

    return logger  # 返回配置好的logger实例


def get_cookies():
    cookies_list = [

    ]
    return random.choice(cookies_list)


def crawl(response):

    html = etree.HTML(response)
    nodes = html.xpath('//div[@action-type="feed_list_item"]')


    num = 0
    mid_list = html.xpath("//div[@class='card-wrap']/@mid")



    uid = ",".join(html.xpath("//div[@class='avator']/a[@target='_blank']/@href"))
    uid_list = re.findall(r'//weibo\.com/(\d+)' , uid)


    for node in nodes:
        try:
            name = node.xpath('.//a[@class="name"]/text()')[0]

            content = node.xpath('.//p[@node-type="feed_list_content_full"]//text()')

            if content == []:
                content = node.xpath('.//p[@node-type="feed_list_content"]//text()')
            # print(content)
            date_str = node.xpath('.//div[@class="from"]/a[1]/text()')[0].strip()

            forwards = node.xpath('.//div[@class="card-act"]/ul/li[1]/a//text()')
            comments_counts = node.xpath('.//div[@class="card-act"]/ul/li[2]/a//text()')
            # print(comments_counts[-1].strip())
            likes = node.xpath('.//div[@class="card-act"]/ul/li[3]/a//text()')
            if forwards[-1].strip() in ' 转发':
                forwards[-1] = '0'
            if comments_counts[-1].strip() in ' 评论':
                comments_counts[-1] = '0'

            if comments_counts[-1].strip():
                mid = mid_list[num]
                uid = uid_list[num]

                get_comments(mid, uid,''.join(content).strip().replace(
                          '\u200b', ''),name,key_word)


            num += 1

            if likes[2].strip() in '赞':
                likes[2] = '0'

            result = [name, date_str, forwards[-1].strip(), comments_counts[-1].strip(), likes[2].strip(),
                      ''.join(content).strip().replace(
                          '\u200b', '')]
            print(result)

        except Exception as f:
            print(f)


def get_comments(mid, uid,content,name_au,max_id=None):



    global num,headers
    url = "https://weibo.com/ajax/statuses/buildComments"

    # print(1)
    if max_id == None:
        params = {
            'is_reload': '1',
            'id': mid,
            'is_show_bulletin': '2',
            'is_mix': '0',
            'count': '10',
            'uid': uid,
            'fetch_level': '0',
            'locale': 'zh-CN',
        }
    else:
        params = {
            'flow': '0',
            'is_reload': '1',
            'id': mid,
            'is_show_bulletin': '2',
            'is_mix': '0',
            'max_id': max_id,
            'count': '20',
            'uid': uid,
            'fetch_level': '0',
            'locale': 'zh-CN',
        }


    response = requests.get(url, headers=headers, cookies=get_cookies(), params=params).json()

    if len(response['data']):

        comment_list = jsonpath(response, '$..data[*].text_raw')
        name_list = jsonpath(response, '$..data[*]..screen_name')
        time_str_list = jsonpath(response, '$..data[*].created_at')
        disable_reply_list = jsonpath(response, '$..data[*].disable_reply')  # 转发数

        idstr_list = jsonpath(response, '$..data[*].idstr')
        like_counts_list = jsonpath(response, '$..data[*].like_counts')  # 点赞数
        source_list = jsonpath(response, '$..data[*].source')  # 来源

        id_list = jsonpath(response, '$..data[*].id')
        rootid_list = jsonpath(response, '$..data[*].rootid')

        for i in range(0, len(comment_list)):
            comment = comment_list[i]
            name = name_list[i]

            followers_count = response['data'][i]['user']['followers_count']
            location = response['data'][i]['user']['location']
            total_number = response['data'][i]['total_number']

            gender = response['data'][i]['user']['gender']

            if gender == "m":
                gender = "男"
            else:
                gender = '女'

            try:
                time_str = time_str_list[i]
                dt = datetime.strptime(time_str, "%a %b %d %H:%M:%S %z %Y")
                # 格式化 datetime 对象
                time_str = dt.strftime("%Y-%m-%d %H:%M:%S")
            except:
                time_str = ''

            disable_reply = disable_reply_list[i]

            idstr = idstr_list[i]
            like_counts = like_counts_list[i]

            id = str(id_list[i])
            rootid = str(rootid_list[i])
            # ['标题', '发布者', 'id', 'rootid', '内容', 评论者名称' , '时间', '粉丝数','评论数','点赞数', 'IP', '居住地',]
            data_list = [content, name_au, id, rootid, comment, name, gender, time_str, followers_count, total_number,like_counts,location]

            save_data_to_csv(data_list)
            num += 1
            logging.info(f"{YELLOW}评论数 :{num} " + f"一级评论{data_list}")



    max_id = jsonpath(response, '$.max_id')[0]

    if max_id != 0 and response['data'] != '':
        get_comments(mid, uid, content, name_au, max_id)




def save_data_to_xlsx(data):
    filename = f'李佳琪/{key_word}.xlsx'
    name_headers = ['标题', '发布者', 'id', 'rootid', '内容', '评论者名称' , ' 性别' , '时间', '粉丝数','评论数','点赞数', 'IP', '居住地',]

    if os.path.exists(filename):
        workbook = openpyxl.load_workbook(filename)
        sheet = workbook.active
        sheet.append(data)
    else:
        workbook = openpyxl.Workbook()
        sheet = workbook.active
        # 添加表头
        sheet.append(name_headers)
        sheet.append(data)
    # 保存 Excel 文件
    workbook.save(filename)


def save_data_to_csv(data_list):
    global key_word
    filename = f'{key_word}.csv'

    name_headers = ['标题', '发布者', 'id', 'rootid', '内容', '评论者名称', ' 性别', '时间', '粉丝数', '评论数', '点赞数', 'IP', '居住地', ]

    if not os.path.isfile(filename):
        with open(f'{filename}', 'a', encoding='utf-8-sig', newline='')as f:
            csv_write = csv.DictWriter(f, fieldnames=name_headers)
            csv_write.writeheader()
    else:
        with open(f'{filename}', 'a', encoding='utf-8', newline='')as f:
            csv_write = csv.DictWriter(f, fieldnames=data_list)
            csv_write.writeheader()


def run():
    url = "https://s.weibo.com/weibo"
    for i in range(1, page):
        params = {
            "q": f'{key_word}',
            "page": f"{i}",
            "xsort": "hot",
            "suball": "1",
            "timescope": f"custom:'{start_time}':'{end_time}'",
            "Refer": "g",
        }
        response = requests.get(url, headers=headers, cookies=get_cookies(), params=params).text
        xml = etree.HTML(response)
        err_msg = ",".join(xml.xpath("//div[@class='card card-no-result s-pt20b40']/p/text()"))
        if '抱歉,未找到相关结果。' in err_msg:
            break
        print(requests.get(url, headers=headers, cookies=get_cookies(), params=params).url)
        crawl(response)


if __name__ == '__main__':
    RED = '\033[31m'  # 红色
    WHITE = '\033[37m'  # 白色
    YELLOW = '\033[33m'  # 黄色
    num = 0
    headers = {
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
        "accept-language": "zh-CN,zh;q=0.9",
        "cache-control": "no-cache",
        "pragma": "no-cache",
        "priority": "u=0, i",
        "referer": "https://weibo.com/",
        "sec-ch-ua": "\"Chromium\";v=\"124\", \"Google Chrome\";v=\"124\", \"Not-A.Brand\";v=\"99\"",
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": "\"Windows\"",
        "sec-fetch-dest": "document",
        "sec-fetch-mode": "navigate",
        "sec-fetch-site": "same-site",
        "sec-fetch-user": "?1",
        "upgrade-insecure-requests": "1",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
    }


    key_word = '#邯郸初中生被害案3人被刑事追诉#'
    start_time = '2024-4-1-1'
    end_time = '2024-4-20-1'
    page = 10  # 页数

    log_init()
    run()
相关推荐
YJlio18 分钟前
Windows Internals 读书笔记 10.3.3:Task Scheduler 架构详解
人工智能·windows·笔记·python·学习·chatgpt·架构
微软技术分享1 小时前
Windows平台下CUDA安装及llama.cpp使用教程
windows·llama
CHANG_THE_WORLD1 小时前
<Fluent Python > 2. 第二章:序列的数组
网络·windows·python
独自破碎E2 小时前
解决 Windows 虚拟内存迁移失败的全过程实录
windows
L1624762 小时前
临时拉高 CPU 利用率(防缩容)操作全总结(linux和windows系统)
linux·运维·windows
AI玫瑰助手2 小时前
Python基础:数据类型的转换(int/str/list等互转)
windows·python·list
Java陈序员2 小时前
牛马效率可视化!一款键鼠统计菜单栏应用!
windows·macos
Hungry_Shark2 小时前
Windows上Docker安装失败:DockerDesktop must beowned by an elevated account
windows·docker
书源丶2 小时前
三十二、Java集合(一)——Collection与List全家桶
java·windows·list
一个人旅程~2 小时前
Win旧版或win10部分版本如何解除260字符长路径名限制?
linux·windows·经验分享·电脑