微博一级评论爬虫

使用前需要将 get_cookies() 中的 cookies_list 替换成自己登录微博后的 cookies。

python 复制代码
import requests
import requests
from lxml import etree
import openpyxl
from concurrent.futures.thread import ThreadPoolExecutor
import re
from datetime import datetime, timedelta
from urllib import parse
from jsonpath import jsonpath
from datetime import datetime
import os
import csv
import time
import random
import logging
import colorlog





def log_init():
    """Set up the root logger with one colorized console handler.

    The root logger is switched to DEBUG level, every handler currently
    attached to it is detached, and a single ``StreamHandler`` using a
    ``colorlog.ColoredFormatter`` is installed in their place.

    Returns:
        logging.Logger: the configured root logger.
    """
    root = logging.getLogger()
    root.setLevel(logging.DEBUG)

    # Start from a clean slate so only the handler built below stays attached
    for existing in list(root.handlers):
        root.removeHandler(existing)

    # Per-level colors applied through the %(log_color)s placeholder
    level_colors = {
        'DEBUG': 'red',
        'INFO': 'yellow',
        'WARNING': 'green',
        'ERROR': 'cyan',
        'CRITICAL': 'red,bg_white',
    }
    pattern = '%(log_color)s%(asctime)s %(levelname)s [%(name)s] [%(filename)s:%(lineno)d] - %(message)s%(reset)s'

    console = logging.StreamHandler()
    console.setFormatter(
        colorlog.ColoredFormatter(pattern, log_colors=level_colors, style='%')
    )
    root.addHandler(console)

    return root


def get_cookies():
    """Return one cookie set picked at random from the configured pool.

    Returns:
        One element of ``cookies_list``. The callers pass it to
        ``requests.get(..., cookies=...)``, so each entry is presumably a
        dict of cookie names to values -- verify against your own data.

    Raises:
        RuntimeError: if ``cookies_list`` has not been filled in. Without
            this check ``random.choice`` fails with an unhelpful
            "Cannot choose from an empty sequence" IndexError.
    """
    # Fill this list with your own logged-in Weibo cookies before running.
    cookies_list = [

    ]
    if not cookies_list:
        raise RuntimeError(
            "cookies_list is empty: add at least one set of your own Weibo "
            "cookies in get_cookies() before running the crawler"
        )
    return random.choice(cookies_list)


def crawl(response):
    """Parse one search-result page, fetch each post's comments, print a summary.

    For every post node on the page the author name, text, date and the
    forward / comment / like counters are extracted. Posts whose comment
    counter is non-zero are handed to ``get_comments``. A summary list
    ``[name, date, forwards, comments, likes, text]`` is printed per post.

    Args:
        response: HTML text of a Weibo search-result page.
    """
    html = etree.HTML(response)
    nodes = html.xpath('//div[@action-type="feed_list_item"]')

    # Post ids and author uids are collected page-wide and matched to the
    # post nodes by position.
    # NOTE(review): this assumes both lists line up one-to-one with `nodes`
    # in document order -- confirm against the live page markup.
    mid_list = html.xpath("//div[@class='card-wrap']/@mid")
    hrefs = ",".join(html.xpath("//div[@class='avator']/a[@target='_blank']/@href"))
    uid_list = re.findall(r'//weibo\.com/(\d+)', hrefs)

    # enumerate() keeps the positional pairing intact even when an earlier
    # node failed to parse; a manually advanced counter would fall behind
    # and pair every later post with the wrong mid/uid.
    for index, node in enumerate(nodes):
        try:
            name = node.xpath('.//a[@class="name"]/text()')[0]

            # Prefer the expanded text; fall back to the truncated one
            content = node.xpath('.//p[@node-type="feed_list_content_full"]//text()')
            if not content:
                content = node.xpath('.//p[@node-type="feed_list_content"]//text()')
            text = ''.join(content).strip().replace('\u200b', '')

            date_str = node.xpath('.//div[@class="from"]/a[1]/text()')[0].strip()

            forwards = node.xpath('.//div[@class="card-act"]/ul/li[1]/a//text()')
            comments_counts = node.xpath('.//div[@class="card-act"]/ul/li[2]/a//text()')
            likes = node.xpath('.//div[@class="card-act"]/ul/li[3]/a//text()')

            # A button showing only its label (or nothing) means a count of zero
            forward_count = forwards[-1].strip()
            if forward_count in ('', '转发'):
                forward_count = '0'
            comment_count = comments_counts[-1].strip()
            if comment_count in ('', '评论'):
                comment_count = '0'

            # Only hit the comment API when the post actually has comments.
            # max_id is left at its default so paging starts from page one.
            if comment_count != '0':
                get_comments(mid_list[index], uid_list[index], text, name)

            like_count = likes[2].strip()
            if like_count in ('', '赞'):
                like_count = '0'

            result = [name, date_str, forward_count, comment_count, like_count, text]
            print(result)

        except Exception:
            # One malformed post must not abort the rest of the page; keep
            # the traceback so the failure can be diagnosed.
            logging.exception("failed to process post #%d on this page", index)


def _comment_row(item, content, name_au):
    """Flatten one comment object from the API response into an output row."""
    user = item.get('user') or {}

    # Any value other than "m" is reported as female, as before
    gender = "男" if user.get('gender') == "m" else '女'

    try:
        created = datetime.strptime(item.get('created_at', ''), "%a %b %d %H:%M:%S %z %Y")
        time_str = created.strftime("%Y-%m-%d %H:%M:%S")
    except (TypeError, ValueError):
        # Missing or unparsable timestamp: leave the cell empty
        time_str = ''

    # Column order follows the header row used by the save functions:
    # title, author, id, rootid, comment, commenter, gender, time,
    # followers, reply count, likes, IP, residence.
    # NOTE(review): `source` is placed in the IP column on the assumption
    # that it carries the IP-territory label -- verify against live data.
    return [
        content,
        name_au,
        str(item.get('id', '')),
        str(item.get('rootid', '')),
        item.get('text_raw', ''),
        user.get('screen_name', ''),
        gender,
        time_str,
        user.get('followers_count', ''),
        item.get('total_number', ''),
        item.get('like_counts', ''),
        item.get('source', ''),
        user.get('location', ''),
    ]


def get_comments(mid, uid, content, name_au, max_id=None):
    """Download all first-level comments of one post and store them as CSV rows.

    Pages are requested from the ``buildComments`` endpoint until the
    response carries no comments or no further ``max_id`` cursor. Every
    comment is written with ``save_data_to_csv`` and counted in the global
    ``num``.

    Args:
        mid: id of the post whose comments are fetched.
        uid: id of the post's author.
        content: text of the post, repeated on every row.
        name_au: screen name of the post's author, repeated on every row.
        max_id: paging cursor to resume from; ``None`` starts at the first page.
    """
    global num
    url = "https://weibo.com/ajax/statuses/buildComments"

    # Iterate instead of recursing so long comment threads cannot exhaust
    # the interpreter's recursion limit.
    while True:
        params = {
            'is_reload': '1',
            'id': mid,
            'is_show_bulletin': '2',
            'is_mix': '0',
            'count': '10',
            'uid': uid,
            'fetch_level': '0',
            'locale': 'zh-CN',
        }
        if max_id is not None:
            # Follow-up pages carry the cursor and use a larger page size
            params.update({'flow': '0', 'max_id': max_id, 'count': '20'})

        response = requests.get(url, headers=headers, cookies=get_cookies(), params=params).json()

        # Read fields from each comment object directly: recursive-descent
        # queries also pick up nested users/comments and drift out of step.
        comments = response.get('data') or []
        for item in comments:
            data_list = _comment_row(item, content, name_au)
            save_data_to_csv(data_list)
            num += 1
            logging.info(f"{YELLOW}评论数 :{num} " + f"一级评论{data_list}")

        max_id = response.get('max_id', 0)
        # Stop on an empty page as well, otherwise a non-zero cursor paired
        # with no data would keep the loop running forever.
        if not comments or not max_id:
            break




def save_data_to_xlsx(data):
    """Append one row to the keyword's Excel workbook, creating it if needed.

    The workbook lives at ``李佳琪/<key_word>.xlsx``. A new workbook gets
    the header row first; an existing one is loaded and extended.

    Args:
        data: list of cell values for the new row.
    """
    filename = f'李佳琪/{key_word}.xlsx'
    name_headers = ['标题', '发布者', 'id', 'rootid', '内容', '评论者名称' , ' 性别' , '时间', '粉丝数','评论数','点赞数', 'IP', '居住地',]

    # The target directory is not guaranteed to exist; saving would fail
    # on the first call without it.
    os.makedirs(os.path.dirname(filename), exist_ok=True)

    if os.path.exists(filename):
        workbook = openpyxl.load_workbook(filename)
        sheet = workbook.active
    else:
        workbook = openpyxl.Workbook()
        sheet = workbook.active
        # A fresh workbook starts with the header row
        sheet.append(name_headers)

    sheet.append(data)
    workbook.save(filename)


def save_data_to_csv(data_list):
    """Append one row to ``<key_word>.csv``, writing the header on first use.

    Args:
        data_list: list of cell values for the new row, in header order.
    """
    global key_word
    filename = f'{key_word}.csv'

    name_headers = ['标题', '发布者', 'id', 'rootid', '内容', '评论者名称', ' 性别', '时间', '粉丝数', '评论数', '点赞数', 'IP', '居住地', ]

    # A missing or empty file still needs its header row
    is_new = not os.path.isfile(filename) or os.path.getsize(filename) == 0

    # The BOM written by utf-8-sig belongs only at the very start of the file
    encoding = 'utf-8-sig' if is_new else 'utf-8'

    with open(filename, 'a', encoding=encoding, newline='') as f:
        writer = csv.writer(f)
        if is_new:
            writer.writerow(name_headers)
        # Always write the data row, including on the call that creates the
        # file; previously that first row was lost.
        writer.writerow(data_list)

def run():
    """Walk the search-result pages for ``key_word`` and crawl each one.

    Pages 1 through ``page`` (inclusive) are requested, sorted by hotness
    and limited to the ``start_time`` / ``end_time`` window. The loop stops
    early once the site reports that no results were found.
    """
    url = "https://s.weibo.com/weibo"
    # `page` is the number of pages to fetch, so the upper bound is inclusive
    for i in range(1, page + 1):
        params = {
            "q": f'{key_word}',
            "page": f"{i}",
            "xsort": "hot",
            "suball": "1",
            # NOTE(review): the quotes around the dates are sent literally --
            # confirm the site accepts this form of the timescope value.
            "timescope": f"custom:'{start_time}':'{end_time}'",
            "Refer": "g",
        }
        resp = requests.get(url, headers=headers, cookies=get_cookies(), params=params)
        response = resp.text

        xml = etree.HTML(response)
        err_msg = ",".join(xml.xpath("//div[@class='card card-no-result s-pt20b40']/p/text()"))
        if '抱歉,未找到相关结果。' in err_msg:
            break

        # Reuse the response already received instead of sending the same
        # request a second time just to learn its final URL.
        print(resp.url)
        crawl(response)


if __name__ == '__main__':
    # Search settings read by run() and the save functions
    key_word = '#邯郸初中生被害案3人被刑事追诉#'
    start_time, end_time = '2024-4-1-1', '2024-4-20-1'
    page = 10  # number of pages

    # ANSI escape sequences: red, white, yellow
    RED, WHITE, YELLOW = '\x1b[31m', '\x1b[37m', '\x1b[33m'

    # Running total of saved comments, incremented by get_comments()
    num = 0

    # Request headers shared by every HTTP call
    headers = {
        "accept": (
            "text/html,application/xhtml+xml,application/xml;q=0.9,"
            "image/avif,image/webp,image/apng,*/*;q=0.8,"
            "application/signed-exchange;v=b3;q=0.7"
        ),
        "accept-language": "zh-CN,zh;q=0.9",
        "cache-control": "no-cache",
        "pragma": "no-cache",
        "priority": "u=0, i",
        "referer": "https://weibo.com/",
        "sec-ch-ua": '"Chromium";v="124", "Google Chrome";v="124", "Not-A.Brand";v="99"',
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": '"Windows"',
        "sec-fetch-dest": "document",
        "sec-fetch-mode": "navigate",
        "sec-fetch-site": "same-site",
        "sec-fetch-user": "?1",
        "upgrade-insecure-requests": "1",
        "user-agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/124.0.0.0 Safari/537.36"
        ),
    }

    log_init()
    run()
相关推荐
暮雪倾风10 分钟前
【WPF开发】超级详细的“文件选择”(附带示例工程)
windows·wpf
大神薯条老师1 小时前
Python从入门到高手5.1节-Python简单数据类型
爬虫·python·深度学习·机器学习·数据分析
何中应2 小时前
如何使用CMD命令启动应用程序(二)
windows·桌面应用·batch命令
sukalot3 小时前
windows C++-使用任务和 XML HTTP 请求进行连接(一)
c++·windows
ぃ扶摇ぅ4 小时前
Windows系统编程(三)进程与线程二
c++·windows
镜花照无眠5 小时前
Python爬虫使用实例-mdrama
开发语言·爬虫·python
weixin_419349795 小时前
windows上安装python环境
windows
天上掉下来个程小白6 小时前
Stream流的中间方法
java·开发语言·windows
暮雪倾风6 小时前
【WPF开发】控件介绍-Grid(网格布局)
windows·wpf
sukalot8 小时前
windows C++-windows C++-使用任务和 XML HTTP 请求进行连接(二)
c++·windows