Weibo first-level comment crawler

The cookies need to be replaced with your own (they go into get_cookies() in the script below).
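
As a minimal sketch of the expected shape (the cookie names SUB/SUBP and every value below are placeholders, not real credentials), each entry of cookies_list inside get_cookies() can be a plain dict, which requests sends as-is via its cookies= parameter:

cookies_list = [
    # Placeholder values; copy the real name/value pairs from your browser's
    # developer tools while logged in to weibo.com
    {"SUB": "your_SUB_cookie_value", "SUBP": "your_SUBP_cookie_value"},
    # Add more dicts to rotate between several accounts
    {"SUB": "another_SUB_cookie_value", "SUBP": "another_SUBP_cookie_value"},
]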

Python code:
import requests
from lxml import etree
import openpyxl
import re
from datetime import datetime
from jsonpath import jsonpath
import os
import csv
import random
import logging
import colorlog


def log_init():
    # Create the root logger
    logger = logging.getLogger()
    logger.setLevel(logging.DEBUG)

    # Remove any previously attached handlers so messages are not duplicated
    for handler in logger.handlers[:]:
        logger.removeHandler(handler)

    # Create a console handler
    sh = logging.StreamHandler()

    # Create a formatter, using colorlog to colour each level
    fmt = '%(log_color)s%(asctime)s %(levelname)s [%(name)s] [%(filename)s:%(lineno)d] - %(message)s%(reset)s'
    formatter = colorlog.ColoredFormatter(fmt,
                                          log_colors={
                                              'DEBUG': 'red',
                                              'INFO': 'yellow',
                                              'WARNING': 'green',
                                              'ERROR': 'cyan',
                                              'CRITICAL': 'red,bg_white',
                                          },
                                          style='%')

    # Attach the formatter to the handler
    sh.setFormatter(formatter)

    # Attach the handler to the logger
    logger.addHandler(sh)

    return logger  # Return the configured logger instance


def get_cookies():
    # Fill this list with one or more cookie dicts (see the sketch above);
    # a random one is chosen for every request
    cookies_list = [

    ]
    return random.choice(cookies_list)


def crawl(response):
    html = etree.HTML(response)
    nodes = html.xpath('//div[@action-type="feed_list_item"]')

    num = 0
    # mid of every post on the page (needed by the comment API)
    mid_list = html.xpath("//div[@class='card-wrap']/@mid")

    # uid of every poster, extracted from the avatar links
    uid = ",".join(html.xpath("//div[@class='avator']/a[@target='_blank']/@href"))
    uid_list = re.findall(r'//weibo\.com/(\d+)', uid)

    for node in nodes:
        try:
            name = node.xpath('.//a[@class="name"]/text()')[0]

            # Prefer the expanded full text; fall back to the truncated text
            content = node.xpath('.//p[@node-type="feed_list_content_full"]//text()')
            if not content:
                content = node.xpath('.//p[@node-type="feed_list_content"]//text()')

            date_str = node.xpath('.//div[@class="from"]/a[1]/text()')[0].strip()

            forwards = node.xpath('.//div[@class="card-act"]/ul/li[1]/a//text()')
            comments_counts = node.xpath('.//div[@class="card-act"]/ul/li[2]/a//text()')
            likes = node.xpath('.//div[@class="card-act"]/ul/li[3]/a//text()')

            # When a counter shows only its label ("转发"/"评论"/"赞"), the count is zero
            if forwards[-1].strip() in ' 转发':
                forwards[-1] = '0'
            if comments_counts[-1].strip() in ' 评论':
                comments_counts[-1] = '0'
            if likes[2].strip() in '赞':
                likes[2] = '0'

            # Only fetch first-level comments for posts that actually have comments
            if comments_counts[-1].strip() != '0':
                mid = mid_list[num]
                uid = uid_list[num]
                get_comments(mid, uid,
                             ''.join(content).strip().replace('\u200b', ''),
                             name)

            num += 1

            result = [name, date_str, forwards[-1].strip(), comments_counts[-1].strip(), likes[2].strip(),
                      ''.join(content).strip().replace('\u200b', '')]
            print(result)

        except Exception as f:
            print(f)


def get_comments(mid, uid, content, name_au, max_id=None):
    global num, headers
    url = "https://weibo.com/ajax/statuses/buildComments"

    # The first page is requested without max_id; later pages pass the max_id
    # returned by the previous response
    if max_id is None:
        params = {
            'is_reload': '1',
            'id': mid,
            'is_show_bulletin': '2',
            'is_mix': '0',
            'count': '10',
            'uid': uid,
            'fetch_level': '0',
            'locale': 'zh-CN',
        }
    else:
        params = {
            'flow': '0',
            'is_reload': '1',
            'id': mid,
            'is_show_bulletin': '2',
            'is_mix': '0',
            'max_id': max_id,
            'count': '20',
            'uid': uid,
            'fetch_level': '0',
            'locale': 'zh-CN',
        }


    response = requests.get(url, headers=headers, cookies=get_cookies(), params=params).json()

    if len(response['data']):

        comment_list = jsonpath(response, '$..data[*].text_raw')
        name_list = jsonpath(response, '$..data[*]..screen_name')
        time_str_list = jsonpath(response, '$..data[*].created_at')
        disable_reply_list = jsonpath(response, '$..data[*].disable_reply')  # whether replying is disabled

        idstr_list = jsonpath(response, '$..data[*].idstr')
        like_counts_list = jsonpath(response, '$..data[*].like_counts')  # number of likes
        source_list = jsonpath(response, '$..data[*].source')  # source / client string

        id_list = jsonpath(response, '$..data[*].id')
        rootid_list = jsonpath(response, '$..data[*].rootid')

        for i in range(0, len(comment_list)):
            comment = comment_list[i]
            name = name_list[i]

            followers_count = response['data'][i]['user']['followers_count']
            location = response['data'][i]['user']['location']
            total_number = response['data'][i]['total_number']

            gender = response['data'][i]['user']['gender']

            if gender == "m":
                gender = "男"
            else:
                gender = '女'

            try:
                time_str = time_str_list[i]
                dt = datetime.strptime(time_str, "%a %b %d %H:%M:%S %z %Y")
                # Format the datetime object as "YYYY-MM-DD HH:MM:SS"
                time_str = dt.strftime("%Y-%m-%d %H:%M:%S")
            except (ValueError, TypeError):
                time_str = ''

            disable_reply = disable_reply_list[i]

            idstr = idstr_list[i]
            like_counts = like_counts_list[i]

            id = str(id_list[i])
            rootid = str(rootid_list[i])
            # [post text, poster, id, rootid, comment text, commenter name, gender,
            #  time, follower count, reply count, like count, location]
            data_list = [content, name_au, id, rootid, comment, name, gender, time_str,
                         followers_count, total_number, like_counts, location]

            save_data_to_csv(data_list)
            num += 1
            logging.info(f"{YELLOW}评论数 :{num} " + f"一级评论{data_list}")



    # Paginate: Weibo returns max_id = 0 when there are no more comment pages
    max_id = jsonpath(response, '$.max_id')[0]

    if max_id != 0 and response['data']:
        get_comments(mid, uid, content, name_au, max_id)




def save_data_to_xlsx(data):
    filename = f'李佳琪/{key_word}.xlsx'
    name_headers = ['标题', '发布者', 'id', 'rootid', '内容', '评论者名称', '性别', '时间',
                    '粉丝数', '评论数', '点赞数', '居住地']

    if os.path.exists(filename):
        workbook = openpyxl.load_workbook(filename)
        sheet = workbook.active
        sheet.append(data)
    else:
        workbook = openpyxl.Workbook()
        sheet = workbook.active
        # Write the header row when the workbook is first created
        sheet.append(name_headers)
        sheet.append(data)
    # Save the Excel file
    workbook.save(filename)


def save_data_to_csv(data_list):
    global key_word
    filename = f'{key_word}.csv'

    # One header per field in data_list
    name_headers = ['标题', '发布者', 'id', 'rootid', '内容', '评论者名称', '性别', '时间',
                    '粉丝数', '评论数', '点赞数', '居住地']

    # Write the header row only when the file is first created,
    # then always append the data row
    write_header = not os.path.isfile(filename)
    with open(filename, 'a', encoding='utf-8-sig', newline='') as f:
        writer = csv.writer(f)
        if write_header:
            writer.writerow(name_headers)
        writer.writerow(data_list)


def run():
    url = "https://s.weibo.com/weibo"
    for i in range(1, page + 1):
        params = {
            "q": f'{key_word}',
            "page": f"{i}",
            "xsort": "hot",
            "suball": "1",
            "timescope": f"custom:'{start_time}':'{end_time}'",
            "Refer": "g",
        }
        resp = requests.get(url, headers=headers, cookies=get_cookies(), params=params)
        response = resp.text
        xml = etree.HTML(response)
        # Stop when the search result page reports "no results found"
        err_msg = ",".join(xml.xpath("//div[@class='card card-no-result s-pt20b40']/p/text()"))
        if '抱歉,未找到相关结果。' in err_msg:
            break
        print(resp.url)
        crawl(response)


if __name__ == '__main__':
    RED = '\033[31m'     # red
    WHITE = '\033[37m'   # white
    YELLOW = '\033[33m'  # yellow
    num = 0  # running count of saved comments
    headers = {
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
        "accept-language": "zh-CN,zh;q=0.9",
        "cache-control": "no-cache",
        "pragma": "no-cache",
        "priority": "u=0, i",
        "referer": "https://weibo.com/",
        "sec-ch-ua": "\"Chromium\";v=\"124\", \"Google Chrome\";v=\"124\", \"Not-A.Brand\";v=\"99\"",
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": "\"Windows\"",
        "sec-fetch-dest": "document",
        "sec-fetch-mode": "navigate",
        "sec-fetch-site": "same-site",
        "sec-fetch-user": "?1",
        "upgrade-insecure-requests": "1",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
    }


    key_word = '#邯郸初中生被害案3人被刑事追诉#'  # search keyword / topic hashtag
    start_time = '2024-4-1-1'  # start of the custom search time range
    end_time = '2024-4-20-1'   # end of the custom search time range
    page = 10  # number of search result pages to crawl

    log_init()
    run()
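
Once the script has run, each first-level comment is appended as one row to a CSV file named after the keyword (here '#邯郸初中生被害案3人被刑事追诉#.csv') in the working directory, with the columns defined in save_data_to_csv.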