python爬虫爬取中国国际招标有限公司

python爬虫爬取中国国际招标有限公司

python爬取数据存储到redis中

python 复制代码
#  -*- coding: utf-8 -*-
# 中国国际招标有限公司
import re
from datetime import datetime

import redis
import requests
from lxml import etree
from config import REDIS_IP, REDIS_PORT, REDIS_DB, REDIS_PASSWORD
from items.sql import MySQL


class Cntcitc:
    """Incremental crawler for tender notices published by CNTCITC
    (China National Technical Import & Export Corporation / 中国国际招标有限公司).

    For each search keyword it pages through the site's search results,
    stores every not-yet-seen notice into Redis (one hash per link), and
    remembers the newest link per keyword so the next run can stop early.
    """

    def __init__(self):
        # decode_responses=True so hget/get return str, not bytes.
        # NOTE: the deprecated `charset` alias was dropped; `encoding` is the
        # supported parameter in redis-py.
        self.redis = redis.Redis(host=REDIS_IP, port=REDIS_PORT, db=REDIS_DB, password=REDIS_PASSWORD,
                                 decode_responses=True, encoding='UTF-8')
        self.db = MySQL()
        self.db.connect()
        self.name = '中国国际招标有限公司'
        self.url = 'https://www.cntcitc.com.cn/searchPage.html'
        self.api_url = 'https://www.cntcitc.com.cn/search.html'
        self.today = datetime.today().strftime('%Y-%m-%d')
        # Per-day counter key; incremented once per stored/updated link.
        self.counter_key = f"cntcitc:counter:{self.today}"
        # Earliest publish date requested from the search API (was a
        # hard-coded literal inside get_data; lifted here so it can be tuned).
        self.start_time = '2024-06-18'
        # Request timeout (connect, read) in seconds — without it a stalled
        # server would hang the crawler forever.
        self.timeout = 30
        # Set to True when an already-seen link is reached; stops paging.
        self.overall_cycle = False
        self.headers = {
            "referer": self.url,
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36",
        }

    def get_data(self, key, page=1):
        """POST the search form for *key* / *page* and return the parsed page.

        Returns an lxml element tree, or None when the site reports that
        nothing matched ("未查询到相关内容").
        """
        payload = {
            'channelId': '-1',
            'key': key,
            'startTime': self.start_time,
            'endTime': '',
            'currentPage': page
        }
        # timeout added so a hung connection cannot block the whole run.
        con = requests.post(url=self.api_url, headers=self.headers, data=payload,
                            timeout=self.timeout).content.decode('utf8')
        html = etree.HTML(con)
        content_text = ''.join(html.xpath('/html/body/div/div/form/div[2]/ul/text()'))
        content = content_text.strip()
        print(f"key:{key},爬取内容:{content}")
        if content == "未查询到相关内容":
            return None
        return html

    def get_page(self, key):
        """Return the total number of result pages for *key* as a string,
        or None when the search returned nothing / the count is missing."""
        html = self.get_data(key)
        if html is None:
            return None
        page_text = ''.join(html.xpath('/html/body/div/div/form/div[2]/div/span[2]/text()'))
        # The page count appears as "共x页"; whitespace may be interleaved,
        # so strip it before pulling out the digits.
        match = re.search(r"共\d+页", page_text)
        if not match:
            return None
        cleaned_text = re.sub(r"\s", "", match.group())
        return re.search(r"\d+", cleaned_text).group()

    def spider(self, key):
        """Crawl all result pages for *key*, newest first, stopping as soon
        as a link stored by a previous run is encountered (incremental crawl).
        """
        pages = self.get_page(key)
        if pages is None:
            return
        self.overall_cycle = False
        last_page_key = f"cntcitc:last_link:{key}"
        # Newest link stored by the previous run ("" on the first ever run).
        last_page_link = str(self.redis.get(last_page_key) or "")
        # First (i.e. newest) link seen in THIS run; becomes the new marker.
        newest_link = ""
        try:
            for page in range(1, int(pages) + 1):
                if self.overall_cycle:
                    break
                html = self.get_data(key, page)
                if html is None:
                    continue
                # Result list holds at most 15 <li> entries per page.
                for i in range(1, 16):
                    title = ''.join(
                        html.xpath(f'/html/body/div/div/form/div[2]/ul/li[{i}]/a/text()')).strip()
                    if title == "":
                        break
                    suffix_link = ''.join(
                        html.xpath(f'/html/body/div/div/form/div[2]/ul/li[{i}]/a/@href'))
                    link = f"https://www.cntcitc.com.cn/{suffix_link}"
                    if last_page_link == link:
                        # Reached the newest item of the previous run:
                        # everything below is already stored.
                        self.overall_cycle = True
                        break
                    publish_time_text = ''.join(
                        html.xpath(f'/html/body/div/div/form/div[2]/ul/li[{i}]/text()'))
                    # Publish date is embedded in free text as YYYY-MM-DD.
                    match = re.search(r'\d{4}-\d{2}-\d{2}', publish_time_text)
                    publish_time = match.group() if match else ""
                    self.store_to_redis(link, title, publish_time, key)
                    if newest_link == "":
                        newest_link = link
            # BUG FIX: previously the marker was only written when it was
            # empty, so it never advanced after the first run and every later
            # run re-crawled all items since then. Advance it to this run's
            # newest link once the crawl finished without raising.
            if newest_link:
                self.redis.set(last_page_key, newest_link)
        except Exception as e:
            print(f"中国国际招标有限公司爬虫出现异常: {e}")
            # Reset the marker so the next run does a full re-crawl rather
            # than trusting a possibly half-complete incremental state.
            self.redis.set(last_page_key, "")

    def store_to_redis(self, link, title, show_times, key):
        """Store (or update) a notice hash keyed by its *link*.

        New links get a full hash with a 28-day TTL; known links just gain
        *key* in their comma-separated `keys` field and are flagged for
        re-sync. The daily counter is incremented either way.
        """
        if self.redis.exists(link):
            # Guard against a hash missing its `keys` field (hget -> None).
            existing_keys = (self.redis.hget(link, 'keys') or '').split(',')
            if key not in existing_keys:
                existing_keys.append(key)
                self.redis.hset(link, 'keys', ','.join(existing_keys))
                self.redis.hset(link, 'is_synced', 'false')
        else:
            self.redis.hset(link, mapping={
                'title': title,
                'show_times': show_times,
                'keys': key,
                'is_synced': 'false'
            })
            # Expire after 28 days (2419200 seconds).
            self.redis.expire(link, 2419200)
        self.redis.incr(self.counter_key)

    def get_today_crawl_count(self):
        """Return today's crawl counter as an int (0 when absent)."""
        return int(self.redis.get(self.counter_key) or 0)

    def process(self):
        """Run the crawler for every configured keyword and report the
        number of items touched today."""
        key_list = ['动漫', '引流', '银行', '业务']
        for key in key_list:
            self.spider(key)
        print(f'中国国际招标有限公司的爬取数据数量为:{self.get_today_crawl_count()}')


if __name__ == '__main__':
    # Script entry point: build the crawler and run a full keyword sweep.
    crawler = Cntcitc()
    crawler.process()
相关推荐
孟健7 小时前
Karpathy 用 200 行纯 Python 从零实现 GPT:代码逐行解析
python
码路飞9 小时前
写了个 AI 聊天页面,被 5 种流式格式折腾了一整天 😭
javascript·python
曲幽12 小时前
FastAPI压力测试实战:Locust模拟真实用户并发及优化建议
python·fastapi·web·locust·asyncio·test·uvicorn·workers
敏编程16 小时前
一天一个Python库:jsonschema - JSON 数据验证利器
python
前端付豪16 小时前
LangChain记忆:通过Memory记住上次的对话细节
人工智能·python·langchain
databook16 小时前
ManimCE v0.20.1 发布:LaTeX 渲染修复与动画稳定性提升
python·动效
花酒锄作田1 天前
使用 pkgutil 实现动态插件系统
python
前端付豪1 天前
LangChain链 写一篇完美推文?用SequencialChain链接不同的组件
人工智能·python·langchain
曲幽1 天前
FastAPI实战:打造本地文生图接口,ollama+diffusers让AI绘画更听话
python·fastapi·web·cors·diffusers·lcm·ollama·dreamshaper8·txt2img
老赵全栈实战1 天前
Pydantic配置管理最佳实践(一)
python