Python 爬虫:爬取中国国际招标有限公司的招标信息

爬取到的数据存储到 Redis 中

python 复制代码
#  -*- coding: utf-8 -*-
# 中国国际招标有限公司
import re
from datetime import datetime

import redis
import requests
from lxml import etree
from config import REDIS_IP, REDIS_PORT, REDIS_DB, REDIS_PASSWORD
from items.sql import MySQL


class Cntcitc:
    """Incremental crawler for tender notices published by 中国国际招标有限公司
    (cntcitc.com.cn).

    For each search keyword the crawler posts to the site's search endpoint,
    walks the paginated results, and upserts one Redis hash per notice URL.
    A per-keyword "last seen link" marker stored in Redis lets subsequent
    runs stop as soon as they reach already-crawled content.
    """

    def __init__(self):
        # decode_responses=True makes redis-py return str instead of bytes.
        # (The deprecated `charset` kwarg was dropped; `encoding` covers it.)
        self.redis = redis.Redis(host=REDIS_IP, port=REDIS_PORT, db=REDIS_DB, password=REDIS_PASSWORD,
                                 decode_responses=True, encoding='UTF-8')
        self.db = MySQL()
        self.db.connect()
        self.name = '中国国际招标有限公司'
        self.url = 'https://www.cntcitc.com.cn/searchPage.html'
        self.api_url = 'https://www.cntcitc.com.cn/search.html'
        self.today = datetime.today().strftime('%Y-%m-%d')
        # Per-day counter key, e.g. "cntcitc:counter:2024-06-18".
        self.counter_key = f"cntcitc:counter:{self.today}"
        # Flipped to True when an already-seen link is reached; stops pagination.
        self.overall_cycle = False
        self.headers = {
            "referer": self.url,
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36",
        }

    def get_data(self, key, page=1):
        """POST one search request and return the parsed lxml tree.

        Returns None when the site reports no matching content.
        """
        payload = {
            'channelId': '-1',
            'key': key,
            'startTime': "2024-06-18",
            'endTime': '',
            'currentPage': page
        }
        # timeout prevents a dead server from hanging the crawler forever
        response = requests.post(url=self.api_url, headers=self.headers, data=payload, timeout=30)
        html = etree.HTML(response.content.decode('utf8'))
        content = ''.join(html.xpath('/html/body/div/div/form/div[2]/ul/text()')).strip()
        print(f"key:{key},爬取内容:{content}")
        if content == "未查询到相关内容":
            return None
        return html

    @staticmethod
    def _parse_total_pages(page_text):
        """Extract the total page count from text like '共12页'; None if absent."""
        match = re.search(r"共\d+页", page_text)
        if not match:
            return None
        # Strip any whitespace inside the matched fragment, then pull the digits.
        cleaned_text = re.sub(r"\s", "", match.group())
        return re.search(r"\d+", cleaned_text).group()

    @staticmethod
    def _parse_publish_date(text):
        """Return the first YYYY-MM-DD date found in *text*, or '' if none."""
        match = re.search(r'\d{4}-\d{2}-\d{2}', text)
        return match.group() if match else ""

    def get_page(self, key):
        """Return the total number of result pages for *key* (as a string), or None."""
        html = self.get_data(key)
        if html is None:
            return None
        page_text = ''.join(html.xpath('/html/body/div/div/form/div[2]/div/span[2]/text()'))
        return self._parse_total_pages(page_text)

    def spider(self, key):
        """Crawl all result pages for *key*, stopping at the last-seen link."""
        pages = self.get_page(key)
        if pages is None:
            return
        self.overall_cycle = False
        last_page_key = f"cntcitc:last_link:{key}"
        last_page_link = str(self.redis.get(last_page_key) or "")
        # Newest link seen in this run; becomes the stop marker for the next run.
        newest_link = ""
        try:
            for page in range(1, int(pages) + 1):
                if self.overall_cycle:
                    break
                html = self.get_data(key, page)
                if html is None:
                    continue
                for i in range(1, 16):  # at most 15 entries per result page
                    title = ''.join(
                        html.xpath(f'/html/body/div/div/form/div[2]/ul/li[{i}]/a/text()')).strip()
                    if title == "":
                        break
                    suffix_link = ''.join(
                        html.xpath(f'/html/body/div/div/form/div[2]/ul/li[{i}]/a/@href'))
                    link = f"https://www.cntcitc.com.cn/{suffix_link}"
                    if not newest_link:
                        newest_link = link
                    if last_page_link == link:
                        self.overall_cycle = True
                        break
                    publish_time = self._parse_publish_date(''.join(
                        html.xpath(f'/html/body/div/div/form/div[2]/ul/li[{i}]/text()')))
                    self.store_to_redis(link, title, publish_time, key)
            # BUGFIX: the marker used to be written only when it was empty, so
            # every later run re-crawled everything since the very first run.
            # Always advance it to the newest link seen in this run.
            if newest_link:
                self.redis.set(last_page_key, newest_link)
        except Exception as e:
            print(f"中国国际招标有限公司爬虫出现异常: {e}")
            # Reset the marker so the next run performs a full re-crawl.
            self.redis.set(last_page_key, "")

    def store_to_redis(self, link, title, show_times, key):
        """Upsert a notice hash keyed by its URL and record which keywords hit it."""
        if self.redis.exists(link):
            # Defensive: hget returns None if the field is missing; drop empties.
            existing_keys = [k for k in (self.redis.hget(link, 'keys') or '').split(',') if k]
            if key not in existing_keys:
                existing_keys.append(key)
                self.redis.hset(link, 'keys', ','.join(existing_keys))
                # Keyword list changed, so flag the record for re-sync.
                self.redis.hset(link, 'is_synced', 'false')
        else:
            self.redis.hset(link, mapping={
                'title': title,
                'show_times': show_times,
                'keys': key,
                'is_synced': 'false'
            })
            # Expire after 28 days (2419200 seconds).
            self.redis.expire(link, 2419200)
        # NOTE: counts every processed link, including already-stored ones.
        self.redis.incr(self.counter_key)

    def get_today_crawl_count(self):
        """Return today's crawl counter (0 if the key does not exist yet)."""
        return int(self.redis.get(self.counter_key) or 0)

    def process(self):
        """Run the crawler over the configured keyword list and report the count."""
        key_list = ['动漫', '引流', '银行', '业务']
        for key in key_list:
            self.spider(key)
        print(f'中国国际招标有限公司的爬取数据数量为:{self.get_today_crawl_count()}')


if __name__ == '__main__':
    # Entry point: build the crawler and run a full keyword sweep.
    crawler = Cntcitc()
    crawler.process()
相关推荐
Swift社区2 小时前
在 Swift 中实现字符串分割问题:以字典中的单词构造句子
开发语言·ios·swift
没头脑的ht2 小时前
Swift内存访问冲突
开发语言·ios·swift
没头脑的ht2 小时前
Swift闭包的本质
开发语言·ios·swift
wjs20242 小时前
Swift 数组
开发语言
stm 学习ing3 小时前
FPGA 第十讲 避免latch的产生
c语言·开发语言·单片机·嵌入式硬件·fpga开发·fpga
湫ccc4 小时前
《Python基础》之字符串格式化输出
开发语言·python
mqiqe4 小时前
Python MySQL通过Binlog 获取变更记录 恢复数据
开发语言·python·mysql
AttackingLin5 小时前
2024强网杯--babyheap house of apple2解法
linux·开发语言·python