# -*- coding: utf-8 -*-
# 中国国际招标有限公司 (China International Tendering Co., Ltd.) — incremental notice crawler
import re
from datetime import datetime
import redis
import requests
from lxml import etree
from config import REDIS_IP, REDIS_PORT, REDIS_DB, REDIS_PASSWORD
from items.sql import MySQL
class Cntcitc:
    """Incremental keyword crawler for notices published on cntcitc.com.cn.

    Crawled items are cached in Redis, keyed by article link, and a per-day
    counter tracks how many new items were stored today.  A per-keyword
    "checkpoint" link remembers the newest article seen on the previous run,
    so later runs stop as soon as they reach already-crawled content.
    """

    def __init__(self):
        # decode_responses=True makes every Redis read return str, not bytes.
        # (The deprecated `charset` alias was dropped; `encoding` suffices.)
        self.redis = redis.Redis(host=REDIS_IP, port=REDIS_PORT, db=REDIS_DB,
                                 password=REDIS_PASSWORD,
                                 decode_responses=True, encoding='UTF-8')
        self.db = MySQL()
        self.db.connect()
        self.name = '中国国际招标有限公司'
        self.url = 'https://www.cntcitc.com.cn/searchPage.html'
        self.api_url = 'https://www.cntcitc.com.cn/search.html'
        self.today = datetime.today().strftime('%Y-%m-%d')
        self.counter_key = f"cntcitc:counter:{self.today}"
        # Flipped to True once the stored checkpoint link is reached in a run.
        self.overall_cycle = False
        self.headers = {
            "referer": self.url,
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36",
        }

    def get_data(self, key, page=1):
        """POST a search for `key` (result page `page`) and return parsed HTML.

        Returns None when the site reports "未查询到相关内容" (no matches).
        """
        payload = {
            'channelId': '-1',
            'key': key,
            'startTime': "2024-06-18",
            'endTime': '',
            'currentPage': page
        }
        # timeout added so a stalled connection cannot hang the crawler forever
        con = requests.post(url=self.api_url, headers=self.headers,
                            data=payload, timeout=30).content.decode('utf8')
        html = etree.HTML(con)
        content_text = ''.join(html.xpath('/html/body/div/div/form/div[2]/ul/text()'))
        content = content_text.strip()
        print(f"key:{key},爬取内容:{content}")
        if content == "未查询到相关内容":
            return None
        return html

    @staticmethod
    def _extract_page_count(page_text):
        """Extract the page count from a '共x页' fragment, tolerating spaces.

        Returns the count as a string (the rest of the code keeps it as str
        until `int()` is needed), or None when no such fragment exists.
        The original pattern `共\\d+页` could never match text containing
        whitespace, which made its follow-up whitespace-stripping dead code;
        `共\\s*(\\d+)\\s*页` handles both spaced and unspaced forms directly.
        """
        match = re.search(r"共\s*(\d+)\s*页", page_text)
        return match.group(1) if match else None

    def get_page(self, key):
        """Return the total number of result pages for `key` (str), or None."""
        html = self.get_data(key)
        if html is None:
            return None
        page_text = ''.join(html.xpath('/html/body/div/div/form/div[2]/div/span[2]/text()'))
        return self._extract_page_count(page_text)

    @staticmethod
    def _extract_publish_date(text):
        """Return the first YYYY-MM-DD date found in `text`, or '' if none."""
        match = re.search(r'\d{4}-\d{2}-\d{2}', text)
        return match.group() if match else ""

    def spider(self, key):
        """Crawl result pages for `key`, stopping at the stored checkpoint.

        Fix vs. the original: the checkpoint link loaded from Redis is kept
        in its own variable for the whole run (the original reassigned the
        comparison variable to the current link, so the stop condition could
        degenerate into "two consecutive identical links"), and the stored
        checkpoint is advanced to this run's newest link after a successful
        crawl (the original only ever wrote it on the very first run, so the
        incremental stop went stale).
        """
        pages = self.get_page(key)
        if pages is None:
            return
        self.overall_cycle = False
        last_page_key = f"cntcitc:last_link:{key}"
        checkpoint_link = str(self.redis.get(last_page_key) or "")
        newest_link = ""  # first link stored this run == most recent article
        try:
            for page in range(1, int(pages) + 1):
                if self.overall_cycle:
                    break
                html = self.get_data(key, page)
                if html is None:
                    continue
                # each result page lists at most 15 entries
                for i in range(1, 16):
                    title = ''.join(
                        html.xpath(f'/html/body/div/div/form/div[2]/ul/li[{i}]/a/text()')).strip()
                    if title == "":
                        break
                    suffix_link = ''.join(
                        html.xpath(f'/html/body/div/div/form/div[2]/ul/li[{i}]/a/@href'))
                    link = f"https://www.cntcitc.com.cn/{suffix_link}"
                    if checkpoint_link == link:
                        # reached an article crawled on a previous run: stop
                        self.overall_cycle = True
                        break
                    publish_time_text = ''.join(
                        html.xpath(f'/html/body/div/div/form/div[2]/ul/li[{i}]/text()'))
                    publish_time = self._extract_publish_date(publish_time_text)
                    self.store_to_redis(link, title, publish_time, key)
                    if newest_link == "":
                        newest_link = link
            # advance the checkpoint so the next run stops at this run's newest item
            if newest_link:
                self.redis.set(last_page_key, newest_link)
        except Exception as e:
            # preserve original behaviour: log, then reset the checkpoint so
            # the next run re-crawls from scratch instead of trusting a
            # possibly inconsistent one
            print(f"中国国际招标有限公司爬虫出现异常: {e}")
            self.redis.set(last_page_key, "")

    def store_to_redis(self, link, title, show_times, key):
        """Upsert one article hash in Redis under its `link`.

        An existing entry gets `key` appended to its comma-separated keyword
        list and is flagged unsynced; a new entry is created with a 28-day
        TTL and bumps today's crawl counter.
        """
        if self.redis.exists(link):
            existing_keys = self.redis.hget(link, 'keys').split(',')
            if key not in existing_keys:
                existing_keys.append(key)
                self.redis.hset(link, 'keys', ','.join(existing_keys))
                # mark for re-sync since the keyword set changed
                self.redis.hset(link, 'is_synced', 'false')
        else:
            self.redis.hset(link, mapping={
                'title': title,
                'show_times': show_times,
                'keys': key,
                'is_synced': 'false'
            })
            # expire after 28 days (2419200 seconds)
            self.redis.expire(link, 2419200)
            self.redis.incr(self.counter_key)

    def get_today_crawl_count(self):
        """Return today's stored-item count (0 when the counter is unset)."""
        return int(self.redis.get(self.counter_key) or 0)

    def process(self):
        """Run the crawl for every configured keyword, then print the total."""
        key_list = ['动漫', '引流', '银行', '业务']
        for key in key_list:
            self.spider(key)
        print(f'中国国际招标有限公司的爬取数据数量为:{self.get_today_crawl_count()}')
if __name__ == '__main__':
    # Entry point: build the crawler and run the full keyword sweep.
    crawler = Cntcitc()
    crawler.process()