创建一个异步爬虫并将数据存入 Excel

示例网站链接: https://xk.scjgj.sh.gov.cn/xzxk_wbjg/#/tzdwSYDJList

一.数据获取流程

1️⃣对列表页请求获取有关详情页的字段值

2️⃣构造详情页的URL获取详情页数据

3️⃣将数据存入 Excel

二.异步代码

Python 代码如下:
import asyncio
import logging
import pandas as pd
from aiohttp import ClientSession
from httpx._urlparse import quote
from openpyxl import Workbook
from openpyxl.utils.dataframe import dataframe_to_rows
import ssl

# Logging setup: the root logger emits INFO and above.
logging.basicConfig(level=logging.INFO)

# Endpoints and request headers.
# url_template  -> list endpoint, POSTed with a paging payload by process_page.
# url_template2 -> detail endpoint; its two placeholders are filled with an
#                  entry's applyId and its URL-quoted licUniqueId.
url_template = 'https://xk.scjgj.sh.gov.cn/xzxk_wbjg/query/public/sydjQueryDeviceEtInfo'
url_template2 = 'https://xk.scjgj.sh.gov.cn/xzxk_wbjg/query/public/useLicInfo/{}/{}'
headers = {
    'Accept': 'application/json, text/plain, */*',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cache-Control': 'no-cache',
    'Connection': 'keep-alive',
    'Content-Type': 'application/json;charset=UTF-8',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36',
}

# Module-level DataFrame that accumulates the scraped rows; process_page
# rebinds it with every batch of rows it appends.
columns = ['applyId', '设备名称', '设备种类', '注册代码', '证书编号', '维保单位', '制造单位', '使用单位', '产品编号',
           '单位内编号', '使用单位地址', '发证日期', '登记机关', '设备类别']
df = pd.DataFrame(columns=columns)

# Workbook (and its active sheet) that save_data writes the rows into.
wb = Workbook()
ws = wb.active



# Lock taken by save_data so that only one save runs at a time.
save_lock = asyncio.Lock()

# Maximum number of requests allowed in flight at the same time.
concurrency_limit = 8  # consumed by the semaphore created below

# Semaphore that fetch acquires around every request to enforce the limit.
semaphore = asyncio.Semaphore(concurrency_limit)

# SSL context with hostname checking and certificate verification turned off.
# NOTE(review): this accepts any server certificate — confirm that is
# acceptable for the target site before reusing this code elsewhere.
context = ssl.create_default_context()
context.check_hostname = False
context.verify_mode = ssl.CERT_NONE


# Coroutine that sends one HTTP request and decodes the JSON reply.
async def fetch(session, url, data):
    """POST ``data`` as JSON to ``url`` and return the decoded JSON body.

    The module-level ``semaphore`` is held for the duration of the request,
    so at most ``concurrency_limit`` requests are in flight at once.

    Args:
        session: Open ``aiohttp.ClientSession`` used to send the request.
        url: Absolute URL to POST to.
        data: JSON-serialisable request payload.

    Returns:
        The decoded JSON document, or ``None`` when the response is not
        declared as ``application/json`` or when the request or the JSON
        decoding fails. Failures are logged, never raised, so that a single
        bad request cannot abort the ``asyncio.gather`` it is part of.
    """
    # Imported locally: the module only imports ClientSession by name.
    from aiohttp import ClientError

    async with semaphore:  # cap the number of requests in flight
        logging.warning(f"send URL: {url}")
        try:
            async with session.post(url, json=data, headers=headers, ssl=context) as response:
                content_type = response.headers.get('Content-Type', '')
                if 'application/json' not in content_type:
                    logging.warning(f"Unexpected Content-Type: {content_type} for URL: {url}")
                    return None
                return await response.json()
        except (ClientError, asyncio.TimeoutError, ValueError) as exc:
            # ClientError covers connection/response errors, TimeoutError the
            # session timeout, ValueError a body that is not valid JSON.
            logging.warning(f"Request failed for URL: {url}: {exc!r}")
            return None


# Coroutine that scrapes one list page together with the detail record of
# every entry on it, plus the helper that shapes a single output row.
def _build_row(item, detail):
    """Merge a list-page entry with its detail response into one row dict.

    Args:
        item: One element of the list response's ``resultList``.
        detail: Decoded detail response for that entry, or ``None`` when the
            detail request failed.

    Returns:
        A dict keyed by the output column names. The six detail-derived
        columns are ``None`` when ``detail`` is missing or carries no
        ``data`` object.
    """
    extra = detail.get('data') if isinstance(detail, dict) else None
    if not isinstance(extra, dict):
        extra = {}
    return {
        'applyId': item['applyId'],
        '设备名称': item['devName'],
        '设备种类': item['devSuperclass'],
        '注册代码': item['deviceCode'],
        '证书编号': item['useLicNo'],
        '维保单位': item['maintainComName'],
        '制造单位': item['makeComName'],
        '使用单位': item['useComName'],
        '产品编号': extra.get('productCode'),
        '单位内编号': extra.get('innerCode'),
        '使用单位地址': extra.get('usePlace'),
        '发证日期': extra.get('qfsj'),
        '登记机关': extra.get('fzjgmc'),
        '设备类别': extra.get('devSubclass'),
    }


async def process_page(page):
    """Fetch list page ``page`` and append one row per entry to the global ``df``.

    The list endpoint is queried first; for every entry in its
    ``data.resultList`` the detail endpoint is then queried concurrently.
    Entries whose detail request fails are still recorded, with the
    detail-derived columns left empty.

    Args:
        page: Page number sent in the list request payload.

    Returns:
        None. Rows are appended to the module-level ``df``.
    """
    global df
    logging.info(f"start processing page {page}.")
    data = {"rows": 50, "zszl": "00206", "page": page}
    async with ClientSession() as session:
        res1 = await fetch(session, url_template, data)
        if res1 is None:
            logging.warning(f"Failed to fetch initial data for page {page}.")
            return

        # An error payload may lack ``data`` or ``resultList``; treat that as
        # an empty page instead of letting a KeyError/TypeError escape.
        payload = res1.get('data') if isinstance(res1, dict) else None
        result_list = payload.get('resultList') if isinstance(payload, dict) else None
        if not result_list:
            logging.warning(f"No result list in response for page {page}.")
            return

        # Build every URL before creating any coroutine, so that a malformed
        # entry cannot leave already-created coroutines un-awaited.
        detail_urls = [
            url_template2.format(item['applyId'], quote(item['licUniqueId']))
            for item in result_list
        ]
        details = await asyncio.gather(*(fetch(session, url2, {}) for url2 in detail_urls))

        records = []
        for detail, item in zip(details, result_list):
            if detail is None:
                logging.warning(f"Failed to fetch secondary data for applyId {item['applyId']}.inpage:{page}")
            records.append(_build_row(item, detail))

        # One concat per page rather than one per row: every concat copies the
        # whole accumulated frame.
        df = pd.concat([df, pd.DataFrame(records)], ignore_index=True)
        logging.info(f"Finished processing page {page}.")
        


# Coroutine that appends the collected rows to the worksheet and saves the file.
async def save_data(df):
    """Append the rows of ``df`` to the worksheet, save the workbook, clear ``df``.

    Rows are appended below whatever the sheet already holds, so the function
    can be called repeatedly for incremental saves. The header row is written
    only on the first non-empty save. ``save_lock`` serialises concurrent
    callers; nothing is awaited while the lock is held, so a save cannot
    block on other tasks.

    Args:
        df: DataFrame whose rows are written. It is emptied in place after a
            successful save; an empty frame only re-saves the workbook.

    Returns:
        None. Failures are logged with a traceback and not re-raised, and
        ``df`` keeps its rows in that case.
    """
    async with save_lock:
        try:
            logging.info("Saving data.")

            if not df.empty:
                # A sheet that has never been written to reports max_row == 1,
                # and every non-empty save leaves at least two rows behind.
                write_header = ws.max_row == 1
                for row in dataframe_to_rows(df, index=False, header=write_header):
                    ws.append(row)

            wb.save('./yb.xlsx')

            # Drop the rows just written so the next save does not repeat them.
            df.drop(df.index, inplace=True)

            logging.info("Data saved.")
        except Exception as e:
            logging.exception(f"Failed to save data: {e}")

# Async entry point: crawl the requested pages, then persist what was collected.
async def main(pages=None):
    """Crawl every page in ``pages`` and save the collected rows to Excel.

    Args:
        pages: Iterable of page numbers to crawl. Defaults to
            ``range(8000, 9000)``.

    Returns:
        None. A page that raises is logged and skipped, so the rows gathered
        from the remaining pages are still saved.
    """
    logging.warning("start")
    pages = list(range(8000, 9000) if pages is None else pages)
    tasks = [process_page(page) for page in pages]
    logging.warning("listtaskend")

    # return_exceptions=True keeps one failing page from discarding the data
    # already collected by all the others.
    outcomes = await asyncio.gather(*tasks, return_exceptions=True)
    for page, outcome in zip(pages, outcomes):
        if isinstance(outcome, BaseException):
            logging.error(f"Page {page} failed: {outcome!r}")

    # Persist whatever was accumulated in the module-level DataFrame.
    # save_data takes and releases save_lock itself, so the lock must not be
    # released again here.
    if not df.empty:
        await save_data(df)


# Run the async entry point when the file is executed as a script.
if __name__ == '__main__':
    asyncio.run(main())
相关推荐
铁盒薄荷糖1 小时前
【Pytorch】Pytorch的安装
人工智能·pytorch·python
yyfhq1 小时前
rescorediff
python·深度学习·机器学习
糊涂君-Q1 小时前
Python小白学习教程从入门到入坑------第十九课 异常模块与包【下】(语法基础)
开发语言·python·学习·程序人生·改行学it
API199701081102 小时前
京东平台接口技术详解及示例代码
开发语言·前端·python
(●'◡'●)知2 小时前
基于树莓派的安保巡逻机器人--(一、快速人脸录入与精准人脸识别)
人工智能·python·opencv·机器学习·计算机视觉
秦朝胖子得加钱2 小时前
Flask
后端·python·flask
幽兰的天空2 小时前
Python实现的简单时钟
开发语言·python
NCU_AI3 小时前
Python 网络爬虫快速入门
python·网络爬虫
幽兰的天空3 小时前
简单的Python爬虫实例
开发语言·爬虫·python
IT·小灰灰4 小时前
Python——自动化发送邮件
运维·网络·后端·python·自动化