python第一个多进程爬虫

使用 multiprocessing 模块实现多进程爬取股票网址买卖数据的基本思路是:

  1. 定义爬虫函数,用于从一个或多个股票网址上抓取数据。
  2. 创建多个进程,每个进程执行爬虫函数,可能针对不同的股票或不同的网页。
  3. 使用 multiprocessing.Queue 或 multiprocessing.Manager() 管理共享数据结构,以便进程间可以共享爬取的数据。

以下是一个简化的示例,展示如何使用 multiprocessing 模块和 requests 库来实现多进程爬取股票数据:

python 复制代码
# encoding:utf-8
import sys,os,copy,time,traceback,copy
import multiprocessing
# from queue import Queue
import pandas as pd
from loguru import logger
sys.path.append("..")
from QhSetting import QHJSPATH
from QhSpiderObj import QhDFSpider
from QhCsvMode import QHDFDBJSON,QhPdCsvUnique
from QhSpiderTool import QhDbPathJieXiIsMkdir,QhDfDateSort,QhSouHuJiaoYiDate,QhNotNaNdf,\
                        QhDfWeiYiZhi,QhGetTimes
from QhSpiderTool import QhStarEndTime 
from QhInterFace import _QhDfMaiMAIDetails,_QhDBToCsv

def worker(num):
    """Print a one-line marker identifying the worker numbered *num*."""
    message = 'Worker: {}'.format(num)
    print(message)

# @QhStarEndTime
def QhDfMaiMAIDetailsForM(QhCodeList,QhQueue,QhIsCsv=False):
    """
    Author: 阙辉 (Que Hui)
    Purpose: fetch the daily buy/sell details for a batch of stocks.

    For every row of ``QhCodeList`` the function builds a ``market.code``
    security id, calls ``_QhDfMaiMAIDetails`` with it, stamps the returned
    frame with the trading date (column ``交易日期01``), puts the frame on
    ``QhQueue`` and passes it to ``_QhDBToCsv`` together with a per-stock
    CSV path.  A row that raises is logged and skipped; whatever has been
    collected so far is then passed to ``_QhDBToCsv`` with the aggregate
    CSV path so partial progress is not lost.

    Args:
        QhCodeList: iterable of rows where ``row[0]`` is the stock code
            (every "Q" character is stripped before building the id) and
            ``row[1]`` is the market code.
        QhQueue: any object exposing ``put()``; receives one frame per
            successfully fetched stock.
        QhIsCsv: forwarded unchanged as ``QhIsCsv`` to the final aggregate
            ``_QhDBToCsv`` call.

    Returns:
        All successfully fetched frames concatenated in fetch order, or an
        empty ``DataFrame`` when no row could be fetched (in that case the
        aggregate file is not written).
    """
    QhConfig = QHDFDBJSON["_QhDfMaiMAIDetails"]
    QhUniqueValue = QhConfig["QhUniqueValue"]
    # Index 2 is the 'YYYY-MM-DD' form of ('YYYY','YYYY-MM','YYYY-MM-DD').
    QhJiaoYiDateD = QhSouHuJiaoYiDate()[2]
    QhCsvPath = QhConfig["QhCsvPath"]
    QhCsvName = QhConfig["QhCsvName"].format(QhJiaoYiDateD)
    QhCsvPathF0 = QhConfig["QhCsvPathF"]
    QhCsvNameF0 = QhConfig["QhCsvNameF"]
    QhCsvPath = QhDbPathJieXiIsMkdir(QhCsvPath,QHJSPATH)
    # "\\" yields the same single backslash the original "\{" produced,
    # without relying on an invalid escape sequence.
    QhCsvPath = "{}\\{}".format(QhCsvPath,QhCsvName)
    print(QhCsvPath)

    # Frames fetched so far.  Starting from an empty list (instead of a
    # variable first bound inside the loop) keeps the error path and the
    # final write safe when the first row fails or the input is empty.
    QhFrames = []
    for QhRow in QhCodeList:
        try:
            QhCode01 = QhRow[0]
            QhShiChang = QhRow[1]
            QhCsvPathF = copy.deepcopy(QhCsvPathF0)
            QhCsvNameF = QhCsvNameF0.format(QhCode01)
            QhCsvPathF = QhDbPathJieXiIsMkdir(QhCsvPathF,QHJSPATH)
            QhCsvPathF = "{}\\{}".format(QhCsvPathF,QhCsvNameF)
            secid = "{}.{}".format(QhShiChang,QhCode01.replace("Q",""))
            QhJieGuoRowDf = _QhDfMaiMAIDetails(QhSecid=secid)
            QhJieGuoRowDf["交易日期01"] = QhJiaoYiDateD
            QhQueue.put(QhJieGuoRowDf)
            print(QhJieGuoRowDf)
            # Keep an independent copy so the aggregate is unaffected by
            # anything done to the row frame afterwards.
            QhFrames.append(QhJieGuoRowDf.copy(deep=True))
            _QhDBToCsv(QhCsvPathF,QhUniqueValue,QhJieGuoRowDf,QhDateSort="",QhIsCsv=True)
        except Exception:
            # format_exc() returns the traceback text; print_exc() returns
            # None, which made the logged message useless.
            QhErrMsg = traceback.format_exc()
            logger.error("【买卖竞价数据】获取失败,报错消息\n{QhErrMsg}!QueHui!".format(QhErrMsg=QhErrMsg))
            # Checkpoint what has been collected so far, if anything.
            if QhFrames:
                _QhDBToCsv(QhCsvPath,QhUniqueValue,pd.concat(QhFrames),QhDateSort="",QhIsCsv=True)

    if not QhFrames:
        logger.warning("【买卖竞价数据】未获取到任何数据,未写入汇总文件!QueHui!")
        return pd.DataFrame()

    # Final aggregate write.
    QhJieGuoDfNew = pd.concat(QhFrames)
    _QhDBToCsv(QhCsvPath,QhUniqueValue,QhJieGuoDfNew,QhDateSort="",QhIsCsv=QhIsCsv)
    return QhJieGuoDfNew
if __name__ == '__main__':
    # Script entry point: read the master stock list, split it into
    # fixed-size groups and crawl each group in its own process.

    # Minimal demo kept for reference: start five `worker` processes and
    # wait for them.
    # processes = []
    # for i in range(5):
    #     p = multiprocessing.Process(target=worker, args=(i,))
    #     processes.append(p)
    #     p.start()
    #
    # for process in processes:
    #     process.join()

    QhCsvPath = QHDFDBJSON["QhDfAllStock"]["QhCsvPath"]
    QhCsvPath = QhDbPathJieXiIsMkdir(QhCsvPath,QHJSPATH)
    QhCsvName = QHDFDBJSON["QhDfAllStock"]["QhCsvName"]
    # "\\" yields the same single backslash the original "\{" produced,
    # without relying on an invalid escape sequence.
    QhCsvPath = "{}\\{}".format(QhCsvPath,QhCsvName)
    QhOldCsvDf = pd.read_csv(QhCsvPath,encoding='gbk')
    # Index by stock code while keeping the column itself.
    QhOldCsvDf.set_index('股票代码',drop=False,inplace=True)
    # Keep only the listed boards and cap the run at the first 500 stocks.
    QhOldCsvDf = QhOldCsvDf[QhOldCsvDf["交易板块"].isin(["上证A股","深证A股","北证A股","科创板","创业板"])][:500]

    # One [stock_code, market_code] pair per stock, in file order.
    QhOldCsvList = [
        [QhCode, QhShiChang]
        for QhCode, QhShiChang in zip(QhOldCsvDf["股票代码"], QhOldCsvDf["市场代码"])
    ]

    qh_group_count = 100   # stocks handled by each process
    processes = []
    QhTotalTimes = QhGetTimes(len(QhOldCsvList),qh_group_count = qh_group_count)
    # The context manager shuts the manager process down once every worker
    # has been joined.
    with multiprocessing.Manager() as QhManager:
        # No maxsize is passed, so this shared queue is unbounded.
        QhQueue = QhManager.Queue()
        for QhRow in range(1,QhTotalTimes+1):
            # Group number QhRow covers [(QhRow-1)*size, QhRow*size).
            QhPa = QhOldCsvList[(QhRow-1)*qh_group_count:QhRow*qh_group_count]
            print(QhPa)

            p = multiprocessing.Process(target=QhDfMaiMAIDetailsForM, args=(QhPa,QhQueue,False))
            processes.append(p)
            p.start()  # start the process

        for process in processes:
            process.join()  # wait for the process to finish
        # NOTE(review): the frames the workers put on QhQueue are never read
        # here; call QhQueue.get() inside this block if the parent process
        # needs them.
相关推荐
老赵的博客11 分钟前
c++ 杂记
开发语言·c++
jimmy.hua13 分钟前
[C++刷怪笼]:set/map--优质且易操作的容器
开发语言·c++
XiaoMu_00124 分钟前
基于Python+Streamlit的旅游数据分析与预测系统:从数据可视化到机器学习预测的完整实现
python·信息可视化·旅游
THMAIL26 分钟前
深度学习从入门到精通 - 生成对抗网络(GAN)实战:创造逼真图像的魔法艺术
人工智能·python·深度学习·神经网络·机器学习·生成对抗网络·cnn
w2sfot1 小时前
Passing Arguments as an Object in JavaScript
开发语言·javascript·ecmascript
郝学胜-神的一滴2 小时前
避免使用非const全局变量:C++中的最佳实践 (C++ Core Guidelines)
开发语言·c++·程序人生
我没想到原来他们都是一堆坏人2 小时前
(未完待续...)如何编写一个用于构建python web项目镜像的dockerfile文件
java·前端·python
搞一搞汽车电子2 小时前
S32K3平台eMIOS 应用说明
开发语言·驱动开发·笔记·单片机·嵌入式硬件·汽车
总有刁民想爱朕ha2 小时前
车牌模拟生成器:Python3.8+Opencv代码实现与商业应用前景(C#、python 开发包SDK)
开发语言·python·数据挖掘
小菜全3 小时前
uniapp新增页面及跳转配置方法
开发语言·前端·javascript·vue.js·前端框架