使用 multiprocessing
模块实现多进程爬取股票网址买卖数据的基本思路是:
- 定义爬虫函数,用于从一个或多个股票网址上抓取数据。
- 创建多个进程,每个进程执行爬虫函数,可能针对不同的股票或不同的网页。
- 使用
multiprocessing.Queue
或multiprocessing.Manager()
管理共享数据结构,以便进程间可以共享爬取的数据。
以下是一个简化的示例,展示如何使用 multiprocessing
模块和 requests
库来实现多进程爬取股票数据:
python
# encoding:utf-8
import sys,os,copy,time,traceback,copy
import multiprocessing
# from queue import Queue
import pandas as pd
from loguru import logger
sys.path.append("..")
from QhSetting import QHJSPATH
from QhSpiderObj import QhDFSpider
from QhCsvMode import QHDFDBJSON,QhPdCsvUnique
from QhSpiderTool import QhDbPathJieXiIsMkdir,QhDfDateSort,QhSouHuJiaoYiDate,QhNotNaNdf,\
QhDfWeiYiZhi,QhGetTimes
from QhSpiderTool import QhStarEndTime
from QhInterFace import _QhDfMaiMAIDetails,_QhDBToCsv
def worker(num):
print(f'Worker: {num}')
# @QhStarEndTime
def QhDfMaiMAIDetailsForM(QhCodeList,QhQueue,QhIsCsv=False):
"""
作者:阙辉
功能:获取每日买卖明细
"""
# QhCsvPath = QHDFDBJSON["QhDfAllStock"]["QhCsvPath"]
# QhCsvPath = QhDbPathJieXiIsMkdir(QhCsvPath,QHJSPATH)
# QhCsvName = QHDFDBJSON["QhDfAllStock"]["QhCsvName"]
# QhCsvPath = "{}\{}".format(QhCsvPath,QhCsvName)
# QhOldCsvDf = pd.read_csv(QhCsvPath,encoding='gbk')
# QhOldCsvDf.set_index('股票代码',drop=False,inplace=True) #重置索引并保留原列 要先设置所以 否则无法使用at方法
# QhOldCsvDf = QhOldCsvDf[QhOldCsvDf["交易板块"].isin(["上证A股","深证A股","北证A股","科创板","创业板"])]#[:10]
QhUniqueValue = QHDFDBJSON["_QhDfMaiMAIDetails"]["QhUniqueValue"]
QhJiaoYiDateD = QhSouHuJiaoYiDate()[2] # 获取交易日期('YYYY','YYYY-MM','YYYY-MM-DD')
QhCsvPath = QHDFDBJSON["_QhDfMaiMAIDetails"]["QhCsvPath"]
QhCsvName0 = QHDFDBJSON["_QhDfMaiMAIDetails"]["QhCsvName"]
QhCsvName = QhCsvName0.format(QhJiaoYiDateD)
QhCsvPathF0 = QHDFDBJSON["_QhDfMaiMAIDetails"]["QhCsvPathF"]
QhCsvNameF0 = QHDFDBJSON["_QhDfMaiMAIDetails"]["QhCsvNameF"]
QhCsvPath = QhDbPathJieXiIsMkdir(QhCsvPath,QHJSPATH)
QhCsvPath = "{}\{}".format(QhCsvPath,QhCsvName)
print(QhCsvPath)
QhI = 0
for QhRow in QhCodeList:
try:
QhCode01 = QhRow[0]
QhShiChang = QhRow[1]
QhCsvPathF = copy.deepcopy(QhCsvPathF0)
QhCsvNameF = QhCsvNameF0.format(QhCode01)
QhCsvPathF = QhDbPathJieXiIsMkdir(QhCsvPathF,QHJSPATH)
QhCsvPathF = "{}\{}".format(QhCsvPathF,QhCsvNameF)
QhCode = QhCode01.replace("Q","")
QhCodes = QhShiChang
secid ="{}.{}".format(QhCodes,QhCode)
QhJieGuoRowDf = _QhDfMaiMAIDetails(QhSecid=secid)
QhJieGuoRowDf["交易日期01"] = QhJiaoYiDateD
QhQueue.put(QhJieGuoRowDf)
print(QhJieGuoRowDf)
# 将数据添加后面
if QhI == 0:
QhJieGuoDfNew = QhJieGuoRowDf.copy(deep=True)
else:
try: # 兼容旧版本处理
QhJieGuoDfNew = QhJieGuoDfNew._append(QhJieGuoRowDf)
except:
QhJieGuoDfNew = QhJieGuoDfNew.append(QhJieGuoRowDf)
_QhDBToCsv(QhCsvPathF,QhUniqueValue,QhJieGuoRowDf,QhDateSort="",QhIsCsv=True)
QhI = QhI + 1
except:
QhErrMsg = traceback.print_exc()
logger.error("【买卖竞价数据】获取失败,报错消息\n{QhErrMsg}!QueHui!".format(QhErrMsg=QhErrMsg))
_QhDBToCsv(QhCsvPath,QhUniqueValue,QhJieGuoDfNew,QhDateSort="",QhIsCsv=True)
QhI = QhI + 1
# 存储
_QhDBToCsv(QhCsvPath,QhUniqueValue,QhJieGuoDfNew,QhDateSort="",QhIsCsv=QhIsCsv)
return QhJieGuoDfNew
if __name__ == '__main__':
# processes = []
# for i in range(5): # 创建5个进程
# p = multiprocessing.Process(target=worker, args=(i,))
# processes.append(p)
# p.start() # 启动进程
# for process in processes:
# process.join() # 等待进程结束
QhCsvPath = QHDFDBJSON["QhDfAllStock"]["QhCsvPath"]
QhCsvPath = QhDbPathJieXiIsMkdir(QhCsvPath,QHJSPATH)
QhCsvName = QHDFDBJSON["QhDfAllStock"]["QhCsvName"]
QhCsvPath = "{}\{}".format(QhCsvPath,QhCsvName)
QhOldCsvDf = pd.read_csv(QhCsvPath,encoding='gbk')
QhOldCsvDf.set_index('股票代码',drop=False,inplace=True) #重置索引并保留原列 要先设置所以 否则无法使用at方法
QhOldCsvDf = QhOldCsvDf[QhOldCsvDf["交易板块"].isin(["上证A股","深证A股","北证A股","科创板","创业板"])][:500]
QhOldCsvList = []
for index, row in QhOldCsvDf.iterrows():
# print(row)
QhOldCsvListRow = []
QhCode = row["股票代码"]
QhOldCsvListRow.append(QhCode)
QhShiChang = row["市场代码"]
QhOldCsvListRow.append(QhShiChang)
QhOldCsvList.append(QhOldCsvListRow)
qh_group_count = 100
processes = []
QhQueueList = []
QhTotalTimes = QhGetTimes(len(QhOldCsvList),qh_group_count = qh_group_count)
QhManager = multiprocessing.Manager()
QhQueue = QhManager.Queue() # 设置队列上限为3
QhStart = 0
for QhRow in range(1,QhTotalTimes+1):
QhPa = QhOldCsvList[QhStart:QhRow*qh_group_count]
print(QhPa)
QhStart = QhRow*qh_group_count
p = multiprocessing.Process(target=QhDfMaiMAIDetailsForM, args=(QhPa,QhQueue,False))
processes.append(p)
# QhQueueList.append(QhQueue)
p.start() # 启动进程
for process in processes:
process.join() # 等待进程结束