我在上交所官网没有找到像深交所那样可以一键下载股票列表xls文档的按钮,因此上交所股票列表的读取会比较麻烦。总体思路是:先查出所有股票的代码,再根据股票代码逐一发起HTTP请求,读取公司英文名、总股本、流通股本等详细信息,这就导致上交所爬虫的网络交互次数远超深交所。
这里放出上交所爬虫模块的代码:
python
# -*- coding: utf-8 -*-
# 上海交易所爬虫
import json
import random
import time
import requests
# Listing-status codes: listed / delisted / listing suspended.
LIST, DELISTED, PAUSED = "L", "D", "P"
# Exchange identifier for the Shanghai Stock Exchange.
SSE = "SSE"
# Market-type labels: main board / STAR (sci-tech innovation) board.
market_ZB, market_KCB = "主板", "科创板"
def get_stock_list(industry_list, *, request_timeout=4000, request_interval=2):
    """Crawl the Shanghai Stock Exchange (SSE) query API for listed stocks.

    The main-board and STAR-board code lists are fetched first; then, for
    every stock, two further requests are issued (company profile and share
    structure), separated by a pause so the server is not hammered.

    Args:
        industry_list: Iterable of indexable records where index 1 is the
            Chinese industry name and index 0 is the value stored for that
            industry (presumably an industry code -- confirm with caller).
            If a name occurs more than once, the first record wins.
        request_timeout: Timeout passed to every ``requests.get`` call.
            Requests interprets it in seconds; the default of 4000 is kept
            for backward compatibility.
        request_interval: Seconds to sleep before processing each stock.

    Returns:
        A list of tuples
        ``(stock_code, stock_name, province, industry, industry_2, enname,
        market, exchange, list_status, list_date, delist_date, total_share,
        float_share)``; or the string ``'industry_info_error'`` as soon as a
        stock's first-level industry name is not present in
        ``industry_list`` (rows collected so far are discarded).

    Raises:
        IndexError, KeyError, ValueError: If a response does not have the
            expected layout or fields.
        requests.RequestException: On network failure or timeout.
    """
    # NOTE: the separator before REG_PROVINCE must be a literal "&"; an
    # HTML-decoded "&reg" (the (R) sign) would be swallowed into STOCK_TYPE.
    list_url_template = (
        "https://query.sse.com.cn/sseQuery/commonQuery.do"
        "?jsonCallBack=jsonpCallback{callback}"
        "&STOCK_TYPE={stock_type}&REG_PROVINCE=&CSRC_CODE=&STOCK_CODE="
        "&sqlId=COMMON_SSE_CP_GPJCTPZ_GPLB_GP_L"
        "&COMPANY_STATUS=2%2C4%2C5%2C7%2C8&type=inParams&isPagination=true"
        "&pageHelp.cacheSize=1&pageHelp.beginPage=1&pageHelp.pageSize=4000"
        "&pageHelp.pageNo=1&pageHelp.endPage=1"
    )
    # Main-board stock codes (STOCK_TYPE=1).
    ZB_url = list_url_template.format(
        callback=random.randint(10000, 999999), stock_type=1)
    # STAR-board stock codes (STOCK_TYPE=8).
    KCB_url = list_url_template.format(
        callback=random.randint(10000, 999999), stock_type=8)

    query_prefix = (
        "https://query.sse.com.cn/commonQuery.do?jsonCallBack=jsonpCallback"
    )
    # Company profile by stock code (the code is appended per stock).
    stock_detail_url = (
        query_prefix + str(random.randint(100000, 999999999))
        + "&isPagination=false"
        "&sqlId=COMMON_SSE_CP_GPJCTPZ_GPLB_GPGK_GSGK_C&COMPANY_CODE="
    )
    # Total / tradable share counts by stock code.
    stock_select_totalshare_url = (
        query_prefix + str(random.randint(100000, 999999999))
        + "&isPagination=false"
        "&sqlId=COMMON_SSE_CP_GPJCTPZ_GPLB_GPGK_GBJG_C&COMPANY_CODE="
    )
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
        'Referer': 'https://www.sse.com.cn/',
        'Connection': 'close'
    }

    # Build the name -> value lookup once instead of rescanning the list for
    # every stock; setdefault keeps the first record for a duplicated name.
    industry_codes = {}
    for industry_info in industry_list:
        industry_codes.setdefault(industry_info[1], industry_info[0])

    # Collect the code lists of both boards, tagging each entry with its
    # market type.
    stocks = []
    for list_url, market_name in ((ZB_url, market_ZB), (KCB_url, market_KCB)):
        list_response = requests.get(
            url=list_url, headers=headers, timeout=request_timeout)
        board_stocks = _sse_extract_json(
            list_response.text, '"data":', ',"endDate"')
        for stock in board_stocks:
            stock['market'] = market_name
            stocks.append(stock)

    result = []
    for stock in stocks:
        # Throttle: two requests are made per stock.
        time.sleep(request_interval)
        stock_code = stock["A_STOCK_CODE"]
        detail_info = _sse_fetch_first_result(
            stock_detail_url + stock_code, headers, request_timeout)
        stock_name = stock["COMPANY_ABBR"]
        province = _sse_short_province(detail_info['AREA_NAME'])

        # First- and second-level industry names as reported by the SSE.
        industry_chinese = detail_info["CSRC_CODE_DESC"]
        industry_2_chinese = detail_info["CSRC_GREAT_CODE_DESC"]
        # Unknown first-level industry: abort with the error marker.
        if industry_chinese not in industry_codes:
            return 'industry_info_error'
        industry = industry_codes[industry_chinese]
        # Unknown second-level industry is tolerated and stored as None.
        industry_2 = industry_codes.get(industry_2_chinese)

        enname = detail_info['FULL_NAME_EN']
        market = stock['market']
        exchange = SSE
        # Every stock returned by these list queries is recorded as listed.
        list_status = LIST
        # Reformat the listing date from YYYYMMDD to YYYY-MM-DD.
        list_date_str = detail_info['A_LIST_DATE']
        list_date = "-".join(
            (list_date_str[0:4], list_date_str[4:6], list_date_str[6:8]))
        delist_date = None

        share_info = _sse_fetch_first_result(
            stock_select_totalshare_url + stock_code, headers, request_timeout)
        # Scale by 10000 to match the Shenzhen crawler's share-count format
        # (presumably the SSE reports in units of 10,000 shares -- confirm).
        total_share = str(float(share_info["TOTAL_DOMESTIC_VOL"]) * 10000)
        float_share = str(float(share_info["TOTAL_UNLIMIT_VOL"]) * 10000)

        row = (stock_code, stock_name, province, industry, industry_2, enname,
               market, exchange, list_status, list_date, delist_date,
               total_share, float_share)
        result.append(row)
        print(row)
    return result


def _sse_extract_json(text, start_marker, end_marker):
    """Parse the JSON value that sits between two literal markers in *text*."""
    return json.loads(text.split(start_marker)[1].split(end_marker)[0])


def _sse_fetch_first_result(url, headers, timeout):
    """GET a single-company query and return the first entry of its result."""
    response = requests.get(url=url, headers=headers, timeout=timeout)
    return _sse_extract_json(
        response.text,
        '"queryDate":"","result":',
        ',"securityCode":"","sqlId"',
    )[0]


def _sse_short_province(area_name):
    """Strip administrative-division words so only the short name remains."""
    for affix in ("省", "市", "自治区", "维吾尔", "壮族", "回族"):
        area_name = area_name.replace(affix, "")
    return area_name