搭建自己的金融数据源和量化分析平台(六):下载并存储沪深两市上市公司财报

基于不依赖wind、某花顺等第三方平台数据的考虑,尝试直接从财报中解析三大报表进而计算ROE等财务指标,因此需要下载沪深两市的上市公司财报数据,便于后续从pdf中解析三大报表。

深市爬虫好做,先放深市爬虫:

注:同一个IP频繁向两市服务器发起请求会导致HTTPSConnectionPool(host='xxxx', port=443): Max retries exceeded with url:报错

解决方法有两个:一是使用代理池,二是爬虫休眠。

python 复制代码
def get_financial_statements(path, time, stock_list, financial_statements_type):
    """Download Shenzhen Stock Exchange (SZSE) periodic reports as PDF files.

    For every stock / year / report-type combination the SZSE announcement
    list endpoint is queried (first page only, 50 entries) and every returned
    announcement whose title does not contain "报告摘要" (report summary) is
    saved as ``<path><stock>/<year>/<year><label>##<title>.pdf``. Files that
    already exist are not downloaded again.

    Args:
        path: Root storage directory. It is concatenated as-is with the stock
            code, so it should end with a path separator, e.g. "F:/data/SZ/".
        time: Iterable of report years, e.g. [2023, 2024]. Each year may be an
            int or a numeric string.
        stock_list: Iterable of stock codes, e.g. ['000001', '000002'].
        financial_statements_type: Iterable of report types; each must be one
            of 'annual', 'semi-annual', 'quarterly_1', 'quarterly_3'.

    Raises:
        ValueError: If an unsupported report type is requested. The check runs
            before any network request is made.
    """
    url = "https://www.szse.cn/api/disc/announcement/annList"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
        'Content-Type': 'application/json',
        'Connection': 'close'
    }
    download_url = "https://disc.static.szse.cn/download"
    # report type -> (label used in file names and messages, SZSE category id,
    #                 window start "-MM-DD", year offset of window end,
    #                 window end "-MM-DD")
    report_specs = {
        # The annual window is extended to September of the following year so
        # that corrected (re-published) annual reports are still covered.
        'annual': ("年报", '010301', "-12-31", 1, "-09-01"),
        'semi-annual': ("中报", '010303', "-07-01", 0, "-12-31"),
        'quarterly_1': ("一季报", '010305', "-04-01", 0, "-07-31"),
        'quarterly_3': ("三季报", '010307', "-10-01", 0, "-12-31"),
    }

    # Validate up front: previously any unknown type was silently treated as
    # the third-quarter report.
    fs_types = list(financial_statements_type)
    unknown = [fs_type for fs_type in fs_types if fs_type not in report_specs]
    if unknown:
        raise ValueError(
            "unsupported financial_statements_type: "
            + ", ".join(str(fs_type) for fs_type in unknown)
        )

    for stock in stock_list:
        for year in time:
            year_label = str(year)
            for fs_type in fs_types:
                title, bigCategoryId, start_suffix, end_offset, end_suffix = report_specs[fs_type]
                timestart = year_label + start_suffix
                # int() lets callers pass years as strings as well as ints.
                timeend = str(int(year) + end_offset) + end_suffix

                payload = {
                    "seDate": [timestart, timeend],
                    "stock": [stock],
                    "channelCode": ["listedNotice_disc"],
                    "bigCategoryId": [bigCategoryId],
                    "pageSize": 50,
                    "pageNum": 1
                }
                # A timeout keeps one stalled connection from hanging the
                # whole crawl.
                response = requests.post(url=url, data=json.dumps(payload), headers=headers, timeout=30)
                announcements = json.loads(response.text).get("data")
                # Covers both a missing/None result and an empty list.
                if not announcements:
                    print("警告:股票代码:"+stock+" "+year_label+title+"不存在!")
                    continue

                for entry in announcements:
                    # Summary announcements are skipped; only full reports
                    # are stored.
                    if "报告摘要" in entry['title']:
                        continue
                    # Make sure <path><stock>/<year> exists.
                    file_path = path + stock + "/" + year_label
                    os.makedirs(file_path, exist_ok=True)
                    # "*" (e.g. in "*ST" names) is dropped as before; path
                    # separators are replaced so a title can never point
                    # into a different directory.
                    safe_title = entry['title'].replace("*", "").replace("/", "_").replace("\\", "_")
                    file = file_path + "/" + year_label + title + "##" + safe_title + ".pdf"
                    if os.path.exists(file):
                        print("警告:股票代码:" + stock + " " + year_label + title + "已存在!")
                        continue
                    filecontent = requests.get(download_url + entry["attachPath"], timeout=60)
                    # Do not store an error page under a .pdf name.
                    if filecontent.status_code != 200:
                        print("警告:股票代码:" + stock + " " + year_label + title + "下载失败,HTTP状态码:" + str(filecontent.status_code))
                        continue
                    with open(file, "wb") as pdf:
                        pdf.write(filecontent.content)
                    print("股票代码:" + stock + " " + year_label + title + "写入成功。")

# 爬虫调用实例:
# timestart = [2023,2024]
# stock_list = ['000001','000002']
# financial_statements_type = ['annual', 'semi-annual', 'quarterly_1', 'quarterly_3']
# SZ_financial_statement_path = "F:/data/SZ/"
# get_financial_statements(SZ_financial_statement_path, timestart,stock_list,financial_statements_type)

沪市爬虫:

python 复制代码
def get_financial_statements(path, time, stock_list, financial_statements_type):
    """Download Shanghai Stock Exchange (SSE) periodic reports as PDF files.

    For every stock / year / report-type combination the SSE company bulletin
    query endpoint is called (first page only, 50 entries) and every returned
    bulletin whose title does not contain "摘要" (summary) is saved as
    ``<path><stock>/<year>/<year><label>##<title>.pdf``. Files that already
    exist are not downloaded again.

    Args:
        path: Root storage directory. It is concatenated as-is with the stock
            code, so it should end with a path separator, e.g. "F:/data/SH/".
        time: Iterable of report years, e.g. [2023, 2024]. Each year may be an
            int or a numeric string.
        stock_list: Iterable of stock codes, e.g. ['600011'].
        financial_statements_type: Iterable of report types; each must be one
            of 'annual', 'semi-annual', 'quarterly_1', 'quarterly_3'.

    Raises:
        ValueError: If an unsupported report type is requested. The check runs
            before any network request is made.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
        'Referer': 'https://www.sse.com.cn/',
        'Connection': 'close'
    }
    download_url = "https://www.sse.com.cn"
    # report type -> (label used in file names and messages, SSE reportType
    #                 code, window start "-MM-DD", year offset of window end,
    #                 window end "-MM-DD")
    report_specs = {
        # The annual window is extended to September of the following year so
        # that corrected (re-published) annual reports are still covered.
        'annual': ("年报", 'YEARLY', "-12-31", 1, "-09-01"),
        'semi-annual': ("中报", 'QUATER2', "-07-01", 0, "-12-31"),
        'quarterly_1': ("一季报", 'QUATER1', "-04-01", 0, "-07-31"),
        'quarterly_3': ("三季报", 'QUATER3', "-10-01", 0, "-12-31"),
    }

    # Validate up front: previously any unknown type was silently treated as
    # the third-quarter report.
    fs_types = list(financial_statements_type)
    unknown = [fs_type for fs_type in fs_types if fs_type not in report_specs]
    if unknown:
        raise ValueError(
            "unsupported financial_statements_type: "
            + ", ".join(str(fs_type) for fs_type in unknown)
        )

    for stock in stock_list:
        for year in time:
            year_label = str(year)
            for fs_type in fs_types:
                title, bigCategoryId, start_suffix, end_offset, end_suffix = report_specs[fs_type]
                timestart = year_label + start_suffix
                # int() lets callers pass years as strings as well as ints.
                timeend = str(int(year) + end_offset) + end_suffix

                # The endpoint answers in JSONP; the callback name carries a
                # random numeric suffix.
                url = (
                    "https://query.sse.com.cn/security/stock/queryCompanyBulletin.do?jsonCallBack=jsonpCallback"
                    + str(random.randint(10000, 999999))
                    + "&isPagination=true&pageHelp.pageSize=50&pageHelp.pageNo=1&pageHelp.beginPage=1&pageHelp.cacheSize=1&pageHelp.endPage=1&productId="
                    + stock
                    + "&securityType=0101%2C120100%2C020100%2C020200%2C120200&reportType2=DQBG&reportType="
                    + bigCategoryId
                    + "&beginDate=" + timestart
                    + "&endDate=" + timeend
                )
                # A timeout keeps one stalled connection from hanging the
                # whole crawl.
                response = requests.get(url=url, headers=headers, timeout=30)
                # Cut the "pageHelp" object out of the JSONP text. An
                # unexpected response (e.g. an error page) must not abort the
                # crawl, so parsing failures are reported and skipped.
                try:
                    page_help = response.text.split('"keyWord":null,"pageHelp":')[1].split(',"productId":')[0]
                    datas = json.loads(page_help)['data']
                except (IndexError, KeyError, TypeError, ValueError):
                    print("警告:股票代码:" + stock + " " + year_label + title + "查询结果解析失败!")
                    continue
                # Covers both a None result and an empty list.
                if not datas:
                    print("警告:股票代码:" + stock + " " + year_label + title + "不存在!")
                    continue

                for entry in datas:
                    # Summary bulletins are skipped; only full reports are
                    # stored.
                    if "摘要" in entry['TITLE']:
                        continue
                    # Make sure <path><stock>/<year> exists.
                    file_path = path + stock + "/" + year_label
                    os.makedirs(file_path, exist_ok=True)
                    # "*" (e.g. in "*ST" names) is dropped as before; path
                    # separators are replaced so a title can never point
                    # into a different directory.
                    safe_title = entry['TITLE'].replace("*", "").replace("/", "_").replace("\\", "_")
                    file = file_path + "/" + year_label + title + "##" + safe_title + ".pdf"
                    if os.path.exists(file):
                        print("警告:股票代码:" + stock + " " + year_label + title + "已存在!")
                        continue
                    filecontent = requests.get(download_url + entry["URL"], timeout=60)
                    # Do not store an error page under a .pdf name.
                    if filecontent.status_code != 200:
                        print("警告:股票代码:" + stock + " " + year_label + title + "下载失败,HTTP状态码:" + str(filecontent.status_code))
                        continue
                    with open(file, "wb") as pdf:
                        pdf.write(filecontent.content)
                    print("股票代码:" + stock + " " + year_label + title + "写入成功。")
# timestart = [2023]
# stock_list = ['600011']
# financial_statements_type = ['annual', 'semi-annual', 'quarterly_1', 'quarterly_3']
# SH_financial_statement_path = "F:/data/SH/"
# get_financial_statements(SH_financial_statement_path, timestart,stock_list,financial_statements_type)

控制模块代码:

python 复制代码
def _select_listed_stock_codes(database, exchange):
    """Return the stock codes of one exchange whose list_status equals LIST."""
    # NOTE(review): the SQL is assembled by string concatenation. The values
    # are module-level constants here, not user input; switch to a
    # parameterized query if ExecSelect supports one.
    select_sql = "select stock_code from stock_list where exchange = '" + exchange + "' and list_status = '" + LIST + "'"
    return [row[0] for row in ExecSelect(database, select_sql)]


# Update the financial report files of A-share stocks.
def update_A_financial_data(SZ=False, SH=False, BJ=False, time=None, financial_statements_type=None):
    """Download financial reports for the selected A-share exchanges.

    Args:
        SZ: When truthy, download reports for the stocks selected for the
            SZSE exchange code.
        SH: When truthy, download reports for the stocks selected for the
            SSE exchange code.
        BJ: Placeholder; no download is implemented for it yet.
        time: Report years forwarded to the exchange-specific downloader.
            Defaults to an empty list.
        financial_statements_type: Report types forwarded to the
            exchange-specific downloader. Defaults to an empty list.
    """
    # None instead of [] as default avoids sharing one mutable list object
    # between calls.
    if time is None:
        time = []
    if financial_statements_type is None:
        financial_statements_type = []

    database = "stock_a"
    if SZ:
        stocks = _select_listed_stock_codes(database, SZSE)
        A_SZ_basic.get_financial_statements(SZ_financial_statement_path, time, stocks, financial_statements_type)
    if SH:
        stocks = _select_listed_stock_codes(database, SSE)
        A_SH_basic.get_financial_statements(SH_financial_statement_path, time, stocks, financial_statements_type)
    if BJ:
        # Not implemented yet.
        pass
相关推荐
THMAIL14 小时前
量化基金从小白到大师 - 金融数据获取大全:从免费API到Tick级数据实战指南
人工智能·python·深度学习·算法·机器学习·金融·kafka
zzywxc78714 小时前
AI在金融、医疗、教育、制造业等领域的落地案例(含代码、流程图、Prompt示例与图表)
人工智能·spring·机器学习·金融·数据挖掘·prompt·流程图
tang7778915 小时前
金融行业:静态与动态代理 IP 的选型与风控
网络·tcp/ip·金融
CryptoPP21 小时前
跨境金融数据对接实践:印度NSE/BSE股票行情API集成指南
开发语言·后端·金融
金融数据出海1 天前
黄金金融期货数据API对接技术文档
开发语言·金融·github
七夜zippoe2 天前
AI+Java 守护你的钱袋子!金融领域的智能风控与极速交易
java·人工智能·金融
dingzd952 天前
去中心化金融(DeFi)入门必看
金融·web3·去中心化·区块链·facebook·tiktok·instagram
zzywxc7872 天前
AI行业应用:金融、医疗、教育、制造业的落地案例全解析
人工智能·深度学习·spring·机器学习·金融·数据挖掘
一尘之中3 天前
《空中隧道》:一位金融预言家写在1927年的“科幻小说”,藏着何种投资秘钥?
人工智能·金融·ai写作
芒果量化3 天前
redis - 远程发送买卖信号、本地接收信号处理
redis·python·金融