测试链接:https://lishi.tianqi.com/guangzhou/202003.html
源码:
# Scrape historical daily weather from lishi.tianqi.com and store it in MySQL.

import pymysql
import requests
from lxml import etree


class ThSpider(object):
    """Download monthly weather-history pages and insert one row per day.

    Each page ``https://lishi.tianqi.com/<city>/<YYYYMM>.html`` is fetched,
    the ``<ul class="thrui">`` list is parsed, and the five columns (date,
    high temperature, low temperature, weather, wind) are written to the
    ``天气数据`` table.
    """

    # Seconds to wait for the site before giving up on a request.
    REQUEST_TIMEOUT = 10

    def __init__(self, city='shanghai', month_list=None):
        """Open the database connection and make sure the target table exists.

        Args:
            city: City slug used in the URL path (defaults to ``'shanghai'``,
                the value the script originally hard-coded).
            month_list: Iterable of ``'YYYYMM'`` strings to crawl. Defaults to
                January 2021 through February 2022.
        """
        if month_list is None:
            month_list = [
                '202101', '202102', '202103', '202104', '202105', '202106',
                '202107', '202108', '202109', '202110', '202111', '202112',
                '202201', '202202',
            ]
        self.city = city
        self.month_list = list(month_list)

        # Request headers are the same for every month, so build them once.
        self.headers = {
            'Cookie': 'UserId=17209281674394559; Hm_lvt_7c50c7060f1f743bccf8c150a646e90a=1720928176; HMACCOUNT=66A8254591DC78E3; Hm_lpvt_7c50c7060f1f743bccf8c150a646e90a=1720941400',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36'
        }

        # Connect to the database.
        self.cz = pymysql.connect(
            host='127.0.0.1',
            port=3306,
            user='root',
            password='root',
            db='基本操作',
            charset='utf8'
        )
        # Create the cursor.
        self.kit = self.cz.cursor()
        # Create the table (not the database) if it does not exist yet.
        table_sql = '''
            create table if not exists 天气数据 (
                日期 varchar (50),
                最高气温 varchar (50),
                最低气温 varchar (50),
                天气 varchar (50),
                风向 varchar (50)
            );
        '''
        self.kit.execute(table_sql)

    def request_start_url(self):
        """Fetch every configured month, store its rows, then close the DB.

        Each month is committed as soon as it has been parsed, so months
        that finished before a failure are kept. On any error the
        partially inserted month is rolled back, the connection is closed,
        and the error is re-raised.

        Raises:
            requests.RequestException: On network errors, timeouts, or a
                non-2xx HTTP status.
        """
        try:
            for month in self.month_list:
                start_url = 'https://lishi.tianqi.com/{}/{}.html'.format(
                    self.city, month)
                reply = requests.get(
                    start_url,
                    headers=self.headers,
                    timeout=self.REQUEST_TIMEOUT,
                )
                # Do not try to parse an error page as weather data.
                reply.raise_for_status()
                self.parse_response(reply.text)
                # Request one month, parse it, then commit it.
                self.cz.commit()
        except Exception:
            # Drop the rows of the month that failed half-way through.
            if self.cz.open:
                self.cz.rollback()
            raise
        finally:
            # Always release the cursor and connection, even on failure.
            self.commit_close()

    @staticmethod
    def _first_text(node, path):
        """Return the first text result of ``path`` under ``node``, or ''."""
        found = node.xpath(path)
        return found[0] if found else ''

    def parse_response(self, response):
        """Parse one month page and insert one row per listed day.

        Args:
            response: HTML text of a month page.

        Rows without a date are skipped; any other missing column is stored
        as an empty string instead of raising ``IndexError``.
        """
        if not response or not response.strip():
            return
        tree = etree.HTML(response)
        if tree is None:
            return

        # Placeholders let the driver escape the scraped text, so quotes in
        # the page cannot break or inject into the statement.
        insert_sql = 'insert into 天气数据 values (%s, %s, %s, %s, %s)'

        for li in tree.xpath('//ul[@class="thrui"]/li'):
            # div[1]..div[5]: date, high temp, low temp, weather, wind.
            row = tuple(
                self._first_text(li, './div[{}]/text()'.format(column))
                for column in range(1, 6)
            )
            rq = row[0]
            if not rq:
                continue
            self.kit.execute(insert_sql, row)
            print('ok --{}'.format(rq))

    def commit_close(self):
        """Commit pending rows, then close the cursor and the connection.

        Calling it again after the connection is closed is a no-op.
        """
        if not self.cz.open:
            return
        try:
            self.cz.commit()
        finally:
            self.kit.close()
            self.cz.close()

    def main(self):
        """Run the full crawl; the connection is closed when it finishes."""
        self.request_start_url()


if __name__ == '__main__':
    th = ThSpider()
    th.main()
运行效果: