```python
import re
import requests
from bs4 import BeautifulSoup
import xlwt


class Spider(object):
    """
    Weather-data spider class.
    """
    # Class-level compiled regular expressions for the date and data cells
    datatime_pattern = re.compile(r'<div class="th200">(.*?)</div>')
    wendu_pattern = re.compile(r'<div class="th140">(.*?)</div>')

    def __init__(self, url, headers, filepath):
        """
        Initializer.
        :param url: base URL template
        :param headers: HTTP request headers
        :param filepath: output file path
        """
        self.url = url
        self.headers = headers
        self.datalist = []  # dates
        self.mwen = []      # daily highs
        self.iwen = []      # daily lows
        self.tq = []        # weather conditions
        self.fx = []        # wind directions
        self.filepath = filepath

    def download_page(self, url):
        """
        Download a page and return its text.
        :param url: URL of the page to download
        :return: page text, or None if the download failed
        """
        try:
            response = requests.get(url, headers=self.headers)
            response.raise_for_status()  # raise HTTPError on a non-success status code
            return response.text
        except requests.RequestException as e:
            print(f"Error downloading page: {e}")
            return None

    def parse_page(self, html):
        """
        Parse page content and extract the date and temperature data.
        :param html: page content
        """
        soup = BeautifulSoup(html, 'html.parser')
        for item in soup.find_all('ul', class_='thrui'):
            item_str = str(item)
            # Extract the dates
            dates = self.datatime_pattern.findall(item_str)
            self.datalist.extend(dates)
            # Extract the data cells; each day contributes four fields in
            # order: high, low, weather, wind direction. The range guard
            # skips a trailing incomplete group instead of raising IndexError.
            temperatures = self.wendu_pattern.findall(item_str)
            for i in range(0, len(temperatures) - 3, 4):
                self.mwen.append(temperatures[i])
                self.iwen.append(temperatures[i + 1])
                self.tq.append(temperatures[i + 2])
                self.fx.append(temperatures[i + 3])

    def download_and_parse_all_pages(self):
        """
        Download and parse all pages.
        """
        for year in range(23, 24):     # 2023 only
            for month in range(1, 2):  # January 2023 only
                # zero-pad the month to two digits
                page_url = f"{self.url}20{year:02d}{month:02d}.html"
                print(page_url)
                html = self.download_page(page_url)
                if html:
                    self.parse_page(html)

    def save_to_excel(self):
        """
        Save the scraped data to an Excel file.
        """
        workbook = xlwt.Workbook(encoding='utf-8', style_compression=0)
        worksheet = workbook.add_sheet('北京历史天气数据', cell_overwrite_ok=True)
        # Write the header row
        columns = ("日期", "最高温度", "最低温度", "天气", "风向")
        for i, col in enumerate(columns):
            worksheet.write(0, i, col)
        # Write one row per day
        for i in range(len(self.datalist)):
            worksheet.write(i + 1, 0, self.datalist[i])
            worksheet.write(i + 1, 1, self.mwen[i])
            worksheet.write(i + 1, 2, self.iwen[i])
            worksheet.write(i + 1, 3, self.tq[i])
            worksheet.write(i + 1, 4, self.fx[i])
        workbook.save(self.filepath)
        print(f"Data saved to {self.filepath}")

    def run(self):
        self.download_and_parse_all_pages()
        self.save_to_excel()


if __name__ == '__main__':
    headers = {
        'User-Agent': 'Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)'
    }
    url_template = "http://lishi.tianqi.com/beijing/"
    filepath = "beijing_weather_data.xls"
    spider = Spider(url_template, headers, filepath)
    spider.run()
```
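
As written, `download_and_parse_all_pages` only fetches January 2023. A minimal usage sketch for widening the crawl, assuming the site keeps the same `YYYYMM.html` URL scheme for the other months; the `FullYearSpider` name and the `time.sleep` delay are my additions for illustration, not part of the original class:

```python
import time


# Hypothetical subclass for illustration: crawl all twelve months of 2023
# and pause between requests so the site is not hammered.
class FullYearSpider(Spider):
    def download_and_parse_all_pages(self):
        for month in range(1, 13):  # January through December 2023
            page_url = f"{self.url}2023{month:02d}.html"
            print(page_url)
            html = self.download_page(page_url)
            if html:
                self.parse_page(html)
            time.sleep(1)  # assumed polite delay; tune as needed


spider = FullYearSpider(url_template, headers, filepath)
spider.run()
```

The script below is a function-based variant of the same crawler; the `Spider` class above is essentially a refactoring of it.
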
```python
import requests
from bs4 import BeautifulSoup
import re
import xlwt
datatime = re.compile(r'<div class="th200">(.*?)</div>')
wendu = re.compile(r'<div class="th140">(.*?)</div>')


def down_allpage(url):
    datalist = []
    mwen = []
    iwen = []
    tq = []
    fx = []
    for i in range(23, 24):        # 2023 only
        for j in range(1, 2):      # January only
            baseurl = url + '20{}{:0>2d}.html'.format(i, j)
            html = down_page(baseurl)
            soup = BeautifulSoup(html, 'html.parser')
            for item in soup.find_all('ul', class_='thrui'):
                item = str(item)
                riqi = re.findall(datatime, item)
                for item1 in riqi:
                    datalist.append(item1)
                zb_all = re.findall(wendu, item)
                # Four fields per day. Derive the day count from the data
                # instead of hard-coding 31, and use k so the outer loop
                # variable i is not clobbered.
                for k in range(len(zb_all) // 4):
                    mwen.append(zb_all[k * 4 + 0])
                    iwen.append(zb_all[k * 4 + 1])
                    tq.append(zb_all[k * 4 + 2])
                    fx.append(zb_all[k * 4 + 3])
    return datalist, mwen, iwen, tq, fx


def save_xls(datalist, mwen, iwen, tq, fx):
    wb = xlwt.Workbook(encoding='utf-8', style_compression=0)
    ws = wb.add_sheet('天气数据', cell_overwrite_ok=True)
    col = ("日期", "最高温度", "最低温度", "天气", "风向")
    for i in range(len(col)):
        ws.write(0, i, col[i])
    for i in range(len(datalist)):
        ws.write(i + 1, 0, datalist[i])
    for i in range(len(mwen)):
        ws.write(i + 1, 1, mwen[i])
    for i in range(len(iwen)):
        ws.write(i + 1, 2, iwen[i])
    for i in range(len(tq)):
        ws.write(i + 1, 3, tq[i])
    for i in range(len(fx)):
        ws.write(i + 1, 4, fx[i])
    wb.save(r'D:\天气数据.xls')


def down_page(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)'
    }
    r = requests.get(url, headers=headers)
    html = r.text
    return html


if __name__ == '__main__':
    url = 'http://lishi.tianqi.com/beijing/'
    # Call down_allpage once; the original called it twice and discarded
    # the first result, crawling every page two times.
    datalist, mwen, iwen, tq, fx = down_allpage(url)
    print(datalist)
    save_xls(datalist, mwen, iwen, tq, fx)
```
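
Both scripts regex-match the serialized HTML of each `ul.thrui` block. A sketch of an alternative, assuming the same `th200`/`th140` class names and that each `li` inside the list holds one day's row: BeautifulSoup can pull the same cells directly, which is less brittle than string-matching the markup. The `parse_with_bs4` name is mine, for illustration only.

```python
from bs4 import BeautifulSoup


def parse_with_bs4(html):
    """Alternative to the regex extraction; assumes th200/th140 markup."""
    soup = BeautifulSoup(html, 'html.parser')
    rows = []
    for ul in soup.find_all('ul', class_='thrui'):
        for li in ul.find_all('li'):
            date_div = li.find('div', class_='th200')
            cells = [div.get_text(strip=True)
                     for div in li.find_all('div', class_='th140')]
            if date_div and len(cells) >= 4:
                # (date, high, low, weather, wind direction)
                rows.append((date_div.get_text(strip=True), *cells[:4]))
    return rows
```
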