import requests
from bs4 import BeautifulSoup
import re
import xlwt
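
# Scrape historical daily weather data for Beijing from lishi.tianqi.com
# (one page per month) and save it to an .xls workbook with xlwt.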
# On each monthly page, <div class="th200"> holds the date, and the
# <div class="th140"> cells hold, in order: high temp, low temp, weather, wind.
datatime = re.compile('<div class="th200">(.*?)</div>')
wendu = re.compile('<div class="th140">(.*?)</div>')

def down_allpage(url):
    """Download each monthly history page and pull out the daily records."""
    datalist = []   # dates
    mwen = []       # daily highs
    iwen = []       # daily lows
    tq = []         # weather descriptions
    fx = []         # wind directions
    # Only January 2023 is fetched here ('20{}' -> 2023, month 01);
    # widen the two ranges to cover more years and months.
    for i in range(23, 24):
        for j in range(1, 2):
            baseurl = url + '20{}{:0>2d}.html'.format(i, j)
            html = down_page(baseurl)
            soup = BeautifulSoup(html, 'html.parser')
            for item in soup.find_all('ul', class_='thrui'):
                block = str(item)  # run the regexes over the raw HTML of the list
                riqi = re.findall(datatime, block)
                datalist.extend(riqi)
                # The th140 cells repeat in groups of four per day
                # (high, low, weather, wind), so step through them four
                # at a time instead of assuming a 31-day month.
                zb_all = re.findall(wendu, block)
                for k in range(len(zb_all) // 4):
                    mwen.append(zb_all[k*4+0])
                    iwen.append(zb_all[k*4+1])
                    tq.append(zb_all[k*4+2])
                    fx.append(zb_all[k*4+3])
    return datalist, mwen, iwen, tq, fx

def save_xls(datalist, mwen, iwen, tq, fx):
    """Write the parallel per-day lists into an .xls sheet, one row per day."""
    wb = xlwt.Workbook(encoding='utf-8', style_compression=0)
    ws = wb.add_sheet('天气数据', cell_overwrite_ok=True)  # "weather data"
    # Header row: date, high temp, low temp, weather, wind direction.
    col = ("日期", "最高温度", "最低温度", "天气", "风向")
    for i in range(len(col)):
        ws.write(0, i, col[i])
    # The five lists are parallel (one entry per day), so zip them into rows.
    for i, row in enumerate(zip(datalist, mwen, iwen, tq, fx), start=1):
        for j, value in enumerate(row):
            ws.write(i, j, value)
    wb.save(r'D:\天气数据.xls')

def down_page(url):
    """Fetch one page and return its HTML."""
    headers = {
        # Identify as Baiduspider so the site serves the page.
        'User-Agent': 'Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)'
    }
    r = requests.get(url, headers=headers)
    r.encoding = r.apparent_encoding  # avoid mojibake in the Chinese page text
    return r.text

if __name__ == '__main__':
    url = 'http://lishi.tianqi.com/beijing/'
    # Scrape once (the original called down_allpage twice, discarding the
    # first result and doubling the requests), then save the results.
    datalist, mwen, iwen, tq, fx = down_allpage(url)
    print(datalist)
    save_xls(datalist, mwen, iwen, tq, fx)