Assignment Requirements
I. Assignment Content
Assignment ①
- Requirement: Scrape the 7-day weather forecast for a given set of cities from the China Weather site (http://www.weather.com.cn) and save the results to a database.
- Code and screenshots:
```python
from bs4 import BeautifulSoup
from bs4 import UnicodeDammit
import urllib.request
import sqlite3


class WeatherDB:
    def openDB(self):
        self.con = sqlite3.connect("weathers.db")
        self.cursor = self.con.cursor()
        try:
            self.cursor.execute("create table weathers (wCity varchar(16),wDate varchar(16),wWeather varchar(64),wTemp varchar(32),constraint pk_weather primary key (wCity,wDate))")
        except:
            self.cursor.execute("delete from weathers")

    def closeDB(self):
        self.con.commit()
        self.con.close()

    def insert(self, city, date, weather, temp):
        try:
            self.cursor.execute("insert into weathers (wCity,wDate,wWeather,wTemp) values (?,?,?,?)",
                                (city, date, weather, temp))
        except Exception as err:
            print(err)

    def show(self):
        self.cursor.execute("select * from weathers")
        rows = self.cursor.fetchall()
        print("%-16s%-16s%-32s%-16s" % ("city", "date", "weather", "temp"))
        for row in rows:
            print("%-16s%-16s%-32s%-16s" % (row[0], row[1], row[2], row[3]))


class WeatherForecast:
    def __init__(self):
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/141.0.0.0 Safari/537.36 Edg/141.0.0.0"}
        self.cityCode = {"北京": "101010100", "上海": "101020100", "广州": "101280101", "廊坊": "101090601"}

    def forecastCity(self, city):
        if city not in self.cityCode.keys():
            print(city + " code cannot be found")
            return
        url = "http://www.weather.com.cn/weather/" + self.cityCode[city] + ".shtml"
        try:
            req = urllib.request.Request(url, headers=self.headers)
            data = urllib.request.urlopen(req)
            data = data.read()
            dammit = UnicodeDammit(data, ["utf-8", "gbk"])
            data = dammit.unicode_markup
            soup = BeautifulSoup(data, "lxml")
            lis = soup.select("ul[class='t clearfix'] li")
            for li in lis:
                try:
                    date = li.select('h1')[0].text
                    weather = li.select('p[class="wea"]')[0].text
                    temp = li.select('p[class="tem"] span')[0].text + "/" + li.select('p[class="tem"] i')[0].text
                    print(city, date, weather, temp)
                    self.db.insert(city, date, weather, temp)
                except Exception as err:
                    print(err)
        except Exception as err:
            print(err)

    def process(self, cities):
        self.db = WeatherDB()
        self.db.openDB()
        for city in cities:
            self.forecastCity(city)
        self.db.closeDB()


ws = WeatherForecast()
ws.process(["北京", "上海", "广州", "廊坊"])
print("completed")
```
- Reflection:
This assignment was mostly a reproduction of textbook code, so the difficulty was manageable.
Assignment ②
- Requirement: Use the requests and BeautifulSoup libraries to scrape stock information from a target site and store it in a database.
- Candidate sites:
- Technique: Open the F12 developer tools in Chrome and capture the network traffic, locate the URL that loads the stock list, and analyze the values returned by the API. The field parameters (f1, f2, ...) in the request map to different data columns, so they can be added or removed as needed; a sketch of such a trimmed list request is included after the full program below.
- Reference: https://zhuanlan.zhihu.com/p/50099084
- Code and screenshots:
```python
import requests
import pandas as pd
import json
import re
import time


class SimpleStockSpider:
    def __init__(self):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/141.0.0.0 Safari/537.36 Edg/141.0.0.0'
        }

    def get_stock_data(self):
        """Fetch quote data for a fixed set of securities."""
        stocks_data = []
        # Major indices and stocks (secid format: market.code)
        stock_codes = [
            ('1.000001', '上证指数'),
            ('0.399001', '深证成指'),
            ('0.399006', '创业板指'),
            ('1.600036', '招商银行'),
            ('0.000858', '五粮液')
        ]
        for code, name in stock_codes:
            try:
                url = 'http://push2.eastmoney.com/api/qt/stock/get'
                params = {
                    'ut': 'fa5fd1943c7b386f172d6893dbfba10b',
                    'invt': '2',
                    'fltt': '2',
                    # f171 (amplitude) is included here because it is used below
                    'fields': 'f43,f44,f45,f46,f60,f84,f85,f86,f169,f170,f171',
                    'secid': code,
                    'cb': f'jQuery1124_{int(time.time() * 1000)}',
                    '_': str(int(time.time() * 1000))
                }
                response = requests.get(url, params=params, headers=self.headers, timeout=10)
                if response.status_code == 200:
                    content = response.text
                    # The cb parameter makes the API return JSONP; strip the callback wrapper
                    json_str = re.search(r'\{.*\}', content)
                    if json_str:
                        data = json.loads(json_str.group())
                        if data.get('data'):
                            stock = data['data']
                            stock_info = {
                                '股票代码': code.split('.')[-1],
                                '股票名称': name,
                                '最新报价': stock.get('f43', 0) / 100 if stock.get('f43') else 0,
                                '涨跌幅': round(stock.get('f170', 0) / 100, 2),
                                '涨跌额': round(stock.get('f169', 0) / 100, 2),
                                '成交量': f"{stock.get('f84', 0) / 10000:.2f}万",
                                '成交额': f"{stock.get('f86', 0) / 100000000:.2f}亿",
                                '振幅': round(stock.get('f171', 0) / 100, 2),
                                '最高': stock.get('f44', 0) / 100 if stock.get('f44') else 0,
                                '最低': stock.get('f45', 0) / 100 if stock.get('f45') else 0,
                                '今开': stock.get('f46', 0) / 100 if stock.get('f46') else 0,
                                '昨收': stock.get('f60', 0) / 100 if stock.get('f60') else 0
                            }
                            stocks_data.append(stock_info)
                time.sleep(0.5)
            except Exception as e:
                print(f"获取股票 {name} 数据失败: {e}")
                continue
        return stocks_data

    def save_to_csv(self, stocks_data):
        """Save the scraped rows to a CSV file and print them."""
        if stocks_data:
            df = pd.DataFrame(stocks_data)
            df.to_csv('stock_data.csv', index=False, encoding='utf-8-sig')
            print(f"数据已保存到 stock_data.csv,共 {len(stocks_data)} 条记录")
            # Show the data for a quick check
            print("\n爬取的股票数据:")
            print(df.to_string(index=False))
        else:
            print("没有获取到数据")

    def run(self):
        """Run the spider."""
        print("开始爬取股票数据...")
        stocks_data = self.get_stock_data()
        self.save_to_csv(stocks_data)


if __name__ == "__main__":
    spider = SimpleStockSpider()
    spider.run()
```
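The program above queries a few fixed secids one at a time. The technique described earlier, capturing the stock-list request in F12 and trimming its field parameters, applies to the list endpoint as well. A rough sketch; the clist URL, the fs market filter, and the field numbers (f12 code, f14 name, f2 latest price, f3 change %) are taken from my own packet capture and may need adjusting:

```python
import requests

# Sketch of the stock-list request observed in the F12 Network panel.
# Endpoint, fs filter, and field numbers come from my own capture and may differ.
url = 'http://push2.eastmoney.com/api/qt/clist/get'
params = {
    'pn': 1,                    # page number
    'pz': 20,                   # page size
    'fs': 'm:1+t:2',            # market filter (here: Shanghai A shares)
    'fields': 'f12,f14,f2,f3',  # keep only code, name, latest price, change %
}
headers = {'User-Agent': 'Mozilla/5.0'}

data = requests.get(url, params=params, headers=headers, timeout=10).json()
diff = (data.get('data') or {}).get('diff') or []
# 'diff' is usually a list of row dicts; handle the dict-shaped variant as well
rows = diff.values() if isinstance(diff, dict) else diff
for row in rows:
    print(row.get('f12'), row.get('f14'), row.get('f2'), row.get('f3'))
```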
- Reflection:
The tutorial I followed is fairly old, so the URL in its code no longer works. I had to capture packets myself, and it took quite a while to find the right request.
Assignment ③
- Requirement: Scrape the information of all universities in the 2021 main ranking of Chinese universities (https://www.shanghairanking.cn/rankings/bcur/2021), store it in a database, and include a GIF recording of the F12 debugging/analysis process in the blog post.
- Technique: Analyze the requests the site sends and identify the API that returns the data.
- Code and screenshots:
```python
import requests
import json
import pymysql
from sqlalchemy import create_engine, text
import pandas as pd


class UniversityRankingSpider:
    def __init__(self):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/141.0.0.0 Safari/537.36 Edg/141.0.0.0',
            'Referer': 'https://www.shanghairanking.cn/'
        }
        self.api_url = 'https://www.shanghairanking.cn/api/pub/v1/bcur?bcur_type=11&year=2021'
        self.db_engine = None
        self.init_database()

    def init_database(self):
        """Initialize the database connection."""
        try:
            # MySQL configuration -- adjust to your environment
            db_config = {
                'host': 'localhost',
                'port': 3306,
                'user': 'root',
                'password': 'your_password',
                'database': 'university_ranking'
            }
            self.db_engine = create_engine(
                f"mysql+pymysql://{db_config['user']}:{db_config['password']}@{db_config['host']}:{db_config['port']}/{db_config['database']}?charset=utf8mb4"
            )
            # Create the data table
            self.create_table()
            print("数据库连接成功!")
        except Exception as e:
            print(f"数据库连接失败: {e}")
            print("将数据保存到CSV文件")

    def create_table(self):
        """Create the university ranking table."""
        create_table_sql = """
        CREATE TABLE IF NOT EXISTS university_ranking_2021 (
            id INT AUTO_INCREMENT PRIMARY KEY,
            排名 INT NOT NULL,
            学校名称 VARCHAR(100) NOT NULL,
            省市 VARCHAR(50),
            类型 VARCHAR(20),
            总分 DECIMAL(8,2),
            更新时间 TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            UNIQUE KEY unique_rank_school (排名, 学校名称)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
        """
        try:
            with self.db_engine.connect() as conn:
                # Raw SQL strings must be wrapped in text() for SQLAlchemy
                conn.execute(text(create_table_sql))
        except Exception as e:
            print(f"创建表失败: {e}")

    def get_university_data(self):
        """Fetch the ranking data from the API."""
        try:
            print("正在从API获取数据...")
            response = requests.get(self.api_url, headers=self.headers, timeout=30)
            response.raise_for_status()
            # Parse the JSON payload
            data = json.loads(response.text)
            rankings = data['data']['rankings']
            universities = []
            for item in rankings:
                university_info = {
                    '排名': item['rankOverall'],
                    '学校名称': item['univNameCn'],
                    '省市': item['province'],
                    '类型': item['univCategory'],
                    '总分': float(item['score']) if item['score'] else 0
                }
                universities.append(university_info)
            print(f"成功获取 {len(universities)} 所大学的数据")
            return universities
        except Exception as e:
            print(f"获取数据失败: {e}")
            return []

    def save_to_database(self, universities):
        """Save the data to the database (or to CSV as a fallback)."""
        if not universities:
            print("没有数据可保存")
            return False
        try:
            df = pd.DataFrame(universities)
            if self.db_engine:
                # Save to MySQL
                df.to_sql('university_ranking_2021', self.db_engine, if_exists='replace', index=False)
                print(f"成功保存 {len(universities)} 条数据到数据库")
            else:
                # Fall back to a CSV file
                df.to_csv('university_ranking_2021.csv', index=False, encoding='utf-8-sig')
                print(f"成功保存 {len(universities)} 条数据到CSV文件")
            return True
        except Exception as e:
            print(f"保存数据失败: {e}")
            return False

    def display_data(self, universities, num=20):
        """Print the first N rows."""
        if not universities:
            print("没有数据可显示")
            return
        print("\n" + "=" * 80)
        print("2021年中国大学排名(主榜)前{}名".format(num))
        print("=" * 80)
        print(f"{'排名':<8} {'学校名称':<20} {'省市':<10} {'类型':<8} {'总分':<8}")
        print("-" * 80)
        for uni in universities[:num]:
            print(f"{uni['排名']:<8} {uni['学校名称']:<20} {uni['省市']:<10} {uni['类型']:<8} {uni['总分']:<8.1f}")
        print("=" * 80)

    def run(self):
        """Run the spider."""
        print("开始爬取2021年中国大学排名数据...")
        # Fetch the data
        universities = self.get_university_data()
        if universities:
            # Show the first 20 rows
            self.display_data(universities, 20)
            # Save the data
            self.save_to_database(universities)
            print(f"\n爬取完成!共获取 {len(universities)} 所大学的数据")
        else:
            print("未能获取到数据")


# Simplified version (no database configuration required)
class SimpleUniversitySpider:
    def __init__(self):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/141.0.0.0 Safari/537.36 Edg/141.0.0.0'
        }
        self.api_url = 'https://www.shanghairanking.cn/api/pub/v1/bcur?bcur_type=11&year=2021'

    def get_and_save_data(self):
        """Fetch the data and save it to CSV."""
        try:
            print("正在获取2021年中国大学排名数据...")
            response = requests.get(self.api_url, headers=self.headers, timeout=30)
            response.raise_for_status()
            data = json.loads(response.text)
            rankings = data['data']['rankings']
            universities = []
            for item in rankings:
                universities.append([
                    item['rankOverall'],
                    item['univNameCn'],
                    item['province'],
                    item['univCategory'],
                    float(item['score']) if item['score'] else 0
                ])
            # Save to a CSV file
            df = pd.DataFrame(universities, columns=['排名', '学校名称', '省市', '类型', '总分'])
            df.to_csv('中国大学排名2021.csv', index=False, encoding='utf-8-sig')
            # Show the top 20
            print("\n前20名大学排名:")
            print("排名\t学校名称\t\t省市\t类型\t总分")
            print("-" * 60)
            for i in range(min(20, len(universities))):
                uni = universities[i]
                print(f"{uni[0]}\t{uni[1]}\t{uni[2]}\t{uni[3]}\t{uni[4]:.1f}")
            print(f"\n成功获取并保存 {len(universities)} 所大学的数据到 '中国大学排名2021.csv'")
        except Exception as e:
            print(f"获取数据失败: {e}")


# Example run
if __name__ == "__main__":
    spider = SimpleUniversitySpider()
    spider.get_and_save_data()
    # To use the database version instead, uncomment the two lines below
    # spider = UniversityRankingSpider()
    # spider.run()
```
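The database version assumes the university_ranking schema already exists in MySQL. A small one-off setup helper using pymysql; the connection parameters are placeholders that mirror db_config above and must be adapted to your own environment:

```python
import pymysql

# One-off setup: create the schema expected by UniversityRankingSpider.
# host/user/password are placeholders -- use the same values as db_config above.
conn = pymysql.connect(host='localhost', port=3306, user='root',
                       password='your_password', charset='utf8mb4')
try:
    with conn.cursor() as cur:
        cur.execute("CREATE DATABASE IF NOT EXISTS university_ranking DEFAULT CHARACTER SET utf8mb4")
    conn.commit()
finally:
    conn.close()
```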
- Reflection:
This site looks simple but is actually quite challenging. I got a bad feeling as soon as I noticed that the URL does not change when paging (which means the traditional page-by-page crawling approach cannot be used). Inspecting with F12 shows that the data is deployed dynamically by the site's bundled JavaScript: all of the elements sit inside payload.js, and the accompanying script extracts and renders them.
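For reference, that payload.js can also be pulled down directly for inspection. A rough sketch, assuming the ranking page's HTML references a .../payload.js path; the build hash in the path changes between deployments, so the regex and URL prefix are guesses that may need adjusting:

```python
import re
import requests

page_url = 'https://www.shanghairanking.cn/rankings/bcur/2021'
headers = {'User-Agent': 'Mozilla/5.0'}

html = requests.get(page_url, headers=headers, timeout=30).text
# Look for a payload.js path referenced by the page (assumed site-relative);
# the build hash in the path varies between deployments.
match = re.search(r'[\w\./-]+payload\.js', html)
if match:
    payload_url = 'https://www.shanghairanking.cn' + match.group()
    payload_js = requests.get(payload_url, headers=headers, timeout=30).text
    print(payload_js[:500])  # inspect the start of the script that carries the data
else:
    print("payload.js path not found in the page HTML")
```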