Scraping Chinese University Rankings
Scrape the Chinese university ranking table from the Gaosan site, including school name, total score, national rank, star rating, and institution tier. Save the scraped data to a CSV file.
URL: http://www.bspider.top/gaosan/
python
import requests
import csv
from bs4 import BeautifulSoup

def get_html(url, time=10):
    """Fetch the HTML content of a page."""
    try:
        r = requests.get(url, timeout=time)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except Exception as error:
        print("Failed to fetch page:", error)
        return None

def parser(html):
    """Parse the university ranking table."""
    soup = BeautifulSoup(html, "lxml")
    out_list = []
    # Iterate over table rows; rows without enough cells (e.g. a header row) are skipped
    for row in soup.select("table>tbody>tr"):
        tds = row.select("td")
        if len(tds) < 6:
            continue
        # Extract the key fields: school name, total score, national rank, star rating, tier
        row_data = [
            tds[1].text.strip(),  # school name
            tds[2].text.strip(),  # total score
            tds[3].text.strip(),  # national rank
            tds[4].text.strip(),  # star rating
            tds[5].text.strip()   # institution tier
        ]
        out_list.append(row_data)
    return out_list

def save_csv(data, file_path):
    """Save the data to a CSV file."""
    with open(file_path, "w", newline='', encoding="utf-8") as f:
        writer = csv.writer(f)
        # Write a header row so pandas can address columns such as "总分" later
        writer.writerow(["学校名称", "总分", "全国排名", "星级排名", "办学层次"])
        writer.writerows(data)
    print(f"Data saved to {file_path}")

if __name__ == "__main__":
    # Target URL (university ranking page)
    url = "http://www.bspider.top/gaosan/"
    # Full scraping pipeline: fetch, parse, save
    html_content = get_html(url)
    if html_content:
        parsed_data = parser(html_content)
        save_csv(parsed_data, "school.csv")
Inspecting school.csv shows that some cells in the "总分" (total score) column are empty.
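Before cleaning, it is worth confirming which columns are affected; a minimal check with pandas, assuming the header row written by save_csv above:
python
import pandas as pd

df = pd.read_csv("school.csv")
# Count missing values per column; "总分" is the affected one here
print(df.isna().sum())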

Data Preprocessing
Dropping rows that contain empty fields
python
import pandas as pd

df = pd.read_csv("school.csv")
new_df = df.dropna()  # drop every row that contains at least one empty field
print(new_df.to_string())
Handling missing data: replacing empty fields with a specified value
python
import pandas as pd

df = pd.read_csv("school.csv")
df.fillna("暂无分数信息", inplace=True)  # replace every empty field with a placeholder string
print(df.to_string())

Replacing empty cells with the mean
python
import pandas as pd

df = pd.read_csv("school.csv")
x = df["总分"].mean()
print("Mean of 总分:")
print(x)
df["总分"] = df["总分"].fillna(x)  # assign back instead of chained inplace fillna, which newer pandas rejects
print(df.to_string())

Replacing empty cells with the median
python
import pandas as pd

df = pd.read_csv("school.csv")
x = df["总分"].median()
print("Median of 总分:")
print(x)
df["总分"] = df["总分"].fillna(x)  # assign back instead of chained inplace fillna, which newer pandas rejects
print(df.to_string())

Matplotlib
By star rating, there are 8 eight-star schools, 16 seven-star schools, 36 six-star schools, 59 five-star schools, 103 four-star schools, 190 three-star schools, 148 two-star schools, and 260 one-star schools.
Bar chart
python
import matplotlib.pyplot as plt
import numpy as np

plt.rcParams["font.sans-serif"] = ["SimHei"]  # Chinese-capable font, set before plotting
x = np.array(["8星","7星","6星","5星","4星","3星","2星","1星"])
y = np.array([8,16,36,59,103,190,148,260])
plt.title("不同星级的学校个数")
plt.bar(x, y)
plt.show()

To get a horizontal bar chart, simply replace plt.bar(x, y) with plt.barh(x, y), as in the sketch below.
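For reference, the horizontal variant with the same data:
python
import matplotlib.pyplot as plt
import numpy as np

plt.rcParams["font.sans-serif"] = ["SimHei"]  # Chinese-capable font
x = np.array(["8星","7星","6星","5星","4星","3星","2星","1星"])
y = np.array([8,16,36,59,103,190,148,260])
plt.title("不同星级的学校个数")
plt.barh(x, y)  # horizontal bars: categories on the y-axis, counts on the x-axis
plt.show()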

Pie chart
python
import matplotlib.pyplot as plt
import numpy as np

plt.rcParams["font.sans-serif"] = ["SimHei"]
# Percentage share of each star level (out of the 820 schools), not raw counts
y = np.array([1, 2, 4.5, 7.2, 12.5, 23.1, 18, 31.7])
plt.pie(y, labels=["8星","7星","6星","5星","4星","3星","2星","1星"])
plt.title("不同星级学校占比")
plt.show()

Exercise
Chinese-language site
https://www.ryjiaoyu.com/tag/details/
Scraping the data
python
import requests
import csv
from lxml import etree
import pymysql
from pymysql.constants import CLIENT  # needed for the client_flag below

# Save data to MySQL
def save_mysql(sql, val, **dbinfo):
    connect = None
    cursor = None
    try:
        # Allow multiple statements per connection
        dbinfo.update({"client_flag": CLIENT.MULTI_STATEMENTS})
        connect = pymysql.connect(**dbinfo)
        cursor = connect.cursor()
        cursor.executemany(sql, val)  # batch-execute the INSERT
        connect.commit()
        print(f"Saved {len(val)} rows to the database")
    except Exception as err:
        if connect: connect.rollback()
        print("Database save error:", err)
    finally:
        if cursor: cursor.close()
        if connect: connect.close()

def get_html(url):
    try:
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
        }
        r = requests.get(url, headers=headers, timeout=10)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except Exception as error:
        print(f"Request failed: {error}")
        return None

def safe_xpath(row, xpath, default=""):
    """Safely extract the first XPath match, falling back to a default."""
    elements = row.xpath(xpath)
    return elements[0].strip() if elements else default

result_list = []  # accumulates scraped rows across pages

def parse(html):
    try:
        doc = etree.HTML(html)
        # One matching <div> per book entry
        for row in doc.xpath('//*[@id="tag-book"]/div/ul/li/div[2]'):
            name = safe_xpath(row, 'h4/a/text()', "未知书名")
            url = safe_xpath(row, 'h4/a/@href', "")
            author = safe_xpath(row, 'div/span/text()', "未知作者").strip()
            price = safe_xpath(row, 'span/span/text()', "未知价格")
            item = [name, url, author, price]
            result_list.append(item)
    except Exception as e:
        print(f"Parse error: {e}")
    return result_list

def save_csv(data, path):
    with open(path, "w", newline='', encoding='utf-8-sig') as f:
        writer = csv.writer(f)
        writer.writerow(["书名", "链接", "作者", "价格"])
        writer.writerows(data)
    print(f"CSV saved: {len(data)} rows")

if __name__ == "__main__":
    base_url = "https://www.ryjiaoyu.com/tag/details/7?page={}"
    page = 1
    max_data = 500  # target number of rows
    while len(result_list) < max_data:
        url = base_url.format(page)
        print(f"Fetching page {page}: {url}")
        html = get_html(url)
        if not html:
            print(f"Page {page} failed; stopping")
            break
        before = len(result_list)
        parse(html)
        if len(result_list) == before:
            print("No new rows on this page; stopping")  # avoid looping forever past the last page
            break
        print(f"{len(result_list)} rows collected so far")
        page += 1
    # Keep the first max_data (500) rows and save them to CSV
    save_data = result_list[:max_data]
    save_csv(save_data, "./图书.csv")
    # Save to the database
    try:
        params = {
            "host": "127.0.0.1",
            "user": "root",        # replace with your username
            "password": "123456",  # replace with your password
            "db": "text",          # make sure this database exists
            "charset": "utf8mb4",
        }
        # Create a table matching the scraped fields
        create_table_sql = """
        CREATE TABLE IF NOT EXISTS zhongwenwang (
            id INT AUTO_INCREMENT PRIMARY KEY,
            BookName VARCHAR(255) NOT NULL,
            URL VARCHAR(255),
            Author VARCHAR(100),
            Price VARCHAR(50)
        ) CHARSET=utf8mb4;
        """
        # INSERT statement matching the scraped fields
        sql = """
        INSERT INTO zhongwenwang (BookName, URL, Author, Price)
        VALUES (%s, %s, %s, %s)
        """
        # Connect and create the table
        connection = pymysql.connect(**params)
        with connection:
            with connection.cursor() as cursor:
                cursor.execute(create_table_sql)
            connection.commit()
            print("Table created/verified")
        # Save the rows
        save_mysql(sql, save_data, **params)
    except Exception as e:
        print("Database operation failed:", e)
I scraped 500 records, then filtered and summarized them.
python
import pandas as pd
from collections import defaultdict
import chardet  # required for the automatic encoding detection below

# 1. Read the CSV file (with robust encoding handling)
try:
    # Option 1: detect the encoding automatically with chardet
    def detect_encoding(file_path):
        with open(file_path, 'rb') as f:
            result = chardet.detect(f.read(10000))  # the first 10000 bytes are enough to detect
        return result['encoding']

    file_encoding = detect_encoding('图书.csv')
    print(f"Detected file encoding: {file_encoding}")
    # Read the file with the detected encoding
    df = pd.read_csv('图书.csv', encoding=file_encoding)
except (FileNotFoundError, UnicodeDecodeError):
    try:
        # Option 2: fall back to a common Chinese encoding
        df = pd.read_csv('图书.csv', encoding='GB18030')  # superset of GBK, more forgiving
    except Exception:
        raise SystemExit("Failed to read the file; check the path and encoding")

# 2. Preprocessing: turn price strings into numbers
def extract_price(price_str):
    try:
        # Strip the ¥ symbol and thousands separators (e.g. ¥1,200)
        clean_str = price_str.replace('¥', '').replace(',', '')
        return float(clean_str)
    except Exception:
        return None

# The price is assumed to be in the 4th column (index 3); adjust if needed
df['价格数值'] = df.iloc[:, 3].apply(extract_price)
df_clean = df.dropna(subset=['价格数值']).copy()  # independent copy avoids SettingWithCopyWarning

# 3. Define the price bins
bins = [30, 40, 50, 60, 70]
labels = ['30-40元', '40-50元', '50-60元', '60-70元']

# 4. Bin the prices
df_clean.loc[:, '价格区间'] = pd.cut(
    df_clean['价格数值'],
    bins=bins,
    labels=labels,
    right=False  # half-open intervals such as [30, 40)
)
counts = df_clean['价格区间'].value_counts().sort_index()  # sort_index keeps the bins in order

# 5. Print the price statistics
print("\nPrice range statistics:")
print("*" * 30)
for interval, count in counts.items():
    print(f"│ {interval.ljust(8)} │ {str(count).center(6)} │")
print("*" * 30)

# 6. Edition statistics
target_versions = [
    "微课版", "慕课版", "大赛版", "全彩慕课版",
    "实践篇", "AIGC版", "附微课", "理论篇", "其它"
]
version_counts = defaultdict(int)
version_counts["其它"] = 0  # make sure the fallback category always appears

# The book title is assumed to be in the 1st column (index 0); adjust if needed
for index, row in df.iterrows():
    book_name = str(row.iloc[0]).strip()
    matched = False
    # Match longer keywords first so "全彩慕课版" is not counted under "慕课版"
    for version in sorted(target_versions, key=len, reverse=True):
        if version in book_name:
            version_counts[version] += 1
            matched = True
            break
    if not matched:
        version_counts["其它"] += 1

# 7. Print the edition statistics
print("\nBook edition statistics:")
print("*" * 30)
for version in target_versions:  # print in the order defined above
    count = version_counts[version]
    print(f"│ {version.ljust(8)} │ {str(count).center(6)} │")
print("*" * 30)

# 8. Save the results (optional; the output filename here is illustrative)
result_df = pd.DataFrame({
    "类型": ["价格区间"] * len(counts) + ["图书版本"] * len(version_counts),
    "分类": list(counts.index) + list(version_counts.keys()),
    "数量": list(counts.values) + list(version_counts.values())
})
result_df.to_csv("统计结果.csv", index=False, encoding="utf-8-sig")

Handling missing data: replacing empty fields with a specified value
python
import pandas as pd

df = pd.read_csv("图书.csv")
df.fillna("暂无信息", inplace=True)  # replace every empty field with a placeholder string
print(df.to_string())

Bar chart
python
import matplotlib.pyplot as plt
import numpy as np

plt.rcParams["font.sans-serif"] = ["SimHei"]  # Chinese-capable font, set once
plt.figure(figsize=(12, 5))

# Left subplot: price distribution
plt.subplot(1, 2, 1)
x = np.array(["30-40元","40-50元","50-60元","60-70元"])
y = np.array([7,118,225,96])
plt.title("价格统计")
plt.bar(x, y, color='m')

# Right subplot: edition counts
plt.subplot(1, 2, 2)
x = np.array(["微课版","慕课版","大赛版","全彩慕课版","实践篇","AIGC版","附微课","理论篇","其它"])
y = np.array([283,62,2,16,1,16,11,1,108])
plt.title("版本统计")
plt.xticks(rotation=45)  # nine labels overlap without rotation
plt.bar(x, y, color='g')
plt.tight_layout()
plt.show()

Pie chart
python
import matplotlib.pyplot as plt
import numpy as np

# Set a Chinese-capable font globally (only needs to be done once)
plt.rcParams["font.sans-serif"] = ["SimHei"]

# One row, two columns of subplots on a larger canvas
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))

# --- First pie: price range distribution ---
y_price = np.array([7, 118, 225, 96])
labels_price = ["30-40元", "40-50元", "50-60元", "60-70元"]
colors_price = ["#d5695d", "#5d8ca8", "#65a479", "#a564c9"]
ax1.pie(
    y_price,
    labels=labels_price,
    colors=colors_price,
    explode=(0.1, 0, 0, 0),  # pull out the 30-40元 slice
    autopct='%.1f%%',        # percentages with one decimal place
    shadow=True              # drop shadow for depth
)
ax1.set_title("价格区间分布", fontsize=14)
ax1.axis('equal')  # keep the pie circular

# --- Second pie: edition distribution ---
y_version = np.array([283, 62, 2, 16, 1, 16, 11, 1, 108])
labels_version = ["微课版", "慕课版", "大赛版", "全彩慕课版", "实践篇", "AIGC版", "附微课", "理论篇", "其它"]
# Generate nine distinct colors from the tab20 colormap
colors_version = plt.cm.tab20(np.linspace(0, 1, len(labels_version)))
ax2.pie(
    y_version,
    labels=labels_version,
    colors=colors_version,
    explode=(0, 0.1, 0, 0, 0, 0, 0, 0, 0),           # pull out the 慕课版 slice
    autopct=lambda p: f'{p:.1f}%' if p > 3 else '',  # only label slices above 3%
    pctdistance=0.85,            # move percentage labels inward
    textprops={'fontsize': 9}    # smaller text to avoid overlap
)
ax2.set_title("图书版本分布", fontsize=14)
ax2.axis('equal')

# Adjust spacing and display
plt.tight_layout(pad=3.0)
plt.show()

Box plot
python
import pandas as pd
import matplotlib.pyplot as plt

# Chinese font setup
plt.rcParams["font.family"] = ["SimHei", "WenQuanYi Micro Hei", "Heiti TC"]
plt.rcParams["axes.unicode_minus"] = False  # render minus signs correctly

# Read and preprocess the data
try:
    df = pd.read_csv("图书.csv")
    # Price cleanup: keep digits, sign, and decimal point, then convert to numeric
    df['价格'] = df['价格'].astype(str).str.replace(r'[^\d.-]', '', regex=True)
    df['价格'] = pd.to_numeric(df['价格'], errors='coerce')
    df = df.dropna(subset=['价格'])  # drop rows with no usable price

    # Outlier detection against a fixed price range
    def detect_outliers(series):
        # Treat 50-70 yuan as the normal price range
        lower_bound = 50
        upper_bound = 70
        bounds = (lower_bound, upper_bound)
        # Everything outside the range counts as an outlier
        outliers = series[(series < lower_bound) | (series > upper_bound)]
        return outliers, bounds

    # Run the detection
    if len(df) > 0:  # any data at all is enough
        outliers, (lower, upper) = detect_outliers(df['价格'])
        print(f"Normal price range: {lower:.2f} - {upper:.2f} yuan")
        print(f"Number of outliers: {len(outliers)}")
        if not outliers.empty:
            print("Outlier details:")
            print(df.loc[outliers.index, ['书名', '价格']].to_string(index=False))

        # Draw the box plot
        plt.figure(figsize=(10, 6))
        box = plt.boxplot(df['价格'], showmeans=True, patch_artist=True)
        # Style the box
        for patch in box['boxes']:
            patch.set_facecolor('#E0E0E0')
        # Reference lines for the fixed range
        plt.axhline(y=lower, color='r', linestyle='-.', label='正常范围下限')
        plt.axhline(y=upper, color='r', linestyle='--', label='正常范围上限')
        # Labels and title
        plt.ylabel('价格(元)', fontsize=12)
        plt.title('图书价格分布盒须图', fontsize=14)
        plt.legend()
        plt.grid(axis='y', linestyle='--', alpha=0.7)
        plt.tight_layout()
        plt.show()
    else:
        print("No valid data to analyze")
except FileNotFoundError:
    print("Error: '图书.csv' not found; check the file path")
except Exception as e:
    print(f"Error while processing the data: {str(e)}")


The isolated dots in the plot are the outliers.
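Those dots follow matplotlib's conventional 1.5 × IQR whisker rule rather than the fixed 50-70 range used above; they can be cross-checked in pandas with a minimal sketch, assuming the same 图书.csv and price cleanup:
python
import pandas as pd

df = pd.read_csv("图书.csv")
# Same cleanup as the box-plot script: strip non-numeric characters, coerce to numbers
price = pd.to_numeric(
    df["价格"].astype(str).str.replace(r"[^\d.-]", "", regex=True),
    errors="coerce"
).dropna()
# The 1.5*IQR rule: points beyond [Q1 - 1.5*IQR, Q3 + 1.5*IQR] are plotted as dots
q1, q3 = price.quantile(0.25), price.quantile(0.75)
iqr = q3 - q1
outliers = price[(price < q1 - 1.5 * iqr) | (price > q3 + 1.5 * iqr)]
print(f"IQR-rule outliers: {len(outliers)}")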