一、题目
python
import pandas as pd
# 题目1:读取data_A.csv文件,输出前3行
# 结果示例:
# id age sex
# 0 1 60 1
# 1 2 38 1
# 2 3 45 0
def project1():
    """Read ``data_A.csv`` and return its first three rows.

    Returns:
        pandas.DataFrame: the first three rows of ``data_A.csv`` (fewer
        when the file holds fewer than three data rows).
    """
    # Step 1: load the file from the current working directory.
    df = pd.read_csv('data_A.csv')
    # Steps 2-3: the three-row preview is the result itself. It must not be
    # overwritten with a placeholder before returning.
    return df.head(3)
# 题目2:针对题目1的数据检查缺失值,分性别使用众数进行填充,输出所有年龄的均值,保留2位小数
# 结果示例: 52.66
def project2():
    """Fill missing ages with the per-sex mode and return the mean age.

    Reads ``data_A.csv``, replaces every missing ``age`` with the most
    frequent age among rows of the same ``sex``, then averages the column.

    Returns:
        The mean of the ``age`` column after filling, rounded to 2 decimals.
    """
    # Step 1: load the data.
    df = pd.read_csv('data_A.csv')

    # Step 2: fill missing ages group by group. Rows whose sex is itself
    # missing are skipped by dropna(): they can never match an equality mask.
    for gender in df['sex'].dropna().unique():
        same_sex = df['sex'] == gender
        # A grouped Series has no mode(), so slice the group out first.
        modes = df.loc[same_sex, 'age'].mode()
        if modes.empty:
            # No known age in this group: nothing to fill with, leave as NaN.
            continue
        # mode() returns its values sorted; take the smallest on ties.
        df.loc[same_sex & df['age'].isna(), 'age'] = modes.iloc[0]

    # Steps 3-4: mean() ignores any ages that are still missing.
    return round(df['age'].mean(), 2)
# 题目3:读取data_A.csv(题目2处理后的数据)和data_B.csv两个文件,按id列合并数据,
# 将trestbps、chol这2列带单位的字符串数据去除单位后转换为int类型,输出转换后所有列的数据类型
# 输出示例:
# id int64
# age float64
# sex int64
# cp int64
def project3():
    """Merge ``data_A.csv`` with ``data_B.csv`` and return the column dtypes.

    ``data_A.csv`` gets the same per-sex mode fill for missing ages as
    question 2. In ``data_B.csv`` the ``trestbps`` and ``chol`` columns are
    text carrying a unit suffix; the unit is removed and the remainder is
    converted to int. The two frames are then joined on ``id``.

    Returns:
        pandas.Series: dtype of every column of the merged frame, indexed by
        column name.
    """
    # Steps 1-2: load data_A and fill missing ages with the per-sex mode.
    df_a = pd.read_csv('data_A.csv')
    for gender in df_a['sex'].dropna().unique():
        same_sex = df_a['sex'] == gender
        modes = df_a.loc[same_sex, 'age'].mode()
        if modes.empty:
            # No known age for this sex: nothing to fill with.
            continue
        df_a.loc[same_sex & df_a['age'].isna(), 'age'] = modes.iloc[0]

    # Step 3: load data_B.
    df_b = pd.read_csv('data_B.csv')

    # Steps 4-5: drop the unit text and convert to int. regex=False makes
    # the literal match explicit (the default differs between pandas
    # versions); strip() removes any blank left between number and unit.
    for column, unit in (('trestbps', 'mm Hg'), ('chol', 'mg/dl')):
        digits = df_b[column].str.replace(unit, '', regex=False).str.strip()
        df_b[column] = digits.astype(int)

    # Step 6: join on id (pd.merge defaults to an inner join).
    merged_df = pd.merge(df_a, df_b, on='id')

    # Steps 7-8: the dtypes Series is the result.
    return merged_df.dtypes
# 题目4:输出重复数据数量,和清理重复数据后的数据量
def project4():
    """Count duplicated rows in the merged data and the rows left after cleanup.

    Builds the same merged frame as question 3 (per-sex mode fill on
    ``data_A.csv``, unit removal on ``data_B.csv``, join on ``id``).

    Returns:
        tuple[int, int]: ``(duplicate_count, cleaned_row_count)`` where
        ``duplicate_count`` is the number of rows that repeat an earlier row
        in full, and ``cleaned_row_count`` is the number of rows remaining
        once those repeats are dropped.
    """
    # Steps 1-2: load data_A and fill missing ages with the per-sex mode.
    df_a = pd.read_csv('data_A.csv')
    for gender in df_a['sex'].dropna().unique():
        same_sex = df_a['sex'] == gender
        modes = df_a.loc[same_sex, 'age'].mode()
        if modes.empty:
            # No known age for this sex: nothing to fill with.
            continue
        df_a.loc[same_sex & df_a['age'].isna(), 'age'] = modes.iloc[0]

    # Steps 3-5: load data_B and turn the unit-suffixed text columns into
    # ints (literal replace, then strip any blank left before the unit).
    df_b = pd.read_csv('data_B.csv')
    for column, unit in (('trestbps', 'mm Hg'), ('chol', 'mg/dl')):
        digits = df_b[column].str.replace(unit, '', regex=False).str.strip()
        df_b[column] = digits.astype(int)

    # Step 6: join on id.
    merged_df = pd.merge(df_a, df_b, on='id')

    # Step 7: duplicated() marks every repeat after the first occurrence.
    # int() turns the numpy scalar into a plain int, matching len() below.
    duplicate_count = int(merged_df.duplicated().sum())

    # Step 8: keep the first occurrence of each row.
    cleaned_df = merged_df.drop_duplicates()

    # Step 9: duplicate count first, cleaned row count second.
    return duplicate_count, len(cleaned_df)
# 题目5:在题目4数据的基础上计算各字段与结果字段condition的相关性系数,输出相关性系数绝对值最大的列名
def project5():
    """Return the column most strongly correlated with ``condition``.

    Builds the de-duplicated merged frame of question 4, computes the
    correlation of every numeric column with ``condition`` and picks the
    column whose coefficient has the largest absolute value.

    Returns:
        The name of the column (other than ``condition`` itself) with the
        largest absolute correlation with ``condition``.
    """
    # Steps 1-2: load data_A and fill missing ages with the per-sex mode.
    df_a = pd.read_csv('data_A.csv')
    for gender in df_a['sex'].dropna().unique():
        same_sex = df_a['sex'] == gender
        modes = df_a.loc[same_sex, 'age'].mode()
        if modes.empty:
            # No known age for this sex: nothing to fill with.
            continue
        df_a.loc[same_sex & df_a['age'].isna(), 'age'] = modes.iloc[0]

    # Steps 3-5: load data_B and turn the unit-suffixed text columns into
    # ints (literal replace, then strip any blank left before the unit).
    df_b = pd.read_csv('data_B.csv')
    for column, unit in (('trestbps', 'mm Hg'), ('chol', 'mg/dl')):
        digits = df_b[column].str.replace(unit, '', regex=False).str.strip()
        df_b[column] = digits.astype(int)

    # Steps 6-7: join on id, then drop fully repeated rows.
    merged_df = pd.merge(df_a, df_b, on='id').drop_duplicates()

    # Step 8: pairwise correlation between all numeric columns
    # (DataFrame.corr uses Pearson by default; values lie in [-1, 1]).
    correlation_matrix = merged_df.corr(numeric_only=True)

    # Keep only the "condition" column of the matrix, compare by strength
    # rather than sign (abs), and drop condition's correlation with itself,
    # which is always 1 and carries no information.
    # NOTE(review): the id column takes part in the ranking too - confirm
    # that is intended.
    condition_corr = correlation_matrix['condition'].abs().drop('condition')

    # Steps 9-10: idxmax() yields the label of the largest value.
    return condition_corr.idxmax()
if __name__ == "__main__":
    # Question 1: preview of data_A.csv.
    print("\n题目1结果:")
    print(project1())

    # Question 2: mean age after filling missing values.
    avg_age = project2()
    print("\n题目2结果:")
    print(f"所有年龄的均值:{avg_age}")

    # Question 3: column dtypes of the merged data.
    df_type = project3()
    print("\n题目3结果:")
    print(df_type)

    # Question 4: duplicate count and row count after de-duplication.
    duplicate_count, clean_data_count = project4()
    print("\n题目4结果:")
    print(f"重复数据数量:{duplicate_count}")
    print(f"清理重复数据后的数据量:{clean_data_count}")

    # Question 5: column most strongly correlated with condition.
    max_corr_col = project5()
    print("\n题目5结果:")
    print(f"与condition相关性系数最大的列名:{max_corr_col}")
二、复盘(知识补充)