python学习DAY22打卡

作业:

自行学习如何使用 Kaggle 平台,写下使用时的注意事项,并为下述比赛提交代码

Kaggle 泰坦尼克号乘客生还预测

python 复制代码
import warnings
warnings.filterwarnings("ignore") #忽略警告信息
# 数据处理清洗包
import pandas as pd
import numpy as np
import random as rnd
# 可视化包
import seaborn as sns
import matplotlib.pyplot as plt
# 机器学习算法相关包
from sklearn.linear_model import LogisticRegression, Perceptron, SGDClassifier
# Configure matplotlib so Chinese text renders (SimHei is a common Windows
# font) and the minus sign is still displayed correctly.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})

# Load the training and test splits from the working directory.
data_train = pd.read_csv('train.csv')
data_test = pd.read_csv('test.csv')
# `combine` only groups the two frames in a list so later steps can loop over
# both of them; nothing is concatenated here.
combine = [data_train, data_test]

# Report the number of missing values per column in each split.
print(data_train.isna().sum())
print(data_test.isna().sum())

# Drop the Ticket column, which this analysis treats as uninformative, and
# print the shapes before and after as a sanity check.
shapes_before = (data_train.shape, data_test.shape, combine[0].shape, combine[1].shape)
print("Before", *shapes_before)
data_train = data_train.drop(columns=['Ticket'])
data_test = data_test.drop(columns=['Ticket'])
# Rebuild the list so it refers to the new frames rather than the old ones.
combine = [data_train, data_test]
shapes_after = (data_train.shape, data_test.shape, combine[0].shape, combine[1].shape)
print("After", *shapes_after)

# Encode the categorical Sex column as an integer flag in both frames:
# female -> 1, male -> 0.
sex_codes = {'female': 1, 'male': 0}
for dataset in combine:
    encoded = dataset['Sex'].map(sex_codes)
    dataset['Sex'] = encoded.astype(int)
data_test.head()  # notebook-style preview; has no effect when run as a script

# 2 x 3 table of age guesses: rows are Sex (0 or 1), columns are Pclass 1-3.
guess_ages = np.zeros((2, 3))
guess_ages

# For each frame, estimate Age per (Sex, Pclass) group from the group's median
# and use that estimate to fill the missing ages of the same group.
for dataset in combine:
    # Pass 1: compute the guess for each of the six groups.
    for sex in range(2):
        for pclass in range(1, 4):
            in_group = (dataset['Sex'] == sex) & (dataset['Pclass'] == pclass)
            known_ages = dataset.loc[in_group, 'Age'].dropna()
            median_age = known_ages.median()
            # Round the median to the nearest 0.5 years.
            guess_ages[sex, pclass - 1] = int(median_age / 0.5 + 0.5) * 0.5

    # Pass 2: write the guesses into the rows whose Age is missing.
    for sex in range(2):
        for pclass in range(1, 4):
            needs_fill = (
                dataset['Age'].isnull()
                & (dataset['Sex'] == sex)
                & (dataset['Pclass'] == pclass)
            )
            dataset.loc[needs_fill, 'Age'] = guess_ages[sex, pclass - 1]

    # Ages are stored as whole numbers from here on (fractions are truncated).
    dataset['Age'] = dataset['Age'].astype(int)
data_train.head()

# Embarked: only a couple of values are missing (per the original note), so
# fill them with the most frequent port of each split.
# The result is assigned back instead of calling fillna(inplace=True) on the
# selected column: that chained in-place form operates on a temporary under
# pandas Copy-on-Write and leaves the DataFrame unchanged, and the warning
# about it is hidden by the global warnings filter.
data_train['Embarked'] = data_train['Embarked'].fillna(data_train['Embarked'].mode()[0])
data_test['Embarked'] = data_test['Embarked'].fillna(data_test['Embarked'].mode()[0])
print(data_train.isnull().sum())
# Fare: very few values are missing; fill them with the median fare.
data_test['Fare'] = data_test['Fare'].fillna(data_test['Fare'].median())

# Feature engineering applied identically to the training and test frames.
#
# Titles that are too infrequent to stand on their own are grouped as 'Rare'.
# 'Capt', 'Col', 'Don' and 'Major' are included so that every extracted title
# ends up in one of the five categories (Mr, Mrs, Miss, Master, Rare); any
# title left outside those categories would turn into NaN when the titles are
# later mapped to integer codes.
RARE_TITLES = ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major',
               'Rev', 'Sir', 'Jonkheer', 'Dona']
# Alternative spellings folded into the common titles.
TITLE_SYNONYMS = {'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'}

for frame in (data_train, data_test):
    # The raw name has little predictive value by itself, so extract the
    # title: the word that directly precedes a period. A raw string is used
    # so that `\.` is a regex escape rather than an invalid string escape.
    extracted = frame['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
    frame['Title'] = extracted.replace(RARE_TITLES, 'Rare').replace(TITLE_SYNONYMS)

    # Family size (siblings/spouses + parents/children + the passenger) may
    # influence survival.
    frame['FamilySize'] = frame['SibSp'] + frame['Parch'] + 1
    # Flag passengers travelling alone.
    frame['IsAlone'] = (frame['FamilySize'] == 1).astype(int)

    # Deck: first letter of the cabin code, or 'Unknown' when Cabin is missing.
    frame['Deck'] = frame['Cabin'].str[0].fillna('Unknown')

    # Name and Cabin are redundant once Title and Deck have been derived.
    frame.drop(columns=['Name', 'Cabin'], inplace=True)
# List the columns that still hold strings (object dtype).
discrete_features = list(data_train.select_dtypes(include=['object']).columns)
print("离散变量:", discrete_features)  # show the names of the discrete columns

# Sex already holds 1 for female and 0 for male, so give the column a name
# that states this directly.
sex_rename = {'Sex': 'isFemale'}
data_train.rename(columns=sex_rename, inplace=True)
data_test.rename(columns=sex_rename, inplace=True)
print(data_train['isFemale'].value_counts())  # distribution of the renamed column
# One-hot encode the unordered categoricals Embarked and Deck; the first level
# of each is dropped.
dummy_options = dict(columns=['Embarked', 'Deck'], dtype=int, drop_first=True)
data_train = pd.get_dummies(data_train, **dummy_options)
data_test = pd.get_dummies(data_test, **dummy_options)

# Give the test frame exactly the training feature columns, in the same order.
# The label column 'Survived' is excluded; a feature column absent from the
# test frame is created and filled with 0.
feature_columns = [name for name in data_train.columns if name != 'Survived']
data_test = data_test.reindex(columns=feature_columns, fill_value=0)

print(data_train.head())
# Integer code for each title category: Mr=0, Rare=1, Master=2, Miss=3, Mrs=4.
title_mapping = dict(zip(['Mr', 'Rare', 'Master', 'Miss', 'Mrs'], range(5)))

# Replace the title strings by their codes in both frames.
train_titles = data_train['Title']
test_titles = data_test['Title']
data_train['Title'] = train_titles.map(title_mapping)
data_test['Title'] = test_titles.map(title_mapping)
print(data_train['Title'].value_counts())  # distribution of the encoded titles
# Standardize Age and Fare (zero mean, unit variance).
from sklearn.preprocessing import StandardScaler, MinMaxScaler
numeric_cols = ['Age', 'Fare']
scaler = StandardScaler()
# Fit the scaler on the training frame only ...
data_train[numeric_cols] = scaler.fit_transform(data_train[numeric_cols])
# ... and apply the same fitted transformation to the test frame, so both
# feature matrices are on the same scale when a model trained on one is used
# to predict the other.
data_test[numeric_cols] = scaler.transform(data_test[numeric_cols])

print(data_train.head())  # preview the first rows
# DataFrame.info() prints its report itself and returns None, so it is called
# directly rather than wrapped in print().
data_train.info()
 
# Hold out part of the labelled data for evaluation.
from sklearn.model_selection import train_test_split
y = data_train['Survived']                  # label
X = data_train.drop(columns=['Survived'])   # features: every column but the label
# 80% of the rows go to training, 20% to the held-out set; the seed is fixed
# so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# --- 1. Random forest with default hyper-parameters ---
from sklearn.ensemble import RandomForestClassifier  # random forest classifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score  # classification metrics
from sklearn.metrics import classification_report, confusion_matrix  # report and confusion matrix
import warnings  # used to silence warnings
warnings.filterwarnings("ignore")  # ignore all warnings
import time
print("--- 1. 默认参数随机森林 (训练集 -> 测试集) ---")
start_time = time.time()  # start of the timed section
# Train on the training split, then predict the held-out split.
rf_model = RandomForestClassifier(random_state=42).fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)
elapsed = time.time() - start_time  # wall-clock seconds for fit + predict
print(f"训练与预测耗时: {elapsed:.4f} 秒")

# Evaluate the predictions on the held-out split.
report = classification_report(y_test, rf_pred)
matrix = confusion_matrix(y_test, rf_pred)
print("\n默认随机森林 在测试集上的分类报告:")
print(report)
print("默认随机森林 在测试集上的混淆矩阵:")
print(matrix)

@浙大疏锦行

相关推荐
西岸行者5 天前
学习笔记:SKILLS 能帮助更好的vibe coding
笔记·学习
悠哉悠哉愿意5 天前
【单片机学习笔记】串口、超声波、NE555的同时使用
笔记·单片机·学习
别催小唐敲代码5 天前
嵌入式学习路线
学习
毛小茛5 天前
计算机系统概论——校验码
学习
babe小鑫5 天前
大专经济信息管理专业学习数据分析的必要性
学习·数据挖掘·数据分析
winfreedoms5 天前
ROS2知识大白话
笔记·学习·ros2
在这habit之下5 天前
Linux Virtual Server(LVS)学习总结
linux·学习·lvs
我想我不够好。5 天前
2026.2.25监控学习
学习
im_AMBER5 天前
Leetcode 127 删除有序数组中的重复项 | 删除有序数组中的重复项 II
数据结构·学习·算法·leetcode
CodeJourney_J5 天前
从“Hello World“ 开始 C++
c语言·c++·学习