# 泰坦尼克号生还预测比赛代码示例 (Titanic survival prediction competition: example code)
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# 1. Load the datasets: the training and test CSVs are read into two DataFrames.
train_data, test_data = (
    pd.read_csv('/kaggle/input/titanic/train.csv'),
    pd.read_csv('/kaggle/input/titanic/test.csv'),
)
# 2. Data preprocessing
def preprocess(data):
    """Impute missing values and build the feature matrix for the model.

    Missing ``Age`` and ``Fare`` values are replaced with that column's
    median, and missing ``Embarked`` values with the column's first mode,
    all computed from ``data`` itself.  The filled columns are written back
    into ``data``, so the caller's frame is modified.

    Args:
        data: DataFrame containing at least the columns ``Pclass``, ``Sex``,
            ``Age``, ``SibSp``, ``Parch``, ``Fare`` and ``Embarked``.

    Returns:
        A new DataFrame with the selected feature columns, where
        ``pd.get_dummies`` has one-hot encoded the non-numeric ones.
    """
    # Fill missing values.  The filled column is assigned back instead of
    # calling ``fillna(inplace=True)`` on ``data[col]``: that chained form
    # acts on a temporary object under pandas Copy-on-Write and would leave
    # the NaNs in place.
    data['Age'] = data['Age'].fillna(data['Age'].median())

    # ``mode()`` is empty when every value is missing; skip the fill then
    # rather than failing on the lookup of its first element.
    embarked_modes = data['Embarked'].mode()
    if not embarked_modes.empty:
        data['Embarked'] = data['Embarked'].fillna(embarked_modes.iloc[0])

    data['Fare'] = data['Fare'].fillna(data['Fare'].median())

    # Select the features and one-hot encode the categorical ones.
    features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
    X = pd.get_dummies(data[features])
    return X
X_train = preprocess(train_data)
y_train = train_data['Survived']
X_test = preprocess(test_data)
# The two frames are one-hot encoded independently, so a category that is
# absent from one of them would yield different columns.  Align the test
# matrix to the training columns (missing indicator columns become 0, extra
# ones are dropped) so the model sees the same features it was fitted on.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)
# 3. Train the model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
# 4. Predict and write the submission file
predictions = model.predict(X_test)
submission = pd.DataFrame({
    'PassengerId': test_data['PassengerId'],
    'Survived': predictions,
})
submission.to_csv('submission.csv', index=False)