目录
[1. 读取数据 + 清洗数据](#1. 读取数据 + 清洗数据)
[2. 选择缺失值填充方法](#2. 选择缺失值填充方法)
[3. 样本均衡 + 保存数据集](#3. 样本均衡 + 保存数据集)
[四、6 种缺失值填充方法完整实现(核心工具函数)](#四、6 种缺失值填充方法完整实现(核心工具函数))
[1. CCA 完整案例删除法(直接删除含缺失值的行)](#1. CCA 完整案例删除法(直接删除含缺失值的行))
[2. 均值填充(按类别分别填充)](#2. 均值填充(按类别分别填充))
[3. 中位数填充](#3. 中位数填充)
[4. 众数填充](#4. 众数填充)
[5. 线性回归填充(模型预测填充)](#5. 线性回归填充(模型预测填充))
[6. 随机森林填充(效果最优,推荐使用)](#6. 随机森林填充(效果最优,推荐使用))
[五、6 种填充方法对比总结](#五、6 种填充方法对比总结)
一、项目背景
在实际数据集(如矿物数据、医疗数据、金融数据)中,缺失值是非常常见的问题,直接影响模型训练效果。本项目基于矿物数据集,实现 6 种经典缺失值填充方案,严格遵循机器学习准则:
- ✅ 测试集必须使用训练集的统计量 / 模型填充
- ✅ 训练集、测试集分离处理,杜绝数据泄露
- ✅ 填充后输出标准化训练 / 测试集 Excel
- ✅ 支持类别分别填充(按矿物类型)
二、环境准备
python
运行
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
三、数据集加载与预处理(核心主程序)
1. 读取数据 + 清洗数据
python
运行
# Load the raw mineral data set.
data = pd.read_excel("矿物数据.xlsx")
# Drop class-E samples. The index is reset so that the feature frame and the
# label series built below share the same 0..n-1 row labels.
data = data[data['矿物类型'] != 'E'].reset_index(drop=True)
# Report the number of missing values per column (counted before any
# non-numeric cells are coerced to NaN further down).
null_num = data.isnull().sum()
print("各列缺失值数量:\n", null_num)
# Separate the features from the label column; the serial-number column is
# not a feature.
X_whole = data.drop(['矿物类型', '序号'], axis=1)
y_whole = data['矿物类型']
# Label encoding: A→0, B→1, C→2, D→3 (any other label raises KeyError).
label_dict = {"A": 0, "B": 1, "C": 2, "D": 3}
encoded_labels = [label_dict[label] for label in y_whole]
y_whole = pd.Series(encoded_labels, name='矿物类型')
# Coerce every feature to a numeric dtype; unparseable cells become NaN.
for column_name in X_whole.columns:
    X_whole[column_name] = pd.to_numeric(X_whole[column_name], errors='coerce')
# Split into train / test (7:3) BEFORE scaling, so the scaler statistics can
# be learned from the training rows only. The row membership of the split
# depends only on the row count and random_state.
x_train_raw, x_test_raw, y_train_w, y_test_w = train_test_split(
    X_whole, y_whole, test_size=0.3, random_state=50000
)
# Z-score standardisation: fit on the training rows only (no test-set
# leakage), then apply the same transform to every row.
scaler = StandardScaler()
scaler.fit(x_train_raw)
X_whole_Z = scaler.transform(X_whole)
X_whole = pd.DataFrame(X_whole_Z, columns=X_whole.columns)
# Pick the standardised rows that belong to each side of the split.
x_train_w = X_whole.loc[x_train_raw.index]
x_test_w = X_whole.loc[x_test_raw.index]
2. 选择缺失值填充方法
python
运行
# ========== Choose the imputation method (uncomment one train/test pair) ==========
# Every pair fills the training set first and then fills the test set, passing
# the already-filled training data to the test-side function.
# NOTE(review): the *_fill helpers must already be defined when these lines
# run — confirm the definition order in the assembled script.
# x_train_fill,y_train_fill=cca_train_fill(x_train_w,y_train_w)
# x_test_fill,y_test_fill=cca_test_fill(x_train_fill,y_train_fill,x_test_w,y_test_w)
# x_train_fill,y_train_fill=median_train_fill(x_train_w,y_train_w)
# x_test_fill,y_test_fill=median_test_fill(x_train_fill,y_train_fill,x_test_w,y_test_w)
# x_train_fill,y_train_fill=mean_train_fill(x_train_w,y_train_w)
# x_test_fill,y_test_fill=mean_test_fill(x_train_fill,y_train_fill,x_test_w,y_test_w)
# x_train_fill,y_train_fill=mode_train_fill(x_train_w,y_train_w)
# x_test_fill,y_test_fill=mode_test_fill(x_train_fill,y_train_fill,x_test_w,y_test_w)
# x_train_fill,y_train_fill=lr_train_fill(x_train_w,y_train_w)
# x_test_fill,y_test_fill=lr_test_fill(x_train_fill,y_train_fill,x_test_w,y_test_w)
# Active choice: random-forest imputation.
x_train_fill, y_train_fill = rf_train_fill(x_train_w, y_train_w)
x_test_fill, y_test_fill = rf_test_fill(x_train_fill, y_train_fill, x_test_w, y_test_w)
3. 样本均衡 + 保存数据集
python
运行
# Balance the classes of the filled training set with SMOTE over-sampling.
oversampler = SMOTE(random_state=42, k_neighbors=1)
os_x_train, os_y_train = oversampler.fit_resample(
    x_train_fill,
    y_train_fill,
)
# Assemble the final tables with the label as the first column. Only the
# training table is shuffled (with a fixed seed).
data_train = pd.concat([os_y_train, os_x_train], axis=1)
data_train = data_train.sample(frac=1, random_state=4)
data_test = pd.concat([y_test_fill, x_test_fill], axis=1)
# Write both tables to Excel files without the index column.
data_train.to_excel(r'训练数据集[随机森林填充].xlsx', index=False)
data_test.to_excel(r'测试数据集[随机森林填充].xlsx', index=False)
print("数据集处理完成,已保存Excel!")
四、6 种缺失值填充方法完整实现(核心工具函数)
1. CCA 完整案例删除法(直接删除含缺失值的行)
python
运行
def cca_train_fill(train_data, train_label):
    """Complete-case analysis for the training set.

    Joins features and labels column-wise, renumbers the rows from 0 and
    drops every row that contains at least one missing value.

    Args:
        train_data: Training feature DataFrame.
        train_label: Training label Series named '矿物类型'.

    Returns:
        Tuple ``(features, labels)`` holding only the complete rows; the
        index keeps the post-reset row numbers of the surviving rows.
    """
    merged = pd.concat([train_data, train_label], axis=1)
    complete_rows = merged.reset_index(drop=True).dropna()
    features = complete_rows.drop(columns='矿物类型')
    labels = complete_rows['矿物类型']
    return features, labels
def cca_test_fill(train_data, train_label, test_data, test_label):
    """Complete-case analysis for the test set.

    Joins the test features and labels column-wise, renumbers the rows from 0
    and drops every row that contains at least one missing value.

    Args:
        train_data: Unused; accepted so the signature matches the other
            ``*_test_fill`` functions.
        train_label: Unused; see ``train_data``.
        test_data: Test feature DataFrame.
        test_label: Test label Series named '矿物类型'.

    Returns:
        Tuple ``(features, labels)`` holding only the complete test rows.
    """
    merged = pd.concat([test_data, test_label], axis=1)
    complete_rows = merged.reset_index(drop=True).dropna()
    features = complete_rows.drop(columns='矿物类型')
    labels = complete_rows['矿物类型']
    return features, labels
2. 均值填充(按类别分别填充)
python
运行
def mean_train_method(data):
    """Return a copy of ``data`` with each missing cell replaced by its column mean.

    Args:
        data: DataFrame that may contain NaN values.

    Returns:
        A new DataFrame; ``data`` itself is not modified.
    """
    column_means = data.mean()
    filled = data.fillna(value=column_means)
    return filled
def mean_train_fill(train_data, train_label):
    """Fill missing training features with the mean of the sample's own class.

    Each missing cell receives the mean of that feature over the training
    rows carrying the same label. If a class has no observed value for a
    feature, the mean of that feature over all training rows is used
    instead. Row order is preserved.

    Args:
        train_data: Training feature DataFrame (numeric columns).
        train_label: Training label Series named '矿物类型'.

    Returns:
        Tuple ``(features, labels)``, both re-indexed from 0.
    """
    data = pd.concat([train_data, train_label], axis=1).reset_index(drop=True)
    features = data.drop('矿物类型', axis=1)
    labels = data['矿物类型']
    # Per-row class mean of every feature, aligned with ``features``.
    class_means = features.groupby(labels).transform('mean')
    df_filled = features.fillna(class_means)
    # Fallback for features that are entirely missing within one class.
    df_filled = df_filled.fillna(features.mean())
    return df_filled, labels
def mean_test_method(train_data, test_data):
    """Fill missing cells of ``test_data`` with the column means of ``train_data``.

    Args:
        train_data: DataFrame the column means are computed from.
        test_data: DataFrame whose NaN cells are filled.

    Returns:
        A new DataFrame; neither input is modified.
    """
    train_means = train_data.mean()
    filled = test_data.fillna(value=train_means)
    return filled
def mean_test_fill(train_data, train_label, test_data, test_label):
    """Fill missing test features with per-class means taken from the training set.

    For each encoded class 0..3, the test rows of that class are filled with
    the column means of the training rows of the same class. The filled
    groups are stacked in class order 0, 1, 2, 3 and re-indexed from 0, so
    the returned rows are grouped by class rather than in input order.

    Args:
        train_data: Training feature DataFrame.
        train_label: Training label Series named '矿物类型'.
        test_data: Test feature DataFrame.
        test_label: Test label Series named '矿物类型'.

    Returns:
        Tuple ``(features, labels)`` for the filled test set.
    """
    train_all = pd.concat([train_data, train_label], axis=1).reset_index(drop=True)
    test_all = pd.concat([test_data, test_label], axis=1).reset_index(drop=True)
    filled_groups = []
    for class_id in (0, 1, 2, 3):
        train_rows = train_all[train_all['矿物类型'] == class_id]
        test_rows = test_all[test_all['矿物类型'] == class_id]
        filled_groups.append(test_rows.fillna(train_rows.mean()))
    stacked = pd.concat(filled_groups).reset_index(drop=True)
    return stacked.drop(columns='矿物类型'), stacked['矿物类型']
3. 中位数填充
python
运行
def median_method(data):
    """Return a copy of ``data`` with each missing cell replaced by its column median.

    Args:
        data: DataFrame that may contain NaN values.

    Returns:
        A new DataFrame; ``data`` itself is not modified.
    """
    column_medians = data.median()
    filled = data.fillna(value=column_medians)
    return filled
def median_train_fill(train_data, train_label):
    """Fill missing training features with the median of the sample's own class.

    The rows of each encoded class 0..3 are filled with that class's column
    medians. The filled groups are stacked in class order 0, 1, 2, 3 and
    re-indexed from 0, so the returned rows are grouped by class.

    Args:
        train_data: Training feature DataFrame.
        train_label: Training label Series named '矿物类型'.

    Returns:
        Tuple ``(features, labels)`` for the filled training set.
    """
    merged = pd.concat([train_data, train_label], axis=1).reset_index(drop=True)
    filled_groups = []
    for class_id in (0, 1, 2, 3):
        class_rows = merged[merged['矿物类型'] == class_id]
        filled_groups.append(class_rows.fillna(class_rows.median()))
    stacked = pd.concat(filled_groups).reset_index(drop=True)
    return stacked.drop(columns='矿物类型'), stacked['矿物类型']
def median_test_method(train_data, test_data):
    """Fill missing cells of ``test_data`` with the column medians of ``train_data``.

    Args:
        train_data: DataFrame the column medians are computed from.
        test_data: DataFrame whose NaN cells are filled.

    Returns:
        A new DataFrame; neither input is modified.
    """
    train_medians = train_data.median()
    filled = test_data.fillna(value=train_medians)
    return filled
def median_test_fill(train_data, train_label, test_data, test_label):
    """Fill missing test features with per-class medians taken from the training set.

    For each encoded class 0..3, the test rows of that class are filled with
    the column medians of the training rows of the same class. The filled
    groups are stacked in class order 0, 1, 2, 3 and re-indexed from 0, so
    the returned rows are grouped by class rather than in input order.

    Args:
        train_data: Training feature DataFrame.
        train_label: Training label Series named '矿物类型'.
        test_data: Test feature DataFrame.
        test_label: Test label Series named '矿物类型'.

    Returns:
        Tuple ``(features, labels)`` for the filled test set.
    """
    train_all = pd.concat([train_data, train_label], axis=1).reset_index(drop=True)
    test_all = pd.concat([test_data, test_label], axis=1).reset_index(drop=True)
    filled_groups = []
    for class_id in (0, 1, 2, 3):
        train_rows = train_all[train_all['矿物类型'] == class_id]
        test_rows = test_all[test_all['矿物类型'] == class_id]
        filled_groups.append(test_rows.fillna(train_rows.median()))
    stacked = pd.concat(filled_groups).reset_index(drop=True)
    return stacked.drop(columns='矿物类型'), stacked['矿物类型']
4. 众数填充
python
运行
def mode_method(data):
    """Return a copy of ``data`` with each missing cell replaced by its column mode.

    The first row of ``DataFrame.mode()`` is used as the fill value for each
    column.

    Args:
        data: DataFrame that may contain NaN values.

    Returns:
        A new DataFrame; ``data`` itself is not modified.
    """
    mode_table = data.mode()
    first_modes = mode_table.iloc[0]
    return data.fillna(value=first_modes)
def mode_train_fill(train_data, train_label):
    """Fill missing training features with the mode of each column.

    The mode is computed over all training rows, not per class.

    Args:
        train_data: Training feature DataFrame.
        train_label: Training label Series named '矿物类型'.

    Returns:
        Tuple ``(features, labels)``, both re-indexed from 0.
    """
    merged = pd.concat([train_data, train_label], axis=1)
    merged = merged.reset_index(drop=True)
    # First row of DataFrame.mode() holds one mode per column.
    most_frequent = merged.mode().iloc[0]
    completed = merged.fillna(most_frequent)
    return completed.drop(columns='矿物类型'), completed['矿物类型']
def mode_test_fill(train_data, train_label, test_data, test_label):
    """Fill missing test features with the column modes of the training features.

    Args:
        train_data: Training feature DataFrame the modes are taken from.
        train_label: Unused; accepted so the signature matches the other
            ``*_test_fill`` functions.
        test_data: Test feature DataFrame.
        test_label: Test labels, returned unchanged.

    Returns:
        Tuple ``(features, labels)``; the original test index is kept.
    """
    # First row of DataFrame.mode() holds one mode per column.
    train_modes = train_data.mode().iloc[0]
    return test_data.fillna(train_modes), test_label
5. 线性回归填充(模型预测填充)
python
运行
def lr_train_fill(train_data, train_label):
    """Impute missing training features column by column with linear regression.

    Columns are processed in ascending order of their missing-value count.
    Each incomplete column is regressed on the columns processed before it
    (already complete or already imputed), using the rows where the target
    is observed; the predictions are written into the missing cells. The
    label column is not used as a predictor.

    If an incomplete column has no previously processed column to serve as a
    predictor (i.e. every feature has missing values and this is the first
    one), the regression cannot be fitted, so the column's observed mean is
    used instead.

    Args:
        train_data: Training feature DataFrame (numeric columns).
        train_label: Training label Series named '矿物类型'.

    Returns:
        Tuple ``(features, labels)``, both re-indexed from 0.
    """
    train_data_all = pd.concat([train_data, train_label], axis=1).reset_index(drop=True)
    train_data_X = train_data_all.drop('矿物类型', axis=1)
    missing_counts = train_data_X.isnull().sum().sort_values(ascending=True)
    processed = []
    for col, n_missing in missing_counts.items():
        predictors = list(processed)
        processed.append(col)
        if n_missing == 0:
            continue
        missing_mask = train_data_X[col].isnull()
        if not predictors:
            # Nothing to regress on: fall back to mean imputation.
            train_data_X.loc[missing_mask, col] = train_data_X[col].mean()
        else:
            model = LinearRegression()
            model.fit(
                train_data_X.loc[~missing_mask, predictors],
                train_data_X.loc[~missing_mask, col],
            )
            train_data_X.loc[missing_mask, col] = model.predict(
                train_data_X.loc[missing_mask, predictors]
            )
        print(f'训练集 {col} 填充完成')
    return train_data_X, train_data_all['矿物类型']
def lr_test_fill(train_data, train_label, test_data, test_label):
    """Impute missing test features with linear regressions fitted on the training set.

    Test columns are processed in ascending order of their missing-value
    count. For each incomplete column a model is fitted on the training
    features, using the columns processed before it as predictors, and its
    predictions fill the missing test cells. Only training rows are used for
    fitting.

    If an incomplete column has no previously processed column to serve as a
    predictor, the regression cannot be fitted, so the training-set mean of
    that column is used instead.

    Args:
        train_data: Training feature DataFrame; the model is fitted on it
            directly, so it is expected to be already imputed.
        train_label: Training label Series named '矿物类型'.
        test_data: Test feature DataFrame (numeric columns).
        test_label: Test label Series named '矿物类型'.

    Returns:
        Tuple ``(features, labels)`` for the test set, both re-indexed from 0.
    """
    train_data_all = pd.concat([train_data, train_label], axis=1).reset_index(drop=True)
    test_data_all = pd.concat([test_data, test_label], axis=1).reset_index(drop=True)
    train_X = train_data_all.drop('矿物类型', axis=1)
    test_X = test_data_all.drop('矿物类型', axis=1)
    missing_counts = test_X.isnull().sum().sort_values(ascending=True)
    processed = []
    for col, n_missing in missing_counts.items():
        predictors = list(processed)
        processed.append(col)
        if n_missing == 0:
            continue
        missing_mask = test_X[col].isnull()
        if not predictors:
            # Nothing to regress on: fall back to the training-set mean.
            test_X.loc[missing_mask, col] = train_X[col].mean()
        else:
            model = LinearRegression()
            model.fit(train_X[predictors], train_X[col])
            test_X.loc[missing_mask, col] = model.predict(
                test_X.loc[missing_mask, predictors]
            )
        print(f'测试集 {col} 填充完成')
    return test_X, test_data_all['矿物类型']
6. 随机森林填充(效果最优,推荐使用)
python
运行
def rf_train_fill(train_data, train_label):
    """Impute missing training features column by column with a random forest.

    Columns are processed in ascending order of their missing-value count.
    Each incomplete column is predicted from the columns processed before it
    (already complete or already imputed) by a ``RandomForestRegressor``
    (100 trees, fixed seed) fitted on the rows where the target is observed.
    The label column is not used as a predictor.

    If an incomplete column has no previously processed column to serve as a
    predictor (i.e. every feature has missing values and this is the first
    one), the model cannot be fitted, so the column's observed mean is used
    instead.

    Args:
        train_data: Training feature DataFrame (numeric columns).
        train_label: Training label Series named '矿物类型'.

    Returns:
        Tuple ``(features, labels)``, both re-indexed from 0.
    """
    train_data_all = pd.concat([train_data, train_label], axis=1).reset_index(drop=True)
    train_data_X = train_data_all.drop('矿物类型', axis=1)
    missing_counts = train_data_X.isnull().sum().sort_values(ascending=True)
    processed = []
    for col, n_missing in missing_counts.items():
        predictors = list(processed)
        processed.append(col)
        if n_missing == 0:
            continue
        missing_mask = train_data_X[col].isnull()
        if not predictors:
            # Nothing to learn from: fall back to mean imputation.
            train_data_X.loc[missing_mask, col] = train_data_X[col].mean()
        else:
            model = RandomForestRegressor(n_estimators=100, random_state=42)
            model.fit(
                train_data_X.loc[~missing_mask, predictors],
                train_data_X.loc[~missing_mask, col],
            )
            train_data_X.loc[missing_mask, col] = model.predict(
                train_data_X.loc[missing_mask, predictors]
            )
        print(f'训练集 {col} 填充完成')
    return train_data_X, train_data_all['矿物类型']
def rf_test_fill(train_data, train_label, test_data, test_label):
    """Impute missing test features with random forests fitted on the training set.

    Test columns are processed in ascending order of their missing-value
    count. For each incomplete column a ``RandomForestRegressor`` (100 trees,
    fixed seed) is fitted on the training features, using the columns
    processed before it as predictors, and its predictions fill the missing
    test cells. Only training rows are used for fitting.

    If an incomplete column has no previously processed column to serve as a
    predictor, the model cannot be fitted, so the training-set mean of that
    column is used instead.

    Args:
        train_data: Training feature DataFrame; the model is fitted on it
            directly, so it is expected to be already imputed.
        train_label: Training label Series named '矿物类型'.
        test_data: Test feature DataFrame (numeric columns).
        test_label: Test label Series named '矿物类型'.

    Returns:
        Tuple ``(features, labels)`` for the test set, both re-indexed from 0.
    """
    train_data_all = pd.concat([train_data, train_label], axis=1).reset_index(drop=True)
    test_data_all = pd.concat([test_data, test_label], axis=1).reset_index(drop=True)
    train_X = train_data_all.drop('矿物类型', axis=1)
    test_X = test_data_all.drop('矿物类型', axis=1)
    missing_counts = test_X.isnull().sum().sort_values(ascending=True)
    processed = []
    for col, n_missing in missing_counts.items():
        predictors = list(processed)
        processed.append(col)
        if n_missing == 0:
            continue
        missing_mask = test_X[col].isnull()
        if not predictors:
            # Nothing to learn from: fall back to the training-set mean.
            test_X.loc[missing_mask, col] = train_X[col].mean()
        else:
            model = RandomForestRegressor(n_estimators=100, random_state=42)
            model.fit(train_X[predictors], train_X[col])
            test_X.loc[missing_mask, col] = model.predict(
                test_X.loc[missing_mask, predictors]
            )
        print(f'测试集 {col} 填充完成')
    return test_X, test_data_all['矿物类型']
五、6 种填充方法对比总结
表格
| 填充方法 | 优点 | 缺点 | 适用场景 |
|---|---|---|---|
| CCA 删除法 | 简单、无数据篡改 | 丢失大量样本、易偏差 | 缺失率 < 5%、大数据集 |
| 均值填充 | 计算快、简单 | 受异常值影响大 | 近似正态分布数据 |
| 中位数填充 | 抗异常值 | 精度一般 | 存在异常值的数据 |
| 众数填充 | 适用于离散特征 | 连续性特征效果差 | 类别型、离散型特征 |
| 线性回归填充 | 利用特征相关性 | 仅适合线性关系 | 特征线性相关数据 |
| 随机森林填充 | 精度高、非线性强 | 速度稍慢 | 大部分数据、推荐使用 |
六、运行效果
- 自动统计每列缺失值并按顺序填充
- 控制台输出填充进度
- 自动生成标准化 + 均衡化的训练集 / 测试集 Excel
- 可直接用于后续分类模型训练(KNN、决策树、SVM、逻辑回归等)
七、使用说明
- 将矿物数据命名为 `矿物数据.xlsx`,放在代码同目录
- 直接运行主程序
- 取消注释对应填充方法即可切换方案
- 最终 Excel 自动保存到当前目录,可直接用于建模
八、总结
这份代码是数据预处理阶段的完整工具箱,覆盖了机器学习竞赛、毕业设计中最常用的 6 种缺失值处理方案,并且严格遵守训练测试分离、数据泄露防护、类别分别填充等工业级规范。尤其是随机森林填充,在非线性数据集中表现最优,非常推荐大家使用!