Pandas缺失值处理完全指南：从基础操作到高级技巧

一、认识缺失值：什么是NaN？

在Pandas中，缺失值主要由以下形式表示：

· NaN（Not a Number）：数值类型的缺失值
· None：对象类型的缺失值
· NaT（Not a Time）：时间类型的缺失值

python 复制代码

import pandas as pd
import numpy as np

# Build a small sample table in which every column holds exactly one missing entry
hire_dates = pd.to_datetime(
    ['2020-01-01', '2019-05-15', None, '2021-03-20', '2020-11-10']
)
data = {
    '姓名': ['张三', '李四', '王五', '赵六', None],
    '年龄': [25, np.nan, 30, 35, 28],
    '工资': [5000, 6000, None, 7000, 5500],
    '部门': ['技术部', '销售部', None, '技术部', '销售部'],
    '入职日期': hire_dates,
}
df = pd.DataFrame(data)

# Show the table followed by its (rows, columns) shape
print("原始数据:")
print(df)
n_rows, n_cols = df.shape
print(f"\n数据形状: {(n_rows, n_cols)}")

二、检测缺失值

1. 基础检测方法

python 复制代码

# Boolean mask of missing cells, computed once and reused by the reports below
null_flags = df.isnull()

# Number of missing entries in every column
print("每列缺失值数量:")
print(null_flags.sum())

# Share of missing entries in every column, rounded to 4 decimals
print("\n每列缺失值比例:")
print(null_flags.mean().round(4))

# Whether the frame contains any missing value at all
has_missing = null_flags.any().any()
print(f"\n数据中是否存在缺失值: {has_missing}")

# Rows that have no missing value in any column
complete_rows = df.dropna()
print(f"完整行数量: {len(complete_rows)}")

2. 可视化缺失值

python 复制代码

import matplotlib.pyplot as plt
import seaborn as sns

# Select a font for Chinese labels and draw minus signs as ASCII hyphens
# (axes.unicode_minus=False) so they do not depend on a Unicode minus glyph
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})

def plot_missing_values(df):
    """Show a two-panel overview of the missing values in *df*.

    The left panel is a heat map of ``df.isnull()`` (one cell per value);
    the right panel is a bar chart with the number of missing values for
    each column that has at least one.

    Args:
        df: DataFrame to inspect.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    fig, (heatmap_ax, bar_ax) = plt.subplots(1, 2, figsize=(10, 6))

    # Panel 1: heat map of the missing-value mask
    sns.heatmap(df.isnull(), cbar=True, yticklabels=False, cmap='viridis',
                ax=heatmap_ax)
    heatmap_ax.set_title('缺失值热力图')

    # Panel 2: per-column counts, restricted to columns that have gaps
    missing_data = df.isnull().sum()
    missing_data = missing_data[missing_data > 0]
    if missing_data.empty:
        # Bar-plotting an empty Series fails in pandas, so show a
        # placeholder message when the frame has no missing values.
        bar_ax.text(0.5, 0.5, '无缺失值', ha='center', va='center',
                    transform=bar_ax.transAxes)
        bar_ax.set_xticks([])
        bar_ax.set_yticks([])
    else:
        missing_data.plot(kind='bar', ax=bar_ax)
        bar_ax.tick_params(axis='x', labelrotation=45)
    bar_ax.set_title('各列缺失值数量')

    fig.tight_layout()
    plt.show()

# Display the missing-value distribution of the sample DataFrame
plot_missing_values(df)

三、处理缺失值：三大策略

1. 删除缺失值

适用场景：缺失值比例很小，或者缺失行对分析影响不大。

python 复制代码

# Drop every row that contains at least one missing value
df_dropped_any = df.dropna()
print(f"删除任何缺失值后的形状: {df_dropped_any.shape}")

# Drop only the rows in which every column is missing
df_dropped_all = df.dropna(how='all')
print(f"删除全为缺失值后的形状: {df_dropped_all.shape}")

# Drop rows that have a missing value in any of the listed columns
df_dropped_specific = df.dropna(subset=['工资', '年龄'])
print(f"删除工资或年龄缺失后的形状: {df_dropped_specific.shape}")

# Drop rows whose share of missing values exceeds the threshold.
# `thresh` is the minimum number of NON-missing values a row needs in order
# to be kept, so it is derived from the largest tolerated number of gaps.
threshold = 0.5  # highest tolerated ratio of missing values per row
max_missing = int(df.shape[1] * threshold)
df_dropped_thresh = df.dropna(thresh=df.shape[1] - max_missing)
print(f"按阈值删除后的形状: {df_dropped_thresh.shape}")

2. 填充缺失值

适用场景：缺失值有一定规律，或者数据比较珍贵。

python 复制代码

# Work on a copy so the original frame stays untouched
df_filled = df.copy()

# Fill with a fixed value
df_filled['部门'] = df_filled['部门'].fillna('未知部门')
print("固定值填充后的部门列:")
print(df_filled['部门'])

# Forward fill and backward fill.
# Series.ffill()/bfill() replace the deprecated fillna(method=...) form.
df_filled['年龄'] = df_filled['年龄'].ffill()  # carry the previous value forward
df_filled['工资'] = df_filled['工资'].bfill()  # pull the next value backward
print("\n前向/后向填充后的数值列:")
print(df_filled[['年龄', '工资']])

# Fill with summary statistics.
# NOTE: the fills above have already removed every gap from these three
# columns, so on this sample data the calls below change nothing; they only
# demonstrate the API. In general they catch what ffill/bfill cannot reach
# (a leading gap for ffill, a trailing gap for bfill).
mean_age = df_filled['年龄'].mean()
median_salary = df_filled['工资'].median()
mode_department = df_filled['部门'].mode()[0]

df_filled['年龄'] = df_filled['年龄'].fillna(mean_age)
df_filled['工资'] = df_filled['工资'].fillna(median_salary)
df_filled['部门'] = df_filled['部门'].fillna(mode_department)

print("\n统计值填充后的数据:")
print(df_filled)

3. 高级填充技巧

python 复制代码

# Group-wise fill: replace a missing salary with the mean salary of its department
df_group_fill = df.copy()
department_salary = df_group_fill.groupby('部门')['工资'].transform('mean')
df_group_fill['工资'] = df_group_fill['工资'].fillna(department_salary)
# Rows whose department is itself missing belong to no group (groupby drops
# NaN keys), and a department without any known salary has a NaN mean, so
# fall back to the overall mean for whatever is still missing.
df_group_fill['工资'] = df_group_fill['工资'].fillna(df_group_fill['工资'].mean())
print("分组填充后的工资列:")
print(df_group_fill[['部门', '工资']])

# Linear interpolation (best suited to ordered data such as time series)
df_interpolate = df.copy()
df_interpolate['年龄'] = df_interpolate['年龄'].interpolate(method='linear')
print("\n插值法填充后的年龄列:")
print(df_interpolate['年龄'])

# Model-based fill using k-nearest neighbours
from sklearn.impute import KNNImputer

# KNN imputation is applied to the numeric columns only
numeric_cols = ['年龄', '工资']
df_knn = df[numeric_cols].copy()

imputer = KNNImputer(n_neighbors=2)
# fit_transform returns a bare array; reattach column names and the original
# row index so the result lines up with `df`
df_knn_imputed = pd.DataFrame(imputer.fit_transform(df_knn),
                              columns=numeric_cols,
                              index=df_knn.index)
print("\nKNN填充后的数值数据:")
print(df_knn_imputed)

四、案例：超市销售数据分析

python 复制代码

# Build a reproducible synthetic supermarket sales table with 100 products
np.random.seed(42)
sales_data = {
    '商品ID': range(1, 101),
    '商品名称': [f'商品_{i}' for i in range(1, 101)],
    '类别': np.random.choice(['食品', '饮料', '日用品', '电子产品'], 100),
    '销售额': np.random.normal(1000, 300, 100),
    '成本': np.random.normal(600, 200, 100),
    # Stored as float from the start: NaN is injected below and cannot be
    # held by an integer column without an implicit dtype change.
    '库存': np.random.randint(0, 500, 100).astype(float),
}

sales_df = pd.DataFrame(sales_data)

# Row labels to blank out, per column
missing_indices = {
    '销售额': [5, 15, 25, 35, 45],
    '成本': [10, 20, 30, 40, 50, 60],
    '库存': [3, 13, 23, 33, 43, 53, 63],
    '类别': [7, 17, 27],
}

for col, indices in missing_indices.items():
    sales_df.loc[indices, col] = np.nan

print("处理前的数据信息:")
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() would emit a stray "None" line.
sales_df.info()
print("\n缺失值统计:")
print(sales_df.isnull().sum())

1. 分析缺失模式

python 复制代码

def analyze_missing_pattern(df):
    """Print and return how missingness co-occurs across the columns of *df*.

    Two reports are printed: the correlation matrix of the per-column
    missing-value masks, and the distribution of the number of missing
    values per row.

    Args:
        df: DataFrame to analyse.

    Returns:
        The correlation matrix of ``df.isnull()``.
    """
    null_flags = df.isnull()

    # Correlation between the "is missing" masks of the columns
    correlation = null_flags.corr()
    print("缺失值相关性矩阵:")
    print(correlation)

    # How many rows have 0, 1, 2, ... missing values
    per_row_counts = null_flags.sum(axis=1)
    print("\n每行缺失值数量分布:")
    print(per_row_counts.value_counts().sort_index())

    return correlation

# Analyse the missing-value pattern of the sales data and keep the correlation matrix
missing_correlation = analyze_missing_pattern(sales_df)

2. 制定处理策略

python 复制代码

def handle_missing_data_strategic(df):
    """Fill the missing values of a sales table with column-specific rules.

    The input is not modified; a filled copy is returned. The rules are:

    * ``类别``: most frequent value (left untouched if no mode exists).
    * ``销售额`` and ``成本``: mean of the row's category, then the overall
      column mean for anything still missing.
    * ``库存``: 0 (a missing stock figure is taken to mean "no stock").

    Args:
        df: DataFrame containing the columns ``类别``, ``销售额``, ``成本``
            and ``库存``.

    Returns:
        A copy of *df* with the rules above applied.
    """
    result = df.copy()

    # Rule 1: categorical columns get their most frequent value
    for cat_col in ('类别',):
        modes = result[cat_col].mode()
        if modes.empty:
            continue
        result[cat_col] = result[cat_col].fillna(modes[0])

    # Rule 2: amounts get the category mean first, the overall mean second
    for amount_col in ('销售额', '成本'):
        per_category_mean = result.groupby('类别')[amount_col].transform('mean')
        partially_filled = result[amount_col].fillna(per_category_mean)
        result[amount_col] = partially_filled.fillna(partially_filled.mean())

    # Rule 3: a missing stock figure is treated as zero stock
    result['库存'] = result['库存'].fillna(0)

    return result

# Apply the strategy to the sales data and report the missing values left per column
sales_processed = handle_missing_data_strategic(sales_df)
print("处理后的缺失值统计:")
print(sales_processed.isnull().sum())

五、高级技巧与最佳实践

1. 创建缺失值指示器

python 复制代码

def create_missing_indicators(df, columns=None):
    """Return a copy of *df* with 0/1 flag columns marking missing values.

    For every inspected column that contains at least one missing value, a
    new column named ``<column>_missing`` is appended; it holds 1 where the
    source value is missing and 0 elsewhere. Columns without missing values
    get no flag column.

    Args:
        df: Source DataFrame (left unmodified).
        columns: Columns to inspect; all columns of *df* when None.

    Returns:
        A copy of *df* including the flag columns.
    """
    inspected = df.columns if columns is None else columns
    flagged = df.copy()

    for name in inspected:
        is_missing = df[name].isnull()
        if not is_missing.any():
            continue
        flagged[f'{name}_missing'] = is_missing.astype(int)

    return flagged

# Add missing-value indicator columns to the sales data and list the resulting column names
df_with_indicators = create_missing_indicators(sales_df)
print("添加缺失值指示器后的列名:")
print(df_with_indicators.columns.tolist())

2. 多重填充技术

python 复制代码

def advanced_imputation(df, target_col):
    """Impute missing values, using an iterative model when possible.

    If *target_col* is one of the numeric columns of *df*, all numeric
    columns are imputed together with scikit-learn's ``IterativeImputer``
    driven by a random-forest regressor. Otherwise a message is printed and
    the numeric columns are filled with their medians; non-numeric columns
    are returned as they are.

    Args:
        df: Source DataFrame (left unmodified).
        target_col: Column whose dtype decides which method is used.

    Returns:
        A new DataFrame with the imputed values.
    """
    # Only numeric columns take part in the model-based imputation
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()

    if target_col not in numeric_cols:
        print("目标列不是数值类型，使用基础填充方法")
        # numeric_only=True: the median of text columns is undefined and
        # recent pandas versions raise a TypeError instead of skipping them.
        return df.fillna(df.median(numeric_only=True))

    # Imported here so the median fallback above works without scikit-learn.
    from sklearn.ensemble import RandomForestRegressor
    # Side-effect import: IterativeImputer is experimental and must be enabled first
    from sklearn.experimental import enable_iterative_imputer  # noqa: F401
    from sklearn.impute import IterativeImputer

    imputer = IterativeImputer(
        estimator=RandomForestRegressor(n_estimators=100, random_state=42),
        max_iter=10,
        random_state=42,
    )

    df_imputed = df.copy()
    df_imputed[numeric_cols] = imputer.fit_transform(df[numeric_cols])
    return df_imputed

# 应用高级填充
# sales_advanced = advanced_imputation(sales_df, '销售额')

3. 验证处理效果

python 复制代码

def validate_imputation(original_df, imputed_df, numeric_cols, imputed_mask=None):
    """Compare imputed values against reference values, column by column.

    Args:
        original_df: Reference DataFrame. When *imputed_mask* is given this
            should hold the known true values.
        imputed_df: DataFrame after imputation; it must share the row index
            of *original_df*.
        numeric_cols: Names of the numeric columns to evaluate.
        imputed_mask: Optional boolean DataFrame (same index, containing the
            evaluated columns) that is True where a value was hidden and
            then imputed. When given, errors are measured on exactly those
            cells, which is what an accuracy check needs.

    Returns:
        A DataFrame indexed by column name with the columns ``MAE``,
        ``RMSE``, ``原始均值`` and ``填充后均值``. Columns with nothing to
        compare are omitted.

    Note:
        Without *imputed_mask* the original behaviour is kept: only columns
        of *original_df* that contain missing values are evaluated, and only
        on the rows where *original_df* is not missing. If *imputed_df* was
        produced by filling *original_df*, those cells are typically
        unchanged and the reported errors are 0, so pass *imputed_mask*
        together with the true values for a meaningful check.
    """
    results = {}

    for col in numeric_cols:
        known = original_df[col].notnull()

        if imputed_mask is not None:
            # Evaluate the imputed cells whose true value is available
            rows = known & imputed_mask[col]
        elif known.all():
            # Legacy mode skips columns that have no missing values
            continue
        else:
            rows = known

        if not rows.any():
            # Nothing to compare; an empty mean would only yield NaN
            continue

        reference_values = original_df.loc[rows, col]
        imputed_values = imputed_df.loc[rows, col]

        # Positional difference, so only the row selection decides the pairing
        errors = (imputed_values.to_numpy(dtype=float)
                  - reference_values.to_numpy(dtype=float))

        results[col] = {
            'MAE': float(np.mean(np.abs(errors))),
            'RMSE': float(np.sqrt(np.mean(errors ** 2))),
            '原始均值': reference_values.mean(),
            '填充后均值': imputed_values.mean(),
        }

    return pd.DataFrame(results).T

# 验证填充效果（在知道真实值的情况下）
# validation_results = validate_imputation(original_complete_data, sales_processed, ['销售额', '成本'])
# print(validation_results)

六、总结与最佳实践

1. 处理策略选择指南

| 缺失比例 | 推荐策略 | 说明 |
| --- | --- | --- |
| < 5% | 删除 | 对整体影响很小，直接删除 |
| 5% - 20% | 简单填充 | 使用均值、中位数、众数等 |
| 20% - 50% | 高级填充 | 使用模型预测、多重填充等 |
| > 50% | 谨慎处理 | 考虑删除该变量或特殊标记 |

2. 黄金法则

· 先分析后处理：了解缺失机制（MCAR、MAR、MNAR）
· 多种方法比较：不要局限于一种填充方法
· 保留缺失信息：使用缺失值指示器
· 验证处理效果：在有条件的情况下验证填充准确性
· 文档记录：记录处理过程和决策依据

3. 完整处理流程

python 复制代码

def _choose_fill_value(column):
    """Return the value used to fill *column*, or None if none can be derived."""
    if column.dtype != 'object':
        try:
            return column.median()
        except TypeError:
            # Some non-object dtypes (e.g. categorical or string) have no
            # median; use the most frequent value for them instead.
            pass
    modes = column.mode()
    # mode() is empty when the column holds no non-missing value at all
    return None if modes.empty else modes.iloc[0]


def comprehensive_missing_value_pipeline(df):
    """Run a complete missing-value workflow on *df* and return the result.

    Steps:
        1. Print, for every column with gaps, the number and ratio of
           missing values.
        2. Append ``<column>_missing`` indicator columns via
           ``create_missing_indicators``.
        3. Fill each original column: object columns with their mode, other
           columns with their median (falling back to the mode when the
           dtype has no median).
        4. Print the number of missing values that remain.

    Args:
        df: Source DataFrame (left unmodified).

    Returns:
        The processed copy, including the indicator columns. A column that
        has no non-missing value at all cannot be filled and keeps its gaps;
        the final report then shows a non-zero remainder.
    """
    # Work on a copy so the caller's frame stays untouched
    df_processed = df.copy()

    # Missing-value report
    missing_summary = df_processed.isnull().sum()
    missing_ratio = missing_summary / len(df_processed)

    print("缺失值分析报告:")
    for col in df_processed.columns:
        if missing_summary[col] > 0:
            print(f"  {col}: {missing_summary[col]} 个缺失值 ({missing_ratio[col]:.2%})")

    # Record where values were missing before they get filled
    df_processed = create_missing_indicators(df_processed)

    # Fill the original columns only; the indicator columns have no gaps
    for col in df.columns:
        column = df_processed[col]
        if not column.isnull().any():
            continue
        fill_value = _choose_fill_value(column)
        if fill_value is None:
            continue
        df_processed[col] = column.fillna(fill_value)

    # Final check
    remaining_missing = df_processed.isnull().sum().sum()
    print(f"\n处理完成！剩余缺失值: {remaining_missing}")

    return df_processed

# Run the complete pipeline on the sales data
final_df = comprehensive_missing_value_pipeline(sales_df)

Pandas缺失值处理完全指南：从基础操作到高级技巧

一、 认识缺失值：什么是NaN？

二、 检测缺失值

1. 基础检测方法

2. 可视化缺失值

三、 处理缺失值：三大策略

1. 删除缺失值

2. 填充缺失值

3. 高级填充技巧

四、 案例：超市销售数据分析

1. 分析缺失模式

2. 制定处理策略

五、 高级技巧与最佳实践

1. 创建缺失值指示器

2. 多重填充技术

3. 验证处理效果

六、 总结与最佳实践

1. 处理策略选择指南

2. 黄金法则

3. 完整处理流程

一、认识缺失值：什么是NaN？

二、检测缺失值

三、处理缺失值：三大策略

四、案例：超市销售数据分析

五、高级技巧与最佳实践

六、总结与最佳实践