Pandas数据清洗别再用fillna了，这些骚操作让你效率提升10倍

周五下午5点，老板突然扔给你一个10万行的Excel表格，说"周一前把这个数据洗干净"。

你打开一看，心凉了半截：

缺失值像地雷一样到处都是
重复行密密麻麻
还有一堆奇怪的占位符（"-"、"N/A"、"暂无数据"）

这时候，你打开搜索引擎，输入"pandas缺失值处理"，然后看到了千篇一律的df.fillna(0)...

兄弟，醒醒！2024年了，还在用这种老掉牙的方法？

今天本小姐带你见识一下pandas数据清洗的真正骚操作，让你从此告别加班，轻松下班。

先别急着动手，先用这几招看看数据全貌

python 复制代码

import pandas as pd
import numpy as np

# Build a small "dirty" sample that mimics a real-world export.
# Note that '-' and 'N/A' are plain strings here, so isnull() does not count
# them as missing; only the np.nan entries are counted below.
data = {
    'name': ['张三', '李四', '王五', '张三', '赵六', np.nan, '钱七'],
    'age': [25, 30, np.nan, 25, '-', 35, 28],
    'salary': [8000, np.nan, 12000, 8000, 15000, 'N/A', 9000],
    'department': ['技术部', '销售部', '技术部', '技术部', '人事部', '财务部', '技术部']
}
df = pd.DataFrame(data)

# What most people do (❌ not enough on its own): dtypes, non-null counts and
# memory usage only. info() writes its summary straight to stdout and returns
# None, so it is called directly; wrapping it in print() adds a stray "None".
df.info()

# What an experienced analyst does (✅)
print("=" * 50)
print("数据缺失情况全貌：")
print(df.isnull().sum())
print("\n缺失值比例：")
print(df.isnull().sum() / len(df) * 100)

# Going further: a compact quality summary of the whole frame
print("\n数据质量报告：")
print(f"总行数：{len(df)}")
print(f"完全重复的行数：{df.duplicated().sum()}")
print(f"至少有一个缺失值的行数：{df.isnull().any(axis=1).sum()}")
print(f"全部是缺失值的行数：{df.isnull().all(axis=1).sum()}")

输出结果：

go 复制代码

==================================================
数据缺失情况全貌：
name          1
age           1
salary        1
department    0
dtype: int64

缺失值比例：
name         14.285714
age          14.285714
salary       14.285714
department    0.000000
dtype: float64

数据质量报告：
总行数：7
完全重复的行数：1
至少有一个缺失值的行数：3
全部是缺失值的行数：0

看到了吗？一眼就能看出：张三那行完全重复，name、age、salary各有1个缺失值。但要注意，"-"、"N/A"这类占位符只是普通字符串，isnull()并不会把它们算作缺失，所以真实的缺失情况比这里显示的更严重，后面需要单独处理。

缺失值处理：别只会用fillna(0)

很多人处理缺失值就是简单粗暴：

python 复制代码

# ❌ What many people do.
# Assign the result back instead of calling fillna(..., inplace=True) on
# df[col]: the chained in-place form is deprecated and does not update df once
# Copy-on-Write is enabled.
df['age'] = df['age'].fillna(0)  # Age filled with 0? Was the user just born?
df['salary'] = df['salary'].fillna(0)  # Salary filled with 0? Asking for a mutiny?

兄弟，你这样做，你的数据分析师会骂死你的！

真正的骚操作来了

1. 智能识别各种奇葩的缺失值

python 复制代码

# Real-world data spells "missing" in many ways: "-", "N/A", "暂无", "null", "..."
# Keep the placeholder list in one place so both approaches stay in sync.
missing_placeholders = ['-', 'N/A', '暂无数据', 'null', 'na', 'NaN']

# Option 1: declare the placeholders while loading the file
df = pd.read_excel('messy_data.xlsx', na_values=missing_placeholders)

# Option 2: repair an already-loaded frame.
# infer_objects() re-derives numeric dtypes for columns that were object only
# because of the placeholder strings; without it such columns may remain
# object dtype and be treated as text by later numeric handling.
df = df.replace(missing_placeholders, np.nan).infer_objects()

print("清洗后的缺失值情况：")
print(df.isnull().sum())

2. 按列类型分别处理，精准打击

python 复制代码

# 按照数据类型和业务逻辑分别处理
def smart_fillna(df):
    """Fill missing values column by column, based on dtype and column name.

    Strategy per column:
        * object / string columns: the most frequent value (mode). A column
          with no observed value is left as is.
        * numeric columns whose name contains "salary": the mean of the row's
          ``department`` group. Rows whose department is missing, or whose
          department has no observed value, fall back to the overall column
          mean. Without a ``department`` column the median is used.
        * every other numeric column (e.g. "age"): the median, which is less
          sensitive to extreme values than the mean.
        * any other dtype (bool, datetime, category, ...): left untouched.

    Args:
        df: DataFrame to fill. It is not modified.

    Returns:
        A filled copy of ``df``.
    """
    df_filled = df.copy()
    has_department = 'department' in df.columns

    for col in df.columns:
        series = df[col]
        dtype = series.dtype

        # bool counts as numeric for pandas, but a median makes no sense there
        if pd.api.types.is_bool_dtype(dtype):
            continue

        if pd.api.types.is_numeric_dtype(dtype):
            # str() keeps non-string column labels (e.g. ints) from crashing
            if 'salary' in str(col).lower() and has_department:
                dept_means = series.groupby(df['department']).mean()
                filled = series.fillna(df['department'].map(dept_means))
                # Departments without any observed value (or a missing
                # department) yield NaN above; use the overall mean for those.
                filled = filled.fillna(series.mean())
            else:
                filled = series.fillna(series.median())
            df_filled[col] = filled

        elif (pd.api.types.is_object_dtype(dtype)
              or pd.api.types.is_string_dtype(dtype)):
            modes = series.mode()
            if len(modes) > 0:
                df_filled[col] = series.fillna(modes.iloc[0])

    return df_filled

# Apply the dtype-aware fill and show the resulting frame
df_smart_filled = smart_fillna(df)
print("智能填充结果：", df_smart_filled, sep="\n")

3. 前后填充：时序数据的神器

python 复制代码

# Simulated time-series data with gaps
time_series_data = pd.DataFrame({
    'date': pd.date_range('2024-01-01', periods=10),
    'temperature': [20, 21, np.nan, 23, np.nan, np.nan, 26, 25, np.nan, 24],
    'sales': [100, 120, np.nan, 140, np.nan, 160, 170, np.nan, 190, 200]
})

# ffill()/bfill() are used instead of fillna(method=...), which is deprecated.

# Forward fill (carry the previous value forward)
print("前向填充：")
print(time_series_data.ffill())

# Backward fill (pull the next value backward)
print("\n后向填充：")
print(time_series_data.bfill())

# Combination: forward first, then backward for any leading gaps
print("\n组合填充：")
print(time_series_data.ffill().bfill())

# Cap the fill length so long runs of missing values are not papered over
print("\n限制填充范围（最多填充1个）：")
print(time_series_data.ffill(limit=1))

4. 插值：数值型数据的优雅处理

python 复制代码

# Each entry pairs a heading with a deferred interpolation call, so the heading
# is printed before the corresponding computation runs.
interpolation_demos = [
    # Linear interpolation
    ("线性插值：",
     lambda: time_series_data.interpolate(method='linear')),
    # Second-order polynomial interpolation (smoother)
    ("\n二次多项式插值：",
     lambda: time_series_data.interpolate(method='polynomial', order=2)),
    # Time-based interpolation (uses the date index spacing)
    ("\n时间插值：",
     lambda: time_series_data.set_index('date').interpolate(method='time')),
]

for heading, run_demo in interpolation_demos:
    print(heading)
    print(run_demo())

重复值处理：不只是一行drop_duplicates

python 复制代码

# Sample frame in which the same person appears several times
duplicate_data = pd.DataFrame(
    [
        ('张三', 25, 8000, '技术部'),
        ('李四', 30, 9000, '销售部'),
        ('张三', 25, 8000, '技术部'),
        ('张三', 26, 8000, '技术部'),  # same name, but the age disagrees
        ('王五', 35, 12000, '技术部'),
    ],
    columns=['name', 'age', 'salary', 'department'],
)

# ❌ The only call many people know
print("简单去重（默认保留第一个）：")
fully_deduplicated = duplicate_data.drop_duplicates()
print(fully_deduplicated)

# ✅ Real cases need more control
print("\n基于关键列去重（比如姓名+部门）：")
by_name_and_department = duplicate_data.drop_duplicates(subset=['name', 'department'])
print(by_name_and_department)

print("\n保留最后一个出现的记录：")
latest_per_name = duplicate_data.drop_duplicates(subset=['name'], keep='last')
print(latest_per_name)

print("\n标记所有重复项（不保留任何）：")
# keep=False flags every member of a duplicated group, not just the repeats
name_is_repeated = duplicate_data.duplicated(subset=['name'], keep=False)
print(duplicate_data.loc[name_is_repeated])

# 🚀 神操作：智能处理重复数据
def smart_handle_duplicates(df, key_columns, strategy='first'):
    """Collapse rows that share the same key into a single row.

    Args:
        df: DataFrame to de-duplicate. It is not modified.
        key_columns: Column name, or list of column names, that identifies a
            duplicate.
        strategy: 'first' or 'last' keep that occurrence of each key unchanged.
            Any other value (e.g. 'mean', 'max', 'min') is passed to the
            groupby aggregation for numeric columns, while non-numeric columns
            keep their first non-null value per group.

    Returns:
        A new DataFrame with one row per key. Rows appear in order of the
        key's first occurrence and columns keep the order of ``df``. With an
        aggregating strategy the index is a fresh 0..n-1 range.
    """
    # Accept a bare column name: with a str, ``col not in key_columns`` would
    # be a substring test and silently skip columns such as 'na' for 'name'.
    keys = [key_columns] if isinstance(key_columns, str) else list(key_columns)

    if strategy in ('first', 'last'):
        return df.drop_duplicates(subset=keys, keep=strategy)

    numeric_cols = set(df.select_dtypes(include=[np.number]).columns)
    agg_plan = {
        col: (strategy if col in numeric_cols else 'first')
        for col in df.columns
        if col not in keys
    }

    # Nothing to aggregate when the frame consists of key columns only
    if not agg_plan:
        return df.drop_duplicates(subset=keys).reset_index(drop=True)

    # sort=False keeps first-occurrence order (like drop_duplicates);
    # dropna=False keeps rows whose key is missing instead of dropping them.
    result = df.groupby(keys, as_index=False, sort=False, dropna=False).agg(agg_plan)

    return result[list(df.columns)]

print("\n🚀 智能聚合处理（保留第一次出现的字符串，薪资取平均值）：")
# Collapse rows per name, averaging the numeric columns
merged_by_name = smart_handle_duplicates(duplicate_data, ['name'], 'mean')
print(merged_by_name)

高级骚操作：一行代码解决常见问题

1. 一行代码检测所有数据问题

python 复制代码

def get_data_quality_report(df):
    """Build a per-column data quality report.

    Args:
        df: DataFrame to inspect.

    Returns:
        A DataFrame indexed by the column names of ``df`` with these columns:
        dtype, missing count, missing percentage (rounded to 2 decimals),
        number of unique values, number of fully duplicated rows (a frame-wide
        figure repeated on every line) and a quality score derived from the
        missing ratio: 100 (none missing), 80 (< 10%), 60 (< 30%), else 30.
        An empty frame has nothing missing, so it scores 100 with a ratio of 0.
    """
    total_rows = len(df)
    missing_count = df.isnull().sum()

    if total_rows:
        missing_ratio = missing_count / total_rows
    else:
        # Avoid 0/0 -> NaN, which would be scored as the worst bucket
        missing_ratio = missing_count.astype(float)

    report = pd.DataFrame({
        '数据类型': df.dtypes,
        '缺失值数量': missing_count,
        '缺失值比例(%)': (missing_ratio * 100).round(2),
        '唯一值数量': df.nunique(),
        '重复值数量': df.duplicated().sum()
    })

    # Conditions are evaluated in order; the first match wins
    report['数据质量评分'] = np.select(
        [missing_ratio == 0, missing_ratio < 0.1, missing_ratio < 0.3],
        [100, 80, 60],
        default=30,
    )
    return report

print("数据质量报告：")
# Summarise the frame column by column
quality_report = get_data_quality_report(df)
print(quality_report)

2. 一行代码处理90%的数据清洗

python 复制代码

def auto_clean_data(df,
                   remove_duplicates=True,
                   duplicate_subset=None,
                   fill_missing=True,
                   fill_strategy='smart'):
    """Run a standard cleaning pipeline: de-duplicate, fill, reset the index.

    Args:
        df: DataFrame to clean. It is not modified.
        remove_duplicates: Whether to drop duplicate rows (first one is kept).
        duplicate_subset: Columns that identify a duplicate; None or an empty
            list means whole rows are compared.
        fill_missing: Whether to fill missing values.
        fill_strategy: One of 'smart' (delegates to ``smart_fillna``), 'mean',
            'median', 'mode', 'ffill', 'bfill'. 'mean' and 'median' only
            affect numeric columns.

    Returns:
        A cleaned copy of ``df`` with a fresh 0..n-1 index.

    Raises:
        ValueError: If ``fill_missing`` is true and ``fill_strategy`` is not
            one of the supported names.
    """
    df_clean = df.copy()

    # 1. Remove duplicates
    if remove_duplicates:
        subset = duplicate_subset if duplicate_subset else None
        df_clean = df_clean.drop_duplicates(subset=subset, keep='first')

    # 2. Fill missing values
    if fill_missing:
        if fill_strategy == 'smart':
            df_clean = smart_fillna(df_clean)
        elif fill_strategy == 'mean':
            # numeric_only avoids a TypeError when text columns are present
            df_clean = df_clean.fillna(df_clean.mean(numeric_only=True))
        elif fill_strategy == 'median':
            df_clean = df_clean.fillna(df_clean.median(numeric_only=True))
        elif fill_strategy == 'mode':
            modes = df_clean.mode()
            # mode() has no rows for an empty frame; nothing to fill then
            if not modes.empty:
                df_clean = df_clean.fillna(modes.iloc[0])
        elif fill_strategy == 'ffill':
            df_clean = df_clean.ffill()
        elif fill_strategy == 'bfill':
            df_clean = df_clean.bfill()
        else:
            raise ValueError(
                f"Unknown fill_strategy {fill_strategy!r}; expected one of "
                "'smart', 'mean', 'median', 'mode', 'ffill', 'bfill'"
            )

    # 3. Reset the index so it is contiguous after rows were dropped
    return df_clean.reset_index(drop=True)

# 🚀 One call runs the whole pipeline
df_auto_cleaned = auto_clean_data(
    df,
    remove_duplicates=True,
    fill_strategy='smart',
)
print("自动清洗结果：", df_auto_cleaned, sep="\n")

实战案例：10万行数据，3分钟搞定

python 复制代码

# 模拟真实的大数据场景
import time

def create_large_dirty_data(rows=100000, seed=42):
    """Generate a synthetic employee table with typical data-quality problems.

    The frame gets missing values (15% of the original rows in each of the
    three numeric columns), duplicated rows (5% of the rows are appended
    again) and outliers (ages multiplied by 10 for 2% of the original rows).

    Args:
        rows: Number of distinct employee rows before duplicates are appended.
        seed: Seed for the random generator; the same seed gives the same data.

    Returns:
        A DataFrame with ``rows + int(rows * 0.05)`` rows.
    """
    # A local legacy RandomState yields the same stream as np.random.seed(seed)
    # followed by np.random.* calls, without touching the global RNG state.
    rng = np.random.RandomState(seed)

    departments = ['技术部', '销售部', '人事部', '财务部', '市场部']
    names = [f'员工{i}' for i in range(1000)]

    df = pd.DataFrame({
        '员工ID': range(1, rows + 1),
        '姓名': rng.choice(names, rows, replace=True),
        '部门': rng.choice(departments, rows),
        '年龄': rng.normal(35, 8, rows),
        '薪资': rng.normal(8000, 2000, rows),
        '绩效评分': rng.uniform(60, 100, rows)
    })

    # Inject missing values
    for col in ['年龄', '薪资', '绩效评分']:
        missing_indices = rng.choice(rows, int(rows * 0.15), replace=False)
        df.loc[missing_indices, col] = np.nan

    # Inject duplicated rows by appending copies of existing ones
    duplicate_indices = rng.choice(rows, int(rows * 0.05), replace=False)
    df = pd.concat([df, df.iloc[duplicate_indices]], ignore_index=True)

    # Inject outliers; indices are drawn from the original (pre-append) rows
    outlier_indices = rng.choice(rows, int(rows * 0.02), replace=False)
    df.loc[outlier_indices, '年龄'] *= 10

    return df

# Benchmark
large_dirty_df = create_large_dirty_data(100000)
print(f"原始数据：{len(large_dirty_df)} 行")

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() can jump when the system clock is adjusted.
start_time = time.perf_counter()

# Run the automatic cleaning pipeline
cleaned_large_df = auto_clean_data(
    large_dirty_df,
    remove_duplicates=True,
    duplicate_subset=['员工ID'],  # de-duplicate on the employee ID
    fill_strategy='smart'
)

end_time = time.perf_counter()

print(f"清洗后数据：{len(cleaned_large_df)} 行")
print(f"处理时间：{end_time - start_time:.2f} 秒")
# This figure is the share of rows removed as duplicates, so label it as such
removed_rows = len(large_dirty_df) - len(cleaned_large_df)
print(f"去除重复行占比：{removed_rows / len(large_dirty_df) * 100:.1f}%")

总结：记住这几条原则

先看诊，再开药：永远先用isnull().sum()看看缺失值分布
区别对待：数值型和分类型数据要分开处理
业务优先：年龄不能填0，薪资不能乱填
时序特殊：时间序列数据优先考虑前向/后向填充或插值
重复慎删：先确定哪些字段是判断重复的依据

记住一句话：数据清洗不是技术活，是业务活。

下次再遇到脏数据，别慌。按照这套骚操作下来，10万行数据也就是一杯咖啡的时间。

你在项目中遇到过什么奇葩的数据清洗问题？评论区聊聊，看看谁的数据更脏？(￣▽￣)ノ