机器学习特征工程:从原始数据到模型输入
1. 引言
"数据和特征决定了机器学习的上限。" 好的特征工程可以让简单模型超越复杂模型。
特征工程流程:
原始数据 → 数据清洗 → 特征提取 → 特征变换 → 特征选择 → 模型输入
2. 数值特征处理
2.1 标准化与归一化
python
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
# Standardization: rescale each feature to zero mean and unit variance
# (best suited to roughly normally distributed features).
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# Min-max normalization: rescale each feature to the [0, 1] range.
scaler = MinMaxScaler()
X_normalized = scaler.fit_transform(X)
# Robust scaling: center on the median and scale by the interquartile range,
# which makes it a better fit for data with outliers.
# NOTE: `scaler` is rebound each time, so only this last instance stays bound.
scaler = RobustScaler()
X_robust = scaler.fit_transform(X)
2.2 对数变换
python
import numpy as np
# Compress right-skewed distributions (e.g. income, house prices).
X_log = np.log1p(X) # log(1 + x), which avoids log(0) when x is 0
# Box-Cox transform; X is shifted by 1 before the call.
# NOTE(review): scipy's boxcox expects strictly positive 1-D input — confirm
# that X + 1 satisfies this for the data at hand.
from scipy.stats import boxcox
X_boxcox, lambda_opt = boxcox(X + 1)
# Yeo-Johnson transform (also supports negative values).
from sklearn.preprocessing import PowerTransformer
pt = PowerTransformer(method='yeo-johnson')
X_yeojohnson = pt.fit_transform(X)
3. 类别特征编码
3.1 常用编码
python
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, OneHotEncoder
# Ordinal encoding (ordered categories).
# The category order is passed explicitly: LabelEncoder is intended for target
# labels and sorts its classes, so it would not map 低/中/高 to 0/1/2.
oe = OrdinalEncoder(categories=[["低", "中", "高"]])
encoded = oe.fit_transform([["低"], ["中"], ["高"], ["中"]]) # [[0.], [1.], [2.], [1.]]
# One-hot encoding (unordered categories).
# `sparse_output` replaces the `sparse` argument, which was deprecated in
# scikit-learn 1.2 and removed in 1.4.
ohe = OneHotEncoder(sparse_output=False)
encoded = ohe.fit_transform([["红"], ["蓝"], ["红"], ["绿"]])
# Columns follow the sorted categories 红, 绿, 蓝:
# [[1,0,0], [0,0,1], [1,0,0], [0,1,0]]
# Target encoding (high-cardinality categories).
from category_encoders import TargetEncoder
te = TargetEncoder()
encoded = te.fit_transform(X_categorical, y)
3.2 高基数类别处理
python
# Frequency encoding: replace each city with its relative frequency in X.
freq = X['city'].value_counts(normalize=True)
X['city_freq'] = X['city'].map(freq)
# Target encoding with smoothing (smoothing=10) to reduce overfitting.
# NOTE(review): this call fits and transforms the same rows; no
# cross-validation is performed here. For out-of-fold encoding, fit the
# encoder on training folds only and transform the held-out fold.
from category_encoders import TargetEncoder
te = TargetEncoder(smoothing=10)
X['city_target'] = te.fit_transform(X['city'], y)
# Embedding encoding (deep learning): a learnable lookup table that maps each
# of 100 category ids to an 8-dimensional dense vector.
import torch.nn as nn
# nn.Embedding calls its table-size argument `num_embeddings`;
# `num_categories` is not an accepted keyword and raises TypeError.
embedding = nn.Embedding(num_embeddings=100, embedding_dim=8)
4. 时间特征
python
import pandas as pd
def extract_time_features(df, time_col):
    """Add calendar, cyclical and boolean features derived from a time column.

    The column is parsed with ``pd.to_datetime`` and written back, then the
    feature columns are added to ``df`` itself.

    Args:
        df: DataFrame containing ``time_col``. It is modified in place.
        time_col: Name of the column holding the timestamps.

    Returns:
        The same ``df`` object, with the new columns added.
    """
    df[time_col] = pd.to_datetime(df[time_col])
    ts = df[time_col].dt

    # Basic calendar components
    df['year'] = ts.year
    df['month'] = ts.month
    df['day'] = ts.day
    df['hour'] = ts.hour
    df['minute'] = ts.minute
    df['dayofweek'] = ts.dayofweek  # 0 = Monday
    df['dayofyear'] = ts.dayofyear
    df['week'] = ts.isocalendar().week

    # Cyclical (sin/cos) encoding: each value is placed on a circle whose
    # period is the cycle length (12 months, 24 hours, 7 days).
    df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
    df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)
    df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24)
    df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24)
    df['dayofweek_sin'] = np.sin(2 * np.pi * df['dayofweek'] / 7)
    df['dayofweek_cos'] = np.cos(2 * np.pi * df['dayofweek'] / 7)

    # Boolean flags stored as 0/1 integers
    df['is_weekend'] = df['dayofweek'].isin([5, 6]).astype(int)
    df['is_month_start'] = ts.is_month_start.astype(int)
    df['is_month_end'] = ts.is_month_end.astype(int)
    return df
5. 文本特征
python
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import AutoTokenizer, AutoModel
# TF-IDF over unigrams and bigrams, capped at 10,000 features.
tfidf = TfidfVectorizer(max_features=10000, ngram_range=(1, 2))
X_tfidf = tfidf.fit_transform(texts)
# Sentence-BERT embeddings
# `torch` itself must be imported: `import torch.nn as nn` binds only `nn`,
# so `torch.no_grad()` would otherwise raise NameError.
import torch
# The Hugging Face Hub id carries the organization prefix; the bare name
# "all-MiniLM-L6-v2" is a shorthand understood by the sentence-transformers
# library, not by AutoTokenizer/AutoModel.
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
def get_embedding(text):
    """Return a mean-pooled embedding of ``text`` as a NumPy array.

    The text is tokenized (truncated to 512 tokens), run through the model
    without gradient tracking, and the last hidden states are averaged over
    the token dimension.

    Args:
        text: Input passed to the tokenizer.
            NOTE(review): no padding is requested, so this presumably expects
            a single string rather than a batch — confirm against callers.

    Returns:
        The pooled embedding after ``squeeze()``, converted with ``.numpy()``.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        output = model(**inputs)
    return output.last_hidden_state.mean(dim=1).squeeze().numpy()
6. 交互特征
python
# Arithmetic combinations of two features.
# The 1e-8 added to the denominator avoids dividing by exactly zero when
# feature_b is 0.
X['ratio'] = X['feature_a'] / (X['feature_b'] + 1e-8)
X['product'] = X['feature_a'] * X['feature_b']
X['difference'] = X['feature_a'] - X['feature_b']
# Polynomial features: degree 2 with interaction_only=True, i.e. products of
# distinct features without squared terms.
# NOTE: X already contains the ratio/product/difference columns added above.
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree=2, interaction_only=True)
X_poly = poly.fit_transform(X)
# Binning: map ages to labelled groups.
# include_lowest=True closes the first bin on the left, so an age of exactly 0
# falls into the first group instead of becoming NaN.
# NOTE: ages above 100 are outside every bin and still map to NaN.
X['age_bin'] = pd.cut(X['age'], bins=[0, 18, 35, 50, 65, 100],
                      labels=['少年', '青年', '中年', '老年', '高龄'],
                      include_lowest=True)
7. 特征选择
7.1 过滤法
python
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif
# Variance filter: drop features whose variance is not above the 0.01
# threshold (near-constant features, not only exactly-constant ones).
from sklearn.feature_selection import VarianceThreshold
selector = VarianceThreshold(threshold=0.01)
X_filtered = selector.fit_transform(X)
# Correlation filter: keep the names of features whose absolute correlation
# with y exceeds 0.05.
correlations = X.corrwith(y).abs()
selected = correlations[correlations > 0.05].index
# SelectKBest: keep the 50 features with the highest f_classif scores.
selector = SelectKBest(f_classif, k=50)
X_selected = selector.fit_transform(X, y)
7.2 包装法
python
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
# Recursive feature elimination: repeatedly fit the model and drop 5 features
# per round (step=5) until 20 remain.
model = RandomForestClassifier(n_estimators=100)
rfe = RFE(model, n_features_to_select=20, step=5)
X_selected = rfe.fit_transform(X, y)
# rfe.support_ is a boolean mask over the input columns marking the kept ones.
print(f"选中的特征: {X.columns[rfe.support_].tolist()}")
7.3 嵌入法
python
from sklearn.ensemble import GradientBoostingClassifier
# Feature importances from a tree-based model.
model = GradientBoostingClassifier()
model.fit(X, y)
# Label each importance with its column name, then keep the 20 largest.
importances = pd.Series(model.feature_importances_, index=X.columns)
top_features = importances.nlargest(20)
print(top_features)
8. 特征工程 Pipeline
python
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
# Columns grouped by the preprocessing they receive
numeric_features = ['age', 'income', 'score']
categorical_features = ['city', 'gender', 'education']
# Numeric columns are standardized; categorical columns are one-hot encoded,
# with categories unseen during fit ignored at transform time.
numeric_pipeline = Pipeline(steps=[('scaler', StandardScaler())])
categorical_pipeline = Pipeline(
    steps=[('encoder', OneHotEncoder(handle_unknown='ignore'))]
)
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_pipeline, numeric_features),
    ('cat', categorical_pipeline, categorical_features),
])
# End-to-end pipeline: preprocessing -> univariate selection -> classifier.
# NOTE(review): k=50 assumes preprocessing yields at least 50 columns —
# confirm for the actual data.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('feature_selection', SelectKBest(f_classif, k=50)),
    ('classifier', GradientBoostingClassifier()),
])
pipeline.fit(X_train, y_train)
score = pipeline.score(X_test, y_test)
9. 总结
特征工程的核心:
- 数值特征:标准化/归一化是基础,对数变换处理偏态
- 类别特征:低基数用 One-Hot,高基数用目标编码
- 时间特征:对线性模型和神经网络,周期编码(sin/cos)通常比直接用数值更好,因为它能让周期首尾(如 23 点与 0 点)保持相邻;树模型通常直接使用原始数值即可
- 特征选择:先过滤(快速),再包装(精确),最后嵌入(模型驱动)
- Pipeline:把所有步骤封装为可复现的流水线