
scikit-learn实战与模型管理(Pipeline、模型保存)
一、scikit-learn统一API
1.1 核心API概览
python
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.datasets import load_iris, make_classification
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, classification_report
import joblib
import pickle
import warnings
warnings.filterwarnings('ignore')
# Demonstrate that every scikit-learn model exposes the same core API:
# fit / predict / score for estimators, fit / transform for transformers.
banner = "=" * 60
print(banner)
print("scikit-learn统一API:fit, predict, transform")
print(banner)
# Core API pattern shared by all scikit-learn objects.
print("""
📌 scikit-learn的统一API设计:
1. 估计器 (Estimator):
- fit(X, y): 训练模型
- predict(X): 预测
- score(X, y): 评估
2. 转换器 (Transformer):
- fit(X): 学习参数
- transform(X): 转换数据
- fit_transform(X): 拟合+转换
3. 所有模型遵循相同模式!
""")
# Example: three different algorithm families driven through one interface.
X, y = make_classification(n_samples=200, n_features=10, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
models = {
    '逻辑回归': LogisticRegression(max_iter=1000),
    '决策树': DecisionTreeClassifier(),
    '随机森林': RandomForestClassifier(),
}
print("\n统一API示例:")
for name, model in models.items():
    # Identical calls regardless of the underlying algorithm.
    model.fit(X_train, y_train)
    acc = model.score(X_test, y_test)
    print(f" {name}: 准确率 = {acc:.4f}")
二、Pipeline流水线
2.1 基础Pipeline
python
def pipeline_basics():
    """Demonstrate a minimal two-step Pipeline (scaler + classifier).

    Trains on a synthetic binary-classification task, reports train and
    test accuracy, and draws a simple diagram of the pipeline stages.

    Returns:
        The fitted Pipeline.
    """
    # Synthetic data and a simple train/test split.
    X, y = make_classification(n_samples=300, n_features=10, random_state=42)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

    # Standardize features, then fit a logistic-regression classifier.
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', LogisticRegression(max_iter=1000)),
    ])
    pipeline.fit(X_train, y_train)

    # Evaluate on both splits.
    train_acc = pipeline.score(X_train, y_train)
    test_acc = pipeline.score(X_test, y_test)
    banner = "=" * 60
    print("\n" + banner)
    print("Pipeline基础使用")
    print(banner)
    print(f"训练集准确率: {train_acc:.4f}")
    print(f"测试集准确率: {test_acc:.4f}")

    # Visualize the pipeline as a chain of stages with connecting arrows.
    fig, ax = plt.subplots(figsize=(10, 3))
    ax.axis('off')
    stages = ['原始数据', '标准化', '分类器', '预测']
    positions = np.linspace(0.1, 0.9, len(stages))
    for idx, (label, x) in enumerate(zip(stages, positions)):
        ax.add_patch(plt.Circle((x, 0.5), 0.08, color='lightblue', ec='black'))
        ax.text(x, 0.5, label, ha='center', va='center', fontsize=10)
        if idx < len(stages) - 1:
            # Arrow pointing toward the next stage.
            ax.annotate('', xy=(x + 0.18, 0.5), xytext=(x + 0.1, 0.5),
                        arrowprops=dict(arrowstyle='->', lw=2))
    ax.set_xlim(0, 1)
    ax.set_title('Pipeline工作流程', fontsize=12)
    plt.tight_layout()
    plt.show()
    return pipeline
pipeline = pipeline_basics()
2.2 复杂Pipeline:预处理+特征工程+建模
python
def advanced_pipeline():
    """Build a preprocessing + modeling Pipeline for mixed-type data.

    Numeric columns are median-imputed then standardized; categorical
    columns are constant-imputed then one-hot encoded.  A ColumnTransformer
    routes each column group to its sub-pipeline, and a random forest sits
    at the end.

    Returns:
        The fitted Pipeline.
    """
    # Fix: OneHotEncoder was referenced below but never imported anywhere
    # in the file, so this function raised NameError.  Import it locally so
    # the function is self-contained.
    # NOTE(review): `sparse_output=` requires scikit-learn >= 1.2; older
    # versions spell it `sparse=`.
    from sklearn.preprocessing import OneHotEncoder

    # Synthetic mixed-type dataset: 3 numeric + 2 categorical columns.
    np.random.seed(42)
    n_samples = 500
    data = pd.DataFrame({
        '年龄': np.random.randint(18, 80, n_samples),
        '收入': np.random.randint(30000, 150000, n_samples),
        '评分': np.random.uniform(1, 5, n_samples),
        '城市': np.random.choice(['北京', '上海', '广州', '深圳'], n_samples),
        '学历': np.random.choice(['本科', '硕士', '博士', '其他'], n_samples)
    })
    # Inject missing values so the imputers have work to do.
    data.loc[np.random.choice(n_samples, 50, replace=False), '年龄'] = np.nan
    data.loc[np.random.choice(n_samples, 30, replace=False), '评分'] = np.nan
    # NOTE(review): the target is a deterministic function of the '收入'
    # feature, so the task is trivially separable (label leakage).  Fine
    # for an API demo; never do this in a real project.
    y = (data['收入'] > 80000).astype(int)
    # Column groups for the ColumnTransformer.
    numeric_features = ['年龄', '收入', '评分']
    categorical_features = ['城市', '学历']
    # Numeric branch: fill NaNs with the median, then standardize.
    numeric_transformer = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])
    # Categorical branch: fill NaNs with a sentinel category, then one-hot
    # encode (drop='first' avoids perfectly collinear dummy columns).
    categorical_transformer = Pipeline([
        ('imputer', SimpleImputer(strategy='constant', fill_value='缺失')),
        ('onehot', OneHotEncoder(drop='first', sparse_output=False))
    ])
    # Route each column group to its branch.
    preprocessor = ColumnTransformer([
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])
    # Full pipeline: preprocessing + classifier in one object.
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
    ])
    # Split the raw DataFrame; the pipeline handles all preprocessing.
    X = data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    pipeline.fit(X_train, y_train)
    print("\n" + "=" * 60)
    print("复杂Pipeline:预处理 + 特征工程 + 建模")
    print("=" * 60)
    print(f"训练集准确率: {pipeline.score(X_train, y_train):.4f}")
    print(f"测试集准确率: {pipeline.score(X_test, y_test):.4f}")
    # Column count after imputation + scaling + one-hot expansion.
    X_preprocessed = pipeline.named_steps['preprocessor'].transform(X_train)
    print(f"预处理后特征数: {X_preprocessed.shape[1]}")
    return pipeline
advanced_pipeline()
2.3 Pipeline与交叉验证
python
def pipeline_cross_validation():
    """Cross-validate a Pipeline and tune it with GridSearchCV.

    Because the scaler lives inside the Pipeline, it is re-fit within each
    CV fold, so there is no train/test leakage during validation.

    Returns:
        The fitted GridSearchCV object.
    """
    X, y = make_classification(n_samples=300, n_features=10, random_state=42)
    estimator = Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', LogisticRegression(max_iter=1000)),
    ])

    # Plain 5-fold cross-validation on the whole Pipeline.
    fold_scores = cross_val_score(estimator, X, y, cv=5)
    banner = "=" * 60
    print("\n" + banner)
    print("Pipeline与交叉验证")
    print(banner)
    print(f"各折得分: {fold_scores}")
    print(f"平均得分: {fold_scores.mean():.4f} (+/- {fold_scores.std():.4f})")

    # Grid search over the classifier step; parameter keys use the
    # '<step_name>__<param>' convention.
    search_space = {
        'classifier__C': [0.1, 1, 10, 100],
        'classifier__solver': ['lbfgs', 'liblinear'],
    }
    grid_search = GridSearchCV(estimator, search_space, cv=5, scoring='accuracy')
    grid_search.fit(X, y)
    print(f"\n网格搜索最佳参数: {grid_search.best_params_}")
    print(f"最佳交叉验证得分: {grid_search.best_score_:.4f}")
    return grid_search
pipeline_cross_validation()
三、模型保存与加载
3.1 pickle模块
python
def pickle_save_load():
    """Persist a trained model with pickle and verify the round trip.

    Trains a random forest, writes it to 'model.pkl', reloads it, and
    checks that the reloaded model scores identically on the test set.
    """
    X, y = make_classification(n_samples=300, n_features=10, random_state=42)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

    # Train a reference model and record its test accuracy.
    forest = RandomForestClassifier(n_estimators=100, random_state=42)
    forest.fit(X_train, y_train)
    original_score = forest.score(X_test, y_test)

    banner = "=" * 60
    print("\n" + banner)
    print("pickle保存和加载模型")
    print(banner)

    # Serialize to disk — pickle is a binary format, hence 'wb'/'rb'.
    with open('model.pkl', 'wb') as fh:
        pickle.dump(forest, fh)
    print("模型已保存到 model.pkl")

    # Deserialize the model back from disk.
    with open('model.pkl', 'rb') as fh:
        restored = pickle.load(fh)
    print("模型已加载")

    # The restored model must behave exactly like the original.
    restored_score = restored.score(X_test, y_test)
    print(f"原始模型准确率: {original_score:.4f}")
    print(f"加载模型准确率: {restored_score:.4f}")
    print(f"结果一致: {np.allclose(original_score, restored_score)}")
pickle_save_load()
3.2 joblib模块(推荐)
python
def joblib_save_load():
    """Persist a whole Pipeline with joblib (recommended) and verify it.

    Saving the entire Pipeline keeps the scaler and the model together, so
    inference needs no separate preprocessing step.  Also compares on-disk
    sizes with the pickle file produced by ``pickle_save_load`` when that
    file exists.
    """
    X, y = make_classification(n_samples=300, n_features=10, random_state=42)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
    ])
    pipeline.fit(X_train, y_train)
    original_score = pipeline.score(X_test, y_test)
    print("\n" + "=" * 60)
    print("joblib保存和加载模型(推荐)")
    print("=" * 60)
    # Save and reload the whole Pipeline in one call each.
    joblib.dump(pipeline, 'pipeline.joblib')
    print("Pipeline已保存到 pipeline.joblib")
    loaded_pipeline = joblib.load('pipeline.joblib')
    print("Pipeline已加载")
    # The reloaded Pipeline must score identically.
    loaded_score = loaded_pipeline.score(X_test, y_test)
    print(f"原始Pipeline准确率: {original_score:.4f}")
    print(f"加载Pipeline准确率: {loaded_score:.4f}")
    # File-size comparison.
    # Fix: 'model.pkl' only exists if pickle_save_load() ran first — the
    # original code raised FileNotFoundError otherwise.  Guard the lookup.
    import os
    print(f"\n文件大小对比:")
    if os.path.exists('model.pkl'):
        print(f" pickle: {os.path.getsize('model.pkl') / 1024:.1f} KB")
    else:
        print(" pickle: model.pkl 不存在(请先运行 pickle_save_load)")
    print(f" joblib: {os.path.getsize('pipeline.joblib') / 1024:.1f} KB")
    # NOTE(review): the two files hold different objects (bare model vs.
    # full Pipeline), so this size comparison is only indicative.
    print(" joblib对NumPy数组更高效!")
joblib_save_load()
四、自定义转换器
4.1 创建自定义转换器
python
def custom_transformer():
    """Create a custom sklearn-compatible transformer and use it in a Pipeline.

    Fix: the original ``OutlierRemover.transform`` *dropped* rows outside
    mean ± threshold·std.  Pipeline steps must return the same number of
    samples they receive — dropping rows desynchronizes X from y, so the
    downstream classifier's ``fit`` fails with a sample-count mismatch.
    The transformer now *clips* (winsorizes) values to the learned bounds
    instead, which preserves the row count.
    """
    from sklearn.base import BaseEstimator, TransformerMixin

    class OutlierClipper(BaseEstimator, TransformerMixin):
        """Clip each feature to mean ± threshold·std learned at fit time."""

        def __init__(self, threshold=3):
            # Number of standard deviations defining the clipping bounds.
            self.threshold = threshold
            self.lower_bounds_ = None
            self.upper_bounds_ = None

        def fit(self, X, y=None):
            # Learn per-feature bounds from the training data only.
            X = np.asarray(X)
            mean = X.mean(axis=0)
            std = X.std(axis=0)
            self.lower_bounds_ = mean - self.threshold * std
            self.upper_bounds_ = mean + self.threshold * std
            return self

        def transform(self, X):
            # Clip instead of dropping rows: a Pipeline transformer must
            # preserve the number of samples.
            return np.clip(np.asarray(X), self.lower_bounds_, self.upper_bounds_)

    # Synthetic data with a few injected extreme values.
    X, y = make_classification(n_samples=500, n_features=2, random_state=42)
    X[:10] = X[:10] * 5
    pipeline = Pipeline([
        ('outlier_clipper', OutlierClipper(threshold=3)),
        ('scaler', StandardScaler()),
        ('classifier', LogisticRegression())
    ])
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
    pipeline.fit(X_train, y_train)
    print("\n" + "=" * 60)
    print("自定义转换器")
    print("=" * 60)
    print(f"原始训练集大小: {len(X_train)}")
    # Count how many training samples had at least one value clipped.
    X_clean = pipeline.named_steps['outlier_clipper'].transform(X_train)
    n_clipped = int((X_clean != X_train).any(axis=1).sum())
    print(f"被裁剪的样本数: {n_clipped}")
    print(f"测试集准确率: {pipeline.score(X_test, y_test):.4f}")
custom_transformer()
五、模型部署实战
5.1 完整项目流程
python
def complete_project():
    """End-to-end ML project on the breast-cancer dataset.

    Steps: load data -> split -> Pipeline (scaler + random forest) ->
    GridSearchCV tuning -> held-out evaluation -> save model -> 4-panel
    evaluation report (confusion matrix, ROC curve, feature importances,
    classification report).

    Returns:
        The best fitted Pipeline found by the grid search.
    """
    from sklearn.datasets import load_breast_cancer
    from sklearn.metrics import confusion_matrix, roc_curve, auc
    print("\n" + "=" * 60)
    print("完整机器学习项目流程")
    print("=" * 60)
    # Step 1: load data.  In load_breast_cancer, class 0 = malignant and
    # class 1 = benign (data.target_names == ['malignant', 'benign']).
    data = load_breast_cancer()
    X, y = data.data, data.target
    feature_names = data.feature_names
    print(f"Step 1: 加载数据")
    print(f" 样本数: {X.shape[0]}, 特征数: {X.shape[1]}")
    # Step 2: hold out 20% for the final evaluation.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    print(f"\nStep 2: 划分数据")
    print(f" 训练集: {len(X_train)} 样本")
    print(f" 测试集: {len(X_test)} 样本")
    # Step 3: scaler + random forest inside one Pipeline, so scaling is
    # re-fit within each CV fold (no leakage).
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
    ])
    # Step 4: grid search over the forest's main hyper-parameters.
    param_grid = {
        'classifier__n_estimators': [50, 100, 200],
        'classifier__max_depth': [5, 10, None],
        'classifier__min_samples_split': [2, 5, 10]
    }
    grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_train, y_train)
    print(f"\nStep 3-4: 模型训练与调优")
    print(f" 最佳参数: {grid_search.best_params_}")
    print(f" 最佳交叉验证得分: {grid_search.best_score_:.4f}")
    # Step 5: evaluate the refit best model on the held-out test set.
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)
    test_acc = accuracy_score(y_test, y_pred)
    print(f"\nStep 5: 模型评估")
    print(f" 测试集准确率: {test_acc:.4f}")
    # Step 6: persist the whole Pipeline (scaler included).
    joblib.dump(best_model, 'breast_cancer_model.joblib')
    print(f"\nStep 6: 保存模型")
    print(" 模型已保存到 breast_cancer_model.joblib")
    # ---- 4-panel evaluation report ----
    fig, axes = plt.subplots(2, 2, figsize=(12, 10))
    # Confusion matrix.
    # Fix: the original tick labels were ['良性', '恶性'], labeling class 0
    # as benign — but class 0 is malignant in this dataset (the
    # production_deployment() demo below uses the correct 0 -> 恶性 mapping).
    cm = confusion_matrix(y_test, y_pred)
    axes[0, 0].imshow(cm, cmap='Blues')
    axes[0, 0].set_xticks([0, 1])
    axes[0, 0].set_yticks([0, 1])
    axes[0, 0].set_xticklabels(['恶性', '良性'])
    axes[0, 0].set_yticklabels(['恶性', '良性'])
    for i in range(2):
        for j in range(2):
            axes[0, 0].text(j, i, cm[i, j], ha='center', va='center', fontsize=16)
    axes[0, 0].set_title('混淆矩阵')
    # ROC curve: predict_proba[:, 1] is P(class 1 = benign), which matches
    # roc_curve's default positive label of 1.
    y_proba = best_model.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, y_proba)
    roc_auc = auc(fpr, tpr)
    axes[0, 1].plot(fpr, tpr, 'b-', linewidth=2, label=f'AUC = {roc_auc:.3f}')
    axes[0, 1].plot([0, 1], [0, 1], 'r--', alpha=0.5)
    axes[0, 1].set_xlabel('假正率')
    axes[0, 1].set_ylabel('真正率')
    axes[0, 1].set_title('ROC曲线')
    axes[0, 1].legend()
    # Top-10 feature importances from the tuned forest.
    importances = best_model.named_steps['classifier'].feature_importances_
    top_idx = np.argsort(importances)[-10:]
    axes[1, 0].barh(range(10), importances[top_idx])
    axes[1, 0].set_yticks(range(10))
    axes[1, 0].set_yticklabels([feature_names[i] for i in top_idx])
    axes[1, 0].set_xlabel('特征重要性')
    axes[1, 0].set_title('Top 10 重要特征')
    # Classification report rendered as a monospace table.
    # Fix: target_names must follow class order, i.e. index 0 = 恶性
    # (malignant), index 1 = 良性 (benign).  The table rows below look up
    # metrics by name, so they stay correct.
    axes[1, 1].axis('off')
    report = classification_report(y_test, y_pred, target_names=['恶性', '良性'], output_dict=True)
    report_text = f"""
分类报告:
┌──────────┬──────────┬──────────┬──────────┐
│ │ 精确率 │ 召回率 │ F1分数 │
├──────────┼──────────┼──────────┼──────────┤
│ 良性 │ {report['良性']['precision']:.3f} │ {report['良性']['recall']:.3f} │ {report['良性']['f1-score']:.3f} │
├──────────┼──────────┼──────────┼──────────┤
│ 恶性 │ {report['恶性']['precision']:.3f} │ {report['恶性']['recall']:.3f} │ {report['恶性']['f1-score']:.3f} │
├──────────┼──────────┼──────────┼──────────┤
│ 准确率 │ │ │ {report['accuracy']:.3f} │
└──────────┴──────────┴──────────┴──────────┘
"""
    axes[1, 1].text(0.05, 0.95, report_text, transform=axes[1, 1].transAxes,
                    fontsize=10, verticalalignment='top', fontfamily='monospace')
    plt.suptitle('乳腺癌分类项目 - 完整评估报告', fontsize=14)
    plt.tight_layout()
    plt.show()
    return best_model
model = complete_project()
六、生产环境部署示例
python
def production_deployment():
    """Simulate serving: load the persisted model and score one new sample.

    Loads the Pipeline saved by complete_project(), predicts on a single
    synthetic patient record, and prints a sketch of a Flask endpoint that
    would serve the same model over HTTP.
    """
    banner = "=" * 60
    print("\n" + banner)
    print("生产环境部署示例")
    print(banner)

    # Restore the full Pipeline (scaler + classifier) from disk.
    deployed = joblib.load('breast_cancer_model.joblib')

    # One synthetic "patient" row with all 30 dataset features.
    sample = np.array([[
        17.99, 10.38, 122.8, 1001.0, 0.1184, 0.2776, 0.3001, 0.1471, 0.2419, 0.07871,
        1.095, 0.9053, 8.589, 153.4, 0.006399, 0.04904, 0.05373, 0.01587, 0.03003, 0.006193,
        25.38, 17.33, 184.6, 2019.0, 0.1622, 0.6656, 0.7119, 0.2654, 0.4601, 0.1189
    ]])

    # Class 0 = malignant, class 1 = benign; predict_proba columns follow
    # the same class order.
    pred = deployed.predict(sample)
    proba = deployed.predict_proba(sample)
    print(f"新病人预测:")
    print(f" 预测类别: {'恶性' if pred[0] == 0 else '良性'}")
    print(f" 良性概率: {proba[0][1]:.4f}")
    print(f" 恶性概率: {proba[0][0]:.4f}")

    # Sketch of the same model behind an HTTP endpoint (printed, not run).
    print("\n📡 模型API接口示例:")
    print("""
# Flask API示例
from flask import Flask, request, jsonify
import joblib
app = Flask(__name__)
model = joblib.load('breast_cancer_model.joblib')
@app.route('/predict', methods=['POST'])
def predict():
data = request.json['data']
prediction = model.predict([data])
probability = model.predict_proba([data])
return jsonify({
'prediction': int(prediction[0]),
'probability': probability[0].tolist()
})
if __name__ == '__main__':
app.run(port=5000)
""")
production_deployment()
七、总结
| 组件 | 作用 | 关键方法 |
|---|---|---|
| 估计器 | 机器学习模型 | fit(), predict(), score() |
| 转换器 | 数据预处理 | fit(), transform(), fit_transform() |
| Pipeline | 组合多个步骤 | fit(), predict() |
| ColumnTransformer | 处理混合类型 | fit_transform() |
模型保存方式对比:
| 方式 | 优点 | 缺点 | 适用场景 |
|---|---|---|---|
| pickle | Python原生 | 跨版本问题 | 临时保存 |
| joblib | 对NumPy高效 | 需要安装 | 生产部署 |
最佳实践:
- 始终使用Pipeline封装完整流程
- 用GridSearchCV进行超参数调优
- 交叉验证评估模型
- 保存整个Pipeline而非单个模型
- 版本控制模型文件