02-机器学习基础: 监督学习——scikit-learn实战与模型管理

scikit-learn实战与模型管理(Pipeline、模型保存)

一、scikit-learn统一API

1.1 核心API概览

python 复制代码
# Standard library
import pickle
import warnings

# Third-party: scientific stack
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Third-party: scikit-learn
from sklearn.compose import ColumnTransformer
from sklearn.datasets import load_iris, make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder
from sklearn.tree import DecisionTreeClassifier

warnings.filterwarnings('ignore')

SEP = "=" * 60
print(SEP)
print("scikit-learn统一API:fit, predict, transform")
print(SEP)

# The core API contract shared by every scikit-learn object.
print("""
📌 scikit-learn的统一API设计:

1. 估计器 (Estimator):
   - fit(X, y): 训练模型
   - predict(X): 预测
   - score(X, y): 评估

2. 转换器 (Transformer):
   - fit(X): 学习参数
   - transform(X): 转换数据
   - fit_transform(X): 拟合+转换

3. 所有模型遵循相同模式!
""")

# Demo: three different model families driven through the identical API.
X, y = make_classification(n_samples=200, n_features=10, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

models = {
    '逻辑回归': LogisticRegression(max_iter=1000),
    '决策树': DecisionTreeClassifier(),
    '随机森林': RandomForestClassifier()
}

print("\n统一API示例:")
for model_name, clf in models.items():
    clf.fit(X_train, y_train)
    test_acc = clf.score(X_test, y_test)
    print(f"  {model_name}: 准确率 = {test_acc:.4f}")

二、Pipeline流水线

2.1 基础Pipeline

python 复制代码
def pipeline_basics():
    """Demonstrate a minimal Pipeline: standardize, then classify.

    Trains a StandardScaler + LogisticRegression pipeline on synthetic
    classification data, prints train/test accuracy, and draws a simple
    diagram of the pipeline stages.

    Returns:
        The fitted sklearn Pipeline.
    """
    # Synthetic binary classification problem.
    X, y = make_classification(n_samples=300, n_features=10, random_state=42)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

    # Scaling and the classifier are chained into one estimator object.
    pipe = Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', LogisticRegression(max_iter=1000))
    ])

    pipe.fit(X_train, y_train)

    print("\n" + "=" * 60)
    print("Pipeline基础使用")
    print("=" * 60)
    print(f"训练集准确率: {pipe.score(X_train, y_train):.4f}")
    print(f"测试集准确率: {pipe.score(X_test, y_test):.4f}")

    # Draw the pipeline as labelled circles connected by arrows.
    fig, ax = plt.subplots(figsize=(10, 3))
    ax.axis('off')

    stage_labels = ['原始数据', '标准化', '分类器', '预测']
    xs = np.linspace(0.1, 0.9, len(stage_labels))

    for idx, (label, cx) in enumerate(zip(stage_labels, xs)):
        ax.add_patch(plt.Circle((cx, 0.5), 0.08, color='lightblue', ec='black'))
        ax.text(cx, 0.5, label, ha='center', va='center', fontsize=10)
        # Arrow to the next stage (none after the last circle).
        if idx < len(stage_labels) - 1:
            ax.annotate('', xy=(cx + 0.18, 0.5), xytext=(cx + 0.1, 0.5),
                        arrowprops=dict(arrowstyle='->', lw=2))

    ax.set_xlim(0, 1)
    ax.set_title('Pipeline工作流程', fontsize=12)
    plt.tight_layout()
    plt.show()

    return pipe

pipeline = pipeline_basics()

2.2 复杂Pipeline:预处理+特征工程+建模

python 复制代码
def advanced_pipeline():
    """复杂Pipeline演示"""
    
    # 创建混合类型数据
    np.random.seed(42)
    n_samples = 500
    
    data = pd.DataFrame({
        '年龄': np.random.randint(18, 80, n_samples),
        '收入': np.random.randint(30000, 150000, n_samples),
        '评分': np.random.uniform(1, 5, n_samples),
        '城市': np.random.choice(['北京', '上海', '广州', '深圳'], n_samples),
        '学历': np.random.choice(['本科', '硕士', '博士', '其他'], n_samples)
    })
    
    # 引入缺失值
    data.loc[np.random.choice(n_samples, 50, replace=False), '年龄'] = np.nan
    data.loc[np.random.choice(n_samples, 30, replace=False), '评分'] = np.nan
    
    # 生成目标变量
    y = (data['收入'] > 80000).astype(int)
    
    # 定义特征类型
    numeric_features = ['年龄', '收入', '评分']
    categorical_features = ['城市', '学历']
    
    # 创建预处理步骤
    numeric_transformer = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])
    
    categorical_transformer = Pipeline([
        ('imputer', SimpleImputer(strategy='constant', fill_value='缺失')),
        ('onehot', OneHotEncoder(drop='first', sparse_output=False))
    ])
    
    # 组合预处理器
    preprocessor = ColumnTransformer([
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])
    
    # 完整Pipeline
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
    ])
    
    # 划分数据
    X = data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    
    # 训练
    pipeline.fit(X_train, y_train)
    
    print("\n" + "=" * 60)
    print("复杂Pipeline:预处理 + 特征工程 + 建模")
    print("=" * 60)
    print(f"训练集准确率: {pipeline.score(X_train, y_train):.4f}")
    print(f"测试集准确率: {pipeline.score(X_test, y_test):.4f}")
    
    # 查看预处理后的特征数
    X_preprocessed = pipeline.named_steps['preprocessor'].transform(X_train)
    print(f"预处理后特征数: {X_preprocessed.shape[1]}")
    
    return pipeline

advanced_pipeline()

2.3 Pipeline与交叉验证

python 复制代码
def pipeline_cross_validation():
    """Cross-validate a Pipeline and tune it with GridSearchCV.

    Runs 5-fold cross-validation on a scaler + logistic-regression
    pipeline, then grid-searches the classifier's C and solver.

    Returns:
        The fitted GridSearchCV object.
    """
    X, y = make_classification(n_samples=300, n_features=10, random_state=42)

    pipe = Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', LogisticRegression(max_iter=1000))
    ])

    # Cross-validating the whole pipeline re-fits the scaler inside each
    # fold, so no information leaks from validation folds into training.
    fold_scores = cross_val_score(pipe, X, y, cv=5)

    header = "=" * 60
    print("\n" + header)
    print("Pipeline与交叉验证")
    print(header)
    print(f"各折得分: {fold_scores}")
    print(f"平均得分: {fold_scores.mean():.4f} (+/- {fold_scores.std():.4f})")

    # Grid search: the 'step__param' syntax addresses parameters of a
    # named step inside the pipeline.
    search_space = {
        'classifier__C': [0.1, 1, 10, 100],
        'classifier__solver': ['lbfgs', 'liblinear']
    }

    grid_search = GridSearchCV(pipe, search_space, cv=5, scoring='accuracy')
    grid_search.fit(X, y)

    print(f"\n网格搜索最佳参数: {grid_search.best_params_}")
    print(f"最佳交叉验证得分: {grid_search.best_score_:.4f}")

    return grid_search

pipeline_cross_validation()

三、模型保存与加载

3.1 pickle模块

python 复制代码
def pickle_save_load():
    """Round-trip a trained model through pickle and verify scores match.

    Trains a random forest, serializes it to model.pkl with the stdlib
    pickle module, reloads it, and confirms the reloaded model scores
    identically on the held-out test set.
    """
    X, y = make_classification(n_samples=300, n_features=10, random_state=42)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

    clf = RandomForestClassifier(n_estimators=100, random_state=42)
    clf.fit(X_train, y_train)
    original_score = clf.score(X_test, y_test)

    banner = "=" * 60
    print("\n" + banner)
    print("pickle保存和加载模型")
    print(banner)

    # Serialize to disk with Python's built-in pickle.
    with open('model.pkl', 'wb') as fh:
        pickle.dump(clf, fh)
    print("模型已保存到 model.pkl")

    # Deserialize and confirm we got an equivalent model back.
    with open('model.pkl', 'rb') as fh:
        restored = pickle.load(fh)
    print("模型已加载")

    loaded_score = restored.score(X_test, y_test)
    print(f"原始模型准确率: {original_score:.4f}")
    print(f"加载模型准确率: {loaded_score:.4f}")
    print(f"结果一致: {np.allclose(original_score, loaded_score)}")

pickle_save_load()

3.2 joblib模块(推荐)

python 复制代码
def joblib_save_load():
    """Round-trip a whole Pipeline through joblib (the recommended way).

    Trains a scaler + random-forest pipeline, saves it with joblib,
    reloads it, and verifies the reloaded pipeline scores identically.
    Also compares on-disk size against the pickle file when available.
    """
    import os

    X, y = make_classification(n_samples=300, n_features=10, random_state=42)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

    # Persisting the whole pipeline keeps preprocessing and model as one
    # artifact — the recommended deployment unit.
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
    ])

    pipeline.fit(X_train, y_train)
    original_score = pipeline.score(X_test, y_test)

    print("\n" + "=" * 60)
    print("joblib保存和加载模型(推荐)")
    print("=" * 60)

    # Save the entire pipeline as one artifact.
    joblib.dump(pipeline, 'pipeline.joblib')
    print("Pipeline已保存到 pipeline.joblib")

    loaded_pipeline = joblib.load('pipeline.joblib')
    print("Pipeline已加载")

    loaded_score = loaded_pipeline.score(X_test, y_test)
    print(f"原始Pipeline准确率: {original_score:.4f}")
    print(f"加载Pipeline准确率: {loaded_score:.4f}")

    # Fix: the original unconditionally read model.pkl (created only by
    # the separate pickle demo), crashing with FileNotFoundError when
    # that demo had not been run. Guard the comparison instead.
    print(f"\n文件大小对比:")
    if os.path.exists('model.pkl'):
        print(f"  pickle: {os.path.getsize('model.pkl') / 1024:.1f} KB")
    else:
        print("  pickle: model.pkl 不存在(请先运行 pickle 示例)")
    print(f"  joblib: {os.path.getsize('pipeline.joblib') / 1024:.1f} KB")
    print("  joblib对NumPy数组更高效!")

joblib_save_load()

四、自定义转换器

4.1 创建自定义转换器

python 复制代码
def custom_transformer():
    """创建自定义转换器"""
    
    from sklearn.base import BaseEstimator, TransformerMixin
    
    class OutlierRemover(BaseEstimator, TransformerMixin):
        """自定义异常值移除器"""
        
        def __init__(self, threshold=3):
            self.threshold = threshold
            self.lower_bounds_ = None
            self.upper_bounds_ = None
        
        def fit(self, X, y=None):
            # 计算每个特征的上下界
            self.lower_bounds_ = []
            self.upper_bounds_ = []
            for i in range(X.shape[1]):
                mean = np.mean(X[:, i])
                std = np.std(X[:, i])
                self.lower_bounds_.append(mean - self.threshold * std)
                self.upper_bounds_.append(mean + self.threshold * std)
            return self
        
        def transform(self, X):
            # 标记异常值
            mask = np.ones(X.shape[0], dtype=bool)
            for i in range(X.shape[1]):
                mask &= (X[:, i] >= self.lower_bounds_[i]) & (X[:, i] <= self.upper_bounds_[i])
            return X[mask]
    
    # 测试自定义转换器
    X, y = make_classification(n_samples=500, n_features=2, random_state=42)
    
    # 添加异常值
    X[:10] = X[:10] * 5
    
    pipeline = Pipeline([
        ('outlier_remover', OutlierRemover(threshold=3)),
        ('scaler', StandardScaler()),
        ('classifier', LogisticRegression())
    ])
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
    pipeline.fit(X_train, y_train)
    
    print("\n" + "=" * 60)
    print("自定义转换器")
    print("=" * 60)
    print(f"原始训练集大小: {len(X_train)}")
    
    # 查看移除异常值后的样本数
    X_clean = pipeline.named_steps['outlier_remover'].transform(X_train)
    print(f"移除异常值后: {len(X_clean)}")
    print(f"测试集准确率: {pipeline.score(X_test, y_test):.4f}")

custom_transformer()

五、模型部署实战

5.1 完整项目流程

python 复制代码
def complete_project():
    """End-to-end ML project: load, split, tune, evaluate, save, visualize.

    Uses the breast-cancer dataset. NOTE: load_breast_cancer encodes the
    target as 0 = malignant (恶性), 1 = benign (良性); the original code
    had these labels swapped in the confusion-matrix tick labels and in
    classification_report's target_names, which this version fixes.

    Returns:
        The best fitted Pipeline found by GridSearchCV.
    """
    from sklearn.datasets import load_breast_cancer
    from sklearn.metrics import confusion_matrix, roc_curve, auc

    print("\n" + "=" * 60)
    print("完整机器学习项目流程")
    print("=" * 60)

    # Step 1: load data.
    data = load_breast_cancer()
    X, y = data.data, data.target
    feature_names = data.feature_names
    # Class names in label order: 0 = malignant (恶性), 1 = benign (良性).
    class_names = ['恶性', '良性']

    print(f"Step 1: 加载数据")
    print(f"  样本数: {X.shape[0]}, 特征数: {X.shape[1]}")

    # Step 2: train/test split.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    print(f"\nStep 2: 划分数据")
    print(f"  训练集: {len(X_train)} 样本")
    print(f"  测试集: {len(X_test)} 样本")

    # Step 3: pipeline = scaling + random forest.
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
    ])

    # Step 4: cross-validated grid search over the forest's key knobs.
    param_grid = {
        'classifier__n_estimators': [50, 100, 200],
        'classifier__max_depth': [5, 10, None],
        'classifier__min_samples_split': [2, 5, 10]
    }

    grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_train, y_train)

    print(f"\nStep 3-4: 模型训练与调优")
    print(f"  最佳参数: {grid_search.best_params_}")
    print(f"  最佳交叉验证得分: {grid_search.best_score_:.4f}")

    # Step 5: evaluate on the held-out test set.
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)
    test_acc = accuracy_score(y_test, y_pred)

    print(f"\nStep 5: 模型评估")
    print(f"  测试集准确率: {test_acc:.4f}")

    # Step 6: persist the tuned pipeline as the deployment artifact.
    joblib.dump(best_model, 'breast_cancer_model.joblib')
    print(f"\nStep 6: 保存模型")
    print("  模型已保存到 breast_cancer_model.joblib")

    # Visualization: confusion matrix, ROC, importances, text report.
    fig, axes = plt.subplots(2, 2, figsize=(12, 10))

    # Confusion matrix — rows/cols follow label order 0, 1 = 恶性, 良性.
    cm = confusion_matrix(y_test, y_pred)
    axes[0, 0].imshow(cm, cmap='Blues')
    axes[0, 0].set_xticks([0, 1])
    axes[0, 0].set_yticks([0, 1])
    axes[0, 0].set_xticklabels(class_names)
    axes[0, 0].set_yticklabels(class_names)
    for i in range(2):
        for j in range(2):
            axes[0, 0].text(j, i, cm[i, j], ha='center', va='center', fontsize=16)
    axes[0, 0].set_title('混淆矩阵')

    # ROC curve, scored on the probability of class 1 (良性).
    y_proba = best_model.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, y_proba)
    roc_auc = auc(fpr, tpr)
    axes[0, 1].plot(fpr, tpr, 'b-', linewidth=2, label=f'AUC = {roc_auc:.3f}')
    axes[0, 1].plot([0, 1], [0, 1], 'r--', alpha=0.5)
    axes[0, 1].set_xlabel('假正率')
    axes[0, 1].set_ylabel('真正率')
    axes[0, 1].set_title('ROC曲线')
    axes[0, 1].legend()

    # Top-10 feature importances from the tuned random forest.
    importances = best_model.named_steps['classifier'].feature_importances_
    top_idx = np.argsort(importances)[-10:]
    axes[1, 0].barh(range(10), importances[top_idx])
    axes[1, 0].set_yticks(range(10))
    axes[1, 0].set_yticklabels([feature_names[i] for i in top_idx])
    axes[1, 0].set_xlabel('特征重要性')
    axes[1, 0].set_title('Top 10 重要特征')

    # Classification report rendered as monospace text. target_names
    # must follow label order: 0 = 恶性, 1 = 良性 (fixed from original).
    axes[1, 1].axis('off')
    report = classification_report(y_test, y_pred, target_names=class_names, output_dict=True)
    report_text = f"""
    分类报告:
    ┌──────────┬──────────┬──────────┬──────────┐
    │          │ 精确率   │ 召回率   │ F1分数   │
    ├──────────┼──────────┼──────────┼──────────┤
    │ 恶性     │ {report['恶性']['precision']:.3f}    │ {report['恶性']['recall']:.3f}    │ {report['恶性']['f1-score']:.3f}    │
    ├──────────┼──────────┼──────────┼──────────┤
    │ 良性     │ {report['良性']['precision']:.3f}    │ {report['良性']['recall']:.3f}    │ {report['良性']['f1-score']:.3f}    │
    ├──────────┼──────────┼──────────┼──────────┤
    │ 准确率   │          │          │ {report['accuracy']:.3f}    │
    └──────────┴──────────┴──────────┴──────────┘
    """
    axes[1, 1].text(0.05, 0.95, report_text, transform=axes[1, 1].transAxes,
                    fontsize=10, verticalalignment='top', fontfamily='monospace')

    plt.suptitle('乳腺癌分类项目 - 完整评估报告', fontsize=14)
    plt.tight_layout()
    plt.show()

    return best_model

model = complete_project()

六、生产环境部署示例

python 复制代码
def production_deployment():
    """Simulate production inference: load the saved model and predict.

    Fix: the original unconditionally called joblib.load, which raises
    FileNotFoundError when the training step has not yet produced
    breast_cancer_model.joblib; it now degrades gracefully.

    Label convention (load_breast_cancer): 0 = malignant (恶性),
    1 = benign (良性); predict_proba columns follow that same order.
    """
    import os

    print("\n" + "=" * 60)
    print("生产环境部署示例")
    print("=" * 60)

    # Guard: the model artifact is produced by complete_project().
    if not os.path.exists('breast_cancer_model.joblib'):
        print("未找到 breast_cancer_model.joblib,请先运行训练流程。")
        return

    loaded_model = joblib.load('breast_cancer_model.joblib')

    # One synthetic "new patient" with the dataset's 30 features.
    new_patient = np.array([[
        17.99, 10.38, 122.8, 1001.0, 0.1184, 0.2776, 0.3001, 0.1471, 0.2419, 0.07871,
        1.095, 0.9053, 8.589, 153.4, 0.006399, 0.04904, 0.05373, 0.01587, 0.03003, 0.006193,
        25.38, 17.33, 184.6, 2019.0, 0.1622, 0.6656, 0.7119, 0.2654, 0.4601, 0.1189
    ]])

    prediction = loaded_model.predict(new_patient)
    probability = loaded_model.predict_proba(new_patient)

    print(f"新病人预测:")
    print(f"  预测类别: {'恶性' if prediction[0] == 0 else '良性'}")
    print(f"  良性概率: {probability[0][1]:.4f}")
    print(f"  恶性概率: {probability[0][0]:.4f}")

    # Sketch of how the same artifact would sit behind a Flask endpoint.
    print("\n📡 模型API接口示例:")
    print("""
    # Flask API示例
    from flask import Flask, request, jsonify
    import joblib
    
    app = Flask(__name__)
    model = joblib.load('breast_cancer_model.joblib')
    
    @app.route('/predict', methods=['POST'])
    def predict():
        data = request.json['data']
        prediction = model.predict([data])
        probability = model.predict_proba([data])
        return jsonify({
            'prediction': int(prediction[0]),
            'probability': probability[0].tolist()
        })
    
    if __name__ == '__main__':
        app.run(port=5000)
    """)

production_deployment()

七、总结

组件 作用 关键方法
估计器 机器学习模型 fit(), predict(), score()
转换器 数据预处理 fit(), transform(), fit_transform()
Pipeline 组合多个步骤 fit(), predict()
ColumnTransformer 处理混合类型 fit_transform()

模型保存方式对比:

方式 优点 缺点 适用场景
pickle Python原生 跨版本问题 临时保存
joblib 对NumPy高效 需要安装 生产部署

最佳实践:

  1. 始终使用Pipeline封装完整流程
  2. 用GridSearchCV进行超参数调优
  3. 交叉验证评估模型
  4. 保存整个Pipeline而非单个模型
  5. 版本控制模型文件
相关推荐
handler012 小时前
拒绝权限报错!三分钟掌握 Linux 权限管理
linux·c语言·c++·笔记·学习
hipolymers4 小时前
C语言怎么样?难学吗?
c语言·数据结构·学习·算法·编程
xiaotao1315 小时前
03-深度学习基础:循环神经网络(RNN)
人工智能·深度学习·机器学习
richxu202510015 小时前
嵌入式学习之路->stm32篇->(11)SPI通信(下)
stm32·嵌入式硬件·学习
xuhaoyu_cpp_java5 小时前
连接池学习
数据库·经验分享·笔记·学习
GHL2842710907 小时前
Agent相关问题整理学习
学习·ai
qq_429499577 小时前
恒流源学习
学习
小糖学代码7 小时前
LLM系列:2.pytorch入门:3.基本优化思想与最小二乘法
人工智能·python·算法·机器学习·ai·数据挖掘·最小二乘法