MLOps 实战：实验追踪、模型注册与自动化 Pipeline

1. 引言

MLOps = Machine Learning + DevOps。它弥合了从"笔记本里的模型"到"生产环境中的服务"之间的鸿沟。

MLOps 成熟度：

级别	描述	工具
L0	手动训练、手动部署	Jupyter + 手动操作
L1	实验追踪、模型版本管理	MLflow
L2	自动化训练 Pipeline	Kubeflow/Airflow
L3	持续训练、A/B 测试	完整 MLOps 平台

2. MLflow 实验追踪

2.1 基础使用

python 复制代码

import mlflow
import mlflow.sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score

# Pick the experiment that the run below is filed under.
mlflow.set_experiment("classification_experiment")

with mlflow.start_run(run_name="rf_baseline"):
    # Hyperparameters: logged first, then passed unchanged to the estimator.
    params = dict(n_estimators=100, max_depth=10, min_samples_split=5)
    mlflow.log_params(params)

    # Fit the baseline. X_train / y_train are not defined in this snippet
    # and are expected to exist already.
    model = RandomForestClassifier(random_state=42, **params)
    model.fit(X_train, y_train)

    # Score the fitted model on X_test / y_test.
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')

    # Record both metrics in a single call.
    mlflow.log_metrics({"accuracy": accuracy, "f1_score": f1})

    # Store the fitted estimator under the "model" artifact path.
    mlflow.sklearn.log_model(model, "model")

    # Attach an extra file to the run. NOTE: this snippet does not create
    # confusion_matrix.png, so it must already be in the working directory.
    mlflow.log_artifact("confusion_matrix.png")

    print(f"Accuracy: {accuracy:.4f}, F1: {f1:.4f}")

# Start the UI with: mlflow ui --port 5000

2.2 自动记录

python 复制代码

# Turn on automatic logging for scikit-learn training calls.
mlflow.sklearn.autolog()

# Train as usual; no explicit log_* calls are made in this snippet.
# fit() returns the estimator itself, so construction and fitting can be chained.
model = RandomForestClassifier(n_estimators=200).fit(X_train, y_train)

2.3 模型注册

python 复制代码

from mlflow import MlflowClient

# Register the model logged by an earlier run. `run_id` is not defined in
# this snippet; it must hold the ID of the run that logged the "model" artifact.
model_uri = f"runs:/{run_id}/model"
registered = mlflow.register_model(model_uri, "ProductionClassifier")

# Stage management goes through the registry client.
client = MlflowClient()

# Promote the version that was just registered. Using the returned version
# number (instead of a hard-coded 1) keeps this correct when the snippet is
# re-run and the registry assigns version 2, 3, ...
client.transition_model_version_stage(
    name="ProductionClassifier",
    version=registered.version,
    stage="Production",
)

# Load by stage only after the promotion above: resolving the
# "models:/<name>/Production" URI requires a version in that stage.
model = mlflow.sklearn.load_model("models:/ProductionClassifier/Production")

3. Kubeflow Pipeline

3.1 定义 Pipeline

python 复制代码

from kfp import dsl, compiler
from kfp.dsl import component, pipeline

@component(
    base_image="python:3.9",
    packages_to_install=["scikit-learn", "pandas"],
)
def preprocess_data(
    data_path: str,
    output_path: dsl.OutputPath(),
):
    """Standardize every feature column of a CSV and write the result.

    Args:
        data_path: Location of the input CSV; it must contain a 'target' column.
        output_path: File path the processed CSV is written to.

    The 'target' column is copied through unscaled; all other columns are
    passed through StandardScaler fitted on the same data.
    """
    import pandas as pd
    from sklearn.preprocessing import StandardScaler

    df = pd.read_csv(data_path)

    # Take the column labels from the frame that is actually scaled, so the
    # result is labelled correctly wherever 'target' sits in the file
    # (slicing df.columns[:-1] is only right when 'target' is the last column).
    features = df.drop('target', axis=1)
    scaled = StandardScaler().fit_transform(features)

    df_processed = pd.DataFrame(scaled, columns=features.columns, index=df.index)
    df_processed['target'] = df['target']
    df_processed.to_csv(output_path, index=False)

@component(
    # pandas and joblib are imported directly in the body, so list them
    # explicitly instead of listing scikit-learn alone.
    packages_to_install=["scikit-learn", "pandas", "joblib"],
)
def train_model(
    data_path: dsl.InputPath(),
    model_path: dsl.OutputPath(),
    n_estimators: int = 100,
):
    """Fit a random forest on a CSV and save it with joblib.

    Args:
        data_path: CSV to train on; it must contain a 'target' column.
        model_path: File path the fitted model is dumped to.
        n_estimators: Number of trees passed to RandomForestClassifier.
    """
    import pandas as pd
    from sklearn.ensemble import RandomForestClassifier
    import joblib

    df = pd.read_csv(data_path)
    X = df.drop('target', axis=1)
    y = df['target']

    model = RandomForestClassifier(n_estimators=n_estimators)
    model.fit(X, y)
    joblib.dump(model, model_path)

@component(
    # pandas and joblib are imported directly in the body, so list them
    # explicitly instead of listing scikit-learn alone.
    packages_to_install=["scikit-learn", "pandas", "joblib"],
)
def evaluate_model(
    model_path: dsl.InputPath(),
    test_data_path: dsl.InputPath(),
) -> float:
    """Load a joblib-saved model and report its accuracy on a CSV.

    Args:
        model_path: File written by joblib.dump containing the fitted model.
        test_data_path: CSV to score against; it must contain a 'target' column.

    Returns:
        The accuracy of the model's predictions on the CSV.
    """
    import pandas as pd
    from sklearn.metrics import accuracy_score
    import joblib

    model = joblib.load(model_path)
    df = pd.read_csv(test_data_path)
    X = df.drop('target', axis=1)
    y = df['target']

    accuracy = accuracy_score(y, model.predict(X))
    print(f"Accuracy: {accuracy}")
    return accuracy

@pipeline(
    name="ml-training-pipeline",
    description="End-to-end ML training pipeline",
)
def ml_pipeline(
    data_path: str = "data/raw.csv",
    n_estimators: int = 100,
):
    """Chain preprocessing, training and evaluation into one pipeline.

    Args:
        data_path: Raw CSV handed to the preprocessing step.
        n_estimators: Forwarded to the training step.
    """
    preprocess_task = preprocess_data(data_path=data_path)
    processed_csv = preprocess_task.output

    train_task = train_model(
        data_path=processed_csv,
        n_estimators=n_estimators,
    )

    # NOTE: evaluation reads the same preprocessed file the model was trained on.
    evaluate_model(
        model_path=train_task.output,
        test_data_path=processed_csv,
    )

# Compile the pipeline definition into pipeline.yaml.
compiler.Compiler().compile(
    pipeline_func=ml_pipeline,
    package_path="pipeline.yaml",
)

4. CI/CD for ML

4.1 GitHub Actions

yaml 复制代码

# .github/workflows/ml-pipeline.yml
# Workflow that tests, trains, evaluates and (on main only) registers a model.
name: ML Pipeline

# Triggered by pushes to main and by pull requests targeting main.
on:
  push:
    branches: [main]
  pull_request:
    branches: [main]

jobs:
  train:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.10'

      - name: Install dependencies
        run: pip install -r requirements.txt

      # Tests run before training, so a failing test stops the job early.
      - name: Run tests
        run: pytest tests/

      # The tracking server address comes from the MLFLOW_URI repository secret.
      - name: Train model
        run: python train.py
        env:
          MLFLOW_TRACKING_URI: ${{ secrets.MLFLOW_URI }}

      # evaluate.py is not shown here; presumably it exits non-zero when the
      # model scores below the threshold, which would fail the job - confirm.
      - name: Evaluate model
        run: python evaluate.py --threshold 0.85

      # Registration is skipped unless the workflow ref is the main branch.
      - name: Register model
        if: github.ref == 'refs/heads/main'
        run: python register_model.py

4.2 模型验证

python 复制代码

def validate_model(model, X_test, y_test, threshold=0.85):
    """Quality gate: reject a model whose accuracy is below ``threshold``.

    Prints accuracy and weighted F1 for the predictions on ``X_test``.

    Returns:
        True when the accuracy is not below the threshold.

    Raises:
        ValueError: If the accuracy is below ``threshold``.
    """
    from sklearn.metrics import accuracy_score, f1_score

    predictions = model.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    f1 = f1_score(y_test, predictions, average='weighted')

    # Report both scores; only accuracy takes part in the gate below.
    for report_line in (f"Accuracy: {accuracy:.4f}", f"F1 Score: {f1:.4f}"):
        print(report_line)

    if accuracy < threshold:
        failure = f"Model accuracy {accuracy:.4f} below threshold {threshold}"
        raise ValueError(failure)

    print("✅ Model validation passed!")
    return True

5. 模型监控

python 复制代码

from evidently import ColumnMapping
from evidently.report import Report
from evidently.metric_preset import DataDriftPreset, TargetDriftPreset

def check_data_drift(reference_data, current_data):
    """Run a data-drift report and return its dataset-level drift flag.

    The report is also saved to drift_report.html, and a one-line verdict
    is printed.
    """
    drift_report = Report(metrics=[DataDriftPreset()])
    drift_report.run(reference_data=reference_data, current_data=current_data)
    drift_report.save_html("drift_report.html")

    # The dataset-level flag is read from the first metric of the report.
    first_metric = drift_report.as_dict()['metrics'][0]
    drift_detected = first_metric['result']['dataset_drift']

    verdict = (
        "⚠️ Data drift detected! Consider retraining."
        if drift_detected
        else "✅ No significant data drift."
    )
    print(verdict)

    return drift_detected

6. 完整 MLOps 流程

复制代码

开发 → 实验追踪 → 模型验证 → 注册 → 部署 → 监控 → 再训练
 ↑                                                    ↓
 └────────────────── 反馈循环 ──────────────────────────┘

7. 总结

MLOps 的核心：

实验追踪（MLflow）：记录每次实验的参数、指标、模型
模型注册：版本管理 + 阶段管理（Staging → Production）
自动化 Pipeline：数据处理 → 训练 → 评估 → 部署全自动
CI/CD：代码变更自动触发训练和验证
监控：检测数据漂移，并在发现漂移时触发再训练