MLOps 实战:实验追踪、模型注册与自动化 Pipeline
1. 引言
MLOps = Machine Learning + DevOps。它弥合了"笔记本里的模型"与"生产环境中的服务"之间的鸿沟。
MLOps 成熟度:
| 级别 | 描述 | 工具 |
|---|---|---|
| L0 | 手动训练、手动部署 | Jupyter + 手动 |
| L1 | 实验追踪、模型版本管理 | MLflow |
| L2 | 自动化训练 Pipeline | Kubeflow/Airflow |
| L3 | 持续训练、A/B 测试 | 完整 MLOps 平台 |
2. MLflow 实验追踪
2.1 基础使用
python
import mlflow
import mlflow.sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score
# Select the experiment under which the run below is recorded.
mlflow.set_experiment("classification_experiment")

# NOTE: X_train, y_train, X_test and y_test are not defined in this snippet;
# they must already exist in the surrounding session.
with mlflow.start_run(run_name="rf_baseline"):
    # Log the hyperparameters first so they are recorded with the run
    # before training starts.
    params = dict(n_estimators=100, max_depth=10, min_samples_split=5)
    mlflow.log_params(params)

    # Train the baseline model with a fixed seed.
    model = RandomForestClassifier(random_state=42, **params)
    model.fit(X_train, y_train)

    # Score the held-out split.
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average="weighted")

    # Record both evaluation metrics on the run.
    mlflow.log_metric("accuracy", accuracy)
    mlflow.log_metric("f1_score", f1)

    # Store the fitted estimator under the artifact path "model".
    mlflow.sklearn.log_model(model, "model")

    # Attach an extra artifact. This snippet does not create
    # confusion_matrix.png; the file must already exist in the working
    # directory.
    mlflow.log_artifact("confusion_matrix.png")

    print(f"Accuracy: {accuracy:.4f}, F1: {f1:.4f}")

# Start the tracking UI with: mlflow ui --port 5000
2.2 自动记录
python
# Turn on scikit-learn autologging before fitting any estimator.
mlflow.sklearn.autolog()

# Train as usual from here on; with autologging enabled MLflow records the
# parameters and metrics without explicit log_* calls.
# NOTE: X_train and y_train must already exist in the surrounding session.
model = RandomForestClassifier(n_estimators=200).fit(X_train, y_train)
2.3 模型注册
python
from mlflow import MlflowClient

# Register the model logged by an earlier run as a new version of
# "ProductionClassifier".
# NOTE: `run_id` is not defined in this snippet; it must hold the ID of the
# run whose "model" artifact should be registered.
model_uri = f"runs:/{run_id}/model"
model_version = mlflow.register_model(model_uri, "ProductionClassifier")

# Promote exactly the version that was just registered, instead of assuming
# it is version 1 (re-running the registration creates higher versions).
# NOTE(review): newer MLflow releases deprecate stages in favour of aliases;
# confirm against the MLflow version in use.
client = MlflowClient()
client.transition_model_version_stage(
    name="ProductionClassifier",
    version=model_version.version,
    stage="Production",
)

# Load by stage only after the transition above: before it, no version of
# the model is in the "Production" stage, so this URI cannot be resolved.
model = mlflow.sklearn.load_model("models:/ProductionClassifier/Production")
3. Kubeflow Pipeline
3.1 定义 Pipeline
python
from kfp import dsl, compiler
from kfp.dsl import component, pipeline
@component(
    base_image="python:3.9",
    packages_to_install=["scikit-learn", "pandas"],
)
def preprocess_data(
    data_path: str,
    output_path: dsl.OutputPath(),
):
    """Standardize every feature column of a CSV and write the result.

    Reads the CSV at ``data_path``, scales all columns except ``target``
    with ``StandardScaler``, re-attaches the untouched ``target`` column
    and writes the frame to ``output_path`` without the index.

    Args:
        data_path: Location of the raw CSV; it must contain a ``target``
            column.
        output_path: File path the processed CSV is written to.
    """
    # Imports live inside the function because the component body runs in
    # its own container.
    import pandas as pd
    from sklearn.preprocessing import StandardScaler

    df = pd.read_csv(data_path)

    # Take the column labels from the feature frame itself rather than
    # df.columns[:-1], which is only correct when 'target' is the last column.
    features = df.drop(columns="target")
    scaled = StandardScaler().fit_transform(features)

    df_processed = pd.DataFrame(scaled, columns=features.columns, index=df.index)
    df_processed["target"] = df["target"]
    df_processed.to_csv(output_path, index=False)
@component(
    # pandas and joblib are imported by the body, so they are listed
    # explicitly instead of relying on what scikit-learn happens to pull in.
    packages_to_install=["scikit-learn", "pandas", "joblib"],
)
def train_model(
    data_path: dsl.InputPath(),
    model_path: dsl.OutputPath(),
    n_estimators: int = 100,
):
    """Fit a random forest on a processed CSV and save it with joblib.

    Args:
        data_path: Path of the CSV to train on; it must contain a
            ``target`` column, all other columns are used as features.
        model_path: File path the fitted model is dumped to.
        n_estimators: Number of trees in the forest.
    """
    # Imports live inside the function because the component body runs in
    # its own container.
    import joblib
    import pandas as pd
    from sklearn.ensemble import RandomForestClassifier

    df = pd.read_csv(data_path)
    X = df.drop(columns="target")
    y = df["target"]

    model = RandomForestClassifier(n_estimators=n_estimators)
    model.fit(X, y)

    joblib.dump(model, model_path)
@component(
    # pandas and joblib are imported by the body, so they are listed
    # explicitly instead of relying on what scikit-learn happens to pull in.
    packages_to_install=["scikit-learn", "pandas", "joblib"],
)
def evaluate_model(
    model_path: dsl.InputPath(),
    test_data_path: dsl.InputPath(),
) -> float:
    """Load a joblib-saved model and return its accuracy on a CSV.

    Args:
        model_path: Path of the model file written with ``joblib.dump``.
        test_data_path: Path of the evaluation CSV; it must contain a
            ``target`` column, all other columns are used as features.

    Returns:
        The accuracy of the model's predictions against ``target``.
    """
    # Imports live inside the function because the component body runs in
    # its own container.
    import joblib
    import pandas as pd
    from sklearn.metrics import accuracy_score

    model = joblib.load(model_path)

    df = pd.read_csv(test_data_path)
    X = df.drop(columns="target")
    y = df["target"]

    accuracy = accuracy_score(y, model.predict(X))
    print(f"Accuracy: {accuracy}")

    # Return a plain Python float to match the declared return type.
    return float(accuracy)
@pipeline(
    name="ml-training-pipeline",
    description="End-to-end ML training pipeline",
)
def ml_pipeline(
    data_path: str = "data/raw.csv",
    n_estimators: int = 100,
):
    """Chain preprocessing, training and evaluation into one pipeline.

    Args:
        data_path: Location of the raw CSV passed to ``preprocess_data``.
        n_estimators: Forwarded to ``train_model``.
    """
    prep_task = preprocess_data(data_path=data_path)
    processed_csv = prep_task.output

    train_task = train_model(
        data_path=processed_csv,
        n_estimators=n_estimators,
    )

    # The evaluation step receives the same processed dataset the model was
    # trained on. Its returned accuracy is not consumed by any later step.
    evaluate_model(
        model_path=train_task.output,
        test_data_path=processed_csv,
    )
# Compile the pipeline function into a YAML package file.
compiler.Compiler().compile(
    pipeline_func=ml_pipeline,
    package_path="pipeline.yaml",
)
4. CI/CD for ML
4.1 GitHub Actions
yaml
# .github/workflows/ml-pipeline.yml
# Runs tests, training and evaluation on every push / pull request to main,
# and registers the model only for pushes to main.
name: ML Pipeline

on:
  push:
    branches: [main]
  pull_request:
    branches: [main]

jobs:
  train:
    runs-on: ubuntu-latest
    # Set at job level so the train, evaluate and register steps all use the
    # same MLflow tracking server (previously only the train step had it).
    env:
      MLFLOW_TRACKING_URI: ${{ secrets.MLFLOW_URI }}
    steps:
      - uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.10'

      - name: Install dependencies
        run: pip install -r requirements.txt

      - name: Run tests
        run: pytest tests/

      - name: Train model
        run: python train.py

      # Quality gate: a non-zero exit from this step stops the job before
      # registration.
      - name: Evaluate model
        run: python evaluate.py --threshold 0.85

      # Skipped on pull requests, whose ref is not refs/heads/main.
      - name: Register model
        if: github.ref == 'refs/heads/main'
        run: python register_model.py
4.2 模型验证
python
def validate_model(model, X_test, y_test, threshold=0.85):
    """Quality gate for a trained model.

    Predicts on ``X_test``, prints accuracy and weighted F1, and fails when
    accuracy is below ``threshold``. Only accuracy is gated; F1 is reported
    but not checked.

    Args:
        model: Object exposing ``predict(X)``.
        X_test: Features to predict on.
        y_test: Ground-truth labels for ``X_test``.
        threshold: Minimum acceptable accuracy.

    Returns:
        True when the model passes the gate.

    Raises:
        ValueError: If accuracy is below ``threshold``.
    """
    from sklearn import metrics

    y_pred = model.predict(X_test)
    accuracy = metrics.accuracy_score(y_test, y_pred)
    f1 = metrics.f1_score(y_test, y_pred, average="weighted")

    print(f"Accuracy: {accuracy:.4f}")
    print(f"F1 Score: {f1:.4f}")

    if accuracy < threshold:
        failure = f"Model accuracy {accuracy:.4f} below threshold {threshold}"
        raise ValueError(failure)

    print("✅ Model validation passed!")
    return True
5. 模型监控
python
from evidently import ColumnMapping
from evidently.report import Report
from evidently.metric_preset import DataDriftPreset, TargetDriftPreset
def check_data_drift(reference_data, current_data, report_path="drift_report.html"):
    """Run a data-drift report and say whether dataset drift was detected.

    Builds a ``Report`` with ``DataDriftPreset``, runs it on the two
    datasets, saves the HTML report and prints a short verdict.

    Args:
        reference_data: Baseline dataset passed to ``Report.run``.
        current_data: Dataset compared against the baseline.
        report_path: Where the HTML report is written; defaults to
            ``drift_report.html`` in the working directory.

    Returns:
        True if the report flags dataset drift, otherwise False.

    Raises:
        KeyError: If no metric result in the report contains a
            ``dataset_drift`` entry.
    """
    report = Report(metrics=[DataDriftPreset()])
    report.run(reference_data=reference_data, current_data=current_data)
    report.save_html(report_path)

    # Find the dataset-level drift flag by key instead of assuming it is
    # always the first metric in the report.
    drift_result = next(
        (
            entry["result"]
            for entry in report.as_dict()["metrics"]
            if "dataset_drift" in entry.get("result", {})
        ),
        None,
    )
    if drift_result is None:
        raise KeyError("dataset_drift")

    drift_detected = bool(drift_result["dataset_drift"])

    if drift_detected:
        print("⚠️ Data drift detected! Consider retraining.")
    else:
        print("✅ No significant data drift.")
    return drift_detected
6. 完整 MLOps 流程
开发 → 实验追踪 → 模型验证 → 注册 → 部署 → 监控 → 再训练
 ↑                                                  ↓
 └─────────────────── 反馈循环 ─────────────────────┘
7. 总结
MLOps 的核心:
- 实验追踪(MLflow):记录每次实验的参数、指标、模型
- 模型注册:版本管理 + 阶段管理(Staging → Production)
- 自动化 Pipeline:数据处理 → 训练 → 评估 → 部署全自动
- CI/CD:代码变更自动触发训练和验证
- 监控:数据漂移检测,自动触发再训练