day27pipeline管道@浙大疏锦行

Id	Home Ownership	Annual Income	Years in current job	Tax Liens	Years of Credit History	Maximum Open Credit	Number of Credit Problems	Months since last delinquent	Bankruptcies	Purpose	Term	Current Loan Amount	Current Credit Balance	Monthly Debt	Credit Score	Credit Default
0	0	Own Home	482087.0	NaN	11.0	26.3	685960.0	1.0	NaN	1.0	debt consolidation	Short Term	99999999.0	47386.0	7914.0	749.0	0
1	1	Own Home	1025487.0	10+ years	15.0	15.3	1181730.0	0.0	NaN	0.0	debt consolidation	Long Term	264968.0	394972.0	18373.0	737.0	1
2	2	Home Mortgage	751412.0	8 years	11.0	35.0	1182434.0	0.0	NaN	0.0	debt consolidation	Short Term	99999999.0	308389.0	13651.0	742.0	0
3	3	Own Home	805068.0	6 years	8.0	22.5	147400.0	1.0	NaN	1.0	debt consolidation	Short Term	121396.0	95855.0	11338.0	694.0	0
4	4	Rent	776264.0	8 years	13.0	13.6	385836.0	1.0	NaN	0.0	debt consolidation	Short Term	125840.0	93309.0	7180.0	719.0	0

1. 特征类型分类配置

根据数据特点，将特征分为：

有序分类特征：有明确顺序关系的分类特征（如工作年限）
标称分类特征：无顺序关系的分类特征（如贷款目的）
数值特征：连续数值特征

python 复制代码

# ==========================================
# Feature configuration (tailored to the current dataset; adjust for other data)
# ==========================================

# Target variable
TARGET = 'Credit Default'

# Row-identifier columns. They must not reach the models: because numeric
# features are detected by exclusion below, an identifier left in X would be
# scaled and used as a predictor.
ID_COLUMNS = ['Id']

# Separate features and label
y = data[TARGET]
X = data.drop(columns=[TARGET] + ID_COLUMNS, errors='ignore')  # tolerate data without an Id column

# ---------- Ordinal categorical features ----------
# Features encoded as ordered integers
ordinal_features = ['Home Ownership', 'Years in current job', 'Term']

# Category order for each ordinal feature, aligned by position with
# ordinal_features (encoding order: 0, 1, 2, ...)
ordinal_categories = [
    ['Own Home', 'Rent', 'Have Mortgage', 'Home Mortgage'],  # Home Ownership
    ['< 1 year', '1 year', '2 years', '3 years', '4 years', '5 years',
     '6 years', '7 years', '8 years', '9 years', '10+ years'],  # Years in current job
    ['Short Term', 'Long Term']  # Term
]

# ---------- Nominal categorical features ----------
# Features that are one-hot encoded
nominal_features = ['Purpose']

# ---------- Numeric features ----------
# Detected automatically: every remaining column that is not categorical
all_categorical = ordinal_features + nominal_features
numeric_features = [col for col in X.columns if col not in all_categorical]

print("=" * 60)
print("特征类型配置")
print("=" * 60)
print(f"目标变量: {TARGET}")
print(f"\n有序分类特征 ({len(ordinal_features)}个): {ordinal_features}")
print(f"\n标称分类特征 ({len(nominal_features)}个): {nominal_features}")
print(f"\n数值特征 ({len(numeric_features)}个): {numeric_features}")

2. 构建通用预处理 Pipeline

使用 ColumnTransformer 将不同的预处理步骤应用于不同类型的特征：

有序特征：众数填充 + OrdinalEncoder
标称特征：众数填充 + OneHotEncoder
数值特征：众数填充 + StandardScaler

python 复制代码

# ==========================================
# Build the preprocessor (ColumnTransformer)
# ==========================================

# Ordinal features: fill gaps with the most frequent value, then map each
# category to its position in the configured order (unknown categories -> -1).
ordinal_encoder = OrdinalEncoder(
    categories=ordinal_categories,
    handle_unknown='use_encoded_value',
    unknown_value=-1,
)
ordinal_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', ordinal_encoder),
])

# Nominal features: most-frequent imputation followed by dense one-hot
# encoding; categories unseen during fit are ignored at transform time.
one_hot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
nominal_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', one_hot_encoder),
])

# Numeric features: most-frequent imputation followed by standardisation.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('scaler', StandardScaler()),
])

# Route each column group to its transformer; columns that belong to no
# group are dropped.
column_routes = [
    ('ordinal', ordinal_transformer, ordinal_features),
    ('nominal', nominal_transformer, nominal_features),
    ('numeric', numeric_transformer, numeric_features),
]
preprocessor = ColumnTransformer(transformers=column_routes, remainder='drop')

print("✅ 预处理器构建完成！", "\n预处理器结构:", preprocessor, sep="\n")

3. 定义模型字典

定义一个包含多种机器学习模型的字典，方便后续批量训练和比较

python 复制代码

# ==========================================
# Candidate machine-learning models
# ==========================================

# `models` maps a display name to {'model': estimator, 'params': grid}.
# The grid is meant for grid search over a Pipeline, where parameter names
# take the form 'step_name__parameter_name'; the classifier step is named
# 'classifier', so every key carries the 'classifier__' prefix.


def _model_entry(estimator, **grid):
    """Bundle an estimator with its search grid, prefixing keys for the classifier step."""
    return {
        'model': estimator,
        'params': {f'classifier__{name}': values for name, values in grid.items()},
    }


models = {
    '随机森林': _model_entry(
        RandomForestClassifier(random_state=42),
        n_estimators=[100, 200],
        max_depth=[10, 20, None],
        min_samples_split=[2, 5],
    ),
    'XGBoost': _model_entry(
        XGBClassifier(random_state=42, eval_metric='logloss'),
        n_estimators=[100, 200],
        max_depth=[3, 6, 9],
        learning_rate=[0.01, 0.1],
    ),
    'LightGBM': _model_entry(
        LGBMClassifier(random_state=42, verbosity=-1),
        n_estimators=[100, 200],
        max_depth=[3, 6, -1],
        learning_rate=[0.01, 0.1],
    ),
    '逻辑回归': _model_entry(
        LogisticRegression(random_state=42, max_iter=1000),
        C=[0.01, 0.1, 1, 10],
        penalty=['l2'],
    ),
    'SVM': _model_entry(
        SVC(random_state=42, probability=True),
        C=[0.1, 1, 10],
        kernel=['rbf', 'linear'],
    ),
    'KNN': _model_entry(
        KNeighborsClassifier(),
        n_neighbors=[3, 5, 7],
        weights=['uniform', 'distance'],
    ),
    '决策树': _model_entry(
        DecisionTreeClassifier(random_state=42),
        max_depth=[5, 10, 20, None],
        min_samples_split=[2, 5, 10],
    ),
    '梯度提升': _model_entry(
        GradientBoostingClassifier(random_state=42),
        n_estimators=[100, 200],
        max_depth=[3, 5],
        learning_rate=[0.01, 0.1],
    ),
}

print(f"✅ 已定义 {len(models)} 种机器学习模型！")
for name in models:
    print(f"   - {name}")

4. 划分数据集

python 复制代码

# ==========================================
# Train/test split (done before preprocessing to prevent data leakage)
# ==========================================

# Hold out 20% of the rows; stratifying on y keeps the class ratio the same
# in both partitions, and the fixed seed makes the split reproducible.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

banner = "=" * 60
print(banner)
print("数据集划分完成")
print(banner)

# Partition sizes first, then the label distribution of each partition.
for label, features in (("训练集", X_train), ("测试集", X_test)):
    print(f"{label}: {features.shape[0]} 样本")
for label, target in (("训练集", y_train), ("测试集", y_test)):
    print(f"\n{label}目标分布:\n{target.value_counts()}")

5. 创建通用 Pipeline 构建函数

python 复制代码

# ==========================================
# 通用 Pipeline 构建函数
# ==========================================

def create_pipeline(model, preprocessor):
    """
    Create a complete machine-learning Pipeline (preprocessing + classifier).

    The preprocessor is cloned so that every pipeline owns an independent,
    unfitted copy. Without the clone, all pipelines built from the same
    preprocessor object would share its fitted state, and refitting one
    pipeline would silently change the others.

    Args:
        model: Estimator instance used as the final 'classifier' step.
        preprocessor: ColumnTransformer (or any estimator exposing
            ``get_params``) used as the 'preprocessor' step. Values that are
            not estimators (e.g. ``'passthrough'``) are used as given.

    Returns:
        Pipeline with the steps 'preprocessor' and 'classifier'.
    """
    # Function-scope import keeps this cell self-contained.
    from sklearn.base import clone

    # Only real estimators can be cloned; pass anything else through untouched.
    if hasattr(preprocessor, 'get_params'):
        preprocessor = clone(preprocessor)

    return Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', model)
    ])


def evaluate_model(pipeline, X_train, X_test, y_train, y_test, model_name="模型"):
    """
    Fit a pipeline on the training data and score it on the test data.

    Args:
        pipeline: Pipeline object to fit (it is fitted in place).
        X_train, X_test: Training and test features.
        y_train, y_test: Training and test labels.
        model_name: Display name stored in the result.

    Returns:
        dict with the keys 'model_name', 'accuracy', 'precision', 'recall',
        'f1', 'roc_auc' (None when the pipeline has no ``predict_proba``),
        'train_time' (seconds spent in ``fit``), 'pipeline', 'y_pred' and
        'y_pred_proba'.
    """
    # Time only the fit call: stopping the clock after prediction would
    # report inference cost as training time.
    start_time = time.time()
    pipeline.fit(X_train, y_train)
    train_time = time.time() - start_time

    # Predict labels, and the probability column at index 1 when available
    y_pred = pipeline.predict(X_test)
    y_pred_proba = pipeline.predict_proba(X_test)[:, 1] if hasattr(pipeline, 'predict_proba') else None

    # Compute evaluation metrics
    results = {
        'model_name': model_name,
        'accuracy': accuracy_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred),
        'recall': recall_score(y_test, y_pred),
        'f1': f1_score(y_test, y_pred),
        'roc_auc': roc_auc_score(y_test, y_pred_proba) if y_pred_proba is not None else None,
        'train_time': train_time,
        'pipeline': pipeline,
        'y_pred': y_pred,
        'y_pred_proba': y_pred_proba
    }

    return results


def print_evaluation_report(results):
    """
    Print a formatted evaluation report for a single model.

    Args:
        results: Mapping with the keys 'model_name', 'train_time',
            'accuracy', 'precision', 'recall', 'f1' and 'roc_auc'.
            'roc_auc' may be None, in which case the AUC line is omitted.
    """
    print(f"\n{'='*60}")
    print(f"模型: {results['model_name']}")
    print(f"{'='*60}")
    print(f"训练耗时: {results['train_time']:.4f} 秒")
    print(f"\n评估指标:")
    print(f"  准确率 (Accuracy):  {results['accuracy']:.4f}")
    print(f"  精确率 (Precision): {results['precision']:.4f}")
    print(f"  召回率 (Recall):    {results['recall']:.4f}")
    print(f"  F1 分数:            {results['f1']:.4f}")
    # Compare against None explicitly: a truthiness test would also hide a
    # legitimate AUC of 0.0.
    if results['roc_auc'] is not None:
        print(f"  AUC-ROC:            {results['roc_auc']:.4f}")


print("✅ 通用函数定义完成！")

6. 批量训练和评估所有模型（使用默认参数）

python 复制代码

# ==========================================
# Train and evaluate every model (default hyper-parameters)
# ==========================================

all_results = []

print("开始训练所有模型...")
print("=" * 80)

for model_name, model_config in models.items():
    # Build the pipeline for this model
    pipeline = create_pipeline(model_config['model'], preprocessor)

    # Fit and evaluate
    results = evaluate_model(pipeline, X_train, X_test, y_train, y_test, model_name)
    all_results.append(results)

    # One-line summary. Test against None explicitly so that an AUC of
    # exactly 0.0 is printed as a number rather than 'N/A'.
    auc_str = f"{results['roc_auc']:.4f}" if results['roc_auc'] is not None else 'N/A'
    print(f"{model_name:12s} | 准确率: {results['accuracy']:.4f} | F1: {results['f1']:.4f} | AUC: {auc_str} | 耗时: {results['train_time']:.2f}s")

print("=" * 80)
print("✅ 所有模型训练完成！")

7. 模型对比可视化

python 复制代码

# ==========================================
# Model performance comparison charts
# ==========================================

# Table column -> key in each result dict (insertion order = column order)
columns = {
    '模型': 'model_name',
    '准确率': 'accuracy',
    '精确率': 'precision',
    '召回率': 'recall',
    'F1分数': 'f1',
    'AUC-ROC': 'roc_auc',
    '训练时间(s)': 'train_time',
}
results_df = pd.DataFrame([
    {column: r[key] for column, key in columns.items()} for r in all_results
])

# Rank by F1 score, best first
results_df = results_df.sort_values('F1分数', ascending=False)
print("模型性能对比表（按F1分数排序）：")
display(results_df)

# 2x2 grid of horizontal bar charts, one colour per model
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
colors = plt.cm.Set3(np.linspace(0, 1, len(results_df)))

# One row per panel:
# (column, x-axis label, title, x-limits or None, label offset, label format)
panels = [
    ('准确率', '准确率', '各模型准确率对比', [0.5, 1.0], 0.01, '{:.4f}'),
    ('F1分数', 'F1分数', '各模型F1分数对比', [0, 0.8], 0.01, '{:.4f}'),
    ('AUC-ROC', 'AUC-ROC', '各模型AUC-ROC对比', [0.5, 1.0], 0.01, '{:.4f}'),
    ('训练时间(s)', '训练时间 (秒)', '各模型训练时间对比', None, 0.1, '{:.2f}s'),
]

# axes.flat walks the grid row by row, matching the panel order above
for ax, (column, xlabel, title, xlim, offset, fmt) in zip(axes.flat, panels):
    bars = ax.barh(results_df['模型'], results_df[column], color=colors)
    ax.set_xlabel(xlabel)
    ax.set_title(title)
    if xlim is not None:
        ax.set_xlim(xlim)
    # Write each value just to the right of its bar, vertically centred
    for bar, val in zip(bars, results_df[column]):
        ax.text(val + offset, bar.get_y() + bar.get_height() / 2, fmt.format(val), va='center')

plt.tight_layout()
plt.savefig('model_comparison.png', dpi=150, bbox_inches='tight')
plt.show()

print("\n✅ 对比图已保存为 model_comparison.png")

8. ROC曲线对比

python 复制代码

# ==========================================
# ROC curve comparison
# ==========================================

plt.figure(figsize=(10, 8))

# One curve per model; models without probability scores are skipped
for scored in (r for r in all_results if r['y_pred_proba'] is not None):
    fpr, tpr, _ = roc_curve(y_test, scored['y_pred_proba'])
    curve_label = f"{scored['model_name']} (AUC={scored['roc_auc']:.4f})"
    plt.plot(fpr, tpr, label=curve_label, linewidth=2)

# Diagonal reference line (random classifier)
plt.plot([0, 1], [0, 1], 'k--', label='随机分类器', linewidth=1)

# Axis labels, title, legend and a light grid
plt.xlabel('假阳性率 (FPR)', fontsize=12)
plt.ylabel('真阳性率 (TPR)', fontsize=12)
plt.title('各模型ROC曲线对比', fontsize=14)
plt.legend(loc='lower right', fontsize=10)
plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('roc_curves.png', dpi=150, bbox_inches='tight')
plt.show()

print("\n✅ ROC曲线图已保存为 roc_curves.png")

9. 选择最佳模型并进行详细评估

python 复制代码

# ==========================================
# Pick the best model (by AUC-ROC)
# ==========================================


def _auc_or_zero(entry):
    """Ranking key: the entry's AUC, or 0 when it is missing/falsy."""
    return entry['roc_auc'] or 0


# The model with the highest AUC wins
best_result = max(all_results, key=_auc_or_zero)
best_model_name = best_result['model_name']
best_pipeline = best_result['pipeline']

divider = "=" * 60
print(divider)
print(f"最佳模型: {best_model_name}")
print(divider)

# Headline metrics
print_evaluation_report(best_result)

# Per-class precision / recall / F1
print("\n详细分类报告:")
print(classification_report(y_test, best_result['y_pred']))

# Confusion-matrix heat map; both axes share the same class labels
class_labels = ['未违约(0)', '违约(1)']
plt.figure(figsize=(8, 6))
cm = confusion_matrix(y_test, best_result['y_pred'])
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
)
plt.xlabel('预测值')
plt.ylabel('真实值')
plt.title(f'{best_model_name} - 混淆矩阵')
plt.tight_layout()
plt.savefig('best_model_confusion_matrix.png', dpi=150, bbox_inches='tight')
plt.show()

10. 使用网格搜索进行超参数调优（以梯度提升为例）

python 复制代码

# ==========================================
# Grid-search tuning example using gradient boosting
# (the original notes describe it as the best model at the time of writing)
# ==========================================

print("开始网格搜索调参...")
print("=" * 60)

# Pipeline to tune: shared preprocessing + a fresh gradient-boosting classifier
tuning_pipeline = create_pipeline(GradientBoostingClassifier(random_state=42), preprocessor)

# Reduced grid (3 x 3 x 3 combinations) to keep the run time manageable
param_grid = {
    'classifier__n_estimators': [100, 150, 200],
    'classifier__max_depth': [3, 5, 7],
    'classifier__learning_rate': [0.05, 0.1, 0.15],
}

# 5-fold cross-validation scored by AUC-ROC, using every CPU core
search_options = dict(cv=5, scoring='roc_auc', n_jobs=-1, verbose=1)
grid_search = GridSearchCV(estimator=tuning_pipeline, param_grid=param_grid, **search_options)

# Run the search and time it
start_time = time.time()
grid_search.fit(X_train, y_train)
grid_time = time.time() - start_time

print(f"\n网格搜索完成！耗时: {grid_time:.2f} 秒")
print(f"\n最佳参数: {grid_search.best_params_}")
print(f"最佳交叉验证AUC: {grid_search.best_score_:.4f}")

python 复制代码

# ==========================================
# Evaluate the best grid-search model on the test set
# ==========================================

# Best estimator found by the search
best_tuned_pipeline = grid_search.best_estimator_

# Test-set predictions: labels and the probability column at index 1
y_pred_tuned = best_tuned_pipeline.predict(X_test)
y_pred_proba_tuned = best_tuned_pipeline.predict_proba(X_test)[:, 1]

# Compute every metric once, then report
tuned_accuracy = accuracy_score(y_test, y_pred_tuned)
tuned_precision = precision_score(y_test, y_pred_tuned)
tuned_recall = recall_score(y_test, y_pred_tuned)
tuned_f1 = f1_score(y_test, y_pred_tuned)
tuned_auc = roc_auc_score(y_test, y_pred_proba_tuned)

rule = "=" * 60
print(rule)
print("调参后的梯度提升模型 - 测试集表现")
print(rule)
print(f"准确率 (Accuracy):  {tuned_accuracy:.4f}")
print(f"精确率 (Precision): {tuned_precision:.4f}")
print(f"召回率 (Recall):    {tuned_recall:.4f}")
print(f"F1 分数:            {tuned_f1:.4f}")
print(f"AUC-ROC:            {tuned_auc:.4f}")

print("\n详细分类报告:")
print(classification_report(y_test, y_pred_tuned))

# Compare with the default-parameter run of the same model
print("\n" + rule)
print("调参前后对比")
print(rule)
baseline_aucs = [r['roc_auc'] for r in all_results if r['model_name'] == '梯度提升']
original_auc = baseline_aucs[0]
auc_gain = (tuned_auc - original_auc) * 100  # absolute AUC difference, scaled by 100
print(f"默认参数 AUC-ROC: {original_auc:.4f}")
print(f"调参后 AUC-ROC:   {tuned_auc:.4f}")
print(f"提升:             {auc_gain:.2f}%")

11. 通用 Pipeline 封装类

将整个流程封装成一个可复用的类，方便在其他项目中使用

python 复制代码

# ==========================================
# 通用机器学习 Pipeline 封装类
# ==========================================

class UniversalMLPipeline:
    """
    Reusable preprocessing + classifier workflow.

    Features:
    1. Builds a ColumnTransformer for ordinal, nominal and numeric columns
       (numeric columns are detected automatically when not supplied).
    2. Wraps the preprocessor and any classifier in a Pipeline.
    3. Supports grid-search tuning.
    4. Collects evaluation metrics per model and can return the best one.
    """

    def __init__(self,
                 ordinal_features=None,
                 ordinal_categories=None,
                 nominal_features=None,
                 numeric_features=None,
                 impute_strategy='most_frequent',
                 scale_numeric=True):
        """
        Store the feature configuration; nothing is fitted here.

        Args:
            ordinal_features: List of ordinal categorical feature names.
            ordinal_categories: Category order for each ordinal feature,
                aligned by position with ``ordinal_features``. When omitted,
                the encoder infers the categories from the data.
            nominal_features: List of nominal categorical feature names.
            numeric_features: List of numeric feature names (None means
                "every column that is not ordinal or nominal").
            impute_strategy: Imputation strategy for numeric features
                ('most_frequent', 'mean', 'median'). Categorical features
                always use 'most_frequent'.
            scale_numeric: Whether numeric features are standardised.

        Raises:
            ValueError: If ``ordinal_categories`` is given but its length
                differs from that of ``ordinal_features``.
        """
        self.ordinal_features = ordinal_features or []
        self.ordinal_categories = ordinal_categories or []
        self.nominal_features = nominal_features or []
        self.numeric_features = numeric_features
        self.impute_strategy = impute_strategy
        self.scale_numeric = scale_numeric

        # Fail fast on a misaligned configuration instead of deep inside fit().
        if self.ordinal_categories and len(self.ordinal_categories) != len(self.ordinal_features):
            raise ValueError(
                "ordinal_categories 的长度必须与 ordinal_features 一致 "
                f"({len(self.ordinal_categories)} != {len(self.ordinal_features)})"
            )

        self.preprocessor = None
        self.pipeline = None
        self.results = {}

    def _build_preprocessor(self, X):
        """
        Build the ColumnTransformer and store it on ``self.preprocessor``.

        Args:
            X: Feature table; its ``columns`` are used to detect numeric
                features when none were configured.

        Returns:
            The (unfitted) ColumnTransformer.
        """
        # Auto-detect numeric features: everything that is not categorical
        if self.numeric_features is None:
            all_categorical = self.ordinal_features + self.nominal_features
            self.numeric_features = [col for col in X.columns if col not in all_categorical]

        transformers = []

        # Ordinal features: mode imputation + ordered integer encoding
        if self.ordinal_features:
            ordinal_transformer = Pipeline(steps=[
                ('imputer', SimpleImputer(strategy='most_frequent')),
                ('encoder', OrdinalEncoder(
                    # An empty list is not a valid `categories` value; let the
                    # encoder infer the categories when no order was supplied.
                    categories=self.ordinal_categories or 'auto',
                    handle_unknown='use_encoded_value',
                    unknown_value=-1
                ))
            ])
            transformers.append(('ordinal', ordinal_transformer, self.ordinal_features))

        # Nominal features: mode imputation + one-hot encoding
        if self.nominal_features:
            nominal_transformer = Pipeline(steps=[
                ('imputer', SimpleImputer(strategy='most_frequent')),
                ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
            ])
            transformers.append(('nominal', nominal_transformer, self.nominal_features))

        # Numeric features: configurable imputation + optional scaling
        if self.numeric_features:
            steps = [('imputer', SimpleImputer(strategy=self.impute_strategy))]
            if self.scale_numeric:
                steps.append(('scaler', StandardScaler()))
            numeric_transformer = Pipeline(steps=steps)
            transformers.append(('numeric', numeric_transformer, self.numeric_features))

        self.preprocessor = ColumnTransformer(
            transformers=transformers,
            remainder='drop'
        )

        return self.preprocessor

    def create_pipeline(self, model):
        """
        Create a complete Pipeline (preprocessor + classifier).

        The stored preprocessor is cloned so that each pipeline owns an
        independent copy; otherwise every pipeline kept in ``self.results``
        would share, and be affected by refits of, the same object.

        Args:
            model: Estimator instance used as the 'classifier' step.

        Returns:
            Pipeline with the steps 'preprocessor' and 'classifier'.
        """
        # Function-scope import keeps the class self-contained.
        from sklearn.base import clone

        preprocessor = self.preprocessor
        if preprocessor is not None:
            preprocessor = clone(preprocessor)

        return Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('classifier', model)
        ])

    def fit_evaluate(self, X_train, X_test, y_train, y_test, model, model_name="模型"):
        """
        Fit one model and evaluate it on the test set.

        Args:
            X_train, X_test: Training and test features.
            y_train, y_test: Training and test labels.
            model: Estimator instance to train.
            model_name: Key under which the result is stored in ``self.results``.

        Returns:
            dict with the keys 'model_name', 'accuracy', 'precision',
            'recall', 'f1', 'roc_auc' (None when ``predict_proba`` is not
            available), 'train_time' (seconds spent in ``fit``), 'pipeline',
            'y_pred' and 'y_pred_proba'.
        """
        if self.preprocessor is None:
            self._build_preprocessor(X_train)

        pipeline = self.create_pipeline(model)

        # Time only the fit call so inference cost is not reported as training time.
        start_time = time.time()
        pipeline.fit(X_train, y_train)
        train_time = time.time() - start_time

        y_pred = pipeline.predict(X_test)
        y_pred_proba = pipeline.predict_proba(X_test)[:, 1] if hasattr(pipeline, 'predict_proba') else None

        results = {
            'model_name': model_name,
            'accuracy': accuracy_score(y_test, y_pred),
            'precision': precision_score(y_test, y_pred),
            'recall': recall_score(y_test, y_pred),
            'f1': f1_score(y_test, y_pred),
            'roc_auc': roc_auc_score(y_test, y_pred_proba) if y_pred_proba is not None else None,
            'train_time': train_time,
            'pipeline': pipeline,
            'y_pred': y_pred,
            'y_pred_proba': y_pred_proba
        }

        self.results[model_name] = results
        return results

    def grid_search(self, X_train, y_train, model, param_grid, cv=5, scoring='roc_auc'):
        """
        Tune a model with an exhaustive grid search.

        Args:
            X_train, y_train: Training features and labels.
            model: Estimator instance to tune.
            param_grid: Grid keyed by pipeline parameter names
                (e.g. 'classifier__max_depth').
            cv: Cross-validation setting passed to GridSearchCV.
            scoring: Scoring metric passed to GridSearchCV.

        Returns:
            The fitted GridSearchCV object.
        """
        if self.preprocessor is None:
            self._build_preprocessor(X_train)

        pipeline = self.create_pipeline(model)

        grid_search = GridSearchCV(
            pipeline, param_grid, cv=cv, scoring=scoring, n_jobs=-1, verbose=1
        )
        grid_search.fit(X_train, y_train)

        return grid_search

    def compare_models(self, X_train, X_test, y_train, y_test, models_dict):
        """
        Fit and evaluate several models.

        Args:
            X_train, X_test: Training and test features.
            y_train, y_test: Training and test labels.
            models_dict: Mapping of model name -> estimator instance.

        Returns:
            ``self.results``: mapping of model name -> result dict, including
            results stored by earlier calls.
        """
        if self.preprocessor is None:
            self._build_preprocessor(X_train)

        for name, model in models_dict.items():
            self.fit_evaluate(X_train, X_test, y_train, y_test, model, name)

        return self.results

    def get_best_model(self, metric='roc_auc'):
        """
        Return the stored result with the highest value of ``metric``.

        Results whose metric is missing (None) or otherwise falsy are ranked as 0.

        Args:
            metric: Key of the result dict to rank by.

        Returns:
            The result dict of the best model.

        Raises:
            ValueError: If no model has been trained yet.
        """
        if not self.results:
            raise ValueError("请先训练模型！")

        best_name = max(self.results.keys(),
                       key=lambda x: self.results[x][metric] if self.results[x][metric] else 0)
        return self.results[best_name]


print("✅ UniversalMLPipeline 类定义完成！")

12. 使用封装类的示例

python 复制代码

# ==========================================
# Short example of using the UniversalMLPipeline class
# ==========================================

# 1. Create the pipeline helper. Each ordinal feature is paired with the
#    order of its categories (position in the list = encoded value).
home_ownership_levels = ['Own Home', 'Rent', 'Have Mortgage', 'Home Mortgage']
job_tenure_levels = [
    '< 1 year', '1 year', '2 years', '3 years', '4 years', '5 years',
    '6 years', '7 years', '8 years', '9 years', '10+ years',
]
term_levels = ['Short Term', 'Long Term']

ml_pipeline = UniversalMLPipeline(
    ordinal_features=['Home Ownership', 'Years in current job', 'Term'],
    ordinal_categories=[home_ownership_levels, job_tenure_levels, term_levels],
    nominal_features=['Purpose'],
    impute_strategy='most_frequent',
    scale_numeric=True,
)

# 2. Models to compare
models_to_compare = {
    '随机森林': RandomForestClassifier(random_state=42),
    'XGBoost': XGBClassifier(random_state=42, eval_metric='logloss'),
    'LightGBM': LGBMClassifier(random_state=42, verbosity=-1),
    '梯度提升': GradientBoostingClassifier(random_state=42),
}

# 3. Compare every model in a single call
header_rule = "=" * 60
print(header_rule)
print("使用 UniversalMLPipeline 类进行模型比较")
print(header_rule)

results = ml_pipeline.compare_models(X_train, X_test, y_train, y_test, models_to_compare)

# 4. Print one summary line per model
print("\n模型比较结果:")
for name, result in results.items():
    summary = (
        f"{name:12s} | 准确率: {result['accuracy']:.4f}"
        f" | F1: {result['f1']:.4f}"
        f" | AUC: {result['roc_auc']:.4f}"
    )
    print(summary)

# 5. Fetch the best model by AUC-ROC
best = ml_pipeline.get_best_model('roc_auc')
print(f"\n🏆 最佳模型: {best['model_name']} (AUC: {best['roc_auc']:.4f})")

@浙大疏锦行