2.31 机器学习神器项目实战：如何在真实项目中应用XGBoost等算法

引言

本文以客户流失预测项目为例，演示如何在项目中应用 XGBoost 等机器学习工具。示例使用随机生成的模拟数据，依次完成数据准备、特征工程、模型训练、参数调优以及模型的保存与加载，提供完整的实战流程。

一、项目背景

1.1 项目需求

代码示例（Python）：

# Project background
def project_background():
    """Print the customer-churn project brief and return True.

    The brief consists of a banner with the project title followed by
    three bullet lists: business requirements, available data and goals.
    """
    banner = "=" * 60
    # Each entry is one output line; the four-space prefixes and the
    # whitespace-only separator lines are part of the printed text.
    brief_lines = [
        "",
        "    业务需求：",
        "    - 预测客户是否会流失",
        "    - 识别高风险客户",
        "    - 制定挽留策略",
        "    ",
        "    数据：",
        "    - 客户基本信息",
        "    - 使用行为数据",
        "    - 历史交易数据",
        "    ",
        "    目标：",
        "    - 构建高准确率预测模型",
        "    - 识别关键流失因素",
        "    - 支持业务决策",
        "    ",
    ]

    for header_line in (banner, "项目背景：客户流失预测", banner):
        print(header_line)
    print("\n".join(brief_lines))

    return True

# Print the project brief; the returned True is not used.
project_background()

二、数据准备

2.1 数据加载和探索

代码示例（Python）：

# Data preparation
def prepare_project_data(n=5000, seed=42):
    """Generate a synthetic customer-churn dataset.

    Args:
        n: Number of customer rows to generate (default 5000).
        seed: Value passed to ``np.random.seed`` so that repeated calls
            with the same arguments return identical data (default 42).

    Returns:
        A pandas DataFrame with ``n`` rows and 11 columns:
        ``customer_id``, ``age``, ``gender``, ``tenure``,
        ``monthly_charge``, ``total_charges``, ``contract``,
        ``payment_method``, ``usage_minutes``, ``support_calls`` and
        the binary label ``churn``.

    Note:
        Every column, including ``churn`` (drawn with probabilities
        0.7 / 0.3), is sampled independently of the others, so the
        features carry no real signal about the label. The data is only
        suitable for demonstrating the workflow.
    """
    # Imported locally because the snippet has no module-level imports;
    # without these the function fails with NameError on ``np`` / ``pd``.
    import numpy as np
    import pandas as pd

    # Seeds NumPy's global legacy generator (a process-wide side effect);
    # the draw order below determines the generated values.
    np.random.seed(seed)

    data = {
        'customer_id': range(1, n + 1),
        'age': np.random.randint(18, 70, n),  # upper bound is exclusive
        'gender': np.random.choice(['M', 'F'], n),
        'tenure': np.random.uniform(0, 60, n),
        'monthly_charge': np.random.uniform(20, 200, n),
        'total_charges': np.random.uniform(100, 10000, n),
        'contract': np.random.choice(['Month-to-month', 'One year', 'Two year'], n),
        'payment_method': np.random.choice(['Electronic', 'Mailed', 'Bank'], n),
        'usage_minutes': np.random.uniform(0, 2000, n),
        'support_calls': np.random.randint(0, 10, n),
        'churn': np.random.choice([0, 1], n, p=[0.7, 0.3]),
    }

    return pd.DataFrame(data)

# Build the dataset, then show its (rows, columns) shape and first rows.
df = prepare_project_data()
print(f"数据形状: {df.shape}")
print(df.head())

三、特征工程

3.1 特征处理

代码示例（Python）：

# Feature engineering
def feature_engineering(df):
    """Encode the categorical columns and split features from the label.

    ``contract``, ``payment_method`` and ``gender`` are each mapped to
    integer codes 0..k-1 assigned in sorted order of the distinct values
    present in ``df``. This is the same numbering
    ``sklearn.preprocessing.LabelEncoder`` produces, but without using a
    target-label encoder on input features.

    Args:
        df: DataFrame containing the columns ``age``, ``tenure``,
            ``monthly_charge``, ``total_charges``, ``contract``,
            ``payment_method``, ``gender``, ``usage_minutes``,
            ``support_calls`` and ``churn``. It is not modified.

    Returns:
        Tuple ``(X, y)``: ``X`` is a DataFrame with the nine model
        features, ``y`` is the ``churn`` column.

    Raises:
        KeyError: If a required column is missing from ``df``.
        ValueError: If a categorical column contains missing values.

    Note:
        The code assigned to a category depends on which values occur in
        ``df``. Data scored later must contain the same set of
        categories to receive the same codes.
    """
    # Source column -> name of the integer-coded column added for it.
    categorical_columns = {
        'contract': 'contract_encoded',
        'payment_method': 'payment_encoded',
        'gender': 'gender_encoded',
    }

    df_processed = df.copy()

    for source, target in categorical_columns.items():
        column = df[source]
        # Fail loudly rather than silently producing NaN codes.
        if column.isna().any():
            raise ValueError(
                f"column {source!r} contains missing values; "
                "impute or drop them before encoding"
            )
        codes = {value: code for code, value in enumerate(sorted(column.unique()))}
        df_processed[target] = column.map(codes).astype('int64')

    # Select features
    features = ['age', 'tenure', 'monthly_charge', 'total_charges',
                'contract_encoded', 'payment_encoded', 'gender_encoded',
                'usage_minutes', 'support_calls']

    X = df_processed[features]
    y = df_processed['churn']

    return X, y

# Derive the feature matrix and label from the dataset built above.
X, y = feature_engineering(df)
print(f"特征形状: {X.shape}")

四、模型训练

4.1 XGBoost训练

代码示例（Python）：

# XGBoost model training
def train_xgboost_model(X, y, test_size=0.2, random_state=42):
    """Train an XGBoost classifier and report hold-out metrics.

    The data is split with stratification on ``y``. A classifier with
    100 trees, learning rate 0.1 and max depth 5 is fitted on the
    training part. Accuracy and ROC AUC on the held-out part are
    printed, followed by the feature-importance table.

    Args:
        X: Feature DataFrame; its ``columns`` label the importances.
        y: Binary target aligned with ``X``.
        test_size: Fraction of rows held out for evaluation (default 0.2).
        random_state: Seed used for both the split and the model
            (default 42).

    Returns:
        Tuple ``(model, importance)``: the fitted ``XGBClassifier`` and
        a DataFrame with columns ``feature`` and ``importance`` sorted
        by importance, highest first.
    """
    # Imported locally because the snippet has no module-level imports;
    # without these the function fails with NameError.
    import pandas as pd
    import xgboost as xgb
    from sklearn.metrics import accuracy_score, roc_auc_score
    from sklearn.model_selection import train_test_split

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )

    model = xgb.XGBClassifier(
        n_estimators=100,
        learning_rate=0.1,
        max_depth=5,
        random_state=random_state,
    )
    model.fit(X_train, y_train)

    # Hard labels feed accuracy; positive-class probabilities feed AUC.
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1]

    accuracy = accuracy_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_proba)

    print(f"XGBoost准确率: {accuracy:.4f}")
    print(f"XGBoost AUC: {auc:.4f}")

    importance = pd.DataFrame({
        'feature': X.columns,
        'importance': model.feature_importances_,
    }).sort_values('importance', ascending=False)

    # A real newline: the escaped form printed a literal backslash-n.
    print("\n特征重要性:")
    print(importance)

    return model, importance

# Train on the engineered features; keeps the model and importance table.
model, importance = train_xgboost_model(X, y)

五、模型优化

5.1 参数调优

代码示例（Python）：

# Hyperparameter tuning
def hyperparameter_tuning(X, y, param_grid=None, cv=5):
    """Grid-search XGBoost hyperparameters with cross-validated ROC AUC.

    20% of the rows are held out (stratified, ``random_state=42``). The
    grid search runs on the remaining 80% only. The best parameters and
    best cross-validation score are printed, followed by the best
    model's ROC AUC on the held-out rows.

    Args:
        X: Feature matrix.
        y: Binary target aligned with ``X``.
        param_grid: Mapping of ``XGBClassifier`` parameter names to
            candidate values. Defaults to a 3 x 3 x 3 grid over
            ``n_estimators``, ``learning_rate`` and ``max_depth``.
        cv: Number of cross-validation folds (default 5).

    Returns:
        The best estimator found by the search, refitted by
        ``GridSearchCV`` on the training part.
    """
    # Imported locally: ``xgb`` and ``train_test_split`` are not defined
    # at module level, so the original body failed with NameError.
    import xgboost as xgb
    from sklearn.metrics import roc_auc_score
    from sklearn.model_selection import GridSearchCV, train_test_split

    if param_grid is None:
        param_grid = {
            'n_estimators': [50, 100, 200],
            'learning_rate': [0.05, 0.1, 0.2],
            'max_depth': [3, 5, 7],
        }

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    model = xgb.XGBClassifier(random_state=42)
    # n_jobs=-1 runs the candidate fits in parallel on all cores.
    grid_search = GridSearchCV(model, param_grid, cv=cv, scoring='roc_auc', n_jobs=-1)
    grid_search.fit(X_train, y_train)

    print(f"最佳参数: {grid_search.best_params_}")
    print(f"最佳分数: {grid_search.best_score_:.4f}")

    best_model = grid_search.best_estimator_

    # Use the hold-out rows, which the search never saw, for an estimate
    # that is not biased by model selection.
    test_auc = roc_auc_score(y_test, best_model.predict_proba(X_test)[:, 1])
    print(f"测试集AUC: {test_auc:.4f}")

    return best_model

# Left commented out: the grid has 27 parameter combinations, each
# evaluated with 5-fold cross-validation. Uncomment to run the search.
# best_model = hyperparameter_tuning(X, y)
print("参数调优函数已准备")

六、模型部署

6.1 模型保存和加载

代码示例（Python）：

# Model deployment
def model_deployment(model, path='churn_model.pkl'):
    """Persist a model with joblib and load it back from disk.

    Args:
        model: The object to persist (for example a fitted classifier).
        path: Destination file; defaults to ``churn_model.pkl`` in the
            current working directory. An existing file is overwritten.

    Returns:
        The object loaded back from ``path``.
    """
    import joblib

    # Save the model
    joblib.dump(model, path)
    print(f"模型已保存为 {path}")

    # Load the model.
    # NOTE(security): joblib files are pickle-based, so loading one can
    # execute arbitrary code. Only load files from a trusted source.
    loaded_model = joblib.load(path)
    print("模型已加载")

    return loaded_model

# Left commented out because it writes churn_model.pkl to the working
# directory. Uncomment to save the trained model and reload it.
# deployed_model = model_deployment(model)
print("模型部署函数已准备")

七、总结与思考

7.1 核心要点

- 完整流程：数据准备 → 特征工程 → 模型训练 → 优化 → 部署
- XGBoost 应用：在真实项目中的应用方法
- 最佳实践：参数调优、特征工程、模型评估
- 工程化：模型保存、加载、部署

7.2 思考题

1. 如何在项目中应用 XGBoost？
2. 如何优化模型效果？
3. 如何部署和维护模型？

7.3 实践建议

- 从简单开始：先用默认参数
- 逐步优化：根据效果调整
- 工程化：考虑部署和维护
- 持续改进：根据反馈优化

下一节预告：我们将学习男女声音识别实战，完整讲解音频特征提取与分类模型构建的案例。