1. Data Generation and Understanding
First, we generate a credit card fraud detection dataset with 10,000 records, a typical binary classification problem.
```python
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from xgboost import XGBClassifier
import warnings

warnings.filterwarnings('ignore')

# Fix the random seed for reproducibility
np.random.seed(42)

# Generate 10,000 records
n_samples = 10000

# Generate the feature columns
data = {
    'age': np.random.normal(45, 15, n_samples).astype(int),
    'income': np.random.normal(50000, 20000, n_samples),
    'credit_score': np.random.normal(650, 100, n_samples),
    'account_balance': np.random.exponential(10000, n_samples),
    'transaction_amount': np.random.gamma(2, 100, n_samples),
    'transaction_frequency': np.random.poisson(10, n_samples),
    'days_since_last_transaction': np.random.exponential(30, n_samples),
    'number_of_credit_cards': np.random.randint(1, 6, n_samples),
    'loan_balance': np.random.exponential(5000, n_samples),
    'payment_delinquency': np.random.poisson(0.5, n_samples)
}

# Build the DataFrame
df = pd.DataFrame(data)

# Clip values to realistic ranges
df['age'] = df['age'].clip(18, 80)
df['income'] = df['income'].clip(10000, 150000)
df['credit_score'] = df['credit_score'].clip(300, 850)
df['account_balance'] = df['account_balance'].clip(0, 50000)

print("Dataset overview:")
print(f"Shape: {df.shape}")
print("\nFirst 5 rows:")
print(df.head())
```
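Before moving on to cleaning, it is cheap insurance to verify that the clipping above actually produced plausible ranges. A short sanity-check sketch, using only the `df` built above:

```python
# Sanity check (uses only the df created above): dtypes, plus min/max of the
# clipped columns, which should stay inside the clip bounds set earlier.
print(df.dtypes)
print(df[['age', 'income', 'credit_score', 'account_balance']].describe().loc[['min', 'max']])
```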
2. Data Cleaning
Data cleaning is one of the most critical steps in a machine learning project; it directly affects model performance.
```python
# 2.1 Check for missing values
print("Missing value check:")
print(df.isnull().sum())

# 2.2 Inject some missing values to demonstrate the cleaning workflow
# (not needed in a real project)
np.random.seed(42)
missing_indices = np.random.choice(df.index, size=100, replace=False)
df.loc[missing_indices, 'credit_score'] = np.nan
missing_indices = np.random.choice(df.index, size=50, replace=False)
df.loc[missing_indices, 'income'] = np.nan
print("\nAfter injecting missing values:")
print(df.isnull().sum())

# 2.3 Handle missing values
# Fill numeric features with the column median
numeric_columns = df.select_dtypes(include=[np.number]).columns
for col in numeric_columns:
    if df[col].isnull().sum() > 0:
        df[col] = df[col].fillna(df[col].median())
print("\nAfter handling missing values:")
print(df.isnull().sum())
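
# Alternative sketch (an assumption, not part of the original pipeline):
# sklearn's SimpleImputer performs the same median fill as a reusable fitted
# transformer, which is handy when the identical imputation must be replayed
# on new data at inference time. Demonstrated on a copy, since df is already
# filled by the loop above.
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='median')
_imputed_demo = pd.DataFrame(imputer.fit_transform(df[numeric_columns]),
                             columns=numeric_columns)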

# 2.4 Outlier handling
def handle_outliers(df, column):
    """Winsorize a column: clip values outside the 1.5 * IQR fences."""
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    # Clip outliers to the boundary values instead of dropping rows
    df[column] = np.where(df[column] < lower_bound, lower_bound, df[column])
    df[column] = np.where(df[column] > upper_bound, upper_bound, df[column])
    return df

# Apply outlier handling to the heavy-tailed numeric features
outlier_columns = ['income', 'account_balance', 'transaction_amount', 'loan_balance']
for col in outlier_columns:
    df = handle_outliers(df, col)
print("\nOutlier handling complete")

# 2.5 Create the target variable (simulate fraud via business rules)
# Combine several features so the label is learnable rather than pure noise
np.random.seed(42)
fraud_probability = (
    0.3 * (df['transaction_amount'] > df['transaction_amount'].quantile(0.9)).astype(int) +
    0.2 * (df['credit_score'] < 550).astype(int) +
    0.2 * (df['payment_delinquency'] > 2).astype(int) +
    0.15 * (df['days_since_last_transaction'] > 60).astype(int) +
    0.15 * np.random.random(n_samples)
)

# Binarize into fraud / non-fraud at the 85th percentile
df['is_fraud'] = (fraud_probability > fraud_probability.quantile(0.85)).astype(int)
print("\nTarget variable distribution:")
print(df['is_fraud'].value_counts())
print(f"Fraud ratio: {df['is_fraud'].mean():.3f}")
```
3. Exploratory Data Analysis and Feature Engineering
```python
# 3.1 Visualize the distributions
plt.figure(figsize=(15, 10))

plt.subplot(2, 3, 1)
sns.histplot(data=df, x='age', hue='is_fraud', multiple="stack")
plt.title('Age distribution')

plt.subplot(2, 3, 2)
sns.histplot(data=df, x='income', hue='is_fraud', multiple="stack")
plt.title('Income distribution')

plt.subplot(2, 3, 3)
sns.histplot(data=df, x='credit_score', hue='is_fraud', multiple="stack")
plt.title('Credit score distribution')

plt.subplot(2, 3, 4)
sns.boxplot(data=df, x='is_fraud', y='transaction_amount')
plt.title('Transaction amount by class')

plt.subplot(2, 3, 5)
sns.countplot(data=df, x='number_of_credit_cards', hue='is_fraud')
plt.title('Number of credit cards')

plt.subplot(2, 3, 6)
sns.heatmap(df.corr(), annot=True, fmt='.2f', cmap='coolwarm')
plt.title('Feature correlation heatmap')

plt.tight_layout()
plt.show()

# 3.2 Feature engineering
print("Starting feature engineering...")

# Create ratio features (the +1 guards against division by zero)
df['income_to_balance_ratio'] = df['income'] / (df['account_balance'] + 1)
df['transaction_to_balance_ratio'] = df['transaction_amount'] / (df['account_balance'] + 1)
df['credit_utilization'] = df['loan_balance'] / (df['income'] * 0.3 + 1)  # assume the credit limit is 30% of income
df['age_group'] = pd.cut(df['age'], bins=[18, 30, 45, 60, 80],
                         labels=['18-30', '31-45', '46-60', '61-80'],
                         include_lowest=True)  # without this, age 18 would fall outside the first bin

# One-hot encode the categorical feature; integer dummies keep corr() and
# downstream models happy
df = pd.get_dummies(df, columns=['age_group'], prefix='age_group', dtype=int)
print(f"Dataset shape after feature engineering: {df.shape}")
print(f"Feature list: {list(df.columns)}")
```
4. Feature Selection
```python
# 4.1 Separate features and target
X = df.drop('is_fraud', axis=1)
y = df['is_fraud']

# 4.2 Correlation-based feature screening
plt.figure(figsize=(12, 8))
correlation_with_target = df.corr()['is_fraud'].sort_values(ascending=False)
correlation_with_target.drop('is_fraud', inplace=True)

plt.subplot(1, 2, 1)
correlation_with_target.plot(kind='bar')
plt.title('Feature correlation with the target')
plt.xticks(rotation=45)

# Keep features with non-trivial correlation
high_corr_features = correlation_with_target[abs(correlation_with_target) > 0.02].index.tolist()
print(f"High-correlation features ({len(high_corr_features)}): {high_corr_features}")

# 4.3 Screening with XGBoost's built-in feature importance
# 70/15/15 train/validation/test split, stratified to preserve the fraud ratio
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp
)

# Train a basic XGBoost model to obtain feature importances
base_model = XGBClassifier(
    n_estimators=100,
    max_depth=3,
    learning_rate=0.1,
    random_state=42
)
base_model.fit(X_train, y_train)

# Plot the feature importances
plt.subplot(1, 2, 2)
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': base_model.feature_importances_
}).sort_values('importance', ascending=False)
sns.barplot(data=feature_importance.head(10), x='importance', y='feature')
plt.title('XGBoost feature importance (top 10)')
plt.tight_layout()
plt.show()

# Keep features above an importance threshold
important_features = feature_importance[feature_importance['importance'] > 0.01]['feature'].tolist()
print(f"Important features ({len(important_features)}): {important_features}")

# 4.4 Final feature set: the union of both screens
selected_features = list(set(high_corr_features + important_features))
print(f"Selected features ({len(selected_features)}): {selected_features}")

# The train/validation/test splits are indexed by selected_features in the next step
X_selected = X[selected_features]
```
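The importance-threshold screen above can also be expressed with sklearn's SelectFromModel as a reusable transformer. A hedged sketch, assuming the fitted `base_model` and the `X_train` split from above:

```python
# Sketch (assumption: base_model and X_train from above; recent sklearn, where
# prefit=True lets an already-fitted estimator be wrapped directly). This
# mirrors the manual importance > 0.01 filter.
from sklearn.feature_selection import SelectFromModel

selector = SelectFromModel(base_model, threshold=0.01, prefit=True)
sfm_features = X_train.columns[selector.get_support()].tolist()
print(f"SelectFromModel kept {len(sfm_features)} features: {sfm_features}")
```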
5. Data Preprocessing and Modeling
```python
# 5.1 Standardize the features
# (not strictly required for tree models, but kept so the same pipeline
# also serves scale-sensitive models)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train[selected_features])
X_val_scaled = scaler.transform(X_val[selected_features])
X_test_scaled = scaler.transform(X_test[selected_features])
print("Standardization complete")

# 5.2 Baseline XGBoost model
print("\nTraining the baseline XGBoost model...")
base_xgb = XGBClassifier(
    random_state=42,
    eval_metric='logloss'
)
base_xgb.fit(
    X_train_scaled, y_train,
    eval_set=[(X_val_scaled, y_val)],
    verbose=False
)

# Evaluate the baseline model
y_pred_base = base_xgb.predict(X_test_scaled)
y_pred_proba_base = base_xgb.predict_proba(X_test_scaled)[:, 1]
print("Baseline model performance:")
print(f"ROC-AUC: {roc_auc_score(y_test, y_pred_proba_base):.4f}")
print("\nClassification report:")
print(classification_report(y_test, y_pred_base))
```
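Since a validation set is already passed via `eval_set`, early stopping is a natural extension. A hedged sketch; note that in xgboost >= 1.6 `early_stopping_rounds` is a constructor argument rather than a `fit()` argument:

```python
# Sketch (assumption: xgboost >= 1.6). Training stops once validation logloss
# has not improved for 50 rounds, which guards against over-sized n_estimators.
es_xgb = XGBClassifier(
    n_estimators=1000,
    early_stopping_rounds=50,
    eval_metric='logloss',
    random_state=42
)
es_xgb.fit(X_train_scaled, y_train, eval_set=[(X_val_scaled, y_val)], verbose=False)
print(f"Best iteration: {es_xgb.best_iteration}")
```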
6. Hyperparameter Tuning
```python
# 6.1 Grid search
print("Starting hyperparameter tuning...")

# Define the parameter grid (3^5 = 243 combinations)
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 4, 5],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0]
}

xgb_tuned = XGBClassifier(random_state=42)

# A deliberately restricted grid to keep the runtime manageable
# (a real project can afford a larger one)
grid_search = GridSearchCV(
    estimator=xgb_tuned,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=3,
    n_jobs=-1,
    verbose=1
)
grid_search.fit(X_train_scaled, y_train)

print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validation score: {grid_search.best_score_:.4f}")

# 6.2 Refit with the best parameters
best_xgb = grid_search.best_estimator_

# Evaluate the tuned model
y_pred_tuned = best_xgb.predict(X_test_scaled)
y_pred_proba_tuned = best_xgb.predict_proba(X_test_scaled)[:, 1]
print("\nTuned model performance:")
print(f"ROC-AUC: {roc_auc_score(y_test, y_pred_proba_tuned):.4f}")
print("\nClassification report:")
print(classification_report(y_test, y_pred_tuned))
```
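When even the 243-combination grid is too slow, RandomizedSearchCV samples the same space instead of enumerating it. A minimal sketch under the same scoring and CV settings as above:

```python
# Sketch (assumption: the param_grid defined above). n_iter configurations are
# sampled uniformly from the lists, trading a small risk of missing the
# optimum for a large cut in runtime.
from sklearn.model_selection import RandomizedSearchCV

random_search = RandomizedSearchCV(
    estimator=XGBClassifier(random_state=42),
    param_distributions=param_grid,
    n_iter=30,
    scoring='roc_auc',
    cv=3,
    random_state=42,
    n_jobs=-1
)
random_search.fit(X_train_scaled, y_train)
print(f"Best parameters: {random_search.best_params_}")
```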
7. Model Evaluation and Visualization
```python
# 7.1 Side-by-side model comparison
plt.figure(figsize=(15, 10))

# Confusion matrices
plt.subplot(2, 3, 1)
cm_base = confusion_matrix(y_test, y_pred_base)
sns.heatmap(cm_base, annot=True, fmt='d', cmap='Blues')
plt.title('Baseline model confusion matrix')

plt.subplot(2, 3, 2)
cm_tuned = confusion_matrix(y_test, y_pred_tuned)
sns.heatmap(cm_tuned, annot=True, fmt='d', cmap='Blues')
plt.title('Tuned model confusion matrix')

# Feature importances of the tuned model
plt.subplot(2, 3, 3)
tuned_feature_importance = pd.DataFrame({
    'feature': selected_features,
    'importance': best_xgb.feature_importances_
}).sort_values('importance', ascending=True)
plt.barh(tuned_feature_importance['feature'], tuned_feature_importance['importance'])
plt.title('Tuned model feature importance')
plt.xlabel('Importance')

# ROC curves
from sklearn.metrics import roc_curve
plt.subplot(2, 3, 4)
fpr_base, tpr_base, _ = roc_curve(y_test, y_pred_proba_base)
fpr_tuned, tpr_tuned, _ = roc_curve(y_test, y_pred_proba_tuned)
plt.plot(fpr_base, tpr_base, label=f'Baseline (AUC = {roc_auc_score(y_test, y_pred_proba_base):.3f})')
plt.plot(fpr_tuned, tpr_tuned, label=f'Tuned (AUC = {roc_auc_score(y_test, y_pred_proba_tuned):.3f})')
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve comparison')
plt.legend()

# Learning curve
from sklearn.model_selection import learning_curve
plt.subplot(2, 3, 5)
train_sizes, train_scores, test_scores = learning_curve(
    best_xgb, X_train_scaled, y_train, cv=3,
    scoring='roc_auc', train_sizes=np.linspace(0.1, 1.0, 10)
)
train_scores_mean = np.mean(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
plt.plot(train_sizes, train_scores_mean, 'o-', label='Training score')
plt.plot(train_sizes, test_scores_mean, 'o-', label='Validation score')
plt.xlabel('Number of training samples')
plt.ylabel('ROC-AUC')
plt.title('Learning curve')
plt.legend()

plt.tight_layout()
plt.show()

# 7.2 Performance summary
print("=" * 50)
print("Model performance summary")
print("=" * 50)
print(f"Baseline ROC-AUC: {roc_auc_score(y_test, y_pred_proba_base):.4f}")
print(f"Tuned ROC-AUC: {roc_auc_score(y_test, y_pred_proba_tuned):.4f}")
print(f"Improvement: {roc_auc_score(y_test, y_pred_proba_tuned) - roc_auc_score(y_test, y_pred_proba_base):.4f}")
```
8. Model Deployment and Inference
```python
# 8.1 Persist the model and preprocessing objects
import joblib

model_artifacts = {
    'model': best_xgb,
    'scaler': scaler,
    'feature_names': selected_features
}
joblib.dump(model_artifacts, 'xgboost_fraud_detection_model.pkl')
print("Model and preprocessing objects saved")

# 8.2 Inference function
def predict_fraud(new_data, model_path='xgboost_fraud_detection_model.pkl'):
    """Run fraud prediction on new data."""
    # Load the artifacts
    artifacts = joblib.load(model_path)
    model = artifacts['model']
    scaler = artifacts['scaler']
    feature_names = artifacts['feature_names']

    # Make sure the input contains every required feature
    missing_features = set(feature_names) - set(new_data.columns)
    if missing_features:
        raise ValueError(f"Missing features: {missing_features}")

    # Select the features and apply the same preprocessing as training
    X_new = new_data[feature_names]
    X_new_scaled = scaler.transform(X_new)

    # Predict
    predictions = model.predict(X_new_scaled)
    probabilities = model.predict_proba(X_new_scaled)[:, 1]
    return predictions, probabilities

# 8.3 Smoke-test the inference function
print("\nTesting the inference function...")
test_sample = X_test[selected_features].head(3)
predictions, probabilities = predict_fraud(test_sample)
print("Predictions for the test samples:")
for i, (pred, prob) in enumerate(zip(predictions, probabilities)):
    status = "fraud" if pred == 1 else "normal"
    print(f"Sample {i+1}: {status} (probability: {prob:.4f})")
```