# 信贷风险预测代码,考虑神经网络,借助AI改进
# python
# ===================== 1. 导入核心库 =====================
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
import warnings
warnings.filterwarnings("ignore")
# 深度学习库
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
# 机器学习工具
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
classification_report, confusion_matrix)
# ===================== 2. 全局配置(保证可复现) =====================
# Plot styling: use the SimHei font so the Chinese titles/labels in the figures
# can be rendered, and turn off unicode_minus (presumably so minus signs still
# display with that font -- confirm on the target machine).
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})
# A single seed shared by NumPy, TensorFlow and the scikit-learn calls below.
SEED = 42
tf.random.set_seed(SEED)
np.random.seed(SEED)
# ===================== 3. 数据加载与基础检查 =====================
def load_data(file_path):
    """Read a CSV file and print a short sanity report about it.

    The report contains the frame's shape, the missing-value count of every
    column that has at least one missing value, and the dtypes of the first
    five columns.

    Args:
        file_path: Location of the CSV file, passed straight to
            ``pandas.read_csv``.

    Returns:
        The loaded ``pandas.DataFrame``, unmodified.
    """
    data = pd.read_csv(file_path)

    missing_counts = data.isnull().sum()
    columns_with_gaps = missing_counts[missing_counts > 0]
    leading_dtypes = data.dtypes[:5]  # first five columns only, to keep the output short

    print("=== 数据基础信息 ===")
    print(f"数据形状: {data.shape}")
    print(f"\n缺失值统计:\n{columns_with_gaps}")
    print(f"\n数据类型:\n{leading_dtypes}")
    return data
# Dataset location (raw string so the Windows backslashes are not read as escapes)
file_path = r'E:\study\PythonStudy\python60-days-challenge-master\data.csv'
data = load_data(file_path)
data_raw = data.copy() # untouched copy of the loaded frame, kept as a backup
# ===================== 4. 数据预处理(适配神经网络) =====================
def preprocess_data(data):
    """Encode categorical columns, impute missing values and standardise features.

    Operates on a copy of ``data``; the caller's frame is not modified.

    Steps:
        1. Map ``Home Ownership`` and ``Years in current job`` to integer codes.
        2. Map ``Term`` to 0/1 and rename the column to ``Long Term``.
        3. One-hot encode ``Purpose`` into integer indicator columns.
        4. Fill missing values: column median for int64/float64 columns, most
           frequent value for any remaining object columns.
        5. Standardise the int64/float64 feature columns with ``StandardScaler``.

    The ``Credit Default`` label is excluded from imputation and scaling. The
    label column is optional, so the function can also be applied to unlabeled
    data.

    Category values missing from the mapping tables become NaN after ``map``
    and, like any other missing value, are replaced by the column median.

    NOTE(review): the medians and the scaler are fitted on the whole frame,
    i.e. before the train/test split done later in this script, so test-set
    statistics leak into training. Removing the leak means splitting first and
    fitting on the training part only, which changes the pipeline rather than
    just this function.

    Args:
        data: Raw DataFrame containing at least the columns ``Home Ownership``,
            ``Years in current job``, ``Term`` and ``Purpose``.

    Returns:
        Tuple ``(df, scaler)``: the processed DataFrame and the fitted
        ``StandardScaler``.
    """
    df = data.copy()
    label_col = 'Credit Default'

    # ---- Categorical encoding ----
    home_ownership_mapping = {'Own Home': 1, 'Rent': 2, 'Have Mortgage': 3, 'Home Mortgage': 4}
    years_mapping = {'< 1 year': 1, '1 year': 2, '2 years': 3, '3 years': 4, '4 years': 5,
                     '5 years': 6, '6 years': 7, '7 years': 8, '8 years': 9, '9 years': 10,
                     '10+ years': 11}
    term_mapping = {'Short Term': 0, 'Long Term': 1}

    df['Home Ownership'] = df['Home Ownership'].map(home_ownership_mapping)
    df['Years in current job'] = df['Years in current job'].map(years_mapping)
    df['Term'] = df['Term'].map(term_mapping)
    df.rename(columns={'Term': 'Long Term'}, inplace=True)
    df = pd.get_dummies(df, columns=['Purpose'], dtype=int)

    # ---- Missing-value imputation ----
    # Filter the label out instead of list.remove(): remove() raises ValueError
    # when the label column is absent or is not an int64/float64 column.
    continuous_features = [
        col for col in df.select_dtypes(include=['int64', 'float64']).columns
        if col != label_col
    ]
    # Median (robust to outliers) for every numeric feature in one pass.
    medians = df[continuous_features].median()
    df[continuous_features] = df[continuous_features].fillna(medians)

    # Any object columns still present get their most frequent value.
    for feat in df.select_dtypes(include=['object']).columns:
        modes = df[feat].mode()
        if modes.empty:
            # Column holds no non-missing value, so there is nothing to fill with.
            continue
        df[feat] = df[feat].fillna(modes.iloc[0])

    # ---- Standardisation ----
    scaler = StandardScaler()
    df[continuous_features] = scaler.fit_transform(df[continuous_features])
    return df, scaler
# Run preprocessing; the fitted scaler is kept so it can be saved later
data_processed, scaler = preprocess_data(data)
# ===================== 5. 数据集划分 =====================
def split_dataset(df):
    """Split a processed frame into stratified train and test sets (80% / 20%).

    The ``Credit Default`` column is the label; every other column is a
    feature. Stratifying on the label keeps the class ratio the same in both
    parts. Shapes and the training label distribution are printed.

    Args:
        df: Processed DataFrame that contains a ``Credit Default`` column.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``.
    """
    target = df['Credit Default']
    features = df.drop(columns=['Credit Default'])

    parts = train_test_split(
        features, target,
        test_size=0.2,
        random_state=SEED,
        stratify=target,
    )
    X_train, X_test, y_train, y_test = parts

    train_label_share = y_train.value_counts(normalize=True).round(3)
    print("\n=== 数据集划分 ===")
    print(f"训练集形状: X={X_train.shape}, y={y_train.shape}")
    print(f"测试集形状: X={X_test.shape}, y={y_test.shape}")
    print(f"训练集标签分布: {train_label_share}")
    return X_train, X_test, y_train, y_test
# Create the train/test partitions used by both models below
X_train, X_test, y_train, y_test = split_dataset(data_processed)
# ===================== 6. 基准模型:随机森林(保留对比) =====================
def train_random_forest(X_train, X_test, y_train, y_test):
    """Fit a default-parameter random forest as the baseline and report on it.

    Prints the combined fit + predict wall-clock time and the classification
    report for the test set.

    Args:
        X_train: Training features.
        X_test: Test features.
        y_train: Training labels.
        y_test: Test labels.

    Returns:
        Tuple ``(rf_model, rf_pred, cm_rf)``: the fitted classifier, its
        test-set predictions and the resulting confusion matrix.
    """
    print("\n=== 1. 基准模型:随机森林 ===")

    started_at = time.time()
    # n_jobs=-1 lets scikit-learn use every available core.
    rf_model = RandomForestClassifier(random_state=SEED, n_jobs=-1).fit(X_train, y_train)
    rf_pred = rf_model.predict(X_test)
    elapsed = time.time() - started_at

    print(f"训练+预测耗时: {elapsed:.4f} 秒")
    print("\n分类报告:")
    print(classification_report(y_test, rf_pred))

    return rf_model, rf_pred, confusion_matrix(y_test, rf_pred)
# Train and evaluate the random-forest baseline
rf_model, rf_pred, cm_rf = train_random_forest(X_train, X_test, y_train, y_test)
# ===================== 7. 核心模型:神经网络(MLP) =====================
def build_nn_model(input_dim):
    """Build and compile the MLP used for binary credit-default prediction.

    Architecture: three hidden blocks of Dense(relu, he_normal) ->
    BatchNormalization -> Dropout with 128/64/32 units and dropout rates
    0.3/0.2/0.1, followed by a single sigmoid output unit.

    The input shape is declared with an explicit ``tf.keras.Input`` rather
    than the ``input_dim`` keyword on the first Dense layer, which newer
    Keras versions warn against.

    Args:
        input_dim: Number of input features.

    Returns:
        A compiled ``Sequential`` model (Adam with learning rate 0.001, binary
        cross-entropy loss, metrics accuracy / precision / recall).
    """
    # (units, dropout rate) for each hidden block, in order.
    hidden_blocks = ((128, 0.3), (64, 0.2), (32, 0.1))

    layers = [tf.keras.Input(shape=(input_dim,))]
    for units, drop_rate in hidden_blocks:
        layers.extend([
            Dense(units, activation='relu', kernel_initializer='he_normal'),
            BatchNormalization(),
            Dropout(drop_rate),
        ])
    # Single sigmoid unit: outputs the probability of the positive class.
    layers.append(Dense(1, activation='sigmoid'))

    model = Sequential(layers)
    model.compile(
        optimizer=Adam(learning_rate=0.001),
        loss='binary_crossentropy',
        metrics=[
            'accuracy',
            tf.keras.metrics.Precision(name='precision'),
            tf.keras.metrics.Recall(name='recall'),
        ],
    )
    return model
# Build the network with one input per feature column
input_dim = X_train.shape[1]
nn_model = build_nn_model(input_dim)
print("\n=== 2. 神经网络模型结构 ===")
nn_model.summary() # print the layer-by-layer model structure
# 训练神经网络(带早停和学习率衰减)
def train_nn_model(model, X_train, y_train):
    """Fit the network with early stopping and learning-rate reduction.

    Trains for at most 100 epochs with batch size 32, holding out 10% of the
    training data for validation. Both callbacks watch ``val_loss``: training
    stops after 10 epochs without improvement (the best weights are restored),
    and the learning rate is halved after 5 such epochs, down to 1e-6 at most.
    The elapsed wall-clock time is printed.

    Args:
        model: Compiled Keras model.
        X_train: Training features.
        y_train: Training labels.

    Returns:
        Tuple ``(model, history)``: the trained model and the object returned
        by ``model.fit``.
    """
    print("\n=== 开始训练神经网络 ===")
    began = time.time()

    callbacks = [
        EarlyStopping(monitor='val_loss', patience=10,
                      restore_best_weights=True, verbose=1),
        ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5,
                          verbose=1, min_lr=1e-6),
    ]
    fit_options = {
        'epochs': 100,
        'batch_size': 32,
        'validation_split': 0.1,
        'callbacks': callbacks,
        'verbose': 1,
    }
    history = model.fit(X_train, y_train, **fit_options)

    elapsed = time.time() - began
    print(f"神经网络训练耗时: {elapsed:.4f} 秒")
    return model, history
# Run training; `history` holds the per-epoch metrics plotted later
nn_model, history = train_nn_model(nn_model, X_train, y_train)
# ===================== 8. 神经网络评估 =====================
def evaluate_nn_model(model, X_test, y_test, history=None):
    """Evaluate the trained network on the test set and print the results.

    Predicted probabilities are turned into labels with a 0.5 threshold. The
    classification report is printed, followed by the loss / accuracy /
    precision / recall reported by ``model.evaluate``.

    Args:
        model: Trained Keras model. ``model.evaluate`` must yield exactly four
            values (loss, accuracy, precision, recall), i.e. the model has to
            be compiled with those three metrics.
        X_test: Test features.
        y_test: Test labels.
        history: Optional training history to hand back to the caller. When
            omitted, the history Keras attaches to the model during ``fit``
            (``model.history``) is used, or ``None`` if the model has none.
            Previously the function returned a module-level ``history``
            variable it never received, which only worked when such a global
            happened to exist.

    Returns:
        Tuple ``(nn_pred, cm_nn, history)``: predicted labels as a flat int
        array, the confusion matrix, and the training history described above.
    """
    # Probabilities -> hard labels (threshold 0.5), flattened to 1-D.
    nn_pred_prob = model.predict(X_test, verbose=0)
    nn_pred = (nn_pred_prob > 0.5).astype(int).flatten()

    print("\n=== 神经网络测试集评估 ===")
    print("分类报告:")
    print(classification_report(y_test, nn_pred))

    cm_nn = confusion_matrix(y_test, nn_pred)

    test_loss, test_acc, test_precision, test_recall = model.evaluate(X_test, y_test, verbose=0)
    print(f"\n测试集Loss: {test_loss:.4f}")
    print(f"测试集Accuracy: {test_acc:.4f}")
    print(f"测试集Precision: {test_precision:.4f}")
    print(f"测试集Recall: {test_recall:.4f}")

    if history is None:
        # Resolve the history from the model itself instead of a hidden global.
        history = getattr(model, 'history', None)
    return nn_pred, cm_nn, history
# Evaluate on the held-out test set; the returned history is rebound to `history`
nn_pred, cm_nn, history = evaluate_nn_model(nn_model, X_test, y_test)
# ===================== 9. 结果可视化 =====================
def plot_results(history, cm_rf, cm_nn, y_test, rf_pred, nn_pred,
                 save_path=r'E:\study\PythonStudy\信贷风险预测结果对比.png'):
    """Plot training curves and confusion matrices, then print a metric table.

    Draws a 2x2 figure: the network's training/validation loss and accuracy
    curves on the top row, and the random-forest and neural-network confusion
    matrices as heatmaps on the bottom row. The figure is optionally saved and
    then shown. Finally a table comparing accuracy, precision, recall and F1
    of both models on the test set is printed.

    Args:
        history: Training history whose ``.history`` dict has the keys
            ``loss``, ``val_loss``, ``accuracy`` and ``val_accuracy``.
        cm_rf: Confusion matrix of the random forest.
        cm_nn: Confusion matrix of the neural network.
        y_test: True test labels.
        rf_pred: Random-forest predictions for the test set.
        nn_pred: Neural-network predictions for the test set.
        save_path: Where to write the figure. Defaults to the location that
            was previously hard-coded; pass ``None`` to skip saving.
    """
    _, axes = plt.subplots(2, 2, figsize=(16, 12))

    # ---- Top row: training vs. validation curves ----
    curve_panels = (
        (axes[0, 0], 'loss', 'Loss'),
        (axes[0, 1], 'accuracy', 'Accuracy'),
    )
    for ax, key, label in curve_panels:
        ax.plot(history.history[key], label=f'训练{label}', color='blue')
        ax.plot(history.history[f'val_{key}'], label=f'验证{label}', color='red')
        ax.set_title(f'神经网络训练/验证{label}曲线', fontsize=12)
        ax.set_xlabel('Epoch')
        ax.set_ylabel(label)
        ax.legend()
        ax.grid(True)

    # ---- Bottom row: confusion-matrix heatmaps ----
    class_names = ['无违约', '违约']
    matrix_panels = (
        (axes[1, 0], cm_rf, 'Blues', '随机森林 混淆矩阵'),
        (axes[1, 1], cm_nn, 'Reds', '神经网络 混淆矩阵'),
    )
    for ax, matrix, cmap, title in matrix_panels:
        sns.heatmap(matrix, annot=True, fmt='d', cmap=cmap, ax=ax,
                    xticklabels=class_names, yticklabels=class_names)
        ax.set_title(title, fontsize=12)
        ax.set_xlabel('预测标签')
        ax.set_ylabel('真实标签')

    plt.tight_layout()
    if save_path is not None:
        plt.savefig(save_path, dpi=300, bbox_inches='tight')
    plt.show()

    # ---- Side-by-side metric table ----
    metric_fns = {
        '准确率': accuracy_score,
        '精确率': precision_score,
        '召回率': recall_score,
        'F1分数': f1_score,
    }
    metrics_df = pd.DataFrame({
        '指标': list(metric_fns),
        '随机森林': [round(fn(y_test, rf_pred), 4) for fn in metric_fns.values()],
        '神经网络': [round(fn(y_test, nn_pred), 4) for fn in metric_fns.values()],
    })
    print("\n=== 模型性能对比 ===")
    print(metrics_df)
# Draw the comparison figure and print the metric table
plot_results(history, cm_rf, cm_nn, y_test, rf_pred, nn_pred)
# ===================== 10. 模型保存(可选) =====================
def save_models(nn_model, scaler,
                model_path=r'E:\study\PythonStudy\credit_risk_nn_model.h5',
                scaler_path=r'E:\study\PythonStudy\credit_risk_scaler.pkl'):
    """Persist the trained network and the fitted scaler to disk.

    Args:
        nn_model: Trained Keras model; written with ``nn_model.save``.
        scaler: Fitted scaler, needed to transform new data the same way at
            prediction time; written with ``joblib.dump``.
        model_path: Destination of the model file. Defaults to the location
            that was previously hard-coded.
        scaler_path: Destination of the scaler file. Defaults to the location
            that was previously hard-coded.
    """
    # Imported locally (only needed when saving) and before anything is
    # written, so a missing joblib cannot leave a model file without a scaler.
    import joblib

    nn_model.save(model_path)
    joblib.dump(scaler, scaler_path)
    print("\n模型和标准化器已保存!")
# 执行保存(如需)
# save_models(nn_model, scaler)