Day 42: Review Day

@浙大疏锦行

Credit-risk prediction code, extended with a neural network and improved with AI assistance.

Python:
# ===================== 1. Import core libraries =====================
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
import warnings
warnings.filterwarnings("ignore")

# Deep learning (TensorFlow/Keras)
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

# Machine learning utilities (scikit-learn)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
                             classification_report, confusion_matrix)

# ===================== 2. Global configuration (reproducibility) =====================
# Matplotlib settings (SimHei supports CJK text; render minus signs correctly)
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

# Random seeds for reproducibility
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

# ===================== 3. Data loading and basic checks =====================
def load_data(file_path):
    """加载数据并做基础检查"""
    data = pd.read_csv(file_path)
    print("=== 数据基础信息 ===")
    print(f"数据形状: {data.shape}")
    print(f"\n缺失值统计:\n{data.isnull().sum()[data.isnull().sum()>0]}")
    print(f"\n数据类型:\n{data.dtypes[:5]}")  # 展示前5列类型
    return data

# Data path (a raw string avoids backslash-escape issues)
file_path = r'E:\study\PythonStudy\python60-days-challenge-master\data.csv'
data = load_data(file_path)
data_raw = data.copy()  # keep a backup of the raw data

# ===================== 4. Data preprocessing (tailored to the neural network) =====================
def preprocess_data(data):
    """Preprocessing: encode categorical features + impute missing values + standardize"""
    df = data.copy()
    
    # ---- 4.1 Encode categorical features ----
    # 1) Label-encode Home Ownership
    home_ownership_mapping = {'Own Home': 1, 'Rent': 2, 'Have Mortgage': 3, 'Home Mortgage': 4}
    df['Home Ownership'] = df['Home Ownership'].map(home_ownership_mapping)
    
    # 2) Label-encode Years in current job
    years_mapping = {'< 1 year': 1, '1 year': 2, '2 years': 3, '3 years': 4, '4 years': 5,
                     '5 years': 6, '6 years': 7, '7 years': 8, '8 years': 9, '9 years': 10, '10+ years': 11}
    df['Years in current job'] = df['Years in current job'].map(years_mapping)
    
    # 3) Binary-encode Term, then rename the column
    term_mapping = {'Short Term': 0, 'Long Term': 1}
    df['Term'] = df['Term'].map(term_mapping)
    df.rename(columns={'Term': 'Long Term'}, inplace=True)
    
    # 4) One-hot encode Purpose (as numeric dummy columns)
    df = pd.get_dummies(df, columns=['Purpose'], dtype=int)
    
    # ---- 4.2 Missing-value imputation (neural networks are more sensitive to missing values) ----
    # Separate continuous and categorical features
    continuous_features = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
    continuous_features.remove('Credit Default')  # exclude the label column
    
    # Continuous features: median imputation (robust to outliers), standardized below
    for feat in continuous_features:
        median_val = df[feat].median()
        df[feat] = df[feat].fillna(median_val)
    
    # Categorical features (if any): mode imputation (none remain after the encoding above)
    categorical_features = df.select_dtypes(include=['object']).columns.tolist()
    for feat in categorical_features:
        mode_val = df[feat].mode()[0]
        df[feat] = df[feat].fillna(mode_val)
    
    # ---- 4.3 Feature standardization (a core preprocessing step for neural networks) ----
    scaler = StandardScaler()
    df[continuous_features] = scaler.fit_transform(df[continuous_features])
    
    return df, scaler

# Run preprocessing
data_processed, scaler = preprocess_data(data)

# ===================== 5. Train/test split =====================
def split_dataset(df):
    """Split into training and test sets (80/20)"""
    X = df.drop(['Credit Default'], axis=1)
    y = df['Credit Default']
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=SEED, stratify=y  # stratify keeps the label distribution consistent
    )
    print(f"\n=== 数据集划分 ===")
    print(f"训练集形状: X={X_train.shape}, y={y_train.shape}")
    print(f"测试集形状: X={X_test.shape}, y={y_test.shape}")
    print(f"训练集标签分布: {y_train.value_counts(normalize=True).round(3)}")
    return X_train, X_test, y_train, y_test

X_train, X_test, y_train, y_test = split_dataset(data_processed)

# ===================== 6. Baseline model: random forest (kept for comparison) =====================
def train_random_forest(X_train, X_test, y_train, y_test):
    """Train a random forest with default parameters as the baseline"""
    print("\n=== 1. Baseline model: random forest ===")
    start_time = time.time()
    
    # Train the model
    rf_model = RandomForestClassifier(random_state=SEED, n_jobs=-1)  # n_jobs=-1 parallelizes training
    rf_model.fit(X_train, y_train)
    rf_pred = rf_model.predict(X_test)
    
    # Timing
    cost_time = time.time() - start_time
    
    # Evaluation metrics
    print(f"Train + predict time: {cost_time:.4f} s")
    print("\nClassification report:")
    print(classification_report(y_test, rf_pred))
    
    # Confusion matrix
    cm_rf = confusion_matrix(y_test, rf_pred)
    
    return rf_model, rf_pred, cm_rf

# Train the random forest baseline
rf_model, rf_pred, cm_rf = train_random_forest(X_train, X_test, y_train, y_test)

# ===================== 7. Main model: neural network (MLP) =====================
def build_nn_model(input_dim):
    """Build a multilayer perceptron (MLP) classifier"""
    model = Sequential([
        # Explicit input layer, then hidden layer 1 (batch norm + dropout to curb overfitting)
        Input(shape=(input_dim,)),
        Dense(128, activation='relu', kernel_initializer='he_normal'),
        BatchNormalization(),
        Dropout(0.3),
        
        # Hidden layer 2
        Dense(64, activation='relu', kernel_initializer='he_normal'),
        BatchNormalization(),
        Dropout(0.2),
        
        # Hidden layer 3
        Dense(32, activation='relu', kernel_initializer='he_normal'),
        BatchNormalization(),
        Dropout(0.1),
        
        # Output layer (binary classification)
        Dense(1, activation='sigmoid')
    ])
    
    # Compile for the binary classification task
    optimizer = Adam(learning_rate=0.001)
    model.compile(
        optimizer=optimizer,
        loss='binary_crossentropy',  # binary cross-entropy for the 0/1 target
        metrics=['accuracy', 
                 tf.keras.metrics.Precision(name='precision'),
                 tf.keras.metrics.Recall(name='recall')]
    )
    
    return model

# Build the model
input_dim = X_train.shape[1]
nn_model = build_nn_model(input_dim)
print("\n=== 2. 神经网络模型结构 ===")
nn_model.summary()  # 打印模型结构

# Train the neural network (with early stopping and learning-rate reduction)
def train_nn_model(model, X_train, y_train):
    """Train the network with callbacks that curb overfitting"""
    print("\n=== Training the neural network ===")
    start_time = time.time()
    
    # Callbacks
    early_stopping = EarlyStopping(
        monitor='val_loss', patience=10, restore_best_weights=True, verbose=1
    )
    reduce_lr = ReduceLROnPlateau(
        monitor='val_loss', factor=0.5, patience=5, verbose=1, min_lr=1e-6
    )
    
    # Fit
    history = model.fit(
        X_train, y_train,
        epochs=100,
        batch_size=32,
        validation_split=0.1,  # hold out 10% of the training data for validation
        callbacks=[early_stopping, reduce_lr],
        verbose=1
    )
    
    # Timing
    cost_time = time.time() - start_time
    print(f"Neural network training time: {cost_time:.4f} s")
    
    return model, history

# Train the network
nn_model, history = train_nn_model(nn_model, X_train, y_train)

# ===================== 8. Neural network evaluation =====================
def evaluate_nn_model(model, X_test, y_test):
    """Evaluate the neural network on the test set"""
    # Predict (probabilities -> labels with a 0.5 threshold)
    nn_pred_prob = model.predict(X_test, verbose=0)
    nn_pred = (nn_pred_prob > 0.5).astype(int).flatten()
    
    # Evaluation metrics
    print("\n=== Neural network test-set evaluation ===")
    print("Classification report:")
    print(classification_report(y_test, nn_pred))
    
    # Confusion matrix
    cm_nn = confusion_matrix(y_test, nn_pred)
    
    # Keras built-in evaluation on the test set
    test_loss, test_acc, test_precision, test_recall = model.evaluate(X_test, y_test, verbose=0)
    print(f"\nTest loss: {test_loss:.4f}")
    print(f"Test accuracy: {test_acc:.4f}")
    print(f"Test precision: {test_precision:.4f}")
    print(f"Test recall: {test_recall:.4f}")
    
    return nn_pred, cm_nn

# Evaluate on the test set
nn_pred, cm_nn = evaluate_nn_model(nn_model, X_test, y_test)

# ===================== 9. Visualization =====================
def plot_results(history, cm_rf, cm_nn, y_test, rf_pred, nn_pred):
    """Plot training curves and evaluation results"""
    fig, axes = plt.subplots(2, 2, figsize=(16, 12))
    
    # ---- 9.1 Neural network training curves: loss ----
    axes[0,0].plot(history.history['loss'], label='Training loss', color='blue')
    axes[0,0].plot(history.history['val_loss'], label='Validation loss', color='red')
    axes[0,0].set_title('Neural network training/validation loss', fontsize=12)
    axes[0,0].set_xlabel('Epoch')
    axes[0,0].set_ylabel('Loss')
    axes[0,0].legend()
    axes[0,0].grid(True)
    
    # ---- 9.2 Neural network training curves: accuracy ----
    axes[0,1].plot(history.history['accuracy'], label='Training accuracy', color='blue')
    axes[0,1].plot(history.history['val_accuracy'], label='Validation accuracy', color='red')
    axes[0,1].set_title('Neural network training/validation accuracy', fontsize=12)
    axes[0,1].set_xlabel('Epoch')
    axes[0,1].set_ylabel('Accuracy')
    axes[0,1].legend()
    axes[0,1].grid(True)
    
    # ---- 9.3 随机森林混淆矩阵热力图 ----
    sns.heatmap(cm_rf, annot=True, fmt='d', cmap='Blues', ax=axes[1,0],
                xticklabels=['无违约', '违约'], yticklabels=['无违约', '违约'])
    axes[1,0].set_title('随机森林 混淆矩阵', fontsize=12)
    axes[1,0].set_xlabel('预测标签')
    axes[1,0].set_ylabel('真实标签')
    
    # ---- 9.4 Neural network confusion-matrix heatmap ----
    sns.heatmap(cm_nn, annot=True, fmt='d', cmap='Reds', ax=axes[1,1],
                xticklabels=['No default', 'Default'], yticklabels=['No default', 'Default'])
    axes[1,1].set_title('Neural network confusion matrix', fontsize=12)
    axes[1,1].set_xlabel('Predicted label')
    axes[1,1].set_ylabel('True label')
    
    plt.tight_layout()
    plt.savefig(r'E:\study\PythonStudy\信贷风险预测结果对比.png', dpi=300, bbox_inches='tight')
    plt.show()
    
    # ---- 9.5 Model performance comparison table ----
    def get_metrics(y_true, y_pred):
        return {
            'Accuracy': accuracy_score(y_true, y_pred),
            'Precision': precision_score(y_true, y_pred),
            'Recall': recall_score(y_true, y_pred),
            'F1 score': f1_score(y_true, y_pred)
        }
    
    rf_metrics = get_metrics(y_test, rf_pred)
    nn_metrics = get_metrics(y_test, nn_pred)
    
    metrics_df = pd.DataFrame({
        'Metric': list(rf_metrics.keys()),
        'Random forest': [round(v, 4) for v in rf_metrics.values()],
        'Neural network': [round(v, 4) for v in nn_metrics.values()]
    })
    print("\n=== Model performance comparison ===")
    print(metrics_df)

# Plot the results
plot_results(history, cm_rf, cm_nn, y_test, rf_pred, nn_pred)

# ===================== 10. Model saving (optional) =====================
def save_models(nn_model, scaler):
    """Save the neural network and the fitted scaler"""
    # Save the neural network
    nn_model.save(r'E:\study\PythonStudy\credit_risk_nn_model.h5')
    # Save the scaler (needed to preprocess future data identically)
    import joblib
    joblib.dump(scaler, r'E:\study\PythonStudy\credit_risk_scaler.pkl')
    print("\nModel and scaler saved!")

# Save if needed
# save_models(nn_model, scaler)
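
As a follow-up to section 10, here is a minimal inference sketch (not part of the original workflow): load_and_score is a hypothetical helper that assumes save_models() has already been run with the paths above, and it expects input rows encoded and scaled exactly like X_test. It only uses the standard Keras load_model and joblib.load calls.

Python:
def load_and_score(X_ready,
                   model_path=r'E:\study\PythonStudy\credit_risk_nn_model.h5',
                   scaler_path=r'E:\study\PythonStudy\credit_risk_scaler.pkl'):
    """Reload the saved network (and scaler) and return default probabilities.

    X_ready must already be preprocessed as in preprocess_data(), e.g. rows of X_test.
    For raw continuous features, apply the reloaded scaler's .transform() first.
    """
    from tensorflow.keras.models import load_model
    import joblib
    model = load_model(model_path)           # assumes save_models() has created this file
    _scaler = joblib.load(scaler_path)       # kept available for transforming raw continuous features
    return model.predict(X_ready, verbose=0).flatten()

# Example (after running save_models(nn_model, scaler)):
# print(load_and_score(X_test.iloc[:5]))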