Day43 PythonStudy

@浙大疏锦行

python 复制代码
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import time
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler

warnings.filterwarnings("ignore")

# 设置中文字体
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

# 检查GPU是否可用
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# 读取数据
data = pd.read_csv('E:\PyStudy\data.csv')

# 预处理代码(保持不变)
discrete_features = data.select_dtypes(include=['object']).columns.tolist()

# Home Ownership 标签编码
home_ownership_mapping = {
    'Own Home': 1,
    'Rent': 2,
    'Have Mortgage': 3,
    'Home Mortgage': 4
}
data['Home Ownership'] = data['Home Ownership'].map(home_ownership_mapping)

# Years in current job 标签编码
years_in_job_mapping = {
    '< 1 year': 1,
    '1 year': 2,
    '2 years': 3,
    '3 years': 4,
    '4 years': 5,
    '5 years': 6,
    '6 years': 7,
    '7 years': 8,
    '8 years': 9,
    '9 years': 10,
    '10+ years': 11
}
data['Years in current job'] = data['Years in current job'].map(years_in_job_mapping)

# Purpose 独热编码
data = pd.get_dummies(data, columns=['Purpose'])
data2 = pd.read_csv("data.csv")
list_final = []
for i in data.columns:
    if i not in data2.columns:
        list_final.append(i)
for i in list_final:
    data[i] = data[i].astype(int)

# Term 0 - 1 映射
term_mapping = {
    'Short Term': 0,
    'Long Term': 1
}
data['Term'] = data['Term'].map(term_mapping)
data.rename(columns={'Term': 'Long Term'}, inplace=True)

# 连续特征用中位数补全
continuous_features = data.select_dtypes(include=['int64', 'float64']).columns.tolist()
for feature in continuous_features:
    mode_value = data[feature].mode()[0]
    data[feature].fillna(mode_value, inplace=True)

# 划分数据集
X = data.drop(['Credit Default'], axis=1)
y = data['Credit Default']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 数据标准化(对神经网络很重要)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 转换为PyTorch张量并移动到GPU
X_train_tensor = torch.FloatTensor(X_train_scaled).to(device)
y_train_tensor = torch.LongTensor(y_train.values).to(device)
X_test_tensor = torch.FloatTensor(X_test_scaled).to(device)
y_test_tensor = torch.LongTensor(y_test.values).to(device)

# 创建数据集和数据加载器(可选)
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

input_size = X_train_scaled.shape[1]

class MLP(nn.Module): 
    def __init__(self, input_dim, hidden_dim=128, output_dim=2):  # 二分类输出2
        super(MLP, self).__init__()
        
        # 动态设置输入维度
        self.fc1 = nn.Linear(input_dim, hidden_dim)  # 输入层到隐藏层
        self.bn1 = nn.BatchNorm1d(hidden_dim)  # 批归一化
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.3)  # Dropout防止过拟合
        
        self.fc2 = nn.Linear(hidden_dim, hidden_dim // 2)
        self.bn2 = nn.BatchNorm1d(hidden_dim // 2)
        
        self.fc3 = nn.Linear(hidden_dim // 2, hidden_dim // 4)
        
        self.fc4 = nn.Linear(hidden_dim // 4, output_dim)  # 输出层

    def forward(self, x):
        out = self.fc1(x)
        out = self.bn1(out)
        out = self.relu(out)
        out = self.dropout(out)
        
        out = self.fc2(out)
        out = self.bn2(out)
        out = self.relu(out)
        out = self.dropout(out)
        
        out = self.fc3(out)
        out = self.relu(out)
        
        out = self.fc4(out)
        return out

# 定义早停类
class EarlyStopping:
    def __init__(self, patience=10, min_delta=0, save_path='best_model.pth'):
        """
        Args:
            patience: 容忍多少个epoch没有改善
            min_delta: 最小改善量
            save_path: 最佳模型保存路径
        """
        self.patience = patience
        self.min_delta = min_delta
        self.save_path = save_path
        self.counter = 0
        self.best_loss = None
        self.early_stop = False
        
    def __call__(self, val_loss, model):
        if self.best_loss is None:
            self.best_loss = val_loss
            self.save_checkpoint(model)
        elif val_loss > self.best_loss - self.min_delta:
            self.counter += 1
            print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_loss = val_loss
            self.save_checkpoint(model)
            self.counter = 0
            
    def save_checkpoint(self, model):
        """保存最佳模型"""
        torch.save(model.state_dict(), self.save_path)
        print(f'Validation loss decreased. Saving model to {self.save_path}')

# 模型路径
model_path = 'credit_model.pth'
best_model_path = 'best_credit_model.pth'

# 检查是否有已保存的模型
if os.path.exists(model_path):
    print("加载已保存的模型权重...")
    model = MLP(input_dim=input_size).to(device)
    model.load_state_dict(torch.load(model_path))
    print("模型权重加载成功!")
else:
    print("训练新模型...")
    # 实例化模型并移动到GPU
    model = MLP(input_dim=input_size).to(device)
    
    # 分类问题使用交叉熵损失函数
    criterion = nn.CrossEntropyLoss()
    
    # 使用Adam优化器(通常比SGD更好)
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    
    # 初始化早停
    early_stopping = EarlyStopping(patience=15, min_delta=0.001, save_path=best_model_path)
    
    # 训练模型
    num_epochs = 20000
    losses = []
    val_losses = []
    
    start_time = time.time()
    
    for epoch in range(num_epochs):
        # 训练模式
        model.train()
        train_loss = 0
        
        # 使用数据加载器进行批处理
        for batch_X, batch_y in train_loader:
            optimizer.zero_grad()
            
            # 前向传播
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            
            # 反向传播和优化
            loss.backward()
            optimizer.step()
            
            train_loss += loss.item()
        
        avg_train_loss = train_loss / len(train_loader)
        losses.append(avg_train_loss)
        
        # 验证模式
        model.eval()
        with torch.no_grad():
            val_outputs = model(X_test_tensor)
            val_loss = criterion(val_outputs, y_test_tensor)
            val_losses.append(val_loss.item())
            
            # 计算准确率
            _, predicted = torch.max(val_outputs, 1)
            accuracy = accuracy_score(y_test_tensor.cpu(), predicted.cpu())
        
        # 早停检查
        early_stopping(val_loss.item(), model)
        
        # 打印训练信息
        if (epoch + 1) % 10 == 0:
            print(f'Epoch [{epoch+1}/{num_epochs}], '
                  f'Train Loss: {avg_train_loss:.4f}, '
                  f'Val Loss: {val_loss.item():.4f}, '
                  f'Accuracy: {accuracy:.4f}')
        
        if early_stopping.early_stop:
            print("早停触发!")
            break
    
    training_time = time.time() - start_time
    print(f'Training time: {training_time:.2f} seconds')
    
    # 保存最终模型
    torch.save(model.state_dict(), model_path)
    print(f"模型已保存到 {model_path}")
    
    # 可视化损失曲线
    plt.figure(figsize=(12, 4))
    
    plt.subplot(1, 2, 1)
    plt.plot(losses, label='Training Loss')
    plt.plot(val_losses, label='Validation Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.title('Training and Validation Loss')
    plt.legend()
    plt.grid(True)
    
    plt.subplot(1, 2, 2)
    plt.plot(losses)
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.title('Training Loss over Epochs')
    plt.grid(True)
    
    plt.tight_layout()
    plt.show()
python 复制代码
# ==================== 加载权重后继续训练50轮 ====================
print("\n" + "="*50)
print("开始继续训练50轮...")
print("="*50)

# 确保模型在正确的设备上
model = MLP(input_dim=input_size).to(device)

# 如果存在最佳模型,加载最佳模型权重
if os.path.exists(best_model_path):
    print(f"加载最佳模型权重: {best_model_path}")
    model.load_state_dict(torch.load(best_model_path, map_location=device))
elif os.path.exists(model_path):
    print(f"加载最终模型权重: {model_path}")
    model.load_state_dict(torch.load(model_path, map_location=device))

# 继续训练的优化器和损失函数
optimizer = optim.Adam(model.parameters(), lr=0.0005)  # 使用更小的学习率继续训练
criterion = nn.CrossEntropyLoss()

# 早停策略
continue_early_stopping = EarlyStopping(patience=10, min_delta=0.0005, save_path='continue_best_model.pth')

# 继续训练
continue_epochs = 50
continue_losses = []
continue_val_losses = []

start_time = time.time()

for epoch in range(continue_epochs):
    # 训练模式
    model.train()
    train_loss = 0
    
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        
        # 前向传播
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        
        # 反向传播和优化
        loss.backward()
        optimizer.step()
        
        train_loss += loss.item()
    
    avg_train_loss = train_loss / len(train_loader)
    continue_losses.append(avg_train_loss)
    
    # 验证模式
    model.eval()
    with torch.no_grad():
        val_outputs = model(X_test_tensor)
        val_loss = criterion(val_outputs, y_test_tensor)
        continue_val_losses.append(val_loss.item())
        
        # 计算准确率
        _, predicted = torch.max(val_outputs, 1)
        accuracy = accuracy_score(y_test_tensor.cpu().numpy(), predicted.cpu().numpy())
    
    # 早停检查
    continue_early_stopping(val_loss.item(), model)
    
    # 打印训练信息
    print(f'Continue Epoch [{epoch+1}/{continue_epochs}], '
          f'Train Loss: {avg_train_loss:.4f}, '
          f'Val Loss: {val_loss.item():.4f}, '
          f'Accuracy: {accuracy:.4f}')
    
    if continue_early_stopping.early_stop:
        print("继续训练早停触发!")
        break

continue_training_time = time.time() - start_time
print(f'继续训练时间: {continue_training_time:.2f} seconds')

# 保存继续训练后的模型
torch.save(model.state_dict(), 'final_continue_model.pth')
print("继续训练完成,模型已保存为 'final_continue_model.pth'")

# 可视化继续训练的损失曲线
plt.figure(figsize=(12, 4))

plt.subplot(1, 2, 1)
plt.plot(continue_losses, label='Continue Training Loss')
plt.plot(continue_val_losses, label='Continue Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Continue Training Loss')
plt.legend()
plt.grid(True)

plt.subplot(1, 2, 2)
epoch_range = range(len(continue_losses))
plt.plot(epoch_range, continue_losses, 'b-', label='Train Loss')
plt.plot(epoch_range, continue_val_losses, 'r-', label='Val Loss')
plt.fill_between(epoch_range, continue_losses, continue_val_losses, alpha=0.2)
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Continue Training Overview')
plt.legend()
plt.grid(True)

plt.tight_layout()
plt.show()

# ==================== 最终测试评估 ====================
print("\n" + "="*50)
print("在测试集上评估模型")
print("="*50)

# 确保模型处于评估模式
model.eval()

# 在测试集上进行预测
with torch.no_grad():
    # 注意:这里应该使用 X_test_tensor,而不是 X_test
    outputs = model(X_test_tensor)
    
    # 获取预测类别
    _, predicted = torch.max(outputs, 1)
    
    # 将张量转换为numpy数组进行比较
    y_pred = predicted.cpu().numpy()
    y_true = y_test_tensor.cpu().numpy()
    
    # 计算准确率
    correct = (y_pred == y_true).sum()
    total = len(y_true)
    accuracy = correct / total
    
    print(f'测试集准确率: {accuracy * 100:.2f}%')
    print(f'正确预测数: {correct}/{total}')
    
    # 计算更详细的评估指标
    from sklearn.metrics import classification_report, confusion_matrix
    
    print("\n分类报告:")
    print(classification_report(y_true, y_pred, target_names=['Non-Default', 'Default']))
    
    # 绘制混淆矩阵
    cm = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
                xticklabels=['Non-Default', 'Default'], 
                yticklabels=['Non-Default', 'Default'])
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.title('Confusion Matrix on Test Set')
    plt.show()

# 清理GPU缓存
if torch.cuda.is_available():
    torch.cuda.empty_cache()

==================================================

在测试集上评估模型

==================================================

测试集准确率: 76.87%

正确预测数: 1153/1500

分类报告:

precision recall f1-score support

Non-Default 0.77 0.97 0.86 1059

Default 0.79 0.29 0.43 441

accuracy 0.77 1500

macro avg 0.78 0.63 0.64 1500

weighted avg 0.77 0.77 0.73 1500

相关推荐
gihigo19983 分钟前
基于粒子滤波的三维雷达目标跟踪方案
人工智能·计算机视觉·目标跟踪
3DVisionary9 分钟前
磁性轴承尺寸如何精准检测?蓝光扫描仪全尺寸3D检测解析
人工智能·3d·质量控制·蓝光三维扫描仪·非接触三维测量·磁性轴承·全尺寸形位公差
IT策士11 分钟前
AI驱动的业务PPT智能生成:DeepSeek × Skills × MCP × 知识库
人工智能·powerpoint
J&A~ing12 分钟前
第一章 opencv 的 Windows源码在 Visual Studio 下的编译安装
人工智能·windows·opencv·计算机视觉·visual studio
爱睡懒觉的焦糖玛奇朵14 分钟前
【从视频到数据集:焦糖玛奇朵的魔法工具Video To YOLO Dataset】
人工智能·python·学习·yolo·音视频
计算机魔术师21 分钟前
【AI面试八股文 Vol.3.4:训练微调部署选型】从预训练到量化部署:LLM 工程落地如何做模型选择
人工智能·后端·面试·架构·moe·vol.3.3·vol.3.4
叶子Talk24 分钟前
OpenAI破解80年数学猜想,AI首次做出原创证明
人工智能·数学·算法·机器学习·ai·openai·ai推理
刘一说25 分钟前
AI科技热点日报 | 2026年5月22日
数据库·人工智能·科技
Binary_ey28 分钟前
红外系统杂散光难管控?OAS精准助力高质量成像
人工智能
2501_9400417429 分钟前
探索非主流游戏机制的AI生成指南
人工智能·游戏