Day43 PythonStudy

@浙大疏锦行

python 复制代码
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import time
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler

warnings.filterwarnings("ignore")

# 设置中文字体
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

# 检查GPU是否可用
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# 读取数据
data = pd.read_csv('E:\PyStudy\data.csv')

# 预处理代码(保持不变)
discrete_features = data.select_dtypes(include=['object']).columns.tolist()

# Home Ownership 标签编码
home_ownership_mapping = {
    'Own Home': 1,
    'Rent': 2,
    'Have Mortgage': 3,
    'Home Mortgage': 4
}
data['Home Ownership'] = data['Home Ownership'].map(home_ownership_mapping)

# Years in current job 标签编码
years_in_job_mapping = {
    '< 1 year': 1,
    '1 year': 2,
    '2 years': 3,
    '3 years': 4,
    '4 years': 5,
    '5 years': 6,
    '6 years': 7,
    '7 years': 8,
    '8 years': 9,
    '9 years': 10,
    '10+ years': 11
}
data['Years in current job'] = data['Years in current job'].map(years_in_job_mapping)

# Purpose 独热编码
data = pd.get_dummies(data, columns=['Purpose'])
data2 = pd.read_csv("data.csv")
list_final = []
for i in data.columns:
    if i not in data2.columns:
        list_final.append(i)
for i in list_final:
    data[i] = data[i].astype(int)

# Term 0 - 1 映射
term_mapping = {
    'Short Term': 0,
    'Long Term': 1
}
data['Term'] = data['Term'].map(term_mapping)
data.rename(columns={'Term': 'Long Term'}, inplace=True)

# 连续特征用中位数补全
continuous_features = data.select_dtypes(include=['int64', 'float64']).columns.tolist()
for feature in continuous_features:
    mode_value = data[feature].mode()[0]
    data[feature].fillna(mode_value, inplace=True)

# 划分数据集
X = data.drop(['Credit Default'], axis=1)
y = data['Credit Default']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 数据标准化(对神经网络很重要)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 转换为PyTorch张量并移动到GPU
X_train_tensor = torch.FloatTensor(X_train_scaled).to(device)
y_train_tensor = torch.LongTensor(y_train.values).to(device)
X_test_tensor = torch.FloatTensor(X_test_scaled).to(device)
y_test_tensor = torch.LongTensor(y_test.values).to(device)

# 创建数据集和数据加载器(可选)
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

input_size = X_train_scaled.shape[1]

class MLP(nn.Module): 
    def __init__(self, input_dim, hidden_dim=128, output_dim=2):  # 二分类输出2
        super(MLP, self).__init__()
        
        # 动态设置输入维度
        self.fc1 = nn.Linear(input_dim, hidden_dim)  # 输入层到隐藏层
        self.bn1 = nn.BatchNorm1d(hidden_dim)  # 批归一化
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.3)  # Dropout防止过拟合
        
        self.fc2 = nn.Linear(hidden_dim, hidden_dim // 2)
        self.bn2 = nn.BatchNorm1d(hidden_dim // 2)
        
        self.fc3 = nn.Linear(hidden_dim // 2, hidden_dim // 4)
        
        self.fc4 = nn.Linear(hidden_dim // 4, output_dim)  # 输出层

    def forward(self, x):
        out = self.fc1(x)
        out = self.bn1(out)
        out = self.relu(out)
        out = self.dropout(out)
        
        out = self.fc2(out)
        out = self.bn2(out)
        out = self.relu(out)
        out = self.dropout(out)
        
        out = self.fc3(out)
        out = self.relu(out)
        
        out = self.fc4(out)
        return out

# 定义早停类
class EarlyStopping:
    def __init__(self, patience=10, min_delta=0, save_path='best_model.pth'):
        """
        Args:
            patience: 容忍多少个epoch没有改善
            min_delta: 最小改善量
            save_path: 最佳模型保存路径
        """
        self.patience = patience
        self.min_delta = min_delta
        self.save_path = save_path
        self.counter = 0
        self.best_loss = None
        self.early_stop = False
        
    def __call__(self, val_loss, model):
        if self.best_loss is None:
            self.best_loss = val_loss
            self.save_checkpoint(model)
        elif val_loss > self.best_loss - self.min_delta:
            self.counter += 1
            print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_loss = val_loss
            self.save_checkpoint(model)
            self.counter = 0
            
    def save_checkpoint(self, model):
        """保存最佳模型"""
        torch.save(model.state_dict(), self.save_path)
        print(f'Validation loss decreased. Saving model to {self.save_path}')

# 模型路径
model_path = 'credit_model.pth'
best_model_path = 'best_credit_model.pth'

# 检查是否有已保存的模型
if os.path.exists(model_path):
    print("加载已保存的模型权重...")
    model = MLP(input_dim=input_size).to(device)
    model.load_state_dict(torch.load(model_path))
    print("模型权重加载成功!")
else:
    print("训练新模型...")
    # 实例化模型并移动到GPU
    model = MLP(input_dim=input_size).to(device)
    
    # 分类问题使用交叉熵损失函数
    criterion = nn.CrossEntropyLoss()
    
    # 使用Adam优化器(通常比SGD更好)
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    
    # 初始化早停
    early_stopping = EarlyStopping(patience=15, min_delta=0.001, save_path=best_model_path)
    
    # 训练模型
    num_epochs = 20000
    losses = []
    val_losses = []
    
    start_time = time.time()
    
    for epoch in range(num_epochs):
        # 训练模式
        model.train()
        train_loss = 0
        
        # 使用数据加载器进行批处理
        for batch_X, batch_y in train_loader:
            optimizer.zero_grad()
            
            # 前向传播
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            
            # 反向传播和优化
            loss.backward()
            optimizer.step()
            
            train_loss += loss.item()
        
        avg_train_loss = train_loss / len(train_loader)
        losses.append(avg_train_loss)
        
        # 验证模式
        model.eval()
        with torch.no_grad():
            val_outputs = model(X_test_tensor)
            val_loss = criterion(val_outputs, y_test_tensor)
            val_losses.append(val_loss.item())
            
            # 计算准确率
            _, predicted = torch.max(val_outputs, 1)
            accuracy = accuracy_score(y_test_tensor.cpu(), predicted.cpu())
        
        # 早停检查
        early_stopping(val_loss.item(), model)
        
        # 打印训练信息
        if (epoch + 1) % 10 == 0:
            print(f'Epoch [{epoch+1}/{num_epochs}], '
                  f'Train Loss: {avg_train_loss:.4f}, '
                  f'Val Loss: {val_loss.item():.4f}, '
                  f'Accuracy: {accuracy:.4f}')
        
        if early_stopping.early_stop:
            print("早停触发!")
            break
    
    training_time = time.time() - start_time
    print(f'Training time: {training_time:.2f} seconds')
    
    # 保存最终模型
    torch.save(model.state_dict(), model_path)
    print(f"模型已保存到 {model_path}")
    
    # 可视化损失曲线
    plt.figure(figsize=(12, 4))
    
    plt.subplot(1, 2, 1)
    plt.plot(losses, label='Training Loss')
    plt.plot(val_losses, label='Validation Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.title('Training and Validation Loss')
    plt.legend()
    plt.grid(True)
    
    plt.subplot(1, 2, 2)
    plt.plot(losses)
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.title('Training Loss over Epochs')
    plt.grid(True)
    
    plt.tight_layout()
    plt.show()
python 复制代码
# ==================== 加载权重后继续训练50轮 ====================
print("\n" + "="*50)
print("开始继续训练50轮...")
print("="*50)

# 确保模型在正确的设备上
model = MLP(input_dim=input_size).to(device)

# 如果存在最佳模型,加载最佳模型权重
if os.path.exists(best_model_path):
    print(f"加载最佳模型权重: {best_model_path}")
    model.load_state_dict(torch.load(best_model_path, map_location=device))
elif os.path.exists(model_path):
    print(f"加载最终模型权重: {model_path}")
    model.load_state_dict(torch.load(model_path, map_location=device))

# 继续训练的优化器和损失函数
optimizer = optim.Adam(model.parameters(), lr=0.0005)  # 使用更小的学习率继续训练
criterion = nn.CrossEntropyLoss()

# 早停策略
continue_early_stopping = EarlyStopping(patience=10, min_delta=0.0005, save_path='continue_best_model.pth')

# 继续训练
continue_epochs = 50
continue_losses = []
continue_val_losses = []

start_time = time.time()

for epoch in range(continue_epochs):
    # 训练模式
    model.train()
    train_loss = 0
    
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        
        # 前向传播
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        
        # 反向传播和优化
        loss.backward()
        optimizer.step()
        
        train_loss += loss.item()
    
    avg_train_loss = train_loss / len(train_loader)
    continue_losses.append(avg_train_loss)
    
    # 验证模式
    model.eval()
    with torch.no_grad():
        val_outputs = model(X_test_tensor)
        val_loss = criterion(val_outputs, y_test_tensor)
        continue_val_losses.append(val_loss.item())
        
        # 计算准确率
        _, predicted = torch.max(val_outputs, 1)
        accuracy = accuracy_score(y_test_tensor.cpu().numpy(), predicted.cpu().numpy())
    
    # 早停检查
    continue_early_stopping(val_loss.item(), model)
    
    # 打印训练信息
    print(f'Continue Epoch [{epoch+1}/{continue_epochs}], '
          f'Train Loss: {avg_train_loss:.4f}, '
          f'Val Loss: {val_loss.item():.4f}, '
          f'Accuracy: {accuracy:.4f}')
    
    if continue_early_stopping.early_stop:
        print("继续训练早停触发!")
        break

continue_training_time = time.time() - start_time
print(f'继续训练时间: {continue_training_time:.2f} seconds')

# 保存继续训练后的模型
torch.save(model.state_dict(), 'final_continue_model.pth')
print("继续训练完成,模型已保存为 'final_continue_model.pth'")

# 可视化继续训练的损失曲线
plt.figure(figsize=(12, 4))

plt.subplot(1, 2, 1)
plt.plot(continue_losses, label='Continue Training Loss')
plt.plot(continue_val_losses, label='Continue Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Continue Training Loss')
plt.legend()
plt.grid(True)

plt.subplot(1, 2, 2)
epoch_range = range(len(continue_losses))
plt.plot(epoch_range, continue_losses, 'b-', label='Train Loss')
plt.plot(epoch_range, continue_val_losses, 'r-', label='Val Loss')
plt.fill_between(epoch_range, continue_losses, continue_val_losses, alpha=0.2)
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Continue Training Overview')
plt.legend()
plt.grid(True)

plt.tight_layout()
plt.show()

# ==================== 最终测试评估 ====================
print("\n" + "="*50)
print("在测试集上评估模型")
print("="*50)

# 确保模型处于评估模式
model.eval()

# 在测试集上进行预测
with torch.no_grad():
    # 注意:这里应该使用 X_test_tensor,而不是 X_test
    outputs = model(X_test_tensor)
    
    # 获取预测类别
    _, predicted = torch.max(outputs, 1)
    
    # 将张量转换为numpy数组进行比较
    y_pred = predicted.cpu().numpy()
    y_true = y_test_tensor.cpu().numpy()
    
    # 计算准确率
    correct = (y_pred == y_true).sum()
    total = len(y_true)
    accuracy = correct / total
    
    print(f'测试集准确率: {accuracy * 100:.2f}%')
    print(f'正确预测数: {correct}/{total}')
    
    # 计算更详细的评估指标
    from sklearn.metrics import classification_report, confusion_matrix
    
    print("\n分类报告:")
    print(classification_report(y_true, y_pred, target_names=['Non-Default', 'Default']))
    
    # 绘制混淆矩阵
    cm = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
                xticklabels=['Non-Default', 'Default'], 
                yticklabels=['Non-Default', 'Default'])
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.title('Confusion Matrix on Test Set')
    plt.show()

# 清理GPU缓存
if torch.cuda.is_available():
    torch.cuda.empty_cache()

==================================================

在测试集上评估模型

==================================================

测试集准确率: 76.87%

正确预测数: 1153/1500

分类报告:

precision recall f1-score support

Non-Default 0.77 0.97 0.86 1059

Default 0.79 0.29 0.43 441

accuracy 0.77 1500

macro avg 0.78 0.63 0.64 1500

weighted avg 0.77 0.77 0.73 1500

相关推荐
撬动未来的支点3 分钟前
【AI】光速理解YOLO框架
人工智能·yolo·计算机视觉
电商API_180079052474 分钟前
批量获取电商商品数据的主流技术方法全解析
大数据·数据库·人工智能·数据分析·网络爬虫
学境思源AcademicIdeas12 分钟前
我在手机上部署了一个AI大模型,用它写完了论文初稿【附提示词】
人工智能·智能手机
week_泽15 分钟前
第1课:AI Agent是什么 - 学习笔记_1
人工智能·笔记·学习
kebijuelun16 分钟前
REAP the Experts:去掉 MoE 一半专家还能保持性能不变
人工智能·gpt·深度学习·语言模型·transformer
医工交叉实验工坊18 分钟前
从零详解WGCNA分析
人工智能·机器学习
百万彩票中奖候选人28 分钟前
在trae、qoder、Claude Code、Cursor等AI IDE中使用ui-ux-pro-max-skill
人工智能·ui·ux
Bigemap36 分钟前
如何保存WAsp软件需要的 map格式(.map)的文件
人工智能·无人机·政务·bigemappro添加地图
闲云一鹤39 分钟前
2026 最新 ComfyUI 教程 - 本地部署 AI 生图模型 - Z-Image-Turbo
前端·人工智能·ai编程
幻云201041 分钟前
Python深度学习:从入门到实战
人工智能·python