Day43 PythonStudy

@浙大疏锦行

python 复制代码
import math
import os
import time
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from torch.utils.data import DataLoader, TensorDataset

# Silence every warning category for the remainder of the script.
warnings.filterwarnings("ignore")

# Render CJK text with the SimHei font and draw minus signs with the ASCII
# hyphen instead of the Unicode minus glyph.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})

# Run on CUDA when it is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {device}")

# Load the raw data set. The raw-string literal keeps the Windows backslashes
# literal instead of depending on "\P" and "\d" being unrecognised escape
# sequences (which newer Python versions warn about).
data_path = r'E:\PyStudy\data.csv'
data = pd.read_csv(data_path)

# Preprocessing
# Names of the text-valued (object dtype) columns in the raw data.
discrete_features = data.select_dtypes(include=['object']).columns.tolist()

# Home Ownership: label encoding
home_ownership_mapping = {
    'Own Home': 1,
    'Rent': 2,
    'Have Mortgage': 3,
    'Home Mortgage': 4
}
data['Home Ownership'] = data['Home Ownership'].map(home_ownership_mapping)

# Years in current job: ordinal label encoding (longer tenure -> larger code)
years_in_job_mapping = {
    '< 1 year': 1,
    '1 year': 2,
    '2 years': 3,
    '3 years': 4,
    '4 years': 5,
    '5 years': 6,
    '6 years': 7,
    '7 years': 8,
    '8 years': 9,
    '9 years': 10,
    '10+ years': 11
}
data['Years in current job'] = data['Years in current job'].map(years_in_job_mapping)

# Purpose: one-hot encoding
data = pd.get_dummies(data, columns=['Purpose'])
# Re-read the untouched file (same path as above) so the columns created by
# get_dummies can be identified as "present now, absent in the raw data".
data2 = pd.read_csv(data_path)
list_final = [column for column in data.columns if column not in data2.columns]
# get_dummies may produce boolean columns; the model needs plain integers.
for column in list_final:
    data[column] = data[column].astype(int)

# Term: map to 0/1 and rename the column to say what the 1 stands for
term_mapping = {
    'Short Term': 0,
    'Long Term': 1
}
data['Term'] = data['Term'].map(term_mapping)
data.rename(columns={'Term': 'Long Term'}, inplace=True)

# Fill missing values in every int64/float64 column with that column's mode
# (its most frequent value).
# NOTE(review): the previous comment said "median", but the code has always
# used the mode; the behaviour is kept as-is -- confirm which one is intended.
continuous_features = data.select_dtypes(include=['int64', 'float64']).columns.tolist()
for feature in continuous_features:
    mode_value = data[feature].mode()[0]
    # Assign the result back: an in-place fillna on data[feature] operates on
    # a temporary under pandas Copy-on-Write and would leave `data` unchanged.
    data[feature] = data[feature].fillna(mode_value)

# Separate the target column from the features and hold out 20% for testing.
y = data['Credit Default']
X = data.drop(columns=['Credit Default'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardise the features. The scaler is fitted on the training split only
# and the same statistics are then applied to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Build float32 feature tensors and int64 label tensors on the chosen device.
X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32).to(device)
X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32).to(device)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.int64).to(device)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.int64).to(device)

# Mini-batch loader over the training tensors (reshuffled every epoch).
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(dataset=train_dataset, batch_size=64, shuffle=True)

# Number of input features seen by the network.
input_size = X_train_scaled.shape[1]

class MLP(nn.Module):
    """Fully connected classifier with three hidden layers.

    Layer widths are ``hidden_dim``, ``hidden_dim // 2`` and
    ``hidden_dim // 4``. The first two hidden layers are followed by batch
    normalisation, ReLU and dropout (p=0.3); the third only by ReLU. The
    output layer returns raw logits.

    Args:
        input_dim: Number of input features.
        hidden_dim: Width of the first hidden layer.
        output_dim: Number of output logits (2 for binary classification).
    """

    def __init__(self, input_dim, hidden_dim=128, output_dim=2):
        super().__init__()
        half_dim = hidden_dim // 2
        quarter_dim = hidden_dim // 4

        # First hidden stage: linear -> batch norm.
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.bn1 = nn.BatchNorm1d(hidden_dim)

        # Activation and dropout modules shared by the hidden stages.
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.3)

        # Second hidden stage: linear -> batch norm.
        self.fc2 = nn.Linear(hidden_dim, half_dim)
        self.bn2 = nn.BatchNorm1d(half_dim)

        # Third hidden stage (no normalisation, no dropout).
        self.fc3 = nn.Linear(half_dim, quarter_dim)

        # Output layer producing the class logits.
        self.fc4 = nn.Linear(quarter_dim, output_dim)

    def forward(self, x):
        """Return the logits for a batch ``x`` of input feature vectors."""
        hidden = self.dropout(self.relu(self.bn1(self.fc1(x))))
        hidden = self.dropout(self.relu(self.bn2(self.fc2(hidden))))
        hidden = self.relu(self.fc3(hidden))
        return self.fc4(hidden)

# Early-stopping helper
class EarlyStopping:
    """Track the validation loss and signal when training should stop.

    Each call compares the new validation loss with the best one seen so far.
    A loss that is lower than the best by at least ``min_delta`` counts as an
    improvement: the model's ``state_dict`` is saved to ``save_path`` and the
    patience counter is reset. Anything else (including a NaN loss)
    increments the counter; once it reaches ``patience`` the ``early_stop``
    flag is set.
    """

    def __init__(self, patience=10, min_delta=0, save_path='best_model.pth'):
        """
        Args:
            patience: Number of consecutive non-improving calls tolerated.
            min_delta: Minimum decrease of the loss that counts as improvement.
            save_path: File the best model's state_dict is written to.
        """
        self.patience = patience
        self.min_delta = min_delta
        self.save_path = save_path
        self.counter = 0
        self.best_loss = None
        self.early_stop = False

    def __call__(self, val_loss, model):
        """Record ``val_loss`` and checkpoint ``model`` when it improved."""
        # A NaN loss compares False against everything, so without this guard
        # it would be mistaken for an improvement, overwrite best_loss with
        # NaN and checkpoint a diverged model.
        if math.isnan(val_loss):
            self._register_no_improvement()
            return

        first_valid_loss = self.best_loss is None
        if first_valid_loss or val_loss <= self.best_loss - self.min_delta:
            self.best_loss = val_loss
            self.save_checkpoint(model)
            self.counter = 0
        else:
            self._register_no_improvement()

    def _register_no_improvement(self):
        """Count one non-improving call and raise the stop flag if needed."""
        self.counter += 1
        print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
        if self.counter >= self.patience:
            self.early_stop = True

    def save_checkpoint(self, model):
        """Save the model's state_dict to ``save_path``."""
        torch.save(model.state_dict(), self.save_path)
        print(f'Validation loss decreased. Saving model to {self.save_path}')

# Checkpoint locations: the last-epoch weights and the best weights selected
# by early stopping.
model_path = 'credit_model.pth'
best_model_path = 'best_credit_model.pth'

# Reuse previously saved weights when they exist; otherwise train from scratch.
if os.path.exists(model_path):
    print("加载已保存的模型权重...")
    model = MLP(input_dim=input_size).to(device)
    # map_location lets a checkpoint written on a GPU machine be loaded on a
    # CPU-only machine (and vice versa).
    model.load_state_dict(torch.load(model_path, map_location=device))
    print("模型权重加载成功!")
else:
    print("训练新模型...")
    # Instantiate the model on the selected device.
    model = MLP(input_dim=input_size).to(device)

    # Cross-entropy loss for classification.
    criterion = nn.CrossEntropyLoss()

    # Adam optimiser.
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    # Early stopping writes the best weights to best_model_path.
    early_stopping = EarlyStopping(patience=15, min_delta=0.001, save_path=best_model_path)

    # Upper bound on epochs; early stopping is expected to end training sooner.
    num_epochs = 20000
    losses = []
    val_losses = []

    start_time = time.time()

    for epoch in range(num_epochs):
        # ---- training pass over all mini-batches ----
        model.train()
        train_loss = 0

        for batch_X, batch_y in train_loader:
            optimizer.zero_grad()

            # Forward pass
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)

            # Backward pass and parameter update
            loss.backward()
            optimizer.step()

            train_loss += loss.item()

        # Mean of the per-batch losses for this epoch.
        avg_train_loss = train_loss / len(train_loader)
        losses.append(avg_train_loss)

        # ---- validation pass ----
        # NOTE(review): the held-out test split doubles as the validation set,
        # so early stopping is tuned on the data later used for the final
        # report -- consider carving out a separate validation split.
        model.eval()
        with torch.no_grad():
            val_outputs = model(X_test_tensor)
            val_loss = criterion(val_outputs, y_test_tensor)
            val_losses.append(val_loss.item())

            # Accuracy of the arg-max class prediction.
            _, predicted = torch.max(val_outputs, 1)
            accuracy = accuracy_score(y_test_tensor.cpu().numpy(), predicted.cpu().numpy())

        # Update the early-stopping state (saves the best checkpoint).
        early_stopping(val_loss.item(), model)

        # Report progress every 10 epochs.
        if (epoch + 1) % 10 == 0:
            print(f'Epoch [{epoch+1}/{num_epochs}], '
                  f'Train Loss: {avg_train_loss:.4f}, '
                  f'Val Loss: {val_loss.item():.4f}, '
                  f'Accuracy: {accuracy:.4f}')

        if early_stopping.early_stop:
            print("早停触发!")
            break

    training_time = time.time() - start_time
    print(f'Training time: {training_time:.2f} seconds')

    # Save the last-epoch weights (the best ones are already stored in
    # best_model_path by the early-stopping helper).
    torch.save(model.state_dict(), model_path)
    print(f"模型已保存到 {model_path}")

    # Plot the loss curves.
    plt.figure(figsize=(12, 4))

    # Left panel: training and validation loss together.
    plt.subplot(1, 2, 1)
    plt.plot(losses, label='Training Loss')
    plt.plot(val_losses, label='Validation Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.title('Training and Validation Loss')
    plt.legend()
    plt.grid(True)

    # Right panel: training loss only.
    plt.subplot(1, 2, 2)
    plt.plot(losses)
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.title('Training Loss over Epochs')
    plt.grid(True)

    plt.tight_layout()
    plt.show()
python 复制代码
# ==================== Load weights, then train 50 more epochs ====================
print("\n" + "="*50)
print("开始继续训练50轮...")
print("="*50)

# Start from a fresh model instance on the selected device.
model = MLP(input_dim=input_size).to(device)

# Prefer the best checkpoint, fall back to the last-epoch checkpoint.
if os.path.exists(best_model_path):
    print(f"加载最佳模型权重: {best_model_path}")
    model.load_state_dict(torch.load(best_model_path, map_location=device))
elif os.path.exists(model_path):
    print(f"加载最终模型权重: {model_path}")
    model.load_state_dict(torch.load(model_path, map_location=device))
else:
    # Make the fallback visible: without a checkpoint this phase is not a
    # continuation but a fresh run from randomly initialised weights.
    print("未找到已保存的模型权重, 将从随机初始化的模型开始训练")

# Optimiser (with a smaller learning rate than the first phase) and loss.
optimizer = optim.Adam(model.parameters(), lr=0.0005)
criterion = nn.CrossEntropyLoss()

# Early stopping for this phase, with its own checkpoint file.
continue_early_stopping = EarlyStopping(patience=10, min_delta=0.0005, save_path='continue_best_model.pth')

# Continued-training settings and loss history.
continue_epochs = 50
continue_losses = []
continue_val_losses = []

start_time = time.time()

for epoch in range(continue_epochs):
    # ---- training pass over all mini-batches ----
    model.train()
    train_loss = 0

    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()

        # Forward pass
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)

        # Backward pass and parameter update
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    # Mean of the per-batch losses for this epoch.
    avg_train_loss = train_loss / len(train_loader)
    continue_losses.append(avg_train_loss)

    # ---- validation pass ----
    # NOTE(review): the test split is used as the validation set here, so the
    # final test metrics are not fully independent of early stopping.
    model.eval()
    with torch.no_grad():
        val_outputs = model(X_test_tensor)
        val_loss = criterion(val_outputs, y_test_tensor)
        continue_val_losses.append(val_loss.item())

        # Accuracy of the arg-max class prediction.
        _, predicted = torch.max(val_outputs, 1)
        accuracy = accuracy_score(y_test_tensor.cpu().numpy(), predicted.cpu().numpy())

    # Update the early-stopping state (saves the best checkpoint).
    continue_early_stopping(val_loss.item(), model)

    # Report progress every epoch.
    print(f'Continue Epoch [{epoch+1}/{continue_epochs}], '
          f'Train Loss: {avg_train_loss:.4f}, '
          f'Val Loss: {val_loss.item():.4f}, '
          f'Accuracy: {accuracy:.4f}')

    if continue_early_stopping.early_stop:
        print("继续训练早停触发!")
        break

continue_training_time = time.time() - start_time
print(f'继续训练时间: {continue_training_time:.2f} seconds')

# Save the last-epoch weights of the continued run.
torch.save(model.state_dict(), 'final_continue_model.pth')
print("继续训练完成,模型已保存为 'final_continue_model.pth'")

# Plot the loss curves recorded during the continued-training phase.
fig, (curve_ax, overview_ax) = plt.subplots(1, 2, figsize=(12, 4))

# Left panel: training and validation loss per epoch.
curve_ax.plot(continue_losses, label='Continue Training Loss')
curve_ax.plot(continue_val_losses, label='Continue Validation Loss')
curve_ax.set_xlabel('Epoch')
curve_ax.set_ylabel('Loss')
curve_ax.set_title('Continue Training Loss')
curve_ax.legend()
curve_ax.grid(True)

# Right panel: the same two curves with the gap between them shaded.
epoch_range = range(len(continue_losses))
overview_ax.plot(epoch_range, continue_losses, 'b-', label='Train Loss')
overview_ax.plot(epoch_range, continue_val_losses, 'r-', label='Val Loss')
overview_ax.fill_between(epoch_range, continue_losses, continue_val_losses, alpha=0.2)
overview_ax.set_xlabel('Epoch')
overview_ax.set_ylabel('Loss')
overview_ax.set_title('Continue Training Overview')
overview_ax.legend()
overview_ax.grid(True)

plt.tight_layout()
plt.show()

# ==================== Final evaluation on the test set ====================
print("\n" + "="*50)
print("在测试集上评估模型")
print("="*50)

# Evaluation mode: dropout is disabled and batch norm uses running statistics.
model.eval()

# Only the forward pass needs to run without gradient tracking.
with torch.no_grad():
    # Feed the scaled tensor (X_test_tensor), not the raw X_test frame.
    outputs = model(X_test_tensor)

# Predicted class = index of the largest logit.
_, predicted = torch.max(outputs, 1)

# Move predictions and labels to numpy for comparison and for sklearn.
y_pred = predicted.cpu().numpy()
y_true = y_test_tensor.cpu().numpy()

# Plain accuracy.
total = len(y_true)
correct = (y_pred == y_true).sum()
accuracy = correct / total

print(f'测试集准确率: {accuracy * 100:.2f}%')
print(f'正确预测数: {correct}/{total}')

# Per-class precision / recall / F1.
class_names = ['Non-Default', 'Default']
print("\n分类报告:")
print(classification_report(y_true, y_pred, target_names=class_names))

# Confusion matrix heat map.
cm = confusion_matrix(y_true, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names,
            yticklabels=class_names)
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix on Test Set')
plt.show()

# Release cached GPU memory.
if torch.cuda.is_available():
    torch.cuda.empty_cache()

==================================================

在测试集上评估模型

==================================================

测试集准确率: 76.87%

正确预测数: 1153/1500

分类报告:

              precision    recall  f1-score   support

 Non-Default       0.77      0.97      0.86      1059
     Default       0.79      0.29      0.43       441

    accuracy                           0.77      1500
   macro avg       0.78      0.63      0.64      1500
weighted avg       0.77      0.77      0.73      1500

相关推荐
星河队长4 小时前
人工智能的自我认知
人工智能
无人装备硬件开发爱好者4 小时前
AI 赋能航天造物:LEAP71 式火箭发动机计算工程软件开发全解析 1
人工智能·商业火箭发动机·增材加工·leap71
数智联AI团队4 小时前
AI搜索引领行业变革:2023年GEO优化服务市场深度洞察与专业机构选择指南
人工智能
PaperRed ai写作降重助手4 小时前
主流 AI 论文写作工具排名(2026 最新)
人工智能·aigc·ai写作·论文写作·论文降重·论文查重·辅助写作
翱翔的苍鹰4 小时前
一个简单的法律问答机器人实现思路
人工智能·深度学习·语言模型·自然语言处理
njsgcs4 小时前
我要fork openclaw了 ai自己写skill
人工智能
小W与影刀RPA4 小时前
【影刀RPA】:智能过滤敏感词,高效输出表格
大数据·人工智能·python·低代码·自动化·rpa·影刀rpa
铁蛋AI编程实战4 小时前
DeepSeek mHC 架构 + Agent 实战大模型开发指南
人工智能·架构·开源
源于花海4 小时前
迁移学习简明手册——迁移学习相关研究学者
人工智能·机器学习·迁移学习·研究学者
OPEN-Source4 小时前
开源工具轻松实现高清视频修复
人工智能·视频处理