Day 38 Early Stopping and Saving Model Weights

1. Early Stopping

While training the model, also monitor its metric on a validation set; if that metric stops improving, terminate training. This avoids wasting compute and keeps the model from continuing to fit the training data after it has stopped generalizing.
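The script below implements this check inline. As a standalone illustration of the same idea, here is a minimal helper class; it is a sketch only, and the class name, its parameters, and their defaults are not part of the original script:

python
# Minimal early-stopping helper (illustrative sketch; the full script below
# implements the same logic inline instead of using a class).
class EarlyStopping:
    def __init__(self, patience=10, delta=0.0):
        self.patience = patience        # how many checks without improvement to tolerate
        self.delta = delta              # minimum decrease that counts as an improvement
        self.best_loss = float('inf')   # best validation loss seen so far
        self.counter = 0                # checks since the last improvement
        self.should_stop = False

    def step(self, val_loss):
        # Call once per evaluation with the current validation loss.
        if val_loss < self.best_loss - self.delta:
            self.best_loss = val_loss
            self.counter = 0            # improvement: reset the counter
        else:
            self.counter += 1           # no improvement: count this check
            if self.counter >= self.patience:
                self.should_stop = True
        return self.should_stop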

python
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import time
import matplotlib.pyplot as plt
from tqdm import tqdm  # tqdm provides the training progress bar
import warnings
warnings.filterwarnings("ignore")  # suppress warning messages

# Select the GPU if available, otherwise fall back to CPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load the Iris dataset
iris = load_iris()
X = iris.data  # feature matrix
y = iris.target  # class labels

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Normalize the features to [0, 1]
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Convert to PyTorch tensors and move them to the selected device
X_train = torch.FloatTensor(X_train).to(device)
y_train = torch.LongTensor(y_train).to(device)
X_test = torch.FloatTensor(X_test).to(device)
y_test = torch.LongTensor(y_test).to(device)

class MLP(nn.Module):
    def __init__(self):
        super(MLP, self).__init__()
        self.fc1 = nn.Linear(4, 10)  # input layer -> hidden layer
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(10, 3)  # hidden layer -> output layer

    def forward(self, x):
        out = self.fc1(x)
        out = self.relu(out)
        out = self.fc2(out)
        return out

# Instantiate the model and move it to the device
model = MLP().to(device)

# Cross-entropy loss for the classification task
criterion = nn.CrossEntropyLoss()

# Stochastic gradient descent optimizer
optimizer = optim.SGD(model.parameters(), lr=0.01)

# Training settings
num_epochs = 20000  # number of training epochs

# Store the losses recorded every 200 epochs, together with the epoch numbers
train_losses = []  # training-set losses
test_losses = []   # test-set losses
epochs = []

# ===== New: early-stopping parameters =====
best_test_loss = float('inf')  # best test loss seen so far
best_epoch = 0                 # epoch at which the best test loss occurred
patience = 50                  # patience: stop after this many consecutive evaluations (one every 200 epochs) without improvement
counter = 0                    # early-stopping counter
early_stopped = False          # flag: did early stopping trigger?
# ==========================================

start_time = time.time()  # record the start time

# Create the tqdm progress bar
with tqdm(total=num_epochs, desc="Training", unit="epoch") as pbar:
    # Training loop
    for epoch in range(num_epochs):
        # Forward pass
        outputs = model(X_train)  # implicitly calls forward()
        train_loss = criterion(outputs, y_train)

        # Backward pass and parameter update
        optimizer.zero_grad()
        train_loss.backward()
        optimizer.step()

        # Record the losses and check early stopping every 200 epochs
        if (epoch + 1) % 200 == 0:
            # Compute the test-set loss
            model.eval()
            with torch.no_grad():
                test_outputs = model(X_test)
                test_loss = criterion(test_outputs, y_test)
            model.train()

            train_losses.append(train_loss.item())
            test_losses.append(test_loss.item())
            epochs.append(epoch + 1)

            # Update the progress bar's postfix with the current losses
            pbar.set_postfix({'Train Loss': f'{train_loss.item():.4f}', 'Test Loss': f'{test_loss.item():.4f}'})

            # ===== New: early-stopping logic =====
            if test_loss.item() < best_test_loss:  # current test loss beats the best so far
                best_test_loss = test_loss.item()  # update the best loss
                best_epoch = epoch + 1             # update the best epoch
                counter = 0                        # reset the counter
                # Save the best model
                torch.save(model.state_dict(), 'best_model.pth')
            else:
                counter += 1
                if counter >= patience:
                    print(f"Early stopping triggered at epoch {epoch+1}: the test loss has not improved for {patience} consecutive evaluations.")
                    print(f"Best test loss was {best_test_loss:.4f} at epoch {best_epoch}.")
                    early_stopped = True
                    break  # exit the training loop
            # =====================================

        # Advance the progress bar every 1000 epochs
        if (epoch + 1) % 1000 == 0:
            pbar.update(1000)

    # Make sure the progress bar reaches 100%
    if pbar.n < num_epochs:
        pbar.update(num_epochs - pbar.n)  # add the remaining progress

time_all = time.time() - start_time  # total training time
print(f'Training time: {time_all:.2f} seconds')

# ===== New: load the best model for the final evaluation =====
if early_stopped:
    print(f"Loading the best model from epoch {best_epoch} for the final evaluation...")
    model.load_state_dict(torch.load('best_model.pth'))
# =============================================================

# Visualize the loss curves
plt.figure(figsize=(10, 6))
plt.plot(epochs, train_losses, label='Train Loss')
plt.plot(epochs, test_losses, label='Test Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training and Test Loss over Epochs')
plt.legend()
plt.grid(True)
plt.show()

# Evaluate the model on the test set
model.eval()
with torch.no_grad():
    outputs = model(X_test)
    _, predicted = torch.max(outputs, 1)
    correct = (predicted == y_test).sum().item()
    accuracy = correct / y_test.size(0)
    print(f'Test accuracy: {accuracy * 100:.2f}%')

2. Saving Weights
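In PyTorch, the usual way to save weights is through the model's state_dict, a dictionary mapping parameter names to tensors. Below is a minimal sketch of the common save/load patterns; the toy model and file names are placeholders, not part of the assignment code.

python
import torch
import torch.nn as nn

# Toy model used only to illustrate the save/load pattern (placeholder architecture)
model = nn.Linear(4, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

# Save only the parameters (state_dict): small file, independent of the training script
torch.save(model.state_dict(), "model_weights.pth")

# Load them back into a model with the same architecture
model.load_state_dict(torch.load("model_weights.pth"))

# To resume training later, a fuller checkpoint can also carry the optimizer state
checkpoint = {
    "model_state_dict": model.state_dict(),
    "optimizer_state_dict": optimizer.state_dict(),
}
torch.save(checkpoint, "checkpoint.pth")

Saving only the state_dict (rather than the whole model object with torch.save(model, ...)) is the more robust choice, because it does not depend on pickling the class definition.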

Assignment: train on the credit dataset and save the weights; then load the weights, continue training for 50 more epochs, and apply early stopping.

python
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import time
import matplotlib.pyplot as plt
from tqdm import tqdm  # tqdm provides the training progress bar
import warnings
warnings.filterwarnings("ignore")  # suppress warning messages

# Select the GPU if available, otherwise fall back to CPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

import pandas as pd
# Load the credit dataset
data = pd.read_csv(r"E:\PythonStudy\python60-days-challenge-master\data.csv")

import numpy as np  # numerical computing with efficient array operations
# First, pick out the string-typed (categorical) columns
discrete_features = data.select_dtypes(include=['object']).columns.tolist()
# Label-encode Home Ownership
home_ownership_mapping = {
    'Own Home': 1,
    'Rent': 2,
    'Have Mortgage': 3,
    'Home Mortgage': 4
}
data['Home Ownership'] = data['Home Ownership'].map(home_ownership_mapping)

# Label-encode Years in current job
years_in_job_mapping = {
    '< 1 year': 1,
    '1 year': 2,
    '2 years': 3,
    '3 years': 4,
    '4 years': 5,
    '5 years': 6,
    '6 years': 7,
    '7 years': 8,
    '8 years': 9,
    '9 years': 10,
    '10+ years': 11
}
data['Years in current job'] = data['Years in current job'].map(years_in_job_mapping)

# One-hot encode Purpose; remember to convert the resulting bool columns to numeric
data = pd.get_dummies(data, columns=['Purpose'])
data2 = pd.read_csv(r"E:\PythonStudy\python60-days-challenge-master\data.csv") # re-read the raw data to compare column names
list_final = [] # new feature names added by one-hot encoding
for i in data.columns:
    if i not in data2.columns:
        list_final.append(i) # columns that only exist after one-hot encoding
for i in list_final:
    data[i] = data[i].astype(int) # cast the one-hot columns from bool to int



# Map Term to 0/1
term_mapping = {
    'Short Term': 0,
    'Long Term': 1
}
data['Term'] = data['Term'].map(term_mapping)
data.rename(columns={'Term': 'Long Term'}, inplace=True) # rename the column
continuous_features = data.select_dtypes(include=['int64', 'float64']).columns.tolist()  # list of numeric (continuous) column names

# Fill missing values in continuous features with the mode
for feature in continuous_features:
    mode_value = data[feature].mode()[0]            # mode of the column
    data[feature].fillna(mode_value, inplace=True)  # fill NaNs with the mode; inplace=True modifies data directly

# As noted earlier, many tuning utilities come with (sometimes mandatory) built-in cross-validation,
# so here we simply split the dataset once
from sklearn.model_selection import train_test_split
X = data.drop(['Credit Default'], axis=1)  # features; axis=1 drops a column
y = data['Credit Default']  # labels
# 70/30 split into a training set and a temporary test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Split that 30% in half into validation and test sets
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42)  # 50% validation, 50% test
# Normalization
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)
X_train = torch.tensor(X_train, dtype=torch.float32).to(device)
X_val = torch.tensor(X_val, dtype=torch.float32).to(device)
X_test = torch.tensor(X_test, dtype=torch.float32).to(device)
y_train = torch.tensor(y_train.values, dtype=torch.float32).view(-1, 1).to(device)
y_val = torch.tensor(y_val.values, dtype=torch.float32).view(-1, 1).to(device)
y_test = torch.tensor(y_test.values, dtype=torch.float32).view(-1, 1).to(device)

# Define the neural network model
# (a simple 3-layer fully connected network -- a "small wok", big enough to cook the credit-data dish)
import torch.nn as nn

class CreditModel(nn.Module):
    def __init__(self, input_dim):
        super(CreditModel, self).__init__()
        # Layers: input -> hidden 1 -> hidden 2 -> output (binary classification needs a single output)
        self.fc1 = nn.Linear(input_dim, 128)  # first layer: input_dim -> 128 neurons
        self.fc2 = nn.Linear(128, 64)         # second layer: 128 -> 64
        self.fc3 = nn.Linear(64, 1)           # output layer: 64 -> 1 (predict default or not)
        self.relu = nn.ReLU()                 # activation (lets the model learn non-linear patterns)
        self.sigmoid = nn.Sigmoid()           # squashes the output to a probability in (0, 1)

    def forward(self, x):
        # Forward pass (how data flows through the network)
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.sigmoid(self.fc3(x))
        return x

# Instantiate the model (set the wok on the stove)
model = CreditModel(X_train.shape[1]).to(device)

# Define the loss function and optimizer
criterion = nn.BCELoss()  # binary cross-entropy, suited to the default/no-default task
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)  # Adam optimizer

# Build the data loaders
from torch.utils.data import TensorDataset, DataLoader

batch_size = 32
train_dataset = TensorDataset(X_train, y_train)
val_dataset = TensorDataset(X_val, y_val)
test_dataset = TensorDataset(X_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Check the processed input dimensionality
input_dim = X_train.shape[1]
print(f"Model input dimension: {input_dim}")
python
# ---------------------- Initial training (20 epochs) ----------------------
def train_one_epoch(model, train_loader, criterion, optimizer):
    # Train for one epoch and return the average loss
    model.train()  # switch to training mode
    total_loss = 0.0
    for batch_x, batch_y in train_loader:
        optimizer.zero_grad()  # clear gradients left over from the previous step
        outputs = model(batch_x)
        loss = criterion(outputs, batch_y)
        loss.backward()  # backward pass (compute gradients)
        optimizer.step()  # update the weights
        total_loss += loss.item() * batch_x.size(0)
    avg_loss = total_loss / len(train_loader.dataset)
    return avg_loss

# Train for 20 epochs first
initial_epochs = 20
for epoch in range(initial_epochs):
    train_loss = train_one_epoch(model, train_loader, criterion, optimizer)
    if (epoch+1) % 5 == 0:  # print every 5 epochs to track progress
        print(f"Initial training epoch {epoch+1}, train loss: {train_loss:.4f}")

# ---------------------- Save the weights (to a file, so training can be resumed later) ----------------------
import os
save_dir = os.path.expanduser("E:/PythonStudy/pystudy check-in/credit_train")
if not os.path.exists(save_dir):
    os.makedirs(save_dir)  # create the folder if it does not exist

# Save the weights (state_dict stores only the parameters: small file, robust)
weight_path = os.path.join(save_dir, "credit_model_initial.pth")
torch.save(model.state_dict(), weight_path)
print(f"Initial-training weights saved to: {weight_path}")
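The assignment asks to load the saved weights before continuing. In this single script the trained model is still in memory, so training could simply continue; to genuinely resume from the saved file (for example in a fresh session), the model would be rebuilt and the state_dict loaded first. A hedged sketch, reusing the names defined above (the optimizer state was not saved, so a new optimizer is created):

python
# Sketch: resume from the saved file rather than from the in-memory model.
# map_location keeps the load working whether or not a GPU is available.
model = CreditModel(X_train.shape[1]).to(device)
model.load_state_dict(torch.load(weight_path, map_location=device))
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)  # optimizer state was not saved, so recreate it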
python
# ---------------------- Define the early-stopping strategy ----------------------
# Early-stopping parameters (today's topic: patience, minimum improvement threshold, best validation loss)
patience = 10  # stop after 10 consecutive epochs without improvement in validation loss
delta = 0.0001  # the loss must drop by at least 0.0001 to count as an improvement
best_val_loss = float('inf')  # start at infinity (lower loss is better)
counter = 0  # epochs since the last improvement
stop_training = False  # flag: should training stop?

# Validation function (how well is the model doing?)
def validate(model, val_loader, criterion):
    model.eval()  # switch to evaluation mode
    total_loss = 0.0
    with torch.no_grad():  # disable gradient tracking for speed
        for batch_x, batch_y in val_loader:
            outputs = model(batch_x)
            loss = criterion(outputs, batch_y)
            total_loss += loss.item() * batch_x.size(0)
    avg_loss = total_loss / len(val_loader.dataset)
    return avg_loss

# ---------------------- Continue training for up to 50 epochs (with early stopping) ----------------------
max_continue_epochs = 50  # plan to train for up to 50 more epochs
start_epoch = 0  # starting epoch index for the continued run

for epoch in range(start_epoch, max_continue_epochs):
    # Train for one epoch
    train_loss = train_one_epoch(model, train_loader, criterion, optimizer)
    # Evaluate on the validation set
    val_loss = validate(model, val_loader, criterion)

    # Print progress
    print(f"Continued epoch {epoch+1} | train loss: {train_loss:.4f} | val loss: {val_loss:.4f}")

    # Early-stopping check (today's core logic)
    if val_loss < best_val_loss - delta:
        # Validation loss improved enough: update the best loss and reset the counter
        best_val_loss = val_loss
        counter = 0
        # Save the best model (optional but recommended)
        best_weight_path = os.path.join(save_dir, "credit_model_best.pth")
        torch.save(model.state_dict(), best_weight_path)
        print(f"✨ Found a better model, saved to: {best_weight_path}")
    else:
        # No improvement: increment the counter
        counter += 1
        print(f"⚠️  {counter} consecutive epochs without sufficient improvement (patience: {patience})")
        if counter >= patience:
            stop_training = True  # early stopping triggered
            print("Early stopping triggered, ending training early!")
            break  # exit the training loop

# Print the final result (epoch is the index of the last epoch that ran)
print(f"Continued training finished! Epochs actually trained: {epoch + 1}")
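The test set has not been used yet. As a final check, one option is to load the best checkpoint saved during the continued training and measure its accuracy on the held-out test set; this is a sketch beyond the assignment, and the 0.5 threshold on the sigmoid output is an assumed default, not a tuned value.

python
# Sketch: evaluate the best checkpoint on the held-out test set
best_weight_path = os.path.join(save_dir, "credit_model_best.pth")
model.load_state_dict(torch.load(best_weight_path, map_location=device))
model.eval()
with torch.no_grad():
    test_probs = model(X_test)                 # predicted default probabilities
    test_preds = (test_probs >= 0.5).float()   # assumed 0.5 decision threshold
    accuracy = (test_preds == y_test).float().mean().item()
print(f"Test accuracy of the best checkpoint: {accuracy * 100:.2f}%")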

@浙大疏锦行
