day45打卡

@浙大疏锦行

python 复制代码
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import datasets, transforms, models
from torch.utils.tensorboard import SummaryWriter
import os
import time

# 1. Basic configuration
# Prefer the GPU when CUDA is available; otherwise run on the CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Optimisation hyper-parameters.
BATCH_SIZE = 128
EPOCHS = 20
LEARNING_RATE = 1e-3
WEIGHT_DECAY = 5e-4

# Output locations.
LOG_DIR = "./runs/resnet18_cifar10_finetune"  # TensorBoard log directory
CHECKPOINT_DIR = "./checkpoints"  # where model checkpoints are written

# Create the output directories up front (no error if they already exist).
for _output_dir in (LOG_DIR, CHECKPOINT_DIR):
    os.makedirs(_output_dir, exist_ok=True)

# 2. Data preprocessing and loading
# Per-channel mean / std used to normalise both splits.
_NORM_MEAN = (0.4914, 0.4822, 0.4465)
_NORM_STD = (0.2023, 0.1994, 0.2010)

# Training split: random-crop and horizontal-flip augmentation, then normalise.
train_transform = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(_NORM_MEAN, _NORM_STD),
])

# Test split: tensor conversion and normalisation only (no augmentation).
test_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(_NORM_MEAN, _NORM_STD),
])

# Datasets; download=True asks torchvision to fetch CIFAR-10 into ./data.
train_dataset = datasets.CIFAR10(
    root='./data', train=True, download=True, transform=train_transform
)
test_dataset = datasets.CIFAR10(
    root='./data', train=False, download=True, transform=test_transform
)

# Data loaders: identical settings except that only the training split is shuffled.
_loader_kwargs = {"batch_size": BATCH_SIZE, "num_workers": 4}
train_loader = DataLoader(train_dataset, shuffle=True, **_loader_kwargs)
test_loader = DataLoader(test_dataset, shuffle=False, **_loader_kwargs)

# 3. Build the model and set up the fine-tuning strategy
def build_resnet18_finetune(num_classes=10):
    """Build a pretrained ResNet18 whose only trainable part is a new head.

    Every pretrained parameter is frozen (``requires_grad = False``); the final
    fully connected layer is then replaced by a fresh ``nn.Linear`` with
    ``num_classes`` outputs, which is trainable by default.

    Args:
        num_classes: Number of output classes of the replacement head.

    Returns:
        The model, moved to the module-level ``DEVICE``.
    """
    # ``pretrained=True`` is deprecated since torchvision 0.13 in favour of the
    # ``weights`` argument; IMAGENET1K_V1 is the checkpoint the legacy flag loads.
    weights_enum = getattr(models, "ResNet18_Weights", None)
    if weights_enum is not None:
        model = models.resnet18(weights=weights_enum.IMAGENET1K_V1)
    else:
        # Older torchvision releases only understand the legacy flag.
        model = models.resnet18(pretrained=True)

    # Step 1: freeze every pretrained layer.
    for param in model.parameters():
        param.requires_grad = False

    # Step 2: swap in a new classification head sized for ``num_classes``.
    # A freshly constructed layer is trainable, so it is the only part that learns.
    in_features = model.fc.in_features
    model.fc = nn.Linear(in_features, num_classes)

    # Move the whole model to the configured device.
    return model.to(DEVICE)

# Instantiate the model
model = build_resnet18_finetune(num_classes=10)

# 4. Loss function and optimizer
criterion = nn.CrossEntropyLoss()

# The optimizer only receives parameters that require gradients
# (initially just the replacement final layer).
_trainable_params = [p for p in model.parameters() if p.requires_grad]
optimizer = optim.Adam(
    _trainable_params,
    lr=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
)

# Learning-rate scheduler: StepLR with step_size=10 and gamma=0.1.
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)

# 5. TensorBoard writer
writer = SummaryWriter(log_dir=LOG_DIR)

# 6. Training and validation functions
def train_one_epoch(model, loader, criterion, optimizer, epoch, writer):
    """Train ``model`` for one pass over ``loader`` and log metrics.

    Batches are moved to the module-level ``DEVICE``. Every 100th batch the
    batch loss, the running accuracy and the first param group's learning rate
    are written to ``writer``; epoch-level loss and accuracy are written once
    at the end.

    Args:
        model: Network to train; switched to training mode.
        loader: Sized iterable yielding ``(inputs, targets)`` batches.
        criterion: Loss callable applied as ``criterion(outputs, targets)``.
        optimizer: Optimizer stepped once per batch.
        epoch: Zero-based epoch index, used for the logging step values.
        writer: Object exposing ``add_scalar(tag, value, step)``.

    Returns:
        ``(epoch_loss, epoch_acc)``: the sum of batch losses divided by the
        number of batches, and the accuracy in percent.
    """
    model.train()
    loss_sum = 0.0
    num_correct = 0
    num_seen = 0

    for step, (images, labels) in enumerate(loader):
        images = images.to(DEVICE)
        labels = labels.to(DEVICE)

        # Forward pass
        logits = model(images)
        batch_loss = criterion(logits, labels)

        # Backward pass and parameter update
        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()

        # Running statistics
        loss_value = batch_loss.item()
        loss_sum += loss_value
        predictions = logits.max(1)[1]
        num_seen += labels.size(0)
        num_correct += (predictions == labels).sum().item()

        # Log batch-level metrics once every 100 batches
        if step % 100 == 0:
            global_step = epoch * len(loader) + step
            writer.add_scalar('Train/Batch_Loss', loss_value, global_step)
            writer.add_scalar('Train/Batch_Accuracy', 100. * num_correct / num_seen, global_step)
            writer.add_scalar('Train/Learning_Rate', optimizer.param_groups[0]['lr'], global_step)

    # Epoch-level metrics
    mean_loss = loss_sum / len(loader)
    accuracy = 100. * num_correct / num_seen

    writer.add_scalar('Train/Epoch_Loss', mean_loss, epoch)
    writer.add_scalar('Train/Epoch_Accuracy', accuracy, epoch)

    return mean_loss, accuracy

def validate(model, loader, criterion, epoch, writer):
    """Evaluate ``model`` on ``loader``, log the metrics and keep the best checkpoint.

    Runs in eval mode with gradients disabled, moving batches to the
    module-level ``DEVICE``. Besides its arguments this function relies on
    module-level state: it reads and updates ``best_acc``, and when the new
    accuracy is strictly higher it saves a checkpoint (including the state of
    the module-level ``optimizer``) to ``best_model.pth`` in ``CHECKPOINT_DIR``.

    Args:
        model: Network to evaluate.
        loader: Sized iterable yielding ``(inputs, targets)`` batches.
        criterion: Loss callable applied as ``criterion(outputs, targets)``.
        epoch: Epoch index used as the logging step and stored in the checkpoint.
        writer: Object exposing ``add_scalar(tag, value, step)``.

    Returns:
        ``(epoch_loss, epoch_acc)``: the sum of batch losses divided by the
        number of batches, and the accuracy in percent.
    """
    global best_acc

    model.eval()
    loss_sum = 0.0
    num_correct = 0
    num_seen = 0

    with torch.no_grad():
        for images, labels in loader:
            images = images.to(DEVICE)
            labels = labels.to(DEVICE)

            logits = model(images)
            loss_sum += criterion(logits, labels).item()

            predictions = logits.max(1)[1]
            num_seen += labels.size(0)
            num_correct += (predictions == labels).sum().item()

    # Validation metrics
    mean_loss = loss_sum / len(loader)
    accuracy = 100. * num_correct / num_seen

    # Log to TensorBoard
    writer.add_scalar('Val/Loss', mean_loss, epoch)
    writer.add_scalar('Val/Accuracy', accuracy, epoch)

    # Save a checkpoint whenever the accuracy beats the best seen so far
    if accuracy > best_acc:
        best_acc = accuracy
        checkpoint = {
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'accuracy': accuracy,
        }
        torch.save(checkpoint, os.path.join(CHECKPOINT_DIR, 'best_model.pth'))

    return mean_loss, accuracy

# 7. Main training loop
# Number of initial epochs that train only the replacement final layer.
FREEZE_EPOCHS = 10

# Best validation accuracy so far; validate() reads and updates it via
# ``global best_acc``, so it must exist at module level before training starts.
best_acc = 0.0


def _run_phase(epochs, phase_label, optimizer, scheduler):
    """Train and validate once per epoch in ``epochs``, then step ``scheduler``.

    ``phase_label`` is only used in the progress banner printed for each epoch.
    """
    for epoch in epochs:
        print(f"\nEpoch {epoch+1}/{EPOCHS} ({phase_label})")
        train_loss, train_acc = train_one_epoch(model, train_loader, criterion, optimizer, epoch, writer)
        val_loss, val_acc = validate(model, test_loader, criterion, epoch, writer)

        print(f"Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.2f}%")
        print(f"Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.2f}%")

        scheduler.step()


# The loaders use worker processes (num_workers=4). On platforms where workers
# are started with "spawn", each worker re-imports this file; without this
# guard the worker would re-enter the training loop and fail during start-up.
if __name__ == "__main__":
    # Never run more frozen epochs than the total epoch budget.
    frozen_epochs = min(FREEZE_EPOCHS, EPOCHS)

    try:
        # Fine-tuning phase 1: train only the final layer.
        _run_phase(range(frozen_epochs), "冻结阶段", optimizer, scheduler)

        # Fine-tuning phase 2: unfreeze part of the network and keep training.
        print("\n开始解冻模型进行微调...")
        # Unfreeze layer3, layer4 and fc; earlier layers stay frozen
        # (adjust the list to change how much of the network is fine-tuned).
        for name, param in model.named_parameters():
            if any(tag in name for tag in ('layer3', 'layer4', 'fc')):
                param.requires_grad = True

        # Rebuild the optimizer over the trainable parameters only, matching
        # the phase-1 optimizer. The module-level name is rebound on purpose:
        # validate() stores this optimizer's state in the checkpoint.
        optimizer = optim.Adam(
            (p for p in model.parameters() if p.requires_grad),
            lr=LEARNING_RATE / 10,  # 10x lower learning rate for fine-tuning
            weight_decay=WEIGHT_DECAY,
        )
        scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.1)

        # Train for the remaining epochs.
        _run_phase(range(frozen_epochs, EPOCHS), "解冻阶段", optimizer, scheduler)
    finally:
        # 8. End of training: close the writer even if training raised.
        writer.close()

    print(f"\n训练完成!最佳验证准确率: {best_acc:.2f}%")
    print(f"TensorBoard日志保存在: {LOG_DIR}")
    print(f"最佳模型保存在: {os.path.join(CHECKPOINT_DIR, 'best_model.pth')}")
相关推荐
测试19982 小时前
软件测试 - 单元测试总结
自动化测试·软件测试·python·测试工具·职场和发展·单元测试·测试用例
曲幽5 小时前
我用了FastApiAdmin后,连夜把踩过的坑都整理出来了
redis·python·postgresql·vue3·fastapi·web·sqlalchemy·admin·fastapiadmin
weixin_446260856 小时前
视觉Transformer (ViT) 原理及性能突破:从CNN到大规模自注意力机制的迁移
深度学习·cnn·transformer
小a彤6 小时前
GE 在 CANN 五层架构中的位置
人工智能·深度学习·transformer
前端若水6 小时前
会话管理:创建、切换、删除对话历史
前端·人工智能·python·react.js
碧海银沙音频科技研究院7 小时前
通话AEC与语音识别AEC的软硬回采链路
深度学习·算法·语音识别
放下华子我只抽RuiKe57 小时前
React 从入门到生产(四):自定义 Hook
前端·javascript·人工智能·深度学习·react.js·自然语言处理·前端框架
涛声依旧-底层原理研究所7 小时前
残差连接与层归一化通俗易懂的详解
人工智能·python·神经网络·transformer
csdn_aspnet7 小时前
Python 算法快闪 LeetCode 编号 70 - 爬楼梯
python·算法·leetcode·职场和发展