day45打卡

@浙大疏锦行

python 复制代码
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import datasets, transforms, models
from torch.utils.tensorboard import SummaryWriter
import os
import time

# 1. Basic configuration
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
_use_cuda = torch.cuda.is_available()
DEVICE = torch.device("cuda" if _use_cuda else "cpu")

# Training hyperparameters
BATCH_SIZE = 128
EPOCHS = 20
LEARNING_RATE = 0.001
WEIGHT_DECAY = 0.0005

# Output locations
LOG_DIR = "./runs/resnet18_cifar10_finetune"  # TensorBoard log directory
CHECKPOINT_DIR = "./checkpoints"

# Create the output directories up front; an existing directory is not an error.
for _out_dir in (LOG_DIR, CHECKPOINT_DIR):
    os.makedirs(_out_dir, exist_ok=True)

# 2. Data preprocessing and loading
# Per-channel normalization statistics shared by the training and test pipelines.
_NORM_MEAN = (0.4914, 0.4822, 0.4465)
_NORM_STD = (0.2023, 0.1994, 0.2010)

# Training pipeline: random crop and horizontal flip for augmentation,
# followed by tensor conversion and normalization.
train_transform = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(_NORM_MEAN, _NORM_STD),
])

# Test pipeline: no augmentation, only tensor conversion and normalization.
test_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(_NORM_MEAN, _NORM_STD),
])

# CIFAR-10 splits; both are downloaded into ./data when not already present.
train_dataset = datasets.CIFAR10(
    root='./data',
    train=True,
    download=True,
    transform=train_transform,
)
test_dataset = datasets.CIFAR10(
    root='./data',
    train=False,
    download=True,
    transform=test_transform,
)

# Batch loaders: only the training split is shuffled.
train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=4,
)
test_loader = DataLoader(
    test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=4,
)

# 3. Build the model and set up the fine-tuning strategy
def build_resnet18_finetune(num_classes=10):
    """Build a pretrained ResNet18 whose backbone is frozen and whose head is new.

    Every pretrained parameter is frozen (``requires_grad=False``); the final
    fully connected layer is then replaced by a freshly initialised
    ``nn.Linear`` with ``num_classes`` outputs, so it is the only trainable
    part of the returned model.

    Args:
        num_classes: Number of output classes for the replacement ``fc`` layer.

    Returns:
        The model, already moved to ``DEVICE``.
    """
    # The boolean ``pretrained`` flag is deprecated in torchvision >= 0.13 in
    # favour of ``weights=``. IMAGENET1K_V1 is the checkpoint that
    # ``pretrained=True`` used to load, so the weights are unchanged.
    weights_enum = getattr(models, "ResNet18_Weights", None)
    if weights_enum is not None:
        model = models.resnet18(weights=weights_enum.IMAGENET1K_V1)
    else:
        # Older torchvision without the weights enum: keep the legacy call.
        model = models.resnet18(pretrained=True)

    # Step one: freeze every pretrained parameter.
    for param in model.parameters():
        param.requires_grad = False

    # Replace the classification head; a new nn.Linear is trainable by default.
    in_features = model.fc.in_features
    model.fc = nn.Linear(in_features, num_classes)

    # Move the model to the configured device.
    model = model.to(DEVICE)

    return model

# Instantiate the model
model = build_resnet18_finetune(num_classes=10)

# 4. Loss function and optimizer
criterion = nn.CrossEntropyLoss()

# Hand the optimizer only the parameters that currently require gradients.
_trainable_params = (p for p in model.parameters() if p.requires_grad)
optimizer = optim.Adam(
    _trainable_params,
    lr=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
)

# Learning-rate schedule: multiply the LR by 0.1 every 10 scheduler steps.
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)

# 5. TensorBoard writer
writer = SummaryWriter(log_dir=LOG_DIR)

# 6. Training and validation routines
def train_one_epoch(model, loader, criterion, optimizer, epoch, writer):
    """Run one optimisation pass over ``loader`` and log metrics.

    Args:
        model: Network to train; it is switched to training mode here.
        loader: Iterable of ``(inputs, targets)`` batches supporting ``len()``.
        criterion: Loss callable applied to ``(outputs, targets)``.
        optimizer: Optimizer that is stepped once per batch.
        epoch: Epoch index; used as the step for epoch-level scalars and to
            offset the step of batch-level scalars.
        writer: Object exposing ``add_scalar(tag, value, step)``.

    Returns:
        ``(epoch_loss, epoch_acc)``: the mean of the per-batch losses and the
        accuracy over all samples seen, in percent.
    """
    model.train()

    num_batches = len(loader)
    step_offset = epoch * num_batches  # global step of this epoch's first batch
    loss_sum = 0.0
    n_correct = 0
    n_seen = 0

    for batch_idx, (inputs, targets) in enumerate(loader):
        inputs = inputs.to(DEVICE)
        targets = targets.to(DEVICE)

        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, targets)

        # Backward pass and parameter update
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Running statistics
        batch_loss = loss.item()
        loss_sum += batch_loss
        predicted = outputs.max(1)[1]
        n_seen += targets.size(0)
        n_correct += (predicted == targets).sum().item()

        # Log batch-level scalars on every 100th batch.
        if batch_idx % 100 == 0:
            step = step_offset + batch_idx
            writer.add_scalar('Train/Batch_Loss', batch_loss, step)
            writer.add_scalar('Train/Batch_Accuracy', 100. * n_correct / n_seen, step)
            writer.add_scalar('Train/Learning_Rate', optimizer.param_groups[0]['lr'], step)

    # Epoch-level metrics
    epoch_loss = loss_sum / num_batches
    epoch_acc = 100. * n_correct / n_seen

    writer.add_scalar('Train/Epoch_Loss', epoch_loss, epoch)
    writer.add_scalar('Train/Epoch_Accuracy', epoch_acc, epoch)

    return epoch_loss, epoch_acc

def validate(model, loader, criterion, epoch, writer, optimizer=None):
    """Evaluate ``model`` on ``loader``, log the metrics and checkpoint the best model.

    Args:
        model: Network to evaluate; it is switched to eval mode here.
        loader: Iterable of ``(inputs, targets)`` batches supporting ``len()``.
        criterion: Loss callable applied to ``(outputs, targets)``.
        epoch: Epoch index; used as the TensorBoard step and stored in the
            checkpoint.
        writer: Object exposing ``add_scalar(tag, value, step)``.
        optimizer: Optimizer whose state is stored in the checkpoint. When
            omitted, the module-level ``optimizer`` is used, which is what
            this function previously always did implicitly. If neither
            exists, ``None`` is stored for the optimizer state.

    Returns:
        ``(epoch_loss, epoch_acc)``: the mean of the per-batch losses and the
        accuracy over all samples, in percent.

    Side effects:
        Reads and updates the module-level ``best_acc``. Whenever the accuracy
        exceeds it, ``best_model.pth`` under ``CHECKPOINT_DIR`` is overwritten.
    """
    global best_acc

    model.eval()
    running_loss = 0.0
    correct = 0
    total = 0

    # Gradients are not needed for evaluation.
    with torch.no_grad():
        for inputs, targets in loader:
            inputs, targets = inputs.to(DEVICE), targets.to(DEVICE)

            outputs = model(inputs)
            loss = criterion(outputs, targets)

            running_loss += loss.item()
            _, predicted = outputs.max(1)
            total += targets.size(0)
            correct += predicted.eq(targets).sum().item()

    # Validation metrics
    epoch_loss = running_loss / len(loader)
    epoch_acc = 100. * correct / total

    # Log to TensorBoard
    writer.add_scalar('Val/Loss', epoch_loss, epoch)
    writer.add_scalar('Val/Accuracy', epoch_acc, epoch)

    # Save a checkpoint whenever the accuracy improves on the best so far.
    if epoch_acc > best_acc:
        best_acc = epoch_acc

        # Resolve the optimizer lazily so callers that do not pass one keep
        # picking up whichever optimizer is current at module level.
        if optimizer is None:
            optimizer = globals().get('optimizer')
        optimizer_state = optimizer.state_dict() if optimizer is not None else None

        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer_state,
            'accuracy': epoch_acc,
        }, os.path.join(CHECKPOINT_DIR, 'best_model.pth'))

    return epoch_loss, epoch_acc

# 7. Main training loop
# Best validation accuracy so far; validate() reads and updates this global.
best_acc = 0.0

# Number of initial epochs during which only the new fc head is trained.
# Capped at EPOCHS so a shorter run never prints or trains past its budget.
FREEZE_EPOCHS = min(10, EPOCHS)


def _run_epoch(epoch, stage_label, optimizer, scheduler):
    """Train and validate for one epoch, print the metrics, then step the scheduler."""
    print(f"\nEpoch {epoch+1}/{EPOCHS} ({stage_label})")
    train_loss, train_acc = train_one_epoch(model, train_loader, criterion, optimizer, epoch, writer)
    val_loss, val_acc = validate(model, test_loader, criterion, epoch, writer)

    print(f"Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.2f}%")
    print(f"Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.2f}%")

    scheduler.step()


# The loaders use worker processes (num_workers=4). On platforms that start
# workers with "spawn", each worker re-imports this script, so the loop must
# sit behind a __main__ guard or the workers would try to start training too.
# It stays at module level (not inside a function) on purpose: validate()
# looks up `optimizer` and `best_acc` as module globals, and the assignments
# below must rebind those globals.
if __name__ == "__main__":
    try:
        # Fine-tuning stage 1: train only the final layer.
        for epoch in range(FREEZE_EPOCHS):
            _run_epoch(epoch, "冻结阶段", optimizer, scheduler)

        # Fine-tuning stage 2: unfreeze part of the backbone and keep training.
        print("\n开始解冻模型进行微调...")
        # Unfreeze layer3, layer4 and fc; earlier layers stay frozen.
        # Adjust the tuple to change how much of the network is fine-tuned.
        unfrozen_tags = ('layer3', 'layer4', 'fc')
        for name, param in model.named_parameters():
            if any(tag in name for tag in unfrozen_tags):
                param.requires_grad = True

        # Rebuild the optimizer over the trainable parameters only, matching
        # how the stage-1 optimizer was constructed.
        optimizer = optim.Adam(
            [p for p in model.parameters() if p.requires_grad],
            lr=LEARNING_RATE / 10,  # 10x lower learning rate for fine-tuning
            weight_decay=WEIGHT_DECAY,
        )
        scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.1)

        # Train for the remaining epochs.
        for epoch in range(FREEZE_EPOCHS, EPOCHS):
            _run_epoch(epoch, "解冻阶段", optimizer, scheduler)
    finally:
        # 8. Always close the writer so buffered events reach disk, even if
        # training is interrupted.
        writer.close()

    print(f"\n训练完成!最佳验证准确率: {best_acc:.2f}%")
    print(f"TensorBoard日志保存在: {LOG_DIR}")
    print(f"最佳模型保存在: {os.path.join(CHECKPOINT_DIR, 'best_model.pth')}")
相关推荐
2401_8323655220 分钟前
JavaScript中rest参数(...args)取代arguments的优势
jvm·数据库·python
Sirius.z22 分钟前
第J3周:DenseNet121算法详解
python
2301_7796224137 分钟前
Go语言怎么用信号量控制并发_Go语言semaphore信号量教程【入门】
jvm·数据库·python
2301_766283441 小时前
c++如何将控制台输出保存到文件_cout重定向到txt【详解】
jvm·数据库·python
沪漂阿龙2 小时前
OpenAI Agents SDK 深度解析(三):执行层——Agent 的“幕后指挥部”
人工智能·深度学习
数智工坊2 小时前
【SAM-DETR论文阅读】:基于语义对齐匹配的DETR极速收敛检测框架
网络·论文阅读·人工智能·深度学习·transformer
童园管理札记2 小时前
【续】数字时代:学前教育的新改革
经验分享·深度学习·职场和发展·微信公众平台
小康小小涵2 小时前
基于ESP32S3实现无人机RID模块底层源码编译
linux·开发语言·python
lzjava20242 小时前
Python的函数
开发语言·python
Awesome Baron3 小时前
skill、tool calling、MCP区别
开发语言·人工智能·python