Day 45 check-in

@浙大疏锦行
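
This script fine-tunes an ImageNet-pretrained ResNet18 on CIFAR-10 in two stages: stage 1 trains only the new classifier head with the backbone frozen, and stage 2 unfreezes layer3/layer4 and continues at a lower learning rate. All metrics are logged to TensorBoard.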

python
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import datasets, transforms, models
from torch.utils.tensorboard import SummaryWriter
import os
import time

# 1. Basic configuration
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
BATCH_SIZE = 128
EPOCHS = 20
LEARNING_RATE = 1e-3
WEIGHT_DECAY = 5e-4
LOG_DIR = "./runs/resnet18_cifar10_finetune"  # where TensorBoard logs are written
CHECKPOINT_DIR = "./checkpoints"

# Create the required directories
os.makedirs(LOG_DIR, exist_ok=True)
os.makedirs(CHECKPOINT_DIR, exist_ok=True)

# 2. Data preprocessing and loading
# Training-set augmentation
train_transform = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    # Standard per-channel CIFAR-10 mean/std
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
])

# Test set: normalization only, no augmentation
test_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
])

# Load the datasets
train_dataset = datasets.CIFAR10(
    root='./data', train=True, download=True, transform=train_transform
)
test_dataset = datasets.CIFAR10(
    root='./data', train=False, download=True, transform=test_transform
)

# Create the data loaders
# (note: on Windows, num_workers > 0 requires the script entry point to be
# guarded with `if __name__ == '__main__':`)
train_loader = DataLoader(
    train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=4
)
test_loader = DataLoader(
    test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=4
)
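
# Optional sanity check: pull one batch and confirm tensor shapes before training.
# Expected: images (BATCH_SIZE, 3, 32, 32), labels (BATCH_SIZE,).
sample_images, sample_labels = next(iter(train_loader))
print(f"Batch shapes: images {tuple(sample_images.shape)}, labels {tuple(sample_labels.shape)}")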

# 3. Build the model and set up the fine-tuning strategy
def build_resnet18_finetune(num_classes=10):
    # Load an ImageNet-pretrained ResNet18
    # (`pretrained=True` is deprecated since torchvision 0.13; older versions
    # still need `models.resnet18(pretrained=True)`)
    model = models.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1)

    # Step 1: freeze all layers
    for param in model.parameters():
        param.requires_grad = False

    # Replace the final fully connected layer for CIFAR-10's 10 classes
    # (the new layer is created with requires_grad=True, so only it will train).
    # Note: the pretrained weights were learned on 224x224 images; 32x32 CIFAR-10
    # inputs still run (ResNet ends in adaptive average pooling), but resizing to
    # 224 typically transfers better.
    in_features = model.fc.in_features
    model.fc = nn.Linear(in_features, num_classes)

    # Move the model to the target device
    model = model.to(DEVICE)

    return model

# Initialize the model
model = build_resnet18_finetune(num_classes=10)
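
# Quick check that the freeze worked: only the new fc layer (weight + bias)
# should be trainable at this point.
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
total = sum(p.numel() for p in model.parameters())
print(f"Trainable parameters: {trainable:,} / {total:,}")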

# 4. Loss function and optimizer
criterion = nn.CrossEntropyLoss()
# The optimizer only updates trainable parameters (initially just the new fc layer)
optimizer = optim.Adam(
    filter(lambda p: p.requires_grad, model.parameters()),
    lr=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY
)
# LR scheduler: multiply the learning rate by 0.1 every 10 epochs
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)

# 5. Initialize TensorBoard
writer = SummaryWriter(log_dir=LOG_DIR)
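
# Optional: log the model graph once with a dummy CIFAR-10-sized input so the
# architecture can be inspected in TensorBoard's Graphs tab.
writer.add_graph(model, torch.randn(1, 3, 32, 32).to(DEVICE))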

# 6. Training and validation functions
def train_one_epoch(model, loader, criterion, optimizer, epoch, writer):
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0

    for batch_idx, (inputs, targets) in enumerate(loader):
        inputs, targets = inputs.to(DEVICE), targets.to(DEVICE)

        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, targets)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Accumulate loss and accuracy statistics
        running_loss += loss.item()
        _, predicted = outputs.max(1)
        total += targets.size(0)
        correct += predicted.eq(targets).sum().item()

        # Log to TensorBoard every 100 batches
        # (the logged accuracy is the running accuracy over the epoch so far)
        if batch_idx % 100 == 0:
            global_step = epoch * len(loader) + batch_idx
            writer.add_scalar('Train/Batch_Loss', loss.item(), global_step)
            writer.add_scalar('Train/Batch_Accuracy', 100. * correct / total, global_step)
            writer.add_scalar('Train/Learning_Rate', optimizer.param_groups[0]['lr'], global_step)

    # Epoch-level metrics
    epoch_loss = running_loss / len(loader)
    epoch_acc = 100. * correct / total

    # Log epoch-level metrics to TensorBoard
    writer.add_scalar('Train/Epoch_Loss', epoch_loss, epoch)
    writer.add_scalar('Train/Epoch_Accuracy', epoch_acc, epoch)

    return epoch_loss, epoch_acc

def validate(model, loader, criterion, epoch, writer):
    model.eval()
    running_loss = 0.0
    correct = 0
    total = 0

    with torch.no_grad():
        for batch_idx, (inputs, targets) in enumerate(loader):
            inputs, targets = inputs.to(DEVICE), targets.to(DEVICE)

            outputs = model(inputs)
            loss = criterion(outputs, targets)

            running_loss += loss.item()
            _, predicted = outputs.max(1)
            total += targets.size(0)
            correct += predicted.eq(targets).sum().item()

    # Validation metrics
    epoch_loss = running_loss / len(loader)
    epoch_acc = 100. * correct / total

    # Log to TensorBoard
    writer.add_scalar('Val/Loss', epoch_loss, epoch)
    writer.add_scalar('Val/Accuracy', epoch_acc, epoch)

    # Save the best model
    # (best_acc and optimizer are read from module scope; the checkpoint stores
    # whichever optimizer is current when it is written)
    global best_acc
    if epoch_acc > best_acc:
        best_acc = epoch_acc
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'accuracy': epoch_acc,
        }, os.path.join(CHECKPOINT_DIR, 'best_model.pth'))

    return epoch_loss, epoch_acc

# 7. Main training loop
best_acc = 0.0
# Fine-tuning stage 1: train only the final layer (first 10 epochs)
for epoch in range(10):
    print(f"\nEpoch {epoch+1}/{EPOCHS} (frozen stage)")
    train_loss, train_acc = train_one_epoch(model, train_loader, criterion, optimizer, epoch, writer)
    val_loss, val_acc = validate(model, test_loader, criterion, epoch, writer)
    
    print(f"Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.2f}%")
    print(f"Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.2f}%")
    
    scheduler.step()

# Fine-tuning stage 2: unfreeze some layers and continue training
print("\nUnfreezing part of the model for fine-tuning...")
# Unfreeze the later residual stages and the classifier
# (adjust the unfreezing range as needed)
for name, param in model.named_parameters():
    if 'layer3' in name or 'layer4' in name or 'fc' in name:
        param.requires_grad = True

# Rebuild the optimizer over the trainable (unfrozen) parameters only
optimizer = optim.Adam(
    filter(lambda p: p.requires_grad, model.parameters()),
    lr=LEARNING_RATE / 10,  # learning rate reduced 10x for this stage
    weight_decay=WEIGHT_DECAY
)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.1)
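
# A common variant (not used here): give the newly unfrozen backbone stages a
# smaller learning rate than the fresh classifier head via parameter groups, e.g.
# optimizer = optim.Adam([
#     {'params': model.layer3.parameters(), 'lr': LEARNING_RATE / 100},
#     {'params': model.layer4.parameters(), 'lr': LEARNING_RATE / 100},
#     {'params': model.fc.parameters(), 'lr': LEARNING_RATE / 10},
# ], weight_decay=WEIGHT_DECAY)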

# Continue training for the remaining epochs
for epoch in range(10, EPOCHS):
    print(f"\nEpoch {epoch+1}/{EPOCHS} (unfrozen stage)")
    train_loss, train_acc = train_one_epoch(model, train_loader, criterion, optimizer, epoch, writer)
    val_loss, val_acc = validate(model, test_loader, criterion, epoch, writer)
    
    print(f"Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.2f}%")
    print(f"Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.2f}%")
    
    scheduler.step()

# 8. Training finished
writer.close()
print(f"\nTraining complete! Best validation accuracy: {best_acc:.2f}%")
print(f"TensorBoard logs saved to: {LOG_DIR}")
print(f"Best model saved to: {os.path.join(CHECKPOINT_DIR, 'best_model.pth')}")