day45打卡

@浙大疏锦行

python 复制代码
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import datasets, transforms, models
from torch.utils.tensorboard import SummaryWriter
import os
import time

# 1. Basic configuration
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")

BATCH_SIZE = 128       # mini-batch size used by both data loaders
EPOCHS = 20            # upper bound of the epoch counter over both stages
LEARNING_RATE = 1e-3   # stage-1 learning rate; stage 2 uses a tenth of it
WEIGHT_DECAY = 5e-4    # weight decay handed to the Adam optimizers
LOG_DIR = "./runs/resnet18_cifar10_finetune"  # TensorBoard log directory
CHECKPOINT_DIR = "./checkpoints"              # where best_model.pth is saved

# Create the output directories up front (no error if they already exist).
for _output_dir in (LOG_DIR, CHECKPOINT_DIR):
    os.makedirs(_output_dir, exist_ok=True)

# 2. Data preprocessing and loading
# Per-channel normalisation constants shared by the train and test pipelines.
_NORM_MEAN = (0.4914, 0.4822, 0.4465)
_NORM_STD = (0.2023, 0.1994, 0.2010)

# Training pipeline: random 32x32 crop (after 4px padding) and random
# horizontal flip as augmentation, then tensor conversion + normalisation.
_train_steps = [
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(_NORM_MEAN, _NORM_STD),
]
train_transform = transforms.Compose(_train_steps)

# Test pipeline: no augmentation, only tensor conversion + normalisation.
_test_steps = [
    transforms.ToTensor(),
    transforms.Normalize(_NORM_MEAN, _NORM_STD),
]
test_transform = transforms.Compose(_test_steps)

# CIFAR-10 splits, downloaded into ./data when not already present.
train_dataset = datasets.CIFAR10(
    root='./data',
    train=True,
    download=True,
    transform=train_transform,
)
test_dataset = datasets.CIFAR10(
    root='./data',
    train=False,
    download=True,
    transform=test_transform,
)

# Data loaders: only the training split is shuffled.
# NOTE(review): num_workers=4 is used without an `if __name__ == "__main__":`
# guard in this script; on platforms whose multiprocessing start method is
# "spawn" this is known to be a problem -- confirm the target platform.
_loader_kwargs = {"batch_size": BATCH_SIZE, "num_workers": 4}
train_loader = DataLoader(train_dataset, shuffle=True, **_loader_kwargs)
test_loader = DataLoader(test_dataset, shuffle=False, **_loader_kwargs)

# 3. Build the model and set up the fine-tuning strategy
def build_resnet18_finetune(num_classes=10):
    """Return a pretrained ResNet18 whose backbone is frozen and whose head is new.

    Every pretrained parameter has ``requires_grad`` set to ``False``; the
    final fully connected layer is then replaced by a fresh ``nn.Linear``
    with ``num_classes`` outputs, so it is the only trainable part of the
    returned model. The model is moved to the module-level ``DEVICE``.

    Args:
        num_classes: Number of output classes of the replacement ``fc`` layer.

    Returns:
        The modified ResNet18 module, already placed on ``DEVICE``.
    """
    # Prefer the `weights=` API: the boolean `pretrained` flag is deprecated
    # in newer torchvision releases. IMAGENET1K_V1 is the checkpoint that
    # `pretrained=True` resolves to, so the loaded weights are unchanged.
    weights_enum = getattr(models, "ResNet18_Weights", None)
    if weights_enum is not None:
        model = models.resnet18(weights=weights_enum.IMAGENET1K_V1)
    else:
        # Older torchvision without the weights enums.
        model = models.resnet18(pretrained=True)

    # Step one: freeze every pretrained layer.
    for param in model.parameters():
        param.requires_grad = False

    # Replace the final fully connected layer for `num_classes`-way
    # classification. The new layer's parameters are created with
    # requires_grad=True, so it stays trainable despite the freeze above.
    in_features = model.fc.in_features
    model.fc = nn.Linear(in_features, num_classes)

    # Move the model to the configured device.
    model = model.to(DEVICE)

    return model

# Instantiate the model (frozen backbone, new 10-way classification head)
model = build_resnet18_finetune(num_classes=10)

# 4. Loss function and optimizer
criterion = nn.CrossEntropyLoss()

# Only parameters that still require gradients are handed to the optimizer;
# right after construction that is just the replaced final layer.
trainable_params = [p for p in model.parameters() if p.requires_grad]
optimizer = optim.Adam(
    trainable_params,
    lr=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
)

# Learning-rate schedule: multiply the LR by 0.1 every 10 scheduler steps.
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)

# 5. TensorBoard writer
writer = SummaryWriter(log_dir=LOG_DIR)

# 6. Training and validation functions
def train_one_epoch(model, loader, criterion, optimizer, epoch, writer):
    """Train `model` for one pass over `loader` and log metrics to TensorBoard.

    Batches are moved to the module-level ``DEVICE``. Every 100th batch the
    batch loss, the running accuracy and the current learning rate of the
    first parameter group are written under ``Train/Batch_*`` and
    ``Train/Learning_Rate``; epoch-level loss and accuracy are written under
    ``Train/Epoch_*`` at the end.

    Args:
        model: Network to optimise; switched to training mode.
        loader: Iterable of ``(inputs, targets)`` batches that supports ``len()``.
        criterion: Loss callable applied to ``(outputs, targets)``.
        optimizer: Optimizer stepped once per batch.
        epoch: Zero-based epoch index, used for the TensorBoard step values.
        writer: Object exposing ``add_scalar(tag, value, step)``.

    Returns:
        Tuple ``(epoch_loss, epoch_acc)``: the mean of the per-batch losses
        and the accuracy in percent over all samples seen.
    """
    model.train()

    num_batches = len(loader)
    loss_sum = 0.0
    num_correct = 0
    num_seen = 0

    for step, (images, labels) in enumerate(loader):
        images = images.to(DEVICE)
        labels = labels.to(DEVICE)

        # Forward pass
        logits = model(images)
        batch_loss = criterion(logits, labels)

        # Backward pass and parameter update
        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()

        # Running statistics
        loss_sum += batch_loss.item()
        predictions = logits.max(1).indices
        num_seen += labels.size(0)
        num_correct += predictions.eq(labels).sum().item()

        # Log batch-level metrics on every 100th batch
        if step % 100 == 0:
            global_step = epoch * num_batches + step
            running_acc = 100. * num_correct / num_seen
            current_lr = optimizer.param_groups[0]['lr']
            writer.add_scalar('Train/Batch_Loss', batch_loss.item(), global_step)
            writer.add_scalar('Train/Batch_Accuracy', running_acc, global_step)
            writer.add_scalar('Train/Learning_Rate', current_lr, global_step)

    # Epoch-level metrics
    epoch_loss = loss_sum / num_batches
    epoch_acc = 100. * num_correct / num_seen

    writer.add_scalar('Train/Epoch_Loss', epoch_loss, epoch)
    writer.add_scalar('Train/Epoch_Accuracy', epoch_acc, epoch)

    return epoch_loss, epoch_acc

def validate(model, loader, criterion, epoch, writer):
    """Evaluate `model` on `loader`, log the metrics and checkpoint on improvement.

    Runs under ``torch.no_grad()`` with the model in eval mode. Loss and
    accuracy are written to TensorBoard under ``Val/Loss`` and
    ``Val/Accuracy``. When the accuracy exceeds the module-level ``best_acc``,
    that global is updated and a checkpoint is saved to
    ``CHECKPOINT_DIR/best_model.pth``.

    NOTE: besides its arguments this function reads the module-level
    ``DEVICE``, ``CHECKPOINT_DIR`` and ``optimizer`` (whose state is stored in
    the checkpoint) and reads/writes the module-level ``best_acc``; all of
    them must exist before the first call.

    Args:
        model: Network to evaluate.
        loader: Iterable of ``(inputs, targets)`` batches that supports ``len()``.
        criterion: Loss callable applied to ``(outputs, targets)``.
        epoch: Epoch index used as the TensorBoard step and stored in the checkpoint.
        writer: Object exposing ``add_scalar(tag, value, step)``.

    Returns:
        Tuple ``(epoch_loss, epoch_acc)``: the mean of the per-batch losses
        and the accuracy in percent.
    """
    global best_acc

    model.eval()
    loss_sum = 0.0
    num_correct = 0
    num_seen = 0

    with torch.no_grad():
        for images, labels in loader:
            images = images.to(DEVICE)
            labels = labels.to(DEVICE)

            logits = model(images)
            loss_sum += criterion(logits, labels).item()

            predictions = logits.max(1).indices
            num_seen += labels.size(0)
            num_correct += predictions.eq(labels).sum().item()

    # Validation metrics
    epoch_loss = loss_sum / len(loader)
    epoch_acc = 100. * num_correct / num_seen

    writer.add_scalar('Val/Loss', epoch_loss, epoch)
    writer.add_scalar('Val/Accuracy', epoch_acc, epoch)

    # Keep only the best-performing model on disk
    if epoch_acc > best_acc:
        best_acc = epoch_acc
        checkpoint = {
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'accuracy': epoch_acc,
        }
        checkpoint_path = os.path.join(CHECKPOINT_DIR, 'best_model.pth')
        torch.save(checkpoint, checkpoint_path)

    return epoch_loss, epoch_acc

# 7. Main training loop
# Best validation accuracy so far; validate() updates it through `global`.
best_acc = 0.0

# Number of epochs trained with the backbone frozen. Clamped to EPOCHS so a
# smaller EPOCHS setting cannot make the epoch counter overrun the total.
_FROZEN_EPOCHS = min(10, EPOCHS)


def _run_stage(epochs, stage_label, stage_optimizer, stage_scheduler):
    """Train and validate once per epoch in `epochs`, then step the scheduler.

    Args:
        epochs: Iterable of zero-based epoch indices to run.
        stage_label: Text shown in parentheses in the per-epoch banner.
        stage_optimizer: Optimizer passed to train_one_epoch for this stage.
        stage_scheduler: LR scheduler stepped once after every epoch.
    """
    for epoch in epochs:
        print(f"\nEpoch {epoch+1}/{EPOCHS} ({stage_label})")
        train_loss, train_acc = train_one_epoch(
            model, train_loader, criterion, stage_optimizer, epoch, writer
        )
        val_loss, val_acc = validate(model, test_loader, criterion, epoch, writer)

        print(f"Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.2f}%")
        print(f"Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.2f}%")

        stage_scheduler.step()


# try/finally so the TensorBoard writer is flushed and closed even when
# training is interrupted by an exception.
try:
    # Fine-tuning stage 1: only the final layer is trainable.
    _run_stage(range(_FROZEN_EPOCHS), "冻结阶段", optimizer, scheduler)

    # Fine-tuning stage 2: unfreeze part of the backbone and keep training.
    print("\n开始解冻模型进行微调...")
    # Unfreeze layer3, layer4 and fc; earlier layers stay frozen
    # (adjust the name list to change the unfreezing range).
    for name, param in model.named_parameters():
        if any(tag in name for tag in ("layer3", "layer4", "fc")):
            param.requires_grad = True

    # Rebuild the optimizer over the trainable parameters only, matching how
    # the stage-1 optimizer is constructed. The module-level name `optimizer`
    # is rebound on purpose: validate() reads it when saving a checkpoint.
    optimizer = optim.Adam(
        [p for p in model.parameters() if p.requires_grad],
        lr=LEARNING_RATE / 10,  # learning rate reduced 10x for fine-tuning
        weight_decay=WEIGHT_DECAY,
    )
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.1)

    # Train the remaining epochs.
    _run_stage(range(_FROZEN_EPOCHS, EPOCHS), "解冻阶段", optimizer, scheduler)
finally:
    # 8. End of training
    writer.close()

print(f"\n训练完成!最佳验证准确率: {best_acc:.2f}%")
print(f"TensorBoard日志保存在: {LOG_DIR}")
print(f"最佳模型保存在: {os.path.join(CHECKPOINT_DIR, 'best_model.pth')}")
相关推荐
apocelipes5 小时前
常用编程语言和库的正则表达式性能对比
c语言·c++·python·性能优化·golang·开发工具和环境
用户8356290780517 小时前
使用 Python 在 PDF 中创建与管理书签
后端·python
MeixianAgent11 小时前
Python 回测数据入口怎么验?历史 K 线入库前先做 5 个检查
后端·python
咕白m62515 小时前
用 Python 实现一键批量查找与替换 Excel 数据
后端·python
阿里云大数据AI技术1 天前
光轮智能 × 阿里云:共建 Physical AI 云上数据、评测与持续学习基础设施
人工智能·机器学习
SelectDB1 天前
Apache Doris Python UDF:让 SQL 直接调用 Python 生态,支撑 Agent 时代复杂业务逻辑
大数据·数据库·python
荣码2 天前
GraphRAG:普通RAG只能回答"点"的问题,我踩了4个坑才搞懂
java·python
金銀銅鐵2 天前
[Python] 基于欧几里得算法,实现分数约分计算器
python·数学
Lyn_Li2 天前
Kaggle Top 5 | 198只股票、200条数据的金融预测——BattleFin高分方案从零复现
python·kaggle·比赛复盘·金融预测