import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import datasets, transforms, models
from torch.utils.tensorboard import SummaryWriter
import os
# 1. Basic configuration
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
BATCH_SIZE = 128
EPOCHS = 20
FREEZE_EPOCHS = 10  # epochs spent training only the new classification head
LEARNING_RATE = 1e-3
WEIGHT_DECAY = 5e-4
LOG_DIR = "./runs/resnet18_cifar10_finetune"  # TensorBoard log directory
CHECKPOINT_DIR = "./checkpoints"

# Create the required directories
os.makedirs(LOG_DIR, exist_ok=True)
os.makedirs(CHECKPOINT_DIR, exist_ok=True)
# 2. Data preprocessing and loading
# Training set: data augmentation
train_transform = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
])

# Test set: normalization only
test_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
])
# Load the datasets
train_dataset = datasets.CIFAR10(
    root='./data', train=True, download=True, transform=train_transform
)
test_dataset = datasets.CIFAR10(
    root='./data', train=False, download=True, transform=test_transform
)

# Create the data loaders
train_loader = DataLoader(
    train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=4
)
test_loader = DataLoader(
    test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=4
)
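
# Note: with num_workers > 0, platforms that spawn worker processes (Windows,
# and macOS by default) require the training code to run under an
# `if __name__ == "__main__":` guard; otherwise the DataLoader workers
# re-execute the module on import.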
# 3. Build the model and set up the fine-tuning strategy
def build_resnet18_finetune(num_classes=10):
    # Load ImageNet-pretrained ResNet18.
    # torchvision >= 0.13 uses the `weights` argument; on older versions,
    # use models.resnet18(pretrained=True) instead.
    model = models.resnet18(weights=models.ResNet18_Weights.DEFAULT)
    # Step 1: freeze all layers
    for param in model.parameters():
        param.requires_grad = False
    # Replace the final fully connected layer for CIFAR-10's 10 classes.
    # The new nn.Linear has requires_grad=True by default, so only the
    # head is trainable at this point.
    in_features = model.fc.in_features
    model.fc = nn.Linear(in_features, num_classes)
    # Move the model to the target device
    model = model.to(DEVICE)
    return model
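
# Optional sketch (not called anywhere below): the pretrained stem was designed
# for 224x224 ImageNet images, so on 32x32 CIFAR-10 inputs the 7x7/stride-2
# conv plus max-pool shrinks feature maps very aggressively. A common tweak is
# a 3x3/stride-1 stem with the max-pool removed; note this discards the
# pretrained conv1 weights.
def adapt_stem_for_cifar(model):
    model.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
    model.maxpool = nn.Identity()
    return model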
# Initialize the model
model = build_resnet18_finetune(num_classes=10)
# 4. Define the loss function and optimizer
criterion = nn.CrossEntropyLoss()

# The optimizer only updates trainable parameters (initially just the head)
optimizer = optim.Adam(
    filter(lambda p: p.requires_grad, model.parameters()),
    lr=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY
)

# Learning rate scheduler
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)
# 5. Initialize TensorBoard
writer = SummaryWriter(log_dir=LOG_DIR)
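
# Optional: log the model graph once so it appears under TensorBoard's
# "Graphs" tab. The dummy batch shape assumes 32x32 CIFAR-10 inputs
# (batch size 2 avoids BatchNorm's single-value-per-channel error in
# train mode).
writer.add_graph(model, torch.randn(2, 3, 32, 32, device=DEVICE))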
# 6. Define the training and validation functions
def train_one_epoch(model, loader, criterion, optimizer, epoch, writer):
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0
    for batch_idx, (inputs, targets) in enumerate(loader):
        inputs, targets = inputs.to(DEVICE), targets.to(DEVICE)
        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # Accumulate loss and accuracy statistics
        running_loss += loss.item()
        _, predicted = outputs.max(1)
        total += targets.size(0)
        correct += predicted.eq(targets).sum().item()
        # Log training metrics to TensorBoard every 100 batches.
        # Note: the accuracy logged here is the running accuracy since the
        # start of the epoch, not the accuracy of this single batch.
        if batch_idx % 100 == 0:
            global_step = epoch * len(loader) + batch_idx
            writer.add_scalar('Train/Batch_Loss', loss.item(), global_step)
            writer.add_scalar('Train/Running_Accuracy', 100. * correct / total, global_step)
            writer.add_scalar('Train/Learning_Rate', optimizer.param_groups[0]['lr'], global_step)
    # Compute epoch-level metrics
    epoch_loss = running_loss / len(loader)
    epoch_acc = 100. * correct / total
    # Log epoch-level metrics to TensorBoard
    writer.add_scalar('Train/Epoch_Loss', epoch_loss, epoch)
    writer.add_scalar('Train/Epoch_Accuracy', epoch_acc, epoch)
    return epoch_loss, epoch_acc
def validate(model, loader, criterion, epoch, writer):
    model.eval()
    running_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, targets in loader:
            inputs, targets = inputs.to(DEVICE), targets.to(DEVICE)
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            running_loss += loss.item()
            _, predicted = outputs.max(1)
            total += targets.size(0)
            correct += predicted.eq(targets).sum().item()
    # Compute validation metrics
    epoch_loss = running_loss / len(loader)
    epoch_acc = 100. * correct / total
    # Log to TensorBoard
    writer.add_scalar('Val/Loss', epoch_loss, epoch)
    writer.add_scalar('Val/Accuracy', epoch_acc, epoch)
    # Save the best model. Note: this reads and writes the module-level
    # best_acc and optimizer; after the unfreezing stage rebuilds the
    # optimizer, the rebuilt one is what gets checkpointed.
    global best_acc
    if epoch_acc > best_acc:
        best_acc = epoch_acc
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'accuracy': epoch_acc,
        }, os.path.join(CHECKPOINT_DIR, 'best_model.pth'))
    return epoch_loss, epoch_acc
# 7. Main training loop
best_acc = 0.0

# Fine-tuning stage 1: train only the head (first FREEZE_EPOCHS epochs)
for epoch in range(FREEZE_EPOCHS):
    print(f"\nEpoch {epoch+1}/{EPOCHS} (frozen stage)")
    train_loss, train_acc = train_one_epoch(model, train_loader, criterion, optimizer, epoch, writer)
    val_loss, val_acc = validate(model, test_loader, criterion, epoch, writer)
    print(f"Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.2f}%")
    print(f"Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.2f}%")
    scheduler.step()
# Fine-tuning stage 2: unfreeze some layers and continue training
print("\nUnfreezing layers for fine-tuning...")

# Unfreeze the deeper blocks (adjust the unfrozen range as needed)
for name, param in model.named_parameters():
    if 'layer3' in name or 'layer4' in name or 'fc' in name:
        param.requires_grad = True

# Rebuild the optimizer over the trainable (unfrozen) parameters only
optimizer = optim.Adam(
    filter(lambda p: p.requires_grad, model.parameters()),
    lr=LEARNING_RATE / 10,  # 10x lower learning rate for fine-tuning
    weight_decay=WEIGHT_DECAY
)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.1)
# Train for the remaining epochs
for epoch in range(FREEZE_EPOCHS, EPOCHS):
    print(f"\nEpoch {epoch+1}/{EPOCHS} (unfrozen stage)")
    train_loss, train_acc = train_one_epoch(model, train_loader, criterion, optimizer, epoch, writer)
    val_loss, val_acc = validate(model, test_loader, criterion, epoch, writer)
    print(f"Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.2f}%")
    print(f"Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.2f}%")
    scheduler.step()
# 8. Wrap up
writer.close()
print(f"\nTraining complete! Best validation accuracy: {best_acc:.2f}%")
print(f"TensorBoard logs saved to: {LOG_DIR}")
print(f"Best model saved to: {os.path.join(CHECKPOINT_DIR, 'best_model.pth')}")