DAY 45 Tensorboard使用介绍
知识点回顾:
- tensorboard的发展历史和原理
- tensorboard的常见操作
- tensorboard在cifar上的实战:MLP和CNN模型
效果展示如下,很适合拿去组会汇报撑页数:

**作业**:对 ResNet18 在 CIFAR-10 上采用微调策略进行训练,并用 TensorBoard 监控训练过程。
```python
import os

import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
from torchvision import datasets, transforms, models
# Matplotlib setup: use a font with CJK glyphs and keep the minus sign
# renderable while that font is active.
plt.rcParams.update({
    "font.family": ["SimHei"],
    "axes.unicode_minus": False,
})

# Run on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"使用设备: {device}")
# 1. Preprocessing: augmentation + normalisation for training,
#    normalisation only for evaluation.
_NORM_MEAN = (0.4914, 0.4822, 0.4465)  # per-channel mean passed to Normalize
_NORM_STD = (0.2023, 0.1994, 0.2010)   # per-channel std passed to Normalize

train_transform = transforms.Compose([
    # Random augmentations are applied on the PIL image, before tensor conversion.
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
    transforms.RandomRotation(15),
    transforms.ToTensor(),
    transforms.Normalize(_NORM_MEAN, _NORM_STD),
])

test_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(_NORM_MEAN, _NORM_STD),
])
# 2. CIFAR-10 datasets. Only the training split requests a download; the
#    test split reads from the same root directory.
_DATA_ROOT = './data'
train_dataset = datasets.CIFAR10(
    root=_DATA_ROOT, train=True, download=True, transform=train_transform,
)
test_dataset = datasets.CIFAR10(
    root=_DATA_ROOT, train=False, transform=test_transform,
)

# 3. Data loaders: shuffle the training data, keep the test order fixed.
batch_size = 64
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=batch_size)
# 4. ResNet18 model factory
def create_resnet18(pretrained=True, num_classes=10):
    """Build a ResNet18 whose final fully connected layer has ``num_classes`` outputs.

    Args:
        pretrained: When true, start from torchvision's pretrained weights;
            otherwise the network is randomly initialised.
        num_classes: Output size of the replacement ``fc`` layer.

    Returns:
        The model, moved to the module-level ``device``.
    """
    weights_enum = getattr(models, "ResNet18_Weights", None)
    if weights_enum is None:
        # Older torchvision releases only understand the boolean flag.
        model = models.resnet18(pretrained=pretrained)
    else:
        # Newer releases deprecate ``pretrained``; IMAGENET1K_V1 is the
        # checkpoint that ``pretrained=True`` selects.
        model = models.resnet18(
            weights=weights_enum.IMAGENET1K_V1 if pretrained else None
        )

    # Swap the classification head for one sized to the target task.
    in_features = model.fc.in_features
    model.fc = nn.Linear(in_features, num_classes)
    return model.to(device)
# 5. Freeze / unfreeze helper
# Keeping the pretrained feature extractor fixed lets transfer learning train
# only the newly added classification head (the fully connected layer).
def freeze_model(model, freeze=True):
    """Freeze or unfreeze every parameter whose name does not contain ``fc``.

    Args:
        model: Module whose parameters are toggled in place.
        freeze: True sets ``requires_grad=False`` on the non-``fc``
            parameters; False sets it back to True.

    Returns:
        The same ``model`` instance, so the call can be chained.
    """
    for name, param in model.named_parameters():
        # Parameters with "fc" in their name (the classifier head) are left
        # untouched; requires_grad controls whether autograd computes a
        # gradient for the parameter.
        if 'fc' not in name:
            param.requires_grad = not freeze

    # Report the resulting state.
    frozen_params = sum(p.numel() for p in model.parameters() if not p.requires_grad)
    total_params = sum(p.numel() for p in model.parameters())
    if freeze:
        print(f"已冻结模型卷积层参数 ({frozen_params}/{total_params} 参数)")
    else:
        print(f"已解冻模型所有参数 ({total_params}/{total_params} 参数可训练)")
    return model
# 6. Training loop with TensorBoard logging
def _next_log_dir(base_dir):
    """Return ``base_dir`` if it is unused, else the first free ``base_dir_v<N>`` (N >= 1)."""
    if not os.path.exists(base_dir):
        return base_dir
    version = 1
    while os.path.exists(f"{base_dir}_v{version}"):
        version += 1
    return f"{base_dir}_v{version}"


def _evaluate(model, loader, criterion, device, max_wrong):
    """Evaluate ``model`` on ``loader`` and keep up to ``max_wrong`` mispredicted samples.

    Returns:
        ``(mean_batch_loss, accuracy_percent, wrong_images, wrong_labels, wrong_preds)``
        where the three lists hold CPU tensors of equal length.
    """
    model.eval()
    loss_sum = 0.0
    correct = 0
    total = 0
    wrong_images, wrong_labels, wrong_preds = [], [], []
    with torch.no_grad():
        for data, target in loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            loss_sum += criterion(output, target).item()
            _, predicted = output.max(1)
            total += target.size(0)
            correct += predicted.eq(target).sum().item()

            # Collect mispredictions only until the overall cap is reached,
            # so the logged grid stays small no matter how many batches run.
            room = max_wrong - len(wrong_images)
            if room > 0:
                wrong_mask = predicted != target
                wrong_images.extend(data[wrong_mask][:room].cpu())
                wrong_labels.extend(target[wrong_mask][:room].cpu())
                wrong_preds.extend(predicted[wrong_mask][:room].cpu())
    return loss_sum / len(loader), 100. * correct / total, wrong_images, wrong_labels, wrong_preds


def train_with_freeze_schedule(model, train_loader, test_loader, criterion, optimizer, scheduler, device, epochs, freeze_epochs=5):
    """Train with a frozen backbone first, then fine-tune everything, logging to TensorBoard.

    For the first ``freeze_epochs`` epochs only the parameters left trainable
    by ``freeze_model(model, freeze=True)`` are updated. At epoch index
    ``freeze_epochs`` all parameters are unfrozen and the learning rate of
    every optimizer parameter group is set to 1e-4.

    Args:
        model: Network to train.
        train_loader: Iterable of ``(data, target)`` training batches.
        test_loader: Iterable of ``(data, target)`` evaluation batches. If its
            ``dataset`` exposes ``classes``, those names label the logged
            mispredictions; otherwise the numeric class index is used.
        criterion: Loss callable ``criterion(output, target)``.
        optimizer: Optimizer holding the model parameters.
        scheduler: Optional scheduler stepped once per epoch with the test
            loss (``scheduler.step(test_loss)``); pass None to disable.
        device: Device the batches are moved to.
        epochs: Total number of epochs.
        freeze_epochs: Number of initial epochs with the backbone frozen.

    Returns:
        Test accuracy (percent) of the last epoch, or 0.0 when ``epochs`` is 0.
    """
    max_wrong_samples = 8
    class_names = getattr(getattr(test_loader, "dataset", None), "classes", None)

    def label_name(index):
        # Fall back to the numeric label when no class names are available.
        index = int(index)
        return class_names[index] if class_names else str(index)

    # ======================== TensorBoard setup ========================
    # The first run writes to runs/cifar10_resnet18_exp, later runs to
    # ..._v1, ..._v2, and so on, so earlier logs are never overwritten.
    log_dir = _next_log_dir("runs/cifar10_resnet18_exp")
    writer = SummaryWriter(log_dir)
    print("开始使用ResNet18训练模型...")
    print(f"TensorBoard 日志目录: {log_dir}")
    print("训练后执行: tensorboard --logdir=runs 查看可视化")

    epoch_test_acc = 0.0
    try:
        # (Optional) Record the model graph by tracing one real batch.
        images, _ = next(iter(train_loader))
        images = images.to(device)
        writer.add_graph(model, images)

        # (Optional) Log a few training images. The batch comes straight from
        # the loader, so it has already been through the loader's transforms
        # even though the tag says otherwise; normalize=True rescales the
        # values into [0, 1] for display.
        img_grid = torchvision.utils.make_grid(images[:8].cpu(), normalize=True)
        writer.add_image('原始训练图像(增强前)', img_grid, global_step=0)

        global_step = 0  # x-axis for the per-batch scalars

        # Start with the backbone frozen.
        if freeze_epochs > 0:
            model = freeze_model(model, freeze=True)

        for epoch in range(epochs):
            # Unfreeze everything once the frozen phase is over.
            if epoch == freeze_epochs:
                model = freeze_model(model, freeze=False)
                # Use a smaller learning rate for full fine-tuning.
                for group in optimizer.param_groups:
                    group['lr'] = 1e-4

            model.train()
            running_loss = 0.0
            correct_train = 0
            total_train = 0
            for batch_idx, (data, target) in enumerate(train_loader):
                data, target = data.to(device), target.to(device)
                optimizer.zero_grad()
                output = model(data)
                loss = criterion(output, target)
                loss.backward()
                optimizer.step()

                # Running statistics for this epoch.
                iter_loss = loss.item()
                running_loss += iter_loss
                _, predicted = output.max(1)
                total_train += target.size(0)
                correct_train += predicted.eq(target).sum().item()

                # ==================== per-batch scalars ====================
                # The accuracy is the running accuracy over the epoch so far.
                batch_acc = 100. * correct_train / total_train
                writer.add_scalar('Train/Batch Loss', iter_loss, global_step)
                writer.add_scalar('Train/Batch Accuracy', batch_acc, global_step)
                writer.add_scalar('Train/Learning Rate', optimizer.param_groups[0]['lr'], global_step)

                # Histograms are comparatively slow, so log them every 200 batches.
                if (batch_idx + 1) % 200 == 0:
                    for name, param in model.named_parameters():
                        writer.add_histogram(f'Weights/{name}', param, global_step)
                        if param.grad is not None:
                            writer.add_histogram(f'Gradients/{name}', param.grad, global_step)
                global_step += 1

            # ==================== per-epoch training scalars ====================
            epoch_train_loss = running_loss / len(train_loader)
            epoch_train_acc = 100. * correct_train / total_train
            writer.add_scalar('Train/Epoch Loss', epoch_train_loss, epoch)
            writer.add_scalar('Train/Epoch Accuracy', epoch_train_acc, epoch)

            # ==================== evaluation ====================
            (epoch_test_loss, epoch_test_acc,
             wrong_images, wrong_labels, wrong_preds) = _evaluate(
                model, test_loader, criterion, device, max_wrong_samples)
            writer.add_scalar('Test/Epoch Loss', epoch_test_loss, epoch)
            writer.add_scalar('Test/Epoch Accuracy', epoch_test_acc, epoch)

            # (Optional) Visualise the collected mispredictions.
            if wrong_images:
                wrong_img_grid = torchvision.utils.make_grid(wrong_images, normalize=True)
                writer.add_image('错误预测样本', wrong_img_grid, epoch)
                wrong_text = [f"真实: {label_name(wl)}, 预测: {label_name(wp)}"
                              for wl, wp in zip(wrong_labels, wrong_preds)]
                writer.add_text('错误预测标签', '\n'.join(wrong_text), epoch)

            # Step the scheduler on the test loss.
            if scheduler is not None:
                scheduler.step(epoch_test_loss)

            print(f"Epoch {epoch+1} 完成 | 训练损失: {epoch_train_loss:.4f} "
                  f"| 训练准确率: {epoch_train_acc:.2f}% | 测试准确率: {epoch_test_acc:.2f}%")
    finally:
        # Flush and close the event file even if training is interrupted.
        writer.close()
    return epoch_test_acc
# Entry point: fine-tune ResNet18 on CIFAR-10
def main():
    """Build the model, optimizer and scheduler, then run the freeze-then-finetune schedule."""
    # Hyperparameters
    epochs = 40            # total number of epochs
    freeze_epochs = 5      # epochs with the backbone frozen
    learning_rate = 1e-3   # initial learning rate
    weight_decay = 1e-4    # weight decay passed to Adam

    # ResNet18 with pretrained weights and a 10-way head.
    model = create_resnet18(pretrained=True, num_classes=10)

    # Optimizer and loss.
    optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
    criterion = nn.CrossEntropyLoss()

    # Halve the learning rate when the monitored loss stops improving for
    # more than 2 epochs. The deprecated ``verbose`` flag is not passed; the
    # learning rate is logged to TensorBoard on every training step instead.
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode='min', factor=0.5, patience=2
    )

    # Train: backbone frozen for the first ``freeze_epochs`` epochs, then unfrozen.
    final_accuracy = train_with_freeze_schedule(
        model=model,
        train_loader=train_loader,
        test_loader=test_loader,
        criterion=criterion,
        optimizer=optimizer,
        scheduler=scheduler,
        device=device,
        epochs=epochs,
        freeze_epochs=freeze_epochs
    )
    print(f"训练完成!最终测试准确率: {final_accuracy:.2f}%")

    # Optional: save the fine-tuned weights.
    # torch.save(model.state_dict(), 'resnet18_cifar10_finetuned.pth')
    # print("模型已保存至: resnet18_cifar10_finetuned.pth")
# Run the training only when the file is executed as a script.
if __name__ == "__main__":
    main()
