TensorBoard 是 TensorFlow/PyTorch 官方的可视化工具,核心作用是实时监控训练过程、分析模型性能、可视化模型结构,通过网页端交互展示训练数据,比单纯打印日志更直观。
python
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import models, datasets, transforms
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter # TensorBoard核心类
import numpy as np
import matplotlib.pyplot as plt
# ===================== 1. Initialize the TensorBoard writer =====================
# Log directory (auto-created). NOTE(review): the original comment suggests
# timestamping the run name to avoid overwriting — this path is static, so
# re-runs append events into the same directory.
log_dir = "./runs/cifar10_resnet18_experiment"
writer = SummaryWriter(log_dir=log_dir)  # central object through which all logs are written
# ===================== 2. Basic configuration =====================
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
batch_size = 64
lr = 0.001        # learning rate — only the new fc head is optimized (see section 5)
epochs = 5
num_classes = 10  # CIFAR-10 class count
classes = ['airplane', 'automobile', 'bird', 'cat', 'deer',
           'dog', 'frog', 'horse', 'ship', 'truck']
# ===================== 3. Data preprocessing =====================
# Upscale 32x32 CIFAR images to 224x224 and normalize with ImageNet statistics,
# matching the input distribution the pretrained ResNet-18 was trained on.
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225])
])
train_dataset = datasets.CIFAR10(root='./data', train=True, download=True, transform=transform)
test_dataset = datasets.CIFAR10(root='./data', train=False, transform=transform)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
# ===================== 4. Load the pretrained model =====================
# NOTE(review): `pretrained=True` is deprecated in recent torchvision in favor
# of `weights=models.ResNet18_Weights.DEFAULT` — confirm installed version
# before migrating.
model = models.resnet18(pretrained=True)
# Freeze the convolutional backbone (feature extraction only).
for param in model.parameters():
    param.requires_grad = False
# Replace the final classifier with a fresh, trainable 10-way head.
in_features = model.fc.in_features
model.fc = nn.Linear(in_features, num_classes)
model = model.to(device)
# ===================== 5. Loss function & optimizer =====================
criterion = nn.CrossEntropyLoss()
# Only the new fc head's parameters are handed to the optimizer.
optimizer = optim.Adam(model.fc.parameters(), lr=lr)
# ===================== 6. 辅助函数:反归一化显示图像 =====================
def denormalize(img_tensor):
"""还原归一化的图像,用于TensorBoard显示"""
mean = np.array([0.485, 0.456, 0.406])
std = np.array([0.229, 0.224, 0.225])
img = img_tensor.cpu().numpy().transpose((1, 2, 0)) # (C,H,W)→(H,W,C)
img = img * std + mean
img = np.clip(img, 0, 1) # 限制在0-1之间
return img
# ===================== 7. Training with TensorBoard logging =====================
def train_and_log():
    """Fine-tune the fc head and stream everything to TensorBoard.

    Records: the model graph, a strip of sample images, per-batch training
    loss, per-epoch train/test loss and accuracy curves, and histograms of
    the fc layer's parameters and gradients. Uses the module-level globals
    (writer, model, loaders, criterion, optimizer, device, epochs).
    """
    step = 0  # batch counter, accumulated across every epoch

    # Model graph: add_graph needs one example input of the right shape.
    example = torch.randn(1, 3, 224, 224).to(device)
    writer.add_graph(model, example)

    # Sample images: de-normalize 8 training images and join them side by side.
    images, _ = next(iter(train_loader))
    strip = np.concatenate([denormalize(images[i]) for i in range(8)], axis=1)
    writer.add_image('CIFAR10_Samples', strip.transpose((2, 0, 1)), global_step=0)  # HWC -> CHW

    for epoch in range(1, epochs + 1):
        model.train()
        running_loss = 0.0
        n_correct = 0
        n_seen = 0
        for data, target in train_loader:
            data, target = data.to(device), target.to(device)
            optimizer.zero_grad()
            logits = model(data)
            batch_loss = criterion(logits, target)
            batch_loss.backward()
            optimizer.step()

            # Accumulate epoch statistics.
            running_loss += batch_loss.item()
            n_correct += logits.argmax(dim=1).eq(target).sum().item()
            n_seen += target.size(0)

            # Per-batch scalar: raw training loss.
            writer.add_scalar('Train/Batch_Loss', batch_loss.item(), step)
            step += 1

        # Per-epoch scalars: averaged loss / accuracy on train and test sets.
        avg_train_loss = running_loss / len(train_loader)
        train_acc = 100.0 * n_correct / n_seen
        test_loss, test_acc = test(model, test_loader, criterion, device)
        writer.add_scalars('Loss', {
            'Train': avg_train_loss,
            'Test': test_loss
        }, epoch)
        writer.add_scalars('Accuracy', {
            'Train': train_acc,
            'Test': test_acc
        }, epoch)

        # Histograms: fc-layer weights and their gradients only, to keep logs small.
        for name, param in model.named_parameters():
            if 'fc' in name:
                writer.add_histogram(f'Params/{name}', param, epoch)
                writer.add_histogram(f'Grads/{name}', param.grad, epoch)

        print(f"Epoch {epoch:2d} | "
              f"Train Loss: {avg_train_loss:.4f} | Train Acc: {train_acc:.2f}% | "
              f"Test Loss: {test_loss:.4f} | Test Acc: {test_acc:.2f}%")

    # Close the writer so buffered events are flushed to disk.
    writer.close()
# ===================== 8. Evaluation =====================
def test(model, loader, criterion, device):
    """Evaluate *model* on *loader*; return (mean per-batch loss, accuracy %)."""
    model.eval()
    loss_sum = 0.0
    n_correct = 0
    with torch.no_grad():
        for inputs, labels in loader:
            inputs, labels = inputs.to(device), labels.to(device)
            logits = model(inputs)
            loss_sum += criterion(logits, labels).item()
            n_correct += (logits.argmax(dim=1) == labels).sum().item()
    return loss_sum / len(loader), 100.0 * n_correct / len(loader.dataset)
# ===================== 9. Run training =====================
if __name__ == "__main__":
    train_and_log()
    # User-facing messages (Chinese): "Training finished! TensorBoard logs
    # saved to: <log_dir>" and "Launch command: tensorboard --logdir=...".
    print(f"\n训练完成!TensorBoard日志已保存至:{log_dir}")
    print("启动命令:tensorboard --logdir=./runs/cifar10_resnet18_experiment")
| 操作方法 | 作用 | 网页端对应标签 |
|---|---|---|
| `writer.add_scalar()` | 记录标量(损失、准确率) | Scalars |
| `writer.add_scalars()` | 同时记录多个标量(对比训练 / 测试曲线) | Scalars |
| `writer.add_image()` | 记录单张 / 网格图像(数据集 / 预测结果) | Images |
| `writer.add_graph()` | 可视化模型计算图 | Graphs |
| `writer.add_histogram()` | 记录参数 / 梯度的分布(直方图) | Histograms |