DAY45 Introduction to Using TensorBoard

@浙大疏锦行

python
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms, models
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
from torchvision.utils import make_grid
import matplotlib.pyplot as plt
import numpy as np
import os
from datetime import datetime

# Set random seeds for reproducibility
torch.manual_seed(42)
np.random.seed(42)

# Device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Data preprocessing and augmentation (the normalization constants are the
# commonly used CIFAR-10 per-channel means and standard deviations)
transform_train = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
])

transform_test = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
])

# Load the CIFAR-10 dataset
train_dataset = datasets.CIFAR10(
    root='./data', 
    train=True, 
    download=True, 
    transform=transform_train
)
test_dataset = datasets.CIFAR10(
    root='./data', 
    train=False, 
    download=True, 
    transform=transform_test
)

train_loader = DataLoader(
    train_dataset, 
    batch_size=128, 
    shuffle=True, 
    num_workers=2
)
test_loader = DataLoader(
    test_dataset, 
    batch_size=100, 
    shuffle=False, 
    num_workers=2
)

# Class names
classes = ('plane', 'car', 'bird', 'cat', 'deer', 
           'dog', 'frog', 'horse', 'ship', 'truck')

# Create a TensorBoard SummaryWriter
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
log_dir = f'runs/resnet18_cifar10_{timestamp}'
writer = SummaryWriter(log_dir=log_dir)
print(f"TensorBoard log directory: {log_dir}")

# Create a ResNet18 model (using pretrained ImageNet weights)
def create_resnet18_finetune(num_classes=10, freeze_backbone=True):
    """
    Create a ResNet18 model for fine-tuning
    
    Args:
        num_classes: number of output classes
        freeze_backbone: whether to freeze the backbone (freeze for the first
            few epochs, then unfreeze)
    """
    # Load pretrained ResNet18
    model = models.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1)
    
    # Freeze all backbone parameters (at the start of fine-tuning)
    if freeze_backbone:
        for param in model.parameters():
            param.requires_grad = False
    
    # Replace the final fully connected layer for CIFAR-10
    num_features = model.fc.in_features
    model.fc = nn.Sequential(
        nn.Dropout(0.5),
        nn.Linear(num_features, 256),
        nn.ReLU(),
        nn.Dropout(0.3),
        nn.Linear(256, num_classes)
    )
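    # Note: the parameters of this newly created head default to
    # requires_grad=True, so in stage 1 only the head is trained.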
    
    return model

# Training function
def train_epoch(model, train_loader, criterion, optimizer, epoch, device, writer=None):
    """Train the model for one epoch"""
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0
    
    for batch_idx, (inputs, targets) in enumerate(train_loader):
        inputs, targets = inputs.to(device), targets.to(device)
        
        # Forward pass
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        
        # Backward pass
        loss.backward()
        optimizer.step()
        
        # Statistics
        running_loss += loss.item()
        _, predicted = outputs.max(1)
        total += targets.size(0)
        correct += predicted.eq(targets).sum().item()
        
        # Print progress every 100 batches
        if (batch_idx + 1) % 100 == 0:
            batch_loss = loss.item()
            batch_acc = 100. * predicted.eq(targets).sum().item() / targets.size(0)
            print(f'Epoch: {epoch+1}, Batch: {batch_idx+1}/{len(train_loader)}, '
                  f'Loss: {batch_loss:.4f}, Acc: {batch_acc:.2f}%')
            
            # Log the learning rate at this global step
            if writer:
                writer.add_scalar('Learning Rate', optimizer.param_groups[0]['lr'], 
                                epoch * len(train_loader) + batch_idx)
    
    # Compute metrics over the whole epoch
    epoch_loss = running_loss / len(train_loader)
    epoch_acc = 100. * correct / total
    
    # Log to TensorBoard
    if writer:
        writer.add_scalar('Loss/Train', epoch_loss, epoch)
        writer.add_scalar('Accuracy/Train', epoch_acc, epoch)
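        # Optional sketch (not in the original script): per-epoch weight
        # histograms can be logged here to watch parameter drift, e.g.
        #     for name, param in model.named_parameters():
        #         writer.add_histogram(f'Weights/{name}', param, epoch)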
    
    return epoch_loss, epoch_acc

# Evaluation function
def test_epoch(model, test_loader, criterion, epoch, device, writer=None):
    """Evaluate the model on the test set"""
    model.eval()
    test_loss = 0.0
    correct = 0
    total = 0
    
    with torch.no_grad():
        for inputs, targets in test_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            
            test_loss += loss.item()
            _, predicted = outputs.max(1)
            total += targets.size(0)
            correct += predicted.eq(targets).sum().item()
    
    # Compute overall metrics
    test_loss = test_loss / len(test_loader)
    test_acc = 100. * correct / total
    
    # Log to TensorBoard
    if writer:
        writer.add_scalar('Loss/Test', test_loss, epoch)
        writer.add_scalar('Accuracy/Test', test_acc, epoch)
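        # Optional sketch (not in the original script): add_scalars overlays
        # several named series on a single chart, e.g.
        #     writer.add_scalars('Accuracy/combined', {'test': test_acc}, epoch)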
    
    return test_loss, test_acc

# Main training loop
def train_resnet18_finetune(epochs=30, lr=0.001, unfreeze_epoch=5):
    """
    Fine-tune ResNet18
    
    Args:
        epochs: total number of training epochs
        lr: initial learning rate
        unfreeze_epoch: epoch at which to unfreeze the backbone
    """
    print("=" * 50)
    print("Starting ResNet18 fine-tuning")
    print("=" * 50)
    
    # Create the model
    model = create_resnet18_finetune(num_classes=10, freeze_backbone=True)
    model = model.to(device)
    
    # Print parameter counts
    total_params = sum(p.numel() for p in model.parameters())
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f"Total parameters: {total_params:,}")
    print(f"Trainable parameters: {trainable_params:,}")
    
    # Loss function and optimizer
    criterion = nn.CrossEntropyLoss()
    
    # Stage 1: train only the final fully connected layer
    # (the optimizer and scheduler are replaced when the backbone is unfrozen)
    optimizer = optim.AdamW(model.fc.parameters(), lr=lr, weight_decay=1e-4)
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs)
    
    best_acc = 0.0
    
    for epoch in range(epochs):
        print(f"\nEpoch {epoch+1}/{epochs}")
        print("-" * 30)
        
        # Unfreeze the backbone at the specified epoch
        if epoch == unfreeze_epoch:
            print("Unfreezing backbone, fine-tuning all layers...")
            for param in model.parameters():
                param.requires_grad = True
            
            # Recreate the optimizer over all parameters
            optimizer = optim.AdamW(
                model.parameters(), 
                lr=lr/10,  # use a smaller learning rate after unfreezing
                weight_decay=1e-4
            )
            scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs-unfreeze_epoch)
            
            # Update the trainable parameter count
            trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
            print(f"Trainable parameters after unfreezing: {trainable_params:,}")
        
        # Train
        train_loss, train_acc = train_epoch(
            model, train_loader, criterion, optimizer, 
            epoch, device, writer
        )
        
        # Evaluate
        test_loss, test_acc = test_epoch(
            model, test_loader, criterion, epoch, 
            device, writer
        )
        
        # Step the learning rate scheduler
        scheduler.step()
        
        # Print results
        print(f"Train: Loss: {train_loss:.4f}, Acc: {train_acc:.2f}%")
        print(f"Test:  Loss: {test_loss:.4f}, Acc: {test_acc:.2f}%")
        print(f"Current learning rate: {optimizer.param_groups[0]['lr']:.6f}")
        
        # Save the best model
        if test_acc > best_acc:
            best_acc = test_acc
            torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'test_acc': test_acc,
                'train_acc': train_acc,
            }, 'models/best_resnet18_cifar10.pth')
            print(f"Saved new best model, accuracy: {test_acc:.2f}%")
    
    print("\n" + "=" * 50)
    print(f"训练完成!最佳测试准确率: {best_acc:.2f}%")
    print("=" * 50)
    
    return model, best_acc

# Visualization function
def visualize_results(model, test_loader, writer, num_images=8):
    """Visualize some test images and their predictions in TensorBoard"""
    model.eval()
    dataiter = iter(test_loader)
    images, labels = next(dataiter)
    
    # Keep only the first num_images images
    images = images[:num_images]
    labels = labels[:num_images]
    images = images.to(device)
    
    with torch.no_grad():
        outputs = model(images)
        _, predicted = torch.max(outputs, 1)
    
    # Add the images to TensorBoard
    # De-normalize for display
    mean = torch.tensor([0.4914, 0.4822, 0.4465]).view(3, 1, 1)
    std = torch.tensor([0.2023, 0.1994, 0.2010]).view(3, 1, 1)
    images_denorm = images.cpu() * std + mean
    images_denorm = torch.clamp(images_denorm, 0, 1)
    
    # Create an image grid
    img_grid = make_grid(images_denorm, nrow=4, normalize=False)
    
    # Add to TensorBoard
    writer.add_image('Test Images/Predictions', img_grid, 0)
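    # Optional sketch (not in the original script): annotated matplotlib
    # figures can be logged as well, e.g.
    #     fig = plt.figure()
    #     ...draw images with predicted/true labels as titles...
    #     writer.add_figure('Test Images/annotated', fig, 0)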
    
    # Print example predictions
    print("\nExample predictions:")
    for i in range(min(num_images, len(images))):
        pred_label = predicted[i].item()
        true_label = labels[i].item()
        is_correct = pred_label == true_label
        status = "✓" if is_correct else "✗"
        print(f"图片{i+1}: 真实={classes[true_label]:8s}, 预测={classes[pred_label]:8s} {status}")
    
    return images, labels, predicted

# Run training
if __name__ == "__main__":
    # Create the directory for saved model checkpoints
    os.makedirs('models', exist_ok=True)
    
    # Start training
    trained_model, best_acc = train_resnet18_finetune(
        epochs=30,  # adjust the number of epochs as needed
        lr=0.001,   # initial learning rate
        unfreeze_epoch=5  # unfreeze the backbone at epoch 5
    )
    
    # Visualize some results
    print("\nVisualizing predictions...")
    images, labels, predictions = visualize_results(trained_model, test_loader, writer, num_images=8)
    
    # Log the model graph
    print("Logging the model graph to TensorBoard...")
    dummy_input = torch.randn(1, 3, 32, 32).to(device)
    writer.add_graph(trained_model, dummy_input)
    
    # Log hyperparameters
    print("Logging hyperparameters to TensorBoard...")
    writer.add_hparams(
        {
            'lr': 0.001,
            'batch_size': 128,
            'epochs': 30,
            'unfreeze_epoch': 5,
            'weight_decay': 1e-4
        },
        {
            'best_accuracy': best_acc,
            'final_train_acc': 0,  # the actual final training accuracy could be recorded here
            'final_test_acc': 0    # the actual final test accuracy could be recorded here
        }
    )
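    # Note: add_hparams writes its own nested run inside log_dir, and the
    # results appear under the HPARAMS tab in the TensorBoard UI.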
    
    # Close the TensorBoard writer
    writer.close()
    
    print(f"\nTensorBoard日志保存在: {log_dir}")
    print("在终端运行以下命令查看结果:")
    print(f"tensorboard --logdir={log_dir}")
    print("\n或者指定端口:")
    print(f"tensorboard --logdir={log_dir} --port=6006")
    
    # Load the best model and run a final evaluation
    print("\nLoading best model for a final evaluation...")
    checkpoint = torch.load('models/best_resnet18_cifar10.pth', map_location=device)
    trained_model.load_state_dict(checkpoint['model_state_dict'])
    
    # Final evaluation
    test_loss, test_acc = test_epoch(
        trained_model, test_loader, 
        nn.CrossEntropyLoss(), 0, device
    )
    print(f"最佳模型测试准确率: {test_acc:.2f}%")

