pythonday50

作业:

1.好好理解下resnet18的模型结构

2.尝试对vgg16+cbam进行微调策略

复制代码
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
from torchvision import models
from torch.utils.data import DataLoader
import time
import copy
 
# Check for CUDA availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
 
# --- CBAM Module Implementation ---
class ChannelAttention(nn.Module):
    def __init__(self, in_planes, ratio=16):
        super(ChannelAttention, self).__init__()
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        self.max_pool = nn.AdaptiveMaxPool2d(1)
 
        self.fc = nn.Sequential(
            nn.Conv2d(in_planes, in_planes // ratio, 1, bias=False),
            nn.ReLU(),
            nn.Conv2d(in_planes // ratio, in_planes, 1, bias=False)
        )
        self.sigmoid = nn.Sigmoid()
 
    def forward(self, x):
        avg_out = self.fc(self.avg_pool(x))
        max_out = self.fc(self.max_pool(x))
        out = avg_out + max_out
        return self.sigmoid(out)
 
class SpatialAttention(nn.Module):
    def __init__(self, kernel_size=7):
        super(SpatialAttention, self).__init__()
        assert kernel_size in (3, 7), 'kernel size must be 3 or 7'
        padding = 3 if kernel_size == 7 else 1
 
        self.conv1 = nn.Conv2d(2, 1, kernel_size, padding=padding, bias=False)
        self.sigmoid = nn.Sigmoid()
 
    def forward(self, x):
        avg_out = torch.mean(x, dim=1, keepdim=True)
        max_out, _ = torch.max(x, dim=1, keepdim=True)
        x_cat = torch.cat([avg_out, max_out], dim=1)
        x_att = self.conv1(x_cat)
        return self.sigmoid(x_att)
 
class CBAM(nn.Module):
    def __init__(self, in_planes, ratio=16, kernel_size=7):
        super(CBAM, self).__init__()
        self.ca = ChannelAttention(in_planes, ratio)
        self.sa = SpatialAttention(kernel_size)
 
    def forward(self, x):
        x = x * self.ca(x)  # Apply channel attention
        x = x * self.sa(x)  # Apply spatial attention
        return x
 
# --- VGG16 with CBAM Model ---
class VGG16_CBAM(nn.Module):
    def __init__(self, num_classes=10, pretrained=True):
        super(VGG16_CBAM, self).__init__()
        vgg_base = models.vgg16(weights=models.VGG16_Weights.IMAGENET1K_V1 if pretrained else None)
        
        self.features = vgg_base.features
        
        # CBAM module after feature extraction
        # VGG16 features output 512 channels
        self.cbam = CBAM(in_planes=512) 
        
        # Adaptive average pooling to get a fixed size output (e.g., 7x7 for original VGG classifier, or 1x1)
        # VGG's original avgpool is AdaptiveAvgPool2d(output_size=(7, 7))
        # If we keep this, input to classifier is 512 * 7 * 7
        self.avgpool = vgg_base.avgpool 
        # Original VGG classifier:
        # (0): Linear(in_features=25088, out_features=4096, bias=True)
        # (1): ReLU(inplace=True)
        # (2): Dropout(p=0.5, inplace=False)
        # (3): Linear(in_features=4096, out_features=4096, bias=True)
        # (4): ReLU(inplace=True)
        # (5): Dropout(p=0.5, inplace=False)
        # (6): Linear(in_features=4096, out_features=1000, bias=True)
        
        # Modify the last layer of the classifier for the new number of classes
        # For CIFAR-10, num_classes = 10
        # The input to the classifier is 512 * 7 * 7 = 25088
        # Or if we change avgpool to nn.AdaptiveAvgPool2d((1,1)), then input is 512
        
        # Option 1: Keep original avgpool, input to classifier is 25088
        num_ftrs = vgg_base.classifier[0].in_features # Should be 25088
        
        # Option 2: Adapt for smaller feature map (e.g. 1x1 output from avgpool)
        # self.avgpool = nn.AdaptiveAvgPool2d((1,1)) # Output 512x1x1
        # num_ftrs = 512
        self.classifier = nn.Sequential(
            nn.Linear(num_ftrs, 4096),
            nn.ReLU(True),
            nn.Dropout(p=0.5),
            nn.Linear(4096, 4096),
            nn.ReLU(True),
            nn.Dropout(p=0.5),
            nn.Linear(4096, num_classes)
        )
        # Initialize weights of the new classifier (optional, often helps)
        for m in self.classifier.modules():
            if isinstance(m, nn.Linear):
                nn.init.xavier_uniform_(m.weight)
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)
    def forward(self, x):
        x = self.features(x)
        x = self.cbam(x)
        x = self.avgpool(x)
        x = torch.flatten(x, 1)
        x = self.classifier(x)
        return x
# --- Data Preparation ---
# CIFAR-10 specific transforms
# VGG expects 224x224 images
transform_train = transforms.Compose([
    transforms.Resize(256),
    transforms.RandomResizedCrop(224),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)), # CIFAR-10 stats
])
transform_test = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
])
trainset = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=transform_train)
trainloader = DataLoader(trainset, batch_size=32, shuffle=True, num_workers=4, pin_memory=True) # Adjust batch_size based on GPU VRAM
testset = torchvision.datasets.CIFAR10(root='./data', train=False, download=True, transform=transform_test)
testloader = DataLoader(testset, batch_size=64, shuffle=False, num_workers=4, pin_memory=True)
classes = ('plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck')
num_classes = len(classes)
# --- Training and Evaluation Functions ---
def train_model_epochs(model, criterion, optimizer, dataloader, num_epochs=10, accumulation_steps=1):
    model.train()
    scaler = torch.cuda.amp.GradScaler(enabled=torch.cuda.is_available()) # For mixed precision
    for epoch in range(num_epochs):
        running_loss = 0.0
        running_corrects = 0
        total_samples = 0
        
        optimizer.zero_grad() # Zero out gradients before starting accumulation for an effective batch
        for i, (inputs, labels) in enumerate(dataloader):
            inputs, labels = inputs.to(device), labels.to(device)
            with torch.cuda.amp.autocast(enabled=torch.cuda.is_available()): # Mixed precision context
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                loss = loss / accumulation_steps # Scale loss
            scaler.scale(loss).backward() # Scale loss and call backward
            if (i + 1) % accumulation_steps == 0 or (i + 1) == len(dataloader):
                scaler.step(optimizer) # Perform optimizer step
                scaler.update() # Update scaler
                optimizer.zero_grad() # Zero out gradients for the next effective batch
            _, preds = torch.max(outputs, 1)
            running_loss += loss.item() * inputs.size(0) * accumulation_steps # Unscale loss for logging
            running_corrects += torch.sum(preds == labels.data)
            total_samples += inputs.size(0)
            if (i + 1) % 100 == 0: # Log every 100 mini-batches
                print(f'Epoch [{epoch+1}/{num_epochs}], Batch [{i+1}/{len(dataloader)}], Loss: {loss.item()*accumulation_steps:.4f}')
        epoch_loss = running_loss / total_samples
        epoch_acc = running_corrects.double() / total_samples
        print(f'Epoch {epoch+1}/{num_epochs} - Train Loss: {epoch_loss:.4f}, Acc: {epoch_acc:.4f}')
    return model
def evaluate_model(model, criterion, dataloader):
    model.eval()
    running_loss = 0.0
    running_corrects = 0
    total_samples = 0
    
    with torch.no_grad():
        for inputs, labels in dataloader:
            inputs, labels = inputs.to(device), labels.to(device)
            with torch.cuda.amp.autocast(enabled=torch.cuda.is_available()):
                 outputs = model(inputs)
                 loss = criterion(outputs, labels)
            
            _, preds = torch.max(outputs, 1)
            running_loss += loss.item() * inputs.size(0)
            running_corrects += torch.sum(preds == labels.data)
            total_samples += inputs.size(0)
            
    epoch_loss = running_loss / total_samples
    epoch_acc = running_corrects.double() / total_samples
    print(f'Test Loss: {epoch_loss:.4f}, Acc: {epoch_acc:.4f}\n')
    return epoch_acc
# --- Fine-tuning Strategy Implementation ---
model_vgg_cbam = VGG16_CBAM(num_classes=num_classes, pretrained=True).to(device)
criterion = nn.CrossEntropyLoss()
accumulation_steps = 2 # Simulate larger batch size: 32*2 = 64 effective
# **Phase 1: Train only the CBAM and the new classifier**
print("--- Phase 1: Training CBAM and Classifier ---")
# Freeze all layers in features
for param in model_vgg_cbam.features.parameters():
    param.requires_grad = False
# Ensure CBAM and classifier parameters are trainable
for param in model_vgg_cbam.cbam.parameters():
    param.requires_grad = True
for param in model_vgg_cbam.classifier.parameters():
    param.requires_grad = True
# Collect parameters to optimize for phase 1
params_to_optimize_phase1 = []
for name, param in model_vgg_cbam.named_parameters():
    if param.requires_grad:
        params_to_optimize_phase1.append(param)
        print(f"Phase 1 optimizing: {name}")
optimizer_phase1 = optim.AdamW(params_to_optimize_phase1, lr=1e-3, weight_decay=1e-4)
# scheduler_phase1 = optim.lr_scheduler.StepLR(optimizer_phase1, step_size=5, gamma=0.1) # Optional scheduler
# Reduced epochs for quicker demonstration, increase for better results
epochs_phase1 = 10 # e.g., 10-15 epochs
model_vgg_cbam = train_model_epochs(model_vgg_cbam, criterion, optimizer_phase1, trainloader, num_epochs=epochs_phase1, accumulation_steps=accumulation_steps)
evaluate_model(model_vgg_cbam, criterion, testloader)
# **Phase 2: Unfreeze some later layers of the backbone (e.g., last VGG block) and train with a smaller LR**
print("\n--- Phase 2: Fine-tuning later backbone layers, CBAM, and Classifier ---")
# VGG16 features layers: total 31 layers (conv, relu, pool)
# Last block (conv5_1, relu, conv5_2, relu, conv5_3, relu, pool5) starts around index 24
# Unfreeze layers from index 24 onwards (last conv block)
# Note: VGG feature blocks end at indices: 4 (block1), 9 (block2), 16 (block3), 23 (block4), 30 (block5)
# Let's unfreeze block 5 (layers 24-30) and block 4 (layers 17-23)
unfreeze_from_layer_idx = 17 # Start of block4
for i, child in enumerate(model_vgg_cbam.features.children()):
    if i >= unfreeze_from_layer_idx:
        print(f"Phase 2 Unfreezing feature layer: {i} - {type(child)}")
        for param in child.parameters():
            param.requires_grad = True
    # else: # Keep earlier layers frozen
    #     for param in child.parameters():
    #         param.requires_grad = False 
            # This is already done from phase 1, but good to be explicit if starting from scratch
 
# Differential learning rates
# Backbone layers (newly unfrozen) get a smaller LR
# CBAM and classifier get a slightly larger LR (or same as backbone if preferred)
lr_backbone_phase2 = 1e-5
lr_head_phase2 = 5e-5 # CBAM and classifier
 
params_group_phase2 = [
    {'params': [], 'lr': lr_backbone_phase2, 'name': 'fine_tune_features'}, # For later backbone layers
    {'params': model_vgg_cbam.cbam.parameters(), 'lr': lr_head_phase2, 'name': 'cbam'},
    {'params': model_vgg_cbam.classifier.parameters(), 'lr': lr_head_phase2, 'name': 'classifier'}
]
 
# Add only newly unfrozen feature layers to the optimizer group
for i, child in enumerate(model_vgg_cbam.features.children()):
    if i >= unfreeze_from_layer_idx:
        params_group_phase2[0]['params'].extend(list(child.parameters()))
        print(f"Phase 2 optimizing feature layer: {i} with lr {lr_backbone_phase2}")
 
# Ensure early backbone layers are NOT in optimizer if they are frozen (param.requires_grad == False)
# The AdamW constructor below will only consider params with requires_grad=True from the list
optimizer_phase2 = optim.AdamW(
    [p for p_group in params_group_phase2 for p in p_group['params'] if p.requires_grad], 
    lr=lr_head_phase2 # Default LR, overridden by group LRs
)
# More explicit group definition:
optimizer_phase2 = optim.AdamW([
    {'params': [p for p in params_group_phase2[0]['params'] if p.requires_grad], 'lr': lr_backbone_phase2},
    {'params': [p for p in params_group_phase2[1]['params'] if p.requires_grad], 'lr': lr_head_phase2},
    {'params': [p for p in params_group_phase2[2]['params'] if p.requires_grad], 'lr': lr_head_phase2}
], weight_decay=1e-4)
 
 
epochs_phase2 = 15 # e.g., 15-20 epochs
model_vgg_cbam = train_model_epochs(model_vgg_cbam, criterion, optimizer_phase2, trainloader, num_epochs=epochs_phase2, accumulation_steps=accumulation_steps)
evaluate_model(model_vgg_cbam, criterion, testloader)
 
 
# **Phase 3: Unfreeze all layers and train with a very small learning rate**
print("\n--- Phase 3: Fine-tuning all layers with very small LR ---")
for param in model_vgg_cbam.features.parameters():
    param.requires_grad = True # Unfreeze all feature layers
 
# Single very small learning rate for all parameters
lr_phase3 = 2e-6 
 
# Or differential (earlier layers even smaller)
params_group_phase3 = [
    {'params': list(model_vgg_cbam.features[:unfreeze_from_layer_idx].parameters()), 'lr': lr_phase3 * 0.1, 'name':'early_features'}, # Earlier backbone layers
    {'params': list(model_vgg_cbam.features[unfreeze_from_layer_idx:].parameters()), 'lr': lr_phase3, 'name':'later_features'}, # Later backbone layers
    {'params': model_vgg_cbam.cbam.parameters(), 'lr': lr_phase3 * 2, 'name':'cbam'}, # CBAM slightly higher
    {'params': model_vgg_cbam.classifier.parameters(), 'lr': lr_phase3 * 2, 'name':'classifier'} # Classifier slightly higher
]
optimizer_phase3 = optim.AdamW(params_group_phase3, weight_decay=1e-5) # default LR is not used here
 
# optimizer_phase3 = optim.AdamW(model_vgg_cbam.parameters(), lr=lr_phase3, weight_decay=1e-5) # Simpler: one LR for all
 
epochs_phase3 = 15 # e.g., 15-20 epochs
model_vgg_cbam = train_model_epochs(model_vgg_cbam, criterion, optimizer_phase3, trainloader, num_epochs=epochs_phase3, accumulation_steps=accumulation_steps)
evaluate_model(model_vgg_cbam, criterion, testloader)
 
print("Fine-tuning complete!")
 
# Save the final model (optional)
# torch.save(model_vgg_cbam.state_dict(), 'vgg16_cbam_cifar10_final.pth')

@浙大疏锦行

相关推荐
安替-AnTi16 小时前
厚朴 APK 搜索接口分析
python·apk·解析·taobao
山川湖海17 小时前
AI时代快速学编程语言的陷阱(以Python为例)
大数据·人工智能·python
H Journey17 小时前
Supervisor 进程管理工具介绍
python·supervisor·linux 运维
z小猫不吃鱼17 小时前
13 Scaling Law 入门:模型规模、数据规模和计算量是什么关系?
人工智能·深度学习·机器学习
三无推导17 小时前
ComfyUI 安装部署教程:Windows 下快速搭建可视化 AI 绘图工作流,零基础也能跑通
人工智能·pytorch·windows·stable diffusion·aigc·ai绘画·持续部署
春日见17 小时前
5分钟入门强化学习之动态规划算法与实现
大数据·人工智能·python·算法·机器学习·计算机视觉
DeniuHe18 小时前
sklearn 中所有交叉验证数据集划分方式完整总结
人工智能·python·sklearn
DeniuHe18 小时前
sklearn中不同交叉验证方法的场景适配
人工智能·python·sklearn
知识浅谈18 小时前
Transformer 中的 Q、K、V 到底是什么?怎么理解 Query、Key、Value?
人工智能·深度学习·transformer
人工智能培训18 小时前
设备故障?数字孪生提前预警
人工智能·深度学习·神经网络·机器学习·生成对抗网络