day48 python通道注意力

一、注意力机制的探索与理解

在深度学习的旅程中，我们已经接触过Transformer框架中的自注意力机制，它通过让模型学会"选择性关注重要信息"，极大地提升了模型对数据特征的提取能力。从one-hot编码到ELMo，再到自注意力机制的出现，特征提取的方式不断进化，而注意力机制正是这一进化过程中的关键里程碑。

注意力机制的本质是对输入特征进行加权求和，其输出为输入特征与注意力权重的乘积之和。它与卷积操作相似，二者都是对特征进行加权；区别在于，卷积是"固定权重"的特征提取（权重在训练完成后不再随输入变化），而注意力机制则是"动态权重"的特征提取，权重会随着输入数据的不同而变化。这种动态性使得注意力机制能够更好地适应不同的输入，从而更精准地提取关键信息。

在实际应用中，我们经常会遇到多种注意力模块，如通道注意力、空间注意力、多头注意力等。这些模块并非万能，而是针对不同的场景和需求而设计。例如，通道注意力可以强化关键通道的特征，空间注意力则能聚焦于物体所在区域，忽略背景。这种多样化的模块设计，使得我们能够根据具体任务的特点，选择最合适的注意力机制，从而提高模型的性能。

二、CNN模型的训练与特征图可视化

在今天的实验中，我们首先回顾了CNN模型的训练过程。通过使用PyTorch框架，我们构建了一个简单的CNN模型，并在CIFAR-10数据集上进行了训练。以下是CNN模型的代码实现：

python 复制代码

class CNN(nn.Module):
    """Three-block convolutional classifier that outputs 10 class logits.

    Each block is Conv2d(3x3, padding=1) -> BatchNorm2d -> ReLU -> MaxPool2d(2),
    widening the channels 3 -> 32 -> 64 -> 128 while halving the spatial size.
    The head is Linear(128*4*4, 512) -> ReLU -> Dropout(0.5) -> Linear(512, 10),
    so the input must be 3-channel with a 32x32 spatial size (32 / 2**3 = 4).
    """

    def __init__(self):
        super().__init__()
        # Convolution block 1: 3 -> 32 channels
        self.conv1 = nn.Conv2d(3, 32, 3, padding=1)
        self.bn1 = nn.BatchNorm2d(32)
        self.relu1 = nn.ReLU()
        self.pool1 = nn.MaxPool2d(2, 2)

        # Convolution block 2: 32 -> 64 channels
        self.conv2 = nn.Conv2d(32, 64, 3, padding=1)
        self.bn2 = nn.BatchNorm2d(64)
        self.relu2 = nn.ReLU()
        self.pool2 = nn.MaxPool2d(2)

        # Convolution block 3: 64 -> 128 channels
        self.conv3 = nn.Conv2d(64, 128, 3, padding=1)
        self.bn3 = nn.BatchNorm2d(128)
        self.relu3 = nn.ReLU()
        self.pool3 = nn.MaxPool2d(2)

        # Fully connected head
        self.fc1 = nn.Linear(128 * 4 * 4, 512)
        # Dedicated activation for the head. Reusing relu3 here would make any
        # forward hook registered on relu3 fire twice per forward pass, the
        # second time with the (batch, 512) head activation instead of the
        # block-3 feature map. nn.ReLU has no parameters, so state_dict keys
        # are unaffected.
        self.relu_fc = nn.ReLU()
        self.dropout = nn.Dropout(p=0.5)
        self.fc2 = nn.Linear(512, 10)

    def forward(self, x):
        """Return class logits of shape (batch, 10) for a (batch, 3, 32, 32) input."""
        # Convolution block 1
        x = self.pool1(self.relu1(self.bn1(self.conv1(x))))

        # Convolution block 2
        x = self.pool2(self.relu2(self.bn2(self.conv2(x))))

        # Convolution block 3
        x = self.pool3(self.relu3(self.bn3(self.conv3(x))))

        # Flatten per sample. Keeping the batch dimension fixed means a
        # wrongly sized input fails loudly in fc1 instead of being silently
        # folded into extra batch rows, as view(-1, 128 * 4 * 4) would do.
        x = x.flatten(1)
        x = self.dropout(self.relu_fc(self.fc1(x)))
        return self.fc2(x)

在训练过程中，我们采用了数据增强技术，如随机裁剪、水平翻转、颜色抖动和旋转等，以提高模型的泛化能力。同时，我们还使用了学习率调度器，根据测试损失动态调整学习率，以确保模型能够更快地收敛。以下是训练代码的核心部分：

python 复制代码

def train(model, train_loader, test_loader, criterion, optimizer, scheduler, device, epochs):
    """Train ``model`` for ``epochs`` epochs, evaluating on ``test_loader`` after each.

    Args:
        model: Module to optimise; called as ``model(data)``.
        train_loader: Iterable of ``(data, target)`` training batches.
        test_loader: Sized iterable of ``(data, target)`` evaluation batches.
        criterion: Loss callable taking ``(output, target)``.
        optimizer: Optimizer updating the model parameters.
        scheduler: Scheduler whose ``step`` is called with the mean test loss
            of the epoch (the ``ReduceLROnPlateau`` calling convention).
        device: Device the batches are moved to.
        epochs: Number of epochs to run.

    Prints one summary line per epoch. The model is left in eval mode after
    the last epoch.
    """
    for epoch in range(epochs):
        # Re-enter training mode at the start of EVERY epoch. The evaluation
        # pass below switches the model to eval mode, so calling train() only
        # once before the loop would run every epoch after the first with
        # Dropout disabled and BatchNorm statistics frozen.
        model.train()
        correct = 0
        total = 0
        for data, target in train_loader:
            data, target = data.to(device), target.to(device)
            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()
            _, predicted = output.max(1)
            total += target.size(0)
            correct += predicted.eq(target).sum().item()
        epoch_train_acc = 100. * correct / total

        # Evaluation phase
        model.eval()
        test_loss = 0
        correct_test = 0
        total_test = 0
        with torch.no_grad():
            for data, target in test_loader:
                data, target = data.to(device), target.to(device)
                output = model(data)
                test_loss += criterion(output, target).item()
                _, predicted = output.max(1)
                total_test += target.size(0)
                correct_test += predicted.eq(target).sum().item()
        epoch_test_loss = test_loss / len(test_loader)
        epoch_test_acc = 100. * correct_test / total_test

        # The scheduler is driven by the mean test loss of this epoch.
        scheduler.step(epoch_test_loss)
        print(f'Epoch {epoch+1}/{epochs} 完成 | 训练准确率: {epoch_train_acc:.2f}% | 测试准确率: {epoch_test_acc:.2f}%')

经过50个epoch的训练，模型的最终测试准确率达到了84.68%。这一结果表明，我们的CNN模型在CIFAR-10数据集上表现良好，能够较好地识别不同类别的图像。

在模型训练完成后，我们进一步进行了特征图的可视化。通过注册钩子函数，我们捕获了CNN模型中不同卷积层的输出特征图，并对这些特征图进行了可视化。以下是特征图可视化的代码：

python 复制代码

def visualize_feature_maps(model, test_loader, device, layer_names, num_images=3, num_channels=9):
    """Plot the input image next to the feature maps of selected layers.

    For up to ``num_images`` samples taken from the start of ``test_loader``,
    one figure is shown: the de-normalised input on the left, followed by one
    panel per entry of ``layer_names`` holding a grid of the first
    ``num_channels`` channels of that layer's output.

    Args:
        model: Module whose attributes named in ``layer_names`` are hooked.
        test_loader: Iterable of ``(images, labels)`` batches.
        device: Device used for the forward pass.
        layer_names: Attribute names on ``model`` whose outputs are captured.
        num_images: Maximum number of samples to visualise.
        num_channels: Maximum number of channels drawn per layer.
    """
    model.eval()
    class_names = ['飞机', '汽车', '鸟', '猫', '鹿', '狗', '青蛙', '马', '船', '卡车']

    # Collect batches until enough samples are available. Count the actual
    # batch sizes rather than relying on test_loader.batch_size, which can be
    # None (e.g. when a batch sampler is used).
    images_list, labels_list = [], []
    collected = 0
    for batch_images, batch_labels in test_loader:
        images_list.append(batch_images)
        labels_list.append(batch_labels)
        collected += batch_images.size(0)
        if collected >= num_images:
            break
    images = torch.cat(images_list, dim=0)[:num_images].to(device)
    labels = torch.cat(labels_list, dim=0)[:num_images]

    feature_maps = {}

    def make_hook(name):
        # Bind the layer name now so every hook writes to its own key.
        def capture(module, inputs, output):
            feature_maps[name] = output.cpu()
        return capture

    # Always remove the hooks, even if the forward pass fails; otherwise they
    # would stay attached to the model and fire on every later call.
    handles = []
    try:
        for name in layer_names:
            handles.append(getattr(model, name).register_forward_hook(make_hook(name)))
        with torch.no_grad():
            model(images)
    finally:
        for handle in handles:
            handle.remove()

    # Constants used to undo the input normalisation before display.
    std = np.array([0.2023, 0.1994, 0.2010]).reshape(1, 1, 3)
    mean = np.array([0.4914, 0.4822, 0.4465]).reshape(1, 1, 3)

    # Grid large enough for num_channels cells. Ceil division keeps every
    # channel inside its panel when num_channels is not rows * cols
    # (e.g. 10 -> 3 x 4 instead of 3 x 3 with one cell drawn out of bounds).
    num_rows = max(1, int(np.sqrt(num_channels)))
    num_cols = max(1, int(np.ceil(num_channels / num_rows)))
    num_layers = len(layer_names)

    # Iterate over the samples actually available, which may be fewer than
    # num_images when the loader is small.
    for img_idx in range(images.size(0)):
        img = images[img_idx].cpu().permute(1, 2, 0).numpy()
        img = np.clip(img * std + mean, 0, 1)

        # squeeze=False keeps `axes` indexable even when layer_names is empty.
        fig, axes = plt.subplots(1, num_layers + 1, figsize=(4 * (num_layers + 1), 4), squeeze=False)
        axes = axes[0]
        axes[0].imshow(img)
        axes[0].set_title(f'原始图像\n类别: {class_names[int(labels[img_idx])]}')
        axes[0].axis('off')

        for layer_idx, layer_name in enumerate(layer_names):
            fm = feature_maps[layer_name][img_idx][:num_channels]
            layer_ax = axes[layer_idx + 1]
            layer_ax.set_title(f'{layer_name}特征图')
            layer_ax.axis('off')
            for ch_idx, channel in enumerate(fm):
                # Fill the grid row by row, starting from the top-left cell.
                ax = layer_ax.inset_axes([(ch_idx % num_cols) / num_cols,
                                          (num_rows - 1 - ch_idx // num_cols) / num_rows,
                                          1 / num_cols, 1 / num_rows])
                ax.imshow(channel.numpy(), cmap='viridis')
                ax.set_title(f'通道 {ch_idx + 1}')
                ax.axis('off')
        plt.tight_layout()
        plt.show()

我们发现，随着卷积层的加深，特征图的语义信息逐渐变得抽象，从浅层的边缘、纹理特征，到深层的全局语义特征，模型逐渐学会了从图像中提取更有意义的信息。

三、通道注意力的引入与效果分析

为了进一步提升模型的性能，我们在CNN模型中引入了通道注意力机制。通道注意力模块的核心思想是通过全局平均池化和全连接层，学习通道间的依赖关系，并为每个通道分配一个权重。这些权重会根据通道的重要性进行调整，从而增强重要通道的特征，抑制不重要通道的特征。以下是通道注意力模块的代码实现：

python 复制代码

class ChannelAttention(nn.Module):
    """Channel attention gate built from global average pooling and a bottleneck MLP.

    The input is pooled to one value per channel, passed through
    Linear -> ReLU -> Linear -> Sigmoid, and the resulting per-channel weights
    in (0, 1) rescale the input.

    Args:
        in_channels: Number of channels of the input feature map.
        reduction_ratio: Factor by which the hidden layer is narrower than
            ``in_channels``. The hidden width is clamped to at least 1.
    """

    def __init__(self, in_channels, reduction_ratio=16):
        super().__init__()
        # Guard against in_channels < reduction_ratio: plain integer division
        # would give a zero-width hidden layer, so the gate would be a constant
        # sigmoid(0) = 0.5 for every channel and could never learn anything.
        hidden_channels = max(1, in_channels // reduction_ratio)
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        self.fc = nn.Sequential(
            nn.Linear(in_channels, hidden_channels, bias=False),
            nn.ReLU(inplace=True),
            nn.Linear(hidden_channels, in_channels, bias=False),
            nn.Sigmoid()
        )

    def forward(self, x):
        """Return ``x`` scaled per channel; the output has the same shape as ``x``."""
        batch_size, channels, _, _ = x.size()
        # (B, C, H, W) -> (B, C): one descriptor per channel
        pooled = self.avg_pool(x).view(batch_size, channels)
        # (B, C) -> (B, C, 1, 1) so the weights broadcast over H and W
        channel_weights = self.fc(pooled).view(batch_size, channels, 1, 1)
        return x * channel_weights

在引入通道注意力后，我们重新定义了CNN模型，并在每个卷积块的ReLU激活之后、池化层之前插入了通道注意力模块。以下是修改后的CNN模型代码：

python 复制代码

class CNN(nn.Module):
    """Three-block convolutional classifier with channel attention in every block.

    Each block is Conv2d(3x3, padding=1) -> BatchNorm2d -> ReLU ->
    ChannelAttention -> MaxPool2d(2), widening the channels 3 -> 32 -> 64 -> 128
    while halving the spatial size. The head is
    Linear(128*4*4, 512) -> ReLU -> Dropout(0.5) -> Linear(512, 10), so the
    input must be 3-channel with a 32x32 spatial size (32 / 2**3 = 4).
    """

    def __init__(self):
        super().__init__()
        # Convolution block 1: 3 -> 32 channels
        self.conv1 = nn.Conv2d(3, 32, 3, padding=1)
        self.bn1 = nn.BatchNorm2d(32)
        self.relu1 = nn.ReLU()
        self.ca1 = ChannelAttention(in_channels=32, reduction_ratio=16)
        self.pool1 = nn.MaxPool2d(2, 2)

        # Convolution block 2: 32 -> 64 channels
        self.conv2 = nn.Conv2d(32, 64, 3, padding=1)
        self.bn2 = nn.BatchNorm2d(64)
        self.relu2 = nn.ReLU()
        self.ca2 = ChannelAttention(in_channels=64, reduction_ratio=16)
        self.pool2 = nn.MaxPool2d(2)

        # Convolution block 3: 64 -> 128 channels
        self.conv3 = nn.Conv2d(64, 128, 3, padding=1)
        self.bn3 = nn.BatchNorm2d(128)
        self.relu3 = nn.ReLU()
        self.ca3 = ChannelAttention(in_channels=128, reduction_ratio=16)
        self.pool3 = nn.MaxPool2d(2)

        # Fully connected head
        self.fc1 = nn.Linear(128 * 4 * 4, 512)
        # Dedicated activation for the head. Reusing relu3 here would make any
        # forward hook registered on relu3 fire twice per forward pass, the
        # second time with the (batch, 512) head activation instead of the
        # block-3 feature map. nn.ReLU has no parameters, so state_dict keys
        # are unaffected.
        self.relu_fc = nn.ReLU()
        self.dropout = nn.Dropout(p=0.5)
        self.fc2 = nn.Linear(512, 10)

    def forward(self, x):
        """Return class logits of shape (batch, 10) for a (batch, 3, 32, 32) input."""
        # Convolution block 1 (attention is applied after ReLU, before pooling)
        x = self.relu1(self.bn1(self.conv1(x)))
        x = self.pool1(self.ca1(x))

        # Convolution block 2
        x = self.relu2(self.bn2(self.conv2(x)))
        x = self.pool2(self.ca2(x))

        # Convolution block 3
        x = self.relu3(self.bn3(self.conv3(x)))
        x = self.pool3(self.ca3(x))

        # Flatten per sample. Keeping the batch dimension fixed means a
        # wrongly sized input fails loudly in fc1 instead of being silently
        # folded into extra batch rows, as view(-1, 128 * 4 * 4) would do.
        x = x.flatten(1)
        x = self.dropout(self.relu_fc(self.fc1(x)))
        return self.fc2(x)

经过50个epoch的训练，模型的最终测试准确率提升到了85.38%。这一结果表明，通道注意力机制能够有效地提升模型对特征的提取能力，从而提高模型的分类准确率。

此外，我们还对引入通道注意力后的模型进行了可视化分析：对第三个卷积层（conv3）的输出，按平均激活值选出最高的三个通道，将其上采样到原图大小后，以热力图的形式叠加在原图上。以下是注意力热力图可视化的代码：

python 复制代码

def visualize_attention_map(model, test_loader, device, class_names, num_samples=3):
    """Overlay the strongest ``conv3`` activation channels on input images.

    For each of the first ``num_samples`` batches of ``test_loader``, the first
    image of the batch is shown next to up to three heat maps. Each heat map is
    one output channel of ``model.conv3``, chosen by highest mean activation,
    min-max normalised, upsampled to the image size and drawn
    semi-transparently over the image.

    Note: the maps are raw ``conv3`` activations, not the weights produced by
    an attention module.

    Args:
        model: Module exposing a ``conv3`` submodule.
        test_loader: Iterable of ``(images, labels)`` batches.
        device: Device used for the forward pass.
        class_names: Sequence mapping class indices to display names.
        num_samples: Number of batches (one figure each) to visualise.
    """
    # Imported once per call instead of once per plotted channel.
    from scipy.ndimage import zoom

    model.eval()
    # Constants used to undo the input normalisation before display.
    std = np.array([0.2023, 0.1994, 0.2010]).reshape(1, 1, 3)
    mean = np.array([0.4914, 0.4822, 0.4465]).reshape(1, 1, 3)

    with torch.no_grad():
        for i, (images, labels) in enumerate(test_loader):
            if i >= num_samples:
                break
            images, labels = images.to(device), labels.to(device)
            activation_maps = []

            def hook(module, inputs, output):
                activation_maps.append(output.cpu())

            hook_handle = model.conv3.register_forward_hook(hook)
            try:
                outputs = model(images)
            finally:
                # Detach the hook even if the forward pass raises, so it does
                # not stay attached to the model.
                hook_handle.remove()
            _, predicted = torch.max(outputs, 1)

            img = images[0].cpu().permute(1, 2, 0).numpy()
            img = np.clip(img * std + mean, 0, 1)

            # Feature map of the first image in the batch: (channels, h, w)
            feature_map = activation_maps[0][0]
            channel_weights = torch.mean(feature_map, dim=(1, 2))
            sorted_indices = torch.argsort(channel_weights, descending=True)
            # Never index past the number of channels actually available.
            top_k = min(3, feature_map.shape[0])

            fig, axes = plt.subplots(1, 4, figsize=(16, 4))
            axes[0].imshow(img)
            axes[0].set_title(f'原始图像\n真实: {class_names[int(labels[0])]}\n预测: {class_names[int(predicted[0])]}')
            axes[0].axis('off')
            for j in range(3):
                axes[j+1].axis('off')
                if j >= top_k:
                    continue
                channel_idx = int(sorted_indices[j])
                channel_map = feature_map[channel_idx].numpy()
                # Min-max normalise; the epsilon avoids 0/0 on a constant map.
                channel_map = (channel_map - channel_map.min()) / (channel_map.max() - channel_map.min() + 1e-8)
                # Upsample to the displayed image size rather than a fixed 32.
                heatmap = zoom(channel_map, (img.shape[0] / feature_map.shape[1],
                                             img.shape[1] / feature_map.shape[2]))
                axes[j+1].imshow(img)
                axes[j+1].imshow(heatmap, alpha=0.5, cmap='jet')
                axes[j+1].set_title(f'注意力热力图 - 通道 {channel_idx}')
            plt.tight_layout()
            plt.show()

通过可视化注意力热力图，我们能够直观地看到模型关注的图像区域。例如，在识别"狗"的图像时，注意力热力图聚焦于狗的面部、身体轮廓等关键区域，这表明模型能够正确地关注到对分类最有帮助的特征区域。

@浙大疏锦行