一、方案整体设计
1. 数据集选择
选用 Kaggle 经典的 Dogs vs. Cats 数据集(https://www.kaggle.com/c/dogs-vs-cats/data)。其训练集包含 25000 张带标签的猫狗图像(猫、狗各 12500 张),适合 CNN 二分类任务,且数据规模适中,训练成本低。
2. 技术栈
- 深度学习框架:PyTorch
- 数据处理与可视化:torchvision, PIL, NumPy, OpenCV(cv2)
- Grad-CAM 实现:自定义梯度加权类激活映射
- 模块化拆分:按功能拆分为 4 个独立文件,便于维护和复用
二、基础版(单文件完整实现)
先提供单文件版本,方便快速验证效果,再进行模块化拆分。
python
# main.py (单文件完整版本)
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms
from PIL import Image
import numpy as np
import cv2
# ---------------------- 1. 数据集定义 ----------------------
class CatDogDataset(Dataset):
    """Dogs-vs-Cats image dataset whose labels are encoded in the file names.

    Every ``.jpg`` file found directly inside ``data_dir`` becomes one sample.
    A file whose name contains ``dog`` is labelled 1; every other file is
    labelled 0 (cat), matching names such as ``cat.0.jpg`` / ``dog.0.jpg``.

    Args:
        data_dir: Directory that holds the image files.
        transform: Optional callable applied to the loaded RGB ``PIL.Image``
            before it is returned.

    Attributes are kept as two parallel lists: ``images`` (file paths) and
    ``labels`` (0 or 1).
    """

    def __init__(self, data_dir, transform=None):
        self.data_dir = data_dir
        self.transform = transform
        self.images = []
        self.labels = []
        # os.listdir() returns entries in arbitrary order; sorting keeps the
        # index -> sample mapping reproducible across runs and machines.
        for img_name in sorted(os.listdir(data_dir)):
            lowered = img_name.lower()
            # Case-insensitive match so that ".JPG" files are not dropped.
            if not lowered.endswith('.jpg'):
                continue
            self.images.append(os.path.join(data_dir, img_name))
            self.labels.append(1 if 'dog' in lowered else 0)  # dog=1, cat=0

    def __len__(self):
        """Return the number of samples."""
        return len(self.images)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for sample ``idx``.

        The image is converted to RGB and, if a transform was given, passed
        through it. The label is the plain int 0 (cat) or 1 (dog).
        """
        img_path = self.images[idx]
        label = self.labels[idx]
        # convert() loads the pixel data into a new image object, so the
        # underlying file handle can be released immediately.
        with Image.open(img_path) as img:
            image = img.convert('RGB')
        if self.transform is not None:
            image = self.transform(image)
        return image, label
# ---------------------- 2. CNN模型定义 ----------------------
class SimpleCNN(nn.Module):
    """Small three-stage CNN producing two logits (cat / dog).

    ``features`` is three Conv2d -> ReLU -> MaxPool2d(2) stages
    (3 -> 32 -> 64 -> 128 channels). ``classifier`` flattens the result and
    maps it through a 512-unit hidden layer to 2 outputs.

    The first linear layer is sized for ``128 * 28 * 28`` inputs, i.e. a
    224x224 input image halved three times by the pooling layers.
    """

    def __init__(self):
        super().__init__()
        # Feature extractor
        self.features = nn.Sequential(
            nn.Conv2d(3, 32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(32, 64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(64, 128, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
        )
        # Classifier head
        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.Linear(128 * 28 * 28, 512),
            nn.ReLU(),
            nn.Linear(512, 2),
        )

    def forward(self, x):
        """Return the class logits for the image batch ``x``."""
        x = self.features(x)
        x = self.classifier(x)
        return x

    def get_last_conv_layer(self):
        """Return the last ``nn.Conv2d`` module in ``features`` (for Grad-CAM).

        The layer is located by type rather than by a fixed index: the
        previous ``self.features[-2]`` actually selected the ReLU that
        follows the last convolution, not the convolution itself.

        Raises:
            ValueError: If ``features`` contains no ``nn.Conv2d``.
        """
        for layer in reversed(self.features):
            if isinstance(layer, nn.Conv2d):
                return layer
        raise ValueError("SimpleCNN.features contains no nn.Conv2d layer")
# ---------------------- 3. Grad-CAM实现 ----------------------
class GradCAM:
    """Gradient-weighted class activation mapping (Grad-CAM) for one layer.

    Hooks are attached to ``target_layer`` at construction time to capture
    its output during the forward pass and the gradient of the class score
    with respect to that output during the backward pass.

    Args:
        model: The network to explain. Its train/eval mode is not changed
            here; callers normally want ``model.eval()`` first.
        target_layer: A sub-module of ``model`` whose output is a
            ``(N, C, H, W)`` feature map.

    Call ``remove_hooks()`` once the instance is no longer needed, otherwise
    the hooks stay registered on the layer.
    """

    def __init__(self, model, target_layer):
        self.model = model
        self.target_layer = target_layer
        self.gradients = None
        self.activations = None
        # register_backward_hook is deprecated; the "full" variant is its
        # documented replacement. Handles are kept so the hooks can be
        # detached again.
        self._hook_handles = [
            target_layer.register_forward_hook(self.save_activation),
            target_layer.register_full_backward_hook(self.save_gradient),
        ]

    def remove_hooks(self):
        """Detach the hooks this instance registered on the target layer."""
        for handle in self._hook_handles:
            handle.remove()
        self._hook_handles = []

    def save_activation(self, module, input, output):
        """Forward hook: remember the target layer's output."""
        # detach() so the cached tensor does not keep the autograd graph alive.
        self.activations = output.detach()

    def save_gradient(self, module, grad_input, grad_output):
        """Backward hook: remember the gradient w.r.t. the layer's output."""
        self.gradients = grad_output[0].detach()

    def generate_cam(self, input_tensor, target_class=None):
        """Compute the Grad-CAM heat map for the first sample of a batch.

        Args:
            input_tensor: Model input of shape ``(N, C, H, W)``; only sample 0
                is explained.
            target_class: Index of the class to explain. Defaults to the
                class with the highest score for sample 0.

        Returns:
            ``numpy.ndarray`` of dtype float32 and shape ``(H, W)`` with
            values in ``[0, 1]``. A map with no variation is returned as all
            zeros rather than NaN.

        Raises:
            RuntimeError: If the hooks did not fire, e.g. because
                ``target_layer`` is not used by ``model``.
        """
        # Drop results of any earlier call so stale data is never reused.
        self.activations = None
        self.gradients = None

        # Forward pass
        output = self.model(input_tensor)
        if target_class is None:
            target_class = int(torch.argmax(output[0]).item())

        # Backward pass of the selected class score only
        self.model.zero_grad()
        one_hot = torch.zeros_like(output)
        one_hot[0][target_class] = 1
        output.backward(gradient=one_hot)

        if self.activations is None or self.gradients is None:
            raise RuntimeError(
                "Grad-CAM hooks did not fire; is target_layer part of the model?"
            )

        gradients = self.gradients[0].float()      # (C, h, w)
        activations = self.activations[0].float()  # (C, h, w)

        # Channel weights = spatial mean of the gradients.
        weights = gradients.mean(dim=(1, 2))
        # Weighted sum over channels, keeping only positive evidence.
        cam = torch.relu((weights[:, None, None] * activations).sum(dim=0))

        # Bilinear upsampling to the input resolution. align_corners=False
        # uses half-pixel sampling, the same convention as cv2.resize.
        cam = nn.functional.interpolate(
            cam[None, None],
            size=tuple(input_tensor.shape[2:]),
            mode='bilinear',
            align_corners=False,
        )[0, 0]

        # Min-max normalisation; skip the division when the map is flat,
        # which would otherwise yield 0/0 = NaN.
        cam = cam - cam.min()
        peak = cam.max()
        if peak > 0:
            cam = cam / peak
        return cam.cpu().numpy().astype(np.float32)
# ---------------------- 4. 训练函数 ----------------------
def train_model(model, train_loader, criterion, optimizer, device, epochs=5):
    """Train ``model`` in place and print loss/accuracy after every epoch.

    Args:
        model: Network producing per-class scores of shape ``(N, classes)``.
        train_loader: Iterable yielding ``(inputs, labels)`` batches.
        criterion: Loss callable taking ``(outputs, labels)``.
        optimizer: Optimizer already bound to ``model``'s parameters.
        device: Device the batches are moved to before the forward pass.
        epochs: Number of passes over ``train_loader``.

    Returns:
        The same ``model`` object, left in training mode.

    The reported loss is the mean of the per-batch losses. An epoch that
    yields no batches reports a loss and accuracy of 0 instead of raising
    ``ZeroDivisionError``.
    """
    model.train()
    for epoch in range(epochs):
        running_loss = 0.0
        correct = 0
        total = 0
        # Counted while iterating so loaders without __len__ work as well.
        num_batches = 0
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            num_batches += 1
            running_loss += loss.item()
            predicted = outputs.detach().argmax(dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

        epoch_loss = running_loss / num_batches if num_batches else 0.0
        epoch_acc = 100 * correct / total if total else 0.0
        print(f'Epoch {epoch+1}/{epochs}, Loss: {epoch_loss:.4f}, Accuracy: {epoch_acc:.2f}%')
    return model
# ---------------------- 5. 主函数 ----------------------
if __name__ == '__main__':
    # Configuration
    data_dir = './train'  # replace with the path to your dataset
    batch_size = 32
    epochs = 5
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Pre-processing: the model's classifier is sized for 224x224 inputs.
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])

    # Dataset and loader
    dataset = CatDogDataset(data_dir, transform=transform)
    train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=4)

    # Model, loss and optimizer
    model = SimpleCNN().to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    # Training
    print("开始训练...")
    trained_model = train_model(model, train_loader, criterion, optimizer, device, epochs)
    torch.save(trained_model.state_dict(), 'cat_dog_cnn.pth')
    print("模型训练完成并保存!")

    # Grad-CAM visualisation
    print("生成Grad-CAM可视化...")
    # train_model() leaves the network in training mode; explanations should
    # be computed in inference mode.
    trained_model.eval()

    # Load the test image
    test_img_path = './test/dog.12345.jpg'  # replace with the path to your test image
    test_img = Image.open(test_img_path).convert('RGB')
    input_tensor = transform(test_img).unsqueeze(0).to(device)

    # Set up Grad-CAM on the layer exposed by the model
    target_layer = trained_model.get_last_conv_layer()
    grad_cam = GradCAM(trained_model, target_layer)

    # Heat map at the network's input resolution (224x224), values in [0, 1]
    cam = grad_cam.generate_cam(input_tensor)

    # Overlay on the original image. The image keeps its native size, so the
    # heat map must be resized to match it first: cv2.addWeighted requires
    # both inputs to have identical dimensions. cv2.resize takes (width, height).
    img = np.array(test_img)
    cam = cv2.resize(cam, (img.shape[1], img.shape[0]))
    heatmap = cv2.applyColorMap(np.uint8(255 * cam), cv2.COLORMAP_JET)
    heatmap = cv2.cvtColor(heatmap, cv2.COLOR_BGR2RGB)
    result = cv2.addWeighted(img, 0.5, heatmap, 0.5, 0)

    # Save the result (OpenCV writes BGR)
    cv2.imwrite('grad_cam_result.jpg', cv2.cvtColor(result, cv2.COLOR_RGB2BGR))
    print("Grad-CAM可视化完成,结果已保存!")