以下是几种常用的卷积神经网络(CNN)架构及其 PyTorch 实现示例,涵盖经典模型和现代变体。这些模型在图像分类、目标检测等任务中表现卓越。
1. LeNet-5(1998)
- 特点
首个成功应用于手写数字识别的CNN。
结构:卷积层 + 池化层 + 全连接层。
适用场景:小尺寸图像分类(如MNIST)。
PyTorch实现:
python
import torch.nn as nn
class LeNet(nn.Module):
    """LeNet-5 style classifier for single-channel 28x28 images (e.g. MNIST).

    Two conv / ReLU / average-pool stages feed a three-layer fully
    connected head.

    Args:
        num_classes: Number of output logits.
    """

    def __init__(self, num_classes=10):
        super().__init__()
        self.features = nn.Sequential(
            nn.Conv2d(1, 6, kernel_size=5),  # 1 input channel (grayscale)
            nn.ReLU(),
            nn.AvgPool2d(kernel_size=2),
            nn.Conv2d(6, 16, kernel_size=5),
            nn.ReLU(),
            nn.AvgPool2d(kernel_size=2),
        )
        self.classifier = nn.Sequential(
            # 16*4*4 matches a 28x28 input (28 -> 24 -> 12 -> 8 -> 4);
            # adjust this when feeding a different input size.
            nn.Linear(16 * 4 * 4, 120),
            nn.ReLU(),
            nn.Linear(120, 84),
            nn.ReLU(),
            nn.Linear(84, num_classes),
        )

    def forward(self, x):
        """Return class logits of shape ``(batch, num_classes)``."""
        x = self.features(x)
        # flatten(1) copies when the activations are not contiguous in NCHW
        # order (e.g. channels_last), a case in which view() would raise.
        x = x.flatten(1)
        return self.classifier(x)
# Example: MNIST classification
model = LeNet(num_classes=10)
2. AlexNet(2012)
- 特点
引入ReLU激活函数和Dropout,大幅提升性能。
使用GPU加速训练。
适用场景:中等尺寸图像分类(如ImageNet)。
PyTorch实现:
python
class AlexNet(nn.Module):
    """AlexNet image classifier for 3-channel inputs.

    Five convolutional layers with ReLU and max-pooling, followed by a
    dropout-regularised three-layer fully connected head.

    Args:
        num_classes: Number of output logits.
    """

    def __init__(self, num_classes=1000):
        super().__init__()
        self.features = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=11, stride=4, padding=2),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
            nn.Conv2d(64, 192, kernel_size=5, padding=2),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
            nn.Conv2d(192, 384, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(384, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(256, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
        )
        # Pins the feature map to 6x6 so the classifier's input size no
        # longer depends on the image resolution. For 224x224 inputs the
        # feature map is already 6x6, so this is a no-op there. The layer
        # has no parameters, so existing state dicts still load.
        self.avgpool = nn.AdaptiveAvgPool2d((6, 6))
        self.classifier = nn.Sequential(
            nn.Dropout(),
            nn.Linear(256 * 6 * 6, 4096),
            nn.ReLU(inplace=True),
            nn.Dropout(),
            nn.Linear(4096, 4096),
            nn.ReLU(inplace=True),
            nn.Linear(4096, num_classes),
        )

    def forward(self, x):
        """Return class logits of shape ``(batch, num_classes)``."""
        x = self.features(x)
        x = self.avgpool(x)
        x = x.flatten(1)
        return self.classifier(x)
# Example: ImageNet classification
model = AlexNet(num_classes=1000)
3. VGG(2014)
- 特点
使用重复的3x3卷积块,加深网络。
常见变体:VGG16、VGG19。
适用场景:高精度图像分类。
PyTorch实现:
python
class VGG16(nn.Module):
    """VGG-16 image classifier: 13 3x3 conv layers plus a 3-layer FC head.

    Args:
        num_classes: Number of output logits.
    """

    # Output channels of each 3x3 convolution, in order; "M" marks a 2x2
    # max-pool that closes a block. Five blocks, 13 convolutions in total.
    _CFG = (
        64, 64, "M",
        128, 128, "M",
        256, 256, 256, "M",
        512, 512, 512, "M",
        512, 512, 512, "M",
    )

    def __init__(self, num_classes=1000):
        super().__init__()
        # All five blocks are built here: the classifier below expects 512
        # channels, so stopping after block 1 (64 channels) cannot work.
        self.features = self._make_features(self._CFG)
        self.avgpool = nn.AdaptiveAvgPool2d((7, 7))
        self.classifier = nn.Sequential(
            nn.Linear(512 * 7 * 7, 4096),
            nn.ReLU(inplace=True),
            nn.Dropout(),
            nn.Linear(4096, 4096),
            nn.ReLU(inplace=True),
            nn.Dropout(),
            nn.Linear(4096, num_classes),
        )

    @staticmethod
    def _make_features(cfg, in_channels=3):
        """Build the conv/ReLU/max-pool stack described by ``cfg``."""
        layers = []
        for item in cfg:
            if item == "M":
                layers.append(nn.MaxPool2d(kernel_size=2, stride=2))
                continue
            layers.append(nn.Conv2d(in_channels, item, kernel_size=3, padding=1))
            layers.append(nn.ReLU(inplace=True))
            in_channels = item
        return nn.Sequential(*layers)

    def forward(self, x):
        """Return class logits of shape ``(batch, num_classes)``."""
        x = self.features(x)
        x = self.avgpool(x)
        x = x.flatten(1)
        return self.classifier(x)
# Use the VGG implementation that ships with torchvision (recommended)
# NOTE(review): newer torchvision releases deprecate `pretrained=` in favour
# of `weights=` — confirm against the installed torchvision version.
import torchvision.models as models
vgg16 = models.vgg16(pretrained=True) # load pretrained weights
4. ResNet(2015)
- 特点
引入残差连接(Residual Block),缓解深层网络的退化问题,使梯度更容易传播。
支持极深网络(如ResNet-152)。
适用场景:通用视觉任务。
PyTorch实现(简化版):
python
class ResidualBlock(nn.Module):
    """Basic two-convolution residual block: ``relu(F(x) + shortcut(x))``.

    Args:
        in_channels: Channels of the input feature map.
        out_channels: Channels produced by both 3x3 convolutions.
        stride: Stride of the first convolution (and of the projection
            shortcut, when one is needed).
    """

    def __init__(self, in_channels, out_channels, stride=1):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3,
                               stride=stride, padding=1)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3,
                               padding=1)
        self.bn2 = nn.BatchNorm2d(out_channels)

        if stride != 1 or in_channels != out_channels:
            # The main path changes the tensor shape, so the skip path is
            # projected with a 1x1 conv to make the addition valid.
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_channels, out_channels, kernel_size=1,
                          stride=stride),
                nn.BatchNorm2d(out_channels),
            )
        else:
            # An empty Sequential returns its input unchanged.
            self.shortcut = nn.Sequential()

    def forward(self, x):
        """Apply the block; the input tensor itself is left unmodified."""
        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)
        out = self.conv2(out)
        out = self.bn2(out)
        out = out + self.shortcut(x)
        return self.relu(out)
class ResNet18(nn.Module):
    """ResNet-18 style classifier built from four stages of ``ResidualBlock``.

    A 7x7 stride-2 stem and a max-pool are followed by four stages of two
    residual blocks each (64, 128, 256, 512 channels), global average
    pooling and a linear classifier.

    Args:
        num_classes: Number of output logits.
    """

    def __init__(self, num_classes=10):
        super().__init__()
        # Not read anywhere in this class; kept so the attribute remains
        # available to any external code that inspects it.
        self.in_channels = 64
        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3)
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        # Residual stages; every stage after the first halves the resolution.
        self.layer1 = self._make_layer(64, 64, 2, stride=1)
        self.layer2 = self._make_layer(64, 128, 2, stride=2)
        self.layer3 = self._make_layer(128, 256, 2, stride=2)
        self.layer4 = self._make_layer(256, 512, 2, stride=2)
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(512, num_classes)

    def _make_layer(self, in_channels, out_channels, blocks, stride):
        """Stack ``blocks`` residual blocks; only the first one uses ``stride``."""
        layers = [ResidualBlock(in_channels, out_channels, stride)]
        layers.extend(
            ResidualBlock(out_channels, out_channels) for _ in range(1, blocks)
        )
        return nn.Sequential(*layers)

    def forward(self, x):
        """Return class logits of shape ``(batch, num_classes)``."""
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        x = self.avgpool(x)
        x = x.flatten(1)
        return self.fc(x)
# Example: CIFAR-10 classification
model = ResNet18(num_classes=10)
5. MobileNet(2017)
- 特点
使用深度可分离卷积(Depthwise Separable Convolution),降低计算量。
适合移动端和嵌入式设备。
PyTorch实现:
python
class DepthwiseSeparableConv(nn.Module):
    """Depthwise-separable convolution block in the MobileNet style.

    A 3x3 depthwise convolution (one filter per input channel) is followed
    by a 1x1 pointwise convolution that mixes channels. Each convolution is
    followed by batch normalisation and ReLU. Without a non-linearity
    between them, consecutive blocks would collapse into one linear map.

    Args:
        in_channels: Channels of the input feature map.
        out_channels: Channels produced by the pointwise convolution.
        stride: Stride of the depthwise convolution.
    """

    def __init__(self, in_channels, out_channels, stride=1):
        super().__init__()
        # bias=False: a bias would be cancelled by the BatchNorm that
        # immediately follows each convolution.
        self.depthwise = nn.Conv2d(in_channels, in_channels, kernel_size=3,
                                   stride=stride, padding=1,
                                   groups=in_channels, bias=False)
        self.bn1 = nn.BatchNorm2d(in_channels)
        self.pointwise = nn.Conv2d(in_channels, out_channels, kernel_size=1,
                                   bias=False)
        self.bn2 = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        """Apply depthwise then pointwise conv, each with BN and ReLU."""
        x = self.relu(self.bn1(self.depthwise(x)))
        x = self.relu(self.bn2(self.pointwise(x)))
        return x
class MobileNet(nn.Module):
    """Abbreviated MobileNet-style classifier.

    A strided 3x3 stem convolution is followed by a short stack of
    ``DepthwiseSeparableConv`` blocks, global average pooling and a linear
    classifier. The block stack is deliberately truncated.

    Args:
        num_classes: Number of output logits.
    """

    def __init__(self, num_classes=1000):
        super().__init__()
        self.model = nn.Sequential(
            nn.Conv2d(3, 32, kernel_size=3, stride=2, padding=1),
            nn.ReLU(inplace=True),
            DepthwiseSeparableConv(32, 64, stride=1),
            DepthwiseSeparableConv(64, 128, stride=2),
            DepthwiseSeparableConv(128, 128, stride=1),
            DepthwiseSeparableConv(128, 256, stride=2),
            # more layers...
            nn.AdaptiveAvgPool2d(1),
        )
        # Input width matches the 256 channels of the last block above.
        self.fc = nn.Linear(256, num_classes)

    def forward(self, x):
        """Return class logits of shape ``(batch, num_classes)``."""
        x = self.model(x)
        x = x.flatten(1)
        return self.fc(x)
# Example: lightweight model
model = MobileNet(num_classes=1000)
6. 使用预训练模型(PyTorch内置)
PyTorch提供多种预训练模型,可直接加载:
python
import torchvision.models as models
# Load pretrained models.
# NOTE(review): newer torchvision releases deprecate `pretrained=` in favour
# of `weights=` — confirm against the installed torchvision version.
resnet50 = models.resnet50(pretrained=True)
inception_v3 = models.inception_v3(pretrained=True)
mobilenet_v2 = models.mobilenet_v2(pretrained=True)
# Fine-tuning: replace the final layer with a fresh 10-class head.
# nn.Linear takes (in_features, out_features); it has no `num_classes`
# keyword, so the output size is passed positionally.
resnet50.fc = nn.Linear(resnet50.fc.in_features, 10)
训练示例(以CIFAR-10为例)
python
import torch.optim as optim
# Define the model, loss function and optimizer.
model = ResNet18(num_classes=10)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)

# Training loop. `train_loader` is not defined in this snippet; it must be
# an iterable yielding (inputs, labels) batches.
model.train()  # make training mode explicit (affects the BatchNorm layers)
for epoch in range(10):
    for inputs, labels in train_loader:
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    # Reports the loss of the last batch processed in this epoch.
    print(f'Epoch {epoch+1}, Loss: {loss.item():.4f}')
总结:模型选择指南
模型 | 参数量 | 适用场景 | 优势 |
---|---|---|---|
LeNet | ~60k | 简单分类(MNIST) | 结构简单,计算量小 |
AlexNet | ~60M | 中等图像分类 | 经典基准模型 |
VGG | ~138M | 高精度分类 | 结构规则,易于扩展 |
ResNet | ~11.7M (18层) / ~25.6M (50层) | 通用视觉任务 | 解决梯度消失,支持极深网络 |
MobileNet | ~4.2M (V1) / ~3.5M (V2) | 移动端/嵌入式设备 | 计算高效,参数量少 |