本文较长,建议点赞收藏,以免遗失。更多AI大模型应用开发学习内容和资料尽在个人主页。
一、卷积层:图像特征提取的基石
核心思想 :局部连接 + 权值共享
数学表达 :
(I * K)(i,j) = ∑_m ∑_n I(m,n) · K(i−m, j−n)
其中:
I:输入特征图
K:卷积核
*:卷积操作
关键参数:
- 卷积核尺寸(3×3, 5×5)
- 步长(Stride)
- 填充(Padding)
- 输出通道数
PyTorch实现:
python
import torch
import torch.nn as nn

# Convolution layer: 3 input channels, 64 output channels, 3x3 kernel,
# stride 1, padding 1 (padding=1 keeps the 32x32 spatial size unchanged).
conv = nn.Conv2d(in_channels=3, out_channels=64,
                 kernel_size=3, stride=1, padding=1)

# A batch of 16 RGB 32x32 images in NCHW layout.
# Renamed from `input` to avoid shadowing the Python builtin.
images = torch.randn(16, 3, 32, 32)
features = conv(images)
print("输出尺寸:", features.shape)  # [16, 64, 32, 32]
卷积过程可视化:

二、池化层:特征降维与不变性增强

核心作用:
- 降低空间分辨率(减少参数)
- 增强平移不变性
- 防止过拟合
主要类型:
python
# Max pooling: keeps the single strongest activation in each 2x2 window.
max_pool = nn.MaxPool2d(kernel_size=2, stride=2)
# Average pooling: keeps the mean of each 2x2 window (overall local info).
avg_pool = nn.AvgPool2d(kernel_size=2, stride=2)

# 64-channel feature map of size 32x32.
# Renamed from `input` to avoid shadowing the Python builtin.
feat = torch.randn(1, 64, 32, 32)
pooled = max_pool(feat)
print("池化后尺寸:", pooled.shape)  # [1, 64, 16, 16]
三、经典CNN架构演进史
1. LeNet-5 (1998):CNN开山之作
python
class LeNet(nn.Module):
    """LeNet-5 (1998): two conv + average-pool stages followed by three
    fully-connected layers, with tanh activations throughout.

    Expects 1x28x28 inputs (MNIST) and returns 10 class logits.
    """

    def __init__(self):
        super().__init__()
        # Feature extractor: 5x5 convs alternating with 2x2 average pooling.
        self.conv1 = nn.Conv2d(1, 6, 5)   # 1 input channel -> 6 channels
        self.pool1 = nn.AvgPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.pool2 = nn.AvgPool2d(2, 2)
        # Classifier head: 16 feature maps of 4x4 after the second pool.
        self.fc1 = nn.Linear(16 * 4 * 4, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        out = self.pool1(torch.tanh(self.conv1(x)))
        out = self.pool2(torch.tanh(self.conv2(out)))
        out = out.view(-1, 16 * 4 * 4)    # flatten to (batch, 256)
        out = torch.tanh(self.fc1(out))
        out = torch.tanh(self.fc2(out))
        return self.fc3(out)
创新点:
- 首个成功应用于数字识别的CNN
- 交替卷积+池化结构
- 参数量仅6万(MNIST上99.2%准确率)
2. AlexNet (2012):深度学习复兴标志
python
class AlexNet(nn.Module):
    """AlexNet (2012) for 224x224 RGB inputs.

    Bug fix: the original first conv had no padding, so a 224x224 input
    reached the classifier as 256x5x5 and crashed the 256*6*6 linear
    layer.  padding=2 restores the canonical 55 -> 27 -> 13 -> 6 pipeline.
    """

    def __init__(self, num_classes=1000):
        super().__init__()
        self.features = nn.Sequential(
            # 224 -> 55 (padding=2 is required for the sizes below to hold)
            nn.Conv2d(3, 96, 11, stride=4, padding=2),
            nn.ReLU(),
            nn.MaxPool2d(3, 2),                 # 55 -> 27
            nn.Conv2d(96, 256, 5, padding=2),
            nn.ReLU(),
            nn.MaxPool2d(3, 2),                 # 27 -> 13
            nn.Conv2d(256, 384, 3, padding=1),
            nn.ReLU(),
            nn.Conv2d(384, 384, 3, padding=1),
            nn.ReLU(),
            nn.Conv2d(384, 256, 3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(3, 2),                 # 13 -> 6
        )
        self.classifier = nn.Sequential(
            nn.Dropout(0.5),                    # Dropout, first introduced here
            nn.Linear(256 * 6 * 6, 4096),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(4096, 4096),
            nn.ReLU(),
            nn.Linear(4096, num_classes),
        )

    def forward(self, x):
        x = self.features(x)
        x = torch.flatten(x, 1)   # (N, 256, 6, 6) -> (N, 9216)
        return self.classifier(x)
里程碑贡献:
- 首次使用ReLU激活函数
- 引入Dropout正则化
- 多GPU并行训练
- ImageNet top-5错误率15.3%(超越传统方法41%)
3. VGG (2014):深度与规范化的胜利
python
def make_vgg_block(in_channels, out_channels, num_convs):
    """Build one VGG stage: `num_convs` 3x3 conv + ReLU pairs followed by
    a 2x2 max-pool that halves the spatial resolution."""
    layers = []
    channels = in_channels
    for _ in range(num_convs):
        layers.append(nn.Conv2d(channels, out_channels,
                                kernel_size=3, padding=1))
        layers.append(nn.ReLU())
        # After the first conv every layer works on out_channels maps.
        channels = out_channels
    layers.append(nn.MaxPool2d(kernel_size=2, stride=2))
    return nn.Sequential(*layers)
class VGG(nn.Module):
    """VGG (2014).  Only the 16-layer 'D' configuration (VGG16) is
    provided; numbers in the config are conv output channels and 'M'
    marks a 2x2 max-pool.  Expects 224x224 inputs (features end 512x7x7).

    Bug fix: the original class defined no forward(), so the model
    could not actually be called.
    """

    def __init__(self, config='D'):
        super().__init__()
        # VGG configs: A=11, B=13, D=16 (VGG16), E=19 (VGG19) layers.
        configs = {
            'D': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M',
                  512, 512, 512, 'M', 512, 512, 512, 'M']
        }
        self.features = self._make_layers(configs[config])
        self.classifier = nn.Sequential(
            nn.Linear(512 * 7 * 7, 4096), nn.ReLU(), nn.Dropout(0.5),
            nn.Linear(4096, 4096), nn.ReLU(), nn.Dropout(0.5),
            nn.Linear(4096, 1000)
        )

    def _make_layers(self, cfg):
        """Translate a config list into a Sequential of conv/pool layers."""
        layers = []
        in_channels = 3
        for v in cfg:
            if v == 'M':
                layers += [nn.MaxPool2d(kernel_size=2, stride=2)]
            else:
                layers += [
                    nn.Conv2d(in_channels, v, 3, padding=1),
                    nn.ReLU(inplace=True)
                ]
                in_channels = v
        return nn.Sequential(*layers)

    def forward(self, x):
        x = self.features(x)
        x = torch.flatten(x, 1)   # (N, 512, 7, 7) -> (N, 25088)
        return self.classifier(x)
核心思想:
- 全部使用3×3小卷积核(减少参数量)
- 深度增加提升表征能力
- 统一架构设计范式
4. ResNet (2015):突破深度极限
python
class ResidualBlock(nn.Module):
    """Basic ResNet block: two 3x3 conv+BN layers plus a skip connection.

    Bug fix: the original called F.relu but never imported
    torch.nn.functional as F (NameError at runtime); torch.relu is
    equivalent and already in scope.
    """

    def __init__(self, in_channels, out_channels, stride=1):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels, out_channels, 3, stride, 1)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.conv2 = nn.Conv2d(out_channels, out_channels, 3, 1, 1)
        self.bn2 = nn.BatchNorm2d(out_channels)
        # Skip connection: identity when shapes match, otherwise a 1x1
        # conv + BN projection to align channels / resolution.
        self.shortcut = nn.Sequential()
        if stride != 1 or in_channels != out_channels:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_channels, out_channels, 1, stride),
                nn.BatchNorm2d(out_channels)
            )

    def forward(self, x):
        residual = self.shortcut(x)
        out = torch.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))
        out += residual          # the key residual connection: H(x) = F(x) + x
        return torch.relu(out)
class ResNet(nn.Module):
    """CIFAR-style ResNet: a 3x3 stem, four residual stages, global
    average pooling and a linear classifier.

    `block` is a callable (in_channels, out_channels, stride) -> nn.Module.
    Bug fix: the original class defined no forward(), so the model could
    not be called; the standard stem -> stages -> GAP -> linear pass was
    added (torch.relu is used because F was never imported in this article).
    """

    def __init__(self, block, num_blocks, num_classes=10):
        super().__init__()
        self.in_channels = 64
        # Stem: stride-1 3x3 conv — no aggressive downsampling, since
        # CIFAR inputs are only 32x32.
        self.conv1 = nn.Conv2d(3, 64, 3, 1, 1)
        self.bn1 = nn.BatchNorm2d(64)
        # Four stages; stages 2-4 halve the resolution with stride 2.
        self.layer1 = self._make_layer(block, 64, num_blocks[0], 1)
        self.layer2 = self._make_layer(block, 128, num_blocks[1], 2)
        self.layer3 = self._make_layer(block, 256, num_blocks[2], 2)
        self.layer4 = self._make_layer(block, 512, num_blocks[3], 2)
        self.linear = nn.Linear(512, num_classes)

    def _make_layer(self, block, out_channels, num_blocks, stride):
        """One stage: the first block may downsample, the rest keep size."""
        layers = [block(self.in_channels, out_channels, stride)]
        self.in_channels = out_channels
        for _ in range(1, num_blocks):
            layers.append(block(out_channels, out_channels, 1))
        return nn.Sequential(*layers)

    def forward(self, x):
        out = torch.relu(self.bn1(self.conv1(x)))
        out = self.layer4(self.layer3(self.layer2(self.layer1(out))))
        out = out.mean(dim=(2, 3))   # global average pooling -> (N, 512)
        return self.linear(out)
革命性创新:
- 残差连接:
H(x) = F(x) + x
- 解决深度网络梯度消失问题
- 首次训练超过1000层的网络
- ImageNet top-5错误率3.57%(超越人类水平)
四、图像分类实战:CIFAR-10数据集
1. 数据预处理
python
from torchvision import datasets, transforms

# Channel-wise CIFAR-10 statistics used for normalization.
_CIFAR_MEAN = (0.4914, 0.4822, 0.4465)
_CIFAR_STD = (0.2470, 0.2435, 0.2616)

# Training pipeline: random crop + horizontal flip for augmentation.
train_transform = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(_CIFAR_MEAN, _CIFAR_STD)
])

# Test pipeline: no augmentation, only tensor conversion + normalization.
test_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(_CIFAR_MEAN, _CIFAR_STD)
])

# Download (if needed) and load the datasets.
train_data = datasets.CIFAR10('./data', train=True,
                              download=True, transform=train_transform)
test_data = datasets.CIFAR10('./data', train=False,
                             transform=test_transform)
2. 现代CNN模型实现(ResNet18)
python
def ResNet18():
    """ResNet-18 variant: four stages of two basic residual blocks each."""
    return ResNet(ResidualBlock, [2, 2, 2, 2])

# Bug fix: `device` was used below but never defined in the article.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = ResNet18().to(device)
# SGD with momentum + weight decay is the standard CIFAR-10 recipe.
optimizer = torch.optim.SGD(model.parameters(), lr=0.1,
                            momentum=0.9, weight_decay=5e-4)
# Drop the learning rate by 10x at epochs 100 and 150.
scheduler = torch.optim.lr_scheduler.MultiStepLR(
    optimizer, milestones=[100, 150], gamma=0.1)
criterion = nn.CrossEntropyLoss()
3. 训练循环
python
# 200-epoch training loop with a full test-set evaluation each epoch.
for epoch in range(200):
    # --- training phase ---
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        loss = criterion(model(data), target)
        loss.backward()
        optimizer.step()

    # --- evaluation phase ---
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            test_loss += criterion(output, target).item()
            pred = output.argmax(dim=1)
            correct += pred.eq(target).sum().item()
    acc = 100. * correct / len(test_loader.dataset)
    print(f"Epoch {epoch}: Test Acc={acc:.2f}%")

    # Advance the learning-rate schedule once per epoch.
    scheduler.step()
五、CNN架构设计原则
- 空间维度递减:
输入尺寸逐渐减小(224→112→56→28→14→7)
通道数逐渐增加(3→64→128→256→512)
- 计算量平衡:
早期层:大尺寸特征图 + 小通道数
深层:小尺寸特征图 + 大通道数
- 现代架构模式:
mermaid
graph LR
A[输入] --> B[Stem模块:快速降维]
B --> C[阶段1:高分辨率+低通道]
C --> D[阶段2:中分辨率+中通道]
D --> E[阶段3:低分辨率+高通道]
E --> F[全局池化]
F --> G[全连接分类]
六、性能优化技巧
1. 卷积加速方法
python
# Standard convolution: each output channel uses one 3x3 kernel spanning
# all 256 input channels.
conv = nn.Conv2d(256, 512, kernel_size=3, stride=1, padding=1)

# Depthwise-separable convolution (MobileNet): a per-channel 3x3
# depthwise conv followed by a 1x1 pointwise conv.
# Bug fix: padding=1 added so the separable pair is a drop-in
# replacement producing the same output size as `conv` above.
depthwise = nn.Conv2d(256, 256, kernel_size=3, padding=1, groups=256)
pointwise = nn.Conv2d(256, 512, kernel_size=1)

# Multiply-accumulates per output position (original numbers were wrong):
#   standard:  3*3*256*512       = 1,179,648
#   separable: 3*3*256 + 256*512 = 133,376   (~8.8x fewer)
2. 注意力增强
python
class SEBlock(nn.Module):
    """Squeeze-and-Excitation channel attention (SENet).

    Squeeze: global average pooling collapses each channel to a scalar.
    Excitation: a two-layer bottleneck MLP ending in a sigmoid produces
    per-channel weights in (0, 1) that rescale the input feature map.
    """

    def __init__(self, channel, reduction=16):
        super().__init__()
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        hidden = channel // reduction
        self.fc = nn.Sequential(
            nn.Linear(channel, hidden),
            nn.ReLU(),
            nn.Linear(hidden, channel),
            nn.Sigmoid()
        )

    def forward(self, x):
        batch, channels = x.size(0), x.size(1)
        squeezed = self.avg_pool(x).view(batch, channels)
        weights = self.fc(squeezed).view(batch, channels, 1, 1)
        return x * weights   # feature recalibration
七、学习路线与资源推荐
知识进阶路径:

笔者洞见 :
图像任务首选CNN,但ViT正在崛起
ResNet仍是工业界黄金标准
移动端部署需考虑:
模型压缩(剪枝/量化)
高效架构(MobileNet/EfficientNet)
数据质量 > 模型复杂度
创作不易,记得留下你的小红心。更多AI大模型应用开发学习内容和资料,尽在AI大模型技术社。