Classic Convolutional Neural Network Architectures Explained
This article surveys several widely used convolutional network architectures, including the classic AlexNet, VGG, GoogLeNet, and ResNet models.
1. AlexNet
AlexNet won the 2012 ImageNet competition and marked the resurgence of deep learning. It used a deep convolutional neural network and delivered a breakthrough improvement in accuracy at the time.
1.1 AlexNet Architecture
Key features of AlexNet include:
- Uses the ReLU activation function instead of Sigmoid, alleviating the vanishing-gradient problem
- Introduces Dropout regularization to prevent overfitting
- Uses data augmentation to improve generalization (a minimal augmentation sketch follows this list)
- Uses GPUs to accelerate training
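As a concrete illustration of the data-augmentation point, here is a minimal sketch of a typical torchvision augmentation pipeline (the specific transforms and the ImageNet normalization statistics are common choices for 224×224 inputs, not the exact recipe of the original AlexNet paper):

```python
from torchvision import transforms

# A typical augmentation pipeline for 224x224 RGB inputs (illustrative choices)
train_transform = transforms.Compose([
    transforms.RandomResizedCrop(224),      # random crop, then resize to 224x224
    transforms.RandomHorizontalFlip(),      # flip horizontally with probability 0.5
    transforms.ToTensor(),                  # PIL image -> float32 tensor in [0, 1]
    transforms.Normalize(mean=[0.485, 0.456, 0.406],  # ImageNet channel statistics
                         std=[0.229, 0.224, 0.225]),
])
```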
1.2 AlexNet Implementation Steps
The implementation proceeds in three steps: simulate the data, define the model, and train the parameters. Along the way we ran into several problems related to tensor data types, which we discuss in section 1.2.3.
1.2.1 Simulating and Loading the Data
```python
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
import numpy as np

# Simulate four classes of constant-valued "images" (50 samples per class)
X1_train = torch.full(size=(50, 3, 224, 224), fill_value=0)
Y1_train = torch.full(size=(50,), fill_value=0)
X2_train = torch.full(size=(50, 3, 224, 224), fill_value=1)
Y2_train = torch.full(size=(50,), fill_value=1)
X3_train = torch.full(size=(50, 3, 224, 224), fill_value=2)
Y3_train = torch.full(size=(50,), fill_value=2)
X4_train = torch.full(size=(50, 3, 224, 224), fill_value=3)
Y4_train = torch.full(size=(50,), fill_value=3)

# Concatenate the training data; the inputs are converted to float32 (see section 1.2.3)
X_train = torch.cat([X1_train, X2_train, X3_train, X4_train], dim=0).to(dtype=torch.float32)
Y_train = torch.cat([Y1_train, Y2_train, Y3_train, Y4_train], dim=0)

# Build test data by adding sparse noise (+1 on roughly 10% of the pixels) to the training data
X_test = torch.tensor(np.random.choice(2, size=(X_train.numel(),), p=[0.9, 0.1])).reshape(X_train.shape) + X_train
Y_test = torch.cat([Y1_train, Y2_train, Y3_train, Y4_train], dim=0)

# Build datasets and data loaders
train_dataset = TensorDataset(X_train, Y_train)
test_dataset = TensorDataset(X_test, Y_test)
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=True)
```
1.2.2 Model Definition
```python
# Define a simplified AlexNet; ReLU follows every convolutional and linear layer (see section 1.1)
AlexNet = nn.Sequential(
    nn.Conv2d(3, 96, kernel_size=(11, 11), stride=4),
    nn.ReLU(inplace=True),
    nn.MaxPool2d(kernel_size=(3, 3), stride=2),
    nn.Conv2d(96, 256, kernel_size=(5, 5), padding=2, stride=1),
    nn.ReLU(inplace=True),
    nn.MaxPool2d(kernel_size=(3, 3), stride=2),
    nn.Conv2d(256, 384, kernel_size=(3, 3), padding=1, stride=1),
    nn.ReLU(inplace=True),
    nn.Conv2d(384, 384, kernel_size=(3, 3), padding=1, stride=1),
    nn.ReLU(inplace=True),
    nn.Conv2d(384, 256, kernel_size=(3, 3), padding=1, stride=1),
    nn.ReLU(inplace=True),
    nn.MaxPool2d(kernel_size=(3, 3), stride=1),
    nn.Flatten(start_dim=1),
    nn.Linear(25600, 4096),   # 256 channels * 10 * 10 spatial positions = 25600 features
    nn.ReLU(inplace=True),
    nn.Linear(4096, 4096),
    nn.ReLU(inplace=True),
    nn.Linear(4096, 4)        # 4 output classes for the simulated data
)
```
1.2.3 Data Type Issues and Fixes
During the implementation we ran into several data-type (dtype) related problems:
Problem 1: mismatched data types
```
RuntimeError: mat1 and mat2 shapes cannot be multiplied (16x25600 and 25600x4096)
```
Problem analysis:
- The first error showed that AlexNet's forward pass was mixing int and float tensors
- After converting the input with X.to(dtype=torch.float16), the error became a mismatch between half and float
- After converting with X.to(dtype=torch.float32), no error was raised
Key points:
- Understanding PyTorch dtype names:
  - half means torch.float16
  - float means torch.float32
  - long means torch.int64
  - int means torch.int32
- There are two ways to resolve a dtype mismatch (a short demonstration follows this list):
  - Method 1: convert the data, e.g. X_train = X_train.to(dtype=torch.float32)
  - Method 2: convert the model's dtype:
    - for a model that has not yet been instantiated, call .to() as part of the instantiation
    - for an already instantiated model, e.g. AlexNet = AlexNet.to(dtype=torch.float16)
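The following minimal sketch (our own illustration, not part of the original walkthrough) shows these dtype names and both conversion methods:

```python
import torch
import torch.nn as nn

y = torch.tensor([0, 1, 2])                 # integer values default to torch.int64 ("long")
print(y.dtype)                              # torch.int64
x = torch.zeros(2, 3)                       # floating-point tensors default to torch.float32 ("float")
print(x.dtype)                              # torch.float32
print(x.to(dtype=torch.float16).dtype)      # torch.float16 ("half"), method 1: convert the data

layer = nn.Linear(3, 1)                     # parameters are float32 by default
layer_half = layer.to(dtype=torch.float16)  # method 2: convert the model
print(next(layer_half.parameters()).dtype)  # torch.float16
```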
Problem 2: wrong dtype passed to the loss function
```
Expected object of scalar type Long but got scalar type Float for argument #2 'target'
```
Problem analysis:
- A tensor with a dtype different from the expected one was passed in: the target was scalar type Float, but Long was expected
Key to the fix:
- Know which dtype each argument of the loss function expects:
  - the first argument (the predictions) must be float
  - the second argument (the labels) must be long
```python
# Incorrect example
loss = nn.CrossEntropyLoss()
result = loss(Y_pre, Y_train)  # Y_train must be of dtype long (torch.int64)

# Correct example
result = loss(Y_pre, Y_train.long())
```
1.2.4 Forward-Pass Test
```python
# Take one mini-batch and run a forward pass
for X, y in train_dataloader:
    break
print(AlexNet(X).shape)

# Print the output shape of every layer
for M in AlexNet:
    X = M(X)
    print(f"{M.__class__.__name__}: {X.shape}")
```
Example output:
```
torch.Size([16, 4])
Conv2d: torch.Size([16, 96, 54, 54])
ReLU: torch.Size([16, 96, 54, 54])
MaxPool2d: torch.Size([16, 96, 26, 26])
Conv2d: torch.Size([16, 256, 26, 26])
ReLU: torch.Size([16, 256, 26, 26])
MaxPool2d: torch.Size([16, 256, 12, 12])
Conv2d: torch.Size([16, 384, 12, 12])
ReLU: torch.Size([16, 384, 12, 12])
Conv2d: torch.Size([16, 384, 12, 12])
ReLU: torch.Size([16, 384, 12, 12])
Conv2d: torch.Size([16, 256, 12, 12])
ReLU: torch.Size([16, 256, 12, 12])
MaxPool2d: torch.Size([16, 256, 10, 10])
Flatten: torch.Size([16, 25600])
Linear: torch.Size([16, 4096])
ReLU: torch.Size([16, 4096])
Linear: torch.Size([16, 4096])
ReLU: torch.Size([16, 4096])
Linear: torch.Size([16, 4])
```
1.3 Training and Evaluating AlexNet
```python
from tqdm import tqdm

# Training / evaluation routine
def train(epoch, lr, train_dataloader, test_dataloader, net):
    optimizer = torch.optim.SGD(net.parameters(), lr=lr)
    loss_fn = nn.CrossEntropyLoss(reduction="mean")
    train_acc = []
    train_loss = []

    # Xavier initialization for every convolutional and linear layer
    def init_weight(m):
        if type(m) == nn.Conv2d or type(m) == nn.Linear:
            nn.init.xavier_uniform_(m.weight)
    net.apply(init_weight)

    # Training phase
    if isinstance(net, nn.Module):
        net.train()
    for i in tqdm(range(epoch)):
        for X_train, Y_train in train_dataloader:
            optimizer.zero_grad()
            Y_pre = net(X_train)
            result = loss_fn(Y_pre, Y_train.long())
            result.backward()
            optimizer.step()
            with torch.no_grad():
                accu = Y_pre.argmax(dim=1) == Y_train
                train_acc.append(int(accu.sum()) / len(Y_train))
                train_loss.append(result.item())

    # Evaluation phase
    if isinstance(net, nn.Module):
        net.eval()
    test_accu = []
    test_loss = []
    for X, y in test_dataloader:
        with torch.no_grad():
            Y_pre = net(X)
            result = loss_fn(Y_pre, y.long())
            accu = Y_pre.argmax(dim=1) == y
            test_accu.append(int(accu.sum()) / len(y))
            test_loss.append(result.item())
    return train_acc, train_loss, test_accu, test_loss

# Train the model
train_accu, train_loss, test_accu, test_loss = train(
    epoch=120,
    lr=1e-4,
    train_dataloader=train_dataloader,
    test_dataloader=test_dataloader,
    net=AlexNet
)
```
1.4 AlexNet Performance Evaluation
```python
# Inspect the per-batch test accuracy
print(test_accu)

# Compute the final averaged metrics
final_accuracy = sum(test_accu) / len(test_accu)
final_loss = sum(test_loss) / len(test_loss)
print(f"Final metrics - accuracy: {final_accuracy}, loss: {final_loss}")
```
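To monitor the training process, the lists returned by train() can also be plotted. A minimal sketch, assuming matplotlib is available (this plot is our own addition, not part of the original walkthrough):

```python
import matplotlib.pyplot as plt

# Per-batch training loss and accuracy as recorded by train()
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4))
ax1.plot(train_loss)
ax1.set_xlabel("batch")
ax1.set_ylabel("training loss")
ax2.plot(train_accu)
ax2.set_xlabel("batch")
ax2.set_ylabel("training accuracy")
plt.tight_layout()
plt.show()
```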
2. The VGG Network
VGG (Visual Geometry Group) was the runner-up of the 2014 ImageNet classification challenge and is known for its clean, regular architecture. Its core idea is to build deeper networks out of small 3×3 convolution kernels.
2.1 Characteristics of VGG
- Uses stacks of consecutive 3×3 convolution kernels instead of larger kernels (two stacked 3×3 convolutions cover the same receptive field as one 5×5 convolution, with fewer parameters); a reusable block helper is sketched after this list
- A regular, repetitive structure that is easy to understand and implement
- Uses the ReLU activation function
- The original 2014 VGG does not use Batch Normalization; later variants (such as VGG16-BN) add it
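Because the structure is so regular, VGG is often written with a small block-builder helper. The sketch below is our own illustration (the name vgg_block is not a torchvision API); the VGG16 implementation in the next section spells the same blocks out explicitly:

```python
def vgg_block(num_convs, in_channels, out_channels):
    """num_convs 3x3 convolutions (each followed by ReLU) and a 2x2 max-pool."""
    layers = []
    for _ in range(num_convs):
        layers.append(nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1))
        layers.append(nn.ReLU(inplace=True))
        in_channels = out_channels
    layers.append(nn.MaxPool2d(kernel_size=2, stride=2))  # halves the spatial size
    return nn.Sequential(*layers)

# VGG16's five convolutional stages expressed with the helper
vgg16_features = nn.Sequential(
    vgg_block(2, 3, 64),
    vgg_block(2, 64, 128),
    vgg_block(3, 128, 256),
    vgg_block(3, 256, 512),
    vgg_block(3, 512, 512),
)
```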
2.2 VGG16 Implementation
```python
class VGG16(nn.Module):
def __init__(self, num_classes=1000):
super(VGG16, self).__init__()
self.features = nn.Sequential(
# Conv Block 1
nn.Conv2d(3, 64, kernel_size=3, padding=1),
nn.ReLU(inplace=True),
nn.Conv2d(64, 64, kernel_size=3, padding=1),
nn.ReLU(inplace=True),
nn.MaxPool2d(kernel_size=2, stride=2),
# Conv Block 2
nn.Conv2d(64, 128, kernel_size=3, padding=1),
nn.ReLU(inplace=True),
nn.Conv2d(128, 128, kernel_size=3, padding=1),
nn.ReLU(inplace=True),
nn.MaxPool2d(kernel_size=2, stride=2),
# Conv Block 3
nn.Conv2d(128, 256, kernel_size=3, padding=1),
nn.ReLU(inplace=True),
nn.Conv2d(256, 256, kernel_size=3, padding=1),
nn.ReLU(inplace=True),
nn.Conv2d(256, 256, kernel_size=3, padding=1),
nn.ReLU(inplace=True),
nn.MaxPool2d(kernel_size=2, stride=2),
# Conv Block 4
nn.Conv2d(256, 512, kernel_size=3, padding=1),
nn.ReLU(inplace=True),
nn.Conv2d(512, 512, kernel_size=3, padding=1),
nn.ReLU(inplace=True),
nn.Conv2d(512, 512, kernel_size=3, padding=1),
nn.ReLU(inplace=True),
nn.MaxPool2d(kernel_size=2, stride=2),
# Conv Block 5
nn.Conv2d(512, 512, kernel_size=3, padding=1),
nn.ReLU(inplace=True),
nn.Conv2d(512, 512, kernel_size=3, padding=1),
nn.ReLU(inplace=True),
nn.Conv2d(512, 512, kernel_size=3, padding=1),
nn.ReLU(inplace=True),
nn.MaxPool2d(kernel_size=2, stride=2),
)
self.avgpool = nn.AdaptiveAvgPool2d((7, 7))
self.classifier = nn.Sequential(
nn.Linear(512 * 7 * 7, 4096),
nn.ReLU(inplace=True),
nn.Dropout(),
nn.Linear(4096, 4096),
nn.ReLU(inplace=True),
nn.Dropout(),
nn.Linear(4096, num_classes),
)
def forward(self, x):
x = self.features(x)
x = self.avgpool(x)
x = x.view(x.size(0), -1)
x = self.classifier(x)
return x
# Instantiate the model
vgg16 = VGG16(num_classes=4)
```
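As a quick sanity check (our own addition): five 2×2 max-pools take a 224×224 input down to 7×7, so the classifier sees 512 × 7 × 7 = 25088 features and produces one logit per class.

```python
x = torch.randn(1, 3, 224, 224)
print(vgg16(x).shape)  # torch.Size([1, 4])
```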
3. GoogLeNet (the Inception Network)
GoogLeNet won the 2014 ImageNet competition. It introduced the Inception module, which applies convolution kernels of different sizes in parallel to capture features at multiple scales.
3.1 The Inception Module
```python
class Inception(nn.Module):
def __init__(self, in_channels, ch1x1, ch3x3red, ch3x3, ch5x5red, ch5x5, pool_proj):
super(Inception, self).__init__()
self.branch1 = nn.Sequential(
nn.Conv2d(in_channels, ch1x1, kernel_size=1),
nn.ReLU(inplace=True)
)
self.branch2 = nn.Sequential(
nn.Conv2d(in_channels, ch3x3red, kernel_size=1),
nn.ReLU(inplace=True),
nn.Conv2d(ch3x3red, ch3x3, kernel_size=3, padding=1),
nn.ReLU(inplace=True)
)
self.branch3 = nn.Sequential(
nn.Conv2d(in_channels, ch5x5red, kernel_size=1),
nn.ReLU(inplace=True),
nn.Conv2d(ch5x5red, ch5x5, kernel_size=5, padding=2),
nn.ReLU(inplace=True)
)
self.branch4 = nn.Sequential(
nn.MaxPool2d(kernel_size=3, stride=1, padding=1),
nn.Conv2d(in_channels, pool_proj, kernel_size=1),
nn.ReLU(inplace=True)
)
def forward(self, x):
branch1 = self.branch1(x)
branch2 = self.branch2(x)
branch3 = self.branch3(x)
branch4 = self.branch4(x)
outputs = [branch1, branch2, branch3, branch4]
        return torch.cat(outputs, 1)
```
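For a concrete feel of the module, the sketch below instantiates it with the channel settings commonly quoted for GoogLeNet's inception(3a) stage (an illustrative choice on our part). The output channel count is the sum of the four branches, 64 + 128 + 32 + 32 = 256, while the spatial size is unchanged.

```python
inception_3a = Inception(192, ch1x1=64, ch3x3red=96, ch3x3=128,
                         ch5x5red=16, ch5x5=32, pool_proj=32)
x = torch.randn(1, 192, 28, 28)
print(inception_3a(x).shape)  # torch.Size([1, 256, 28, 28])
```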
4. ResNet (Residual Networks)
ResNet won the 2015 ImageNet competition. It introduced residual (shortcut) connections, in which a block learns a residual mapping F(x) and outputs F(x) + x; this makes very deep networks much easier to train.
4.1 The Residual Block
```python
class ResidualBlock(nn.Module):
def __init__(self, in_channels, out_channels, stride=1):
super(ResidualBlock, self).__init__()
self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1, bias=False)
self.bn1 = nn.BatchNorm2d(out_channels)
self.relu = nn.ReLU(inplace=True)
self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1, bias=False)
self.bn2 = nn.BatchNorm2d(out_channels)
        # If the stride or the number of channels changes, a 1x1 convolution on the shortcut is needed to match shapes
self.shortcut = nn.Sequential()
if stride != 1 or in_channels != out_channels:
self.shortcut = nn.Sequential(
nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride, bias=False),
nn.BatchNorm2d(out_channels)
)
def forward(self, x):
identity = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
out = self.conv2(out)
out = self.bn2(out)
identity = self.shortcut(identity)
out += identity
out = self.relu(out)
        return out
```
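An illustrative check (our own example): with stride=2 the block halves the spatial resolution and changes the channel count, and the 1×1 shortcut keeps the two paths shape-compatible.

```python
block = ResidualBlock(64, 128, stride=2)
x = torch.randn(1, 64, 56, 56)
print(block(x).shape)  # torch.Size([1, 128, 28, 28])
```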
4.2 ResNet18 Implementation
```python
class ResNet18(nn.Module):
def __init__(self, num_classes=1000):
super(ResNet18, self).__init__()
self.conv1 = nn.Sequential(
nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False),
nn.BatchNorm2d(64),
nn.ReLU(inplace=True),
nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
)
self.layer1 = nn.Sequential(
ResidualBlock(64, 64),
ResidualBlock(64, 64)
)
self.layer2 = nn.Sequential(
ResidualBlock(64, 128, stride=2),
ResidualBlock(128, 128)
)
self.layer3 = nn.Sequential(
ResidualBlock(128, 256, stride=2),
ResidualBlock(256, 256)
)
self.layer4 = nn.Sequential(
ResidualBlock(256, 512, stride=2),
ResidualBlock(512, 512)
)
self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
self.fc = nn.Linear(512, num_classes)
def forward(self, x):
x = self.conv1(x)
x = self.layer1(x)
x = self.layer2(x)
x = self.layer3(x)
x = self.layer4(x)
x = self.avgpool(x)
x = x.view(x.size(0), -1)
x = self.fc(x)
return x
# Instantiate the model
resnet18 = ResNet18(num_classes=4)
```
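A quick sanity check (our own addition) confirms the overall downsampling path 224 → 112 → 56 → 28 → 14 → 7, followed by global average pooling and the final classifier.

```python
x = torch.randn(1, 3, 224, 224)
print(resnet18(x).shape)  # torch.Size([1, 4])
```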
Summary
This article walked through the architectures and implementations of several classic convolutional neural networks:
Key Takeaways
- The importance of data types:
  - PyTorch's different dtypes (float16, float32, int32, int64)
  - the dtypes expected by the loss function
  - how to convert between dtypes
- Classic network architectures:
  - AlexNet: the pioneer of deep CNNs, introducing ReLU and Dropout
  - VGG: deep networks built from small convolution kernels
  - GoogLeNet: Inception modules for multi-scale feature extraction
  - ResNet: residual connections that make very deep networks trainable
- Model training techniques:
  - the importance of weight initialization
  - choosing and adjusting the learning rate
  - using regularization
- Debugging and optimization:
  - understanding error messages
  - debugging the model step by step
  - monitoring the training process
Mastering these classic architectures is essential for learning and applying deep learning; they laid the groundwork for the design of later, more complex networks.