下面是可直接运行的 PyTorch 代码,完整实现了原始 YOLOv1 的网络结构(对应图3-1),并打印每一层的输出尺寸,方便对照理解。
python
import torch
import torch.nn as nn
import torch.nn.functional as F
class YOLOv1(nn.Module):
    """Original YOLOv1 detection network (Redmon et al., CVPR 2016).

    The backbone downsamples the input by a factor of 64, so an input of
    shape (B, 3, 64*S, 64*S) produces an S x S prediction grid; the
    canonical configuration is S = 7 with 448x448 inputs.  Each grid cell
    predicts ``num_bboxes`` boxes (5 values each) plus ``num_classes``
    class probabilities.

    NOTE: unlike the paper, every conv here is followed by BatchNorm.
    """

    def __init__(self, num_classes=20, num_bboxes=2, grid_size=7):
        """
        Args:
            num_classes (int): number of object categories (20 for VOC).
            num_bboxes (int): bounding boxes predicted per grid cell.
            grid_size (int): output grid resolution S; input images must
                be 64 * S pixels on each side (448 for the default S = 7).
        """
        super().__init__()
        self.num_classes = num_classes
        self.num_bboxes = num_bboxes
        self.grid_size = grid_size
        self.layer1 = nn.Sequential(
            # Conv1: 7x7x64-s-2
            nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False),
            nn.BatchNorm2d(64),
            nn.LeakyReLU(0.1),
            # MaxPool: 2x2-s-2
            nn.MaxPool2d(kernel_size=2, stride=2)
        )
        self.layer2 = nn.Sequential(
            # Conv2: 3x3x192
            nn.Conv2d(64, 192, kernel_size=3, stride=1, padding=1, bias=False),
            nn.BatchNorm2d(192),
            nn.LeakyReLU(0.1),
            # MaxPool: 2x2-s-2
            nn.MaxPool2d(kernel_size=2, stride=2)
        )
        self.layer3 = nn.Sequential(
            # Conv3: 1x1x128 -> 3x3x256 -> 1x1x256 -> 3x3x512
            nn.Conv2d(192, 128, kernel_size=1, stride=1, padding=0, bias=False),
            nn.BatchNorm2d(128),
            nn.LeakyReLU(0.1),
            nn.Conv2d(128, 256, kernel_size=3, stride=1, padding=1, bias=False),
            nn.BatchNorm2d(256),
            nn.LeakyReLU(0.1),
            nn.Conv2d(256, 256, kernel_size=1, stride=1, padding=0, bias=False),
            nn.BatchNorm2d(256),
            nn.LeakyReLU(0.1),
            nn.Conv2d(256, 512, kernel_size=3, stride=1, padding=1, bias=False),
            nn.BatchNorm2d(512),
            nn.LeakyReLU(0.1),
            # MaxPool: 2x2-s-2
            nn.MaxPool2d(kernel_size=2, stride=2)
        )
        self.layer4 = nn.Sequential(
            # Conv4: (1x1x256 -> 3x3x512) x4 — 512 channels both in and
            # out of every repeat, so the blocks chain without mismatch.
            *[
                nn.Sequential(
                    nn.Conv2d(512, 256, kernel_size=1, stride=1, padding=0, bias=False),
                    nn.BatchNorm2d(256),
                    nn.LeakyReLU(0.1),
                    nn.Conv2d(256, 512, kernel_size=3, stride=1, padding=1, bias=False),
                    nn.BatchNorm2d(512),
                    nn.LeakyReLU(0.1)
                ) for _ in range(4)
            ],
            # MaxPool: 2x2-s-2
            nn.MaxPool2d(kernel_size=2, stride=2)
        )
        self.layer5 = nn.Sequential(
            # Conv5: (1x1x512 -> 3x3x1024) x2.
            # BUG FIX: the second repeat receives 1024 channels from the
            # first one, so its 1x1 conv must accept 1024 input channels
            # (a fixed 512 raises a shape-mismatch error at runtime).
            *[
                nn.Sequential(
                    nn.Conv2d(512 if i == 0 else 1024, 512,
                              kernel_size=1, stride=1, padding=0, bias=False),
                    nn.BatchNorm2d(512),
                    nn.LeakyReLU(0.1),
                    nn.Conv2d(512, 1024, kernel_size=3, stride=1, padding=1, bias=False),
                    nn.BatchNorm2d(1024),
                    nn.LeakyReLU(0.1)
                ) for i in range(2)
            ],
            # Conv: 3x3x1024-s-2 (final spatial halving, e.g. 14 -> 7)
            nn.Conv2d(1024, 1024, kernel_size=3, stride=2, padding=1, bias=False),
            nn.BatchNorm2d(1024),
            nn.LeakyReLU(0.1)
        )
        self.layer6 = nn.Sequential(
            # Conv6: 3x3x1024 x2 (keeps the S x S resolution)
            nn.Conv2d(1024, 1024, kernel_size=3, stride=1, padding=1, bias=False),
            nn.BatchNorm2d(1024),
            nn.LeakyReLU(0.1),
            nn.Conv2d(1024, 1024, kernel_size=3, stride=1, padding=1, bias=False),
            nn.BatchNorm2d(1024),
            nn.LeakyReLU(0.1)
        )
        # Fully-connected head: flatten the S*S*1024 feature map and
        # regress the per-cell predictions (previously hard-coded to S=7).
        self.fc_layers = nn.Sequential(
            nn.Flatten(),
            nn.Linear(grid_size * grid_size * 1024, 4096),
            nn.LeakyReLU(0.1),
            nn.Dropout(0.5),
            nn.Linear(4096, grid_size * grid_size * (num_bboxes * 5 + num_classes))
        )

    def forward(self, x):
        """Run the network, printing each stage's output shape.

        Args:
            x: image batch of shape (B, 3, 64*grid_size, 64*grid_size).

        Returns:
            Tensor of shape (B, S, S, num_bboxes * 5 + num_classes).
        """
        print(f"输入尺寸: {x.shape}")  # e.g. [B, 3, 448, 448]
        x = self.layer1(x)
        print(f"Layer1 输出: {x.shape}")  # [B, 64, 112, 112]
        x = self.layer2(x)
        print(f"Layer2 输出: {x.shape}")  # [B, 192, 56, 56]
        x = self.layer3(x)
        print(f"Layer3 输出: {x.shape}")  # [B, 512, 28, 28]
        x = self.layer4(x)
        print(f"Layer4 输出: {x.shape}")  # [B, 512, 14, 14]
        x = self.layer5(x)
        print(f"Layer5 输出: {x.shape}")  # [B, 1024, 7, 7]
        x = self.layer6(x)
        print(f"Layer6 输出: {x.shape}")  # [B, 1024, 7, 7]
        x = self.fc_layers(x)
        # Reshape the flat prediction vector to [B, S, S, B*5 + C].
        x = x.view(-1, self.grid_size, self.grid_size,
                   self.num_bboxes * 5 + self.num_classes)
        print(f"最终输出: {x.shape}")  # e.g. [B, 7, 7, 30]
        return x
# Smoke test: push one random batch through the model so forward()
# prints the per-stage output shapes.
if __name__ == "__main__":
    # Expected input layout: [B, 3, 448, 448]
    net = YOLOv1(num_classes=20, num_bboxes=2)
    dummy_batch = torch.randn(2, 3, 448, 448)
    predictions = net(dummy_batch)
运行结果说明
运行代码后,你会看到如下输出,清晰展示了原始 YOLOv1 的尺寸变化:
输入尺寸: torch.Size([2, 3, 448, 448])
Layer1 输出: torch.Size([2, 64, 112, 112])
Layer2 输出: torch.Size([2, 192, 56, 56])
Layer3 输出: torch.Size([2, 512, 28, 28])
Layer4 输出: torch.Size([2, 512, 14, 14])
Layer5 输出: torch.Size([2, 1024, 7, 7])
Layer6 输出: torch.Size([2, 1024, 7, 7])
最终输出: torch.Size([2, 7, 7, 30])
关键设计解读
- 1×1 卷积:用于压缩通道数,减少计算量,同时不改变空间尺寸。
- 3×3 卷积:用于提取空间特征,是网络的核心计算单元。
- 全连接层:将 7×7×1024 的特征图展平为一维向量,通过全连接层回归出最终的检测结果。
- 输出维度:7×7×30,其中 7×7 是网格数量,30 = 2×5 + 20,即每个网格预测 2 个边界框(每框 5 个参数:x、y、w、h、置信度)以及 20 个类别概率。