PyTorch Implementation of the YOLOv1 Network Architecture

Below is runnable PyTorch code that implements the original YOLOv1 network architecture (corresponding to Figure 3-1) and prints each stage's output size so you can follow the shape changes. Note that BatchNorm is inserted after every convolution, a common practice in re-implementations; the original paper used plain convolutions with LeakyReLU (BatchNorm only entered YOLO with v2).

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class YOLOv1(nn.Module):
    def __init__(self, num_classes=20, num_bboxes=2):
        super().__init__()
        self.num_classes = num_classes
        self.num_bboxes = num_bboxes
        
        # Define the network layers
        self.layer1 = nn.Sequential(
            # Conv1: 7x7x64-s-2
            nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False),
            nn.BatchNorm2d(64),
            nn.LeakyReLU(0.1),
            # MaxPool: 2x2-s-2
            nn.MaxPool2d(kernel_size=2, stride=2)
        )
        
        self.layer2 = nn.Sequential(
            # Conv2: 3x3x192
            nn.Conv2d(64, 192, kernel_size=3, stride=1, padding=1, bias=False),
            nn.BatchNorm2d(192),
            nn.LeakyReLU(0.1),
            # MaxPool: 2x2-s-2
            nn.MaxPool2d(kernel_size=2, stride=2)
        )
        
        self.layer3 = nn.Sequential(
            # Conv3: 1x1x128 -> 3x3x256 -> 1x1x256 -> 3x3x512
            nn.Conv2d(192, 128, kernel_size=1, stride=1, padding=0, bias=False),
            nn.BatchNorm2d(128),
            nn.LeakyReLU(0.1),
            nn.Conv2d(128, 256, kernel_size=3, stride=1, padding=1, bias=False),
            nn.BatchNorm2d(256),
            nn.LeakyReLU(0.1),
            nn.Conv2d(256, 256, kernel_size=1, stride=1, padding=0, bias=False),
            nn.BatchNorm2d(256),
            nn.LeakyReLU(0.1),
            nn.Conv2d(256, 512, kernel_size=3, stride=1, padding=1, bias=False),
            nn.BatchNorm2d(512),
            nn.LeakyReLU(0.1),
            # MaxPool: 2x2-s-2
            nn.MaxPool2d(kernel_size=2, stride=2)
        )
        
        self.layer4 = nn.Sequential(
            # Conv4: (1x1x256 -> 3x3x512) x4, then 1x1x512 -> 3x3x1024
            *[
                nn.Sequential(
                    nn.Conv2d(512, 256, kernel_size=1, stride=1, padding=0, bias=False),
                    nn.BatchNorm2d(256),
                    nn.LeakyReLU(0.1),
                    nn.Conv2d(256, 512, kernel_size=3, stride=1, padding=1, bias=False),
                    nn.BatchNorm2d(512),
                    nn.LeakyReLU(0.1)
                ) for _ in range(4)
            ],
            nn.Conv2d(512, 512, kernel_size=1, stride=1, padding=0, bias=False),
            nn.BatchNorm2d(512),
            nn.LeakyReLU(0.1),
            nn.Conv2d(512, 1024, kernel_size=3, stride=1, padding=1, bias=False),
            nn.BatchNorm2d(1024),
            nn.LeakyReLU(0.1),
            # MaxPool: 2x2-s-2
            nn.MaxPool2d(kernel_size=2, stride=2)
        )
        
        self.layer5 = nn.Sequential(
            # Conv5: (1x1x512 -> 3x3x1024) x2
            *[
                nn.Sequential(
                    nn.Conv2d(1024, 512, kernel_size=1, stride=1, padding=0, bias=False),
                    nn.BatchNorm2d(512),
                    nn.LeakyReLU(0.1),
                    nn.Conv2d(512, 1024, kernel_size=3, stride=1, padding=1, bias=False),
                    nn.BatchNorm2d(1024),
                    nn.LeakyReLU(0.1)
                ) for _ in range(2)
            ],
            # Conv: 3x3x1024-s-2
            nn.Conv2d(1024, 1024, kernel_size=3, stride=2, padding=1, bias=False),
            nn.BatchNorm2d(1024),
            nn.LeakyReLU(0.1)
        )
        
        self.layer6 = nn.Sequential(
            # Conv6: 3x3x1024 x2
            nn.Conv2d(1024, 1024, kernel_size=3, stride=1, padding=1, bias=False),
            nn.BatchNorm2d(1024),
            nn.LeakyReLU(0.1),
            nn.Conv2d(1024, 1024, kernel_size=3, stride=1, padding=1, bias=False),
            nn.BatchNorm2d(1024),
            nn.LeakyReLU(0.1)
        )
        
        # Fully connected layers
        self.fc_layers = nn.Sequential(
            nn.Flatten(),
            nn.Linear(7 * 7 * 1024, 4096),
            nn.LeakyReLU(0.1),
            nn.Dropout(0.5),
            nn.Linear(4096, 7 * 7 * (num_bboxes * 5 + num_classes))
        )

    def forward(self, x):
        print(f"输入尺寸: {x.shape}")  # [2, 3, 448, 448]
        
        x = self.layer1(x)
        print(f"Layer1 输出: {x.shape}")  # [2, 64, 112, 112]
        
        x = self.layer2(x)
        print(f"Layer2 输出: {x.shape}")  # [2, 192, 56, 56]
        
        x = self.layer3(x)
        print(f"Layer3 输出: {x.shape}")  # [2, 512, 28, 28]
        
        x = self.layer4(x)
        print(f"Layer4 输出: {x.shape}")  # [2, 512, 14, 14]
        
        x = self.layer5(x)
        print(f"Layer5 输出: {x.shape}")  # [2, 1024, 7, 7]
        
        x = self.layer6(x)
        print(f"Layer6 输出: {x.shape}")  # [2, 1024, 7, 7]
        
        x = self.fc_layers(x)
        # 重塑为 [B, 7, 7, 30]
        x = x.view(-1, 7, 7, self.num_bboxes * 5 + self.num_classes)
        print(f"最终输出: {x.shape}")  # [2, 7, 7, 30]
        
        return x

# Test code
if __name__ == "__main__":
    # Input: [B, 3, 448, 448]
    model = YOLOv1(num_classes=20, num_bboxes=2)
    x = torch.randn(2, 3, 448, 448)
    out = model(x)
```
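As a quick sanity check on model size, the sketch below counts parameters per top-level module; it assumes the `YOLOv1` class defined above is in scope, and the exact totals include the BatchNorm layers that the original paper did not use. The first fully connected layer (7×7×1024 → 4096) dominates the count.

```python
# A minimal sketch: parameter count per top-level module of the YOLOv1 class above.
model = YOLOv1(num_classes=20, num_bboxes=2)
for name, module in model.named_children():
    n = sum(p.numel() for p in module.parameters())
    print(f"{name}: {n / 1e6:.1f}M parameters")
print(f"total: {sum(p.numel() for p in model.parameters()) / 1e6:.1f}M")
```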

Expected Output

Running the code produces the following log, which traces the original YOLOv1's shape changes stage by stage:

```
Input size: torch.Size([2, 3, 448, 448])
Layer1 output: torch.Size([2, 64, 112, 112])
Layer2 output: torch.Size([2, 192, 56, 56])
Layer3 output: torch.Size([2, 512, 28, 28])
Layer4 output: torch.Size([2, 1024, 14, 14])
Layer5 output: torch.Size([2, 1024, 7, 7])
Layer6 output: torch.Size([2, 1024, 7, 7])
Final output: torch.Size([2, 7, 7, 30])
```
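Every halving in this log comes from the same arithmetic: a conv or pool layer maps a spatial size H to floor((H + 2·padding − kernel) / stride) + 1. A minimal sketch of this formula (the `out_size` helper is illustrative, not part of the model):

```python
# A minimal sketch of the size arithmetic behind the log above:
# H_out = floor((H + 2*padding - kernel) / stride) + 1
def out_size(h, kernel, stride, padding=0):
    return (h + 2 * padding - kernel) // stride + 1

h = 448
h = out_size(h, kernel=7, stride=2, padding=3)  # Conv1 7x7-s-2 -> 224
h = out_size(h, kernel=2, stride=2)             # MaxPool 2x2-s-2 -> 112
print(h)  # 112, matching the Layer1 output above
```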

Key Design Notes

  1. 1×1 convolutions compress the channel dimension to cut computation without changing the spatial size.
  2. 3×3 convolutions extract spatial features and are the network's core computational unit.
  3. The fully connected layers flatten the 7×7×1024 feature map into a vector and regress the final detection output.
  4. The output is 7×7×30: 7×7 is the grid, and 30 = 2 × 5 + 20 covers 2 bounding boxes per cell (x, y, w, h, confidence each) plus 20 class probabilities, as sliced in the sketch below.
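A minimal sketch of how one cell of the 7×7×30 tensor can be sliced into its parts; the (box1, box2, classes) ordering is one common convention assumed here, not something the tensor layout itself mandates:

```python
import torch

# A minimal sketch: slice one grid cell's 30 values into boxes and class scores.
# Assumes the YOLOv1 class defined above; the ordering is an assumed convention.
model = YOLOv1(num_classes=20, num_bboxes=2)
out = model(torch.randn(1, 3, 448, 448))  # [1, 7, 7, 30]
cell = out[0, 3, 3]                       # the 30 values of one grid cell
box1 = cell[0:5]                          # x, y, w, h, confidence
box2 = cell[5:10]                         # x, y, w, h, confidence
class_probs = cell[10:30]                 # 20 conditional class probabilities
```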