基于Pytorch的YOLOv1 的网络结构代码

下面是可直接运行的 PyTorch 代码,完整实现了原始 YOLOv1 的网络结构(对应图3-1),并打印每一层的输出尺寸,方便对照理解。

Python 代码如下:
import torch
import torch.nn as nn
import torch.nn.functional as F

class YOLOv1(nn.Module):
    """Original YOLOv1 detection network (paper Fig. 3), in PyTorch.

    Expects input of shape [B, 3, 448, 448] and returns a tensor of shape
    [B, 7, 7, num_bboxes * 5 + num_classes] — i.e. [B, 7, 7, 30] with the
    defaults (2 boxes x 5 values + 20 VOC classes per grid cell).

    NOTE(review): BatchNorm is used after every conv for training stability;
    the original 2016 paper did not use it (BN was introduced in YOLOv2).
    The forward pass prints each stage's output shape for teaching purposes.
    """

    def __init__(self, num_classes=20, num_bboxes=2):
        """
        Args:
            num_classes: number of object classes C (20 for Pascal VOC).
            num_bboxes: bounding boxes B predicted per grid cell (paper: 2).
        """
        super().__init__()
        self.num_classes = num_classes
        self.num_bboxes = num_bboxes

        # Conv1: 7x7x64-s-2, then 2x2-s-2 maxpool  (448 -> 224 -> 112)
        self.layer1 = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False),
            nn.BatchNorm2d(64),
            nn.LeakyReLU(0.1),
            nn.MaxPool2d(kernel_size=2, stride=2)
        )

        # Conv2: 3x3x192, then 2x2-s-2 maxpool  (112 -> 56)
        self.layer2 = nn.Sequential(
            nn.Conv2d(64, 192, kernel_size=3, stride=1, padding=1, bias=False),
            nn.BatchNorm2d(192),
            nn.LeakyReLU(0.1),
            nn.MaxPool2d(kernel_size=2, stride=2)
        )

        # Conv3: 1x1x128 -> 3x3x256 -> 1x1x256 -> 3x3x512, then maxpool  (56 -> 28)
        self.layer3 = nn.Sequential(
            nn.Conv2d(192, 128, kernel_size=1, stride=1, padding=0, bias=False),
            nn.BatchNorm2d(128),
            nn.LeakyReLU(0.1),
            nn.Conv2d(128, 256, kernel_size=3, stride=1, padding=1, bias=False),
            nn.BatchNorm2d(256),
            nn.LeakyReLU(0.1),
            nn.Conv2d(256, 256, kernel_size=1, stride=1, padding=0, bias=False),
            nn.BatchNorm2d(256),
            nn.LeakyReLU(0.1),
            nn.Conv2d(256, 512, kernel_size=3, stride=1, padding=1, bias=False),
            nn.BatchNorm2d(512),
            nn.LeakyReLU(0.1),
            nn.MaxPool2d(kernel_size=2, stride=2)
        )

        # Conv4: (1x1x256 -> 3x3x512) x4, then maxpool  (28 -> 14)
        # Each repetition maps 512 -> 256 -> 512, so chaining is consistent.
        self.layer4 = nn.Sequential(
            *[
                nn.Sequential(
                    nn.Conv2d(512, 256, kernel_size=1, stride=1, padding=0, bias=False),
                    nn.BatchNorm2d(256),
                    nn.LeakyReLU(0.1),
                    nn.Conv2d(256, 512, kernel_size=3, stride=1, padding=1, bias=False),
                    nn.BatchNorm2d(512),
                    nn.LeakyReLU(0.1)
                ) for _ in range(4)
            ],
            nn.MaxPool2d(kernel_size=2, stride=2)
        )

        # Conv5: (1x1x512 -> 3x3x1024) x2, then 3x3x1024-s-2  (14 -> 7)
        # BUGFIX: the second repetition's 1x1 conv receives 1024 channels
        # (the output of the previous 3x3x1024), not 512. The original code
        # declared both repetitions with in_channels=512, which crashes at
        # runtime with a channel-mismatch error.
        conv5_blocks = []
        in_channels = 512
        for _ in range(2):
            conv5_blocks.append(nn.Sequential(
                nn.Conv2d(in_channels, 512, kernel_size=1, stride=1, padding=0, bias=False),
                nn.BatchNorm2d(512),
                nn.LeakyReLU(0.1),
                nn.Conv2d(512, 1024, kernel_size=3, stride=1, padding=1, bias=False),
                nn.BatchNorm2d(1024),
                nn.LeakyReLU(0.1)
            ))
            in_channels = 1024
        self.layer5 = nn.Sequential(
            *conv5_blocks,
            # 3x3x1024-s-2 halves the spatial size (14 -> 7)
            nn.Conv2d(1024, 1024, kernel_size=3, stride=2, padding=1, bias=False),
            nn.BatchNorm2d(1024),
            nn.LeakyReLU(0.1)
        )

        # Conv6: two 3x3x1024 convs, spatial size unchanged (7 -> 7)
        self.layer6 = nn.Sequential(
            nn.Conv2d(1024, 1024, kernel_size=3, stride=1, padding=1, bias=False),
            nn.BatchNorm2d(1024),
            nn.LeakyReLU(0.1),
            nn.Conv2d(1024, 1024, kernel_size=3, stride=1, padding=1, bias=False),
            nn.BatchNorm2d(1024),
            nn.LeakyReLU(0.1)
        )

        # Detection head: flatten 7x7x1024 -> FC 4096 -> FC 7*7*(B*5+C)
        self.fc_layers = nn.Sequential(
            nn.Flatten(),
            nn.Linear(7 * 7 * 1024, 4096),
            nn.LeakyReLU(0.1),
            nn.Dropout(0.5),
            nn.Linear(4096, 7 * 7 * (num_bboxes * 5 + num_classes))
        )

    def forward(self, x):
        """Run the network, printing each stage's output shape.

        Args:
            x: input image batch of shape [B, 3, 448, 448].

        Returns:
            Tensor of shape [B, 7, 7, num_bboxes * 5 + num_classes].
        """
        print(f"输入尺寸: {x.shape}")  # [B, 3, 448, 448]

        x = self.layer1(x)
        print(f"Layer1 输出: {x.shape}")  # [B, 64, 112, 112]

        x = self.layer2(x)
        print(f"Layer2 输出: {x.shape}")  # [B, 192, 56, 56]

        x = self.layer3(x)
        print(f"Layer3 输出: {x.shape}")  # [B, 512, 28, 28]

        x = self.layer4(x)
        print(f"Layer4 输出: {x.shape}")  # [B, 512, 14, 14]

        x = self.layer5(x)
        print(f"Layer5 输出: {x.shape}")  # [B, 1024, 7, 7]

        x = self.layer6(x)
        print(f"Layer6 输出: {x.shape}")  # [B, 1024, 7, 7]

        x = self.fc_layers(x)
        # Reshape the flat FC output to the grid layout [B, 7, 7, B*5+C]
        x = x.view(-1, 7, 7, self.num_bboxes * 5 + self.num_classes)
        print(f"最终输出: {x.shape}")  # [B, 7, 7, 30]

        return x

# Smoke test: build the network and push a dummy batch through it so the
# per-layer shape printouts can be inspected.
if __name__ == "__main__":
    net = YOLOv1(num_classes=20, num_bboxes=2)
    dummy_batch = torch.randn(2, 3, 448, 448)  # [B, 3, 448, 448]
    predictions = net(dummy_batch)

运行结果说明

运行代码后,你会看到如下输出,清晰展示了原始 YOLOv1 的尺寸变化:

输出如下:
输入尺寸: torch.Size([2, 3, 448, 448])
Layer1 输出: torch.Size([2, 64, 112, 112])
Layer2 输出: torch.Size([2, 192, 56, 56])
Layer3 输出: torch.Size([2, 512, 28, 28])
Layer4 输出: torch.Size([2, 512, 14, 14])
Layer5 输出: torch.Size([2, 1024, 7, 7])
Layer6 输出: torch.Size([2, 1024, 7, 7])
最终输出: torch.Size([2, 7, 7, 30])

关键设计解读

  1. 1×1 卷积:用于压缩通道数,减少计算量,同时不改变空间尺寸。
  2. 3×3 卷积:用于提取空间特征,是网络的核心计算单元。
  3. 全连接层:将 7×7×1024 的特征图展平为一维向量,通过全连接层回归出最终的检测结果。
  4. 输出维度为 7×7×30,其中 7×7 是网格数,30 代表每个网格预测 2 个边界框(每框 5 个参数)和 20 个类别概率。
相关推荐
小何code1 天前
人工智能【第10篇】支持向量机SVM:寻找最优分类超平面(长文+代码实现)
人工智能·机器学习·支持向量机
晨启AI1 天前
GPT-5.5 来了!OpenAI 最新提示词指南深度解读
大数据·人工智能·ai·提示词
wayz111 天前
Day 18 编程实战:Keras搭建MLP神经网络
人工智能·神经网络·keras
凯歌的博客1 天前
MetaGPT和Superpowers区别, AI编程
人工智能·ai编程
NaMM CHIN1 天前
Spring Boot + Spring AI快速体验
人工智能·spring boot·spring
一切皆是因缘际会1 天前
可落地数字生命工程:从记忆厮杀到自我意识觉醒全链路,AGI内生智能硅基生命心智建模
人工智能·深度学习·算法·机器学习·ai·系统架构·agi
70asunflower1 天前
从CPU市场到AI算力格局:一场关于技术路线、商业逻辑与生态锁定的深度梳理
人工智能
地球资源数据云1 天前
中国陆地生态系统主要植物功能特征空间分布数据
大数据·数据库·人工智能·机器学习
AI创界者1 天前
最新RedMix-Ernie-Image整合包,解压即用:文生图、图生图,n卡8G显存玩转4K
人工智能
月诸清酒1 天前
51-260503 AI 科技日报 (ChatGPT图像功能用户量暴涨,新用户占六成)
人工智能·chatgpt