基于 PyTorch 的 YOLOv1 网络结构代码

下面是可直接运行的 PyTorch 代码,完整实现了原始 YOLOv1 的网络结构(对应图3-1),并打印每一层的输出尺寸,方便对照理解。

Python 代码如下:
import torch
import torch.nn as nn
import torch.nn.functional as F

class YOLOv1(nn.Module):
    """YOLOv1 backbone + detection head (Redmon et al., CVPR 2016, Fig. 3).

    Input:  [B, 3, 448, 448]
    Output: [B, 7, 7, num_bboxes * 5 + num_classes] (default [B, 7, 7, 30])

    Args:
        num_classes: number of object classes C (20 for PASCAL VOC).
        num_bboxes: boxes predicted per grid cell B (2 in the paper).

    NOTE: BatchNorm is kept from the tutorial code for training stability,
    although the original 2016 paper did not use it (BN arrived in YOLOv2).
    """

    def __init__(self, num_classes=20, num_bboxes=2):
        super().__init__()
        self.num_classes = num_classes
        self.num_bboxes = num_bboxes

        def conv_bn(in_ch, out_ch, k, s=1):
            # One Conv -> BN -> LeakyReLU(0.1) stanza, returned as a list so
            # it can be splatted into nn.Sequential. padding=k//2 gives
            # "same" spatial size for k=1/3 and the paper's pad=3 for k=7.
            return [
                nn.Conv2d(in_ch, out_ch, kernel_size=k, stride=s,
                          padding=k // 2, bias=False),
                nn.BatchNorm2d(out_ch),
                nn.LeakyReLU(0.1),
            ]

        # Group 1: 7x7x64-s-2 + maxpool 2x2-s-2  (448 -> 112)
        self.layer1 = nn.Sequential(
            *conv_bn(3, 64, 7, s=2),
            nn.MaxPool2d(kernel_size=2, stride=2),
        )

        # Group 2: 3x3x192 + maxpool  (112 -> 56)
        self.layer2 = nn.Sequential(
            *conv_bn(64, 192, 3),
            nn.MaxPool2d(kernel_size=2, stride=2),
        )

        # Group 3: 1x1x128 -> 3x3x256 -> 1x1x256 -> 3x3x512 + maxpool  (56 -> 28)
        self.layer3 = nn.Sequential(
            *conv_bn(192, 128, 1),
            *conv_bn(128, 256, 3),
            *conv_bn(256, 256, 1),
            *conv_bn(256, 512, 3),
            nn.MaxPool2d(kernel_size=2, stride=2),
        )

        # Group 4: {1x1x256, 3x3x512} x4, then 1x1x512, 3x3x1024 + maxpool
        # (28 -> 14). The trailing 1x1x512 / 3x3x1024 pair is in the paper's
        # Fig. 3 but is missing from many simplified re-implementations.
        group4 = []
        for _ in range(4):
            group4 += conv_bn(512, 256, 1) + conv_bn(256, 512, 3)
        self.layer4 = nn.Sequential(
            *group4,
            *conv_bn(512, 512, 1),
            *conv_bn(512, 1024, 3),
            nn.MaxPool2d(kernel_size=2, stride=2),
        )

        # Group 5: {1x1x512, 3x3x1024} x2, 3x3x1024, 3x3x1024-s-2  (14 -> 7)
        group5 = []
        for _ in range(2):
            group5 += conv_bn(1024, 512, 1) + conv_bn(512, 1024, 3)
        self.layer5 = nn.Sequential(
            *group5,
            *conv_bn(1024, 1024, 3),
            *conv_bn(1024, 1024, 3, s=2),
        )

        # Group 6: 3x3x1024 x2  (stays 7x7)
        self.layer6 = nn.Sequential(
            *conv_bn(1024, 1024, 3),
            *conv_bn(1024, 1024, 3),
        )

        # Detection head: flatten 7x7x1024, FC-4096 (+LeakyReLU, dropout 0.5
        # as in the paper), then FC to the S*S*(B*5+C) prediction vector.
        self.fc_layers = nn.Sequential(
            nn.Flatten(),
            nn.Linear(7 * 7 * 1024, 4096),
            nn.LeakyReLU(0.1),
            nn.Dropout(0.5),
            nn.Linear(4096, 7 * 7 * (num_bboxes * 5 + num_classes)),
        )

    def forward(self, x):
        """Run the network and print each stage's output size.

        Args:
            x: image batch of shape [B, 3, 448, 448].

        Returns:
            Predictions of shape [B, 7, 7, num_bboxes * 5 + num_classes].
        """
        print(f"输入尺寸: {x.shape}")  # [B, 3, 448, 448]

        x = self.layer1(x)
        print(f"Layer1 输出: {x.shape}")  # [B, 64, 112, 112]

        x = self.layer2(x)
        print(f"Layer2 输出: {x.shape}")  # [B, 192, 56, 56]

        x = self.layer3(x)
        print(f"Layer3 输出: {x.shape}")  # [B, 512, 28, 28]

        x = self.layer4(x)
        print(f"Layer4 输出: {x.shape}")  # [B, 1024, 14, 14]

        x = self.layer5(x)
        print(f"Layer5 输出: {x.shape}")  # [B, 1024, 7, 7]

        x = self.layer6(x)
        print(f"Layer6 输出: {x.shape}")  # [B, 1024, 7, 7]

        x = self.fc_layers(x)
        # Reshape the flat prediction vector onto the S x S grid.
        x = x.view(-1, 7, 7, self.num_bboxes * 5 + self.num_classes)
        print(f"最终输出: {x.shape}")  # [B, 7, 7, 30] with defaults

        return x

# Smoke test: push one dummy batch through the network so the per-layer
# shape printout in forward() can be inspected.
if __name__ == "__main__":
    net = YOLOv1(num_classes=20, num_bboxes=2)
    dummy_batch = torch.randn(2, 3, 448, 448)  # [B, 3, 448, 448]
    predictions = net(dummy_batch)

运行结果说明

运行代码后,你会看到如下输出,清晰展示了原始 YOLOv1 的尺寸变化:

运行输出:
输入尺寸: torch.Size([2, 3, 448, 448])
Layer1 输出: torch.Size([2, 64, 112, 112])
Layer2 输出: torch.Size([2, 192, 56, 56])
Layer3 输出: torch.Size([2, 512, 28, 28])
Layer4 输出: torch.Size([2, 512, 14, 14])
Layer5 输出: torch.Size([2, 1024, 7, 7])
Layer6 输出: torch.Size([2, 1024, 7, 7])
最终输出: torch.Size([2, 7, 7, 30])

关键设计解读

  1. 1×1 卷积:用于压缩通道数,减少计算量,同时不改变空间尺寸。
  2. 3×3 卷积:用于提取空间特征,是网络的核心计算单元。
  3. 全连接层:将 7×7×1024 的特征图展平为一维向量,通过全连接层回归出最终的检测结果。
  4. 输出维度为 7×7×30:其中 7×7 是网格数,30 代表每个网格预测 2 个边界框(每个框 5 个参数:x、y、w、h、置信度)和 20 个类别概率。
相关推荐
美酒没故事°19 小时前
Open WebUI安装指南。搭建自己的自托管 AI 平台
人工智能·windows·ai
云烟成雨TD20 小时前
Spring AI Alibaba 1.x 系列【6】ReactAgent 同步执行 & 流式执行
java·人工智能·spring
AI攻城狮20 小时前
用 Obsidian CLI + LLM 构建本地 RAG:让你的笔记真正「活」起来
人工智能·云原生·aigc
鸿乃江边鸟20 小时前
Nanobot 从onboard启动命令来看个人助理Agent的实现
人工智能·ai
lpfasd12320 小时前
基于Cloudflare生态的应用部署与开发全解
人工智能·agent·cloudflare
俞凡20 小时前
DevOps 2.0:智能体如何接管故障修复和基础设施维护
人工智能
comedate20 小时前
[OpenClaw] GLM 5 关于电影 - 人工智能 - 的思考
人工智能·电影评价
财迅通Ai20 小时前
6000万吨产能承压 卫星化学迎来战略窗口期
大数据·人工智能·物联网·卫星化学
liliangcsdn20 小时前
Agent Memory智能体记忆系统的示例分析
数据库·人工智能·全文检索
GISer_Jing20 小时前
Page-agent MCP结构
前端·人工智能