04-进阶方向: 01-计算机视觉(CV)——目标检测:SSD

目标检测:SSD(Single Shot MultiBox Detector)

一、SSD核心思想

1.1 SSD vs 其他检测器

python 复制代码
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle
import warnings
warnings.filterwarnings('ignore')

def _draw_arrow(axis, tail, head):
    """Draw a plain '->' arrow on ``axis`` running from ``tail`` to ``head``."""
    axis.annotate('', xy=head, xytext=tail, arrowprops=dict(arrowstyle='->', lw=1))


# Section banner.
rule = "=" * 60
print(rule)
print("SSD:单阶段多尺度检测器")
print(rule)

# One figure with three side-by-side panels: Faster R-CNN, YOLO and SSD.
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
ax1, ax2, ax3 = axes

# Panel 1 - Faster R-CNN: three pipeline stages chained left to right.
ax1.axis('off')
ax1.set_title('Faster R-CNN\n(两阶段)', fontsize=10)
for stage_x, stage_name in ((0.3, "RPN"), (0.6, "ROI Pooling"), (0.9, "分类+回归")):
    ax1.add_patch(plt.Circle((stage_x, 0.5), 0.1, color='lightcoral', ec='black'))
    ax1.text(stage_x, 0.5, stage_name, ha='center', va='center', fontsize=7)
# Each arrow spans the gap between two neighbouring stage circles.
_draw_arrow(ax1, (0.4, 0.5), (0.5, 0.5))
_draw_arrow(ax1, (0.7, 0.5), (0.8, 0.5))

# Panel 2 - YOLO: a single node, one prediction pass.
ax2.axis('off')
ax2.set_title('YOLO\n(单阶段,单尺度)', fontsize=10)
ax2.add_patch(plt.Circle((0.5, 0.5), 0.15, color='lightgreen', ec='black'))
ax2.text(0.5, 0.5, 'CNN\n一次性预测', ha='center', va='center', fontsize=8)

# Panel 3 - SSD: one node per prediction scale.
ax3.axis('off')
ax3.set_title('SSD\n(单阶段,多尺度)', fontsize=10)

# (x, y, caption, fill colour) for every scale node.
scale_nodes = (
    (0.2, 0.7, '大物体\n预测', 'lightblue'),
    (0.4, 0.5, '中物体\n预测', 'lightgreen'),
    (0.6, 0.3, '小物体\n预测', 'lightyellow'),
)
for node_x, node_y, caption, fill in scale_nodes:
    ax3.add_patch(plt.Circle((node_x, node_y), 0.08, color=fill, ec='black'))
    ax3.text(node_x, node_y, caption, ha='center', va='center', fontsize=6)

# All three arrows end at the same point (0.5, 0.5).
for tail in ((0.28, 0.65), (0.48, 0.45), (0.68, 0.25)):
    _draw_arrow(ax3, tail, (0.5, 0.5))

plt.suptitle('SSD vs 其他检测器', fontsize=14)
plt.tight_layout()
plt.show()

# Key ideas, printed one per line.
print("\n💡 SSD核心思想:")
for idea in (
    "   1. 单阶段: 直接预测边界框和类别,无需候选区域",
    "   2. 多尺度: 使用不同大小的特征图检测不同尺寸的物体",
    "   3. 默认框: 在每个特征图位置预定义多个默认框",
    "   4. 速度快: 同时达到高精度和高速度",
):
    print(idea)

二、多尺度特征图

2.1 特征金字塔

python 复制代码
def multi_scale_features():
    """Plot the six SSD feature-map grids and print a short summary.

    Draws a 2x3 figure with one panel per feature-map resolution
    (38, 19, 10, 5, 3 and 1 cells per side). Every panel shows faint grey
    guide lines plus one blue-outlined square per cell. A text summary
    relating feature-map size to object size is printed afterwards.
    """
    rule = "=" * 60
    print("\n" + rule)
    print("多尺度特征图")
    print(rule)

    fig, axes = plt.subplots(2, 3, figsize=(15, 10))

    # (subplot row, subplot column, title, description, cells per side)
    panels = (
        (0, 0, '38×38\n(大特征图)', '检测小物体', 38),
        (0, 1, '19×19\n(中等特征图)', '检测中等物体', 19),
        (0, 2, '10×10\n(小特征图)', '检测大物体', 10),
        (1, 0, '5×5', '', 5),
        (1, 1, '3×3', '', 3),
        (1, 2, '1×1', '', 1),
    )

    for row, col, title, desc, size in panels:
        panel = axes[row, col]

        # Faint guide lines at every cell boundary.
        for k in range(size):
            panel.axhline(y=k/size, color='gray', linestyle='-', alpha=0.3)
            panel.axvline(x=k/size, color='gray', linestyle='-', alpha=0.3)

        # One outlined square per feature-map cell, added row by row.
        step = 1/size
        cells = ((i, j) for i in range(size) for j in range(size))
        for i, j in cells:
            panel.add_patch(
                Rectangle((j*step, i*step), step, step,
                          facecolor='none', edgecolor='blue', alpha=0.5)
            )

        panel.set_title(f'{title}\n{desc}', fontsize=10)
        panel.set_xlim(0, 1)
        panel.set_ylim(0, 1)
        panel.set_aspect('equal')
        panel.axis('off')

    plt.suptitle('SSD多尺度特征图', fontsize=14)
    plt.tight_layout()
    plt.show()

    for line in (
        "\n📊 特征图与感受野:",
        "   - 浅层特征图(大尺寸): 感受野小 → 检测小物体",
        "   - 深层特征图(小尺寸): 感受野大 → 检测大物体",
        "\n   SSD使用6个尺度的特征图:",
        "   38×38 → 19×19 → 10×10 → 5×5 → 3×3 → 1×1",
    ):
        print(line)


multi_scale_features()

2.2 特征图上的检测

python 复制代码
def feature_map_detection():
    """Draw the SSD feature pyramid with one detection head per level.

    Each pyramid level is a coloured box on the left with a grey grid hinting
    at its resolution. An arrow connects it to a "detection head" box on the
    right. After the figure is shown, a short summary of what is predicted at
    every feature-map location is printed.

    The pyramid lists all six SSD feature maps, including the final
    Conv11_2 1x1 map, so the figure agrees with the six scales described in
    the rest of this tutorial.
    """

    fig, ax = plt.subplots(figsize=(12, 8))
    ax.axis('off')

    # Height of one level box and the left edge / width of the feature-map box.
    box_h = 0.15
    map_x, map_w = 0.1, 0.3

    # Six levels of height 0.15 stacked without gaps fill y = 0.05 .. 0.95.
    levels = [
        {'size': 38, 'y': 0.80, 'color': 'lightblue', 'label': 'Conv4_3\n38×38'},
        {'size': 19, 'y': 0.65, 'color': 'lightgreen', 'label': 'Conv7\n19×19'},
        {'size': 10, 'y': 0.50, 'color': 'lightyellow', 'label': 'Conv8_2\n10×10'},
        {'size': 5, 'y': 0.35, 'color': 'lightcoral', 'label': 'Conv9_2\n5×5'},
        {'size': 3, 'y': 0.20, 'color': 'lightpink', 'label': 'Conv10_2\n3×3'},
        {'size': 1, 'y': 0.05, 'color': 'lavender', 'label': 'Conv11_2\n1×1'},
    ]

    for level in levels:
        size = level['size']
        y = level['y']
        color = level['color']
        label = level['label']

        # Grid lines inside the feature-map box. The xmin/xmax and ymin/ymax
        # arguments are axes fractions; they line up with the data
        # coordinates only because both axis limits are fixed to 0..1 below.
        for i in range(size):
            ax.axhline(y=y + i/size*box_h, xmin=map_x, xmax=map_x + map_w,
                      color='gray', alpha=0.3, linewidth=0.5)
            ax.axvline(x=map_x + i/size*map_w, ymin=y, ymax=y + box_h,
                      color='gray', alpha=0.3, linewidth=0.5)

        # Feature-map box and its label.
        rect = Rectangle((map_x, y), map_w, box_h,
                        facecolor=color, ec='black', alpha=0.7)
        ax.add_patch(rect)
        ax.text(map_x + map_w/2, y + box_h/2, label,
                ha='center', va='center', fontsize=8)

        # Detection head box.
        head_rect = Rectangle((0.55, y), 0.15, box_h,
                             facecolor='lightgray', ec='black')
        ax.add_patch(head_rect)
        ax.text(0.625, y + box_h/2, '检测头\n(分类+回归)',
                ha='center', va='center', fontsize=7)

        # Arrow from the feature map's right edge to the head's left edge.
        ax.annotate('', xy=(0.55, y + box_h/2), xytext=(map_x + map_w, y + box_h/2),
                   arrowprops=dict(arrowstyle='->', lw=1))

    ax.set_xlim(0, 1)
    ax.set_ylim(0, 1)
    ax.set_title('SSD多尺度特征图检测', fontsize=14)

    plt.tight_layout()
    plt.show()

    print("\n📐 每个特征图位置的预测:")
    print("   对于每个特征图上的每个位置,预测:")
    print("   - 默认框偏移: (cx, cy, w, h)")
    print("   - 类别分数: (c1, c2, ..., ck)")
    print("\n   总预测数 = Σ (feature_size² × num_default_boxes)")


feature_map_detection()

三、默认框(Default Box)

3.1 默认框设计

python 复制代码
def default_boxes():
    """Illustrate SSD default boxes and print the per-layer box count.

    Left panel: nine rectangles sharing one centre, three scales (one colour
    each) times three aspect ratios. Right panel: a text block with the
    default-box parameters. Afterwards the box count for the six feature maps
    is printed.
    """
    print("\n" + "=" * 60)
    print("默认框(Default Box)")
    print("=" * 60)

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

    # ---- left panel: boxes of different scale / aspect ratio ----
    ax1.set_title('默认框设计', fontsize=12)

    cx, cy = 0.5, 0.5
    scale_colors = ((0.1, 'blue'), (0.2, 'green'), (0.3, 'red'))
    ratios = (0.5, 1.0, 2.0)

    # Width grows and height shrinks with sqrt(ratio), so the box area
    # scale**2 stays the same for every ratio of a given scale.
    combos = ((s, c, r) for s, c in scale_colors for r in ratios)
    for scale, color, ratio in combos:
        w = scale * np.sqrt(ratio)
        h = scale / np.sqrt(ratio)
        ax1.add_patch(
            Rectangle((cx - w/2, cy - h/2), w, h,
                      facecolor='none', edgecolor=color, linewidth=1.5)
        )

    # Mark the shared centre point.
    ax1.plot(cx, cy, 'ko', markersize=8)
    ax1.set_xlim(0, 1)
    ax1.set_ylim(0, 1)
    ax1.set_aspect('equal')
    ax1.grid(True, alpha=0.3)

    # ---- right panel: parameter cheat sheet ----
    ax2.axis('off')
    ax2.set_title('默认框参数', fontsize=12)

    param_text = """
    📐 默认框参数:
    
    1. 尺度 (Scale):
       s_k = s_min + (s_max - s_min) * (k-1)/(m-1)
       其中 s_min=0.2, s_max=0.9
    
    2. 比例 (Aspect Ratio):
       a_r ∈ {1, 2, 3, 1/2, 1/3}
    
    3. 每个默认框预测:
       - 边界框偏移 (Δcx, Δcy, Δw, Δh)
       - 类别分数 (C个类别)
    
    4. 总默认框数:
       Σ (feature_size² × num_boxes)
    """

    ax2.text(0.05, 0.95, param_text, transform=ax2.transAxes,
             fontsize=10, verticalalignment='top', fontfamily='monospace')

    plt.suptitle('SSD默认框设计', fontsize=14)
    plt.tight_layout()
    plt.show()

    for line in (
        "\n📊 默认框数量统计:",
        "   特征图尺寸: 38×38, 19×19, 10×10, 5×5, 3×3, 1×1",
        "   每位置默认框数: 4, 6, 6, 6, 4, 4",
        "   总默认框数: 38²×4 + 19²×6 + 10²×6 + 5²×6 + 3²×4 + 1²×4 = 8732",
    ):
        print(line)


default_boxes()

四、SSD网络结构

4.1 完整架构

python 复制代码
def ssd_architecture():
    """Draw a schematic of the SSD network and print its components.

    The figure contains the VGG16 base box, the extra-convolution box, six
    detection-head boxes (one per feature map) and an NMS box. Arrows run
    base -> extra layers, towards every detection head, and from every
    detection head into NMS. A four-line component summary is printed after
    the figure is shown.
    """

    print("\n" + "=" * 60)
    print("SSD网络架构")
    print("=" * 60)

    fig, ax = plt.subplots(figsize=(14, 10))
    ax.axis('off')

    # Base network (VGG16).
    base = plt.Rectangle((0.05, 0.8), 0.2, 0.12,
                        facecolor='lightblue', ec='black')
    ax.add_patch(base)
    ax.text(0.15, 0.86, 'VGG16\n(基础网络)', ha='center', va='center', fontsize=8)

    # Extra convolution layers, fed by the base network.
    extra = plt.Rectangle((0.35, 0.8), 0.2, 0.12,
                         facecolor='lightgreen', ec='black')
    ax.add_patch(extra)
    ax.text(0.45, 0.86, '额外卷积层\n(多尺度特征)', ha='center', va='center', fontsize=8)
    ax.annotate('', xy=(0.35, 0.86), xytext=(0.25, 0.86),
               arrowprops=dict(arrowstyle='->', lw=1))

    # One detection head per feature map: (x, y, label, fill colour).
    detection_heads = [
        (0.65, 0.8, 'Conv4_3\n(38×38)', 'lightyellow'),
        (0.65, 0.65, 'Conv7\n(19×19)', 'lightyellow'),
        (0.65, 0.5, 'Conv8_2\n(10×10)', 'lightyellow'),
        (0.65, 0.35, 'Conv9_2\n(5×5)', 'lightyellow'),
        (0.65, 0.2, 'Conv10_2\n(3×3)', 'lightyellow'),
        (0.65, 0.05, 'Conv11_2\n(1×1)', 'lightyellow'),
    ]

    for x, y, label, color in detection_heads:
        box = plt.Rectangle((x, y), 0.15, 0.1,
                           facecolor=color, ec='black')
        ax.add_patch(box)
        ax.text(x+0.075, y+0.05, label, ha='center', va='center', fontsize=7)

        # Incoming feature arrow, ending at the middle of the head's left edge.
        ax.annotate('', xy=(x, y+0.05), xytext=(0.55, y+0.05),
                   arrowprops=dict(arrowstyle='->', lw=1))

    # NMS post-processing box.
    nms = plt.Rectangle((0.05, 0.05), 0.2, 0.12,
                       facecolor='lightcoral', ec='black')
    ax.add_patch(nms)
    ax.text(0.15, 0.11, 'NMS\n(非极大值抑制)', ha='center', va='center', fontsize=8)

    # Connect every detection head to NMS. annotate() draws from xytext
    # (tail) to xy (arrowhead), so the tail sits on the head's lower-left
    # corner and the arrowhead on the middle of the NMS box's right edge.
    nms_entry = (0.25, 0.11)
    for x, y, _, _ in detection_heads:
        ax.annotate('', xy=nms_entry, xytext=(x, y),
                   arrowprops=dict(arrowstyle='->', lw=1, connectionstyle='arc3,rad=0.3'))

    ax.set_xlim(0, 1)
    ax.set_ylim(0, 1)
    ax.set_title('SSD网络架构', fontsize=14)

    plt.tight_layout()
    plt.show()

    print("\n📊 SSD网络组成:")
    print("   1. 基础网络: VGG16(截断)")
    print("   2. 额外卷积层: 产生多尺度特征图")
    print("   3. 检测头: 每个特征图上的卷积预测")
    print("   4. NMS: 后处理去除重复框")


ssd_architecture()

五、训练与损失函数

5.1 匹配策略

python 复制代码
def matching_strategy():
    """Illustrate how default boxes are matched to a ground-truth box.

    Draws one red ground-truth rectangle and five default boxes. A default
    box counts as a positive sample only when its IoU is strictly greater
    than the threshold (0.5). Positives are drawn green and labelled with
    their IoU; all others are drawn blue. Colour and label come from the same
    test, so they cannot disagree. The matching rules are printed afterwards.

    The IoU numbers are hard-coded illustrative values; they are not computed
    from the rectangles drawn here.
    """

    print("\n" + "=" * 60)
    print("默认框匹配策略")
    print("=" * 60)

    fig, ax = plt.subplots(figsize=(10, 8))

    # Ground-truth box.
    gt_rect = Rectangle((0.3, 0.4), 0.4, 0.3,
                       facecolor='none', edgecolor='red', linewidth=2)
    ax.add_patch(gt_rect)
    ax.text(0.5, 0.55, '真实框', ha='center', va='center', fontsize=10, color='red')

    # Positive-sample threshold; matches the figure title and printed rule 2.
    iou_threshold = 0.5

    # Default boxes as (centre x, centre y, width, height, illustrative IoU).
    default_boxes_list = [
        (0.2, 0.35, 0.15, 0.2, 0.3),
        (0.4, 0.45, 0.2, 0.25, 0.7),
        (0.6, 0.5, 0.25, 0.2, 0.65),
        (0.35, 0.3, 0.1, 0.15, 0.2),
        (0.55, 0.6, 0.2, 0.3, 0.5),
    ]

    for x, y, w, h, iou in default_boxes_list:
        is_positive = iou > iou_threshold
        color = 'green' if is_positive else 'blue'

        rect = Rectangle((x-w/2, y-h/2), w, h,
                        facecolor='none', edgecolor=color, linewidth=1.5, alpha=0.7)
        ax.add_patch(rect)

        # Only positive samples get an IoU label.
        if is_positive:
            ax.text(x, y, f'IoU={iou}', ha='center', va='center', fontsize=8, color='green')

    ax.set_xlim(0, 1)
    ax.set_ylim(0, 1)
    ax.set_aspect('equal')
    ax.set_title('默认框匹配(IoU > 0.5为正样本)', fontsize=12)
    ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    print("\n📐 匹配策略:")
    print("   1. 每个真实框匹配IoU最大的默认框")
    print("   2. 默认框与任一真实框IoU > 0.5则匹配")
    print("   3. 未匹配的默认框为负样本")
    print("   4. 正负样本比例1:3平衡")


matching_strategy()

5.2 损失函数

python 复制代码
def ssd_loss():
    """Show two text panels: the SSD loss formula and hard negative mining.

    Prints a section banner, then renders a 1x2 figure. Both panels have
    their axes hidden and hold a monospace text block anchored at the
    top-left corner.
    """
    rule = "=" * 60
    print("\n" + rule)
    print("SSD损失函数")
    print(rule)

    fig, axes = plt.subplots(1, 2, figsize=(12, 5))

    # Left panel: composition of the loss.
    loss_text = """
    📐 SSD总损失:
    
    L(x, c, l, g) = 1/N [L_conf(x, c) + α L_loc(x, l, g)]
    
    其中:
    • N: 正样本数量
    • L_conf: 置信度损失(分类)
    • L_loc: 定位损失(回归)
    • α: 权重系数(通常α=1)
    
    定位损失 (Smooth L1):
    L_loc = Σ smooth_L1(l_i^m - g_j^m)
    
    置信度损失 (Softmax):
    L_conf = -log(softmax(c_i^p))
    """

    # Right panel: balancing positive and negative samples.
    hard_mining_text = """
    🔧 难例挖掘 (Hard Negative Mining):
    
    问题: 负样本远多于正样本
    
    解决方案:
    1. 按置信度损失排序所有负样本
    2. 选择损失最大的负样本
    3. 保持正负样本比例 1:3
    
    作用:
    • 平衡正负样本
    • 加速收敛
    • 提高检测精度
    """

    # Render both panels with identical text styling, left panel first.
    panels = (
        ('损失函数组成', loss_text),
        ('难例挖掘', hard_mining_text),
    )
    for panel, (title, body) in zip(axes, panels):
        panel.axis('off')
        panel.set_title(title, fontsize=12)
        panel.text(0.05, 0.95, body, transform=panel.transAxes,
                   fontsize=10, verticalalignment='top', fontfamily='monospace')

    plt.tight_layout()
    plt.show()


ssd_loss()

六、代码实现示例

6.1 SSD检测头

python 复制代码
def ssd_detection_head():
    """Print an illustrative PyTorch implementation of the SSD detection head.

    Prints a section banner followed by a code listing; nothing is returned
    and the listing is not executed here.

    In the listing, ``SSDDetectionHead.__init__`` stores ``num_classes`` on
    the instance because ``forward`` reads ``self.num_classes`` when it
    reshapes the class predictions. Without that assignment the first forward
    pass raises ``AttributeError``.

    NOTE(review): the listed ``SSD`` class calls ``_vgg_layers``,
    ``_extra_layers`` and ``_generate_default_boxes``, which the listing does
    not define, so it is a sketch rather than a runnable model.
    """

    print("\n" + "=" * 60)
    print("SSD检测头代码")
    print("=" * 60)

    code = """
import torch
import torch.nn as nn
import torch.nn.functional as F

class SSDDetectionHead(nn.Module):
    def __init__(self, in_channels, num_classes, num_anchors):
        super(SSDDetectionHead, self).__init__()
        
        self.num_classes = num_classes
        
        # 分类分支
        self.cls_head = nn.Conv2d(in_channels, num_anchors * num_classes, 
                                   kernel_size=3, padding=1)
        # 回归分支
        self.loc_head = nn.Conv2d(in_channels, num_anchors * 4, 
                                   kernel_size=3, padding=1)
        
    def forward(self, x):
        batch_size = x.size(0)
        
        # 分类预测
        cls_pred = self.cls_head(x)
        cls_pred = cls_pred.permute(0, 2, 3, 1).contiguous()
        cls_pred = cls_pred.view(batch_size, -1, self.num_classes)
        
        # 位置预测
        loc_pred = self.loc_head(x)
        loc_pred = loc_pred.permute(0, 2, 3, 1).contiguous()
        loc_pred = loc_pred.view(batch_size, -1, 4)
        
        return cls_pred, loc_pred

class SSD(nn.Module):
    def __init__(self, num_classes=21):
        super(SSD, self).__init__()
        
        self.num_classes = num_classes
        
        # 基础网络 VGG16
        self.base = self._vgg_layers()
        
        # 额外层
        self.extras = self._extra_layers()
        
        # 检测头配置
        self.detection_heads = nn.ModuleList([
            SSDDetectionHead(512, num_classes, 4),   # conv4_3
            SSDDetectionHead(1024, num_classes, 6),  # conv7
            SSDDetectionHead(512, num_classes, 6),   # conv8_2
            SSDDetectionHead(256, num_classes, 6),   # conv9_2
            SSDDetectionHead(256, num_classes, 4),   # conv10_2
            SSDDetectionHead(256, num_classes, 4),   # conv11_2
        ])
        
        # 默认框配置
        self.default_boxes = self._generate_default_boxes()
    
    def forward(self, x):
        sources = []
        
        # 提取基础网络特征
        for layer in self.base:
            x = layer(x)
            if layer == self.base[23]:  # conv4_3
                sources.append(x)
        
        # 提取额外层特征
        for layer in self.extras:
            x = layer(x)
            sources.append(x)
        
        # 多尺度检测
        cls_preds = []
        loc_preds = []
        
        for source, head in zip(sources, self.detection_heads):
            cls, loc = head(source)
            cls_preds.append(cls)
            loc_preds.append(loc)
        
        cls_preds = torch.cat(cls_preds, dim=1)
        loc_preds = torch.cat(loc_preds, dim=1)
        
        return cls_preds, loc_preds
"""

    print(code)


ssd_detection_head()

6.2 默认框生成

python 复制代码
def default_boxes_generation():
    """Print an illustrative default-box generator for SSD.

    Prints a section banner followed by a code listing; nothing is returned
    and the listing is not executed here.

    The listing is held in a ``'''``-delimited literal because it contains a
    ``\"\"\"``-quoted docstring of its own. Delimiting the outer literal with
    ``\"\"\"`` as well ends the string at that inner docstring and makes the
    whole file a syntax error.

    The listing expects ``np`` (numpy) and ``torch`` to be in scope already.

    NOTE(review): for the last feature map the listing clamps the "next"
    scale to the last entry of ``scales``, so its second square box has the
    same size as the first. Confirm whether a larger final scale was intended.
    """

    print("\n" + "=" * 60)
    print("默认框生成")
    print("=" * 60)

    code = '''
def generate_default_boxes(feature_maps, aspect_ratios, scales):
    """生成默认框"""
    default_boxes = []
    
    for k, (feature_size, ratios, scale) in enumerate(zip(feature_maps, aspect_ratios, scales)):
        # 特征图上的每个位置
        for i in range(feature_size):
            for j in range(feature_size):
                # 中心坐标
                cx = (j + 0.5) / feature_size
                cy = (i + 0.5) / feature_size
                
                # 尺度
                s_k = scale
                s_k_plus = scales[min(k+1, len(scales)-1)]
                
                # 添加第一个默认框
                default_boxes.append([cx, cy, s_k, s_k])
                
                # 添加第二个默认框
                default_boxes.append([cx, cy, np.sqrt(s_k * s_k_plus), np.sqrt(s_k * s_k_plus)])
                
                # 添加比例框
                for ratio in ratios:
                    w = s_k * np.sqrt(ratio)
                    h = s_k / np.sqrt(ratio)
                    default_boxes.append([cx, cy, w, h])
                    if ratio != 1:
                        default_boxes.append([cx, cy, h, w])
    
    return torch.tensor(default_boxes)

# 配置
feature_maps = [38, 19, 10, 5, 3, 1]
aspect_ratios = [[2], [2, 3], [2, 3], [2, 3], [2], [2]]
scales = [0.1, 0.2, 0.375, 0.55, 0.725, 0.9]

default_boxes = generate_default_boxes(feature_maps, aspect_ratios, scales)
print(f"总默认框数: {len(default_boxes)}")
'''

    print(code)


default_boxes_generation()

七、总结

| 组件 | 作用 | 关键参数 |
| --- | --- | --- |
| 基础网络 | 特征提取 | VGG16 |
| 额外层 | 多尺度特征 | 卷积层 |
| 检测头 | 分类+回归 | 3×3卷积 |
| 默认框 | 先验框 | 尺度、比例 |
| NMS | 去重 | IoU阈值 |

SSD vs YOLO对比:

| 特性 | SSD | YOLO |
| --- | --- | --- |
| 多尺度检测 | ✅ 是 | YOLOv3后支持 |
| 默认框 | 密集采样 | Anchor Box |
| 速度 | 快 | 更快 |
| 小物体检测 | 较好 | 一般 |

核心要点:

  • 多尺度特征图是关键创新
  • 默认框密集覆盖不同尺寸物体
  • 端到端训练,单阶段检测
  • 速度与精度的良好平衡
相关推荐
心勤则明1 小时前
基于Spring AI Alibaba的监督者模式实践
人工智能·python·spring
沪漂阿龙在努力1 小时前
从“瞎推车”到“平衡大师”:一文读懂强化学习里的策略梯度法(小白也能懂)
人工智能
蓝桉~MLGT1 小时前
Ai-Agent学习历程—— Harness和Memory介绍和应用 & vibe Coding工具选择
人工智能·学习
刘劲松12 小时前
Feishu-CLI-Web:私有化部署飞书Web智能工作台,自然语言操控飞书全能力
人工智能·飞书
人工智能AI技术2 小时前
多模态基础:文字、图像、语音统一理解原理
人工智能
佳xuan2 小时前
wsl(linux)安装miniconda及虚拟环境
linux·人工智能·conda
脑极体2 小时前
工业Agent的新芽,生长在飞书的旷野上
人工智能·飞书
zhangfeng11332 小时前
LLaMA-Factory 在训练模型时检查点(Checkpoint)文件说明
人工智能·深度学习·llama
烛之武2 小时前
《深度学习基础与概念》笔记(1)
人工智能·笔记·深度学习