开放词汇目标检测：Grounding DINO 与 SAM2 实战

1. 引言

传统目标检测只能识别训练时见过的类别（如 COCO 的 80 类）。开放词汇检测（Open-Vocabulary Detection）允许用自然语言描述任意目标，实现“零样本”检测。Grounding DINO + SAM2 的组合是目前最强的开放词汇检测 + 分割方案之一。

核心能力：

输入：图片 + 文本描述（如 "红色的消防栓"）
输出：精确的边界框 + 像素级分割掩码
无需任何微调，零样本泛化

2. Grounding DINO 架构

架构示意：

文本: "一只黑色的猫"  →  Text Encoder (BERT)  →  文本特征
                                                    ↓
图像: [猫的图片]      →  Image Backbone (Swin-T) →  图像特征
                                                    ↓
                                              特征融合层
                                                    ↓
                                              对齐解码器
                                                    ↓
                                         检测框 + 置信度

3. 环境搭建

Bash 命令：

# Install Grounding DINO (editable install from source), then return to the
# working directory so the next clone does not end up inside this repo.
git clone https://github.com/IDEA-Research/GroundingDINO.git
cd GroundingDINO
pip install -e .
cd ..

# Install SAM2 next to GroundingDINO, then return to the working directory.
git clone https://github.com/facebookresearch/sam2.git
cd sam2
pip install -e .
cd ..

# Download the weights into the working directory: the Python examples below
# load "groundingdino_swint_ogc.pth" and "sam2.1_hiera_large.pt" by relative path.
wget https://github.com/IDEA-Research/GroundingDINO/releases/download/v0.1.0-alpha/groundingdino_swint_ogc.pth
wget https://dl.fbaipublicfiles.com/segment_anything_2/092824/sam2.1_hiera_large.pt

4. Grounding DINO 检测

Python 代码：

from groundingdino.util.inference import load_model, load_image, predict
import torch

# Build the Grounding DINO detector from its config file and pretrained weights.
config_path = "GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py"
weights_path = "groundingdino_swint_ogc.pth"
model = load_model(config_path, weights_path)

# load_image yields two values; `image` is the one passed to the model below.
image_source, image = load_image("test.jpg")

# Run text-prompted detection. Categories in the caption are separated by " . ".
thresholds = {
    "box_threshold": 0.35,   # minimum confidence for a box to be kept
    "text_threshold": 0.25,  # minimum text-matching score for a phrase
}
boxes, logits, phrases = predict(
    model=model,
    image=image,
    caption="black cat . red car . person wearing hat",
    **thresholds,
)

# Report how many objects were found, then one line per detection.
num_found = len(boxes)
print(f"检测到 {num_found} 个目标")
for phrase, score, box in zip(phrases, logits, boxes):
    coords = box.tolist()
    print(f"  {phrase}: {score:.2f} | {coords}")

4.1 可视化

Python 代码：

import cv2
import numpy as np

def draw_boxes(image_path, boxes, phrases, scores, output_path="result.jpg"):
    """Draw detection boxes with labels on an image and write it to disk.

    Args:
        image_path: Path of the image the detections belong to.
        boxes: Iterable of boxes given as normalized (cx, cy, w, h); each
            element must support ``.tolist()``.
        phrases: Label text for each box, in the same order as ``boxes``.
        scores: Confidence for each box, in the same order as ``boxes``.
        output_path: Destination file for the annotated image.

    Raises:
        FileNotFoundError: If ``image_path`` cannot be read as an image.
        OSError: If the annotated image cannot be written.
    """
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise FileNotFoundError(f"Cannot read image: {image_path}")
    h, w = image.shape[:2]

    # Fixed palette, cycled when there are more boxes than colors.
    colors = [(0, 255, 0), (255, 0, 0), (0, 0, 255), (255, 255, 0), (0, 255, 255)]

    for i, (box, phrase, score) in enumerate(zip(boxes, phrases, scores)):
        # Normalized center/size -> pixel corner coordinates.
        cx, cy, bw, bh = box.tolist()
        x1 = int((cx - bw / 2) * w)
        y1 = int((cy - bh / 2) * h)
        x2 = int((cx + bw / 2) * w)
        y2 = int((cy + bh / 2) * h)

        color = colors[i % len(colors)]
        cv2.rectangle(image, (x1, y1), (x2, y2), color, 2)

        # Keep the label on the canvas when the box touches the top edge.
        label_y = max(y1 - 10, 20)
        cv2.putText(image, f"{phrase}: {score:.2f}",
                    (x1, label_y), cv2.FONT_HERSHEY_SIMPLEX, 0.6, color, 2)

    # cv2.imwrite reports failure through its return value.
    if not cv2.imwrite(output_path, image):
        raise OSError(f"Cannot write image: {output_path}")

5. SAM2 精确分割

5.1 使用检测框提示 SAM2

Python 代码：

from sam2.build_sam import build_sam2
from sam2.sam2_image_predictor import SAM2ImagePredictor

# Load SAM2. The checkpoint is a SAM 2.1 release, so it must be paired with the
# SAM 2.1 model config; the older "sam2_hiera_l.yaml" describes the SAM 2.0
# architecture and does not match these weights.
sam2_model = build_sam2("configs/sam2.1/sam2.1_hiera_l.yaml", "sam2.1_hiera_large.pt")
predictor = SAM2ImagePredictor(sam2_model)

def segment_with_boxes(image_path, boxes, phrases):
    """Segment each detected object with SAM2, using its box as the prompt.

    Uses the module-level ``predictor``.

    Args:
        image_path: Path of the image the boxes belong to.
        boxes: Boxes given as normalized (cx, cy, w, h); each element must
            support ``.tolist()``.
        phrases: Label text for each box, in the same order as ``boxes``.

    Returns:
        A list with one dict per box, holding "phrase", "mask" (the
        best-scoring candidate mask), "score" (that mask's score) and
        "box" (pixel-space [x1, y1, x2, y2] array).

    Raises:
        FileNotFoundError: If ``image_path`` cannot be read as an image.
    """
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise FileNotFoundError(f"Cannot read image: {image_path}")

    if len(boxes) == 0:
        # Nothing to segment; skip computing the image embedding.
        return []

    image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    predictor.set_image(image_rgb)

    # Image size is the same for every box, so read it once.
    h, w = image.shape[:2]

    results = []
    for box, phrase in zip(boxes, phrases):
        # Normalized center/size -> pixel corner coordinates.
        cx, cy, bw, bh = box.tolist()
        input_box = np.array([
            (cx - bw / 2) * w, (cy - bh / 2) * h,
            (cx + bw / 2) * w, (cy + bh / 2) * h,
        ])

        # Ask for several candidate masks and keep the best-scoring one.
        masks, scores, _ = predictor.predict(
            box=input_box,
            multimask_output=True,
        )
        best_idx = scores.argmax()

        results.append({
            "phrase": phrase,
            "mask": masks[best_idx],
            "score": scores[best_idx],
            "box": input_box,
        })

    return results

results = segment_with_boxes("test.jpg", boxes, phrases)

5.2 掩码可视化与保存

Python 代码：

def visualize_masks(image_path, results, output_path="segmented.jpg"):
    """Overlay segmentation masks, contours and labels on an image and save it.

    Args:
        image_path: Path of the image the masks belong to.
        results: List of dicts with "mask", "box" (pixel [x1, y1, x2, y2]),
            "phrase", and either "score" or "segmentation_score".
        output_path: Destination file for the rendered image.

    Raises:
        FileNotFoundError: If ``image_path`` cannot be read as an image.
        OSError: If the rendered image cannot be written.
    """
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise FileNotFoundError(f"Cannot read image: {image_path}")

    for result in results:
        # SAM2 returns masks as float arrays; NumPy only accepts boolean or
        # integer arrays as indices, so convert before using it as a selector.
        mask = np.asarray(result["mask"]).astype(bool)
        color = np.random.randint(0, 255, 3).tolist()

        # Blend a colored copy of the mask onto the image.
        colored_mask = np.zeros_like(image)
        colored_mask[mask] = color
        image = cv2.addWeighted(image, 1, colored_mask, 0.4, 0)

        # Outline the mask.
        contours, _ = cv2.findContours(
            mask.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
        )
        cv2.drawContours(image, contours, -1, color, 2)

        # Results built by GroundedSAM2.detect_and_segment store the mask score
        # under "segmentation_score" rather than "score"; accept both.
        if "score" in result:
            score = result["score"]
        else:
            score = result["segmentation_score"]

        # Keep the label on the canvas when the box touches the top edge.
        x = int(result["box"][0])
        y = max(int(result["box"][1]) - 10, 20)
        cv2.putText(image, f"{result['phrase']}: {score:.2f}",
                    (x, y), cv2.FONT_HERSHEY_SIMPLEX, 0.6, color, 2)

    # cv2.imwrite reports failure through its return value.
    if not cv2.imwrite(output_path, image):
        raise OSError(f"Cannot write image: {output_path}")

6. Grounded SAM2 完整 Pipeline

Python 代码：

class GroundedSAM2:
    """Grounding DINO + SAM2 pipeline: text-prompted detection, then
    box-prompted segmentation of every detection."""

    def __init__(
        self,
        det_config="GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py",
        det_checkpoint="groundingdino_swint_ogc.pth",
        sam_config="configs/sam2.1/sam2.1_hiera_l.yaml",
        sam_checkpoint="sam2.1_hiera_large.pt",
    ):
        """Load the detector and the segmenter.

        Args:
            det_config: Grounding DINO model config file.
            det_checkpoint: Grounding DINO weights file.
            sam_config: SAM2 model config name. It must match the checkpoint
                generation: a SAM 2.1 checkpoint needs a SAM 2.1 config.
            sam_checkpoint: SAM2 weights file.
        """
        self.det_model = load_model(det_config, det_checkpoint)

        sam2_model = build_sam2(sam_config, sam_checkpoint)
        self.sam_predictor = SAM2ImagePredictor(sam2_model)

    def detect_and_segment(self, image_path, text_prompt,
                           box_thresh=0.35, text_thresh=0.25):
        """Detect objects described by ``text_prompt`` and segment each one.

        Args:
            image_path: Path of the image to process.
            text_prompt: Categories to look for, separated by " . ".
            box_thresh: Minimum box confidence passed to the detector.
            text_thresh: Minimum text-matching score passed to the detector.

        Returns:
            A list with one dict per detection, holding "phrase",
            "detection_score", "segmentation_score", "mask" (best-scoring
            candidate mask) and "box" (pixel-space [x1, y1, x2, y2] array).
            Empty when nothing is detected.
        """
        image_source, image = load_image(image_path)

        boxes, logits, phrases = predict(
            model=self.det_model,
            image=image,
            caption=text_prompt,
            box_threshold=box_thresh,
            text_threshold=text_thresh,
        )

        if len(boxes) == 0:
            return []

        # Segment on the array load_image already decoded (an RGB array in
        # GroundingDINO's inference util) instead of re-reading the file with
        # OpenCV: it saves a second decode and guarantees the pixel grid is
        # the one the boxes and (h, w) below refer to.
        self.sam_predictor.set_image(image_source)
        h, w = image_source.shape[:2]

        results = []
        for box, score, phrase in zip(boxes, logits, phrases):
            # Normalized center/size -> pixel corner coordinates.
            cx, cy, bw, bh = box.tolist()
            input_box = np.array([
                (cx - bw / 2) * w, (cy - bh / 2) * h,
                (cx + bw / 2) * w, (cy + bh / 2) * h,
            ])

            # Ask for several candidate masks and keep the best-scoring one.
            masks, mask_scores, _ = self.sam_predictor.predict(
                box=input_box, multimask_output=True
            )
            best_idx = mask_scores.argmax()

            results.append({
                "phrase": phrase,
                "detection_score": float(score),
                "segmentation_score": float(mask_scores[best_idx]),
                "mask": masks[best_idx],
                "box": input_box,
            })

        return results

# Example: build the pipeline once, then query a street scene.
# Categories in the prompt are separated by " . ".
pipeline = GroundedSAM2()
street_prompt = "traffic light . car . pedestrian . road sign"
results = pipeline.detect_and_segment("street.jpg", street_prompt)

7. 应用场景

7.1 自动驾驶场景理解

Python 代码：

# Driving scene: list the road-user and signage categories to look for.
driving_prompt = "traffic light . stop sign . car . truck . pedestrian . bicycle . lane marking"
results = pipeline.detect_and_segment(
    image_path="dashcam.jpg",
    text_prompt=driving_prompt,
)

7.2 工业质检

Python 代码：

# Industrial inspection: list the defect types to look for on the product image.
defect_prompt = "scratch . dent . crack . stain . missing part"
results = pipeline.detect_and_segment(
    image_path="product.jpg",
    text_prompt=defect_prompt,
)

7.3 医学影像

Python 代码：

# Medical imaging: list the finding types to look for on the X-ray image.
medical_prompt = "tumor . lesion . nodule . cyst . calcification"
results = pipeline.detect_and_segment(
    image_path="xray.jpg",
    text_prompt=medical_prompt,
)

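8. 性能对比

说明：表中 Grounding DINO 的 52.5 AP 是 Swin-L 版本在 COCO 上的零样本结果；本文使用的 Swin-T 权重（groundingdino_swint_ogc.pth）零样本约为 48.4 AP。速度取决于硬件与输入分辨率，表中数值仅供量级参考。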

方法	mAP (COCO)	零样本能力	速度
YOLOv8	53.9	无	8ms
DINO	63.3	无	45ms
Grounding DINO	52.5 (zero-shot)	强	85ms
Grounded SAM2	-	强（含分割）	120ms

9. 总结

Grounding DINO + SAM2 是当前最强的开放词汇检测+分割方案：

开放词汇：用自然语言描述目标，无需训练
精确分割：SAM2 提供像素级掩码
零样本泛化：无需训练即可迁移到多种领域；但在医学影像、工业缺陷等与自然图像差异较大的专业领域，零样本效果通常有限，使用前应在目标数据上评估，必要时进行微调
组合使用：先检测定位，再分割精细掩码