开放词汇目标检测:Grounding DINO 与 SAM2 实战
1. 引言
传统目标检测只能识别训练时见过的类别(如 COCO 的 80 类)。开放词汇检测(Open-Vocabulary Detection)允许用自然语言描述任意目标,实现"零样本"检测。Grounding DINO + SAM2 的组合是目前最常用、效果领先的开放词汇检测 + 分割方案之一。
核心能力:
- 输入:图片 + 文本描述(如 "红色的消防栓")
- 输出:精确的边界框 + 像素级分割掩码
- 无需任何微调,零样本泛化
2. Grounding DINO 架构
文本: "一只黑色的猫" → Text Encoder (BERT) → 文本特征 ┐
图像: [猫的图片] → Image Backbone (Swin-T) → 图像特征 ┘
        ↓ (文本特征与图像特征一起送入)
特征融合层
        ↓
对齐解码器
        ↓
检测框 + 置信度
3. 环境搭建
bash
# Install Grounding DINO (editable install from source)
git clone https://github.com/IDEA-Research/GroundingDINO.git
cd GroundingDINO
pip install -e .
cd ..  # back to the workspace root so SAM2 is not cloned inside GroundingDINO
# Install SAM2 (editable install from source)
git clone https://github.com/facebookresearch/sam2.git
cd sam2
pip install -e .
cd ..  # the Python snippets below use paths relative to the workspace root
# Download the model weights into the workspace root
wget https://github.com/IDEA-Research/GroundingDINO/releases/download/v0.1.0-alpha/groundingdino_swint_ogc.pth
wget https://dl.fbaipublicfiles.com/segment_anything_2/092824/sam2.1_hiera_large.pt
4. Grounding DINO 检测
python
from groundingdino.util.inference import load_model, load_image, predict
import torch
# Pick the GPU when one is available. The Grounding DINO helpers default to
# device="cuda", which fails on CPU-only machines.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the Grounding DINO model from its config file and checkpoint
model = load_model(
    "GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py",
    "groundingdino_swint_ogc.pth",
    device=device,
)

# Load the image: `image_source` is the image array, `image` is the
# preprocessed input that is passed to the model
image_source, image = load_image("test.jpg")

# Text-prompted detection; the categories in the caption are separated by " . "
boxes, logits, phrases = predict(
    model=model,
    image=image,
    caption="black cat . red car . person wearing hat",
    box_threshold=0.35,   # confidence threshold for detection boxes
    text_threshold=0.25,  # threshold for matching text to a box
    device=device,
)

print(f"检测到 {len(boxes)} 个目标")
for box, score, phrase in zip(boxes, logits, phrases):
    # Each box is in normalized (cx, cy, w, h) format
    print(f" {phrase}: {score:.2f} | {box.tolist()}")
4.1 可视化
python
import cv2
import numpy as np
def draw_boxes(image_path, boxes, phrases, scores, output_path="result.jpg"):
    """Draw detection boxes with their labels on an image and save it.

    Args:
        image_path: Path of the image the detections were computed on.
        boxes: Iterable of boxes in normalized (cx, cy, w, h) format; each
            item must support ``.tolist()``.
        phrases: Text label for each box.
        scores: Confidence score for each box.
        output_path: Path the annotated image is written to.

    Raises:
        FileNotFoundError: If ``image_path`` cannot be read as an image.
        OSError: If the annotated image cannot be written.
    """
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread reports failure by returning None instead of raising.
        raise FileNotFoundError(f"Cannot read image: {image_path}")
    h, w = image.shape[:2]
    colors = [(0, 255, 0), (255, 0, 0), (0, 0, 255), (255, 255, 0), (0, 255, 255)]

    for i, (box, phrase, score) in enumerate(zip(boxes, phrases, scores)):
        # Convert normalized center format to pixel corner format
        cx, cy, bw, bh = box.tolist()
        x1 = int((cx - bw / 2) * w)
        y1 = int((cy - bh / 2) * h)
        x2 = int((cx + bw / 2) * w)
        y2 = int((cy + bh / 2) * h)

        color = colors[i % len(colors)]  # cycle through the palette
        cv2.rectangle(image, (x1, y1), (x2, y2), color, 2)

        # Put the label above the box; when the box touches the top edge the
        # label would fall outside the image, so draw it just inside instead.
        label_y = y1 - 10 if y1 - 10 >= 10 else max(y1, 0) + 20
        cv2.putText(
            image,
            f"{phrase}: {float(score):.2f}",
            (max(x1, 0), label_y),
            cv2.FONT_HERSHEY_SIMPLEX,
            0.6,
            color,
            2,
        )

    if not cv2.imwrite(output_path, image):
        raise OSError(f"Cannot write image: {output_path}")
5. SAM2 精确分割
5.1 使用检测框提示 SAM2
python
from sam2.build_sam import build_sam2
from sam2.sam2_image_predictor import SAM2ImagePredictor
# Load SAM2. The config must match the checkpoint generation: the
# sam2.1_hiera_large.pt checkpoint needs the SAM 2.1 config, not sam2_hiera_l.yaml.
sam2_model = build_sam2("configs/sam2.1/sam2.1_hiera_l.yaml", "sam2.1_hiera_large.pt")
predictor = SAM2ImagePredictor(sam2_model)
def segment_with_boxes(image_path, boxes, phrases):
    """Segment each detected object, using its detection box as the prompt.

    Args:
        image_path: Path of the image the boxes were detected on.
        boxes: Boxes in normalized (cx, cy, w, h) format; each item must
            support ``.tolist()``.
        phrases: Text label for each box.

    Returns:
        A list with one dict per box, holding ``"phrase"``, ``"mask"`` (the
        best-scoring candidate as a boolean array), ``"score"`` (float) and
        ``"box"`` (pixel ``[x1, y1, x2, y2]`` array). Empty when there are
        no boxes.

    Raises:
        FileNotFoundError: If ``image_path`` cannot be read as an image.
    """
    if len(boxes) == 0:
        # Nothing to segment: skip image decoding and predictor setup.
        return []

    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread reports failure by returning None instead of raising.
        raise FileNotFoundError(f"Cannot read image: {image_path}")
    image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    predictor.set_image(image_rgb)

    # The image size is the same for every box, so read it once.
    h, w = image.shape[:2]

    results = []
    for box, phrase in zip(boxes, phrases):
        # Normalized center format -> pixel corner format
        cx, cy, bw, bh = box.tolist()
        input_box = np.array([
            (cx - bw / 2) * w, (cy - bh / 2) * h,
            (cx + bw / 2) * w, (cy + bh / 2) * h,
        ])

        # Ask for several candidate masks and keep the highest-scoring one
        masks, scores, _ = predictor.predict(
            box=input_box,
            multimask_output=True,
        )
        best_idx = int(scores.argmax())

        results.append({
            "phrase": phrase,
            # Store as bool so the mask can be used directly for boolean
            # indexing; the predictor may return a 0/1 float array.
            "mask": np.asarray(masks[best_idx]).astype(bool),
            "score": float(scores[best_idx]),
            "box": input_box,
        })
    return results
# Segment every detection from the previous section, prompting with its box
results = segment_with_boxes(image_path="test.jpg", boxes=boxes, phrases=phrases)
5.2 掩码可视化与保存
python
def visualize_masks(image_path, results, output_path="segmented.jpg"):
    """Overlay segmentation masks, contours and labels on an image and save it.

    Args:
        image_path: Path of the image the masks were computed on.
        results: Iterable of dicts with keys ``"mask"`` (array with the
            image's height and width), ``"box"`` (pixel box whose first two
            values are x1, y1), ``"phrase"`` and ``"score"``.
        output_path: Path the visualization is written to.

    Raises:
        FileNotFoundError: If ``image_path`` cannot be read as an image.
    """
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread reports failure by returning None instead of raising.
        raise FileNotFoundError(f"Cannot read image: {image_path}")

    for result in results:
        # NumPy only accepts boolean or integer arrays as indices, and the
        # mask may arrive as a 0/1 float array, so convert it explicitly.
        mask = np.asarray(result["mask"]).astype(bool)
        color = np.random.randint(0, 255, 3).tolist()

        # Blend a colored copy of the mask onto the image
        colored_mask = np.zeros_like(image)
        colored_mask[mask] = color
        image = cv2.addWeighted(image, 1, colored_mask, 0.4, 0)

        # Outline the mask
        contours, _ = cv2.findContours(
            mask.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
        )
        cv2.drawContours(image, contours, -1, color, 2)

        # Label above the box corner, kept inside the image near the top edge
        x = max(int(result["box"][0]), 0)
        y = max(int(result["box"][1]) - 10, 15)
        cv2.putText(
            image,
            f"{result['phrase']}: {result['score']:.2f}",
            (x, y),
            cv2.FONT_HERSHEY_SIMPLEX,
            0.6,
            color,
            2,
        )

    cv2.imwrite(output_path, image)
6. Grounded SAM2 完整 Pipeline
python
class GroundedSAM2:
    """Grounding DINO + SAM2 pipeline: text-prompted detection, then
    box-prompted segmentation of every detection."""

    def __init__(
        self,
        det_config="GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py",
        det_checkpoint="groundingdino_swint_ogc.pth",
        sam2_config="configs/sam2.1/sam2.1_hiera_l.yaml",
        sam2_checkpoint="sam2.1_hiera_large.pt",
        device=None,
    ):
        """Load both models.

        Args:
            det_config: Path of the Grounding DINO config file.
            det_checkpoint: Path of the Grounding DINO checkpoint.
            sam2_config: SAM2 config name; it must match the checkpoint
                generation (a SAM 2.1 checkpoint needs a SAM 2.1 config).
            sam2_checkpoint: Path of the SAM2 checkpoint.
            device: Device string such as ``"cuda"`` or ``"cpu"``. When
                ``None``, CUDA is used if available, otherwise the CPU.
        """
        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"
        self.device = device

        # Grounding DINO detector
        self.det_model = load_model(det_config, det_checkpoint, device=device)

        # SAM2 segmenter
        sam2_model = build_sam2(sam2_config, sam2_checkpoint, device=device)
        self.sam_predictor = SAM2ImagePredictor(sam2_model)

    def detect_and_segment(self, image_path, text_prompt,
                           box_thresh=0.35, text_thresh=0.25):
        """Detect the objects described by ``text_prompt`` and segment each one.

        Args:
            image_path: Path of the input image.
            text_prompt: Categories to look for, separated by " . ".
            box_thresh: Confidence threshold for detection boxes.
            text_thresh: Threshold for matching text to a box.

        Returns:
            A list with one dict per detection, holding ``"phrase"``,
            ``"detection_score"``, ``"segmentation_score"``, ``"mask"``
            (boolean array) and ``"box"`` (pixel ``[x1, y1, x2, y2]``
            array). Empty when nothing is detected.
        """
        image_source, image = load_image(image_path)

        boxes, logits, phrases = predict(
            model=self.det_model,
            image=image,
            caption=text_prompt,
            box_threshold=box_thresh,
            text_threshold=text_thresh,
            device=self.device,
        )
        if len(boxes) == 0:
            return []

        # Reuse the array the detector saw instead of decoding the file a
        # second time with another library, so that boxes and masks refer to
        # exactly the same pixels.
        # NOTE(review): assumes load_image returns an RGB HxWx3 array as its
        # first value - confirm against the installed GroundingDINO version.
        self.sam_predictor.set_image(image_source)
        h, w = image_source.shape[:2]

        results = []
        for box, score, phrase in zip(boxes, logits, phrases):
            # Normalized center format -> pixel corner format
            cx, cy, bw, bh = box.tolist()
            input_box = np.array([
                (cx - bw / 2) * w, (cy - bh / 2) * h,
                (cx + bw / 2) * w, (cy + bh / 2) * h,
            ])

            # Ask for several candidate masks and keep the highest-scoring one
            masks, mask_scores, _ = self.sam_predictor.predict(
                box=input_box, multimask_output=True
            )
            best_idx = int(mask_scores.argmax())

            results.append({
                "phrase": phrase,
                "detection_score": float(score),
                "segmentation_score": float(mask_scores[best_idx]),
                # Store as bool so the mask can be used directly for boolean
                # indexing; the predictor may return a 0/1 float array.
                "mask": np.asarray(masks[best_idx]).astype(bool),
                "box": input_box,
            })
        return results
# Usage: build the pipeline once, then run detection + segmentation on an image
pipeline = GroundedSAM2()
results = pipeline.detect_and_segment(
    image_path="street.jpg",
    text_prompt="traffic light . car . pedestrian . road sign",
)
7. 应用场景
7.1 自动驾驶场景理解
python
# Categories of interest in a dash-cam frame, separated by " . "
driving_prompt = "traffic light . stop sign . car . truck . pedestrian . bicycle . lane marking"
results = pipeline.detect_and_segment(
    image_path="dashcam.jpg",
    text_prompt=driving_prompt,
)
7.2 工业质检
python
# Surface-defect categories to look for on a product photo, separated by " . "
defect_prompt = "scratch . dent . crack . stain . missing part"
results = pipeline.detect_and_segment(
    image_path="product.jpg",
    text_prompt=defect_prompt,
)
7.3 医学影像
python
# Finding categories to look for on a medical image, separated by " . "
# NOTE(review): zero-shot accuracy on medical images is not shown here -
# validate on labeled data before relying on these results.
medical_prompt = "tumor . lesion . nodule . cyst . calcification"
results = pipeline.detect_and_segment(
    image_path="xray.jpg",
    text_prompt=medical_prompt,
)
8. 性能对比
| 方法 | mAP (COCO) | 零样本能力 | 速度 |
|---|---|---|---|
| YOLOv8 | 53.9 | 无 | 8ms |
| DINO | 63.3 | 无 | 45ms |
| Grounding DINO | 52.5 (zero-shot,Swin-L;本文所用 Swin-T 权重约为 48.4) | 强 | 85ms |
| Grounded SAM2 | - | 强(含分割) | 120ms |
9. 总结
Grounding DINO + SAM2 是当前最常用、效果领先的开放词汇检测 + 分割方案之一:
- 开放词汇:用自然语言描述目标,无需训练
- 精确分割:SAM2 提供像素级掩码
- 零样本泛化:可直接迁移到多种领域;在医学影像、工业缺陷等专业领域,零样本效果通常有限,建议先在自有数据上验证,必要时再微调
- 组合使用:先检测定位,再分割精细掩码