目标跟踪实战:SORT、DeepSORT 与 ByteTrack 原理实现
1. 引言
目标跟踪(Multi-Object Tracking, MOT)是计算机视觉的核心任务之一。在自动驾驶、视频监控、运动分析等场景中,需要在连续帧中维持每个目标的唯一身份。
核心挑战: 检测器给出每帧的目标框,但不知道"第1帧的车A"和"第2帧的车A"是同一辆车。
技术演进:
SORT (2016) → DeepSORT (2017) → ByteTrack (2021) → BoT-SORT (2023)
(各阶段的主要贡献:SORT:卡尔曼滤波 + 匈牙利匹配 → DeepSORT:+ 外观特征 → ByteTrack:+ 低分检测利用 → BoT-SORT:+ 相机运动补偿)
2. SORT(Simple Online and Realtime Tracking)
2.1 核心流程
帧 t: 检测结果 D_t = {d_1, d_2, ...}
帧 t-1: 跟踪轨迹 T_{t-1} = {t_1, t_2, ...}
1. 预测:用卡尔曼滤波预测每个轨迹在帧 t 的位置
2. 匹配:用匈牙利算法将预测框与检测框匹配
3. 更新:匹配成功的轨迹用检测框更新
4. 创建:未匹配的检测创建新轨迹
5. 删除:长时间未匹配的轨迹删除
2.2 卡尔曼滤波
python
import numpy as np
from filterpy.kalman import KalmanFilter
class KalmanBoxTracker:
    """Track a single bounding box with a constant-velocity Kalman filter.

    The 7-dimensional state is ``[cx, cy, s, r, v_cx, v_cy, v_s]``: box centre,
    area ``s``, aspect ratio ``r = w / h`` and the velocities of the first
    three components (the aspect ratio has no velocity term). Only the first
    four components are observed.
    """

    # Class-wide counter used to hand out a fresh id to every new tracker.
    count = 0

    def __init__(self, bbox):
        """Create a tracker initialised from one detection.

        Args:
            bbox: Box as ``[x1, y1, x2, y2]``; it is converted to the
                ``[cx, cy, s, r]`` measurement form before being stored.
        """
        self.kf = KalmanFilter(dim_x=7, dim_z=4)
        # State transition F: each of cx, cy, s advances by its own velocity.
        self.kf.F = np.array([
            [1, 0, 0, 0, 1, 0, 0],
            [0, 1, 0, 0, 0, 1, 0],
            [0, 0, 1, 0, 0, 0, 1],
            [0, 0, 0, 1, 0, 0, 0],
            [0, 0, 0, 0, 1, 0, 0],
            [0, 0, 0, 0, 0, 1, 0],
            [0, 0, 0, 0, 0, 0, 1],
        ])
        # Observation H: the measurement is the first four state components.
        self.kf.H = np.array([
            [1, 0, 0, 0, 0, 0, 0],
            [0, 1, 0, 0, 0, 0, 0],
            [0, 0, 1, 0, 0, 0, 0],
            [0, 0, 0, 1, 0, 0, 0],
        ])
        # Noise settings, scaled relative to the filter's defaults.
        self.kf.R[2:, 2:] *= 10.    # area / aspect-ratio measurements
        self.kf.P[4:, 4:] *= 1000.  # velocities are unobserved at start
        self.kf.P *= 10.
        self.kf.Q[-1, -1] *= 0.01
        self.kf.Q[4:, 4:] *= 0.01
        # Seed the observed part of the state with the first detection.
        self.kf.x[:4] = self._bbox_to_z(bbox)
        self.time_since_update = 0  # predictions made since the last update
        self.id = KalmanBoxTracker.count
        KalmanBoxTracker.count += 1
        self.history = []
        self.hits = 0        # total number of update() calls
        self.hit_streak = 0  # consecutive updates without a missed frame
        self.age = 0         # total number of predict() calls

    def _bbox_to_z(self, bbox):
        """Convert ``[x1, y1, x2, y2]`` to a ``(4, 1)`` column ``[cx, cy, s, r]``."""
        w = bbox[2] - bbox[0]
        h = bbox[3] - bbox[1]
        cx = bbox[0] + w / 2.
        cy = bbox[1] + h / 2.
        s = w * h
        r = w / float(h)
        return np.array([[cx], [cy], [s], [r]])

    def _z_to_bbox(self, z):
        """Convert ``[cx, cy, s, r]`` (leading state rows) to a ``(1, 4)`` box."""
        w = np.sqrt(z[2] * z[3])
        h = z[2] / w
        return np.array([
            z[0] - w / 2., z[1] - h / 2.,
            z[0] + w / 2., z[1] + h / 2.,
        ]).reshape((1, 4))

    def predict(self):
        """Advance the filter one frame and return the predicted ``(1, 4)`` box."""
        # If the area velocity would push the area to zero or below, freeze
        # it: a non-positive area makes the sqrt in _z_to_bbox produce NaN.
        if (self.kf.x[6] + self.kf.x[2]) <= 0:
            self.kf.x[6] *= 0.0
        self.kf.predict()
        self.age += 1
        # A frame was already missed before this prediction: streak is broken.
        if self.time_since_update > 0:
            self.hit_streak = 0
        self.time_since_update += 1
        return self._z_to_bbox(self.kf.x)

    def update(self, bbox):
        """Correct the filter with a matched detection ``[x1, y1, x2, y2]``."""
        self.time_since_update = 0
        self.hits += 1
        self.hit_streak += 1
        self.kf.update(self._bbox_to_z(bbox))

    def get_state(self):
        """Return the current box estimate as a ``(1, 4)`` array."""
        return self._z_to_bbox(self.kf.x)
2.3 SORT 主流程
python
from scipy.optimize import linear_sum_assignment
class SORT:
    """Minimal SORT tracker: Kalman prediction plus IoU/Hungarian association."""

    def __init__(self, max_age=5, min_hits=3, iou_threshold=0.3):
        """
        Args:
            max_age: A track is dropped once ``time_since_update`` exceeds this.
            min_hits: Number of updates a track needs before it is reported.
            iou_threshold: Minimum IoU for an assignment to count as a match.
        """
        self.max_age = max_age
        self.min_hits = min_hits
        self.iou_threshold = iou_threshold
        self.trackers = []
        self.frame_count = 0

    def update(self, detections):
        """Process the detections of one frame.

        Args:
            detections: ``(N, 5)`` array of ``[x1, y1, x2, y2, score]``.

        Returns:
            ``(M, 5)`` array of ``[x1, y1, x2, y2, track_id]`` for every track
            with at least ``min_hits`` updates; ``(0, 5)`` when there is none.
        """
        self.frame_count += 1

        # Predict existing tracks. A tracker whose prediction is not finite
        # is discarded immediately: NaN would poison the IoU matrix (the
        # assignment solver rejects it) and the tracker cannot recover.
        predicted = []
        alive = []
        for trk in self.trackers:
            box = trk.predict()[0]
            if np.all(np.isfinite(box)):
                predicted.append(box)
                alive.append(trk)
        self.trackers = alive
        predicted = np.array(predicted) if predicted else np.empty((0, 4))

        # Association: maximise total IoU, then reject weak pairs.
        matched = []
        if len(predicted) > 0 and len(detections) > 0:
            iou_matrix = self._iou_batch(detections[:, :4], predicted)
            row_idx, col_idx = linear_sum_assignment(-iou_matrix)
            matched = [
                (int(r), int(c))
                for r, c in zip(row_idx, col_idx)
                if iou_matrix[r, c] >= self.iou_threshold
            ]
        matched_dets = {d for d, _ in matched}
        unmatched_dets = [
            d for d in range(len(detections)) if d not in matched_dets
        ]

        # Update matched tracks with their detections.
        for d, t in matched:
            self.trackers[t].update(detections[d, :4])

        # Every unmatched detection starts a new track.
        for d in unmatched_dets:
            self.trackers.append(KalmanBoxTracker(detections[d, :4]))

        # Drop tracks that have gone unmatched for too long.
        self.trackers = [
            t for t in self.trackers if t.time_since_update <= self.max_age
        ]

        # Report confirmed tracks only.
        results = []
        for trk in self.trackers:
            if trk.hits >= self.min_hits:
                bbox = trk.get_state()[0]
                results.append([*bbox, trk.id])
        return np.array(results) if results else np.empty((0, 5))

    def _iou_batch(self, bb_dets, bb_trks):
        """Return the ``(N, M)`` IoU matrix between detections and tracks.

        Both inputs are ``[x1, y1, x2, y2]`` boxes. A small epsilon in the
        denominator keeps degenerate (zero-area) pairs from dividing by zero.
        """
        dets = np.asarray(bb_dets, dtype=float).reshape(-1, 4)[:, None, :]
        trks = np.asarray(bb_trks, dtype=float).reshape(-1, 4)[None, :, :]
        inter_w = np.clip(
            np.minimum(dets[..., 2], trks[..., 2])
            - np.maximum(dets[..., 0], trks[..., 0]),
            0, None,
        )
        inter_h = np.clip(
            np.minimum(dets[..., 3], trks[..., 3])
            - np.maximum(dets[..., 1], trks[..., 1]),
            0, None,
        )
        inter = inter_w * inter_h
        area_d = (dets[..., 2] - dets[..., 0]) * (dets[..., 3] - dets[..., 1])
        area_t = (trks[..., 2] - trks[..., 0]) * (trks[..., 3] - trks[..., 1])
        return inter / (area_d + area_t - inter + 1e-6)
3. DeepSORT
3.1 改进点
DeepSORT 在 SORT 基础上加入外观特征:
代价矩阵 = α × IoU代价 + (1-α) × 外观代价
外观特征:CNN 提取 128 维特征向量
特征库:每个轨迹维护一个特征队列(最近 100 帧)
匹配:余弦距离
3.2 外观特征提取
python
import torch
import torch.nn as nn
from torchvision.models import resnet50
class FeatureExtractor(nn.Module):
    """Appearance embedding network built on a ResNet-50 trunk."""

    def __init__(self, feature_dim=128):
        """
        Args:
            feature_dim: Size of the embedding produced by the final linear
                projection.
        """
        super().__init__()
        resnet = resnet50(pretrained=True)
        # Keep every child module except the last one (the classifier head).
        trunk_layers = list(resnet.children())
        self.features = nn.Sequential(*trunk_layers[:-1])
        self.fc = nn.Linear(2048, feature_dim)

    def forward(self, images):
        """Embed a batch of object crops.

        Args:
            images: Batch of cropped object images; the article uses
                ``(B, 3, 128, 64)``.

        Returns:
            ``(B, feature_dim)`` tensor whose rows are L2-normalised.
        """
        pooled = self.features(images)
        embedding = self.fc(pooled.flatten(1))
        return nn.functional.normalize(embedding, dim=1)
3.3 级联匹配
python
class DeepSORT:
    """DeepSORT-style tracker: appearance cascade followed by IoU matching.

    NOTE(review): ``Track``, ``_cosine_distance``, ``_iou_match`` and
    ``_get_results`` are used here but are not defined in this snippet; they
    must be supplied elsewhere.
    """

    def __init__(self, max_age=70, nn_budget=100, max_cosine_distance=0.7):
        """
        Args:
            max_age: A track is dropped once ``time_since_update`` exceeds this.
            nn_budget: Stored on the instance; presumably the per-track
                feature-gallery size -- it is not used in this snippet.
            max_cosine_distance: Appearance matches with a cost at or above
                this value are rejected.
        """
        self.max_age = max_age
        self.nn_budget = nn_budget
        self.max_cosine_distance = max_cosine_distance
        self.tracks = []
        self.feature_extractor = FeatureExtractor()

    def update(self, detections, features):
        """Process one frame.

        Args:
            detections: ``(N, 5)`` array of ``[x1, y1, x2, y2, score]``.
            features: ``(N, 128)`` appearance features, row-aligned with
                ``detections``.

        Returns:
            Whatever ``self._get_results()`` returns.
        """
        # 1. Predict every track forward.
        for track in self.tracks:
            track.predict()

        # 2. Appearance cascade (most recently updated tracks go first).
        matched, unmatched_dets, unmatched_trks = self._cascade_match(
            detections, features
        )

        # 3. IoU matching on whatever is left.
        if unmatched_dets and unmatched_trks:
            leftover = unmatched_dets
            iou_matched, iou_unmatched, unmatched_trks = self._iou_match(
                detections[leftover], unmatched_trks
            )
            # _iou_match only sees the detection subset, so the detection
            # indices it returns are positions inside `leftover`; map them
            # back to rows of `detections`.
            # NOTE(review): assumes track indices come back as indices into
            # self.tracks -- confirm against _iou_match.
            matched.extend((leftover[d], t) for d, t in iou_matched)
            unmatched_dets = [leftover[d] for d in iou_unmatched]

        # 4. Update matched tracks, spawn new ones, drop stale ones.
        for d, t in matched:
            self.tracks[t].update(detections[d], features[d])
        for d in unmatched_dets:
            self.tracks.append(Track(detections[d], features[d]))
        self.tracks = [
            t for t in self.tracks if t.time_since_update <= self.max_age
        ]
        return self._get_results()

    def _cascade_match(self, detections, features):
        """Match detections to tracks by appearance, one track age at a time.

        Tracks are visited in increasing ``time_since_update``, so tracks that
        were updated most recently get first pick of the detections.

        Returns:
            ``(matched, unmatched_dets, unmatched_trks)`` where ``matched`` is
            a list of ``(detection_index, track_index)`` pairs and the other
            two are index lists.
        """
        matched = []
        unmatched_dets = list(range(len(detections)))
        for age in range(self.max_age + 1):
            tracks_of_age = [
                i for i, t in enumerate(self.tracks)
                if t.time_since_update == age
            ]
            if not tracks_of_age or not unmatched_dets:
                continue
            # Cost matrix: rows follow unmatched_dets, columns tracks_of_age.
            cost_matrix = self._cosine_distance(
                features[unmatched_dets],
                [self.tracks[t].features for t in tracks_of_age],
            )
            row_idx, col_idx = linear_sum_assignment(cost_matrix)
            # Rows index the list as it was when the cost matrix was built,
            # so collect the consumed positions first and rebuild the list
            # afterwards instead of removing entries mid-loop.
            consumed = set()
            for r, c in zip(row_idx, col_idx):
                if cost_matrix[r, c] < self.max_cosine_distance:
                    matched.append((unmatched_dets[r], tracks_of_age[c]))
                    consumed.add(int(r))
            unmatched_dets = [
                d for pos, d in enumerate(unmatched_dets)
                if pos not in consumed
            ]
        matched_trks = {t for _, t in matched}
        unmatched_trks = [
            i for i, t in enumerate(self.tracks)
            if t.time_since_update > 0 and i not in matched_trks
        ]
        return matched, unmatched_dets, unmatched_trks
4. ByteTrack
4.1 核心创新
ByteTrack 的关键洞察: 低分检测框也有用!
传统方法:
高分检测 (>0.6) → 匹配跟踪
低分检测 (<0.6) → 直接丢弃
ByteTrack:
第一轮:高分检测 ↔ 已有轨迹 匹配
第二轮:低分检测 ↔ 剩余轨迹 匹配
第三轮:未匹配高分检测 → 创建新轨迹
4.2 实现
python
class ByteTrack:
    """ByteTrack-style tracker that associates in two rounds by score band.

    NOTE(review): ``Track``, ``_compute_iou`` and ``_get_results`` are used
    here but are not defined in this snippet.
    """

    def __init__(self, high_thresh=0.6, low_thresh=0.1, max_age=30):
        """
        Args:
            high_thresh: Scores at or above this form the high-score set.
            low_thresh: Scores in ``[low_thresh, high_thresh)`` form the
                low-score set; anything lower is ignored.
            max_age: A track is dropped once ``time_since_update`` exceeds this.
        """
        self.tracks = []
        self.track_id = 0
        self.max_age = max_age
        self.high_thresh = high_thresh
        self.low_thresh = low_thresh

    def update(self, detections):
        """Process one frame.

        Args:
            detections: ``(N, 6)`` array of ``[x1, y1, x2, y2, score, class]``.

        Returns:
            Whatever ``self._get_results()`` returns.
        """
        # Split the detections into the two score bands.
        scores = detections[:, 4]
        confident = scores >= self.high_thresh
        tentative = (scores >= self.low_thresh) & (scores < self.high_thresh)
        high_dets = detections[confident]
        low_dets = detections[tentative]

        # Move every track forward one frame.
        for trk in self.tracks:
            trk.predict()

        # Round one: high-score detections against all tracks.
        first_pairs, leftover_idx, new_candidates = self._match(
            self.tracks, high_dets, thresh=0.3
        )

        # Round two: low-score detections against the tracks still unmatched.
        leftover = [self.tracks[i] for i in leftover_idx]
        second_pairs, _, _ = self._match(leftover, low_dets, thresh=0.5)

        # Apply both rounds of matches; round-two indices are relative to
        # `leftover`, not to self.tracks.
        for trk_pos, det_pos in first_pairs:
            self.tracks[trk_pos].update(high_dets[det_pos])
        for trk_pos, det_pos in second_pairs:
            leftover[trk_pos].update(low_dets[det_pos])

        # Only unmatched high-score detections may start a new track.
        for det_pos in new_candidates:
            self.tracks.append(Track(high_dets[det_pos], self.track_id))
            self.track_id += 1

        # Retire tracks that have been unmatched for too long.
        self.tracks = [
            trk for trk in self.tracks
            if trk.time_since_update <= self.max_age
        ]
        return self._get_results()

    def _match(self, tracks, detections, thresh):
        """Associate tracks and detections by IoU with the Hungarian method.

        Returns:
            ``(matched, unmatched_tracks, unmatched_dets)``; ``matched`` holds
            ``(track_index, detection_index)`` pairs whose IoU is at least
            ``thresh``, and both index spaces are local to the arguments.
        """
        if not tracks or len(detections) == 0:
            return [], list(range(len(tracks))), list(range(len(detections)))

        iou = self._compute_iou(tracks, detections)
        rows, cols = linear_sum_assignment(-iou)
        pairs = [(r, c) for r, c in zip(rows, cols) if iou[r, c] >= thresh]

        used_tracks = {r for r, _ in pairs}
        used_dets = {c for _, c in pairs}
        free_tracks = [i for i in range(len(tracks)) if i not in used_tracks]
        free_dets = [j for j in range(len(detections)) if j not in used_dets]
        return pairs, free_tracks, free_dets
5. 算法对比
| 算法 | 匹配策略 | 外观特征 | 速度 | MOTA |
|---|---|---|---|---|
| SORT | IoU | 无 | 200 FPS | 59.8 |
| DeepSORT | IoU + 外观 | ResNet | 40 FPS | 61.4 |
| ByteTrack | 两轮 IoU | 无 | 150 FPS | 80.3 |
| BoT-SORT | IoU + 外观 + 相机补偿 | ResNet | 35 FPS | 81.2 |
6. 总结
目标跟踪的核心是帧间关联:
- SORT:最简单,卡尔曼滤波 + IoU 匹配
- DeepSORT:加入外观特征,解决遮挡后重识别
- ByteTrack:利用低分检测,大幅减少漏检
- 实践建议:速度优先选 ByteTrack,精度优先选 BoT-SORT