目标跟踪实战：SORT、DeepSORT 与 ByteTrack 原理实现

1. 引言

目标跟踪（Multi-Object Tracking, MOT）是计算机视觉的核心任务之一。在自动驾驶、视频监控、运动分析等场景中，需要在连续帧中维持每个目标的唯一身份。

核心挑战： 检测器给出每帧的目标框，但不知道"第1帧的车A"和"第2帧的车A"是同一辆车。

技术演进：

复制代码

SORT (2016) → DeepSORT (2017) → ByteTrack (2021) → BoT-SORT (2022)
  卡尔曼滤波    +外观特征          低分检测利用         +相机补偿

2. SORT（Simple Online and Realtime Tracking）

2.1 核心流程

复制代码

帧 t: 检测结果 D_t = {d_1, d_2, ...}
帧 t-1: 跟踪轨迹 T_{t-1} = {t_1, t_2, ...}

1. 预测：用卡尔曼滤波预测每个轨迹在帧 t 的位置
2. 匹配：用匈牙利算法将预测框与检测框匹配
3. 更新：匹配成功的轨迹用检测框更新
4. 创建：未匹配的检测创建新轨迹
5. 删除：长时间未匹配的轨迹删除

2.2 卡尔曼滤波

python 复制代码

import numpy as np
from filterpy.kalman import KalmanFilter

class KalmanBoxTracker:
    """Track one bounding box with a constant-velocity Kalman filter.

    The 7-dimensional state is ``[cx, cy, s, r, vx, vy, vs]``: box centre,
    area, aspect ratio (width / height) and the velocities of the first
    three. The aspect ratio has no velocity term in the transition matrix.
    The 4-dimensional measurement is ``[cx, cy, s, r]``.
    """

    # Class-wide counter used to hand out track ids; shared by all instances.
    count = 0

    # Substitute denominator for zero-height boxes in ``_bbox_to_z``.
    _EPS = 1e-6

    def __init__(self, bbox):
        """Create a tracker from its first observation.

        Args:
            bbox: Box as ``[x1, y1, x2, y2]``; it is converted to
                ``[cx, cy, s, r]`` (centre, area, aspect ratio).
        """
        self.kf = KalmanFilter(dim_x=7, dim_z=4)

        # State transition F: cx, cy and s each advance by their velocity
        # once per step; r and the velocities themselves are carried over.
        self.kf.F = np.array([
            [1, 0, 0, 0, 1, 0, 0],
            [0, 1, 0, 0, 0, 1, 0],
            [0, 0, 1, 0, 0, 0, 1],
            [0, 0, 0, 1, 0, 0, 0],
            [0, 0, 0, 0, 1, 0, 0],
            [0, 0, 0, 0, 0, 1, 0],
            [0, 0, 0, 0, 0, 0, 1],
        ])

        # Measurement matrix H: only the first four state entries are observed.
        self.kf.H = np.array([
            [1, 0, 0, 0, 0, 0, 0],
            [0, 1, 0, 0, 0, 0, 0],
            [0, 0, 1, 0, 0, 0, 0],
            [0, 0, 0, 1, 0, 0, 0],
        ])

        # Noise tuning, applied as scalings of the filter's default matrices.
        self.kf.R[2:, 2:] *= 10.     # trust area / ratio measurements less
        self.kf.P[4:, 4:] *= 1000.   # velocities are unobserved at start
        self.kf.P *= 10.
        self.kf.Q[-1, -1] *= 0.01
        self.kf.Q[4:, 4:] *= 0.01

        # Seed the observed part of the state; velocities keep their defaults.
        self.kf.x[:4] = self._bbox_to_z(bbox)

        self.time_since_update = 0   # predict() calls since the last update()
        self.id = KalmanBoxTracker.count
        KalmanBoxTracker.count += 1
        self.history = []
        self.hits = 0                # total number of update() calls
        self.hit_streak = 0          # updates without a missed frame between
        self.age = 0                 # total number of predict() calls

    def _bbox_to_z(self, bbox):
        """Convert ``[x1, y1, x2, y2]`` to a ``(4, 1)`` column ``[cx, cy, s, r]``.

        A zero-height box would make the aspect ratio undefined; a tiny
        denominator is substituted so the result stays finite instead of
        raising ``ZeroDivisionError`` (or yielding ``inf`` for NumPy scalars).
        """
        w = bbox[2] - bbox[0]
        h = bbox[3] - bbox[1]
        cx = bbox[0] + w / 2.
        cy = bbox[1] + h / 2.
        s = w * h
        r = w / float(h if h != 0 else self._EPS)
        return np.array([[cx], [cy], [s], [r]])

    def _z_to_bbox(self, z):
        """Convert ``[cx, cy, s, r]`` to a ``(1, 4)`` array ``[x1, y1, x2, y2]``."""
        # s = w * h and r = w / h, hence w = sqrt(s * r) and h = s / w.
        w = np.sqrt(z[2] * z[3])
        h = z[2] / w
        return np.array([
            z[0] - w/2., z[1] - h/2.,
            z[0] + w/2., z[1] + h/2.
        ]).reshape((1, 4))

    def predict(self):
        """Advance the filter one frame and return the predicted box.

        Returns:
            ``(1, 4)`` array ``[x1, y1, x2, y2]``.
        """
        # Same guard as the reference SORT implementation: if this step
        # would drive the area to zero or below, freeze the area velocity.
        # A negative area makes the sqrt in _z_to_bbox return NaN.
        if (self.kf.x[6] + self.kf.x[2]) <= 0:
            self.kf.x[6] *= 0.0
        self.kf.predict()
        self.age += 1
        # A previous predict() without an update() in between means a
        # frame was missed, which ends the current streak.
        if self.time_since_update > 0:
            self.hit_streak = 0
        self.time_since_update += 1
        return self._z_to_bbox(self.kf.x)

    def update(self, bbox):
        """Correct the filter with a matched detection ``[x1, y1, x2, y2]``."""
        self.time_since_update = 0
        self.hits += 1
        self.hit_streak += 1
        self.kf.update(self._bbox_to_z(bbox))

    def get_state(self):
        """Return the current box estimate as a ``(1, 4)`` array."""
        return self._z_to_bbox(self.kf.x)

2.3 SORT 主流程

python 复制代码

from scipy.optimize import linear_sum_assignment

class SORT:
    """Minimal SORT tracker: Kalman prediction plus IoU / Hungarian matching."""

    def __init__(self, max_age=5, min_hits=3, iou_threshold=0.3):
        """
        Args:
            max_age: A track is dropped once ``time_since_update`` exceeds this.
            min_hits: Number of matched detections a track needs before it
                is reported.
            iou_threshold: Minimum IoU for an assignment to count as a match.
        """
        self.max_age = max_age
        self.min_hits = min_hits
        self.iou_threshold = iou_threshold
        self.trackers = []
        self.frame_count = 0

    def update(self, detections):
        """Process the detections of one frame.

        Args:
            detections: ``(N, 5)`` array-like of ``[x1, y1, x2, y2, score]``.
                An empty list or array means "no detections in this frame".

        Returns:
            ``(M, 5)`` array of ``[x1, y1, x2, y2, track_id]``, one row per
            track with at least ``min_hits`` matched detections.
        """
        self.frame_count += 1

        # Accept plain lists as well as arrays; normalise the empty case so
        # the 2-D indexing below is always valid.
        detections = np.asarray(detections)
        if detections.size == 0:
            detections = np.empty((0, 5))

        # Predict every existing track into the current frame.
        predicted = [trk.predict()[0] for trk in self.trackers]
        predicted = np.array(predicted) if predicted else np.empty((0, 4))

        # Association: maximise total IoU with the Hungarian algorithm,
        # then reject pairs that overlap too little. Tracks left without a
        # detection are simply not updated, so they are not tracked
        # separately here.
        matched = []
        unmatched_dets = set(range(len(detections)))
        if len(predicted) > 0 and len(detections) > 0:
            iou_matrix = self._iou_batch(detections[:, :4], predicted)
            row_idx, col_idx = linear_sum_assignment(-iou_matrix)
            for r, c in zip(row_idx, col_idx):
                if iou_matrix[r, c] >= self.iou_threshold:
                    matched.append((r, c))
                    unmatched_dets.discard(r)

        # Update matched tracks with their detection.
        for d, t in matched:
            self.trackers[t].update(detections[d, :4])

        # Start a new track for every detection that matched nothing;
        # ascending order keeps id assignment deterministic.
        for d in sorted(unmatched_dets):
            self.trackers.append(KalmanBoxTracker(detections[d, :4]))

        # Drop tracks that have gone unmatched for too long.
        self.trackers = [t for t in self.trackers
                         if t.time_since_update <= self.max_age]

        # Report confirmed tracks.
        results = [[*trk.get_state()[0], trk.id]
                   for trk in self.trackers
                   if trk.hits >= self.min_hits]

        return np.array(results) if results else np.empty((0, 5))

    def _iou_batch(self, bb_dets, bb_trks):
        """Return the ``(len(bb_dets), len(bb_trks))`` IoU matrix.

        Boxes are ``[x1, y1, x2, y2]``; only the first four columns are
        read. Entries that come out non-finite (for example when a
        predicted box contains NaN) are reported as 0 so the matrix is
        always a valid input for ``linear_sum_assignment``.
        """
        iou = np.zeros((len(bb_dets), len(bb_trks)))
        if iou.size == 0:
            return iou

        # Broadcast detections along rows and tracks along columns.
        dets = np.asarray(bb_dets, dtype=float)[:, None, :4]
        trks = np.asarray(bb_trks, dtype=float)[None, :, :4]

        inter_w = np.maximum(
            0.0, np.minimum(dets[..., 2], trks[..., 2])
            - np.maximum(dets[..., 0], trks[..., 0]))
        inter_h = np.maximum(
            0.0, np.minimum(dets[..., 3], trks[..., 3])
            - np.maximum(dets[..., 1], trks[..., 1]))
        inter = inter_w * inter_h

        area_d = (dets[..., 2] - dets[..., 0]) * (dets[..., 3] - dets[..., 1])
        area_t = (trks[..., 2] - trks[..., 0]) * (trks[..., 3] - trks[..., 1])

        # The small constant keeps the division defined for empty boxes.
        iou = inter / (area_d + area_t - inter + 1e-6)
        return np.where(np.isfinite(iou), iou, 0.0)

3. DeepSORT

3.1 改进点

DeepSORT 在 SORT 基础上加入外观特征：

复制代码

代价矩阵 = λ × 运动代价（马氏距离） + (1-λ) × 外观代价（余弦距离）

外观特征：CNN 提取 128 维特征向量
特征库：每个轨迹维护一个特征队列（最近 100 帧）
匹配：余弦距离

3.2 外观特征提取

python 复制代码

import torch
import torch.nn as nn
from torchvision.models import resnet50

class FeatureExtractor(nn.Module):
    """Appearance embedding network: ResNet-50 trunk plus a linear projection."""

    def __init__(self, feature_dim=128):
        """
        Args:
            feature_dim: Size of the output embedding.
        """
        super().__init__()
        resnet = resnet50(pretrained=True)
        # Keep every child module except the last one (the classifier head).
        trunk_layers = list(resnet.children())
        trunk_layers.pop()
        self.features = nn.Sequential(*trunk_layers)
        self.fc = nn.Linear(2048, feature_dim)

    def forward(self, images):
        """Embed a batch of object crops.

        Args:
            images: ``(B, 3, 128, 64)`` batch of cropped object images.

        Returns:
            ``(B, feature_dim)`` embeddings, L2-normalised along dim 1.
        """
        pooled = torch.flatten(self.features(images), start_dim=1)
        embedding = self.fc(pooled)
        return nn.functional.normalize(embedding, dim=1)

3.3 级联匹配

python 复制代码

class DeepSORT:
    """DeepSORT-style tracker: appearance cascade first, IoU matching second."""

    def __init__(self, max_age=70, nn_budget=100, max_cosine_distance=0.7):
        """
        Args:
            max_age: A track is dropped once ``time_since_update`` exceeds
                this; it also bounds the number of cascade levels.
            nn_budget: Stored on the instance; not read by the methods
                shown in this class.
            max_cosine_distance: An appearance match is accepted only when
                its cost is strictly below this value.
        """
        self.max_age = max_age
        self.nn_budget = nn_budget
        self.max_cosine_distance = max_cosine_distance
        self.tracks = []
        # NOTE(review): update() receives precomputed features, so this
        # extractor is not called by the methods shown here.
        self.feature_extractor = FeatureExtractor()

    def update(self, detections, features):
        """Process one frame.

        Args:
            detections: ``(N, 5)`` array of ``[x1, y1, x2, y2, score]``.
            features: ``(N, 128)`` appearance features, one row per detection.

        Returns:
            Whatever ``self._get_results()`` returns.
        """
        # 1. Predict every track into the current frame.
        for track in self.tracks:
            track.predict()

        # 2. Matching cascade on appearance. Levels are visited in order of
        #    increasing time_since_update, so the most recently updated
        #    tracks get first pick of the detections.
        matched, unmatched_dets, unmatched_trks = self._cascade_match(
            detections, features
        )

        # 3. IoU matching for whatever is left.
        # NOTE(review): _iou_match receives a detection subset; confirm it
        # returns indices in the original detection index space.
        if len(unmatched_dets) > 0 and len(unmatched_trks) > 0:
            iou_matched, unmatched_dets, unmatched_trks = self._iou_match(
                detections[unmatched_dets], unmatched_trks
            )
            matched.extend(iou_matched)

        # 4. Update matched tracks, start new ones, drop stale ones.
        for d, t in matched:
            self.tracks[t].update(detections[d], features[d])

        for d in unmatched_dets:
            self.tracks.append(Track(detections[d], features[d]))

        self.tracks = [t for t in self.tracks if t.time_since_update <= self.max_age]

        return self._get_results()

    def _cascade_match(self, detections, features):
        """Match detections to tracks by appearance, one age level at a time.

        Args:
            detections: Detections of the current frame; only the count is
                used here.
            features: Appearance features indexable by a list of detection
                indices.

        Returns:
            ``(matched, unmatched_dets, unmatched_trks)`` where ``matched``
            is a list of ``(detection_index, track_index)`` pairs and the
            other two are index lists.
        """
        matched = []
        unmatched_dets = list(range(len(detections)))

        for age in range(self.max_age + 1):
            if not unmatched_dets:
                break  # nothing left to assign at any later level

            tracks_of_age = [i for i, t in enumerate(self.tracks)
                             if t.time_since_update == age]
            if not tracks_of_age:
                continue

            # Rows: still-unmatched detections; columns: tracks at this level.
            cost_matrix = self._cosine_distance(
                features[unmatched_dets],
                [self.tracks[t].features for t in tracks_of_age]
            )
            row_idx, col_idx = linear_sum_assignment(cost_matrix)

            # Row indices refer to the detection list the cost matrix was
            # built from, so resolve them against a snapshot and only
            # shrink the list after all rows have been processed.
            candidates = list(unmatched_dets)
            claimed = set()
            for r, c in zip(row_idx, col_idx):
                if cost_matrix[r, c] < self.max_cosine_distance:
                    matched.append((candidates[r], tracks_of_age[c]))
                    claimed.add(candidates[r])
            unmatched_dets = [d for d in candidates if d not in claimed]

        matched_trks = {t for _, t in matched}
        unmatched_trks = [i for i, trk in enumerate(self.tracks)
                          if trk.time_since_update > 0
                          and i not in matched_trks]

        return matched, unmatched_dets, unmatched_trks

4. ByteTrack

4.1 核心创新

ByteTrack 的关键洞察： 低分检测框也有用！

复制代码

传统方法：
  高分检测 (>0.6) → 匹配跟踪
  低分检测 (<0.6) → 直接丢弃

ByteTrack：
  第一轮：高分检测 ↔ 已有轨迹 匹配
  第二轮：低分检测 ↔ 剩余轨迹 匹配
  第三轮：未匹配高分检测 → 创建新轨迹

4.2 实现

python 复制代码

class ByteTrack:
    """ByteTrack-style tracker with two association rounds by detection score."""

    def __init__(self, high_thresh=0.6, low_thresh=0.1, max_age=30):
        """
        Args:
            high_thresh: Scores at or above this value count as high-score.
            low_thresh: Scores in ``[low_thresh, high_thresh)`` count as
                low-score; anything below is ignored.
            max_age: A track is dropped once ``time_since_update`` exceeds this.
        """
        self.high_thresh = high_thresh
        self.low_thresh = low_thresh
        self.max_age = max_age
        self.tracks = []
        self.track_id = 0

    def update(self, detections):
        """Process one frame.

        Args:
            detections: ``(N, 6)`` array of ``[x1, y1, x2, y2, score, class]``.

        Returns:
            Whatever ``self._get_results()`` returns.
        """
        # Split the detections by score (column 4).
        high_dets = detections[detections[:, 4] >= self.high_thresh]
        in_low_band = (
            (detections[:, 4] >= self.low_thresh) &
            (detections[:, 4] < self.high_thresh)
        )
        low_dets = detections[in_low_band]

        # Move every track forward to the current frame.
        for trk in self.tracks:
            trk.predict()

        # Round 1: high-score detections against all tracks.
        first_pairs, leftover_idx, fresh_high_idx = self._match(
            self.tracks, high_dets, thresh=0.3
        )

        # Round 2: low-score detections against the tracks round 1 left over.
        # Indices returned here are positions within `leftover`.
        leftover = [self.tracks[i] for i in leftover_idx]
        second_pairs, _, _ = self._match(leftover, low_dets, thresh=0.5)

        # Apply both rounds of matches.
        for trk_pos, det_pos in first_pairs:
            self.tracks[trk_pos].update(high_dets[det_pos])
        for trk_pos, det_pos in second_pairs:
            leftover[trk_pos].update(low_dets[det_pos])

        # Only unmatched high-score detections start new tracks.
        for det_pos in fresh_high_idx:
            self.tracks.append(Track(high_dets[det_pos], self.track_id))
            self.track_id += 1

        # Drop tracks that have gone unmatched for too long.
        survivors = []
        for trk in self.tracks:
            if trk.time_since_update <= self.max_age:
                survivors.append(trk)
        self.tracks = survivors

        return self._get_results()

    def _match(self, tracks, detections, thresh):
        """Associate tracks and detections by IoU with the Hungarian algorithm.

        Args:
            tracks: List of tracks (rows of the IoU matrix).
            detections: Detections (columns of the IoU matrix).
            thresh: Minimum IoU for an assigned pair to be accepted.

        Returns:
            ``(matched, unmatched_tracks, unmatched_dets)`` where ``matched``
            holds ``(track_index, detection_index)`` pairs and the other two
            are ascending index lists.
        """
        n_tracks, n_dets = len(tracks), len(detections)
        if not tracks or n_dets == 0:
            return [], list(range(n_tracks)), list(range(n_dets))

        # Negate so that minimising cost maximises total IoU.
        iou_matrix = self._compute_iou(tracks, detections)
        rows, cols = linear_sum_assignment(-iou_matrix)

        pairs = [(r, c) for r, c in zip(rows, cols)
                 if iou_matrix[r, c] >= thresh]
        taken_tracks = {r for r, _ in pairs}
        taken_dets = {c for _, c in pairs}

        free_tracks = [i for i in range(n_tracks) if i not in taken_tracks]
        free_dets = [j for j in range(n_dets) if j not in taken_dets]

        return pairs, free_tracks, free_dets

5. 算法对比

算法	匹配策略	外观特征	速度	MOTA
SORT	IoU	无	200 FPS	59.8
DeepSORT	IoU + 外观	ResNet	40 FPS	61.4
ByteTrack	两轮 IoU	无	150 FPS	80.3
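BoT-SORT	IoU + 外观 + 相机补偿	ResNet	35 FPS	80.5

注：SORT、DeepSORT 的 MOTA 为 MOT16 测试集结果，ByteTrack、BoT-SORT 为 MOT17 测试集结果，且各方法使用的检测器不同，数值不宜直接横向比较；FPS 受硬件与检测器影响，仅供参考。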

6. 总结

目标跟踪的核心是帧间关联：

SORT：最简单，卡尔曼滤波 + IoU 匹配
DeepSORT：加入外观特征，解决遮挡后重识别
ByteTrack：利用低分检测，大幅减少漏检
实践建议：速度优先选 ByteTrack，精度优先选 BoT-SORT