Jetson 部署 YOLOv8 实时目标检测：训练到边缘推理

1. 项目架构

复制代码

YOLOv8 Jetson 部署流水线：
├── 数据准备 → Roboflow 标注 + 增强
├── 模型训练 → Ultralytics YOLOv8
├── 模型导出 → ONNX → TensorRT FP16
├── 边缘推理 → Jetson + CSI 摄像头
└── 性能优化 → 多线程 + 批处理

2. 数据准备

2.1 Roboflow 标注

bash 复制代码

# Install the Roboflow Python package
pip3 install roboflow

# Download the dataset in YOLOv8 format (example: safety-helmet detection)
python3 << 'EOF'
import os  # needed for os.listdir below

from roboflow import Roboflow

rf = Roboflow(api_key="YOUR_API_KEY")
project = rf.workspace("your-workspace").project("safety-helmet")
dataset = project.version(1).download("yolov8")

# Count the training images that were actually written to disk.
train_images_dir = os.path.join(dataset.location, "train", "images")
print(f"数据集路径: {dataset.location}")
print(f"训练集: {len(os.listdir(train_images_dir))} 张")
EOF

2.2 数据增强配置

yaml 复制代码

# Data augmentation config (Roboflow can apply augmentation itself; these values can also be set by hand)
# augmentations.yaml
# NOTE(review): no other snippet in this article loads this file — confirm how
# these values are meant to reach the training run.
hsv_h: 0.015  # hue jitter
hsv_s: 0.7    # saturation jitter
hsv_v: 0.4    # brightness (value) jitter
degrees: 5     # rotation
translate: 0.1 # translation
scale: 0.5     # scaling
shear: 2       # shear
perspective: 0.0  # perspective warp (0.0 = disabled)
flipud: 0.0    # vertical flip
fliplr: 0.5    # horizontal flip
mosaic: 1.0    # Mosaic augmentation
mixup: 0.0     # MixUp
copy_paste: 0.0  # copy-paste augmentation (0.0 = disabled)

2.3 数据集结构

复制代码

dataset/
├── train/
│   ├── images/
│   │   ├── 001.jpg
│   │   └── 002.jpg
│   └── labels/
│       ├── 001.txt  # class x_center y_center width height
│       └── 002.txt
├── valid/
│   ├── images/
│   └── labels/
├── test/
│   ├── images/
│   └── labels/
└── data.yaml

yaml 复制代码

# data.yaml - dataset description referenced by the scripts as data="data.yaml"
# Dataset root; the three split entries below are relative to it.
path: ./dataset
train: train/images
val: valid/images
test: test/images

# Class id -> label. The id is the first field of every line in the label
# .txt files (class x_center y_center width height).
names:
  0: helmet
  1: person
  2: no-helmet

# Number of classes (matches the three entries in `names`).
nc: 3

3. 模型训练

3.1 选择模型尺寸

复制代码

YOLOv8 模型尺寸对比（COCO 预训练）：
┌─────────────┬──────────┬───────────┬──────────┬──────────┐
│ 模型         │ 参数量    │ mAP50-95  │ 推理速度  │ Jetson   │
├─────────────┼──────────┼───────────┼──────────┼──────────┤
│ YOLOv8n     │ 3.2M     │ 37.3      │ 最快      │ ✅ 推荐  │
│ YOLOv8s     │ 11.2M    │ 44.9      │ 快        │ ✅ 推荐  │
│ YOLOv8m     │ 25.9M    │ 50.2      │ 中等      │ ⚠️ 可用  │
│ YOLOv8l     │ 43.7M    │ 52.9      │ 慢        │ ❌ 不推荐│
│ YOLOv8x     │ 68.2M    │ 53.9      │ 很慢      │ ❌ 不推荐│
└─────────────┴──────────┴───────────┴──────────┴──────────┘

3.2 开始训练

python 复制代码

#!/usr/bin/env python3
"""train_yolov8.py - fine-tune a pretrained YOLOv8s model on the helmet dataset."""
from ultralytics import YOLO

# Start from the pretrained YOLOv8s checkpoint.
model = YOLO("yolov8s.pt")

# Train. The run directory is <project>/<name>, i.e. runs/detect/helmet_yolov8s.
results = model.train(
    data="data.yaml",
    epochs=100,
    imgsz=640,
    batch=16,
    device="0",  # GPU index
    workers=8,
    patience=20,        # early-stopping patience (epochs without improvement)
    save=True,
    save_period=10,     # also save a checkpoint every 10 epochs
    project="runs/detect",
    name="helmet_yolov8s",
    exist_ok=True,
    pretrained=True,
    # NOTE: with optimizer="auto" Ultralytics chooses the optimizer, lr0 and
    # momentum itself and ignores the lr0/momentum values given below; name a
    # concrete optimizer (e.g. "SGD") if those values must take effect.
    optimizer="auto",
    lr0=0.01,
    lrf=0.01,
    momentum=0.937,
    weight_decay=0.0005,
    warmup_epochs=3,
    warmup_momentum=0.8,
    warmup_bias_lr=0.1,
    box=7.5,
    cls=0.5,
    dfl=1.5,
    plots=True,
    val=True,
)

# model.train() returns the validation metrics object; its box metrics live
# under `.box` (there are no `best_map50` / `best_map50_95` / `best`
# attributes). The path of the best checkpoint is kept by the trainer.
print(f"最佳 mAP50: {results.box.map50:.4f}")
print(f"最佳 mAP50-95: {results.box.map:.4f}")
print(f"最佳模型: {model.trainer.best}")

3.3 模型评估

python 复制代码

#!/usr/bin/env python3
"""evaluate.py - evaluate the best checkpoint on the test split."""
from ultralytics import YOLO

# Load the best checkpoint produced by training.
model = YOLO("runs/detect/helmet_yolov8s/weights/best.pt")

# Evaluate on the test split.
# NOTE(review): conf=0.25 scores the model at a deployment-style threshold;
# benchmark mAP figures are normally computed with a much lower confidence
# threshold, so the numbers printed here are not directly comparable to them.
metrics = model.val(
    data="data.yaml",
    split="test",
    imgsz=640,
    batch=1,
    conf=0.25,
    iou=0.6,
    device="0"
)

print(f"mAP50: {metrics.box.map50:.4f}")
print(f"mAP50-95: {metrics.box.map:.4f}")
print(f"Precision: {metrics.box.mp:.4f}")
print(f"Recall: {metrics.box.mr:.4f}")

# Per-class AP. `ap50` holds one entry per class that actually occurs in the
# evaluated split, ordered like `ap_class_index`; indexing it by raw class id
# would mislabel (or overrun) the array when a class is absent.
for row, class_id in enumerate(metrics.box.ap_class_index):
    name = metrics.names[int(class_id)]
    print(f"  {name}: AP50={metrics.box.ap50[row]:.4f}")

4. 模型导出

4.1 导出到 ONNX

python 复制代码

#!/usr/bin/env python3
"""export.py - export the trained checkpoint to ONNX."""
import shutil

from ultralytics import YOLO

model = YOLO("runs/detect/helmet_yolov8s/weights/best.pt")

# Export to ONNX. export() returns the path of the file it wrote, which sits
# next to the weights (runs/detect/helmet_yolov8s/weights/best.onnx), not in
# the current directory.
onnx_path = model.export(
    format="onnx",
    imgsz=640,
    opset=17,
    simplify=True,      # simplify the exported graph
    dynamic=False,       # a static batch size is recommended on Jetson
    batch=1,
)

# Copy it to the file name used by the TensorRT conversion step.
shutil.copy(onnx_path, "helmet_yolov8s.onnx")
print(f"ONNX: {onnx_path} -> helmet_yolov8s.onnx")

4.2 转换为 TensorRT

bash 复制代码

# Option 1: export directly with Ultralytics (recommended).
# The engine is written next to the weights as
# runs/detect/helmet_yolov8s/weights/best.engine. Ultralytics prepends a JSON
# metadata header to that file, so load it through YOLO('...best.engine') or
# with a loader that skips the header before passing the bytes to TensorRT.
python3 -c "
from ultralytics import YOLO
model = YOLO('runs/detect/helmet_yolov8s/weights/best.pt')
model.export(format='engine', imgsz=640, half=True, batch=1, workspace=4)
"

# Option 2: convert manually with trtexec.
# The ONNX export lands next to the weights as best.onnx.
# --memPoolSize=workspace:<MiB> replaces the deprecated --workspace flag.
/usr/src/tensorrt/bin/trtexec \
    --onnx=runs/detect/helmet_yolov8s/weights/best.onnx \
    --saveEngine=helmet_yolov8s.engine \
    --fp16 \
    --memPoolSize=workspace:4096 \
    --inputIOFormats=fp16:chw \
    --outputIOFormats=fp16:chw

# Benchmark / sanity-check the engine.
# No --batch flag: it only applies to implicit-batch engines, while an engine
# built from ONNX has the batch size baked into its input shape.
/usr/src/tensorrt/bin/trtexec \
    --loadEngine=helmet_yolov8s.engine \
    --iterations=200 \
    --warmUp=1000

5. Jetson 实时推理代码

5.1 完整推理类

python 复制代码

#!/usr/bin/env python3
"""jetson_yolo.py - Jetson YOLOv8 实时推理"""
import cv2
import numpy as np
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit
import time
from typing import List, Tuple

class YOLOv8TRT:
    """YOLOv8 detector backed by a serialized TensorRT engine.

    The instance owns the engine, one execution context, a CUDA stream and a
    page-locked host buffer plus a device buffer per I/O tensor. ``detect``
    runs letterbox preprocessing, inference and NMS for one BGR frame.
    """

    def __init__(self, engine_path: str, conf_thresh: float = 0.25, iou_thresh: float = 0.45,
                 class_names: dict = None):
        """Load the engine and allocate all I/O buffers.

        Args:
            engine_path: Path to the serialized engine. Both plain engines
                (e.g. from trtexec) and engines exported by Ultralytics, which
                start with a JSON metadata header, are accepted.
            conf_thresh: Minimum class score for a prediction to be kept.
            iou_thresh: IoU above which a lower-scored box is suppressed.
            class_names: Optional ``{class_id: label}`` map. When omitted, the
                names embedded in the engine metadata are used if present,
                otherwise the helmet / person / no-helmet defaults.

        Raises:
            RuntimeError: If TensorRT cannot deserialize the engine.
        """
        self.conf_thresh = conf_thresh
        self.iou_thresh = iou_thresh

        # Load the engine, dropping the optional metadata header first.
        with open(engine_path, "rb") as f:
            engine_bytes, metadata = self._split_ultralytics_metadata(f.read())

        logger = trt.Logger(trt.Logger.WARNING)
        runtime = trt.Runtime(logger)
        self.engine = runtime.deserialize_cuda_engine(engine_bytes)
        if self.engine is None:
            raise RuntimeError(f"Failed to deserialize TensorRT engine: {engine_path}")
        self.context = self.engine.create_execution_context()

        # Allocate one host/device buffer pair per I/O tensor.
        self.inputs = []
        self.outputs = []
        self.bindings = []
        self.stream = cuda.Stream()

        for i in range(self.engine.num_io_tensors):
            name = self.engine.get_tensor_name(i)
            dtype = trt.nptype(self.engine.get_tensor_dtype(name))
            # Dynamic (-1) or zero-sized dimensions are allocated as 1.
            shape = tuple(s if s > 0 else 1 for s in self.engine.get_tensor_shape(name))
            size = trt.volume(shape)

            host_mem = cuda.pagelocked_empty(size, dtype)
            device_mem = cuda.mem_alloc(host_mem.nbytes)
            self.bindings.append(int(device_mem))
            # Register the device address by tensor name for execute_async_v3.
            self.context.set_tensor_address(name, int(device_mem))

            info = {"name": name, "host": host_mem, "device": device_mem, "shape": shape, "dtype": dtype}
            if self.engine.get_tensor_mode(name) == trt.TensorIOMode.INPUT:
                self.inputs.append(info)
                self.input_shape = shape
            else:
                self.outputs.append(info)

        # Class labels: explicit argument > engine metadata > built-in defaults.
        if class_names is not None:
            self.class_names = dict(class_names)
        elif isinstance(metadata.get("names"), dict):
            # JSON object keys are strings; convert them back to integer ids.
            self.class_names = {int(k): v for k, v in metadata["names"].items()}
        else:
            self.class_names = {0: "helmet", 1: "person", 2: "no-helmet"}
        # BGR drawing colours; ids without an entry are drawn in white.
        self.colors = {0: (0, 255, 0), 1: (255, 165, 0), 2: (0, 0, 255)}

    @staticmethod
    def _split_ultralytics_metadata(data: bytes) -> Tuple[bytes, dict]:
        """Separate an optional Ultralytics metadata header from engine bytes.

        Ultralytics-exported engines are laid out as a 4-byte little-endian
        length, that many bytes of JSON, then the serialized engine. If
        ``data`` does not follow that layout it is returned unchanged.

        Returns:
            ``(engine_bytes, metadata)``; ``metadata`` is ``{}`` when no
            header was found.
        """
        import json  # local import: keeps this block independent of the file's import list

        if len(data) > 4:
            meta_len = int.from_bytes(data[:4], byteorder="little", signed=True)
            if 0 < meta_len <= len(data) - 4:
                try:
                    meta = json.loads(data[4:4 + meta_len].decode("utf-8"))
                except ValueError:  # covers UnicodeDecodeError and JSONDecodeError
                    meta = None
                if isinstance(meta, dict):
                    return data[4 + meta_len:], meta
        return data, {}

    def preprocess(self, image: np.ndarray) -> Tuple[np.ndarray, float, Tuple]:
        """Letterbox a BGR frame into the network input layout.

        Returns:
            ``(blob, scale, (dy, dx))`` where ``blob`` is a float32 array of
            shape (1, 3, input_h, input_w) in RGB order scaled to [0, 1],
            ``scale`` is the resize factor and ``(dy, dx)`` is the top/left
            padding in pixels.
        """
        h, w = image.shape[:2]
        input_h, input_w = self.input_shape[2], self.input_shape[3]

        # Resize while preserving the aspect ratio; never collapse a side to 0.
        scale = min(input_h / h, input_w / w)
        new_h, new_w = max(1, int(h * scale)), max(1, int(w * scale))
        resized = cv2.resize(image, (new_w, new_h))

        # Centre the resized image on a grey (114) canvas.
        canvas = np.full((input_h, input_w, 3), 114, dtype=np.uint8)
        dy, dx = (input_h - new_h) // 2, (input_w - new_w) // 2
        canvas[dy:dy+new_h, dx:dx+new_w] = resized

        # BGR -> RGB, HWC -> CHW, scale to [0, 1], add the batch axis.
        blob = canvas[:, :, ::-1].transpose(2, 0, 1).astype(np.float32) / 255.0
        blob = np.expand_dims(blob, axis=0)

        return blob, scale, (dy, dx)

    def postprocess(self, output: np.ndarray, scale: float, pad: Tuple, orig_shape: Tuple) -> List[dict]:
        """Turn the raw network output into a list of detections.

        Args:
            output: Array indexed as [batch, 4 + num_classes, num_candidates];
                only batch element 0 is used. Rows 0-3 are cx, cy, w, h in
                letterboxed input pixels, the remaining rows are class scores.
            scale: Resize factor returned by ``preprocess``.
            pad: ``(dy, dx)`` padding returned by ``preprocess``.
            orig_shape: ``(height, width)`` of the original frame; boxes are
                clipped to it.

        Returns:
            Dicts with ``bbox`` ([x1, y1, x2, y2] ints in original-image
            pixels), ``score``, ``class_id`` and ``class_name``, ordered by
            descending score. NMS is class-agnostic.
        """
        # Work in float32 even when the engine emits FP16, so box arithmetic
        # does not lose precision.
        predictions = np.asarray(output[0], dtype=np.float32).T  # [candidates, 4 + nc]

        boxes = predictions[:, :4]   # cx, cy, w, h
        scores = predictions[:, 4:]  # per-class scores

        # Best class per candidate, then confidence filtering.
        max_scores = scores.max(axis=1)
        class_ids = scores.argmax(axis=1)
        mask = max_scores > self.conf_thresh
        boxes = boxes[mask]
        max_scores = max_scores[mask]
        class_ids = class_ids[mask]

        if len(boxes) == 0:
            return []

        # xywh -> xyxy
        half_w = boxes[:, 2] / 2
        half_h = boxes[:, 3] / 2
        x1 = boxes[:, 0] - half_w
        y1 = boxes[:, 1] - half_h
        x2 = boxes[:, 0] + half_w
        y2 = boxes[:, 1] + half_h

        # Undo the letterbox: remove the padding, then the resize.
        dy, dx = pad
        x1 = (x1 - dx) / scale
        y1 = (y1 - dy) / scale
        x2 = (x2 - dx) / scale
        y2 = (y2 - dy) / scale

        # NMS on the unclipped boxes.
        keep = self._nms(np.stack([x1, y1, x2, y2], axis=1), max_scores, self.iou_thresh)

        # Clip to the image so callers can draw or crop without bounds checks.
        img_h, img_w = orig_shape[:2]
        x1 = np.clip(x1, 0, img_w)
        x2 = np.clip(x2, 0, img_w)
        y1 = np.clip(y1, 0, img_h)
        y2 = np.clip(y2, 0, img_h)

        results = []
        for i in keep:
            class_id = int(class_ids[i])
            results.append({
                "bbox": [int(x1[i]), int(y1[i]), int(x2[i]), int(y2[i])],
                "score": float(max_scores[i]),
                "class_id": class_id,
                "class_name": self.class_names.get(class_id, "unknown")
            })

        return results

    def _nms(self, boxes: np.ndarray, scores: np.ndarray, iou_thresh: float) -> list:
        """Greedy non-maximum suppression.

        Args:
            boxes: (N, 4) array of x1, y1, x2, y2.
            scores: (N,) array of scores.
            iou_thresh: Boxes whose IoU with a kept box exceeds this are dropped.

        Returns:
            Indices into ``boxes`` of the kept boxes, highest score first.
        """
        x1 = boxes[:, 0]
        y1 = boxes[:, 1]
        x2 = boxes[:, 2]
        y2 = boxes[:, 3]
        areas = (x2 - x1) * (y2 - y1)

        order = scores.argsort()[::-1]
        keep = []

        while order.size > 0:
            i = order[0]
            rest = order[1:]
            keep.append(int(i))

            # Intersection of the current best box with every remaining box.
            xx1 = np.maximum(x1[i], x1[rest])
            yy1 = np.maximum(y1[i], y1[rest])
            xx2 = np.minimum(x2[i], x2[rest])
            yy2 = np.minimum(y2[i], y2[rest])

            inter = np.maximum(0.0, xx2 - xx1) * np.maximum(0.0, yy2 - yy1)
            union = areas[i] + areas[rest] - inter
            # Guard against 0/0 for degenerate (zero-area) boxes.
            iou = inter / np.maximum(union, 1e-9)

            order = rest[iou <= iou_thresh]

        return keep

    def detect(self, image: np.ndarray) -> List[dict]:
        """Run detection on a single BGR frame and return the detections."""
        # Preprocess
        blob, scale, pad = self.preprocess(image)
        orig_shape = image.shape[:2]

        # Host -> device. copyto also casts to the engine's input dtype.
        np.copyto(self.inputs[0]["host"], blob.ravel())
        cuda.memcpy_htod_async(self.inputs[0]["device"], self.inputs[0]["host"], self.stream)

        # Inference. Tensor addresses were registered in __init__, so the
        # name-based execute_async_v3 API is used (execute_async_v2 is not
        # available in newer TensorRT releases).
        self.context.set_input_shape(self.inputs[0]["name"], blob.shape)
        self.context.execute_async_v3(stream_handle=self.stream.handle)

        # Device -> host, then wait for the stream to drain.
        cuda.memcpy_dtoh_async(self.outputs[0]["host"], self.outputs[0]["device"], self.stream)
        self.stream.synchronize()

        output = self.outputs[0]["host"].reshape(self.outputs[0]["shape"])

        # Postprocess
        return self.postprocess(output, scale, pad, orig_shape)

    def draw_detections(self, image: np.ndarray, detections: List[dict]) -> np.ndarray:
        """Return a copy of ``image`` with boxes and labels drawn on it."""
        canvas = image.copy()

        for det in detections:
            x1, y1, x2, y2 = det["bbox"]
            cls_id = det["class_id"]
            score = det["score"]
            color = self.colors.get(cls_id, (255, 255, 255))
            label = f"{det['class_name']} {score:.2f}"

            # Box
            cv2.rectangle(canvas, (x1, y1), (x2, y2), color, 2)

            # Label banner above the box; pushed down when the box touches the
            # top edge so the text stays inside the frame.
            (tw, th), _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.6, 1)
            label_top = max(y1 - th - 10, 0)
            label_bottom = label_top + th + 10
            cv2.rectangle(canvas, (x1, label_top), (x1 + tw, label_bottom), color, -1)
            cv2.putText(canvas, label, (x1, label_bottom - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 255, 255), 1)

        return canvas

5.2 CSI 摄像头实时推理

python 复制代码

#!/usr/bin/env python3
"""realtime_detect.py - CSI 摄像头实时检测"""
import cv2
import time
from jetson_yolo import YOLOv8TRT

def gstreamer_pipeline(sensor_id=0, width=1280, height=720, fps=30, flip_method=0):
    """Build the GStreamer pipeline string for a CSI camera.

    Args:
        sensor_id: Value for ``nvarguscamerasrc sensor-id``.
        width: Frame width, used for both the capture and the converted caps.
        height: Frame height, used for both the capture and the converted caps.
        fps: Capture frame rate (``framerate=<fps>/1``).
        flip_method: Value for ``nvvidconv flip-method``; the default 0 keeps
            the previous hard-coded behaviour.

    Returns:
        A pipeline description ending in ``appsink`` that delivers BGR frames,
        suitable for ``cv2.VideoCapture(..., cv2.CAP_GSTREAMER)``.
    """
    elements = [
        f"nvarguscamerasrc sensor-id={sensor_id}",
        f"video/x-raw(memory:NVMM), width={width}, height={height}, "
        f"format=NV12, framerate={fps}/1",
        f"nvvidconv flip-method={flip_method}",
        f"video/x-raw, width={width}, height={height}, format=BGRx",
        "videoconvert",
        "video/x-raw, format=BGR",
        "appsink",
    ]
    return " ! ".join(elements)

def main():
    """Run live detection on CSI camera 0 until 'q' is pressed or capture fails."""
    # Load the model.
    model = YOLOv8TRT("helmet_yolov8s.engine", conf_thresh=0.3)

    # Open the CSI camera.
    cap = cv2.VideoCapture(gstreamer_pipeline(0, 1280, 720, 30), cv2.CAP_GSTREAMER)

    if not cap.isOpened():
        print("❌ 无法打开摄像头")
        return

    # FPS is the number of frames processed during the last full second.
    fps_counter = 0
    fps_start = time.time()
    fps_display = 0

    print("🎯 按 'q' 退出")

    # try/finally so the camera and windows are released even when the loop is
    # left through an exception or Ctrl-C.
    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break

            # Detect and draw.
            detections = model.detect(frame)
            canvas = model.draw_detections(frame, detections)

            # Update the FPS figure once per second.
            fps_counter += 1
            if time.time() - fps_start >= 1.0:
                fps_display = fps_counter
                fps_counter = 0
                fps_start = time.time()

            # Overlay FPS and object count.
            cv2.putText(canvas, f"FPS: {fps_display}", (10, 30),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
            cv2.putText(canvas, f"Objects: {len(detections)}", (10, 70),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

            cv2.imshow("YOLOv8 Detection", canvas)

            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
    finally:
        cap.release()
        cv2.destroyAllWindows()

if __name__ == "__main__":
    main()

5.3 USB 摄像头实时推理

python 复制代码

#!/usr/bin/env python3
"""usb_camera_detect.py - USB 摄像头推理"""
import cv2
import time
from jetson_yolo import YOLOv8TRT

def main():
    """Run live detection on USB camera 0 until 'q' is pressed or capture fails."""
    model = YOLOv8TRT("helmet_yolov8s.engine", conf_thresh=0.3)

    # USB camera: request 1280x720 at 30 FPS.
    cap = cv2.VideoCapture(0)
    cap.set(cv2.CAP_PROP_FRAME_WIDTH, 1280)
    cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 720)
    cap.set(cv2.CAP_PROP_FPS, 30)

    # Alternative: capture through GStreamer (recommended)
    # cap = cv2.VideoCapture(
    #     "v4l2src device=/dev/video0 ! video/x-raw,width=1280,height=720,framerate=30/1 ! "
    #     "videoconvert ! video/x-raw,format=BGR ! appsink",
    #     cv2.CAP_GSTREAMER
    # )

    # Fail loudly instead of exiting silently on the first failed read.
    if not cap.isOpened():
        print("❌ 无法打开摄像头")
        return

    # FPS is the number of frames processed during the last full second.
    fps_counter = 0
    fps_start = time.time()
    fps_display = 0

    # try/finally so the camera and windows are released even when the loop is
    # left through an exception or Ctrl-C.
    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break

            detections = model.detect(frame)
            canvas = model.draw_detections(frame, detections)

            fps_counter += 1
            if time.time() - fps_start >= 1.0:
                fps_display = fps_counter
                fps_counter = 0
                fps_start = time.time()

            cv2.putText(canvas, f"FPS: {fps_display}", (10, 30),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

            cv2.imshow("Detection", canvas)
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
    finally:
        cap.release()
        cv2.destroyAllWindows()

if __name__ == "__main__":
    main()

6. 多摄像头并行推理

python 复制代码

#!/usr/bin/env python3
"""multi_camera.py - 多摄像头并行推理"""
import cv2
import threading
import queue
import time
import numpy as np
from jetson_yolo import YOLOv8TRT

class CameraThread(threading.Thread):
    """Daemon thread that captures frames from one camera into a queue.

    Frames are queued as ``(camera_id, frame)`` tuples. Set ``running`` to
    False to make ``run`` leave its loop and release the capture.
    """

    def __init__(self, camera_id, pipeline_func, frame_queue):
        """
        Args:
            camera_id: Identifier passed to ``pipeline_func`` and attached to
                every queued frame.
            pipeline_func: Callable mapping ``camera_id`` to a GStreamer
                pipeline string.
            frame_queue: ``queue.Queue`` that receives the frames.
        """
        super().__init__(daemon=True)
        self.camera_id = camera_id
        self.pipeline_func = pipeline_func
        self.frame_queue = frame_queue
        self.running = True

    def _enqueue_latest(self, frame):
        """Queue ``frame``, discarding the oldest queued frame when full.

        Keeping the newest frames means the consumer never works on stale
        images when inference is slower than capture.
        """
        item = (self.camera_id, frame)
        try:
            self.frame_queue.put_nowait(item)
            return
        except queue.Full:
            pass
        # Make room by dropping the oldest frame; the consumer may have
        # emptied the queue in the meantime, which is fine.
        try:
            self.frame_queue.get_nowait()
        except queue.Empty:
            pass
        try:
            self.frame_queue.put_nowait(item)
        except queue.Full:
            pass  # another producer refilled the queue; skip this frame

    def run(self):
        """Capture frames until ``running`` is cleared."""
        cap = cv2.VideoCapture(self.pipeline_func(self.camera_id), cv2.CAP_GSTREAMER)
        try:
            if not cap.isOpened():
                # Stop instead of polling a capture that will never deliver.
                print(f"❌ 无法打开摄像头 {self.camera_id}")
                return
            while self.running:
                ret, frame = cap.read()
                if ret:
                    self._enqueue_latest(frame)
                else:
                    time.sleep(0.01)  # back off briefly after a failed read
        finally:
            cap.release()

class MultiCameraDetector:
    """Run one shared detector over frames from several cameras.

    Each camera is read by its own ``CameraThread``; inference and display
    happen sequentially in the thread that calls ``start``.
    """

    def __init__(self, engine_path: str, camera_ids: list):
        """
        Args:
            engine_path: Path of the TensorRT engine passed to ``YOLOv8TRT``.
            camera_ids: Sensor ids of the cameras to monitor.
        """
        self.model = YOLOv8TRT(engine_path)
        self.camera_ids = camera_ids
        # Small per-camera queues keep the capture-to-display latency bounded.
        self.frame_queues = {cid: queue.Queue(maxsize=2) for cid in camera_ids}
        # Allocated per camera but not used by start().
        self.result_queues = {cid: queue.Queue(maxsize=2) for cid in camera_ids}

    def csi_pipeline(self, sensor_id):
        """Return the 640x480 @ 30 FPS CSI GStreamer pipeline for ``sensor_id``."""
        return (
            f"nvarguscamerasrc sensor-id={sensor_id} ! "
            f"video/x-raw(memory:NVMM), width=640, height=480, format=NV12, framerate=30/1 ! "
            f"nvvidconv ! video/x-raw, format=BGRx ! videoconvert ! "
            f"video/x-raw, format=BGR ! appsink"
        )

    def start(self):
        """Start the capture threads and run the detection loop.

        Blocks until 'q' is pressed. The capture threads are always stopped
        and the windows closed on exit, including on exceptions.
        """
        # Start one capture thread per camera.
        self.threads = []
        for cid in self.camera_ids:
            t = CameraThread(cid, self.csi_pipeline, self.frame_queues[cid])
            t.start()
            self.threads.append(t)

        print(f"🎯 监控 {len(self.camera_ids)} 路摄像头")

        try:
            while True:
                for cid in self.camera_ids:
                    # Wait briefly for this camera, then move on so one quiet
                    # camera cannot stall the others for long.
                    try:
                        _, frame = self.frame_queues[cid].get(timeout=0.1)
                    except queue.Empty:
                        continue

                    detections = self.model.detect(frame)
                    canvas = self.model.draw_detections(frame, detections)
                    cv2.imshow(f"Camera {cid}", canvas)

                if cv2.waitKey(1) & 0xFF == ord('q'):
                    break
        finally:
            # Ask every thread to stop, then give each a moment to release its
            # capture before the process moves on.
            for t in self.threads:
                t.running = False
            for t in self.threads:
                t.join(timeout=1.0)
            cv2.destroyAllWindows()

if __name__ == "__main__":
    detector = MultiCameraDetector("helmet_yolov8s.engine", [0, 1])
    detector.start()

7. 视频文件批量处理

python 复制代码

#!/usr/bin/env python3
"""video_batch.py - 视频文件批量处理"""
import cv2
import os
import time
from jetson_yolo import YOLOv8TRT

def process_video(model, video_path, output_path=None):
    """Run detection on every frame of one video file.

    Args:
        model: Detector exposing ``detect(frame)`` and
            ``draw_detections(frame, detections)``.
        video_path: Path of the input video.
        output_path: Optional path for an annotated copy (written with the
            'mp4v' codec). When omitted, frames are only counted.

    Progress is printed every 100 frames and a summary at the end. Nothing is
    returned.
    """
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print(f"  ❌ 无法打开视频: {video_path}")
        cap.release()
        return

    # Keep the fractional rate (e.g. 29.97) so the output does not drift, and
    # fall back to 30 when the container reports no usable value (0 or NaN).
    fps = cap.get(cv2.CAP_PROP_FPS)
    if not fps > 0:
        fps = 30.0
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

    writer = None
    if output_path:
        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        writer = cv2.VideoWriter(output_path, fourcc, fps, (width, height))
        if not writer.isOpened():
            # Continue without writing rather than silently dropping frames.
            print(f"  ⚠️ 无法写入输出视频: {output_path}")
            writer.release()
            writer = None

    frame_count = 0
    total_detections = 0
    start_time = time.time()

    # try/finally so the reader and writer are closed even if detection fails.
    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break

            detections = model.detect(frame)
            total_detections += len(detections)

            if writer:
                canvas = model.draw_detections(frame, detections)
                writer.write(canvas)

            frame_count += 1
            if frame_count % 100 == 0:
                # max() avoids a division by zero on a coarse clock.
                elapsed = max(time.time() - start_time, 1e-9)
                current_fps = frame_count / elapsed
                print(f"  帧 {frame_count}/{total_frames} | FPS: {current_fps:.1f} | 检测数: {total_detections}")
    finally:
        cap.release()
        if writer:
            writer.release()

    elapsed = max(time.time() - start_time, 1e-9)
    print(f"  完成: {frame_count} 帧, {elapsed:.1f}s, {frame_count/elapsed:.1f} FPS, {total_detections} 检测")

if __name__ == "__main__":
    # Load the engine once and reuse it for every video.
    model = YOLOv8TRT("helmet_yolov8s.engine")

    video_dir = "videos/"
    output_dir = "results/"
    os.makedirs(output_dir, exist_ok=True)

    # sorted(): os.listdir order is arbitrary; a stable order makes runs
    # reproducible. lower(): also pick up files such as CLIP.MP4.
    for video_file in sorted(os.listdir(video_dir)):
        if not video_file.lower().endswith(('.mp4', '.avi', '.mov')):
            continue
        video_path = os.path.join(video_dir, video_file)
        output_path = os.path.join(output_dir, f"det_{video_file}")
        print(f"处理: {video_file}")
        process_video(model, video_path, output_path)

8. 性能优化技巧

8.1 Jetson 性能调优

bash 复制代码

# 1. Switch to the maximum-performance power mode
sudo nvpmodel -m 0
sudo jetson_clocks

# 2. Add swap space.
# Both steps are guarded so the block can be re-run safely: fallocate fails on
# a swap file that is already active, and an unguarded append would add a
# duplicate /etc/fstab entry on every run.
if [ ! -e /swapfile ]; then
    sudo fallocate -l 8G /swapfile
    sudo chmod 600 /swapfile
    sudo mkswap /swapfile
    sudo swapon /swapfile
fi
grep -qE '^/swapfile[[:space:]]' /etc/fstab || \
    echo '/swapfile none swap sw 0 0' | sudo tee -a /etc/fstab

# 3. Boot without the desktop environment (inference-only setups)
sudo systemctl set-default multi-user.target
# To restore: sudo systemctl set-default graphical.target

# 4. Pin the GPU frequency by raising min_freq to max_freq.
# NOTE(review): this sysfs path is board- and release-specific — confirm it
# exists on the target device before running.
cat /sys/devices/17000000.ga10b/devfreq/17000000.ga10b/max_freq | sudo tee \
    /sys/devices/17000000.ga10b/devfreq/17000000.ga10b/min_freq

8.2 代码优化

python 复制代码

# 1. 使用 CUDA Stream 重叠计算和数据传输
# 2. 预分配内存，避免运行时分配
# 3. 使用半精度推理（FP16）
# 4. 减少 CPU-GPU 数据拷贝
# 5. 多线程采集 + 单线程推理

# 优化前后对比（YOLOv8s on Orin NX 16GB）：
# ┌─────────────────┬──────────┬──────────┐
# │ 优化项           │ 优化前    │ 优化后    │
# ├─────────────────┼──────────┼──────────┤
# │ 预处理           │ 8 ms     │ 3 ms     │
# │ 推理             │ 25 ms    │ 12 ms    │
# │ 后处理           │ 5 ms     │ 2 ms     │
# │ 总延迟           │ 38 ms    │ 17 ms    │
# │ FPS             │ 26       │ 59       │
# └─────────────────┴──────────┴──────────┘

总结

YOLOv8 在 Jetson 上的部署流程：

数据准备 → Roboflow 标注 + 增强
模型训练 → 选择 YOLOv8s/n 平衡精度和速度
导出 TensorRT → model.export(format='engine', half=True)
实时推理 → CSI/USB 摄像头 + GStreamer 加速
性能优化 → nvpmodel -m 0 + jetson_clocks

预期性能：YOLOv8s on Orin NX 16GB = 50-60 FPS (640×640)