Jetson + OpenCV DNN 部署:不依赖 TensorRT 的轻量方案

Jetson + OpenCV DNN 部署:不依赖 TensorRT 的轻量方案

1. OpenCV DNN 优势

复制代码
OpenCV DNN vs TensorRT:
├── 易用性:OpenCV DNN 更简单,无需转换
├── 兼容性:支持 ONNX/TF/Caffe 直接加载
├── 性能:TensorRT 更快(2-5x)
├── 适用场景:原型验证、多框架混合、快速迭代
└── 推荐:验证用 OpenCV DNN,生产用 TensorRT

2. 基础推理

python 复制代码
#!/usr/bin/env python3
"""opencv_dnn_basic.py - OpenCV DNN 基础推理"""
import cv2
import numpy as np
import time

class OpenCVDetector:
    """Object detector built on OpenCV's DNN module.

    Loads an ONNX, TensorFlow (``.pb``) or Caffe (``.caffemodel``) network,
    selects the CUDA backend/target, and decodes the first network output as
    a YOLOv8-style head (4 box values followed by per-class scores).
    """

    def __init__(self, model_path, conf_thresh=0.3, nms_thresh=0.45,
                 config_path=None):
        """Load the network and remember the filtering thresholds.

        Args:
            model_path: Path to the model weights; the file extension
                selects the loader.
            conf_thresh: Minimum best-class score for a candidate to be kept.
            nms_thresh: Overlap threshold handed to ``cv2.dnn.NMSBoxes``.
            config_path: Optional network description file. For Caffe this
                is the ``.prototxt`` (defaults to the file next to
                ``model_path`` with the same stem); for TensorFlow it is the
                optional text graph. Ignored for ONNX.

        Raises:
            ValueError: If ``model_path`` has an unsupported extension.
        """
        if model_path.endswith('.onnx'):
            self.net = cv2.dnn.readNetFromONNX(model_path)
        elif model_path.endswith('.pb'):
            if config_path is None:
                self.net = cv2.dnn.readNetFromTensorflow(model_path)
            else:
                self.net = cv2.dnn.readNetFromTensorflow(model_path, config_path)
        elif model_path.endswith('.caffemodel'):
            # readNetFromCaffe takes the prototxt FIRST and the weights
            # second; passing only the .caffemodel would have it parsed as
            # the network description.
            if config_path is None:
                config_path = model_path[:-len('.caffemodel')] + '.prototxt'
            self.net = cv2.dnn.readNetFromCaffe(config_path, model_path)
        else:
            # Fail early with a clear message instead of hitting an
            # AttributeError on the missing self.net below.
            raise ValueError(
                f"Unsupported model format: {model_path!r} "
                "(expected .onnx, .pb or .caffemodel)"
            )

        # Run on the CUDA backend. If the OpenCV build has no CUDA DNN
        # support, switch to cv2.dnn.DNN_BACKEND_OPENCV with
        # cv2.dnn.DNN_TARGET_CPU instead.
        self.net.setPreferableBackend(cv2.dnn.DNN_BACKEND_CUDA)
        self.net.setPreferableTarget(cv2.dnn.DNN_TARGET_CUDA)

        self.conf_thresh = conf_thresh
        self.nms_thresh = nms_thresh

    def detect(self, image, input_size=640):
        """Run the network on one image.

        Args:
            image: Image array as returned by ``cv2.imread``.
            input_size: Side length of the square network input.

        Returns:
            A ``(detections, infer_time)`` tuple: the list produced by
            ``_parse_output`` and the forward-pass time in milliseconds.

        Raises:
            ValueError: If ``image`` is ``None`` or empty (``cv2.imread``
                returns ``None`` rather than raising on a bad path).
        """
        if image is None or getattr(image, "size", 0) == 0:
            raise ValueError("image is empty or could not be loaded")

        h, w = image.shape[:2]

        # Preprocess: scale to [0, 1], resize to the network input (no
        # cropping, aspect ratio is not preserved) and swap the R/B channels.
        blob = cv2.dnn.blobFromImage(
            image, 1/255.0, (input_size, input_size),
            swapRB=True, crop=False
        )

        # Inference; perf_counter is monotonic, so the measured interval is
        # not affected by wall-clock adjustments.
        self.net.setInput(blob)
        start = time.perf_counter()
        outputs = self.net.forward(self.net.getUnconnectedOutLayersNames())
        infer_time = (time.perf_counter() - start) * 1000

        # Post-process (YOLOv8 layout), scaling with the size actually used.
        detections = self._parse_output(outputs, w, h, input_size)

        return detections, infer_time

    def _parse_output(self, outputs, orig_w, orig_h, input_size=640):
        """Decode a YOLOv8-style output into a list of detections.

        Args:
            outputs: Network outputs; ``outputs[0]`` is read as
                ``[1, 4 + num_classes, num_candidates]``.
            orig_w: Width of the original image in pixels.
            orig_h: Height of the original image in pixels.
            input_size: Side length of the network input the boxes are
                expressed in; used to scale them back to the original image.

        Returns:
            A list of dicts with ``"bbox"`` (``[x1, y1, x2, y2]`` ints),
            ``"score"`` (float) and ``"class_id"`` (int). Empty when no
            candidate exceeds ``conf_thresh``.
        """
        predictions = outputs[0][0].T  # -> [num_candidates, 4 + num_classes]

        boxes = predictions[:, :4]
        scores = predictions[:, 4:]

        # Best class and its score for every candidate.
        max_scores = scores.max(axis=1)
        class_ids = scores.argmax(axis=1)

        # Drop low-confidence candidates.
        keep = max_scores > self.conf_thresh
        boxes = boxes[keep]
        max_scores = max_scores[keep]
        class_ids = class_ids[keep]

        if len(boxes) == 0:
            return []

        # Centre/size (xywh) -> corner (xyxy) coordinates.
        x1 = boxes[:, 0] - boxes[:, 2] / 2
        y1 = boxes[:, 1] - boxes[:, 3] / 2
        x2 = boxes[:, 0] + boxes[:, 2] / 2
        y2 = boxes[:, 1] + boxes[:, 3] / 2

        # Scale from network-input coordinates back to the original image.
        x1 = x1 * orig_w / input_size
        y1 = y1 * orig_h / input_size
        x2 = x2 * orig_w / input_size
        y2 = y2 * orig_h / input_size

        # NMSBoxes expects [x, y, width, height] boxes.
        nms_boxes = [
            [int(left), int(top), int(right - left), int(bottom - top)]
            for left, top, right, bottom in zip(x1, y1, x2, y2)
        ]
        indices = cv2.dnn.NMSBoxes(
            nms_boxes,
            max_scores.tolist(),
            self.conf_thresh,
            self.nms_thresh
        )
        # Depending on the OpenCV version the result is a flat array, an
        # N x 1 array, or an empty tuple; normalise to a flat index array.
        indices = np.asarray(indices, dtype=int).reshape(-1)

        return [
            {
                "bbox": [int(x1[i]), int(y1[i]), int(x2[i]), int(y2[i])],
                "score": float(max_scores[i]),
                "class_id": int(class_ids[i]),
            }
            for i in indices
        ]

if __name__ == "__main__":
    # Demo entry point: load a YOLOv8s ONNX model, run it on a single image
    # and report the number of detections and the inference time.
    detector = OpenCVDetector("yolov8s.onnx")

    image_path = "test.jpg"
    image = cv2.imread(image_path)
    # cv2.imread does not raise on a missing/unreadable file; it returns
    # None, which would otherwise fail later with an unrelated error.
    if image is None:
        raise SystemExit(f"无法读取图像: {image_path}")

    detections, infer_time = detector.detect(image)
    print(f"检测到 {len(detections)} 个目标,耗时 {infer_time:.1f}ms")

3. 多模型混合推理

python 复制代码
#!/usr/bin/env python3
"""multi_model_opencv.py - 多模型混合推理"""
import cv2

class MultiModelPipeline:
    """Pipeline chaining a detector, a classifier and a segmentation model.

    NOTE(review): ``process`` calls ``self._detect``, ``self._classify`` and
    ``self._segment``, which are not defined in this excerpt; they must be
    provided (e.g. by a subclass) before ``process`` can run.
    """

    def __init__(self, detector_path="yolov8s.onnx",
                 classifier_path="resnet50.onnx",
                 segmentor_path="unet.onnx"):
        """Load the three ONNX models on the CUDA backend.

        Args:
            detector_path: ONNX file of the detection model.
            classifier_path: ONNX file of the classification model.
            segmentor_path: ONNX file of the segmentation model.
        """
        self.detector = self._load_net(detector_path)
        self.classifier = self._load_net(classifier_path)
        self.segmentor = self._load_net(segmentor_path)

    @staticmethod
    def _load_net(model_path):
        """Read an ONNX model and select the CUDA backend and target."""
        net = cv2.dnn.readNetFromONNX(model_path)
        net.setPreferableBackend(cv2.dnn.DNN_BACKEND_CUDA)
        net.setPreferableTarget(cv2.dnn.DNN_TARGET_CUDA)
        return net

    def process(self, image):
        """Detect objects, classify each detection crop, segment the image.

        Args:
            image: Image array indexed as ``image[row, col]``.

        Returns:
            A ``(det_result, seg_mask)`` tuple. Every detection dict gains a
            ``"classification"`` entry; it is ``None`` when the box has no
            area inside the image.
        """
        img_h, img_w = image.shape[:2]

        # 1. Detection.
        det_result = self._detect(image)

        # 2. Classify the crop of every detection box.
        for det in det_result:
            x1, y1, x2, y2 = det["bbox"]
            # Clamp to the image: negative slice bounds would wrap around
            # and silently select the wrong region.
            x1 = min(max(x1, 0), img_w)
            x2 = min(max(x2, 0), img_w)
            y1 = min(max(y1, 0), img_h)
            y2 = min(max(y2, 0), img_h)
            if x2 > x1 and y2 > y1:
                det["classification"] = self._classify(image[y1:y2, x1:x2])
            else:
                # Nothing left to classify once the box is clipped.
                det["classification"] = None

        # 3. Segmentation of the whole image.
        seg_mask = self._segment(image)

        return det_result, seg_mask

4. OpenCV DNN CUDA 编译

bash 复制代码
# NOTE: the OpenCV build bundled with JetPack is typically compiled WITHOUT
# CUDA support, so verify the build before relying on DNN_BACKEND_CUDA.
# Check CUDA support:
python3 -c "
import cv2
print(f'OpenCV: {cv2.__version__}')
print(f'CUDA: {cv2.cuda.getCudaEnabledDeviceCount()} devices')
# DNN_BACKEND_CUDA is just an enum constant and exists in every build;
# an empty target list means the CUDA DNN backend is not actually available.
print(f'DNN CUDA: {len(cv2.dnn.getAvailableTargets(cv2.dnn.DNN_BACKEND_CUDA))} targets')
"

# If CUDA is not available (0 devices / 0 targets), rebuild OpenCV with CUDA.
# Reference: https://github.com/mdegans/nano_build_opencv

5. 性能对比

复制代码
OpenCV DNN vs TensorRT(YOLOv8s on Orin NX):
┌──────────────┬──────────┬──────────┬──────────┐
│ 后端          │ FP32     │ FP16     │ INT8     │
├──────────────┼──────────┼──────────┼──────────┤
│ OpenCV CPU   │ 120ms    │ -        │ -        │
│ OpenCV CUDA  │ 18ms     │ 12ms     │ -        │
│ TensorRT     │ 12ms     │ 6.5ms    │ 4.2ms    │
└──────────────┴──────────┴──────────┴──────────┘

结论:
- 同精度下 OpenCV CUDA 比 TensorRT 慢约 1.5-2x(FP32:18ms vs 12ms;FP16:12ms vs 6.5ms);若 TensorRT 使用 INT8,差距可达 3-4x
- 但无需转换模型,开发效率高
- 适合原型验证和快速迭代

总结

场景 推荐方案
原型验证 OpenCV DNN + CUDA
快速迭代 OpenCV DNN
生产部署 TensorRT
多框架混合 OpenCV DNN

核心要点:

  1. 直接加载:ONNX/TF/Caffe 模型无需转换
  2. CUDA 加速:setPreferableBackend(DNN_BACKEND_CUDA)
  3. 兼容性好:可直接加载 ONNX/TensorFlow/Caffe 等主流格式(PyTorch 等框架的模型需先导出为 ONNX)
  4. 性能够用:YOLOv8s 在 Orin NX 上(OpenCV CUDA,FP32)推理约 18ms,可达 30+ FPS