Jetson + OpenCV DNN 部署:不依赖 TensorRT 的轻量方案
1. OpenCV DNN 优势
OpenCV DNN vs TensorRT:
├── 易用性:OpenCV DNN 更简单,无需转换
├── 兼容性:支持 ONNX/TF/Caffe 直接加载
├── 性能:TensorRT 更快(2-5x)
├── 适用场景:原型验证、多框架混合、快速迭代
└── 推荐:验证用 OpenCV DNN,生产用 TensorRT
2. 基础推理
python
#!/usr/bin/env python3
"""opencv_dnn_basic.py - OpenCV DNN 基础推理"""
import cv2
import numpy as np
import time
class OpenCVDetector:
    """Object detector running a YOLOv8-style model through OpenCV DNN on CUDA.

    The network is loaded directly from an ONNX, TensorFlow or Caffe file and
    configured for the CUDA backend/target. ``detect`` resizes the image to a
    square input, runs a forward pass and returns NMS-filtered boxes in the
    coordinate system of the original image.
    """

    def __init__(self, model_path, conf_thresh=0.3, nms_thresh=0.45):
        """Load the model and select the CUDA backend.

        Args:
            model_path: Path to a ``.onnx``, ``.pb`` or ``.caffemodel`` file.
                For Caffe, the network definition is assumed to live next to
                the weights under the same name with a ``.prototxt`` suffix.
            conf_thresh: Minimum class score for a candidate box to be kept.
            nms_thresh: IoU threshold passed to non-maximum suppression.

        Raises:
            ValueError: If ``model_path`` has an unsupported extension.
        """
        if model_path.endswith('.onnx'):
            self.net = cv2.dnn.readNetFromONNX(model_path)
        elif model_path.endswith('.pb'):
            self.net = cv2.dnn.readNetFromTensorflow(model_path)
        elif model_path.endswith('.caffemodel'):
            # readNetFromCaffe expects (prototxt, caffemodel); passing only the
            # weights file would make OpenCV parse it as the network definition.
            prototxt = model_path[:-len('.caffemodel')] + '.prototxt'
            self.net = cv2.dnn.readNetFromCaffe(prototxt, model_path)
        else:
            # Fail here with a clear message instead of an AttributeError on
            # self.net a few lines further down.
            raise ValueError(
                f"Unsupported model format: {model_path!r} "
                "(expected .onnx, .pb or .caffemodel)"
            )

        # Run on the GPU. If OpenCV was built without CUDA, switch these to
        # cv2.dnn.DNN_BACKEND_OPENCV / cv2.dnn.DNN_TARGET_CPU instead.
        self.net.setPreferableBackend(cv2.dnn.DNN_BACKEND_CUDA)
        self.net.setPreferableTarget(cv2.dnn.DNN_TARGET_CUDA)

        self.conf_thresh = conf_thresh
        self.nms_thresh = nms_thresh

    def detect(self, image, input_size=640):
        """Run detection on one image.

        Args:
            image: Image array as returned by ``cv2.imread`` (BGR; channels
                are swapped to RGB during preprocessing).
            input_size: Side length of the square network input.

        Returns:
            A ``(detections, infer_time)`` tuple: the list produced by
            ``_parse_output`` and the forward-pass time in milliseconds.
        """
        h, w = image.shape[:2]

        # Preprocess: scale pixels to [0, 1], resize (no crop, no letterbox),
        # and swap BGR -> RGB.
        blob = cv2.dnn.blobFromImage(
            image, 1/255.0, (input_size, input_size),
            swapRB=True, crop=False
        )

        # Only the forward pass is timed; perf_counter is monotonic, so the
        # measurement is not affected by wall-clock adjustments.
        self.net.setInput(blob)
        start = time.perf_counter()
        outputs = self.net.forward(self.net.getUnconnectedOutLayersNames())
        infer_time = (time.perf_counter() - start) * 1000

        # Post-process (YOLOv8 output layout), scaling by the size actually used.
        detections = self._parse_output(outputs, w, h, input_size)
        return detections, infer_time

    def _parse_output(self, outputs, orig_w, orig_h, input_size=640):
        """Decode YOLOv8-style output into a list of detections.

        Args:
            outputs: Forward-pass outputs; ``outputs[0]`` is expected to be
                laid out as ``[1, 4 + num_classes, num_candidates]``
                (e.g. ``[1, 84, 8400]``).
            orig_w: Width of the original image in pixels.
            orig_h: Height of the original image in pixels.
            input_size: Side length of the network input the boxes refer to.

        Returns:
            A list of dicts with keys ``"bbox"`` (``[x1, y1, x2, y2]`` ints in
            original-image pixels), ``"score"`` (float) and ``"class_id"``
            (int). Empty when nothing passes the confidence threshold.
        """
        predictions = outputs[0][0].T  # -> [num_candidates, 4 + num_classes]
        boxes = predictions[:, :4]
        scores = predictions[:, 4:]

        # Best class per candidate.
        max_scores = scores.max(axis=1)
        class_ids = scores.argmax(axis=1)

        # Confidence filter.
        mask = max_scores > self.conf_thresh
        boxes = boxes[mask]
        max_scores = max_scores[mask]
        class_ids = class_ids[mask]
        if len(boxes) == 0:
            return []

        # Centre/size (cx, cy, w, h) -> corner coordinates.
        x1 = boxes[:, 0] - boxes[:, 2] / 2
        y1 = boxes[:, 1] - boxes[:, 3] / 2
        x2 = boxes[:, 0] + boxes[:, 2] / 2
        y2 = boxes[:, 1] + boxes[:, 3] / 2

        # Map from network-input pixels back to the original image. The image
        # was resized without letterboxing, so each axis scales independently.
        x1 = x1 * orig_w / input_size
        y1 = y1 * orig_h / input_size
        x2 = x2 * orig_w / input_size
        y2 = y2 * orig_h / input_size

        # NMSBoxes takes boxes as [x, y, width, height].
        nms_boxes = np.stack(
            [x1, y1, x2 - x1, y2 - y1], axis=1
        ).astype(int).tolist()
        indices = cv2.dnn.NMSBoxes(
            nms_boxes,
            max_scores.tolist(),
            self.conf_thresh,
            self.nms_thresh
        )
        # Depending on the OpenCV version the result is an (N, 1) array, a
        # flat array, or an empty tuple; normalise to a flat int array.
        keep = np.asarray(indices, dtype=int).reshape(-1)

        return [
            {
                "bbox": [int(x1[i]), int(y1[i]), int(x2[i]), int(y2[i])],
                "score": float(max_scores[i]),
                "class_id": int(class_ids[i])
            }
            for i in keep
        ]
if __name__ == "__main__":
    # Demo entry point: run one detection pass over test.jpg and print the
    # number of detections and the forward-pass time.
    detector = OpenCVDetector("yolov8s.onnx")
    image = cv2.imread("test.jpg")
    if image is None:
        # cv2.imread reports a missing/unreadable file by returning None
        # instead of raising, which would otherwise crash inside detect().
        raise SystemExit("无法读取图像: test.jpg")
    detections, infer_time = detector.detect(image)
    print(f"检测到 {len(detections)} 个目标,耗时 {infer_time:.1f}ms")
3. 多模型混合推理
python
#!/usr/bin/env python3
"""multi_model_opencv.py - 多模型混合推理"""
import cv2
class MultiModelPipeline:
    """Pipeline combining detection, per-box classification and segmentation.

    Three ONNX networks are loaded through OpenCV DNN and configured for the
    CUDA backend. ``process`` relies on ``_detect``, ``_classify`` and
    ``_segment``, which are not defined in this class as shown; they must be
    provided (for example by a subclass) before ``process`` is called.
    """

    def __init__(self, detector_path="yolov8s.onnx",
                 classifier_path="resnet50.onnx",
                 segmentor_path="unet.onnx"):
        """Load the three models.

        Args:
            detector_path: ONNX file of the detection model.
            classifier_path: ONNX file of the classification model.
            segmentor_path: ONNX file of the segmentation model.
        """
        self.detector = self._load_cuda_net(detector_path)
        self.classifier = self._load_cuda_net(classifier_path)
        self.segmentor = self._load_cuda_net(segmentor_path)

    @staticmethod
    def _load_cuda_net(model_path):
        """Read an ONNX model and select the CUDA backend and target for it."""
        net = cv2.dnn.readNetFromONNX(model_path)
        net.setPreferableBackend(cv2.dnn.DNN_BACKEND_CUDA)
        net.setPreferableTarget(cv2.dnn.DNN_TARGET_CUDA)
        return net

    def process(self, image):
        """Run all three stages on one image.

        Args:
            image: Image array indexed as ``image[row, column]``.

        Returns:
            A ``(det_result, seg_mask)`` tuple. Each detection dict gains a
            ``"classification"`` entry holding the result of ``_classify`` on
            its crop, or ``None`` when the box has no area inside the image.
        """
        # 1. Detection.
        det_result = self._detect(image)

        # 2. Classify the crop of every detected box.
        img_h, img_w = image.shape[:2]
        for det in det_result:
            x1, y1, x2, y2 = det["bbox"]
            # Clamp to the image: a negative coordinate would otherwise be
            # interpreted as "count from the end" by slicing and produce an
            # empty or wrong crop.
            x1, x2 = max(0, x1), min(img_w, x2)
            y1, y2 = max(0, y1), min(img_h, y2)
            if x2 <= x1 or y2 <= y1:
                det["classification"] = None
                continue
            det["classification"] = self._classify(image[y1:y2, x1:x2])

        # 3. Whole-image segmentation.
        seg_mask = self._segment(image)
        return det_result, seg_mask
4. OpenCV DNN CUDA 编译
bash
# Whether OpenCV on a Jetson has CUDA enabled depends on how it was
# installed/built, so verify it rather than assuming.
python3 -c "
import cv2
print(f'OpenCV: {cv2.__version__}')
print(f'CUDA: {cv2.cuda.getCudaEnabledDeviceCount()} devices')
# DNN_BACKEND_CUDA is just an enum constant and exists in every build;
# an empty target list here means the DNN module cannot use CUDA.
print(f'DNN CUDA targets: {cv2.dnn.getAvailableTargets(cv2.dnn.DNN_BACKEND_CUDA)}')
"
# If CUDA is not available, rebuild OpenCV with CUDA enabled.
# Reference: https://github.com/mdegans/nano_build_opencv
5. 性能对比
OpenCV DNN vs TensorRT(YOLOv8s on Orin NX):
┌──────────────┬──────────┬──────────┬──────────┐
│ 后端 │ FP32 │ FP16 │ INT8 │
├──────────────┼──────────┼──────────┼──────────┤
│ OpenCV CPU │ 120ms │ - │ - │
│ OpenCV CUDA │ 18ms │ 12ms │ - │
│ TensorRT │ 12ms │ 6.5ms │ 4.2ms │
└──────────────┴──────────┴──────────┴──────────┘
结论:
- 同精度下 OpenCV CUDA 比 TensorRT 慢约 1.5–2x(FP32:18ms vs 12ms;FP16:12ms vs 6.5ms),且 TensorRT 还可用 INT8 进一步提速
- 但无需转换模型,开发效率高
- 适合原型验证和快速迭代
总结
| 场景 | 推荐方案 |
|---|---|
| 原型验证 | OpenCV DNN + CUDA |
| 快速迭代 | OpenCV DNN |
| 生产部署 | TensorRT |
| 多框架混合 | OpenCV DNN |
核心要点:
- 直接加载:ONNX/TF/Caffe 模型无需转换
- CUDA 加速:setPreferableBackend(DNN_BACKEND_CUDA)
- 兼容性好:支持 ONNX、TensorFlow、Caffe 等主流模型格式
- 性能够用:YOLOv8s 在 Orin NX 上 OpenCV CUDA FP32 推理约 18ms,含前后处理仍可达 30+ FPS