Jetson 部署 YOLOv8 实时目标检测:训练到边缘推理
1. 项目架构
YOLOv8 Jetson 部署流水线:
├── 数据准备 → Roboflow 标注 + 增强
├── 模型训练 → Ultralytics YOLOv8
├── 模型导出 → ONNX → TensorRT FP16
├── 边缘推理 → Jetson + CSI 摄像头
└── 性能优化 → 多线程 + 批处理
2. 数据准备
2.1 Roboflow 标注
bash
# Install the Roboflow package (provides the `roboflow` module imported below)
pip3 install roboflow
# Download the dataset (example: safety-helmet detection)
python3 << 'EOF'
import os

from roboflow import Roboflow

rf = Roboflow(api_key="YOUR_API_KEY")
project = rf.workspace("your-workspace").project("safety-helmet")
dataset = project.version(1).download("yolov8")
print(f"数据集路径: {dataset.location}")
# Count the training images; this needs the explicit `import os` above
# (the snippet previously raised NameError here).
train_images = os.listdir(os.path.join(dataset.location, "train", "images"))
print(f"训练集: {len(train_images)} 张")
EOF
2.2 数据增强配置
yaml
# Data augmentation config (built into Roboflow; can also be configured manually)
# augmentations.yaml
hsv_h: 0.015 # hue
hsv_s: 0.7 # saturation
hsv_v: 0.4 # value (brightness)
degrees: 5 # rotation
translate: 0.1 # translation
scale: 0.5 # scaling
shear: 2 # shear
perspective: 0.0
flipud: 0.0 # up-down flip
fliplr: 0.5 # left-right flip
mosaic: 1.0 # mosaic augmentation
mixup: 0.0 # MixUp
copy_paste: 0.0
2.3 数据集结构
dataset/
├── train/
│ ├── images/
│ │ ├── 001.jpg
│ │ └── 002.jpg
│ └── labels/
│ ├── 001.txt # class x_center y_center width height
│ └── 002.txt
├── valid/
│ ├── images/
│ └── labels/
├── test/
│ ├── images/
│ └── labels/
└── data.yaml
yaml
# data.yaml
# NOTE(review): a relative `path` is presumably resolved by Ultralytics against
# its configured datasets directory rather than the current directory — confirm,
# or use an absolute path.
path: ./dataset
train: train/images
val: valid/images
test: test/images
# Class id -> class name; the entries must be nested under `names`.
names:
  0: helmet
  1: person
  2: no-helmet
nc: 3
3. 模型训练
3.1 选择模型尺寸
YOLOv8 模型尺寸对比(COCO 预训练):
┌─────────────┬──────────┬───────────┬──────────┬──────────┐
│ 模型 │ 参数量 │ mAP50-95 │ 推理速度 │ Jetson │
├─────────────┼──────────┼───────────┼──────────┼──────────┤
│ YOLOv8n │ 3.2M │ 37.3 │ 最快 │ ✅ 推荐 │
│ YOLOv8s │ 11.2M │ 44.9 │ 快 │ ✅ 推荐 │
│ YOLOv8m │ 25.9M │ 50.2 │ 中等 │ ⚠️ 可用 │
│ YOLOv8l │ 43.7M │ 52.9 │ 慢 │ ❌ 不推荐│
│ YOLOv8x │ 68.2M │ 53.9 │ 很慢 │ ❌ 不推荐│
└─────────────┴──────────┴───────────┴──────────┴──────────┘
3.2 开始训练
python
#!/usr/bin/env python3
"""train_yolov8.py"""
from ultralytics import YOLO
# Load the pretrained checkpoint used as the starting point
model = YOLO("yolov8s.pt")

# Train
results = model.train(
    data="data.yaml",
    epochs=100,
    imgsz=640,
    batch=16,
    device="0",  # GPU index
    workers=8,
    patience=20,  # early-stopping patience (epochs)
    save=True,
    save_period=10,  # save a checkpoint every 10 epochs
    project="runs/detect",
    name="helmet_yolov8s",
    exist_ok=True,
    pretrained=True,
    optimizer="auto",
    lr0=0.01,
    lrf=0.01,
    momentum=0.937,
    weight_decay=0.0005,
    warmup_epochs=3,
    warmup_momentum=0.8,
    warmup_bias_lr=0.1,
    box=7.5,
    cls=0.5,
    dfl=1.5,
    plots=True,
    val=True,
)

# Print the final validation results.
# The object returned by train() has no `best_map50` / `best_map50_95` / `best`
# attributes; detection metrics live under `results.box`, and the path of the
# best checkpoint is kept on the trainer.
# NOTE(review): the returned metrics are presumably those of the final
# validation pass on best.pt — confirm against the Ultralytics version in use.
print(f"最佳 mAP50: {results.box.map50:.4f}")
print(f"最佳 mAP50-95: {results.box.map:.4f}")
print(f"最佳模型: {model.trainer.best}")
3.3 模型评估
python
#!/usr/bin/env python3
"""evaluate.py"""
from ultralytics import YOLO
# Load the best checkpoint from the training run
model = YOLO("runs/detect/helmet_yolov8s/weights/best.pt")

# Evaluate on the test split
metrics = model.val(
    data="data.yaml",
    split="test",
    imgsz=640,
    batch=1,
    conf=0.25,
    iou=0.6,
    device="0",
)
print(f"mAP50: {metrics.box.map50:.4f}")
print(f"mAP50-95: {metrics.box.map:.4f}")
print(f"Precision: {metrics.box.mp:.4f}")
print(f"Recall: {metrics.box.mr:.4f}")

# Per-class AP.
# `box.ap50` has one entry per class that actually appears in the evaluated
# split, in the order given by `box.ap_class_index`. Indexing it with the raw
# class id would mislabel (or overrun) when a class is absent, so map each
# position back to its class id first.
for idx, cls_id in enumerate(metrics.box.ap_class_index):
    name = metrics.names[int(cls_id)]
    print(f" {name}: AP50={metrics.box.ap50[idx]:.4f}")
4. 模型导出
4.1 导出到 ONNX
python
#!/usr/bin/env python3
"""export.py"""
from ultralytics import YOLO
# Load the best checkpoint from the training run
model = YOLO("runs/detect/helmet_yolov8s/weights/best.pt")
# Export to ONNX
model.export(
    format="onnx",
    imgsz=640,
    opset=17,
    simplify=True,  # simplify the model graph
    dynamic=False,  # a static batch size is recommended on Jetson
    batch=1,
)
# Original note: output is helmet_yolov8s.onnx (about 22 MB).
# NOTE(review): Ultralytics presumably writes the export next to the weights
# file using the same stem (runs/detect/helmet_yolov8s/weights/best.onnx);
# copy or rename it to helmet_yolov8s.onnx before the trtexec step — confirm.
# NOTE(review): an FP32 ONNX file for an ~11.2M-parameter model would be about
# 45 MB rather than 22 MB — verify the stated size.
4.2 转换为 TensorRT
bash
# Option 1: export directly with Ultralytics (recommended)
# NOTE(review): Ultralytics presumably prepends a small JSON metadata header to
# the .engine file it writes; a loader that feeds the raw file to
# deserialize_cuda_engine must skip that header — confirm for your version.
python3 -c "
from ultralytics import YOLO
model = YOLO('runs/detect/helmet_yolov8s/weights/best.pt')
model.export(format='engine', imgsz=640, half=True, batch=1, workspace=4)
"
# Option 2: convert manually with trtexec
# --memPoolSize=workspace:<MiB> replaces the deprecated --workspace flag, which
# newer TensorRT releases no longer accept.
# NOTE(review): make sure helmet_yolov8s.onnx exists in the current directory
# (the ONNX export may have been written under a different name/location).
/usr/src/tensorrt/bin/trtexec \
  --onnx=helmet_yolov8s.onnx \
  --saveEngine=helmet_yolov8s.engine \
  --fp16 \
  --memPoolSize=workspace:4096 \
  --inputIOFormats=fp16:chw \
  --outputIOFormats=fp16:chw
# Verify / benchmark the engine
# The batch size is fixed inside an engine built from ONNX (explicit batch), so
# the implicit-batch-only --batch flag is not passed.
/usr/src/tensorrt/bin/trtexec \
  --loadEngine=helmet_yolov8s.engine \
  --iterations=200 \
  --warmUp=1000
5. Jetson 实时推理代码
5.1 完整推理类
python
#!/usr/bin/env python3
"""jetson_yolo.py - Jetson YOLOv8 实时推理"""
import cv2
import numpy as np
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit
import time
from typing import List, Tuple
class YOLOv8TRT:
    """YOLOv8 TensorRT inference engine.

    Loads a serialized TensorRT engine, allocates one page-locked host buffer
    and one device buffer per I/O tensor, and exposes ``detect`` for
    single-frame inference (letterbox preprocessing, GPU execution,
    confidence filtering, class-agnostic NMS, mapping back to frame pixels).
    """

    def __init__(self, engine_path: str, conf_thresh: float = 0.25, iou_thresh: float = 0.45):
        """Deserialize the engine and allocate all I/O buffers.

        Args:
            engine_path: Path to the serialized TensorRT engine file.
            conf_thresh: Minimum class score for a prediction to be kept.
            iou_thresh: IoU above which a lower-scored box is suppressed.

        Raises:
            RuntimeError: If the engine cannot be deserialized or exposes no
                input or no output tensor.
        """
        self.conf_thresh = conf_thresh
        self.iou_thresh = iou_thresh

        # Load the engine
        logger = trt.Logger(trt.Logger.WARNING)
        runtime = trt.Runtime(logger)
        self.engine = runtime.deserialize_cuda_engine(self._read_engine_bytes(engine_path))
        if self.engine is None:
            # deserialize_cuda_engine signals failure by returning None.
            raise RuntimeError(f"Failed to deserialize TensorRT engine: {engine_path}")
        self.context = self.engine.create_execution_context()

        # Allocate memory
        self.inputs = []
        self.outputs = []
        self.bindings = []
        self.stream = cuda.Stream()
        for i in range(self.engine.num_io_tensors):
            name = self.engine.get_tensor_name(i)
            dtype = trt.nptype(self.engine.get_tensor_dtype(name))
            # Dynamic (-1) or zero-sized dimensions are allocated as size 1.
            shape = tuple(max(1, int(s)) for s in self.engine.get_tensor_shape(name))
            size = trt.volume(shape)
            host_mem = cuda.pagelocked_empty(size, dtype)
            device_mem = cuda.mem_alloc(host_mem.nbytes)
            self.bindings.append(int(device_mem))
            # Register the device pointer by tensor name so that inference can
            # use execute_async_v3, the counterpart of the name-based API
            # (num_io_tensors / get_tensor_name) already used here.
            self.context.set_tensor_address(name, int(device_mem))
            info = {"name": name, "host": host_mem, "device": device_mem, "shape": shape, "dtype": dtype}
            if self.engine.get_tensor_mode(name) == trt.TensorIOMode.INPUT:
                self.inputs.append(info)
                self.input_shape = shape
            else:
                self.outputs.append(info)
        if not self.inputs or not self.outputs:
            raise RuntimeError(f"Engine has no input or no output tensor: {engine_path}")

        # Class names and BGR drawing colors, keyed by class id
        self.class_names = {0: "helmet", 1: "person", 2: "no-helmet"}
        self.colors = {0: (0, 255, 0), 1: (255, 165, 0), 2: (0, 0, 255)}

    @staticmethod
    def _read_engine_bytes(engine_path: str) -> bytes:
        """Return the serialized engine, skipping an optional metadata header.

        Some exporters prefix the engine with a 4-byte little-endian length
        followed by that many bytes of UTF-8 JSON metadata. When such a header
        is present and parses as JSON it is stripped; otherwise the file
        content is returned unchanged.
        """
        import json

        with open(engine_path, "rb") as f:
            data = f.read()
        if len(data) > 4:
            meta_len = int.from_bytes(data[:4], byteorder="little")
            header_end = 4 + meta_len
            if 0 < meta_len and header_end < len(data):
                try:
                    json.loads(data[4:header_end].decode("utf-8"))
                except ValueError:
                    # Not JSON / not UTF-8: this is a plain engine file.
                    return data
                return data[header_end:]
        return data

    def preprocess(self, image: np.ndarray) -> Tuple[np.ndarray, float, Tuple]:
        """Letterbox a BGR frame into the network input layout.

        Returns:
            ``(blob, scale, (dy, dx))`` where ``blob`` is a float32 array of
            shape ``(1, 3, input_h, input_w)`` scaled to [0, 1], ``scale`` is
            the resize factor applied to the frame, and ``(dy, dx)`` is the
            top/left padding in input pixels.
        """
        h, w = image.shape[:2]
        # Input shape is read as (N, C, H, W).
        input_h, input_w = self.input_shape[2], self.input_shape[3]

        # Resize while keeping the aspect ratio (never collapse a side to 0)
        scale = min(input_h / h, input_w / w)
        new_h, new_w = max(1, int(h * scale)), max(1, int(w * scale))
        resized = cv2.resize(image, (new_w, new_h))

        # Paste onto a gray (114) canvas, centered
        canvas = np.full((input_h, input_w, 3), 114, dtype=np.uint8)
        dy, dx = (input_h - new_h) // 2, (input_w - new_w) // 2
        canvas[dy:dy + new_h, dx:dx + new_w] = resized

        # BGR -> RGB, HWC -> CHW, normalize, add batch dimension
        blob = canvas[:, :, ::-1].transpose(2, 0, 1).astype(np.float32) / 255.0
        blob = np.expand_dims(blob, axis=0)
        return blob, scale, (dy, dx)

    def postprocess(self, output: np.ndarray, scale: float, pad: Tuple, orig_shape: Tuple) -> List[dict]:
        """Turn the raw network output into a list of detections.

        Args:
            output: Array of shape ``[batch, 4 + nc, num_preds]``; rows 0-3
                are ``cx, cy, w, h`` in input pixels, the rest class scores.
            scale: Resize factor returned by ``preprocess``.
            pad: ``(dy, dx)`` padding returned by ``preprocess``.
            orig_shape: ``(height, width)`` of the original frame; returned
                boxes are clipped to it.

        Returns:
            Detections in descending score order, each a dict with ``bbox``
            (``[x1, y1, x2, y2]`` ints in frame pixels), ``score``,
            ``class_id`` and ``class_name``. NMS is class-agnostic.
        """
        # Work in float32: the engine may emit float16, which cannot represent
        # pixel coordinates precisely.
        predictions = np.asarray(output[0], dtype=np.float32).T  # [num_preds, 4+nc]

        boxes = predictions[:, :4]  # cx, cy, w, h
        scores = predictions[:, 4:]  # class scores

        # Best class per prediction
        max_scores = scores.max(axis=1)
        class_ids = scores.argmax(axis=1)

        # Confidence filter
        mask = max_scores > self.conf_thresh
        if not mask.any():
            return []
        boxes = boxes[mask]
        max_scores = max_scores[mask]
        class_ids = class_ids[mask]

        # xywh -> xyxy, then undo the letterbox (padding first, then scale)
        dy, dx = pad
        half_w = boxes[:, 2] / 2
        half_h = boxes[:, 3] / 2
        x1 = (boxes[:, 0] - half_w - dx) / scale
        y1 = (boxes[:, 1] - half_h - dy) / scale
        x2 = (boxes[:, 0] + half_w - dx) / scale
        y2 = (boxes[:, 1] + half_h - dy) / scale

        # NMS on the unclipped boxes
        keep = self._nms(np.stack([x1, y1, x2, y2], axis=1), max_scores, self.iou_thresh)

        # Clip the surviving boxes to the frame so callers never receive
        # coordinates outside the image.
        orig_h, orig_w = orig_shape[:2]
        x1 = np.clip(x1, 0, orig_w)
        x2 = np.clip(x2, 0, orig_w)
        y1 = np.clip(y1, 0, orig_h)
        y2 = np.clip(y2, 0, orig_h)

        results = []
        for i in keep:
            cls_id = int(class_ids[i])
            results.append({
                "bbox": [int(x1[i]), int(y1[i]), int(x2[i]), int(y2[i])],
                "score": float(max_scores[i]),
                "class_id": cls_id,
                "class_name": self.class_names.get(cls_id, "unknown"),
            })
        return results

    def _nms(self, boxes: np.ndarray, scores: np.ndarray, iou_thresh: float) -> list:
        """Greedy non-maximum suppression.

        Args:
            boxes: ``[n, 4]`` array of ``x1, y1, x2, y2``.
            scores: ``[n]`` array of scores.
            iou_thresh: Boxes whose IoU with a kept box exceeds this value
                are dropped.

        Returns:
            Indices of the kept boxes, highest score first.
        """
        x1 = boxes[:, 0]
        y1 = boxes[:, 1]
        x2 = boxes[:, 2]
        y2 = boxes[:, 3]
        areas = (x2 - x1) * (y2 - y1)
        order = scores.argsort()[::-1]
        keep = []
        while order.size > 0:
            i = order[0]
            keep.append(int(i))
            rest = order[1:]
            xx1 = np.maximum(x1[i], x1[rest])
            yy1 = np.maximum(y1[i], y1[rest])
            xx2 = np.minimum(x2[i], x2[rest])
            yy2 = np.minimum(y2[i], y2[rest])
            w = np.maximum(0.0, xx2 - xx1)
            h = np.maximum(0.0, yy2 - yy1)
            inter = w * h
            union = areas[i] + areas[rest] - inter
            # A pair of zero-area boxes has union == 0. Treat it as full
            # overlap (suppressed) instead of computing 0/0, which produced
            # NaN plus a runtime warning and suppressed the box anyway.
            iou = np.divide(inter, union, out=np.ones_like(inter), where=union > 0)
            inds = np.where(iou <= iou_thresh)[0]
            order = order[inds + 1]
        return keep

    def detect(self, image: np.ndarray) -> List[dict]:
        """Run detection on a single BGR frame and return its detections.

        Raises:
            RuntimeError: If TensorRT reports that enqueueing inference failed.
        """
        # Preprocess
        blob, scale, pad = self.preprocess(image)
        orig_shape = image.shape[:2]
        inp = self.inputs[0]
        out = self.outputs[0]

        # Host -> device (copyto also casts to the engine's input dtype)
        np.copyto(inp["host"], blob.ravel())
        cuda.memcpy_htod_async(inp["device"], inp["host"], self.stream)

        # Inference; tensor addresses were registered in __init__
        self.context.set_input_shape(inp["name"], blob.shape)
        if not self.context.execute_async_v3(stream_handle=self.stream.handle):
            raise RuntimeError("TensorRT inference failed to enqueue")

        # Device -> host, then wait for the stream to finish
        cuda.memcpy_dtoh_async(out["host"], out["device"], self.stream)
        self.stream.synchronize()
        output = out["host"].reshape(out["shape"])

        # Postprocess
        return self.postprocess(output, scale, pad, orig_shape)

    def draw_detections(self, image: np.ndarray, detections: List[dict]) -> np.ndarray:
        """Return a copy of ``image`` with boxes and labels drawn on it."""
        canvas = image.copy()
        for det in detections:
            x1, y1, x2, y2 = det["bbox"]
            cls_id = det["class_id"]
            score = det["score"]
            color = self.colors.get(cls_id, (255, 255, 255))
            label = f"{det['class_name']} {score:.2f}"

            # Box
            cv2.rectangle(canvas, (x1, y1), (x2, y2), color, 2)

            # Label: above the box, or just inside it when there is no room
            # above (box touching the top of the frame).
            (tw, th), _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.6, 1)
            baseline_y = y1 if y1 - th - 10 >= 0 else y1 + th + 10
            cv2.rectangle(canvas, (x1, baseline_y - th - 10), (x1 + tw, baseline_y), color, -1)
            cv2.putText(canvas, label, (x1, baseline_y - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 255, 255), 1)
        return canvas
5.2 CSI 摄像头实时推理
python
#!/usr/bin/env python3
"""realtime_detect.py - CSI 摄像头实时检测"""
import cv2
import time
from jetson_yolo import YOLOv8TRT
def gstreamer_pipeline(sensor_id=0, width=1280, height=720, fps=30, flip_method=0):
    """Build the GStreamer pipeline string for a CSI camera.

    Args:
        sensor_id: Value for ``nvarguscamerasrc sensor-id``.
        width: Capture and output frame width in pixels.
        height: Capture and output frame height in pixels.
        fps: Capture frame rate (frames per second).
        flip_method: Value for ``nvvidconv flip-method``; the default 0
            reproduces the previously hard-coded setting.

    Returns:
        A pipeline description ending in ``appsink`` that delivers BGR frames,
        suitable for ``cv2.VideoCapture(..., cv2.CAP_GSTREAMER)``.
    """
    return (
        f"nvarguscamerasrc sensor-id={sensor_id} ! "
        f"video/x-raw(memory:NVMM), width={width}, height={height}, "
        f"format=NV12, framerate={fps}/1 ! "
        f"nvvidconv flip-method={flip_method} ! "
        f"video/x-raw, width={width}, height={height}, format=BGRx ! "
        f"videoconvert ! "
        f"video/x-raw, format=BGR ! appsink"
    )
def main():
    """Run real-time detection on CSI camera 0 until 'q' is pressed.

    Shows each annotated frame in a window together with the measured FPS and
    the number of detected objects. The camera and the window are released
    even if the loop exits through an exception.
    """
    # Initialize the model
    model = YOLOv8TRT("helmet_yolov8s.engine", conf_thresh=0.3)

    # Open the CSI camera
    cap = cv2.VideoCapture(gstreamer_pipeline(0, 1280, 720, 30), cv2.CAP_GSTREAMER)
    if not cap.isOpened():
        print("❌ 无法打开摄像头")
        cap.release()
        return

    fps_counter = 0
    fps_start = time.time()
    fps_display = 0
    print("🎯 按 'q' 退出")
    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break

            # Detect and draw
            detections = model.detect(frame)
            canvas = model.draw_detections(frame, detections)

            # FPS = frames counted over the last (at least) one-second window
            fps_counter += 1
            now = time.time()
            if now - fps_start >= 1.0:
                fps_display = fps_counter
                fps_counter = 0
                fps_start = now

            # Overlay FPS and object count
            cv2.putText(canvas, f"FPS: {fps_display}", (10, 30),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
            cv2.putText(canvas, f"Objects: {len(detections)}", (10, 70),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
            cv2.imshow("YOLOv8 Detection", canvas)
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
    finally:
        # Always free the camera and close the window, including on errors
        # or Ctrl+C.
        cap.release()
        cv2.destroyAllWindows()


if __name__ == "__main__":
    main()
5.3 USB 摄像头实时推理
python
#!/usr/bin/env python3
"""usb_camera_detect.py - USB 摄像头推理"""
import cv2
import time
from jetson_yolo import YOLOv8TRT
def main():
    """Run real-time detection on USB camera 0 until 'q' is pressed.

    Requests 1280x720 at 30 FPS from the camera, shows each annotated frame
    with the measured FPS, and releases the camera and window on exit, even
    when the loop ends through an exception.
    """
    model = YOLOv8TRT("helmet_yolov8s.engine", conf_thresh=0.3)

    # USB camera
    cap = cv2.VideoCapture(0)
    cap.set(cv2.CAP_PROP_FRAME_WIDTH, 1280)
    cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 720)
    cap.set(cv2.CAP_PROP_FPS, 30)
    # Alternative: GStreamer-accelerated capture (recommended)
    # cap = cv2.VideoCapture(
    #     "v4l2src device=/dev/video0 ! video/x-raw,width=1280,height=720,framerate=30/1 ! "
    #     "videoconvert ! video/x-raw,format=BGR ! appsink",
    #     cv2.CAP_GSTREAMER
    # )
    if not cap.isOpened():
        # Fail loudly instead of exiting silently on the first failed read.
        print("❌ 无法打开摄像头")
        cap.release()
        return

    fps_counter = 0
    fps_start = time.time()
    fps_display = 0
    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            detections = model.detect(frame)
            canvas = model.draw_detections(frame, detections)

            # FPS = frames counted over the last (at least) one-second window
            fps_counter += 1
            now = time.time()
            if now - fps_start >= 1.0:
                fps_display = fps_counter
                fps_counter = 0
                fps_start = now

            cv2.putText(canvas, f"FPS: {fps_display}", (10, 30),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
            cv2.imshow("Detection", canvas)
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
    finally:
        # Always free the camera and close the window, including on errors
        # or Ctrl+C.
        cap.release()
        cv2.destroyAllWindows()


if __name__ == "__main__":
    main()
6. 多摄像头并行推理
python
#!/usr/bin/env python3
"""multi_camera.py - 多摄像头并行推理"""
import cv2
import threading
import queue
import time
import numpy as np
from jetson_yolo import YOLOv8TRT
class CameraThread(threading.Thread):
    """Camera capture thread.

    Opens one camera through a GStreamer pipeline and pushes
    ``(camera_id, frame)`` tuples into ``frame_queue`` until ``running`` is
    set to False. Frames captured while the queue is full are dropped.
    """

    def __init__(self, camera_id, pipeline_func, frame_queue):
        """Store the capture configuration; the camera is opened in ``run``.

        Args:
            camera_id: Identifier passed to ``pipeline_func`` and attached to
                every queued frame.
            pipeline_func: Callable mapping ``camera_id`` to a GStreamer
                pipeline string.
            frame_queue: Queue receiving ``(camera_id, frame)`` tuples.
        """
        super().__init__(daemon=True)
        self.camera_id = camera_id
        self.pipeline_func = pipeline_func
        self.frame_queue = frame_queue
        self.running = True

    def run(self):
        """Capture frames until ``running`` is cleared, then release the camera."""
        cap = cv2.VideoCapture(self.pipeline_func(self.camera_id), cv2.CAP_GSTREAMER)
        try:
            if not cap.isOpened():
                # Report the failure instead of polling a dead capture forever.
                print(f"❌ 无法打开摄像头 {self.camera_id}")
                return
            while self.running:
                ret, frame = cap.read()
                if not ret:
                    # Back off briefly on a failed read, then retry.
                    time.sleep(0.01)
                    continue
                try:
                    self.frame_queue.put_nowait((self.camera_id, frame))
                except queue.Full:
                    # The consumer is behind: drop this frame rather than block.
                    pass
        finally:
            cap.release()
class MultiCameraDetector:
    """Multi-camera detector.

    Runs one capture thread per camera and a single inference loop that
    takes frames from every camera's queue in turn and shows the annotated
    result in one window per camera.
    """

    def __init__(self, engine_path: str, camera_ids: list):
        """Load the model and create the per-camera queues.

        Args:
            engine_path: Path to the TensorRT engine file.
            camera_ids: Camera (sensor) ids to monitor.
        """
        self.model = YOLOv8TRT(engine_path)
        self.camera_ids = camera_ids
        self.frame_queues = {cid: queue.Queue(maxsize=2) for cid in camera_ids}
        self.result_queues = {cid: queue.Queue(maxsize=2) for cid in camera_ids}
        # Capture threads; populated by start().
        self.threads = []

    def csi_pipeline(self, sensor_id):
        """Return the GStreamer pipeline string (640x480 @ 30 FPS, BGR) for a CSI sensor."""
        return (
            f"nvarguscamerasrc sensor-id={sensor_id} ! "
            f"video/x-raw(memory:NVMM), width=640, height=480, format=NV12, framerate=30/1 ! "
            f"nvvidconv ! video/x-raw, format=BGRx ! videoconvert ! "
            f"video/x-raw, format=BGR ! appsink"
        )

    def start(self):
        """Start the capture threads and run the detection loop until 'q' is pressed.

        The capture threads are stopped and the windows closed on exit, even
        when the loop ends through an exception.
        """
        # Start one capture thread per camera
        self.threads = []
        for cid in self.camera_ids:
            t = CameraThread(cid, self.csi_pipeline, self.frame_queues[cid])
            t.start()
            self.threads.append(t)

        # Detection loop
        print(f"🎯 监控 {len(self.camera_ids)} 路摄像头")
        try:
            while True:
                for cid in self.camera_ids:
                    try:
                        _, frame = self.frame_queues[cid].get(timeout=0.1)
                    except queue.Empty:
                        # No frame from this camera yet; move on to the next.
                        continue
                    detections = self.model.detect(frame)
                    canvas = self.model.draw_detections(frame, detections)
                    # Display
                    cv2.imshow(f"Camera {cid}", canvas)
                if cv2.waitKey(1) & 0xFF == ord('q'):
                    break
        finally:
            # Cleanup: signal every thread first, then wait briefly so each
            # one can leave its loop and release its camera.
            for t in self.threads:
                t.running = False
            for t in self.threads:
                t.join(timeout=1.0)
            cv2.destroyAllWindows()


if __name__ == "__main__":
    detector = MultiCameraDetector("helmet_yolov8s.engine", [0, 1])
    detector.start()
7. 视频文件批量处理
python
#!/usr/bin/env python3
"""video_batch.py - 视频文件批量处理"""
import cv2
import os
import time
from jetson_yolo import YOLOv8TRT
def process_video(model, video_path, output_path=None):
    """Run detection on every frame of one video file.

    Args:
        model: Detector exposing ``detect(frame)`` and
            ``draw_detections(frame, detections)``.
        video_path: Path of the input video.
        output_path: If given, an annotated copy of the video is written
            here using the ``mp4v`` codec.

    Prints progress every 100 frames and a summary at the end. Returns None.
    """
    cap = cv2.VideoCapture(video_path)
    writer = None
    try:
        if not cap.isOpened():
            print(f" ❌ 无法打开视频: {video_path}")
            return

        # Keep the fractional frame rate (e.g. 29.97): truncating it to an
        # int would change the duration of the output video.
        fps = cap.get(cv2.CAP_PROP_FPS)
        if not fps > 0:
            # Container reported no usable rate (0 or NaN).
            # NOTE(review): 30 is an arbitrary fallback — adjust if needed.
            fps = 30.0
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

        if output_path:
            fourcc = cv2.VideoWriter_fourcc(*'mp4v')
            writer = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

        frame_count = 0
        total_detections = 0
        start_time = time.time()
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            detections = model.detect(frame)
            total_detections += len(detections)
            if writer:
                canvas = model.draw_detections(frame, detections)
                writer.write(canvas)
            frame_count += 1
            if frame_count % 100 == 0:
                # max(): guard against a zero interval on coarse clocks
                elapsed = max(time.time() - start_time, 1e-9)
                current_fps = frame_count / elapsed
                print(f" 帧 {frame_count}/{total_frames} | FPS: {current_fps:.1f} | 检测数: {total_detections}")

        elapsed = max(time.time() - start_time, 1e-9)
        print(f" 完成: {frame_count} 帧, {elapsed:.1f}s, {frame_count/elapsed:.1f} FPS, {total_detections} 检测")
    finally:
        # Release the reader and writer even if detection raised, so the
        # output file is finalized and handles are not leaked.
        cap.release()
        if writer:
            writer.release()
if __name__ == "__main__":
    # Process every video in videos/ and write annotated copies to results/.
    model = YOLOv8TRT("helmet_yolov8s.engine")
    video_dir = "videos/"
    output_dir = "results/"
    os.makedirs(output_dir, exist_ok=True)
    # sorted(): os.listdir returns entries in arbitrary order.
    for video_file in sorted(os.listdir(video_dir)):
        # lower(): also accept upper-case extensions such as ".MP4".
        if not video_file.lower().endswith(('.mp4', '.avi', '.mov')):
            continue
        video_path = os.path.join(video_dir, video_file)
        output_path = os.path.join(output_dir, f"det_{video_file}")
        print(f"处理: {video_file}")
        process_video(model, video_path, output_path)
8. 性能优化技巧
8.1 Jetson 性能调优
bash
# 1. Switch to the maximum-performance power mode
sudo nvpmodel -m 0
sudo jetson_clocks
# 2. Add swap space
sudo fallocate -l 8G /swapfile
sudo chmod 600 /swapfile
sudo mkswap /swapfile
sudo swapon /swapfile
# Append the fstab entry only if it is not there yet, so that re-running this
# script does not add duplicate lines.
grep -q '^/swapfile ' /etc/fstab || echo '/swapfile none swap sw 0 0' | sudo tee -a /etc/fstab
# 3. Disable the desktop environment (for inference-only use)
sudo systemctl set-default multi-user.target
# To restore: sudo systemctl set-default graphical.target
# 4. Adjust the GPU frequency: write max_freq into min_freq
# NOTE(review): this sysfs path (17000000.ga10b) looks board-specific — verify
# it exists on the target device.
cat /sys/devices/17000000.ga10b/devfreq/17000000.ga10b/max_freq | sudo tee \
  /sys/devices/17000000.ga10b/devfreq/17000000.ga10b/min_freq
8.2 代码优化
python
# 1. 使用 CUDA Stream 重叠计算和数据传输
# 2. 预分配内存,避免运行时分配
# 3. 使用半精度推理(FP16)
# 4. 减少 CPU-GPU 数据拷贝
# 5. 多线程采集 + 单线程推理
# 优化前后对比(YOLOv8s on Orin NX 16GB):
# ┌─────────────────┬──────────┬──────────┐
# │ 优化项 │ 优化前 │ 优化后 │
# ├─────────────────┼──────────┼──────────┤
# │ 预处理 │ 8 ms │ 3 ms │
# │ 推理 │ 25 ms │ 12 ms │
# │ 后处理 │ 5 ms │ 2 ms │
# │ 总延迟 │ 38 ms │ 17 ms │
# │ FPS │ 26 │ 59 │
# └─────────────────┴──────────┴──────────┘
总结
YOLOv8 在 Jetson 上的部署流程:
- 数据准备 → Roboflow 标注 + 增强
- 模型训练 → 选择 YOLOv8s/n 平衡精度和速度
- 导出 TensorRT → model.export(format='engine', half=True)
- 实时推理 → CSI/USB 摄像头 + GStreamer 加速
- 性能优化 → nvpmodel -m 0 + jetson_clocks
预期性能:YOLOv8s on Orin NX 16GB = 50-60 FPS (640×640)