Jetson + Triton Inference Server:模型服务化部署
1. Triton 概述
Triton Inference Server 是 NVIDIA 的开源推理服务框架:
Triton 核心能力:
├── 多框架支持:TensorRT / ONNX / PyTorch / TensorFlow
├── 多模型并发:同时服务多个模型
├── 动态批处理:自动合并请求提升吞吐
├── 模型版本管理:A/B 测试、灰度发布
├── 健康检查:内置监控和指标
└── 客户端:gRPC / HTTP REST 协议,提供 Python、C++ 等客户端库
2. 安装 Triton
bash
# 方式一:Docker(推荐;Jetson 设备请改用带 -igpu 后缀的镜像标签,例如 24.01-py3-igpu)
docker pull nvcr.io/nvidia/tritonserver:24.01-py3
# 方式二:apt 安装(注意:NVIDIA 官方并未提供 tritonserver 的 apt 软件源;Jetson 上通常从 GitHub Releases 下载预编译包,此命令仅在已配置自建软件源时可用)
sudo apt install -y tritonserver
# 验证
tritonserver --version
3. 模型仓库结构
model_repository/
├── yolov8/
│ ├── config.pbtxt
│ └── 1/
│ └── model.onnx
├── resnet50/
│ ├── config.pbtxt
│ └── 1/
│ └── model.plan # TensorRT engine
├── deepsort/
│ ├── config.pbtxt
│ └── 1/
│ └── model.onnx
└── ensemble_yolo/
├── config.pbtxt
└── 1/
└── model.py
4. 模型配置
protobuf
# model_repository/yolov8/config.pbtxt
name: "yolov8"
platform: "onnxruntime_onnx"
max_batch_size: 4
input [
{
name: "images"
data_type: TYPE_FP32
dims: [ 3, 640, 640 ]
}
]
output [
{
name: "output0"
data_type: TYPE_FP32
dims: [ 84, 8400 ]
}
]
instance_group [
{
count: 1
kind: KIND_GPU
gpus: [ 0 ]
}
]
dynamic_batching {
preferred_batch_size: [ 1, 2, 4 ]
max_queue_delay_microseconds: 100
}
protobuf
# TensorRT 引擎配置(对应的模型文件需为 1/model.plan,而不是 model.onnx)
name: "yolov8"
platform: "tensorrt_plan"
max_batch_size: 4
input [
{
name: "images"
data_type: TYPE_FP32
dims: [ 3, 640, 640 ]
}
]
output [
{
name: "output0"
data_type: TYPE_FP32
dims: [ 84, 8400 ]
}
]
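# 注意:execution_accelerators 仅对 ONNX Runtime / TensorFlow 后端生效;
# tensorrt_plan 模型本身已是 TensorRT 引擎,此段可以省略
optimization {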
execution_accelerators {
gpu_execution_accelerator : [ { name : "tensorrt" } ]
}
}
5. 启动 Triton Server
bash
# Docker 启动
docker run --rm --runtime=nvidia \
-p 8000:8000 -p 8001:8001 -p 8002:8002 \
-v /path/to/model_repository:/models \
nvcr.io/nvidia/tritonserver:24.01-py3 \
tritonserver --model-repository=/models
# 直接启动
tritonserver \
--model-repository=/path/to/model_repository \
--http-port=8000 \
--grpc-port=8001 \
--metrics-port=8002
# 健康检查
curl -s -o /dev/null -w "%{http_code}\n" http://localhost:8000/v2/health/ready
# 输出 200 表示就绪(该接口响应体为空,因此需要显式打印 HTTP 状态码)
# 查看已加载模型
curl -s -X POST http://localhost:8000/v2/repository/index
6. Python 客户端
python
#!/usr/bin/env python3
"""triton_client.py - Triton 推理客户端"""
import tritonclient.http as httpclient
import tritonclient.grpc as grpcclient
import numpy as np
import cv2
class TritonClient:
    """Thin wrapper around a Triton HTTP or gRPC inference client.

    The wrapper remembers which ``tritonclient`` protocol module created the
    underlying client so that request objects (``InferInput`` /
    ``InferRequestedOutput``) are always built from the matching module.
    """

    def __init__(self, url="localhost:8000", protocol="http"):
        """Create the underlying client.

        Args:
            url: ``host:port`` of the server endpoint for the chosen protocol.
            protocol: ``"http"`` selects the HTTP client; any other value
                selects the gRPC client (unchanged from the original
                behavior).
        """
        # HTTP and gRPC request objects are not interchangeable, so keep the
        # module that matches the client for later use in ``infer``.
        self._api = httpclient if protocol == "http" else grpcclient
        self.client = self._api.InferenceServerClient(url=url)

    def is_ready(self):
        """Return whether the server reports itself ready."""
        return self.client.is_server_ready()

    def infer(self, model_name, input_data, input_name="images", output_name="output0"):
        """Run one inference request and return the requested output.

        Args:
            model_name: Name of the model to query.
            input_data: Array-like input tensor; it is converted to a
                contiguous float32 array because the request declares the
                input as "FP32".
            input_name: Name of the model input tensor.
            output_name: Name of the model output tensor to fetch.

        Returns:
            The output tensor as returned by ``result.as_numpy(output_name)``.
        """
        # The declared datatype is FP32, so make the payload agree with it.
        input_data = np.ascontiguousarray(input_data, dtype=np.float32)

        # Build request objects from the module matching the client protocol.
        infer_input = self._api.InferInput(input_name, list(input_data.shape), "FP32")
        infer_input.set_data_from_numpy(input_data)
        requested_output = self._api.InferRequestedOutput(output_name)

        result = self.client.infer(
            model_name=model_name,
            inputs=[infer_input],
            outputs=[requested_output],
        )
        return result.as_numpy(output_name)

    def preprocess(self, image_path, size=640):
        """Load an image file and convert it to a 1 x C x H x W float32 array.

        The image is resized to ``size`` x ``size``, converted from BGR to
        RGB, scaled by 1/255, transposed from HWC to CHW, and given a leading
        batch axis.

        Raises:
            OSError: If the image cannot be read from ``image_path``.
        """
        img = cv2.imread(image_path)
        if img is None:
            # cv2.imread signals failure by returning None instead of raising.
            raise OSError(f"cannot read image (missing or undecodable): {image_path}")
        img = cv2.resize(img, (size, size))
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        img = img.astype(np.float32) / 255.0
        img = img.transpose(2, 0, 1)  # HWC -> CHW
        return np.expand_dims(img, axis=0)  # add the batch axis

    def get_model_info(self, model_name):
        """Return the metadata the client reports for ``model_name``.

        NOTE(review): the return type comes straight from the underlying
        client; per the tritonclient docs the HTTP client yields a dict and
        the gRPC client a protobuf message. Confirm before relying on
        attribute vs. key access.
        """
        return self.client.get_model_metadata(model_name)
if __name__ == "__main__":
    # Demo: connect over HTTP, run one inference, then print model metadata.
    client = TritonClient("localhost:8000")
    if client.is_ready():
        print("Triton Server 就绪")
        # Inference on a single preprocessed image.
        input_data = client.preprocess("test.jpg")
        output = client.infer("yolov8", input_data)
        print(f"输出 shape: {output.shape}")
        # Model metadata: the HTTP client returns a plain dict, so the fields
        # are read by key rather than by attribute.
        info = client.get_model_info("yolov8")
        print(f"模型: {info['name']}, 版本: {info['versions']}")
7. 批量推理
python
#!/usr/bin/env python3
"""batch_inference.py - 批量推理"""
import numpy as np
from concurrent.futures import ThreadPoolExecutor
def batch_infer(client, model_name, images, batch_size=4):
    """Run inference over ``images`` in fixed-size batches.

    Each chunk of up to ``batch_size`` items is padded to exactly
    ``batch_size`` by repeating its last item, sent through
    ``client.infer(model_name, batch)``, and the outputs belonging to the
    padding are dropped before the per-batch outputs are concatenated.

    Args:
        client: Object exposing ``infer(model_name, batch)``.
        model_name: Model name forwarded to ``client.infer``.
        images: Array (or sequence of equally shaped arrays) indexed along
            axis 0, one item per image.
        batch_size: Number of items sent per request; must be >= 1.

    Returns:
        The per-image outputs concatenated along axis 0. For empty input an
        empty float32 array of shape ``(0,)`` is returned, because the
        model's output shape is unknown without running it.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")
    if len(images) == 0:
        # np.concatenate([]) would raise; report "no results" instead.
        return np.empty((0,), dtype=np.float32)

    results = []
    for start in range(0, len(images), batch_size):
        # asarray also turns a list slice into one stacked array.
        batch = np.asarray(images[start:start + batch_size])
        valid = len(batch)
        missing = batch_size - valid
        if missing:
            # Pad once by repeating the last item to reach a full batch.
            batch = np.concatenate([batch] + [batch[-1:]] * missing, axis=0)
        output = client.infer(model_name, batch)
        # Keep only the rows that correspond to real (non-padding) inputs.
        results.append(output[:valid])
    return np.concatenate(results, axis=0)
# 并行推理多个模型
def multi_model_infer(client, image):
    """Submit one image to three models concurrently and collect the results.

    ``client.infer`` is called once each for the models "yolov8", "unet" and
    "depth" on a three-worker thread pool.

    Returns:
        A tuple ``(det_result, seg_result, depth_result)`` in that fixed
        order, regardless of which request finishes first.
    """
    model_names = ("yolov8", "unet", "depth")
    # NOTE(review): all three worker threads share the same client object;
    # confirm the underlying client is safe to use from multiple threads.
    with ThreadPoolExecutor(max_workers=3) as pool:
        pending = [pool.submit(client.infer, name, image) for name in model_names]
        # Collect in submission order so the tuple layout is stable.
        det_result, seg_result, depth_result = (task.result() for task in pending)
    return det_result, seg_result, depth_result
8. 性能监控
bash
# Prometheus 指标
curl -s http://localhost:8002/metrics
# 关键指标:
# nv_inference_request_success:成功推理数
# nv_inference_request_failure:失败推理数
# nv_inference_exec_count:执行次数
# nv_inference_request_duration_us:端到端请求耗时(微秒,累计值)
# nv_inference_queue_duration_us:队列等待时间
# nv_inference_compute_input_duration_us:输入处理时间
# nv_inference_compute_infer_duration_us:推理时间
# nv_inference_compute_output_duration_us:输出处理时间
# 模型统计
curl -s http://localhost:8000/v2/models/yolov8/stats
总结
| 特性 | 说明 |
|---|---|
| 多框架 | TensorRT / ONNX / PyTorch / TF |
| 动态批处理 | 自动合并请求 |
| 模型并发 | 多模型同时服务 |
| gRPC | 高性能远程调用 |
| 监控 | Prometheus 指标 |
核心要点:
- 模型仓库:标准目录结构,config.pbtxt 定义输入输出
- 动态批处理:合并并发请求以提升吞吐(常见约 2-4x,实际收益取决于模型与负载)
- TensorRT 优化:platform=tensorrt_plan 获得最佳性能
- Docker 部署:一行命令启动服务
- gRPC 优先:比 REST 更快,适合高并发场景