Jetson + Triton Inference Server:模型服务化部署

Jetson + Triton Inference Server:模型服务化部署

1. Triton 概述

Triton Inference Server 是 NVIDIA 的开源推理服务框架:

复制代码
Triton 核心能力:
├── 多框架支持:TensorRT / ONNX / PyTorch / TensorFlow
├── 多模型并发:同时服务多个模型
├── 动态批处理:自动合并请求提升吞吐
├── 模型版本管理:A/B 测试、灰度发布
├── 健康检查:内置监控和指标
└── 客户端:gRPC / HTTP REST / Python

2. 安装 Triton

bash 复制代码
# Option 1: Docker (recommended).
# Jetson needs the iGPU build of the image; the plain "-py3" tag targets
# discrete-GPU hosts.
docker pull nvcr.io/nvidia/tritonserver:24.01-py3-igpu

# Option 2: apt install
# NOTE(review): NVIDIA documents the Jetson release as a tarball on the
# GitHub Releases page (tritonserver<version>-igpu.tar.gz) — confirm that a
# "tritonserver" apt package exists in your configured repositories.
sudo apt install -y tritonserver

# Verify the installation
tritonserver --version

3. 模型仓库结构

复制代码
model_repository/
├── yolov8/
│   ├── config.pbtxt
│   └── 1/
│       └── model.onnx
├── resnet50/
│   ├── config.pbtxt
│   └── 1/
│       └── model.plan  # TensorRT engine
├── deepsort/
│   ├── config.pbtxt
│   └── 1/
│       └── model.onnx
└── ensemble_yolo/
    ├── config.pbtxt
    └── 1/
        └── model.py  # Python backend 脚本;ensemble 平台模型的版本目录应为空

4. 模型配置

protobuf 复制代码
# model_repository/yolov8/config.pbtxt
# Serves the YOLOv8 ONNX model through the ONNX Runtime backend.
name: "yolov8"
platform: "onnxruntime_onnx"
# Largest batch the server may form. With batching enabled, the dims below
# describe a single sample; the client adds the leading batch axis.
max_batch_size: 4
input [
  {
    name: "images"
    data_type: TYPE_FP32
    dims: [ 3, 640, 640 ]
  }
]
output [
  {
    name: "output0"
    data_type: TYPE_FP32
    dims: [ 84, 8400 ]
  }
]
# One model instance, placed on GPU 0.
instance_group [
  {
    count: 1
    kind: KIND_GPU
    gpus: [ 0 ]
  }
]
# Let the server merge queued requests into batches of 1, 2 or 4, waiting
# at most 100 microseconds for a batch to fill.
dynamic_batching {
  preferred_batch_size: [ 1, 2, 4 ]
  max_queue_delay_microseconds: 100
}
protobuf 复制代码
# TensorRT engine configuration
# NOTE(review): this reuses the name "yolov8"; with platform "tensorrt_plan"
# the version directory presumably has to hold a model.plan engine rather
# than model.onnx — confirm against the repository layout.
name: "yolov8"
platform: "tensorrt_plan"
max_batch_size: 4
input [
  {
    name: "images"
    data_type: TYPE_FP32
    dims: [ 3, 640, 640 ]
  }
]
output [
  {
    name: "output0"
    data_type: TYPE_FP32
    dims: [ 84, 8400 ]
  }
]
# NOTE(review): execution_accelerators is the switch that lets ONNX Runtime /
# TensorFlow models run through TensorRT; a tensorrt_plan model is already a
# TensorRT engine, so this stanza is likely redundant — confirm.
optimization {
  execution_accelerators {
    gpu_execution_accelerator : [ { name : "tensorrt" } ]
  }
}

5. 启动 Triton Server

bash 复制代码
# Start with Docker (Jetson uses the iGPU image tag)
docker run --rm --runtime=nvidia \
    -p 8000:8000 -p 8001:8001 -p 8002:8002 \
    -v /path/to/model_repository:/models \
    nvcr.io/nvidia/tritonserver:24.01-py3-igpu \
    tritonserver --model-repository=/models

# Start directly on the host
tritonserver \
    --model-repository=/path/to/model_repository \
    --http-port=8000 \
    --grpc-port=8001 \
    --metrics-port=8002

# Health check: the endpoint answers with an empty body, so print the
# HTTP status code instead of the (empty) response.
curl -s -o /dev/null -w "%{http_code}\n" http://localhost:8000/v2/health/ready
# 200 means the server is ready

# List the models in the repository together with their load state
# (the repository index is a POST endpoint).
curl -s -X POST http://localhost:8000/v2/repository/index

6. Python 客户端

python 复制代码
#!/usr/bin/env python3
"""triton_client.py - Triton 推理客户端"""
import tritonclient.http as httpclient
import tritonclient.grpc as grpcclient
import numpy as np
import cv2

class TritonClient:
    """Thin wrapper around a Triton HTTP or gRPC inference client.

    The protocol chosen at construction time decides which ``tritonclient``
    module builds both the connection and the request objects, so a request
    always matches the transport that sends it.
    """

    def __init__(self, url="localhost:8000", protocol="http"):
        """Create the underlying client.

        Args:
            url: ``host:port`` of the server endpoint for the chosen protocol.
            protocol: ``"http"`` selects the HTTP client; any other value
                selects the gRPC client.
        """
        # Keep the module around: InferInput / InferRequestedOutput have to
        # come from the same module as the client that sends them.
        self._api = httpclient if protocol == "http" else grpcclient
        self.client = self._api.InferenceServerClient(url=url)

    def is_ready(self):
        """Return the result of the server readiness query."""
        return self.client.is_server_ready()

    def infer(self, model_name, input_data, input_name="images", output_name="output0"):
        """Run one inference request and return the requested output.

        Args:
            model_name: Name of the model to query.
            input_data: Array-like input tensor; converted to float32 because
                the request declares the tensor as ``"FP32"``.
            input_name: Name of the model input tensor.
            output_name: Name of the model output tensor to fetch.

        Returns:
            The output tensor named ``output_name`` as a NumPy array.
        """
        api = self._api

        # The declared datatype is FP32, so make the payload agree with it.
        tensor = np.asarray(input_data, dtype=np.float32)

        request_input = api.InferInput(input_name, list(tensor.shape), "FP32")
        request_input.set_data_from_numpy(tensor)
        requested_output = api.InferRequestedOutput(output_name)

        result = self.client.infer(
            model_name=model_name,
            inputs=[request_input],
            outputs=[requested_output],
        )
        return result.as_numpy(output_name)

    def preprocess(self, image_path, size=640):
        """Load an image file and turn it into a ``(1, 3, size, size)`` tensor.

        The image is resized to ``size`` x ``size``, converted from BGR to
        RGB, scaled to ``[0, 1]`` as float32 and reordered from HWC to CHW.

        Raises:
            FileNotFoundError: If OpenCV cannot read ``image_path``.
        """
        img = cv2.imread(image_path)
        if img is None:
            # cv2.imread signals failure with None instead of raising.
            raise FileNotFoundError(f"image missing or unreadable: {image_path}")

        img = cv2.resize(img, (size, size))
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        img = img.astype(np.float32) / 255.0
        img = img.transpose(2, 0, 1)  # HWC -> CHW
        return np.expand_dims(img, axis=0)  # add the batch axis

    def get_model_info(self, model_name):
        """Return the server's metadata for ``model_name``.

        The value is whatever the underlying client returns; per the
        tritonclient documentation that is a dict for the HTTP client and a
        protobuf message for the gRPC client.
        """
        return self.client.get_model_metadata(model_name)

if __name__ == "__main__":
    client = TritonClient("localhost:8000")

    # Stop early: every call below needs a live server.
    if not client.is_ready():
        raise SystemExit("Triton Server 未就绪")
    print("Triton Server 就绪")

    # Inference
    input_data = client.preprocess("test.jpg")
    output = client.infer("yolov8", input_data)
    print(f"输出 shape: {output.shape}")

    # Model metadata: the HTTP client returns a dict, while the gRPC client
    # returns a message object with attributes, so handle both.
    info = client.get_model_info("yolov8")
    if isinstance(info, dict):
        name, versions = info["name"], info.get("versions")
    else:
        name, versions = info.name, info.versions
    print(f"模型: {name}, 版本: {versions}")

7. 批量推理

python 复制代码
#!/usr/bin/env python3
"""batch_inference.py - 批量推理"""
import numpy as np
from concurrent.futures import ThreadPoolExecutor

def batch_infer(client, model_name, images, batch_size=4):
    """Run inference over ``images`` in fixed-size batches.

    Each slice of ``batch_size`` samples is sent through ``client.infer``.
    A final short slice is padded by repeating its last sample so every
    request has exactly ``batch_size`` samples; the padded rows are dropped
    from the output again.

    Args:
        client: Object exposing ``infer(model_name, batch)``.
        model_name: Name of the model to query.
        images: Array of samples with the sample axis first.
        batch_size: Number of samples per request; must be at least 1.

    Returns:
        The per-batch outputs concatenated along axis 0. For empty input an
        empty 1-D float32 array is returned, because the output shape is
        unknown without querying the model.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    total = len(images)
    if total == 0:
        # np.concatenate([]) would raise; nothing to infer is not an error.
        return np.empty((0,), dtype=np.float32)

    results = []
    for start in range(0, total, batch_size):
        batch = images[start:start + batch_size]
        valid = len(batch)

        if valid < batch_size:
            # Pad in one step by repeating the last real sample.
            padding = np.repeat(batch[-1:], batch_size - valid, axis=0)
            batch = np.concatenate([batch, padding], axis=0)

        output = client.infer(model_name, batch)
        results.append(output[:valid])  # drop the rows produced by padding

    return np.concatenate(results, axis=0)

# 并行推理多个模型
def multi_model_infer(client, image):
    """Query the "yolov8", "unet" and "depth" models concurrently.

    One ``client.infer`` call per model is submitted to a three-worker
    thread pool, all with the same ``image``.

    NOTE(review): tritonclient documents its client objects as not
    thread-safe — confirm that sharing one client across the workers is
    acceptable, or give each worker its own client.

    Returns:
        A ``(detection, segmentation, depth)`` tuple of the three results.
    """
    model_names = ("yolov8", "unet", "depth")

    with ThreadPoolExecutor(max_workers=3) as pool:
        # Submit everything first so the three requests overlap.
        pending = [pool.submit(client.infer, name, image) for name in model_names]
        detection, segmentation, depth = [task.result() for task in pending]

    return detection, segmentation, depth

8. 性能监控

bash 复制代码
# Prometheus 指标
curl -s http://localhost:8002/metrics

# 关键指标:
# nv_inference_request_success:成功推理数
# nv_inference_request_failure:失败推理数
# nv_inference_exec_count:执行次数
# nv_inference_request_duration_us:请求端到端累计耗时(微秒)
# nv_inference_queue_duration_us:队列等待时间
# nv_inference_compute_input_duration_us:输入处理时间
# nv_inference_compute_infer_duration_us:推理时间
# nv_inference_compute_output_duration_us:输出处理时间

# 模型统计
curl -s http://localhost:8000/v2/models/yolov8/stats

总结

特性 说明
多框架 TensorRT / ONNX / PyTorch / TF
动态批处理 自动合并请求
模型并发 多模型同时服务
gRPC 高性能远程调用
监控 Prometheus 指标

核心要点:

  1. 模型仓库:标准目录结构,config.pbtxt 定义输入输出
  2. 动态批处理:提升吞吐 2-4x
  3. TensorRT 优化:platform=tensorrt_plan 获得最佳性能
  4. Docker 部署:一行命令启动服务
  5. gRPC 优先:比 REST 更快,适合高并发场景