Jetson TensorRT 模型加速推理:从导出到部署全流程

Jetson TensorRT 模型加速推理:从导出到部署全流程

1. TensorRT 在 Jetson 上的优势

TensorRT 是 NVIDIA 针对自家 GPU 的高性能推理引擎,在 Jetson 上的加速效果尤为显著:

复制代码
推理加速原理:
├── 层融合(Layer Fusion):合并相邻算子,减少 kernel 启动开销
├── 精度校准(Precision Calibration):FP32 → FP16/INT8,带宽减半
├── 内核自动调优(Kernel Auto-Tuning):针对目标 GPU 选择最优实现
├── 动态内存管理:优化显存分配和复用
└── 多流并行:重叠计算和数据传输

典型加速效果(ResNet-50 on Orin NX 16GB):

推理框架 精度 延迟 (ms) 吞吐 (FPS)
PyTorch FP32 45 22
ONNX Runtime FP32 28 36
TensorRT FP32 12 83
TensorRT FP16 6.5 154
TensorRT INT8 4.2 238

2. 环境准备

bash 复制代码
# Check the installed TensorRT version (JetPack 6.0 ships TensorRT 8.6)
dpkg -l | grep tensorrt

# Install the TensorRT Python bindings
sudo apt install -y python3-libnvinfer python3-libnvinfer-dev

# Install the ONNX toolchain
# NOTE(review): PyPI may not publish an aarch64 wheel for onnxruntime-gpu;
# on Jetson you may need NVIDIA's prebuilt wheel instead -- confirm on the
# target device before relying on this line.
pip3 install onnx==1.15.0 onnxruntime-gpu==1.17.0 onnx-graphsurgeon

# Install Polygraphy (TensorRT debugging tool)
pip3 install polygraphy

# Verify the installation by importing TensorRT and creating a logger
python3 -c "
import tensorrt as trt
print(f'TensorRT: {trt.__version__}')
print(f'Logger: {trt.Logger(trt.Logger.WARNING)}')
"

3. 方式一:PyTorch → ONNX → TensorRT

3.1 导出 PyTorch 模型到 ONNX

python 复制代码
#!/usr/bin/env python3
"""export_onnx.py - PyTorch 模型导出到 ONNX"""
import torch
import torchvision.models as models

def export_resnet50():
    """Export a pretrained torchvision ResNet-50 to ``resnet50.onnx``.

    The graph is traced with one 1x3x224x224 random sample, and axis 0 of
    both the "input" and "output" tensors is declared dynamic.
    """
    onnx_file = "resnet50.onnx"
    batch_axis = {0: "batch_size"}

    # Pretrained network, switched to inference mode before tracing.
    net = models.resnet50(pretrained=True)
    net.eval()

    # Sample tensor that drives the trace.
    sample = torch.randn(1, 3, 224, 224)

    export_options = dict(
        opset_version=17,
        do_constant_folding=True,
        input_names=["input"],
        output_names=["output"],
        dynamic_axes={"input": dict(batch_axis), "output": dict(batch_axis)},
    )
    torch.onnx.export(net, sample, onnx_file, **export_options)
    print("✅ 导出成功: resnet50.onnx")

def export_custom_model():
    """Export an example custom classifier to ``custom_model.onnx``.

    The example model is a MobileNetV3-Small backbone followed by a
    1000 -> 10 linear head. Its weights are read from ``best.pth`` in the
    current directory, and the exported graph has a dynamic axis 0 on both
    the "input" and "output" tensors.
    """
    class MyModel(torch.nn.Module):
        """MobileNetV3-Small backbone plus a 10-class linear head."""

        def __init__(self):
            super().__init__()
            self.backbone = models.mobilenet_v3_small(pretrained=True)
            self.head = torch.nn.Linear(1000, 10)  # 10-class classifier

        def forward(self, x):
            return self.head(self.backbone(x))

    model = MyModel()
    # map_location="cpu": the model and the tracing input both live on the
    # CPU, so a checkpoint saved from a CUDA device must be remapped; without
    # it torch.load fails on machines that have no CUDA device.
    # NOTE: torch.load unpickles the file -- only load checkpoints you trust.
    state_dict = torch.load("best.pth", map_location="cpu")
    model.load_state_dict(state_dict)
    model.eval()

    dummy = torch.randn(1, 3, 224, 224)
    torch.onnx.export(
        model, dummy, "custom_model.onnx",
        opset_version=17,
        input_names=["input"],
        output_names=["output"],
        dynamic_axes={"input": {0: "batch"}, "output": {0: "batch"}}
    )
    print("✅ 自定义模型导出成功")

# Script entry point: only the ResNet-50 export runs when executed directly;
# export_custom_model() is left for the reader to call.
if __name__ == "__main__":
    export_resnet50()

3.2 验证 ONNX 模型

bash 复制代码
# Inspect the model structure
python3 -c "
import onnx
model = onnx.load('resnet50.onnx')
def dims(value_info):
    # A dynamic axis stores its symbolic name in dim_param and leaves
    # dim_value at 0, so prefer the name when there is one.
    return [d.dim_param or d.dim_value for d in value_info.type.tensor_type.shape.dim]
print('输入:', [(i.name, dims(i)) for i in model.graph.input])
print('输出:', [(o.name, dims(o)) for o in model.graph.output])
print('节点数:', len(model.graph.node))
onnx.checker.check_model(model)
print('✅ ONNX 模型验证通过')
"

# Inspect the same model with Polygraphy
polygraphy inspect model resnet50.onnx

3.3 ONNX → TensorRT 转换

python 复制代码
#!/usr/bin/env python3
"""onnx_to_trt.py - ONNX 转 TensorRT 引擎"""
import tensorrt as trt
import os

def _default_batch_profile(network, max_batch):
    """Build (min, opt, max) shapes for inputs whose only dynamic axis is 0.

    Returns a dict in the same format as ``dynamic_shapes``; it is empty when
    every network input is fully static.

    Raises:
        ValueError: If an input has a dynamic axis other than axis 0, since
            no sensible extent can be guessed for it.
    """
    shapes = {}
    for idx in range(network.num_inputs):
        tensor = network.get_input(idx)
        dims = list(tensor.shape)
        if all(d >= 0 for d in dims):
            continue
        if any(d < 0 for d in dims[1:]):
            raise ValueError(
                f"输入 {tensor.name} 含有非 batch 的动态维度 {dims},"
                "请通过 dynamic_shapes 显式指定 min/opt/max"
            )
        rest = dims[1:]
        shapes[tensor.name] = ([1] + rest, [max_batch] + rest, [max_batch] + rest)
    return shapes


def build_engine(
    onnx_path: str,
    engine_path: str,
    fp16: bool = True,
    int8: bool = False,
    max_batch: int = 1,
    max_workspace: int = 1 << 30,  # 1 GiB
    dynamic_shapes: dict = None
):
    """Build a TensorRT engine from an ONNX file and write it to disk.

    Args:
        onnx_path: Path of the ONNX model to parse.
        engine_path: Where the serialized engine is written.
        fp16: Enable FP16 kernels when the platform reports fast FP16.
        int8: Enable INT8 kernels when the platform reports fast INT8. This
            function does not attach a calibrator.
        max_batch: Upper batch bound for the automatically generated
            optimization profile. Used only when ``dynamic_shapes`` is not
            given and an input has a dynamic axis 0.
        max_workspace: Workspace memory pool limit in bytes.
        dynamic_shapes: Optional mapping of input name to a
            ``(min_shape, opt_shape, max_shape)`` triple.

    Returns:
        ``engine_path`` on success, ``None`` when parsing, profile creation
        or the build fails (the reason is printed).
    """
    logger = trt.Logger(trt.Logger.WARNING)
    builder = trt.Builder(logger)

    # Explicit-batch network, as required by the ONNX parser.
    network = builder.create_network(
        1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
    )

    parser = trt.OnnxParser(network, logger)
    with open(onnx_path, "rb") as f:
        if not parser.parse(f.read()):
            for i in range(parser.num_errors):
                print(f"❌ 解析错误: {parser.get_error(i)}")
            return None

    config = builder.create_builder_config()
    config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, max_workspace)

    if fp16 and builder.platform_has_fast_fp16:
        config.set_flag(trt.BuilderFlag.FP16)
        print("✅ 启用 FP16 精度")

    if int8 and builder.platform_has_fast_int8:
        config.set_flag(trt.BuilderFlag.INT8)
        print("✅ 启用 INT8 精度(需要校准器)")

    # A network with dynamic inputs cannot be built without an optimization
    # profile. When the caller supplied none, derive one from max_batch so
    # that models exported with a dynamic batch axis still build.
    if not dynamic_shapes:
        try:
            dynamic_shapes = _default_batch_profile(network, max(1, max_batch))
        except ValueError as err:
            print(f"❌ {err}")
            return None

    if dynamic_shapes:
        profile = builder.create_optimization_profile()
        for name, (min_shape, opt_shape, max_shape) in dynamic_shapes.items():
            profile.set_shape(name, min_shape, opt_shape, max_shape)
        config.add_optimization_profile(profile)
        print(f"✅ 动态 shape 配置: {list(dynamic_shapes.keys())}")

    print("⏳ 正在构建 TensorRT 引擎(可能需要几分钟)...")
    serialized_engine = builder.build_serialized_network(network, config)

    if serialized_engine is None:
        print("❌ 引擎构建失败")
        return None

    with open(engine_path, "wb") as f:
        f.write(serialized_engine)

    engine_size = os.path.getsize(engine_path) / (1024 * 1024)
    print(f"✅ 引擎已保存: {engine_path} ({engine_size:.1f} MB)")
    return engine_path

if __name__ == "__main__":
    # Fixed batch of 1. resnet50.onnx is exported with a dynamic batch axis,
    # so TensorRT still needs an optimization profile; pinning
    # min = opt = max yields an engine specialised for batch 1.
    fixed_shape = [1, 3, 224, 224]
    build_engine(
        "resnet50.onnx",
        "resnet50_fp16.engine",
        fp16=True,
        dynamic_shapes={"input": (fixed_shape, fixed_shape, fixed_shape)},
    )

    # Dynamic batch: accepts 1 to 8 images, tuned for 4.
    build_engine(
        "resnet50.onnx",
        "resnet50_dynamic.engine",
        fp16=True,
        dynamic_shapes={
            "input": ([1, 3, 224, 224], [4, 3, 224, 224], [8, 3, 224, 224])
        }
    )

3.4 使用 trtexec 命令行工具

bash 复制代码
# Simplest conversion path (FP16)
# --memPoolSize=workspace:<MiB> replaces the deprecated --workspace flag.
/usr/src/tensorrt/bin/trtexec \
    --onnx=resnet50.onnx \
    --saveEngine=resnet50_fp16.engine \
    --fp16 \
    --memPoolSize=workspace:1024

# Dynamic batch: the min/opt/max shapes for the tensor named "input" allow
# batch sizes 1 to 8, with 4 given as the opt shape.
/usr/src/tensorrt/bin/trtexec \
    --onnx=resnet50.onnx \
    --saveEngine=resnet50_dynamic.engine \
    --fp16 \
    --minShapes=input:1x3x224x224 \
    --optShapes=input:4x3x224x224 \
    --maxShapes=input:8x3x224x224

# Benchmark a saved engine
# --shapes sets the input shape for an explicit-batch engine. --batch only
# applies to implicit-batch engines and must not be used with an engine
# built from an ONNX model.
/usr/src/tensorrt/bin/trtexec \
    --loadEngine=resnet50_fp16.engine \
    --shapes=input:1x3x224x224 \
    --iterations=100 \
    --warmUp=500
# Example output: Throughput: 154.32 qps, Average Latency: 6.48 ms

4. 方式二:直接使用 torch2trt(简单快捷)

bash 复制代码
# Install torch2trt from source: clone the repository, then run its
# setup.py installer with root privileges.
git clone https://github.com/NVIDIA-AI-IOT/torch2trt.git
cd torch2trt
sudo python3 setup.py install
python 复制代码
#!/usr/bin/env python3
"""Quick torch2trt conversion example with a PyTorch vs TensorRT benchmark."""
import time

import torch
import torchvision.models as models
from torch2trt import torch2trt


def _mean_latency(fn, inp, iterations=100, warmup=10):
    """Return the mean seconds per ``fn(inp)`` call.

    Warm-up calls run first so one-off initialisation cost is not measured.
    The GPU is synchronised before the clock starts and before it stops, so
    queued asynchronous work is included in the measurement.
    """
    with torch.no_grad():
        for _ in range(warmup):
            fn(inp)
        torch.cuda.synchronize()
        # perf_counter is monotonic; time.time() can jump with clock changes.
        start = time.perf_counter()
        for _ in range(iterations):
            fn(inp)
        torch.cuda.synchronize()
        elapsed = time.perf_counter() - start
    return elapsed / iterations


# Load the model onto the GPU in inference mode
model = models.resnet50(pretrained=True).eval().cuda()

# Example input used both for conversion and for benchmarking
x = torch.randn(1, 3, 224, 224).cuda()

# Convert directly (FP16)
model_trt = torch2trt(model, [x], fp16_mode=True)

# Compare the outputs of both models on the same input
with torch.no_grad():
    y_pytorch = model(x)
    y_trt = model_trt(x)
    print(f"最大误差: {(y_pytorch - y_trt).abs().max().item():.6f}")

# Save the converted module's state dict
torch.save(model_trt.state_dict(), "resnet50_trt.pth")

# Benchmark both implementations with the same procedure
pytorch_time = _mean_latency(model, x)
trt_time = _mean_latency(model_trt, x)

print(f"PyTorch: {pytorch_time*1000:.1f} ms")
print(f"TensorRT: {trt_time*1000:.1f} ms")
print(f"加速比: {pytorch_time/trt_time:.1f}x")

5. TensorRT 推理封装类

python 复制代码
#!/usr/bin/env python3
"""trt_inference.py - 通用 TensorRT 推理封装"""
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit
import numpy as np
import time

class TRTInference:
    """Run a serialized TensorRT engine using its first input and output.

    Host and device buffers are allocated once in ``__init__``. Every dynamic
    (-1) axis is sized with ``max_batch_size``, so the class is intended for
    engines whose only dynamic axis is the batch axis.
    """

    def __init__(self, engine_path: str, max_batch_size: int = 1):
        """Deserialize the engine and allocate I/O buffers.

        Args:
            engine_path: Path to a serialized engine file.
            max_batch_size: Extent substituted for every dynamic axis when
                sizing the buffers; the largest batch ``infer`` can accept.

        Raises:
            RuntimeError: If the engine cannot be deserialized.
        """
        self.logger = trt.Logger(trt.Logger.WARNING)
        self.runtime = trt.Runtime(self.logger)

        with open(engine_path, "rb") as f:
            self.engine = self.runtime.deserialize_cuda_engine(f.read())
        if self.engine is None:
            raise RuntimeError(f"Failed to deserialize TensorRT engine: {engine_path}")

        self.context = self.engine.create_execution_context()
        self.max_batch = max_batch_size

        self.inputs = []
        self.outputs = []
        self.bindings = []  # device addresses, kept in I/O tensor order
        self.stream = cuda.Stream()

        dynamic_extent = max(1, max_batch_size)
        for i in range(self.engine.num_io_tensors):
            name = self.engine.get_tensor_name(i)
            dtype = trt.nptype(self.engine.get_tensor_dtype(name))
            shape = self.engine.get_tensor_shape(name)

            # Size dynamic axes for the largest supported batch; sizing them
            # for 1 would make any larger batch overflow the buffers.
            shape = tuple(max(1, s) if s >= 0 else dynamic_extent for s in shape)
            size = trt.volume(shape)

            host_mem = cuda.pagelocked_empty(size, dtype)
            device_mem = cuda.mem_alloc(host_mem.nbytes)

            self.bindings.append(int(device_mem))
            # Register the address by tensor name for execute_async_v3.
            self.context.set_tensor_address(name, int(device_mem))

            entry = {
                "name": name, "host": host_mem, "device": device_mem,
                "shape": shape, "dtype": dtype
            }
            if self.engine.get_tensor_mode(name) == trt.TensorIOMode.INPUT:
                self.inputs.append(entry)
            else:
                self.outputs.append(entry)

    def infer(self, input_data: np.ndarray, batch_size: int = 1) -> np.ndarray:
        """Run one inference and return the first output tensor.

        Args:
            input_data: Input array; its shape is passed to the engine as the
                input shape and its axis 0 is taken as the batch size.
            batch_size: Kept for backward compatibility. The batch actually
                run comes from ``input_data``; any value other than the
                default must equal ``input_data.shape[0]``.

        Returns:
            A new array holding the output, shaped as reported by the
            execution context for this input shape.

        Raises:
            ValueError: If ``batch_size`` contradicts the input, the input or
                output does not fit the buffers, or the engine rejects the
                input shape.
            RuntimeError: If TensorRT fails to enqueue the inference.
        """
        inp, out = self.inputs[0], self.outputs[0]

        actual_batch = int(input_data.shape[0]) if input_data.ndim else 1
        if batch_size not in (1, actual_batch):
            raise ValueError(
                f"batch_size={batch_size} does not match input batch {actual_batch}"
            )

        count = int(input_data.size)
        if count > inp["host"].size:
            raise ValueError(
                f"Input with {count} elements exceeds the buffer of "
                f"{inp['host'].size}; increase max_batch_size"
            )

        # The input shape must be set before enqueueing, and it determines
        # the output shape reported by the context.
        if not self.context.set_input_shape(inp["name"], tuple(input_data.shape)):
            raise ValueError(
                f"Engine rejected input shape {tuple(input_data.shape)}"
            )
        out_shape = tuple(self.context.get_tensor_shape(out["name"]))
        out_count = int(np.prod(out_shape))
        if out_count > out["host"].size:
            raise ValueError(
                f"Output with {out_count} elements exceeds the buffer of "
                f"{out['host'].size}; increase max_batch_size"
            )

        # Stage only the elements in use, then copy them to the device.
        np.copyto(inp["host"][:count], input_data.ravel())
        cuda.memcpy_htod_async(inp["device"], inp["host"][:count], self.stream)

        if not self.context.execute_async_v3(stream_handle=self.stream.handle):
            raise RuntimeError("TensorRT failed to enqueue the inference")

        cuda.memcpy_dtoh_async(out["host"][:out_count], out["device"], self.stream)
        self.stream.synchronize()

        # Copy out of the reused host buffer so the next infer() call does
        # not overwrite a result the caller still holds.
        return out["host"][:out_count].reshape(out_shape).copy()

    def benchmark(self, input_shape: tuple, iterations: int = 100, warmup: int = 50):
        """Time ``infer`` on random float32 input and print latency figures.

        Args:
            input_shape: Shape of the random input tensor.
            iterations: Number of timed runs; must be at least 1.
            warmup: Number of untimed runs executed first.

        Raises:
            ValueError: If ``iterations`` is less than 1.
        """
        if iterations < 1:
            raise ValueError("iterations must be at least 1")

        dummy = np.random.randn(*input_shape).astype(np.float32)

        for _ in range(warmup):
            self.infer(dummy)

        times = []
        for _ in range(iterations):
            # perf_counter is monotonic and suited to short intervals.
            start = time.perf_counter()
            self.infer(dummy)
            times.append(time.perf_counter() - start)

        times = np.array(times) * 1000  # seconds -> milliseconds
        print(f"平均延迟: {times.mean():.2f} ms")
        print(f"P50 延迟: {np.percentile(times, 50):.2f} ms")
        print(f"P95 延迟: {np.percentile(times, 95):.2f} ms")
        print(f"P99 延迟: {np.percentile(times, 99):.2f} ms")
        print(f"吞吐量: {1000/times.mean():.1f} FPS")

    def __del__(self):
        # Wait for outstanding work on the stream before the object goes away.
        if hasattr(self, 'stream'):
            self.stream.synchronize()

if __name__ == "__main__":
    # Usage example: load the FP16 engine file.
    runner = TRTInference("resnet50_fp16.engine")

    # One forward pass on random data.
    sample_shape = (1, 3, 224, 224)
    sample = np.random.randn(*sample_shape).astype(np.float32)
    logits = runner.infer(sample)
    print(f"输出 shape: {logits.shape}")
    top5 = np.argsort(logits[0])[-5:][::-1]
    print(f"Top-5 类别: {top5}")

    # Latency / throughput measurement on the same input shape.
    runner.benchmark(sample_shape)

6. INT8 量化(进阶)

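INT8 量化可以在 FP16 的基础上将推理速度再提升 50-100%,但需要校准数据集。下面的校准器要在构建引擎时通过 config.int8_calibrator = ImageCalibrator(...) 挂到 builder config 上(同时开启 trt.BuilderFlag.INT8)才会生效: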

python 复制代码
#!/usr/bin/env python3
"""int8_calibration.py - INT8 量化校准"""
import glob

import numpy as np
import pycuda.autoinit  # noqa: F401  (imported for its side effect: creates the CUDA context)
import pycuda.driver as cuda
import tensorrt as trt
from PIL import Image

class ImageCalibrator(trt.IInt8EntropyCalibrator2):
    """INT8 entropy calibrator fed from a directory tree of JPEG images.

    Images are converted to RGB, resized, scaled to [0, 1], normalised with
    the per-channel mean/std constants below, and supplied as float32 NCHW
    batches.
    """

    # Per-channel normalisation constants applied after scaling to [0, 1].
    _MEAN = (0.485, 0.456, 0.406)
    _STD = (0.229, 0.224, 0.225)

    def __init__(self, data_dir: str, batch_size: int = 8,
                 cache_file: str = "calibration.cache",
                 image_size: tuple = (224, 224), max_images: int = 500):
        """Collect calibration images and allocate the device batch buffer.

        Args:
            data_dir: Directory searched recursively for ``*.jpg`` files.
            batch_size: Number of images per calibration batch.
            cache_file: Path used to read and write the calibration cache.
            image_size: ``(height, width)`` every image is resized to.
            max_images: Maximum number of images used for calibration.
        """
        super().__init__()
        self.batch_size = batch_size
        self.cache_file = cache_file
        self.image_size = tuple(image_size)

        # Sorted so the calibration set does not depend on filesystem order.
        found = sorted(glob.glob(f"{data_dir}/**/*.jpg", recursive=True))
        self.images = found[:max_images]
        self.current_index = 0

        # One float32 batch of 3-channel images.
        height, width = self.image_size
        itemsize = np.dtype(np.float32).itemsize
        self.device_input = cuda.mem_alloc(batch_size * 3 * height * width * itemsize)

    def preprocess(self, img_path: str) -> np.ndarray:
        """Load one image and return it as a normalised float32 CHW array."""
        height, width = self.image_size
        with Image.open(img_path) as im:
            # Force three channels: grayscale or RGBA files would otherwise
            # produce arrays that cannot be normalised or stacked.
            rgb = im.convert("RGB").resize((width, height))
            img = np.asarray(rgb, dtype=np.float32) / 255.0
        mean = np.array(self._MEAN, dtype=np.float32)
        std = np.array(self._STD, dtype=np.float32)
        img = (img - mean) / std
        return np.ascontiguousarray(img.transpose(2, 0, 1))  # HWC -> CHW

    def get_batch_size(self):
        """Return the calibration batch size."""
        return self.batch_size

    def get_batch(self, names):
        """Upload the next full batch and return its device pointer.

        Returns ``None`` once fewer than ``batch_size`` images remain. A
        partial final batch is dropped, because uploading it would leave
        stale data from the previous batch in the rest of the buffer.
        """
        end = self.current_index + self.batch_size
        if end > len(self.images):
            return None

        paths = self.images[self.current_index:end]
        self.current_index = end

        batch = np.stack([self.preprocess(p) for p in paths]).astype(np.float32)
        cuda.memcpy_htod(self.device_input, np.ascontiguousarray(batch))
        return [int(self.device_input)]

    def read_calibration_cache(self):
        """Return the cached calibration bytes, or ``None`` if no cache exists."""
        try:
            with open(self.cache_file, "rb") as f:
                return f.read()
        except FileNotFoundError:
            return None

    def write_calibration_cache(self, cache):
        """Write the calibration cache to ``cache_file``."""
        with open(self.cache_file, "wb") as f:
            f.write(cache)

7. Jetson 功耗监控与推理优化

python 复制代码
#!/usr/bin/env python3
"""power_monitor.py - 推理时功耗监控"""
import subprocess
import threading
import time
import re

class JetsonPowerMonitor:
    """Sample power and temperature readings from sysfs on a background thread.

    Both files are expected to hold one number, which is divided by 1000
    before being recorded. The default paths are the ones this example was
    written for.
    NOTE(review): sensor paths differ between Jetson modules and JetPack
    releases -- confirm them on the target board and pass ``power_path`` /
    ``temp_path`` if they differ.
    """

    DEFAULT_POWER_PATH = (
        "/sys/bus/i2c/drivers/ina3221/1-0040/iio:device0/in_power0_input"
    )
    DEFAULT_TEMP_PATH = "/sys/devices/virtual/thermal/thermal_zone0/temp"

    def __init__(self, power_path: str = DEFAULT_POWER_PATH,
                 temp_path: str = DEFAULT_TEMP_PATH,
                 interval: float = 0.5):
        """Configure the monitor; no thread is started here.

        Args:
            power_path: File holding the power reading.
            temp_path: File holding the temperature reading.
            interval: Seconds to sleep between samples.
        """
        self.power_path = power_path
        self.temp_path = temp_path
        self.interval = interval
        self.monitoring = False
        self.power_data = []
        self._thread = None

    @staticmethod
    def _read_scaled(path: str) -> float:
        """Return the number stored in ``path`` divided by 1000.

        Best effort: a missing, unreadable or non-numeric file yields 0.0 so
        that monitoring never interrupts the workload being measured.
        """
        try:
            with open(path) as f:
                return float(f.read().strip()) / 1000
        except (OSError, ValueError):
            return 0.0

    def _read_power(self):
        """Read the current power value (file value / 1000)."""
        return self._read_scaled(self.power_path)

    def _read_temp(self):
        """Read the current temperature value (file value / 1000)."""
        return self._read_scaled(self.temp_path)

    def start(self):
        """Clear earlier samples and start the sampling thread."""
        self.monitoring = True
        self.power_data = []
        self._thread = threading.Thread(target=self._monitor_loop, daemon=True)
        self._thread.start()

    def stop(self):
        """Stop sampling and return summary statistics.

        Returns:
            A dict with average/peak power and temperature, the approximate
            duration (sample count times the interval) and the sample count,
            or ``None`` when no samples were collected. Calling ``stop``
            without a prior ``start`` is safe.
        """
        self.monitoring = False
        if self._thread is not None:
            self._thread.join(timeout=max(2, 2 * self.interval))
            self._thread = None

        samples = list(self.power_data)
        if not samples:
            return None

        powers = [s["power"] for s in samples]
        temps = [s["temp"] for s in samples]
        return {
            "avg_power_w": sum(powers) / len(powers),
            "max_power_w": max(powers),
            "avg_temp_c": sum(temps) / len(temps),
            "max_temp_c": max(temps),
            "duration_s": len(samples) * self.interval,
            "samples": len(samples)
        }

    def _monitor_loop(self):
        """Append one sample per interval until ``monitoring`` is cleared."""
        while self.monitoring:
            self.power_data.append({
                "power": self._read_power(),
                "temp": self._read_temp(),
                "time": time.time()
            })
            time.sleep(self.interval)

# 使用示例
if __name__ == "__main__":
    monitor = JetsonPowerMonitor()
    monitor.start()

    # Run the inference workload here...
    time.sleep(5)  # stand-in for real inference

    # stop() returns None when no sample was collected, so guard before
    # indexing into the statistics.
    stats = monitor.stop()
    if stats is None:
        print("⚠️ 未采集到功耗数据")
    else:
        print(f"平均功耗: {stats['avg_power_w']:.1f} W")
        print(f"峰值功耗: {stats['max_power_w']:.1f} W")
        print(f"平均温度: {stats['avg_temp_c']:.1f} °C")

总结

方法 适用场景 转换速度 推理速度
torch2trt 快速原型 ⚡ 快 ★★★★
trtexec 命令行 标准模型 ⚡ 快 ★★★★★
Python API 自定义模型 🐢 慢 ★★★★★
INT8 量化 极致性能 🐢 慢 ★★★★★+

核心步骤:

  1. PyTorch → ONNX:opset_version 不要超过 TensorRT ONNX 解析器支持的上限(TensorRT 8.6 最高支持 opset 17,本文使用 17)
  2. ONNX → TensorRT:推荐 trtexec 一键转换
  3. FP16 默认开启:几乎无精度损失,速度翻倍
  4. INT8 需要校准:准备 200-500 张代表性图片
  5. 动态 batch:按实际场景设置 min/opt/max shape