Jetson TensorRT 模型加速推理:从导出到部署全流程
1. TensorRT 在 Jetson 上的优势
TensorRT 是 NVIDIA 针对自家 GPU 的高性能推理引擎,在 Jetson 上的加速效果尤为显著:
推理加速原理:
├── 层融合(Layer Fusion):合并相邻算子,减少 kernel 启动开销
├── 精度校准(Precision Calibration):FP32 → FP16/INT8,权重与激活的数据量降至 1/2(FP16)或 1/4(INT8),显存带宽压力随之降低
├── 内核自动调优(Kernel Auto-Tuning):针对目标 GPU 选择最优实现
├── 动态内存管理:优化显存分配和复用
└── 多流并行:重叠计算和数据传输
典型加速效果(ResNet-50 on Orin NX 16GB):
| 推理框架 | 精度 | 延迟 (ms) | 吞吐 (FPS) |
|---|---|---|---|
| PyTorch | FP32 | 45 | 22 |
| ONNX Runtime | FP32 | 28 | 36 |
| TensorRT | FP32 | 12 | 83 |
| TensorRT | FP16 | 6.5 | 154 |
| TensorRT | INT8 | 4.2 | 238 |
2. 环境准备
bash
# Confirm the installed TensorRT version (JetPack 6.0 ships TensorRT 8.6)
dpkg -l | grep tensorrt
# Install the TensorRT Python bindings
sudo apt install -y python3-libnvinfer python3-libnvinfer-dev
# Install the ONNX toolchain
pip3 install onnx==1.15.0 onnx-graphsurgeon
# NOTE: onnxruntime-gpu publishes no Linux aarch64 wheel on PyPI, so
# `pip3 install onnxruntime-gpu==1.17.0` fails on Jetson. Download the wheel
# built for your JetPack release from the Jetson Zoo
# (https://elinux.org/Jetson_Zoo#ONNX_Runtime) and install that file instead:
#   pip3 install ./onnxruntime_gpu-<version>-<python-tag>-linux_aarch64.whl
# Install Polygraphy (TensorRT debugging tool)
pip3 install polygraphy
# Install PyCUDA and Pillow, which the inference wrapper and the INT8
# calibrator scripts later in this guide import (PyCUDA needs nvcc on PATH)
export PATH=/usr/local/cuda/bin:$PATH
pip3 install pycuda pillow
# Verify the installation
python3 -c "
import tensorrt as trt
print(f'TensorRT: {trt.__version__}')
print(f'Logger: {trt.Logger(trt.Logger.WARNING)}')
"
3. 方式一:PyTorch → ONNX → TensorRT
3.1 导出 PyTorch 模型到 ONNX
python
#!/usr/bin/env python3
"""export_onnx.py - PyTorch 模型导出到 ONNX"""
import torch
import torchvision.models as models
def export_resnet50():
    """Export a pretrained torchvision ResNet-50 to ``resnet50.onnx``.

    The model is traced in eval mode with a single 1x3x224x224 random input.
    Axis 0 of both the ``input`` and ``output`` tensors is declared dynamic
    under the name ``batch_size``.
    """
    net = models.resnet50(pretrained=True)
    net.eval()

    # Tracing only needs a tensor of the right shape; the values are irrelevant.
    sample = torch.randn(1, 3, 224, 224)

    export_options = dict(
        opset_version=17,
        do_constant_folding=True,
        input_names=["input"],
        output_names=["output"],
        dynamic_axes={
            "input": {0: "batch_size"},
            "output": {0: "batch_size"},
        },
    )
    torch.onnx.export(net, sample, "resnet50.onnx", **export_options)
    print("✅ 导出成功: resnet50.onnx")
def export_custom_model():
    """Export a custom classifier (MobileNetV3-Small backbone + linear head).

    The network weights are read from ``best.pth`` and the traced model is
    written to ``custom_model.onnx`` with a dynamic batch axis named
    ``batch`` on both ``input`` and ``output``.
    """

    class MyModel(torch.nn.Module):
        """MobileNetV3-Small feature extractor followed by a 10-way head."""

        def __init__(self):
            super().__init__()
            # No pretrained download: every parameter is overwritten by the
            # strict load_state_dict() call below.
            self.backbone = models.mobilenet_v3_small(pretrained=False)
            self.head = torch.nn.Linear(1000, 10)  # 10-class classifier

        def forward(self, x):
            return self.head(self.backbone(x))

    model = MyModel()
    # map_location="cpu" lets a checkpoint saved from a CUDA model load into
    # this CPU model, which is then traced with a CPU dummy input.
    # NOTE(review): torch.load unpickles the file; only load checkpoints you trust.
    model.load_state_dict(torch.load("best.pth", map_location="cpu"))
    model.eval()

    dummy = torch.randn(1, 3, 224, 224)
    torch.onnx.export(
        model,
        dummy,
        "custom_model.onnx",
        opset_version=17,
        input_names=["input"],
        output_names=["output"],
        dynamic_axes={"input": {0: "batch"}, "output": {0: "batch"}},
    )
    print("✅ 自定义模型导出成功")
# Script entry point: only the ResNet-50 export runs by default.
if __name__ == "__main__":
    export_resnet50()
3.2 验证 ONNX 模型
bash
# Inspect the model structure
python3 -c "
import onnx
model = onnx.load('resnet50.onnx')
def dims(value_info):
    # A dynamic axis stores its name in dim_param and leaves dim_value at 0,
    # so show the symbolic name instead of a misleading 0.
    return [d.dim_param or d.dim_value for d in value_info.type.tensor_type.shape.dim]
print('输入:', [(i.name, dims(i)) for i in model.graph.input])
print('输出:', [(o.name, dims(o)) for o in model.graph.output])
print('节点数:', len(model.graph.node))
onnx.checker.check_model(model)
print('✅ ONNX 模型验证通过')
"
# Inspect the same model with Polygraphy
polygraphy inspect model resnet50.onnx
3.3 ONNX → TensorRT 转换
python
#!/usr/bin/env python3
"""onnx_to_trt.py - ONNX 转 TensorRT 引擎"""
import tensorrt as trt
import os
def build_engine(
    onnx_path: str,
    engine_path: str,
    fp16: bool = True,
    int8: bool = False,
    max_batch: int = 1,
    max_workspace: int = 1 << 30,  # 1 GiB
    dynamic_shapes: dict = None,
    calibrator=None,
):
    """Parse an ONNX file and serialize a TensorRT engine to disk.

    Args:
        onnx_path: Path of the ONNX model to parse.
        engine_path: Destination file for the serialized engine.
        fp16: Enable FP16 kernels when the platform reports fast FP16.
        int8: Enable INT8 kernels when the platform reports fast INT8.
        max_batch: Largest batch size used for the automatically generated
            optimization profile (only consulted when ``dynamic_shapes`` is
            not given and an input has a dynamic batch axis).
        max_workspace: Workspace memory pool limit in bytes.
        dynamic_shapes: Optional mapping ``input name -> (min, opt, max)``
            shapes used to build the optimization profile.
        calibrator: Optional INT8 calibrator object attached to the builder
            config when INT8 is enabled.

    Returns:
        ``engine_path`` on success, ``None`` if parsing or building failed.
    """
    logger = trt.Logger(trt.Logger.WARNING)
    builder = trt.Builder(logger)

    # Explicit-batch network, as required by the ONNX parser.
    network = builder.create_network(
        1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
    )

    parser = trt.OnnxParser(network, logger)
    with open(onnx_path, "rb") as f:
        if not parser.parse(f.read()):
            for i in range(parser.num_errors):
                print(f"❌ 解析错误: {parser.get_error(i)}")
            return None

    config = builder.create_builder_config()
    config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, max_workspace)

    if fp16 and builder.platform_has_fast_fp16:
        config.set_flag(trt.BuilderFlag.FP16)
        print("✅ 启用 FP16 精度")
    if int8 and builder.platform_has_fast_int8:
        config.set_flag(trt.BuilderFlag.INT8)
        if calibrator is not None:
            config.int8_calibrator = calibrator
        print("✅ 启用 INT8 精度(需要校准器)")

    # A network with dynamic input dimensions cannot be built without an
    # optimization profile. When the caller supplies none, derive one for
    # inputs whose only dynamic axis is the batch axis.
    shapes = dynamic_shapes or _default_batch_profile(network, max_batch)
    if shapes:
        profile = builder.create_optimization_profile()
        for name, (min_shape, opt_shape, max_shape) in shapes.items():
            profile.set_shape(name, min_shape, opt_shape, max_shape)
        config.add_optimization_profile(profile)
        print(f"✅ 动态 shape 配置: {list(shapes.keys())}")

    print("⏳ 正在构建 TensorRT 引擎(可能需要几分钟)...")
    serialized_engine = builder.build_serialized_network(network, config)
    if serialized_engine is None:
        print("❌ 引擎构建失败")
        return None

    with open(engine_path, "wb") as f:
        f.write(serialized_engine)
    engine_size = os.path.getsize(engine_path) / (1024 * 1024)
    print(f"✅ 引擎已保存: {engine_path} ({engine_size:.1f} MB)")
    return engine_path


def _default_batch_profile(network, max_batch):
    """Return ``{name: (min, opt, max)}`` for inputs with a dynamic batch axis.

    Only inputs whose sole dynamic dimension is axis 0 are covered; inputs
    with other dynamic axes are left out because their range cannot be
    guessed, so the caller must pass ``dynamic_shapes`` for those.
    """
    largest = max(1, max_batch)
    shapes = {}
    for index in range(network.num_inputs):
        tensor = network.get_input(index)
        dims = tuple(tensor.shape)
        if not dims or dims[0] != -1 or -1 in dims[1:]:
            continue
        rest = dims[1:]
        shapes[tensor.name] = ((1,) + rest, (largest,) + rest, (largest,) + rest)
    return shapes
if __name__ == "__main__":
    # Static batch: the ONNX model was exported with a dynamic batch axis, so
    # the batch is pinned to 1 by giving identical min/opt/max shapes.
    build_engine(
        "resnet50.onnx",
        "resnet50_fp16.engine",
        fp16=True,
        dynamic_shapes={
            "input": ([1, 3, 224, 224], [1, 3, 224, 224], [1, 3, 224, 224])
        },
    )
    # Dynamic batch: accept 1 to 8 images, tuned for a batch of 4.
    build_engine(
        "resnet50.onnx",
        "resnet50_dynamic.engine",
        fp16=True,
        dynamic_shapes={
            "input": ([1, 3, 224, 224], [4, 3, 224, 224], [8, 3, 224, 224])
        },
    )
3.4 使用 trtexec 命令行工具
bash
# Simplest conversion (FP16). The batch axis of the ONNX model is dynamic, so
# pin it to 1 explicitly; --memPoolSize (MiB) replaces the deprecated --workspace.
/usr/src/tensorrt/bin/trtexec \
--onnx=resnet50.onnx \
--saveEngine=resnet50_fp16.engine \
--fp16 \
--shapes=input:1x3x224x224 \
--memPoolSize=workspace:1024
# Dynamic batch
/usr/src/tensorrt/bin/trtexec \
--onnx=resnet50.onnx \
--saveEngine=resnet50_dynamic.engine \
--fp16 \
--minShapes=input:1x3x224x224 \
--optShapes=input:4x3x224x224 \
--maxShapes=input:8x3x224x224
# Benchmark. --batch is only for implicit-batch engines and must not be used
# with an engine built from ONNX; for the dynamic engine select the batch with
# --shapes=input:Nx3x224x224 instead.
/usr/src/tensorrt/bin/trtexec \
--loadEngine=resnet50_fp16.engine \
--iterations=100 \
--warmUp=500
# Sample output: Throughput: 154.32 qps, Average Latency: 6.48 ms
4. 方式二:直接使用 torch2trt(简单快捷)
bash
# Install torch2trt from source: clone the repository, enter it, and run its
# setup script (with sudo, so the package is installed system-wide)
git clone https://github.com/NVIDIA-AI-IOT/torch2trt.git
cd torch2trt
sudo python3 setup.py install
python
#!/usr/bin/env python3
"""Quick torch2trt conversion example: convert, compare outputs, benchmark."""
import time

import torch
import torchvision.models as models
from torch2trt import torch2trt


def _mean_latency(fn, x, iterations=100, warmup=10):
    """Return the mean seconds per call of ``fn(x)`` on the GPU.

    A few untimed warm-up calls run first so one-off initialization cost is
    not charged to the measurement, and the device is synchronized before the
    clock starts and before it stops so queued kernels are fully counted.
    """
    with torch.no_grad():
        for _ in range(warmup):
            fn(x)
        torch.cuda.synchronize()
        start = time.perf_counter()
        for _ in range(iterations):
            fn(x)
        torch.cuda.synchronize()
    return (time.perf_counter() - start) / iterations


# Load the model
model = models.resnet50(pretrained=True).eval().cuda()
# Create an example input
x = torch.randn(1, 3, 224, 224).cuda()
# Convert directly (FP16)
model_trt = torch2trt(model, [x], fp16_mode=True)
# Check that both models agree on the same input
with torch.no_grad():
    y_pytorch = model(x)
    y_trt = model_trt(x)
print(f"最大误差: {(y_pytorch - y_trt).abs().max().item():.6f}")
# Save the converted module's state dict
torch.save(model_trt.state_dict(), "resnet50_trt.pth")
# Benchmark both models with the same procedure
pytorch_time = _mean_latency(model, x)
trt_time = _mean_latency(model_trt, x)
print(f"PyTorch: {pytorch_time*1000:.1f} ms")
print(f"TensorRT: {trt_time*1000:.1f} ms")
print(f"加速比: {pytorch_time/trt_time:.1f}x")
5. TensorRT 推理封装类
python
#!/usr/bin/env python3
"""trt_inference.py - 通用 TensorRT 推理封装"""
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit
import numpy as np
import time
class TRTInference:
    """Run a serialized TensorRT engine through PyCUDA-managed buffers.

    One page-locked host buffer and one device buffer are allocated per I/O
    tensor at construction time. Dynamic dimensions are sized with
    ``max_batch_size``, so any batch up to that size can be served.
    NOTE(review): this assumes the batch axis is the only dynamic dimension.
    """

    def __init__(self, engine_path: str, max_batch_size: int = 1):
        """Load the engine at ``engine_path`` and allocate its I/O buffers.

        Raises:
            RuntimeError: if the engine file cannot be deserialized.
        """
        self.logger = trt.Logger(trt.Logger.WARNING)
        self.runtime = trt.Runtime(self.logger)
        with open(engine_path, "rb") as f:
            self.engine = self.runtime.deserialize_cuda_engine(f.read())
        if self.engine is None:
            raise RuntimeError(f"Failed to deserialize TensorRT engine: {engine_path}")
        self.context = self.engine.create_execution_context()
        self.max_batch = max_batch_size

        self.inputs = []
        self.outputs = []
        self.bindings = []
        self.stream = cuda.Stream()

        for i in range(self.engine.num_io_tensors):
            name = self.engine.get_tensor_name(i)
            dtype = trt.nptype(self.engine.get_tensor_dtype(name))
            # Dynamic dimensions are reported as -1; reserve room for the
            # largest batch the caller asked for.
            shape = tuple(
                max(1, max_batch_size) if dim < 0 else max(1, dim)
                for dim in self.engine.get_tensor_shape(name)
            )
            size = trt.volume(shape)

            host_mem = cuda.pagelocked_empty(size, dtype)
            device_mem = cuda.mem_alloc(host_mem.nbytes)
            self.bindings.append(int(device_mem))
            # Register the buffer by tensor name for execute_async_v3.
            self.context.set_tensor_address(name, int(device_mem))

            record = {
                "name": name, "host": host_mem, "device": device_mem,
                "shape": shape, "dtype": dtype
            }
            if self.engine.get_tensor_mode(name) == trt.TensorIOMode.INPUT:
                self.inputs.append(record)
            else:
                self.outputs.append(record)

    def infer(self, input_data: np.ndarray, batch_size: int = 1) -> np.ndarray:
        """Run one inference on the first input and return the first output.

        Args:
            input_data: Array whose shape is set as the input tensor shape.
            batch_size: Kept for backward compatibility; the output shape is
                taken from the execution context once the input shape is set.

        Returns:
            A new array with the shape TensorRT reports for the output.

        Raises:
            ValueError: if ``input_data`` does not fit the allocated buffer.
            RuntimeError: if TensorRT rejects the shape or the execution.
        """
        inp = self.inputs[0]
        out = self.outputs[0]

        flat = np.ascontiguousarray(input_data, dtype=inp["dtype"]).ravel()
        if flat.size > inp["host"].size:
            raise ValueError(
                f"Input has {flat.size} elements but the buffer holds only "
                f"{inp['host'].size}; increase max_batch_size"
            )
        # Fill only the used prefix so smaller batches fit a larger buffer.
        np.copyto(inp["host"][:flat.size], flat)

        if not self.context.set_input_shape(inp["name"], tuple(input_data.shape)):
            raise RuntimeError(f"TensorRT rejected input shape {input_data.shape}")

        cuda.memcpy_htod_async(inp["device"], inp["host"], self.stream)
        if not self.context.execute_async_v3(stream_handle=self.stream.handle):
            raise RuntimeError("TensorRT execution failed")
        cuda.memcpy_dtoh_async(out["host"], out["device"], self.stream)
        self.stream.synchronize()

        out_shape = tuple(self.context.get_tensor_shape(out["name"]))
        count = trt.volume(out_shape)
        # Copy: the pinned host buffer is overwritten by the next call.
        return out["host"][:count].reshape(out_shape).copy()

    def benchmark(self, input_shape: tuple, iterations: int = 100, warmup: int = 50):
        """Time ``infer`` on random data of ``input_shape`` and print statistics."""
        dummy = np.random.randn(*input_shape).astype(np.float32)
        for _ in range(warmup):
            self.infer(dummy)

        times = []
        for _ in range(iterations):
            start = time.perf_counter()
            self.infer(dummy)
            times.append(time.perf_counter() - start)
        times = np.array(times) * 1000  # seconds -> milliseconds
        print(f"平均延迟: {times.mean():.2f} ms")
        print(f"P50 延迟: {np.percentile(times, 50):.2f} ms")
        print(f"P95 延迟: {np.percentile(times, 95):.2f} ms")
        print(f"P99 延迟: {np.percentile(times, 99):.2f} ms")
        print(f"吞吐量: {1000/times.mean():.1f} FPS")

    def __del__(self):
        # Let pending work on the stream finish before the buffers are released.
        if hasattr(self, 'stream'):
            self.stream.synchronize()
if __name__ == "__main__":
    # Usage example: load the FP16 engine and push one random image through it.
    runner = TRTInference("resnet50_fp16.engine")
    sample = np.random.randn(1, 3, 224, 224).astype(np.float32)
    logits = runner.infer(sample)
    print(f"输出 shape: {logits.shape}")
    print(f"Top-5 类别: {np.argsort(logits[0])[-5:][::-1]}")
    # Latency / throughput measurement on the same input shape.
    runner.benchmark((1, 3, 224, 224))
6. INT8 量化(进阶)
INT8 量化可以将推理速度再提升 50-100%,但需要校准数据集:
python
#!/usr/bin/env python3
"""int8_calibration.py - INT8 量化校准"""
import glob

import numpy as np
import pycuda.autoinit  # noqa: F401  (imported for its side effect: creates the CUDA context)
import pycuda.driver as cuda
import tensorrt as trt
from PIL import Image
class ImageCalibrator(trt.IInt8EntropyCalibrator2):
    """INT8 entropy calibrator that feeds batches of preprocessed JPEG images.

    Up to 500 ``.jpg`` files found recursively under ``data_dir`` are used.
    Each image is resized, scaled to [0, 1], normalized with the mean/std
    constants below, and laid out as CHW float32.
    """

    _MEAN = np.array([0.485, 0.456, 0.406], dtype=np.float32)
    _STD = np.array([0.229, 0.224, 0.225], dtype=np.float32)

    def __init__(
        self,
        data_dir: str,
        batch_size: int = 8,
        cache_file: str = "calibration.cache",
        input_shape: tuple = (3, 224, 224),
    ):
        """Collect the calibration images and allocate one device batch buffer.

        Args:
            data_dir: Directory searched recursively for ``*.jpg`` files.
            batch_size: Number of images per calibration batch.
            cache_file: File used to read/write the calibration cache.
            input_shape: Per-image ``(channels, height, width)``; the mean/std
                constants assume 3 channels.
        """
        super().__init__()
        self.batch_size = batch_size
        self.cache_file = cache_file
        self.input_shape = tuple(input_shape)
        # Sort so the selected subset is reproducible across runs.
        self.images = sorted(glob.glob(f"{data_dir}/**/*.jpg", recursive=True))[:500]
        self.current_index = 0
        channels, height, width = self.input_shape
        # 4 bytes per float32 element.
        self.device_input = cuda.mem_alloc(batch_size * channels * height * width * 4)

    def preprocess(self, img_path: str) -> np.ndarray:
        """Load one image and return it as a normalized CHW float32 array."""
        _, height, width = self.input_shape
        with Image.open(img_path) as img:
            # Force 3 channels so grayscale / RGBA / palette files also work.
            pixels = np.asarray(
                img.convert("RGB").resize((width, height)), dtype=np.float32
            )
        pixels = (pixels / 255.0 - self._MEAN) / self._STD
        return pixels.transpose(2, 0, 1)  # HWC -> CHW

    def get_batch_size(self):
        return self.batch_size

    def get_batch(self, names):
        """Upload the next full batch and return its device pointer list.

        Returns ``None`` once fewer than ``batch_size`` images remain: a
        partial batch would leave stale data in the rest of the device buffer
        while TensorRT still reads ``batch_size`` images.
        """
        end = self.current_index + self.batch_size
        if end > len(self.images):
            return None
        paths = self.images[self.current_index:end]
        self.current_index = end
        batch = np.ascontiguousarray(
            np.stack([self.preprocess(path) for path in paths]), dtype=np.float32
        )
        cuda.memcpy_htod(self.device_input, batch)
        return [int(self.device_input)]

    def read_calibration_cache(self):
        """Return the cached calibration bytes, or ``None`` if there is no cache."""
        try:
            with open(self.cache_file, "rb") as f:
                return f.read()
        except FileNotFoundError:
            return None

    def write_calibration_cache(self, cache):
        """Persist the calibration table to ``cache_file``."""
        with open(self.cache_file, "wb") as f:
            f.write(cache)
7. Jetson 功耗监控与推理优化
python
#!/usr/bin/env python3
"""power_monitor.py - 推理时功耗监控"""
import subprocess
import threading
import time
import re
class JetsonPowerMonitor:
    """Sample board power and temperature from sysfs in a background thread.

    The sysfs layout of the power sensor differs between boards and JetPack
    releases, so both file paths can be overridden; the defaults are the
    paths this script has always used. A file that cannot be read or parsed
    yields a reading of 0.0 (best effort).
    """

    DEFAULT_POWER_PATH = (
        "/sys/bus/i2c/drivers/ina3221/1-0040/iio:device0/in_power0_input"
    )
    DEFAULT_TEMP_PATH = "/sys/devices/virtual/thermal/thermal_zone0/temp"

    def __init__(self, power_path=None, temp_path=None, interval=0.5):
        """Create an idle monitor.

        Args:
            power_path: sysfs file holding the power reading (defaults to
                ``DEFAULT_POWER_PATH``).
            temp_path: sysfs file holding the temperature reading (defaults
                to ``DEFAULT_TEMP_PATH``).
            interval: Seconds between two samples.
        """
        self.power_path = power_path or self.DEFAULT_POWER_PATH
        self.temp_path = temp_path or self.DEFAULT_TEMP_PATH
        self.interval = interval
        self.monitoring = False
        self.power_data = []
        self._thread = None
        self._stop_event = threading.Event()
        self._started_at = None

    @staticmethod
    def _read_scaled(path):
        """Read a number from ``path`` and divide it by 1000; 0.0 on failure."""
        try:
            with open(path, encoding="ascii") as f:
                return float(f.read().strip()) / 1000
        except (OSError, ValueError):
            return 0.0

    def _read_power(self):
        """Return the current power reading, scaled from milliwatts to watts."""
        return self._read_scaled(self.power_path)

    def _read_temp(self):
        """Return the current temperature reading divided by 1000."""
        return self._read_scaled(self.temp_path)

    def start(self):
        """Start sampling in a daemon thread (no-op if already running)."""
        if self._thread is not None and self._thread.is_alive():
            return
        self.monitoring = True
        self.power_data = []
        self._stop_event.clear()
        self._started_at = time.monotonic()
        self._thread = threading.Thread(target=self._monitor_loop, daemon=True)
        self._thread.start()

    def stop(self):
        """Stop sampling and return summary statistics.

        Returns:
            A dict with average/maximum power and temperature, the measured
            duration in seconds and the sample count, or ``None`` when no
            sample was collected (including when ``start`` was never called).
        """
        self.monitoring = False
        self._stop_event.set()
        if self._thread is not None:
            self._thread.join(timeout=2)
            self._thread = None
        samples = list(self.power_data)
        if not samples:
            return None
        powers = [d["power"] for d in samples]
        temps = [d["temp"] for d in samples]
        if self._started_at is None:
            duration = len(samples) * self.interval
        else:
            duration = time.monotonic() - self._started_at
        return {
            "avg_power_w": sum(powers) / len(powers),
            "max_power_w": max(powers),
            "avg_temp_c": sum(temps) / len(temps),
            "max_temp_c": max(temps),
            "duration_s": duration,
            "samples": len(samples)
        }

    def _monitor_loop(self):
        # Sample first, then wait: even a very short run records one sample.
        # Waiting on the event (instead of sleeping) lets stop() return at once.
        while True:
            self.power_data.append({
                "power": self._read_power(),
                "temp": self._read_temp(),
                "time": time.time()
            })
            if self._stop_event.wait(self.interval) or not self.monitoring:
                break
# Usage example
if __name__ == "__main__":
    monitor = JetsonPowerMonitor()
    monitor.start()
    # Run inference here...
    time.sleep(5)  # stand-in for the inference workload
    stats = monitor.stop()
    # stop() returns None when no sample was collected; do not index into it.
    if stats is None:
        print("⚠️ 未采集到功耗数据")
    else:
        print(f"平均功耗: {stats['avg_power_w']:.1f} W")
        print(f"峰值功耗: {stats['max_power_w']:.1f} W")
        print(f"平均温度: {stats['avg_temp_c']:.1f} °C")
总结
| 方法 | 适用场景 | 转换速度 | 推理速度 |
|---|---|---|---|
| torch2trt | 快速原型 | ⚡ 快 | ★★★★ |
| trtexec 命令行 | 标准模型 | ⚡ 快 | ★★★★★ |
| Python API | 自定义模型 | 🐢 慢 | ★★★★★ |
| INT8 量化 | 极致性能 | 🐢 慢 | ★★★★★+ |
核心步骤:
- PyTorch → ONNX:推荐 opset_version=17(不要超过所用 TensorRT 版本的 ONNX 解析器支持的最高 opset)
- ONNX → TensorRT:推荐 trtexec 一键转换
- 建议默认开启 FP16:TensorRT 默认按 FP32 构建,需显式指定 --fp16 或 BuilderFlag.FP16;通常几乎无精度损失,延迟约减半
- INT8 需要校准:准备 200-500 张代表性图片
- 动态 batch:按实际场景设置 min/opt/max shape