YOLO26N INT8 量化:极致压缩与边缘加速
1. INT8 量化收益
YOLO26N 量化对比:
┌──────────┬──────────┬──────────┬──────────┬──────────┐
│ 精度 │ 模型大小 │ 内存占用 │ 推理延迟 │ mAP │
├──────────┼──────────┼──────────┼──────────┼──────────┤
│ FP32 │ 10.4MB │ 42MB │ 12ms │ 38.5 │
│ FP16 │ 5.2MB │ 21MB │ 4.5ms │ 38.4 │
│ INT8 │ 2.6MB │ 11MB │ 3.2ms │ 37.8 │
└──────────┴──────────┴──────────┴──────────┴──────────┘
INT8 优势:
├── 模型大小:减少 4x(10.4MB → 2.6MB)
├── 推理速度:提升 3.7x(12ms → 3.2ms)
├── 精度损失:仅 0.7 mAP(38.5 → 37.8)
└── 功耗降低:约 30%
2. PTQ 量化(训练后量化)
#!/usr/bin/env python3
"""ptq_int8.py - PTQ INT8 量化"""
import tensorrt as trt
import numpy as np
import cv2
import glob
class INT8Calibrator(trt.IInt8EntropyCalibrator2):
    """Entropy calibrator that streams preprocessed JPEG batches to TensorRT.

    Images are discovered recursively under ``data_dir`` (``*.jpg`` only),
    sorted by path, and capped at the first 200 files. Each batch is uploaded
    into a single reusable device buffer.

    Args:
        data_dir: Root directory searched recursively for ``.jpg`` files.
        batch_size: Number of images handed to TensorRT per calibration step.
        cache_file: Path used to read/write the calibration cache.
        img_size: Square side length images are resized to (default 640).

    NOTE(review): pycuda requires an active CUDA context before
    ``cuda.mem_alloc`` is called; this class does not create one itself, so
    the caller is expected to have set one up — confirm.
    """

    def __init__(self, data_dir, batch_size=8, cache_file="yolo26n_int8.cache",
                 img_size=640):
        super().__init__()
        import pycuda.driver as cuda

        self.batch_size = batch_size
        self.cache_file = cache_file
        self.img_size = img_size
        self.images = sorted(glob.glob(f"{data_dir}/**/*.jpg", recursive=True))[:200]
        self.current_index = 0
        # One float32 (4-byte) buffer of shape (batch, 3, img_size, img_size),
        # reused for every batch.
        self.device_input = cuda.mem_alloc(batch_size * 3 * img_size * img_size * 4)

    def preprocess(self, img_path):
        """Load one image as a float32 CHW array scaled to [0, 1].

        Raises:
            ValueError: If OpenCV cannot read ``img_path``.
        """
        img = cv2.imread(img_path)
        if img is None:
            # cv2.imread signals failure by returning None instead of raising.
            raise ValueError(f"Cannot read calibration image: {img_path}")
        img = cv2.resize(img, (self.img_size, self.img_size))
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        return (img.astype(np.float32) / 255.0).transpose(2, 0, 1)

    def get_batch_size(self):
        """Return the number of images supplied per calibration batch."""
        return self.batch_size

    def get_batch(self, names):
        """Upload the next full batch and return its device pointer.

        Args:
            names: Input tensor names supplied by TensorRT (unused; a single
                input is assumed).

        Returns:
            A one-element list holding the device buffer address, or ``None``
            once fewer than ``batch_size`` images remain.
        """
        import pycuda.driver as cuda

        start = self.current_index
        stop = start + self.batch_size
        # Each batch must contain exactly `batch_size` samples. A trailing
        # partial batch is dropped: uploading it would leave stale data from
        # the previous batch in the rest of the device buffer.
        if stop > len(self.images):
            return None

        samples = [self.preprocess(path) for path in self.images[start:stop]]
        self.current_index = stop
        # The transpose in preprocess() yields non-contiguous views; make the
        # stacked batch contiguous float32 before the host-to-device copy.
        batch = np.ascontiguousarray(np.stack(samples), dtype=np.float32)
        cuda.memcpy_htod(self.device_input, batch)
        return [int(self.device_input)]

    def read_calibration_cache(self):
        """Return the cached calibration bytes, or ``None`` if no cache file exists."""
        try:
            with open(self.cache_file, "rb") as f:
                return f.read()
        except FileNotFoundError:
            return None

    def write_calibration_cache(self, cache):
        """Persist the calibration table produced by TensorRT to ``cache_file``."""
        with open(self.cache_file, "wb") as f:
            f.write(cache)
def build_int8_engine(onnx_path, engine_path, calib_dir):
    """Build a mixed INT8/FP16 TensorRT engine from an ONNX model and save it.

    Args:
        onnx_path: Path to the source ONNX model.
        engine_path: Destination path for the serialized engine.
        calib_dir: Directory of calibration images passed to ``INT8Calibrator``.

    Raises:
        RuntimeError: If the ONNX model cannot be parsed or the engine build
            fails.
    """
    logger = trt.Logger(trt.Logger.WARNING)
    builder = trt.Builder(logger)
    network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
    parser = trt.OnnxParser(network, logger)
    with open(onnx_path, "rb") as f:
        parsed_ok = parser.parse(f.read())
    if not parsed_ok:
        # parse() reports failure through its return value; surface the
        # collected parser errors instead of building from a broken network.
        details = "; ".join(str(parser.get_error(i)) for i in range(parser.num_errors))
        raise RuntimeError(f"Failed to parse ONNX model {onnx_path}: {details}")

    config = builder.create_builder_config()
    config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 1 << 30)  # 1 GiB
    config.set_flag(trt.BuilderFlag.INT8)
    config.set_flag(trt.BuilderFlag.FP16)  # allow mixed INT8 + FP16 layers
    # Hold a local reference so the calibrator stays alive during the build.
    calibrator = INT8Calibrator(calib_dir, batch_size=8)
    config.int8_calibrator = calibrator

    print("构建 INT8 引擎...")
    engine = builder.build_serialized_network(network, config)
    if engine is None:
        # A failed build returns None; fail loudly before opening the output.
        raise RuntimeError(f"TensorRT failed to build an INT8 engine from {onnx_path}")
    with open(engine_path, "wb") as f:
        f.write(engine)
    print(f"已保存: {engine_path}")
if __name__ == "__main__":
    # pycuda allocations need an active CUDA context. Create it here, before
    # the TensorRT builder exists, so both use the same context.
    import pycuda.autoinit  # noqa: F401

    build_int8_engine("yolo26n.onnx", "yolo26n_int8.engine", "calibration_images/")
3. Ultralytics 一键量化
from ultralytics import YOLO
# One-shot INT8 export to a TensorRT engine (calibration data is required).
export_options = {
    "format": "engine",
    "imgsz": 640,
    "int8": True,
    "batch": 1,
    "data": "coco.yaml",  # dataset config used for calibration
}
model = YOLO("yolo26n.pt")
model.export(**export_options)
4. 精度校验
#!/usr/bin/env python3
"""quant_eval.py - 量化精度校验"""
import numpy as np
import onnxruntime as ort
def _load_model_input(img_path, img_size=640):
    """Read an image as a normalized float32 RGB batch of shape (1, 3, H, W)."""
    import cv2  # local import keeps this helper self-contained

    img = cv2.imread(img_path)
    if img is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise ValueError(f"Cannot read test image: {img_path}")
    img = cv2.resize(img, (img_size, img_size))
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    chw = (img.astype(np.float32) / 255.0).transpose(2, 0, 1)
    return np.ascontiguousarray(chw[np.newaxis])


def _cosine_similarity(a, b):
    """Return the cosine similarity of two arrays, flattened, in float64."""
    a = np.asarray(a, dtype=np.float64).ravel()
    b = np.asarray(b, dtype=np.float64).ravel()
    denom = np.linalg.norm(a) * np.linalg.norm(b)
    if denom == 0.0:
        # Avoid 0/0: two all-zero outputs are identical, otherwise unrelated.
        return 1.0 if not a.any() and not b.any() else 0.0
    return float(np.dot(a, b) / denom)


def compare_fp32_int8(fp32_model, int8_model, test_images):
    """Compare FP32 and INT8 model outputs by mean cosine similarity.

    Args:
        fp32_model: Path to the FP32 ONNX model.
        int8_model: Path to the INT8 ONNX model.
        test_images: Iterable of image file paths to evaluate.

    Returns:
        The mean cosine similarity of the first output across all images.

    Raises:
        ValueError: If ``test_images`` is empty or an image cannot be read.
    """
    image_paths = list(test_images)
    if not image_paths:
        raise ValueError("test_images is empty; nothing to compare")

    fp32_session = ort.InferenceSession(fp32_model)
    int8_session = ort.InferenceSession(int8_model)
    # Query each model's input name rather than assuming it is "images".
    fp32_input = fp32_session.get_inputs()[0].name
    int8_input = int8_session.get_inputs()[0].name

    cosine_sims = []
    for img_path in image_paths:
        img = _load_model_input(img_path)
        fp32_out = fp32_session.run(None, {fp32_input: img})[0]
        int8_out = int8_session.run(None, {int8_input: img})[0]
        cosine_sims.append(_cosine_similarity(fp32_out, int8_out))

    avg_cosine = float(np.mean(cosine_sims))
    print(f"平均余弦相似度: {avg_cosine:.6f}")
    if avg_cosine > 0.99:
        print("✅ 量化精度优秀")
    elif avg_cosine > 0.95:
        print("⚠️ 量化精度可接受")
    else:
        print("❌ 量化精度不足")
    return avg_cosine
if __name__ == "__main__":
    import glob

    # Collect the evaluation images. NOTE(review): "test_images/" is a
    # placeholder directory — point it at your own held-out image set.
    test_images = sorted(glob.glob("test_images/**/*.jpg", recursive=True))
    compare_fp32_int8("yolo26n.onnx", "yolo26n_int8.onnx", test_images)
5. 各平台 INT8 性能
YOLO26N INT8 性能:
┌──────────────────┬──────────┬──────────┬──────────┐
│ 平台 │ FP16 │ INT8 │ 加速比 │
├──────────────────┼──────────┼──────────┼──────────┤
│ Jetson Orin NX │ 4.5ms │ 3.2ms │ 1.4x │
│ Jetson Orin Nano │ 8.0ms │ 5.5ms │ 1.5x │
│ RK3588 NPU │ - │ 8.0ms │ - │
│ Hailo-8 │ - │ 3.0ms │ - │
│ Intel NCS2 │ - │ 15ms │ - │
└──────────────────┴──────────┴──────────┴──────────┘
总结
| 步骤 | 工具 | 关键参数 |
|------|------|----------|
| 校准数据 | 200+ 张代表性图片 | 覆盖全场景 |
| PTQ 量化 | TensorRT INT8 | EntropyCalibrator |
| 精度校验 | 余弦相似度 > 0.99 | 逐层对比 |
| 性能测试 | trtexec | 延迟/FPS/功耗 |