YOLO26N INT8 量化:极致压缩与边缘加速

YOLO26N INT8 量化:极致压缩与边缘加速

1. INT8 量化收益

复制代码
YOLO26N 量化对比:
┌──────────┬──────────┬──────────┬──────────┬──────────┐
│ 精度      │ 模型大小  │ 内存占用  │ 推理延迟   │ mAP      │
├──────────┼──────────┼──────────┼──────────┼──────────┤
│ FP32     │ 10.4MB   │ 42MB     │ 12ms     │ 38.5     │
│ FP16     │ 5.2MB    │ 21MB     │ 4.5ms    │ 38.4     │
│ INT8     │ 2.6MB    │ 11MB     │ 3.2ms    │ 37.8     │
└──────────┴──────────┴──────────┴──────────┴──────────┘

INT8 优势:
├── 模型大小:减少 4x(10.4MB → 2.6MB)
├── 推理速度:提升约 3.75x(12ms → 3.2ms)
├── 精度损失:仅 0.7 mAP(38.5 → 37.8)
└── 功耗降低:约 30%

2. PTQ 量化(训练后量化)

python 复制代码
#!/usr/bin/env python3
"""ptq_int8.py - PTQ INT8 量化"""
import tensorrt as trt
import numpy as np
import cv2
import glob

class INT8Calibrator(trt.IInt8EntropyCalibrator2):
    """Entropy calibrator that streams preprocessed JPEG batches to TensorRT.

    Images are discovered recursively under ``data_dir`` (``*.jpg`` only),
    sorted by path, and capped at ``max_images``. Each batch is resized to
    ``input_size`` x ``input_size``, converted to RGB, scaled to [0, 1], laid
    out as NCHW float32 and copied into a single reusable device buffer.

    Only full batches are served: a trailing remainder smaller than
    ``batch_size`` is dropped, because the device buffer is always sized for
    a full batch and a short copy would leave stale data in the unused slots.

    Args:
        data_dir: Directory searched recursively for ``.jpg`` images.
        batch_size: Number of images per calibration batch.
        cache_file: Path used to read/write the calibration cache.
        input_size: Square side length the images are resized to.
        max_images: Upper bound on the number of calibration images used.
    """

    def __init__(self, data_dir, batch_size=8, cache_file="yolo26n_int8.cache",
                 input_size=640, max_images=200):
        super().__init__()
        # Importing pycuda.autoinit creates and activates a CUDA context;
        # without one, mem_alloc below has no context to allocate in.
        import pycuda.autoinit  # noqa: F401
        import pycuda.driver as cuda

        self.batch_size = batch_size
        self.cache_file = cache_file
        self.input_size = input_size
        self.images = sorted(
            glob.glob(f"{data_dir}/**/*.jpg", recursive=True)
        )[:max_images]
        self.current_index = 0

        # One float32 NCHW batch: batch * 3 channels * H * W * 4 bytes.
        self.device_input = cuda.mem_alloc(
            batch_size * 3 * input_size * input_size * 4
        )

    def preprocess(self, img_path):
        """Load one image and return it as a float32 CHW array in [0, 1].

        Raises:
            ValueError: If OpenCV cannot read or decode ``img_path``.
        """
        img = cv2.imread(img_path)
        if img is None:
            # cv2.imread signals failure by returning None, not by raising.
            raise ValueError(f"cannot read calibration image: {img_path}")
        img = cv2.resize(img, (self.input_size, self.input_size))
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        return (img.astype(np.float32) / 255.0).transpose(2, 0, 1)

    def get_batch_size(self):
        """Return the batch size used for calibration."""
        return self.batch_size

    def get_batch(self, names):
        """Copy the next full batch to the device and return its pointer.

        Returns:
            A one-element list holding the device buffer address, or ``None``
            once fewer than ``batch_size`` images remain.
        """
        import pycuda.driver as cuda

        end = self.current_index + self.batch_size
        if end > len(self.images):
            # Not enough images left for a full batch: end calibration.
            return None

        paths = self.images[self.current_index:end]
        self.current_index = end

        batch = np.stack([self.preprocess(path) for path in paths])
        batch = np.ascontiguousarray(batch, dtype=np.float32)
        cuda.memcpy_htod(self.device_input, batch)
        return [int(self.device_input)]

    def read_calibration_cache(self):
        """Return the cached calibration table bytes, or ``None`` if absent."""
        try:
            with open(self.cache_file, "rb") as f:
                return f.read()
        except FileNotFoundError:
            return None

    def write_calibration_cache(self, cache):
        """Persist the calibration table to ``cache_file``."""
        with open(self.cache_file, "wb") as f:
            f.write(cache)

def build_int8_engine(onnx_path, engine_path, calib_dir):
    """Build a serialized INT8 (with FP16 fallback) TensorRT engine from ONNX.

    Args:
        onnx_path: Path to the ONNX model to parse.
        engine_path: Destination path for the serialized engine.
        calib_dir: Directory of calibration images passed to INT8Calibrator.

    Raises:
        RuntimeError: If the ONNX model cannot be parsed or the engine build
            fails.
    """
    logger = trt.Logger(trt.Logger.WARNING)
    builder = trt.Builder(logger)
    network = builder.create_network(
        1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
    )
    parser = trt.OnnxParser(network, logger)

    with open(onnx_path, "rb") as f:
        model_bytes = f.read()
    if not parser.parse(model_bytes):
        # parse() reports failure through its return value; collect the
        # parser's recorded errors so the failure is actionable.
        errors = "\n".join(
            str(parser.get_error(i)) for i in range(parser.num_errors)
        )
        raise RuntimeError(f"failed to parse ONNX model {onnx_path}:\n{errors}")

    config = builder.create_builder_config()
    config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 1 << 30)
    config.set_flag(trt.BuilderFlag.INT8)
    config.set_flag(trt.BuilderFlag.FP16)  # mixed INT8 + FP16 precision
    config.int8_calibrator = INT8Calibrator(calib_dir, batch_size=8)

    print("构建 INT8 引擎...")
    engine = builder.build_serialized_network(network, config)
    if engine is None:
        # A failed build returns None; writing it out would raise a
        # confusing TypeError and leave an empty engine file behind.
        raise RuntimeError("TensorRT failed to build the INT8 engine")

    with open(engine_path, "wb") as f:
        f.write(engine)
    print(f"已保存: {engine_path}")

if __name__ == "__main__":
    # Script entry point: build the INT8 engine from the exported ONNX model
    # using the images under calibration_images/ for calibration.
    build_int8_engine(
        onnx_path="yolo26n.onnx",
        engine_path="yolo26n_int8.engine",
        calib_dir="calibration_images/",
    )

3. Ultralytics 一键量化

python 复制代码
from ultralytics import YOLO

# Load the YOLO26N checkpoint.
model = YOLO("yolo26n.pt")

# One-step INT8 export to a TensorRT engine (calibration data is required).
export_options = {
    "format": "engine",
    "imgsz": 640,
    "int8": True,
    "batch": 1,
    "data": "coco.yaml",  # dataset config used for calibration
}
model.export(**export_options)

4. 精度校验

python 复制代码
#!/usr/bin/env python3
"""quant_eval.py - 量化精度校验"""
import numpy as np
import onnxruntime as ort

def _preprocess(img_path, input_size=640):
    """Load an image as a (1, 3, input_size, input_size) float32 RGB tensor.

    Raises:
        FileNotFoundError: If OpenCV cannot read or decode ``img_path``.
    """
    import cv2  # local import: this script does not import cv2 at top level

    img = cv2.imread(img_path)
    if img is None:
        # cv2.imread signals failure by returning None, not by raising.
        raise FileNotFoundError(f"cannot read test image: {img_path}")
    img = cv2.resize(img, (input_size, input_size))
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    chw = (img.astype(np.float32) / 255.0).transpose(2, 0, 1)
    # Add the batch axis and make the buffer contiguous for the runtime.
    return np.ascontiguousarray(chw[np.newaxis])


def _cosine_similarity(a, b):
    """Return the cosine similarity of two arrays after flattening."""
    a = np.asarray(a, dtype=np.float64).ravel()
    b = np.asarray(b, dtype=np.float64).ravel()
    norm_a = np.linalg.norm(a)
    norm_b = np.linalg.norm(b)
    if norm_a == 0.0 or norm_b == 0.0:
        # Avoid 0/0 -> NaN: two all-zero outputs agree, otherwise they do not.
        return 1.0 if norm_a == norm_b else 0.0
    return float(np.dot(a, b) / (norm_a * norm_b))


def compare_fp32_int8(fp32_model, int8_model, test_images):
    """Compare the first output of an FP32 and an INT8 ONNX model.

    Runs both models on every image, computes the cosine similarity of their
    first outputs, then prints the average and a verdict.

    Args:
        fp32_model: Path to the FP32 ONNX model.
        int8_model: Path to the INT8 ONNX model.
        test_images: Iterable of image paths to evaluate.

    Returns:
        The average cosine similarity as a float.

    Raises:
        ValueError: If ``test_images`` is empty.
    """
    test_images = list(test_images)
    if not test_images:
        # An empty set would yield a NaN mean and a misleading verdict.
        raise ValueError("test_images is empty; nothing to compare")

    fp32_session = ort.InferenceSession(fp32_model)
    int8_session = ort.InferenceSession(int8_model)
    # Query each model for its input name instead of assuming "images".
    fp32_input = fp32_session.get_inputs()[0].name
    int8_input = int8_session.get_inputs()[0].name

    cosine_sims = []
    for img_path in test_images:
        img = _preprocess(img_path)
        fp32_out = fp32_session.run(None, {fp32_input: img})[0]
        int8_out = int8_session.run(None, {int8_input: img})[0]
        cosine_sims.append(_cosine_similarity(fp32_out, int8_out))

    avg_cosine = float(np.mean(cosine_sims))
    print(f"平均余弦相似度: {avg_cosine:.6f}")

    if avg_cosine > 0.99:
        print("✅ 量化精度优秀")
    elif avg_cosine > 0.95:
        print("⚠️ 量化精度可接受")
    else:
        print("❌ 量化精度不足")

    return avg_cosine

if __name__ == "__main__":
    import glob

    # Collect the evaluation images explicitly so `test_images` is defined.
    # NOTE(review): "test_images/" is a placeholder directory — point it at
    # the real evaluation set.
    test_images = sorted(glob.glob("test_images/**/*.jpg", recursive=True))
    compare_fp32_int8("yolo26n.onnx", "yolo26n_int8.onnx", test_images)

5. 各平台 INT8 性能

复制代码
YOLO26N INT8 性能:
┌──────────────────┬──────────┬──────────┬──────────┐
│ 平台              │ FP16     │ INT8     │ 加速比    │
├──────────────────┼──────────┼──────────┼──────────┤
│ Jetson Orin NX   │ 4.5ms    │ 3.2ms    │ 1.4x     │
│ Jetson Orin Nano │ 8.0ms    │ 5.5ms    │ 1.5x     │
│ RK3588 NPU       │ -        │ 8.0ms    │ -        │
│ Hailo-8          │ -        │ 3.0ms    │ -        │
│ Intel NCS2       │ -        │ 15ms     │ -        │
└──────────────────┴──────────┴──────────┴──────────┘

总结

| 步骤 | 工具 | 关键参数 |
| --- | --- | --- |
| 校准数据 | 200+ 张代表性图片 | 覆盖全场景 |
| PTQ 量化 | TensorRT INT8 | EntropyCalibrator |
| 精度校验 | 余弦相似度 > 0.99 | 逐层对比 |
| 性能测试 | trtexec | 延迟/FPS/功耗 |