YOLOv8 ONNX Quantized Model Deployment Guide
This guide walks through the complete pipeline from a trained YOLOv8 model to a quantized ONNX deployment. It covers: 1) environment setup and exporting the model to ONNX format; 2) FP16/INT8 quantization of the ONNX model; 3) deployment with ONNX Runtime and TensorRT; 4) performance optimization strategies such as IO binding and batching; 5) a complete deployment script and practical notes. The approach supports static and dynamic input sizes, offers multiple quantization precisions, includes benchmarking, and targets edge-computing and embedded deployment scenarios.
I. Environment Setup
bash
# Base environment (install ONE of onnxruntime / onnxruntime-gpu; mixing both can cause conflicts)
pip install ultralytics onnx onnxruntime
# pip install onnxruntime-gpu   # use this instead of onnxruntime on CUDA machines
# Quantization tools
pip install onnxruntime-tools
pip install neural-compressor       # Intel's quantization toolkit
pip install onnxsim                 # ONNX model simplification
pip install onnxconverter-common    # FP16 conversion helper (used in section III.3)
# Optional: TensorRT support
pip install tensorrt                # only if TensorRT deployment is needed
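Before proceeding, it is worth verifying the installation. The following minimal check (a sketch; adjust to your setup) prints the installed versions and the execution providers ONNX Runtime can actually use on this machine:
python
import onnx
import onnxruntime as ort

# Print package versions and available hardware-backed execution providers
print(f"onnx version: {onnx.__version__}")
print(f"onnxruntime version: {ort.__version__}")
print(f"available providers: {ort.get_available_providers()}")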
II. Exporting YOLOv8 to ONNX
1. Basic Export
python
from ultralytics import YOLO
import shutil

def export_yolov8_to_onnx(model_path, output_path, imgsz=640, simplify=True):
    """
    Export a YOLOv8 model to ONNX format.
    Args:
        model_path: path to the YOLO model (.pt)
        output_path: output ONNX path
        imgsz: input image size
        simplify: whether to simplify the model
    """
    # Load the model
    model = YOLO(model_path)
    # Export to ONNX; export() returns the path of the exported file
    exported_path = model.export(
        format="onnx",      # export format
        imgsz=imgsz,        # input size
        opset=12,           # ONNX opset version
        simplify=simplify,  # simplify the graph
        dynamic=False,      # static input shape (True enables dynamic axes)
        half=False,         # FP16 precision
        device='cpu'        # export device
    )
    if exported_path:
        # Ultralytics writes the ONNX file next to the .pt file; move it if needed
        if exported_path != output_path:
            shutil.move(exported_path, output_path)
        print(f"Model exported successfully to: {output_path}")
        return True
    else:
        print("Model export failed")
        return False

# Usage example
export_yolov8_to_onnx(
    model_path='yolov8n.pt',
    output_path='yolov8n.onnx'
)
2. Export with Dynamic Input Sizes
python
from ultralytics import YOLO
import torch
import onnx
from onnxsim import simplify

def export_yolov8_dynamic_onnx(model_path, output_path, batch_sizes=[1, 4, 8]):
    """
    Export an ONNX model that supports dynamic batch sizes.
    """
    model = YOLO(model_path)
    # Declare which axes are dynamic
    dynamic_axes = {
        'images': {0: 'batch_size'},   # dynamic batch dimension on the input
        'output0': {0: 'batch_size'},  # dynamic batch dimension on the output
    }
    # Example input used for tracing
    example_input = torch.randn(batch_sizes[0], 3, 640, 640)
    # Export the underlying torch module
    torch.onnx.export(
        model.model,               # the YOLO model's torch module
        example_input,
        output_path,
        input_names=['images'],
        output_names=['output0'],
        dynamic_axes=dynamic_axes,
        opset_version=12,
        do_constant_folding=True
    )
    # Simplify the exported graph
    onnx_model = onnx.load(output_path)
    model_simp, check = simplify(onnx_model)
    if check:
        onnx.save(model_simp, output_path.replace('.onnx', '_simplified.onnx'))
        print("Simplified model saved")
    return True
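To confirm the dynamic batch axis actually works, you can push several batch sizes through ONNX Runtime. A minimal sketch (assumes the _simplified.onnx file written above):
python
import numpy as np
import onnxruntime as ort

session = ort.InferenceSession('yolov8n_simplified.onnx', providers=['CPUExecutionProvider'])
input_name = session.get_inputs()[0].name
# Each batch size should produce a matching output batch dimension
for batch in (1, 4, 8):
    x = np.random.rand(batch, 3, 640, 640).astype(np.float32)
    y = session.run(None, {input_name: x})[0]
    print(f"batch {batch}: output shape {y.shape}")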
III. ONNX Model Quantization
1. ONNX Runtime Quantization (Static)
python
import onnx
from onnxruntime.quantization import quantize_static, CalibrationDataReader, QuantFormat, QuantType
import numpy as np
import cv2

class YOLOCalibrationDataReader(CalibrationDataReader):
    """Calibration data reader for YOLO models"""
    def __init__(self, calibration_dataset, input_shape=(640, 640), batch_size=1):
        """
        Args:
            calibration_dataset: list of calibration image paths
            input_shape: input size (H, W)
            batch_size: batch size
        """
        self.dataset = calibration_dataset
        self.input_shape = input_shape
        self.batch_size = batch_size
        self.current_index = 0

    def get_next(self):
        """Return the next calibration batch, or None when exhausted"""
        if self.current_index >= len(self.dataset):
            return None
        batch_data = []
        for _ in range(self.batch_size):
            if self.current_index >= len(self.dataset):
                break
            img_path = self.dataset[self.current_index]
            # Preprocess the image
            img = self.preprocess_image(img_path)
            batch_data.append(img)
            self.current_index += 1
        if not batch_data:
            return None
        # Stack into a batch; np.stack adds the batch dimension
        batch_array = np.stack(batch_data, axis=0).astype(np.float32)
        return {'images': batch_array}

    def preprocess_image(self, img_path):
        """Preprocess a single image into CHW float format"""
        # Read the image
        img = cv2.imread(img_path)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        # Resize; note cv2.resize expects (W, H)
        img = cv2.resize(img, (self.input_shape[1], self.input_shape[0]))
        # Normalize to 0-1
        img = img / 255.0
        # Convert to CHW layout (no batch dimension here; get_next stacks the batch)
        img = np.transpose(img, (2, 0, 1))
        return img
def quantize_yolov8_onnx(model_path, calibration_dataset, output_path):
    """
    Quantize a YOLOv8 ONNX model to INT8 with static quantization.
    Args:
        model_path: path to the ONNX model
        calibration_dataset: list of calibration image paths
        output_path: path for the quantized model
    """
    # Create the calibration data reader
    calibration_data_reader = YOLOCalibrationDataReader(
        calibration_dataset=calibration_dataset,
        input_shape=(640, 640),
        batch_size=1
    )
    # Run quantization; quantize_static writes the result to output_path
    quantize_static(
        model_input=model_path,
        model_output=output_path,
        calibration_data_reader=calibration_data_reader,
        quant_format=QuantFormat.QDQ,     # QDQ quantization format (recommended)
        per_channel=True,                 # per-channel weight quantization
        weight_type=QuantType.QInt8,      # weight quantization type
        activation_type=QuantType.QInt8,  # activation quantization type
        nodes_to_quantize=None,           # None = quantize all supported nodes
        nodes_to_exclude=None,            # nodes to exclude
        extra_options={
            'ActivationSymmetric': False,
            'WeightSymmetric': True,
            'EnableSubgraph': False
        }
    )
    print(f"Quantized model saved to: {output_path}")
    return output_path
# Usage example
calibration_images = [
    'calibration/image1.jpg',
    'calibration/image2.jpg',
    # ... more images
]
quantize_yolov8_onnx(
    model_path='yolov8n.onnx',
    calibration_dataset=calibration_images,
    output_path='yolov8n_quantized.onnx'
)
2. Quantization with Intel Neural Compressor
python
from neural_compressor import quantization
from neural_compressor.config import PostTrainingQuantConfig

def quantize_with_neural_compressor(model_path, calibration_dataset, output_path):
    """
    Quantize with Intel Neural Compressor
    """
    from neural_compressor.data import DataLoader, Datasets
    # Create a dataset (a dummy dataset here; substitute real calibration data in practice)
    dataset = Datasets('onnxrt_qdq')['dummy_v2'](
        input_shape=(1, 3, 640, 640),
        label_shape=(1,)
    )
    # Create the data loader
    dataloader = DataLoader(framework='onnxruntime', dataset=dataset)
    # Configure quantization
    config = PostTrainingQuantConfig(
        approach='static',                      # static quantization
        calibration_sampling_size=[8, 16, 32],  # calibration sample sizes
        op_type_dict={
            'Conv': {'weight': {'dtype': ['s8']}, 'activation': {'dtype': ['u8']}},
            'MatMul': {'weight': {'dtype': ['s8']}, 'activation': {'dtype': ['u8']}}
        },
        recipes={
            'smooth_quant': True,               # SmoothQuant
            'smooth_quant_args': {'alpha': 0.5}
        }
    )
    # Run quantization
    q_model = quantization.fit(
        model=model_path,
        conf=config,
        calib_dataloader=dataloader,
        eval_dataloader=dataloader
    )
    # Save the quantized model
    q_model.save(output_path)
    print(f"Neural Compressor quantized model saved: {output_path}")
    return output_path
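The dummy dataset above only exercises the pipeline; meaningful INT8 calibration needs real images. A sketch of a drop-in replacement, assuming Neural Compressor's (input, label) dataset convention (verify against your installed version):
python
import cv2
import numpy as np

class RealImageDataset:
    """Serves preprocessed calibration images as (tensor, label) pairs"""
    def __init__(self, image_paths, input_shape=(640, 640)):
        self.image_paths = image_paths
        self.input_shape = input_shape

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        img = cv2.imread(self.image_paths[idx])
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        img = cv2.resize(img, (self.input_shape[1], self.input_shape[0]))
        img = (img / 255.0).astype(np.float32).transpose(2, 0, 1)
        return img, 0  # the label is unused during calibration

# dataloader = DataLoader(framework='onnxruntime', dataset=RealImageDataset(paths))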
3. FP16 Conversion (Mixed Precision)
python
def convert_to_fp16(model_path, output_path):
    """
    Convert a model to FP16 precision
    """
    import onnx
    from onnxconverter_common import float16
    # Load the model
    model = onnx.load(model_path)
    # Convert to FP16
    model_fp16 = float16.convert_float_to_float16(
        model,
        keep_io_types=True,                  # keep inputs/outputs in FP32
        op_block_list=['NonMaxSuppression']  # keep certain ops in FP32
    )
    # Save the model
    onnx.save(model_fp16, output_path)
    print(f"FP16 model saved: {output_path}")
    return output_path
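FP16 conversion is usually near-lossless, but a quick parity check against the FP32 model is still worthwhile. A minimal sketch (assumes a yolov8n_fp16.onnx produced by the function above; since keep_io_types=True, both models accept FP32 input):
python
import numpy as np
import onnxruntime as ort

dummy = np.random.rand(1, 3, 640, 640).astype(np.float32)
for path in ('yolov8n.onnx', 'yolov8n_fp16.onnx'):
    sess = ort.InferenceSession(path, providers=['CPUExecutionProvider'])
    out = sess.run(None, {sess.get_inputs()[0].name: dummy})[0]
    print(f"{path}: output mean {out.mean():.6f}")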
IV. Deploying the Quantized Model
1. ONNX Runtime Inference Engine
python
import onnxruntime as ort
import numpy as np
import cv2
from typing import List, Tuple, Dict

class YOLOv8ONNXInference:
    """YOLOv8 ONNX inference engine"""
    def __init__(self, model_path: str, providers=None):
        """
        Initialize the inference engine.
        Args:
            model_path: path to the ONNX model
            providers: list of execution providers
        """
        if providers is None:
            # Pick the best available provider automatically
            available_providers = ort.get_available_providers()
            providers = []
            # Priority: TensorRT > CUDA > CPU
            if 'TensorrtExecutionProvider' in available_providers:
                providers.append('TensorrtExecutionProvider')
            if 'CUDAExecutionProvider' in available_providers:
                providers.append('CUDAExecutionProvider')
            providers.append('CPUExecutionProvider')
        # Session options
        session_options = ort.SessionOptions()
        session_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
        session_options.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL
        # Create the inference session
        self.session = ort.InferenceSession(
            model_path,
            sess_options=session_options,
            providers=providers
        )
        # Cache input/output metadata
        self.input_name = self.session.get_inputs()[0].name
        self.output_name = self.session.get_outputs()[0].name
        self.input_shape = self.session.get_inputs()[0].shape
        self.output_shape = self.session.get_outputs()[0].shape
        print("Model loaded:")
        print(f"  input name: {self.input_name}, shape: {self.input_shape}")
        print(f"  output name: {self.output_name}, shape: {self.output_shape}")
        print(f"  execution providers: {self.session.get_providers()}")
    def preprocess(self, image: np.ndarray, target_size: Tuple[int, int] = (640, 640)) -> np.ndarray:
        """
        Preprocess an image.
        Args:
            image: input image (H, W, C), BGR as read by cv2
            target_size: target size (H, W)
        Returns:
            preprocessed tensor (1, C, H, W)
        """
        # Resize; note cv2.resize expects (W, H)
        img_resized = cv2.resize(image, (target_size[1], target_size[0]))
        # BGR to RGB (if applicable)
        if len(img_resized.shape) == 3 and img_resized.shape[2] == 3:
            img_resized = cv2.cvtColor(img_resized, cv2.COLOR_BGR2RGB)
        # Normalize to 0-1
        img_normalized = img_resized / 255.0
        # Convert to CHW layout
        img_chw = np.transpose(img_normalized, (2, 0, 1))
        # Add the batch dimension
        img_batch = np.expand_dims(img_chw, axis=0).astype(np.float32)
        return img_batch
    def postprocess(self, outputs: np.ndarray,
                    confidence_threshold: float = 0.25,
                    iou_threshold: float = 0.45) -> List[Dict]:
        """
        Post-process the raw model output.
        YOLOv8 exports produce [batch, 4 + num_classes, num_anchors]
        (e.g. [1, 84, 8400] for the 80-class COCO models); unlike YOLOv5
        there is no separate objectness score.
        Args:
            outputs: model outputs
            confidence_threshold: confidence threshold
            iou_threshold: IoU threshold for NMS
        Returns:
            list of detection dicts
        """
        detections = []
        output = outputs[0][0]           # first batch -> (4 + nc, num_anchors)
        output = output.transpose(1, 0)  # -> (num_anchors, 4 + nc)
        for detection in output:
            # Box center/size followed by per-class scores
            x, y, w, h = detection[:4]
            class_scores = detection[4:]
            class_id = int(np.argmax(class_scores))
            confidence = float(class_scores[class_id])
            # Filter by confidence
            if confidence < confidence_threshold:
                continue
            # Convert center xywh to corner xyxy (in network input coordinates)
            x1 = x - w / 2
            y1 = y - h / 2
            x2 = x + w / 2
            y2 = y + h / 2
            detections.append({
                'bbox': [x1, y1, x2, y2],
                'confidence': confidence,
                'class_id': class_id
            })
        # Non-maximum suppression
        if detections and iou_threshold > 0:
            detections = self.non_max_suppression(detections, iou_threshold)
        return detections
    def non_max_suppression(self, detections: List[Dict], iou_threshold: float) -> List[Dict]:
        """Non-maximum suppression"""
        if not detections:
            return []
        # Sort by confidence, highest first
        detections.sort(key=lambda x: x['confidence'], reverse=True)
        keep = []
        while detections:
            # Take the highest-confidence detection
            best = detections.pop(0)
            keep.append(best)
            # Compare against the remaining detections
            i = 0
            while i < len(detections):
                iou = self.calculate_iou(best['bbox'], detections[i]['bbox'])
                # Drop detections that overlap too much
                if iou > iou_threshold:
                    detections.pop(i)
                else:
                    i += 1
        return keep
    def calculate_iou(self, box1: List[float], box2: List[float]) -> float:
        """Compute the IoU of two xyxy boxes"""
        # Intersection rectangle
        x1 = max(box1[0], box2[0])
        y1 = max(box1[1], box2[1])
        x2 = min(box1[2], box2[2])
        y2 = min(box1[3], box2[3])
        if x2 < x1 or y2 < y1:
            return 0.0
        intersection = (x2 - x1) * (y2 - y1)
        # Union area
        area1 = (box1[2] - box1[0]) * (box1[3] - box1[1])
        area2 = (box2[2] - box2[0]) * (box2[3] - box2[1])
        union = area1 + area2 - intersection
        return intersection / union if union > 0 else 0.0
    def infer(self, image: np.ndarray) -> List[Dict]:
        """
        Run inference on a single image.
        Args:
            image: input image
        Returns:
            detection results
        """
        # Preprocess
        input_tensor = self.preprocess(image)
        # Run the model
        outputs = self.session.run(
            [self.output_name],
            {self.input_name: input_tensor}
        )
        # Postprocess
        detections = self.postprocess(outputs)
        # Rescale boxes to the original image size
        orig_h, orig_w = image.shape[:2]
        input_h, input_w = self.input_shape[2], self.input_shape[3]
        for det in detections:
            bbox = det['bbox']
            # Scale back to original dimensions
            bbox[0] = bbox[0] * orig_w / input_w
            bbox[1] = bbox[1] * orig_h / input_h
            bbox[2] = bbox[2] * orig_w / input_w
            bbox[3] = bbox[3] * orig_h / input_h
            det['bbox'] = [int(coord) for coord in bbox]
        return detections
    def benchmark(self, image: np.ndarray, warmup=10, iterations=100):
        """
        Benchmark inference latency.
        Args:
            image: test image
            warmup: number of warmup iterations
            iterations: number of timed iterations
        """
        import time
        # Warm up
        print("Warming up...")
        for _ in range(warmup):
            self.infer(image)
        # Timed runs
        print(f"Running {iterations} inference iterations...")
        times = []
        for i in range(iterations):
            start_time = time.perf_counter()
            self.infer(image)
            end_time = time.perf_counter()
            times.append((end_time - start_time) * 1000)  # convert to milliseconds
            if (i + 1) % 10 == 0:
                print(f"Completed {i + 1}/{iterations} iterations")
        # Aggregate statistics
        avg_time = np.mean(times)
        min_time = np.min(times)
        max_time = np.max(times)
        std_time = np.std(times)
        fps = 1000 / avg_time
        print("\nBenchmark results:")
        print(f"  mean latency: {avg_time:.2f} ms")
        print(f"  min latency:  {min_time:.2f} ms")
        print(f"  max latency:  {max_time:.2f} ms")
        print(f"  std dev:      {std_time:.2f} ms")
        print(f"  FPS:          {fps:.2f}")
        return {
            'avg_time_ms': avg_time,
            'min_time_ms': min_time,
            'max_time_ms': max_time,
            'std_time_ms': std_time,
            'fps': fps
        }
# Usage example
def test_onnx_inference():
    # Initialize the inference engine
    detector = YOLOv8ONNXInference('yolov8n_quantized.onnx')
    # Read the image
    image = cv2.imread('test.jpg')
    # Run inference
    detections = detector.infer(image)
    # Print the results
    print(f"Detected {len(detections)} objects:")
    for i, det in enumerate(detections):
        print(f"{i+1}. class: {det['class_id']}, confidence: {det['confidence']:.2f}, "
              f"bbox: {det['bbox']}")
    # Benchmark
    stats = detector.benchmark(image, warmup=5, iterations=50)
    # Visualize the results
    for det in detections:
        x1, y1, x2, y2 = det['bbox']
        cv2.rectangle(image, (x1, y1), (x2, y2), (0, 255, 0), 2)
        label = f"Class {det['class_id']}: {det['confidence']:.2f}"
        cv2.putText(image, label, (x1, y1 - 10),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)
    cv2.imwrite('result.jpg', image)
    print("Result saved to result.jpg")

if __name__ == "__main__":
    test_onnx_inference()
2. TensorRT Deployment (Optional)
python
def convert_onnx_to_tensorrt(onnx_path, trt_path, precision='fp16'):
    """
    Convert an ONNX model to a TensorRT engine.
    Note: this uses the TensorRT 7.x/8.0-era builder API; newer releases
    replace max_workspace_size with config.set_memory_pool_limit and
    build_engine with build_serialized_network.
    Args:
        onnx_path: path to the ONNX model
        trt_path: path to save the TensorRT engine
        precision: precision ('fp32', 'fp16', 'int8')
    """
    import tensorrt as trt
    TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
    # Create the builder
    builder = trt.Builder(TRT_LOGGER)
    # Create the network definition (explicit batch)
    network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
    # Create the ONNX parser
    parser = trt.OnnxParser(network, TRT_LOGGER)
    # Parse the ONNX model
    with open(onnx_path, 'rb') as model:
        if not parser.parse(model.read()):
            print("Parsing failed:")
            for error in range(parser.num_errors):
                print(parser.get_error(error))
            return False
    # Builder configuration
    config = builder.create_builder_config()
    config.max_workspace_size = 1 << 30  # 1 GB
    # Set precision
    if precision == 'fp16':
        config.set_flag(trt.BuilderFlag.FP16)
    elif precision == 'int8':
        config.set_flag(trt.BuilderFlag.INT8)
        # INT8 additionally requires a calibrator implementing
        # trt.IInt8EntropyCalibrator2:
        # config.int8_calibrator = calibrator
    # Build the engine
    engine = builder.build_engine(network, config)
    if engine is None:
        print("Engine build failed")
        return False
    # Serialize and save the engine
    with open(trt_path, 'wb') as f:
        f.write(engine.serialize())
    print(f"TensorRT engine saved to: {trt_path}")
    return True
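If you only need TensorRT acceleration rather than a standalone engine file, an alternative is to let ONNX Runtime's TensorRT execution provider build and cache the engine for you. A minimal sketch using documented provider options:
python
import onnxruntime as ort

providers = [
    ('TensorrtExecutionProvider', {
        'trt_fp16_enable': True,          # build FP16 kernels
        'trt_engine_cache_enable': True,  # reuse built engines across runs
        'trt_engine_cache_path': './trt_cache',
    }),
    'CUDAExecutionProvider',  # fallback for unsupported subgraphs
    'CPUExecutionProvider',
]
session = ort.InferenceSession('yolov8n.onnx', providers=providers)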
V. Performance Optimization Strategies
1. IO Binding Optimization
python
class OptimizedYOLOv8ONNXInference(YOLOv8ONNXInference):
    """YOLOv8 ONNX inference engine with IO binding"""
    def __init__(self, model_path: str, providers=None):
        super().__init__(model_path, providers)
        # Enable IO binding
        self.io_binding = self.session.io_binding()
        # Preallocate buffers
        self._preallocate_buffers()

    def _preallocate_buffers(self):
        """Preallocate the output buffer (assumes a static output shape)"""
        output_info = self.session.get_outputs()[0]
        # Allocate an OrtValue matching the full output shape
        self.output_buffer = ort.OrtValue.ortvalue_from_numpy(
            np.zeros(tuple(output_info.shape), dtype=np.float32)
        )

    def infer_optimized(self, image: np.ndarray) -> List[Dict]:
        """Inference with pre-bound input/output buffers"""
        # Preprocess
        input_tensor = self.preprocess(image)
        # Wrap the input in an OrtValue
        input_ortvalue = ort.OrtValue.ortvalue_from_numpy(input_tensor)
        # Clear previous bindings
        self.io_binding.clear_binding_inputs()
        self.io_binding.clear_binding_outputs()
        # Bind input and output directly to the OrtValues
        self.io_binding.bind_ortvalue_input(self.input_name, input_ortvalue)
        self.io_binding.bind_ortvalue_output(self.output_name, self.output_buffer)
        # Run inference
        self.session.run_with_iobinding(self.io_binding)
        # Copy outputs back to CPU
        outputs = self.io_binding.copy_outputs_to_cpu()
        # Postprocess (coordinates remain in network input space)
        detections = self.postprocess(outputs)
        return detections
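Usage mirrors the base class; note that infer_optimized returns boxes in network input coordinates, since it skips the rescaling step of infer. A minimal sketch:
python
import cv2

detector = OptimizedYOLOv8ONNXInference('yolov8n_quantized.onnx')
image = cv2.imread('test.jpg')
detections = detector.infer_optimized(image)
print(f"{len(detections)} detections via IO binding")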
2. Batch Processing Optimization
python
class YOLOv8BatchInference:
    """YOLOv8 batched inference (requires a model exported with a dynamic batch axis)"""
    def __init__(self, model_path: str, batch_size: int = 4):
        self.batch_size = batch_size
        self.inference_engine = YOLOv8ONNXInference(model_path)
        self.batch_buffer = []

    def add_to_batch(self, image: np.ndarray) -> bool:
        """Add an image to the batch buffer; returns False when the buffer is full"""
        if len(self.batch_buffer) >= self.batch_size:
            return False
        # Preprocess and buffer
        processed = self.inference_engine.preprocess(image)
        self.batch_buffer.append((image, processed))
        return True

    def process_batch(self):
        """Run inference on the buffered batch"""
        if not self.batch_buffer:
            return []
        # Separate originals from preprocessed tensors
        batch_images = [item[0] for item in self.batch_buffer]
        batch_tensors = [item[1] for item in self.batch_buffer]
        # Concatenate along the batch axis
        batch_tensor = np.concatenate(batch_tensors, axis=0)
        # Run batched inference
        outputs = self.inference_engine.session.run(
            [self.inference_engine.output_name],
            {self.inference_engine.input_name: batch_tensor}
        )
        # Postprocess each image's slice of the output
        all_detections = []
        for i, original_image in enumerate(batch_images):
            # Slice out this image's output (keep the batch dimension)
            single_output = outputs[0][i:i+1]
            # Postprocess
            detections = self.inference_engine.postprocess([single_output])
            # Rescale boxes (assumes a 640x640 network input)
            orig_h, orig_w = original_image.shape[:2]
            for det in detections:
                bbox = det['bbox']
                bbox[0] = bbox[0] * orig_w / 640
                bbox[1] = bbox[1] * orig_h / 640
                bbox[2] = bbox[2] * orig_w / 640
                bbox[3] = bbox[3] * orig_h / 640
                det['bbox'] = [int(coord) for coord in bbox]
            all_detections.append(detections)
        # Clear the buffer
        self.batch_buffer.clear()
        return all_detections
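A usage sketch (assumes an ONNX model exported with a dynamic batch axis, as in section II.2, and four hypothetical image paths):
python
import cv2

image_paths = ['img1.jpg', 'img2.jpg', 'img3.jpg', 'img4.jpg']
batcher = YOLOv8BatchInference('yolov8n_dynamic.onnx', batch_size=4)
# Fill the buffer, then run one batched forward pass
for path in image_paths:
    batcher.add_to_batch(cv2.imread(path))
results = batcher.process_batch()
for path, dets in zip(image_paths, results):
    print(f"{path}: {len(dets)} detections")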
VI. Complete Deployment Script
python
#!/usr/bin/env python3
"""
YOLOv8 ONNX quantized model deployment script
"""
import argparse
import os
import json
import time
from pathlib import Path

def main():
    parser = argparse.ArgumentParser(description='YOLOv8 ONNX quantized deployment')
    parser.add_argument('--model', type=str, required=True, help='path to the YOLOv8 model (.pt)')
    parser.add_argument('--output', type=str, default='output', help='output directory')
    parser.add_argument('--calibration-data', type=str, help='calibration data directory')
    parser.add_argument('--quantize', action='store_true', help='enable quantization')
    parser.add_argument('--precision', choices=['fp32', 'fp16', 'int8'], default='int8', help='quantization precision')
    parser.add_argument('--batch-size', type=int, default=1, help='batch size')
    parser.add_argument('--test-image', type=str, help='path to a test image')
    parser.add_argument('--benchmark', action='store_true', help='run the benchmark')
    args = parser.parse_args()
    # Create the output directory
    os.makedirs(args.output, exist_ok=True)
    # 1. Export the ONNX model
    print("Step 1: exporting the ONNX model...")
    from ultralytics import YOLO
    model = YOLO(args.model)
    onnx_path = os.path.join(args.output, 'model.onnx')
    model.export(
        format='onnx',
        imgsz=640,
        opset=12,
        simplify=True,
        dynamic=args.batch_size != 1,  # dynamic is a boolean flag in ultralytics
        half=False,
        device='cpu'
    )
    # Move the exported model into the output directory
    model_name = Path(args.model).stem
    exported_onnx = f'{model_name}.onnx'
    if os.path.exists(exported_onnx):
        os.rename(exported_onnx, onnx_path)
    print(f"ONNX model exported to: {onnx_path}")
    # 2. Quantize (if enabled)
    if args.quantize:
        print(f"\nStep 2: {args.precision.upper()} quantization...")
        if args.precision == 'fp16':
            # FP16 conversion
            quantized_path = os.path.join(args.output, 'model_fp16.onnx')
            convert_to_fp16(onnx_path, quantized_path)
        elif args.precision == 'int8':
            # INT8 quantization requires calibration data
            if not args.calibration_data:
                print("Error: INT8 quantization requires a calibration data directory")
                return
            # Collect calibration images
            calibration_images = []
            for ext in ['*.jpg', '*.jpeg', '*.png', '*.bmp']:
                calibration_images.extend(Path(args.calibration_data).glob(ext))
            calibration_images = [str(p) for p in calibration_images[:100]]  # at most 100 images
            if not calibration_images:
                print("Error: no images found in the calibration directory")
                return
            print(f"Calibrating with {len(calibration_images)} images...")
            # INT8 quantization
            quantized_path = os.path.join(args.output, 'model_int8.onnx')
            quantize_yolov8_onnx(onnx_path, calibration_images, quantized_path)
        else:
            # FP32, no quantization
            quantized_path = onnx_path
    else:
        quantized_path = onnx_path
    # 3. Test inference
    if args.test_image:
        print("\nStep 3: test inference...")
        # Initialize the inference engine
        inference_engine = YOLOv8ONNXInference(quantized_path)
        # Load the test image
        import cv2
        image = cv2.imread(args.test_image)
        if image is None:
            print(f"Error: cannot read image {args.test_image}")
        else:
            # Run inference
            start_time = time.time()
            detections = inference_engine.infer(image)
            inference_time = (time.time() - start_time) * 1000  # milliseconds
            print(f"Inference time: {inference_time:.2f} ms")
            print(f"Detected {len(detections)} objects:")
            for i, det in enumerate(detections):
                print(f"  {i+1}. class: {det['class_id']}, "
                      f"confidence: {det['confidence']:.3f}, "
                      f"bbox: {det['bbox']}")
            # Visualize the results
            for det in detections:
                x1, y1, x2, y2 = det['bbox']
                cv2.rectangle(image, (x1, y1), (x2, y2), (0, 255, 0), 2)
                label = f"Class {det['class_id']}: {det['confidence']:.2f}"
                cv2.putText(image, label, (x1, y1 - 10),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)
            result_path = os.path.join(args.output, 'result.jpg')
            cv2.imwrite(result_path, image)
            print(f"Result saved to: {result_path}")
    # 4. Benchmark
    if args.benchmark:
        print("\nStep 4: benchmark...")
        # Create a synthetic test image
        import numpy as np
        test_image = np.random.randint(0, 255, (640, 640, 3), dtype=np.uint8)
        # Initialize the inference engine
        inference_engine = YOLOv8ONNXInference(quantized_path)
        # Run the benchmark
        stats = inference_engine.benchmark(
            test_image,
            warmup=10,
            iterations=100
        )
        # Save the benchmark results
        benchmark_path = os.path.join(args.output, 'benchmark.json')
        with open(benchmark_path, 'w') as f:
            json.dump(stats, f, indent=2)
        print(f"Benchmark results saved to: {benchmark_path}")
    print("\nDeployment complete!")
    print(f"Model file: {quantized_path}")
    print(f"Output directory: {args.output}")

if __name__ == "__main__":
    # Import the helpers defined in the earlier sections (saved as local modules)
    from export_yolov8_to_onnx import export_yolov8_to_onnx
    from quantization import quantize_yolov8_onnx, convert_to_fp16
    from inference import YOLOv8ONNXInference
    main()
VII. Deployment Notes
1. Performance Comparison Table
| Model Type | Precision | Speed | Memory | Typical Use Case |
|---|---|---|---|---|
| FP32 | 32-bit float | slow | high | development and debugging |
| FP16 | 16-bit float | fast | medium | edge GPUs |
| INT8 | 8-bit integer | fastest | low | mobile/embedded |
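File size on disk roughly tracks the memory column: INT8 weights are about 4x smaller than FP32, FP16 about 2x. A quick check (file names follow the examples above):
python
import os

for path in ('yolov8n.onnx', 'yolov8n_fp16.onnx', 'yolov8n_quantized.onnx'):
    if os.path.exists(path):
        print(f"{path}: {os.path.getsize(path) / 1024 / 1024:.1f} MB")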
2. Troubleshooting Common Issues
python
import onnx
import onnxruntime as ort

# 1. Model compatibility issues
def check_model_compatibility(model_path):
    """Check opset version, operators used, and overall model validity"""
    model = onnx.load(model_path)
    # Check the opset version
    print(f"ONNX opset version: {model.opset_import[0].version}")
    # List all operator types used by the model
    ops = {node.op_type for node in model.graph.node}
    print(f"Operators used by the model: {sorted(ops)}")
    # Validate the model
    try:
        onnx.checker.check_model(model)
        print("Model is valid")
    except Exception as e:
        print(f"Model is invalid: {e}")

# 2. Memory optimization
def optimize_memory_usage(session_options):
    """Configure session options for a small memory footprint"""
    # Enable arena/pattern/reuse memory optimizations
    session_options.enable_cpu_mem_arena = True
    session_options.enable_mem_pattern = True
    session_options.enable_mem_reuse = True
    # Limit threading to reduce per-thread memory overhead
    session_options.intra_op_num_threads = 1
    session_options.inter_op_num_threads = 1
    return session_options

# 3. Performance optimization
def optimize_performance(session_options):
    """Configure session options for throughput"""
    # Enable all graph optimizations
    session_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
    # Sequential execution is usually best for single-stream inference
    session_options.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL
    # Keep weight prepacking enabled ('1' would disable it)
    session_options.add_session_config_entry('session.disable_prepacking', '0')
    return session_options
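These helpers plug straight into session creation; pick one profile deliberately, since the thread limits set by optimize_memory_usage also cap throughput. A minimal sketch:
python
import onnxruntime as ort

session_options = ort.SessionOptions()
optimize_performance(session_options)  # or optimize_memory_usage(session_options)
session = ort.InferenceSession('yolov8n_quantized.onnx',
                               sess_options=session_options,
                               providers=['CPUExecutionProvider'])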
3. Deployment Checklist
python
def deployment_checklist(model_path, output_dir):
    """Deployment checklist"""
    checklist = {
        "Model format": {
            "Checkpoint": "Is the model in .pt format?",
            "ONNX export": "Has the model been exported to ONNX?",
            "Opset version": "Is opset 12 or higher used?",
            "Dynamic shapes": "Does it support the required input sizes?"
        },
        "Quantization prep": {
            "Calibration data": "Are enough calibration images available?",
            "Quantization tools": "Are the quantization tools installed?",
            "Precision target": "Has the quantization precision been decided (FP16/INT8)?"
        },
        "Deployment environment": {
            "ONNX Runtime": "Is the correct version installed?",
            "CUDA support": "Is GPU acceleration enabled?",
            "Memory check": "Is there enough memory?",
            "Compatibility": "Have operator compatibility issues been checked?"
        },
        "Performance testing": {
            "Benchmark": "Has a speed test been run?",
            "Accuracy test": "Has post-quantization accuracy been validated?",
            "Memory test": "Has memory usage been measured?",
            "Multithreading": "Has multithreaded performance been tested?"
        }
    }
    # Walk the checklist
    results = {}
    for category, items in checklist.items():
        results[category] = {}
        for item, description in items.items():
            # Plug concrete checks in here as needed
            results[category][item] = {
                "description": description,
                "status": "pending"
            }
    return results
This YOLOv8 ONNX quantization deployment plan covers the full pipeline from model export through quantization to deployment, and can be adjusted and tuned to your specific requirements.