目标检测与跟踪 (7)- YOLOv8 ONNX量化模型部署指南

YOLOv8 ONNX量化模型部署指南

本文以YOLOv8模型从训练到ONNX量化部署的完整流程为主线。主要内容包括:1)环境准备与模型导出为ONNX格式;2)ONNX模型的FP16/INT8量化方法;3)使用ONNXRuntime和TensorRT的部署实现;4)性能优化策略如IO绑定和批处理;5)完整的部署脚本和注意事项。该方案支持静态和动态输入尺寸,提供多种量化精度选择,并包含基准测试功能,适用于边缘计算和嵌入式设备部署场景。

一、环境准备

bash 复制代码
# Base environment.
# Install exactly ONE ONNX Runtime build: `onnxruntime` (CPU) or
# `onnxruntime-gpu` (CUDA/TensorRT). Installing both in the same environment
# makes them overwrite each other's files.
pip install ultralytics onnx onnxruntime
# pip install ultralytics onnx onnxruntime-gpu   # GPU alternative

# Image handling used by the calibration reader and the inference code
pip install opencv-python numpy pillow

# Quantization tooling
pip install onnxruntime-tools
pip install neural-compressor  # Intel's quantization toolkit
pip install onnxsim  # ONNX graph simplifier
pip install onnxconverter-common  # float16 conversion used by convert_to_fp16

# Optional: TensorRT support
pip install tensorrt  # only needed for TensorRT deployment

二、YOLOv8模型导出为ONNX

1. 基础导出

python 复制代码
from ultralytics import YOLO
import torch

def export_yolov8_to_onnx(model_path, output_path, imgsz=640, simplify=True):
    """
    Export a YOLOv8 checkpoint to ONNX and place the file at ``output_path``.

    Args:
        model_path: Path to the YOLO checkpoint (.pt).
        output_path: Destination path of the ONNX file.
        imgsz: Input image size handed to the exporter.
        simplify: Whether the exporter should simplify the graph.

    Returns:
        True if the ONNX file ends up at ``output_path``, False otherwise.
    """
    import os
    import shutil

    model = YOLO(model_path)

    # The exporter chooses its own file location and returns that path,
    # so the result has to be moved to the location the caller asked for.
    exported_path = model.export(
        format="onnx",
        imgsz=imgsz,
        opset=12,
        simplify=simplify,
        dynamic=False,  # static input shape
        half=False,     # keep FP32 weights
        device='cpu'
    )

    if not exported_path or not os.path.exists(str(exported_path)):
        print("模型导出失败")
        return False

    exported_path = str(exported_path)
    if os.path.abspath(exported_path) != os.path.abspath(output_path):
        target_dir = os.path.dirname(os.path.abspath(output_path))
        os.makedirs(target_dir, exist_ok=True)
        # shutil.move also works across file systems, unlike os.rename.
        shutil.move(exported_path, output_path)

    print(f"模型已成功导出到: {output_path}")
    return True

# Usage example: export the nano checkpoint to an ONNX file.
export_yolov8_to_onnx(
    model_path='yolov8n.pt',
    output_path='yolov8n.onnx'
)

2. 带有动态批处理维度的导出

python 复制代码
def export_yolov8_dynamic_onnx(model_path, output_path, batch_sizes=(1, 4, 8)):
    """
    Export an ONNX model whose batch dimension is dynamic.

    Args:
        model_path: Path to the YOLO checkpoint (.pt).
        output_path: Destination path of the exported ONNX file.
        batch_sizes: Non-empty sequence of batch sizes; only the first entry
            is used, as the batch size of the tracing example input.

    Returns:
        True once the export has been written. The simplified copy is saved
        next to it only when onnxsim validates the simplified graph.

    Raises:
        ValueError: If ``batch_sizes`` is empty.
    """
    import os

    if not batch_sizes:
        raise ValueError("batch_sizes must contain at least one batch size")

    model = YOLO(model_path)

    # Only axis 0 (batch) is dynamic; height and width stay fixed at 640.
    dynamic_axes = {
        'images': {0: 'batch_size'},
        'output0': {0: 'batch_size'},
    }

    example_input = torch.randn(batch_sizes[0], 3, 640, 640)

    torch.onnx.export(
        model.model,  # underlying torch module of the YOLO wrapper
        example_input,
        output_path,
        input_names=['images'],
        output_names=['output0'],
        dynamic_axes=dynamic_axes,
        opset_version=12,
        do_constant_folding=True
    )

    import onnx
    from onnxsim import simplify

    onnx_model = onnx.load(output_path)
    model_simp, check = simplify(onnx_model)

    if check:
        # splitext only touches the real extension, so the original export is
        # never overwritten and '.onnx' inside a directory name is left alone.
        root, ext = os.path.splitext(output_path)
        onnx.save(model_simp, f"{root}_simplified{ext}")
        print(f"简化模型已保存")
    else:
        print("警告: 简化模型校验失败,保留未简化的模型")

    return True
复制代码

三、ONNX模型量化

1. ONNX Runtime量化(静态量化)

python 复制代码
import onnx
from onnxruntime.quantization import quantize_static, CalibrationDataReader, QuantType
import numpy as np
from PIL import Image
import cv2

class YOLOCalibrationDataReader(CalibrationDataReader):
    """Feeds pre-processed calibration images to ONNX Runtime static quantization."""

    def __init__(self, calibration_dataset, input_shape=(640, 640), batch_size=1):
        """
        Args:
            calibration_dataset: Sequence of calibration image paths.
            input_shape: Network input size as (H, W).
            batch_size: Number of images returned per ``get_next`` call.
        """
        self.dataset = calibration_dataset
        self.input_shape = input_shape
        self.batch_size = batch_size
        self.current_index = 0

    def get_next(self):
        """Return the next calibration batch as ``{'images': array}``, or None when exhausted.

        The array has shape (N, C, H, W) and dtype float32, where N is at
        most ``batch_size``.
        """
        total = len(self.dataset)
        if self.current_index >= total:
            return None

        stop = min(self.current_index + max(self.batch_size, 0), total)
        batch_paths = self.dataset[self.current_index:stop]
        if not batch_paths:
            return None

        # preprocess_image already returns (1, C, H, W); concatenating along
        # axis 0 yields (N, C, H, W). Stacking would add a fifth dimension.
        batch_data = [self.preprocess_image(path) for path in batch_paths]
        self.current_index = stop

        batch_array = np.concatenate(batch_data, axis=0).astype(np.float32)

        return {'images': batch_array}

    def preprocess_image(self, img_path):
        """Load one image and convert it to a normalized (1, C, H, W) RGB array.

        Raises:
            ValueError: If the image cannot be read.
        """
        img = cv2.imread(img_path)
        if img is None:
            # cv2.imread signals failure by returning None instead of raising.
            raise ValueError(f"无法读取校准图像: {img_path}")
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

        # input_shape is (H, W) but cv2.resize expects dsize as (W, H).
        height, width = self.input_shape
        img = cv2.resize(img, (width, height))

        # Scale pixel values to [0, 1].
        img = img / 255.0

        # HWC -> CHW, then add the batch axis.
        img = np.transpose(img, (2, 0, 1))
        img = np.expand_dims(img, axis=0)

        return img

def quantize_yolov8_onnx(model_path, calibration_dataset, output_path):
    """
    Statically quantize a YOLOv8 ONNX model to INT8 with ONNX Runtime.

    Args:
        model_path: Path of the FP32 ONNX model.
        calibration_dataset: Sequence of calibration image paths.
        output_path: Path the quantized model is written to.

    Returns:
        ``output_path``.
    """
    from onnxruntime.quantization import CalibrationMethod, QuantFormat

    calibration_data_reader = YOLOCalibrationDataReader(
        calibration_dataset=calibration_dataset,
        input_shape=(640, 640),
        batch_size=1
    )

    quant_config = {
        'calibrate_method': CalibrationMethod.MinMax,  # alternatives: Entropy, Percentile
        'quant_format': QuantFormat.QDQ,  # graph format; distinct from the tensor QuantType
        'per_channel': True,
        'weight_type': QuantType.QInt8,
        'activation_type': QuantType.QInt8,
        'nodes_to_quantize': [],  # empty list: quantize every supported node
        'nodes_to_exclude': [],
        'extra_options': {
            'ActivationSymmetric': False,
            'WeightSymmetric': True,
            'EnableSubgraph': False
        }
    }

    # quantize_static writes the quantized model to model_output itself and
    # returns None, so there is nothing to save afterwards.
    quantize_static(
        model_input=model_path,
        model_output=output_path,
        calibration_data_reader=calibration_data_reader,
        quant_format=quant_config['quant_format'],
        per_channel=quant_config['per_channel'],
        weight_type=quant_config['weight_type'],
        activation_type=quant_config['activation_type'],
        calibrate_method=quant_config['calibrate_method'],
        nodes_to_quantize=quant_config['nodes_to_quantize'],
        nodes_to_exclude=quant_config['nodes_to_exclude'],
        extra_options=quant_config['extra_options']
    )

    print(f"量化模型已保存到: {output_path}")

    return output_path

# Usage example: quantize the exported model with a list of calibration images.
calibration_images = [
    'calibration/image1.jpg',
    'calibration/image2.jpg',
    # ... more images
]

quantize_yolov8_onnx(
    model_path='yolov8n.onnx',
    calibration_dataset=calibration_images,
    output_path='yolov8n_quantized.onnx'
)

2. 使用Intel Neural Compressor量化

python 复制代码
from neural_compressor import quantization
from neural_compressor.config import PostTrainingQuantConfig

def quantize_with_neural_compressor(model_path, calibration_dataset, output_path):
    """
    Quantize an ONNX model with Intel Neural Compressor post-training quantization.

    Args:
        model_path: Path of the ONNX model handed to ``quantization.fit``.
        calibration_dataset: Not read anywhere in this function.
            NOTE(review): calibration runs on the 'dummy_v2' dataset built
            below instead of these images — confirm this is intended.
        output_path: Path passed to ``q_model.save``.

    Returns:
        ``output_path``.
    """
    from neural_compressor.data import DataLoader, Datasets
    
    # Build the dataset used for calibration and evaluation.
    # NOTE(review): presumably 'dummy_v2' yields synthetic tensors of the given
    # shapes rather than real images — verify against the library docs.
    dataset = Datasets('onnxrt_qdq')['dummy_v2'](
        input_shape=(1, 3, 640, 640), 
        label_shape=(1,)
    )
    
    # Wrap the dataset in a data loader for the ONNX Runtime backend.
    dataloader = DataLoader(framework='onnxruntime', dataset=dataset)
    
    # Post-training quantization settings.
    config = PostTrainingQuantConfig(
        approach='static',  # static quantization
        calibration_sampling_size=[8, 16, 32],  # calibration sampling sizes
        op_type_dict={
            'Conv': {'weight': {'dtype': ['s8']}, 'activation': {'dtype': ['u8']}},
            'MatMul': {'weight': {'dtype': ['s8']}, 'activation': {'dtype': ['u8']}}
        },
        recipes={
            'smooth_quant': True,  # enable the smooth-quant recipe
            'smooth_quant_args': {'alpha': 0.5}
        }
    )
    
    # Run quantization; the same loader is used for calibration and evaluation.
    q_model = quantization.fit(
        model=model_path,
        conf=config,
        calib_dataloader=dataloader,
        eval_dataloader=dataloader
    )
    
    # Save the quantized model.
    q_model.save(output_path)
    
    print(f"Neural Compressor量化模型已保存: {output_path}")
    
    return output_path
复制代码

3. FP16量化(混合精度)

python 复制代码
def convert_to_fp16(model_path, output_path):
    """
    Write an FP16 copy of the ONNX model at ``model_path`` to ``output_path``.

    Graph inputs and outputs keep their original types, and
    NonMaxSuppression nodes are excluded from the conversion.

    Returns:
        ``output_path``.
    """
    import onnx
    from onnxconverter_common import float16

    blocked_ops = ['NonMaxSuppression']
    half_model = float16.convert_float_to_float16(
        onnx.load(model_path),
        keep_io_types=True,
        op_block_list=blocked_ops,
    )
    onnx.save(half_model, output_path)

    print(f"FP16模型已保存: {output_path}")
    return output_path
复制代码

四、部署量化模型

1. ONNX Runtime推理引擎

python 复制代码
import onnxruntime as ort
import numpy as np
import cv2
from typing import List, Tuple, Dict

class YOLOv8ONNXInference:
    """YOLOv8 detector running on ONNX Runtime.

    Pre-processing is a plain resize (no letterboxing), so boxes are mapped
    back to the source image by independent x/y scaling.
    """

    # Spatial size used when the model declares dynamic height/width.
    DEFAULT_INPUT_SIZE = (640, 640)

    def __init__(self, model_path: str, providers=None):
        """
        Create the inference session.

        Args:
            model_path: Path of the ONNX model.
            providers: Execution provider list; when None, the available
                providers are used in the order TensorRT > CUDA > CPU.
        """
        if providers is None:
            available_providers = ort.get_available_providers()
            providers = [
                name
                for name in ('TensorrtExecutionProvider', 'CUDAExecutionProvider')
                if name in available_providers
            ]
            # CPU is always appended as the final fallback.
            providers.append('CPUExecutionProvider')

        session_options = ort.SessionOptions()
        session_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
        session_options.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL

        self.session = ort.InferenceSession(
            model_path,
            sess_options=session_options,
            providers=providers
        )

        # Only the first input and the first output are used.
        self.input_name = self.session.get_inputs()[0].name
        self.output_name = self.session.get_outputs()[0].name
        self.input_shape = self.session.get_inputs()[0].shape
        self.output_shape = self.session.get_outputs()[0].shape

        print(f"模型加载成功:")
        print(f"  输入名称: {self.input_name}, 形状: {self.input_shape}")
        print(f"  输出名称: {self.output_name}, 形状: {self.output_shape}")
        print(f"  执行提供者: {self.session.get_providers()}")

    def _input_hw(self) -> Tuple[int, int]:
        """Return the model's input (height, width).

        Dynamic axes are reported as strings or None rather than ints; in
        that case ``DEFAULT_INPUT_SIZE`` is returned.
        """
        shape = getattr(self, 'input_shape', None)
        if shape is not None and len(shape) == 4:
            height, width = shape[2], shape[3]
            if (isinstance(height, int) and isinstance(width, int)
                    and height > 0 and width > 0):
                return height, width
        return self.DEFAULT_INPUT_SIZE

    def preprocess(self, image: np.ndarray, target_size: Tuple[int, int] = (640, 640)) -> np.ndarray:
        """
        Convert an OpenCV image into the network input tensor.

        Args:
            image: Input image, (H, W, C) in BGR order or (H, W) grayscale.
            target_size: Target size as (H, W).

        Returns:
            Contiguous float32 array of shape (1, C, H, W) scaled to [0, 1].
        """
        target_h, target_w = target_size
        # cv2.resize expects dsize as (width, height).
        img_resized = cv2.resize(image, (target_w, target_h))

        if img_resized.ndim == 2:
            # Grayscale has no channel axis; expand it so the CHW transpose works.
            img_resized = cv2.cvtColor(img_resized, cv2.COLOR_GRAY2RGB)
        elif img_resized.ndim == 3 and img_resized.shape[2] == 3:
            img_resized = cv2.cvtColor(img_resized, cv2.COLOR_BGR2RGB)

        img_normalized = img_resized / 255.0

        # HWC -> CHW, then add the batch axis.
        img_chw = np.transpose(img_normalized, (2, 0, 1))
        img_batch = np.expand_dims(img_chw, axis=0).astype(np.float32)

        # The transpose leaves a strided view; make the buffer C-contiguous.
        return np.ascontiguousarray(img_batch)

    def postprocess(self, outputs: np.ndarray,
                   confidence_threshold: float = 0.25,
                   iou_threshold: float = 0.45) -> List[Dict]:
        """
        Decode the raw YOLOv8 detection head output of the first image.

        YOLOv8 has no objectness score: each prediction is
        ``[cx, cy, w, h, class_1, ..., class_n]`` and the exported tensor is
        laid out as (batch, 4 + num_classes, num_anchors).

        Args:
            outputs: Either the list returned by ``session.run`` or the raw
                output array itself.
            confidence_threshold: Minimum class score to keep a prediction.
            iou_threshold: IoU above which a lower-scored box is suppressed;
                a value <= 0 disables NMS.

        Returns:
            List of dicts with 'bbox' ([x1, y1, x2, y2] in network-input
            pixels), 'confidence', 'class_id' and 'class_score'.

        Raises:
            ValueError: If the output is not a 2-D or 3-D tensor.
        """
        predictions = np.asarray(outputs[0])
        if predictions.ndim == 3:
            predictions = predictions[0]  # first image of the batch
        if predictions.ndim != 2:
            raise ValueError(
                f"Unexpected YOLOv8 output shape: {np.asarray(outputs[0]).shape}"
            )

        # Heuristic: there are far more anchors than channels, so the shorter
        # axis is the channel axis. Make every row one prediction.
        if predictions.shape[0] < predictions.shape[1]:
            predictions = predictions.T
        if predictions.shape[1] <= 4:
            return []  # no class scores present

        class_scores = predictions[:, 4:]
        class_ids = np.argmax(class_scores, axis=1)
        confidences = class_scores[np.arange(class_scores.shape[0]), class_ids]

        detections = []
        for idx in np.flatnonzero(confidences >= confidence_threshold):
            cx, cy, w, h = (float(v) for v in predictions[idx, :4])
            score = float(confidences[idx])
            # Centre/size -> corner coordinates.
            detections.append({
                'bbox': [cx - w / 2, cy - h / 2, cx + w / 2, cy + h / 2],
                'confidence': score,
                'class_id': int(class_ids[idx]),
                'class_score': score
            })

        if detections and iou_threshold > 0:
            detections = self.non_max_suppression(detections, iou_threshold)

        return detections

    def non_max_suppression(self, detections: List[Dict], iou_threshold: float) -> List[Dict]:
        """Class-agnostic greedy NMS; returns a new list and leaves the input untouched."""
        if not detections:
            return []

        # sorted() copies, so the caller's list is neither reordered nor emptied.
        remaining = sorted(detections, key=lambda d: d['confidence'], reverse=True)

        keep = []
        while remaining:
            best = remaining[0]
            keep.append(best)
            # Drop every box overlapping the current best above the threshold.
            remaining = [
                candidate for candidate in remaining[1:]
                if self.calculate_iou(best['bbox'], candidate['bbox']) <= iou_threshold
            ]

        return keep

    def calculate_iou(self, box1: List[float], box2: List[float]) -> float:
        """Return the intersection-over-union of two [x1, y1, x2, y2] boxes."""
        x1 = max(box1[0], box2[0])
        y1 = max(box1[1], box2[1])
        x2 = min(box1[2], box2[2])
        y2 = min(box1[3], box2[3])

        if x2 < x1 or y2 < y1:
            return 0.0  # boxes do not overlap

        intersection = (x2 - x1) * (y2 - y1)

        area1 = (box1[2] - box1[0]) * (box1[3] - box1[1])
        area2 = (box2[2] - box2[0]) * (box2[3] - box2[1])
        union = area1 + area2 - intersection

        return intersection / union if union > 0 else 0.0

    def infer(self, image: np.ndarray) -> List[Dict]:
        """
        Run detection on one image.

        Args:
            image: Input image as read by OpenCV.

        Returns:
            Detections whose 'bbox' is in integer pixel coordinates of ``image``.
        """
        input_h, input_w = self._input_hw()
        input_tensor = self.preprocess(image, (input_h, input_w))

        outputs = self.session.run(
            [self.output_name],
            {self.input_name: input_tensor}
        )

        detections = self.postprocess(outputs)

        # Undo the plain resize: scale x and y independently.
        orig_h, orig_w = image.shape[:2]
        for det in detections:
            x1, y1, x2, y2 = det['bbox']
            det['bbox'] = [
                int(x1 * orig_w / input_w),
                int(y1 * orig_h / input_h),
                int(x2 * orig_w / input_w),
                int(y2 * orig_h / input_h),
            ]

        return detections

    def benchmark(self, image: np.ndarray, warmup=10, iterations=100):
        """
        Time end-to-end ``infer`` calls on one image.

        Args:
            image: Test image.
            warmup: Number of untimed warm-up runs.
            iterations: Number of timed runs.

        Returns:
            Dict with 'avg_time_ms', 'min_time_ms', 'max_time_ms',
            'std_time_ms' and 'fps' as plain floats.
        """
        import time

        print("预热...")
        for _ in range(warmup):
            self.infer(image)

        print(f"运行 {iterations} 次推理...")
        times = []

        for i in range(iterations):
            start_time = time.perf_counter()
            self.infer(image)
            end_time = time.perf_counter()
            times.append((end_time - start_time) * 1000)  # seconds -> milliseconds

            if (i + 1) % 10 == 0:
                print(f"已完成 {i + 1}/{iterations} 次推理")

        # Plain floats keep the result trivially JSON-serializable.
        avg_time = float(np.mean(times))
        min_time = float(np.min(times))
        max_time = float(np.max(times))
        std_time = float(np.std(times))
        fps = 1000 / avg_time

        print("\n基准测试结果:")
        print(f"  平均推理时间: {avg_time:.2f} ms")
        print(f"  最小推理时间: {min_time:.2f} ms")
        print(f"  最大推理时间: {max_time:.2f} ms")
        print(f"  标准差: {std_time:.2f} ms")
        print(f"  FPS: {fps:.2f}")

        return {
            'avg_time_ms': avg_time,
            'min_time_ms': min_time,
            'max_time_ms': max_time,
            'std_time_ms': std_time,
            'fps': fps
        }

# 使用示例
def test_onnx_inference():
    """Demo: detect objects in test.jpg, benchmark the model and save an annotated copy."""
    detector = YOLOv8ONNXInference('yolov8n_quantized.onnx')

    image = cv2.imread('test.jpg')
    if image is None:
        # cv2.imread returns None for a missing or undecodable file.
        print("错误: 无法读取图像 test.jpg")
        return

    detections = detector.infer(image)

    print(f"检测到 {len(detections)} 个物体:")
    for i, det in enumerate(detections):
        print(f"{i+1}. 类别: {det['class_id']}, 置信度: {det['confidence']:.2f}, "
              f"边界框: {det['bbox']}")

    # Benchmark on the clean image, before any boxes are drawn onto it.
    detector.benchmark(image, warmup=5, iterations=50)

    # Draw each detection with its class id and confidence.
    for det in detections:
        x1, y1, x2, y2 = det['bbox']
        cv2.rectangle(image, (x1, y1), (x2, y2), (0, 255, 0), 2)
        label = f"Class {det['class_id']}: {det['confidence']:.2f}"
        cv2.putText(image, label, (x1, y1-10),
                   cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

    cv2.imwrite('result.jpg', image)
    print("结果已保存到 result.jpg")

if __name__ == "__main__":
    test_onnx_inference()
复制代码

2. TensorRT部署(可选)

python 复制代码
def convert_onnx_to_tensorrt(onnx_path, trt_path, precision='fp16'):
    """
    Build a serialized TensorRT engine from an ONNX model.

    Both the legacy builder API (``max_workspace_size`` / ``build_engine``)
    and the current one (``set_memory_pool_limit`` /
    ``build_serialized_network``) are supported, chosen by feature detection.

    Args:
        onnx_path: Path of the ONNX model.
        trt_path: Path the serialized engine is written to.
        precision: 'fp32', 'fp16' or 'int8'.

    Returns:
        True on success, False if parsing or building fails.
    """
    import tensorrt as trt

    TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
    workspace_bytes = 1 << 30  # 1 GiB

    builder = trt.Builder(TRT_LOGGER)

    # The ONNX parser needs an explicit-batch network. Fall back to flags=0
    # if a TensorRT release no longer defines the flag.
    explicit_batch = getattr(trt.NetworkDefinitionCreationFlag, 'EXPLICIT_BATCH', None)
    network_flags = (1 << int(explicit_batch)) if explicit_batch is not None else 0
    network = builder.create_network(network_flags)

    parser = trt.OnnxParser(network, TRT_LOGGER)

    with open(onnx_path, 'rb') as model:
        if not parser.parse(model.read()):
            print("解析失败:")
            for error in range(parser.num_errors):
                print(parser.get_error(error))
            return False

    config = builder.create_builder_config()
    if hasattr(config, 'set_memory_pool_limit'):
        config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, workspace_bytes)
    else:
        config.max_workspace_size = workspace_bytes

    if precision == 'fp16':
        config.set_flag(trt.BuilderFlag.FP16)
    elif precision == 'int8':
        # NOTE: no calibrator is attached here. Unless the ONNX graph already
        # carries quantization (Q/DQ) nodes, an INT8 calibrator must be set
        # on the config for meaningful scales.
        config.set_flag(trt.BuilderFlag.INT8)

    if hasattr(builder, 'build_serialized_network'):
        serialized_engine = builder.build_serialized_network(network, config)
    else:
        engine = builder.build_engine(network, config)
        serialized_engine = engine.serialize() if engine is not None else None

    if serialized_engine is None:
        print("构建引擎失败")
        return False

    with open(trt_path, 'wb') as f:
        f.write(serialized_engine)

    print(f"TensorRT引擎已保存到: {trt_path}")
    return True
复制代码

五、性能优化策略

1. IO绑定优化

python 复制代码
class OptimizedYOLOv8ONNXInference(YOLOv8ONNXInference):
    """YOLOv8 ONNX inference using ONNX Runtime IO binding with a reusable output buffer."""

    def __init__(self, model_path: str, providers=None):
        super().__init__(model_path, providers)

        self.io_binding = self.session.io_binding()
        self._preallocate_buffers()

    def _preallocate_buffers(self):
        """Allocate the reusable output buffer when the output shape is fully static.

        With symbolic (dynamic) output dimensions no buffer can be sized up
        front; ``output_buffer`` is then None and ONNX Runtime allocates the
        output on every run.
        """
        input_info = self.session.get_inputs()[0]
        output_info = self.session.get_outputs()[0]

        self.input_shape = input_info.shape
        self.input_dtype = np.float32

        output_dims = list(output_info.shape)
        is_static = bool(output_dims) and all(
            isinstance(dim, int) and dim > 0 for dim in output_dims
        )
        if is_static:
            self.output_buffer = ort.OrtValue.ortvalue_from_numpy(
                np.zeros(output_dims, dtype=np.float32)
            )
        else:
            self.output_buffer = None

    def infer_optimized(self, image: np.ndarray) -> List[Dict]:
        """Run detection through IO binding.

        Returns:
            Detections whose 'bbox' is in integer pixel coordinates of
            ``image``, matching what ``infer`` returns.
        """
        input_tensor = np.ascontiguousarray(self.preprocess(image), dtype=self.input_dtype)

        # The OrtValue must stay referenced until the run has finished.
        input_ortvalue = ort.OrtValue.ortvalue_from_numpy(input_tensor)

        self.io_binding.clear_binding_inputs()
        self.io_binding.clear_binding_outputs()

        # Binding the OrtValue directly avoids passing device, dtype, shape
        # and raw pointer by hand.
        self.io_binding.bind_ortvalue_input(self.input_name, input_ortvalue)
        if self.output_buffer is not None:
            self.io_binding.bind_ortvalue_output(self.output_name, self.output_buffer)
        else:
            self.io_binding.bind_output(self.output_name)

        self.session.run_with_iobinding(self.io_binding)

        outputs = self.io_binding.copy_outputs_to_cpu()

        detections = self.postprocess(outputs)

        # Map boxes from the network input size back to the source image.
        orig_h, orig_w = image.shape[:2]
        input_h, input_w = input_tensor.shape[2], input_tensor.shape[3]
        for det in detections:
            x1, y1, x2, y2 = det['bbox']
            det['bbox'] = [
                int(x1 * orig_w / input_w),
                int(y1 * orig_h / input_h),
                int(x2 * orig_w / input_w),
                int(y2 * orig_h / input_h),
            ]

        return detections

2. 批处理优化

python 复制代码
class YOLOv8BatchInference:
    """Collects pre-processed images and runs them through the model as one batch."""

    def __init__(self, model_path: str, batch_size: int = 4):
        self.batch_size = batch_size
        self.inference_engine = YOLOv8ONNXInference(model_path)
        self.batch_buffer = []

    def add_to_batch(self, image: np.ndarray) -> bool:
        """Queue an image; return False (and queue nothing) when the buffer is full."""
        has_room = len(self.batch_buffer) < self.batch_size
        if has_room:
            tensor = self.inference_engine.preprocess(image)
            self.batch_buffer.append((image, tensor))
        return has_room

    def process_batch(self):
        """Run the queued images as one batch and return one detection list per image.

        The buffer is emptied afterwards. An empty buffer yields ``[]``.
        """
        if not self.batch_buffer:
            return []

        engine = self.inference_engine
        originals, tensors = zip(*self.batch_buffer)

        # Each queued tensor already has a leading batch axis of size 1.
        stacked = np.concatenate(tensors, axis=0)
        outputs = engine.session.run(
            [engine.output_name],
            {engine.input_name: stacked}
        )

        results = []
        for idx, source in enumerate(originals):
            # Slice (not index) so the per-image output keeps its batch axis.
            per_image = engine.postprocess([outputs[0][idx:idx + 1]])

            src_h, src_w = source.shape[:2]
            for det in per_image:
                x1, y1, x2, y2 = det['bbox']
                scaled = (
                    x1 * src_w / 640,
                    y1 * src_h / 640,
                    x2 * src_w / 640,
                    y2 * src_h / 640,
                )
                det['bbox'] = [int(value) for value in scaled]

            results.append(per_image)

        self.batch_buffer.clear()

        return results
复制代码

六、完整的部署脚本

python 复制代码
#!/usr/bin/env python3
"""
YOLOv8 ONNX量化模型部署脚本
"""

import argparse
import os
import json
import time
from pathlib import Path

def main():
    """Command-line pipeline: export to ONNX, optionally quantize, test and benchmark.

    Relies on ``convert_to_fp16``, ``quantize_yolov8_onnx`` and
    ``YOLOv8ONNXInference`` being available as module globals.
    """
    import shutil

    parser = argparse.ArgumentParser(description='YOLOv8 ONNX量化部署')
    parser.add_argument('--model', type=str, required=True, help='YOLOv8模型路径 (.pt)')
    parser.add_argument('--output', type=str, default='output', help='输出目录')
    parser.add_argument('--calibration-data', type=str, help='校准数据目录')
    parser.add_argument('--quantize', action='store_true', help='是否量化')
    parser.add_argument('--precision', choices=['fp32', 'fp16', 'int8'], default='int8', help='量化精度')
    parser.add_argument('--batch-size', type=int, default=1, help='批处理大小')
    parser.add_argument('--test-image', type=str, help='测试图像路径')
    parser.add_argument('--benchmark', action='store_true', help='运行基准测试')

    args = parser.parse_args()

    os.makedirs(args.output, exist_ok=True)

    # 1. Export the ONNX model
    print("步骤1: 导出ONNX模型...")
    from ultralytics import YOLO

    model = YOLO(args.model)
    onnx_path = os.path.join(args.output, 'model.onnx')

    exported_onnx = model.export(
        format='onnx',
        imgsz=640,
        opset=12,
        simplify=True,
        # `dynamic` is a boolean switch; dynamic axes are needed for batch > 1.
        dynamic=args.batch_size > 1,
        half=False,
        device='cpu'
    )

    # Prefer the path reported by the exporter; fall back to the usual
    # locations (next to the checkpoint, then the working directory).
    candidates = [
        exported_onnx,
        Path(args.model).with_suffix('.onnx'),
        Path(f'{Path(args.model).stem}.onnx'),
    ]
    source_onnx = next(
        (str(c) for c in candidates if c and os.path.exists(str(c))),
        None
    )
    if source_onnx is None:
        print("错误: 未找到导出的ONNX模型")
        return
    if os.path.abspath(source_onnx) != os.path.abspath(onnx_path):
        # shutil.move also works across file systems, unlike os.rename.
        shutil.move(source_onnx, onnx_path)

    print(f"ONNX模型已导出到: {onnx_path}")

    # 2. Quantize (if enabled)
    quantized_path = onnx_path
    if args.quantize:
        print(f"\n步骤2: {args.precision.upper()}量化...")

        if args.precision == 'fp16':
            quantized_path = os.path.join(args.output, 'model_fp16.onnx')
            convert_to_fp16(onnx_path, quantized_path)

        elif args.precision == 'int8':
            # INT8 static quantization needs calibration images.
            if not args.calibration_data:
                print("错误: INT8量化需要校准数据目录")
                return

            calibration_images = []
            for ext in ['*.jpg', '*.jpeg', '*.png', '*.bmp']:
                calibration_images.extend(Path(args.calibration_data).glob(ext))
            # Sort before truncating so the chosen subset is reproducible.
            calibration_images = [str(p) for p in sorted(calibration_images)[:100]]

            if not calibration_images:
                print("错误: 校准目录中没有找到图像")
                return

            print(f"使用 {len(calibration_images)} 张图像进行校准...")

            quantized_path = os.path.join(args.output, 'model_int8.onnx')
            quantize_yolov8_onnx(onnx_path, calibration_images, quantized_path)

        # 'fp32' keeps the exported model unchanged.

    # 3. Test inference
    if args.test_image:
        print("\n步骤3: 测试推理...")

        inference_engine = YOLOv8ONNXInference(quantized_path)

        import cv2
        image = cv2.imread(args.test_image)

        if image is None:
            print(f"错误: 无法读取图像 {args.test_image}")
        else:
            # perf_counter is monotonic and has higher resolution than time.time.
            start_time = time.perf_counter()
            detections = inference_engine.infer(image)
            inference_time = (time.perf_counter() - start_time) * 1000  # milliseconds

            print(f"推理时间: {inference_time:.2f} ms")
            print(f"检测到 {len(detections)} 个物体:")

            for i, det in enumerate(detections):
                print(f"  {i+1}. 类别: {det['class_id']}, "
                      f"置信度: {det['confidence']:.3f}, "
                      f"边界框: {det['bbox']}")

            # Draw the detections onto the image.
            for det in detections:
                x1, y1, x2, y2 = det['bbox']
                cv2.rectangle(image, (x1, y1), (x2, y2), (0, 255, 0), 2)
                label = f"Class {det['class_id']}: {det['confidence']:.2f}"
                cv2.putText(image, label, (x1, y1-10),
                           cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

            result_path = os.path.join(args.output, 'result.jpg')
            cv2.imwrite(result_path, image)
            print(f"结果已保存到: {result_path}")

    # 4. Benchmark
    if args.benchmark:
        print("\n步骤4: 基准测试...")

        import numpy as np

        # Random image: only the timing matters here, not the detections.
        test_image = np.random.randint(0, 255, (640, 640, 3), dtype=np.uint8)

        inference_engine = YOLOv8ONNXInference(quantized_path)

        stats = inference_engine.benchmark(
            test_image,
            warmup=10,
            iterations=100
        )

        benchmark_path = os.path.join(args.output, 'benchmark.json')
        with open(benchmark_path, 'w') as f:
            json.dump(stats, f, indent=2)

        print(f"基准测试结果已保存到: {benchmark_path}")

    print("\n部署完成!")
    print(f"模型文件: {quantized_path}")
    print(f"输出目录: {args.output}")

if __name__ == "__main__":
    # Bring in the helpers that main() references as module-level names
    # (convert_to_fp16, quantize_yolov8_onnx, YOLOv8ONNXInference).
    # NOTE(review): assumes the earlier snippets were saved as the modules
    # export_yolov8_to_onnx.py, quantization.py and inference.py — confirm.
    # export_yolov8_to_onnx is imported but not referenced by main().
    from export_yolov8_to_onnx import export_yolov8_to_onnx
    from quantization import quantize_yolov8_onnx, convert_to_fp16
    from inference import YOLOv8ONNXInference
    
    main()
复制代码

七、部署注意事项

1. 性能对比表

| 模型类型 | 精度 | 速度 | 内存占用 | 适用场景 |
| --- | --- | --- | --- | --- |
| FP32 | 32位浮点 | 基准 | 高 | 开发调试 |
| FP16 | 16位浮点 | 较快 | 中 | 边缘GPU |
| INT8 | 8位整型 | 最快 | 低 | 移动端/嵌入式 |

2. 常见问题解决

python 复制代码
def troubleshoot_onnx_deployment():
    """Build the ONNX deployment troubleshooting helpers.

    Returns:
        Dict mapping helper name to callable:
        'check_model_compatibility', 'optimize_memory_usage' and
        'optimize_performance'. Third-party packages are imported lazily
        inside the helpers that need them.
    """

    # 1. Model compatibility
    def check_model_compatibility(model_path):
        """Print the opset version and operator set of a model, then validate it."""
        import onnx
        from onnx import checker

        model = onnx.load(model_path)

        # Report the default-domain opset; custom-domain entries may be
        # listed first in opset_import.
        default_opsets = [
            entry.version for entry in model.opset_import
            if entry.domain in ('', 'ai.onnx')
        ]
        if default_opsets:
            print(f"ONNX opset版本: {default_opsets[0]}")
        elif model.opset_import:
            print(f"ONNX opset版本: {model.opset_import[0].version}")

        ops = {node.op_type for node in model.graph.node}
        print(f"模型中使用的算子: {sorted(ops)}")

        try:
            checker.check_model(model)
            print("模型有效")
        except Exception as e:
            # Diagnostic helper: report any validation failure instead of raising.
            print(f"模型无效: {e}")

    # 2. Memory usage
    def optimize_memory_usage(session_options):
        """Enable memory reuse options and restrict the session to one thread each way."""
        session_options.enable_cpu_mem_arena = True
        session_options.enable_mem_pattern = True
        session_options.enable_mem_reuse = True

        # Fewer worker threads means fewer per-thread buffers.
        session_options.intra_op_num_threads = 1
        session_options.inter_op_num_threads = 1

        return session_options

    # 3. Performance
    def optimize_performance(session_options):
        """Enable all graph optimizations with sequential execution."""
        import onnxruntime as ort

        session_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
        session_options.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL

        # '0' leaves weight pre-packing enabled.
        session_options.add_session_config_entry('session.disable_prepacking', '0')

        return session_options

    return {
        'check_model_compatibility': check_model_compatibility,
        'optimize_memory_usage': optimize_memory_usage,
        'optimize_performance': optimize_performance,
    }
复制代码

3. 部署检查清单

python 复制代码
def deployment_checklist(model_path, output_dir):
    """Return the deployment checklist with every item marked as pending.

    Args:
        model_path: Accepted for interface compatibility; not read.
        output_dir: Accepted for interface compatibility; not read.

    Returns:
        Nested dict ``{category: {item: {"description": ..., "status": ...}}}``
        where every status is the "pending" marker.
    """
    pending = "待检查"

    # (category, ((item, description), ...)) in display order.
    spec = (
        ("模型格式", (
            ("检查点", "模型是否为.pt格式"),
            ("ONNX导出", "是否已导出为ONNX"),
            ("opset版本", "是否使用opset 12或更高"),
            ("动态尺寸", "是否支持需要的输入尺寸"),
        )),
        ("量化准备", (
            ("校准数据", "是否准备足够的校准图像"),
            ("量化工具", "是否安装量化工具"),
            ("精度要求", "是否确定量化精度(FP16/INT8)"),
        )),
        ("部署环境", (
            ("ONNX Runtime", "是否安装正确版本"),
            ("CUDA支持", "是否启用GPU加速"),
            ("内存检查", "是否有足够的内存"),
            ("兼容性", "检查算子兼容性"),
        )),
        ("性能测试", (
            ("基准测试", "是否进行速度测试"),
            ("精度测试", "是否验证量化后的精度"),
            ("内存测试", "是否检查内存使用"),
            ("多线程", "是否测试多线程性能"),
        )),
    )

    # Concrete per-item checks can be plugged in where the status is assigned.
    return {
        category: {
            item: {"description": description, "status": pending}
            for item, description in entries
        }
        for category, entries in spec
    }
复制代码

YOLOv8 ONNX量化部署方案包含了从模型导出、量化到部署的完整流程,可以根据具体需求进行调整和优化。

相关推荐
FL16238631292 小时前
电力场景输电线路电缆线异常连接处缺陷金属部件腐蚀检测数据集VOC+YOLO格式3429张5类别
人工智能·yolo·机器学习
学习3人组3 小时前
YOLOv8模型TensorRT量化实操步骤手册
yolo
奔袭的算法工程师4 小时前
论文解读--FocalFormer3D : Focusing on Hard Instance for 3D Object Detection
人工智能·目标检测·计算机视觉
q_30238195565 小时前
RK3588 + YOLOv8 田块分割实战指南:从环境搭建到部署落地全流程
人工智能·单片机·深度学习·神经网络·物联网·yolo
lxmyzzs6 小时前
【端侧AI】基于 openvino + YOLOv11 构建多模态视觉分析终端 | 单目测距 | 单目测速
人工智能·yolo·openvino
学习3人组7 小时前
目标检测训练常见问题排查清单
人工智能·目标检测·计算机视觉
学习3人组9 小时前
目标检测模型选型决策树
人工智能·目标检测·决策树
学习3人组9 小时前
深度学习目标检测模型
人工智能·深度学习·目标检测
学习3人组9 小时前
YOLOv5模型训练完整讲解方案
yolo