YOLOv8 ONNX Quantized Model Deployment Guide
This guide walks through the complete pipeline from a trained YOLOv8 model to a quantized ONNX deployment. It covers: 1) environment setup and exporting the model to ONNX format; 2) FP16/INT8 quantization of the ONNX model; 3) deployment with ONNX Runtime and TensorRT; 4) performance optimization strategies such as IO binding and batching; 5) a complete deployment script and practical notes. The approach supports static and dynamic input sizes, offers multiple quantization precisions, includes benchmarking, and targets edge-computing and embedded deployment scenarios.
I. Environment Setup
bash
# Base environment (install ONE of onnxruntime / onnxruntime-gpu; mixing both can cause conflicts)
pip install ultralytics onnx onnxruntime
# pip install onnxruntime-gpu   # use this instead of onnxruntime on CUDA machines
# Quantization tools
pip install onnxruntime-tools
pip install neural-compressor       # Intel's quantization toolkit
pip install onnxsim                 # ONNX model simplification
pip install onnxconverter-common    # FP16 conversion helper (used in section III.3)
# Optional: TensorRT support
pip install tensorrt                # only if TensorRT deployment is needed
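Before proceeding, it is worth verifying the installation. The following minimal check (a sketch; adjust to your setup) prints the installed versions and the execution providers ONNX Runtime can actually use on this machine:
python
import onnx
import onnxruntime as ort

# Print package versions and available hardware-backed execution providers
print(f"onnx version: {onnx.__version__}")
print(f"onnxruntime version: {ort.__version__}")
print(f"available providers: {ort.get_available_providers()}")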
II. Exporting YOLOv8 to ONNX
1. Basic Export
python
from ultralytics import YOLO
import shutil

def export_yolov8_to_onnx(model_path, output_path, imgsz=640, simplify=True):
    """
    Export a YOLOv8 model to ONNX format.
    Args:
        model_path: path to the YOLO model (.pt)
        output_path: output ONNX path
        imgsz: input image size
        simplify: whether to simplify the model
    """
    # Load the model
    model = YOLO(model_path)
    # Export to ONNX; export() returns the path of the exported file
    exported_path = model.export(
        format="onnx",      # export format
        imgsz=imgsz,        # input size
        opset=12,           # ONNX opset version
        simplify=simplify,  # simplify the graph
        dynamic=False,      # static input shape (True enables dynamic axes)
        half=False,         # FP16 precision
        device='cpu'        # export device
    )
    if exported_path:
        # Ultralytics writes the ONNX file next to the .pt file; move it if needed
        if exported_path != output_path:
            shutil.move(exported_path, output_path)
        print(f"Model exported successfully to: {output_path}")
        return True
    else:
        print("Model export failed")
        return False

# Usage example
export_yolov8_to_onnx(
    model_path='yolov8n.pt',
    output_path='yolov8n.onnx'
)
2. Export with Dynamic Input Sizes
python
from ultralytics import YOLO
import torch
import onnx
from onnxsim import simplify

def export_yolov8_dynamic_onnx(model_path, output_path, batch_sizes=[1, 4, 8]):
    """
    Export an ONNX model that supports dynamic batch sizes.
    """
    model = YOLO(model_path)
    # Declare which axes are dynamic
    dynamic_axes = {
        'images': {0: 'batch_size'},   # dynamic batch dimension on the input
        'output0': {0: 'batch_size'},  # dynamic batch dimension on the output
    }
    # Example input used for tracing
    example_input = torch.randn(batch_sizes[0], 3, 640, 640)
    # Export the underlying torch module
    torch.onnx.export(
        model.model,               # the YOLO model's torch module
        example_input,
        output_path,
        input_names=['images'],
        output_names=['output0'],
        dynamic_axes=dynamic_axes,
        opset_version=12,
        do_constant_folding=True
    )
    # Simplify the exported graph
    onnx_model = onnx.load(output_path)
    model_simp, check = simplify(onnx_model)
    if check:
        onnx.save(model_simp, output_path.replace('.onnx', '_simplified.onnx'))
        print("Simplified model saved")
    return True
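To confirm the dynamic batch axis actually works, you can push several batch sizes through ONNX Runtime. A minimal sketch (assumes the _simplified.onnx file written above):
python
import numpy as np
import onnxruntime as ort

session = ort.InferenceSession('yolov8n_simplified.onnx', providers=['CPUExecutionProvider'])
input_name = session.get_inputs()[0].name
# Each batch size should produce a matching output batch dimension
for batch in (1, 4, 8):
    x = np.random.rand(batch, 3, 640, 640).astype(np.float32)
    y = session.run(None, {input_name: x})[0]
    print(f"batch {batch}: output shape {y.shape}")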
III. ONNX Model Quantization
1. ONNX Runtime Quantization (Static)
python
import onnx
from onnxruntime.quantization import quantize_static, CalibrationDataReader, QuantFormat, QuantType
import numpy as np
import cv2

class YOLOCalibrationDataReader(CalibrationDataReader):
    """Calibration data reader for YOLO models"""
    def __init__(self, calibration_dataset, input_shape=(640, 640), batch_size=1):
        """
        Args:
            calibration_dataset: list of calibration image paths
            input_shape: input size (H, W)
            batch_size: batch size
        """
        self.dataset = calibration_dataset
        self.input_shape = input_shape
        self.batch_size = batch_size
        self.current_index = 0

    def get_next(self):
        """Return the next calibration batch, or None when exhausted"""
        if self.current_index >= len(self.dataset):
            return None
        batch_data = []
        for _ in range(self.batch_size):
            if self.current_index >= len(self.dataset):
                break
            img_path = self.dataset[self.current_index]
            # Preprocess the image
            img = self.preprocess_image(img_path)
            batch_data.append(img)
            self.current_index += 1
        if not batch_data:
            return None
        # Stack into a batch; np.stack adds the batch dimension
        batch_array = np.stack(batch_data, axis=0).astype(np.float32)
        return {'images': batch_array}

    def preprocess_image(self, img_path):
        """Preprocess a single image into CHW float format"""
        # Read the image
        img = cv2.imread(img_path)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        # Resize; note cv2.resize expects (W, H)
        img = cv2.resize(img, (self.input_shape[1], self.input_shape[0]))
        # Normalize to 0-1
        img = img / 255.0
        # Convert to CHW layout (no batch dimension here; get_next stacks the batch)
        img = np.transpose(img, (2, 0, 1))
        return img
def quantize_yolov8_onnx(model_path, calibration_dataset, output_path):
    """
    Quantize a YOLOv8 ONNX model to INT8 with static quantization.
    Args:
        model_path: path to the ONNX model
        calibration_dataset: list of calibration image paths
        output_path: path for the quantized model
    """
    # Create the calibration data reader
    calibration_data_reader = YOLOCalibrationDataReader(
        calibration_dataset=calibration_dataset,
        input_shape=(640, 640),
        batch_size=1
    )
    # Run quantization; quantize_static writes the result to output_path
    quantize_static(
        model_input=model_path,
        model_output=output_path,
        calibration_data_reader=calibration_data_reader,
        quant_format=QuantFormat.QDQ,     # QDQ quantization format (recommended)
        per_channel=True,                 # per-channel weight quantization
        weight_type=QuantType.QInt8,      # weight quantization type
        activation_type=QuantType.QInt8,  # activation quantization type
        nodes_to_quantize=None,           # None = quantize all supported nodes
        nodes_to_exclude=None,            # nodes to exclude
        extra_options={
            'ActivationSymmetric': False,
            'WeightSymmetric': True,
            'EnableSubgraph': False
        }
    )
    print(f"Quantized model saved to: {output_path}")
    return output_path
# Usage example
calibration_images = [
    'calibration/image1.jpg',
    'calibration/image2.jpg',
    # ... more images
]
quantize_yolov8_onnx(
    model_path='yolov8n.onnx',
    calibration_dataset=calibration_images,
    output_path='yolov8n_quantized.onnx'
)
2. Quantization with Intel Neural Compressor
python
from neural_compressor import quantization
from neural_compressor.config import PostTrainingQuantConfig

def quantize_with_neural_compressor(model_path, calibration_dataset, output_path):
    """
    Quantize with Intel Neural Compressor
    """
    from neural_compressor.data import DataLoader, Datasets
    # Create a dataset (a dummy dataset here; substitute real calibration data in practice)
    dataset = Datasets('onnxrt_qdq')['dummy_v2'](
        input_shape=(1, 3, 640, 640),
        label_shape=(1,)
    )
    # Create the data loader
    dataloader = DataLoader(framework='onnxruntime', dataset=dataset)
    # Configure quantization
    config = PostTrainingQuantConfig(
        approach='static',                      # static quantization
        calibration_sampling_size=[8, 16, 32],  # calibration sample sizes
        op_type_dict={
            'Conv': {'weight': {'dtype': ['s8']}, 'activation': {'dtype': ['u8']}},
            'MatMul': {'weight': {'dtype': ['s8']}, 'activation': {'dtype': ['u8']}}
        },
        recipes={
            'smooth_quant': True,               # SmoothQuant
            'smooth_quant_args': {'alpha': 0.5}
        }
    )
    # Run quantization
    q_model = quantization.fit(
        model=model_path,
        conf=config,
        calib_dataloader=dataloader,
        eval_dataloader=dataloader
    )
    # Save the quantized model
    q_model.save(output_path)
    print(f"Neural Compressor quantized model saved: {output_path}")
    return output_path
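The dummy dataset above only exercises the pipeline; meaningful INT8 calibration needs real images. A sketch of a drop-in replacement, assuming Neural Compressor's (input, label) dataset convention (verify against your installed version):
python
import cv2
import numpy as np

class RealImageDataset:
    """Serves preprocessed calibration images as (tensor, label) pairs"""
    def __init__(self, image_paths, input_shape=(640, 640)):
        self.image_paths = image_paths
        self.input_shape = input_shape

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        img = cv2.imread(self.image_paths[idx])
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        img = cv2.resize(img, (self.input_shape[1], self.input_shape[0]))
        img = (img / 255.0).astype(np.float32).transpose(2, 0, 1)
        return img, 0  # the label is unused during calibration

# dataloader = DataLoader(framework='onnxruntime', dataset=RealImageDataset(paths))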
3. FP16 Conversion (Mixed Precision)
python
def convert_to_fp16(model_path, output_path):
    """
    Convert a model to FP16 precision
    """
    import onnx
    from onnxconverter_common import float16
    # Load the model
    model = onnx.load(model_path)
    # Convert to FP16
    model_fp16 = float16.convert_float_to_float16(
        model,
        keep_io_types=True,                  # keep inputs/outputs in FP32
        op_block_list=['NonMaxSuppression']  # keep certain ops in FP32
    )
    # Save the model
    onnx.save(model_fp16, output_path)
    print(f"FP16 model saved: {output_path}")
    return output_path
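FP16 conversion is usually near-lossless, but a quick parity check against the FP32 model is still worthwhile. A minimal sketch (assumes a yolov8n_fp16.onnx produced by the function above; since keep_io_types=True, both models accept FP32 input):
python
import numpy as np
import onnxruntime as ort

dummy = np.random.rand(1, 3, 640, 640).astype(np.float32)
for path in ('yolov8n.onnx', 'yolov8n_fp16.onnx'):
    sess = ort.InferenceSession(path, providers=['CPUExecutionProvider'])
    out = sess.run(None, {sess.get_inputs()[0].name: dummy})[0]
    print(f"{path}: output mean {out.mean():.6f}")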
IV. Deploying the Quantized Model
1. ONNX Runtime Inference Engine
python
import onnxruntime as ort
import numpy as np
import cv2
from typing import List, Tuple, Dict

class YOLOv8ONNXInference:
    """YOLOv8 ONNX inference engine"""
    def __init__(self, model_path: str, providers=None):
        """
        Initialize the inference engine.
        Args:
            model_path: path to the ONNX model
            providers: list of execution providers
        """
        if providers is None:
            # Pick the best available provider automatically
            available_providers = ort.get_available_providers()
            providers = []
            # Priority: TensorRT > CUDA > CPU
            if 'TensorrtExecutionProvider' in available_providers:
                providers.append('TensorrtExecutionProvider')
            if 'CUDAExecutionProvider' in available_providers:
                providers.append('CUDAExecutionProvider')
            providers.append('CPUExecutionProvider')
        # Session options
        session_options = ort.SessionOptions()
        session_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
        session_options.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL
        # Create the inference session
        self.session = ort.InferenceSession(
            model_path,
            sess_options=session_options,
            providers=providers
        )
        # Cache input/output metadata
        self.input_name = self.session.get_inputs()[0].name
        self.output_name = self.session.get_outputs()[0].name
        self.input_shape = self.session.get_inputs()[0].shape
        self.output_shape = self.session.get_outputs()[0].shape
        print("Model loaded:")
        print(f"  input name: {self.input_name}, shape: {self.input_shape}")
        print(f"  output name: {self.output_name}, shape: {self.output_shape}")
        print(f"  execution providers: {self.session.get_providers()}")
    def preprocess(self, image: np.ndarray, target_size: Tuple[int, int] = (640, 640)) -> np.ndarray:
        """
        Preprocess an image.
        Args:
            image: input image (H, W, C), BGR as read by cv2
            target_size: target size (H, W)
        Returns:
            preprocessed tensor (1, C, H, W)
        """
        # Resize; note cv2.resize expects (W, H)
        img_resized = cv2.resize(image, (target_size[1], target_size[0]))
        # BGR to RGB (if applicable)
        if len(img_resized.shape) == 3 and img_resized.shape[2] == 3:
            img_resized = cv2.cvtColor(img_resized, cv2.COLOR_BGR2RGB)
        # Normalize to 0-1
        img_normalized = img_resized / 255.0
        # Convert to CHW layout
        img_chw = np.transpose(img_normalized, (2, 0, 1))
        # Add the batch dimension
        img_batch = np.expand_dims(img_chw, axis=0).astype(np.float32)
        return img_batch
    def postprocess(self, outputs: np.ndarray,
                    confidence_threshold: float = 0.25,
                    iou_threshold: float = 0.45) -> List[Dict]:
        """
        Post-process the raw model output.
        YOLOv8 exports produce [batch, 4 + num_classes, num_anchors]
        (e.g. [1, 84, 8400] for the 80-class COCO models); unlike YOLOv5
        there is no separate objectness score.
        Args:
            outputs: model outputs
            confidence_threshold: confidence threshold
            iou_threshold: IoU threshold for NMS
        Returns:
            list of detection dicts
        """
        detections = []
        output = outputs[0][0]           # first batch -> (4 + nc, num_anchors)
        output = output.transpose(1, 0)  # -> (num_anchors, 4 + nc)
        for detection in output:
            # Box center/size followed by per-class scores
            x, y, w, h = detection[:4]
            class_scores = detection[4:]
            class_id = int(np.argmax(class_scores))
            confidence = float(class_scores[class_id])
            # Filter by confidence
            if confidence < confidence_threshold:
                continue
            # Convert center xywh to corner xyxy (in network input coordinates)
            x1 = x - w / 2
            y1 = y - h / 2
            x2 = x + w / 2
            y2 = y + h / 2
            detections.append({
                'bbox': [x1, y1, x2, y2],
                'confidence': confidence,
                'class_id': class_id
            })
        # Non-maximum suppression
        if detections and iou_threshold > 0:
            detections = self.non_max_suppression(detections, iou_threshold)
        return detections
    def non_max_suppression(self, detections: List[Dict], iou_threshold: float) -> List[Dict]:
        """Non-maximum suppression"""
        if not detections:
            return []
        # Sort by confidence, highest first
        detections.sort(key=lambda x: x['confidence'], reverse=True)
        keep = []
        while detections:
            # Take the highest-confidence detection
            best = detections.pop(0)
            keep.append(best)
            # Compare against the remaining detections
            i = 0
            while i < len(detections):
                iou = self.calculate_iou(best['bbox'], detections[i]['bbox'])
                # Drop detections that overlap too much
                if iou > iou_threshold:
                    detections.pop(i)
                else:
                    i += 1
        return keep
    def calculate_iou(self, box1: List[float], box2: List[float]) -> float:
        """Compute the IoU of two xyxy boxes"""
        # Intersection rectangle
        x1 = max(box1[0], box2[0])
        y1 = max(box1[1], box2[1])
        x2 = min(box1[2], box2[2])
        y2 = min(box1[3], box2[3])
        if x2 < x1 or y2 < y1:
            return 0.0
        intersection = (x2 - x1) * (y2 - y1)
        # Union area
        area1 = (box1[2] - box1[0]) * (box1[3] - box1[1])
        area2 = (box2[2] - box2[0]) * (box2[3] - box2[1])
        union = area1 + area2 - intersection
        return intersection / union if union > 0 else 0.0
    def infer(self, image: np.ndarray) -> List[Dict]:
        """
        Run inference on a single image.
        Args:
            image: input image
        Returns:
            detection results
        """
        # Preprocess
        input_tensor = self.preprocess(image)
        # Run the model
        outputs = self.session.run(
            [self.output_name],
            {self.input_name: input_tensor}
        )
        # Postprocess
        detections = self.postprocess(outputs)
        # Rescale boxes to the original image size
        orig_h, orig_w = image.shape[:2]
        input_h, input_w = self.input_shape[2], self.input_shape[3]
        for det in detections:
            bbox = det['bbox']
            # Scale back to original dimensions
            bbox[0] = bbox[0] * orig_w / input_w
            bbox[1] = bbox[1] * orig_h / input_h
            bbox[2] = bbox[2] * orig_w / input_w
            bbox[3] = bbox[3] * orig_h / input_h
            det['bbox'] = [int(coord) for coord in bbox]
        return detections
    def benchmark(self, image: np.ndarray, warmup=10, iterations=100):
        """
        Benchmark inference latency.
        Args:
            image: test image
            warmup: number of warmup iterations
            iterations: number of timed iterations
        """
        import time
        # Warm up
        print("Warming up...")
        for _ in range(warmup):
            self.infer(image)
        # Timed runs
        print(f"Running {iterations} inference iterations...")
        times = []
        for i in range(iterations):
            start_time = time.perf_counter()
            self.infer(image)
            end_time = time.perf_counter()
            times.append((end_time - start_time) * 1000)  # convert to milliseconds
            if (i + 1) % 10 == 0:
                print(f"Completed {i + 1}/{iterations} iterations")
        # Aggregate statistics
        avg_time = np.mean(times)
        min_time = np.min(times)
        max_time = np.max(times)
        std_time = np.std(times)
        fps = 1000 / avg_time
        print("\nBenchmark results:")
        print(f"  mean latency: {avg_time:.2f} ms")
        print(f"  min latency:  {min_time:.2f} ms")
        print(f"  max latency:  {max_time:.2f} ms")
        print(f"  std dev:      {std_time:.2f} ms")
        print(f"  FPS:          {fps:.2f}")
        return {
            'avg_time_ms': avg_time,
            'min_time_ms': min_time,
            'max_time_ms': max_time,
            'std_time_ms': std_time,
            'fps': fps
        }
# Usage example
def test_onnx_inference():
    # Initialize the inference engine
    detector = YOLOv8ONNXInference('yolov8n_quantized.onnx')
    # Read the image
    image = cv2.imread('test.jpg')
    # Run inference
    detections = detector.infer(image)
    # Print the results
    print(f"Detected {len(detections)} objects:")
    for i, det in enumerate(detections):
        print(f"{i+1}. class: {det['class_id']}, confidence: {det['confidence']:.2f}, "
              f"bbox: {det['bbox']}")
    # Benchmark
    stats = detector.benchmark(image, warmup=5, iterations=50)
    # Visualize the results
    for det in detections:
        x1, y1, x2, y2 = det['bbox']
        cv2.rectangle(image, (x1, y1), (x2, y2), (0, 255, 0), 2)
        label = f"Class {det['class_id']}: {det['confidence']:.2f}"
        cv2.putText(image, label, (x1, y1 - 10),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)
    cv2.imwrite('result.jpg', image)
    print("Result saved to result.jpg")

if __name__ == "__main__":
    test_onnx_inference()
2. TensorRT Deployment (Optional)
python
def convert_onnx_to_tensorrt(onnx_path, trt_path, precision='fp16'):
    """
    Convert an ONNX model to a TensorRT engine.
    Note: this uses the TensorRT 7.x/8.0-era builder API; newer releases
    replace max_workspace_size with config.set_memory_pool_limit and
    build_engine with build_serialized_network.
    Args:
        onnx_path: path to the ONNX model
        trt_path: path to save the TensorRT engine
        precision: precision ('fp32', 'fp16', 'int8')
    """
    import tensorrt as trt
    TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
    # Create the builder
    builder = trt.Builder(TRT_LOGGER)
    # Create the network definition (explicit batch)
    network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
    # Create the ONNX parser
    parser = trt.OnnxParser(network, TRT_LOGGER)
    # Parse the ONNX model
    with open(onnx_path, 'rb') as model:
        if not parser.parse(model.read()):
            print("Parsing failed:")
            for error in range(parser.num_errors):
                print(parser.get_error(error))
            return False
    # Builder configuration
    config = builder.create_builder_config()
    config.max_workspace_size = 1 << 30  # 1 GB
    # Set precision
    if precision == 'fp16':
        config.set_flag(trt.BuilderFlag.FP16)
    elif precision == 'int8':
        config.set_flag(trt.BuilderFlag.INT8)
        # INT8 additionally requires a calibrator implementing
        # trt.IInt8EntropyCalibrator2:
        # config.int8_calibrator = calibrator
    # Build the engine
    engine = builder.build_engine(network, config)
    if engine is None:
        print("Engine build failed")
        return False
    # Serialize and save the engine
    with open(trt_path, 'wb') as f:
        f.write(engine.serialize())
    print(f"TensorRT engine saved to: {trt_path}")
    return True
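If you only need TensorRT acceleration rather than a standalone engine file, an alternative is to let ONNX Runtime's TensorRT execution provider build and cache the engine for you. A minimal sketch using documented provider options:
python
import onnxruntime as ort

providers = [
    ('TensorrtExecutionProvider', {
        'trt_fp16_enable': True,          # build FP16 kernels
        'trt_engine_cache_enable': True,  # reuse built engines across runs
        'trt_engine_cache_path': './trt_cache',
    }),
    'CUDAExecutionProvider',  # fallback for unsupported subgraphs
    'CPUExecutionProvider',
]
session = ort.InferenceSession('yolov8n.onnx', providers=providers)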
V. Performance Optimization Strategies
1. IO Binding Optimization
python
class OptimizedYOLOv8ONNXInference(YOLOv8ONNXInference):
    """YOLOv8 ONNX inference engine with IO binding"""
    def __init__(self, model_path: str, providers=None):
        super().__init__(model_path, providers)
        # Enable IO binding
        self.io_binding = self.session.io_binding()
        # Preallocate buffers
        self._preallocate_buffers()

    def _preallocate_buffers(self):
        """Preallocate the output buffer (assumes a static output shape)"""
        output_info = self.session.get_outputs()[0]
        # Allocate an OrtValue matching the full output shape
        self.output_buffer = ort.OrtValue.ortvalue_from_numpy(
            np.zeros(tuple(output_info.shape), dtype=np.float32)
        )

    def infer_optimized(self, image: np.ndarray) -> List[Dict]:
        """Inference with pre-bound input/output buffers"""
        # Preprocess
        input_tensor = self.preprocess(image)
        # Wrap the input in an OrtValue
        input_ortvalue = ort.OrtValue.ortvalue_from_numpy(input_tensor)
        # Clear previous bindings
        self.io_binding.clear_binding_inputs()
        self.io_binding.clear_binding_outputs()
        # Bind input and output directly to the OrtValues
        self.io_binding.bind_ortvalue_input(self.input_name, input_ortvalue)
        self.io_binding.bind_ortvalue_output(self.output_name, self.output_buffer)
        # Run inference
        self.session.run_with_iobinding(self.io_binding)
        # Copy outputs back to CPU
        outputs = self.io_binding.copy_outputs_to_cpu()
        # Postprocess (coordinates remain in network input space)
        detections = self.postprocess(outputs)
        return detections
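Usage mirrors the base class; note that infer_optimized returns boxes in network input coordinates, since it skips the rescaling step of infer. A minimal sketch:
python
import cv2

detector = OptimizedYOLOv8ONNXInference('yolov8n_quantized.onnx')
image = cv2.imread('test.jpg')
detections = detector.infer_optimized(image)
print(f"{len(detections)} detections via IO binding")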
2. Batch Processing Optimization
python
class YOLOv8BatchInference:
    """YOLOv8 batched inference (requires a model exported with a dynamic batch axis)"""
    def __init__(self, model_path: str, batch_size: int = 4):
        self.batch_size = batch_size
        self.inference_engine = YOLOv8ONNXInference(model_path)
        self.batch_buffer = []

    def add_to_batch(self, image: np.ndarray) -> bool:
        """Add an image to the batch buffer; returns False when the buffer is full"""
        if len(self.batch_buffer) >= self.batch_size:
            return False
        # Preprocess and buffer
        processed = self.inference_engine.preprocess(image)
        self.batch_buffer.append((image, processed))
        return True

    def process_batch(self):
        """Run inference on the buffered batch"""
        if not self.batch_buffer:
            return []
        # Separate originals from preprocessed tensors
        batch_images = [item[0] for item in self.batch_buffer]
        batch_tensors = [item[1] for item in self.batch_buffer]
        # Concatenate along the batch axis
        batch_tensor = np.concatenate(batch_tensors, axis=0)
        # Run batched inference
        outputs = self.inference_engine.session.run(
            [self.inference_engine.output_name],
            {self.inference_engine.input_name: batch_tensor}
        )
        # Postprocess each image's slice of the output
        all_detections = []
        for i, original_image in enumerate(batch_images):
            # Slice out this image's output (keep the batch dimension)
            single_output = outputs[0][i:i+1]
            # Postprocess
            detections = self.inference_engine.postprocess([single_output])
            # Rescale boxes (assumes a 640x640 network input)
            orig_h, orig_w = original_image.shape[:2]
            for det in detections:
                bbox = det['bbox']
                bbox[0] = bbox[0] * orig_w / 640
                bbox[1] = bbox[1] * orig_h / 640
                bbox[2] = bbox[2] * orig_w / 640
                bbox[3] = bbox[3] * orig_h / 640
                det['bbox'] = [int(coord) for coord in bbox]
            all_detections.append(detections)
        # Clear the buffer
        self.batch_buffer.clear()
        return all_detections
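A usage sketch (assumes an ONNX model exported with a dynamic batch axis, as in section II.2, and four hypothetical image paths):
python
import cv2

image_paths = ['img1.jpg', 'img2.jpg', 'img3.jpg', 'img4.jpg']
batcher = YOLOv8BatchInference('yolov8n_dynamic.onnx', batch_size=4)
# Fill the buffer, then run one batched forward pass
for path in image_paths:
    batcher.add_to_batch(cv2.imread(path))
results = batcher.process_batch()
for path, dets in zip(image_paths, results):
    print(f"{path}: {len(dets)} detections")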
VI. Complete Deployment Script
python
#!/usr/bin/env python3
"""
YOLOv8 ONNX quantized model deployment script
"""
import argparse
import os
import json
import time
from pathlib import Path

def main():
    parser = argparse.ArgumentParser(description='YOLOv8 ONNX quantized deployment')
    parser.add_argument('--model', type=str, required=True, help='path to the YOLOv8 model (.pt)')
    parser.add_argument('--output', type=str, default='output', help='output directory')
    parser.add_argument('--calibration-data', type=str, help='calibration data directory')
    parser.add_argument('--quantize', action='store_true', help='enable quantization')
    parser.add_argument('--precision', choices=['fp32', 'fp16', 'int8'], default='int8', help='quantization precision')
    parser.add_argument('--batch-size', type=int, default=1, help='batch size')
    parser.add_argument('--test-image', type=str, help='path to a test image')
    parser.add_argument('--benchmark', action='store_true', help='run the benchmark')
    args = parser.parse_args()
    # Create the output directory
    os.makedirs(args.output, exist_ok=True)
    # 1. Export the ONNX model
    print("Step 1: exporting the ONNX model...")
    from ultralytics import YOLO
    model = YOLO(args.model)
    onnx_path = os.path.join(args.output, 'model.onnx')
    model.export(
        format='onnx',
        imgsz=640,
        opset=12,
        simplify=True,
        dynamic=args.batch_size != 1,  # dynamic is a boolean flag in ultralytics
        half=False,
        device='cpu'
    )
    # Move the exported model into the output directory
    model_name = Path(args.model).stem
    exported_onnx = f'{model_name}.onnx'
    if os.path.exists(exported_onnx):
        os.rename(exported_onnx, onnx_path)
    print(f"ONNX model exported to: {onnx_path}")
    # 2. Quantize (if enabled)
    if args.quantize:
        print(f"\nStep 2: {args.precision.upper()} quantization...")
        if args.precision == 'fp16':
            # FP16 conversion
            quantized_path = os.path.join(args.output, 'model_fp16.onnx')
            convert_to_fp16(onnx_path, quantized_path)
        elif args.precision == 'int8':
            # INT8 quantization requires calibration data
            if not args.calibration_data:
                print("Error: INT8 quantization requires a calibration data directory")
                return
            # Collect calibration images
            calibration_images = []
            for ext in ['*.jpg', '*.jpeg', '*.png', '*.bmp']:
                calibration_images.extend(Path(args.calibration_data).glob(ext))
            calibration_images = [str(p) for p in calibration_images[:100]]  # at most 100 images
            if not calibration_images:
                print("Error: no images found in the calibration directory")
                return
            print(f"Calibrating with {len(calibration_images)} images...")
            # INT8 quantization
            quantized_path = os.path.join(args.output, 'model_int8.onnx')
            quantize_yolov8_onnx(onnx_path, calibration_images, quantized_path)
        else:
            # FP32, no quantization
            quantized_path = onnx_path
    else:
        quantized_path = onnx_path
    # 3. Test inference
    if args.test_image:
        print("\nStep 3: test inference...")
        # Initialize the inference engine
        inference_engine = YOLOv8ONNXInference(quantized_path)
        # Load the test image
        import cv2
        image = cv2.imread(args.test_image)
        if image is None:
            print(f"Error: cannot read image {args.test_image}")
        else:
            # Run inference
            start_time = time.time()
            detections = inference_engine.infer(image)
            inference_time = (time.time() - start_time) * 1000  # milliseconds
            print(f"Inference time: {inference_time:.2f} ms")
            print(f"Detected {len(detections)} objects:")
            for i, det in enumerate(detections):
                print(f"  {i+1}. class: {det['class_id']}, "
                      f"confidence: {det['confidence']:.3f}, "
                      f"bbox: {det['bbox']}")
            # Visualize the results
            for det in detections:
                x1, y1, x2, y2 = det['bbox']
                cv2.rectangle(image, (x1, y1), (x2, y2), (0, 255, 0), 2)
                label = f"Class {det['class_id']}: {det['confidence']:.2f}"
                cv2.putText(image, label, (x1, y1 - 10),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)
            result_path = os.path.join(args.output, 'result.jpg')
            cv2.imwrite(result_path, image)
            print(f"Result saved to: {result_path}")
    # 4. Benchmark
    if args.benchmark:
        print("\nStep 4: benchmark...")
        # Create a synthetic test image
        import numpy as np
        test_image = np.random.randint(0, 255, (640, 640, 3), dtype=np.uint8)
        # Initialize the inference engine
        inference_engine = YOLOv8ONNXInference(quantized_path)
        # Run the benchmark
        stats = inference_engine.benchmark(
            test_image,
            warmup=10,
            iterations=100
        )
        # Save the benchmark results
        benchmark_path = os.path.join(args.output, 'benchmark.json')
        with open(benchmark_path, 'w') as f:
            json.dump(stats, f, indent=2)
        print(f"Benchmark results saved to: {benchmark_path}")
    print("\nDeployment complete!")
    print(f"Model file: {quantized_path}")
    print(f"Output directory: {args.output}")

if __name__ == "__main__":
    # Import the helpers defined in the earlier sections (saved as local modules)
    from export_yolov8_to_onnx import export_yolov8_to_onnx
    from quantization import quantize_yolov8_onnx, convert_to_fp16
    from inference import YOLOv8ONNXInference
    main()
VII. Deployment Notes
1. Performance Comparison Table
| Model Type | Precision | Speed | Memory | Typical Use Case |
|---|---|---|---|---|
| FP32 | 32-bit float | slow | high | development and debugging |
| FP16 | 16-bit float | fast | medium | edge GPUs |
| INT8 | 8-bit integer | fastest | low | mobile/embedded |
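File size on disk roughly tracks the memory column: INT8 weights are about 4x smaller than FP32, FP16 about 2x. A quick check (file names follow the examples above):
python
import os

for path in ('yolov8n.onnx', 'yolov8n_fp16.onnx', 'yolov8n_quantized.onnx'):
    if os.path.exists(path):
        print(f"{path}: {os.path.getsize(path) / 1024 / 1024:.1f} MB")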
2. Troubleshooting Common Issues
python
import onnx
import onnxruntime as ort

# 1. Model compatibility issues
def check_model_compatibility(model_path):
    """Check opset version, operators used, and overall model validity"""
    model = onnx.load(model_path)
    # Check the opset version
    print(f"ONNX opset version: {model.opset_import[0].version}")
    # List all operator types used by the model
    ops = {node.op_type for node in model.graph.node}
    print(f"Operators used by the model: {sorted(ops)}")
    # Validate the model
    try:
        onnx.checker.check_model(model)
        print("Model is valid")
    except Exception as e:
        print(f"Model is invalid: {e}")

# 2. Memory optimization
def optimize_memory_usage(session_options):
    """Configure session options for a small memory footprint"""
    # Enable arena/pattern/reuse memory optimizations
    session_options.enable_cpu_mem_arena = True
    session_options.enable_mem_pattern = True
    session_options.enable_mem_reuse = True
    # Limit threading to reduce per-thread memory overhead
    session_options.intra_op_num_threads = 1
    session_options.inter_op_num_threads = 1
    return session_options

# 3. Performance optimization
def optimize_performance(session_options):
    """Configure session options for throughput"""
    # Enable all graph optimizations
    session_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
    # Sequential execution is usually best for single-stream inference
    session_options.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL
    # Keep weight prepacking enabled ('1' would disable it)
    session_options.add_session_config_entry('session.disable_prepacking', '0')
    return session_options
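These helpers plug straight into session creation; pick one profile deliberately, since the thread limits set by optimize_memory_usage also cap throughput. A minimal sketch:
python
import onnxruntime as ort

session_options = ort.SessionOptions()
optimize_performance(session_options)  # or optimize_memory_usage(session_options)
session = ort.InferenceSession('yolov8n_quantized.onnx',
                               sess_options=session_options,
                               providers=['CPUExecutionProvider'])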
3. Deployment Checklist
python
def deployment_checklist(model_path, output_dir):
    """Deployment checklist"""
    checklist = {
        "Model format": {
            "Checkpoint": "Is the model in .pt format?",
            "ONNX export": "Has the model been exported to ONNX?",
            "Opset version": "Is opset 12 or higher used?",
            "Dynamic shapes": "Does it support the required input sizes?"
        },
        "Quantization prep": {
            "Calibration data": "Are enough calibration images available?",
            "Quantization tools": "Are the quantization tools installed?",
            "Precision target": "Has the quantization precision been decided (FP16/INT8)?"
        },
        "Deployment environment": {
            "ONNX Runtime": "Is the correct version installed?",
            "CUDA support": "Is GPU acceleration enabled?",
            "Memory check": "Is there enough memory?",
            "Compatibility": "Have operator compatibility issues been checked?"
        },
        "Performance testing": {
            "Benchmark": "Has a speed test been run?",
            "Accuracy test": "Has post-quantization accuracy been validated?",
            "Memory test": "Has memory usage been measured?",
            "Multithreading": "Has multithreaded performance been tested?"
        }
    }
    # Walk the checklist
    results = {}
    for category, items in checklist.items():
        results[category] = {}
        for item, description in items.items():
            # Plug concrete checks in here as needed
            results[category][item] = {
                "description": description,
                "status": "pending"
            }
    return results
This YOLOv8 ONNX quantization deployment plan covers the full pipeline from model export through quantization to deployment, and can be adjusted and tuned to your specific requirements.