Vlm-RT-DETR网络模型部署推理

训练部分这里跳过：和 YOLO 的训练流程一样，把 model=yolov8n 换成 rtdetr-l 即可。

代码层

python 复制代码

import cv2 as cv
import numpy as np
import openvino as ov
import time

# def onnx_rtdetr():
#     # Load a COCO-pretrained RT-DETR-l model
#     model = RTDETR("rtdetr-l.pt")
#
#     # Display model information (optional)
#     model.info()
#     model.export(format="onnx", imgsz=640)

def format_yolov8(frame):
    """Pad a 3-channel image to a square by extending it with black pixels.

    The input is copied into the top-left corner of a zero-filled uint8
    canvas whose side equals the longer edge of the input, so the image
    content keeps its aspect ratio and its pixel coordinates.

    Args:
        frame: Image array of shape (height, width, channels).

    Returns:
        A new (side, side, 3) uint8 array containing ``frame`` at the origin.
    """
    height, width, _channels = frame.shape
    side = max(width, height)
    canvas = np.zeros((side, side, 3), dtype=np.uint8)
    canvas[:height, :width] = frame
    return canvas

def read_all_lines(filename):
    """Return the lines of a text file with surrounding whitespace removed.

    Args:
        filename: Path of the text file to read.

    Returns:
        A list with one stripped string per line. An empty list is returned
        (after printing a message) when the file is missing or any other
        error occurs while reading it.
    """
    try:
        with open(filename, 'r') as handle:
            stripped = list(map(str.strip, handle))
    except FileNotFoundError:
        print(f"文件 '{filename}' 未找到。")
        return []
    except Exception as err:
        print(f"读取文件时发生错误: {err}")
        return []
    return stripped


# Class names, one per line; empty when ./classes.txt cannot be read.
class_list = read_all_lines("./classes.txt")

def redetr_infer_demo(onnxpath="./rtdetr_pill_best.onnx", image_path="pill_demo.png",
                      conf_threshold=0.25, input_size=640):
    """Run RT-DETR on one image with OpenVINO and display the annotated result.

    The ONNX model is compiled for the CPU device. The image is padded to a
    square, resized to ``input_size`` x ``input_size`` and passed through the
    network. Every output row whose value at index 4 exceeds
    ``conf_threshold`` is drawn on the original image as a labelled box,
    followed by an FPS overlay. The window stays open until a key is pressed.

    Args:
        onnxpath: Path of the ONNX model to compile.
        image_path: Path of the image to run detection on.
        conf_threshold: Rows whose score is not above this value are skipped.
        input_size: Side length of the square network input; it has to match
            the size the model was exported with (640 in the original demo).

    Raises:
        FileNotFoundError: If OpenCV cannot read ``image_path``.
    """
    colors = [(255, 255, 0), (0, 255, 0), (0, 255, 255), (255, 0, 0)]

    core = ov.Core()
    for device in core.available_devices:
        print(device)

    compiled_model = core.compile_model(model=onnxpath, device_name="CPU")
    output_layer = compiled_model.output(0)
    print("out name", output_layer.shape)

    # cv.imread signals failure by returning None instead of raising, so
    # check explicitly to fail with a clear message.
    frame = cv.imread(image_path)
    if frame is None:
        raise FileNotFoundError(f"cannot read image: {image_path!r}")
    bgr = format_yolov8(frame)
    img_h, img_w = bgr.shape[:2]

    # The timed section covers preprocessing, inference and drawing.
    start = time.time()
    image = cv.dnn.blobFromImage(bgr, 1 / 255.0, (input_size, input_size), swapRB=True, crop=False)

    res = compiled_model([image])[output_layer]  # noted as 1x300x6 by the original author
    rows = np.squeeze(res, 0)

    # NOTE(review): the demo is single-class. Index 4 is used directly as the
    # score and the class id is fixed to 0; a multi-class model would need an
    # argmax over row[4:] instead -- confirm against the exported model.
    class_id = 0
    color = colors[class_id % len(colors)]

    for row in rows:
        conf = row[4]
        if not conf > conf_threshold:
            continue

        # The box is (center x, center y, width, height) expressed as
        # fractions of the network input, so multiplying by the padded image
        # size maps it straight to pixels. Horizontal values use the width,
        # vertical values the height. Scaling is done in floating point and
        # truncated once at the end to avoid compounding rounding errors.
        x, y, w, h = (float(v) for v in row[:4])
        left = int((x - 0.5 * w) * img_w)
        top = int((y - 0.5 * h) * img_h)
        width = int(w * img_w)
        height = int(h * img_h)
        box = [left, top, width, height]

        # Padding is added only to the right/bottom, so coordinates in the
        # padded image are valid in the original frame as well.
        cv.rectangle(frame, box, color, 2)
        cv.rectangle(frame, (box[0], box[1] - 20), (box[0] + box[2], box[1]), color, -1)
        cv.putText(frame, "pill" + (" %.2f" % conf), (box[0], box[1] - 7), cv.FONT_HERSHEY_SIMPLEX, .5, (0, 0, 0))

    elapsed = time.time() - start
    # Guard against a zero interval on coarse clocks.
    fps = 1 / elapsed if elapsed > 0 else 0.0
    fps_label = "FPS: %.2f" % fps
    cv.putText(frame, fps_label, (20, 45), cv.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)

    cv.imshow("RTDETR Object Detection + OpenVINNO2025.1", frame)
    cv.waitKey(0)
    cv.destroyAllWindows()

# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    redetr_infer_demo()

解析

这段代码的核心是：利用 OpenVINO 工具包加载预训练的 RT-DETR 目标检测模型（ONNX 格式），对指定的药片图片进行推理，解析检测结果后绘制检测框、置信度和推理 FPS，并通过 OpenCV 显示最终的检测效果。整个流程涵盖模型加载、图像预处理、模型推理、结果后处理、可视化展示五个核心环节。

1. 导入依赖库

python 复制代码

import cv2 as cv
import numpy as np
import openvino as ov
import time

cv2：OpenCV 库，负责图像读取、预处理、绘制检测框、图像显示等核心视觉操作。
numpy：数值计算库，处理模型推理的张量数据（如维度变换、坐标计算）。
openvino：英特尔的深度学习推理优化库，负责加载和编译模型、加速推理。
time：用于统计推理耗时，计算 FPS（每秒帧数）。

2. 图像格式适配函数（format_yolov8）

python 复制代码

def format_yolov8(frame):
    """Pad an image to a square canvas by filling the extra area with black."""
    row, col, _ = frame.shape  # height (row), width (col) and channel count (_) of the source image
    _max = max(col, row)       # longer side; becomes the edge length of the square canvas
    result = np.zeros((_max, _max, 3), np.uint8)  # all-black square canvas of size _max x _max
    result[0:row, 0:col] = frame  # paste the source image into the top-left corner; the rest stays black
    return result

核心作用：RT-DETR/YOLOv8 类模型要求输入为正方形尺寸（如 640×640），该函数通过 "补黑边" 将任意尺寸的图像转为正方形，保证图像比例不被拉伸。
输入：原始图像（OpenCV 读取的是 BGR 格式，通道顺序不影响此处的尺寸处理）；输出：正方形尺寸的补边图像。

3. 类别文件读取函数（read_all_lines）

python 复制代码

def read_all_lines(filename):
    """Read a text file and return its lines with surrounding whitespace stripped.

    Returns an empty list (after printing a message) if the file is missing
    or any other error occurs while reading.
    """
    try:
        with open(filename, 'r') as file:
            lines = file.readlines()
            return [line.strip() for line in lines]  # strip newline and surrounding whitespace from every line
    except FileNotFoundError:
        print(f"文件 '{filename}' 未找到。")
        return []
    except Exception as e:
        print(f"读取文件时发生错误: {e}")
        return []
class_list = read_all_lines("./classes.txt")  # load the class-name list

核心作用：读取存储目标检测类别的文本文件（如classes.txt中可能写着pill），返回清洗后的类别列表。
异常处理：覆盖 "文件不存在" 和 "其他读取错误"，避免程序崩溃；
注意：当前代码中class_list并未实际使用（后续硬编码了类别为pill），属于预留的可扩展逻辑。

5. 核心推理函数（redetr_infer_demo）

这是代码的核心，我们拆分为多个子环节解析：

5.1 初始化基础变量

python 复制代码

def redetr_infer_demo():
    colors = [(255, 255, 0), (0, 255, 0), (0, 255, 255), (255, 0, 0)]  # box colours; read as OpenCV BGR tuples these are cyan, green, yellow, blue

    core = ov.Core()  # create the OpenVINO Core object
    for device in core.available_devices:
        print(device)  # print each device name OpenVINO reports as available

colors：为不同类别分配不同颜色，通过class_id % len(colors)循环使用；
ov.Core()：OpenVINO 的核心入口，负责管理模型、设备、推理上下文。

5.2 加载并编译 ONNX 模型

python 复制代码

    # Path of the ONNX model (presumably the custom-trained pill-detection RT-DETR export)
    onnxpath = "./rtdetr_pill_best.onnx"
    compiled_model = core.compile_model(model=onnxpath, device_name="CPU")  # compile the model for the CPU device
    output_layer = compiled_model.output(0)  # first output of the compiled model
    print("out name", output_layer.shape)  # prints the output shape (the full listing notes it as 1x300x6)

compile_model：将 ONNX 模型编译为适配指定硬件（此处为 CPU）的优化格式，是 OpenVINO 加速推理的核心步骤；
输出层形状说明（如[1, 300, 6]）：
- 1：batch 维度（单张图片）；
- 300：模型预测的最大检测框数量；
- 6：每个检测框的信息。前 4 个值为 [x, y, w, h]（中心坐标、宽高）；从第 5 个值起，代码中被注释掉的 classes_scores = row[4:] 把它们当作各类别得分来解析，而本例直接取 row[4] 作为"药片"类别的置信度。

5.3 图像读取与预处理

python 复制代码

    frame = cv.imread("pill_demo.png")  # read the test image (OpenCV loads it as BGR); NOTE: imread returns None on failure and that is not checked here
    bgr = format_yolov8(frame)  # pad to a square canvas
    img_h, img_w, img_c = bgr.shape  # height / width / channel count of the padded image

    start = time.time()  # start of the timed section
    # Build the network input blob: scale by 1/255, resize to 640x640, swap the R and B channels, no cropping
    image = cv.dnn.blobFromImage(bgr, 1 / 255.0, (640, 640), swapRB=True, crop=False)

cv.imread：OpenCV 默认读取 BGR 格式图像，swapRB=True会将其转为 RGB（匹配模型训练时的通道顺序）；
cv.dnn.blobFromImage：深度学习模型的标准图像预处理方式，输出形状为[1, 3, 640, 640]（batch, channel, height, width）。

5.4 模型推理与结果解析

python 复制代码

    res = compiled_model([image])[output_layer]  # run inference and pick the result of the selected output
    rows = np.squeeze(res, 0)  # drop the leading batch axis so each row is one candidate box
    x_factor = img_w / 640  # horizontal scale: padded image width / network input width
    y_factor = img_h / 640  # vertical scale: padded image height / network input height

compiled_model([image])：传入预处理后的图像张量，返回字典（key 为输出层，value 为推理结果）；
np.squeeze：去除维度为 1 的轴，简化后续遍历逻辑；
缩放因子：代码把模型输出的坐标当作归一化值（0~1），先乘以 640 得到 640×640 输入上的像素坐标，再乘以缩放因子还原到补边后图像的实际尺寸。

5.5 检测框过滤与绘制

python 复制代码

    for r in range(rows.shape[0]):  # iterate over every candidate row of the output
        row = rows[r]  # one candidate: values 0-3 are used as the box, value 4 as the score
        # NOTE: the generic multi-class logic below is commented out; the demo is hard-coded to one class
        # classes_scores = row[4:]  # scores of all classes
        # class_id = np.argmax(classes_scores)  # id of the highest-scoring class
        # conf = classes_scores[class_id]  # confidence of that class
        class_id = 0  # class id fixed to 0 (labelled "pill" when drawn)
        conf = row[4]  # take index 4 directly as the confidence

        if conf>0.25:  # skip low-confidence rows (draw only when confidence > 0.25)
            # Box center and size; they are multiplied by 640 below, i.e. treated as normalized to the network input
            x, y, w, h = row[0].item(), row[1].item(), row[2].item(), row[3].item()
            # Convert the center-based box to a top-left corner, in 640x640 pixels, then scale to the padded image
            left = int((x - 0.5 * w) * 640) * x_factor
            top = int((y - 0.5 * h) * 640) * y_factor
            width = int(w * 640) * x_factor  # box width after scaling
            height = int(h * 640) * y_factor  # box height after scaling
            box = [int(left), int(top), int(width), int(height)]  # (left, top, width, height) as integers

            # Draw the detection box
            color = colors[class_id % len(colors)]  # colour chosen by class id, cycling through the list
            cv.rectangle(frame, box, color, 2)  # box outline, line width 2
            # Filled rectangle above the box as background for the label
            cv.rectangle(frame, (box[0], box[1] - 20), (box[0] + box[2], box[1]), color, -1)
            # Label text: class name plus confidence with two decimals
            cv.putText(frame, "pill" + (" %.2f"%conf), (box[0], box[1] - 7), cv.FONT_HERSHEY_SIMPLEX, .5, (0, 0, 0))

核心逻辑：只保留置信度 > 0.25 的检测框，将模型输出的 "中心坐标 + 宽高" 转为 "左上角坐标 + 宽高"，并缩放回原始图像尺寸；
硬编码说明：原代码注释了通用的多类别判断逻辑，改为只检测 "药片"（class_id=0），适合单类别检测场景；
绘制细节：cv.rectangle的-1参数表示填充矩形（用于文本背景），cv.putText设置字体、大小、颜色。

5.6 计算 FPS 并显示结果

python 复制代码

    end = time.time()  # end of the timed section
    inf_end = end - start  # elapsed seconds since `start`
    fps = 1 / inf_end  # frames per second derived from the elapsed time
    fps_label = "FPS: %.2f" % fps  # format the FPS text
    # Draw the FPS label on the image (position, font, scale, colour, thickness)
    cv.putText(frame, fps_label, (20, 45), cv.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)

    cv.imshow("RTDETR Object Detection + OpenVINNO2025.1", frame)  # show the result window
    cc = cv.waitKey(0)  # block until a key is pressed; the returned key code is not used
    cv.destroyAllWindows()  # close all OpenCV windows

FPS 计算：1/耗时表示每秒能处理的图片数，是衡量速度的核心指标；注意这里的计时从 blobFromImage 预处理开始、到检测框绘制结束，因此统计的是"预处理 + 推理 + 后处理绘制"的整体耗时，而不只是模型推理本身；
cv.waitKey(0)：参数 0 表示 "无限等待按键"，直到用户按任意键才关闭窗口。