python版本:
python
import cv2
import argparse
import numpy as np
import math
import onnxruntime
from typing import Tuple
# COCO-80 class names, in the index order the model's class_id output uses.
class_names = ['person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat', 'traffic light',
               'fire hydrant', 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow',
               'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee',
               'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard',
               'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple',
               'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch',
               'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote', 'keyboard',
               'cell phone', 'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase',
               'scissors', 'teddy bear', 'hair drier', 'toothbrush']
# Create a list of colors for each class where each color is a tuple of 3 integer values.
# The RNG is seeded so the palette is reproducible across runs.
rng = np.random.default_rng(3)
colors = rng.uniform(0, 255, size=(len(class_names), 3))
class Segment:
    """ONNX Runtime wrapper for an end-to-end YOLO instance-segmentation model.

    Output 0 is assumed to be [1, N, 6 + num_coeffs] rows of
    (x1, y1, x2, y2, score, class_id, *mask_coeffs) in letterbox coordinates;
    output 1 holds the mask prototypes. Letterbox padding/scale are recorded
    so boxes and masks can be mapped back to the original image.
    """

    def __init__(self, path, conf_thres=0.7):
        """
        Args:
            path: path to the .onnx model file.
            conf_thres: minimum detection confidence to keep.
        """
        self.conf_threshold = conf_thres
        self.onnx_path = path
        self.session = onnxruntime.InferenceSession(self.onnx_path, providers=['CUDAExecutionProvider', 'CPUExecutionProvider'])
        self.get_input_output_details()
        # Letterbox bookkeeping: padding offsets and scale used to map
        # network coordinates back to the original image.
        self.letterbox_top = 0
        self.letterbox_left = 0
        self.letterbox_scale = 1.0

    def get_input_output_details(self):
        """Cache tensor names and the fixed network input size ([1, C, H, W])."""
        model_inputs = self.session.get_inputs()
        model_outputs = self.session.get_outputs()
        self.input_names = [input_.name for input_ in model_inputs]
        self.output_names = [output.name for output in model_outputs]
        self.input_shape = model_inputs[0].shape
        self.input_height = self.input_shape[2]
        self.input_width = self.input_shape[3]

    def letterbox(self, img: np.ndarray, new_shape: Tuple[int, int]) -> Tuple[np.ndarray, Tuple[int, int], float]:
        """Resize with preserved aspect ratio and pad with 114-gray borders.

        Args:
            img: input image (H, W, C).
            new_shape: target size as (height, width).
        Returns:
            img: the resized + padded image.
            (top, left): the top/left padding offsets.
            r: the uniform scale factor applied.
        """
        shape = img.shape[:2]  # current shape [height, width]
        # Uniform scale so the whole image fits inside new_shape.
        r = min(new_shape[0] / shape[0], new_shape[1] / shape[1])
        # Scaled size as (width, height) — cv2.resize expects (w, h).
        new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r))
        # Leftover space, split evenly on both sides.
        dw, dh = (new_shape[1] - new_unpad[0]) / 2, (new_shape[0] - new_unpad[1]) / 2  # wh padding
        if shape[::-1] != new_unpad:  # resize only when the size actually changes
            img = cv2.resize(img, new_unpad, interpolation=cv2.INTER_LINEAR)
        # The +/-0.1 nudges round any half-pixel remainder onto the bottom/right side.
        top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1))
        left, right = int(round(dw - 0.1)), int(round(dw + 0.1))
        img = cv2.copyMakeBorder(img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=(114, 114, 114))
        return img, (top, left), r

    def prepare_input(self, image):
        """BGR image -> normalized float32 NCHW tensor (letterboxed to net size)."""
        self.img_height, self.img_width = image.shape[:2]
        input_img = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        # Aspect-preserving resize + padding instead of a plain stretch.
        input_img, (self.letterbox_top, self.letterbox_left), self.letterbox_scale = self.letterbox(
            input_img, (self.input_height, self.input_width)
        )
        # Scale input pixel values to 0 to 1.
        input_img = input_img / 255.0
        input_img = input_img.transpose(2, 0, 1)
        input_tensor = input_img[np.newaxis, :, :, :].astype(np.float32)
        return input_tensor

    def inference(self, input_tensor):
        """Run the ONNX session and return the raw outputs list."""
        outputs = self.session.run(self.output_names, {self.input_names[0]: input_tensor})
        return outputs

    def process_box_output_26(self, box_output):
        """Decode the box output and map boxes back to original-image coordinates.

        Args:
            box_output: array [1, N, 6 + num_coeffs] of raw detections.
        Returns:
            boxes (float, original-image xyxy), scores, class_ids, mask coefficients.
        """
        detections = box_output[0]
        boxes = []
        scores = []
        class_ids = []
        mask_predictions = []
        for det in detections:
            x1, y1, x2, y2, score, class_id = det[:6]
            mask_prediction = det[6:]
            if score < self.conf_threshold:
                continue
            # Keep float precision; truncating to int before the scale
            # division below would lose sub-pixel accuracy.
            boxes.append([float(x1), float(y1), float(x2), float(y2)])
            scores.append(float(score))
            class_ids.append(int(class_id))
            mask_predictions.append(mask_prediction)
        boxes = np.array(boxes)
        if len(boxes) > 0:
            # Undo the letterbox: subtract padding first, then divide by scale.
            boxes[:, [0, 2]] -= self.letterbox_left  # x1, x2: remove left padding
            boxes[:, [1, 3]] -= self.letterbox_top   # y1, y2: remove top padding
            boxes = boxes / self.letterbox_scale
        scores = np.array(scores)
        class_ids = np.array(class_ids)
        mask_predictions = np.array(mask_predictions)
        return boxes, scores, class_ids, mask_predictions

    def segment_objects(self, image):
        """Full pipeline: preprocess, infer, decode boxes and build mask maps."""
        input_tensor = self.prepare_input(image)
        outputs = self.inference(input_tensor)
        self.boxes, self.scores, self.class_ids, mask_pred = self.process_box_output_26(outputs[0])
        self.mask_maps = self.process_mask_output(mask_pred, outputs[1])
        return self.boxes, self.scores, self.class_ids, self.mask_maps

    def process_mask_output(self, mask_predictions, mask_output):
        """Combine coefficients with prototypes into full-image binary masks.

        Returns a (num_dets, img_h, img_w) array, or [] when there are no detections.
        """
        if mask_predictions.shape[0] == 0:
            return []
        mask_output = np.squeeze(mask_output)
        num_mask, mask_height, mask_width = mask_output.shape  # CHW
        # Linear combination of prototypes, squashed through sigmoid.
        masks = sigmoid(mask_predictions @ mask_output.reshape((num_mask, -1)))
        masks = masks.reshape((-1, mask_height, mask_width))
        # Scale factors from mask resolution to letterbox (network input) resolution.
        mask2letter_x = self.input_width / mask_width
        mask2letter_y = self.input_height / mask_height
        mask_maps = np.zeros((len(self.boxes), self.img_height, self.img_width))
        # Kernel proportional to the mask->image upscale factor; cv2.blur
        # raises on a zero-sized kernel, so clamp each side to >= 1.
        blur_size = (max(1, int(self.img_width / (mask_width * self.letterbox_scale))),
                     max(1, int(self.img_height / (mask_height * self.letterbox_scale))))
        for i in range(len(self.boxes)):
            # Box in original-image coordinates.
            x1, y1, x2, y2 = self.boxes[i].astype(int)
            # Map back into letterbox coordinates...
            letter_x1 = int((x1 * self.letterbox_scale) + self.letterbox_left)
            letter_y1 = int((y1 * self.letterbox_scale) + self.letterbox_top)
            letter_x2 = int((x2 * self.letterbox_scale) + self.letterbox_left)
            letter_y2 = int((y2 * self.letterbox_scale) + self.letterbox_top)
            # ...and down to mask-prototype coordinates.
            scale_x1 = max(0, int(letter_x1 / mask2letter_x))
            scale_y1 = max(0, int(letter_y1 / mask2letter_y))
            scale_x2 = min(mask_width, int(letter_x2 / mask2letter_x))
            scale_y2 = min(mask_height, int(letter_y2 / mask2letter_y))
            if scale_x1 >= scale_x2 or scale_y1 >= scale_y2:
                continue
            target_w, target_h = x2 - x1, y2 - y1
            # Guard degenerate boxes: cv2.resize rejects zero/negative sizes.
            if target_w <= 0 or target_h <= 0:
                continue
            # Crop the low-res mask to the box and upscale to the box size.
            scale_crop_mask = masks[i][scale_y1:scale_y2, scale_x1:scale_x2]
            crop_mask = cv2.resize(scale_crop_mask, (target_w, target_h), interpolation=cv2.INTER_CUBIC)
            crop_mask = cv2.blur(crop_mask, blur_size)
            crop_mask = (crop_mask > 0.5).astype(np.uint8)
            # Clamp the box to the image and slice the mask to match:
            # assigning the unsliced mask to a clamped region raised a
            # shape-mismatch error whenever the box poked outside the image.
            x1c, y1c = max(0, x1), max(0, y1)
            x2c, y2c = min(self.img_width, x2), min(self.img_height, y2)
            crop_mask = crop_mask[y1c - y1:(y1c - y1) + (y2c - y1c),
                                  x1c - x1:(x1c - x1) + (x2c - x1c)]
            mask_maps[i, y1c:y2c, x1c:x2c] = crop_mask
        return mask_maps

    def rescale_boxes(self, boxes, input_shape, image_shape):
        """Kept for interface compatibility; boxes are already rescaled in process_box_output_26."""
        return boxes
def draw_detections(image, boxes, scores, class_ids, mask_maps=None, mask_alpha=0.5):
    """Render masks, bounding boxes and class labels onto a copy of the image."""
    img_height, img_width = image.shape[:2]
    font_scale = min([img_height, img_width]) * 0.0006
    text_thickness = int(min([img_height, img_width]) * 0.001)
    # Start from the mask-blended canvas, then draw geometry on top.
    canvas = draw_masks(image, boxes, class_ids, mask_alpha, mask_maps)
    for box, score, class_id in zip(boxes, scores, class_ids):
        color = colors[class_id]
        x1, y1, x2, y2 = box.astype(int)
        # Clip the box to the image bounds.
        x1, y1 = max(0, x1), max(0, y1)
        x2, y2 = min(img_width, x2), min(img_height, y2)
        cv2.rectangle(canvas, (x1, y1), (x2, y2), color, 2)
        label = class_names[class_id]
        caption = f'{label} {int(score * 100)}%'
        (tw, th), _ = cv2.getTextSize(text=caption, fontFace=cv2.FONT_HERSHEY_SIMPLEX,
                                      fontScale=font_scale, thickness=text_thickness)
        th = int(th * 1.2)
        # Filled background behind the label, then white text over it.
        cv2.rectangle(canvas, (x1, y1), (x1 + tw, y1 - th), color, -1)
        cv2.putText(canvas, caption, (x1, y1),
                    cv2.FONT_HERSHEY_SIMPLEX, font_scale, (255, 255, 255), text_thickness, cv2.LINE_AA)
    return canvas
def sigmoid(x):
    """Numerically stable logistic function 1 / (1 + exp(-x)).

    The naive form overflows np.exp for large negative inputs (emitting
    RuntimeWarnings); exponentiating only the non-positive magnitude keeps
    every intermediate value in [0, 1].
    """
    z = np.exp(-np.abs(x))
    return np.where(x >= 0, 1.0 / (1.0 + z), z / (1.0 + z))
def draw_masks(image, boxes, class_ids, mask_alpha=0.3, mask_maps=None):
    """Overlay each detection's mask (or filled box) and alpha-blend with the input."""
    overlay = image.copy()
    for i, (box, class_id) in enumerate(zip(boxes, class_ids)):
        color = colors[class_id]
        x1, y1, x2, y2 = box.astype(int)
        # Clip the box to the image bounds.
        x1, y1 = max(0, x1), max(0, y1)
        x2, y2 = min(image.shape[1], x2), min(image.shape[0], y2)
        if mask_maps is None:
            # No pixel masks available: fill the whole box with the class color.
            cv2.rectangle(overlay, (x1, y1), (x2, y2), color, -1)
        else:
            # Blend the class color into the overlay only where the mask is set.
            region_mask = mask_maps[i][y1:y2, x1:x2, np.newaxis]
            region = overlay[y1:y2, x1:x2]
            overlay[y1:y2, x1:x2] = region * (1 - region_mask) + region_mask * color
    return cv2.addWeighted(overlay, mask_alpha, image, 1 - mask_alpha, 0)
if __name__ == '__main__':
    # Build the segmentation runner (loads the ONNX model).
    yoloseg = Segment("weights/yolo26s-seg.onnx", conf_thres=0.5)
    # Load the test image.
    img = cv2.imread("bus.jpg")
    if img is None:
        print("Error: 无法读取图片文件")
    else:
        # Run the full pipeline: preprocess, infer, decode boxes and masks.
        boxes, scores, class_ids, masks = yoloseg.segment_objects(img)
        # Render the detections onto the image.
        combined_img = draw_detections(img, boxes, scores, class_ids, masks)
        # Save the annotated result.
        cv2.imwrite("output.jpg", combined_img)
        print("推理完成,结果已保存为 output.jpg")
C++版本:
cpp
// 1. Must come before any includes: silences MSVC security warnings for strdup and friends
#define _CRT_SECURE_NO_WARNINGS
#include <iostream>
#include <vector>
#include <string>
#include <random>
#include <algorithm>
#include <opencv2/opencv.hpp>
#include <onnxruntime_cxx_api.h>
// 2. Cross-platform strdup (MSVC deprecates the POSIX name in favor of _strdup)
#ifdef _WIN32
#define MY_STRDUP _strdup
#else
#define MY_STRDUP strdup
#endif
using namespace cv;
using namespace std;
// COCO-80 class names, in the index order the model's class_id output uses.
const std::vector<std::string> class_names = {
    "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light",
    "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
    "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
    "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard",
    "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
    "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
    "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard",
    "cell phone", "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase",
    "scissors", "teddy bear", "hair drier", "toothbrush"
};
// 颜色生成
std::vector<cv::Scalar> generateColors(int num_classes) {
std::vector<cv::Scalar> colors;
std::mt19937 rng(3);
std::uniform_int_distribution<int> uni(0, 255);
for (int i = 0; i < num_classes; ++i) {
colors.push_back(cv::Scalar(uni(rng), uni(rng), uni(rng)));
}
return colors;
}
static std::vector<cv::Scalar> colors = generateColors(class_names.size());
// One decoded detection, expressed in original-image coordinates.
struct Detection {
    cv::Rect box;                    // xywh (integer), clipped later at draw time
    float score;                     // confidence (already >= threshold)
    int class_id;                    // index into class_names
    std::vector<float> mask_coeffs;  // 32 mask-prototype coefficients
};
// ONNX-Runtime based YOLO segmentation wrapper (C++ mirror of the Python class).
class Segment {
public:
    Segment(const std::string& model_path, float conf_thres = 0.7);
    // Public destructor so stack instances clean up; frees the strdup'ed
    // tensor-name copies made in the constructor.
    ~Segment() {
        for (auto ptr : input_names) free(ptr);
        for (auto ptr : output_names) free(ptr);
    }
    // The class owns raw char* name lists; a copy would free them twice.
    // Forbid copying (rule of three).
    Segment(const Segment&) = delete;
    Segment& operator=(const Segment&) = delete;
    // Run inference on `image`, filling `detections` and one full-image
    // binary mask per detection.
    void run(const cv::Mat& image, std::vector<Detection>& detections, std::vector<cv::Mat>& masks);
    // Render masks, boxes and labels onto a copy of `image`.
    cv::Mat draw(const cv::Mat& image, const std::vector<Detection>& detections, const std::vector<cv::Mat>& masks);
private:
    Ort::Env env;
    Ort::Session session;
    Ort::AllocatorWithDefaultOptions allocator;
    std::vector<char*> input_names;   // owned, strdup'ed in the constructor
    std::vector<char*> output_names;  // owned, strdup'ed in the constructor
    int input_h;                      // network input height (NCHW dim 2)
    int input_w;                      // network input width (NCHW dim 3)
    float conf_threshold;
    // Letterbox bookkeeping, refreshed by letterbox() on each run().
    int lb_top = 0;
    int lb_left = 0;
    float lb_scale = 1.0f;
    int img_h = 0;                    // original image size of the last run()
    int img_w = 0;
    void letterbox(const cv::Mat& src, cv::Mat& dst);
};
// Load the model and cache tensor names and the fixed input size.
Segment::Segment(const std::string& model_path, float conf_thres)
    : env(ORT_LOGGING_LEVEL_WARNING, "YoloSeg"),
    session(nullptr),
    conf_threshold(conf_thres)
{
    Ort::SessionOptions session_options;
    session_options.SetIntraOpNumThreads(1);
    session_options.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_ALL);
    // session_options.AppendExecutionProvider_CUDA(0); // uncomment if CUDA is available
    // On Windows the ORT session API takes a wide-character path.
    // NOTE(review): the begin/end conversion below is only correct for
    // ASCII paths — a non-ASCII model path would need a real UTF-8 to
    // UTF-16 conversion; confirm whether such paths can occur.
#ifdef _WIN32
    std::wstring wide_model_path = std::wstring(model_path.begin(), model_path.end());
    session = Ort::Session(env, wide_model_path.c_str(), session_options);
#else
    session = Ort::Session(env, model_path.c_str(), session_options);
#endif
    // Duplicate the tensor names out of ORT's allocated wrappers; the
    // class owns the copies and frees them in the destructor.
    size_t num_input_nodes = session.GetInputCount();
    for (size_t i = 0; i < num_input_nodes; i++) {
        // MY_STRDUP maps to _strdup on Windows, strdup elsewhere.
        input_names.push_back(MY_STRDUP(session.GetInputNameAllocated(i, allocator).get()));
    }
    size_t num_output_nodes = session.GetOutputCount();
    for (size_t i = 0; i < num_output_nodes; i++) {
        output_names.push_back(MY_STRDUP(session.GetOutputNameAllocated(i, allocator).get()));
    }
    // Read the fixed input size, assuming an NCHW [1, 3, H, W] input.
    auto input_shape = session.GetInputTypeInfo(0).GetTensorTypeAndShapeInfo().GetShape();
    input_h = input_shape[2];
    input_w = input_shape[3];
}
// Aspect-preserving resize into the network input size, padding the
// leftover space with 114-gray borders. Records the padding offsets and
// scale so results can be mapped back to the source image.
void Segment::letterbox(const cv::Mat& src, cv::Mat& dst) {
    const int src_h = src.rows;
    const int src_w = src.cols;
    this->img_h = src_h;
    this->img_w = src_w;
    // Uniform scale so the whole image fits inside the input rectangle.
    const float ratio = std::min((float)input_h / src_h, (float)input_w / src_w);
    this->lb_scale = ratio;
    const int scaled_w = (int)std::round(src_w * ratio);
    const int scaled_h = (int)std::round(src_h * ratio);
    // Split the leftover space into roughly symmetric borders; the -0.1
    // nudge pushes any odd pixel onto the bottom/right side.
    this->lb_left = (int)std::round((input_w - scaled_w) / 2.0 - 0.1);
    this->lb_top = (int)std::round((input_h - scaled_h) / 2.0 - 0.1);
    const int pad_right = input_w - scaled_w - this->lb_left;
    const int pad_bottom = input_h - scaled_h - this->lb_top;
    cv::Mat scaled = src;
    if (src_w != scaled_w || src_h != scaled_h) {
        cv::resize(src, scaled, cv::Size(scaled_w, scaled_h));
    }
    cv::copyMakeBorder(scaled, dst, lb_top, pad_bottom, lb_left, pad_right, cv::BORDER_CONSTANT, cv::Scalar(114, 114, 114));
}
// End-to-end inference: preprocess, run the session, decode boxes from
// output 0 and build one full-image binary mask per detection from the
// prototypes in output 1.
void Segment::run(const cv::Mat& image, std::vector<Detection>& detections, std::vector<cv::Mat>& masks) {
    detections.clear();
    masks.clear();
    // 1. Preprocess: letterbox, then NCHW float blob in [0, 1] with BGR->RGB swap.
    cv::Mat input_img;
    letterbox(image, input_img);
    cv::Mat blob;
    cv::dnn::blobFromImage(input_img, blob, 1.0 / 255.0, cv::Size(), cv::Scalar(), true, false);
    // 2. Inference. The tensor wraps the blob's buffer without copying, so
    // `blob` must stay alive until Run() returns.
    std::vector<int64_t> input_dims = { 1, 3, input_h, input_w };
    Ort::MemoryInfo memory_info = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault);
    Ort::Value input_tensor = Ort::Value::CreateTensor<float>(
        memory_info, blob.ptr<float>(), blob.total(), input_dims.data(), input_dims.size());
    // Run() expects const char* const* name arrays.
    auto outputs = session.Run(Ort::RunOptions{ nullptr },
        (const char* const*)input_names.data(), &input_tensor, 1,
        (const char* const*)output_names.data(), output_names.size());
    // 3. Decode output 0 (boxes).
    // Assumes an end-to-end (NMS-free) layout [1, N, 6 + 32]: each row is
    // x1, y1, x2, y2, score, class_id, then 32 mask coefficients — confirm
    // against the exported model.
    float* box_data = outputs[0].GetTensorMutableData<float>();
    auto box_shape = outputs[0].GetTensorTypeAndShapeInfo().GetShape();
    int num_dets = box_shape[1];
    int dim = box_shape[2]; // 38
    std::vector<std::vector<float>> mask_coeffs_list;
    for (int i = 0; i < num_dets; i++) {
        float* ptr = box_data + i * dim;
        float score = ptr[4];
        if (score < conf_threshold) continue;
        Detection det;
        // Raw corner coordinates (x1, y1, x2, y2) in letterbox space.
        float x1 = ptr[0];
        float y1 = ptr[1];
        float x2 = ptr[2];
        float y2 = ptr[3];
        det.class_id = (int)ptr[5];
        det.score = score;
        // Map back to the original image: remove padding, then undo scaling.
        x1 = (x1 - lb_left) / lb_scale;
        y1 = (y1 - lb_top) / lb_scale;
        x2 = (x2 - lb_left) / lb_scale;
        y2 = (y2 - lb_top) / lb_scale;
        det.box = cv::Rect((int)x1, (int)y1, (int)(x2 - x1), (int)(y2 - y1));
        // 32 mask coefficients start at index 6.
        det.mask_coeffs.assign(ptr + 6, ptr + 6 + 32);
        detections.push_back(det);
        mask_coeffs_list.push_back(det.mask_coeffs);
    }
    // 4. Decode output 1 (mask prototypes) and build per-detection masks.
    if (detections.empty()) return;
    // Prototype shape is assumed [1, 32, 160, 160] — verify for the model in use.
    float* proto_data = outputs[1].GetTensorMutableData<float>();
    auto proto_shape = outputs[1].GetTensorTypeAndShapeInfo().GetShape();
    int mask_c = proto_shape[1]; // 32
    int mask_h = proto_shape[2]; // 160
    int mask_w = proto_shape[3]; // 160
    // View the prototypes as a [32, 160*160] matrix (wraps proto_data, no copy).
    cv::Mat protos(mask_c, mask_h * mask_w, CV_32F, proto_data);
    // Stack all detection coefficients into an [N, 32] matrix.
    cv::Mat coeffs(detections.size(), mask_c, CV_32F);
    for (size_t i = 0; i < detections.size(); ++i) {
        memcpy(coeffs.ptr<float>(i), mask_coeffs_list[i].data(), mask_c * sizeof(float));
    }
    // Matrix product: [N, 32] * [32, 25600] = [N, 25600].
    cv::Mat mask_result_flat = coeffs * protos;
    // Scale factors from mask resolution to letterbox (network) resolution.
    float mask2letter_x = (float)input_w / mask_w;
    float mask2letter_y = (float)input_h / mask_h;
    // Blur kernel sized to the mask->image upscale factor, clamped to >= 1.
    int blur_w = (int)(img_w / (mask_w * lb_scale));
    int blur_h = (int)(img_h / (mask_h * lb_scale));
    if (blur_w < 1) blur_w = 1;
    if (blur_h < 1) blur_h = 1;
    for (size_t i = 0; i < detections.size(); ++i) {
        // One detection's flat mask -> prototype-resolution 2D mask.
        cv::Mat single_mask = mask_result_flat.row(i).reshape(1, mask_h); // 160x160
        // Sigmoid activation.
        cv::Mat sigmoid_mask;
        cv::exp(-single_mask, sigmoid_mask);
        sigmoid_mask = 1.0 / (1.0 + sigmoid_mask);
        // Box corners in original-image coordinates.
        int x1 = detections[i].box.x;
        int y1 = detections[i].box.y;
        int x2 = x1 + detections[i].box.width;
        int y2 = y1 + detections[i].box.height;
        // Map back into letterbox coordinates...
        int letter_x1 = (int)(x1 * lb_scale + lb_left);
        int letter_y1 = (int)(y1 * lb_scale + lb_top);
        int letter_x2 = (int)(x2 * lb_scale + lb_left);
        int letter_y2 = (int)(y2 * lb_scale + lb_top);
        // ...and down to mask coordinates (160x160).
        int scale_x1 = std::max(0, (int)(letter_x1 / mask2letter_x));
        int scale_y1 = std::max(0, (int)(letter_y1 / mask2letter_y));
        int scale_x2 = std::min(mask_w, (int)(letter_x2 / mask2letter_x));
        int scale_y2 = std::min(mask_h, (int)(letter_y2 / mask2letter_y));
        if (scale_x2 <= scale_x1 || scale_y2 <= scale_y1) {
            masks.push_back(cv::Mat::zeros(img_h, img_w, CV_8UC1)); // empty mask placeholder
            continue;
        }
        // Crop the low-res mask to the box region.
        cv::Rect crop_rect(scale_x1, scale_y1, scale_x2 - scale_x1, scale_y2 - scale_y1);
        cv::Mat crop_mask = sigmoid_mask(crop_rect);
        // Resize to the object's size in the original image.
        int target_w = x2 - x1;
        int target_h = y2 - y1;
        if (target_w <= 0 || target_h <= 0) {
            masks.push_back(cv::Mat::zeros(img_h, img_w, CV_8UC1));
            continue;
        }
        cv::Mat resized_mask;
        cv::resize(crop_mask, resized_mask, cv::Size(target_w, target_h), 0, 0, cv::INTER_CUBIC);
        // Smooth the upscaled mask edges.
        cv::blur(resized_mask, resized_mask, cv::Size(blur_w, blur_h));
        // Binarize at 0.5.
        cv::Mat binary_mask;
        cv::compare(resized_mask, 0.5, binary_mask, cv::CMP_GT);
        // Paste into a full-image-sized mask.
        cv::Mat full_mask = cv::Mat::zeros(img_h, img_w, CV_8UC1);
        // Clamp the destination ROI to the image bounds.
        x1 = std::max(0, x1); y1 = std::max(0, y1);
        x2 = std::min(img_w, x2); y2 = std::min(img_h, y2);
        int roi_w = std::min(x2 - x1, resized_mask.cols);
        int roi_h = std::min(y2 - y1, resized_mask.rows);
        if (roi_w > 0 && roi_h > 0) {
            cv::Rect roi_rect(x1, y1, roi_w, roi_h);
            cv::Rect mask_rect(0, 0, roi_w, roi_h);
            binary_mask(mask_rect).copyTo(full_mask(roi_rect));
        }
        masks.push_back(full_mask);
    }
}
// Render the masks (alpha-blended) then the boxes and labels on a copy of the image.
cv::Mat Segment::draw(const cv::Mat& image, const std::vector<Detection>& detections, const std::vector<cv::Mat>& masks) {
    cv::Mat canvas = image.clone();
    const float mask_alpha = 0.5;
    // Pass 1: blend each detection's mask region with its class color.
    for (size_t i = 0; i < detections.size(); ++i) {
        if (masks[i].empty()) continue;
        const cv::Scalar color = colors[detections[i].class_id % colors.size()];
        // Solid color plane, blended with the current canvas...
        cv::Mat color_plane(image.size(), image.type(), color);
        cv::Mat blended;
        cv::addWeighted(canvas, 1.0 - mask_alpha, color_plane, mask_alpha, 0, blended);
        // ...then copied back only where the mask is set.
        blended.copyTo(canvas, masks[i]);
    }
    // Pass 2: rectangles and labels on top of the blended result.
    for (const auto& det : detections) {
        const cv::Scalar color = colors[det.class_id % colors.size()];
        // Clip the box to the image bounds.
        int x1 = std::max(0, det.box.x);
        int y1 = std::max(0, det.box.y);
        int x2 = std::min(image.cols, det.box.x + det.box.width);
        int y2 = std::min(image.rows, det.box.y + det.box.height);
        cv::rectangle(canvas, cv::Point(x1, y1), cv::Point(x2, y2), color, 2);
        std::string label = class_names[det.class_id] + " " + std::to_string((int)(det.score * 100)) + "%";
        int baseLine;
        cv::Size labelSize = cv::getTextSize(label, cv::FONT_HERSHEY_SIMPLEX, 0.6, 1, &baseLine);
        // Keep the label inside the image when the box touches the top edge.
        int top = std::max(y1, labelSize.height);
        cv::rectangle(canvas, cv::Point(x1, top - labelSize.height),
            cv::Point(x1 + labelSize.width, top + baseLine), color, cv::FILLED);
        cv::putText(canvas, label, cv::Point(x1, top), cv::FONT_HERSHEY_SIMPLEX, 0.6, cv::Scalar(255, 255, 255), 1);
    }
    return canvas;
}
// Entry point: argv[1] overrides the model path, argv[2] the image path.
int main(int argc, char** argv) {
    std::string model_path = (argc > 1) ? argv[1] : "yolo26n-seg.onnx";
    std::string img_path = (argc > 2) ? argv[2] : "bus.jpg";
    try {
        std::cout << "Loading model: " << model_path << std::endl;
        Segment segmentor(model_path, 0.5); // confidence threshold
        cv::Mat img = cv::imread(img_path);
        if (img.empty()) {
            std::cerr << "Error: Could not read image " << img_path << std::endl;
            return -1;
        }
        std::vector<Detection> detections;
        std::vector<cv::Mat> masks;
        std::cout << "Running inference..." << std::endl;
        segmentor.run(img, detections, masks);
        std::cout << "Drawing results..." << std::endl;
        cv::Mat result_img = segmentor.draw(img, detections, masks);
        cv::imwrite("output.jpg", result_img);
        std::cout << "Inference finished. Saved to output.jpg" << std::endl;
    }
    catch (const std::exception& e) {
        std::cerr << "Exception: " << e.what() << std::endl;
        return -1;
    }
    return 0;
}