YOLO object detection with a thread pool: high-throughput TensorRT video inference (1000+ FPS)

Implementation of a YOLOv5 TensorRT inference framework

logger.h

cpp
#ifndef LOGGER_H
#define LOGGER_H

#include <NvInfer.h>
#include <NvInferRuntime.h>


inline const char* severity_string(nvinfer1::ILogger::Severity t)
{
	switch (t)
	{
	case nvinfer1::ILogger::Severity::kINTERNAL_ERROR: return "internal_error";
	case nvinfer1::ILogger::Severity::kERROR:   return "error";
	case nvinfer1::ILogger::Severity::kWARNING: return "warning";
	case nvinfer1::ILogger::Severity::kINFO:    return "info";
	case nvinfer1::ILogger::Severity::kVERBOSE: return "verbose";
	default: return "unknown";
	}
}


class TRTLogger : public nvinfer1::ILogger
{
public:
	virtual void log(Severity severity, nvinfer1::AsciiChar const* msg) noexcept override
	{
		if (severity <= Severity::kINFO)
		{
			if (severity == Severity::kWARNING)
				printf("\033[33m%s: %s\033[0m\n", severity_string(severity), msg);
			else if (severity <= Severity::kERROR)
				printf("\033[31m%s: %s\033[0m\n", severity_string(severity), msg);
			else
				printf("%s: %s\n", severity_string(severity), msg);
		}
	}
};

static TRTLogger logger; // per-translation-unit logger instance; a plain global defined in a header would cause duplicate symbols if the header were included in more than one .cpp

#endif // LOGGER_H

preprocess.h

cpp
#ifndef PREPROCESS_H
#define PREPROCESS_H

#include <opencv2/opencv.hpp>


void LetterBox(const cv::Mat& image, cv::Mat& outImage, const cv::Size& newShape = cv::Size(640, 640), const cv::Scalar& color = cv::Scalar(114, 114, 114));

#endif // PREPROCESS_H

preprocess.cpp

cpp
#include "preprocess.h"


void LetterBox(const cv::Mat& image, cv::Mat& outImage, const cv::Size& newShape, const cv::Scalar& color)
{
	cv::Size shape = image.size();
	float r = std::min((float)newShape.height / (float)shape.height, (float)newShape.width / (float)shape.width);
	float ratio[2]{ r, r };
	int new_un_pad[2] = { (int)std::round((float)shape.width * r),(int)std::round((float)shape.height * r) };

	auto dw = (float)(newShape.width - new_un_pad[0]) / 2;
	auto dh = (float)(newShape.height - new_un_pad[1]) / 2;

	if (shape.width != new_un_pad[0] || shape.height != new_un_pad[1]) // resize if either dimension differs (matches the reference letterbox check)
		cv::resize(image, outImage, cv::Size(new_un_pad[0], new_un_pad[1]));
	else
		outImage = image.clone();

	int top = int(std::round(dh - 0.1f));
	int bottom = int(std::round(dh + 0.1f));
	int left = int(std::round(dw - 0.1f));
	int right = int(std::round(dw + 0.1f));

	// letterbox transform parameters (scale and left/top padding offsets); computed for reference but not returned by this version
	cv::Vec4d params;
	params[0] = ratio[0];
	params[1] = ratio[1];
	params[2] = left;
	params[3] = top;

	cv::copyMakeBorder(outImage, outImage, top, bottom, left, right, cv::BORDER_CONSTANT, color);
}
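
A quick sanity check of the letterbox math (a hypothetical usage sketch, not part of the original project): for a 1280×720 frame and a 640×640 target, r = min(640/720, 640/1280) = 0.5, so the frame is resized to 640×360 and padded with 140 gray rows on the top and bottom.

cpp
#include <iostream>
#include <opencv2/opencv.hpp>
#include "preprocess.h"

int main()
{
    // hypothetical 1280x720 frame filled with a solid colour
    cv::Mat frame(720, 1280, CV_8UC3, cv::Scalar(0, 0, 255));
    cv::Mat boxed;
    LetterBox(frame, boxed, cv::Size(640, 640));
    std::cout << boxed.size() << std::endl; // prints [640 x 640]; the image content occupies rows 140..499
    return 0;
}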

yolov5.h

cpp
#ifndef YOLOV5_H
#define YOLOV5_H

#include <iostream>
#include <fstream>
#include <sstream>
#include <vector>
#include <ctime>
#include <cuda_runtime.h>
#include <NvInfer.h>
#include <NvInferRuntime.h>
#include <opencv2/opencv.hpp>

#include "preprocess.h"
#include "postprocess.h"

//#define USE_CUDA

#ifdef USE_CUDA
    #include "preprocess.cuh"
    #include "decode.cuh"
#endif


class Yolov5
{
public:
    Yolov5();
    ~Yolov5();

    int load_model(const std::string& model_path);                        // load the serialized TensorRT engine
    int infer(const cv::Mat &image, std::vector<Detection> &detections); // run inference on one image

private:
    int pre_process(const cv::Mat &image);   // image preprocessing
    int post_process(const cv::Mat &image, std::vector<Detection>& detections); // postprocessing

    const cv::Size input_size = cv::Size(640, 640);
    const int input_numel = 1 * 3 * input_size.width * input_size.height;
    const float confidence_threshold = 0.5;
    const float score_threshold = 0.25;
    const float nms_threshold = 0.45;
    const int class_num = 80;
    const int output_numprob = 5 + class_num; // 4 box coordinates + objectness + 80 class scores = 85
    // 3 anchors per cell at strides 8/16/32: 3 * (80*80 + 40*40 + 20*20) = 25200 candidate boxes for a 640x640 input
    const int output_numbox = 3 * (input_size.width / 8 * input_size.height / 8 + input_size.width / 16 * input_size.height / 16 + input_size.width / 32 * input_size.height / 32);
    const int output_numel = 1 * output_numprob * output_numbox;

    nvinfer1::IRuntime* runtime = nullptr;
    nvinfer1::ICudaEngine* engine = nullptr;
    nvinfer1::IExecutionContext* execution_context = nullptr;
    cudaStream_t stream = nullptr;
    float* input_h = nullptr;
    float* output_h = nullptr;
    float* input_d = nullptr;
    float* output_d = nullptr;
    float* bindings[2]; 

#ifdef USE_CUDA
	uint8_t* input_host;
	float* d2s_host;
   	float* d2s_device; 
    float* s2d_host;
    float* s2d_device;
    float* output_box_host;
    float* output_box_device;
    const int max_box = 1024;
   	const int nubox_element = 7; 
    const int max_input_size = sizeof(float) * 3 * 1024 * 1024;
#endif
};

#endif // YOLOV5_H

yolov5.cpp

cpp
#include "yolov5.h"
#include "logger.h"


Yolov5::Yolov5()
{
	runtime = nvinfer1::createInferRuntime(logger);

	cudaMallocHost(&input_h, sizeof(float) * input_numel);
    cudaMallocHost(&output_h, sizeof(float) * output_numel);

	cudaMalloc(&input_d, sizeof(float) * input_numel);
	cudaMalloc(&output_d, sizeof(float) * output_numel);

	bindings[0] = input_d;
	bindings[1] = output_d;

#ifdef USE_CUDA
	cudaMallocHost(&input_host, max_input_size);
	cudaMallocHost(&d2s_host, sizeof(float) * 6);
	cudaMalloc(&d2s_device, sizeof(float) * 6);
	cudaMallocHost(&s2d_host, sizeof(float) * 6);
	cudaMalloc(&s2d_device, sizeof(float) * 6);
	cudaMallocHost(&output_box_host, sizeof(float) * (nubox_element * max_box + 1));
	cudaMalloc(&output_box_device, sizeof(float) * (nubox_element * max_box + 1));
#endif
}

Yolov5::~Yolov5()
{
	cudaStreamDestroy(stream);
	cudaFree(input_d);
	cudaFree(output_d);
	cudaFreeHost(input_h);
	cudaFreeHost(output_h);
	// release TensorRT objects (TensorRT 8+ uses plain delete instead of destroy())
	delete execution_context;
	delete engine;
	delete runtime;
}


int Yolov5::load_model(const std::string& model_path)
{
    std::ifstream in(model_path, std::ios::binary);
    if (!in.is_open()) 
		return -1;

    in.seekg(0, std::ios::end);
    size_t size = in.tellg();
    std::vector<unsigned char> engine_data(size);
    in.seekg(0, std::ios::beg);
    in.read((char*)engine_data.data(), size);
    in.close();

    engine = runtime->deserializeCudaEngine(engine_data.data(), engine_data.size());
	if(engine == nullptr) 
		return -1;

    execution_context = engine->createExecutionContext();
	if(execution_context == nullptr)
		return -1;

    cudaStreamCreate(&stream);
    return 0;
}


int Yolov5::pre_process(const cv::Mat &image)
{
#ifdef USE_CUDA
	cudaMemcpyAsync(input_host, image.data, sizeof(uint8_t) * 3 * image.cols * image.rows, cudaMemcpyHostToDevice, stream);
	preprocess_kernel_img(input_host, image.cols, image.rows, input_d, input_size.width, input_size.height, d2s_host, s2d_host, stream);
	cudaMemcpyAsync(d2s_device, d2s_host, sizeof(float) * 6, cudaMemcpyHostToDevice, stream);
	cudaMemcpyAsync(s2d_device, s2d_host, sizeof(float) * 6, cudaMemcpyHostToDevice, stream);
#else
	cv::Mat letterbox;
	LetterBox(image, letterbox, input_size);
	//cv::resize(image, letterbox, input_size);
	letterbox.convertTo(letterbox, CV_32FC3, 1.0f / 255.0f);

	// HWC BGR -> CHW RGB: split the interleaved pixels into three planar channels
	int image_area = letterbox.cols * letterbox.rows;
	float* pimage = (float*)letterbox.data;
	float* phost_b = input_h + image_area * 0;
	float* phost_g = input_h + image_area * 1;
	float* phost_r = input_h + image_area * 2;
	for (int i = 0; i < image_area; ++i, pimage += 3)
	{
		*phost_r++ = pimage[0];
		*phost_g++ = pimage[1];
		*phost_b++ = pimage[2];
	}

	cudaMemcpyAsync(input_d, input_h, sizeof(float) * input_numel, cudaMemcpyHostToDevice, stream);
#endif
    return 0;
}


int Yolov5::infer(const cv::Mat& image, std::vector<Detection>& detections)
{
    pre_process(image);

	bool success = execution_context->executeV2((void**)bindings);
	if(!success)
	{
	    std::cerr << "Failed to run inference" << std::endl;
	    return -1;
	}

#ifndef USE_CUDA
	cudaMemcpyAsync(output_h, output_d, sizeof(float) * output_numel, cudaMemcpyDeviceToHost, stream);
    cudaStreamSynchronize(stream);
#endif

    post_process(image, detections);
    return 0;
}


int Yolov5::post_process(const cv::Mat &image,  std::vector<Detection>& detections)
{
    std::vector<cv::Rect> boxes;
	std::vector<float> scores;
	std::vector<int> class_ids;

#ifdef USE_CUDA
	cudaMemset(output_box_device, 0, sizeof(float) * (nubox_element * max_box + 1));	
	decode_kernel_invoker(output_d, output_numbox, class_num, score_threshold, d2s_device, output_box_device, max_box, nubox_element, stream);
	nms_kernel_invoker(output_box_device, nms_threshold, max_box, nubox_element, stream);
	cudaMemcpyAsync(output_box_host, output_box_device, sizeof(float) * (nubox_element * max_box + 1), cudaMemcpyDeviceToHost, stream);
	cudaStreamSynchronize(stream);

	for (size_t i = 0; i < max_box; i++)
	{
		if (output_box_host[7 * i + 7])
		{
			float x1 = output_box_host[7 * i + 1];
			float y1 = output_box_host[7 * i + 2];
			float x2 = output_box_host[7 * i + 3];
			float y2 = output_box_host[7 * i + 4];
			boxes.push_back(cv::Rect(x1, y1, x2-x1, y2-y1));
			scores.push_back(output_box_host[7 * i + 5]);
			class_ids.push_back(output_box_host[7 * i + 6]);
		}
	}

	detections.clear();
	detections.resize(boxes.size());
	for (int i = 0; i < boxes.size(); i++)
	{
		detections[i].bbox = boxes[i];
		detections[i].score = scores[i];
		detections[i].id = class_ids[i];
	}

#else
	// float x_ratio = float(image.cols) / input_size.width;
	// float y_ratio = float(image.rows) / input_size.height;
	for (int i = 0; i < output_numbox; ++i)
	{
		float* ptr = output_h + i * output_numprob;
		float obj_score = ptr[4];
		if (obj_score < confidence_threshold)
			continue;

		float* classes_scores = 5 + ptr;
		int class_id = std::max_element(classes_scores, classes_scores + class_num) - classes_scores;
		float score = classes_scores[class_id] * obj_score;
		if (score < score_threshold)
			continue;

		float x = ptr[0];
		float y = ptr[1];
		float w = ptr[2];
		float h = ptr[3];
		int left = int(x - 0.5 * w);
		int top = int(y - 0.5 * h);
		int width = int(w);
		int height = int(h);

		cv::Rect box = cv::Rect(left, top, width, height);
		scale_boxes(box, input_size, image.size());
		boxes.push_back(box);
		scores.push_back(score);
		class_ids.push_back(class_id);
	}

	std::vector<int> indices;
	nms(boxes, scores, score_threshold, nms_threshold, indices);
	
	detections.clear();
	detections.resize(indices.size());
	for (int i = 0; i < indices.size(); ++i)
	{
	    int idx = indices[i];
		detections[i].bbox = boxes[idx];
		detections[i].score = scores[idx];
		detections[i].id = class_ids[idx];
	}
#endif

    return 0;
}
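
The post does not list postprocess.h / postprocess.cpp (they are in the repository linked at the end). A minimal header sketch of the interface yolov5.cpp relies on — struct Detection plus scale_boxes, nms and draw_detections — could look like the following; the signatures are inferred from the call sites above, so treat the repository files as authoritative.

cpp
#ifndef POSTPROCESS_H
#define POSTPROCESS_H

#include <vector>
#include <opencv2/opencv.hpp>

// One detected object: box in original-image coordinates, confidence score and class id
struct Detection
{
    cv::Rect bbox;
    float score;
    int id;
};

// Map a box from the 640x640 letterboxed input back to the original image size
void scale_boxes(cv::Rect& box, const cv::Size& input_size, const cv::Size& image_size);

// Non-maximum suppression; the indices of the kept boxes are written to `indices`
void nms(const std::vector<cv::Rect>& boxes, const std::vector<float>& scores,
         float score_threshold, float nms_threshold, std::vector<int>& indices);

// Draw boxes, class ids and scores onto the image
void draw_detections(cv::Mat& image, const std::vector<Detection>& detections);

#endif // POSTPROCESS_H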

Single-threaded version

test_yolov5.cpp

cpp
#include "yolov5.h"

int main()
{
    Yolov5* yolov5 = new Yolov5();
    yolov5->load_model("yolov5n_int8.engine");

    cv::Mat image = cv::imread("bus.jpg");
    std::vector<Detection> detections;
    yolov5->infer(image, detections);
    std::cout << "detections size: " << detections.size() << std::endl;
    
    clock_t start = clock();
    for (int i = 0; i < 1000; i++)
    {
        cv::Mat image = cv::imread("bus.jpg");
        yolov5->infer(image, detections);
    }
    clock_t end = clock();
    std::cout << "time: " << (double)(end - start) / CLOCKS_PER_SEC << "s" << std::endl; // note: the measured time also includes cv::imread for every frame

    draw_detections(image, detections);
    cv::imwrite("result.jpg", image);

    delete yolov5;
    return 0;
}

Running ./demo prints:

bash
info: Loaded engine size: 4 MiB
info: [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +0, GPU +15, now: CPU 0, GPU 17 (MiB)
detections size: 3
time: 24.9972s

Thread pool version

The single-threaded demo above manages only about 40 FPS (1000 frames in roughly 25 s, including image loading), so this version runs one Yolov5 instance per worker thread to push throughput into the 1000-FPS range.

yolov5_thread_pool.h

cpp
#ifndef YOLOV5_THREAD_POOL_H
#define YOLOV5_THREAD_POOL_H

#include "yolov5.h"

#include <iostream>
#include <vector>
#include <queue>
#include <map>
#include <thread>
#include <mutex>
#include <condition_variable>

class Yolov5ThreadPool
{
private:
    std::queue<std::pair<int, cv::Mat>> tasks;             // <id, img> pending tasks
    std::vector<std::shared_ptr<Yolov5>> yolov5_instances; // one model instance per worker thread
    std::map<int, std::vector<Detection>> results;         // <id, detections> finished results (boxes)
    std::map<int, cv::Mat> img_results;                    // <id, img> finished results (annotated images)
    std::vector<std::thread> threads;                      // worker threads
    std::mutex mtx1;
    std::mutex mtx2;
    std::condition_variable cv_task, cv_result;
    bool stop;

    void worker(int id);

public:
    Yolov5ThreadPool();
    ~Yolov5ThreadPool();

    int setUp(const std::string &model_path, int num_threads = 12);     // initialize: load the model and spawn the worker threads
    int submitTask(const cv::Mat &img, int id);                         // submit a task
    int getTargetResult(std::vector<Detection> &objects, int id);       // fetch the detections for a frame id
    int getTargetImgResult(cv::Mat &img, int id);                       // fetch the annotated image for a frame id
    void stopAll();                                                     // stop all threads
};

#endif //YOLOV5_THREAD_POOL_H

yolov5_thread_pool.cpp

cpp
#include "yolov5_thread_pool.h"

// constructor
Yolov5ThreadPool::Yolov5ThreadPool() { stop = false; }

// destructor
Yolov5ThreadPool::~Yolov5ThreadPool()
{
    // stop all threads
    stop = true;
    cv_task.notify_all();
    for (auto &thread : threads)
    {
        if (thread.joinable())
        {
            thread.join();
        }
    }
}

// initialization: load the model and create the threads; parameters: model path, number of threads
int Yolov5ThreadPool::setUp(const std::string &model_path, int num_threads)
{
    // create one model instance per thread and store them in the vector
    // every instance deserializes the same engine file
    for (size_t i = 0; i < num_threads; ++i)
    {
        std::shared_ptr<Yolov5> yolov5 = std::make_shared<Yolov5>();
        yolov5->load_model(model_path.c_str());
        yolov5_instances.push_back(yolov5);
    }
    // create the worker threads
    for (size_t i = 0; i < num_threads; ++i)
    {
        threads.emplace_back(&Yolov5ThreadPool::worker, this, i);
    }
    return 0;
}


// worker thread function; parameter: thread id
void Yolov5ThreadPool::worker(int id)
{
    while (!stop)
    {
        std::pair<int, cv::Mat> task;
        std::shared_ptr<Yolov5> instance = yolov5_instances[id]; // this worker's dedicated model instance
        {
            // take a task from the queue
            std::unique_lock<std::mutex> lock(mtx1);
            cv_task.wait(lock, [&] { return !tasks.empty() || stop; });
            if (stop)
                return;

            task = tasks.front();
            tasks.pop();
        }
        // run inference
        std::vector<Detection> detections;
        instance->infer(task.second, detections);

        {
            // store the result
            std::lock_guard<std::mutex> lock(mtx2);
            results.insert({task.first, detections});
            draw_detections(task.second, detections);
            img_results.insert({task.first, task.second});
            cv_result.notify_one();
        }
    }
}


// submit a task; parameters: image, id (frame number)
int Yolov5ThreadPool::submitTask(const cv::Mat &img, int id)
{
    // back-pressure: if more than 1000 tasks are queued, wait so that memory use stays bounded
    while (tasks.size() > 1000)
    {
        std::this_thread::sleep_for(std::chrono::milliseconds(1));
    }

    {
        // enqueue the task
        std::lock_guard<std::mutex> lock(mtx1);
        tasks.push({id, img});
    }
    cv_task.notify_one();
    return 0;
}

// fetch the detections for a given frame id; blocks until that frame has been processed
int Yolov5ThreadPool::getTargetResult(std::vector<Detection> &objects, int id)
{
    // wait until the result for this id is available
    while (results.find(id) == results.end())
    {
        std::this_thread::sleep_for(std::chrono::milliseconds(1));
    }
    std::lock_guard<std::mutex> lock(mtx2);
    objects = results[id];
    // remove from map
    results.erase(id);
    return 0;
}

// fetch the annotated image for a given frame id; requesting ids in ascending order returns frames in order even when workers finish out of order
int Yolov5ThreadPool::getTargetImgResult(cv::Mat &img, int id)
{
    int loop_cnt = 0;
    // wait until the result is available
    while (img_results.find(id) == img_results.end())
    {
        // poll every millisecond, give up after about one second
        std::this_thread::sleep_for(std::chrono::milliseconds(1));
        loop_cnt++;
        if (loop_cnt > 1000)
        {
            std::cerr << "getTargetImgResult timeout" << std::endl;
            return -1;
        }
    }
    std::lock_guard<std::mutex> lock(mtx2);
    img = img_results[id];
    // remove from map
    img_results.erase(id);

    return 0;
}

void Yolov5ThreadPool::stopAll()
{
    stop = true;
    cv_task.notify_all();
}

test_yolov5_thread_pool.cpp

cpp
#include "yolov5_thread_pool.h"
#include <chrono>


static int g_frame_start_id = 0; // index of the next frame read from the video
static int g_frame_end_id = 0;   // index of the next frame to collect from the pool
static Yolov5ThreadPool *g_pool = nullptr;
bool end = false;                // set to true once the whole video has been read


void read_stream(const std::string& video_file)
{
    cv::VideoCapture cap(video_file);
    if (!cap.isOpened())
        return;

    cv::Mat img;
    while (true)
    {
        cap >> img;
        if (img.empty())
        {
            end = true;
            break;
        }
        g_pool->submitTask(img.clone(), g_frame_start_id++);
    }
    cap.release();
}


void get_results()
{
    auto start_all = std::chrono::high_resolution_clock::now();
    int frame_count = 0;

    //cv::VideoWriter writer = cv::VideoWriter("result.mp4", cv::VideoWriter::fourcc('m', 'p', '4', 'v'), 30, cv::Size(1280, 720));
    while (true)
    {
        cv::Mat img;
        auto ret = g_pool->getTargetImgResult(img, g_frame_end_id++);
        if (end)
        {
            g_pool->stopAll();
            break;
        }
        //cv::imwrite("output/" + std::to_string(g_frame_end_id) + ".jpg", img);
        //writer << img;

        frame_count++;
        auto end_all = std::chrono::high_resolution_clock::now();
        auto elapsed_all_2 = std::chrono::duration_cast<std::chrono::microseconds>(end_all - start_all).count() / 1000.f;
        if (elapsed_all_2 >= 1000)
        {
            printf("FPS:%f \n", frame_count / (elapsed_all_2 / 1000.0f));
            frame_count = 0;
            start_all = std::chrono::high_resolution_clock::now();
        }
    }
    g_pool->stopAll();
}


int main(int argc, char **argv)
{
    if (argc < 3)
    {
        std::cerr << "usage: " << argv[0] << " <engine_path> <num_threads>" << std::endl;
        return -1;
    }
    g_pool = new Yolov5ThreadPool();
    g_pool->setUp(argv[1], atoi(argv[2]));

    std::thread read_stream_thread(read_stream, "bj_full.mp4");
    std::thread result_thread(get_results);

    read_stream_thread.join();
    result_thread.join();

    delete g_pool;
    return 0;
}

CMakeLists.txt

cmake
cmake_minimum_required(VERSION 3.20)
project(trt_inference LANGUAGES C CXX CUDA)
set(CMAKE_CXX_STANDARD 14)

find_package(CUDA REQUIRED)
include_directories(${CUDA_INCLUDE_DIRS})

set(OpenCV_DIR /home/tfy/document/HybrIK/cpp/opencv-4.12.0/lib/cmake/opencv4)
find_package(OpenCV REQUIRED)

set(TENSORRT_INCLUDE_DIRS /home/tfy/docker_share/TensorRT-10.6.0.26/include)
set(TENSORRT_LIBRARY_DIRS /home/tfy/docker_share/TensorRT-10.6.0.26/targets/x86_64-linux-gnu/lib)
include_directories(${TENSORRT_INCLUDE_DIRS} ${OpenCV_INCLUDE_DIRS})
link_directories(${TENSORRT_LIBRARY_DIRS})

add_executable(demo test_yolov5.cpp preprocess.cpp postprocess.cpp yolov5.cpp)
add_executable(thread_pool test_yolov5_thread_pool.cpp preprocess.cpp postprocess.cpp yolov5.cpp yolov5_thread_pool.cpp)
target_link_libraries(demo PRIVATE ${OpenCV_LIBS} ${CUDA_LIBRARIES} ${TENSORRT_LIBRARY_DIRS}/libnvinfer.so)
target_link_libraries(thread_pool PRIVATE ${OpenCV_LIBS} ${CUDA_LIBRARIES} ${TENSORRT_LIBRARY_DIRS}/libnvinfer.so)
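
Assuming the hard-coded OpenCV and TensorRT paths above match your environment, both executables then build with the standard CMake workflow (a sketch, not shown in the original post):

bash
cmake -S . -B build
cmake --build build -j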

Measured performance when running ./thread_pool yolov5n_int8.engine 16 (test hardware: RTX 4090 GPU, 24-core CPU):

Output:

bash
info: Loaded engine size: 4 MiB
info: [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +0, GPU +15, now: CPU 0, GPU 17 (MiB)
info: Loaded engine size: 4 MiB
info: [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +1, GPU +15, now: CPU 1, GPU 35 (MiB)
info: Loaded engine size: 4 MiB
info: [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +0, GPU +14, now: CPU 1, GPU 52 (MiB)
info: Loaded engine size: 4 MiB
info: [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +1, GPU +15, now: CPU 2, GPU 70 (MiB)
info: Loaded engine size: 4 MiB
info: [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +0, GPU +15, now: CPU 2, GPU 88 (MiB)
info: Loaded engine size: 4 MiB
info: [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +1, GPU +14, now: CPU 3, GPU 105 (MiB)
info: Loaded engine size: 4 MiB
info: [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +0, GPU +15, now: CPU 3, GPU 123 (MiB)
info: Loaded engine size: 4 MiB
info: [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +1, GPU +15, now: CPU 4, GPU 141 (MiB)
info: Loaded engine size: 4 MiB
info: [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +0, GPU +14, now: CPU 4, GPU 158 (MiB)
info: Loaded engine size: 4 MiB
info: [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +1, GPU +15, now: CPU 5, GPU 176 (MiB)
info: Loaded engine size: 4 MiB
info: [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +0, GPU +15, now: CPU 5, GPU 194 (MiB)
info: Loaded engine size: 4 MiB
info: [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +1, GPU +15, now: CPU 6, GPU 211 (MiB)
info: Loaded engine size: 4 MiB
info: [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +0, GPU +15, now: CPU 6, GPU 229 (MiB)
info: Loaded engine size: 4 MiB
info: [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +1, GPU +15, now: CPU 7, GPU 247 (MiB)
info: Loaded engine size: 4 MiB
info: [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +0, GPU +15, now: CPU 7, GPU 264 (MiB)
info: Loaded engine size: 4 MiB
info: [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +1, GPU +15, now: CPU 8, GPU 282 (MiB)
FPS:731.412842 
FPS:713.332947 
FPS:797.625244 
FPS:850.723083 
FPS:759.826111 
FPS:790.603821 
FPS:893.340698 
FPS:876.184570 
FPS:849.282715 
FPS:823.478210 
FPS:818.882874 
FPS:839.987427 
FPS:841.113464 
FPS:846.878052 
FPS:853.196289 
FPS:881.448181 
FPS:867.657776 
FPS:909.925354 
FPS:884.442505 
FPS:975.532715 
FPS:984.539246 
FPS:1001.592773 
FPS:951.560303 
FPS:965.459229 
FPS:963.278687 
FPS:916.945496 
FPS:954.125122 
FPS:965.415710 
FPS:1004.943909 
FPS:919.196472 
FPS:942.398926 
FPS:911.690796 
FPS:994.875244 
FPS:895.988342 

Video inference screenshot:

The complete project is available at: https://github.com/taifyang/yolov5-tensorrt-threadpool
