yolo目标检测线程池高性能视频tensorrt推理（每秒1000+帧）

yolov5-tensorrt推理框架实现

logger.h

cpp 复制代码

#ifndef LOGGER_H
#define LOGGER_H

#include <NvInfer.h>
#include <NvInferRuntime.h>


/// Returns the lowercase label that is printed in front of a TensorRT log message.
/// @param t  severity reported by TensorRT
/// @return   a static string; "unknow" for any value that is not one of the five handled severities
inline const char* severity_string(nvinfer1::ILogger::Severity t)
{
	using Severity = nvinfer1::ILogger::Severity;

	if (t == Severity::kINTERNAL_ERROR) return "internal_error";
	if (t == Severity::kERROR)          return "error";
	if (t == Severity::kWARNING)        return "warning";
	if (t == Severity::kINFO)           return "info";
	if (t == Severity::kVERBOSE)        return "verbose";
	return "unknow";
}


/// Console logger handed to TensorRT.
/// Messages less severe than kINFO are dropped; warnings are printed in yellow,
/// errors and internal errors in red, everything else uncoloured.
/// NOTE: the `logger` instance is defined right here in the header, so this
/// header is meant to be included from a single translation unit.
class TRTLogger : public nvinfer1::ILogger
{
public:
	void log(Severity severity, nvinfer1::AsciiChar const* msg) noexcept override
	{
		// The comparisons below rely on the enum being ordered from most severe to least severe.
		if (severity > Severity::kINFO)
			return;

		const char* label = severity_string(severity);

		if (severity == Severity::kWARNING)
		{
			printf("\033[33m%s: %s\033[0m\n", label, msg);
			return;
		}

		if (severity <= Severity::kERROR)
		{
			printf("\033[31m%s: %s\033[0m\n", label, msg);
			return;
		}

		printf("%s: %s\n", label, msg);
	}
} logger;

#endif // LOGGER_H

preprocess.h

cpp 复制代码

#ifndef PREPROCESS_H
#define PREPROCESS_H

#include <opencv2/opencv.hpp>


/// Scales `image` uniformly so that it fits inside `newShape` and pads the remaining
/// border with the constant `color`, writing the result to `outImage`.
/// @param image     source image
/// @param outImage  receives the resized and padded image
/// @param newShape  target size (defaults to 640x640)
/// @param color     border fill colour (defaults to grey 114,114,114)
void LetterBox(const cv::Mat& image, cv::Mat& outImage, const cv::Size& newShape = cv::Size(640, 640), const cv::Scalar& color = cv::Scalar(114, 114, 114));

#endif // PREPROCESS_H

preprocess.cpp

cpp 复制代码

#include "preprocess.h"


/// Letterbox resize: scales `image` by a single factor so that it fits inside
/// `newShape` (aspect ratio preserved), then pads the shorter side symmetrically
/// with `color` so that `outImage` ends up exactly `newShape` in size.
/// @param image     source image; must not be empty
/// @param outImage  receives the resized and padded image
/// @param newShape  target size
/// @param color     border fill colour
void LetterBox(const cv::Mat& image, cv::Mat& outImage, const cv::Size& newShape, const cv::Scalar& color)
{
	// An empty input would turn the scale factor below into a division by zero.
	CV_Assert(!image.empty());

	const cv::Size shape = image.size();
	const float r = std::min(static_cast<float>(newShape.height) / static_cast<float>(shape.height),
	                         static_cast<float>(newShape.width) / static_cast<float>(shape.width));

	// Size after scaling and before padding. Clamp to one pixel so that extreme
	// aspect ratios cannot round a side down to zero (cv::resize rejects that).
	const cv::Size unpadded(std::max(1, static_cast<int>(std::round(static_cast<float>(shape.width) * r))),
	                        std::max(1, static_cast<int>(std::round(static_cast<float>(shape.height) * r))));

	const float dw = static_cast<float>(newShape.width - unpadded.width) / 2;
	const float dh = static_cast<float>(newShape.height - unpadded.height) / 2;

	// Resize as soon as EITHER side changes. Requiring both sides to differ skipped
	// the resize for very thin images (one side already rounds to its old value),
	// which produced an output larger than newShape.
	if (shape != unpadded)
		cv::resize(image, outImage, unpadded);
	else
		outImage = image.clone();

	// The +-0.1 nudges send an odd padding total to floor on one side and ceil on the other.
	const int top = static_cast<int>(std::round(dh - 0.1f));
	const int bottom = static_cast<int>(std::round(dh + 0.1f));
	const int left = static_cast<int>(std::round(dw - 0.1f));
	const int right = static_cast<int>(std::round(dw + 0.1f));

	cv::copyMakeBorder(outImage, outImage, top, bottom, left, right, cv::BORDER_CONSTANT, color);
}

yolov5.h

cpp 复制代码

#ifndef YOLOV5_H
#define YOLOV5_H

#include <iostream>
#include <fstream>
#include <sstream>
#include <vector>
#include <ctime>
#include <cuda_runtime.h>
#include <NvInfer.h>
#include <NvInferRuntime.h>
#include <opencv2/opencv.hpp>

#include "preprocess.h"
#include "postprocess.h"

//#define USE_CUDA

#ifdef USE_CUDA
    #include "preprocess.cuh"
    #include "decode.cuh"
#endif


/// YOLOv5 detector that runs a serialized TensorRT engine.
/// Typical use: construct, call load_model() once, then call infer() per image.
/// All sizes below are derived from the fixed 640x640 network input.
class Yolov5
{
public:
    Yolov5();
    ~Yolov5();

    // The object owns raw CUDA / TensorRT resources that the destructor releases;
    // a copy would release the same pointers a second time, so copying is disabled.
    Yolov5(const Yolov5&) = delete;
    Yolov5& operator=(const Yolov5&) = delete;

    int load_model(const std::string& model_path);                        // load the engine file; 0 on success, -1 on failure
    int infer(const cv::Mat &image, std::vector<Detection> &detections); // run the model on one image; 0 on success, -1 on failure

private:
    int pre_process(const cv::Mat &image);   // image pre-processing
    int post_process(const cv::Mat &image, std::vector<Detection>& detections); // post-processing

    const cv::Size input_size = cv::Size(640, 640);
    const int input_numel = 1 * 3 * input_size.width * input_size.height;
    const float confidence_threshold = 0.5;
    const float score_threshold = 0.25;
    const float nms_threshold = 0.45;
    const int class_num = 80;
    const int output_numprob = 5 + class_num;   // x, y, w, h, objectness + one score per class
    // three candidate boxes per cell on the stride 8 / 16 / 32 grids
    const int output_numbox = 3 * (input_size.width / 8 * input_size.height / 8 + input_size.width / 16 * input_size.height / 16 + input_size.width / 32 * input_size.height / 32);
    const int output_numel = 1 * output_numprob * output_numbox;

    nvinfer1::IRuntime* runtime = nullptr;
    nvinfer1::ICudaEngine* engine = nullptr;
    nvinfer1::IExecutionContext* execution_context = nullptr;
    cudaStream_t stream = nullptr;
    float* input_h = nullptr;    // host-side input buffer
    float* output_h = nullptr;   // host-side output buffer
    float* input_d = nullptr;    // device-side input buffer
    float* output_d = nullptr;   // device-side output buffer
    float* bindings[2] = { nullptr, nullptr };   // { input_d, output_d }, handed to executeV2

#ifdef USE_CUDA
    uint8_t* input_host = nullptr;
    float* d2s_host = nullptr;
    float* d2s_device = nullptr;
    float* s2d_host = nullptr;
    float* s2d_device = nullptr;
    float* output_box_host = nullptr;
    float* output_box_device = nullptr;
    const int max_box = 1024;
    const int nubox_element = 7;
    const int max_input_size = sizeof(float) * 3 * 1024 * 1024;
#endif
};

#endif // YOLOV5_H

yolov5.cpp

cpp 复制代码

#include "yolov5.h"
#include "logger.h"


/// Creates the TensorRT runtime and allocates every buffer the detector needs:
/// one host and one device buffer for the network input and for the network output.
/// The two device buffers are also recorded in `bindings` for executeV2.
Yolov5::Yolov5()
{
	const size_t input_bytes = sizeof(float) * input_numel;
	const size_t output_bytes = sizeof(float) * output_numel;

	runtime = nvinfer1::createInferRuntime(logger);

	// host buffers (allocated through the CUDA host allocator)
	cudaMallocHost(&input_h, input_bytes);
	cudaMallocHost(&output_h, output_bytes);

	// device buffers
	cudaMalloc(&input_d, input_bytes);
	cudaMalloc(&output_d, output_bytes);

	bindings[0] = input_d;
	bindings[1] = output_d;

#ifdef USE_CUDA
	const size_t six_floats = sizeof(float) * 6;
	const size_t box_bytes = sizeof(float) * (nubox_element * max_box + 1);

	cudaMallocHost(&input_host, max_input_size);
	cudaMallocHost(&d2s_host, six_floats);
	cudaMalloc(&d2s_device, six_floats);
	cudaMallocHost(&s2d_host, six_floats);
	cudaMalloc(&s2d_device, six_floats);
	cudaMallocHost(&output_box_host, box_bytes);
	cudaMalloc(&output_box_device, box_bytes);
#endif
}

/// Releases everything acquired by the constructor and by load_model():
/// the CUDA stream, the TensorRT context / engine / runtime and all buffers.
Yolov5::~Yolov5()
{
	// The stream only exists after a successful load_model().
	if (stream != nullptr)
	{
		cudaStreamSynchronize(stream);   // let in-flight work finish before its buffers go away
		cudaStreamDestroy(stream);
		stream = nullptr;
	}

	// TensorRT objects are released with plain delete (TensorRT 8 and later),
	// dependents first: context, then engine, then runtime.
	delete execution_context;
	delete engine;
	delete runtime;
	execution_context = nullptr;
	engine = nullptr;
	runtime = nullptr;

	cudaFree(input_d);
	cudaFree(output_d);
	cudaFreeHost(input_h);
	cudaFreeHost(output_h);

#ifdef USE_CUDA
	// Buffers that exist only in the GPU pre/post-processing build.
	cudaFreeHost(input_host);
	cudaFreeHost(d2s_host);
	cudaFree(d2s_device);
	cudaFreeHost(s2d_host);
	cudaFree(s2d_device);
	cudaFreeHost(output_box_host);
	cudaFree(output_box_device);
#endif
}


int Yolov5::load_model(const std::string& model_path)
{
    std::ifstream in(model_path, std::ios::binary);
    if (!in.is_open()) 
		return -1;

    in.seekg(0, std::ios::end);
    size_t size = in.tellg();
    std::vector<unsigned char> engine_data(size);
    in.seekg(0, std::ios::beg);
    in.read((char*)engine_data.data(), size);
    in.close();

    engine = runtime->deserializeCudaEngine(engine_data.data(), engine_data.size());
	if(engine == nullptr) 
		return -1;

    execution_context = engine->createExecutionContext();
	if(execution_context == nullptr)
		return -1;

    cudaStreamCreate(&stream);
    return 0;
}


/// Converts `image` into the network input and starts its upload to `input_d`.
/// CPU path: letterbox to `input_size`, scale to [0,1] floats, split the interleaved
/// pixels into three planes in reversed channel order, then copy host -> device on `stream`.
/// @param image  3-channel input image
/// @return 0 on success, -1 if the image cannot be used (empty, wrong channel count,
///         or it would not fit the fixed-size buffers)
int Yolov5::pre_process(const cv::Mat &image)
{
	// Both paths below index the pixel data as three interleaved channels.
	if (image.empty() || image.channels() != 3)
		return -1;

#ifdef USE_CUDA
	// input_host was allocated with max_input_size bytes and the copy assumes densely
	// packed 8-bit pixels; refuse anything that would overrun or misread the buffer.
	const size_t image_bytes = sizeof(uint8_t) * 3 * static_cast<size_t>(image.cols) * static_cast<size_t>(image.rows);
	if (image.depth() != CV_8U || !image.isContinuous() || image_bytes > static_cast<size_t>(max_input_size))
		return -1;

	// NOTE(review): input_host comes from cudaMallocHost, yet the copy kind is HostToDevice - confirm this is intended.
	cudaMemcpyAsync(input_host, image.data, image_bytes, cudaMemcpyHostToDevice, stream);
	preprocess_kernel_img(input_host, image.cols, image.rows, input_d, input_size.width, input_size.height, d2s_host, s2d_host, stream);
	cudaMemcpyAsync(d2s_device, d2s_host, sizeof(float) * 6, cudaMemcpyHostToDevice, stream);
	cudaMemcpyAsync(s2d_device, s2d_host, sizeof(float) * 6, cudaMemcpyHostToDevice, stream);
#else
	cv::Mat letterbox;
	LetterBox(image, letterbox, input_size);
	letterbox.convertTo(letterbox, CV_32FC3, 1.0f / 255.0f);

	// input_h holds exactly input_numel floats; a differently sized letterbox
	// result would make the loop below write past the end of it.
	if (letterbox.size() != input_size || !letterbox.isContinuous())
		return -1;

	const int image_area = letterbox.cols * letterbox.rows;
	const float* pixel = reinterpret_cast<const float*>(letterbox.data);
	float* plane0 = input_h;
	float* plane1 = input_h + image_area;
	float* plane2 = input_h + image_area * 2;

	// Interleaved channel 0 goes to plane 2 and channel 2 to plane 0, i.e. the channel
	// order is reversed (for OpenCV's usual BGR frames this yields R, G, B planes).
	for (int i = 0; i < image_area; ++i, pixel += 3)
	{
		*plane2++ = pixel[0];
		*plane1++ = pixel[1];
		*plane0++ = pixel[2];
	}

	cudaMemcpyAsync(input_d, input_h, sizeof(float) * input_numel, cudaMemcpyHostToDevice, stream);
#endif
    return 0;
}


/// Runs the full pipeline on one image: pre-process, TensorRT execution, post-process.
/// @param image       input image
/// @param detections  receives the detected objects; left empty when inference fails
/// @return 0 on success, -1 when no model is loaded or any stage fails
int Yolov5::infer(const cv::Mat& image, std::vector<Detection>& detections)
{
	// Never hand stale results from a previous call back to the caller.
	detections.clear();

	// load_model() has not been called, or it failed.
	if (execution_context == nullptr)
	{
		std::cerr << "Failed to run inference: model is not loaded" << std::endl;
		return -1;
	}

	if (pre_process(image) != 0)
	{
		std::cerr << "Failed to pre-process image" << std::endl;
		return -1;
	}

	// pre_process() queues its uploads on `stream`, but executeV2 is not given that
	// stream. Wait for the uploads explicitly instead of relying on implicit ordering.
	cudaStreamSynchronize(stream);

	bool success = execution_context->executeV2((void**)bindings);
	if(!success)
	{
	    std::cerr << "Failed to run inference" << std::endl;
	    return -1;
	}

#ifndef USE_CUDA
	// CPU post-processing reads output_h, so fetch the raw network output first.
	cudaMemcpyAsync(output_h, output_d, sizeof(float) * output_numel, cudaMemcpyDeviceToHost, stream);
    cudaStreamSynchronize(stream);
#endif

    return post_process(image, detections);
}


/// Turns the raw network output into the final detection list.
/// GPU build (USE_CUDA): decode and NMS run on the device, then the surviving
/// records are copied back and read here.
/// CPU build: candidates are filtered by objectness and class score, mapped back
/// to the original image with scale_boxes(), then reduced with nms().
/// @param image       the image passed to infer(); only its size is used here
/// @param detections  replaced with the resulting detections
/// @return always 0
int Yolov5::post_process(const cv::Mat &image,  std::vector<Detection>& detections)
{
    std::vector<cv::Rect> boxes;
	std::vector<float> scores;
	std::vector<int> class_ids;

#ifdef USE_CUDA
	const size_t box_buffer_bytes = sizeof(float) * (nubox_element * max_box + 1);

	cudaMemset(output_box_device, 0, box_buffer_bytes);
	decode_kernel_invoker(output_d, output_numbox, class_num, score_threshold, d2s_device, output_box_device, max_box, nubox_element, stream);
	nms_kernel_invoker(output_box_device, nms_threshold, max_box, nubox_element, stream);
	cudaMemcpyAsync(output_box_host, output_box_device, box_buffer_bytes, cudaMemcpyDeviceToHost, stream);
	cudaStreamSynchronize(stream);

	// Record i occupies floats [7*i + 1, 7*i + 7]; the last one is tested as a keep flag.
	for (size_t i = 0; i < max_box; i++)
	{
		const float* record = output_box_host + 7 * i;
		if (record[7] == 0.0f)
			continue;

		const float x1 = record[1];
		const float y1 = record[2];
		const float x2 = record[3];
		const float y2 = record[4];
		boxes.push_back(cv::Rect(static_cast<int>(x1), static_cast<int>(y1), static_cast<int>(x2 - x1), static_cast<int>(y2 - y1)));
		scores.push_back(record[5]);
		class_ids.push_back(static_cast<int>(record[6]));
	}

	const size_t kept = boxes.size();
	detections.clear();
	detections.resize(kept);
	for (size_t k = 0; k < kept; ++k)
	{
		detections[k].bbox = boxes[k];
		detections[k].score = scores[k];
		detections[k].id = class_ids[k];
	}

#else
	for (int i = 0; i < output_numbox; ++i)
	{
		// One row per candidate: x, y, w, h, objectness, then class_num class scores.
		float* row = output_h + i * output_numprob;

		const float objectness = row[4];
		if (objectness < confidence_threshold)
			continue;

		float* class_begin = row + 5;
		float* class_end = class_begin + class_num;
		const int class_id = std::max_element(class_begin, class_end) - class_begin;
		const float score = class_begin[class_id] * objectness;
		if (score < score_threshold)
			continue;

		// The row stores centre x/y plus width/height; cv::Rect wants the top-left corner.
		const float cx = row[0];
		const float cy = row[1];
		const float w = row[2];
		const float h = row[3];

		cv::Rect box(int(cx - 0.5 * w), int(cy - 0.5 * h), int(w), int(h));
		scale_boxes(box, input_size, image.size());

		boxes.push_back(box);
		scores.push_back(score);
		class_ids.push_back(class_id);
	}

	std::vector<int> indices;
	nms(boxes, scores, score_threshold, nms_threshold, indices);

	const size_t kept = indices.size();
	detections.clear();
	detections.resize(kept);
	for (size_t k = 0; k < kept; ++k)
	{
		const int idx = indices[k];
		detections[k].bbox = boxes[idx];
		detections[k].score = scores[idx];
		detections[k].id = class_ids[idx];
	}
#endif

    return 0;
}

单线程版本

test_yolov5.cpp

cpp 复制代码

#include "yolov5.h"

int main()
{
    Yolov5* yolov5 = new Yolov5();
    yolov5->load_model("yolov5n_int8.engine");

    cv::Mat image = cv::imread("bus.jpg");
    std::vector<Detection> detections;
    yolov5->infer(image, detections);
    std::cout << "detections size: " << detections.size() << std::endl;
    
    clock_t start = clock();
    for (int i = 0; i < 1000; i++)
    {
        cv::Mat image = cv::imread("bus.jpg");
        yolov5->infer(image, detections);
    }
    clock_t end = clock();
    std::cout << "time: " << (double)(end - start) / CLOCKS_PER_SEC << "s" << std::endl;

    draw_detections(image, detections);
    cv::imwrite("result.jpg", image);

    return 0;
}

运行./demo输出：

bash 复制代码

info: Loaded engine size: 4 MiB
info: [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +0, GPU +15, now: CPU 0, GPU 17 (MiB)
detections size: 3
time: 24.9972s

线程池版本

yolov5_thread_pool.h

cpp 复制代码

#ifndef YOLOV5_THREAD_POOL_H
#define YOLOV5_THREAD_POOL_H

#include "yolov5.h"

#include <atomic>
#include <chrono>
#include <condition_variable>
#include <iostream>
#include <map>
#include <memory>
#include <mutex>
#include <queue>
#include <string>
#include <thread>
#include <utility>
#include <vector>

/// Pool of worker threads, each with its own Yolov5 instance.
/// Producers hand in frames with submitTask(); consumers fetch the result for a
/// given frame id with getTargetResult() / getTargetImgResult().
/// `mtx1` guards the task queue, `mtx2` guards the two result maps.
class Yolov5ThreadPool
{
private:
    std::queue<std::pair<int, cv::Mat>> tasks;             // <id, img> pending tasks
    std::vector<std::shared_ptr<Yolov5>> yolov5_instances; // model instances
    std::map<int, std::vector<Detection>> results;         // <id, objects> finished detections
    std::map<int, cv::Mat> img_results;                    // <id, img> finished annotated images
    std::vector<std::thread> threads;                      // worker threads
    std::mutex mtx1;
    std::mutex mtx2;
    std::condition_variable cv_task, cv_result;
    // Written by the controlling thread and read by every worker, so it has to be atomic.
    std::atomic<bool> stop{false};

    void worker(int id);

public:
    Yolov5ThreadPool();
    ~Yolov5ThreadPool();

    // The pool owns threads and mutexes; it cannot be copied.
    Yolov5ThreadPool(const Yolov5ThreadPool&) = delete;
    Yolov5ThreadPool& operator=(const Yolov5ThreadPool&) = delete;

    int setUp(const std::string &model_path, int num_threads = 12);     // load the models and start the workers
    int submitTask(const cv::Mat &img, int id);                         // queue one frame
    int getTargetResult(std::vector<Detection> &objects, int id);       // fetch detections for frame `id`
    int getTargetImgResult(cv::Mat &img, int id);                       // fetch the annotated image for frame `id`
    void stopAll();                                                     // ask all workers to stop
};

#endif //YOLOV5_THREAD_POOL_H

yolov5_thread_pool.cpp

cpp 复制代码

#include "yolov5_thread_pool.h"

// Constructor: a new pool starts in the "not stopped" state.
Yolov5ThreadPool::Yolov5ThreadPool() : stop(false) {}

// Destructor: asks every worker to stop and waits for all of them to exit.
Yolov5ThreadPool::~Yolov5ThreadPool()
{
    {
        // Workers test `stop` inside their wait on cv_task while holding mtx1.
        // Setting it under the same mutex means no worker can check the flag, see
        // false, and then miss the notification below - which would hang join().
        std::lock_guard<std::mutex> lock(mtx1);
        stop = true;
    }
    cv_task.notify_all();

    for (auto &thread : threads)
    {
        if (thread.joinable())
        {
            thread.join();
        }
    }
}

// 初始化：加载模型，创建线程，参数：模型路径，线程数量
int Yolov5ThreadPool::setUp(const std::string &model_path, int num_threads)
{
    // 遍历线程数量，创建模型实例，放入vector
    // 这些线程加载的模型是同一个
    for (size_t i = 0; i < num_threads; ++i)
    {
        std::shared_ptr<Yolov5> yolov5 = std::make_shared<Yolov5>();
        yolov5->load_model(model_path.c_str());
        yolov5_instances.push_back(yolov5);
    }
    // 遍历线程数量，创建线程
    for (size_t i = 0; i < num_threads; ++i)
    {
        threads.emplace_back(&Yolov5ThreadPool::worker, this, i);
    }
    return 0;
}


// Worker thread body: repeatedly takes a task, runs the model, draws the
// detections onto the frame and publishes both results under the task id.
// @param id  worker index; also the index of this worker's model instance
void Yolov5ThreadPool::worker(int id)
{
    // Each worker owns exactly one instance, so look it up once.
    const std::shared_ptr<Yolov5> instance = yolov5_instances[id];

    while (true)
    {
        std::pair<int, cv::Mat> task;
        {
            // Fetch a task. `stop` is only examined while holding mtx1, in the wait
            // predicate and right after it, so a stop request is never overlooked.
            std::unique_lock<std::mutex> lock(mtx1);
            cv_task.wait(lock, [&] { return !tasks.empty() || stop; });
            if (stop)
                return;

            task = std::move(tasks.front());
            tasks.pop();
        }

        // Run the model and annotate the frame outside of any lock so that
        // workers do not serialise on the result mutex while drawing.
        // NOTE(review): assumes draw_detections only reads `detections` - confirm.
        std::vector<Detection> detections;
        instance->infer(task.second, detections);
        draw_detections(task.second, detections);

        {
            // Publish the results; only the map updates happen under the lock.
            std::lock_guard<std::mutex> lock(mtx2);
            results.emplace(task.first, std::move(detections));
            img_results.emplace(task.first, task.second);
        }
        // Consumers may be waiting for different ids, so wake all of them.
        cv_result.notify_all();
    }
}


// Queues one frame for processing.
// @param img  frame to process (cv::Mat copies share pixel data, so pass a frame
//             the caller will not modify afterwards)
// @param id   frame number; results are later fetched by this id
// @return 0 once the task is queued, -1 if the pool has been stopped
int Yolov5ThreadPool::submitTask(const cv::Mat &img, int id)
{
    std::unique_lock<std::mutex> lock(mtx1);

    // Back-pressure: while more than 1000 tasks are pending, wait so the queue
    // cannot grow without bound. The size is read under mtx1 because workers pop
    // from the queue concurrently.
    while (tasks.size() > 1000)
    {
        // A stopped pool never drains the queue; give up instead of waiting forever.
        if (stop)
            return -1;

        lock.unlock();
        std::this_thread::sleep_for(std::chrono::milliseconds(1));
        lock.lock();
    }

    if (stop)
        return -1;

    // Store the task.
    tasks.push({id, img});
    lock.unlock();

    cv_task.notify_one();
    return 0;
}

// 获取结果，参数：检测框，id（帧号）
int Yolov5ThreadPool::getTargetResult(std::vector<Detection> &objects, int id)
{
    // 如果没有结果，等待
    while (results.find(id) == results.end())
    {
        std::this_thread::sleep_for(std::chrono::milliseconds(1));
    }
    std::lock_guard<std::mutex> lock(mtx2);
    objects = results[id];
    // remove from map
    results.erase(id);
    return 0;
}

// Fetches the annotated image of frame `id`, waiting up to about one second for it,
// and removes it from the pool.
// @param img  receives the image; left untouched on timeout
// @param id   frame number given to submitTask()
// @return 0 when the image was delivered, -1 on timeout
int Yolov5ThreadPool::getTargetImgResult(cv::Mat &img, int id)
{
    const auto deadline = std::chrono::steady_clock::now() + std::chrono::milliseconds(1000);

    // Workers insert into `img_results` under mtx2, so the lookup must hold it as well.
    std::unique_lock<std::mutex> lock(mtx2);

    auto it = img_results.find(id);
    while (it == img_results.end())
    {
        if (std::chrono::steady_clock::now() >= deadline)
        {
            lock.unlock();
            std::cerr << "getTargetImgResult timeout" << std::endl;
            // Report the timeout; returning 0 here would look like success.
            return -1;
        }

        // Sleep on the result condition variable. The 1 ms cap keeps this correct
        // even if a notification is delivered to a different waiter.
        cv_result.wait_for(lock, std::chrono::milliseconds(1));
        it = img_results.find(id);
    }

    img = it->second;
    // remove from map
    img_results.erase(it);

    return 0;
}

void Yolov5ThreadPool::stopAll()
{
    stop = true;
    cv_task.notify_all();
}

test_yolov5_thread_pool.cpp

cpp 复制代码

#include "yolov5_thread_pool.h"
#include <chrono>


static int g_frame_start_id = 0; // id given to the next frame read from the video
static int g_frame_end_id = 0;   // id of the next frame whose result is collected
static Yolov5ThreadPool *g_pool = nullptr;
// Set by the reader thread and polled by the result thread, hence atomic.
std::atomic<bool> end{false};


/// Reader thread: decodes `video_file` frame by frame and submits every frame to
/// the pool under a consecutive id. Sets `end` when there is nothing more to read.
void read_stream(const std::string& video_file)
{
    cv::VideoCapture cap(video_file);
    if (!cap.isOpened())
    {
        std::cerr << "failed to open video: " << video_file << std::endl;
        // Without this the result thread would wait for frames forever.
        end = true;
        return;
    }

    cv::Mat img;
    while (true)
    {
        cap >> img;
        if (img.empty())
            break;

        // clone(): `img` is overwritten by the next read, the queued task needs its own pixels.
        if (g_pool->submitTask(img.clone(), g_frame_start_id) != 0)
            break;   // the pool refused the frame; stop reading

        // Count only frames that were really queued.
        ++g_frame_start_id;
    }
    cap.release();

    // Publish "no more frames" only after the final frame count has been written.
    end = true;
}


void get_results()
{
    auto start_all = std::chrono::high_resolution_clock::now();
    int frame_count = 0;

    //cv::VideoWriter writer = cv::VideoWriter("result.mp4", cv::VideoWriter::fourcc('m', 'p', '4', 'v'), 30, cv::Size(1280, 720));
    while (true)
    {
        cv::Mat img;
        auto ret = g_pool->getTargetImgResult(img, g_frame_end_id++);
        if (end)
        {
            g_pool->stopAll();
            break;
        }
        //cv::imwrite("output/" + std::to_string(g_frame_end_id) + ".jpg", img);
        //writer << img;

        frame_count++;
        auto end_all = std::chrono::high_resolution_clock::now();
        auto elapsed_all_2 = std::chrono::duration_cast<std::chrono::microseconds>(end_all - start_all).count() / 1000.f;
        if (elapsed_all_2 >= 1000)
        {
            printf("FPS:%f \n", frame_count / (elapsed_all_2 / 1000.0f));
            frame_count = 0;
            start_all = std::chrono::high_resolution_clock::now();
        }
    }
    g_pool->stopAll();
}


int main(int argc, char **argv)
{
    g_pool = new Yolov5ThreadPool();
    g_pool->setUp(argv[1], atoi(argv[2]));

    std::thread read_stream_thread(read_stream, "bj_full.mp4");
    std::thread result_thread(get_results);

    read_stream_thread.join();
    result_thread.join();

    return 0;
}

CMakeLists.txt

cmake 复制代码

cmake_minimum_required(VERSION 3.20)
project(trt_inference LANGUAGES C CXX CUDA)

set(CMAKE_CXX_STANDARD 14)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

# CUDA runtime: CUDAToolkit replaces the deprecated FindCUDA module and
# provides the imported target CUDA::cudart.
find_package(CUDAToolkit REQUIRED)

# std::thread needs the platform threads library on some toolchains.
find_package(Threads REQUIRED)

# Machine-specific locations. They are cache entries, so they can be overridden
# on the command line, e.g. -DOpenCV_DIR=... -DTENSORRT_ROOT=...
set(OpenCV_DIR /home/tfy/document/HybrIK/cpp/opencv-4.12.0/lib/cmake/opencv4 CACHE PATH "Directory containing OpenCVConfig.cmake")
find_package(OpenCV REQUIRED)

set(TENSORRT_ROOT /home/tfy/docker_share/TensorRT-10.6.0.26 CACHE PATH "TensorRT installation root")
set(TENSORRT_INCLUDE_DIRS ${TENSORRT_ROOT}/include)
set(TENSORRT_LIBRARY_DIRS ${TENSORRT_ROOT}/targets/x86_64-linux-gnu/lib)

include_directories(${CUDAToolkit_INCLUDE_DIRS} ${TENSORRT_INCLUDE_DIRS} ${OpenCV_INCLUDE_DIRS})
link_directories(${TENSORRT_LIBRARY_DIRS})

# Single-threaded demo
add_executable(demo test_yolov5.cpp preprocess.cpp postprocess.cpp yolov5.cpp)
target_link_libraries(demo PRIVATE ${OpenCV_LIBS} CUDA::cudart ${TENSORRT_LIBRARY_DIRS}/libnvinfer.so)

# Thread-pool demo
add_executable(thread_pool test_yolov5_thread_pool.cpp preprocess.cpp postprocess.cpp yolov5.cpp yolov5_thread_pool.cpp)
target_link_libraries(thread_pool PRIVATE ${OpenCV_LIBS} CUDA::cudart ${TENSORRT_LIBRARY_DIRS}/libnvinfer.so Threads::Threads)

运行 ./thread_pool yolov5n_int8.engine 16 的性能实测结果如下（测试硬件：RTX 4090 + 24 核 CPU）：

输出

bash 复制代码

info: Loaded engine size: 4 MiB
info: [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +0, GPU +15, now: CPU 0, GPU 17 (MiB)
info: Loaded engine size: 4 MiB
info: [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +1, GPU +15, now: CPU 1, GPU 35 (MiB)
info: Loaded engine size: 4 MiB
info: [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +0, GPU +14, now: CPU 1, GPU 52 (MiB)
info: Loaded engine size: 4 MiB
info: [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +1, GPU +15, now: CPU 2, GPU 70 (MiB)
info: Loaded engine size: 4 MiB
info: [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +0, GPU +15, now: CPU 2, GPU 88 (MiB)
info: Loaded engine size: 4 MiB
info: [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +1, GPU +14, now: CPU 3, GPU 105 (MiB)
info: Loaded engine size: 4 MiB
info: [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +0, GPU +15, now: CPU 3, GPU 123 (MiB)
info: Loaded engine size: 4 MiB
info: [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +1, GPU +15, now: CPU 4, GPU 141 (MiB)
info: Loaded engine size: 4 MiB
info: [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +0, GPU +14, now: CPU 4, GPU 158 (MiB)
info: Loaded engine size: 4 MiB
info: [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +1, GPU +15, now: CPU 5, GPU 176 (MiB)
info: Loaded engine size: 4 MiB
info: [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +0, GPU +15, now: CPU 5, GPU 194 (MiB)
info: Loaded engine size: 4 MiB
info: [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +1, GPU +15, now: CPU 6, GPU 211 (MiB)
info: Loaded engine size: 4 MiB
info: [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +0, GPU +15, now: CPU 6, GPU 229 (MiB)
info: Loaded engine size: 4 MiB
info: [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +1, GPU +15, now: CPU 7, GPU 247 (MiB)
info: Loaded engine size: 4 MiB
info: [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +0, GPU +15, now: CPU 7, GPU 264 (MiB)
info: Loaded engine size: 4 MiB
info: [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +1, GPU +15, now: CPU 8, GPU 282 (MiB)
FPS:731.412842 
FPS:713.332947 
FPS:797.625244 
FPS:850.723083 
FPS:759.826111 
FPS:790.603821 
FPS:893.340698 
FPS:876.184570 
FPS:849.282715 
FPS:823.478210 
FPS:818.882874 
FPS:839.987427 
FPS:841.113464 
FPS:846.878052 
FPS:853.196289 
FPS:881.448181 
FPS:867.657776 
FPS:909.925354 
FPS:884.442505 
FPS:975.532715 
FPS:984.539246 
FPS:1001.592773 
FPS:951.560303 
FPS:965.459229 
FPS:963.278687 
FPS:916.945496 
FPS:954.125122 
FPS:965.415710 
FPS:1004.943909 
FPS:919.196472 
FPS:942.398926 
FPS:911.690796 
FPS:994.875244 
FPS:895.988342

视频推理截图：

完整工程见：https://github.com/taifyang/yolov5-tensorrt-threadpool