yolov5-tensorrt推理框架实现
logger.h
cpp
#ifndef LOGGER_H
#define LOGGER_H
#include <NvInfer.h>
#include <NvInferRuntime.h>
/// Translate a TensorRT log severity into the lowercase label that is printed
/// in front of each log message. Any value other than the five handled
/// severities yields "unknow".
inline const char* severity_string(nvinfer1::ILogger::Severity t)
{
    using Severity = nvinfer1::ILogger::Severity;

    if (t == Severity::kINTERNAL_ERROR)
        return "internal_error";
    if (t == Severity::kERROR)
        return "error";
    if (t == Severity::kWARNING)
        return "warning";
    if (t == Severity::kINFO)
        return "info";
    if (t == Severity::kVERBOSE)
        return "verbose";
    return "unknow";
}
class TRTLogger : public nvinfer1::ILogger
{
public:
virtual void log(Severity severity, nvinfer1::AsciiChar const* msg) noexcept override
{
if (severity <= Severity::kINFO)
{
if (severity == Severity::kWARNING)
printf("\033[33m%s: %s\033[0m\n", severity_string(severity), msg);
else if (severity <= Severity::kERROR)
printf("\033[31m%s: %s\033[0m\n", severity_string(severity), msg);
else
printf("%s: %s\n", severity_string(severity), msg);
}
}
} logger;
#endif // LOGGER_H
preprocess.h
cpp
#ifndef PREPROCESS_H
#define PREPROCESS_H
#include <opencv2/opencv.hpp>
/// Scale `image` uniformly (aspect ratio preserved) so that it fits inside
/// `newShape`, then pad the borders with the constant `color`.
/// @param image    source image
/// @param outImage receives the scaled and padded image
/// @param newShape target size, 640x640 by default
/// @param color    padding colour, (114, 114, 114) by default
void LetterBox(const cv::Mat& image, cv::Mat& outImage, const cv::Size& newShape = cv::Size(640, 640), const cv::Scalar& color = cv::Scalar(114, 114, 114));
#endif // PREPROCESS_H
preprocess.cpp
cpp
#include "preprocess.h"
/// Scale `image` uniformly so that it fits inside `newShape` and pad the
/// remaining border with `color`, splitting the padding between both sides.
///
/// @param image    source image; when it is empty `outImage` is released and
///                 the function returns without doing anything else
/// @param outImage receives the letterboxed image
/// @param newShape target size
/// @param color    constant border colour
void LetterBox(const cv::Mat& image, cv::Mat& outImage, const cv::Size& newShape, const cv::Scalar& color)
{
    const cv::Size shape = image.size();
    if (shape.width <= 0 || shape.height <= 0)
    {
        // Nothing to scale; also avoids dividing by zero below.
        outImage.release();
        return;
    }

    // A single scale factor for both axes keeps the aspect ratio.
    const float r = std::min(static_cast<float>(newShape.height) / static_cast<float>(shape.height),
                             static_cast<float>(newShape.width) / static_cast<float>(shape.width));

    // Size of the scaled image before padding; never let a side round down to 0.
    const int scaled_w = std::max(1, static_cast<int>(std::round(static_cast<float>(shape.width) * r)));
    const int scaled_h = std::max(1, static_cast<int>(std::round(static_cast<float>(shape.height) * r)));

    const float dw = static_cast<float>(newShape.width - scaled_w) / 2;
    const float dh = static_cast<float>(newShape.height - scaled_h) / 2;

    // Resize whenever EITHER dimension changes. Requiring both to differ skipped
    // the resize when rounding left one side unchanged, which produced an output
    // that did not have the requested size.
    if (shape.width != scaled_w || shape.height != scaled_h)
        cv::resize(image, outImage, cv::Size(scaled_w, scaled_h));
    else
        outImage = image.clone();

    // The -0.1 / +0.1 offsets distribute an odd padding amount as floor / ceil.
    const int top = static_cast<int>(std::round(dh - 0.1f));
    const int bottom = static_cast<int>(std::round(dh + 0.1f));
    const int left = static_cast<int>(std::round(dw - 0.1f));
    const int right = static_cast<int>(std::round(dw + 0.1f));

    cv::copyMakeBorder(outImage, outImage, top, bottom, left, right, cv::BORDER_CONSTANT, color);
}
yolov5.h
cpp
#ifndef YOLOV5_H
#define YOLOV5_H
#include <iostream>
#include <fstream>
#include <sstream>
#include <vector>
#include <ctime>
#include <cuda_runtime.h>
#include <NvInfer.h>
#include <NvInferRuntime.h>
#include <opencv2/opencv.hpp>
#include "preprocess.h"
#include "postprocess.h"
//#define USE_CUDA
#ifdef USE_CUDA
#include "preprocess.cuh"
#include "decode.cuh"
#endif
/// YOLOv5 detector running on a TensorRT engine.
///
/// The constructor allocates the host/device buffers, load_model() deserializes
/// an engine file, and infer() runs pre-processing, inference and
/// post-processing for one image.
class Yolov5
{
public:
    Yolov5();
    ~Yolov5();

    // The object owns raw CUDA buffers and TensorRT objects that the destructor
    // releases; a copy would release them a second time.
    Yolov5(const Yolov5&) = delete;
    Yolov5& operator=(const Yolov5&) = delete;

    /// Load a serialized TensorRT engine. Returns 0 on success, -1 on failure.
    int load_model(const std::string& model_path);
    /// Run the model on `image` and store the detections. Returns 0 on success, -1 on failure.
    int infer(const cv::Mat &image, std::vector<Detection> &detections);

private:
    int pre_process(const cv::Mat &image);                                      // image pre-processing
    int post_process(const cv::Mat &image, std::vector<Detection>& detections); // decoding + NMS

    const cv::Size input_size = cv::Size(640, 640);                        // network input size
    const int input_numel = 1 * 3 * input_size.width * input_size.height;  // floats in the input buffer
    const float confidence_threshold = 0.5;  // minimum objectness score of a candidate
    const float score_threshold = 0.25;      // minimum objectness * class score
    const float nms_threshold = 0.45;        // threshold passed to NMS
    const int class_num = 80;                // number of classes
    const int output_numprob = 5 + class_num; // floats per candidate: x, y, w, h, objectness, class scores
    // Number of candidates: 3 per cell on the stride-8, stride-16 and stride-32 grids.
    const int output_numbox = 3 * (input_size.width / 8 * input_size.height / 8 + input_size.width / 16 * input_size.height / 16 + input_size.width / 32 * input_size.height / 32);
    const int output_numel = 1 * output_numprob * output_numbox; // floats in the output buffer

    nvinfer1::IRuntime* runtime = nullptr;
    nvinfer1::ICudaEngine* engine = nullptr;
    nvinfer1::IExecutionContext* execution_context = nullptr;
    cudaStream_t stream = nullptr;

    float* input_h = nullptr;   // pinned host input buffer
    float* output_h = nullptr;  // pinned host output buffer
    float* input_d = nullptr;   // device input buffer
    float* output_d = nullptr;  // device output buffer
    float* bindings[2] = { nullptr, nullptr }; // { input_d, output_d }, passed to executeV2

#ifdef USE_CUDA
    uint8_t* input_host = nullptr;        // pinned staging buffer for the raw image bytes
    float* d2s_host = nullptr;            // 6-float matrix, host copy
    float* d2s_device = nullptr;          // 6-float matrix, device copy
    float* s2d_host = nullptr;            // 6-float matrix, host copy
    float* s2d_device = nullptr;          // 6-float matrix, device copy
    float* output_box_host = nullptr;     // decoded boxes, host copy
    float* output_box_device = nullptr;   // decoded boxes, device copy
    const int max_box = 1024;             // capacity of the decoded-box buffers
    const int nubox_element = 7;          // floats per decoded box
    const int max_input_size = sizeof(float) * 3 * 1024 * 1024; // size of input_host in bytes
#endif
};
#endif // YOLOV5_H
yolov5.cpp
cpp
#include "yolov5.h"
#include "logger.h"
/// Create the TensorRT runtime and allocate every host/device buffer used by
/// the instance. Failures are reported on stderr; the affected pointer is left
/// null so the problem is visible to later stages instead of being silent.
Yolov5::Yolov5()
{
    // Report a failed CUDA call together with the name of the call.
    const auto check = [](cudaError_t status, const char* what)
    {
        if (status != cudaSuccess)
            std::cerr << "Yolov5: " << what << " failed: " << cudaGetErrorString(status) << std::endl;
    };

    runtime = nvinfer1::createInferRuntime(logger);
    if (runtime == nullptr)
        std::cerr << "Yolov5: createInferRuntime failed" << std::endl;

    // Pinned host buffers plus their device counterparts for the network input/output.
    check(cudaMallocHost(&input_h, sizeof(float) * input_numel), "cudaMallocHost(input_h)");
    check(cudaMallocHost(&output_h, sizeof(float) * output_numel), "cudaMallocHost(output_h)");
    check(cudaMalloc(&input_d, sizeof(float) * input_numel), "cudaMalloc(input_d)");
    check(cudaMalloc(&output_d, sizeof(float) * output_numel), "cudaMalloc(output_d)");

    // Binding order handed to executeV2: input first, output second.
    bindings[0] = input_d;
    bindings[1] = output_d;

#ifdef USE_CUDA
    check(cudaMallocHost(&input_host, max_input_size), "cudaMallocHost(input_host)");
    check(cudaMallocHost(&d2s_host, sizeof(float) * 6), "cudaMallocHost(d2s_host)");
    check(cudaMalloc(&d2s_device, sizeof(float) * 6), "cudaMalloc(d2s_device)");
    check(cudaMallocHost(&s2d_host, sizeof(float) * 6), "cudaMallocHost(s2d_host)");
    check(cudaMalloc(&s2d_device, sizeof(float) * 6), "cudaMalloc(s2d_device)");
    // One leading float followed by max_box records of nubox_element floats each.
    check(cudaMallocHost(&output_box_host, sizeof(float) * (nubox_element * max_box + 1)), "cudaMallocHost(output_box_host)");
    check(cudaMalloc(&output_box_device, sizeof(float) * (nubox_element * max_box + 1)), "cudaMalloc(output_box_device)");
#endif
}
/// Release everything the instance owns: the CUDA stream, the TensorRT
/// objects and all host/device buffers.
Yolov5::~Yolov5()
{
    // The stream only exists after a successful load_model().
    if (stream != nullptr)
        cudaStreamDestroy(stream);

    // TensorRT objects are released in reverse order of creation:
    // the context depends on the engine, the engine on the runtime.
    delete execution_context;
    delete engine;
    delete runtime;

    cudaFree(input_d);
    cudaFree(output_d);
    cudaFreeHost(input_h);
    cudaFreeHost(output_h);

#ifdef USE_CUDA
    // Buffers allocated by the constructor for the CUDA pre/post-processing path.
    cudaFreeHost(input_host);
    cudaFreeHost(d2s_host);
    cudaFree(d2s_device);
    cudaFreeHost(s2d_host);
    cudaFree(s2d_device);
    cudaFreeHost(output_box_host);
    cudaFree(output_box_device);
#endif
}
int Yolov5::load_model(const std::string& model_path)
{
std::ifstream in(model_path, std::ios::binary);
if (!in.is_open())
return -1;
in.seekg(0, std::ios::end);
size_t size = in.tellg();
std::vector<unsigned char> engine_data(size);
in.seekg(0, std::ios::beg);
in.read((char*)engine_data.data(), size);
in.close();
engine = runtime->deserializeCudaEngine(engine_data.data(), engine_data.size());
if(engine == nullptr)
return -1;
execution_context = engine->createExecutionContext();
if(execution_context == nullptr)
return -1;
cudaStreamCreate(&stream);
return 0;
}
/// Convert `image` into the network input and start uploading it to `input_d`.
///
/// Without USE_CUDA the image is letterboxed to `input_size`, scaled to
/// [0, 1] floats and rearranged from interleaved pixels into three planes.
/// With USE_CUDA the raw bytes are staged in `input_host` and the conversion is
/// delegated to preprocess_kernel_img().
///
/// @param image 3-channel input image
/// @return 0 on success, -1 when the image cannot be processed
int Yolov5::pre_process(const cv::Mat &image)
{
    // Both paths read three values per pixel; anything else would read out of bounds.
    if (image.empty() || image.channels() != 3)
        return -1;

#ifdef USE_CUDA
    // The raw bytes are copied verbatim, so they must be packed 8-bit data.
    if (image.type() != CV_8UC3)
        return -1;
    const cv::Mat packed = image.isContinuous() ? image : image.clone();

    const size_t image_bytes = sizeof(uint8_t) * 3 * static_cast<size_t>(packed.cols) * static_cast<size_t>(packed.rows);
    if (image_bytes > static_cast<size_t>(max_input_size)) // input_host holds max_input_size bytes
        return -1;

    // input_host is pinned HOST memory (cudaMallocHost), so this is a host-to-host copy.
    cudaMemcpyAsync(input_host, packed.data, image_bytes, cudaMemcpyHostToHost, stream);
    // NOTE(review): d2s_host / s2d_host are presumably filled by this call — confirm in preprocess.cuh.
    preprocess_kernel_img(input_host, packed.cols, packed.rows, input_d, input_size.width, input_size.height, d2s_host, s2d_host, stream);
    cudaMemcpyAsync(d2s_device, d2s_host, sizeof(float) * 6, cudaMemcpyHostToDevice, stream);
    cudaMemcpyAsync(s2d_device, s2d_host, sizeof(float) * 6, cudaMemcpyHostToDevice, stream);
#else
    cv::Mat letterbox;
    LetterBox(image, letterbox, input_size);
    //cv::resize(image, letterbox, input_size);

    // input_h holds exactly input_numel floats; refuse anything that would not fit.
    if (letterbox.size() != input_size || letterbox.channels() != 3)
        return -1;

    letterbox.convertTo(letterbox, CV_32FC3, 1.0f / 255.0f);

    const int image_area = letterbox.cols * letterbox.rows;
    const float* pixel = letterbox.ptr<float>();
    float* plane0 = input_h + image_area * 0;
    float* plane1 = input_h + image_area * 1;
    float* plane2 = input_h + image_area * 2;
    // Interleaved -> planar with the channel order reversed: channel 2 of every
    // pixel goes to the first plane and channel 0 to the last one
    // (BGR -> RGB, assuming the usual OpenCV BGR layout).
    for (int i = 0; i < image_area; ++i, pixel += 3)
    {
        plane2[i] = pixel[0];
        plane1[i] = pixel[1];
        plane0[i] = pixel[2];
    }

    if (cudaMemcpyAsync(input_d, input_h, sizeof(float) * input_numel, cudaMemcpyHostToDevice, stream) != cudaSuccess)
        return -1;
#endif
    return 0;
}
/// Run the full pipeline (pre-processing, TensorRT inference, post-processing)
/// on one image.
///
/// @param image      input image
/// @param detections receives the detections found in `image`
/// @return 0 on success; -1 when no model is loaded or any stage fails
int Yolov5::infer(const cv::Mat& image, std::vector<Detection>& detections)
{
    // execution_context stays null until load_model() has succeeded.
    if (execution_context == nullptr)
    {
        std::cerr << "Failed to run inference: model is not loaded" << std::endl;
        return -1;
    }

    if (pre_process(image) != 0)
    {
        std::cerr << "Failed to pre-process image" << std::endl;
        return -1;
    }

    // NOTE(review): the input upload is enqueued on `stream` while executeV2 takes no
    // stream argument — confirm that the upload is always ordered before execution.
    const bool success = execution_context->executeV2(reinterpret_cast<void**>(bindings));
    if (!success)
    {
        std::cerr << "Failed to run inference" << std::endl;
        return -1;
    }

#ifndef USE_CUDA
    // The CPU post-processing reads output_h, so fetch the raw output and wait for it.
    cudaMemcpyAsync(output_h, output_d, sizeof(float) * output_numel, cudaMemcpyDeviceToHost, stream);
    cudaStreamSynchronize(stream);
#endif

    return post_process(image, detections);
}
/// Turn the raw network output into the final list of detections.
///
/// With USE_CUDA, decoding and NMS run on the GPU and the surviving boxes are
/// read back from `output_box_host`. Otherwise the candidates in `output_h`
/// are filtered by objectness and score on the CPU, rescaled with
/// scale_boxes() and passed through nms().
///
/// @param image      the image that was given to infer()
/// @param detections replaced by the detections of this image
/// @return always 0
int Yolov5::post_process(const cv::Mat &image, std::vector<Detection>& detections)
{
#ifdef USE_CUDA
    const size_t box_buffer_bytes = sizeof(float) * (nubox_element * max_box + 1);

    // Clear on the instance stream so the memset is ordered with the kernels below
    // and does not go through the default stream.
    cudaMemsetAsync(output_box_device, 0, box_buffer_bytes, stream);
    decode_kernel_invoker(output_d, output_numbox, class_num, score_threshold, d2s_device, output_box_device, max_box, nubox_element, stream);
    nms_kernel_invoker(output_box_device, nms_threshold, max_box, nubox_element, stream);
    cudaMemcpyAsync(output_box_host, output_box_device, box_buffer_bytes, cudaMemcpyDeviceToHost, stream);
    cudaStreamSynchronize(stream);

    detections.clear();
    for (int i = 0; i < max_box; ++i)
    {
        // Element 0 of the buffer is skipped; each record then holds
        // x1, y1, x2, y2, score, class id, keep flag.
        const float* item = output_box_host + 1 + i * nubox_element;
        if (item[6] == 0.0f) // record was not filled or was suppressed
            continue;

        const float x1 = item[0];
        const float y1 = item[1];
        const float x2 = item[2];
        const float y2 = item[3];

        detections.emplace_back();
        Detection& det = detections.back();
        det.bbox = cv::Rect(x1, y1, x2 - x1, y2 - y1);
        det.score = item[4];
        det.id = static_cast<int>(item[5]);
    }
#else
    std::vector<cv::Rect> boxes;
    std::vector<float> scores;
    std::vector<int> class_ids;

    for (int i = 0; i < output_numbox; ++i)
    {
        // Candidate layout: x, y, w, h, objectness, then class_num class scores.
        const float* row = output_h + i * output_numprob;
        const float obj_score = row[4];
        if (obj_score < confidence_threshold)
            continue;

        const float* class_scores = row + 5;
        const int class_id = static_cast<int>(std::max_element(class_scores, class_scores + class_num) - class_scores);
        const float score = class_scores[class_id] * obj_score;
        if (score < score_threshold)
            continue;

        // (x, y) is the box centre; convert to top-left corner plus size.
        const float x = row[0];
        const float y = row[1];
        const float w = row[2];
        const float h = row[3];
        cv::Rect box(static_cast<int>(x - 0.5 * w), static_cast<int>(y - 0.5 * h), static_cast<int>(w), static_cast<int>(h));
        scale_boxes(box, input_size, image.size());

        boxes.push_back(box);
        scores.push_back(score);
        class_ids.push_back(class_id);
    }

    std::vector<int> indices;
    nms(boxes, scores, score_threshold, nms_threshold, indices);

    detections.clear();
    detections.resize(indices.size());
    for (size_t i = 0; i < indices.size(); ++i)
    {
        const int idx = indices[i];
        detections[i].bbox = boxes[idx];
        detections[i].score = scores[idx];
        detections[i].id = class_ids[idx];
    }
#endif
    return 0;
}
单线程版本
test_yolov5.cpp
cpp
#include "yolov5.h"
int main()
{
Yolov5* yolov5 = new Yolov5();
yolov5->load_model("yolov5n_int8.engine");
cv::Mat image = cv::imread("bus.jpg");
std::vector<Detection> detections;
yolov5->infer(image, detections);
std::cout << "detections size: " << detections.size() << std::endl;
clock_t start = clock();
for (int i = 0; i < 1000; i++)
{
cv::Mat image = cv::imread("bus.jpg");
yolov5->infer(image, detections);
}
clock_t end = clock();
std::cout << "time: " << (double)(end - start) / CLOCKS_PER_SEC << "s" << std::endl;
draw_detections(image, detections);
cv::imwrite("result.jpg", image);
return 0;
}
运行 `./demo` 的输出:
bash
info: Loaded engine size: 4 MiB
info: [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +0, GPU +15, now: CPU 0, GPU 17 (MiB)
detections size: 3
time: 24.9972s
线程池版本
yolov5_thread_pool.h
cpp
#ifndef YOLOV5_THREAD_POOL_H
#define YOLOV5_THREAD_POOL_H
#include "yolov5.h"
#include <atomic>
#include <chrono>
#include <condition_variable>
#include <iostream>
#include <map>
#include <memory>
#include <mutex>
#include <queue>
#include <string>
#include <thread>
#include <utility>
#include <vector>
/// Pool of worker threads, each with its own Yolov5 instance, that processes
/// images submitted with submitTask() and stores the results by task id.
class Yolov5ThreadPool
{
private:
    std::queue<std::pair<int, cv::Mat>> tasks;             // <id, img> pending tasks
    std::vector<std::shared_ptr<Yolov5>> yolov5_instances; // one model instance per worker
    std::map<int, std::vector<Detection>> results;         // <id, detections> finished results
    std::map<int, cv::Mat> img_results;                    // <id, img> finished images with detections drawn
    std::vector<std::thread> threads;                      // worker threads
    std::mutex mtx1;                                       // guards `tasks`
    std::mutex mtx2;                                       // guards `results` and `img_results`
    std::condition_variable cv_task, cv_result;
    // Written by the controlling thread and read by every worker, so it has to be atomic.
    std::atomic<bool> stop{false};

    void worker(int id);

public:
    Yolov5ThreadPool();
    ~Yolov5ThreadPool();

    // The pool owns threads and mutexes and hands `this` to its workers; it must not be copied.
    Yolov5ThreadPool(const Yolov5ThreadPool&) = delete;
    Yolov5ThreadPool& operator=(const Yolov5ThreadPool&) = delete;

    int setUp(const std::string &model_path, int num_threads = 12); // load the models and start the workers
    int submitTask(const cv::Mat &img, int id);                     // queue an image under the given id
    int getTargetResult(std::vector<Detection> &objects, int id);   // fetch the detections of task `id`
    int getTargetImgResult(cv::Mat &img, int id);                   // fetch the annotated image of task `id`
    void stopAll();                                                 // ask all workers to stop
};
#endif //YOLOV5_THREAD_POOL_H
yolov5_thread_pool.cpp
cpp
#include "yolov5_thread_pool.h"
// Constructor: the pool starts in the "not stopped" state; workers are created later by setUp().
Yolov5ThreadPool::Yolov5ThreadPool() : stop(false) {}
// Destructor: ask every worker to stop and wait until all of them have exited.
Yolov5ThreadPool::~Yolov5ThreadPool()
{
    {
        // Set the flag while holding the task mutex. Otherwise a worker that has just
        // evaluated the wait predicate could miss the notification below and sleep
        // forever, which would make join() hang.
        std::lock_guard<std::mutex> lock(mtx1);
        stop = true;
    }
    cv_task.notify_all();

    for (auto &thread : threads)
    {
        if (thread.joinable())
        {
            thread.join();
        }
    }
}
// 初始化:加载模型,创建线程,参数:模型路径,线程数量
int Yolov5ThreadPool::setUp(const std::string &model_path, int num_threads)
{
// 遍历线程数量,创建模型实例,放入vector
// 这些线程加载的模型是同一个
for (size_t i = 0; i < num_threads; ++i)
{
std::shared_ptr<Yolov5> yolov5 = std::make_shared<Yolov5>();
yolov5->load_model(model_path.c_str());
yolov5_instances.push_back(yolov5);
}
// 遍历线程数量,创建线程
for (size_t i = 0; i < num_threads; ++i)
{
threads.emplace_back(&Yolov5ThreadPool::worker, this, i);
}
return 0;
}
// 线程函数。参数:线程id
void Yolov5ThreadPool::worker(int id)
{
while (!stop)
{
std::pair<int, cv::Mat> task;
std::shared_ptr<Yolov5> instance = yolov5_instances[id]; // 获取模型实例
{
// 获取任务
std::unique_lock<std::mutex> lock(mtx1);
cv_task.wait(lock, [&] { return !tasks.empty() || stop; });
if (stop)
return;
task = tasks.front();
tasks.pop();
}
// 运行模型
std::vector<Detection> detections;
instance->infer(task.second, detections);
{
// 保存结果
std::lock_guard<std::mutex> lock(mtx2);
results.insert({task.first, detections});
draw_detections(task.second, detections);
img_results.insert({task.first, task.second});
cv_result.notify_one();
}
}
}
// Submit a task. Parameters: image, id (frame number).
// Blocks (polling every millisecond) while more than 1000 tasks are pending,
// to bound memory usage. Always returns 0.
int Yolov5ThreadPool::submitTask(const cv::Mat &img, int id)
{
    while (true)
    {
        {
            // The queue is shared with the workers, so even reading its size needs the lock.
            std::lock_guard<std::mutex> lock(mtx1);
            if (tasks.size() <= 1000)
            {
                tasks.push({id, img});
                break;
            }
        }
        std::this_thread::sleep_for(std::chrono::milliseconds(1));
    }
    cv_task.notify_one();
    return 0;
}
// Fetch the detections of a task. Parameters: output detections, id (frame number).
// Waits (polling every millisecond, without a timeout) until the result exists,
// moves it into `objects`, removes it from the map and returns 0.
int Yolov5ThreadPool::getTargetResult(std::vector<Detection> &objects, int id)
{
    while (true)
    {
        {
            // Workers insert into `results` concurrently, so the lookup itself must be locked.
            std::lock_guard<std::mutex> lock(mtx2);
            const auto it = results.find(id);
            if (it != results.end())
            {
                objects = std::move(it->second);
                results.erase(it);
                return 0;
            }
        }
        std::this_thread::sleep_for(std::chrono::milliseconds(1));
    }
}
// Fetch the annotated image of a task. Parameters: output image, id (frame number).
// Polls every millisecond; gives up after more than 1000 waits.
// Returns 0 when the image was delivered (and removed from the map),
// -1 on timeout, in which case `img` is left untouched.
int Yolov5ThreadPool::getTargetImgResult(cv::Mat &img, int id)
{
    int loop_cnt = 0;
    while (true)
    {
        {
            // Workers insert into `img_results` concurrently, so the lookup itself must be locked.
            std::lock_guard<std::mutex> lock(mtx2);
            const auto it = img_results.find(id);
            if (it != img_results.end())
            {
                img = it->second;
                img_results.erase(it);
                return 0;
            }
        }

        std::this_thread::sleep_for(std::chrono::milliseconds(1));
        loop_cnt++;
        if (loop_cnt > 1000)
        {
            std::cerr << "getTargetImgResult timeout" << std::endl;
            return -1; // distinguishable from success so callers can detect the timeout
        }
    }
}
void Yolov5ThreadPool::stopAll()
{
stop = true;
cv_task.notify_all();
}
test_yolov5_thread_pool.cpp
cpp
#include "yolov5_thread_pool.h"
#include <chrono>
static int g_frame_start_id = 0; // id of the next frame to submit (advanced by read_stream)
static int g_frame_end_id = 0;   // id of the next result to collect (advanced by get_results)
static Yolov5ThreadPool *g_pool = nullptr;
// Set by the reader thread and polled by the result thread, hence atomic.
std::atomic<bool> end(false);
/// Reader thread: decode `video_file` frame by frame and submit a copy of
/// every frame to the pool under consecutive ids. Sets `end` when there are
/// no more frames, or when the video cannot be opened.
void read_stream(const std::string& video_file)
{
    cv::VideoCapture cap(video_file);
    if (!cap.isOpened())
    {
        std::cerr << "failed to open video: " << video_file << std::endl;
        // Without this the result thread would wait for frames forever.
        end = true;
        return;
    }

    cv::Mat img;
    while (true)
    {
        cap >> img;
        if (img.empty())
        {
            end = true;
            break;
        }
        // clone(): the capture reuses `img`, and the pool keeps the frame it is given.
        g_pool->submitTask(img.clone(), g_frame_start_id++);
    }
    cap.release();
}
void get_results()
{
auto start_all = std::chrono::high_resolution_clock::now();
int frame_count = 0;
//cv::VideoWriter writer = cv::VideoWriter("result.mp4", cv::VideoWriter::fourcc('m', 'p', '4', 'v'), 30, cv::Size(1280, 720));
while (true)
{
cv::Mat img;
auto ret = g_pool->getTargetImgResult(img, g_frame_end_id++);
if (end)
{
g_pool->stopAll();
break;
}
//cv::imwrite("output/" + std::to_string(g_frame_end_id) + ".jpg", img);
//writer << img;
frame_count++;
auto end_all = std::chrono::high_resolution_clock::now();
auto elapsed_all_2 = std::chrono::duration_cast<std::chrono::microseconds>(end_all - start_all).count() / 1000.f;
if (elapsed_all_2 >= 1000)
{
printf("FPS:%f \n", frame_count / (elapsed_all_2 / 1000.0f));
frame_count = 0;
start_all = std::chrono::high_resolution_clock::now();
}
}
g_pool->stopAll();
}
int main(int argc, char **argv)
{
g_pool = new Yolov5ThreadPool();
g_pool->setUp(argv[1], atoi(argv[2]));
std::thread read_stream_thread(read_stream, "bj_full.mp4");
std::thread result_thread(get_results);
read_stream_thread.join();
result_thread.join();
return 0;
}
CMakeLists.txt
cmake
cmake_minimum_required(VERSION 3.20)
project(trt_inference LANGUAGES C CXX CUDA)

set(CMAKE_CXX_STANDARD 14)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

# CUDA runtime. The legacy FindCUDA module is deprecated; FindCUDAToolkit provides
# the imported target CUDA::cudart, which carries the include directories as well.
find_package(CUDAToolkit REQUIRED)

# std::thread needs the platform thread library.
find_package(Threads REQUIRED)

# Third-party locations. The defaults match the original setup and can be
# overridden on the command line, e.g. -DOpenCV_DIR=... -DTENSORRT_INCLUDE_DIRS=...
set(OpenCV_DIR /home/tfy/document/HybrIK/cpp/opencv-4.12.0/lib/cmake/opencv4
    CACHE PATH "Directory containing the OpenCV CMake package")
find_package(OpenCV REQUIRED)

set(TENSORRT_INCLUDE_DIRS /home/tfy/docker_share/TensorRT-10.6.0.26/include
    CACHE PATH "TensorRT include directory")
set(TENSORRT_LIBRARY_DIRS /home/tfy/docker_share/TensorRT-10.6.0.26/targets/x86_64-linux-gnu/lib
    CACHE PATH "TensorRT library directory")

include_directories(${TENSORRT_INCLUDE_DIRS} ${OpenCV_INCLUDE_DIRS})
link_directories(${TENSORRT_LIBRARY_DIRS})

# Single-threaded demo.
add_executable(demo test_yolov5.cpp preprocess.cpp postprocess.cpp yolov5.cpp)
# Thread-pool demo.
add_executable(thread_pool test_yolov5_thread_pool.cpp preprocess.cpp postprocess.cpp yolov5.cpp yolov5_thread_pool.cpp)

target_link_libraries(demo PRIVATE ${OpenCV_LIBS} CUDA::cudart ${TENSORRT_LIBRARY_DIRS}/libnvinfer.so)
target_link_libraries(thread_pool PRIVATE ${OpenCV_LIBS} CUDA::cudart Threads::Threads ${TENSORRT_LIBRARY_DIRS}/libnvinfer.so)
运行 `./thread_pool yolov5n_int8.engine 16` 的性能实测(测试硬件:RTX 4090 + 24 核 CPU):

输出
bash
info: Loaded engine size: 4 MiB
info: [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +0, GPU +15, now: CPU 0, GPU 17 (MiB)
info: Loaded engine size: 4 MiB
info: [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +1, GPU +15, now: CPU 1, GPU 35 (MiB)
info: Loaded engine size: 4 MiB
info: [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +0, GPU +14, now: CPU 1, GPU 52 (MiB)
info: Loaded engine size: 4 MiB
info: [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +1, GPU +15, now: CPU 2, GPU 70 (MiB)
info: Loaded engine size: 4 MiB
info: [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +0, GPU +15, now: CPU 2, GPU 88 (MiB)
info: Loaded engine size: 4 MiB
info: [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +1, GPU +14, now: CPU 3, GPU 105 (MiB)
info: Loaded engine size: 4 MiB
info: [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +0, GPU +15, now: CPU 3, GPU 123 (MiB)
info: Loaded engine size: 4 MiB
info: [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +1, GPU +15, now: CPU 4, GPU 141 (MiB)
info: Loaded engine size: 4 MiB
info: [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +0, GPU +14, now: CPU 4, GPU 158 (MiB)
info: Loaded engine size: 4 MiB
info: [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +1, GPU +15, now: CPU 5, GPU 176 (MiB)
info: Loaded engine size: 4 MiB
info: [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +0, GPU +15, now: CPU 5, GPU 194 (MiB)
info: Loaded engine size: 4 MiB
info: [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +1, GPU +15, now: CPU 6, GPU 211 (MiB)
info: Loaded engine size: 4 MiB
info: [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +0, GPU +15, now: CPU 6, GPU 229 (MiB)
info: Loaded engine size: 4 MiB
info: [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +1, GPU +15, now: CPU 7, GPU 247 (MiB)
info: Loaded engine size: 4 MiB
info: [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +0, GPU +15, now: CPU 7, GPU 264 (MiB)
info: Loaded engine size: 4 MiB
info: [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +1, GPU +15, now: CPU 8, GPU 282 (MiB)
FPS:731.412842
FPS:713.332947
FPS:797.625244
FPS:850.723083
FPS:759.826111
FPS:790.603821
FPS:893.340698
FPS:876.184570
FPS:849.282715
FPS:823.478210
FPS:818.882874
FPS:839.987427
FPS:841.113464
FPS:846.878052
FPS:853.196289
FPS:881.448181
FPS:867.657776
FPS:909.925354
FPS:884.442505
FPS:975.532715
FPS:984.539246
FPS:1001.592773
FPS:951.560303
FPS:965.459229
FPS:963.278687
FPS:916.945496
FPS:954.125122
FPS:965.415710
FPS:1004.943909
FPS:919.196472
FPS:942.398926
FPS:911.690796
FPS:994.875244
FPS:895.988342
视频推理截图:

完整工程见:https://github.com/taifyang/yolov5-tensorrt-threadpool