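// ---- Listing: host-side driver (C++) ----
// (This marker replaces "cpp / copy code" web code-block residue that was not valid source.)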
#include <cuda_runtime.h>
#include <opencv2/opencv.hpp>
#include <stdio.h>
#include <chrono>
#include <fstream>
#include "box.hpp"
using namespace std;
using namespace cv;
// Wraps a CUDA runtime call so that a failure is reported together with the
// call's source text, file name and line number. Evaluates to the bool
// returned by __check_cuda_runtime (true on success, false on failure).
#define checkRuntime(op) __check_cuda_runtime((op), #op, __FILE__, __LINE__)

// Reports a failed CUDA runtime call on stdout.
//   code : status returned by the CUDA call
//   op   : stringified source text of the call
//   file : source file of the call site
//   line : source line of the call site
// Returns true when code is cudaSuccess, false otherwise. It only reports;
// it never aborts or throws.
bool __check_cuda_runtime(cudaError_t code, const char* op, const char* file, int line){
    if(code == cudaSuccess)
        return true;

    const char* name = cudaGetErrorName(code);
    const char* description = cudaGetErrorString(code);
    printf("runtime error %s:%d %s failed. \n code = %s, message = %s\n", file, line, op, name, description);
    return false;
}
// static std::vector<uint8_t> load_file(const string& file){
// ifstream in(file, ios::in | ios::binary);
// if (!in.is_open())
// return {};
// in.seekg(0, ios::end);
// size_t length = in.tellg();
// std::vector<uint8_t> data;
// if (length > 0){
// in.seekg(0, ios::beg);
// data.resize(length);
// in.read((char*)&data[0], length);
// }
// in.close();
// return data;
// }
// Loads an entire file into memory in binary mode.
//   file : path of the file to read
// Returns the file contents as bytes. Returns an empty vector when the file
// cannot be opened, its size cannot be determined, it is empty, or fewer
// bytes than expected could be read.
static std::vector<uint8_t> load_file(const std::string& file) {
    std::ifstream in(file, std::ios::in | std::ios::binary);
    if (!in.is_open())
        return {};

    // Seek to the end to learn the size. tellg() yields -1 on failure, which
    // must not be converted to size_t (it would request a gigantic buffer).
    in.seekg(0, std::ios::end);
    const std::streamoff end_pos = in.tellg();
    if (end_pos <= 0)
        return {};

    std::vector<uint8_t> data(static_cast<size_t>(end_pos));
    in.seekg(0, std::ios::beg);
    // istream::read takes char*, hence the cast from the byte buffer.
    in.read(reinterpret_cast<char*>(data.data()), static_cast<std::streamsize>(data.size()));

    // A short read would otherwise leave zero-filled bytes that look like valid data.
    if (in.gcount() != static_cast<std::streamsize>(data.size()))
        return {};

    return data;
}
// CPU reference decoder: turns raw detector rows into boxes and applies
// per-class non-maximum suppression (NMS).
//   predict              : rows * cols floats, row-major; each row is
//                          [cx, cy, w, h, objectness, class_0 ... class_{cols-6}]
//                          (the sample data is 25200 rows of 85 values)
//   rows, cols           : dimensions of predict
//   confidence_threshold : minimum objectness, and minimum objectness * best class score
//   nms_threshold        : a box is suppressed when its IoU with a kept box of
//                          the same label is >= this value
// Returns the surviving boxes ordered by descending confidence. Returns an
// empty vector for null or empty input, or when a row has no class scores.
vector<Box> cpu_decode(float* predict, int rows, int cols, float confidence_threshold = 0.25f, float nms_threshold = 0.45f){
    vector<Box> boxes;
    const int num_classes = cols - 5;
    // Without at least one class score, pclass[0] below would read past the row.
    if(predict == nullptr || rows <= 0 || num_classes <= 0)
        return boxes;

    for(int i = 0; i < rows; ++i){
        // size_t arithmetic so that large tensors cannot overflow the int product.
        const float* pitem = predict + (size_t)i * cols;
        const float objness = pitem[4];
        if(objness < confidence_threshold)
            continue;

        const float* pclass = pitem + 5;                                             // start of class scores
        const int label = (int)(std::max_element(pclass, pclass + num_classes) - pclass); // index of best class
        const float prob = pclass[label];
        const float confidence = prob * objness;                                     // final score
        if(confidence < confidence_threshold)
            continue;

        // Convert centre/size to corner coordinates.
        const float cx = pitem[0];
        const float cy = pitem[1];
        const float width = pitem[2];
        const float height = pitem[3];
        const float left = cx - width * 0.5f;
        const float top = cy - height * 0.5f;
        const float right = cx + width * 0.5f;
        const float bottom = cy + height * 0.5f;
        boxes.emplace_back(left, top, right, bottom, confidence, (float)label);
    }

    // Highest confidence first, so every kept box only suppresses weaker ones.
    std::sort(boxes.begin(), boxes.end(), [](const Box& a, const Box& b){return a.confidence > b.confidence;});

    std::vector<bool> remove_flags(boxes.size());
    std::vector<Box> box_result;
    box_result.reserve(boxes.size());

    // Intersection-over-union of two boxes; 0 when they do not overlap.
    auto iou = [](const Box& a, const Box& b) -> float {
        float cross_left = std::max(a.left, b.left);
        float cross_top = std::max(a.top, b.top);
        float cross_right = std::min(a.right, b.right);
        float cross_bottom = std::min(a.bottom, b.bottom);
        float cross_area = std::max(0.0f, cross_right - cross_left) * std::max(0.0f, cross_bottom - cross_top);
        float union_area = std::max(0.0f, a.right - a.left) * std::max(0.0f, a.bottom - a.top)
                         + std::max(0.0f, b.right - b.left) * std::max(0.0f, b.bottom - b.top) - cross_area;
        if(cross_area == 0 || union_area == 0) return 0.0f;
        return cross_area / union_area;
    };

    for(size_t i = 0; i < boxes.size(); ++i){
        if(remove_flags[i]) continue;

        const Box& ibox = boxes[i];
        box_result.emplace_back(ibox);
        for(size_t j = i + 1; j < boxes.size(); ++j){
            if(remove_flags[j]) continue;

            const Box& jbox = boxes[j];
            // Only boxes of the same class compete with each other.
            if(ibox.label == jbox.label && iou(ibox, jbox) >= nms_threshold)
                remove_flags[j] = true;
        }
    }
    return box_result;
}
// Forward declaration of the GPU decode + NMS launcher; the definition sits
// with the kernels further down in this listing.
// gpu_decode passes device pointers for predict and parray, and nullptr for
// invert_affine_matrix. parray receives a leading counter followed by
// NUM_BOX_ELEMENT floats per detected box.
void decode_kernel_invoker(
float* predict, int num_bboxes, int num_classes, float confidence_threshold,
float nms_threshold, float* invert_affine_matrix, float* parray, int max_objects, int NUM_BOX_ELEMENT, cudaStream_t stream);
// GPU decoder: uploads the raw predictions, runs the decode and NMS kernels
// through decode_kernel_invoker, and collects the boxes whose keep flag is set.
//   predict              : host pointer to rows * cols floats, row-major; each row is
//                          [cx, cy, w, h, objectness, class scores...]
//   rows, cols           : dimensions of predict
//   confidence_threshold : forwarded to the decode kernel
//   nms_threshold        : forwarded to the NMS kernel
// Returns the kept boxes. Returns an empty vector for null or empty input, or
// when any CUDA call fails (the failure is printed by checkRuntime).
vector<Box> gpu_decode(float* predict, int rows, int cols, float confidence_threshold = 0.25f, float nms_threshold = 0.45f){
    vector<Box> box_result;
    // Nothing to decode. This also keeps zero-sized launch configurations away from the GPU path.
    if(predict == nullptr || rows <= 0 || cols <= 5)
        return box_result;

    cudaStream_t stream = nullptr;
    float* predict_device = nullptr;
    float* output_device = nullptr;
    float* output_host = nullptr;
    const int max_objects = 1000;
    const int NUM_BOX_ELEMENT = 7; // left, top, right, bottom, confidence, class, keepflag

    // Sizes are computed in size_t so that large tensors cannot overflow int.
    const size_t predict_bytes = (size_t)rows * cols * sizeof(float);
    // Output layout: one float holding the box counter, then max_objects box records.
    const size_t output_bytes = sizeof(float) + (size_t)max_objects * NUM_BOX_ELEMENT * sizeof(float);

    // Each step runs only if all previous ones succeeded.
    bool ok = checkRuntime(cudaStreamCreate(&stream));
    ok = ok && checkRuntime(cudaMalloc(&predict_device, predict_bytes));    // input, on the GPU
    ok = ok && checkRuntime(cudaMalloc(&output_device, output_bytes));      // output, on the GPU
    ok = ok && checkRuntime(cudaMallocHost(&output_host, output_bytes));    // output, pinned host memory
    ok = ok && checkRuntime(cudaMemcpyAsync(predict_device, predict, predict_bytes, cudaMemcpyHostToDevice, stream));
    // The decode kernel increments the counter in output_device[0] with atomicAdd.
    // cudaMalloc does not zero memory, so the counter must be reset first.
    ok = ok && checkRuntime(cudaMemsetAsync(output_device, 0, sizeof(float), stream));

    if(ok){
        std::cout<<"rows "<<rows<<" cols - 5 " <<cols - 5<<std::endl;
        decode_kernel_invoker(
            predict_device, rows, cols - 5, confidence_threshold,
            nms_threshold, nullptr, output_device, max_objects, NUM_BOX_ELEMENT, stream
        );
        ok = checkRuntime(cudaMemcpyAsync(output_host, output_device, output_bytes, cudaMemcpyDeviceToHost, stream));
        // The host may read output_host only after the stream has drained.
        ok = ok && checkRuntime(cudaStreamSynchronize(stream));
    }

    if(ok){
        // The counter keeps growing past max_objects on the device; clamp it to what was stored.
        const int num_boxes = std::min((int)output_host[0], max_objects);
        for(int i = 0; i < num_boxes; ++i){
            const float* ptr = output_host + 1 + NUM_BOX_ELEMENT * i;
            const int keep_flag = ptr[6];
            if(keep_flag){
                box_result.emplace_back(
                    ptr[0], ptr[1], ptr[2], ptr[3], ptr[4], (int)ptr[5]
                );
            }
        }
    }

    // Release whatever was acquired, including on the failure paths.
    if(stream != nullptr)
        checkRuntime(cudaStreamDestroy(stream));
    if(predict_device != nullptr)
        checkRuntime(cudaFree(predict_device));
    if(output_device != nullptr)
        checkRuntime(cudaFree(output_device));
    if(output_host != nullptr)
        checkRuntime(cudaFreeHost(output_host));
    return box_result;
}
// Demo entry point: decodes the raw predictions stored in "predict.data",
// draws the resulting boxes on "input-image.jpg" and writes "image-draw.jpg".
// Returns 0 on success, 1 when an input file is missing or unusable.
int main(){
    auto data = load_file("predict.data");
    auto image = cv::imread("input-image.jpg");
    // imread yields an empty Mat on failure; drawing on it would fail later.
    if(image.empty()){
        fprintf(stderr, "failed to load image input-image.jpg\n");
        return 1;
    }

    float* ptr = reinterpret_cast<float*>(data.data());
    int nelem = data.size() / sizeof(float);
    int ncols = 85;
    int nrows = nelem / ncols;
    // An absent or truncated predict.data leaves no complete row to decode.
    if(nrows <= 0){
        fprintf(stderr, "predict.data is missing or holds less than one row of %d floats\n", ncols);
        return 1;
    }
    std::cout<<"==行数=="<<nrows<<" ==列数=="<<ncols<<std::endl; // sample data: 25200 rows, 85 columns

    // auto boxes = cpu_decode(ptr, nrows, ncols);
    auto boxes = gpu_decode(ptr, nrows, ncols);
    for(auto& box : boxes){
        cv::rectangle(image, cv::Point(box.left, box.top), cv::Point(box.right, box.bottom), cv::Scalar(0, 255, 0), 2);
        cv::putText(image, cv::format("%.2f", box.confidence), cv::Point(box.left, box.top - 7), 0, 0.8, cv::Scalar(0, 0, 255), 2, 16);
    }
    cv::imwrite("image-draw.jpg", image);
    return 0;
}
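// ---- Listing: CUDA kernels and their launcher ----
// (This marker replaces "cpp / copy code" web code-block residue that was not valid source.)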
#include <cuda_runtime.h>
#include <iostream>
// Applies a 2x3 affine transform, stored row-major in matrix[0..5], to the
// point (x, y) and writes the transformed point to *ox and *oy.
static __device__ void affine_project(float* matrix, float x, float y, float* ox, float* oy){
    const float* row_x = matrix;      // coefficients producing the output x
    const float* row_y = matrix + 3;  // coefficients producing the output y
    *ox = row_x[0] * x + row_x[1] * y + row_x[2];
    *oy = row_y[0] * x + row_y[1] * y + row_y[2];
}
// Decode kernel: one thread per candidate row (25200 threads for the sample data).
//   predict              : num_bboxes rows of (5 + num_classes) floats:
//                          [cx, cy, w, h, objectness, class scores...]
//   confidence_threshold : applied to objectness, then to objectness * best class score
//   invert_affine_matrix : not used; the affine_project calls below are commented out
//   parray               : parray[0] is a counter bumped with atomicAdd, followed by
//                          box records of NUM_BOX_ELEMENT floats:
//                          left, top, right, bottom, confidence, class, keepflag
//   max_objects          : boxes whose slot index reaches this value are dropped
static __global__ void decode_kernel(
    float* predict, int num_bboxes, int num_classes, float confidence_threshold,
    float* invert_affine_matrix, float* parray, int max_objects, int NUM_BOX_ELEMENT
){
    int box_id = blockDim.x * blockIdx.x + threadIdx.x;
    if (box_id >= num_bboxes) return;

    // Start of this thread's row.
    float* row = predict + (5 + num_classes) * box_id;
    float objectness = row[4];
    if(objectness < confidence_threshold)
        return;

    // Arg-max over the class scores; on ties the first maximum wins.
    float* scores = row + 5;
    int label = 0;
    float confidence = scores[0];
    for(int c = 1; c < num_classes; ++c){
        if(scores[c] > confidence){
            confidence = scores[c];
            label = c;
        }
    }

    confidence *= objectness;
    if(confidence < confidence_threshold)
        return;

    // Reserve an output slot. The counter keeps counting past max_objects.
    int slot = atomicAdd(parray, 1);
    if(slot >= max_objects)
        return;

    // Centre/size to corner coordinates.
    float cx = row[0];
    float cy = row[1];
    float width = row[2];
    float height = row[3];
    float left = cx - width * 0.5f;
    float top = cy - height * 0.5f;
    float right = cx + width * 0.5f;
    float bottom = cy + height * 0.5f;
    // affine_project(invert_affine_matrix, left, top, &left, &top);
    // affine_project(invert_affine_matrix, right, bottom, &right, &bottom);

    float* record = parray + 1 + slot * NUM_BOX_ELEMENT;
    record[0] = left;
    record[1] = top;
    record[2] = right;
    record[3] = bottom;
    record[4] = confidence;
    record[5] = label;
    record[6] = 1; // 1 = keep, 0 = ignore
}
// Intersection-over-union of box a and box b, each given as left, top,
// right, bottom. Returns 0 when the intersection area is zero.
static __device__ float box_iou(
    float aleft, float atop, float aright, float abottom,
    float bleft, float btop, float bright, float bbottom
){
    // Corners of the intersection rectangle.
    float ix0 = max(aleft, bleft);
    float iy0 = max(atop, btop);
    float ix1 = min(aright, bright);
    float iy1 = min(abottom, bbottom);

    float inter = max(ix1 - ix0, 0.0f) * max(iy1 - iy0, 0.0f);
    if(inter != 0.0f){
        float area_a = max(0.0f, aright - aleft) * max(0.0f, abottom - atop);
        float area_b = max(0.0f, bright - bleft) * max(0.0f, bbottom - btop);
        return inter / (area_a + area_b - inter);
    }
    return 0.0f;
}
// NMS kernel: one thread per stored box.
//   bboxes      : bboxes[0] is the box counter, followed by records of
//                 NUM_BOX_ELEMENT floats:
//                 left, top, right, bottom, confidence, class, keepflag
//   max_objects : upper bound applied to the counter
//   threshold   : a box is dropped when its IoU with a stronger box of the
//                 same class is greater than this value
// Each thread clears only its own keepflag. It clears it when some other box
// of the same class has a higher confidence, or an equal confidence and a
// larger index, and overlaps it by more than threshold.
static __global__ void fast_nms_kernel(float* bboxes, int max_objects, float threshold, int NUM_BOX_ELEMENT){
    int self_id = (blockDim.x * blockIdx.x + threadIdx.x);
    int count = min((int)*bboxes, max_objects);
    if (self_id >= count)
        return;

    float* self = bboxes + 1 + self_id * NUM_BOX_ELEMENT; // this thread's record
    for(int other_id = 0; other_id < count; ++other_id){
        if(other_id == self_id)
            continue;

        float* other = bboxes + 1 + other_id * NUM_BOX_ELEMENT;
        // Boxes of different classes never suppress each other.
        if(self[5] != other[5])
            continue;

        // Only a box that is at least as confident can suppress this one.
        if(!(other[4] >= self[4]))
            continue;

        // On equal confidence, lower-indexed boxes do not suppress this one.
        if(other[4] == self[4] && other_id < self_id)
            continue;

        float overlap = box_iou(
            self[0], self[1], self[2], self[3],
            other[0], other[1], other[2], other[3]
        );
        if(overlap > threshold){
            self[6] = 0; // 1=keep, 0=ignore
            return;
        }
    }
}
// Launches decode_kernel and then fast_nms_kernel on `stream`.
//   predict              : device pointer to num_bboxes rows of (5 + num_classes) floats
//   num_bboxes           : number of candidate rows (25200 for the sample data)
//   num_classes          : number of class scores per row (80 for the sample data)
//   confidence_threshold : forwarded to decode_kernel
//   nms_threshold        : forwarded to fast_nms_kernel
//   invert_affine_matrix : forwarded to decode_kernel, which does not use it
//   parray               : device output: a leading counter, then max_objects
//                          records of NUM_BOX_ELEMENT floats. The counter is
//                          incremented by decode_kernel and is not reset here,
//                          so the caller must zero it before calling.
//   stream               : stream both kernels are enqueued on; no synchronisation is done here
// Does nothing when num_bboxes or max_objects is not positive. A launch
// failure is reported on stderr and the remaining work is skipped.
void decode_kernel_invoker(
    float* predict, int num_bboxes, int num_classes, float confidence_threshold,
    float nms_threshold, float* invert_affine_matrix, float* parray, int max_objects, int NUM_BOX_ELEMENT, cudaStream_t stream){
    // A zero block size would divide by zero below and is an invalid launch configuration.
    if(num_bboxes <= 0 || max_objects <= 0)
        return;

    // Kernel launches return no status; launch errors are fetched with cudaGetLastError().
    auto launch_succeeded = [](const char* kernel_name){
        cudaError_t status = cudaGetLastError();
        if(status == cudaSuccess)
            return true;
        std::cerr<<kernel_name<<" launch failed: "<<cudaGetErrorString(status)<<std::endl;
        return false;
    };

    const int max_threads_per_block = 512;

    // One thread per candidate row.
    auto block = num_bboxes > max_threads_per_block ? max_threads_per_block : num_bboxes;
    auto grid = (num_bboxes + block - 1) / block;
    std::cout<<"block "<<block<<" grid "<<grid<<std::endl;
    /* Editors may underline the <<< >>> launch syntax; that is expected, it is valid CUDA. */
    decode_kernel<<<grid, block, 0, stream>>>(
        predict, num_bboxes, num_classes, confidence_threshold,
        invert_affine_matrix, parray, max_objects, NUM_BOX_ELEMENT
    );
    if(!launch_succeeded("decode_kernel"))
        return;

    // The number of decoded boxes is only known on the device, so the NMS
    // grid covers max_objects threads (for max_objects = 1000: grid 2, block 512).
    block = max_objects > max_threads_per_block ? max_threads_per_block : max_objects;
    grid = (max_objects + block - 1) / block;
    std::cout<<"block "<<block<<" grid "<<grid<<std::endl;
    fast_nms_kernel<<<grid, block, 0, stream>>>(parray, max_objects, nms_threshold, NUM_BOX_ELEMENT);
    launch_succeeded("fast_nms_kernel");
}