cuda小白
原始API链接 NPP
GPU架构近些年也有不少的变化,具体的可以参考别的博主的介绍,都比较详细。还有一些cuda中的专有名词的含义,可以参考《详解CUDA的Context、Stream、Warp、SM、SP、Kernel、Block、Grid》
常见的NppStatus,可以看这里。
Thresholding Operations
分通道、逐像素地将像素值与阈值进行比较:如果指定的比较操作(Operation)成立,则用阈值(或指定的值)替换该像素,否则保持原值不变。该模块的接口分为两大类:一类直接在原地址上进行操作(in-place),另一类则将结果写到指定的输出地址。
cpp
/*
Comparison operators used by the NPP primitives (listed here for reference only):
enum NppCmpOp {
NPP_CMP_LESS,
NPP_CMP_LESS_EQ,
NPP_CMP_EQ,
NPP_CMP_GREATER_EQ,
NPP_CMP_GREATER
};
*/
// Generic form (3-channel, 8-bit unsigned, separate source and destination):
// for each channel, if (sourcePixel OP rThresholds[c]) holds, the destination
// pixel is set to rThresholds[c]; otherwise the source value is copied.
// nSrcStep / nDstStep are the row strides in bytes; oSizeROI is the processed
// region in pixels.
// NOTE(review): the NPP documentation appears to accept only NPP_CMP_LESS and
// NPP_CMP_GREATER for the threshold primitives - confirm before passing other operators.
NppStatus nppiThreshold_8u_C3R(const Npp8u *pSrc,
int nSrcStep,
Npp8u *pDst,
int nDstStep,
NppiSize oSizeROI,
const Npp8u rThresholds[3],
NppCmpOp eComparisonOperation);
// Greater-than variant: same as the generic form with NPP_CMP_GREATER
// (strictly greater, not greater-or-equal).
NppStatus nppiThreshold_GT_8u_C3R(const Npp8u *pSrc,
int nSrcStep,
Npp8u *pDst,
int nDstStep,
NppiSize oSizeROI,
const Npp8u rThresholds[3]);
// Less-than variant: same as the generic form with NPP_CMP_LESS
// (strictly less, not less-or-equal).
NppStatus nppiThreshold_LT_8u_C3R(const Npp8u *pSrc,
int nSrcStep,
Npp8u *pDst,
int nDstStep,
NppiSize oSizeROI,
const Npp8u rThresholds[3]);
// "Val" variants: when the comparison holds, the pixel is replaced by the
// caller-supplied rValues[c] instead of the threshold itself.
NppStatus nppiThreshold_Val_8u_C3R(const Npp8u *pSrc,
int nSrcStep,
Npp8u *pDst,
int nDstStep,
NppiSize oSizeROI,
const Npp8u rThresholds[3],
const Npp8u rValues[3],
NppCmpOp eComparisonOperation);
// Greater-than comparison with an explicit replacement value.
NppStatus nppiThreshold_GTVal_8u_C3R(const Npp8u * pSrc,
int nSrcStep,
Npp8u *pDst,
int nDstStep,
NppiSize oSizeROI,
const Npp8u rThresholds[3],
const Npp8u rValues[3]);
// Less-than comparison with an explicit replacement value.
NppStatus nppiThreshold_LTVal_8u_C3R(const Npp8u *pSrc,
int nSrcStep,
Npp8u *pDst,
int nDstStep,
NppiSize oSizeROI,
const Npp8u rThresholds[3],
const Npp8u rValues[3]);
// Lower and upper bound in one call: pixels below rThresholdsLT[c] become
// rValuesLT[c], pixels above rThresholdsGT[c] become rValuesGT[c], and all
// other pixels keep their source value.
NppStatus nppiThreshold_LTValGTVal_8u_C3R(const Npp8u *pSrc,
int nSrcStep,
Npp8u *pDst,
int nDstStep,
NppiSize oSizeROI,
const Npp8u rThresholdsLT[3],
const Npp8u rValuesLT[3],
const Npp8u rThresholdsGT[3],
const Npp8u rValuesGT[3]);
两边各选用一个接口作为示例
code
cpp
#include <iostream>
#include <cuda_runtime.h>
#include <npp.h>
#include <opencv2/opencv.hpp>
#define CUDA_FREE(ptr) { if (ptr != nullptr) { cudaFree(ptr); ptr = nullptr; } }
int main() {
std::string directory = "../";
cv::Mat image_dog = cv::imread(directory + "dog.png");
int image_width = image_dog.cols;
int image_height = image_dog.rows;
int image_size = image_width * image_height;
// =============== device memory ===============
// input
uint8_t *in_image;
cudaMalloc((void**)&in_image, image_size * 3 * sizeof(uint8_t));
cudaMemcpy(in_image, image_dog.data, image_size * 3 * sizeof(uint8_t), cudaMemcpyHostToDevice);
// output
uint8_t *out_ptr1, *out_ptr2;
cudaMalloc((void**)&out_ptr1, image_size * 3 * sizeof(uint8_t)); // 三通道
cudaMalloc((void**)&out_ptr2, image_size * 3 * sizeof(uint8_t)); // 三通道
NppiSize in_size;
in_size.width = image_width;
in_size.height = image_height;
uint8_t threshold[3] = {150, 150, 150};
cv::Mat out_image = cv::Mat::zeros(image_height, image_width, CV_8UC3);
// =============== nppiThreshold_GT_8u_C3R ===============
NppStatus status;
status = nppiThreshold_GT_8u_C3R(in_image, image_width * 3, out_ptr1, image_width * 3,
in_size, threshold);
if (status != NPP_SUCCESS) {
std::cout << "[GPU] ERROR nppiThreshold_GT_8u_C3R failed, status = " << status << std::endl;
return false;
}
cudaMemcpy(out_image.data, out_ptr1, image_size * 3, cudaMemcpyDeviceToHost);
cv::imwrite(directory + "threshold_gt.jpg", out_image);
// =============== nppiThreshold_GTVal_8u_C3R ===============
uint8_t value[3] = {255, 255, 255};
status = nppiThreshold_GTVal_8u_C3R(in_image, image_width * 3, out_ptr2, image_width * 3,
in_size, threshold, value);
if (status != NPP_SUCCESS) {
std::cout << "[GPU] ERROR nppiThreshold_GTVal_8u_C3R failed, status = " << status << std::endl;
return false;
}
cudaMemcpy(out_image.data, out_ptr2, image_size * 3, cudaMemcpyDeviceToHost);
cv::imwrite(directory + "threshold_gt_value.jpg", out_image);
// free
CUDA_FREE(in_image)
CUDA_FREE(out_ptr1)
CUDA_FREE(out_ptr2)
}
make
cmake
cmake_minimum_required(VERSION 3.20)
project(test)

# OpenCV provides image I/O (cv::imread / cv::imwrite) for the demo.
find_package(OpenCV REQUIRED)

# FindCUDAToolkit (available since CMake 3.17) replaces the deprecated FindCUDA
# module. Its imported targets carry include paths and library locations, so
# there is no need to hard-code /usr/local/cuda or to glob every *.so in it.
find_package(CUDAToolkit REQUIRED)

add_executable(test test.cpp)
target_include_directories(test PRIVATE ${OpenCV_INCLUDE_DIRS})
target_link_libraries(test PRIVATE
    ${OpenCV_LIBS}
    CUDA::cudart   # cudaMalloc / cudaMemcpy / cudaFree
    CUDA::nppc     # NPP core library
    CUDA::nppitc   # NPP image threshold-and-compare primitives
)
result
Comparison Operations
本文到此就只阐述比较简单的两个接口,其他的结果按需索取
cpp
// Compares two 3-channel 8-bit images pixel by pixel using
// eComparisonOperation and writes the outcome to a single-channel 8-bit
// image. nSrc1Step / nSrc2Step / nDstStep are row strides in bytes; oSizeROI
// is the processed region in pixels.
// NOTE(review): per the NPP documentation the destination pixel is
// NPP_MAX_8U when the comparison holds (for all channels) and 0 otherwise -
// confirm against the installed NPP version.
NppStatus nppiCompare_8u_C3R(const Npp8u *pSrc1,
int nSrc1Step,
const Npp8u *pSrc2,
int nSrc2Step,
Npp8u *pDst,
int nDstStep,
NppiSize oSizeROI,
NppCmpOp eComparisonOperation);
// Same as above, but every pixel of pSrc is compared against the constants
// supplied in pConstants (one value per channel, as used by the demo code)
// instead of a second image.
NppStatus nppiCompareC_8u_C3R(const Npp8u *pSrc,
int nSrcStep,
const Npp8u *pConstants,
Npp8u * pDst,
int nDstStep,
NppiSize oSizeROI,
NppCmpOp eComparisonOperation);
目的是比较两张图,或者将一张图与一个常量(constant)进行比较,并生成一张二值结果图像。结果图像的类型是8UC1:如果比较条件成立,则该像素被设置为uint8_t的最大值(255),否则设置为0;对于多通道图像,只有所有通道都满足比较条件时才算成立。
code
cpp
#include <iostream>
#include <cuda_runtime.h>
#include <npp.h>
#include <opencv2/opencv.hpp>
#define CUDA_FREE(ptr) { if (ptr != nullptr) { cudaFree(ptr); ptr = nullptr; } }
int main() {
std::string directory = "../";
cv::Mat image_dog = cv::imread(directory + "dog.png");
int image_width = image_dog.cols;
int image_height = image_dog.rows;
int image_size = image_width * image_height;
// =============== device memory ===============
// input
uint8_t *in_image1, *in_image2;
cudaMalloc((void**)&in_image1, image_size * 3 * sizeof(uint8_t));
cudaMalloc((void**)&in_image2, image_size * 3 * sizeof(uint8_t));
cudaMemcpy(in_image1, image_dog.data, image_size * 3 * sizeof(uint8_t), cudaMemcpyHostToDevice);
cv::Mat mask = cv::Mat::zeros(image_height, image_width, CV_8UC3);
int step = 4;
int step_width = image_width / step;
cv::Mat ones = cv::Mat::ones(image_height, step_width, CV_8UC3);
for (int i = 1; i < step; ++i) {
cv::Rect rc1 = cv::Rect(i * step_width, 0, step_width, image_height);
mask(rc1) = ones.clone() * 50 * i;
}
cudaMemcpy(in_image2, mask.data, image_size * 3 * sizeof(uint8_t), cudaMemcpyHostToDevice);
// output
uint8_t *out_ptr1, *out_ptr2;
cudaMalloc((void**)&out_ptr1, image_size * sizeof(uint8_t)); // 三通道
cudaMalloc((void**)&out_ptr2, image_size * sizeof(uint8_t)); // 三通道
NppiSize in_size;
in_size.width = image_width;
in_size.height = image_height;
cv::Mat out_image = cv::Mat::zeros(image_height, image_width, CV_8UC1);
NppStatus status;
// =============== nppiCompare_8u_C3R ===============
status = nppiCompare_8u_C3R(in_image1, image_width * 3, in_image2, image_width * 3,
out_ptr1, image_width, in_size, NPP_CMP_GREATER);
if (status != NPP_SUCCESS) {
std::cout << "[GPU] ERROR nppiCompare_8u_C3R failed, status = " << status << std::endl;
return false;
}
cudaMemcpy(out_image.data, out_ptr1, image_size, cudaMemcpyDeviceToHost);
cv::imwrite(directory + "compare.jpg", out_image);
// =============== nppiCompareC_8u_C3R ===============
uint8_t constant[3] = {100, 100, 100};
status = nppiCompareC_8u_C3R(in_image1, image_width * 3, constant, out_ptr2, image_width,
in_size, NPP_CMP_GREATER);
if (status != NPP_SUCCESS) {
std::cout << "[GPU] ERROR nppiCompareC_8u_C3R failed, status = " << status << std::endl;
return false;
}
cudaMemcpy(out_image.data, out_ptr2, image_size, cudaMemcpyDeviceToHost);
cv::imwrite(directory + "comparec.jpg", out_image);
// free
CUDA_FREE(in_image1)
CUDA_FREE(in_image2)
CUDA_FREE(out_ptr1)
CUDA_FREE(out_ptr2)
}
make
cmake
cmake_minimum_required(VERSION 3.20)
project(test)

# OpenCV provides image I/O (cv::imread / cv::imwrite) for the demo.
find_package(OpenCV REQUIRED)

# FindCUDAToolkit (available since CMake 3.17) replaces the deprecated FindCUDA
# module. Its imported targets carry include paths and library locations, so
# there is no need to hard-code /usr/local/cuda or to glob every *.so in it.
find_package(CUDAToolkit REQUIRED)

add_executable(test test.cpp)
target_include_directories(test PRIVATE ${OpenCV_INCLUDE_DIRS})
target_link_libraries(test PRIVATE
    ${OpenCV_LIBS}
    CUDA::cudart   # cudaMalloc / cudaMemcpy / cudaFree
    CUDA::nppc     # NPP core library
    CUDA::nppitc   # NPP image threshold-and-compare primitives
)

![请添加图片描述](https://img-blog.csdnimg.cn/81402e58c241462fa7d22d7783b5d176.png)