CUDA小白 - NPP(4) 图像处理 Data Exchange and Initialization(1)

cuda小白

原始API链接 NPP

GPU架构近些年也有不少的变化,具体的可以参考别的博主的介绍,都比较详细。还有一些cuda中的专有名词的含义,可以参考《详解CUDA的Context、Stream、Warp、SM、SP、Kernel、Block、Grid》

常见的NppStatus,可以看这里

如有问题,请指出,谢谢

Image Set Operations

当前模块的主要功能是设置(set)图像中的像素值,主要分为三个大类:将ROI区域内的所有像素设置为一个指定的值(Set),按mask赋值(Masked Set),以及单通道赋值(Channel Set)。

三个大类分别以一个三通道的uint8_t为例子简单介绍一下。

cpp 复制代码
// Set all three channels of every pixel inside the ROI to aValue
NppStatus nppiSet_8u_C3R(const Npp8u aValue[3],
					 	 Npp8u *pDst,
						 int nDstStep,
						 NppiSize oSizeROI);
// Use the mask to control which pixels inside the ROI get set
NppStatus nppiSet_8u_C3MR(const Npp8u aValue[3],
						  Npp8u *pDst,
						  int nDstStep,
						  NppiSize oSizeROI,
						  const Npp8u *pMask,
						  int nMaskStep);	
// Set one channel to a fixed value; the channel is selected by where pDst points
NppStatus nppiSet_8u_C3CR(Npp8u nValue,
						  Npp8u *pDst,
						  int nDstStep,
						  NppiSize oSizeROI);
code
cpp 复制代码
#include <iostream>
#include <cuda_runtime.h>
#include <npp.h>
#include <opencv2/opencv.hpp>

#define CUDA_FREE(ptr) { if (ptr != nullptr) { cudaFree(ptr); ptr = nullptr; } }

int main() {
  std::string directory = "../";

  // =============== load image ===============
  cv::Mat image_dog = cv::imread(directory + "dog.png");
  int image_width = image_dog.cols;
  int image_height = image_dog.rows;
  int image_size = image_width * image_height;

  // =============== device memory ===============
  uint8_t *out_ptr1, *out_ptr2, *out_ptr3;
  cudaMalloc((void**)&out_ptr1, image_size * 3 * sizeof(uint8_t));
  cudaMalloc((void**)&out_ptr2, image_size * 3 * sizeof(uint8_t));
  cudaMalloc((void**)&out_ptr3, image_size * 3 * sizeof(uint8_t));
  cudaMemcpy(out_ptr1, image_dog.data, image_size * 3 * sizeof(uint8_t), cudaMemcpyHostToDevice);
  cudaMemcpy(out_ptr2, image_dog.data, image_size * 3 * sizeof(uint8_t), cudaMemcpyHostToDevice);
  cudaMemcpy(out_ptr3, image_dog.data, image_size * 3 * sizeof(uint8_t), cudaMemcpyHostToDevice);

  cv::Mat mask = cv::Mat::zeros(image_height, image_width, CV_8UC1);
  cv::Mat mask1 = cv::Mat::ones(image_height * 3 / 4, image_width * 3 / 4, CV_8UC1);
  cv::Rect rc1 = cv::Rect(image_width / 4, image_height / 4, image_width * 3 / 4, image_height * 3 / 4);
  mask1.copyTo(mask(rc1));

  uint8_t *gpu_mask;
  cudaMalloc((void**)&gpu_mask, image_size * sizeof(uint8_t));
  cudaMemcpy(gpu_mask, mask.data, image_size * sizeof(uint8_t), cudaMemcpyHostToDevice);

  NppiSize roi1, roi2;
  roi1.width = image_width;
  roi1.height = image_height;
  roi2.width = image_width / 2;
  roi2.height = image_height / 2;

  cv::Mat out_image = cv::Mat::zeros(image_height, image_width, CV_8UC3);
  NppStatus status;
  // =============== nppiSet_8u_C3R ===============
  uint8_t value[3] = { 255, 0, 0 };
  status = nppiSet_8u_C3R(value, out_ptr1, image_width * 3, roi1);
  if (status != NPP_SUCCESS) {
    std::cout << "[GPU] ERROR nppiSet_8u_C3R failed, status = " << status << std::endl;
    return false;
  }
  cudaMemcpy(out_image.data, out_ptr1, image_size * 3, cudaMemcpyDeviceToHost);
  cv::imwrite(directory + "set.jpg", out_image);

  // =============== nppiSet_8u_C3R ===============
  uint8_t value2[3] = { 0, 0, 255 };
  status = nppiSet_8u_C3MR(value2, out_ptr2, image_width * 3, roi1, gpu_mask, image_width);
  if (status != NPP_SUCCESS) {
    std::cout << "[GPU] ERROR nppiSet_8u_C3MR failed, status = " << status << std::endl;
    return false;
  }
  cudaMemcpy(out_image.data, out_ptr2, image_size * 3, cudaMemcpyD![请添加图片描述](https://img-blog.csdnimg.cn/9da721ce7d4649839ef40228bb3937e1.png)
eviceToHost);
  cv::imwrite(directory + "set_mask.jpg", out_image);
  
  // green
  status = nppiSet_8u_C3CR(255, out_ptr3 + image_width * 3 * 200 + 1, image_width * 3, roi1);
  if (status != NPP_SUCCESS) {
    std::cout << "[GPU] ERROR nppiSet_8u_C3CR failed, status = " << status << std::endl;
    return false;
  }
  cudaMemcpy(out_image.data, out_ptr3, image_size * 3, cudaMemcpyDeviceToHost);
  cv::imwrite(directory + "set_channel.jpg", out_image);


  // free
  CUDA_FREE(out_ptr1)
  CUDA_FREE(out_ptr2)
  CUDA_FREE(out_ptr3)
}
make
cpp 复制代码
cmake_minimum_required(VERSION 3.20)
project(test)

find_package(OpenCV REQUIRED)
# FindCUDAToolkit replaces the deprecated FindCUDA module and provides imported
# targets, so there is no need to glob every .so under a hard-coded CUDA path.
find_package(CUDAToolkit REQUIRED)

add_executable(test test.cpp)
target_compile_features(test PRIVATE cxx_std_11)
target_include_directories(test PRIVATE ${OpenCV_INCLUDE_DIRS})
# cudart: CUDA runtime; nppc: NPP core; nppidei: NPP image data exchange and
# initialization (nppiSet_* / nppiCopy_*).
target_link_libraries(test PRIVATE
                      ${OpenCV_LIBS}
                      CUDA::cudart
                      CUDA::nppc
                      CUDA::nppidei
)
result

注意:

  1. mask使用的是单通道的,仅表示哪些像素需要进行set,哪些不需要
  2. 通道的set,通过指针来表示是针对哪个通道进行设置。输入指针表示开始set的起始位置,对于三通道的图像而言,每个像素只set指针所指向的那一个通道(即每隔两个字节set一次)。

Image Copy Operations

除了比较常见的copy操作(copy,masked copy,channel copy)之外,还有一些planar和packed之间的来回拷贝,拷贝的同时伴随着border,以及Copy Sub-pixel(没接触过)

cpp 复制代码
// Plain copy
NppStatus nppiCopy_8u_C3R(const Npp8u *pSrc,
						  int nSrcStep,
						  Npp8u *pDst,
						  int nDstStep,
						  NppiSize oSizeROI);	
// Copy selectively, according to the mask
NppStatus nppiCopy_8u_C3MR(const Npp8u *pSrc,
						   int nSrcStep,
						   Npp8u *pDst,
						   int nDstStep,
						   NppiSize oSizeROI,
						   const Npp8u *pMask,
						   int nMaskStep);	
// Channel Copy: copy one channel of a multi-channel image into one channel of another multi-channel image
NppStatus nppiCopy_8u_C3CR(const Npp8u *pSrc,
						   int nSrcStep,
						   Npp8u *pDst,
						   int nDstStep,
						   NppiSize oSizeROI);	
// Extract Channel Copy: copy one channel of a multi-channel image into a single-channel image
NppStatus nppiCopy_8u_C3C1R(const Npp8u * pSrc,
						    int nSrcStep,
							Npp8u *pDst,
							int nDstStep,
							NppiSize oSizeROI);	
// Insert Channel Copy: copy a single-channel image into one channel of a multi-channel image
NppStatus nppiCopy_8u_C1C3R(const Npp8u * pSrc,
							int nSrcStep,
							Npp8u * pDst,
							int nDstStep,
							NppiSize oSizeROI);
// The remaining interfaces are rarely used in practice, so they are not covered in detail for now
code
cpp 复制代码
#include <iostream>
#include <cuda_runtime.h>
#include <npp.h>
#include <opencv2/opencv.hpp>

#define CUDA_FREE(ptr) { if (ptr != nullptr) { cudaFree(ptr); ptr = nullptr; } }

int main() {
  std::string directory = "../";

  // =============== load image ===============
  cv::Mat image_dog = cv::imread(directory + "dog.png");
  cv::Mat image_dog_gray;
  cv::cvtColor(image_dog, image_dog_gray, CV_RGB2GRAY);

  int image_width = image_dog.cols;
  int image_height = image_dog.rows;
  int image_size = image_width * image_height;

  // =============== device memory ===============
  uint8_t *in_image, *in_img_gray;
  cudaMalloc((void**)&in_image, image_size * 3 * sizeof(uint8_t));
  cudaMalloc((void**)&in_img_gray, image_size * sizeof(uint8_t));
  cudaMemcpy(in_image, image_dog.data, image_size * 3 * sizeof(uint8_t), cudaMemcpyHostToDevice);
  cudaMemcpy(in_img_gray, image_dog_gray.data, image_size * sizeof(uint8_t), cudaMemcpyHostToDevice);

  uint8_t *out_ptr1, *out_ptr2, *out_ptr3, *out_ptr4, *out_ptr5;
  cudaMalloc((void**)&out_ptr1, image_size * 3 * sizeof(uint8_t));  // 三通道
  cudaMalloc((void**)&out_ptr2, image_size * 3 * sizeof(uint8_t));  // 三通道
  cudaMalloc((void**)&out_ptr3, image_size * 3 * sizeof(uint8_t));  // 三通道
  cudaMalloc((void**)&out_ptr4, image_size * sizeof(uint8_t));  // 单通道
  cudaMalloc((void**)&out_ptr5, image_size * 3 * sizeof(uint8_t));  // 三通道

  // mask
  cv::Mat mask = cv::Mat::zeros(image_height, image_width, CV_8UC1);
  cv::Mat mask1 = cv::Mat::ones(image_height * 3 / 4, image_width * 3 / 4, CV_8UC1);
  cv::Rect rc1 = cv::Rect(image_width / 4, image_height / 4, image_width * 3 / 4, image_height * 3 / 4);
  mask1.copyTo(mask(rc1));

  uint8_t *gpu_mask;
  cudaMalloc((void**)&gpu_mask, image_size * sizeof(uint8_t));
  cudaMemcpy(gpu_mask, mask.data, image_size * sizeof(uint8_t), cudaMemcpyHostToDevice);

  NppiSize roi1, roi2;
  roi1.width = image_width;
  roi1.height = image_height;
  roi2.width = image_width / 2;
  roi2.height = image_height / 2;

  cv::Mat out_image = cv::Mat::zeros(image_height, image_width, CV_8UC3);
  cv::Mat out_single = cv::Mat::zeros(image_height, image_width, CV_8UC1);
  NppStatus status;
  // =============== nppiCopy_8u_C3R ===============
  status = nppiCopy_8u_C3R(in_image, image_width * 3, out_ptr1, image_width * 3, roi1);
  if (status != NPP_SUCCESS) {
    std::cout << "[GPU] ERROR nppiCopy_8u_C3R failed, status = " << status << std::endl;
    return false;
  }
  cudaMemcpy(out_image.data, out_ptr1, image_size * 3, cudaMemcpyDeviceToHost);
  cv::imwrite(directory + "copy.jpg", out_image);

  // =============== nppiCopy_8u_C3MR ===============
  status = nppiCopy_8u_C3MR(in_image, image_width * 3, out_ptr2, image_width * 3, roi1, gpu_mask, image_width);
  if (status != NPP_SUCCESS) {
    std::cout << "[GPU] ERROR nppiCopy_8u_C3MR failed, status = " << status << std::endl;
    return false;
  }
  cudaMemcpy(out_image.data, out_ptr2, image_size * 3, cudaMemcpyDeviceToHost);
  cv::imwrite(directory + "copy_mask.jpg", out_image);
  
  // =============== nppiCopy_8u_C3CR ===============
  status = nppiCopy_8u_C3CR(in_image, image_width * 3, out_ptr3, image_width * 3, roi1);
  if (status != NPP_SUCCESS) {
    std::cout << "[GPU] ERROR nppiCopy_8u_C3CR failed, status = " << status << std::endl;
    return false;
  }
  cudaMemcpy(out_image.data, out_ptr3, image_size * 3, cudaMemcpyDeviceToHost);
  cv::imwrite(directory + "copy_channel.jpg", out_image);

  // =============== nppiCopy_8u_C3C1R ===============
  status = nppiCopy_8u_C3C1R(in_image, image_width * 3, out_ptr4, image_width, roi1);
  if (status != NPP_SUCCESS) {
    std::cout << "[GPU] ERROR nppiCopy_8u_C3C1R failed, status = " << status << std::endl;
    return false;
  }
  cudaMemcpy(out_single.data, out_ptr4, image_size, cudaMemcpyDeviceToHost);
  cv::imwrite(directory + "copy_channel_extract.jpg", out_single);

  // =============== nppiCopy_8u_C1C3R ===============
  status = nppiCopy_8u_C1C3R(in_img_gray, image_width, out_ptr5, image_width * 3, roi1);
  if (status != NPP_SUCCESS) {
    std::cout << "[GPU] ERROR nppiCopy_8u_C1C3R failed, status = " << status << std::endl;
    return false;
  }
  cudaMemcpy(out_image.data, out_ptr5, image_size * 3, cudaMemcpyDeviceToHost);
  cv::imwrite(directory + "copy_channel_insert.jpg", out_image);

  // free
  CUDA_FREE(in_image)
  CUDA_FREE(in_img_gray)
  CUDA_FREE(out_ptr1)
  CUDA_FREE(out_ptr2)
  CUDA_FREE(out_ptr3)
  CUDA_FREE(out_ptr4)
  CUDA_FREE(out_ptr5)
}
make
cpp 复制代码
cmake_minimum_required(VERSION 3.20)
project(test)

find_package(OpenCV REQUIRED)
# FindCUDAToolkit replaces the deprecated FindCUDA module and provides imported
# targets, so there is no need to glob every .so under a hard-coded CUDA path.
find_package(CUDAToolkit REQUIRED)

add_executable(test test.cpp)
target_compile_features(test PRIVATE cxx_std_11)
target_include_directories(test PRIVATE ${OpenCV_INCLUDE_DIRS})
# cudart: CUDA runtime; nppc: NPP core; nppidei: NPP image data exchange and
# initialization (nppiSet_* / nppiCopy_*).
target_link_libraries(test PRIVATE
                      ${OpenCV_LIBS}
                      CUDA::cudart
                      CUDA::nppc
                      CUDA::nppidei
)
result

注意:

  1. 由于是从三通道图像中提取单个通道进行copy,存图的时候只有单个通道,因此呈现出来的结果是灰色的。
相关推荐
UestcXiye1 小时前
《TCP/IP网络编程》学习笔记 | Chapter 3:地址族与数据序列
c++·计算机网络·ip·tcp
霁月风3 小时前
设计模式——适配器模式
c++·适配器模式
jrrz08283 小时前
LeetCode 热题100(七)【链表】(1)
数据结构·c++·算法·leetcode·链表
咖啡里的茶i3 小时前
Vehicle友元Date多态Sedan和Truck
c++
海绵波波1073 小时前
Webserver(4.9)本地套接字的通信
c++
@小博的博客3 小时前
C++初阶学习第十弹——深入讲解vector的迭代器失效
数据结构·c++·学习
爱吃喵的鲤鱼4 小时前
linux进程的状态之环境变量
linux·运维·服务器·开发语言·c++
7年老菜鸡5 小时前
策略模式(C++)三分钟读懂
c++·qt·策略模式
Ni-Guvara5 小时前
函数对象笔记
c++·算法
似霰5 小时前
安卓智能指针sp、wp、RefBase浅析
android·c++·binder