Cuda加速直线拟合

工业产线上采集的三维数据，受现场环境复杂、数据量大的影响，通常需要先进行清洗和预处理，并对计算过程进行加速。

本文提供一种使用 CUDA 加速的方法：对采集到的深度数据（Z 方向数据）做预处理并进行直线拟合。该方法适用于定长数据数组。

1、首先参考别人的代码，用QT封装了cuda核函数。

method_global.h

cpp 复制代码

#include <QtCore/qglobal.h>

// Symbol-visibility decoration for the library's public classes and functions.
//   - BUILD_STATIC defined      : QPCLTOIMAGE_EXPORT expands to nothing.
//   - QPCLTOIMAGE_LIB defined   : QPCLTOIMAGE_EXPORT is Q_DECL_EXPORT.
//   - otherwise                 : QPCLTOIMAGE_EXPORT is Q_DECL_IMPORT.
// NOTE(review): no include guard / #pragma once is visible in this snippet — confirm the real header has one.
#ifndef BUILD_STATIC
# if defined(QPCLTOIMAGE_LIB)
#  define QPCLTOIMAGE_EXPORT Q_DECL_EXPORT
# else
#  define QPCLTOIMAGE_EXPORT Q_DECL_IMPORT
# endif
#else
# define QPCLTOIMAGE_EXPORT
#endif

Cuder.h

cpp 复制代码

#pragma once

#include <map>
#include <string>
#include <vector>
#include <cuda.h>
#include <nvrtc.h>
#include <fstream>
#include <sstream>
#include <iostream>
#include <cudaProfiler.h>
#include <cuda_runtime.h>
#include <QtConcurrent/QtConcurrent>
#include <QDir>
#include "CudaHelper.h"

namespace redips {
	// Wrapper around a CUDA driver-API context: keeps loaded modules and named device
	// arrays in maps and launches kernels by (module, kernel name).
	// Move-only: copying is deleted below.
	class QPCLTOIMAGE_EXPORT Cuder {
		// Driver-API context created in the constructor and destroyed by release().
		CUcontext context;
		// Loaded modules, keyed by the file name passed to addModule().
		std::map <std::string, CUmodule> modules;
		// Named device arrays: name -> (device pointer, size in bytes).
		// NOTE(review): the size is stored as unsigned int while applyArray() takes size_t.
		std::map <std::string, std::pair<CUdeviceptr, unsigned int>> devptrs;

	public:
		Cuder();
		// Frees every device array, clears both maps and destroys the stream and the context.
		void release();

		// Returns the file-level counter that the constructor increments.
		static int getInitCount();

		// One kernel argument for launch(). Built either from a C string (interpreted by
		// launch() as the name of a device array) or from any value (passed by pointer).
		class QPCLTOIMAGE_EXPORT ValueHolder {
		public:
			// Points at the C string, or at a heap copy of the wrapped value.
			void * value = nullptr;
			// True only when constructed from a const char*.
			bool is_string = false;

			ValueHolder(const char* str);

			// Stores a heap-allocated copy of `data`.
			// NOTE(review): no destructor is declared, so nothing visible here frees this copy.
			template <typename T>
			ValueHolder(const T& data) {
				value = new T(data);
			}
		};

		//forbidden copy-constructor and assignment function
		Cuder(const Cuder&) = delete;
		Cuder& operator= (const Cuder& another) = delete;

		Cuder(Cuder&& another);
		Cuder& operator= (Cuder&& another);

		// Calls release().
		virtual ~Cuder();

	public:
		// Launches `kernel_function` from the module registered as `module`.
		// Returns false when the module, the kernel or a named array is missing, or when the launch fails.
		bool launch(dim3 gridDim, dim3 blockDim, ::std::string module, ::std::string kernel_function, ::std::initializer_list<ValueHolder> params);

		// Loads the file `cufile` as a module and registers it under that same name.
		// Returns false when the name is already registered or no PTX could be obtained.
		bool addModule(::std::string cufile);

		// Creates (or reuses) the device array `name` of `size` bytes; copies `size` bytes
		// from `h_ptr` to the device when `h_ptr` is not null.
		void applyArray(const char* name, size_t size, void* h_ptr = nullptr);

		// Copies `size` bytes of the device array `name` into the host buffer `h_ptr`.
		void fetchArray(const char* name, size_t size, void * h_ptr);

	private:
		// Device selection shared by all instances; filled in by initialize().
		static int devID;
		static CUdevice cuDevice;
		static bool cuda_enviroment_initialized;
		/*static*/ void initialize();
		// If the file is a .ptx file its content is returned as-is; if it is a .cu file it is compiled and the resulting PTX is returned.
		std::string get_ptx(::std::string filename);
		// Stream created in the constructor; kernels are launched on it.
		CUstream m_cuStream;
		// Log file stream used by get_ptx() when the input file cannot be opened.
		std::ofstream outtxt;
	};
};

Cuder.cpp

cpp 复制代码

#include "Cuder.h"
#include <cuda_runtime.h>

namespace redips {

	// Counts constructed Cuder objects; incremented by the default constructor and never decremented.
	static int g_initCount = 0;
	// Directory that receives txtlog.txt when get_ptx() fails to open its input file.
	// NOTE(review): this runs during static initialization; QCoreApplication::applicationDirPath()
	// presumably needs an existing QCoreApplication — confirm the path is not empty at this point.
	std::string txt_path = QCoreApplication::applicationDirPath().toStdString() + "/AlthTxtLog/";
	// QDir handle for txt_path, used to create the directory on demand.
	QDir dir(QString::fromStdString(txt_path));
	// Default constructor: performs the one-time environment setup if it has not happened
	// yet, then creates this object's context and stream and bumps the instance counter.
	Cuder::Cuder() {
		if (!cuda_enviroment_initialized) {
			initialize();
		}

		checkCudaErrors(cuCtxCreate(&context, 0, cuDevice));
		checkCudaErrors(cuStreamCreate(&m_cuStream, 0));

		++g_initCount;
	}

	// Frees every named device array, forgets all modules and destroys the stream and
	// the context. Handles are reset to nullptr afterwards so that calling release()
	// explicitly and then letting the destructor run does not destroy them twice.
	void Cuder::release() {
		for (const auto& entry : devptrs) {
			cudaFree((void*)entry.second.first);
		}
		devptrs.clear();
		modules.clear();

		if (m_cuStream != nullptr) {
			cuStreamDestroy(m_cuStream);
			m_cuStream = nullptr;
		}
		if (context != nullptr) {
			cuCtxDestroy(context);
			context = nullptr;
		}
	}

	// Reports the current value of the file-level construction counter.
	int Cuder::getInitCount() {
		const int constructedSoFar = g_initCount;
		return constructedSoFar;
	}

	// Wraps a C string: the pointer itself is stored (no copy is made) and the holder
	// is flagged as a string so launch() treats it as the name of a device array.
	Cuder::ValueHolder::ValueHolder(const char* str)
		: value(const_cast<char*>(str)),
		  is_string(true) {
	}

	// Move constructor: takes over the context, the stream, the modules and the device
	// arrays of `another`. The stream handle is transferred too; without that the new
	// object would launch on (and later destroy) an uninitialised stream.
	// `another` is left with null handles and empty maps so its release() frees nothing
	// that now belongs to this object.
	Cuder::Cuder(Cuder&& another)
		: context(another.context),
		  modules(std::move(another.modules)),
		  devptrs(std::move(another.devptrs)),
		  m_cuStream(another.m_cuStream) {
		another.context = nullptr;
		another.m_cuStream = nullptr;
		// A moved-from map is only guaranteed to be valid, not empty.
		another.modules.clear();
		another.devptrs.clear();
	}

	// Move assignment: releases this object's own resources, then adopts the context,
	// the stream, the device arrays and the modules of `another`.
	// The stream must be adopted as well: release() has just destroyed this object's
	// stream, so keeping the old handle would leave every later launch on a dead stream.
	Cuder& Cuder::operator= (Cuder&& another) {
		if (this == &another) return *this;

		release();

		context = another.context;
		m_cuStream = another.m_cuStream;
		devptrs = std::move(another.devptrs);
		modules = std::move(another.modules);

		// Leave the source empty so its destructor does not free what we now own.
		another.context = nullptr;
		another.m_cuStream = nullptr;
		another.devptrs.clear();
		another.modules.clear();
		return *this;
	}

	// Destructor: all cleanup is delegated to release().
	Cuder::~Cuder() {
		release();
	}

	// Launches `kernel_function` from the module registered as `module` on this object's
	// stream and waits for it to finish.
	//
	// params: one ValueHolder per kernel argument, in order. A holder built from a C
	//         string is looked up in devptrs and replaced by the address of that array's
	//         device pointer; any other holder passes its stored value pointer through.
	// Returns true only if the kernel was found, launched and completed without error.
	//
	// NOTE(review): the heap copies made by ValueHolder's template constructor are not
	// freed here because their type is no longer known at this point.
	bool Cuder::launch(dim3 gridDim, dim3 blockDim, std::string module, std::string kernel_function, std::initializer_list<ValueHolder> params) {
		// Prints the driver's own description of a failed driver-API call.
		auto reportDriverError = [&](const char* stage, CUresult status) {
			const char* reason = nullptr;
			cuGetErrorString(status, &reason);
			std::cerr << "[Cuder] : " << stage << " of kernel " << kernel_function << " failed: "
				<< (reason ? reason : "unknown error") << " (code " << static_cast<int>(status) << ")" << std::endl;
		};

		//get kernel address
		const auto moduleIt = modules.find(module);
		if (moduleIt == modules.end()) {
			std::cerr << "[Cuder] : doesn't exists an module named " << module << std::endl;
			return false;
		}
		CUfunction kernel_addr;
		if (CUDA_SUCCESS != cuModuleGetFunction(&kernel_addr, moduleIt->second, kernel_function.c_str())) {
			std::cerr << "[Cuder] : doesn't exists an kernel named " << kernel_function << " in module " << module << std::endl;
			return false;
		}

		//setup params
		std::vector<void*> pamary;
		pamary.reserve(params.size());
		for (const ValueHolder& v : params) {
			if (!v.is_string) {
				pamary.push_back(v.value);
				continue;
			}
			const char* arrayName = (const char*)(v.value);
			const auto arrayIt = devptrs.find(arrayName);
			if (arrayIt == devptrs.end()) {
				std::cerr << "[Cuder] :launch failed. doesn't exists an array named " << arrayName << std::endl;
				return false;
			}
			// The kernel argument is the device pointer itself, so pass its address.
			pamary.push_back((void*)(&arrayIt->second.first));
		}

		// data() is valid for an empty vector, unlike &pamary[0].
		void** kernelArgs = pamary.empty() ? nullptr : pamary.data();
		const CUresult launchStatus = cuLaunchKernel(kernel_addr, gridDim.x, gridDim.y, gridDim.z, blockDim.x, blockDim.y, blockDim.z, 0, m_cuStream, kernelArgs, nullptr);
		if (launchStatus != CUDA_SUCCESS) {
			reportDriverError("launch", launchStatus);
			return false;
		}

		// Errors raised while the kernel runs only surface at the next synchronisation.
		const CUresult syncStatus = cuStreamSynchronize(m_cuStream);
		if (syncStatus != CUDA_SUCCESS) {
			reportDriverError("execution", syncStatus);
			return false;
		}
		return true;
	}

	bool Cuder::addModule(std::string cufile) {
		auto it = modules.find(cufile);
		if(it != modules.end()) {
			std::cerr << "[Cuder] :already has an modules named " << cufile << std::endl;;
			return false;
		}

		std::string ptx = get_ptx(cufile);

		if (ptx.length() > 0) {
			CUmodule module;
			checkCudaErrors(cuModuleLoadDataEx(&module, ptx.c_str(), 0, 0, 0));
			modules[cufile] = module;
			return true;
		}
		else {
			std::cerr << "[Cuder] : error: add module " << cufile << " failed!\n";
			return false;
		}
	}

	void Cuder::applyArray(const char* name, size_t size, void* h_ptr) {
		if (devptrs.count(name)) {
			if (h_ptr) {
				CUdeviceptr d_ptr;
				if (devptrs[name].second < size) {
					cudaFree((void*)devptrs[name].first);
					checkCudaErrors(cudaMalloc((void**)&d_ptr, size));
					devptrs[name].first = d_ptr;
					devptrs[name].second = size;
				}
				d_ptr = devptrs[name].first;
				checkCudaErrors(cudaMemcpy((void*)d_ptr, h_ptr, size, cudaMemcpyHostToDevice));
			}

			std::cerr << "[Cuder] : already has an array named " << name << std::endl;;
			return;
		}
		CUdeviceptr d_ptr; 
		checkCudaErrors(cudaMalloc((void**)&d_ptr, size));
		if (h_ptr)
			checkCudaErrors(cudaMemcpy((void*)d_ptr, h_ptr, size, cudaMemcpyHostToDevice));
		devptrs[name] = { d_ptr, size};
	}

	// Copies `size` bytes of the device array `name` into the host buffer `h_ptr`.
	// Nothing is copied (a message is printed instead) when the array is unknown, when
	// `name` or `h_ptr` is null, or when `size` exceeds the capacity recorded for the
	// array, which would read past the end of the device allocation.
	void Cuder::fetchArray(const char* name, size_t size, void * h_ptr) {
		if (name == nullptr || h_ptr == nullptr) {
			std::cerr << "[Cuder] : fetchArray called with a null name or host buffer" << std::endl;
			return;
		}
		const auto found = devptrs.find(name);
		if (found == devptrs.end()) {
			std::cerr << "[Cuder] : doesn't exists an array named " << name << std::endl;
			return;
		}
		if (size > found->second.second) {
			std::cerr << "[Cuder] : fetch of " << size << " bytes exceeds the " << found->second.second
				<< " bytes allocated for array " << name << std::endl;
			return;
		}
		checkCudaErrors(cudaMemcpy(h_ptr, (const void*)found->second.first, size, cudaMemcpyDeviceToHost));
	}

	// One-time setup of the driver API: initialises the driver, selects a device through
	// gpuGetMaxGflopsDeviceIdDRV() and stores its handle in the static members.
	void Cuder::initialize() {
		// cuInit must come before any other driver-API call; calling it again is harmless.
		checkCudaErrors(cuInit(0));

		devID = gpuGetMaxGflopsDeviceIdDRV();
		checkCudaErrors(cuDeviceGet(&cuDevice, devID));

		cuda_enviroment_initialized = true;
	}
	//如果是ptx文件则直接返回文件内容，如果是cu文件则编译后返回ptx
	std::string Cuder::get_ptx(std::string filename) {
		std::ifstream inputFile(filename, std::ios::in | std::ios::binary | std::ios::ate);
		if (!inputFile.is_open()) {
			std::cerr << "[Cuder] : error: unable to open " << filename << " for reading! the first time.\n";
			if (!dir.exists()) {
				dir.mkpath(QString::fromStdString(txt_path));
			}
			std::string txt_path_log = txt_path + "txtlog.txt";
			outtxt.open(txt_path_log, std::ios::out | std::ios::binary);
			if (outtxt.is_open()) {
				outtxt << "cudaerror: unable to open for reading! the first time." << std::endl;
				outtxt.close();
			}
			
			inputFile.open(filename, std::ios::in | std::ios::binary | std::ios::ate);
			if (!inputFile.is_open()) {
				std::cerr << "[Cuder] : error: unable to open " << filename << " for reading! the second time.\n";
				if (!dir.exists()) {
					dir.mkpath(QString::fromStdString(txt_path));
				}
				std::string txt_path_log = txt_path + "txtlog.txt";
				outtxt.open(txt_path_log, std::ios::out | std::ios::binary);
				if (outtxt.is_open()) {
					outtxt << "cudaerror: unable to open for reading! the second time." << std::endl;
					outtxt.close();
				}
				return "";
			}			
		}	
		std::streampos pos = inputFile.tellg();
		size_t inputSize = (size_t)pos;
		char * memBlock = new char[inputSize + 1];

		inputFile.seekg(0, std::ios::beg);
		inputFile.read(memBlock, inputSize);
		inputFile.close();
		memBlock[inputSize] = '\x0';

		if (filename.find(".ptx") != std::string::npos)
			return std::string(std::move(memBlock));
		// compile
		nvrtcProgram prog;
		if (nvrtcCreateProgram(&prog, memBlock, filename.c_str(), 0, NULL, NULL) == NVRTC_SUCCESS) {
			delete[] memBlock;
			if (nvrtcCompileProgram(prog, 0, nullptr) == NVRTC_SUCCESS) {
				// dump log
				size_t logSize;
				nvrtcGetProgramLogSize(prog, &logSize);
				if (logSize > 0) {
					char *log = new char[logSize + 1];
					nvrtcGetProgramLog(prog, log);
					log[logSize] = '\x0';
					std::cout << "[Cuder] : compile [" << filename << "] " << log << std::endl;
					delete[] log;
				}
				else std::cout << "[Cuder] : compile [" << filename << "] finish" << std::endl;

				// fetch PTX
				size_t ptxSize;
				nvrtcGetPTXSize(prog, &ptxSize);
				char *ptx = new char[ptxSize + 1];
				nvrtcGetPTX(prog, ptx);
				nvrtcDestroyProgram(&prog);
				return std::string(std::move(ptx));
			}
		}
		delete[] memBlock;
		return "";
	}
	
	// Definitions of Cuder's static members.
	// Set to true at the end of Cuder::initialize(); checked by the constructor.
	bool Cuder::cuda_enviroment_initialized = false;
	// Device ordinal chosen in initialize().
	int Cuder::devID = 0;
	// Driver-API handle of the chosen device, filled in by initialize().
	CUdevice Cuder::cuDevice = 0;
}

CudaHelper.h

cpp 复制代码

#pragma once

#include <cuda_runtime.h>
#include "method_global.h"

namespace redips {
	// Evaluates the CUDA call `a` exactly once; if the result is not success (0), prints
	// the source location together with the numeric status and its description, then
	// terminates the process.
	//
	// The description is taken from the status that was actually returned rather than
	// from cudaGetLastError(), which does not reflect the call being checked.
	// The macro is also used with driver-API (CUresult) calls in this project, so the
	// numeric code is printed as well in case the runtime text does not describe it.
	//
	// NOTE(review): the trailing ';' after while (0) is kept so existing call sites keep
	// compiling; it means the macro cannot be used as the body of an if that has an else.
	#define checkCudaErrors(a) do { \
		const int cuder_status_ = static_cast<int>(a); \
		if (cuder_status_ != static_cast<int>(cudaSuccess)) { \
			fprintf(stderr, "Cuda runtime error in line %d of file %s : %s (code %d)\n", \
				__LINE__, __FILE__, \
				cudaGetErrorString(static_cast<cudaError_t>(cuder_status_)), cuder_status_); \
			exit(EXIT_FAILURE); \
		} \
	} while (0);

	// Returns the ordinal of the device to use; defined elsewhere in the project.
	int QPCLTOIMAGE_EXPORT gpuGetMaxGflopsDeviceIdDRV();
}

参考：https://www.cnblogs.com/redips-l/p/8372795.html

2、直线拟合

1）简要介绍

假设已知有N个点，设这条直线方程为： y = a·x + b

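公式（记 $\bar{x}=\frac{1}{N}\sum_{i=1}^{N}x_i$，$\bar{y}=\frac{1}{N}\sum_{i=1}^{N}y_i$）：

$$L_{xy}=\sum_{i=1}^{N}(x_i-\bar{x})(y_i-\bar{y}),\qquad L_{xx}=\sum_{i=1}^{N}(x_i-\bar{x})^{2},\qquad a=\frac{L_{xy}}{L_{xx}},\qquad b=\bar{y}-a\,\bar{x}$$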

如果用c++进行实现，可以是：

参考：https://www.cnblogs.com/chl052529/p/18630770

2）cuda加速后实现

主要代码如下：

cpp 复制代码

#include <math.h>
#include <stdio.h>
#include <cuda_runtime.h>
#include <host_defines.h>
#include <device_functions.h>
#include <device_launch_parameters.h>
#include <thrust/device_vector.h>
#include <thrust/sort.h>

......//其他逻辑
/**
 * Fits a least-squares line z = a * x + b to every row of a depth image and subtracts
 * that line from the row's valid samples (in place).
 *
 * Data layout: sample (col, row) is read from Val[col * rows + row], and the x
 * coordinate used for the fit is that same flat index. Samples equal to -20000 are
 * treated as "no data": they are skipped in every sum and left unchanged.
 *
 * Launch layout: 1-D grid of 1-D blocks of any size; rows are distributed with a
 * grid-stride loop. No shared memory is used and no barrier is needed, because each
 * row is processed from start to finish by a single thread.
 *
 * @param Val       in/out, cols * rows samples; valid samples are replaced by their residual
 * @param cols      number of samples per row
 * @param rows      number of rows
 * @param RowAvr    out, per row: mean z of the valid samples
 * @param IndexAvr  out, per row: mean flat index of the valid samples
 * @param Factor_a  out, per row: slope a of the fitted line
 * @param Factor_b  out, per row: intercept b of the fitted line
 *
 * Rows without any valid sample leave all outputs untouched. Rows whose valid samples
 * all share one column (no slope can be fitted) get RowAvr / IndexAvr written, but
 * Factor_a / Factor_b and the row's data are left untouched.
 */
extern "C"  __global__ void proccedRowAvr(float * Val, unsigned int cols, unsigned int rows, float * RowAvr, float * IndexAvr,float* Factor_a,float* Factor_b) {
	// Sentinel marking a sample without a measurement.
	const float kInvalidZ = -20000.0f;

	const size_t stride = (size_t)blockDim.x * gridDim.x;
	const size_t firstRow = (size_t)blockDim.x * blockIdx.x + threadIdx.x;
	const float rowsF = (float)rows;

	for (size_t rowIndex = firstRow; rowIndex < rows; rowIndex += stride) {
		// Pass 1: means. The column sum is kept in a 64-bit integer so it can neither
		// overflow nor lose precision, and validity is decided by the sample count
		// (a row whose valid values happen to sum to zero is still a valid row).
		unsigned int count = 0;
		float sumZ = 0.0f;
		unsigned long long sumCol = 0ULL;
		for (unsigned int col = 0; col < cols; ++col) {
			const float z = Val[(size_t)col * rows + rowIndex];
			if (z != kInvalidZ) {
				++count;
				sumZ += z;
				sumCol += col;
			}
		}
		if (count == 0) {
			continue;  // nothing to fit in this row
		}

		const float meanZ = sumZ / (float)count;
		const float meanCol = (float)((double)sumCol / (double)count);
		// x = col * rows + rowIndex is affine in col, so its mean follows from meanCol.
		const float meanX = meanCol * rowsF + (float)rowIndex;
		RowAvr[rowIndex] = meanZ;     // mean of y
		IndexAvr[rowIndex] = meanX;   // mean of x

		// Pass 2: Lxy and Lxx, with x - meanX expressed as (col - meanCol) * rows so the
		// difference is formed from small numbers instead of two large flat indices.
		float Lxy = 0.0f;
		float Lxx = 0.0f;
		for (unsigned int col = 0; col < cols; ++col) {
			const float z = Val[(size_t)col * rows + rowIndex];
			if (z != kInvalidZ) {
				const float dx = ((float)col - meanCol) * rowsF;
				Lxy += dx * (z - meanZ);
				Lxx += dx * dx;
			}
		}
		if (Lxx == 0.0f) {
			continue;  // all valid samples share one column: slope undefined
		}

		// A flat row (Lxy == 0) is a perfectly good fit with a == 0.
		const float a = Lxy / Lxx;
		Factor_a[rowIndex] = a;
		Factor_b[rowIndex] = meanZ - a * meanX;

		// Pass 3: residual z - (a * x + b), written as z - (meanZ + a * (x - meanX)).
		for (unsigned int col = 0; col < cols; ++col) {
			const size_t dataIndex = (size_t)col * rows + rowIndex;
			const float z = Val[dataIndex];
			if (z != kInvalidZ) {
				const float dx = ((float)col - meanCol) * rowsF;
				Val[dataIndex] = z - (meanZ + a * dx);
			}
		}
	}
}

......//其他逻辑

调用核函数

主要步骤：

1.初始化

cpp 复制代码

// Resolves the path of the kernel source file when no Cuder object exists yet.
// NOTE(review): this snippet does not create m_cuder; presumably that happens in code
// not shown here — confirm. The later snippets pass m_module_file to launch() while
// this function assigns module_file; verify they are the same member.
void initCuda()
{
	if (!m_cuder) {
		module_file = get_ptx_path("CCUDA.cu");// member variable: path of the kernel .cu file to load
	}
}

2.加载模块

cpp 复制代码

// Loads the kernel file as a module, registered under the name module_file.
// addModule() returns false on failure; the result is not checked here.
m_cuder->addModule(module_file);

3、申请数组空间

cpp 复制代码

// Creates the named device arrays and uploads their initial host contents.
// "a_dev_1" holds the depth samples; the other four hold one float per row.
// NOTE(review): assumes InputData.points is a contiguous array of float — confirm.
m_cuder->applyArray("a_dev_1", sizeof(float) * InputData.points.size(),  InputData.points.data());
m_cuder->applyArray("R_dev", sizeof(float) * rows, avgRowVal.data());
m_cuder->applyArray("Fa_dev", sizeof(float) * rows, factor_a.data());
m_cuder->applyArray("Fb_dev", sizeof(float) * rows, factor_b.data());
m_cuder->applyArray("In_dev", sizeof(float) * rows, avgIndex.data());

4、运行核函数

cpp 复制代码

// Runs proccedRowAvr with 512 blocks of 256 threads. String arguments name device
// arrays; cols and rows are passed by value.
// NOTE(review): ValueHolder copies cols and rows with their declared type, and the kernel
// expects unsigned int for both — confirm their declarations match.
m_cuder->launch(dim3(512, 1, 1), dim3(256, 1, 1), m_module_file, "proccedRowAvr", { "a_dev_1", cols, rows, "R_dev","In_dev","Fa_dev","Fb_dev" });

5、拿回数据到cpu

cpp 复制代码

// NOTE(review): the byte count uses tmpZValData.points.size() while the upload used
// InputData.points.size() — confirm both containers have the same length.
m_cuder->fetchArray("a_dev_1", sizeof(float) * tmpZValData.points.size(), tmpZValData.points.data());  // fetch the data back after the line fit
//m_cuder->fetchArray("Fa_dev", sizeof(float) * rows, factor_a.data());
//m_cuder->fetchArray("Fb_dev", sizeof(float) * rows, factor_b.data());