OpenCL 学习(3)---- OpenCL 第一个程序

目录

OpenCL 开发流程

OpenCL 编程的标准开发流程如下:

  1. 查询平台(platform)和设备(device),选择需要的计算设备
  2. 创建上下文(context)
  3. 创建命令队列(command queue)
  4. 创建和编译程序对象(program)
  5. 创建内核对象和内存对象(buffer),设置内核参数(kernel arg)
  6. 执行内核
  7. 数据拷贝回主机端

基本流程如下所示:

参考实例
c 复制代码
/*
 * Number of float elements in each vector of the demo.
 * Declared as an enum constant so that it is a true integer constant
 * expression in C: arrays sized with it are ordinary fixed-size arrays
 * rather than variable-length arrays.
 */
enum { ARRAY_SIZE = 100 };

/*
 * OpenCL C source of the "vector_add" kernel, kept as a single string that is
 * handed to clCreateProgramWithSource and compiled at run time.
 *
 * Each work-item takes its global id in dimension 0 and stores
 * a[gid] + b[gid] into result[gid]. The kernel performs no bounds check, so
 * the host must launch no more work-items than the buffers have elements.
 */
static const char *kernel_function_vec_add =
		"__kernel void vector_add(global const float *a, global const float *b, global float *result)"
		"{                                                                                           "
		"int gid = get_global_id(0);                                                                 "
		"result[gid] = a[gid] + b[gid];                                                              "
		"}                                                                                           ";

int demoVectorAddOptimizeImpl(int argc, char* argv[]) {
	cl_int errNum;

	/*prepare input data*/
	float result[ARRAY_SIZE];
	float a[ARRAY_SIZE];
	float b[ARRAY_SIZE];

	for (int i = 0; i < ARRAY_SIZE; i++) {
		a[i] = (float)i + 0.123;
		b[i] = (float)(i * 2) + 0.345;
	}

	cl_uint numPlatforms;
	cl_platform_id firstPlatformId;
	cl_context context;
	cl_device_id device_id;
	errNum = clGetPlatformIDs(1, &firstPlatformId, &numPlatforms);
	if (errNum != CL_SUCCESS || numPlatforms <= 0) {
		printf("Failed to find any OpenCL platforms.");
		return EXIT_FAILURE;
	}

	errNum = clGetDeviceIDs(firstPlatformId, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL);
	if (errNum != CL_SUCCESS) {
		printf("There is no GPU, trying CPU... \n");
		errNum = clGetDeviceIDs(firstPlatformId, CL_DEVICE_TYPE_CPU, 1, &device_id, NULL);
		if (errNum != CL_SUCCESS) {
			printf("There is NO GPU or CPU \n");
			return EXIT_FAILURE;
		}
	}

	context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &errNum);
	if (errNum != CL_SUCCESS) {
		printf("create context error\n");
		return NULL;
	}

	cl_command_queue commandQueue = clCreateCommandQueueWithProperties(context, device_id, 0, NULL);
	if (commandQueue == NULL) {
		printf("Failed to create commandQueue for device 0 \n");
		return EXIT_FAILURE;
	}

	cl_program program;
	program = clCreateProgramWithSource(context, 1, &kernel_function_vec_add, NULL, NULL);
	if (program == NULL) {
		printf("Failed to create CL program from source. \n");
		return EXIT_FAILURE;
	}

	errNum = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
	if (errNum != CL_SUCCESS) {
		char buildLog[16384];
		clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buildLog), buildLog, NULL);
		printf("Error in kernel:%s \n", buildLog);
		clReleaseProgram(program);
		return NULL;
	}

	cl_kernel kernel;
	kernel = clCreateKernel(program, "vector_add", NULL);
	if (kernel == NULL) {
		printf("Failed to create kernel \n");
		return EXIT_FAILURE;
	}

	cl_mem input_mem0 = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(float)*ARRAY_SIZE, a, NULL);
	cl_mem input_mem1 = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(float)*ARRAY_SIZE, b, NULL);
	cl_mem output_mem = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float)*ARRAY_SIZE, NULL, NULL);
	if (input_mem0 == NULL || input_mem1 == NULL || output_mem == NULL) {
		printf("Error creating memory objects. \n");
		return EXIT_FAILURE;
	}

	errNum = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input_mem0);
	errNum |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &input_mem1);
	errNum |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &output_mem);
	if (errNum != CL_SUCCESS) {
		printf("Error setting kernel arguments.\n");
		return EXIT_FAILURE;
	}

	size_t globalWorkSize[1] = { ARRAY_SIZE };
	size_t localWorkSize[1] = { 1 };
	//执行内核
	errNum = clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL);
	if (errNum != CL_SUCCESS) {
		printf("Error queuing kernel for execution. \n");

		return EXIT_FAILURE;
	}

	errNum = clEnqueueReadBuffer(commandQueue, output_mem, CL_TRUE, 0, ARRAY_SIZE * sizeof(float), result, 0, NULL, NULL);
	if (errNum != CL_SUCCESS) {
		printf("Error reading result buffer. \n");
		
		return EXIT_FAILURE;
	}

	for (int i = 0; i < ARRAY_SIZE; i++) {
		printf("a[%d]=%f b[%d]=%f result[%d]=%f\n", i, a[i], i, b[i], i, result[i]);
	}
	printf("Executed program succesfully. \n");

cleanup:
	if (input_mem0)
		clReleaseMemObject(input_mem0);
	if (input_mem1)
		clReleaseMemObject(input_mem1);
	if (output_mem)
		clReleaseMemObject(output_mem);
	if (commandQueue)
		clReleaseCommandQueue(commandQueue);
	if (kernel)
		clReleaseKernel(kernel);
	if (program)
		clReleaseProgram(program);
	if (context)
		clReleaseContext(context);

	return 0;
}
相关推荐
FluxMelodySun9 分钟前
机器学习(二十七) 降维:度量学习与随机梯度下降法求解
人工智能·学习·机器学习
一尘之中28 分钟前
利用QPanda测试量子系统噪声:从理论到QAOA实践
学习·ai写作·量子计算
艾莉丝努力练剑38 分钟前
【MYSQL】MYSQL学习的一大重点:表的约束
linux·运维·服务器·开发语言·数据库·学习·mysql
叶子野格40 分钟前
Notepad++编写html文件使用D3绘图:数据可视化
笔记·学习·信息可视化·开源·notepad++
Chunyyyen1 小时前
【第三十八周】论文复现记录01
学习
woodykissme1 小时前
揭秘表面粗糙度的16%规则:为什么允许16%的超差?
学习·制造·机械·粗糙度·工艺知识
秋刀鱼不做梦2 小时前
网络编程和Socket套接字(UDP+TCP)(如果想知道Java中有关网络编程和Socket套接字的知识,那么只看这一篇就足够了!)
网络·网络协议·学习·tcp/ip·udp
AI成长日志2 小时前
【笔面试算法学习专栏】链表操作专题:反转、环形检测与合并
学习·算法·面试
徐某人..2 小时前
基于i.MX6ULL开发板与OV5640摄像头实现QT相机应用开发
qt·学习·arm
是翔仔呐2 小时前
第10章 串口通信USART全解:轮询/中断/DMA三种收发模式与上位机通信实战
c语言·开发语言·stm32·单片机·嵌入式硬件·学习·gitee