OpenCL 学习(3)---- OpenCL 第一个程序

目录

OpenCL 开发流程

OpenCL 编程的标准开发流程如下:

  1. 查询平台(platform)和设备(device),选择需要的计算设备
  2. 创建上下文(context)
  3. 创建命令队列(command queue)
  4. 创建和编译程序对象(program)
  5. 创建内核对象,设置内核参数(kernel Arg)
  6. 执行内核
  7. 数据拷贝回主机端

基本流程如下所示:

参考实例
c 复制代码
static const int ARRAY_SIZE = 100;

static const char *kernel_function_vec_add =
		"__kernel void vector_add(global const float *a, global const float *b, global float *result)"
		"{                                                                                           "
		"int gid = get_global_id(0);                                                                 "
		"result[gid] = a[gid] + b[gid];                                                              "
		"}                                                                                           ";

int demoVectorAddOptimizeImpl(int argc, char* argv[]) {
	cl_int errNum;

	/*prepare input data*/
	float result[ARRAY_SIZE];
	float a[ARRAY_SIZE];
	float b[ARRAY_SIZE];

	for (int i = 0; i < ARRAY_SIZE; i++) {
		a[i] = (float)i + 0.123;
		b[i] = (float)(i * 2) + 0.345;
	}

	cl_uint numPlatforms;
	cl_platform_id firstPlatformId;
	cl_context context;
	cl_device_id device_id;
	errNum = clGetPlatformIDs(1, &firstPlatformId, &numPlatforms);
	if (errNum != CL_SUCCESS || numPlatforms <= 0) {
		printf("Failed to find any OpenCL platforms.");
		return EXIT_FAILURE;
	}

	errNum = clGetDeviceIDs(firstPlatformId, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL);
	if (errNum != CL_SUCCESS) {
		printf("There is no GPU, trying CPU... \n");
		errNum = clGetDeviceIDs(firstPlatformId, CL_DEVICE_TYPE_CPU, 1, &device_id, NULL);
		if (errNum != CL_SUCCESS) {
			printf("There is NO GPU or CPU \n");
			return EXIT_FAILURE;
		}
	}

	context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &errNum);
	if (errNum != CL_SUCCESS) {
		printf("create context error\n");
		return NULL;
	}

	cl_command_queue commandQueue = clCreateCommandQueueWithProperties(context, device_id, 0, NULL);
	if (commandQueue == NULL) {
		printf("Failed to create commandQueue for device 0 \n");
		return EXIT_FAILURE;
	}

	cl_program program;
	program = clCreateProgramWithSource(context, 1, &kernel_function_vec_add, NULL, NULL);
	if (program == NULL) {
		printf("Failed to create CL program from source. \n");
		return EXIT_FAILURE;
	}

	errNum = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
	if (errNum != CL_SUCCESS) {
		char buildLog[16384];
		clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buildLog), buildLog, NULL);
		printf("Error in kernel:%s \n", buildLog);
		clReleaseProgram(program);
		return NULL;
	}

	cl_kernel kernel;
	kernel = clCreateKernel(program, "vector_add", NULL);
	if (kernel == NULL) {
		printf("Failed to create kernel \n");
		return EXIT_FAILURE;
	}

	cl_mem input_mem0 = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(float)*ARRAY_SIZE, a, NULL);
	cl_mem input_mem1 = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(float)*ARRAY_SIZE, b, NULL);
	cl_mem output_mem = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float)*ARRAY_SIZE, NULL, NULL);
	if (input_mem0 == NULL || input_mem1 == NULL || output_mem == NULL) {
		printf("Error creating memory objects. \n");
		return EXIT_FAILURE;
	}

	errNum = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input_mem0);
	errNum |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &input_mem1);
	errNum |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &output_mem);
	if (errNum != CL_SUCCESS) {
		printf("Error setting kernel arguments.\n");
		return EXIT_FAILURE;
	}

	size_t globalWorkSize[1] = { ARRAY_SIZE };
	size_t localWorkSize[1] = { 1 };
	//执行内核
	errNum = clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL);
	if (errNum != CL_SUCCESS) {
		printf("Error queuing kernel for execution. \n");

		return EXIT_FAILURE;
	}

	errNum = clEnqueueReadBuffer(commandQueue, output_mem, CL_TRUE, 0, ARRAY_SIZE * sizeof(float), result, 0, NULL, NULL);
	if (errNum != CL_SUCCESS) {
		printf("Error reading result buffer. \n");
		
		return EXIT_FAILURE;
	}

	for (int i = 0; i < ARRAY_SIZE; i++) {
		printf("a[%d]=%f b[%d]=%f result[%d]=%f\n", i, a[i], i, b[i], i, result[i]);
	}
	printf("Executed program succesfully. \n");

cleanup:
	if (input_mem0)
		clReleaseMemObject(input_mem0);
	if (input_mem1)
		clReleaseMemObject(input_mem1);
	if (output_mem)
		clReleaseMemObject(output_mem);
	if (commandQueue)
		clReleaseCommandQueue(commandQueue);
	if (kernel)
		clReleaseKernel(kernel);
	if (program)
		clReleaseProgram(program);
	if (context)
		clReleaseContext(context);

	return 0;
}
相关推荐
雾沉川4 分钟前
Flutter 入门开发环境完整搭建教程
学习·flutter
星夜夏空9913 分钟前
STM32单片机学习(37) —— PWR和BKP
stm32·单片机·学习
万岳科技18 分钟前
教育培训系统开发流程详解:平台建设关键环节解析
数据库·后端·学习
fanged19 分钟前
高通学习14--RB5(TODO)
学习
Tbisnic2 小时前
AI大模型学习第十四天:Coze项目实战中的分治智慧
人工智能·python·学习·大模型·工作流·智能体·coze
小风吹啊吹~2 小时前
通过时态图学习意图驱动识别足球控球比赛阶段 论文详解
学习·transformer·论文笔记·gan·足球战术·战术分析系统
阿i索2 小时前
【C++学习笔记】【基础】4.string类(2)——模拟实现
c++·笔记·学习
袁小皮皮不皮3 小时前
6.HCIP OSPF域间防环机制与虚链路
服务器·网络·笔记·网络协议·学习·智能路由器
一口吃俩胖子3 小时前
【脉宽调制DCDC功率变换学习笔记026】补偿设计和闭环性能
笔记·学习