《cuda c编程权威指南》05 - cuda矩阵求和

cpp 复制代码
#include "cuda_runtime.h"
#include "device_launch_parameters.h"  // threadIdx

#include <math.h>     // fabs
#include <memory.h>   // memset
#include <stdio.h>    // io
#include <stdlib.h>   // rand
#include <time.h>     // time_t


// Evaluates a CUDA runtime call once and, if it did not return cudaSuccess,
// prints the file, line, numeric error code and error text, then exits with
// status 1.
// The body is wrapped in do { ... } while (0) so that `CHECK(x);` behaves as a
// single statement (safe inside an un-braced if/else), and `call` is
// parenthesized so any expression can be passed.
#define CHECK(call)                                   \
do                                                    \
{                                                     \
    const cudaError_t error_code = (call);            \
    if (error_code != cudaSuccess)                    \
    {                                                 \
        printf("CUDA Error:\n");                      \
        printf("    File:       %s\n", __FILE__);     \
        printf("    Line:       %d\n", __LINE__);     \
        printf("    Error code: %d\n", error_code);   \
        printf("    Error text: %s\n",                \
            cudaGetErrorString(error_code));          \
        exit(1);                                      \
    }                                                 \
} while (0)

/// <summary>
/// Element-wise sum of two matrices on the host. Each matrix is a 2D matrix
/// stored linearly, one row after another, with nx elements per row.
/// </summary>
/// <param name="h_a">First input matrix (nx * ny floats).</param>
/// <param name="h_b">Second input matrix (nx * ny floats).</param>
/// <param name="h_c">Output matrix; receives h_a[i] + h_b[i] for every element.</param>
/// <param name="nx">Number of elements per row.</param>
/// <param name="ny">Number of rows.</param>
void sumMatrixOnHost(float* h_a, float* h_b, float* h_c, const int nx, const int ny)
{
	for (int row = 0; row < ny; ++row)
	{
		// Linear offset of the first element of this row.
		const long long rowStart = (long long)row * nx;
		for (int col = 0; col < nx; ++col)
		{
			const long long pos = rowStart + col;
			h_c[pos] = h_a[pos] + h_b[pos];
		}
	}
}

// Element-wise matrix sum on the device, with the host loops removed: each
// thread handles at most one element.
// Launch layout: a 2D grid of 2D blocks; the x dimension indexes the position
// within a row (0..nx-1) and the y dimension indexes the row (0..ny-1).
// d_a, d_b and d_c are linearly stored matrices with nx elements per row.
__global__ void sumMatrixOnDevice2D(float* d_a, float* d_b, float* d_c, const int nx, const int ny)
{
	// Map the 2D grid / 2D block coordinates to matrix coordinates.
	unsigned int ix = threadIdx.x + blockIdx.x * blockDim.x;
	unsigned int iy = threadIdx.y + blockIdx.y * blockDim.y;

	// More threads than elements may have been launched; skip the excess ones.
	if (ix < nx && iy < ny)
	{
		// Map matrix coordinates (ix, iy) to the linear index: iy full rows of
		// nx elements come first. Computed in size_t so it cannot wrap at 32 bits.
		const size_t idx = (size_t)iy * nx + ix;
		d_c[idx] = d_a[idx] + d_b[idx];

		// Debug output for the first element only. Kept inside the bounds guard
		// so d_c is never read out of range; the value is a float, so it must be
		// printed with %f rather than %d.
		if (idx == 0)
			printf("%llu, %f", (unsigned long long)idx, d_c[idx]);
	}
}

/// <summary>
/// Fills p[0..N-1] with pseudo-random values in the range [0.0, 25.5]
/// (a random byte 0..255 divided by 10).
/// </summary>
/// <param name="p">Destination array with room for at least N floats.</param>
/// <param name="N">Number of elements to fill; nothing is written when N is not positive.</param>
void initialData(float* p, const int N)
{
	// Seed the generator only on the first call. Re-seeding with time() on
	// every call makes two calls within the same second produce identical
	// sequences, so both input matrices would hold the same data.
	static int seeded = 0;
	if (!seeded)
	{
		srand((unsigned int)time(NULL));
		seeded = 1;
	}

	for (int i = 0; i < N; i++)
	{
		p[i] = (float)(rand() & 0xFF) / 10.0f;  // low 8 bits of rand(), scaled by 1/10
	}
}


/// <summary>
/// Compares the host result with the result copied back from the device,
/// element by element. Prints the first mismatch (values and index) and stops,
/// or prints "Arrays match!" when every element agrees within the tolerance.
/// </summary>
/// <param name="hostRef">Result computed on the host (N floats).</param>
/// <param name="deviceRef">Result copied back from the device (N floats).</param>
/// <param name="N">Number of elements to compare.</param>
void checkResult(float* hostRef, float* deviceRef, const int N)
{
	const double eps = 1.0E-8;
	int match = 1;
	for (int i = 0; i < N; i++)
	{
		// Compare the absolute difference: a signed difference would miss every
		// element where the device value is larger than the host value.
		if (fabs((double)hostRef[i] - (double)deviceRef[i]) > eps)
		{
			match = 0;
			printf("\nArrays do not match\n");
			printf("host %5.2f gpu %5.2f at current %d\n", hostRef[i], deviceRef[i], i);
			break;
		}
	}
	if (match)
		printf("\nArrays match!\n");
}


/// <summary>
/// Adds two nx-by-ny float matrices on the host and on the GPU and compares
/// the two results. Every CUDA runtime call is wrapped in CHECK so that a
/// failure stops the program at the call that caused it.
/// </summary>
/// <returns>0 on success, 1 if a host allocation fails.</returns>
int main(void)
{
	// get device info
	int device = 0;
	cudaDeviceProp deviceProp;
	CHECK(cudaGetDeviceProperties(&deviceProp, device));
	printf("Using device: %d %s", device, deviceProp.name);  // name of device 0
	CHECK(cudaSetDevice(device));  // select the device

	// set matrix dimension. 2^14 = 16384 rows and columns
	int nx = 1 << 14, ny = 1 << 14, nxy = nx * ny;
	//int nx = 1 << 13, ny = 1 << 13, nxy = nx * ny;
	// Byte count kept in size_t so it is not truncated to int.
	const size_t nBytes = (size_t)nxy * sizeof(float);

	// malloc host memory
	float* h_a = (float*)malloc(nBytes);
	float* h_b = (float*)malloc(nBytes);
	float* hostRef = (float*)malloc(nBytes);  // result computed on the host
	float* gpuRef = (float*)malloc(nBytes);   // result copied back from the device
	if (h_a == NULL || h_b == NULL || hostRef == NULL || gpuRef == NULL)
	{
		printf("Host allocation failed (%llu bytes per matrix)\n", (unsigned long long)nBytes);
		free(h_a);
		free(h_b);
		free(hostRef);
		free(gpuRef);
		return 1;
	}

	// init data
	initialData(h_a, nxy);
	initialData(h_b, nxy);
	memset(hostRef, 0, nBytes);
	memset(gpuRef, 0, nBytes);
	// add matrix on host side for result checks.
	sumMatrixOnHost(h_a, h_b, hostRef, nx, ny);

	// malloc device memory; a failed allocation must stop the program here,
	// otherwise the kernel would run on invalid pointers.
	float* d_mat_a = NULL;
	float* d_mat_b = NULL;
	float* d_mat_c = NULL;
	CHECK(cudaMalloc((void**)&d_mat_a, nBytes));
	CHECK(cudaMalloc((void**)&d_mat_b, nBytes));
	CHECK(cudaMalloc((void**)&d_mat_c, nBytes));

	// transfer data from host to device
	CHECK(cudaMemcpy(d_mat_a, h_a, nBytes, cudaMemcpyHostToDevice));
	CHECK(cudaMemcpy(d_mat_b, h_b, nBytes, cudaMemcpyHostToDevice));

	// config
	int dimx = 32;
	int dimy = 32;
	dim3 block(dimx, dimy);  // 2D thread block of 32 x 32 threads
	// 2D grid; rounding up so a partial block covers any remainder,
	// e.g. nx = 8, block.x = 4: (8 + 4 - 1) / 4 = 2.
	dim3 grid((nx + block.x - 1) / block.x, (ny + block.y - 1) / block.y);

	// invoke kernel
	sumMatrixOnDevice2D<<<grid, block>>>(d_mat_a, d_mat_b, d_mat_c, nx, ny);

	// check kernel launch error right after the launch; without this a bad
	// launch neither stops the program nor prints anything.
	CHECK(cudaGetLastError());
	// wait for the kernel to finish; errors raised while it runs surface here
	CHECK(cudaDeviceSynchronize());

	// copy kernel result back to host side
	CHECK(cudaMemcpy(gpuRef, d_mat_c, nBytes, cudaMemcpyDeviceToHost));

	// check result
	checkResult(hostRef, gpuRef, nxy);

	// free memory
	CHECK(cudaFree(d_mat_a));
	CHECK(cudaFree(d_mat_b));
	CHECK(cudaFree(d_mat_c));
	free(h_a);
	free(h_b);
	free(hostRef);
	free(gpuRef);

	// reset device
	CHECK(cudaDeviceReset());

	return 0;
}

注意:kernel 启动之后一定要加上下面这一行检查;如果没有它,kernel 出错时程序既不会中断,也不会给出任何提示。

cpp 复制代码
	// check kernel error
	CHECK(cudaGetLastError());  // Required: without this line, an error in the sum above neither stops the program nor prints anything.

错误信息:

File: D:/zxq/code/cuda/CUDA111/CUDA111/kernel.cu

Line: 144

Error code: 700

Error text: an illegal memory access was encountered

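当 nx = ny = 1<<14 时会报上述错误(或矩阵求和结果不一致):此时每个矩阵占 1 GiB,设备端三个矩阵共需约 3 GiB 显存;如果显存不足,cudaMalloc 会失败,而上面的代码没有检查它的返回值,kernel 随后访问无效指针,很可能就是非法内存访问的原因。改为 1<<13(每个矩阵 256 MiB)即可正常运行。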

相关推荐
碧海潮生_CC1 天前
【CUDA笔记】04 CUDA 归约, 原子操作,Warp 交换
笔记·cuda
fpcc5 天前
并行编程实战——CUDA编程的流的优先级
c++·cuda
碧海潮生_CC6 天前
【CUDA笔记】03 CUDA GPU 架构与一般的程序优化思路(下)
笔记·架构·cuda
中医正骨葛大夫7 天前
一文解决如何在Pycharm中创建cuda深度学习环境?
pytorch·深度学习·pycharm·软件安装·cuda·anaconda·配置环境
lvxiangyu1112 天前
wsl2 ubuntu24 opengl 无法使用nvidia显卡 解决方法记录
wsl·cuda·opengl
李昊哲小课12 天前
wsl ubuntu24.04 cuda13 cudnn9 pytorch 显卡加速
人工智能·pytorch·python·cuda·cudnn
wanzhong233313 天前
CUDA学习2-CPU和GPU的性能优化
深度学习·gpu·cuda·高性能计算
碧海潮生_CC19 天前
【CUDA笔记】01-入门简介
笔记·cuda
喆星时瑜22 天前
关于 ComfyUI 的 Windows 本地部署系统环境教程(详细讲解Windows 10/11、NVIDIA GPU、Python、PyTorch环境等)
python·cuda·comfyui
安全二次方security²1 个月前
CUDA C++编程指南(1)——简介
nvidia·cuda·c/c++·device·cuda编程·architecture·compute unified