《cuda c编程权威指南》05 - cuda矩阵求和

cpp 复制代码
#include "cuda_runtime.h"
#include "device_launch_parameters.h"  // threadIdx

#include <stdio.h>    // io
#include <time.h>     // time_t
#include <stdlib.h>  // rand
#include <memory.h>  //memset


#define CHECK(call)                                   \
{                                                     \
    const cudaError_t error_code = call;              \
    if (error_code != cudaSuccess)                    \
    {                                                 \
        printf("CUDA Error:\n");                      \
        printf("    File:       %s\n", __FILE__);     \
        printf("    Line:       %d\n", __LINE__);     \
        printf("    Error code: %d\n", error_code);   \
        printf("    Error text: %s\n",                \
            cudaGetErrorString(error_code));          \
        exit(1);                                      \
    }                                                 \
}

/// <summary>
/// 矩阵相加,线性存储的二维矩阵
/// </summary>
/// <param name="h_a"></param>
/// <param name="h_b"></param>
/// <param name="h_c"></param>
/// <param name="nx"></param>
/// <param name="ny"></param>
void sumMatrixOnHost(float* h_a, float* h_b, float* h_c, const int nx, const int ny)
{
	float* ia = h_a;
	float* ib = h_b;
	float* ic = h_c;
	for (int iy = 0; iy < ny; iy++)
	{
		for (int ix = 0; ix < nx; ix++)  // 处理当前行
		{
			ic[ix] = ia[ix] + ib[ix];
		}
		ia += nx; ib += nx; ic += nx;  // 移动到下一行,ia下一行的第一个索引变成了0.
	}
}

// 去掉循环
__global__ void sumMatrixOnDevice2D(float* d_a, float* d_b, float* d_c, const int nx, const int ny)
{
	// 二维网格和二维块,映射到矩阵坐标
	unsigned int ix = threadIdx.x + blockIdx.x * blockDim.x;
	unsigned int iy = threadIdx.y + blockIdx.y * blockDim.y;
	// 由矩阵坐标, 映射到全局坐标(都是线性存储的)
	unsigned int idx = iy * nx + ix;  // 坐标(ix, iy),前面由iy行,每行有nx个元素
	// 相加
	if (ix < nx && iy < ny)  // 配置线程的可能过多,这里防止越界。
	{
		d_c[idx] = d_a[idx] + d_b[idx];
	}
	if (idx == 0)
		printf("%d, %d", idx, d_c[idx]);
}

void initialData(float* p, const int N)
{
	//generate different seed from random number
	time_t t;
	srand((unsigned int)time(&t));  // 生成种子

	for (int i = 0; i < N; i++)
	{
		p[i] = (float)(rand() & 0xFF) / 10.0f;  // 随机数
	}
}


void checkResult(float* hostRef, float* deviceRef, const int N)
{
	double eps = 1.0E-8;
	int match = 1;
	for (int i = 0; i < N; i++)
	{
		if (hostRef[i] - deviceRef[i] > eps)
		{
			match = 0;
			printf("\nArrays do not match\n");
			printf("host %5.2f gpu %5.2f at current %d\n", hostRef[i], deviceRef[i], i);
			break;
		}
	}
	if (match)
		printf("\nArrays match!\n");
}


int main(void)
{
	// get device info
	int device = 0;
	cudaDeviceProp deviceProp;
	CHECK(cudaGetDeviceProperties(&deviceProp, device));
	printf("Using device: %d %s", device, deviceProp.name);  // 卡号0的显卡名称。
	CHECK(cudaSetDevice(device));  // 设置显卡号

	// set matrix dimension. 2^14 = 16384行列数
	int nx = 1<<14, ny =1<<14, nxy = nx * ny;
	//int nx = 1 << 13, ny = 1 << 13, nxy = nx * ny;
	int nBytes = nxy * sizeof(float);

	// malloc host memory
	float* h_a, * h_b, * hostRef, * gpuRef;
	h_a = (float*)malloc(nBytes);
	h_b = (float*)malloc(nBytes);
	hostRef = (float*)malloc(nBytes); // 主机端求得的结果
	gpuRef = (float*)malloc(nBytes);  // 设备端拷回的数据
	// init data
	initialData(h_a, nxy);
	initialData(h_b, nxy);
	memset(hostRef, 0, nBytes);
	memset(gpuRef, 0, nBytes);
	// add matrix on host side for result checks.
	sumMatrixOnHost(h_a, h_b, hostRef, nx, ny);

	// malloc device memory
	float* d_mat_a, * d_mat_b, * d_mat_c;
	cudaMalloc((void**)&d_mat_a, nBytes);
	cudaMalloc((void**)&d_mat_b, nBytes);
	cudaMalloc((void**)&d_mat_c, nBytes);

	// transfer data from host to device
	cudaMemcpy(d_mat_a, h_a, nBytes, cudaMemcpyHostToDevice);
	cudaMemcpy(d_mat_b, h_b, nBytes, cudaMemcpyHostToDevice);

	// config
	int dimx = 32;
	int dimy = 32;
	dim3 block(dimx, dimy);  // 二维线程块(x,y)=(4,2)
	dim3 grid((nx + block.x - 1) / block.x, (ny + block.y - 1) / block.y); // 二维网格(2,3)
	// 直接nx/block.x = 8/4=2. (8+4-1)/4=2.

	// invoke kernel
	sumMatrixOnDevice2D << <grid, block >> > (d_mat_a, d_mat_b, d_mat_c, nx, ny);
	CHECK(cudaDeviceSynchronize());

	// check kernel error
	CHECK(cudaGetLastError());  // 一定要加上这一行,不然上面求和有错不会中断也不提示。

	// copy kernel result back to host side
	cudaMemcpy(gpuRef, d_mat_c, nBytes, cudaMemcpyDeviceToHost);

	// check result
	checkResult(hostRef, gpuRef, nxy);

	// free memory
	cudaFree(d_mat_a);
	cudaFree(d_mat_b);
	cudaFree(d_mat_c);
	free(h_a);
	free(h_b);
	free(hostRef);
	free(gpuRef);

	// reset device
	cudaDeviceReset();

	return 0;
}

注意:如果没有下面这一行,会报错。

cpp 复制代码
	// check kernel error
	CHECK(cudaGetLastError());  // 一定要加上这一行,不然上面求和有错不会中断也不提示。

错误信息:

File: D:/zxq/code/cuda/CUDA111/CUDA111/kernel.cu

Line: 144

Error code: 700

Error text: an illegal memory access was encountered

1<<14会报矩阵求和不一致错误,1<<13即可。

相关推荐
fpcc4 天前
并行编程实战——CUDA编程的图之六子图的创建
人工智能·cuda
明月醉窗台5 天前
[jetson] AGX Xavier 安装Ubuntu18.04及jetpack4.5
人工智能·算法·nvidia·cuda·jetson
飞翔的SA5 天前
全程 Python:无需离开 Python 即可实现光速级 CUDA 加速,无需c++支持
开发语言·c++·python·nvidia·cuda
阿钱真强道10 天前
01 飞腾 S5000C 服务器环境搭建实战:PyTorch + CUDA + RTX 4090D 安装与验证
pytorch·cuda·aarch64·深度学习环境搭建·飞腾服务器·s5000c·rtx4090d
酌量13 天前
nvidia orin agx刷机忘记CUDA runtime,安装torch和cuda
linux·笔记·ubuntu·torch·cuda·agx
明月醉窗台14 天前
[Jetson] NVIDIA Jetson 全系列边缘计算芯片进阶指南
人工智能·边缘计算·cuda·jetson
明月醉窗台18 天前
[Jetson] 在Jetson Xavier AGX编译opencv+cuda
人工智能·opencv·计算机视觉·cuda·jetson
专注VB编程开发20年19 天前
CUDA实现随机切割算法,显卡多线程计算
算法·cuda
KIDGINBROOK23 天前
NVIDIA NCCL 源码学习(十七)- LL和LL128协议
cuda·rdma·nccl
老鱼说AI23 天前
大规模并发处理器程序设计(PMPP)讲解(CUDA架构):第四期:计算架构与调度
c语言·深度学习·算法·架构·cuda