在2.0中,我们用一维网格(grid)和一维块(block)对一维向量进行了求和。
在2.1中例化了二维的网格和块。
接下来使用2维网格(grid)和2维块(block)对矩阵进行求和。
python
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <windows.h>
typedef unsigned long DWORD;
/*
 * CHECK(call): evaluate a CUDA runtime call and abort the program if it did
 * not return cudaSuccess.
 *
 * On failure the source location, the numeric error code and the text from
 * cudaGetErrorString() are written to stderr, then the process exits with
 * status 1.
 *
 * The body is wrapped in do { ... } while (0) so that `CHECK(x);` behaves as
 * a single statement (safe inside an unbraced if/else). The argument is
 * parenthesized, and the local variable carries a trailing underscore so it
 * cannot collide with an identifier named `error` used inside `call`.
 */
#define CHECK(call) \
do {\
    const cudaError_t check_err_ = (call); \
    if (check_err_ != cudaSuccess)\
    {\
        fprintf(stderr, "Error: %s: %d\n", __FILE__, __LINE__);\
        fprintf(stderr, "code :%d reason :%s\n", (int) check_err_, cudaGetErrorString(check_err_));\
        exit(1);\
    }\
} while (0)
/*
 * Compare two float arrays element by element and report the outcome on
 * stdout.
 *
 * hostRef  reference values (computed on the host by the caller)
 * gpuRef   values to verify (copied back from the device by the caller)
 * N        number of elements to compare
 *
 * Prints the first mismatching pair and its index, then stops; prints
 * "array matches" when every pair differs by at most epsilon.
 */
void checkResult(float *hostRef, float *gpuRef, const int N)
{
    const double epsilon = 1.0E-8;
    bool match = true;
    for (int i = 0; i < N; i++)
    {
        // Use the floating-point fabs(): a plain abs() may resolve to the
        // integer overload from <stdlib.h>, which truncates any difference
        // below 1.0 to 0 and silently hides mismatches.
        const double diff = fabs((double) hostRef[i] - (double) gpuRef[i]);
        if (diff > epsilon)
        {
            match = false;
            printf("Array do not match\n");
            printf("host %5.2f gpu % 5.2f at current %d\n", hostRef[i], gpuRef[i], i);
            break;
        }
    }
    if (match) printf("array matches\n");
}
/*
 * Fill a float buffer with pseudo-random values.
 *
 * ip    destination buffer, must hold at least `size` floats
 * size  number of elements to write (nothing is written when size <= 0)
 *
 * Each element is (rand() & 0xff) / 10.0f, i.e. a value in [0.0, 25.5].
 */
void initialData(float *ip, int size)
{
    // Seed the generator only once per process. Re-seeding with time() on
    // every call makes two calls within the same second produce identical
    // sequences, so consecutive buffers would hold the same data.
    static int seeded = 0;
    if (!seeded) {
        srand((unsigned int) time(NULL));
        seeded = 1;
    }
    for (int i = 0; i < size; i++) {
        ip[i] = (float) (rand() & 0xff) / 10.0f;
    }
}
/*
 * Host-side element-wise matrix addition: C = A + B.
 *
 * A, B  input matrices, stored row by row with nx elements per row
 * C     output matrix with the same layout
 * nx    number of elements per row
 * ny    number of rows
 */
void sumMatrixOnHost(float *A, float *B, float *C, const int nx, const int ny){
    for (int row = 0; row < ny; ++row) {
        // Offset of the first element of this row in the flat arrays.
        const size_t rowStart = (size_t) row * (size_t) nx;
        for (int col = 0; col < nx; ++col) {
            const size_t k = rowStart + (size_t) col;
            C[k] = A[k] + B[k];
        }
    }
}
/*
 * Element-wise matrix addition on the device: MatC = MatA + MatB.
 *
 * Launch layout: 2D grid of 2D blocks. Each thread derives a column index
 * (ix) from the x dimension and a row index (iy) from the y dimension and
 * handles at most one element; threads outside the nx-by-ny range do nothing.
 *
 * MatA, MatB  input matrices, indexed as iy * nx + ix
 * MatC        output matrix, same indexing
 * nx          number of elements per row
 * ny          number of rows
 */
__global__ void sumMatrixOnGPU2D(float *MatA, float *MatB, float *MatC, int nx, int ny){
    // A non-positive dimension means there is nothing to do. Without this
    // guard, a negative value converted to unsigned in the comparison below
    // would let every thread through.
    if (nx <= 0 || ny <= 0) return;

    const unsigned int ix = threadIdx.x + blockIdx.x * blockDim.x;
    const unsigned int iy = threadIdx.y + blockIdx.y * blockDim.y;

    if (ix < (unsigned int) nx && iy < (unsigned int) ny){
        // Compute the flat index in size_t so iy * nx cannot wrap around
        // 32 bits for very large matrices.
        const size_t idx = (size_t) iy * (size_t) nx + (size_t) ix;
        MatC[idx] = MatA[idx] + MatB[idx];
    }
}
/*
 * Driver: adds two nx-by-ny float matrices on the GPU with a 2D grid of 2D
 * blocks, times the kernel with CUDA events, and verifies the result against
 * a host computation.
 *
 * Every CUDA runtime call is wrapped in CHECK so a failure stops the program
 * at the call that caused it instead of surfacing later as a wrong result.
 * Returns 0 on success, 1 if host memory cannot be allocated.
 */
int main(int argc , char **argv)
{
    printf("%s starting\n", argv[0]);

    // Query and select the device.
    int dev = 0;
    cudaDeviceProp deviceprop;
    CHECK(cudaGetDeviceProperties(&deviceprop,dev));
    printf("Using Device %d : %s\n", dev, deviceprop.name);
    CHECK(cudaSetDevice(dev));

    // set up data
    int nx = 1<<14;
    int ny = 1<<14;
    int nxy = nx * ny;
    size_t nBytes = (size_t) nxy * sizeof(float);
    printf("matrix size %d %d\n", nx, ny);

    // Host buffers. Each one is nBytes large, so allocation failure is a
    // realistic possibility and must be checked before the buffers are used.
    float *h_A     = (float *) malloc (nBytes);
    float *h_B     = (float *) malloc (nBytes);
    float *hostRef = (float *) malloc (nBytes);
    float *gpuRef  = (float *) malloc (nBytes);
    if (h_A == NULL || h_B == NULL || hostRef == NULL || gpuRef == NULL)
    {
        fprintf(stderr, "Error: failed to allocate host buffers of %llu bytes each\n",
                (unsigned long long) nBytes);
        free(h_A);
        free(h_B);
        free(hostRef);
        free(gpuRef);
        return 1;
    }

    initialData(h_A, nxy);
    initialData(h_B, nxy);
    memset(hostRef,0, nBytes);
    memset(gpuRef,0, nBytes);

    // malloc device global memory
    float *d_MatA, *d_MatB, *d_MatC;
    CHECK(cudaMalloc((float**)&d_MatA, nBytes));
    CHECK(cudaMalloc((float**)&d_MatB, nBytes));
    CHECK(cudaMalloc((float**)&d_MatC, nBytes));

    // transfer data from host to device
    CHECK(cudaMemcpy(d_MatA, h_A, nBytes, cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(d_MatB, h_B, nBytes, cudaMemcpyHostToDevice));

    // Launch configuration: round the grid up so it covers the whole matrix
    // even when nx/ny are not multiples of the block size.
    int dimx = 32;
    int dimy = 32;
    dim3 block(dimx, dimy);
    dim3 grid((nx + block.x - 1)/block.x, (ny + block.y - 1)/block.y);

    cudaEvent_t start, stop;
    CHECK(cudaEventCreate(&start));
    CHECK(cudaEventCreate(&stop));

    // Record the stop event directly after the launch so the measured
    // interval is not inflated by the host-side wait in
    // cudaDeviceSynchronize().
    CHECK(cudaEventRecord(start));
    sumMatrixOnGPU2D<<<grid,block>>>(d_MatA, d_MatB, d_MatC, nx, ny);
    CHECK(cudaGetLastError());        // launch-configuration errors
    CHECK(cudaEventRecord(stop));
    CHECK(cudaDeviceSynchronize());   // errors raised while the kernel ran
    CHECK(cudaEventSynchronize(stop));

    float milliseconds = 0;
    CHECK(cudaEventElapsedTime(&milliseconds, start, stop));
    // dim3 members are unsigned, hence %u.
    printf("execution config <<<(%u,%u), (%u,%u)>>>\n", grid.x,grid.y, block.x, block.y);
    printf("Kernel execution time: %f ms\n", milliseconds);
    CHECK(cudaEventDestroy(start));
    CHECK(cudaEventDestroy(stop));

    // copy kernel result back to host
    CHECK(cudaMemcpy(gpuRef, d_MatC, nBytes, cudaMemcpyDeviceToHost));

    // Compute the reference on the host and compare.
    sumMatrixOnHost(h_A, h_B, hostRef, nx,ny);
    checkResult(hostRef, gpuRef, nxy);

    CHECK(cudaFree(d_MatA));
    CHECK(cudaFree(d_MatB));
    CHECK(cudaFree(d_MatC));
    free(h_A);
    free(h_B);
    free(hostRef);
    free(gpuRef);
    return 0;
}
基本流程和1维向量求和类似
输出结果:
Using Device 0 : NVIDIA GeForce RTX 4090
matrix size 16384 16384
execution config <<<(512,512), (32,32)>>>
Kernel execution time: 5.351136 ms
array matches
上面的结果中,block的尺寸为32x32,由代码中的 block(dimx, dimy) 定义。
改变block尺寸为32x16:
execution config <<<(512,1024), (32,16)>>>
Kernel execution time: 3.778752 ms
进一步改变block尺寸为16x16:
execution config <<<(1024,1024), (16,16)>>>
Kernel execution time: 3.712736 ms
之前尝试使用nvprof测试kernel性能时,报告了如下警告:
======= Warning: nvprof is not supported on devices with compute capability 8.0 and higher.
参考 https://blog.csdn.net/TH_NUM/article/details/109952643 使用nsys
将C:\Program Files\NVIDIA Corporation\Nsight Systems 2024.5.1\target-windows-x64加入环境变量即可
python
nsys profile --stats=true .\sum_matrix_on_gpu_timer.exe
输出:
python
Collecting data...
Generating 'C:\Users\ADMINI~1\AppData\Local\Temp\nsys-report-ffa3.qdstrm'
[1/8] [========================100%] report2.nsys-rep
[2/8] [========================100%] report2.sqlite
[3/8] Executing 'nvtx_sum' stats report
SKIPPED: C:\Users\Administrator\Desktop\edward_temp\chapter2\report2.sqlite does not contain NV Tools Extension (NVTX) data.
[4/8] Executing 'osrt_sum' stats report
SKIPPED: No data available.
[5/8] Executing 'cuda_api_sum' stats report
Time (%) Total Time (ns) Num Calls Avg (ns) Med (ns) Min (ns) Max (ns) StdDev (ns) Name
-------- --------------- --------- ----------- ---------- -------- --------- ----------- ----------------------
93.3 321764988 3 107254996.0 91069908.0 83897570 146797510 34432084.1 cudaMemcpy
4.0 13772507 3 4590835.7 4393180.0 3984976 5394351 725179.5 cudaFree
1.5 5118078 3 1706026.0 1249576.0 819401 3049101 1182856.9 cudaMalloc
1.0 3496955 1 3496955.0 3496955.0 3496955 3496955 0.0 cudaDeviceSynchronize
0.1 459711 1 459711.0 459711.0 459711 459711 0.0 cudaLaunchKernel
0.0 49593 2 24796.5 24796.5 707 48886 34067.7 cudaEventCreate
0.0 22341 1 22341.0 22341.0 22341 22341 0.0 cuLibraryUnload
0.0 18196 2 9098.0 9098.0 7920 10276 1665.9 cudaEventRecord
0.0 15060 1 15060.0 15060.0 15060 15060 0.0 cudaEventSynchronize
0.0 1961 1 1961.0 1961.0 1961 1961 0.0 cuCtxSynchronize
0.0 1434 1 1434.0 1434.0 1434 1434 0.0 cuModuleGetLoadingMode
0.0 1012 2 506.0 506.0 205 807 425.7 cudaEventDestroy
0.0 181 1 181.0 181.0 181 181 0.0 cuDeviceGetLuid
[6/8] Executing 'cuda_gpu_kern_sum' stats report
Time (%) Total Time (ns) Instances Avg (ns) Med (ns) Min (ns) Max (ns) StdDev (ns) Name
-------- --------------- --------- --------- --------- -------- -------- ----------- -----------------------------------------------------
100.0 3453326 1 3453326.0 3453326.0 3453326 3453326 0.0 sumMatrixOnGPU2D(float *, float *, float *, int, int)
[7/8] Executing 'cuda_gpu_mem_time_sum' stats report
Time (%) Total Time (ns) Count Avg (ns) Med (ns) Min (ns) Max (ns) StdDev (ns) Operation
-------- --------------- ----- ---------- ---------- -------- -------- ----------- ----------------------------
68.3 180949528 2 90474764.0 90474764.0 89939258 91010270 757319.8 [CUDA memcpy Host-to-Device]
31.7 83834368 1 83834368.0 83834368.0 83834368 83834368 0.0 [CUDA memcpy Device-to-Host]
[8/8] Executing 'cuda_gpu_mem_size_sum' stats report
Total (MB) Count Avg (MB) Med (MB) Min (MB) Max (MB) StdDev (MB) Operation
---------- ----- -------- -------- -------- -------- ----------- ----------------------------
2147.484 2 1073.742 1073.742 1073.742 1073.742 0.000 [CUDA memcpy Host-to-Device]
1073.742 1 1073.742 1073.742 1073.742 1073.742 0.000 [CUDA memcpy Device-to-Host]
Generated:
C:\Users\Administrator\Desktop\edward_temp\chapter2\report2.nsys-rep
C:\Users\Administrator\Desktop\edward_temp\chapter2\report2.sqlite