CUDA性能测量与错误处理
- 讨论如何通过CUDA事件来测量CUDA程序的性能
- 讨论如何在CUDA代码中加入错误处理,以便进行调试
1.测量CUDA程序的性能
1.1 CUDA事件
- CPU端的计时器可能无法给出正确的内核执行时间(内核启动相对于主机端是异步执行的)
- CUDA事件相当于在CUDA应用运行的特定时刻记录下来的时间戳;通过使用CUDA事件API,这个时间戳由GPU来记录
- 使用CUDA事件测量时间需要两个步骤:创建事件和记录事件(分别记录开始时间与结束时间)
- 代码如下:
cpp
复制代码
#include "stdio.h"
#include<iostream>
#include <cuda.h>
#include <cuda_runtime.h>
//Defining number of elements in Array
#define N 50000
// Element-wise vector addition kernel: d_c[i] = d_a[i] + d_b[i] for i in [0, N).
// N is the file-level element count. Each thread starts at its global 1D index
// and then advances by the total number of launched threads (blockDim.x * gridDim.x),
// so the bounds check in the loop condition is what limits the work to N elements.
__global__ void gpuAdd(int* d_a, int* d_b, int* d_c) {
    // Total number of threads in the 1D launch; used as the per-iteration step.
    const unsigned int stride = blockDim.x * gridDim.x;
    for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < N; idx += stride) {
        d_c[idx] = d_a[idx] + d_b[idx];
    }
}
// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaCallOk(cudaError_t status, const char* what) {
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Adds two N-element vectors on the GPU, measures the elapsed time between two
// CUDA events, and verifies the result on the host.
// Returns 0 when every CUDA call succeeded (even if the verification reports a
// mismatch, as before) and 1 when any CUDA call failed.
int main(void) {
    // Host arrays use static storage so that 3 * N * sizeof(int) bytes are not
    // placed on the stack.
    static int h_a[N], h_b[N], h_c[N];
    // Device pointers start as NULL so the cleanup below is safe on every path.
    int *d_a = NULL, *d_b = NULL, *d_c = NULL;
    // Events used as start/stop timestamps.
    cudaEvent_t e_start = NULL, e_stop = NULL;
    float elapsedTime = 0.0f;
    bool ok = false;

    // Single-pass block: any failing CUDA call breaks out to the shared cleanup.
    do {
        if (!cudaCallOk(cudaEventCreate(&e_start), "cudaEventCreate")) break;
        if (!cudaCallOk(cudaEventCreate(&e_stop), "cudaEventCreate")) break;

        // First timestamp. Everything up to the stop event (allocation, host
        // initialization, copies and the kernel) falls inside the measured interval.
        if (!cudaCallOk(cudaEventRecord(e_start, 0), "cudaEventRecord")) break;

        // Allocate device memory.
        if (!cudaCallOk(cudaMalloc((void**)&d_a, N * sizeof(int)), "cudaMalloc")) break;
        if (!cudaCallOk(cudaMalloc((void**)&d_b, N * sizeof(int)), "cudaMalloc")) break;
        if (!cudaCallOk(cudaMalloc((void**)&d_c, N * sizeof(int)), "cudaMalloc")) break;

        // Initialize the host inputs. 2 * i * i exceeds INT_MAX once i >= 32768,
        // so the product is formed in unsigned arithmetic (which wraps) instead
        // of overflowing a signed int.
        for (int i = 0; i < N; i++) {
            h_a[i] = (int)(2u * (unsigned int)i * (unsigned int)i);
            h_b[i] = i;
        }

        // Copy input arrays from host to device memory.
        if (!cudaCallOk(cudaMemcpy(d_a, h_a, N * sizeof(int), cudaMemcpyHostToDevice), "cudaMemcpy")) break;
        if (!cudaCallOk(cudaMemcpy(d_b, h_b, N * sizeof(int), cudaMemcpyHostToDevice), "cudaMemcpy")) break;

        // Launch the kernel, then check for launch-configuration errors.
        gpuAdd<<<512, 512>>>(d_a, d_b, d_c);
        if (!cudaCallOk(cudaGetLastError(), "gpuAdd launch")) break;

        // Copy the result back to host memory.
        if (!cudaCallOk(cudaMemcpy(h_c, d_c, N * sizeof(int), cudaMemcpyDeviceToHost), "cudaMemcpy")) break;
        if (!cudaCallOk(cudaDeviceSynchronize(), "cudaDeviceSynchronize")) break;

        // Second timestamp, then wait until the stop event has actually completed.
        if (!cudaCallOk(cudaEventRecord(e_stop, 0), "cudaEventRecord")) break;
        if (!cudaCallOk(cudaEventSynchronize(e_stop), "cudaEventSynchronize")) break;

        // Elapsed time between the two events, in milliseconds.
        if (!cudaCallOk(cudaEventElapsedTime(&elapsedTime, e_start, e_stop), "cudaEventElapsedTime")) break;

        ok = true;
    } while (0);

    if (ok) {
        printf("Time to add %d numbers: %3.1f ms\n", N, elapsedTime);
        int Correct = 1;
        printf("Vector addition on GPU \n");
        // Verify on the host. The reference sum is formed in unsigned arithmetic
        // so that it wraps instead of overflowing a signed int.
        // NOTE(review): assumes the device-side int addition wraps the same way.
        for (int i = 0; i < N; i++) {
            const int expected = (int)((unsigned int)h_a[i] + (unsigned int)h_b[i]);
            if (expected != h_c[i]) {
                Correct = 0;
            }
        }
        if (Correct == 1) {
            printf("GPU has computed Sum Correctly\n");
        } else {
            printf("There is an Error in GPU Computation\n");
        }
    }

    // Release events and device memory on every path.
    if (e_start != NULL) {
        cudaEventDestroy(e_start);
    }
    if (e_stop != NULL) {
        cudaEventDestroy(e_stop);
    }
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    return ok ? 0 : 1;
}
1.2 NVIDIA Visual Profiler
- 如果你在程序中使用了CUDA,但代码的性能并未提升,在这种情况下,能够可视化地查看代码的哪些部分花费了最长的时间将非常有用,这叫做剖析(profiling)内核执行代码
- 英伟达为此提供了工具 `nvvp`(NVIDIA Visual Profiler),它就包含在标准的CUDA安装包里,可以在电脑的如下路径找到:`C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\libnvvp`
- 执行它需要安装java环境,即安装 `jdk8` 即可,可以去官网下载,也可以从我提供的链接(jdk8)下载,然后需要将以下两个路径配置到环境变量中:
  - `C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\extras\CUPTI\lib64`
  - `C:\Program Files\Java\jdk-1.8\bin`
- 打开nvvp 会出现如下窗口,此工具会分析你的代码执行过程,采集GPU上的性能数据,运行结束后会给你一个详细的报告,包括每个内核的执行时间,代码中每个详细操作的时间戳,以及代码存储器的使用情况
- 想要得到详细报告,可依次点击 `File -> New Session`,然后在弹出的对话框中选择程序的 `.exe` 文件
- Profiler 是分析内核执行情况的重要工具,它也可以用来比较两个内核的性能。它会告诉你究竟是代码里的何种操作拉低了性能
2. CUDA中的错误处理
- 如果系统中没有可用的GPU设备怎么办?显存不足怎么办?
- 学会在CUDA程序里边添加错误处理代码很有好处
cpp
复制代码
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
// Scalar addition kernel: reads the ints pointed to by d_a and d_b and stores
// their sum in the int pointed to by d_c. The kernel does no indexing, so every
// launched thread performs the same store.
__global__ void gpuAdd(int *d_a, int *d_b, int *d_c) {
    const int lhs = d_a[0];
    const int rhs = d_b[0];
    d_c[0] = lhs + rhs;
}
// Adds two ints on the GPU and checks the status of every CUDA runtime call.
// Any failure prints a message to stderr and jumps to the shared cleanup at `Error`.
// Returns 0 on success and 1 when any CUDA call failed.
int main()
{
    // Host variables: two inputs and the result.
    int h_a = 1, h_b = 4, h_c = 0;
    // Device pointers start as NULL so the cleanup at `Error` never passes an
    // indeterminate pointer to cudaFree when an early allocation fails.
    int *d_a = NULL, *d_b = NULL, *d_c = NULL;
    // Process exit status; set to 0 only after every step has succeeded.
    int exitCode = 1;
    // Status of the most recent CUDA runtime call.
    cudaError_t cudaStatus;

    // Allocate GPU buffers for the two inputs and the output.
    cudaStatus = cudaMalloc((void**)&d_c, sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }
    cudaStatus = cudaMalloc((void**)&d_a, sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }
    cudaStatus = cudaMalloc((void**)&d_b, sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    // Copy the inputs from host memory to the GPU buffers.
    cudaStatus = cudaMemcpy(d_a, &h_a, sizeof(int), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }
    cudaStatus = cudaMemcpy(d_b, &h_b, sizeof(int), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    // Launch the kernel with a single thread.
    gpuAdd<<<1, 1>>>(d_a, d_b, d_c);

    // A launch does not return a status; launch errors are read back here.
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "gpuAdd launch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    // Wait for the kernel to finish so that errors raised during execution are
    // reported here rather than being attributed to the following copy.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize failed after launching gpuAdd: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    // Copy the result from the GPU buffer back to host memory.
    cudaStatus = cudaMemcpy(&h_c, d_c, sizeof(int), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }
    printf("Passing Parameter by Reference Output: %d + %d = %d\n", h_a, h_b, h_c);
    exitCode = 0;

Error:
    // Shared cleanup for both the success path and every failure path.
    cudaFree(d_c);
    cudaFree(d_a);
    cudaFree(d_b);
    return exitCode;
}
- -----------------------END----------------------------