技术背景

在CUDA编程中有可能会遇到一些相对比较隐蔽的报错，但是直接编译运行cu文件是不显现的。那么可以通过添加一个用于检查的宏，来监测CUDA程序运行过程中可能出现的报错。

error.cuh

我们在CUDA头文件中实现这个宏：

cuh 复制代码

#pragma once
#include <stdio.h>

#define CHECK(call) do{const cudaError_t error_code = call; if (error_code != cudaSuccess){printf("CUDA Error:\n"); printf("    File:   %s\n", __FILE__); printf("    Line:   %d\n", __LINE__); printf("    Error code: %d\n", error_code); printf("    Error text: %s\n", cudaGetErrorString(error_code)); exit(1);}} while (0)

然后在调用CUDA相关函数或者核函数的时候，就可以使用CHECK操作来监测其中有无相关异常。

调用测试

先用一个简单的测试案例，就是显存分配的场景，如果是一个正常的显存分配：

cu 复制代码

// nvcc ./test_error.cu -Xcompiler -fPIC -o ./test_error && ./test_error
#include "error.cuh"
#include <stdio.h>

int main(void){
    const int N = 100000000;
    const int M = sizeof(double) * N;
    double *d_x;
    CHECK(cudaMalloc((void **)&d_x, M));
    CHECK(cudaFree(d_x));
    printf("Success!\n");
}

运行结果是没有报错的：

bash 复制代码

Success!

但是如果我们调大N的值，使其超出显存大小：

cu 复制代码

// nvcc ./test_error.cu -Xcompiler -fPIC -o ./test_error && ./test_error
#include "error.cuh"
#include <stdio.h>

int main(void){
    const int N = 1000000000;
    const int M = sizeof(double) * N;
    double *d_x;
    CHECK(cudaMalloc((void **)&d_x, M));
    CHECK(cudaFree(d_x));
    printf("Success!\n");
}

再次运行，就会报OOM错误：

bash 复制代码

./test_error.cu(7): warning #69-D: integer conversion resulted in truncation
      const int M = sizeof(double) * N;
                    ^

Remark: The warnings can be suppressed with "-diag-suppress <warning-number>"

./test_error.cu(9): warning #68-D: integer conversion resulted in a change of sign
      do{const cudaError_t error_code = cudaMalloc((void **)&d_x, M); if (error_code != cudaSuccess){printf("CUDA Error:\n"); printf("    File:   %s\n", "./test_error.cu"); printf("    Line:   %d\n", 9); printf("    Error code: %d\n", error_code); printf("    Error text: %s\n", cudaGetErrorString(error_code)); exit(1);}} while (0);
                                                                  ^

./test_error.cu(7): warning #69-D: integer conversion resulted in truncation
      const int M = sizeof(double) * N;
                    ^

Remark: The warnings can be suppressed with "-diag-suppress <warning-number>"

./test_error.cu(9): warning #68-D: integer conversion resulted in a change of sign
      do{const cudaError_t error_code = cudaMalloc((void **)&d_x, M); if (error_code != cudaSuccess){printf("CUDA Error:\n"); printf("    File:   %s\n", "./test_error.cu"); printf("    Line:   %d\n", 9); printf("    Error code: %d\n", error_code); printf("    Error text: %s\n", cudaGetErrorString(error_code)); exit(1);}} while (0);
                                                                  ^

./test_error.cu: In function 'int main()':
./test_error.cu:7:31: warning: overflow in conversion from 'long unsigned int' to 'int' changes value from '8000000000' to '-589934592' [-Woverflow]
    7 |     const int M = sizeof(double) * N;
      |               ~~~~~~~~~~~~~~~~^~~~
CUDA Error:
    File:   ./test_error.cu
    Line:   9
    Error code: 2
    Error text: out of memory

当然，中间因为整形溢出，还有一些其他的warnning信息，但是这里主要要展现的是OOM报错问题。

核函数检测

上面的异常检测针对是cudaMalloc这个CUDA操作，其实对于核函数，也是一样可以检测出其异常。我们先演示一个正常的示例：

cu 复制代码

// nvcc ./test_error.cu -Xcompiler -fPIC -o ./test_error && chmod +x ./test_error && ./test_error
#include "error.cuh"
#include <math.h>
#include <stdio.h>

void __global__ add(const double *x, const double *y, double *z, const int N){
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < N){
        z[idx] = x[idx] + y[idx];
    }
}

int main(void){
    const int N = 10;
    const int M = sizeof(double) * N;
    const double a = 1.23;
    double *h_x = (double*) malloc(M);
    for (int n = 0; n < N; ++n)
    {
        h_x[n] = a;
    }
    double *d_x, *d_z;
    CHECK(cudaMalloc((void **)&d_x, M));
    CHECK(cudaMalloc((void **)&d_z, M));
    CHECK(cudaMemcpy(d_x, h_x, M, cudaMemcpyHostToDevice));
    const int block_size = 1024;
    const int grid_size = (N + block_size - 1) / block_size;
    add<<<grid_size, block_size>>>(d_x, d_x, d_z, N);
    CHECK(cudaGetLastError());
    CHECK(cudaDeviceSynchronize());
    CHECK(cudaFree(d_x));
    CHECK(cudaFree(d_z));
    free(h_x);
    printf("Success!\n");
    return 0;
}

这个CUDA程序运行的是一个数组加法。运行结果：

bash 复制代码

$ nvcc ./test_error.cu -Xcompiler -fPIC -o ./test_error && chmod +x ./test_error && ./test_error
Success!

调整一下block_size参数：

cu 复制代码

// nvcc ./test_error.cu -Xcompiler -fPIC -o ./test_error && chmod +x ./test_error && ./test_error
#include "error.cuh"
#include <math.h>
#include <stdio.h>

void __global__ add(const double *x, const double *y, double *z, const int N){
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < N){
        z[idx] = x[idx] + y[idx];
    }
}

int main(void){
    const int N = 10;
    const int M = sizeof(double) * N;
    const double a = 1.23;
    double *h_x = (double*) malloc(M);
    for (int n = 0; n < N; ++n)
    {
        h_x[n] = a;
    }
    double *d_x, *d_z;
    CHECK(cudaMalloc((void **)&d_x, M));
    CHECK(cudaMalloc((void **)&d_z, M));
    CHECK(cudaMemcpy(d_x, h_x, M, cudaMemcpyHostToDevice));
    const int block_size = 1025;
    const int grid_size = (N + block_size - 1) / block_size;
    add<<<grid_size, block_size>>>(d_x, d_x, d_z, N);
    CHECK(cudaGetLastError());
    CHECK(cudaDeviceSynchronize());
    CHECK(cudaFree(d_x));
    CHECK(cudaFree(d_z));
    free(h_x);
    printf("Success!\n");
    return 0;
}

由于Block大小在CUDA程序中最大只能是1024，因此如果超出这个数就会出现异常，但是如果没有异常检测函数的话，程序是能够正常执行下去的，这样这个异常就会一直保留在程序中。运行结果：

bash 复制代码

$ nvcc ./test_error.cu -Xcompiler -fPIC -o ./test_error && chmod +x ./test_error && ./test_error
CUDA Error:
    File:   ./test_error.cu
    Line:   29
    Error code: 9
    Error text: invalid configuration argument

因为加上了cudaGetLastError()函数，并使用了异常捕获的宏，所以这里就会提示参数配置异常。

总结概要

本文主要介绍了在CUDA编程的实践中，增加一个异常捕获的宏模块，以保障CUDA项目结果的准确性。主要代码内容参考了樊哲勇所著的《CUDA编程基础与实践》，是一本很好的CUDA编程入门书籍。

版权声明

本文首发链接为：https://www.cnblogs.com/dechinphy/p/cuda_error.html

作者ID：DechinPhy

更多原著文章：https://www.cnblogs.com/dechinphy/

请博主喝咖啡：https://www.cnblogs.com/dechinphy/gallery/image/379634.html

参考内容

《CUDA编程基础与实践》------樊哲勇

CUDA异常捕获

技术背景

error.cuh

调用测试

核函数检测

总结概要

版权声明

参考内容