CUDA Learning (3): First Hands-On Practice with CUDA

1 Locating threads

CUDA provides the built-in variables blockIdx, threadIdx, blockDim, and gridDim to locate each launched thread. Below we launch a grid of 2 blocks, each containing 5 threads.

The program has each thread print its own global thread ID:

c
#include <stdio.h>

__global__ void print_id() {
    int id = blockDim.x * blockIdx.x + threadIdx.x;
    printf("This is thread %d.\n", id);
}

int main() {
    print_id<<<2, 5>>>();
    cudaDeviceSynchronize();
}

Compile and run:

bash
nvcc -o get_thread_id get_thread_id.cu -run

This is thread 5.
This is thread 6.
This is thread 7.
This is thread 8.
This is thread 9.
This is thread 0.
This is thread 1.
This is thread 2.
This is thread 3.
This is thread 4.
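
Note that the two blocks are scheduled independently, so their relative print order is not guaranteed; in this run block 1 (threads 5-9) happened to execute before block 0 (threads 0-4).

The same built-in variables extend to 2-D and 3-D launches. A small sketch (not part of the example above) of computing a unique id in a 2-D grid of 2-D blocks:

c
__global__ void print_id_2d() {
    // linear index of this block within the grid
    int block_id = blockIdx.y * gridDim.x + blockIdx.x;
    // linear index of this thread within its block
    int thread_in_block = threadIdx.y * blockDim.x + threadIdx.x;
    int id = block_id * (blockDim.x * blockDim.y) + thread_in_block;
    printf("This is thread %d.\n", id);
}

// launch, for example: print_id_2d<<<dim3(2, 2), dim3(4, 4)>>>();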

2 Vector add

Add vector a and vector b element-wise to produce vector c. We implement the addition on both the CPU and the GPU and compare their execution times.

The timing and initialization helpers used below are defined in cudastart.h:

c
#ifndef CUDASTART_H
#define CUDASTART_H
#define CHECK(call)\
{\
    const cudaError_t error=call;\
    if(error != cudaSuccess)\
    {\
        printf("Error: %s: %d, ", __FILE__, __LINE__);\
        printf("code: %d, reason: %s\n", error, cudaGetErrorString(error));\
        exit(1);\
    }\
}

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#ifdef _WIN32
#   include <windows.h>
#else
#   include <sys/time.h>
#endif

double cpuSecond() {
    struct timeval tp;
    gettimeofday(&tp, NULL);
    return((double)tp.tv_sec + (double)tp.tv_usec * 1e-6);
}

void initialData(float* ip, int size) {
    time_t t;
    srand((unsigned)time(&t));
    for(int i = 0; i < size; i++) {
        ip[i] = (float)(rand() & 0xffff) / 1000.0f;
    }
}

// integer variant, used by the reduction and matrix multiplication examples
// below; values are kept small (0-255) so the 2^24-element sum fits in an int
void initialData_int(int* ip, int size) {
    time_t t;
    srand((unsigned)time(&t));
    for(int i = 0; i < size; i++) {
        ip[i] = rand() & 0xff;
    }
}

void initDevice(int devNum) {
    int dev = devNum;
    cudaDeviceProp deviceProp;
    CHECK(cudaGetDeviceProperties(&deviceProp, dev));
    printf("Using device %d: %s\n", dev,  deviceProp.name);
    CHECK(cudaSetDevice(dev));
}

void checkResult(float* hostRef, float* gpuRef, const int N) {
    double epsilon = 1.0E-8;
    for(int i = 0; i < N; i++) {
        if(fabs(hostRef[i] - gpuRef[i]) > epsilon) {
            printf("Results don't match!\n");
            printf("%f(hostRef[%d]) != %f(gpuRef[%d])\n", hostRef[i], i, gpuRef[i], i);
            return;
        }
    }
    printf("Check result success!\n");
}

#endif

vector_add_gpu.cu

c
#include <stdio.h>
#include <assert.h>
#include <time.h>
#include "cudastart.h"

const int N = 1 << 30;

inline cudaError_t checkCuda(cudaError_t result) {
    if (result != cudaSuccess) {
        fprintf(stderr, "Cuda Runtime Error: %s\n", cudaGetErrorString(result));
        assert(result == cudaSuccess);
    }
    return result;
}

void initWith(int* a) {
    for (int i = 0; i < N; ++i) {
        a[i] = i;
    }
}

__global__ void addVectorsInto(int* result, int* a, int* b) {
    // grid-stride loop: each thread covers elements index, index + stride, ...
    int index = threadIdx.x + blockIdx.x * blockDim.x;
    int gridstride = gridDim.x * blockDim.x;
    for (int i = index; i < N; i += gridstride)
        result[i] = a[i] + b[i];
}

void checkElementsAre(int* array) {
    for (int i = 0; i < N; i++) {
        if (array[i] != 2 * i) {
            printf("FAIL: array[%d] - %d does not equal %d\n", i, array[i], 2 * i);
            exit(1);
        }
    }
    printf("SUCCESS All values added correctly.\n");
}

void cpuAdd (int* h_a, int* h_b, int* h_c) {
    int tid = 0;
    while (tid < N) {
        h_c[tid] = h_a[tid] +  h_b[tid];
        tid += 1;
    }
}

int main() {
    size_t size = N * sizeof(int);

    int* cpu_a = (int*)malloc(size);
    int* cpu_b = (int*)malloc(size);
    int* cpu_c = (int*)malloc(size);

    initWith(cpu_a);
    initWith(cpu_b);

    double start_cpu = cpuSecond();
    cpuAdd(cpu_a, cpu_b, cpu_c);
    double end_cpu = cpuSecond();
    checkElementsAre(cpu_c);
    printf("vector add, CPU Time used: %f ms\n", (end_cpu - start_cpu) * 1000);
    free(cpu_a);
    free(cpu_b);
    free(cpu_c);

    int* a;
    int* b;
    int* c;
    int deviceId;
    cudaGetDevice(&deviceId);

    checkCuda(cudaMallocManaged(&a, size));
    checkCuda(cudaMallocManaged(&b, size));
    checkCuda(cudaMallocManaged(&c, size));

    // prefetch to the CPU for initialization, then back to the GPU for the kernel
    cudaMemPrefetchAsync(a, size, cudaCpuDeviceId);
    cudaMemPrefetchAsync(b, size, cudaCpuDeviceId);
    initWith(a);
    initWith(b);
    cudaMemPrefetchAsync(a, size, deviceId);
    cudaMemPrefetchAsync(b, size, deviceId);
    size_t threadsPerBlock = 1024;
    size_t numberOfBlock = (N + threadsPerBlock - 1) / threadsPerBlock;

    double start = cpuSecond();
    addVectorsInto<<<numberOfBlock, threadsPerBlock>>>(c, a, b);
    checkCuda(cudaDeviceSynchronize());
    double end = cpuSecond();
    checkElementsAre(c);

    printf("vector add, GPU Time used: %f ms\n", (end - start) * 1000);

    cudaFree(a);
    cudaFree(b);
    cudaFree(c);
}

Notes:

This code must allocate device memory to store the arrays. cudaMallocManaged and cudaMalloc are both CUDA functions for allocating device memory, but they differ in several important ways:

  • Management model:

    • cudaMallocManaged allocates Unified Memory, which the CPU and the GPU share without explicit data transfers. This lets you write parallel code without worrying about memory management and data movement.
    • cudaMalloc allocates memory that belongs to the GPU; data must be moved between host and device explicitly with transfer functions such as cudaMemcpy (see the sketch after this list).
  • Automatic data migration:

    • With Unified Memory, the CUDA runtime migrates data automatically: when the CPU or the GPU touches memory that currently resides on the other device, the runtime moves it on demand.
    • Memory from cudaMalloc must be transferred manually with cudaMemcpy and related functions.
  • Convenience:

    • cudaMallocManaged is more convenient, since there is no manual migration or staging to manage; you can focus on the algorithm and its logic.
    • cudaMalloc requires more manual bookkeeping, including data transfers and memory release.
  • Trade-offs:

    • cudaMallocManaged simplifies memory management and data transfer, which improves convenience and readability and removes a good deal of bookkeeping.
    • cudaMalloc is more flexible: you control exactly when memory is allocated and when data moves, which suits workloads that need fine-grained control. For some algorithms and scenarios, hand-managed transfers can also outperform Unified Memory.
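
For contrast, here is a minimal sketch of the same vector add written with explicit cudaMalloc/cudaMemcpy (h_a, h_b, h_c stand for host buffers of n ints, and CHECK is the macro from cudastart.h):

c
int *d_a, *d_b, *d_c;
size_t bytes = n * sizeof(int);
CHECK(cudaMalloc(&d_a, bytes));
CHECK(cudaMalloc(&d_b, bytes));
CHECK(cudaMalloc(&d_c, bytes));

// explicit host-to-device copies replace Unified Memory's automatic migration
CHECK(cudaMemcpy(d_a, h_a, bytes, cudaMemcpyHostToDevice));
CHECK(cudaMemcpy(d_b, h_b, bytes, cudaMemcpyHostToDevice));

addVectorsInto<<<(n + 1023) / 1024, 1024>>>(d_c, d_a, d_b);

// cudaMemcpy synchronizes with the preceding kernel before copying back
CHECK(cudaMemcpy(h_c, d_c, bytes, cudaMemcpyDeviceToHost));

CHECK(cudaFree(d_a));
CHECK(cudaFree(d_b));
CHECK(cudaFree(d_c));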

Compile and run:

bash
nvcc -o vector_add_gpu vector_add_gpu.cu -run

SUCCESS All values added correctly.
vector add, CPU Time used: 3081.584930 ms
SUCCESS All values added correctly.
vector add, GPU Time used: 280.308962 ms
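
The kernel itself is memory-bound, and part of the measured GPU time may be Unified Memory servicing page faults for c, which is never prefetched. A plausible improvement (not reflected in the numbers above) is to prefetch the output array to the device as well before the launch:

c
// prefetch the output array so the kernel's writes don't page-fault
cudaMemPrefetchAsync(c, size, deviceId);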

3 Reduction

Sum all elements of an array (a reduction). The code below compares a CPU loop against five GPU kernels: a naive atomicAdd kernel, a neighbored-pair reduction, a lower-divergence variant of it, an interleaved-pair reduction, and an interleaved reduction staged in shared memory.

vector_reduction.cu

c
#include <cuda_runtime.h>
#include <stdio.h>
#include "cudastart.h"

// kernel 0: every thread atomically adds its element to a single global accumulator
__global__ void reduce_test(int* g_idata, int* sum, unsigned int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) {
        atomicAdd(sum, g_idata[idx]);
    }
}

__global__ void reduceNeighbored(int* g_idata, int* g_odata, unsigned int n) {
    //set thread ID
    unsigned int tid = threadIdx.x;
    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;

    //boundary check on the global index (tid alone is always < blockDim.x)
    if (idx >= n) return;

    int* idata = g_idata + blockIdx.x * blockDim.x;
    // in-place reduction in the global memory
    for (int stride = 1; stride < blockDim.x; stride *= 2) {
        if ((tid % (2 * stride)) == 0) {
            idata[tid] += idata[tid + stride];
        }
        // synchronize within block
        __syncthreads();
    }

    // write result for this block to global mem
    if (tid == 0) {
        g_odata[blockIdx.x] = idata[0];
    }
}

__global__ void reduceNeighboredLess(int* g_idata, int* g_odata, unsigned int n) {
    unsigned int tid = threadIdx.x;
    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;

    // convert global data pointer to the local pointer of this block
    int* idata = g_idata + blockIdx.x * blockDim.x;
    if (idx >= n) return;

    //in-place reduction in global memory
    for (int stride = 1; stride < blockDim.x; stride *= 2) {
        // rescale tid so the active threads stay contiguous, which reduces
        // warp divergence compared with reduceNeighbored
        int index = 2 * stride * tid;
        if (index < blockDim.x) {
            idata[index] += idata[index + stride];
        }
        __syncthreads();
    }
    //write result for this block to global mem
    if (tid == 0)
        g_odata[blockIdx.x] = idata[0];
}

__global__ void reduceInterleaved(int* g_idata, int* g_odata, unsigned int n) {
    unsigned int tid = threadIdx.x;
    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
    // convert global data pointer to the local pointer of this block
    int* idata = g_idata + blockIdx.x * blockDim.x;
    if (idx >= n) return;

    //in-place reduction in global memory: halve the stride each step so the
    //active threads always form a contiguous prefix of the block
    for (int stride = blockDim.x / 2; stride > 0; stride >>= 1) {
        if (tid < stride) {
            idata[tid] += idata[tid + stride];
        }
        __syncthreads();
    }
    //write result for this block to global mem
    if (tid == 0)
        g_odata[blockIdx.x] = idata[0];
}

__global__ void reduceInterleaved_share(int* g_idata, int* g_odata, unsigned int n) {
    __shared__ int sh_arr[1024];   // one element per thread (blockDim.x <= 1024)
    unsigned int tid = threadIdx.x;
    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= n) return;

    sh_arr[tid] = g_idata[idx];    // stage this block's slice in shared memory
    __syncthreads();
    //in-place reduction in shared memory
    for (int stride = blockDim.x / 2; stride > 0; stride >>= 1) {
        if (tid < stride)
            sh_arr[tid] += sh_arr[tid + stride];
        __syncthreads();
    }
    //write result for this block to global mem
    if (tid == 0)
        g_odata[blockIdx.x] = sh_arr[0];
}

int main(int argc, char** argv) {
    initDevice(0);

    //initialization
    int size = 1 << 24;
    printf("with array size %d ", size);

    //execution configuration
    int blocksize = 1024;
    if (argc > 1) {
        blocksize = atoi(argv[1]);   // set the block size from the command line
    }
    dim3 block(blocksize, 1);
    dim3 grid((size - 1) / block.x + 1, 1);
    printf("grid %d block %d \n", grid.x, block.x);

    //allocate host memory
    size_t bytes = size * sizeof(int);
    int* idata_host = (int*)malloc(bytes);
    int* odata_host = (int*)malloc(grid.x * sizeof(int));
    int* tmp = (int*)malloc(bytes);

    //initialize the array
    initialData_int(idata_host, size);

    if (size < 100) {
        printf("Array: [");
        for (int i = 0; i < size; ++i)
            printf("%d, ", idata_host[i]);
        printf("]\n");
    }

    memcpy(tmp, idata_host, bytes);
    double timeStart, timeElaps;
    int gpu_sum = 0;

    // device memory
    int* idata_dev = NULL;
    int* odata_dev = NULL;
    CHECK(cudaMalloc((void**)&idata_dev, bytes));
    CHECK(cudaMalloc((void**)&odata_dev, grid.x * sizeof(int)));

    //cpu reduction (baseline for comparison)
    int cpu_sum = 0;
    timeStart = cpuSecond();
    for (int i = 0; i < size; i++)
        cpu_sum += tmp[i];
    timeElaps = 1000 * (cpuSecond() - timeStart);

    printf("cpu sum:%d \n", cpu_sum);
    printf("cpu reduction elapsed %lf ms cpu_sum: %d\n", timeElaps, cpu_sum);

    //kernel 0: atomicAdd reduce
    int* reduce_sum;
    CHECK(cudaMalloc((void**)&reduce_sum, sizeof(int)));
    CHECK(cudaMemset(reduce_sum, 0, sizeof(int)));
    CHECK(cudaMemcpy(idata_dev, idata_host, bytes, cudaMemcpyHostToDevice));
    cudaDeviceSynchronize();
    timeStart = cpuSecond();
    reduce_test<<<grid, block>>>(idata_dev, reduce_sum, size);
    cudaDeviceSynchronize();
    cudaMemcpy(&gpu_sum, reduce_sum, sizeof(int), cudaMemcpyDeviceToHost);
    timeElaps = 1000 * (cpuSecond() - timeStart);   // stop the timer only after the kernel and copy finish
    printf("gpu sum:%d \n", gpu_sum);
    printf("gpu atomicAdd elapsed %lf ms <<<grid %d block %d>>>\n", timeElaps, grid.x, block.x);
    CHECK(cudaFree(reduce_sum));

    //kernel 1 reduceNeighbored
    CHECK(cudaMemcpy(idata_dev, idata_host, bytes, cudaMemcpyHostToDevice));
    CHECK(cudaDeviceSynchronize());
    timeStart = cpuSecond();
    reduceNeighbored<<<grid, block>>>(idata_dev, odata_dev, size);
    cudaDeviceSynchronize();
    cudaMemcpy(odata_host, odata_dev, grid.x * sizeof(int), cudaMemcpyDeviceToHost);
    gpu_sum = 0;
    for (int i = 0; i < grid.x; i++)
        gpu_sum += odata_host[i];
    timeElaps = 1000 * (cpuSecond() - timeStart);
    printf("gpu sum:%d \n", gpu_sum);
    printf("gpu reduceNeighbored elapsed %lf ms <<<grid %d block %d>>>\n", timeElaps, grid.x, block.x);

    //kernel 2 reduceNeighboredLess
    CHECK(cudaMemcpy(idata_dev, idata_host, bytes, cudaMemcpyHostToDevice));
    CHECK(cudaDeviceSynchronize());
    timeStart = cpuSecond();
    reduceNeighboredLess<<<grid, block>>>(idata_dev, odata_dev, size);
    cudaDeviceSynchronize();
    cudaMemcpy(odata_host, odata_dev, grid.x * sizeof(int), cudaMemcpyDeviceToHost);
    gpu_sum = 0;
    for (int i = 0; i < grid.x; i++)
        gpu_sum += odata_host[i];
    timeElaps = 1000 * (cpuSecond() - timeStart);
    printf("gpu sum:%d \n", gpu_sum);
    printf("gpu reduceNeighboredless elapsed %lf ms <<<grid %d block %d>>>\n", timeElaps, grid.x, block.x);

    //kernel 3 reduceInterleaved
    CHECK(cudaMemcpy(idata_dev, idata_host, bytes, cudaMemcpyHostToDevice));
    CHECK(cudaDeviceSynchronize());
    timeStart = cpuSecond();
    reduceInterleaved<<<grid, block>>>(idata_dev, odata_dev, size);
    cudaDeviceSynchronize();
    cudaMemcpy(odata_host, odata_dev, grid.x * sizeof(int), cudaMemcpyDeviceToHost);
    gpu_sum = 0;
    for (int i = 0; i < grid.x; i++)
        gpu_sum += odata_host[i];
    timeElaps = 1000 * (cpuSecond() - timeStart);
    printf("gpu sum:%d \n", gpu_sum);
    printf("gpu reduceInterleaved elapsed %lf ms <<<grid %d block %d>>>\n", timeElaps, grid.x, block.x);
    
    //kernel 4 reduceInterleaved with shared memory
    CHECK(cudaMemcpy(idata_dev, idata_host, bytes, cudaMemcpyHostToDevice));
    CHECK(cudaDeviceSynchronize());
    timeStart = cpuSecond();
    reduceInterleaved_share<<<grid, block>>>(idata_dev, odata_dev, size);
    cudaDeviceSynchronize();
    cudaMemcpy(odata_host, odata_dev, grid.x * sizeof(int), cudaMemcpyDeviceToHost);
    gpu_sum = 0;
    for (int i = 0; i < grid.x; i++)
        gpu_sum += odata_host[i];
    timeElaps = 1000 * (cpuSecond() - timeStart);
    printf("gpu sum:%d \n", gpu_sum);
    printf("gpu reduceInterleaved_share elapsed %lf ms <<<grid %d block %d>>>\n", timeElaps, grid.x, block.x);

    // free host memory
    free(idata_host);
    free(odata_host);
    CHECK(cudaFree(idata_dev));
    CHECK(cudaFree(odata_dev));

    // reset device
    cudaDeviceReset();

    // check the results
    if (gpu_sum == cpu_sum) {
        printf("Test success!\n");
    }
    return EXIT_SUCCESS;
}

Compile and run:

bash
nvcc -o vector_reduction vector_reduction.cu -run

Using device 0: NVIDIA GeForce RTX 2080 Ti
with array size 16777216 grid 16384 block 1024
cpu sum:2138839322
cpu reduction elapsed 30.170918 ms cpu_sum: 2138839322
gpu sum:2138839322
gpu atomicAdd elapsed 30.170918 ms <<<grid 16384 block 1024>>>
gpu sum:2138839322
gpu reduceNeighbored elapsed 1.228094 ms <<<grid 16384 block 1024>>>
gpu sum:2138839322
gpu reduceNeighboredless elapsed 0.674963 ms <<<grid 16384 block 1024>>>
gpu sum:2138839322
gpu reduceInterleaved elapsed 0.669956 ms <<<grid 16384 block 1024>>>
gpu sum:2138839322
gpu reduceInterleaved_share elapsed 0.744820 ms <<<grid 16384 block 1024>>>
Test success!
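
The timings show the classic progression: reduceNeighbored pairs neighboring elements with a modulo test, which leaves warps heavily divergent, while reduceNeighboredLess and reduceInterleaved keep the active threads packed into contiguous warps and run roughly twice as fast.

One further step, not measured above, is to finish the reduction inside each warp with shuffle intrinsics, which exchange registers directly between lanes instead of going through shared memory. A minimal sketch, assuming blockDim.x is a multiple of 32 and at most 1024:

c
__global__ void reduceShfl(int* g_idata, int* g_odata, unsigned int n) {
    __shared__ int warp_sums[32];                    // one partial sum per warp
    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int val = (idx < n) ? g_idata[idx] : 0;

    // intra-warp reduction: each step halves the number of contributing lanes
    for (int offset = 16; offset > 0; offset >>= 1)
        val += __shfl_down_sync(0xffffffff, val, offset);

    unsigned int lane = threadIdx.x % 32;
    unsigned int warp = threadIdx.x / 32;
    if (lane == 0) warp_sums[warp] = val;            // lane 0 holds the warp's sum
    __syncthreads();

    // the first warp then reduces the per-warp partial sums
    if (warp == 0) {
        val = (lane < blockDim.x / 32) ? warp_sums[lane] : 0;
        for (int offset = 16; offset > 0; offset >>= 1)
            val += __shfl_down_sync(0xffffffff, val, offset);
        if (lane == 0) g_odata[blockIdx.x] = val;
    }
}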

4 Matrix multiplication

Compute the product of two N × N matrices: first on the CPU, then with a naive GPU kernel, and finally with a tiled kernel that uses shared memory.

matmul.cu

c
#include <cuda_runtime.h>
#include <stdio.h>
#include "cudastart.h"

#define N 1024
#define tile_size 16

void matrixMulCPU(int* a, int* b, int* c) {
    int val = 0;
    for (int row = 0; row < N; ++row) {
        for (int col = 0; col < N; ++col) {
            val = 0;
            for (int k = 0; k < N; ++k) {
                val += a[row * N + k] * b[k * N + col];
            }
            c[row * N + col] = val;
        }
    }
}

__global__ void matrixMulGPU(int* a, int* b, int* c) {
    int row = blockIdx.x * blockDim.x + threadIdx.x;
    int col = blockIdx.y * blockDim.y + threadIdx.y;
    int val = 0;
    for (int i = 0; i < N; ++i) {
        val += a[row * N + i] * b[i * N + col];
    }
    c[row * N + col] = val;
}

// tiled version: stage tile_size x tile_size sub-blocks of a and b in shared memory;
// it accumulates into c directly, so c must be zero-initialized before the launch
__global__ void matrixMulGPU_tile(int* a, int* b, int* c) {
    __shared__ int tile_a[tile_size][tile_size];
    __shared__ int tile_b[tile_size][tile_size];
    int row = blockIdx.x * blockDim.x + threadIdx.x;
    int col = blockIdx.y * blockDim.y + threadIdx.y;
    for (int i = 0; i < N / tile_size; ++i) {
        tile_a[threadIdx.x][threadIdx.y] = a[row * N + i * tile_size + threadIdx.y];
        tile_b[threadIdx.x][threadIdx.y] = b[(i * tile_size + threadIdx.x) * N + col];
        __syncthreads();
        for (int j = 0; j < tile_size; ++j) {
            c[row * N + col] += tile_a[threadIdx.x][j] * tile_b[j][threadIdx.y];
        }
        __syncthreads();
    }
}

void check(int* c_cpu, int* c_gpu) {
    bool error = false;
    for (int row = 0; row < N && !error; row++) {
        for (int col = 0; col < N && !error; col++) {
            if (c_cpu[row * N + col] != c_gpu[row * N + col]) {
                printf("FOUND ERROR at c[%d][%d]\n", row, col);
                error = true;
                break;
            }
        }
    }
    if (!error) {
        printf("Success!\n");
    }
}

int main() {
    int *a, *b, *c_cpu, *c_gpu, *c_gpu_opt; // Allocate a solution matrix for both the CPU and the GPU operations
    int size = N * N * sizeof (int); // Number of bytes of an N x N matrix
    double timeStart, timeElaps;
    
    // Allocate memory
    cudaMallocManaged (&a, size);
    cudaMallocManaged (&b, size);
    cudaMallocManaged (&c_cpu, size);
    cudaMallocManaged (&c_gpu, size);
    cudaMallocManaged (&c_gpu_opt, size);

    initialData_int(a, N * N);
    initialData_int(b, N * N);
    memset(c_cpu, 0, size);      // memset counts bytes, so use size; N * N alone clears only part of the matrix
    memset(c_gpu, 0, size);
    memset(c_gpu_opt, 0, size);  // the tiled kernel accumulates into c, so it must start fully zeroed

    timeStart = cpuSecond();
    matrixMulCPU(a, b, c_cpu);
    timeElaps = 1000 * (cpuSecond() - timeStart);
    printf("cpu matrix mul time: %f ms\n", timeElaps);

    // test kernel 1: matrixMulGPU
    dim3 threads_per_block(16, 16, 1);
    dim3 number_of_blocks(N / threads_per_block.x, N / threads_per_block.y, 1);
    timeStart = cpuSecond();
    matrixMulGPU<<<number_of_blocks, threads_per_block>>>(a, b, c_gpu);
    cudaDeviceSynchronize();    // wait for the kernel; otherwise only the launch is timed
    timeElaps = 1000 * (cpuSecond() - timeStart);
    printf("gpu matrix mul time: %f ms\n", timeElaps);
    check(c_cpu, c_gpu);

    // test kernel 2: matrixMulGPU optimize
    timeStart = cpuSecond();
    matrixMulGPU_tile<<<number_of_blocks, threads_per_block>>>(a, b, c_gpu_opt);
    cudaDeviceSynchronize();
    timeElaps = 1000 * (cpuSecond() - timeStart);
    printf("gpu matrix mul time: %f ms\n", timeElaps);
    check(c_cpu, c_gpu_opt);

    // Free all our allocated memory
    cudaFree(a);
    cudaFree(b);
    cudaFree(c_cpu);
    cudaFree(c_gpu);
    cudaFree(c_gpu_opt);
}

Compile and run:

bash
nvcc -o matmul matmul.cu -run

cpu matrix mul time: 4380.882978 ms
gpu matrix mul time: 0.206947 ms
Success!
gpu matrix mul time: 0.063896 ms
Success!
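
A common refinement of the tiled kernel, sketched here but not measured above, is to accumulate the partial products in a register and write c only once at the end; this removes the zero-initialization requirement and avoids a global read-modify-write for every partial product:

c
__global__ void matrixMulGPU_tile_reg(int* a, int* b, int* c) {
    __shared__ int tile_a[tile_size][tile_size];
    __shared__ int tile_b[tile_size][tile_size];
    int row = blockIdx.x * blockDim.x + threadIdx.x;
    int col = blockIdx.y * blockDim.y + threadIdx.y;
    int val = 0;                              // accumulate in a register
    for (int i = 0; i < N / tile_size; ++i) {
        tile_a[threadIdx.x][threadIdx.y] = a[row * N + i * tile_size + threadIdx.y];
        tile_b[threadIdx.x][threadIdx.y] = b[(i * tile_size + threadIdx.x) * N + col];
        __syncthreads();                      // wait until the whole tile is loaded
        for (int j = 0; j < tile_size; ++j)
            val += tile_a[threadIdx.x][j] * tile_b[j][threadIdx.y];
        __syncthreads();                      // wait before the tile is overwritten
    }
    c[row * N + col] = val;                   // a single global write per element
}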

5 cuDNN sigmoid

Implement the sigmoid function, sigmoid(x) = 1 / (1 + exp(-x)), using cuDNN.

cudnn_sigmoid.cu

c
#include <iostream>
#include <cuda_runtime.h>
#include <cudnn.h>

int main(int argc, char** argv) {
    // get gpu info
    int numGPUs;
    cudaGetDeviceCount(&numGPUs);
    std::cout << "Found " << numGPUs << " GPUs." << std::endl;
    cudaSetDevice(0); // use GPU0
    int device;
    struct cudaDeviceProp devProp;
    cudaGetDevice(&device);
    cudaGetDeviceProperties(&devProp, device);
    std::cout << "Compute capability:" << devProp.major << "." << devProp.minor << std::endl;

    cudnnHandle_t handle_;
    cudnnCreate(&handle_);
    std::cout << "Created cuDNN handle" << std::endl;

    // create the tensor descriptor
    cudnnDataType_t dtype = CUDNN_DATA_FLOAT;
    cudnnTensorFormat_t format = CUDNN_TENSOR_NCHW;
    int n = 1, c = 1, h = 1, w = 10;
    int NUM_ELEMENTS = n * c * h * w;
    cudnnTensorDescriptor_t x_desc;
    cudnnCreateTensorDescriptor(&x_desc);
    cudnnSetTensor4dDescriptor(x_desc, format, dtype, n, c, h, w);
    
    // create the tensor
    float *x;

    // allocate Unified Memory so that both the CPU and the GPU can access x
    cudaMallocManaged(&x, NUM_ELEMENTS * sizeof(float));
    for(int i = 0; i < NUM_ELEMENTS; i++) x[i] = i * 1.00f;
    std::cout << "Original array: ";
    for(int i = 0; i < NUM_ELEMENTS; i++) std::cout << x[i] << " ";

    // create the activation descriptor; alpha/beta scale the output as
    // y = alpha * sigmoid(x) + beta * y
    float alpha[1] = {1};
    float beta[1] = {0.0};
    cudnnActivationDescriptor_t sigmoid_activation;
    cudnnActivationMode_t mode = CUDNN_ACTIVATION_SIGMOID;
    cudnnNanPropagation_t prop = CUDNN_NOT_PROPAGATE_NAN;
    cudnnCreateActivationDescriptor(&sigmoid_activation);
    cudnnSetActivationDescriptor(sigmoid_activation, mode, prop, 0.0f);

    cudnnActivationForward(
        handle_,
        sigmoid_activation,
        alpha,
        x_desc,
        x,
        beta,
        x_desc,
        x
    );
    // cudnnActivationForward is asynchronous; synchronize before the CPU reads x
    cudaDeviceSynchronize();
    
    cudnnDestroy(handle_);
    std::cout << std::endl << "Destroyed cuDNN handle." << std::endl;
    std::cout << "New array: ";
    for(int i = 0; i < NUM_ELEMENTS; i++) std::cout << x[i] << " ";
    std::cout << std::endl;
    cudaFree(x);
    return 0;
}
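
The cuDNN calls above ignore their return values; each returns a cudnnStatus_t that is worth checking. A minimal sketch (checkCUDNN is our own helper, not part of cuDNN; assumes <cstdio> and <cstdlib> are included):

c
#define checkCUDNN(call)                                              \
{                                                                     \
    cudnnStatus_t status = (call);                                    \
    if (status != CUDNN_STATUS_SUCCESS) {                             \
        fprintf(stderr, "cuDNN error %s:%d: %s\n",                    \
                __FILE__, __LINE__, cudnnGetErrorString(status));     \
        exit(EXIT_FAILURE);                                           \
    }                                                                 \
}

// usage:
// checkCUDNN(cudnnCreate(&handle_));
// checkCUDNN(cudnnActivationForward(handle_, sigmoid_activation, alpha,
//                                   x_desc, x, beta, x_desc, x));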

Compile and run:

bash
nvcc -o cudnn_sigmoid cudnn_sigmoid.cu -run -lcudnn

Found 2 GPUs.
Compute capability:7.5
Created cuDNN handle
Original array: 0 1 2 3 4 5 6 7 8 9
Destroyed cuDNN handle.
New array: 0.5 0.731059 0.880797 0.952574 0.982014 0.993307 0.997527 0.999089 0.999665 0.999877