使用 CUDA 调试工具 compute-sanitizer 检测数据竞争（竞争条件，race condition）

有数据竞争的代码 race.cu：

cpp 复制代码

#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>

// Kernel with a deliberate, obvious data race on data[0].
//
// Each thread derives a flat index tid from threadIdx.x, blockIdx.x and
// blockDim.x. Every thread with tid < N then performs a plain (non-atomic,
// unsynchronized) read-modify-write of data[0], adding data[tid] to it.
// Note that the thread with tid == 0 reads and writes the same element.
//
// Parameters:
//   data - int array; must hold at least N elements because data[tid] is
//          read for every tid < N. Element 0 is the contended accumulator.
//   N    - upper bound on tid; threads with tid >= N do nothing.
__global__ void raceConditionKernel(int *data, int N) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    
    // Data race: many threads write data[0] at the same time
    if (tid < N) {
        data[0] += data[tid];  // every in-range thread competes to update data[0]
    }
}

// Kernel with a deliberate data race on a __shared__ variable.
//
// Threads whose threadIdx.x is below 10 all store their own index into the
// single block-shared int shared_var with no ordering between them, so the
// value that survives is not determined by the code. After the block-wide
// barrier, thread 0 copies whatever value is in shared_var to *output.
//
// Parameters:
//   output - destination for the surviving shared_var value; written only
//            by the thread with threadIdx.x == 0 of each block.
__global__ void sharedMemoryRaceKernel(int *output) {
    __shared__ int shared_var;
    
    // Data race: several threads initialize the same variable at once
    if (threadIdx.x < 10) {
        shared_var = threadIdx.x;  // multiple threads compete to write shared_var
    }
    
    // Barrier: all writes above complete before thread 0 reads shared_var
    __syncthreads();
    
    if (threadIdx.x == 0) {
        *output = shared_var;
    }
}

// Kernel with a less obvious data race: many threads update the same ten
// elements of data.
//
// Each thread first stores its flat index tid into its own slot of the
// block-shared array temp, then (after a barrier) every thread with tid < N
// performs a plain, non-atomic read-modify-write of data[tid % 10], adding the
// value it reads back from temp. Within temp each thread only touches the
// slot selected by its own threadIdx.x, so the contention is on data, not on
// temp.
//
// Parameters:
//   data - int array; only indices 0..9 are updated here.
//   N    - upper bound on tid; threads with tid >= N skip the update but
//          still write temp and reach the barrier.
//
// NOTE(review): the write temp[threadIdx.x] is not bounds-guarded, so it stays
// inside temp[256] only when blockDim.x <= 256.
__global__ void complexRaceKernel(int *data, int N) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    __shared__ int temp[256];
    
    temp[threadIdx.x] = tid;
    __syncthreads();
    
    // Data race: multiple threads write data[tid % 10]
    if (tid < N) {
        int target_index = tid % 10;  // only 10 distinct target indices exist
        data[target_index] += temp[threadIdx.x % 256];
    }
}

// Evaluates a CUDA runtime call and, if it did not return cudaSuccess, prints
// the source location plus the CUDA error string to stderr and exits.
// Guarded so it does not clash with a definition that may exist elsewhere.
#ifndef CUDA_CHECK
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t cuda_check_err_ = (call);                              \
        if (cuda_check_err_ != cudaSuccess) {                              \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(cuda_check_err_));                  \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)
#endif

// Driver for the race-condition demo.
//
// Allocates an N-element int buffer on host and device, fills it with
// 1..N, then launches the three intentionally racy kernels one after another,
// synchronizing after each launch. Finally it copies the data buffer back and
// prints its first ten elements, together with the single int produced by
// sharedMemoryRaceKernel.
//
// Every CUDA runtime call and every kernel launch is checked, so a failure
// (bad launch configuration, allocation failure, in-kernel fault) is reported
// at the point where it happens instead of being silently ignored.
//
// Returns 0 on success; exits with EXIT_FAILURE on any allocation or CUDA
// error.
int main() {
    const int N = 1000;
    const size_t bytes = N * sizeof(int);  // size of the data buffer in bytes

    int *d_data = NULL;
    int *d_output = NULL;

    // Allocate the host buffer first so a failure does not leak device memory.
    int *h_data = (int*)malloc(bytes);
    if (h_data == NULL) {
        fprintf(stderr, "Host allocation of %zu bytes failed\n", bytes);
        return EXIT_FAILURE;
    }

    // Allocate device memory: the data buffer and the one-int output slot
    // written by sharedMemoryRaceKernel.
    CUDA_CHECK(cudaMalloc(&d_data, bytes));
    CUDA_CHECK(cudaMalloc(&d_output, sizeof(int)));

    // Initialize the input to 1..N and upload it.
    for (int i = 0; i < N; i++) {
        h_data[i] = i + 1;
    }
    CUDA_CHECK(cudaMemcpy(d_data, h_data, bytes, cudaMemcpyHostToDevice));

    printf("Running kernels with race conditions...\n");

    // 10 blocks x 100 threads = 1000 threads, i.e. exactly one thread per
    // element. 100 threads per block also keeps complexRaceKernel's
    // temp[threadIdx.x] write inside its 256-element shared array.
    dim3 blocks(10);
    dim3 threads(100);

    // Kernel 1: race on data[0]
    printf("1. Global memory race condition:\n");
    raceConditionKernel<<<blocks, threads>>>(d_data, N);
    CUDA_CHECK(cudaGetLastError());       // launch-configuration errors
    CUDA_CHECK(cudaDeviceSynchronize());  // errors raised while the kernel ran

    // Kernel 2: race on a __shared__ variable
    printf("2. Shared memory race condition:\n");
    sharedMemoryRaceKernel<<<1, 32>>>(d_output);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());

    // Read back the value thread 0 copied out of shared_var; the original
    // code allocated and wrote d_output but never looked at it.
    int h_output = 0;
    CUDA_CHECK(cudaMemcpy(&h_output, d_output, sizeof(int), cudaMemcpyDeviceToHost));
    printf("   shared_var value: %d\n", h_output);

    // Kernel 3: race on data[0..9]
    printf("3. Complex race condition:\n");
    complexRaceKernel<<<blocks, threads>>>(d_data, N);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());

    // Copy the results back and show the ten contended elements.
    CUDA_CHECK(cudaMemcpy(h_data, d_data, bytes, cudaMemcpyDeviceToHost));

    printf("First 10 results: ");
    for (int i = 0; i < 10; i++) {
        printf("%d ", h_data[i]);
    }
    printf("\n");

    // Clean up
    CUDA_CHECK(cudaFree(d_data));
    CUDA_CHECK(cudaFree(d_output));
    free(h_data);

    printf("Program completed.\n");
    return 0;
}

命令：

bash 复制代码

nvcc -g -G race.cu
compute-sanitizer --tool racecheck ./a.out

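试了几次，racecheck 只能检测到第二种数据竞争（sharedMemoryRaceKernel）。原因是：compute-sanitizer 的 racecheck 工具只检测共享内存（__shared__）上的访问冲突，不检测全局内存上的数据竞争。内核 1 和内核 3 的竞争都发生在全局内存 data 上；内核 3 虽然使用了共享数组 temp，但每个线程只读写自己的 temp[threadIdx.x]，共享内存上并没有冲突，所以这两个内核都不会被报告。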