6. cuda reduce kernel

  1. reduce的定义

给N个数值,对它们做累计的算术操作,例如求出其总和、最大值、最小值、均值、异或这一类的操作,称为reduce

2.解决方案

(1)Two-Pass:启动两次kernel

Baseline:一个线程做reduce

reduce0:引入shared memory且并行化reduce算法

reduce1:除余替换为位运算

reduce2:消除shared memory bank conflict

reduce3:尽可能让更多的线程都在干活

reduce4:展开for循环最后一个warp

reduce5:完全展开for循环

reduce6:一个线程干多个活

reduce7:提升GPU并行度,处理更多数据,分配多个block作reduce

reduce8:warp shuffle

(2)single-Pass:启动一次kernel


baseline:一个线程做reduce/cpu写法

cpp 复制代码
#include <bits/stdc++.h>
#include <cuda.h>
#include "cuda_runtime.h"
// 注: 每个cuda程序(.cu文件)的main函数功能大同小异,后面不会每个cu文件都注释main函数逻辑
//999ms
// Serial reference reduction: adds up the n ints starting at `input` and stores the
// total in *output. The body is a plain sequential loop, so every thread that runs
// this kernel walks the whole array; the accompanying main launches it with one
// block of one thread, which makes it equivalent to a serial program.
__global__ void reduce_baseline(const int* input, int* output, size_t n) {
  int total = 0;
  const int* const end = input + n;
  for (const int* cursor = input; cursor != end; ++cursor) {
    total += *cursor;
  }
  // Publish the accumulated value to device memory.
  output[0] = total;
}

// Returns true when the single value stored at *out equals groudtruth.
// `n` is accepted for signature compatibility but is not read.
bool CheckResult(int *out, int groudtruth, int n){
    return out[0] == groudtruth;
}

// Host driver for the serial baseline: fills N ints with 1, runs reduce_baseline on
// one block of one thread, times the kernel with CUDA events and compares the
// result with N. Every CUDA runtime call is checked and the process exits with a
// message on the first failure.
int main(){
    // Abort with a readable message as soon as a CUDA runtime call fails.
    auto check = [](cudaError_t err, const char *what) {
        if (err != cudaSuccess) {
            fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(err));
            exit(EXIT_FAILURE);
        }
    };

    float milliseconds = 0;
    const int N = 25600000;
    check(cudaSetDevice(0), "cudaSetDevice");
    cudaDeviceProp deviceProp;
    check(cudaGetDeviceProperties(&deviceProp, 0), "cudaGetDeviceProperties");
    // One block with one thread: the kernel then runs as a serial program.
    const int blockSize = 1;
    int GridSize = 1;

    // Host buffers: input data and one result slot per block.
    int *a = (int *)malloc(N * sizeof(int));
    int *out = (int*)malloc((GridSize) * sizeof(int));
    if (a == NULL || out == NULL) {
        fprintf(stderr, "host allocation failed\n");
        return EXIT_FAILURE;
    }
    // Device buffers mirroring the host ones.
    int *d_a;
    check(cudaMalloc((void **)&d_a, N * sizeof(int)), "cudaMalloc(d_a)");
    int *d_out;
    check(cudaMalloc((void **)&d_out, (GridSize) * sizeof(int)), "cudaMalloc(d_out)");

    for(int i = 0; i < N; i++){
        a[i] = 1;
    }

    int groudtruth = N * 1;
    // Upload the initialised input.
    check(cudaMemcpy(d_a, a, N * sizeof(int), cudaMemcpyHostToDevice), "cudaMemcpy(H2D)");

    dim3 Grid(GridSize);
    dim3 Block(blockSize);

    cudaEvent_t start, stop;
    check(cudaEventCreate(&start), "cudaEventCreate(start)");
    check(cudaEventCreate(&stop), "cudaEventCreate(stop)");
    check(cudaEventRecord(start), "cudaEventRecord(start)");
    // Grid and Block are both 1, i.e. the same <<<1, 1>>> configuration as before.
    reduce_baseline<<<Grid, Block>>>(d_a, d_out, N);
    // Kernel launches return nothing; configuration errors surface here.
    check(cudaGetLastError(), "reduce_baseline launch");
    check(cudaEventRecord(stop), "cudaEventRecord(stop)");
    check(cudaEventSynchronize(stop), "cudaEventSynchronize");
    check(cudaEventElapsedTime(&milliseconds, start, stop), "cudaEventElapsedTime");

    // Download the result and verify it.
    check(cudaMemcpy(out, d_out, GridSize * sizeof(int), cudaMemcpyDeviceToHost), "cudaMemcpy(D2H)");
    printf("allcated %d blocks, data counts are %d", GridSize, N);
    bool is_right = CheckResult(out, groudtruth, GridSize);
    if(is_right) {
        printf("the ans is right\n");
    } else {
        printf("the ans is wrong\n");
        for(int i = 0; i < GridSize;i++){
            // out[] holds ints: %d, not %lf (passing an int for %lf is undefined behaviour).
            printf("res per block : %d ",out[i]);
        }
        printf("\n");
        // groudtruth is an int as well.
        printf("groudtruth is: %d \n", groudtruth);
    }
    printf("reduce_baseline latency = %f ms\n", milliseconds);

    // Release events as well as memory.
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cudaFree(d_a);
    cudaFree(d_out);
    free(a);
    free(out);
    return 0;
}

reduce0:引入shared_memory且并行化reduce算法

(1)将数据load至shared memory中;

(2)在shared_memory中对数据进行reduce操作

(3)将最后的结果写回global memory中

cpp 复制代码
#include <bits/stdc++.h>
#include <cuda.h>
#include "cuda_runtime.h"

// v0: naive版本
// latency: 3.835ms
// blockSize作为模板参数的效果主要用于静态shared memory的申请需要传入编译期常量指定大小(L10)
// v0 (naive shared-memory reduction).
// Each thread copies one float of d_in into shared memory, then the block folds
// those values pairwise (distance 1, 2, 4, ...) until smem[0] holds the block's
// sum, which thread 0 writes to d_out[blockIdx.x].
// blockSize is a template parameter because the static shared array needs a
// compile-time size. The kernel performs no bounds check on d_in.
template<int blockSize>
__global__ void reduce_v0(float *d_in,float *d_out){
    __shared__ float smem[blockSize];
    const int tid = threadIdx.x;                            // index inside the block
    const int gtid = blockIdx.x * blockSize + threadIdx.x;  // index across all blocks

    // Load: one element per thread.
    smem[tid] = d_in[gtid];
    // Make every thread's store visible before anyone reads a neighbour's slot.
    __syncthreads();

    for (int stride = 1; stride < blockDim.x; stride <<= 1) {
        const int span = stride << 1;
        // Only threads whose id is a multiple of 2*stride accumulate. There is no
        // else branch here; the original author's note attributes v0's slowness to
        // the modulo operation, which v1 replaces with a bit mask.
        const bool accumulates = (tid % span) == 0;
        if (accumulates) {
            smem[tid] += smem[tid + stride];
        }
        __syncthreads();
    }

    // Store: thread 0 writes the block's partial sum.
    if (tid == 0) {
        d_out[blockIdx.x] = smem[0];
    }
}
// Adds the n per-block partial sums in `out` (float accumulation, in index order)
// and reports whether the total is exactly equal to groudtruth.
bool CheckResult(float *out, float groudtruth, int n){
    float total = 0;
    int idx = 0;
    while (idx < n) {
        total += out[idx];
        ++idx;
    }
    return total == groudtruth;
}

// Host driver for reduce_v0: fills N floats with 1.0f, launches one 256-thread block
// per 256 elements, times the kernel with CUDA events and checks that the per-block
// partial sums add up to N. Every CUDA runtime call is checked.
int main(){
    // Abort with a readable message as soon as a CUDA runtime call fails.
    auto check = [](cudaError_t err, const char *what) {
        if (err != cudaSuccess) {
            fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(err));
            exit(EXIT_FAILURE);
        }
    };

    float milliseconds = 0;
    const int N = 25600000;
    check(cudaSetDevice(0), "cudaSetDevice");
    cudaDeviceProp deviceProp;
    check(cudaGetDeviceProperties(&deviceProp, 0), "cudaGetDeviceProperties");
    const int blockSize = 256;
    // One block per 256 input elements, capped at the device's grid-size limit.
    int GridSize = std::min((N + 256 - 1) / 256, deviceProp.maxGridSize[0]);

    float *a = (float *)malloc(N * sizeof(float));
    float *out = (float*)malloc((GridSize) * sizeof(float));
    if (a == NULL || out == NULL) {
        fprintf(stderr, "host allocation failed\n");
        return EXIT_FAILURE;
    }
    float *d_a;
    check(cudaMalloc((void **)&d_a, N * sizeof(float)), "cudaMalloc(d_a)");
    float *d_out;
    check(cudaMalloc((void **)&d_out, (GridSize) * sizeof(float)), "cudaMalloc(d_out)");

    for(int i = 0; i < N; i++){
        a[i] = 1.0f;
    }

    float groudtruth = N * 1.0f;

    check(cudaMemcpy(d_a, a, N * sizeof(float), cudaMemcpyHostToDevice), "cudaMemcpy(H2D)");

    dim3 Grid(GridSize);
    dim3 Block(blockSize);

    cudaEvent_t start, stop;
    check(cudaEventCreate(&start), "cudaEventCreate(start)");
    check(cudaEventCreate(&stop), "cudaEventCreate(stop)");
    check(cudaEventRecord(start), "cudaEventRecord(start)");
    reduce_v0<blockSize><<<Grid,Block>>>(d_a, d_out);
    // Kernel launches return nothing; configuration errors surface here.
    check(cudaGetLastError(), "reduce_v0 launch");
    check(cudaEventRecord(stop), "cudaEventRecord(stop)");
    check(cudaEventSynchronize(stop), "cudaEventSynchronize");
    check(cudaEventElapsedTime(&milliseconds, start, stop), "cudaEventElapsedTime");

    check(cudaMemcpy(out, d_out, GridSize * sizeof(float), cudaMemcpyDeviceToHost), "cudaMemcpy(D2H)");
    printf("allcated %d blocks, data counts are %d", GridSize, N);
    bool is_right = CheckResult(out, groudtruth, GridSize);
    if(is_right) {
        printf("the ans is right\n");
    } else {
        printf("the ans is wrong\n");
        printf("groudtruth is: %f \n", groudtruth);
    }
    printf("reduce_v0 latency = %f ms\n", milliseconds);

    // Release events as well as memory.
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cudaFree(d_a);
    cudaFree(d_out);
    free(a);
    free(out);
    return 0;
}

notes:

(1)每个block享有一个独立的shared memory,所以在cuda kernel的函数中,数据的移动是smem[tid] = d_in[gtid];

(2)在shared memory进行的运算后都需要__syncthreads();

(3)在if-else的逻辑代码中,warp内的线程可能出现warp divergence,即一个warp内有些threads符合if条件、另一些threads符合else条件,此时两条分支会被依次串行执行。

对于independent thread scheduling的解释:p32

..

..

..


前面这个版本的计算代码性能较慢,下面是优化后的代码:

cpp 复制代码
#include <bits/stdc++.h>
#include <cuda.h>
#include "cuda_runtime.h"

// v1新版本: 用位运算替换除余操作
// latency: 2.825ms
// blockSize作为模板参数的效果主要用于静态shared memory的申请需要传入编译期常量指定大小(L120)
// v1: same algorithm as v0, but the "is this thread an accumulator?" test uses a
// bit mask instead of the modulo operator.
// Each thread loads one float into shared memory; the block folds the values
// pairwise at distance 1, 2, 4, ... and thread 0 writes the block's sum to
// d_out[blockIdx.x]. blockSize is a template parameter because the static shared
// array needs a compile-time size. The kernel performs no bounds check on d_in.
template<int blockSize>
__global__ void reduce_v1(float *d_in,float *d_out){
    __shared__ float smem[blockSize];
    const int tid = threadIdx.x;                            // index inside the block
    const int gtid = threadIdx.x + blockIdx.x * blockSize;  // index across all blocks

    // Load: one element per thread, then wait until the whole block has stored.
    smem[tid] = d_in[gtid];
    __syncthreads();

    for (int stride = 1; stride < blockDim.x; stride <<= 1) {
        // 2*stride - 1 has all low bits set, so (tid & mask) extracts the low bits
        // of tid; they are all zero exactly when tid is a multiple of 2*stride.
        const int mask = (stride << 1) - 1;
        if (!(tid & mask)) {
            smem[tid] += smem[tid + stride];
        }
        __syncthreads();
    }

    // Store: thread 0 writes this block's partial sum.
    if (tid == 0) {
        d_out[blockIdx.x] = smem[0];
    }
}
// Sums the first n entries of `out` in index order (float accumulation) and returns
// whether that sum compares equal to groudtruth.
bool CheckResult(float *out, float groudtruth, int n){
    float acc = 0;
    for (int k = 0; k != n && k < n; ++k) {
        acc += out[k];
    }
    const bool matches = !(acc != groudtruth);
    return matches;
}

// Host driver for reduce_v1: fills N floats with 1.0f, launches one 256-thread block
// per 256 elements, times the kernel with CUDA events and checks that the per-block
// partial sums add up to N. Every CUDA runtime call is checked.
int main(){
    // Abort with a readable message as soon as a CUDA runtime call fails.
    auto check = [](cudaError_t err, const char *what) {
        if (err != cudaSuccess) {
            fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(err));
            exit(EXIT_FAILURE);
        }
    };

    float milliseconds = 0;
    const int N = 25600000;
    check(cudaSetDevice(0), "cudaSetDevice");
    cudaDeviceProp deviceProp;
    check(cudaGetDeviceProperties(&deviceProp, 0), "cudaGetDeviceProperties");
    const int blockSize = 256;
    // One block per 256 input elements, capped at the device's grid-size limit.
    int GridSize = std::min((N + 256 - 1) / 256, deviceProp.maxGridSize[0]);

    float *a = (float *)malloc(N * sizeof(float));
    float *out = (float*)malloc((GridSize) * sizeof(float));
    if (a == NULL || out == NULL) {
        fprintf(stderr, "host allocation failed\n");
        return EXIT_FAILURE;
    }
    float *d_a;
    check(cudaMalloc((void **)&d_a, N * sizeof(float)), "cudaMalloc(d_a)");
    float *d_out;
    check(cudaMalloc((void **)&d_out, (GridSize) * sizeof(float)), "cudaMalloc(d_out)");

    for(int i = 0; i < N; i++){
        a[i] = 1.0f;
    }

    float groudtruth = N * 1.0f;

    check(cudaMemcpy(d_a, a, N * sizeof(float), cudaMemcpyHostToDevice), "cudaMemcpy(H2D)");

    dim3 Grid(GridSize);
    dim3 Block(blockSize);

    cudaEvent_t start, stop;
    check(cudaEventCreate(&start), "cudaEventCreate(start)");
    check(cudaEventCreate(&stop), "cudaEventCreate(stop)");
    check(cudaEventRecord(start), "cudaEventRecord(start)");
    reduce_v1<blockSize><<<Grid,Block>>>(d_a, d_out);
    // Kernel launches return nothing; configuration errors surface here.
    check(cudaGetLastError(), "reduce_v1 launch");
    check(cudaEventRecord(stop), "cudaEventRecord(stop)");
    check(cudaEventSynchronize(stop), "cudaEventSynchronize");
    check(cudaEventElapsedTime(&milliseconds, start, stop), "cudaEventElapsedTime");

    check(cudaMemcpy(out, d_out, GridSize * sizeof(float), cudaMemcpyDeviceToHost), "cudaMemcpy(D2H)");
    printf("allcated %d blocks, data counts are %d", GridSize, N);
    bool is_right = CheckResult(out, groudtruth, GridSize);
    if(is_right) {
        printf("the ans is right\n");
    } else {
        printf("the ans is wrong\n");
        printf("groudtruth is: %f \n", groudtruth);
    }
    // This program times reduce_v1; the label previously said reduce_v0.
    printf("reduce_v1 latency = %f ms\n", milliseconds);

    // Release events as well as memory.
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cudaFree(d_a);
    cudaFree(d_out);
    free(a);
    free(out);
    return 0;
}

主要优化除余这个操作,

除余:是一个很耗性能的操作,由很多条指令组成。常用位运算替代除余操作和除法操作。

在你之前的代码 if ((tid & (2 * index - 1)) == 0) 中,这个运算有一个非常特殊的用途:掩码 (Masking)

因为 (2 * index - 1) 这个数字很特殊,它的二进制全是 1

例如 index = 2 时,2*2 - 1 = 3,二进制是 ...00011

作用:提取低位

当你用任何数 tid 去和 3 (...00011) 做 & 运算时,高位全部被清零,只保留最后两位

为什么要用 & 而不是 %

  1. 速度快 :在计算机底层,位运算(AND)通常比除法/取模运算(%)快得多。虽然现代编译器会把 % 4 优化成位运算,但在 GPU 这种对指令敏感的架构上,显式写出位运算是一种习惯和最佳实践。
  2. 处理 2 的幂 :只要右边的数是 $2^n-1$(如 1, 3, 7, 15, 31...),对非负的 x,x & mask 就完美等价于 x % (mask + 1)

总结

  • 符号& 是按位与。
  • 操作:同位均为1则得1,否则得0。

reduce2: 消除shared memory bank conflict

什么是shared memory bank conflict?为什么会造成性能下降

答:如果一个warp的多个线程访问同一个bank的不同字段(注:同一个bank的不同字段如bank[0][0],bank[1][0],...,bank[n][0]),那么就发生了bank冲突。因为不同的bank可以同时访问,而如果多个线程请求的shared memory地址被映射到同一个bank上,那么这些请求就会变成串行的

1. 什么是 Bank Conflict?(通俗比喻)

想象共享内存(Shared Memory)不是一个巨大的仓库,而是由 32 个独立的窄通道(Bank) 组成的。

  • 规则 :每个 Bank 在同一时刻只能服务 1 个线程 的读写请求。
  • 理想情况 :如果 32 个线程(一个 Warp)分别去访问 32 个 不同 的 Bank,那么它们可以 同时 完成读取,速度最快(无冲突)。
  • 冲突情况 :如果 32 个线程中,有 2 个线程都要访问 同一个 Bank(比如都去访问第 5 号通道),那么第二个线程必须 排队等待 ,第一个读完它才能读。这就叫 Bank Conflict
    • 2 个线程争抢 = 2-way conflict(速度减半)
    • 32 个线程争抢 = 32-way conflict(速度变成串行的 1/32,性能崩塌)

地址映射规则

在大多数 GPU 上,连续的 float (4 bytes) 地址依次分配给 Bank 0, Bank 1, ..., Bank 31,然后循环。

  • 地址 0 -> Bank 0
  • 地址 1 -> Bank 1
  • ...
  • 地址 32 -> Bank 0 (又回到了 Bank 0,冲突隐患!)

2. 为什么普通的归约会有 Bank Conflict?

假设我们用最直观的 "相邻配对" 方式(Interleaved Addressing,即交错寻址)写归约:

  • 第一轮:线程 0 加 线程 1,线程 2 加 线程 3...

    • 线程 0 读 sdata[0] 和 sdata[1]
    • 线程 1 读 sdata[2] 和 sdata[3]
    • ...
    • 结果 :大家访问的地址是连续的 (0,1), (2,3)... 每个线程访问不同的 Bank。无冲突!
  • 第二轮(步长=2):线程 0 加 线程 2,线程 4 加 线程 6...

    • 活跃线程:0, 4, 8, 12...
    • 线程 0 读 sdata[0] 和 sdata[2]
    • 线程 4 读 sdata[4] 和 sdata[6]
    • ...
    • 结果:地址间隔为 2。只要间隔不是 32 的倍数,通常也没大问题。
  • 灾难轮次(步长=32, 64...):

    • 假设步长 stride = 32
    • 活跃线程:0, 64, 128... (假设 Block 很大)
    • 线程 0 需要读 sdata[0] 和 sdata[32]
    • 问题出现
      • sdata[0] 映射到 Bank 0
      • sdata[32] 也映射到 Bank 0 (因为 $32 \bmod 32 = 0$)。
    • 更糟的是 :如果在一个 Warp 内,线程 0 访问 sdata[0] 和 sdata[32],而线程 1 访问 sdata[1] 和 sdata[33]...
    • 这会导致所有线程都在争抢相同的几个 Bank,造成严重的 32-way Bank Conflict,性能急剧下降。
cpp 复制代码
#include <bits/stdc++.h>
#include <cuda.h>
#include "cuda_runtime.h"

// v2: 消除shared memory bank conflict
// latency: 2.300ms
// v2: shared-memory reduction that folds the block in halves.
// Each thread loads one float of d_in into shared memory. In every round the lower
// `half` threads add the element `half` slots above their own, and `half` is
// halved until it reaches 0; the article introduces this layout to avoid shared
// memory bank conflicts. Thread 0 finally writes smem[0] to d_out[blockIdx.x].
// The kernel performs no bounds check on d_in.
template<int blockSize>
__global__ void reduce_v2(float *d_in,float *d_out){
    __shared__ float smem[blockSize];
    const unsigned int tid = threadIdx.x;                            // index inside the block
    const unsigned int gtid = blockIdx.x * blockSize + threadIdx.x;  // index across all blocks

    // Load: one element per thread, then wait until the whole block has stored.
    smem[tid] = d_in[gtid];
    __syncthreads();

    unsigned int half = blockDim.x >> 1;
    while (half != 0) {
        if (tid < half) {
            smem[tid] += smem[tid + half];
        }
        // Every thread reaches the barrier, including the ones that did not add.
        __syncthreads();
        half /= 2;
    }

    // Store: thread 0 writes this block's partial sum.
    if (tid == 0) {
        d_out[blockIdx.x] = smem[0];
    }
}

// Accumulates out[0..n-1] in a float, front to back, and returns true only when the
// accumulated value is equal to groudtruth.
bool CheckResult(float *out, float groudtruth, int n){
    float sum = 0;
    for (int remaining = n, pos = 0; remaining > 0; --remaining, ++pos) {
        sum += out[pos];
    }
    return sum == groudtruth;
}

// Host driver for reduce_v2: fills N floats with 1.0f, launches one 256-thread block
// per 256 elements, times the kernel with CUDA events and checks that the per-block
// partial sums add up to N. Every CUDA runtime call is checked.
int main(){
    // Abort with a readable message as soon as a CUDA runtime call fails.
    auto check = [](cudaError_t err, const char *what) {
        if (err != cudaSuccess) {
            fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(err));
            exit(EXIT_FAILURE);
        }
    };

    float milliseconds = 0;
    const int N = 25600000;
    check(cudaSetDevice(0), "cudaSetDevice");
    cudaDeviceProp deviceProp;
    check(cudaGetDeviceProperties(&deviceProp, 0), "cudaGetDeviceProperties");
    const int blockSize = 256;
    // One block per 256 input elements, capped at the device's grid-size limit.
    int GridSize = std::min((N + 256 - 1) / 256, deviceProp.maxGridSize[0]);

    float *a = (float *)malloc(N * sizeof(float));
    float *out = (float*)malloc((GridSize) * sizeof(float));
    if (a == NULL || out == NULL) {
        fprintf(stderr, "host allocation failed\n");
        return EXIT_FAILURE;
    }
    float *d_a;
    check(cudaMalloc((void **)&d_a, N * sizeof(float)), "cudaMalloc(d_a)");
    float *d_out;
    check(cudaMalloc((void **)&d_out, (GridSize) * sizeof(float)), "cudaMalloc(d_out)");

    for(int i = 0; i < N; i++){
        a[i] = 1.0f;
    }

    float groudtruth = N * 1.0f;

    check(cudaMemcpy(d_a, a, N * sizeof(float), cudaMemcpyHostToDevice), "cudaMemcpy(H2D)");

    dim3 Grid(GridSize);
    dim3 Block(blockSize);

    cudaEvent_t start, stop;
    check(cudaEventCreate(&start), "cudaEventCreate(start)");
    check(cudaEventCreate(&stop), "cudaEventCreate(stop)");
    check(cudaEventRecord(start), "cudaEventRecord(start)");
    reduce_v2<blockSize><<<Grid,Block>>>(d_a, d_out);
    // Kernel launches return nothing; configuration errors surface here.
    check(cudaGetLastError(), "reduce_v2 launch");
    check(cudaEventRecord(stop), "cudaEventRecord(stop)");
    check(cudaEventSynchronize(stop), "cudaEventSynchronize");
    check(cudaEventElapsedTime(&milliseconds, start, stop), "cudaEventElapsedTime");

    check(cudaMemcpy(out, d_out, GridSize * sizeof(float), cudaMemcpyDeviceToHost), "cudaMemcpy(D2H)");
    printf("allcated %d blocks, data counts are %d", GridSize, N);
    bool is_right = CheckResult(out, groudtruth, GridSize);
    if(is_right) {
        printf("the ans is right\n");
    } else {
        printf("the ans is wrong\n");
        printf("groudtruth is: %f \n", groudtruth);
    }
    printf("reduce_v2 latency = %f ms\n", milliseconds);

    // Release events as well as memory.
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cudaFree(d_a);
    cudaFree(d_out);
    free(a);
    free(out);
    return 0;
}

这个 for 循环是 并行归约(Parallel Reduction) 算法的核心引擎。它控制着归约的轮次和步长。

让我们逐部分拆解:

1. 代码拆解

cpp

复制代码
for (unsigned int index = blockDim.x / 2; index > 0; index >>= 1)
  • 初始化 : unsigned int index = blockDim.x / 2

    • blockDim.x 是当前 Block 中的线程总数(例如 256, 512, 1024)。
    • index 代表当前的步长 (Stride),即两个需要相加的元素之间的距离。
    • 为什么从一半开始?
      • 第一轮归约是将数组"对半折叠"。
      • 如果有 256 个数据,我们需要让前 128 个线程分别去加后 128 个数据(data[i] + data[i+128])。
      • 所以初始步长必须是总线程数的一半。
  • 条件 : index > 0

    • 只要步长大于 0,就继续循环。
    • 当步长缩减到 0 时,说明所有数据已经汇聚到一个点(通常是 sdata[0]),循环结束。
  • 更新 : index >>= 1

    • 这是右移运算符 ,等价于 index = index / 2
    • 每一轮结束后,步长减半。
    • 序列: $N/2 \to N/4 \to N/8 \to \cdots \to 1$(其中 $N$ 为 blockDim.x)。

reduce3:让idle线程也干活

reduce2的最大问题就是线程的浪费。可以看到启动了256个线程,但是第一轮迭代时只有128个线程在干活,第二轮迭代的时候只有64个线程在干活,每轮干活的线程都减少一半。即使在第一轮迭代,也只有前128个线程干活,后面128个线程啥也没干。

cpp 复制代码
#include <bits/stdc++.h>
#include <cuda.h>
#include "cuda_runtime.h"

//v3: 让空闲线程也干活
//latency: 1.147ms
// v3: give otherwise idle threads work during the load.
// Each block covers 2*blockSize consecutive floats of d_in: thread tid adds the
// element at its own offset and the one blockSize further on, and stores that
// pair-sum in shared memory. For example, with blockSize = 2 and blockIdx.x = 1,
// thread 0 reads elements 4 and 6 and thread 1 reads elements 5 and 7.
// The block then folds shared memory in halves exactly as v2 does, and thread 0
// writes smem[0] to d_out[blockIdx.x]. The kernel performs no bounds check on d_in.
template<int blockSize>
__global__ void reduce_v3(float *d_in, float *d_out){
    __shared__ float smem[blockSize];
    const unsigned int tid = threadIdx.x;  // index inside the block
    // First of the two input elements owned by this thread.
    const unsigned int first = blockIdx.x * (blockSize * 2) + threadIdx.x;

    // Load: two global elements per thread, already summed.
    smem[tid] = d_in[first] + d_in[first + blockSize];
    __syncthreads();

    unsigned int half = blockDim.x >> 1;
    while (half != 0) {
        if (tid < half) {
            smem[tid] += smem[tid + half];
        }
        // Every thread reaches the barrier, including the ones that did not add.
        __syncthreads();
        half /= 2;
    }

    // Store: thread 0 writes this block's partial sum.
    if (tid == 0) {
        d_out[blockIdx.x] = smem[0];
    }
}

// Verifies a multi-block result: the n partial sums in `out` are added up in a
// float, in index order, and the total must equal groudtruth exactly.
bool CheckResult(float *out, float groudtruth, int n){
    float running = 0;
    int i = 0;
    for (; i < n; ) {
        running += out[i++];
    }
    return (running != groudtruth) ? false : true;
}

// Host driver for reduce_v3: fills N floats with 1.0f and launches one block per 256
// input elements, each block having 128 threads that load two elements apiece.
// Times the kernel with CUDA events and checks that the per-block partial sums add
// up to N. Every CUDA runtime call is checked.
int main(){
    // Abort with a readable message as soon as a CUDA runtime call fails.
    auto check = [](cudaError_t err, const char *what) {
        if (err != cudaSuccess) {
            fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(err));
            exit(EXIT_FAILURE);
        }
    };

    float milliseconds = 0;
    const int N = 25600000;
    check(cudaSetDevice(0), "cudaSetDevice");
    cudaDeviceProp deviceProp;
    check(cudaGetDeviceProperties(&deviceProp, 0), "cudaGetDeviceProperties");
    const int blockSize = 256;
    // One block per 256 input elements, capped at the device's grid-size limit.
    int GridSize = std::min((N + 256 - 1) / 256, deviceProp.maxGridSize[0]);

    float *a = (float *)malloc(N * sizeof(float));
    float *out = (float*)malloc((GridSize) * sizeof(float));
    if (a == NULL || out == NULL) {
        fprintf(stderr, "host allocation failed\n");
        return EXIT_FAILURE;
    }
    float *d_a;
    check(cudaMalloc((void **)&d_a, N * sizeof(float)), "cudaMalloc(d_a)");
    float *d_out;
    check(cudaMalloc((void **)&d_out, (GridSize) * sizeof(float)), "cudaMalloc(d_out)");

    for(int i = 0; i < N; i++){
        a[i] = 1.0f;
    }

    float groudtruth = N * 1.0f;

    check(cudaMemcpy(d_a, a, N * sizeof(float), cudaMemcpyHostToDevice), "cudaMemcpy(H2D)");

    dim3 Grid(GridSize);
    // Half as many threads as elements per block: each thread loads two elements.
    dim3 Block(blockSize / 2);

    cudaEvent_t start, stop;
    check(cudaEventCreate(&start), "cudaEventCreate(start)");
    check(cudaEventCreate(&stop), "cudaEventCreate(stop)");
    check(cudaEventRecord(start), "cudaEventRecord(start)");
    reduce_v3<blockSize / 2><<<Grid,Block>>>(d_a, d_out);
    // Kernel launches return nothing; configuration errors surface here.
    check(cudaGetLastError(), "reduce_v3 launch");
    check(cudaEventRecord(stop), "cudaEventRecord(stop)");
    check(cudaEventSynchronize(stop), "cudaEventSynchronize");
    check(cudaEventElapsedTime(&milliseconds, start, stop), "cudaEventElapsedTime");

    check(cudaMemcpy(out, d_out, GridSize * sizeof(float), cudaMemcpyDeviceToHost), "cudaMemcpy(D2H)");
    printf("allcated %d blocks, data counts are %d", GridSize, N);
    bool is_right = CheckResult(out, groudtruth, GridSize);
    if(is_right) {
        printf("the ans is right\n");
    } else {
        printf("the ans is wrong\n");
        printf("groudtruth is: %f \n", groudtruth);
    }
    printf("reduce_v3 latency = %f ms\n", milliseconds);

    // Release events as well as memory.
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cudaFree(d_a);
    cudaFree(d_out);
    free(a);
    free(out);
    return 0;
}

reduce4:展开for循环最后一个warp

当步长缩小到32及以下时,block中只有warp0还在干活,此时无需__syncthreads(只需warp内同步)。这条语句开销较大,因为block级同步比较耗时。

cpp 复制代码
#include <bits/stdc++.h>
#include <cuda.h>
#include "cuda_runtime.h"

//v4: 最后一个warp不用参与__syncthreads
//latency: 0.694ms
// Final-warp stage of the shared-memory reduction.
// Each calling thread repeatedly adds the element `offset` slots above its own
// (offset = 32 when blockDim.x >= 64, then 16, 8, 4, 2, 1) and writes the running
// sum back to smem[tid]. reduce_v4 calls it only for threads with tid < 32, after
// which it reads the block's result from smem[0].
//   smem: shared-memory buffer holding the partial sums; volatile-qualified, so
//         each access below is a real memory access rather than a cached value.
//   tid:  the calling thread's index inside the block.
__device__ void WarpSharedMemReduce(volatile float* smem, int tid){
    // Within a single `smem[tid] += smem[tid + 16]` step, one thread may write a
    // slot that another thread is reading (e.g. thread 0 reads smem[16] while
    // thread 16 updates smem[16]); without ordering, which of the two happens
    // first is undefined. Each step is therefore split: read into the register x,
    // __syncwarp(), write x back, __syncwarp(). The original author notes this is
    // needed from the Volta architecture onwards.
    float x = smem[tid];
    // The +32 step is only taken when the block has at least 64 threads.
    if (blockDim.x >= 64) {
      x += smem[tid + 32]; __syncwarp();
      smem[tid] = x; __syncwarp();
    }
    x += smem[tid + 16]; __syncwarp();
    smem[tid] = x; __syncwarp();
    x += smem[tid + 8]; __syncwarp();
    smem[tid] = x; __syncwarp();
    x += smem[tid + 4]; __syncwarp();
    smem[tid] = x; __syncwarp();
    x += smem[tid + 2]; __syncwarp();
    smem[tid] = x; __syncwarp();
    x += smem[tid + 1]; __syncwarp();
    smem[tid] = x; __syncwarp();
}
// Note: using blockSize as a template arg can benefit from NVCC compiler optimization, 
// which is better than using blockDim.x that is known in runtime.
// v4: like v3, but the last rounds (distance <= 32) are handed to
// WarpSharedMemReduce so they run without block-wide __syncthreads().
// Each block covers 2*blockSize consecutive floats of d_in: thread tid adds the
// element at its own offset and the one blockSize further on (e.g. blockSize = 2,
// blockIdx.x = 1: thread 0 reads elements 4 and 6, thread 1 reads 5 and 7).
// Thread 0 writes the block's sum to d_out[blockIdx.x]. No bounds check on d_in.
template<int blockSize>
__global__ void reduce_v4(float *d_in,float *d_out){
    __shared__ float smem[blockSize];
    const int tid = threadIdx.x;  // index inside the block
    // First of the two input elements owned by this thread.
    const int first = blockIdx.x * (blockSize * 2) + threadIdx.x;

    // Load: two global elements per thread, already summed.
    smem[tid] = d_in[first] + d_in[first + blockSize];
    __syncthreads();

    // Block-wide halving rounds, stopping before the distance drops to 32.
    int half = blockDim.x / 2;
    while (half > 32) {
        if (tid < half) {
            smem[tid] += smem[tid + half];
        }
        __syncthreads();
        half >>= 1;
    }

    // Remaining rounds are done by the first 32 threads only.
    if (tid < 32) {
        WarpSharedMemReduce(smem, tid);
    }

    // Store: thread 0 writes this block's partial sum.
    if (tid == 0) {
        d_out[blockIdx.x] = smem[0];
    }
}

// Host-side check: adds the n per-block results in `out` (float accumulation, index
// order) and returns whether the total equals groudtruth.
bool CheckResult(float *out, float groudtruth, int n){
    float total = 0;
    int idx = 0;
    while (idx < n) {
        total += out[idx];
        idx += 1;
    }
    if (total == groudtruth) {
        return true;
    }
    return false;
}

// Host driver for reduce_v4: fills N floats with 1.0f and launches one block per 256
// input elements, each block having 128 threads that load two elements apiece.
// Times the kernel with CUDA events and checks that the per-block partial sums add
// up to N. Every CUDA runtime call is checked.
int main(){
    // Abort with a readable message as soon as a CUDA runtime call fails.
    auto check = [](cudaError_t err, const char *what) {
        if (err != cudaSuccess) {
            fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(err));
            exit(EXIT_FAILURE);
        }
    };

    float milliseconds = 0;
    const int N = 25600000;
    check(cudaSetDevice(0), "cudaSetDevice");
    cudaDeviceProp deviceProp;
    check(cudaGetDeviceProperties(&deviceProp, 0), "cudaGetDeviceProperties");
    const int blockSize = 256;
    // One block per 256 input elements, capped at the device's grid-size limit.
    int GridSize = std::min((N + 256 - 1) / 256, deviceProp.maxGridSize[0]);

    float *a = (float *)malloc(N * sizeof(float));
    float *out = (float*)malloc((GridSize) * sizeof(float));
    if (a == NULL || out == NULL) {
        fprintf(stderr, "host allocation failed\n");
        return EXIT_FAILURE;
    }
    float *d_a;
    check(cudaMalloc((void **)&d_a, N * sizeof(float)), "cudaMalloc(d_a)");
    float *d_out;
    check(cudaMalloc((void **)&d_out, (GridSize) * sizeof(float)), "cudaMalloc(d_out)");

    for(int i = 0; i < N; i++){
        a[i] = 1.0f;
    }

    float groudtruth = N * 1.0f;

    check(cudaMemcpy(d_a, a, N * sizeof(float), cudaMemcpyHostToDevice), "cudaMemcpy(H2D)");

    dim3 Grid(GridSize);
    // Half as many threads as elements per block: each thread loads two elements.
    dim3 Block(blockSize / 2);

    cudaEvent_t start, stop;
    check(cudaEventCreate(&start), "cudaEventCreate(start)");
    check(cudaEventCreate(&stop), "cudaEventCreate(stop)");
    check(cudaEventRecord(start), "cudaEventRecord(start)");
    reduce_v4<blockSize / 2><<<Grid,Block>>>(d_a, d_out);
    // Kernel launches return nothing; configuration errors surface here.
    check(cudaGetLastError(), "reduce_v4 launch");
    check(cudaEventRecord(stop), "cudaEventRecord(stop)");
    check(cudaEventSynchronize(stop), "cudaEventSynchronize");
    check(cudaEventElapsedTime(&milliseconds, start, stop), "cudaEventElapsedTime");

    check(cudaMemcpy(out, d_out, GridSize * sizeof(float), cudaMemcpyDeviceToHost), "cudaMemcpy(D2H)");
    printf("allcated %d blocks, data counts are %d \n", GridSize, N);
    bool is_right = CheckResult(out, groudtruth, GridSize);
    if(is_right) {
        printf("the ans is right\n");
    } else {
        printf("the ans is wrong\n");
        for(int i = 0; i < GridSize;i++){
            printf("resPerBlock : %lf ",out[i]);
        }
        printf("\n");
        printf("groudtruth is: %f \n", groudtruth);
    }
    printf("reduce_v4 latency = %f ms\n", milliseconds);

    // Release events as well as memory.
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cudaFree(d_a);
    cudaFree(d_out);
    free(a);
    free(out);
    return 0;
}

。。


reduce5:完全展开for循环省掉for循环中的判断和加法操作

cpp 复制代码
#include <bits/stdc++.h>
#include <cuda.h>
#include "cuda_runtime.h"

#define THREAD_PER_BLOCK 256
// latency: 0.656ms
// v5:循环展开
// Fully unrolled block reduction over shared memory.
// Folds smem[0 .. blockSize-1] into smem[0]. The halving loop is written out as a
// chain of `if (blockSize >= ...)` steps; blockSize is a compile-time constant, so
// steps that do not apply are removed by the compiler.
//   blockSize: number of threads in the block. It must match the launch
//              configuration; the final stage assumes the first warp is fully
//              populated (at least 32 threads).
//   smem:      shared-memory buffer with one partial sum per thread. The caller
//              must have issued __syncthreads() after filling it.
template <int blockSize>
__device__ void BlockSharedMemReduce(float* smem) {
  const unsigned int tid = threadIdx.x;
  if (blockSize >= 1024) {
    if (tid < 512) {
      smem[tid] += smem[tid + 512];
    }
    __syncthreads();
  }
  if (blockSize >= 512) {
    if (tid < 256) {
      smem[tid] += smem[tid + 256];
    }
    __syncthreads();
  }
  if (blockSize >= 256) {
    if (tid < 128) {
      smem[tid] += smem[tid + 128];
    }
    __syncthreads();
  }
  if (blockSize >= 128) {
    if (tid < 64) {
      smem[tid] += smem[tid + 64];
    }
    __syncthreads();
  }
  // Final warp. On Volta and later, threads of a warp are not guaranteed to run in
  // lockstep, so `vshm[tid] += vshm[tid + k]` alone is a race: one lane may
  // overwrite a slot another lane has not read yet. Each step therefore reads into
  // a register, synchronises the warp, writes back and synchronises again.
  if (tid < 32) {
    volatile float* vshm = smem;
    float x = vshm[tid];
    // Uses the compile-time blockSize, like every other step in this function.
    if (blockSize >= 64) {
      x += vshm[tid + 32]; __syncwarp();
      vshm[tid] = x;       __syncwarp();
    }
    x += vshm[tid + 16]; __syncwarp();
    vshm[tid] = x;       __syncwarp();
    x += vshm[tid + 8];  __syncwarp();
    vshm[tid] = x;       __syncwarp();
    x += vshm[tid + 4];  __syncwarp();
    vshm[tid] = x;       __syncwarp();
    x += vshm[tid + 2];  __syncwarp();
    vshm[tid] = x;       __syncwarp();
    x += vshm[tid + 1];  __syncwarp();
    vshm[tid] = x;       __syncwarp();
  }
}

// v5: v4 with the shared-memory reduction fully unrolled (BlockSharedMemReduce).
// Each block covers 2*blockDim.x consecutive floats of d_in: thread tid adds the
// element at its own offset and the one blockDim.x further on (e.g. with 2 threads
// per block and blockIdx.x = 1, thread 0 reads elements 4 and 6, thread 1 reads 5
// and 7). Thread 0 writes the block's sum to d_out[blockIdx.x].
//   blockSize: threads per block; must equal blockDim.x of the launch.
// The kernel performs no bounds check on d_in.
template <int blockSize>
__global__ void reduce_v5(float *d_in, float *d_out){
    // Sized by the template parameter rather than a fixed 256-entry macro, so
    // instantiations with more than 256 threads no longer overflow the buffer.
    __shared__ float smem[blockSize];
    const unsigned int tid = threadIdx.x;  // index inside the block
    // First of the two input elements owned by this thread.
    const unsigned int i = blockIdx.x * (blockDim.x * 2) + threadIdx.x;

    // Load: two global elements per thread, already summed.
    smem[tid] = d_in[i] + d_in[i + blockDim.x];
    __syncthreads();

    // Compute: reduce the block's partial sums in shared memory.
    BlockSharedMemReduce<blockSize>(smem);

    // Store: thread 0 writes this block's partial sum.
    if (tid == 0) {
        d_out[blockIdx.x] = smem[0];
    }
}

// Adds up out[0..n-1] in a float (index order) and returns true when the total is
// equal to groudtruth, false otherwise.
bool CheckResult(float *out, float groudtruth, int n){
    float sum = 0;
    for (int k = 0; k < n; k += 1) {
        sum = sum + out[k];
    }
    return !(sum != groudtruth);
}

// Host driver for reduce_v5: fills N floats with 1.0f and launches one block per 256
// input elements, each block having 128 threads that load two elements apiece.
// Times the kernel with CUDA events and checks that the per-block partial sums add
// up to N. Every CUDA runtime call is checked.
int main(){
    // Abort with a readable message as soon as a CUDA runtime call fails.
    auto check = [](cudaError_t err, const char *what) {
        if (err != cudaSuccess) {
            fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(err));
            exit(EXIT_FAILURE);
        }
    };

    float milliseconds = 0;

    const int N = 25600000;
    check(cudaSetDevice(0), "cudaSetDevice");
    cudaDeviceProp deviceProp;
    check(cudaGetDeviceProperties(&deviceProp, 0), "cudaGetDeviceProperties");
    const int blockSize = 256;
    // One block per 256 input elements, capped at the device's grid-size limit.
    int GridSize = std::min((N + 256 - 1) / 256, deviceProp.maxGridSize[0]);

    float *a = (float *)malloc(N * sizeof(float));
    float *out = (float*)malloc((GridSize) * sizeof(float));
    if (a == NULL || out == NULL) {
        fprintf(stderr, "host allocation failed\n");
        return EXIT_FAILURE;
    }
    float *d_a;
    check(cudaMalloc((void **)&d_a, N * sizeof(float)), "cudaMalloc(d_a)");
    float *d_out;
    check(cudaMalloc((void **)&d_out, (GridSize) * sizeof(float)), "cudaMalloc(d_out)");

    for(int i = 0; i < N; i++){
        a[i] = 1.0f;
    }

    float groudtruth = N * 1.0f;

    check(cudaMemcpy(d_a, a, N * sizeof(float), cudaMemcpyHostToDevice), "cudaMemcpy(H2D)");

    dim3 Grid(GridSize);
    // Half as many threads as elements per block: each thread loads two elements.
    dim3 Block(blockSize / 2);

    cudaEvent_t start, stop;
    check(cudaEventCreate(&start), "cudaEventCreate(start)");
    check(cudaEventCreate(&stop), "cudaEventCreate(stop)");
    check(cudaEventRecord(start), "cudaEventRecord(start)");
    reduce_v5<blockSize / 2><<<Grid,Block>>>(d_a, d_out);
    // Kernel launches return nothing; configuration errors surface here.
    check(cudaGetLastError(), "reduce_v5 launch");
    check(cudaEventRecord(stop), "cudaEventRecord(stop)");
    check(cudaEventSynchronize(stop), "cudaEventSynchronize");
    check(cudaEventElapsedTime(&milliseconds, start, stop), "cudaEventElapsedTime");

    check(cudaMemcpy(out, d_out, GridSize * sizeof(float), cudaMemcpyDeviceToHost), "cudaMemcpy(D2H)");
    printf("allcated %d blocks, data counts are %d \n", GridSize, N);
    bool is_right = CheckResult(out, groudtruth, GridSize);
    if(is_right) {
        printf("the ans is right\n");
    } else {
        printf("the ans is wrong\n");
        for(int i = 0; i < GridSize;i++){
            printf("resPerBlock : %lf ",out[i]);
        }
        printf("\n");
        printf("groudtruth is: %f \n", groudtruth);
    }
    printf("reduce_v5 latency = %f ms\n", milliseconds);

    // Release events as well as memory.
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cudaFree(d_a);
    cudaFree(d_out);
    free(a);
    free(out);
    return 0;
}

reduce6:一个block/thread处理多个元素,基于gridsize step的loop舍弃掉之前的reduce3方案中的让一个线程干了两份活

好处:

(1)更灵活:可以handle大于你启动的线程数量的problem size

(2) 复用线程:减少线程创建和开销

(3)方便debug:设置block数量和thread数量为1时,此时为串行程序,便于debug

cpp 复制代码
#include <bits/stdc++.h>
#include <cuda.h>
#include "cuda_runtime.h"
// 注意: v0-v5里面kernel得到的是各个block负责范围内的总和,要想得到最终的和,需要把各个block求得的总和再做reduce sum
// v6: multi-block reduce final result by two pass
// latency: 1.815ms
// Fully unrolled block reduction over shared memory.
// Folds smem[0 .. blockSize-1] into smem[0]. The halving loop is written out as a
// chain of `if (blockSize >= ...)` steps; blockSize is a compile-time constant, so
// steps that do not apply are removed by the compiler.
//   blockSize: number of threads in the block. It must match the launch
//              configuration; the final stage assumes the first warp is fully
//              populated (at least 32 threads).
//   smem:      shared-memory buffer with one partial sum per thread. The caller
//              must have issued __syncthreads() after filling it.
template <int blockSize>
__device__ void BlockSharedMemReduce(float* smem) {
  const unsigned int tid = threadIdx.x;
  if (blockSize >= 1024) {
    if (tid < 512) {
      smem[tid] += smem[tid + 512];
    }
    __syncthreads();
  }
  if (blockSize >= 512) {
    if (tid < 256) {
      smem[tid] += smem[tid + 256];
    }
    __syncthreads();
  }
  if (blockSize >= 256) {
    if (tid < 128) {
      smem[tid] += smem[tid + 128];
    }
    __syncthreads();
  }
  if (blockSize >= 128) {
    if (tid < 64) {
      smem[tid] += smem[tid + 64];
    }
    __syncthreads();
  }
  // Final warp. On Volta and later, threads of a warp are not guaranteed to run in
  // lockstep, so `vshm[tid] += vshm[tid + k]` alone is a race: one lane may
  // overwrite a slot another lane has not read yet. Each step therefore reads into
  // a register, synchronises the warp, writes back and synchronises again.
  if (tid < 32) {
    volatile float* vshm = smem;
    float x = vshm[tid];
    // Uses the compile-time blockSize, like every other step in this function.
    if (blockSize >= 64) {
      x += vshm[tid + 32]; __syncwarp();
      vshm[tid] = x;       __syncwarp();
    }
    x += vshm[tid + 16]; __syncwarp();
    vshm[tid] = x;       __syncwarp();
    x += vshm[tid + 8];  __syncwarp();
    vshm[tid] = x;       __syncwarp();
    x += vshm[tid + 4];  __syncwarp();
    vshm[tid] = x;       __syncwarp();
    x += vshm[tid + 2];  __syncwarp();
    vshm[tid] = x;       __syncwarp();
    x += vshm[tid + 1];  __syncwarp();
    vshm[tid] = x;       __syncwarp();
  }
}

// v6: each thread accumulates as many input elements as needed, then the block
// reduces the per-thread sums in shared memory.
// A thread starts at its global index and advances by the total number of launched
// threads (blockDim.x * gridDim.x) until it passes `nums`, so any launch
// configuration covers the whole input. Thread 0 writes the block's sum to
// d_out[blockIdx.x]. Calling the kernel a second time with one block over the
// partial sums yields the final total (two-pass reduce).
//   blockSize: threads per block; must equal blockDim.x of the launch.
//   d_in:      input values, at least `nums` floats.
//   d_out:     one float per block.
//   nums:      number of valid elements in d_in; values <= 0 read nothing.
template <int blockSize>
__global__ void reduce_v6(float *d_in, float *d_out, int nums){
    __shared__ float smem[blockSize];
    const unsigned int tid = threadIdx.x;  // index inside the block

    // Index arithmetic is done in size_t: the previous int32_t loop counter was
    // stepped with unsigned arithmetic and could wrap to a negative index when
    // nums or the grid is large.
    const size_t count = nums > 0 ? static_cast<size_t>(nums) : 0;
    const size_t total_thread_num = static_cast<size_t>(blockDim.x) * gridDim.x;
    const size_t gtid = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;

    // Per-thread partial sum over every element this thread owns.
    float sum = 0.0f;
    for (size_t i = gtid; i < count; i += total_thread_num) {
        sum += d_in[i];
    }
    smem[tid] = sum;
    __syncthreads();

    // Compute: reduce the block's partial sums in shared memory.
    BlockSharedMemReduce<blockSize>(smem);

    // Store: thread 0 writes this block's partial sum.
    if (tid == 0) {
        d_out[blockIdx.x] = smem[0];
    }
}

// Returns true when the single float at *out is equal to groudtruth.
// `n` is accepted for signature compatibility but is not read.
bool CheckResult(float *out, float groudtruth, int n){
    return out[0] == groudtruth;
}

// Host driver for reduce_v6 (two-pass reduce): fills N floats with 1, runs the
// kernel once over the input to get one partial sum per block, then once more with
// a single block over those partial sums to get the final total. Both launches are
// timed together with CUDA events and the result is compared with N.
// Every CUDA runtime call is checked.
int main(){
    // Abort with a readable message as soon as a CUDA runtime call fails.
    auto check = [](cudaError_t err, const char *what) {
        if (err != cudaSuccess) {
            fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(err));
            exit(EXIT_FAILURE);
        }
    };

    cudaDeviceProp deviceProp;
    check(cudaGetDeviceProperties(&deviceProp, 0), "cudaGetDeviceProperties");
    int maxblocks = deviceProp.maxGridSize[0];
    const int blockSize = 256;
    const int N = 25600000;
    // One block per blockSize elements, capped at the device's grid-size limit.
    int gridSize = std::min((N + blockSize - 1) / blockSize, maxblocks);

    float milliseconds = 0;
    float *a = (float *)malloc(N * sizeof(float));
    float *out = (float*)malloc((gridSize) * sizeof(float));
    if (a == NULL || out == NULL) {
        fprintf(stderr, "host allocation failed\n");
        return EXIT_FAILURE;
    }
    float *d_a;
    check(cudaMalloc((void **)&d_a,N * sizeof(float)), "cudaMalloc(d_a)");

    float *d_out;     // final total (one float)
    float *part_out;  // per-block partial sums produced by the first pass
    check(cudaMalloc((void **)&d_out, 1 * sizeof(float)), "cudaMalloc(d_out)");
    check(cudaMalloc((void **)&part_out, (gridSize) * sizeof(float)), "cudaMalloc(part_out)");
    float groudtruth = N;

    for(int i = 0; i < N; i++){
        a[i] = 1;
    }

    check(cudaMemcpy(d_a, a, N * sizeof(float), cudaMemcpyHostToDevice), "cudaMemcpy(H2D)");

    dim3 Grid(gridSize);
    dim3 Block(blockSize);

    cudaEvent_t start, stop;
    check(cudaEventCreate(&start), "cudaEventCreate(start)");
    check(cudaEventCreate(&stop), "cudaEventCreate(stop)");
    check(cudaEventRecord(start), "cudaEventRecord(start)");
    // Pass 1: input -> one partial sum per block.
    reduce_v6<blockSize><<<Grid, Block>>>(d_a, part_out, N);
    check(cudaGetLastError(), "reduce_v6 launch (pass 1)");
    // Pass 2: a single block folds the partial sums into one value.
    reduce_v6<blockSize><<<1, Block>>>(part_out, d_out, gridSize);
    check(cudaGetLastError(), "reduce_v6 launch (pass 2)");
    check(cudaEventRecord(stop), "cudaEventRecord(stop)");
    check(cudaEventSynchronize(stop), "cudaEventSynchronize");
    check(cudaEventElapsedTime(&milliseconds, start, stop), "cudaEventElapsedTime");

    check(cudaMemcpy(out, d_out, 1 * sizeof(float), cudaMemcpyDeviceToHost), "cudaMemcpy(D2H)");
    bool is_right = CheckResult(out, groudtruth, 1);
    if(is_right) {
        printf("the ans is right\n");
    } else {
        printf("the ans is wrong\n");
        for(int i = 0;i < 1;i++){
            printf("%lf ",out[i]);
        }
        printf("\n");
    }
    printf("reduce_v6 latency = %f ms\n", milliseconds);

    // Release events as well as memory.
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cudaFree(d_a);
    cudaFree(d_out);
    cudaFree(part_out);
    free(a);
    free(out);
    return 0;
}

。。

。。

。。

。。

相关推荐
无忧智库2 小时前
破局大模型“语料荒”:国家级高质量中文多模态语料库处理平台的深度解构与实战指南(WORD)
大数据·人工智能
大大大大晴天2 小时前
Hudi生产问题排障-insert overwrite 路径不存在
大数据·spark
综合热讯2 小时前
香港启世集团宣布启动核聚变能源研究计划
大数据·人工智能·能源
数字化顾问2 小时前
(85页PPT)麦肯锡XX集团财务管理体系构建咨询规划报告(附下载方式)
大数据·人工智能
Gain_chance2 小时前
Flume01:大数据日志收集与传输利器
大数据·数据仓库·flume
zandy10112 小时前
告别指标混乱:衡石指标中台如何通过“原子化指标+语义层”统一企业数据语言
大数据·指标中台
冯RI375II694872 小时前
欧盟EU 10/2011与LFGB的差异对比
大数据
cramer_50h3 小时前
更新-常用的Flask第三方扩展库清单合集教程和详细的代码示例
大数据
rainy雨3 小时前
六西格玛改进系统的全流程功能:传统企业转型中如何用六西格玛解决成本失控与交付延期的双重难题
大数据·人工智能·精益工程