- reduce的定义
给N个数值,对它们做累计的算术操作,例如求出其总和、最大值、最小值、均值、异或这一类的操作,称为reduce
2.解决方案
(1)Two-Pass:启动两次kernel
Baseline:一个线程做reduce
reduce0:引入shared memory且并行化reduce算法
reduce1:除余替换为位运算
reduce2:消除shared memory bank conflict
reduce3:尽可能让更多的线程都在干活
reduce4:展开for循环最后一个warp
reduce5:完全展开for循环
reduce6:一个线程干多个活
reduce7:提升GPU并行度,处理更多数据,分配多个block作reduce
reduce8:warp shuffle
(2)single-Pass:启动一次kernel
baseline:一个线程做reduce/cpu写法
cpp
#include <bits/stdc++.h>
#include <cuda.h>
#include "cuda_runtime.h"
// 注: 每个cuda程序(.cu文件)的main函数功能大同小异,后面不会每个cu文件都注释main函数逻辑
//999ms
// Serial baseline: launched with <<<1, 1>>>, so one single thread walks the
// entire input array, accumulates the sum in a register, and writes the
// result back to global memory. Equivalent to a CPU loop running on the GPU.
__global__ void reduce_baseline(const int* input, int* output, size_t n) {
    int acc = 0;
    // single-thread accumulation over all n elements
    for (size_t idx = 0; idx != n; ++idx) {
        acc += input[idx];
    }
    // publish the final sum to device memory
    *output = acc;
}
// Returns true iff the single reduce result stored in `out` equals
// `groudtruth`. `n` is unused: the baseline kernel produces exactly one value.
bool CheckResult(int *out, int groudtruth, int n){
    return *out == groudtruth;
}
// Host driver: allocates and initializes data, launches the serial baseline
// kernel with 1 block x 1 thread, times it with CUDA events, and checks the
// result against N (every input element is 1).
int main(){
    float milliseconds = 0;
    const int N = 25600000;
    cudaSetDevice(0);
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, 0);
    // baseline runs fully serial: one block, one thread
    const int blockSize = 1;
    int GridSize = 1;
    // allocate host/device buffers and fill the input with ones
    int *a = (int *)malloc(N * sizeof(int));
    int *d_a;
    cudaMalloc((void **)&d_a, N * sizeof(int));
    int *out = (int*)malloc((GridSize) * sizeof(int));
    int *d_out;
    cudaMalloc((void **)&d_out, (GridSize) * sizeof(int));
    for(int i = 0; i < N; i++){
        a[i] = 1;
    }
    int groudtruth = N * 1;
    // copy the initialized input to the GPU
    cudaMemcpy(d_a, a, N * sizeof(int), cudaMemcpyHostToDevice);
    dim3 Grid(GridSize);
    dim3 Block(blockSize);
    // time the kernel with CUDA events
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start);
    reduce_baseline<<<1, 1>>>(d_a, d_out, N);
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&milliseconds, start, stop);
    // copy the result back and verify it
    cudaMemcpy(out, d_out, GridSize * sizeof(int), cudaMemcpyDeviceToHost);
    printf("allcated %d blocks, data counts are %d", GridSize, N);
    bool is_right = CheckResult(out, groudtruth, GridSize);
    if(is_right) {
        printf("the ans is right\n");
    } else {
        printf("the ans is wrong\n");
        for(int i = 0; i < GridSize;i++){
            // fix: out[i] is int, so %d (the original %lf was undefined behavior)
            printf("res per block : %d ",out[i]);
        }
        printf("\n");
        // fix: groudtruth is int, so %d (the original %f was undefined behavior)
        printf("groudtruth is: %d \n", groudtruth);
    }
    printf("reduce_baseline latency = %f ms\n", milliseconds);
    // fix: destroy timing events (the original leaked them), then free memory
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cudaFree(d_a);
    cudaFree(d_out);
    free(a);
    free(out);
}
reduce0:引入shared_memory且并行化reduce算法
(1)将数据load至shared memory中;
(2)在shared_memory中对数据进行reduce操作
(3)将最后的结果写回global memory中
cpp
#include <bits/stdc++.h>
#include <cuda.h>
#include "cuda_runtime.h"
// v0: naive版本
// latency: 3.835ms
// blockSize作为模板参数的效果主要用于静态shared memory的申请需要传入编译期常量指定大小(L10)
template<int blockSize>
__global__ void reduce_v0(float *d_in,float *d_out){
// v0: naive interleaved-addressing block reduce. Each block sums blockSize
// consecutive floats of d_in into d_out[blockIdx.x]. blockSize is a template
// parameter because static shared memory needs a compile-time constant size.
__shared__ float smem[blockSize];
// thread id within this block
int tid = threadIdx.x;
// global thread id across all blocks
int gtid = blockIdx.x * blockSize + threadIdx.x;
// load: each thread copies one element into its shared-memory slot
// NOTE(review): no bounds check on gtid — assumes the input length is an
// exact multiple of blockSize (true for the N used in main); confirm before reuse
smem[tid] = d_in[gtid];
// barrier between the shared-memory write above and the reads below
__syncthreads();
// interleaved addressing: each active thread adds the element `index` slots
// away, doubling the stride each round until it reaches the block size;
// the block's partial sum ends up in smem[0]
for(int index = 1; index < blockDim.x; index *= 2) {
// Note: there is no warp divergence into an else branch here; the main cost
// is the modulo below — `%` expands to many instructions. Replacing the test
// with `if ((tid & (2 * index - 1)) == 0)` gains roughly 30%~50% (see v1).
if (tid % (2 * index) == 0) {
smem[tid] += smem[tid + index];
}
__syncthreads();
}
// store: thread 0 writes this block's partial sum back to global memory
if (tid == 0) {
d_out[blockIdx.x] = smem[0];
}
}
// Verifies a reduce result: sums the n per-block partial results in `out`
// and compares against `groudtruth`.
// Fix: use a relative+absolute tolerance instead of exact float `!=` — GPU
// accumulation order need not match the reference bit-for-bit. Accumulate in
// double to limit host-side rounding.
bool CheckResult(float *out, float groudtruth, int n){
    double res = 0.0;
    for (int i = 0; i < n; i++){
        res += out[i];
    }
    double tol = 1e-6 * fabs((double)groudtruth) + 1e-6;
    return fabs(res - (double)groudtruth) <= tol;
}
// Host driver for reduce_v0: one thread per element, one partial sum per
// block; the partial sums are verified by summing them on the host.
int main(){
    float milliseconds = 0;
    const int N = 25600000;
    cudaSetDevice(0);
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, 0);
    const int blockSize = 256;
    // one thread per element, capped by the device's max grid dimension
    int GridSize = std::min((N + 256 - 1) / 256, deviceProp.maxGridSize[0]);
    // allocate host/device buffers and fill the input with ones
    float *a = (float *)malloc(N * sizeof(float));
    float *d_a;
    cudaMalloc((void **)&d_a, N * sizeof(float));
    float *out = (float*)malloc((GridSize) * sizeof(float));
    float *d_out;
    cudaMalloc((void **)&d_out, (GridSize) * sizeof(float));
    for(int i = 0; i < N; i++){
        a[i] = 1.0f;
    }
    float groudtruth = N * 1.0f;
    cudaMemcpy(d_a, a, N * sizeof(float), cudaMemcpyHostToDevice);
    dim3 Grid(GridSize);
    dim3 Block(blockSize);
    // time the kernel with CUDA events
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start);
    reduce_v0<blockSize><<<Grid,Block>>>(d_a, d_out);
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&milliseconds, start, stop);
    // copy per-block partial sums back and verify their total
    cudaMemcpy(out, d_out, GridSize * sizeof(float), cudaMemcpyDeviceToHost);
    printf("allcated %d blocks, data counts are %d", GridSize, N);
    bool is_right = CheckResult(out, groudtruth, GridSize);
    if(is_right) {
        printf("the ans is right\n");
    } else {
        printf("the ans is wrong\n");
        printf("groudtruth is: %f \n", groudtruth);
    }
    printf("reduce_v0 latency = %f ms\n", milliseconds);
    // fix: destroy timing events (the original leaked them), then free memory
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cudaFree(d_a);
    cudaFree(d_out);
    free(a);
    free(out);
}
notes:
(1)每个block享有一个独立的shared memory,所以在cuda kernel的函数中,数据的移动是smem[tid] = d_in[gtid];
(2)在shared memory进行的运算后都需要__syncthreads();
(3)在if-else的逻辑代码中,warp内的线程可能出现warp divergence,即一个warp内有些threads符合if条件、有些threads符合else条件(注:v0中没有else分支,是否构成divergence需再确认)
对于independent thread scheduling的解释:p32
..
..
..
前面这个版本的计算代码性能较慢,下面是优化后的代码:
cpp
#include <bits/stdc++.h>
#include <cuda.h>
#include "cuda_runtime.h"
// v1新版本: 用位运算替换除余操作
// latency: 2.825ms
// blockSize作为模板参数的效果主要用于静态shared memory的申请需要传入编译期常量指定大小(L120)
template<int blockSize>
__global__ void reduce_v1(float *d_in,float *d_out){
// v1: same interleaved-addressing reduce as v0, but the expensive modulo
// test is replaced by a bitwise-AND mask: 2*index is a power of two, so
// (tid & (2*index - 1)) == 0 is equivalent to tid % (2*index) == 0.
// thread id within this block
int tid = threadIdx.x;
// global thread id across all blocks
int gtid = threadIdx.x + blockIdx.x * blockSize;
// load: each thread copies one element into its shared-memory slot
__shared__ float smem[blockSize];
smem[tid] = d_in[gtid];
// barrier so every shared-memory write lands before any thread reads below
__syncthreads();
for(int index = 1; index < blockDim.x; index *= 2) {
// same algorithm as v0; only the selection test changed.
// `&` is bitwise AND: a result bit is 1 iff both operand bits are 1.
if ((tid & (2 * index - 1)) == 0){
smem[tid] += smem[tid + index];
}
__syncthreads();
}
// each of the GridSize blocks writes its partial sum to d_out[blockIdx.x]
if(tid == 0) {
d_out[blockIdx.x] = smem[0];
}
}
// Verifies a reduce result: sums the n per-block partial results in `out`
// and compares against `groudtruth`.
// Fix: use a relative+absolute tolerance instead of exact float `!=` — GPU
// accumulation order need not match the reference bit-for-bit. Accumulate in
// double to limit host-side rounding.
bool CheckResult(float *out, float groudtruth, int n){
    double res = 0.0;
    for (int i = 0; i < n; i++){
        res += out[i];
    }
    double tol = 1e-6 * fabs((double)groudtruth) + 1e-6;
    return fabs(res - (double)groudtruth) <= tol;
}
// Host driver for reduce_v1: identical setup to v0; only the kernel differs.
int main(){
    float milliseconds = 0;
    const int N = 25600000;
    cudaSetDevice(0);
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, 0);
    const int blockSize = 256;
    // one thread per element, capped by the device's max grid dimension
    int GridSize = std::min((N + 256 - 1) / 256, deviceProp.maxGridSize[0]);
    // allocate host/device buffers and fill the input with ones
    float *a = (float *)malloc(N * sizeof(float));
    float *d_a;
    cudaMalloc((void **)&d_a, N * sizeof(float));
    float *out = (float*)malloc((GridSize) * sizeof(float));
    float *d_out;
    cudaMalloc((void **)&d_out, (GridSize) * sizeof(float));
    for(int i = 0; i < N; i++){
        a[i] = 1.0f;
    }
    float groudtruth = N * 1.0f;
    cudaMemcpy(d_a, a, N * sizeof(float), cudaMemcpyHostToDevice);
    dim3 Grid(GridSize);
    dim3 Block(blockSize);
    // time the kernel with CUDA events
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start);
    reduce_v1<blockSize><<<Grid,Block>>>(d_a, d_out);
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&milliseconds, start, stop);
    // copy per-block partial sums back and verify their total
    cudaMemcpy(out, d_out, GridSize * sizeof(float), cudaMemcpyDeviceToHost);
    printf("allcated %d blocks, data counts are %d", GridSize, N);
    bool is_right = CheckResult(out, groudtruth, GridSize);
    if(is_right) {
        printf("the ans is right\n");
    } else {
        printf("the ans is wrong\n");
        printf("groudtruth is: %f \n", groudtruth);
    }
    // fix: this is the v1 benchmark — the original printed "reduce_v0 latency"
    printf("reduce_v1 latency = %f ms\n", milliseconds);
    // fix: destroy timing events (the original leaked them), then free memory
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cudaFree(d_a);
    cudaFree(d_out);
    free(a);
    free(out);
}
主要优化除余这个操作,
除余是一个很耗性能的操作,由很多条指令组成,因此常用位运算替代除余和除法操作。
在你之前的代码 if ((tid & (2 * index - 1)) == 0) 中,这个运算有一个非常特殊的用途:掩码 (Masking)。
因为 (2 * index - 1) 这个数字很特殊,它的二进制全是 1 。
例如 index = 2 时,2*2 - 1 = 3,二进制是 ...00011。
作用:提取低位
当你用任何数 tid 去和 3 (...00011) 做 & 运算时,高位全部被清零,只保留最后两位。
为什么要用 & 而不是 %?
- 速度快 :在计算机底层,位运算(AND)通常比除法/取模运算(%)快得多。虽然现代编译器会把
% 4优化成位运算,但在 GPU 这种对指令敏感的架构上,显式写出位运算是一种习惯和最佳实践。 - 处理 2 的幂 :只要右边的数是 2^n − 1 (如 1, 3, 7, 15, 31...),
x & mask就完美等价于x % (mask + 1)。
总结
- 符号 :
&是按位与。 - 操作:同位均为1则得1,否则得0。
reduce2: 消除shared memory bank conflict
什么是shared memory bank conflict?为什么会造成性能下降
答:如果一个warp的多个线程访问同一个bank的不同字段(注:同一个bank的不同字段如bank[0][0],bank[1][0],...,bank[n][0]),那么就发生了bank冲突。因为不同的bank可以同时访问,而如果多个线程请求的shared memory地址被映射到同一个bank上,那么这些请求就会变成串行的
1. 什么是 Bank Conflict?(通俗比喻)
想象共享内存(Shared Memory)不是一个巨大的仓库,而是由 32 个独立的窄通道(Bank) 组成的。
- 规则 :每个 Bank 在同一时刻只能服务 1 个线程 的读写请求。
- 理想情况 :如果 32 个线程(一个 Warp)分别去访问 32 个 不同 的 Bank,那么它们可以 同时 完成读取,速度最快(无冲突)。
- 冲突情况 :如果 32 个线程中,有 2 个线程都要访问 同一个 Bank(比如都去访问第 5 号通道),那么第二个线程必须 排队等待 ,第一个读完它才能读。这就叫 Bank Conflict 。
- 2 个线程争抢 = 2-way conflict(速度减半)
- 32 个线程争抢 = 32-way conflict(速度变成串行的 1/32,性能崩塌)
地址映射规则 :
在大多数 GPU 上,连续的 float (4 bytes) 地址依次分配给 Bank 0, Bank 1, ..., Bank 31,然后循环。
- 地址 0 -> Bank 0
- 地址 1 -> Bank 1
- ...
- 地址 32 -> Bank 0 (又回到了 Bank 0,冲突隐患!)
2. 为什么普通的归约会有 Bank Conflict?
假设我们用最直观的 "相邻配对" 方式(Sequential Addressing)写归约:
-
第一轮:线程 0 加 线程 1,线程 2 加 线程 3...
- 线程 0 读
sdata[0]和sdata[1] - 线程 1 读
sdata[2]和sdata[3] - ...
- 结果 :大家访问的地址是连续的 (0,1), (2,3)... 每个线程访问不同的 Bank。无冲突! ✅
- 线程 0 读
-
第二轮(步长=2):线程 0 加 线程 2,线程 4 加 线程 6...
- 活跃线程:0, 4, 8, 12...
- 线程 0 读
sdata[0]和sdata[2] - 线程 4 读
sdata[4]和sdata[6] - ...
- 结果:地址间隔为 2。只要间隔不是 32 的倍数,通常也没大问题。
-
灾难轮次(步长=32, 64...):
- 假设步长
stride = 32。 - 活跃线程:0, 64, 128... (假设 Block 很大)
- 线程 0 需要读
sdata[0]和sdata[32]。 - 问题出现 :
sdata[0]映射到 Bank 0。sdata[32]也映射到 Bank 0 (因为 32 mod 32 = 0)。
- 更糟的是 :如果在一个 Warp 内,线程 0 访问
sdata[0]和sdata[32],而线程 1 访问sdata[1]和sdata[33]... - 这会导致所有线程都在争抢相同的几个 Bank,造成严重的 32-way Bank Conflict,性能急剧下降。
- 假设步长
cpp
#include <bits/stdc++.h>
#include <cuda.h>
#include "cuda_runtime.h"
// v2: 消除shared memory bank conflict
// latency: 2.300ms
template<int blockSize>
__global__ void reduce_v2(float *d_in,float *d_out){
// v2: sequential addressing. Instead of the interleaved pattern of v0/v1,
// each round adds the upper half of the live range onto the lower half
// (the stride halves each round), which keeps a warp's shared-memory
// accesses contiguous and avoids bank conflicts.
__shared__ float smem[blockSize];
// thread id within this block
unsigned int tid = threadIdx.x;
// global thread id across all blocks
unsigned int gtid = blockIdx.x * blockSize + threadIdx.x;
// load: each thread copies one element into its shared-memory slot
smem[tid] = d_in[gtid];
__syncthreads();
// fold the array in half repeatedly; the block's sum ends up in smem[0]
for (unsigned int index = blockDim.x / 2; index > 0; index >>= 1) {
if (tid < index) {
smem[tid] += smem[tid + index];
}
__syncthreads();
}
// store: thread 0 writes this block's partial sum back to global memory
if (tid == 0) {
d_out[blockIdx.x] = smem[0];
}
}
// Verifies a reduce result: sums the n per-block partial results in `out`
// and compares against `groudtruth`.
// Fix: use a relative+absolute tolerance instead of exact float `!=` — GPU
// accumulation order need not match the reference bit-for-bit. Accumulate in
// double to limit host-side rounding.
bool CheckResult(float *out, float groudtruth, int n){
    double res = 0.0;
    for (int i = 0; i < n; i++){
        res += out[i];
    }
    double tol = 1e-6 * fabs((double)groudtruth) + 1e-6;
    return fabs(res - (double)groudtruth) <= tol;
}
// Host driver for reduce_v2: identical setup to v0/v1; only the kernel differs.
int main(){
    float milliseconds = 0;
    const int N = 25600000;
    cudaSetDevice(0);
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, 0);
    const int blockSize = 256;
    // one thread per element, capped by the device's max grid dimension
    int GridSize = std::min((N + 256 - 1) / 256, deviceProp.maxGridSize[0]);
    // allocate host/device buffers and fill the input with ones
    float *a = (float *)malloc(N * sizeof(float));
    float *d_a;
    cudaMalloc((void **)&d_a, N * sizeof(float));
    float *out = (float*)malloc((GridSize) * sizeof(float));
    float *d_out;
    cudaMalloc((void **)&d_out, (GridSize) * sizeof(float));
    for(int i = 0; i < N; i++){
        a[i] = 1.0f;
    }
    float groudtruth = N * 1.0f;
    cudaMemcpy(d_a, a, N * sizeof(float), cudaMemcpyHostToDevice);
    dim3 Grid(GridSize);
    dim3 Block(blockSize);
    // time the kernel with CUDA events
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start);
    reduce_v2<blockSize><<<Grid,Block>>>(d_a, d_out);
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&milliseconds, start, stop);
    // copy per-block partial sums back and verify their total
    cudaMemcpy(out, d_out, GridSize * sizeof(float), cudaMemcpyDeviceToHost);
    printf("allcated %d blocks, data counts are %d", GridSize, N);
    bool is_right = CheckResult(out, groudtruth, GridSize);
    if(is_right) {
        printf("the ans is right\n");
    } else {
        printf("the ans is wrong\n");
        printf("groudtruth is: %f \n", groudtruth);
    }
    printf("reduce_v2 latency = %f ms\n", milliseconds);
    // fix: destroy timing events (the original leaked them), then free memory
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cudaFree(d_a);
    cudaFree(d_out);
    free(a);
    free(out);
}
这个 for 循环是 并行归约(Parallel Reduction) 算法的核心引擎。它控制着归约的轮次 和步长。
让我们逐部分拆解:
1. 代码拆解
cpp
for (unsigned int index = blockDim.x / 2; index > 0; index >>= 1)
-
初始化 :
unsigned int index = blockDim.x / 2blockDim.x是当前 Block 中的线程总数(例如 256, 512, 1024)。index代表当前的步长 (Stride),即两个需要相加的元素之间的距离。- 为什么从一半开始?
- 第一轮归约是将数组"对半折叠"。
- 如果有 256 个数据,我们需要让前 128 个线程分别去加后 128 个数据(
data[i] + data[i+128])。 - 所以初始步长必须是总线程数的一半。
-
条件 :
index > 0- 只要步长大于 0,就继续循环。
- 当步长缩减到 0 时,说明所有数据已经汇聚到一个点(通常是
sdata[0]),循环结束。
-
更新 :
index >>= 1- 这是右移运算符 ,等价于
index = index / 2。 - 每一轮结束后,步长减半。
- 序列: N/2 → N/4 → N/8 → ⋯ → 1 。
- 这是右移运算符 ,等价于
reduce3:让idle线程也干活
reduce2的最大问题就是线程的浪费。可以看到启动了256个线程,但是第一轮迭代时只有128个线程在干活,第二轮迭代的时候只有64个线程在干活,每轮干活的线程都减少一半。也就是说,从第一轮迭代起就只有前128个线程干活,后面128个线程啥也没干。
cpp
#include <bits/stdc++.h>
#include <cuda.h>
#include "cuda_runtime.h"
//v3: 让空闲线程也干活
//latency: 1.147ms
template<int blockSize>
__global__ void reduce_v3(float *d_in, float *d_out){
// v3: make the otherwise-idle half of the threads useful. Each block now
// covers 2*blockSize input elements: every thread folds two elements into
// shared memory during the load, so no thread is idle on the first round.
__shared__ float smem[blockSize];
// thread id within this block
unsigned int tid = threadIdx.x;
// global offset of this thread's first element; the block spans
// [blockIdx.x * 2 * blockSize, ...), and thread tid handles positions
// gtid and gtid + blockSize.
// e.g. blockSize = 2, blockIdx.x = 1: tid 0 -> elements 4 and 6, tid 1 -> 5 and 7
// e.g. blockSize = 2, blockIdx.x = 0: tid 0 -> elements 0 and 2, tid 1 -> 1 and 3
unsigned int gtid = blockIdx.x * (blockSize * 2) + threadIdx.x;
// load: each thread adds its two elements into one shared-memory slot
smem[tid] = d_in[gtid] + d_in[gtid + blockSize];
__syncthreads();
// same sequential-addressing fold as v2 (bank-conflict free);
// the block's sum ends up in smem[0]
for (unsigned int index = blockDim.x / 2; index > 0; index >>= 1) {
if (tid < index) {
smem[tid] += smem[tid + index];
}
__syncthreads();
}
// store: thread 0 writes this block's partial sum back to global memory
if (tid == 0) {
d_out[blockIdx.x] = smem[0];
}
}
// Verifies a reduce result: sums the n per-block partial results in `out`
// and compares against `groudtruth`.
// Fix: use a relative+absolute tolerance instead of exact float `!=` — GPU
// accumulation order need not match the reference bit-for-bit. Accumulate in
// double to limit host-side rounding.
bool CheckResult(float *out, float groudtruth, int n){
    double res = 0.0;
    for (int i = 0; i < n; i++){
        res += out[i];
    }
    double tol = 1e-6 * fabs((double)groudtruth) + 1e-6;
    return fabs(res - (double)groudtruth) <= tol;
}
// Host driver for reduce_v3. Note the launch uses half-sized blocks
// (blockSize / 2 = 128 threads): each thread loads two elements, so a block
// still covers 256 input elements and GridSize stays N / 256.
int main(){
    float milliseconds = 0;
    const int N = 25600000;
    cudaSetDevice(0);
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, 0);
    const int blockSize = 256;
    // each block covers 256 elements (128 threads x 2 elements per thread)
    int GridSize = std::min((N + 256 - 1) / 256, deviceProp.maxGridSize[0]);
    // allocate host/device buffers and fill the input with ones
    float *a = (float *)malloc(N * sizeof(float));
    float *d_a;
    cudaMalloc((void **)&d_a, N * sizeof(float));
    float *out = (float*)malloc((GridSize) * sizeof(float));
    float *d_out;
    cudaMalloc((void **)&d_out, (GridSize) * sizeof(float));
    for(int i = 0; i < N; i++){
        a[i] = 1.0f;
    }
    float groudtruth = N * 1.0f;
    cudaMemcpy(d_a, a, N * sizeof(float), cudaMemcpyHostToDevice);
    dim3 Grid(GridSize);
    dim3 Block(blockSize / 2);
    // time the kernel with CUDA events
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start);
    reduce_v3<blockSize / 2><<<Grid,Block>>>(d_a, d_out);
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&milliseconds, start, stop);
    // copy per-block partial sums back and verify their total
    cudaMemcpy(out, d_out, GridSize * sizeof(float), cudaMemcpyDeviceToHost);
    printf("allcated %d blocks, data counts are %d", GridSize, N);
    bool is_right = CheckResult(out, groudtruth, GridSize);
    if(is_right) {
        printf("the ans is right\n");
    } else {
        printf("the ans is wrong\n");
        printf("groudtruth is: %f \n", groudtruth);
    }
    printf("reduce_v3 latency = %f ms\n", milliseconds);
    // fix: destroy timing events (the original leaked them), then free memory
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cudaFree(d_a);
    cudaFree(d_out);
    free(a);
    free(out);
}
reduce4:展开for循环最后一个warp
最后一轮迭代时,block中只有warp0还在干活,此时无需syncthreads。这条语句造成很大的开销,因为同步比较耗时。
cpp
#include <bits/stdc++.h>
#include <cuda.h>
#include "cuda_runtime.h"
//v4: 最后一个warp不用参与__syncthreads
//latency: 0.694ms
__device__ void WarpSharedMemReduce(volatile float* smem, int tid){
// Final-warp reduce (tid < 32) without __syncthreads.
// CUDA does not guarantee that all shared-memory reads complete before other
// lanes' writes, so e.g. smem[0] += smem[16] races with smem[16] += smem[32]:
// whether smem[16] is read before or after its update is undefined. Since
// Volta's independent thread scheduling, warp lanes are not implicitly
// lockstep, so each step stages the read into a register `x`, then
// __syncwarp() separates the read from the write-back; `volatile` prevents
// the compiler from caching stale smem values in registers.
float x = smem[tid];
if (blockDim.x >= 64) {
x += smem[tid + 32]; __syncwarp();
smem[tid] = x; __syncwarp();
}
// NOTE(review): the steps below read up to smem[tid + 16] unguarded, so the
// shared array must have at least 48 valid slots; with a 32-thread block this
// would read out of bounds — confirm callers always use blockDim.x >= 64.
x += smem[tid + 16]; __syncwarp();
smem[tid] = x; __syncwarp();
x += smem[tid + 8]; __syncwarp();
smem[tid] = x; __syncwarp();
x += smem[tid + 4]; __syncwarp();
smem[tid] = x; __syncwarp();
x += smem[tid + 2]; __syncwarp();
smem[tid] = x; __syncwarp();
x += smem[tid + 1]; __syncwarp();
smem[tid] = x; __syncwarp();
}
// Note: using blockSize as a template arg can benefit from NVCC compiler optimization,
// which is better than using blockDim.x that is known in runtime.
// Note: using blockSize as a template arg can benefit from NVCC compiler optimization,
// which is better than using blockDim.x that is known in runtime.
template<int blockSize>
__global__ void reduce_v4(float *d_in,float *d_out){
// v4: like v3 (each thread loads two elements), but the last 32-element
// stage is peeled out of the loop into WarpSharedMemReduce, saving the
// __syncthreads barriers once only a single warp is active.
__shared__ float smem[blockSize];
// thread id within this block
int tid = threadIdx.x;
// global offset of this thread's first element; the block spans
// 2*blockSize inputs, and thread tid handles positions i and i + blockSize
// e.g. blockSize = 2, blockIdx.x = 1: tid 0 -> elements 4 and 6, tid 1 -> 5 and 7
// e.g. blockSize = 2, blockIdx.x = 0: tid 0 -> elements 0 and 2, tid 1 -> 1 and 3
int i = blockIdx.x * (blockSize * 2) + threadIdx.x;
// load: each thread adds its two elements into one shared-memory slot
smem[tid] = d_in[i] + d_in[i + blockSize];
__syncthreads();
// sequential-addressing fold, stopping once 64 elements remain (s == 32
// is handled by the warp-level routine below)
for (int s = blockDim.x / 2; s > 32; s >>= 1) {
if (tid < s) {
smem[tid] += smem[tid + s];
}
__syncthreads();
}
// final warp reduces the remaining elements without block-wide barriers
if (tid < 32) {
WarpSharedMemReduce(smem, tid);
}
// store: thread 0 writes this block's partial sum back to global memory
if (tid == 0) {
d_out[blockIdx.x] = smem[0];
}
}
// Verifies a reduce result: sums the n per-block partial results in `out`
// and compares against `groudtruth`.
// Fix: use a relative+absolute tolerance instead of exact float `!=` — GPU
// accumulation order need not match the reference bit-for-bit. Accumulate in
// double to limit host-side rounding.
bool CheckResult(float *out, float groudtruth, int n){
    double res = 0.0;
    for (int i = 0; i < n; i++){
        res += out[i];
    }
    double tol = 1e-6 * fabs((double)groudtruth) + 1e-6;
    return fabs(res - (double)groudtruth) <= tol;
}
// Host driver for reduce_v4: half-sized blocks (128 threads), two elements
// per thread, so each block still covers 256 inputs.
int main(){
    float milliseconds = 0;
    const int N = 25600000;
    cudaSetDevice(0);
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, 0);
    const int blockSize = 256;
    // each block covers 256 elements (128 threads x 2 elements per thread)
    int GridSize = std::min((N + 256 - 1) / 256, deviceProp.maxGridSize[0]);
    // allocate host/device buffers and fill the input with ones
    float *a = (float *)malloc(N * sizeof(float));
    float *d_a;
    cudaMalloc((void **)&d_a, N * sizeof(float));
    float *out = (float*)malloc((GridSize) * sizeof(float));
    float *d_out;
    cudaMalloc((void **)&d_out, (GridSize) * sizeof(float));
    for(int i = 0; i < N; i++){
        a[i] = 1.0f;
    }
    float groudtruth = N * 1.0f;
    cudaMemcpy(d_a, a, N * sizeof(float), cudaMemcpyHostToDevice);
    dim3 Grid(GridSize);
    dim3 Block(blockSize / 2);
    // time the kernel with CUDA events
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start);
    reduce_v4<blockSize / 2><<<Grid,Block>>>(d_a, d_out);
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&milliseconds, start, stop);
    // copy per-block partial sums back and verify their total
    cudaMemcpy(out, d_out, GridSize * sizeof(float), cudaMemcpyDeviceToHost);
    printf("allcated %d blocks, data counts are %d \n", GridSize, N);
    bool is_right = CheckResult(out, groudtruth, GridSize);
    if(is_right) {
        printf("the ans is right\n");
    } else {
        printf("the ans is wrong\n");
        for(int i = 0; i < GridSize;i++){
            printf("resPerBlock : %lf ",out[i]);
        }
        printf("\n");
        printf("groudtruth is: %f \n", groudtruth);
    }
    printf("reduce_v4 latency = %f ms\n", milliseconds);
    // fix: destroy timing events (the original leaked them), then free memory
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cudaFree(d_a);
    cudaFree(d_out);
    free(a);
    free(out);
}
。。
reduce5:完全展开for循环省掉for循环中的判断和加法操作
cpp
#include <bits/stdc++.h>
#include <cuda.h>
#include "cuda_runtime.h"
#define THREAD_PER_BLOCK 256
// latency: 0.656ms
// v5:循环展开
template <int blockSize>
__device__ void BlockSharedMemReduce(float* smem) {
// Fully unrolled block-level tree reduce of smem[0..blockDim.x-1] into
// smem[0]. Unrolling v4's loop removes the loop-control instructions and
// gives the compiler more freedom to reorder; the `blockSize >= X` guards
// are compile-time constants, so dead stages are eliminated entirely.
if (blockSize >= 1024) {
if (threadIdx.x < 512) {
smem[threadIdx.x] += smem[threadIdx.x + 512];
}
__syncthreads();
}
if (blockSize >= 512) {
if (threadIdx.x < 256) {
smem[threadIdx.x] += smem[threadIdx.x + 256];
}
__syncthreads();
}
if (blockSize >= 256) {
if (threadIdx.x < 128) {
smem[threadIdx.x] += smem[threadIdx.x + 128];
}
__syncthreads();
}
if (blockSize >= 128) {
if (threadIdx.x < 64) {
smem[threadIdx.x] += smem[threadIdx.x + 64];
}
__syncthreads();
}
// the final warp: no __syncthreads; `volatile` forces each access to go to
// shared memory instead of a cached register value.
// NOTE(review): unlike v4's WarpSharedMemReduce this has no __syncwarp
// between dependent steps, relying on pre-Volta implicit warp synchrony —
// verify on Volta+ before reuse.
if (threadIdx.x < 32) {
volatile float* vshm = smem;
if (blockDim.x >= 64) {
vshm[threadIdx.x] += vshm[threadIdx.x + 32];
}
vshm[threadIdx.x] += vshm[threadIdx.x + 16];
vshm[threadIdx.x] += vshm[threadIdx.x + 8];
vshm[threadIdx.x] += vshm[threadIdx.x + 4];
vshm[threadIdx.x] += vshm[threadIdx.x + 2];
vshm[threadIdx.x] += vshm[threadIdx.x + 1];
}
}
// v5: fully unrolled block reduce (see BlockSharedMemReduce). Each block
// loads 2*blockDim.x elements (one add folded into the load) and reduces
// them in shared memory; the per-block sum goes to d_out[blockIdx.x].
// blockSize must equal the blockDim.x of the launch.
template <int blockSize>
__global__ void reduce_v5(float *d_in, float *d_out){
    // Fix: size the shared buffer by the template parameter instead of the
    // file-wide THREAD_PER_BLOCK macro. The kernel is launched with blockSize
    // threads (128 in main), so the old smem[THREAD_PER_BLOCK] over-allocated
    // shared memory and silently decoupled the buffer from the launch config.
    __shared__ float smem[blockSize];
    // thread id within this block
    unsigned int tid = threadIdx.x;
    // each block covers 2*blockDim.x inputs; thread tid handles positions
    // i and i + blockDim.x
    unsigned int i = blockIdx.x * (blockDim.x * 2) + threadIdx.x;
    // load: each thread folds its two elements into one shared-memory slot
    smem[tid] = d_in[i] + d_in[i + blockDim.x];
    __syncthreads();
    // compute: unrolled tree reduction in shared memory
    BlockSharedMemReduce<blockSize>(smem);
    // store: thread 0 writes this block's partial sum back to global memory
    if (tid == 0) {
        d_out[blockIdx.x] = smem[0];
    }
}
// Verifies a reduce result: sums the n per-block partial results in `out`
// and compares against `groudtruth`.
// Fix: use a relative+absolute tolerance instead of exact float `!=` — GPU
// accumulation order need not match the reference bit-for-bit. Accumulate in
// double to limit host-side rounding.
bool CheckResult(float *out, float groudtruth, int n){
    double res = 0.0;
    for (int i = 0; i < n; i++){
        res += out[i];
    }
    double tol = 1e-6 * fabs((double)groudtruth) + 1e-6;
    return fabs(res - (double)groudtruth) <= tol;
}
// Host driver for reduce_v5: half-sized blocks (128 threads), two elements
// per thread, so each block still covers 256 inputs.
int main(){
    float milliseconds = 0;
    const int N = 25600000;
    cudaSetDevice(0);
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, 0);
    const int blockSize = 256;
    // each block covers 256 elements (128 threads x 2 elements per thread)
    int GridSize = std::min((N + 256 - 1) / 256, deviceProp.maxGridSize[0]);
    // allocate host/device buffers and fill the input with ones
    float *a = (float *)malloc(N * sizeof(float));
    float *d_a;
    cudaMalloc((void **)&d_a, N * sizeof(float));
    float *out = (float*)malloc((GridSize) * sizeof(float));
    float *d_out;
    cudaMalloc((void **)&d_out, (GridSize) * sizeof(float));
    for(int i = 0; i < N; i++){
        a[i] = 1.0f;
    }
    float groudtruth = N * 1.0f;
    cudaMemcpy(d_a, a, N * sizeof(float), cudaMemcpyHostToDevice);
    dim3 Grid(GridSize);
    dim3 Block(blockSize / 2);
    // time the kernel with CUDA events
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start);
    reduce_v5<blockSize / 2><<<Grid,Block>>>(d_a, d_out);
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&milliseconds, start, stop);
    // copy per-block partial sums back and verify their total
    cudaMemcpy(out, d_out, GridSize * sizeof(float), cudaMemcpyDeviceToHost);
    printf("allcated %d blocks, data counts are %d \n", GridSize, N);
    bool is_right = CheckResult(out, groudtruth, GridSize);
    if(is_right) {
        printf("the ans is right\n");
    } else {
        printf("the ans is wrong\n");
        for(int i = 0; i < GridSize;i++){
            printf("resPerBlock : %lf ",out[i]);
        }
        printf("\n");
        printf("groudtruth is: %f \n", groudtruth);
    }
    printf("reduce_v5 latency = %f ms\n", milliseconds);
    // fix: destroy timing events (the original leaked them), then free memory
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cudaFree(d_a);
    cudaFree(d_out);
    free(a);
    free(out);
}
reduce6:一个block/thread处理多个元素,基于gridsize step的loop舍弃掉之前的reduce3方案中的让一个线程干了两份活
好处:
(1)更灵活:可以handle大于你启动的线程数量的problem size
(2) 复用线程:减少线程创建和开销
(3)方便debug:设置block数量和thread数量为1时,此时为串行程序,便于debug
cpp
#include <bits/stdc++.h>
#include <cuda.h>
#include "cuda_runtime.h"
// 注意: v0-v5里面kernel得到的是各个block负责范围内的总和,要想得到最终的和,需要把各个block求得的总和再做reduce sum
// v6: multi-block reduce final result by two pass
// latency: 1.815ms
template <int blockSize>
__device__ void BlockSharedMemReduce(float* smem) {
// Fully unrolled block-level tree reduce of smem[0..blockDim.x-1] into
// smem[0]; identical to the v5 helper. The `blockSize >= X` guards are
// compile-time constants, so dead stages are eliminated by the compiler.
if (blockSize >= 1024) {
if (threadIdx.x < 512) {
smem[threadIdx.x] += smem[threadIdx.x + 512];
}
__syncthreads();
}
if (blockSize >= 512) {
if (threadIdx.x < 256) {
smem[threadIdx.x] += smem[threadIdx.x + 256];
}
__syncthreads();
}
if (blockSize >= 256) {
if (threadIdx.x < 128) {
smem[threadIdx.x] += smem[threadIdx.x + 128];
}
__syncthreads();
}
if (blockSize >= 128) {
if (threadIdx.x < 64) {
smem[threadIdx.x] += smem[threadIdx.x + 64];
}
__syncthreads();
}
// the final warp: no __syncthreads; `volatile` forces every access to hit
// shared memory instead of a cached register value.
// NOTE(review): no __syncwarp between dependent steps — relies on pre-Volta
// implicit warp synchrony; verify on Volta+ before reuse.
if (threadIdx.x < 32) {
volatile float* vshm = smem;
if (blockDim.x >= 64) {
vshm[threadIdx.x] += vshm[threadIdx.x + 32];
}
vshm[threadIdx.x] += vshm[threadIdx.x + 16];
vshm[threadIdx.x] += vshm[threadIdx.x + 8];
vshm[threadIdx.x] += vshm[threadIdx.x + 4];
vshm[threadIdx.x] += vshm[threadIdx.x + 2]; vshm[threadIdx.x] += vshm[threadIdx.x + 1];
}
}
template <int blockSize>
__global__ void reduce_v6(float *d_in, float *d_out, int nums){
// v6: grid-stride loop version. Each thread first accumulates every element
// at stride total_thread_num into a private register sum, so one launch
// handles any problem size (including nums > total threads, or a 1x1 debug
// launch); the per-thread sums are then tree-reduced in shared memory.
__shared__ float smem[blockSize];
// thread id within this block
unsigned int tid = threadIdx.x;
// global thread id across all blocks
unsigned int gtid = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int total_thread_num = blockDim.x * gridDim.x;
// grid-stride accumulation: no hard-coded elements-per-thread count
float sum = 0.0f;
for (int32_t i = gtid; i < nums; i += total_thread_num) {
sum += d_in[i];
}
smem[tid] = sum;
__syncthreads();
// compute: unrolled tree reduction in shared memory
BlockSharedMemReduce<blockSize>(smem);
// store: thread 0 writes this block's partial sum to d_out[blockIdx.x]
if (tid == 0) {
d_out[blockIdx.x] = smem[0];
}
}
// Verifies the single final reduce value in `out` against `groudtruth`.
// `n` is unused: the second-pass kernel produces exactly one value.
// Fix: compare with a relative+absolute tolerance instead of exact float
// `!=` — GPU accumulation order need not match the reference bit-for-bit.
bool CheckResult(float *out, float groudtruth, int n){
    double tol = 1e-6 * fabs((double)groudtruth) + 1e-6;
    return fabs((double)*out - (double)groudtruth) <= tol;
}
// Host driver for reduce_v6 (two-pass): the first launch reduces the input
// into gridSize per-block partial sums, the second launch (one block)
// reduces those partials into the single final value.
int main(){
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, 0);
    int maxblocks = deviceProp.maxGridSize[0];
    const int blockSize = 256;
    const int N = 25600000;
    int gridSize = std::min((N + blockSize - 1) / blockSize, maxblocks);
    float milliseconds = 0;
    // allocate host/device buffers and fill the input with ones
    float *a = (float *)malloc(N * sizeof(float));
    float *d_a;
    cudaMalloc((void **)&d_a,N * sizeof(float));
    float *out = (float*)malloc((gridSize) * sizeof(float));
    float *d_out;
    // part_out holds the per-block partial sums from the first pass
    float *part_out;
    cudaMalloc((void **)&d_out, 1 * sizeof(float));
    cudaMalloc((void **)&part_out, (gridSize) * sizeof(float));
    float groudtruth = N;
    for(int i = 0; i < N; i++){
        a[i] = 1;
    }
    cudaMemcpy(d_a, a, N * sizeof(float), cudaMemcpyHostToDevice);
    dim3 Grid(gridSize);
    dim3 Block(blockSize);
    // time both passes together with CUDA events
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start);
    // pass 1: N elements -> gridSize partial sums
    reduce_v6<blockSize><<<Grid, Block>>>(d_a, part_out, N);
    // pass 2: gridSize partial sums -> one final sum
    reduce_v6<blockSize><<<1, Block>>>(part_out, d_out, gridSize);
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&milliseconds, start, stop);
    // copy the single final value back and verify it
    cudaMemcpy(out, d_out, 1 * sizeof(float), cudaMemcpyDeviceToHost);
    bool is_right = CheckResult(out, groudtruth, 1);
    if(is_right) {
        printf("the ans is right\n");
    } else {
        printf("the ans is wrong\n");
        for(int i = 0;i < 1;i++){
            printf("%lf ",out[i]);
        }
        printf("\n");
    }
    printf("reduce_v6 latency = %f ms\n", milliseconds);
    // fix: destroy timing events (the original leaked them), then free memory
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cudaFree(d_a);
    cudaFree(d_out);
    cudaFree(part_out);
    free(a);
    free(out);
}
。。
。。
。。
。
。。
。