cuda块合并

main.cpp

cpp 复制代码
#include "sum.h"
#include "stdlib.h"
#include <chrono>

// Fills array[0 .. l) with the repeating pattern 0, 1, ..., 9:
// element i receives the value i % 10. Does nothing when l <= 0.
void init_array(float* array,int l)
{
    int pos = 0;
    while (pos < l)
    {
        array[pos] = static_cast<float>(pos % 10);
        ++pos;
    }
}

// Benchmark driver: runs the three GPU reductions and a plain CPU loop over the
// same data and prints the wall-clock time and the resulting sum of each.
// Usage: <program> <threads_per_block>
int main(int argc,char **argv)
{
    if (argc < 2)
    {
        fprintf(stderr, "usage: %s <threads_per_block>\n", argc > 0 ? argv[0] : "cuda_sum");
        return 1;
    }

    // Reject non-numeric, non-positive or oversized values up front: a block
    // size of 0 would divide by zero when the wrappers compute the grid size,
    // and CUDA does not accept more than 1024 threads per block.
    char* endptr = NULL;
    long requested = strtol(argv[1], &endptr, 10);
    if (endptr == argv[1] || *endptr != '\0' || requested <= 0 || requested > 1024)
    {
        fprintf(stderr, "invalid block size '%s' (expected an integer in 1..1024)\n", argv[1]);
        return 1;
    }
    int block = (int)requested;

    // The array is too large for the stack, so it lives on the heap.
    // It holds ARRAY_L + 500 elements because the ArraySum* wrappers copy
    // num + 500 floats to the device.
    float *a = (float*)malloc((ARRAY_L+500) * sizeof(float));
    if (a == NULL)
    {
        fprintf(stderr, "failed to allocate the host array\n");
        return 1;
    }
    float b[1] = {0.0f};
    init_array(a,ARRAY_L+500);

    auto start = std::chrono::steady_clock::now();
    ArraySum( a, b, ARRAY_L, block);
    auto end = std::chrono::steady_clock::now();
    long long us = std::chrono::duration_cast<std::chrono::microseconds>(end - start).count();
    // %lld matches long long; the previous %ld was a format/argument mismatch.
    printf("gpu1 cost:%lld us;sum=%f\n", us, b[0]);

    start = std::chrono::steady_clock::now();
    ArraySumUnfold( a, b, ARRAY_L, block);
    end = std::chrono::steady_clock::now();
    us = std::chrono::duration_cast<std::chrono::microseconds>(end - start).count();
    printf("gpu2 cost:%lld us;sum=%f\n", us, b[0]);

    start = std::chrono::steady_clock::now();
    ArraySumUnfold3( a, b, ARRAY_L, block);
    end = std::chrono::steady_clock::now();
    us = std::chrono::duration_cast<std::chrono::microseconds>(end - start).count();
    printf("gpu3 cost:%lld us;sum=%f\n", us, b[0]);

    printf("sum=%f\n",b[0]);

    // CPU reference: straightforward sequential accumulation.
    start = std::chrono::steady_clock::now();
    float sum=0;
    for(int i=0;i<ARRAY_L;i++)
    {
        sum += a[i];
    }
    end = std::chrono::steady_clock::now();
    us = std::chrono::duration_cast<std::chrono::microseconds>(end - start).count();
    printf("cpu cost:%lld us;sum=%f\n", us, sum);

    free(a);
    return 0;
}

sum.cu

cpp 复制代码
#include <cuda_runtime.h>
#include <stdlib.h>
#include "sum.h"

/*
 * In-place block-wise sum reduction, one input element per thread.
 *
 * Each block treats its blockDim.x-element slice of `input` as a small array,
 * folds it down to a single value and stores that partial sum in
 * output[blockIdx.x]. `input` is overwritten in the process. The host adds up
 * the partial sums afterwards.
 *
 * Parameters:
 *   input        device array holding at least `num` elements (modified)
 *   output       one partial sum per block (gridDim.x elements)
 *   num          number of valid elements in `input`
 *   block_times  cycles measured with clock64() by thread 0 of each block
 *
 * Tail handling, e.g. blockDim.x=256, num=10000, gridDim.x=40:
 *   - the last warp of the last block has tid 224..255, i.e. idx 10208..10239,
 *     all past the end of the data;
 *   - warp 0 of the last block has idx 9984..10015, only partly past the end.
 *     Its slice starts at input[9984]; in the first step thread 0 would pair
 *     with input[9984+128], which is out of range, so every access is guarded
 *     by a check of the partner's global index against `num`.
 *
 * Out-of-range threads must NOT return early: __syncthreads() has to be
 * reached by every thread of the block, so they stay in the loop and simply
 * skip the addition.
 */
__global__ void sumKernel(float* input,float* output,int num,unsigned long long* block_times)
{
    unsigned long long start = clock64();
    int tid = threadIdx.x;
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    // Start of this block's slice; only dereferenced behind a bounds check.
    float* idata = input + blockIdx.x * blockDim.x;

    // `active` is the number of live elements at the front of the slice.
    // Each step adds the upper part [half, active) onto the lower part, where
    // half = ceil(active / 2). For power-of-two sizes this is the usual
    // stride-halving scheme; for other sizes the middle element is carried
    // over untouched instead of being dropped.
    for(int active = blockDim.x; active > 1; )
    {
        int half = (active + 1) / 2;
        if(tid < active / 2 && (idx + half < num))
        {
            idata[tid]+=idata[tid + half];
        }
        __syncthreads(); // the slice was updated; all threads must see it before the next step
        active = half;
    }

    if(tid==0)
    {
        // Partial sum of this slice; a block that starts past the end contributes 0.
        output[blockIdx.x] = (idx < num) ? idata[0] : 0.0f;
        // Single writer, so the stored time is well defined.
        block_times[blockIdx.x] = clock64() - start;
    }
}

/*
 * In-place sum reduction in which every block handles 2 * blockDim.x input
 * elements: the second chunk is first added element-wise onto the first
 * chunk, then the first chunk is reduced to a single value.
 *
 * Parameters:
 *   input        device array holding at least `num` elements (modified)
 *   output       one partial sum per block (gridDim.x elements)
 *   num          number of valid elements in `input`
 *   block_times  cycles measured with clock64() by thread 0 of each block
 *
 * Every thread reaches every __syncthreads(); out-of-range threads only skip
 * the additions.
 */
__global__ void sumKernelUnfold(float* input,float* output,int num,unsigned long long* block_times)
{
    unsigned long long start = clock64();
    int tid = threadIdx.x;
    int blockSize = blockDim.x;
    int idx = blockIdx.x * blockDim.x * 2 + threadIdx.x; // two original blocks merged into one
    // Start of this block's slice; only dereferenced behind a bounds check.
    float* idata = input + blockIdx.x * blockDim.x * 2;

    // Merge the second chunk into the first. Signed arithmetic keeps the
    // comparison with `num` free of unsigned conversion surprises.
    if(idx + blockSize < num) input[idx]+=input[idx + blockSize];
    __syncthreads();

    // The first chunk now holds the merged values; reduce it. `active` is the
    // number of live elements, half = ceil(active / 2). This equals plain
    // stride halving for power-of-two block sizes and keeps the middle element
    // for other sizes.
    for(int active = blockSize; active > 1; )
    {
        int half = (active + 1) / 2;
        if(tid < active / 2 && (idx + half < num))
        {
            idata[tid]+=idata[tid + half];
        }
        __syncthreads(); // the slice was updated; synchronise before the next step
        active = half;
    }

    if(tid==0)
    {
        // Partial sum of this slice; a block that starts past the end contributes 0.
        output[blockIdx.x] = (idx < num) ? idata[0] : 0.0f;
        // Single writer, so the stored time is well defined.
        block_times[blockIdx.x] = clock64() - start;
    }
}

/*
 * In-place sum reduction in which every block handles 3 * blockDim.x input
 * elements: the second and third chunks are added element-wise onto the first
 * chunk, then the first chunk is reduced to a single value.
 *
 * Parameters:
 *   input        device array holding at least `num` elements (modified)
 *   output       one partial sum per block (gridDim.x elements)
 *   num          number of valid elements in `input`
 *   block_times  cycles measured with clock64() by thread 0 of each block
 */
__global__ void sumKernelUnfold3(float* input, float* output, int num, unsigned long long* block_times) {
    unsigned long long start = clock64();
    int tid = threadIdx.x;
    int blockSize = blockDim.x;
    int total_per_block = blockSize * 3;          // each block covers three chunks
    int idx = blockIdx.x * total_per_block + tid; // global index of this thread's first-chunk element

    // Start of this block's slice; only dereferenced behind a bounds check.
    float* idata = input + blockIdx.x * total_per_block;

    // ---- Step 1: fold chunks two and three into chunk one ----
    // Each thread only updates its own input[idx] and only reads elements that
    // no thread writes during this step, so no barrier is needed between the
    // two additions; one barrier afterwards is enough.
    if (idx + blockSize < num) {
        input[idx] += input[idx + blockSize];
    }
    if (idx + 2 * blockSize < num) {
        input[idx] += input[idx + 2 * blockSize];
    }
    __syncthreads();

    // ---- Step 2: reduce the merged first chunk ----
    // `active` is the number of live elements, half = ceil(active / 2). This
    // equals plain stride halving for power-of-two block sizes and keeps the
    // middle element for other sizes. The partner's global index is checked
    // against `num` because the last block may be only partly filled.
    for (int active = blockSize; active > 1; ) {
        int half = (active + 1) / 2;
        if (tid < active / 2 && idx + half < num) {
            idata[tid] += idata[tid + half];
        }
        __syncthreads();
        active = half;
    }

    if (tid == 0) {
        // Partial sum of this slice; a block that starts past the end contributes 0.
        output[blockIdx.x] = (idx < num) ? idata[0] : 0.0f;
        block_times[blockIdx.x] = clock64() - start;
    }
}

/*
 * Sums `num` floats from h_a on the GPU with sumKernel (one element per
 * thread) and stores the total in *h_b. Also prints the launch configuration
 * and the average per-block cycle count.
 *
 * h_a  host array; num + 500 floats are copied from it, so the buffer must be
 *      at least that long
 * h_b  receives the sum (0 when num <= 0)
 * num  number of elements to add
 * w    threads per block; must be positive
 *
 * Any CUDA failure terminates the process through CUDA_CHECK.
 */
void ArraySum(const float* h_a,float* h_b,int num,int w)
{
    if (num <= 0)
    {
        *h_b = 0.0f; // nothing to add; also avoids launching an empty grid
        return;
    }
    if (w <= 0)
    {
        // A zero block size would divide by zero in the grid computation.
        fprintf(stderr, "ArraySum: block size must be positive, got %d\n", w);
        exit(EXIT_FAILURE);
    }

    float *d_a,*d_b;
    unsigned long long *block_times;
    // 500 extra elements are allocated and copied so that out-of-range reads
    // in the kernel show up as a wrong sum instead of a fault.
    const size_t padded_bytes = ((size_t)num + 500) * sizeof(float);
    CUDA_CHECK(cudaMalloc((void**)&d_a, padded_bytes));

    CUDA_CHECK(cudaMemcpy(d_a, h_a, padded_bytes, cudaMemcpyHostToDevice));

    dim3 block(w);
    // ceil(num / w), derived from the num argument rather than the global ARRAY_L.
    dim3 grid((unsigned int)(((size_t)num + block.x - 1) / block.x));
    const size_t grid_size = grid.x;
    CUDA_CHECK(cudaMalloc((void**)&d_b, grid_size * sizeof(float)));
    CUDA_CHECK(cudaMalloc((void**)&block_times, grid_size * sizeof(unsigned long long)));

    sumKernel<<<grid,block>>>(d_a,d_b,num,block_times);
    // Check launch and execution errors before any result is read back.
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());
    printf("block(%d);grid(%d)\n", (int)block.x, (int)grid.x);

    unsigned long long* h_block_times = (unsigned long long*)malloc(grid_size * sizeof(unsigned long long));
    float* h_partial = (float*)malloc(grid_size * sizeof(float));
    if (h_block_times == NULL || h_partial == NULL)
    {
        fprintf(stderr, "ArraySum: host allocation failed\n");
        exit(EXIT_FAILURE);
    }

    CUDA_CHECK(cudaMemcpy(h_block_times, block_times, grid_size * sizeof(unsigned long long), cudaMemcpyDeviceToHost));
    // Accumulate in double: float loses precision once the running total of
    // cycle counts grows large.
    double av_time = 0.0;
    for (size_t i = 0; i < grid_size; i++)
    {
        av_time += (double)h_block_times[i];
    }
    av_time /= (double)grid_size;
    printf("block average time:%f cycles\n",av_time);

    CUDA_CHECK(cudaMemcpy(h_partial, d_b, grid_size * sizeof(float), cudaMemcpyDeviceToHost));
    // Final accumulation of the per-block partial sums on the CPU.
    float sum = 0.0f;
    for (size_t i = 0; i < grid_size; i++) {
        sum += h_partial[i];
    }
    *h_b = sum;

    CUDA_CHECK(cudaFree(d_a));
    CUDA_CHECK(cudaFree(d_b));
    CUDA_CHECK(cudaFree(block_times));
    free(h_partial);
    free(h_block_times);
}

/*
 * Sums `num` floats from h_a on the GPU with sumKernelUnfold (each block
 * covers 2 * w elements) and stores the total in *h_b. Also prints the launch
 * configuration and the average per-block cycle count.
 *
 * h_a  host array; num + 500 floats are copied from it, so the buffer must be
 *      at least that long
 * h_b  receives the sum (0 when num <= 0)
 * num  number of elements to add
 * w    threads per block; must be positive
 *
 * Any CUDA failure terminates the process through CUDA_CHECK.
 */
void ArraySumUnfold(const float* h_a,float* h_b,int num,int w)
{
    if (num <= 0)
    {
        *h_b = 0.0f; // nothing to add; also avoids launching an empty grid
        return;
    }
    if (w <= 0)
    {
        // A zero block size would divide by zero in the grid computation.
        fprintf(stderr, "ArraySumUnfold: block size must be positive, got %d\n", w);
        exit(EXIT_FAILURE);
    }

    float *d_a,*d_b;
    unsigned long long *block_times;
    // 500 extra elements are allocated and copied so that out-of-range reads
    // in the kernel show up as a wrong sum instead of a fault.
    const size_t padded_bytes = ((size_t)num + 500) * sizeof(float);
    CUDA_CHECK(cudaMalloc((void**)&d_a, padded_bytes));

    CUDA_CHECK(cudaMemcpy(d_a, h_a, padded_bytes, cudaMemcpyHostToDevice));

    dim3 block(w);
    // Each block consumes 2 * w elements. Round up on the element count itself
    // so an odd-sized tail is still covered, and use num rather than ARRAY_L.
    const size_t per_block = 2 * (size_t)block.x;
    dim3 grid((unsigned int)(((size_t)num + per_block - 1) / per_block));
    const size_t grid_size = grid.x;
    CUDA_CHECK(cudaMalloc((void**)&d_b, grid_size * sizeof(float)));
    CUDA_CHECK(cudaMalloc((void**)&block_times, grid_size * sizeof(unsigned long long)));

    sumKernelUnfold<<<grid,block>>>(d_a,d_b,num,block_times);
    // Check launch and execution errors before any result is read back.
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());

    unsigned long long* h_block_times = (unsigned long long*)malloc(grid_size * sizeof(unsigned long long));
    float* h_partial = (float*)malloc(grid_size * sizeof(float));
    if (h_block_times == NULL || h_partial == NULL)
    {
        fprintf(stderr, "ArraySumUnfold: host allocation failed\n");
        exit(EXIT_FAILURE);
    }

    CUDA_CHECK(cudaMemcpy(h_block_times, block_times, grid_size * sizeof(unsigned long long), cudaMemcpyDeviceToHost));
    printf("block(%d);grid(%d)\n", (int)block.x, (int)grid.x);
    // Accumulate in double: float loses precision once the running total of
    // cycle counts grows large.
    double av_time = 0.0;
    for (size_t i = 0; i < grid_size; i++)
    {
        av_time += (double)h_block_times[i];
    }
    av_time /= (double)grid_size;
    printf("unfold block average time:%f cycles\n",av_time);

    CUDA_CHECK(cudaMemcpy(h_partial, d_b, grid_size * sizeof(float), cudaMemcpyDeviceToHost));
    // Final accumulation of the per-block partial sums on the CPU.
    float sum = 0.0f;
    for (size_t i = 0; i < grid_size; i++) {
        sum += h_partial[i];
    }
    *h_b = sum;

    CUDA_CHECK(cudaFree(d_a));
    CUDA_CHECK(cudaFree(d_b));
    CUDA_CHECK(cudaFree(block_times));
    free(h_partial);
    free(h_block_times);
}

/*
 * Sums `num` floats from h_a on the GPU with sumKernelUnfold3 (each block
 * covers 3 * w elements) and stores the total in *h_b. Also prints the launch
 * configuration and the average per-block cycle count.
 *
 * h_a  host array; num + 500 floats are copied from it, so the buffer must be
 *      at least that long
 * h_b  receives the sum (0 when num <= 0)
 * num  number of elements to add
 * w    threads per block; must be positive
 *
 * Any CUDA failure terminates the process through CUDA_CHECK.
 */
void ArraySumUnfold3(const float* h_a, float* h_b, int num, int w) {
    if (num <= 0) {
        *h_b = 0.0f;  // nothing to add; also avoids launching an empty grid
        return;
    }
    if (w <= 0) {
        // A zero block size would divide by zero in the grid computation.
        fprintf(stderr, "ArraySumUnfold3: block size must be positive, got %d\n", w);
        exit(EXIT_FAILURE);
    }

    float *d_a, *d_b;
    unsigned long long *block_times;

    // Device input buffer. 500 extra elements are allocated and copied so that
    // out-of-range reads in the kernel show up as a wrong sum instead of a fault.
    const size_t padded_bytes = ((size_t)num + 500) * sizeof(float);
    CUDA_CHECK(cudaMalloc((void**)&d_a, padded_bytes));
    CUDA_CHECK(cudaMemcpy(d_a, h_a, padded_bytes, cudaMemcpyHostToDevice));

    int blockSize = w;
    const size_t total_per_block = 3 * (size_t)blockSize;
    int gridSize = (int)(((size_t)num + total_per_block - 1) / total_per_block);  // round up

    dim3 block(blockSize);
    dim3 grid(gridSize);
    CUDA_CHECK(cudaMalloc((void**)&d_b, gridSize * sizeof(float)));
    CUDA_CHECK(cudaMalloc((void**)&block_times, gridSize * sizeof(unsigned long long)));

    sumKernelUnfold3<<<grid, block>>>(d_a, d_b, num, block_times);
    // Check launch and execution errors before any result is read back.
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());

    unsigned long long* h_block_times = (unsigned long long*)malloc(gridSize * sizeof(unsigned long long));
    float* h_partial = (float*)malloc(gridSize * sizeof(float));
    if (h_block_times == NULL || h_partial == NULL) {
        fprintf(stderr, "ArraySumUnfold3: host allocation failed\n");
        exit(EXIT_FAILURE);
    }

    // Copy the per-block timings back and average them.
    CUDA_CHECK(cudaMemcpy(h_block_times, block_times, gridSize * sizeof(unsigned long long), cudaMemcpyDeviceToHost));

    unsigned long long total_cycles = 0;
    int valid_blocks = 0;
    for (int i = 0; i < gridSize; i++) {
        if (h_block_times[i] > 0) {  // skip entries that report zero cycles
            total_cycles += h_block_times[i];
            valid_blocks++;
        }
    }
    double avg_cycles = (valid_blocks > 0) ? (double)total_cycles / valid_blocks : 0.0;
    printf("block(%d);grid(%d)\n", blockSize, gridSize);
    printf("unfold3 block average time: %f cycles\n", avg_cycles);

    // Copy the partial sums back and add them up on the CPU.
    CUDA_CHECK(cudaMemcpy(h_partial, d_b, gridSize * sizeof(float), cudaMemcpyDeviceToHost));
    float sum = 0.0f;
    for (int i = 0; i < gridSize; i++) {
        sum += h_partial[i];
    }
    *h_b = sum;

    // Release resources.
    free(h_block_times);
    free(h_partial);
    CUDA_CHECK(cudaFree(d_a));
    CUDA_CHECK(cudaFree(d_b));
    CUDA_CHECK(cudaFree(block_times));
}

sum.h

cpp 复制代码
#pragma once

#include "stdio.h"

// Number of array elements the benchmark sums.
#define ARRAY_L 1000000

// CUDA error-checking macro: evaluates `call` exactly once; if the result is
// not cudaSuccess, prints the file, line and CUDA error string to stderr and
// terminates the process.
// The temporary has a macro-specific name so it cannot shadow a caller
// variable (for example one named `err`) that appears inside `call`.
#define CUDA_CHECK(call)                                                       \
do {                                                                        \
    cudaError_t cuda_check_err_ = (call);                                  \
    if (cuda_check_err_ != cudaSuccess) {                                  \
        fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__,  \
                cudaGetErrorString(cuda_check_err_));                       \
        exit(EXIT_FAILURE);                                                \
    }                                                                      \
} while (0)

// Host wrappers around the three reduction kernels. Each one copies h_a to the
// device, launches its kernel with w threads per block, adds up the per-block
// partial sums on the host and stores the total in *h_b.
// All three copy num + 500 floats from h_a, so the caller's buffer must hold
// at least that many elements.

// One input element per thread.
void ArraySum(const float* h_a,float* h_b,int num,int w);
// Each block first merges two block-sized chunks, then reduces.
void ArraySumUnfold(const float* h_a,float* h_b,int num,int w);
// Each block first merges three block-sized chunks, then reduces.
void ArraySumUnfold3(const float* h_a, float* h_b, int num, int w);

输出

cpp 复制代码
eric@eric-virtual-machine:~/cfz/learn_by_DS/CUDA/sum/build$ nvprof ./cuda_sum 256
==162784== NVPROF is profiling process 162784, command: ./cuda_sum 256
block(256);grid(3907)
block average time:1987.840332 cycles
gpu1 cost:239869 us;sum=4500000.000000
block(256);grid(1954)
unfold block average time:2221.188232 cycles
gpu2 cost:713 us;sum=4500000.000000
block(256);grid(1303)
unfold3 block average time: 2941.056792 cycles
gpu3 cost:722 us;sum=4500000.000000
sum=4500000.000000
cpu cost:2214 us;sum=4500000.000000
==162784== Profiling application: ./cuda_sum 256
==162784== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   78.50%  1.1675ms         3  389.17us  353.86us  430.44us  [CUDA memcpy HtoD]
                    9.94%  147.78us         1  147.78us  147.78us  147.78us  sumKernel(float*, float*, int, __int64*)
                    5.58%  82.945us         1  82.945us  82.945us  82.945us  sumKernelUnfold(float*, float*, int, __int64*)
                    4.76%  70.721us         1  70.721us  70.721us  70.721us  sumKernelUnfold3(float*, float*, int, __int64*)
                    1.23%  18.271us         6  3.0450us  2.3360us  4.3520us  [CUDA memcpy DtoH]
      API calls:   97.03%  71.709ms         9  7.9677ms  2.8240us  71.389ms  cudaMalloc
                    2.06%  1.5222ms         9  169.14us  11.609us  457.47us  cudaMemcpy
                    0.36%  262.43us         9  29.158us  3.0960us  63.593us  cudaFree
                    0.26%  188.72us         3  62.906us  9.7560us  168.33us  cudaLaunchKernel
                    0.14%  100.87us         3  33.624us  2.2410us  95.706us  cudaDeviceSynchronize
                    0.14%  100.11us       101     991ns      94ns  38.500us  cuDeviceGetAttribute
                    0.02%  14.428us         1  14.428us  14.428us  14.428us  cuDeviceGetName
                    0.01%  4.6110us         1  4.6110us  4.6110us  4.6110us  cuDeviceGetPCIBusId
                    0.00%  1.7840us         3     594ns     149ns  1.3920us  cuDeviceGetCount
                    0.00%     809ns         1     809ns     809ns     809ns  cuModuleGetLoadingMode
                    0.00%     779ns         2     389ns     136ns     643ns  cuDeviceGet
                    0.00%     643ns         2     321ns     272ns     371ns  cudaGetLastError
                    0.00%     244ns         1     244ns     244ns     244ns  cuDeviceTotalMem
                    0.00%     155ns         1     155ns     155ns     155ns  cuDeviceGetUuid
| 指标 | sumKernel | sumKernelUnfold | 变化 |
| --- | --- | --- | --- |
| Block 平均周期数 | 1985.7 | 2216.1 | +11.6%(每个 Block 变慢) |
| Block 总数 | 3907 | 1954 | -50% |
| 总周期数(平均×个数) | ~7.76M | ~4.33M | -44.2% |
| nvprof 总时间 | 147.74 µs | 82.59 µs | -44.1% |

将原来的两个 block 合并后,虽然每个 block 的计算变复杂了,但是总 block 数减少,最后 GPU 耗时反而减少了,而且总周期数的变化率和 Kernel 耗时的变化率几乎完全一样!

  • 单 Block 时间(clock64 测得):指一个 Block 从开始到结束需要约 1.18 µs = 2216 / 1.88 GHz(假设频率 1.88GHz)。

  • 内核总时间(nvprof 测得):指 1954 个 Block 全部算完需要 82.9 µs

由于 GTX 1650 只有 14 个 SM,每个 SM 同时最多跑 2 个 Block,即同时只能跑 28 个 Block。

  • 1954 个 Block 需要分成 1954 / 28 ≈ 70 批(波次)执行。

  • 总时间 ≈ 70 批 × 1.18 µs/批 ≈ 82.6 µs

  • 这完美验证了 nvprof 的 82.9 µs!

    至于为什么每个SM最多同时跑2个block,我还不知道,怎么预测每个block的周期数,我也不知道,不过现在好歹算出了个nvprof的时间,剩下再学吧。