CUDA C++ Clock 示例代码详细分析:如何分析每个线程块的时间效率

文章目录

一、逻辑链路分析

cpp 复制代码
// This example shows how to use the clock function to measure the performance of
// block of threads of a kernel accurately.
//
// Blocks are executed in parallel and out of order. Since there's no synchronization
// mechanism between blocks, we measure the clock once for each block. The clock
// samples are written to device memory.

// System includes
#include <stdio.h>
#include <stdint.h>
#include <assert.h>

// CUDA runtime
#include <cuda_runtime.h>

// helper functions and utilities to work with CUDA
#include <helper_functions.h>
#include <helper_cuda.h>

// This kernel computes a standard parallel min-reduction in every block and
// measures, with clock(), how long that block takes. The timing results are
// stored in device memory.
//
// Parameters:
//   input  - read as input[0 .. 2*blockDim.x). It is indexed by threadIdx.x
//            only, so every block reduces the same elements.
//   output - output[blockIdx.x] receives the minimum found by that block.
//   timer  - timer[blockIdx.x] receives the start clock sample and
//            timer[blockIdx.x + gridDim.x] the end sample, so the array needs
//            2 * gridDim.x entries.
//
// Launch requirements:
//   - Only threadIdx.x / blockIdx.x are used for indexing.
//   - Dynamic shared memory (third launch parameter) must hold at least
//     2 * blockDim.x floats.
//   - The halving loop folds every element into shared[0] only when
//     blockDim.x is a power of two; with other sizes the integer division
//     d /= 2 leaves some elements uncompared.
__global__ static void timedReduction(const float *input, float *output, clock_t *timer)
{
    // Statically sized equivalent, not valid because blockDim.x is a runtime value:
    // __shared__ float shared[2 * blockDim.x];
    // Dynamically sized shared array; its size comes from the launch configuration.
    extern __shared__ float shared[];

    const int tid = threadIdx.x;
    const int bid = blockIdx.x;

    // Start sample: taken once per block, by thread 0 only.
    if (tid == 0) timer[bid] = clock();

    // Copy input: each thread stages two elements, one in each half of shared[].
    shared[tid] = input[tid];
    shared[tid + blockDim.x] = input[tid + blockDim.x];

    // Perform reduction to find minimum. Each pass folds the upper d elements
    // into the lower d elements, then halves d.
    for (int d = blockDim.x; d > 0; d /= 2)
    {
        // Barrier is outside the divergent branch so every thread reaches it;
        // it orders the previous pass's (or the copy's) writes before these reads.
        __syncthreads();

        if (tid < d)
        {
            float f0 = shared[tid];
            float f1 = shared[tid + d];

            // Keep the smaller of the pair in the lower slot.
            if (f1 < f0)
            {
                shared[tid] = f1;
            }
        }
    }

    // Write result. Thread 0 performed the last write to shared[0] itself
    // (d == 1 pass), so no extra barrier is needed before this read.
    if (tid == 0) output[bid] = shared[0];

    // Wait until every thread of the block has arrived before taking the end sample.
    __syncthreads();

    // End sample: stored in the second half of timer[], offset by the grid size.
    if (tid == 0) timer[bid+gridDim.x] = clock();
}

// Grid size: number of thread blocks launched; one start/end clock pair is
// recorded per block.
#define NUM_BLOCKS    64
// Block size: threads per block. The host input array and the dynamic
// shared-memory request are both sized as 2 * NUM_THREADS floats.
#define NUM_THREADS   256

// It's interesting to change the number of blocks and the number of threads to
// understand how to keep the hardware busy.
//
// Here are some numbers I get on my G80:
//    blocks - clocks
//    1 - 3096
//    8 - 3232
//    16 - 3364
//    32 - 4615
//    64 - 9981
//
// With less than 16 blocks some of the multiprocessors of the device are idle. With
// more than 16 you are using all the multiprocessors, but there's only one block per
// multiprocessor and that doesn't allow you to hide the latency of the memory. With
// more than 32 the speed scales linearly.

// Start the main CUDA Sample here.
//
// Fills a host array with 0 .. 2*NUM_THREADS-1, copies it to the device, runs
// timedReduction on NUM_BLOCKS blocks of NUM_THREADS threads, copies the
// per-block clock samples back and prints the average number of clocks a
// block spent in the kernel.
//
// Returns EXIT_SUCCESS; CUDA failures are handled by checkCudaErrors.
int main(int argc, char **argv)
{
    printf("CUDA Clock sample\n");

    // This will pick the best possible CUDA capable device
    // (the returned index is not used further in this function).
    int dev = findCudaDevice(argc, (const char **)argv);

    float *dinput = NULL;
    float *doutput = NULL;
    clock_t *dtimer = NULL;

    // Host copies: timer[0 .. NUM_BLOCKS) holds start samples,
    // timer[NUM_BLOCKS .. 2*NUM_BLOCKS) holds end samples.
    clock_t timer[NUM_BLOCKS * 2];
    float input[NUM_THREADS * 2];

    for (int i = 0; i < NUM_THREADS * 2; i++)
    {
        input[i] = (float)i;
    }

    checkCudaErrors(cudaMalloc((void **)&dinput, sizeof(float) * NUM_THREADS * 2));
    checkCudaErrors(cudaMalloc((void **)&doutput, sizeof(float) * NUM_BLOCKS));
    checkCudaErrors(cudaMalloc((void **)&dtimer, sizeof(clock_t) * NUM_BLOCKS * 2));

    checkCudaErrors(cudaMemcpy(dinput, input, sizeof(float) * NUM_THREADS * 2, cudaMemcpyHostToDevice));

    // Third launch parameter: dynamic shared memory for 2 * NUM_THREADS floats.
    timedReduction<<<NUM_BLOCKS, NUM_THREADS, sizeof(float) * 2 *NUM_THREADS>>>(dinput, doutput, dtimer);

    // A kernel launch returns no status. Without this check an invalid launch
    // configuration (e.g. after changing NUM_THREADS) would go unnoticed and
    // the statistics below would be computed from uninitialised timer data.
    checkCudaErrors(cudaGetLastError());

    // Blocking copy: also waits for the kernel to finish.
    checkCudaErrors(cudaMemcpy(timer, dtimer, sizeof(clock_t) * NUM_BLOCKS * 2, cudaMemcpyDeviceToHost));

    checkCudaErrors(cudaFree(dinput));
    checkCudaErrors(cudaFree(doutput));
    checkCudaErrors(cudaFree(dtimer));

    long double avgElapsedClocks = 0;

    // Sum (end - start) over all blocks, then divide by the block count.
    for (int i = 0; i < NUM_BLOCKS; i++)
    {
        avgElapsedClocks += (long double) (timer[i + NUM_BLOCKS] - timer[i]);
    }

    avgElapsedClocks = avgElapsedClocks/NUM_BLOCKS;
    printf("Average clocks/block = %Lf\n", avgElapsedClocks);

    return EXIT_SUCCESS;
}

1.1 程序整体架构

该程序在GPU上执行并行归约(求最小值),并利用clock()函数测量每个线程块执行归约操作所消耗的时钟周期数。

1.2 数据流逻辑

复制代码
主机端数据初始化 → 拷贝到设备端 → GPU并行归约计算 + 计时 → 拷贝回主机端 → 统计分析

1.3 核心算法逻辑

  • 归约算法:使用共享内存进行并行归约,每个线程块处理2*NUM_THREADS个数据元素
  • 计时机制:在线程块开始和结束时分别记录时钟值,计算差值得到执行时间

1.4 线程块与数据映射

  • 64个线程块,每个块256个线程
  • 每个线程块处理512个float数据(2*256)
  • 注意:kernel按tid(而非bid)索引input,所有线程块读取的是同一份512个float;输入总量是512个数据点,而不是64*512 = 32768个,64个块各自重复同一次归约并分别计时

二、逐行代码功能分析

2.1 头文件和宏定义部分

c 复制代码
#include <stdio.h>
#include <stdint.h>
#include <assert.h>
#include <cuda_runtime.h>
#include <helper_functions.h>
#include <helper_cuda.h>
  • 包含标准库和CUDA辅助库
  • helper_cuda.h提供checkCudaErrors()等错误检查宏

2.2 Kernel函数:timedReduction

参数列表
c 复制代码
__global__ static void timedReduction(const float *input, float *output, clock_t *timer)
  • __global__:表示这是CUDA内核函数,在设备端执行,从主机端调用
  • input:输入数据指针(只读)
  • output:输出结果指针,每个线程块输出一个最小值
  • timer:计时数组,存储每个块的开始和结束时钟
共享内存声明
c 复制代码
extern __shared__ float shared[];
  • 动态分配共享内存大小,在kernel调用时通过第三个参数指定
  • 每个线程块独有的高速缓存(在SM内部)
线程索引获取
c 复制代码
const int tid = threadIdx.x;
const int bid = blockIdx.x;
  • tid:块内线程索引(0-255)
  • bid:网格中块索引(0-63)
开始计时
c 复制代码
if (tid == 0) timer[bid] = clock();
  • 只有每个块的第一个线程(tid=0)记录开始时间
  • clock()返回GPU当前时钟周期计数
数据加载到共享内存
c 复制代码
shared[tid] = input[tid];
shared[tid + blockDim.x] = input[tid + blockDim.x];
  • 每个线程加载两个数据到共享内存
  • 索引只用到tid,没有用到bid:每个块加载的都是同一段input[0]~input[511],而不是块0处理0-511、块1处理512-1023(dinput也只分配了512个float)
并行归约核心循环
c 复制代码
for (int d = blockDim.x; d > 0; d /= 2)
{
    __syncthreads();
    
    if (tid < d)
    {
        float f0 = shared[tid];
        float f1 = shared[tid + d];
        
        if (f1 < f0)
        {
            shared[tid] = f1;
        }
    }
}
  • 归约流程
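    • d=256: 线程0-255各自比较shared[tid]与shared[tid+256],较小值保留在shared[tid](512个元素折半为256个)
    • d=128: 线程0-127比较shared[tid]与shared[tid+128]
    • d=64: 线程0-63继续折半
    • ...直到d=1: 线程0比较shared[0]与shared[1],最终最小值位于shared[0]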
  • __syncthreads():同步块内所有线程,确保数据一致性
输出结果
c 复制代码
if (tid == 0) output[bid] = shared[0];
  • 每个块的第一个线程将最小值写入全局内存
结束计时
c 复制代码
__syncthreads();
if (tid == 0) timer[bid+gridDim.x] = clock();
  • 同步确保所有线程完成归约
  • 记录结束时钟,存储在数组后半部分(偏移gridDim.x=64)

2.3 主函数main

常量定义
c 复制代码
#define NUM_BLOCKS    64
#define NUM_THREADS   256
  • 定义网格和块大小
设备选择
c 复制代码
int dev = findCudaDevice(argc, (const char **)argv);
  • 自动选择最佳CUDA设备
内存分配
c 复制代码
checkCudaErrors(cudaMalloc((void **)&dinput, sizeof(float) * NUM_THREADS * 2));
checkCudaErrors(cudaMalloc((void **)&doutput, sizeof(float) * NUM_BLOCKS));
checkCudaErrors(cudaMalloc((void **)&dtimer, sizeof(clock_t) * NUM_BLOCKS * 2));
  • 设备端内存分配:
    • dinput: 512个float
    • doutput: 64个float(每个块一个结果)
    • dtimer: 128个clock_t(每个块开始和结束各一个)
数据初始化与传输
c 复制代码
for (int i = 0; i < NUM_THREADS * 2; i++)
{
    input[i] = (float)i;
}
  • 主机端数据初始化为0-511
c 复制代码
checkCudaErrors(cudaMemcpy(dinput, input, sizeof(float) * NUM_THREADS * 2, cudaMemcpyHostToDevice));
  • 将数据从主机拷贝到设备
Kernel启动
c 复制代码
timedReduction<<<NUM_BLOCKS, NUM_THREADS, sizeof(float) * 2 *NUM_THREADS>>>(dinput, doutput, dtimer);
  • <<<64, 256, 512*sizeof(float)>>>
  • 第三个参数指定动态共享内存大小:512*4=2048字节
结果回传与统计分析
c 复制代码
checkCudaErrors(cudaMemcpy(timer, dtimer, sizeof(clock_t) * NUM_BLOCKS * 2, cudaMemcpyDeviceToHost));
  • 将计时数据拷贝回主机
c 复制代码
long double avgElapsedClocks = 0;
for (int i = 0; i < NUM_BLOCKS; i++)
{
    avgElapsedClocks += (long double) (timer[i + NUM_BLOCKS] - timer[i]);
}
avgElapsedClocks = avgElapsedClocks/NUM_BLOCKS;
  • 计算每个块的平均执行时钟周期数
  • timer[i]:开始时间,timer[i+NUM_BLOCKS]:结束时间

三、关键技术点总结

  1. 动态共享内存:使用extern __shared__在运行时指定大小
  2. 并行归约:利用共享内存减少全局内存访问
  3. 线程同步:__syncthreads()确保块内线程同步
  4. 性能测量:clock()函数测量GPU时钟周期
  5. 单线程写入:通过tid==0保证每个块只有一个线程写timer和output,避免写竞争(这里并没有使用原子操作)

四、性能优化洞察

注释中的性能数据展示了:

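  • 块数量从1增加到64,每块的时钟数从3096上升到9981:1~16块时几乎不变(3096→3364),32块时升至4615,64块时约翻倍
  • 原因(依据代码注释):少于16块时部分SM空闲;超过16块后所有SM都被占用,但每个SM上只有一个块,无法隐藏内存延迟;超过32块后耗时随块数近似线性增长
  • 经验上,块数常取SM数量的若干倍(例如2-4倍)以隐藏内存延迟,具体最优值需在目标硬件上实测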

该示例很好地展示了CUDA程序性能分析的基础方法。