Verifying whether the four SMSPs access shared memory serially

On Ampere, each SM is divided into four sub-partitions (SMSPs), each with its own warp scheduler. I had assumed that warps running on the four SMSPs could access shared memory in parallel as long as there were no bank conflicts.

The test below shows that the accesses are in fact serialized: shared memory services only one request (wavefront) per cycle for the whole SM, shared by all four SMSPs.
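
As a quick sanity check on the access pattern (this sketch is my own addition, not part of the original test): shared memory on current NVIDIA GPUs is divided into 32 banks of 4 bytes each, and a 4-byte element at byte offset addr falls in bank (addr / 4) % 32, so the tid-indexed float load used below is conflict-free within a warp:

#include <cstdio>

// Shared memory has 32 banks, each 4 bytes wide; a 4-byte element at
// byte offset `addr` lives in bank (addr / 4) % 32.
int main() {
    for (int tid = 0; tid < 32; ++tid) {
        int byte_offset = tid * 4;          // address of float shm_data[tid]
        int bank = (byte_offset / 4) % 32;  // == tid % 32
        printf("tid %2d -> bank %2d\n", tid, bank);
    }
    // All 32 lanes of a warp land in 32 distinct banks: no conflicts.
    return 0;
}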

Test procedure

tee shm_kernel.cu<<-'EOF'
#include <iostream>
#include <cuda_runtime.h>
// Each thread does one 4-byte shared-memory load; consecutive tids hit
// consecutive banks, so a warp's access is free of bank conflicts.
__global__ void shm_kernel(float *input, float *output) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    __shared__ float shm_data[0xc000 / 4];  // 48 KB of shared memory
    float vals;
    clock_t t0 = clock64();   // clock64() reads the per-SM cycle counter
    vals = shm_data[tid];     // one LDS per thread; data is uninitialized, only the timing matters
    __syncthreads();
    clock_t t1 = clock64();
    vals *= tid;              // keep the load from being optimized away
    output[tid] = vals;
    if (tid == 0) {
        printf("ts:%lld\n", t1 - t0);  // elapsed cycles around the shared load
    }
}
EOF

/usr/local/cuda/bin/nvcc -std=c++17  -dc -lineinfo -arch=sm_86 -ptx shm_kernel.cu -o shm_kernel.ptx
/usr/local/cuda/bin/nvcc -arch=sm_86 shm_kernel.ptx -cubin -o shm_kernel.cubin
/usr/local/cuda/bin/nvcc -arch=sm_86 shm_kernel.cubin -fatbin -o shm_kernel.fatbin
/usr/local/cuda/bin/cuobjdump --dump-sass shm_kernel.fatbin
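
(The explicit .cu -> PTX -> cubin -> fatbin steps above just make each intermediate stage inspectable; if I am not mistaken, a single "nvcc -arch=sm_86 -fatbin shm_kernel.cu -o shm_kernel.fatbin" should produce an equivalent fatbin for this test.)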
 
tee shm_kernel_main.cpp<<-'EOF'
#include <stdio.h>
#include <string.h>
#include <cuda_runtime.h>
#include <cuda.h>

int main(int argc,char *argv[])
{
    CUresult error;
    CUdevice cuDevice;
    cuInit(0);
    int deviceCount = 0;
    error = cuDeviceGetCount(&deviceCount);
    error = cuDeviceGet(&cuDevice, 0);
    if (error != CUDA_SUCCESS) {
        printf("Error happened in get device!\n");
    }
    CUcontext cuContext;
    error = cuCtxCreate(&cuContext, 0, cuDevice);
    if (error != CUDA_SUCCESS) {
        printf("Error happened in create context!\n");
    }

    CUmodule module;
    CUfunction function;

    const char* module_file = "shm_kernel.fatbin";
    const char* kernel_name = "_Z10shm_kernelPfS_";

    error = cuModuleLoad(&module, module_file);
    if (error != CUDA_SUCCESS) {
        printf("Error happened in load module %d!\n", error);
    }

    error = cuModuleGetFunction(&function, module, kernel_name);
    if (error != CUDA_SUCCESS) {
        printf("get function error!\n");
    }
    
    int thread_size_conf[3] = {32, 32*4, 32*4*4};  // 1, 4, and 16 warps
    
    for(int k=0;k<3;k++)
    {
        int block_size=1;
        int thread_size=thread_size_conf[k];
        int data_size=sizeof(float)*thread_size*block_size;

        float *output_ptr=nullptr;
        float *input_ptr=nullptr;
        cudaError_t cudaStatus = cudaSuccess;

        cudaStatus = cudaMalloc((void**)&input_ptr, data_size);
        if (cudaStatus != cudaSuccess) {
            printf("cudaMalloc input_ptr Failed\n");
        }
        cudaStatus = cudaMalloc((void**)&output_ptr, data_size);
        if (cudaStatus != cudaSuccess) {
            printf("cudaMalloc output_ptr Failed\n");
        }
        // Argument order must match the kernel signature: (float *input, float *output).
        void *kernelParams[] = {(void*)&input_ptr, (void*)&output_ptr};

        CUresult ret = cuLaunchKernel(function,
                    block_size, 1, 1,   // grid dimensions
                    thread_size, 1, 1,  // block dimensions
                    0, 0,               // no dynamic shared memory, default stream
                    kernelParams, 0);
        if (ret != CUDA_SUCCESS) {
            printf("cuLaunchKernel failed: %d\n", (int)ret);
        }
        cudaError_t cudaerr = cudaDeviceSynchronize();
        if (cudaerr != cudaSuccess)
            printf("kernel launch failed with error \"%s\".\n",cudaGetErrorString(cudaerr));  
       
        cudaFree(output_ptr);
        cudaFree(input_ptr);        
    }
    cuModuleUnload(module);
    cuCtxDestroy(cuContext);
    return 0;
}
EOF
g++ shm_kernel_main.cpp -o shm_kernel_main -I /usr/local/cuda/include -L /usr/local/cuda/lib64 -lcudart -lcuda
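
The repeated if/printf checks in the launcher could also be wrapped in a small helper macro; a minimal sketch (my own addition, the CU_CHECK name is hypothetical):

#include <cstdio>
#include <cuda.h>

// Print a readable message and bail out on any CUDA driver API error.
// Note: the `return 1` only works inside functions returning int, e.g. main().
#define CU_CHECK(call)                                            \
    do {                                                          \
        CUresult _e = (call);                                     \
        if (_e != CUDA_SUCCESS) {                                 \
            const char *msg = nullptr;                            \
            cuGetErrorString(_e, &msg);                           \
            printf("%s failed: %s\n", #call, msg ? msg : "?");    \
            return 1;                                             \
        }                                                         \
    } while (0)

// Usage in main():
//   CU_CHECK(cuInit(0));
//   CU_CHECK(cuDeviceGet(&cuDevice, 0));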

/usr/local/NVIDIA-Nsight-Compute/ncu --metrics l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum,\
smsp__sass_l1tex_data_bank_conflicts_pipe_lsu_mem_shared_op_ldgsts.sum,\
smsp__sass_inst_executed_op_shared_ld.sum,\
l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum.peak_sustained,\
l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.avg.peak_sustained,\
l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum ./shm_kernel_main
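
Roughly what these metrics measure: smsp__sass_inst_executed_op_shared_ld.sum counts executed shared-memory load (LDS) instructions; the two bank_conflicts counters count shared-memory bank conflicts for LDS and LDGSTS accesses respectively; l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum counts the wavefronts the L1TEX LSU data pipe processed for those loads (one per conflict-free warp access); and the .peak_sustained variants report that pipe's sustained peak throughput in wavefronts per cycle.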

Output

ts:33
ts:1551
0%....50%....100% - 3 passes
==PROF== Profiling "shm_kernel(float *, float *)" - 1: ts:39
ts:39
ts:1622
0%....50%....100% - 3 passes
==PROF== Profiling "shm_kernel(float *, float *)" - 2: ts:64
ts:57
ts:1706
0%....50%....100% - 3 passes
==PROF== Disconnected from process 657443
[657443] shm_kernel_main@127.0.0.1
  shm_kernel(float *, float *) (1, 1, 1)x(32, 1, 1), Context 1, Stream 7, Device 0, CC 8.6
    Section: Command line profiler metrics
    ---------------------------------------------------------------------- ----------- ------------
    Metric Name                                                            Metric Unit Metric Value
    ---------------------------------------------------------------------- ----------- ------------
    l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum                                      0
    l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.avg.peak_sustained        1/cycle            1
    l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum                                          1
    l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum.peak_sustained        1/cycle           28
    smsp__sass_inst_executed_op_shared_ld.sum                                     inst            1
    smsp__sass_l1tex_data_bank_conflicts_pipe_lsu_mem_shared_op_ldgsts.sum                        0
    ---------------------------------------------------------------------- ----------- ------------

  shm_kernel(float *, float *) (1, 1, 1)x(128, 1, 1), Context 1, Stream 7, Device 0, CC 8.6
    Section: Command line profiler metrics
    ---------------------------------------------------------------------- ----------- ------------
    Metric Name                                                            Metric Unit Metric Value
    ---------------------------------------------------------------------- ----------- ------------
    l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum                                      0
    l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.avg.peak_sustained        1/cycle            1
    l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum                                          4
    l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum.peak_sustained        1/cycle           28
    smsp__sass_inst_executed_op_shared_ld.sum                                     inst            4
    smsp__sass_l1tex_data_bank_conflicts_pipe_lsu_mem_shared_op_ldgsts.sum                        0
    ---------------------------------------------------------------------- ----------- ------------

  shm_kernel(float *, float *) (1, 1, 1)x(512, 1, 1), Context 1, Stream 7, Device 0, CC 8.6
    Section: Command line profiler metrics
    ---------------------------------------------------------------------- ----------- ------------
    Metric Name                                                            Metric Unit Metric Value
    ---------------------------------------------------------------------- ----------- ------------
    l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum                                      0
    l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.avg.peak_sustained        1/cycle            1
    l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum                                         16
    l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum.peak_sustained        1/cycle           28
    smsp__sass_inst_executed_op_shared_ld.sum                                     inst           16
    smsp__sass_l1tex_data_bank_conflicts_pipe_lsu_mem_shared_op_ldgsts.sum                        0
    ---------------------------------------------------------------------- ----------- ------------
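
Summarizing the three runs:

threads   warps   shared LD inst   LSU wavefronts   bank conflicts
32        1       1                1                0
128       4       4                4                0
512       16      16               16               0

Every conflict-free warp access costs exactly one wavefront, and l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.avg.peak_sustained is 1/cycle per SM (the .sum.peak_sustained value of 28 is presumably this per-SM rate summed over the GPU's 28 SMs). In other words, each SM has a single shared-memory data pipe that can retire at most one wavefront per cycle, so warps from the four SMSPs queue up on that pipe rather than accessing shared memory in parallel, which is consistent with ts growing as more warps are added.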