smsp__inst_executed_pipe_fp64为什么对不上

smsp__inst_executed_pipe_fp64为什么对不上

smsp__inst_executed_pipe_fp64 为什么对不上

一.小结

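  • Kernel_v2 中的 idx * 2.0 需要先把 int 型的 idx 转换为 double,编译器会为此生成 I2F.F64 指令(Kernel_v1 同样有 int idx = threadIdx.x;,但没有这次转换,所以它的 SASS 里没有 I2F.F64)
  • I2F.F64 也会占用 fp64 pipe,因此会被计入 smsp__inst_executed_pipe_fp64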

二.复现过程

bash 复制代码
tee fp64_test.cu<<-'EOF'
#include <cuda_runtime.h>
#include <iostream>

// Doubles output[threadIdx.x] in place: loads the element, multiplies it by the
// double literal 2.0 and stores the result back to the same slot.
// The index is threadIdx.x only and there is no bounds check, so `output` must hold
// at least blockDim.x doubles, and every block of a multi-block launch would touch
// the same elements.
__global__ void Kernel_v1(double *output) {
    int idx = threadIdx.x;
    output[idx] = output[idx] * 2.0;
}

// Stores (double)threadIdx.x * 2.0 into output[threadIdx.x]; nothing is read from `output`.
// The index is threadIdx.x only and there is no bounds check, so `output` must hold
// at least blockDim.x doubles.
__global__ void Kernel_v2(double *output) {
    int idx = threadIdx.x;
    // 2.0 is a double literal, so the int `idx` is converted to double before the
    // multiply; in the sm_86 SASS shown in this article that conversion is the
    // I2F.F64 instruction.
    output[idx] = idx * 2.0;
}
EOF

# Compile the kernels to PTX for sm_86, keeping source line information.
/usr/local/cuda/bin/nvcc -std=c++17 -dc -lineinfo -arch=sm_86 -ptx fp64_test.cu -o fp64_test.ptx
# Assemble the PTX into a cubin for sm_86.
/usr/local/cuda/bin/nvcc -arch=sm_86 fp64_test.ptx -cubin -o fp64_test.cubin
# Wrap the cubin in a fatbin; this is the file the host program loads with cuModuleLoad.
/usr/local/cuda/bin/nvcc -arch=sm_86 fp64_test.cubin -fatbin -o fp64_test.fatbin
# Show the generated PTX.
cat fp64_test.ptx
# Show the SASS contained in the fatbin.
/usr/local/cuda/bin/cuobjdump --dump-sass fp64_test.fatbin

tee fp64_test_main.cpp<<-'EOF'
#include <stdio.h>
#include <string.h>
#include <cuda_runtime.h>
#include <cuda.h>

int main(int argc,char *argv[])
{
    CUresult error;
    CUdevice cuDevice;
    cuInit(0);
    int deviceCount = 0;
    error = cuDeviceGetCount(&deviceCount);
    error = cuDeviceGet(&cuDevice, 0);
    if(error!=CUDA_SUCCESS)
        {
        printf("Error happened in get device!\n");
    }
    CUcontext cuContext;
    error = cuCtxCreate(&cuContext, 0, cuDevice);
    if(error!=CUDA_SUCCESS)
        {
        printf("Error happened in create context!\n");
    }
    int block_count=1;int block_size=32;
    int thread_size=block_count*block_size;

    int data_size=sizeof(double)*thread_size;

    double *output_ptr=nullptr;
    double *input_ptr=nullptr;
    int cudaStatus=0;
    cudaStatus = cudaMalloc((void**)&input_ptr, data_size);
    cudaStatus = cudaMalloc((void**)&output_ptr, data_size);
    void *kernelParams[]= {(void*)&output_ptr, (void*)&input_ptr};

    CUmodule module;
    CUfunction double_function;
    CUfunction float_function;
    const char* module_file = "fp64_test.fatbin";
    const char* double_kernel_name = "_Z9Kernel_v1Pd";
    const char* float_kernel_name = "_Z9Kernel_v2Pd";
    
    error = cuModuleLoad(&module, module_file);
    if(error!=CUDA_SUCCESS)
        {
        printf("Error happened in load moudle %d!\n",error);
    }
    error = cuModuleGetFunction(&double_function, module, double_kernel_name);
    if(error!=CUDA_SUCCESS)
    {
        printf("get double_function error!\n");
    }
    error = cuModuleGetFunction(&float_function, module, float_kernel_name);
    if(error!=CUDA_SUCCESS)
    {
        printf("get float_kernel_name error!\n");
    }    
    cuLaunchKernel(double_function,
                    block_count, 1, 1,
                    block_size, 1, 1,
                    0,0,kernelParams, 0);
    cuLaunchKernel(float_function,
                    block_count, 1, 1,
                    block_size, 1, 1,
                    0,0,kernelParams, 0);
    cudaFree(output_ptr);
    cudaFree(input_ptr);
    cuModuleUnload(module);
    cuCtxDestroy(cuContext);
    return 0;
}
EOF
# Build the host program against the CUDA runtime (cudart) and driver (cuda) libraries.
g++ fp64_test_main.cpp -o fp64_test_main -I /usr/local/cuda/include -L /usr/local/cuda/lib64 -lcudart -lcuda
# Baseline: collect the fp64 pipe instruction count for both kernels.
/usr/local/NVIDIA-Nsight-Compute/ncu --metrics smsp__inst_executed_pipe_fp64.max ./fp64_test_main

# Disassemble the cubin into an editable .cuasm file
# (cuasm.py is presumably the CuAssembler tool; confirm it is on PATH).
cuasm.py fp64_test.cubin fp64_test.cuasm
# Delete every line that contains "I2F", i.e. remove the I2F.F64 instruction from the listing.
sed '/I2F/d' -i fp64_test.cuasm
# Reassemble the edited listing (presumably regenerates fp64_test.cubin; verify the output name).
cuasm.py fp64_test.cuasm
# Repackage the cubin into the fatbin that the host program loads.
/usr/local/cuda/bin/nvcc -arch=sm_86 fp64_test.cubin -fatbin -o fp64_test.fatbin
# Inspect the SASS and the resource usage of the repackaged fatbin.
/usr/local/cuda/bin/cuobjdump --dump-sass fp64_test.fatbin
/usr/local/cuda/bin/cuobjdump --dump-resource-usage fp64_test.fatbin

# Profile again with the modified fatbin to compare against the baseline.
/usr/local/NVIDIA-Nsight-Compute/ncu --metrics smsp__inst_executed_pipe_fp64.max ./fp64_test_main

三.输出

  • 修改前
bash 复制代码
		Function : _Z9Kernel_v2Pd
.headerflags    @"EF_CUDA_TEXMODE_UNIFIED EF_CUDA_64BIT_ADDRESS EF_CUDA_SM86 EF_CUDA_VIRTUAL_SM(EF_CUDA_SM86)"
/*0000*/                   MOV R1, c[0x0][0x28] ;                 /* 0x00000a0000017a02 */
																  /* 0x000fc40000000f00 */
/*0010*/                   S2R R0, SR_TID.X ;                     /* 0x0000000000007919 */
																  /* 0x000e220000002100 */
/*0020*/                   MOV R5, 0x8 ;                          /* 0x0000000800057802 */
																  /* 0x000fe20000000f00 */
/*0030*/                   ULDC.64 UR4, c[0x0][0x118] ;           /* 0x0000460000047ab9 */
																  /* 0x000fe20000000a00 */
/*0040*/                   I2F.F64 R2, R0 ;                       /* 0x0000000000027312 */  #怀疑点
																  /* 0x001e260000201c00 */
/*0050*/                   IMAD.WIDE R4, R0, R5, c[0x0][0x160] ;  /* 0x0000580000047625 */
																  /* 0x000fe200078e0205 */
/*0060*/                   DADD R2, R2, R2 ;                      /* 0x0000000002027229 */
																  /* 0x001e0e0000000002 */
/*0070*/                   STG.E.64 [R4.64], R2 ;                 /* 0x0000000204007986 */
																  /* 0x001fe2000c101b04 */
/*0080*/                   EXIT ;                                 /* 0x000000000000794d */
																  /* 0x000fea0003800000 */
/*0090*/                   BRA 0x90;                              /* 0xfffffff000007947 */

		Function : _Z9Kernel_v1Pd
.headerflags    @"EF_CUDA_TEXMODE_UNIFIED EF_CUDA_64BIT_ADDRESS EF_CUDA_SM86 EF_CUDA_VIRTUAL_SM(EF_CUDA_SM86)"
/*0000*/                   MOV R1, c[0x0][0x28] ;                 /* 0x00000a0000017a02 */
																  /* 0x000fc40000000f00 */
/*0010*/                   S2R R2, SR_TID.X ;                     /* 0x0000000000027919 */
																  /* 0x000e220000002100 */
/*0020*/                   MOV R3, 0x8 ;                          /* 0x0000000800037802 */
																  /* 0x000fe20000000f00 */
/*0030*/                   ULDC.64 UR4, c[0x0][0x118] ;           /* 0x0000460000047ab9 */
																  /* 0x000fc80000000a00 */
/*0040*/                   IMAD.WIDE R2, R2, R3, c[0x0][0x160] ;  /* 0x0000580002027625 */
																  /* 0x001fca00078e0203 */
/*0050*/                   LDG.E.64 R4, [R2.64] ;                 /* 0x0000000402047981 */
																  /* 0x000ea4000c1e1b00 */
/*0060*/                   DADD R4, R4, R4 ;                      /* 0x0000000004047229 */
																  /* 0x004e0e0000000004 */
/*0070*/                   STG.E.64 [R2.64], R4 ;                 /* 0x0000000402007986 */
																  /* 0x001fe2000c101b04 */
/*0080*/                   EXIT ;                                 /* 0x000000000000794d */
																  /* 0x000fea0003800000 */
/*0090*/                   BRA 0x90;                              /* 0xfffffff000007947 */

Kernel_v1(double *) (1, 1, 1)x(32, 1, 1), Context 1, Stream 7, Device 0, CC 8.6
Section: Command line profiler metrics
--------------------------------- ----------- ------------
Metric Name                       Metric Unit Metric Value
--------------------------------- ----------- ------------
smsp__inst_executed_pipe_fp64.max        inst            1
--------------------------------- ----------- ------------

Kernel_v2(double *) (1, 1, 1)x(32, 1, 1), Context 1, Stream 7, Device 0, CC 8.6
Section: Command line profiler metrics
--------------------------------- ----------- ------------
Metric Name                       Metric Unit Metric Value
--------------------------------- ----------- ------------
smsp__inst_executed_pipe_fp64.max        inst            2  #fp64 pipe执行了两条指令(I2F.F64 和 DADD)
--------------------------------- ----------- ------------
  • I2F.F64 R2, R0删除后
bash 复制代码
Kernel_v1(double *) (1, 1, 1)x(32, 1, 1), Context 1, Stream 7, Device 0, CC 8.6
Section: Command line profiler metrics
--------------------------------- ----------- ------------
Metric Name                       Metric Unit Metric Value
--------------------------------- ----------- ------------
smsp__inst_executed_pipe_fp64.max        inst            1
--------------------------------- ----------- ------------

Kernel_v2(double *) (1, 1, 1)x(32, 1, 1), Context 1, Stream 7, Device 0, CC 8.6
Section: Command line profiler metrics
--------------------------------- ----------- ------------
Metric Name                       Metric Unit Metric Value
--------------------------------- ----------- ------------
smsp__inst_executed_pipe_fp64.max        inst            1 #只剩 DADD 一条指令
--------------------------------- ----------- ------------
相关推荐
无尽的大道4 小时前
深入理解 Java 阻塞队列:使用场景、原理与性能优化
java·开发语言·性能优化
loey_ln4 小时前
webpack配置和打包性能优化
前端·webpack·性能优化
郭梧悠15 小时前
HarmonyOS(57) UI性能优化
ui·性能优化·harmonyos
奈斯ing15 小时前
【Oracle篇】SQL性能优化实战案例(从15秒优化到0.08秒)(第七篇,总共七篇)
运维·数据库·sql·oracle·性能优化
探索云原生1 天前
大模型推理指南:使用 vLLM 实现高效推理
ai·云原生·kubernetes·gpu·vllm
青云交1 天前
大数据新视界 -- Impala 性能优化:分布式环境中的优化新视野(下)(28 / 30)
大数据·性能优化·资源管理·impala·优化策略·分布式环境·数据布局
hummhumm2 天前
第 24 章 -Golang 性能优化
java·开发语言·前端·后端·python·性能优化·golang
白茶等风121382 天前
准备阶段 Profiler性能分析工具的使用(一)
unity·性能优化
激流丶2 天前
【Redis 探秘】Redis 性能优化技巧
redis·性能优化·bootstrap