smsp__inst_executed_pipe_fp64为什么对不上

smsp__inst_executed_pipe_fp64为什么对不上

smsp__inst_executed_pipe_fp64 为什么对不上

一.小结

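  • Kernel_v2 中的 output[idx] = idx * 2.0; 需要先把 int 类型的 idx 转换成 double,这个转换会生成 I2F.F64 指令(Kernel_v1 同样有 int idx = threadIdx.x;,但没有这个转换,所以没有 I2F.F64)
  • I2F.F64 也在 fp64 pipe 上执行,因此会被计入 smsp__inst_executed_pipe_fp64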

二.复现过程

bash 复制代码
tee fp64_test.cu<<-'EOF'
#include <cuda_runtime.h>
#include <iostream>

// Baseline kernel: each thread loads output[idx], multiplies it by 2.0 and
// stores the result back. The index is used only for addressing, so no
// int-to-double conversion of idx is required here.
// NOTE(review): indexes by threadIdx.x only and has no bounds check, so it
// assumes a single block no larger than the buffer -- confirm against the launch.
__global__ void Kernel_v1(double *output) {
    int idx = threadIdx.x;  // signed per-block thread index
    output[idx] = output[idx] * 2.0;  // double * double: the only FP64 arithmetic in this kernel
}

// Comparison kernel: each thread stores idx * 2.0 into output[idx] without
// reading the buffer. Because idx is an int, it must be converted to double
// before the multiply; that conversion is the instruction under study
// (it appears as I2F.F64 in the sm_86 SASS dump).
// NOTE(review): indexes by threadIdx.x only and has no bounds check, so it
// assumes a single block no larger than the buffer -- confirm against the launch.
__global__ void Kernel_v2(double *output) {
    int idx = threadIdx.x;  // signed per-block thread index
    output[idx] = idx * 2.0;  // int -> double conversion of idx, then the FP64 multiply
}
EOF

# Step 1: compile the kernels to PTX for sm_86 (with line info).
/usr/local/cuda/bin/nvcc -std=c++17 -dc -lineinfo -arch=sm_86 -ptx fp64_test.cu -o fp64_test.ptx
# Step 2: assemble the PTX into a cubin.
/usr/local/cuda/bin/nvcc -arch=sm_86 fp64_test.ptx -cubin -o fp64_test.cubin
# Step 3: wrap the cubin in a fatbin; this is the file the host program loads.
/usr/local/cuda/bin/nvcc -arch=sm_86 fp64_test.cubin -fatbin -o fp64_test.fatbin
# Inspect the generated PTX and the final SASS.
cat fp64_test.ptx
/usr/local/cuda/bin/cuobjdump --dump-sass fp64_test.fatbin

tee fp64_test_main.cpp<<-'EOF'
#include <stdio.h>
#include <string.h>
#include <cuda_runtime.h>
#include <cuda.h>

int main(int argc,char *argv[])
{
    CUresult error;
    CUdevice cuDevice;
    cuInit(0);
    int deviceCount = 0;
    error = cuDeviceGetCount(&deviceCount);
    error = cuDeviceGet(&cuDevice, 0);
    if(error!=CUDA_SUCCESS)
        {
        printf("Error happened in get device!\n");
    }
    CUcontext cuContext;
    error = cuCtxCreate(&cuContext, 0, cuDevice);
    if(error!=CUDA_SUCCESS)
        {
        printf("Error happened in create context!\n");
    }
    int block_count=1;int block_size=32;
    int thread_size=block_count*block_size;

    int data_size=sizeof(double)*thread_size;

    double *output_ptr=nullptr;
    double *input_ptr=nullptr;
    int cudaStatus=0;
    cudaStatus = cudaMalloc((void**)&input_ptr, data_size);
    cudaStatus = cudaMalloc((void**)&output_ptr, data_size);
    void *kernelParams[]= {(void*)&output_ptr, (void*)&input_ptr};

    CUmodule module;
    CUfunction double_function;
    CUfunction float_function;
    const char* module_file = "fp64_test.fatbin";
    const char* double_kernel_name = "_Z9Kernel_v1Pd";
    const char* float_kernel_name = "_Z9Kernel_v2Pd";
    
    error = cuModuleLoad(&module, module_file);
    if(error!=CUDA_SUCCESS)
        {
        printf("Error happened in load moudle %d!\n",error);
    }
    error = cuModuleGetFunction(&double_function, module, double_kernel_name);
    if(error!=CUDA_SUCCESS)
    {
        printf("get double_function error!\n");
    }
    error = cuModuleGetFunction(&float_function, module, float_kernel_name);
    if(error!=CUDA_SUCCESS)
    {
        printf("get float_kernel_name error!\n");
    }    
    cuLaunchKernel(double_function,
                    block_count, 1, 1,
                    block_size, 1, 1,
                    0,0,kernelParams, 0);
    cuLaunchKernel(float_function,
                    block_count, 1, 1,
                    block_size, 1, 1,
                    0,0,kernelParams, 0);
    cudaFree(output_ptr);
    cudaFree(input_ptr);
    cuModuleUnload(module);
    cuCtxDestroy(cuContext);
    return 0;
}
EOF
# Build the host program against the CUDA runtime and driver libraries.
g++ fp64_test_main.cpp -o fp64_test_main -I /usr/local/cuda/include -L /usr/local/cuda/lib64 -lcudart -lcuda
# Baseline measurement: FP64-pipe instruction count for each kernel.
/usr/local/NVIDIA-Nsight-Compute/ncu --metrics smsp__inst_executed_pipe_fp64.max ./fp64_test_main

# Disassemble the cubin into an editable .cuasm file.
# NOTE(review): presumably cuasm.py is the CuAssembler command-line tool -- confirm it is installed and on PATH.
cuasm.py fp64_test.cubin fp64_test.cuasm
# Delete every line that contains "I2F", i.e. the I2F.F64 instruction under suspicion.
sed '/I2F/d' -i fp64_test.cuasm
# Reassemble the edited .cuasm.
# NOTE(review): the next step assumes this overwrites fp64_test.cubin -- verify against the tool's default output name.
cuasm.py fp64_test.cuasm
# Repackage the cubin as the fatbin that the host program loads.
/usr/local/cuda/bin/nvcc -arch=sm_86 fp64_test.cubin -fatbin -o fp64_test.fatbin
# Check the patched SASS and its resource usage.
/usr/local/cuda/bin/cuobjdump --dump-sass fp64_test.fatbin
/usr/local/cuda/bin/cuobjdump --dump-resource-usage fp64_test.fatbin

# Measure again with the I2F line removed, for comparison with the baseline.
/usr/local/NVIDIA-Nsight-Compute/ncu --metrics smsp__inst_executed_pipe_fp64.max ./fp64_test_main

三.输出

  • 修改前
bash 复制代码
		Function : _Z9Kernel_v2Pd
.headerflags    @"EF_CUDA_TEXMODE_UNIFIED EF_CUDA_64BIT_ADDRESS EF_CUDA_SM86 EF_CUDA_VIRTUAL_SM(EF_CUDA_SM86)"
/*0000*/                   MOV R1, c[0x0][0x28] ;                 /* 0x00000a0000017a02 */
																  /* 0x000fc40000000f00 */
/*0010*/                   S2R R0, SR_TID.X ;                     /* 0x0000000000007919 */
																  /* 0x000e220000002100 */
/*0020*/                   MOV R5, 0x8 ;                          /* 0x0000000800057802 */
																  /* 0x000fe20000000f00 */
/*0030*/                   ULDC.64 UR4, c[0x0][0x118] ;           /* 0x0000460000047ab9 */
																  /* 0x000fe20000000a00 */
/*0040*/                   I2F.F64 R2, R0 ;                       /* 0x0000000000027312 */  #怀疑点
																  /* 0x001e260000201c00 */
/*0050*/                   IMAD.WIDE R4, R0, R5, c[0x0][0x160] ;  /* 0x0000580000047625 */
																  /* 0x000fe200078e0205 */
/*0060*/                   DADD R2, R2, R2 ;                      /* 0x0000000002027229 */
																  /* 0x001e0e0000000002 */
/*0070*/                   STG.E.64 [R4.64], R2 ;                 /* 0x0000000204007986 */
																  /* 0x001fe2000c101b04 */
/*0080*/                   EXIT ;                                 /* 0x000000000000794d */
																  /* 0x000fea0003800000 */
/*0090*/                   BRA 0x90;                              /* 0xfffffff000007947 */

		Function : _Z9Kernel_v1Pd
.headerflags    @"EF_CUDA_TEXMODE_UNIFIED EF_CUDA_64BIT_ADDRESS EF_CUDA_SM86 EF_CUDA_VIRTUAL_SM(EF_CUDA_SM86)"
/*0000*/                   MOV R1, c[0x0][0x28] ;                 /* 0x00000a0000017a02 */
																  /* 0x000fc40000000f00 */
/*0010*/                   S2R R2, SR_TID.X ;                     /* 0x0000000000027919 */
																  /* 0x000e220000002100 */
/*0020*/                   MOV R3, 0x8 ;                          /* 0x0000000800037802 */
																  /* 0x000fe20000000f00 */
/*0030*/                   ULDC.64 UR4, c[0x0][0x118] ;           /* 0x0000460000047ab9 */
																  /* 0x000fc80000000a00 */
/*0040*/                   IMAD.WIDE R2, R2, R3, c[0x0][0x160] ;  /* 0x0000580002027625 */
																  /* 0x001fca00078e0203 */
/*0050*/                   LDG.E.64 R4, [R2.64] ;                 /* 0x0000000402047981 */
																  /* 0x000ea4000c1e1b00 */
/*0060*/                   DADD R4, R4, R4 ;                      /* 0x0000000004047229 */
																  /* 0x004e0e0000000004 */
/*0070*/                   STG.E.64 [R2.64], R4 ;                 /* 0x0000000402007986 */
																  /* 0x001fe2000c101b04 */
/*0080*/                   EXIT ;                                 /* 0x000000000000794d */
																  /* 0x000fea0003800000 */
/*0090*/                   BRA 0x90;                              /* 0xfffffff000007947 */

Kernel_v1(double *) (1, 1, 1)x(32, 1, 1), Context 1, Stream 7, Device 0, CC 8.6
Section: Command line profiler metrics
--------------------------------- ----------- ------------
Metric Name                       Metric Unit Metric Value
--------------------------------- ----------- ------------
smsp__inst_executed_pipe_fp64.max        inst            1
--------------------------------- ----------- ------------

Kernel_v2(double *) (1, 1, 1)x(32, 1, 1), Context 1, Stream 7, Device 0, CC 8.6
Section: Command line profiler metrics
--------------------------------- ----------- ------------
Metric Name                       Metric Unit Metric Value
--------------------------------- ----------- ------------
smsp__inst_executed_pipe_fp64.max        inst            2  #fp64 pipe执行了两条指令(I2F.F64 和 DADD)
--------------------------------- ----------- ------------
  • I2F.F64 R2, R0删除后
bash 复制代码
Kernel_v1(double *) (1, 1, 1)x(32, 1, 1), Context 1, Stream 7, Device 0, CC 8.6
Section: Command line profiler metrics
--------------------------------- ----------- ------------
Metric Name                       Metric Unit Metric Value
--------------------------------- ----------- ------------
smsp__inst_executed_pipe_fp64.max        inst            1
--------------------------------- ----------- ------------

Kernel_v2(double *) (1, 1, 1)x(32, 1, 1), Context 1, Stream 7, Device 0, CC 8.6
Section: Command line profiler metrics
--------------------------------- ----------- ------------
Metric Name                       Metric Unit Metric Value
--------------------------------- ----------- ------------
smsp__inst_executed_pipe_fp64.max        inst            1 #只有一条指令
--------------------------------- ----------- ------------
相关推荐
人工智能培训咨询叶梓1 小时前
MobiLlama,面向资源受限设备的轻量级全透明GPT模型
人工智能·gpt·语言模型·自然语言处理·性能优化·多模态·轻量级
Flying_Fish_roe13 小时前
JVM 性能优化与调优-ZGC(Z Garbage Collector)
jvm·性能优化
Flying_Fish_roe19 小时前
JVM 性能优化与调优-GraalVM
jvm·性能优化
四代水门1 天前
游戏性能优化
游戏·性能优化
大鹅同志1 天前
在服务器上开Juypter Lab教程(远程访问)
运维·服务器·pytorch·jupyter·cuda·云服务器
安卓美女1 天前
Android自定义View性能优化
android·性能优化
Flying_Fish_roe1 天前
JVM 性能优化与调优-Shenandoah GC
jvm·性能优化
旺小仔.1 天前
【数据结构篇】~排序(1)之插入排序
c语言·数据结构·算法·链表·性能优化·排序算法
小锋学长生活大爆炸1 天前
【踩坑】装了显卡,如何让显示器从主板和显卡HDMI都输出
计算机外设·gpu·显卡·hdmi·外设
洁洁!1 天前
深入分析计算机网络性能指标
网络·计算机网络·性能优化