smsp__inst_executed_pipe_fp64为什么对不上

smsp__inst_executed_pipe_fp64为什么对不上

smsp__inst_executed_pipe_fp64 为什么对不上

一.小结

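  • Kernel_v2 中的 idx * 2.0 需要先把 int 类型的 idx(来自 int idx = threadIdx.x;)转换成 double,编译器因此生成 I2F.F64 指令(Kernel_v1 同样有 int idx = threadIdx.x;,但没有这次转换,所以没有 I2F.F64)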
  • I2F.F64也会占用fp64 pipe

二.复现过程

bash 复制代码
tee fp64_test.cu<<-'EOF'
#include <cuda_runtime.h>
#include <iostream>

// Baseline kernel: each thread doubles one element of `output` in place.
// The element index is threadIdx.x only (blockIdx is not used) and there is no
// bounds check, so `output` must hold at least blockDim.x doubles.
// No int -> double conversion occurs here: idx is used purely as an array index.
__global__ void Kernel_v1(double *output) {
    int idx = threadIdx.x;
    // Load, multiply by the double literal 2.0, store back.
    output[idx] = output[idx] * 2.0;
}

// Comparison kernel: each thread stores (double)threadIdx.x * 2.0 into `output`.
// The element index is threadIdx.x only (blockIdx is not used) and there is no
// bounds check, so `output` must hold at least blockDim.x doubles.
__global__ void Kernel_v2(double *output) {
    int idx = threadIdx.x;
    // `idx` is an int and 2.0 is a double literal, so idx is converted to double
    // before the multiply; this conversion is the extra FP64-related operation
    // that distinguishes this kernel from Kernel_v1.
    output[idx] = idx * 2.0;
}
EOF

# Compile the kernels to PTX for sm_86, keeping source line information.
/usr/local/cuda/bin/nvcc -std=c++17 -dc -lineinfo -arch=sm_86 -ptx fp64_test.cu -o fp64_test.ptx
# Assemble the PTX into a cubin for sm_86.
/usr/local/cuda/bin/nvcc -arch=sm_86 fp64_test.ptx -cubin -o fp64_test.cubin
# Package the cubin into the fatbin that the host program loads with cuModuleLoad.
/usr/local/cuda/bin/nvcc -arch=sm_86 fp64_test.cubin -fatbin -o fp64_test.fatbin
# Print the generated PTX for inspection.
cat fp64_test.ptx
# Dump the SASS of both kernels to see which machine instructions were emitted.
/usr/local/cuda/bin/cuobjdump --dump-sass fp64_test.fatbin

tee fp64_test_main.cpp<<-'EOF'
#include <stdio.h>
#include <string.h>
#include <cuda_runtime.h>
#include <cuda.h>

int main(int argc,char *argv[])
{
    CUresult error;
    CUdevice cuDevice;
    cuInit(0);
    int deviceCount = 0;
    error = cuDeviceGetCount(&deviceCount);
    error = cuDeviceGet(&cuDevice, 0);
    if(error!=CUDA_SUCCESS)
        {
        printf("Error happened in get device!\n");
    }
    CUcontext cuContext;
    error = cuCtxCreate(&cuContext, 0, cuDevice);
    if(error!=CUDA_SUCCESS)
        {
        printf("Error happened in create context!\n");
    }
    int block_count=1;int block_size=32;
    int thread_size=block_count*block_size;

    int data_size=sizeof(double)*thread_size;

    double *output_ptr=nullptr;
    double *input_ptr=nullptr;
    int cudaStatus=0;
    cudaStatus = cudaMalloc((void**)&input_ptr, data_size);
    cudaStatus = cudaMalloc((void**)&output_ptr, data_size);
    void *kernelParams[]= {(void*)&output_ptr, (void*)&input_ptr};

    CUmodule module;
    CUfunction double_function;
    CUfunction float_function;
    const char* module_file = "fp64_test.fatbin";
    const char* double_kernel_name = "_Z9Kernel_v1Pd";
    const char* float_kernel_name = "_Z9Kernel_v2Pd";
    
    error = cuModuleLoad(&module, module_file);
    if(error!=CUDA_SUCCESS)
        {
        printf("Error happened in load moudle %d!\n",error);
    }
    error = cuModuleGetFunction(&double_function, module, double_kernel_name);
    if(error!=CUDA_SUCCESS)
    {
        printf("get double_function error!\n");
    }
    error = cuModuleGetFunction(&float_function, module, float_kernel_name);
    if(error!=CUDA_SUCCESS)
    {
        printf("get float_kernel_name error!\n");
    }    
    cuLaunchKernel(double_function,
                    block_count, 1, 1,
                    block_size, 1, 1,
                    0,0,kernelParams, 0);
    cuLaunchKernel(float_function,
                    block_count, 1, 1,
                    block_size, 1, 1,
                    0,0,kernelParams, 0);
    cudaFree(output_ptr);
    cudaFree(input_ptr);
    cuModuleUnload(module);
    cuCtxDestroy(cuContext);
    return 0;
}
EOF
# Build the host program against the CUDA runtime (cudart) and driver (cuda) libraries.
g++ fp64_test_main.cpp -o fp64_test_main -I /usr/local/cuda/include -L /usr/local/cuda/lib64 -lcudart -lcuda
# Baseline measurement: collect the fp64 pipe instruction count for both kernels.
/usr/local/NVIDIA-Nsight-Compute/ncu --metrics smsp__inst_executed_pipe_fp64.max ./fp64_test_main

# Convert the cubin into an editable .cuasm text file
# (cuasm.py is presumably the CuAssembler command-line tool; confirm it is on PATH).
cuasm.py fp64_test.cubin fp64_test.cuasm
# Delete every line containing "I2F" from the assembly text, in place.
sed '/I2F/d' -i fp64_test.cuasm
# Reassemble the edited .cuasm; presumably this overwrites fp64_test.cubin
# (TODO confirm the default output file name of cuasm.py).
cuasm.py fp64_test.cuasm
# Repackage the cubin into the fatbin loaded by the host program.
/usr/local/cuda/bin/nvcc -arch=sm_86 fp64_test.cubin -fatbin -o fp64_test.fatbin
# Verify the patched SASS and print the resource usage of the patched kernels.
/usr/local/cuda/bin/cuobjdump --dump-sass fp64_test.fatbin
/usr/local/cuda/bin/cuobjdump --dump-resource-usage fp64_test.fatbin

# Second measurement: same metric, now with the I2F instruction removed.
/usr/local/NVIDIA-Nsight-Compute/ncu --metrics smsp__inst_executed_pipe_fp64.max ./fp64_test_main

三.输出

  • 修改前
bash 复制代码
		Function : _Z9Kernel_v2Pd
.headerflags    @"EF_CUDA_TEXMODE_UNIFIED EF_CUDA_64BIT_ADDRESS EF_CUDA_SM86 EF_CUDA_VIRTUAL_SM(EF_CUDA_SM86)"
/*0000*/                   MOV R1, c[0x0][0x28] ;                 /* 0x00000a0000017a02 */
																  /* 0x000fc40000000f00 */
/*0010*/                   S2R R0, SR_TID.X ;                     /* 0x0000000000007919 */
																  /* 0x000e220000002100 */
/*0020*/                   MOV R5, 0x8 ;                          /* 0x0000000800057802 */
																  /* 0x000fe20000000f00 */
/*0030*/                   ULDC.64 UR4, c[0x0][0x118] ;           /* 0x0000460000047ab9 */
																  /* 0x000fe20000000a00 */
/*0040*/                   I2F.F64 R2, R0 ;                       /* 0x0000000000027312 */  #怀疑点
																  /* 0x001e260000201c00 */
/*0050*/                   IMAD.WIDE R4, R0, R5, c[0x0][0x160] ;  /* 0x0000580000047625 */
																  /* 0x000fe200078e0205 */
/*0060*/                   DADD R2, R2, R2 ;                      /* 0x0000000002027229 */
																  /* 0x001e0e0000000002 */
/*0070*/                   STG.E.64 [R4.64], R2 ;                 /* 0x0000000204007986 */
																  /* 0x001fe2000c101b04 */
/*0080*/                   EXIT ;                                 /* 0x000000000000794d */
																  /* 0x000fea0003800000 */
/*0090*/                   BRA 0x90;                              /* 0xfffffff000007947 */

		Function : _Z9Kernel_v1Pd
.headerflags    @"EF_CUDA_TEXMODE_UNIFIED EF_CUDA_64BIT_ADDRESS EF_CUDA_SM86 EF_CUDA_VIRTUAL_SM(EF_CUDA_SM86)"
/*0000*/                   MOV R1, c[0x0][0x28] ;                 /* 0x00000a0000017a02 */
																  /* 0x000fc40000000f00 */
/*0010*/                   S2R R2, SR_TID.X ;                     /* 0x0000000000027919 */
																  /* 0x000e220000002100 */
/*0020*/                   MOV R3, 0x8 ;                          /* 0x0000000800037802 */
																  /* 0x000fe20000000f00 */
/*0030*/                   ULDC.64 UR4, c[0x0][0x118] ;           /* 0x0000460000047ab9 */
																  /* 0x000fc80000000a00 */
/*0040*/                   IMAD.WIDE R2, R2, R3, c[0x0][0x160] ;  /* 0x0000580002027625 */
																  /* 0x001fca00078e0203 */
/*0050*/                   LDG.E.64 R4, [R2.64] ;                 /* 0x0000000402047981 */
																  /* 0x000ea4000c1e1b00 */
/*0060*/                   DADD R4, R4, R4 ;                      /* 0x0000000004047229 */
																  /* 0x004e0e0000000004 */
/*0070*/                   STG.E.64 [R2.64], R4 ;                 /* 0x0000000402007986 */
																  /* 0x001fe2000c101b04 */
/*0080*/                   EXIT ;                                 /* 0x000000000000794d */
																  /* 0x000fea0003800000 */
/*0090*/                   BRA 0x90;                              /* 0xfffffff000007947 */

Kernel_v1(double *) (1, 1, 1)x(32, 1, 1), Context 1, Stream 7, Device 0, CC 8.6
Section: Command line profiler metrics
--------------------------------- ----------- ------------
Metric Name                       Metric Unit Metric Value
--------------------------------- ----------- ------------
smsp__inst_executed_pipe_fp64.max        inst            1
--------------------------------- ----------- ------------

Kernel_v2(double *) (1, 1, 1)x(32, 1, 1), Context 1, Stream 7, Device 0, CC 8.6
Section: Command line profiler metrics
--------------------------------- ----------- ------------
Metric Name                       Metric Unit Metric Value
--------------------------------- ----------- ------------
smsp__inst_executed_pipe_fp64.max        inst            2  #fp64 pipe执行了两条指令(I2F.F64 和 DADD)
--------------------------------- ----------- ------------
  • I2F.F64 R2, R0删除后
bash 复制代码
Kernel_v1(double *) (1, 1, 1)x(32, 1, 1), Context 1, Stream 7, Device 0, CC 8.6
Section: Command line profiler metrics
--------------------------------- ----------- ------------
Metric Name                       Metric Unit Metric Value
--------------------------------- ----------- ------------
smsp__inst_executed_pipe_fp64.max        inst            1
--------------------------------- ----------- ------------

Kernel_v2(double *) (1, 1, 1)x(32, 1, 1), Context 1, Stream 7, Device 0, CC 8.6
Section: Command line profiler metrics
--------------------------------- ----------- ------------
Metric Name                       Metric Unit Metric Value
--------------------------------- ----------- ------------
smsp__inst_executed_pipe_fp64.max        inst            1 #只有一条指令
--------------------------------- ----------- ------------
相关推荐
前端青山7 小时前
Node.js-增强 API 安全性和性能优化
开发语言·前端·javascript·性能优化·前端框架·node.js
青云交12 小时前
大数据新视界 -- 大数据大厂之 Impala 性能优化:应对海量复杂数据的挑战(上)(7/30)
大数据·性能优化·impala·数据分区·查询优化·海量复杂数据·经典案例
chusheng184015 小时前
Python 爬取大量数据如何并发抓取与性能优化
开发语言·python·性能优化
探索云原生1 天前
GPU 环境搭建指南:如何在裸机、Docker、K8s 等环境中使用 GPU
ai·云原生·kubernetes·go·gpu
XMYX-01 天前
MySQL 性能优化策略:提升响应速度与系统稳定性
mysql·性能优化
一个处女座的程序猿1 天前
AI之硬件对比:据传英伟达Nvidia2025年将推出RTX 5090-32GB/RTX 5080-24GB、华为2025年推出910C/910D
人工智能·gpu
PangPiLoLo2 天前
高可用架构-业务高可用
java·性能优化·架构
尸僵打怪兽2 天前
软考(中级-软件设计师)数据库篇(1101)
数据库·oracle·性能优化·软考
程序猿进阶2 天前
系统上云-流量分析和链路分析
java·后端·阿里云·面试·性能优化·系统架构·云计算
飞腾开发者3 天前
飞腾平台Arm ComputeLibrary编译安装指南
linux·服务器·arm开发·后端·性能优化