smsp__inst_executed_pipe_fp64为什么对不上
smsp__inst_executed_pipe_fp64 为什么对不上
一.小结
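- Kernel_v2 中的 `output[idx] = idx * 2.0;` 需要把 int 类型的 idx 隐式转换为 double，因此会生成 I2F.F64 指令（Kernel_v1 同样有 `int idx = threadIdx.x;`，但其 SASS 中没有 I2F）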
- I2F.F64也会占用fp64 pipe
二.复现过程
bash
tee fp64_test.cu<<-'EOF'
#include <cuda_runtime.h>
#include <iostream>
// Doubles one element of `output` in place per thread.
// Indexing uses threadIdx.x only and there is no bounds check, so `output`
// must hold at least blockDim.x doubles.
// All arithmetic here is on values that are already double.
__global__ void Kernel_v1(double *output) {
    const int tid = threadIdx.x;
    double *slot = output + tid;
    *slot = *slot * 2.0;
}
// Writes (thread index * 2.0) into output[threadIdx.x].
// Indexing uses threadIdx.x only and there is no bounds check, so `output`
// must hold at least blockDim.x doubles.
// The integer thread index has to be converted to double before the multiply;
// in the sm_86 SASS that conversion is an I2F.F64 instruction.
__global__ void Kernel_v2(double *output) {
    const int tid = threadIdx.x;
    const double tidAsDouble = static_cast<double>(tid);  // int -> double conversion
    output[tid] = tidAsDouble * 2.0;
}
EOF
# Build the device code in three explicit stages so each artifact can be inspected:
# 1) CUDA source -> PTX for sm_86 (with line info)
/usr/local/cuda/bin/nvcc -std=c++17 -dc -lineinfo -arch=sm_86 -ptx fp64_test.cu -o fp64_test.ptx
# 2) PTX -> cubin
/usr/local/cuda/bin/nvcc -arch=sm_86 fp64_test.ptx -cubin -o fp64_test.cubin
# 3) cubin -> fatbin (this is the file the host program loads)
/usr/local/cuda/bin/nvcc -arch=sm_86 fp64_test.cubin -fatbin -o fp64_test.fatbin
# Print the generated PTX
cat fp64_test.ptx
# Dump the SASS of both kernels from the fatbin
/usr/local/cuda/bin/cuobjdump --dump-sass fp64_test.fatbin
tee fp64_test_main.cpp<<-'EOF'
#include <stdio.h>
#include <string.h>
#include <cuda_runtime.h>
#include <cuda.h>
int main(int argc,char *argv[])
{
CUresult error;
CUdevice cuDevice;
cuInit(0);
int deviceCount = 0;
error = cuDeviceGetCount(&deviceCount);
error = cuDeviceGet(&cuDevice, 0);
if(error!=CUDA_SUCCESS)
{
printf("Error happened in get device!\n");
}
CUcontext cuContext;
error = cuCtxCreate(&cuContext, 0, cuDevice);
if(error!=CUDA_SUCCESS)
{
printf("Error happened in create context!\n");
}
int block_count=1;int block_size=32;
int thread_size=block_count*block_size;
int data_size=sizeof(double)*thread_size;
double *output_ptr=nullptr;
double *input_ptr=nullptr;
int cudaStatus=0;
cudaStatus = cudaMalloc((void**)&input_ptr, data_size);
cudaStatus = cudaMalloc((void**)&output_ptr, data_size);
void *kernelParams[]= {(void*)&output_ptr, (void*)&input_ptr};
CUmodule module;
CUfunction double_function;
CUfunction float_function;
const char* module_file = "fp64_test.fatbin";
const char* double_kernel_name = "_Z9Kernel_v1Pd";
const char* float_kernel_name = "_Z9Kernel_v2Pd";
error = cuModuleLoad(&module, module_file);
if(error!=CUDA_SUCCESS)
{
printf("Error happened in load moudle %d!\n",error);
}
error = cuModuleGetFunction(&double_function, module, double_kernel_name);
if(error!=CUDA_SUCCESS)
{
printf("get double_function error!\n");
}
error = cuModuleGetFunction(&float_function, module, float_kernel_name);
if(error!=CUDA_SUCCESS)
{
printf("get float_kernel_name error!\n");
}
cuLaunchKernel(double_function,
block_count, 1, 1,
block_size, 1, 1,
0,0,kernelParams, 0);
cuLaunchKernel(float_function,
block_count, 1, 1,
block_size, 1, 1,
0,0,kernelParams, 0);
cudaFree(output_ptr);
cudaFree(input_ptr);
cuModuleUnload(module);
cuCtxDestroy(cuContext);
return 0;
}
EOF
# Build the host launcher against the CUDA runtime and driver libraries
g++ fp64_test_main.cpp -o fp64_test_main -I /usr/local/cuda/include -L /usr/local/cuda/lib64 -lcudart -lcuda
# Baseline: collect the fp64-pipe executed-instruction counter for both kernels
/usr/local/NVIDIA-Nsight-Compute/ncu --metrics smsp__inst_executed_pipe_fp64.max ./fp64_test_main
# Convert the cubin into an editable .cuasm listing
# (cuasm.py is presumably the CuAssembler CLI -- TODO confirm)
cuasm.py fp64_test.cubin fp64_test.cuasm
# Delete every line containing "I2F" from the listing, in place
sed '/I2F/d' -i fp64_test.cuasm
# Run cuasm.py on the edited listing; presumably this reassembles it into
# fp64_test.cubin, which the next command repackages -- confirm against the tool's CLI
cuasm.py fp64_test.cuasm
# Repackage the cubin into the fatbin that the host program loads
/usr/local/cuda/bin/nvcc -arch=sm_86 fp64_test.cubin -fatbin -o fp64_test.fatbin
# Dump SASS and resource usage to check the patched binary
/usr/local/cuda/bin/cuobjdump --dump-sass fp64_test.fatbin
/usr/local/cuda/bin/cuobjdump --dump-resource-usage fp64_test.fatbin
# Re-collect the same metric with the patched fatbin for comparison
/usr/local/NVIDIA-Nsight-Compute/ncu --metrics smsp__inst_executed_pipe_fp64.max ./fp64_test_main
三.输出
- 修改前
bash
Function : _Z9Kernel_v2Pd
.headerflags @"EF_CUDA_TEXMODE_UNIFIED EF_CUDA_64BIT_ADDRESS EF_CUDA_SM86 EF_CUDA_VIRTUAL_SM(EF_CUDA_SM86)"
/*0000*/ MOV R1, c[0x0][0x28] ; /* 0x00000a0000017a02 */
/* 0x000fc40000000f00 */
/*0010*/ S2R R0, SR_TID.X ; /* 0x0000000000007919 */
/* 0x000e220000002100 */
/*0020*/ MOV R5, 0x8 ; /* 0x0000000800057802 */
/* 0x000fe20000000f00 */
/*0030*/ ULDC.64 UR4, c[0x0][0x118] ; /* 0x0000460000047ab9 */
/* 0x000fe20000000a00 */
/*0040*/ I2F.F64 R2, R0 ; /* 0x0000000000027312 */ #怀疑点
/* 0x001e260000201c00 */
/*0050*/ IMAD.WIDE R4, R0, R5, c[0x0][0x160] ; /* 0x0000580000047625 */
/* 0x000fe200078e0205 */
/*0060*/ DADD R2, R2, R2 ; /* 0x0000000002027229 */
/* 0x001e0e0000000002 */
/*0070*/ STG.E.64 [R4.64], R2 ; /* 0x0000000204007986 */
/* 0x001fe2000c101b04 */
/*0080*/ EXIT ; /* 0x000000000000794d */
/* 0x000fea0003800000 */
/*0090*/ BRA 0x90; /* 0xfffffff000007947 */
Function : _Z9Kernel_v1Pd
.headerflags @"EF_CUDA_TEXMODE_UNIFIED EF_CUDA_64BIT_ADDRESS EF_CUDA_SM86 EF_CUDA_VIRTUAL_SM(EF_CUDA_SM86)"
/*0000*/ MOV R1, c[0x0][0x28] ; /* 0x00000a0000017a02 */
/* 0x000fc40000000f00 */
/*0010*/ S2R R2, SR_TID.X ; /* 0x0000000000027919 */
/* 0x000e220000002100 */
/*0020*/ MOV R3, 0x8 ; /* 0x0000000800037802 */
/* 0x000fe20000000f00 */
/*0030*/ ULDC.64 UR4, c[0x0][0x118] ; /* 0x0000460000047ab9 */
/* 0x000fc80000000a00 */
/*0040*/ IMAD.WIDE R2, R2, R3, c[0x0][0x160] ; /* 0x0000580002027625 */
/* 0x001fca00078e0203 */
/*0050*/ LDG.E.64 R4, [R2.64] ; /* 0x0000000402047981 */
/* 0x000ea4000c1e1b00 */
/*0060*/ DADD R4, R4, R4 ; /* 0x0000000004047229 */
/* 0x004e0e0000000004 */
/*0070*/ STG.E.64 [R2.64], R4 ; /* 0x0000000402007986 */
/* 0x001fe2000c101b04 */
/*0080*/ EXIT ; /* 0x000000000000794d */
/* 0x000fea0003800000 */
/*0090*/ BRA 0x90; /* 0xfffffff000007947 */
Kernel_v1(double *) (1, 1, 1)x(32, 1, 1), Context 1, Stream 7, Device 0, CC 8.6
Section: Command line profiler metrics
--------------------------------- ----------- ------------
Metric Name Metric Unit Metric Value
--------------------------------- ----------- ------------
smsp__inst_executed_pipe_fp64.max inst 1
--------------------------------- ----------- ------------
Kernel_v2(double *) (1, 1, 1)x(32, 1, 1), Context 1, Stream 7, Device 0, CC 8.6
Section: Command line profiler metrics
--------------------------------- ----------- ------------
Metric Name Metric Unit Metric Value
--------------------------------- ----------- ------------
smsp__inst_executed_pipe_fp64.max inst 2 #fp64 pipe执行了两条指令
--------------------------------- ----------- ------------
- I2F.F64 R2, R0删除后
bash
Kernel_v1(double *) (1, 1, 1)x(32, 1, 1), Context 1, Stream 7, Device 0, CC 8.6
Section: Command line profiler metrics
--------------------------------- ----------- ------------
Metric Name Metric Unit Metric Value
--------------------------------- ----------- ------------
smsp__inst_executed_pipe_fp64.max inst 1
--------------------------------- ----------- ------------
Kernel_v2(double *) (1, 1, 1)x(32, 1, 1), Context 1, Stream 7, Device 0, CC 8.6
Section: Command line profiler metrics
--------------------------------- ----------- ------------
Metric Name Metric Unit Metric Value
--------------------------------- ----------- ------------
smsp__inst_executed_pipe_fp64.max inst 1 #只有一条指令
--------------------------------- ----------- ------------