统计一条cuda ld指令需要经过哪些硬件单元--演示CuAssembler如何修改CUDA SASS指令
- 1.准备SASS反汇编工具CuAssembler
- 2.仅包含ld.global.cv.f32的cuda kernel,如果不加st指令,编译器会将ld指令也优化掉。后面手动修改汇编指令删除掉st指令
- 3.生成fatbin
- 4.修改SASS指令,删除掉STG.E.STRONG.SYS指令,重新生成fatbin
- 5.准备测试程序,加载fatbin并运行里面的Kernel
- 6.ncu profiling
- 7.获取NCU支持的metrics列表
- 8.查询每个metrics
- 9.过滤掉值为0的metrics
背景 :想统计一条ld指令需要经过哪些硬件单元
步骤:
- cuda Kernel里只包含一条load指令,但如果没有st会被编译器优化掉(ptx还在,但sass里却没了)
- 暂时没有找到编译选项关掉该优化
- 于是采用CuAssembler将ST SASS指令删掉,重新生成fatbin
- 采用cuModuleLoad加载fatbin,用cuLaunchKernel运行该Kernel
1.准备SASS反汇编工具CuAssembler
bash
# Fetch CuAssembler and expose its scripts and Python package to this shell.
git clone https://github.com/cloudcores/CuAssembler
export PATH=${PATH}:$PWD/CuAssembler/bin:/usr/local/cuda/bin/
# Append to any existing PYTHONPATH; when it is unset, avoid a leading ':' (an
# empty entry would put the current directory on the module search path).
export PYTHONPATH=${PYTHONPATH:+${PYTHONPATH}:}$PWD/CuAssembler/
pip install pyelftools
2.仅包含ld.global.cv.f32的cuda kernel,如果不加st指令,编译器会将ld指令也优化掉。后面手动修改汇编指令删除掉st指令
c
tee ptx_ld_inst.cu<<-'EOF'
#include <iostream>
#include <cuda_runtime.h>
// Minimal kernel used to isolate a single global load: each thread reads one
// float from input[tid] with ld.global.cv.f32 and writes it to out[tid] with
// st.global.wt.f32.
// There is no bounds check, so the launch must not cover more threads than the
// two buffers have elements.
__global__ void ptx_ld_inst_kernel(float *input, float *out) {
float d;
// Flat 1-D global thread index.
int tid = threadIdx.x + blockIdx.x * blockDim.x;
// NOTE(review): this asm statement is not declared volatile; marking it
// `asm volatile` may be enough to keep the load without the store below --
// confirm against the NVIDIA inline PTX documentation.
asm("ld.global.cv.f32 %0, [%1];" : "=f"(d) : "l"(&input[tid]));
// The store consumes d; the generated STG is removed from the SASS afterwards.
asm("st.global.wt.f32 [%0],%1;" :: "l"(&out[tid]),"f"(d));
}
EOF
3.生成fatbin
bash
# Generate the PTX
/usr/local/cuda/bin/nvcc -std=c++17 -dc -lineinfo -arch=sm_86 -ptx ptx_ld_inst.cu -o ptx_ld_inst.ptx
# Generate the cubin from the PTX
/usr/local/cuda/bin/nvcc -arch=sm_86 ptx_ld_inst.ptx -cubin -o ptx_ld_inst.cubin
# Generate the fatbin from the cubin
/usr/local/cuda/bin/nvcc -arch=sm_86 ptx_ld_inst.cubin -fatbin -o ptx_ld_inst.fatbin
# Show the PTX
cat ptx_ld_inst.ptx
# Dump the SASS instructions
/usr/local/cuda/bin/cuobjdump --dump-sass ptx_ld_inst.fatbin
# 输出:
/*0070*/ LDG.E.STRONG.SYS R3, [R2.64] ; /* 0x0000000402037981 */
/* 0x000ea2000c1f5900 */
/*0080*/ IMAD.WIDE R4, R4, R5, c[0x0][0x168] ; /* 0x00005a0004047625 */
/* 0x000fca00078e0205 */
/*0090*/ STG.E.STRONG.SYS [R4.64], R3 ; /* 0x0000000304007986 */
/* 0x004fe2000c115904 */
/*00a0*/ EXIT ; /* 0x000000000000794d */
4.修改SASS指令,删除掉STG.E.STRONG.SYS指令,重新生成fatbin
bash
cuasm.py ptx_ld_inst.cubin ptx_ld_inst.cuasm
cat ptx_ld_inst.cuasm | grep "STG.E.STRONG.SYS" -B 2
# Output
[B------:R-:W2:-:S01] /*0070*/ LDG.E.STRONG.SYS R3, desc[UR4][R2.64] ;
[B------:R-:W-:Y:S05] /*0080*/ IMAD.WIDE R4, R4, R5, c[0x0][0x168] ;
[B--2---:R-:W-:-:S01] /*0090*/ STG.E.STRONG.SYS desc[UR4][R4.64], R3 ;
# Delete these two lines: the store and the IMAD.WIDE that computes its address in R4
sed '/STG.E.STRONG.SYS/d' -i ptx_ld_inst.cuasm
sed '/IMAD.WIDE R4/d' -i ptx_ld_inst.cuasm
# Regenerate the cubin from the edited .cuasm
cuasm.py ptx_ld_inst.cuasm
# Generate the fatbin
/usr/local/cuda/bin/nvcc -arch=sm_86 ptx_ld_inst.cubin -fatbin -o ptx_ld_inst.fatbin
# Dump the SASS instructions
/usr/local/cuda/bin/cuobjdump --dump-sass ptx_ld_inst.fatbin
输出:
/*0050*/ IMAD R4, R3, c[0x0][0x0], R4 ; /* 0x0000000003047a24 */
/* 0x001fc800078e0204 */
/*0060*/ IMAD.WIDE R2, R4, R5, c[0x0][0x160] ; /* 0x0000580004027625 */
/* 0x000fcc00078e0205 */
/*0070*/ LDG.E.STRONG.SYS R3, desc[UR4][R2.64] ; /* 0x0000000402037981 */
/* 0x000ea2200c1f5900 */
/*0080*/ EXIT ; /* 0x000000000000794d */
5.准备测试程序,加载fatbin并运行里面的Kernel
bash
tee ptx_ld_inst_main.cpp<<-'EOF'
#include <stdio.h>
#include <string.h>
#include <cuda_runtime.h>
#include <cuda.h>
int main(int argc,char *argv[])
{
CUresult error;
CUdevice cuDevice;
cuInit(0);
int deviceCount = 0;
error = cuDeviceGetCount(&deviceCount);
error = cuDeviceGet(&cuDevice, 0);
if(error!=CUDA_SUCCESS)
{
printf("Error happened in get device!\n");
}
CUcontext cuContext;
error = cuCtxCreate(&cuContext, 0, cuDevice);
if(error!=CUDA_SUCCESS)
{
printf("Error happened in create context!\n");
}
CUmodule module;
CUfunction function;
const char* module_file = "ptx_ld_inst.fatbin";
const char* kernel_name = "_Z18ptx_ld_inst_kernelPfS_";
error = cuModuleLoad(&module, module_file);
if(error!=CUDA_SUCCESS)
{
printf("Error happened in load moudle %d!\n",error);
}
error = cuModuleGetFunction(&function, module, kernel_name);
if(error!=CUDA_SUCCESS)
{
printf("get function error!\n");
}
int data_size=sizeof(float)*8192;
float *output_ptr=nullptr;
float *input_ptr=nullptr;
int cudaStatus=0;
cudaStatus = cudaMalloc((void**)&input_ptr, data_size);
cudaStatus = cudaMalloc((void**)&output_ptr, data_size);
void *kernelParams[]= {(void*)&output_ptr, (void*)&input_ptr};
cuLaunchKernel(function,
1, 1, 1,
32, 1, 1,
0,0,kernelParams, 0);
cudaFree(output_ptr);
cudaFree(input_ptr);
cuModuleUnload(module);
cuCtxDestroy(cuContext);
return 0;
}
EOF
g++ ptx_ld_inst_main.cpp -o ptx_ld_inst_main -I /usr/local/cuda/include -L /usr/local/cuda/lib64 -lcudart -lcuda
6.ncu profiling
bash
# First run: collect the "full" section set plus the hierarchical tensor
# roofline section and export the report to ncu_report_ptx_ld_inst
# (-f overwrites an existing report file).
/usr/local/NVIDIA-Nsight-Compute/ncu --set full --section SpeedOfLight_HierarchicalTensorRooflineChart \
--target-processes all --clock-control=none \
--print-details all --export ncu_report_ptx_ld_inst -f ./ptx_ld_inst_main
# Second run: query only the explicitly listed global-load (op_ld) metrics.
/usr/local/NVIDIA-Nsight-Compute/ncu --metrics \
l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.max_rate,\
l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct,\
l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio,\
l1tex__data_bank_conflicts_pipe_lsu_mem_global_op_ld.max,\
l1tex__m_xbar2l1tex_read_bytes_mem_lg_op_ld.max,\
l1tex__m_xbar2l1tex_read_sectors_mem_lg_op_ld.max,\
l1tex__t_bytes_pipe_lsu_mem_global_op_ld.max,\
l1tex__t_bytes_pipe_lsu_mem_global_op_ld_lookup_miss.max,\
l1tex__t_output_wavefronts_pipe_lsu_mem_global_op_ld.max,\
l1tex__t_requests_pipe_lsu_mem_global_op_ld.max,\
l1tex__t_sectors_pipe_lsu_mem_global_op_ld.max,\
l1tex__t_sectors_pipe_lsu_mem_global_op_ld_lookup_miss.max,\
l1tex__t_set_accesses_pipe_lsu_mem_global_op_ld.max,\
l1tex__t_set_conflicts_pipe_lsu_mem_global_op_ld.max,\
sm__sass_data_bytes_mem_global_op_ld.max,\
sm__sass_inst_executed_op_global_ld.max,\
sm__sass_inst_executed_op_ld.max,\
sm__sass_l1tex_t_sectors_pipe_lsu_mem_global_op_ld.max,\
smsp__inst_executed_op_global_ld.max,\
smsp__inst_executed_op_global_ld_pred_on_any.max,\
smsp__sass_average_data_bytes_per_sector_mem_global_op_ld.max_rate,\
smsp__sass_average_data_bytes_per_sector_mem_global_op_ld.pct,\
smsp__sass_average_data_bytes_per_sector_mem_global_op_ld.ratio,\
smsp__sass_data_bytes_mem_global_op_ld.max,\
smsp__sass_inst_executed_op_global_ld.max,\
smsp__sass_inst_executed_op_ld.max,\
smsp__sass_l1tex_t_sectors_pipe_lsu_mem_global_op_ld.max ./ptx_ld_inst_main
输出
bash
----------------------------------------------------------------------- ----------- ------------
Metric Name Metric Unit Metric Value
----------------------------------------------------------------------- ----------- ------------
l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.max_rate sector/1 32
l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50
l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/1 4
l1tex__data_bank_conflicts_pipe_lsu_mem_global_op_ld.max 0
l1tex__m_xbar2l1tex_read_bytes_mem_lg_op_ld.max byte 128
l1tex__m_xbar2l1tex_read_sectors_mem_lg_op_ld.max sector 4
l1tex__t_bytes_pipe_lsu_mem_global_op_ld.max byte 128
l1tex__t_bytes_pipe_lsu_mem_global_op_ld_lookup_miss.max byte 128
l1tex__t_output_wavefronts_pipe_lsu_mem_global_op_ld.max 1
l1tex__t_requests_pipe_lsu_mem_global_op_ld.max 1
l1tex__t_sectors_pipe_lsu_mem_global_op_ld.max sector 4
l1tex__t_sectors_pipe_lsu_mem_global_op_ld_lookup_miss.max sector 4
l1tex__t_set_accesses_pipe_lsu_mem_global_op_ld.max 1
l1tex__t_set_conflicts_pipe_lsu_mem_global_op_ld.max cycle 0
sm__sass_data_bytes_mem_global_op_ld.max byte 128
sm__sass_inst_executed_op_global_ld.max inst 1
sm__sass_inst_executed_op_ld.max inst 1
sm__sass_l1tex_t_sectors_pipe_lsu_mem_global_op_ld.max sector 4
smsp__inst_executed_op_global_ld.max inst 1
smsp__inst_executed_op_global_ld_pred_on_any.max inst 1
smsp__sass_average_data_bytes_per_sector_mem_global_op_ld.max_rate byte/sector 32
smsp__sass_average_data_bytes_per_sector_mem_global_op_ld.pct % 100
smsp__sass_average_data_bytes_per_sector_mem_global_op_ld.ratio byte/sector 32
smsp__sass_data_bytes_mem_global_op_ld.max byte 128
smsp__sass_inst_executed_op_global_ld.max inst 1
smsp__sass_inst_executed_op_ld.max inst 1
smsp__sass_l1tex_t_sectors_pipe_lsu_mem_global_op_ld.max sector 4
----------------------------------------------------------------------- ----------- ------------
7.获取NCU支持的metrics列表
bash
# Write the first CSV column of the metric query (header row skipped, double
# quotes stripped) to metrics.txt, one entry per line.
/usr/local/NVIDIA-Nsight-Compute/ncu --query-metrics --csv \
    | tail -n +2 \
    | cut -d, -f1 \
    | tr -d '"' > metrics.txt
8.查询每个metrics
bash
tee get_metrics.sh<<-'EOF'
# Profile ./ptx_ld_inst_main once per metric name listed in metrics.txt and
# append every reported line whose value is not "n/a" to ptx_ld_inst_metrics.txt.
rm -f ptx_ld_inst_metrics.txt
# The list is read on fd 3 so the profiled program cannot consume it via stdin.
while IFS= read -r metric <&3
do
    # Skip blank lines in the metric list.
    [ -n "$metric" ] || continue
    # -F: match the metric name literally (its dots are not regex wildcards).
    /usr/local/NVIDIA-Nsight-Compute/ncu --metrics "$metric" \
        ./ptx_ld_inst_main 2>&1 | grep -F -- "$metric" | grep -v "n/a" | tee -a "ptx_ld_inst_metrics.txt"
done 3< metrics.txt
EOF
bash get_metrics.sh