Testing whether cuda-gdb debugging one thread affects other threads

1. Test Method

  • The kernel contains 4 fma instructions, reading the current timestamp between consecutive instructions and writing all timestamps out at the end of the kernel (the core idiom is sketched after this list).
    asm volatile is used so the compiler cannot reorder or drop the instructions on its own.
    The timestamps are stored to DRAM only at the very end, so that the timing and st instructions do not perturb the measurement.
  • 1 block, threadBlockSize = 32*4*4 = 512 (4 warps on each SMSP).
  • Without cuda-gdb, the interval between any two consecutive fma instructions is small and constant.
  • Under cuda-gdb, set a conditional breakpoint on the 3rd fma: threadIdx.x == 16; once it stops, wait a while, then continue.
  • In theory, only the 3rd fma interval of the first warp (the one containing thread 16) should grow; the other warps should be unaffected.
  • In practice, every warp shows one hugely inflated fma interval.
  • If the breakpoint is replaced with if(threadIdx.x==16) { __trap(); }, the fma intervals of the other threads are unaffected.
  • Conclusion from the observations: even when cuda-gdb debugs only a single thread, it affects the behavior of the other threads.
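The core of the measurement, distilled from the full program in section 2, is reading the SM cycle counter %clock via inline PTX immediately around each fma. A minimal sketch (the helper name read_clock is illustrative, not part of the original program):

```c
__device__ __forceinline__ unsigned int read_clock()
{
    unsigned int ts;
    // %clock is the per-SM 32-bit cycle counter. "volatile" plus the
    // "memory" clobber keep the compiler from reordering or removing the read.
    asm volatile ("mov.u32 %0, %%clock;" : "=r"(ts) :: "memory");
    return ts;
}

// usage pattern: ts0 = read_clock(); /* fma */ ts1 = read_clock();
// the interval ts1 - ts0 is written to DRAM only after the last read.
```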

2. Steps

```c
tee cuda_gdb_inspect.cu<<-'EOF'
#include <cstdio>
#include <iostream>
#include <cuda_runtime.h>
#include <cuda.h>

__global__ void kernel(float *output_data,
                       unsigned long long*output_ts_0,
                       unsigned long long*output_ts_1,
                       unsigned long long*output_ts_2,
                       unsigned long long*output_ts_3,
                       unsigned long long*output_ts_4) 
{
    int tid  = threadIdx.x + blockIdx.x * blockDim.x;
    unsigned int ts0;
    unsigned int ts1;
    unsigned int ts2;
    unsigned int ts3;
    unsigned int ts4;
    // fma operands, initialized so no asm input reads an undefined register
    float d0 = 0.f, d1 = 0.f, d2 = 0.f, d3 = 0.f;
    float a = (float)clock();
    float b = (float)clock();
    asm volatile ("mov.u32 %0, %%clock;" : "=r"(ts0) :: "memory");
    __asm__  __volatile__("fma.rn.f32 %0,%1,%2,%3;" : "=f"(d0) : "f"(a),"f"(b),"f"(d0));
    asm volatile ("mov.u32 %0, %%clock;" : "=r"(ts1) :: "memory");
    __asm__  __volatile__("fma.rn.f32 %0,%1,%2,%3;" : "=f"(d1) : "f"(a),"f"(b),"f"(d1));
    asm volatile ("mov.u32 %0, %%clock;" : "=r"(ts2) :: "memory");
    /* With this instead, the other threads are NOT affected:
    if(threadIdx.x==16)
    {
        __trap();
    }*/
    __asm__  __volatile__("fma.rn.f32 %0,%1,%2,%3;" : "=f"(d2) : "f"(a),"f"(b),"f"(d2));
    asm volatile ("mov.u32 %0, %%clock;" : "=r"(ts3) :: "memory");
    __asm__  __volatile__("fma.rn.f32 %0,%1,%2,%3;" : "=f"(d3) : "f"(a),"f"(b),"f"(d3));
    asm volatile ("mov.u32 %0, %%clock;" : "=r"(ts4) :: "memory");
    __asm__  __volatile__("st.global.v4.f32 [%0],{%1,%2,%3,%4};" :: "l"(output_data),"f"(d0),"f"(d1),"f"(d2),"f"(d3): "memory");
    output_ts_0[tid]=ts0;
    output_ts_1[tid]=ts1;
    output_ts_2[tid]=ts2;
    output_ts_3[tid]=ts3;
    output_ts_4[tid]=ts4;
}

#define CHECK_CUDA(call)                                           \
    do {                                                           \
        cudaError_t err = call;                                    \
        if (err != cudaSuccess) {                                  \
            std::cerr << "CUDA error at " << __FILE__ << ":" << __LINE__;  \
            std::cerr << " code=" << err << " (" << cudaGetErrorString(cudaGetLastError()) << ")" << std::endl; \
        }                                                          \
    } while (0)

int main(int argc,char *argv[])
{
    int deviceid=0;
    CHECK_CUDA(cudaSetDevice(deviceid));
    
    int block_size=1;
    int thread_block_size=32*4*4;  // 512 threads = 16 warps, i.e. 4 warps per SMSP
    int thread_size=thread_block_size*block_size;

    int data_size=sizeof(float)*thread_size;
    int ts_size=sizeof(unsigned long long)*thread_size;
    
    float *dev_output_data=nullptr;
    unsigned long long* dev_output_ts_0=nullptr;
    unsigned long long* dev_output_ts_1=nullptr;
    unsigned long long* dev_output_ts_2=nullptr;
    unsigned long long* dev_output_ts_3=nullptr;
    unsigned long long* dev_output_ts_4=nullptr;
    
    float *host_output_data=new float[thread_size];
    unsigned long long*host_output_ts_0=new unsigned long long[thread_size];
    unsigned long long*host_output_ts_1=new unsigned long long[thread_size];
    unsigned long long*host_output_ts_2=new unsigned long long[thread_size];
    unsigned long long*host_output_ts_3=new unsigned long long[thread_size];
    unsigned long long*host_output_ts_4=new unsigned long long[thread_size];
    
    CHECK_CUDA(cudaMalloc((void**)&dev_output_data, data_size));
    CHECK_CUDA(cudaMalloc((void**)&dev_output_ts_0, ts_size));
    CHECK_CUDA(cudaMalloc((void**)&dev_output_ts_1, ts_size));
    CHECK_CUDA(cudaMalloc((void**)&dev_output_ts_2, ts_size));
    CHECK_CUDA(cudaMalloc((void**)&dev_output_ts_3, ts_size));
    CHECK_CUDA(cudaMalloc((void**)&dev_output_ts_4, ts_size));    
    CHECK_CUDA(cudaMalloc((void**)&dev_smid, smid_size));
    
    CHECK_CUDA(cudaMemcpy(dev_output_data,host_output_data,data_size,cudaMemcpyHostToDevice));
    CHECK_CUDA(cudaMemcpy(dev_output_ts_0,host_output_ts_0,ts_size,cudaMemcpyHostToDevice));
    CHECK_CUDA(cudaMemcpy(dev_output_ts_1,host_output_ts_1,ts_size,cudaMemcpyHostToDevice));
    CHECK_CUDA(cudaMemcpy(dev_output_ts_2,host_output_ts_2,ts_size,cudaMemcpyHostToDevice));
    CHECK_CUDA(cudaMemcpy(dev_output_ts_3,host_output_ts_3,ts_size,cudaMemcpyHostToDevice));
    CHECK_CUDA(cudaMemcpy(dev_output_ts_4,host_output_ts_4,ts_size,cudaMemcpyHostToDevice));
    CHECK_CUDA(cudaMemcpy(dev_smid,host_smid,smid_size,cudaMemcpyHostToDevice));
    
    kernel<<<block_size, thread_block_size>>>(dev_output_data,
                                    dev_output_ts_0,
                                    dev_output_ts_1,
                                    dev_output_ts_2,
                                    dev_output_ts_3,
                                    dev_output_ts_4);
                                    
    CHECK_CUDA(cudaDeviceSynchronize());
       
    CHECK_CUDA(cudaMemcpy(host_output_ts_0,dev_output_ts_0,ts_size,cudaMemcpyDeviceToHost));
    CHECK_CUDA(cudaMemcpy(host_output_ts_1,dev_output_ts_1,ts_size,cudaMemcpyDeviceToHost));
    CHECK_CUDA(cudaMemcpy(host_output_ts_2,dev_output_ts_2,ts_size,cudaMemcpyDeviceToHost));
    CHECK_CUDA(cudaMemcpy(host_output_ts_3,dev_output_ts_3,ts_size,cudaMemcpyDeviceToHost));
    CHECK_CUDA(cudaMemcpy(host_output_ts_4,dev_output_ts_4,ts_size,cudaMemcpyDeviceToHost));
    
    for(int i=0;i<thread_size;i++)
    {
        printf("tid:%04d ts:%lld,%lld,%lld,%lld \n",i,host_output_ts_1[i]-host_output_ts_0[i],
                                                      host_output_ts_2[i]-host_output_ts_1[i],
                                                      host_output_ts_3[i]-host_output_ts_2[i],
                                                      host_output_ts_4[i]-host_output_ts_3[i]);
    }
    CHECK_CUDA(cudaFree(dev_output_data));
    CHECK_CUDA(cudaFree(dev_output_ts_0));
    CHECK_CUDA(cudaFree(dev_output_ts_1));
    CHECK_CUDA(cudaFree(dev_output_ts_2));
    CHECK_CUDA(cudaFree(dev_output_ts_3));
    CHECK_CUDA(cudaFree(dev_output_ts_4));
    return 0;
}
EOF

# -lineinfo maps SASS back to source lines for breakpoints while keeping optimizations
/usr/local/cuda/bin/nvcc -std=c++17 -arch=sm_86 -g -lineinfo -o cuda_gdb_inspect cuda_gdb_inspect.cu \
        -I /usr/local/cuda/include -L /usr/local/cuda/lib64 -lcuda
/usr/local/cuda/bin/cuda-gdb ./cuda_gdb_inspect
break cuda_gdb_inspect.cu:33 if threadIdx.x == 16
```
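Line 33 of cuda_gdb_inspect.cu is the 3rd fma. After setting the breakpoint, the session follows the method above: run, let the breakpoint hit, pause, then resume. A sketch of the interaction (prompts and timing are illustrative):

```
(cuda-gdb) run
[Breakpoint hits; only the thread with threadIdx.x == 16 satisfies the condition.
 Wait a few seconds here before resuming.]
(cuda-gdb) continue
```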

Output

```bash
tid:0000 ts:299,38,62694194,48
tid:0001 ts:299,38,62694194,48
tid:0002 ts:299,38,62694194,48
tid:0003 ts:299,38,62694194,48
tid:0004 ts:299,38,62694194,48
tid:0005 ts:299,38,62694194,48
tid:0006 ts:299,38,62694194,48
tid:0007 ts:299,38,62694194,48
tid:0008 ts:299,38,62694194,48
tid:0009 ts:299,38,62694194,48
tid:0010 ts:299,38,62694194,48
tid:0011 ts:299,38,62694194,48
tid:0012 ts:299,38,62694194,48
tid:0013 ts:299,38,62694194,48
tid:0014 ts:299,38,62694194,48
tid:0015 ts:299,38,62694194,48
tid:0016 ts:299,38,62694194,48
tid:0017 ts:299,38,62694194,48
tid:0018 ts:299,38,62694194,48
tid:0019 ts:299,38,62694194,48
tid:0020 ts:299,38,62694194,48
tid:0021 ts:299,38,62694194,48
tid:0022 ts:299,38,62694194,48
tid:0023 ts:299,38,62694194,48
tid:0024 ts:299,38,62694194,48
tid:0025 ts:299,38,62694194,48
tid:0026 ts:299,38,62694194,48
tid:0027 ts:299,38,62694194,48
tid:0028 ts:299,38,62694194,48
tid:0029 ts:299,38,62694194,48
tid:0030 ts:299,38,62694194,48
tid:0031 ts:299,38,62694194,48
tid:0032 ts:299,62694241,6178513,38
tid:0033 ts:299,62694241,6178513,38
tid:0034 ts:299,62694241,6178513,38
tid:0035 ts:299,62694241,6178513,38
tid:0036 ts:299,62694241,6178513,38
tid:0037 ts:299,62694241,6178513,38
tid:0038 ts:299,62694241,6178513,38
tid:0039 ts:299,62694241,6178513,38
tid:0040 ts:299,62694241,6178513,38
tid:0041 ts:299,62694241,6178513,38
tid:0042 ts:299,62694241,6178513,38
tid:0043 ts:299,62694241,6178513,38
tid:0044 ts:299,62694241,6178513,38
tid:0045 ts:299,62694241,6178513,38
tid:0046 ts:299,62694241,6178513,38
tid:0047 ts:299,62694241,6178513,38
tid:0048 ts:299,62694241,6178513,38
tid:0049 ts:299,62694241,6178513,38
tid:0050 ts:299,62694241,6178513,38
tid:0051 ts:299,62694241,6178513,38
tid:0052 ts:299,62694241,6178513,38
tid:0053 ts:299,62694241,6178513,38
tid:0054 ts:299,62694241,6178513,38
tid:0055 ts:299,62694241,6178513,38
tid:0056 ts:299,62694241,6178513,38
tid:0057 ts:299,62694241,6178513,38
tid:0058 ts:299,62694241,6178513,38
tid:0059 ts:299,62694241,6178513,38
tid:0060 ts:299,62694241,6178513,38
tid:0061 ts:299,62694241,6178513,38
tid:0062 ts:299,62694241,6178513,38
tid:0063 ts:299,62694241,6178513,38
tid:0064 ts:299,38,62694224,46
tid:0065 ts:299,38,62694224,46
tid:0066 ts:299,38,62694224,46
tid:0067 ts:299,38,62694224,46
tid:0068 ts:299,38,62694224,46
tid:0069 ts:299,38,62694224,46
tid:0070 ts:299,38,62694224,46
tid:0071 ts:299,38,62694224,46
tid:0072 ts:299,38,62694224,46
tid:0073 ts:299,38,62694224,46
tid:0074 ts:299,38,62694224,46
tid:0075 ts:299,38,62694224,46
tid:0076 ts:299,38,62694224,46
tid:0077 ts:299,38,62694224,46
tid:0078 ts:299,38,62694224,46
tid:0079 ts:299,38,62694224,46
tid:0080 ts:299,38,62694224,46
tid:0081 ts:299,38,62694224,46
```
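In this listing, threads 0-31 (the warp containing thread 16) show the huge gap in the 3rd interval, threads 32-63 show it in the 2nd, and threads 64+ in the 3rd again: every warp stalls for tens of millions of cycles, not just the warp being debugged. For longer runs, the rows can be summarized per warp with a small helper (hypothetical, not part of the original experiment; it relies on the printf format used above):

```c
/* parse_intervals.c (name illustrative)
   usage: ./cuda_gdb_inspect | ./parse_intervals */
#include <stdio.h>

int main(void)
{
    char line[256];
    int last_warp = -1;
    while (fgets(line, sizeof(line), stdin)) {
        int tid;
        unsigned long long d[4];
        if (sscanf(line, "tid:%d ts:%llu,%llu,%llu,%llu",
                   &tid, &d[0], &d[1], &d[2], &d[3]) != 5)
            continue;                       /* skip non-data lines */
        int warp = tid / 32;
        if (warp == last_warp)
            continue;                       /* every thread of a warp prints the same row */
        last_warp = warp;
        int imax = 0;                       /* index of the largest of the 4 intervals */
        for (int i = 1; i < 4; i++)
            if (d[i] > d[imax]) imax = i;
        printf("warp %d: largest interval is #%d (%llu cycles)\n",
               warp, imax + 1, d[imax]);
    }
    return 0;
}
```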