Testing whether NVIDIA GPU SM clocks are consistent

Procedure

tee sm_clock_benchmark.cu<<-'EOF'
#include <cstdio>
#include <iostream>
#include <cuda_runtime.h>

#define CHECK_CUDA(call)                                           \
    do {                                                           \
        cudaError_t err = call;                                    \
        if (err != cudaSuccess) {                                  \
            std::cerr << "CUDA error at " << __FILE__ << ":" << __LINE__;  \
            std::cerr << " code=" << err << " (" << cudaGetErrorString(cudaGetLastError()) << ")" << std::endl; \
        }                                                          \
    } while (0)

// Each block samples the SM it runs on: thread 0 records the SM id (%smid)
// and that SM's cycle counter (%clock64) so the host can compare SMs.
__global__ void kernel(unsigned long long *output_ts, unsigned int *output_smid) {
    unsigned long long ts0 = 0;
    asm volatile ("mov.u64 %0, %%clock64;" : "=l"(ts0) :: "memory");
    unsigned int smid;
    asm volatile ("mov.u32 %0, %%smid;" : "=r"(smid));
    if (threadIdx.x == 0) {
        output_ts[blockIdx.x] = ts0;
        output_smid[blockIdx.x] = smid;
    }
}

int main(int argc,char *argv[])
{
    int deviceid = 0;
    CHECK_CUDA(cudaSetDevice(deviceid));

    cudaDeviceProp deviceProp;
    CHECK_CUDA(cudaGetDeviceProperties(&deviceProp, deviceid));

    int maxThreadsPerBlock = deviceProp.maxThreadsPerBlock;
    int sharedMemoryPerBlock = deviceProp.sharedMemPerBlock;
    int maxBlocksPerMultiprocessor = deviceProp.maxBlocksPerMultiProcessor;
    int smCount = deviceProp.multiProcessorCount;

    std::cout << "Device name: " << deviceProp.name << std::endl;
    std::cout << "Max threads per block: " << maxThreadsPerBlock << std::endl;
    std::cout << "Shared memory per block: " << sharedMemoryPerBlock << " bytes" << std::endl;
    std::cout << "Max blocks per SM: " << maxBlocksPerMultiprocessor << std::endl;
    std::cout << "Number of SMs: " << smCount << std::endl;

    int block_size = smCount;                   // launch one block per SM
    int thread_block_size = maxThreadsPerBlock;
    int thread_size = thread_block_size * block_size;

    int ts_size = sizeof(unsigned long long) * thread_size;
    int smid_size = sizeof(unsigned int) * thread_size;

    unsigned long long* dev_output_ts=nullptr;
    unsigned int* dev_smid=nullptr;

    unsigned long long *host_output_ts = new unsigned long long[thread_size];
    unsigned int *host_smid = new unsigned int[thread_size];
        
    CHECK_CUDA(cudaMalloc((void**)&dev_output_ts, ts_size));
    CHECK_CUDA(cudaMalloc((void**)&dev_smid, smid_size));
    
    // The host buffers are uninitialized at this point, so zero the device
    // buffers directly instead of copying garbage into them.
    CHECK_CUDA(cudaMemset(dev_output_ts, 0, ts_size));
    CHECK_CUDA(cudaMemset(dev_smid, 0, smid_size));
    
    printf("dev_output_ts:%p\n",dev_output_ts);
    printf("dev_smid:%p\n",dev_smid);
    
    cudaStream_t stream;
    CHECK_CUDA(cudaStreamCreate(&stream));

    cudaEvent_t start, stop;
    CHECK_CUDA(cudaEventCreate(&start));
    CHECK_CUDA(cudaEventCreate(&stop));
    for(int iter=0;iter<3;iter++)
    {
        cudaEventRecord(start, stream);
        // One block per SM; dynamic shared memory is set to the per-block maximum.
        kernel<<<block_size, thread_block_size, sharedMemoryPerBlock, stream>>>(dev_output_ts, dev_smid);
        CHECK_CUDA(cudaGetLastError());   // catch launch failures
        cudaEventRecord(stop, stream);
        CHECK_CUDA(cudaEventSynchronize(stop));
        float milliseconds = 0;
        cudaEventElapsedTime(&milliseconds, start, stop);
        printf("cudaEventElapsedTime:%d %.3f(milliseconds)\n",iter,milliseconds);
            
        CHECK_CUDA(cudaMemcpy(host_output_ts,dev_output_ts,ts_size,cudaMemcpyDeviceToHost));
        CHECK_CUDA(cudaMemcpy(host_smid,dev_smid,smid_size,cudaMemcpyDeviceToHost));
       
        unsigned long long _min = host_output_ts[0];
        unsigned long long _max = host_output_ts[0];
        for (int i = 0; i < block_size; i++)
        {
            if (host_output_ts[i] < _min) { _min = host_output_ts[i]; }
            if (host_output_ts[i] > _max) { _max = host_output_ts[i]; }
            printf("blockid:%04d ts:%llu smid:%u\n", i, host_output_ts[i], host_smid[i]);
        }
        unsigned long long diff = _max - _min;
        // 1.89 GHz is a hard-coded SM clock; adjust it to the actual clock of your GPU.
        printf("_max-_min=%llu(cycles) %6.2f(sec)\n", diff, diff / (1.89 * 1e9));
    }
    CHECK_CUDA(cudaFree(dev_smid));
    CHECK_CUDA(cudaFree(dev_output_ts));
    delete[] host_output_ts;
    delete[] host_smid;
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cudaStreamDestroy(stream);
    return 0;
}
EOF

/usr/local/cuda/bin/nvcc -std=c++17 -arch=sm_86 -g -lineinfo -o sm_clock_benchmark sm_clock_benchmark.cu \
                    -I /usr/local/cuda/include -L /usr/local/cuda/lib64 -lcuda
./sm_clock_benchmark
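
Note: the cycles-to-seconds conversion in the program hard-codes a 1.89 GHz SM clock. A minimal sketch of querying the clock instead (assuming cudaDeviceProp::clockRate, reported in kHz, is still available in your CUDA version; it is deprecated in recent releases):

// sm_clock_rate.cu -- hypothetical helper, separate from the benchmark above.
#include <cstdio>
#include <cuda_runtime.h>

int main() {
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);
    double hz = prop.clockRate * 1e3;           // clockRate is reported in kHz
    unsigned long long cycles = 4173708078ULL;  // the _max-_min value from the output below
    printf("SM clock: %.3f GHz\n", hz / 1e9);
    printf("%llu cycles ~= %.2f s at that clock\n", cycles, cycles / hz);
    return 0;
}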

Output

cudaEventElapsedTime:2 0.006(milliseconds)
blockid:0000 ts:3642438400169 smid:0
blockid:0001 ts:3644393850856 smid:2
blockid:0002 ts:3646612108206 smid:4
blockid:0003 ts:3642438400201 smid:6
blockid:0004 ts:3644393850888 smid:8
blockid:0005 ts:3646612108190 smid:10
blockid:0006 ts:3642438400234 smid:12
blockid:0007 ts:3644393850921 smid:14
blockid:0008 ts:3646612108239 smid:16
blockid:0009 ts:3642438400184 smid:18
blockid:0010 ts:3644393850871 smid:20
blockid:0011 ts:3646612108221 smid:22
blockid:0012 ts:3642438400216 smid:24
blockid:0013 ts:3644393850903 smid:26
blockid:0014 ts:3642438400177 smid:1
blockid:0015 ts:3644393850864 smid:3
blockid:0016 ts:3646612108214 smid:5
blockid:0017 ts:3642438400209 smid:7
blockid:0018 ts:3644393850896 smid:9
blockid:0019 ts:3646612108198 smid:11
blockid:0020 ts:3642438400242 smid:13
blockid:0021 ts:3644393850929 smid:15
blockid:0022 ts:3646612108247 smid:17
blockid:0023 ts:3642438400192 smid:19
blockid:0024 ts:3644393850879 smid:21
blockid:0025 ts:3646612108229 smid:23
blockid:0026 ts:3642438400224 smid:25
blockid:0027 ts:3644393850911 smid:27
_max-_min=4173708078(cycles)   2.21(sec)
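
Reading the output: the per-block timestamps fall into three groups roughly 2e9 cycles apart, while samples within a group differ by only tens of cycles. This suggests the %clock64 counters of different SMs do not share a common start point. A minimal host-side sketch of that grouping (a hypothetical count_clock_groups helper fed with a few values from the output above, not part of the original program):

// Sort the timestamps and count how many widely separated groups they form:
// a gap much larger than launch skew is treated as a new clock domain.
#include <algorithm>
#include <cstdio>
#include <vector>

int count_clock_groups(const unsigned long long *ts, int n,
                       unsigned long long gap = 1000000ULL) {
    std::vector<unsigned long long> v(ts, ts + n);
    std::sort(v.begin(), v.end());
    int groups = (n > 0) ? 1 : 0;
    for (size_t i = 1; i < v.size(); ++i)
        if (v[i] - v[i - 1] > gap) ++groups;
    return groups;
}

int main() {
    // six samples taken from the output above (two per apparent group)
    unsigned long long ts[] = {3642438400169ULL, 3644393850856ULL, 3646612108206ULL,
                               3642438400201ULL, 3644393850888ULL, 3646612108190ULL};
    printf("groups: %d\n", count_clock_groups(ts, 6));  // prints 3
    return 0;
}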