测试NV GPU SM的时钟是否一致

测试NV GPU SM的时钟是否一致

测试NV GPU SM的时钟是否一致

操作步骤

代码(CUDA C++,通过 shell heredoc 写入 sm_clock_benchmark.cu,随后编译并运行):
tee sm_clock_benchmark.cu<<-'EOF'
#include <iostream>
#include <cuda_runtime.h>
#include <iostream>
#include <vector>
#include <stdio.h>
#include <assert.h>
#include <cstdio>
#include <cuda.h>

// Evaluates a CUDA runtime call once and, if it did not return cudaSuccess,
// prints the source location, the numeric error code and the human-readable
// description of THAT error to stderr.
//
// The description is derived from the captured return value rather than from
// cudaGetLastError(): the latter reads (and resets) the runtime's separate
// "last error" state, which may be cudaSuccess or belong to a different call.
//
// This macro only reports; it does not abort, so execution continues after a
// failure.
#define CHECK_CUDA(call)                                           \
    do {                                                           \
        cudaError_t check_cuda_err_ = (call);                      \
        if (check_cuda_err_ != cudaSuccess) {                      \
            std::cerr << "CUDA error at " << __FILE__ << ":" << __LINE__;  \
            std::cerr << " code=" << check_cuda_err_ << " (" << cudaGetErrorString(check_cuda_err_) << ")" << std::endl; \
        }                                                          \
    } while (0)

// Records, once per thread block, the value of the PTX special register
// %clock64 (a 64-bit cycle counter) and the %smid register (identifier of the
// SM executing the thread).
//
// Every thread samples both registers on entry; only the thread with
// threadIdx.x == 0 stores its samples, at index blockIdx.x.
//
// Launch layout: 1D grid, 1D blocks. Both output arrays must hold at least
// gridDim.x elements. No shared memory is used by the kernel itself.
__global__ void kernel(unsigned long long*output_ts,unsigned int*output_smid) {
    // Sample the cycle counter first so that it is taken as early as possible.
    unsigned long long start_cycles = 0;
    asm volatile ("mov.u64 %0, %clock64;" : "=l"(start_cycles) :: "memory");

    unsigned int sm_index;
    asm volatile("mov.u32 %0, %smid;" : "=r"(sm_index));

    // Only the first thread of each block publishes a result.
    if (threadIdx.x != 0) {
        return;
    }

    const unsigned int block = blockIdx.x;
    output_ts[block] = start_cycles;
    output_smid[block] = sm_index;
}

int main(int argc,char *argv[])
{
    int deviceid=0;
    cudaSetDevice(deviceid);  
    
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, deviceid);

    int maxThreadsPerBlock = deviceProp.maxThreadsPerBlock;
    int sharedMemoryPerBlock = deviceProp.sharedMemPerBlock;
    int maxBlocksPerMultiprocessor = deviceProp.maxBlocksPerMultiProcessor;
    int smCount = deviceProp.multiProcessorCount;

    std::cout << "Device name: " << deviceProp.name << std::endl;
    std::cout << "Max threads per block: " << maxThreadsPerBlock << std::endl;
    std::cout << "Shared memory per block: " << sharedMemoryPerBlock << " bytes" << std::endl;
    std::cout << "Max blocks per SM: " << maxBlocksPerMultiprocessor << std::endl;
    std::cout << "Number of SMs: " << smCount << std::endl;

    int block_size=smCount;
    int thread_block_size=maxThreadsPerBlock;
    int thread_size=thread_block_size*block_size;

    int data_size=sizeof(float)*thread_size;
    int ts_size=sizeof(unsigned long long)*thread_size;
    int smid_size=sizeof(int)*thread_size;

    unsigned long long* dev_output_ts=nullptr;
    unsigned int* dev_smid=nullptr;

    unsigned long long*host_output_ts=new unsigned long long[thread_size];;
    unsigned int* host_smid=new unsigned int[thread_size];
        
    CHECK_CUDA(cudaMalloc((void**)&dev_output_ts, ts_size));
    CHECK_CUDA(cudaMalloc((void**)&dev_smid, smid_size));
    
    CHECK_CUDA(cudaMemcpy(dev_output_ts,host_output_ts,ts_size,cudaMemcpyHostToDevice));
    CHECK_CUDA(cudaMemcpy(dev_smid,host_smid,smid_size,cudaMemcpyHostToDevice));
    
    printf("dev_output_ts:%p\n",dev_output_ts);
    printf("dev_smid:%p\n",dev_smid);
    
    cudaStream_t stream;
    cudaStreamCreate(&stream);

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    for(int iter=0;iter<3;iter++)
    {
        cudaEventRecord(start, stream);    
        kernel<<<block_size, thread_block_size,sharedMemoryPerBlock,stream>>>(dev_output_ts,dev_smid);       
        cudaEventRecord(stop, stream);
        CHECK_CUDA(cudaEventSynchronize(stop));
        float milliseconds = 0;
        cudaEventElapsedTime(&milliseconds, start, stop);
        printf("cudaEventElapsedTime:%d %.3f(milliseconds)\n",iter,milliseconds);
            
        CHECK_CUDA(cudaMemcpy(host_output_ts,dev_output_ts,ts_size,cudaMemcpyDeviceToHost));
        CHECK_CUDA(cudaMemcpy(host_smid,dev_smid,smid_size,cudaMemcpyDeviceToHost));
       
        unsigned long long _min=0;
        unsigned long long _max=0;
        for(int i=0;i<block_size;i++)
        {
            if(_min==0) _min=host_output_ts[i];
            if(_max==0) _max=host_output_ts[i];
            if(host_output_ts[i]<_min){_min=host_output_ts[i];}
            if(host_output_ts[i]>_max){_max=host_output_ts[i];}
            printf("blockid:%04d ts:%lld smid:%d\n",i,host_output_ts[i],host_smid[i]);
        }
        unsigned long long diff=_max-_min;
        printf("_max-_min=%lld(cycles) %6.2f(sec)\n",diff,diff/(1.89*1e9));    
    }
    CHECK_CUDA(cudaFree(dev_smid));
    CHECK_CUDA(cudaFree(dev_output_ts));
    return 0;
}
EOF

# Compile the benchmark written by the heredoc above.
# -arch=sm_86 builds for compute capability 8.6; change it to match the GPU under test.
# NOTE(review): -lcuda links the CUDA driver library, but the program shown only
# calls runtime-API functions — confirm whether this flag is actually required.
/usr/local/cuda/bin/nvcc -std=c++17 -arch=sm_86 -g -lineinfo -o sm_clock_benchmark sm_clock_benchmark.cu \
                    -I /usr/local/cuda/include -L /usr/local/cuda/lib64 -lcuda
# Run the resulting binary on device 0.
./sm_clock_benchmark

输出

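程序输出(仅展示最后一次迭代,iter=2)。可以看到时间戳聚成三组,组间相差约 2×10^9 个周期,而 kernel 本身只耗时 0.006 ms;这说明各 SM 读到的 %clock64 计数器并不同步,因此 _max-_min 换算出的秒数并不代表各 block 的真实启动时间差: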
cudaEventElapsedTime:2 0.006(milliseconds)
blockid:0000 ts:3642438400169 smid:0
blockid:0001 ts:3644393850856 smid:2
blockid:0002 ts:3646612108206 smid:4
blockid:0003 ts:3642438400201 smid:6
blockid:0004 ts:3644393850888 smid:8
blockid:0005 ts:3646612108190 smid:10
blockid:0006 ts:3642438400234 smid:12
blockid:0007 ts:3644393850921 smid:14
blockid:0008 ts:3646612108239 smid:16
blockid:0009 ts:3642438400184 smid:18
blockid:0010 ts:3644393850871 smid:20
blockid:0011 ts:3646612108221 smid:22
blockid:0012 ts:3642438400216 smid:24
blockid:0013 ts:3644393850903 smid:26
blockid:0014 ts:3642438400177 smid:1
blockid:0015 ts:3644393850864 smid:3
blockid:0016 ts:3646612108214 smid:5
blockid:0017 ts:3642438400209 smid:7
blockid:0018 ts:3644393850896 smid:9
blockid:0019 ts:3646612108198 smid:11
blockid:0020 ts:3642438400242 smid:13
blockid:0021 ts:3644393850929 smid:15
blockid:0022 ts:3646612108247 smid:17
blockid:0023 ts:3642438400192 smid:19
blockid:0024 ts:3644393850879 smid:21
blockid:0025 ts:3646612108229 smid:23
blockid:0026 ts:3642438400224 smid:25
blockid:0027 ts:3644393850911 smid:27
_max-_min=4173708078(cycles)   2.21(sec)
相关推荐
探索云原生1 天前
大模型推理指南:使用 vLLM 实现高效推理
ai·云原生·kubernetes·gpu·vllm
若石之上4 天前
DeepSpeed:PyTorch优化库,使模型分布式训练能高效使用内存和更快速
pytorch·内存·gpu·deepspeed·速度·zero
luoganttcc5 天前
ubuntu.24安装cuda
cuda
qiang425 天前
想租用显卡训练自己的网络?AutoDL保姆级使用教程(PyCharm版)
pycharm·gpu·autodl·租显卡
扫地的小何尚8 天前
NVIDIA RTX 系统上使用 llama.cpp 加速 LLM
人工智能·aigc·llama·gpu·nvidia·cuda·英伟达
藓类少女8 天前
【深度学习】使用硬件加速模型训练速度
人工智能·深度学习·分布式训练·gpu
centurysee10 天前
【一文搞懂】GPU硬件拓扑与传输速度
gpu·nvidia
吃肉夹馍不要夹馍10 天前
CublasLt 极简入门
cuda·cublas·gemm·cublaslt
Code-world-112 天前
Ubuntu系统安装NVIDIA驱动、CUDA、PyTorch等GPU深度学习环境
linux·pytorch·深度学习·cuda·深度强化学习