测试NV GPU SM的时钟是否一致

操作步骤

c
tee sm_clock_benchmark.cu<<-'EOF'
#include <cuda.h>
#include <cuda_runtime.h>
#include <assert.h>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <stdio.h>
#include <vector>

// Abort-on-error wrapper for CUDA runtime calls.
// Fixes two defects of the original:
//  - the message was built from cudaGetErrorString(cudaGetLastError()),
//    which clears the sticky error state and can describe a DIFFERENT
//    error than the one just returned; use `err` itself instead;
//  - execution continued after a failure, so every subsequent CUDA call
//    failed with a confusing follow-on error; exit immediately instead.
#define CHECK_CUDA(call)                                                      \
    do {                                                                      \
        cudaError_t err = call;                                               \
        if (err != cudaSuccess) {                                             \
            std::cerr << "CUDA error at " << __FILE__ << ":" << __LINE__      \
                      << " code=" << err << " ("                              \
                      << cudaGetErrorString(err) << ")" << std::endl;         \
            std::exit(EXIT_FAILURE);                                          \
        }                                                                     \
    } while (0)

// Records, for each thread block, the SM cycle counter (%clock64) sampled at
// kernel start and the id of the SM the block was scheduled on (%smid).
// Launch with a 1-D grid; both output arrays must hold at least gridDim.x
// elements (one entry per block).
//
// Bug fix: PTX special registers inside inline asm must be spelled with a
// doubled percent sign ("%%clock64", "%%smid") — a single '%' is consumed
// as an operand escape by the inline-asm parser and the original did not
// compile as written.
__global__ void kernel(unsigned long long*output_ts,unsigned int*output_smid) {
    unsigned long long ts0 = 0;
    // "memory" clobber keeps the compiler from hoisting the read.
    asm volatile ("mov.u64 %0, %%clock64;" : "=l"(ts0) :: "memory");
    unsigned int smid;
    asm volatile ("mov.u32 %0, %%smid;" : "=r"(smid));
    // One writer per block: thread 0 publishes the block's sample.
    // (Equivalent to the original's `tid % blockDim.x == 0`.)
    if (threadIdx.x == 0)
    {
        output_ts[blockIdx.x] = ts0;
        output_smid[blockIdx.x] = smid;
    }
}

// Launches one maximum-size thread block per SM and compares the %clock64
// timestamps each SM reports, to test whether the per-SM clocks are
// synchronized.  A large spread (_max - _min) despite a microsecond kernel
// indicates the SM counters are not aligned.
int main(int argc,char *argv[])
{
    int deviceid = 0;
    CHECK_CUDA(cudaSetDevice(deviceid));

    cudaDeviceProp deviceProp;
    CHECK_CUDA(cudaGetDeviceProperties(&deviceProp, deviceid));

    int maxThreadsPerBlock = deviceProp.maxThreadsPerBlock;
    int sharedMemoryPerBlock = deviceProp.sharedMemPerBlock;
    int maxBlocksPerMultiprocessor = deviceProp.maxBlocksPerMultiProcessor;
    int smCount = deviceProp.multiProcessorCount;

    std::cout << "Device name: " << deviceProp.name << std::endl;
    std::cout << "Max threads per block: " << maxThreadsPerBlock << std::endl;
    std::cout << "Shared memory per block: " << sharedMemoryPerBlock << " bytes" << std::endl;
    std::cout << "Max blocks per SM: " << maxBlocksPerMultiprocessor << std::endl;
    std::cout << "Number of SMs: " << smCount << std::endl;

    // One block per SM.  The launch below also requests the whole per-block
    // dynamic shared-memory budget, presumably to keep each SM to a single
    // resident block — NOTE(review): confirm this holds on devices whose
    // default dynamic shared-memory limit is below sharedMemPerBlock.
    int block_size = smCount;
    int thread_block_size = maxThreadsPerBlock;

    // The kernel writes exactly one entry per block, so size the buffers by
    // block_size (the original over-allocated one entry per thread).
    size_t ts_size   = sizeof(unsigned long long) * (size_t)block_size;
    size_t smid_size = sizeof(unsigned int) * (size_t)block_size;

    unsigned long long* dev_output_ts = nullptr;
    unsigned int* dev_smid = nullptr;

    unsigned long long* host_output_ts = new unsigned long long[block_size];
    unsigned int* host_smid = new unsigned int[block_size];

    CHECK_CUDA(cudaMalloc((void**)&dev_output_ts, ts_size));
    CHECK_CUDA(cudaMalloc((void**)&dev_smid, smid_size));

    // Zero the device buffers instead of copying uninitialized host memory
    // up (the original H2D memcpy of garbage served no purpose).
    CHECK_CUDA(cudaMemset(dev_output_ts, 0, ts_size));
    CHECK_CUDA(cudaMemset(dev_smid, 0, smid_size));

    printf("dev_output_ts:%p\n", (void*)dev_output_ts);
    printf("dev_smid:%p\n", (void*)dev_smid);

    cudaStream_t stream;
    CHECK_CUDA(cudaStreamCreate(&stream));

    cudaEvent_t start, stop;
    CHECK_CUDA(cudaEventCreate(&start));
    CHECK_CUDA(cudaEventCreate(&stop));
    for (int iter = 0; iter < 3; iter++)
    {
        CHECK_CUDA(cudaEventRecord(start, stream));
        kernel<<<block_size, thread_block_size, sharedMemoryPerBlock, stream>>>(dev_output_ts, dev_smid);
        CHECK_CUDA(cudaGetLastError());  // catch launch-configuration errors
        CHECK_CUDA(cudaEventRecord(stop, stream));
        CHECK_CUDA(cudaEventSynchronize(stop));
        float milliseconds = 0;
        CHECK_CUDA(cudaEventElapsedTime(&milliseconds, start, stop));
        printf("cudaEventElapsedTime:%d %.3f(milliseconds)\n", iter, milliseconds);

        CHECK_CUDA(cudaMemcpy(host_output_ts, dev_output_ts, ts_size, cudaMemcpyDeviceToHost));
        CHECK_CUDA(cudaMemcpy(host_smid, dev_smid, smid_size, cudaMemcpyDeviceToHost));

        // Initialize the spread from the first element; the original's 0
        // sentinel would misbehave if a genuine timestamp were 0.
        unsigned long long _min = host_output_ts[0];
        unsigned long long _max = host_output_ts[0];
        for (int i = 0; i < block_size; i++)
        {
            if (host_output_ts[i] < _min) { _min = host_output_ts[i]; }
            if (host_output_ts[i] > _max) { _max = host_output_ts[i]; }
            // %llu / %u: the values are unsigned (original used %lld / %d).
            printf("blockid:%04d ts:%llu smid:%u\n", i, host_output_ts[i], host_smid[i]);
        }
        unsigned long long diff = _max - _min;
        // clockRate is reported in kHz; the original hard-coded 1.89 GHz,
        // which is wrong on any other part/clock setting.
        double clock_hz = deviceProp.clockRate * 1000.0;
        printf("_max-_min=%llu(cycles) %6.2f(sec)\n", diff, diff / clock_hz);
    }
    // Release everything the original leaked (events, stream, host arrays).
    CHECK_CUDA(cudaEventDestroy(start));
    CHECK_CUDA(cudaEventDestroy(stop));
    CHECK_CUDA(cudaStreamDestroy(stream));
    CHECK_CUDA(cudaFree(dev_smid));
    CHECK_CUDA(cudaFree(dev_output_ts));
    delete[] host_output_ts;
    delete[] host_smid;
    return 0;
}
EOF

/usr/local/cuda/bin/nvcc -std=c++17 -arch=sm_86 -g -lineinfo -o sm_clock_benchmark sm_clock_benchmark.cu \
                    -I /usr/local/cuda/include -L /usr/local/cuda/lib64 -lcuda
./sm_clock_benchmark

输出

bash
cudaEventElapsedTime:2 0.006(milliseconds)
blockid:0000 ts:3642438400169 smid:0
blockid:0001 ts:3644393850856 smid:2
blockid:0002 ts:3646612108206 smid:4
blockid:0003 ts:3642438400201 smid:6
blockid:0004 ts:3644393850888 smid:8
blockid:0005 ts:3646612108190 smid:10
blockid:0006 ts:3642438400234 smid:12
blockid:0007 ts:3644393850921 smid:14
blockid:0008 ts:3646612108239 smid:16
blockid:0009 ts:3642438400184 smid:18
blockid:0010 ts:3644393850871 smid:20
blockid:0011 ts:3646612108221 smid:22
blockid:0012 ts:3642438400216 smid:24
blockid:0013 ts:3644393850903 smid:26
blockid:0014 ts:3642438400177 smid:1
blockid:0015 ts:3644393850864 smid:3
blockid:0016 ts:3646612108214 smid:5
blockid:0017 ts:3642438400209 smid:7
blockid:0018 ts:3644393850896 smid:9
blockid:0019 ts:3646612108198 smid:11
blockid:0020 ts:3642438400242 smid:13
blockid:0021 ts:3644393850929 smid:15
blockid:0022 ts:3646612108247 smid:17
blockid:0023 ts:3642438400192 smid:19
blockid:0024 ts:3644393850879 smid:21
blockid:0025 ts:3646612108229 smid:23
blockid:0026 ts:3642438400224 smid:25
blockid:0027 ts:3644393850911 smid:27
_max-_min=4173708078(cycles)   2.21(sec)
相关推荐
明洞日记1 天前
【CUDA手册002】CUDA 基础执行模型:写出第一个正确的 Kernel
c++·图像处理·算法·ai·图形渲染·gpu·cuda
明洞日记1 天前
【CUDA手册004】一个典型算子的 CUDA 化完整流程
c++·图像处理·算法·ai·图形渲染·gpu·cuda
小烤箱2 天前
CUDA 编程完全理解系列(第四篇):硬件视角下的索引变量与分级内存机制
cuda·并行计算·感知算法
linweidong2 天前
中科曙光C++面试题及参考答案
二叉树·cuda·内存泄漏·寄存器·c++面试·c++面经·混合编译
抠头专注python环境配置2 天前
2026终极诊断指南:解决Windows PyTorch GPU安装失败,从迷茫到确定
人工智能·pytorch·windows·深度学习·gpu·环境配置·cuda
chinamaoge3 天前
NVIDIA大模型推理框架:TensorRT-LLM软件流程(四)探究TensorRT LLM自定义算子调用流程
cuda·tensorrt plugin·tensorrt llm
Hi202402173 天前
使用星图AI算力平台训练PETRV2-BEV模型
人工智能·自动驾驶·gpu·机器视觉·bev·算力平台
love530love4 天前
突破 ComfyUI 环境枷锁:RTX 3090 强行开启 comfy-kitchen 官方全后端加速库实战
人工智能·windows·python·cuda·comfyui·triton·comfy-kitchen
心 爱心 爱4 天前
pip 隔离环境内 安装 cuda 113 不覆盖原有的全局 cuda 115
pip·cuda·隔离环境
virtaitech4 天前
云平台一键部署【Step-1X-3D】3D生成界的Flux
人工智能·科技·ai·gpu·算力·云平台