cuda学习1: 获取设备信息

deviceInfo.cu

cpp 复制代码
#include <cuda_runtime_api.h>

#include <cstdio>
#include <iostream>

// Beginning of GPU Architecture definitions
// Looks up the number of cores per SM for the given SM version.
//
// major, minor: SM major / minor version numbers; they are packed into a
//               single 0xMm key for the table lookup.
// Returns the Cores value of the matching table row. When no row matches, a
// notice is printed to stdout and the Cores value of the last real row
// (the one just before the sentinel) is returned instead.
inline int _ConvertSMVer2Cores(int major, int minor) {
    struct SmCoreCount {
        int smVersion;  // 0xMm (hexadecimal): M = SM major, m = SM minor
        int coresPerSM;
    };

    // Lookup table, terminated by a {-1, -1} sentinel row.
    const SmCoreCount table[] = {
        {0x30, 192}, {0x32, 192}, {0x35, 192}, {0x37, 192},
        {0x50, 128}, {0x52, 128}, {0x53, 128},
        {0x60, 64},  {0x61, 128}, {0x62, 128},
        {0x70, 64},  {0x72, 64},  {0x75, 64},
        {0x80, 64},  {0x86, 128}, {0x87, 128}, {0x89, 128},
        {0x90, 128},
        {0xa0, 128}, {0xa1, 128}, {0xc0, 128},
        {-1, -1}};

    const int requested = (major << 4) + minor;

    const SmCoreCount *row = table;
    for (; row->smVersion != -1; ++row) {
        if (row->smVersion == requested) {
            return row->coresPerSM;
        }
    }

    // No match: `row` now rests on the sentinel, so step back one row to
    // reach the last real entry and use it as the default.
    const int fallbackCores = (row - 1)->coresPerSM;
    printf("MapSMtoCores for SM %d.%d is undefined."
           "  Default to use %d Cores/SM\n",
           major, minor, fallbackCores);
    return fallbackCores;
}

// Looks up the GPU architecture name for the given SM version.
//
// major, minor: SM major / minor version numbers; they are packed into a
//               single 0xMm key for the table lookup.
// Returns a pointer to a string literal naming the architecture. When no row
// matches, a notice is printed to stdout and the name of the last real row
// (the one just before the sentinel) is returned instead.
inline const char *_ConvertSMVer2ArchName(int major, int minor) {
    struct SmArchName {
        int smVersion;  // 0xMm (hexadecimal): M = SM major, m = SM minor
        const char *name;
    };

    // Lookup table, terminated by a row whose SM field is -1.
    const SmArchName table[] = {
        {0x30, "Kepler"},    {0x32, "Kepler"},    {0x35, "Kepler"},
        {0x37, "Kepler"},
        {0x50, "Maxwell"},   {0x52, "Maxwell"},   {0x53, "Maxwell"},
        {0x60, "Pascal"},    {0x61, "Pascal"},    {0x62, "Pascal"},
        {0x70, "Volta"},     {0x72, "Xavier"},    {0x75, "Turing"},
        {0x80, "Ampere"},    {0x86, "Ampere"},    {0x87, "Ampere"},
        {0x89, "Ada"},       {0x90, "Hopper"},
        {0xa0, "Blackwell"}, {0xa1, "Blackwell"}, {0xc0, "Blackwell"},
        {-1, "Graphics Device"}};

    const int requested = (major << 4) + minor;

    const SmArchName *row = table;
    for (; row->smVersion != -1; ++row) {
        if (row->smVersion == requested) {
            return row->name;
        }
    }

    // No match: `row` now rests on the sentinel, so step back one row to
    // reach the last real entry and use its name as the default.
    const char *fallbackName = (row - 1)->name;
    printf("MapSMtoArchName for SM %d.%d is undefined."
           "  Default to use %s\n",
           major, minor, fallbackName);
    return fallbackName;
}
// end of GPU Architecture definitions

// Enumerates the CUDA devices reported by the runtime and prints a summary of
// each one (name, architecture, compute capability, total core count, memory
// sizes and launch limits) to stdout.
//
// Returns 0 when every query succeeded (including the case where no device is
// present) and 1 when a CUDA runtime call reported an error.
int main() {
    // Start from 0 so the loop bound is well-defined even if the query fails;
    // previously `count` was left uninitialised and the status was ignored.
    int count = 0;
    auto count_error = cudaGetDeviceCount(&count); // number of CUDA devices
    if (cudaSuccess != count_error) {
        std::cerr << "cudaGetDeviceCount error "
                  << cudaGetErrorString(count_error) << std::endl;
        return 1;
    }
    if (count == 0) {
        std::cerr << "No CUDA-capable device found" << std::endl;
        return 0;
    }

    // int gpuid = 0;        // select GPU 0
    // cudaSetDevice(gpuid); // choose the GPU to use by index (default is 0)
    // cudaGetDevice(&gpuid); // read back the GPU index used by this thread

    int exit_code = 0;
    for (int i = 0; i < count; ++i) {
        struct cudaDeviceProp device_prop;
        auto error = cudaGetDeviceProperties(&device_prop, i);
        if (cudaSuccess != error) {
            std::cerr << "cudaGetDeviceProperties " << i << " error "
                      << cudaGetErrorString(error) << std::endl;
            // Stop enumerating and make the failure visible to the caller.
            exit_code = 1;
            break;
        }

        std::cout << "GPU \t" << i << std::endl;
        std::cout << "Name: \t" << device_prop.name << std::endl;
        std::cout << "Architecture: "
                  << _ConvertSMVer2ArchName(device_prop.major,
                                            device_prop.minor)
                  << std::endl;
        std::cout << "Capability: \t" << device_prop.major << "."
                  << device_prop.minor << std::endl;
        // Total cores = cores per SM (table lookup) * number of SMs.
        std::cout << "Spcores \t"
                  << _ConvertSMVer2Cores(device_prop.major, device_prop.minor) *
                         device_prop.multiProcessorCount
                  << std::endl;
        // totalGlobalMem is divided by 1024 twice to print it in MB.
        std::cout << "Total Memory: \t"
                  << (device_prop.totalGlobalMem / 1024 / 1024) << " MB "
                  << std::endl;
        // sharedMemPerBlock is divided by 1024 to print it in KB.
        std::cout << "Shared Memory Per Block: \t"
                  << (device_prop.sharedMemPerBlock / 1024) << " KB "
                  << std::endl;
        std::cout << "warpSize: \t" << device_prop.warpSize << std::endl;
        std::cout << "Max Threads Per Block: \t"
                  << device_prop.maxThreadsPerBlock << std::endl;
        std::cout << "Max Threads Dim: \t[" << device_prop.maxThreadsDim[0]
                  << ", " << device_prop.maxThreadsDim[1] << ", "
                  << device_prop.maxThreadsDim[2] << "]" << std::endl;

        std::cout << "Max Grid Size: \t[" << device_prop.maxGridSize[0] << ", "
                  << device_prop.maxGridSize[1] << ", "
                  << device_prop.maxGridSize[2] << "]" << std::endl;
    }

    return exit_code;
}

_ConvertSMVer2Cores 用于获取每个流式多处理器(SM)的 CUDA 核心数,_ConvertSMVer2ArchName 用于获取架构名称。这两个函数都来自 https://github.com/NVIDIA/cuda-samples/blob/master/Common/helper_cuda.h

CMakeLists.txt

bash 复制代码
cmake_minimum_required(VERSION 3.26)

project(learningcuda CUDA CXX)

# Locate the CUDA toolkit. This provides imported targets for the libraries
# shipped with the toolkit, e.g. CUDA::cudart for the CUDA Runtime library or
# CUDA::cublas for cuBLAS.
# NOTE(review): no CUDA:: target is linked explicitly anywhere in this script.
find_package(CUDAToolkit REQUIRED)

set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CUDA_STANDARD 11)

# CMAKE_CUDA_ARCHITECTURES (added in CMake 3.18) selects the GPU architectures
# that CUDA code is compiled for; a specific architecture must be listed in
# order to use its newer features. Run `nvidia-smi -q | grep Architecture` to
# see the architecture of the installed GPU.
set(CMAKE_CUDA_ARCHITECTURES 60)

add_executable(deviceInfo deviceInfo.cu)

运行结果

$ ./deviceInfo

GPU 0

Name: NVIDIA GeForce GTX 1050 Ti

Architecture: Pascal

Capability: 6.1

Spcores 768

Total Memory: 4038 MB

Shared Memory Per Block: 48 KB

warpSize: 32

Max Threads Per Block: 1024

Max Threads Dim: [1024, 1024, 64]

Max Grid Size: [2147483647, 65535, 65535]

相关推荐
2303_Alpha2 天前
SpringBoot
笔记·学习
萘柰奈2 天前
Unity学习----【进阶】TextMeshPro学习(三)--进阶知识点(TMP基础设置,材质球相关,两个辅助工具类)
学习·unity
沐矢羽2 天前
Tomcat PUT方法任意写文件漏洞学习
学习·tomcat
好奇龙猫2 天前
日语学习-日语知识点小记-进阶-JLPT-N1阶段蓝宝书,共120语法(10):91-100语法+考え方13
学习
向阳花开_miemie2 天前
Android音频学习(十八)——混音流程
学习·音视频
工大一只猿2 天前
51单片机学习
嵌入式硬件·学习·51单片机
c0d1ng2 天前
量子计算学习(第十四周周报)
学习·量子计算
Hello_Embed3 天前
STM32HAL 快速入门(二十):UART 中断改进 —— 环形缓冲区解决数据丢失
笔记·stm32·单片机·学习·嵌入式软件
咸甜适中3 天前
rust语言 (1.88) 学习笔记:客户端和服务器端同在一个项目中
笔记·学习·rust
Magnetic_h3 天前
【iOS】设计模式复习
笔记·学习·ios·设计模式·objective-c·cocoa