CUDA Learning 1: Getting Device Information

deviceInfo.cu

#include <cuda_runtime_api.h>
#include <cstdio> // for printf
#include <iostream>

// Beginning of GPU Architecture definitions
inline int _ConvertSMVer2Cores(int major, int minor) {
    // Defines for GPU Architecture types (using the SM version to determine
    // the # of cores per SM)
    typedef struct {
        int SM; // 0xMm (hexadecimal notation), M = SM Major version,
        // and m = SM minor version
        int Cores;
    } sSMtoCores;

    sSMtoCores nGpuArchCoresPerSM[] = {
        {0x30, 192}, {0x32, 192}, {0x35, 192}, {0x37, 192}, {0x50, 128},
        {0x52, 128}, {0x53, 128}, {0x60, 64},  {0x61, 128}, {0x62, 128},
        {0x70, 64},  {0x72, 64},  {0x75, 64},  {0x80, 64},  {0x86, 128},
        {0x87, 128}, {0x89, 128}, {0x90, 128}, {0xa0, 128}, {0xa1, 128},
        {0xc0, 128}, {-1, -1}};

    int index = 0;

    while (nGpuArchCoresPerSM[index].SM != -1) {
        if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor)) {
            return nGpuArchCoresPerSM[index].Cores;
        }

        index++;
    }

    // If we don't find the value, we fall back to the last known entry
    // so the program can still run
    printf("MapSMtoCores for SM %d.%d is undefined."
           "  Default to use %d Cores/SM\n",
           major, minor, nGpuArchCoresPerSM[index - 1].Cores);
    return nGpuArchCoresPerSM[index - 1].Cores;
}

inline const char *_ConvertSMVer2ArchName(int major, int minor) {
    // Defines for GPU Architecture types (using the SM version to determine
    // the GPU Arch name)
    typedef struct {
        int SM; // 0xMm (hexadecimal notation), M = SM Major version,
        // and m = SM minor version
        const char *name;
    } sSMtoArchName;

    sSMtoArchName nGpuArchNameSM[] = {
        {0x30, "Kepler"},       {0x32, "Kepler"},    {0x35, "Kepler"},
        {0x37, "Kepler"},       {0x50, "Maxwell"},   {0x52, "Maxwell"},
        {0x53, "Maxwell"},      {0x60, "Pascal"},    {0x61, "Pascal"},
        {0x62, "Pascal"},       {0x70, "Volta"},     {0x72, "Xavier"},
        {0x75, "Turing"},       {0x80, "Ampere"},    {0x86, "Ampere"},
        {0x87, "Ampere"},       {0x89, "Ada"},       {0x90, "Hopper"},
        {0xa0, "Blackwell"},    {0xa1, "Blackwell"}, {0xc0, "Blackwell"},
        {-1, "Graphics Device"}};

    int index = 0;

    while (nGpuArchNameSM[index].SM != -1) {
        if (nGpuArchNameSM[index].SM == ((major << 4) + minor)) {
            return nGpuArchNameSM[index].name;
        }

        index++;
    }

    // If we don't find the value, we fall back to the last known entry
    // so the program can still run
    printf("MapSMtoArchName for SM %d.%d is undefined."
           "  Default to use %s\n",
           major, minor, nGpuArchNameSM[index - 1].name);
    return nGpuArchNameSM[index - 1].name;
}
// end of GPU Architecture definitions

int main() {
    int count = 0;
    cudaGetDeviceCount(&count); // returns the number of CUDA-capable GPUs

    // int gpuid = 0;         // select GPU 0
    // cudaSetDevice(gpuid);  // set the GPU to use by index; defaults to 0
    // cudaGetDevice(&gpuid); // get the index of the GPU used by the current host thread

    for (int i = 0; i < count; ++i) {
        struct cudaDeviceProp device_prop;
        auto error = cudaGetDeviceProperties(&device_prop, i);
        if (cudaSuccess != error) {
            std::cerr << "cudaGetDeviceProperties " << i << " error "
                      << cudaGetErrorString(error) << std::endl;
            break;
        }

        std::cout << "GPU \t" << i << std::endl;
        std::cout << "Name: \t" << device_prop.name << std::endl;
        std::cout << "Architecture: "
                  << _ConvertSMVer2ArchName(device_prop.major,
                                            device_prop.minor)
                  << std::endl;
        std::cout << "Capability: \t" << device_prop.major << "."
                  << device_prop.minor << std::endl;
        std::cout << "Spcores \t"
                  << _ConvertSMVer2Cores(device_prop.major, device_prop.minor) *
                         device_prop.multiProcessorCount
                  << std::endl;
        std::cout << "Total Memory: \t"
                  << (device_prop.totalGlobalMem / 1024 / 1024) << " MB "
                  << std::endl;
        std::cout << "Shared Memory Per Block: \t"
                  << (device_prop.sharedMemPerBlock / 1024) << " KB "
                  << std::endl;
        std::cout << "warpSize: \t" << device_prop.warpSize << std::endl;
        std::cout << "Max Threads Per Block: \t"
                  << device_prop.maxThreadsPerBlock << std::endl;
        std::cout << "Max Threads Dim: \t[" << device_prop.maxThreadsDim[0]
                  << ", " << device_prop.maxThreadsDim[1] << ", "
                  << device_prop.maxThreadsDim[2] << "]" << std::endl;

        std::cout << "Max Grid Size: \t[" << device_prop.maxGridSize[0] << ", "
                  << device_prop.maxGridSize[1] << ", "
                  << device_prop.maxGridSize[2] << "]" << std::endl;
    }
}

_ConvertSMVer2Cores returns the number of CUDA cores per streaming multiprocessor (SM), and _ConvertSMVer2ArchName returns the architecture name; both functions come from https://github.com/NVIDIA/cuda-samples/blob/master/Common/helper_cuda.h
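The device properties alone don't tell you whether the installed driver supports the runtime this program was built against. A minimal standalone sketch for checking that, using the documented cudaDriverGetVersion and cudaRuntimeGetVersion runtime API calls:

#include <cuda_runtime_api.h>
#include <cstdio>

int main() {
    int driverVersion = 0, runtimeVersion = 0;
    // CUDA version supported by the installed driver, and version of the
    // runtime library this binary was linked against.
    cudaDriverGetVersion(&driverVersion);
    cudaRuntimeGetVersion(&runtimeVersion);
    // Versions are encoded as 1000 * major + 10 * minor, e.g. 11040 -> 11.4.
    printf("Driver API version:  %d.%d\n", driverVersion / 1000,
           (driverVersion % 100) / 10);
    printf("Runtime API version: %d.%d\n", runtimeVersion / 1000,
           (runtimeVersion % 100) / 10);
    return 0;
}

A driver older than the runtime typically shows up as cudaErrorInsufficientDriver on the first CUDA call.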

CMakeLists.txt

cmake_minimum_required(VERSION 3.26)

project(learningcuda CUDA CXX)

# This command defines an imported target named CUDA::toolkit, plus optional
# imported targets for the libraries shipped with the CUDA Toolkit, e.g.
# CUDA::cudart for the CUDA Runtime library and CUDA::cublas for cuBLAS.
find_package(CUDAToolkit REQUIRED)

set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CUDA_STANDARD 11)

# CMAKE_CUDA_ARCHITECTURES was introduced in CMake 3.18; it specifies the GPU
# architectures to compile CUDA code for. Features of a newer architecture
# require targeting that architecture. Check yours with:
#   nvidia-smi -q | grep Architecture
set(CMAKE_CUDA_ARCHITECTURES 60)

add_executable(deviceInfo deviceInfo.cu)
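With deviceInfo.cu and CMakeLists.txt in the same directory, a typical out-of-source build and run looks like this (the build directory name is arbitrary):

$ cmake -B build
$ cmake --build build
$ ./build/deviceInfo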

Output

$ ./deviceInfo

GPU     0
Name:   NVIDIA GeForce GTX 1050 Ti
Architecture: Pascal
Capability:     6.1
SP Cores:       768
Total Memory:   4038 MB
Shared Memory Per Block:        48 KB
warpSize:       32
Max Threads Per Block:  1024
Max Threads Dim:        [1024, 1024, 64]
Max Grid Size:  [2147483647, 65535, 65535]
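The SP Cores line is derived rather than read directly from the device: SM 6.1 is looked up as (6 << 4) + 1 = 0x61 in the table, which maps to 128 cores per multiprocessor, and the GTX 1050 Ti reports multiProcessorCount = 6, so 128 × 6 = 768.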
