RTX-5090 基础测试

一、环境搭建

1、安装支持 PCIe P2P 的驱动

bash 复制代码

# Fetch the open GPU kernel module fork and switch to its 595.58.03-p2p branch
# (checking out a remote-tracking ref leaves the work tree on a detached HEAD).
git clone https://github.com/aikitoria/open-gpu-kernel-modules.git
cd open-gpu-kernel-modules
git checkout remotes/origin/595.58.03-p2p

# Unload the NVIDIA kernel modules that are currently loaded (if any)
sudo rmmod nvidia_drm nvidia_modeset nvidia_uvm nvidia

# Build the kernel modules (-j$(nproc) runs one job per available CPU core)
make modules -j$(nproc)

# Install them into the system
sudo make modules_install -j$(nproc)
sudo depmod  # refresh the module dependency information

# Install the firmware package carrying the same 595.58.03 version string
apt install nvidia-firmware-595-595.58.03

# Download NVIDIA's local-repo package and install it; the next step reads a
# .deb from the /var/nvidia-driver-local-repo-ubuntu2404-595.58.03 directory.
wget https://developer.download.nvidia.com/compute/nvidia-driver/595.58.03/local_installers/nvidia-driver-local-repo-ubuntu2404-595.58.03_1.0-1_amd64.deb
dpkg -i nvidia-driver-local-repo-ubuntu2404-595.58.03_1.0-1_amd64.deb

# Unpack (not install) libnvidia-compute into the current directory; the copy
# commands below pick the files up from the resulting ./usr tree.
dpkg -X /var/nvidia-driver-local-repo-ubuntu2404-595.58.03/libnvidia-compute_595.58.03-1ubuntu1_amd64.deb .

# Copy the unpacked nvidia-smi binary and user-space libraries into the system paths
cp usr/bin/nvidia-smi /usr/bin/ -rf
cp usr/lib/x86_64-linux-gnu/* /usr/lib/x86_64-linux-gnu/ -ravf

# Clean up leftover files (the relative ./var and ./usr directories only;
# ./var is presumably also produced by the unpack step above)
rm var usr -rf

# Check that the driver answers and lists the GPUs
nvidia-smi

输出

bash 复制代码

Tue Jun 23 11:42:52 2026
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 595.58.03              Driver Version: 595.58.03      CUDA Version: 13.2     |
+-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|=========================================+========================+======================|
|   0  NVIDIA GeForce RTX 5090        Off |   00000000:81:00.0 Off |                  N/A |
| 46%   35C    P0             65W /  575W |       0MiB /  32607MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
|   1  NVIDIA GeForce RTX 5090        Off |   00000000:A1:00.0 Off |                  N/A |
| 46%   35C    P0             67W /  575W |       0MiB /  32607MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
|   2  NVIDIA GeForce RTX 5090        Off |   00000000:C1:00.0 Off |                  N/A |
| 46%   32C    P0             69W /  575W |       0MiB /  32607MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
|   3  NVIDIA GeForce RTX 5090        Off |   00000000:E1:00.0 Off |                  N/A |
| 46%   35C    P0             67W /  575W |       0MiB /  32607MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+

+-----------------------------------------------------------------------------------------+
| Processes:                                                                              |
|  GPU   GI   CI              PID   Type   Process name                        GPU Memory |
|        ID   ID                                                               Usage      |
|=========================================================================================|
|  No running processes found                                                             |
+-----------------------------------------------------------------------------------------+

1、创建容器

bash 复制代码

# Work from /data; it becomes /app inside the container via the -v mount below.
cd /data
# Stop and remove any container left over under the same name.
docker stop zj_nv_eval
docker rm zj_nv_eval
# Start a detached, privileged container on the host network with all GPUs
# exposed and 128g of shared memory, using /app as its working directory.
docker run --gpus all --shm-size=128g -itd -e NVIDIA_VISIBLE_DEVICES=all \
  --name zj_nv_eval --hostname zj_nv_eval --privileged --net=host \
  -v $PWD:/app -w /app nvcr.io/nvidia/pytorch:26.03-py3 /bin/bash
docker start zj_nv_eval
# Open an interactive shell in the container; the two commands after it are
# typed inside that shell and create/enter the evaluation work directory.
docker exec -ti zj_nv_eval bash
mkdir /app/zj_nv_eval -p
cd /app/zj_nv_eval

2、查看设备信息

bash 复制代码

root@zj_nv_eval:/app/zj_nv_eval# nvidia-smi
Tue Jun 23 11:47:31 2026
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 595.58.03              Driver Version: 595.58.03      CUDA Version: 13.2     |
+-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|=========================================+========================+======================|
|   0  NVIDIA GeForce RTX 5090        Off |   00000000:81:00.0 Off |                  N/A |
| 52%   34C    P0             62W /  575W |       0MiB /  32607MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
|   1  NVIDIA GeForce RTX 5090        Off |   00000000:A1:00.0 Off |                  N/A |
| 51%   34C    P0             65W /  575W |       0MiB /  32607MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
|   2  NVIDIA GeForce RTX 5090        Off |   00000000:C1:00.0 Off |                  N/A |
| 43%   32C    P0             61W /  575W |       0MiB /  32607MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
|   3  NVIDIA GeForce RTX 5090        Off |   00000000:E1:00.0 Off |                  N/A |
| 46%   34C    P0             64W /  575W |       0MiB /  32607MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+

+-----------------------------------------------------------------------------------------+
| Processes:                                                                              |
|  GPU   GI   CI              PID   Type   Process name                        GPU Memory |
|        ID   ID                                                               Usage      |
|=========================================================================================|
|  No running processes found                                                             |
+-----------------------------------------------------------------------------------------+


root@zj_nv_eval:/app/zj_nv_eval# nvidia-smi topo -m
        GPU0    GPU1    GPU2    GPU3    CPU Affinity    NUMA Affinity   GPU NUMA ID
GPU0     X      NODE    NODE    NODE    16-31,48-63     1               N/A
GPU1    NODE     X      NODE    NODE    16-31,48-63     1               N/A
GPU2    NODE    NODE     X      NODE    16-31,48-63     1               N/A
GPU3    NODE    NODE    NODE     X      16-31,48-63     1               N/A

Legend:

  X    = Self
  SYS  = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)
  NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node
  PHB  = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)
  PXB  = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)
  PIX  = Connection traversing at most a single PCIe bridge
  NV#  = Connection traversing a bonded set of # NVLinks

root@zj_nv_eval:/app/zj_nv_eval# nvidia-smi -i 0 -q

==============NVSMI LOG==============

Timestamp                                              : Tue Jun 23 11:47:54 2026
Driver Version                                         : 595.58.03
CUDA Version                                           : 13.2

Attached GPUs                                          : 4
GPU 00000000:81:00.0
    Product Name                                       : NVIDIA GeForce RTX 5090
    Product Brand                                      : GeForce
    Product Architecture                               : Blackwell
    Display Mode                                       : Requested functionality has been deprecated
    Display Attached                                   : No
    Display Active                                     : Disabled
    Persistence Mode                                   : Disabled
    Addressing Mode                                    : HMM
    MIG Mode
        Current                                        : N/A
        Pending                                        : N/A
    Accounting Mode                                    : Disabled
    Accounting Mode Buffer Size                        : 4000
    Driver Model
        Current                                        : N/A
        Pending                                        : N/A
    Serial Number                                      : 0
    GPU UUID                                           : GPU-af08fa67-ad9c-4121-67de-25d9b48552d3
    GPU PDI                                            : 0x33496b18ee1659d9
    Minor Number                                       : 2
    VBIOS Version                                      : 98.02.2E.80.4F
    MultiGPU Board                                     : No
    Board ID                                           : 0x8100
    Board Part Number                                  : N/A
    GPU Part Number                                    : 2B85-300-A1
    FRU Part Number                                    : N/A
    Platform Info
        Chassis Serial Number                          :
        Slot Number                                    : 0
        Tray Index                                     : 0
        Host ID                                        : 1
        Peer Type                                      : Direct Connected
        Module Id                                      : 1
        GPU Fabric GUID                                : 0x0000000000000000
    Inforom Version
        Image Version                                  : G005.0000.98.01
        OEM Object                                     : 2.1
        ECC Object                                     : N/A
        Power Management Object                        : N/A
    Inforom BBX Object Flush
        Latest Timestamp                               : N/A
        Latest Duration                                : N/A
    GPU Operation Mode
        Current                                        : N/A
        Pending                                        : N/A
    GPU C2C Mode                                       : Disabled
    GPU Virtualization Mode
        Virtualization Mode                            : None
        Host VGPU Mode                                 : N/A
        vGPU Heterogeneous Mode                        : N/A
    GPU Recovery Action                                : None
    GSP Firmware Version                               : 595.58.03
    IBMNPU
        Relaxed Ordering Mode                          : N/A
    PCI
        Bus                                            : 0x81
        Device                                         : 0x00
        Domain                                         : 0x0000
        Base Classcode                                 : 0x3
        Sub Classcode                                  : 0x0
        Device Id                                      : 0x2B8510DE
        Bus Id                                         : 00000000:81:00.0
        Sub System Id                                  : 0x205910DE
        GPU Link Info
            PCIe Generation
                Max                                    : 5
                Current                                : 5
                Device Current                         : 5
                Device Max                             : 5
                Host Max                               : 5
            Link Width
                Max                                    : 16x
                Current                                : 16x
        Bridge Chip
            Type                                       : N/A
            Firmware                                   : N/A
        Replays Since Reset                            : 0
        Replay Number Rollovers                        : 0
        Tx Throughput                                  : 708 KB/s
        Rx Throughput                                  : 558 KB/s
        Atomic Caps Outbound                           : N/A
        Atomic Caps Inbound                            : FETCHADD_32 FETCHADD_64 SWAP_32 SWAP_64 CAS_32 CAS_64
    Fan Speed                                          : 46 %
    Performance State                                  : P0
    Clocks Event Reasons
        Idle                                           : Not Active
        Applications Clocks Setting                    : Not Active
        SW Power Cap                                   : Not Active
        HW Slowdown                                    : Not Active
            HW Thermal Slowdown                        : Not Active
            HW Power Brake Slowdown                    : Not Active
        Sync Boost                                     : Not Active
        SW Thermal Slowdown                            : Not Active
        Display Clock Setting                          : Not Active
    Clocks Event Reasons Counters
        SW Power Capping                               : 218967 us
        Sync Boost                                     : 0 us
        SW Thermal Slowdown                            : 0 us
        HW Thermal Slowdown                            : 0 us
        HW Power Braking                               : 0 us
    Sparse Operation Mode                              : N/A
    FB Memory Usage
        Total                                          : 32607 MiB
        Reserved                                       : 497 MiB
        Used                                           : 0 MiB
        Free                                           : 32111 MiB
    BAR1 Memory Usage
        Total                                          : 32768 MiB
        Used                                           : 0 MiB
        Free                                           : 32768 MiB
    Conf Compute Protected Memory Usage
        Total                                          : 0 MiB
        Used                                           : 0 MiB
        Free                                           : 0 MiB
    Compute Mode                                       : Default
    Utilization
        GPU                                            : 0 %
        Memory                                         : 0 %
        Encoder                                        : 0 %
        Decoder                                        : 0 %
        JPEG                                           : 0 %
        OFA                                            : 0 %
    Encoder Stats
        Active Sessions                                : 0
        Average FPS                                    : 0
        Average Latency                                : 0
    FBC Stats
        Active Sessions                                : 0
        Average FPS                                    : 0
        Average Latency                                : 0
    DRAM Encryption Mode
        Current                                        : Disabled
        Pending                                        : Disabled
    ECC Mode
        Current                                        : N/A
        Pending                                        : N/A
    ECC Errors
        Volatile
            SRAM Correctable                           : N/A
            SRAM Uncorrectable Parity                  : N/A
            SRAM Uncorrectable SEC-DED                 : N/A
            DRAM Correctable                           : N/A
            DRAM Uncorrectable                         : N/A
        Aggregate
            SRAM Correctable                           : N/A
            SRAM Uncorrectable Parity                  : N/A
            SRAM Uncorrectable SEC-DED                 : N/A
            DRAM Correctable                           : N/A
            DRAM Uncorrectable                         : N/A
            SRAM Threshold Exceeded                    : N/A
        Aggregate Uncorrectable SRAM Sources
            SRAM L2                                    : N/A
            SRAM SM                                    : N/A
            SRAM Microcontroller                       : N/A
            SRAM PCIE                                  : N/A
            SRAM Other                                 : N/A
        Channel Repair Pending                         : No
        TPC Repair Pending                             : No
        Unrepairable Memory                            : N/A
    Retired Pages
        Single Bit ECC                                 : N/A
        Double Bit ECC                                 : N/A
        Pending Page Blacklist                         : N/A
    Remapped Rows                                      : N/A
    Temperature
        GPU Current Temp                               : 34 C
        GPU T.Limit Temp                               : 56 C
        GPU Shutdown T.Limit Temp                      : -5 C
        GPU Slowdown T.Limit Temp                      : -2 C
        GPU Max Operating T.Limit Temp                 : 0 C
        GPU Target Temperature                         : N/A
        Memory Current Temp                            : N/A
        Memory Max Operating T.Limit Temp              : N/A
    GPU Power Readings
        Average Power Draw                             : 65.25 W
        Instantaneous Power Draw                       : 61.69 W
        Current Power Limit                            : 575.00 W
        Requested Power Limit                          : 575.00 W
        Default Power Limit                            : 575.00 W
        Min Power Limit                                : 400.00 W
        Max Power Limit                                : 575.00 W
    GPU Memory Power Readings
        Average Power Draw                             : N/A
        Instantaneous Power Draw                       : N/A
    Module Power Readings
        Average Power Draw                             : N/A
        Instantaneous Power Draw                       : N/A
        Current Power Limit                            : N/A
        Requested Power Limit                          : N/A
        Default Power Limit                            : N/A
        Min Power Limit                                : N/A
        Max Power Limit                                : N/A
    Power Smoothing                                    : N/A
    Workload Power Profiles
        Requested Profiles                             : N/A
        Enforced Profiles                              : N/A
    EDPp Multiplier                                    : N/A
    Clocks
        Graphics                                       : 2400 MHz
        SM                                             : 2400 MHz
        Memory                                         : 14001 MHz
        Video                                          : 2077 MHz
    Applications Clocks
        Graphics                                       : Requested functionality has been deprecated
        Memory                                         : Requested functionality has been deprecated
    Default Applications Clocks
        Graphics                                       : Requested functionality has been deprecated
        Memory                                         : Requested functionality has been deprecated
    Deferred Clocks
        Memory                                         : N/A
    Max Clocks
        Graphics                                       : 3090 MHz
        SM                                             : 3090 MHz
        Memory                                         : 14001 MHz
        Video                                          : 3090 MHz
    Max Customer Boost Clocks
        Graphics                                       : N/A
    Clock Policy
        Auto Boost                                     : N/A
        Auto Boost Default                             : N/A
    Fabric
        State                                          : N/A
        Status                                         : N/A
        CliqueId                                       : N/A
        ClusterUUID                                    : N/A
        Health
            Summary                                    : N/A
            Bandwidth                                  : N/A
            Route Recovery in progress                 : N/A
            Route Unhealthy                            : N/A
            Access Timeout Recovery                    : N/A
            Incorrect Configuration                    : N/A
            Partition Assigned                         : N/A
    Processes                                          : None
    Capabilities
        EGM                                            : disabled

root@zj_nv_eval:/app/zj_nv_eval# nvidia-smi dmon -i 0,1 -s pucvmet -d 1
# gpu    pwr  gtemp  mtemp     sm    mem    enc    dec    jpg    ofa   mclk   pclk  pviol  tviol     fb   bar1   ccpm  sbecc  dbecc    pci  rxpci  txpci
# Idx      W      C      C      %      %      %      %      %      %    MHz    MHz      %   bool     MB     MB     MB   errs   errs   errs   MB/s   MB/s
    0     74     35      -      0      0      0      0      0      0  14001   2655      0      0      0  32122      0      -      -      0      0      0
    1     86     35      -      0      0      0      0      0      0  14001   2895      0      0      0  32122      0      -      -      0      0      0
    0     73     35      -      0      0      0      0      0      0  14001   2655      0      0      0  32122      0      -      -      0      0      0

3、验证是否支持 PCIe P2P

c 复制代码

# Write the CUDA source that follows to p2p_peer_validate.cu. The quoted 'EOF'
# delimiter makes the heredoc literal, so the shell expands nothing inside it.
cd /app/zj_nv_eval
cat > p2p_peer_validate.cu << 'EOF'
#include <cuda_runtime.h>

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

// Evaluates a CUDA runtime call exactly once. Any status other than
// cudaSuccess is reported on stderr (call site, error text, numeric code) and
// the process terminates with EXIT_FAILURE.
#define CHECK_CUDA(cmd)                                                        \
  do {                                                                         \
    const cudaError_t status__ = (cmd);                                        \
    if (status__ == cudaSuccess) break; /* leaves only this do/while */        \
    fprintf(stderr, "CUDA error at %s:%d: %s (%d)\n", __FILE__, __LINE__,      \
            cudaGetErrorString(status__), (int)status__);                      \
    exit(EXIT_FAILURE);                                                        \
  } while (0)

// Kernel: every thread whose global index lies below `count` stores
// seed XOR index (index truncated to 32 bits) into buf[index]; threads past
// the end do nothing.
__global__ void fill_pattern_kernel(uint32_t* buf, size_t count, uint32_t seed) {
  const size_t gid = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
  if (gid >= count) return;
  buf[gid] = seed ^ (uint32_t)gid;
}

// Kernel: element-wise copy of `count` 32-bit words from remote_src into
// local_dst, one element per thread; threads whose global index is not below
// `count` do nothing.
__global__ void remote_read_copy_kernel(const uint32_t* remote_src,
                                        uint32_t* local_dst,
                                        size_t count) {
  const size_t gid = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
  if (gid >= count) return;
  local_dst[gid] = remote_src[gid];
}

// Checks that host_buf[i] == seed ^ (uint32_t)i for every i in [0, count).
//
// On the first mismatch a message tagged with `label` is written to stderr
// (index, expected and actual word) and the process exits with EXIT_FAILURE.
// Returns normally when every element matches, including when count is 0.
static void verify_pattern(const uint32_t* host_buf,
                           size_t count,
                           uint32_t seed,
                           const char* label) {
  size_t pos = 0;
  while (pos < count) {
    const uint32_t want = seed ^ (uint32_t)pos;
    const uint32_t got = host_buf[pos];
    if (got != want) {
      fprintf(stderr,
              "%s verify failed at index %zu: expected 0x%08x got 0x%08x\n",
              label, pos, want, got);
      exit(EXIT_FAILURE);
    }
    ++pos;
  }
}

// Prints a one-line summary of CUDA device `dev`: its name, PCI bus ID and
// total global memory in GiB. Any CUDA failure aborts through CHECK_CUDA.
//
// The PCI ID is the string produced by cudaDeviceGetPCIBusId
// (domain:bus:device.function). The earlier code formatted the ID by hand and
// passed prop.pciDomainID where the format expected the function number.
static void print_device_summary(int dev) {
  cudaDeviceProp prop;
  char pci_bus_id[32];  // ample for "dddd:bb:dd.f" plus the terminator

  CHECK_CUDA(cudaGetDeviceProperties(&prop, dev));
  CHECK_CUDA(cudaDeviceGetPCIBusId(pci_bus_id, (int)sizeof(pci_bus_id), dev));
  printf("GPU %d: %s, PCI %s, totalGlobalMem %.1f GiB\n", dev, prop.name,
         pci_bus_id,
         (double)prop.totalGlobalMem / (1024.0 * 1024.0 * 1024.0));
}

// Verifies that src_dev can access dst_dev's memory and enables that access.
//
// Prints whether the src_dev -> dst_dev direction is supported. The process
// exits with EXIT_FAILURE when it is not supported, or when enabling fails for
// any reason other than the access already being enabled. On return src_dev
// is the current device.
static void enable_peer_or_die(int src_dev, int dst_dev) {
  int can_access = 0;
  CHECK_CUDA(cudaDeviceCanAccessPeer(&can_access, src_dev, dst_dev));
  printf("  GPU %d -> GPU %d peer access: %s\n", src_dev, dst_dev,
         can_access ? "supported" : "not supported");
  if (!can_access) exit(EXIT_FAILURE);

  CHECK_CUDA(cudaSetDevice(src_dev));
  cudaError_t err = cudaDeviceEnablePeerAccess(dst_dev, 0);
  if (err == cudaErrorPeerAccessAlreadyEnabled) {
    // Harmless here, but the runtime keeps it as this thread's last error.
    // Reset it so a later cudaGetLastError() check after a kernel launch does
    // not report this stale status as if the launch had failed.
    (void)cudaGetLastError();
  } else if (err != cudaSuccess) {
    fprintf(stderr, "Failed to enable peer access %d -> %d: %s\n", src_dev,
            dst_dev, cudaGetErrorString(err));
    exit(EXIT_FAILURE);
  }
}

// Validates a cudaMemcpyPeer transfer from src_dev to dst_dev.
//
// A buffer of `count` 32-bit words is filled on src_dev with a seed-derived
// pattern, copied into a zeroed buffer on dst_dev with cudaMemcpyPeer, read
// back to the host and compared word by word. Prints "PASS" on success; any
// CUDA error, allocation failure or mismatch terminates the process.
// On return src_dev is the current device.
static void test_memcpy_peer(int src_dev, int dst_dev, size_t count) {
  const int block_dim = 256;
  const int grid_dim = (int)((count + block_dim - 1) / block_dim);
  const size_t nbytes = count * sizeof(uint32_t);
  // The seed encodes the direction so each src/dst pair has its own pattern.
  const uint32_t pattern_seed =
      0x12340000u + (uint32_t)(src_dev * 0x100 + dst_dev);

  uint32_t* h_result = (uint32_t*)malloc(nbytes);
  if (!h_result) {
    fprintf(stderr, "malloc failed\n");
    exit(EXIT_FAILURE);
  }
  uint32_t* d_src = NULL;
  uint32_t* d_dst = NULL;

  // Source side: allocate and fill the pattern on src_dev.
  CHECK_CUDA(cudaSetDevice(src_dev));
  CHECK_CUDA(cudaMalloc(&d_src, nbytes));
  fill_pattern_kernel<<<grid_dim, block_dim>>>(d_src, count, pattern_seed);
  CHECK_CUDA(cudaGetLastError());
  CHECK_CUDA(cudaDeviceSynchronize());

  // Destination side: allocate on dst_dev and clear it, so that leftover data
  // cannot pass for a successful copy.
  CHECK_CUDA(cudaSetDevice(dst_dev));
  CHECK_CUDA(cudaMalloc(&d_dst, nbytes));
  CHECK_CUDA(cudaMemset(d_dst, 0, nbytes));

  printf("  [cudaMemcpyPeer] GPU %d -> GPU %d, bytes=%zu ... ", src_dev, dst_dev,
         nbytes);
  fflush(stdout);  // show the progress line even if a later step aborts

  CHECK_CUDA(cudaMemcpyPeer(d_dst, dst_dev, d_src, src_dev, nbytes));
  CHECK_CUDA(cudaMemcpy(h_result, d_dst, nbytes, cudaMemcpyDeviceToHost));
  verify_pattern(h_result, count, pattern_seed, "cudaMemcpyPeer");
  printf("PASS\n");

  // Each device buffer is freed with its owning device selected.
  free(h_result);
  CHECK_CUDA(cudaFree(d_dst));
  CHECK_CUDA(cudaSetDevice(src_dev));
  CHECK_CUDA(cudaFree(d_src));
}

// Validates direct kernel loads from a peer device's memory.
//
// A buffer of `count` 32-bit words is filled on src_dev; a kernel launched
// with dst_dev selected then reads that buffer through its device pointer and
// stores the words into a zeroed buffer on dst_dev, which is copied to the
// host and compared. Expects peer access from dst_dev to src_dev to have been
// enabled beforehand. Prints "PASS" on success; any CUDA error, allocation
// failure or mismatch terminates the process. On return src_dev is the
// current device.
static void test_remote_read(int src_dev, int dst_dev, size_t count) {
  const int block_dim = 256;
  const int grid_dim = (int)((count + block_dim - 1) / block_dim);
  const size_t nbytes = count * sizeof(uint32_t);
  // The seed encodes the direction so each src/dst pair has its own pattern.
  const uint32_t pattern_seed =
      0x56780000u + (uint32_t)(src_dev * 0x100 + dst_dev);

  uint32_t* h_result = (uint32_t*)malloc(nbytes);
  if (!h_result) {
    fprintf(stderr, "malloc failed\n");
    exit(EXIT_FAILURE);
  }
  uint32_t* d_peer = NULL;   // lives on src_dev, read remotely
  uint32_t* d_local = NULL;  // lives on dst_dev, written locally

  // Producer side: allocate and fill the pattern on src_dev.
  CHECK_CUDA(cudaSetDevice(src_dev));
  CHECK_CUDA(cudaMalloc(&d_peer, nbytes));
  fill_pattern_kernel<<<grid_dim, block_dim>>>(d_peer, count, pattern_seed);
  CHECK_CUDA(cudaGetLastError());
  CHECK_CUDA(cudaDeviceSynchronize());

  // Consumer side: zeroed landing buffer on dst_dev.
  CHECK_CUDA(cudaSetDevice(dst_dev));
  CHECK_CUDA(cudaMalloc(&d_local, nbytes));
  CHECK_CUDA(cudaMemset(d_local, 0, nbytes));

  printf("  [remote read kernel] GPU %d reads GPU %d, bytes=%zu ... ", dst_dev,
         src_dev, nbytes);
  fflush(stdout);  // show the progress line even if a later step aborts

  // Launched with dst_dev current, so the loads from d_peer cross devices.
  remote_read_copy_kernel<<<grid_dim, block_dim>>>(d_peer, d_local, count);
  CHECK_CUDA(cudaGetLastError());
  CHECK_CUDA(cudaDeviceSynchronize());
  CHECK_CUDA(cudaMemcpy(h_result, d_local, nbytes, cudaMemcpyDeviceToHost));
  verify_pattern(h_result, count, pattern_seed, "remote read");
  printf("PASS\n");

  // Each device buffer is freed with its owning device selected.
  free(h_result);
  CHECK_CUDA(cudaFree(d_local));
  CHECK_CUDA(cudaSetDevice(src_dev));
  CHECK_CUDA(cudaFree(d_peer));
}

// Entry point of the P2P validator.
//
// Usage: p2p_peer_validate [element_count]
//   element_count  number of 32-bit words per test buffer (default 1 << 20);
//                  a value that parses to 0 is rejected.
//
// Requires at least two CUDA devices and exercises only devices 0 and 1:
// peer access is enabled in both directions, then the cudaMemcpyPeer test and
// the remote-read kernel test each run in both directions. Returns
// EXIT_SUCCESS when everything passes; the helpers exit the process on error.
int main(int argc, char** argv) {
  // Both directions between device 0 and device 1, as {src, dst}.
  static const int kPairs[2][2] = {{0, 1}, {1, 0}};
  size_t count = 1 << 20;
  int device_count = 0;

  if (argc > 1) {
    count = strtoull(argv[1], NULL, 10);
    if (!count) {
      fprintf(stderr, "Invalid element count: %s\n", argv[1]);
      return EXIT_FAILURE;
    }
  }

  CHECK_CUDA(cudaGetDeviceCount(&device_count));
  if (device_count < 2) {
    fprintf(stderr, "Need at least 2 CUDA devices, found %d\n", device_count);
    return EXIT_FAILURE;
  }

  printf("Found %d CUDA devices, validating first 2 GPUs.\n", device_count);
  for (int dev = 0; dev < 2; ++dev) {
    print_device_summary(dev);
  }
  printf("Elements: %zu, bytes per buffer: %zu\n", count,
         count * sizeof(uint32_t));

  for (int p = 0; p < 2; ++p) {
    enable_peer_or_die(kPairs[p][0], kPairs[p][1]);
  }
  for (int p = 0; p < 2; ++p) {
    test_memcpy_peer(kPairs[p][0], kPairs[p][1], count);
  }
  for (int p = 0; p < 2; ++p) {
    test_remote_read(kPairs[p][0], kPairs[p][1], count);
  }

  printf("All P2P data-path tests passed.\n");
  return EXIT_SUCCESS;
}
EOF
# Compile the validator with nvcc's default settings, then run it with the
# default element count (no argument given).
nvcc -o p2p_peer_validate p2p_peer_validate.cu
./p2p_peer_validate

输出

bash 复制代码

Found 4 CUDA devices, validating first 2 GPUs.
GPU 0: NVIDIA GeForce RTX 5090, PCI 81:00.0, totalGlobalMem 31.4 GiB
GPU 1: NVIDIA GeForce RTX 5090, PCI a1:00.0, totalGlobalMem 31.4 GiB
Elements: 1048576, bytes per buffer: 4194304
  GPU 0 -> GPU 1 peer access: supported
  GPU 1 -> GPU 0 peer access: supported
  [cudaMemcpyPeer] GPU 0 -> GPU 1, bytes=4194304 ... PASS
  [cudaMemcpyPeer] GPU 1 -> GPU 0, bytes=4194304 ... PASS
  [remote read kernel] GPU 1 reads GPU 0, bytes=4194304 ... PASS
  [remote read kernel] GPU 0 reads GPU 1, bytes=4194304 ... PASS
All P2P data-path tests passed.

4、测试 PCIe P2P 带宽

bash 复制代码

# Write the CUDA source that follows to gpu_bandwidth.cu. The quoted 'EOF'
# delimiter makes the heredoc literal, so the shell expands nothing inside it.
cd /app/zj_nv_eval
cat > gpu_bandwidth.cu << 'EOF'
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

// checkCudaErrors(call): forwards the status of a CUDA runtime call together
// with the call site's file and line to __checkCudaErrors.
#define checkCudaErrors(err) __checkCudaErrors(err, __FILE__, __LINE__)

// Does nothing for cudaSuccess. For any other status, prints the error text
// and the originating file:line to stderr and exits with EXIT_FAILURE.
inline void __checkCudaErrors(cudaError_t err, const char *file, const int line) {
    if (err == cudaSuccess) {
        return;
    }
    fprintf(stderr, "CUDA error: %s at %s:%d\n", cudaGetErrorString(err), file, line);
    exit(EXIT_FAILURE);
}

// Enables peer access from device `from` to device `to`, treating
// "already enabled" as success.
//
// Selects `from` as the current device (it stays current on return). A failure
// to select the device, or any enable error other than
// cudaErrorPeerAccessAlreadyEnabled, is reported on stderr and terminates the
// process with EXIT_FAILURE.
void safeEnablePeerAccess(int from, int to) {
    // Previously unchecked: a failed cudaSetDevice would have left the enable
    // call below running against whichever device happened to be current.
    checkCudaErrors(cudaSetDevice(from));
    cudaError_t err = cudaDeviceEnablePeerAccess(to, 0);
    if (err == cudaErrorPeerAccessAlreadyEnabled) {
        // Tolerated, but the runtime still records it as the last error;
        // reset it so a later cudaGetLastError() does not pick it up.
        (void)cudaGetLastError();
        return;
    }
    if (err != cudaSuccess) {
        fprintf(stderr, "Error enabling peer access from %d to %d: %s\n", from, to, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
}

// Measures device-to-device copy throughput from srcGpu to dstGpu.
//
// Parameters:
//   srcGpu, dstGpu  CUDA device ordinals of the source and destination.
//   totalSize       number of bytes moved in the timed transfer.
//   numStreams      number of streams to split the transfer across; must be
//                   at least 1. When totalSize is not an exact multiple of
//                   numStreams the whole transfer runs on a single stream.
//
// Returns the measured rate as totalSize / elapsed seconds / 1024^3.
// Any CUDA error, an invalid numStreams or a failed host allocation prints a
// message to stderr and terminates the process. dstGpu is the current device
// on return.
//
// Fixes over the previous version: only the streams that are actually used
// are created, so none are leaked when the single-stream fallback is taken
// (the old code overwrote numStreams after creating the streams and then
// destroyed just one); numStreams < 1 is rejected instead of dividing by
// zero; the host allocation for the stream array is checked.
double testBandwidthAsync(int srcGpu, int dstGpu, size_t totalSize, int numStreams) {
    if (numStreams < 1) {
        fprintf(stderr, "numStreams must be at least 1, got %d\n", numStreams);
        exit(EXIT_FAILURE);
    }

    // ---------- Allocate device buffers ----------
    checkCudaErrors(cudaSetDevice(srcGpu));
    void* srcPtr = nullptr;
    checkCudaErrors(cudaMalloc(&srcPtr, totalSize));
    checkCudaErrors(cudaMemset(srcPtr, 0xAA, totalSize));

    checkCudaErrors(cudaSetDevice(dstGpu));
    void* dstPtr = nullptr;
    checkCudaErrors(cudaMalloc(&dstPtr, totalSize));

    // ---------- Enable P2P in both directions when available ----------
    int canAccess = 0;
    checkCudaErrors(cudaDeviceCanAccessPeer(&canAccess, srcGpu, dstGpu));
    if (canAccess) {
        safeEnablePeerAccess(srcGpu, dstGpu);
        safeEnablePeerAccess(dstGpu, srcGpu);
    } else {
        fprintf(stderr, "Warning: P2P not supported, may fallback to PCIe\n");
    }

    // ---------- Decide the chunking before creating any stream ----------
    size_t chunkSize = totalSize / numStreams;
    int activeStreams = numStreams;
    if (chunkSize * numStreams != totalSize) {
        // Not evenly divisible: keep it simple and send everything on one stream.
        chunkSize = totalSize;
        activeStreams = 1;
    }

    // ---------- Create the streams with srcGpu current ----------
    checkCudaErrors(cudaSetDevice(srcGpu));
    cudaStream_t *streams = (cudaStream_t*)malloc((size_t)activeStreams * sizeof(cudaStream_t));
    if (streams == NULL) {
        fprintf(stderr, "malloc failed for %d streams\n", activeStreams);
        exit(EXIT_FAILURE);
    }
    for (int i = 0; i < activeStreams; ++i) {
        checkCudaErrors(cudaStreamCreate(&streams[i]));
    }

    // ---------- Create the timing events (srcGpu still current) ----------
    cudaEvent_t start, stop;
    checkCudaErrors(cudaEventCreate(&start));
    checkCudaErrors(cudaEventCreate(&stop));

    // ---------- Warm-up: one untimed full-size copy ----------
    checkCudaErrors(cudaMemcpyPeerAsync(dstPtr, dstGpu, srcPtr, srcGpu, totalSize, streams[0]));
    checkCudaErrors(cudaStreamSynchronize(streams[0]));

    // ---------- Timed transfer ----------
    // The start/stop events are recorded on the default stream (stream 0)
    // while srcGpu is the current device.
    checkCudaErrors(cudaEventRecord(start, 0));

    if (activeStreams == 1) {
        // Whole buffer on a single stream.
        checkCudaErrors(cudaMemcpyPeerAsync(dstPtr, dstGpu, srcPtr, srcGpu, totalSize, streams[0]));
    } else {
        // One equally sized chunk per stream.
        for (int i = 0; i < activeStreams; ++i) {
            void* srcOffset = (char*)srcPtr + (size_t)i * chunkSize;
            void* dstOffset = (char*)dstPtr + (size_t)i * chunkSize;
            checkCudaErrors(cudaMemcpyPeerAsync(dstOffset, dstGpu, srcOffset, srcGpu, chunkSize, streams[i]));
        }
    }

    // Wait for every stream before recording the stop event.
    for (int i = 0; i < activeStreams; ++i) {
        checkCudaErrors(cudaStreamSynchronize(streams[i]));
    }

    checkCudaErrors(cudaEventRecord(stop, 0));
    checkCudaErrors(cudaEventSynchronize(stop));

    float elapsed_ms = 0.0f;
    checkCudaErrors(cudaEventElapsedTime(&elapsed_ms, start, stop));
    double bandwidth_GBps = ((double)totalSize / (elapsed_ms / 1000.0)) / (1024.0 * 1024.0 * 1024.0);

    // ---------- Cleanup ----------
    // Streams and events are released while srcGpu (where they were created)
    // is still current; every created stream is destroyed.
    for (int i = 0; i < activeStreams; ++i) checkCudaErrors(cudaStreamDestroy(streams[i]));
    free(streams);
    checkCudaErrors(cudaEventDestroy(start));
    checkCudaErrors(cudaEventDestroy(stop));
    checkCudaErrors(cudaFree(srcPtr));
    checkCudaErrors(cudaSetDevice(dstGpu));
    checkCudaErrors(cudaFree(dstPtr));

    // Peer access is intentionally left enabled for subsequent calls.

    return bandwidth_GBps;
}

/*
 * Entry point: measures GPU 0 -> GPU 1 peer-copy bandwidth with 1, 2, 4 and 8
 * streams and prints one table row per stream count.
 *
 * Returns EXIT_FAILURE when fewer than two CUDA devices are present,
 * 0 otherwise.
 */
int main() {
    int gpuCount = 0;
    checkCudaErrors(cudaGetDeviceCount(&gpuCount));
    if (gpuCount < 2) {
        fprintf(stderr, "Need at least 2 GPUs\n");
        return EXIT_FAILURE;
    }

    int srcGpu = 0;
    int dstGpu = 1;
    size_t totalSize = 1024ULL * 1024 * 1024;  // 1 GiB moved per measurement

    // Banner and table header.
    printf("Testing P2P bandwidth (async, multi-stream) from GPU %d to GPU %d\n", srcGpu, dstGpu);
    printf("Transfer size: %zu MB\n", totalSize / (1024*1024));
    printf("Streams\tBandwidth (GB/s)\t%% of 64 GB/s\n");
    printf("------------------------------------------------\n");

    // Stream counts 1, 2, 4, 8 expressed as successive powers of two.
    for (int exponent = 0; exponent < 4; ++exponent) {
        int streamCount = 1 << exponent;
        double gbPerSec = testBandwidthAsync(srcGpu, dstGpu, totalSize, streamCount);
        double percentOfRef = (gbPerSec / 64.0) * 100.0;  // relative to the 64 GB/s reference in the header
        printf("%d\t%.2f\t\t%.1f%%\n", streamCount, gbPerSec, percentOfRef);
        fflush(stdout);  // emit each row as soon as its measurement finishes
    }

    return 0;
}
EOF
# Compile the benchmark and run it only if compilation succeeded, so a failed
# build can never silently execute a stale ./gpu_bandwidth binary.
nvcc -o gpu_bandwidth gpu_bandwidth.cu && ./gpu_bandwidth

输出

bash 复制代码

Testing P2P bandwidth (async, multi-stream) from GPU 0 to GPU 1
Transfer size: 1024 MB
Streams Bandwidth (GB/s)        % of 64 GB/s
------------------------------------------------
1       52.43           81.9%
2       52.48           82.0%
4       52.46           82.0%
8       52.48           82.0%

5、更新 NCCL

bash 复制代码

# Rebuild NCCL v2.29.7-1 from source and replace the system-wide libnccl.
cd /app/zj_nv_eval
git clone https://github.com/NVIDIA/nccl.git
cd nccl
git checkout v2.29.7-1
make clean
# Cap parallelism at the core count: a bare -j places no limit on concurrent
# compiler jobs and can exhaust host memory.
# The installed library is only removed and replaced once the build has
# succeeded, so a failed build never leaves the system without libnccl.
make -j"$(nproc)" \
  && rm -rf /usr/lib/x86_64-linux-gnu/libnccl.so* \
  && cp build/lib/libnccl* /usr/lib/x86_64-linux-gnu/ -rvf

6、nccl-tests性能测试

编译

bash 复制代码

# Build the NCCL benchmark suite (nccl-tests) under the evaluation directory.
cd /app/zj_nv_eval
git clone https://github.com/NVIDIA/nccl-tests.git
cd nccl-tests
# Cap parallelism at the core count; a bare -j places no limit on concurrent
# compiler jobs and can exhaust host memory.
# NOTE(review): no NCCL_HOME is passed here, and the recorded runs report
# nccl-headers=22501 with nccl-library=22907, i.e. the tests were compiled
# against older headers than the library they load. Confirm that is intended.
make -j"$(nproc)"

测试all_reduce_perf

bash 复制代码

# 4-GPU all-reduce benchmark restricted to direct GPU peer-to-peer traffic.
# Transport selection: P2P level SYS, with the shared-memory and InfiniBand
# transports disabled, no external network plugin, sockets bound to loopback.
export NCCL_P2P_LEVEL=SYS NCCL_SHM_DISABLE=1 NCCL_IB_DISABLE=1
export NCCL_NET_PLUGIN=none NCCL_SOCKET_IFNAME=lo
# Pin the collective to the Ring algorithm with the Simple protocol.
export NCCL_PROTO=Simple NCCL_ALGO=Ring
# -b/-e: smallest/largest message size, -f: size multiplier between steps,
# -g: number of GPUs, -n: timed iterations per size.
./build/all_reduce_perf -b 128M -e 2G -f 2 -g 4 -n 20

输出

bash 复制代码

# nccl-tests version 2.19.0 nccl-headers=22501 nccl-library=22907
# Collective test starting: all_reduce_perf
# nThread 1 nGpus 4 minBytes 134217728 maxBytes 2147483648 step: 2(factor) warmup iters: 1 iters: 20 agg iters: 1 validation: 1 graph: 0 unalign: 0
#
# Using devices
#  Rank  0 Group  0 Pid 121900 on zj_nv_eval device  0 [0000:81:00] NVIDIA GeForce RTX 5090
#  Rank  1 Group  0 Pid 121900 on zj_nv_eval device  1 [0000:a1:00] NVIDIA GeForce RTX 5090
#  Rank  2 Group  0 Pid 121900 on zj_nv_eval device  2 [0000:c1:00] NVIDIA GeForce RTX 5090
#  Rank  3 Group  0 Pid 121900 on zj_nv_eval device  3 [0000:e1:00] NVIDIA GeForce RTX 5090
#
#                                                              out-of-place                       in-place
#       size         count      type   redop    root     time   algbw   busbw  #wrong     time   algbw   busbw  #wrong
#        (B)    (elements)                               (us)  (GB/s)  (GB/s)             (us)  (GB/s)  (GB/s)
   134217728      33554432     float     sum      -1  4230.16   31.73   47.59       0  4229.29   31.74   47.60       0
   268435456      67108864     float     sum      -1  8405.86   31.93   47.90       0  8391.48   31.99   47.98       0
   536870912     134217728     float     sum      -1  16707.1   32.13   48.20       0  16704.3   32.14   48.21       0
  1073741824     268435456     float     sum      -1  33124.4   32.42   48.62       0  33229.4   32.31   48.47       0
  2147483648     536870912     float     sum      -1  66342.0   32.37   48.55       0  66016.1   32.53   48.79       0
# Out of bounds values : 0 OK
# Avg bus bandwidth    : 48.1934
#
# Collective test concluded: all_reduce_perf
#

测试sendrecv_perf

bash 复制代码

# Point-to-point send/recv benchmark between two GPUs over direct P2P.
export NCCL_P2P_DISABLE=0
# Use the named level instead of the legacy integer 6: NCCL documents integer
# levels as discouraged, and values above 4 are interpreted as SYS anyway.
# This also matches the value used for the all_reduce_perf run.
export NCCL_P2P_LEVEL=SYS
export NCCL_SHM_DISABLE=1          # disable SHM so traffic is forced onto P2P
export NCCL_IB_DISABLE=1
export NCCL_NET_PLUGIN=none
export NCCL_SOCKET_IFNAME=lo
# Single 128M message size (-b equals -e), 2 GPUs, 10 timed iterations.
./build/sendrecv_perf -b 128M -e 128M -g 2 -n 10

输出

bash 复制代码

# nccl-tests version 2.19.0 nccl-headers=22501 nccl-library=22907
# Collective test starting: sendrecv_perf
# nThread 1 nGpus 2 minBytes 134217728 maxBytes 134217728 step: 1048576(bytes) warmup iters: 1 iters: 10 agg iters: 1 validation: 1 graph: 0 unalign: 0
#
# Using devices
#  Rank  0 Group  0 Pid 122314 on zj_nv_eval device  0 [0000:81:00] NVIDIA GeForce RTX 5090
#  Rank  1 Group  0 Pid 122314 on zj_nv_eval device  1 [0000:a1:00] NVIDIA GeForce RTX 5090
#
#                                                              out-of-place                       in-place
#       size         count      type   redop    root     time   algbw   busbw  #wrong     time   algbw   busbw  #wrong
#        (B)    (elements)                               (us)  (GB/s)  (GB/s)             (us)  (GB/s)  (GB/s)
   134217728      33554432     float     sum      -1  3054.01   43.95   43.95       0  3053.15   43.96   43.96    N/A
# Out of bounds values : 0 OK
# Avg bus bandwidth    : 43.9542
#
# Collective test concluded: sendrecv_perf
#

二、DCGM 测试

bash 复制代码

# Pull and start the DCGM container. One image reference is shared by both
# commands so the image that is pre-pulled is the one that actually runs
# (previously the pull fetched tag 4.1.1-1 while the run used 4.5.2-1).
DCGM_IMAGE=swr.cn-north-4.myhuaweicloud.com/ddn-k8s/docker.io/nvidia/dcgm:4.5.2-1-ubuntu22.04
docker pull "$DCGM_IMAGE"
# Interactive bash shell in the container; the commands that follow are
# typed inside it.
docker run --gpus all --shm-size=128g -it --entrypoint /bin/bash -e NVIDIA_VISIBLE_DEVICES=all  \
    --rm --privileged --net=host "$DCGM_IMAGE"

# Write a DCGM diagnostic SKU entry (spec dcgm-diag-v1) for the RTX 5090 to
# config.yaml. The heredoc delimiter is quoted, so the body is written
# verbatim with no shell expansion.
# NOTE(review): the version field holds the literal placeholder
# "@CMAKE_PROJECT_VERSION@", presumably copied from an unprocessed CMake
# template; confirm dcgmi accepts it unsubstituted.
cat > config.yaml << 'EOF'
version: "@CMAKE_PROJECT_VERSION@"
spec: dcgm-diag-v1
skus:
  - name: NVIDIA GeForce RTX 5090
    id: 2b85
    targeted_power:
      is_allowed: true
      starting_matrix_dim: 1024.0
      target_power: 575.0
      use_dgemm: false
    targeted_stress:
      is_allowed: true
      use_dgemm: false
      target_stress: 100
    sm_stress:
      is_allowed: true
      target_stress: 100.0
      use_dgemm: false
    nvbandwidth:
      is_allowed: true
    pcie:
      is_allowed: true
      h2d_d2h_single_pinned:
        min_pci_generation: 5.0
        min_pci_width: 16.0
      h2d_d2h_single_unpinned:
        min_pci_generation: 5.0
        min_pci_width: 16.0
    memory:
      is_allowed: true
      l1cache_size_kb_per_sm: 192.0
    diagnostic:
      is_allowed: true
      matrix_dim: 8192.0
    memory_bandwidth:
      is_allowed: true
      minimum_bandwidth: 1230000
    pulse_test:
      is_allowed: true
EOF

# Run the diagnostic with config.yaml and report wall-clock time via the
# shell's time keyword.
# Presumably -r 4 selects run level 4, -i 0 targets GPU 0 and -j requests
# JSON output; confirm against the dcgmi diag usage for your DCGM version.
time dcgmi diag -r 4 -i 0 -c config.yaml  -j

输出

bash 复制代码

real    31m45.237s
user    14m11.761s
sys     0m12.611s