RTX-5090 基础测试
一、环境搭建
1、安装支持PCIE P2P的驱动
bash
# Build and install the P2P-enabled open kernel modules, then overlay the
# matching 595.58.03 user-space pieces (firmware, nvidia-smi, compute libs).
git clone https://github.com/aikitoria/open-gpu-kernel-modules.git
cd open-gpu-kernel-modules
git checkout remotes/origin/595.58.03-p2p
# Unload the NVIDIA modules that are currently loaded (if any)
sudo rmmod nvidia_drm nvidia_modeset nvidia_uvm nvidia
# Build the kernel modules (-j"$(nproc)" uses every CPU core)
make modules -j"$(nproc)"
# Install into the system
sudo make modules_install -j"$(nproc)"
sudo depmod # refresh module dependency information
# Everything below writes to system locations, so it needs root just like
# the module install above.
sudo apt install nvidia-firmware-595-595.58.03
wget https://developer.download.nvidia.com/compute/nvidia-driver/595.58.03/local_installers/nvidia-driver-local-repo-ubuntu2404-595.58.03_1.0-1_amd64.deb
sudo dpkg -i nvidia-driver-local-repo-ubuntu2404-595.58.03_1.0-1_amd64.deb
# Unpack only the libnvidia-compute package into the current directory.
# Extracting as root keeps the files root-owned, which matters because the
# archive-mode copy below preserves ownership.
sudo dpkg -X /var/nvidia-driver-local-repo-ubuntu2404-595.58.03/libnvidia-compute_595.58.03-1ubuntu1_amd64.deb .
sudo cp -rf usr/bin/nvidia-smi /usr/bin/
sudo cp -ravf usr/lib/x86_64-linux-gnu/* /usr/lib/x86_64-linux-gnu/
# Remove the scratch directories left by the extraction
sudo rm -rf var usr
nvidia-smi
输出
bash
Tue Jun 23 11:42:52 2026
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 595.58.03 Driver Version: 595.58.03 CUDA Version: 13.2 |
+-----------------------------------------+------------------------+----------------------+
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|=========================================+========================+======================|
| 0 NVIDIA GeForce RTX 5090 Off | 00000000:81:00.0 Off | N/A |
| 46% 35C P0 65W / 575W | 0MiB / 32607MiB | 0% Default |
| | | N/A |
+-----------------------------------------+------------------------+----------------------+
| 1 NVIDIA GeForce RTX 5090 Off | 00000000:A1:00.0 Off | N/A |
| 46% 35C P0 67W / 575W | 0MiB / 32607MiB | 0% Default |
| | | N/A |
+-----------------------------------------+------------------------+----------------------+
| 2 NVIDIA GeForce RTX 5090 Off | 00000000:C1:00.0 Off | N/A |
| 46% 32C P0 69W / 575W | 0MiB / 32607MiB | 0% Default |
| | | N/A |
+-----------------------------------------+------------------------+----------------------+
| 3 NVIDIA GeForce RTX 5090 Off | 00000000:E1:00.0 Off | N/A |
| 46% 35C P0 67W / 575W | 0MiB / 32607MiB | 0% Default |
| | | N/A |
+-----------------------------------------+------------------------+----------------------+
+-----------------------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=========================================================================================|
| No running processes found |
+-----------------------------------------------------------------------------------------+
1、创建容器
bash
cd /data
# Remove any container left over from a previous run; both commands simply
# report an error when no such container exists.
docker stop zj_nv_eval
docker rm zj_nv_eval
# Start a detached, privileged container with all GPUs and mount the current
# directory at /app. "$PWD" is quoted so a path with spaces stays one argument.
docker run --gpus all --shm-size=128g -itd -e NVIDIA_VISIBLE_DEVICES=all \
  --name zj_nv_eval --hostname zj_nv_eval --privileged --net=host \
  -v "$PWD":/app -w /app nvcr.io/nvidia/pytorch:26.03-py3 /bin/bash
docker start zj_nv_eval
docker exec -ti zj_nv_eval bash
# The remaining commands are typed into the container shell opened above.
mkdir -p /app/zj_nv_eval
cd /app/zj_nv_eval
2、查看设备信息
bash
root@zj_nv_eval:/app/zj_nv_eval# nvidia-smi
Tue Jun 23 11:47:31 2026
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 595.58.03 Driver Version: 595.58.03 CUDA Version: 13.2 |
+-----------------------------------------+------------------------+----------------------+
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|=========================================+========================+======================|
| 0 NVIDIA GeForce RTX 5090 Off | 00000000:81:00.0 Off | N/A |
| 52% 34C P0 62W / 575W | 0MiB / 32607MiB | 0% Default |
| | | N/A |
+-----------------------------------------+------------------------+----------------------+
| 1 NVIDIA GeForce RTX 5090 Off | 00000000:A1:00.0 Off | N/A |
| 51% 34C P0 65W / 575W | 0MiB / 32607MiB | 0% Default |
| | | N/A |
+-----------------------------------------+------------------------+----------------------+
| 2 NVIDIA GeForce RTX 5090 Off | 00000000:C1:00.0 Off | N/A |
| 43% 32C P0 61W / 575W | 0MiB / 32607MiB | 0% Default |
| | | N/A |
+-----------------------------------------+------------------------+----------------------+
| 3 NVIDIA GeForce RTX 5090 Off | 00000000:E1:00.0 Off | N/A |
| 46% 34C P0 64W / 575W | 0MiB / 32607MiB | 0% Default |
| | | N/A |
+-----------------------------------------+------------------------+----------------------+
+-----------------------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=========================================================================================|
| No running processes found |
+-----------------------------------------------------------------------------------------+
root@zj_nv_eval:/app/zj_nv_eval# nvidia-smi topo -m
GPU0 GPU1 GPU2 GPU3 CPU Affinity NUMA Affinity GPU NUMA ID
GPU0 X NODE NODE NODE 16-31,48-63 1 N/A
GPU1 NODE X NODE NODE 16-31,48-63 1 N/A
GPU2 NODE NODE X NODE 16-31,48-63 1 N/A
GPU3 NODE NODE NODE X 16-31,48-63 1 N/A
Legend:
X = Self
SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)
NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node
PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)
PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)
PIX = Connection traversing at most a single PCIe bridge
NV# = Connection traversing a bonded set of # NVLinks
root@zj_nv_eval:/app/zj_nv_eval# nvidia-smi -i 0 -q
==============NVSMI LOG==============
Timestamp : Tue Jun 23 11:47:54 2026
Driver Version : 595.58.03
CUDA Version : 13.2
Attached GPUs : 4
GPU 00000000:81:00.0
Product Name : NVIDIA GeForce RTX 5090
Product Brand : GeForce
Product Architecture : Blackwell
Display Mode : Requested functionality has been deprecated
Display Attached : No
Display Active : Disabled
Persistence Mode : Disabled
Addressing Mode : HMM
MIG Mode
Current : N/A
Pending : N/A
Accounting Mode : Disabled
Accounting Mode Buffer Size : 4000
Driver Model
Current : N/A
Pending : N/A
Serial Number : 0
GPU UUID : GPU-af08fa67-ad9c-4121-67de-25d9b48552d3
GPU PDI : 0x33496b18ee1659d9
Minor Number : 2
VBIOS Version : 98.02.2E.80.4F
MultiGPU Board : No
Board ID : 0x8100
Board Part Number : N/A
GPU Part Number : 2B85-300-A1
FRU Part Number : N/A
Platform Info
Chassis Serial Number :
Slot Number : 0
Tray Index : 0
Host ID : 1
Peer Type : Direct Connected
Module Id : 1
GPU Fabric GUID : 0x0000000000000000
Inforom Version
Image Version : G005.0000.98.01
OEM Object : 2.1
ECC Object : N/A
Power Management Object : N/A
Inforom BBX Object Flush
Latest Timestamp : N/A
Latest Duration : N/A
GPU Operation Mode
Current : N/A
Pending : N/A
GPU C2C Mode : Disabled
GPU Virtualization Mode
Virtualization Mode : None
Host VGPU Mode : N/A
vGPU Heterogeneous Mode : N/A
GPU Recovery Action : None
GSP Firmware Version : 595.58.03
IBMNPU
Relaxed Ordering Mode : N/A
PCI
Bus : 0x81
Device : 0x00
Domain : 0x0000
Base Classcode : 0x3
Sub Classcode : 0x0
Device Id : 0x2B8510DE
Bus Id : 00000000:81:00.0
Sub System Id : 0x205910DE
GPU Link Info
PCIe Generation
Max : 5
Current : 5
Device Current : 5
Device Max : 5
Host Max : 5
Link Width
Max : 16x
Current : 16x
Bridge Chip
Type : N/A
Firmware : N/A
Replays Since Reset : 0
Replay Number Rollovers : 0
Tx Throughput : 708 KB/s
Rx Throughput : 558 KB/s
Atomic Caps Outbound : N/A
Atomic Caps Inbound : FETCHADD_32 FETCHADD_64 SWAP_32 SWAP_64 CAS_32 CAS_64
Fan Speed : 46 %
Performance State : P0
Clocks Event Reasons
Idle : Not Active
Applications Clocks Setting : Not Active
SW Power Cap : Not Active
HW Slowdown : Not Active
HW Thermal Slowdown : Not Active
HW Power Brake Slowdown : Not Active
Sync Boost : Not Active
SW Thermal Slowdown : Not Active
Display Clock Setting : Not Active
Clocks Event Reasons Counters
SW Power Capping : 218967 us
Sync Boost : 0 us
SW Thermal Slowdown : 0 us
HW Thermal Slowdown : 0 us
HW Power Braking : 0 us
Sparse Operation Mode : N/A
FB Memory Usage
Total : 32607 MiB
Reserved : 497 MiB
Used : 0 MiB
Free : 32111 MiB
BAR1 Memory Usage
Total : 32768 MiB
Used : 0 MiB
Free : 32768 MiB
Conf Compute Protected Memory Usage
Total : 0 MiB
Used : 0 MiB
Free : 0 MiB
Compute Mode : Default
Utilization
GPU : 0 %
Memory : 0 %
Encoder : 0 %
Decoder : 0 %
JPEG : 0 %
OFA : 0 %
Encoder Stats
Active Sessions : 0
Average FPS : 0
Average Latency : 0
FBC Stats
Active Sessions : 0
Average FPS : 0
Average Latency : 0
DRAM Encryption Mode
Current : Disabled
Pending : Disabled
ECC Mode
Current : N/A
Pending : N/A
ECC Errors
Volatile
SRAM Correctable : N/A
SRAM Uncorrectable Parity : N/A
SRAM Uncorrectable SEC-DED : N/A
DRAM Correctable : N/A
DRAM Uncorrectable : N/A
Aggregate
SRAM Correctable : N/A
SRAM Uncorrectable Parity : N/A
SRAM Uncorrectable SEC-DED : N/A
DRAM Correctable : N/A
DRAM Uncorrectable : N/A
SRAM Threshold Exceeded : N/A
Aggregate Uncorrectable SRAM Sources
SRAM L2 : N/A
SRAM SM : N/A
SRAM Microcontroller : N/A
SRAM PCIE : N/A
SRAM Other : N/A
Channel Repair Pending : No
TPC Repair Pending : No
Unrepairable Memory : N/A
Retired Pages
Single Bit ECC : N/A
Double Bit ECC : N/A
Pending Page Blacklist : N/A
Remapped Rows : N/A
Temperature
GPU Current Temp : 34 C
GPU T.Limit Temp : 56 C
GPU Shutdown T.Limit Temp : -5 C
GPU Slowdown T.Limit Temp : -2 C
GPU Max Operating T.Limit Temp : 0 C
GPU Target Temperature : N/A
Memory Current Temp : N/A
Memory Max Operating T.Limit Temp : N/A
GPU Power Readings
Average Power Draw : 65.25 W
Instantaneous Power Draw : 61.69 W
Current Power Limit : 575.00 W
Requested Power Limit : 575.00 W
Default Power Limit : 575.00 W
Min Power Limit : 400.00 W
Max Power Limit : 575.00 W
GPU Memory Power Readings
Average Power Draw : N/A
Instantaneous Power Draw : N/A
Module Power Readings
Average Power Draw : N/A
Instantaneous Power Draw : N/A
Current Power Limit : N/A
Requested Power Limit : N/A
Default Power Limit : N/A
Min Power Limit : N/A
Max Power Limit : N/A
Power Smoothing : N/A
Workload Power Profiles
Requested Profiles : N/A
Enforced Profiles : N/A
EDPp Multiplier : N/A
Clocks
Graphics : 2400 MHz
SM : 2400 MHz
Memory : 14001 MHz
Video : 2077 MHz
Applications Clocks
Graphics : Requested functionality has been deprecated
Memory : Requested functionality has been deprecated
Default Applications Clocks
Graphics : Requested functionality has been deprecated
Memory : Requested functionality has been deprecated
Deferred Clocks
Memory : N/A
Max Clocks
Graphics : 3090 MHz
SM : 3090 MHz
Memory : 14001 MHz
Video : 3090 MHz
Max Customer Boost Clocks
Graphics : N/A
Clock Policy
Auto Boost : N/A
Auto Boost Default : N/A
Fabric
State : N/A
Status : N/A
CliqueId : N/A
ClusterUUID : N/A
Health
Summary : N/A
Bandwidth : N/A
Route Recovery in progress : N/A
Route Unhealthy : N/A
Access Timeout Recovery : N/A
Incorrect Configuration : N/A
Partition Assigned : N/A
Processes : None
Capabilities
EGM : disabled
root@zj_nv_eval:/app/zj_nv_eval# nvidia-smi dmon -i 0,1 -s pucvmet -d 1
# gpu pwr gtemp mtemp sm mem enc dec jpg ofa mclk pclk pviol tviol fb bar1 ccpm sbecc dbecc pci rxpci txpci
# Idx W C C % % % % % % MHz MHz % bool MB MB MB errs errs errs MB/s MB/s
0 74 35 - 0 0 0 0 0 0 14001 2655 0 0 0 32122 0 - - 0 0 0
1 86 35 - 0 0 0 0 0 0 14001 2895 0 0 0 32122 0 - - 0 0 0
0 73 35 - 0 0 0 0 0 0 14001 2655 0 0 0 32122 0 - - 0 0 0
3、验证是否支持PCIE P2P
bash
# Write the CUDA validation program to p2p_peer_validate.cu. The quoted 'EOF'
# delimiter stops the shell from expanding anything inside the here-document.
cd /app/zj_nv_eval
cat > p2p_peer_validate.cu << 'EOF'
#include <cuda_runtime.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
// Evaluates a CUDA runtime call once; on any result other than cudaSuccess it
// prints the file, line, error string and numeric code to stderr and exits.
// The do { ... } while (0) wrapper lets the macro be used as a single statement.
#define CHECK_CUDA(cmd) \
do { \
cudaError_t err__ = (cmd); \
if (err__ != cudaSuccess) { \
fprintf(stderr, "CUDA error at %s:%d: %s (%d)\n", __FILE__, __LINE__, \
cudaGetErrorString(err__), (int)err__); \
exit(EXIT_FAILURE); \
} \
} while (0)
// Stores seed ^ index into each of the first `count` words of `buf`.
// Each thread handles at most one element; threads whose global index falls
// at or beyond `count` return without writing.
__global__ void fill_pattern_kernel(uint32_t* buf, size_t count, uint32_t seed) {
  const size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
  if (i >= count) return;
  buf[i] = seed ^ (uint32_t)i;
}
// Element-wise copy from `remote_src` into `local_dst` for the first `count`
// words. Each thread copies at most one element; out-of-range threads return.
__global__ void remote_read_copy_kernel(const uint32_t* remote_src,
                                        uint32_t* local_dst,
                                        size_t count) {
  const size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
  if (i >= count) return;
  local_dst[i] = remote_src[i];
}
// Checks that host_buf[i] == seed ^ i for every i in [0, count).
// On the first mismatch it reports the index, expected and actual values to
// stderr (prefixed with `label`) and terminates the process.
static void verify_pattern(const uint32_t* host_buf,
                           size_t count,
                           uint32_t seed,
                           const char* label) {
  size_t pos = 0;
  while (pos < count) {
    const uint32_t want = seed ^ (uint32_t)pos;
    const uint32_t got = host_buf[pos];
    if (got != want) {
      fprintf(stderr,
              "%s verify failed at index %zu: expected 0x%08x got 0x%08x\n",
              label, pos, want, got);
      exit(EXIT_FAILURE);
    }
    ++pos;
  }
}
// Prints a one-line summary (name, PCI address, memory size) for device `dev`.
//
// The PCI address is printed as bus:device.function. cudaDeviceProp carries
// domain, bus and device but no function number, so the function is printed
// as a literal 0 instead of misusing pciDomainID for that slot.
// NOTE(review): assumes the GPU enumerates as PCI function 0 - confirm.
// A non-zero PCI domain is printed as a leading "dddd:" prefix; for domain 0
// the output is the same as before.
static void print_device_summary(int dev) {
  cudaDeviceProp prop;
  CHECK_CUDA(cudaGetDeviceProperties(&prop, dev));
  printf("GPU %d: %s, PCI ", dev, prop.name);
  if (prop.pciDomainID != 0) {
    printf("%04x:", prop.pciDomainID);
  }
  printf("%02x:%02x.0, totalGlobalMem %.1f GiB\n", prop.pciBusID,
         prop.pciDeviceID,
         (double)prop.totalGlobalMem / (1024.0 * 1024.0 * 1024.0));
}
// Reports whether src_dev can address dst_dev's memory and, if so, enables
// peer access in that direction. Exits the process when peer access is not
// supported or cannot be enabled.
static void enable_peer_or_die(int src_dev, int dst_dev) {
  int can_access = 0;
  CHECK_CUDA(cudaDeviceCanAccessPeer(&can_access, src_dev, dst_dev));
  printf(" GPU %d -> GPU %d peer access: %s\n", src_dev, dst_dev,
         can_access ? "supported" : "not supported");
  if (!can_access) exit(EXIT_FAILURE);
  CHECK_CUDA(cudaSetDevice(src_dev));
  cudaError_t err = cudaDeviceEnablePeerAccess(dst_dev, 0);
  if (err == cudaErrorPeerAccessAlreadyEnabled) {
    // "Already enabled" is fine, but the runtime still records it as the last
    // error. Clear it so a later CHECK_CUDA(cudaGetLastError()) after a kernel
    // launch does not report this stale code.
    (void)cudaGetLastError();
  } else if (err != cudaSuccess) {
    fprintf(stderr, "Failed to enable peer access %d -> %d: %s\n", src_dev,
            dst_dev, cudaGetErrorString(err));
    exit(EXIT_FAILURE);
  }
}
// Fills a buffer on src_dev with a seeded pattern, copies it to dst_dev with
// cudaMemcpyPeer, reads it back to the host and verifies every word.
// Exits the process on allocation failure, CUDA error or data mismatch.
static void test_memcpy_peer(int src_dev, int dst_dev, size_t count) {
  const int kThreads = 256;
  const int grid = (int)((count + kThreads - 1) / kThreads);
  const size_t nbytes = count * sizeof(uint32_t);
  // The seed encodes the device pair so each direction uses distinct data.
  const uint32_t pattern_seed =
      0x12340000u + (uint32_t)(src_dev * 0x100 + dst_dev);

  uint32_t* dev_src = NULL;
  uint32_t* dev_dst = NULL;
  uint32_t* host_copy = (uint32_t*)malloc(nbytes);
  if (!host_copy) {
    fprintf(stderr, "malloc failed\n");
    exit(EXIT_FAILURE);
  }

  // Source side: allocate and fill with the pattern.
  CHECK_CUDA(cudaSetDevice(src_dev));
  CHECK_CUDA(cudaMalloc(&dev_src, nbytes));
  fill_pattern_kernel<<<grid, kThreads>>>(dev_src, count, pattern_seed);
  CHECK_CUDA(cudaGetLastError());
  CHECK_CUDA(cudaDeviceSynchronize());

  // Destination side: allocate and zero so stale data cannot pass the check.
  CHECK_CUDA(cudaSetDevice(dst_dev));
  CHECK_CUDA(cudaMalloc(&dev_dst, nbytes));
  CHECK_CUDA(cudaMemset(dev_dst, 0, nbytes));

  printf(" [cudaMemcpyPeer] GPU %d -> GPU %d, bytes=%zu ... ", src_dev, dst_dev,
         nbytes);
  fflush(stdout);
  CHECK_CUDA(cudaMemcpyPeer(dev_dst, dst_dev, dev_src, src_dev, nbytes));
  CHECK_CUDA(cudaMemcpy(host_copy, dev_dst, nbytes, cudaMemcpyDeviceToHost));
  verify_pattern(host_copy, count, pattern_seed, "cudaMemcpyPeer");
  printf("PASS\n");

  free(host_copy);
  CHECK_CUDA(cudaFree(dev_dst));
  CHECK_CUDA(cudaSetDevice(src_dev));
  CHECK_CUDA(cudaFree(dev_src));
}
// Fills a buffer on src_dev with a seeded pattern, then launches a kernel on
// dst_dev that reads that buffer directly and copies it into local memory.
// The local copy is read back to the host and verified word by word.
// Exits the process on allocation failure, CUDA error or data mismatch.
static void test_remote_read(int src_dev, int dst_dev, size_t count) {
  const int kThreads = 256;
  const int grid = (int)((count + kThreads - 1) / kThreads);
  const size_t nbytes = count * sizeof(uint32_t);
  // The seed encodes the device pair so each direction uses distinct data.
  const uint32_t pattern_seed =
      0x56780000u + (uint32_t)(src_dev * 0x100 + dst_dev);

  uint32_t* peer_buf = NULL;
  uint32_t* own_buf = NULL;
  uint32_t* host_copy = (uint32_t*)malloc(nbytes);
  if (!host_copy) {
    fprintf(stderr, "malloc failed\n");
    exit(EXIT_FAILURE);
  }

  // Buffer that lives on src_dev and is read remotely.
  CHECK_CUDA(cudaSetDevice(src_dev));
  CHECK_CUDA(cudaMalloc(&peer_buf, nbytes));
  fill_pattern_kernel<<<grid, kThreads>>>(peer_buf, count, pattern_seed);
  CHECK_CUDA(cudaGetLastError());
  CHECK_CUDA(cudaDeviceSynchronize());

  // Buffer on dst_dev, zeroed so stale data cannot pass the check.
  CHECK_CUDA(cudaSetDevice(dst_dev));
  CHECK_CUDA(cudaMalloc(&own_buf, nbytes));
  CHECK_CUDA(cudaMemset(own_buf, 0, nbytes));

  printf(" [remote read kernel] GPU %d reads GPU %d, bytes=%zu ... ", dst_dev,
         src_dev, nbytes);
  fflush(stdout);
  // Launched with dst_dev current, so the kernel runs there and reads peer_buf.
  remote_read_copy_kernel<<<grid, kThreads>>>(peer_buf, own_buf, count);
  CHECK_CUDA(cudaGetLastError());
  CHECK_CUDA(cudaDeviceSynchronize());
  CHECK_CUDA(cudaMemcpy(host_copy, own_buf, nbytes, cudaMemcpyDeviceToHost));
  verify_pattern(host_copy, count, pattern_seed, "remote read");
  printf("PASS\n");

  free(host_copy);
  CHECK_CUDA(cudaFree(own_buf));
  CHECK_CUDA(cudaSetDevice(src_dev));
  CHECK_CUDA(cudaFree(peer_buf));
}
// Entry point: validates P2P between GPU 0 and GPU 1 in both directions using
// cudaMemcpyPeer and a remote-read kernel.
//
// Usage: p2p_peer_validate [element_count]
//   element_count  number of 32-bit words per buffer (default 1 << 20)
// Returns EXIT_SUCCESS when every test passes; helpers exit on failure.
int main(int argc, char** argv) {
  int device_count = 0;
  size_t count = 1 << 20;
  if (argc > 1) {
    // The tests compute the launch grid as an int with 256 threads per block,
    // so the element count must keep that block count within a signed 32-bit
    // int. The byte size (count * 4) must also fit in size_t.
    const unsigned long long max_for_grid = 2147483647ULL * 256ULL;
    const unsigned long long max_for_bytes =
        (unsigned long long)(SIZE_MAX / sizeof(uint32_t));
    char* end = NULL;
    const unsigned long long parsed = strtoull(argv[1], &end, 10);
    // strtoull silently accepts a leading '-' (wrapping to a huge value) and
    // stops at trailing junk, so reject both explicitly.
    const int malformed = (end == argv[1]) || (*end != '\0') ||
                          (strchr(argv[1], '-') != NULL);
    if (malformed || parsed == 0 || parsed > max_for_grid ||
        parsed > max_for_bytes) {
      fprintf(stderr, "Invalid element count: %s\n", argv[1]);
      return EXIT_FAILURE;
    }
    count = (size_t)parsed;
  }
  CHECK_CUDA(cudaGetDeviceCount(&device_count));
  if (device_count < 2) {
    fprintf(stderr, "Need at least 2 CUDA devices, found %d\n", device_count);
    return EXIT_FAILURE;
  }
  printf("Found %d CUDA devices, validating first 2 GPUs.\n", device_count);
  print_device_summary(0);
  print_device_summary(1);
  printf("Elements: %zu, bytes per buffer: %zu\n", count,
         count * sizeof(uint32_t));
  enable_peer_or_die(0, 1);
  enable_peer_or_die(1, 0);
  test_memcpy_peer(0, 1, count);
  test_memcpy_peer(1, 0, count);
  test_remote_read(0, 1, count);
  test_remote_read(1, 0, count);
  printf("All P2P data-path tests passed.\n");
  return EXIT_SUCCESS;
}
EOF
# Compile the validation program and run it with its default element count.
nvcc -o p2p_peer_validate p2p_peer_validate.cu
./p2p_peer_validate
输出
bash
Found 4 CUDA devices, validating first 2 GPUs.
GPU 0: NVIDIA GeForce RTX 5090, PCI 81:00.0, totalGlobalMem 31.4 GiB
GPU 1: NVIDIA GeForce RTX 5090, PCI a1:00.0, totalGlobalMem 31.4 GiB
Elements: 1048576, bytes per buffer: 4194304
GPU 0 -> GPU 1 peer access: supported
GPU 1 -> GPU 0 peer access: supported
[cudaMemcpyPeer] GPU 0 -> GPU 1, bytes=4194304 ... PASS
[cudaMemcpyPeer] GPU 1 -> GPU 0, bytes=4194304 ... PASS
[remote read kernel] GPU 1 reads GPU 0, bytes=4194304 ... PASS
[remote read kernel] GPU 0 reads GPU 1, bytes=4194304 ... PASS
All P2P data-path tests passed.
4、测试PCIE P2P带宽
bash
# Write the bandwidth benchmark to gpu_bandwidth.cu. The quoted 'EOF'
# delimiter stops the shell from expanding anything inside the here-document.
cd /app/zj_nv_eval
cat > gpu_bandwidth.cu << 'EOF'
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
// checkCudaErrors(call): evaluates a CUDA runtime call and, on any result
// other than cudaSuccess, prints the error string with the calling file and
// line to stderr and exits.
// The helper has an ordinary name because identifiers containing a double
// underscore are reserved for the implementation in C++.
#define checkCudaErrors(err) checkCudaErrorsImpl((err), __FILE__, __LINE__)
inline void checkCudaErrorsImpl(cudaError_t err, const char *file, const int line) {
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error: %s at %s:%d\n", cudaGetErrorString(err), file, line);
        exit(EXIT_FAILURE);
    }
}
// Enables peer access from device `from` to device `to`, treating
// "already enabled" as success. Exits the process on any other failure.
void safeEnablePeerAccess(int from, int to) {
    // Checked like every other runtime call in this file; a failed device
    // switch would otherwise enable peer access on the wrong device.
    checkCudaErrors(cudaSetDevice(from));
    cudaError_t err = cudaDeviceEnablePeerAccess(to, 0);
    if (err == cudaErrorPeerAccessAlreadyEnabled) {
        // Expected when this is called more than once for the same pair. The
        // runtime still records it as the last error, so clear it here.
        (void)cudaGetLastError();
        return;
    }
    if (err != cudaSuccess) {
        fprintf(stderr, "Error enabling peer access from %d to %d: %s\n", from, to, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
}
// Measures device-to-device copy throughput from srcGpu to dstGpu.
//
// Parameters:
//   srcGpu, dstGpu  CUDA device ordinals of the source and destination.
//   totalSize       number of bytes copied per measurement.
//   numStreams      number of streams the copy is split across; values below
//                   1 are treated as 1.
// Returns the measured throughput as totalSize / elapsed / 1024^3.
// NOTE(review): the divisor is 1024^3, so despite the variable name the value
// is in GiB/s; comparing it against a decimal GB/s rating under-reports by
// about 7%.
// Exits the process on any CUDA or allocation error.
double testBandwidthAsync(int srcGpu, int dstGpu, size_t totalSize, int numStreams) {
    // A non-positive stream count would divide by zero when sizing chunks.
    if (numStreams < 1) numStreams = 1;

    // ---------- Allocate buffers ----------
    checkCudaErrors(cudaSetDevice(srcGpu));
    void* srcPtr = nullptr;
    checkCudaErrors(cudaMalloc(&srcPtr, totalSize));
    checkCudaErrors(cudaMemset(srcPtr, 0xAA, totalSize));
    checkCudaErrors(cudaSetDevice(dstGpu));
    void* dstPtr = nullptr;
    checkCudaErrors(cudaMalloc(&dstPtr, totalSize));

    // ---------- Enable P2P ----------
    int canAccess = 0;
    checkCudaErrors(cudaDeviceCanAccessPeer(&canAccess, srcGpu, dstGpu));
    if (canAccess) {
        safeEnablePeerAccess(srcGpu, dstGpu);
        safeEnablePeerAccess(dstGpu, srcGpu);
    } else {
        // Without peer access the copy still works but is no longer a direct
        // device-to-device transfer.
        fprintf(stderr, "Warning: P2P not supported, copies will be staged through host memory\n");
    }

    // ---------- Create streams (with srcGpu current) ----------
    checkCudaErrors(cudaSetDevice(srcGpu));
    cudaStream_t *streams = (cudaStream_t*)malloc(numStreams * sizeof(cudaStream_t));
    if (streams == nullptr) {
        fprintf(stderr, "malloc failed\n");
        exit(EXIT_FAILURE);
    }
    for (int i = 0; i < numStreams; ++i) {
        checkCudaErrors(cudaStreamCreate(&streams[i]));
    }

    // ---------- Create events (also with srcGpu current) ----------
    cudaEvent_t start, stop;
    checkCudaErrors(cudaEventCreate(&start));
    checkCudaErrors(cudaEventCreate(&stop));

    // ---------- Chunk sizes ----------
    // Every stream copies chunkSize bytes; the last one also takes the
    // remainder when totalSize is not a multiple of numStreams. This keeps
    // numStreams unchanged so every created stream is used and destroyed.
    const size_t chunkSize = totalSize / numStreams;
    const size_t lastChunk = totalSize - chunkSize * (size_t)(numStreams - 1);

    // ---------- Warm-up ----------
    checkCudaErrors(cudaSetDevice(srcGpu));
    checkCudaErrors(cudaMemcpyPeerAsync(dstPtr, dstGpu, srcPtr, srcGpu, totalSize, streams[0]));
    checkCudaErrors(cudaStreamSynchronize(streams[0]));

    // ---------- Timed run ----------
    // Start/stop are recorded on stream 0 with srcGpu current.
    checkCudaErrors(cudaEventRecord(start, 0));
    for (int i = 0; i < numStreams; ++i) {
        const size_t bytes = (i == numStreams - 1) ? lastChunk : chunkSize;
        if (bytes == 0) continue;
        void* srcOffset = (char*)srcPtr + i * chunkSize;
        void* dstOffset = (char*)dstPtr + i * chunkSize;
        checkCudaErrors(cudaMemcpyPeerAsync(dstOffset, dstGpu, srcOffset, srcGpu, bytes, streams[i]));
    }
    // Wait for every stream before recording the stop event.
    for (int i = 0; i < numStreams; ++i) {
        checkCudaErrors(cudaStreamSynchronize(streams[i]));
    }
    checkCudaErrors(cudaEventRecord(stop, 0));
    checkCudaErrors(cudaEventSynchronize(stop));
    float elapsed_ms = 0.0f;
    checkCudaErrors(cudaEventElapsedTime(&elapsed_ms, start, stop));
    double bandwidth_GBps = ((double)totalSize / (elapsed_ms / 1000.0)) / (1024.0 * 1024.0 * 1024.0);

    // ---------- Cleanup ----------
    for (int i = 0; i < numStreams; ++i) checkCudaErrors(cudaStreamDestroy(streams[i]));
    free(streams);
    checkCudaErrors(cudaEventDestroy(start));
    checkCudaErrors(cudaEventDestroy(stop));
    checkCudaErrors(cudaSetDevice(srcGpu));
    checkCudaErrors(cudaFree(srcPtr));
    checkCudaErrors(cudaSetDevice(dstGpu));
    checkCudaErrors(cudaFree(dstPtr));
    // Peer access is intentionally left enabled.
    return bandwidth_GBps;
}
// Entry point: copies a 1 GiB buffer from GPU 0 to GPU 1 using 1, 2, 4 and 8
// streams and prints one table row per stream count.
int main() {
    int gpuCount = 0;
    checkCudaErrors(cudaGetDeviceCount(&gpuCount));
    if (gpuCount < 2) {
        fprintf(stderr, "Need at least 2 GPUs\n");
        return EXIT_FAILURE;
    }

    const int srcGpu = 0;
    const int dstGpu = 1;
    const size_t totalSize = 1024ULL * 1024 * 1024;  // 1 GiB per transfer

    printf("Testing P2P bandwidth (async, multi-stream) from GPU %d to GPU %d\n", srcGpu, dstGpu);
    printf("Transfer size: %zu MB\n", totalSize / (1024*1024));
    printf("Streams\tBandwidth (GB/s)\t%% of 64 GB/s\n");
    printf("------------------------------------------------\n");

    // Stream counts double each round: 1, 2, 4, 8.
    const int streamCounts[] = {1, 2, 4, 8};
    const int rounds = (int)(sizeof(streamCounts) / sizeof(streamCounts[0]));
    for (int r = 0; r < rounds; ++r) {
        const int n = streamCounts[r];
        const double measured = testBandwidthAsync(srcGpu, dstGpu, totalSize, n);
        printf("%d\t%.2f\t\t%.1f%%\n", n, measured, (measured / 64.0) * 100.0);
        fflush(stdout);
    }
    return 0;
}
EOF
# Compile the bandwidth benchmark and run it.
nvcc -o gpu_bandwidth gpu_bandwidth.cu
./gpu_bandwidth
输出
bash
Testing P2P bandwidth (async, multi-stream) from GPU 0 to GPU 1
Transfer size: 1024 MB
Streams Bandwidth (GB/s) % of 64 GB/s
------------------------------------------------
1 52.43 81.9%
2 52.48 82.0%
4 52.46 82.0%
8 52.48 82.0%
5、更新nccl
bash
# Build NCCL v2.29.7-1 from source and install its libraries over the copy
# under /usr/lib/x86_64-linux-gnu.
cd /app/zj_nv_eval
git clone https://github.com/NVIDIA/nccl.git
cd nccl
git checkout v2.29.7-1
make clean
# Cap parallelism at the core count: a bare -j places no limit on the number
# of concurrent jobs.
make -j"$(nproc)"
rm -rf /usr/lib/x86_64-linux-gnu/libnccl.so*
cp -rvf build/lib/libnccl* /usr/lib/x86_64-linux-gnu/
6、nccl-tests性能测试
- 编译
bash
cd /app/zj_nv_eval
git clone https://github.com/NVIDIA/nccl-tests.git
cd nccl-tests
# Point the build at the NCCL tree compiled under /app/zj_nv_eval/nccl so the
# tests are compiled against the same headers as the library they load.
# Without NCCL_HOME the build uses whichever nccl.h is on the default include
# path; the tests' startup banner reports both the header and library version.
# -j is capped at the core count because a bare -j is unbounded.
make -j"$(nproc)" NCCL_HOME=/app/zj_nv_eval/nccl/build
- 测试all_reduce_perf
bash
# Export the NCCL settings for this shell session, then run all-reduce on
# 4 GPUs, doubling the message size from 128M up to 2G with 20 iterations each.
export NCCL_P2P_LEVEL=SYS \
  NCCL_SHM_DISABLE=1 \
  NCCL_IB_DISABLE=1 \
  NCCL_NET_PLUGIN=none \
  NCCL_SOCKET_IFNAME=lo \
  NCCL_PROTO=Simple \
  NCCL_ALGO=Ring
./build/all_reduce_perf -b 128M -e 2G -f 2 -g 4 -n 20
输出
bash
# nccl-tests version 2.19.0 nccl-headers=22501 nccl-library=22907
# Collective test starting: all_reduce_perf
# nThread 1 nGpus 4 minBytes 134217728 maxBytes 2147483648 step: 2(factor) warmup iters: 1 iters: 20 agg iters: 1 validation: 1 graph: 0 unalign: 0
#
# Using devices
# Rank 0 Group 0 Pid 121900 on zj_nv_eval device 0 [0000:81:00] NVIDIA GeForce RTX 5090
# Rank 1 Group 0 Pid 121900 on zj_nv_eval device 1 [0000:a1:00] NVIDIA GeForce RTX 5090
# Rank 2 Group 0 Pid 121900 on zj_nv_eval device 2 [0000:c1:00] NVIDIA GeForce RTX 5090
# Rank 3 Group 0 Pid 121900 on zj_nv_eval device 3 [0000:e1:00] NVIDIA GeForce RTX 5090
#
# out-of-place in-place
# size count type redop root time algbw busbw #wrong time algbw busbw #wrong
# (B) (elements) (us) (GB/s) (GB/s) (us) (GB/s) (GB/s)
134217728 33554432 float sum -1 4230.16 31.73 47.59 0 4229.29 31.74 47.60 0
268435456 67108864 float sum -1 8405.86 31.93 47.90 0 8391.48 31.99 47.98 0
536870912 134217728 float sum -1 16707.1 32.13 48.20 0 16704.3 32.14 48.21 0
1073741824 268435456 float sum -1 33124.4 32.42 48.62 0 33229.4 32.31 48.47 0
2147483648 536870912 float sum -1 66342.0 32.37 48.55 0 66016.1 32.53 48.79 0
# Out of bounds values : 0 OK
# Avg bus bandwidth : 48.1934
#
# Collective test concluded: all_reduce_perf
#
- 测试sendrecv
bash
# Send/receive between 2 GPUs at a fixed 128M message size, 10 iterations.
export NCCL_P2P_DISABLE=0
# Use the named level instead of a legacy integer; this matches the
# NCCL_P2P_LEVEL=SYS setting used for the all_reduce_perf run.
export NCCL_P2P_LEVEL=SYS
export NCCL_SHM_DISABLE=1 # disable SHM to force P2P
export NCCL_IB_DISABLE=1
export NCCL_NET_PLUGIN=none
export NCCL_SOCKET_IFNAME=lo
./build/sendrecv_perf -b 128M -e 128M -g 2 -n 10
输出
bash
# nccl-tests version 2.19.0 nccl-headers=22501 nccl-library=22907
# Collective test starting: sendrecv_perf
# nThread 1 nGpus 2 minBytes 134217728 maxBytes 134217728 step: 1048576(bytes) warmup iters: 1 iters: 10 agg iters: 1 validation: 1 graph: 0 unalign: 0
#
# Using devices
# Rank 0 Group 0 Pid 122314 on zj_nv_eval device 0 [0000:81:00] NVIDIA GeForce RTX 5090
# Rank 1 Group 0 Pid 122314 on zj_nv_eval device 1 [0000:a1:00] NVIDIA GeForce RTX 5090
#
# out-of-place in-place
# size count type redop root time algbw busbw #wrong time algbw busbw #wrong
# (B) (elements) (us) (GB/s) (GB/s) (us) (GB/s) (GB/s)
134217728 33554432 float sum -1 3054.01 43.95 43.95 0 3053.15 43.96 43.96 N/A
# Out of bounds values : 0 OK
# Avg bus bandwidth : 43.9542
#
# Collective test concluded: sendrecv_perf
#
二、DCGM 测试
bash
# Pull and run the same DCGM image. One variable holds the reference so the
# two commands cannot drift apart.
# NOTE(review): the pull previously named tag 4.1.1-1 while the run named
# 4.5.2-1; the tag that was actually run is kept here - confirm it is the
# intended version.
DCGM_IMAGE=swr.cn-north-4.myhuaweicloud.com/ddn-k8s/docker.io/nvidia/dcgm:4.5.2-1-ubuntu22.04
docker pull "$DCGM_IMAGE"
docker run --gpus all --shm-size=128g -it --entrypoint /bin/bash -e NVIDIA_VISIBLE_DEVICES=all \
  --rm --privileged --net=host "$DCGM_IMAGE"
# The remaining commands are typed into the container shell started above.
# The YAML nesting below is significant: each test's settings must sit under
# that test's key, and every test sits under the single SKU list entry.
cat > config.yaml << 'EOF'
version: "@CMAKE_PROJECT_VERSION@"
spec: dcgm-diag-v1
skus:
  - name: NVIDIA GeForce RTX 5090
    id: 2b85
    targeted_power:
      is_allowed: true
      starting_matrix_dim: 1024.0
      target_power: 575.0
      use_dgemm: false
    targeted_stress:
      is_allowed: true
      use_dgemm: false
      target_stress: 100
    sm_stress:
      is_allowed: true
      target_stress: 100.0
      use_dgemm: false
    nvbandwidth:
      is_allowed: true
    pcie:
      is_allowed: true
      h2d_d2h_single_pinned:
        min_pci_generation: 5.0
        min_pci_width: 16.0
      h2d_d2h_single_unpinned:
        min_pci_generation: 5.0
        min_pci_width: 16.0
    memory:
      is_allowed: true
      l1cache_size_kb_per_sm: 192.0
    diagnostic:
      is_allowed: true
      matrix_dim: 8192.0
    memory_bandwidth:
      is_allowed: true
      minimum_bandwidth: 1230000
    pulse_test:
      is_allowed: true
EOF
time dcgmi diag -r 4 -i 0 -c config.yaml -j
输出
bash
real 31m45.237s
user 14m11.761s
sys 0m12.611s