I. Automated test script (consolidated core test items)
Script name: gpu_cluster_delivery_test.sh
bash
#!/bin/bash
set -euo pipefail
# ==================== Configuration (adjust to your environment) ====================
TEST_DIR="/tmp/gpu_test"                              # temporary test directory
LOG_FILE="${TEST_DIR}/gpu_delivery_test.log"          # test log file
STRESS_DURATION=3600                                  # per-node stress test duration in seconds (default 1 hour; set 86400 for 24 hours)
GPU_BURN_PATH="${TEST_DIR}/gpu_burn"                  # path to the gpu-burn full-load test binary
PYTHON_SCRIPT="${TEST_DIR}/memory_bandwidth_test.py"  # GPU memory bandwidth test script
NETWORK_TARGET_IP="192.168.1.101"                     # target node IP for the cross-node network test (change as needed)
STORAGE_TEST_SIZE="100G"                              # storage test data size per fio job (adjust to disk capacity)
# ==================== Environment initialization ====================
init_env() {
    # Create the test directory and log file before the first tee
    mkdir -p ${TEST_DIR}
    touch ${LOG_FILE}
    chmod 777 ${TEST_DIR}
    echo "===== Initializing test environment =====" | tee -a ${LOG_FILE}
    # Install dependencies
    echo "Installing dependency packages..." | tee -a ${LOG_FILE}
    apt-get update && apt-get install -y stress-ng fio iperf3 python3 python3-pip gcc git >> ${LOG_FILE} 2>&1
    # torchvision is needed by the MNIST compatibility test below
    pip3 install torch torchvision numpy pandas --upgrade >> ${LOG_FILE} 2>&1
    # Download and build gpu-burn (GPU full-load test)
    if [ ! -f ${GPU_BURN_PATH} ]; then
        echo "Downloading gpu-burn..." | tee -a ${LOG_FILE}
        git clone https://github.com/wilicc/gpu-burn.git ${TEST_DIR}/gpu-burn-src >> ${LOG_FILE} 2>&1
        # Build in a subshell so the script's working directory is not changed
        (cd ${TEST_DIR}/gpu-burn-src && make) >> ${LOG_FILE} 2>&1
        cp ${TEST_DIR}/gpu-burn-src/gpu_burn ${GPU_BURN_PATH}
        # gpu_burn loads compare.ptx from its working directory at runtime
        cp ${TEST_DIR}/gpu-burn-src/compare.ptx ${TEST_DIR}/
    fi
    # Generate the GPU memory bandwidth test script
    cat > ${PYTHON_SCRIPT} << 'EOF'
import torch
import time

def test_memory_bandwidth(device_id=0):
    if not torch.cuda.is_available():
        print("GPU not found!")
        return None, None
    device = torch.device(f"cuda:{device_id}")
    # Tensor size: 1Gi float32 elements = 4 GiB per tensor (shrunk below if VRAM is small)
    num_elems = 1024 * 1024 * 1024
    # Three tensors (a, b and the product c) must fit in device memory
    if torch.cuda.get_device_properties(device).total_memory < num_elems * 4 * 3:
        num_elems = 1024 * 1024 * 256  # fall back to 1 GiB per tensor
    a = torch.randn(num_elems, device=device, dtype=torch.float32)
    b = torch.randn(num_elems, device=device, dtype=torch.float32)
    # Warm-up
    for _ in range(10):
        c = a * b
    torch.cuda.synchronize()
    # Device-to-device copy bandwidth
    start_time = time.time()
    for _ in range(100):
        b.copy_(a)
    torch.cuda.synchronize()
    write_time = time.time() - start_time
    write_bandwidth = (num_elems * 4 * 100) / (write_time * 1024 ** 3)  # GB/s written
    # Approximate read bandwidth under compute load (MatMul)
    start_time = time.time()
    for _ in range(50):
        c = torch.matmul(a.view(1024, -1), b.view(-1, 1024))
    torch.cuda.synchronize()
    matmul_time = time.time() - start_time
    matmul_bandwidth = (num_elems * 4 * 2 * 50) / (matmul_time * 1024 ** 3)  # GB/s read
    return write_bandwidth, matmul_bandwidth

if __name__ == "__main__":
    num_gpus = torch.cuda.device_count()
    print(f"Found {num_gpus} GPU(s)")
    for gpu_id in range(num_gpus):
        write_bw, matmul_bw = test_memory_bandwidth(gpu_id)
        print(f"GPU {gpu_id}: Write Bandwidth = {write_bw:.2f} GB/s, MatMul Bandwidth = {matmul_bw:.2f} GB/s")
EOF
echo "环境初始化完成!" | tee -a ${LOG_FILE}
}
# ==================== 1. Basic hardware verification ====================
hardware_check() {
    echo -e "\n===== 1. Basic hardware verification =====" | tee -a ${LOG_FILE}
    # System information
    echo "=== System info ===" | tee -a ${LOG_FILE}
    uname -a >> ${LOG_FILE}
    lsb_release -a >> ${LOG_FILE} 2>&1
    # CPU information (parentheses must be escaped: grep -E treats them as regex groups)
    echo "=== CPU info ===" | tee -a ${LOG_FILE}
    lscpu | grep -E "Model name|CPU\(s\):|Thread\(s\) per core" >> ${LOG_FILE}
    # Memory information
    echo "=== Memory info ===" | tee -a ${LOG_FILE}
    free -h >> ${LOG_FILE}
    dmidecode -t memory | grep -E "Size:|Speed:" >> ${LOG_FILE} 2>&1
    # GPU information (nvidia-smi)
    echo "=== GPU info ===" | tee -a ${LOG_FILE}
    if command -v nvidia-smi &> /dev/null; then
        nvidia-smi >> ${LOG_FILE}
        nvidia-smi --query-gpu=name,memory.total,temperature.gpu,power.draw --format=csv,noheader,nounits >> ${LOG_FILE}
    else
        echo "ERROR: nvidia-smi not found! GPU may not be recognized." | tee -a ${LOG_FILE}
        exit 1
    fi
    # Storage information
    echo "=== Storage info ===" | tee -a ${LOG_FILE}
    lsblk >> ${LOG_FILE}
    df -h >> ${LOG_FILE}
    # NIC information
    echo "=== NIC info ===" | tee -a ${LOG_FILE}
    ip addr >> ${LOG_FILE}
    lshw -class network | grep -E "description:|product:|speed:" >> ${LOG_FILE} 2>&1
    echo "Basic hardware verification complete." | tee -a ${LOG_FILE}
}
# ==================== 2. GPU core performance tests ====================
gpu_performance_test() {
    echo -e "\n===== 2. GPU core performance tests =====" | tee -a ${LOG_FILE}
    # 2.1 Single-GPU compute test (simple MatMul compiled with nvcc)
    echo "=== Single-GPU compute test (FP32 MatMul) ===" | tee -a ${LOG_FILE}
    cat > ${TEST_DIR}/matmul_test.cu << 'EOF'
#include <stdio.h>
#include <cuda_runtime.h>

__global__ void matmul(float *a, float *b, float *c, int n) {
    int i = blockIdx.y * blockDim.y + threadIdx.y;
    int j = blockIdx.x * blockDim.x + threadIdx.x;
    float sum = 0.0f;
    for (int k = 0; k < n; k++) {
        sum += a[i * n + k] * b[k * n + j];
    }
    c[i * n + j] = sum;
}

int main() {
    int n = 2048;
    size_t size = n * n * sizeof(float);
    float *h_a, *h_b, *h_c;
    float *d_a, *d_b, *d_c;
    cudaMallocHost(&h_a, size);
    cudaMallocHost(&h_b, size);
    cudaMallocHost(&h_c, size);
    cudaMalloc(&d_a, size);
    cudaMalloc(&d_b, size);
    cudaMalloc(&d_c, size);
    for (int i = 0; i < n * n; i++) {
        h_a[i] = rand() / (float)RAND_MAX;
        h_b[i] = rand() / (float)RAND_MAX;
    }
    cudaMemcpy(d_a, h_a, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, h_b, size, cudaMemcpyHostToDevice);
    dim3 block(32, 32);
    dim3 grid(n / block.x, n / block.y);
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start);
    matmul<<<grid, block>>>(d_a, d_b, d_c, n);
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    float ms;
    cudaEventElapsedTime(&ms, start, stop);
    // 2*n^3 FLOPs; ms is in milliseconds, so divide by ms*1e9 to get TFLOPS
    float tflops = 2.0f * n * n * n / (ms * 1e9f);
    printf("FP32 MatMul (2048x2048): Time = %.2f ms, TFLOPS = %.2f\n", ms, tflops);
    cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
    cudaFreeHost(h_a); cudaFreeHost(h_b); cudaFreeHost(h_c);
    cudaEventDestroy(start); cudaEventDestroy(stop);
    return 0;
}
EOF
    if command -v nvcc &> /dev/null; then
        nvcc ${TEST_DIR}/matmul_test.cu -o ${TEST_DIR}/matmul_test >> ${LOG_FILE} 2>&1
        ${TEST_DIR}/matmul_test >> ${LOG_FILE}
    else
        echo "WARNING: nvcc not found, skipping MatMul compute test" | tee -a ${LOG_FILE}
    fi
    # 2.2 GPU memory bandwidth test (Python script)
    echo "=== GPU memory bandwidth test ===" | tee -a ${LOG_FILE}
    python3 ${PYTHON_SCRIPT} >> ${LOG_FILE} 2>&1
    # 2.3 Multi-GPU NCCL communication test (requires a prebuilt nccl-tests, see prerequisites)
    echo "=== Multi-GPU NCCL communication test ===" | tee -a ${LOG_FILE}
    if [ -x ${TEST_DIR}/nccl-tests/build/all_reduce_perf ]; then
        ${TEST_DIR}/nccl-tests/build/all_reduce_perf -b 8M -e 16G -f 2 >> ${LOG_FILE}
    else
        echo "WARNING: nccl-tests not found, skipping multi-GPU communication test (build nccl-tests first)" | tee -a ${LOG_FILE}
    fi
    echo "GPU core performance tests complete." | tee -a ${LOG_FILE}
}
# ==================== 3. Network interconnect test ====================
network_test() {
    echo -e "\n===== 3. Network interconnect test =====" | tee -a ${LOG_FILE}
    # 3.1 Local loopback bandwidth (iperf3 self-test; a sanity check only, it does not exercise the NIC)
    echo "=== Local iperf3 loopback test ===" | tee -a ${LOG_FILE}
    iperf3 -s -D --logfile ${TEST_DIR}/iperf_server.log
    sleep 2
    iperf3 -c localhost -i 1 -t 30 -P 8 >> ${LOG_FILE}
    pkill iperf3
    # 3.2 Cross-node network test (requires NETWORK_TARGET_IP and an iperf3 server running on the target)
    if [ -n "${NETWORK_TARGET_IP}" ] && ping -c 1 ${NETWORK_TARGET_IP} &> /dev/null; then
        echo "=== Cross-node network test (target IP: ${NETWORK_TARGET_IP}) ===" | tee -a ${LOG_FILE}
        iperf3 -c ${NETWORK_TARGET_IP} -i 1 -t 30 -P 8 >> ${LOG_FILE}
    else
        echo "WARNING: target node unreachable, skipping cross-node network test" | tee -a ${LOG_FILE}
    fi
    echo "Network interconnect test complete." | tee -a ${LOG_FILE}
}
# ==================== 4. Stability stress test ====================
stress_test() {
    echo -e "\n===== 4. Stability stress test (running for ${STRESS_DURATION}s) =====" | tee -a ${LOG_FILE}
    # 4.1 GPU full-load test (gpu-burn; run from TEST_DIR so it finds compare.ptx)
    echo "=== GPU full-load test ===" | tee -a ${LOG_FILE}
    (cd ${TEST_DIR} && ${GPU_BURN_PATH} ${STRESS_DURATION}) >> ${LOG_FILE} 2>&1 &
    GPU_BURN_PID=$!
    # 4.2 CPU + memory stress test (stress-ng has no --memory flag; --vm-bytes is per worker, 4 x 20% ~= 80% of RAM)
    echo "=== CPU + memory stress test ===" | tee -a ${LOG_FILE}
    stress-ng --cpu $(nproc) --vm 4 --vm-bytes 20% --timeout ${STRESS_DURATION}s >> ${LOG_FILE} 2>&1 &
    STRESS_NG_PID=$!
    # 4.3 Storage stress test (fio random read/write; --size applies per job, so 8 jobs use up to 8x STORAGE_TEST_SIZE)
    echo "=== Storage stress test (random read/write) ===" | tee -a ${LOG_FILE}
    fio --name=storage_stress --directory=${TEST_DIR} --rw=randrw --bs=64k --size=${STORAGE_TEST_SIZE} --numjobs=8 --runtime=${STRESS_DURATION} --time_based --iodepth=32 --direct=1 --group_reporting >> ${LOG_FILE} 2>&1 &
    FIO_PID=$!
    # Wait for all stress tests to finish
    wait ${GPU_BURN_PID} ${STRESS_NG_PID} ${FIO_PID}
    echo "Stability stress test complete." | tee -a ${LOG_FILE}
}
# ==================== 5. Software compatibility test ====================
software_compatibility_test() {
    echo -e "\n===== 5. Software compatibility test =====" | tee -a ${LOG_FILE}
    # 5.1 PyTorch GPU availability
    echo "=== PyTorch GPU availability test ===" | tee -a ${LOG_FILE}
    python3 -c "import torch; print(f'PyTorch version: {torch.__version__}'); print(f'CUDA available: {torch.cuda.is_available()}'); print(f'GPU count: {torch.cuda.device_count()}')" >> ${LOG_FILE}
    # 5.2 Simple model training test (MNIST); the redirection belongs on the heredoc line, not after the terminator
    echo "=== Simple model training test (MNIST) ===" | tee -a ${LOG_FILE}
    python3 - >> ${LOG_FILE} 2>&1 << 'EOF'
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader

# Simple CNN model
class SimpleCNN(nn.Module):
    def __init__(self):
        super(SimpleCNN, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, 3, 1)
        self.conv2 = nn.Conv2d(32, 64, 3, 1)
        self.dropout1 = nn.Dropout(0.25)
        self.dropout2 = nn.Dropout(0.5)
        self.fc1 = nn.Linear(9216, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        x = self.conv1(x)
        x = nn.functional.relu(x)
        x = self.conv2(x)
        x = nn.functional.relu(x)
        x = nn.functional.max_pool2d(x, 2)
        x = self.dropout1(x)
        x = torch.flatten(x, 1)
        x = self.fc1(x)
        x = nn.functional.relu(x)
        x = self.dropout2(x)
        x = self.fc2(x)
        output = nn.functional.log_softmax(x, dim=1)
        return output

# Training configuration (dataset is downloaded under the default TEST_DIR)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))])
train_dataset = datasets.MNIST('/tmp/gpu_test/data', train=True, download=True, transform=transform)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
model = SimpleCNN().to(device)
optimizer = optim.Adadelta(model.parameters(), lr=1.0)
criterion = nn.NLLLoss()

# Train for one epoch
model.train()
for batch_idx, (data, target) in enumerate(train_loader):
    data, target = data.to(device), target.to(device)
    optimizer.zero_grad()
    output = model(data)
    loss = criterion(output, target)
    loss.backward()
    optimizer.step()
    if batch_idx % 100 == 0:
        print(f'Train Epoch: 1 [{batch_idx * len(data)}/{len(train_loader.dataset)} ({100. * batch_idx / len(train_loader):.0f}%)]\tLoss: {loss.item():.6f}')
print("PyTorch MNIST training test passed.")
EOF
    echo "Software compatibility test complete." | tee -a ${LOG_FILE}
}
# ==================== Main ====================
main() {
    # Make sure the log directory exists before the first tee
    mkdir -p ${TEST_DIR}
    echo "===== GPU server cluster delivery test started ($(date)) =====" | tee -a ${LOG_FILE}
    init_env
    hardware_check
    gpu_performance_test
    network_test
    stress_test
    software_compatibility_test
    echo -e "\n===== Test finished ($(date)) =====" | tee -a ${LOG_FILE}
    echo "Test log saved to: ${LOG_FILE}" | tee -a ${LOG_FILE}
}
# Run the tests
main
II. Script usage notes
1. Prerequisites
- OS: Ubuntu 20.04/22.04 (the script targets Debian-based systems; on CentOS replace apt with yum).
- NVIDIA GPU driver (≥ 525) and CUDA Toolkit (12.0+ recommended) already installed.
- Internet access on the servers (to download dependencies and tools).
- Run as the root user (to avoid permission issues).
- Optional: a prebuilt nccl-tests for the multi-GPU communication test, as shown in the sketch below.
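Step 2.3 of the script looks for all_reduce_perf under ${TEST_DIR}/nccl-tests/build/. A minimal build sketch, assuming CUDA lives at the default /usr/local/cuda (adjust CUDA_HOME otherwise; MPI is not needed for single-node runs):
bash
# Clone and build nccl-tests into the test directory
git clone https://github.com/NVIDIA/nccl-tests.git /tmp/gpu_test/nccl-tests
cd /tmp/gpu_test/nccl-tests
make CUDA_HOME=/usr/local/cuda
# Quick sanity run across all local GPUs
./build/all_reduce_perf -b 8M -e 128M -f 2 -g $(nvidia-smi -L | wc -l)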
2. Script modifications (key configuration)
Adjust the configuration parameters at the top of the script to match your cluster:
bash
NETWORK_TARGET_IP="192.168.1.101"  # target node IP for the cross-node network test (required, otherwise that test is skipped)
STRESS_DURATION=3600               # stress test duration (default 1 hour; 86400 = 24 hours is recommended for delivery testing)
STORAGE_TEST_SIZE="100G"           # storage test data size (adjust to free disk space; note it applies per fio job)
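For an unattended 24-hour acceptance run, the parameters can also be patched in place with sed instead of editing the file by hand; a minimal sketch, assuming the default variable values above:
bash
sed -i 's/^STRESS_DURATION=3600/STRESS_DURATION=86400/' gpu_cluster_delivery_test.sh
sed -i 's/^NETWORK_TARGET_IP=.*/NETWORK_TARGET_IP="192.168.1.101"/' gpu_cluster_delivery_test.sh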
3. Execution steps
(1) Single-node test
bash
# 1. Download the script (or create the file manually and paste the contents)
wget https://xxx/gpu_cluster_delivery_test.sh  # replace with the actual script URL, or create manually
chmod +x gpu_cluster_delivery_test.sh
# 2. Run the script in the background so a dropped SSH session does not interrupt the test
nohup ./gpu_cluster_delivery_test.sh &
# 3. Follow the test progress
tail -f /tmp/gpu_test/gpu_delivery_test.log
(2) Cluster-wide batch test (with Ansible)
To run the test on every node in the cluster, distribute and execute the script with Ansible:
bash
# 1. Edit the Ansible inventory (inventory.ini)
[gpu_cluster]
node1 ansible_host=192.168.1.100
node2 ansible_host=192.168.1.101
node3 ansible_host=192.168.1.102
# 2. Distribute the script to all nodes
ansible gpu_cluster -m copy -a "src=./gpu_cluster_delivery_test.sh dest=/root/ mode=755" -i inventory.ini
# 3. Run the script on all nodes
ansible gpu_cluster -m shell -a "nohup /root/gpu_cluster_delivery_test.sh &" -i inventory.ini
# 4. Collect the test logs from all nodes
ansible gpu_cluster -m fetch -a "src=/tmp/gpu_test/gpu_delivery_test.log dest=./cluster_test_logs/" -i inventory.ini
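Note that `nohup ... &` through the shell module can still be reaped when Ansible closes its session. A more robust alternative is Ansible's async mode; a sketch (the -B time budget should exceed your STRESS_DURATION):
bash
# Fire-and-forget: allow up to 90000 s of runtime, poll interval 0 = do not wait
ansible gpu_cluster -m shell -a "/root/gpu_cluster_delivery_test.sh" -B 90000 -P 0 -i inventory.ini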
III. Test coverage and additional notes
1. Covered test dimensions
| Test dimension | Core test items | Tools / methods |
|---|---|---|
| Basic hardware verification | CPU / memory / GPU / storage / NIC detection and configuration check | lscpu, nvidia-smi, lsblk |
| GPU performance | Single-GPU compute (MatMul), memory bandwidth, multi-GPU NCCL communication | nvcc, PyTorch, nccl-tests |
| Network interconnect | Local bandwidth, cross-node bandwidth (iperf3) | iperf3 |
| Stability | GPU / CPU / memory / storage full-load stress tests | gpu-burn, stress-ng, fio |
| Software compatibility | PyTorch availability, simple model training | PyTorch, MNIST dataset |
2. Test items not covered (add manually as needed)
- Cluster-scale distributed training (e.g. fine-tuning LLaMA across nodes on 8 GPUs): depends on the customer's actual workload; the script only covers single-node multi-GPU testing.
- InfiniBand network testing: if the cluster uses IB NICs, replace iperf3 with ib_write_bw / ib_write_lat (the script assumes Ethernet by default); see the example below.
- K8s / GPU Operator validation: deploy K8s and the GPU Operator manually, then run kubectl describe nodes to verify GPU scheduling.
- MLPerf benchmarks: for strict comparison against industry baselines, run the MLPerf suite separately (datasets and configuration files must be prepared in advance).
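For the InfiniBand case, a minimal sketch with the perftest tools, assuming the IB device is named mlx5_0 (check with ibstat) and 192.168.1.100 is the server node:
bash
# On the server node
ib_write_bw -d mlx5_0 --report_gbits
# On the client node, pointing at the server
ib_write_bw -d mlx5_0 --report_gbits 192.168.1.100
# Latency instead of bandwidth
ib_write_lat -d mlx5_0 192.168.1.100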
3. Pass/fail criteria
After the script finishes, review /tmp/gpu_test/gpu_delivery_test.log, focusing on:
- No ERROR entries; WARNING entries may be acceptable depending on context (e.g. the multi-GPU test is skipped when NCCL is not installed).
- GPU temperature ≤ 90°C throughout the stress test, with no thermal throttling or shutdown.
- Memory bandwidth / compute throughput within 10% of the vendor's rated figures.
- Cross-node network bandwidth ≥ 90% of the rated line speed (e.g. ≥ 90 Gbps on 100G Ethernet).
- No process crashes or node outages during the stability test. A quick automated check is sketched below.
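A minimal post-run check, assuming the default log path and that nvidia-smi is available:
bash
LOG=/tmp/gpu_test/gpu_delivery_test.log
# Fail fast if any ERROR was logged
grep -n "ERROR" ${LOG} && echo "FAIL: errors found in log" || echo "PASS: no errors"
# List WARNING entries for manual review
grep -n "WARNING" ${LOG}
# Current GPU temperatures (run during the stress window to catch peaks)
nvidia-smi --query-gpu=index,temperature.gpu --format=csv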
IV. Extension suggestions
- Customization: add workload-specific test items (e.g. Stable Diffusion image generation, large-model inference latency).
- Automated reporting: append log-parsing logic at the end of the script to generate an HTML test report (requires pandas and jinja2).
- Failure alerting: integrate email or a DingTalk bot to send an alert automatically when a test fails; a webhook sketch follows.
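A minimal alerting sketch, assuming a DingTalk custom-robot webhook (the access_token is a placeholder you must replace, and the message text must satisfy the robot's keyword security setting):
bash
LOG=/tmp/gpu_test/gpu_delivery_test.log
WEBHOOK="https://oapi.dingtalk.com/robot/send?access_token=YOUR_TOKEN"  # placeholder token
if grep -q "ERROR" ${LOG}; then
    # DingTalk text-message payload
    curl -s "${WEBHOOK}" -H 'Content-Type: application/json' \
        -d "{\"msgtype\": \"text\", \"text\": {\"content\": \"GPU delivery test FAILED on $(hostname), see ${LOG}\"}}"
fi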