centos7-nvidia驱动安装及简单测试

centos7-nvidia驱动安装

类别	信息
服务器型号	Rack Mount Chassis NF5280M6
CPU	Intel® Xeon® Silver 4310 CPU @ 2.10GHz * 2
系统版本	Centos 7
系统内核版本	3.10.0-1160.el7.x86_64
GPU型号	NVIDIA A100（40G）*4
Nvidia驱动版本	525.85.05
CUDA版本	12.0.0
docker版本	20.10.9

一、基础系统部分(已经安装过可以不用安装)

1、安装基础软件

bash 复制代码

yum update

bash 复制代码

yum -y install openssh-server openssh-client apt-utils freeipmi ipmitool sshpass  ethtool zip unzip nano less git netplan.io iputils-ping mtr ipvsadm smartmontools python3-pip socat conntrack libvirt-clients libnuma-dev ctorrent nvme-cli gcc-12 g++-12 vim wget apt git unzip zip ntp ntpdate lrzsz lftp tree bash-completion  elinks dos2unix tmux jq

bash 复制代码

yum -y install  nmap net-tools  mtr traceroute tcptraceroute aptitude  htop iftop hping3 fping nethogs sshuttle tcpdump figlet  stress iperf iperf3 dnsutils curl linux-tools-generic linux-cloud-tools-generic

bash 复制代码

yum groupinstall -y "Development Tools"

bash 复制代码

curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | sudo bash

bash 复制代码

yum install git-lfs

bash 复制代码

git lfs install

2、调整文件描述符

bash 复制代码

echo "ulimit -SHn 655350" >>/etc/profile
echo "fs.file-max = 655350" >>/etc/sysctl.conf
echo "root soft nofile 655350" >>/etc/security/limits.conf
echo "root hard nofile 655350" >>/etc/security/limits.conf
echo "* soft nofile 655350" >>/etc/security/limits.conf
echo "* hard nofile 655350" >>/etc/security/limits.conf

bash 复制代码

source /etc/profile

优化history

bash 复制代码

cat /etc/profile
export HISTTIMEFORMAT="%Y-%m-%d %H:%M:%S  `whoami` "
export HISTFILESIZE=50000
export HISTSIZE=50000

bash 复制代码

source /etc/profile

5、优化内核参数

bash 复制代码

cp /etc/sysctl.conf /etc/sysctl.conf.bak

bash 复制代码

vi /etc/sysctl.conf
net.ipv4.tcp_syncookies = 1
net.ipv4.tcp_abort_on_overflow = 1
net.ipv4.tcp_max_tw_buckets = 6000
net.ipv4.tcp_sack = 1
net.ipv4.tcp_window_scaling = 1
net.ipv4.tcp_rmem = 4096        87380  4194304
net.ipv4.tcp_wmem = 4096        66384  4194304
net.ipv4.tcp_mem = 94500000 915000000 927000000
net.core.optmem_max = 81920
net.core.wmem_default = 8388608
net.core.wmem_max = 16777216
net.core.rmem_default = 8388608
net.core.rmem_max = 16777216
net.ipv4.tcp_max_syn_backlog = 1020000
net.core.netdev_max_backlog = 862144
net.core.somaxconn = 262144
net.ipv4.tcp_max_orphans = 327680
net.ipv4.tcp_timestamps = 0
net.ipv4.tcp_synack_retries = 1
net.ipv4.tcp_syn_retries = 1
net.ipv4.tcp_tw_reuse = 1
net.ipv4.tcp_fin_timeout = 15
net.ipv4.tcp_keepalive_time = 30
net.ipv4.ip_local_port_range = 1024    65535
net.netfilter.nf_conntrack_tcp_timeout_established = 180
net.netfilter.nf_conntrack_max = 1048576
net.nf_conntrack_max = 1048576
fs.file-max = 655350

bash 复制代码

modprobe nf_conntrack
sysctl -p /etc/sysctl.conf
sysctl -w net.ipv4.route.flush=1

二、显卡驱动、cuda等部署

手动创建禁用 nouveau 的配置

bash 复制代码

bash -c "echo blacklist nouveau > /etc/modprobe.d/blacklist-nvidia-nouveau.conf"
bash -c "echo options nouveau modeset=0 >> /etc/modprobe.d/blacklist-nvidia-nouveau.conf"
echo options nouveau modeset=0 | tee -a /etc/modprobe.d/nouveau-kms.conf

# boot备份
cp -r /boot/ /root/

dracut -f /boot/initramfs-$(uname -r).img $(uname -r)

bash 复制代码

# 重启验证是否禁用成功
reboot
lsmod | grep nouveau（重启成功后打开终端输入此命令，如果什么都不显示，证明上面禁用nouveau的流程正确）

安装nvidia驱动（https://download.nvidia.com/XFree86/Linux-x86_64）

获取推荐安装版本（可不选择推荐安装版本）

bash 复制代码

# 导入 ELRepo 的公钥
sudo rpm --import https://www.elrepo.org/RPM-GPG-KEY-elrepo.org

# 安装 ELRepo 仓库
sudo yum install -y https://www.elrepo.org/elrepo-release-7.0-4.el7.elrepo.noarch.rpm

sudo yum makecache

lspci | grep -i nvidia

下载对应内核工具防止安装错误

bash 复制代码

# 安装 yum-utils（提供 yum-config-manager 工具，用于启用仓库以查找centos7老版本内核对应的开发包）
yum install -y yum-utils

# 启用 vault 仓库
yum-config-manager --enable vault

yum install kernel-devel-$(uname -r) kernel-headers-$(uname -r)

bash 复制代码

wget https://download.nvidia.com/XFree86/Linux-x86_64/525.85.05/NVIDIA-Linux-x86_64-525.85.05.run

chmod +x NVIDIA-Linux-x86_64-525.85.05.run

bash NVIDIA-Linux-x86_64-525.85.05.run  --no-opengl-files --ui=none --no-questions --accept-license

安装完成后执行nvidia-smi查看

bash 复制代码

[root@gnode196 ~]# nvidia-smi
Tue Jan 27 16:48:41 2026       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.05    Driver Version: 525.85.05    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|===============================+======================+======================|
|   0  NVIDIA A100-PCI...  Off  | 00000000:4B:00.0 Off |                    0 |
| N/A   32C    P0    36W / 250W |      0MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
|   1  NVIDIA A100-PCI...  Off  | 00000000:65:00.0 Off |                    0 |
| N/A   33C    P0    36W / 250W |      0MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
|   2  NVIDIA A100-PCI...  Off  | 00000000:CA:00.0 Off |                    0 |
| N/A   31C    P0    38W / 250W |      0MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
|   3  NVIDIA A100-PCI...  Off  | 00000000:E3:00.0 Off |                    0 |
| N/A   32C    P0    39W / 250W |      0MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                                  |
|  GPU   GI   CI        PID   Type   Process name                  GPU Memory |
|        ID   ID                                                   Usage      |
|=============================================================================|
|  No running processes found                                                 |
+-----------------------------------------------------------------------------+

安装cuda

根据上面步骤nvidia-smi的输出可以看到当前驱动支持的最高cuda版本是12.0，登录访问https://developer.nvidia.com/cuda-toolkit-archive 并下载12.0版本的cuda

bash 复制代码

wget https://developer.download.nvidia.com/compute/cuda/12.0.0/local_installers/cuda_12.0.0_525.60.13_linux.run
bash cuda_12.0.0_525.60.13_linux.run --toolkit --silent --override

增加环境变量并验证

bash 复制代码

在profile内添加cuda环境变量
cat /etc/profile
export PATH=/usr/local/cuda-12.0/bin:$PATH 
export LD_LIBRARY_PATH=/usr/local/cuda-12.0/lib64:$LD_LIBRARY_PATH

source /etc/profile
nvcc -V 验证

安装nvidia-docker

bash 复制代码

curl -s -L https://nvidia.github.io/libnvidia-container/stable/rpm/nvidia-container-toolkit.repo | \
  sudo tee /etc/yum.repos.d/nvidia-container-toolkit.repo

yum install -y nvidia-container-toolkit

验证安装

bash 复制代码

nvidia-container-cli --version
nvidia-ctk --version

配置docker使用nvidia-runtime

bash 复制代码

nvidia-ctk runtime configure --runtime=docker
systemctl restart docker

固定内核

bash 复制代码

yum versionlock add kernel-3.10.0-1160.el7.x86_64
yum versionlock add kernel-core-3.10.0-1160.el7.x86_64
yum versionlock add kernel-modules-3.10.0-1160.el7.x86_64
echo "exclude=kernel*" >> /etc/yum.conf

CPU/GPU相关性能开启

bash 复制代码

# 持久化开启（开启Persistence Mode模式）
nvidia-smi -pm 1

# 开启ECC内存校验模式（重启后生效）
nvidia-smi -e ENABLED

# CPU锁频
yum install -y kernel-tools
cpupower idle-set -D 0
cpupower frequency-set -g performance
echo 'cpupower frequency-set -g performance' >> /etc/rc.local
chmod +x /etc/rc.d/rc.local

# GPU相关优化锁到最高频
nvidia-smi -lgc 1410,1410

# 关闭 PCIe ASPM（节能）
grubby --update-kernel=ALL --args="pcie_aspm=off"

部署HPC-X(https://developer.nvidia.com/networking/hpc-x 页面最下选择下载版本)

bash 复制代码

wget http://www.mellanox.com/page/hpcx_eula?mrequest=downloads&mtype=hpc&mver=hpc-x&mname=v2.18.1/hpcx-v2.18.1-gcc-inbox-redhat7-cuda12-x86_64.tbz

tar -xf hpcx-v2.18.1-gcc-inbox-redhat7-cuda12-x86_64.tbz -C /opt/

ln -s /opt/hpcx-v2.18.1-gcc-inbox-redhat7-cuda12-x86_64 /opt/hpcx

export HPCX_HOME=/opt/hpcx

. $HPCX_HOME/hpcx-init.sh

hpcx_load

nccl/gpu-burn测试

安装nccl(静态编译)

bash 复制代码

mkdir -p /root/nccl/ && cd /root/nccl
git clone https://github.com/NVIDIA/nccl.git
cd nccl
make -j 24 src.build CUDA_HOME=/usr/local/cuda PATH=$PATH:/usr/local/cuda/bin LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
# -j 并发参数（并行编译的任务数）

安装nccl-tests (静态编译)

bash 复制代码

mkdir -p /root/nccl/ && cd /root/nccl
git clone https://github.com/NVIDIA/nccl-tests.git
cd nccl-tests
which mpirun
# /opt/hpcx/ompi/bin/mpirun 截取 MPI_HOME=/opt/hpcx/ompi

cd /root/nccl/nccl-tests
PATH=$PATH:/usr/local/cuda/bin LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64 LIBRARY_PATH=$LIBRARY_PATH:/usr/local/cuda/lib64 make -j 30 CUDA_HOME=/usr/local/cuda NCCL_HOME=/root/nccl/nccl/build NCCL_LIBDIR=/root/nccl/nccl/build/lib NCCL_STATIC=1 NVCC_GENCODE="-gencode=arch=compute_80,code=sm_80"

nccl测试

bash 复制代码

export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/root/nccl/nccl/build/lib
./build/all_reduce_perf -b 8 -e 35G -f 2 -g 4 -n 50

测试参数

bash 复制代码

-b <大小>：起始大小（如 -b 8、-b 1M）
-e <大小>：结束大小（如 -e 10G）
-f <倍数>：每次乘以几倍（如 -f 2 表示翻倍）
-g <数量>：使用几个 GPU（如 -g 1、-g 4）
-n <次数>：测试迭代次数（如 -n 100，默认 20）

bash 复制代码

# 1. 单 GPU 测试（从 8 字节到 10GB，每次翻倍）
./build/all_reduce_perf -b 8 -e 10G -f 2 -g 1

# 2. 4 GPU 测试
./build/all_reduce_perf -b 8 -e 10G -f 2 -g 4

# 3. 测试更大数据量（35GB，4 GPU）
./build/all_reduce_perf -b 8 -e 35G -f 2 -g 4

# 4. 增加迭代次数，结果更稳定
./build/all_reduce_perf -b 8 -e 10G -f 2 -g 4 -n 100

# 5. 快速测试小数据范围
./build/all_reduce_perf -b 1M -e 1G -f 2 -g 4

gpu-burn

bash 复制代码

git clone https://github.com/wilicc/gpu-burn.git

编辑配置文件

bash 复制代码

cd gpu-burn
vi Makefile

gpu_burn: gpu_burn-drv.o compare.ptx
	g++ -o $@ $< -O3 ${LDFLAGS}

修改为

gpu_burn: gpu_burn-drv.o compare.ptx
	g++ -o $@ $< -O3 ${LDFLAGS} -static-libgcc -static-libstdc++

编译并测试

bash 复制代码

修改后进行编译，编译完成后拷贝到其他机器就可以直接使用了
yum install -y libstdc++-static
make clean
make
./gpu_burn 3600(测试时间，单位：秒)

模型部署相关

huggingface下载

bash 复制代码

apt-get -y install git-lfs
git lfs install
apt-get install python3 python-is-python3
python3 -m pip install --upgrade "pip==20.3.4" -i https://mirrors.aliyun.com/pypi/simple/
pip3.12 config set global.index-url https://pypi.org/simple/
pip3.12 install -U huggingface_hub --break-system-packages

huggingface登录

bash 复制代码

 huggingface-cli login
 # hf auth login
 # huggingface_hub 的最新版本（1.2.3）已经将 CLI 命令从 huggingface-cli 改为 hf。旧命令 huggingface-cli 在新版本中不再支持
⚠️  Warning: 'huggingface-cli login' is deprecated. Use 'hf auth login' instead.

    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible):
Add token as git credential? (Y/n) y
Token is valid (permission: fineGrained).
The token `deploy` has been saved to /root/.cache/huggingface/stored_tokens

[root@gnode196 ~]# git config --global credential.helper store
[root@gnode196 ~]# git config --global credential.helper
store