1、安装gcc
#安装编译环境
yum -y install make gcc gcc-c++
2、下载显卡驱动
点击下方链接直达NVIDIA驱动程序高级搜索页面(可下载历史版本驱动):
https://www.nvidia.cn/Download/Find.aspx?lang=cn
3、安装驱动
#安装显卡驱动(如无执行权限,先执行 chmod +x NVIDIA-Linux-x86_64-535.98.run;-m=kernel-open 表示安装开源内核模块)
./NVIDIA-Linux-x86_64-535.98.run -m=kernel-open
4、修改系统参数(禁用nouveau,并允许开源内核模块支持GeForce等显卡),重建initramfs,重启服务器
rm -f /etc/modprobe.d/blacklist-nvidia-nouveau.conf /etc/modprobe.d/nvidia-unsupported-gpu.conf
echo blacklist nouveau | tee /etc/modprobe.d/blacklist-nvidia-nouveau.conf && \
echo options nouveau modeset=0 | tee -a /etc/modprobe.d/blacklist-nvidia-nouveau.conf && \
echo options nvidia NVreg_OpenRmEnableUnsupportedGpus=1 | tee /etc/modprobe.d/nvidia-unsupported-gpu.conf && \
dracut --force && \
/sbin/reboot
5、检查驱动
执行nvidia-smi
Wed Aug 16 13:46:06 2023
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.98                 Driver Version: 535.98       CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|=========================================+======================+======================|
|   0  NVIDIA GeForce RTX 3090        Off | 00000000:13:00.0 Off |                  N/A |
| 32%   21C    P8               8W / 350W |      4MiB / 24576MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
+---------------------------------------------------------------------------------------+
| Processes:                                                                            |
|  GPU   GI   CI        PID   Type   Process name                            GPU Memory |
|        ID   ID                                                             Usage      |
|=======================================================================================|
|  No running processes found                                                           |
+---------------------------------------------------------------------------------------+
6、安装nvidia-container-runtime
#安装源
curl -s -L https://nvidia.github.io/libnvidia-container/centos8/libnvidia-container.repo | sudo tee /etc/yum.repos.d/nvidia-container-toolkit.repo
#安装容器运行时
yum install -y nvidia-container-runtime
7、修改containerd配置文件
7.1、增加如下配置
[plugins."io.containerd.runtime.v1.linux"]
no_shim = false
runtime = "nvidia-container-runtime"
runtime_root = ""
shim = "containerd-shim"
shim_debug = false
7.2、修改containerd中runc运行时的runtime_type配置
修改前:runtime_type = "io.containerd.runc.v2"
修改后:runtime_type = "io.containerd.runtime.v1.linux"
7.3、完整配置文件
[root@ai-4 containerd]# pwd
/etc/containerd
[root@ai-4 containerd]# cat config.toml
version = 2
root = "/var/lib/containerd"
state = "/run/containerd"
oom_score = 0
[grpc]
address = "/run/containerd/containerd.sock"
uid = 0
gid = 0
max_recv_message_size = 16777216
max_send_message_size = 16777216
[debug]
address = "/run/containerd/containerd-debug.sock"
uid = 0
gid = 0
level = "warn"
[timeouts]
"io.containerd.timeout.shim.cleanup" = "5s"
"io.containerd.timeout.shim.load" = "5s"
"io.containerd.timeout.shim.shutdown" = "3s"
"io.containerd.timeout.task.state" = "2s"
[plugins]
[plugins."io.containerd.grpc.v1.cri"]
sandbox_image = "sealos.hub:5000/pause:3.2"
max_container_log_line_size = -1
max_concurrent_downloads = 20
disable_apparmor = true
[plugins."io.containerd.grpc.v1.cri".containerd]
snapshotter = "overlayfs"
default_runtime_name = "runc"
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes]
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc]
runtime_type = "io.containerd.runtime.v1.linux"
runtime_engine = ""
runtime_root = ""
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc.options]
SystemdCgroup = true
[plugins."io.containerd.grpc.v1.cri".registry]
config_path = "/etc/containerd/certs.d"
[plugins."io.containerd.grpc.v1.cri".registry.configs]
[plugins."io.containerd.grpc.v1.cri".registry.configs."sealos.hub:5000".auth]
username = "admin"
password = "***********"
[plugins."io.containerd.runtime.v1.linux"]
no_shim = false
runtime = "nvidia-container-runtime"
runtime_root = ""
shim = "containerd-shim"
shim_debug = false
8、测试containerd下是否能正常加载显卡
[root@ai-4 containerd]# ctr run --rm --gpus 0 docker.io/nvidia/cuda:11.0.3-base-ubuntu20.04 nvidia-smi nvidia-smi
Wed Aug 16 05:57:19 2023
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.98                 Driver Version: 535.98       CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|=========================================+======================+======================|
|   0  NVIDIA GeForce RTX 3090        Off | 00000000:13:00.0 Off |                  N/A |
| 32%   21C    P8               8W / 350W |      4MiB / 24576MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
+---------------------------------------------------------------------------------------+
| Processes:                                                                            |
|  GPU   GI   CI        PID   Type   Process name                            GPU Memory |
|        ID   ID                                                             Usage      |
|=======================================================================================|
|  No running processes found                                                           |
+---------------------------------------------------------------------------------------+
9、在K8S中部署nvidia-device-plugin插件以支持显卡(如尚未部署,可在K8S Master上执行如下命令部署)
kubectl apply -f https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/v0.7.1/nvidia-device-plugin.yml
10、K8S检查对应节点是否有GPU资源
[root@k8s-master-17227100216 ~]# kubectl describe node node9 |grep gpu
gpu/type=nvidia
nvidia.com/gpu: 1
nvidia.com/gpu: 1
nvidia.com/gpu 0 0
11、部署GPU测试容器
apiVersion: v1
kind: Pod
metadata:
  name: cuda-vector-add
spec:
  restartPolicy: OnFailure
  containers:
    - name: cuda-vector-add
      #image: "k8s.gcr.io/cuda-vector-add:v0.1"
      image: "docker.io/nvidia/cuda:11.0.3-base-ubuntu20.04"
      command:
        - nvidia-smi
      resources:
        limits:
          nvidia.com/gpu: 1