记K8S集群工作节点,AnolisOS 8.6部署显卡驱动集成Containerd运行时

1、安装gcc

#安装编译环境

yum -y install make gcc gcc-c++

2、下载显卡驱动

点击 直达连接

nvidia高级搜索下载历史版本驱动程序(下载历史版本驱动)

https://www.nvidia.cn/Download/Find.aspx?lang=cn

3、安装驱动

安装显卡驱动

 ./NVIDIA-Linux-x86_64-535.98.run  -m=kernel-open

4、修改系统参数,更新内核,重启服务器

rm -f /etc/modprobe.d/blacklist-nvidia-nouveau.conf /etc/modprobe.d/nvidia-unsupported-gpu.conf
echo blacklist nouveau | tee /etc/modprobe.d/blacklist-nvidia-nouveau.conf && \
	echo options nouveau modeset=0 | tee -a /etc/modprobe.d/blacklist-nvidia-nouveau.conf && \
	echo options nvidia NVreg_OpenRmEnableUnsupportedGpus=1 | tee /etc/modprobe.d/nvidia-unsupported-gpu.conf && \
	dracut --force && \
	 /sbin/reboot

5、检查驱动

执行nvidia-smi

Wed Aug 16 13:46:06 2023       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.98                 Driver Version: 535.98       CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|=========================================+======================+======================|
|   0  NVIDIA GeForce RTX 3090        Off | 00000000:13:00.0 Off |                  N/A |
| 32%   21C    P8               8W / 350W |      4MiB / 24576MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                                         
+---------------------------------------------------------------------------------------+
| Processes:                                                                            |
|  GPU   GI   CI        PID   Type   Process name                            GPU Memory |
|        ID   ID                                                             Usage      |
|=======================================================================================|
|  No running processes found                                                           |
+---------------------------------------------------------------------------------------+

6、安装nvidia-container-runtime

#安装源

curl -s -L https://nvidia.github.io/libnvidia-container/centos8/libnvidia-container.repo | sudo tee /etc/yum.repos.d/nvidia-container-toolkit.repo

#安装容器运行时

yum install -y nvidia-container-runtime

7、修改containerd配置文件

7.1、增加如下配置

  [plugins."io.containerd.runtime.v1.linux"]
    no_shim = false
    runtime = "nvidia-container-runtime"
    runtime_root = ""
    shim = "containerd-shim"
    shim_debug = false

7.2、修改container配置

修改前:runtime_type = "io.containerd.runc.v2" 
修改后:runtime_type = "io.containerd.runtime.v1.linux"

7.3、完整配置文件

[root@ai-4 containerd]# pwd
/etc/containerd
[root@ai-4 containerd]# cat config.toml
version = 2
root = "/var/lib/containerd"
state = "/run/containerd"
oom_score = 0

[grpc]
  address = "/run/containerd/containerd.sock"
  uid = 0
  gid = 0
  max_recv_message_size = 16777216
  max_send_message_size = 16777216

[debug]
  address = "/run/containerd/containerd-debug.sock"
  uid = 0
  gid = 0
  level = "warn"

[timeouts]
  "io.containerd.timeout.shim.cleanup" = "5s"
  "io.containerd.timeout.shim.load" = "5s"
  "io.containerd.timeout.shim.shutdown" = "3s"
  "io.containerd.timeout.task.state" = "2s"

[plugins]
  [plugins."io.containerd.grpc.v1.cri"]
    sandbox_image = "sealos.hub:5000/pause:3.2"
    max_container_log_line_size = -1
    max_concurrent_downloads = 20
    disable_apparmor = true
    [plugins."io.containerd.grpc.v1.cri".containerd]
      snapshotter = "overlayfs"
      default_runtime_name = "runc"
      [plugins."io.containerd.grpc.v1.cri".containerd.runtimes]
        [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc]
          runtime_type = "io.containerd.runtime.v1.linux"
          runtime_engine = ""
          runtime_root = ""
          [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc.options]
            SystemdCgroup = true
    [plugins."io.containerd.grpc.v1.cri".registry]
      config_path = "/etc/containerd/certs.d"
      [plugins."io.containerd.grpc.v1.cri".registry.configs]
          [plugins."io.containerd.grpc.v1.cri".registry.configs."sealos.hub:5000".auth]
            username = "admin"
            password = "***********"
  [plugins."io.containerd.runtime.v1.linux"]
    no_shim = false
    runtime = "nvidia-container-runtime"
    runtime_root = ""
    shim = "containerd-shim"
    shim_debug = false

8、测试containerd下显卡是否正常加载显卡

[root@ai-4 containerd]# ctr run --rm --gpus 0 docker.io/nvidia/cuda:11.0.3-base-ubuntu20.04 nvidia-smi nvidia-smi
Wed Aug 16 05:57:19 2023       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.98                 Driver Version: 535.98       CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|=========================================+======================+======================|
|   0  NVIDIA GeForce RTX 3090        Off | 00000000:13:00.0 Off |                  N/A |
| 32%   21C    P8               8W / 350W |      4MiB / 24576MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                                         
+---------------------------------------------------------------------------------------+
| Processes:                                                                            |
|  GPU   GI   CI        PID   Type   Process name                            GPU Memory |
|        ID   ID                                                             Usage      |
|=======================================================================================|
|  No running processes found                                                           |
+---------------------------------------------------------------------------------------+

9、K8S部署插件支持显卡(如果没有部署可通过如下命令部署,K8S Master上执行)

kubectl apply -f https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/v0.7.1/nvidia-device-plugin.yml

10、K8S检查对应节点是否有GPU资源

[root@k8s-master-17227100216 ~]# kubectl describe node node9 |grep gpu
                    gpu/type=nvidia
  nvidia.com/gpu:     1
  nvidia.com/gpu:     1
  nvidia.com/gpu     0           0

11、部署GPU测试容器

apiVersion: v1
kind: Pod
metadata:
  name: cuda-vector-add
spec:
  restartPolicy: OnFailure
  containers:
    - name: cuda-vector-add
      #image: "k8s.gcr.io/cuda-vector-add:v0.1"
      image: "docker.io/nvidia/cuda:11.0.3-base-ubuntu20.04"
      command:
      - nvidia-smi
      resources:
        limits:
          nvidia.com/gpu: 1
相关推荐
cominglately1 小时前
centos单机部署seata
linux·运维·centos
魏 无羡1 小时前
linux CentOS系统上卸载docker
linux·kubernetes·centos
CircleMouse1 小时前
Centos7, 使用yum工具,出现 Could not resolve host: mirrorlist.centos.org
linux·运维·服务器·centos
Karoku0662 小时前
【k8s集群应用】kubeadm1.20高可用部署(3master)
运维·docker·云原生·容器·kubernetes
木子Linux2 小时前
【Linux打怪升级记 | 问题01】安装Linux系统忘记设置时区怎么办?3个方法教你回到东八区
linux·运维·服务器·centos·云计算
mit6.8242 小时前
Ubuntu 系统下性能剖析工具: perf
linux·运维·ubuntu
鹏大师运维2 小时前
聊聊开源的虚拟化平台--PVE
linux·开源·虚拟化·虚拟机·pve·存储·nfs
watermelonoops2 小时前
Windows安装Ubuntu,Deepin三系统启动问题(XXX has invalid signature 您需要先加载内核)
linux·运维·ubuntu·deepin
阿甘知识库2 小时前
宝塔面板跨服务器数据同步教程:双机备份零停机
android·运维·服务器·备份·同步·宝塔面板·建站
滴水之功3 小时前
VMware OpenWrt怎么桥接模式联网
linux·openwrt