Project code: this analysis is based on the v2.8.0 branch source.
Background
HAMi-Device-Plugin replaces the default NVIDIA device plugin: it discovers and manages vGPU devices and serves as the data source for vGPU resource scheduling.
For the device plugin, we focus on the following methods:
- Register, ListAndWatch, and Allocate: the functional methods common to device plugins of every kind;
- RegisterInAnnotation: an auxiliary capability specific to HAMi-Device-Plugin.
In more detail:
- Register: registers the plugin with the kubelet; the ResourceName parameter matters most, since the kubelet later reports this resource to the cluster.
- RegisterInAnnotation: writes annotations onto the K8s Node for HAMi's scheduling logic to consume.
- ListAndWatch: discovers GPUs and reports them.
- Allocate: assigns GPU devices to containers.
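For orientation, these methods map onto the kubelet's device-plugin gRPC API. Below is the server interface, abridged from k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1. Note that Register is not part of this interface: it is a call the plugin makes against the kubelet's separate Registration service, and RegisterInAnnotation is purely HAMi's own addition.

```go
// Abridged from the generated gRPC code in
// k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1.
type DevicePluginServer interface {
	// GetDevicePluginOptions returns options to be communicated with Device Manager.
	GetDevicePluginOptions(context.Context, *Empty) (*DevicePluginOptions, error)
	// ListAndWatch streams the device list and re-sends it on any state change.
	ListAndWatch(*Empty, DevicePlugin_ListAndWatchServer) error
	// GetPreferredAllocation returns a preferred set of devices to allocate from a list of available ones.
	GetPreferredAllocation(context.Context, *PreferredAllocationRequest) (*PreferredAllocationResponse, error)
	// Allocate is called during container creation so that the plugin can run
	// device-specific operations and instruct the kubelet on container setup.
	Allocate(context.Context, *AllocateRequest) (*AllocateResponse, error)
	// PreStartContainer is called, if indicated by the plugin, before each container start.
	PreStartContainer(context.Context, *PreStartContainerRequest) (*PreStartContainerResponse, error)
}
```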
Register
Starts the gRPC server and registers the plugin with the kubelet.
After cmd/device-plugin/nvidia/main.go starts, it runs start -> startPlugins:
```go
func startPlugins(c *cli.Context, o *options) ([]plugin.Interface, bool, error) {
	// ... something before
	started := 0
	for _, p := range plugins {
		// Just continue if there are no devices to serve for plugin p.
		if len(p.Devices()) == 0 {
			continue
		}
		// Start the gRPC server for plugin p and connect it with the kubelet.
		if err := p.Start(o.kubeletSocket); err != nil {
			klog.Errorf("Failed to start plugin: %v", err)
			return plugins, true, nil
		}
		started++
	}
	// ... something after
}
```
The key step is the call to p.Start(o.kubeletSocket), which triggers the subsequent Register flow.
The o.kubeletSocket parameter defaults to /var/lib/kubelet/device-plugins/kubelet.sock and is filled in by CLI flag parsing:
```go
&cli.StringFlag{
	Name:        "kubelet-socket",
	Value:       kubeletdevicepluginv1beta1.KubeletSocket,
	Usage:       "specify the socket for communicating with the kubelet; if this is empty, no connection with the kubelet is attempted",
	Destination: &o.kubeletSocket,
	EnvVars:     []string{"KUBELET_SOCKET"},
},
```
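For reference, the default socket path is defined alongside the API version in the v1beta1 constants (abridged from k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1):

```go
// Abridged constants from k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1.
const (
	Version          = "v1beta1"
	DevicePluginPath = "/var/lib/kubelet/device-plugins/"
	KubeletSocket    = DevicePluginPath + "kubelet.sock"
)
```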
The actual gRPC server startup and registration live on the pkg side; the cmd layer only invokes that abstraction.
When hami-device-plugin starts its gRPC server, it registers with the kubelet and reports its resource name (ResourceName):
```go
// Start starts the gRPC server, registers the device plugin with the Kubelet,
// and starts the device healthchecks.
func (plugin *NvidiaDevicePlugin) Start(kubeletSocket string) error {
	// ...
	// 1. Start the gRPC server.
	err = plugin.Serve()
	// ...
	// 2. Register with the kubelet.
	err = plugin.Register(kubeletSocket)
	// ...
}

// plugin.Register(kubeletSocket)
func (plugin *NvidiaDevicePlugin) Register(kubeletSocket string) error {
	if kubeletSocket == "" {
		klog.Info("Skipping registration with Kubelet")
		return nil
	}
	conn, err := plugin.dial(kubeletSocket, 5*time.Second)
	if err != nil {
		return err
	}
	defer conn.Close()
	client := kubeletdevicepluginv1beta1.NewRegistrationClient(conn)
	// Key point: report the ResourceName.
	reqt := &kubeletdevicepluginv1beta1.RegisterRequest{
		Version:      kubeletdevicepluginv1beta1.Version,
		Endpoint:     path.Base(plugin.socket),
		ResourceName: string(plugin.rm.Resource()),
		Options: &kubeletdevicepluginv1beta1.DevicePluginOptions{
			GetPreferredAllocationAvailable: false,
		},
	}
	_, err = client.Register(context.Background(), reqt)
	if err != nil {
		return err
	}
	return nil
}
```
In addition, HAMi-Device-Plugin watches the kubelet socket; if it sees the socket being recreated, it restarts the whole plugin and runs Serve() -> Register() again:
```go
func start(c *cli.Context, o *options) error {
	kubeletSocketDir := filepath.Dir(o.kubeletSocket)
	watcher, err := watch.Files(kubeletSocketDir)
	// ... (event loop, abridged)
	select {
	case event := <-watcher.Events:
		if o.kubeletSocket != "" && event.Name == o.kubeletSocket && event.Op&fsnotify.Create == fsnotify.Create {
			klog.Infof("inotify: %s created, restarting.", o.kubeletSocket)
			goto restart
		}
	}
	// ...
}
```
RegisterInAnnotation
Writes the annotations that HAMi-Scheduler needs for scheduling onto the K8s Node.
After HAMi-Device-Plugin's gRPC Serve and kubelet registration complete, a goroutine is started to run WatchAndRegister, which is the entry point for RegisterInAnnotation:
```go
// Start starts the gRPC server, registers the device plugin with the Kubelet,
// and starts the device healthchecks.
func (plugin *NvidiaDevicePlugin) Start(kubeletSocket string) error {
	plugin.initialize()
	// ...
	go func() {
		plugin.WatchAndRegister(plugin.disableWatchAndRegister, plugin.ackDisableWatchAndRegister)
	}()
	if deviceSupportMig {
		plugin.ApplyMigTemplate()
	}
	return nil
}
```
WatchAndRegister then calls RegisterInAnnotation in a loop:
```go
func (plugin *NvidiaDevicePlugin) WatchAndRegister(disableNVML <-chan bool, ackDisableWatchAndRegister chan<- bool) {
	klog.Info("Starting WatchAndRegister")
	errorSleepInterval := time.Second * 5
	successSleepInterval := time.Second * 30
	var disableWatchAndRegister bool
	for {
		select {
		case disable := <-disableNVML:
			if disable {
				// when received disableNVML signal, stop the watch and register all the time
				klog.Info("Received disableNVML signal, stopping WatchAndRegister")
				disableWatchAndRegister = true
			} else {
				// when received enableNVML signal, start the watch and register again
				klog.Info("Received enableNVML signal, resuming WatchAndRegister")
				disableWatchAndRegister = false
			}
		default:
		}
		if disableWatchAndRegister {
			klog.Info("WatchAndRegister is disabled by disableWatchAndRegister signal, sleep a success interval")
			ackDisableWatchAndRegister <- true
			time.Sleep(successSleepInterval)
			continue
		}
		// Do the registration.
		err := plugin.RegisterInAnnotation()
		if err != nil {
			klog.Errorf("Failed to register annotation: %v", err)
			klog.Infof("Retrying in %v seconds...", errorSleepInterval)
			time.Sleep(errorSleepInterval)
		} else {
			klog.Infof("Successfully registered annotation. Next check in %v seconds...", successSleepInterval)
			time.Sleep(successSleepInterval)
		}
	}
}
```
The RegisterInAnnotation implementation:
```go
func (plugin *NvidiaDevicePlugin) RegisterInAnnotation() error {
	// 1. Collect the data source for the hami.io/node-nvidia-register annotation.
	devices := plugin.getAPIDevices()
	klog.InfoS("start working on the devices", "devices", devices)
	annos := make(map[string]string)
	node, err := util.GetNode(util.NodeName)
	if err != nil {
		klog.Errorln("get node error", err.Error())
		return err
	}
	encodeddevices := device.MarshalNodeDevices(*devices)
	if encodeddevices == plugin.deviceCache {
		return nil
	}
	plugin.deviceCache = encodeddevices
	var data []byte
	if os.Getenv("ENABLE_TOPOLOGY_SCORE") == "true" {
		// 2. Collect the data source for the hami.io/node-nvidia-score annotation.
		gpuScore, err := nvidia.CalculateGPUScore(device.GetDevicesUUIDList(*devices))
		if err != nil {
			klog.ErrorS(err, "calculate gpu topo score error")
			return err
		}
		data, err = json.Marshal(gpuScore)
		if err != nil {
			klog.ErrorS(err, "marshal gpu score error.")
			return err
		}
	}
	klog.V(4).InfoS("patch nvidia topo score to node", "hami.io/node-nvidia-score", string(data))
	annos[nvidia.RegisterAnnos] = encodeddevices
	if len(data) > 0 {
		annos[nvidia.RegisterGPUPairScore] = string(data)
	}
	klog.Infof("patch node with the following annos %v", fmt.Sprintf("%v", annos))
	// Talks to kube-apiserver directly instead of going through the
	// traditional device-plugin reporting path.
	err = util.PatchNodeAnnotations(node, annos)
	if err != nil {
		klog.Errorln("patch node error", err.Error())
	}
	return err
}
```
What RegisterInAnnotation() does:
- Writes the node's GPU inventory to hami.io/node-nvidia-register:
```yaml
hami.io/node-nvidia-register: '[{"id":"GPU-******-****-****-********","count":4,"devmem":15360,"devcore":100,"type":"NVIDIA-Tesla T4","numa":1,"mode":"hami-core","health":true,"devicepairscore":{}}]'
```
- Writes the GPU topology-affinity scores to hami.io/node-nvidia-score:
```json
[ { "uuid": "GPU-0", "score": { "GPU-1": 200, "GPU-2": 100 } } ]
```
- Frees the HAMi scheduler from depending on the kubelet's allocatable details, letting it schedule and allocate based on finer-grained GPU information.
- util.PatchNodeAnnotations: talks to kube-apiserver directly instead of going through the traditional device-plugin reporting path. A traditional device plugin only reports resource totals, while HAMi scheduling needs finer-grained information: whether a single card's remaining memory/cores can satisfy an allocation, whether NUMA matches, and whether a multi-card topology score makes another node a better fit.
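HAMi's util.PatchNodeAnnotations is not reproduced here, but patching node annotations against kube-apiserver typically reduces to something like this minimal client-go sketch (the function name and shape are mine, not HAMi's):

```go
package main

import (
	"context"
	"encoding/json"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/types"
	"k8s.io/client-go/kubernetes"
)

// patchNodeAnnotations merges the given annotations onto a node with a
// strategic merge patch. A minimal sketch, not HAMi's implementation.
func patchNodeAnnotations(cs kubernetes.Interface, nodeName string, annos map[string]string) error {
	body, err := json.Marshal(map[string]any{
		"metadata": map[string]any{"annotations": annos},
	})
	if err != nil {
		return err
	}
	_, err = cs.CoreV1().Nodes().Patch(context.Background(), nodeName,
		types.StrategicMergePatchType, body, metav1.PatchOptions{})
	return err
}
```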
ListAndWatch
A standard interface method that the kubelet calls over gRPC to fetch device information.
Once HAMi-Device-Plugin has started its own gRPC server, the kubelet can invoke the standard device-plugin methods over gRPC to learn the device state:
```go
// ListAndWatch lists devices and update that list according to the health status
func (plugin *NvidiaDevicePlugin) ListAndWatch(e *kubeletdevicepluginv1beta1.Empty, s kubeletdevicepluginv1beta1.DevicePlugin_ListAndWatchServer) error {
	s.Send(&kubeletdevicepluginv1beta1.ListAndWatchResponse{Devices: plugin.apiDevices()})
	for {
		select {
		case <-plugin.stop:
			return nil
		case d := <-plugin.health:
			// FIXME: there is no way to recover from the Unhealthy state.
			d.Health = kubeletdevicepluginv1beta1.Unhealthy
			klog.Infof("'%s' device marked unhealthy: %s", plugin.rm.Resource(), d.ID)
			s.Send(&kubeletdevicepluginv1beta1.ListAndWatchResponse{Devices: plugin.apiDevices()})
		}
	}
}
```
It returns the current device information to the kubelet, for example:
- device ID
- device health
- device topology
When the kubelet later calls Allocate, all it needs to express is which GPU device goes to which container.
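For reference, here is how a single device entry is built with the v1beta1 types; the uuid and numaNode values are made-up illustrations (HAMi advertises each physical GPU several times, one ID per shareable slot, as the checkpoint example in the FAQ suggests):

```go
import (
	kubeletdevicepluginv1beta1 "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1"
)

// buildDevice sketches the Device message that ListAndWatch streams to the
// kubelet: an opaque ID, a health string, and optional NUMA topology hints.
func buildDevice(uuid string, numaNode int64) *kubeletdevicepluginv1beta1.Device {
	return &kubeletdevicepluginv1beta1.Device{
		ID:     uuid, // e.g. "GPU-f49304b4-...-2"
		Health: kubeletdevicepluginv1beta1.Healthy,
		Topology: &kubeletdevicepluginv1beta1.TopologyInfo{
			Nodes: []*kubeletdevicepluginv1beta1.NUMANode{{ID: numaNode}},
		},
	}
}
```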
Allocate
HAMi-Device-Plugin's Allocate() parses the annotations patched by HAMi-Scheduler and uses them as the basis for assigning GPU devices. The kubelet then persists the result, keyed by PodUID + ContainerName, in /var/lib/kubelet/device-plugins/kubelet_internal_checkpoint.
Overall flow (a hedged AllocateResponse sketch follows this list):
- Hardware allocation
  - Where HAMi-Scheduler writes ahead of time: Pod annotations.
  - The key annotation Allocate() relies on: hami.io/vgpu-devices-to-allocate (see Detail 1 below).
  - How Allocate() parses it:
    - first find the current Pod via GetPendingPod();
    - then use GetNextDeviceRequest() to pick, from the annotation, the container to handle next;
    - return the matching env/mount/device information to the kubelet, which records the final ownership as PodUID + ContainerName + ResourceName.
- CUDA hijacking
  - Inject the environment variables that enable the hijack and cap memory/compute: CUDA_DEVICE_MEMORY_LIMIT_X and CUDA_DEVICE_SM_LIMIT.
  - Mount libvgpu.so into the Pod at /usr/local/vgpu/libvgpu.so.
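A hedged sketch of what the per-container part of such an AllocateResponse could look like with the v1beta1 types; the device UUID, paths, and limit values are illustrative assumptions, not HAMi's exact output:

```go
import (
	kubeletdevicepluginv1beta1 "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1"
)

// vgpuContainerResponse sketches the per-container allocation result:
// hijack environment variables plus the libvgpu.so mount. Illustrative only.
func vgpuContainerResponse() *kubeletdevicepluginv1beta1.ContainerAllocateResponse {
	return &kubeletdevicepluginv1beta1.ContainerAllocateResponse{
		Envs: map[string]string{
			"NVIDIA_VISIBLE_DEVICES":     "GPU-f49304b4-45f4-1c78-6bd9-bb970575182e", // device chosen from the annotation
			"CUDA_DEVICE_MEMORY_LIMIT_0": "7680m",                                    // per-device memory cap (assumed format)
			"CUDA_DEVICE_SM_LIMIT":       "50",                                       // compute (SM) percentage cap
		},
		Mounts: []*kubeletdevicepluginv1beta1.Mount{{
			ContainerPath: "/usr/local/vgpu/libvgpu.so",
			HostPath:      "/usr/local/vgpu/libvgpu.so", // host path is an assumption
			ReadOnly:      true,
		}},
	}
}
```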
Two details:
Detail 1
- hami.io/vgpu-devices-to-allocate is written by HAMi-Scheduler; after the kubelet's gRPC call to HAMi-Device-Plugin's Allocate succeeds, its entries are erased one by one.
- hami.io/vgpu-devices-allocated is written by HAMi-Scheduler and never changes afterwards.
Detail 2
- Observed in production: even if the image overrides the injected LD_PRELOAD value after the container starts, the CUDA hijack is unaffected.
libvgpu.so can be loaded via /etc/ld.so.preload instead of the LD_PRELOAD environment variable. (I even suspect the latest HAMi code has dropped the LD_PRELOAD injection entirely and relies on the global /etc/ld.so.preload fallback; this needs verification, but at least I did not find the injection in the code.)
```shell
# Check whether the file exists inside the container
cat /etc/ld.so.preload
# Output: /usr/local/vgpu/libvgpu.so
```
The Linux dynamic linker ld.so follows this order every time a process starts:
```text
process starts
      ↓
ld.so reads /etc/ld.so.preload     ← step 1: file level, cannot be bypassed
      ↓
ld.so reads the LD_PRELOAD env var ← step 2: can be overridden
      ↓
normal dependency loading
```
So even if the container image sets LD_PRELOAD to something else, the libvgpu.so listed in /etc/ld.so.preload is still force-loaded, and the CUDA hijack is unaffected. This is also why Allocate mounts the preload file unless CUDA_DISABLE_CONTROL is explicitly set to true:
```go
for _, val := range currentCtr.Env {
	if strings.Compare(val.Name, "CUDA_DISABLE_CONTROL") == 0 {
		// if env existed but is set to false or can not be parsed, ignore
		t, _ := strconv.ParseBool(val.Value)
		if !t {
			continue
		}
		// only env existed and set to true, we mark it "found"
		found = true
		break
	}
}
if !found {
	response.Mounts = append(response.Mounts, &kubeletdevicepluginv1beta1.Mount{
		ContainerPath: "/etc/ld.so.preload",
		HostPath:      hostHookPath + "/vgpu/ld.so.preload",
		ReadOnly:      true,
	})
}
```
Worked example
- hami.io/bind-phase: success
- hami.io/bind-time: 1776156519 [written by HAMi-Scheduler, never changed]
- hami.io/vgpu-devices-allocated: GPU-f49304b4-45f4-1c78-6bd9-bb970575182e,NVIDIA,7680,50:; [written by HAMi-Scheduler, never changed; devices within one container are separated by ":", containers by ";"; this demo is a single-Pod, single-container case]
- hami.io/vgpu-devices-to-allocate: ; [initialized by HAMi-Scheduler, then erased by HAMi-Device-Plugin]
- hami.io/vgpu-node: svr30187hw1288 [written by HAMi-Scheduler, never changed]
- hami.io/vgpu-time: 1776156519 [written by HAMi-Scheduler, never changed]
These are the annotations of a successfully allocated Pod; they make the allocation result in the source code easier to picture. A hedged parsing sketch of the vgpu-devices-allocated value follows.
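Going by the separators described above, here is how such an annotation value can be parsed (illustrative; not HAMi's own decoder, whose helper names differ):

```go
package main

import (
	"fmt"
	"strconv"
	"strings"
)

// deviceAlloc is an illustrative struct for one "UUID,Type,Memory,Cores" entry.
type deviceAlloc struct {
	UUID   string
	Type   string
	Devmem int // device memory in MiB
	Cores  int // core percentage
}

// parseAllocated splits containers on ";" and a container's devices on ":".
func parseAllocated(anno string) ([][]deviceAlloc, error) {
	var pod [][]deviceAlloc
	for _, ctr := range strings.Split(strings.TrimSuffix(anno, ";"), ";") {
		var devs []deviceAlloc
		for _, dev := range strings.Split(strings.TrimSuffix(ctr, ":"), ":") {
			if dev == "" {
				continue
			}
			parts := strings.Split(dev, ",")
			if len(parts) != 4 {
				return nil, fmt.Errorf("malformed device entry %q", dev)
			}
			mem, _ := strconv.Atoi(parts[2])
			cores, _ := strconv.Atoi(parts[3])
			devs = append(devs, deviceAlloc{UUID: parts[0], Type: parts[1], Devmem: mem, Cores: cores})
		}
		pod = append(pod, devs)
	}
	return pod, nil
}

func main() {
	res, _ := parseAllocated("GPU-f49304b4-45f4-1c78-6bd9-bb970575182e,NVIDIA,7680,50:;")
	fmt.Printf("%+v\n", res) // [[{UUID:GPU-... Type:NVIDIA Devmem:7680 Cores:50}]]
}
```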
After HAMi-Scheduler schedules, it writes:
hami.io/bind-time: 1776156519
After HAMi-Device-Plugin allocates (Allocate):
hami.io/bind-phase: allocating is updated to success or failed
FAQ
Why does the device plugin allocate GPU devices at the container level?
Evidence
On the host, look at the contents of /var/lib/kubelet/device-plugins/kubelet_internal_checkpoint:
```json
{"Data":{"PodDeviceEntries":[{"PodUID":"11ee8cca-fef9-4191-9724-bed354bb2883","ContainerName":"aitraining","ResourceName":"cloud.ctrip.com/ip","DeviceIDs":{"-1":["svr30187hw1288-bdab8990-c242-48a1-99ac-4593bff1b46c"]},"AllocResp":"CkgKEUNUUklQX05JQ19ERVZJQ0VTEjNzdnIzMDE4N2h3MTI4OC1iZGFiODk5MC1jMjQyLTQ4YTEtOTlhYy00NTkzYmZmMWI0NmM="},{"PodUID":"11ee8cca-fef9-4191-9724-bed354bb2883","ContainerName":"aitraining","ResourceName":"nvidia.com/gpu","DeviceIDs":{"-1":["GPU-f49304b4-45f4-1c78-6bd9-bb970575182e-2"]},"AllocResp":"CiMlbG9hZBgB"},{"PodUID":"9dc762f1-0576-445c-8fb4-3d1a9e45d675","ContainerName":"aitraining","ResourceName":"nvidia.com/gpu","DeviceIDs":{"-1":["GPU-f49304b4-45f4-1c78-6bd9-bb970575182e-1"]},"AllocResp":"CkIKFk8ucHJlbG9hZBgB"},{"PodUID":"9dc762f1-0576-445c-8fb4-3d1a9e45d675","ContainerName":"aitraining","ResourceName":"cloud.ctrip.com/ip","DeviceIDs":{"-1":["svr30187hw1288-a0bf2bb5-df36-4dd0-a338-4a0579599128"]},"AllocResp":"CkgKEUNUUklQX05JQ19ERVZJQ0VTEjNzdnIzMDE4N2h3MTI4OC1hMGJmMmJiNS1kZjM2LTRkZDAtYTMzOC00YTA1Nzk1OTkxMjg="},{"PodUID":"d8e10140-03e5-4d9e-8c02-84e03a541a4c","ContainerName":"querynode","ResourceName":"cloud.ctrip.com/ip","DeviceIDs":{"-1":["svr30187hw1288-fed5a117-5c4f-4c1d-8900-2a0f3a9096a4"]},"AllocResp":"CkgKEUNUUklQX05JQ19ERVZJQ0VTEjNzdnIzMDE4N2h3MTI4OC1mZWQ1YTExNy01YzRmLTRjMWQtODkwMC0yYTBmM2E5MDk2YTQ="}],"RegisteredDevices":{"cloud.ctrip.com/ip":["svr30187hw1288-744c731a-0fac-492e-b534-d93f16e50a46","svr30187hw1288-05da848b-c2c2-4703-9989-ac92c252f32c"],"nvidia.com/gpu":["GPU-f49304b4-45f4-1c78-6bd9-bb970575182e-0","GPU-f49304b4-45f4-1c78-6bd9-bb970575182e-1","GPU-f49304b4-45f4-1c78-6bd9-bb970575182e-2","GPU-f49304b4-45f4-1c78-6bd9-bb970575182e-3"]}},"Checksum":451059619}
```
Rationale
Recall the GPU device mount chain in a container environment and the answer becomes obvious. After docker run -it --gpus all ... starts, docker/containerd builds the container's runtime bundle (runtime spec + rootfs). Then nvidia-container-runtime / nvidia-container-runtime-hook decides whether the container needs GPU devices and, if so, adds the GPU devices, the GPU driver libraries (.so files), and other dependency descriptions to the runtime bundle. Finally, runc performs the actual create/start work. Device and library injection therefore happens per container, so the kubelet has to track GPU ownership at the same granularity.