Scheduling Pods by Actual Resource Usage in Kubernetes, in About 100 Lines of Code

This article is a quick, hands-on build of a scheduler that places Pods based on actual resource usage. It skips most of the underlying theory and aims to stay easy to follow.

All code is on GitHub: github.com/huangjc7/sc...

Pitfalls and their fixes are covered at the end.

1. Introduction

1.1 Versions

Built against Kubernetes 1.28.5; the k8s.io libraries are pinned to v0.28.5.

Go version: 1.22

The go.mod file is below:

go
module github.com/huangjc7/scheduler-demo

go 1.22.0

require (
	github.com/prometheus/client_golang v1.18.0
	github.com/prometheus/common v0.45.0
	github.com/spf13/pflag v1.0.5
	k8s.io/api v0.28.5
	k8s.io/apimachinery v0.28.5
	k8s.io/component-base v0.29.2
	k8s.io/klog/v2 v2.100.1
	k8s.io/kubernetes v1.28.5
)

require (
	github.com/Azure/go-ansiterm v0.0.0-20210617225240-d185dfc1b5a1 // indirect
	github.com/NYTimes/gziphandler v1.1.1 // indirect
	github.com/antlr/antlr4/runtime/Go/antlr/v4 v4.0.0-20230305170008-8188dc5388df // indirect
	github.com/asaskevich/govalidator v0.0.0-20190424111038-f61b66f89f4a // indirect
	github.com/beorn7/perks v1.0.1 // indirect
	github.com/blang/semver/v4 v4.0.0 // indirect
	github.com/cenkalti/backoff/v4 v4.2.1 // indirect
	github.com/cespare/xxhash/v2 v2.2.0 // indirect
	github.com/coreos/go-semver v0.3.1 // indirect
	github.com/coreos/go-systemd/v22 v22.5.0 // indirect
	github.com/davecgh/go-spew v1.1.1 // indirect
	github.com/docker/distribution v2.8.2+incompatible // indirect
	github.com/emicklei/go-restful/v3 v3.9.0 // indirect
	github.com/evanphx/json-patch v4.12.0+incompatible // indirect
	github.com/felixge/httpsnoop v1.0.3 // indirect
	github.com/fsnotify/fsnotify v1.6.0 // indirect
	github.com/go-logr/logr v1.2.4 // indirect
	github.com/go-logr/stdr v1.2.2 // indirect
	github.com/go-openapi/jsonpointer v0.19.6 // indirect
	github.com/go-openapi/jsonreference v0.20.2 // indirect
	github.com/go-openapi/swag v0.22.3 // indirect
	github.com/gogo/protobuf v1.3.2 // indirect
	github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect
	github.com/golang/protobuf v1.5.3 // indirect
	github.com/google/cel-go v0.16.1 // indirect
	github.com/google/gnostic-models v0.6.8 // indirect
	github.com/google/go-cmp v0.6.0 // indirect
	github.com/google/gofuzz v1.2.0 // indirect
	github.com/google/uuid v1.3.0 // indirect
	github.com/grpc-ecosystem/go-grpc-prometheus v1.2.0 // indirect
	github.com/grpc-ecosystem/grpc-gateway/v2 v2.7.0 // indirect
	github.com/imdario/mergo v0.3.6 // indirect
	github.com/inconshreveable/mousetrap v1.1.0 // indirect
	github.com/josharian/intern v1.0.0 // indirect
	github.com/json-iterator/go v1.1.12 // indirect
	github.com/mailru/easyjson v0.7.7 // indirect
	github.com/matttproud/golang_protobuf_extensions/v2 v2.0.0 // indirect
	github.com/moby/sys/mountinfo v0.6.2 // indirect
	github.com/moby/term v0.0.0-20221205130635-1aeaba878587 // indirect
	github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
	github.com/modern-go/reflect2 v1.0.2 // indirect
	github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
	github.com/opencontainers/go-digest v1.0.0 // indirect
	github.com/opencontainers/selinux v1.10.0 // indirect
	github.com/pkg/errors v0.9.1 // indirect
	github.com/prometheus/client_model v0.6.0 // indirect
	github.com/prometheus/procfs v0.12.0 // indirect
	github.com/spf13/cobra v1.7.0 // indirect
	github.com/stoewer/go-strcase v1.2.0 // indirect
	go.etcd.io/etcd/api/v3 v3.5.9 // indirect
	go.etcd.io/etcd/client/pkg/v3 v3.5.9 // indirect
	go.etcd.io/etcd/client/v3 v3.5.9 // indirect
	go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.35.0 // indirect
	go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.35.1 // indirect
	go.opentelemetry.io/otel v1.10.0 // indirect
	go.opentelemetry.io/otel/exporters/otlp/internal/retry v1.10.0 // indirect
	go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.10.0 // indirect
	go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.10.0 // indirect
	go.opentelemetry.io/otel/metric v0.31.0 // indirect
	go.opentelemetry.io/otel/sdk v1.10.0 // indirect
	go.opentelemetry.io/otel/trace v1.10.0 // indirect
	go.opentelemetry.io/proto/otlp v0.19.0 // indirect
	go.uber.org/atomic v1.10.0 // indirect
	go.uber.org/multierr v1.11.0 // indirect
	go.uber.org/zap v1.19.0 // indirect
	golang.org/x/crypto v0.19.0 // indirect
	golang.org/x/exp v0.0.0-20220722155223-a9213eeb770e // indirect
	golang.org/x/net v0.21.0 // indirect
	golang.org/x/oauth2 v0.17.0 // indirect
	golang.org/x/sync v0.3.0 // indirect
	golang.org/x/sys v0.17.0 // indirect
	golang.org/x/term v0.17.0 // indirect
	golang.org/x/text v0.14.0 // indirect
	golang.org/x/time v0.3.0 // indirect
	google.golang.org/appengine v1.6.7 // indirect
	google.golang.org/genproto v0.0.0-20230526161137-0005af68ea54 // indirect
	google.golang.org/genproto/googleapis/api v0.0.0-20230525234035-dd9d682886f9 // indirect
	google.golang.org/genproto/googleapis/rpc v0.0.0-20230525234030-28d5490b6b19 // indirect
	google.golang.org/grpc v1.56.3 // indirect
	google.golang.org/protobuf v1.32.0 // indirect
	gopkg.in/inf.v0 v0.9.1 // indirect
	gopkg.in/natefinch/lumberjack.v2 v2.2.1 // indirect
	gopkg.in/yaml.v2 v2.4.0 // indirect
	gopkg.in/yaml.v3 v3.0.1 // indirect
	k8s.io/apiextensions-apiserver v0.0.0 // indirect
	k8s.io/apiserver v0.28.5 // indirect
	k8s.io/client-go v1.5.2 // indirect
	k8s.io/cloud-provider v0.28.5 // indirect
	k8s.io/component-helpers v0.28.5 // indirect
	k8s.io/controller-manager v0.28.5 // indirect
	k8s.io/csi-translation-lib v0.19.9 // indirect
	k8s.io/dynamic-resource-allocation v0.0.0 // indirect
	k8s.io/kms v0.28.5 // indirect
	k8s.io/kube-openapi v0.0.0-20230717233707-2695361300d9 // indirect
	k8s.io/kube-scheduler v0.19.9 // indirect
	k8s.io/kubelet v0.28.5 // indirect
	k8s.io/mount-utils v0.0.0 // indirect
	k8s.io/utils v0.0.0-20230406110748-d93618cff8a2 // indirect
	sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.1.2 // indirect
	sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd // indirect
	sigs.k8s.io/structured-merge-diff/v4 v4.2.3 // indirect
	sigs.k8s.io/yaml v1.3.0 // indirect
)

replace (
	k8s.io/api => k8s.io/api v0.28.5
	k8s.io/apiextensions-apiserver => k8s.io/apiextensions-apiserver v0.28.5
	k8s.io/apimachinery => k8s.io/apimachinery v0.28.5
	k8s.io/apiserver => k8s.io/apiserver v0.28.5
	k8s.io/cli-runtime => k8s.io/cli-runtime v0.28.5
	k8s.io/cloud-provider => k8s.io/cloud-provider v0.28.5
	k8s.io/cluster-bootstrap => k8s.io/cluster-bootstrap v0.28.5
	k8s.io/code-generator => k8s.io/code-generator v0.28.5
	k8s.io/component-base => k8s.io/component-base v0.28.5
	k8s.io/cri-api => k8s.io/cri-api v0.28.5
	k8s.io/csi-translation-lib => k8s.io/csi-translation-lib v0.28.5
	k8s.io/kube-aggregator => k8s.io/kube-aggregator v0.28.5
	k8s.io/kube-controller-manager => k8s.io/kube-controller-manager v0.28.5
	k8s.io/kube-proxy => k8s.io/kube-proxy v0.28.5
	k8s.io/kube-scheduler => k8s.io/kube-scheduler v0.28.5
	k8s.io/kubectl => k8s.io/kubectl v0.28.5
	k8s.io/kubelet => k8s.io/kubelet v0.28.5
	k8s.io/legacy-cloud-providers => k8s.io/legacy-cloud-providers v0.28.5
	k8s.io/metrics => k8s.io/metrics v0.28.5
	k8s.io/sample-apiserver => k8s.io/sample-apiserver v0.28.5
)

replace k8s.io/client-go => k8s.io/client-go v0.28.5

replace k8s.io/component-helpers => k8s.io/component-helpers v0.28.5

replace k8s.io/controller-manager => k8s.io/controller-manager v0.28.5

replace k8s.io/dynamic-resource-allocation => k8s.io/dynamic-resource-allocation v0.28.5

replace k8s.io/endpointslice => k8s.io/endpointslice v0.28.5

replace k8s.io/kms => k8s.io/kms v0.28.5

replace k8s.io/mount-utils => k8s.io/mount-utils v0.28.5

replace k8s.io/pod-security-admission => k8s.io/pod-security-admission v0.28.5

replace k8s.io/sample-cli-plugin => k8s.io/sample-cli-plugin v0.28.5

replace k8s.io/sample-controller => k8s.io/sample-controller v0.28.5

1.2 Directory Layout

shell
$ tree
.
├── README.md
├── docker
│   └── Dockerfile
├── go.mod
├── go.sum
├── hack
│   └── go-get-kubernetes.sh
├── main.go
└── pkg
    └── plugins
        └── sample.go

2. Writing the Code

The core mechanism is the WithPlugin method exposed by the scheduler framework: it registers a plugin extension, letting you write your own scheduling logic and add it to the plugin set.
main.go is as follows:

go
/*
Copyright 2014 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package main

import (
    "github.com/huangjc7/scheduler-demo/pkg/plugins"
    "math/rand"
    "os"
    "time"

    "github.com/spf13/pflag"

    cliflag "k8s.io/component-base/cli/flag"
    "k8s.io/component-base/logs"
    _ "k8s.io/component-base/metrics/prometheus/clientgo"
    _ "k8s.io/component-base/metrics/prometheus/version" // for version metric registration
    "k8s.io/kubernetes/cmd/kube-scheduler/app"
)

func main() {
    rand.Seed(time.Now().UnixNano())
    // Core: WithPlugin registers the plugin by name together with its implementation;
    // plugins.New holds the logic. runtime.PluginFactory is a factory callback.
    command := app.NewSchedulerCommand(app.WithPlugin(plugins.Name, plugins.New))
    
    pflag.CommandLine.SetNormalizeFunc(cliflag.WordSepNormalizeFunc)
    logs.InitLogs()
    defer logs.FlushLogs()

    if err := command.Execute(); err != nil {
       os.Exit(1)
    }
}

The core logic

In the scheduling framework, a Pod goes through many stages before it is scheduled,

for example the filter (predicate) stage and the score (node-scoring) stage. See the official docs:
kubernetes.io/docs/concep...

This example adds its logic to the Filter stage: we query Prometheus for each node's idle CPU rate and filter nodes accordingly.

The code file is pkg/plugins/sample.go

go
package plugins

import (
	"context"
	"fmt"
	"github.com/prometheus/client_golang/api"
	promeV1 "github.com/prometheus/client_golang/api/prometheus/v1"
	"github.com/prometheus/common/model"
	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/runtime"
	"k8s.io/klog/v2"
	"k8s.io/kubernetes/pkg/scheduler/framework"
	"time"
)

const (
	Name          = "sample-plugin"         // plugin name
	prometheusURL = "http://localhost:8080" // Prometheus address
)

// Compile-time assertion that Sample implements FilterPlugin,
// since this example adds logic to the Filter stage.
// To extend other stages, implement the corresponding interfaces, e.g.:
//   var _ framework.PreFilterPlugin = &Sample{} // pre-filter stage
//   var _ framework.ScorePlugin = &Sample{}     // scoring stage
var _ framework.FilterPlugin = &Sample{}

type Sample struct {
	handle           framework.Handle // framework handle exposing core runtime helpers
	prometheusClient promeV1.API      // Prometheus client
}

// New constructs the plugin.
func New(object runtime.Object, f framework.Handle) (framework.Plugin, error) {

	// Initialize the Prometheus HTTP client
	client, err := api.NewClient(api.Config{
		Address: prometheusURL,
	})

	if err != nil {
		return nil, fmt.Errorf("creating prometheus client failed: %v", err)
	}

	// Create the Prometheus v1 API client
	promeClient := promeV1.NewAPI(client)

	// Hand the plugin back to the scheduling framework
	return &Sample{
		handle:           f,
		prometheusClient: promeClient,
	}, nil
}

// FilterPlugin requires the Filter() and Name() methods, implemented below.

func (s *Sample) Name() string {
	return Name
}

func (s *Sample) Filter(ctx context.Context, state *framework.CycleState, pod *v1.Pod, nodeInfo *framework.NodeInfo) *framework.Status {
	// Query Prometheus for the node's CPU usage
	cpuUsage, err := s.queryNodeCPUUsage(ctx, nodeInfo.Node().Name)
	if err != nil {
		return framework.NewStatus(framework.Error, err.Error())
	}

	// CPU usage threshold
	const maxAllowedCPUUsage = 0.7 // 70%

	// If the node's CPU usage exceeds the threshold, mark it unschedulable for this Pod
	if cpuUsage > maxAllowedCPUUsage {
		return framework.NewStatus(framework.Unschedulable, fmt.Sprintf("node's CPU usage is too high: %.2f", cpuUsage))
	}

	klog.V(3).Infof("filter pod : %v, node: %v", pod.Name, nodeInfo.Node().Name)
	// Return Success (code 0): nodes that pass Filter move on to the Score stage.
	// For finer-grained control there, additionally implement ScorePlugin
	// (see the sketch after this file).
	return framework.NewStatus(framework.Success)
}

func (s *Sample) queryNodeCPUUsage(ctx context.Context, nodeName string) (float64, error) {
	// Build the Prometheus query
	query := fmt.Sprintf("rate(node_cpu_seconds_total{mode='idle',node='%s'}[1m])", nodeName)
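	// Note (assumption): node_cpu_seconds_total usually returns one series per CPU
	// core, and the 'node' label depends on your relabeling (node_exporter exposes
	// 'instance' by default). A more robust query would aggregate across cores,
	// e.g. avg by (node) (rate(node_cpu_seconds_total{mode='idle',node='...'}[1m])).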

	// Log the PromQL for debugging
	klog.V(3).Infof("Query promQL: %s", query)
	// Run the query
	val, _, err := s.prometheusClient.Query(ctx, query, time.Now())
	if err != nil {
		return 0, fmt.Errorf("querying Prometheus failed: %v", err)
	}

	// Parse the query result
	vec, ok := val.(model.Vector)
	if !ok || len(vec) == 0 {
		return 0, fmt.Errorf("invalid Prometheus response")
	}

	// Idle CPU fraction (taken from the first returned series)
	idleCPU := float64(vec[0].Value)

	// CPU usage = 1 - idle fraction
	cpuUsage := 1 - idleCPU

	return cpuUsage, nil
}
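
As the comment in Filter() suggests, nodes that pass filtering can also be ranked in the Score stage. Here is a minimal sketch (not in the repo) that reuses the Sample struct and queryNodeCPUUsage helper above to favor nodes with more idle CPU; to take effect it would also need to be enabled at the score extension point in the scheduler config.

go
// Minimal sketch, assuming the Sample struct and queryNodeCPUUsage above.
var _ framework.ScorePlugin = &Sample{}

func (s *Sample) Score(ctx context.Context, state *framework.CycleState, pod *v1.Pod, nodeName string) (int64, *framework.Status) {
	cpuUsage, err := s.queryNodeCPUUsage(ctx, nodeName)
	if err != nil {
		return 0, framework.NewStatus(framework.Error, err.Error())
	}
	// More idle CPU => higher score, scaled into [0, framework.MaxNodeScore] (0-100).
	score := int64((1 - cpuUsage) * float64(framework.MaxNodeScore))
	if score < 0 {
		score = 0 // clamp in case reported usage exceeds 100%
	}
	return score, nil
}

// ScoreExtensions can return nil because the score above is already normalized.
func (s *Sample) ScoreExtensions() framework.ScoreExtensions {
	return nil
}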

2.1 Packaging the scheduler as a container

The Dockerfile:

dockerfile
# Build the scheduler binary
FROM golang:1.22 as builder

#RUN apt-get -y update && apt-get -y install upx

WORKDIR /workspace
# Copy the Go Modules manifests
COPY go.mod go.mod
COPY go.sum go.sum

# Copy the go source
COPY main.go main.go
COPY pkg/ pkg/

# Build
ENV CGO_ENABLED=0
ENV GOOS=linux
ENV GOARCH=arm64
ENV GO111MODULE=on
ENV GOPROXY="https://goproxy.cn"

# cache deps before building and copying source so that we don't need to re-download as much
# and so that source changes don't invalidate our downloaded layer
#RUN go mod download && \
#    go build -a -o sample-scheduler main.go && \
#    upx sample-scheduler \

RUN go mod download && \
    go build -a -o sample-scheduler main.go

FROM ubuntu
COPY --from=builder /workspace/sample-scheduler /usr/local/bin/sample-scheduler
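
A hedged build example (the tag kubernetes/samp-scheduler:v1.0.1 is assumed to match the Deployment in section 3.1; note the Dockerfile pins GOARCH=arm64, so change that for amd64 nodes):

shell
docker build -t kubernetes/samp-scheduler:v1.0.1 -f docker/Dockerfile .
# push the image, or preload it on every node so imagePullPolicy: IfNotPresent finds it
docker push kubernetes/samp-scheduler:v1.0.1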

3. Deployment

Running the plugin requires a scheduler configuration file. kube-scheduler's --write-config-to flag writes the effective scheduling configuration to the given file; we use it to dump the scheduler's config, then add our plugin to that config.

Edit /etc/kubernetes/manifests/kube-scheduler.yaml; the changes needed are marked below:

yaml
apiVersion: v1
kind: Pod
metadata:
  creationTimestamp: null
  labels:
    component: kube-scheduler
    tier: control-plane
  name: kube-scheduler
  namespace: kube-system
spec:
  containers:
  - command:
    - kube-scheduler
    - --authentication-kubeconfig=/etc/kubernetes/scheduler.conf
    - --authorization-kubeconfig=/etc/kubernetes/scheduler.conf
    - --bind-address=0.0.0.0
    - --kubeconfig=/etc/kubernetes/scheduler.conf
    # New: write the effective config out to scheduler-config.yaml
    - --write-config-to=/etc/kubernetes/scheduler-config.yaml
    - --leader-elect=true
    - --port=0
    image: registry.aliyuncs.com/k8sxio/kube-scheduler:v1.19.9
    imagePullPolicy: IfNotPresent
    livenessProbe:
      failureThreshold: 8
      httpGet:
        host: 127.0.0.1
        path: /healthz
        port: 10259
        scheme: HTTPS
      initialDelaySeconds: 10
      periodSeconds: 10
      timeoutSeconds: 15
    name: kube-scheduler
    resources:
      requests:
        cpu: 100m
    startupProbe:
      failureThreshold: 24
      httpGet:
        host: 127.0.0.1
        path: /healthz
        port: 10259
        scheme: HTTPS
      initialDelaySeconds: 10
      periodSeconds: 10
      timeoutSeconds: 15
    volumeMounts:
    - mountPath: /etc/kubernetes/scheduler.conf
      name: kubeconfig
      readOnly: true
      # New: mount the dumped config file
    - mountPath: /etc/kubernetes/scheduler-config.yaml
      name: scheduler-config
  hostNetwork: true
  priorityClassName: system-node-critical
  volumes:
  - hostPath:
      path: /etc/kubernetes/scheduler.conf
      type: FileOrCreate
    name: kubeconfig
  # New: hostPath volume so the dumped config lands on the host
  - hostPath:
      path: /etc/kubernetes/scheduler-config.yaml
      type: FileOrCreate
    name: scheduler-config

After saving, the kube-scheduler Pod will start erroring (with --write-config-to it writes the file and exits); revert the changes right away and it recovers. You are left with the /etc/kubernetes/scheduler-config.yaml config file:

yaml
apiVersion: kubescheduler.config.k8s.io/v1
clientConnection:
  acceptContentTypes: ""
  burst: 100
  contentType: application/vnd.kubernetes.protobuf
  kubeconfig: /etc/kubernetes/scheduler.conf
  qps: 50
enableContentionProfiling: true
enableProfiling: true
kind: KubeSchedulerConfiguration
leaderElection:
  leaderElect: true
  leaseDuration: 15s
  renewDeadline: 10s
  resourceLock: leases
  resourceName: sample-scheduler
  resourceNamespace: kube-system
  retryPeriod: 2s
parallelism: 16
percentageOfNodesToScore: 0
podInitialBackoffSeconds: 1
podMaxBackoffSeconds: 10
profiles:
- pluginConfig:
  - args:
      apiVersion: kubescheduler.config.k8s.io/v1
      kind: DefaultPreemptionArgs
      minCandidateNodesAbsolute: 100
      minCandidateNodesPercentage: 10
    name: DefaultPreemption
  - args:
      apiVersion: kubescheduler.config.k8s.io/v1
      hardPodAffinityWeight: 1
      ignorePreferredTermsOfExistingPods: false
      kind: InterPodAffinityArgs
    name: InterPodAffinity
  - args:
      apiVersion: kubescheduler.config.k8s.io/v1
      kind: NodeAffinityArgs
    name: NodeAffinity
  - args:
      apiVersion: kubescheduler.config.k8s.io/v1
      kind: NodeResourcesBalancedAllocationArgs
      resources:
      - name: cpu
        weight: 1
      - name: memory
        weight: 1
    name: NodeResourcesBalancedAllocation
  - args:
      apiVersion: kubescheduler.config.k8s.io/v1
      kind: NodeResourcesFitArgs
      scoringStrategy:
        resources:
        - name: cpu
          weight: 1
        - name: memory
          weight: 1
        type: LeastAllocated
    name: NodeResourcesFit
  - args:
      apiVersion: kubescheduler.config.k8s.io/v1
      defaultingType: System
      kind: PodTopologySpreadArgs
    name: PodTopologySpread
  - args:
      apiVersion: kubescheduler.config.k8s.io/v1
      bindTimeoutSeconds: 600
      kind: VolumeBindingArgs
    name: VolumeBinding
  plugins:
    bind: {}
    filter:
      enabled:
      - name: "sample-plugin"
    multiPoint:
      enabled:
      - name: PrioritySort
        weight: 0
      - name: NodeUnschedulable
        weight: 0
      - name: NodeName
        weight: 0
      - name: TaintToleration
        weight: 3
      - name: NodeAffinity
        weight: 2
      - name: NodePorts
        weight: 0
      - name: NodeResourcesFit
        weight: 1
      - name: VolumeRestrictions
        weight: 0
      - name: EBSLimits
        weight: 0
      - name: GCEPDLimits
        weight: 0
      - name: NodeVolumeLimits
        weight: 0
      - name: AzureDiskLimits
        weight: 0
      - name: VolumeBinding
        weight: 0
      - name: VolumeZone
        weight: 0
      - name: PodTopologySpread
        weight: 2
      - name: InterPodAffinity
        weight: 2
      - name: DefaultPreemption
        weight: 0
      - name: NodeResourcesBalancedAllocation
        weight: 1
      - name: ImageLocality
        weight: 1
      - name: DefaultBinder
        weight: 0
      - name: SchedulingGates
        weight: 0
    permit: {}
    postBind: {}
    postFilter: {}
    preBind: {}
    preEnqueue: {}
    preFilter:
      enabled:
      - name: "sample-plugin"
    preScore: {}
    queueSort: {}
    reserve: {}
    score: {}
  schedulerName: sample-scheduler

3.1 Running the custom scheduler

Run the custom scheduler as a Deployment, mounting the scheduler-config.yaml above into its container. The full manifest:

yaml
apiVersion: v1
kind: ServiceAccount
metadata:
  name: sample-scheduler-sa
  namespace: kube-system
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: sample-scheduler-admin-binding
subjects:
- kind: ServiceAccount
  name: sample-scheduler-sa
  namespace: kube-system
roleRef:
  kind: ClusterRole
  name: cluster-admin
  apiGroup: rbac.authorization.k8s.io
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: scheduler-config
  namespace: kube-system
data:
  ## The scheduler-config.yaml to mount into the scheduler container.
  ## Note: this is not the complete dumped scheduler-config.yaml; only the needed parts are kept.
  scheduler-config.yaml: |
    apiVersion: kubescheduler.config.k8s.io/v1
    kind: KubeSchedulerConfiguration
    leaderElection:
      leaderElect: true
      leaseDuration: 15s
      renewDeadline: 10s
      resourceLock: leases
      resourceName: sample-scheduler # leader-election lock name for the custom scheduler
      resourceNamespace: kube-system
      retryPeriod: 2s
    parallelism: 16
    percentageOfNodesToScore: 0
    podInitialBackoffSeconds: 1
    podMaxBackoffSeconds: 10
    profiles:
    - pluginConfig:
      - args:
          apiVersion: kubescheduler.config.k8s.io/v1
          kind: DefaultPreemptionArgs
          minCandidateNodesAbsolute: 100
          minCandidateNodesPercentage: 10
        name: DefaultPreemption
      - args:
          apiVersion: kubescheduler.config.k8s.io/v1
          hardPodAffinityWeight: 1
          ignorePreferredTermsOfExistingPods: false
          kind: InterPodAffinityArgs
        name: InterPodAffinity
      - args:
          apiVersion: kubescheduler.config.k8s.io/v1
          kind: NodeAffinityArgs
        name: NodeAffinity
      - args:
          apiVersion: kubescheduler.config.k8s.io/v1
          kind: NodeResourcesBalancedAllocationArgs
          resources:
          - name: cpu
            weight: 1
          - name: memory
            weight: 1
        name: NodeResourcesBalancedAllocation
      - args:
          apiVersion: kubescheduler.config.k8s.io/v1
          kind: NodeResourcesFitArgs
          scoringStrategy:
            resources:
            - name: cpu
              weight: 1
            - name: memory
              weight: 1
            type: LeastAllocated
        name: NodeResourcesFit
      - args:
          apiVersion: kubescheduler.config.k8s.io/v1
          defaultingType: System
          kind: PodTopologySpreadArgs
        name: PodTopologySpread
      - args:
          apiVersion: kubescheduler.config.k8s.io/v1
          bindTimeoutSeconds: 600
          kind: VolumeBindingArgs
        name: VolumeBinding
      plugins:
        bind: {}
        filter:
          enabled:
          - name: "sample-plugin" # 在filter阶段使用sample-plugin插件 而不使用默认插件
        multiPoint:
          enabled:
          - name: PrioritySort
            weight: 0
          - name: NodeUnschedulable
            weight: 0
          - name: NodeName
            weight: 0
          - name: TaintToleration
            weight: 3
          - name: NodeAffinity
            weight: 2
          - name: NodePorts
            weight: 0
          - name: NodeResourcesFit
            weight: 1
          - name: VolumeRestrictions
            weight: 0
          - name: EBSLimits
            weight: 0
          - name: GCEPDLimits
            weight: 0
          - name: NodeVolumeLimits
            weight: 0
          - name: AzureDiskLimits
            weight: 0
          - name: VolumeBinding
            weight: 0
          - name: VolumeZone
            weight: 0
          - name: PodTopologySpread
            weight: 2
          - name: InterPodAffinity
            weight: 2
          - name: DefaultPreemption
            weight: 0
          - name: NodeResourcesBalancedAllocation
            weight: 1
          - name: ImageLocality
            weight: 1
          - name: DefaultBinder
            weight: 0
          - name: SchedulingGates
            weight: 0
        permit: {}
        postBind: {}
        postFilter: {}
        preBind: {}
        preEnqueue: {}
        preFilter:
          enabled:
          - name: "sample-plugin"
        preScore: {}
        queueSort: {}
        reserve: {}
        score: {}
      schedulerName: sample-scheduler # the scheduler name Pods must reference
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: sample-scheduler
  namespace: kube-system
  labels:
    component: sample-scheduler
spec:
  selector:
    matchLabels:
      component: sample-scheduler
  template:
    metadata:
      labels:
        component: sample-scheduler
    spec:
      serviceAccountName: sample-scheduler-sa
      priorityClassName: system-cluster-critical
      volumes:
        - name: scheduler-config
          configMap:
            name: scheduler-config
      containers:
        - name: scheduler
          image: kubernetes/samp-scheduler:v1.0.1 # the scheduler image built above
          imagePullPolicy: IfNotPresent
          command:
            - sample-scheduler
            - --config=/etc/kubernetes/scheduler-config.yaml # point at the mounted config file
            - --v=3
          volumeMounts:
            - name: scheduler-config
              mountPath: /etc/kubernetes
#          livenessProbe:
#            httpGet:
#              path: /healthz
#              port: 10251
#            initialDelaySeconds: 15
#          readinessProbe:
#            httpGet:
#              path: /healthz
#              port: 10251
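
Apply the manifest and confirm the scheduler is up. A hedged example (the manifest filename sample-scheduler.yaml is assumed):

shell
kubectl apply -f sample-scheduler.yaml
kubectl -n kube-system get pods -l component=sample-scheduler
# follow the scheduler logs (the Deployment runs with --v=3)
kubectl -n kube-system logs -f deploy/sample-scheduler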

4. Testing

The cluster currently has two schedulable nodes, node02 and node03 (node01 is the master).

Saturate node03's CPU with the script below:

shell
#!/bin/bash
# Spawns enough background busy loops to drive CPU usage to 100%

# Number of CPU cores
CORES=$(nproc)

# Start one background busy loop per core
for i in $(seq 1 $CORES); do
  # An infinite loop of compute-bound no-ops
  while : ; do : ; done &
done

# Wait for the background jobs (they never finish on their own)
wait

Deploy an nginx that requests the custom scheduler:

yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: test-scheduler
spec:
  selector:
    matchLabels:
      app: test-scheduler
  template:
    metadata:
      labels:
        app: test-scheduler
    spec:
      schedulerName: sample-scheduler # request the custom scheduler
      containers:
        - image: nginx
          imagePullPolicy: IfNotPresent
          name: nginx
          ports:
            - containerPort: 80
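
With node03 saturated, the replicas should all land on node02. Verify with:

shell
kubectl get pods -l app=test-scheduler -o wide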

The custom scheduler logs look like this:

shell
I0303 14:07:59.523243       1 eventhandlers.go:197] "Add event for scheduled pod" pod="default/test-scheduler-668f68df99-82tjw"
I0303 14:07:59.523235       1 eventhandlers.go:171] "Delete event for unscheduled pod" pod="default/test-scheduler-668f68df99-82tjw"
I0303 14:07:59.718833       1 eventhandlers.go:244] "Delete event for scheduled pod" pod="default/test-scheduler-668f68df99-7psgg"
I0303 14:08:03.121363       1 eventhandlers.go:126] "Add event for unscheduled pod" pod="default/test-scheduler-668f68df99-btsmt"
I0303 14:08:03.121403       1 schedule_one.go:93] "Attempting to schedule pod" pod="default/test-scheduler-668f68df99-btsmt"
I0303 14:08:03.121476       1 sample.go:117] prefilter pod: test-scheduler-668f68df99-btsmt
I0303 14:08:03.121518       1 log.go:245] Query promQL rate(node_cpu_seconds_total{mode='idle',node='node02'}[1m])
I0303 14:08:03.121685       1 log.go:245] Query promQL rate(node_cpu_seconds_total{mode='idle',node='node03'}[1m])
I0303 14:08:03.124109       1 sample.go:150] filter pod : test-scheduler-668f68df99-btsmt, node: node02, pre state: &{{0 0 0 0 map[]}}
I0303 14:08:03.124193       1 default_binder.go:53] "Attempting to bind pod to node" pod="default/test-scheduler-668f68df99-btsmt" node="node02"
I0303 14:08:03.127611       1 eventhandlers.go:197] "Add event for scheduled pod" pod="default/test-scheduler-668f68df99-btsmt"
I0303 14:08:03.127612       1 eventhandlers.go:171] "Delete event for unscheduled pod" pod="default/test-scheduler-668f68df99-btsmt"
I0303 14:08:03.129072       1 schedule_one.go:286] "Successfully bound pod to node" pod="default/test-scheduler-668f68df99-btsmt" node="node02" evaluatedNodes=3 feasibleNodes=1

5. Pitfalls

Because of how Kubernetes historically manages its modules (the k8s.io/kubernetes staging repos are pinned to v0.0.0 placeholders), a plain go get k8s.io/kubernetes@v1.28.5 will fail to resolve. Upstream provides a script that pins every staging module to the matching version via replace directives in go.mod.

Script: github.com/huangjc7/sc...

Usage: bash hack/go-get-kubernetes.sh v1.28.5
