Metrics Server 是 Kubernetes 集群的核心组件之一,用于聚合集群中节点和 Pod 的资源使用数据(如 CPU、内存),并通过 Metrics API 提供给 Horizontal Pod Autoscaler (HPA) 或 kubectl top 等工具使用。它轻量、高效,通常用于监控和自动扩缩容场景。本教程将教你如何完成 Metrics Server 的完整配置与安装!
1. 删除现有部署(如有)
bash
# Remove an existing Metrics Server that was installed from the upstream manifest.
# --ignore-not-found keeps the command from failing when nothing is installed.
kubectl delete -f https://github.com/kubernetes-sigs/metrics-server/releases/latest/download/components.yaml --ignore-not-found=true
# Alternatively, delete every related resource by name.
# The list mirrors the objects created by the manifest in step 2, including the
# ServiceAccount and the system:metrics-server ClusterRole.
kubectl delete deployment metrics-server -n kube-system --ignore-not-found=true
kubectl delete service metrics-server -n kube-system --ignore-not-found=true
kubectl delete serviceaccount metrics-server -n kube-system --ignore-not-found=true
kubectl delete apiservice v1beta1.metrics.k8s.io --ignore-not-found=true
kubectl delete clusterrole system:aggregated-metrics-reader --ignore-not-found=true
kubectl delete clusterrole system:metrics-server --ignore-not-found=true
kubectl delete clusterrolebinding metrics-server:system:auth-delegator --ignore-not-found=true
kubectl delete rolebinding metrics-server-auth-reader -n kube-system --ignore-not-found=true
kubectl delete clusterrolebinding metrics-server:system:metrics-server --ignore-not-found=true
2. 创建完整配置文件
创建 metrics-server.yaml 文件:
bash
# Write the complete Metrics Server manifest to metrics-server.yaml, the file
# name used by every later `kubectl apply` / `sed` step in this tutorial.
# The heredoc delimiter is quoted ('EOF') so the shell performs no expansion
# inside the YAML.
cat > metrics-server.yaml << 'EOF'
apiVersion: v1
kind: ServiceAccount
metadata:
  labels:
    k8s-app: metrics-server
  name: metrics-server
  namespace: kube-system
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  labels:
    k8s-app: metrics-server
    rbac.authorization.k8s.io/aggregate-to-admin: "true"
    rbac.authorization.k8s.io/aggregate-to-edit: "true"
    rbac.authorization.k8s.io/aggregate-to-view: "true"
  name: system:aggregated-metrics-reader
rules:
- apiGroups:
  - metrics.k8s.io
  resources:
  - pods
  - nodes
  verbs:
  - get
  - list
  - watch
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  labels:
    k8s-app: metrics-server
  name: system:metrics-server
rules:
- apiGroups:
  - ""
  resources:
  - nodes/metrics
  verbs:
  - get
- apiGroups:
  - ""
  resources:
  - pods
  - nodes
  verbs:
  - get
  - list
  - watch
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  labels:
    k8s-app: metrics-server
  name: metrics-server-auth-reader
  namespace: kube-system
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: Role
  name: extension-apiserver-authentication-reader
subjects:
- kind: ServiceAccount
  name: metrics-server
  namespace: kube-system
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  labels:
    k8s-app: metrics-server
  name: metrics-server:system:auth-delegator
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: system:auth-delegator
subjects:
- kind: ServiceAccount
  name: metrics-server
  namespace: kube-system
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  labels:
    k8s-app: metrics-server
  name: metrics-server:system:metrics-server
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: system:metrics-server
subjects:
- kind: ServiceAccount
  name: metrics-server
  namespace: kube-system
---
apiVersion: v1
kind: Service
metadata:
  labels:
    k8s-app: metrics-server
  name: metrics-server
  namespace: kube-system
spec:
  ports:
  - name: https
    port: 443
    protocol: TCP
    targetPort: https
  selector:
    k8s-app: metrics-server
---
apiVersion: apps/v1
kind: Deployment
metadata:
  labels:
    k8s-app: metrics-server
  name: metrics-server
  namespace: kube-system
spec:
  selector:
    matchLabels:
      k8s-app: metrics-server
  strategy:
    rollingUpdate:
      maxUnavailable: 0
  template:
    metadata:
      labels:
        k8s-app: metrics-server
    spec:
      containers:
      - args:
        - --cert-dir=/tmp
        - --secure-port=4443
        - --kubelet-preferred-address-types=InternalIP,ExternalIP,Hostname
        - --kubelet-use-node-status-port
        # Skips verification of the kubelet's TLS certificate.
        - --kubelet-insecure-tls
        image: registry.cn-hangzhou.aliyuncs.com/google_containers/metrics-server:v0.7.1
        imagePullPolicy: IfNotPresent
        livenessProbe:
          failureThreshold: 3
          httpGet:
            path: /livez
            port: https
            scheme: HTTPS
          periodSeconds: 10
        name: metrics-server
        ports:
        # Must match --secure-port above; the Service targets it by name.
        - containerPort: 4443
          name: https
          protocol: TCP
        readinessProbe:
          failureThreshold: 3
          httpGet:
            path: /readyz
            port: https
            scheme: HTTPS
          initialDelaySeconds: 20
          periodSeconds: 10
        resources:
          requests:
            cpu: 100m
            memory: 200Mi
        securityContext:
          allowPrivilegeEscalation: false
          readOnlyRootFilesystem: true
          runAsNonRoot: true
          runAsUser: 1000
        volumeMounts:
        # Writable scratch space for --cert-dir, since the root FS is read-only.
        - mountPath: /tmp
          name: tmp-dir
      nodeSelector:
        kubernetes.io/os: linux
      priorityClassName: system-cluster-critical
      serviceAccountName: metrics-server
      volumes:
      - emptyDir: {}
        name: tmp-dir
---
apiVersion: apiregistration.k8s.io/v1
kind: APIService
metadata:
  labels:
    k8s-app: metrics-server
  name: v1beta1.metrics.k8s.io
spec:
  group: metrics.k8s.io
  groupPriorityMinimum: 100
  insecureSkipTLSVerify: true
  service:
    name: metrics-server
    namespace: kube-system
  version: v1beta1
  versionPriority: 100
EOF
3. 应用配置
bash
# Create or update every object described in the manifest
kubectl apply --filename metrics-server.yaml
4. 等待部署完成
bash
# Block until the Deployment's rollout has completed, or give up after 180s.
# `rollout status` is used instead of `kubectl wait ... pod -l ...` because the
# latter fails immediately with "no matching resources found" when it runs
# before the Pod object has been created.
kubectl rollout status deployment/metrics-server -n kube-system --timeout=180s
# Alternatively, watch the Pods live (-w keeps streaming until interrupted)
kubectl get pods -n kube-system -l k8s-app=metrics-server -w
5. 验证安装
bash
# List the Metrics Server Pod(s) together with their current status
kubectl --namespace kube-system get pods --selector k8s-app=metrics-server
# Fetch node-level resource usage
kubectl top nodes
# Fetch Pod-level resource usage across all namespaces
kubectl top pods --all-namespaces
# Show the registration status of the metrics API service
kubectl get apiservice v1beta1.metrics.k8s.io
6. 故障排除
如果出现镜像拉取问题
选项一:使用国内镜像源
bash
# Point the metrics-server image at the Aliyun mirror while keeping the tag.
# The pattern accepts any registry prefix (k8s.gcr.io, registry.k8s.io, ...)
# rather than one hard-coded registry and version, so it also works on manifests
# that reference a different upstream registry or release. It is a no-op when
# the image already uses the mirror.
sed -i -E 's|^([[:space:]]*(- )?image:[[:space:]]*)[^[:space:]]*/metrics-server:|\1registry.cn-hangzhou.aliyuncs.com/google_containers/metrics-server:|' metrics-server.yaml
# Re-apply the updated manifest
kubectl apply -f metrics-server.yaml
选项二:使用官方最新版本
bash
# Download the manifest of the latest upstream release.
# -f makes curl exit non-zero on an HTTP error instead of saving the error page
# as components.yaml; -L follows the release redirect; -O keeps the remote name.
curl -fLO https://github.com/kubernetes-sigs/metrics-server/releases/latest/download/components.yaml
# Apply it
kubectl apply -f components.yaml
检查详细日志
bash
# Show detailed Pod information (events, container states, probe results)
kubectl describe pod -n kube-system -l k8s-app=metrics-server
# Show recent logs. When a label selector is used, kubectl prints only the
# last 10 lines per Pod by default, so request a larger tail explicitly.
kubectl logs -n kube-system -l k8s-app=metrics-server --tail=200
重启部署
bash
# Trigger a rolling restart of the Metrics Server Deployment
kubectl --namespace kube-system rollout restart deployment metrics-server
7. 配置说明
这个完整配置包含以下关键特性:
- 安全配置:使用非 root 用户运行,只读根文件系统
- 资源请求:CPU 100m,内存 200Mi(仅设置了 requests,未设置 limits)
- 健康检查:就绪性和存活探针
- TLS 配置:跳过 Kubelet TLS 验证(--kubelet-insecure-tls)
- 高可用:滚动更新策略
- 优先级:系统集群关键优先级
8. 验证 HPA 功能
安装完成后,可以测试 HPA:
bash
# Create a throwaway Deployment to exercise the HPA
kubectl create deployment hpa-test --image=nginx:alpine
# Give the container a CPU request. A --cpu-percent target is a percentage of
# the requested CPU, so without a request the HPA reports <unknown> and never
# scales.
kubectl set resources deployment hpa-test --requests=cpu=100m
# Create the HPA: target 50% CPU utilisation, between 1 and 3 replicas
kubectl autoscale deployment hpa-test --cpu-percent=50 --min=1 --max=3
# Watch the HPA (-w keeps streaming until interrupted)
kubectl get hpa hpa-test -w
这个完整配置应该能够解决大多数 Metrics Server 的安装问题。