k8s Prometheus

一、部署 Prometheus

shell 复制代码
kubectl create ns kube-ops
yaml 复制代码
# prometheus-cm.yaml
# Initial Prometheus configuration: the server scrapes only its own
# /metrics endpoint on localhost:9090.
apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-config
  namespace: kube-ops
data:
  prometheus.yml: |
    global:
      # How often Prometheus scrapes targets (built-in default: 1m).
      scrape_interval: 15s
      # Per-scrape timeout (built-in default: 10s); must not exceed scrape_interval.
      scrape_timeout: 15s
    scrape_configs:
    - job_name: 'prometheus'
      static_configs:
      - targets: ['localhost:9090']
yaml 复制代码
# prometheus-pvc.yaml
# Claim named "prometheus" in kube-ops: requests 10Gi from the nfs-storage
# class with ReadWriteMany access.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: prometheus
  namespace: kube-ops
spec:
  storageClassName: nfs-storage
  accessModes: [ReadWriteMany]
  resources:
    requests:
      storage: 10Gi
yaml 复制代码
# prometheus-pv.yaml
# Statically defined NFS-backed volume; storage class, capacity and access
# mode mirror the "prometheus" claim.
apiVersion: v1
kind: PersistentVolume
metadata:
  name: prometheus
spec:
  storageClassName: nfs-storage
  capacity:
    storage: 10Gi
  accessModes: [ReadWriteMany]
  # NOTE(review): the Recycle policy is deprecated upstream and scrubs the
  # volume when the claim is released; confirm that is intended for metrics data.
  persistentVolumeReclaimPolicy: Recycle
  nfs:
    server: 10.0.17.100  # NFS server address
    path: /nfsdata/prometheus  # exported directory on that server
yaml 复制代码
# prometheus-rbac.yaml
# ServiceAccount referenced by the Prometheus Deployment (serviceAccountName).
apiVersion: v1
kind: ServiceAccount
metadata:
  namespace: kube-ops
  name: prometheus
---
# Cluster-wide, read-only permissions for Prometheus.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: prometheus
rules:
  # Core-group objects that may be read, listed and watched.
  - apiGroups: [""]
    resources: ["nodes", "services", "endpoints", "pods", "nodes/proxy"]
    verbs: ["get", "list", "watch"]
  # Objects that may only be fetched individually.
  - apiGroups: [""]
    resources: ["configmaps", "nodes/metrics"]
    verbs: ["get"]
  # Non-resource endpoint: GET on /metrics.
  - nonResourceURLs: ["/metrics"]
    verbs: ["get"]
---
# Binds the "prometheus" ClusterRole to the "prometheus" ServiceAccount in
# kube-ops. A binding without `subjects` is accepted by the API server but
# grants the role to nobody, so the subject list is required for the
# Kubernetes service discovery and API-proxied scrapes to be authorised.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: prometheus
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: prometheus
subjects:
  - kind: ServiceAccount
    name: prometheus
    namespace: kube-ops
yaml 复制代码
# prometheus-deploy.yaml
# Prometheus server Deployment: configuration comes from the
# "prometheus-config" ConfigMap, TSDB data lives on the "prometheus" PVC.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: prometheus
  namespace: kube-ops
  labels:
    app: prometheus
spec:
  selector:
    matchLabels:
      app: prometheus
  template:
    metadata:
      labels:
        app: prometheus
    spec:
      serviceAccountName: prometheus
      # Pod-level security context: containers run as UID 0.
      securityContext:
        runAsUser: 0
      volumes:
        - name: data
          persistentVolumeClaim:
            claimName: prometheus
        - name: config-volume
          configMap:
            name: prometheus-config
      containers:
        - name: prometheus
          image: prom/prometheus:v2.4.3
          command:
            - '/bin/prometheus'
          args:
            - '--config.file=/etc/prometheus/prometheus.yml'
            - '--storage.tsdb.path=/prometheus'
            - '--storage.tsdb.retention=24h'
            # Enables the admin HTTP API (includes deleting time series).
            - '--web.enable-admin-api'
            # Enables hot reload: POST to /-/reload applies the new config.
            - '--web.enable-lifecycle'
          ports:
            - name: http
              containerPort: 9090
              protocol: TCP
          volumeMounts:
            # TSDB directory, stored under the "prometheus" sub-path of the PVC.
            - name: data
              mountPath: '/prometheus'
              subPath: prometheus
            # Rendered ConfigMap (prometheus.yml).
            - name: config-volume
              mountPath: '/etc/prometheus'
          resources:
            requests:
              cpu: 100m
              memory: 512Mi
            limits:
              cpu: 100m
              memory: 512Mi
yaml 复制代码
# prometheus-svc.yaml
# NodePort Service in front of the pods labelled app=prometheus.
apiVersion: v1
kind: Service
metadata:
  name: prometheus
  namespace: kube-ops
  labels:
    app: prometheus
spec:
  type: NodePort
  selector:
    app: prometheus
  ports:
    - name: web
      port: 9090
      targetPort: http  # resolves to the container port named "http"
shell 复制代码
[root@master 01_prometheus]# kubectl get -n kube-ops svc
NAME         TYPE       CLUSTER-IP    EXTERNAL-IP   PORT(S)          AGE
prometheus   NodePort   10.7.124.89   <none>        9090:32010/TCP   30s

二、监控 ingress-nginx

Prometheus 的数据指标是通过一个公开的 HTTP(S) 数据接口获取到的,我们不需要单独安装监控的

agent,只需要暴露一个 metrics 接口,Prometheus 就会定期去拉取数据;对于一些普通的 HTTP 服

务,我们完全可以直接重用这个服务,添加一个 /metrics 接口暴露给 Prometheus

yaml 复制代码
# prome-cm.yaml
# Replaces the Prometheus configuration. Scrape jobs:
#   prometheus      - the server itself (localhost:9090)
#   ingressnginx12  - static target 10.0.17.101:10254
#   ingressnginx13  - static target 10.0.17.102:10254
apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-config
  namespace: kube-ops
data:
  prometheus.yml: |
    global:
      scrape_interval: 30s
      scrape_timeout: 30s
    scrape_configs:
    - job_name: 'prometheus'
      static_configs:
        - targets: ['localhost:9090']
    - job_name: 'ingressnginx12'
      static_configs:
        - targets: ['10.0.17.101:10254']
    - job_name: 'ingressnginx13'
      static_configs:
        - targets: ['10.0.17.102:10254']
shell 复制代码
# 重载 prometheus
[root@master 02_ingress-nginx]# kubectl get svc -n kube-ops 
NAME         TYPE       CLUSTER-IP    EXTERNAL-IP   PORT(S)          AGE
prometheus   NodePort   10.7.124.89   <none>        9090:32010/TCP   10m
[root@master 02_ingress-nginx]# curl -X POST "http://10.0.17.100:32010/-/reload"
# 查看 http://10.0.17.100:32010/targets

# 确认是否捕获数据
PromQL:
sum(nginx_ingress_controller_nginx_process_requests_total{controller_class="k8s.io/ingress-nginx",controller_namespace="ingress-nginx"})

三、创建 node-exporter,监控节点资源

node_exporter 用于采集服务器节点的各种运行指标,目前 node_exporter 支持几乎所有常见

的监控点,比如 conntrack,cpu,diskstats,filesystem,loadavg,meminfo,netstat 等,详细的

监控点列表可以参考其 Github repo

yaml 复制代码
# prome-node-exporter.yaml
# Runs node_exporter as a DaemonSet. The pod shares the host PID, IPC and
# network namespaces and mounts the host's /proc, /sys, /dev and / so the
# exporter can read node-level metrics.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: node-exporter
  namespace: kube-ops
  labels:
    name: node-exporter
spec:
  selector:
    matchLabels:
      name: node-exporter
  template:
    metadata:
      labels:
        name: node-exporter
    spec:
      hostPID: true
      hostIPC: true
      # With host networking the container port 9100 is bound on the node itself.
      hostNetwork: true
      containers:
        - name: node-exporter
          image: prom/node-exporter:v0.16.0
          ports:
            - containerPort: 9100
          resources:
            requests:
              cpu: 0.15
          securityContext:
            privileged: true
          args:
            - --path.procfs
            - /host/proc
            - --path.sysfs
            - /host/sys
            - --collector.filesystem.ignored-mount-points
            # Container args are handed to the process verbatim (no shell), so
            # the regex must not carry its own double quotes: with them the
            # pattern starts with a literal '"' and never matches a mount point.
            - '^/(sys|proc|dev|host|etc)($|/)'
          volumeMounts:
            - name: dev
              mountPath: /host/dev
            - name: proc
              mountPath: /host/proc
            - name: sys
              mountPath: /host/sys
            - name: rootfs
              mountPath: /rootfs
      # Tolerate the control-plane NoSchedule taint so those nodes get a pod too.
      tolerations:
        - key: "node-role.kubernetes.io/control-plane"
          operator: "Exists"
          effect: "NoSchedule"
      volumes:
        - name: proc
          hostPath:
            path: /proc
        - name: dev
          hostPath:
            path: /dev
        - name: sys
          hostPath:
            path: /sys
        - name: rootfs
          hostPath:
            path: /

在 Kubernetes 下,prometheus 通过与 Kubernetes API 集成,目前主要支持 5 种服务发现模式,分别是:Node、Service、Pod、Endpoints、Ingress。通过指定 kubernetes_sd_configs 的模式为 node ,Prometheus 就会自动从 Kubernetes 中发现所

有的 node 节点并作为当前 job 监控的目标实例,发现的节点 /metrics 接口是默认的 kubelet 的HTTP 接口

prometheus 去发现 Node 模式的服务的时候,访问的端口默认是10250,而现在该端口下面已经没有了 /metrics 指标数据了,现在 kubelet 只读的数据接口统一通过10255端口进行暴露了,所以我们应该去替换掉这里的端口,但是我们是要替换成10255端口吗?不是的,因为我们是要去配置上面通过node-exporter 抓取到的节点指标数据,而我们上面是不是指定了 hostNetwork=true ,所以在每个节点上就会绑定一个端口9100,所以我们应该将这里的10250替换成9100

这里我们就需要使用到 Prometheus 提供的 relabel_configs 中的 replace 能力了,relabel 可以在Prometheus 采集数据之前,通过Target 实例的 Metadata 信息,动态重新写入 Label 的值。除此之

外,我们还能根据 Target 实例的 Metadata 信息选择是否采集或者忽略该 Target 实例

添加了一个 action 为 labelmap ,正则表达式是 __meta_kubernetes_node_label_(.+) 的配置,这里的意思就是把表达式中匹配到的数据也添加到指标数据的 Label 标签中去。

对于 kubernetes_sd_configs 下面可用的标签如下: 可用元标签:

  • __meta_kubernetes_node_name:节点对象的名称
  • __meta_kubernetes_node_label_<labelname>:节点对象中的每个标签
  • __meta_kubernetes_node_annotation_<annotationname>:来自节点对象的每个注释
  • __meta_kubernetes_node_address_<address_type>:每个节点地址类型的第一个地址(如果存在)
    关于 kubernetes_sd_configs 更多信息可以查看官方文档:kubernetes_sd_config
    Kubernetes 1.11+ 版本以后,kubelet 就移除了 10255 端口, metrics 接口又回到了 10250 端口,所以这里不需要替换端口,但是需要使用 https 的协议
yaml 复制代码
# prome-cm.yaml
# Updated Prometheus configuration. On top of the static jobs it contains:
#   kubernetes-nodes   - node discovery; the :10250 port in __address__ is
#                        rewritten to :9100 and node labels are copied onto
#                        the target via labelmap
#   kubernetes-kubelet - node discovery scraped over HTTPS with the
#                        ServiceAccount CA file and bearer token
#                        (certificate verification is skipped)
apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-config
  namespace: kube-ops
data:
  prometheus.yml: |
    global:
      scrape_interval: 30s
      scrape_timeout: 30s
    scrape_configs:
    - job_name: 'prometheus'
      static_configs:
        - targets: ['localhost:9090']
    - job_name: 'ingressnginx12'
      static_configs:
        - targets: ['10.0.17.101:10254']
    - job_name: 'ingressnginx13'
      static_configs:
        - targets: ['10.0.17.102:10254']
    - job_name: 'kubernetes-nodes'
      kubernetes_sd_configs:
      - role: node
      relabel_configs:
      - source_labels: [__address__]
        regex: '(.*):10250'
        replacement: '${1}:9100'
        target_label: __address__
        action: replace
      - action: labelmap
        regex: __meta_kubernetes_node_label_(.+)
    - job_name: 'kubernetes-kubelet'
      kubernetes_sd_configs:
      - role: node
      scheme: https
      tls_config:
        ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
        insecure_skip_verify: true
      bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
      relabel_configs:
      - action: labelmap
        regex: __meta_kubernetes_node_label_(.+)
shell 复制代码
[root@master 02_ingress-nginx]# curl -X POST "http://10.0.17.100:32010/-/reload"
# 查看 http://10.0.17.100:32010/targets

四、监控容器资源指标

说到容器监控我们自然会想到 cAdvisor ,我们前面也说过 cAdvisor 已经内置在了 kubelet 组件之中,所以我们不需要单独去安装, cAdvisor 的数据路径为 /api/v1/nodes/<node>/proxy/metrics/cadvisor ,同样我们这里使用 node 的服务发现模式,因为每一个节点下面都有 kubelet

yaml 复制代码
# prome-cm.yaml
# Updated Prometheus configuration. Besides the static, kubernetes-nodes and
# kubernetes-kubelet jobs it contains:
#   kubernetes-cadvisor - node discovery; every target address is replaced by
#                         kubernetes.default.svc:443 and the metrics path by
#                         /api/v1/nodes/<node name>/proxy/metrics/cadvisor
apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-config
  namespace: kube-ops
data:
  prometheus.yml: |
    global:
      scrape_interval: 30s
      scrape_timeout: 30s
    scrape_configs:
    - job_name: 'prometheus'
      static_configs:
        - targets: ['localhost:9090']
    - job_name: 'ingressnginx12'
      static_configs:
        - targets: ['10.0.17.101:10254']
    - job_name: 'ingressnginx13'
      static_configs:
        - targets: ['10.0.17.102:10254']
    - job_name: 'kubernetes-nodes'
      kubernetes_sd_configs:
      - role: node
      relabel_configs:
      - source_labels: [__address__]
        regex: '(.*):10250'
        replacement: '${1}:9100'
        target_label: __address__
        action: replace
      - action: labelmap
        regex: __meta_kubernetes_node_label_(.+)
    - job_name: 'kubernetes-kubelet'
      kubernetes_sd_configs:
      - role: node
      scheme: https
      tls_config:
        ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
        insecure_skip_verify: true
      bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
      relabel_configs:
      - action: labelmap
        regex: __meta_kubernetes_node_label_(.+)
    - job_name: 'kubernetes-cadvisor'
      kubernetes_sd_configs:
      - role: node
      scheme: https
      tls_config:
        ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
      bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
      relabel_configs:
      - action: labelmap
        regex: __meta_kubernetes_node_label_(.+)
      - target_label: __address__
        replacement: kubernetes.default.svc:443
      - source_labels: [__meta_kubernetes_node_name]
        regex: (.+)
        target_label: __metrics_path__
        replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor
shell 复制代码
[root@master 02_ingress-nginx]# curl -X POST "http://10.0.17.100:32010/-/reload"
# 查看 http://10.0.17.100:32010/targets

五、监控 ApiServer 指标

yaml 复制代码
# prome-cm.yaml
# Updated Prometheus configuration. Besides the static, kubernetes-nodes,
# kubernetes-kubelet and kubernetes-cadvisor jobs it contains:
#   kubernetes-apiservers - endpoints discovery over HTTPS, keeping only the
#                           target whose namespace;service;port-name is
#                           default;kubernetes;https
apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-config
  namespace: kube-ops
data:
  prometheus.yml: |
    global:
      scrape_interval: 30s
      scrape_timeout: 30s
    scrape_configs:
    - job_name: 'prometheus'
      static_configs:
        - targets: ['localhost:9090']
    - job_name: 'ingressnginx12'
      static_configs:
        - targets: ['10.0.17.101:10254']
    - job_name: 'ingressnginx13'
      static_configs:
        - targets: ['10.0.17.102:10254']
    - job_name: 'kubernetes-nodes'
      kubernetes_sd_configs:
      - role: node
      relabel_configs:
      - source_labels: [__address__]
        regex: '(.*):10250'
        replacement: '${1}:9100'
        target_label: __address__
        action: replace
      - action: labelmap
        regex: __meta_kubernetes_node_label_(.+)
    - job_name: 'kubernetes-kubelet'
      kubernetes_sd_configs:
      - role: node
      scheme: https
      tls_config:
        ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
        insecure_skip_verify: true
      bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
      relabel_configs:
      - action: labelmap
        regex: __meta_kubernetes_node_label_(.+)
    - job_name: 'kubernetes-cadvisor'
      kubernetes_sd_configs:
      - role: node
      scheme: https
      tls_config:
        ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
      bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
      relabel_configs:
      - action: labelmap
        regex: __meta_kubernetes_node_label_(.+)
      - target_label: __address__
        replacement: kubernetes.default.svc:443
      - source_labels: [__meta_kubernetes_node_name]
        regex: (.+)
        target_label: __metrics_path__
        replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor
    - job_name: 'kubernetes-apiservers'
      kubernetes_sd_configs:
      - role: endpoints
      scheme: https
      tls_config:
        ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
      bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
      relabel_configs:
      - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name,__meta_kubernetes_endpoint_port_name]
        action: keep
        regex: default;kubernetes;https
shell 复制代码
[root@master 02_ingress-nginx]# curl -X POST "http://10.0.17.100:32010/-/reload"
# 查看 http://10.0.17.100:32010/targets

六、通过 Service 监控服务

yaml 复制代码
# prome-cm.yaml
# Updated Prometheus configuration. Besides the static, kubernetes-nodes,
# kubernetes-kubelet, kubernetes-cadvisor and kubernetes-apiservers jobs it
# contains:
#   kubernetes-service-endpoints - endpoints discovery that keeps only
#       Services annotated prometheus.io/scrape=true; the prometheus.io/scheme,
#       prometheus.io/path and prometheus.io/port annotations override the
#       scrape scheme, metrics path and target port, and Service labels,
#       namespace and name are copied onto the target.
apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-config
  namespace: kube-ops
data:
  prometheus.yml: |
    global:
      scrape_interval: 30s
      scrape_timeout: 30s
    scrape_configs:
    - job_name: 'prometheus'
      static_configs:
        - targets: ['localhost:9090']
    - job_name: 'ingressnginx12'
      static_configs:
        - targets: ['10.0.17.101:10254']
    - job_name: 'ingressnginx13'
      static_configs:
        - targets: ['10.0.17.102:10254']
    - job_name: 'kubernetes-nodes'
      kubernetes_sd_configs:
      - role: node
      relabel_configs:
      - source_labels: [__address__]
        regex: '(.*):10250'
        replacement: '${1}:9100'
        target_label: __address__
        action: replace
      - action: labelmap
        regex: __meta_kubernetes_node_label_(.+)
    - job_name: 'kubernetes-kubelet'
      kubernetes_sd_configs:
      - role: node
      scheme: https
      tls_config:
        ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
        insecure_skip_verify: true
      bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
      relabel_configs:
      - action: labelmap
        regex: __meta_kubernetes_node_label_(.+)
    - job_name: 'kubernetes-cadvisor'
      kubernetes_sd_configs:
      - role: node
      scheme: https
      tls_config:
        ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
      bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
      relabel_configs:
      - action: labelmap
        regex: __meta_kubernetes_node_label_(.+)
      - target_label: __address__
        replacement: kubernetes.default.svc:443
      - source_labels: [__meta_kubernetes_node_name]
        regex: (.+)
        target_label: __metrics_path__
        replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor
    - job_name: 'kubernetes-apiservers'
      kubernetes_sd_configs:
      - role: endpoints
      scheme: https
      tls_config:
        ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
      bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
      relabel_configs:
      - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name,__meta_kubernetes_endpoint_port_name]
        action: keep
        regex: default;kubernetes;https
    - job_name: 'kubernetes-service-endpoints'
      kubernetes_sd_configs:
      - role: endpoints
      relabel_configs:
      - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
        action: keep
        regex: true
      - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme]
        action: replace
        target_label: __scheme__
        regex: (https?)
      - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
        action: replace
        target_label: __metrics_path__
        regex: (.+)
      - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port]
        action: replace
        target_label: __address__
        regex: ([^:]+)(?::\d+)?;(\d+)
        replacement: $1:$2
      - action: labelmap
        regex: __meta_kubernetes_service_label_(.+)
      - source_labels: [__meta_kubernetes_namespace]
        action: replace
        target_label: kubernetes_namespace
      - source_labels: [__meta_kubernetes_service_name]
        action: replace
        target_label: kubernetes_name
shell 复制代码
[root@master 02_ingress-nginx]# curl -X POST "http://10.0.17.100:32010/-/reload"
# 查看 http://10.0.17.100:32010/targets

七、部署 kube-state-metrics,并监控

shell 复制代码
# 部署 kube-state-metrics,地址 https://github.com/kubernetes/kube-state-metrics
$ kubectl apply -f examples/standard
shell 复制代码
# svc.yaml
# Additional Service for kube-state-metrics that carries the prometheus.io/*
# annotations (scrape flag and port 8080) so it can be picked up as a
# scrape target.
apiVersion: v1
kind: Service
metadata:
  name: kube-state-metrics-exporter
  namespace: kube-system
  labels:
    app: kube-state-metrics
  annotations:
    prometheus.io/scrape: "true"
    prometheus.io/port: "8080"
spec:
  type: ClusterIP
  selector:
    app.kubernetes.io/name: kube-state-metrics
  ports:
    - name: "80-8080"
      protocol: TCP
      port: 80
      targetPort: 8080

八、部署 grafana 服务

yaml 复制代码
# Grafana Deployment manifest.
# Runs Grafana 5.3.4 as UID/fsGroup 472 with /var/lib/grafana stored on the
# "grafana" PVC; admin credentials are injected through environment variables.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: grafana
  namespace: kube-ops
  labels:
    app: grafana
spec:
  revisionHistoryLimit: 10
  selector:
    matchLabels:
      app: grafana
  template:
    metadata:
      labels:
        app: grafana
    spec:
      # Pod-level security context.
      securityContext:
        runAsUser: 472
        fsGroup: 472
      volumes:
        - name: storage
          persistentVolumeClaim:
            claimName: grafana
      containers:
        - name: grafana
          image: grafana/grafana:5.3.4
          imagePullPolicy: IfNotPresent
          ports:
            - name: grafana
              containerPort: 3000
          env:
            - name: GF_SECURITY_ADMIN_USER
              value: admin
            # NOTE(review): plain-text password in the manifest; consider
            # sourcing it from a Secret.
            - name: GF_SECURITY_ADMIN_PASSWORD
              value: admin321
          # Readiness starts probing after 60s and tolerates 10 failures.
          readinessProbe:
            httpGet:
              scheme: HTTP
              path: /api/health
              port: 3000
            initialDelaySeconds: 60
            periodSeconds: 10
            timeoutSeconds: 30
            successThreshold: 1
            failureThreshold: 10
          # Liveness has no initial delay and a 1s timeout.
          livenessProbe:
            httpGet:
              scheme: HTTP
              path: /api/health
              port: 3000
            periodSeconds: 10
            timeoutSeconds: 1
            successThreshold: 1
            failureThreshold: 3
          resources:
            requests:
              cpu: 100m
              memory: 256Mi
            limits:
              cpu: 100m
              memory: 256Mi
          volumeMounts:
            - name: storage
              mountPath: /var/lib/grafana
              subPath: grafana
yaml 复制代码
# grafana-pvc.yaml
# Claim named "grafana" in kube-ops: requests 1Gi from the nfs-storage class
# with ReadWriteMany access.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: grafana
  namespace: kube-ops
spec:
  storageClassName: nfs-storage
  accessModes: [ReadWriteMany]
  resources:
    requests:
      storage: 1Gi
yaml 复制代码
# grafana-pv.yaml
# Statically defined NFS-backed volume; storage class, capacity and access
# mode mirror the "grafana" claim.
apiVersion: v1
kind: PersistentVolume
metadata:
  name: grafana
spec:
  storageClassName: nfs-storage
  capacity:
    storage: 1Gi
  accessModes: [ReadWriteMany]
  # NOTE(review): the Recycle policy is deprecated upstream and scrubs the
  # volume when the claim is released; confirm that is intended.
  persistentVolumeReclaimPolicy: Recycle
  nfs:
    server: 10.0.17.100  # NFS server address
    path: /nfsdata/grafana  # exported directory on that server
yaml 复制代码
# grafana-svc.yaml
# NodePort Service in front of the pods labelled app=grafana (port 3000).
apiVersion: v1
kind: Service
metadata:
  name: grafana
  namespace: kube-ops
  labels:
    app: grafana
spec:
  type: NodePort
  selector:
    app: grafana
  ports:
    - port: 3000
yaml 复制代码
# grafana-chown-job.yaml
# One-off Job that recursively chowns the Grafana data directory on the
# "grafana" PVC to 472:472, the UID/GID the Grafana pod runs with.
apiVersion: batch/v1
kind: Job
metadata:
  name: grafana-chown
  namespace: kube-ops
spec:
  template:
    spec:
      restartPolicy: Never
      volumes:
        - name: storage
          persistentVolumeClaim:
            claimName: grafana
      containers:
        - name: grafana-chown
          image: busybox
          imagePullPolicy: IfNotPresent
          command:
            - chown
            - -R
            - "472:472"
            - /var/lib/grafana
          volumeMounts:
            # Same sub-path and mount point as the Grafana container uses.
            - name: storage
              mountPath: /var/lib/grafana
              subPath: grafana
shell 复制代码
http://10.0.17.100:32362
# 添加数据源:Type 选择 Prometheus;由于 Prometheus 和 grafana 处于同一 namespace(kube-ops),URL 填写 http://prometheus:9090
# save&test

数据面板json文件

json 复制代码
{
  "annotations": {
    "list": [
      {
        "builtIn": 1,
        "datasource": "-- Grafana --",
        "enable": true,
        "hide": true,
        "iconColor": "rgba(0, 211, 255, 1)",
        "name": "Annotations & Alerts",
        "type": "dashboard"
      }
    ]
  },
  "editable": true,
  "gnetId": null,
  "graphTooltip": 0,
  "id": 1,
  "links": [],
  "panels": [
    {
      "collapsed": false,
      "gridPos": {
        "h": 1,
        "w": 24,
        "x": 0,
        "y": 0
      },
      "id": 40,
      "panels": [],
      "title": "Kubernetes 指标",
      "type": "row"
    },
    {
      "aliasColors": {},
      "bars": false,
      "dashLength": 10,
      "dashes": false,
      "datasource": "prometheus",
      "fill": 1,
      "gridPos": {
        "h": 7,
        "w": 8,
        "x": 0,
        "y": 1
      },
      "id": 38,
      "legend": {
        "alignAsTable": false,
        "avg": false,
        "current": false,
        "hideEmpty": false,
        "max": false,
        "min": false,
        "rightSide": false,
        "show": true,
        "total": false,
        "values": false
      },
      "lines": true,
      "linewidth": 1,
      "links": [],
      "nullPointMode": "null",
      "percentage": false,
      "pointradius": 5,
      "points": false,
      "renderer": "flot",
      "seriesOverrides": [],
      "spaceLength": 10,
      "stack": false,
      "steppedLine": false,
      "targets": [
        {
          "expr": "sum(rate(apiserver_request_total[1m]))",
          "format": "time_series",
          "intervalFactor": 1,
          "legendFormat": "一分钟平均",
          "refId": "A"
        },
        {
          "expr": "sum(rate(apiserver_request_total[5m]))",
          "format": "time_series",
          "intervalFactor": 1,
          "legendFormat": "五分钟平均",
          "refId": "B"
        }
      ],
      "thresholds": [],
      "timeFrom": null,
      "timeShift": null,
      "title": "ApiServer 每分钟请求数",
      "tooltip": {
        "shared": true,
        "sort": 0,
        "value_type": "individual"
      },
      "type": "graph",
      "xaxis": {
        "buckets": null,
        "mode": "time",
        "name": null,
        "show": true,
        "values": []
      },
      "yaxes": [
        {
          "format": "short",
          "label": null,
          "logBase": 1,
          "max": null,
          "min": null,
          "show": true
        },
        {
          "format": "short",
          "label": null,
          "logBase": 1,
          "max": null,
          "min": null,
          "show": true
        }
      ],
      "yaxis": {
        "align": false,
        "alignLevel": null
      }
    },
    {
      "cacheTimeout": null,
      "colorBackground": false,
      "colorValue": false,
      "colors": [
        "#299c46",
        "rgba(237, 129, 40, 0.89)",
        "#d44a3a"
      ],
      "datasource": "prometheus",
      "format": "none",
      "gauge": {
        "maxValue": 100,
        "minValue": 0,
        "show": false,
        "thresholdLabels": false,
        "thresholdMarkers": true
      },
      "gridPos": {
        "h": 7,
        "w": 4,
        "x": 8,
        "y": 1
      },
      "id": 32,
      "interval": null,
      "links": [],
      "mappingType": 1,
      "mappingTypes": [
        {
          "name": "value to text",
          "value": 1
        },
        {
          "name": "range to text",
          "value": 2
        }
      ],
      "maxDataPoints": 100,
      "nullPointMode": "connected",
      "nullText": null,
      "postfix": "",
      "postfixFontSize": "50%",
      "prefix": "",
      "prefixFontSize": "50%",
      "rangeMaps": [
        {
          "from": "null",
          "text": "N/A",
          "to": "null"
        }
      ],
      "sparkline": {
        "fillColor": "rgba(31, 118, 189, 0.18)",
        "full": false,
        "lineColor": "rgb(31, 120, 193)",
        "show": false
      },
      "tableColumn": "",
      "targets": [
        {
          "expr": "kubelet_active_pods{beta_kubernetes_io_arch=\"amd64\",beta_kubernetes_io_os=\"linux\",instance=\"master\",job=\"kubernetes-kubelet\",kubernetes_io_arch=\"amd64\",kubernetes_io_hostname=\"master\",kubernetes_io_os=\"linux\",static=\"\"}",
          "format": "time_series",
          "intervalFactor": 1,
          "legendFormat": "",
          "refId": "A"
        }
      ],
      "thresholds": "",
      "title": "Master Pod 总量",
      "type": "singlestat",
      "valueFontSize": "80%",
      "valueMaps": [
        {
          "op": "=",
          "text": "N/A",
          "value": "null"
        }
      ],
      "valueName": "avg"
    },
    {
      "cacheTimeout": null,
      "colorBackground": false,
      "colorValue": false,
      "colors": [
        "#299c46",
        "rgba(237, 129, 40, 0.89)",
        "#d44a3a"
      ],
      "datasource": "prometheus",
      "format": "none",
      "gauge": {
        "maxValue": 100,
        "minValue": 0,
        "show": false,
        "thresholdLabels": false,
        "thresholdMarkers": true
      },
      "gridPos": {
        "h": 7,
        "w": 4,
        "x": 12,
        "y": 1
      },
      "id": 34,
      "interval": null,
      "links": [],
      "mappingType": 1,
      "mappingTypes": [
        {
          "name": "value to text",
          "value": 1
        },
        {
          "name": "range to text",
          "value": 2
        }
      ],
      "maxDataPoints": 100,
      "nullPointMode": "connected",
      "nullText": null,
      "postfix": "",
      "postfixFontSize": "50%",
      "prefix": "",
      "prefixFontSize": "50%",
      "rangeMaps": [
        {
          "from": "null",
          "text": "N/A",
          "to": "null"
        }
      ],
      "sparkline": {
        "fillColor": "rgba(31, 118, 189, 0.18)",
        "full": false,
        "lineColor": "rgb(31, 120, 193)",
        "show": false
      },
      "tableColumn": "",
      "targets": [
        {
          "expr": "kubelet_active_pods{kubernetes_io_hostname=\"node1\",static=\"\"}",
          "format": "time_series",
          "intervalFactor": 1,
          "refId": "A"
        }
      ],
      "thresholds": "",
      "title": "node1 Pod 总量",
      "type": "singlestat",
      "valueFontSize": "80%",
      "valueMaps": [
        {
          "op": "=",
          "text": "N/A",
          "value": "null"
        }
      ],
      "valueName": "avg"
    },
    {
      "cacheTimeout": null,
      "colorBackground": false,
      "colorValue": false,
      "colors": [
        "#299c46",
        "rgba(237, 129, 40, 0.89)",
        "#d44a3a"
      ],
      "datasource": "prometheus",
      "format": "none",
      "gauge": {
        "maxValue": 100,
        "minValue": 0,
        "show": false,
        "thresholdLabels": false,
        "thresholdMarkers": true
      },
      "gridPos": {
        "h": 7,
        "w": 4,
        "x": 16,
        "y": 1
      },
      "id": 36,
      "interval": null,
      "links": [],
      "mappingType": 1,
      "mappingTypes": [
        {
          "name": "value to text",
          "value": 1
        },
        {
          "name": "range to text",
          "value": 2
        }
      ],
      "maxDataPoints": 100,
      "nullPointMode": "connected",
      "nullText": null,
      "postfix": "",
      "postfixFontSize": "50%",
      "prefix": "",
      "prefixFontSize": "50%",
      "rangeMaps": [
        {
          "from": "null",
          "text": "N/A",
          "to": "null"
        }
      ],
      "sparkline": {
        "fillColor": "rgba(31, 118, 189, 0.18)",
        "full": false,
        "lineColor": "rgb(31, 120, 193)",
        "show": false
      },
      "tableColumn": "",
      "targets": [
        {
          "expr": "kubelet_active_pods{kubernetes_io_hostname=\"node2\",static=\"\"}",
          "format": "time_series",
          "intervalFactor": 1,
          "refId": "A"
        }
      ],
      "thresholds": "",
      "title": "node2 Pod 总量",
      "type": "singlestat",
      "valueFontSize": "80%",
      "valueMaps": [
        {
          "op": "=",
          "text": "N/A",
          "value": "null"
        }
      ],
      "valueName": "avg"
    },
    {
      "collapsed": false,
      "gridPos": {
        "h": 1,
        "w": 24,
        "x": 0,
        "y": 8
      },
      "id": 24,
      "panels": [],
      "title": "Kubernetes 物理机",
      "type": "row"
    },
    {
      "aliasColors": {},
      "bars": true,
      "dashLength": 10,
      "dashes": false,
      "datasource": "prometheus",
      "decimals": null,
      "fill": 4,
      "gridPos": {
        "h": 6,
        "w": 8,
        "x": 0,
        "y": 9
      },
      "id": 10,
      "legend": {
        "alignAsTable": false,
        "avg": false,
        "current": false,
        "hideEmpty": false,
        "max": false,
        "min": false,
        "rightSide": false,
        "show": true,
        "total": false,
        "values": false
      },
      "lines": false,
      "linewidth": 1,
      "links": [],
      "nullPointMode": "null",
      "percentage": false,
      "pointradius": 5,
      "points": false,
      "renderer": "flot",
      "seriesOverrides": [],
      "spaceLength": 10,
      "stack": false,
      "steppedLine": false,
      "targets": [
        {
          "expr": "100 * (1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\", instance=\"master\"}[1m])))  ",
          "format": "time_series",
          "hide": false,
          "intervalFactor": 1,
          "legendFormat": "Master01",
          "refId": "A"
        },
        {
          "expr": "100 * (1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\", instance=\"node1\"}[1m])))",
          "format": "time_series",
          "hide": false,
          "intervalFactor": 1,
          "legendFormat": "Node01",
          "refId": "B"
        },
        {
          "expr": "100 * (1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\", instance=\"node2\"}[1m])))",
          "format": "time_series",
          "hide": false,
          "intervalFactor": 1,
          "legendFormat": "Node02",
          "refId": "C"
        }
      ],
      "thresholds": [],
      "timeFrom": null,
      "timeShift": null,
      "title": "CPU 使用量(百分比)",
      "tooltip": {
        "shared": false,
        "sort": 0,
        "value_type": "individual"
      },
      "type": "graph",
      "xaxis": {
        "buckets": null,
        "mode": "series",
        "name": null,
        "show": true,
        "values": [
          "current"
        ]
      },
      "yaxes": [
        {
          "format": "none",
          "label": null,
          "logBase": 1,
          "max": null,
          "min": null,
          "show": true
        },
        {
          "format": "short",
          "label": null,
          "logBase": 1,
          "max": null,
          "min": null,
          "show": true
        }
      ],
      "yaxis": {
        "align": false,
        "alignLevel": null
      }
    },
    {
      "aliasColors": {},
      "bars": true,
      "dashLength": 10,
      "dashes": false,
      "datasource": "prometheus",
      "fill": 3,
      "gridPos": {
        "h": 6,
        "w": 8,
        "x": 8,
        "y": 9
      },
      "id": 2,
      "legend": {
        "alignAsTable": false,
        "avg": false,
        "current": false,
        "hideEmpty": false,
        "max": false,
        "min": false,
        "rightSide": false,
        "show": false,
        "total": false,
        "values": false
      },
      "lines": false,
      "linewidth": 3,
      "links": [],
      "nullPointMode": "null",
      "percentage": false,
      "pointradius": 5,
      "points": false,
      "renderer": "flot",
      "seriesOverrides": [],
      "spaceLength": 10,
      "stack": false,
      "steppedLine": false,
      "targets": [
        {
          "expr": "(node_memory_MemTotal_bytes{instance=\"master\"} - (node_memory_MemFree_bytes{instance=\"master\"} + node_memory_Buffers_bytes{instance=\"master\"} + node_memory_Cached_bytes{instance=\"master\"})) / node_memory_MemTotal_bytes{instance=\"master\"} * 100",
          "format": "time_series",
          "intervalFactor": 1,
          "legendFormat": "Master",
          "refId": "A"
        },
        {
          "expr": "(node_memory_MemTotal_bytes{instance=\"node1\"} - (node_memory_MemFree_bytes{instance=\"node1\"} + node_memory_Buffers_bytes{instance=\"node1\"} + node_memory_Cached_bytes{instance=\"node1\"})) / node_memory_MemTotal_bytes{instance=\"node1\"} * 100",
          "format": "time_series",
          "intervalFactor": 1,
          "legendFormat": "node1",
          "refId": "B"
        },
        {
          "expr": "(node_memory_MemTotal_bytes{instance=\"node2\"} - (node_memory_MemFree_bytes{instance=\"node2\"} + node_memory_Buffers_bytes{instance=\"node2\"} + node_memory_Cached_bytes{instance=\"node2\"})) / node_memory_MemTotal_bytes{instance=\"node2\"} * 100",
          "format": "time_series",
          "hide": false,
          "intervalFactor": 1,
          "legendFormat": "node2",
          "refId": "C"
        }
      ],
      "thresholds": [],
      "timeFrom": null,
      "timeShift": null,
      "title": "内存使用量(百分比)",
      "tooltip": {
        "shared": false,
        "sort": 0,
        "value_type": "individual"
      },
      "type": "graph",
      "xaxis": {
        "buckets": null,
        "mode": "series",
        "name": null,
        "show": true,
        "values": [
          "current"
        ]
      },
      "yaxes": [
        {
          "format": "none",
          "label": null,
          "logBase": 1,
          "max": null,
          "min": null,
          "show": true
        },
        {
          "decimals": null,
          "format": "none",
          "label": null,
          "logBase": 1,
          "max": null,
          "min": null,
          "show": true
        }
      ],
      "yaxis": {
        "align": false,
        "alignLevel": null
      }
    },
    {
      "aliasColors": {},
      "bars": true,
      "dashLength": 10,
      "dashes": false,
      "datasource": "prometheus",
      "fill": 1,
      "gridPos": {
        "h": 6,
        "w": 8,
        "x": 16,
        "y": 9
      },
      "id": 18,
      "legend": {
        "avg": false,
        "current": false,
        "max": false,
        "min": false,
        "show": false,
        "total": false,
        "values": false
      },
      "lines": false,
      "linewidth": 1,
      "links": [],
      "nullPointMode": "null",
      "percentage": false,
      "pointradius": 5,
      "points": false,
      "renderer": "flot",
      "seriesOverrides": [],
      "spaceLength": 10,
      "stack": false,
      "steppedLine": false,
      "targets": [
        {
          "expr": "node_filesystem_avail_bytes{device=\"/dev/mapper/rl-root\",mountpoint=\"/rootfs\",instance=\"master\"}",
          "format": "time_series",
          "intervalFactor": 1,
          "legendFormat": "Master01",
          "refId": "A"
        },
        {
          "expr": "node_filesystem_avail_bytes{device=\"/dev/mapper/rl-root\",mountpoint=\"/rootfs\",instance=\"node1\"}",
          "format": "time_series",
          "intervalFactor": 1,
          "legendFormat": "Node01",
          "refId": "B"
        },
        {
          "expr": "node_filesystem_avail_bytes{device=\"/dev/mapper/rl-root\",mountpoint=\"/rootfs\",instance=\"node2\"}",
          "format": "time_series",
          "intervalFactor": 1,
          "legendFormat": "Node02",
          "refId": "C"
        }
      ],
      "thresholds": [],
      "timeFrom": null,
      "timeShift": null,
      "title": "存储空间剩余量",
      "tooltip": {
        "shared": false,
        "sort": 0,
        "value_type": "individual"
      },
      "type": "graph",
      "xaxis": {
        "buckets": null,
        "mode": "series",
        "name": null,
        "show": true,
        "values": [
          "current"
        ]
      },
      "yaxes": [
        {
          "format": "bytes",
          "label": null,
          "logBase": 1,
          "max": null,
          "min": null,
          "show": true
        },
        {
          "format": "short",
          "label": null,
          "logBase": 1,
          "max": null,
          "min": null,
          "show": true
        }
      ],
      "yaxis": {
        "align": false,
        "alignLevel": null
      }
    },
    {
      "aliasColors": {},
      "bars": false,
      "dashLength": 10,
      "dashes": false,
      "datasource": "prometheus",
      "fill": 1,
      "gridPos": {
        "h": 6,
        "w": 24,
        "x": 0,
        "y": 15
      },
      "id": 12,
      "legend": {
        "avg": false,
        "current": false,
        "max": false,
        "min": false,
        "show": true,
        "total": false,
        "values": false
      },
      "lines": true,
      "linewidth": 1,
      "links": [],
      "nullPointMode": "null",
      "percentage": false,
      "pointradius": 5,
      "points": false,
      "renderer": "flot",
      "seriesOverrides": [],
      "spaceLength": 10,
      "stack": true,
      "steppedLine": false,
      "targets": [
        {
          "expr": "sum(rate(node_network_receive_bytes_total{instance=\"master\"}[1m]))",
          "format": "time_series",
          "intervalFactor": 1,
          "legendFormat": "Master01 下行",
          "refId": "A"
        },
        {
          "expr": "sum(rate(node_network_transmit_bytes_total{instance=\"master\"}[1m]))",
          "format": "time_series",
          "intervalFactor": 1,
          "legendFormat": "Master01 上行",
          "refId": "B"
        },
        {
          "expr": "sum(rate(node_network_receive_bytes_total{instance=\"node1\"}[1m]))",
          "format": "time_series",
          "intervalFactor": 1,
          "legendFormat": "Node01  下行",
          "refId": "C"
        },
        {
          "expr": "sum(rate(node_network_transmit_bytes_total{instance=\"node1\"}[1m]))",
          "format": "time_series",
          "intervalFactor": 1,
          "legendFormat": "Node01 上行",
          "refId": "D"
        },
        {
          "expr": "sum(rate(node_network_receive_bytes_total{instance=\"node2\"}[1m]))",
          "format": "time_series",
          "intervalFactor": 1,
          "legendFormat": "Node02 下行",
          "refId": "E"
        },
        {
          "expr": "sum(rate(node_network_transmit_bytes_total{instance=\"node2\"}[1m]))",
          "format": "time_series",
          "intervalFactor": 1,
          "legendFormat": "Node02 上行",
          "refId": "F"
        }
      ],
      "thresholds": [],
      "timeFrom": null,
      "timeShift": null,
      "title": "网络 IO",
      "tooltip": {
        "shared": true,
        "sort": 0,
        "value_type": "individual"
      },
      "type": "graph",
      "xaxis": {
        "buckets": null,
        "mode": "time",
        "name": null,
        "show": true,
        "values": []
      },
      "yaxes": [
        {
          "format": "short",
          "label": null,
          "logBase": 1,
          "max": null,
          "min": null,
          "show": true
        },
        {
          "format": "short",
          "label": null,
          "logBase": 1,
          "max": null,
          "min": null,
          "show": true
        }
      ],
      "yaxis": {
        "align": false,
        "alignLevel": null
      }
    },
    {
      "collapsed": false,
      "gridPos": {
        "h": 1,
        "w": 24,
        "x": 0,
        "y": 21
      },
      "id": 16,
      "panels": [],
      "repeat": null,
      "title": "Pod",
      "type": "row"
    },
    {
      "aliasColors": {},
      "bars": false,
      "dashLength": 10,
      "dashes": false,
      "datasource": "prometheus",
      "fill": 1,
      "gridPos": {
        "h": 7,
        "w": 8,
        "x": 0,
        "y": 22
      },
      "id": 20,
      "legend": {
        "avg": false,
        "current": false,
        "max": false,
        "min": false,
        "show": true,
        "total": false,
        "values": false
      },
      "lines": true,
      "linewidth": 1,
      "links": [],
      "nullPointMode": "null",
      "percentage": false,
      "pointradius": 5,
      "points": false,
      "renderer": "flot",
      "seriesOverrides": [],
      "spaceLength": 10,
      "stack": false,
      "steppedLine": false,
      "targets": [
        {
          "expr": "100 *  (sum(rate(container_cpu_usage_seconds_total{namespace=\"kube-system\", pod=\"kube-apiserver-master\"}[1m])) by (namespace, pod))",
          "format": "time_series",
          "interval": "",
          "intervalFactor": 1,
          "legendFormat": "apiServer",
          "refId": "A"
        },
        {
          "expr": "",
          "format": "time_series",
          "intervalFactor": 1,
          "refId": "B"
        }
      ],
      "thresholds": [],
      "timeFrom": null,
      "timeShift": null,
      "title": "Pod  CPU 使用量",
      "tooltip": {
        "shared": true,
        "sort": 0,
        "value_type": "individual"
      },
      "type": "graph",
      "xaxis": {
        "buckets": null,
        "mode": "time",
        "name": null,
        "show": true,
        "values": []
      },
      "yaxes": [
        {
          "format": "none",
          "label": null,
          "logBase": 1,
          "max": null,
          "min": null,
          "show": true
        },
        {
          "format": "short",
          "label": null,
          "logBase": 1,
          "max": null,
          "min": null,
          "show": true
        }
      ],
      "yaxis": {
        "align": false,
        "alignLevel": null
      }
    },
    {
      "aliasColors": {},
      "bars": false,
      "dashLength": 10,
      "dashes": false,
      "datasource": "prometheus",
      "fill": 1,
      "gridPos": {
        "h": 7,
        "w": 8,
        "x": 8,
        "y": 22
      },
      "id": 22,
      "legend": {
        "avg": false,
        "current": false,
        "max": false,
        "min": false,
        "show": true,
        "total": false,
        "values": false
      },
      "lines": true,
      "linewidth": 1,
      "links": [],
      "nullPointMode": "null",
      "percentage": false,
      "pointradius": 5,
      "points": false,
      "renderer": "flot",
      "seriesOverrides": [],
      "spaceLength": 10,
      "stack": false,
      "steppedLine": false,
      "targets": [
        {
          "expr": "sum(container_memory_usage_bytes{namespace=\"kube-system\", pod=\"kube-apiserver-master\"}) by (namespace, pod)",
          "format": "time_series",
          "intervalFactor": 1,
          "legendFormat": "apiServer",
          "refId": "A"
        }
      ],
      "thresholds": [],
      "timeFrom": null,
      "timeShift": null,
      "title": "Pod 内存使用量",
      "tooltip": {
        "shared": true,
        "sort": 0,
        "value_type": "individual"
      },
      "type": "graph",
      "xaxis": {
        "buckets": null,
        "mode": "time",
        "name": null,
        "show": true,
        "values": []
      },
      "yaxes": [
        {
          "format": "bytes",
          "label": null,
          "logBase": 1,
          "max": null,
          "min": null,
          "show": true
        },
        {
          "format": "short",
          "label": null,
          "logBase": 1,
          "max": null,
          "min": null,
          "show": true
        }
      ],
      "yaxis": {
        "align": false,
        "alignLevel": null
      }
    },
    {
      "collapsed": false,
      "gridPos": {
        "h": 1,
        "w": 24,
        "x": 0,
        "y": 29
      },
      "id": 8,
      "panels": [],
      "title": "Ingress-Nginx",
      "type": "row"
    },
    {
      "cacheTimeout": null,
      "colorBackground": false,
      "colorValue": false,
      "colors": [
        "#299c46",
        "rgba(237, 129, 40, 0.89)",
        "#d44a3a"
      ],
      "datasource": "prometheus",
      "format": "none",
      "gauge": {
        "maxValue": 100,
        "minValue": 0,
        "show": false,
        "thresholdLabels": false,
        "thresholdMarkers": true
      },
      "gridPos": {
        "h": 2,
        "w": 4,
        "x": 0,
        "y": 30
      },
      "id": 28,
      "interval": null,
      "links": [],
      "mappingType": 1,
      "mappingTypes": [
        {
          "name": "value to text",
          "value": 1
        },
        {
          "name": "range to text",
          "value": 2
        }
      ],
      "maxDataPoints": 100,
      "nullPointMode": "connected",
      "nullText": null,
      "postfix": "",
      "postfixFontSize": "50%",
      "prefix": "",
      "prefixFontSize": "50%",
      "rangeMaps": [
        {
          "from": "null",
          "text": "N/A",
          "to": "null"
        }
      ],
      "sparkline": {
        "fillColor": "rgba(31, 118, 189, 0.18)",
        "full": false,
        "lineColor": "rgb(31, 120, 193)",
        "show": false
      },
      "tableColumn": "",
      "targets": [
        {
          "expr": "nginx_ingress_controller_success{job=\"ingressnginx12\"}",
          "format": "time_series",
          "intervalFactor": 1,
          "legendFormat": "node01",
          "refId": "A"
        },
        {
          "expr": "",
          "format": "time_series",
          "intervalFactor": 1,
          "refId": "B"
        }
      ],
      "thresholds": "",
      "title": "node1 Nginx-ingress 重载次数",
      "type": "singlestat",
      "valueFontSize": "80%",
      "valueMaps": [
        {
          "op": "=",
          "text": "N/A",
          "value": "null"
        }
      ],
      "valueName": "avg"
    },
    {
      "cacheTimeout": null,
      "colorBackground": false,
      "colorValue": false,
      "colors": [
        "#299c46",
        "rgba(237, 129, 40, 0.89)",
        "#d44a3a"
      ],
      "datasource": "prometheus",
      "format": "none",
      "gauge": {
        "maxValue": 100,
        "minValue": 0,
        "show": false,
        "thresholdLabels": false,
        "thresholdMarkers": true
      },
      "gridPos": {
        "h": 2,
        "w": 4,
        "x": 4,
        "y": 30
      },
      "id": 30,
      "interval": null,
      "links": [],
      "mappingType": 1,
      "mappingTypes": [
        {
          "name": "value to text",
          "value": 1
        },
        {
          "name": "range to text",
          "value": 2
        }
      ],
      "maxDataPoints": 100,
      "nullPointMode": "connected",
      "nullText": null,
      "postfix": "",
      "postfixFontSize": "50%",
      "prefix": "",
      "prefixFontSize": "50%",
      "rangeMaps": [
        {
          "from": "null",
          "text": "N/A",
          "to": "null"
        }
      ],
      "sparkline": {
        "fillColor": "rgba(31, 118, 189, 0.18)",
        "full": false,
        "lineColor": "rgb(31, 120, 193)",
        "show": false
      },
      "tableColumn": "",
      "targets": [
        {
          "expr": "nginx_ingress_controller_success{job=\"ingressnginx13\"}",
          "format": "time_series",
          "intervalFactor": 1,
          "refId": "A"
        }
      ],
      "thresholds": "",
      "title": "node2 Nginx-ingress 重载次数",
      "type": "singlestat",
      "valueFontSize": "80%",
      "valueMaps": [
        {
          "op": "=",
          "text": "N/A",
          "value": "null"
        }
      ],
      "valueName": "avg"
    },
    {
      "aliasColors": {},
      "bars": false,
      "dashLength": 10,
      "dashes": false,
      "datasource": "prometheus",
      "fill": 1,
      "gridPos": {
        "h": 5,
        "w": 8,
        "x": 0,
        "y": 32
      },
      "id": 4,
      "legend": {
        "avg": false,
        "current": false,
        "max": false,
        "min": false,
        "show": true,
        "total": false,
        "values": false
      },
      "lines": true,
      "linewidth": 1,
      "links": [],
      "nullPointMode": "null",
      "percentage": false,
      "pointradius": 5,
      "points": false,
      "renderer": "flot",
      "seriesOverrides": [],
      "spaceLength": 10,
      "stack": false,
      "steppedLine": false,
      "targets": [
        {
          "expr": "nginx_ingress_controller_nginx_process_requests_total{controller_class=\"k8s.io/ingress-nginx\",controller_namespace=\"ingress\",controller_pod=\"ingress-nginx-controller-c5h6j\",instance=\"10.0.17.101:10254\",job=\"ingressnginx12\"}",
          "format": "time_series",
          "intervalFactor": 1,
          "legendFormat": "node01",
          "refId": "A"
        },
        {
          "expr": "nginx_ingress_controller_nginx_process_requests_total{controller_class=\"k8s.io/ingress-nginx\",controller_namespace=\"ingress\",controller_pod=\"ingress-nginx-controller-c5h6j\",instance=\"10.0.17.102:10254\",job=\"ingressnginx13\"}",
          "format": "time_series",
          "intervalFactor": 1,
          "legendFormat": "node02",
          "refId": "B"
        }
      ],
      "thresholds": [],
      "timeFrom": null,
      "timeShift": null,
      "title": "Ingress-Nginx 请求量",
      "tooltip": {
        "shared": true,
        "sort": 0,
        "value_type": "individual"
      },
      "type": "graph",
      "xaxis": {
        "buckets": null,
        "mode": "time",
        "name": null,
        "show": true,
        "values": []
      },
      "yaxes": [
        {
          "format": "short",
          "label": null,
          "logBase": 1,
          "max": null,
          "min": null,
          "show": true
        },
        {
          "format": "short",
          "label": null,
          "logBase": 1,
          "max": null,
          "min": null,
          "show": true
        }
      ],
      "yaxis": {
        "align": false,
        "alignLevel": null
      }
    },
    {
      "collapsed": false,
      "gridPos": {
        "h": 1,
        "w": 24,
        "x": 0,
        "y": 37
      },
      "id": 6,
      "panels": [],
      "title": "NFS-StorageClass",
      "type": "row"
    },
    {
      "aliasColors": {},
      "bars": false,
      "dashLength": 10,
      "dashes": false,
      "datasource": "prometheus",
      "fill": 1,
      "gridPos": {
        "h": 6,
        "w": 24,
        "x": 0,
        "y": 38
      },
      "id": 14,
      "legend": {
        "avg": false,
        "current": false,
        "max": false,
        "min": false,
        "show": true,
        "total": false,
        "values": false
      },
      "lines": true,
      "linewidth": 1,
      "links": [],
      "nullPointMode": "null",
      "percentage": false,
      "pointradius": 5,
      "points": false,
      "renderer": "flot",
      "seriesOverrides": [],
      "spaceLength": 10,
      "stack": false,
      "steppedLine": false,
      "targets": [
        {
          "expr": "node_nfsd_disk_bytes_read_total{beta_kubernetes_io_arch=\"amd64\",beta_kubernetes_io_os=\"linux\",instance=\"master\",job=\"kubernetes-nodes\",kubernetes_io_arch=\"amd64\",kubernetes_io_hostname=\"master\",kubernetes_io_os=\"linux\"}",
          "format": "time_series",
          "intervalFactor": 1,
          "legendFormat": "读取总量",
          "refId": "A"
        },
        {
          "expr": "node_nfsd_disk_bytes_written_total{beta_kubernetes_io_arch=\"amd64\",beta_kubernetes_io_os=\"linux\",instance=\"master\",job=\"kubernetes-nodes\",kubernetes_io_arch=\"amd64\",kubernetes_io_hostname=\"master\",kubernetes_io_os=\"linux\"}",
          "format": "time_series",
          "intervalFactor": 1,
          "legendFormat": "写入总量",
          "refId": "B"
        }
      ],
      "thresholds": [],
      "timeFrom": null,
      "timeShift": null,
      "title": "NFS storageClass 读取文件总量",
      "tooltip": {
        "shared": true,
        "sort": 0,
        "value_type": "individual"
      },
      "type": "graph",
      "xaxis": {
        "buckets": null,
        "mode": "time",
        "name": null,
        "show": true,
        "values": []
      },
      "yaxes": [
        {
          "format": "decbytes",
          "label": null,
          "logBase": 1,
          "max": null,
          "min": null,
          "show": true
        },
        {
          "format": "short",
          "label": null,
          "logBase": 1,
          "max": null,
          "min": null,
          "show": true
        }
      ],
      "yaxis": {
        "align": false,
        "alignLevel": null
      }
    }
  ],
  "refresh": false,
  "schemaVersion": 16,
  "style": "dark",
  "tags": [],
  "templating": {
    "list": []
  },
  "time": {
    "from": "now-6h",
    "to": "now"
  },
  "timepicker": {
    "refresh_intervals": [
      "5s",
      "10s",
      "30s",
      "1m",
      "5m",
      "15m",
      "30m",
      "1h",
      "2h",
      "1d"
    ],
    "time_options": [
      "5m",
      "15m",
      "1h",
      "6h",
      "12h",
      "24h",
      "2d",
      "7d",
      "30d"
    ]
  },
  "timezone": "",
  "title": "Kubernetes 监控",
  "uid": "Lwdu47xIk",
  "version": 2
}

九、监控 metrics.server

从 Kubernetes v1.8 开始,资源使用情况的监控可以通过 Metrics API 的形式获取,例如容器 CPU 和内存使用率。这些度量可以由用户直接访问(例如,通过使用 kubectl top 命令),或者由集群中的控制器(例如,Horizontal Pod Autoscaler)使用来进行决策,具体的组件为 Metrics Server,用来替换之前的 heapster,heapster 从 1.11 开始逐渐被废弃。

Metrics-Server 是集群核心监控数据的聚合器。通俗地说,它存储了集群中各节点的监控数据,并且提供了 API 以供分析和使用。Metrics-Server 作为一个 Deployment 对象默认部署在 Kubernetes 集群中。不过准确地说,它是 Deployment,Service,ClusterRole,ClusterRoleBinding,APIService,RoleBinding 等资源对象的综合体。

shell 复制代码
# https://github.com/kubernetes-sigs/metrics-server
kubectl apply -f https://github.com/kubernetes-sigs/metrics-server/releases/latest/download/components.yaml
shell 复制代码
[root@master 09_metrics-server]# kubectl top pod -A
NAMESPACE     NAME                                            CPU(cores)   MEMORY(bytes)   
default       proxyhttps-deploy-69798bfdb9-fwshg              0m           4Mi             
ingress       ingress-nginx-controller-rrh9d                  3m           84Mi            
ingress       ingress-nginx-controller-x4dgn                  1m           89Mi            
ingress       ingress-nginx-defaultbackend-774db5d85d-v2rvr   1m           9Mi             
kube-ops      grafana-b64b67875-4j4ss                         1m           40Mi            
kube-ops      node-exporter-qjpf8                             0m           13Mi            
kube-ops      node-exporter-xfm95                             2m           14Mi            
kube-ops      node-exporter-xvhmg                             2m           19Mi            
kube-ops      prometheus-844847f5c7-hsvlh                     5m           211Mi           
kube-system   calico-kube-controllers-558d465845-btr5r        3m           36Mi            
kube-system   calico-node-f5r9h                               20m          103Mi           
kube-system   calico-node-g2s29                               21m          121Mi           
kube-system   calico-node-x2jq7                               34m          161Mi           
kube-system   calico-typha-5b56944f9b-r96gn                   2m           45Mi            
kube-system   coredns-857d9ff4c9-6cb2b                        3m           48Mi            
kube-system   coredns-857d9ff4c9-tvrff                        2m           50Mi            
kube-system   etcd-master                                     25m          184Mi           
kube-system   kube-apiserver-master                           51m          552Mi           
kube-system   kube-controller-manager-master                  14m          139Mi           
kube-system   kube-proxy-jwvjm                                9m           72Mi            
kube-system   kube-proxy-rf7hf                                14m          32Mi            
kube-system   kube-proxy-s885z                                1m           32Mi            
kube-system   kube-scheduler-master                           5m           69Mi            
kube-system   metrics-server-75bbd6fd46-7j49n                 3m           65Mi            
ubuntu        test-676c94bbbb-gzvmb                           0m           0Mi 
yaml 复制代码
# Service account in kube-system named metrics-server; it is the subject of the
# role bindings in this manifest and the identity the Deployment's pods run as.
apiVersion: v1
kind: ServiceAccount
metadata:
  name: metrics-server
  namespace: kube-system
  labels:
    k8s-app: metrics-server
---

# Read-only access (get/list/watch) to pod and node objects of the
# metrics.k8s.io API group.
# The aggregate-to-* labels ask the RBAC aggregation controller to fold these
# rules into the admin, edit and view ClusterRoles.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: system:aggregated-metrics-reader
  labels:
    k8s-app: metrics-server
    rbac.authorization.k8s.io/aggregate-to-admin: "true"
    rbac.authorization.k8s.io/aggregate-to-edit: "true"
    rbac.authorization.k8s.io/aggregate-to-view: "true"
rules:
  - apiGroups: [metrics.k8s.io]
    resources: [pods, nodes]
    verbs: [get, list, watch]
---

# Permissions on the core ("") API group that are bound to the metrics-server
# service account by the ClusterRoleBinding of the same name.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: system:metrics-server
  labels:
    k8s-app: metrics-server
rules:
  # Read access to the nodes/metrics subresource.
  - apiGroups: [""]
    resources: [nodes/metrics]
    verbs: [get]
  # Read and watch access to pod and node objects.
  - apiGroups: [""]
    resources: [pods, nodes]
    verbs: [get, list, watch]
---

# Binds the kube-system Role extension-apiserver-authentication-reader to the
# metrics-server service account.
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  name: metrics-server-auth-reader
  namespace: kube-system
  labels:
    k8s-app: metrics-server
subjects:
  - kind: ServiceAccount
    name: metrics-server
    namespace: kube-system
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: Role
  name: extension-apiserver-authentication-reader
---
# Binds the cluster-wide system:auth-delegator ClusterRole to the
# metrics-server service account.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: metrics-server:system:auth-delegator
  labels:
    k8s-app: metrics-server
subjects:
  - kind: ServiceAccount
    name: metrics-server
    namespace: kube-system
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: system:auth-delegator
---
# Binds the system:metrics-server ClusterRole to the metrics-server service
# account in kube-system.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: system:metrics-server
  labels:
    k8s-app: metrics-server
subjects:
  - kind: ServiceAccount
    name: metrics-server
    namespace: kube-system
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: system:metrics-server
---
# Exposes the pods labelled k8s-app=metrics-server on port 443 inside the
# cluster.
apiVersion: v1
kind: Service
metadata:
  name: metrics-server
  namespace: kube-system
  labels:
    k8s-app: metrics-server
spec:
  selector:
    k8s-app: metrics-server
  ports:
    - name: https
      protocol: TCP
      port: 443
      # Refers to a container port by name rather than by number.
      targetPort: https
---
# Runs the metrics-server container in kube-system under the metrics-server
# service account.
apiVersion: apps/v1
kind: Deployment
metadata:
  labels:
    k8s-app: metrics-server
  name: metrics-server
  namespace: kube-system
spec:
  selector:
    matchLabels:
      k8s-app: metrics-server
  strategy:
    rollingUpdate:
      # Never take the running pod down before its replacement is available.
      maxUnavailable: 0
  template:
    metadata:
      labels:
        k8s-app: metrics-server
    spec:
      containers:
      - args:
        # The root filesystem is read-only (see securityContext), so the
        # certificate directory points at the /tmp emptyDir mount.
        - --cert-dir=/tmp
        # Must match the containerPort named "https" below.
        - --secure-port=4443
        # Node address types to try, in priority order, when connecting to a
        # kubelet. InternalIP is listed first so kubelets are reached by IP
        # instead of by hostname. Other accepted values include InternalDNS
        # and ExternalDNS. The flag is given once; repeating it is redundant.
        - --kubelet-preferred-address-types=InternalIP,ExternalIP,Hostname
        - --kubelet-use-node-status-port
        - --metric-resolution=15s
        # Skips verification of the kubelet's certificate, for clusters where
        # kubelet certificate verification has not been set up.
        - --kubelet-insecure-tls
        image: registry.k8s.io/metrics-server/metrics-server:v0.6.4
        imagePullPolicy: IfNotPresent
        livenessProbe:
          failureThreshold: 3
          httpGet:
            path: /livez
            port: https
            scheme: HTTPS
          periodSeconds: 10
        name: metrics-server
        ports:
        - containerPort: 4443
          name: https
          protocol: TCP
        readinessProbe:
          failureThreshold: 3
          httpGet:
            path: /readyz
            port: https
            scheme: HTTPS
          initialDelaySeconds: 20
          periodSeconds: 10
        resources:
          requests:
            cpu: 100m
            memory: 200Mi
        securityContext:
          allowPrivilegeEscalation: false
          readOnlyRootFilesystem: true
          runAsNonRoot: true
          runAsUser: 1000
        volumeMounts:
        - mountPath: /tmp
          name: tmp-dir
      nodeSelector:
        kubernetes.io/os: linux
      priorityClassName: system-cluster-critical
      serviceAccountName: metrics-server
      volumes:
      # Writable scratch space backing /tmp.
      - emptyDir: {}
        name: tmp-dir
---
# Registers version v1beta1 of the metrics.k8s.io API group and points it at
# the metrics-server Service in kube-system.
apiVersion: apiregistration.k8s.io/v1
kind: APIService
metadata:
  name: v1beta1.metrics.k8s.io
  labels:
    k8s-app: metrics-server
spec:
  group: metrics.k8s.io
  version: v1beta1
  groupPriorityMinimum: 100
  versionPriority: 100
  # Skip TLS certificate verification when talking to the backing service.
  insecureSkipTLSVerify: true
  service:
    name: metrics-server
    namespace: kube-system

十、alertmanager 部署

yaml 复制代码
# Alertmanager configuration file; save as alertmanager-conf.yaml.
apiVersion: v1
kind: ConfigMap
metadata:
  name: alert-config
  namespace: kube-ops
data:
  # Every line of the block scalar below, comment lines included, must be
  # indented at least as deep as its first content line. A shallower line ends
  # the scalar early and the manifest no longer parses.
  config.yml: |-
    global:
      # Time after which an alert is declared resolved if it has not been
      # updated.
      resolve_timeout: 5m
      # Settings for sending mail.
      smtp_smarthost: 'smtp.163.com:25'  # SMTP server address and port
      smtp_from: 'wangyanglinux@163.com'  # sender address
      smtp_auth_username: 'wangyanglinux@163.com'  # SMTP account
      # NOTE(review): this credential is stored in plain text in a ConfigMap;
      # consider moving it to a Secret and rotating the value.
      smtp_auth_password: 'APYDOSTDPDUOEEHQ'  # SMTP password
      smtp_hello: '163.com'
      smtp_require_tls: false
    # Root route that every incoming alert enters; it defines how alerts are
    # dispatched.
    route:
      # Labels used to regroup incoming alerts. For example, many alerts that
      # carry cluster=A and alertname=LatncyHigh are batched into one group.
      group_by: ['alertname', 'cluster']
      # When a new alert group is created, wait at least group_wait before
      # sending the first notification, so that several alerts for the same
      # group can be collected and sent together.
      group_wait: 30s
      # After the first notification has been sent, wait group_interval before
      # sending a new batch of alerts for the group.
      group_interval: 5m
      # If a notification has already been sent successfully, wait
      # repeat_interval before sending it again.
      repeat_interval: 5m
      # Default receiver: alerts not matched by any child route go here.
      receiver: default
      # All attributes above are inherited by child routes and can be
      # overridden on each of them.
      routes:
      - receiver: email
        group_wait: 10s
        match:
          team: node
    receivers:
    - name: 'default'
      email_configs:
      - to: 'wangyanglinux@88.com'  # recipient mailbox
        send_resolved: true
    - name: 'email'
      email_configs:
      - to: 'wangyanglinux@88.com'  # recipient mailbox
        send_resolved: true
yaml 复制代码
# Updated Prometheus configuration file, prometheus-cm.yaml.
# Scrape targets: Prometheus itself, two static ingressnginx targets, the
# cluster nodes (address rewritten from port 10250 to 9100), kubelet,
# cAdvisor (through the API server proxy path), the API servers, and every
# service endpoint annotated with prometheus.io/scrape.
# Alerts are sent to the Alertmanager target localhost:9093.
apiVersion: v1
kind: ConfigMap
metadata:
  namespace: kube-ops
  name: prometheus-config
data:
  # Literal block scalar: the text below is stored verbatim under this key.
  prometheus.yml: |
    global:
      scrape_interval: 30s
      scrape_timeout: 30s
    scrape_configs:
    - job_name: 'prometheus'
      static_configs:
        - targets: ['localhost:9090']
    - job_name: 'ingressnginx12'
      static_configs:
        - targets: ['10.0.17.101:10254']
    - job_name: 'ingressnginx13'
      static_configs:
        - targets: ['10.0.17.102:10254']
    - job_name: 'kubernetes-nodes'
      kubernetes_sd_configs:
      - role: node
      relabel_configs:
      - source_labels: [__address__]
        regex: '(.*):10250'
        replacement: '${1}:9100'
        target_label: __address__
        action: replace
      - action: labelmap
        regex: __meta_kubernetes_node_label_(.+)
    - job_name: 'kubernetes-kubelet'
      kubernetes_sd_configs:
      - role: node
      scheme: https
      tls_config:
        ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
        insecure_skip_verify: true
      bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
      relabel_configs:
      - action: labelmap
        regex: __meta_kubernetes_node_label_(.+)
    - job_name: 'kubernetes-cadvisor'
      kubernetes_sd_configs:
      - role: node
      scheme: https
      tls_config:
        ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
      bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
      relabel_configs:
      - action: labelmap
        regex: __meta_kubernetes_node_label_(.+)
      - target_label: __address__
        replacement: kubernetes.default.svc:443
      - source_labels: [__meta_kubernetes_node_name]
        regex: (.+)
        target_label: __metrics_path__
        replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor
    - job_name: 'kubernetes-apiservers'
      kubernetes_sd_configs:
      - role: endpoints
      scheme: https
      tls_config:
        ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
      bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
      relabel_configs:
      - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
        action: keep
        regex: default;kubernetes;https
    - job_name: 'kubernetes-service-endpoints'
      kubernetes_sd_configs:
      - role: endpoints
      relabel_configs:
      - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
        action: keep
        regex: true
      - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme]
        action: replace
        target_label: __scheme__
        regex: (https?)
      - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
        action: replace
        target_label: __metrics_path__
        regex: (.+)
      - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port]
        action: replace
        target_label: __address__
        regex: ([^:]+)(?::\d+)?;(\d+)
        replacement: $1:$2
      - action: labelmap
        regex: __meta_kubernetes_service_label_(.+)
      - source_labels: [__meta_kubernetes_namespace]
        action: replace
        target_label: kubernetes_namespace
      - source_labels: [__meta_kubernetes_service_name]
        action: replace
        target_label: kubernetes_name
    alerting:
      alertmanagers:
        - static_configs:
          - targets: ["localhost:9093"]
yaml 复制代码
# Updated Prometheus Service file, prometheus-svc.yaml.
# NodePort Service that selects the `app: prometheus` pods and exposes two
# ports: the Prometheus web port and the Alertmanager port.
apiVersion: v1
kind: Service
metadata:
  name: prometheus
  namespace: kube-ops
  labels:
    app: prometheus
spec:
  type: NodePort
  selector:
    app: prometheus
  ports:
    # Prometheus web UI; targetPort refers to the container port named `http`.
    - name: web
      port: 9090
      targetPort: http
    # Alertmanager; the port name matches the container port name
    # `alertmanager` used by the Deployment.
    - name: alertmanager
      port: 9093
      targetPort: 9093
yaml 复制代码
# Merge Alertmanager into the Prometheus Deployment file,
# promethus-alertmanager-deploy.yaml.
# Both containers share one pod, so they can reach each other on localhost.
apiVersion: apps/v1
kind: Deployment
metadata:
  namespace: kube-ops
  name: prometheus
  labels:
    app: prometheus
spec:
  selector:
    matchLabels:
      app: prometheus
  template:
    metadata:
      labels:
        app: prometheus
    spec:
      serviceAccountName: prometheus
      # Pod-level security context: containers run as UID 0.
      securityContext:
        runAsUser: 0
      containers:
        # Alertmanager; its configuration directory comes from the
        # `alertcfg` volume (ConfigMap alert-config).
        - name: alertmanager
          image: prom/alertmanager:v0.15.3
          imagePullPolicy: IfNotPresent
          args:
            - '--config.file=/etc/alertmanager/config.yml'
            - '--storage.path=/alertmanager/data'
          ports:
            - name: alertmanager
              containerPort: 9093
          volumeMounts:
            - name: alertcfg
              mountPath: /etc/alertmanager
          resources:
            limits:
              cpu: 100m
              memory: 256Mi
            requests:
              cpu: 100m
              memory: 256Mi
        # Prometheus server; TSDB data lives on the `data` volume and the
        # configuration directory comes from the `config-volume` ConfigMap.
        - name: prometheus
          image: prom/prometheus:v2.4.3
          command:
            - /bin/prometheus
          args:
            - '--config.file=/etc/prometheus/prometheus.yml'
            - '--storage.tsdb.path=/prometheus'
            - '--storage.tsdb.retention=24h'
            # Controls access to the admin HTTP API, which includes
            # operations such as deleting time series.
            - '--web.enable-admin-api'
            # Enables hot reload: calling localhost:9090/-/reload applies
            # configuration changes immediately.
            - '--web.enable-lifecycle'
          ports:
            - name: http
              containerPort: 9090
              protocol: TCP
          volumeMounts:
            - name: data
              mountPath: /prometheus
              subPath: prometheus
            - name: config-volume
              mountPath: /etc/prometheus
          resources:
            limits:
              cpu: 100m
              memory: 512Mi
            requests:
              cpu: 100m
              memory: 512Mi
      volumes:
        - name: data
          persistentVolumeClaim:
            claimName: prometheus
        - name: config-volume
          configMap:
            name: prometheus-config
        - name: alertcfg
          configMap:
            name: alert-config

添加报警演示

yaml 复制代码
# Updated Prometheus ConfigMap that adds a memory-usage alert rule,
# prometheus-cm-test.yaml.
# Adds a `rules.yml` key with one demo alert and a `rule_files` entry in
# prometheus.yml that loads it from /etc/prometheus/rules.yml.
apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-config
  namespace: kube-ops
data:
  # Alerting rules. NodeMemoryUsage fires when used memory (total minus
  # free, buffers and cached) stays above 20% of total for 2 minutes.
  # The alert carries the label `team: node`.
  # NOTE: the description must stay on a single line; a continuation line at
  # column 0 would terminate this block scalar and break the manifest.
  rules.yml: |
    groups:
    - name: test-rule
      rules:
      - alert: NodeMemoryUsage
        expr: (node_memory_MemTotal_bytes - (node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes)) / node_memory_MemTotal_bytes * 100 > 20
        for: 2m
        labels:
          team: node
        annotations:
          summary: "{{$labels.instance}}: High Memory usage detected"
          description: "{{$labels.instance}}: Memory usage is above 20% (current value is: {{ $value }})"
  # Main Prometheus configuration: scrape jobs, Alertmanager target and the
  # rule file reference.
  prometheus.yml: |
    global:
      scrape_interval: 30s
      scrape_timeout: 30s
    scrape_configs:
    - job_name: 'prometheus'
      static_configs:
        - targets: ['localhost:9090']
    - job_name: 'ingressnginx12'
      static_configs:
        - targets: ['10.0.17.101:10254']
    - job_name: 'ingressnginx13'
      static_configs:
        - targets: ['10.0.17.102:10254']
    - job_name: 'kubernetes-nodes'
      kubernetes_sd_configs:
      - role: node
      relabel_configs:
      - source_labels: [__address__]
        regex: '(.*):10250'
        replacement: '${1}:9100'
        target_label: __address__
        action: replace
      - action: labelmap
        regex: __meta_kubernetes_node_label_(.+)
    - job_name: 'kubernetes-kubelet'
      kubernetes_sd_configs:
      - role: node
      scheme: https
      tls_config:
        ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
        insecure_skip_verify: true
      bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
      relabel_configs:
      - action: labelmap
        regex: __meta_kubernetes_node_label_(.+)
    - job_name: 'kubernetes-cadvisor'
      kubernetes_sd_configs:
      - role: node
      scheme: https
      tls_config:
        ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
      bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
      relabel_configs:
      - action: labelmap
        regex: __meta_kubernetes_node_label_(.+)
      - target_label: __address__
        replacement: kubernetes.default.svc:443
      - source_labels: [__meta_kubernetes_node_name]
        regex: (.+)
        target_label: __metrics_path__
        replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor
    - job_name: 'kubernetes-apiservers'
      kubernetes_sd_configs:
      - role: endpoints
      scheme: https
      tls_config:
        ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
      bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
      relabel_configs:
      - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
        action: keep
        regex: default;kubernetes;https
    - job_name: 'kubernetes-service-endpoints'
      kubernetes_sd_configs:
      - role: endpoints
      relabel_configs:
      - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
        action: keep
        regex: true
      - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme]
        action: replace
        target_label: __scheme__
        regex: (https?)
      - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
        action: replace
        target_label: __metrics_path__
        regex: (.+)
      - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port]
        action: replace
        target_label: __address__
        regex: ([^:]+)(?::\d+)?;(\d+)
        replacement: $1:$2
      - action: labelmap
        regex: __meta_kubernetes_service_label_(.+)
      - source_labels: [__meta_kubernetes_namespace]
        action: replace
        target_label: kubernetes_namespace
      - source_labels: [__meta_kubernetes_service_name]
        action: replace
        target_label: kubernetes_name
    alerting:
      alertmanagers:
        - static_configs:
          - targets: ["localhost:9093"]
    rule_files:
        - /etc/prometheus/rules.yml

去除无效监控项

yaml 复制代码
# Remove the memory monitoring rule, prometheus-cm.yaml.
# `rules.yml` is kept (with an empty `groups:` key) because prometheus.yml
# still lists /etc/prometheus/rules.yml under `rule_files`.
apiVersion: v1
kind: ConfigMap
metadata:
  namespace: kube-ops
  name: prometheus-config
data:
  # Rule file with no rule groups defined.
  rules.yml: |
    groups:
  # Main Prometheus configuration: scrape jobs, Alertmanager target and the
  # rule file reference.
  prometheus.yml: |
    global:
      scrape_interval: 30s
      scrape_timeout: 30s
    scrape_configs:
    - job_name: 'prometheus'
      static_configs:
        - targets: ['localhost:9090']
    - job_name: 'ingressnginx12'
      static_configs:
        - targets: ['10.0.17.101:10254']
    - job_name: 'ingressnginx13'
      static_configs:
        - targets: ['10.0.17.102:10254']
    - job_name: 'kubernetes-nodes'
      kubernetes_sd_configs:
      - role: node
      relabel_configs:
      - source_labels: [__address__]
        regex: '(.*):10250'
        replacement: '${1}:9100'
        target_label: __address__
        action: replace
      - action: labelmap
        regex: __meta_kubernetes_node_label_(.+)
    - job_name: 'kubernetes-kubelet'
      kubernetes_sd_configs:
      - role: node
      scheme: https
      tls_config:
        ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
        insecure_skip_verify: true
      bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
      relabel_configs:
      - action: labelmap
        regex: __meta_kubernetes_node_label_(.+)
    - job_name: 'kubernetes-cadvisor'
      kubernetes_sd_configs:
      - role: node
      scheme: https
      tls_config:
        ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
      bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
      relabel_configs:
      - action: labelmap
        regex: __meta_kubernetes_node_label_(.+)
      - target_label: __address__
        replacement: kubernetes.default.svc:443
      - source_labels: [__meta_kubernetes_node_name]
        regex: (.+)
        target_label: __metrics_path__
        replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor
    - job_name: 'kubernetes-apiservers'
      kubernetes_sd_configs:
      - role: endpoints
      scheme: https
      tls_config:
        ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
      bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
      relabel_configs:
      - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
        action: keep
        regex: default;kubernetes;https
    - job_name: 'kubernetes-service-endpoints'
      kubernetes_sd_configs:
      - role: endpoints
      relabel_configs:
      - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
        action: keep
        regex: true
      - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme]
        action: replace
        target_label: __scheme__
        regex: (https?)
      - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
        action: replace
        target_label: __metrics_path__
        regex: (.+)
      - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port]
        action: replace
        target_label: __address__
        regex: ([^:]+)(?::\d+)?;(\d+)
        replacement: $1:$2
      - action: labelmap
        regex: __meta_kubernetes_service_label_(.+)
      - source_labels: [__meta_kubernetes_namespace]
        action: replace
        target_label: kubernetes_namespace
      - source_labels: [__meta_kubernetes_service_name]
        action: replace
        target_label: kubernetes_name
    alerting:
      alertmanagers:
        - static_configs:
          - targets: ["localhost:9093"]
    rule_files:
        - /etc/prometheus/rules.yml
相关推荐
运维小文44 分钟前
K8S资源限制之LimitRange
云原生·容器·kubernetes·k8s资源限制
登云时刻1 小时前
Kubernetes集群外连接redis集群和使用redis-shake工具迁移数据(二)
redis·容器·kubernetes
wuxingge10 小时前
k8s1.30.0高可用集群部署
云原生·容器·kubernetes
志凌海纳SmartX10 小时前
趋势洞察|AI 能否带动裸金属 K8s 强势崛起?
云原生·容器·kubernetes
锅总11 小时前
nacos与k8s service健康检查详解
云原生·容器·kubernetes
BUG弄潮儿11 小时前
k8s 集群安装
云原生·容器·kubernetes
Code_Artist11 小时前
Docker镜像加速解决方案:配置HTTP代理,让Docker学会科学上网!
docker·云原生·容器
何遇mirror12 小时前
云原生基础-云计算概览
后端·云原生·云计算
颜淡慕潇13 小时前
【K8S系列】kubectl describe pod显示ImagePullBackOff,如何进一步排查?
后端·云原生·容器·kubernetes
Linux运维日记13 小时前
k8s1.31版本最新版本集群使用容器镜像仓库Harbor
linux·docker·云原生·容器·kubernetes