k8s巡检脚本

k8s_inspect.py

python 复制代码
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
Kubernetes 集群健康巡检脚本

检查内容:
  - 节点是否处于 Ready 状态
  - Pod 是否正常运行(跳过已完成的 Job)
  - Deployment 是否达到期望副本数
  - Service 是否有可用的后端 Endpoints

使用方法:
  python3 k8s_inspect.py

依赖安装(建议在虚拟环境中):
  pip install kubernetes
"""

import sys
from kubernetes import client, config
from kubernetes.client.rest import ApiException


def load_kube_config():
    """Load the Kubernetes client configuration.

    The in-cluster configuration is attempted first. If that raises
    ``config.ConfigException``, the local kubeconfig file is tried instead.
    When neither can be loaded, a failure message is printed and the
    process exits with status 1.
    """
    try:
        config.load_incluster_config()
    except config.ConfigException:
        # In-cluster settings could not be loaded; fall back to kubeconfig.
        pass
    else:
        print("[成功] 使用集群内配置(in-cluster config)")
        return

    try:
        config.load_kube_config()
    except config.ConfigException as err:
        print(f"[失败] 无法加载 Kubernetes 配置: {err}")
        sys.exit(1)
    else:
        print("[成功] 使用本地 kubeconfig 文件")


def check_nodes():
    """Report whether every node in the cluster is Ready.

    One line is printed per node. A node counts as ready when it has a
    condition whose type is "Ready" and whose status is "True".

    Returns:
        True when all listed nodes are ready; False when at least one node
        is not ready, when no nodes are listed, or when listing the nodes
        raises ``ApiException``.
    """
    print("\n[信息] 正在检查节点状态...")
    core_api = client.CoreV1Api()
    try:
        node_list = core_api.list_node().items
        if not node_list:
            print("[警告] 未发现任何节点")
            return False

        not_ready_count = 0
        for item in node_list:
            node_name = item.metadata.name

            # Scan the node's conditions for a Ready=True entry.
            is_ready = False
            for condition in item.status.conditions or []:
                if condition.type == "Ready" and condition.status == "True":
                    is_ready = True
                    break

            if is_ready:
                print(f"[正常] 节点 {node_name} 已就绪")
            else:
                print(f"[错误] 节点 {node_name} 未就绪(NotReady)")
                not_ready_count += 1

        return not_ready_count == 0
    except ApiException as err:
        print(f"[错误] 获取节点列表失败: {err}")
        return False


def check_pods():
    """Check the state of every Pod across all namespaces.

    Rules, applied in order to each Pod:
      1. Pods owned by a Job whose phase is "Succeeded" (or "Completed")
         are skipped.
      2. Pods with any container or init container waiting with reason
         "CrashLoopBackOff" are reported as errors.
      3. Pods in phase "Running" or "Pending" are treated as healthy.
      4. Any other phase is reported as a warning.

    Returns:
        True when no Pod was reported (including when no Pods exist);
        False when at least one Pod was reported or when listing the Pods
        raises ``ApiException``.
    """
    print("\n[信息] 正在检查 Pod 状态...")
    v1 = client.CoreV1Api()
    try:
        pods = v1.list_pod_for_all_namespaces().items
    except ApiException as e:
        print(f"[错误] 获取 Pod 列表失败: {e}")
        return False

    if not pods:
        print("[信息] 未发现任何 Pod")
        return True

    all_ok = True
    for pod in pods:
        namespace = pod.metadata.namespace
        name = pod.metadata.name
        phase = pod.status.phase

        owners = pod.metadata.owner_references or []
        is_job_pod = any(owner.kind == "Job" for owner in owners)
        # "Completed" is kept from the original check; the documented pod
        # phases are Pending, Running, Succeeded, Failed and Unknown.
        if is_job_pod and phase in ("Succeeded", "Completed"):
            continue

        # This must run BEFORE the phase short-circuit below: a crash-looping
        # Pod still reports phase "Running" (or "Pending" when an init
        # container is the one crashing), so checking the phase first would
        # hide it.
        statuses = (pod.status.init_container_statuses or []) + (
            pod.status.container_statuses or []
        )
        crash_looping = any(
            cs.state is not None
            and cs.state.waiting is not None
            and cs.state.waiting.reason == "CrashLoopBackOff"
            for cs in statuses
        )
        if crash_looping:
            print(f"[错误] Pod {namespace}/{name} 处于 CrashLoopBackOff 状态")
            all_ok = False
            continue

        if phase in ("Running", "Pending"):
            continue

        print(f"[警告] Pod {namespace}/{name} 状态异常: {phase}")
        all_ok = False
    return all_ok


def check_deployments():
    """Verify that each Deployment has as many available replicas as desired.

    A Deployment is reported when its ``status.available_replicas`` is lower
    than its ``spec.replicas``; a missing value on either side is treated
    as 0.

    Returns:
        True when no Deployment is short of replicas (including when there
        are no Deployments); False when at least one is short or when
        listing the Deployments raises ``ApiException``.
    """
    print("\n[信息] 正在检查 Deployment 状态...")
    apps_api = client.AppsV1Api()
    try:
        items = apps_api.list_deployment_for_all_namespaces().items
        if not items:
            print("[信息] 未发现任何 Deployment")
            return True

        healthy = True
        for deployment in items:
            ns = deployment.metadata.namespace
            dep_name = deployment.metadata.name
            wanted = deployment.spec.replicas or 0
            have = deployment.status.available_replicas or 0
            if have >= wanted:
                continue
            print(f"[错误] Deployment {ns}/{dep_name} 可用副本不足: {have}/{wanted}")
            healthy = False
        return healthy
    except ApiException as err:
        print(f"[错误] 获取 Deployment 列表失败: {err}")
        return False


def check_services():
    """Check that every Service with a selector has at least one ready backend.

    Services without a selector are ignored. For each remaining Service the
    Endpoints object of the same name is read, and the Service is reported
    when no subset contains ready ``addresses``. A subset that only carries
    ``not_ready_addresses`` does not count as an available backend.

    Returns:
        True when every checked Service has a ready backend (including when
        there is nothing to check); False when at least one Service has no
        ready backend, when reading its Endpoints fails, or when listing
        the Services raises ``ApiException``.
    """
    print("\n[信息] 正在检查 Service 的 Endpoints...")
    v1 = client.CoreV1Api()
    try:
        services = v1.list_service_for_all_namespaces().items
    except ApiException as e:
        print(f"[错误] 获取 Service 列表失败: {e}")
        return False

    services = [s for s in services if s.spec.selector]
    if not services:
        print("[信息] 未发现带 selector 的 Service")
        return True

    all_ok = True
    for svc in services:
        namespace = svc.metadata.namespace
        name = svc.metadata.name
        try:
            endpoints = v1.read_namespaced_endpoints(name, namespace)
        except ApiException as e:
            print(f"[警告] 读取 Service {namespace}/{name} 的 Endpoints 失败: {e}")
            all_ok = False
            continue

        # `subsets` can be non-empty while holding only not-ready addresses,
        # so require at least one subset with ready addresses.
        has_ready_backend = any(
            subset.addresses for subset in (endpoints.subsets or [])
        )
        if not has_ready_backend:
            print(f"[错误] Service {namespace}/{name} 没有可用的后端 Endpoints")
            all_ok = False
    return all_ok


def main():
    """Run all inspections, print a summary and exit.

    The process exits with status 0 when every check passed and with
    status 1 otherwise.
    """
    print("开始 Kubernetes 集群健康巡检...")

    load_kube_config()

    # The checks run here, in order, so each one's findings are printed
    # before the summary below.
    summary = [
        ("节点状态:       ", check_nodes()),
        ("Pod 状态:       ", check_pods()),
        ("Deployment:     ", check_deployments()),
        ("Service 后端:   ", check_services()),
    ]

    print("\n========== 巡检结果汇总 ==========")
    for label, passed in summary:
        verdict = "正常" if passed else "异常"
        print(f"{label}{verdict}")

    if not all(passed for _, passed in summary):
        print("\n集群存在异常,请根据上述信息排查!")
        sys.exit(1)

    print("\n集群整体健康,无异常!")
    sys.exit(0)


# Run the inspection only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()
相关推荐
Patrick_Wilson1 分钟前
K8s 探针避坑:Next.js 不同部署模式下的健康检查实践
kubernetes·node.js·next.js
运维瓦工14 分钟前
DevOps 生态介绍(十):Docker Compose 核心 YAML 配置详解与常用命令大全
spring cloud·docker·容器
Plastic garden42 分钟前
K8s(10)NFS 的动态 PV 创建数据库给k8s的mysql和redis
docker·容器·kubernetes
Plastic garden1 小时前
k8s(11) Pod 控制器,服务发现与存储管理
kubernetes
与海boy1 小时前
docker compose minio
docker·容器·eureka
星辰徐哥2 小时前
云原生核心特性:容器化、微服务与DevOps的通俗解读
微服务·云原生·devops
武子康2 小时前
调查研究-167 Docker Compose 详解:从单容器到多服务编排的工程化入口
运维·docker·云原生·容器·kubernetes·k8s·docker-compose
heimeiyingwang2 小时前
【架构实战】分布式会话:从Session到JWT的演进
微服务·云原生·架构
旅僧3 小时前
Ubuntu docker环境配置(前置)
运维·docker·容器