分享一个很实用的 K8s 巡检脚本:

bash 复制代码
#!/bin/bash

# ============================================================================
# K8s 集群全面巡检脚本
# 功能:检查节点、Pod、Deployment、Service、Ingress、PVC、事件等资源状态
# 输出:生成炫酷的HTML巡检报告
# 依赖:kubectl, jq
# ============================================================================

# Abort as soon as any unguarded command returns a non-zero status.
set -o errexit

# ----------------------------- Configuration ---------------------------------
# Output file; the timestamp suffix keeps successive runs from colliding.
REPORT_FILE="k8s_inspection_report_$(date '+%Y%m%d_%H%M%S').html"

# Executables used by the script.
KUBECTL='kubectl'
JQ='jq'

# Empty means every namespace; otherwise a list such as "default,kube-system".
NAMESPACES=''

# Per-command kubectl timeout, in seconds.
TIMEOUT=10

# Log verbosity: INFO / WARN / ERROR.
LOG_LEVEL='INFO'

# Colour escape sequences; used only for the script's own log lines.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[0;33m'
BLUE='\033[0;34m'
NC='\033[0m'   # resets the colour

# ----------------------------- 函数定义 -------------------------------------

# Leveled logger used by the rest of the script.
#
# Arguments:
#   $1   - severity: DEBUG, INFO, WARN or ERROR; any other value is printed
#          with an [UNKNOWN] tag
#   $2.. - message words, joined with single spaces
# Globals read:
#   LOG_LEVEL - minimum severity to print (DEBUG < INFO < WARN < ERROR);
#               unset or unrecognised values behave like INFO
#   RED, GREEN, YELLOW, BLUE, NC - colour escape sequences
# Output:
#   INFO and DEBUG lines go to stdout. WARN and ERROR go to stderr so that a
#   warning raised inside a function whose stdout is captured by the caller
#   cannot end up in the captured value.
log() {
    local level=$1
    shift
    local message="$*"
    # Declared separately from the assignment so a failing date(1) is not
    # hidden behind the exit status of `local`.
    local timestamp
    timestamp=$(date '+%Y-%m-%d %H:%M:%S')

    # Map the message severity and the configured threshold to numbers.
    local severity threshold
    case "$level" in
        DEBUG) severity=0 ;;
        INFO)  severity=1 ;;
        WARN)  severity=2 ;;
        ERROR) severity=3 ;;
        *)     severity=4 ;;   # unknown levels are never filtered out
    esac
    case "${LOG_LEVEL:-INFO}" in
        DEBUG) threshold=0 ;;
        WARN)  threshold=2 ;;
        ERROR) threshold=3 ;;
        *)     threshold=1 ;;
    esac
    if [ "$severity" -lt "$threshold" ]; then
        return 0
    fi

    case "$level" in
        ERROR)
            echo -e "${RED}[ERROR] [$timestamp]${NC} $message" >&2
            ;;
        WARN)
            echo -e "${YELLOW}[WARN] [$timestamp]${NC} $message" >&2
            ;;
        INFO)
            echo -e "${GREEN}[INFO] [$timestamp]${NC} $message"
            ;;
        DEBUG)
            echo -e "${BLUE}[DEBUG] [$timestamp]${NC} $message"
            ;;
        *)
            echo -e "[UNKNOWN] [$timestamp] $message"
            ;;
    esac
}

# Convenience wrappers: each forwards its arguments to log() with a fixed severity.
log_info() {
    log "INFO" "$@"
}

log_warn() {
    log "WARN" "$@"
}

log_error() {
    log "ERROR" "$@"
}

# Run a command (normally kubectl) under the TIMEOUT limit.
#
# Arguments: the command followed by its arguments, passed through unchanged
# Globals read: TIMEOUT - limit in seconds handed to timeout(1)
# Output: the command's stdout; its stderr is discarded
# Returns: 0 on success; 1, after logging an error, when the command fails
#          or is killed by the timeout
run_kubectl_with_timeout() {
    # Expand "$@" instead of a flattened, re-split string: arguments that
    # contain spaces or glob characters (for example the jsonpath
    # {.items[*].metadata.name}) must reach the command exactly as written.
    timeout "$TIMEOUT" "$@" 2>/dev/null || {
        log_error "kubectl command failed or timed out: $*"
        return 1
    }
}

# Pre-flight checks: both tools must be on PATH, the cluster must answer, and
# jq must be able to parse a trivial document. The first failed check logs an
# error and terminates the script with status 1.
check_dependencies() {
    log_info "检查依赖..."

    command -v $KUBECTL &> /dev/null || {
        log_error "kubectl 未安装或不在PATH中"
        exit 1
    }

    command -v $JQ &> /dev/null || {
        log_error "jq 未安装或不在PATH中,请先安装 jq"
        exit 1
    }

    log_info "验证集群连通性..."
    run_kubectl_with_timeout $KUBECTL cluster-info &> /dev/null || {
        log_error "无法连接到Kubernetes集群,请检查kubeconfig配置"
        exit 1
    }

    # Smoke test: feed jq a one-key object and make sure it exits cleanly.
    echo '{"test": "ok"}' | $JQ '.' &> /dev/null || {
        log_error "jq 安装有问题,无法处理 JSON"
        exit 1
    }

    log_info "依赖检查通过,开始巡检集群..."
}

# Print the namespaces to inspect as a comma-separated list.
#
# Globals: NAMESPACES (read; filled in when empty), KUBECTL (read)
# Output:  the list on stdout and nothing else, so that callers can capture
#          it with $(get_namespaces). When the list cannot be discovered the
#          literal flag --all-namespaces is printed instead.
get_namespaces() {
    if [ -z "$NAMESPACES" ]; then
        local discovered
        # jsonpath prints the names separated by spaces; tr turns them into commas.
        discovered=$(run_kubectl_with_timeout $KUBECTL get namespaces \
            -o jsonpath='{.items[*].metadata.name}' 2>/dev/null | tr ' ' ',') || discovered=""
        NAMESPACES=$discovered
        if [ -z "$NAMESPACES" ]; then
            # stdout carries the return value, so the warning is forced to stderr.
            log_warn "无法获取命名空间列表,将使用 --all-namespaces 选项" >&2
            NAMESPACES="--all-namespaces"
        fi
    fi
    printf '%s\n' "$NAMESPACES"
}

# Collect node status for the report.
#
# Globals read:    KUBECTL, JQ
# Globals written: NODES_COUNT, NODES_READY, NODES_NOT_READY,
#                  NODES_TABLE (one 9-column <tr> per node)
# When the node list cannot be fetched the counters are set to 0 and
# NODES_TABLE holds a single explanatory row.
collect_nodes() {
    log_info "收集节点信息..."
    local NODES_JSON
    # The fetch is tested inside the `if` so that a failure reaches the
    # fallback below instead of aborting the script under `set -e`.
    if ! NODES_JSON=$(run_kubectl_with_timeout $KUBECTL get nodes -o json) || [ -z "$NODES_JSON" ]; then
        log_error "无法获取节点信息"
        NODES_COUNT=0
        NODES_READY=0
        NODES_NOT_READY=0
        NODES_TABLE="<tr><td colspan='9'>无法获取节点信息</td></tr>"
        return
    fi

    NODES_COUNT=$(echo "$NODES_JSON" | $JQ '.items | length')
    NODES_READY=$(echo "$NODES_JSON" | $JQ '[.items[] | select(.status.conditions[] | select(.type=="Ready" and .status=="True"))] | length')
    NODES_NOT_READY=$((NODES_COUNT - NODES_READY))

    NODES_TABLE=""
    local name status cpu mem allocatable_cpu allocatable_mem taints
    local addr_internal addr_external os arch kernel
    local status_class cpu_display mem_display
    while IFS=$'\t' read -r name status cpu mem allocatable_cpu allocatable_mem taints addr_internal addr_external os arch kernel; do
        status_class="status-bad"
        if [ "$status" == "True" ]; then
            status_class="status-good"
        fi

        # Shown as capacity/allocatable.
        cpu_display="$cpu/$allocatable_cpu"
        mem_display="$mem/$allocatable_mem"

        taints=${taints:-"None"}
        addr_internal=${addr_internal:-"N/A"}
        addr_external=${addr_external:-"N/A"}

        NODES_TABLE+="<tr><td>$name</td><td class=\"$status_class\">$status</td><td>$cpu_display</td><td>$mem_display</td><td>$taints</td><td>$addr_internal</td><td>$addr_external</td><td>$os/$arch</td><td>$kernel</td></tr>"
    done < <(echo "$NODES_JSON" | $JQ -r '
        # Every column is wrapped so that it yields exactly one non-empty
        # value: a filter that produces no output would drop the element and
        # shift the row, and an empty field would be merged with its
        # neighbour because tab is an IFS whitespace character.
        .items[] | [
            .metadata.name,
            ([.status.conditions[]? | select(.type=="Ready") | .status] | first // "Unknown"),
            (.status.capacity.cpu // "?"),
            (.status.capacity.memory // "?"),
            (.status.allocatable.cpu // "?"),
            (.status.allocatable.memory // "?"),
            # Iterate the taints once; three independent iterations would
            # combine every key with every value and every effect.
            ([.spec.taints[]? | .key + "=" + (.value // "") + ":" + (.effect // "")]
                | if length > 0 then join(", ") else "None" end),
            ([.status.addresses[]? | select(.type=="InternalIP") | .address]
                | if length > 0 then join(",") else "N/A" end),
            ([.status.addresses[]? | select(.type=="ExternalIP") | .address]
                | if length > 0 then join(",") else "N/A" end),
            (.status.nodeInfo.osImage // "?"),
            (.status.nodeInfo.architecture // "?"),
            (.status.nodeInfo.kernelVersion // "?")
        ] | @tsv')
}

# Collect pod statistics across all namespaces and list the unhealthy pods.
#
# Globals read:    KUBECTL, JQ
# Globals written: TOTAL_PODS, RUNNING_PODS, PENDING_PODS, FAILED_PODS,
#                  UNKNOWN_PODS, ABNORMAL_CONTAINERS, OOMKILLED,
#                  ABNORMAL_PODS_TABLE (one 6-column <tr> per unhealthy pod,
#                  empty when there is none)
# A pod is listed when its phase is not Running or its Ready condition is not
# "True". Pods in phase Succeeded are skipped: they finished normally.
collect_pods() {
    log_info "收集Pod信息..."
    local PODS_JSON
    # Tested inside the `if` so a failed fetch reaches the fallback instead of
    # aborting the script under `set -e`.
    if ! PODS_JSON=$(run_kubectl_with_timeout $KUBECTL get pods --all-namespaces -o json) || [ -z "$PODS_JSON" ]; then
        log_error "无法获取Pod信息"
        TOTAL_PODS=0
        RUNNING_PODS=0
        PENDING_PODS=0
        FAILED_PODS=0
        UNKNOWN_PODS=0
        ABNORMAL_CONTAINERS=0
        OOMKILLED=0
        ABNORMAL_PODS_TABLE="<tr><td colspan='6'>无法获取Pod信息</td></tr>"
        return
    fi

    TOTAL_PODS=$(echo "$PODS_JSON" | $JQ '.items | length')

    # Per-phase counters.
    RUNNING_PODS=$(echo "$PODS_JSON" | $JQ '[.items[] | select(.status.phase=="Running")] | length')
    PENDING_PODS=$(echo "$PODS_JSON" | $JQ '[.items[] | select(.status.phase=="Pending")] | length')
    FAILED_PODS=$(echo "$PODS_JSON" | $JQ '[.items[] | select(.status.phase=="Failed")] | length')
    UNKNOWN_PODS=$(echo "$PODS_JSON" | $JQ '[.items[] | select(.status.phase=="Unknown")] | length')

    # Containers stuck waiting on a crash loop or an image pull.
    ABNORMAL_CONTAINERS=$(echo "$PODS_JSON" | $JQ '[.items[].status.containerStatuses[]? | select(.state.waiting != null and (.state.waiting.reason == "CrashLoopBackOff" or .state.waiting.reason == "ImagePullBackOff" or .state.waiting.reason == "ErrImagePull"))] | length')
    # A container that was OOM-killed and restarted reports the kill under
    # lastState, so both the current and the previous state are checked.
    OOMKILLED=$(echo "$PODS_JSON" | $JQ '[.items[].status.containerStatuses[]? | select(.state.terminated.reason == "OOMKilled" or .lastState.terminated.reason == "OOMKilled")] | length')

    ABNORMAL_PODS_TABLE=""
    local namespace name reason restart_count age_str qos_class
    local status_class="status-bad"
    while IFS=$'\t' read -r namespace name reason restart_count age_str qos_class; do
        age_str=${age_str:-"N/A"}
        qos_class=${qos_class:-"N/A"}

        ABNORMAL_PODS_TABLE+="<tr><td>$namespace</td><td>$name</td><td class=\"$status_class\">$reason</td><td>$restart_count</td><td>$age_str</td><td>$qos_class</td></tr>"
    done < <(echo "$PODS_JSON" | $JQ -r '
        # Age of an RFC 3339 timestamp as a short string (42s, 5m, 3h, 12d).
        def age:
            try (
                (now - (sub("\\.[0-9]+Z$"; "Z") | fromdateiso8601)) | floor
                | if . < 60 then "\(.)s"
                  elif . < 3600 then "\(. / 60 | floor)m"
                  elif . < 86400 then "\(. / 3600 | floor)h"
                  else "\(. / 86400 | floor)d" end
            ) catch "N/A";
        .items[]
        | select(
            .status.phase != "Succeeded" and (
                .status.phase != "Running" or
                ([.status.conditions[]? | select(.type=="Ready") | .status] | first // "False") != "True"
            )
        )
        # Each column yields exactly one non-empty value so rows never shift.
        | [
            .metadata.namespace,
            .metadata.name,
            # First waiting/terminated reason of any container, else the phase.
            (([.status.containerStatuses[]?
                | (.state.waiting.reason // .state.terminated.reason)
                | select(. != null)] | first) // .status.phase // "Unknown"),
            # Restarts summed over all containers of the pod.
            ([.status.containerStatuses[]?.restartCount] | add // 0 | tostring),
            (.metadata.creationTimestamp | age),
            (.status.qosClass // "N/A")
        ] | @tsv' | LC_ALL=C sort -t $'\t' -k1,1 -k2,2)
}

# Collect Deployment replica health across all namespaces.
#
# Globals read:    KUBECTL, JQ
# Globals written: TOTAL_DEPLOY, UNHEALTHY_DEPLOY,
#                  DEPLOY_TABLE (one 6-column <tr> per Deployment)
# A Deployment counts as unhealthy when its desired replica count differs
# from the available or ready count, or when any replica is unavailable.
collect_deployments() {
    log_info "收集Deployment信息..."
    local DEPLOY_JSON
    # Tested inside the `if` so a failed fetch reaches the fallback instead of
    # aborting the script under `set -e`.
    if ! DEPLOY_JSON=$(run_kubectl_with_timeout $KUBECTL get deployments --all-namespaces -o json) || [ -z "$DEPLOY_JSON" ]; then
        log_error "无法获取Deployment信息"
        TOTAL_DEPLOY=0
        UNHEALTHY_DEPLOY=0
        DEPLOY_TABLE="<tr><td colspan='6'>无法获取Deployment信息</td></tr>"
        return
    fi

    TOTAL_DEPLOY=$(echo "$DEPLOY_JSON" | $JQ '.items | length')
    UNHEALTHY_DEPLOY=0
    DEPLOY_TABLE=""

    local ns name desired available ready unavailable status status_class
    while IFS=$'\t' read -r ns name desired available ready unavailable; do
        desired=${desired:-0}
        available=${available:-0}
        ready=${ready:-0}
        unavailable=${unavailable:-0}

        status="健康"
        status_class="status-good"
        if [ "$desired" != "$available" ] || [ "$desired" != "$ready" ] || [ "$unavailable" != "0" ]; then
            status="异常"
            status_class="status-bad"
            # Plain assignment: ((UNHEALTHY_DEPLOY++)) evaluates to 0 on the
            # first increment, which is exit status 1 and fatal under `set -e`.
            UNHEALTHY_DEPLOY=$((UNHEALTHY_DEPLOY + 1))
        fi

        DEPLOY_TABLE+="<tr><td>$ns</td><td>$name</td><td>$desired</td><td>$available</td><td>$ready</td><td class=\"$status_class\">$status</td></tr>"
    done < <(echo "$DEPLOY_JSON" | $JQ -r '.items[] | [
        .metadata.namespace,
        .metadata.name,
        (.spec.replicas // 0),
        (.status.availableReplicas // 0),
        (.status.readyReplicas // 0),
        (.status.unavailableReplicas // 0)
    ] | @tsv')
}

# Collect Service details across all namespaces.
#
# Globals read:    KUBECTL, JQ
# Globals written: TOTAL_SVC,
#                  LB_PENDING (LoadBalancer services without any ingress IP
#                  or hostname yet),
#                  SVC_TABLE (one 8-column <tr> per Service)
collect_services() {
    log_info "收集Service信息..."
    local SVC_JSON
    # Tested inside the `if` so a failed fetch reaches the fallback instead of
    # aborting the script under `set -e`.
    if ! SVC_JSON=$(run_kubectl_with_timeout $KUBECTL get services --all-namespaces -o json) || [ -z "$SVC_JSON" ]; then
        log_error "无法获取Service信息"
        TOTAL_SVC=0
        LB_PENDING=0
        SVC_TABLE="<tr><td colspan='8'>无法获取Service信息</td></tr>"
        return
    fi

    TOTAL_SVC=$(echo "$SVC_JSON" | $JQ '.items | length')

    # Pending load balancers are counted in jq. The ingress list is collected
    # into an array before `length` is applied; chaining
    # `... or .ingress | length == 0` would pipe a boolean into `length`.
    LB_PENDING=$(echo "$SVC_JSON" | $JQ '[.items[] | select(
        .spec.type == "LoadBalancer" and
        ([.status.loadBalancer.ingress[]? | (.ip // .hostname) | select(. != null)] | length) == 0
    )] | length')

    SVC_TABLE=""
    local ns name svcType clusterIP externalIP ports selector session_affinity
    while IFS=$'\t' read -r ns name svcType clusterIP externalIP ports selector session_affinity; do
        SVC_TABLE+="<tr><td>$ns</td><td>$name</td><td>$svcType</td><td>$clusterIP</td><td>$externalIP</td><td>$ports</td><td>$selector</td><td>$session_affinity</td></tr>"
    done < <(echo "$SVC_JSON" | $JQ -r '
        # Each column yields exactly one non-empty value so rows never shift.
        .items[] | [
            .metadata.namespace,
            .metadata.name,
            (.spec.type // "ClusterIP"),
            (.spec.clusterIP // "-"),
            # Only LoadBalancer services can be pending. The marker is emitted
            # HTML-escaped because a literal <pending> would be parsed as a tag.
            (if .spec.type == "LoadBalancer"
             then ([.status.loadBalancer.ingress[]? | (.ip // .hostname) | select(. != null)]
                   | first // "&lt;pending&gt;")
             else "-" end),
            ([.spec.ports[]? | "\(.port):\(.targetPort)/\(.protocol)"]
                | if length > 0 then join(", ") else "-" end),
            # Services without a selector exist (e.g. default/kubernetes).
            ([(.spec.selector // {}) | to_entries[] | "\(.key)=\(.value)"]
                | if length > 0 then join(",") else "-" end),
            (.spec.sessionAffinity // "None")
        ] | @tsv')
}

# Collect Ingress details across all namespaces.
#
# Globals read:    KUBECTL, JQ
# Globals written: TOTAL_INGRESS,
#                  INGRESS_TABLE (one 7-column <tr> per Ingress)
# When the Ingress resource cannot be listed, TOTAL_INGRESS is 0 and
# INGRESS_TABLE holds a single explanatory row.
collect_ingress() {
    log_info "收集Ingress信息..."
    local INGRESS_JSON
    # One fetch serves as both the availability probe and the data source.
    # It is tested inside the `if` so a failure cannot abort under `set -e`.
    if ! INGRESS_JSON=$(run_kubectl_with_timeout $KUBECTL get ingress --all-namespaces -o json); then
        TOTAL_INGRESS=0
        INGRESS_TABLE="<tr><td colspan='7'>未发现 Ingress 资源或 Ingress 资源不可用</td></tr>"
        return
    fi
    if [ -z "$INGRESS_JSON" ]; then
        log_error "无法获取Ingress信息"
        TOTAL_INGRESS=0
        INGRESS_TABLE="<tr><td colspan='7'>无法获取Ingress信息</td></tr>"
        return
    fi

    TOTAL_INGRESS=$(echo "$INGRESS_JSON" | $JQ '.items | length')
    INGRESS_TABLE=""

    local ns name hosts tls_enabled backend_service paths age_str
    while IFS=$'\t' read -r ns name hosts tls_enabled backend_service paths age_str; do
        hosts=${hosts:-"N/A"}
        tls_enabled=${tls_enabled:-"No"}
        backend_service=${backend_service:-"N/A"}
        paths=${paths:-"N/A"}
        age_str=${age_str:-"N/A"}

        INGRESS_TABLE+="<tr><td>$ns</td><td>$name</td><td>$hosts</td><td>$tls_enabled</td><td>$backend_service</td><td>$paths</td><td>$age_str</td></tr>"
    done < <(echo "$INGRESS_JSON" | $JQ -r '
        # Age of an RFC 3339 timestamp as a short string (42s, 5m, 3h, 12d).
        def age:
            try (
                (now - (sub("\\.[0-9]+Z$"; "Z") | fromdateiso8601)) | floor
                | if . < 60 then "\(.)s"
                  elif . < 3600 then "\(. / 60 | floor)m"
                  elif . < 86400 then "\(. / 3600 | floor)h"
                  else "\(. / 86400 | floor)d" end
            ) catch "N/A";
        # Each column yields exactly one non-empty value so rows never shift.
        # The `?` forms tolerate an Ingress without rules or without http paths.
        .items[] | [
            .metadata.namespace,
            .metadata.name,
            ([.spec.rules[]?.host | select(. != null)] | unique
                | if length > 0 then join(",") else "N/A" end),
            (if (.spec.tls // []) | length > 0 then "Yes" else "No" end),
            # backend.service.name is the networking.k8s.io/v1 field;
            # backend.serviceName is read as a fallback.
            ([.spec.rules[]?.http.paths[]?.backend
                | (.service.name // .serviceName) | select(. != null)] | unique
                | if length > 0 then join(",") else "N/A" end),
            ([.spec.rules[]?.http.paths[]? | .path // "/*"] | unique
                | if length > 0 then join("<br>") else "N/A" end),
            (.metadata.creationTimestamp | age)
        ] | @tsv')
}

# Collect PersistentVolumeClaim status across all namespaces.
#
# Globals read:    KUBECTL, JQ
# Globals written: TOTAL_PVC,
#                  PVC_PENDING (claims in phase Pending),
#                  PVC_TABLE (one 9-column <tr> per claim)
collect_pvcs() {
    log_info "收集PVC信息..."
    local PVC_JSON
    # Tested inside the `if` so a failed fetch reaches the fallback instead of
    # aborting the script under `set -e`.
    if ! PVC_JSON=$(run_kubectl_with_timeout $KUBECTL get pvc --all-namespaces -o json) || [ -z "$PVC_JSON" ]; then
        log_error "无法获取PVC信息"
        TOTAL_PVC=0
        PVC_PENDING=0
        PVC_TABLE="<tr><td colspan='9'>无法获取PVC信息</td></tr>"
        return
    fi

    TOTAL_PVC=$(echo "$PVC_JSON" | $JQ '.items | length')
    PVC_PENDING=0
    PVC_TABLE=""

    local ns name status volume access_modes storage_class requested capacity age_str
    local status_class
    while IFS=$'\t' read -r ns name status volume access_modes storage_class requested capacity age_str; do
        # Anything other than Bound (Pending, Lost, ...) is highlighted;
        # only Pending feeds the counter.
        status_class="status-good"
        if [ "$status" != "Bound" ]; then
            status_class="status-bad"
        fi
        if [ "$status" == "Pending" ]; then
            # Plain assignment: ((PVC_PENDING++)) evaluates to 0 on the first
            # increment, which is exit status 1 and fatal under `set -e`.
            PVC_PENDING=$((PVC_PENDING + 1))
        fi

        volume=${volume:-"-"}
        access_modes=${access_modes:-"N/A"}
        storage_class=${storage_class:-"default"}
        requested=${requested:-"N/A"}
        capacity=${capacity:-"N/A"}
        age_str=${age_str:-"N/A"}

        PVC_TABLE+="<tr><td>$ns</td><td>$name</td><td class=\"$status_class\">$status</td><td>$volume</td><td>$access_modes</td><td>$storage_class</td><td>$requested</td><td>$capacity</td><td>$age_str</td></tr>"
    done < <(echo "$PVC_JSON" | $JQ -r '
        # Age of an RFC 3339 timestamp as a short string (42s, 5m, 3h, 12d).
        def age:
            try (
                (now - (sub("\\.[0-9]+Z$"; "Z") | fromdateiso8601)) | floor
                | if . < 60 then "\(.)s"
                  elif . < 3600 then "\(. / 60 | floor)m"
                  elif . < 86400 then "\(. / 3600 | floor)h"
                  else "\(. / 86400 | floor)d" end
            ) catch "N/A";
        # Replace null or "" with a placeholder: an empty TSV field would be
        # merged with its neighbour because tab is an IFS whitespace character.
        def nz($d): if . == null or . == "" then $d else . end;
        .items[] | [
            .metadata.namespace,
            .metadata.name,
            (.status.phase | nz("Unknown")),
            (.spec.volumeName | nz("-")),
            ([.spec.accessModes[]?] | if length > 0 then join(",") else "N/A" end),
            (.spec.storageClassName | nz("default")),
            (.spec.resources.requests.storage | nz("N/A")),
            (.status.capacity.storage | nz("N/A")),
            (.metadata.creationTimestamp | age)
        ] | @tsv')
}

# Collect Warning events across all namespaces.
#
# Globals read:    KUBECTL, JQ
# Globals written: TOTAL_WARNINGS (all Warning events returned),
#                  EVENTS_TABLE (one 8-column <tr> for each of the 20 most
#                  recent events, newest first)
# The message cell shows at most 100 characters, followed by "..." only when
# the text was actually cut; the full message is placed in the cell's title
# attribute. Both are HTML-escaped.
collect_events() {
    log_info "收集Warning事件..."
    local EVENTS_JSON
    # Tested inside the `if` so a failed fetch reaches the fallback instead of
    # aborting the script under `set -e`.
    if ! EVENTS_JSON=$(run_kubectl_with_timeout $KUBECTL get events --all-namespaces --field-selector type=Warning -o json) || [ -z "$EVENTS_JSON" ]; then
        log_error "无法获取事件信息"
        TOTAL_WARNINGS=0
        EVENTS_TABLE="<tr><td colspan='8'>无法获取事件信息</td></tr>"
        return
    fi

    TOTAL_WARNINGS=$(echo "$EVENTS_JSON" | $JQ '.items | length')
    EVENTS_TABLE=""

    local ns kind name reason message_display tooltip_msg timestamp count first_timestamp
    while IFS=$'\t' read -r ns kind name reason message_display tooltip_msg timestamp count first_timestamp; do
        timestamp=${timestamp:-"N/A"}
        first_timestamp=${first_timestamp:-"N/A"}
        count=${count:-"1"}

        EVENTS_TABLE+="<tr><td>$ns</td><td>$kind</td><td>$name</td><td>$reason</td><td title=\"$tooltip_msg\">$message_display</td><td>$timestamp</td><td>$count</td><td>$first_timestamp</td></tr>"
    done < <(echo "$EVENTS_JSON" | $JQ -r '
        .items
        | sort_by(.lastTimestamp // .eventTime)
        | reverse
        | .[:20][]
        # Flatten line breaks and tabs, and never leave the message empty:
        # an empty TSV field would be merged with its neighbour by `read`.
        | ((.message // "") | gsub("[\r\n\t]+"; " ")
            | if . == "" then "-" else . end) as $msg
        # Truncation and escaping happen here: jq slices by character rather
        # than by byte, and @html escapes <, >, &, and both quote characters.
        | [
            (.metadata.namespace // "-"),
            (.involvedObject.kind // "-"),
            (.involvedObject.name // "-"),
            (.reason // "-"),
            ($msg | if length > 100 then .[0:100] + "..." else . end | @html),
            ($msg | @html),
            (.lastTimestamp // .eventTime // "N/A"),
            (.count // 1 | tostring),
            (.firstTimestamp // .eventTime // "N/A")
        ] | @tsv')
}

# Collect cluster-wide facts shown in the report header.
#
# Globals read:    KUBECTL, JQ
# Globals written: CLUSTER_INFO, K8S_VERSION, CURRENT_CONTEXT, SERVER_ADDR,
#                  NODE_ARCHITECTURE, NODE_OS_IMAGE, CNI_PLUGIN
# Every value falls back to a placeholder when its query fails or returns
# nothing, so none of them is ever left empty.
collect_cluster_info() {
    log_info "收集集群基本信息..."

    # The fallback is applied after the capture: in `cmd | head || echo ...`
    # the `||` only sees the status of the last pipeline stage, so it would
    # never fire when kubectl itself fails.
    CLUSTER_INFO=$(run_kubectl_with_timeout $KUBECTL cluster-info 2>/dev/null | head -n1) || CLUSTER_INFO=""
    CLUSTER_INFO=${CLUSTER_INFO:-"无法获取集群信息"}

    K8S_VERSION=$(run_kubectl_with_timeout $KUBECTL version -o json 2>/dev/null \
        | $JQ -r '.serverVersion.gitVersion // "unknown"' 2>/dev/null) || K8S_VERSION=""
    K8S_VERSION=${K8S_VERSION:-unknown}

    CURRENT_CONTEXT=$(run_kubectl_with_timeout $KUBECTL config current-context 2>/dev/null) || CURRENT_CONTEXT=""
    CURRENT_CONTEXT=${CURRENT_CONTEXT:-unknown}

    # --minify restricts the kubeconfig view to the current context, so the
    # first cluster entry is the one in use. Looking a cluster up by the
    # context name only works when the two names happen to be identical.
    SERVER_ADDR=$(run_kubectl_with_timeout $KUBECTL config view --minify -o jsonpath='{.clusters[0].cluster.server}' 2>/dev/null) || SERVER_ADDR=""
    SERVER_ADDR=${SERVER_ADDR:-unknown}

    # Architecture and OS image are taken from the first node only.
    NODE_ARCHITECTURE=$(run_kubectl_with_timeout $KUBECTL get nodes -o jsonpath='{.items[0].status.nodeInfo.architecture}' 2>/dev/null) || NODE_ARCHITECTURE=""
    NODE_ARCHITECTURE=${NODE_ARCHITECTURE:-unknown}
    NODE_OS_IMAGE=$(run_kubectl_with_timeout $KUBECTL get nodes -o jsonpath='{.items[0].status.nodeInfo.osImage}' 2>/dev/null) || NODE_OS_IMAGE=""
    NODE_OS_IMAGE=${NODE_OS_IMAGE:-unknown}

    # NOTE(review): this reports the k8s-app label of the first labelled pod
    # in kube-system, which is not necessarily the CNI plugin - confirm that
    # this heuristic is good enough for the clusters being inspected.
    CNI_PLUGIN=$(run_kubectl_with_timeout $KUBECTL get pods -n kube-system -l k8s-app -o jsonpath='{.items[0].metadata.labels.k8s-app}' 2>/dev/null) || CNI_PLUGIN=""
    CNI_PLUGIN=${CNI_PLUGIN:-unknown}
}

# ----------------------------- 生成 HTML 报告 ---------------------------------
generate_html() {
    log_info "生成HTML报告: $REPORT_FILE"

    # 计算健康分数 (满分100),引入权重
    local node_weight=0.25
    local pod_weight=0.30
    local deploy_weight=0.25
    local pvc_weight=0.20
    
    local node_score=0
    local pod_score=0
    local deploy_score=0
    local pvc_score=0
    
    if [ $NODES_COUNT -gt 0 ]; then
        node_score=$(( NODES_READY * 100 / NODES_COUNT ))
    fi
    
    if [ $TOTAL_PODS -gt 0 ]; then
        local healthy_pods=$(( TOTAL_PODS - PENDING_PODS - FAILED_PODS - UNKNOWN_PODS ))
        pod_score=$(( healthy_pods * 100 / TOTAL_PODS ))
    fi
    
    if [ $TOTAL_DEPLOY -gt 0 ]; then
        deploy_score=$(( (TOTAL_DEPLOY - UNHEALTHY_DEPLOY) * 100 / TOTAL_DEPLOY ))
    fi
    
    if [ $TOTAL_PVC -gt 0 ]; then
        pvc_score=$(( (TOTAL_PVC - PVC_PENDING) * 100 / TOTAL_PVC ))
    fi
    
    # 加权平均健康分数
    local health_score
    health_score=$(awk "BEGIN {printf \"%.0f\", ($node_score * $node_weight + $pod_score * $pod_weight + $deploy_score * $deploy_weight + $pvc_score * $pvc_weight) * 100}")
    health_score=$((health_score / 100))
    
    # 确定分数颜色
    local score_color="#10b981"
    local score_status="健康"
    if [ $health_score -ge 80 ]; then
        score_color="#10b981"
        score_status="健康"
    elif [ $health_score -ge 60 ]; then
        score_color="#f59e0b"
        score_status="警告"
    else
        score_color="#ef4444"
        score_status="危险"
    fi

    # 定义图表数据
    local chart_data_node="[$node_score, $((100 - node_score))]"
    local chart_data_pod="[$pod_score, $((100 - pod_score))]"
    local chart_data_deploy="[$deploy_score, $((100 - deploy_score))]"
    local chart_data_pvc="[$pvc_score, $((100 - pvc_score))]"
    local chart_data_total="[$health_score, $((100 - health_score))]"

    cat > "$REPORT_FILE" <<EOF
<!DOCTYPE html>
<html lang="zh-CN">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>K8s 集群巡检报告 - ${CURRENT_CONTEXT}</title>
    <!-- Font Awesome & Google Fonts -->
    <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0-beta3/css/all.min.css">
    <link href="https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700;800&display=swap" rel="stylesheet">
    <!-- Chart.js CDN -->
    <script src="https://cdn.jsdelivr.net/npm/chart.js@4.4.0/dist/chart.umd.min.js"></script>
    <style>
        * {
            margin: 0;
            padding: 0;
            box-sizing: border-box;
        }
        body {
            background: linear-gradient(135deg, #0f172a 0%, #1e293b 100%);
            font-family: 'Inter', sans-serif;
            color: #e2e8f0;
            padding: 2rem;
            line-height: 1.6;
        }
        .container {
            max-width: 1600px;
            margin: 0 auto;
        }
        /* 头部区域 */
        .header {
            text-align: center;
            margin-bottom: 3rem;
            animation: fadeInDown 0.8s ease;
        }
        .header h1 {
            font-size: 2.5rem;
            font-weight: 800;
            background: linear-gradient(135deg, #60a5fa, #a78bfa);
            -webkit-background-clip: text;
            background-clip: text;
            color: transparent;
            margin-bottom: 0.5rem;
        }
        .header .sub {
            color: #94a3b8;
            font-size: 0.95rem;
        }
        .badge {
            display: inline-block;
            background: #1e293b;
            border-radius: 40px;
            padding: 0.25rem 1rem;
            font-size: 0.8rem;
            margin-top: 0.5rem;
            margin-right: 0.5rem;
        }
        /* 卡片网格 */
        .grid {
            display: grid;
            grid-template-columns: repeat(auto-fit, minmax(280px, 1fr));
            gap: 1.5rem;
            margin-bottom: 2rem;
        }
        .card {
            background: rgba(30, 41, 59, 0.7);
            backdrop-filter: blur(10px);
            border-radius: 24px;
            padding: 1.5rem;
            border: 1px solid rgba(255,255,255,0.1);
            transition: transform 0.2s, box-shadow 0.2s;
            box-shadow: 0 8px 20px rgba(0,0,0,0.2);
            position: relative;
            overflow: hidden;
        }
        .card::before {
            content: '';
            position: absolute;
            top: 0;
            left: 0;
            right: 0;
            height: 3px;
            background: linear-gradient(90deg, #60a5fa, #a78bfa);
        }
        .card:hover {
            transform: translateY(-4px);
            box-shadow: 0 20px 30px -12px rgba(0,0,0,0.4);
            border-color: rgba(96,165,250,0.4);
        }
        .card-title {
            font-size: 1.1rem;
            font-weight: 600;
            text-transform: uppercase;
            letter-spacing: 1px;
            color: #94a3b8;
            margin-bottom: 1rem;
            display: flex;
            align-items: center;
            gap: 0.5rem;
        }
        .card-value {
            font-size: 2.5rem;
            font-weight: 800;
            margin-bottom: 0.5rem;
        }
        .card-desc {
            font-size: 0.8rem;
            color: #cbd5e1;
            white-space: nowrap;
            overflow: hidden;
            text-overflow: ellipsis;
        }
        .status-good {
            color: #10b981;
            font-weight: 600;
        }
        .status-bad {
            color: #ef4444;
            font-weight: 600;
        }
        .status-warning {
            color: #f59e0b;
        }
        /* 健康仪表盘圆形 */
        .score-container {
            display: flex;
            justify-content: center;
            align-items: center;
            flex-direction: column;
        }
        canvas.healthChart {
            width: 120px !important;
            height: 120px !important;
            margin-bottom: 1rem;
        }
        .score-text {
            font-size: 1.2rem;
            font-weight: 700;
        }
        /* 表格样式 */
        .table-wrapper {
            overflow-x: auto;
            border-radius: 20px;
            background: rgba(15, 23, 42, 0.6);
            padding: 0.5rem;
            margin-bottom: 2rem;
            box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
        }
        table {
            width: 100%;
            border-collapse: collapse;
            font-size: 0.85rem;
        }
        th {
            text-align: left;
            padding: 0.8rem 0.8rem;
            background: #0f172a;
            color: #94a3b8;
            font-weight: 600;
            border-bottom: 1px solid #334155;
            position: sticky;
            top: 0;
            z-index: 10;
        }
        td {
            padding: 0.7rem 0.8rem;
            border-bottom: 1px solid #1e293b;
        }
        tr:hover {
            background: rgba(51, 65, 85, 0.4);
        }
        .section-title {
            font-size: 1.5rem;
            font-weight: 700;
            margin: 2rem 0 1rem 0;
            display: flex;
            align-items: center;
            gap: 0.8rem;
            color: #e2e8f0;
        }
        .footer {
            text-align: center;
            margin-top: 3rem;
            padding: 1rem;
            font-size: 0.75rem;
            color: #475569;
            border-top: 1px solid #334155;
            padding-top: 1rem;
        }
        @keyframes fadeInDown {
            from {
                opacity: 0;
                transform: translateY(-20px);
            }
            to {
                opacity: 1;
                transform: translateY(0);
            }
        }
        i.icon {
            width: 28px;
            color: #60a5fa;
        }
        .cluster-info {
            background: rgba(0,0,0,0.3);
            border-radius: 16px;
            padding: 1rem;
            margin-bottom: 2rem;
            font-size: 0.9rem;
            display: flex;
            flex-wrap: wrap;
            justify-content: space-between;
            gap: 1rem;
        }
        .chart-container {
            display: flex;
            justify-content: space-around;
            flex-wrap: wrap;
            gap: 1rem;
            margin: 2rem 0;
        }
        .chart-item {
            text-align: center;
            flex: 1;
            min-width: 150px;
        }
        .chart-title {
            font-size: 0.9rem;
            color: #94a3b8;
            margin-bottom: 0.5rem;
        }
        .collapsible-table {
            cursor: pointer;
        }
        .collapsible-table .table-header {
            background: #1e293b;
            padding: 0.5rem;
            border-radius: 10px;
            margin-bottom: 0.5rem;
            display: flex;
            justify-content: space-between;
            align-items: center;
        }
        .collapsible-table .table-content {
            max-height: 400px;
            overflow-y: auto;
        }
        .toggle-icon {
            transition: transform 0.3s;
        }
        .collapsed .toggle-icon {
            transform: rotate(-90deg);
        }
        .collapsed .table-content {
            display: none;
        }
        @media (max-width: 768px) {
            body {
                padding: 1rem;
            }
            .cluster-info {
                flex-direction: column;
                align-items: flex-start;
            }
            .grid {
                grid-template-columns: 1fr;
            }
            .chart-container {
                flex-direction: column;
            }
        }
    </style>
</head>
<body>
<div class="container">
    <div class="header">
        <h1><i class="fas fa-kubernetes"></i> K8s 集群巡检报告</h1>
        <div class="sub">全面健康检查 & 异常诊断 | 生成时间: $(date '+%Y-%m-%d %H:%M:%S')</div>
        <div>
            <div class="badge"><i class="fas fa-check-circle"></i> 集群: ${CURRENT_CONTEXT}</div>
            <div class="badge"><i class="fas fa-server"></i> 版本: ${K8S_VERSION}</div>
            <div class="badge"><i class="fas fa-microchip"></i> 架构: ${NODE_ARCHITECTURE}</div>
            <div class="badge"><i class="fas fa-desktop"></i> 系统: ${NODE_OS_IMAGE}</div>
        </div>
    </div>

    <div class="cluster-info">
        <span><i class="fas fa-server"></i> <strong>API Server:</strong> ${SERVER_ADDR}</span>
        <span><i class="fab fa-kubernetes"></i> <strong>版本:</strong> ${K8S_VERSION}</span>
        <span><i class="fas fa-cogs"></i> <strong>CNI:</strong> ${CNI_PLUGIN}</span>
        <span><i class="fas fa-chart-line"></i> <strong>巡检范围:</strong> 全集群资源</span>
    </div>

    <!-- 健康分数卡片及统计 -->
    <div class="grid">
        <div class="card">
            <div class="card-title"><i class="fas fa-heartbeat icon"></i> 集群健康指数</div>
            <div class="score-container">
                <canvas id="healthChart" class="healthChart" width="120" height="120"></canvas>
                <div class="score-text" style="color: ${score_color};">${health_score}分 - ${score_status}</div>
            </div>
        </div>
        <div class="card">
            <div class="card-title"><i class="fas fa-server icon"></i> 节点</div>
            <div class="card-value">${NODES_READY} / ${NODES_COUNT}</div>
            <div class="card-desc">就绪节点 / 总数 &nbsp; <span class="${NODES_NOT_READY>0?'status-bad':'status-good'}">异常节点: ${NODES_NOT_READY}</span></div>
        </div>
        <div class="card">
            <div class="card-title"><i class="fas fa-cubes icon"></i> Pods</div>
            <div class="card-value">${RUNNING_PODS} / ${TOTAL_PODS}</div>
            <div class="card-desc">运行中 | 等待: ${PENDING_PODS} 失败: ${FAILED_PODS} <br> 异常容器: ${ABNORMAL_CONTAINERS} OOM: ${OOMKILLED}</div>
        </div>
        <div class="card">
            <div class="card-title"><i class="fas fa-rocket icon"></i> Deployments</div>
            <div class="card-value">$((TOTAL_DEPLOY - UNHEALTHY_DEPLOY)) / ${TOTAL_DEPLOY}</div>
            <div class="card-desc">健康/总数 &nbsp; 不健康副本: ${UNHEALTHY_DEPLOY}</div>
        </div>
        <div class="card">
            <div class="card-title"><i class="fas fa-database icon"></i> PersistentVolumeClaims</div>
            <div class="card-value">$((TOTAL_PVC - PVC_PENDING)) / ${TOTAL_PVC}</div>
            <div class="card-desc">已绑定 | Pending: ${PVC_PENDING}</div>
        </div>
        <div class="card">
            <div class="card-title"><i class="fas fa-exclamation-triangle icon"></i> Warning事件</div>
            <div class="card-value">${TOTAL_WARNINGS}</div>
            <div class="card-desc">最近Warning级别事件</div>
        </div>
    </div>

    <!-- 图表展示 -->
    <div class="section-title"><i class="fas fa-chart-bar"></i> 健康度分布</div>
    <div class="chart-container">
        <div class="chart-item">
            <div class="chart-title">节点健康度</div>
            <canvas id="nodeChart" class="healthChart" width="100" height="100"></canvas>
            <div>${node_score}%</div>
        </div>
        <div class="chart-item">
            <div class="chart-title">Pod健康度</div>
            <canvas id="podChart" class="healthChart" width="100" height="100"></canvas>
            <div>${pod_score}%</div>
        </div>
        <div class="chart-item">
            <div class="chart-title">Deployment健康度</div>
            <canvas id="deployChart" class="healthChart" width="100" height="100"></canvas>
            <div>${deploy_score}%</div>
        </div>
        <div class="chart-item">
            <div class="chart-title">PVC健康度</div>
            <canvas id="pvcChart" class="healthChart" width="100" height="100"></canvas>
            <div>${pvc_score}%</div>
        </div>
    </div>

    <!-- 节点详情 -->
    <div class="section-title"><i class="fas fa-server"></i> 节点状态详情</div>
    <div class="table-wrapper">
        <table>
            <thead><tr><th>节点名称</th><th>状态 (Ready)</th><th>CPU (Capacity/Allocatable)</th><th>内存 (Capacity/Allocatable)</th><th>Taints</th><th>内部IP</th><th>外部IP</th><th>系统/架构</th><th>内核版本</th></tr></thead>
            <tbody>
                ${NODES_TABLE}
            </tbody>
        </table>
    </div>

    <!-- 异常Pod列表 -->
    <div class="section-title"><i class="fas fa-exclamation-circle"></i> 异常Pod / 容器问题</div>
    <div class="table-wrapper">
        <table>
            <thead><tr><th>命名空间</th><th>Pod名称</th><th>异常原因</th><th>重启次数</th><th>运行时长</th><th>QoS等级</th></tr></thead>
            <tbody>
                ${ABNORMAL_PODS_TABLE:-<tr><td colspan="6" style="text-align:center">✅ 没有发现异常Pod</td></tr>}
            </tbody>
        </table>
    </div>

    <!-- Deployment 状态 -->
    <div class="section-title"><i class="fas fa-chart-simple"></i> Deployment 副本健康度</div>
    <div class="table-wrapper">
        <table>
            <thead><tr><th>命名空间</th><th>名称</th><th>期望副本</th><th>可用副本</th><th>就绪副本</th><th>状态</th></tr></thead>
            <tbody>
                ${DEPLOY_TABLE}
            </tbody>
        </table>
    </div>

    <!-- Service 列表 -->
    <div class="section-title"><i class="fas fa-link"></i> 服务 (Service)</div>
    <div class="table-wrapper">
        <table>
            <thead><tr><th>命名空间</th><th>服务名</th><th>类型</th><th>ClusterIP</th><th>ExternalIP</th><th>端口</th><th>Selector</th><th>会话亲和性</th></tr></thead>
            <tbody>
                ${SVC_TABLE}
            </tbody>
        </table>
        <div style="margin-top: 0.5rem; font-size:0.8rem; color:#f59e0b;">⚠️ LoadBalancer Pending数量: ${LB_PENDING}</div>
    </div>

    <!-- Ingress 列表 -->
    <div class="section-title"><i class="fas fa-globe"></i> Ingress 路由</div>
    <div class="table-wrapper">
        <table>
            <thead><tr><th>命名空间</th><th>Ingress名称</th><th>Hosts</th><th>TLS</th><th>后端服务</th><th>路径</th><th>运行时长</th></tr></thead>
            <tbody>
                ${INGRESS_TABLE}
            </tbody>
        </table>
    </div>

    <!-- PVC 列表 -->
    <div class="section-title"><i class="fas fa-hdd"></i> 持久卷声明 (PVC)</div>
    <div class="table-wrapper">
        <table>
            <thead><tr><th>命名空间</th><th>PVC名称</th><th>状态</th><th>绑定PV</th><th>访问模式</th><th>存储类</th><th>请求容量</th><th>实际容量</th><th>运行时长</th></tr></thead>
            <tbody>
                ${PVC_TABLE}
            </tbody>
        </table>
    </div>

    <!-- 最新Warning事件 -->
    <div class="section-title"><i class="fas fa-bell"></i> 近期Warning事件 (最多20条)</div>
    <div class="table-wrapper">
        <table>
            <thead><tr><th>命名空间</th><th>资源类型</th><th>资源名</th><th>原因</th><th>消息摘要</th><th>最后发生时间</th><th>发生次数</th><th>首次发生时间</th></tr></thead>
            <tbody>
                ${EVENTS_TABLE:-<tr><td colspan="8">无Warning事件,集群很干净~</td></tr>}
            </tbody>
        </table>
    </div>

    <div class="footer">
        <i class="fas fa-shield-alt"></i> 巡检脚本 v2.0 | 数据基于 kubectl 实时采集 | 报告生成: $(date)
    </div>
</div>

<script>
    // 通用图表创建函数
    function createDoughnutChart(ctxId, data, color) {
        const ctx = document.getElementById(ctxId).getContext('2d');
        return new Chart(ctx, {
            type: 'doughnut',
            data: {
                datasets: [{
                    data: data,
                    backgroundColor: [color, '#334155'],
                    borderWidth: 0,
                    cutout: '70%',
                    borderRadius: 10,
                    spacing: 5
                }]
            },
            options: {
                responsive: true,
                maintainAspectRatio: true,
                plugins: { 
                    tooltip: { 
                        enabled: true,
                        callbacks: {
                            label: function(context) {
                                const total = context.dataset.data.reduce((a, b) => a + b, 0);
                                const percentage = Math.round((context.raw / total) * 100);
                                return context.index === 0 ? '健康: ' + percentage + '%' : '异常: ' + percentage + '%';
                            }
                        }
                    }, 
                    legend: { display: false } 
                }
            }
        });
    }

    // 创建各种图表
    createDoughnutChart('healthChart', $chart_data_total, '${score_color}');
    createDoughnutChart('nodeChart', $chart_data_node, '${score_color}');
    createDoughnutChart('podChart', $chart_data_pod, '${score_color}');
    createDoughnutChart('deployChart', $chart_data_deploy, '${score_color}');
    createDoughnutChart('pvcChart', $chart_data_pvc, '${score_color}');

    // 表格折叠功能
    document.querySelectorAll('.collapsible-table .table-header').forEach(header => {
        header.addEventListener('click', function() {
            const tableContainer = this.parentElement;
            tableContainer.classList.toggle('collapsed');
        });
    });
</script>
</body>
</html>
EOF

    log_info "HTML报告已生成: $(pwd)/$REPORT_FILE"
}

# ----------------------------- Main function --------------------------------
#######################################
# Entry point: runs the dependency check, every collect_* stage and the HTML
# report generation in order, then logs the elapsed time and a short summary.
# Globals (read only here):
#   REPORT_FILE, NODES_COUNT, NODES_READY, NODES_NOT_READY, TOTAL_PODS,
#   RUNNING_PODS, TOTAL_DEPLOY, UNHEALTHY_DEPLOY, TOTAL_PVC, PVC_PENDING,
#   TOTAL_WARNINGS, health_score, score_status
#   NOTE(review): these are presumably populated by the collect_* functions
#   and generate_html, which are defined elsewhere in the file -- confirm.
# Arguments:
#   None are inspected (the caller forwards "$@", but nothing reads it).
# Outputs:
#   Progress and summary lines through log_info, plus blank separator lines
#   written to stdout with echo.
#######################################
main() {
    # Keep the timing bookkeeping out of the global namespace. Functions
    # called from main can still see these names because Bash scopes locals
    # dynamically, so nothing downstream loses access to start_time.
    local start_time end_time duration

    log_info "开始执行K8s集群巡检脚本..."
    start_time=$(date +%s)

    # Collection stages run in a fixed order; the report is rendered last so
    # that every stage has had a chance to run before generate_html is called.
    check_dependencies
    collect_cluster_info
    collect_nodes
    collect_pods
    collect_deployments
    collect_services
    collect_ingress
    collect_pvcs
    collect_events
    generate_html

    end_time=$(date +%s)
    duration=$((end_time - start_time))

    echo ""
    log_info "巡检完成!耗时 ${duration} 秒"
    log_info "报告已保存至: $REPORT_FILE"
    log_info "您可以使用浏览器打开该文件查看炫酷的仪表板报告。"

    # Short textual summary of the collected counters.
    echo ""
    log_info "=== 巡检摘要 ==="
    log_info "总节点数: $NODES_COUNT (就绪: $NODES_READY, 异常: $NODES_NOT_READY)"
    # "Abnormal" pods here are simply total minus running.
    log_info "总Pod数: $TOTAL_PODS (运行中: $RUNNING_PODS, 异常: $((TOTAL_PODS - RUNNING_PODS)))"
    log_info "总Deployment数: $TOTAL_DEPLOY (健康: $((TOTAL_DEPLOY - UNHEALTHY_DEPLOY)), 异常: $UNHEALTHY_DEPLOY)"
    log_info "总PVC数: $TOTAL_PVC (已绑定: $((TOTAL_PVC - PVC_PENDING)), Pending: $PVC_PENDING)"
    log_info "Warning事件数: $TOTAL_WARNINGS"
    log_info "集群健康评分: ${health_score}/100 (${score_status})"
}

# Run the main function, forwarding all command-line arguments unchanged.
main "$@"
相关推荐
Net_Walke2 小时前
【Ubuntu】共享文件夹 /mnt/hgfs 下不显示问题解决
linux·运维·ubuntu
CHANG_THE_WORLD2 小时前
PDFIUM如何处理宽度数组
java·linux·服务器
孙同学_2 小时前
【Linux篇】应用层自定义协议与序列化
linux·服务器·网络
航Hang*2 小时前
第3章:Linux系统安全管理——第1节:Linux 防火墙部署(firewalld)
linux·服务器·网络·学习·系统安全·vmware
桦02 小时前
【Linux复习】:多线程
linux·运维
南境十里·墨染春水2 小时前
linux学习进展 基础命令 vi基础命令
linux·运维·服务器·笔记·学习
Deitymoon2 小时前
linux——读写锁
linux
迷路爸爸1802 小时前
Docker 入门学习笔记 05:卷到底是什么,为什么容器删了数据却还能保留
笔记·学习·docker
赵民勇2 小时前
locales包详解
linux