#!/bin/bash
# ============================================================================
# Kubernetes cluster inspection script
# Purpose:  checks the state of nodes, Pods, Deployments, Services, Ingresses,
#           PVCs and Warning events.
# Output:   a styled HTML inspection report.
# Requires: kubectl, jq
#
# NOTE: the shebang must be the very first line of the file; the stray
# "copy code" residue that a web paste had left above it has been removed.
# ============================================================================
set -e

# ----------------------------- Configuration --------------------------------
REPORT_FILE="k8s_inspection_report_$(date +%Y%m%d_%H%M%S).html"
KUBECTL="kubectl"
JQ="jq"
NAMESPACES=""     # empty = all namespaces; may be set to e.g. "default,kube-system"
TIMEOUT=10        # timeout for each kubectl command, in seconds
LOG_LEVEL="INFO"  # log level: INFO/WARN/ERROR

# ANSI colours (used only for the script's own log output)
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[0;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color
# ----------------------------- Functions ------------------------------------
# Leveled logger.
# Globals:   RED, GREEN, YELLOW, BLUE, NC (read) - colour escapes
#            LOG_LEVEL (read) - DEBUG lines are printed only when it is "DEBUG"
# Arguments: $1 - level (ERROR, WARN, INFO or DEBUG); remaining args - message
# Outputs:   ERROR lines go to stderr, every other line to stdout; a level
#            that is not recognised is tagged [UNKNOWN].
log() {
  local severity=$1
  shift
  local text="$*"
  local stamp=$(date '+%Y-%m-%d %H:%M:%S')

  if [[ "$severity" == "ERROR" ]]; then
    echo -e "${RED}[ERROR] [$stamp]${NC} $text" >&2
  elif [[ "$severity" == "WARN" ]]; then
    echo -e "${YELLOW}[WARN] [$stamp]${NC} $text"
  elif [[ "$severity" == "INFO" ]]; then
    echo -e "${GREEN}[INFO] [$stamp]${NC} $text"
  elif [[ "$severity" == "DEBUG" ]]; then
    # Debug output is opt-in.
    if [[ "$LOG_LEVEL" == "DEBUG" ]]; then
      echo -e "${BLUE}[DEBUG] [$stamp]${NC} $text"
    fi
  else
    echo -e "[UNKNOWN] [$stamp] $text"
  fi
}
# Convenience wrappers: each forwards all of its arguments to log() with a
# fixed level.
log_info() {
  log "INFO" "$@"
}

log_warn() {
  log "WARN" "$@"
}

log_error() {
  log "ERROR" "$@"
}
# Runs a command (normally kubectl) under `timeout`.
# Globals:   TIMEOUT (read) - time limit in seconds
# Arguments: the command followed by its arguments
# Outputs:   the command's stdout; its stderr is discarded
# Returns:   0 on success, 1 if the command failed or timed out (an error is
#            logged in that case)
run_kubectl_with_timeout() {
  # Execute "$@" directly. Flattening the arguments into one string and
  # re-splitting it unquoted would break any argument containing whitespace
  # and would expose glob characters (e.g. jsonpath "[*]") to pathname
  # expansion.
  if ! timeout "$TIMEOUT" "$@" 2>/dev/null; then
    log_error "kubectl command failed or timed out: $*"
    return 1
  fi
}
# Verifies that the required tools exist and that the cluster is reachable.
# Globals: KUBECTL, JQ (read)
# Exits the script with status 1 (after logging an error) when kubectl or jq
# is not found, when `cluster-info` fails, or when jq cannot parse a trivial
# JSON document.
check_dependencies() {
  log_info "检查依赖..."

  command -v $KUBECTL > /dev/null 2>&1 || {
    log_error "kubectl 未安装或不在PATH中"
    exit 1
  }

  command -v $JQ > /dev/null 2>&1 || {
    log_error "jq 未安装或不在PATH中,请先安装 jq"
    exit 1
  }

  log_info "验证集群连通性..."
  run_kubectl_with_timeout $KUBECTL cluster-info > /dev/null 2>&1 || {
    log_error "无法连接到Kubernetes集群,请检查kubeconfig配置"
    exit 1
  }

  # Sanity check: jq must be able to process JSON.
  echo '{"test": "ok"}' | $JQ '.' > /dev/null 2>&1 || {
    log_error "jq 安装有问题,无法处理 JSON"
    exit 1
  }

  log_info "依赖检查通过,开始巡检集群..."
}
# Prints the namespace selection as a comma-separated list.
# Globals:   NAMESPACES (read/written), KUBECTL (read)
# Outputs:   stdout - NAMESPACES if it is already set; otherwise the names of
#            all namespaces joined by commas; or "--all-namespaces" when the
#            list cannot be retrieved.
# Note: stdout carries the return value, so the warning is sent to stderr;
#       otherwise a caller doing $(get_namespaces) would capture the log line
#       together with the result.
get_namespaces() {
  if [ -z "$NAMESPACES" ]; then
    local listed
    listed=$(run_kubectl_with_timeout $KUBECTL get namespaces -o jsonpath='{.items[*].metadata.name}' 2>/dev/null) || listed=""
    # jsonpath separates the names with spaces; turn them into commas.
    NAMESPACES=${listed// /,}
    if [ -z "$NAMESPACES" ]; then
      log_warn "无法获取命名空间列表,将使用 --all-namespaces 选项" >&2
      NAMESPACES="--all-namespaces"
    fi
  fi
  echo "$NAMESPACES"
}
# Collects node status for the report.
# Globals read:    KUBECTL, JQ
# Globals written: NODES_COUNT, NODES_READY, NODES_NOT_READY,
#                  NODES_TABLE (HTML <tr> rows, 9 columns)
# On failure the counters are zeroed and NODES_TABLE holds a single error row.
collect_nodes() {
  log_info "收集节点信息..."
  local NODES_JSON
  # Test the assignment itself: under `set -e` a failing bare assignment
  # would abort the script before the fallback below could run.
  if ! NODES_JSON=$(run_kubectl_with_timeout $KUBECTL get nodes -o json) || [ -z "$NODES_JSON" ]; then
    log_error "无法获取节点信息"
    NODES_COUNT=0
    NODES_READY=0
    NODES_NOT_READY=0
    NODES_TABLE="<tr><td colspan='9'>无法获取节点信息</td></tr>"
    return 0
  fi

  NODES_COUNT=$($JQ '.items | length' <<<"$NODES_JSON")
  NODES_READY=$($JQ '[.items[] | select(any(.status.conditions[]?; .type == "Ready" and .status == "True"))] | length' <<<"$NODES_JSON")
  NODES_NOT_READY=$((NODES_COUNT - NODES_READY))

  NODES_TABLE=""
  local name status cpu mem allocatable_cpu allocatable_mem taints
  local addr_internal addr_external os arch kernel status_class
  # Every column emitted by jq is non-empty: tab is an IFS *whitespace*
  # character, so `read` collapses consecutive tabs and an empty field would
  # shift all following columns to the left.
  while IFS=$'\t' read -r name status cpu mem allocatable_cpu allocatable_mem taints addr_internal addr_external os arch kernel; do
    status_class="status-bad"
    if [ "$status" == "True" ]; then
      status_class="status-good"
    fi
    NODES_TABLE+="<tr><td>$name</td><td class=\"$status_class\">$status</td><td>$cpu/$allocatable_cpu</td><td>$mem/$allocatable_mem</td><td>$taints</td><td>$addr_internal</td><td>$addr_external</td><td>$os/$arch</td><td>$kernel</td></tr>"
  done < <($JQ -r '
    .items[] | [
      .metadata.name,
      # Collect into an array and take the first match so that a missing
      # condition/address yields a placeholder instead of dropping the column.
      ([.status.conditions[]? | select(.type == "Ready") | .status] | first // "Unknown"),
      (.status.capacity.cpu // "?"),
      (.status.capacity.memory // "?"),
      (.status.allocatable.cpu // "?"),
      (.status.allocatable.memory // "?"),
      # Format each taint once; iterating .spec.taints separately for key,
      # value and effect would produce a cartesian product.
      ([.spec.taints[]? | (.key + "=" + (.value // "") + ":" + (.effect // ""))]
        | join(", ") | if . == "" then "None" else . end),
      ([.status.addresses[]? | select(.type == "InternalIP") | .address] | first // "N/A"),
      ([.status.addresses[]? | select(.type == "ExternalIP") | .address] | first // "N/A"),
      (.status.nodeInfo.osImage // "?"),
      (.status.nodeInfo.architecture // "?"),
      (.status.nodeInfo.kernelVersion // "?")
    ] | @tsv' <<<"$NODES_JSON")
}
# Collects Pod statistics (all namespaces) and a table of abnormal Pods.
# Globals read:    KUBECTL, JQ
# Globals written: TOTAL_PODS, RUNNING_PODS, PENDING_PODS, FAILED_PODS,
#                  UNKNOWN_PODS, ABNORMAL_CONTAINERS, OOMKILLED,
#                  ABNORMAL_PODS_TABLE (HTML <tr> rows, 6 columns; empty when
#                  no abnormal Pod exists)
# A Pod is listed as abnormal when it is not Succeeded and either its phase is
# not Running or its Ready condition is not "True". Completed (Succeeded) Pods
# are skipped so the table agrees with the Pod health score, which does not
# treat them as unhealthy.
collect_pods() {
  log_info "收集Pod信息..."
  local PODS_JSON
  # Test the assignment itself: under `set -e` a failing bare assignment
  # would abort the script before the fallback below could run.
  if ! PODS_JSON=$(run_kubectl_with_timeout $KUBECTL get pods --all-namespaces -o json) || [ -z "$PODS_JSON" ]; then
    log_error "无法获取Pod信息"
    TOTAL_PODS=0
    RUNNING_PODS=0
    PENDING_PODS=0
    FAILED_PODS=0
    UNKNOWN_PODS=0
    ABNORMAL_CONTAINERS=0
    OOMKILLED=0
    ABNORMAL_PODS_TABLE="<tr><td colspan='6'>无法获取Pod信息</td></tr>"
    return 0
  fi

  TOTAL_PODS=$($JQ '.items | length' <<<"$PODS_JSON")
  # Counts per phase
  RUNNING_PODS=$($JQ '[.items[] | select(.status.phase=="Running")] | length' <<<"$PODS_JSON")
  PENDING_PODS=$($JQ '[.items[] | select(.status.phase=="Pending")] | length' <<<"$PODS_JSON")
  FAILED_PODS=$($JQ '[.items[] | select(.status.phase=="Failed")] | length' <<<"$PODS_JSON")
  UNKNOWN_PODS=$($JQ '[.items[] | select(.status.phase=="Unknown")] | length' <<<"$PODS_JSON")
  # Containers waiting in CrashLoopBackOff / ImagePullBackOff / ErrImagePull
  ABNORMAL_CONTAINERS=$($JQ '[.items[].status.containerStatuses[]? | select(.state.waiting != null and (.state.waiting.reason == "CrashLoopBackOff" or .state.waiting.reason == "ImagePullBackOff" or .state.waiting.reason == "ErrImagePull"))] | length' <<<"$PODS_JSON")
  # Containers currently terminated with reason OOMKilled
  OOMKILLED=$($JQ '[.items[].status.containerStatuses[]? | select(.state.terminated != null and .state.terminated.reason == "OOMKilled")] | length' <<<"$PODS_JSON")

  ABNORMAL_PODS_TABLE=""
  local namespace name reason_display restart_count age_str qos_class
  # Every column emitted by jq is non-empty and there is exactly one value per
  # column: tab is an IFS whitespace character, so `read` would collapse an
  # empty field and shift the remaining columns.
  while IFS=$'\t' read -r namespace name reason_display restart_count age_str qos_class; do
    ABNORMAL_PODS_TABLE+="<tr><td>$namespace</td><td>$name</td><td class=\"status-bad\">$reason_display</td><td>$restart_count</td><td>$age_str</td><td>$qos_class</td></tr>"
  done < <($JQ -r '
    # Input: an RFC 3339 timestamp string. Output: age as e.g. "42s", "5m", "3h", "12d".
    def age_of:
      ((now - (sub("\\.[0-9]+Z$"; "Z") | fromdateiso8601)) | floor) as $secs
      | if $secs < 60 then "\($secs)s"
        elif $secs < 3600 then "\($secs / 60 | floor)m"
        elif $secs < 86400 then "\($secs / 3600 | floor)h"
        else "\($secs / 86400 | floor)d" end;
    .items[]
    | ([.status.conditions[]? | select(.type == "Ready") | .status] | first // "Unknown") as $ready
    | select(.status.phase != "Succeeded" and (.status.phase != "Running" or $ready != "True"))
    | [
        .metadata.namespace,
        .metadata.name,
        # First container-level reason if any, otherwise the Pod phase.
        (([.status.containerStatuses[]?
            | (.state.waiting.reason // .state.terminated.reason)
            | select(. != null)] | first)
          // .status.phase // "Unknown"),
        # Total restarts across all containers.
        ([.status.containerStatuses[]?.restartCount] | add // 0),
        (.metadata.creationTimestamp | try age_of catch "N/A"),
        (.status.qosClass // "N/A")
      ] | @tsv' <<<"$PODS_JSON" | sort -t $'\t' -k1,1 -k2,2)
}
# Collects Deployment replica health (all namespaces).
# Globals read:    KUBECTL, JQ
# Globals written: TOTAL_DEPLOY, UNHEALTHY_DEPLOY,
#                  DEPLOY_TABLE (HTML <tr> rows, 6 columns)
# A Deployment is unhealthy when its desired replica count differs from the
# available or ready count, or when it reports unavailable replicas.
collect_deployments() {
  log_info "收集Deployment信息..."
  local DEPLOY_JSON
  # Test the assignment itself: under `set -e` a failing bare assignment
  # would abort the script before the fallback below could run.
  if ! DEPLOY_JSON=$(run_kubectl_with_timeout $KUBECTL get deployments --all-namespaces -o json) || [ -z "$DEPLOY_JSON" ]; then
    log_error "无法获取Deployment信息"
    TOTAL_DEPLOY=0
    UNHEALTHY_DEPLOY=0
    DEPLOY_TABLE="<tr><td colspan='6'>无法获取Deployment信息</td></tr>"
    return 0
  fi

  TOTAL_DEPLOY=$($JQ '.items | length' <<<"$DEPLOY_JSON")
  UNHEALTHY_DEPLOY=0
  DEPLOY_TABLE=""
  local ns name desired available ready unavailable status status_class
  while IFS=$'\t' read -r ns name desired available ready unavailable; do
    status="健康"
    status_class="status-good"
    if [ "$desired" != "$available" ] || [ "$desired" != "$ready" ] || [ "$unavailable" != "0" ]; then
      status="异常"
      status_class="status-bad"
      # Not ((UNHEALTHY_DEPLOY++)): that expression evaluates to 0 on the
      # first increment, returns status 1, and kills the script under `set -e`.
      UNHEALTHY_DEPLOY=$((UNHEALTHY_DEPLOY + 1))
    fi
    DEPLOY_TABLE+="<tr><td>$ns</td><td>$name</td><td>$desired</td><td>$available</td><td>$ready</td><td class=\"$status_class\">$status</td></tr>"
  done < <($JQ -r '.items[] | [
      .metadata.namespace,
      .metadata.name,
      (.spec.replicas // 0),
      (.status.availableReplicas // 0),
      (.status.readyReplicas // 0),
      (.status.unavailableReplicas // 0)
    ] | @tsv' <<<"$DEPLOY_JSON")
}
# Collects Service information (all namespaces).
# Globals read:    KUBECTL, JQ
# Globals written: TOTAL_SVC, LB_PENDING,
#                  SVC_TABLE (HTML <tr> rows, 8 columns)
# LB_PENDING counts LoadBalancer Services that have no ingress IP/hostname yet.
collect_services() {
  log_info "收集Service信息..."
  local SVC_JSON
  # Test the assignment itself: under `set -e` a failing bare assignment
  # would abort the script before the fallback below could run.
  if ! SVC_JSON=$(run_kubectl_with_timeout $KUBECTL get services --all-namespaces -o json) || [ -z "$SVC_JSON" ]; then
    log_error "无法获取Service信息"
    TOTAL_SVC=0
    LB_PENDING=0
    SVC_TABLE="<tr><td colspan='8'>无法获取Service信息</td></tr>"
    return 0
  fi

  TOTAL_SVC=$($JQ '.items | length' <<<"$SVC_JSON")
  LB_PENDING=0
  SVC_TABLE=""
  # HTML-escaped marker: a literal "<pending>" would be parsed by the browser
  # as an unknown tag and render as an empty cell.
  local -r pending_marker='&lt;pending&gt;'
  local ns name svcType clusterIP externalIP ports selector session_affinity
  # Every column emitted by jq is non-empty: tab is an IFS whitespace
  # character, so `read` would collapse an empty field and shift columns.
  while IFS=$'\t' read -r ns name svcType clusterIP externalIP ports selector session_affinity; do
    SVC_TABLE+="<tr><td>$ns</td><td>$name</td><td>$svcType</td><td>$clusterIP</td><td>$externalIP</td><td>$ports</td><td>$selector</td><td>$session_affinity</td></tr>"
    if [ "$svcType" == "LoadBalancer" ] && [ "$externalIP" == "$pending_marker" ]; then
      # Plain arithmetic assignment; ((LB_PENDING++)) returns status 1 when
      # the counter is 0 and would abort the script under `set -e`.
      LB_PENDING=$((LB_PENDING + 1))
    fi
  done < <($JQ -r '
    def dash: if . == null or . == "" then "-" else . end;
    .items[] | [
      .metadata.namespace,
      .metadata.name,
      (.spec.type // "ClusterIP"),
      (.spec.clusterIP | dash),
      # External address: the first load-balancer ingress IP/hostname; the
      # pending marker only for LoadBalancer services; "-" for other types.
      (((.status.loadBalancer.ingress // []) | .[0] | (.ip // .hostname)) as $addr
        | if $addr != null then $addr
          elif .spec.type == "LoadBalancer" then "&lt;pending&gt;"
          else "-" end),
      # Services may have no ports (ExternalName) or no selector
      # (e.g. default/kubernetes); tolerate both.
      ([.spec.ports[]? | "\(.port):\(.targetPort)/\(.protocol)"] | join(", ") | dash),
      ([(.spec.selector // {}) | to_entries[] | "\(.key)=\(.value)"] | join(",") | dash),
      (.spec.sessionAffinity // "None")
    ] | @tsv' <<<"$SVC_JSON")
}
# Collects Ingress information (all namespaces).
# Globals read:    KUBECTL, JQ
# Globals written: TOTAL_INGRESS,
#                  INGRESS_TABLE (HTML <tr> rows, 7 columns)
# The resource is probed first; if the probe fails the table holds a single
# "not available" row, and if the JSON fetch fails a single error row.
collect_ingress() {
  log_info "收集Ingress信息..."
  local INGRESS_JSON
  if run_kubectl_with_timeout $KUBECTL get ingress --all-namespaces &>/dev/null; then
    # Test the assignment itself: under `set -e` a failing bare assignment
    # would abort the script before the fallback below could run.
    if ! INGRESS_JSON=$(run_kubectl_with_timeout $KUBECTL get ingress --all-namespaces -o json) || [ -z "$INGRESS_JSON" ]; then
      log_error "无法获取Ingress信息"
      TOTAL_INGRESS=0
      INGRESS_TABLE="<tr><td colspan='7'>无法获取Ingress信息</td></tr>"
      return 0
    fi

    TOTAL_INGRESS=$($JQ '.items | length' <<<"$INGRESS_JSON")
    INGRESS_TABLE=""
    local ns name hosts tls_enabled backend_service paths age_str
    # Every column emitted by jq is non-empty: tab is an IFS whitespace
    # character, so `read` would collapse an empty field and shift columns.
    while IFS=$'\t' read -r ns name hosts tls_enabled backend_service paths age_str; do
      INGRESS_TABLE+="<tr><td>$ns</td><td>$name</td><td>$hosts</td><td>$tls_enabled</td><td>$backend_service</td><td>$paths</td><td>$age_str</td></tr>"
    done < <($JQ -r '
      # Input: an RFC 3339 timestamp string. Output: age as e.g. "42s", "5m", "3h", "12d".
      # The timestamp is parsed as-is; replacing the "T" with a space would
      # make fromdateiso8601 reject it.
      def age_of:
        ((now - (sub("\\.[0-9]+Z$"; "Z") | fromdateiso8601)) | floor) as $secs
        | if $secs < 60 then "\($secs)s"
          elif $secs < 3600 then "\($secs / 60 | floor)m"
          elif $secs < 86400 then "\($secs / 3600 | floor)h"
          else "\($secs / 86400 | floor)d" end;
      def na: if . == null or . == "" then "N/A" else . end;
      .items[] | [
        .metadata.namespace,
        .metadata.name,
        # A rule without a host applies to all hosts.
        ([.spec.rules[]? | .host // "*"] | unique | join(",") | na),
        (if ((.spec.tls // []) | length) > 0 then "Yes" else "No" end),
        # networking.k8s.io/v1 uses backend.service.name; the older beta
        # APIs use backend.serviceName.
        ([.spec.rules[]?.http.paths[]?.backend
            | (.service.name // .serviceName)
            | select(. != null)] | unique | join(",") | na),
        ([.spec.rules[]?.http.paths[]? | .path // "/*"] | unique | join("<br>") | na),
        (.metadata.creationTimestamp | try age_of catch "N/A")
      ] | @tsv' <<<"$INGRESS_JSON")
  else
    TOTAL_INGRESS=0
    INGRESS_TABLE="<tr><td colspan='7'>未发现 Ingress 资源或 Ingress 资源不可用</td></tr>"
  fi
}
# Collects PersistentVolumeClaim information (all namespaces).
# Globals read:    KUBECTL, JQ
# Globals written: TOTAL_PVC, PVC_PENDING,
#                  PVC_TABLE (HTML <tr> rows, 9 columns)
# PVC_PENDING counts claims in phase "Pending"; any phase other than "Bound"
# is highlighted as bad in the table.
collect_pvcs() {
  log_info "收集PVC信息..."
  local PVC_JSON
  # Test the assignment itself: under `set -e` a failing bare assignment
  # would abort the script before the fallback below could run.
  if ! PVC_JSON=$(run_kubectl_with_timeout $KUBECTL get pvc --all-namespaces -o json) || [ -z "$PVC_JSON" ]; then
    log_error "无法获取PVC信息"
    TOTAL_PVC=0
    PVC_PENDING=0
    PVC_TABLE="<tr><td colspan='9'>无法获取PVC信息</td></tr>"
    return 0
  fi

  TOTAL_PVC=$($JQ '.items | length' <<<"$PVC_JSON")
  PVC_PENDING=0
  PVC_TABLE=""
  local ns name status volume access_modes storage_class requested capacity age_str status_class
  # Every column emitted by jq is non-empty: tab is an IFS whitespace
  # character, so `read` would collapse an empty field and shift columns.
  while IFS=$'\t' read -r ns name status volume access_modes storage_class requested capacity age_str; do
    status_class="status-good"
    if [ "$status" != "Bound" ]; then
      status_class="status-bad"
    fi
    if [ "$status" == "Pending" ]; then
      # Plain arithmetic assignment; ((PVC_PENDING++)) returns status 1 when
      # the counter is 0 and would abort the script under `set -e`.
      PVC_PENDING=$((PVC_PENDING + 1))
    fi
    PVC_TABLE+="<tr><td>$ns</td><td>$name</td><td class=\"$status_class\">$status</td><td>$volume</td><td>$access_modes</td><td>$storage_class</td><td>$requested</td><td>$capacity</td><td>$age_str</td></tr>"
  done < <($JQ -r '
    # Input: an RFC 3339 timestamp string. Output: age as e.g. "42s", "5m", "3h", "12d".
    # The timestamp is parsed as-is; replacing the "T" with a space would
    # make fromdateiso8601 reject it.
    def age_of:
      ((now - (sub("\\.[0-9]+Z$"; "Z") | fromdateiso8601)) | floor) as $secs
      | if $secs < 60 then "\($secs)s"
        elif $secs < 3600 then "\($secs / 60 | floor)m"
        elif $secs < 86400 then "\($secs / 3600 | floor)h"
        else "\($secs / 86400 | floor)d" end;
    .items[] | [
      .metadata.namespace,
      .metadata.name,
      (.status.phase // "Unknown"),
      (.spec.volumeName // "-"),
      ([.spec.accessModes[]?] | join(",") | if . == "" then "N/A" else . end),
      (.spec.storageClassName // "default"),
      (.spec.resources.requests.storage // "N/A"),
      (.status.capacity.storage // "N/A"),
      (.metadata.creationTimestamp | try age_of catch "N/A")
    ] | @tsv' <<<"$PVC_JSON")
}
# Collects Warning events (all namespaces), newest first, at most 20 rows.
# Globals read:    KUBECTL, JQ
# Globals written: TOTAL_WARNINGS,
#                  EVENTS_TABLE (HTML <tr> rows, 8 columns; empty when there
#                  are no Warning events)
# Event messages are free text, so they are HTML-escaped by jq (@html) before
# being placed in the cell and in the title attribute. Messages longer than
# 100 characters are truncated in the cell (with a trailing "..."); the full
# text is kept in the tooltip.
collect_events() {
  log_info "收集Warning事件..."
  local EVENTS_JSON
  # Test the assignment itself: under `set -e` a failing bare assignment
  # would abort the script before the fallback below could run.
  if ! EVENTS_JSON=$(run_kubectl_with_timeout $KUBECTL get events --all-namespaces --field-selector type=Warning -o json) || [ -z "$EVENTS_JSON" ]; then
    log_error "无法获取事件信息"
    TOTAL_WARNINGS=0
    EVENTS_TABLE="<tr><td colspan='8'>无法获取事件信息</td></tr>"
    return 0
  fi

  TOTAL_WARNINGS=$($JQ '.items | length' <<<"$EVENTS_JSON")
  EVENTS_TABLE=""
  local ns kind name reason message_display tooltip_msg timestamp count first_timestamp
  # Every column emitted by jq is non-empty: tab is an IFS whitespace
  # character, so `read` would collapse an empty field and shift columns.
  while IFS=$'\t' read -r ns kind name reason message_display tooltip_msg timestamp count first_timestamp; do
    EVENTS_TABLE+="<tr><td>$ns</td><td>$kind</td><td>$name</td><td>$reason</td><td title=\"$tooltip_msg\">${message_display}</td><td>$timestamp</td><td>$count</td><td>$first_timestamp</td></tr>"
  done < <($JQ -r '
    def dash: if . == null or . == "" then "-" else . end;
    .items
    | sort_by(.lastTimestamp // .eventTime // "")
    | reverse
    | .[:20]
    | .[]
    | (.message | dash) as $msg
    | [
        (.metadata.namespace | dash),
        (.involvedObject.kind | dash),
        (.involvedObject.name | dash),
        (.reason | dash),
        # Truncate on the raw text first, then escape, so that an HTML
        # entity is never cut in half.
        ((if ($msg | length) > 100 then ($msg | .[0:100]) + "..." else $msg end) | @html),
        ($msg | @html),
        (.lastTimestamp // .eventTime // "N/A"),
        (.count // 1),
        (.firstTimestamp // .eventTime // "N/A")
      ] | @tsv' <<<"$EVENTS_JSON")
}
# Collects general cluster facts shown in the report header.
# Globals read:    KUBECTL, JQ
# Globals written: CLUSTER_INFO, K8S_VERSION, CURRENT_CONTEXT, SERVER_ADDR,
#                  NODE_ARCHITECTURE, NODE_OS_IMAGE, CNI_PLUGIN
# Each value falls back to "unknown" (CLUSTER_INFO: "无法获取集群信息") when
# the underlying command fails or prints nothing. The fallbacks are applied
# explicitly: in `cmd | head -n1 || echo fallback` the pipeline status is
# that of `head`, so the fallback would never be reached.
collect_cluster_info() {
  log_info "收集集群基本信息..."
  local raw

  # First line of `kubectl cluster-info`.
  raw=$(run_kubectl_with_timeout $KUBECTL cluster-info 2>/dev/null) || raw=""
  CLUSTER_INFO=${raw%%$'\n'*}
  CLUSTER_INFO=${CLUSTER_INFO:-无法获取集群信息}

  K8S_VERSION=""
  if raw=$(run_kubectl_with_timeout $KUBECTL version -o json 2>/dev/null); then
    K8S_VERSION=$($JQ -r '.serverVersion.gitVersion // empty' <<<"$raw" 2>/dev/null) || K8S_VERSION=""
  fi
  K8S_VERSION=${K8S_VERSION:-unknown}

  CURRENT_CONTEXT=$(run_kubectl_with_timeout $KUBECTL config current-context 2>/dev/null) || CURRENT_CONTEXT=""
  CURRENT_CONTEXT=${CURRENT_CONTEXT:-unknown}

  # --minify limits the kubeconfig view to the current context, so the first
  # cluster entry is the one in use. Matching the cluster *name* against the
  # context name only works when the two happen to be identical.
  SERVER_ADDR=$(run_kubectl_with_timeout $KUBECTL config view --minify -o jsonpath='{.clusters[0].cluster.server}' 2>/dev/null) || SERVER_ADDR=""
  SERVER_ADDR=${SERVER_ADDR:-unknown}

  # Architecture and OS image are taken from the first node only.
  NODE_ARCHITECTURE=$(run_kubectl_with_timeout $KUBECTL get nodes -o jsonpath='{.items[0].status.nodeInfo.architecture}' 2>/dev/null) || NODE_ARCHITECTURE=""
  NODE_ARCHITECTURE=${NODE_ARCHITECTURE:-unknown}

  NODE_OS_IMAGE=$(run_kubectl_with_timeout $KUBECTL get nodes -o jsonpath='{.items[0].status.nodeInfo.osImage}' 2>/dev/null) || NODE_OS_IMAGE=""
  NODE_OS_IMAGE=${NODE_OS_IMAGE:-unknown}

  # NOTE(review): this reports the k8s-app label of the first labelled pod in
  # kube-system, which is only a guess at the CNI plugin and may well name a
  # different component - confirm or replace with a proper detection.
  CNI_PLUGIN=$(run_kubectl_with_timeout $KUBECTL get pods -n kube-system -l k8s-app -o jsonpath='{.items[0].metadata.labels.k8s-app}' 2>/dev/null) || CNI_PLUGIN=""
  CNI_PLUGIN=${CNI_PLUGIN:-unknown}
}
# ----------------------------- HTML report -----------------------------------
# Renders the collected data into the HTML report.
# Globals read:    REPORT_FILE and the variables filled in by the collect_*
#                  functions (counters, *_TABLE row strings, cluster facts)
# Globals written: health_score (integer 0-100), score_status (健康/警告/危险);
#                  both are deliberately NOT local because main() prints them
#                  in its summary.
# Outputs:         writes $REPORT_FILE
# The health score is a weighted average of four percentages:
# nodes 25%, pods 30%, deployments 25%, PVCs 20%.
generate_html() {
  log_info "生成HTML报告: $REPORT_FILE"

  # Weights of the individual scores
  local node_weight=0.25
  local pod_weight=0.30
  local deploy_weight=0.25
  local pvc_weight=0.20
  local node_score=0
  local pod_score=0
  local deploy_score=0
  local pvc_score=0

  if [ "${NODES_COUNT:-0}" -gt 0 ]; then
    node_score=$(( NODES_READY * 100 / NODES_COUNT ))
  fi
  if [ "${TOTAL_PODS:-0}" -gt 0 ]; then
    local healthy_pods=$(( TOTAL_PODS - PENDING_PODS - FAILED_PODS - UNKNOWN_PODS ))
    pod_score=$(( healthy_pods * 100 / TOTAL_PODS ))
  fi
  if [ "${TOTAL_DEPLOY:-0}" -gt 0 ]; then
    deploy_score=$(( (TOTAL_DEPLOY - UNHEALTHY_DEPLOY) * 100 / TOTAL_DEPLOY ))
  fi
  if [ "${TOTAL_PVC:-0}" -gt 0 ]; then
    pvc_score=$(( (TOTAL_PVC - PVC_PENDING) * 100 / TOTAL_PVC ))
  fi

  # Weighted average. awk does the floating-point part (scaled by 100 and
  # rounded); the integer division brings it back to a 0-100 value.
  health_score=$(awk \
    -v ns="$node_score" -v nw="$node_weight" \
    -v ps="$pod_score" -v pw="$pod_weight" \
    -v ds="$deploy_score" -v dw="$deploy_weight" \
    -v vs="$pvc_score" -v vw="$pvc_weight" \
    'BEGIN { printf "%.0f", (ns * nw + ps * pw + ds * dw + vs * vw) * 100 }')
  health_score=$((health_score / 100))

  # Colour and label for the overall score
  local score_color="#10b981"
  score_status="健康"
  if [ "$health_score" -ge 80 ]; then
    score_color="#10b981"
    score_status="健康"
  elif [ "$health_score" -ge 60 ]; then
    score_color="#f59e0b"
    score_status="警告"
  else
    score_color="#ef4444"
    score_status="危险"
  fi

  # Chart data: [healthy, unhealthy]
  local chart_data_node="[$node_score, $((100 - node_score))]"
  local chart_data_pod="[$pod_score, $((100 - pod_score))]"
  local chart_data_deploy="[$deploy_score, $((100 - deploy_score))]"
  local chart_data_pvc="[$pvc_score, $((100 - pvc_score))]"
  local chart_data_total="[$health_score, $((100 - health_score))]"

  # Values computed here rather than inside the here-document: a
  # JavaScript-style ternary inside \${...} is a bash "bad substitution", and
  # default values containing double quotes are subject to quote handling
  # when expanded inside a here-document.
  local node_status_class="status-good"
  if [ "${NODES_NOT_READY:-0}" -gt 0 ]; then
    node_status_class="status-bad"
  fi
  local abnormal_pods_rows=$ABNORMAL_PODS_TABLE
  if [ -z "$abnormal_pods_rows" ]; then
    abnormal_pods_rows='<tr><td colspan="6" style="text-align:center">✅ 没有发现异常Pod</td></tr>'
  fi
  local events_rows=$EVENTS_TABLE
  if [ -z "$events_rows" ]; then
    events_rows='<tr><td colspan="8">无Warning事件,集群很干净~</td></tr>'
  fi

  cat > "$REPORT_FILE" <<EOF
<!DOCTYPE html>
<html lang="zh-CN">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>K8s 集群巡检报告 - ${CURRENT_CONTEXT}</title>
<!-- Font Awesome & Google Fonts -->
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0-beta3/css/all.min.css">
<link href="https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700;800&display=swap" rel="stylesheet">
<!-- Chart.js CDN -->
<script src="https://cdn.jsdelivr.net/npm/chart.js@4.4.0/dist/chart.umd.min.js"></script>
<style>
* {
margin: 0;
padding: 0;
box-sizing: border-box;
}
body {
background: linear-gradient(135deg, #0f172a 0%, #1e293b 100%);
font-family: 'Inter', sans-serif;
color: #e2e8f0;
padding: 2rem;
line-height: 1.6;
}
.container {
max-width: 1600px;
margin: 0 auto;
}
/* 头部区域 */
.header {
text-align: center;
margin-bottom: 3rem;
animation: fadeInDown 0.8s ease;
}
.header h1 {
font-size: 2.5rem;
font-weight: 800;
background: linear-gradient(135deg, #60a5fa, #a78bfa);
-webkit-background-clip: text;
background-clip: text;
color: transparent;
margin-bottom: 0.5rem;
}
.header .sub {
color: #94a3b8;
font-size: 0.95rem;
}
.badge {
display: inline-block;
background: #1e293b;
border-radius: 40px;
padding: 0.25rem 1rem;
font-size: 0.8rem;
margin-top: 0.5rem;
margin-right: 0.5rem;
}
/* 卡片网格 */
.grid {
display: grid;
grid-template-columns: repeat(auto-fit, minmax(280px, 1fr));
gap: 1.5rem;
margin-bottom: 2rem;
}
.card {
background: rgba(30, 41, 59, 0.7);
backdrop-filter: blur(10px);
border-radius: 24px;
padding: 1.5rem;
border: 1px solid rgba(255,255,255,0.1);
transition: transform 0.2s, box-shadow 0.2s;
box-shadow: 0 8px 20px rgba(0,0,0,0.2);
position: relative;
overflow: hidden;
}
.card::before {
content: '';
position: absolute;
top: 0;
left: 0;
right: 0;
height: 3px;
background: linear-gradient(90deg, #60a5fa, #a78bfa);
}
.card:hover {
transform: translateY(-4px);
box-shadow: 0 20px 30px -12px rgba(0,0,0,0.4);
border-color: rgba(96,165,250,0.4);
}
.card-title {
font-size: 1.1rem;
font-weight: 600;
text-transform: uppercase;
letter-spacing: 1px;
color: #94a3b8;
margin-bottom: 1rem;
display: flex;
align-items: center;
gap: 0.5rem;
}
.card-value {
font-size: 2.5rem;
font-weight: 800;
margin-bottom: 0.5rem;
}
.card-desc {
font-size: 0.8rem;
color: #cbd5e1;
white-space: nowrap;
overflow: hidden;
text-overflow: ellipsis;
}
.status-good {
color: #10b981;
font-weight: 600;
}
.status-bad {
color: #ef4444;
font-weight: 600;
}
.status-warning {
color: #f59e0b;
}
/* 健康仪表盘圆形 */
.score-container {
display: flex;
justify-content: center;
align-items: center;
flex-direction: column;
}
canvas.healthChart {
width: 120px !important;
height: 120px !important;
margin-bottom: 1rem;
}
.score-text {
font-size: 1.2rem;
font-weight: 700;
}
/* 表格样式 */
.table-wrapper {
overflow-x: auto;
border-radius: 20px;
background: rgba(15, 23, 42, 0.6);
padding: 0.5rem;
margin-bottom: 2rem;
box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
}
table {
width: 100%;
border-collapse: collapse;
font-size: 0.85rem;
}
th {
text-align: left;
padding: 0.8rem 0.8rem;
background: #0f172a;
color: #94a3b8;
font-weight: 600;
border-bottom: 1px solid #334155;
position: sticky;
top: 0;
z-index: 10;
}
td {
padding: 0.7rem 0.8rem;
border-bottom: 1px solid #1e293b;
}
tr:hover {
background: rgba(51, 65, 85, 0.4);
}
.section-title {
font-size: 1.5rem;
font-weight: 700;
margin: 2rem 0 1rem 0;
display: flex;
align-items: center;
gap: 0.8rem;
color: #e2e8f0;
}
.footer {
text-align: center;
margin-top: 3rem;
padding: 1rem;
font-size: 0.75rem;
color: #475569;
border-top: 1px solid #334155;
padding-top: 1rem;
}
@keyframes fadeInDown {
from {
opacity: 0;
transform: translateY(-20px);
}
to {
opacity: 1;
transform: translateY(0);
}
}
i.icon {
width: 28px;
color: #60a5fa;
}
.cluster-info {
background: rgba(0,0,0,0.3);
border-radius: 16px;
padding: 1rem;
margin-bottom: 2rem;
font-size: 0.9rem;
display: flex;
flex-wrap: wrap;
justify-content: space-between;
gap: 1rem;
}
.chart-container {
display: flex;
justify-content: space-around;
flex-wrap: wrap;
gap: 1rem;
margin: 2rem 0;
}
.chart-item {
text-align: center;
flex: 1;
min-width: 150px;
}
.chart-title {
font-size: 0.9rem;
color: #94a3b8;
margin-bottom: 0.5rem;
}
.collapsible-table {
cursor: pointer;
}
.collapsible-table .table-header {
background: #1e293b;
padding: 0.5rem;
border-radius: 10px;
margin-bottom: 0.5rem;
display: flex;
justify-content: space-between;
align-items: center;
}
.collapsible-table .table-content {
max-height: 400px;
overflow-y: auto;
}
.toggle-icon {
transition: transform 0.3s;
}
.collapsed .toggle-icon {
transform: rotate(-90deg);
}
.collapsed .table-content {
display: none;
}
@media (max-width: 768px) {
body {
padding: 1rem;
}
.cluster-info {
flex-direction: column;
align-items: flex-start;
}
.grid {
grid-template-columns: 1fr;
}
.chart-container {
flex-direction: column;
}
}
</style>
</head>
<body>
<div class="container">
<div class="header">
<h1><i class="fas fa-kubernetes"></i> K8s 集群巡检报告</h1>
<div class="sub">全面健康检查 & 异常诊断 | 生成时间: $(date '+%Y-%m-%d %H:%M:%S')</div>
<div>
<div class="badge"><i class="fas fa-check-circle"></i> 集群: ${CURRENT_CONTEXT}</div>
<div class="badge"><i class="fas fa-server"></i> 版本: ${K8S_VERSION}</div>
<div class="badge"><i class="fas fa-microchip"></i> 架构: ${NODE_ARCHITECTURE}</div>
<div class="badge"><i class="fas fa-desktop"></i> 系统: ${NODE_OS_IMAGE}</div>
</div>
</div>
<div class="cluster-info">
<span><i class="fas fa-server"></i> <strong>API Server:</strong> ${SERVER_ADDR}</span>
<span><i class="fab fa-kubernetes"></i> <strong>版本:</strong> ${K8S_VERSION}</span>
<span><i class="fas fa-cogs"></i> <strong>CNI:</strong> ${CNI_PLUGIN}</span>
<span><i class="fas fa-chart-line"></i> <strong>巡检范围:</strong> 全集群资源</span>
</div>
<!-- 健康分数卡片及统计 -->
<div class="grid">
<div class="card">
<div class="card-title"><i class="fas fa-heartbeat icon"></i> 集群健康指数</div>
<div class="score-container">
<canvas id="healthChart" class="healthChart" width="120" height="120"></canvas>
<div class="score-text" style="color: ${score_color};">${health_score}分 - ${score_status}</div>
</div>
</div>
<div class="card">
<div class="card-title"><i class="fas fa-server icon"></i> 节点</div>
<div class="card-value">${NODES_READY} / ${NODES_COUNT}</div>
<div class="card-desc">就绪节点 / 总数 <span class="${node_status_class}">异常节点: ${NODES_NOT_READY}</span></div>
</div>
<div class="card">
<div class="card-title"><i class="fas fa-cubes icon"></i> Pods</div>
<div class="card-value">${RUNNING_PODS} / ${TOTAL_PODS}</div>
<div class="card-desc">运行中 | 等待: ${PENDING_PODS} 失败: ${FAILED_PODS} <br> 异常容器: ${ABNORMAL_CONTAINERS} OOM: ${OOMKILLED}</div>
</div>
<div class="card">
<div class="card-title"><i class="fas fa-rocket icon"></i> Deployments</div>
<div class="card-value">$((TOTAL_DEPLOY - UNHEALTHY_DEPLOY)) / ${TOTAL_DEPLOY}</div>
<div class="card-desc">健康/总数 不健康副本: ${UNHEALTHY_DEPLOY}</div>
</div>
<div class="card">
<div class="card-title"><i class="fas fa-database icon"></i> PersistentVolumeClaims</div>
<div class="card-value">$((TOTAL_PVC - PVC_PENDING)) / ${TOTAL_PVC}</div>
<div class="card-desc">已绑定 | Pending: ${PVC_PENDING}</div>
</div>
<div class="card">
<div class="card-title"><i class="fas fa-exclamation-triangle icon"></i> Warning事件</div>
<div class="card-value">${TOTAL_WARNINGS}</div>
<div class="card-desc">最近Warning级别事件</div>
</div>
</div>
<!-- 图表展示 -->
<div class="section-title"><i class="fas fa-chart-bar"></i> 健康度分布</div>
<div class="chart-container">
<div class="chart-item">
<div class="chart-title">节点健康度</div>
<canvas id="nodeChart" class="healthChart" width="100" height="100"></canvas>
<div>${node_score}%</div>
</div>
<div class="chart-item">
<div class="chart-title">Pod健康度</div>
<canvas id="podChart" class="healthChart" width="100" height="100"></canvas>
<div>${pod_score}%</div>
</div>
<div class="chart-item">
<div class="chart-title">Deployment健康度</div>
<canvas id="deployChart" class="healthChart" width="100" height="100"></canvas>
<div>${deploy_score}%</div>
</div>
<div class="chart-item">
<div class="chart-title">PVC健康度</div>
<canvas id="pvcChart" class="healthChart" width="100" height="100"></canvas>
<div>${pvc_score}%</div>
</div>
</div>
<!-- 节点详情 -->
<div class="section-title"><i class="fas fa-server"></i> 节点状态详情</div>
<div class="table-wrapper">
<table>
<thead><tr><th>节点名称</th><th>状态 (Ready)</th><th>CPU (Capacity/Allocatable)</th><th>内存 (Capacity/Allocatable)</th><th>Taints</th><th>内部IP</th><th>外部IP</th><th>系统/架构</th><th>内核版本</th></tr></thead>
<tbody>
${NODES_TABLE}
</tbody>
</table>
</div>
<!-- 异常Pod列表 -->
<div class="section-title"><i class="fas fa-exclamation-circle"></i> 异常Pod / 容器问题</div>
<div class="table-wrapper">
<table>
<thead><tr><th>命名空间</th><th>Pod名称</th><th>异常原因</th><th>重启次数</th><th>运行时长</th><th>QoS等级</th></tr></thead>
<tbody>
${abnormal_pods_rows}
</tbody>
</table>
</div>
<!-- Deployment 状态 -->
<div class="section-title"><i class="fas fa-chart-simple"></i> Deployment 副本健康度</div>
<div class="table-wrapper">
<table>
<thead><tr><th>命名空间</th><th>名称</th><th>期望副本</th><th>可用副本</th><th>就绪副本</th><th>状态</th></tr></thead>
<tbody>
${DEPLOY_TABLE}
</tbody>
</table>
</div>
<!-- Service 列表 -->
<div class="section-title"><i class="fas fa-link"></i> 服务 (Service)</div>
<div class="table-wrapper">
<table>
<thead><tr><th>命名空间</th><th>服务名</th><th>类型</th><th>ClusterIP</th><th>ExternalIP</th><th>端口</th><th>Selector</th><th>会话亲和性</th></tr></thead>
<tbody>
${SVC_TABLE}
</tbody>
</table>
<div style="margin-top: 0.5rem; font-size:0.8rem; color:#f59e0b;">⚠️ LoadBalancer Pending数量: ${LB_PENDING}</div>
</div>
<!-- Ingress 列表 -->
<div class="section-title"><i class="fas fa-globe"></i> Ingress 路由</div>
<div class="table-wrapper">
<table>
<thead><tr><th>命名空间</th><th>Ingress名称</th><th>Hosts</th><th>TLS</th><th>后端服务</th><th>路径</th><th>运行时长</th></tr></thead>
<tbody>
${INGRESS_TABLE}
</tbody>
</table>
</div>
<!-- PVC 列表 -->
<div class="section-title"><i class="fas fa-hdd"></i> 持久卷声明 (PVC)</div>
<div class="table-wrapper">
<table>
<thead><tr><th>命名空间</th><th>PVC名称</th><th>状态</th><th>绑定PV</th><th>访问模式</th><th>存储类</th><th>请求容量</th><th>实际容量</th><th>运行时长</th></tr></thead>
<tbody>
${PVC_TABLE}
</tbody>
</table>
</div>
<!-- 最新Warning事件 -->
<div class="section-title"><i class="fas fa-bell"></i> 近期Warning事件 (最多20条)</div>
<div class="table-wrapper">
<table>
<thead><tr><th>命名空间</th><th>资源类型</th><th>资源名</th><th>原因</th><th>消息摘要</th><th>最后发生时间</th><th>发生次数</th><th>首次发生时间</th></tr></thead>
<tbody>
${events_rows}
</tbody>
</table>
</div>
<div class="footer">
<i class="fas fa-shield-alt"></i> 巡检脚本 v2.0 | 数据基于 kubectl 实时采集 | 报告生成: $(date)
</div>
</div>
<script>
// 通用图表创建函数
function createDoughnutChart(ctxId, data, color) {
const ctx = document.getElementById(ctxId).getContext('2d');
return new Chart(ctx, {
type: 'doughnut',
data: {
datasets: [{
data: data,
backgroundColor: [color, '#334155'],
borderWidth: 0,
cutout: '70%',
borderRadius: 10,
spacing: 5
}]
},
options: {
responsive: true,
maintainAspectRatio: true,
plugins: {
tooltip: {
enabled: true,
callbacks: {
label: function(context) {
const total = context.dataset.data.reduce((a, b) => a + b, 0);
const percentage = Math.round((context.raw / total) * 100);
return context.index === 0 ? '健康: ' + percentage + '%' : '异常: ' + percentage + '%';
}
}
},
legend: { display: false }
}
}
});
}
// 创建各种图表
createDoughnutChart('healthChart', $chart_data_total, '${score_color}');
createDoughnutChart('nodeChart', $chart_data_node, '${score_color}');
createDoughnutChart('podChart', $chart_data_pod, '${score_color}');
createDoughnutChart('deployChart', $chart_data_deploy, '${score_color}');
createDoughnutChart('pvcChart', $chart_data_pvc, '${score_color}');
// 表格折叠功能
document.querySelectorAll('.collapsible-table .table-header').forEach(header => {
header.addEventListener('click', function() {
const tableContainer = this.parentElement;
tableContainer.classList.toggle('collapsed');
});
});
</script>
</body>
</html>
EOF
  log_info "HTML报告已生成: $(pwd)/$REPORT_FILE"
}
# ----------------------------- Main ------------------------------------------
# Entry point: checks the prerequisites, runs every collector in order,
# renders the HTML report, then logs the elapsed time and a short summary.
main() {
  log_info "开始执行K8s集群巡检脚本..."
  start_time=$(date +%s)

  # Pipeline stages, executed strictly in this order.
  local stage
  for stage in \
    check_dependencies \
    collect_cluster_info \
    collect_nodes \
    collect_pods \
    collect_deployments \
    collect_services \
    collect_ingress \
    collect_pvcs \
    collect_events \
    generate_html; do
    "$stage"
  done

  end_time=$(date +%s)
  duration=$((end_time - start_time))

  echo ""
  log_info "巡检完成!耗时 ${duration} 秒"
  log_info "报告已保存至: $REPORT_FILE"
  log_info "您可以使用浏览器打开该文件查看炫酷的仪表板报告。"

  # Short summary
  echo ""
  local -a summary_lines=(
    "=== 巡检摘要 ==="
    "总节点数: $NODES_COUNT (就绪: $NODES_READY, 异常: $NODES_NOT_READY)"
    "总Pod数: $TOTAL_PODS (运行中: $RUNNING_PODS, 异常: $((TOTAL_PODS - RUNNING_PODS)))"
    "总Deployment数: $TOTAL_DEPLOY (健康: $((TOTAL_DEPLOY - UNHEALTHY_DEPLOY)), 异常: $UNHEALTHY_DEPLOY)"
    "总PVC数: $TOTAL_PVC (已绑定: $((TOTAL_PVC - PVC_PENDING)), Pending: $PVC_PENDING)"
    "Warning事件数: $TOTAL_WARNINGS"
    "集群健康评分: ${health_score}/100 (${score_status})"
  )
  local summary_line
  for summary_line in "${summary_lines[@]}"; do
    log_info "$summary_line"
  done
}
# Run the main function, forwarding the script's command-line arguments.
main "$@"