#!/usr/bin/env python3
# -*- coding: utf-8 -*-
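"""
K8s inspection tool.

Purpose: run a full health check of a Kubernetes cluster and generate styled HTML and Word reports.
Dependencies: kubernetes, jinja2, requests, pyyaml, python-docx
Usage: python k8s_inspect.py [--namespace NAMESPACE] [--all-namespaces] [--output-dir DIR] [--format {html,word,both}]
"""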
import os
import sys
import json
import time
import base64
import argparse
import subprocess
from datetime import datetime, timedelta
from collections import defaultdict
from typing import Dict, List, Tuple, Optional, Any
# Third-party import check.
# Each package is probed in its own try block so that a single run reports
# every missing dependency instead of stopping at the first ImportError.
missing_libs = []
try:
    from kubernetes import client, config
    from kubernetes.client.rest import ApiException
except ImportError:
    missing_libs.append("kubernetes")
try:
    import jinja2
except ImportError:
    missing_libs.append("jinja2")
try:
    import requests
except ImportError:
    missing_libs.append("requests")
try:
    import yaml
except ImportError:
    missing_libs.append("pyyaml")
try:
    from docx import Document
    from docx.shared import Inches, Pt, RGBColor
    from docx.enum.text import WD_ALIGN_PARAGRAPH
    from docx.enum.table import WD_TABLE_ALIGNMENT
    from docx.oxml.ns import qn
    from docx.oxml import OxmlElement
except ImportError:
    missing_libs.append("python-docx")

if missing_libs:
    # Diagnostics go to stderr so they are not mixed into captured stdout.
    print(f"缺少依赖库: {', '.join(missing_libs)}", file=sys.stderr)
    print("请执行: pip install kubernetes jinja2 requests pyyaml python-docx", file=sys.stderr)
    sys.exit(1)
# =============================================================
# Global configuration
# =============================================================
REPORT_DIR = os.environ.get("REPORT_DIR", "/tmp/k8s-inspect")
THRESHOLD_CPU = 80 # CPU alert threshold (%)
THRESHOLD_MEM = 80 # memory alert threshold (%)
THRESHOLD_DISK = 85 # disk alert threshold (%)
THRESHOLD_RESTART = 5 # pod restart-count alert threshold
CERT_WARN_DAYS = 30 # warn this many days before a certificate expires
# Alert webhooks (optional); an empty string disables the channel
DINGTALK_WEBHOOK = os.environ.get("DINGTALK_WEBHOOK", "")
WECOM_WEBHOOK = os.environ.get("WECOM_WEBHOOK", "")
# Make sure the report directory exists (runs at import time)
os.makedirs(REPORT_DIR, exist_ok=True)
# =============================================================
# HTML template (Jinja2)
# =============================================================
# Rendered with the keys of K8sInspector.report_data as template variables.
# Every table loop has an {% else %} row so an empty section shows the same
# "no data" text as the Word report instead of a bare header row.
HTML_TEMPLATE = """
<!DOCTYPE html>
<html lang="zh-CN">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width,initial-scale=1">
<title>K8s 巡检报告</title>
<style>
:root{--ok:#00d084;--warn:#f59e0b;--err:#ef4444;--bg:#0f172a;--card:#1e293b;}
body{margin:0;font-family:system-ui;background:var(--bg);color:#e2e8f0;}
.header{background:linear-gradient(135deg,#1e3a5f,#0f172a);padding:32px;text-align:center;border-bottom:1px solid #334155;}
.title{font-size:28px;font-weight:900;color:#38bdf8;margin-bottom:8px;}
.subtitle{color:#94a3b8;font-size:14px;}
.summary{display:grid;grid-template-columns:repeat(4,1fr);gap:16px;padding:24px;max-width:1200px;margin:0 auto;}
.stat{background:var(--card);border-radius:12px;padding:20px;text-align:center;border:1px solid #334155;}
.stat-num{font-size:36px;font-weight:900;margin-bottom:4px;}
.stat-label{font-size:12px;color:#94a3b8;}
.section{max-width:1200px;margin:0 auto 24px;padding:0 24px;}
.section-title{font-size:18px;font-weight:700;color:#38bdf8;padding:12px 0;border-bottom:2px solid #334155;margin-bottom:16px;}
table{width:100%;border-collapse:collapse;background:var(--card);border-radius:10px;overflow:hidden;}
th{background:#1e3a5f;color:#94a3b8;font-size:12px;padding:10px 14px;text-align:left;}
td{padding:10px 14px;font-size:13px;border-bottom:1px solid #334155;}
tr:last-child td{border-bottom:none;}
.ok{color:var(--ok);font-weight:700;}
.warn{color:var(--warn);font-weight:700;}
.err{color:var(--err);font-weight:700;}
.badge{display:inline-block;padding:2px 8px;border-radius:12px;font-size:11px;font-weight:700;}
.badge-ok{background:rgba(0,208,132,0.15);color:#00d084;}
.badge-warn{background:rgba(245,158,11,0.15);color:#f59e0b;}
.badge-err{background:rgba(239,68,68,0.15);color:#ef4444;}
</style>
</head>
<body>
<div class="header">
<div class="title">🔍 K8s 集群巡检报告</div>
<div class="subtitle">生成时间:{{ timestamp }}</div>
</div>
<div class="summary">
<div class="stat"><div class="stat-num" style="color:#38bdf8">{{ node_total }}</div><div class="stat-label">总节点数</div></div>
<div class="stat"><div class="stat-num ok">{{ node_ready }}</div><div class="stat-label">就绪节点</div></div>
<div class="stat"><div class="stat-num" style="color:#a78bfa">{{ pod_total }}</div><div class="stat-label">总 Pod 数</div></div>
<div class="stat"><div class="stat-num" style="color:#fb923c">{{ ns_total }}</div><div class="stat-label">命名空间数</div></div>
</div>
<!-- 节点健康检查 -->
<div class="section">
<div class="section-title">🖥️ 节点健康检查</div>
<table>
<tr><th>节点名称</th><th>状态</th><th>角色</th><th>K8s 版本</th><th>CPU 使用</th><th>内存使用</th><th>运行时间</th><th>内核版本</th></tr>
{% for node in nodes %}
<tr>
<td>{{ node.name }}</td>
<td class="{{ node.status_class }}">{{ node.status }}</td>
<td>{{ node.role }}</td>
<td>{{ node.version }}</td>
<td>{{ node.cpu_usage }}</td>
<td>{{ node.mem_usage }}</td>
<td>{{ node.uptime }}</td>
<td>{{ node.kernel }}</td>
</tr>
{% else %}
<tr><td colspan="8">无节点数据</td></tr>
{% endfor %}
</table>
</div>
<!-- Pod 状态检查 -->
<div class="section">
<div class="section-title">🚀 Pod 状态检查</div>
<div style="display:grid;grid-template-columns:repeat(3,1fr);gap:12px;margin-bottom:16px;">
<div class="stat"><div class="stat-num ok">{{ pod_stats.running }}</div><div class="stat-label">Running</div></div>
<div class="stat"><div class="stat-num err">{{ pod_stats.abnormal }}</div><div class="stat-label">异常 Pod</div></div>
<div class="stat"><div class="stat-num warn">{{ pod_stats.pending }}</div><div class="stat-label">Pending</div></div>
</div>
<table>
<tr><th>命名空间</th><th>Pod 名称</th><th>状态</th><th>重启次数</th><th>运行时间</th><th>所在节点</th><th>镜像</th></tr>
{% for pod in abnormal_pods %}
<tr>
<td>{{ pod.namespace }}</td>
<td>{{ pod.name }}</td>
<td class="{{ pod.status_class }}">{{ pod.status }}</td>
<td>{{ pod.restarts }}</td>
<td>{{ pod.age }}</td>
<td>{{ pod.node }}</td>
<td style="font-size:11px;">{{ pod.image }}</td>
</tr>
{% else %}
<tr><td colspan="7">无异常 Pod</td></tr>
{% endfor %}
</table>
</div>
<!-- 资源配额 & 存储检查 -->
<div class="section">
<div class="section-title">💰 资源配额 &amp; 存储检查</div>
<table>
<tr><th>命名空间</th><th>配额名称</th><th>CPU 请求</th><th>CPU 限制</th><th>内存请求</th><th>内存限制</th><th>Pod 数量</th></tr>
{% for quota in resource_quotas %}
<tr>
<td>{{ quota.namespace }}</td>
<td>{{ quota.name }}</td>
<td>{{ quota.cpu_req }}</td>
<td>{{ quota.cpu_limit }}</td>
<td>{{ quota.mem_req }}</td>
<td>{{ quota.mem_limit }}</td>
<td>{{ quota.pods }}</td>
</tr>
{% else %}
<tr><td colspan="7">无 ResourceQuota 配置</td></tr>
{% endfor %}
</table>
<br>
<table>
<tr><th>命名空间</th><th>PVC 名称</th><th>状态</th><th>容量</th><th>存储类</th><th>访问模式</th></tr>
{% for pvc in pvcs %}
<tr>
<td>{{ pvc.namespace }}</td>
<td>{{ pvc.name }}</td>
<td class="{{ pvc.status_class }}">{{ pvc.status }}</td>
<td>{{ pvc.capacity }}</td>
<td>{{ pvc.storage_class }}</td>
<td>{{ pvc.access_modes }}</td>
</tr>
{% else %}
<tr><td colspan="6">无 PVC 资源</td></tr>
{% endfor %}
</table>
</div>
<!-- 证书安全检查 -->
<div class="section">
<div class="section-title">🔒 证书安全检查</div>
<table>
<tr><th>证书名称</th><th>到期时间</th><th>剩余天数</th><th>状态</th></tr>
{% for cert in certificates %}
<tr>
<td>{{ cert.name }}</td>
<td>{{ cert.expire_date }}</td>
<td class="{{ cert.class }}">{{ cert.days_left }} 天</td>
<td><span class="badge badge-{{ cert.class }}">{{ cert.status_text }}</span></td>
</tr>
{% else %}
<tr><td colspan="4">无证书信息</td></tr>
{% endfor %}
</table>
</div>
<!-- 网络组件检查 -->
<div class="section">
<div class="section-title">🌐 网络组件检查</div>
<table>
<tr><th>组件</th><th>命名空间</th><th>期望副本</th><th>就绪副本</th><th>状态</th></tr>
{% for comp in network_components %}
<tr>
<td>{{ comp.name }}</td>
<td>{{ comp.namespace }}</td>
<td>{{ comp.desired }}</td>
<td class="{{ comp.class }}">{{ comp.ready }}</td>
<td><span class="badge badge-{{ comp.class }}">{{ comp.status_text }}</span></td>
</tr>
{% else %}
<tr><td colspan="5">无网络组件信息</td></tr>
{% endfor %}
</table>
</div>
<!-- 近期异常事件 -->
<div class="section">
<div class="section-title">⚡ 近期异常事件(最近 1 小时)</div>
<table>
<tr><th>时间</th><th>命名空间</th><th>类型</th><th>对象</th><th>原因</th><th>消息</th></tr>
{% for event in warning_events %}
<tr>
<td style="font-size:11px">{{ event.last_timestamp }}</td>
<td>{{ event.namespace }}</td>
<td><span class="badge badge-warn">Warning</span></td>
<td style="font-size:11px">{{ event.object }}</td>
<td class="warn">{{ event.reason }}</td>
<td style="font-size:11px;color:#94a3b8">{{ event.message }}</td>
</tr>
{% else %}
<tr><td colspan="6">无 Warning 事件</td></tr>
{% endfor %}
</table>
</div>
<div style="text-align:center;padding:32px;color:#475569;font-size:13px;">
巡检时间:{{ timestamp }} | 报告由 k8s_inspect.py 自动生成
</div>
</body>
</html>
"""
# =============================================================
# K8s inspector class
# =============================================================
class K8sInspector:
    """Collect cluster health data and render it as HTML and/or Word reports.

    Each ``check_*`` method queries the API server (or a local CLI tool) and
    stores its findings in ``self.report_data``; the ``generate_*`` methods
    render that dictionary. Checks are best-effort: a failed API call is
    logged and the corresponding report section stays empty.
    """

    # Container waiting reasons that mean "broken", not merely "still starting".
    _BAD_WAITING_REASONS = ("CrashLoopBackOff", "ImagePullBackOff", "ErrImagePull")
    # Workloads whose name contains one of these substrings are reported.
    _NETWORK_COMPONENTS = ("coredns", "kube-proxy", "calico-node", "flannel", "cilium")
    # status class -> RGB used in the Word report (same palette as the HTML CSS).
    _DOCX_COLORS = {"ok": (0, 208, 132), "warn": (245, 158, 11), "err": (239, 68, 68)}
    # Maximum number of warning events shown in the report.
    _MAX_EVENTS = 50

    def __init__(self, namespace: Optional[str] = None, all_namespaces: bool = True):
        """Connect to the cluster and initialise an empty report.

        Args:
            namespace: Namespace used by the pod / quota / PVC checks when
                ``all_namespaces`` is false; ``None`` falls back to "default".
            all_namespaces: When true those checks span every namespace and
                ``namespace`` is ignored.
        """
        # Prefer in-cluster credentials; fall back to the local kubeconfig.
        try:
            config.load_incluster_config()
        except config.ConfigException:
            config.load_kube_config()
        self.core_v1 = client.CoreV1Api()
        self.apps_v1 = client.AppsV1Api()
        self.batch_v1 = client.BatchV1Api()
        self.storage_v1 = client.StorageV1Api()
        self.networking_v1 = client.NetworkingV1Api()
        self.rbac_v1 = client.RbacAuthorizationV1Api()
        self.custom_objects = client.CustomObjectsApi()
        self.namespace = namespace
        self.all_namespaces = all_namespaces
        self.report_data = {
            "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            "node_total": 0,
            "node_ready": 0,
            "pod_total": 0,
            "ns_total": 0,
            "nodes": [],
            "pod_stats": {"running": 0, "abnormal": 0, "pending": 0},
            "abnormal_pods": [],
            "resource_quotas": [],
            "pvcs": [],
            "certificates": [],
            "network_components": [],
            "warning_events": [],
        }
        # Fill in the headline counters right away.
        self._collect_basic_stats()

    # ------------------------------------------------------------------
    # Small pure helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _is_node_ready(node) -> bool:
        """Return True when the node has a ``Ready`` condition with status "True"."""
        conditions = node.status.conditions or []
        return any(c.type == "Ready" and c.status == "True" for c in conditions)

    @staticmethod
    def _format_quota(hard: Dict, used: Dict, *keys: str) -> str:
        """Return "used/hard" for the first of ``keys`` that has a hard limit, else "N/A".

        Several keys may be passed because a quota may spell CPU/memory
        requests either as ``requests.cpu`` or as plain ``cpu``.
        """
        for key in keys:
            limit = hard.get(key)
            if limit is not None and limit != "0":
                return f"{used.get(key, '0')}/{limit}"
        return "N/A"

    @staticmethod
    def _classify_expiry(days_left: int, warn_days: Optional[int] = None) -> Tuple[str, str]:
        """Map remaining days to ``(status_class, status_text)``.

        ``warn_days`` defaults to the module-level ``CERT_WARN_DAYS``.
        """
        if warn_days is None:
            warn_days = CERT_WARN_DAYS
        if days_left <= 7:
            return "err", "紧急"
        if days_left <= warn_days:
            return "warn", "警告"
        return "ok", "正常"

    @staticmethod
    def _parse_kubeadm_expiration(stdout: str, now: datetime) -> List[Tuple[str, str, int]]:
        """Parse ``kubeadm certs check-expiration`` output.

        Data rows look like ``admin.conf  Dec 30, 2030 23:36 UTC  364d  ca  no``.
        The remaining days are computed from the printed expiry date instead
        of the RESIDUAL column, whose unit varies (e.g. "364d" vs "9y").
        Header and log lines are skipped because they do not parse as a date.

        Args:
            stdout: Raw command output.
            now: Timezone-aware reference time; the printed expiry is
                interpreted in the same timezone (kubeadm prints UTC).

        Returns:
            A list of ``(name, expire_date, days_left)`` tuples.
        """
        rows = []
        for line in stdout.splitlines():
            parts = line.split()
            if len(parts) < 6:
                continue
            try:
                expires = datetime.strptime(" ".join(parts[1:5]), "%b %d, %Y %H:%M")
            except ValueError:
                continue  # table header, log line or "!MISSING!" entry
            expires = expires.replace(tzinfo=now.tzinfo)
            rows.append((parts[0], " ".join(parts[1:6]), (expires - now).days))
        return rows

    @staticmethod
    def _tls_secret_expiry(pem_b64: str, now: datetime) -> Optional[Tuple[str, int]]:
        """Return ``(enddate_str, days_left)`` for a base64 PEM cert, or None.

        Uses ``openssl x509 -enddate``, which prints the end date in GMT, so
        the parsed value is compared against the timezone-aware ``now``.
        """
        pem = base64.b64decode(pem_b64).decode("utf-8")
        proc = subprocess.run(
            ["openssl", "x509", "-noout", "-enddate"],
            input=pem, capture_output=True, text=True, timeout=10,
        )
        if proc.returncode != 0:
            return None
        # Output format: "notAfter=Jun  1 12:00:00 2025 GMT"
        enddate_str = proc.stdout.strip().partition("=")[2]
        expire_ts = datetime.strptime(enddate_str, "%b %d %H:%M:%S %Y %Z")
        expire_ts = expire_ts.replace(tzinfo=now.tzinfo)
        return enddate_str, (expire_ts - now).days

    def _bad_container_reason(self, pod) -> Optional[str]:
        """Return the first failure reason found in the pod's containers, if any.

        Looks at init and regular containers: a waiting state with one of
        ``_BAD_WAITING_REASONS`` or a terminated state with "OOMKilled"
        (OOMKilled is a termination reason, never a waiting reason).
        """
        statuses = list(pod.status.init_container_statuses or [])
        statuses += list(pod.status.container_statuses or [])
        for cs in statuses:
            state = cs.state
            if state is None:
                continue
            if state.waiting and state.waiting.reason in self._BAD_WAITING_REASONS:
                return state.waiting.reason
            if state.terminated and state.terminated.reason == "OOMKilled":
                return "OOMKilled"
        return None

    # ------------------------------------------------------------------
    # Data collection
    # ------------------------------------------------------------------
    def _collect_basic_stats(self):
        """Collect the node / pod / namespace totals shown in the report summary."""
        try:
            nodes = self.core_v1.list_node()
            self.report_data["node_total"] = len(nodes.items)
            self.report_data["node_ready"] = sum(
                1 for n in nodes.items if self._is_node_ready(n)
            )
        except ApiException as e:
            print(f"[WARN] 获取节点统计失败: {e}")
        try:
            pods = self.core_v1.list_pod_for_all_namespaces()
            self.report_data["pod_total"] = len(pods.items)
        except ApiException as e:
            print(f"[WARN] 获取 Pod 统计失败: {e}")
        try:
            ns = self.core_v1.list_namespace()
            self.report_data["ns_total"] = len(ns.items)
        except ApiException as e:
            print(f"[WARN] 获取命名空间统计失败: {e}")

    def _get_node_metrics(self) -> Dict[str, Dict]:
        """Fetch per-node CPU / memory usage from the ``metrics.k8s.io`` API.

        Returns:
            ``{node_name: {"cpu": <raw quantity>, "memory": <raw quantity>}}``;
            empty when the metrics API is unavailable.
        """
        metrics = {}
        try:
            result = self.custom_objects.list_cluster_custom_object(
                group="metrics.k8s.io", version="v1beta1", plural="nodes"
            )
            for item in result.get("items", []):
                usage = item["usage"]
                metrics[item["metadata"]["name"]] = {
                    "cpu": usage["cpu"],
                    "memory": usage["memory"],
                }
        except Exception as e:
            # Deliberately best-effort: usage columns fall back to "N/A".
            print(f"[WARN] 获取节点 metrics 失败: {e}")
        return metrics

    def check_nodes(self):
        """Node health check; stores one row per node in ``report_data["nodes"]``."""
        print("[INFO] 检查节点健康状态...")
        nodes = []
        node_metrics = self._get_node_metrics()
        try:
            node_list = self.core_v1.list_node()
            for node in node_list.items:
                name = node.metadata.name
                status = "Ready" if self._is_node_ready(node) else "NotReady"
                labels = node.metadata.labels or {}
                is_control_plane = (
                    "node-role.kubernetes.io/control-plane" in labels
                    or "node-role.kubernetes.io/master" in labels
                )
                # "Uptime" is the age of the Node object, not the host's boot time.
                uptime = "N/A"
                created = node.metadata.creation_timestamp
                if created:
                    delta = datetime.now(created.tzinfo) - created
                    uptime = f"{delta.days}d{delta.seconds // 3600}h"
                usage = node_metrics.get(name, {})
                nodes.append({
                    "name": name,
                    "status": status,
                    "status_class": "ok" if status == "Ready" else "err",
                    "role": "control-plane" if is_control_plane else "worker",
                    "version": node.status.node_info.kubelet_version,
                    "cpu_usage": usage.get("cpu", "N/A"),
                    "mem_usage": usage.get("memory", "N/A"),
                    "uptime": uptime,
                    "kernel": node.status.node_info.kernel_version,
                })
        except ApiException as e:
            print(f"[ERROR] 获取节点列表失败: {e}")
        self.report_data["nodes"] = nodes

    def check_pods(self):
        """Pod status check.

        Every pod is classified exactly once:
          * Running with no failing container -> ``running``
          * Succeeded with no failing container -> ignored (completed Job)
          * anything else -> ``abnormal`` (and also ``pending`` when the
            phase is Pending) and listed in ``report_data["abnormal_pods"]``.
        """
        print("[INFO] 检查 Pod 状态...")
        try:
            if self.all_namespaces:
                pods = self.core_v1.list_pod_for_all_namespaces()
            else:
                pods = self.core_v1.list_namespaced_pod(namespace=self.namespace or "default")
        except ApiException as e:
            print(f"[ERROR] 获取 Pod 列表失败: {e}")
            return

        abnormal_pods = []
        running = pending = abnormal = 0
        for pod in pods.items:
            phase = pod.status.phase
            reason = self._bad_container_reason(pod)
            if reason is None and phase == "Running":
                running += 1
                continue
            if reason is None and phase == "Succeeded":
                continue
            abnormal += 1
            if phase == "Pending":
                pending += 1

            if reason:
                display_status, status_class = reason, "err"
            elif phase in ("Failed", "Unknown"):
                display_status, status_class = phase, "err"
            else:
                display_status, status_class = phase or "Unknown", "warn"

            age = "N/A"
            started = pod.status.start_time
            if started:
                delta = datetime.now(started.tzinfo) - started
                if delta.days > 0:
                    age = f"{delta.days}d{delta.seconds // 3600}h"
                else:
                    age = f"{delta.seconds // 60}m"

            containers = pod.spec.containers
            abnormal_pods.append({
                "namespace": pod.metadata.namespace,
                "name": pod.metadata.name,
                "status": display_status,
                "status_class": status_class,
                "restarts": sum(cs.restart_count for cs in (pod.status.container_statuses or [])),
                "age": age,
                "node": pod.spec.node_name or "N/A",
                "image": containers[0].image if containers else "N/A",
            })

        self.report_data["pod_stats"] = {
            "running": running,
            "abnormal": abnormal,
            "pending": pending,
        }
        self.report_data["abnormal_pods"] = abnormal_pods

    def check_resources(self):
        """ResourceQuota and PVC check; the two listings fail independently."""
        print("[INFO] 检查资源配额与存储...")
        quotas = []
        pvcs = []
        try:
            if self.all_namespaces:
                quota_list = self.core_v1.list_resource_quota_for_all_namespaces()
            else:
                quota_list = self.core_v1.list_namespaced_resource_quota(namespace=self.namespace or "default")
            for quota in quota_list.items:
                status = quota.status
                hard = (status.hard if status else None) or {}
                used = (status.used if status else None) or {}
                quotas.append({
                    "namespace": quota.metadata.namespace,
                    "name": quota.metadata.name,
                    "cpu_req": self._format_quota(hard, used, "requests.cpu", "cpu"),
                    "cpu_limit": self._format_quota(hard, used, "limits.cpu"),
                    "mem_req": self._format_quota(hard, used, "requests.memory", "memory"),
                    "mem_limit": self._format_quota(hard, used, "limits.memory"),
                    "pods": self._format_quota(hard, used, "pods"),
                })
        except ApiException as e:
            print(f"[WARN] 获取 ResourceQuota 失败: {e}")
        try:
            if self.all_namespaces:
                pvc_list = self.core_v1.list_persistent_volume_claim_for_all_namespaces()
            else:
                pvc_list = self.core_v1.list_namespaced_persistent_volume_claim(namespace=self.namespace or "default")
            for pvc in pvc_list.items:
                phase = pvc.status.phase
                capacity = pvc.status.capacity or {}
                modes = pvc.spec.access_modes
                pvcs.append({
                    "namespace": pvc.metadata.namespace,
                    "name": pvc.metadata.name,
                    "status": phase,
                    "status_class": "ok" if phase == "Bound" else "err",
                    "capacity": capacity.get("storage", "N/A"),
                    "storage_class": pvc.spec.storage_class_name or "N/A",
                    "access_modes": ", ".join(modes) if modes else "N/A",
                })
        except ApiException as e:
            print(f"[WARN] 获取 PVC 失败: {e}")
        self.report_data["resource_quotas"] = quotas
        self.report_data["pvcs"] = pvcs

    def _cert_entry(self, name: str, expire_date: str, days_left: int) -> Dict:
        """Build one row of ``report_data["certificates"]``."""
        cls, status_text = self._classify_expiry(days_left)
        return {
            "name": name,
            "expire_date": expire_date,
            "days_left": days_left,
            "class": cls,
            "status_text": status_text,
        }

    def check_certificates(self):
        """Certificate expiry check (kubeadm-managed certs and TLS secrets)."""
        # Local import: the module header only imports datetime and timedelta.
        from datetime import timezone

        print("[INFO] 检查证书到期时间...")
        now = datetime.now(timezone.utc)
        certs = []
        # kubeadm certificates (only when the CLI is available on this host).
        try:
            result = subprocess.run(
                ["kubeadm", "certs", "check-expiration"],
                capture_output=True, text=True, timeout=10,
            )
            if result.returncode == 0:
                for name, expire_date, days_left in self._parse_kubeadm_expiration(result.stdout, now):
                    certs.append(self._cert_entry(name, expire_date, days_left))
        except (OSError, subprocess.SubprocessError) as e:
            print(f"[WARN] kubeadm 证书检查失败: {e}")
        # TLS secrets (type kubernetes.io/tls).
        try:
            secrets = self.core_v1.list_secret_for_all_namespaces(field_selector="type=kubernetes.io/tls")
            for secret in secrets.items:
                data = secret.data or {}
                if "tls.crt" not in data:
                    continue
                label = f"{secret.metadata.namespace}/{secret.metadata.name}"
                try:
                    parsed = self._tls_secret_expiry(data["tls.crt"], now)
                except (ValueError, subprocess.SubprocessError) as e:
                    # One undecodable secret must not hide the remaining ones.
                    print(f"[WARN] 解析 TLS 证书失败 {label}: {e}")
                    continue
                if parsed is None:
                    continue
                enddate_str, days_left = parsed
                certs.append(self._cert_entry(f"TLS:{label}", enddate_str, days_left))
        except Exception as e:
            # Best-effort, as in the kubeadm branch (also covers a missing openssl binary).
            print(f"[WARN] Ingress TLS 证书检查失败: {e}")
        self.report_data["certificates"] = certs

    def check_network(self):
        """Network component check: readiness of DNS / proxy / CNI workloads."""
        print("[INFO] 检查网络组件...")
        components = []
        try:
            deployments = self.apps_v1.list_deployment_for_all_namespaces()
            daemon_sets = self.apps_v1.list_daemon_set_for_all_namespaces()
        except ApiException as e:
            print(f"[WARN] 网络组件检查失败: {e}")
            self.report_data["network_components"] = components
            return

        # (name, namespace, desired, ready); Deployments first, then DaemonSets.
        workloads = []
        for dep in deployments.items:
            replicas = dep.spec.replicas
            workloads.append((
                dep.metadata.name,
                dep.metadata.namespace,
                1 if replicas is None else replicas,  # spec.replicas defaults to 1
                dep.status.ready_replicas or 0,
            ))
        for ds in daemon_sets.items:
            workloads.append((
                ds.metadata.name,
                ds.metadata.namespace,
                ds.status.desired_number_scheduled or 0,
                ds.status.number_ready or 0,
            ))

        for name, namespace, desired, ready in workloads:
            lowered = name.lower()
            if not any(target in lowered for target in self._NETWORK_COMPONENTS):
                continue
            healthy = ready >= desired
            components.append({
                "name": name,
                "namespace": namespace,
                "desired": desired,
                "ready": ready,
                "class": "ok" if healthy else "warn",
                "status_text": "正常" if healthy else "降级",
            })
        self.report_data["network_components"] = components

    def check_events(self):
        """Collect Warning events from the last hour, newest first.

        The time filter and sort run client-side over all Warning events, and
        only then is the list capped at ``_MAX_EVENTS``. An event's time is
        ``last_timestamp``, else ``event_time``, else its creation timestamp;
        events with no timestamp at all are kept and sorted last.
        """
        # Local import: the module header only imports datetime and timedelta.
        from datetime import timezone

        print("[INFO] 检查近期异常事件...")
        events_data = []
        try:
            events = self.core_v1.list_event_for_all_namespaces(field_selector="type=Warning")
        except ApiException as e:
            print(f"[WARN] 获取事件失败: {e}")
            self.report_data["warning_events"] = events_data
            return

        cutoff = datetime.now(timezone.utc) - timedelta(hours=1)
        # Aware sentinel so it can be compared with the aware API timestamps.
        oldest = datetime.min.replace(tzinfo=timezone.utc)
        recent = []
        for event in events.items:
            ts = event.last_timestamp or event.event_time or event.metadata.creation_timestamp
            if ts is not None and ts.tzinfo is None:
                ts = ts.replace(tzinfo=timezone.utc)
            if ts is not None and ts < cutoff:
                continue
            recent.append((ts or oldest, ts, event))
        recent.sort(key=lambda item: item[0], reverse=True)

        for _, ts, event in recent[: self._MAX_EVENTS]:
            involved = event.involved_object
            message = event.message or ""
            events_data.append({
                "last_timestamp": ts.strftime("%H:%M:%S") if ts else "N/A",
                "namespace": event.metadata.namespace,
                "object": f"{involved.kind}/{involved.name}" if involved else "N/A",
                "reason": event.reason,
                "message": message[:80] + ("..." if len(message) > 80 else ""),
            })
        self.report_data["warning_events"] = events_data

    # ------------------------------------------------------------------
    # Report rendering
    # ------------------------------------------------------------------
    def generate_html_report(self) -> str:
        """Render ``report_data`` through ``HTML_TEMPLATE``; return the file path."""
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        report_file = os.path.join(REPORT_DIR, f"k8s_report_{timestamp}.html")
        # autoescape: pod names, images and event messages come from the
        # cluster and must not be injected into the page as raw HTML.
        template = jinja2.Environment(autoescape=True).from_string(HTML_TEMPLATE)
        html_content = template.render(**self.report_data)
        with open(report_file, "w", encoding="utf-8") as f:
            f.write(html_content)
        print(f"[INFO] HTML 报告已生成: {report_file}")
        return report_file

    @staticmethod
    def _add_docx_heading(doc, text):
        """Add a level-2 section heading in the report's accent colour."""
        heading = doc.add_heading(text, level=2)
        run = heading.runs[0]
        run.font.color.rgb = RGBColor(56, 189, 248)
        run.font.size = Pt(18)
        return heading

    def _add_docx_table(self, doc, headers, rows, empty_text):
        """Add a grid table, or ``empty_text`` as a paragraph when ``rows`` is empty.

        Args:
            doc: The python-docx document being built.
            headers: Column titles (rendered bold).
            rows: List of rows; each cell is a value or a ``(value, status_class)``
                tuple, where status_class ("ok" / "warn" / "err") colours the text.
            empty_text: Text shown instead of the table when there are no rows.
        """
        if not rows:
            doc.add_paragraph(empty_text)
            return None
        table = doc.add_table(rows=1, cols=len(headers))
        table.style = 'Table Grid'
        for cell, header in zip(table.rows[0].cells, headers):
            cell.text = header
            cell.paragraphs[0].runs[0].font.bold = True
        for row in rows:
            for cell, spec in zip(table.add_row().cells, row):
                value, status_class = spec if isinstance(spec, tuple) else (spec, None)
                # API fields may be None or int; cell text must be a string.
                cell.text = "" if value is None else str(value)
                color = self._DOCX_COLORS.get(status_class)
                if color:
                    cell.paragraphs[0].runs[0].font.color.rgb = RGBColor(*color)
        return table

    def generate_word_report(self) -> str:
        """Render ``report_data`` as a Word (.docx) document; return the file path."""
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        report_file = os.path.join(REPORT_DIR, f"k8s_report_{timestamp}.docx")
        data = self.report_data
        doc = Document()

        # Default font; the eastAsia attribute is what applies it to CJK text.
        style = doc.styles['Normal']
        style.font.name = '微软雅黑'
        style._element.rPr.rFonts.set(qn('w:eastAsia'), '微软雅黑')
        style.font.size = Pt(10)

        # Title and generation time.
        title = doc.add_heading('K8s 集群巡检报告', level=1)
        title.alignment = WD_ALIGN_PARAGRAPH.CENTER
        run = title.runs[0]
        run.font.color.rgb = RGBColor(56, 189, 248)
        run.font.size = Pt(28)
        run.font.bold = True
        subtitle = doc.add_paragraph(f"生成时间:{data['timestamp']}")
        subtitle.alignment = WD_ALIGN_PARAGRAPH.CENTER
        subtitle.runs[0].font.color.rgb = RGBColor(148, 163, 184)
        doc.add_paragraph()

        # Summary table (centred, unlike the section tables).
        summary_table = doc.add_table(rows=1, cols=4)
        summary_table.style = 'Table Grid'
        summary_table.alignment = WD_TABLE_ALIGNMENT.CENTER
        headers = ['总节点数', '就绪节点', '总 Pod 数', '命名空间数']
        values = [data['node_total'], data['node_ready'], data['pod_total'], data['ns_total']]
        for cell, header in zip(summary_table.rows[0].cells, headers):
            cell.text = header
            cell.paragraphs[0].runs[0].font.bold = True
            cell.paragraphs[0].alignment = WD_ALIGN_PARAGRAPH.CENTER
        for cell, val in zip(summary_table.add_row().cells, values):
            cell.text = str(val)
            cell.paragraphs[0].alignment = WD_ALIGN_PARAGRAPH.CENTER
        doc.add_paragraph()

        # 1. Nodes
        self._add_docx_heading(doc, '🖥️ 节点健康检查')
        self._add_docx_table(
            doc,
            ['节点名称', '状态', '角色', 'K8s版本', 'CPU使用', '内存使用', '运行时间', '内核版本'],
            [
                [n['name'], (n['status'], n['status_class']), n['role'], n['version'],
                 n['cpu_usage'], n['mem_usage'], n['uptime'], n['kernel']]
                for n in data['nodes']
            ],
            "无节点数据",
        )
        doc.add_paragraph()

        # 2. Pods
        self._add_docx_heading(doc, '🚀 Pod 状态检查')
        stats = data['pod_stats']
        pod_summary = doc.add_paragraph()
        pod_summary.add_run(f"Running: {stats['running']} ").bold = True
        pod_summary.add_run(f"异常 Pod: {stats['abnormal']} ").font.color.rgb = RGBColor(239, 68, 68)
        pod_summary.add_run(f"Pending: {stats['pending']}").font.color.rgb = RGBColor(245, 158, 11)
        self._add_docx_table(
            doc,
            ['命名空间', 'Pod名称', '状态', '重启次数', '运行时间', '所在节点', '镜像'],
            [
                [p['namespace'], p['name'], (p['status'], p['status_class']), p['restarts'],
                 p['age'], p['node'], p['image']]
                for p in data['abnormal_pods']
            ],
            "无异常 Pod",
        )
        doc.add_paragraph()

        # 3. Resource quotas
        self._add_docx_heading(doc, '💰 资源配额检查')
        self._add_docx_table(
            doc,
            ['命名空间', '配额名称', 'CPU请求', 'CPU限制', '内存请求', '内存限制', 'Pod数量'],
            [
                [q['namespace'], q['name'], q['cpu_req'], q['cpu_limit'],
                 q['mem_req'], q['mem_limit'], q['pods']]
                for q in data['resource_quotas']
            ],
            "无 ResourceQuota 配置",
        )
        doc.add_paragraph()

        # 4. PVCs
        self._add_docx_heading(doc, '💾 PVC 状态检查')
        self._add_docx_table(
            doc,
            ['命名空间', 'PVC名称', '状态', '容量', '存储类', '访问模式'],
            [
                [p['namespace'], p['name'], (p['status'], p['status_class']),
                 p['capacity'], p['storage_class'], p['access_modes']]
                for p in data['pvcs']
            ],
            "无 PVC 资源",
        )
        doc.add_paragraph()

        # 5. Certificates
        self._add_docx_heading(doc, '🔒 证书安全检查')
        self._add_docx_table(
            doc,
            ['证书名称', '到期时间', '剩余天数', '状态'],
            [
                [c['name'], c['expire_date'], (f"{c['days_left']} 天", c['class']),
                 (c['status_text'], c['class'])]
                for c in data['certificates']
            ],
            "无证书信息",
        )
        doc.add_paragraph()

        # 6. Network components
        self._add_docx_heading(doc, '🌐 网络组件检查')
        self._add_docx_table(
            doc,
            ['组件', '命名空间', '期望副本', '就绪副本', '状态'],
            [
                [c['name'], c['namespace'], c['desired'], (c['ready'], c['class']),
                 (c['status_text'], c['class'])]
                for c in data['network_components']
            ],
            "无网络组件信息",
        )
        doc.add_paragraph()

        # 7. Warning events
        self._add_docx_heading(doc, '⚡ 近期异常事件(最近 1 小时)')
        self._add_docx_table(
            doc,
            ['时间', '命名空间', '类型', '对象', '原因', '消息'],
            [
                [e['last_timestamp'], e['namespace'], ('Warning', 'warn'), e['object'],
                 (e['reason'], 'warn'), e['message']]
                for e in data['warning_events']
            ],
            "无 Warning 事件",
        )

        # Footer
        doc.add_paragraph()
        footer = doc.add_paragraph(f"巡检时间:{data['timestamp']} | 报告由 k8s_inspect.py 自动生成")
        footer.alignment = WD_ALIGN_PARAGRAPH.CENTER
        footer.runs[0].font.color.rgb = RGBColor(71, 85, 105)
        footer.runs[0].font.size = Pt(9)
        doc.save(report_file)
        print(f"[INFO] Word 报告已生成: {report_file}")
        return report_file

    # ------------------------------------------------------------------
    # Orchestration
    # ------------------------------------------------------------------
    def run_full_inspection(self, output_format: str = "both"):
        """Run every check, write the requested reports and send alerts.

        Args:
            output_format: "html", "word" or "both".

        Returns:
            ``{"html": path, "word": path}`` containing only the generated formats.
        """
        print("=" * 60)
        print("开始 K8s 全面巡检...")
        print("=" * 60)
        checks = (
            self.check_nodes,
            self.check_pods,
            self.check_resources,
            self.check_certificates,
            self.check_network,
            self.check_events,
        )
        for check in checks:
            try:
                check()
            except Exception as e:
                # One failing check must not prevent the report from being written.
                print(f"[ERROR] {check.__name__} 执行失败: {e}")
        reports = {}
        if output_format in ["html", "both"]:
            reports["html"] = self.generate_html_report()
        if output_format in ["word", "both"]:
            reports["word"] = self.generate_word_report()
        # Alerts are only attempted when at least one webhook is configured.
        if DINGTALK_WEBHOOK or WECOM_WEBHOOK:
            self._send_alerts()
        return reports

    def _send_alerts(self):
        """Send a DingTalk / WeCom markdown alert when abnormal pods were found."""
        abnormal_count = self.report_data["pod_stats"]["abnormal"]
        if abnormal_count == 0:
            return
        # Joined explicitly so source indentation never leaks into the markdown.
        msg = "\n".join([
            "## ⚠️ K8s 巡检告警",
            f"> 发现 **{abnormal_count}** 个异常 Pod,请及时处理!",
            ">",
            f"> 巡检时间:{self.report_data['timestamp']}",
            "> 完整报告:http://your-nginx/reports/latest.html",
        ])
        if DINGTALK_WEBHOOK:
            try:
                resp = requests.post(DINGTALK_WEBHOOK, json={
                    "msgtype": "markdown",
                    "markdown": {"title": "K8s 集群异常告警", "text": msg},
                }, timeout=5)
                resp.raise_for_status()  # surface HTTP 4xx/5xx as a warning
            except Exception as e:
                print(f"[WARN] 钉钉告警发送失败: {e}")
        if WECOM_WEBHOOK:
            try:
                resp = requests.post(WECOM_WEBHOOK, json={
                    "msgtype": "markdown",
                    "markdown": {"content": msg},
                }, timeout=5)
                resp.raise_for_status()
            except Exception as e:
                print(f"[WARN] 企业微信告警发送失败: {e}")
# =============================================================
# Command-line entry point
# =============================================================
def main():
    """Parse the command line, run the inspection and print the report paths.

    ``--namespace`` limits the namespaced checks to one namespace unless
    ``--all-namespaces`` is also given; with neither flag every namespace is
    inspected. ``--output-dir`` overrides the module-level ``REPORT_DIR``.
    """
    global REPORT_DIR

    parser = argparse.ArgumentParser(description="K8s 全面巡检脚本")
    parser.add_argument("--namespace", "-n", help="指定命名空间(默认所有命名空间)")
    # No default=True here: with store_true that made the flag always true,
    # so --namespace could never take effect.
    parser.add_argument("--all-namespaces", "-A", action="store_true",
                        help="检查所有命名空间(默认)")
    parser.add_argument("--output-dir", "-o", help="报告输出目录")
    parser.add_argument("--format", "-f", choices=["html", "word", "both"], default="both",
                        help="输出报告格式 (默认: both)")
    args = parser.parse_args()

    if args.output_dir:
        REPORT_DIR = args.output_dir
        os.makedirs(REPORT_DIR, exist_ok=True)

    inspector = K8sInspector(
        namespace=args.namespace,
        # All namespaces when asked explicitly or when no namespace was named.
        all_namespaces=args.all_namespaces or not args.namespace,
    )
    reports = inspector.run_full_inspection(output_format=args.format)
    print("\n✅ 巡检完成!")
    for fmt, path in reports.items():
        print(f"{fmt.upper()} 报告: {path}")


if __name__ == "__main__":
    main()