我用 Python + AI 搭了一套自动化运维巡检系统:从异常检测到自动修复,半夜再也没被报警吵醒过
读者对象:个人开发者、小团队运维、需要看管多台服务器的技术负责人
解决的问题:服务器出问题靠人盯着看板,半夜被报警吵醒。本文给出一套"自动巡检 + 异常检测 + 智能自愈"的完整方案,把被动救火变成主动防御。
一、痛:半夜被 CPU 报警吵醒
我手上有 3 台 VPS,分别跑着 API 服务、数据库、爬虫任务。
最烦的是凌晨 3 点的报警。
- CPU 突然飙到 100%,一看是爬虫跑了一个死循环
- 磁盘满了,因为某个日志 3 天没轮转
- 内存泄漏,API 服务 OOM 重启了 4 次
每次都是同一个剧本:被报警吵醒 → 打开 SSH → 手动 kill 进程 / 删日志 / 重启服务 → 继续睡(但睡不着了)。
"能不能让系统自己发现问题、解决问题?"
二、方案:五阶段巡检流水线
定时触发(每 5 分钟)
↓
阶段 1:指标采集(CPU/内存/磁盘/进程/日志)
↓
阶段 2:AI 异常检测(用 LLM 判断当前指标是否异常 + 根因分析)
↓
阶段 3:告警分级(P0 立即通知 / P1 延迟通知 / P2 仅记录)
↓
阶段 4:自动修复(kill 僵尸进程 / 清理日志 / 重启服务 / 释放内存)
↓
阶段 5:巡检报告(每次巡检生成 Markdown 报告,可推送企业微信/钉钉)
三、实操:逐阶段实现
阶段 1:指标采集
用 Python 的 psutil 一把梭,跨平台。
python
# metrics_collector.py
import psutil
import json
import time
from datetime import datetime
from pathlib import Path
from typing import Dict, List
class MetricsCollector:
    """Collects a point-in-time snapshot of host metrics through psutil.

    The snapshot covers CPU, memory, disk partitions, network counters, the
    busiest processes and uptime, and is a plain JSON-serializable dict.
    """

    def collect_all(self) -> Dict:
        """Gather every metric group into a single dict.

        Blocks for roughly one second because CPU usage is sampled over a
        1-second interval.
        """
        return {
            "timestamp": datetime.now().isoformat(),
            "hostname": self._get_hostname(),
            "cpu": self._collect_cpu(),
            "memory": self._collect_memory(),
            "disk": self._collect_disk(),
            "network": self._collect_network(),
            "processes": self._get_top_processes(top_n=10),
            "system": self._collect_system(),
        }

    def _collect_cpu(self) -> Dict:
        """Return per-core usage, its average and maximum, core count and load average."""
        cpu_percent = psutil.cpu_percent(interval=1, percpu=True)
        return {
            "percent_per_cpu": cpu_percent,
            "average": round(sum(cpu_percent) / len(cpu_percent), 1),
            "max": max(cpu_percent),
            "cores": psutil.cpu_count(),
            # getloadavg is looked up defensively; load_avg is None when psutil lacks it.
            "load_avg": list(psutil.getloadavg()) if hasattr(psutil, "getloadavg") else None,
        }

    def _collect_memory(self) -> Dict:
        """Return RAM and swap usage, with sizes in GiB rounded to one decimal."""
        mem = psutil.virtual_memory()
        swap = psutil.swap_memory()
        gib = 1024 ** 3
        return {
            "total_gb": round(mem.total / gib, 1),
            "used_gb": round(mem.used / gib, 1),
            "percent": mem.percent,
            "available_gb": round(mem.available / gib, 1),
            "swap_used_gb": round(swap.used / gib, 1),
            "swap_percent": swap.percent,
        }

    def _collect_disk(self) -> Dict:
        """Return usage for every partition whose mount point is readable."""
        gib = 1024 ** 3
        disks = []
        for part in psutil.disk_partitions():
            try:
                usage = psutil.disk_usage(part.mountpoint)
            except PermissionError:
                # Mount points we are not allowed to stat are skipped, not fatal.
                continue
            disks.append({
                "mount": part.mountpoint,
                "total_gb": round(usage.total / gib, 1),
                "used_gb": round(usage.used / gib, 1),
                "percent": usage.percent,
                "free_gb": round(usage.free / gib, 1),
            })
        return {"partitions": disks}

    def _collect_network(self) -> Dict:
        """Return the network I/O counters reported by psutil (bytes in MiB)."""
        net = psutil.net_io_counters()
        mib = 1024 ** 2
        return {
            "bytes_sent_mb": round(net.bytes_sent / mib, 1),
            "bytes_recv_mb": round(net.bytes_recv / mib, 1),
            "packets_sent": net.packets_sent,
            "packets_recv": net.packets_recv,
        }

    def _get_top_processes(self, top_n: int = 10) -> Dict:
        """Return the top ``top_n`` processes by CPU and the top 5 by memory.

        The result is a dict with the keys ``top_cpu`` and ``top_memory``,
        each holding a list of ``{"pid", "name", "cpu"|"mem"}`` entries.
        """
        processes = []
        for proc in psutil.process_iter(["pid", "name", "cpu_percent", "memory_percent"]):
            try:
                processes.append(proc.info)
            except (psutil.NoSuchProcess, psutil.AccessDenied):
                continue
        return self._rank_processes(processes, top_n)

    @staticmethod
    def _rank_processes(processes: List[Dict], top_n: int = 10) -> Dict:
        """Sort process info dicts into the ``top_cpu`` / ``top_memory`` lists.

        psutil fills a field with ``None`` when it cannot read it (for example
        on access denied), so ``None`` is ranked as 0 instead of being compared
        against floats, which would raise ``TypeError``.
        """
        def by(field):
            return lambda info: info.get(field) or 0

        cpu_top = sorted(processes, key=by("cpu_percent"), reverse=True)[:top_n]
        mem_top = sorted(processes, key=by("memory_percent"), reverse=True)[:5]
        return {
            "top_cpu": [
                {"pid": p["pid"], "name": p["name"], "cpu": p["cpu_percent"]}
                for p in cpu_top
            ],
            "top_memory": [
                {"pid": p["pid"], "name": p["name"], "mem": p["memory_percent"]}
                for p in mem_top
            ],
        }

    def _collect_system(self) -> Dict:
        """Return boot time (ISO string) and uptime in hours."""
        boot_time = datetime.fromtimestamp(psutil.boot_time())
        uptime = datetime.now() - boot_time
        return {
            "uptime_hours": round(uptime.total_seconds() / 3600, 1),
            "boot_time": boot_time.isoformat(),
        }

    def _get_hostname(self) -> str:
        """Return the local host name."""
        import socket
        return socket.gethostname()

    def save_snapshot(self, output_dir: str = "./metrics", metrics: Dict = None) -> str:
        """Write a metrics snapshot to ``output_dir`` as a timestamped JSON file.

        Args:
            output_dir: Directory for the snapshot; created (with parents) if missing.
            metrics: An already collected snapshot to persist. When omitted a
                fresh one is collected, which costs another sampling interval.

        Returns:
            The path of the written file as a string.
        """
        target_dir = Path(output_dir)
        # parents=True so a nested output_dir works on the very first run.
        target_dir.mkdir(parents=True, exist_ok=True)
        if metrics is None:
            metrics = self.collect_all()
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filepath = target_dir / f"metrics_{timestamp}.json"
        # The default text encoding is kept on purpose: the history loader in
        # this article reads these files back with a plain open().
        with open(filepath, "w") as f:
            json.dump(metrics, f, indent=2, ensure_ascii=False)
        return str(filepath)
# Usage: take one snapshot and print a short summary of it
collector = MetricsCollector()
snapshot = collector.collect_all()
print(f"主机:{snapshot['hostname']}")
print(f"CPU 平均:{snapshot['cpu']['average']}%")
print(f"内存使用:{snapshot['memory']['percent']}%")
# One line for all partitions: "<mount>=<percent>%" entries separated by spaces.
print(
    "磁盘:",
    *(f"{d['mount']}={d['percent']}%" for d in snapshot["disk"]["partitions"]),
    end=" \n",
)
阶段 2:AI 异常检测
指标采集完了,但"85% CPU 算不算异常"这个问题很难用固定阈值回答。白天 85% 正常,凌晨 85% 就是异常。
用 AI 来做上下文相关的异常判断。
python
# anomaly_detector.py
import openai
import os
import json
from typing import Dict, List
class AnomalyDetector:
    """LLM-backed anomaly detection over collected server metrics."""

    # Prompt template; doubled braces are literal JSON braces for str.format.
    PROMPT = """你是一位资深运维工程师。请分析以下服务器指标,判断是否存在异常。
{metrics_json}
请返回 JSON:
{{
"has_anomaly": true/false,
"anomalies": [
{{
"type": "CPU/Memory/Disk/Process/Network",
"severity": "P0/P1/P2",
"detail": "具体异常描述",
"root_cause": "可能的根因",
"suggested_action": "建议的修复操作"
}}
],
"overall_health": "健康/警告/严重",
"summary": "一句话总结"
}}
"""

    def __init__(self, model: str = "gpt-4o-mini"):
        """Create the OpenAI client; the API key is read from OPENAI_API_KEY."""
        self.model = model
        self.client = openai.OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

    def detect(
        self,
        current_metrics: Dict,
        history_metrics: List[Dict] = None,
        history_window: int = 12
    ) -> Dict:
        """Ask the model whether the current metrics look anomalous.

        Args:
            current_metrics: The snapshot to judge.
            history_metrics: Earlier snapshots, oldest first, used for trend
                comparison. Optional.
            history_window: How many of the most recent history entries to
                forward. The default of 12 equals one hour at a 5-minute
                cadence; pass a larger value to send a longer baseline.

        Returns:
            The parsed JSON verdict from the model. On any failure the result
            is ``{"error": <message>, "has_anomaly": False}`` so callers can
            tell a failed check apart from a healthy one via ``error``.
        """
        data = {"current": current_metrics}
        # A window of 0 would slice as [-0:], i.e. the whole list, so guard it.
        if history_metrics and history_window > 0:
            data[f"history_last_{history_window}"] = history_metrics[-history_window:]
        metrics_json = json.dumps(data, indent=2, ensure_ascii=False)
        prompt = self.PROMPT.format(metrics_json=metrics_json)
        try:
            response = self.client.chat.completions.create(
                model=self.model,
                messages=[{"role": "user", "content": prompt}],
                response_format={"type": "json_object"},
                temperature=0.3
            )
            return json.loads(response.choices[0].message.content)
        except Exception as e:
            # Boundary with an external service: report the failure in-band
            # instead of letting one bad call abort the patrol.
            return {"error": str(e), "has_anomaly": False}

    def should_alert(self, detection_result: Dict) -> bool:
        """Return True when the verdict contains at least one P0 or P1 anomaly.

        P2 findings are record-only. Failed checks and malformed model output
        (missing ``severity``, ``anomalies`` set to null, non-dict entries)
        yield False rather than raising.
        """
        if detection_result.get("error") or not detection_result.get("has_anomaly"):
            return False
        anomalies = detection_result.get("anomalies") or []
        return any(
            isinstance(anomaly, dict) and anomaly.get("severity") in ("P0", "P1")
            for anomaly in anomalies
        )
# Usage
import glob

detector = AnomalyDetector()
# Sample the current metrics with the collector created earlier
current = collector.collect_all()
# Load the 12 most recent saved snapshots as the comparison history
history_files = sorted(glob.glob("./metrics/metrics_*.json"))[-12:]
history = []
for snapshot_path in history_files:
    with open(snapshot_path) as snapshot_file:
        history.append(json.load(snapshot_file))
result = detector.detect(current, history)
print(json.dumps(result, indent=2, ensure_ascii=False))
阶段 3:告警分级
不是所有异常都需要半夜叫醒人。
python
# alert_manager.py
import json
from typing import Dict, List
from enum import Enum
class Severity(Enum):
    """Alert severity levels, most urgent first."""

    # Critical: service unavailable, OOM, disk 100% full
    P0 = "P0"
    # Warning: CPU > 90%, memory > 85%, disk > 90%
    P1 = "P1"
    # Notice: CPU trending upward, abnormal log-file growth rate
    P2 = "P2"
class AlertManager:
    """Alert manager: deduplicates anomalies and routes notifications by severity."""

    def __init__(self):
        # alert_key -> unix timestamp of the last notification, used for dedup.
        self.last_alerts = {}

    def should_notify(self, anomaly: Dict) -> bool:
        """Return True unless the same anomaly was notified in the last 30 minutes.

        Two anomalies count as the same when their ``type`` and the first 50
        characters of ``detail`` match. A missing or null ``detail`` is
        treated as an empty string.
        """
        import time

        detail = anomaly.get("detail") or ""
        alert_key = f"{anomaly.get('type')}_{detail[:50]}"
        now = time.time()
        last_sent = self.last_alerts.get(alert_key)
        if last_sent is not None:
            elapsed = now - last_sent
            if elapsed < 1800:  # 30 minutes
                print(f"⏭️ 重复告警已跳过:{alert_key}({int(elapsed/60)} 分钟前通知过)")
                return False
        self.last_alerts[alert_key] = now
        return True

    def send_notification(self, severity: str, message: str):
        """Dispatch ``message`` over the channels configured for ``severity``.

        P0 goes to WeCom and the phone push, P1 to WeCom only, and anything
        else is just printed.
        """
        if severity == "P0":
            self._send_wework(message)  # WeCom group bot (immediate)
            self._send_phone(message)   # phone push, if configured
        elif severity == "P1":
            self._send_wework(message)  # WeCom (may be delayed)
        else:
            print(f"[P2] {message}")    # log only

    def _send_wework(self, message: str):
        """Post ``message`` as Markdown to the WeCom bot webhook.

        The webhook URL comes from WEWORK_BOT_URL; when it is unset the
        message is printed instead. Delivery failures are printed, not raised.
        """
        # Imported here because this snippet's import block does not bring in os.
        import os

        webhook_url = os.getenv("WEWORK_BOT_URL", "")
        if not webhook_url:
            print(f"📱 [企业微信] {message}")
            return
        import requests
        data = {
            "msgtype": "markdown",
            "markdown": {"content": message}
        }
        try:
            resp = requests.post(webhook_url, json=data, timeout=5)
            print(f"📱 企业微信通知已发送:{resp.status_code}")
        except Exception as e:
            print(f"❌ 企业微信通知失败:{e}")

    def _send_phone(self, message: str):
        """Send a phone push through a Bark-style URL taken from BARK_URL.

        Does nothing when BARK_URL is unset. The message is percent-encoded
        because it is placed in the URL path and contains newlines, '#', '/'
        and non-ASCII text. Delivery failures are printed, not raised, so a
        broken push channel cannot abort the patrol.
        """
        import os
        from urllib.parse import quote

        bark_url = os.getenv("BARK_URL", "")
        if not bark_url:
            return
        import requests
        try:
            requests.get(f"{bark_url.rstrip('/')}/{quote(message, safe='')}", timeout=3)
        except Exception as e:
            print(f"❌ 手机推送失败:{e}")
# Usage
alert_mgr = AlertManager()
# Simulate a P0 alert
anomaly = {
    "type": "Disk",
    "severity": "P0",
    "detail": "磁盘 /data 使用率 98%,IO wait 持续 > 30%",
    "root_cause": "日志文件 /var/log/app.log 超过 50GB",
    "suggested_action": "清理日志文件,检查日志轮转配置"
}
if alert_mgr.should_notify(anomaly):
    # Assemble the Markdown body line by line, then join with newlines.
    msg = "\n".join([
        f"## 🚨 运维告警 [{anomaly['severity']}]",
        f"**类型**:{anomaly['type']}",
        f"**详情**:{anomaly['detail']}",
        f"**根因**:{anomaly['root_cause']}",
        f"**建议**:{anomaly['suggested_action']}",
    ])
    alert_mgr.send_notification(anomaly["severity"], msg)
阶段 4:自动修复
告警不是终点,能自动修的自动修。
python
# auto_healer.py
import os
import subprocess
from typing import Dict, Optional
class AutoHealer:
    """Auto-remediation: maps an anomaly to a predefined shell command and runs it."""

    # Disk usage above this percentage is treated as critical; matches the
    # threshold the patrol report uses for its most severe disk status.
    DISK_CRITICAL_PERCENT = 95

    # Remediation strategies. The commands are fixed strings defined here,
    # never built from anomaly text, which is why shell=True is acceptable.
    HEAL_ACTIONS = {
        "high_cpu_zombie": {
            "description": "Kill CPU 占用超过 30% 且运行超 1 小时的进程",
            # etimes (elapsed seconds) enforces the "running for over 1 hour"
            # part of the description; xargs -r skips kill when nothing matches.
            # NOTE(review): this still ignores any protected-process allowlist.
            "command": "ps -eo pid,pcpu,etimes --sort=-pcpu | awk 'NR>1 && $2>30 && $3>3600 {print $1}' | xargs -r kill -9",
            "dangerous": True,  # destructive, should be confirmed
        },
        "high_memory": {
            "description": "清理系统缓存 + 重启 OOM 的服务",
            "command": "sync && echo 3 > /proc/sys/vm/drop_caches && systemctl restart api.service",
            "dangerous": True,
        },
        "disk_full_logs": {
            "description": "清理 7 天前的日志文件",
            "command": "find /var/log -name '*.log' -mtime +7 -delete",
            "dangerous": False,
        },
        "disk_full_temp": {
            "description": "清理临时文件",
            "command": "rm -rf /tmp/* /var/tmp/*",
            "dangerous": False,
        },
    }

    def heal(self, anomaly: Dict, confirm=None) -> Dict:
        """Run the remediation that matches ``anomaly``.

        Args:
            anomaly: Detector output with at least ``type`` and ``detail``.
            confirm: Optional callable ``confirm(heal_key, action) -> bool``
                consulted before a dangerous action. Returning a falsy value
                vetoes the action. When omitted, dangerous actions run after
                printing a warning, as before.

        Returns:
            A dict with ``success`` and, depending on the outcome, ``action``,
            ``stdout``/``stderr`` (truncated to 500 characters) or ``reason``.
        """
        heal_key = self._determine_action(anomaly)
        if not heal_key or heal_key not in self.HEAL_ACTIONS:
            return {"success": False, "reason": f"无匹配的修复策略:{anomaly}"}
        action = self.HEAL_ACTIONS[heal_key]
        if action["dangerous"]:
            print(f"⚠️ 危险操作:{action['description']}")
            print(f" 命令:{action['command']}")
            if confirm is not None and not confirm(heal_key, action):
                return {"success": False, "action": heal_key, "reason": "危险操作需要人工确认"}
        try:
            result = subprocess.run(
                action["command"],
                shell=True,
                capture_output=True,
                text=True,
                timeout=30
            )
        except subprocess.TimeoutExpired:
            return {"success": False, "action": heal_key, "reason": "修复操作超时"}
        except Exception as e:
            return {"success": False, "action": heal_key, "reason": str(e)}
        return {
            "success": result.returncode == 0,
            "action": heal_key,
            "stdout": result.stdout[:500],
            "stderr": result.stderr[:500]
        }

    def _determine_action(self, anomaly: Dict) -> Optional[str]:
        """Pick the HEAL_ACTIONS key for ``anomaly``, or None when nothing fits.

        Disk anomalies go to log cleanup when the detail or root cause mentions
        logs, or when any percentage in the detail exceeds
        DISK_CRITICAL_PERCENT; otherwise to temp-file cleanup. This replaces a
        match on the literal "98%", which missed every other usage value.
        """
        import re

        anomaly_type = anomaly.get("type", "")
        detail = (anomaly.get("detail") or "").lower()
        if anomaly_type == "CPU" and any(k in detail for k in ("僵尸", "zombie", "100%")):
            return "high_cpu_zombie"
        if anomaly_type == "Memory":
            return "high_memory"
        if anomaly_type == "Disk":
            root_cause = (anomaly.get("root_cause") or "").lower()
            mentions_logs = any(
                keyword in text
                for text in (detail, root_cause)
                for keyword in ("日志", "log")
            )
            percents = [float(p) for p in re.findall(r"(\d+(?:\.\d+)?)\s*%", detail)]
            is_critical = any(p > self.DISK_CRITICAL_PERCENT for p in percents)
            return "disk_full_logs" if (mentions_logs or is_critical) else "disk_full_temp"
        return None
# Usage
healer = AutoHealer()
# Sample disk anomaly with the keys the healer reads ("type" and "detail")
anomaly = {
    "type": "Disk",
    "severity": "P0",
    "detail": "磁盘 /data 使用率 98%,日志文件占据 40GB"
}
# NOTE(review): heal() executes the matched shell command for real; only run this on a machine you can afford to clean
result = healer.heal(anomaly)
print(result)
阶段 5:巡检报告
每次巡检生成一份报告,存档备查。
python
# patrol_reporter.py
from datetime import datetime
from pathlib import Path
from typing import Dict, List
class PatrolReporter:
    """Renders the outcome of one patrol run as a Markdown report on disk."""

    def generate_report(
        self,
        metrics: Dict,
        detection: Dict,
        alerts: List[Dict],
        healed: List[Dict],
        output_dir: str = "./reports"
    ) -> str:
        """Write a Markdown patrol report and return its path.

        Args:
            metrics: Collector snapshot; ``cpu.average`` and ``memory.percent``
                are required, the other groups are optional.
            detection: Detector verdict. Anomaly entries may be incomplete;
                missing fields render as a placeholder instead of raising.
            alerts: Alerts sent during this run. Accepted for interface
                compatibility; not rendered at present.
            healed: Results returned by the healer, one dict per attempt.
            output_dir: Target directory, created with parents if missing.

        Returns:
            The path of the written report as a string.
        """
        now = datetime.now()
        target_dir = Path(output_dir)
        # parents=True so a nested output_dir works on the very first run.
        target_dir.mkdir(parents=True, exist_ok=True)
        filepath = target_dir / f"patrol_{now.strftime('%Y%m%d_%H%M%S')}.md"

        cpu_avg = metrics['cpu']['average']
        mem_percent = metrics['memory']['percent']
        cpu_status = "⚠️" if cpu_avg > 80 else "✅"
        mem_status = "⚠️" if mem_percent > 85 else "✅"
        report = f"""# 服务器巡检报告
**生成时间**:{now.strftime('%Y-%m-%d %H:%M:%S')}
**主机**:{metrics.get('hostname', 'unknown')}
**运行时长**:{metrics.get('system', {}).get('uptime_hours', '?')} 小时
---
## 当前资源
| 资源 | 使用率 | 状态 |
|------|--------|------|
| CPU | {cpu_avg}% | {cpu_status} |
| 内存 | {mem_percent}% | {mem_status} |
"""
        for disk in metrics.get("disk", {}).get("partitions", []):
            status = "🚨" if disk["percent"] > 95 else ("⚠️" if disk["percent"] > 85 else "✅")
            report += f"| 磁盘 {disk['mount']} | {disk['percent']}% | {status} |\n"
        report += f"""
---
## AI 异常检测
**整体健康度**:{detection.get('overall_health', '未知')}
**总结**:{detection.get('summary', '无异常')}
"""
        # The anomaly list comes from an LLM, so every field is read
        # defensively and a null list is treated as empty.
        for a in detection.get("anomalies") or []:
            report += f"""
### {a.get('severity', '未知')} - {a.get('type', '未知')}
- **详情**:{a.get('detail', '未知')}
- **根因**:{a.get('root_cause', '未分析')}
- **建议**:{a.get('suggested_action', '无')}
"""
        if healed:
            report += """
---
## 自动修复记录
"""
            for h in healed:
                status = "✅ 成功" if h.get("success") else "❌ 失败"
                line = f"- {status} | {h.get('action', 'unknown')}"
                # Failed attempts carry a reason; without it the entry is not actionable.
                if h.get("reason"):
                    line += f" | {h['reason']}"
                report += line + "\n"
        report += """
---
*报告由 AI 自动生成*
"""
        with open(filepath, "w", encoding="utf-8") as f:
            f.write(report)
        print(f"📋 巡检报告已生成:{filepath}")
        return str(filepath)
# Usage
# NOTE(review): `current` and `result` are presumably the metrics snapshot and
# detection verdict from the stage-2 usage snippet; confirm before running this alone.
reporter = PatrolReporter()
reporter.generate_report(
    metrics=current,
    detection=result,
    alerts=[],
    healed=[]
)
四、整合:定时运行 + 完整编排
python
# patrol_main.py
import time
import schedule
from typing import Dict
# Shared across patrol runs: the alert manager keeps its dedup timestamps in
# memory, so a fresh instance per run would never suppress a repeat alert.
_patrol_alert_mgr = None


def run_patrol():
    """Run one full patrol: collect, detect, alert, heal, report.

    Returns early, without writing a report, when detection fails or finds
    no anomaly. Only P0 anomalies are passed to the auto-healer.
    """
    # Imported here because this snippet's import block does not bring in datetime.
    from datetime import datetime

    global _patrol_alert_mgr

    print(f"\n{'='*50}")
    print(f"🔍 开始巡检:{datetime.now().strftime('%H:%M:%S')}")

    # Stage 1: collect
    collector = MetricsCollector()
    metrics = collector.collect_all()
    collector.save_snapshot()

    # Stage 2: detect
    detector = AnomalyDetector()
    history = load_history()  # implement this yourself
    detection = detector.detect(metrics, history)
    if detection.get("error"):
        # A failed check is not a clean bill of health; say so explicitly.
        print(f"❌ AI 检测失败:{detection['error']}")
        return
    if not detection.get("has_anomaly"):
        print("✅ 无异常")
        return
    anomalies = detection.get("anomalies") or []
    print(f"⚠️ 发现异常:{len(anomalies)} 个")

    # Stage 3: alert
    if _patrol_alert_mgr is None:
        _patrol_alert_mgr = AlertManager()
    alert_mgr = _patrol_alert_mgr
    alerts_sent = []
    for anomaly in anomalies:
        if alert_mgr.should_notify(anomaly):
            msg = format_alert_message(anomaly)
            alert_mgr.send_notification(anomaly.get("severity"), msg)
            alerts_sent.append(anomaly)

    # Stage 4: auto-heal (P0 only)
    healer = AutoHealer()
    healed = [
        healer.heal(anomaly)
        for anomaly in anomalies
        if anomaly.get("severity") == "P0"
    ]

    # Stage 5: report
    reporter = PatrolReporter()
    reporter.generate_report(metrics, detection, alerts_sent, healed)
# Register the patrol job
schedule.every(5).minutes.do(run_patrol)
print("🤖 运维巡检系统已启动,每 5 分钟巡检一次...")
while True:
    try:
        schedule.run_pending()
    except Exception as e:
        # One failed patrol must not stop the unattended loop; Ctrl-C still
        # works because KeyboardInterrupt is not an Exception subclass.
        print(f"❌ 巡检执行失败:{e}")
    time.sleep(1)
五、效果数据
部署这套系统两个月的数据:
| 指标 | 部署前 | 部署后 |
|---|---|---|
| 凌晨告警次数 | 6 次/月 | 0(全部自动修复) |
| 平均问题发现→修复时间 | 20 分钟(人工) | 3 分钟(自动) |
| 磁盘满事故 | 2 次/月 | 0(提前清理) |
| 人工巡检时间 | 每天 30 分钟 | 0(全自动) |
| AI 误报率 | - | ~5%(持续优化中) |
六、踩坑记录
坑 1:AI 对普通波动过度敏感
症状:CPU 从 10% 跳到 35%,AI 判定为"异常上升趋势"。
原因:AI 没有历史基线,不知道这台机器的正常波动范围。
解决方案:传给 AI 最近 24 小时的数据做对比,AI 能看到"下午 3 点通常就是 35%":
python
# Add a comparison baseline
baseline_24h = get_metrics_last_24h()  # implement this yourself
# NOTE(review): by default detect() forwards only the 12 most recent history
# entries to the model, so confirm the full 24h baseline actually reaches it.
detection = detector.detect(current, baseline_24h)
坑 2:自动修复把自己服务 Kill 了
症状:自动 kill 进程时,误杀了 API 服务的主进程。
原因:Kill 策略是"CPU > 80% 的都干掉",没有排除白名单进程。
解决方案:加白名单:
python
# Command-line fragments of processes the auto-healer must never kill.
PROTECTED_PROCESSES = ["python api_server.py", "postgres", "redis-server", "nginx"]


def is_safe_to_kill(pid: int) -> bool:
    """Return True only when ``pid`` may be killed by the auto-healer.

    Refuses when the PID is <= 1 (kill treats 0 and negative values as
    process groups, and 1 is init), when it is this process itself, when the
    command line contains a protected fragment (case-insensitive), or when
    the process cannot be inspected at all.
    """
    # Imported here because this snippet has no import block of its own.
    import os

    if pid <= 1 or pid == os.getpid():
        return False
    try:
        proc = psutil.Process(pid)
        cmdline = " ".join(proc.cmdline()).lower()
    except (psutil.NoSuchProcess, psutil.AccessDenied):
        # Gone already, or not inspectable: never kill what we cannot verify.
        return False
    return not any(protected.lower() in cmdline for protected in PROTECTED_PROCESSES)
坑 3:AI 检测调用太频繁,一个月花了 ¥300
症状:每 5 分钟调用一次 GPT-4o-mini,月费 ¥300+。
原因:14400 次检测/月 × ¥0.02 = ¥288。
解决方案:先用简单阈值预筛,只有阈值告警时才让 AI 做深度分析:
python
def pre_check(metrics: Dict) -> bool:
    """Cheap threshold gate that decides whether the AI check is worth calling.

    True when average CPU exceeds 80%, memory exceeds 85%, or any disk
    partition exceeds 90%. The checks run in that order and stop at the
    first one that trips.
    """
    return (
        metrics["cpu"]["average"] > 80
        or metrics["memory"]["percent"] > 85
        or any(part["percent"] > 90 for part in metrics["disk"]["partitions"])
    )
# Only call the LLM when a cheap threshold has already tripped.
# NOTE(review): `detection` stays unassigned when pre_check() returns False; callers must handle that case.
if pre_check(metrics):
    detection = detector.detect(metrics, history)
坑 4:企业微信消息字数超限,发送失败
症状:把整份巡检报告发给企业微信,消息被截断。
原因:企业微信机器人消息限制 4096 字符。
解决方案:告警消息只发摘要(告警类型 + 严重度 + 一句话描述),完整报告发邮件或保存到文件系统。
坑 5:自动清理日志后,数据库的 WAL 日志也被误删
症状:用 find /var/log -name '*.log' -delete 之后,PostgreSQL 启动失败。
原因:PG 的数据目录确实在 /var/log/postgresql/ 下(部署配置问题),WAL 日志被误删。
解决方案:清理命令限定范围,永远不碰数据库相关目录:
python
# Directories the cleanup commands are allowed to operate on.
SAFE_CLEAN_DIRS = ["/var/log/nginx", "/var/log/app", "/tmp"]
# Database-related directories that cleanup must never touch.
DANGER_CLEAN_DIRS = ["/var/log/postgresql", "/var/lib/postgresql"]
七、总结
| 要点 | 说明 |
|---|---|
| 核心思路 | 采集 → AI 检测 → 分级告警 → 自动修复 → 报告,五阶段闭环 |
| 关键设计 | AI 只做深度分析(阈值先筛),降频降本;危险操作需人工确认 |
| 运维思路转变 | 从"被动救火"到"主动防御",从"手动敲命令"到"AI 自动处理" |
| 成本 | AI 调用约 ¥50/月(阈值预筛后),远低于一次宕机损失 |
三条经验:
- AI 不是替代运维,是辅助运维:阈值预筛 + AI 深度分析,比纯阈值或纯 AI 都好。
- 自动修复必须有安全边际:白名单 + 二次确认 + 监控回滚,三个缺一不可。
- 先跑一周只检不修:让系统先积累数据,确认 AI 判断准确之后再开自动修复。
互动:你的服务器监控是怎么做的?有没有被半夜报警吵醒的经历?分享一下你的运维自动化解法?