第42章:监控与运维
42.1 监控与运维概述
监控与运维是确保系统稳定运行、快速发现和解决问题的关键环节。剪映小助手作为一个复杂的视频处理系统,需要全面的监控体系和运维策略来保证服务质量。
42.1.1 监控的重要性
在视频编辑自动化系统中,监控系统的价值体现在:
- 故障预防:通过实时监控,在问题发生前发现潜在风险
- 性能优化:识别系统瓶颈,指导性能调优
- 容量规划:基于历史数据预测资源需求
- 用户体验:确保服务可用性和响应速度
- 合规要求:满足服务等级协议(SLA)要求
42.1.2 运维的核心目标
剪映小助手的运维体系围绕以下目标构建:
可靠性(Reliability):确保系统7×24小时稳定运行
可维护性(Maintainability):简化系统维护和更新流程
可观测性(Observability):提供全面的系统状态可见性
自动化(Automation):减少人工干预,提高效率
快速恢复(Fast Recovery):出现问题时快速定位和恢复
42.2 应用监控指标
42.2.1 核心性能指标(KPI)
系统性能指标是评估系统健康状态的基础,包括CPU、内存、磁盘和网络等关键资源的使用情况。
python
# src/monitoring/metrics.py
import time
import psutil
import asyncio
from typing import Dict
from collections import deque
import logging
class SystemMetrics:
    """Collects host-level resource metrics (CPU, memory, disk).

    Keeps a rolling window of the last ``window_size`` samples per
    resource so callers can compute short-term trends.
    """
    def __init__(self, window_size: int = 60):
        # Number of samples retained per metric; deque evicts the oldest.
        self.window_size = window_size
        self.cpu_history = deque(maxlen=window_size)
        self.memory_history = deque(maxlen=window_size)
        self.disk_history = deque(maxlen=window_size)
        self.logger = logging.getLogger(__name__)
    async def collect_metrics(self) -> Dict:
        """Collect one metrics sample.

        Returns a dict with ``timestamp``, ``cpu``, ``memory`` and
        ``disk`` keys, or ``{"error": ...}`` on failure. The blocking
        psutil calls run in a worker thread so the event loop is not
        stalled (``psutil.cpu_percent(interval=0.1)`` sleeps for the
        whole interval).
        """
        try:
            metrics = await asyncio.to_thread(self._sample)
            # Append percentage readings to the rolling windows.
            self.cpu_history.append(metrics["cpu"]["percent"])
            self.memory_history.append(metrics["memory"]["percent"])
            self.disk_history.append(metrics["disk"]["percent"])
            return metrics
        except Exception as e:
            self.logger.error(f"收集系统指标失败: {e}")
            return {"error": str(e)}
    def _sample(self) -> Dict:
        """Blocking metrics snapshot; intended to run off the event loop."""
        return {
            "timestamp": time.time(),
            "cpu": {"percent": psutil.cpu_percent(interval=0.1), "count": psutil.cpu_count(logical=True)},
            "memory": psutil.virtual_memory()._asdict(),
            "disk": psutil.disk_usage('/')._asdict()
        }
# 全局系统指标实例
system_metrics = SystemMetrics()
42.2.2 业务指标监控
业务指标反映了系统的实际使用情况和用户体验,包括请求量、错误率、响应时间和媒体处理任务状态等。
python
# src/monitoring/business_metrics.py
from typing import Dict
import time
import threading
from collections import Counter
import logging
logger = logging.getLogger(__name__)
class BusinessMetrics:
    """Thread-safe collector for request counts, error counts, latencies
    and media-task state counters."""

    # Cap per-endpoint latency samples so memory cannot grow without
    # bound under sustained traffic (previously the lists grew forever).
    MAX_SAMPLES = 1000

    def __init__(self):
        self.request_counter = Counter()   # endpoint -> total requests
        self.error_counter = Counter()     # endpoint -> failed requests
        self.response_times = {}           # endpoint -> recent latency samples
        self.media_tasks = {"pending": 0, "processing": 0, "completed": 0, "failed": 0}
        self.lock = threading.RLock()
    def record_request(self, endpoint: str, success: bool = True, response_time: float = 0):
        """Record one request; latency is stored only when positive."""
        with self.lock:
            self.request_counter[endpoint] += 1
            if not success:
                self.error_counter[endpoint] += 1
            if response_time > 0:
                samples = self.response_times.setdefault(endpoint, [])
                samples.append(response_time)
                # Keep only the most recent MAX_SAMPLES entries.
                if len(samples) > self.MAX_SAMPLES:
                    del samples[:-self.MAX_SAMPLES]
    def update_media_task_status(self, status: str, delta: int = 1):
        """Adjust a media-task state counter; unknown states are ignored."""
        with self.lock:
            if status in self.media_tasks:
                self.media_tasks[status] += delta
    def get_metrics(self) -> Dict:
        """Return a snapshot: totals, error rate (%), top endpoints and
        mean latency per endpoint over the retained samples."""
        with self.lock:
            total_requests = sum(self.request_counter.values())
            total_errors = sum(self.error_counter.values())
            error_rate = (total_errors / total_requests * 100) if total_requests > 0 else 0
            # Mean latency over the (bounded) retained samples.
            avg_response_times = {}
            for endpoint, times in self.response_times.items():
                if times:
                    avg_response_times[endpoint] = sum(times) / len(times)
            return {
                "timestamp": time.time(),
                "total_requests": total_requests,
                "error_rate": error_rate,
                "top_endpoints": dict(self.request_counter.most_common(5)),
                "avg_response_times": avg_response_times,
                "media_tasks": dict(self.media_tasks)
            }
# 全局业务指标实例
business_metrics = BusinessMetrics()
42.2.3 健康度计算
系统健康度是综合评估系统状态的重要指标,通过分析系统资源使用情况和业务性能指标计算得出。
python
# src/monitoring/health.py
from typing import Dict
import time
import logging
logger = logging.getLogger(__name__)
def calculate_system_health(system_metrics: Dict) -> Dict:
    """Derive an overall system health score (0-100) from resource usage.

    Weights: CPU 30%, memory 40%, disk 30%. Status thresholds:
    >=80 healthy, >=60 degraded, otherwise critical.
    Returns ``{"error": ...}`` if the computation fails.
    """
    try:
        weights = {"cpu": 0.3, "memory": 0.4, "disk": 0.3}
        # Per-component health is the unused share of the resource,
        # clamped so usage above 100% cannot go negative.
        component_health = {}
        for name in ("cpu", "memory", "disk"):
            used = system_metrics.get(name, {}).get("percent", 0)
            component_health[name] = 100 - min(used, 100)
        overall_health = sum(component_health[k] * w for k, w in weights.items())
        if overall_health >= 80:
            status = "healthy"
        elif overall_health >= 60:
            status = "degraded"
        else:
            status = "critical"
        return {
            "timestamp": time.time(),
            "overall_health": round(overall_health, 2),
            "status": status,
            "components": {k: round(v, 2) for k, v in component_health.items()}
        }
    except Exception as e:
        logger.error(f"计算系统健康度失败: {e}")
        return {"error": str(e)}
def calculate_application_health(business_metrics: Dict) -> Dict:
    """Score application health (0-100) from the API error rate.

    With fewer than 10 total requests the sample is too small to judge,
    so a neutral score of 90 is reported. Otherwise each 1% of error
    rate subtracts 5 points. Status: >=90 healthy, >=70 degraded,
    else critical. Returns ``{"error": ...}`` on failure.
    """
    try:
        error_rate = business_metrics.get("error_rate", 0)
        total_requests = business_metrics.get("total_requests", 0)
        if total_requests < 10:
            # Not enough traffic for a meaningful error rate.
            health_score = 90
        else:
            health_score = max(0, 100 - error_rate * 5)
        # Map score to a status label via ordered thresholds.
        for threshold, label in ((90, "healthy"), (70, "degraded")):
            if health_score >= threshold:
                status = label
                break
        else:
            status = "critical"
        return {
            "timestamp": time.time(),
            "health_score": round(health_score, 2),
            "status": status,
            "error_rate": error_rate
        }
    except Exception as e:
        logger.error(f"计算应用健康度失败: {e}")
        return {"error": str(e)}
42.3 日志分析系统
42.3.1 结构化日志设计
剪映小助手采用结构化日志格式,便于机器解析和分析,支持JSON格式记录和异常跟踪。
python
# src/logger/structured_logger.py
import logging
import json
import traceback
from typing import Dict
from datetime import datetime
class StructuredFormatter(logging.Formatter):
    """Formatter that renders each log record as one JSON line."""
    def format(self, record: logging.LogRecord) -> str:
        """Serialize *record* to a JSON string (non-ASCII kept as-is)."""
        payload = {
            "timestamp": datetime.utcnow().isoformat(),
            "level": record.levelname,
            "message": record.getMessage(),
            "logger": record.name,
            "module": record.module,
            "line": record.lineno,
        }
        # NOTE(review): logging's ``extra=`` kwarg sets attributes directly
        # on the record, so this branch only fires when a caller explicitly
        # attaches an ``extra`` attribute — confirm that is intended.
        if hasattr(record, 'extra'):
            payload.update(record.extra)
        # Embed structured exception info when present.
        if record.exc_info:
            exc_type, exc_value, _tb = record.exc_info
            payload["exception"] = {
                "type": exc_type.__name__ if exc_type else None,
                "message": str(exc_value) if exc_value else None,
                "traceback": ''.join(traceback.format_exception(*record.exc_info))
            }
        return json.dumps(payload, ensure_ascii=False)
class StructuredLogger:
    """Factory for loggers pre-wired with the structured JSON formatter."""
    @staticmethod
    def get_logger(name: str, level: int = logging.INFO) -> logging.Logger:
        """Return a logger named *name* that emits JSON lines to stderr."""
        configured = logging.getLogger(name)
        configured.setLevel(level)
        # Attach a handler only once; repeated calls reuse the logger.
        if not configured.handlers:
            stream_handler = logging.StreamHandler()
            stream_handler.setFormatter(StructuredFormatter())
            configured.addHandler(stream_handler)
        return configured
# 使用示例
logger = StructuredLogger.get_logger(__name__)
42.3.2 日志收集与分析
系统实现了日志收集和基础分析功能,支持错误日志统计和性能日志分析。
python
# src/logger/log_analyzer.py
import os
import json
import logging
from typing import Dict, List
from datetime import datetime, timedelta
logger = logging.getLogger(__name__)
class LogAnalyzer:
    """Reads structured (JSON-lines) log files and produces summaries."""
    def __init__(self, log_dir: str = "logs"):
        # Directory where log files are expected to live.
        self.log_dir = log_dir
    def parse_log_file(self, file_path: str) -> List[Dict]:
        """Parse one JSON-lines file; malformed lines are skipped."""
        entries: List[Dict] = []
        try:
            with open(file_path, 'r', encoding='utf-8') as fh:
                for raw in fh:
                    try:
                        entries.append(json.loads(raw.strip()))
                    except json.JSONDecodeError:
                        # Tolerate partial writes / non-JSON noise.
                        continue
        except Exception as e:
            logger.error(f"解析日志文件失败 {file_path}: {e}")
        return entries
    def analyze_error_logs(self, hours: int = 24) -> Dict:
        """Summarize ERROR/CRITICAL entries from ``<log_dir>/error.log``.

        Only entries newer than *hours* ago are counted (lexicographic
        compare is valid for same-format ISO-8601 timestamps). Returns
        totals, per-exception-type counts, and the ten most recent errors.
        """
        try:
            cutoff_iso = (datetime.utcnow() - timedelta(hours=hours)).isoformat()
            recent_errors: List[Dict] = []
            type_counts: Dict[str, int] = {}
            error_file = os.path.join(self.log_dir, "error.log")
            if os.path.exists(error_file):
                for entry in self.parse_log_file(error_file):
                    if entry.get("level") not in ("ERROR", "CRITICAL"):
                        continue
                    if entry.get("timestamp") > cutoff_iso:
                        recent_errors.append(entry)
                        exc_type = entry.get("exception", {}).get("type", "Unknown")
                        type_counts[exc_type] = type_counts.get(exc_type, 0) + 1
            return {
                "total_errors": len(recent_errors),
                "error_types": type_counts,
                "recent_errors": recent_errors[-10:] if recent_errors else []
            }
        except Exception as e:
            logger.error(f"分析错误日志失败: {e}")
            return {"error": str(e)}
42.4 健康检查机制
42.4.1 多层次健康检查
系统实现了多层次的健康检查机制,确保全面监控系统状态,包括系统资源、数据库连接和服务可用性等。
python
# src/monitoring/health_check.py
import asyncio
import logging
import time
from typing import Dict
from enum import Enum
logger = logging.getLogger(__name__)
class HealthStatus(Enum):
    """Possible outcomes of a health probe."""
    HEALTHY = "healthy"
    DEGRADED = "degraded"
    CRITICAL = "critical"
    UNKNOWN = "unknown"
class HealthCheck:
    """Base class for async health probes with timeout protection.

    Subclasses implement ``_check_impl`` and return a dict with
    ``status`` (a HealthStatus) and an optional ``message``.
    """
    def __init__(self, name: str, timeout: int = 5):
        self.name = name
        self.timeout = timeout  # seconds before the probe is abandoned
        self.last_status = HealthStatus.UNKNOWN
    async def check(self) -> Dict:
        """Run the probe; a timeout is reported as CRITICAL, not raised."""
        started = time.time()
        try:
            outcome = await asyncio.wait_for(self._check_impl(), timeout=self.timeout)
            # Remember the latest observed status.
            self.last_status = outcome.get("status", HealthStatus.UNKNOWN)
            status_value = outcome.get("status", HealthStatus.UNKNOWN).value
            message = outcome.get("message", "")
        except asyncio.TimeoutError:
            logger.error(f"健康检查 {self.name} 超时")
            status_value = HealthStatus.CRITICAL.value
            message = "Health check timed out"
        return {
            "name": self.name,
            "status": status_value,
            "message": message,
            "duration_ms": round((time.time() - started) * 1000, 2),
            "timestamp": started
        }
    async def _check_impl(self) -> Dict:
        """Probe body; must be overridden by subclasses."""
        raise NotImplementedError("子类必须实现_check_impl方法")
class SystemHealthCheck(HealthCheck):
    """Probe that grades host CPU / memory / disk utilisation."""
    def __init__(self):
        super().__init__("system")
    async def _check_impl(self) -> Dict:
        """Sample resource usage via psutil and map it to a HealthStatus.

        Thresholds: CRITICAL when CPU/memory > 90% or disk > 95%;
        DEGRADED when CPU/memory > 75% or disk > 85%; HEALTHY otherwise.
        Reports UNKNOWN when psutil is not installed.
        """
        try:
            import psutil
            cpu_percent = psutil.cpu_percent(interval=0.1)
            memory_percent = psutil.virtual_memory().percent
            disk_percent = psutil.disk_usage('/').percent
        except ImportError:
            return {"status": HealthStatus.UNKNOWN, "message": "psutil模块未安装"}
        usage = f"CPU: {cpu_percent}%, Memory: {memory_percent}%, Disk: {disk_percent}%"
        if cpu_percent > 90 or memory_percent > 90 or disk_percent > 95:
            return {"status": HealthStatus.CRITICAL, "message": f"资源使用率过高 - {usage}"}
        if cpu_percent > 75 or memory_percent > 75 or disk_percent > 85:
            return {"status": HealthStatus.DEGRADED, "message": f"资源使用率较高 - {usage}"}
        return {"status": HealthStatus.HEALTHY, "message": f"系统资源状态正常 - {usage}"}
class HealthCheckManager:
    """Registry that runs all registered health probes concurrently."""
    def __init__(self):
        self.health_checks: Dict[str, HealthCheck] = {}
        self.logger = logging.getLogger(__name__)
    def register_check(self, health_check: HealthCheck):
        """Add (or replace) a probe, keyed by its name."""
        self.health_checks[health_check.name] = health_check
        self.logger.info(f"已注册健康检查: {health_check.name}")
    async def run_all_checks(self) -> Dict:
        """Run every probe in parallel and aggregate an overall status.

        The overall status is the worst individual result: any CRITICAL
        (or a probe that raised) wins, then DEGRADED, else HEALTHY.
        """
        try:
            probes = list(self.health_checks.values())
            results = await asyncio.gather(*(p.check() for p in probes), return_exceptions=True)
            check_results = {}
            overall_status = HealthStatus.HEALTHY
            for probe, outcome in zip(probes, results):
                if isinstance(outcome, Exception):
                    # The probe itself blew up; report it as critical.
                    check_results[probe.name] = {
                        "status": HealthStatus.CRITICAL.value,
                        "message": f"检查执行失败: {str(outcome)}",
                        "timestamp": time.time()
                    }
                    overall_status = HealthStatus.CRITICAL
                    continue
                check_results[probe.name] = outcome
                # Demote the overall status if this probe is worse.
                if outcome["status"] == HealthStatus.CRITICAL.value:
                    overall_status = HealthStatus.CRITICAL
                elif outcome["status"] == HealthStatus.DEGRADED.value and overall_status != HealthStatus.CRITICAL:
                    overall_status = HealthStatus.DEGRADED
            return {
                "timestamp": time.time(),
                "status": overall_status.value,
                "checks": check_results
            }
        except Exception as e:
            self.logger.error(f"运行健康检查失败: {e}")
            return {"status": HealthStatus.CRITICAL.value, "error": str(e), "timestamp": time.time()}
# 创建全局健康检查管理器实例
health_check_manager = HealthCheckManager()
42.4.2 健康检查API
提供了健康检查的API接口,便于外部系统集成和监控。
python
# src/api/health.py
from fastapi import APIRouter, HTTPException
from typing import Dict
import logging
logger = logging.getLogger(__name__)
router = APIRouter(prefix="/health", tags=["health"])
@router.get("/live")
async def liveness_check() -> Dict:
    """Liveness probe: always 200 while the process is up."""
    return {
        "status": "healthy",
        "service": "capcut-mate",
        "message": "Service is running"
    }
@router.get("/ready")
async def readiness_check() -> Dict:
    """Readiness probe: 503 when any registered check is critical.

    Fix: the 503 raised for a critical result is an HTTPException, which
    the broad ``except Exception`` used to swallow and re-wrap — losing
    the structured ``results`` detail. It is now re-raised untouched.
    """
    try:
        from src.monitoring.health_check import health_check_manager
        results = await health_check_manager.run_all_checks()
        if results["status"] == "critical":
            # Signal "not ready" to the load balancer / orchestrator.
            raise HTTPException(status_code=503, detail=results)
        elif results["status"] == "degraded":
            return {"status": "degraded", "checks": results.get("checks", {})}
        else:
            return results
    except HTTPException:
        # Preserve the structured 503 above instead of re-wrapping it.
        raise
    except Exception as e:
        logger.error(f"就绪检查失败: {e}")
        raise HTTPException(status_code=503, detail=str(e))
@router.get("/checks")
async def get_health_checks() -> Dict:
    """Return the raw result of every registered health check."""
    try:
        from src.monitoring.health_check import health_check_manager
        return await health_check_manager.run_all_checks()
    except Exception as e:
        logger.error(f"获取健康检查结果失败: {e}")
        raise HTTPException(status_code=500, detail=str(e))
42.5 告警机制
42.5.1 告警规则引擎
系统实现了灵活的告警规则引擎,支持多种告警条件和级别,能够根据监控指标自动触发告警。
python
# src/alerting/alert_engine.py
import logging
import time
from dataclasses import dataclass
from enum import Enum
from typing import Dict, List, Optional
logger = logging.getLogger(__name__)
class AlertLevel(Enum):
    """Severity levels for alerts."""
    INFO = "info"
    WARNING = "warning"
    CRITICAL = "critical"
    EMERGENCY = "emergency"
class AlertStatus(Enum):
    """Lifecycle states of an alert."""
    ACTIVE = "active"
    RESOLVED = "resolved"
    ACKNOWLEDGED = "acknowledged"
@dataclass
class AlertRule:
    """Declarative rule: fire ``level`` when ``metric_name`` matches ``condition``."""
    rule_id: str
    name: str
    description: str
    metric_name: str              # dotted path into the metrics dict, e.g. "cpu.percent"
    condition: str                # comparison expression, e.g. ">= 90"
    level: AlertLevel = AlertLevel.WARNING
    enabled: bool = True
    cooldown: int = 300           # 冷却时间(秒); NOTE(review): not yet enforced by the engine
@dataclass
class Alert:
    """A concrete alert instance produced by a rule evaluation."""
    alert_id: str
    rule_id: str
    level: AlertLevel
    status: AlertStatus
    message: str
    metric_name: str
    current_value: float
    threshold_value: float
    start_time: float
class AlertRuleEngine:
    """Evaluates metric snapshots against registered rules and raises alerts.

    Fixes over the previous version: condition parsing handles both
    ">= 90" and ">=90" (the old code had a duplicated ``"> "`` branch
    that made space-less ">" unparseable), thresholds are parsed once
    instead of via a fragile ``split()[1]``, new alerts are recorded in
    ``alert_history`` (previously always empty), and
    ``get_alert_statistics`` — called by the alerts API — now exists.
    """

    # Supported comparison operators, longest first so ">=" wins over ">".
    _OPERATORS = (">=", "<=", ">", "<")

    def __init__(self):
        self.rules: Dict[str, AlertRule] = {}
        self.active_alerts: Dict[str, Alert] = {}
        self.alert_history: List[Alert] = []
        self.register_default_rules()
    def register_default_rules(self):
        """Register the built-in CPU / memory / error-rate rules."""
        default_rules = [
            AlertRule(
                rule_id="high_cpu_usage",
                name="CPU使用率过高",
                description="CPU使用率持续高于85%",
                metric_name="cpu.percent",
                condition=">= 85",
                level=AlertLevel.WARNING
            ),
            AlertRule(
                rule_id="critical_cpu_usage",
                name="CPU使用率严重过高",
                description="CPU使用率持续高于95%",
                metric_name="cpu.percent",
                condition=">= 95",
                level=AlertLevel.CRITICAL
            ),
            AlertRule(
                rule_id="high_memory_usage",
                name="内存使用率过高",
                description="内存使用率持续高于85%",
                metric_name="memory.percent",
                condition=">= 85",
                level=AlertLevel.WARNING
            ),
            AlertRule(
                rule_id="high_error_rate",
                name="错误率过高",
                description="API错误率持续高于5%",
                metric_name="error_rate",
                condition=">= 5",
                level=AlertLevel.WARNING
            )
        ]
        for rule in default_rules:
            self.register_rule(rule)
    def register_rule(self, rule: AlertRule):
        """Register (or replace) an alert rule."""
        self.rules[rule.rule_id] = rule
        logger.info(f"已注册告警规则: {rule.name} ({rule.rule_id})")
    def _parse_condition(self, condition: str):
        """Parse ``"OP THRESHOLD"`` into ``(op, float)`` or None if malformed.

        Tolerates both ``">= 90"`` and ``">=90"``.
        """
        cond = condition.strip()
        for op in self._OPERATORS:
            if cond.startswith(op):
                try:
                    return op, float(cond[len(op):].strip())
                except ValueError:
                    break
        logger.warning(f"无法解析条件: {condition}")
        return None
    def _evaluate_condition(self, current_value: float, condition: str) -> bool:
        """Return True when *current_value* satisfies *condition*."""
        parsed = self._parse_condition(condition)
        if parsed is None:
            return False
        op, threshold = parsed
        if op == ">=":
            return current_value >= threshold
        if op == "<=":
            return current_value <= threshold
        if op == ">":
            return current_value > threshold
        return current_value < threshold
    def evaluate_metrics(self, metrics: Dict) -> List[Alert]:
        """Evaluate all enabled rules; create alerts for newly breached ones.

        A rule that is already firing (present in ``active_alerts``) is
        not duplicated. New alerts are appended to ``alert_history``.
        """
        new_alerts: List[Alert] = []
        current_time = time.time()
        for rule_id, rule in self.rules.items():
            if not rule.enabled:
                continue
            metric_value = self._get_metric_value(metrics, rule.metric_name)
            if metric_value is None:
                continue
            if not self._evaluate_condition(metric_value, rule.condition):
                continue
            alert_key = f"{rule_id}_{rule.metric_name}"
            if alert_key in self.active_alerts:
                continue  # already firing; don't duplicate
            parsed = self._parse_condition(rule.condition)
            threshold = parsed[1] if parsed else 0.0
            alert = Alert(
                alert_id=f"{rule_id}_{int(current_time)}",
                rule_id=rule_id,
                level=rule.level,
                status=AlertStatus.ACTIVE,
                message=f"{rule.description}: 当前值 {metric_value}",
                metric_name=rule.metric_name,
                current_value=metric_value,
                threshold_value=threshold,
                start_time=current_time
            )
            self.active_alerts[alert_key] = alert
            self.alert_history.append(alert)
            new_alerts.append(alert)
            logger.warning(f"生成告警: {alert.message}")
        return new_alerts
    def _get_metric_value(self, metrics: Dict, metric_path: str) -> Optional[float]:
        """Resolve a dotted path in a nested dict; None if missing/non-numeric."""
        try:
            value = metrics
            for part in metric_path.split('.'):
                if isinstance(value, dict) and part in value:
                    value = value[part]
                else:
                    return None
            return float(value)
        except Exception:
            return None
    def get_active_alerts(self) -> List[Alert]:
        """Return all currently firing alerts."""
        return list(self.active_alerts.values())
    def get_alert_history(self, hours: int = 24) -> List[Alert]:
        """Return alerts that started within the last *hours* hours."""
        cutoff_time = time.time() - (hours * 60 * 60)
        return [a for a in self.alert_history if a.start_time >= cutoff_time]
    def get_alert_statistics(self) -> Dict:
        """Aggregate counts by level (used by the /alerts/statistics API)."""
        active = self.get_active_alerts()
        by_level: Dict[str, int] = {}
        for a in active:
            by_level[a.level.value] = by_level.get(a.level.value, 0) + 1
        return {
            "timestamp": time.time(),
            "active_count": len(active),
            "history_count_24h": len(self.get_alert_history(24)),
            "active_by_level": by_level
        }
# 全局告警引擎实例
alert_engine = AlertRuleEngine()
42.5.2 告警通知系统
实现了多渠道的告警通知系统,内置邮件和Webhook两种通知方式,并可通过扩展通知渠道基类接入钉钉等其他渠道。
python
# src/alerting/notification.py
import asyncio
import logging
from typing import Dict
import smtplib
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart
import aiohttp
logger = logging.getLogger(__name__)
class NotificationChannel:
    """Abstract base for alert delivery channels (email, webhook, ...)."""
    def __init__(self, name: str, config: Dict):
        self.name = name      # unique key used by NotificationManager
        self.config = config  # channel-specific settings
    async def send(self, alert) -> bool:
        """Deliver *alert*; subclasses must implement. Returns success flag."""
        raise NotImplementedError("子类必须实现send方法")
    def format_message(self, alert) -> str:
        """Render a human-readable, multi-line summary of *alert*."""
        lines = (
            f"告警ID: {alert.alert_id}",
            f"规则ID: {alert.rule_id}",
            f"级别: {alert.level.value.upper()}",
            f"消息: {alert.message}",
            f"指标: {alert.metric_name}",
            f"当前值: {alert.current_value}",
            f"阈值: {alert.threshold_value}",
        )
        return "\n".join(lines)
class EmailNotificationChannel(NotificationChannel):
    """Delivers alerts via SMTP email.

    Config keys: ``smtp`` (host/port/use_tls/username/password/from)
    and ``recipients`` (list of addresses).
    """
    async def send(self, alert) -> bool:
        """Send the alert by email; returns True on success.

        Fix: the blocking smtplib session now runs in a worker thread
        via ``asyncio.to_thread`` so it no longer stalls the event loop
        (the previous version blocked inside the coroutine).
        """
        try:
            smtp_config = self.config.get("smtp", {})
            # Build the message.
            msg = MIMEMultipart()
            msg['From'] = smtp_config.get("from", "alert@example.com")
            msg['To'] = ", ".join(self.config.get("recipients", []))
            msg['Subject'] = f"[{alert.level.value.upper()}] {alert.message[:50]}"
            msg.attach(MIMEText(self.format_message(alert), 'plain', 'utf-8'))
            # Run the blocking SMTP transaction off the event loop.
            await asyncio.to_thread(self._deliver, msg, smtp_config)
            logger.info(f"邮件通知已发送: {alert.alert_id}")
            return True
        except Exception as e:
            logger.error(f"邮件通知发送失败: {e}")
            return False
    @staticmethod
    def _deliver(msg, smtp_config: Dict) -> None:
        """Blocking SMTP send; intended to run in a worker thread."""
        with smtplib.SMTP(smtp_config.get("host", "localhost"),
                          smtp_config.get("port", 25)) as server:
            if smtp_config.get("use_tls", False):
                server.starttls()
            if smtp_config.get("username") and smtp_config.get("password"):
                server.login(smtp_config["username"], smtp_config["password"])
            server.send_message(msg)
class WebhookNotificationChannel(NotificationChannel):
    """Delivers alerts as JSON POSTs to a configured webhook URL."""
    async def send(self, alert) -> bool:
        """POST the alert payload to ``config["url"]``; True iff HTTP 200.

        NOTE(review): only status 200 counts as success — other 2xx
        codes (201/204) are reported as failures; confirm receivers
        always answer 200. ``timeout=30`` relies on aiohttp accepting a
        bare number; newer aiohttp prefers ``aiohttp.ClientTimeout`` —
        verify against the pinned aiohttp version.
        """
        try:
            webhook_url = self.config.get("url")
            if not webhook_url:
                logger.error("Webhook URL未配置")
                return False
            # Flatten the Alert object into a JSON-serializable payload.
            payload = {
                "alert_id": alert.alert_id,
                "rule_id": alert.rule_id,
                "level": alert.level.value,
                "status": alert.status.value,
                "message": alert.message,
                "current_value": alert.current_value,
                "threshold_value": alert.threshold_value,
                "timestamp": alert.start_time
            }
            # A fresh session per send keeps the channel stateless.
            async with aiohttp.ClientSession() as session:
                async with session.post(webhook_url, json=payload, timeout=30) as response:
                    if response.status == 200:
                        logger.info(f"Webhook通知已发送: {alert.alert_id}")
                        return True
                    else:
                        logger.error(f"Webhook通知发送失败: HTTP {response.status}")
                        return False
        except Exception as e:
            logger.error(f"Webhook通知发送失败: {e}")
            return False
class NotificationManager:
"""通知管理器"""
def __init__(self):
self.channels: Dict[str, NotificationChannel] = {}
def register_channel(self, channel: NotificationChannel):
"""注册通知渠道"""
self.channels[channel.name] = channel
logger.info(f"已注册通知渠道: {channel.name}")
async def send_alert(self, alert, channel_names=None) -> Dict:
"""发送告警通知"""
if channel_names is None:
channel_names = list(self.channels.keys())
results = {}
for channel_name in channel_names:
if channel_name not in self.channels:
results[channel_name] = {"success": False, "error": "渠道未找到"}
continue
try:
channel = self.channels[channel_name]
success = await channel.send(alert)
results[channel_name] = {"success": success}
if not success:
results[channel_name]["error"] = "发送失败"
except Exception as e:
logger.error(f"发送告警通知失败 ({channel_name}): {e}")
results[channel_name] = {"success": False, "error": str(e)}
return results
# 全局通知管理器
notification_manager = NotificationManager()
42.5.3 告警API接口
提供了告警管理的API接口,便于查询和管理告警。
python
# src/api/alerts.py
from fastapi import APIRouter, HTTPException, Query
from typing import Dict, List, Optional
import logging
router = APIRouter(prefix="/alerts", tags=["alerts"])
logger = logging.getLogger(__name__)
@router.get("/")
async def get_alerts(
    status: Optional[str] = Query(None, description="告警状态: active, resolved"),
    level: Optional[str] = Query(None, description="告警级别: info, warning, critical, emergency"),
    hours: int = Query(24, ge=1, le=168, description="时间范围(小时)")
) -> Dict:
    """List alerts, optionally filtered by status and level."""
    try:
        from src.alerting.alert_engine import alert_engine, AlertStatus
        # Select the base population according to the status filter.
        if status == "active":
            selected = alert_engine.get_active_alerts()
        elif status == "resolved":
            selected = [a for a in alert_engine.get_alert_history(hours)
                        if a.status == AlertStatus.RESOLVED]
        else:
            selected = alert_engine.get_active_alerts() + alert_engine.get_alert_history(hours)
        # Optional severity filter.
        if level:
            from src.alerting.alert_engine import AlertLevel
            selected = [a for a in selected if a.level.value == level]
        # Serialize dataclass instances into plain dicts for JSON.
        serialized = [
            {
                "alert_id": a.alert_id,
                "rule_id": a.rule_id,
                "level": a.level.value,
                "status": a.status.value,
                "message": a.message,
                "metric_name": a.metric_name,
                "current_value": a.current_value,
                "threshold_value": a.threshold_value,
                "start_time": a.start_time
            }
            for a in selected
        ]
        return {
            "total": len(serialized),
            "alerts": serialized
        }
    except Exception as e:
        logger.error(f"获取告警列表失败: {e}")
        raise HTTPException(status_code=500, detail="获取告警列表失败")
@router.get("/statistics")
async def get_alert_statistics() -> Dict:
    """Return aggregate alert statistics from the rule engine."""
    try:
        from src.alerting.alert_engine import alert_engine
        return alert_engine.get_alert_statistics()
    except Exception as e:
        logger.error(f"获取告警统计失败: {e}")
        raise HTTPException(status_code=500, detail="获取告警统计失败")
42.6 自动化运维
42.6.1 自动化运维管理器
系统实现了自动化运维管理器,支持日志清理、备份等自动化任务,提高运维效率。
python
# src/operations/automation.py
import os
import shutil
import json
from typing import Dict
from datetime import datetime, timedelta
import logging
import yaml
logger = logging.getLogger(__name__)
class AutomationManager:
    """Automated-operations manager: log cleanup, backups, self-check.

    Fix: adds the ``health_check`` coroutine that the operations API
    (`/operations/health`) calls but which previously did not exist.
    """
    def __init__(self):
        self.logger = logging.getLogger(__name__)
        self.automation_config = self.load_automation_config()
    def load_automation_config(self) -> Dict:
        """Load ``config/automation.yaml``; returns {} when absent/broken."""
        config_path = "config/automation.yaml"
        try:
            if os.path.exists(config_path):
                with open(config_path, 'r', encoding='utf-8') as f:
                    return yaml.safe_load(f) or {}
            else:
                self.logger.warning(f"自动化配置文件未找到: {config_path}")
                return {}
        except Exception as e:
            self.logger.error(f"加载自动化配置失败: {e}")
            return {}
    async def health_check(self) -> Dict:
        """Lightweight self-check used by the operations API."""
        return {
            "task": "health_check",
            "status": "success",
            "config_loaded": bool(self.automation_config),
            "timestamp": datetime.now().isoformat()
        }
    async def cleanup_logs(self, days_to_keep: int = 7) -> Dict:
        """Delete ``.log``/``.log.gz`` files older than *days_to_keep* days.

        Scans a fixed set of log directories; per-file failures are
        logged and skipped so one bad file cannot abort the sweep.
        """
        try:
            log_dirs = ["logs", "logs/archive", "logs/error"]
            deleted_files = []
            total_size_freed = 0
            cutoff_date = datetime.now() - timedelta(days=days_to_keep)
            for log_dir in log_dirs:
                if not os.path.exists(log_dir):
                    continue
                for root, dirs, files in os.walk(log_dir):
                    for file in files:
                        if not file.endswith(('.log', '.log.gz')):
                            continue
                        file_path = os.path.join(root, file)
                        try:
                            file_stat = os.stat(file_path)
                            file_mtime = datetime.fromtimestamp(file_stat.st_mtime)
                            if file_mtime < cutoff_date:
                                file_size = file_stat.st_size
                                os.remove(file_path)
                                deleted_files.append(file_path)
                                total_size_freed += file_size
                        except Exception as e:
                            self.logger.error(f"删除日志文件失败 {file_path}: {e}")
            return {
                "task": "cleanup_logs",
                "status": "success",
                "deleted_files": deleted_files,
                "files_deleted_count": len(deleted_files),
                "space_freed_mb": round(total_size_freed / (1024 * 1024), 2)
            }
        except Exception as e:
            self.logger.error(f"日志清理失败: {e}")
            return {"task": "cleanup_logs", "status": "failed", "error": str(e)}
    async def backup_data(self, backup_type: str = "full") -> Dict:
        """Create a timestamped backup directory with a manifest.

        Skipped (not failed) when ``backup.enabled`` is false/missing in
        the automation config.
        """
        try:
            backup_config = self.automation_config.get("backup", {})
            if not backup_config.get("enabled", False):
                return {
                    "task": "backup_data",
                    "status": "skipped",
                    "reason": "备份功能未启用"
                }
            backup_dir = backup_config.get("backup_dir", "backups")
            os.makedirs(backup_dir, exist_ok=True)
            backup_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            backup_name = f"backup_{backup_type}_{backup_timestamp}"
            backup_path = os.path.join(backup_dir, backup_name)
            os.makedirs(backup_path, exist_ok=True)
            # Back up configuration files first.
            config_backup_result = await self.backup_configs(backup_path)
            # Write a manifest describing what this backup contains.
            manifest = {
                "backup_name": backup_name,
                "backup_type": backup_type,
                "timestamp": datetime.now().isoformat(),
                "components": {
                    "configs": config_backup_result
                }
            }
            manifest_path = os.path.join(backup_path, "manifest.json")
            with open(manifest_path, 'w', encoding='utf-8') as f:
                json.dump(manifest, f, ensure_ascii=False, indent=2)
            return {
                "task": "backup_data",
                "status": "success",
                "backup_name": backup_name,
                "backup_path": backup_path
            }
        except Exception as e:
            self.logger.error(f"数据备份失败: {e}")
            return {"task": "backup_data", "status": "failed", "error": str(e)}
    async def backup_configs(self, backup_path: str) -> Dict:
        """Copy known config files into ``<backup_path>/configs``.

        Missing files are silently skipped (not every deployment has all
        of them).
        """
        try:
            config_backup_dir = os.path.join(backup_path, "configs")
            os.makedirs(config_backup_dir, exist_ok=True)
            config_files = [
                "config/app.yaml",
                "config/database.yaml",
                "config/logging.yaml",
                ".env"
            ]
            copied_files = []
            for config_file in config_files:
                if os.path.exists(config_file):
                    dest_file = os.path.join(config_backup_dir, os.path.basename(config_file))
                    shutil.copy2(config_file, dest_file)
                    copied_files.append(config_file)
            return {
                "status": "success",
                "copied_files": copied_files,
                "files_count": len(copied_files)
            }
        except Exception as e:
            self.logger.error(f"配置文件备份失败: {e}")
            return {"status": "failed", "error": str(e)}
# 全局自动化运维管理器
automation_manager = AutomationManager()
42.6.2 运维API接口
提供了运维操作的API接口,便于远程管理和自动化。
python
# src/api/operations.py
from fastapi import APIRouter, HTTPException, Query, BackgroundTasks
from typing import Dict
import logging
logger = logging.getLogger(__name__)
router = APIRouter(prefix="/operations", tags=["operations"])
@router.post("/cleanup/logs")
async def cleanup_logs(
    days_to_keep: int = Query(7, ge=1, le=30, description="保留日志天数"),
    background_tasks: BackgroundTasks = BackgroundTasks()
) -> Dict:
    """Kick off log cleanup as a background task and return immediately."""
    try:
        from src.operations.automation import automation_manager
        # Run the sweep after the response is sent.
        background_tasks.add_task(automation_manager.cleanup_logs, days_to_keep)
        return {
            "message": "日志清理任务已启动",
            "days_to_keep": days_to_keep,
            "status": "started"
        }
    except Exception as e:
        logger.error(f"启动日志清理任务失败: {e}")
        raise HTTPException(status_code=500, detail="启动日志清理任务失败")
@router.post("/backup")
async def create_backup(
    backup_type: str = Query("full", regex="^(full|config)$", description="备份类型"),
    background_tasks: BackgroundTasks = BackgroundTasks()
) -> Dict:
    """Kick off a backup as a background task and return immediately."""
    try:
        from src.operations.automation import automation_manager
        # Run the backup after the response is sent.
        background_tasks.add_task(automation_manager.backup_data, backup_type)
        return {
            "message": "备份任务已启动",
            "backup_type": backup_type,
            "status": "started"
        }
    except Exception as e:
        logger.error(f"启动备份任务失败: {e}")
        raise HTTPException(status_code=500, detail="启动备份任务失败")
@router.get("/health")
async def operations_health_check() -> Dict:
    """Report the automation manager's own health status."""
    try:
        from src.operations.automation import automation_manager
        return await automation_manager.health_check()
    except Exception as e:
        logger.error(f"运维健康检查失败: {e}")
        raise HTTPException(status_code=500, detail="运维健康检查失败")
42.7 监控与运维最佳实践
42.7.1 监控策略建议
- 分层监控:系统层、应用层、业务层分别设置监控指标
- 合理的告警阈值:根据系统特点和用户场景设置合理的告警阈值
- 告警降噪:设置合理的告警冷却时间,避免告警风暴
- 历史数据保留:保留足够的历史数据用于趋势分析和容量规划
- 定期健康检查:定期运行全面的健康检查,确保系统各组件正常运行
42.7.2 日志管理最佳实践
- 结构化日志:统一使用结构化日志格式,便于机器解析
- 日志级别合理使用:根据日志重要性选择适当的级别
- 日志轮转:配置适当的日志轮转策略,避免日志文件过大
- 敏感信息脱敏:确保日志中不包含敏感信息
- 集中式日志管理:考虑使用ELK、Loki等工具进行日志集中管理和分析
42.7.3 告警管理最佳实践
- 告警分级:根据严重程度对告警进行分级,便于优先级处理
- 告警路由:根据告警类型和级别路由到不同的处理人员或团队
- 告警升级:对于未及时处理的告警设置升级机制
- 告警聚合:相似的告警进行聚合,减少重复告警
- 告警演练:定期进行告警响应演练,确保团队能够快速响应
42.7.4 自动化运维建议
- 自动化部署:实现CI/CD流程,自动化部署和回滚
- 自动化备份:定期自动备份关键数据和配置
- 自动化恢复:实现故障自动恢复机制
- 资源自动扩缩容:根据负载自动调整资源配置
- 定期自动清理:定期清理临时文件、过期日志等
附录
代码仓库地址:
- GitHub:https://github.com/Hommy-master/capcut-mate
- Gitee:https://gitee.com/taohongmin-gitee/capcut-mate
接口文档地址:
- API文档地址:
https://docs.jcaigc.cn