第42章:监控与运维
42.1 监控与运维概述
监控与运维是确保系统稳定运行、快速发现和解决问题的关键环节。剪映小助手作为一个复杂的视频处理系统,需要全面的监控体系和运维策略来保证服务质量。
42.1.1 监控的重要性
在视频编辑自动化系统中,监控系统的价值体现在:
- 故障预防:通过实时监控,在问题发生前发现潜在风险
- 性能优化:识别系统瓶颈,指导性能调优
- 容量规划:基于历史数据预测资源需求
- 用户体验:确保服务可用性和响应速度
- 合规要求:满足服务等级协议(SLA)要求
42.1.2 运维的核心目标
剪映小助手的运维体系围绕以下目标构建:
可靠性(Reliability):确保系统7×24小时稳定运行
可维护性(Maintainability):简化系统维护和更新流程
可观测性(Observability):提供全面的系统状态可见性
自动化(Automation):减少人工干预,提高效率
快速恢复(Fast Recovery):出现问题时快速定位和恢复
42.2 应用监控指标
42.2.1 核心性能指标(KPI)
系统性能指标是评估系统健康状态的基础,包括CPU、内存、磁盘和网络等关键资源的使用情况。
python
# src/monitoring/metrics.py
import time
import psutil
import asyncio
from typing import Dict
from collections import deque
import logging
class SystemMetrics:
    """Collects host-level resource metrics (CPU, memory, disk).

    Keeps a rolling window of the last ``window_size`` samples per
    resource so callers can compute short-term trends.
    """
    def __init__(self, window_size: int = 60):
        # Number of samples retained per metric; deque evicts the oldest.
        self.window_size = window_size
        self.cpu_history = deque(maxlen=window_size)
        self.memory_history = deque(maxlen=window_size)
        self.disk_history = deque(maxlen=window_size)
        self.logger = logging.getLogger(__name__)
    async def collect_metrics(self) -> Dict:
        """Collect one metrics sample.

        Returns a dict with ``timestamp``, ``cpu``, ``memory`` and
        ``disk`` keys, or ``{"error": ...}`` on failure. The blocking
        psutil calls run in a worker thread so the event loop is not
        stalled (``psutil.cpu_percent(interval=0.1)`` sleeps for the
        whole interval).
        """
        try:
            metrics = await asyncio.to_thread(self._sample)
            # Append percentage readings to the rolling windows.
            self.cpu_history.append(metrics["cpu"]["percent"])
            self.memory_history.append(metrics["memory"]["percent"])
            self.disk_history.append(metrics["disk"]["percent"])
            return metrics
        except Exception as e:
            self.logger.error(f"收集系统指标失败: {e}")
            return {"error": str(e)}
    def _sample(self) -> Dict:
        """Blocking metrics snapshot; intended to run off the event loop."""
        return {
            "timestamp": time.time(),
            "cpu": {"percent": psutil.cpu_percent(interval=0.1), "count": psutil.cpu_count(logical=True)},
            "memory": psutil.virtual_memory()._asdict(),
            "disk": psutil.disk_usage('/')._asdict()
        }
# 全局系统指标实例
system_metrics = SystemMetrics()
42.2.2 业务指标监控
业务指标反映了系统的实际使用情况和用户体验,包括请求量、错误率、响应时间和媒体处理任务状态等。
python
# src/monitoring/business_metrics.py
from typing import Dict
import time
import threading
from collections import Counter
import logging
logger = logging.getLogger(__name__)
class BusinessMetrics:
    """Thread-safe collector for request counts, error counts, latencies
    and media-task state counters."""

    # Cap per-endpoint latency samples so memory cannot grow without
    # bound under sustained traffic (previously the lists grew forever).
    MAX_SAMPLES = 1000

    def __init__(self):
        self.request_counter = Counter()   # endpoint -> total requests
        self.error_counter = Counter()     # endpoint -> failed requests
        self.response_times = {}           # endpoint -> recent latency samples
        self.media_tasks = {"pending": 0, "processing": 0, "completed": 0, "failed": 0}
        self.lock = threading.RLock()
    def record_request(self, endpoint: str, success: bool = True, response_time: float = 0):
        """Record one request; latency is stored only when positive."""
        with self.lock:
            self.request_counter[endpoint] += 1
            if not success:
                self.error_counter[endpoint] += 1
            if response_time > 0:
                samples = self.response_times.setdefault(endpoint, [])
                samples.append(response_time)
                # Keep only the most recent MAX_SAMPLES entries.
                if len(samples) > self.MAX_SAMPLES:
                    del samples[:-self.MAX_SAMPLES]
    def update_media_task_status(self, status: str, delta: int = 1):
        """Adjust a media-task state counter; unknown states are ignored."""
        with self.lock:
            if status in self.media_tasks:
                self.media_tasks[status] += delta
    def get_metrics(self) -> Dict:
        """Return a snapshot: totals, error rate (%), top endpoints and
        mean latency per endpoint over the retained samples."""
        with self.lock:
            total_requests = sum(self.request_counter.values())
            total_errors = sum(self.error_counter.values())
            error_rate = (total_errors / total_requests * 100) if total_requests > 0 else 0
            # Mean latency over the (bounded) retained samples.
            avg_response_times = {}
            for endpoint, times in self.response_times.items():
                if times:
                    avg_response_times[endpoint] = sum(times) / len(times)
            return {
                "timestamp": time.time(),
                "total_requests": total_requests,
                "error_rate": error_rate,
                "top_endpoints": dict(self.request_counter.most_common(5)),
                "avg_response_times": avg_response_times,
                "media_tasks": dict(self.media_tasks)
            }
# 全局业务指标实例
business_metrics = BusinessMetrics()
42.2.3 健康度计算
系统健康度是综合评估系统状态的重要指标,通过分析系统资源使用情况和业务性能指标计算得出。
python
# src/monitoring/health.py
from typing import Dict
import time
import logging
logger = logging.getLogger(__name__)
def calculate_system_health(system_metrics: Dict) -> Dict:
    """Derive an overall system health score (0-100) from resource usage.

    Weights: CPU 30%, memory 40%, disk 30%. Status thresholds:
    >=80 healthy, >=60 degraded, otherwise critical.
    Returns ``{"error": ...}`` if the computation fails.
    """
    try:
        weights = {"cpu": 0.3, "memory": 0.4, "disk": 0.3}
        # Per-component health is the unused share of the resource,
        # clamped so usage above 100% cannot go negative.
        component_health = {}
        for name in ("cpu", "memory", "disk"):
            used = system_metrics.get(name, {}).get("percent", 0)
            component_health[name] = 100 - min(used, 100)
        overall_health = sum(component_health[k] * w for k, w in weights.items())
        if overall_health >= 80:
            status = "healthy"
        elif overall_health >= 60:
            status = "degraded"
        else:
            status = "critical"
        return {
            "timestamp": time.time(),
            "overall_health": round(overall_health, 2),
            "status": status,
            "components": {k: round(v, 2) for k, v in component_health.items()}
        }
    except Exception as e:
        logger.error(f"计算系统健康度失败: {e}")
        return {"error": str(e)}
def calculate_application_health(business_metrics: Dict) -> Dict:
    """Score application health (0-100) from the API error rate.

    With fewer than 10 total requests the sample is too small to judge,
    so a neutral score of 90 is reported. Otherwise each 1% of error
    rate subtracts 5 points. Status: >=90 healthy, >=70 degraded,
    else critical. Returns ``{"error": ...}`` on failure.
    """
    try:
        error_rate = business_metrics.get("error_rate", 0)
        total_requests = business_metrics.get("total_requests", 0)
        if total_requests < 10:
            # Not enough traffic for a meaningful error rate.
            health_score = 90
        else:
            health_score = max(0, 100 - error_rate * 5)
        # Map score to a status label via ordered thresholds.
        for threshold, label in ((90, "healthy"), (70, "degraded")):
            if health_score >= threshold:
                status = label
                break
        else:
            status = "critical"
        return {
            "timestamp": time.time(),
            "health_score": round(health_score, 2),
            "status": status,
            "error_rate": error_rate
        }
    except Exception as e:
        logger.error(f"计算应用健康度失败: {e}")
        return {"error": str(e)}
42.3 日志分析系统
42.3.1 结构化日志设计
剪映小助手采用结构化日志格式,便于机器解析和分析,支持JSON格式记录和异常跟踪。
python
# src/logger/structured_logger.py
import logging
import json
import traceback
from typing import Dict
from datetime import datetime
class StructuredFormatter(logging.Formatter):
    """Formatter that renders each log record as one JSON line."""
    def format(self, record: logging.LogRecord) -> str:
        """Serialize *record* to a JSON string (non-ASCII kept as-is)."""
        payload = {
            "timestamp": datetime.utcnow().isoformat(),
            "level": record.levelname,
            "message": record.getMessage(),
            "logger": record.name,
            "module": record.module,
            "line": record.lineno,
        }
        # NOTE(review): logging's ``extra=`` kwarg sets attributes directly
        # on the record, so this branch only fires when a caller explicitly
        # attaches an ``extra`` attribute — confirm that is intended.
        if hasattr(record, 'extra'):
            payload.update(record.extra)
        # Embed structured exception info when present.
        if record.exc_info:
            exc_type, exc_value, _tb = record.exc_info
            payload["exception"] = {
                "type": exc_type.__name__ if exc_type else None,
                "message": str(exc_value) if exc_value else None,
                "traceback": ''.join(traceback.format_exception(*record.exc_info))
            }
        return json.dumps(payload, ensure_ascii=False)
class StructuredLogger:
    """Factory for loggers pre-wired with the structured JSON formatter."""
    @staticmethod
    def get_logger(name: str, level: int = logging.INFO) -> logging.Logger:
        """Return a logger named *name* that emits JSON lines to stderr."""
        configured = logging.getLogger(name)
        configured.setLevel(level)
        # Attach a handler only once; repeated calls reuse the logger.
        if not configured.handlers:
            stream_handler = logging.StreamHandler()
            stream_handler.setFormatter(StructuredFormatter())
            configured.addHandler(stream_handler)
        return configured
# 使用示例
logger = StructuredLogger.get_logger(__name__)
42.3.2 日志收集与分析
系统实现了日志收集和基础分析功能,支持错误日志统计和性能日志分析。
python
# src/logger/log_analyzer.py
import os
import json
import logging
from typing import Dict, List
from datetime import datetime, timedelta
logger = logging.getLogger(__name__)
class LogAnalyzer:
    """Reads structured (JSON-lines) log files and produces summaries."""
    def __init__(self, log_dir: str = "logs"):
        # Directory where log files are expected to live.
        self.log_dir = log_dir
    def parse_log_file(self, file_path: str) -> List[Dict]:
        """Parse one JSON-lines file; malformed lines are skipped."""
        entries: List[Dict] = []
        try:
            with open(file_path, 'r', encoding='utf-8') as fh:
                for raw in fh:
                    try:
                        entries.append(json.loads(raw.strip()))
                    except json.JSONDecodeError:
                        # Tolerate partial writes / non-JSON noise.
                        continue
        except Exception as e:
            logger.error(f"解析日志文件失败 {file_path}: {e}")
        return entries
    def analyze_error_logs(self, hours: int = 24) -> Dict:
        """Summarize ERROR/CRITICAL entries from ``<log_dir>/error.log``.

        Only entries newer than *hours* ago are counted (lexicographic
        compare is valid for same-format ISO-8601 timestamps). Returns
        totals, per-exception-type counts, and the ten most recent errors.
        """
        try:
            cutoff_iso = (datetime.utcnow() - timedelta(hours=hours)).isoformat()
            recent_errors: List[Dict] = []
            type_counts: Dict[str, int] = {}
            error_file = os.path.join(self.log_dir, "error.log")
            if os.path.exists(error_file):
                for entry in self.parse_log_file(error_file):
                    if entry.get("level") not in ("ERROR", "CRITICAL"):
                        continue
                    if entry.get("timestamp") > cutoff_iso:
                        recent_errors.append(entry)
                        exc_type = entry.get("exception", {}).get("type", "Unknown")
                        type_counts[exc_type] = type_counts.get(exc_type, 0) + 1
            return {
                "total_errors": len(recent_errors),
                "error_types": type_counts,
                "recent_errors": recent_errors[-10:] if recent_errors else []
            }
        except Exception as e:
            logger.error(f"分析错误日志失败: {e}")
            return {"error": str(e)}
42.4 健康检查机制
42.4.1 多层次健康检查
系统实现了多层次的健康检查机制,确保全面监控系统状态,包括系统资源、数据库连接和服务可用性等。
python
# src/monitoring/health_check.py
import asyncio
import logging
import time
from typing import Dict
from enum import Enum
logger = logging.getLogger(__name__)
class HealthStatus(Enum):
    """Possible outcomes of a health probe."""
    HEALTHY = "healthy"
    DEGRADED = "degraded"
    CRITICAL = "critical"
    UNKNOWN = "unknown"
class HealthCheck:
    """Base class for async health probes with timeout protection.

    Subclasses implement ``_check_impl`` and return a dict with
    ``status`` (a HealthStatus) and an optional ``message``.
    """
    def __init__(self, name: str, timeout: int = 5):
        self.name = name
        self.timeout = timeout  # seconds before the probe is abandoned
        self.last_status = HealthStatus.UNKNOWN
    async def check(self) -> Dict:
        """Run the probe; a timeout is reported as CRITICAL, not raised."""
        started = time.time()
        try:
            outcome = await asyncio.wait_for(self._check_impl(), timeout=self.timeout)
            # Remember the latest observed status.
            self.last_status = outcome.get("status", HealthStatus.UNKNOWN)
            status_value = outcome.get("status", HealthStatus.UNKNOWN).value
            message = outcome.get("message", "")
        except asyncio.TimeoutError:
            logger.error(f"健康检查 {self.name} 超时")
            status_value = HealthStatus.CRITICAL.value
            message = "Health check timed out"
        return {
            "name": self.name,
            "status": status_value,
            "message": message,
            "duration_ms": round((time.time() - started) * 1000, 2),
            "timestamp": started
        }
    async def _check_impl(self) -> Dict:
        """Probe body; must be overridden by subclasses."""
        raise NotImplementedError("子类必须实现_check_impl方法")
class SystemHealthCheck(HealthCheck):
    """Probe that grades host CPU / memory / disk utilisation."""
    def __init__(self):
        super().__init__("system")
    async def _check_impl(self) -> Dict:
        """Sample resource usage via psutil and map it to a HealthStatus.

        Thresholds: CRITICAL when CPU/memory > 90% or disk > 95%;
        DEGRADED when CPU/memory > 75% or disk > 85%; HEALTHY otherwise.
        Reports UNKNOWN when psutil is not installed.
        """
        try:
            import psutil
            cpu_percent = psutil.cpu_percent(interval=0.1)
            memory_percent = psutil.virtual_memory().percent
            disk_percent = psutil.disk_usage('/').percent
        except ImportError:
            return {"status": HealthStatus.UNKNOWN, "message": "psutil模块未安装"}
        usage = f"CPU: {cpu_percent}%, Memory: {memory_percent}%, Disk: {disk_percent}%"
        if cpu_percent > 90 or memory_percent > 90 or disk_percent > 95:
            return {"status": HealthStatus.CRITICAL, "message": f"资源使用率过高 - {usage}"}
        if cpu_percent > 75 or memory_percent > 75 or disk_percent > 85:
            return {"status": HealthStatus.DEGRADED, "message": f"资源使用率较高 - {usage}"}
        return {"status": HealthStatus.HEALTHY, "message": f"系统资源状态正常 - {usage}"}
class HealthCheckManager:
    """Registry that runs all registered health probes concurrently."""
    def __init__(self):
        self.health_checks: Dict[str, HealthCheck] = {}
        self.logger = logging.getLogger(__name__)
    def register_check(self, health_check: HealthCheck):
        """Add (or replace) a probe, keyed by its name."""
        self.health_checks[health_check.name] = health_check
        self.logger.info(f"已注册健康检查: {health_check.name}")
    async def run_all_checks(self) -> Dict:
        """Run every probe in parallel and aggregate an overall status.

        The overall status is the worst individual result: any CRITICAL
        (or a probe that raised) wins, then DEGRADED, else HEALTHY.
        """
        try:
            probes = list(self.health_checks.values())
            results = await asyncio.gather(*(p.check() for p in probes), return_exceptions=True)
            check_results = {}
            overall_status = HealthStatus.HEALTHY
            for probe, outcome in zip(probes, results):
                if isinstance(outcome, Exception):
                    # The probe itself blew up; report it as critical.
                    check_results[probe.name] = {
                        "status": HealthStatus.CRITICAL.value,
                        "message": f"检查执行失败: {str(outcome)}",
                        "timestamp": time.time()
                    }
                    overall_status = HealthStatus.CRITICAL
                    continue
                check_results[probe.name] = outcome
                # Demote the overall status if this probe is worse.
                if outcome["status"] == HealthStatus.CRITICAL.value:
                    overall_status = HealthStatus.CRITICAL
                elif outcome["status"] == HealthStatus.DEGRADED.value and overall_status != HealthStatus.CRITICAL:
                    overall_status = HealthStatus.DEGRADED
            return {
                "timestamp": time.time(),
                "status": overall_status.value,
                "checks": check_results
            }
        except Exception as e:
            self.logger.error(f"运行健康检查失败: {e}")
            return {"status": HealthStatus.CRITICAL.value, "error": str(e), "timestamp": time.time()}
# 创建全局健康检查管理器实例
health_check_manager = HealthCheckManager()
42.4.2 健康检查API
提供了健康检查的API接口,便于外部系统集成和监控。
python
# src/api/health.py
from fastapi import APIRouter, HTTPException
from typing import Dict
import logging
logger = logging.getLogger(__name__)
router = APIRouter(prefix="/health", tags=["health"])
@router.get("/live")
async def liveness_check() -> Dict:
    """Liveness probe: always 200 while the process is up."""
    return {
        "status": "healthy",
        "service": "capcut-mate",
        "message": "Service is running"
    }
@router.get("/ready")
async def readiness_check() -> Dict:
    """Readiness probe: 503 when any registered check is critical.

    Fix: the 503 raised for a critical result is an HTTPException, which
    the broad ``except Exception`` used to swallow and re-wrap — losing
    the structured ``results`` detail. It is now re-raised untouched.
    """
    try:
        from src.monitoring.health_check import health_check_manager
        results = await health_check_manager.run_all_checks()
        if results["status"] == "critical":
            # Signal "not ready" to the load balancer / orchestrator.
            raise HTTPException(status_code=503, detail=results)
        elif results["status"] == "degraded":
            return {"status": "degraded", "checks": results.get("checks", {})}
        else:
            return results
    except HTTPException:
        # Preserve the structured 503 above instead of re-wrapping it.
        raise
    except Exception as e:
        logger.error(f"就绪检查失败: {e}")
        raise HTTPException(status_code=503, detail=str(e))
@router.get("/checks")
async def get_health_checks() -> Dict:
    """Return the raw result of every registered health check."""
    try:
        from src.monitoring.health_check import health_check_manager
        return await health_check_manager.run_all_checks()
    except Exception as e:
        logger.error(f"获取健康检查结果失败: {e}")
        raise HTTPException(status_code=500, detail=str(e))
42.5 告警机制
42.5.1 告警规则引擎
系统实现了灵活的告警规则引擎,支持多种告警条件和级别,能够根据监控指标自动触发告警。
python
# src/alerting/alert_engine.py
import logging
import time
from dataclasses import dataclass
from enum import Enum
from typing import Dict, List, Optional
logger = logging.getLogger(__name__)
class AlertLevel(Enum):
    """Severity levels for alerts."""
    INFO = "info"
    WARNING = "warning"
    CRITICAL = "critical"
    EMERGENCY = "emergency"
class AlertStatus(Enum):
    """Lifecycle states of an alert."""
    ACTIVE = "active"
    RESOLVED = "resolved"
    ACKNOWLEDGED = "acknowledged"
@dataclass
class AlertRule:
    """Declarative rule: fire ``level`` when ``metric_name`` matches ``condition``."""
    rule_id: str
    name: str
    description: str
    metric_name: str              # dotted path into the metrics dict, e.g. "cpu.percent"
    condition: str                # comparison expression, e.g. ">= 90"
    level: AlertLevel = AlertLevel.WARNING
    enabled: bool = True
    cooldown: int = 300           # 冷却时间(秒); NOTE(review): not yet enforced by the engine
@dataclass
class Alert:
    """A concrete alert instance produced by a rule evaluation."""
    alert_id: str
    rule_id: str
    level: AlertLevel
    status: AlertStatus
    message: str
    metric_name: str
    current_value: float
    threshold_value: float
    start_time: float
class AlertRuleEngine:
    """Evaluates metric snapshots against registered rules and raises alerts.

    Fixes over the previous version: condition parsing handles both
    ">= 90" and ">=90" (the old code had a duplicated ``"> "`` branch
    that made space-less ">" unparseable), thresholds are parsed once
    instead of via a fragile ``split()[1]``, new alerts are recorded in
    ``alert_history`` (previously always empty), and
    ``get_alert_statistics`` — called by the alerts API — now exists.
    """

    # Supported comparison operators, longest first so ">=" wins over ">".
    _OPERATORS = (">=", "<=", ">", "<")

    def __init__(self):
        self.rules: Dict[str, AlertRule] = {}
        self.active_alerts: Dict[str, Alert] = {}
        self.alert_history: List[Alert] = []
        self.register_default_rules()
    def register_default_rules(self):
        """Register the built-in CPU / memory / error-rate rules."""
        default_rules = [
            AlertRule(
                rule_id="high_cpu_usage",
                name="CPU使用率过高",
                description="CPU使用率持续高于85%",
                metric_name="cpu.percent",
                condition=">= 85",
                level=AlertLevel.WARNING
            ),
            AlertRule(
                rule_id="critical_cpu_usage",
                name="CPU使用率严重过高",
                description="CPU使用率持续高于95%",
                metric_name="cpu.percent",
                condition=">= 95",
                level=AlertLevel.CRITICAL
            ),
            AlertRule(
                rule_id="high_memory_usage",
                name="内存使用率过高",
                description="内存使用率持续高于85%",
                metric_name="memory.percent",
                condition=">= 85",
                level=AlertLevel.WARNING
            ),
            AlertRule(
                rule_id="high_error_rate",
                name="错误率过高",
                description="API错误率持续高于5%",
                metric_name="error_rate",
                condition=">= 5",
                level=AlertLevel.WARNING
            )
        ]
        for rule in default_rules:
            self.register_rule(rule)
    def register_rule(self, rule: AlertRule):
        """Register (or replace) an alert rule."""
        self.rules[rule.rule_id] = rule
        logger.info(f"已注册告警规则: {rule.name} ({rule.rule_id})")
    def _parse_condition(self, condition: str):
        """Parse ``"OP THRESHOLD"`` into ``(op, float)`` or None if malformed.

        Tolerates both ``">= 90"`` and ``">=90"``.
        """
        cond = condition.strip()
        for op in self._OPERATORS:
            if cond.startswith(op):
                try:
                    return op, float(cond[len(op):].strip())
                except ValueError:
                    break
        logger.warning(f"无法解析条件: {condition}")
        return None
    def _evaluate_condition(self, current_value: float, condition: str) -> bool:
        """Return True when *current_value* satisfies *condition*."""
        parsed = self._parse_condition(condition)
        if parsed is None:
            return False
        op, threshold = parsed
        if op == ">=":
            return current_value >= threshold
        if op == "<=":
            return current_value <= threshold
        if op == ">":
            return current_value > threshold
        return current_value < threshold
    def evaluate_metrics(self, metrics: Dict) -> List[Alert]:
        """Evaluate all enabled rules; create alerts for newly breached ones.

        A rule that is already firing (present in ``active_alerts``) is
        not duplicated. New alerts are appended to ``alert_history``.
        """
        new_alerts: List[Alert] = []
        current_time = time.time()
        for rule_id, rule in self.rules.items():
            if not rule.enabled:
                continue
            metric_value = self._get_metric_value(metrics, rule.metric_name)
            if metric_value is None:
                continue
            if not self._evaluate_condition(metric_value, rule.condition):
                continue
            alert_key = f"{rule_id}_{rule.metric_name}"
            if alert_key in self.active_alerts:
                continue  # already firing; don't duplicate
            parsed = self._parse_condition(rule.condition)
            threshold = parsed[1] if parsed else 0.0
            alert = Alert(
                alert_id=f"{rule_id}_{int(current_time)}",
                rule_id=rule_id,
                level=rule.level,
                status=AlertStatus.ACTIVE,
                message=f"{rule.description}: 当前值 {metric_value}",
                metric_name=rule.metric_name,
                current_value=metric_value,
                threshold_value=threshold,
                start_time=current_time
            )
            self.active_alerts[alert_key] = alert
            self.alert_history.append(alert)
            new_alerts.append(alert)
            logger.warning(f"生成告警: {alert.message}")
        return new_alerts
    def _get_metric_value(self, metrics: Dict, metric_path: str) -> Optional[float]:
        """Resolve a dotted path in a nested dict; None if missing/non-numeric."""
        try:
            value = metrics
            for part in metric_path.split('.'):
                if isinstance(value, dict) and part in value:
                    value = value[part]
                else:
                    return None
            return float(value)
        except Exception:
            return None
    def get_active_alerts(self) -> List[Alert]:
        """Return all currently firing alerts."""
        return list(self.active_alerts.values())
    def get_alert_history(self, hours: int = 24) -> List[Alert]:
        """Return alerts that started within the last *hours* hours."""
        cutoff_time = time.time() - (hours * 60 * 60)
        return [a for a in self.alert_history if a.start_time >= cutoff_time]
    def get_alert_statistics(self) -> Dict:
        """Aggregate counts by level (used by the /alerts/statistics API)."""
        active = self.get_active_alerts()
        by_level: Dict[str, int] = {}
        for a in active:
            by_level[a.level.value] = by_level.get(a.level.value, 0) + 1
        return {
            "timestamp": time.time(),
            "active_count": len(active),
            "history_count_24h": len(self.get_alert_history(24)),
            "active_by_level": by_level
        }
# 全局告警引擎实例
alert_engine = AlertRuleEngine()
42.5.2 告警通知系统
实现了多渠道的告警通知系统,内置邮件和Webhook两种通知方式,并可通过扩展通知渠道基类接入钉钉等其他渠道。
python
# src/alerting/notification.py
import asyncio
import logging
from typing import Dict
import smtplib
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart
import aiohttp
logger = logging.getLogger(__name__)
class NotificationChannel:
    """Abstract base for alert delivery channels (email, webhook, ...)."""
    def __init__(self, name: str, config: Dict):
        self.name = name      # unique key used by NotificationManager
        self.config = config  # channel-specific settings
    async def send(self, alert) -> bool:
        """Deliver *alert*; subclasses must implement. Returns success flag."""
        raise NotImplementedError("子类必须实现send方法")
    def format_message(self, alert) -> str:
        """Render a human-readable, multi-line summary of *alert*."""
        lines = (
            f"告警ID: {alert.alert_id}",
            f"规则ID: {alert.rule_id}",
            f"级别: {alert.level.value.upper()}",
            f"消息: {alert.message}",
            f"指标: {alert.metric_name}",
            f"当前值: {alert.current_value}",
            f"阈值: {alert.threshold_value}",
        )
        return "\n".join(lines)
class EmailNotificationChannel(NotificationChannel):
    """Delivers alerts via SMTP email.

    Config keys: ``smtp`` (host/port/use_tls/username/password/from)
    and ``recipients`` (list of addresses).
    """
    async def send(self, alert) -> bool:
        """Send the alert by email; returns True on success.

        Fix: the blocking smtplib session now runs in a worker thread
        via ``asyncio.to_thread`` so it no longer stalls the event loop
        (the previous version blocked inside the coroutine).
        """
        try:
            smtp_config = self.config.get("smtp", {})
            # Build the message.
            msg = MIMEMultipart()
            msg['From'] = smtp_config.get("from", "alert@example.com")
            msg['To'] = ", ".join(self.config.get("recipients", []))
            msg['Subject'] = f"[{alert.level.value.upper()}] {alert.message[:50]}"
            msg.attach(MIMEText(self.format_message(alert), 'plain', 'utf-8'))
            # Run the blocking SMTP transaction off the event loop.
            await asyncio.to_thread(self._deliver, msg, smtp_config)
            logger.info(f"邮件通知已发送: {alert.alert_id}")
            return True
        except Exception as e:
            logger.error(f"邮件通知发送失败: {e}")
            return False
    @staticmethod
    def _deliver(msg, smtp_config: Dict) -> None:
        """Blocking SMTP send; intended to run in a worker thread."""
        with smtplib.SMTP(smtp_config.get("host", "localhost"),
                          smtp_config.get("port", 25)) as server:
            if smtp_config.get("use_tls", False):
                server.starttls()
            if smtp_config.get("username") and smtp_config.get("password"):
                server.login(smtp_config["username"], smtp_config["password"])
            server.send_message(msg)
class WebhookNotificationChannel(NotificationChannel):
    """Delivers alerts as JSON POSTs to a configured webhook URL."""
    async def send(self, alert) -> bool:
        """POST the alert payload to ``config["url"]``; True iff HTTP 200.

        NOTE(review): only status 200 counts as success — other 2xx
        codes (201/204) are reported as failures; confirm receivers
        always answer 200. ``timeout=30`` relies on aiohttp accepting a
        bare number; newer aiohttp prefers ``aiohttp.ClientTimeout`` —
        verify against the pinned aiohttp version.
        """
        try:
            webhook_url = self.config.get("url")
            if not webhook_url:
                logger.error("Webhook URL未配置")
                return False
            # Flatten the Alert object into a JSON-serializable payload.
            payload = {
                "alert_id": alert.alert_id,
                "rule_id": alert.rule_id,
                "level": alert.level.value,
                "status": alert.status.value,
                "message": alert.message,
                "current_value": alert.current_value,
                "threshold_value": alert.threshold_value,
                "timestamp": alert.start_time
            }
            # A fresh session per send keeps the channel stateless.
            async with aiohttp.ClientSession() as session:
                async with session.post(webhook_url, json=payload, timeout=30) as response:
                    if response.status == 200:
                        logger.info(f"Webhook通知已发送: {alert.alert_id}")
                        return True
                    else:
                        logger.error(f"Webhook通知发送失败: HTTP {response.status}")
                        return False
        except Exception as e:
            logger.error(f"Webhook通知发送失败: {e}")
            return False
class NotificationManager:
"""通知管理器"""
def __init__(self):
self.channels: Dict[str, NotificationChannel] = {}
def register_channel(self, channel: NotificationChannel):
"""注册通知渠道"""
self.channels[channel.name] = channel
logger.info(f"已注册通知渠道: {channel.name}")
async def send_alert(self, alert, channel_names=None) -> Dict:
"""发送告警通知"""
if channel_names is None:
channel_names = list(self.channels.keys())
results = {}
for channel_name in channel_names:
if channel_name not in self.channels:
results[channel_name] = {"success": False, "error": "渠道未找到"}
continue
try:
channel = self.channels[channel_name]
success = await channel.send(alert)
results[channel_name] = {"success": success}
if not success:
results[channel_name]["error"] = "发送失败"
except Exception as e:
logger.error(f"发送告警通知失败 ({channel_name}): {e}")
results[channel_name] = {"success": False, "error": str(e)}
return results
# 全局通知管理器
notification_manager = NotificationManager()
42.5.3 告警API接口
提供了告警管理的API接口,便于查询和管理告警。
python
# src/api/alerts.py
from fastapi import APIRouter, HTTPException, Query
from typing import Dict, List, Optional
import logging
router = APIRouter(prefix="/alerts", tags=["alerts"])
logger = logging.getLogger(__name__)
@router.get("/")
async def get_alerts(
    status: Optional[str] = Query(None, description="告警状态: active, resolved"),
    level: Optional[str] = Query(None, description="告警级别: info, warning, critical, emergency"),
    hours: int = Query(24, ge=1, le=168, description="时间范围(小时)")
) -> Dict:
    """List alerts, optionally filtered by status and level."""
    try:
        from src.alerting.alert_engine import alert_engine, AlertStatus
        # Select the base population according to the status filter.
        if status == "active":
            selected = alert_engine.get_active_alerts()
        elif status == "resolved":
            selected = [a for a in alert_engine.get_alert_history(hours)
                        if a.status == AlertStatus.RESOLVED]
        else:
            selected = alert_engine.get_active_alerts() + alert_engine.get_alert_history(hours)
        # Optional severity filter.
        if level:
            from src.alerting.alert_engine import AlertLevel
            selected = [a for a in selected if a.level.value == level]
        # Serialize dataclass instances into plain dicts for JSON.
        serialized = [
            {
                "alert_id": a.alert_id,
                "rule_id": a.rule_id,
                "level": a.level.value,
                "status": a.status.value,
                "message": a.message,
                "metric_name": a.metric_name,
                "current_value": a.current_value,
                "threshold_value": a.threshold_value,
                "start_time": a.start_time
            }
            for a in selected
        ]
        return {
            "total": len(serialized),
            "alerts": serialized
        }
    except Exception as e:
        logger.error(f"获取告警列表失败: {e}")
        raise HTTPException(status_code=500, detail="获取告警列表失败")
@router.get("/statistics")
async def get_alert_statistics() -> Dict:
    """Return aggregate alert statistics from the rule engine."""
    try:
        from src.alerting.alert_engine import alert_engine
        return alert_engine.get_alert_statistics()
    except Exception as e:
        logger.error(f"获取告警统计失败: {e}")
        raise HTTPException(status_code=500, detail="获取告警统计失败")
42.6 自动化运维
42.6.1 自动化运维管理器
系统实现了自动化运维管理器,支持日志清理、备份等自动化任务,提高运维效率。
python
# src/operations/automation.py
import os
import shutil
import json
from typing import Dict
from datetime import datetime, timedelta
import logging
import yaml
logger = logging.getLogger(__name__)
class AutomationManager:
    """Automated-operations manager: log cleanup, backups, self-check.

    Fix: adds the ``health_check`` coroutine that the operations API
    (`/operations/health`) calls but which previously did not exist.
    """
    def __init__(self):
        self.logger = logging.getLogger(__name__)
        self.automation_config = self.load_automation_config()
    def load_automation_config(self) -> Dict:
        """Load ``config/automation.yaml``; returns {} when absent/broken."""
        config_path = "config/automation.yaml"
        try:
            if os.path.exists(config_path):
                with open(config_path, 'r', encoding='utf-8') as f:
                    return yaml.safe_load(f) or {}
            else:
                self.logger.warning(f"自动化配置文件未找到: {config_path}")
                return {}
        except Exception as e:
            self.logger.error(f"加载自动化配置失败: {e}")
            return {}
    async def health_check(self) -> Dict:
        """Lightweight self-check used by the operations API."""
        return {
            "task": "health_check",
            "status": "success",
            "config_loaded": bool(self.automation_config),
            "timestamp": datetime.now().isoformat()
        }
    async def cleanup_logs(self, days_to_keep: int = 7) -> Dict:
        """Delete ``.log``/``.log.gz`` files older than *days_to_keep* days.

        Scans a fixed set of log directories; per-file failures are
        logged and skipped so one bad file cannot abort the sweep.
        """
        try:
            log_dirs = ["logs", "logs/archive", "logs/error"]
            deleted_files = []
            total_size_freed = 0
            cutoff_date = datetime.now() - timedelta(days=days_to_keep)
            for log_dir in log_dirs:
                if not os.path.exists(log_dir):
                    continue
                for root, dirs, files in os.walk(log_dir):
                    for file in files:
                        if not file.endswith(('.log', '.log.gz')):
                            continue
                        file_path = os.path.join(root, file)
                        try:
                            file_stat = os.stat(file_path)
                            file_mtime = datetime.fromtimestamp(file_stat.st_mtime)
                            if file_mtime < cutoff_date:
                                file_size = file_stat.st_size
                                os.remove(file_path)
                                deleted_files.append(file_path)
                                total_size_freed += file_size
                        except Exception as e:
                            self.logger.error(f"删除日志文件失败 {file_path}: {e}")
            return {
                "task": "cleanup_logs",
                "status": "success",
                "deleted_files": deleted_files,
                "files_deleted_count": len(deleted_files),
                "space_freed_mb": round(total_size_freed / (1024 * 1024), 2)
            }
        except Exception as e:
            self.logger.error(f"日志清理失败: {e}")
            return {"task": "cleanup_logs", "status": "failed", "error": str(e)}
    async def backup_data(self, backup_type: str = "full") -> Dict:
        """Create a timestamped backup directory with a manifest.

        Skipped (not failed) when ``backup.enabled`` is false/missing in
        the automation config.
        """
        try:
            backup_config = self.automation_config.get("backup", {})
            if not backup_config.get("enabled", False):
                return {
                    "task": "backup_data",
                    "status": "skipped",
                    "reason": "备份功能未启用"
                }
            backup_dir = backup_config.get("backup_dir", "backups")
            os.makedirs(backup_dir, exist_ok=True)
            backup_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            backup_name = f"backup_{backup_type}_{backup_timestamp}"
            backup_path = os.path.join(backup_dir, backup_name)
            os.makedirs(backup_path, exist_ok=True)
            # Back up configuration files first.
            config_backup_result = await self.backup_configs(backup_path)
            # Write a manifest describing what this backup contains.
            manifest = {
                "backup_name": backup_name,
                "backup_type": backup_type,
                "timestamp": datetime.now().isoformat(),
                "components": {
                    "configs": config_backup_result
                }
            }
            manifest_path = os.path.join(backup_path, "manifest.json")
            with open(manifest_path, 'w', encoding='utf-8') as f:
                json.dump(manifest, f, ensure_ascii=False, indent=2)
            return {
                "task": "backup_data",
                "status": "success",
                "backup_name": backup_name,
                "backup_path": backup_path
            }
        except Exception as e:
            self.logger.error(f"数据备份失败: {e}")
            return {"task": "backup_data", "status": "failed", "error": str(e)}
    async def backup_configs(self, backup_path: str) -> Dict:
        """Copy known config files into ``<backup_path>/configs``.

        Missing files are silently skipped (not every deployment has all
        of them).
        """
        try:
            config_backup_dir = os.path.join(backup_path, "configs")
            os.makedirs(config_backup_dir, exist_ok=True)
            config_files = [
                "config/app.yaml",
                "config/database.yaml",
                "config/logging.yaml",
                ".env"
            ]
            copied_files = []
            for config_file in config_files:
                if os.path.exists(config_file):
                    dest_file = os.path.join(config_backup_dir, os.path.basename(config_file))
                    shutil.copy2(config_file, dest_file)
                    copied_files.append(config_file)
            return {
                "status": "success",
                "copied_files": copied_files,
                "files_count": len(copied_files)
            }
        except Exception as e:
            self.logger.error(f"配置文件备份失败: {e}")
            return {"status": "failed", "error": str(e)}
# 全局自动化运维管理器
automation_manager = AutomationManager()
42.6.2 运维API接口
提供了运维操作的API接口,便于远程管理和自动化。
python
# src/api/operations.py
from fastapi import APIRouter, HTTPException, Query, BackgroundTasks
from typing import Dict
import logging
logger = logging.getLogger(__name__)
router = APIRouter(prefix="/operations", tags=["operations"])
@router.post("/cleanup/logs")
async def cleanup_logs(
    days_to_keep: int = Query(7, ge=1, le=30, description="保留日志天数"),
    background_tasks: BackgroundTasks = BackgroundTasks()
) -> Dict:
    """Kick off log cleanup as a background task and return immediately."""
    try:
        from src.operations.automation import automation_manager
        # Run the sweep after the response is sent.
        background_tasks.add_task(automation_manager.cleanup_logs, days_to_keep)
        return {
            "message": "日志清理任务已启动",
            "days_to_keep": days_to_keep,
            "status": "started"
        }
    except Exception as e:
        logger.error(f"启动日志清理任务失败: {e}")
        raise HTTPException(status_code=500, detail="启动日志清理任务失败")
@router.post("/backup")
async def create_backup(
    backup_type: str = Query("full", regex="^(full|config)$", description="备份类型"),
    background_tasks: BackgroundTasks = BackgroundTasks()
) -> Dict:
    """Kick off a backup as a background task and return immediately."""
    try:
        from src.operations.automation import automation_manager
        # Run the backup after the response is sent.
        background_tasks.add_task(automation_manager.backup_data, backup_type)
        return {
            "message": "备份任务已启动",
            "backup_type": backup_type,
            "status": "started"
        }
    except Exception as e:
        logger.error(f"启动备份任务失败: {e}")
        raise HTTPException(status_code=500, detail="启动备份任务失败")
@router.get("/health")
async def operations_health_check() -> Dict:
    """Report the automation manager's own health status."""
    try:
        from src.operations.automation import automation_manager
        return await automation_manager.health_check()
    except Exception as e:
        logger.error(f"运维健康检查失败: {e}")
        raise HTTPException(status_code=500, detail="运维健康检查失败")
42.7 监控与运维最佳实践
42.7.1 监控策略建议
- 分层监控:系统层、应用层、业务层分别设置监控指标
- 合理的告警阈值:根据系统特点和用户场景设置合理的告警阈值
- 告警降噪:设置合理的告警冷却时间,避免告警风暴
- 历史数据保留:保留足够的历史数据用于趋势分析和容量规划
- 定期健康检查:定期运行全面的健康检查,确保系统各组件正常运行
42.7.2 日志管理最佳实践
- 结构化日志:统一使用结构化日志格式,便于机器解析
- 日志级别合理使用:根据日志重要性选择适当的级别
- 日志轮转:配置适当的日志轮转策略,避免日志文件过大
- 敏感信息脱敏:确保日志中不包含敏感信息
- 集中式日志管理:考虑使用ELK、Loki等工具进行日志集中管理和分析
42.7.3 告警管理最佳实践
- 告警分级:根据严重程度对告警进行分级,便于优先级处理
- 告警路由:根据告警类型和级别路由到不同的处理人员或团队
- 告警升级:对于未及时处理的告警设置升级机制
- 告警聚合:相似的告警进行聚合,减少重复告警
- 告警演练:定期进行告警响应演练,确保团队能够快速响应
42.7.4 自动化运维建议
- 自动化部署:实现CI/CD流程,自动化部署和回滚
- 自动化备份:定期自动备份关键数据和配置
- 自动化恢复:实现故障自动恢复机制
- 资源自动扩缩容:根据负载自动调整资源配置
- 定期自动清理:定期清理临时文件、过期日志等
附录
代码仓库地址:
- GitHub:https://github.com/Hommy-master/capcut-mate
- Gitee:https://gitee.com/taohongmin-gitee/capcut-mate
接口文档地址:
- API文档地址:
https://docs.jcaigc.cn