AI 应用监控与运维:确保系统稳定运行
前言
AI 应用的监控与运维是确保系统稳定运行的关键。一个好的监控系统能够及时发现问题、预警风险、保障服务质量。
我在项目中负责过多个 AI 系统的运维工作,对监控指标和运维流程有深入理解。今天分享一些实用的监控和运维经验。
监控指标
性能指标
python
class PerformanceMetrics:
    """Accumulate request counters and derive latency / error-rate figures.

    The instance keeps three running totals (requests seen, summed latency,
    failed requests) and computes averages on demand in ``get_metrics``.
    """

    def __init__(self):
        # All totals start at zero and only ever grow via record_request().
        self.request_count = 0
        self.total_latency = 0
        self.error_count = 0

    def record_request(self, latency: float, success: bool):
        """Fold one finished request into the running totals.

        Args:
            latency: Time taken by the request; added to ``total_latency``.
            success: Falsy values increment ``error_count``.
        """
        self.request_count += 1
        self.total_latency += latency
        if success:
            return
        self.error_count += 1

    def get_metrics(self) -> dict:
        """Return a snapshot of the totals and their per-request averages.

        Returns:
            A dict with ``requests_total``, ``avg_latency`` and ``error_rate``.
            Before any request is recorded both averages evaluate to zero.
        """
        # Divide by at least 1 so an empty recorder does not raise
        # ZeroDivisionError.
        denominator = max(self.request_count, 1)
        snapshot = {"requests_total": self.request_count}
        snapshot["avg_latency"] = self.total_latency / denominator
        snapshot["error_rate"] = self.error_count / denominator
        return snapshot
资源指标
python
class ResourceMetrics:
    """Hold the most recent CPU, memory and GPU utilisation readings.

    Readings are refreshed by calling ``update``; ``get_metrics`` only
    returns whatever the last refresh stored.
    """

    def __init__(self):
        # Everything reads as 0 until the first update() call.
        self.cpu_usage = 0
        self.memory_usage = 0
        self.gpu_usage = 0

    def update(self):
        """Refresh all three readings from the host.

        CPU and memory come from ``psutil``. GPU sampling is best-effort:
        when it is unavailable the GPU reading is reset to 0.
        """
        # Imported lazily so the class can be defined (and get_metrics used)
        # on hosts where psutil is not installed.
        import psutil

        self.cpu_usage = psutil.cpu_percent()
        self.memory_usage = psutil.virtual_memory().percent
        self.gpu_usage = self._read_gpu_usage()

    @staticmethod
    def _read_gpu_usage():
        """Return the first GPU's load as a percentage, or 0 if unavailable."""
        try:
            import GPUtil

            gpus = GPUtil.getGPUs()
            if gpus:
                # Only the first device is sampled. ``load`` is presumably a
                # 0-1 fraction, hence the scaling -- confirm against GPUtil.
                return gpus[0].load * 100
        except Exception:
            # Covers a missing GPUtil package (ImportError) as well as driver
            # or query failures. Unlike a bare ``except:``, this lets
            # KeyboardInterrupt and SystemExit propagate.
            pass
        # No GPU reported or sampling failed: report 0 rather than keeping a
        # stale value from an earlier update().
        return 0

    def get_metrics(self) -> dict:
        """Return the stored readings.

        Returns:
            A dict with ``cpu_usage``, ``memory_usage`` and ``gpu_usage``.
        """
        return {
            "cpu_usage": self.cpu_usage,
            "memory_usage": self.memory_usage,
            "gpu_usage": self.gpu_usage,
        }
日志系统
python
import logging
from pythonjsonlogger import jsonlogger
class StructuredLogger:
    """Thin wrapper around a named ``logging.Logger`` that emits JSON lines.

    Request-scoped fields (request id, prompt length, latency, error text)
    are attached to each record through the ``extra`` mechanism.
    """

    def __init__(self, name: str):
        """Configure the logger called *name* for JSON console output.

        Args:
            name: Logger name passed to ``logging.getLogger``.
        """
        self.logger = logging.getLogger(name)
        self.logger.setLevel(logging.INFO)

        # logging.getLogger() returns the same Logger object for the same
        # name, so creating a second StructuredLogger with that name would
        # otherwise stack another handler and emit every record twice.
        has_json_handler = any(
            isinstance(existing.formatter, jsonlogger.JsonFormatter)
            for existing in self.logger.handlers
        )
        if not has_json_handler:
            # Console handler with a JSON formatter.
            handler = logging.StreamHandler()
            formatter = jsonlogger.JsonFormatter(
                '%(asctime)s %(levelname)s %(message)s %(request_id)s'
            )
            handler.setFormatter(formatter)
            self.logger.addHandler(handler)

    def log_request(self, request_id: str, prompt: str, latency: float):
        """Log a successfully processed request at INFO level.

        Only the prompt's length is recorded, not its text.

        Args:
            request_id: Identifier attached to the record as ``request_id``.
            prompt: Prompt text; its ``len`` is stored as ``prompt_length``.
            latency: Value stored as ``latency``.
        """
        self.logger.info(
            "Request processed",
            extra={
                "request_id": request_id,
                "prompt_length": len(prompt),
                "latency": latency,
            },
        )

    def log_error(self, request_id: str, error: str):
        """Log a failed request at ERROR level.

        Args:
            request_id: Identifier attached to the record as ``request_id``.
            error: Error description stored as ``error``.
        """
        self.logger.error(
            "Request failed",
            extra={
                "request_id": request_id,
                "error": error,
            },
        )
告警系统
python
class AlertingSystem:
    """Compare a metrics dict against fixed thresholds and report breaches."""

    def __init__(self):
        # Upper bounds: a metric strictly above its bound raises an alert.
        # "latency" is compared against the "avg_latency" metric.
        self.thresholds = {
            "error_rate": 0.05,
            "latency": 2.0,
            "cpu_usage": 90
        }

    def check_alerts(self, metrics: dict) -> list:
        """Return one alert dict per threshold that *metrics* exceeds.

        Args:
            metrics: Mapping that may contain ``error_rate``,
                ``avg_latency`` and ``cpu_usage``. Missing keys are treated
                as 0 and therefore never alert.

        Returns:
            A list of ``{"level": ..., "message": ...}`` dicts, ordered
            error rate, latency, CPU. Empty when nothing is breached.
        """
        alerts = []

        error_rate = metrics.get("error_rate", 0)
        if error_rate > self.thresholds["error_rate"]:
            alerts.append({
                "level": "critical",
                "message": f"错误率过高: {error_rate:.2%}"
            })

        avg_latency = metrics.get("avg_latency", 0)
        if avg_latency > self.thresholds["latency"]:
            alerts.append({
                "level": "warning",
                "message": f"延迟过高: {avg_latency:.2f}s"
            })

        # The cpu_usage threshold was configured but never evaluated, so
        # CPU saturation could not trigger an alert.
        cpu_usage = metrics.get("cpu_usage", 0)
        if cpu_usage > self.thresholds["cpu_usage"]:
            alerts.append({
                "level": "warning",
                "message": f"CPU 使用率过高: {cpu_usage:.1f}%"
            })

        return alerts

    def send_alert(self, alert: dict):
        """Emit *alert* to standard output.

        Args:
            alert: Dict with ``level`` and ``message`` keys, as produced by
                ``check_alerts``.
        """
        # Placeholder transport; could be sent to Slack, email, etc.
        print(f"ALERT [{alert['level']}]: {alert['message']}")
自动化运维
python
class AutoScaler:
    """Adjust a replica count one step at a time from latency and error rate.

    The scaler only tracks the desired count in ``current_replicas`` and
    prints each change; it does not talk to any orchestrator.
    """

    # Average latency above this value triggers a scale-up.
    SCALE_UP_LATENCY = 3.0
    # Error rate below this value allows a scale-down.
    SCALE_DOWN_ERROR_RATE = 0.01

    def __init__(self, min_replicas: int = 1, max_replicas: int = 10):
        """Create a scaler bounded by *min_replicas* and *max_replicas*.

        Args:
            min_replicas: Lower bound; scale-down never goes below it.
            max_replicas: Upper bound; scale-up never goes above it.
        """
        self.min_replicas = min_replicas
        self.max_replicas = max_replicas
        # Start from 1 replica, clamped into the configured range, so the
        # scaler cannot begin below min_replicas when the floor is > 1.
        self.current_replicas = min(max(1, min_replicas), max_replicas)

    def scale(self, metrics: dict):
        """Move ``current_replicas`` by at most one step based on *metrics*.

        Args:
            metrics: Mapping that may contain ``avg_latency`` and
                ``error_rate``; missing keys are treated as 0.
        """
        error_rate = metrics.get("error_rate", 0)
        latency = metrics.get("avg_latency", 0)

        if latency > self.SCALE_UP_LATENCY:
            if self.current_replicas < self.max_replicas:
                self.current_replicas += 1
                print(f"扩容到 {self.current_replicas} 个副本")
            # Never shrink while latency is high. Without this early return,
            # a scaler already at max_replicas would fall through and scale
            # DOWN whenever the error rate happened to be low.
            return

        if (
            error_rate < self.SCALE_DOWN_ERROR_RATE
            and self.current_replicas > self.min_replicas
        ):
            self.current_replicas -= 1
            print(f"缩容到 {self.current_replicas} 个副本")
总结
AI 应用监控与运维需要:
- 性能监控:请求数、延迟、错误率
- 资源监控:CPU、内存、GPU 使用情况
- 日志系统:结构化日志记录
- 告警系统:及时发现异常
- 自动化运维:自动扩缩容
关键要点:
- 监控指标要全面
- 日志要结构化便于分析
- 告警阈值要合理设置
- 自动化能减少人工干预