概述
在将 AI 代理应用部署到生产环境之前,需要进行全面的检查和准备。本文提供了一个完整的上线前检查清单、监控告警配置指南和运维最佳实践。
为什么需要生产环境清单?
想象一下,如果你的应用在生产环境中出现问题,但你没有监控、没有日志、没有备份,会是什么情况?
生产环境清单就像飞行员起飞前的检查清单,确保所有关键系统都已就绪,避免在生产环境中出现意外。
上线前检查清单
1. 安全检查
1.1 密钥管理
- [ ] API 密钥不在代码中硬编码
- [ ] 配置文件已添加到 .gitignore
- [ ] 实施了密钥轮换策略
验证方法:
# Check the code for hard-coded keys: search recursively for each pattern,
# skipping the node_modules and bin directories.
for pattern in "sk-" "api_key"; do
  grep -r "$pattern" . --exclude-dir=node_modules --exclude-dir=bin
done
1.2 数据保护
- [ ] 敏感数据已脱敏
- [ ] 实施了访问控制(RBAC)
- [ ] 符合数据保护法规(GDPR、个人信息保护法等)
- [ ] 使用 HTTPS
- [ ] 实施了速率限制
- [ ] 启用了 DDoS 防护
2.1 性能优化
- [ ] 实现了代理复用
- [ ] 优化了提示词长度
- [ ] 配置了并行处理
- [ ] 选择了合适的模型
/// <summary>
/// Runs a fixed sequence of performance measurements and prints each result
/// to the console.
/// </summary>
public class PerformanceBenchmark
{
    /// <summary>
    /// Measures average response time, throughput and cache hit rate, in that
    /// order, writing one console line per measurement.
    /// </summary>
    public async Task RunBenchmark()
    {
        Console.WriteLine("=== 性能基准测试 ===\n");

        // Response time.
        var avgResponseTime = await MeasureAverageResponseTime();
        Console.WriteLine($"平均响应时间: {avgResponseTime:F2} 秒");

        // Concurrency / throughput.
        var throughput = await MeasureThroughput();
        Console.WriteLine($"吞吐量: {throughput:F2} 请求/秒");

        // Cache hit rate.
        var cacheHitRate = await MeasureCacheHitRate();
        Console.WriteLine($"缓存命中率: {cacheHitRate:P}");
    }

    /// <summary>Placeholder: returns 0 until the measurement is implemented.</summary>
    private Task<double> MeasureAverageResponseTime()
    {
        // TODO: implement the measurement logic.
        return Task.FromResult(0d);
    }

    /// <summary>Placeholder: returns 0 until the measurement is implemented.</summary>
    private Task<double> MeasureThroughput()
    {
        // TODO: implement the measurement logic.
        return Task.FromResult(0d);
    }

    /// <summary>Placeholder: returns 0 until the measurement is implemented.</summary>
    private Task<double> MeasureCacheHitRate()
    {
        // TODO: implement the measurement logic.
        return Task.FromResult(0d);
    }
}
性能目标:
- 平均响应时间 < 3 秒
- P95 响应时间 < 5 秒
- 吞吐量 > 10 请求/秒
- 缓存命中率 > 30%
2.2 资源配置
- [ ] 配置了合适的 CPU 和内存
- [ ] 设置了资源限制
3. 可靠性检查
3.1 错误处理
- [ ] 实现了完整的异常捕获
- [ ] 实现了断路器模式
- [ ] 记录了所有错误日志
- [ ] 配置了健康检查
- [ ] 配置了故障转移
健康检查示例:
/// <summary>
/// Probes the AI service by sending a short message through the agent and
/// reporting whether the round trip completed without an exception.
/// </summary>
public class HealthCheck
{
    private readonly ChatCompletionAgent _agent;

    /// <summary>
    /// Creates a health check bound to the agent that will be probed.
    /// </summary>
    /// <param name="agent">Agent used for the probe call.</param>
    /// <exception cref="ArgumentNullException"><paramref name="agent"/> is null.</exception>
    public HealthCheck(ChatCompletionAgent agent)
    {
        // The field is readonly, so without this constructor it stays null
        // and every probe reports a failure.
        _agent = agent ?? throw new ArgumentNullException(nameof(agent));
    }

    /// <summary>
    /// Sends a probe message and converts the outcome into a <see cref="HealthStatus"/>.
    /// </summary>
    /// <returns>
    /// A status with <c>IsHealthy = true</c> when the call completes, or
    /// <c>IsHealthy = false</c> and the exception message when it throws.
    /// </returns>
    public async Task<HealthStatus> CheckHealthAsync()
    {
        var status = new HealthStatus();
        try
        {
            // Check connectivity to the AI service.
            // NOTE(review): AgentThread, AddUserMessageAsync and InvokeAsync are declared
            // outside this file — confirm their signatures against the agent SDK in use.
            var thread = new AgentThread();
            await thread.AddUserMessageAsync("健康检查");
            await _agent.InvokeAsync(thread);

            status.IsHealthy = true;
            status.Message = "服务正常";
        }
        catch (Exception ex)
        {
            // Any failure is reported as an unhealthy status rather than propagated.
            status.IsHealthy = false;
            status.Message = $"服务异常: {ex.Message}";
        }

        return status;
    }
}
/// <summary>Outcome of a single health probe.</summary>
public class HealthStatus
{
/// <summary>True when the probe succeeded, false otherwise.</summary>
public bool IsHealthy { get; set; }
/// <summary>Human-readable description of the probe outcome.</summary>
public string Message { get; set; }
/// <summary>Time of the check; defaults to the UTC time at which this instance is created.</summary>
public DateTime CheckTime { get; set; } = DateTime.UtcNow;
}
4. 监控和日志
4.1 日志配置
- [ ] 配置了结构化日志
- [ ] 不记录敏感信息
- [ ] 日志可以集中查看
/// <summary>
/// Logger that appends one JSON object per entry to a file, skipping entries
/// below a configured minimum level.
/// </summary>
public class ProductionLogger : ILogger
{
    private readonly string _logPath;
    private readonly LogLevel _minLevel;

    /// <summary>Creates a logger writing to <paramref name="logPath"/>.</summary>
    /// <param name="logPath">File that entries are appended to.</param>
    /// <param name="minLevel">Lowest level that is written; defaults to Warning.</param>
    public ProductionLogger(string logPath, LogLevel minLevel = LogLevel.Warning)
    {
        _logPath = logPath;
        _minLevel = minLevel;
    }

    /// <summary>
    /// Serializes the entry (timestamp, level, message, exception text, machine
    /// name, process id) as JSON and appends it as one line to the log file.
    /// </summary>
    /// <param name="level">Severity of the entry.</param>
    /// <param name="message">Log message.</param>
    /// <param name="ex">Optional exception; its full text is included when present.</param>
    public void Log(LogLevel level, string message, Exception ex = null)
    {
        if (level >= _minLevel)
        {
            var entry = new
            {
                Timestamp = DateTime.UtcNow,
                Level = level.ToString(),
                Message = message,
                Exception = ex?.ToString(),
                MachineName = Environment.MachineName,
                ProcessId = Environment.ProcessId
            };

            File.AppendAllText(_logPath, JsonSerializer.Serialize(entry) + Environment.NewLine);
        }
    }
}
/// <summary>
/// Log severity levels, declared in ascending order so that numeric comparison
/// (Debug lowest, Critical highest) can be used for filtering.
/// </summary>
public enum LogLevel
{
Debug,
Info,
Warning,
Error,
Critical
}
4.2 监控指标
- [ ] 配置了性能监控(响应时间、吞吐量)
- [ ] 配置了资源使用监控(CPU、内存、磁盘)
- [ ] 配置了成本监控(API 调用费用)
/// <summary>
/// Collects request counters and a sliding window of recent response times,
/// and produces aggregate <see cref="Metrics"/> snapshots.
/// </summary>
public class MetricsCollector
{
    /// <summary>Number of most recent response times kept for average/P95.</summary>
    private const int MaxResponseTimeSamples = 1000;

    // Dedicated gate for the sample window.
    private readonly object _samplesGate = new();
    private readonly Queue<double> _responseTimes = new();

    private long _totalRequests;
    private long _successfulRequests;
    private long _failedRequests;

    /// <summary>Records the outcome and response time of one request.</summary>
    /// <param name="success">True if the request succeeded.</param>
    /// <param name="responseTime">Response time of the request.</param>
    public void RecordRequest(bool success, double responseTime)
    {
        Interlocked.Increment(ref _totalRequests);
        if (success)
        {
            Interlocked.Increment(ref _successfulRequests);
        }
        else
        {
            Interlocked.Increment(ref _failedRequests);
        }

        lock (_samplesGate)
        {
            _responseTimes.Enqueue(responseTime);

            // Keep only the most recent samples; Dequeue drops the oldest in O(1).
            if (_responseTimes.Count > MaxResponseTimeSamples)
            {
                _responseTimes.Dequeue();
            }
        }
    }

    /// <summary>Builds a snapshot of the counters and response-time statistics.</summary>
    /// <returns>
    /// Totals, success rate (0 when no requests were recorded), and the average
    /// and 95th-percentile response time over the retained samples (0 when empty).
    /// </returns>
    public Metrics GetMetrics()
    {
        double[] samples;
        lock (_samplesGate)
        {
            samples = _responseTimes.ToArray();
        }

        // Each counter is read exactly once so the derived rate uses the same
        // values that are reported. Outcome counters are read before the total:
        // RecordRequest increments the total first, so the total read here is
        // never smaller than the successful count read just before it.
        var successful = Interlocked.Read(ref _successfulRequests);
        var failed = Interlocked.Read(ref _failedRequests);
        var total = Interlocked.Read(ref _totalRequests);

        // Average is taken before sorting to keep the original summation order.
        var average = samples.Length > 0 ? samples.Average() : 0;
        Array.Sort(samples);

        return new Metrics
        {
            TotalRequests = total,
            SuccessfulRequests = successful,
            FailedRequests = failed,
            SuccessRate = total > 0 ? (double)successful / total : 0,
            AverageResponseTime = average,
            P95ResponseTime = CalculatePercentile(samples, 0.95)
        };
    }

    /// <summary>
    /// Nearest-rank percentile over an ascending-sorted array; 0 for an empty array.
    /// </summary>
    private static double CalculatePercentile(double[] sortedValues, double percentile)
    {
        if (sortedValues.Length == 0)
        {
            return 0;
        }

        var index = (int)Math.Ceiling(sortedValues.Length * percentile) - 1;
        return sortedValues[Math.Max(0, index)];
    }
}

/// <summary>Aggregate request metrics snapshot.</summary>
public class Metrics
{
    /// <summary>Total number of recorded requests.</summary>
    public long TotalRequests { get; set; }

    /// <summary>Number of requests recorded as successful.</summary>
    public long SuccessfulRequests { get; set; }

    /// <summary>Number of requests recorded as failed.</summary>
    public long FailedRequests { get; set; }

    /// <summary>Successful requests divided by total requests, in the range 0..1.</summary>
    public double SuccessRate { get; set; }

    /// <summary>Mean response time over the retained samples.</summary>
    public double AverageResponseTime { get; set; }

    /// <summary>95th-percentile response time over the retained samples.</summary>
    public double P95ResponseTime { get; set; }
}
5. 测试检查
5.1 测试覆盖
- [ ] 单元测试覆盖率 > 80%
- [ ] 端到端测试通过
- [ ] 压力测试通过
- [ ] 在类生产环境中测试
- [ ] 测试了高负载场景
6. 文档检查
- [ ] API 文档完整
- [ ] 运维手册完整
- [ ] 用户手册完整(如需要)
1. 配置 Application Insights
using Microsoft.ApplicationInsights;
using Microsoft.ApplicationInsights.Extensibility;
/// <summary>
/// Thin wrapper around an Application Insights <c>TelemetryClient</c> for
/// tracking requests, exceptions, metrics and custom events.
/// </summary>
public class ApplicationInsightsMonitoring
{
    private readonly TelemetryClient _telemetryClient;

    /// <summary>
    /// Creates a telemetry client for the given instrumentation key.
    /// </summary>
    /// <param name="instrumentationKey">Application Insights instrumentation key.</param>
    /// <exception cref="ArgumentNullException"><paramref name="instrumentationKey"/> is null.</exception>
    /// <exception cref="ArgumentException"><paramref name="instrumentationKey"/> is empty or whitespace.</exception>
    public ApplicationInsightsMonitoring(string instrumentationKey)
    {
        if (instrumentationKey == null)
        {
            throw new ArgumentNullException(nameof(instrumentationKey));
        }

        if (string.IsNullOrWhiteSpace(instrumentationKey))
        {
            throw new ArgumentException("Instrumentation key must not be empty.", nameof(instrumentationKey));
        }

        var config = TelemetryConfiguration.CreateDefault();

        // Setting TelemetryConfiguration.InstrumentationKey directly is deprecated in
        // favour of connection strings; a connection string containing only the
        // instrumentation key is the minimal valid form.
        // NOTE(review): requires an SDK version that exposes ConnectionString — confirm.
        config.ConnectionString = $"InstrumentationKey={instrumentationKey}";

        _telemetryClient = new TelemetryClient(config);
    }

    /// <summary>
    /// Tracks a request, reporting response code "200" on success and "500" on failure.
    /// </summary>
    public void TrackRequest(string name, DateTimeOffset startTime, TimeSpan duration, bool success)
    {
        _telemetryClient.TrackRequest(name, startTime, duration,
            success ? "200" : "500", success);
    }

    /// <summary>Tracks an exception.</summary>
    public void TrackException(Exception ex)
    {
        _telemetryClient.TrackException(ex);
    }

    /// <summary>Tracks a named metric value.</summary>
    public void TrackMetric(string name, double value)
    {
        _telemetryClient.TrackMetric(name, value);
    }

    /// <summary>Tracks a custom event with optional string properties.</summary>
    public void TrackEvent(string name, Dictionary<string, string> properties = null)
    {
        _telemetryClient.TrackEvent(name, properties);
    }
}
2. 配置告警规则
告警配置示例:
/// <summary>
/// Default set of alert rules evaluated against a <see cref="Metrics"/> snapshot.
/// </summary>
public class AlertConfiguration
{
    /// <summary>
    /// Rules to evaluate. Defaults to three rules: high error rate, slow P95
    /// response time, and no traffic.
    /// </summary>
    public List<AlertRule> Rules { get; set; } = new()
    {
        new AlertRule
        {
            Name = "高错误率告警",
            // Only meaningful once at least one request exists: a snapshot with no
            // requests reports SuccessRate = 0, which would otherwise raise a false
            // error-rate alert on top of the "no traffic" rule below.
            Condition = metrics => metrics.TotalRequests > 0 && metrics.SuccessRate < 0.95,
            Message = "错误率超过 5%",
            Severity = AlertSeverity.High
        },
        new AlertRule
        {
            Name = "响应时间告警",
            Condition = metrics => metrics.P95ResponseTime > 5.0,
            Message = "P95 响应时间超过 5 秒",
            Severity = AlertSeverity.Medium
        },
        new AlertRule
        {
            Name = "服务不可用告警",
            // Fires when the snapshot contains no requests at all.
            Condition = metrics => metrics.TotalRequests == 0,
            Message = "服务可能不可用",
            Severity = AlertSeverity.Critical
        }
    };
}
/// <summary>A named alert condition with its message and severity.</summary>
public class AlertRule
{
/// <summary>Display name of the rule.</summary>
public string Name { get; set; }
/// <summary>Predicate over a metrics snapshot; returns true when the alert should be raised.</summary>
public Func<Metrics, bool> Condition { get; set; }
/// <summary>Message text attached to alerts raised by this rule.</summary>
public string Message { get; set; }
/// <summary>Severity attached to alerts raised by this rule.</summary>
public AlertSeverity Severity { get; set; }
}
/// <summary>Alert severity levels, declared from lowest (Low) to highest (Critical).</summary>
public enum AlertSeverity
{
Low,
Medium,
High,
Critical
}
告警检查器:
/// <summary>
/// Evaluates every configured alert rule against a metrics snapshot and sends
/// a notification for each rule whose condition holds.
/// </summary>
public class AlertChecker
{
    private readonly AlertConfiguration _config;
    private readonly INotificationService _notificationService;

    /// <summary>Creates a checker over the given rules and notification channel.</summary>
    public AlertChecker(AlertConfiguration config, INotificationService notificationService)
    {
        _config = config;
        _notificationService = notificationService;
    }

    /// <summary>
    /// Walks the rules in order; for each rule that matches <paramref name="metrics"/>,
    /// builds an alert stamped with the current UTC time and awaits its delivery
    /// before moving on to the next rule.
    /// </summary>
    /// <param name="metrics">Snapshot passed to every rule condition.</param>
    public async Task CheckAlertsAsync(Metrics metrics)
    {
        foreach (var candidate in _config.Rules)
        {
            if (!candidate.Condition(metrics))
            {
                continue;
            }

            var alert = new Alert
            {
                RuleName = candidate.Name,
                Message = candidate.Message,
                Severity = candidate.Severity,
                Timestamp = DateTime.UtcNow,
                Metrics = metrics
            };

            await _notificationService.SendAlertAsync(alert);
        }
    }
}
/// <summary>Channel through which alerts are delivered.</summary>
public interface INotificationService
{
/// <summary>Delivers the given alert asynchronously.</summary>
Task SendAlertAsync(Alert alert);
}
/// <summary>A raised alert together with the metrics that accompany it.</summary>
public class Alert
{
/// <summary>Name of the rule that raised the alert.</summary>
public string RuleName { get; set; }
/// <summary>Alert message text.</summary>
public string Message { get; set; }
/// <summary>Severity of the alert.</summary>
public AlertSeverity Severity { get; set; }
/// <summary>Time at which the alert was raised.</summary>
public DateTime Timestamp { get; set; }
/// <summary>Metrics snapshot attached to the alert.</summary>
public Metrics Metrics { get; set; }
}
3. 配置通知渠道
/// <summary>
/// Notification channel intended for e-mail delivery. The current implementation
/// only formats the message and writes the subject line to the console.
/// </summary>
public class EmailNotificationService : INotificationService
{
    private readonly string _smtpServer;
    private readonly string _fromEmail;
    private readonly List<string> _toEmails;

    /// <summary>Stores the SMTP server, sender address and recipient list.</summary>
    public EmailNotificationService(string smtpServer, string fromEmail, List<string> toEmails)
    {
        (_smtpServer, _fromEmail, _toEmails) = (smtpServer, fromEmail, toEmails);
    }

    /// <summary>
    /// Formats the subject and body for <paramref name="alert"/> and logs the subject
    /// to the console. The body is built but not yet sent anywhere.
    /// </summary>
    public async Task SendAlertAsync(Alert alert)
    {
        var subject = $"[{alert.Severity}] {alert.RuleName}";
        var body = BuildBody(alert);

        // The actual mail-sending code goes here.
        Console.WriteLine($"发送告警邮件: {subject}");
        await Task.CompletedTask;
    }

    /// <summary>Renders the alert details and its metrics as the message body.</summary>
    private static string BuildBody(Alert alert)
    {
        // The string content is kept at column 0 because a verbatim string
        // preserves leading whitespace.
        return $@"
告警时间: {alert.Timestamp:yyyy-MM-dd HH:mm:ss}
告警规则: {alert.RuleName}
告警消息: {alert.Message}
当前指标:
- 总请求数: {alert.Metrics.TotalRequests}
- 成功率: {alert.Metrics.SuccessRate:P}
- 平均响应时间: {alert.Metrics.AverageResponseTime:F2} 秒
- P95 响应时间: {alert.Metrics.P95ResponseTime:F2} 秒
";
    }
}
运维最佳实践
1. 部署策略
1.1 蓝绿部署
- 维护两个相同的生产环境(蓝和绿)
- 新版本部署到非活动环境
- 测试通过后切换流量
- 出问题可以快速回滚
1.2 金丝雀发布
- 先将新版本部署到一小部分服务器
- 观察指标,逐步扩大范围
- 发现问题及时回滚
2. 备份和恢复
/// <summary>
/// Backs up and restores application configuration as JSON files under a
/// backup directory.
/// </summary>
public class BackupService
{
    private readonly string _backupPath;

    /// <summary>Creates a service that writes backups under <paramref name="backupPath"/>.</summary>
    public BackupService(string backupPath)
    {
        _backupPath = backupPath;
    }

    /// <summary>
    /// Serializes the current configuration as indented JSON into a new file named
    /// <c>config_backup_{UTC timestamp}.json</c> in the backup directory.
    /// </summary>
    public async Task BackupConfigurationAsync()
    {
        var config = LoadConfiguration();

        // Make sure the target directory exists; writing into a missing directory
        // would fail with DirectoryNotFoundException. No-op when it already exists.
        Directory.CreateDirectory(_backupPath);

        // Invariant culture keeps the file name stable regardless of the machine's
        // calendar and digit settings.
        var timestamp = DateTime.UtcNow.ToString(
            "yyyyMMdd_HHmmss", System.Globalization.CultureInfo.InvariantCulture);
        var backupFile = Path.Combine(_backupPath, $"config_backup_{timestamp}.json");

        await File.WriteAllTextAsync(backupFile,
            JsonSerializer.Serialize(config, new JsonSerializerOptions { WriteIndented = true }));
        Console.WriteLine($"配置已备份到: {backupFile}");
    }

    /// <summary>Backs up application data. Not implemented yet.</summary>
    public async Task BackupDataAsync()
    {
        // TODO: implement the data backup logic.
        await Task.CompletedTask;
    }

    /// <summary>
    /// Reads a backup file and deserializes it into a <see cref="Configuration"/>.
    /// </summary>
    /// <param name="backupFile">Path of the backup file to restore from.</param>
    /// <exception cref="InvalidDataException">The file deserializes to null.</exception>
    public async Task RestoreConfigurationAsync(string backupFile)
    {
        var json = await File.ReadAllTextAsync(backupFile);
        var config = JsonSerializer.Deserialize<Configuration>(json);

        // A JSON "null" document deserializes to null; do not report success for it.
        if (config is null)
        {
            throw new InvalidDataException($"备份文件 {backupFile} 不包含有效的配置");
        }

        // TODO: apply the restored configuration.
        Console.WriteLine($"配置已从 {backupFile} 恢复");
    }

    /// <summary>Loads the current configuration. Placeholder returning an empty instance.</summary>
    private Configuration LoadConfiguration()
    {
        // TODO: real implementation.
        return new Configuration();
    }
}
/// <summary>Application configuration that is serialized to and from backup files.</summary>
public class Configuration
{
// Configuration properties go here.
}
3. 定期维护任务
/// <summary>
/// Periodic maintenance routines: expired-data cleanup, database optimization
/// and a disk-space health check. Progress is reported on the console.
/// </summary>
public class MaintenanceTasks
{
    /// <summary>Removes expired data. The cleanup itself is not implemented yet.</summary>
    public async Task CleanupExpiredDataAsync()
    {
        Console.WriteLine("开始清理过期数据...");

        // Logs older than 30 days are the intended target.
        // Placeholder: the cutoff is computed but not yet consumed by any cleanup logic.
        var cutoffDate = DateTime.UtcNow.AddDays(-30);

        // TODO: implement the cleanup logic.
        Console.WriteLine("过期数据清理完成");
        await Task.CompletedTask;
    }

    /// <summary>Optimizes the database. The optimization itself is not implemented yet.</summary>
    public async Task OptimizeDatabaseAsync()
    {
        Console.WriteLine("开始优化数据库...");

        // TODO: implement the database optimization logic.
        Console.WriteLine("数据库优化完成");
        await Task.CompletedTask;
    }

    /// <summary>
    /// Checks every ready drive and prints a warning when less than 10% of its
    /// capacity is available. Drives that cannot be queried are reported and skipped.
    /// </summary>
    public async Task CheckSystemHealthAsync()
    {
        Console.WriteLine("开始系统健康检查...");

        // Check disk space.
        foreach (var drive in DriveInfo.GetDrives())
        {
            try
            {
                // Zero-size volumes would produce a NaN ratio.
                if (!drive.IsReady || drive.TotalSize <= 0)
                {
                    continue;
                }

                var freeSpacePercent = (double)drive.AvailableFreeSpace / drive.TotalSize;
                if (freeSpacePercent < 0.1)
                {
                    Console.WriteLine($"警告: 磁盘 {drive.Name} 空间不足 ({freeSpacePercent:P})");
                }
            }
            catch (Exception ex) when (ex is IOException || ex is UnauthorizedAccessException)
            {
                // One inaccessible drive must not abort the check of the remaining drives.
                Console.WriteLine($"警告: 无法读取磁盘 {drive.Name} 的空间信息: {ex.Message}");
            }
        }

        Console.WriteLine("系统健康检查完成");
        await Task.CompletedTask;
    }
}
4. 故障排查流程
故障排查清单:
1. 确认问题
   - 收集错误信息
   - 确定影响范围
   - 记录开始时间
2. 查看监控
   - 检查错误率
   - 检查响应时间
   - 检查资源使用
3. 查看日志
   - 查找错误日志
   - 分析错误模式
   - 确定根本原因
4. 采取行动
   - 如果是配置问题,回滚配置
   - 如果是代码问题,回滚版本
   - 如果是资源问题,扩容资源
5. 验证修复
   - 确认问题已解决
   - 监控指标恢复正常
   - 通知相关人员
6. 事后分析
   - 记录故障原因
   - 制定预防措施
   - 更新文档
生产环境检查清单总结
上线前必查项
- [ ] **安全**:密钥管理、数据保护、网络安全
- [ ] **可靠性**:错误处理、容错能力
- [ ] **测试**:测试覆盖、测试环境
上线后必做项
- [ ] **监控**:持续监控关键指标
- [ ] **备份**:定期备份配置和数据
- [ ] **优化**:根据监控数据持续优化
生产环境部署是一个系统工程,关键要点:
- 全面检查:使用清单确保不遗漏
- 监控告警:及时发现和响应问题
- 备份恢复:做好最坏的打算
- 持续优化:根据数据不断改进
- 文档完善:让团队都能快速上手
记住:生产环境的稳定性比新功能更重要。
