下面基于 Spring AI 1.0.0 的能力,给出一个结合 AI 智能检测的 Redis 哨兵切换监控系统的简单实现:
1. 项目依赖配置
xml
<dependencies>
    <!-- Spring Boot web starter -->
    <dependency>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-web</artifactId>
    </dependency>
    <!-- WebSocket support: required by the TextWebSocketHandler used for live updates -->
    <dependency>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-websocket</artifactId>
    </dependency>
    <!-- Spring AI ChatClient API. The former spring-ai-core artifact was split into
         modules before the 1.0.0 release, so there is no spring-ai-core:1.0.0. -->
    <dependency>
        <groupId>org.springframework.ai</groupId>
        <artifactId>spring-ai-client-chat</artifactId>
        <version>1.0.0</version>
    </dependency>
    <!-- Spring AI OpenAI starter (1.0.0 name of spring-ai-openai-spring-boot-starter) -->
    <dependency>
        <groupId>org.springframework.ai</groupId>
        <artifactId>spring-ai-starter-model-openai</artifactId>
        <version>1.0.0</version>
    </dependency>
    <!-- Redis access -->
    <dependency>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-data-redis</artifactId>
    </dependency>
    <!-- Needed for the spring.data.redis.lettuce.pool.* settings to take effect -->
    <dependency>
        <groupId>org.apache.commons</groupId>
        <artifactId>commons-pool2</artifactId>
    </dependency>
    <!-- Spring AI Redis vector store starter: the starter (not the bare
         spring-ai-redis-store library) is what auto-configures the VectorStore bean
         from spring.ai.vectorstore.redis.* -->
    <dependency>
        <groupId>org.springframework.ai</groupId>
        <artifactId>spring-ai-starter-vector-store-redis</artifactId>
        <version>1.0.0</version>
    </dependency>
    <!-- Monitoring and metrics -->
    <dependency>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-actuator</artifactId>
    </dependency>
    <dependency>
        <groupId>io.micrometer</groupId>
        <artifactId>micrometer-registry-prometheus</artifactId>
    </dependency>
    <!-- Lombok: the Java classes use @Slf4j and builder-style value objects -->
    <dependency>
        <groupId>org.projectlombok</groupId>
        <artifactId>lombok</artifactId>
        <optional>true</optional>
    </dependency>
</dependencies>
2. 配置文件
yaml
spring:
  ai:
    openai:
      api-key: ${OPENAI_API_KEY}
      chat:
        options:
          model: gpt-4
          temperature: 0.1
    vectorstore:
      redis:
        # NOTE(review): the Spring AI 1.0.0 Redis vector store auto-configuration appears to
        # take its connection from spring.data.redis.*; confirm whether `uri` is still honoured.
        uri: redis://localhost:6379
        # 1.0.0 property name (older milestones used `index`)
        index-name: "redis-monitoring-vectors"
        prefix: "ai:monitoring:"
        # Schema creation is opt-in; without it the search index is never created
        initialize-schema: true
  data:
    redis:
      sentinel:
        master: mymaster
        nodes:
          - 192.168.1.10:26379
          - 192.168.1.11:26379
          - 192.168.1.12:26379
      timeout: 2000ms
      lettuce:
        pool:
          max-active: 20
          max-idle: 8
          min-idle: 2

# AI monitoring settings
redis:
  ai:
    monitoring:
      enabled: true
      analysis-interval: 30s
      anomaly-threshold: 0.8
      prediction-window: 300s
      alert-threshold: 0.9

management:
  endpoints:
    web:
      exposure:
        include: health,metrics,prometheus
  # Spring Boot 3 location of the former management.metrics.export.prometheus.enabled
  prometheus:
    metrics:
      export:
        enabled: true
3. AI智能监控核心实现
java
/**
 * Scheduled service that collects Redis metrics, asks a chat model to assess cluster
 * health, runs anomaly prediction, and passes both results on to storage and alert handling.
 *
 * <p>Several helpers called here (the individual metric probes,
 * {@code storeAnalysisResults}, {@code handleAlertsAndAutoRemediation},
 * {@code createFallbackAssessment}, {@code parseIssues}, {@code parseRecommendations})
 * are not part of this excerpt.
 */
@Component
@Slf4j
public class AIRedisMonitoringService {

    /** Shared JSON reader; only used for {@code readTree}, so one instance is reused. */
    private static final ObjectMapper JSON_MAPPER = new ObjectMapper();

    private final ChatClient chatClient;
    private final RedisTemplate<String, Object> redisTemplate;
    private final VectorStore vectorStore;
    private final MeterRegistry meterRegistry;
    private final RedisAnomalyPredictor anomalyPredictor;

    public AIRedisMonitoringService(ChatClient.Builder chatClientBuilder,
                                    RedisTemplate<String, Object> redisTemplate,
                                    VectorStore vectorStore,
                                    MeterRegistry meterRegistry,
                                    RedisAnomalyPredictor anomalyPredictor) {
        this.chatClient = chatClientBuilder.build();
        this.redisTemplate = redisTemplate;
        this.vectorStore = vectorStore;
        this.meterRegistry = meterRegistry;
        this.anomalyPredictor = anomalyPredictor;
    }

    /**
     * Runs one AI-driven health analysis cycle, 30 seconds after the previous one finished.
     *
     * <p>Any exception is logged and counted in the {@code redis.ai.analysis.error}
     * counter; it is never propagated to the scheduler.
     */
    @Scheduled(fixedDelay = 30000)
    public void performAIHealthAnalysis() {
        try {
            // 1. Collect Redis metrics
            RedisMetrics metrics = collectRedisMetrics();
            // 2. Let the chat model assess the current state
            HealthAssessment assessment = analyzeHealthWithAI(metrics);
            // 3. Predict anomalies
            AnomalyPrediction prediction = anomalyPredictor.predictAnomaly(metrics);
            // 4. Persist the analysis results in the vector store
            storeAnalysisResults(assessment, prediction, metrics);
            // 5. Raise alerts and trigger automatic remediation
            handleAlertsAndAutoRemediation(assessment, prediction);
            log.debug("AI健康分析完成 - 状态: {}, 风险等级: {}",
                    assessment.getStatus(), assessment.getRiskLevel());
        } catch (Exception e) {
            log.error("AI健康分析失败", e);
            meterRegistry.counter("redis.ai.analysis.error").increment();
        }
    }

    /** Takes a snapshot of the monitored Redis metrics, stamped with the current time. */
    private RedisMetrics collectRedisMetrics() {
        return RedisMetrics.builder()
                .responseTime(measureResponseTime())
                .connectionCount(getConnectionCount())
                .memoryUsage(getMemoryUsage())
                .replicationLag(getReplicationLag())
                .sentinelStatus(getSentinelStatus())
                .errorRate(getErrorRate())
                .throughput(getThroughput())
                .timestamp(Instant.now())
                .build();
    }

    /**
     * Sends the metrics to the chat model and parses its JSON answer.
     *
     * <p>The prompt spells out the exact field names that {@link #parseHealthAssessment}
     * reads, so the model's answer and the parser agree on the schema.
     *
     * @param metrics the metrics snapshot to assess
     * @return the parsed assessment, or the fallback assessment when the model call or
     *         the parsing fails
     */
    private HealthAssessment analyzeHealthWithAI(RedisMetrics metrics) {
        String prompt = String.format("""
                作为Redis专家,请分析以下Redis集群指标并评估健康状态:
                指标数据:
                - 响应时间: %d ms
                - 连接数: %d
                - 内存使用率: %.2f%%
                - 复制延迟: %d ms
                - 哨兵状态: %s
                - 错误率: %.4f%%
                - 吞吐量: %d ops/sec
                请提供:
                1. 整体健康状态评级 (HEALTHY/WARNING/CRITICAL)
                2. 风险等级 (LOW/MEDIUM/HIGH/CRITICAL)
                3. 关键问题识别
                4. 具体的改进建议
                5. 是否需要立即干预
                请仅返回一个JSON对象(不要使用Markdown代码块,也不要附加其他文字),且必须包含以下字段:
                - status: HEALTHY/WARNING/CRITICAL
                - riskLevel: LOW/MEDIUM/HIGH/CRITICAL
                - issues: 关键问题列表
                - recommendations: 改进建议列表
                - requiresImmediateAction: true 或 false
                - confidence: 0到1之间的置信度
                """,
                metrics.getResponseTime(),
                metrics.getConnectionCount(),
                metrics.getMemoryUsage(),
                metrics.getReplicationLag(),
                metrics.getSentinelStatus(),
                metrics.getErrorRate(),
                metrics.getThroughput()
        );
        try {
            String response = chatClient.prompt()
                    .user(prompt)
                    .call()
                    .content();
            return parseHealthAssessment(response);
        } catch (Exception e) {
            log.error("AI分析失败,使用默认评估", e);
            return createFallbackAssessment(metrics);
        }
    }

    /**
     * Converts the model's JSON answer into a {@code HealthAssessment}.
     *
     * @param aiResponse raw text returned by the chat model
     * @return the parsed assessment, stamped with the current time
     * @throws AIAnalysisException if the text is not valid JSON, a required field is
     *         absent, or an enum value is unknown
     */
    private HealthAssessment parseHealthAssessment(String aiResponse) {
        try {
            JsonNode jsonResponse = JSON_MAPPER.readTree(extractJson(aiResponse));
            return HealthAssessment.builder()
                    .status(HealthStatus.valueOf(enumText(jsonResponse, "status")))
                    .riskLevel(RiskLevel.valueOf(enumText(jsonResponse, "riskLevel")))
                    .issues(parseIssues(jsonResponse.get("issues")))
                    .recommendations(parseRecommendations(jsonResponse.get("recommendations")))
                    .requiresImmediateAction(
                            requiredField(jsonResponse, "requiresImmediateAction").asBoolean())
                    .confidence(requiredField(jsonResponse, "confidence").asDouble())
                    .timestamp(Instant.now())
                    .build();
        } catch (Exception e) {
            log.error("解析AI响应失败", e);
            throw new AIAnalysisException("AI响应解析失败", e);
        }
    }

    /**
     * Cuts the outermost {@code {...}} span out of the model's answer so that Markdown
     * code fences or explanatory text around the JSON object do not break parsing.
     * If no such span exists the trimmed text is returned unchanged and the JSON parser
     * reports the error.
     */
    private static String extractJson(String aiResponse) {
        if (aiResponse == null) {
            throw new IllegalArgumentException("AI响应为空");
        }
        String text = aiResponse.strip();
        int start = text.indexOf('{');
        int end = text.lastIndexOf('}');
        if (start < 0 || end < start) {
            return text;
        }
        return text.substring(start, end + 1);
    }

    /** Returns the named field, failing with a descriptive message when it is absent. */
    private static JsonNode requiredField(JsonNode json, String name) {
        JsonNode value = json.get(name);
        if (value == null) {
            throw new IllegalArgumentException("AI响应缺少字段: " + name);
        }
        return value;
    }

    /** Reads a required field as an enum constant name, tolerating case and padding. */
    private static String enumText(JsonNode json, String name) {
        return requiredField(json, name).asText().strip().toUpperCase(java.util.Locale.ROOT);
    }
}
4. AI异常预测组件
java
/**
 * Predicts upcoming Redis anomalies by combining the current metrics, a computed trend,
 * similar historical scenarios from the vector store, and a chat-model analysis.
 *
 * <p>{@code updateHistoricalData}, {@code calculateTrend}, {@code storePredictionResult},
 * {@code parsePredictionResponse} and {@code createFallbackPrediction} are not part of
 * this excerpt.
 */
@Component
@Slf4j
public class RedisAnomalyPredictor {
    private final VectorStore vectorStore;
    private final ChatClient chatClient;
    // NOTE(review): plain ArrayList; how it is bounded and whether it is touched from
    // more than one thread depends on updateHistoricalData, which is not shown here.
    private final List<RedisMetrics> historicalData = new ArrayList<>();

    public RedisAnomalyPredictor(VectorStore vectorStore, ChatClient.Builder chatClientBuilder) {
        this.vectorStore = vectorStore;
        this.chatClient = chatClientBuilder.build();
    }

    /**
     * Predicts anomalies from historical data and the chat model.
     *
     * @param currentMetrics the latest metrics snapshot
     * @return the model's prediction, or the fallback prediction if any step throws
     */
    public AnomalyPrediction predictAnomaly(RedisMetrics currentMetrics) {
        try {
            // 1. Update the historical data
            updateHistoricalData(currentMetrics);
            // 2. Feature engineering: compute trends and patterns
            MetricsTrend trend = calculateTrend();
            // 3. Vector similarity search for comparable past scenarios
            List<Document> similarScenarios = findSimilarScenarios(currentMetrics);
            // 4. AI prediction
            AnomalyPrediction prediction = performAIPrediction(currentMetrics, trend, similarScenarios);
            // 5. Persist the prediction
            storePredictionResult(prediction);
            return prediction;
        } catch (Exception e) {
            log.error("异常预测失败", e);
            return createFallbackPrediction();
        }
    }

    /**
     * Looks up at most five stored scenarios whose similarity to the current metrics is
     * at least 0.7.
     *
     * <p>Uses the {@code SearchRequest} builder API of Spring AI 1.0.0; the older
     * {@code SearchRequest.query(..).withTopK(..)} style is not available in that version.
     */
    private List<Document> findSimilarScenarios(RedisMetrics metrics) {
        // Textual representation of the current metrics, used as the search query
        String metricsText = String.format(
                "Redis指标: 响应时间=%dms 内存使用=%.2f%% 复制延迟=%dms 错误率=%.4f%%",
                metrics.getResponseTime(), metrics.getMemoryUsage(),
                metrics.getReplicationLag(), metrics.getErrorRate()
        );
        SearchRequest request = SearchRequest.builder()
                .query(metricsText)
                .topK(5)
                .similarityThreshold(0.7)
                .build();
        return vectorStore.similaritySearch(request);
    }

    /** Builds the prediction prompt, calls the chat model and parses its answer. */
    private AnomalyPrediction performAIPrediction(RedisMetrics currentMetrics,
                                                  MetricsTrend trend,
                                                  List<Document> similarScenarios) {
        String prompt = createPredictionPrompt(currentMetrics, trend, similarScenarios);
        String response = chatClient.prompt()
                .user(prompt)
                .call()
                .content();
        return parsePredictionResponse(response);
    }

    /**
     * Assembles the prompt: current metrics with their trends, the similar historical
     * scenarios (section omitted when there are none), and the expected JSON fields.
     */
    private String createPredictionPrompt(RedisMetrics metrics, MetricsTrend trend,
                                          List<Document> scenarios) {
        StringBuilder promptBuilder = new StringBuilder();
        promptBuilder.append("""
                作为Redis性能专家,请基于以下信息预测Redis集群在未来5分钟内的异常风险:
                当前指标:
                - 响应时间: %d ms (趋势: %s)
                - 内存使用: %.2f%% (趋势: %s)
                - 复制延迟: %d ms (趋势: %s)
                - 错误率: %.4f%% (趋势: %s)
                - 连接数: %d (趋势: %s)
                """.formatted(
                metrics.getResponseTime(), trend.getResponseTimeTrend(),
                metrics.getMemoryUsage(), trend.getMemoryUsageTrend(),
                metrics.getReplicationLag(), trend.getReplicationLagTrend(),
                metrics.getErrorRate(), trend.getErrorRateTrend(),
                metrics.getConnectionCount(), trend.getConnectionTrend()
        ));
        // A null search result is treated like an empty one
        if (scenarios != null && !scenarios.isEmpty()) {
            promptBuilder.append("历史相似场景:\n");
            // Document#getText is the 1.0.0 accessor for the document body
            scenarios.forEach(doc ->
                    promptBuilder.append("- ").append(doc.getText()).append("\n")
            );
        }
        promptBuilder.append("""
                请预测并返回JSON格式结果,包含:
                1. anomalyProbability: 异常概率 (0-1)
                2. predictedIssues: 可能出现的问题列表
                3. timeToAnomaly: 预计多久后出现异常(秒)
                4. recommendedActions: 建议的预防措施
                5. confidence: 预测置信度 (0-1)
                """);
        return promptBuilder.toString();
    }
}
5. 智能故障自动修复
java
/**
 * Asks a chat model for a remediation plan and executes its actions against Redis.
 *
 * <p>{@code shouldPerformAutoRemediation}, {@code isSafeToExecute}, {@code rollbackActions},
 * {@code parseRemediationPlan}, {@code formatIssues}, {@code formatPrediction},
 * {@code isSafeConfigKey} and {@code executeConnectionLimitAdjustment} are not part of
 * this excerpt.
 */
@Component
@Slf4j
public class AIAutoRemediationService {
    private final ChatClient chatClient;
    private final RedisTemplate<String, Object> redisTemplate;
    // NOTE(review): raw type and not used in the visible code; confirm it is needed and
    // that injection is unambiguous alongside the RedisTemplate bean.
    private final RedisOperations redisOperations;

    public AIAutoRemediationService(ChatClient.Builder chatClientBuilder,
                                    RedisTemplate<String, Object> redisTemplate,
                                    RedisOperations redisOperations) {
        this.chatClient = chatClientBuilder.build();
        this.redisTemplate = redisTemplate;
        this.redisOperations = redisOperations;
    }

    /**
     * Generates a remediation plan with the chat model and executes it.
     *
     * <p>A failing non-critical action is logged and skipped. A failing critical action
     * rolls back the actions executed so far and ends the run.
     *
     * @param assessment the latest health assessment
     * @param prediction the latest anomaly prediction
     * @return skipped, rejected, failed or success, depending on the checks and the run
     */
    public RemediationResult performAutoRemediation(HealthAssessment assessment,
                                                    AnomalyPrediction prediction) {
        if (!shouldPerformAutoRemediation(assessment, prediction)) {
            return RemediationResult.skipped("不满足自动修复条件");
        }
        try {
            // 1. Let the model generate the remediation plan
            RemediationPlan plan = generateRemediationPlan(assessment, prediction);
            // 2. Safety check
            if (!isSafeToExecute(plan)) {
                return RemediationResult.rejected("修复方案安全检查未通过");
            }
            // 3. Execute the remediation actions
            List<RemediationAction> executedActions = new ArrayList<>();
            for (RemediationAction action : plan.getActions()) {
                try {
                    executeRemediationAction(action);
                    executedActions.add(action);
                    log.info("修复操作执行成功: {}", action.getDescription());
                } catch (Exception e) {
                    log.error("修复操作执行失败: {}", action.getDescription(), e);
                    // A failed critical action undoes everything executed so far
                    if (action.isCritical()) {
                        rollbackActions(executedActions);
                        return RemediationResult.failed("关键修复操作失败,已回滚");
                    }
                }
            }
            return RemediationResult.success(executedActions);
        } catch (Exception e) {
            log.error("自动修复过程失败", e);
            return RemediationResult.failed("自动修复执行异常: " + e.getMessage());
        }
    }

    /** Prompts the chat model for a JSON remediation plan and parses the answer. */
    private RemediationPlan generateRemediationPlan(HealthAssessment assessment,
                                                    AnomalyPrediction prediction) {
        String prompt = String.format("""
                作为Redis运维专家,请为以下问题生成自动修复方案:
                当前问题:
                %s
                预测风险:
                %s
                请生成安全的自动修复方案,返回JSON格式:
                {
                "actions": [
                {
                "type": "REDIS_CONFIG|MEMORY_CLEANUP|CONNECTION_LIMIT|FAILOVER",
                "description": "操作描述",
                "command": "具体执行命令",
                "critical": true/false,
                "estimatedImpact": "预计影响",
                "rollbackCommand": "回滚命令"
                }
                ],
                "estimatedDuration": "预计修复时间(秒)",
                "riskLevel": "LOW|MEDIUM|HIGH"
                }
                注意:只生成低风险的自动修复操作,高风险操作需要人工干预。
                """,
                formatIssues(assessment.getIssues()),
                formatPrediction(prediction)
        );
        String response = chatClient.prompt()
                .user(prompt)
                .call()
                .content();
        return parseRemediationPlan(response);
    }

    /**
     * Dispatches one action to its executor.
     *
     * @throws UnsupportedOperationException for an action type without an executor
     */
    private void executeRemediationAction(RemediationAction action) {
        switch (action.getType()) {
            case REDIS_CONFIG -> executeRedisConfigChange(action);
            case MEMORY_CLEANUP -> executeMemoryCleanup(action);
            case CONNECTION_LIMIT -> executeConnectionLimitAdjustment(action);
            case FAILOVER -> executeManagedFailover(action);
            default -> throw new UnsupportedOperationException(
                    "不支持的修复操作类型: " + action.getType());
        }
    }

    /**
     * Applies a model-generated {@code CONFIG SET <key> <value>} command.
     *
     * <p>The command text comes from the model and is treated as untrusted: it must
     * consist of exactly four whitespace-separated tokens and the key must pass
     * {@code isSafeConfigKey}. Anything else is rejected with an exception, so the
     * caller records the action as failed instead of logging a success for a no-op.
     *
     * @throws IllegalArgumentException if the command is missing, malformed, or targets
     *         a key that is not allowed
     */
    private void executeRedisConfigChange(RemediationAction action) {
        String command = action.getCommand();
        if (command == null) {
            throw new IllegalArgumentException("修复命令为空");
        }
        String[] configParts = command.strip().split("\\s+");
        boolean wellFormed = configParts.length == 4
                && "CONFIG".equalsIgnoreCase(configParts[0])
                && "SET".equalsIgnoreCase(configParts[1]);
        if (!wellFormed) {
            throw new IllegalArgumentException("无效的CONFIG SET命令: " + command);
        }
        String configKey = configParts[2];
        String configValue = configParts[3];
        // Only whitelisted configuration keys may be changed automatically
        if (!isSafeConfigKey(configKey)) {
            throw new IllegalArgumentException("不允许自动调整的配置项: " + configKey);
        }
        redisTemplate.execute((RedisCallback<Object>) connection -> {
            connection.serverCommands().setConfig(configKey, configValue);
            return null;
        });
        log.info("Redis配置已更新: {} = {}", configKey, configValue);
    }

    /**
     * Runs {@code MEMORY PURGE} through a server-side script. The command asks the
     * allocator to release reclaimable memory; it does not delete keys.
     */
    private void executeMemoryCleanup(RemediationAction action) {
        byte[] script = "return redis.call('MEMORY', 'PURGE')"
                .getBytes(java.nio.charset.StandardCharsets.UTF_8);
        redisTemplate.execute((RedisCallback<Object>) connection -> {
            // MEMORY PURGE answers with a status reply, hence ReturnType.STATUS
            connection.scriptingCommands().eval(script, ReturnType.STATUS, 0);
            return null;
        });
        log.info("Redis内存清理完成");
    }

    /**
     * Placeholder for a controlled failover: it only logs a warning and issues no Redis
     * command, so no failover actually takes place yet.
     */
    private void executeManagedFailover(RemediationAction action) {
        log.warn("执行受控故障转移");
        redisTemplate.execute((RedisCallback<Object>) connection -> {
            // More elaborate failover logic can go here, e.g. check the replicas'
            // state and then issue SENTINEL FAILOVER
            return null;
        });
    }
}
6. AI监控仪表板控制器
java
/**
 * REST endpoints of the AI monitoring dashboard: current status, manual analysis,
 * user-confirmed remediation and generated insights.
 */
@RestController
@RequestMapping("/api/redis/ai-monitoring")
@Slf4j
public class AIMonitoringController {
    private final AIRedisMonitoringService monitoringService;
    private final RedisAnomalyPredictor anomalyPredictor;
    private final AIAutoRemediationService remediationService;

    /** Constructor injection, matching the other components; the final fields need it. */
    public AIMonitoringController(AIRedisMonitoringService monitoringService,
                                  RedisAnomalyPredictor anomalyPredictor,
                                  AIAutoRemediationService remediationService) {
        this.monitoringService = monitoringService;
        this.anomalyPredictor = anomalyPredictor;
        this.remediationService = remediationService;
    }

    /**
     * Returns the current metrics together with the latest assessment and prediction.
     *
     * @return 200 with the status, or 500 without a body if gathering it fails
     */
    @GetMapping("/status")
    public ResponseEntity<AIMonitoringStatus> getCurrentStatus() {
        try {
            RedisMetrics metrics = monitoringService.getCurrentMetrics();
            HealthAssessment assessment = monitoringService.getLatestAssessment();
            AnomalyPrediction prediction = anomalyPredictor.getLatestPrediction();
            AIMonitoringStatus status = AIMonitoringStatus.builder()
                    .metrics(metrics)
                    .healthAssessment(assessment)
                    .anomalyPrediction(prediction)
                    .timestamp(Instant.now())
                    .build();
            return ResponseEntity.ok(status);
        } catch (Exception e) {
            log.error("获取AI监控状态失败", e);
            return ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR).build();
        }
    }

    /**
     * Runs an analysis on demand.
     *
     * @return 200 with the result, or 500 with an error result if the analysis fails
     */
    @PostMapping("/analyze")
    public ResponseEntity<AnalysisResult> performManualAnalysis(@RequestBody AnalysisRequest request) {
        try {
            AnalysisResult result = monitoringService.performManualAnalysis(request);
            return ResponseEntity.ok(result);
        } catch (Exception e) {
            log.error("手动分析失败", e);
            return ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR)
                    .body(AnalysisResult.error("分析失败: " + e.getMessage()));
        }
    }

    /**
     * Triggers remediation based on the latest assessment and prediction.
     *
     * @return 400 if the request is not confirmed, 409 if no analysis result is
     *         available yet, 200 with the remediation result otherwise, or 500 on error
     */
    @PostMapping("/remediation/trigger")
    public ResponseEntity<RemediationResult> triggerRemediation(@RequestBody RemediationRequest request) {
        try {
            if (!request.isConfirmed()) {
                return ResponseEntity.badRequest()
                        .body(RemediationResult.rejected("需要用户确认"));
            }
            HealthAssessment assessment = monitoringService.getLatestAssessment();
            AnomalyPrediction prediction = anomalyPredictor.getLatestPrediction();
            // Remediation dereferences both inputs, so a missing one is reported up front
            if (assessment == null || prediction == null) {
                return ResponseEntity.status(HttpStatus.CONFLICT)
                        .body(RemediationResult.rejected("暂无可用的分析结果,请先执行分析"));
            }
            RemediationResult result = remediationService.performAutoRemediation(assessment, prediction);
            return ResponseEntity.ok(result);
        } catch (Exception e) {
            log.error("触发自动修复失败", e);
            return ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR)
                    .body(RemediationResult.failed("修复触发失败: " + e.getMessage()));
        }
    }

    /**
     * Returns insights generated for the trailing time window.
     *
     * @param hours window length in hours (default 24); must be at least 1
     * @return 200 with the insights, 400 for a non-positive window, or 500 on error
     */
    @GetMapping("/insights")
    public ResponseEntity<List<AIInsight>> getAIInsights(@RequestParam(defaultValue = "24") int hours) {
        if (hours < 1) {
            return ResponseEntity.badRequest().build();
        }
        try {
            List<AIInsight> insights = monitoringService.generateInsights(Duration.ofHours(hours));
            return ResponseEntity.ok(insights);
        } catch (Exception e) {
            log.error("获取AI洞察失败", e);
            return ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR).build();
        }
    }
}
7. 实时AI分析WebSocket
java
/**
 * WebSocket handler that pushes analysis results and anomaly alerts to every connected
 * client as JSON text messages.
 *
 * <p>{@code sendInitialStatus} is not part of this excerpt.
 */
@Component
@Slf4j
public class AIMonitoringWebSocketHandler extends TextWebSocketHandler {
    private final AIRedisMonitoringService monitoringService;
    // The injected mapper is used instead of new ObjectMapper(): a bare mapper has no
    // java.time support and cannot serialise the Instant timestamp of an update.
    private final ObjectMapper objectMapper;
    private final Set<WebSocketSession> sessions = ConcurrentHashMap.newKeySet();

    /** Constructor injection, matching the other components; the final fields need it. */
    public AIMonitoringWebSocketHandler(AIRedisMonitoringService monitoringService,
                                        ObjectMapper objectMapper) {
        this.monitoringService = monitoringService;
        this.objectMapper = objectMapper;
    }

    /** Registers the new session for broadcasts and sends it the initial status. */
    @Override
    public void afterConnectionEstablished(WebSocketSession session) throws Exception {
        sessions.add(session);
        log.info("AI监控WebSocket连接建立: {}", session.getId());
        // Send the initial status
        sendInitialStatus(session);
    }

    /** Unregisters the closed session. */
    @Override
    public void afterConnectionClosed(WebSocketSession session, CloseStatus status) throws Exception {
        sessions.remove(session);
        log.info("AI监控WebSocket连接关闭: {}", session.getId());
    }

    /** Broadcasts an {@code ANALYSIS_COMPLETE} update built from the event. */
    @EventListener
    public void handleAIAnalysisComplete(AIAnalysisCompleteEvent event) {
        AIMonitoringUpdate update = AIMonitoringUpdate.builder()
                .type("ANALYSIS_COMPLETE")
                .assessment(event.getAssessment())
                .prediction(event.getPrediction())
                .timestamp(Instant.now())
                .build();
        broadcastUpdate(update);
    }

    /** Broadcasts an {@code ANOMALY_DETECTED} update built from the event. */
    @EventListener
    public void handleAnomalyDetected(AnomalyDetectedEvent event) {
        AIMonitoringUpdate update = AIMonitoringUpdate.builder()
                .type("ANOMALY_DETECTED")
                .anomaly(event.getAnomaly())
                .severity(event.getSeverity())
                .timestamp(Instant.now())
                .build();
        broadcastUpdate(update);
    }

    /**
     * Serialises the update once and sends it to every open session. A session whose
     * send fails is dropped from the broadcast set; a serialisation failure is logged
     * and nothing is sent.
     */
    private void broadcastUpdate(AIMonitoringUpdate update) {
        String message;
        try {
            message = objectMapper.writeValueAsString(update);
        } catch (Exception e) {
            log.error("序列化监控更新失败", e);
            return;
        }
        // NOTE(review): if events can be published from several threads at once, sends
        // to the same session need to be serialised; confirm the publishing threads.
        sessions.forEach(session -> {
            try {
                if (session.isOpen()) {
                    session.sendMessage(new TextMessage(message));
                }
            } catch (Exception e) {
                log.error("发送WebSocket消息失败", e);
                sessions.remove(session);
            }
        });
    }
}
关键特性总结
这个AI增强的Redis监控系统提供了以下智能功能:
- AI健康分析: 利用Spring AI和Redis的集成能力,通过GPT模型分析Redis指标,提供专业的健康评估
- 异常预测: 基于历史数据和向量相似性搜索,预测潜在的系统问题
- 自动修复: AI生成安全的修复方案并自动执行低风险操作
- 实时监控: WebSocket实时推送AI分析结果和异常告警
- 智能洞察: 结合Micrometer观察能力,生成深度的系统运行洞察
- 向量存储: 利用Redis作为向量数据库存储历史分析结果,支持相似场景检索
这个案例将传统的Redis监控提升到了AI驱动的智能运维水平,能够主动发现问题、预测故障并自动修复,大大提高了系统的可靠性和运维效率。