🎯 目标: 构建全面的监控和日志系统,提供完整的可观测性支持
🤔 为什么需要监控和日志?
在生产环境中,监控和日志是不可或缺的:
- 📈 性能监控: 实时了解系统性能状况
- 🔍 问题诊断: 快速定位和解决问题
- 📊 业务洞察: 通过数据分析业务趋势
- 🚨 告警通知: 及时发现异常情况
- 📝 审计追踪: 记录关键操作和变更
- 🎯 容量规划: 基于历史数据进行容量规划
🏗️ 监控和日志架构
graph TB
subgraph "数据收集层 📊"
A1[FlowMetricsCollector]
A2[StepMetricsCollector]
A3[SystemMetricsCollector]
A4[BusinessMetricsCollector]
end
subgraph "日志记录层 📝"
B1[FlowLogger]
B2[StepLogger]
B3[ErrorLogger]
B4[AuditLogger]
end
subgraph "数据处理层 🔄"
C1[MetricsProcessor]
C2[LogProcessor]
C3[AlertProcessor]
C4[AggregationProcessor]
end
subgraph "存储层 💾"
D1[TimeSeriesDB]
D2[LogStorage]
D3[MetricsCache]
D4[AlertStorage]
end
subgraph "分析层 🔍"
E1[TrendAnalyzer]
E2[AnomalyDetector]
E3[PerformanceAnalyzer]
E4[BusinessAnalyzer]
end
subgraph "展示层 📱"
F1[MetricsDashboard]
F2[LogViewer]
F3[AlertCenter]
F4[ReportGenerator]
end
subgraph "通知层 🔔"
G1[EmailNotifier]
G2[SlackNotifier]
G3[WebhookNotifier]
G4[SMSNotifier]
end
A1 --> C1
A2 --> C1
A3 --> C1
A4 --> C1
B1 --> C2
B2 --> C2
B3 --> C2
B4 --> C2
C1 --> D1
C1 --> D3
C2 --> D2
C3 --> D4
C3 --> G1
C4 --> D1
D1 --> E1
D1 --> E2
D1 --> E3
D2 --> E4
E1 --> F1
E2 --> F3
E3 --> F1
E4 --> F4
D4 --> G2
D4 --> G3
D4 --> G4
📊 指标收集系统
🎯 核心指标接口
java
/**
 * Contract for a component that gathers metrics and returns them as a {@code MetricsData} value.
 */
public interface MetricsCollector {
/**
 * Collects metrics.
 *
 * @param context the collection context
 * @return the collected metrics data
 */
MetricsData collect(MetricsContext context);
/**
 * Returns the metric types this collector supports.
 *
 * @return the list of supported metric types
 */
List<MetricType> getSupportedMetrics();
/**
 * Reports whether this collector is enabled.
 *
 * @return {@code true} if the collector is enabled
 */
boolean isEnabled();
}
/**
 * Flow metrics collector.
 *
 * <p>Registers itself as an execution listener on the flow engine, keeps in-memory counters
 * (overall and per flow id) and publishes tagged Micrometer meters to the supplied registry.
 *
 * <p>Micrometer meters are immutable with respect to their tags: {@code Counter}, {@code Timer}
 * and {@code DistributionSummary} have no overloads that accept tags at record time. Tagged
 * meters are therefore resolved through their builders on each event; registering a builder
 * with an already-known name/tag combination returns the existing meter.
 */
@Component
public class FlowMetricsCollector implements MetricsCollector, FlowExecutionListener {

    private static final Logger logger = LoggerFactory.getLogger(FlowMetricsCollector.class);

    /** Tag value used when a failed result has no error type; Micrometer tag values must not be null. */
    private static final String UNKNOWN_ERROR_TYPE = "unknown";

    private final MeterRegistry meterRegistry;
    private final AtomicLong totalExecutions = new AtomicLong(0);
    private final AtomicLong successfulExecutions = new AtomicLong(0);
    private final AtomicLong failedExecutions = new AtomicLong(0);
    private final Map<String, FlowMetrics> flowMetricsMap = new ConcurrentHashMap<>();

    // Kept as a field so the gauge registration is reachable from this collector.
    private final Gauge activeFlowsGauge;

    /**
     * Creates the collector, registers the active-flow gauge and subscribes to engine events.
     *
     * @param meterRegistry registry that receives all Micrometer meters
     * @param engine        flow engine whose execution events are observed
     */
    public FlowMetricsCollector(MeterRegistry meterRegistry, FlowEngine engine) {
        this.meterRegistry = meterRegistry;

        // Gauge.builder takes the observed object and value function up front.
        this.activeFlowsGauge = Gauge
                .builder("flow_executions_active", this, FlowMetricsCollector::getActiveFlowCount)
                .description("Number of currently active flow executions")
                .register(meterRegistry);

        // Subscribe last so no event arrives before the fields above are assigned.
        engine.addExecutionListener(this);
        logger.info("流程指标收集器已初始化");
    }

    /**
     * Builds a snapshot of the overall counters plus one group of metrics per known flow id
     * (keys of the form {@code flow_<flowId>_...}).
     *
     * @param context collection context (not read by this implementation)
     * @return the metrics snapshot
     */
    @Override
    public MetricsData collect(MetricsContext context) {
        MetricsData.Builder builder = MetricsData.builder()
                .timestamp(Instant.now())
                .source("flow-metrics-collector");

        // Overall metrics
        builder.metric("total_executions", totalExecutions.get())
                .metric("successful_executions", successfulExecutions.get())
                .metric("failed_executions", failedExecutions.get())
                .metric("success_rate", calculateSuccessRate())
                .metric("active_flows", getActiveFlowCount());

        // Per-flow metrics
        for (Map.Entry<String, FlowMetrics> entry : flowMetricsMap.entrySet()) {
            String flowId = entry.getKey();
            FlowMetrics metrics = entry.getValue();
            builder.metric("flow_" + flowId + "_executions", metrics.getExecutionCount())
                    .metric("flow_" + flowId + "_success_rate", metrics.getSuccessRate())
                    .metric("flow_" + flowId + "_avg_duration", metrics.getAverageDuration())
                    .metric("flow_" + flowId + "_last_execution", metrics.getLastExecutionTime());
        }
        return builder.build();
    }

    /** {@inheritDoc} */
    @Override
    public List<MetricType> getSupportedMetrics() {
        return Arrays.asList(
                MetricType.COUNTER,
                MetricType.GAUGE,
                MetricType.TIMER,
                MetricType.DISTRIBUTION_SUMMARY
        );
    }

    /** Always enabled. */
    @Override
    public boolean isEnabled() {
        return true;
    }

    /**
     * Counts the started execution, marks the flow as active and increments the tagged
     * {@code flow_executions_total} counter.
     */
    @Override
    public void onFlowStarted(FlowExecutionEvent event) {
        totalExecutions.incrementAndGet();

        flowMetricsMap
                .computeIfAbsent(event.getFlowId(), FlowMetrics::new)
                .recordStart();

        Counter.builder("flow_executions_total")
                .description("Total number of flow executions")
                .tags(flowTags(event))
                .register(meterRegistry)
                .increment();

        logger.debug("流程开始执行: {}", event.getFlowId());
    }

    /**
     * Records the outcome, duration and step count of a finished execution.
     */
    @Override
    public void onFlowCompleted(FlowExecutionEvent event) {
        FlowMetrics flowMetrics = flowMetricsMap.get(event.getFlowId());
        if (flowMetrics != null) {
            flowMetrics.recordCompletion(event.getResult(), event.getExecutionTime());
        }

        Tags tags = flowTags(event);
        if (event.getResult().isSuccess()) {
            successfulExecutions.incrementAndGet();
            Counter.builder("flow_executions_success_total")
                    .description("Number of successful flow executions")
                    .tags(tags)
                    .register(meterRegistry)
                    .increment();
        } else {
            failedExecutions.incrementAndGet();
            Counter.builder("flow_executions_failure_total")
                    .description("Number of failed flow executions")
                    .tags(tags.and("error_type", errorTypeOrUnknown(event.getResult().getErrorType())))
                    .register(meterRegistry)
                    .increment();
        }

        // Execution time is reported by the event in milliseconds.
        Timer.builder("flow_execution_duration_seconds")
                .description("Flow execution duration in seconds")
                .tags(tags)
                .register(meterRegistry)
                .record(event.getExecutionTime(), TimeUnit.MILLISECONDS);

        DistributionSummary.builder("flow_steps_count")
                .description("Number of steps in executed flows")
                .tags(tags)
                .register(meterRegistry)
                .record(event.getStepCount());

        logger.debug("流程执行完成: {}, 状态: {}, 耗时: {}ms",
                event.getFlowId(), event.getResult().getStatus(), event.getExecutionTime());
    }

    /** Increments the tagged {@code flow_steps_total} counter. */
    @Override
    public void onStepStarted(StepExecutionEvent event) {
        Counter.builder("flow_steps_total")
                .tags(stepTags(event))
                .register(meterRegistry)
                .increment();
    }

    /** Records the step outcome counter and the step duration timer. */
    @Override
    public void onStepCompleted(StepExecutionEvent event) {
        Tags tags = stepTags(event);

        if (event.getResult().isSuccess()) {
            Counter.builder("flow_steps_success_total")
                    .tags(tags)
                    .register(meterRegistry)
                    .increment();
        } else {
            Counter.builder("flow_steps_failure_total")
                    .tags(tags.and("error_type", errorTypeOrUnknown(event.getResult().getErrorType())))
                    .register(meterRegistry)
                    .increment();
        }

        Timer.builder("flow_step_duration_seconds")
                .tags(tags)
                .register(meterRegistry)
                .record(event.getExecutionTime(), TimeUnit.MILLISECONDS);
    }

    /** Tags shared by all flow-level meters. */
    private static Tags flowTags(FlowExecutionEvent event) {
        return Tags.of(
                "flow_id", event.getFlowId(),
                "flow_name", event.getFlowName()
        );
    }

    /** Tags shared by all step-level meters. */
    private static Tags stepTags(StepExecutionEvent event) {
        return Tags.of(
                "flow_id", event.getFlowId(),
                "step_id", event.getStepId(),
                "step_type", event.getStepType().name()
        );
    }

    /** Substitutes a placeholder for a missing error type so tag creation cannot fail on null. */
    private static String errorTypeOrUnknown(String errorType) {
        return errorType != null ? errorType : UNKNOWN_ERROR_TYPE;
    }

    /**
     * Computes the success rate over completed executions.
     *
     * <p>{@code totalExecutions} is incremented when a flow starts, so using it as the
     * denominator would count still-running flows as failures.
     *
     * @return successes divided by completed executions, or {@code 0.0} when nothing has completed
     */
    private double calculateSuccessRate() {
        long succeeded = successfulExecutions.get();
        long completed = succeeded + failedExecutions.get();
        if (completed == 0) {
            return 0.0;
        }
        return (double) succeeded / completed;
    }

    /**
     * Sums the active execution count across all known flows.
     */
    private int getActiveFlowCount() {
        return flowMetricsMap.values().stream()
                .mapToInt(FlowMetrics::getActiveCount)
                .sum();
    }

    /**
     * In-memory statistics for a single flow id.
     */
    private static class FlowMetrics {
        private final String flowId;
        private final AtomicLong executionCount = new AtomicLong(0);
        private final AtomicLong successCount = new AtomicLong(0);
        private final AtomicInteger activeCount = new AtomicInteger(0);
        private final AtomicLong totalDuration = new AtomicLong(0);
        private volatile long lastExecutionTime = 0;

        public FlowMetrics(String flowId) {
            this.flowId = flowId;
        }

        /** Marks one more execution of this flow as active. */
        public void recordStart() {
            activeCount.incrementAndGet();
        }

        /**
         * Records a finished execution.
         *
         * @param result   outcome of the execution
         * @param duration execution time, added to the running total
         */
        public void recordCompletion(FlowExecutionResult result, long duration) {
            executionCount.incrementAndGet();
            activeCount.decrementAndGet();
            totalDuration.addAndGet(duration);
            lastExecutionTime = System.currentTimeMillis();
            if (result.isSuccess()) {
                successCount.incrementAndGet();
            }
        }

        /** Returns the number of completed executions. */
        public long getExecutionCount() {
            return executionCount.get();
        }

        /** Returns successes divided by completed executions, or {@code 0.0} if none completed. */
        public double getSuccessRate() {
            long total = executionCount.get();
            return total == 0 ? 0.0 : (double) successCount.get() / total;
        }

        /** Returns the mean recorded duration, or {@code 0.0} if none completed. */
        public double getAverageDuration() {
            long count = executionCount.get();
            return count == 0 ? 0.0 : (double) totalDuration.get() / count;
        }

        /** Returns the number of executions started but not yet completed. */
        public int getActiveCount() {
            return activeCount.get();
        }

        /** Returns the wall-clock time (epoch millis) of the last completion, or 0 if none. */
        public long getLastExecutionTime() {
            return lastExecutionTime;
        }
    }
}
/**
 * System metrics collector.
 *
 * <p>Binds Micrometer's JVM/processor binders to the registry at construction time and, on
 * demand, produces a {@code MetricsData} snapshot read from the platform MXBeans.
 */
@Component
public class SystemMetricsCollector implements MetricsCollector {

    private final MeterRegistry meterRegistry;
    private final MemoryMXBean memoryBean;
    private final List<GarbageCollectorMXBean> gcBeans;
    private final ThreadMXBean threadBean;

    /**
     * @param meterRegistry registry the JVM and processor binders are bound to
     */
    public SystemMetricsCollector(MeterRegistry meterRegistry) {
        this.meterRegistry = meterRegistry;
        this.memoryBean = ManagementFactory.getMemoryMXBean();
        this.gcBeans = ManagementFactory.getGarbageCollectorMXBeans();
        this.threadBean = ManagementFactory.getThreadMXBean();

        // Register JVM metrics
        new JvmMemoryMetrics().bindTo(meterRegistry);
        new JvmGcMetrics().bindTo(meterRegistry);
        new JvmThreadMetrics().bindTo(meterRegistry);
        new ProcessorMetrics().bindTo(meterRegistry);
    }

    /**
     * Reads memory, GC, thread and (when the platform bean is available) CPU/physical-memory
     * figures into a snapshot.
     *
     * @param context collection context (not read by this implementation)
     * @return the metrics snapshot
     */
    @Override
    public MetricsData collect(MetricsContext context) {
        MetricsData.Builder builder = MetricsData.builder()
                .timestamp(Instant.now())
                .source("system-metrics-collector");

        // Memory metrics
        MemoryUsage heapMemory = memoryBean.getHeapMemoryUsage();
        MemoryUsage nonHeapMemory = memoryBean.getNonHeapMemoryUsage();
        long heapUsed = heapMemory.getUsed();
        long heapMax = heapMemory.getMax();

        builder.metric("jvm_memory_heap_used", heapUsed)
                .metric("jvm_memory_heap_max", heapMax);
        // MemoryUsage.getMax() returns -1 when the maximum is undefined; a ratio against it
        // would be negative (or a division by zero), so the ratio is only reported when defined.
        if (heapMax > 0) {
            builder.metric("jvm_memory_heap_usage", (double) heapUsed / heapMax);
        }
        builder.metric("jvm_memory_nonheap_used", nonHeapMemory.getUsed())
                .metric("jvm_memory_nonheap_max", nonHeapMemory.getMax());

        // GC metrics
        for (GarbageCollectorMXBean gcBean : gcBeans) {
            // Locale.ROOT keeps metric names stable regardless of the JVM's default locale.
            String gcName = gcBean.getName().toLowerCase(java.util.Locale.ROOT).replace(" ", "_");
            builder.metric("jvm_gc_" + gcName + "_collections", gcBean.getCollectionCount())
                    .metric("jvm_gc_" + gcName + "_time", gcBean.getCollectionTime());
        }

        // Thread metrics
        builder.metric("jvm_threads_current", threadBean.getThreadCount())
                .metric("jvm_threads_daemon", threadBean.getDaemonThreadCount())
                .metric("jvm_threads_peak", threadBean.getPeakThreadCount())
                .metric("jvm_threads_started", threadBean.getTotalStartedThreadCount());

        // CPU metrics: only available through the com.sun.management extension of the OS bean.
        OperatingSystemMXBean osBean = ManagementFactory.getOperatingSystemMXBean();
        if (osBean instanceof com.sun.management.OperatingSystemMXBean) {
            com.sun.management.OperatingSystemMXBean sunOsBean =
                    (com.sun.management.OperatingSystemMXBean) osBean;
            builder.metric("system_cpu_usage", sunOsBean.getSystemCpuLoad())
                    .metric("process_cpu_usage", sunOsBean.getProcessCpuLoad())
                    .metric("system_memory_total", sunOsBean.getTotalPhysicalMemorySize())
                    .metric("system_memory_free", sunOsBean.getFreePhysicalMemorySize());
        }
        return builder.build();
    }

    /** {@inheritDoc} */
    @Override
    public List<MetricType> getSupportedMetrics() {
        return Arrays.asList(MetricType.GAUGE, MetricType.COUNTER);
    }

    /** Always enabled. */
    @Override
    public boolean isEnabled() {
        return true;
    }
}
📝 日志系统实现
🎯 结构化日志
java
/**
 * Flow logger.
 *
 * <p>Execution listener that turns flow and step events into JSON log lines written to the
 * dedicated {@code flow.execution}, {@code flow.step} and {@code flow.error} loggers. Whether
 * flow/step events are logged, and whether sensitive keys are redacted, is controlled by
 * {@code FlowLogProperties}.
 */
@Component
public class FlowLogger implements FlowExecutionListener {

    private static final Logger logger = LoggerFactory.getLogger(FlowLogger.class);
    private static final Logger flowExecutionLogger = LoggerFactory.getLogger("flow.execution");
    private static final Logger stepExecutionLogger = LoggerFactory.getLogger("flow.step");
    private static final Logger errorLogger = LoggerFactory.getLogger("flow.error");

    private final ObjectMapper objectMapper;
    private final FlowLogProperties properties;

    /**
     * @param properties logging switches and the list of sensitive keys
     */
    public FlowLogger(FlowLogProperties properties) {
        this.properties = properties;
        this.objectMapper = new ObjectMapper();
        // Log entries carry an Instant timestamp. A bare ObjectMapper has no java.time support
        // (recent Jackson versions reject such values outright), so pick up whatever modules are
        // on the classpath (e.g. jackson-datatype-jsr310) and write dates as ISO-8601 text.
        this.objectMapper.findAndRegisterModules();
        this.objectMapper.disable(SerializationFeature.WRITE_DATES_AS_TIMESTAMPS);
        this.objectMapper.configure(SerializationFeature.FAIL_ON_EMPTY_BEANS, false);
        this.objectMapper.setSerializationInclusion(JsonInclude.Include.NON_NULL);
    }

    /** Logs a {@code FLOW_STARTED} entry when flow logging is switched on. */
    @Override
    public void onFlowStarted(FlowExecutionEvent event) {
        if (!properties.isLogFlowExecution()) {
            return;
        }
        FlowLogEntry logEntry = FlowLogEntry.builder()
                .timestamp(Instant.now())
                .level("INFO")
                .event("FLOW_STARTED")
                .flowId(event.getFlowId())
                .flowName(event.getFlowName())
                .executionId(event.getExecutionId())
                .userId(event.getUserId())
                .inputs(sanitizeInputs(event.getInputs()))
                .build();
        logStructured(flowExecutionLogger, logEntry.getLevel(), logEntry);
    }

    /**
     * Logs a {@code FLOW_COMPLETED} entry: successes go to the execution logger, failures to
     * the error logger.
     */
    @Override
    public void onFlowCompleted(FlowExecutionEvent event) {
        if (!properties.isLogFlowExecution()) {
            return;
        }
        boolean success = event.getResult().isSuccess();
        FlowLogEntry logEntry = FlowLogEntry.builder()
                .timestamp(Instant.now())
                .level(success ? "INFO" : "ERROR")
                .event("FLOW_COMPLETED")
                .flowId(event.getFlowId())
                .flowName(event.getFlowName())
                .executionId(event.getExecutionId())
                .userId(event.getUserId())
                .status(event.getResult().getStatus().name())
                .duration(event.getExecutionTime())
                .outputs(sanitizeOutputs(event.getResult().getOutputs()))
                .errorMessage(event.getResult().getErrorMessage())
                .errorType(event.getResult().getErrorType())
                .build();
        logStructured(success ? flowExecutionLogger : errorLogger, logEntry.getLevel(), logEntry);
    }

    /** Logs a {@code STEP_STARTED} entry when step logging is switched on. */
    @Override
    public void onStepStarted(StepExecutionEvent event) {
        if (!properties.isLogStepExecution()) {
            return;
        }
        StepLogEntry logEntry = StepLogEntry.builder()
                .timestamp(Instant.now())
                .level("DEBUG")
                .event("STEP_STARTED")
                .flowId(event.getFlowId())
                .executionId(event.getExecutionId())
                .stepId(event.getStepId())
                .stepName(event.getStepName())
                .stepType(event.getStepType().name())
                .inputs(sanitizeInputs(event.getInputs()))
                .build();
        logStructured(stepExecutionLogger, logEntry.getLevel(), logEntry);
    }

    /**
     * Logs a {@code STEP_COMPLETED} entry: successes go to the step logger, failures to the
     * error logger.
     */
    @Override
    public void onStepCompleted(StepExecutionEvent event) {
        if (!properties.isLogStepExecution()) {
            return;
        }
        boolean success = event.getResult().isSuccess();
        StepLogEntry logEntry = StepLogEntry.builder()
                .timestamp(Instant.now())
                .level(success ? "DEBUG" : "WARN")
                .event("STEP_COMPLETED")
                .flowId(event.getFlowId())
                .executionId(event.getExecutionId())
                .stepId(event.getStepId())
                .stepName(event.getStepName())
                .stepType(event.getStepType().name())
                .status(event.getResult().getStatus().name())
                .duration(event.getExecutionTime())
                .outputs(sanitizeOutputs(event.getResult().getOutputs()))
                .errorMessage(event.getResult().getErrorMessage())
                .build();
        logStructured(success ? stepExecutionLogger : errorLogger, logEntry.getLevel(), logEntry);
    }

    /**
     * Serializes the entry to JSON and writes it to the target logger at the entry's own level,
     * so the level recorded inside the JSON and the level seen by the logging backend agree.
     * If serialization fails, the failure is reported and the entry's {@code toString()} is
     * logged instead.
     *
     * @param targetLogger logger that receives the line
     * @param level        one of {@code ERROR}, {@code WARN}, {@code DEBUG}; anything else logs at INFO
     * @param logEntry     entry to serialize
     */
    private void logStructured(Logger targetLogger, String level, Object logEntry) {
        String payload;
        try {
            payload = objectMapper.writeValueAsString(logEntry);
        } catch (Exception e) {
            logger.warn("记录结构化日志失败", e);
            payload = "LogEntry: " + logEntry;
        }
        emit(targetLogger, level, payload);
    }

    /** Writes the message through the SLF4J method that matches the given level name. */
    private static void emit(Logger targetLogger, String level, String message) {
        String effectiveLevel = level == null ? "INFO" : level;
        switch (effectiveLevel) {
            case "ERROR":
                targetLogger.error(message);
                break;
            case "WARN":
                targetLogger.warn(message);
                break;
            case "DEBUG":
                targetLogger.debug(message);
                break;
            default:
                targetLogger.info(message);
                break;
        }
    }

    /**
     * Returns a copy of the map in which the value of every configured sensitive key is replaced
     * by {@code [REDACTED]}. Only top-level keys are examined. The input is returned unchanged
     * when it is {@code null} or redaction is switched off.
     */
    private Map<String, Object> sanitizeInputs(Map<String, Object> inputs) {
        if (inputs == null || !properties.isSanitizeSensitiveData()) {
            return inputs;
        }
        Map<String, Object> sanitized = new HashMap<>(inputs);
        for (String sensitiveKey : properties.getSensitiveKeys()) {
            if (sanitized.containsKey(sensitiveKey)) {
                sanitized.put(sensitiveKey, "[REDACTED]");
            }
        }
        return sanitized;
    }

    /**
     * Redacts sensitive output values; uses the same rules as {@link #sanitizeInputs(Map)}.
     */
    private Map<String, Object> sanitizeOutputs(Map<String, Object> outputs) {
        return sanitizeInputs(outputs);
    }
}
/**
 * Structured log entry for a flow-level event.
 *
 * <p>Accessors and the builder come from Lombok ({@code @Data}, {@code @Builder});
 * {@code @JsonInclude(NON_NULL)} tells Jackson to leave null fields out of the JSON output.
 */
@Data
@Builder
@JsonInclude(JsonInclude.Include.NON_NULL)
public class FlowLogEntry {
// Time the entry was created.
private Instant timestamp;
// Level name as text, e.g. "INFO" or "ERROR".
private String level;
// Event name, e.g. "FLOW_STARTED" or "FLOW_COMPLETED".
private String event;
private String flowId;
private String flowName;
private String executionId;
private String userId;
private String status;
// Execution time; presumably milliseconds — TODO confirm against the event source.
private Long duration;
private Map<String, Object> inputs;
private Map<String, Object> outputs;
private String errorMessage;
private String errorType;
private Map<String, Object> context;
}
/**
 * Structured log entry for a step-level event.
 *
 * <p>Accessors and the builder come from Lombok ({@code @Data}, {@code @Builder});
 * {@code @JsonInclude(NON_NULL)} tells Jackson to leave null fields out of the JSON output.
 */
@Data
@Builder
@JsonInclude(JsonInclude.Include.NON_NULL)
public class StepLogEntry {
// Time the entry was created.
private Instant timestamp;
// Level name as text, e.g. "DEBUG" or "WARN".
private String level;
// Event name, e.g. "STEP_STARTED" or "STEP_COMPLETED".
private String event;
private String flowId;
private String executionId;
private String stepId;
private String stepName;
private String stepType;
private String status;
// Execution time; presumably milliseconds — TODO confirm against the event source.
private Long duration;
private Map<String, Object> inputs;
private Map<String, Object> outputs;
private String errorMessage;
private Map<String, Object> context;
}
/**
* 审计日志记录器
*/
@Component
public class AuditLogger {
private static final Logger auditLogger = LoggerFactory.getLogger("flow.audit");
private final ObjectMapper objectMapper;
public AuditLogger() {
this.objectMapper = new ObjectMapper();
this.objectMapper.setSerializationInclusion(JsonInclude.Include.NON_NULL);
}
/**
* 记录流程定义变更
*/
public void logFlowDefinitionChange(String operation, String flowId, String userId,
Object oldValue, Object newValue) {
AuditLogEntry entry = AuditLogEntry.builder()
.timestamp(Instant.now())
.operation(operation)
.resourceType("FLOW_DEFINITION")
.resourceId(flowId)
.userId(userId)
.oldValue(oldValue)
.newValue(newValue)
.build();
logAuditEntry(entry);
}
/**
* 记录配置变更
*/
public void logConfigurationChange(String operation, String configKey, String userId,
Object oldValue, Object newValue) {
AuditLogEntry entry = AuditLogEntry.builder()
.timestamp(Instant.now())
.operation(operation)
.resourceType("CONFIGURATION")
.resourceId(configKey)
.userId(userId)
.oldValue(oldValue)
.newValue(newValue)
.build();
logAuditEntry(entry);
}
/**
* 记录权限变更
*/
public void logPermissionChange(String operation, String userId, String targetUserId,
String permission, String reason) {
AuditLogEntry entry = AuditLogEntry.builder()
.timestamp(Instant.now())
.operation(operation)
.resourceType("PERMISSION")
.resourceId(permission)
.userId(userId)
.targetUserId(targetUserId)
.reason(reason)
.build();
logAuditEntry(entry);
}
private void logAuditEntry(AuditLogEntry entry) {
try {
String jsonLog = objectMapper.writeValueAsString(entry);
auditLogger.info(jsonLog);
} catch (Exception e) {
auditLogger.warn("记录审计日志失败: {}", entry, e);
}
}
}
/**
 * Audit log entry.
 *
 * <p>Accessors and the builder come from Lombok ({@code @Data}, {@code @Builder});
 * {@code @JsonInclude(NON_NULL)} tells Jackson to leave null fields out of the JSON output.
 */
@Data
@Builder
@JsonInclude(JsonInclude.Include.NON_NULL)
public class AuditLogEntry {
// Time the entry was created.
private Instant timestamp;
private String operation;
// Kind of resource affected, e.g. "FLOW_DEFINITION", "CONFIGURATION" or "PERMISSION".
private String resourceType;
private String resourceId;
// User who performed the operation.
private String userId;
// User affected by the operation (set for permission changes).
private String targetUserId;
private Object oldValue;
private Object newValue;
private String reason;
private String ipAddress;
private String userAgent;
}
🚨 告警系统实现
java
/**
 * Alert manager.
 *
 * <p>Holds the registered alert rules, evaluates them against metric snapshots, stores
 * triggered alerts, fans them out to the notifiers that support their severity, and runs two
 * periodic maintenance tasks (auto-resolve check every minute, cleanup of alerts older than
 * 30 days every hour).
 *
 * <p>Implements {@link AutoCloseable} so the scheduler threads can be stopped; Spring is
 * expected to call {@code close()} on singleton beans at context shutdown — TODO confirm for
 * the Spring version in use, otherwise call it explicitly.
 */
@Component
public class AlertManager implements AutoCloseable {

    private static final Logger logger = LoggerFactory.getLogger(AlertManager.class);

    private final List<AlertRule> alertRules = new CopyOnWriteArrayList<>();
    private final List<AlertNotifier> notifiers = new CopyOnWriteArrayList<>();
    private final AlertStorage alertStorage;
    private final ScheduledExecutorService scheduler;

    /**
     * @param alertStorage storage used to persist and query alerts
     * @param notifiers    notification channels; copied into an internal list
     */
    public AlertManager(AlertStorage alertStorage, List<AlertNotifier> notifiers) {
        this.alertStorage = alertStorage;
        this.notifiers.addAll(notifiers);
        this.scheduler = Executors.newScheduledThreadPool(2);
        // Start the periodic maintenance tasks
        startAlertChecking();
    }

    /**
     * Adds an alert rule.
     */
    public void addAlertRule(AlertRule rule) {
        alertRules.add(rule);
        logger.info("添加告警规则: {}", rule.getName());
    }

    /**
     * Removes every rule with the given name.
     */
    public void removeAlertRule(String ruleName) {
        alertRules.removeIf(rule -> rule.getName().equals(ruleName));
        logger.info("移除告警规则: {}", ruleName);
    }

    /**
     * Returns the alerts currently stored with status {@code ACTIVE}.
     *
     * @return active alerts as reported by the alert storage
     */
    public List<Alert> getActiveAlerts() {
        return alertStorage.findByStatus(AlertStatus.ACTIVE);
    }

    /**
     * Evaluates every enabled rule against the metrics and handles those that trigger.
     * A rule that throws is logged and does not prevent the remaining rules from running.
     */
    public void checkMetrics(MetricsData metrics) {
        for (AlertRule rule : alertRules) {
            if (!rule.isEnabled()) {
                continue;
            }
            try {
                AlertResult result = rule.evaluate(metrics);
                if (result.isTriggered()) {
                    handleAlert(rule, result);
                }
            } catch (Exception e) {
                logger.error("评估告警规则失败: {}", rule.getName(), e);
            }
        }
    }

    /**
     * Builds an active alert from the rule and its result, stores it and sends notifications.
     */
    private void handleAlert(AlertRule rule, AlertResult result) {
        Alert alert = Alert.builder()
                .id(UUID.randomUUID().toString())
                .ruleName(rule.getName())
                .severity(rule.getSeverity())
                .title(result.getTitle())
                .message(result.getMessage())
                .timestamp(Instant.now())
                .status(AlertStatus.ACTIVE)
                .metadata(result.getMetadata())
                .build();

        // Persist the alert
        alertStorage.save(alert);
        // Send notifications
        sendNotifications(alert);
        logger.warn("触发告警: {} - {}", alert.getTitle(), alert.getMessage());
    }

    /**
     * Sends the alert through every notifier that supports its severity. A failing notifier is
     * logged and does not stop the others.
     */
    private void sendNotifications(Alert alert) {
        for (AlertNotifier notifier : notifiers) {
            if (!notifier.supports(alert.getSeverity())) {
                continue;
            }
            try {
                notifier.notify(alert);
            } catch (Exception e) {
                logger.error("发送告警通知失败: {}", notifier.getClass().getSimpleName(), e);
            }
        }
    }

    /**
     * Schedules the periodic tasks.
     */
    private void startAlertChecking() {
        // Check alert status every minute
        scheduler.scheduleAtFixedRate(this::checkAlertStatus, 1, 1, TimeUnit.MINUTES);
        // Clean up expired alerts every hour
        scheduler.scheduleAtFixedRate(this::cleanupExpiredAlerts, 1, 1, TimeUnit.HOURS);
    }

    /**
     * Resolves active alerts that qualify for auto-resolution. Exceptions are caught and logged
     * so the scheduled task keeps running.
     */
    private void checkAlertStatus() {
        try {
            List<Alert> activeAlerts = alertStorage.findByStatus(AlertStatus.ACTIVE);
            for (Alert alert : activeAlerts) {
                if (shouldAutoResolve(alert)) {
                    resolveAlert(alert.getId(), "自动恢复");
                }
            }
        } catch (Exception e) {
            logger.error("检查告警状态失败", e);
        }
    }

    /**
     * Deletes alerts older than 30 days. Exceptions are caught and logged so the scheduled task
     * keeps running.
     */
    private void cleanupExpiredAlerts() {
        try {
            Instant cutoff = Instant.now().minus(30, ChronoUnit.DAYS);
            int cleaned = alertStorage.deleteOlderThan(cutoff);
            if (cleaned > 0) {
                logger.info("清理过期告警: {} 条", cleaned);
            }
        } catch (Exception e) {
            logger.error("清理过期告警失败", e);
        }
    }

    /**
     * Marks an active alert as resolved and stores it. Does nothing when the alert does not
     * exist or is not active.
     *
     * @param alertId id of the alert to resolve
     * @param reason  reason recorded on the alert
     */
    public void resolveAlert(String alertId, String reason) {
        Alert alert = alertStorage.findById(alertId);
        if (alert != null && alert.getStatus() == AlertStatus.ACTIVE) {
            alert.setStatus(AlertStatus.RESOLVED);
            alert.setResolvedAt(Instant.now());
            alert.setResolveReason(reason);
            alertStorage.save(alert);
            logger.info("告警已解决: {} - {}", alert.getTitle(), reason);
        }
    }

    /**
     * Decides whether an alert should be resolved automatically. Placeholder: always returns
     * {@code false}; a real implementation would check whether the related metrics recovered.
     */
    private boolean shouldAutoResolve(Alert alert) {
        return false;
    }

    /**
     * Stops the maintenance scheduler: waits up to five seconds for running tasks, then
     * cancels whatever is left. Without this the pool's non-daemon threads outlive the manager.
     */
    @Override
    public void close() {
        scheduler.shutdown();
        try {
            if (!scheduler.awaitTermination(5, TimeUnit.SECONDS)) {
                scheduler.shutdownNow();
            }
        } catch (InterruptedException e) {
            scheduler.shutdownNow();
            // Preserve the interrupt for the caller.
            Thread.currentThread().interrupt();
        }
    }
}
/**
 * Contract for an alert rule that can be evaluated against a metrics snapshot.
 */
public interface AlertRule {
/**
 * Returns the rule name.
 */
String getName();
/**
 * Returns the rule description.
 */
String getDescription();
/**
 * Returns the severity of alerts raised by this rule.
 */
AlertSeverity getSeverity();
/**
 * Reports whether the rule is enabled.
 */
boolean isEnabled();
/**
 * Evaluates the rule against the given metrics.
 *
 * @param metrics metrics to evaluate
 * @return the evaluation result
 */
AlertResult evaluate(MetricsData metrics);
}
/**
 * Alert rule that triggers when the flow failure rate ({@code 1 - success_rate}) exceeds a
 * configurable threshold.
 */
@Component
public class FlowFailureRateAlertRule implements AlertRule {

    private final double threshold;
    // Injected from configuration; not read by evaluate() in this class.
    private final Duration timeWindow;

    /**
     * @param threshold  failure-rate threshold as a fraction (default {@code 0.1})
     * @param timeWindow configured time window (default {@code PT5M})
     */
    public FlowFailureRateAlertRule(@Value("${simple.flow.alert.failure-rate.threshold:0.1}") double threshold,
                                    @Value("${simple.flow.alert.failure-rate.time-window:PT5M}") Duration timeWindow) {
        this.threshold = threshold;
        this.timeWindow = timeWindow;
    }

    @Override
    public String getName() {
        return "flow-failure-rate";
    }

    @Override
    public String getDescription() {
        return "流程执行失败率过高告警";
    }

    @Override
    public AlertSeverity getSeverity() {
        return AlertSeverity.WARNING;
    }

    @Override
    public boolean isEnabled() {
        return true;
    }

    /**
     * Triggers when {@code 1 - success_rate} is strictly greater than the threshold.
     *
     * <p>Does not trigger when the snapshot has no {@code success_rate}, or when it reports
     * that no execution has completed yet: a success rate of 0 with nothing completed means
     * "no data", not "100% failure".
     *
     * @param metrics metrics snapshot to evaluate
     * @return a triggered result carrying the rates as metadata, or a not-triggered result
     */
    @Override
    public AlertResult evaluate(MetricsData metrics) {
        Double successRate = metrics.getMetric("success_rate", Double.class);
        if (successRate == null || !hasCompletedExecutions(metrics)) {
            return AlertResult.notTriggered();
        }

        double failureRate = 1.0 - successRate;
        if (failureRate <= threshold) {
            return AlertResult.notTriggered();
        }

        return AlertResult.triggered(
                "流程执行失败率过高",
                String.format("当前失败率: %.2f%%, 阈值: %.2f%%",
                        failureRate * 100, threshold * 100),
                Map.of(
                        "failure_rate", failureRate,
                        "threshold", threshold,
                        "success_rate", successRate
                )
        );
    }

    /**
     * Returns {@code false} only when the snapshot carries both completion counters and they
     * sum to zero. When either counter is absent the rate is trusted as before.
     * NOTE(review): assumes {@code getMetric} returns {@code null} for an absent metric, as the
     * null check on {@code success_rate} suggests — confirm against {@code MetricsData}.
     */
    private static boolean hasCompletedExecutions(MetricsData metrics) {
        Number succeeded = metrics.getMetric("successful_executions", Number.class);
        Number failed = metrics.getMetric("failed_executions", Number.class);
        if (succeeded == null || failed == null) {
            return true;
        }
        return succeeded.longValue() + failed.longValue() > 0;
    }
}
/**
 * Email alert notifier.
 *
 * <p>Active only when {@code simple.flow.alert.email.enabled=true}. Sends each supported alert
 * as an HTML email to the configured recipients.
 */
@Component
@ConditionalOnProperty(name = "simple.flow.alert.email.enabled", havingValue = "true")
public class EmailAlertNotifier implements AlertNotifier {

    private static final Logger logger = LoggerFactory.getLogger(EmailAlertNotifier.class);

    private final JavaMailSender mailSender;
    private final AlertEmailProperties properties;

    /**
     * @param mailSender mail sender used to create and send messages
     * @param properties sender, recipients and supported severities
     */
    public EmailAlertNotifier(JavaMailSender mailSender, AlertEmailProperties properties) {
        this.mailSender = mailSender;
        this.properties = properties;
    }

    /** Returns whether the configured severities include the given one. */
    @Override
    public boolean supports(AlertSeverity severity) {
        return properties.getSupportedSeverities().contains(severity);
    }

    /**
     * Sends the alert as an HTML email.
     *
     * @param alert alert to send
     * @throws AlertNotificationException if building or sending the message fails
     */
    @Override
    public void notify(Alert alert) {
        try {
            MimeMessage message = mailSender.createMimeMessage();
            MimeMessageHelper helper = new MimeMessageHelper(message, true, "UTF-8");
            helper.setFrom(properties.getFrom());
            helper.setTo(properties.getTo().toArray(new String[0]));
            helper.setSubject("[" + alert.getSeverity() + "] " + alert.getTitle());
            helper.setText(buildEmailContent(alert), true);
            mailSender.send(message);
            logger.info("邮件告警通知已发送: {}", alert.getTitle());
        } catch (Exception e) {
            logger.error("发送邮件告警失败", e);
            throw new AlertNotificationException("邮件发送失败", e);
        }
    }

    /**
     * Renders the HTML body. Every alert-supplied value is HTML-escaped first: titles, messages
     * and metadata may contain characters such as {@code <} or {@code &} (for example from
     * exception messages) that would otherwise corrupt or inject markup.
     */
    private String buildEmailContent(Alert alert) {
        return String.format("""
                <html>
                <body>
                <h2>流程编排框架告警</h2>
                <table border="1" cellpadding="5">
                <tr><td><b>告警标题</b></td><td>%s</td></tr>
                <tr><td><b>告警级别</b></td><td>%s</td></tr>
                <tr><td><b>告警时间</b></td><td>%s</td></tr>
                <tr><td><b>告警消息</b></td><td>%s</td></tr>
                <tr><td><b>规则名称</b></td><td>%s</td></tr>
                </table>
                <h3>详细信息</h3>
                <pre>%s</pre>
                </body>
                </html>
                """,
                escapeHtml(alert.getTitle()),
                escapeHtml(alert.getSeverity()),
                escapeHtml(alert.getTimestamp()),
                escapeHtml(alert.getMessage()),
                escapeHtml(alert.getRuleName()),
                escapeHtml(formatMetadata(alert.getMetadata()))
        );
    }

    /**
     * Formats metadata as one {@code key: value} line per entry, or {@code 无} when there is
     * none.
     */
    private String formatMetadata(Map<String, Object> metadata) {
        if (metadata == null || metadata.isEmpty()) {
            return "无";
        }
        return metadata.entrySet().stream()
                .map(entry -> entry.getKey() + ": " + entry.getValue())
                .collect(Collectors.joining("\n"));
    }

    /**
     * Converts the value to text and escapes the five HTML-significant characters.
     * A {@code null} value renders as {@code null}, matching {@code %s} formatting.
     */
    private static String escapeHtml(Object value) {
        String text = String.valueOf(value);
        StringBuilder escaped = new StringBuilder(text.length() + 16);
        for (int i = 0; i < text.length(); i++) {
            char c = text.charAt(i);
            switch (c) {
                case '&':
                    escaped.append("&amp;");
                    break;
                case '<':
                    escaped.append("&lt;");
                    break;
                case '>':
                    escaped.append("&gt;");
                    break;
                case '"':
                    escaped.append("&quot;");
                    break;
                case '\'':
                    escaped.append("&#39;");
                    break;
                default:
                    escaped.append(c);
                    break;
            }
        }
        return escaped.toString();
    }
}
📊 监控仪表板
java
/**
 * REST endpoints backing the monitoring dashboard, served under {@code /api/monitoring}.
 * Each handler delegates to the monitoring service or the alert manager and wraps the result
 * in a 200 response.
 */
@RestController
@RequestMapping("/api/monitoring")
public class MonitoringDashboardController {

    private final FlowMonitoringService monitoringService;
    private final AlertManager alertManager;

    public MonitoringDashboardController(FlowMonitoringService monitoringService,
                                         AlertManager alertManager) {
        this.monitoringService = monitoringService;
        this.alertManager = alertManager;
    }

    /**
     * Returns the dashboard overview.
     */
    @GetMapping("/dashboard")
    public ResponseEntity<DashboardOverview> getDashboardOverview() {
        return ResponseEntity.ok(monitoringService.getDashboardOverview());
    }

    /**
     * Returns flow execution trends between {@code start} and {@code end}.
     *
     * @param start    start of the range (ISO date-time)
     * @param end      end of the range (ISO date-time)
     * @param interval bucket interval, defaults to {@code 1h}
     */
    @GetMapping("/trends/execution")
    public ResponseEntity<List<TrendData>> getExecutionTrends(
            @RequestParam @DateTimeFormat(iso = DateTimeFormat.ISO.DATE_TIME) Instant start,
            @RequestParam @DateTimeFormat(iso = DateTimeFormat.ISO.DATE_TIME) Instant end,
            @RequestParam(defaultValue = "1h") String interval) {
        return ResponseEntity.ok(monitoringService.getExecutionTrends(start, end, interval));
    }

    /**
     * Returns performance metrics for the given time range.
     *
     * @param timeRange time range, defaults to {@code 1h}
     */
    @GetMapping("/metrics/performance")
    public ResponseEntity<PerformanceMetrics> getPerformanceMetrics(
            @RequestParam(defaultValue = "1h") String timeRange) {
        return ResponseEntity.ok(monitoringService.getPerformanceMetrics(timeRange));
    }

    /**
     * Returns the currently active alerts.
     */
    @GetMapping("/alerts/active")
    public ResponseEntity<List<Alert>> getActiveAlerts() {
        return ResponseEntity.ok(alertManager.getActiveAlerts());
    }

    /**
     * Returns the system health status.
     */
    @GetMapping("/health")
    public ResponseEntity<SystemHealth> getSystemHealth() {
        return ResponseEntity.ok(monitoringService.getSystemHealth());
    }

    /**
     * Returns the flow ranking.
     *
     * @param sortBy ranking criterion, defaults to {@code execution_count}
     * @param limit  maximum number of entries, defaults to {@code 10}
     */
    @GetMapping("/rankings/flows")
    public ResponseEntity<List<FlowRanking>> getFlowRankings(
            @RequestParam(defaultValue = "execution_count") String sortBy,
            @RequestParam(defaultValue = "10") int limit) {
        return ResponseEntity.ok(monitoringService.getFlowRankings(sortBy, limit));
    }
}
/**
 * Dashboard overview data returned by the dashboard endpoint.
 *
 * <p>Accessors and the builder come from Lombok ({@code @Data}, {@code @Builder}).
 */
@Data
@Builder
public class DashboardOverview {
private long totalFlows;
private long totalExecutions;
private long activeExecutions;
private double successRate;
private double averageExecutionTime;
private int activeAlerts;
private SystemStatus systemStatus;
private List<RecentExecution> recentExecutions;
private Map<String, Long> executionsByStatus;
private Map<String, Double> performanceMetrics;
}
/**
 * One data point of an execution trend series.
 *
 * <p>Accessors and the builder come from Lombok ({@code @Data}, {@code @Builder}).
 */
@Data
@Builder
public class TrendData {
private Instant timestamp;
private long executions;
private long successes;
private long failures;
private double averageDuration;
private double successRate;
}
/**
 * Performance metrics returned by the performance endpoint.
 *
 * <p>Accessors and the builder come from Lombok ({@code @Data}, {@code @Builder}).
 */
@Data
@Builder
public class PerformanceMetrics {
private double cpuUsage;
private double memoryUsage;
private double diskUsage;
private int threadCount;
private double responseTime;
private double throughput;
private Map<String, Double> jvmMetrics;
}
🎯 设计亮点
📊 全面监控
- 多维度指标: 业务、系统、JVM等全方位监控
- 实时收集: 基于事件驱动的实时指标收集
- 历史趋势: 时间序列数据存储和分析
- 自定义指标: 支持业务自定义指标
📝 结构化日志
- JSON格式: 便于日志分析和检索
- 分级记录: 不同级别的日志详细程度
- 敏感数据保护: 自动清理敏感信息
- 审计追踪: 完整的操作审计记录
🚨 智能告警
- 规则引擎: 灵活的告警规则配置
- 多渠道通知: 邮件、短信、Webhook等
- 告警聚合: 避免告警风暴
- 自动恢复: 智能的告警恢复机制
📱 可视化展示
- 实时仪表板: 直观的监控界面
- 趋势分析: 历史数据趋势展示
- 交互式图表: 丰富的图表组件
- 移动适配: 支持移动设备访问
📝 本章小结
本章实现了完整的监控和日志系统,具备以下特性:
✅ 全面的指标收集: 业务和系统指标全覆盖
✅ 结构化日志记录: JSON格式便于分析
✅ 智能告警系统: 规则引擎和多渠道通知
✅ 可视化仪表板: 直观的监控界面
✅ 审计追踪: 完整的操作记录
下一章我们将实现性能优化和扩展,提升框架的性能和可扩展性。 🚀