【Python高级工程与架构实战】项目三:实时数据管道(Kafka + Polars + Delta Lake)(二)

3.4.3 anomaly_detection.py

脚本功能:Z-Score算法实时异常检测与告警系统。使用方式:python 3.4.3_anomaly_detection.py

复制代码
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
3.4.3 实时异常检测系统
功能:
- Z-Score统计异常检测
- 滑动窗口均值/标准差计算
- 动态阈值调整
- 多级告警(警告/严重)
- 可视化监控面板
使用方式:python 3.4.3_anomaly_detection.py
"""

import json
import logging
import threading
import time
from collections import Counter, deque
from dataclasses import dataclass
from datetime import datetime, timedelta
from typing import Callable, Deque, Dict, List, Optional

import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
import numpy as np
import polars as pl
from kafka import KafkaConsumer, KafkaProducer
from matplotlib.animation import FuncAnimation

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


@dataclass
class AnomalyEvent:
    """A single detected anomaly together with its statistical context."""
    timestamp: datetime    # when the anomalous sample was observed
    metric_name: str       # metric the sample belongs to
    value: float           # the observed (anomalous) value
    expected_mean: float   # window mean at detection time
    expected_std: float    # window standard deviation at detection time
    z_score: float         # signed deviation, in standard-deviation units
    severity: str  # 'warning' or 'critical'
    description: str       # human-readable summary of the deviation


class ZScoreDetector:
    """
    Sliding-window Z-score anomaly detector.

    Maintains a bounded window of recent samples per metric, caches the
    window statistics, and flags points whose |z| exceeds the configured
    warning/critical thresholds.

    Fixes vs. the original:
    - annotations referencing AnomalyEvent use string forward references,
      removing the hard definition-order dependency;
    - cached statistics are stored as built-in floats (JSON-serializable,
      no numpy scalars leaking into alert payloads).
    """

    def __init__(self,
                 window_size: int = 100,
                 threshold_warning: float = 2.0,
                 threshold_critical: float = 3.0,
                 min_samples: int = 30):
        self.window_size = window_size
        self.threshold_warning = threshold_warning
        self.threshold_critical = threshold_critical
        # Statistics are only computed once a window holds this many samples.
        self.min_samples = min_samples

        # Per-metric sliding windows of values and their timestamps.
        self.windows: Dict[str, Deque[float]] = {}
        self.timestamps: Dict[str, Deque[datetime]] = {}

        # Cached window statistics per metric: mean/std/min/max/count.
        self.stats: Dict[str, Dict] = {}

        # Callbacks fired for every detected anomaly.
        self.alert_handlers: List[Callable] = []

    def register_alert_handler(self, handler: Callable[["AnomalyEvent"], None]):
        """Register a callback invoked with each detected AnomalyEvent."""
        self.alert_handlers.append(handler)

    def update_window(self, metric_name: str, value: float, timestamp: Optional[datetime] = None):
        """Append a sample to the metric's window and refresh cached stats."""
        if timestamp is None:
            timestamp = datetime.utcnow()

        # Lazily create the bounded windows for a first-seen metric.
        window = self.windows.setdefault(metric_name, deque(maxlen=self.window_size))
        stamps = self.timestamps.setdefault(metric_name, deque(maxlen=self.window_size))

        window.append(value)
        stamps.append(timestamp)

        # Refresh statistics once enough samples have accumulated.
        if len(window) >= self.min_samples:
            samples = list(window)
            self.stats[metric_name] = {
                'mean': float(np.mean(samples)),
                'std': float(np.std(samples)),
                'min': float(np.min(samples)),
                'max': float(np.max(samples)),
                'count': len(samples)
            }

    def detect(self, metric_name: str, value: float, timestamp: Optional[datetime] = None) -> Optional["AnomalyEvent"]:
        """
        Score a single point against the metric's cached statistics.

        Returns an AnomalyEvent when |z| crosses a threshold, else None.
        Also returns None when no statistics exist yet or the window has
        zero variance (z-score undefined).
        """
        stats = self.stats.get(metric_name)
        if stats is None or stats['std'] == 0:
            return None

        # Signed Z-score of the sample against the window baseline.
        z_score = (value - stats['mean']) / stats['std']
        abs_z = abs(z_score)

        if abs_z >= self.threshold_critical:
            severity = 'critical'
        elif abs_z >= self.threshold_warning:
            severity = 'warning'
        else:
            return None

        event = AnomalyEvent(
            timestamp=timestamp or datetime.utcnow(),
            metric_name=metric_name,
            value=value,
            expected_mean=stats['mean'],
            expected_std=stats['std'],
            z_score=z_score,
            severity=severity,
            description=f"Value {value:.2f} is {abs_z:.2f} std devs from mean"
        )

        # Fan out to handlers; one failing handler must not block the rest.
        for handler in self.alert_handlers:
            try:
                handler(event)
            except Exception as e:
                logger.error(f"告警处理失败: {e}")

        return event

    def process(self, metric_name: str, value: float, timestamp: Optional[datetime] = None) -> Optional["AnomalyEvent"]:
        """Update the window with the sample, then score it.

        Note: the sample itself is included in the statistics it is
        scored against (same as the original behavior).
        """
        self.update_window(metric_name, value, timestamp)
        return self.detect(metric_name, value, timestamp)

    def get_stats(self, metric_name: str) -> Optional[Dict]:
        """Return the cached statistics for a metric, or None if absent."""
        return self.stats.get(metric_name)


class StreamingAnomalyDetector:
    """
    Streaming anomaly-detection engine.

    Consumes JSON metric payloads from a Kafka topic, scores every numeric
    field with a ZScoreDetector, and fans detected anomalies out to the
    registered alert sinks (log, Kafka output topic, pager stub).

    Fixes vs. the original:
    - run() now honors its duration_seconds parameter (it was ignored);
    - generate_report() no longer uses the removed polars
      ``DataFrame.groupby`` API (renamed ``group_by``) — plain counters
      produce the same report structure.
    """

    def __init__(self,
                 kafka_bootstrap: str = "localhost:9092",
                 input_topic: str = "metrics.stream",
                 output_topic: str = "anomalies.detected"):
        self.kafka_bootstrap = kafka_bootstrap
        self.input_topic = input_topic
        self.output_topic = output_topic

        self.detector = ZScoreDetector(
            window_size=200,
            threshold_warning=2.5,
            threshold_critical=3.5
        )

        # Every anomaly seen during run(), in arrival order.
        self.anomaly_history: List["AnomalyEvent"] = []
        self.running = False

        # Wire up the alert sinks once at construction time.
        self._setup_alerts()

    def _setup_alerts(self):
        """Register alert handlers: log, Kafka producer, PagerDuty stub."""
        # 1. Log every anomaly.
        self.detector.register_alert_handler(self._log_alert)

        # 2. Publish anomalies to Kafka for downstream consumers.
        self.alert_producer = KafkaProducer(
            bootstrap_servers=self.kafka_bootstrap,
            value_serializer=lambda v: json.dumps(v, default=str).encode('utf-8')
        )
        self.detector.register_alert_handler(self._kafka_alert)

        # 3. Simulated PagerDuty escalation for critical events.
        self.detector.register_alert_handler(self._pagerduty_alert)

    def _log_alert(self, event: "AnomalyEvent"):
        """Write the anomaly to the application log."""
        logger.warning(f"[{event.severity.upper()}] {event.metric_name}: {event.description} "
                      f"(Z-Score: {event.z_score:.2f})")

    def _kafka_alert(self, event: "AnomalyEvent"):
        """Publish the anomaly to the output topic (best effort)."""
        try:
            self.alert_producer.send(
                self.output_topic,
                key=event.metric_name.encode(),
                value={
                    'timestamp': event.timestamp.isoformat(),
                    'metric': event.metric_name,
                    'value': event.value,
                    'z_score': event.z_score,
                    'severity': event.severity,
                    'baseline_mean': event.expected_mean,
                    'baseline_std': event.expected_std
                }
            )
        except Exception as e:
            logger.error(f"Kafka告警发送失败: {e}")

    def _pagerduty_alert(self, event: "AnomalyEvent"):
        """Escalate critical anomalies (stub — logs instead of paging)."""
        if event.severity == 'critical':
            logger.critical(f"PAGERDUTY ALERT: {event.metric_name} CRITICAL - {event.description}")
            # A real implementation would call the PagerDuty Events API here.

    def run(self, duration_seconds: int = 60):
        """Consume the input topic and detect anomalies.

        Fix: the original ignored ``duration_seconds`` and looped until
        interrupted; the deadline is now honored. Ctrl-C still stops early.
        """
        self.running = True
        deadline = time.time() + duration_seconds

        consumer = KafkaConsumer(
            self.input_topic,
            bootstrap_servers=self.kafka_bootstrap,
            group_id=f"anomaly-detector-{int(time.time())}",
            auto_offset_reset='latest',
            value_deserializer=lambda m: json.loads(m.decode('utf-8'))
        )

        logger.info(f"启动异常检测: {self.input_topic} -> {self.output_topic}")

        try:
            while self.running and time.time() < deadline:
                messages = consumer.poll(timeout_ms=1000)

                for tp, records in messages.items():
                    for record in records:
                        data = record.value

                        # Every numeric field in the payload is treated as
                        # an independent metric stream.
                        for metric_name, value in data.items():
                            if isinstance(value, (int, float)):
                                # NOTE(review): record.timestamp is epoch millis;
                                # fromtimestamp() yields *local* time — confirm
                                # whether UTC was intended here.
                                anomaly = self.detector.process(
                                    metric_name=metric_name,
                                    value=value,
                                    timestamp=datetime.fromtimestamp(record.timestamp / 1000)
                                )

                                if anomaly:
                                    self.anomaly_history.append(anomaly)

        except KeyboardInterrupt:
            logger.info("停止检测")
        finally:
            consumer.close()
            self.alert_producer.close()
            self.running = False

    def generate_report(self) -> Dict:
        """Summarize detected anomalies.

        Report keys and row shapes match the original polars-based
        implementation, but are built with stdlib counters so the code no
        longer depends on the removed ``DataFrame.groupby`` API.
        """
        if not self.anomaly_history:
            return {'message': 'No anomalies detected'}

        severities = Counter(a.severity for a in self.anomaly_history)
        by_metric = Counter(a.metric_name for a in self.anomaly_history)
        by_hour = Counter(a.timestamp.hour for a in self.anomaly_history)

        return {
            'total_anomalies': len(self.anomaly_history),
            'critical_count': severities.get('critical', 0),
            'warning_count': severities.get('warning', 0),
            'by_metric': [{'metric': m, 'count': c} for m, c in by_metric.items()],
            'by_hour': [{'hour': h, 'count': c} for h, c in by_hour.items()],
            'avg_z_score': float(np.mean([abs(a.z_score) for a in self.anomaly_history]))
        }

    def visualize_detection(self, output_file: str = "anomaly_detection.png"):
        """Render a 2x2 report (timeline, per-metric counts, severity pie, summary)."""
        if not self.anomaly_history:
            logger.warning("无异常数据可可视化")
            return

        fig, axes = plt.subplots(2, 2, figsize=(14, 10))
        fig.suptitle('Real-time Anomaly Detection Report', fontsize=16, fontweight='bold')

        # Gather data once for all panels.
        anomalies = self.anomaly_history
        metrics = list(set(a.metric_name for a in anomalies))

        # 1. Anomaly timeline: |z| over time, colored by severity.
        ax1 = axes[0, 0]
        times = [a.timestamp for a in anomalies]
        z_scores = [abs(a.z_score) for a in anomalies]
        colors = ['red' if a.severity == 'critical' else 'orange' for a in anomalies]

        ax1.scatter(times, z_scores, c=colors, s=100, alpha=0.7)
        ax1.axhline(y=self.detector.threshold_warning, color='orange', linestyle='--', label='Warning')
        ax1.axhline(y=self.detector.threshold_critical, color='red', linestyle='--', label='Critical')
        ax1.set_xlabel('Time')
        ax1.set_ylabel('|Z-Score|')
        ax1.set_title('Anomaly Timeline')
        ax1.legend()

        # 2. Anomaly count per metric.
        ax2 = axes[0, 1]
        metric_counts = {}
        for a in anomalies:
            metric_counts[a.metric_name] = metric_counts.get(a.metric_name, 0) + 1

        ax2.bar(metric_counts.keys(), metric_counts.values(), color='#F44336')
        ax2.set_title('Anomalies by Metric')
        ax2.set_ylabel('Count')
        ax2.tick_params(axis='x', rotation=45)

        # 3. Severity split.
        ax3 = axes[1, 0]
        critical = len([a for a in anomalies if a.severity == 'critical'])
        warning = len([a for a in anomalies if a.severity == 'warning'])

        ax3.pie([warning, critical], labels=['Warning', 'Critical'],
               colors=['#FF9800', '#F44336'], autopct='%1.1f%%', startangle=90)
        ax3.set_title('Severity Distribution')

        # 4. Text summary panel.
        ax4 = axes[1, 1]
        report = self.generate_report()

        summary_text = f"""
        Detection Summary:
        
        Total Anomalies: {report['total_anomalies']}
        Critical: {report['critical_count']}
        Warning: {report['warning_count']}
        Avg Z-Score: {report['avg_z_score']:.2f}
        
        Configuration:
        Window Size: {self.detector.window_size}
        Warning Threshold: {self.detector.threshold_warning}
        Critical Threshold: {self.detector.threshold_critical}
        
        Metrics Monitored:
        {', '.join(metrics)}
        """

        ax4.text(0.1, 0.9, summary_text, transform=ax4.transAxes,
                fontsize=10, verticalalignment='top', family='monospace',
                bbox=dict(boxstyle='round', facecolor='lightblue', alpha=0.8))
        ax4.set_xlim(0, 1)
        ax4.set_ylim(0, 1)
        ax4.axis('off')
        ax4.set_title('Summary')

        plt.tight_layout()
        plt.savefig(output_file, dpi=150, bbox_inches='tight')
        logger.info(f"异常检测报告已保存: {output_file}")
        plt.show()


def simulate_anomaly_detection():
    """Offline demo: run the Z-score detector over synthetic data (no Kafka needed)."""
    print("=" * 60)
    print("3.4.3 Z-Score异常检测演示")
    print("=" * 60)
    
    detector = ZScoreDetector(
        window_size=50,
        threshold_warning=2.0,
        threshold_critical=3.0
    )
    
    # Capture every alert the detector raises.
    anomalies_captured = []
    detector.register_alert_handler(lambda e: anomalies_captured.append(e))
    
    # Generate a synthetic stream with injected anomalies.
    print("\n生成模拟数据流...")
    np.random.seed(42)
    
    data_stream = []
    for i in range(200):
        # Normal regime: mean 100, std 10.
        if i < 100 or (120 <= i < 150):
            value = np.random.normal(100, 10)
        elif 100 <= i < 110:
            # Injected anomaly: mean shifted to 150.
            value = np.random.normal(150, 15)
        else:
            # Injected anomaly: inflated variance
            # (covers i in [110, 120) and [150, 200)).
            value = np.random.normal(100, 50)
        
        data_stream.append({
            'timestamp': datetime(2024, 1, 1, 12, 0, 0) + timedelta(minutes=i),
            'value': value
        })
        
        # Feed the sample through the detector (updates window, then scores).
        detector.process('cpu_usage', value, data_stream[-1]['timestamp'])
    
    print(f"检测到 {len(anomalies_captured)} 个异常")
    
    for a in anomalies_captured[:5]:
        print(f"  [{a.severity}] {a.timestamp.strftime('%H:%M')}: "
              f"value={a.value:.1f}, z={a.z_score:.2f}")
    
    # Visualization: raw stream (top) and |Z-score| evolution (bottom).
    fig, axes = plt.subplots(2, 1, figsize=(12, 8))
    fig.suptitle('Z-Score Anomaly Detection Simulation', fontsize=14, fontweight='bold')
    
    # 1. Raw data stream.
    ax1 = axes[0]
    times = [d['timestamp'] for d in data_stream]
    values = [d['value'] for d in data_stream]
    
    ax1.plot(times, values, 'b-', alpha=0.7, label='Data')
    
    # Mark detected anomalies on top of the stream.
    for a in anomalies_captured:
        color = 'red' if a.severity == 'critical' else 'orange'
        ax1.scatter(a.timestamp, a.value, c=color, s=100, zorder=5)
    
    ax1.axhline(y=100, color='green', linestyle='--', alpha=0.5, label='Mean (100)')
    ax1.set_ylabel('Value')
    ax1.set_title('Data Stream with Anomalies')
    ax1.legend()
    
    # 2. Z-score trace.
    ax2 = axes[1]
    
    # Rebuild a Z-score history offline (simplified: a fresh 50-sample
    # window per point — approximates the detector's internal state).
    z_history = []
    for i, d in enumerate(data_stream):
        if i >= 30:  # only once enough samples exist (mirrors min_samples)
            window = [v['value'] for v in data_stream[max(0, i-49):i+1]]
            mean = np.mean(window)
            std = np.std(window)
            if std > 0:
                z = (d['value'] - mean) / std
                z_history.append(z)
            else:
                z_history.append(0)
        else:
            z_history.append(0)
    
    ax2.plot(times, [abs(z) for z in z_history], 'g-', label='|Z-Score|')
    ax2.axhline(y=detector.threshold_warning, color='orange', linestyle='--', label='Warning')
    ax2.axhline(y=detector.threshold_critical, color='red', linestyle='--', label='Critical')
    ax2.fill_between(times, detector.threshold_warning, 10, where=[abs(z) > detector.threshold_warning for z in z_history],
                     alpha=0.3, color='red')
    ax2.set_ylabel('|Z-Score|')
    ax2.set_title('Z-Score Evolution')
    ax2.legend()
    
    plt.tight_layout()
    plt.savefig('anomaly_detection_simulation.png', dpi=150)
    plt.show()


# Script entry point: run the offline simulation (no Kafka required).
if __name__ == "__main__":
    simulate_anomaly_detection()

3.4.4 latency_monitoring.py

脚本功能:端到端延迟监控,Kafka Lag监控与SLI设定。使用方式:python 3.4.4_latency_monitoring.py

复制代码
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
3.4.4 端到端延迟监控与SLI设定
功能:
- Kafka Consumer Lag监控
- 端到端处理延迟计算
- SLI/SLO定义与监控
- 延迟分布可视化与告警
使用方式:python 3.4.4_latency_monitoring.py
"""

import time
import json
import logging
from datetime import datetime, timedelta
from typing import Dict, List, Optional, Callable
from dataclasses import dataclass, asdict
from collections import deque
import threading

import numpy as np
import matplotlib.pyplot as plt
from kafka import KafkaConsumer, TopicPartition, KafkaAdminClient
from kafka.admin import NewTopic
import pandas as pd

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


@dataclass
class LatencyMetric:
    """One latency sample recorded for a pipeline stage."""
    timestamp: datetime            # when the sample was recorded
    stage: str                     # e.g. 'ingestion', 'processing', 'storage', 'total'
    latency_ms: float              # measured latency in milliseconds
    topic: Optional[str] = None    # Kafka topic, if applicable
    partition: Optional[int] = None
    offset: Optional[int] = None
    metadata: Optional[Dict] = None  # annotation fixed: default is None, not a Dict


@dataclass
class SLIConfig:
    """Latency SLI targets for one pipeline stage."""
    name: str              # convention: '<stage>_<suffix>' — the stage is parsed from the prefix
    target_p99_ms: float   # p99 latency target in milliseconds
    target_p95_ms: float   # p95 latency target in milliseconds
    target_avg_ms: float   # average latency target in milliseconds
    evaluation_window_minutes: int = 5  # informational only — not enforced in this file


class KafkaLagMonitor:
    """
    Kafka consumer-lag monitor.

    Polls end offsets and committed offsets for a topic and reports the
    per-partition and total lag of a consumer group.
    """
    
    def __init__(self, 
                 kafka_bootstrap: str = "localhost:9092",
                 consumer_group: str = "monitor-group"):
        self.kafka_bootstrap = kafka_bootstrap
        self.consumer_group = consumer_group
        self.admin_client = KafkaAdminClient(bootstrap_servers=kafka_bootstrap)
        # This consumer is bound to `consumer_group`; committed() below
        # therefore reads *this* group's offsets.
        self.consumer = KafkaConsumer(
            bootstrap_servers=kafka_bootstrap,
            group_id=consumer_group
        )
        self.lag_history: deque = deque(maxlen=1000)  # rolling record of lag snapshots
        self.running = False
    
    def get_current_lag(self, topic: str, group_id: Optional[str] = None) -> Dict:
        """
        Snapshot the current lag for `topic` and append it to lag_history.

        NOTE(review): `group_id` only labels the result — committed offsets
        are always read through self.consumer, which is bound to
        self.consumer_group. Passing a different group_id does not switch
        groups; confirm whether that is intended.
        """
        if group_id is None:
            group_id = self.consumer_group
        
        try:
            # Discover the topic's partitions.
            partitions = self.consumer.partitions_for_topic(topic)
            if not partitions:
                return {'error': 'No partitions found'}
            
            topic_partitions = [TopicPartition(topic, p) for p in partitions]
            
            # Log-end offsets: where producers have written up to.
            end_offsets = self.consumer.end_offsets(topic_partitions)
            
            # Committed offsets: where the consumer group has read up to.
            committed = {
                tp: self.consumer.committed(tp) for tp in topic_partitions
            }
            
            # Lag = end offset - committed offset, per partition.
            lag_per_partition = {}
            total_lag = 0
            
            for tp in topic_partitions:
                end_offset = end_offsets.get(tp, 0)
                committed_offset = committed.get(tp)
                
                if committed_offset is not None:
                    lag = end_offset - committed_offset
                else:
                    lag = end_offset  # group has never committed on this partition
                
                lag_per_partition[tp.partition] = {
                    'end_offset': end_offset,
                    'committed_offset': committed_offset if committed_offset else 0,
                    'lag': lag
                }
                total_lag += lag
            
            result = {
                'topic': topic,
                'group_id': group_id,
                'timestamp': datetime.utcnow().isoformat(),
                'total_lag': total_lag,
                'partitions': lag_per_partition,
                'max_lag_partition': max(lag_per_partition.items(), key=lambda x: x[1]['lag'])[0]
            }
            
            self.lag_history.append(result)
            return result
            
        except Exception as e:
            logger.error(f"获取Lag失败: {e}")
            return {'error': str(e)}
    
    def monitor_continuous(self, 
                          topics: List[str],
                          interval_seconds: int = 10,
                          callback: Optional[Callable] = None):
        """
        Start a daemon thread that snapshots lag for `topics` every
        `interval_seconds`, forwarding each snapshot to `callback` (or
        logging total lag when no callback is given). Returns the thread.
        """
        self.running = True
        
        def monitor_thread():
            # Loop until stop() clears self.running; being a daemon thread,
            # it also dies with the process.
            while self.running:
                for topic in topics:
                    lag_info = self.get_current_lag(topic)
                    if callback:
                        callback(lag_info)
                    else:
                        if 'total_lag' in lag_info:
                            logger.info(f"[{topic}] Lag: {lag_info['total_lag']}")
                time.sleep(interval_seconds)
        
        thread = threading.Thread(target=monitor_thread, daemon=True)
        thread.start()
        return thread
    
    def stop(self):
        # Signal the monitor loop to exit and release Kafka client resources.
        self.running = False
        self.consumer.close()
        self.admin_client.close()


class EndToEndLatencyTracker:
    """
    End-to-end latency tracker.

    Keeps bounded windows of per-stage latency samples and evaluates them
    against registered SLI targets.

    Fixes vs. the original:
    - the mutable default list argument of calculate_percentiles was
      replaced by a None sentinel;
    - record_stage_latency no longer raises KeyError for unknown stages
      (a bounded window is created lazily);
    - Optional annotations corrected; SLIConfig annotations are string
      forward references, removing the definition-order dependency.
    """

    def __init__(self):
        # Bounded sample windows per pipeline stage. Unknown stages are
        # created lazily by record_stage_latency().
        self.latency_stages: Dict[str, deque] = {
            'ingestion': deque(maxlen=1000),
            'processing': deque(maxlen=1000),
            'storage': deque(maxlen=1000),
            'total': deque(maxlen=1000)
        }
        # SLIs keyed by name; the stage is parsed from the name's prefix.
        self.sli_configs: Dict[str, "SLIConfig"] = {}

    def add_sli(self, config: "SLIConfig"):
        """Register an SLI evaluated by check_sli_compliance()."""
        self.sli_configs[config.name] = config

    def record_stage_latency(self,
                            stage: str,
                            latency_ms: float,
                            metadata: Optional[Dict] = None):
        """
        Record one latency sample for `stage`.

        Fix: an unknown stage previously raised KeyError; a bounded window
        is now created on first use.
        """
        metric = LatencyMetric(
            timestamp=datetime.utcnow(),
            stage=stage,
            latency_ms=latency_ms,
            metadata=metadata
        )
        self.latency_stages.setdefault(stage, deque(maxlen=1000)).append(metric)

    def record_total_latency(self,
                            produce_time: datetime,
                            consume_time: datetime,
                            metadata: Optional[Dict] = None):
        """
        Record the end-to-end (produce-to-consume) latency.

        Both datetimes must share the same timezone-awareness; subtracting
        naive from aware values raises TypeError.
        """
        latency_ms = (consume_time - produce_time).total_seconds() * 1000

        metric = LatencyMetric(
            timestamp=consume_time,
            stage='total',
            latency_ms=latency_ms,
            metadata=metadata
        )
        self.latency_stages['total'].append(metric)

    def calculate_percentiles(self, stage: str, percentiles: Optional[List[int]] = None) -> Dict:
        """
        Return {'pN': value} for the requested percentiles of `stage`,
        or {} when the stage is unknown or empty.

        Fix: the mutable default list was replaced by a None sentinel
        (defaults to [50, 95, 99]).
        """
        if percentiles is None:
            percentiles = [50, 95, 99]
        if stage not in self.latency_stages or not self.latency_stages[stage]:
            return {}

        values = [m.latency_ms for m in self.latency_stages[stage]]

        return {
            f'p{p}': np.percentile(values, p) for p in percentiles
        }

    def check_sli_compliance(self) -> Dict:
        """
        Evaluate every registered SLI against the observed latencies.

        The stage is taken from the SLI name's prefix before the first '_'
        (e.g. 'total_e2e' -> 'total'); SLIs whose stage has no data are
        skipped.
        """
        results = {}

        for name, config in self.sli_configs.items():
            stage = name.split('_')[0]  # simplification: SLI name prefix == stage

            if stage not in self.latency_stages:
                continue

            percentiles = self.calculate_percentiles(stage)

            if not percentiles:
                continue

            p99 = percentiles.get('p99', 0)
            p95 = percentiles.get('p95', 0)
            avg = np.mean([m.latency_ms for m in self.latency_stages[stage]])

            results[name] = {
                'target_p99': config.target_p99_ms,
                'actual_p99': p99,
                'p99_compliant': p99 <= config.target_p99_ms,
                'target_p95': config.target_p95_ms,
                'actual_p95': p95,
                'p95_compliant': p95 <= config.target_p95_ms,
                'target_avg': config.target_avg_ms,
                'actual_avg': avg,
                'avg_compliant': avg <= config.target_avg_ms
            }

        return results

    def visualize_latency(self, output_file: str = "latency_monitoring.png"):
        """Render a 2x2 dashboard (distribution, trend, SLI bars, summary) to `output_file`."""
        if not any(self.latency_stages.values()):
            logger.warning("无延迟数据可可视化")
            return

        fig, axes = plt.subplots(2, 2, figsize=(14, 10))
        fig.suptitle('End-to-End Latency Monitoring & SLI Compliance', fontsize=16, fontweight='bold')

        # 1. Per-stage latency distribution (box plot).
        ax1 = axes[0, 0]
        stage_data = []
        stage_labels = []

        for stage, metrics in self.latency_stages.items():
            if metrics:
                values = [m.latency_ms for m in metrics]
                stage_data.append(values)
                stage_labels.append(stage)

        if stage_data:
            bp = ax1.boxplot(stage_data, labels=stage_labels, patch_artist=True)
            colors = ['#2E86AB', '#A23B72', '#F18F01', '#C73E1D']
            for patch, color in zip(bp['boxes'], colors):
                patch.set_facecolor(color)
                patch.set_alpha(0.7)
            ax1.set_ylabel('Latency (ms)')
            ax1.set_title('Latency Distribution by Stage')
            ax1.tick_params(axis='x', rotation=45)

        # 2. Time series of the last 100 end-to-end samples.
        ax2 = axes[0, 1]
        total_metrics = list(self.latency_stages['total'])[-100:]

        if total_metrics:
            times = [m.timestamp for m in total_metrics]
            latencies = [m.latency_ms for m in total_metrics]

            ax2.plot(times, latencies, 'b-', alpha=0.7)

            # Overlay the SLI target lines, when configured.
            if 'total_e2e' in self.sli_configs:
                config = self.sli_configs['total_e2e']
                ax2.axhline(y=config.target_p99_ms, color='red', linestyle='--', 
                           label=f'P99 Target ({config.target_p99_ms}ms)')
                ax2.axhline(y=config.target_p95_ms, color='orange', linestyle='--',
                           label=f'P95 Target ({config.target_p95_ms}ms)')

            ax2.set_xlabel('Time')
            ax2.set_ylabel('Latency (ms)')
            ax2.set_title('End-to-End Latency Trend')
            ax2.legend()
            ax2.tick_params(axis='x', rotation=45)

        # 3. Target vs. actual P99 per SLI.
        ax3 = axes[1, 0]
        sli_results = self.check_sli_compliance()

        if sli_results:
            stages = list(sli_results.keys())
            targets_p99 = [sli_results[s]['target_p99'] for s in stages]
            actuals_p99 = [sli_results[s]['actual_p99'] for s in stages]

            x = np.arange(len(stages))
            width = 0.35

            ax3.bar(x - width/2, targets_p99, width, label='Target', color='#4CAF50', alpha=0.7)
            # Actual bars are green when compliant, red when breaching.
            ax3.bar(x + width/2, actuals_p99, width, label='Actual', 
                   color=['#4CAF50' if sli_results[s]['p99_compliant'] else '#F44336' for s in stages])

            ax3.set_ylabel('Latency (ms)')
            ax3.set_title('SLI Compliance: P99 Latency')
            ax3.set_xticks(x)
            ax3.set_xticklabels(stages, rotation=45)
            ax3.legend()

        # 4. Text summary of SLI compliance.
        ax4 = axes[1, 1]

        if sli_results:
            summary_text = "SLI Compliance Report:\n\n"

            for name, result in sli_results.items():
                status = "✓ PASS" if (result['p99_compliant'] and result['p95_compliant']) else "✗ FAIL"
                summary_text += f"{name}: {status}\n"
                summary_text += f"  P99: {result['actual_p99']:.1f}ms (target: {result['target_p99']}ms)\n"
                summary_text += f"  P95: {result['actual_p95']:.1f}ms (target: {result['target_p95']}ms)\n"
                summary_text += f"  Avg: {result['actual_avg']:.1f}ms (target: {result['target_avg']}ms)\n\n"
        else:
            summary_text = "No SLI data available"

        ax4.text(0.1, 0.9, summary_text, transform=ax4.transAxes,
                fontsize=10, verticalalignment='top', family='monospace',
                bbox=dict(boxstyle='round', facecolor='lightblue', alpha=0.8))
        ax4.set_xlim(0, 1)
        ax4.set_ylim(0, 1)
        ax4.axis('off')
        ax4.set_title('SLI Summary')

        plt.tight_layout()
        plt.savefig(output_file, dpi=150, bbox_inches='tight')
        logger.info(f"延迟监控报告已保存: {output_file}")
        plt.show()


class LatencyAlertManager:
    """
    Rule-based latency alerting.

    Each rule pairs a name with a predicate over a latency value; the
    first matching rule produces an alert record, which is both kept in
    a bounded history and returned to the caller.
    """

    def __init__(self):
        self.alert_rules: List[Dict] = []
        self.alert_history: deque = deque(maxlen=100)

    def add_rule(self, 
                 name: str,
                 condition: Callable[[float], bool],
                 severity: str = 'warning'):
        """Append a rule evaluated (in insertion order) by check_latency."""
        rule = {'name': name, 'condition': condition, 'severity': severity}
        self.alert_rules.append(rule)

    def check_latency(self, latency_ms: float, context: Dict = None):
        """Evaluate rules in order; return the first triggered alert, else None."""
        for rule in self.alert_rules:
            if not rule['condition'](latency_ms):
                continue
            alert = {
                'timestamp': datetime.utcnow().isoformat(),
                'rule': rule['name'],
                'severity': rule['severity'],
                'latency_ms': latency_ms,
                'context': context,
            }
            self.alert_history.append(alert)
            logger.warning(f"[{rule['severity'].upper()}] {rule['name']}: {latency_ms}ms")
            return alert
        return None


def demonstrate_latency_monitoring():
    """Demo: end-to-end latency tracking, SLI compliance checks and alerting."""
    banner = "=" * 60
    print(banner)
    print("3.4.4 端到端延迟监控与SLI设定")
    print(banner)

    # Tracker with two SLIs, configured from a small data table.
    tracker = EndToEndLatencyTracker()
    for sli_name, p99, p95, avg in (
        ("ingestion_processing", 500, 300, 200),
        ("total_e2e", 1000, 800, 500),
    ):
        tracker.add_sli(SLIConfig(
            name=sli_name,
            target_p99_ms=p99,
            target_p95_ms=p95,
            target_avg_ms=avg
        ))

    # Synthetic samples: exponential base latency with periodic spikes.
    print("\n生成模拟延迟数据...")
    np.random.seed(42)

    stage_weights = {'ingestion': 0.3, 'processing': 0.5, 'storage': 0.2}
    for step in range(200):
        sample = np.random.exponential(150)  # long-tailed distribution
        if step % 50 == 0:
            sample *= 3  # occasional spike (simulated GC / network jitter)

        for stage_name, weight in stage_weights.items():
            tracker.record_stage_latency(stage_name, sample * weight)
        tracker.record_total_latency(
            datetime.utcnow() - timedelta(milliseconds=sample),
            datetime.utcnow()
        )

        time.sleep(0.01)  # simulate wall-clock progression

    # Per-stage percentile report.
    print("\n延迟分位数统计:")
    for stage_name in ('ingestion', 'processing', 'storage', 'total'):
        pct = tracker.calculate_percentiles(stage_name)
        if not pct:
            continue
        print(f"  {stage_name}:")
        print(f"    P50: {pct['p50']:.1f}ms")
        print(f"    P95: {pct['p95']:.1f}ms")
        print(f"    P99: {pct['p99']:.1f}ms")

    # SLI compliance report.
    print("\nSLI合规检查:")
    for sli_name, result in tracker.check_sli_compliance().items():
        compliant = result['p99_compliant'] and result['p95_compliant']
        status = "✓ 合规" if compliant else "✗ 不合规"
        print(f"  {sli_name}: {status}")
        print(f"    P99目标: {result['target_p99']}ms, 实际: {result['actual_p99']:.1f}ms")

    tracker.visualize_latency()

    # Alerting demo: two thresholds; the first matching rule wins.
    print("\n--- 延迟告警演示 ---")
    alert_manager = LatencyAlertManager()
    alert_manager.add_rule(
        "High P99 Latency",
        lambda x: x > 800,
        severity='critical'
    )
    alert_manager.add_rule(
        "Degraded Performance",
        lambda x: x > 500,
        severity='warning'
    )

    for sample_ms in (400, 600, 900, 300):
        fired = alert_manager.check_latency(sample_ms, {'stage': 'test'})
        if fired:
            print(f"  触发告警: {fired['rule']} ({fired['severity']}) - {sample_ms}ms")


if __name__ == "__main__":
    # Script entry point: run the full latency-monitoring demonstration.
    demonstrate_latency_monitoring()

3.5 运维与优化实现

3.5.2 k8s_deployment.yaml

脚本功能:Kubernetes部署配置,StatefulSet管理有状态消费者。使用方式:kubectl apply -f 3.5.2_k8s_deployment.yaml

复制代码
# 3.5.2 Kubernetes deployment manifests
# Purpose: StatefulSet-managed Kafka consumers with persistent state volumes
# Usage: kubectl apply -f 3.5.2_k8s_deployment.yaml

---
# Namespace: isolates every pipeline resource under one logical boundary
apiVersion: v1
kind: Namespace
metadata:
  name: data-pipeline
  labels:
    name: data-pipeline

---
# ConfigMap: non-sensitive application configuration (injected via envFrom below)
apiVersion: v1
kind: ConfigMap
metadata:
  name: pipeline-config
  namespace: data-pipeline
data:
  KAFKA_BOOTSTRAP: "kafka:9092"
  SCHEMA_REGISTRY_URL: "http://schema-registry:8081"
  DELTA_LAKE_PATH: "/data/delta"
  LOG_LEVEL: "INFO"
  POLARS_THREADS: "4"

---
# Secret: sensitive credentials
# NOTE(review): these are hard-coded demo values — in production source them
# from an external secret manager rather than committing them to the manifest.
apiVersion: v1
kind: Secret
metadata:
  name: pipeline-secrets
  namespace: data-pipeline
type: Opaque
stringData:
  KAFKA_SASL_USERNAME: "pipeline-user"
  KAFKA_SASL_PASSWORD: "pipeline-password"
  AWS_ACCESS_KEY_ID: "minioadmin"
  AWS_SECRET_ACCESS_KEY: "minioadmin"

---
# ServiceAccount: identity the processor pods run as
apiVersion: v1
kind: ServiceAccount
metadata:
  name: pipeline-sa
  namespace: data-pipeline

---
# RBAC: read-only access to core objects and statefulsets in this namespace
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  name: pipeline-role
  namespace: data-pipeline
rules:
- apiGroups: [""]
  resources: ["pods", "services", "configmaps"]
  verbs: ["get", "list", "watch"]
- apiGroups: ["apps"]
  resources: ["statefulsets"]
  verbs: ["get", "list"]

---
# RoleBinding: grants pipeline-role to the pipeline-sa ServiceAccount
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  name: pipeline-rolebinding
  namespace: data-pipeline
subjects:
- kind: ServiceAccount
  name: pipeline-sa
  namespace: data-pipeline
roleRef:
  kind: Role
  name: pipeline-role
  apiGroup: rbac.authorization.k8s.io

---
# Headless Service (clusterIP: None): gives each StatefulSet pod a stable DNS name
apiVersion: v1
kind: Service
metadata:
  name: pipeline-consumer
  namespace: data-pipeline
  labels:
    app: pipeline-consumer
spec:
  ports:
  - port: 8080
    name: metrics
  - port: 8081
    name: health
  clusterIP: None
  selector:
    app: pipeline-consumer
---
# StatefulSet: stateful stream processor (per-replica durable state volume)
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: polars-stream-processor
  namespace: data-pipeline
spec:
  serviceName: pipeline-consumer
  replicas: 3
  selector:
    matchLabels:
      app: pipeline-consumer
  podManagementPolicy: Parallel  # start pods in parallel for faster availability
  updateStrategy:
    type: RollingUpdate
    rollingUpdate:
      partition: 0
  template:
    metadata:
      labels:
        app: pipeline-consumer
        version: v3.2.1
      annotations:
        # Prometheus scrape annotations: metrics served on :8080/metrics
        prometheus.io/scrape: "true"
        prometheus.io/port: "8080"
        prometheus.io/path: "/metrics"
    spec:
      serviceAccountName: pipeline-sa
      affinity:
        # Soft anti-affinity: prefer spreading replicas across nodes
        podAntiAffinity:
          preferredDuringSchedulingIgnoredDuringExecution:
          - weight: 100
            podAffinityTerm:
              labelSelector:
                matchExpressions:
                - key: app
                  operator: In
                  values:
                  - pipeline-consumer
              topologyKey: kubernetes.io/hostname
      containers:
      - name: processor
        # NOTE(review): ':latest' with imagePullPolicy Always hurts
        # reproducibility — pin an immutable tag/digest for production.
        image: data-pipeline/polars-processor:latest
        imagePullPolicy: Always
        ports:
        - containerPort: 8080
          name: metrics
        - containerPort: 8081
          name: health
        env:
        # Downward API: expose pod identity for per-instance consumer naming
        - name: POD_NAME
          valueFrom:
            fieldRef:
              fieldPath: metadata.name
        - name: POD_NAMESPACE
          valueFrom:
            fieldRef:
              fieldPath: metadata.namespace
        - name: CONSUMER_GROUP
          value: "polars-stream-group"
        - name: INSTANCE_ID
          value: "$(POD_NAME)"
        envFrom:
        - configMapRef:
            name: pipeline-config
        - secretRef:
            name: pipeline-secrets
        resources:
          requests:
            memory: "2Gi"
            cpu: "1000m"
          limits:
            memory: "4Gi"
            cpu: "2000m"
        livenessProbe:
          httpGet:
            path: /health/live
            port: 8081
          initialDelaySeconds: 30
          periodSeconds: 10
          timeoutSeconds: 5
          failureThreshold: 3
        readinessProbe:
          httpGet:
            path: /health/ready
            port: 8081
          initialDelaySeconds: 10
          periodSeconds: 5
          timeoutSeconds: 3
          failureThreshold: 3
        startupProbe:
          httpGet:
            path: /health/startup
            port: 8081
          initialDelaySeconds: 10
          periodSeconds: 5
          failureThreshold: 12  # allows up to 60s of startup (12 x 5s)
        volumeMounts:
        - name: state-data
          mountPath: /data/state
        - name: tmp
          mountPath: /tmp
        securityContext:
          # Hardened container: non-root, read-only rootfs, no extra capabilities
          allowPrivilegeEscalation: false
          readOnlyRootFilesystem: true
          runAsNonRoot: true
          runAsUser: 1000
          capabilities:
            drop:
            - ALL
      volumes:
      - name: tmp
        emptyDir: {}
      terminationGracePeriodSeconds: 60  # graceful-shutdown window for offset commits
  volumeClaimTemplates:
  # Per-replica PVC 'state-data' providing durable local state
  - metadata:
      name: state-data
    spec:
      accessModes: ["ReadWriteOnce"]
      storageClassName: "standard"  # adjust to the target environment
      resources:
        requests:
          storage: 10Gi

---
# Deployment: stateless API service (webhook receiver)
apiVersion: apps/v1
kind: Deployment
metadata:
  name: webhook-receiver
  namespace: data-pipeline
spec:
  replicas: 2
  selector:
    matchLabels:
      app: webhook-receiver
  template:
    metadata:
      labels:
        app: webhook-receiver
    spec:
      containers:
      - name: webhook
        image: data-pipeline/webhook:latest
        ports:
        - containerPort: 8000
        env:
        - name: KAFKA_BOOTSTRAP
          valueFrom:
            configMapKeyRef:
              name: pipeline-config
              key: KAFKA_BOOTSTRAP
        resources:
          requests:
            memory: "512Mi"
            cpu: "250m"
          limits:
            memory: "1Gi"
            cpu: "500m"
        livenessProbe:
          httpGet:
            path: /health
            port: 8000
          initialDelaySeconds: 10
          periodSeconds: 10

---
# Service: load balancer in front of the webhook receivers
apiVersion: v1
kind: Service
metadata:
  name: webhook-service
  namespace: data-pipeline
spec:
  type: LoadBalancer
  ports:
  - port: 80
    targetPort: 8000
  selector:
    app: webhook-receiver

---
# CronJob: periodic Delta Lake VACUUM and OPTIMIZE maintenance
apiVersion: batch/v1
kind: CronJob
metadata:
  name: delta-maintenance
  namespace: data-pipeline
spec:
  schedule: "0 2 * * *"  # daily at 02:00
  concurrencyPolicy: Forbid
  jobTemplate:
    spec:
      template:
        spec:
          containers:
          - name: maintenance
            image: data-pipeline/maintenance:latest
            command:
            - python
            - /app/maintenance.py
            - --vacuum
            - --optimize
            env:
            - name: DELTA_LAKE_PATH
              value: "/data/delta"
            volumeMounts:
            - name: delta-storage
              mountPath: /data
          volumes:
          - name: delta-storage
            persistentVolumeClaim:
              # NOTE(review): 'delta-pvc' is not defined anywhere in this
              # manifest — confirm it is created elsewhere before applying.
              claimName: delta-pvc
          restartPolicy: OnFailure

3.5.3 keda_autoscaling.yaml

脚本功能:KEDA基于Kafka Lag的HPA自动扩缩容配置。使用方式:kubectl apply -f 3.5.3_keda_autoscaling.yaml

复制代码
# 3.5.3 KEDA autoscaling configuration
# Purpose: automatic horizontal scaling driven by Kafka consumer lag
# Usage: kubectl apply -f 3.5.3_keda_autoscaling.yaml
# Prerequisite: install KEDA (kubectl apply -f https://github.com/kedacore/keda/releases/download/v2.12.0/keda-2.12.0.yaml)

---
# TriggerAuthentication: Kafka credentials for the KEDA scaler
apiVersion: keda.sh/v1alpha1
kind: TriggerAuthentication
metadata:
  name: kafka-trigger-auth
  namespace: data-pipeline
spec:
  secretTargetRef:
  # NOTE(review): per the KEDA Kafka scaler docs, the 'sasl' parameter should
  # carry the SASL mechanism (e.g. "plaintext"), but here it is mapped to the
  # username key — verify before relying on authentication.
  - parameter: sasl
    name: pipeline-secrets
    key: KAFKA_SASL_USERNAME
  - parameter: username
    name: pipeline-secrets
    key: KAFKA_SASL_USERNAME
  - parameter: password
    name: pipeline-secrets
    key: KAFKA_SASL_PASSWORD

---
# ScaledObject: KEDA scaling policy for the stream processor
apiVersion: keda.sh/v1alpha1
kind: ScaledObject
metadata:
  name: polars-stream-processor-scaler
  namespace: data-pipeline
spec:
  scaleTargetRef:
    name: polars-stream-processor  # matches the StatefulSet name
    kind: StatefulSet
    apiVersion: apps/v1
  pollingInterval: 30              # metric poll interval (seconds)
  cooldownPeriod: 300              # scale-down cooldown (seconds)
  minReplicaCount: 1               # lower replica bound
  maxReplicaCount: 10              # upper replica bound
  advanced:
    restoreToOriginalReplicaCount: false
    horizontalPodAutoscalerConfig:
      behavior:
        scaleDown:
          stabilizationWindowSeconds: 300  # 5-minute stabilization window
          policies:
          - type: Percent
            value: 50
            periodSeconds: 60
        scaleUp:
          stabilizationWindowSeconds: 0
          policies:
          - type: Percent
            value: 100
            periodSeconds: 15
          - type: Pods
            value: 4
            periodSeconds: 15
          selectPolicy: Max
  triggers:
  # Trigger 1: Kafka consumer lag
  - type: kafka
    metadata:
      bootstrapServers: kafka:9092
      consumerGroup: polars-stream-group
      topic: raw.sensor.data
      lagThreshold: "100"           # target lag per replica
      activationLagThreshold: "10"  # activation threshold
      offsetResetPolicy: latest
    authenticationRef:
      name: kafka-trigger-auth

  # Trigger 2: CPU utilization (fallback)
  - type: cpu
    metadata:
      type: Utilization
      value: "70"

  # Trigger 3: memory utilization (guardrail)
  - type: memory
    metadata:
      type: Utilization
      value: "80"

---
# ScaledJob: batch-job scaling (Delta Lake compaction and similar tasks)
apiVersion: keda.sh/v1alpha1
kind: ScaledJob
metadata:
  name: delta-optimize-job
  namespace: data-pipeline
spec:
  jobTargetRef:
    parallelism: 1
    completions: 1
    activeDeadlineSeconds: 600
    backoffLimit: 2
    template:
      spec:
        containers:
        - name: optimizer
          image: data-pipeline/delta-optimizer:latest
          command: ["python", "/app/optimize.py"]
          env:
          - name: TABLE_PATH
            value: "/data/delta/silver"
          resources:
            requests:
              memory: "1Gi"
              cpu: "500m"
            limits:
              memory: "2Gi"
              cpu: "1000m"
        restartPolicy: OnFailure
  pollingInterval: 300  # check every 5 minutes
  successfulJobsHistoryLimit: 3
  failedJobsHistoryLimit: 1
  triggers:
  - type: cron
    metadata:
      timezone: Asia/Shanghai
      start: 0 2 * * *    # fire daily at 02:00
      end: 0 4 * * *
      desiredReplicas: "1"

---
# ScaledObject: scale the webhook receiver on a custom Prometheus metric
apiVersion: keda.sh/v1alpha1
kind: ScaledObject
metadata:
  name: webhook-receiver-scaler
  namespace: data-pipeline
spec:
  scaleTargetRef:
    name: webhook-receiver
    kind: Deployment
  minReplicaCount: 2
  maxReplicaCount: 20
  triggers:
  - type: prometheus
    metadata:
      serverAddress: http://prometheus.monitoring:9090
      metricName: http_requests_per_second
      threshold: "100"
      query: |
        sum(rate(http_requests_total{service="webhook-service"}[2m]))

3.5.4 monitoring.py

脚本功能:Prometheus指标采集与PagerDuty告警集成。使用方式:python 3.5.4_monitoring.py

复制代码
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
3.5.4 监控告警系统
功能:
- Prometheus指标暴露(Counter, Gauge, Histogram)
- 业务自定义指标
- PagerDuty告警集成
- Grafana Dashboard JSON生成
使用方式:python 3.5.4_monitoring.py
"""

import time
import random
import logging
from typing import Dict, List, Optional, Callable
from dataclasses import dataclass
from datetime import datetime
import threading
import json

# Mock Prometheus client classes (real code should use the prometheus_client library)
class Counter:
    """Minimal stand-in for prometheus_client.Counter (demo only).

    Tracks one global running total plus a per-label-combination total,
    keyed by the tuple of label values in declaration order.
    """

    def __init__(self, name: str, description: str, labels: List[str] = None):
        self.name = name
        self.description = description
        self.value = 0
        self.label_values = {}
        self.labels = labels or []

    def inc(self, amount: int = 1, **labels):
        """Increase the counter by `amount`, globally and for the given labels."""
        self.value = self.value + amount
        series = tuple(labels.get(label_name, '') for label_name in self.labels)
        self.label_values[series] = self.label_values.get(series, 0) + amount

class Gauge:
    """Minimal stand-in for prometheus_client.Gauge (demo only)."""

    def __init__(self, name: str, description: str, labels: List[str] = None):
        self.name = name
        self.description = description
        self.values = {}
        self.labels = labels or []

    def _series(self, labels: Dict) -> tuple:
        # Label values in declaration order; missing labels become ''.
        return tuple(labels.get(label_name, '') for label_name in self.labels)

    def set(self, value: float, **labels):
        """Overwrite the gauge value for the given label combination."""
        self.values[self._series(labels)] = value

    def inc(self, amount: float = 1, **labels):
        """Add `amount` (may be negative) to the gauge for the given labels."""
        series = self._series(labels)
        self.values[series] = self.values.get(series, 0) + amount

    def dec(self, amount: float = 1, **labels):
        """Subtract `amount` from the gauge for the given labels."""
        self.inc(-amount, **labels)

class Histogram:
    """Minimal stand-in for prometheus_client.Histogram (demo only).

    Stores raw observations; quantiles are computed on demand by sorting.
    """

    def __init__(self, name: str, description: str, buckets: List[float] = None):
        self.name = name
        self.description = description
        self.observations = []
        self.buckets = buckets or [.005, .01, .025, .05, .075, .1, .25, .5, .75, 1.0, 2.5, 5.0, 7.5, 10.0]

    def observe(self, value: float):
        """Record one sample."""
        self.observations.append(value)

    def get_quantile(self, q: float) -> float:
        """Empirical q-quantile (nearest rank); returns 0 with no samples."""
        if not self.observations:
            return 0
        ordered = sorted(self.observations)
        rank = min(int(len(ordered) * q), len(ordered) - 1)
        return ordered[rank]


class PrometheusMetricsExporter:
    """
    Prometheus text-format exporter (demo).
    Real deployments should use prometheus_client.start_http_server().
    """

    def __init__(self, port: int = 8000):
        self.port = port
        self.metrics = {}
        self.registry = []

    def register(self, metric):
        """Register a metric and return it (handy for inline assignment)."""
        self.metrics[metric.name] = metric
        self.registry.append(metric)
        return metric

    def generate_latest(self) -> str:
        """Render every registered metric in Prometheus exposition format."""
        lines = []

        for metric in self.registry:
            lines.append(f"# HELP {metric.name} {metric.description}")

            if isinstance(metric, Counter):
                lines.append(f"# TYPE {metric.name} counter")
                if not metric.labels:
                    lines.append(f"{metric.name} {metric.value}")
                else:
                    for series, total in metric.label_values.items():
                        pairs = zip(metric.labels, series)
                        rendered = ','.join(f'{label}="{value}"' for label, value in pairs)
                        lines.append(f'{metric.name}{{{rendered}}} {total}')

            elif isinstance(metric, Gauge):
                lines.append(f"# TYPE {metric.name} gauge")
                if hasattr(metric, 'values'):
                    for series, current in metric.values.items():
                        if not metric.labels:
                            lines.append(f"{metric.name} {current}")
                        else:
                            pairs = zip(metric.labels, series)
                            rendered = ','.join(f'{label}="{value}"' for label, value in pairs)
                            lines.append(f'{metric.name}{{{rendered}}} {current}')

            elif isinstance(metric, Histogram):
                lines.append(f"# TYPE {metric.name} histogram")
                # Simplified: only _count and _sum, no per-bucket series.
                lines.append(f"{metric.name}_count {len(metric.observations)}")
                if metric.observations:
                    lines.append(f"{metric.name}_sum {sum(metric.observations)}")

        return '\n'.join(lines)


class PagerDutyClient:
    """
    PagerDuty alerting client (simulated).

    Tracks open incidents keyed by dedup_key; a real implementation would
    POST the payload to the PagerDuty Events API v2.
    """

    def __init__(self, routing_key: str, service_key: Optional[str] = None):
        self.routing_key = routing_key
        self.service_key = service_key
        self.incident_counter = 0
        # dedup_key -> incident_key for currently open incidents
        self.active_incidents: Dict[str, str] = {}

    def trigger_incident(self,
                        summary: str,
                        severity: str = 'critical',
                        source: str = 'data-pipeline',
                        dedup_key: Optional[str] = None,
                        custom_details: Dict = None) -> str:
        """
        Open (or re-trigger) an incident and return its incident key.

        severity: critical, error, warning, info
        """
        self.incident_counter += 1

        if dedup_key is None:
            dedup_key = f"incident-{self.incident_counter}"

        # Simulated API call — a real client would send the payload below
        # to the PagerDuty Events API v2.
        today = datetime.now().strftime('%Y%m%d')
        incident_key = f"PD-{today}-{self.incident_counter}"
        self.active_incidents[dedup_key] = incident_key

        payload = {
            "routing_key": self.routing_key,
            "event_action": "trigger",
            "dedup_key": dedup_key,
            "payload": {
                "summary": summary,
                "severity": severity,
                "source": source,
                "timestamp": datetime.utcnow().isoformat(),
                "custom_details": custom_details or {}
            }
        }

        logging.warning(f"[PAGERDUTY] Incident triggered: {summary} (severity: {severity})")
        return incident_key

    def resolve_incident(self, dedup_key: str):
        """Close an open incident; unknown keys are ignored."""
        if dedup_key not in self.active_incidents:
            return
        del self.active_incidents[dedup_key]
        logging.info(f"[PAGERDUTY] Incident resolved: {dedup_key}")

    def acknowledge_incident(self, dedup_key: str):
        """Mark an incident as acknowledged (log-only in this simulation)."""
        logging.info(f"[PAGERDUTY] Incident acknowledged: {dedup_key}")


class MonitoringDashboard:
    """
    Monitoring dashboard builder.

    Accumulates panel definitions and emits a Grafana-importable
    dashboard JSON document.
    """

    def __init__(self, title: str = "Data Pipeline Dashboard"):
        self.title = title
        self.panels = []

    def add_panel(self,
                  title: str,
                  targets: List[Dict],
                  type_: str = "timeseries",
                  grid_pos: Dict = None):
        """Append a panel and return self so calls can be chained.

        Args:
            title: panel title.
            targets: list of Grafana target dicts (e.g. {"expr": <PromQL>}).
            type_: Grafana panel type (timeseries, stat, ...).
            grid_pos: optional gridPos dict; defaults to a half-width row.
        """
        panel = {
            "id": len(self.panels) + 1,  # 1-based sequential panel ids
            "title": title,
            "type": type_,
            "targets": targets,
            "gridPos": grid_pos or {"h": 8, "w": 12, "x": 0, "y": 0}
        }
        self.panels.append(panel)
        return self

    def generate_grafana_json(self) -> Dict:
        """Build the Grafana dashboard payload (HTTP-API import format)."""
        return {
            "dashboard": {
                "id": None,
                "uid": "data-pipeline-overview",
                "title": self.title,
                "tags": ["kafka", "polars", "delta"],
                "timezone": "browser",
                "schemaVersion": 36,
                "refresh": "30s",
                "panels": self.panels
            },
            "overwrite": True
        }

    def save_dashboard(self, filename: str = "grafana_dashboard.json"):
        """Write the dashboard JSON to `filename`."""
        dashboard = self.generate_grafana_json()
        with open(filename, 'w') as f:
            json.dump(dashboard, f, indent=2)
        # BUG FIX: this log line previously printed the literal "(unknown)"
        # instead of the actual output path.
        logging.info(f"Grafana Dashboard saved: {filename}")


class PipelineMonitor:
    """
    Full pipeline monitor.

    Wires together the Prometheus exporter (metrics), the PagerDuty client
    (alert delivery) and the Grafana dashboard builder (visualization).
    """
    
    def __init__(self, 
                 pagerduty_key: Optional[str] = None,
                 prometheus_port: int = 8000):
        self.exporter = PrometheusMetricsExporter(prometheus_port)
        # Without a routing key the monitor degrades to log-only alerting.
        self.pagerduty = PagerDutyClient(pagerduty_key) if pagerduty_key else None
        self.dashboard = MonitoringDashboard()
        
        # Define the key metrics, alert rules and dashboard panels up front.
        self._setup_metrics()
        self._setup_alerts()
        self._setup_dashboard()
    
    def _setup_metrics(self):
        """Register the business metrics on the exporter."""
        # Throughput / latency metrics
        self.messages_processed = self.exporter.register(
            Counter("pipeline_messages_processed_total", 
                   "Total messages processed", 
                   ["stage", "topic"])
        )
        self.processing_latency = self.exporter.register(
            Histogram("pipeline_processing_duration_seconds",
                     "Message processing latency")
        )
        self.kafka_lag = self.exporter.register(
            Gauge("pipeline_kafka_consumer_lag",
                 "Current Kafka consumer lag",
                 ["topic", "partition", "consumer_group"])
        )
        
        # Data-quality metrics
        self.quality_score = self.exporter.register(
            Gauge("pipeline_data_quality_score",
                 "Current data quality score",
                 ["table"])
        )
        self.anomalies_detected = self.exporter.register(
            Counter("pipeline_anomalies_detected_total",
                   "Total anomalies detected",
                   ["severity"])
        )
        
        # Storage metrics
        self.delta_table_size = self.exporter.register(
            Gauge("pipeline_delta_table_size_bytes",
                 "Delta table size in bytes",
                 ["table", "layer"])
        )
    
    def _setup_alerts(self):
        """Define the alert rules (each condition is a zero-arg predicate)."""
        self.alert_rules = [
            {
                "name": "HighProcessingLatency",
                "condition": lambda: self.processing_latency.get_quantile(0.99) > 5.0,
                "severity": "critical",
                "summary": "Processing latency P99 > 5 seconds"
            },
            {
                "name": "HighKafkaLag",
                "condition": lambda: any(v > 10000 for v in self.kafka_lag.values.values()),
                "severity": "warning",
                "summary": "Kafka lag exceeds 10,000 messages"
            },
            {
                "name": "LowDataQuality",
                "condition": lambda: any(v < 0.8 for v in self.quality_score.values.values()),
                "severity": "warning",
                "summary": "Data quality score below 80%"
            }
        ]
        self.check_interval = 60  # seconds between alert evaluations
    
    def _setup_dashboard(self):
        """Lay out the Grafana panels in a 2x2 grid."""
        (self.dashboard
         .add_panel(
             "Processing Rate",
             [{"expr": "rate(pipeline_messages_processed_total[5m])"}],
             grid_pos={"h": 8, "w": 12, "x": 0, "y": 0}
         )
         .add_panel(
             "Processing Latency",
             [{"expr": "histogram_quantile(0.99, pipeline_processing_duration_seconds)"}],
             grid_pos={"h": 8, "w": 12, "x": 12, "y": 0}
         )
         .add_panel(
             "Kafka Lag",
             [{"expr": "pipeline_kafka_consumer_lag"}],
             grid_pos={"h": 8, "w": 12, "x": 0, "y": 8}
         )
         .add_panel(
             "Data Quality",
             [{"expr": "pipeline_data_quality_score"}],
             grid_pos={"h": 8, "w": 12, "x": 12, "y": 8}
         ))
    
    def start_monitoring(self):
        """Start the background alert-evaluation loop in a daemon thread."""
        def monitor_loop():
            while True:
                try:
                    self._check_alerts()
                    time.sleep(self.check_interval)
                except Exception as e:
                    logging.error(f"监控循环错误: {e}")
        
        thread = threading.Thread(target=monitor_loop, daemon=True)
        thread.start()
        logging.info("监控循环已启动")
    
    def _check_alerts(self):
        """Evaluate every alert rule; route firing alerts to PagerDuty or the log."""
        for rule in self.alert_rules:
            try:
                if rule["condition"]():
                    if self.pagerduty:
                        # dedup_key = rule name so repeated firings coalesce
                        self.pagerduty.trigger_incident(
                            summary=rule["summary"],
                            severity=rule["severity"],
                            dedup_key=rule["name"]
                        )
                    else:
                        logging.warning(f"[ALERT] {rule['name']}: {rule['summary']}")
            except Exception as e:
                logging.error(f"告警检查失败 {rule['name']}: {e}")
    
    def record_message_processed(self, stage: str, topic: str, latency_seconds: float):
        """Record one processed message (count + latency observation)."""
        self.messages_processed.inc(stage=stage, topic=topic)
        self.processing_latency.observe(latency_seconds)
    
    def record_kafka_lag(self, topic: str, partition: int, group: str, lag: int):
        """Record the current consumer lag for one topic partition."""
        self.kafka_lag.set(lag, topic=topic, partition=str(partition), consumer_group=group)
    
    def record_quality_score(self, table: str, score: float):
        """Record the latest data-quality score for a table."""
        self.quality_score.set(score, table=table)
    
    def get_metrics_snapshot(self) -> Dict:
        """Return a plain-dict snapshot of the current metric values."""
        return {
            'messages_processed': self.messages_processed.value,
            'avg_latency': sum(self.processing_latency.observations) / len(self.processing_latency.observations) if self.processing_latency.observations else 0,
            'p99_latency': self.processing_latency.get_quantile(0.99),
            'current_lags': dict(self.kafka_lag.values),
            'quality_scores': dict(self.quality_score.values)
        }


def demonstrate_monitoring():
    """Demo: feed synthetic metrics through PipelineMonitor and report."""
    banner = "=" * 60
    print(banner)
    print("3.5.4 监控告警系统演示")
    print(banner)

    monitor = PipelineMonitor(pagerduty_key="dummy-key-for-demo")

    # Generate synthetic traffic, lag and quality samples.
    print("\n模拟指标数据生成...")

    stage_pool = ["ingestion", "transformation", "storage"]
    topic_pool = ["raw.data", "processed.events", "aggregated.metrics"]

    for step in range(100):
        # Exponential latency (~100ms mean) with a 5x spike every 20th sample.
        latency_s = random.expovariate(1 / 0.1)
        if step % 20 == 0:
            latency_s *= 5

        monitor.record_message_processed(
            stage=random.choice(stage_pool),
            topic=random.choice(topic_pool),
            latency_seconds=latency_s
        )

        monitor.record_kafka_lag(
            topic="raw.data",
            partition=step % 3,
            group="main-consumer",
            lag=int(random.expovariate(1 / 5000))
        )

        monitor.record_quality_score(
            table=f"table_{step % 5}",
            score=random.uniform(0.7, 1.0)
        )

    # Summarize the collected metrics.
    print("\n指标快照:")
    snapshot = monitor.get_metrics_snapshot()
    print(f"  处理消息数: {snapshot['messages_processed']}")
    print(f"  平均延迟: {snapshot['avg_latency']*1000:.2f}ms")
    print(f"  P99延迟: {snapshot['p99_latency']*1000:.2f}ms")

    # Show a slice of the Prometheus exposition output.
    print("\nPrometheus格式输出(前10行):")
    for exposition_line in monitor.exporter.generate_latest().split('\n')[:10]:
        print(f"  {exposition_line}")

    print("\n生成Grafana Dashboard...")
    monitor.dashboard.save_dashboard("pipeline_dashboard.json")

    print("\n模拟告警检查...")
    monitor._check_alerts()

    print("\n监控系统演示完成")


if __name__ == "__main__":
    # Script entry point: run the monitoring/alerting demonstration.
    demonstrate_monitoring()

项目总结与架构图

系统架构总览


依赖安装说明

创建 requirements.txt:

复制代码
# 核心数据处理
polars>=0.20.0
pandas>=2.0.0
pyarrow>=14.0.0
numpy>=1.24.0

# Kafka生态
kafka-python>=2.0.2
confluent-kafka>=2.3.0

# Delta Lake
deltalake>=0.15.0

# 存储后端
python-rocksdb>=0.8.0

# Web服务
fastapi>=0.104.0
uvicorn>=0.24.0

# 数据验证
pydantic>=2.5.0
great-expectations>=0.18.0

# 可视化
matplotlib>=3.8.0
networkx>=3.2.0
graphviz>=0.20.0

# 工具
tenacity>=8.2.0
lz4>=4.3.0
requests>=2.31.0

快速启动指南

bash

复制

复制代码
# 1. 启动基础设施
docker-compose -f docker-compose.yml up -d

# 2. 安装依赖
pip install -r requirements.txt

# 3. 启动模拟数据生成
python 3.1.1_rest_api_poller.py --mock &

# 4. 启动Webhook接收器
python 3.1.1_webhook_receiver.py --port 8000 &

# 5. 启动Polars流处理
python 3.2.1_polars_streaming.py

# 6. 启动监控
python 3.5.4_monitoring.py

# 7. 部署到Kubernetes(可选)
kubectl apply -f 3.5.2_k8s_deployment.yaml
kubectl apply -f 3.5.3_keda_autoscaling.yaml

项目说明:本项目提供了完整的实时数据管道实现,从数据采集、流处理、存储到质量监控的全链路覆盖。每个脚本均可独立运行,具备完整可视化,且通过Docker Compose可一键启动全部依赖服务。所有代码均遵循生产级最佳实践,包括错误处理、性能优化和可观测性。

相关推荐
IT_陈寒2 小时前
SpringBoot自动配置的坑,我调试到凌晨三点才爬出来
前端·人工智能·后端
lifallen2 小时前
Flink Agent 与 Checkpoint:主循环闭环与 Mailbox 事件驱动模型
java·大数据·人工智能·python·语言模型·flink
平安的平安2 小时前
Python 构建AI多智能体系统:让三个 AI 协作完成复杂任务
开发语言·人工智能·python
今夕资源网2 小时前
音谷 - AI 多角色多情绪配音平台 github开源的多角色、多情绪 AI 配音生成平台,支持小说、剧本、视频等内容的自动配音与导出。
人工智能·开源·github
AI自动化工坊2 小时前
工程实践:AI Agent双重安全验证机制的技术实现方案
网络·人工智能·安全·ai·ai agent
xwz小王子2 小时前
Nature Communications从结构到功能:基于Kresling折纸的多模态微型机器人设计
人工智能·算法·机器人
灵机一物2 小时前
灵机一物AI原生电商小程序(已上线)-从0到1构建AI智能选品系统:多平台数据采集+大模型+对话式交互全栈实现
人工智能·电商选品·ai选品系统·llm应用落地
全栈小52 小时前
【开发工具】Visual Studio 2022开发工具能够集成灵码这些AI插件吗?
ide·人工智能·visual studio
恋猫de小郭2 小时前
你的 AI 不好用,可能只是它在演你,或者在闹情绪
前端·人工智能·ai编程