State Grid Java Interview Question: Solutions to the Distributed Top-K Problem

1. Fundamentals and Problem Analysis

1.1 Definition and Classification of the Top-K Problem

python

"""
Top K问题类型分析
"""
from typing import List, Tuple, Any, Dict, Union, Optional
from dataclasses import dataclass
from enum import Enum
import heapq
from collections import Counter, defaultdict
import math

class TopKType(Enum):
    """Top K问题类型枚举"""
    MAX_K = "max_k"          # 最大的K个元素
    MIN_K = "min_k"          # 最小的K个元素
    FREQUENT_K = "frequent_k"  # 最频繁的K个元素
    HEAVIEST_K = "heaviest_k"   # 权重最大的K个元素
    DISTINCT_K = "distinct_k"   # 最频繁的K个不同元素

@dataclass
class TopKQuery:
    """Top K查询定义"""
    k: int                    # K值
    query_type: TopKType      # 查询类型
    data_range: Optional[Tuple[Any, Any]] = None  # 数据范围
    filters: Optional[Dict[str, Any]] = None      # 过滤条件
    weight_field: Optional[str] = None           # 权重字段
    
    def validate(self) -> bool:
        """验证查询参数"""
        if self.k <= 0:
            raise ValueError("K must be positive")
        if self.k > 1000000:  # 限制最大K值
            raise ValueError("K is too large")
        return True

@dataclass
class DistributedTopKConfig:
    """分布式Top K配置"""
    num_nodes: int            # 节点数量
    data_sharding: str        # 数据分片策略: hash, range, random
    replication_factor: int   # 复制因子
    consistency_level: str    # 一致性级别: strong, eventual
    algorithm: str           # 算法: mapreduce, streaming, gossip
    
    # 性能参数
    batch_size: int = 1000    # 批量处理大小
    max_memory_mb: int = 1024 # 最大内存(MB)
    timeout_seconds: int = 30 # 超时时间
    
    def __post_init__(self):
        if self.num_nodes <= 0:
            raise ValueError("Number of nodes must be positive")
        if self.replication_factor <= 0 or self.replication_factor > self.num_nodes:
            raise ValueError("Invalid replication factor")

class TopKProblemAnalyzer:
    """Top K问题分析器"""
    
    def __init__(self, data_size: int, data_distribution: str = "uniform"):
        """
        参数:
            data_size: 数据总量
            data_distribution: 数据分布类型 (uniform, zipf, normal, skewed)
        """
        self.data_size = data_size
        self.distribution = data_distribution
        self.analysis_results = {}
    
    def analyze_problem(self, k: int) -> Dict[str, Any]:
        """分析Top K问题的复杂度"""
        analysis = {
            "data_size": self.data_size,
            "k_value": k,
            "distribution": self.distribution,
            "k_ratio": k / self.data_size if self.data_size > 0 else 0
        }
        
        # 复杂度分析
        if k <= 1000:
            analysis["complexity"] = "trivial"
            analysis["recommended_approach"] = "local_sorting"
        elif k <= 10000 and self.data_size <= 1000000:
            analysis["complexity"] = "moderate"
            analysis["recommended_approach"] = "heap_based"
        elif k / self.data_size > 0.1:  # K超过数据量的10%
            analysis["complexity"] = "high"
            analysis["recommended_approach"] = "quickselect"
        else:
            analysis["complexity"] = "distributed"
            analysis["recommended_approach"] = "map_reduce"
        
        # 内存需求估算
        memory_estimate = self._estimate_memory(k)
        analysis["memory_estimate_mb"] = memory_estimate
        
        # 网络传输估算
        network_estimate = self._estimate_network(k)
        analysis["network_estimate_mb"] = network_estimate
        
        self.analysis_results = analysis  # cache so recommend_algorithm can reuse it
        return analysis
    
    def _estimate_memory(self, k: int) -> float:
        """估算内存需求"""
        # 假设每个元素8字节,加上开销
        element_size = 8  # 字节
        overhead = 1.5    # 开销因子
        
        memory_bytes = k * element_size * overhead
        
        # 考虑数据结构开销
        if k > 10000:
            # 需要额外数据结构
            memory_bytes *= 2
        
        return memory_bytes / (1024 * 1024)  # 转换为MB
    
    def _estimate_network(self, k: int) -> float:
        """估算网络传输量"""
        # 假设每个节点发送自己的top k
        element_size = 8  # 字节
        compression_ratio = 0.5  # 压缩比
        
        network_bytes = k * element_size * compression_ratio
        
        return network_bytes / (1024 * 1024)  # 转换为MB
    
    def recommend_algorithm(self, num_nodes: int) -> str:
        """推荐算法"""
        k_ratio = self.analysis_results.get("k_ratio", 0)
        
        if num_nodes == 1:
            if k_ratio > 0.1:
                return "quick_select"
            else:
                return "heap_based"
        elif num_nodes <= 10:
            return "tree_aggregation"
        elif num_nodes <= 100:
            return "map_reduce"
        else:
            return "gossip_based"

class DataDistributionSimulator:
    """数据分布模拟器"""
    
    @staticmethod
    def generate_data(distribution: str, size: int, **kwargs) -> List[float]:
        """生成模拟数据"""
        import numpy as np
        import random
        
        if distribution == "uniform":
            low = kwargs.get('low', 0)
            high = kwargs.get('high', 1000)
            return [random.uniform(low, high) for _ in range(size)]
        
        elif distribution == "normal":
            mean = kwargs.get('mean', 0)
            std = kwargs.get('std', 1)
            return np.random.normal(mean, std, size).tolist()
        
        elif distribution == "zipf":
            a = kwargs.get('a', 1.5)  # 分布参数
            return np.random.zipf(a, size).tolist()
        
        elif distribution == "skewed":
            # 偏斜分布:大部分数据集中在头部
            base = kwargs.get('base', 1000)
            skewness = kwargs.get('skewness', 0.8)
            
            data = []
            for i in range(size):
                if random.random() < skewness:
                    data.append(random.uniform(0, base * 0.1))
                else:
                    data.append(random.uniform(base * 0.1, base))
            return data
        
        else:
            raise ValueError(f"Unknown distribution: {distribution}")
    
    @staticmethod
    def analyze_distribution(data: List[float]) -> Dict[str, Any]:
        """分析数据分布特征"""
        import numpy as np
        from scipy import stats
        
        data_np = np.array(data)
        
        analysis = {
            "size": len(data),
            "mean": float(np.mean(data_np)),
            "std": float(np.std(data_np)),
            "min": float(np.min(data_np)),
            "max": float(np.max(data_np)),
            "percentiles": {
                "p25": float(np.percentile(data_np, 25)),
                "p50": float(np.percentile(data_np, 50)),
                "p75": float(np.percentile(data_np, 75)),
                "p90": float(np.percentile(data_np, 90)),
                "p95": float(np.percentile(data_np, 95)),
                "p99": float(np.percentile(data_np, 99))
            },
            "skewness": float(stats.skew(data_np)),
            "kurtosis": float(stats.kurtosis(data_np))
        }
        
        return analysis
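
A minimal usage sketch of the utilities above, assuming the classes from this listing are in one module (analyze_distribution additionally needs numpy and scipy, as imported in the listing). The data size, distribution, K value and node count below are illustrative assumptions, and recommend_algorithm reads the analysis cached by analyze_problem:

python

# Illustrative only: profile a simulated skewed data set and ask for a recommendation.
data = DataDistributionSimulator.generate_data("skewed", size=1_000_000, base=1000)
print(DataDistributionSimulator.analyze_distribution(data)["percentiles"])

analyzer = TopKProblemAnalyzer(data_size=len(data), data_distribution="skewed")
report = analyzer.analyze_problem(k=100)
print(report["complexity"], report["recommended_approach"],
      f'{report["memory_estimate_mb"]:.3f} MB')

# recommend_algorithm uses the analysis cached by analyze_problem
print(analyzer.recommend_algorithm(num_nodes=8))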

1.2 Theoretical Foundations of Distributed Top-K Algorithms

python

"""
分布式Top K算法理论基础
"""
import hashlib
import bisect
import math
from typing import List, Set, Dict, Any, Tuple, Optional
import numpy as np
from dataclasses import dataclass
from abc import ABC, abstractmethod
import asyncio
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor

class DistributedTopKAlgorithm(ABC):
    """分布式Top K算法基类"""
    
    def __init__(self, config: DistributedTopKConfig):
        self.config = config
        self.nodes = {}  # 节点状态
        self.coordinator = None  # 协调节点
        self.stats = {
            "operations": 0,
            "data_transferred_mb": 0,
            "time_elapsed_seconds": 0,
            "comparisons": 0,
            "network_rounds": 0
        }
    
    @abstractmethod
    async def compute_top_k(
        self, 
        query: TopKQuery,
        data_streams: List[List[Any]]
    ) -> List[Tuple[Any, float]]:
        """
        计算分布式Top K
        
        参数:
            query: Top K查询
            data_streams: 每个节点的数据流
            
        返回:
            Top K结果列表 (元素, 分数)
        """
        pass
    
    @abstractmethod
    def get_complexity_analysis(self) -> Dict[str, Any]:
        """获取复杂度分析"""
        pass
    
    @abstractmethod
    def get_network_cost(self) -> Dict[str, float]:
        """获取网络成本估算"""
        pass

class ComplexityAnalyzer:
    """分布式Top K算法复杂度分析"""
    
    @staticmethod
    def analyze_algorithm(
        algorithm: str,
        n: int,      # 总数据量
        m: int,      # 节点数
        k: int       # K值
    ) -> Dict[str, float]:
        """分析算法复杂度"""
        
        complexities = {}
        
        if algorithm == "naive_merge":
            # 朴素合并:每个节点发送自己的top k,协调节点合并
            complexities["time_complexity"] = O(m * k * log(k))
            complexities["space_complexity"] = O(m * k)
            complexities["network_complexity"] = O(m * k)
            complexities["round_complexity"] = 2  # 两轮通信
            
        elif algorithm == "threshold_based":
            # 基于阈值的算法
            complexities["time_complexity"] = O(n/m * log(k) + m * log(m))
            complexities["space_complexity"] = O(k + m)
            complexities["network_complexity"] = O(m * log(m))
            complexities["round_complexity"] = O(log(m))
            
        elif algorithm == "tree_aggregation":
            # 树形聚合
            complexities["time_complexity"] = O(n/m * log(k) + log(m) * k)
            complexities["space_complexity"] = O(k * log(m))
            complexities["network_complexity"] = O(k * log(m))
            complexities["round_complexity"] = O(log(m))
            
        elif algorithm == "map_reduce":
            # MapReduce版本
            complexities["time_complexity"] = O(n/m * log(k) + m * k)
            complexities["space_complexity"] = O(m * k)
            complexities["network_complexity"] = O(m * k)
            complexities["round_complexity"] = 2
            
        elif algorithm == "gossip_based":
            # 基于Gossip的算法
            complexities["time_complexity"] = O(log(m) * n/m)
            complexities["space_complexity"] = O(k * log(m))
            complexities["network_complexity"] = O(m * k * log(m))
            complexities["round_complexity"] = O(log(m))
            
        elif algorithm == "quantile_based":
            # 基于分位数的算法
            complexities["time_complexity"] = O(n/m * log(n/m) + m * log(m))
            complexities["space_complexity"] = O(k + m)
            complexities["network_complexity"] = O(m * log(m))
            complexities["round_complexity"] = O(log(m))
            
        else:
            raise ValueError(f"Unknown algorithm: {algorithm}")
        
        # 计算实际值
        complexities["estimated_time"] = ComplexityAnalyzer._estimate_time(
            complexities["time_complexity"], n, m, k
        )
        complexities["estimated_network_mb"] = ComplexityAnalyzer._estimate_network(
            complexities["network_complexity"], n, m, k
        )
        
        return complexities
    
    @staticmethod
    def _estimate_time(complexity: str, n: int, m: int, k: int) -> float:
        """估算时间"""
        # 简化估算,实际需要基准测试
        if "n/m" in complexity:
            base_time = (n / m) * 0.000001  # 假设每个操作1微秒
        elif "n" in complexity:
            base_time = n * 0.000001
        elif "m*k" in complexity:
            base_time = (m * k) * 0.000001
        else:
            base_time = 1000  # 默认1秒
        
        # 考虑对数因子
        if "log(m)" in complexity:
            base_time *= math.log2(m) if m > 1 else 1
        if "log(k)" in complexity:
            base_time *= math.log2(k) if k > 1 else 1
        
        return base_time
    
    @staticmethod
    def _estimate_network(complexity: str, n: int, m: int, k: int) -> float:
        """估算网络传输量"""
        element_size = 8  # 每个元素8字节
        
        if "m*k" in complexity:
            network_bytes = m * k * element_size
        elif "k*log(m)" in complexity:
            network_bytes = k * math.log2(m) * element_size if m > 1 else k * element_size
        elif "m*log(m)" in complexity:
            network_bytes = m * math.log2(m) * element_size if m > 1 else m * element_size
        else:
            network_bytes = m * k * element_size  # 默认
        
        return network_bytes / (1024 * 1024)  # MB
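
# The helper below is an illustrative sketch, not part of the original analyzer: it
# compares the rough cost estimates for a few candidate algorithms under assumed
# parameters (n = 1e8 elements, m = 50 nodes, k = 100). The printed figures come from
# the crude per-operation constants above, not from benchmarks.
def demo_complexity_comparison():
    for algo in ("naive_merge", "threshold_based", "tree_aggregation"):
        est = ComplexityAnalyzer.analyze_algorithm(algo, n=100_000_000, m=50, k=100)
        print(algo,
              est["time_complexity"],
              f'~{est["estimated_time"]:.2f}s',
              f'~{est["estimated_network_mb"]:.4f}MB')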

class TopKAlgorithmSelector:
    """Top K算法选择器"""
    
    @staticmethod
    def select_algorithm(
        problem_size: int,
        num_nodes: int,
        k_value: int,
        data_distribution: str,
        network_bandwidth: float,  # MB/s
        latency: float,           # 毫秒
        memory_constraint: float  # MB
    ) -> Dict[str, Any]:
        """选择合适的算法"""
        
        recommendations = []
        
        # 算法评估标准
        algorithms = [
            {
                "name": "threshold_based",
                "description": "基于阈值的剪枝算法",
                "suitability": TopKAlgorithmSelector._evaluate_threshold_based(
                    problem_size, num_nodes, k_value, data_distribution
                )
            },
            {
                "name": "tree_aggregation",
                "description": "树形聚合算法",
                "suitability": TopKAlgorithmSelector._evaluate_tree_aggregation(
                    problem_size, num_nodes, k_value, network_bandwidth, latency
                )
            },
            {
                "name": "map_reduce",
                "description": "MapReduce算法",
                "suitability": TopKAlgorithmSelector._evaluate_map_reduce(
                    problem_size, num_nodes, k_value, memory_constraint
                )
            },
            {
                "name": "gossip_based",
                "description": "Gossip算法",
                "suitability": TopKAlgorithmSelector._evaluate_gossip_based(
                    num_nodes, network_bandwidth, latency
                )
            },
            {
                "name": "quantile_based",
                "description": "分位数算法",
                "suitability": TopKAlgorithmSelector._evaluate_quantile_based(
                    problem_size, data_distribution
                )
            }
        ]
        
        # 选择最适合的算法
        best_algorithm = max(algorithms, key=lambda x: x["suitability"]["score"])
        
        # 复杂度分析
        complexity = ComplexityAnalyzer.analyze_algorithm(
            best_algorithm["name"],
            problem_size,
            num_nodes,
            k_value
        )
        
        return {
            "selected_algorithm": best_algorithm["name"],
            "algorithm_description": best_algorithm["description"],
            "suitability_score": best_algorithm["suitability"]["score"],
            "suitability_reasons": best_algorithm["suitability"]["reasons"],
            "complexity_analysis": complexity,
            "all_algorithms": algorithms
        }


    
    @staticmethod
    def _evaluate_threshold_based(
        problem_size: int,
        num_nodes: int,
        k_value: int,
        data_distribution: str
    ) -> Dict[str, Any]:
        """评估基于阈值的算法"""
        score = 0
        reasons = []
        
        # 数据分布影响
        if data_distribution in ["uniform", "normal"]:
            score += 30
            reasons.append("均匀/正态分布适合阈值剪枝")
        elif data_distribution == "skewed":
            score += 10
            reasons.append("偏斜分布可能影响阈值效果")
        
        # 节点数量影响
        if num_nodes <= 100:
            score += 20
            reasons.append("中等规模集群适用")
        elif num_nodes <= 1000:
            score += 10
            reasons.append("大规模集群可能通信成本高")
        
        # K值影响
        if k_value <= 1000:
            score += 30
            reasons.append("较小的K值适合阈值算法")
        elif k_value <= 10000:
            score += 15
            reasons.append("中等K值可能适用")
        else:
            score += 5
            reasons.append("大K值可能不适用")
        
        # 问题规模
        if problem_size >= 1000000:
            score += 20
            reasons.append("大数据量适合")
        
        return {"score": score, "reasons": reasons}
    
    @staticmethod
    def _evaluate_tree_aggregation(
        problem_size: int,
        num_nodes: int,
        k_value: int,
        network_bandwidth: float,
        latency: float
    ) -> Dict[str, Any]:
        """评估树形聚合算法"""
        score = 0
        reasons = []
        
        # 网络条件
        if network_bandwidth >= 100:  # 100MB/s以上
            score += 30
            reasons.append("高带宽网络适合")
        elif network_bandwidth >= 10:
            score += 20
            reasons.append("中等带宽可能适用")
        
        if latency <= 10:  # 10ms以下
            score += 20
            reasons.append("低延迟网络适合")
        
        # 节点数量(树形结构适合中等规模)
        if 8 <= num_nodes <= 256:
            score += 30
            reasons.append("节点数量适合构建树形结构")
        elif num_nodes < 8:
            score += 10
            reasons.append("节点太少,树形结构优势不明显")
        else:
            score += 5
            reasons.append("节点太多,树可能过深")
        
        # K值影响
        if k_value <= 10000:
            score += 20
            reasons.append("K值适中,适合聚合")
        
        return {"score": score, "reasons": reasons}
    
    @staticmethod
    def _evaluate_map_reduce(
        problem_size: int,
        num_nodes: int,
        k_value: int,
        memory_constraint: float
    ) -> Dict[str, Any]:
        """评估MapReduce算法"""
        score = 0
        reasons = []
        
        # 内存约束
        if memory_constraint >= 4096:  # 4GB以上
            score += 40
            reasons.append("内存充足,适合MapReduce")
        elif memory_constraint >= 1024:
            score += 20
            reasons.append("内存中等,可能适用")
        
        # 节点数量
        if num_nodes <= 100:
            score += 30
            reasons.append("中等规模集群适用")
        
        # 问题规模
        if problem_size >= 10000000:  # 1000万以上
            score += 30
            reasons.append("超大数据量适合MapReduce")
        
        return {"score": score, "reasons": reasons}
    
    @staticmethod
    def _evaluate_gossip_based(
        num_nodes: int,
        network_bandwidth: float,
        latency: float
    ) -> Dict[str, Any]:
        """评估Gossip算法"""
        score = 0
        reasons = []
        
        # 节点数量
        if num_nodes >= 100:
            score += 40
            reasons.append("大规模集群适合Gossip")
        elif num_nodes >= 10:
            score += 20
            reasons.append("中等规模可能适用")
        
        # 网络条件(Gossip对网络要求较高)
        if network_bandwidth >= 1000:  # 1GB/s
            score += 30
            reasons.append("超高带宽适合Gossip")
        elif network_bandwidth >= 100:
            score += 20
            reasons.append("高带宽可能适用")
        
        if latency <= 1:  # 1ms以下
            score += 30
            reasons.append("极低延迟适合Gossip")
        
        return {"score": score, "reasons": reasons}
    
    @staticmethod
    def _evaluate_quantile_based(
        problem_size: int,
        data_distribution: str
    ) -> Dict[str, Any]:
        """评估分位数算法"""
        score = 0
        reasons = []
        
        # 数据分布
        if data_distribution in ["uniform", "normal"]:
            score += 40
            reasons.append("均匀分布适合分位数估计")
        elif data_distribution == "zipf":
            score += 20
            reasons.append("Zipf分布可能适用")
        elif data_distribution == "skewed":
            score += 10
            reasons.append("偏斜分布可能不准确")
        
        # 问题规模
        if problem_size >= 1000000:
            score += 30
            reasons.append("大数据量适合分位数算法")
        
        # 需要排序,适合数据可排序的场景
        score += 30
        reasons.append("适合数值型可排序数据")
        
        return {"score": score, "reasons": reasons}


2. Core Algorithm Implementations

2.1 Threshold-Based Pruning Algorithm

python

"""
基于阈值的分布式Top K算法
"""
import asyncio
import heapq
import random
from typing import List, Tuple, Any, Dict, Set, Optional
from dataclasses import dataclass
from concurrent.futures import ThreadPoolExecutor
import statistics
import math
from collections import Counter
import numpy as np

@dataclass
class NodeState:
    """节点状态"""
    node_id: int
    local_top_k: List[Tuple[Any, float]]  # 本地Top K
    threshold: float                     # 当前阈值
    is_active: bool = True               # 是否活跃
    processed_count: int = 0             # 已处理数据量
    candidate_set: Optional[Set[Any]] = None  # candidate set

class ThresholdBasedTopK(DistributedTopKAlgorithm):
    """基于阈值的Top K算法"""
    
    def __init__(self, config: DistributedTopKConfig):
        super().__init__(config)
        self.threshold_history = []
        self.candidate_counts = []
        
    async def compute_top_k(
        self, 
        query: TopKQuery,
        data_streams: List[List[Any]]
    ) -> List[Tuple[Any, float]]:
        """
        基于阈值的分布式Top K算法
        
        算法步骤:
        1. 每个节点计算本地Top K
        2. 协调节点收集所有本地Top K,计算初始阈值
        3. 广播阈值给所有节点
        4. 节点丢弃低于阈值的数据,上传候选数据
        5. 重复3-4直到找到准确的Top K
        """
        
        query.validate()
        n = len(data_streams)
        k = query.k
        
        # 初始化节点
        self._initialize_nodes(n)
        
        # 阶段1:每个节点计算本地Top K
        await self._phase1_local_topk(query, data_streams)
        
        # 阶段2:迭代阈值剪枝
        final_result = await self._phase2_threshold_pruning(query)
        
        return final_result
    
    def _initialize_nodes(self, num_nodes: int):
        """初始化节点状态"""
        for i in range(num_nodes):
            self.nodes[i] = NodeState(
                node_id=i,
                local_top_k=[],
                threshold=float('-inf'),
                candidate_set=set()
            )
    
    async def _phase1_local_topk(self, query: TopKQuery, data_streams: List[List[Any]]):
        """阶段1:计算本地Top K"""
        tasks = []
        
        for node_id, data_stream in enumerate(data_streams):
            task = asyncio.create_task(
                self._compute_local_topk(node_id, query, data_stream)
            )
            tasks.append(task)
        
        await asyncio.gather(*tasks)
        
        # 更新统计
        self.stats["operations"] += len(data_streams)
    
    async def _compute_local_topk(
        self, 
        node_id: int, 
        query: TopKQuery, 
        data_stream: List[Any]
    ):
        """计算单个节点的本地Top K"""
        node = self.nodes[node_id]
        
        if query.query_type == TopKType.MAX_K:
            # 使用最小堆找最大的K个
            heap = []
            for item in data_stream:
                # 提取分数(这里假设数据是(元素, 分数)的元组)
                if isinstance(item, tuple) and len(item) == 2:
                    element, score = item
                else:
                    element, score = item, float(item)  # 假设数据本身就是分数
                
                if len(heap) < query.k:
                    heapq.heappush(heap, (score, element))
                else:
                    # 如果比堆顶大,替换堆顶
                    if score > heap[0][0]:
                        heapq.heapreplace(heap, (score, element))
            
            # 转换为列表
            node.local_top_k = [(element, score) for score, element in heap]
        
        elif query.query_type == TopKType.MIN_K:
            # 使用最大堆找最小的K个
            heap = []
            for item in data_stream:
                if isinstance(item, tuple) and len(item) == 2:
                    element, score = item
                else:
                    element, score = item, float(item)
                
                # 使用负分数实现最大堆
                neg_score = -score
                
                if len(heap) < query.k:
                    heapq.heappush(heap, (neg_score, element))
                else:
                    if neg_score > heap[0][0]:  # 注意这里是负分数比较
                        heapq.heapreplace(heap, (neg_score, element))
            
            # 转换回正分数
            node.local_top_k = [(element, -neg_score) for neg_score, element in heap]
        
        elif query.query_type == TopKType.FREQUENT_K:
            # 计算频率
            counter = Counter(data_stream)
            top_items = counter.most_common(query.k)
            node.local_top_k = [(item, count) for item, count in top_items]
        
        node.processed_count = len(data_stream)
        
        # 记录统计
        self.stats["comparisons"] += len(data_stream) * math.log2(query.k) if query.k > 1 else len(data_stream)
    
    async def _phase2_threshold_pruning(self, query: TopKQuery) -> List[Tuple[Any, float]]:
        """阶段2:阈值剪枝"""
        k = query.k
        iteration = 0
        max_iterations = 10  # 最大迭代次数
        
        global_candidates = {}
        
        while iteration < max_iterations:
            iteration += 1
            self.stats["network_rounds"] += 1
            
            # 收集所有节点的本地Top K
            all_local_topk = []
            for node in self.nodes.values():
                all_local_topk.extend(node.local_top_k)
            
            if not all_local_topk:
                break
            
            # 计算全局Top K作为候选
            if query.query_type == TopKType.MAX_K:
                # 找最大的K个
                heap = []
                for element, score in all_local_topk:
                    if len(heap) < k:
                        heapq.heappush(heap, (score, element))
                    else:
                        if score > heap[0][0]:
                            heapq.heapreplace(heap, (score, element))
                
                # 当前阈值是第K大的分数
                if len(heap) >= k:
                    current_threshold = heap[0][0]
                    candidates = [(element, score) for score, element in heap]
                else:
                    current_threshold = float('-inf')
                    candidates = [(element, score) for score, element in heap]
            
            elif query.query_type == TopKType.MIN_K:
                # 找最小的K个
                heap = []
                for element, score in all_local_topk:
                    neg_score = -score
                    if len(heap) < k:
                        heapq.heappush(heap, (neg_score, element))
                    else:
                        if neg_score > heap[0][0]:
                            heapq.heapreplace(heap, (neg_score, element))
                
                if len(heap) >= k:
                    # 阈值是第K小的分数(注意堆顶是第K小的负分数)
                    current_threshold = -heap[0][0]
                    candidates = [(element, -neg_score) for neg_score, element in heap]
                else:
                    current_threshold = float('inf')
                    candidates = [(element, -neg_score) for neg_score, element in heap]
            
            # 更新全局候选集合
            for element, score in candidates:
                if element not in global_candidates or score > global_candidates[element]:
                    global_candidates[element] = score
            
            # 检查阈值是否稳定
            self.threshold_history.append(current_threshold)
            
            if len(self.threshold_history) >= 2:
                prev_threshold = self.threshold_history[-2]
                threshold_change = abs(current_threshold - prev_threshold)
                
                # 如果阈值变化很小,停止迭代
                if threshold_change < 1e-10:
                    break
            
            # 更新节点阈值
            for node in self.nodes.values():
                node.threshold = current_threshold
            
            # 模拟节点反馈:哪些元素超过阈值
            # 在实际实现中,节点会扫描本地数据,找出超过阈值的元素
            self.candidate_counts.append(len(global_candidates))
            
            # 如果候选集合大小接近K,可以提前停止
            if len(global_candidates) <= k * 2:
                break
        
        # 最终排序并返回Top K
        if query.query_type == TopKType.MAX_K:
            top_items = sorted(global_candidates.items(), key=lambda x: x[1], reverse=True)[:k]
        else:  # MIN_K
            top_items = sorted(global_candidates.items(), key=lambda x: x[1])[:k]
        
        return top_items
    
    def get_complexity_analysis(self) -> Dict[str, Any]:
        """获取复杂度分析"""
        n_total = sum(node.processed_count for node in self.nodes.values())
        m = len(self.nodes)
        k = len(self.threshold_history)
        
        analysis = {
            "total_data_points": n_total,
            "num_nodes": m,
            "iterations": len(self.threshold_history),
            "total_comparisons": self.stats["comparisons"],
            "network_rounds": self.stats["network_rounds"],
            "candidate_sizes": self.candidate_counts,
            "threshold_history": self.threshold_history
        }
        
        # 理论复杂度
        analysis["time_complexity"] = f"O((n/m) * log(k) + I * m)"
        analysis["space_complexity"] = f"O(m * k + C)"
        analysis["network_complexity"] = f"O(I * m * k)"
        
        # 估算值
        avg_candidates = statistics.mean(self.candidate_counts) if self.candidate_counts else 0
        analysis["estimated_network_mb"] = (
            len(self.threshold_history) * m * avg_candidates * 8 / (1024 * 1024)
        )
        
        return analysis
    
    def get_network_cost(self) -> Dict[str, float]:
        """获取网络成本"""
        m = len(self.nodes)
        iterations = len(self.threshold_history)
        avg_candidates = statistics.mean(self.candidate_counts) if self.candidate_counts else 0
        
        # 每个元素假设8字节
        element_size = 8
        
        # 每轮通信:阈值广播 + 候选数据上传
        threshold_broadcast = m * 8  # 阈值8字节
        candidates_upload = m * avg_candidates * element_size
        
        total_bytes = iterations * (threshold_broadcast + candidates_upload)
        
        return {
            "total_mb": total_bytes / (1024 * 1024),
            "per_iteration_mb": (threshold_broadcast + candidates_upload) / (1024 * 1024),
            "iterations": iterations,
            "avg_candidates_per_node": avg_candidates
        }

class OptimizedThresholdTopK(ThresholdBasedTopK):
    """优化的阈值算法"""
    
    def __init__(self, config: DistributedTopKConfig):
        super().__init__(config)
        self.adaptive_threshold = True
        self.sampling_rate = 0.1  # 采样率
        self.confidence_level = 0.95  # 置信水平
    
    async def compute_top_k(
        self, 
        query: TopKQuery,
        data_streams: List[List[Any]]
    ) -> List[Tuple[Any, float]]:
        """优化的阈值算法"""
        
        # 阶段0:采样估计数据分布
        if self.adaptive_threshold:
            await self._estimate_distribution(data_streams)
        
        # 阶段1:使用分位数估计初始阈值
        initial_threshold = await self._estimate_initial_threshold(query, data_streams)
        
        # 设置初始阈值
        for node in self.nodes.values():
            node.threshold = initial_threshold
        
        # 阶段2:自适应阈值剪枝
        return await self._adaptive_threshold_pruning(query)
    
    async def _estimate_distribution(self, data_streams: List[List[Any]]):
        """估计数据分布"""
        # 从每个节点采样数据
        samples = []
        for data_stream in data_streams:
            if len(data_stream) > 1000:
                # 采样
                sample_size = int(len(data_stream) * self.sampling_rate)
                sampled = random.sample(data_stream, min(sample_size, 1000))
                samples.extend(sampled)
            else:
                samples.extend(data_stream)
        
        # 分析分布
        if samples:
            # 提取分数
            scores = []
            for item in samples:
                if isinstance(item, tuple) and len(item) == 2:
                    _, score = item
                else:
                    score = float(item)
                scores.append(score)
            
            # 计算分位数
            self.distribution_stats = {
                "mean": statistics.mean(scores),
                "std": statistics.stdev(scores) if len(scores) > 1 else 0,
                "min": min(scores),
                "max": max(scores),
                "q1": np.percentile(scores, 25),
                "q2": np.percentile(scores, 50),
                "q3": np.percentile(scores, 75),
                "q95": np.percentile(scores, 95),
                "q99": np.percentile(scores, 99)
            }
    
    async def _estimate_initial_threshold(
        self, 
        query: TopKQuery, 
        data_streams: List[List[Any]]
    ) -> float:
        """估计初始阈值"""
        k = query.k
        total_elements = sum(len(stream) for stream in data_streams)
        
        # 计算目标分位数
        if query.query_type == TopKType.MAX_K:
            # 找最大的K个,目标分位数是 (1 - k/N)
            target_quantile = 1 - (k / total_elements) if total_elements > 0 else 0.99
        else:  # MIN_K
            # 找最小的K个,目标分位数是 k/N
            target_quantile = k / total_elements if total_elements > 0 else 0.01
        
        # 根据分布估计阈值
        if hasattr(self, 'distribution_stats'):
            stats = self.distribution_stats
            
            if query.query_type == TopKType.MAX_K:
                # 使用高尾分位数估计
                if target_quantile > 0.95:
                    # 使用极值估计
                    initial_threshold = stats["q95"] + 2 * stats["std"]
                elif target_quantile > 0.75:
                    initial_threshold = stats["q3"]
                else:
                    initial_threshold = stats["q2"]
            else:
                # 使用低尾分位数估计
                if target_quantile < 0.05:
                    initial_threshold = stats["q1"] - 2 * stats["std"]
                elif target_quantile < 0.25:
                    initial_threshold = stats["q1"]
                else:
                    initial_threshold = stats["q2"]
        else:
            # 如果没有分布信息,使用保守估计
            if query.query_type == TopKType.MAX_K:
                initial_threshold = float('-inf')
            else:
                initial_threshold = float('inf')
        
        return initial_threshold
    
    async def _adaptive_threshold_pruning(self, query: TopKQuery) -> List[Tuple[Any, float]]:
        """自适应阈值剪枝"""
        k = query.k
        iteration = 0
        max_iterations = 20
        
        global_candidates = {}
        convergence_count = 0
        required_convergence = 2  # 需要连续收敛的次数
        
        while iteration < max_iterations:
            iteration += 1
            
            # 收集候选数据
            candidates = await self._collect_candidates(query)
            
            # 合并到全局候选
            for element, score in candidates:
                if element not in global_candidates or score > global_candidates[element]:
                    global_candidates[element] = score
            
            # 计算新阈值
            if global_candidates:
                scores = list(global_candidates.values())
                if query.query_type == TopKType.MAX_K:
                    # 第K大的分数
                    if len(scores) >= k:
                        kth_score = heapq.nlargest(k, scores)[-1]
                    else:
                        kth_score = min(scores)
                else:
                    # 第K小的分数
                    if len(scores) >= k:
                        kth_score = heapq.nsmallest(k, scores)[-1]
                    else:
                        kth_score = max(scores)
            else:
                kth_score = self.nodes[0].threshold
            
            # 检查收敛
            self.threshold_history.append(kth_score)
            
            if len(self.threshold_history) >= 2:
                prev_threshold = self.threshold_history[-2]
                threshold_change = abs(kth_score - prev_threshold) / max(abs(prev_threshold), 1e-10)
                
                if threshold_change < 0.01:  # 变化小于1%
                    convergence_count += 1
                else:
                    convergence_count = 0
                
                if convergence_count >= required_convergence:
                    break
            
            # 自适应调整阈值
            new_threshold = self._adjust_threshold(kth_score, iteration, len(global_candidates), k)
            
            # 更新所有节点的阈值
            for node in self.nodes.values():
                node.threshold = new_threshold
        
        # 返回最终结果
        if query.query_type == TopKType.MAX_K:
            return sorted(global_candidates.items(), key=lambda x: x[1], reverse=True)[:k]
        else:
            return sorted(global_candidates.items(), key=lambda x: x[1])[:k]
    
    async def _collect_candidates(self, query: TopKQuery) -> List[Tuple[Any, float]]:
        """收集超过阈值的候选数据"""
        all_candidates = []
        
        for node in self.nodes.values():
            # 在实际系统中,节点会扫描本地数据
            # 这里简化为从local_top_k中筛选
            for element, score in node.local_top_k:
                if query.query_type == TopKType.MAX_K:
                    if score > node.threshold:
                        all_candidates.append((element, score))
                else:
                    if score < node.threshold:
                        all_candidates.append((element, score))
        
        return all_candidates
    
    def _adjust_threshold(
        self, 
        current_threshold: float, 
        iteration: int,
        candidate_count: int,
        k: int
    ) -> float:
        """自适应调整阈值"""
        
        # 根据候选数量调整
        if candidate_count > k * 10:  # 候选太多
            # 提高阈值(对于MAX_K)或降低阈值(对于MIN_K)
            adjustment_factor = 1.1
        elif candidate_count < k:  # 候选不足
            adjustment_factor = 0.9
        else:
            adjustment_factor = 1.0
        
        # 根据迭代次数调整
        if iteration > 10:
            # 后期减小调整幅度
            adjustment_factor **= 0.5
        
        # 应用调整
        new_threshold = current_threshold * adjustment_factor
        
        return new_threshold
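
A minimal end-to-end sketch, assuming the definitions from section 1 and the classes above are in one module. It shards uniform random numbers across four simulated nodes and asks for the top 10; the shard sizes, node count and K are illustrative assumptions only:

python

import asyncio
import random

config = DistributedTopKConfig(
    num_nodes=4, data_sharding="hash", replication_factor=1,
    consistency_level="eventual", algorithm="threshold")
query = TopKQuery(k=10, query_type=TopKType.MAX_K)
shards = [[random.uniform(0, 1_000_000) for _ in range(50_000)] for _ in range(4)]

algo = ThresholdBasedTopK(config)
top10 = asyncio.run(algo.compute_top_k(query, shards))
print(top10)
print("network rounds:", algo.get_complexity_analysis()["network_rounds"])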

2.2 Tree Aggregation Algorithm

python

"""
树形聚合分布式Top K算法
"""
import heapq
import math
import random
from typing import List, Tuple, Any, Dict, Optional
from dataclasses import dataclass, field
import asyncio
from collections import defaultdict

@dataclass
class TreeNode:
    """树节点"""
    node_id: int
    parent_id: Optional[int] = None
    child_ids: List[int] = field(default_factory=list)
    level: int = 0
    aggregated_topk: List[Tuple[Any, float]] = field(default_factory=list)
    
class TreeAggregationTopK(DistributedTopKAlgorithm):
    """树形聚合Top K算法"""
    
    def __init__(self, config: DistributedTopKConfig):
        super().__init__(config)
        self.tree_structure = {}  # 节点ID -> TreeNode
        self.root_id = 0
        self.levels = 0
        
    async def compute_top_k(
        self, 
        query: TopKQuery,
        data_streams: List[List[Any]]
    ) -> List[Tuple[Any, float]]:
        """
        树形聚合算法
        
        步骤:
        1. 构建聚合树
        2. 叶子节点计算本地Top K
        3. 自底向上聚合
        4. 根节点得到全局Top K
        """
        
        query.validate()
        n = len(data_streams)
        k = query.k
        
        # 阶段1:构建聚合树
        self._build_aggregation_tree(n)
        
        # 阶段2:叶子节点计算本地Top K
        await self._compute_leaf_topk(query, data_streams)
        
        # 阶段3:自底向上聚合
        await self._bottom_up_aggregation(query)
        
        # 阶段4:从根节点获取结果
        root_node = self.tree_structure[self.root_id]
        return root_node.aggregated_topk[:k]
    
    def _build_aggregation_tree(self, num_nodes: int):
        """构建聚合树"""
        # 计算树的高度
        self.levels = math.ceil(math.log2(num_nodes + 1))
        
        # 构建完全二叉树
        node_id = 0
        
        for level in range(self.levels):
            nodes_in_level = min(num_nodes - node_id, 2 ** level)
            
            for i in range(nodes_in_level):
                tree_node = TreeNode(node_id=node_id, level=level)
                
                # 设置父子关系
                if level > 0:
                    parent_id = (node_id - 1) // 2
                    tree_node.parent_id = parent_id
                    self.tree_structure[parent_id].child_ids.append(node_id)
                
                self.tree_structure[node_id] = tree_node
                node_id += 1
        
        # 设置根节点
        self.root_id = 0
        
        # 初始化节点状态
        for i in range(num_nodes):
            if i not in self.tree_structure:
                self.tree_structure[i] = TreeNode(node_id=i, level=self.levels)
    
    async def _compute_leaf_topk(self, query: TopKQuery, data_streams: List[List[Any]]):
        """计算叶子节点的本地Top K"""
        tasks = []
        
        for node_id, data_stream in enumerate(data_streams):
            task = asyncio.create_task(
                self._compute_node_topk(node_id, query, data_stream)
            )
            tasks.append(task)
        
        await asyncio.gather(*tasks)
    
    async def _compute_node_topk(self, node_id: int, query: TopKQuery, data_stream: List[Any]):
        """计算单个节点的Top K"""
        tree_node = self.tree_structure[node_id]
        
        if query.query_type == TopKType.MAX_K:
            # 使用堆找最大的K个
            heap = []
            for item in data_stream:
                if isinstance(item, tuple) and len(item) == 2:
                    element, score = item
                else:
                    element, score = item, float(item)
                
                if len(heap) < query.k:
                    heapq.heappush(heap, (score, element))
                else:
                    if score > heap[0][0]:
                        heapq.heapreplace(heap, (score, element))
            
            tree_node.aggregated_topk = [(element, score) for score, element in heap]
        
        elif query.query_type == TopKType.MIN_K:
            # 使用堆找最小的K个
            heap = []
            for item in data_stream:
                if isinstance(item, tuple) and len(item) == 2:
                    element, score = item
                else:
                    element, score = item, float(item)
                
                neg_score = -score
                if len(heap) < query.k:
                    heapq.heappush(heap, (neg_score, element))
                else:
                    if neg_score > heap[0][0]:
                        heapq.heapreplace(heap, (neg_score, element))
            
            tree_node.aggregated_topk = [(element, -neg_score) for neg_score, element in heap]
        
        elif query.query_type == TopKType.FREQUENT_K:
            # 计算频率
            from collections import Counter
            counter = Counter(data_stream)
            top_items = counter.most_common(query.k)
            tree_node.aggregated_topk = [(item, count) for item, count in top_items]
        
        # 更新统计
        self.stats["operations"] += 1
        self.stats["comparisons"] += len(data_stream) * math.log2(query.k) if query.k > 1 else len(data_stream)
    
    async def _bottom_up_aggregation(self, query: TopKQuery):
        """自底向上聚合"""
        k = query.k
        
        # 从最底层开始向上聚合
        for level in range(self.levels - 1, -1, -1):
            nodes_in_level = [
                node for node in self.tree_structure.values() 
                if node.level == level and node.child_ids
            ]
            
            tasks = []
            for tree_node in nodes_in_level:
                task = asyncio.create_task(
                    self._aggregate_children(tree_node, query, k)
                )
                tasks.append(task)
            
            await asyncio.gather(*tasks)
            
            # 更新网络轮次
            self.stats["network_rounds"] += 1
    
    async def _aggregate_children(self, parent_node: TreeNode, query: TopKQuery, k: int):
        """聚合子节点的结果"""
        # 收集所有子节点的Top K
        all_candidates = []
        
        for child_id in parent_node.child_ids:
            child_node = self.tree_structure[child_id]
            all_candidates.extend(child_node.aggregated_topk)
        
        # 加上父节点自己的数据(如果有)
        if parent_node.aggregated_topk:
            all_candidates.extend(parent_node.aggregated_topk)
        
        # 聚合得到Top K
        if query.query_type == TopKType.MAX_K:
            # 找最大的K个
            heap = []
            for element, score in all_candidates:
                if len(heap) < k:
                    heapq.heappush(heap, (score, element))
                else:
                    if score > heap[0][0]:
                        heapq.heapreplace(heap, (score, element))
            
            parent_node.aggregated_topk = [(element, score) for score, element in heap]
        
        elif query.query_type == TopKType.MIN_K:
            # 找最小的K个
            heap = []
            for element, score in all_candidates:
                neg_score = -score
                if len(heap) < k:
                    heapq.heappush(heap, (neg_score, element))
                else:
                    if neg_score > heap[0][0]:
                        heapq.heapreplace(heap, (neg_score, element))
            
            parent_node.aggregated_topk = [(element, -neg_score) for neg_score, element in heap]
        
        elif query.query_type == TopKType.FREQUENT_K:
            # 合并频率
            freq_counter = defaultdict(int)
            for element, frequency in all_candidates:
                freq_counter[element] += frequency
            
            # 取频率最高的K个
            top_items = sorted(freq_counter.items(), key=lambda x: x[1], reverse=True)[:k]
            parent_node.aggregated_topk = top_items
        
        # 更新网络传输统计
        self.stats["data_transferred_mb"] += len(all_candidates) * 8 / (1024 * 1024)
    
    def get_complexity_analysis(self) -> Dict[str, Any]:
        """获取复杂度分析"""
        n_nodes = len(self.tree_structure)
        levels = self.levels
        
        analysis = {
            "tree_height": levels,
            "total_nodes": n_nodes,
            "network_rounds": self.stats["network_rounds"],
            "total_comparisons": self.stats["comparisons"],
            "data_transferred_mb": self.stats["data_transferred_mb"]
        }
        
        # 理论复杂度
        analysis["time_complexity"] = f"O((n/m) * log(k) + log(m) * k)"
        analysis["space_complexity"] = f"O(k * log(m))"
        analysis["network_complexity"] = f"O(k * log(m))"
        
        return analysis
    
    def get_network_cost(self) -> Dict[str, float]:
        """获取网络成本"""
        levels = self.levels
        
        # assume 8 bytes per element
        element_size = 8

        # rough estimate: per level, about half of the nodes each forward up to
        # batch_size candidate elements to their parent (k is not tracked on the instance)
        bytes_per_level = len(self.tree_structure) * self.config.batch_size * element_size / 2
        
        total_bytes = levels * bytes_per_level
        
        return {
            "total_mb": total_bytes / (1024 * 1024),
            "levels": levels,
            "bytes_per_level_mb": bytes_per_level / (1024 * 1024)
        }

class OptimizedTreeTopK(TreeAggregationTopK):
    """优化的树形聚合算法"""
    
    def __init__(self, config: DistributedTopKConfig):
        super().__init__(config)
        self.adaptive_fanout = True
        self.compression_enabled = True
        self.bloom_filter_enabled = False
        
    def _build_aggregation_tree(self, num_nodes: int):
        """构建优化的聚合树"""
        # 自适应调整扇出
        if self.adaptive_fanout:
            # 根据节点数量调整扇出
            if num_nodes <= 16:
                fanout = 2  # 二叉树
            elif num_nodes <= 64:
                fanout = 4  # 四叉树
            elif num_nodes <= 256:
                fanout = 8  # 八叉树
            else:
                fanout = 16  # 十六叉树
        else:
            fanout = 2  # 默认二叉树
        
        # 计算树的高度
        self.levels = math.ceil(math.log(num_nodes, fanout))
        
        # 构建多叉树
        node_id = 0
        parent_queue = []
        
        # 创建根节点
        root = TreeNode(node_id=node_id, level=0)
        self.tree_structure[node_id] = root
        parent_queue.append(root)
        node_id += 1
        
        # 层次构建
        current_level = 0
        while node_id < num_nodes and parent_queue:
            next_parents = []
            
            for parent in parent_queue:
                # 为父节点添加子节点
                for _ in range(fanout):
                    if node_id >= num_nodes:
                        break
                    
                    child = TreeNode(
                        node_id=node_id,
                        parent_id=parent.node_id,
                        level=current_level + 1
                    )
                    
                    parent.child_ids.append(node_id)
                    self.tree_structure[node_id] = child
                    next_parents.append(child)
                    
                    node_id += 1
                
                if node_id >= num_nodes:
                    break
            
            parent_queue = next_parents
            current_level += 1
        
        self.levels = current_level
        self.root_id = 0
    
    async def _aggregate_children(self, parent_node: TreeNode, query: TopKQuery, k: int):
        """优化的聚合方法"""
        # 收集子节点数据
        child_results = []
        total_elements = 0
        
        for child_id in parent_node.child_ids:
            child_node = self.tree_structure[child_id]
            child_results.append(child_node.aggregated_topk)
            total_elements += len(child_node.aggregated_topk)
        
        # 如果数据量很大,使用优化策略
        if total_elements > k * 10 and self.compression_enabled:
            # 使用阈值剪枝减少数据量
            threshold = self._estimate_aggregation_threshold(child_results, query, k)
            pruned_results = self._prune_by_threshold(child_results, query, threshold, k)
            
            # 合并剪枝后的结果
            all_candidates = []
            for pruned in pruned_results:
                all_candidates.extend(pruned)
        else:
            # 直接合并所有结果
            all_candidates = []
            for result in child_results:
                all_candidates.extend(result)
        
        # 加上父节点自己的数据
        if parent_node.aggregated_topk:
            all_candidates.extend(parent_node.aggregated_topk)
        
        # 使用优化的Top K算法
        if len(all_candidates) > k * 100:
            # 数据量很大,使用选择算法
            parent_node.aggregated_topk = self._quickselect_topk(all_candidates, query, k)
        else:
            # 数据量适中,使用堆算法
            parent_node.aggregated_topk = self._heap_topk(all_candidates, query, k)
        
        # 更新统计
        transferred_elements = sum(len(result) for result in child_results)
        self.stats["data_transferred_mb"] += transferred_elements * 8 / (1024 * 1024)
    
    def _estimate_aggregation_threshold(
        self, 
        child_results: List[List[Tuple[Any, float]]], 
        query: TopKQuery, 
        k: int
    ) -> float:
        """估计聚合阈值"""
        # 采样估计
        sample_size = min(1000, sum(len(result) for result in child_results))
        samples = []
        
        for result in child_results:
            for element, score in result:
                if len(samples) < sample_size:
                    samples.append(score)
                else:
                    if random.random() < sample_size / (len(samples) + 1):
                        idx = random.randint(0, sample_size - 1)
                        samples[idx] = score
        
        if not samples:
            if query.query_type == TopKType.MAX_K:
                return float('-inf')
            else:
                return float('inf')
        
        # 根据查询类型计算阈值
        if query.query_type == TopKType.MAX_K:
            # 估计第K大的阈值
            samples.sort(reverse=True)
            threshold_idx = min(k, len(samples)) - 1
            return samples[threshold_idx] if threshold_idx >= 0 else samples[0]
        else:
            # 估计第K小的阈值
            samples.sort()
            threshold_idx = min(k, len(samples)) - 1
            return samples[threshold_idx] if threshold_idx >= 0 else samples[0]
    
    def _prune_by_threshold(
        self, 
        child_results: List[List[Tuple[Any, float]]], 
        query: TopKQuery, 
        threshold: float, 
        k: int
    ) -> List[List[Tuple[Any, float]]]:
        """根据阈值剪枝"""
        pruned_results = []
        
        for result in child_results:
            if query.query_type == TopKType.MAX_K:
                # 保留大于阈值的元素
                pruned = [(elem, score) for elem, score in result if score > threshold]
            else:
                # 保留小于阈值的元素
                pruned = [(elem, score) for elem, score in result if score < threshold]
            
            # 如果剪枝后仍然很多,只保留Top K
            if len(pruned) > k * 2:
                if query.query_type == TopKType.MAX_K:
                    pruned.sort(key=lambda x: x[1], reverse=True)
                else:
                    pruned.sort(key=lambda x: x[1])
                pruned = pruned[:k]
            
            pruned_results.append(pruned)
        
        return pruned_results
    
    def _heap_topk(
        self, 
        candidates: List[Tuple[Any, float]], 
        query: TopKQuery, 
        k: int
    ) -> List[Tuple[Any, float]]:
        """使用堆算法找Top K"""
        if query.query_type == TopKType.MAX_K:
            heap = []
            for element, score in candidates:
                if len(heap) < k:
                    heapq.heappush(heap, (score, element))
                else:
                    if score > heap[0][0]:
                        heapq.heapreplace(heap, (score, element))
            
            return [(element, score) for score, element in heap]
        
        else:  # MIN_K
            heap = []
            for element, score in candidates:
                neg_score = -score
                if len(heap) < k:
                    heapq.heappush(heap, (neg_score, element))
                else:
                    if neg_score > heap[0][0]:
                        heapq.heapreplace(heap, (neg_score, element))
            
            return [(element, -neg_score) for neg_score, element in heap]
    
    def _quickselect_topk(
        self, 
        candidates: List[Tuple[Any, float]], 
        query: TopKQuery, 
        k: int
    ) -> List[Tuple[Any, float]]:
        """使用快速选择算法找Top K"""
        if len(candidates) <= k:
            if query.query_type == TopKType.MAX_K:
                return sorted(candidates, key=lambda x: x[1], reverse=True)
            else:
                return sorted(candidates, key=lambda x: x[1])
        
        # 提取分数
        scores = [score for _, score in candidates]
        elements = [elem for elem, _ in candidates]
        
        if query.query_type == TopKType.MAX_K:
            # 找第K大的分数
            if k == 1:
                # 直接找最大值
                max_idx = max(range(len(scores)), key=lambda i: scores[i])
                return [(elements[max_idx], scores[max_idx])]
            
            # 使用快速选择找第K大的
            kth_score = self._quickselect(scores, k, largest=True)
            
            # 收集大于等于第K大的元素
            result = []
            for elem, score in zip(elements, scores):
                if score >= kth_score:
                    result.append((elem, score))
            
            # 排序并取前K个
            result.sort(key=lambda x: x[1], reverse=True)
            return result[:k]
        
        else:
            # 找第K小的分数
            if k == 1:
                min_idx = min(range(len(scores)), key=lambda i: scores[i])
                return [(elements[min_idx], scores[min_idx])]
            
            kth_score = self._quickselect(scores, k, largest=False)
            
            # 收集小于等于第K小的元素
            result = []
            for elem, score in zip(elements, scores):
                if score <= kth_score:
                    result.append((elem, score))
            
            result.sort(key=lambda x: x[1])
            return result[:k]
    
    def _quickselect(self, arr: List[float], k: int, largest: bool = True) -> float:
        """快速选择算法"""
        import random
        
        def select(arr, left, right, k):
            if left == right:
                return arr[left]
            
            pivot_idx = random.randint(left, right)
            pivot_val = arr[pivot_idx]
            
            # 分区
            arr[pivot_idx], arr[right] = arr[right], arr[pivot_idx]
            store_idx = left
            
            for i in range(left, right):
                if largest:
                    if arr[i] > pivot_val:
                        arr[store_idx], arr[i] = arr[i], arr[store_idx]
                        store_idx += 1
                else:
                    if arr[i] < pivot_val:
                        arr[store_idx], arr[i] = arr[i], arr[store_idx]
                        store_idx += 1
            
            arr[store_idx], arr[right] = arr[right], arr[store_idx]
            
            if store_idx == k:
                return arr[store_idx]
            elif store_idx < k:
                return select(arr, store_idx + 1, right, k)
            else:
                return select(arr, left, store_idx - 1, k)
        
        k_idx = k - 1  # 第K大/第K小都对应0索引下的第k-1个位置
        return select(arr, 0, len(arr) - 1, k_idx)
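
上面的 _quickselect 嵌在类方法里。下面补充一个可独立运行的最小示例(示意性质,函数 quickselect_kth 为本文自拟命名,不属于上文类的实现),演示同样的"随机基准 + 分区"快速选择思想,并用排序结果做对照:

python

import random

def quickselect_kth(values, k, largest=True):
    """返回第k大(largest=True)或第k小的值,k从1开始,平均时间复杂度O(n)"""
    arr = list(values)              # 拷贝一份,避免修改调用方数据
    left, right = 0, len(arr) - 1
    target = k - 1                  # 转成0索引位置
    while left < right:
        # 随机选基准并移到最右端
        pivot_idx = random.randint(left, right)
        arr[pivot_idx], arr[right] = arr[right], arr[pivot_idx]
        pivot_val = arr[right]
        store = left
        for i in range(left, right):
            # largest=True时把比基准大的放左边,反之把比基准小的放左边
            keep_left = arr[i] > pivot_val if largest else arr[i] < pivot_val
            if keep_left:
                arr[store], arr[i] = arr[i], arr[store]
                store += 1
        arr[store], arr[right] = arr[right], arr[store]
        if store == target:
            return arr[store]
        elif store < target:
            left = store + 1
        else:
            right = store - 1
    return arr[left]

data = [3.2, 7.5, 1.1, 9.8, 4.4, 6.6, 2.0]
print(quickselect_kth(data, 3))           # 输出 6.6,即第3大
print(sorted(data, reverse=True)[2])      # 对照:同样是 6.6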

2.3 MapReduce风格算法

python

"""
MapReduce风格的分布式Top K算法
"""
from typing import List, Tuple, Any, Dict, Iterator, Optional
from dataclasses import dataclass
import asyncio
import heapq
import collections
import hashlib
import math

@dataclass
class MapReduceJob:
    """MapReduce作业"""
    job_id: str
    input_data: List[Any]
    mapper_count: int
    reducer_count: int
    k_value: int
    query_type: TopKType

class MapReduceTopK(DistributedTopKAlgorithm):
    """MapReduce风格的Top K算法"""
    
    def __init__(self, config: DistributedTopKConfig):
        super().__init__(config)
        self.jobs = {}
        self.intermediate_data = {}
        
    async def compute_top_k(
        self, 
        query: TopKQuery,
        data_streams: List[List[Any]]
    ) -> List[Tuple[Any, float]]:
        """
        MapReduce风格算法
        
        步骤:
        1. Map阶段:每个节点计算本地Top K
        2. Shuffle阶段:按分区键分发数据
        3. Reduce阶段:每个Reducer计算部分Top K
        4. Final阶段:合并所有Reducer的结果
        """
        
        query.validate()
        
        # 创建MapReduce作业
        job_id = hashlib.md5(str(query).encode()).hexdigest()[:8]
        job = MapReduceJob(
            job_id=job_id,
            input_data=data_streams,
            mapper_count=len(data_streams),
            reducer_count=max(1, min(self.config.num_nodes // 2, 10)),  # 至少保留1个Reducer
            k_value=query.k,
            query_type=query.query_type
        )
        self.jobs[job_id] = job
        
        # 阶段1: Map阶段
        intermediate = await self._map_phase(job, data_streams)
        
        # 阶段2: Shuffle阶段
        shuffled_data = self._shuffle_phase(intermediate, job.reducer_count)
        
        # 阶段3: Reduce阶段
        reduced_results = await self._reduce_phase(shuffled_data, job)
        
        # 阶段4: Final阶段
        final_result = await self._final_phase(reduced_results, job)
        
        return final_result
    
    async def _map_phase(
        self, 
        job: MapReduceJob, 
        data_streams: List[List[Any]]
    ) -> List[List[Tuple[str, Tuple[Any, float]]]]:
        """Map阶段:每个Mapper计算本地Top K"""
        intermediate_results = []
        
        tasks = []
        for mapper_id, data_stream in enumerate(data_streams):
            task = asyncio.create_task(
                self._mapper_function(mapper_id, data_stream, job)
            )
            tasks.append(task)
        
        mapper_outputs = await asyncio.gather(*tasks)
        
        # 收集所有Mapper的输出
        for output in mapper_outputs:
            intermediate_results.append(output)
        
        # 更新统计
        self.stats["operations"] += len(data_streams)
        
        return intermediate_results
    
    async def _mapper_function(
        self, 
        mapper_id: int, 
        data_stream: List[Any], 
        job: MapReduceJob
    ) -> List[Tuple[str, Tuple[Any, float]]]:
        """Mapper函数:计算本地Top K并输出"""
        # 计算本地Top K
        if job.query_type == TopKType.MAX_K:
            # 找最大的K个
            heap = []
            for item in data_stream:
                if isinstance(item, tuple) and len(item) == 2:
                    element, score = item
                else:
                    element, score = item, float(item)
                
                if len(heap) < job.k_value:
                    heapq.heappush(heap, (score, element))
                else:
                    if score > heap[0][0]:
                        heapq.heapreplace(heap, (score, element))
            
            local_topk = [(element, score) for score, element in heap]
        
        elif job.query_type == TopKType.MIN_K:
            # 找最小的K个
            heap = []
            for item in data_stream:
                if isinstance(item, tuple) and len(item) == 2:
                    element, score = item
                else:
                    element, score = item, float(item)
                
                neg_score = -score
                if len(heap) < job.k_value:
                    heapq.heappush(heap, (neg_score, element))
                else:
                    if neg_score > heap[0][0]:
                        heapq.heapreplace(heap, (neg_score, element))
            
            local_topk = [(element, -neg_score) for neg_score, element in heap]
        
        elif job.query_type == TopKType.FREQUENT_K:
            # 计算频率
            counter = collections.Counter(data_stream)
            top_items = counter.most_common(job.k_value)
            local_topk = [(item, count) for item, count in top_items]
        
        else:
            # 其余查询类型暂未实现,显式报错,避免local_topk未定义
            raise NotImplementedError(f"Unsupported query type: {job.query_type}")
        
        # 为每个元素生成分区键
        mapper_output = []
        for element, score in local_topk:
            # 使用元素哈希确定Reducer
            if isinstance(element, str):
                element_str = element
            else:
                element_str = str(element)
            
            partition_key = self._get_partition_key(element_str, job.reducer_count)
            mapper_output.append((partition_key, (element, score)))
        
        # 更新统计
        self.stats["comparisons"] += len(data_stream) * math.log2(job.k_value) if job.k_value > 1 else len(data_stream)
        
        return mapper_output
    
    def _get_partition_key(self, element: str, num_partitions: int) -> str:
        """获取分区键"""
        # 使用哈希确定分区
        hash_val = int(hashlib.md5(element.encode()).hexdigest(), 16)
        partition_id = hash_val % num_partitions
        return f"partition_{partition_id}"
    
    def _shuffle_phase(
        self, 
        intermediate_results: List[List[Tuple[str, Tuple[Any, float]]]], 
        num_reducers: int
    ) -> Dict[str, List[Tuple[Any, float]]]:
        """Shuffle阶段:按分区键分组数据"""
        # 初始化分区
        partitions = {f"partition_{i}": [] for i in range(num_reducers)}
        
        # 收集所有Mapper的输出
        for mapper_output in intermediate_results:
            for partition_key, (element, score) in mapper_output:
                if partition_key in partitions:
                    partitions[partition_key].append((element, score))
                else:
                    # 如果分区键不在预定义的分区中,使用默认分区
                    default_partition = f"partition_{int(hashlib.md5(partition_key.encode()).hexdigest(), 16) % num_reducers}"
                    partitions[default_partition].append((element, score))
        
        # 更新网络传输统计
        total_elements = sum(len(data) for data in partitions.values())
        self.stats["data_transferred_mb"] += total_elements * 8 / (1024 * 1024)
        self.stats["network_rounds"] += 1
        
        return partitions
    
    async def _reduce_phase(
        self, 
        shuffled_data: Dict[str, List[Tuple[Any, float]]], 
        job: MapReduceJob
    ) -> Dict[str, List[Tuple[Any, float]]]:
        """Reduce阶段:每个Reducer计算部分Top K"""
        reduced_results = {}
        
        tasks = []
        for partition_key, partition_data in shuffled_data.items():
            task = asyncio.create_task(
                self._reducer_function(partition_key, partition_data, job)
            )
            tasks.append(task)
        
        reducer_outputs = await asyncio.gather(*tasks)
        
        # 收集Reducer输出
        for partition_key, result in reducer_outputs:
            reduced_results[partition_key] = result
        
        return reduced_results
    
    async def _reducer_function(
        self, 
        partition_key: str, 
        partition_data: List[Tuple[Any, float]], 
        job: MapReduceJob
    ) -> Tuple[str, List[Tuple[Any, float]]]:
        """Reducer函数:计算分区的Top K"""
        if job.query_type == TopKType.MAX_K:
            # 找分区中最大的K个
            heap = []
            for element, score in partition_data:
                if len(heap) < job.k_value:
                    heapq.heappush(heap, (score, element))
                else:
                    if score > heap[0][0]:
                        heapq.heapreplace(heap, (score, element))
            
            partition_topk = [(element, score) for score, element in heap]
        
        elif job.query_type == TopKType.MIN_K:
            # 找分区中最小的K个
            heap = []
            for element, score in partition_data:
                neg_score = -score
                if len(heap) < job.k_value:
                    heapq.heappush(heap, (neg_score, element))
                else:
                    if neg_score > heap[0][0]:
                        heapq.heapreplace(heap, (neg_score, element))
            
            partition_topk = [(element, -neg_score) for neg_score, element in heap]
        
        elif job.query_type == TopKType.FREQUENT_K:
            # 合并频率
            freq_counter = collections.defaultdict(int)
            for element, frequency in partition_data:
                freq_counter[element] += frequency
            
            # 取频率最高的K个
            top_items = sorted(freq_counter.items(), key=lambda x: x[1], reverse=True)[:job.k_value]
            partition_topk = top_items
        
        return (partition_key, partition_topk)
    
    async def _final_phase(
        self, 
        reduced_results: Dict[str, List[Tuple[Any, float]]], 
        job: MapReduceJob
    ) -> List[Tuple[Any, float]]:
        """Final阶段:合并所有Reducer的结果"""
        # 收集所有Reducer的输出
        all_candidates = []
        for partition_topk in reduced_results.values():
            all_candidates.extend(partition_topk)
        
        # 计算最终的Top K
        if job.query_type == TopKType.MAX_K:
            # 找全局最大的K个
            heap = []
            for element, score in all_candidates:
                if len(heap) < job.k_value:
                    heapq.heappush(heap, (score, element))
                else:
                    if score > heap[0][0]:
                        heapq.heapreplace(heap, (score, element))
            
            final_result = [(element, score) for score, element in heap]
        
        elif job.query_type == TopKType.MIN_K:
            # 找全局最小的K个
            heap = []
            for element, score in all_candidates:
                neg_score = -score
                if len(heap) < job.k_value:
                    heapq.heappush(heap, (neg_score, element))
                else:
                    if neg_score > heap[0][0]:
                        heapq.heapreplace(heap, (neg_score, element))
            
            final_result = [(element, -neg_score) for neg_score, element in heap]
        
        elif job.query_type == TopKType.FREQUENT_K:
            # 合并所有分区的频率
            freq_counter = collections.defaultdict(int)
            for element, frequency in all_candidates:
                freq_counter[element] += frequency
            
            # 取全局频率最高的K个
            final_result = sorted(freq_counter.items(), key=lambda x: x[1], reverse=True)[:job.k_value]
        
        # 更新网络传输统计
        total_elements = sum(len(data) for data in reduced_results.values())
        self.stats["data_transferred_mb"] += total_elements * 8 / (1024 * 1024)
        self.stats["network_rounds"] += 1
        
        return final_result
    
    def get_complexity_analysis(self) -> Dict[str, Any]:
        """获取复杂度分析"""
        analysis = {
            "total_data_transferred_mb": self.stats["data_transferred_mb"],
            "network_rounds": self.stats["network_rounds"],
            "total_comparisons": self.stats["comparisons"],
            "operations": self.stats["operations"]
        }
        
        # 理论复杂度:n为数据总量,m为Mapper/节点数,k为K值
        analysis["time_complexity"] = "O((n/m) * log(k) + m * k)"
        analysis["space_complexity"] = "O(m * k)"
        analysis["network_complexity"] = "O(m * k)"
        
        return analysis
    
    def get_network_cost(self) -> Dict[str, float]:
        """获取网络成本"""
        return {
            "total_mb": self.stats["data_transferred_mb"],
            "shuffle_phase_mb": self.stats["data_transferred_mb"] * 0.6,  # 估算
            "reduce_phase_mb": self.stats["data_transferred_mb"] * 0.4   # 估算
        }

class OptimizedMapReduceTopK(MapReduceTopK):
    """优化的MapReduce Top K算法"""
    
    def __init__(self, config: DistributedTopKConfig):
        super().__init__(config)
        self.combiner_enabled = True
        self.compression_enabled = True
        self.skew_handling_enabled = True
    
    async def _mapper_function(
        self, 
        mapper_id: int, 
        data_stream: List[Any], 
        job: MapReduceJob
    ) -> List[Tuple[str, Tuple[Any, float]]]:
        """优化的Mapper函数:使用Combiner减少输出"""
        # 计算本地Top K
        local_topk = await super()._mapper_function(mapper_id, data_stream, job)
        
        # Combiner:在Mapper端先进行局部聚合
        if self.combiner_enabled and job.query_type == TopKType.FREQUENT_K:
            # 对于频率统计,在Mapper端先聚合
            combined = self._combine_mapper_output(local_topk, job.k_value, job.reducer_count)
            local_topk = combined
        
        # 压缩输出
        if self.compression_enabled and len(local_topk) > job.k_value * 2:
            # 只保留最重要的元素
            if job.query_type == TopKType.MAX_K:
                local_topk.sort(key=lambda x: x[1][1], reverse=True)  # 按分数排序
            elif job.query_type == TopKType.MIN_K:
                local_topk.sort(key=lambda x: x[1][1])  # 按分数排序
            
            local_topk = local_topk[:job.k_value * 2]  # 保留2倍K个元素
        
        return local_topk
    
    def _combine_mapper_output(
        self, 
        mapper_output: List[Tuple[str, Tuple[Any, float]]], 
        k: int,
        num_partitions: int
    ) -> List[Tuple[str, Tuple[Any, float]]]:
        """Combiner:在Mapper端聚合频率"""
        # 按元素聚合频率
        freq_by_element = collections.defaultdict(int)
        
        for partition_key, (element, frequency) in mapper_output:
            freq_by_element[element] += frequency
        
        # 取频率最高的K个
        top_elements = sorted(freq_by_element.items(), key=lambda x: x[1], reverse=True)[:k]
        
        # 重新分配分区
        combined_output = []
        for element, frequency in top_elements:
            partition_key = self._get_partition_key(str(element), num_partitions)  # 与Shuffle阶段的Reducer数保持一致
            combined_output.append((partition_key, (element, frequency)))
        
        return combined_output
    
    def _shuffle_phase(
        self, 
        intermediate_results: List[List[Tuple[str, Tuple[Any, float]]]], 
        num_reducers: int
    ) -> Dict[str, List[Tuple[Any, float]]]:
        """优化的Shuffle:处理数据倾斜"""
        partitions = super()._shuffle_phase(intermediate_results, num_reducers)
        
        # 处理数据倾斜
        if self.skew_handling_enabled:
            partitions = self._handle_data_skew(partitions, num_reducers)
        
        return partitions
    
    def _handle_data_skew(
        self, 
        partitions: Dict[str, List[Tuple[Any, float]]], 
        num_reducers: int
    ) -> Dict[str, List[Tuple[Any, float]]]:
        """处理数据倾斜:重新平衡分区"""
        # 计算每个分区的大小
        partition_sizes = {key: len(data) for key, data in partitions.items()}
        
        # 计算平均大小
        avg_size = sum(partition_sizes.values()) / len(partition_sizes) if partition_sizes else 0
        
        # 识别倾斜的分区(大小超过平均值的2倍)
        skewed_partitions = []
        for key, size in partition_sizes.items():
            if size > avg_size * 2:
                skewed_partitions.append((key, size))
        
        # 对倾斜的分区进行分裂
        for key, size in skewed_partitions:
            if size > avg_size * 3:
                # 分裂为多个子分区
                data = partitions[key]
                split_count = min(int(size / avg_size) + 1, 4)  # 最多分裂为4个
                
                # 移除原分区
                del partitions[key]
                
                # 创建新分区
                split_size = len(data) // split_count
                for i in range(split_count):
                    start_idx = i * split_size
                    end_idx = start_idx + split_size if i < split_count - 1 else len(data)
                    
                    new_key = f"{key}_split_{i}"
                    partitions[new_key] = data[start_idx:end_idx]
        
        return partitions
    
    async def _reducer_function(
        self, 
        partition_key: str, 
        partition_data: List[Tuple[Any, float]], 
        job: MapReduceJob
    ) -> Tuple[str, List[Tuple[Any, float]]]:
        """优化的Reducer:使用增量处理"""
        # 如果数据量很大,使用增量处理
        if len(partition_data) > job.k_value * 100:
            return await self._incremental_reducer(partition_key, partition_data, job)
        else:
            return await super()._reducer_function(partition_key, partition_data, job)
    
    async def _incremental_reducer(
        self, 
        partition_key: str, 
        partition_data: List[Tuple[Any, float]], 
        job: MapReduceJob
    ) -> Tuple[str, List[Tuple[Any, float]]]:
        """增量Reducer:分批处理大数据"""
        batch_size = self.config.batch_size
        
        # 初始化堆
        if job.query_type == TopKType.MAX_K:
            heap = []  # 最小堆,保留最大的K个
            process_func = self._process_max_batch
        elif job.query_type == TopKType.MIN_K:
            heap = []  # 最大堆(通过负分数实现),保留最小的K个
            process_func = self._process_min_batch
        else:
            return await super()._reducer_function(partition_key, partition_data, job)
        
        # 分批处理
        for i in range(0, len(partition_data), batch_size):
            batch = partition_data[i:i + batch_size]
            heap = process_func(heap, batch, job.k_value)
        
        # 转换为结果格式
        if job.query_type == TopKType.MAX_K:
            partition_topk = [(element, score) for score, element in heap]
        else:
            partition_topk = [(element, -neg_score) for neg_score, element in heap]
        
        return (partition_key, partition_topk)
    
    def _process_max_batch(
        self, 
        heap: List[Tuple[float, Any]], 
        batch: List[Tuple[Any, float]], 
        k: int
    ) -> List[Tuple[float, Any]]:
        """处理一批数据(最大K个)"""
        for element, score in batch:
            if len(heap) < k:
                heapq.heappush(heap, (score, element))
            else:
                if score > heap[0][0]:
                    heapq.heapreplace(heap, (score, element))
        return heap
    
    def _process_min_batch(
        self, 
        heap: List[Tuple[float, Any]], 
        batch: List[Tuple[Any, float]], 
        k: int
    ) -> List[Tuple[float, Any]]:
        """处理一批数据(最小K个)"""
        for element, score in batch:
            neg_score = -score
            if len(heap) < k:
                heapq.heappush(heap, (neg_score, element))
            else:
                if neg_score > heap[0][0]:
                    heapq.heapreplace(heap, (neg_score, element))
        return heap

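下面给出一段调用示意(仅为草图,不属于正文实现):假设上文的 MapReduceTopK、DistributedTopKConfig、TopKQuery、TopKType 定义在同一模块中,且基类 DistributedTopKAlgorithm 负责初始化 stats 统计字典;节点数、数据规模以及 demo_mapreduce_topk 等名称和数值均为本示例自拟:

python

import asyncio
import random

async def demo_mapreduce_topk():
    # 模拟4个节点,每个节点持有1000条 (元素, 分数) 形式的本地数据
    data_streams = [
        [(f"item_{n}_{i}", random.uniform(0, 100)) for i in range(1000)]
        for n in range(4)
    ]
    config = DistributedTopKConfig(
        num_nodes=4,
        data_sharding="hash",
        replication_factor=1,
        consistency_level="eventual",
        algorithm="mapreduce",
    )
    query = TopKQuery(k=10, query_type=TopKType.MAX_K)

    algo = MapReduceTopK(config)
    top10 = await algo.compute_top_k(query, data_streams)

    # Final阶段返回的堆内容不保证有序,打印前先按分数排序
    for element, score in sorted(top10, key=lambda x: x[1], reverse=True):
        print(f"{element}: {score:.2f}")
    print(algo.get_complexity_analysis())

if __name__ == "__main__":
    asyncio.run(demo_mapreduce_topk())
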
2.4 基于分位数的算法

python

"""
基于分位数的分布式Top K算法
"""
import numpy as np
from typing import List, Tuple, Any, Dict, Optional
from dataclasses import dataclass
import asyncio
import heapq
import math
import random
from scipy import stats

class QuantileBasedTopK(DistributedTopKAlgorithm):
    """基于分位数的Top K算法"""
    
    def __init__(self, config: DistributedTopKConfig):
        super().__init__(config)
        self.quantile_history = []
        self.sample_stats = {}
        self.epsilon = 0.01  # 误差容忍度
        self.confidence = 0.95  # 置信水平
    
    async def compute_top_k(
        self, 
        query: TopKQuery,
        data_streams: List[List[Any]]
    ) -> List[Tuple[Any, float]]:
        """
        基于分位数的算法
        
        步骤:
        1. 采样估计数据分布
        2. 估计目标分位数
        3. 广播分位数阈值
        4. 收集超过阈值的数据
        5. 精确计算Top K
        """
        
        query.validate()
        k = query.k
        
        # 阶段1:采样估计全局分布
        global_samples = await self._collect_samples(data_streams)
        
        # 阶段2:估计目标分位数
        target_quantile = self._calculate_target_quantile(query, global_samples, data_streams)
        
        # 阶段3:迭代精化分位数估计
        threshold, candidates = await self._iterative_quantile_refinement(
            query, data_streams, target_quantile
        )
        
        # 阶段4:精确计算Top K
        final_result = self._exact_topk_from_candidates(candidates, query, k)
        
        return final_result
    
    async def _collect_samples(self, data_streams: List[List[Any]]) -> List[float]:
        """收集全局样本"""
        all_samples = []
        
        # 从每个节点采样
        for node_id, data_stream in enumerate(data_streams):
            if data_stream:
                # 采样率自适应
                sample_rate = min(0.1, 1000 / len(data_stream)) if len(data_stream) > 1000 else 1.0
                sample_size = int(len(data_stream) * sample_rate)
                
                if sample_size > 0:
                    # 随机采样
                    sampled = random.sample(data_stream, min(sample_size, 1000))
                    
                    # 提取分数
                    for item in sampled:
                        if isinstance(item, tuple) and len(item) == 2:
                            _, score = item
                        else:
                            score = float(item)
                        all_samples.append(score)
        
        # 记录采样统计
        self.sample_stats = {
            "total_samples": len(all_samples),
            "sample_rate": len(all_samples) / sum(len(stream) for stream in data_streams) if data_streams else 0,
            "sample_mean": np.mean(all_samples) if all_samples else 0,
            "sample_std": np.std(all_samples) if len(all_samples) > 1 else 0
        }
        
        return all_samples
    
    def _calculate_target_quantile(
        self, 
        query: TopKQuery, 
        samples: List[float], 
        data_streams: List[List[Any]]
    ) -> float:
        """计算目标分位数"""
        total_elements = sum(len(stream) for stream in data_streams)
        k = query.k
        
        if query.query_type == TopKType.MAX_K:
            # 对于最大的K个,需要找到第(N-K)个分位数
            # 即:1 - k/N
            target_quantile = 1 - (k / total_elements) if total_elements > 0 else 0.99
        else:  # MIN_K
            # 对于最小的K个,需要找到第K个分位数
            # 即:k/N
            target_quantile = k / total_elements if total_elements > 0 else 0.01
        
        # 使用样本估计分位数
        if samples:
            samples_sorted = sorted(samples)
            quantile_idx = int(target_quantile * len(samples_sorted))
            quantile_idx = max(0, min(quantile_idx, len(samples_sorted) - 1))
            
            estimated_quantile = samples_sorted[quantile_idx]
            
            # 计算置信区间
            n_samples = len(samples)
            if n_samples > 100:
                # 使用正态近似计算置信区间
                z = stats.norm.ppf((1 + self.confidence) / 2)
                se = np.sqrt(target_quantile * (1 - target_quantile) / n_samples)
                margin = z * se * (max(samples) - min(samples)) if samples else 0
                
                self.sample_stats["quantile_estimate"] = estimated_quantile
                self.sample_stats["confidence_interval"] = (
                    estimated_quantile - margin, estimated_quantile + margin
                )
            
            return target_quantile
        
        return target_quantile
    
    async def _iterative_quantile_refinement(
        self, 
        query: TopKQuery, 
        data_streams: List[List[Any]], 
        target_quantile: float
    ) -> Tuple[float, List[Tuple[Any, float]]]:
        """迭代精化分位数估计"""
        iteration = 0
        max_iterations = 10
        current_threshold = float('-inf') if query.query_type == TopKType.MAX_K else float('inf')
        
        all_candidates = []
        
        while iteration < max_iterations:
            iteration += 1
            
            # 收集超过当前阈值的数据
            iteration_candidates = await self._collect_candidates_above_threshold(
                query, data_streams, current_threshold
            )
            
            # 合并到总候选集
            all_candidates.extend(iteration_candidates)
            
            # 去重(按元素):MAX_K保留较大分数,MIN_K保留较小分数
            keep_larger = query.query_type == TopKType.MAX_K
            candidate_dict = {}
            for element, score in all_candidates:
                if (element not in candidate_dict
                        or (keep_larger and score > candidate_dict[element])
                        or (not keep_larger and score < candidate_dict[element])):
                    candidate_dict[element] = score
            
            all_candidates = list(candidate_dict.items())
            
            # 如果候选足够,可以尝试估计新阈值
            if len(all_candidates) >= query.k * 2:
                scores = [score for _, score in all_candidates]
                
                if query.query_type == TopKType.MAX_K:
                    # 估计第K大的分数
                    scores_sorted = sorted(scores, reverse=True)
                    new_threshold = scores_sorted[min(query.k, len(scores_sorted)) - 1]
                else:
                    # 估计第K小的分数
                    scores_sorted = sorted(scores)
                    new_threshold = scores_sorted[min(query.k, len(scores_sorted)) - 1]
                
                # 检查阈值变化
                self.quantile_history.append(new_threshold)
                
                if len(self.quantile_history) >= 2:
                    prev_threshold = self.quantile_history[-2]
                    threshold_change = abs(new_threshold - prev_threshold) / max(abs(prev_threshold), 1e-10)
                    
                    if threshold_change < self.epsilon:
                        # 阈值稳定,停止迭代
                        current_threshold = new_threshold
                        break
                
                current_threshold = new_threshold
            
            # 更新网络轮次
            self.stats["network_rounds"] += 1
        
        return current_threshold, all_candidates
    
    async def _collect_candidates_above_threshold(
        self, 
        query: TopKQuery, 
        data_streams: List[List[Any]], 
        threshold: float
    ) -> List[Tuple[Any, float]]:
        """收集超过阈值的数据"""
        all_candidates = []
        
        tasks = []
        for node_id, data_stream in enumerate(data_streams):
            task = asyncio.create_task(
                self._scan_node_data(node_id, data_stream, query, threshold)
            )
            tasks.append(task)
        
        node_results = await asyncio.gather(*tasks)
        
        for candidates in node_results:
            all_candidates.extend(candidates)
        
        # 更新统计
        self.stats["operations"] += len(data_streams)
        
        return all_candidates
    
    async def _scan_node_data(
        self, 
        node_id: int, 
        data_stream: List[Any], 
        query: TopKQuery, 
        threshold: float
    ) -> List[Tuple[Any, float]]:
        """扫描节点数据,找出超过阈值的元素"""
        candidates = []
        
        for item in data_stream:
            if isinstance(item, tuple) and len(item) == 2:
                element, score = item
            else:
                element, score = item, float(item)
            
            if query.query_type == TopKType.MAX_K:
                if score > threshold:
                    candidates.append((element, score))
            else:  # MIN_K
                if score < threshold:
                    candidates.append((element, score))
        
        # 如果候选太多,可以进一步过滤
        if len(candidates) > query.k * 10:
            # 只保留最好的候选
            if query.query_type == TopKType.MAX_K:
                candidates.sort(key=lambda x: x[1], reverse=True)
            else:
                candidates.sort(key=lambda x: x[1])
            
            candidates = candidates[:query.k * 5]
        
        return candidates
    
    def _exact_topk_from_candidates(
        self, 
        candidates: List[Tuple[Any, float]], 
        query: TopKQuery, 
        k: int
    ) -> List[Tuple[Any, float]]:
        """从候选集中精确计算Top K"""
        if query.query_type == TopKType.MAX_K:
            # 找最大的K个
            heap = []
            for element, score in candidates:
                if len(heap) < k:
                    heapq.heappush(heap, (score, element))
                else:
                    if score > heap[0][0]:
                        heapq.heapreplace(heap, (score, element))
            
            result = [(element, score) for score, element in heap]
            result.sort(key=lambda x: x[1], reverse=True)
        
        else:  # MIN_K
            # 找最小的K个
            heap = []
            for element, score in candidates:
                neg_score = -score
                if len(heap) < k:
                    heapq.heappush(heap, (neg_score, element))
                else:
                    if neg_score > heap[0][0]:
                        heapq.heapreplace(heap, (neg_score, element))
            
            result = [(element, -neg_score) for neg_score, element in heap]
            result.sort(key=lambda x: x[1])
        
        # 更新比较统计
        self.stats["comparisons"] += len(candidates) * math.log2(k) if k > 1 else len(candidates)
        
        return result
    
    def get_complexity_analysis(self) -> Dict[str, Any]:
        """获取复杂度分析"""
        analysis = {
            "total_samples": self.sample_stats.get("total_samples", 0),
            "sample_rate": self.sample_stats.get("sample_rate", 0),
            "iterations": len(self.quantile_history),
            "network_rounds": self.stats["network_rounds"],
            "total_comparisons": self.stats["comparisons"]
        }
        
        if "quantile_estimate" in self.sample_stats:
            analysis["quantile_estimate"] = self.sample_stats["quantile_estimate"]
            analysis["confidence_interval"] = self.sample_stats["confidence_interval"]
        
        # 理论复杂度:m为节点数,I为迭代轮数,S为样本总量
        analysis["time_complexity"] = "O(n/m * log(n/m) + I * m)"
        analysis["space_complexity"] = "O(k + S)"
        analysis["network_complexity"] = "O(I * m)"
        
        return analysis
    
    def get_network_cost(self) -> Dict[str, float]:
        """获取网络成本"""
        iterations = len(self.quantile_history)
        m = self.config.num_nodes
        
        # 每轮通信:阈值广播 + 候选数据上传
        threshold_broadcast = m * 8  # 阈值8字节
        # 以batch_size近似每个节点每轮上传的候选数
        candidates_per_node = self.config.batch_size
        candidates_upload = m * candidates_per_node * 8
        
        total_bytes = iterations * (threshold_broadcast + candidates_upload)
        
        return {
            "total_mb": total_bytes / (1024 * 1024),
            "iterations": iterations,
            "bytes_per_iteration_mb": (threshold_broadcast + candidates_upload) / (1024 * 1024)
        }

class AdaptiveQuantileTopK(QuantileBasedTopK):
    """自适应分位数算法"""
    
    def __init__(self, config: DistributedTopKConfig):
        super().__init__(config)
        self.histogram_buckets = 100
        self.adaptive_sampling = True
        self.error_bounds = {}
    
    async def _collect_samples(self, data_streams: List[List[Any]]) -> List[float]:
        """使用自适应采样收集样本"""
        if not self.adaptive_sampling:
            return await super()._collect_samples(data_streams)
        
        # 构建直方图来指导采样
        all_samples = []
        histogram = np.zeros(self.histogram_buckets)
        
        # 第一轮:快速采样,估计全局取值范围并构建粗略直方图
        quick_scores = []
        for data_stream in data_streams:
            if len(data_stream) > 1000:
                # 采样构建直方图
                quick_sample = random.sample(data_stream, min(100, len(data_stream)))
                
                # 提取分数
                for item in quick_sample:
                    if isinstance(item, tuple) and len(item) == 2:
                        _, score = item
                    else:
                        score = float(item)
                    quick_scores.append(score)
        
        # 用全局的最小/最大值做归一化,避免各节点取值范围不一致导致的偏差
        if quick_scores:
            min_val, max_val = min(quick_scores), max(quick_scores)
            if max_val > min_val:
                for score in quick_scores:
                    bucket = int((score - min_val) / (max_val - min_val) * (self.histogram_buckets - 1))
                    bucket = max(0, min(bucket, self.histogram_buckets - 1))
                    histogram[bucket] += 1
        
        # 识别数据密集区域
        total_count = np.sum(histogram)
        if total_count > 0:
            histogram_density = histogram / total_count
            
            # 第二轮:根据密度进行分层采样
            for data_stream in data_streams:
                if len(data_stream) > 100:
                    # 决定采样总数
                    sample_size = min(500, len(data_stream) // 10)
                    
                    # 分层采样
                    sampled = []
                    for item in data_stream:
                        if isinstance(item, tuple) and len(item) == 2:
                            _, score = item
                        else:
                            score = float(item)
                        
                        # 根据直方图密度决定采样概率
                        bucket = int((score - min_val) / (max_val - min_val) * (self.histogram_buckets - 1))
                        bucket = max(0, min(bucket, self.histogram_buckets - 1))
                        
                        # 密度越低,采样概率越高(为了更好覆盖)
                        density = histogram_density[bucket]
                        sampling_prob = max(0.01, 0.1 / (density + 0.01))
                        
                        if random.random() < sampling_prob and len(sampled) < sample_size:
                            sampled.append(score)
                    
                    all_samples.extend(sampled)
        
        self.sample_stats["histogram"] = histogram.tolist()
        self.sample_stats["total_samples"] = len(all_samples)
        
        return all_samples
    
    async def _iterative_quantile_refinement(
        self, 
        query: TopKQuery, 
        data_streams: List[List[Any]], 
        target_quantile: float
    ) -> Tuple[float, List[Tuple[Any, float]]]:
        """自适应迭代精化"""
        iteration = 0
        current_threshold = float('-inf') if query.query_type == TopKType.MAX_K else float('inf')
        
        all_candidates = []
        error_bound = float('inf')
        
        while iteration < 10 and error_bound > self.epsilon:
            iteration += 1
            
            # 收集候选
            candidates = await self._collect_candidates_above_threshold(
                query, data_streams, current_threshold
            )
            
            # 合并候选并更新总候选集:MAX_K保留较大分数,MIN_K保留较小分数
            keep_larger = query.query_type == TopKType.MAX_K
            candidate_dict = dict(all_candidates)
            for element, score in candidates:
                if (element not in candidate_dict
                        or (keep_larger and score > candidate_dict[element])
                        or (not keep_larger and score < candidate_dict[element])):
                    candidate_dict[element] = score
            
            all_candidates = list(candidate_dict.items())
            
            # 估计新阈值和误差界
            if len(all_candidates) >= query.k:
                scores = [score for _, score in all_candidates]
                
                if query.query_type == TopKType.MAX_K:
                    scores_sorted = sorted(scores, reverse=True)
                    new_threshold = scores_sorted[min(query.k, len(scores_sorted)) - 1]
                    
                    # 估计误差界
                    if len(scores_sorted) > query.k:
                        next_score = scores_sorted[query.k] if query.k < len(scores_sorted) else scores_sorted[-1]
                        error_bound = abs(new_threshold - next_score) / max(abs(new_threshold), 1)
                    else:
                        error_bound = 1.0  # 保守估计
                
                else:  # MIN_K
                    scores_sorted = sorted(scores)
                    new_threshold = scores_sorted[min(query.k, len(scores_sorted)) - 1]
                    
                    if len(scores_sorted) > query.k:
                        next_score = scores_sorted[query.k] if query.k < len(scores_sorted) else scores_sorted[-1]
                        error_bound = abs(new_threshold - next_score) / max(abs(new_threshold), 1)
                    else:
                        error_bound = 1.0
                
                # 记录误差
                self.error_bounds[iteration] = error_bound
                
                # 检查是否收敛
                if error_bound <= self.epsilon:
                    current_threshold = new_threshold
                    break
                
                # 自适应调整阈值
                if iteration > 1:
                    # 根据误差调整阈值
                    if error_bound > 0.1:  # 误差还很大
                        # 更激进的调整
                        adjustment = 1.2 if query.query_type == TopKType.MAX_K else 0.8
                    elif error_bound > 0.01:
                        adjustment = 1.05 if query.query_type == TopKType.MAX_K else 0.95
                    else:
                        adjustment = 1.01 if query.query_type == TopKType.MAX_K else 0.99
                    
                    current_threshold = new_threshold * adjustment
                else:
                    current_threshold = new_threshold
            
            self.stats["network_rounds"] += 1
        
        return current_threshold, all_candidates

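为了直观理解"先估分位数、再按阈值过滤"为什么能大幅减少网络传输,下面用一组假设的数值做一个简单估算(数据量、节点数均为示意,不来自正文):

python

# 假设:全局共N=1,000,000条数据,均匀分布在m=10个节点上,求最大的K=100个
total_elements = 1_000_000
m = 10
k = 100

# MAX_K对应的目标分位数:只需收集超过该分位数的元素
target_quantile = 1 - k / total_elements
print(f"目标分位数: {target_quantile:.4f}")                   # 0.9999

# 若各节点数据近似同分布,每个节点超过阈值的元素期望约为 k/m 个
print(f"每节点期望上传候选数: {k / m:.0f}")                    # 约10个
print(f"对比全量上传 n/m = {total_elements // m} 个")          # 100000个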

2.5 基于Gossip的算法

python

"""
基于Gossip的分布式Top K算法
"""
import random
import asyncio
from typing import List, Tuple, Any, Dict, Set, Optional
from dataclasses import dataclass, field
import heapq
from collections import defaultdict
import time

@dataclass
class GossipNode:
    """Gossip节点"""
    node_id: int
    local_data: List[Tuple[Any, float]]
    view: Dict[int, 'GossipNode'] = field(default_factory=dict)  # 邻居视图
    topk_cache: List[Tuple[Any, float]] = field(default_factory=list)  # Top K缓存
    rumor_count: int = 0  # 传播的谣言数量
    received_rumors: Set[str] = field(default_factory=set)  # 已接收的谣言ID
    
class GossipTopK(DistributedTopKAlgorithm):
    """基于Gossip的Top K算法"""
    
    def __init__(self, config: DistributedTopKConfig):
        super().__init__(config)
        self.gossip_nodes = {}
        self.rumor_history = defaultdict(list)
        self.convergence_history = []
        self.fanout = 3  # 每次传播的节点数
        self.gossip_interval = 0.1  # 传播间隔(秒)
        
    async def compute_top_k(
        self, 
        query: TopKQuery,
        data_streams: List[List[Any]]
    ) -> List[Tuple[Any, float]]:
        """
        基于Gossip的算法
        
        步骤:
        1. 初始化Gossip网络
        2. 每个节点维护本地Top K
        3. 定期与邻居交换Top K信息
        4. 经过多轮传播后,每个节点都有近似的全局Top K
        5. 选择一个节点收集最终结果
        """
        
        query.validate()
        k = query.k
        
        # 阶段1:初始化Gossip网络
        await self._initialize_gossip_network(data_streams, query)
        
        # 阶段2:执行Gossip传播
        await self._run_gossip_protocol(query, num_rounds=20)
        
        # 阶段3:收集和合并结果
        final_result = await self._collect_final_result(query)
        
        return final_result
    
    async def _initialize_gossip_network(
        self, 
        data_streams: List[List[Any]], 
        query: TopKQuery
    ):
        """初始化Gossip网络"""
        # 创建节点
        for node_id, data_stream in enumerate(data_streams):
            # 计算本地Top K
            local_topk = self._compute_local_topk(data_stream, query)
            
            node = GossipNode(
                node_id=node_id,
                local_data=local_topk,
                topk_cache=local_topk.copy()
            )
            
            self.gossip_nodes[node_id] = node
        
        # 建立邻居关系(随机图)
        self._build_neighbor_graph()
    
    def _compute_local_topk(
        self, 
        data_stream: List[Any], 
        query: TopKQuery
    ) -> List[Tuple[Any, float]]:
        """计算本地Top K"""
        k = query.k
        
        if query.query_type == TopKType.MAX_K:
            heap = []
            for item in data_stream:
                if isinstance(item, tuple) and len(item) == 2:
                    element, score = item
                else:
                    element, score = item, float(item)
                
                if len(heap) < k:
                    heapq.heappush(heap, (score, element))
                else:
                    if score > heap[0][0]:
                        heapq.heapreplace(heap, (score, element))
            
            return [(element, score) for score, element in heap]
        
        elif query.query_type == TopKType.MIN_K:
            heap = []
            for item in data_stream:
                if isinstance(item, tuple) and len(item) == 2:
                    element, score = item
                else:
                    element, score = item, float(item)
                
                neg_score = -score
                if len(heap) < k:
                    heapq.heappush(heap, (neg_score, element))
                else:
                    if neg_score > heap[0][0]:
                        heapq.heapreplace(heap, (neg_score, element))
            
            return [(element, -neg_score) for neg_score, element in heap]
        
        elif query.query_type == TopKType.FREQUENT_K:
            counter = defaultdict(int)
            for item in data_stream:
                counter[item] += 1
            
            top_items = sorted(counter.items(), key=lambda x: x[1], reverse=True)[:k]
            return top_items
    
    def _build_neighbor_graph(self):
        """构建邻居图"""
        node_ids = list(self.gossip_nodes.keys())
        
        for node_id, node in self.gossip_nodes.items():
            # 随机选择邻居
            possible_neighbors = [nid for nid in node_ids if nid != node_id]
            
            # 每个节点有fanout个邻居
            neighbor_count = min(self.fanout, len(possible_neighbors))
            neighbors = random.sample(possible_neighbors, neighbor_count)
            
            for neighbor_id in neighbors:
                node.view[neighbor_id] = self.gossip_nodes[neighbor_id]
    
    async def _run_gossip_protocol(self, query: TopKQuery, num_rounds: int):
        """执行Gossip协议"""
        convergence_check_interval = 5
        
        for round_num in range(num_rounds):
            start_time = time.time()
            
            # 每个节点进行Gossip传播
            tasks = []
            for node in self.gossip_nodes.values():
                task = asyncio.create_task(
                    self._gossip_round(node, query, round_num)
                )
                tasks.append(task)
            
            await asyncio.gather(*tasks)
            
            # 检查收敛
            if round_num % convergence_check_interval == 0:
                convergence = self._check_convergence(query)
                self.convergence_history.append(convergence)
                
                # 如果已经收敛,可以提前停止
                if convergence["converged"]:
                    print(f"Gossip converged at round {round_num}")
                    break
            
            # 控制传播速度
            elapsed = time.time() - start_time
            if elapsed < self.gossip_interval:
                await asyncio.sleep(self.gossip_interval - elapsed)
            
            # 更新统计
            self.stats["network_rounds"] += 1
    
    async def _gossip_round(self, node: GossipNode, query: TopKQuery, round_num: int):
        """单个节点的Gossip轮次"""
        # 选择要传播的邻居
        if not node.view:
            return
        
        neighbors = list(node.view.keys())
        selected_neighbors = random.sample(
            neighbors, 
            min(self.fanout, len(neighbors))
        )
        
        # 创建新的谣言
        rumor_id = f"{node.node_id}_{round_num}_{time.time()}"
        
        # 谣言内容:节点的当前Top K
        rumor = {
            "id": rumor_id,
            "source": node.node_id,
            "round": round_num,
            "topk": node.topk_cache,
            "timestamp": time.time()
        }
        
        # 传播给邻居
        for neighbor_id in selected_neighbors:
            if neighbor_id in self.gossip_nodes:
                neighbor = self.gossip_nodes[neighbor_id]
                
                # 避免重复接收
                if rumor_id not in neighbor.received_rumors:
                    await self._receive_rumor(neighbor, rumor, query)
                    neighbor.received_rumors.add(rumor_id)
        
        # 更新节点统计
        node.rumor_count += 1
        
        # 记录谣言历史
        self.rumor_history[round_num].append({
            "source": node.node_id,
            "rumor_id": rumor_id,
            "neighbors": selected_neighbors
        })
    
    async def _receive_rumor(self, node: GossipNode, rumor: Dict, query: TopKQuery):
        """节点接收谣言"""
        # 合并谣言中的Top K
        received_topk = rumor["topk"]
        
        # 合并到本地缓存
        merged = self._merge_topk(node.topk_cache, received_topk, query)
        
        # 更新缓存
        node.topk_cache = merged
        
        # 更新统计
        self.stats["operations"] += 1
    
    def _merge_topk(
        self, 
        local_topk: List[Tuple[Any, float]], 
        remote_topk: List[Tuple[Any, float]], 
        query: TopKQuery
    ) -> List[Tuple[Any, float]]:
        """合并两个Top K列表"""
        k = query.k  # 合并后仍只保留K个,避免被较短的本地缓存截断
        
        # 合并所有元素:同一元素重复出现时,MIN_K保留较小分数,其余类型保留较大分数
        keep_smaller = query.query_type == TopKType.MIN_K
        all_items: Dict[Any, float] = {}
        
        for element, score in list(local_topk) + list(remote_topk):
            if element not in all_items:
                all_items[element] = score
            elif keep_smaller:
                all_items[element] = min(all_items[element], score)
            else:
                all_items[element] = max(all_items[element], score)
        
        # 转换为列表
        items_list = list(all_items.items())
        
        # 根据查询类型排序
        if query.query_type == TopKType.MAX_K:
            items_list.sort(key=lambda x: x[1], reverse=True)
        elif query.query_type == TopKType.MIN_K:
            items_list.sort(key=lambda x: x[1])
        elif query.query_type == TopKType.FREQUENT_K:
            items_list.sort(key=lambda x: x[1], reverse=True)
        
        # 取前K个
        return items_list[:k]
    
    def _check_convergence(self, query: TopKQuery) -> Dict[str, Any]:
        """检查收敛情况"""
        if not self.gossip_nodes:
            return {"converged": False, "agreement": 0}
        
        # 收集所有节点的Top K
        all_topks = []
        for node in self.gossip_nodes.values():
            all_topks.append(set([elem for elem, _ in node.topk_cache]))
        
        # 计算一致性(Jaccard相似度)
        agreement_scores = []
        for i in range(len(all_topks)):
            for j in range(i + 1, len(all_topks)):
                set_i = all_topks[i]
                set_j = all_topks[j]
                
                if set_i and set_j:
                    intersection = len(set_i.intersection(set_j))
                    union = len(set_i.union(set_j))
                    similarity = intersection / union if union > 0 else 0
                    agreement_scores.append(similarity)
        
        avg_agreement = sum(agreement_scores) / len(agreement_scores) if agreement_scores else 0
        
        # 检查是否收敛(一致性超过阈值)
        converged = avg_agreement > 0.9
        
        return {
            "converged": converged,
            "agreement": avg_agreement,
            "min_agreement": min(agreement_scores) if agreement_scores else 0,
            "max_agreement": max(agreement_scores) if agreement_scores else 0
        }
    
    async def _collect_final_result(self, query: TopKQuery) -> List[Tuple[Any, float]]:
        """收集最终结果"""
        # 选择一个协调节点(例如节点0)
        coordinator = self.gossip_nodes[0]
        
        # 从协调节点获取Top K
        coordinator_topk = coordinator.topk_cache
        
        # 为了更准确,可以合并多个节点的结果
        sample_nodes = random.sample(
            list(self.gossip_nodes.keys()), 
            min(3, len(self.gossip_nodes))
        )
        
        keep_smaller = query.query_type == TopKType.MIN_K
        all_candidates: Dict[Any, float] = {}
        for node_id in sample_nodes:
            node = self.gossip_nodes[node_id]
            for element, score in node.topk_cache:
                if element not in all_candidates:
                    all_candidates[element] = score
                else:
                    # MIN_K保留较小分数,其余类型保留较大分数
                    all_candidates[element] = (min if keep_smaller else max)(all_candidates[element], score)
        
        # 转换为列表并排序
        items_list = list(all_candidates.items())
        
        if query.query_type == TopKType.MAX_K:
            items_list.sort(key=lambda x: x[1], reverse=True)
        elif query.query_type == TopKType.MIN_K:
            items_list.sort(key=lambda x: x[1])
        elif query.query_type == TopKType.FREQUENT_K:
            items_list.sort(key=lambda x: x[1], reverse=True)
        
        # 以查询要求的K为准(协调节点缓存仅作参考)
        k = query.k
        return items_list[:k]
    
    def get_complexity_analysis(self) -> Dict[str, Any]:
        """获取复杂度分析"""
        analysis = {
            "total_nodes": len(self.gossip_nodes),
            "gossip_rounds": self.stats["network_rounds"],
            "total_rumors": sum(node.rumor_count for node in self.gossip_nodes.values()),
            "convergence_history": self.convergence_history,
            "operations": self.stats["operations"]
        }
        
        # 理论复杂度
        analysis["time_complexity"] = "O(log(n) * n/m)"
        analysis["space_complexity"] = "O(k * log(n))"
        analysis["network_complexity"] = "O(n * k * log(n))"
        
        return analysis
    
    def get_network_cost(self) -> Dict[str, float]:
        """获取网络成本"""
        n_nodes = len(self.gossip_nodes)
        rounds = self.stats["network_rounds"]
        fanout = self.fanout
        
        # 每轮每个节点发送fanout个消息
        # 以batch_size近似每条消息携带的元素数,每个元素8字节
        k = self.config.batch_size
        element_size = 8
        
        messages_per_round = n_nodes * fanout
        bytes_per_message = k * element_size
        
        total_bytes = rounds * messages_per_round * bytes_per_message
        
        return {
            "total_mb": total_bytes / (1024 * 1024),
            "rounds": rounds,
            "messages_per_round": messages_per_round,
            "bytes_per_message_mb": bytes_per_message / (1024 * 1024)
        }

class PushPullGossipTopK(GossipTopK):
    """Push-Pull Gossip算法"""
    
    async def _gossip_round(self, node: GossipNode, query: TopKQuery, round_num: int):
        """Push-Pull Gossip轮次"""
        if not node.view:
            return
        
        neighbors = list(node.view.keys())
        selected_neighbors = random.sample(
            neighbors, 
            min(self.fanout, len(neighbors))
        )
        
        for neighbor_id in selected_neighbors:
            if neighbor_id in self.gossip_nodes:
                neighbor = self.gossip_nodes[neighbor_id]
                
                # Push阶段:发送本地Top K
                push_rumor_id = f"push_{node.node_id}_{neighbor_id}_{round_num}"
                
                if push_rumor_id not in neighbor.received_rumors:
                    push_rumor = {
                        "id": push_rumor_id,
                        "source": node.node_id,
                        "type": "push",
                        "topk": node.topk_cache,
                        "timestamp": time.time()
                    }
                    
                    await self._receive_rumor(neighbor, push_rumor, query)
                    neighbor.received_rumors.add(push_rumor_id)
                
                # Pull阶段:请求邻居的Top K
                pull_rumor_id = f"pull_{node.node_id}_{neighbor_id}_{round_num}"
                
                if pull_rumor_id not in node.received_rumors:
                    # 模拟邻居响应
                    pull_response = {
                        "id": f"response_{pull_rumor_id}",
                        "source": neighbor_id,
                        "type": "pull_response",
                        "topk": neighbor.topk_cache,
                        "timestamp": time.time()
                    }
                    
                    await self._receive_rumor(node, pull_response, query)
                    node.received_rumors.add(pull_rumor_id)
        
        node.rumor_count += len(selected_neighbors) * 2  # Push和Pull各算一次

class AdaptiveGossipTopK(GossipTopK):
    """自适应Gossip算法"""
    
    def __init__(self, config: DistributedTopKConfig):
        super().__init__(config)
        self.adaptive_fanout = True
        self.convergence_speed = 1.0
        self.last_convergence = 0
    
    def _build_neighbor_graph(self):
        """自适应构建邻居图"""
        node_ids = list(self.gossip_nodes.keys())
        
        # 根据节点数量自适应调整fanout
        if self.adaptive_fanout:
            n = len(node_ids)
            if n <= 10:
                self.fanout = 2
            elif n <= 50:
                self.fanout = 3
            elif n <= 100:
                self.fanout = 4
            else:
                self.fanout = 5
        
        # 使用小世界网络结构(部分随机,部分结构化)
        for i, node_id in enumerate(node_ids):
            node = self.gossip_nodes[node_id]
            
            # 添加结构化邻居(环状结构)
            left_neighbor = node_ids[(i - 1) % len(node_ids)]
            right_neighbor = node_ids[(i + 1) % len(node_ids)]
            
            node.view[left_neighbor] = self.gossip_nodes[left_neighbor]
            node.view[right_neighbor] = self.gossip_nodes[right_neighbor]
            
            # 添加随机邻居
            remaining = [nid for nid in node_ids 
                        if nid != node_id and nid != left_neighbor and nid != right_neighbor]
            
            random_count = max(0, self.fanout - 2)  # 减去结构化邻居
            if random_count > 0 and remaining:
                random_neighbors = random.sample(remaining, min(random_count, len(remaining)))
                for neighbor_id in random_neighbors:
                    node.view[neighbor_id] = self.gossip_nodes[neighbor_id]
    
    async def _gossip_round(self, node: GossipNode, query: TopKQuery, round_num: int):
        """自适应Gossip轮次"""
        if not node.view:
            return
        
        # 自适应选择邻居:优先选择Top K差异大的邻居
        neighbors = self._select_informative_neighbors(node, query)
        
        if not neighbors:
            neighbors = list(node.view.keys())
        
        selected_count = self._adaptive_fanout_size(round_num)
        selected_neighbors = random.sample(
            neighbors, 
            min(selected_count, len(neighbors))
        )
        
        for neighbor_id in selected_neighbors:
            if neighbor_id in self.gossip_nodes:
                neighbor = self.gossip_nodes[neighbor_id]
                
                # 计算要发送的信息量(根据差异程度)
                info_content = self._calculate_information_content(node, neighbor, query)
                
                rumor_id = f"{node.node_id}_{neighbor_id}_{round_num}"
                
                if rumor_id not in neighbor.received_rumors:
                    # 只发送有差异的部分
                    rumor_topk = self._select_different_items(node.topk_cache, neighbor.topk_cache, query)
                    
                    rumor = {
                        "id": rumor_id,
                        "source": node.node_id,
                        "round": round_num,
                        "topk": rumor_topk,
                        "info_content": info_content,
                        "timestamp": time.time()
                    }
                    
                    await self._receive_rumor(neighbor, rumor, query)
                    neighbor.received_rumors.add(rumor_id)
        
        node.rumor_count += len(selected_neighbors)
    
    def _select_informative_neighbors(self, node: GossipNode, query: TopKQuery) -> List[int]:
        """选择信息量大的邻居"""
        neighbors = list(node.view.keys())
        
        if len(neighbors) <= 3:
            return neighbors
        
        # 计算与每个邻居的Top K差异
        differences = []
        for neighbor_id in neighbors:
            neighbor = self.gossip_nodes[neighbor_id]
            
            # 计算Jaccard差异
            node_set = set([elem for elem, _ in node.topk_cache])
            neighbor_set = set([elem for elem, _ in neighbor.topk_cache])
            
            if node_set and neighbor_set:
                intersection = len(node_set.intersection(neighbor_set))
                union = len(node_set.union(neighbor_set))
                similarity = intersection / union if union > 0 else 0
                difference = 1 - similarity
            else:
                difference = 1.0
            
            differences.append((neighbor_id, difference))
        
        # 按差异从大到小排序
        differences.sort(key=lambda x: x[1], reverse=True)
        
        # 选择差异最大的几个
        informative_count = min(len(differences), self.fanout * 2)
        return [nid for nid, _ in differences[:informative_count]]
    
    def _adaptive_fanout_size(self, round_num: int) -> int:
        """自适应调整fanout大小"""
        base_fanout = self.fanout
        
        # 根据收敛速度调整
        if len(self.convergence_history) >= 2:
            recent = self.convergence_history[-2:]
            conv_diff = recent[1]["agreement"] - recent[0]["agreement"] if len(recent) == 2 else 0
            
            if conv_diff < 0.01:  # 收敛慢
                # 增加fanout加速传播
                adjusted = min(base_fanout * 2, 10)
            elif conv_diff > 0.05:  # 收敛快
                # 减少fanout节省带宽
                adjusted = max(base_fanout // 2, 2)
            else:
                adjusted = base_fanout
        else:
            adjusted = base_fanout
        
        # 后期减少fanout
        if round_num > 15:
            adjusted = max(adjusted // 2, 2)
        
        return adjusted
    
    def _calculate_information_content(
        self, 
        node: GossipNode, 
        neighbor: GossipNode, 
        query: TopKQuery
    ) -> float:
        """计算信息含量"""
        node_set = set([elem for elem, _ in node.topk_cache])
        neighbor_set = set([elem for elem, _ in neighbor.topk_cache])
        
        # 计算节点有而邻居没有的元素
        unique_to_node = node_set - neighbor_set
        unique_to_neighbor = neighbor_set - node_set
        
        # 信息含量 = 独特元素的数量
        info_content = len(unique_to_node) + len(unique_to_neighbor)
        
        return info_content / (len(node.topk_cache) + len(neighbor.topk_cache)) if (node.topk_cache or neighbor.topk_cache) else 0
    
    def _select_different_items(
        self, 
        local_topk: List[Tuple[Any, float]], 
        remote_topk: List[Tuple[Any, float]], 
        query: TopKQuery
    ) -> List[Tuple[Any, float]]:
        """选择有差异的项目"""
        local_dict = {elem: score for elem, score in local_topk}
        remote_dict = {elem: score for elem, score in remote_topk}
        
        # 找出本地有而远程没有,或者分数差异大的项目
        different_items = []
        
        for elem, local_score in local_dict.items():
            if elem not in remote_dict:
                # 远程没有,完全不同的项目
                different_items.append((elem, local_score))
            else:
                remote_score = remote_dict[elem]
                score_diff = abs(local_score - remote_score) / max(abs(local_score), abs(remote_score), 1e-10)
                
                if score_diff > 0.1:  # 分数差异超过10%
                    different_items.append((elem, local_score))
        
        # 限制数量
        max_items = min(len(local_topk) // 2, 10)
        return different_items[:max_items]
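
上面的 _select_informative_neighbors 与 _calculate_information_content 都以 Top K 元素集合的 Jaccard 差异来衡量"信息量":差异越大的邻居越值得优先交换数据,差异接近 0 的邻居基本不必再发送,这也是自适应 Gossip 节省带宽的来源。下面是一个独立于上述类的最小示例(数据均为假设值),单独演示按 Jaccard 差异对邻居排序的思路:

def jaccard_difference(topk_a, topk_b):
    """计算两个Top K列表(元素, 分数)的Jaccard差异(1 - 相似度)"""
    set_a = {elem for elem, _ in topk_a}
    set_b = {elem for elem, _ in topk_b}
    union = set_a | set_b
    if not union:
        return 1.0
    return 1 - len(set_a & set_b) / len(union)

# 假设的本地Top K与三个邻居的Top K
local = [("a", 9.0), ("b", 8.5), ("c", 7.2)]
neighbors = {
    1: [("a", 9.1), ("b", 8.4), ("c", 7.0)],  # 与本地几乎一致,差异为0
    2: [("a", 9.0), ("d", 6.8), ("e", 6.5)],  # 部分重叠,差异0.8
    3: [("x", 5.0), ("y", 4.8), ("z", 4.1)],  # 完全不同,差异1.0
}

# 差异从大到小排序,差异大的邻居优先参与本轮Gossip交换
ranked = sorted(neighbors, key=lambda nid: jaccard_difference(local, neighbors[nid]), reverse=True)
print(ranked)  # 输出: [3, 2, 1]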

三、性能评估与比较

python

"""
分布式Top K算法性能评估
"""
import time
import asyncio
from typing import List, Dict, Any, Tuple
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from dataclasses import dataclass
import json
import numpy as np

@dataclass
class BenchmarkConfig:
    """基准测试配置"""
    data_size: int                    # 数据总量
    num_nodes: int                   # 节点数量
    k_values: List[int]              # 测试的K值
    data_distributions: List[str]    # 数据分布类型
    algorithms: List[str]            # 测试的算法
    network_latency_ms: float = 10   # 网络延迟(ms)
    network_bandwidth_mbps: float = 100  # 网络带宽(Mbps)
    repeat_times: int = 3            # 重复次数

class TopKBenchmark:
    """Top K算法基准测试"""
    
    def __init__(self, config: BenchmarkConfig):
        self.config = config
        self.results = []
        self.summary_stats = {}
        
    async def run_benchmark(self):
        """运行基准测试"""
        print("="*80)
        print("开始分布式Top K算法基准测试")
        print("="*80)
        
        for distribution in self.config.data_distributions:
            print(f"\n测试数据分布: {distribution}")
            print("-"*40)
            
            for k in self.config.k_values:
                print(f"\n测试K值: {k}")
                
                # 生成测试数据
                data_streams = self._generate_test_data(distribution, k)
                
                for algorithm_name in self.config.algorithms:
                    print(f"\n测试算法: {algorithm_name}", end="", flush=True)
                    
                    # 重复测试取平均
                    algorithm_times = []
                    algorithm_accuracies = []
                    
                    for repeat in range(self.config.repeat_times):
                        print(".", end="", flush=True)
                        
                        # 创建算法实例
                        algorithm = self._create_algorithm(algorithm_name)
                        
                        # 创建查询
                        query = TopKQuery(
                            k=k,
                            query_type=TopKType.MAX_K
                        )
                        
                        # 运行测试
                        start_time = time.time()
                        
                        try:
                            result = await algorithm.compute_top_k(query, data_streams)
                            elapsed = time.time() - start_time
                            
                            # 计算准确性(与真实Top K比较)
                            accuracy = self._calculate_accuracy(result, data_streams, k)
                            
                            algorithm_times.append(elapsed)
                            algorithm_accuracies.append(accuracy)
                            
                            # 记录结果
                            self._record_result(
                                distribution, k, algorithm_name, repeat,
                                elapsed, accuracy, algorithm
                            )
                            
                        except Exception as e:
                            print(f"算法 {algorithm_name} 测试失败: {e}")
                            algorithm_times.append(float('inf'))
                            algorithm_accuracies.append(0.0)
                    
                    # 计算统计
                    avg_time = np.mean(algorithm_times)
                    avg_accuracy = np.mean(algorithm_accuracies)
                    
                    print(f" 平均时间: {avg_time:.3f}s, 准确率: {avg_accuracy:.2%}")
        
        print("\n" + "="*80)
        print("基准测试完成")
        print("="*80)
        
        # 生成报告
        self._generate_report()
    
    def _generate_test_data(self, distribution: str, k: int) -> List[List[Tuple[int, float]]]:
        """生成测试数据"""
        import random
        
        data_per_node = self.config.data_size // self.config.num_nodes
        data_streams = []
        
        for node_id in range(self.config.num_nodes):
            node_data = []
            
            if distribution == "uniform":
                # 均匀分布
                for i in range(data_per_node):
                    element = node_id * data_per_node + i
                    score = random.uniform(0, 1000)
                    node_data.append((element, score))
            
            elif distribution == "normal":
                # 正态分布
                for i in range(data_per_node):
                    element = node_id * data_per_node + i
                    score = random.normalvariate(500, 150)
                    node_data.append((element, score))
            
            elif distribution == "zipf":
                # Zipf分布(重尾分布)
                a = 1.5  # 分布参数
                for i in range(data_per_node):
                    element = node_id * data_per_node + i
                    score = np.random.zipf(a) * 10
                    node_data.append((element, score))
            
            elif distribution == "skewed":
                # 偏斜分布:80%的数据在低分区,20%在高分区
                for i in range(data_per_node):
                    element = node_id * data_per_node + i
                    if random.random() < 0.8:
                        score = random.uniform(0, 200)
                    else:
                        score = random.uniform(200, 1000)
                    node_data.append((element, score))
            
            data_streams.append(node_data)
        
        return data_streams
    
    def _create_algorithm(self, algorithm_name: str) -> DistributedTopKAlgorithm:
        """创建算法实例"""
        config = DistributedTopKConfig(
            num_nodes=self.config.num_nodes,
            data_sharding="hash",
            replication_factor=1,
            consistency_level="strong",
            algorithm=algorithm_name
        )
        
        if algorithm_name == "threshold_based":
            return ThresholdBasedTopK(config)
        elif algorithm_name == "optimized_threshold":
            return OptimizedThresholdTopK(config)
        elif algorithm_name == "tree_aggregation":
            return TreeAggregationTopK(config)
        elif algorithm_name == "optimized_tree":
            return OptimizedTreeTopK(config)
        elif algorithm_name == "map_reduce":
            return MapReduceTopK(config)
        elif algorithm_name == "optimized_map_reduce":
            return OptimizedMapReduceTopK(config)
        elif algorithm_name == "quantile_based":
            return QuantileBasedTopK(config)
        elif algorithm_name == "adaptive_quantile":
            return AdaptiveQuantileTopK(config)
        elif algorithm_name == "gossip_based":
            return GossipTopK(config)
        elif algorithm_name == "push_pull_gossip":
            return PushPullGossipTopK(config)
        elif algorithm_name == "adaptive_gossip":
            return AdaptiveGossipTopK(config)
        else:
            raise ValueError(f"未知算法: {algorithm_name}")
    
    def _calculate_accuracy(
        self, 
        result: List[Tuple[Any, float]], 
        data_streams: List[List[Tuple[int, float]]], 
        k: int
    ) -> float:
        """计算算法准确性"""
        # 计算真实的Top K(集中式计算)
        all_data = []
        for node_data in data_streams:
            all_data.extend(node_data)
        
        # 按分数排序
        all_data.sort(key=lambda x: x[1], reverse=True)
        true_topk = set([elem for elem, _ in all_data[:k]])
        
        # 算法结果的Top K
        algorithm_topk = set([elem for elem, _ in result[:k]])
        
        # 计算Jaccard相似度
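        # 例如(假设数据):真实Top 3为{1, 2, 3},算法结果为{1, 2, 7},
        # 则交集大小为2、并集大小为4,准确率 = 2 / 4 = 0.5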
        if true_topk and algorithm_topk:
            intersection = len(true_topk.intersection(algorithm_topk))
            union = len(true_topk.union(algorithm_topk))
            accuracy = intersection / union if union > 0 else 0
        else:
            accuracy = 0.0
        
        return accuracy
    
    def _record_result(
        self, 
        distribution: str, 
        k: int, 
        algorithm: str, 
        repeat: int,
        time_elapsed: float, 
        accuracy: float,
        algorithm_instance: DistributedTopKAlgorithm
    ):
        """记录测试结果"""
        # 获取算法统计信息
        try:
            complexity = algorithm_instance.get_complexity_analysis()
            network_cost = algorithm_instance.get_network_cost()
        except Exception:
            complexity = {}
            network_cost = {}
        
        result = {
            "distribution": distribution,
            "k_value": k,
            "algorithm": algorithm,
            "repeat": repeat,
            "time_elapsed": time_elapsed,
            "accuracy": accuracy,
            "complexity": complexity,
            "network_cost": network_cost,
            "timestamp": time.time()
        }
        
        self.results.append(result)
    
    def _generate_report(self):
        """生成测试报告"""
        print("\n" + "="*80)
        print("生成测试报告")
        print("="*80)
        
        # 转换为DataFrame
        df = pd.DataFrame(self.results)
        
        # 汇总统计
        summary = df.groupby(['distribution', 'k_value', 'algorithm']).agg({
            'time_elapsed': ['mean', 'std', 'min', 'max'],
            'accuracy': ['mean', 'std', 'min', 'max']
        }).round(4)
        
        print("\n性能汇总:")
        print(summary)
        
        # 保存结果
        self._save_results(df, summary)
        
        # 生成可视化图表
        self._generate_visualizations(df)
    
    def _save_results(self, df: pd.DataFrame, summary: pd.DataFrame):
        """保存测试结果"""
        import os
        import json
        
        timestamp = time.strftime("%Y%m%d_%H%M%S")
        output_dir = f"benchmark_results_{timestamp}"
        os.makedirs(output_dir, exist_ok=True)
        self.output_dir = output_dir  # 记录结果目录,供后续生成图表时复用
        
        # 保存原始数据
        df.to_csv(f"{output_dir}/raw_results.csv", index=False)
        
        # 保存汇总统计
        summary.to_csv(f"{output_dir}/summary.csv")
        
        # 保存详细结果
        detailed_results = []
        for result in self.results:
            detailed = {
                "distribution": result["distribution"],
                "k_value": result["k_value"],
                "algorithm": result["algorithm"],
                "time_elapsed": result["time_elapsed"],
                "accuracy": result["accuracy"],
                "complexity": result.get("complexity", {}),
                "network_cost": result.get("network_cost", {})
            }
            detailed_results.append(detailed)
        
        with open(f"{output_dir}/detailed_results.json", 'w') as f:
            json.dump(detailed_results, f, indent=2, default=str)
        
        print(f"\n结果已保存到目录: {output_dir}")
    
    def _generate_visualizations(self, df: pd.DataFrame):
        """生成可视化图表"""
        import matplotlib.pyplot as plt
        import seaborn as sns
        
        # 复用_save_results创建的结果目录,避免重新生成时间戳导致目录不一致
        output_dir = self.output_dir
        
        # 设置样式
        sns.set_style("whitegrid")
        plt.figure(figsize=(15, 10))
        
        # 图表1: 不同算法的执行时间对比
        plt.subplot(2, 2, 1)
        time_data = df.groupby(['algorithm', 'k_value'])['time_elapsed'].mean().unstack()
        time_data.plot(kind='bar', ax=plt.gca())
        plt.title('不同算法的执行时间对比')
        plt.xlabel('算法')
        plt.ylabel('平均执行时间(s)')
        plt.legend(title='K值')
        plt.xticks(rotation=45)
        
        # 图表2: 不同算法的准确率对比
        plt.subplot(2, 2, 2)
        accuracy_data = df.groupby(['algorithm', 'k_value'])['accuracy'].mean().unstack()
        accuracy_data.plot(kind='bar', ax=plt.gca())
        plt.title('不同算法的准确率对比')
        plt.xlabel('算法')
        plt.ylabel('平均准确率')
        plt.legend(title='K值')
        plt.xticks(rotation=45)
        
        # 图表3: 不同数据分布下的性能
        plt.subplot(2, 2, 3)
        dist_data = df.groupby(['distribution', 'algorithm'])['time_elapsed'].mean().unstack()
        dist_data.plot(kind='bar', ax=plt.gca())
        plt.title('不同数据分布下的执行时间')
        plt.xlabel('数据分布')
        plt.ylabel('平均执行时间(s)')
        plt.legend(title='算法', bbox_to_anchor=(1.05, 1), loc='upper left')
        plt.xticks(rotation=45)
        
        # 图表4: 时间-准确率散点图
        plt.subplot(2, 2, 4)
        scatter_data = df.groupby(['algorithm']).agg({
            'time_elapsed': 'mean',
            'accuracy': 'mean'
        }).reset_index()
        
        for _, row in scatter_data.iterrows():
            plt.scatter(row['time_elapsed'], row['accuracy'], label=row['algorithm'], s=100)
            plt.annotate(row['algorithm'], 
                        (row['time_elapsed'], row['accuracy']),
                        xytext=(5, 5), textcoords='offset points')
        
        plt.title('算法性能散点图(时间 vs 准确率)')
        plt.xlabel('平均执行时间(s)')
        plt.ylabel('平均准确率')
        plt.grid(True, alpha=0.3)
        
        plt.tight_layout()
        plt.savefig(f"{output_dir}/performance_comparison.png", dpi=300, bbox_inches='tight')
        plt.show()
        
        # 生成算法推荐矩阵
        self._generate_recommendation_matrix(df, output_dir)
    
    def _generate_recommendation_matrix(self, df: pd.DataFrame, output_dir: str):
        """生成算法推荐矩阵"""
        # 根据不同的场景推荐算法
        scenarios = [
            {
                "name": "小规模集群 (<= 10节点)",
                "conditions": lambda row: self.config.num_nodes <= 10,
                "weight": {"time": 0.4, "accuracy": 0.6}
            },
            {
                "name": "中规模集群 (11-50节点)",
                "conditions": lambda row: 11 <= self.config.num_nodes <= 50,
                "weight": {"time": 0.5, "accuracy": 0.5}
            },
            {
                "name": "大规模集群 (> 50节点)",
                "conditions": lambda row: self.config.num_nodes > 50,
                "weight": {"time": 0.6, "accuracy": 0.4}
            },
            {
                "name": "高精度要求",
                "conditions": lambda row: True,
                "weight": {"time": 0.3, "accuracy": 0.7}
            },
            {
                "name": "低延迟要求",
                "conditions": lambda row: True,
                "weight": {"time": 0.7, "accuracy": 0.3}
            }
        ]
        
        recommendations = []
        
        for scenario in scenarios:
            # 筛选符合条件的数据
            scenario_data = df[df.apply(scenario["conditions"], axis=1)]
            
            if scenario_data.empty:
                continue
            
            # 计算每个算法的得分
            algorithm_scores = {}
            
            for algorithm in scenario_data['algorithm'].unique():
                algo_data = scenario_data[scenario_data['algorithm'] == algorithm]
                
                # 标准化时间(越小越好)
                time_mean = algo_data['time_elapsed'].mean()
                time_std = algo_data['time_elapsed'].std()
                time_norm = 1 - (time_mean / scenario_data['time_elapsed'].max()) if scenario_data['time_elapsed'].max() > 0 else 1
                
                # 准确率(越大越好)
                accuracy_mean = algo_data['accuracy'].mean()
                
                # 计算综合得分
                score = (
                    scenario["weight"]["time"] * time_norm +
                    scenario["weight"]["accuracy"] * accuracy_mean
                )
                
                algorithm_scores[algorithm] = {
                    "score": score,
                    "time": time_mean,
                    "accuracy": accuracy_mean
                }
            
            # 排序并推荐
            sorted_algorithms = sorted(algorithm_scores.items(), key=lambda x: x[1]["score"], reverse=True)
            
            if sorted_algorithms:
                recommendations.append({
                    "scenario": scenario["name"],
                    "recommended_algorithm": sorted_algorithms[0][0],
                    "recommendation_score": sorted_algorithms[0][1]["score"],
                    "all_algorithms": sorted_algorithms
                })
        
        # 保存推荐矩阵
        import json
        with open(f"{output_dir}/recommendation_matrix.json", 'w') as f:
            json.dump(recommendations, f, indent=2, default=str)
        
        # 打印推荐
        print("\n" + "="*80)
        print("算法推荐矩阵")
        print("="*80)
        
        for rec in recommendations:
            print(f"\n场景: {rec['scenario']}")
            print(f"推荐算法: {rec['recommended_algorithm']} (得分: {rec['recommendation_score']:.3f})")
            print("所有算法排名:")
            for i, (algo, scores) in enumerate(rec['all_algorithms'][:5], 1):
                print(f"  {i}. {algo}: 得分={scores['score']:.3f}, "
                      f"时间={scores['time']:.3f}s, 准确率={scores['accuracy']:.2%}")

class RealTimeMonitor:
    """实时监控系统"""
    
    def __init__(self):
        self.metrics_history = []
        self.alert_thresholds = {
            "time_exceeded": 30.0,  # 执行时间超过30秒
            "accuracy_below": 0.8,  # 准确率低于80%
            "network_overload": 100.0  # 网络传输超过100MB
        }
        
    async def monitor_algorithm(self, algorithm: DistributedTopKAlgorithm, query: TopKQuery):
        """监控算法执行"""
        start_time = time.time()
        
        # 监控指标
        metrics = {
            "start_time": start_time,
            "query": str(query),
            "algorithm": algorithm.__class__.__name__,
            "phase": "initializing"
        }
        
        self.metrics_history.append(metrics)
        
        # 模拟执行过程监控
        try:
            # 这里可以添加更详细的监控点
            metrics["phase"] = "processing"
            
            # 执行算法
            result = await algorithm.compute_top_k(query, [])
            
            # 记录完成指标
            end_time = time.time()
            metrics.update({
                "end_time": end_time,
                "total_time": end_time - start_time,
                "result_size": len(result),
                "phase": "completed",
                "success": True
            })
            
            # 检查警报
            await self._check_alerts(metrics, algorithm)
            
            return result
            
        except Exception as e:
            end_time = time.time()
            metrics.update({
                "end_time": end_time,
                "total_time": end_time - start_time,
                "error": str(e),
                "phase": "failed",
                "success": False
            })
            
            # 发送错误警报
            await self._send_alert(f"算法执行失败: {e}", "critical")
            
            raise
    
    async def _check_alerts(self, metrics: Dict[str, Any], algorithm: DistributedTopKAlgorithm):
        """检查警报条件"""
        # 检查执行时间
        if metrics["total_time"] > self.alert_thresholds["time_exceeded"]:
            await self._send_alert(
                f"算法执行时间过长: {metrics['total_time']:.2f}s",
                "warning"
            )
        
        # 检查网络成本
        try:
            network_cost = algorithm.get_network_cost()
            if network_cost.get("total_mb", 0) > self.alert_thresholds["network_overload"]:
                await self._send_alert(
                    f"网络传输量过大: {network_cost['total_mb']:.2f}MB",
                    "warning"
                )
        except Exception:
            pass
    
    async def _send_alert(self, message: str, severity: str):
        """发送警报"""
        alert = {
            "timestamp": time.time(),
            "message": message,
            "severity": severity
        }
        
        print(f"[ALERT {severity.upper()}] {message}")
        
        # 这里可以集成到实际的警报系统(如邮件、Slack等)
    
    def get_performance_report(self) -> Dict[str, Any]:
        """获取性能报告"""
        if not self.metrics_history:
            return {}
        
        successful_runs = [m for m in self.metrics_history if m.get("success", False)]
        
        if successful_runs:
            avg_time = np.mean([m["total_time"] for m in successful_runs])
            max_time = np.max([m["total_time"] for m in successful_runs])
            min_time = np.min([m["total_time"] for m in successful_runs])
        else:
            avg_time = max_time = min_time = 0
        
        report = {
            "total_runs": len(self.metrics_history),
            "successful_runs": len(successful_runs),
            "failed_runs": len(self.metrics_history) - len(successful_runs),
            "time_stats": {
                "average": avg_time,
                "maximum": max_time,
                "minimum": min_time
            },
            "recent_runs": self.metrics_history[-10:]  # 最近10次运行
        }
        
        return report

# 主测试程序
async def main():
    """主测试程序"""
    # 配置基准测试
    config = BenchmarkConfig(
        data_size=100000,      # 10万条数据
        num_nodes=10,          # 10个节点
        k_values=[10, 100, 1000],  # 测试不同的K值
        data_distributions=["uniform", "normal", "skewed"],  # 测试不同分布
        algorithms=[
            "threshold_based",
            "tree_aggregation", 
            "map_reduce",
            "quantile_based",
            "gossip_based"
        ],
        repeat_times=2         # 每个测试重复2次
    )
    
    # 运行基准测试
    benchmark = TopKBenchmark(config)
    await benchmark.run_benchmark()
    
    # 演示实时监控
    print("\n" + "="*80)
    print("演示实时监控系统")
    print("="*80)
    
    monitor = RealTimeMonitor()
    
    # 测试一个算法
    test_config = DistributedTopKConfig(
        num_nodes=5,
        data_sharding="hash",
        replication_factor=1,
        consistency_level="strong",
        algorithm="threshold_based"
    )
    
    algorithm = ThresholdBasedTopK(test_config)
    query = TopKQuery(k=100, query_type=TopKType.MAX_K)
    
    try:
        print("\n执行算法监控测试...")
        await monitor.monitor_algorithm(algorithm, query)
        
        # 获取监控报告
        report = monitor.get_performance_report()
        print(f"\n监控报告: {report['total_runs']} 次运行, "
              f"{report['successful_runs']} 次成功, "
              f"{report['failed_runs']} 次失败")
        
    except Exception as e:
        print(f"监控测试失败: {e}")
    
    print("\n" + "="*80)
    print("测试完成")
    print("="*80)

if __name__ == "__main__":
    asyncio.run(main())
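
基准测试结束后,_save_results 会把原始结果写入带时间戳的结果目录中的 raw_results.csv。下面是一个读取并汇总该文件的简单示例,其中结果目录名只是假设值,使用时需替换为实际生成的目录:

import pandas as pd

# 结果目录名为假设值,实际目录名形如 benchmark_results_<时间戳>
result_dir = "benchmark_results_20240101_120000"

df = pd.read_csv(f"{result_dir}/raw_results.csv")

# 按算法与K值汇总平均耗时和平均准确率
pivot = df.pivot_table(
    index="algorithm",
    columns="k_value",
    values=["time_elapsed", "accuracy"],
    aggfunc="mean",
).round(4)
print(pivot)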


四、生产环境部署与优化

python

"""
生产环境部署与优化
"""
import yaml
import docker
import kubernetes
from typing import Dict, Any, List, Optional
import asyncio
from dataclasses import dataclass, field
import logging
from prometheus_client import start_http_server, Counter, Histogram, Gauge
import psutil
import time
import random  # _evaluate_optimization_effect中的效果模拟需要

@dataclass
class ProductionDeploymentConfig:
    """生产环境部署配置"""
    cluster_type: str  # docker, kubernetes, aws, gcp, azure
    node_count: int
    resource_limits: Dict[str, str]
    auto_scaling: bool
    monitoring_enabled: bool
    backup_enabled: bool
    security_enabled: bool
    
    # 网络配置
    network_policy: Optional[Dict[str, Any]] = None
    load_balancer: Optional[Dict[str, Any]] = None
    
    # 存储配置
    storage_class: str = "standard"
    storage_size_gb: int = 100
    
    # 监控配置
    prometheus_endpoint: Optional[str] = None
    grafana_dashboard: Optional[str] = None
    
    def validate(self) -> bool:
        """验证配置"""
        if self.node_count <= 0:
            raise ValueError("节点数量必须大于0")
        
        if self.cluster_type not in ["docker", "kubernetes", "aws", "gcp", "azure"]:
            raise ValueError(f"不支持的集群类型: {self.cluster_type}")
        
        return True

class ProductionTopKDeployer:
    """生产环境Top K部署器"""
    
    def __init__(self, config: ProductionDeploymentConfig):
        self.config = config
        self.logger = logging.getLogger(__name__)
        self.deployment_status = {}
        
        # 监控指标
        self.metrics = {
            "deployment_success": Counter("deployment_success_total", "成功部署次数"),
            "deployment_failure": Counter("deployment_failure_total", "部署失败次数", ["error_type"]),
            "deployment_duration": Histogram("deployment_duration_seconds", "部署耗时"),
            "node_health": Gauge("node_health", "节点健康状态", ["node_id"]),
            "cluster_load": Gauge("cluster_load_percent", "集群负载百分比")
        }
    
    async def deploy(self, algorithm_config: Dict[str, Any]) -> bool:
        """部署Top K服务"""
        self.logger.info(f"开始部署Top K服务,集群类型: {self.config.cluster_type}")
        
        start_time = time.time()
        
        try:
            # 根据集群类型选择部署方式
            if self.config.cluster_type == "docker":
                success = await self._deploy_docker(algorithm_config)
            elif self.config.cluster_type == "kubernetes":
                success = await self._deploy_kubernetes(algorithm_config)
            elif self.config.cluster_type in ["aws", "gcp", "azure"]:
                success = await self._deploy_cloud(algorithm_config)
            else:
                raise ValueError(f"不支持的部署类型: {self.config.cluster_type}")
            
            # 记录部署指标
            duration = time.time() - start_time
            self.metrics["deployment_duration"].observe(duration)
            
            if success:
                self.metrics["deployment_success"].inc()
                self.logger.info(f"部署成功,耗时: {duration:.2f}秒")
            else:
                self.metrics["deployment_failure"].labels(error_type="unknown").inc()
            
            return success
            
        except Exception as e:
            duration = time.time() - start_time
            self.metrics["deployment_duration"].observe(duration)
            self.metrics["deployment_failure"].labels(error_type=type(e).__name__).inc()
            
            self.logger.error(f"部署失败: {e}")
            return False
    
    async def _deploy_docker(self, algorithm_config: Dict[str, Any]) -> bool:
        """使用Docker部署"""
        self.logger.info("使用Docker Compose部署")
        
        try:
            # 生成docker-compose.yml
            compose_config = self._generate_docker_compose_config(algorithm_config)
            
            # 写入文件
            with open("docker-compose.yml", "w") as f:
                yaml.dump(compose_config, f, default_flow_style=False)
            
            self.logger.info("已生成docker-compose.yml")
            
            # 启动服务(这里需要实际的Docker客户端)
            # 在实际部署中,会调用docker-compose up命令
            
            # 检查服务状态
            await self._check_service_health()
            
            return True
            
        except Exception as e:
            self.logger.error(f"Docker部署失败: {e}")
            return False
    
    def _generate_docker_compose_config(self, algorithm_config: Dict[str, Any]) -> Dict[str, Any]:
        """生成Docker Compose配置"""
        config = {
            "version": "3.8",
            "services": {},
            "networks": {
                "topk-network": {
                    "driver": "bridge"
                }
            },
            "volumes": {
                "data-volume": None
            }
        }
        
        # 添加算法服务
        for i in range(self.config.node_count):
            service_name = f"topk-node-{i}"
            
            config["services"][service_name] = {
                "image": "topk-algorithm:latest",
                "container_name": service_name,
                "environment": {
                    "NODE_ID": str(i),
                    "TOTAL_NODES": str(self.config.node_count),
                    "ALGORITHM_TYPE": algorithm_config.get("type", "threshold_based"),
                    "K_VALUE": str(algorithm_config.get("k", 100)),
                    "LOG_LEVEL": "INFO"
                },
                "ports": [f"{8080 + i}:8080"],
                "networks": ["topk-network"],
                "volumes": ["data-volume:/app/data"],
                "depends_on": ["coordinator"],
                "healthcheck": {
                    "test": ["CMD", "curl", "-f", f"http://localhost:{8080 + i}/health"],
                    "interval": "30s",
                    "timeout": "10s",
                    "retries": 3
                },
                "deploy": {
                    "resources": {
                        "limits": self.config.resource_limits
                    }
                }
            }
        
        # 添加协调节点
        config["services"]["coordinator"] = {
            "image": "topk-coordinator:latest",
            "container_name": "coordinator",
            "environment": {
                "TOTAL_NODES": str(self.config.node_count),
                "ALGORITHM_TYPE": algorithm_config.get("type", "threshold_based")
            },
            "ports": ["8080:8080"],
            "networks": ["topk-network"],
            "volumes": ["data-volume:/app/data"],
            "healthcheck": {
                "test": ["CMD", "curl", "-f", "http://localhost:8080/health"],
                "interval": "30s",
                "timeout": "10s",
                "retries": 3
            }
        }
        
        # 添加监控服务
        if self.config.monitoring_enabled:
            config["services"]["prometheus"] = {
                "image": "prom/prometheus:latest",
                "container_name": "prometheus",
                "ports": ["9090:9090"],
                "volumes": ["./prometheus.yml:/etc/prometheus/prometheus.yml"],
                "networks": ["topk-network"]
            }
            
            config["services"]["grafana"] = {
                "image": "grafana/grafana:latest",
                "container_name": "grafana",
                "ports": ["3000:3000"],
                "environment": {
                    "GF_SECURITY_ADMIN_PASSWORD": "admin"
                },
                "networks": ["topk-network"],
                "depends_on": ["prometheus"]
            }
        
        return config
    
    async def _deploy_kubernetes(self, algorithm_config: Dict[str, Any]) -> bool:
        """使用Kubernetes部署"""
        self.logger.info("使用Kubernetes部署")
        
        try:
            # 生成Kubernetes配置
            k8s_configs = self._generate_kubernetes_configs(algorithm_config)
            
            # 写入文件
            for name, config in k8s_configs.items():
                with open(f"k8s-{name}.yaml", "w") as f:
                    yaml.dump(config, f, default_flow_style=False)
            
            self.logger.info("已生成Kubernetes配置文件")
            
            # 在实际部署中,会调用kubectl apply命令
            
            # 检查部署状态
            await self._check_kubernetes_deployment()
            
            return True
            
        except Exception as e:
            self.logger.error(f"Kubernetes部署失败: {e}")
            return False
    
    def _generate_kubernetes_configs(self, algorithm_config: Dict[str, Any]) -> Dict[str, Any]:
        """生成Kubernetes配置文件"""
        configs = {}
        
        # 1. Namespace
        configs["namespace"] = {
            "apiVersion": "v1",
            "kind": "Namespace",
            "metadata": {
                "name": "topk-system",
                "labels": {
                    "name": "topk-system"
                }
            }
        }
        
        # 2. ConfigMap
        configs["configmap"] = {
            "apiVersion": "v1",
            "kind": "ConfigMap",
            "metadata": {
                "name": "topk-config",
                "namespace": "topk-system"
            },
            "data": {
                "algorithm.config": yaml.dump(algorithm_config),
                "node.count": str(self.config.node_count)
            }
        }
        
        # 3. Deployment
        deployment = {
            "apiVersion": "apps/v1",
            "kind": "Deployment",
            "metadata": {
                "name": "topk-nodes",
                "namespace": "topk-system",
                "labels": {
                    "app": "topk-node"
                }
            },
            "spec": {
                "replicas": self.config.node_count,
                "selector": {
                    "matchLabels": {
                        "app": "topk-node"
                    }
                },
                "template": {
                    "metadata": {
                        "labels": {
                            "app": "topk-node"
                        }
                    },
                    "spec": {
                        "containers": [{
                            "name": "topk-node",
                            "image": "topk-algorithm:latest",
                            "imagePullPolicy": "IfNotPresent",
                            "ports": [{
                                "containerPort": 8080,
                                "name": "http"
                            }],
                            "env": [
                                {
                                    "name": "NODE_ID",
                                    "valueFrom": {
                                        "fieldRef": {
                                            "fieldPath": "metadata.name"
                                        }
                                    }
                                },
                                {
                                    "name": "TOTAL_NODES",
                                    "value": str(self.config.node_count)
                                },
                                {
                                    "name": "ALGORITHM_CONFIG",
                                    "valueFrom": {
                                        "configMapKeyRef": {
                                            "name": "topk-config",
                                            "key": "algorithm.config"
                                        }
                                    }
                                }
                            ],
                            "resources": {
                                "limits": self._convert_resource_limits(),
                                "requests": {
                                    "cpu": "100m",
                                    "memory": "256Mi"
                                }
                            },
                            "livenessProbe": {
                                "httpGet": {
                                    "path": "/health",
                                    "port": 8080
                                },
                                "initialDelaySeconds": 30,
                                "periodSeconds": 10
                            },
                            "readinessProbe": {
                                "httpGet": {
                                    "path": "/health",
                                    "port": 8080
                                },
                                "initialDelaySeconds": 5,
                                "periodSeconds": 5
                            }
                        }]
                    }
                }
            }
        }
        
        # 资源limits已在上面的容器spec中设置;自动扩缩容由下方单独的HorizontalPodAutoscaler配置处理
        
        configs["deployment"] = deployment
        
        # 4. Service
        configs["service"] = {
            "apiVersion": "v1",
            "kind": "Service",
            "metadata": {
                "name": "topk-service",
                "namespace": "topk-system"
            },
            "spec": {
                "selector": {
                    "app": "topk-node"
                },
                "ports": [{
                    "protocol": "TCP",
                    "port": 8080,
                    "targetPort": 8080
                }],
                "type": "LoadBalancer" if self.config.load_balancer else "ClusterIP"
            }
        }
        
        # 5. HorizontalPodAutoscaler
        if self.config.auto_scaling:
            configs["hpa"] = {
                "apiVersion": "autoscaling/v2",
                "kind": "HorizontalPodAutoscaler",
                "metadata": {
                    "name": "topk-hpa",
                    "namespace": "topk-system"
                },
                "spec": {
                    "scaleTargetRef": {
                        "apiVersion": "apps/v1",
                        "kind": "Deployment",
                        "name": "topk-nodes"
                    },
                    "minReplicas": 2,
                    "maxReplicas": self.config.node_count * 2,
                    "metrics": [{
                        "type": "Resource",
                        "resource": {
                            "name": "cpu",
                            "target": {
                                "type": "Utilization",
                                "averageUtilization": 70
                            }
                        }
                    }, {
                        "type": "Resource",
                        "resource": {
                            "name": "memory",
                            "target": {
                                "type": "Utilization",
                                "averageUtilization": 80
                            }
                        }
                    }]
                }
            }
        
        # 6. PersistentVolumeClaim
        configs["pvc"] = {
            "apiVersion": "v1",
            "kind": "PersistentVolumeClaim",
            "metadata": {
                "name": "topk-storage",
                "namespace": "topk-system"
            },
            "spec": {
                "accessModes": ["ReadWriteMany"],
                "storageClassName": self.config.storage_class,
                "resources": {
                    "requests": {
                        "storage": f"{self.config.storage_size_gb}Gi"
                    }
                }
            }
        }
        
        return configs
    
    def _convert_resource_limits(self) -> Dict[str, str]:
        """转换资源限制格式"""
        # 从字符串格式转换为Kubernetes格式
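        # 例如 {"cpus": "2", "memory": "4Gi"} 会转换为 {"cpu": "2", "memory": "4Gi"}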
        limits = {}
        
        for key, value in self.config.resource_limits.items():
            if key == "memory":
                limits[key] = value
            elif key == "cpus":
                limits["cpu"] = value
            else:
                limits[key] = value
        
        return limits
    
    async def _deploy_cloud(self, algorithm_config: Dict[str, Any]) -> bool:
        """部署到云平台"""
        self.logger.info(f"部署到云平台: {self.config.cluster_type}")
        
        # 这里需要集成具体的云平台SDK
        # 例如:AWS ECS/EKS, GCP GKE, Azure AKS
        
        # 简化的部署流程
        try:
            # 1. 创建集群
            await self._create_cloud_cluster()
            
            # 2. 部署应用
            await self._deploy_cloud_application(algorithm_config)
            
            # 3. 配置网络和负载均衡
            await self._configure_cloud_networking()
            
            # 4. 设置监控
            if self.config.monitoring_enabled:
                await self._setup_cloud_monitoring()
            
            return True
            
        except Exception as e:
            self.logger.error(f"云平台部署失败: {e}")
            return False
    
    async def _check_service_health(self) -> bool:
        """检查服务健康状态"""
        self.logger.info("检查服务健康状态...")
        
        # 模拟健康检查
        await asyncio.sleep(2)
        
        # 在实际部署中,会检查每个容器的健康端点
        
        self.logger.info("服务健康状态正常")
        return True
    
    async def _check_kubernetes_deployment(self) -> bool:
        """检查Kubernetes部署状态"""
        self.logger.info("检查Kubernetes部署状态...")
        
        # 模拟检查
        await asyncio.sleep(3)
        
        # 在实际部署中,会使用kubectl检查部署状态
        
        self.logger.info("Kubernetes部署成功")
        return True
    
    async def _create_cloud_cluster(self):
        """创建云集群"""
        self.logger.info(f"创建{self.config.cluster_type}集群...")
        await asyncio.sleep(5)
        self.logger.info("集群创建完成")
    
    async def _deploy_cloud_application(self, algorithm_config: Dict[str, Any]):
        """部署云应用"""
        self.logger.info("部署应用到云集群...")
        await asyncio.sleep(3)
        self.logger.info("应用部署完成")
    
    async def _configure_cloud_networking(self):
        """配置云网络"""
        self.logger.info("配置网络和负载均衡...")
        await asyncio.sleep(2)
        self.logger.info("网络配置完成")
    
    async def _setup_cloud_monitoring(self):
        """设置云监控"""
        self.logger.info("设置监控系统...")
        await asyncio.sleep(2)
        self.logger.info("监控系统就绪")

class ProductionOptimizer:
    """生产环境优化器"""
    
    def __init__(self, deployment: ProductionTopKDeployer):
        self.deployment = deployment
        self.logger = logging.getLogger(__name__)
        self.optimization_history = []
        
    async def optimize_performance(self, metrics_data: Dict[str, Any]) -> Dict[str, Any]:
        """优化性能"""
        self.logger.info("开始性能优化分析...")
        
        optimizations = []
        
        # 1. 分析当前性能
        performance_analysis = self._analyze_performance(metrics_data)
        
        # 2. 识别瓶颈
        bottlenecks = self._identify_bottlenecks(performance_analysis)
        
        # 3. 生成优化建议
        for bottleneck in bottlenecks:
            recommendation = self._generate_optimization_recommendation(bottleneck)
            optimizations.append(recommendation)
        
        # 4. 应用优化
        applied_optimizations = []
        for optimization in optimizations:
            if optimization["priority"] == "high":
                success = await self._apply_optimization(optimization)
                if success:
                    applied_optimizations.append(optimization)
        
        # 5. 评估优化效果
        if applied_optimizations:
            evaluation = await self._evaluate_optimization_effect(applied_optimizations)
        else:
            evaluation = {"improvement": 0, "details": "无优化应用"}
        
        result = {
            "analysis": performance_analysis,
            "bottlenecks": bottlenecks,
            "recommendations": optimizations,
            "applied_optimizations": applied_optimizations,
            "evaluation": evaluation,
            "timestamp": time.time()
        }
        
        self.optimization_history.append(result)
        self.logger.info(f"性能优化完成,共提出 {len(optimizations)} 条建议,应用了 {len(applied_optimizations)} 条")
        
        return result
    
    def _analyze_performance(self, metrics_data: Dict[str, Any]) -> Dict[str, Any]:
        """分析性能数据"""
        analysis = {
            "throughput": metrics_data.get("queries_per_second", 0),
            "latency": metrics_data.get("average_latency_ms", 0),
            "accuracy": metrics_data.get("accuracy", 0),
            "resource_utilization": {
                "cpu": metrics_data.get("cpu_usage_percent", 0),
                "memory": metrics_data.get("memory_usage_percent", 0),
                "network": metrics_data.get("network_usage_mbps", 0)
            },
            "error_rate": metrics_data.get("error_rate", 0)
        }
        
        # 计算性能分数
        score = self._calculate_performance_score(analysis)
        analysis["performance_score"] = score
        
        return analysis
    
    def _calculate_performance_score(self, analysis: Dict[str, Any]) -> float:
        """计算性能分数"""
        weights = {
            "throughput": 0.3,
            "latency": 0.3,
            "accuracy": 0.2,
            "resource_efficiency": 0.2
        }
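        # 举例(假设数值):吞吐量600 QPS、延迟200ms、准确率0.95、平均资源使用率50%时,
        # throughput_score=0.6, latency_score=0.8, resource_efficiency=0.5,
        # 综合得分 = 0.3*0.6 + 0.3*0.8 + 0.2*0.95 + 0.2*0.5 = 0.71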
        
        # 标准化各项指标
        throughput_score = min(analysis["throughput"] / 1000, 1.0)  # 假设1000 QPS为满分
        latency_score = max(0, 1 - analysis["latency"] / 1000)  # 假设1000ms为0分
        
        accuracy_score = analysis["accuracy"]
        
        # 资源效率 = 1 - 平均资源使用率
        resource_usage = analysis["resource_utilization"]
        avg_resource_usage = (resource_usage["cpu"] + resource_usage["memory"] + resource_usage["network"]) / 3
        resource_efficiency = max(0, 1 - avg_resource_usage / 100)
        
        # 计算加权分数
        score = (
            weights["throughput"] * throughput_score +
            weights["latency"] * latency_score +
            weights["accuracy"] * accuracy_score +
            weights["resource_efficiency"] * resource_efficiency
        )
        
        return score
    
    def _identify_bottlenecks(self, analysis: Dict[str, Any]) -> List[Dict[str, Any]]:
        """识别性能瓶颈"""
        bottlenecks = []
        
        # 检查吞吐量瓶颈
        if analysis["throughput"] < 100:  # 低于100 QPS
            bottlenecks.append({
                "type": "throughput",
                "severity": "high",
                "metric": analysis["throughput"],
                "threshold": 100,
                "description": "吞吐量过低"
            })
        
        # 检查延迟瓶颈
        if analysis["latency"] > 500:  # 高于500ms
            bottlenecks.append({
                "type": "latency",
                "severity": "high",
                "metric": analysis["latency"],
                "threshold": 500,
                "description": "延迟过高"
            })
        
        # 检查资源瓶颈
        resource_usage = analysis["resource_utilization"]
        
        if resource_usage["cpu"] > 80:
            bottlenecks.append({
                "type": "cpu",
                "severity": "medium",
                "metric": resource_usage["cpu"],
                "threshold": 80,
                "description": "CPU使用率过高"
            })
        
        if resource_usage["memory"] > 85:
            bottlenecks.append({
                "type": "memory",
                "severity": "high",
                "metric": resource_usage["memory"],
                "threshold": 85,
                "description": "内存使用率过高"
            })
        
        if resource_usage["network"] > 70:
            bottlenecks.append({
                "type": "network",
                "severity": "medium",
                "metric": resource_usage["network"],
                "threshold": 70,
                "description": "网络带宽使用率过高"
            })
        
        # 检查准确性
        if analysis["accuracy"] < 0.9:
            bottlenecks.append({
                "type": "accuracy",
                "severity": "medium",
                "metric": analysis["accuracy"],
                "threshold": 0.9,
                "description": "准确率过低"
            })
        
        return bottlenecks
    
    def _generate_optimization_recommendation(self, bottleneck: Dict[str, Any]) -> Dict[str, Any]:
        """生成优化建议"""
        recommendations = {
            "throughput": [
                {
                    "action": "增加节点数量",
                    "priority": "high",
                    "estimated_improvement": "30-50%",
                    "cost": "medium",
                    "implementation_time": "minutes"
                },
                {
                    "action": "优化算法参数",
                    "priority": "medium",
                    "estimated_improvement": "10-20%",
                    "cost": "low",
                    "implementation_time": "minutes"
                },
                {
                    "action": "启用缓存",
                    "priority": "medium",
                    "estimated_improvement": "20-40%",
                    "cost": "low",
                    "implementation_time": "hours"
                }
            ],
            "latency": [
                {
                    "action": "使用更高效的算法",
                    "priority": "high",
                    "estimated_improvement": "40-60%",
                    "cost": "high",
                    "implementation_time": "days"
                },
                {
                    "action": "优化网络配置",
                    "priority": "medium",
                    "estimated_improvement": "10-30%",
                    "cost": "low",
                    "implementation_time": "hours"
                },
                {
                    "action": "增加批量处理大小",
                    "priority": "low",
                    "estimated_improvement": "5-15%",
                    "cost": "low",
                    "implementation_time": "minutes"
                }
            ],
            "cpu": [
                {
                    "action": "垂直扩展(增加CPU核心)",
                    "priority": "high",
                    "estimated_improvement": "立即缓解",
                    "cost": "medium",
                    "implementation_time": "hours"
                },
                {
                    "action": "优化算法复杂度",
                    "priority": "medium",
                    "estimated_improvement": "20-40%",
                    "cost": "high",
                    "implementation_time": "days"
                },
                {
                    "action": "启用CPU亲和性",
                    "priority": "low",
                    "estimated_improvement": "5-10%",
                    "cost": "low",
                    "implementation_time": "hours"
                }
            ],
            "memory": [
                {
                    "action": "增加内存",
                    "priority": "high",
                    "estimated_improvement": "立即缓解",
                    "cost": "medium",
                    "implementation_time": "hours"
                },
                {
                    "action": "优化内存使用",
                    "priority": "medium",
                    "estimated_improvement": "20-50%",
                    "cost": "medium",
                    "implementation_time": "days"
                },
                {
                    "action": "启用内存压缩",
                    "priority": "low",
                    "estimated_improvement": "10-20%",
                    "cost": "low",
                    "implementation_time": "hours"
                }
            ],
            "network": [
                {
                    "action": "增加网络带宽",
                    "priority": "high",
                    "estimated_improvement": "立即缓解",
                    "cost": "high",
                    "implementation_time": "days"
                },
                {
                    "action": "优化数据传输",
                    "priority": "medium",
                    "estimated_improvement": "30-50%",
                    "cost": "medium",
                    "implementation_time": "days"
                },
                {
                    "action": "启用数据压缩",
                    "priority": "low",
                    "estimated_improvement": "10-30%",
                    "cost": "low",
                    "implementation_time": "hours"
                }
            ],
            "accuracy": [
                {
                    "action": "增加K值缓冲区",
                    "priority": "high",
                    "estimated_improvement": "显著提高",
                    "cost": "low",
                    "implementation_time": "minutes"
                },
                {
                    "action": "使用更精确的算法",
                    "priority": "medium",
                    "estimated_improvement": "显著提高",
                    "cost": "high",
                    "implementation_time": "days"
                },
                {
                    "action": "增加迭代次数",
                    "priority": "low",
                    "estimated_improvement": "适度提高",
                    "cost": "low",
                    "implementation_time": "minutes"
                }
            ]
        }
        
        bottleneck_type = bottleneck["type"]
        available_recommendations = recommendations.get(bottleneck_type, [])
        
        if available_recommendations:
            # 选择最合适的建议
            recommendation = available_recommendations[0].copy()
            recommendation["bottleneck"] = bottleneck
            recommendation["applied"] = False
            return recommendation
        else:
            return {
                "action": "通用优化",
                "priority": "medium",
                "estimated_improvement": "未知",
                "cost": "未知",
                "implementation_time": "未知",
                "bottleneck": bottleneck,
                "applied": False
            }
    
    async def _apply_optimization(self, optimization: Dict[str, Any]) -> bool:
        """应用优化"""
        self.logger.info(f"应用优化: {optimization['action']}")
        
        action = optimization["action"]
        
        try:
            if "增加节点数量" in action:
                # 在实际部署中,会调用云平台的扩缩容API
                success = await self._scale_cluster("out")
                
            elif "优化算法参数" in action:
                success = await self._update_algorithm_parameters()
                
            elif "启用缓存" in action:
                success = await self._enable_caching()
                
            elif "使用更高效的算法" in action:
                success = await self._switch_algorithm()
                
            elif "优化网络配置" in action:
                success = await self._optimize_network()
                
            elif "垂直扩展" in action:
                success = await self._vertical_scaling()
                
            elif "增加内存" in action:
                success = await self._increase_memory()
                
            elif "增加网络带宽" in action:
                success = await self._increase_bandwidth()
                
            elif "增加K值缓冲区" in action:
                success = await self._increase_k_buffer()
                
            else:
                # 通用优化
                success = await self._generic_optimization()
            
            optimization["applied"] = success
            optimization["applied_at"] = time.time()
            
            return success
            
        except Exception as e:
            self.logger.error(f"应用优化失败: {e}")
            optimization["applied"] = False
            optimization["error"] = str(e)
            return False
    
    async def _evaluate_optimization_effect(self, optimizations: List[Dict[str, Any]]) -> Dict[str, Any]:
        """评估优化效果"""
        self.logger.info("评估优化效果...")
        
        # 模拟评估过程
        await asyncio.sleep(2)
        
        # 在实际系统中,会比较优化前后的性能指标
        
        improvement = random.uniform(0.1, 0.5)  # 模拟10-50%的改善
        
        evaluation = {
            "improvement": improvement,
            "details": f"应用 {len(optimizations)} 个优化,性能提升约 {improvement*100:.1f}%",
            "optimizations_applied": [opt["action"] for opt in optimizations],
            "evaluation_time": time.time()
        }
        
        return evaluation
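
    # Hedged sketch (added for illustration, not part of the original design): the
    # comment above notes that a real system would compare metrics before and after
    # optimization instead of simulating the improvement. The helper below shows one
    # way to do that, assuming both snapshots contain the queries_per_second and
    # average_latency_ms keys used elsewhere in this demo.
    def _compare_metrics(self, before: Dict[str, float], after: Dict[str, float]) -> float:
        """Estimate the overall improvement ratio from before/after throughput and latency."""
        throughput_gain = (after["queries_per_second"] - before["queries_per_second"]) \
            / max(before["queries_per_second"], 1e-9)
        latency_gain = (before["average_latency_ms"] - after["average_latency_ms"]) \
            / max(before["average_latency_ms"], 1e-9)
        # Average the two relative gains into a single improvement ratio
        return (throughput_gain + latency_gain) / 2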
    
    # 优化操作的具体实现(模拟)
    async def _scale_cluster(self, direction: str) -> bool:
        """扩缩容集群"""
        await asyncio.sleep(3)
        return True
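
    # Hedged sketch (illustration only): in a real Kubernetes deployment the scale-out
    # above would patch the Deployment's replica count through the official kubernetes
    # Python client. The deployment/namespace names below are assumptions for this demo,
    # not values defined elsewhere in the document; synchronous client calls are used
    # for brevity even though the method is async.
    async def _scale_cluster_k8s(self, direction: str,
                                 name: str = "topk-worker",
                                 namespace: str = "default") -> bool:
        """Scale the worker Deployment out ("out") or in ("in") by one replica."""
        from kubernetes import client, config  # assumes the kubernetes package is installed
        config.load_incluster_config()         # running inside the target cluster
        apps = client.AppsV1Api()
        scale = apps.read_namespaced_deployment_scale(name=name, namespace=namespace)
        delta = 1 if direction == "out" else -1
        scale.spec.replicas = max(1, scale.spec.replicas + delta)
        apps.patch_namespaced_deployment_scale(name=name, namespace=namespace, body=scale)
        return True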
    
    async def _update_algorithm_parameters(self) -> bool:
        """更新算法参数"""
        await asyncio.sleep(1)
        return True
    
    async def _enable_caching(self) -> bool:
        """启用缓存"""
        await asyncio.sleep(2)
        return True
    
    async def _switch_algorithm(self) -> bool:
        """切换算法"""
        await asyncio.sleep(5)
        return True
    
    async def _optimize_network(self) -> bool:
        """优化网络配置"""
        await asyncio.sleep(2)
        return True
    
    async def _vertical_scaling(self) -> bool:
        """垂直扩展"""
        await asyncio.sleep(4)
        return True
    
    async def _increase_memory(self) -> bool:
        """增加内存"""
        await asyncio.sleep(3)
        return True
    
    async def _increase_bandwidth(self) -> bool:
        """增加带宽"""
        await asyncio.sleep(5)
        return True
    
    async def _increase_k_buffer(self) -> bool:
        """增加K值缓冲区"""
        await asyncio.sleep(1)
        return True
    
    async def _generic_optimization(self) -> bool:
        """通用优化"""
        await asyncio.sleep(2)
        return True

# 主程序:生产环境部署演示
async def main_production():
    """生产环境部署演示"""
    logging.basicConfig(level=logging.INFO)
    
    print("="*80)
    print("生产环境分布式Top K系统部署演示")
    print("="*80)
    
    # 1. 配置生产环境
    production_config = ProductionDeploymentConfig(
        cluster_type="kubernetes",  # 使用Kubernetes
        node_count=5,
        resource_limits={"cpus": "2", "memory": "4Gi"},
        auto_scaling=True,
        monitoring_enabled=True,
        backup_enabled=True,
        security_enabled=True,
        storage_class="fast-ssd",
        storage_size_gb=200
    )
    
    # 2. 算法配置
    algorithm_config = {
        "type": "optimized_threshold",
        "k": 100,
        "parameters": {
            "max_iterations": 10,
            "error_tolerance": 0.01,
            "adaptive_threshold": True
        }
    }
    
    # 3. 创建部署器
    deployer = ProductionTopKDeployer(production_config)
    
    # 4. 部署服务
    print("\n1. 部署分布式Top K服务...")
    success = await deployer.deploy(algorithm_config)
    
    if success:
        print("✓ 部署成功")
    else:
        print("✗ 部署失败")
        return
    
    # 5. 创建优化器
    optimizer = ProductionOptimizer(deployer)
    
    # 6. 模拟性能数据
    print("\n2. 收集性能数据...")
    metrics_data = {
        "queries_per_second": 85,  # 低于阈值
        "average_latency_ms": 620,  # 高于阈值
        "accuracy": 0.95,
        "cpu_usage_percent": 75,
        "memory_usage_percent": 90,  # 高于阈值
        "network_usage_mbps": 65,
        "error_rate": 0.01
    }
    
    # 7. 性能优化
    print("\n3. 性能优化分析...")
    optimization_result = await optimizer.optimize_performance(metrics_data)
    
    # 8. 输出优化报告
    print("\n" + "="*80)
    print("优化报告")
    print("="*80)
    
    print(f"\n性能分析:")
    analysis = optimization_result["analysis"]
    print(f"  • 性能分数: {analysis['performance_score']:.3f}")
    print(f"  • 吞吐量: {analysis['throughput']} QPS")
    print(f"  • 延迟: {analysis['latency']}ms")
    print(f"  • CPU使用率: {analysis['resource_utilization']['cpu']}%")
    print(f"  • 内存使用率: {analysis['resource_utilization']['memory']}%")
    
    print(f"\n识别到的瓶颈 ({len(optimization_result['bottlenecks'])} 个):")
    for bottleneck in optimization_result["bottlenecks"]:
        print(f"  • {bottleneck['description']} (严重程度: {bottleneck['severity']})")
    
    print(f"\n优化建议 ({len(optimization_result['recommendations'])} 条):")
    for i, recommendation in enumerate(optimization_result["recommendations"], 1):
        status = "✓ 已应用" if recommendation.get("applied") else "○ 未应用"
        print(f"  {i}. {recommendation['action']} ({status})")
        print(f"     优先级: {recommendation['priority']}, "
              f"预计改善: {recommendation['estimated_improvement']}")
    
    print(f"\n优化效果评估:")
    evaluation = optimization_result["evaluation"]
    print(f"  • 总体改善: {evaluation['improvement']*100:.1f}%")
    print(f"  • 详情: {evaluation['details']}")
    
    print("\n" + "="*80)
    print("演示完成")
    print("="*80)

if __name__ == "__main__":
    asyncio.run(main_production())

Summary

This solution covers the distributed Top K problem end to end, from algorithms to production deployment:

Core algorithm implementations (a minimal sketch of the pattern they share follows this list):

  1. Threshold-based pruning - suited to evenly distributed data
  2. Tree-structured aggregation - suited to medium-sized clusters, with efficient communication
  3. MapReduce-style algorithm - suited to large-scale data processing
  4. Quantile-based algorithm - suited to data whose distribution is known
  5. Gossip-based algorithm - suited to very large, dynamically changing clusters
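
The algorithms above differ in how they prune and route candidates, but for the MAX_K case they share one pattern: every node computes a local Top K and a coordinator merges the candidates. The single-process sketch below (plain lists stand in for nodes) illustrates why that merge is exact; it is a baseline, not the threshold-pruned or tree-structured variants described above.

import heapq
from typing import List

def local_top_k(values: List[float], k: int) -> List[float]:
    """Each node keeps only its k largest local values."""
    return heapq.nlargest(k, values)

def merge_top_k(partial_results: List[List[float]], k: int) -> List[float]:
    """The coordinator merges per-node candidates. Every element of the global
    Top K must appear in some node's local Top K, so taking the k largest of
    the merged candidates is exact for the MAX_K case."""
    candidates = [v for partial in partial_results for v in partial]
    return heapq.nlargest(k, candidates)

# Usage: three "nodes", each holding one shard of the data
shards = [[5, 1, 9, 3], [8, 2, 7], [10, 4, 6]]
partials = [local_top_k(shard, 3) for shard in shards]
print(merge_top_k(partials, 3))  # [10, 9, 8]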

Key features:

  1. Adaptive optimization - adjusts parameters automatically to the data distribution and cluster state
  2. Fault tolerance - handles node failures and data inconsistency
  3. Performance monitoring - real-time visibility into algorithm execution
  4. Resource optimization - minimizes network transfer and compute consumption

Production features (a metrics-export sketch follows this list):

  1. Multi-platform deployment - supports Docker, Kubernetes and cloud platforms
  2. Auto scaling - adjusts cluster size dynamically with load
  3. Comprehensive monitoring - integrates Prometheus and Grafana
  4. Intelligent optimization - identifies bottlenecks automatically and recommends fixes
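
As a minimal sketch of the Prometheus integration mentioned in item 3, the snippet below exposes the same metric names used by the demo's metrics_data dictionary as gauges that Prometheus (and, via Prometheus, Grafana) can scrape. It assumes the optional prometheus_client package; the port and metric names are illustrative, not values defined elsewhere in this document.

from prometheus_client import Gauge, start_http_server

# Illustrative gauges mirroring keys from metrics_data (names are assumptions)
topk_qps = Gauge("topk_queries_per_second", "Top K query throughput (QPS)")
topk_latency_ms = Gauge("topk_average_latency_ms", "Top K average latency (ms)")
topk_memory_pct = Gauge("topk_memory_usage_percent", "Memory usage (%)")

def export_metrics(metrics_data: dict, port: int = 9100) -> None:
    """Expose a /metrics endpoint and publish the latest performance snapshot."""
    start_http_server(port)  # starts a background HTTP server for scraping
    topk_qps.set(metrics_data["queries_per_second"])
    topk_latency_ms.set(metrics_data["average_latency_ms"])
    topk_memory_pct.set(metrics_data["memory_usage_percent"])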

Typical use cases (a frequency-counting sketch follows this list):

  • Recommendation systems: fetch the hottest items in real time
  • Monitoring systems: find the worst-performing servers
  • Financial risk control: identify the highest-risk transactions
  • Log analysis: count the most frequent error types
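
For the frequency-style use cases (such as the log-analysis example), a node's local Top K by count is not guaranteed to contain every globally frequent item, which is what the "increase the K buffer" recommendation earlier in this section addresses. The sketch below over-fetches k + buffer candidates per node before merging; the buffer size is an illustrative parameter, and the merged counts are lower bounds because occurrences outside a node's buffer are never reported, so a larger buffer trades network transfer for accuracy.

from collections import Counter
from typing import List, Tuple

def local_frequent(items: List[str], k: int, buffer: int) -> Counter:
    """Each node reports its (k + buffer) most frequent items and their counts."""
    counts = Counter(items)
    return Counter(dict(counts.most_common(k + buffer)))

def merge_frequent(partials: List[Counter], k: int) -> List[Tuple[str, int]]:
    """The coordinator sums the reported counts and keeps the global Top K."""
    total = Counter()
    for partial in partials:
        total.update(partial)
    return total.most_common(k)

# Usage: two "nodes" counting error types from their local logs
node_a = ["timeout", "timeout", "oom", "io_error", "timeout"]
node_b = ["oom", "oom", "io_error", "timeout"]
partials = [local_frequent(logs, k=2, buffer=1) for logs in (node_a, node_b)]
print(merge_frequent(partials, k=2))  # [('timeout', 4), ('oom', 3)]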

By choosing an algorithm and optimization strategy that fit the workload, this solution can noticeably improve the performance and efficiency of distributed Top K queries while preserving accuracy.
