一、问题定义与挑战
1.1 问题定义
在分布式系统中,我们需要从海量数据(PB级别)中找出最大的K个元素或最小的K个元素。
python
class DistributedTopKProblem:
"""分布式Top K问题定义"""
def __init__(self, data_size: int = 10**9, k: int = 1000):
"""
Args:
data_size: 数据总量,通常10亿级以上
k: 要查找的Top K数量
"""
self.data_size = data_size
self.k = k
self.n_nodes = 10 # 分布式节点数量
def challenges(self):
"""分布式Top K的主要挑战"""
return {
"数据量太大": {
"内存限制": "单机无法加载全部数据",
"磁盘IO": "频繁读写导致性能瓶颈",
"网络传输": "数据传输成为主要开销"
},
"实时性要求": {
"在线查询": "需要毫秒级响应",
"增量更新": "数据不断变化时的实时更新",
"查询并发": "高并发查询场景"
},
"准确性与效率平衡": {
"精确算法": "保证100%准确但代价高",
"近似算法": "牺牲精度换取效率",
"误差界限": "可接受的误差范围"
},
"系统复杂性": {
"节点故障": "容错性和一致性",
"数据倾斜": "负载均衡问题",
"通信开销": "节点间协调成本"
}
}
1.2 应用场景
python
class TopKApplicationScenarios:
"""Top K问题的应用场景"""
@staticmethod
def real_world_examples():
"""实际应用示例"""
return {
"搜索引擎": {
"问题": "从数十亿网页中找出最相关的K个",
"K值": "通常K=10(第一页结果)",
"特点": "实时性要求高,需要个性化"
},
"推荐系统": {
"问题": "从百万商品中推荐Top K给用户",
"K值": "K=10-100(推荐列表)",
"特点": "需要实时更新,个性化推荐"
},
"监控告警": {
"问题": "从海量指标中找出Top K异常",
"K值": "K=10-50(重点监控对象)",
"特点": "低延迟,高准确性要求"
},
"金融风控": {
"问题": "找出Top K可疑交易",
"K值": "K=100-1000(人工审核量)",
"特点": "准确性要求极高,可容忍延迟"
},
"大数据分析": {
"问题": "网站Top K访问页面",
"K值": "K=10-100(分析报告)",
"特点": "批量处理,可容忍分钟级延迟"
}
}
二、精确算法解决方案
2.1 MapReduce方案
python
class MapReduceTopK:
"""基于MapReduce的Top K解决方案"""
def __init__(self, data_paths: list, k: int, n_reducers: int = 10):
"""
Args:
data_paths: 数据分布在各节点的路径
k: 要查找的Top K数量
n_reducers: Reduce任务数量
"""
self.data_paths = data_paths
self.k = k
self.n_reducers = n_reducers
def map_phase(self, node_data):
"""
Map阶段:每个节点找出本地Top K
时间复杂度:O(N_local log K)
空间复杂度:O(K)
"""
import heapq
class Mapper:
def __init__(self, k: int):
self.k = k
self.min_heap = [] # 最小堆,维护本地Top K
def map(self, value):
"""处理单个数据"""
if len(self.min_heap) < self.k:
heapq.heappush(self.min_heap, value)
elif value > self.min_heap[0]: # 比当前第K大还大
heapq.heapreplace(self.min_heap, value)
def emit(self):
"""输出本地Top K"""
return sorted(self.min_heap, reverse=True)[:self.k]
return Mapper(self.k)
def reduce_phase(self, all_topks):
"""
Reduce阶段:合并所有节点的Top K
时间复杂度:O(M*K log K),M为节点数
空间复杂度:O(M*K)
"""
import heapq
class Reducer:
def __init__(self, k: int):
self.k = k
self.min_heap = []
def reduce(self, local_topk):
"""合并一个节点的结果"""
for value in local_topk:
if len(self.min_heap) < self.k:
heapq.heappush(self.min_heap, value)
elif value > self.min_heap[0]:
heapq.heapreplace(self.min_heap, value)
def get_result(self):
"""获取最终Top K"""
return sorted(self.min_heap, reverse=True)
reducer = Reducer(self.k)
for local_topk in all_topks:
reducer.reduce(local_topk)
return reducer.get_result()
def two_round_mapreduce(self):
"""
两轮MapReduce优化
第一轮:本地Top K
第二轮:全局Top K
"""
optimization = {
"第一轮MapReduce": {
"Map": "每个节点计算本地Top mK (m=2-10)",
"Reduce": "收集所有节点的mK个候选",
"目的": "大幅减少数据传输量",
"减少比例": "从N减少到m*M*K,M为节点数"
},
"第二轮MapReduce": {
"Map": "对候选集重新分发",
"Reduce": "计算最终Top K",
"目的": "保证全局准确性",
"数据量": "仅m*M*K,可单机处理"
},
"参数调优": {
"m的选择": "m=2-10,平衡准确性和效率",
"分区策略": "按值范围分区提高效率",
"Combiner": "Map端预聚合减少传输"
}
}
return optimization
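下面给出一个在单机上模拟上述两阶段聚合的最小可运行示意(假设数据已经切分到4个"节点"对应的列表里,m 取 2;仅用于演示思路,不是生产实现):
python
import heapq
import random

def local_top_mk(node_data, k, m=2):
    """Map端:单个节点用大小为 m*k 的最小堆筛出本地候选"""
    heap = []
    for v in node_data:
        if len(heap) < m * k:
            heapq.heappush(heap, v)
        elif v > heap[0]:
            heapq.heapreplace(heap, v)
    return heap

def global_top_k(candidates_per_node, k):
    """Reduce端:合并所有节点上报的候选,取全局Top K"""
    merged = [v for cand in candidates_per_node for v in cand]
    return heapq.nlargest(k, merged)

if __name__ == "__main__":
    # 用4个列表模拟4个节点上的本地数据
    nodes = [[random.randint(0, 10**6) for _ in range(100000)] for _ in range(4)]
    candidates = [local_top_mk(nd, k=10) for nd in nodes]
    print(global_top_k(candidates, k=10))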
2.2 分布式排序算法
python
class DistributedSortTopK:
"""分布式排序找Top K"""
def external_sort_merge(self, data_chunks: list, k: int):
"""
外排序-归并算法
适用于数据无法全部装入内存的场景
"""
algorithm_steps = [
"1. 本地排序阶段:",
" - 每个节点对本地数据排序",
" - 生成有序的数据块",
" - 时间复杂度:O(N_local log N_local)",
"",
"2. 多路归并阶段:",
" - 使用优先队列多路归并",
" - 只需维护K个元素在内存中",
" - 时间复杂度:O(N log M),M为数据块数",
"",
"3. 优化策略:",
" - 败者树优化多路归并",
" - 预取和缓存优化磁盘IO",
" - 并行归并提高速度"
]
return algorithm_steps
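配合上面的步骤,下面是一个用 heapq.merge 做多路归并取 Top K 的最小示意(假设各数据块已按降序排好;heapq.merge 的 reverse 参数需要 Python 3.5+):
python
import heapq

def topk_from_sorted_chunks(sorted_chunks, k):
    """各数据块已按降序排好时,多路归并取前K个即可提前终止"""
    result = []
    # heapq.merge 内部维护一个大小为M的堆(M为块数),reverse=True 要求输入为降序
    for v in heapq.merge(*sorted_chunks, reverse=True):
        result.append(v)
        if len(result) == k:
            break
    return result

if __name__ == "__main__":
    chunks = [sorted([3, 9, 1, 7], reverse=True),
              sorted([8, 2, 6], reverse=True),
              sorted([10, 4, 5], reverse=True)]
    print(topk_from_sorted_chunks(chunks, k=3))  # [10, 9, 8]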
def quickselect_distributed(self, nodes_data, k: int):
"""
分布式快速选择算法
基于分治思想,减少数据交换
"""
import random
class DistributedQuickSelect:
def __init__(self, k: int):
self.k = k
def select_pivot(self, samples):
"""选择合适的主元"""
# 从各节点采样
all_samples = []
for node_sample in samples:
all_samples.extend(node_sample)
# 选择中位数的中位数作为主元
return self.median_of_medians(all_samples)
def median_of_medians(self, arr):
"""BFPRT算法找中位数"""
if len(arr) <= 5:
return sorted(arr)[len(arr)//2]
# 分成5个一组
chunks = [arr[i:i+5] for i in range(0, len(arr), 5)]
medians = [sorted(chunk)[len(chunk)//2] for chunk in chunks]
# 递归找中位数
return self.median_of_medians(medians)
def partition_and_count(self, pivot):
"""
统计各分区大小
返回:(小于pivot的数量, 等于pivot的数量)
"""
less_count = 0
equal_count = 0
# 各节点并行统计
# 实际分布式环境中,每个节点独立统计本地数据
# 然后汇总结果
return less_count, equal_count
def distribute_work(self, pivot, direction):
"""
根据主元分发任务
direction: 'less'或'greater'
"""
# 决定下一步处理哪个分区
# 类似QuickSelect算法
pass
return DistributedQuickSelect(k)
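下面用单进程模拟上述分布式快速选择的核心流程(用若干个列表代表各节点的数据,协调者每轮只需要拿到采样值、pivot 和计数;这是演示性质的草图,未涉及通信与容错细节):
python
import random

def distributed_kth_largest(nodes_data, k):
    """单机模拟的分布式快速选择:每轮只在"节点"间传输采样值、pivot和计数"""
    while True:
        # 1. 各节点各出一个采样值,协调者取样本中位数作为pivot
        samples = [random.choice(nd) for nd in nodes_data if nd]
        pivot = sorted(samples)[len(samples) // 2]
        # 2. 各节点本地统计大于/等于pivot的元素个数,汇总到协调者
        greater = sum(sum(1 for v in nd if v > pivot) for nd in nodes_data)
        equal = sum(sum(1 for v in nd if v == pivot) for nd in nodes_data)
        # 3. 根据计数决定递归方向
        if greater >= k:
            # 第K大在"大于pivot"的一侧
            nodes_data = [[v for v in nd if v > pivot] for nd in nodes_data]
        elif greater + equal >= k:
            return pivot  # 第K大恰好等于pivot
        else:
            # 丢弃大于等于pivot的部分,到较小一侧继续找第 k-(greater+equal) 大
            k -= greater + equal
            nodes_data = [[v for v in nd if v < pivot] for nd in nodes_data]

if __name__ == "__main__":
    nodes = [[random.randint(0, 999) for _ in range(1000)] for _ in range(4)]
    flat = sorted((v for nd in nodes for v in nd), reverse=True)
    print(distributed_kth_largest(nodes, 10), flat[9])  # 两者应相等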
三、近似算法解决方案
3.1 Count-Min Sketch + Heap
python
class ApproximateTopK:
"""近似Top K算法"""
def count_min_sketch_topk(self, stream_data, k: int, epsilon=0.01, delta=0.01):
"""
Count-Min Sketch + Heap 近似算法
适用于数据流场景,内存使用固定
"""
import math
import heapq
import mmh3 # MurmurHash
class CountMinSketchTopK:
def __init__(self, k: int, epsilon: float, delta: float):
"""
Args:
k: Top K数量
epsilon: 误差参数
delta: 失败概率
"""
self.k = k
self.epsilon = epsilon
self.delta = delta
# Count-Min Sketch参数
self.w = math.ceil(math.e / epsilon) # 宽度
self.d = math.ceil(math.log(1 / delta)) # 深度
# 初始化sketch
self.sketch = [[0] * self.w for _ in range(self.d)]
self.hash_seeds = [i * 1000 for i in range(self.d)]
# Top K堆
self.min_heap = [] # (频率, 元素)的最小堆
self.element_freq = {} # 元素频率缓存
def update(self, element):
"""更新元素频率"""
# 更新Count-Min Sketch
for i in range(self.d):
hash_val = mmh3.hash(str(element), self.hash_seeds[i]) % self.w
self.sketch[i][hash_val] += 1
# 更新堆
freq = self.estimate_freq(element)
self.element_freq[element] = freq
if len(self.min_heap) < self.k:
heapq.heappush(self.min_heap, (freq, element))
elif freq > self.min_heap[0][0]:
# 替换堆顶元素(注意:若element已在堆中会留下重复条目,生产实现需要先判重或使用可索引的堆)
heapq.heapreplace(self.min_heap, (freq, element))
def estimate_freq(self, element):
"""估计元素频率(取最小值)"""
min_freq = float('inf')
for i in range(self.d):
hash_val = mmh3.hash(str(element), self.hash_seeds[i]) % self.w
min_freq = min(min_freq, self.sketch[i][hash_val])
return min_freq
def get_topk(self):
"""获取近似Top K"""
# 堆中元素可能有误差,需要重新排序
candidates = [(self.estimate_freq(elem), elem)
for _, elem in self.min_heap]
return sorted(candidates, reverse=True)[:self.k]
def memory_usage(self):
"""内存使用量"""
sketch_size = self.d * self.w * 4 # 假设int为4字节
heap_size = self.k * 16 # 估计值
return f"{sketch_size + heap_size} bytes"
return CountMinSketchTopK(k, epsilon, delta)
def space_saving_algorithm(self, stream_data, k: int):
"""
Space-Saving算法
精确维护Top K,但使用固定内存
"""
import heapq
class SpaceSavingTopK:
def __init__(self, k: int):
self.k = k
self.heap = [] # (频率, 误差, 元素)的最小堆
self.counter = {} # 元素->(频率, 误差)
def update(self, element):
"""处理一个元素"""
if element in self.counter:
# 元素已在监控中,增加频率
freq, error = self.counter[element]
self.counter[element] = (freq + 1, error)
# 更新堆中位置(这里简单地整堆重建,代价O(k);生产实现通常用Stream-Summary双向链表做O(1)更新)
self._rebuild_heap()
elif len(self.heap) < self.k:
# 堆未满,直接加入
self.counter[element] = (1, 0)
heapq.heappush(self.heap, (1, 0, element))
else:
# 替换堆顶元素
min_freq, min_error, min_elem = self.heap[0]
# 删除旧元素
del self.counter[min_elem]
# 添加新元素,继承旧元素的频率作为误差
self.counter[element] = (min_freq + 1, min_freq)
heapq.heapreplace(self.heap, (min_freq + 1, min_freq, element))
def _rebuild_heap(self):
"""重建堆以维持正确顺序"""
self.heap = [(freq, error, elem)
for elem, (freq, error) in self.counter.items()]
heapq.heapify(self.heap)
def get_topk(self):
"""获取Top K(带误差估计)"""
results = []
for freq, error, elem in sorted(self.heap, reverse=True):
results.append({
'element': elem,
'estimated_frequency': freq,
'error_bound': error
})
return results
def query_frequency(self, element):
"""查询元素频率(可能高估)"""
if element in self.counter:
freq, error = self.counter[element]
return {
'min_frequency': freq - error,
'max_frequency': freq
}
return {'min_frequency': 0, 'max_frequency': 0}
return SpaceSavingTopK(k)
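以下是上面两个近似结构的用法示意(假设上述类定义已整理成可运行的模块,且已通过 pip 安装 mmh3;stream_data 参数在示例中未使用,传 None 即可):
python
import random

approx = ApproximateTopK()  # 假设上文的类定义在同一模块中可直接使用

# 构造一个带明显热点的模拟数据流
population = ["a"] * 50 + ["b"] * 30 + ["c"] * 10 + list("defghij")
stream = [random.choice(population) for _ in range(100000)]

# Count-Min Sketch + Heap(需要 pip install mmh3)
cms = approx.count_min_sketch_topk(None, k=3, epsilon=0.01, delta=0.01)
for item in stream:
    cms.update(item)
print("CMS Top3:", cms.get_topk(), cms.memory_usage())

# Space-Saving
ss = approx.space_saving_algorithm(None, k=3)
for item in stream:
    ss.update(item)
print("SpaceSaving Top3:", ss.get_topk())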
3.2 Lossy Counting(热点元素检测)
python
class HeavyHittersTopK:
"""热点元素Top K检测"""
def lossy_counting(self, stream_data, k: int, epsilon=0.001):
"""
Lossy Counting算法
找出频率超过阈值的元素
"""
class LossyCountingTopK:
def __init__(self, epsilon: float):
self.epsilon = epsilon
self.bucket_size = int(1 / epsilon)
self.current_bucket = 1
self.freq_map = {} # 元素->(频率, 最大可能误差)
def update(self, element):
"""处理数据流"""
# 更新频率
if element in self.freq_map:
freq, delta = self.freq_map[element]
self.freq_map[element] = (freq + 1, delta)
else:
self.freq_map[element] = (1, self.current_bucket - 1)
# 定期清理低频元素
if len(self.freq_map) >= self.bucket_size:
self._cleanup()
self.current_bucket += 1
def _cleanup(self):
"""清理频率低于当前桶号的元素"""
to_delete = []
for element, (freq, delta) in self.freq_map.items():
if freq + delta <= self.current_bucket:
to_delete.append(element)
for element in to_delete:
del self.freq_map[element]
def get_heavy_hitters(self, min_freq_ratio=0.01):
"""获取热点元素"""
total_elements = self.current_bucket * self.bucket_size
min_freq = total_elements * min_freq_ratio
heavy_hitters = []
for element, (freq, delta) in self.freq_map.items():
if freq >= min_freq:
heavy_hitters.append({
'element': element,
'frequency': freq,
'error_bound': delta
})
return sorted(heavy_hitters,
key=lambda x: x['frequency'],
reverse=True)[:k]
return LossyCountingTopK(epsilon)
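作为对照,这里给出一个不依赖上文闭包、可独立运行的 Lossy Counting 最小实现示意(参数与数据均为演示取值):
python
import random

class LossyCounter:
    """Lossy Counting 最小独立实现:内存约 O(1/epsilon),频率最多低估 epsilon*N"""
    def __init__(self, epsilon=0.001):
        self.w = int(1 / epsilon)   # 桶宽
        self.n = 0                  # 已处理元素数
        self.bucket = 1             # 当前桶编号
        self.counts = {}            # element -> (freq, delta)

    def update(self, element):
        self.n += 1
        f, d = self.counts.get(element, (0, self.bucket - 1))
        self.counts[element] = (f + 1, d)
        if self.n % self.w == 0:    # 到达桶边界,清理可能低频的元素
            self.counts = {e: (f, d) for e, (f, d) in self.counts.items()
                           if f + d > self.bucket}
            self.bucket += 1

    def topk(self, k):
        return sorted(self.counts.items(), key=lambda kv: kv[1][0], reverse=True)[:k]

if __name__ == "__main__":
    lc = LossyCounter(epsilon=0.01)
    for x in (random.choice("aaaabbbccde") for _ in range(100000)):
        lc.update(x)
    print(lc.topk(3))   # 'a'、'b'、'c' 的估计频率最高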
四、分布式系统实现方案
4.1 基于Spark的实现
python
class SparkTopKSolutions:
"""Spark中Top K解决方案"""
def rdd_topk_solution(self, spark_session, data_rdd, k: int):
"""
Spark RDD方式实现Top K
适用于中等规模数据
"""
from pyspark import StorageLevel
def spark_topk_implementation():
"""
多种Spark Top K实现方式
"""
implementations = {
"方法1: top()算子": {
"代码": "data_rdd.top(k)",
"原理": "全局排序取前K个",
"缺点": "需要shuffle全部数据",
"适用": "K很小,数据量中等"
},
"方法2: takeOrdered()": {
"代码": "data_rdd.takeOrdered(k, key=lambda x: -x)",
"原理": "使用优先队列,只shuffle部分数据",
"优点": "比top()更高效",
"适用": "通用场景"
},
"方法3: 两阶段聚合": {
"步骤": [
"1. mapPartitions: 每个分区本地Top mK",
"2. collect: 收集所有分区的候选",
"3. driver端: 合并得到最终Top K",
"4. broadcast: 广播结果"
],
"代码示例": """
# 第一步:各分区本地Top 2K
def local_topk(iterator):
import heapq
heap = []
for value in iterator:
if len(heap) < 2*k:
heapq.heappush(heap, value)
elif value > heap[0]:
heapq.heapreplace(heap, value)
return [heap]
local_topks = data_rdd.mapPartitions(local_topk).collect()
# 第二步:Driver端合并
import heapq
global_heap = []
for local_heap in local_topks:
for value in local_heap:
if len(global_heap) < k:
heapq.heappush(global_heap, value)
elif value > global_heap[0]:
heapq.heapreplace(global_heap, value)
result = sorted(global_heap, reverse=True)
""",
"优点": "减少shuffle数据量",
"适用": "大数据量,K中等"
},
"方法4: 基于排序": {
"步骤": [
"1. sortByKey(): 全局排序",
"2. zipWithIndex(): 添加索引",
"3. filter(): 取前K个"
],
"缺点": "需要全量shuffle",
"适用": "需要全排序的场景"
}
}
return implementations
return spark_topk_implementation()
def dataframe_topk_solution(self, spark_df, k: int, order_by_col: str):
"""
Spark DataFrame方式实现Top K
更高效,支持SQL优化
"""
solutions = {
"方法1: window函数": {
"SQL": f"""
SELECT * FROM (
SELECT *,
ROW_NUMBER() OVER (ORDER BY {order_by_col} DESC) as rn
FROM table
) WHERE rn <= {k}
""",
"说明": "使用窗口函数,需要全局排序",
"性能": "中等,但代码简洁"
},
"方法2: limit + sort": {
"代码": f"spark_df.orderBy(F.desc(order_by_col)).limit(k)",
"原理": "Catalyst优化器会优化执行计划",
"优化": "可能转换为两阶段聚合",
"适用": "推荐使用"
},
"方法3: 采样优化": {
"步骤": [
"1. 采样估计数据分布",
"2. 根据分布选择分区策略",
"3. 部分数据shuffle",
"4. 获取精确Top K"
],
"代码": """
# 采样估计
stats = spark_df.select(order_by_col).summary("min", "max", "approx_count_distinct").collect()
# 根据统计信息优化
if 数据分布均匀:
return spark_df.orderBy(F.desc(order_by_col)).limit(k)
else:
# 使用自定义分区策略
return customized_topk(spark_df, k, order_by_col)
""",
"适用": "数据倾斜严重时"
}
}
return solutions
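一个端到端的 PySpark 最小示意如下(假设本地安装了 pyspark 并能以 local 模式启动 SparkSession;数据为随机构造,仅演示 takeOrdered 与 orderBy+limit 两种写法,不是生产配置):
python
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.master("local[*]").appName("topk-demo").getOrCreate()
sc = spark.sparkContext

data = [(i, float(hash(str(i)) % 10**6)) for i in range(100000)]

# RDD 写法:takeOrdered 基于各分区的有界优先队列,无需全量shuffle
rdd = sc.parallelize([v for _, v in data], numSlices=8)
top10_rdd = rdd.takeOrdered(10, key=lambda x: -x)

# DataFrame 写法:orderBy + limit,由 Catalyst 优化执行计划
df = spark.createDataFrame(data, ["id", "score"])
top10_df = df.orderBy(F.desc("score")).limit(10).collect()

print(top10_rdd)
print(top10_df)
spark.stop()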
4.2 基于Flink的实现
python
class FlinkTopKSolutions:
"""Flink流处理中的Top K"""
def streaming_topk(self, data_stream, k: int, window_size: str):
"""
流处理中的Top K
支持滑动窗口、滚动窗口
"""
from pyflink.datastream import StreamExecutionEnvironment
from pyflink.table import StreamTableEnvironment
class FlinkTopKImplementations:
@staticmethod
def tumbling_window_topk():
"""滚动窗口Top K"""
return {
"SQL实现": f"""
SELECT * FROM (
SELECT *,
ROW_NUMBER() OVER (
PARTITION BY window_start, window_end
ORDER BY value DESC
) as rn
FROM TABLE(
TUMBLE(
TABLE input_table,
DESCRIPTOR(event_time),
INTERVAL '{window_size}'
)
)
) WHERE rn <= {k}
""",
"DataStream API": """
data_stream
.key_by(lambda x: x.key)
.window(TumblingEventTimeWindows.of(Time.seconds(window_size)))
.process(TopKProcessFunction(k))
""",
"注意事项": [
"需要设置合适的水位线",
"考虑乱序数据的影响",
"内存中维护Top K状态"
]
}
@staticmethod
def sliding_window_topk():
"""滑动窗口Top K"""
return {
"特点": "每个元素属于多个窗口",
"优化": "增量计算,避免重复",
"实现": "使用AggregateFunction + ProcessWindowFunction"
}
@staticmethod
def global_topk_with_partitioning():
"""全局Top K(带分区)"""
return {
"步骤": [
"1. KeyBy分区: 将数据哈希到多个分区",
"2. 分区内Top K: 每个分区维护本地Top K",
"3. 全局聚合: 定期合并各分区结果",
"4. 输出结果: 输出全局Top K"
],
"代码结构": """
class GlobalTopK:
def __init__(self, k, partition_num):
self.k = k
self.partition_num = partition_num
self.partition_topk = [[] for _ in range(partition_num)]
def add(self, element):
# 1. 哈希到分区
partition = hash(element) % self.partition_num
# 2. 更新分区Top K
self._update_partition_topk(partition, element)
# 3. 定期触发全局聚合
if 触发条件:
return self._global_aggregate()
def _update_partition_topk(self, partition, element):
# 维护分区内的Top K
pass
def _global_aggregate(self):
# 合并所有分区的Top K
pass
"""
}
return FlinkTopKImplementations()
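在不启动 Flink 集群的情况下,可以用下面的纯 Python 草图说明滚动窗口 Top K 的核心逻辑(真实作业中这段逻辑大致对应 ProcessWindowFunction 对单个窗口内元素求 Top K 的部分;事件时间、水位线、乱序处理等细节在此省略):
python
import heapq
from collections import defaultdict

def tumbling_window_topk(events, window_size_ms, k):
    """events: (event_time_ms, key, value) 序列;返回 {窗口起点: TopK列表}"""
    windows = defaultdict(list)
    for ts, key, value in events:
        window_start = ts - ts % window_size_ms   # 事件落入哪个滚动窗口
        windows[window_start].append((value, key))
    # 对每个窗口取 value 最大的 K 条记录
    return {ws: heapq.nlargest(k, items) for ws, items in windows.items()}

if __name__ == "__main__":
    events = [(1000, "p1", 5), (1200, "p2", 9), (1900, "p3", 7),
              (2100, "p1", 3), (2500, "p4", 8)]
    print(tumbling_window_topk(events, window_size_ms=1000, k=2))
    # 窗口[1000,2000): [(9,'p2'),(7,'p3')];窗口[2000,3000): [(8,'p4'),(3,'p1')]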
4.3 分布式数据结构方案
python
class DistributedDataStructures:
"""基于分布式数据结构的Top K"""
def redis_sorted_set_solution(self, k: int):
"""Redis Sorted Set实现Top K"""
import redis
class RedisTopK:
def __init__(self, redis_client, key_name: str, k: int):
self.redis = redis_client
self.key = key_name
self.k = k
def update(self, element, score: float):
"""更新元素分数"""
# 添加或更新元素分数
self.redis.zadd(self.key, {element: score})
# 保持只保留Top K
current_count = self.redis.zcard(self.key)
if current_count > self.k * 2: # 保留一些冗余
# 删除排名在K之后的元素
self.redis.zremrangebyrank(self.key, 0, -(self.k + 1))
def get_topk(self):
"""获取Top K"""
# 按分数降序获取前K个
return self.redis.zrevrange(self.key, 0, self.k - 1, withscores=True)
def distributed_version(self, shards: int):
"""分布式版本"""
return {
"分片策略": f"将数据哈希到{shards}个Redis实例",
"查询步骤": [
"1. 查询每个分片的Top K",
"2. 合并所有分片的结果",
"3. 取全局Top K"
],
"优化": "定期合并,减少查询时的网络开销"
}
return RedisTopK
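上面 RedisTopK 思路的一个最小用法示意(假设本地 6379 端口有可用的 Redis 实例,并使用 redis-py 客户端;商品与分数均为虚构示例数据):
python
import redis

r = redis.Redis(host="localhost", port=6379, db=0)
key = "demo:hot_products"

# 写入/更新商品分数(ZADD 对同一成员会覆盖分数;若要累加点击量可改用 ZINCRBY)
clicks = {"p1001": 320, "p1002": 870, "p1003": 150, "p1004": 990, "p1005": 660}
for product, score in clicks.items():
    r.zadd(key, {product: score})

# 只保留分数最高的前K名,裁剪其余成员
k = 3
r.zremrangebyrank(key, 0, -(k + 1))

# 按分数降序读取Top K
print(r.zrevrange(key, 0, k - 1, withscores=True))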
def apache_ignite_topk(self):
"""Apache Ignite分布式内存方案"""
implementations = {
"分布式优先队列": {
"原理": "每个节点维护本地优先队列",
"查询": "定期同步各节点状态",
"适用": "实时性要求不高的场景"
},
"分布式排序缓存": {
"原理": "使用Ignite的分布式缓存+索引",
"查询": "SQL查询取Top K",
"性能": "毫秒级响应"
}
}
return implementations
五、工程实践与优化
5.1 性能优化策略
python
class TopKOptimization:
"""Top K性能优化策略"""
def optimization_techniques(self):
"""优化技术汇总"""
return {
"减少数据传输": {
"本地聚合": "先在节点内部聚合,再全局聚合",
"数据压缩": "传输前压缩,减少网络开销",
"增量传输": "只传输变化部分"
},
"并行处理优化": {
"负载均衡": "动态调整节点负载",
"数据倾斜处理": [
"采样识别热点数据",
"热点数据特殊处理",
"使用一致性哈希避免倾斜"
],
"流水线处理": "重叠计算和传输"
},
"内存优化": {
"数据结构选择": [
"小K:使用堆",
"大K:使用快速选择",
"流数据:使用Count-Min Sketch"
],
"内存池": "重用内存对象",
"序列化优化": "高效序列化方案"
},
"索引优化": {
"预排序": "数据预先排序,加快查询",
"分层索引": "建立多级索引结构",
"Bloom Filter": "快速判断元素是否存在"
},
"缓存策略": {
"结果缓存": "缓存Top K结果,减少重复计算",
"热点缓存": "缓存高频查询元素",
"LRU/K缓存": "智能缓存管理"
}
}
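关于“小K用堆、大K用排序/选择”这条经验,下面是一个单机上的简单对比草图(数据随机生成,耗时数字因环境而异,仅作直观感受):
python
import heapq
import random
import time

data = [random.random() for _ in range(1_000_000)]

def timed(fn, *args):
    t0 = time.perf_counter()
    fn(*args)
    return time.perf_counter() - t0

# 小K:堆的代价约为 O(N log K),内存 O(K)
print("heapq.nlargest(10):    %.3fs" % timed(heapq.nlargest, 10, data))
# K 接近 N 时,维护大堆的常数变高,直接排序 O(N log N) 往往更划算
print("heapq.nlargest(300k):  %.3fs" % timed(heapq.nlargest, 300_000, data))
print("sorted()[:300k]:       %.3fs" % timed(lambda d: sorted(d, reverse=True)[:300_000], data))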
5.2 容错与一致性
python
class TopKFaultTolerance:
"""Top K的容错与一致性"""
def fault_tolerance_mechanisms(self):
"""容错机制"""
return {
"检查点机制": {
"定期快照": "保存Top K状态到可靠存储",
"恢复策略": "从检查点恢复,重新计算丢失数据"
},
"副本策略": {
"主备复制": "主节点计算,备节点同步",
"多副本投票": "多个副本独立计算,投票决定结果",
"Quorum机制": "满足多数一致即可"
},
"结果验证": {
"边界验证": "验证Top K的边界值是否合理",
"抽样验证": "抽样验证部分结果的正确性",
"代价函数": "定义错误结果的代价,权衡准确性和性能"
},
"一致性级别": {
"强一致性": "所有节点看到相同的结果",
"最终一致性": "允许短暂不一致,最终一致",
"弱一致性": "接受近似结果,性能最好"
}
}
六、完整示例:实时热点商品Top K
python
class RealTimeHotProducts:
"""实时热点商品Top K系统"""
def __init__(self):
self.system_design = self._design_system()
def _design_system(self):
"""系统架构设计"""
return {
"数据源": {
"用户点击流": "Kafka实时流",
"订单数据": "MySQL binlog",
"用户行为": "埋点日志"
},
"处理层": {
"实时层(Flink)": {
"任务": "计算分钟级Top K",
"技术": "滑动窗口,增量计算",
"输出": "Redis缓存,Dashboard展示"
},
"近实时层(Spark)": {
"任务": "计算小时/天级Top K",
"技术": "微批处理,精确计算",
"输出": "Hive/ClickHouse存储"
},
"离线层(Hadoop)": {
"任务": "历史数据分析,模型训练",
"技术": "MapReduce,全量计算",
"输出": "数据仓库,报表系统"
}
},
"存储层": {
"实时存储(Redis)": {
"数据结构": "Sorted Set存储Top K",
"TTL": "设置合适过期时间",
"分片": "按商品类别分片"
},
"持久化存储(HBase)": {
"存储": "全量Top K历史数据",
"查询": "支持时间范围查询",
"压缩": "历史数据压缩存储"
}
},
"查询服务": {
"API网关": "统一查询入口",
"缓存层": "查询结果缓存",
"降级策略": "缓存失效时的降级方案"
}
}
def implementation_example(self):
"""实现示例"""
import json
from datetime import datetime, timedelta
class RealTimeTopKSystem:
def __init__(self, k: int = 100):
self.k = k
self.redis_client = self._init_redis()
self.flink_job = self._setup_flink_job()
def _init_redis(self):
"""初始化Redis连接"""
import redis
# 实际项目中应使用连接池
return redis.Redis(host='localhost', port=6379, db=0)
def _setup_flink_job(self):
"""设置Flink实时任务"""
job_config = {
"source": {
"type": "kafka",
"topics": ["user_clicks", "user_orders"],
"group_id": "topk_processor"
},
"processing": {
"window": {
"size": "1 minute",
"slide": "10 seconds" # 10秒更新一次
},
"topk_algorithm": "SpaceSaving",
"k": self.k
},
"sink": {
"redis": {
"key_pattern": "topk:{category}:{timestamp}",
"expire_seconds": 300 # 5分钟过期
},
"dashboard": {
"websocket": "实时推送",
"api": "REST API查询"
}
}
}
return job_config
def query_topk(self, category: str = None,
time_range: str = "realtime"):
"""查询Top K"""
if time_range == "realtime":
# 查询实时数据
current_minute = datetime.now().strftime("%Y%m%d%H%M")
redis_key = f"topk:{category or 'all'}:{current_minute}"
# 从Redis获取
result = self.redis_client.zrevrange(
redis_key, 0, self.k-1, withscores=True
)
return {
'timestamp': current_minute,
'category': category,
'topk': [
{'product_id': pid.decode(), 'score': score}
for pid, score in result
]
}
else:
# 查询历史数据(从HBase/ClickHouse)
return self._query_historical_data(category, time_range)
def _query_historical_data(self, category, time_range):
"""查询历史数据"""
# 实际项目中会查询数据仓库
return {
'category': category,
'time_range': time_range,
'data_source': 'hbase/clickhouse',
'note': '历史查询实现略'
}
return RealTimeTopKSystem()
七、面试要点总结
7.1 核心问题回答模板
python
class TopKInterviewGuide:
"""Top K面试指南"""
@staticmethod
def answer_structure():
"""回答结构模板"""
return """
分布式Top K问题可以从以下几个层面回答:
1. 问题分析(30%):
- 数据规模(单机无法处理)
- 实时性要求(流式/批处理)
- 准确性要求(精确/近似)
2. 算法选择(40%):
- 精确算法:MapReduce两阶段聚合、分布式排序
- 近似算法:Count-Min Sketch、SpaceSaving
- 流处理算法:滑动窗口Top K
3. 系统设计(20%):
- 架构设计(Lambda/Kappa架构)
- 组件选择(Spark/Flink/Redis)
- 数据存储(内存/磁盘/分布式存储)
4. 优化策略(10%):
- 性能优化(减少shuffle、数据压缩)
- 容错处理(检查点、副本)
- 监控运维(指标监控、报警)
"""
@staticmethod
def common_questions():
"""常见面试问题"""
return {
"基础问题": [
"单机Top K有哪些算法?时间复杂度?",
"分布式Top K的挑战是什么?",
"什么时候选择精确算法?什么时候选择近似算法?"
],
"算法细节": [
"MapReduce两阶段聚合的具体步骤?",
"Count-Min Sketch的原理和误差分析?",
"SpaceSaving算法如何保证Top K准确性?"
],
"系统设计": [
"设计一个实时热点商品Top K系统",
"如何保证系统的可扩展性和高可用性?",
"数据倾斜怎么处理?"
],
"实践问题": [
"Spark中top()和takeOrdered()的区别?",
"Flink中如何实现滑动窗口Top K?",
"Redis Sorted Set实现Top K的优缺点?"
]
}
@staticmethod
def evaluation_criteria():
"""评价标准"""
return {
"优秀回答": [
"全面覆盖算法、系统、优化多个层面",
"能结合具体场景选择合适方案",
"了解各种方案的优缺点和适用场景",
"能讨论trade-off和设计取舍"
],
"良好回答": [
"掌握主要算法原理",
"能设计基本系统架构",
"了解常见优化手段"
],
"及格回答": [
"知道单机Top K算法",
"了解分布式的基本概念",
"能说出1-2种分布式方案"
]
}
这个分布式Top K解决方案涵盖了从理论基础到工程实践的完整内容,适用于不同级别的面试要求。在面试中,应根据问题的具体要求和时间限制,选择合适的内容进行回答。