R-Tree 创建、遍历及四大空间查询算法详解
一、R-Tree 创建算法
1. R-Tree 结构回顾
text
R-Tree节点结构:
- 每个节点有m到M个条目(M为最大容量,m为最小填充,通常 m ≤ M/2;根节点可以少于m个)
- 每个条目:叶子节点存储(对象,MBR);非叶节点存储(子节点指针,MBR)
- MBR:最小边界矩形 [min_x, min_y, max_x, max_y]
2. R-Tree 创建算法
批量加载算法(STR - Sort-Tile-Recursive)
python
def STR_bulk_load(objects, node_capacity):
    """Bulk-load an R-Tree with the Sort-Tile-Recursive (STR) algorithm.

    Intended for static data that is fully known before the tree is built.

    Args:
        objects: Sequence of items exposing ``mbr.min_x`` and ``mbr.min_y``.
            The caller's sequence is not reordered.
        node_capacity: Maximum number of entries per node (must be >= 1).

    Returns:
        Whatever ``build_tree_level`` returns for the packed leaves, or
        ``None`` when ``objects`` is empty.

    Raises:
        ValueError: If ``node_capacity`` is smaller than 1.
    """
    import math

    if node_capacity < 1:
        raise ValueError("node_capacity must be at least 1")

    # Sort a copy so the caller's list keeps its order.
    by_x = sorted(objects, key=lambda o: o.mbr.min_x)
    total = len(by_x)
    if total == 0:
        # Nothing to pack; also avoids dividing by sqrt(0) below.
        return None

    # 1. Number of leaves P needed to hold every object.
    leaf_count = math.ceil(total / node_capacity)

    # 2. Cut the X-sorted objects into S = ceil(sqrt(P)) vertical slices.
    #    Each slice holds S * node_capacity objects, so only the last leaf
    #    of a slice can be under-filled.
    slice_count = math.ceil(math.sqrt(leaf_count))
    slice_size = slice_count * node_capacity

    leaves = []
    for start in range(0, total, slice_size):
        # 3. Within a slice, order by Y.
        by_y = sorted(by_x[start:start + slice_size],
                      key=lambda o: o.mbr.min_y)
        # 4. Pack consecutive runs of node_capacity objects into leaves.
        for offset in range(0, len(by_y), node_capacity):
            leaves.append(create_leaf_node(by_y[offset:offset + node_capacity]))

    # 5. Build the upper levels from the leaves.
    return build_tree_level(leaves, node_capacity)
def build_tree_level(nodes, node_capacity):
    """Recursively pack one level of nodes into parents until a root remains.

    Args:
        nodes: Nodes of the current level; each exposes ``mbr.min_x`` and
            ``mbr.min_y``. The caller's list is not reordered.
        node_capacity: Maximum number of children per parent.

    Returns:
        The result of ``create_root_node`` once the level fits in one node.

    Raises:
        ValueError: If more than ``node_capacity`` nodes must be grouped but
            ``node_capacity`` is below 2 (the level could never shrink).
    """
    import math

    if len(nodes) <= node_capacity:
        return create_root_node(nodes)
    if node_capacity < 2:
        # With capacity 1 every parent wraps one child, so the recursion
        # would never terminate.
        raise ValueError("node_capacity must be at least 2 to build upper levels")

    # Apply the same STR tiling as for the leaves: S = ceil(sqrt(P)) slices
    # of S * node_capacity nodes each.
    parent_count = math.ceil(len(nodes) / node_capacity)
    slice_size = math.ceil(math.sqrt(parent_count)) * node_capacity

    by_x = sorted(nodes, key=lambda n: n.mbr.min_x)
    parents = []
    for start in range(0, len(by_x), slice_size):
        by_y = sorted(by_x[start:start + slice_size],
                      key=lambda n: n.mbr.min_y)
        for offset in range(0, len(by_y), node_capacity):
            parents.append(
                create_internal_node(by_y[offset:offset + node_capacity]))

    return build_tree_level(parents, node_capacity)
动态插入算法(经典R-Tree)
python
class RTree:
    """Classic dynamic R-Tree supporting one-at-a-time insertion.

    Relies on node types defined elsewhere (``LeafNode``, ``InternalNode``).
    Leaves are expected to expose ``entries`` (a list of ``(obj, mbr)``),
    internal nodes ``children``; both expose ``is_leaf``, ``mbr``, ``parent``
    and ``update_mbr()``.
    """

    def __init__(self, M=50, m=20):
        self.M = M  # maximum number of entries per node
        self.m = m  # minimum fill used as the split point
        self.root = None

    def insert(self, obj, obj_mbr):
        """Insert ``obj`` with bounding rectangle ``obj_mbr`` into the tree."""
        if self.root is None:
            self.root = LeafNode([(obj, obj_mbr)])
            return
        # 1. Pick the leaf whose MBR grows the least.
        leaf = self.choose_leaf(obj_mbr)
        # 2. Store directly when there is room.
        if len(leaf.entries) < self.M:
            leaf.add_entry(obj, obj_mbr)
            self.adjust_tree(leaf)
        else:
            # 3. Otherwise split the leaf and propagate the new sibling up.
            new_node = self.split_node(leaf, obj, obj_mbr)
            self.adjust_tree(leaf, new_node)

    def choose_leaf(self, mbr):
        """Descend from the root, always taking the child needing the least
        area enlargement, and return the leaf that is reached."""
        node = self.root
        while not node.is_leaf:
            node = min(node.children,
                       key=lambda c: self.area_increase(c.mbr, mbr))
        return node

    def area_increase(self, node_mbr, new_mbr):
        """Return how much ``node_mbr``'s area grows when it absorbs ``new_mbr``."""
        # Uses the module-level merge_mbr helper, as RStarTree does; the
        # class itself defines no merge_mbr method.
        merged = merge_mbr(node_mbr, new_mbr)
        return area(merged) - area(node_mbr)

    def split_node(self, node, obj=None, obj_mbr=None):
        """Split ``node`` in two along the X axis and return the new sibling.

        For a leaf, ``(obj, obj_mbr)`` is the overflowing entry to include.
        For an internal node, ``obj`` is the overflowing child node, if any.
        This is a simple sort-and-cut split; better strategies exist.
        """
        if node.is_leaf:
            items = list(node.entries)
            # Test the MBR, not the object: objects such as 0 or "" are
            # falsy but still have to be stored.
            if obj_mbr is not None:
                items.append((obj, obj_mbr))
            items.sort(key=lambda e: e[1].min_x)
        else:
            items = list(node.children)
            if obj is not None:
                items.append(obj)
            items.sort(key=lambda c: c.mbr.min_x)

        # Cut at the minimum fill, clamped so neither group is empty.
        split_point = max(1, min(self.m, len(items) - 1))
        group1, group2 = items[:split_point], items[split_point:]

        if node.is_leaf:
            node.entries = group1
        else:
            node.children = group1
        node.update_mbr()
        # NOTE(review): assumes the node constructor computes the MBR of the
        # entries it is given - confirm against the node classes.
        new_node = type(node)(group2)

        if not node.is_leaf:
            # Children moved between nodes must point at their new parent.
            for child in group1:
                child.parent = node
            for child in group2:
                child.parent = new_node
        return new_node

    def adjust_tree(self, node, new_node=None):
        """Propagate MBR changes (and a pending split sibling) up to the root."""
        while node is not self.root:
            parent = node.parent
            parent.update_mbr_for_child(node)
            if new_node is not None:
                if len(parent.children) < self.M:
                    parent.add_child(new_node)
                    new_node.parent = parent
                    new_node = None
                else:
                    # The parent is full: split it with the pending child
                    # included so that child is not lost.
                    new_node = self.split_node(parent, new_node, new_node.mbr)
            node = parent
        # A sibling that survives to the root means the root itself split.
        if new_node is not None:
            old_root = self.root
            self.root = InternalNode([old_root, new_node])
            old_root.parent = self.root
            new_node.parent = self.root
R*-Tree 改进插入算法
python
class RStarTree(RTree):
    """R*-Tree variant whose subtree choice also considers overlap."""

    def choose_subtree(self, mbr, level):
        """Return the node at ``level`` that should receive ``mbr``.

        Returns ``None`` for an empty tree.
        """
        if self.root is None:
            return None
        node = self.root
        while node.level != level:
            if node.level == level + 1:
                # One level above the target: use the overlap-aware choice.
                return self.choose_subtree_leaf(node, mbr)
            # Higher levels: least area enlargement.
            node = min(node.children,
                       key=lambda c: self.area_increase(c.mbr, mbr))
        return node

    def choose_subtree_leaf(self, node, mbr):
        """Pick the child of ``node`` for ``mbr``, taking overlap into account."""
        if not node.children:
            return node
        # 1. While every child still has room, least area enlargement wins.
        # NOTE(review): reads ``c.entries``, so the children are presumably
        # leaves here - confirm for callers that pass level > 0.
        if all(len(c.entries) < self.M for c in node.children):
            return min(node.children,
                       key=lambda c: self.area_increase(c.mbr, mbr))
        # 2. Otherwise minimise the growth in overlap with the siblings.
        return min(node.children,
                   key=lambda c: self.overlap_increase(c, mbr, node.children))

    def overlap_increase(self, candidate, new_mbr, siblings):
        """Return the growth in overlap between ``candidate`` and its siblings
        if ``candidate`` were enlarged to cover ``new_mbr``."""
        # The enlarged MBR does not depend on the sibling; compute it once.
        merged_mbr = merge_mbr(candidate.mbr, new_mbr)
        old_overlap = 0
        new_overlap = 0
        for sibling in siblings:
            # Identity check: only the candidate itself is skipped, even if
            # the node type defines value equality.
            if sibling is candidate:
                continue
            old_overlap += overlap_area(candidate.mbr, sibling.mbr)
            new_overlap += overlap_area(merged_mbr, sibling.mbr)
        return new_overlap - old_overlap
二、R-Tree 遍历算法
1. 深度优先遍历(DFS)
python
def dfs_traverse(node, level=0):
    """Print the subtree rooted at ``node`` in depth-first (pre-order) order.

    Each node is printed with one space of indentation per level; the entries
    of a leaf follow directly beneath it. ``level`` is the depth assigned to
    ``node`` itself.
    """
    pending = [(node, level)]
    while pending:
        current, depth = pending.pop()
        pad = " " * depth
        print(f"{pad}Level {depth}: MBR={current.mbr}")
        if current.is_leaf:
            for item, box in current.entries:
                print(f"{pad} Object: {item}, MBR={box}")
            continue
        # Push in reverse so the first child is popped (and printed) first.
        pending.extend(
            (child, depth + 1) for child in reversed(list(current.children)))
2. 广度优先遍历(BFS)
python
from collections import deque
def bfs_traverse(root):
    """Print the tree level by level (breadth-first), starting at ``root``.

    Nodes are indented by one space per level; the entries of a leaf are
    printed immediately after the leaf itself.
    """
    frontier = [root]
    depth = 0
    while frontier:
        pad = " " * depth
        upcoming = []
        for current in frontier:
            print(f"{pad}Level {depth}: MBR={current.mbr}")
            if current.is_leaf:
                for item, box in current.entries:
                    print(f"{pad} Object: {item}, MBR={box}")
            else:
                # Children are visited only after the whole current level.
                upcoming.extend(current.children)
        frontier = upcoming
        depth += 1
3. 层级统计遍历
python
def level_statistics(root):
    """Collect per-level node statistics for the tree rooted at ``root``.

    Returns a dict keyed by level (root is 0). Each value is a dict with
    ``'count'`` and ``'total_nodes'`` (both the number of nodes on that level)
    and ``'avg_entries'`` (running mean of entries per leaf or children per
    internal node).
    """
    stats = {}
    frontier = [root]
    depth = 0
    while frontier:
        summary = stats.setdefault(
            depth, {'count': 0, 'avg_entries': 0, 'total_nodes': 0})
        upcoming = []
        for current in frontier:
            summary['count'] += 1
            if current.is_leaf:
                size = len(current.entries)
            else:
                size = len(current.children)
                upcoming.extend(current.children)
            # Fold this node into the running mean for its level.
            seen = summary['total_nodes']
            summary['avg_entries'] = (
                summary['avg_entries'] * seen + size) / (seen + 1)
            summary['total_nodes'] = seen + 1
        frontier = upcoming
        depth += 1
    return stats
三、四大空间查询算法实现
类型1:点查询(Point Query)
算法1:精确点查询
python
def point_query_exact(root, query_point):
    """Return every object whose MBR contains ``query_point``.

    Subtrees whose node MBR does not contain the point are skipped. Results
    come back in depth-first visiting order.
    """
    hits = []
    pending = [root]
    while pending:
        current = pending.pop()
        # Prune: nothing below can contain the point if the node MBR does not.
        if not contains_point(current.mbr, query_point):
            continue
        if current.is_leaf:
            hits.extend(
                item for item, box in current.entries
                if contains_point(box, query_point))
        else:
            # Reverse so children are examined in their stored order.
            pending.extend(reversed(list(current.children)))
    return hits
def contains_point(mbr, point):
    """Return True when ``point`` (an ``(x, y)`` pair) lies inside ``mbr`` or
    on its boundary."""
    x, y = point
    within_x = mbr.min_x <= x <= mbr.max_x
    if not within_x:
        return False
    return mbr.min_y <= y <= mbr.max_y
算法2:最近点查询
python
def nearest_point_query(root, query_point, k=1):
    """Return the ``k`` objects closest to ``query_point``.

    Runs a best-first search: nodes and objects share one priority queue
    ordered by their minimum distance to the query point, so objects leave
    the queue in increasing distance order.

    Args:
        root: Root node of the tree, or ``None`` for an empty tree.
        query_point: ``(x, y)`` pair.
        k: Number of neighbours wanted.

    Returns:
        A list of at most ``k`` ``(obj, distance)`` pairs, nearest first.
        Distances are measured to each object's MBR.
    """
    import heapq
    import itertools

    if root is None or k <= 0:
        return []

    # The counter breaks distance ties so heapq never has to compare nodes
    # or objects, which may not be orderable.
    tie = itertools.count()
    # Queue items: (distance, tie-breaker, is_node, payload).
    frontier = [(0, next(tie), True, root)]
    nearest = []

    while frontier and len(nearest) < k:
        dist, _, is_node, payload = heapq.heappop(frontier)
        if not is_node:
            # Everything still queued is at least this far away, so this
            # object is the next nearest.
            nearest.append((payload, dist))
            continue
        if payload.is_leaf:
            for obj, obj_mbr in payload.entries:
                obj_dist = point_to_mbr_distance(query_point, obj_mbr)
                heapq.heappush(frontier, (obj_dist, next(tie), False, obj))
        else:
            for child in payload.children:
                child_dist = point_to_mbr_distance(query_point, child.mbr)
                heapq.heappush(frontier, (child_dist, next(tie), True, child))

    return nearest
类型2:范围查询(Range Query)
算法1:矩形范围查询
python
def range_query_rectangle(root, query_rect):
    """Return every object that intersects ``query_rect``.

    Filtering happens in two steps: a cheap MBR test on nodes and entries,
    then an exact geometry test on the surviving objects.
    """
    matches = []
    pending = [root]
    while pending:
        current = pending.pop()
        # Skip whole subtrees whose MBR misses the query rectangle.
        if not mbr_intersect(current.mbr, query_rect):
            continue
        if not current.is_leaf:
            # Reverse so children are examined in their stored order.
            pending.extend(reversed(list(current.children)))
            continue
        for item, box in current.entries:
            if not mbr_intersect(box, query_rect):
                continue
            # The MBR test is only a filter; confirm on the real geometry.
            if geometry_intersect(item.geometry, query_rect):
                matches.append(item)
    return matches
def mbr_intersect(mbr1, mbr2):
    """Return True when the two MBRs overlap or touch."""
    # The rectangles are disjoint as soon as one lies entirely to one side
    # of the other on either axis.
    if mbr1.max_x < mbr2.min_x:
        return False
    if mbr1.min_x > mbr2.max_x:
        return False
    if mbr1.max_y < mbr2.min_y:
        return False
    if mbr1.min_y > mbr2.max_y:
        return False
    return True
算法2:圆形范围查询
python
def range_query_circle(root, center, radius):
"""查找在圆形范围内的所有对象"""
# 将圆形查询转换为MBR查询 + 距离过滤
query_mbr = (
center[0] - radius, center[1] - radius,
center[0] + radius, center[1] + radius
)
results = []
candidates = range_query_rectangle(root, query_mbr)
for obj in candidates:
# 精确距离计算
if distance(obj.geometry, center) <= radius:
results.append(obj)
return results
算法3:多边形范围查询
python
def range_query_polygon(root, polygon):
    """Return the objects that fall inside or intersect ``polygon``.

    Nodes are filtered first against the polygon's bounding box and then
    classified against the polygon itself by ``mbr_polygon_test``.

    Args:
        root: Root node of the tree.
        polygon: Query polygon, in whatever form the geometry helpers accept.

    Returns:
        List of matching objects.
    """
    poly_mbr = polygon_bounding_box(polygon)
    results = []

    def collect_all(node):
        """Add every object stored beneath ``node`` without further tests."""
        stack = [node]
        while stack:
            current = stack.pop()
            if current.is_leaf:
                results.extend(obj for obj, _ in current.entries)
            else:
                stack.extend(reversed(list(current.children)))

    def search(node):
        # Coarse filter against the polygon's bounding box.
        if not mbr_intersect(node.mbr, poly_mbr):
            return
        relationship = mbr_polygon_test(node.mbr, polygon)
        if relationship == "inside":
            # The node MBR lies inside the polygon, and every descendant MBR
            # lies inside the node MBR, so the whole subtree qualifies.
            collect_all(node)
        elif relationship == "intersect":
            if node.is_leaf:
                for obj, obj_mbr in node.entries:
                    if polygon_intersect(obj.geometry, polygon):
                        results.append(obj)
            else:
                for child in node.children:
                    search(child)
        # Any other result means the MBR is outside the polygon. This can
        # happen even after the bounding-box filter, so the branch is simply
        # dropped.

    search(root)
    return results
类型3:最近邻查询(Nearest Neighbor Query)
算法1:深度优先最近邻(DF-NN)
python
def nearest_neighbor_dfs(root, query_point, k=1):
    """Depth-first (branch-and-bound) k-nearest-neighbour search.

    Args:
        root: Root node of the tree, or ``None`` for an empty tree.
        query_point: ``(x, y)`` pair.
        k: Number of neighbours wanted.

    Returns:
        A list of at most ``k`` ``(obj, distance)`` pairs, nearest first.
        Distances are measured to each object's MBR.
    """
    import heapq
    import itertools

    if root is None or k <= 0:
        return []

    # The counter breaks distance ties so heapq never compares the objects.
    tie = itertools.count()
    # Max-heap (distances negated) of the best k candidates so far:
    # (-distance, tie-breaker, obj). best[0] is the current k-th nearest.
    best = []

    def search(node):
        if node.is_leaf:
            for obj, obj_mbr in node.entries:
                # Same metric as the pruning bound below, so the bound is
                # always valid for the candidates it is compared against.
                dist = point_to_mbr_distance(query_point, obj_mbr)
                if len(best) < k:
                    heapq.heappush(best, (-dist, next(tie), obj))
                elif dist < -best[0][0]:
                    heapq.heapreplace(best, (-dist, next(tie), obj))
            return

        # Visit children nearest-first so the bound tightens early.
        ranked = sorted(
            ((point_to_mbr_distance(query_point, child.mbr), child)
             for child in node.children),
            key=lambda pair: pair[0])
        for dist, child in ranked:
            # Children are in ascending order and the bound only shrinks,
            # so once one child is prunable all later ones are too.
            if len(best) >= k and dist >= -best[0][0]:
                break
            search(child)

    search(root)
    ordered = sorted(best, key=lambda item: (-item[0], item[1]))
    return [(obj, -neg_dist) for neg_dist, _, obj in ordered]
算法2:最佳优先最近邻(Best-First NN)
python
def nearest_neighbor_best_first(root, query_point, k=1):
    """Best-first k-nearest-neighbour search.

    Nodes are expanded in order of their minimum possible distance to the
    query point; the search stops once the closest unexpanded node cannot
    beat the current k-th neighbour.

    Args:
        root: Root node of the tree, or ``None`` for an empty tree.
        query_point: ``(x, y)`` pair.
        k: Number of neighbours wanted.

    Returns:
        A list of at most ``k`` ``(obj, distance)`` pairs, nearest first.
        Distances are measured to each object's MBR.
    """
    import heapq
    import itertools

    if root is None or k <= 0:
        return []

    # The counter breaks distance ties so heapq never compares nodes or
    # objects. Ties are common: every MBR containing the query point has
    # distance 0.
    tie = itertools.count()
    # Min-heap of nodes to expand: (min possible distance, tie-breaker, node).
    frontier = [(0, next(tie), root)]
    # Max-heap (distances negated) of the best k candidates so far.
    best = []

    while frontier:
        min_dist, _, node = heapq.heappop(frontier)
        # No remaining node can contain anything closer than the k-th best.
        if len(best) >= k and min_dist >= -best[0][0]:
            break
        if node.is_leaf:
            for obj, obj_mbr in node.entries:
                actual_dist = point_to_mbr_distance(query_point, obj_mbr)
                if len(best) < k:
                    heapq.heappush(best, (-actual_dist, next(tie), obj))
                elif actual_dist < -best[0][0]:
                    heapq.heapreplace(best, (-actual_dist, next(tie), obj))
        else:
            for child in node.children:
                child_min_dist = point_to_mbr_distance(query_point, child.mbr)
                heapq.heappush(frontier, (child_min_dist, next(tie), child))

    ordered = sorted(best, key=lambda item: (-item[0], item[1]))
    return [(obj, -neg_dist) for neg_dist, _, obj in ordered]
算法3:增量最近邻查询
python
import heapq
import itertools


class IncrementalNN:
    """Incremental nearest-neighbour iterator over an R-Tree.

    Each ``get_next()`` call returns the next-closest object to the query
    point, so a caller can stop as soon as it has enough neighbours.
    """

    def __init__(self, root, query_point):
        self.root = root
        self.query_point = query_point
        # The counter breaks distance ties so heapq never compares nodes or
        # objects.
        self._tie = itertools.count()
        # Shared queue of nodes and objects:
        # (distance, tie-breaker, is_node, payload).
        self.heap = []
        if root is not None:
            heapq.heappush(self.heap, (0, next(self._tie), True, root))
        # Every (obj, distance) pair handed out so far, in order.
        self.results = []

    def get_next(self):
        """Return the next ``(obj, distance)`` pair, or ``None`` when the
        tree is exhausted. Distances are measured to each object's MBR."""
        while self.heap:
            dist, _, is_node, payload = heapq.heappop(self.heap)
            if not is_node:
                # Everything still queued is at least this far away.
                self.results.append((payload, dist))
                return payload, dist
            if payload.is_leaf:
                # Queue all entries instead of returning the first one, so
                # none is lost and they come out in distance order.
                for obj, obj_mbr in payload.entries:
                    obj_dist = point_to_mbr_distance(self.query_point, obj_mbr)
                    heapq.heappush(
                        self.heap, (obj_dist, next(self._tie), False, obj))
            else:
                for child in payload.children:
                    child_dist = point_to_mbr_distance(
                        self.query_point, child.mbr)
                    heapq.heappush(
                        self.heap, (child_dist, next(self._tie), True, child))
        return None  # no more results
类型4:空间连接查询(Spatial Join)
算法1:R-Tree嵌套循环连接
python
def spatial_join_nested_loop(tree1, tree2, predicate):
    """Join two R-Trees by recursive nested-loop descent.

    Args:
        tree1: First tree (exposes ``root``).
        tree2: Second tree (exposes ``root``).
        predicate: ``'intersects'``, ``'contains'``, ``'within'`` or
            ``'distance < d'``.

    Returns:
        List of ``(obj1, obj2)`` pairs that satisfy the predicate.
    """
    # A node MBR is the union of everything beneath it, so an object in one
    # subtree can contain an object in the other without the node MBRs
    # containing each other. For containment predicates the only safe
    # node-level filter is intersection; the exact predicate is applied to
    # object MBRs at the leaves.
    if predicate in ('contains', 'within'):
        node_predicate = 'intersects'
    else:
        node_predicate = predicate

    results = []

    def traverse_and_join(node1, node2):
        # Prune node pairs that cannot contain a matching object pair.
        if not satisfy_predicate(node1.mbr, node2.mbr, node_predicate):
            return
        if node1.is_leaf and node2.is_leaf:
            # Leaf pair: MBR filter first, then exact geometry.
            for obj1, mbr1 in node1.entries:
                for obj2, mbr2 in node2.entries:
                    if not satisfy_predicate(mbr1, mbr2, predicate):
                        continue
                    if geometry_predicate(obj1.geometry, obj2.geometry,
                                          predicate):
                        results.append((obj1, obj2))
        elif not node1.is_leaf and not node2.is_leaf:
            for child1 in node1.children:
                for child2 in node2.children:
                    traverse_and_join(child1, child2)
        elif not node1.is_leaf:
            # node1 is internal, node2 is a leaf: descend on the left only.
            for child in node1.children:
                traverse_and_join(child, node2)
        else:
            # node1 is a leaf, node2 is internal: descend on the right only.
            for child in node2.children:
                traverse_and_join(node1, child)

    traverse_and_join(tree1.root, tree2.root)
    return results
算法2:同步遍历连接(更高效)
python
def spatial_join_sync_traversal(tree1, tree2, predicate):
    """Join two R-Trees by descending both in lock-step with an explicit stack.

    Args:
        tree1: First tree (exposes ``root``).
        tree2: Second tree (exposes ``root``).
        predicate: ``'intersects'``, ``'contains'``, ``'within'`` or
            ``'distance < d'``.

    Returns:
        List of ``(obj1, obj2)`` pairs that satisfy the predicate.
    """
    # Node MBRs cover whole subtrees, so containment between them is not a
    # valid pruning test; intersection is the safe node-level filter for
    # 'contains' and 'within'.
    if predicate in ('contains', 'within'):
        node_predicate = 'intersects'
    else:
        node_predicate = predicate

    results = []
    stack = [(tree1.root, tree2.root)]
    while stack:
        node1, node2 = stack.pop()
        # MBR-level pruning.
        if not satisfy_predicate(node1.mbr, node2.mbr, node_predicate):
            continue
        if node1.is_leaf and node2.is_leaf:
            for obj1, mbr1 in node1.entries:
                for obj2, mbr2 in node2.entries:
                    if not satisfy_predicate(mbr1, mbr2, predicate):
                        continue
                    if geometry_predicate(obj1.geometry, obj2.geometry,
                                          predicate):
                        results.append((obj1, obj2))
        elif not node1.is_leaf and not node2.is_leaf:
            # Rank the surviving child pairs.
            pairs = []
            for child1 in node1.children:
                for child2 in node2.children:
                    if satisfy_predicate(child1.mbr, child2.mbr,
                                         node_predicate):
                        priority = compute_priority(child1.mbr, child2.mbr,
                                                    predicate)
                        pairs.append((priority, child1, child2))
            # The stack is LIFO: push in ascending priority so the
            # highest-priority pair is popped first.
            pairs.sort(key=lambda x: x[0])
            for _, child1, child2 in pairs:
                stack.append((child1, child2))
        elif not node1.is_leaf:
            # node1 is internal, node2 is a leaf.
            for child1 in node1.children:
                if satisfy_predicate(child1.mbr, node2.mbr, node_predicate):
                    stack.append((child1, node2))
        else:
            # node1 is a leaf, node2 is internal.
            for child2 in node2.children:
                if satisfy_predicate(node1.mbr, child2.mbr, node_predicate):
                    stack.append((node1, child2))
    return results
算法3:基于距离的空间连接
python
def distance_join(tree1, tree2, max_distance):
    """Find every object pair whose distance is at most ``max_distance``.

    Args:
        tree1: First tree (exposes ``root``).
        tree2: Second tree (exposes ``root``).
        max_distance: Inclusive distance threshold.

    Returns:
        List of ``(obj1, obj2, actual_distance)`` triples.
    """
    results = []
    stack = [(tree1.root, tree2.root)]
    while stack:
        node1, node2 = stack.pop()
        # The MBR distance is a lower bound for every object pair beneath.
        if mbr_distance(node1.mbr, node2.mbr) > max_distance:
            continue  # prune
        if node1.is_leaf and node2.is_leaf:
            for obj1, mbr1 in node1.entries:
                for obj2, mbr2 in node2.entries:
                    if mbr_distance(mbr1, mbr2) > max_distance:
                        continue
                    actual_dist = distance(obj1.geometry, obj2.geometry)
                    if actual_dist <= max_distance:
                        results.append((obj1, obj2, actual_dist))
        elif not node1.is_leaf and not node2.is_leaf:
            pairs = []
            for child1 in node1.children:
                for child2 in node2.children:
                    child_dist = mbr_distance(child1.mbr, child2.mbr)
                    if child_dist <= max_distance:
                        pairs.append((child_dist, child1, child2))
            # The stack is LIFO: push farthest first so the nearest pair is
            # popped, and therefore processed, first.
            pairs.sort(key=lambda x: x[0], reverse=True)
            for _, child1, child2 in pairs:
                stack.append((child1, child2))
        elif not node1.is_leaf:
            for child1 in node1.children:
                if mbr_distance(child1.mbr, node2.mbr) <= max_distance:
                    stack.append((child1, node2))
        else:
            for child2 in node2.children:
                if mbr_distance(node1.mbr, child2.mbr) <= max_distance:
                    stack.append((node1, child2))
    return results
四、辅助函数实现
python
# 几何计算辅助函数
import math
def mbr_distance(mbr1, mbr2):
    """Return the minimum Euclidean distance between two MBRs.

    The result is always a float; it is ``0.0`` when the rectangles overlap
    or touch.
    """
    # Gap along each axis, clamped to zero when the projections overlap.
    # The clamp already covers the intersecting case, so no separate
    # intersection test is needed.
    dx = max(mbr1.min_x - mbr2.max_x, mbr2.min_x - mbr1.max_x, 0)
    dy = max(mbr1.min_y - mbr2.max_y, mbr2.min_y - mbr1.max_y, 0)
    return math.hypot(dx, dy)
def point_to_mbr_distance(point, mbr):
    """Return the minimum Euclidean distance from ``point`` (an ``(x, y)``
    pair) to ``mbr``; 0 when the point is inside or on the boundary."""
    x, y = point
    inside = (mbr.min_x <= x <= mbr.max_x and
              mbr.min_y <= y <= mbr.max_y)
    if inside:
        return 0
    # Per-axis gap to the nearest edge, zero where the point is in range.
    gap_x = max(mbr.min_x - x, 0, x - mbr.max_x)
    gap_y = max(mbr.min_y - y, 0, y - mbr.max_y)
    return math.sqrt(gap_x * gap_x + gap_y * gap_y)
def overlap_area(mbr1, mbr2):
    """Return the area shared by the two MBRs; 0 when they do not intersect."""
    if mbr_intersect(mbr1, mbr2):
        left = max(mbr1.min_x, mbr2.min_x)
        right = min(mbr1.max_x, mbr2.max_x)
        bottom = max(mbr1.min_y, mbr2.min_y)
        top = min(mbr1.max_y, mbr2.max_y)
        return (right - left) * (top - bottom)
    return 0
def satisfy_predicate(mbr1, mbr2, predicate):
    """Return whether two MBRs satisfy a spatial predicate.

    Args:
        mbr1: First bounding rectangle.
        mbr2: Second bounding rectangle.
        predicate: ``'intersects'``, ``'contains'`` (mbr1 contains mbr2),
            ``'within'`` (mbr1 lies within mbr2), or a distance predicate
            such as ``'distance < 10'`` or ``'distance <= 10'``.

    Returns:
        True when the predicate holds for the MBRs; False for a predicate
        name that is not recognised.

    Raises:
        ValueError: If a distance predicate has no parsable threshold.
    """
    if predicate == 'intersects':
        return mbr_intersect(mbr1, mbr2)
    if predicate == 'contains':
        return (mbr1.min_x <= mbr2.min_x and mbr1.max_x >= mbr2.max_x and
                mbr1.min_y <= mbr2.min_y and mbr1.max_y >= mbr2.max_y)
    if predicate == 'within':
        return (mbr2.min_x <= mbr1.min_x and mbr2.max_x >= mbr1.max_x and
                mbr2.min_y <= mbr1.min_y and mbr2.max_y >= mbr1.max_y)
    if predicate.startswith('distance'):
        # Accept both '<' and '<='. The MBR distance is only a lower bound
        # on the object distance, so the filter is inclusive in both cases.
        _, _, threshold = predicate.partition('<')
        threshold = threshold.lstrip('=').strip()
        try:
            max_dist = float(threshold)
        except ValueError:
            raise ValueError(
                f"malformed distance predicate: {predicate!r}") from None
        return mbr_distance(mbr1, mbr2) <= max_dist
    return False
五、性能优化策略
1. 查询优化技术
python
def optimized_range_query(root, query_rect, use_mbr_only=False):
    """Range query with MBR pre-filtering and batched result collection.

    Args:
        root: Root node of the tree.
        query_rect: Query rectangle exposing ``min_x``, ``min_y``, ``max_x``
            and ``max_y``.
        use_mbr_only: When True, an object matches as soon as its MBR
            intersects the query; the exact geometry test is skipped.

    Returns:
        List of matching objects.
    """
    results = []
    batch_size = 1000  # flush threshold for the intermediate batch

    def search_batch(node, query_rect, result_batch):
        if not mbr_intersect(node.mbr, query_rect):
            return
        if node.is_leaf:
            for obj, obj_mbr in node.entries:
                # A leaf that intersects the query can still hold entries
                # that do not, so every entry's own MBR is checked. This is
                # also the cheap filter ahead of the exact geometry test.
                if not mbr_intersect(obj_mbr, query_rect):
                    continue
                if use_mbr_only or geometry_intersect(obj.geometry,
                                                      query_rect):
                    result_batch.append(obj)
            # Move a full batch into the final result list.
            if len(result_batch) >= batch_size:
                results.extend(result_batch)
                result_batch.clear()
        else:
            # Visit children with the largest overlap with the query first.
            children_sorted = sorted(
                node.children,
                key=lambda c: overlap_area(c.mbr, query_rect),
                reverse=True)
            for child in children_sorted:
                search_batch(child, query_rect, result_batch)

    current_batch = []
    search_batch(root, query_rect, current_batch)
    # Flush whatever is left in the last partial batch.
    if current_batch:
        results.extend(current_batch)
    return results
2. 并行查询处理
python
from concurrent.futures import ThreadPoolExecutor
def parallel_range_query(root, query_rect, max_workers=4):
    """Answer a rectangle range query by searching the root's children in a
    thread pool.

    A leaf root is queried directly. Otherwise each child subtree is handed
    to ``range_query_rectangle`` in its own task and the partial results are
    concatenated in child order.
    """
    if root.is_leaf:
        return range_query_rectangle(root, query_rect)

    merged = []
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        pending = [pool.submit(range_query_rectangle, child, query_rect)
                   for child in root.children]
        # Collect in submission order so the output order is deterministic.
        for task in pending:
            merged.extend(task.result())
    return merged
六、总结对比表
| 查询类型 | 算法选择 | 时间复杂度(平均情况;MBR重叠严重时最坏可退化为 O(N)) | 适用场景 |
|---|---|---|---|
| 点查询 | 精确点查询 | O(logᴍ N) | 精确位置查找 |
| 范围查询 | 矩形范围查询 | O(logᴍ N + K) | 区域数据分析 |
| 最近邻查询 | 最佳优先NN | O(logᴍ N) | 位置服务、推荐系统 |
| 空间连接 | 同步遍历连接 | O(N log N) | 空间关系分析 |
关键优化原则:
- 尽早剪枝:在MBR级别尽可能排除不相关分支
- 最佳优先:优先处理更可能包含结果的节点
- 批量处理:减少函数调用和内存分配开销
- 并行化:充分利用多核CPU处理独立子树
这些算法构成了空间数据库查询优化的核心,在实际系统中通常结合多种策略以获得最佳性能。