一、相似度搜索基础
1.1 向量相似度度量方法
python
import numpy as np
from scipy.spatial import distance
class SimilarityMetrics:
"""相似度计算度量方法"""
@staticmethod
def cosine_similarity(v1, v2):
"""余弦相似度 - 最常用"""
dot_product = np.dot(v1, v2)
norm_v1 = np.linalg.norm(v1)
norm_v2 = np.linalg.norm(v2)
return dot_product / (norm_v1 * norm_v2)
@staticmethod
def euclidean_distance(v1, v2):
"""欧氏距离"""
return np.linalg.norm(v1 - v2)
@staticmethod
def inner_product(v1, v2):
"""内积 - 高性能但需归一化"""
return np.dot(v1, v2)
@staticmethod
def jaccard_similarity(set1, set2):
"""Jaccard相似度 - 适合稀疏向量"""
intersection = len(set1.intersection(set2))
union = len(set1.union(set2))
return intersection / union if union != 0 else 0
@staticmethod
def angular_distance(v1, v2):
"""角度距离"""
cosine = SimilarityMetrics.cosine_similarity(v1, v2)
# 先将cosine裁剪到[-1, 1],arccos/π 将结果映射到[0, 1]范围
cosine = np.clip(cosine, -1.0, 1.0)
return np.arccos(cosine) / np.pi
@staticmethod
def mips_score(v1, v2):
"""最大内积搜索分数"""
# 先对向量归一化,然后计算内积
v1_norm = v1 / np.linalg.norm(v1)
v2_norm = v2 / np.linalg.norm(v2)
return np.dot(v1_norm, v2_norm)
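下面是一个最小的使用示例(向量为假设的示例数据,假定上面的 SimilarityMetrics 类已定义),用于对比同一对向量在不同度量下的取值:
python
import numpy as np

v1 = np.array([1.0, 2.0, 3.0])
v2 = np.array([2.0, 4.0, 6.5])

print("cosine:", SimilarityMetrics.cosine_similarity(v1, v2))
print("euclidean:", SimilarityMetrics.euclidean_distance(v1, v2))
print("inner_product:", SimilarityMetrics.inner_product(v1, v2))
print("angular:", SimilarityMetrics.angular_distance(v1, v2))
# Jaccard 作用于集合,例如稀疏向量的非零维度下标集合
print("jaccard:", SimilarityMetrics.jaccard_similarity({0, 1, 2}, {1, 2, 3}))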
1.2 近似最近邻(ANN)搜索问题
python
class ANNProblem:
"""近似最近邻搜索问题定义"""
def __init__(self, vectors, queries, k=10, metric='cosine'):
"""
Args:
vectors: 数据库向量 [N, D]
queries: 查询向量 [M, D]
k: 返回的最近邻数量
metric: 距离度量方法
"""
self.vectors = vectors
self.queries = queries
self.k = k
self.metric = metric
self.dimension = vectors.shape[1]
self.n_vectors = vectors.shape[0]
def ground_truth(self):
"""计算精确最近邻(暴力搜索)- 用于评估"""
from sklearn.neighbors import NearestNeighbors
if self.metric == 'cosine':
# 余弦相似度转换为距离
neigh = NearestNeighbors(n_neighbors=self.k,
metric='cosine')
elif self.metric == 'euclidean':
neigh = NearestNeighbors(n_neighbors=self.k,
metric='euclidean')
else:
raise ValueError(f"Unsupported metric: {self.metric}")
neigh.fit(self.vectors)
distances, indices = neigh.kneighbors(self.queries)
return indices, distances
def recall_rate(self, approx_indices, exact_indices):
"""计算召回率"""
total_correct = 0
total_neighbors = 0
for i in range(len(approx_indices)):
approx_set = set(approx_indices[i])
exact_set = set(exact_indices[i])
total_correct += len(approx_set.intersection(exact_set))
total_neighbors += len(exact_set)
return total_correct / total_neighbors
def latency_profile(self, search_func):
"""性能分析"""
import time
latencies = []
for query in self.queries:
start = time.perf_counter()
_ = search_func(query)
latencies.append(time.perf_counter() - start)
return {
'p50': np.percentile(latencies, 50),
'p95': np.percentile(latencies, 95),
'p99': np.percentile(latencies, 99),
'mean': np.mean(latencies),
'std': np.std(latencies)
}
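下面给出一个示例(数据为随机生成的假设数据,假定上面的 ANNProblem 类已定义),演示如何用暴力搜索作为"近似算法"的占位,走通召回率与延迟评估流程:
python
import numpy as np

rng = np.random.default_rng(42)
vectors = rng.standard_normal((10000, 128)).astype(np.float32)
queries = rng.standard_normal((100, 128)).astype(np.float32)

problem = ANNProblem(vectors, queries, k=10, metric='euclidean')
exact_indices, _ = problem.ground_truth()

def toy_search(query):
    # 演示用的占位搜索函数:实际中应替换为 ANN 索引的查询接口
    dists = np.linalg.norm(vectors - query, axis=1)
    return np.argsort(dists)[:10]

approx_indices = np.array([toy_search(q) for q in queries])
print("recall:", problem.recall_rate(approx_indices, exact_indices))
print("latency:", problem.latency_profile(toy_search))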
二、索引构建算法
2.1 基于树的索引
python
class KDTreeIndex:
"""KD-Tree索引实现"""
class Node:
def __init__(self, point, left=None, right=None, axis=None):
self.point = point
self.left = left
self.right = right
self.axis = axis
def __init__(self, dimension, leaf_size=10):
self.dimension = dimension
self.leaf_size = leaf_size
self.root = None
def build(self, points, indices=None, depth=0):
"""递归构建KD-Tree(首次调用时 indices 为 None,构建完成后返回根节点)"""
if indices is None:
indices = np.arange(len(points))
n = len(indices)
# 空子树
if n == 0:
return None
# 叶子节点:只存储单个点,避免 point 为 None 导致搜索出错
# (简化处理:leaf_size > 1 的多点叶子在搜索中未实现,这里退化为单点叶子)
if n == 1:
return self.Node(point=points[indices[0]], axis=None)
# 选择划分维度
axis = depth % self.dimension
# 按选定维度排序并选择中位数
sorted_idx = indices[np.argsort(points[indices, axis])]
median_idx = len(sorted_idx) // 2
# 递归构建左右子树
node = self.Node(
point=points[sorted_idx[median_idx]],
axis=axis
)
node.left = self.build(points,
sorted_idx[:median_idx],
depth + 1)
node.right = self.build(points,
sorted_idx[median_idx + 1:],
depth + 1)
return node
def knn_search(self, query, k=1, max_distance=float('inf')):
"""k近邻搜索"""
from heapq import heappush, heappop, heappushpop
def search_node(node, query, heap, depth=0):
if node is None:
return
# 计算当前节点距离
dist = np.linalg.norm(node.point - query)
if dist < max_distance:
# 使用负距离构建最大堆
if len(heap) < k:
heappush(heap, (-dist, node.point))
elif -heap[0][0] > dist:
heappushpop(heap, (-dist, node.point))
# 确定搜索方向
axis = node.axis
if axis is not None:
if query[axis] < node.point[axis]:
nearer = node.left
further = node.right
else:
nearer = node.right
further = node.left
# 搜索更近的分支
search_node(nearer, query, heap, depth + 1)
# 如果超球面与超平面相交,搜索另一个分支
if len(heap) < k or abs(query[axis] - node.point[axis]) < -heap[0][0]:
search_node(further, query, heap, depth + 1)
heap = []
search_node(self.root, query, heap)
# 按距离排序返回结果
results = sorted([(-d, p) for d, p in heap])
distances = [d for d, _ in results]
points = [p for _, p in results]
return distances, points
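一个简单的构建与查询示例(随机数据,假定上面的 KDTreeIndex 类已定义;注意 build 返回根节点,需要保存到 tree.root 后才能搜索):
python
import numpy as np

rng = np.random.default_rng(0)
points = rng.standard_normal((1000, 8))

tree = KDTreeIndex(dimension=8)
tree.root = tree.build(points)
distances, neighbors = tree.knn_search(points[0], k=5)
print(distances)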
2.2 基于图的索引(HNSW)
python
class HNSWIndex:
"""分层可导航小世界图索引"""
class Node:
def __init__(self, id, vector, level):
self.id = id
self.vector = vector
self.level = level
self.neighbors = [] # 每层的邻居列表
def __init__(self, M=16, efConstruction=200, efSearch=50, mL=1/16):
"""
Args:
M: 每个节点的最大连接数
efConstruction: 构建时的动态候选列表大小
efSearch: 搜索时的动态候选列表大小
mL: 层级分配参数
"""
self.M = M
self.efConstruction = efConstruction
self.efSearch = efSearch
self.mL = mL
self.nodes = []
self.max_level = 0
self.enter_point = None
def random_level(self):
"""生成随机层级 - 指数分布"""
import random
level = 0
while random.random() < self.mL and level < 30: # 限制最大层级
level += 1
return level
def search_layer(self, q, ep, ef, layer):
"""在指定层搜索"""
from heapq import heappush, heappop, heappushpop
visited = set([ep.id])
candidates = [] # 最小堆
heappush(candidates, (self.distance(q, ep.vector), ep))
results = [] # 最大堆
heappush(results, (-candidates[0][0], ep))
while candidates:
dist, node = heappop(candidates)
# 如果当前节点比结果中最差节点还差,停止搜索
if dist > -results[0][0] and len(results) >= ef:
break
for neighbor in node.neighbors[layer]:
if neighbor.id not in visited:
visited.add(neighbor.id)
dist_q = self.distance(q, neighbor.vector)
# 如果结果未满或距离更小
if len(results) < ef:
heappush(results, (-dist_q, neighbor))
heappush(candidates, (dist_q, neighbor))
elif dist_q < -results[0][0]:
heappushpop(results, (-dist_q, neighbor))
heappush(candidates, (dist_q, neighbor))
# 按距离排序返回
return sorted([(-d, n) for d, n in results])
def insert(self, vector):
"""插入新节点"""
node_id = len(self.nodes)
level = self.random_level()
node = self.Node(node_id, vector, level)
node.neighbors = [[] for _ in range(level + 1)]
# 如果是第一个节点
if not self.nodes:
self.nodes.append(node)
self.enter_point = node
self.max_level = level
return node_id
# 从最高层开始搜索入口点
ep = self.enter_point
L = min(self.max_level, level)
for l in range(self.max_level, L, -1):
# 每层搜索最近的节点
nearest = self.search_layer(vector, ep, 1, l)
if nearest:
ep = nearest[0][1]
# 逐层插入
for l in range(L, -1, -1):
# 搜索efConstruction个候选节点
candidates = self.search_layer(vector, ep, self.efConstruction, l)
# 选择最近的M个邻居
neighbors = self.select_neighbors(candidates, self.M, l)
# 设置邻居连接
node.neighbors[l] = neighbors
# 添加反向连接
for neighbor in neighbors:
# 先添加反向连接,超过最大连接数M时再裁剪最远的连接
neighbor.neighbors[l].append(node)
if len(neighbor.neighbors[l]) > self.M:
self.optimize_connections(neighbor, l)
# 为下一层更新入口点
if candidates:
ep = candidates[0][1]
self.nodes.append(node)
# 更新入口点和最大层级
if level > self.max_level:
self.max_level = level
self.enter_point = node
return node_id
def select_neighbors(self, candidates, M, layer):
"""启发式选择邻居"""
if len(candidates) <= M:
return [n for _, n in candidates]
# 简单的最近选择策略
return [n for _, n in candidates[:M]]
def optimize_connections(self, node, layer):
"""优化节点的连接"""
# 简化的连接优化:删除最远的连接
if len(node.neighbors[layer]) > self.M:
# 计算所有邻居的距离
distances = []
for neighbor in node.neighbors[layer]:
dist = self.distance(node.vector, neighbor.vector)
distances.append((dist, neighbor))
# 按距离排序,保留最近的M个
distances.sort()
node.neighbors[layer] = [n for _, n in distances[:self.M]]
def search(self, query, k=10):
"""搜索k个最近邻"""
# 从入口点开始
ep = self.enter_point
# 从最高层向下搜索
for l in range(self.max_level, 0, -1):
nearest = self.search_layer(query, ep, 1, l)
if nearest:
ep = nearest[0][1]
# 在最底层进行搜索
results = self.search_layer(query, ep, self.efSearch, 0)
# 返回前k个结果:(节点id, 余弦距离)
return [(n.id, d) for d, n in results[:k]]
def distance(self, v1, v2):
"""余弦距离"""
return 1 - np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))
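一个最小的插入与搜索示例(随机数据,假定上面的 HNSWIndex 类已定义;参数取值仅用于演示):
python
import numpy as np

rng = np.random.default_rng(0)
data = rng.standard_normal((2000, 64)).astype(np.float32)

hnsw = HNSWIndex(M=16, efConstruction=100, efSearch=64)
for v in data:
    hnsw.insert(v)

for node_id, dist in hnsw.search(data[0], k=5):
    print(node_id, round(float(dist), 4))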
2.3 基于量化的索引(PQ/IVF-PQ)
python
class IVF_PQ_Index:
"""倒排文件+乘积量化索引"""
def __init__(self, nlist=1000, m=8, nbits=8, nprobe=10):
"""
Args:
nlist: 聚类中心数量
m: 子向量数量(乘积量化)
nbits: 每个子量化的编码位数
nprobe: 搜索时访问的聚类数量
"""
self.nlist = nlist
self.m = m
self.nbits = nbits
self.nprobe = nprobe
self.d = None # 向量维度
self.ds = None # 子向量维度
self.codebooks = None # 码本
self.cluster_centers = None # 聚类中心
self.inverted_lists = {} # 倒排列表
self.vectors = [] # 原始向量(可选存储)
def train(self, vectors):
"""训练索引"""
from sklearn.cluster import KMeans
n, self.d = vectors.shape
self.ds = self.d // self.m
# 1. 聚类训练
print("训练聚类中心...")
kmeans = KMeans(n_clusters=self.nlist, random_state=42)
cluster_labels = kmeans.fit_predict(vectors)
self.cluster_centers = kmeans.cluster_centers_
# 初始化倒排列表
self.inverted_lists = {i: [] for i in range(self.nlist)}
# 2. 乘积量化训练
print("训练乘积量化...")
self.codebooks = []
for i in range(self.m):
# 提取子向量
sub_vectors = vectors[:, i*self.ds:(i+1)*self.ds]
# 对每个子空间进行聚类
k = 2 ** self.nbits
sub_kmeans = KMeans(n_clusters=k, random_state=42)
sub_kmeans.fit(sub_vectors)
# 保存码本
self.codebooks.append(sub_kmeans.cluster_centers_)
# 为所有向量编码
if i == 0:
# 初始化编码矩阵
codes = np.zeros((n, self.m), dtype=np.uint8)
# 分配子向量到最近的质心
labels = sub_kmeans.predict(sub_vectors)
codes[:, i] = labels
# 3. 构建倒排索引
print("构建倒排索引...")
for idx, (vector, cluster_id, code) in enumerate(zip(vectors, cluster_labels, codes)):
# 存储向量ID、编码和残差
residual = vector - self.cluster_centers[cluster_id]
self.inverted_lists[cluster_id].append({
'id': idx,
'code': code,
'residual': residual,
'vector': vector # 可选存储
})
return self
def encode_vector(self, vector):
"""编码单个向量"""
# 1. 找到最近的聚类中心
distances = np.linalg.norm(self.cluster_centers - vector, axis=1)
nearest_cluster = np.argmin(distances)
# 2. 计算残差
residual = vector - self.cluster_centers[nearest_cluster]
# 3. 乘积量化编码
code = np.zeros(self.m, dtype=np.uint8)
for i in range(self.m):
sub_residual = residual[i*self.ds:(i+1)*self.ds]
# 找到最近的码字
sub_distances = np.linalg.norm(
self.codebooks[i] - sub_residual,
axis=1
)
code[i] = np.argmin(sub_distances)
return nearest_cluster, code, residual
def asymmetric_distance(self, query, cluster_id, pq_code):
"""非对称距离计算"""
# 重构近似向量
recon_vector = self.cluster_centers[cluster_id].copy()
for i in range(self.m):
code = pq_code[i]
recon_vector[i*self.ds:(i+1)*self.ds] += self.codebooks[i][code]
# 计算查询向量与重构向量的距离
return np.linalg.norm(query - recon_vector)
def search(self, query, k=10):
"""搜索最近邻"""
from heapq import heappush, heappop, heappushpop
# 1. 找到最近的nprobe个聚类
distances_to_clusters = np.linalg.norm(
self.cluster_centers - query,
axis=1
)
candidate_clusters = np.argsort(distances_to_clusters)[:self.nprobe]
# 2. 在每个候选聚类中搜索
results = [] # 最大堆
for cluster_id in candidate_clusters:
cluster_items = self.inverted_lists[cluster_id]
for item in cluster_items:
# 计算非对称距离
dist = self.asymmetric_distance(query, cluster_id, item['code'])
# 维护k个最小距离
if len(results) < k:
heappush(results, (-dist, item['id']))
elif dist < -results[0][0]:
heappushpop(results, (-dist, item['id']))
# 3. 排序并返回结果
sorted_results = sorted([(-d, idx) for d, idx in results])
distances = [d for d, _ in sorted_results]
indices = [idx for _, idx in sorted_results]
return distances, indices
def add_batch(self, vectors):
"""批量添加向量"""
for i, vector in enumerate(vectors):
cluster_id, code, residual = self.encode_vector(vector)
global_id = len(self.vectors)
self.vectors.append(vector)
self.inverted_lists[cluster_id].append({
'id': global_id,
'code': code,
'residual': residual
})
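一个训练与查询示例(随机数据,假定上面的 IVF_PQ_Index 类已定义;nlist、nbits 等为演示用的假设取值,nlist 需要明显小于训练样本数):
python
import numpy as np

rng = np.random.default_rng(0)
data = rng.standard_normal((5000, 64)).astype(np.float32)

index = IVF_PQ_Index(nlist=64, m=8, nbits=6, nprobe=8)
index.train(data)

distances, indices = index.search(data[0], k=5)
print(indices, distances)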
三、混合索引与优化策略
3.1 复合索引结构
python
class HybridVectorIndex:
"""混合向量索引 - 结合多种索引优势"""
def __init__(self, config):
"""
config示例:
{
'primary_index': 'HNSW',
'secondary_index': 'IVF_PQ',
'partition_strategy': 'kmeans',
'num_partitions': 16,
'use_gpu': False,
'compression': 'PQ',
'cache_size': 10000
}
"""
self.config = config
self.primary_index = None
self.secondary_index = None
self.partitions = []
self.cache = {}
def build(self, vectors, metadata=None):
"""构建混合索引"""
n_vectors, dim = vectors.shape
# 1. 数据分区
if self.config['partition_strategy'] == 'kmeans':
from sklearn.cluster import MiniBatchKMeans
n_partitions = self.config['num_partitions']
kmeans = MiniBatchKMeans(
n_clusters=n_partitions,
batch_size=1000,
random_state=42
)
partition_labels = kmeans.fit_predict(vectors)
# 记录全局向量ID到分区ID的映射,供二级索引路由使用
self.vector_to_partition = partition_labels
# 创建分区
self.partitions = []
for i in range(n_partitions):
mask = partition_labels == i
partition_vectors = vectors[mask]
partition_metadata = metadata[mask] if metadata is not None else None
self.partitions.append({
'id': i,
'vectors': partition_vectors,
'metadata': partition_metadata,
'center': kmeans.cluster_centers_[i],
'size': np.sum(mask)
})
# 2. 为每个分区构建主索引
for partition in self.partitions:
if self.config['primary_index'] == 'HNSW':
index = HNSWIndex(
M=self.config.get('M', 16),
efConstruction=self.config.get('efConstruction', 200)
)
# 批量插入向量
for vec in partition['vectors']:
index.insert(vec)
partition['primary_index'] = index
elif self.config['primary_index'] == 'FLAT':
# 简单的暴力搜索索引
partition['primary_index'] = {
'type': 'FLAT',
'vectors': partition['vectors']
}
# 3. 构建全局二级索引(可选)
if self.config.get('secondary_index'):
if self.config['secondary_index'] == 'IVF_PQ':
self.secondary_index = IVF_PQ_Index(
nlist=self.config.get('nlist', 100),
m=self.config.get('m', 8)
)
self.secondary_index.train(vectors)
return self
def adaptive_search(self, query, k=10, timeout_ms=100):
"""自适应搜索策略"""
import time
start_time = time.time()
# 策略1:首先使用二级索引快速筛选
candidate_partitions = []
if self.secondary_index is not None and hasattr(self, 'vector_to_partition'):
# 二级索引返回的是最近向量的全局ID,再映射到这些向量所在的分区
_, nearest_vector_ids = self.secondary_index.search(
query,
k=self.config.get('nprobe', 4)
)
partition_ids = {int(self.vector_to_partition[i]) for i in nearest_vector_ids}
candidate_partitions = [self.partitions[i] for i in partition_ids]
else:
# 基于距离选择分区
partition_distances = []
for partition in self.partitions:
dist = np.linalg.norm(query - partition['center'])
partition_distances.append((dist, partition))
partition_distances.sort()
candidate_partitions = [p for _, p in partition_distances[:4]]
# 策略2:并行搜索候选分区
all_results = []
for partition in candidate_partitions:
if time.time() - start_time > timeout_ms / 1000.0:
break
index = partition['primary_index']
if isinstance(index, dict) and index.get('type') == 'FLAT':
# 暴力搜索
vectors = index['vectors']
distances = np.linalg.norm(vectors - query, axis=1)
top_k_idx = np.argsort(distances)[:k]
for idx in top_k_idx:
all_results.append((
distances[idx],
partition['id'],
idx,
partition['metadata'][idx] if partition['metadata'] is not None else None
))
else:
# HNSW搜索
results = index.search(query, k=k)
for vector_id, distance in results:
all_results.append((
distance,
partition['id'],
vector_id,
partition['metadata'][vector_id] if partition['metadata'] is not None else None
))
# 合并并排序所有结果
all_results.sort(key=lambda x: x[0])
final_results = all_results[:k]
return {
'distances': [r[0] for r in final_results],
'indices': [r[2] for r in final_results],
'partitions': [r[1] for r in final_results],
'metadata': [r[3] for r in final_results],
'search_time_ms': (time.time() - start_time) * 1000
}
def incremental_update(self, new_vectors, metadata=None):
"""增量更新索引"""
# 1. 为每个新向量找到最合适的分区
for i, vector in enumerate(new_vectors):
# 找到最近的分区中心
distances = []
for partition in self.partitions:
dist = np.linalg.norm(vector - partition['center'])
distances.append((dist, partition))
distances.sort()
target_partition = distances[0][1]
# 2. 更新分区
target_partition['vectors'] = np.vstack([
target_partition['vectors'],
vector
])
# 更新元数据
if metadata is not None:
if target_partition['metadata'] is not None:
target_partition['metadata'].append(metadata[i])
else:
target_partition['metadata'] = [metadata[i]]
# 3. 增量更新索引
if 'primary_index' in target_partition:
if isinstance(target_partition['primary_index'], HNSWIndex):
target_partition['primary_index'].insert(vector)
# 4. 定期重新平衡分区
self._rebalance_partitions()
return self
def _rebalance_partitions(self):
"""重新平衡分区大小"""
# 如果某个分区过大,分裂它
max_size = self.config.get('max_partition_size', 100000)
for i, partition in enumerate(self.partitions):
if len(partition['vectors']) > max_size:
self._split_partition(i)
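一个构建与自适应搜索的示例(随机数据,假定上面的 HybridVectorIndex 类已定义;为了让示例运行得快,这里主索引用 FLAT,生产中可换成 HNSW):
python
import numpy as np

rng = np.random.default_rng(0)
vectors = rng.standard_normal((20000, 64)).astype(np.float32)

config = {
    'primary_index': 'FLAT',
    'secondary_index': None,
    'partition_strategy': 'kmeans',
    'num_partitions': 8,
}
hybrid = HybridVectorIndex(config)
hybrid.build(vectors)

result = hybrid.adaptive_search(vectors[0], k=5, timeout_ms=50)
print(result['indices'], result['search_time_ms'])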
3.2 GPU加速实现
python
import torch
import cupy as cp
class GPUAcceleratedIndex:
"""GPU加速的向量索引"""
def __init__(self, device='cuda:0', batch_size=1024):
self.device = device
self.batch_size = batch_size
self.vectors_gpu = None
self.indices_gpu = None
def build_on_gpu(self, vectors):
"""在GPU上构建索引"""
# 将数据转移到GPU
self.vectors_gpu = torch.from_numpy(vectors).to(self.device)
self.n_vectors = vectors.shape[0]
self.dim = vectors.shape[1]
# 预计算向量范数用于余弦相似度
self.norms_gpu = torch.norm(self.vectors_gpu, dim=1, keepdim=True)
return self
def gpu_brute_force_search(self, queries, k=10, metric='cosine'):
"""GPU暴力搜索"""
import faiss
# 使用Faiss GPU实现
res = faiss.StandardGpuResources()
# 创建索引
index = faiss.IndexFlatL2(self.dim) if metric == 'euclidean' \
else faiss.IndexFlatIP(self.dim)
gpu_index = faiss.index_cpu_to_gpu(res, 0, index)
# Faiss 要求 float32 连续数组
gpu_index.add(np.ascontiguousarray(self.vectors_gpu.cpu().numpy(), dtype=np.float32))
# 批量搜索
all_distances = []
all_indices = []
for i in range(0, len(queries), self.batch_size):
batch = queries[i:i+self.batch_size]
distances, indices = gpu_index.search(batch, k)
all_distances.append(distances)
all_indices.append(indices)
return np.vstack(all_distances), np.vstack(all_indices)
def fused_distance_calculation(self, query_batch):
"""融合距离计算内核"""
import math
import numba.cuda as cuda
@cuda.jit
def cosine_distance_kernel(queries, vectors, norms, output):
"""CUDA内核:批量计算余弦距离"""
i, j = cuda.grid(2)
if i < queries.shape[0] and j < vectors.shape[0]:
dot = 0.0
for k in range(queries.shape[1]):
dot += queries[i, k] * vectors[j, k]
query_norm = 0.0
for k in range(queries.shape[1]):
query_norm += queries[i, k] * queries[i, k]
query_norm = math.sqrt(query_norm)
output[i, j] = 1.0 - dot / (query_norm * norms[j, 0])
# 分配GPU内存
queries_gpu = cp.asarray(query_batch)
output_gpu = cp.zeros((len(query_batch), self.n_vectors))
# 启动内核
threads_per_block = (16, 16)
blocks_per_grid = (
(len(query_batch) + threads_per_block[0] - 1) // threads_per_block[0],
(self.n_vectors + threads_per_block[1] - 1) // threads_per_block[1]
)
cosine_distance_kernel[blocks_per_grid, threads_per_block](
queries_gpu,
cp.asarray(self.vectors_gpu.cpu().numpy()),
cp.asarray(self.norms_gpu.cpu().numpy()),
output_gpu
)
return output_gpu
def approximate_gpu_search(self, queries, k=10, nlist=100, nprobe=10):
"""GPU上的近似搜索"""
import faiss
# 使用IVF-PQ GPU索引
quantizer = faiss.IndexFlatL2(self.dim)
index = faiss.IndexIVFPQ(quantizer, self.dim, nlist, 8, 8)
# 转移到GPU
res = faiss.StandardGpuResources()
gpu_index = faiss.index_cpu_to_gpu(res, 0, index)
# 训练索引
gpu_index.train(self.vectors_gpu.cpu().numpy())
gpu_index.add(self.vectors_gpu.cpu().numpy())
gpu_index.nprobe = nprobe
# 搜索
distances, indices = gpu_index.search(queries, k)
return distances, indices
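作为对照,下面给出一个不依赖 Faiss、只用 PyTorch 的批量余弦 Top-K 检索草图(假设数据;无 GPU 时自动回退到 CPU),它体现了"把相似度计算转成一次矩阵乘法"这一 GPU 加速的核心思路:
python
import numpy as np
import torch

def torch_cosine_topk(vectors, queries, k=10, device='cuda:0'):
    """批量余弦相似度Top-K:归一化后做一次矩阵乘法"""
    dev = device if torch.cuda.is_available() else 'cpu'
    db = torch.nn.functional.normalize(torch.from_numpy(vectors).to(dev), dim=1)
    q = torch.nn.functional.normalize(torch.from_numpy(queries).to(dev), dim=1)
    sims = q @ db.T                       # [M, N] 相似度矩阵
    scores, indices = torch.topk(sims, k, dim=1)
    return scores.cpu().numpy(), indices.cpu().numpy()

rng = np.random.default_rng(0)
db = rng.standard_normal((10000, 128)).astype(np.float32)
qs = rng.standard_normal((4, 128)).astype(np.float32)
print(torch_cosine_topk(db, qs, k=5)[1])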
四、生产环境优化
4.1 内存与性能优化
python
import time

class OptimizedVectorDatabase:
"""生产级向量数据库优化"""
def __init__(self, config):
self.config = config
self.memory_pool = {}
self.compression_ratio = {}
self.access_pattern = {}
def memory_optimized_storage(self, vectors):
"""内存优化存储策略"""
# 1. 数据类型优化
if self.config.get('use_fp16', False):
vectors = vectors.astype(np.float16)
print(f"使用FP16存储,内存减少:{vectors.nbytes / (vectors.size * 4):.1%}")
# 2. 稀疏向量压缩
sparsity = np.mean(vectors == 0)
if sparsity > 0.7: # 70%稀疏度
from scipy import sparse
vectors_csr = sparse.csr_matrix(vectors)
self.memory_pool['vectors'] = vectors_csr
print(f"使用稀疏矩阵,内存减少:{vectors_csr.data.nbytes / vectors.nbytes:.1%}")
else:
self.memory_pool['vectors'] = vectors
# 3. 分块存储
chunk_size = self.config.get('chunk_size', 10000)
n_chunks = len(vectors) // chunk_size + 1
self.chunks = []
for i in range(n_chunks):
chunk = vectors[i*chunk_size:(i+1)*chunk_size]
if len(chunk) > 0:
self.chunks.append(chunk)
return self
def cache_aware_search(self, query, k=10, use_cache=True):
"""缓存感知的搜索优化"""
query_hash = hash(query.tobytes())
# 1. 查询缓存
if use_cache and query_hash in self.access_pattern:
cached_result = self.access_pattern[query_hash]
cached_result['cache_hit'] = True
return cached_result
# 2. 预测搜索范围
predicted_clusters = self._predict_search_clusters(query)
# 3. 预取数据到缓存
self._prefetch_clusters(predicted_clusters)
# 4. 执行搜索
start_time = time.time()
results = self._execute_search(query, k, predicted_clusters)
search_time = time.time() - start_time
# 5. 更新缓存和访问模式
if use_cache:
self.access_pattern[query_hash] = {
'results': results,
'timestamp': time.time(),
'search_time': search_time
}
# LRU缓存淘汰
if len(self.access_pattern) > self.config.get('max_cache_size', 10000):
oldest_key = min(self.access_pattern.keys(),
key=lambda k: self.access_pattern[k]['timestamp'])
del self.access_pattern[oldest_key]
return {
'results': results,
'search_time': search_time,
'cache_hit': False
}
def _predict_search_clusters(self, query):
"""预测需要搜索的聚类"""
# 基于历史访问模式的简单预测
# 在实际系统中可以使用机器学习模型
return list(range(min(10, len(self.chunks))))
def _prefetch_clusters(self, cluster_ids):
"""预取数据到内存"""
for cluster_id in cluster_ids:
if cluster_id < len(self.chunks):
# 确保数据在内存中
_ = self.chunks[cluster_id][0]
def batch_optimized_search(self, queries, k=10):
"""批量查询优化"""
import concurrent.futures
# 1. 查询去重
unique_queries, inverse_indices = np.unique(
queries, axis=0, return_inverse=True
)
print(f"批量查询去重: {len(queries)} -> {len(unique_queries)}")
# 2. 并行处理
with concurrent.futures.ThreadPoolExecutor(
max_workers=self.config.get('num_workers', 4)
) as executor:
# 提交搜索任务
future_to_query = {
executor.submit(self.cache_aware_search, query, k): idx
for idx, query in enumerate(unique_queries)
}
# 收集结果
unique_results = {}
for future in concurrent.futures.as_completed(future_to_query):
query_idx = future_to_query[future]
try:
result = future.result()
unique_results[query_idx] = result['results']
except Exception as e:
print(f"查询失败: {e}")
unique_results[query_idx] = []
# 3. 映射回原始顺序
all_results = [unique_results[inverse_indices[i]] for i in range(len(queries))]
return all_results
def dynamic_index_tuning(self, query_logs):
"""动态索引调优"""
# 分析查询模式
query_patterns = self._analyze_query_patterns(query_logs)
# 调整索引参数
new_config = self._suggest_index_config(query_patterns)
# 渐进式重新构建索引
self._progressive_reindex(new_config)
return new_config
def _analyze_query_patterns(self, query_logs):
"""分析查询模式"""
patterns = {
'query_distribution': [],
'result_size_distribution': [],
'time_distribution': [],
'popular_vectors': {}
}
for log in query_logs:
# 统计查询向量分布
query_hash = hash(log['query'].tobytes())
patterns['query_distribution'].append(query_hash)
# 统计结果数量
patterns['result_size_distribution'].append(len(log.get('results', [])))
# 统计热门向量
for result in log.get('results', []):
vector_id = result.get('id')
patterns['popular_vectors'][vector_id] = \
patterns['popular_vectors'].get(vector_id, 0) + 1
return patterns
def _suggest_index_config(self, patterns):
"""基于模式建议索引配置"""
config = self.config.copy()
# 根据热门向量调整缓存策略
hot_vectors = sorted(
patterns['popular_vectors'].items(),
key=lambda x: x[1],
reverse=True
)[:1000]
if hot_vectors:
config['hot_vector_cache_size'] = len(hot_vectors)
# 根据查询分布调整分区策略
unique_queries = len(set(patterns['query_distribution']))
if unique_queries < 1000:
config['enable_query_cache'] = True
return config
def _progressive_reindex(self, new_config):
"""渐进式重新构建索引"""
# 在实际系统中,这会逐步迁移数据到新索引
# 同时保持旧索引可用
pass
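上面 batch_optimized_search 中"先去重、再搜索、最后映射回原顺序"的技巧可以单独演示如下(独立的小示例,查询矩阵中故意包含重复行,搜索结果用字符串占位):
python
import numpy as np

queries = np.array([[1.0, 2.0],
                    [3.0, 4.0],
                    [1.0, 2.0]])  # 第0行与第2行重复

unique_queries, inverse_indices = np.unique(queries, axis=0, return_inverse=True)
inverse_indices = np.asarray(inverse_indices).reshape(-1)  # 兼容不同NumPy版本的返回形状
# 只对去重后的查询执行一次搜索
unique_results = {i: f"result_of_{q.tolist()}" for i, q in enumerate(unique_queries)}
# 再按 inverse_indices 把结果映射回原始查询顺序
all_results = [unique_results[inverse_indices[i]] for i in range(len(queries))]
print(all_results)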
4.2 分布式向量数据库
python
class DistributedVectorIndex:
"""分布式向量索引"""
def __init__(self, num_shards=4, replication_factor=2):
self.num_shards = num_shards
self.replication_factor = replication_factor
self.shards = []
self.coordinator = None
def shard_by_hash(self, vectors):
"""基于哈希分片"""
shard_size = len(vectors) // self.num_shards
for i in range(self.num_shards):
start_idx = i * shard_size
end_idx = (i + 1) * shard_size if i < self.num_shards - 1 else len(vectors)
shard_vectors = vectors[start_idx:end_idx]
# 创建分片副本
replicas = []
for j in range(self.replication_factor):
replica_id = f"shard_{i}_replica_{j}"
replica = VectorShard(replica_id, shard_vectors)
replicas.append(replica)
self.shards.append({
'id': i,
'range': (start_idx, end_idx),
'replicas': replicas,
'primary_replica': replicas[0]
})
return self
def shard_by_clustering(self, vectors):
"""基于聚类分片"""
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=self.num_shards, random_state=42)
cluster_labels = kmeans.fit_predict(vectors)
for i in range(self.num_shards):
mask = cluster_labels == i
shard_vectors = vectors[mask]
if len(shard_vectors) == 0:
continue
replicas = []
for j in range(self.replication_factor):
replica_id = f"cluster_{i}_replica_{j}"
replica = VectorShard(replica_id, shard_vectors)
replicas.append(replica)
self.shards.append({
'id': i,
'cluster_center': kmeans.cluster_centers_[i],
'replicas': replicas,
'primary_replica': replicas[0],
'size': len(shard_vectors)
})
return self
def distributed_search(self, query, k=10, consistency_level='one'):
"""分布式搜索"""
import concurrent.futures
# 1. 路由查询到相关分片
target_shards = self._route_query_to_shards(query)
# 2. 并行搜索每个分片
with concurrent.futures.ThreadPoolExecutor(
max_workers=len(target_shards)
) as executor:
# 提交搜索任务到副本
futures = []
for shard in target_shards:
# 根据一致性级别选择副本
if consistency_level == 'one':
replica = shard['primary_replica']
elif consistency_level == 'quorum':
# 选择多数副本
replicas = shard['replicas'][:self.replication_factor//2 + 1]
future = executor.submit(
self._search_with_quorum,
query, k, replicas
)
futures.append((shard['id'], future))
continue
elif consistency_level == 'all':
replicas = shard['replicas']
future = executor.submit(
self._search_with_consensus,
query, k, replicas
)
futures.append((shard['id'], future))
continue
future = executor.submit(replica.search, query, k)
futures.append((shard['id'], future))
# 3. 收集和合并结果
all_results = []
for shard_id, future in futures:
try:
results = future.result()
# 添加分片信息
for dist, idx in results:
all_results.append((dist, shard_id, idx))
except Exception as e:
print(f"分片 {shard_id} 搜索失败: {e}")
# 4. 全局排序
all_results.sort(key=lambda x: x[0])
final_results = all_results[:k]
return {
'distances': [r[0] for r in final_results],
'shard_ids': [r[1] for r in final_results],
'local_indices': [r[2] for r in final_results],
'total_shards_searched': len(target_shards)
}
def _route_query_to_shards(self, query):
"""路由查询到相关分片"""
if self.shards and 'cluster_center' in self.shards[0]:
# 基于聚类中心的路由
distances = []
for shard in self.shards:
dist = np.linalg.norm(query - shard['cluster_center'])
distances.append((dist, shard))
# 选择距离最近的几个分片
distances.sort()
return [shard for _, shard in distances[:3]]
else:
# 广播到所有分片
return self.shards
def _search_with_quorum(self, query, k, replicas):
"""仲裁搜索"""
import concurrent.futures
with concurrent.futures.ThreadPoolExecutor() as executor:
futures = [executor.submit(replica.search, query, k)
for replica in replicas]
all_results = []
for future in concurrent.futures.as_completed(futures):
try:
results = future.result()
all_results.extend(results)
except Exception as e:
print(f"副本搜索失败: {e")
# 合并结果并去重
merged_results = self._merge_results(all_results, k)
return merged_results
def _search_with_consensus(self, query, k, replicas):
"""共识搜索"""
results_by_replica = []
for replica in replicas:
try:
results = replica.search(query, k)
results_by_replica.append(results)
except Exception as e:
print(f"副本搜索失败: {e}")
continue
# 投票选择结果
return self._vote_on_results(results_by_replica, k)
def _merge_results(self, all_results, k):
"""合并多个结果集"""
# 简单合并并排序
all_results.sort(key=lambda x: x[0])
# 去重(基于向量ID)
seen = set()
unique_results = []
for dist, idx in all_results:
if idx not in seen:
seen.add(idx)
unique_results.append((dist, idx))
if len(unique_results) >= k:
break
return unique_results
def _vote_on_results(self, results_by_replica, k):
"""基于投票的结果选择"""
# 统计每个向量的出现次数和平均距离
vector_scores = {}
for replica_results in results_by_replica:
for rank, (dist, idx) in enumerate(replica_results):
if idx not in vector_scores:
vector_scores[idx] = {
'count': 0,
'total_distance': 0,
'total_rank': 0
}
vector_scores[idx]['count'] += 1
vector_scores[idx]['total_distance'] += dist
vector_scores[idx]['total_rank'] += rank
# 计算综合分数
scored_vectors = []
for idx, scores in vector_scores.items():
# 考虑出现次数、平均距离和平均排名
score = (
scores['count'] * 0.5 + # 出现频率
(1 / (scores['total_distance'] / scores['count'] + 1e-6)) * 0.3 + # 距离倒数
(1 / (scores['total_rank'] / scores['count'] + 1)) * 0.2 # 排名倒数
)
avg_distance = scores['total_distance'] / scores['count']
scored_vectors.append((score, avg_distance, idx))
# 按分数排序
scored_vectors.sort(key=lambda x: x[0], reverse=True)
return [(dist, idx) for _, dist, idx in scored_vectors[:k]]
class VectorShard:
"""向量分片"""
def __init__(self, shard_id, vectors):
self.shard_id = shard_id
self.vectors = vectors
self.index = None
def build_index(self, index_type='HNSW', **kwargs):
"""在分片上构建索引"""
if index_type == 'HNSW':
self.index = HNSWIndex(**kwargs)
for vector in self.vectors:
self.index.insert(vector)
elif index_type == 'IVF_PQ':
self.index = IVF_PQ_Index(**kwargs)
self.index.train(self.vectors)
return self
def search(self, query, k=10):
"""在分片内搜索"""
if self.index is None:
# 如果没有索引,使用暴力搜索
distances = np.linalg.norm(self.vectors - query, axis=1)
top_k_idx = np.argsort(distances)[:k]
return [(distances[i], i) for i in top_k_idx]
# 使用索引搜索
return self.index.search(query, k)
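一个端到端的小示例(随机数据,假定上面的 DistributedVectorIndex 与 VectorShard 均已定义),使用哈希分片和 'one' 一致性级别;未构建分片索引时,分片内部会退化为暴力搜索:
python
import numpy as np

rng = np.random.default_rng(0)
vectors = rng.standard_normal((8000, 32)).astype(np.float32)

dindex = DistributedVectorIndex(num_shards=4, replication_factor=2)
dindex.shard_by_hash(vectors)

result = dindex.distributed_search(vectors[0], k=5, consistency_level='one')
print(result['shard_ids'], result['local_indices'])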
五、评估与监控
5.1 性能评估框架
python
import time

class VectorIndexEvaluator:
"""向量索引评估框架"""
def __init__(self, ground_truth_func):
self.ground_truth_func = ground_truth_func
self.metrics_history = []
def evaluate_index(self, index, test_queries, k_values=[1, 10, 100]):
"""全面评估索引性能"""
evaluation_results = {}
for k in k_values:
print(f"\n评估 k={k}")
# 1. 召回率评估
recall_results = self.evaluate_recall(index, test_queries, k)
# 2. 延迟评估
latency_results = self.evaluate_latency(index, test_queries, k)
# 3. 内存使用评估
memory_results = self.evaluate_memory(index)
# 4. 构建时间评估
build_results = self.evaluate_build_time(index)
evaluation_results[k] = {
'recall': recall_results,
'latency': latency_results,
'memory': memory_results,
'build_time': build_results
}
self.metrics_history.append({
'timestamp': time.time(),
'k': k,
**evaluation_results[k]
})
return evaluation_results
def evaluate_recall(self, index, queries, k):
"""评估召回率"""
exact_results = []
approx_results = []
for query in queries:
# 精确搜索
exact_dist, exact_idx = self.ground_truth_func(query, k)
exact_results.append((exact_dist, exact_idx))
# 近似搜索
start_time = time.time()
approx_dist, approx_idx = index.search(query, k)
search_time = time.time() - start_time
approx_results.append({
'distances': approx_dist,
'indices': approx_idx,
'search_time': search_time
})
# 计算召回率
total_recall = 0
for exact, approx in zip(exact_results, approx_results):
exact_set = set(exact[1])
approx_set = set(approx['indices'])
recall = len(exact_set.intersection(approx_set)) / k
total_recall += recall
avg_recall = total_recall / len(queries)
return {
'avg_recall': avg_recall,
'recall_at_1': self._recall_at_n(exact_results, approx_results, 1),
'recall_at_10': self._recall_at_n(exact_results, approx_results, 10),
'precision': self._calculate_precision(exact_results, approx_results, k)
}
def evaluate_latency(self, index, queries, k, warmup_runs=10):
"""评估搜索延迟"""
# 预热
for _ in range(warmup_runs):
for query in queries[:10]:
_ = index.search(query, k)
# 正式测试
latencies = []
for query in queries:
start_time = time.perf_counter()
_ = index.search(query, k)
latencies.append(time.perf_counter() - start_time)
return {
'mean_latency_ms': np.mean(latencies) * 1000,
'p50_latency_ms': np.percentile(latencies, 50) * 1000,
'p95_latency_ms': np.percentile(latencies, 95) * 1000,
'p99_latency_ms': np.percentile(latencies, 99) * 1000,
'std_latency_ms': np.std(latencies) * 1000,
'qps': 1 / np.mean(latencies)
}
def evaluate_memory(self, index):
"""评估内存使用"""
import psutil
import os
process = psutil.Process(os.getpid())
memory_info = process.memory_info()
return {
'rss_mb': memory_info.rss / 1024 / 1024,
'vms_mb': memory_info.vms / 1024 / 1024,
'shared_mb': memory_info.shared / 1024 / 1024,
'text_mb': memory_info.text / 1024 / 1024,
'data_mb': memory_info.data / 1024 / 1024
}
def evaluate_build_time(self, index):
"""评估构建时间"""
if hasattr(index, 'build_time'):
return {
'build_time_seconds': index.build_time,
'indexing_speed_qps': index.n_vectors / index.build_time
}
return {'build_time_seconds': None}
def _recall_at_n(self, exact_results, approx_results, n):
"""计算前n个结果的召回率"""
total_recall = 0
for exact, approx in zip(exact_results, approx_results):
exact_set = set(exact[1][:n])
approx_set = set(approx['indices'][:n])
recall = len(exact_set.intersection(approx_set)) / min(n, len(exact_set))
total_recall += recall
return total_recall / len(exact_results)
def _calculate_precision(self, exact_results, approx_results, k):
"""计算精确率"""
total_precision = 0
for exact, approx in zip(exact_results, approx_results):
exact_set = set(exact[1])
approx_set = set(approx['indices'])
precision = len(exact_set.intersection(approx_set)) / k
total_precision += precision
return total_precision / len(exact_results)
def generate_report(self, evaluation_results):
"""生成评估报告"""
report = {
'summary': {
'total_queries': len(evaluation_results),
'evaluation_time': time.time(),
'config': {}
},
'detailed_metrics': evaluation_results,
'recommendations': []
}
# 分析结果并给出建议
for k, metrics in evaluation_results.items():
recall = metrics['recall']['avg_recall']
latency = metrics['latency']['p95_latency_ms']
if recall < 0.9 and latency < 10:
report['recommendations'].append(
f"对于k={k},可以增加索引精度以提高召回率"
)
elif recall > 0.95 and latency > 50:
report['recommendations'].append(
f"对于k={k},可以降低索引精度以改善延迟"
)
return report
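下面是一个评估框架的使用草图(随机数据;用暴力搜索同时充当"精确答案"和被评估的"索引",因此召回率应为1.0,主要用于演示接口约定):
python
import numpy as np

rng = np.random.default_rng(0)
data = rng.standard_normal((5000, 64)).astype(np.float32)
queries = rng.standard_normal((20, 64)).astype(np.float32)

def exact_knn(query, k):
    dists = np.linalg.norm(data - query, axis=1)
    idx = np.argsort(dists)[:k]
    return dists[idx], idx

class BruteForceIndex:
    """演示用索引:search 返回 (距离列表, 下标列表)"""
    def search(self, query, k=10):
        return exact_knn(query, k)

evaluator = VectorIndexEvaluator(ground_truth_func=exact_knn)
results = evaluator.evaluate_index(BruteForceIndex(), queries, k_values=[10])
print(results[10]['recall']['avg_recall'])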
5.2 实时监控系统
python
import time

class VectorDBMonitor:
"""向量数据库监控系统"""
def __init__(self, prometheus_endpoint=None):
self.metrics = {
'search_latency': [],
'recall_rate': [],
'memory_usage': [],
'throughput': [],
'error_rate': []
}
self.prometheus = prometheus_endpoint
def record_metric(self, metric_name, value, tags=None):
"""记录指标"""
if metric_name not in self.metrics:
self.metrics[metric_name] = []
record = {
'timestamp': time.time(),
'value': value,
'tags': tags or {}
}
self.metrics[metric_name].append(record)
# 保留最近10000个记录
if len(self.metrics[metric_name]) > 10000:
self.metrics[metric_name] = self.metrics[metric_name][-10000:]
def monitor_search_operation(self, func):
"""监控搜索操作的装饰器"""
def wrapper(*args, **kwargs):
start_time = time.time()
try:
result = func(*args, **kwargs)
# 记录成功指标
latency = time.time() - start_time
self.record_metric('search_latency', latency * 1000)
self.record_metric('throughput', 1)
if 'recall' in result:
self.record_metric('recall_rate', result['recall'])
return result
except Exception as e:
# 记录错误
self.record_metric('error_rate', 1)
self.record_metric('error_type', str(type(e).__name__))
raise e
return wrapper
def get_performance_dashboard(self):
"""获取性能仪表板数据"""
current_time = time.time()
hour_ago = current_time - 3600
dashboard = {
'last_hour': {},
'last_5_minutes': {},
'current_status': {}
}
# 计算各个时间段的统计
for metric_name, records in self.metrics.items():
# 最近1小时
recent_records = [r for r in records if r['timestamp'] > hour_ago]
if recent_records:
values = [r['value'] for r in recent_records]
dashboard['last_hour'][metric_name] = {
'avg': np.mean(values),
'p95': np.percentile(values, 95),
'p99': np.percentile(values, 99),
'count': len(values)
}
# 最近5分钟
five_min_ago = current_time - 300
recent_records = [r for r in records if r['timestamp'] > five_min_ago]
if recent_records:
values = [r['value'] for r in recent_records]
dashboard['last_5_minutes'][metric_name] = {
'avg': np.mean(values),
'p95': np.percentile(values, 95),
'p99': np.percentile(values, 99),
'count': len(values)
}
# 当前状态
if 'search_latency' in self.metrics and self.metrics['search_latency']:
last_latency = self.metrics['search_latency'][-1]['value']
dashboard['current_status']['latency_ms'] = last_latency
# 判断是否超过阈值:先检查更严格的500ms,再检查100ms
if last_latency > 500:
dashboard['current_status']['latency_status'] = 'critical'
elif last_latency > 100: # 100ms阈值
dashboard['current_status']['latency_status'] = 'warning'
else:
dashboard['current_status']['latency_status'] = 'normal'
return dashboard
def alert_on_anomalies(self, alert_rules):
"""异常告警"""
alerts = []
for rule in alert_rules:
metric_name = rule['metric']
threshold = rule['threshold']
duration = rule.get('duration', 60) # 默认60秒
if metric_name in self.metrics:
# 检查最近一段时间内的记录
time_limit = time.time() - duration
recent_values = [
r['value'] for r in self.metrics[metric_name]
if r['timestamp'] > time_limit
]
if recent_values:
if rule['condition'] == 'greater_than':
if np.mean(recent_values) > threshold:
alerts.append({
'metric': metric_name,
'condition': f"平均值 > {threshold}",
'actual_value': np.mean(recent_values),
'duration': duration,
'timestamp': time.time()
})
elif rule['condition'] == 'less_than':
if np.mean(recent_values) < threshold:
alerts.append({
'metric': metric_name,
'condition': f"平均值 < {threshold}",
'actual_value': np.mean(recent_values),
'duration': duration,
'timestamp': time.time()
})
return alerts
def export_to_prometheus(self):
"""导出指标到Prometheus"""
if not self.prometheus:
return
# 构建Prometheus指标
metrics_data = []
for metric_name, records in self.metrics.items():
if records:
latest = records[-1]
# 创建Prometheus格式的指标
prom_metric = {
'name': f"vectordb_{metric_name}",
'value': latest['value'],
'timestamp': latest['timestamp'],
'labels': latest.get('tags', {})
}
metrics_data.append(prom_metric)
# 发送到Prometheus
# 这里简化实现,实际应使用Prometheus客户端库
return metrics_data
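监控装饰器的使用示例(假定上面的 VectorDBMonitor 类已定义;fake_search 用 sleep 模拟搜索耗时):
python
import time
import numpy as np

monitor = VectorDBMonitor()

@monitor.monitor_search_operation
def fake_search(query, k=10):
    time.sleep(0.002)  # 模拟约2ms的搜索耗时
    return {'results': list(range(k))}

for _ in range(100):
    fake_search(np.zeros(8))

print(monitor.get_performance_dashboard()['current_status'])
print(monitor.alert_on_anomalies([
    {'metric': 'search_latency', 'threshold': 1.0, 'condition': 'greater_than'}
]))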
六、最佳实践总结
6.1 索引选择指南
python
class IndexSelectionGuide:
"""索引选择指南"""
@staticmethod
def recommend_index(dataset_size, dimension,
recall_requirement, latency_requirement,
memory_constraint, update_frequency):
"""推荐最适合的索引类型"""
recommendations = []
# 基于数据集大小
if dataset_size < 10000:
recommendations.append({
'index_type': 'FLAT',
'reason': '数据集小,暴力搜索即可',
'expected_recall': 1.0,
'expected_latency': '低'
})
elif dataset_size < 1000000:
recommendations.append({
'index_type': 'HNSW',
'reason': '中等数据集,HNSW平衡了性能和准确率',
'expected_recall': '0.95-0.99',
'expected_latency': '1-10ms'
})
if memory_constraint < 1: # 内存小于1GB
recommendations.append({
'index_type': 'IVF_PQ',
'reason': '内存受限,使用量化压缩',
'expected_recall': '0.90-0.98',
'expected_latency': '1-5ms'
})
else: # 大数据集
recommendations.append({
'index_type': 'IVF_PQ',
'reason': '大数据集,需要高效的内存使用',
'expected_recall': '0.85-0.95',
'expected_latency': '5-50ms'
})
if update_frequency == 'high':
recommendations.append({
'index_type': 'HNSW',
'reason': '频繁更新,HNSW支持增量更新',
'expected_recall': '0.95-0.99',
'expected_latency': '10-100ms'
})
# 基于召回率要求过滤
if recall_requirement > 0.99:
recommendations = [r for r in recommendations
if r['expected_recall'] == 1.0 or '0.99' in str(r['expected_recall'])]
elif recall_requirement > 0.95:
recommendations = [r for r in recommendations
if r['expected_recall'] == 1.0 or '0.95' in str(r['expected_recall'])]
# 基于延迟要求过滤
if latency_requirement < 5: # 要求5ms以内
recommendations = [r for r in recommendations
if 'ms' in r['expected_latency'] and
int(r['expected_latency'].split('-')[0]) < 5]
return recommendations
@staticmethod
def get_optimization_tips(index_type):
"""获取优化建议"""
tips = {
'HNSW': [
'增加M参数以提高召回率,但会增加内存使用',
'增加efConstruction以提高索引质量',
'根据查询负载调整efSearch参数',
'对于高维数据,考虑先进行PCA降维'
],
'IVF_PQ': [
'增加nlist以提高召回率,但会增加训练时间',
'调整m和nbits参数以平衡精度和内存',
'增加nprobe以提高召回率,但会增加查询时间',
'定期重新训练索引以适应数据分布变化'
],
'FLAT': [
'使用批量查询以提高吞吐量',
'考虑使用GPU加速',
'对于超大数据集,考虑分区处理',
'使用向量归一化以支持内积搜索'
]
}
return tips.get(index_type, [])
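一个调用示例(参数取值是假设的业务场景:50万条768维向量、召回率要求0.95、延迟预算10ms、内存预算8GB、低更新频率):
python
recs = IndexSelectionGuide.recommend_index(
    dataset_size=500_000,
    dimension=768,
    recall_requirement=0.95,
    latency_requirement=10,   # ms
    memory_constraint=8,      # GB
    update_frequency='low'
)
for r in recs:
    print(r['index_type'], '-', r['reason'])

for tip in IndexSelectionGuide.get_optimization_tips('HNSW'):
    print('-', tip)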
6.2 调优检查清单
python
class TuningChecklist:
"""向量索引调优检查清单"""
def __init__(self):
self.checklist = {
'数据预处理': [
'向量是否已归一化(对于余弦相似度)',
'是否进行了维度检查',
'是否有异常值处理',
'是否考虑数据分布'
],
'索引构建': [
'是否选择了合适的索引类型',
'参数是否经过调优',
'是否进行了充分的训练',
'是否有验证集评估'
],
'查询优化': [
'是否使用了批量查询',
'是否有查询缓存',
'是否考虑了查询负载均衡',
'是否有连接池管理'
],
'内存优化': [
'是否使用了量化',
'是否有内存监控',
'是否考虑了分页策略',
'是否有垃圾回收机制'
],
'监控告警': [
'是否有性能监控',
'是否有异常检测',
'是否有自动扩缩容',
'是否有备份恢复机制'
]
}
def run_check(self, vector_db, config):
"""运行检查"""
results = {}
for category, checks in self.checklist.items():
results[category] = {}
for check in checks:
# 这里简化检查逻辑,实际应实现具体检查
status = self._perform_check(check, vector_db, config)
results[category][check] = status
return results
def _perform_check(self, check, vector_db, config):
"""执行单个检查"""
# 这里简化实现,实际应包含详细的检查逻辑
if '归一化' in check:
return self._check_normalization(vector_db)
elif '索引类型' in check:
return self._check_index_type(config)
elif '批量查询' in check:
return self._check_batch_query(config)
elif '内存监控' in check:
return self._check_memory_monitoring(config)
else:
return '待检查'
def generate_report(self, check_results):
"""生成检查报告"""
report = {
'summary': {
'total_checks': 0,
'passed_checks': 0,
'failed_checks': 0,
'pending_checks': 0
},
'details': check_results,
'recommendations': []
}
for category, checks in check_results.items():
for check, status in checks.items():
report['summary']['total_checks'] += 1
if status == '通过':
report['summary']['passed_checks'] += 1
elif status == '失败':
report['summary']['failed_checks'] += 1
report['recommendations'].append(f"{category}: {check}")
else:
report['summary']['pending_checks'] += 1
return report
七、未来发展趋势
7.1 新兴技术方向
python
class FutureVectorSearchTrends:
"""向量搜索未来趋势"""
trends = [
{
'name': '可学习索引',
'description': '使用机器学习模型学习数据分布,优化索引结构',
'key_technologies': ['神经网络', '强化学习', '元学习'],
'potential_impact': '自适应索引,减少人工调参'
},
{
'name': '混合检索',
'description': '结合向量搜索与传统关键词搜索',
'key_technologies': ['多模态学习', '交叉编码器', '重排序'],
'potential_impact': '更准确的搜索结果,支持复杂查询'
},
{
'name': '实时向量流处理',
'description': '对流式向量数据进行实时索引和搜索',
'key_technologies': ['流处理', '增量学习', '在线索引'],
'potential_impact': '支持实时推荐和异常检测'
},
{
'name': '量子加速搜索',
'description': '利用量子计算加速向量相似度计算',
'key_technologies': ['量子算法', '量子硬件', '混合计算'],
'potential_impact': '指数级加速,突破传统性能瓶颈'
},
{
'name': '联邦向量学习',
'description': '在保护隐私的前提下进行分布式向量索引',
'key_technologies': ['联邦学习', '同态加密', '差分隐私'],
'potential_impact': '隐私保护的多方数据检索'
}
]
@classmethod
def get_implementation_roadmap(cls):
"""获取技术实现路线图"""
roadmap = {
'短期(1-2年)': [
'更智能的自动参数调优',
'异构硬件支持(CPU/GPU/TPU)',
'更高效的混合索引结构'
],
'中期(3-5年)': [
'端到端的可学习索引系统',
'量子-经典混合计算架构',
'实时自适应索引优化'
],
'长期(5年以上)': [
'完全自主的AI驱动索引系统',
'量子优越性的向量搜索',
'跨模态通用检索系统'
]
}
return roadmap
这个全面指南涵盖了向量数据库相似度搜索和索引构建的各个方面,从基础算法到生产级实现,从单机优化到分布式部署,以及完整的评估和监控体系。实际应用中需要根据具体场景选择合适的算法和优化策略。