第9章:索引系统架构与向量搜索
概述
索引是 Lance 提供快速查询的核心。本章讨论索引系统设计、向量索引实现、向量搜索优化。
索引系统设计
Index Trait 接口
rust
/// Common interface for an index: open it, search it, and report statistics.
///
/// Implementors must be `Send + Sync`.
//
// NOTE(review): a trait with `async fn` methods is not dyn-compatible on its
// own, yet the registry in this chapter stores `Box<dyn Index>`. Presumably an
// attribute such as `#[async_trait]` is applied outside this snippet — confirm.
pub trait Index: Send + Sync {
    /// Opens the index described by `index_metadata`.
    ///
    /// Takes `&mut self`, so the caller needs exclusive access while opening.
    async fn open(&mut self, index_metadata: &IndexMetadata) -> Result<()>;

    /// Runs `query` against the index and returns the results it produces.
    async fn search(&self, query: &SearchQuery) -> Result<Vec<SearchResult>>;

    /// Returns the statistics recorded for this index.
    fn statistics(&self) -> IndexStatistics;
}
/// Parameters of a single vector search request.
pub struct SearchQuery {
    /// Name of the column to search.
    pub column: String,
    /// Query vector.
    pub vector: Vec<f32>,
    /// Maximum number of results requested.
    pub k: usize,
    /// Distance metric to use for the search.
    pub metric: DistanceMetric,
    /// Optional number of partitions to probe; `None` leaves the choice to
    /// the index implementation.
    pub nprobes: Option<usize>,
    /// Optional filter, carried as a string.
    // NOTE(review): the filter syntax is not visible in this snippet — confirm.
    pub filters: Option<String>,
}
/// A single hit returned by an index search.
pub struct SearchResult {
    /// Identifier of the matching row.
    pub row_id: u64,
    /// Score attached to the hit.
    // NOTE(review): presumably a distance (smaller is closer) — confirm per metric.
    pub score: f32,
    /// Optional row data for the hit; `None` when no data is attached.
    pub data: Option<RecordBatch>,
}
索引注册表
rust
/// Name-keyed registry of indexes, backed by a `DashMap`.
pub struct IndexRegistry {
    /// Registered indexes keyed by name.
    ///
    /// Values are stored as `Arc<dyn Index>` so that `get_index` can hand out
    /// shared handles by bumping a reference count. An `Arc` cannot be built
    /// from a borrowed `&Box<dyn Index>`, which is why the map does not store
    /// boxes.
    indexes: DashMap<String, Arc<dyn Index>>,
}

impl IndexRegistry {
    /// Stores `index` under `name`.
    ///
    /// The box is converted into an `Arc` once, at registration time.
    pub fn register_index(&self, name: String, index: Box<dyn Index>) {
        // `Arc<T>: From<Box<T>>` also holds for unsized `T` such as `dyn Index`.
        self.indexes.insert(name, Arc::from(index));
    }

    /// Returns a shared handle to the index stored under `name`, or `None`
    /// when no index with that name is registered.
    pub fn get_index(&self, name: &str) -> Option<Arc<dyn Index>> {
        // Clone the `Arc` (not the index) so the map guard is released as soon
        // as the closure returns.
        self.indexes
            .get(name)
            .map(|entry| Arc::clone(entry.value()))
    }
}
索引生命周期
索引元数据定义
rust
/// Descriptive metadata stored for one index.
#[derive(Debug, Clone, PartialEq)]
pub struct IndexMetadata {
    /// Identifier of the index.
    pub index_id: String,
    /// Kind of index (scalar or vector) together with its type-specific settings.
    pub index_type: IndexType,
    /// Name of the indexed column.
    pub column: String,
    /// Creation timestamp.
    // NOTE(review): epoch and unit are not visible in this snippet — confirm.
    pub created_at: i64,
    /// Last-update timestamp (same representation as `created_at`).
    pub updated_at: i64,
    /// Size of the index.
    // NOTE(review): presumably bytes — confirm.
    pub index_size: u64,
    /// Files that make up the index.
    pub index_files: Vec<String>,
    /// Free-form string parameters attached to the index.
    pub parameters: HashMap<String, String>,
    /// Statistics recorded for the index.
    pub statistics: IndexStatistics,
}

/// Top-level index category.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum IndexType {
    /// Index over scalar values.
    Scalar(ScalarIndexType),
    /// Index over vectors.
    Vector(VectorIndexType),
}

/// Available scalar index kinds.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ScalarIndexType {
    BTree,
    Bitmap,
    Inverted,
}

/// Available vector index kinds with their structural parameters.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum VectorIndexType {
    /// Inverted-file index split into `partitions` partitions.
    IVF { partitions: u32 },
    /// HNSW graph index with at most `max_connections` connections.
    // NOTE(review): presumably per node — confirm.
    HNSW { max_connections: u32 },
    /// Flat index without extra parameters.
    FLAT,
}

/// Numeric statistics describing an index.
#[derive(Debug, Clone, PartialEq, Default)]
pub struct IndexStatistics {
    /// Number of rows covered by the index.
    pub num_rows: u64,
    /// Size of the index.
    // NOTE(review): presumably bytes — confirm.
    pub index_size: u64,
    /// Build time in milliseconds.
    pub build_time_ms: u64,
    /// Search latency in milliseconds.
    // NOTE(review): whether this is an average or a last-seen value is not shown — confirm.
    pub search_latency_ms: f64,
}
向量索引搜索
IVF 索引搜索
rust
/// Searches an IVF index for the vectors closest to `query_vector`.
///
/// The query is compared against the centroids, the nearest `nprobes`
/// partitions (20 when `nprobes` is `None`) are scanned exhaustively with L2
/// distance, and at most `k` hits are returned in ascending distance order.
/// Returned hits carry no row data (`data` is `None`).
///
/// # Errors
///
/// Propagates any error from the centroid distance computation, partition
/// selection, partition loading, or distance evaluation.
pub async fn search_ivf(
    &self,
    query_vector: &[f32],
    k: usize,
    nprobes: Option<usize>,
) -> Result<Vec<SearchResult>> {
    // 1. Distance from the query to every centroid.
    let distances = compute_distances_to_centroids(query_vector)?;

    // 2. Pick the nearest `nprobes` partitions.
    let nprobes = nprobes.unwrap_or(20);
    let top_partitions = select_top_k(&distances, nprobes)?;

    // 3. Exhaustively scan the selected partitions.
    let mut candidates = Vec::new();
    for partition_idx in top_partitions {
        let vectors = get_partition_vectors(partition_idx)?;
        // NOTE(review): `row_id` is the position inside the partition, so ids
        // from different partitions can collide. Confirm whether the partition
        // should supply global row ids instead.
        for (row_id, vector) in vectors.iter().enumerate() {
            let distance = l2_distance(query_vector, &vector)?;
            candidates.push((row_id as u64, distance));
        }
    }

    // 4. Keep the `k` closest candidates. `total_cmp` defines a total order on
    // floats, so a NaN distance can no longer panic the sort the way
    // `partial_cmp(..).unwrap()` did.
    candidates.sort_by(|a, b| a.1.total_cmp(&b.1));
    Ok(candidates
        .into_iter()
        .take(k)
        .map(|(row_id, score)| SearchResult {
            row_id,
            score,
            data: None,
        })
        .collect())
}
HNSW 索引搜索
rust
/// Searches an HNSW index for the `k` nearest neighbours of `query_vector`.
///
/// The search enters the graph at the top layer, greedily descends through the
/// upper layers down to layer 1, and finishes with a detailed search on layer 0
/// that produces the final `k` results.
///
/// # Errors
///
/// Propagates any error from the entry-point lookup or the per-layer searches.
pub async fn search_hnsw(
    &self,
    query_vector: &[f32],
    k: usize,
) -> Result<Vec<SearchResult>> {
    // NOTE(review): the original range `1..self.max_layer` implies `max_layer`
    // is a layer count (exclusive bound), so the top layer index is
    // `max_layer - 1`. Confirm against the index definition.
    let top_layer = self.max_layer.saturating_sub(1);

    // Enter the graph at the top layer, as the algorithm requires. The previous
    // code entered at layer 0 and walked upwards, contradicting its own comments.
    let mut current_nearest = start_search_at_layer(top_layer)?;

    // Greedy descent: refine the entry point on each upper layer, top to bottom.
    for layer in (1..self.max_layer).rev() {
        current_nearest = search_layer(&current_nearest, query_vector, layer)?;
    }

    // Detailed search on the bottom layer yields the final top-k.
    let results = search_layer_detailed(&current_nearest, query_vector, 0, k)?;
    Ok(results)
}
量化优化
乘积量化(PQ)
rust
/// Product quantizer: encodes a vector as one `u8` code per sub-vector.
pub struct ProductQuantizer {
    /// Number of equally sized sub-vectors each input vector is split into.
    num_subvectors: usize,
    /// Bits used per sub-vector code (not read by `encode` in this snippet).
    bits_per_subvector: u8,
    /// Codebook data.
    // NOTE(review): layout is not visible here; presumably one flattened
    // codebook per sub-vector — confirm.
    codebooks: Vec<Vec<f32>>,
}

impl ProductQuantizer {
    /// Encodes `vector` into `num_subvectors` codes.
    ///
    /// The vector is cut into `num_subvectors` consecutive slices of
    /// `vector.len() / num_subvectors` elements, and each slice is replaced by
    /// the code returned by `find_nearest_centroid`. When the length is not a
    /// multiple of `num_subvectors`, the trailing remainder is not encoded.
    /// A quantizer with zero sub-vectors yields an empty code list.
    ///
    /// # Errors
    ///
    /// Propagates any error returned by `find_nearest_centroid`.
    pub fn encode(&self, vector: &[f32]) -> Result<Vec<u8>> {
        // With zero sub-vectors there is nothing to encode; returning early
        // also avoids the division by zero below.
        if self.num_subvectors == 0 {
            return Ok(Vec::new());
        }

        let subvector_size = vector.len() / self.num_subvectors;
        let mut codes = Vec::with_capacity(self.num_subvectors);
        for i in 0..self.num_subvectors {
            let start = i * subvector_size;
            let subvector = &vector[start..start + subvector_size];
            codes.push(self.find_nearest_centroid(i, subvector)?);
        }
        Ok(codes)
    }
}
真实世界场景
语义搜索示例
rust
/// End-to-end semantic search example: embeds `query_text` and runs a vector
/// search for the `k` closest rows in the dataset at `dataset_uri`.
///
/// The search targets the `"embedding"` column with the L2 metric, probes 20
/// partitions, and applies no filter.
///
/// # Errors
///
/// Propagates failures from opening the dataset, embedding the text, or
/// running the search.
pub async fn semantic_search(
    dataset_uri: &str,
    query_text: &str,
    k: usize,
) -> Result<Vec<SearchResult>> {
    // Open the dataset first, then turn the text into a query vector.
    let ds = Dataset::open(dataset_uri).await?;
    let embedding = embed_text(query_text).await?;

    // Assemble the vector search request.
    let request = SearchQuery {
        column: String::from("embedding"),
        vector: embedding,
        k,
        metric: DistanceMetric::L2,
        nprobes: Some(20),
        filters: None,
    };

    ds.search(&request).await
}
总结
Lance 索引系统特点:
- 多种索引类型:向量索引(IVF、HNSW、FLAT)与标量索引(BTree、Bitmap、Inverted)
- 高性能搜索:量化加速、分区优化
- 灵活配置:nprobes、metric、filters
- 生产级质量:版本管理、缓存优化
- 易用接口:简洁的搜索 API