第22章:表达式与投影
🎯 核心概览
表达式和投影是查询执行的基础。表达式用于计算结果列的值(如函数调用、算术运算、条件判断),投影用于选择和转换输出的列。Lance 实现了一套高效的表达式求值和投影下推机制,使查询性能获得数倍提升。
能力:从任意表达式高效计算结果,并支持通过投影下推减少 IO。
📐 表达式类型与求值
表达式体系
rust
/// Expression tree evaluated against a `RecordBatch`.
///
/// `Clone` is derived because predicate splitting in this file clones
/// sub-expressions (`predicate.clone()`); `Debug` is derived for diagnostics.
/// NOTE(review): both derives assume `ScalarValue` and `DataType` implement
/// `Debug` and `Clone` — confirm against the types actually imported.
#[derive(Debug, Clone)]
pub enum Expr {
    /// A constant value.
    Literal(ScalarValue),
    /// A reference to an input column.
    Column {
        name: String,
        /// Index of the column inside the `RecordBatch`.
        index: usize,
    },
    /// Unary operation.
    UnaryOp {
        /// NOT, -, etc.
        op: UnaryOperator,
        operand: Box<Expr>,
    },
    /// Binary operation.
    BinaryOp {
        left: Box<Expr>,
        /// +, -, *, /, <, >, ==, AND, OR, etc.
        op: BinaryOperator,
        right: Box<Expr>,
    },
    /// Function call.
    Function {
        name: String,
        args: Vec<Expr>,
        return_type: DataType,
    },
    /// Conditional expression (CASE WHEN).
    Case {
        operand: Option<Box<Expr>>,
        when_then_expr: Vec<(Box<Expr>, Box<Expr>)>,
        else_expr: Option<Box<Expr>>,
    },
    /// Projection (column selection by index).
    Project(Vec<usize>),
    /// Vector search.
    VectorSearch {
        column: String,
        query: Box<Expr>,
        k: usize,
    },
    /// Distance between two vector expressions.
    VectorDistance {
        left: Box<Expr>,
        right: Box<Expr>,
        /// One of "l2", "cosine", "dot".
        metric: String,
    },
}

/// Operators taking a single operand.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum UnaryOperator {
    /// `!`
    Not,
    /// `-`
    Neg,
    /// `IS NULL`
    IsNull,
    /// `IS NOT NULL`
    IsNotNull,
}

/// Operators taking two operands.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum BinaryOperator {
    // Arithmetic
    Add,
    Sub,
    Mul,
    Div,
    Rem,
    // Comparison
    Eq,
    Lt,
    Gt,
    LtEq,
    GtEq,
    NotEq,
    // Logical
    And,
    Or,
    // String
    Concat,
}
表达式求值
rust
/// Something that can be evaluated against a batch of rows.
pub trait Evaluator {
    /// Evaluates `self` over `batch`, returning the resulting array or an error.
    fn evaluate(&self, batch: &RecordBatch) -> Result<ArrayRef>;
}
impl Evaluator for Expr {
    /// Evaluates this expression over `batch`.
    ///
    /// # Errors
    /// Returns an error when a column index is out of range, an operator or
    /// expression kind is not supported by this evaluator, a function is
    /// unknown or receives too few arguments, or a sub-expression fails.
    ///
    /// NOTE(review): the `as_boolean` / `as_primitive` downcasts presumably
    /// panic when the operand has a different Arrow type (arrow `AsArray`
    /// semantics) — confirm, and type-check expressions before evaluation.
    fn evaluate(&self, batch: &RecordBatch) -> Result<ArrayRef> {
        match self {
            // Literal: broadcast to one value per row.
            Expr::Literal(val) => Ok(Arc::new(val.to_array(batch.num_rows()))),
            // Column reference: hand back the existing column.
            Expr::Column { name, index } => {
                // Report a bad index as an error instead of panicking inside `column()`.
                if *index >= batch.num_columns() {
                    return Err(format!(
                        "Column index {} out of bounds for column '{}' (batch has {} columns)",
                        index,
                        name,
                        batch.num_columns()
                    )
                    .into());
                }
                Ok(batch.column(*index).clone())
            }
            // Unary operations.
            Expr::UnaryOp { op, operand } => {
                let operand_val = operand.evaluate(batch)?;
                match op {
                    UnaryOperator::Not => {
                        let bool_array = operand_val.as_boolean();
                        Ok(Arc::new(compute::not(bool_array)?))
                    }
                    UnaryOperator::Neg => {
                        let num_array = operand_val.as_primitive::<Float32Type>();
                        // Negate the value inside each slot so nulls stay null.
                        let negated: Vec<Option<f32>> =
                            num_array.iter().map(|v| v.map(|x| -x)).collect();
                        Ok(Arc::new(Float32Array::from(negated)))
                    }
                    // Null checks read the validity bitmap, so they work for any array type.
                    UnaryOperator::IsNull => {
                        Ok(Arc::new(compute::is_null(operand_val.as_ref())?))
                    }
                    UnaryOperator::IsNotNull => {
                        Ok(Arc::new(compute::is_not_null(operand_val.as_ref())?))
                    }
                }
            }
            // Binary operations.
            Expr::BinaryOp { left, op, right } => {
                let left_val = left.evaluate(batch)?;
                let right_val = right.evaluate(batch)?;
                match op {
                    // Arithmetic
                    BinaryOperator::Add => {
                        self.binary_arithmetic(&left_val, &right_val, |a, b| a + b)
                    }
                    BinaryOperator::Sub => {
                        self.binary_arithmetic(&left_val, &right_val, |a, b| a - b)
                    }
                    BinaryOperator::Mul => {
                        self.binary_arithmetic(&left_val, &right_val, |a, b| a * b)
                    }
                    BinaryOperator::Div => {
                        self.binary_arithmetic(&left_val, &right_val, |a, b| a / b)
                    }
                    BinaryOperator::Rem => {
                        self.binary_arithmetic(&left_val, &right_val, |a, b| a % b)
                    }
                    // Comparison
                    BinaryOperator::Eq => {
                        self.binary_comparison(&left_val, &right_val, |a, b| a == b)
                    }
                    BinaryOperator::NotEq => {
                        self.binary_comparison(&left_val, &right_val, |a, b| a != b)
                    }
                    BinaryOperator::Lt => {
                        self.binary_comparison(&left_val, &right_val, |a, b| a < b)
                    }
                    BinaryOperator::LtEq => {
                        self.binary_comparison(&left_val, &right_val, |a, b| a <= b)
                    }
                    BinaryOperator::Gt => {
                        self.binary_comparison(&left_val, &right_val, |a, b| a > b)
                    }
                    BinaryOperator::GtEq => {
                        self.binary_comparison(&left_val, &right_val, |a, b| a >= b)
                    }
                    // Logical
                    BinaryOperator::And => {
                        let left_bool = left_val.as_boolean();
                        let right_bool = right_val.as_boolean();
                        Ok(Arc::new(compute::and(left_bool, right_bool)?))
                    }
                    BinaryOperator::Or => {
                        let left_bool = left_val.as_boolean();
                        let right_bool = right_val.as_boolean();
                        Ok(Arc::new(compute::or(left_bool, right_bool)?))
                    }
                    // String concatenation has no implementation in this evaluator.
                    BinaryOperator::Concat => Err("Unsupported binary op".into()),
                }
            }
            // Function calls.
            Expr::Function { name, args, .. } => {
                let arg_values = args
                    .iter()
                    .map(|arg| arg.evaluate(batch))
                    .collect::<Result<Vec<_>>>()?;
                // Slice patterns check the arity up front, so a call with too few
                // arguments yields an error rather than an index panic. Surplus
                // arguments are ignored.
                match (name.as_str(), arg_values.as_slice()) {
                    ("LENGTH", [arg, ..]) => self.function_length(arg),
                    ("UPPER", [arg, ..]) => self.function_upper(arg),
                    ("LOWER", [arg, ..]) => self.function_lower(arg),
                    ("ROUND", [value, digits, ..]) => self.function_round(value, digits),
                    ("ABS", [arg, ..]) => self.function_abs(arg),
                    ("LENGTH" | "UPPER" | "LOWER" | "ROUND" | "ABS", _) => Err(format!(
                        "Too few arguments for function {}: got {}",
                        name,
                        arg_values.len()
                    )
                    .into()),
                    _ => Err(format!("Unknown function: {}", name).into()),
                }
            }
            // CASE WHEN expression.
            Expr::Case { operand, when_then_expr, else_expr } => {
                self.evaluate_case(batch, operand, when_then_expr, else_expr)
            }
            // Vector search: delegated to `vector_search` with the evaluated query.
            Expr::VectorSearch { column, query, k } => {
                let query_vec = query.evaluate(batch)?;
                self.vector_search(batch, column, &query_vec, *k)
            }
            // Vector distance.
            Expr::VectorDistance { left, right, metric } => {
                let left_vec = left.evaluate(batch)?;
                let right_vec = right.evaluate(batch)?;
                self.vector_distance(&left_vec, &right_vec, metric)
            }
            // `Project` selects several columns and cannot produce a single array.
            Expr::Project(_) => Err("Unknown expression type".into()),
        }
    }
}
impl Expr {
// 二元算术操作的通用实现
fn binary_arithmetic<F>(
&self,
left: &ArrayRef,
right: &ArrayRef,
op: F,
) -> Result<ArrayRef>
where
F: Fn(f32, f32) -> f32,
{
let left_arr = left.as_primitive::<Float32Type>();
let right_arr = right.as_primitive::<Float32Type>();
let result: Vec<_> = left_arr.iter()
.zip(right_arr.iter())
.map(|(a, b)| {
match (a, b) {
(Some(&av), Some(&bv)) => Some(op(av, bv)),
_ => None,
}
})
.collect();
Ok(Arc::new(Float32Array::from(result)))
}
// 向量距离计算
fn vector_distance(
&self,
left: &ArrayRef,
right: &ArrayRef,
metric: &str,
) -> Result<ArrayRef> {
let left_fsl = left.as_fixed_size_list();
let right_fsl = right.as_fixed_size_list();
let distances: Vec<_> = (0..left_fsl.len())
.map(|i| {
let left_vec = left_fsl.value(i);
let right_vec = right_fsl.value(i);
match metric {
"l2" => {
let left_arr = left_vec.as_primitive::<Float32Type>();
let right_arr = right_vec.as_primitive::<Float32Type>();
let dist: f32 = left_arr.iter()
.zip(right_arr.iter())
.map(|(a, b)| match (a, b) {
(Some(&av), Some(&bv)) => (av - bv).powi(2),
_ => 0.0,
})
.sum();
Some(dist.sqrt())
}
"cosine" => {
// 余弦相似度实现
let left_arr = left_vec.as_primitive::<Float32Type>();
let right_arr = right_vec.as_primitive::<Float32Type>();
let dot: f32 = left_arr.iter()
.zip(right_arr.iter())
.map(|(a, b)| match (a, b) {
(Some(&av), Some(&bv)) => av * bv,
_ => 0.0,
})
.sum();
let left_norm: f32 = left_arr.iter()
.map(|a| match a {
Some(&v) => v.powi(2),
None => 0.0,
})
.sum::<f32>()
.sqrt();
let right_norm: f32 = right_arr.iter()
.map(|b| match b {
Some(&v) => v.powi(2),
None => 0.0,
})
.sum::<f32>()
.sqrt();
if left_norm > 0.0 && right_norm > 0.0 {
Some(1.0 - dot / (left_norm * right_norm))
} else {
Some(f32::MAX)
}
}
_ => None,
}
})
.collect();
Ok(Arc::new(Float32Array::from(distances)))
}
}
🎯 投影与列选择
投影的作用
投影操作用于:
- 选择列:从多列中选出需要的列
- 列重排:改变列的顺序
- 列转换:应用表达式转换列值
- 去重:`SELECT DISTINCT`
rust
/// Execution-plan node that evaluates a list of expressions over each input batch.
pub struct ProjectionPlan {
    /// Plan producing the batches to project.
    pub input: Arc<dyn ExecutionPlan>,
    /// Projection expressions, one per output column.
    pub expressions: Vec<Expr>,
    /// Output column names; zipped with `expressions` when the schema is built.
    pub output_names: Vec<String>,
}
#[async_trait]
impl ExecutionPlan for ProjectionPlan {
async fn execute(
&self,
partition: usize,
state: Arc<TaskContext>,
) -> Result<SendableRecordBatchStream> {
// 1. 执行输入计划
let input_stream = self.input.execute(partition, state).await?;
// 2. 对每个 batch 应用投影
let output_stream = input_stream.map(|batch| {
self.project_batch(&batch)
});
Ok(Box::pin(output_stream))
}
fn schema(&self) -> SchemaRef {
// 构建投影后的 schema
let fields: Vec<_> = self.output_names.iter()
.zip(&self.expressions)
.map(|(name, expr)| {
Field::new(name.clone(), expr.return_type(), true)
})
.collect();
Arc::new(Schema::new(fields))
}
}
impl ProjectionPlan {
    /// Evaluates every projection expression against `batch` and assembles the
    /// resulting arrays into a new batch that uses this plan's output schema.
    ///
    /// Evaluation stops at the first expression that fails.
    fn project_batch(&self, batch: &RecordBatch) -> Result<RecordBatch> {
        let projected = self
            .expressions
            .iter()
            .map(|expr| expr.evaluate(batch))
            .collect::<Result<Vec<_>>>()?;
        let output = RecordBatch::try_new(self.schema(), projected)?;
        Ok(output)
    }
}
投影下推优化
投影下推的目标是尽早减少列,减少 IO 和内存消耗。
rust
/// Optimizer rule that pushes column requirements down towards the scan.
pub struct ProjectionPushdown;

impl ProjectionPushdown {
    /// Rewrites `plan` so that only the columns actually referenced are read.
    ///
    /// NOTE(review): the structural patterns below (`box` patterns over plan
    /// nodes) and the `...` placeholders are illustrative and left as in the
    /// original; a real implementation needs downcasting on the plan type.
    pub fn optimize(&self, plan: &ExecutionPlan) -> Result<Arc<dyn ExecutionPlan>> {
        match plan {
            // Projection over scan: push the column list into the scan.
            ProjectionPlan {
                input: box ScanPlan { table, .. },
                expressions,
                ..
            } => {
                // Columns referenced by the projection expressions (sorted, unique).
                let required_columns = self.extract_columns(expressions);
                // NOTE(review): returning only the narrowed scan drops the
                // projection expressions themselves, and column indices inside
                // them would need remapping to the narrowed scan — confirm intent.
                Ok(Arc::new(ScanPlan {
                    table: table.clone(),
                    projection: Some(required_columns),
                    // ...
                }))
            }
            // Projection over filter: the filter's columns are needed as well.
            ProjectionPlan {
                input: box FilterPlan { predicate, .. },
                expressions,
                ..
            } => {
                // Required columns = projection columns + filter columns.
                let mut required = self.extract_columns(expressions);
                self.extract_columns_from_expr_into(predicate, &mut required);
                required.sort_unstable();
                required.dedup();
                // ... push down
                Ok(Arc::new(...))
            }
            _ => Ok(Arc::new(plan.clone())),
        }
    }

    /// Returns the sorted, de-duplicated column indices referenced by `expressions`.
    fn extract_columns(&self, expressions: &[Expr]) -> Vec<usize> {
        let mut columns = vec![];
        for expr in expressions {
            self.extract_columns_from_expr_into(expr, &mut columns);
        }
        columns.sort_unstable();
        columns.dedup();
        columns
    }

    /// Appends every column index referenced anywhere inside `expr` to `columns`.
    ///
    /// The match is exhaustive on purpose: a column missed here would be pruned
    /// from the scan even though evaluation still needs it.
    fn extract_columns_from_expr_into(&self, expr: &Expr, columns: &mut Vec<usize>) {
        match expr {
            Expr::Literal(_) => {}
            Expr::Column { index, .. } => columns.push(*index),
            Expr::Project(indices) => columns.extend(indices.iter().copied()),
            Expr::UnaryOp { operand, .. } => {
                self.extract_columns_from_expr_into(operand, columns);
            }
            Expr::BinaryOp { left, right, .. } | Expr::VectorDistance { left, right, .. } => {
                self.extract_columns_from_expr_into(left, columns);
                self.extract_columns_from_expr_into(right, columns);
            }
            Expr::Function { args, .. } => {
                for arg in args {
                    self.extract_columns_from_expr_into(arg, columns);
                }
            }
            Expr::Case { operand, when_then_expr, else_expr } => {
                if let Some(operand) = operand {
                    self.extract_columns_from_expr_into(operand, columns);
                }
                for (when, then) in when_then_expr {
                    self.extract_columns_from_expr_into(when, columns);
                    self.extract_columns_from_expr_into(then, columns);
                }
                if let Some(else_expr) = else_expr {
                    self.extract_columns_from_expr_into(else_expr, columns);
                }
            }
            // NOTE(review): the searched column is identified by name only, so its
            // index cannot be collected here — the caller must resolve it.
            Expr::VectorSearch { query, .. } => {
                self.extract_columns_from_expr_into(query, columns);
            }
        }
    }
}
🔍 谓词分析与下推
谓词分析
谓词分析用于识别哪些条件可以在索引层面处理,以减少数据扫描。
rust
/// Decides which parts of a predicate can be handled at the index layer.
pub struct PredicateAnalyzer {
    // NOTE(review): `schema` is not read by any method visible in this file.
    schema: SchemaRef,
    /// Index information; queried via `has_index(column_name)`.
    index_info: IndexInfo,
}
/// How much of a predicate can be pushed down.
///
/// The enum is a plain tag, so it derives the usual value-type traits: results
/// can be compared, copied and printed without pattern matching.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum PredicatePushdownSupport {
    /// Cannot be pushed down.
    Unsupported,
    /// Partially supported (e.g. additional filtering is still required).
    Partial,
    /// Fully supported.
    Full,
}
impl PredicateAnalyzer {
    /// Classifies how far `predicate` can be pushed down.
    ///
    /// - `column = literal` (in either operand order): `Full` when the column
    ///   has an index, otherwise `Partial`.
    /// - `a AND b`: `Full` when both sides are `Full`, `Unsupported` when
    ///   either side is `Unsupported`, otherwise `Partial`.
    /// - Vector search: `Unsupported`.
    /// - Anything else, including range comparisons: `Partial`.
    pub fn analyze_pushdown(&self, predicate: &Expr) -> PredicatePushdownSupport {
        match predicate {
            // Equality lookup. Operands are inspected through `as_ref()` so no
            // unstable `box` patterns are needed, and the mirrored form
            // `literal = column` is treated like `column = literal`.
            Expr::BinaryOp {
                left,
                op: BinaryOperator::Eq,
                right,
            } => match (left.as_ref(), right.as_ref()) {
                (Expr::Column { name, .. }, Expr::Literal(_))
                | (Expr::Literal(_), Expr::Column { name, .. }) => {
                    // The index can only answer the lookup if the column has one.
                    if self.index_info.has_index(name) {
                        PredicatePushdownSupport::Full
                    } else {
                        PredicatePushdownSupport::Partial
                    }
                }
                _ => PredicatePushdownSupport::Partial,
            },
            // Conjunction: combine the verdicts of both sides.
            Expr::BinaryOp {
                left,
                op: BinaryOperator::And,
                right,
            } => {
                let left_support = self.analyze_pushdown(left);
                let right_support = self.analyze_pushdown(right);
                match (left_support, right_support) {
                    (PredicatePushdownSupport::Full, PredicatePushdownSupport::Full) => {
                        PredicatePushdownSupport::Full
                    }
                    (PredicatePushdownSupport::Unsupported, _)
                    | (_, PredicatePushdownSupport::Unsupported) => {
                        PredicatePushdownSupport::Unsupported
                    }
                    _ => PredicatePushdownSupport::Partial,
                }
            }
            // Vector search needs special handling and is never pushed down here.
            Expr::VectorSearch { .. } => PredicatePushdownSupport::Unsupported,
            _ => PredicatePushdownSupport::Partial,
        }
    }

    /// Splits `predicate` into `(pushdown, residual)`.
    ///
    /// `pushdown` holds the conjuncts rated `Full` (handled at the index
    /// layer); `residual` holds everything else (handled by the executor).
    /// Only top-level `AND` chains are split; any other expression is
    /// classified as a whole.
    pub fn split_predicate(&self, predicate: &Expr) -> (Option<Expr>, Option<Expr>) {
        match predicate {
            Expr::BinaryOp {
                left,
                op: BinaryOperator::And,
                right,
            } => {
                let (push_left, no_push_left) = self.split_predicate(left);
                let (push_right, no_push_right) = self.split_predicate(right);
                (
                    Self::conjoin(push_left, push_right),
                    Self::conjoin(no_push_left, no_push_right),
                )
            }
            _ => {
                // Single predicate: pushable only when fully supported.
                if matches!(
                    self.analyze_pushdown(predicate),
                    PredicatePushdownSupport::Full
                ) {
                    (Some(predicate.clone()), None)
                } else {
                    (None, Some(predicate.clone()))
                }
            }
        }
    }

    /// Joins two optional predicates with `AND`, dropping whichever side is absent.
    fn conjoin(lhs: Option<Expr>, rhs: Option<Expr>) -> Option<Expr> {
        match (lhs, rhs) {
            (Some(l), Some(r)) => Some(Expr::BinaryOp {
                left: Box::new(l),
                op: BinaryOperator::And,
                right: Box::new(r),
            }),
            (Some(e), None) | (None, Some(e)) => Some(e),
            (None, None) => None,
        }
    }
}
📊 性能优化示例
CASE WHEN 优化
python
# Example: derive categorical columns with CASE WHEN inside a SQL projection.
# NOTE(review): `lance.write_table` and `table.search_sql` are used exactly as in
# the original example; confirm both exist in the installed Lance Python API.
import lance
import numpy as np
# Build sample data: 1000 rows with an id, a random price and a random category.
data = {
"id": np.arange(1000),
"price": np.random.randint(10, 1000, 1000),
"category": np.random.choice(["A", "B", "C"], 1000),
}
table = lance.write_table(data, uri="data.lance")
# SQL query: classify rows with CASE WHEN (searched form for price_tier,
# simple form for category_code), keeping only rows with price > 50.
result = table.search_sql("""
SELECT
id,
price,
CASE
WHEN price < 100 THEN 'cheap'
WHEN price < 500 THEN 'medium'
ELSE 'expensive'
END as price_tier,
CASE category
WHEN 'A' THEN 1
WHEN 'B' THEN 2
WHEN 'C' THEN 3
ELSE 0
END as category_code
FROM data
WHERE price > 50
""")
# Convert to Pandas
df = result.to_pandas()
print(df)
列投影优化
python
# NOTE(review): relies on a `table` object created earlier; `search_sql` should be
# confirmed against the installed Lance Python API.
# Before: every column is scanned
result = table.search_sql("""
SELECT * FROM data WHERE price > 100
""")
# After: only the needed columns are scanned (projection pushdown)
result = table.search_sql("""
SELECT id, price FROM data WHERE price > 100
""")
# Expected gain: if the table has 100 columns and only 2 are needed,
# the data read shrinks by roughly 100 / 2 = 50x (rough estimate that
# presumably assumes columns of similar size and an IO-bound query).
📚 总结
表达式与投影是查询执行的核心:
- 表达式求值:支持常见的算术、比较、逻辑运算,以及内置函数、CASE WHEN 和向量距离计算
- 投影下推:尽早减少列,减少 IO
- 谓词分析:判断条件是否可在索引层处理
- 性能优化:通过下推和融合大幅降低成本
这些机制共同作用,使 Lance 能够高效地处理复杂的分析查询。