第7章:编码器与解码器实现
概述
编码器与解码器是 Lance 实现高效数据处理的核心引擎。本章讨论编码器与解码器的接口设计、实现方案和优化策略。
编码器接口设计
rust
/// Common interface for batch encoders.
///
/// Implementors must be `Send + Sync`.
pub trait Encoder: Send + Sync {
    /// Encodes one batch and returns the encoded result.
    fn encode(&mut self, batch: &RecordBatch) -> Result<EncodedBatch>;
    /// Returns the encoding configuration (scheme) this encoder applies.
    fn encoding_type(&self) -> EncodingType;
    /// Returns encoding statistics, or `None` when none are available.
    fn statistics(&self) -> Option<EncodingStatistics>;
}
/// Result produced by `Encoder::encode`.
pub struct EncodedBatch {
    /// Encoded bytes.
    pub data: Vec<u8>,
    /// Page headers that accompany `data`.
    pub page_headers: Vec<PageHeader>,
    /// Statistics recorded for this encoded batch.
    pub statistics: EncodingStatistics,
}
/// Identifies the encoding scheme applied by an encoder.
///
/// The enum is plain data, so it derives the common value traits: it can be
/// printed, copied, compared, and used as a hash-map key.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum EncodingType {
    /// Bit-packing; the payload is the bit width used.
    Bitpacking(u32),
    /// Dictionary encoding.
    Dictionary,
    /// Delta encoding.
    Delta,
    /// Run-length encoding.
    RLE,
    /// Prefix encoding.
    Prefix,
    /// Plain (unencoded) values.
    Plain,
}
Encoder 实例
rust
/// Encoder that packs batch data through `bitpack_array`.
pub struct BitpackingEncoder {
    /// Bit width handed to `bitpack_array` and reported via `EncodingType::Bitpacking`.
    bit_width: u32,
    /// Internal byte buffer; `statistics()` reports its length as the encoded size.
    buffer: Vec<u8>,
}
impl Encoder for BitpackingEncoder {
    /// Bit-packs `batch.data` at `self.bit_width` and returns the packed bytes
    /// together with statistics describing this call.
    ///
    /// # Errors
    ///
    /// Propagates any error returned by `bitpack_array`.
    fn encode(&mut self, batch: &RecordBatch) -> Result<EncodedBatch> {
        let encoded_data = bitpack_array(&batch.data, self.bit_width)?;

        // Measure the bytes produced by this call. `self.buffer` is never
        // written here, so its length says nothing about the encoded output.
        let original_size = batch.size();
        let encoded_size = encoded_data.len();
        // A zero-size batch has no meaningful ratio; report 1.0 (the same
        // neutral value `statistics()` uses) instead of NaN or infinity.
        let compression_ratio = if original_size == 0 {
            1.0
        } else {
            (encoded_size as f64) / (original_size as f64)
        };

        Ok(EncodedBatch {
            data: encoded_data,
            page_headers: vec![],
            statistics: EncodingStatistics {
                original_size,
                encoded_size,
                compression_ratio,
            },
        })
    }

    /// Reports bit-packing together with the configured bit width.
    fn encoding_type(&self) -> EncodingType {
        EncodingType::Bitpacking(self.bit_width)
    }

    /// Returns a statistics snapshot derived from the internal buffer only:
    /// original size 0, encoded size `self.buffer.len()`, ratio 1.0.
    ///
    /// NOTE(review): `encode` does not write to `self.buffer`, so this does
    /// not reflect previously encoded batches — confirm the intended source.
    fn statistics(&self) -> Option<EncodingStatistics> {
        Some(EncodingStatistics {
            original_size: 0,
            encoded_size: self.buffer.len(),
            compression_ratio: 1.0,
        })
    }
}
解码器接口设计
rust
/// Common interface for batch decoders.
///
/// Implementors must be `Send + Sync`.
pub trait Decoder: Send + Sync {
    /// Decodes a batch from `data`, described by `page_header`.
    ///
    /// `row_range` is optional; when `None`, no row range is requested.
    async fn decode(
        &self,
        data: &[u8],
        page_header: &PageHeader,
        row_range: Option<Range<u64>>,
    ) -> Result<RecordBatch>;
    /// Returns decoding statistics, or `None` when none are available.
    fn statistics(&self) -> Option<DecodingStatistics>;
}
/// Counters describing one decoding pass.
///
/// All fields are plain integers, so the type derives the common value
/// traits; `Default` yields an all-zero record.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub struct DecodingStatistics {
    /// Size of the encoded input.
    pub encoded_size: u64,
    /// Size of the decoded output.
    pub decoded_size: u64,
    /// Time spent decoding, in milliseconds.
    pub decoding_time_ms: u64,
}
Decoder 实例
rust
/// Decoder that unpacks data through `bitunpack_array`.
pub struct BitpackingDecoder {
    /// Bit width handed to `bitunpack_array`.
    bit_width: u32,
}
impl Decoder for BitpackingDecoder {
    /// Unpacks `page_header.num_rows` values from `data` at `self.bit_width`,
    /// then narrows the result to `row_range` when one is given.
    ///
    /// An inverted range (`start > end`) is treated as empty, matching how
    /// `std::ops::Range` defines emptiness, instead of underflowing.
    ///
    /// # Errors
    ///
    /// Propagates errors from `bitunpack_array` and from slicing the decoded
    /// batch.
    async fn decode(
        &self,
        data: &[u8],
        page_header: &PageHeader,
        row_range: Option<Range<u64>>,
    ) -> Result<RecordBatch> {
        // No timer is started here: `&self` offers nowhere to record elapsed
        // time and `statistics()` reports nothing.
        let decoded = bitunpack_array(data, self.bit_width, page_header.num_rows)?;

        // Restrict the decoded batch to the requested rows, if any.
        let batch = match row_range {
            Some(range) => {
                let len = range.end.saturating_sub(range.start);
                decoded.slice(range.start, len)?
            }
            None => decoded,
        };
        Ok(batch)
    }

    /// This decoder does not collect statistics; always returns `None`.
    fn statistics(&self) -> Option<DecodingStatistics> {
        None
    }
}
批次编码流程
rust
/// Encodes whole record batches column by column.
pub struct BatchEncoder {
    /// Boxed encoders keyed by string.
    /// NOTE(review): presumably keyed by column name — confirm against `select_encoder`.
    encoders: HashMap<String, Box<dyn Encoder>>,
    /// Configuration passed to `compress` for every encoded column.
    compression_config: CompressionConfig,
}
impl BatchEncoder {
    /// Encodes every column of `batch`, one after another in schema order,
    /// and collects the compressed columns into a single `EncodedBatch`.
    ///
    /// # Errors
    ///
    /// Stops at the first column whose lookup, encoding, or compression
    /// fails and returns that error.
    pub async fn encode_batch(
        &mut self,
        batch: &RecordBatch,
    ) -> Result<EncodedBatch> {
        let mut result = EncodedBatch::new();

        for field in batch.schema().fields() {
            // Fetch the column that belongs to this schema field.
            let column = batch.column_by_name(field.name())?;

            // Pick an encoder for the column, encode it, then compress the
            // encoded output with the configured compression settings.
            let column_encoder = self.select_encoder(field, column);
            let raw = column_encoder.encode(&column)?;
            let packed = self.compress(&raw, &self.compression_config)?;

            result.add_column(field.name(), packed);
        }

        Ok(result)
    }
}
总结
编码器与解码器是 Lance 的核心优化组件:
- 接口统一:编码器与解码器采用一致的 trait 接口设计
- 批次处理:按批次并行编解码
- RepDef 支持:支持嵌套数据
- 缓存优化:通过批次缓存减少开销
- 核心权衡:合并粒度与批次并行
下一章讨论 IO 抽象与对象存储。