第24章:事务与提交协议
🎯 核心概览
Lance 通过乐观并发控制和分布式锁机制实现 ACID 事务。多个客户端可以并发读写,通过版本机制解决冲突,确保数据一致性和可用性。
🔐 ACID 保证实现
Atomicity(原子性)
原子性通过版本的不可分割性来实现:一次提交产生的新版本要么完整可见,要么完全不可见:
rust
/// Data bundled together for one atomic commit.
///
/// NOTE(review): this struct is not referenced by the other code visible in this
/// section; the field descriptions below are inferred from names and types —
/// confirm against the real call sites.
pub struct AtomicCommit {
/// Presumably the version number this commit creates.
version: u64,
/// Presumably the fragments that make up the new version.
fragments: Vec<Fragment>,
/// Presumably the manifest change recorded for this commit.
manifest_delta: ManifestEntry,
}
impl Dataset {
    /// Commits `changes` as a new dataset version and returns the new version number.
    ///
    /// The new version is the current version plus one. Fragments are prepared and
    /// the manifest is written first; the version is only published when
    /// `update_current_version` is called at the end.
    ///
    /// # Errors
    /// Propagates any failure from reading the current version, preparing the
    /// fragments, writing the manifest, or updating the current-version pointer.
    ///
    /// NOTE(review): the current version is not re-checked between step 1 and
    /// step 4, so two concurrent callers can compute the same `new_version`.
    /// Presumably callers serialize commits through a lock or the OCC layer —
    /// confirm.
    pub async fn commit(
        &mut self,
        changes: Vec<Change>, // the recorded modifications
    ) -> Result<u64> {
        // Step 1: read the version this commit builds on.
        let current_version = self.get_current_version().await?;

        // Step 2: prepare the new version (not yet visible).
        let new_version = current_version + 1;
        let temp_fragments = self.prepare_fragments(&changes).await?;

        // Step 3: write the manifest. Until step 4 runs, the new data stays
        // invisible to readers.
        self.write_manifest_atomic(new_version, &temp_fragments).await?;

        // Step 4: move the current-version pointer; only now does the new
        // version become visible.
        self.update_current_version(new_version).await?;

        Ok(new_version)
    }

    /// Writes the manifest for `version` using write-to-temp, fsync, rename.
    ///
    /// The manifest is serialized with bincode into `_manifest/v{version}.tmp`,
    /// flushed to stable storage, and then renamed to
    /// `_manifest/v{version}_manifest`.
    ///
    /// # Errors
    /// Propagates serialization and I/O errors.
    async fn write_manifest_atomic(
        &self,
        version: u64,
        fragments: &[Fragment],
    ) -> Result<()> {
        // 1. Write the serialized manifest to a temporary file.
        let temp_path = self.path.join(format!("_manifest/v{}.tmp", version));
        let manifest = self.create_manifest_entry(version, fragments);
        let data = bincode::serialize(&manifest)?;

        // 2. Write and fsync through the SAME handle. Re-opening the path with
        //    `File::create` would truncate the file that was just written, and
        //    an empty manifest would then be renamed into place.
        let mut file = tokio::fs::File::create(&temp_path).await?;
        tokio::io::AsyncWriteExt::write_all(&mut file, &data).await?;
        file.sync_all().await?;
        drop(file);

        // 3. Rename into the final location.
        // NOTE(review): for full crash durability the parent directory should
        // also be fsynced after the rename — confirm whether that is required
        // for the target file systems.
        let final_path = self.path.join(format!("_manifest/v{}_manifest", version));
        tokio::fs::rename(&temp_path, &final_path).await?;

        Ok(())
    }
}
Consistency(一致性)
Schema 验证和约束检查:
rust
/// Validates record batches against a list of declared constraints.
pub struct ConsistencyChecker {
/// Schema handle. NOTE(review): not read by the `validate` method shown in
/// this section — confirm where it is used.
schema: SchemaRef,
/// Constraints that `validate` walks for each batch.
constraints: Vec<Constraint>,
}
/// A single integrity constraint that `ConsistencyChecker` can enforce.
#[derive(Debug)]
pub enum Constraint {
NotNull(String), // NOT NULL constraint on the named column
Unique(String), // UNIQUE constraint on the named column
PrimaryKey(Vec<String>), // primary key (one or more columns)
ForeignKey { // foreign key
column: String,
ref_table: String,
ref_column: String,
},
}
impl ConsistencyChecker {
    /// Checks `batch` against every configured constraint, in order.
    ///
    /// Returns `Ok(())` when no checked constraint is violated.
    ///
    /// # Errors
    /// - a constrained column is missing from `batch`;
    /// - a `NotNull` column contains at least one null;
    /// - a `Unique` column contains duplicate non-null values.
    ///
    /// `PrimaryKey` and `ForeignKey` checks are elided in this excerpt and
    /// currently accept every batch.
    pub async fn validate(&self, batch: &RecordBatch) -> Result<()> {
        for constraint in &self.constraints {
            match constraint {
                Constraint::NotNull(col) => {
                    // A missing column is reported explicitly; `?` cannot be
                    // applied directly to the `Option` returned by the lookup.
                    let array = batch
                        .column_by_name(col)
                        .ok_or_else(|| format!("column '{}' not found in batch", col))?;
                    if array.null_count() > 0 {
                        return Err(format!(
                            "NOT NULL constraint violated on column '{}'",
                            col
                        ).into());
                    }
                }
                Constraint::Unique(col) => {
                    let array = batch
                        .column_by_name(col)
                        .ok_or_else(|| format!("column '{}' not found in batch", col))?;
                    // Nulls are skipped, so the distinct count is compared
                    // against the number of non-null values only.
                    let set: std::collections::HashSet<_> = array.iter()
                        .flatten()
                        .collect();
                    if set.len() != array.len() - array.null_count() {
                        return Err(format!(
                            "UNIQUE constraint violated on column '{}'",
                            col
                        ).into());
                    }
                }
                Constraint::PrimaryKey(_cols) => {
                    // Check uniqueness of the composite primary key.
                    // ...
                }
                Constraint::ForeignKey { .. } => {
                    // Check that the referenced foreign-key row exists.
                    // ...
                }
            }
        }
        Ok(())
    }
}
Isolation(隔离性)
基于 MVCC(多版本并发控制):
rust
/// Transaction isolation levels selectable when a transaction begins.
pub enum IsolationLevel {
ReadUncommitted, // dirty reads allowed (not recommended)
ReadCommitted, // read committed
RepeatableRead, // repeatable read
Serializable, // serializable
}
/// Per-transaction state created by `Dataset::begin_txn`.
pub struct TransactionContext {
/// Identifier allocated for this transaction.
txn_id: u64,
start_version: u64, // dataset version at the moment the transaction began
/// Isolation level requested by the caller.
isolation_level: IsolationLevel,
// NOTE(review): described here as versions, but
// `OptimisticConcurrencyControl::detect_conflict` compares these values
// against `fragment.id` — confirm which is intended.
read_set: Vec<u64>, // versions that were read
write_set: Vec<(u64, Change)>, // changes that were written
}
impl Dataset {
    /// Starts a transaction pinned to the dataset's current version.
    ///
    /// The returned context records the current version as `start_version` and
    /// begins with empty read and write sets.
    ///
    /// # Errors
    /// Propagates failures from reading the current version or allocating a
    /// transaction id.
    pub async fn begin_txn(&self, isolation_level: IsolationLevel) -> Result<TransactionContext> {
        let current_version = self.get_current_version().await?;
        Ok(TransactionContext {
            txn_id: self.allocate_txn_id().await?,
            start_version: current_version,
            isolation_level,
            read_set: vec![],
            write_set: vec![],
        })
    }

    /// Reports whether `version` may be read by `txn` under its isolation level.
    fn is_version_visible(&self, version: u64, txn: &TransactionContext) -> bool {
        match txn.isolation_level {
            // Every version is readable.
            IsolationLevel::ReadUncommitted => true,
            // NOTE(review): "read committed" normally sees every version
            // committed so far, not only the start snapshot. This synchronous
            // check has no access to the latest committed version, so it is
            // bounded by `start_version` like the levels below — confirm the
            // intended semantics.
            IsolationLevel::ReadCommitted => version <= txn.start_version,
            // Snapshot read as of transaction start. `Serializable` uses the
            // same visibility rule: additionally requiring the version to be
            // in `read_set` already would make the very first read impossible,
            // because `begin_txn` creates an empty read set. Serializability is
            // presumably enforced by conflict validation at commit time.
            IsolationLevel::RepeatableRead | IsolationLevel::Serializable => {
                version <= txn.start_version
            }
        }
    }
}
Durability(持久性)
持久性通过预写日志(WAL)实现:
rust
/// Write-ahead log that stores each entry as its own file.
pub struct WriteAheadLog {
/// Directory under which `append` creates `wal_{sequence}.log` files and
/// which `recover` scans.
log_path: PathBuf,
/// Sequence number of the most recently appended entry.
current_sequence: u64,
}
/// One serialized record in the write-ahead log.
#[derive(Serialize, Deserialize)]
pub struct LogEntry {
/// Position of this entry in the log; `recover` replays in ascending order.
sequence: u64,
/// Wall-clock time of the append, in milliseconds since the Unix epoch.
timestamp: u64,
/// The logged operation.
operation: Operation,
/// Checksum slot. NOTE(review): `append` currently stores 0 and `recover`
/// does not verify it.
checksum: u32,
}
pub enum Operation {
Write {
version: u64,
fragment_id: u64,
data: Vec<u8>,
},
Delete {
version: u64,
fragment_id: u64,
},
CreateIndex {
version: u64,
index_spec: String,
},
}
impl WriteAheadLog {
    /// Appends `op` to the log and returns the sequence number assigned to it.
    ///
    /// The entry is serialized with bincode into `wal_{sequence}.log` under
    /// `log_path` and flushed to stable storage before this returns.
    ///
    /// # Errors
    /// Propagates clock, serialization, and I/O errors. On failure
    /// `current_sequence` is left unchanged, so a retry reuses the same number.
    pub async fn append(&mut self, op: Operation) -> Result<u64> {
        let sequence = self.current_sequence + 1;
        let entry = LogEntry {
            sequence,
            timestamp: SystemTime::now()
                .duration_since(UNIX_EPOCH)?
                .as_millis() as u64,
            operation: op,
            checksum: 0, // TODO: compute a real checksum
        };

        let log_file = self.log_path.join(format!("wal_{}.log", sequence));
        let data = bincode::serialize(&entry)?;

        // Write and fsync through the SAME handle. Re-opening the path with
        // `File::create` after writing would truncate the entry to zero bytes
        // before the fsync.
        let mut file = tokio::fs::File::create(&log_file).await?;
        tokio::io::AsyncWriteExt::write_all(&mut file, &data).await?;
        file.sync_all().await?;

        // Only consume the sequence number once the entry is durable.
        self.current_sequence = sequence;
        Ok(sequence)
    }

    /// Replays every entry found under `log_path` in ascending sequence order.
    ///
    /// After a successful replay `current_sequence` is advanced to the highest
    /// recovered sequence, so later `append` calls do not overwrite recovered
    /// entries.
    ///
    /// # Errors
    /// Propagates I/O and deserialization errors, and any error from replaying
    /// an entry.
    ///
    /// NOTE(review): every directory entry is treated as a log file; an
    /// unrelated file under `log_path` makes recovery fail at deserialization.
    pub async fn recover(&mut self) -> Result<()> {
        let mut entries = tokio::fs::read_dir(&self.log_path).await?;
        let mut pending_ops = Vec::new();
        while let Some(entry) = entries.next_entry().await? {
            let data = tokio::fs::read(entry.path()).await?;
            let log_entry: LogEntry = bincode::deserialize(&data)?;
            pending_ops.push(log_entry);
        }

        // Replay in log order.
        pending_ops.sort_by_key(|e| e.sequence);
        let highest_sequence = pending_ops.last().map(|e| e.sequence);
        for entry in pending_ops {
            self.replay_log_entry(&entry).await?;
        }

        if let Some(highest) = highest_sequence {
            self.current_sequence = self.current_sequence.max(highest);
        }
        Ok(())
    }
}
⚡ 乐观并发控制
冲突检测
rust
/// Optimistic concurrency control layer that validates a transaction before
/// committing it to the shared dataset.
pub struct OptimisticConcurrencyControl {
/// Shared handle to the dataset being committed to.
dataset: Arc<Dataset>,
}
impl OptimisticConcurrencyControl {
    /// Validates `txn` against versions committed since it began, then commits
    /// `new_changes`.
    ///
    /// Returns the new version number on success.
    ///
    /// # Errors
    /// Returns `"Transaction conflict detected"` when a later version touches
    /// something in the transaction's read set; otherwise propagates errors
    /// from the dataset.
    ///
    /// NOTE(review): validation and commit are not atomic — another writer can
    /// commit between the check and `Dataset::commit`. Also, `Dataset::commit`
    /// takes `&mut self` while `dataset` is an `Arc<Dataset>`; the call below
    /// needs interior mutability or a lock to compile — confirm the intended
    /// design.
    pub async fn commit(
        &self,
        txn: &TransactionContext,
        new_changes: Vec<Change>,
    ) -> Result<u64> {
        // 1. Read the current version.
        let current_version = self.dataset.get_current_version().await?;

        // 2. If other transactions committed in the meantime, look for conflicts.
        if current_version != txn.start_version {
            let has_conflict = self.detect_conflict(
                txn,
                txn.start_version,
                current_version,
            ).await?;
            if has_conflict {
                return Err("Transaction conflict detected".into());
            }
        }

        // 3. No conflict: commit.
        let new_version = self.dataset.commit(new_changes).await?;
        Ok(new_version)
    }

    /// Reports whether any version committed after `from_version`, up to and
    /// including `to_version`, contains a fragment whose id is in the
    /// transaction's read set.
    ///
    /// `from_version` itself is excluded: it is the snapshot the transaction
    /// started from, so its contents cannot conflict with the transaction's
    /// own reads.
    async fn detect_conflict(
        &self,
        txn: &TransactionContext,
        from_version: u64,
        to_version: u64,
    ) -> Result<bool> {
        for version in (from_version + 1)..=to_version {
            let manifest = self.dataset.load_manifest(version).await?;
            for fragment in manifest.fragments {
                // A fragment that appears in the read set counts as a conflict.
                // NOTE(review): `read_set` is documented as holding versions
                // but is compared with fragment ids here — confirm.
                if txn.read_set.contains(&fragment.id) {
                    return Ok(true);
                }
            }
        }
        Ok(false)
    }
}
🔒 分布式锁机制
CommitHandler
rust
pub trait CommitHandler: Send + Sync {
async fn acquire_lock(&self, dataset_path: &str) -> Result<Lock>;
async fn release_lock(&self, lock: Lock) -> Result<()>;
async fn is_locked(&self, dataset_path: &str) -> Result<bool>;
}
pub struct Lock {
lock_id: String,
holder: String, // 获得锁的客户端 ID
acquired_at: u64,
expires_at: u64,
}
// 基于文件的锁实现
pub struct FileLockHandler {
lock_dir: PathBuf,
lease_duration: Duration,
}
#[async_trait]
impl CommitHandler for FileLockHandler {
async fn acquire_lock(&self, dataset_path: &str) -> Result<Lock> {
let lock_file = self.lock_dir.join(format!("{}.lock", hash(dataset_path)));
// 尝试以独占模式创建文件
match tokio::fs::OpenOptions::new()
.create_new(true)
.write(true)
.open(&lock_file)
.await
{
Ok(file) => {
// 成功获得锁
let now = SystemTime::now()
.duration_since(UNIX_EPOCH)?
.as_secs();
let lock = Lock {
lock_id: uuid::Uuid::new_v4().to_string(),
holder: get_hostname(),
acquired_at: now,
expires_at: now + self.lease_duration.as_secs(),
};
// 写入锁信息
tokio::fs::write(&lock_file, serde_json::to_string(&lock)?).await?;
Ok(lock)
}
Err(e) if e.kind() == std::io::ErrorKind::AlreadyExists => {
// 文件已存在,说明有其他客户端持有锁
// 检查锁是否过期
if let Ok(content) = tokio::fs::read_to_string(&lock_file).await {
if let Ok(existing_lock) = serde_json::from_str::<Lock>(&content) {
let now = SystemTime::now()
.duration_since(UNIX_EPOCH)?
.as_secs();
if now > existing_lock.expires_at {
// 锁已过期,尝试删除并重新获取
tokio::fs::remove_file(&lock_file).await?;
return self.acquire_lock(dataset_path).await;
}
}
}
Err("Lock already held by another process".into())
}
Err(e) => Err(e.into()),
}
}
async fn release_lock(&self, lock: Lock) -> Result<()> {
let lock_file = self.lock_dir.join(format!("{}.lock", hash(&lock.lock_id)));
tokio::fs::remove_file(&lock_file).await?;
Ok(())
}
async fn is_locked(&self, dataset_path: &str) -> Result<bool> {
let lock_file = self.lock_dir.join(format!("{}.lock", hash(dataset_path)));
Ok(lock_file.exists())
}
}
// 基于 etcd 的分布式锁实现
pub struct EtcdLockHandler {
etcd_client: EtcdClient,
}
#[async_trait]
impl CommitHandler for EtcdLockHandler {
async fn acquire_lock(&self, dataset_path: &str) -> Result<Lock> {
let lock_key = format!("/lance/locks/{}", dataset_path);
// 使用 etcd 的原子 CAS(Compare and Swap)操作
let lock = Lock {
lock_id: uuid::Uuid::new_v4().to_string(),
holder: get_hostname(),
acquired_at: SystemTime::now()
.duration_since(UNIX_EPOCH)?
.as_secs(),
expires_at: 0, // etcd 会自动设置过期时间
};
self.etcd_client.put_if_not_exists(
&lock_key,
serde_json::to_string(&lock)?,
60, // 60 秒租约
).await?;
Ok(lock)
}
async fn release_lock(&self, lock: Lock) -> Result<()> {
let lock_key = format!("/lance/locks/{}", lock.lock_id);
self.etcd_client.delete(&lock_key).await?;
Ok(())
}
async fn is_locked(&self, dataset_path: &str) -> Result<bool> {
let lock_key = format!("/lance/locks/{}", dataset_path);
Ok(self.etcd_client.get(&lock_key).await?.is_some())
}
}
📝 提交协议
两阶段提交(2PC)
对于需要强一致性的场景:
rust
/// Coordinator for a two-phase commit across a set of participant nodes.
pub struct TwoPhaseCommit {
/// Nodes that are asked to prepare, commit, or abort each transaction.
participants: Vec<Node>,
}
impl TwoPhaseCommit {
    /// Phase 1 (prepare): asks every participant to vote on `txn_id`.
    ///
    /// Returns a single-element vector holding `VoteResult::Yes` when every
    /// participant reported `can_commit`, and `VoteResult::No` otherwise.
    ///
    /// # Errors
    /// Propagates the first error returned by a participant's `prepare`.
    ///
    /// NOTE(review): when a participant errors, nodes that already prepared
    /// are not told to abort here; presumably the caller invokes `abort` —
    /// confirm.
    pub async fn prepare(
        &self,
        txn_id: u64,
        changes: Vec<Change>,
    ) -> Result<Vec<VoteResult>> {
        let mut votes = Vec::new();
        for node in &self.participants {
            let vote = node.prepare(txn_id, &changes).await?;
            votes.push(vote);
        }

        // The transaction may proceed only if every node voted yes.
        let all_yes = votes.iter().all(|v| v.can_commit);
        Ok(if all_yes {
            vec![VoteResult::Yes]
        } else {
            vec![VoteResult::No]
        })
    }

    /// Phase 2 (commit): delivers the commit decision to every participant.
    ///
    /// Every participant is contacted even if an earlier one fails, because
    /// the decision must reach all nodes once it has been taken. The commit
    /// succeeds only when ALL participants succeed; accepting a single success
    /// would leave the nodes in divergent states.
    ///
    /// # Errors
    /// Returns the first participant error, or an error when any participant
    /// reports an unsuccessful commit.
    pub async fn commit(&self, txn_id: u64) -> Result<()> {
        let mut all_succeeded = true;
        let mut first_error = None;

        for node in &self.participants {
            match node.commit(txn_id).await {
                Ok(result) => {
                    if !result.success {
                        all_succeeded = false;
                    }
                }
                Err(e) => {
                    // Remember the first failure but keep notifying the rest.
                    if first_error.is_none() {
                        first_error = Some(e);
                    }
                }
            }
        }

        if let Some(e) = first_error {
            return Err(e.into());
        }
        if all_succeeded {
            Ok(())
        } else {
            Err("One or more participants failed to commit".into())
        }
    }

    /// Phase 2 (abort): asks every participant to roll back `txn_id`.
    ///
    /// Every participant is contacted even if an earlier one fails, so a
    /// single unreachable node cannot leave the others holding prepared state.
    ///
    /// # Errors
    /// Returns the first participant error after all nodes have been tried.
    pub async fn abort(&self, txn_id: u64) -> Result<()> {
        let mut first_error = None;

        for node in &self.participants {
            if let Err(e) = node.abort(txn_id).await {
                if first_error.is_none() {
                    first_error = Some(e);
                }
            }
        }

        match first_error {
            Some(e) => Err(e.into()),
            None => Ok(()),
        }
    }
}
📚 总结
Lance 的事务与提交协议通过以下机制实现 ACID:
- Atomicity:不可分割的版本更新
- Consistency:Schema 验证和约束检查
- Isolation:基于 MVCC 的多版本并发控制
- Durability:预写日志和 fsync
同时支持乐观并发控制和分布式锁,适应不同的使用场景。