一、分布式会话基础架构
1.1 会话管理核心挑战
python
class DistributedSessionChallenges:
"""分布式会话管理挑战"""
def __init__(self):
self.challenges = self._analyze_challenges()
def _analyze_challenges(self):
"""分析分布式会话核心挑战"""
return {
"数据一致性": {
"问题": "多副本间数据如何保持一致",
"具体表现": [
"用户会话在不同服务器间跳转",
"多个数据中心间会话同步",
"并发修改导致数据冲突"
],
"一致性级别": {
"强一致性": "所有节点立即同步,性能低",
"最终一致性": "异步同步,可能短暂不一致",
"会话一致性": "保证同一用户会话内一致性"
}
},
"容灾与高可用": {
"问题": "节点故障时如何保证会话不丢失",
"具体表现": [
"服务器宕机会话丢失",
"网络分区导致服务中断",
"数据恢复时间长"
],
"要求": {
"RTO(恢复时间目标)": "通常<5分钟",
"RPO(数据恢复点目标)": "通常=0(零数据丢失)",
"可用性": "99.99%(年停机时间<52分钟)"
}
},
"性能与扩展性": {
"问题": "如何支持海量用户会话",
"具体表现": [
"单点瓶颈",
"网络延迟影响",
"存储成本高"
],
"指标": {
"吞吐量": "QPS > 10万",
"延迟": "P99 < 50ms",
"可扩展性": "线性扩展"
}
},
"安全与合规": {
"问题": "如何保证会话安全",
"具体表现": [
"会话劫持",
"数据泄露",
"合规性要求(GDPR等)"
],
"要求": {
"加密": "传输和存储加密",
"访问控制": "严格权限管理",
"审计": "完整操作日志"
}
}
}
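上面提到的"会话一致性"在工程上常用"会话亲和"来近似实现:按 session_id 哈希选节点,让同一会话的读写总是落到同一台服务器。下面是一个仅用标准库的最小示意(pick_node、nodes 均为本文新增的示意名称,且假设节点列表固定、不考虑扩缩容时的再均衡),并非生产实现。
python
import hashlib

def pick_node(session_id, nodes):
    """按 session_id 的哈希值选节点,保证同一会话总是路由到同一节点"""
    digest = hashlib.md5(session_id.encode("utf-8")).hexdigest()
    return nodes[int(digest, 16) % len(nodes)]

nodes = ["node-a", "node-b", "node-c"]      # 示意用的节点列表
print(pick_node("session-1001", nodes))      # 同一会话多次计算,结果始终相同
print(pick_node("session-1001", nodes))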
1.2 分布式会话架构模式
python
class SessionArchitecturePatterns:
"""分布式会话架构模式"""
def __init__(self):
self.patterns = self._define_patterns()
def _define_patterns(self):
"""定义架构模式"""
return {
"集中式会话存储": {
"架构": "所有会话数据集中存储在Redis/DB集群",
"工作流程": """
1. 用户请求到达任意应用服务器
2. 服务器从集中存储读取会话
3. 处理请求,更新会话
4. 写回集中存储
""",
"优点": [
"数据一致性好",
"扩展应用服务器容易",
"故障转移简单"
],
"缺点": [
"存储成为单点瓶颈",
"网络延迟增加",
"存储故障影响所有用户"
],
"适用场景": "中小规模,强一致性要求"
},
"客户端会话存储": {
"架构": "会话数据加密后存储在客户端(Cookie/Token)",
"工作流程": """
1. 服务器生成加密的会话数据
2. 发送给客户端存储
3. 客户端后续请求携带会话数据
4. 服务器解密验证
""",
"优点": [
"无状态服务器",
"扩展性强",
"无单点故障"
],
"缺点": [
"数据大小受限(Cookie 4KB)",
"安全风险(客户端可能篡改)",
"每次请求传输数据量大"
],
"适用场景": "移动应用、微服务架构"
},
"复制式会话存储": {
"架构": "会话数据复制到多个节点",
"工作流程": """
1. 主节点处理会话更新
2. 同步复制到从节点
3. 读取可以从任意节点
4. 故障时自动切换
""",
"复制策略": {
"同步复制": "强一致性,性能低",
"异步复制": "最终一致性,性能高",
"半同步复制": "平衡一致性和性能"
},
"优点": [
"高可用性",
"读取性能好",
"容灾能力强"
],
"缺点": [
"复制延迟",
"数据冲突可能",
"存储成本高"
],
"适用场景": "大规模、高可用要求"
},
"分片式会话存储": {
"架构": "会话数据分片存储在不同节点",
"工作流程": """
1. 根据用户ID哈希选择分片
2. 请求路由到对应分片
3. 分片内可能还有副本
""",
"分片策略": {
"哈希分片": "均匀分布",
"范围分片": "便于范围查询",
"地理位置分片": "减少延迟"
},
"优点": [
"线性扩展",
"无单点瓶颈",
"性能好"
],
"缺点": [
"数据迁移复杂",
"热点问题",
"跨分片查询困难"
],
"适用场景": "超大规模系统"
},
"混合架构": {
"架构": "结合多种模式的优点",
"示例": """
1. 热数据:内存缓存(Redis)
2. 温数据:分布式KV存储(Cassandra)
3. 冷数据:对象存储(S3)
4. 备份:跨数据中心复制
""",
"策略": {
"分层存储": "根据访问频率分层",
"读写分离": "读多写少场景优化",
"多活架构": "多个数据中心同时服务"
},
"优点": "综合性能最优",
"缺点": "架构复杂"
}
}
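上面"客户端会话存储"模式的关键在于:服务端只保管签名密钥,会话内容序列化后签名下发,客户端带回时验签即可判断是否被篡改。下面是一个只用标准库 hmac/json/base64 的最小草图(SECRET、encode_session、decode_session 均为本文新增的示意名称);生产环境建议使用 JWT、itsdangerous 等成熟方案,并补充过期时间与必要的加密。
python
import base64
import hashlib
import hmac
import json

SECRET = b"demo-secret-key"  # 示意用密钥,实际应从安全配置中加载

def encode_session(data):
    """序列化会话并附加 HMAC 签名,返回可写入 Cookie 的字符串"""
    payload = base64.urlsafe_b64encode(json.dumps(data).encode("utf-8"))
    sig = hmac.new(SECRET, payload, hashlib.sha256).hexdigest()
    return payload.decode("ascii") + "." + sig

def decode_session(token):
    """验签通过则还原会话数据,否则返回 None(视为被篡改)"""
    try:
        payload_b64, sig = token.rsplit(".", 1)
    except ValueError:
        return None
    payload = payload_b64.encode("ascii")
    expected = hmac.new(SECRET, payload, hashlib.sha256).hexdigest()
    if not hmac.compare_digest(sig, expected):
        return None
    return json.loads(base64.urlsafe_b64decode(payload))

token = encode_session({"user_id": 42, "role": "member"})
print(decode_session(token))            # 正常验签,还原出会话数据
print(decode_session("x" + token))      # 被篡改后验签失败,返回 None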
二、强一致性方案实现
2.1 分布式锁与事务
python
class StrongConsistencyImplementation:
"""强一致性实现方案"""
def __init__(self):
self.solutions = self._design_solutions()
def _design_solutions(self):
"""设计强一致性方案"""
return {
"分布式锁方案": {
"实现原理": "使用分布式锁保证同一会话的并发安全",
"技术选型": {
"Redis Redlock": "基于Redis的分布式锁算法",
"ZooKeeper": "基于临时顺序节点的分布式锁",
"etcd": "基于租约的分布式锁"
},
"代码实现": self._implement_distributed_lock(),
"优化策略": {
"锁粒度优化": "细粒度锁减少竞争",
"锁超时机制": "防止死锁",
"锁续期": "长时间操作自动续期"
}
},
"分布式事务方案": {
"实现原理": "使用分布式事务保证跨节点操作一致性",
"技术选型": {
"2PC(两阶段提交)": "传统方案,存在阻塞问题",
"3PC(三阶段提交)": "改进2PC,减少阻塞",
"TCC(Try-Confirm-Cancel)": "补偿型事务",
"Saga模式": "长事务解决方案"
},
"会话事务示例": self._implement_session_transaction(),
"注意事项": {
"事务隔离级别": "根据业务选择合适级别",
"性能影响": "分布式事务性能开销大",
"回滚机制": "确保异常时正确回滚"
}
},
"Paxos/Raft共识算法": {
"实现原理": "基于共识算法保证多副本数据一致性",
"应用场景": "会话元数据管理、配置同步",
"实现示例": self._implement_consensus_algorithm(),
"对比分析": {
"Paxos": "理论完备,实现复杂",
"Raft": "易于理解,工程友好",
"ZAB": "ZooKeeper使用,顺序一致性"
}
}
}
def _implement_distributed_lock(self):
"""实现分布式锁"""
import redis
import threading
import time
import uuid
class DistributedSessionLock:
"""基于Redis的分布式会话锁"""
def __init__(self, redis_client, lock_key_prefix="session_lock:"):
self.redis = redis_client
self.lock_key_prefix = lock_key_prefix
self.local_lock = threading.RLock() # 本地锁,避免同一进程内竞争
self.lock_holders = {} # 记录锁持有者
def acquire_session_lock(self, session_id, timeout=10, expire=30):
"""
获取会话锁
Args:
session_id: 会话ID
timeout: 获取锁超时时间(秒)
expire: 锁过期时间(秒)
Returns:
bool: 是否成功获取锁
"""
lock_key = f"{self.lock_key_prefix}{session_id}"
lock_value = str(uuid.uuid4()) # 唯一标识,用于安全释放
start_time = time.time()
with self.local_lock:
while time.time() - start_time < timeout:
# 尝试获取锁
acquired = self.redis.set(
lock_key,
lock_value,
nx=True, # 仅当key不存在时设置
ex=expire # 过期时间
)
if acquired:
# 记录锁持有者
self.lock_holders[session_id] = {
'value': lock_value,
'acquire_time': time.time(),
'expire_time': time.time() + expire
}
# 启动看门狗线程自动续期
self._start_watchdog(session_id, lock_key, lock_value, expire)
return True
# 等待重试
time.sleep(0.1) # 避免CPU忙等待
return False
def release_session_lock(self, session_id):
"""释放会话锁"""
lock_key = f"{self.lock_key_prefix}{session_id}"
with self.local_lock:
if session_id in self.lock_holders:
lock_info = self.lock_holders[session_id]
# 使用Lua脚本保证原子性
lua_script = """
if redis.call("get", KEYS[1]) == ARGV[1] then
return redis.call("del", KEYS[1])
else
return 0
end
"""
result = self.redis.eval(
lua_script,
1, # 一个key
lock_key,
lock_info['value']
)
if result == 1:
# 停止看门狗
self._stop_watchdog(session_id)
del self.lock_holders[session_id]
return True
return False
def _start_watchdog(self, session_id, lock_key, lock_value, expire):
"""启动看门狗线程自动续期"""
def watchdog():
renew_interval = expire / 3 # 每过期时间1/3续期一次
while True:
time.sleep(renew_interval)
with self.local_lock:
if session_id not in self.lock_holders:
break # 锁已释放
# 续期锁
try:
lua_script = """
if redis.call("get", KEYS[1]) == ARGV[1] then
return redis.call("expire", KEYS[1], ARGV[2])
else
return 0
end
"""
renewed = self.redis.eval(
lua_script,
1,
lock_key,
lock_value,
expire
)
if renewed:
# 更新过期时间
self.lock_holders[session_id]['expire_time'] = time.time() + expire
else:
# 锁可能已被其他客户端获取
del self.lock_holders[session_id]
break
except Exception as e:
# 网络异常等,停止续期
print(f"Watchdog error for session {session_id}: {e}")
break
# 启动守护线程
import threading
thread = threading.Thread(target=watchdog, daemon=True)
thread.start()
# 记录线程
if session_id in self.lock_holders:
self.lock_holders[session_id]['watchdog_thread'] = thread
def _stop_watchdog(self, session_id):
"""停止看门狗线程"""
if session_id in self.lock_holders:
# 通过设置标志让线程自然退出
self.lock_holders[session_id]['watchdog_thread'] = None
def is_locked(self, session_id):
"""检查会话是否被锁定"""
lock_key = f"{self.lock_key_prefix}{session_id}"
return self.redis.exists(lock_key) == 1
def get_lock_ttl(self, session_id):
"""获取锁剩余时间"""
lock_key = f"{self.lock_key_prefix}{session_id}"
ttl = self.redis.ttl(lock_key)
return ttl if ttl > 0 else 0
return DistributedSessionLock
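下面是上述分布式锁的一个最小使用示意:假设本机 6379 端口有可用的 Redis,并且已经通过 _implement_distributed_lock() 拿到了 DistributedSessionLock 类;demo_session_lock 为本文新增的示意函数,只演示"加锁—操作—解锁"的基本流程。
python
import redis

def demo_session_lock(DistributedSessionLock):
    """DistributedSessionLock 为上文工厂方法返回的锁类"""
    client = redis.Redis(host="localhost", port=6379)
    lock = DistributedSessionLock(client)
    if lock.acquire_session_lock("sess-42", timeout=3, expire=10):
        try:
            # 持有锁期间可以安全地读写该会话
            print("locked, ttl =", lock.get_lock_ttl("sess-42"))
        finally:
            lock.release_session_lock("sess-42")
    else:
        print("获取锁超时")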
def _implement_session_transaction(self):
"""实现会话事务"""
class SessionTransactionManager:
"""会话事务管理器"""
def __init__(self, session_storage, transaction_logger):
self.session_storage = session_storage
self.transaction_logger = transaction_logger
self.transactions = {} # 正在进行的事务
def begin_transaction(self, session_id, transaction_id=None):
"""
开始会话事务
Args:
session_id: 会话ID
transaction_id: 事务ID,如果None则自动生成
Returns:
str: 事务ID
"""
if transaction_id is None:
import uuid
transaction_id = str(uuid.uuid4())
# 获取会话锁
if not self._acquire_session_for_transaction(session_id, transaction_id):
raise Exception(f"Failed to acquire session {session_id} for transaction")
# 初始化事务状态
self.transactions[transaction_id] = {
'session_id': session_id,
'status': 'active',
'start_time': time.time(),
'operations': [], # 记录所有操作
'rollback_operations': [] # 回滚操作
}
# 记录事务开始日志
self.transaction_logger.log_transaction_start(transaction_id, session_id)
return transaction_id
def execute_operation(self, transaction_id, operation_type, key, new_value, old_value=None):
"""
执行事务操作
Args:
transaction_id: 事务ID
operation_type: 操作类型(SET/REMOVE等)
key: 操作的key
new_value: 新值
old_value: 旧值(用于回滚)
"""
if transaction_id not in self.transactions:
raise Exception(f"Transaction {transaction_id} not found")
transaction = self.transactions[transaction_id]
# 记录操作
operation = {
'type': operation_type,
'key': key,
'new_value': new_value,
'old_value': old_value,
'timestamp': time.time()
}
transaction['operations'].append(operation)
# 生成回滚操作
rollback_op = self._create_rollback_operation(operation)
transaction['rollback_operations'].insert(0, rollback_op) # 逆序插入
# 执行实际操作(预提交)
self._execute_precommit(transaction['session_id'], operation)
def commit_transaction(self, transaction_id):
"""提交事务"""
if transaction_id not in self.transactions:
raise Exception(f"Transaction {transaction_id} not found")
transaction = self.transactions[transaction_id]
try:
# 阶段1:预提交检查
if not self._prepare_commit(transaction):
self._rollback_transaction(transaction_id)
return False
# 阶段2:正式提交
self._do_commit(transaction)
# 更新事务状态
transaction['status'] = 'committed'
transaction['commit_time'] = time.time()
# 记录提交日志
self.transaction_logger.log_transaction_commit(transaction_id)
return True
except Exception as e:
# 提交失败,回滚
self._rollback_transaction(transaction_id)
raise e
finally:
# 释放会话锁
self._release_session_for_transaction(transaction['session_id'], transaction_id)
# 清理事务记录(可以异步归档)
del self.transactions[transaction_id]
def rollback_transaction(self, transaction_id):
"""回滚事务"""
return self._rollback_transaction(transaction_id)
def _rollback_transaction(self, transaction_id):
"""内部回滚实现"""
if transaction_id not in self.transactions:
return False
transaction = self.transactions[transaction_id]
try:
# 执行回滚操作
for rollback_op in transaction['rollback_operations']:
self._execute_rollback(transaction['session_id'], rollback_op)
# 更新事务状态
transaction['status'] = 'rolled_back'
transaction['rollback_time'] = time.time()
# 记录回滚日志
self.transaction_logger.log_transaction_rollback(transaction_id)
return True
finally:
# 释放会话锁
self._release_session_for_transaction(transaction['session_id'], transaction_id)
# 清理事务记录
del self.transactions[transaction_id]
def _acquire_session_for_transaction(self, session_id, transaction_id):
"""为事务获取会话锁"""
# 实现获取分布式锁的逻辑
return True # 简化实现
def _release_session_for_transaction(self, session_id, transaction_id):
"""释放事务的会话锁"""
# 实现释放分布式锁的逻辑
pass
def _execute_precommit(self, session_id, operation):
"""执行预提交操作"""
# 在实际存储前,可以先在事务日志中记录
pass
def _create_rollback_operation(self, operation):
"""创建回滚操作"""
rollback_op = operation.copy()
if operation['type'] == 'SET':
rollback_op['type'] = 'SET' if operation['old_value'] is not None else 'REMOVE'
rollback_op['new_value'] = operation['old_value']
elif operation['type'] == 'REMOVE':
rollback_op['type'] = 'SET'
rollback_op['new_value'] = operation['old_value']  # 用删除前的旧值进行恢复
return rollback_op
def _prepare_commit(self, transaction):
"""准备提交检查"""
# 检查所有操作是否仍然有效
# 检查会话是否仍然有效
# 检查资源是否充足
return True
def _do_commit(self, transaction):
"""执行实际提交"""
# 批量执行所有操作
for operation in transaction['operations']:
self._execute_operation_commit(transaction['session_id'], operation)
def _execute_operation_commit(self, session_id, operation):
"""执行操作提交"""
if operation['type'] == 'SET':
self.session_storage.set(session_id, operation['key'], operation['new_value'])
elif operation['type'] == 'REMOVE':
self.session_storage.remove(session_id, operation['key'])
def _execute_rollback(self, session_id, rollback_op):
"""执行回滚操作"""
if rollback_op['type'] == 'SET':
if rollback_op['new_value'] is not None:
self.session_storage.set(session_id, rollback_op['key'], rollback_op['new_value'])
else:
self.session_storage.remove(session_id, rollback_op['key'])
elif rollback_op['type'] == 'REMOVE':
self.session_storage.remove(session_id, rollback_op['key'])
return SessionTransactionManager
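下面是事务管理器的最小使用示意:InMemorySessionStorage 与 PrintLogger 都是本文为演示新增的内存桩(接口按上文代码的调用方式假设),只展示 begin → execute_operation → commit 的调用顺序。
python
class InMemorySessionStorage:
    """示意用的内存会话存储桩"""
    def __init__(self):
        self.data = {}
    def set(self, session_id, key, value):
        self.data.setdefault(session_id, {})[key] = value
    def remove(self, session_id, key):
        self.data.get(session_id, {}).pop(key, None)

class PrintLogger:
    """示意用的事务日志桩,仅打印"""
    def log_transaction_start(self, tid, sid):
        print("begin", tid, sid)
    def log_transaction_commit(self, tid):
        print("commit", tid)
    def log_transaction_rollback(self, tid):
        print("rollback", tid)

def demo_session_transaction(SessionTransactionManager):
    """SessionTransactionManager 为上文 _implement_session_transaction() 返回的类"""
    storage = InMemorySessionStorage()
    manager = SessionTransactionManager(storage, PrintLogger())
    tid = manager.begin_transaction("sess-42")
    manager.execute_operation(tid, "SET", "cart", ["sku-1"], old_value=None)
    manager.execute_operation(tid, "SET", "last_page", "/checkout", old_value="/home")
    manager.commit_transaction(tid)
    print(storage.data)   # {'sess-42': {'cart': ['sku-1'], 'last_page': '/checkout'}}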
2.2 多版本并发控制(MVCC)
python
class SessionMVCC:
"""会话多版本并发控制"""
def __init__(self):
self.mvcc_system = self._design_mvcc()
def _design_mvcc(self):
"""设计MVCC系统"""
return {
"MVCC原理": {
"核心思想": "每个数据项维护多个版本,读操作读取旧版本,写操作创建新版本",
"解决痛点": "读写不阻塞,提高并发性能",
"关键概念": {
"版本号(timestamp)": "单调递增,标识版本顺序",
"可见性规则": "决定哪个版本对事务可见",
"垃圾回收": "清理不再需要的旧版本"
}
},
"会话MVCC设计": {
"数据结构": """
session_data {
session_id: string,
versions: [
{
version: int,
data: dict,
created_at: timestamp,
expired_at: timestamp,
transaction_id: string
}
],
current_version: int
}
""",
"读写规则": {
"读操作": "读取小于等于当前事务版本号的最新版本",
"写操作": "创建新版本,版本号递增",
"删除操作": "标记版本为删除,不立即物理删除"
}
},
"实现方案": self._implement_session_mvcc(),
"性能优化": {
"版本链优化": "使用跳表或B+树加速版本查找",
"批量清理": "定期异步清理过期版本",
"内存优化": "压缩旧版本数据"
}
}
def _implement_session_mvcc(self):
"""实现会话MVCC"""
import time
import threading
class SessionMVCCStorage:
"""基于MVCC的会话存储"""
def __init__(self, storage_backend):
self.storage = storage_backend
self.version_lock = threading.RLock()
self.next_version = 1 # 全局版本号生成器
self.active_transactions = {} # 活跃事务
def begin_transaction(self, transaction_id=None, read_only=False):
"""开始事务"""
if transaction_id is None:
import uuid
transaction_id = str(uuid.uuid4())
with self.version_lock:
# 分配版本号
start_version = self.next_version
if not read_only:
self.next_version += 1
# 记录事务
self.active_transactions[transaction_id] = {
'start_version': start_version,
'read_only': read_only,
'status': 'active',
'start_time': time.time()
}
return transaction_id, start_version
def read_session(self, session_id, transaction_id):
"""读取会话(MVCC版本)"""
if transaction_id not in self.active_transactions:
raise Exception(f"Transaction {transaction_id} not active")
transaction = self.active_transactions[transaction_id]
read_version = transaction['start_version']
# 从存储读取会话数据
session_data = self.storage.get(session_id)
if not session_data:
return None
# 找到对当前事务可见的版本
visible_version = self._find_visible_version(session_data, read_version)
return visible_version['data'] if visible_version else None
def write_session(self, session_id, data, transaction_id):
"""写入会话(创建新版本)"""
if transaction_id not in self.active_transactions:
raise Exception(f"Transaction {transaction_id} not active")
transaction = self.active_transactions[transaction_id]
if transaction['read_only']:
raise Exception(f"Transaction {transaction_id} is read-only")
write_version = transaction['start_version']
with self.version_lock:
# 获取当前会话数据
session_data = self.storage.get(session_id)
if not session_data:
session_data = {
'session_id': session_id,
'versions': [],
'current_version': 0
}
# 创建新版本
new_version = {
'version': write_version,
'data': data.copy(), # 深拷贝,避免后续修改影响
'created_at': time.time(),
'transaction_id': transaction_id,
'status': 'pending' # 提交前为pending状态
}
# 添加到版本列表
session_data['versions'].append(new_version)
session_data['current_version'] = write_version
# 标记旧版本为可能过期(但不立即删除)
self._mark_old_versions(session_data, write_version)
# 保存到存储
self.storage.set(session_id, session_data)
# 记录写操作
if 'writes' not in transaction:
transaction['writes'] = []
transaction['writes'].append({
'session_id': session_id,
'version': write_version
})
def commit_transaction(self, transaction_id):
"""提交事务"""
if transaction_id not in self.active_transactions:
raise Exception(f"Transaction {transaction_id} not active")
transaction = self.active_transactions[transaction_id]
try:
with self.version_lock:
# 更新所有写入版本的状态为committed
if 'writes' in transaction:
for write in transaction['writes']:
session_id = write['session_id']
version_num = write['version']
session_data = self.storage.get(session_id)
if session_data:
for version in session_data['versions']:
if version['version'] == version_num:
version['status'] = 'committed'
version['committed_at'] = time.time()
break
self.storage.set(session_id, session_data)
# 更新事务状态
transaction['status'] = 'committed'
transaction['commit_time'] = time.time()
# 触发垃圾回收检查
self._trigger_garbage_collection()
return True
finally:
# 清理事务记录(可以延迟清理)
self._cleanup_transaction(transaction_id)
def rollback_transaction(self, transaction_id):
"""回滚事务"""
if transaction_id not in self.active_transactions:
raise Exception(f"Transaction {transaction_id} not active")
transaction = self.active_transactions[transaction_id]
with self.version_lock:
# 标记所有写入版本为rolled_back
if 'writes' in transaction:
for write in transaction['writes']:
session_id = write['session_id']
version_num = write['version']
session_data = self.storage.get(session_id)
if session_data:
# 移除pending状态的版本
session_data['versions'] = [
v for v in session_data['versions']
if not (v['version'] == version_num and v['status'] == 'pending')
]
# 更新current_version
if session_data['versions']:
session_data['current_version'] = max(
v['version'] for v in session_data['versions']
if v['status'] == 'committed'
)
else:
session_data['current_version'] = 0
self.storage.set(session_id, session_data)
# 更新事务状态
transaction['status'] = 'rolled_back'
transaction['rollback_time'] = time.time()
# 清理事务记录
self._cleanup_transaction(transaction_id)
def _find_visible_version(self, session_data, read_version):
"""找到对读事务可见的版本"""
# 按版本号降序排序,找到第一个满足条件的版本
committed_versions = [
v for v in session_data['versions']
if v['status'] == 'committed' and v['version'] <= read_version
]
if not committed_versions:
return None
# 返回版本号最大的(最新的)
return max(committed_versions, key=lambda v: v['version'])
def _mark_old_versions(self, session_data, new_version):
"""标记旧版本可能过期"""
# 可以设置过期时间,或者标记为可回收
for version in session_data['versions']:
if version['version'] < new_version - 10: # 保留最近10个版本
version['expired'] = True
def _trigger_garbage_collection(self):
"""触发垃圾回收"""
# 可以异步执行,避免阻塞主流程
pass
def _cleanup_transaction(self, transaction_id):
"""清理事务记录"""
# 可以延迟清理,用于支持长事务的查询
if transaction_id in self.active_transactions:
del self.active_transactions[transaction_id]
def garbage_collect(self, session_id=None):
"""执行垃圾回收"""
if session_id:
# 清理特定会话的旧版本
session_data = self.storage.get(session_id)
if session_data:
# 保留最近N个版本,或者根据时间清理
kept_versions = self._select_versions_to_keep(session_data['versions'])
session_data['versions'] = kept_versions
self.storage.set(session_id, session_data)
else:
# 全局垃圾回收
# 注意:需要谨慎处理,避免影响性能
pass
def _select_versions_to_keep(self, versions):
"""选择要保留的版本"""
# 策略1: 保留最近N个提交的版本
committed_versions = [v for v in versions if v['status'] == 'committed']
committed_versions.sort(key=lambda v: v['version'], reverse=True)
# 保留最近10个版本
keep_count = min(10, len(committed_versions))
versions_to_keep = committed_versions[:keep_count]
# 加上所有未提交的版本(pending状态)
pending_versions = [v for v in versions if v['status'] == 'pending']
return versions_to_keep + pending_versions
return SessionMVCCStorage
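下面用一个内存字典作为 storage_backend,对上面的 MVCC 存储做最小验证:先提交一个初始版本,再开启一个读事务,随后另一个事务提交新版本;按可见性规则,读事务仍然只能看到自己开始之前已提交的版本。其中 DictStorage、demo_session_mvcc 为本文新增的示意对象。
python
class DictStorage:
    """示意用的内存存储桩"""
    def __init__(self):
        self.data = {}
    def get(self, key):
        return self.data.get(key)
    def set(self, key, value):
        self.data[key] = value

def demo_session_mvcc(SessionMVCCStorage):
    """SessionMVCCStorage 为上文 _implement_session_mvcc() 返回的类"""
    store = SessionMVCCStorage(DictStorage())
    # 事务1:写入初始版本并提交
    t1, _ = store.begin_transaction()
    store.write_session("sess-42", {"cart": ["sku-1"]}, t1)
    store.commit_transaction(t1)
    # 事务2:作为读快照,此刻只看得到版本1
    t2, _ = store.begin_transaction()
    # 事务3:提交一个新版本
    t3, _ = store.begin_transaction()
    store.write_session("sess-42", {"cart": ["sku-1", "sku-2"]}, t3)
    store.commit_transaction(t3)
    print(store.read_session("sess-42", t2))   # 仍是旧快照 {'cart': ['sku-1']}
    store.commit_transaction(t2)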
三、最终一致性方案
3.1 基于CRDT的会话同步
python
class SessionCRDT:
"""基于CRDT的最终一致性会话"""
def __init__(self):
self.crdt_design = self._design_crdt_session()
def _design_crdt_session(self):
"""设计CRDT会话系统"""
return {
"CRDT选择": {
"LWW-Register(最后写入胜出)": {
"适用场景": "简单的键值对,如用户最后访问时间",
"实现": "带时间戳的寄存器",
"冲突解决": "时间戳最新的胜出"
},
"OR-Set(观察移除集合)": {
"适用场景": "购物车、收藏夹等集合操作",
"实现": "每个元素有唯一标识和标记集合",
"冲突解决": "元素在添加集且不在移除集时存在"
},
"PN-Counter(正负计数器)": {
"适用场景": "点赞数、在线人数等计数",
"实现": "分别记录增加和减少次数",
"冲突解决": "分别合并增减次数"
},
"G-Counter(增长计数器)": {
"适用场景": "只能增加不能减少的计数",
"实现": "每个节点记录自己的增加量",
"冲突解决": "取各节点最大值"
}
},
"会话CRDT设计": self._design_session_crdt_structure(),
"同步策略": {
"状态-based同步": "传输完整状态,适合小数据量",
"操作-based同步": "传输操作日志,适合大数据量",
"混合同步": "结合两种策略,根据场景选择"
},
"冲突处理": {
"自动合并": "CRDT保证自动收敛",
"业务逻辑修复": "某些冲突需要业务逻辑介入",
"用户干预": "提示用户解决冲突"
}
}
def _design_session_crdt_structure(self):
"""设计会话CRDT结构"""
import time
import json
import hashlib
class SessionCRDTStore:
"""CRDT会话存储"""
def __init__(self, node_id):
self.node_id = node_id
self.data = {
'registers': {}, # LWW-Register
'sets': {}, # OR-Set
'counters': {} # PN-Counter/G-Counter
}
self.operations_log = [] # 操作日志
self.vector_clock = {node_id: 0} # 向量时钟
def set_register(self, key, value):
"""设置LWW寄存器"""
timestamp = self._get_logical_timestamp()
if key not in self.data['registers']:
self.data['registers'][key] = {'value': None, 'timestamp': 0, 'node': ''}
current = self.data['registers'][key]
if timestamp > current['timestamp'] or (
timestamp == current['timestamp'] and self.node_id > current['node']
):
# 更新值
old_value = current['value']
self.data['registers'][key] = {
'value': value,
'timestamp': timestamp,
'node': self.node_id
}
# 记录操作
self._log_operation('SET_REGISTER', key, value, timestamp)
return True
return False
def get_register(self, key):
"""获取寄存器值"""
if key in self.data['registers']:
return self.data['registers'][key]['value']
return None
def add_to_set(self, set_key, element):
"""向OR-Set添加元素"""
if set_key not in self.data['sets']:
self.data['sets'][set_key] = {'elements': {}, 'tombstones': {}}
# 生成元素唯一标识
element_id = self._generate_element_id(set_key, element)
timestamp = self._get_logical_timestamp()
# 添加到元素集
self.data['sets'][set_key]['elements'][element_id] = {
'element': element,
'timestamp': timestamp,
'node': self.node_id
}
# 从墓碑集中移除(如果存在)
if element_id in self.data['sets'][set_key]['tombstones']:
del self.data['sets'][set_key]['tombstones'][element_id]
# 记录操作
self._log_operation('ADD_TO_SET', set_key, element, timestamp)
return element_id
def remove_from_set(self, set_key, element_or_id):
"""从OR-Set移除元素"""
if set_key not in self.data['sets']:
return False
# 确定元素ID
if isinstance(element_or_id, str) and element_or_id in self.data['sets'][set_key]['elements']:
element_id = element_or_id
else:
# 根据元素值查找ID
element_id = self._find_element_id(set_key, element_or_id)
if not element_id:
return False
if element_id in self.data['sets'][set_key]['elements']:
# 添加到墓碑集
element_info = self.data['sets'][set_key]['elements'][element_id]
self.data['sets'][set_key]['tombstones'][element_id] = {
'timestamp': self._get_logical_timestamp(),
'node': self.node_id
}
# 记录操作
self._log_operation('REMOVE_FROM_SET', set_key, element_id, element_info['timestamp'])
return True
return False
def get_set(self, set_key):
"""获取集合当前元素"""
if set_key not in self.data['sets']:
return []
elements = []
for element_id, element_info in self.data['sets'][set_key]['elements'].items():
# 检查是否在墓碑集中
if element_id not in self.data['sets'][set_key]['tombstones']:
elements.append(element_info['element'])
return elements
def increment_counter(self, counter_key, delta=1):
"""增加计数器"""
if counter_key not in self.data['counters']:
self.data['counters'][counter_key] = {'inc': {self.node_id: 0}, 'dec': {self.node_id: 0}}
counter = self.data['counters'][counter_key]
if self.node_id not in counter['inc']:
counter['inc'][self.node_id] = 0
counter['inc'][self.node_id] += delta
# 记录操作
self._log_operation('INCREMENT_COUNTER', counter_key, delta, self._get_logical_timestamp())
return self.get_counter_value(counter_key)
def decrement_counter(self, counter_key, delta=1):
"""减少计数器"""
if counter_key not in self.data['counters']:
self.data['counters'][counter_key] = {'inc': {self.node_id: 0}, 'dec': {self.node_id: 0}}
counter = self.data['counters'][counter_key]
if self.node_id not in counter['dec']:
counter['dec'][self.node_id] = 0
counter['dec'][self.node_id] += delta
# 记录操作
self._log_operation('DECREMENT_COUNTER', counter_key, delta, self._get_logical_timestamp())
return self.get_counter_value(counter_key)
def get_counter_value(self, counter_key):
"""获取计数器值"""
if counter_key not in self.data['counters']:
return 0
counter = self.data['counters'][counter_key]
total_inc = sum(counter['inc'].values())
total_dec = sum(counter['dec'].values())
return total_inc - total_dec
def merge(self, other_state):
"""合并另一个节点的状态"""
# 合并寄存器(LWW)
for key, other_reg in other_state.get('registers', {}).items():
if key not in self.data['registers']:
self.data['registers'][key] = other_reg
else:
self_reg = self.data['registers'][key]
if (other_reg['timestamp'] > self_reg['timestamp'] or
(other_reg['timestamp'] == self_reg['timestamp'] and other_reg['node'] > self_reg['node'])):
self.data['registers'][key] = other_reg
# 合并集合(OR-Set)
for set_key, other_set in other_state.get('sets', {}).items():
if set_key not in self.data['sets']:
self.data['sets'][set_key] = other_set
else:
# 合并元素
for element_id, element_info in other_set['elements'].items():
if element_id not in self.data['sets'][set_key]['elements']:
self.data['sets'][set_key]['elements'][element_id] = element_info
else:
# 保留时间戳最新的
current_info = self.data['sets'][set_key]['elements'][element_id]
if element_info['timestamp'] > current_info['timestamp']:
self.data['sets'][set_key]['elements'][element_id] = element_info
# 合并墓碑
for element_id, tombstone_info in other_set['tombstones'].items():
if element_id not in self.data['sets'][set_key]['tombstones']:
self.data['sets'][set_key]['tombstones'][element_id] = tombstone_info
else:
current_tombstone = self.data['sets'][set_key]['tombstones'][element_id]
if tombstone_info['timestamp'] > current_tombstone['timestamp']:
self.data['sets'][set_key]['tombstones'][element_id] = tombstone_info
# 合并计数器(PN-Counter)
for counter_key, other_counter in other_state.get('counters', {}).items():
if counter_key not in self.data['counters']:
self.data['counters'][counter_key] = other_counter
else:
# 合并增加计数
for node_id, inc_value in other_counter['inc'].items():
if node_id not in self.data['counters'][counter_key]['inc']:
self.data['counters'][counter_key]['inc'][node_id] = inc_value
else:
self.data['counters'][counter_key]['inc'][node_id] = max(
self.data['counters'][counter_key]['inc'][node_id],
inc_value
)
# 合并减少计数
for node_id, dec_value in other_counter['dec'].items():
if node_id not in self.data['counters'][counter_key]['dec']:
self.data['counters'][counter_key]['dec'][node_id] = dec_value
else:
self.data['counters'][counter_key]['dec'][node_id] = max(
self.data['counters'][counter_key]['dec'][node_id],
dec_value
)
# 合并向量时钟
for node_id, timestamp in other_state.get('vector_clock', {}).items():
if node_id not in self.vector_clock:
self.vector_clock[node_id] = timestamp
else:
self.vector_clock[node_id] = max(self.vector_clock[node_id], timestamp)
# 更新本地节点时钟
self.vector_clock[self.node_id] += 1
def get_state(self):
"""获取当前状态(用于同步)"""
return {
'registers': self.data['registers'],
'sets': self.data['sets'],
'counters': self.data['counters'],
'vector_clock': self.vector_clock.copy(),
'node_id': self.node_id
}
def _get_logical_timestamp(self):
"""获取逻辑时间戳"""
# 使用向量时钟
self.vector_clock[self.node_id] += 1
return self.vector_clock[self.node_id]
def _generate_element_id(self, set_key, element):
"""生成元素唯一ID"""
# 使用哈希确保唯一性
content = f"{set_key}:{json.dumps(element, sort_keys=True)}:{self.node_id}:{self._get_logical_timestamp()}"
return hashlib.md5(content.encode()).hexdigest()
def _find_element_id(self, set_key, element):
"""根据元素值查找ID"""
if set_key not in self.data['sets']:
return None
for element_id, element_info in self.data['sets'][set_key]['elements'].items():
if element_info['element'] == element:
return element_id
return None
def _log_operation(self, op_type, key, value, timestamp):
"""记录操作日志"""
self.operations_log.append({
'type': op_type,
'key': key,
'value': value,
'timestamp': timestamp,
'node_id': self.node_id,
'vector_clock': self.vector_clock.copy()
})
# 限制日志大小
if len(self.operations_log) > 1000:
self.operations_log = self.operations_log[-500:]
return SessionCRDTStore
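下面是对 CRDT 存储收敛性的最小示意:两个节点各自独立更新同一用户的购物车(OR-Set)与浏览计数(PN-Counter),随后互相 merge 对方的状态,两边最终收敛到同一结果。demo_session_crdt 为本文新增的示意函数,节点 ID、key 均为示意取值。
python
def demo_session_crdt(SessionCRDTStore):
    """SessionCRDTStore 为上文 _design_session_crdt_structure() 返回的类"""
    node_a = SessionCRDTStore("node-a")
    node_b = SessionCRDTStore("node-b")
    # 两个节点在断连期间各自独立修改
    node_a.add_to_set("cart:42", "sku-1")
    node_b.add_to_set("cart:42", "sku-2")
    node_a.increment_counter("pv:42", 3)
    node_b.increment_counter("pv:42", 5)
    # 双向合并后,两边视图收敛一致
    node_a.merge(node_b.get_state())
    node_b.merge(node_a.get_state())
    print(sorted(node_a.get_set("cart:42")), node_a.get_counter_value("pv:42"))
    print(sorted(node_b.get_set("cart:42")), node_b.get_counter_value("pv:42"))
    # 两行输出相同:['sku-1', 'sku-2'] 8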
3.2 基于操作日志的最终一致性
python
class OperationLogConsistency:
"""基于操作日志的最终一致性"""
def __init__(self):
self.log_system = self._design_log_based_system()
def _design_log_based_system(self):
"""设计基于操作日志的系统"""
return {
"系统架构": {
"组件": {
"操作日志存储": "分布式日志系统(如Kafka, Pulsar)",
"状态机": "应用操作日志到本地状态",
"冲突检测": "检测和处理冲突操作",
"垃圾回收": "清理已应用的操作日志"
},
"数据流": """
1. 客户端提交操作到日志
2. 所有节点订阅日志
3. 节点按顺序应用操作到本地状态
4. 定期检查点加速恢复
"""
},
"操作日志设计": self._implement_operation_log(),
"冲突处理策略": {
"时间戳排序": "按时间戳顺序应用,后操作覆盖前操作",
"业务规则解决": "根据业务逻辑解决冲突",
"向量时钟": "使用向量时钟检测并发操作",
"操作转换": "将冲突操作转换为无冲突操作"
},
"性能优化": {
"批量应用": "批量应用操作减少IO",
"检查点": "定期保存状态快照",
"压缩合并": "合并多个操作为一个",
"延迟应用": "非关键操作延迟处理"
}
}
def _implement_operation_log(self):
"""实现操作日志系统"""
import time
import json
import threading
from collections import defaultdict
class SessionOperationLog:
"""会话操作日志系统"""
def __init__(self, storage_backend, node_id):
self.storage = storage_backend
self.node_id = node_id
self.log_position = 0 # 已应用位置
self.pending_operations = [] # 待处理操作
self.state_machine = SessionStateMachine()
self.lock = threading.RLock()
self.subscribers = [] # 状态变更订阅者
# 向量时钟,记录各节点最新操作
self.vector_clock = defaultdict(int)
self.vector_clock[node_id] = 0
# 恢复检查点
self._recover_from_checkpoint()
def submit_operation(self, session_id, operation_type, key, value, metadata=None):
"""提交操作到日志"""
with self.lock:
# 生成操作ID
operation_id = self._generate_operation_id()
# 更新向量时钟
self.vector_clock[self.node_id] += 1
# 构建操作记录
operation = {
'id': operation_id,
'session_id': session_id,
'type': operation_type,
'key': key,
'value': value,
'metadata': metadata or {},
'timestamp': time.time(),
'node_id': self.node_id,
'vector_clock': dict(self.vector_clock),
'applied': False
}
# 写入存储
log_position = self.storage.append_log(operation)
# 立即应用(本地优先)
self._apply_operation_local(operation)
# 异步同步到其他节点
self._async_replicate_operation(operation, log_position)
return operation_id
def apply_remote_operation(self, operation):
"""应用远程操作"""
with self.lock:
# 检查是否已应用
if self._is_operation_applied(operation['id']):
return True
# 检测冲突
conflicts = self._detect_conflicts(operation)
if conflicts:
# 解决冲突
resolved = self._resolve_conflicts(operation, conflicts)
if not resolved:
# 冲突无法自动解决,需要特殊处理
self._handle_unresolved_conflict(operation, conflicts)
return False
# 应用操作
success = self._apply_operation(operation)
if success:
# 记录已应用
self._mark_operation_applied(operation['id'])
# 更新向量时钟
self._update_vector_clock(operation['vector_clock'])
# 通知订阅者
self._notify_subscribers(operation)
return success
def get_session_state(self, session_id):
"""获取会话状态"""
return self.state_machine.get_session(session_id)
def subscribe(self, callback):
"""订阅状态变更"""
self.subscribers.append(callback)
def create_checkpoint(self):
"""创建检查点"""
with self.lock:
checkpoint = {
'state': self.state_machine.get_all_sessions(),
'log_position': self.log_position,
'vector_clock': dict(self.vector_clock),
'timestamp': time.time(),
'node_id': self.node_id
}
# 保存检查点
checkpoint_id = f"checkpoint_{int(time.time())}"
self.storage.save_checkpoint(checkpoint_id, checkpoint)
# 清理旧日志
self._compact_logs()
return checkpoint_id
def _apply_operation_local(self, operation):
"""本地立即应用操作"""
# 直接应用到状态机
self.state_machine.apply_operation(operation)
# 标记为已应用
operation['applied'] = True
self._mark_operation_applied(operation['id'])
def _apply_operation(self, operation):
"""应用操作到状态机"""
try:
self.state_machine.apply_operation(operation)
return True
except Exception as e:
print(f"Failed to apply operation {operation['id']}: {e}")
return False
def _detect_conflicts(self, operation):
"""检测冲突"""
conflicts = []
# 检查向量时钟冲突
if self._has_vector_clock_conflict(operation['vector_clock']):
conflicts.append('vector_clock_conflict')
# 检查会话状态冲突
session_state = self.get_session_state(operation['session_id'])
if session_state:
# 检查是否并发修改同一key
last_op = self._get_last_operation_for_key(
operation['session_id'],
operation['key']
)
if last_op and self._is_concurrent(last_op, operation):
conflicts.append('concurrent_modification')
return conflicts
def _resolve_conflicts(self, operation, conflicts):
"""解决冲突"""
if 'vector_clock_conflict' in conflicts:
# 使用时间戳解决
return self._resolve_by_timestamp(operation)
if 'concurrent_modification' in conflicts:
# 最后写入胜出
return self._resolve_last_write_wins(operation)
return True
def _has_vector_clock_conflict(self, remote_clock):
"""检查向量时钟冲突"""
# 若远程时钟在某些节点上大于本地、同时本地在另一些节点上大于远程,则两者互不可比,即存在并发冲突
has_greater = False
has_smaller = False
for node, ts in remote_clock.items():
local_ts = self.vector_clock.get(node, 0)
if ts > local_ts:
has_greater = True
elif ts < local_ts:
has_smaller = True
return has_greater and has_smaller
def _is_concurrent(self, op1, op2):
"""检查两个操作是否并发"""
# 使用向量时钟判断并发
clock1 = op1.get('vector_clock', {})
clock2 = op2.get('vector_clock', {})
# 如果两个时钟无法比较(互有大小),则为并发
return not self._happens_before(clock1, clock2) and \
not self._happens_before(clock2, clock1)
def _happens_before(self, clock1, clock2):
"""判断clock1是否发生在clock2之前"""
# 如果clock1在所有节点上都小于等于clock2
all_nodes = set(clock1.keys()) | set(clock2.keys())
for node in all_nodes:
ts1 = clock1.get(node, 0)
ts2 = clock2.get(node, 0)
if ts1 > ts2:
return False
return True
def _resolve_by_timestamp(self, operation):
"""按时间戳解决冲突"""
# 获取本地最后操作
local_op = self._get_last_operation_for_key(
operation['session_id'],
operation['key']
)
if local_op and local_op['timestamp'] > operation['timestamp']:
# 本地操作更新,忽略远程操作
return False
return True
def _resolve_last_write_wins(self, operation):
"""最后写入胜出"""
# 在apply_operation中已经按时间戳应用
return True
def _generate_operation_id(self):
"""生成操作ID"""
import uuid
return str(uuid.uuid4())
def _async_replicate_operation(self, operation, log_position):
"""异步复制操作到其他节点"""
# 这里可以实现为后台线程或消息队列
pass
def _is_operation_applied(self, operation_id):
"""检查操作是否已应用"""
# 可以从存储中查询
return False
def _mark_operation_applied(self, operation_id):
"""标记操作已应用"""
# 记录到存储
pass
def _update_vector_clock(self, remote_clock):
"""更新向量时钟"""
for node, ts in remote_clock.items():
if node not in self.vector_clock or ts > self.vector_clock[node]:
self.vector_clock[node] = ts
def _notify_subscribers(self, operation):
"""通知订阅者"""
for callback in self.subscribers:
try:
callback(operation)
except Exception as e:
print(f"Subscriber callback error: {e}")
def _recover_from_checkpoint(self):
"""从检查点恢复"""
try:
checkpoint = self.storage.get_latest_checkpoint()
if checkpoint:
self.state_machine.restore(checkpoint['state'])
self.log_position = checkpoint['log_position']
self.vector_clock = defaultdict(int, checkpoint['vector_clock'])
except Exception as e:
print(f"Checkpoint recovery failed: {e}")
def _compact_logs(self):
"""压缩日志"""
# 删除已应用的旧日志
pass
def _get_last_operation_for_key(self, session_id, key):
"""获取key的最后操作"""
# 从存储中查询
return None
def _handle_unresolved_conflict(self, operation, conflicts):
"""处理无法解决的冲突"""
# 记录冲突,可能需要人工介入
print(f"Unresolved conflict for operation {operation['id']}: {conflicts}")
# 可以保存到特殊队列供后续处理
self.storage.save_conflict(operation, conflicts)
class SessionStateMachine:
"""会话状态机"""
def __init__(self):
self.sessions = {} # session_id -> session_data
def apply_operation(self, operation):
"""应用操作"""
session_id = operation['session_id']
op_type = operation['type']
key = operation['key']
value = operation['value']
# 确保会话存在
if session_id not in self.sessions:
self.sessions[session_id] = {}
session_data = self.sessions[session_id]
# 根据操作类型更新
if op_type == 'SET':
session_data[key] = value
elif op_type == 'DELETE':
if key in session_data:
del session_data[key]
elif op_type == 'INCREMENT':
current = session_data.get(key, 0)
if isinstance(value, (int, float)):
session_data[key] = current + value
elif op_type == 'APPEND':
current = session_data.get(key, [])
if isinstance(current, list):
current.append(value)
session_data[key] = current
elif op_type == 'MERGE':
current = session_data.get(key, {})
if isinstance(current, dict) and isinstance(value, dict):
current.update(value)
session_data[key] = current
# 更新元数据
if 'metadata' not in session_data:
session_data['metadata'] = {}
session_data['metadata']['last_updated'] = time.time()
session_data['metadata']['last_operation'] = operation['id']
def get_session(self, session_id):
"""获取会话"""
return self.sessions.get(session_id, {}).copy()
def get_all_sessions(self):
"""获取所有会话"""
return self.sessions.copy()
def restore(self, sessions_state):
"""恢复状态"""
self.sessions = sessions_state.copy()
return SessionOperationLog
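下面对操作日志系统做一个最小的单节点示意:LogStubStorage 是本文为满足接口而新增的内存桩(append_log、get_latest_checkpoint 等方法名按上文代码的调用位置假设),演示"提交操作 → 状态机回放 → 查询会话状态"的链路。
python
class LogStubStorage:
    """示意用的日志存储桩,只在内存中追加操作日志"""
    def __init__(self):
        self.log = []
    def append_log(self, operation):
        self.log.append(operation)
        return len(self.log) - 1      # 返回日志位置
    def get_latest_checkpoint(self):
        return None                   # 没有检查点,跳过恢复
    def save_checkpoint(self, checkpoint_id, checkpoint):
        pass
    def save_conflict(self, operation, conflicts):
        pass

def demo_operation_log(SessionOperationLog):
    """SessionOperationLog 为上文 _implement_operation_log() 返回的类"""
    oplog = SessionOperationLog(LogStubStorage(), node_id="node-a")
    oplog.submit_operation("sess-42", "SET", "cart", ["sku-1"])
    oplog.submit_operation("sess-42", "INCREMENT", "pv", 1)
    oplog.submit_operation("sess-42", "APPEND", "history", "/checkout")
    state = oplog.get_session_state("sess-42")
    print(state["cart"], state["pv"], state["history"])   # ['sku-1'] 1 ['/checkout']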
四、容灾与高可用方案
4.1 多数据中心容灾
python
class MultiDCDisasterRecovery:
"""多数据中心容灾方案"""
def __init__(self):
self.dr_strategies = self._design_disaster_recovery()
def _design_disaster_recovery(self):
"""设计容灾方案"""
return {
"容灾架构模式": {
"主备模式(Active-Standby)": {
"架构": "一个数据中心活跃,其他备份",
"切换方式": "手动或自动故障转移",
"RTO": "分钟级",
"RPO": "秒级到分钟级",
"成本": "较低"
},
"双活模式(Active-Active)": {
"架构": "多个数据中心同时服务",
"流量分发": "基于地理位置或负载均衡",
"RTO": "接近0",
"RPO": "秒级",
"成本": "较高"
},
"多活模式(Multi-Active)": {
"架构": "多个数据中心,每个服务部分用户",
"数据分区": "按用户或业务分片",
"RTO": "接近0",
"RPO": "秒级",
"成本": "高"
}
},
"会话数据同步策略": self._implement_cross_dc_sync(),
"故障检测与切换": {
"健康检查": "多层次健康检查(网络、服务、数据)",
"脑裂处理": "使用第三方仲裁或多数派决策",
"优雅切换": "保证数据一致性下的平滑切换"
},
"数据恢复机制": {
"备份策略": "全量备份 + 增量备份",
"恢复演练": "定期进行灾难恢复演练",
"监控告警": "实时监控和自动告警"
}
}
def _implement_cross_dc_sync(self):
"""实现跨数据中心同步"""
import time
import threading
from queue import Queue
class CrossDCSessionSync:
"""跨数据中心会话同步"""
def __init__(self, dc_id, storage_backend, config):
self.dc_id = dc_id
self.storage = storage_backend
self.config = config
# 其他数据中心连接
self.peer_dcs = {} # dc_id -> connection
# 同步队列
self.sync_queue = Queue()
self.replication_queue = Queue()
# 状态跟踪
self.replication_lag = {} # dc_id -> lag_seconds
self.last_sync_time = {}
self.sync_workers = []
# 启动同步工作线程
self._start_sync_workers()
def write_session(self, session_id, data, sync_mode='async'):
"""写入会话数据"""
# 本地写入
local_success = self.storage.set(session_id, data)
if not local_success:
return False
# 根据同步模式处理
if sync_mode == 'sync':
# 同步复制到其他DC
return self._sync_replicate(session_id, data)
elif sync_mode == 'async':
# 异步复制
self._async_replicate(session_id, data)
return True
elif sync_mode == 'quorum':
# 多数派写入
return self._quorum_write(session_id, data)
else:
return local_success
def read_session(self, session_id, read_mode='local'):
"""读取会话数据"""
if read_mode == 'local':
# 只读本地
return self.storage.get(session_id)
elif read_mode == 'quorum':
# 从多数派读取最新数据
return self._quorum_read(session_id)
elif read_mode == 'sticky':
# 粘性读取(总是从上次写入的DC读取)
return self._sticky_read(session_id)
else:
return self.storage.get(session_id)
def _sync_replicate(self, session_id, data):
"""同步复制到其他DC"""
success_count = 1 # 本地已成功
required_count = self._get_required_replica_count()
failures = []
for dc_id, connection in self.peer_dcs.items():
try:
# 设置超时
if connection.replicate(session_id, data, timeout=2):
success_count += 1
else:
failures.append(dc_id)
except Exception as e:
failures.append(f"{dc_id}:{str(e)}")
# 如果已经达到要求,可以提前返回
if success_count >= required_count:
break
if success_count >= required_count:
# 异步处理失败的复制
if failures:
self._handle_replication_failures(session_id, data, failures)
return True
else:
# 回滚本地写入
self.storage.delete(session_id)
return False
def _async_replicate(self, session_id, data):
"""异步复制"""
# 放入复制队列
self.replication_queue.put({
'session_id': session_id,
'data': data,
'timestamp': time.time(),
'source_dc': self.dc_id
})
return True
def _quorum_write(self, session_id, data):
"""多数派写入"""
# 需要写入多数派数据中心
total_dcs = len(self.peer_dcs) + 1 # 包括自己
required = (total_dcs // 2) + 1
success_count = 0
attempts = []
# 首先尝试本地
if self.storage.set(session_id, data):
success_count += 1
attempts.append(f"{self.dc_id}:success")
# 尝试其他DC
for dc_id, connection in self.peer_dcs.items():
try:
if connection.replicate(session_id, data, timeout=3):
success_count += 1
attempts.append(f"{dc_id}:success")
else:
attempts.append(f"{dc_id}:failed")
except Exception as e:
attempts.append(f"{dc_id}:error:{str(e)}")
# 检查是否达到多数派
if success_count >= required:
break
result = success_count >= required
# 记录日志
self._log_quorum_write(session_id, result, success_count, required, attempts)
return result
def _quorum_read(self, session_id):
"""多数派读取"""
# 从多个DC读取,返回最新的数据
responses = []
# 本地读取
local_data = self.storage.get(session_id)
if local_data:
responses.append({
'dc_id': self.dc_id,
'data': local_data,
'timestamp': local_data.get('_timestamp', 0),
'version': local_data.get('_version', 0)
})
# 从其他DC读取
for dc_id, connection in self.peer_dcs.items():
try:
remote_data = connection.get_session(session_id, timeout=1)
if remote_data:
responses.append({
'dc_id': dc_id,
'data': remote_data,
'timestamp': remote_data.get('_timestamp', 0),
'version': remote_data.get('_version', 0)
})
except Exception:
# 读取失败,继续尝试其他DC
pass
if not responses:
return None
# 按版本号或时间戳选择最新的
latest_response = max(responses, key=lambda x: (x['version'], x['timestamp']))
# 如果需要,修复其他DC的旧数据
self._repair_stale_data(session_id, latest_response, responses)
return latest_response['data']
def _sticky_read(self, session_id):
"""粘性读取"""
# 查找上次写入的DC
last_write_dc = self._get_last_write_dc(session_id)
if last_write_dc == self.dc_id:
# 本地读取
return self.storage.get(session_id)
elif last_write_dc in self.peer_dcs:
# 从上次写入的DC读取
try:
return self.peer_dcs[last_write_dc].get_session(session_id)
except Exception:
# 回退到本地读取
return self.storage.get(session_id)
else:
# 默认本地读取
return self.storage.get(session_id)
def _start_sync_workers(self):
"""启动同步工作线程"""
# 复制工作线程
for i in range(self.config.get('replication_workers', 3)):
worker = threading.Thread(
target=self._replication_worker,
daemon=True
)
worker.start()
self.sync_workers.append(worker)
# 同步工作线程
for i in range(self.config.get('sync_workers', 2)):
worker = threading.Thread(
target=self._sync_worker,
daemon=True
)
worker.start()
self.sync_workers.append(worker)
# 监控线程
monitor = threading.Thread(
target=self._monitor_replication_lag,
daemon=True
)
monitor.start()
self.sync_workers.append(monitor)
def _replication_worker(self):
"""复制工作线程"""
while True:
try:
task = self.replication_queue.get(timeout=1)
session_id = task['session_id']
data = task['data']
# 复制到所有对等DC
for dc_id, connection in self.peer_dcs.items():
try:
# 设置适当的超时
success = connection.replicate(session_id, data, timeout=5)
if not success:
# 记录失败,稍后重试
self._retry_replication(dc_id, session_id, data)
except Exception as e:
print(f"Replication to {dc_id} failed: {e}")
self._retry_replication(dc_id, session_id, data)
# 标记任务完成
self.replication_queue.task_done()
except Exception as e:
# 队列超时或其他异常
pass
def _sync_worker(self):
"""同步工作线程(处理双向同步)"""
while True:
try:
# 检查对等DC是否有需要拉取的数据
for dc_id, connection in self.peer_dcs.items():
try:
# 获取自上次同步以来的变更
changes = connection.get_changes_since(
self.last_sync_time.get(dc_id, 0)
)
if changes:
# 应用变更到本地
for change in changes:
self._apply_remote_change(change)
# 更新同步时间
self.last_sync_time[dc_id] = time.time()
except Exception as e:
print(f"Sync from {dc_id} failed: {e}")
time.sleep(self.config.get('sync_interval', 5))
except Exception as e:
print(f"Sync worker error: {e}")
time.sleep(10)
def _monitor_replication_lag(self):
"""监控复制延迟"""
while True:
try:
for dc_id, connection in self.peer_dcs.items():
try:
# 获取对方的状态
status = connection.get_status()
# 计算延迟
if 'timestamp' in status:
lag = time.time() - status['timestamp']
self.replication_lag[dc_id] = lag
# 如果延迟过大,告警
if lag > self.config.get('max_allowed_lag', 30):
self._alert_high_lag(dc_id, lag)
except Exception:
# 连接失败
self.replication_lag[dc_id] = float('inf')
time.sleep(self.config.get('monitor_interval', 10))
except Exception as e:
print(f"Monitor error: {e}")
time.sleep(30)
def _get_required_replica_count(self):
"""获取需要的副本数"""
# 根据配置决定
replication_factor = self.config.get('replication_factor', 3)
return min(replication_factor, len(self.peer_dcs) + 1)
def _handle_replication_failures(self, session_id, data, failures):
"""处理复制失败"""
# 记录失败,稍后重试
for failure in failures:
self._retry_replication(failure, session_id, data)
def _retry_replication(self, dc_id, session_id, data):
"""重试复制"""
# 可以放入重试队列,指数退避重试
retry_task = {
'dc_id': dc_id,
'session_id': session_id,
'data': data,
'retry_count': 0,
'next_retry': time.time() + 5 # 5秒后重试
}
self.sync_queue.put(retry_task)
def _log_quorum_write(self, session_id, success, success_count, required, attempts):
"""记录多数派写入日志"""
log_entry = {
'session_id': session_id,
'success': success,
'success_count': success_count,
'required': required,
'attempts': attempts,
'timestamp': time.time(),
'dc_id': self.dc_id
}
# 写入日志
self.storage.append_log('quorum_write', log_entry)
def _repair_stale_data(self, session_id, latest_response, all_responses):
"""修复过时数据"""
# 如果其他DC的数据过时,异步修复
for response in all_responses:
if response['dc_id'] != latest_response['dc_id']:
if (response['version'] < latest_response['version'] or
response['timestamp'] < latest_response['timestamp'] - 60): # 超过60秒
# 异步修复
self._async_repair(
response['dc_id'],
session_id,
latest_response['data']
)
def _get_last_write_dc(self, session_id):
"""获取上次写入的DC"""
# 可以从会话元数据中获取
session_data = self.storage.get(session_id)
if session_data and '_last_write_dc' in session_data:
return session_data['_last_write_dc']
return self.dc_id # 默认返回本地DC
def _apply_remote_change(self, change):
"""应用远程变更"""
# 冲突检测和解决
current_data = self.storage.get(change['session_id'])
if current_data:
# 检查版本
current_version = current_data.get('_version', 0)
change_version = change['data'].get('_version', 0)
if change_version > current_version:
# 远程版本更新,应用变更
self.storage.set(change['session_id'], change['data'])
elif change_version == current_version:
# 版本相同,合并冲突
merged = self._merge_conflict(current_data, change['data'])
self.storage.set(change['session_id'], merged)
else:
# 本地不存在,直接应用
self.storage.set(change['session_id'], change['data'])
def _merge_conflict(self, local_data, remote_data):
"""合并冲突"""
# 简单的合并策略:远程数据优先
merged = local_data.copy()
merged.update(remote_data)
# 更新版本
merged['_version'] = max(
local_data.get('_version', 0),
remote_data.get('_version', 0)
) + 1
merged['_merge_timestamp'] = time.time()
return merged
def _async_repair(self, target_dc, session_id, data):
"""异步修复数据"""
repair_task = {
'type': 'repair',
'target_dc': target_dc,
'session_id': session_id,
'data': data,
'timestamp': time.time()
}
self.sync_queue.put(repair_task)
def _alert_high_lag(self, dc_id, lag):
"""告警高延迟"""
# 发送告警
alert = {
'type': 'high_replication_lag',
'dc_id': dc_id,
'lag_seconds': lag,
'threshold': self.config.get('max_allowed_lag', 30),
'timestamp': time.time()
}
# 可以发送到监控系统或日志
print(f"ALERT: High replication lag to {dc_id}: {lag}s")
# 触发降级策略
self._trigger_degradation_strategy(dc_id, lag)
def _trigger_degradation_strategy(self, dc_id, lag):
"""触发降级策略"""
if lag > 300: # 5分钟
# 严重延迟,可能网络分区
# 停止向该DC同步,标记为不可用
self._mark_dc_unavailable(dc_id)
elif lag > 60: # 1分钟
# 较大延迟,降低同步优先级
self._reduce_sync_priority(dc_id)
return CrossDCSessionSync
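上面 _quorum_write/_quorum_read 的核心其实只有两步:计算多数派所需票数,以及按 (version, timestamp) 选出最新副本并顺带识别需要修复的落后副本。下面用一个不依赖上文类的小片段单独演示这两个判断(quorum_size、pick_latest 为本文新增的示意函数,_version/_timestamp 字段沿用上文约定)。
python
def quorum_size(total_replicas):
    """多数派所需的成功副本数:3 副本需要 2,5 副本需要 3"""
    return total_replicas // 2 + 1

def pick_latest(responses):
    """responses 形如 [{'dc_id':..., 'version':..., 'timestamp':..., 'data':...}]"""
    return max(responses, key=lambda r: (r["version"], r["timestamp"]))

responses = [
    {"dc_id": "dc-east", "version": 7, "timestamp": 1700000100, "data": {"cart": ["sku-1"]}},
    {"dc_id": "dc-west", "version": 8, "timestamp": 1700000090, "data": {"cart": ["sku-1", "sku-2"]}},
]
latest = pick_latest(responses)
stale = [r["dc_id"] for r in responses if r["version"] < latest["version"]]
print(quorum_size(3), latest["dc_id"], stale)   # 2 dc-west ['dc-east']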
4.2 会话迁移与故障恢复
python
class SessionMigrationAndRecovery:
"""会话迁移与故障恢复"""
def __init__(self):
self.recovery_plan = self._design_recovery_plan()
def _design_recovery_plan(self):
"""设计恢复方案"""
return {
"故障检测": {
"心跳检测": "定期心跳检测节点健康状态",
"健康检查API": "提供健康检查端点",
"第三方监控": "使用外部监控服务",
"故障判定标准": {
"网络故障": "连续3次心跳失败",
"服务故障": "HTTP状态码5xx持续30秒",
"数据故障": "数据不一致或损坏"
}
},
"自动故障转移": self._implement_automatic_failover(),
"会话迁移策略": {
"热迁移": "服务不中断的迁移",
"冷迁移": "停机迁移",
"渐进式迁移": "分批迁移用户"
},
"数据恢复流程": {
"步骤1": "识别故障范围和影响",
"步骤2": "切换到备用节点/数据中心",
"步骤3": "从备份恢复数据",
"步骤4": "验证数据一致性",
"步骤5": "恢复服务并监控"
},
"恢复时间目标(RTO)优化": {
"预热备用节点": "提前加载热点数据",
"并行恢复": "同时恢复多个分片",
"增量恢复": "先恢复关键数据"
}
}
def _implement_automatic_failover(self):
"""实现自动故障转移"""
import time
import threading
import random
class AutomaticFailoverManager:
"""自动故障转移管理器"""
def __init__(self, cluster_manager, config):
self.cluster = cluster_manager
self.config = config
self.nodes = {} # node_id -> node_info
self.session_distribution = {} # session_id -> node_id
self.backup_nodes = {} # node_id -> [backup_node_ids]
self.failover_history = []
# 状态
self.failover_in_progress = False
self.health_check_thread = None
self.recovery_thread = None
# 启动健康检查
self._start_health_check()
def register_node(self, node_id, node_info):
"""注册节点"""
self.nodes[node_id] = {
'info': node_info,
'status': 'healthy',
'last_heartbeat': time.time(),
'failures': 0,
'sessions': set(),
'backup_of': [] # 作为哪些节点的备份
}
# 分配备份关系
self._assign_backup_relationships(node_id)
def register_session(self, session_id, primary_node_id):
"""注册会话"""
if primary_node_id not in self.nodes:
raise Exception(f"Node {primary_node_id} not found")
self.session_distribution[session_id] = primary_node_id
self.nodes[primary_node_id]['sessions'].add(session_id)
# 通知备份节点
backup_nodes = self.backup_nodes.get(primary_node_id, [])
for backup_id in backup_nodes:
if backup_id in self.nodes:
# 异步复制会话到备份节点
self._replicate_session_to_backup(session_id, primary_node_id, backup_id)
def _start_health_check(self):
"""启动健康检查线程"""
self.health_check_thread = threading.Thread(
target=self._health_check_loop,
daemon=True
)
self.health_check_thread.start()
def _health_check_loop(self):
"""健康检查循环"""
check_interval = self.config.get('health_check_interval', 5)
while True:
try:
self._perform_health_checks()
time.sleep(check_interval)
except Exception as e:
print(f"Health check loop error: {e}")
time.sleep(10)
def _perform_health_checks(self):
"""执行健康检查"""
current_time = time.time()
for node_id, node_data in self.nodes.items():
try:
# 检查心跳
last_heartbeat = node_data['last_heartbeat']
heartbeat_timeout = self.config.get('heartbeat_timeout', 30)
if current_time - last_heartbeat > heartbeat_timeout:
# 心跳超时
node_data['failures'] += 1
if node_data['failures'] >= self.config.get('max_failures', 3):
# 标记为故障
self._mark_node_failed(node_id)
else:
# 心跳正常,重置失败计数
if node_data['failures'] > 0:
node_data['failures'] = max(0, node_data['failures'] - 1)
# 主动健康检查(如果配置了检查URL)
if 'health_check_url' in node_data['info']:
healthy = self._check_node_health(node_data['info']['health_check_url'])
if not healthy:
node_data['failures'] += 1
if node_data['failures'] >= self.config.get('max_failures', 3):
self._mark_node_failed(node_id)
except Exception as e:
print(f"Health check for node {node_id} failed: {e}")
def _check_node_health(self, health_check_url):
"""检查节点健康状态"""
import requests
try:
response = requests.get(
health_check_url,
timeout=self.config.get('health_check_timeout', 3)
)
return response.status_code == 200
except Exception:
return False
def _mark_node_failed(self, node_id):
"""标记节点故障"""
if self.nodes[node_id]['status'] == 'failed':
return # 已经标记过了
print(f"Marking node {node_id} as failed")
# 更新状态
self.nodes[node_id]['status'] = 'failed'
self.nodes[node_id]['failed_at'] = time.time()
# 触发故障转移
self._trigger_failover(node_id)
def _trigger_failover(self, failed_node_id):
"""触发故障转移"""
if self.failover_in_progress:
print(f"Failover already in progress, skipping for node {failed_node_id}")
return
self.failover_in_progress = True
try:
print(f"Initiating failover for node {failed_node_id}")
# 获取故障节点上的所有会话
failed_sessions = list(self.nodes[failed_node_id]['sessions'])
if not failed_sessions:
print(f"No sessions on failed node {failed_node_id}")
return
# 记录故障转移开始
failover_record = {
'failed_node': failed_node_id,
'sessions_count': len(failed_sessions),
'start_time': time.time(),
'sessions_migrated': 0,
'sessions_failed': 0
}
# 为每个会话选择新的主节点
for session_id in failed_sessions:
try:
# 选择新的主节点
new_primary = self._select_new_primary(session_id, failed_node_id)
if new_primary:
# 迁移会话
success = self._migrate_session(session_id, failed_node_id, new_primary)
if success:
failover_record['sessions_migrated'] += 1
# 更新分发映射
self.session_distribution[session_id] = new_primary
self.nodes[new_primary]['sessions'].add(session_id)
# 分配新的备份节点
self._assign_backup_for_session(session_id, new_primary)
else:
failover_record['sessions_failed'] += 1
else:
failover_record['sessions_failed'] += 1
print(f"No available node for session {session_id}")
except Exception as e:
failover_record['sessions_failed'] += 1
print(f"Failed to migrate session {session_id}: {e}")
# 清理故障节点
self.nodes[failed_node_id]['sessions'].clear()
# 记录故障转移完成
failover_record['end_time'] = time.time()
failover_record['duration'] = failover_record['end_time'] - failover_record['start_time']
self.failover_history.append(failover_record)
print(f"Failover completed: {failover_record}")
# 启动恢复线程(尝试恢复故障节点)
self._start_node_recovery(failed_node_id)
finally:
self.failover_in_progress = False
def _select_new_primary(self, session_id, failed_node_id):
"""选择新的主节点"""
# 优先选择备份节点
backup_nodes = self.backup_nodes.get(failed_node_id, [])
for backup_id in backup_nodes:
if (backup_id in self.nodes and
self.nodes[backup_id]['status'] == 'healthy'):
# 检查备份节点是否有该会话的最新数据
if self._check_session_on_backup(session_id, backup_id):
return backup_id
# 如果没有合适的备份节点,选择负载最轻的健康节点
healthy_nodes = [
(node_id, node_data)
for node_id, node_data in self.nodes.items()
if node_data['status'] == 'healthy' and node_id != failed_node_id
]
if not healthy_nodes:
return None
# 选择负载最轻的节点
healthy_nodes.sort(key=lambda x: len(x[1]['sessions']))
return healthy_nodes[0][0]
def _check_session_on_backup(self, session_id, backup_id):
"""检查备份节点是否有会话数据"""
# 这里可以检查备份节点的数据新鲜度
# 简化实现:假设备份节点总是有最新数据
return True
def _migrate_session(self, session_id, old_primary, new_primary):
"""迁移会话"""
try:
# 获取会话数据
session_data = self._retrieve_session_data(session_id, old_primary)
if not session_data:
# 尝试从备份获取
session_data = self._retrieve_session_from_backup(session_id, old_primary)
if not session_data:
print(f"No session data found for {session_id}")
return False
# 写入新主节点
success = self._write_session_to_node(session_id, session_data, new_primary)
if success:
# 通知客户端会话已迁移
self._notify_client_session_moved(session_id, new_primary)
# 更新路由表
self._update_routing_table(session_id, new_primary)
return success
except Exception as e:
print(f"Session migration failed: {e}")
return False
def _retrieve_session_data(self, session_id, node_id):
"""从节点获取会话数据"""
# 这里应该调用节点的API获取会话数据
# 简化实现
return {'session_id': session_id, 'data': 'placeholder'}
def _retrieve_session_from_backup(self, session_id, primary_id):
"""从备份获取会话数据"""
backup_nodes = self.backup_nodes.get(primary_id, [])
for backup_id in backup_nodes:
if backup_id in self.nodes:
data = self._retrieve_session_data(session_id, backup_id)
if data:
return data
return None
def _write_session_to_node(self, session_id, data, node_id):
"""写入会话数据到节点"""
# 这里应该调用节点的API写入会话数据
# 简化实现
return True
def _notify_client_session_moved(self, session_id, new_primary):
"""通知客户端会话已迁移"""
# 可以通过重定向或消息通知客户端
pass
def _update_routing_table(self, session_id, new_primary):
"""更新路由表"""
# 更新负载均衡器或网关的路由配置
pass
def _assign_backup_relationships(self, node_id):
"""分配备份关系"""
# 为节点选择备份节点
all_nodes = list(self.nodes.keys())
if len(all_nodes) < 2:
return # 至少需要2个节点才能有备份
# 排除自己
candidates = [n for n in all_nodes if n != node_id]
# 随机选择备份节点(实际应该考虑地理位置、负载等因素)
backup_count = min(self.config.get('backup_count', 2), len(candidates))
selected_backups = random.sample(candidates, backup_count)
self.backup_nodes[node_id] = selected_backups
# 在备份节点上记录备份关系
for backup_id in selected_backups:
if 'backup_of' not in self.nodes[backup_id]:
self.nodes[backup_id]['backup_of'] = []
self.nodes[backup_id]['backup_of'].append(node_id)
def _assign_backup_for_session(self, session_id, primary_node_id):
"""为会话分配备份节点"""
backup_nodes = self.backup_nodes.get(primary_node_id, [])
for backup_id in backup_nodes:
if backup_id in self.nodes:
# 异步复制到备份
self._replicate_session_to_backup(session_id, primary_node_id, backup_id)
def _replicate_session_to_backup(self, session_id, primary_id, backup_id):
"""复制会话到备份节点"""
# 异步复制实现
pass
def _start_node_recovery(self, failed_node_id):
"""启动节点恢复"""
if self.recovery_thread and self.recovery_thread.is_alive():
# 已经有恢复线程在运行
return
self.recovery_thread = threading.Thread(
target=self._recover_failed_node,
args=(failed_node_id,),
daemon=True
)
self.recovery_thread.start()
def _recover_failed_node(self, failed_node_id):
"""恢复故障节点"""
print(f"Starting recovery for node {failed_node_id}")
# 等待一段时间,确保节点真的故障
time.sleep(self.config.get('recovery_delay', 60))
try:
# 检查节点是否恢复
if self._check_node_recovered(failed_node_id):
print(f"Node {failed_node_id} has recovered")
# 更新状态
self.nodes[failed_node_id]['status'] = 'recovered'
self.nodes[failed_node_id]['failures'] = 0
self.nodes[failed_node_id]['recovered_at'] = time.time()
# 重新分配会话(可以逐步迁移回来)
self._redistribute_sessions_after_recovery(failed_node_id)
else:
print(f"Node {failed_node_id} still not recovered")
# 标记为永久故障
self.nodes[failed_node_id]['status'] = 'permanently_failed'
# 触发永久移除流程
self._remove_failed_node_permanently(failed_node_id)
except Exception as e:
print(f"Node recovery failed: {e}")
def _check_node_recovered(self, node_id):
"""检查节点是否恢复"""
node_info = self.nodes[node_id]['info']
if 'health_check_url' in node_info:
return self._check_node_health(node_info['health_check_url'])
# 尝试连接节点
return self._attempt_connection(node_info)
def _attempt_connection(self, node_info):
"""尝试连接节点"""
# 简化实现
import socket
try:
host = node_info.get('host', 'localhost')
port = node_info.get('port', 80)
sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
sock.settimeout(5)
result = sock.connect_ex((host, port))
sock.close()
return result == 0
except Exception:
return False
def _redistribute_sessions_after_recovery(self, recovered_node_id):
"""恢复节点后的会话重新分配"""
# 可以逐步将会话迁移回恢复的节点
# 避免一次性迁移造成负载冲击
pass
def _remove_failed_node_permanently(self, node_id):
"""永久移除故障节点"""
print(f"Permanently removing node {node_id}")
# 从备份关系中移除
for primary_id, backups in self.backup_nodes.items():
if node_id in backups:
backups.remove(node_id)
# 从节点的备份关系记录中移除
for other_id, node_data in self.nodes.items():
if 'backup_of' in node_data and node_id in node_data['backup_of']:
node_data['backup_of'].remove(node_id)
# 重新分配备份关系
for primary_id in self.backup_nodes:
self._assign_backup_relationships(primary_id)
# 标记为已移除
self.nodes[node_id]['status'] = 'removed'
# 通知集群管理器移除该节点
self.cluster.remove_node(node_id)
return AutomaticFailoverManager
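下面是故障转移管理器的最小使用示意:ClusterStub 为本文新增的集群管理桩,节点信息均为示意取值;这里直接调用 _mark_node_failed 模拟"心跳连续超时后被判定故障"的结果,只为演示会话从 node-1 切换到 node-2 的路径,并非完整的故障演练。
python
class ClusterStub:
    """示意用的集群管理桩"""
    def remove_node(self, node_id):
        print("remove node:", node_id)

def demo_failover(AutomaticFailoverManager):
    """AutomaticFailoverManager 为上文 _implement_automatic_failover() 返回的类"""
    manager = AutomaticFailoverManager(ClusterStub(), config={})
    manager.register_node("node-1", {"host": "10.0.0.1", "port": 8080})
    manager.register_node("node-2", {"host": "10.0.0.2", "port": 8080})
    manager.register_session("sess-42", "node-1")
    # 模拟 node-1 被判定故障,触发自动故障转移
    manager._mark_node_failed("node-1")
    print(manager.session_distribution)   # {'sess-42': 'node-2'}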
五、生产级实现方案
5.1 基于Redis的分布式会话
python
class RedisBasedSessionManager:
"""基于Redis的生产级分布式会话管理器"""
def __init__(self):
self.implementation = self._implement_redis_session()
def _implement_redis_session(self):
"""实现Redis会话管理器"""
import redis
import json
import time
import hashlib
import threading
class ProductionSessionManager:
"""生产级会话管理器"""
def __init__(self, redis_pool, config=None):
self.redis_pool = redis_pool
self.config = config or {
'session_timeout': 1800, # 30分钟
'session_prefix': 'session:',
'lock_prefix': 'lock:session:',
'backup_prefix': 'backup:session:',
'max_lock_wait': 5, # 秒
'lock_expire': 30, # 秒
'replication_factor': 2,
'enable_backup': True
}
# 连接池
self._redis = None
self._backup_redis = None
# 本地缓存(减少Redis访问)
self.local_cache = {}
self.cache_lock = threading.RLock()
self.cache_ttl = 30 # 本地缓存30秒
# 记录本进程持有的锁值,释放锁时用于校验归属
self._lock_values = {}
# 统计信息
self.stats = {
'hits': 0,
'misses': 0,
'locks_acquired': 0,
'locks_failed': 0,
'backup_writes': 0,
'backup_fails': 0
}
# 启动清理线程
self._start_cleanup_thread()
def get_session(self, session_id):
"""获取会话"""
# 首先检查本地缓存
with self.cache_lock:
if session_id in self.local_cache:
entry = self.local_cache[session_id]
if time.time() - entry['timestamp'] < self.cache_ttl:
self.stats['hits'] += 1
return entry['data'].copy()
# 从Redis获取
redis_key = f"{self.config['session_prefix']}{session_id}"
try:
data = self.redis.get(redis_key)
if data:
# 解析数据
session_data = json.loads(data)
# 更新本地缓存
with self.cache_lock:
self.local_cache[session_id] = {
'data': session_data.copy(),
'timestamp': time.time()
}
# 延长过期时间(访问续期)
self.redis.expire(redis_key, self.config['session_timeout'])
self.stats['hits'] += 1
return session_data
else:
self.stats['misses'] += 1
return None
except Exception as e:
print(f"Failed to get session {session_id}: {e}")
# 尝试从备份获取
if self.config['enable_backup']:
return self._get_from_backup(session_id)
return None
def set_session(self, session_id, data, lock_required=True):
"""设置会话"""
if lock_required:
# 获取分布式锁
lock_acquired = self._acquire_session_lock(session_id)
if not lock_acquired:
self.stats['locks_failed'] += 1
return False
try:
# 准备数据
session_data = data.copy()
session_data['_updated_at'] = time.time()
session_data['_version'] = self._generate_version(session_id, data)
# 序列化
serialized = json.dumps(session_data)
# Redis键
redis_key = f"{self.config['session_prefix']}{session_id}"
# 写入Redis
pipeline = self.redis.pipeline()
pipeline.set(redis_key, serialized)
pipeline.expire(redis_key, self.config['session_timeout'])
pipeline.execute()
# 更新本地缓存
with self.cache_lock:
self.local_cache[session_id] = {
'data': session_data.copy(),
'timestamp': time.time()
}
# 写入备份
if self.config['enable_backup']:
success = self._write_to_backup(session_id, session_data)
if success:
self.stats['backup_writes'] += 1
else:
self.stats['backup_fails'] += 1
return True
finally:
if lock_required:
# 释放锁
self._release_session_lock(session_id)
def delete_session(self, session_id):
"""删除会话"""
# 获取锁
lock_acquired = self._acquire_session_lock(session_id)
if not lock_acquired:
return False
try:
# Redis键
redis_key = f"{self.config['session_prefix']}{session_id}"
# 删除
result = self.redis.delete(redis_key)
# 清理本地缓存
with self.cache_lock:
if session_id in self.local_cache:
del self.local_cache[session_id]
# 删除备份
if self.config['enable_backup']:
self._delete_from_backup(session_id)
return result > 0
finally:
self._release_session_lock(session_id)
def update_session_field(self, session_id, field, value, lock_required=True):
"""更新会话字段"""
if lock_required:
lock_acquired = self._acquire_session_lock(session_id)
if not lock_acquired:
return False
try:
# 获取当前会话
current = self.get_session(session_id)
if not current:
return False
# 更新字段
current[field] = value
current['_updated_at'] = time.time()
current['_version'] = self._generate_version(session_id, current)
# 保存
return self.set_session(session_id, current, lock_required=False)
finally:
if lock_required:
self._release_session_lock(session_id)
def increment_session_field(self, session_id, field, amount=1):
"""递增会话字段"""
# 使用Redis原子操作,避免锁
redis_key = f"{self.config['session_prefix']}{session_id}"
# Lua脚本保证原子性
lua_script = """
local key = KEYS[1]
local field = ARGV[1]
local amount = tonumber(ARGV[2])
local timeout = tonumber(ARGV[3])
-- 获取当前会话
local data = redis.call('GET', key)
if not data then
-- 会话不存在,创建新会话
local new_data = {[field] = amount, _created_at = tonumber(ARGV[4]), _version = 1}
data = cjson.encode(new_data)
redis.call('SET', key, data)
redis.call('EXPIRE', key, timeout)
return {amount, 1}
end
-- 解析和更新
local session = cjson.decode(data)
local current = session[field] or 0
local new_value = current + amount
session[field] = new_value
session['_updated_at'] = tonumber(ARGV[4])
session['_version'] = (session['_version'] or 0) + 1
-- 保存
local new_data = cjson.encode(session)
redis.call('SET', key, new_data)
redis.call('EXPIRE', key, timeout)
return {new_value, session['_version']}
"""
try:
result = self.redis.eval(
lua_script,
1, # 一个key
redis_key,
field,
amount,
self.config['session_timeout'],
time.time()
)
# 清理本地缓存
with self.cache_lock:
if session_id in self.local_cache:
del self.local_cache[session_id]
return result[0] if result else None
except Exception as e:
print(f"Increment failed: {e}")
return None
def get_sessions_by_pattern(self, pattern, limit=100):
"""模式匹配查找会话"""
try:
# 使用SCAN避免阻塞
match_pattern = f"{self.config['session_prefix']}{pattern}"
cursor = 0
sessions = []
while len(sessions) < limit:
cursor, keys = self.redis.scan(
cursor=cursor,
match=match_pattern,
count=50
)
# 注意:SCAN可能返回空批次,只有cursor回到0才表示遍历结束,不能因空批次提前退出
# 批量获取
if keys:
values = self.redis.mget(keys)
for key, value in zip(keys, values):
if value:
session_id = key.decode().replace(self.config['session_prefix'], '')
session_data = json.loads(value)
sessions.append((session_id, session_data))
if cursor == 0:
break
return sessions[:limit]
except Exception as e:
print(f"Pattern search failed: {e}")
return []
def _acquire_session_lock(self, session_id):
"""获取会话锁"""
lock_key = f"{self.config['lock_prefix']}{session_id}"
# 锁值需要唯一标识当前持有者,释放时据此校验,防止误删他人持有的锁
lock_value = f"{time.time()}:{threading.get_ident()}"
start_time = time.time()
while time.time() - start_time < self.config['max_lock_wait']:
# 尝试获取锁
acquired = self.redis.set(
lock_key,
lock_value,
nx=True, # 仅当不存在时设置
ex=self.config['lock_expire']
)
if acquired:
# 记录自己的锁值,供释放时校验
self._lock_values[session_id] = lock_value
self.stats['locks_acquired'] += 1
return True
# 等待重试
time.sleep(0.1)
return False
def _release_session_lock(self, session_id):
"""释放会话锁"""
lock_key = f"{self.config['lock_prefix']}{session_id}"
# 使用Lua脚本比对锁值后再删除,保证只释放自己持有的锁
lua_script = """
if redis.call("GET", KEYS[1]) == ARGV[1] then
return redis.call("DEL", KEYS[1])
else
return 0
end
"""
try:
# 必须使用获取锁时记录的锁值,而不是读取当前锁值(否则比对形同虚设)
lock_value = self._lock_values.pop(session_id, None)
if lock_value:
self.redis.eval(lua_script, 1, lock_key, lock_value)
except Exception:
pass # 锁释放失败也没关系,会超时自动释放
def _generate_version(self, session_id, data):
"""生成版本号"""
# 基于内容和时间生成版本
content = json.dumps(data, sort_keys=True)
version_str = f"{session_id}:{content}:{time.time()}"
return hashlib.md5(version_str.encode()).hexdigest()[:8]
def _write_to_backup(self, session_id, data):
"""写入备份"""
try:
if not self._backup_redis:
self._init_backup_redis()
backup_key = f"{self.config['backup_prefix']}{session_id}"
serialized = json.dumps(data)
self._backup_redis.set(backup_key, serialized)
self._backup_redis.expire(backup_key, self.config['session_timeout'] * 2) # 备份时间更长
return True
except Exception as e:
print(f"Backup write failed: {e}")
return False
def _get_from_backup(self, session_id):
"""从备份获取"""
try:
if not self._backup_redis:
self._init_backup_redis()
backup_key = f"{self.config['backup_prefix']}{session_id}"
data = self._backup_redis.get(backup_key)
if data:
session_data = json.loads(data)
# 恢复主存储
self.set_session(session_id, session_data, lock_required=False)
return session_data
return None
except Exception as e:
print(f"Backup read failed: {e}")
return None
def _delete_from_backup(self, session_id):
"""删除备份"""
try:
if not self._backup_redis:
self._init_backup_redis()
backup_key = f"{self.config['backup_prefix']}{session_id}"
self._backup_redis.delete(backup_key)
except Exception:
pass
def _init_backup_redis(self):
"""初始化备份Redis"""
# 这里应该从配置读取备份Redis连接信息
# 简化实现
self._backup_redis = redis.Redis(
host='localhost',
port=6380, # 备份Redis端口
db=0,
decode_responses=True
)
def _start_cleanup_thread(self):
"""启动清理线程"""
def cleanup_loop():
while True:
try:
self._cleanup_expired_sessions()
self._cleanup_local_cache()
time.sleep(300) # 5分钟清理一次
except Exception as e:
print(f"Cleanup error: {e}")
time.sleep(60)
thread = threading.Thread(target=cleanup_loop, daemon=True)
thread.start()
def _cleanup_expired_sessions(self):
"""清理过期会话"""
# 可以使用Redis的过期策略,这里主要清理本地缓存
pass
def _cleanup_local_cache(self):
"""清理本地缓存"""
with self.cache_lock:
current_time = time.time()
expired_keys = [
key for key, entry in self.local_cache.items()
if current_time - entry['timestamp'] > self.cache_ttl
]
for key in expired_keys:
del self.local_cache[key]
@property
def redis(self):
"""获取Redis连接"""
if not self._redis:
self._redis = redis.Redis(connection_pool=self.redis_pool)
return self._redis
def get_stats(self):
"""获取统计信息"""
return self.stats.copy()
return ProductionSessionManager
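下面是一个假设性的使用示意(Redis地址、端口、会话字段均为示例输入,并非固定配置),展示如何取得上面的 ProductionSessionManager 并进行会话读写:
python
import redis

# 示例连接池,生产中应从配置中心读取地址(此处为假设的本地实例)
pool = redis.ConnectionPool(host='localhost', port=6379, db=0)

SessionManager = RedisBasedSessionManager().implementation  # 取得内部的 ProductionSessionManager 类
manager = SessionManager(pool)  # 使用默认配置:30分钟超时、开启备份
# 注意:默认配置会尝试写入 localhost:6380 的备份Redis,不可用时仅记录备份失败统计

# 写入并读取会话
manager.set_session('sess-1001', {'user_id': 42, 'cart_items': 3})
print(manager.get_session('sess-1001'))

# 原子递增访问计数,内部用Lua脚本保证原子性,无需显式加锁
manager.increment_session_field('sess-1001', 'page_views', amount=1)

# 查看命中率、加锁、备份等统计信息
print(manager.get_stats())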
六、监控与运维
6.1 监控指标体系
python
class SessionMonitoring:
"""会话监控指标体系"""
def __init__(self):
self.metrics = self._define_metrics()
def _define_metrics(self):
"""定义监控指标"""
return {
"可用性指标": {
"会话成功率": {
"公式": "成功会话操作数 / 总会话操作数",
"目标": "> 99.9%",
"告警阈值": "< 99%",
"监控频率": "每分钟"
},
"会话丢失率": {
"公式": "丢失会话数 / 活跃会话数",
"目标": "< 0.1%",
"告警阈值": "> 1%",
"监控频率": "每5分钟"
}
},
"性能指标": {
"会话读取延迟": {
"P50": "< 10ms",
"P95": "< 50ms",
"P99": "< 100ms",
"监控方法": "从客户端打点或中间件监控"
},
"会话写入延迟": {
"P50": "< 20ms",
"P95": "< 100ms",
"P99": "< 200ms"
},
"并发处理能力": {
"QPS": "根据业务规模设定",
"最大连接数": "监控连接池使用率",
"线程池使用率": "< 80%"
}
},
"容量指标": {
"会话存储使用率": {
"内存使用率": "< 70%",
"磁盘使用率": "< 80%",
"监控频率": "每分钟"
},
"会话数量趋势": {
"活跃会话数": "监控增长趋势",
"会话创建速率": "监控异常波动",
"会话过期速率": "验证清理机制"
},
"数据大小": {
"平均会话大小": "监控异常大会话",
"总数据大小": "预测容量需求"
}
},
"一致性指标": {
"复制延迟": {
"主从延迟": "< 1秒",
"跨数据中心延迟": "< 5秒",
"监控方法": "时间戳比对"
},
"数据一致性检查": {
"定期校验": "每日全量校验",
"实时抽样": "实时抽样检查",
"不一致告警": "立即告警"
}
},
"业务指标": {
"用户活跃度": "通过会话分析用户行为",
"会话生命周期": "分析会话创建到销毁的时长",
"异常模式检测": "检测异常会话模式"
}
}
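指标定义之后还需要落地采集。下面是一个假设性的最小示意(非既定方案),在进程内用滑动窗口统计会话操作成功率与读取延迟的P95/P99;生产环境通常会把这类打点交给Prometheus等监控系统:
python
import time
import threading
from collections import deque

class SessionMetricsCollector:
    """滑动窗口内统计会话操作成功率与延迟分位数(示意实现)"""

    def __init__(self, window_seconds=60):
        self.window = window_seconds
        self.samples = deque()  # 元素为 (时间戳, 是否成功, 延迟毫秒)
        self.lock = threading.Lock()

    def record(self, success, latency_ms):
        """在每次会话读写后打点"""
        now = time.time()
        with self.lock:
            self.samples.append((now, success, latency_ms))
            # 淘汰窗口外的旧样本
            while self.samples and now - self.samples[0][0] > self.window:
                self.samples.popleft()

    def snapshot(self):
        """输出当前窗口内的成功率与P95/P99延迟"""
        with self.lock:
            if not self.samples:
                return {'success_rate': None, 'p95_ms': None, 'p99_ms': None}
            total = len(self.samples)
            ok = sum(1 for _, s, _ in self.samples if s)
            latencies = sorted(l for _, _, l in self.samples)
            p95 = latencies[min(total - 1, int(total * 0.95))]
            p99 = latencies[min(total - 1, int(total * 0.99))]
            return {'success_rate': ok / total, 'p95_ms': p95, 'p99_ms': p99}

# 使用方式:在 get_session / set_session 调用处记录耗时与结果,
# 定时读取 snapshot(),当成功率低于告警阈值(如99%)时触发告警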
6.2 故障排查与优化
python
class SessionTroubleshooting:
"""会话故障排查与优化"""
def __init__(self):
self.troubleshooting_guide = self._create_guide()
def _create_guide(self):
"""创建故障排查指南"""
return {
"常见问题与解决方案": {
"问题1: 会话丢失": {
"可能原因": [
"Redis重启或故障",
"会话超时设置过短",
"内存不足导致数据被清理",
"网络分区"
],
"排查步骤": [
"1. 检查Redis运行状态和日志",
"2. 检查会话超时配置",
"3. 检查内存使用情况",
"4. 检查网络连接"
],
"解决方案": [
"增加Redis持久化频率",
"调整会话超时时间",
"增加内存或使用LRU策略",
"配置多副本和高可用"
]
},
"问题2: 会话不一致": {
"可能原因": [
"主从复制延迟",
"并发写入冲突",
"脑裂问题",
"数据损坏"
],
"排查步骤": [
"1. 检查复制延迟监控",
"2. 分析并发写入日志",
"3. 检查集群状态",
"4. 验证数据完整性"
],
"解决方案": [
"使用更强的一致性级别",
"实现乐观锁或分布式锁",
"配置仲裁机制防止脑裂",
"定期数据校验和修复"
]
},
"问题3: 性能下降": {
"可能原因": [
"热点数据访问",
"Redis内存碎片",
"网络带宽限制",
"锁竞争"
],
"排查步骤": [
"1. 分析访问模式找出热点",
"2. 检查Redis内存碎片率",
"3. 监控网络带宽使用",
"4. 分析锁等待时间"
],
"解决方案": [
"数据分片或缓存本地",
"重启Redis或使用内存整理",
"升级网络或压缩数据",
"优化锁粒度或使用无锁结构"
]
}
},
"性能优化建议": {
"存储优化": [
"使用合适的数据结构(Hash vs String)",
"启用压缩(特别是大会话)",
"合理设置过期时间",
"使用Pipeline减少网络往返"
],
"架构优化": [
"实现多级缓存(本地+Redis)",
"读写分离",
"数据分片",
"边缘缓存"
],
"代码优化": [
"批量操作代替循环操作",
"异步写入非关键数据",
"合理使用连接池",
"避免大Key和Big Value"
]
},
"容量规划": {
"计算方法": {
"总内存需求": "活跃会话数 × 平均会话大小 × 安全系数(1.5)",
"带宽需求": "QPS × 平均数据大小 × 副本数",
"存储需求": "总会话数 × 平均会话大小 × 保留天数"
},
"扩容策略": [
"垂直扩容:升级单机配置",
"水平扩容:增加节点数量",
"分片扩容:重新分片分布数据"
],
"降级策略": [
"优先保证核心用户会话",
"减少非关键数据同步",
"临时延长会话超时",
"启用只读模式"
]
}
}
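按照上面"容量规划"给出的计算方法,可以做一个简单的估算示意(输入数字均为假设示例,实际应代入自己业务的统计值):
python
# 套用"总内存需求 = 活跃会话数 × 平均会话大小 × 安全系数(1.5)"等公式,输入均为假设示例
active_sessions = 2_000_000          # 活跃会话数
avg_session_bytes = 2 * 1024         # 平均会话大小:2KB
safety_factor = 1.5                  # 安全系数
qps = 50_000                         # 会话操作QPS
replicas = 2                         # 副本数
retention_days = 7                   # 保留天数
total_sessions_per_day = 5_000_000   # 每日总会话数

memory_bytes = active_sessions * avg_session_bytes * safety_factor
bandwidth_bytes_per_sec = qps * avg_session_bytes * replicas
storage_bytes = total_sessions_per_day * avg_session_bytes * retention_days

print(f"内存需求约 {memory_bytes / 1024**3:.1f} GB")              # 约 5.7 GB
print(f"带宽需求约 {bandwidth_bytes_per_sec / 1024**2:.1f} MB/s")  # 约 195 MB/s
print(f"存储需求约 {storage_bytes / 1024**3:.1f} GB")              # 约 66.8 GB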
这个完整的分布式会话一致性与容灾方案涵盖了从基础架构到生产实现的各个方面,特别适合构建高可用的分布式系统。关键要点:
- 一致性级别选择:根据业务需求选择合适的一致性级别
- 容灾架构设计:多数据中心、自动故障转移、数据备份
- 性能优化:缓存、分片、批量操作、异步处理
- 监控运维:全面的监控指标和故障排查指南
- 安全考虑:加密、访问控制、审计日志
实际应用中需要根据具体业务场景和规模进行适当调整和优化。