The Context-Management Revolution: How MCP's Dynamic Chunking and Hierarchical Compression Break Through the 1M-Token Bottleneck
1. MCP Core Architecture Design
1.1 Dynamic Chunking Engine Implementation
```python
import re
from typing import Dict, List

import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans


class DynamicChunker:
    def __init__(self, model_name='paraphrase-multilingual-MiniLM-L12-v2'):
        self.embedder = SentenceTransformer(model_name)
        self.min_chunk_size = 128    # tokens
        self.max_chunk_size = 1024   # tokens
        self.semantic_threshold = 0.82
        self.cache = {}              # cache of previously computed sentence embeddings

    def chunk_text(self, text: str, tokenizer) -> List[Dict]:
        """Main entry point for dynamic chunking."""
        # Preprocess the text
        cleaned_text = self._preprocess_text(text)
        # Initial sentence split
        sentences = self._split_into_sentences(cleaned_text)
        if not sentences:
            return []
        # Compute sentence embeddings (with caching)
        sentence_embeddings = []
        for sent in sentences:
            if sent in self.cache:
                sentence_embeddings.append(self.cache[sent])
            else:
                emb = self.embedder.encode(sent)
                self.cache[sent] = emb
                sentence_embeddings.append(emb)
        sentence_embeddings = np.array(sentence_embeddings)
        # Adaptive chunking strategy
        if len(sentences) > 50:   # large texts: cluster-based chunking
            chunks = self._cluster_chunking(sentences, sentence_embeddings)
        else:                     # small texts: sliding-window chunking
            chunks = self._sliding_window_chunking(sentences, sentence_embeddings, tokenizer)
        # Post-processing and validation
        final_chunks = []
        for chunk in chunks:
            chunk_text = ' '.join(chunk['sentences'])
            token_count = len(tokenizer.tokenize(chunk_text))
            if token_count > self.max_chunk_size and len(chunk['sentences']) > 1:
                # Recursively split oversized chunks (a single oversized sentence
                # is kept as-is to avoid infinite recursion)
                sub_chunks = self.chunk_text(chunk_text, tokenizer)
                final_chunks.extend(sub_chunks)
            else:
                final_chunks.append({
                    'text': chunk_text,
                    'embeddings': chunk['embedding'],
                    'tokens': token_count,
                    'type': 'text'
                })
        return final_chunks

    def _preprocess_text(self, text: str) -> str:
        """Text preprocessing."""
        # Collapse extra whitespace and strip stray characters
        text = re.sub(r'\s+', ' ', text).strip()
        # Additional preprocessing rules...
        return text

    def _split_into_sentences(self, text: str) -> List[str]:
        """Improved sentence splitting."""
        sentences = [s.strip() for s in re.split(r'(?<=[.!?])\s+', text) if s.strip()]
        return sentences

    def _sliding_window_chunking(self, sentences: List[str],
                                 embeddings: np.ndarray,
                                 tokenizer) -> List[Dict]:
        """Sliding-window chunking algorithm."""
        chunks = []
        current_chunk = []
        current_embeddings = []
        current_size = 0
        for sent, emb in zip(sentences, embeddings):
            sent_size = len(tokenizer.tokenize(sent))
            if not current_chunk:
                # Start a new chunk
                current_chunk.append(sent)
                current_embeddings.append(emb)
                current_size += sent_size
                continue
            # Cosine similarity between the sentence and the current chunk centroid
            avg_embedding = np.mean(current_embeddings, axis=0)
            similarity = np.dot(emb, avg_embedding) / (
                np.linalg.norm(emb) * np.linalg.norm(avg_embedding))
            # Decide whether to extend the current chunk
            if (similarity > self.semantic_threshold and
                    current_size + sent_size <= self.max_chunk_size):
                current_chunk.append(sent)
                current_embeddings.append(emb)
                current_size += sent_size
            else:
                # Save the current chunk
                chunks.append({
                    'sentences': current_chunk.copy(),
                    'embedding': avg_embedding
                })
                # Start a new chunk
                current_chunk = [sent]
                current_embeddings = [emb]
                current_size = sent_size
        # Append the final chunk
        if current_chunk:
            chunks.append({
                'sentences': current_chunk,
                'embedding': np.mean(current_embeddings, axis=0)
            })
        return chunks

    def _cluster_chunking(self, sentences: List[str],
                          embeddings: np.ndarray) -> List[Dict]:
        """Cluster-based chunking strategy."""
        # Determine the number of clusters dynamically
        n_clusters = max(2, min(10, len(sentences) // 20))
        kmeans = KMeans(n_clusters=n_clusters, random_state=42)
        clusters = kmeans.fit_predict(embeddings)
        # Build chunks from the clusters
        chunks = []
        for cluster_id in range(n_clusters):
            cluster_indices = np.where(clusters == cluster_id)[0]
            cluster_sents = [sentences[i] for i in cluster_indices]
            cluster_embs = embeddings[cluster_indices]
            chunks.append({
                'sentences': cluster_sents,
                'embedding': np.mean(cluster_embs, axis=0)
            })
        return chunks
```
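For reference, here is a minimal usage sketch of the chunker. It assumes a Hugging Face tokenizer supplies the token counts; `bert-base-uncased` is an arbitrary stand-in, and any tokenizer exposing a `tokenize` method would do.

```python
from transformers import AutoTokenizer

# Minimal usage sketch (assumes the DynamicChunker class above is in scope
# and that bert-base-uncased is an acceptable stand-in tokenizer).
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
chunker = DynamicChunker()

document = (
    "MCP splits long documents into semantically coherent chunks. "
    "Each chunk is embedded once and cached. "
    "Oversized chunks are split again before they are stored."
)
chunks = chunker.chunk_text(document, tokenizer)
for chunk in chunks:
    print(chunk['tokens'], chunk['text'][:60])
```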
1.2 Hierarchical Compression System Implementation
```python
import re
import zlib
from typing import List, Union

import msgpack
import torch
from transformers import AutoModel, AutoTokenizer


class HierarchicalCompressor:
    COMPRESSION_LEVELS = {
        'raw': 0,
        'light': 1,
        'medium': 2,
        'aggressive': 3,
        'extreme': 4
    }

    def __init__(self, model_name='bert-base-uncased'):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)
        self.compression_algos = {
            0: self._raw_compress,
            1: self._light_compress,
            2: self._medium_compress,
            3: self._aggressive_compress,
            4: self._extreme_compress
        }
        self.decompression_algos = {
            0: self._raw_decompress,
            1: self._light_decompress,
            2: self._medium_decompress,
            3: self._aggressive_decompress,
            4: self._extreme_decompress
        }

    def compress(self, text: str, level: Union[str, int] = 'medium') -> bytes:
        """Entry point for hierarchical compression."""
        level = self._resolve_level(level)
        return self.compression_algos[level](text)

    def decompress(self, data: bytes, level: Union[str, int]) -> str:
        """Entry point for hierarchical decompression."""
        level = self._resolve_level(level)
        return self.decompression_algos[level](data)

    def _resolve_level(self, level: Union[str, int]) -> int:
        """Resolve a compression level given as a name or an integer."""
        if isinstance(level, str):
            return self.COMPRESSION_LEVELS[level.lower()]
        return level

    def _raw_compress(self, text: str) -> bytes:
        """Raw storage (no compression)."""
        return text.encode('utf-8')

    def _light_compress(self, text: str) -> bytes:
        """Light compression."""
        return zlib.compress(text.encode('utf-8'), level=1)

    def _medium_compress(self, text: str) -> bytes:
        """Medium compression."""
        # Extract key information
        inputs = self.tokenizer(text, return_tensors='pt', truncation=True, max_length=512)
        with torch.no_grad():
            outputs = self.model(**inputs)
        # Semantic representation taken from the [CLS] token
        cls_embedding = outputs.last_hidden_state[:, 0, :].numpy()
        # Extract entities and keywords
        entities = self._extract_entities(text)
        keywords = self._extract_keywords(text)
        # Efficient serialization with msgpack
        return msgpack.packb({
            'embedding': cls_embedding.tolist(),
            'entities': entities,
            'keywords': keywords,
            'length': len(text),
            'type': 'medium'
        })

    def _aggressive_compress(self, text: str) -> bytes:
        """Aggressive compression."""
        # Keep only a summary and the key entities
        summary = self._generate_summary(text, ratio=0.3)
        entities = self._extract_entities(text)
        return msgpack.packb({
            'summary': summary,
            'entities': entities,
            'length': len(text),
            'type': 'aggressive'
        })

    def _extreme_compress(self, text: str) -> bytes:
        """Extreme compression."""
        # Keep only the most essential information
        extreme_summary = self._generate_summary(text, ratio=0.1)
        return extreme_summary.encode('utf-8')

    # Corresponding decompression methods
    def _raw_decompress(self, data: bytes) -> str:
        return data.decode('utf-8')

    def _light_decompress(self, data: bytes) -> str:
        return zlib.decompress(data).decode('utf-8')

    def _medium_decompress(self, data: bytes) -> str:
        obj = msgpack.unpackb(data)
        # Embedding-based text reconstruction could be added here
        return f"[MEDIUM] Keywords: {', '.join(obj['keywords'])} | Entities: {', '.join(obj['entities'])}"

    def _aggressive_decompress(self, data: bytes) -> str:
        obj = msgpack.unpackb(data)
        return f"[AGGRESSIVE] Summary: {obj['summary']}"

    def _extreme_decompress(self, data: bytes) -> str:
        return f"[EXTREME] {data.decode('utf-8')}"

    def _extract_entities(self, text: str) -> List[str]:
        """Entity extraction (simplified)."""
        # A real implementation should use an NER model
        return list(set(re.findall(r'\b[A-Z][a-z]+\b', text)))[:5]

    def _extract_keywords(self, text: str) -> List[str]:
        """Keyword extraction."""
        words = re.findall(r'\b\w{4,}\b', text.lower())
        word_counts = {}
        for word in words:
            if word not in {'that', 'this', 'with', 'which'}:
                word_counts[word] = word_counts.get(word, 0) + 1
        return sorted(word_counts, key=word_counts.get, reverse=True)[:10]

    def _generate_summary(self, text: str, ratio=0.3) -> str:
        """Summary generation (simplified)."""
        sentences = re.split(r'(?<=[.!?])\s+', text)
        return ' '.join(sentences[:max(1, int(len(sentences) * ratio))])
```
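To illustrate the trade-off between the five levels, the sketch below compresses the same passage at every level and prints the payload sizes. This is only an illustrative sketch with made-up sample text; note that the medium level runs a BERT forward pass, so the first call is comparatively slow, and the lossy levels reconstruct a surrogate rather than the original text.

```python
# Illustrative sketch: compare payload sizes across the five levels.
compressor = HierarchicalCompressor()
passage = "Alice met Bob in Paris. They discussed the MCP protocol design. " * 8

for name, level in HierarchicalCompressor.COMPRESSION_LEVELS.items():
    payload = compressor.compress(passage, level)
    print(f"{name:>10}: {len(payload):5d} bytes")

# Lossy levels return a surrogate, not the original text:
print(compressor.decompress(compressor.compress(passage, 'aggressive'), 'aggressive'))
```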
2. MCP Memory Management System
2.1 Smart Memory Unit Design
```python
import hashlib
import time
from dataclasses import dataclass
from typing import Optional


@dataclass
class MemoryUnit:
    content_hash: str
    compressed_data: bytes
    compression_level: int
    metadata: dict
    last_accessed: Optional[float] = None
    access_count: int = 0

    def __post_init__(self):
        if self.last_accessed is None:
            self.last_accessed = time.time()

    @classmethod
    def create(cls, content: str, compressor: HierarchicalCompressor,
               importance: float = 0.5) -> 'MemoryUnit':
        """Factory method for creating a memory unit."""
        # Determine the compression level
        level = cls._determine_compression_level(importance)
        # Hash the content
        content_hash = hashlib.sha256(content.encode()).hexdigest()
        # Compress the content
        compressed = compressor.compress(content, level)
        # Build the metadata
        metadata = {
            'created': time.time(),
            'importance': max(0.0, min(1.0, importance)),
            'original_length': len(content),
            'compressed_size': len(compressed),
            'compression_ratio': len(compressed) / len(content) if content else 0
        }
        return cls(
            content_hash=content_hash,
            compressed_data=compressed,
            compression_level=level,
            metadata=metadata
        )

    @staticmethod
    def _determine_compression_level(importance: float) -> int:
        """Map importance to a compression level."""
        if importance > 0.8: return 0    # raw
        elif importance > 0.6: return 1  # light
        elif importance > 0.4: return 2  # medium
        elif importance > 0.2: return 3  # aggressive
        else: return 4                   # extreme

    def get_content(self, compressor: HierarchicalCompressor) -> str:
        """Return the decompressed content and update the access statistics."""
        self.access_count += 1
        self.last_accessed = time.time()
        return compressor.decompress(self.compressed_data, self.compression_level)

    def get_info(self) -> dict:
        """Return a summary of the unit."""
        return {
            'hash': self.content_hash[:8],
            'size': self.metadata['compressed_size'],
            'ratio': round(self.metadata['compression_ratio'], 2),
            'importance': self.metadata['importance'],
            'access_count': self.access_count,
            'last_accessed': time.ctime(self.last_accessed)
        }
```
2.2 Memory Pool Implementation
```python
class MemoryPool:
    def __init__(self, max_size_mb: int = 1024, chunker=None, compressor=None):
        self.max_size = max_size_mb * 1024 * 1024  # convert to bytes
        self.current_size = 0
        self.memory_units = {}
        self.chunker = chunker or DynamicChunker()
        self.compressor = compressor or HierarchicalCompressor()
        self.tokenizer = self.compressor.tokenizer
        # Index structures
        self.semantic_index = {}   # {embedding_hash: unit_hash}
        self.temporal_index = []   # [(timestamp, unit_hash)]

    def add_content(self, content: str, importance: float = 0.5) -> List[str]:
        """Add content to the memory pool."""
        # Dynamic chunking
        chunks = self.chunker.chunk_text(content, self.tokenizer)
        if not chunks:
            return []
        added_hashes = []
        for chunk in chunks:
            # Create a memory unit
            unit = MemoryUnit.create(chunk['text'], self.compressor, importance)
            # Skip duplicates
            if unit.content_hash in self.memory_units:
                continue
            # Enforce the memory limit
            self._ensure_capacity(unit.metadata['compressed_size'])
            # Add to the pool
            self.memory_units[unit.content_hash] = unit
            self.current_size += unit.metadata['compressed_size']
            added_hashes.append(unit.content_hash)
            # Update the indexes
            self._update_indexes(unit, chunk.get('embeddings'))
        return added_hashes

    def _ensure_capacity(self, required_size: int):
        """Make sure there is enough room for a new unit."""
        while self.current_size + required_size > self.max_size and self.memory_units:
            # Find the least valuable unit (lowest importance, then least
            # accessed, then least recently used)
            to_remove = min(
                self.memory_units.values(),
                key=lambda u: (
                    u.metadata['importance'],
                    u.access_count,
                    u.last_accessed
                )
            )
            # Evict it
            self._remove_unit(to_remove.content_hash)

    def _remove_unit(self, unit_hash: str):
        """Remove a memory unit."""
        if unit_hash not in self.memory_units:
            return
        unit = self.memory_units[unit_hash]
        self.current_size -= unit.metadata['compressed_size']
        del self.memory_units[unit_hash]
        # Clean up the indexes
        self._clean_indexes(unit_hash)

    def _update_indexes(self, unit: MemoryUnit, embedding: np.ndarray = None):
        """Update the index structures."""
        # Temporal index
        self.temporal_index.append((unit.metadata['created'], unit.content_hash))
        # Semantic index
        if embedding is not None:
            emb_hash = hashlib.sha256(embedding.tobytes()).hexdigest()
            self.semantic_index[emb_hash] = unit.content_hash

    def _clean_indexes(self, unit_hash: str):
        """Clean up the indexes."""
        # Temporal index
        self.temporal_index = [(t, h) for t, h in self.temporal_index if h != unit_hash]
        # Semantic index
        for emb_hash, h in list(self.semantic_index.items()):
            if h == unit_hash:
                del self.semantic_index[emb_hash]

    def retrieve_by_semantics(self, query: str, top_k: int = 5) -> List[dict]:
        """Semantic retrieval."""
        # Embed the query with the same model used by medium-level compression,
        # so it is directly comparable with the stored [CLS] embeddings.
        inputs = self.compressor.tokenizer(query, return_tensors='pt',
                                           truncation=True, max_length=512)
        with torch.no_grad():
            query_embed = self.compressor.model(
                **inputs).last_hidden_state[:, 0, :].numpy().squeeze()
        # Score every memory unit against the query
        scores = []
        for unit in self.memory_units.values():
            # Only the medium compression level stores an embedding
            if unit.compression_level == 2:
                try:
                    data = msgpack.unpackb(unit.compressed_data)
                    if 'embedding' in data:
                        emb = np.array(data['embedding']).squeeze()
                        similarity = float(np.dot(query_embed, emb) / (
                            np.linalg.norm(query_embed) * np.linalg.norm(emb)))
                        scores.append((similarity, unit))
                except Exception:
                    continue
        # Return the top_k most similar units
        scores.sort(key=lambda s: s[0], reverse=True)
        return [dict(unit.get_info(), score=round(sim, 4))
                for sim, unit in scores[:top_k]]
```
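Finally, a brief end-to-end sketch of the memory pool, assuming the classes above are in scope; the sample text and the importance value are placeholders chosen so the chunks land at the medium compression level, where the stored embeddings make semantic retrieval possible.

```python
# End-to-end sketch: ingest a document, then query the pool semantically.
pool = MemoryPool(max_size_mb=64)

report = (
    "The MCP prototype was benchmarked on long transcripts. "
    "Dynamic chunking kept related sentences together. "
    "Hierarchical compression reduced the memory footprint further."
)
hashes = pool.add_content(report, importance=0.5)
print(f"stored {len(hashes)} chunks, {pool.current_size} bytes used")

for hit in pool.retrieve_by_semantics("How was memory usage reduced?", top_k=3):
    print(hit)
```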