长文本处理技术综述:突破上下文限制
前言
大模型的上下文窗口是有限的,但很多应用场景需要处理超长文本。如何高效处理长文本是大模型应用开发中的重要挑战。
我在项目中处理过各种长文本场景,从法律文档分析到代码仓库理解。今天分享一些常用的长文本处理技术。
文本分块技术
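基于长度的分块
注意:下面的实现用 text.split() 按空白切词,chunk_size 和 overlap 的单位是“词”,而不是字符或 token;对中文等不含空格的文本,需要换用按字符或按 token 的切分方式。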
python
class FixedSizeChunker:
    """Split text into fixed-size, overlapping chunks of words.

    Sizes are measured in whitespace-separated words as produced by
    ``str.split()``, not in characters or model tokens.
    """

    def __init__(self, chunk_size: int = 512, overlap: int = 50):
        """Configure the chunker.

        Args:
            chunk_size: Maximum number of words per chunk; must be positive.
            overlap: Number of words shared by two consecutive chunks; must
                satisfy ``0 <= overlap < chunk_size``.

        Raises:
            ValueError: If the sizes would make the window stand still or
                move backwards (``chunk_size - overlap <= 0``).
        """
        if chunk_size <= 0:
            raise ValueError("chunk_size must be a positive integer")
        if not 0 <= overlap < chunk_size:
            raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")
        self.chunk_size = chunk_size
        self.overlap = overlap

    def chunk(self, text: str) -> list:
        """Return the chunks of ``text`` in order.

        Args:
            text: Input text; it is split on any run of whitespace.

        Returns:
            A list of strings, each holding at most ``chunk_size`` words
            joined by single spaces. Empty or whitespace-only input yields
            an empty list.
        """
        words = text.split()
        step = self.chunk_size - self.overlap
        chunks = []
        for start in range(0, len(words), step):
            end = start + self.chunk_size
            chunks.append(" ".join(words[start:end]))
            if end >= len(words):
                # This chunk already reaches the last word; any further
                # chunk would only repeat words it contains.
                break
        return chunks
基于语义的分块
python
import re
class SemanticChunker:
"""语义分块"""
def __init__(self, max_tokens: int = 512):
self.max_tokens = max_tokens
def chunk(self, text: str) -> list:
"""按语义边界分块"""
# 按段落分割
paragraphs = re.split(r'\n\n+', text)
chunks = []
current_chunk = []
current_size = 0
for para in paragraphs:
para_size = self._count_tokens(para)
if current_size + para_size > self.max_tokens:
if current_chunk:
chunks.append("\n\n".join(current_chunk))
current_chunk = [para]
current_size = para_size
else:
current_chunk.append(para)
current_size += para_size
if current_chunk:
chunks.append("\n\n".join(current_chunk))
return chunks
def _count_tokens(self, text: str) -> int:
"""估算 token 数量"""
return len(text) // 4
Map-Reduce 策略
python
class MapReduceProcessor:
    """Run a task over long text with a map step per chunk and one reduce step."""

    def __init__(self, llm, chunk_size: int = 512):
        """Store the model client and build the chunker.

        Args:
            llm: Object exposing ``generate(prompt)``; its result is used as
                a string when results are joined.
            chunk_size: Forwarded to ``FixedSizeChunker`` as ``chunk_size``.
        """
        self.llm = llm
        self.chunker = FixedSizeChunker(chunk_size=chunk_size)

    def process(self, task: str, text: str) -> str:
        """Process ``text`` for ``task`` and return the model's final answer.

        Args:
            task: Task description inserted into every prompt.
            text: The long input text to be chunked.

        Returns:
            Whatever ``llm.generate`` returns for the final reduce prompt.
        """
        pieces = self.chunker.chunk(text)
        total = len(pieces)

        # Map stage: one model call per chunk, numbered from 1.
        partials = [
            self.llm.generate(
                f"任务:{task}\n"
                f"文本片段 {index}/{total}:\n"
                f"{piece}\n"
                "请提取与任务相关的信息:"
            )
            for index, piece in enumerate(pieces, start=1)
        ]

        # Reduce stage: merge the per-chunk results into one final prompt.
        merged = "\n\n".join(partials)
        return self.llm.generate(
            f"任务:{task}\n"
            "各部分分析结果:\n"
            f"{merged}\n"
            "请综合以上信息给出最终回答:"
        )
滑动窗口技术
python
class SlidingWindowProcessor:
    """Analyse long text window by window, then synthesize one answer.

    Window size and step are measured in whitespace-separated words.
    """

    def __init__(self, llm, window_size: int = 512, step: int = 256):
        """Configure the processor.

        Args:
            llm: Object exposing ``generate(prompt)``; its result is used as
                a string when results are joined.
            window_size: Number of words per window; must be positive.
            step: Number of words the window advances each time; must be
                positive.

        Raises:
            ValueError: If ``window_size`` or ``step`` is not positive.
        """
        if window_size <= 0:
            raise ValueError("window_size must be a positive integer")
        if step <= 0:
            raise ValueError("step must be a positive integer")
        self.llm = llm
        self.window_size = window_size
        self.step = step

    def process(self, task: str, text: str) -> str:
        """Run ``task`` over every window of ``text`` and combine the results.

        Args:
            task: Task description inserted into every prompt.
            text: Input text; it is split on any run of whitespace.

        Returns:
            Whatever ``llm.generate`` returns for the synthesis prompt.
        """
        words = text.split()
        results = []
        for start in range(0, len(words), self.step):
            end = start + self.window_size
            window_text = " ".join(words[start:end])
            prompt = (
                f"任务:{task}\n"
                f"文本:{window_text}\n"
                "分析:"
            )
            results.append(self.llm.generate(prompt))
            if end >= len(words):
                # This window already covers the tail of the text; later
                # windows would only re-send words it contains.
                break
        return self._synthesize(results, task)

    def _synthesize(self, results: list, task: str) -> str:
        """Ask the model for one combined answer from the per-window results."""
        combined = "\n\n".join(results)
        prompt = (
            "基于以下分析结果,给出综合回答:\n"
            f"{combined}\n"
            f"任务:{task}\n"
            "综合回答:"
        )
        return self.llm.generate(prompt)
递归总结
python
class RecursiveSummarizer:
    """Summarize long text by recursively halving it and merging summaries."""

    def __init__(self, llm, target_length: int = 500):
        """Store the model client and the length threshold.

        Args:
            llm: Object exposing ``generate(prompt)``.
            target_length: Estimated-token count at or below which text is
                returned unchanged instead of being summarized.
        """
        self.llm = llm
        self.target_length = target_length

    def summarize(self, text: str) -> str:
        """Return ``text`` if short enough, otherwise a model-written summary.

        Text above the threshold is split in half, each half is summarized
        recursively, and the model is asked to summarize the two results.

        Args:
            text: The text to condense.

        Returns:
            ``text`` itself when it is within ``target_length`` or cannot be
            split any further; otherwise the result of ``llm.generate``.
        """
        if self._count_tokens(text) <= self.target_length:
            return text

        left, right = self._split_in_half(text)
        if not left or not right:
            # Nothing left to divide; recursing would never terminate.
            return text

        left_summary = self.summarize(left)
        right_summary = self.summarize(right)

        combined = f"{left_summary}\n\n{right_summary}"
        prompt = (
            "请总结以下内容:\n"
            f"{combined}\n"
            "总结:"
        )
        return self.llm.generate(prompt)

    def _split_in_half(self, text: str) -> tuple:
        """Split at the middle word, or the middle character if needed."""
        words = text.split()
        if len(words) >= 2:
            mid = len(words) // 2
            return " ".join(words[:mid]), " ".join(words[mid:])
        # Fewer than two words (one long token, or text without spaces such
        # as Chinese): a word split would hand back the same text, so fall
        # back to cutting at the character midpoint.
        mid = len(text) // 2
        return text[:mid], text[mid:]

    def _count_tokens(self, text: str) -> int:
        """Estimate the token count as one token per four characters."""
        return len(text) // 4
实际应用
python
class LongTextAnalyzer:
    """Analyse documents on disk and answer questions about them."""

    def __init__(self, llm):
        """Store the model client and build the map-reduce processor.

        Args:
            llm: Object exposing ``generate(prompt)``.
        """
        self.llm = llm
        self.processor = MapReduceProcessor(llm)

    def analyze_document(self, document_path: str, task: str) -> str:
        """Read a document and run ``task`` over it with the processor.

        Args:
            document_path: Path of a UTF-8 text file.
            task: Task description passed to the processor.

        Returns:
            The result of ``self.processor.process(task, text)``.
        """
        text = self._read_document(document_path)
        return self.processor.process(task, text)

    def answer_question(self, document_path: str, question: str) -> str:
        """Answer ``question`` from selected chunks of a document.

        Args:
            document_path: Path of a UTF-8 text file.
            question: The question inserted into the prompt.

        Returns:
            Whatever ``llm.generate`` returns for the question prompt.
        """
        text = self._read_document(document_path)

        # Retrieval-style flow: chunk, select chunks, answer from them only.
        chunks = FixedSizeChunker().chunk(text)
        relevant_chunks = self._find_relevant(chunks, question)

        context = "\n\n".join(relevant_chunks)
        prompt = (
            "基于以下内容回答问题:\n"
            f"{context}\n"
            f"问题:{question}\n"
            "回答:"
        )
        return self.llm.generate(prompt)

    def _read_document(self, document_path: str) -> str:
        """Return the file's full text, decoded as UTF-8.

        The encoding is explicit so that non-ASCII documents decode the
        same way on every platform rather than by the locale default.
        """
        with open(document_path, "r", encoding="utf-8") as f:
            return f.read()

    def _find_relevant(self, chunks: list, query: str, top_k: int = 3) -> list:
        """Select the chunks used as context for a question.

        Simplified placeholder: ``query`` is not consulted and the first
        ``top_k`` chunks are returned in their original order.

        Args:
            chunks: Candidate chunks in document order.
            query: The question; currently unused.
            top_k: Maximum number of chunks to return; values below zero
                are treated as zero.

        Returns:
            A list with at most ``top_k`` chunks.
        """
        return chunks[:max(top_k, 0)]
总结
长文本处理技术:
- 分块策略:固定大小 vs 语义分块
- Map-Reduce:分而治之的经典方法
- 滑动窗口:处理连续文本
- 递归总结:层次化压缩
关键要点:
- 根据任务选择合适的分块策略
- 分块时保持一定的重叠,避免块边界处的信息丢失
- 对于问答任务,优先检索相关部分
- 考虑使用向量数据库进行语义检索