docs.langchain.com/oss/python/...
py
# Usage example of the library splitter (presumably copied from the LangChain
# docs referenced at the top of this file — confirm).
from langchain_text_splitters import RecursiveCharacterTextSplitter
# Build a splitter configured with chunk_size=100 and chunk_overlap=0.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=0)
# NOTE(review): `document` is not defined anywhere in the visible source; it
# must be bound to the input text before this line runs — confirm.
texts = text_splitter.split_text(document)
py
#from langchain.text_splitter import RecursiveCharacterTextSplitter
class RecursiveCharacterTextSplitter:
    """Minimal recursive text splitter driven by a prioritized separator list.

    The text is split on the first separator; every piece except the last
    gets the separator re-attached so no characters are lost.  A piece that
    is still longer than ``chunk_size`` is split again with the remaining
    separators, and once the separators are exhausted (or an empty-string
    separator is reached) it is cut into fixed-size slices.

    Pieces are emitted individually: short neighbouring pieces are not merged
    back together, and ``chunk_overlap`` is stored but not applied.
    """

    def __init__(self, chunk_size=128, chunk_overlap=30, separators=None):
        """Store the splitter configuration.

        Args:
            chunk_size: Maximum number of characters in one chunk.
            chunk_overlap: Kept as an attribute only; ``split_text`` does not
                use it.
            separators: Separators ordered from coarsest to finest.  When
                ``None``, ``["\\n\\n", "\\n", " ", ""]`` is used.
        """
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        if separators is None:
            self.separators = ["\n\n", "\n", " ", ""]
        else:
            self.separators = separators

    def split_text(self, text):
        """Split ``text`` into chunks of at most ``chunk_size`` characters.

        Args:
            text: The string to split.

        Returns:
            A list of chunk strings in document order.  Chunks that contain
            only whitespace are dropped.
        """

        def recursive_split(txt, seps):
            # No separators left, or the empty-string separator reached:
            # fall back to fixed-size slices.  "" must be handled here because
            # str.split("") raises ValueError("empty separator").
            if not seps or seps[0] == "":
                return [
                    txt[i:i + self.chunk_size]
                    for i in range(0, len(txt), self.chunk_size)
                ]
            # Take the coarsest remaining separator and split on it.
            sep = seps[0]
            parts = txt.split(sep)
            last_index = len(parts) - 1
            result = []
            for index, part in enumerate(parts):
                # Re-attach the separator to every piece but the last.  The
                # check is positional: comparing values would skip earlier
                # pieces that merely happen to equal the last one.
                if index != last_index:
                    part = part + sep
                # Still too long: recurse with the finer separators.
                if len(part) > self.chunk_size:
                    result.extend(recursive_split(part, seps[1:]))
                else:
                    result.append(part)
            return result

        # Run the recursive split and discard whitespace-only chunks.
        return [
            s for s in recursive_split(text, self.separators) if s.strip()
        ]
# Demo: split `text` with sentence- and clause-level separators and print
# each resulting chunk.
r_splitter = RecursiveCharacterTextSplitter(
    chunk_size=50,  # each chunk holds at most 50 characters
    # chunk_overlap is not passed here, so the constructor default is used.
    separators=[
        "\n\n",  # paragraph break
        "\n",  # line break
        ".",  # English full stop
        "。",  # Chinese full stop
        ",",  # English comma
        "\uff0c",  # Chinese (full-width) comma; written as an escape because
        # it is easily confused with the ASCII comma above
    ],
)
# Placeholder input: put the text to split between the triple quotes.
text = """"""
chunks = r_splitter.split_text(text)
# Number the chunks starting from 1 when printing.
for i, chunk in enumerate(chunks, start=1):
    print(f"Chunk{i}: {chunk}")