目的
将PDF文档拆开,拆开后每个数据是文档中的某一段,目的是保证每条数据都有较完整的语义,并且长度不会太长
项目自述
看了很多切分项目,包括langchain、Langchain-Chatchat、Chinese-LangChain、LangChain-ChatGLM-Webui、ChatPDF、semchunk等等,效果还行,但是不够完美,毕竟它们对"\n"的优先级设置得较高,使用pymupdf得到的文本中充斥着大量的"\n",如果全部删掉也十分影响语义
切分逻辑
1、保持段落完整性
2、保持语义完整性
代码逻辑
1、转换PDF文件为DOCX文件
2、循环遍历paragraphs保持段落完整性
3、 以句号为节点,保持语义完整性
代码实现
python
import re
import os
import csv
from pdf2docx import Converter
from docx import Document
_NON_WHITESPACE_SEMANTIC_SPLITTERS = (
'。\n\n','。\n','。', '?', '!'
';','\n','\r' ')', '"', ''', '】', '......', # Sentence terminators.
'?', '!', '*', # Sentence terminators.
';', ',', '(', ')', '[', ']', """, """, ''', ''', "'", '"', '`', # Clause separators.
':', '---', '...', # Sentence interrupters.
'/', '\\', '--', '&', '-', # Word joiners.
)
def split_text(text, chunk_size=400):
    """Split ``text`` into chunks of at most ``chunk_size`` characters.

    Splitters are tried one at a time, from highest to lowest semantic
    priority; the first splitter that yields chunks all within
    ``chunk_size`` wins. If no splitter succeeds, the text is cut into
    fixed-size slices as a last resort.

    Bug fixed vs. the original: the splitter used to be re-appended to
    *every* part — including the final one, and even when the splitter
    never occurred in the text — injecting characters that were not in
    the input. Splitters are now re-attached only where they actually
    occurred.
    """
    for splitter in _NON_WHITESPACE_SEMANTIC_SPLITTERS:
        parts = text.split(splitter)
        if len(parts) == 1:
            # Splitter not present in the text; try the next one.
            continue
        # Re-attach the splitter to every part except the last, so joining
        # the pieces reproduces the original text exactly.
        pieces = [part + splitter for part in parts[:-1]] + [parts[-1]]
        chunks = []
        current = ''
        for piece in pieces:
            if current and len(current) + len(piece) > chunk_size:
                chunks.append(current.strip())
                current = piece
            else:
                current += piece
        if current.strip():
            chunks.append(current.strip())
        # Accept this splitter only if every resulting chunk fits.
        if all(len(chunk) <= chunk_size for chunk in chunks):
            return chunks
    # No splitter produced valid chunks: fall back to fixed-size slicing.
    return [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
def pdf_to_docx(pdf_file_path):
    """Convert a PDF into a DOCX file placed next to it.

    Returns the path of the generated DOCX on success, or ``False`` if the
    conversion fails (the error is printed, not raised).
    """
    try:
        # os.path.splitext (not split('.')) so filenames that contain dots,
        # e.g. "report.v2.pdf", keep their full stem.
        stem, _ = os.path.splitext(pdf_file_path)
        docx_path = stem + ".docx"
        cv = Converter(pdf_file_path)
        try:
            cv.convert(docx_path)
        finally:
            # Release the PDF handle even if conversion raises.
            cv.close()
        return docx_path
    except Exception as e:
        print(f"转换过程中发生错误:{str(e)}")
        return False
def pdf2docx_to_csv(pdf_file_path, max_length=400):
    """Convert a PDF to DOCX, chunk its paragraphs, and write them to a CSV.

    Each paragraph is normalized, oversized paragraphs are split with
    ``split_text``, and consecutive pieces are merged greedily up to
    ``max_length`` characters, preferring to break at the last full stop
    ('。') so sentences stay intact.

    Returns the CSV output path on success, or ``False`` if the PDF→DOCX
    conversion fails.

    Bug fixed vs. the original: ``writerows`` was fed a list of bare
    strings, so every *character* of each chunk became its own CSV column;
    rows are now ``[filename, text]`` pairs matching the header.
    """
    docx_path = pdf_to_docx(pdf_file_path)
    if not docx_path:
        return False
    docx = Document(docx_path)
    result = []
    current_text = ""
    for paragraph in docx.paragraphs:
        sections = paragraph.text.strip()
        sections = re.sub(r'\s+', ' ', sections)          # collapse whitespace runs
        sections = re.sub(r'(.)\1{4,}', r'\1', sections)  # squash 5+ repeats (e.g. dot leaders)
        if len(sections) > max_length:
            # Aim for evenly sized chunks instead of one big + one tiny one.
            chunk_size = len(sections) // ((len(sections) // max_length) + 1)
            sections = split_text(sections, chunk_size=chunk_size)
        if isinstance(sections, str):
            sections = [sections]
        for section in sections:
            if not current_text:
                current_text = section
            elif len(current_text) + len(section) + 1 <= max_length:
                current_text += " " + section
            else:
                # Overflow: prefer to cut at the last full stop to keep
                # sentences whole; the remainder carries over.
                period_index = current_text.rfind('。')
                if period_index != -1:
                    result.append(current_text[:period_index + 1].strip())
                    carry = current_text[period_index + 1:].strip()
                    current_text = (carry + " " + section).strip()
                else:
                    result.append(current_text.strip())
                    current_text = section
    if current_text.strip():
        result.append(current_text.strip())
    base = os.path.basename(pdf_file_path)
    # splitext keeps stems that contain dots intact.
    stem = os.path.splitext(base)[0]
    output_path = os.path.join(os.path.dirname(pdf_file_path),
                               stem + "_pdf2docx_" + ".csv")
    with open(output_path, 'w', newline='', encoding='utf-8') as csvfile:
        csvwriter = csv.writer(csvfile)
        csvwriter.writerow(['filename', 'text'])
        # One row per chunk; passing bare strings to writerows would split
        # each chunk into one column per character.
        csvwriter.writerows([base, text] for text in result)
    print(f"{pdf_file_path} 处理完成")
    return output_path
if __name__ == "__main__":
    # Replace with the PDF you want to process before running.
    source_pdf = "/path/to/your/xxx.pdf"
    pdf2docx_to_csv(source_pdf)
如果觉得好用就点个赞!