pdf、docx、markdown、txt提取文档内容,可以应用于rag文档解析

返回的是文档解析分段内容组成的列表,分段内容默认chunk_size: int = 250, chunk_overlap: int = 50,250字分段,50分段处保留后面一段的前50字拼接即窗口包含下下一段前面50个字划分

cpp 复制代码
from typing import Union, List

import jieba
import re




class SentenceSplitter:
    def __init__(self, chunk_size: int = 250, chunk_overlap: int = 50):
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

    def split_text(self, text: str) -> List[str]:
        if self._is_has_chinese(text):
            return self._split_chinese_text(text)
        else:
            return self._split_english_text(text)

    def _split_chinese_text(self, text: str) -> List[str]:
        sentence_endings = {'\n', '。', '!', '?', ';', '...'}  # 句末标点符号
        chunks, current_chunk = [], ''
        for word in jieba.cut(text):
            if len(current_chunk) + len(word) > self.chunk_size:
                chunks.append(current_chunk.strip())
                current_chunk = word
            else:
                current_chunk += word
            if word[-1] in sentence_endings and len(current_chunk) > self.chunk_size - self.chunk_overlap:
                chunks.append(current_chunk.strip())
                current_chunk = ''
        if current_chunk:
            chunks.append(current_chunk.strip())
        if self.chunk_overlap > 0 and len(chunks) > 1:
            chunks = self._handle_overlap(chunks)
        return chunks

    def _split_english_text(self, text: str) -> List[str]:
        # 使用正则表达式按句子分割英文文本
        sentences = re.split(r'(?<=[.!?])\s+', text.replace('\n', ' '))
        chunks, current_chunk = [], ''
        for sentence in sentences:
            if len(current_chunk) + len(sentence) <= self.chunk_size or not current_chunk:
                current_chunk += (' ' if current_chunk else '') + sentence
            else:
                chunks.append(current_chunk)
                current_chunk = sentence
        if current_chunk:  # Add the last chunk
            chunks.append(current_chunk)

        if self.chunk_overlap > 0 and len(chunks) > 1:
            chunks = self._handle_overlap(chunks)

        return chunks

    def _is_has_chinese(self, text: str) -> bool:
        # check if contains chinese characters
        if any("\u4e00" <= ch <= "\u9fff" for ch in text):
            return True
        else:
            return False

    def _handle_overlap(self, chunks: List[str]) -> List[str]:
        # 处理块间重叠
        overlapped_chunks = []
        for i in range(len(chunks) - 1):
            chunk = chunks[i] + ' ' + chunks[i + 1][:self.chunk_overlap]
            overlapped_chunks.append(chunk.strip())
        overlapped_chunks.append(chunks[-1])
        return overlapped_chunks


text_splitter = SentenceSplitter()

def load_file(filepath):
    print("filepath:",filepath)
    if filepath.endswith(".md"):
        contents = extract_text_from_markdown(filepath)

    elif filepath.endswith(".pdf"):
        contents = extract_text_from_pdf(filepath)
    elif filepath.endswith('.docx'):
        contents = extract_text_from_docx(filepath)
    else:
        contents = extract_text_from_txt(filepath)
    return contents

  
def extract_text_from_pdf(file_path: str):
    """Extract text content from a PDF file."""
    import PyPDF2
    contents = []
    with open(file_path, 'rb') as f:
        pdf_reader = PyPDF2.PdfReader(f)
        for page in pdf_reader.pages:
            page_text = page.extract_text().strip()
            raw_text = [text.strip() for text in page_text.splitlines() if text.strip()]
            new_text = ''
            for text in raw_text:
                new_text += text
                if text[-1] in ['.', '!', '?', '。', '!', '?', '...', ';', ';', ':', ':', '"', ''', ')', '】', '》', '」',
                                '』', '〕', '〉', '》', '〗', '〞', '〟', '>>', '"', "'", ')', ']', '}']:
                    contents.append(new_text)
                    new_text = ''
            if new_text:
                contents.append(new_text)
    return contents

def extract_text_from_txt(file_path: str):
    """Extract text content from a TXT file."""
    with open(file_path, 'r', encoding='utf-8') as f:
        contents = [text.strip() for text in f.readlines() if text.strip()]
    return contents

def extract_text_from_docx(file_path: str):
    """Extract text content from a DOCX file."""
    import docx
    document = docx.Document(file_path)
    contents = [paragraph.text.strip() for paragraph in document.paragraphs if paragraph.text.strip()]
    return contents

def extract_text_from_markdown(file_path: str):
    """Extract text content from a Markdown file."""
    import markdown
    from bs4 import BeautifulSoup
    with open(file_path, 'r', encoding='utf-8') as f:
        markdown_text = f.read()
    html = markdown.markdown(markdown_text)
    soup = BeautifulSoup(html, 'html.parser')
    contents = [text.strip() for text in soup.get_text().splitlines() if text.strip()]
    return contents



texts = load_file(r"C:\Users\lo***山市城市建筑外立面管理条例.docx")
print(texts)
相关推荐
Access开发易登软件30 分钟前
Access开发导出PDF的N种姿势,你get了吗?
后端·低代码·pdf·excel·vba·access·access开发
中国胖子风清扬1 小时前
Rust 序列化技术全解析:从基础到实战
开发语言·c++·spring boot·vscode·后端·中间件·rust
我就是全世界1 小时前
【存储选型终极指南】RustFS vs MinIO:5大维度深度对决,95%技术团队的选择秘密!
开发语言·分布式·rust·存储
_oP_i1 小时前
WinForms 项目里生成时选择“首选目标平台 32 位导致有些电脑在获取office word对象时获取不到
c#·office
yudiandian20142 小时前
【QT 5.12.12 打包-Windows 平台下】
开发语言·qt
要记得喝水2 小时前
C#某公司面试题(含题目和解析)--1
开发语言·windows·面试·c#·.net
金融数据出海2 小时前
黄金金融期货数据API对接技术文档
开发语言·金融·github
诗书画唱2 小时前
【前端教程】JavaScript 实现图片鼠标悬停切换效果与==和=的区别
开发语言·前端·javascript