pdf、docx、markdown、txt提取文档内容,可以应用于rag文档解析

返回的是文档解析分段内容组成的列表,分段内容默认chunk_size: int = 250, chunk_overlap: int = 50,250字分段,50分段处保留后面一段的前50字拼接即窗口包含下下一段前面50个字划分

cpp 复制代码
from typing import Union, List

import jieba
import re




class SentenceSplitter:
    def __init__(self, chunk_size: int = 250, chunk_overlap: int = 50):
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

    def split_text(self, text: str) -> List[str]:
        if self._is_has_chinese(text):
            return self._split_chinese_text(text)
        else:
            return self._split_english_text(text)

    def _split_chinese_text(self, text: str) -> List[str]:
        sentence_endings = {'\n', '。', '!', '?', ';', '...'}  # 句末标点符号
        chunks, current_chunk = [], ''
        for word in jieba.cut(text):
            if len(current_chunk) + len(word) > self.chunk_size:
                chunks.append(current_chunk.strip())
                current_chunk = word
            else:
                current_chunk += word
            if word[-1] in sentence_endings and len(current_chunk) > self.chunk_size - self.chunk_overlap:
                chunks.append(current_chunk.strip())
                current_chunk = ''
        if current_chunk:
            chunks.append(current_chunk.strip())
        if self.chunk_overlap > 0 and len(chunks) > 1:
            chunks = self._handle_overlap(chunks)
        return chunks

    def _split_english_text(self, text: str) -> List[str]:
        # 使用正则表达式按句子分割英文文本
        sentences = re.split(r'(?<=[.!?])\s+', text.replace('\n', ' '))
        chunks, current_chunk = [], ''
        for sentence in sentences:
            if len(current_chunk) + len(sentence) <= self.chunk_size or not current_chunk:
                current_chunk += (' ' if current_chunk else '') + sentence
            else:
                chunks.append(current_chunk)
                current_chunk = sentence
        if current_chunk:  # Add the last chunk
            chunks.append(current_chunk)

        if self.chunk_overlap > 0 and len(chunks) > 1:
            chunks = self._handle_overlap(chunks)

        return chunks

    def _is_has_chinese(self, text: str) -> bool:
        # check if contains chinese characters
        if any("\u4e00" <= ch <= "\u9fff" for ch in text):
            return True
        else:
            return False

    def _handle_overlap(self, chunks: List[str]) -> List[str]:
        # 处理块间重叠
        overlapped_chunks = []
        for i in range(len(chunks) - 1):
            chunk = chunks[i] + ' ' + chunks[i + 1][:self.chunk_overlap]
            overlapped_chunks.append(chunk.strip())
        overlapped_chunks.append(chunks[-1])
        return overlapped_chunks


text_splitter = SentenceSplitter()

def load_file(filepath):
    print("filepath:",filepath)
    if filepath.endswith(".md"):
        contents = extract_text_from_markdown(filepath)

    elif filepath.endswith(".pdf"):
        contents = extract_text_from_pdf(filepath)
    elif filepath.endswith('.docx'):
        contents = extract_text_from_docx(filepath)
    else:
        contents = extract_text_from_txt(filepath)
    return contents

  
def extract_text_from_pdf(file_path: str):
    """Extract text content from a PDF file."""
    import PyPDF2
    contents = []
    with open(file_path, 'rb') as f:
        pdf_reader = PyPDF2.PdfReader(f)
        for page in pdf_reader.pages:
            page_text = page.extract_text().strip()
            raw_text = [text.strip() for text in page_text.splitlines() if text.strip()]
            new_text = ''
            for text in raw_text:
                new_text += text
                if text[-1] in ['.', '!', '?', '。', '!', '?', '...', ';', ';', ':', ':', '"', ''', ')', '】', '》', '」',
                                '』', '〕', '〉', '》', '〗', '〞', '〟', '>>', '"', "'", ')', ']', '}']:
                    contents.append(new_text)
                    new_text = ''
            if new_text:
                contents.append(new_text)
    return contents

def extract_text_from_txt(file_path: str):
    """Extract text content from a TXT file."""
    with open(file_path, 'r', encoding='utf-8') as f:
        contents = [text.strip() for text in f.readlines() if text.strip()]
    return contents

def extract_text_from_docx(file_path: str):
    """Extract text content from a DOCX file."""
    import docx
    document = docx.Document(file_path)
    contents = [paragraph.text.strip() for paragraph in document.paragraphs if paragraph.text.strip()]
    return contents

def extract_text_from_markdown(file_path: str):
    """Extract text content from a Markdown file."""
    import markdown
    from bs4 import BeautifulSoup
    with open(file_path, 'r', encoding='utf-8') as f:
        markdown_text = f.read()
    html = markdown.markdown(markdown_text)
    soup = BeautifulSoup(html, 'html.parser')
    contents = [text.strip() for text in soup.get_text().splitlines() if text.strip()]
    return contents



texts = load_file(r"C:\Users\lo***山市城市建筑外立面管理条例.docx")
print(texts)
相关推荐
西阳未落2 小时前
C++基础(21)——内存管理
开发语言·c++·面试
我的xiaodoujiao2 小时前
Windows系统Web UI自动化测试学习系列2--环境搭建--Python-PyCharm-Selenium
开发语言·python·测试工具
callJJ2 小时前
从 0 开始理解 Spring 的核心思想 —— IoC 和 DI(2)
java·开发语言·后端·spring·ioc·di
hsjkdhs4 小时前
万字详解C++之构造函数析构函数
开发语言·c++
Lin_Aries_04214 小时前
容器化简单的 Java 应用程序
java·linux·运维·开发语言·docker·容器·rpc
techdashen5 小时前
12分钟讲解Python核心理念
开发语言·python
山海不说话5 小时前
Java后端面经(八股——Redis)
java·开发语言·redis
郝学胜-神的一滴5 小时前
谨慎地迭代函数所收到的参数 (Effective Python 第31条)
开发语言·python·程序人生·软件工程
大虾别跑6 小时前
vc无法启动
java·开发语言
郭老二6 小时前
【JAVA】从入门到放弃-01-HelloWorld
java·开发语言