pdf、docx、markdown、txt提取文档内容,可以应用于rag文档解析

返回的是文档解析分段内容组成的列表,分段内容默认chunk_size: int = 250, chunk_overlap: int = 50,250字分段,50分段处保留后面一段的前50字拼接即窗口包含下下一段前面50个字划分

cpp 复制代码
from typing import Union, List

import jieba
import re




class SentenceSplitter:
    def __init__(self, chunk_size: int = 250, chunk_overlap: int = 50):
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

    def split_text(self, text: str) -> List[str]:
        if self._is_has_chinese(text):
            return self._split_chinese_text(text)
        else:
            return self._split_english_text(text)

    def _split_chinese_text(self, text: str) -> List[str]:
        sentence_endings = {'\n', '。', '!', '?', ';', '...'}  # 句末标点符号
        chunks, current_chunk = [], ''
        for word in jieba.cut(text):
            if len(current_chunk) + len(word) > self.chunk_size:
                chunks.append(current_chunk.strip())
                current_chunk = word
            else:
                current_chunk += word
            if word[-1] in sentence_endings and len(current_chunk) > self.chunk_size - self.chunk_overlap:
                chunks.append(current_chunk.strip())
                current_chunk = ''
        if current_chunk:
            chunks.append(current_chunk.strip())
        if self.chunk_overlap > 0 and len(chunks) > 1:
            chunks = self._handle_overlap(chunks)
        return chunks

    def _split_english_text(self, text: str) -> List[str]:
        # 使用正则表达式按句子分割英文文本
        sentences = re.split(r'(?<=[.!?])\s+', text.replace('\n', ' '))
        chunks, current_chunk = [], ''
        for sentence in sentences:
            if len(current_chunk) + len(sentence) <= self.chunk_size or not current_chunk:
                current_chunk += (' ' if current_chunk else '') + sentence
            else:
                chunks.append(current_chunk)
                current_chunk = sentence
        if current_chunk:  # Add the last chunk
            chunks.append(current_chunk)

        if self.chunk_overlap > 0 and len(chunks) > 1:
            chunks = self._handle_overlap(chunks)

        return chunks

    def _is_has_chinese(self, text: str) -> bool:
        # check if contains chinese characters
        if any("\u4e00" <= ch <= "\u9fff" for ch in text):
            return True
        else:
            return False

    def _handle_overlap(self, chunks: List[str]) -> List[str]:
        # 处理块间重叠
        overlapped_chunks = []
        for i in range(len(chunks) - 1):
            chunk = chunks[i] + ' ' + chunks[i + 1][:self.chunk_overlap]
            overlapped_chunks.append(chunk.strip())
        overlapped_chunks.append(chunks[-1])
        return overlapped_chunks


text_splitter = SentenceSplitter()

def load_file(filepath):
    print("filepath:",filepath)
    if filepath.endswith(".md"):
        contents = extract_text_from_markdown(filepath)

    elif filepath.endswith(".pdf"):
        contents = extract_text_from_pdf(filepath)
    elif filepath.endswith('.docx'):
        contents = extract_text_from_docx(filepath)
    else:
        contents = extract_text_from_txt(filepath)
    return contents

  
def extract_text_from_pdf(file_path: str):
    """Extract text content from a PDF file."""
    import PyPDF2
    contents = []
    with open(file_path, 'rb') as f:
        pdf_reader = PyPDF2.PdfReader(f)
        for page in pdf_reader.pages:
            page_text = page.extract_text().strip()
            raw_text = [text.strip() for text in page_text.splitlines() if text.strip()]
            new_text = ''
            for text in raw_text:
                new_text += text
                if text[-1] in ['.', '!', '?', '。', '!', '?', '...', ';', ';', ':', ':', '"', ''', ')', '】', '》', '」',
                                '』', '〕', '〉', '》', '〗', '〞', '〟', '>>', '"', "'", ')', ']', '}']:
                    contents.append(new_text)
                    new_text = ''
            if new_text:
                contents.append(new_text)
    return contents

def extract_text_from_txt(file_path: str):
    """Extract text content from a TXT file."""
    with open(file_path, 'r', encoding='utf-8') as f:
        contents = [text.strip() for text in f.readlines() if text.strip()]
    return contents

def extract_text_from_docx(file_path: str):
    """Extract text content from a DOCX file."""
    import docx
    document = docx.Document(file_path)
    contents = [paragraph.text.strip() for paragraph in document.paragraphs if paragraph.text.strip()]
    return contents

def extract_text_from_markdown(file_path: str):
    """Extract text content from a Markdown file."""
    import markdown
    from bs4 import BeautifulSoup
    with open(file_path, 'r', encoding='utf-8') as f:
        markdown_text = f.read()
    html = markdown.markdown(markdown_text)
    soup = BeautifulSoup(html, 'html.parser')
    contents = [text.strip() for text in soup.get_text().splitlines() if text.strip()]
    return contents



texts = load_file(r"C:\Users\lo***山市城市建筑外立面管理条例.docx")
print(texts)
相关推荐
waicsdn_haha11 分钟前
Java/JDK下载、安装及环境配置超详细教程【Windows10、macOS和Linux图文详解】
java·运维·服务器·开发语言·windows·后端·jdk
_WndProc13 分钟前
C++ 日志输出
开发语言·c++·算法
web1478621072314 分钟前
C# .Net Web 路由相关配置
前端·c#·.net
qq_4335545422 分钟前
C++ 面向对象编程:+号运算符重载,左移运算符重载
开发语言·c++
数据小爬虫@41 分钟前
如何高效利用Python爬虫按关键字搜索苏宁商品
开发语言·爬虫·python
ZJ_.42 分钟前
WPSJS:让 WPS 办公与 JavaScript 完美联动
开发语言·前端·javascript·vscode·ecmascript·wps
Narutolxy1 小时前
深入探讨 Go 中的高级表单验证与翻译:Gin 与 Validator 的实践之道20241223
开发语言·golang·gin
Hello.Reader1 小时前
全面解析 Golang Gin 框架
开发语言·golang·gin
禁默1 小时前
深入浅出:AWT的基本组件及其应用
java·开发语言·界面编程
Jasmine_llq1 小时前
《 火星人 》
算法·青少年编程·c#