pdf、docx、markdown、txt提取文档内容,可以应用于rag文档解析

返回的是文档解析分段内容组成的列表,分段内容默认chunk_size: int = 250, chunk_overlap: int = 50,250字分段,50分段处保留后面一段的前50字拼接即窗口包含下下一段前面50个字划分

cpp 复制代码
from typing import Union, List

import jieba
import re




class SentenceSplitter:
    def __init__(self, chunk_size: int = 250, chunk_overlap: int = 50):
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

    def split_text(self, text: str) -> List[str]:
        if self._is_has_chinese(text):
            return self._split_chinese_text(text)
        else:
            return self._split_english_text(text)

    def _split_chinese_text(self, text: str) -> List[str]:
        sentence_endings = {'\n', '。', '!', '?', ';', '...'}  # 句末标点符号
        chunks, current_chunk = [], ''
        for word in jieba.cut(text):
            if len(current_chunk) + len(word) > self.chunk_size:
                chunks.append(current_chunk.strip())
                current_chunk = word
            else:
                current_chunk += word
            if word[-1] in sentence_endings and len(current_chunk) > self.chunk_size - self.chunk_overlap:
                chunks.append(current_chunk.strip())
                current_chunk = ''
        if current_chunk:
            chunks.append(current_chunk.strip())
        if self.chunk_overlap > 0 and len(chunks) > 1:
            chunks = self._handle_overlap(chunks)
        return chunks

    def _split_english_text(self, text: str) -> List[str]:
        # 使用正则表达式按句子分割英文文本
        sentences = re.split(r'(?<=[.!?])\s+', text.replace('\n', ' '))
        chunks, current_chunk = [], ''
        for sentence in sentences:
            if len(current_chunk) + len(sentence) <= self.chunk_size or not current_chunk:
                current_chunk += (' ' if current_chunk else '') + sentence
            else:
                chunks.append(current_chunk)
                current_chunk = sentence
        if current_chunk:  # Add the last chunk
            chunks.append(current_chunk)

        if self.chunk_overlap > 0 and len(chunks) > 1:
            chunks = self._handle_overlap(chunks)

        return chunks

    def _is_has_chinese(self, text: str) -> bool:
        # check if contains chinese characters
        if any("\u4e00" <= ch <= "\u9fff" for ch in text):
            return True
        else:
            return False

    def _handle_overlap(self, chunks: List[str]) -> List[str]:
        # 处理块间重叠
        overlapped_chunks = []
        for i in range(len(chunks) - 1):
            chunk = chunks[i] + ' ' + chunks[i + 1][:self.chunk_overlap]
            overlapped_chunks.append(chunk.strip())
        overlapped_chunks.append(chunks[-1])
        return overlapped_chunks


text_splitter = SentenceSplitter()

def load_file(filepath):
    print("filepath:",filepath)
    if filepath.endswith(".md"):
        contents = extract_text_from_markdown(filepath)

    elif filepath.endswith(".pdf"):
        contents = extract_text_from_pdf(filepath)
    elif filepath.endswith('.docx'):
        contents = extract_text_from_docx(filepath)
    else:
        contents = extract_text_from_txt(filepath)
    return contents

  
def extract_text_from_pdf(file_path: str):
    """Extract text content from a PDF file."""
    import PyPDF2
    contents = []
    with open(file_path, 'rb') as f:
        pdf_reader = PyPDF2.PdfReader(f)
        for page in pdf_reader.pages:
            page_text = page.extract_text().strip()
            raw_text = [text.strip() for text in page_text.splitlines() if text.strip()]
            new_text = ''
            for text in raw_text:
                new_text += text
                if text[-1] in ['.', '!', '?', '。', '!', '?', '...', ';', ';', ':', ':', '"', ''', ')', '】', '》', '」',
                                '』', '〕', '〉', '》', '〗', '〞', '〟', '>>', '"', "'", ')', ']', '}']:
                    contents.append(new_text)
                    new_text = ''
            if new_text:
                contents.append(new_text)
    return contents

def extract_text_from_txt(file_path: str):
    """Extract text content from a TXT file."""
    with open(file_path, 'r', encoding='utf-8') as f:
        contents = [text.strip() for text in f.readlines() if text.strip()]
    return contents

def extract_text_from_docx(file_path: str):
    """Extract text content from a DOCX file."""
    import docx
    document = docx.Document(file_path)
    contents = [paragraph.text.strip() for paragraph in document.paragraphs if paragraph.text.strip()]
    return contents

def extract_text_from_markdown(file_path: str):
    """Extract text content from a Markdown file."""
    import markdown
    from bs4 import BeautifulSoup
    with open(file_path, 'r', encoding='utf-8') as f:
        markdown_text = f.read()
    html = markdown.markdown(markdown_text)
    soup = BeautifulSoup(html, 'html.parser')
    contents = [text.strip() for text in soup.get_text().splitlines() if text.strip()]
    return contents



texts = load_file(r"C:\Users\lo***山市城市建筑外立面管理条例.docx")
print(texts)
相关推荐
巷北夜未央5 分钟前
Python每日一题(14)
开发语言·python·算法
雾月5534 分钟前
LeetCode 914 卡牌分组
java·开发语言·算法·leetcode·职场和发展
Y.O.U..44 分钟前
今日八股——C++
开发语言·c++·面试
weixin_307779131 小时前
使用C#实现从Hive的CREATE TABLE语句中提取分区字段名和数据类型
开发语言·数据仓库·hive·c#
Xiaok10181 小时前
解决 Hugging Face SentenceTransformer 下载失败的完整指南:ProxyError、SSLError与手动下载方案
开发语言·神经网络·php
绿草在线1 小时前
Mock.js虚拟接口
开发语言·javascript·ecmascript
go_bai1 小时前
Linux环境基础开发工具——(2)vim
linux·开发语言·经验分享·笔记·vim·学习方法
小郝 小郝1 小时前
【C语言】strstr查找字符串函数
c语言·开发语言
yinhezhanshen1 小时前
理解rust里面的copy和clone
开发语言·后端·rust
Jtti2 小时前
PHP在Debian环境上的并发处理能力如何
开发语言·debian·php