PDF匹配文本精准标记红框算法(单个、多个、单行、多行)

复制代码
## pip install pdfminer.six
## pip install PyMuPDF

import fitz
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams, LTTextBox, LTTextLine, LTChar


## pdf匹配文本标红框
def pdfMarkedBox(string, name, address="", mode=0, order=0, skew=0, line=0):
    """Find text in a PDF and save a copy with a rectangle annotation around it.

    The PDF is read as a flat, document-ordered list of text blocks
    (pdfminer layout elements for modes 0 and 1, PyMuPDF text blocks for
    mode 2). Every block whose text, with newlines removed, matches
    ``string`` is recorded; one of those matches is selected with ``order``,
    optionally shifted by ``skew`` blocks, and framed in the saved copy.

    :param string:  text to search for
    :param name:    path the annotated PDF is saved to
    :param address: path of the PDF to read
    :param mode:    0 = block text equals ``string`` (pdfminer),
                    1 = block text contains ``string`` (pdfminer),
                    2 = block text contains ``string`` (PyMuPDF blocks);
                    any other value matches nothing
    :param order:   which match to use (index into the list of matches)
    :param skew:    offset, in blocks, added to the matched block position;
                    useful when the target text itself varies but sits at a
                    fixed distance from a stable marker
    :param line:    0 frames only the selected block; otherwise ``line``
                    consecutive blocks starting at the selected one are
                    framed with a single enclosing rectangle (blocks that
                    fall on a later page are left out)
    :return:        None
    :raises IndexError: if ``order`` or the skewed block position is out of range
    :raises ValueError: if ``line`` is negative
    """
    if mode in (0, 1):
        list_text, list_box, list_number = _read_blocks_pdfminer(address)
        print(list_text)
        print(list_box)
        print(list_number)
        print("进行完全匹配" if mode == 0 else "进行包含匹配")
    elif mode == 2:
        list_text, list_box, list_number = _read_blocks_mupdf(address)
    else:
        # Unknown mode: nothing is read, so the "no match" path below is taken.
        list_text, list_box, list_number = [], [], []

    # Indices (into the block lists) of every block that matches ``string``.
    list_index = []
    if mode in (0, 1, 2):
        for index, value in enumerate(list_text):
            if mode == 2:
                print(index, value)
            flat = value.replace("\n", "")
            matched = (string == flat) if mode == 0 else (string in flat)
            if matched:
                list_index.append(index)
        print("匹配数量为:", len(list_index))

    if not list_index:
        print("没有匹配到对应的文本,请检查一下")
        return

    if line < 0:
        raise ValueError(f"line must be >= 0, got {line}")
    try:
        start = list_index[order] + skew
    except IndexError:
        raise IndexError(
            f"order={order} is out of range for {len(list_index)} match(es)"
        ) from None
    # A negative position would silently wrap around to the end of the list.
    if not 0 <= start < len(list_text):
        raise IndexError(
            f"block position {start} (match + skew) is outside 0..{len(list_text) - 1}"
        )

    page_number = list_number[start]
    if line == 0:
        print("读取位置:", start)
        print("读取文本:", list_text[start])
        print("读取坐标:", list_box[start])
        print("读取页码:", list_number[start])
        box = list_box[start]
    else:
        stop = start + line
        print("读取位置:", start, stop)
        print("读取文本:", list_text[start:stop])
        print("读取坐标:", list_box[start:stop])
        print("读取页码:", list_number[start:stop])
        # The rectangle is drawn on one page only, so ignore blocks that
        # spilled onto another page instead of merging unrelated coordinates.
        same_page = [
            b for b, p in zip(list_box[start:stop], list_number[start:stop])
            if p == page_number
        ]
        box = (
            min(b[0] for b in same_page),
            min(b[1] for b in same_page),
            max(b[2] for b in same_page),
            max(b[3] for b in same_page),
        )
        print(box)

    # pdfminer boxes (modes 0/1) need their y axis flipped; PyMuPDF boxes do not.
    _draw_rect_annot(address, page_number, box, name, flip_y=(mode != 2))


def _read_blocks_pdfminer(address):
    """Return (texts, bboxes, page_numbers) of pdfminer text boxes/lines in ``address``."""
    texts, boxes, pages = [], [], []
    with open(address, 'rb') as fp:
        doc = PDFDocument(PDFParser(fp))
        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page_number, page in enumerate(PDFPage.create_pages(doc)):
            interpreter.process_page(page)
            for element in device.get_result():
                if isinstance(element, (LTTextBox, LTTextLine)):
                    texts.append(element.get_text())
                    boxes.append(element.bbox)
                    pages.append(page_number)
    return texts, boxes, pages


def _read_blocks_mupdf(address):
    """Return (texts, bboxes, page_numbers) of PyMuPDF text blocks in ``address``."""
    texts, boxes, pages = [], [], []
    doc = fitz.open(address)
    try:
        for page_num in range(doc.page_count):
            blocks = doc.load_page(page_num).get_text_blocks()
            print(f"Page {page_num + 1} Blocks:")
            for block in blocks:
                print(block)
                # Fields 0-3 are the block rectangle, field 4 is its text.
                texts.append(block[4])
                boxes.append((block[0], block[1], block[2], block[3]))
                pages.append(page_num)
    finally:
        doc.close()
    return texts, boxes, pages


def _draw_rect_annot(address, page_number, box, name, flip_y, margin=5):
    """Add a rectangle annotation around ``box`` on one page and save to ``name``."""
    doc = fitz.open(address)
    try:
        page = doc.load_page(page_number)
        x0, y0, x1, y1 = box
        if flip_y:
            # Incoming box is (left, bottom, right, top) measured upward, so
            # mirror it against the page rectangle's bottom edge value.
            page_bottom = page.rect.y1
            y0, y1 = page_bottom - y1, page_bottom - y0
        page.add_rect_annot((x0 - margin, y0 - margin, x1 + margin, y1 + margin))
        doc.save(name)
    finally:
        # Close the document even when loading, annotating or saving fails.
        doc.close()
    print("文件完成标红框:", name)
python 复制代码
## pip install pdfminer.six
## pip install PyMuPDF

import fitz
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams, LTTextBox, LTTextLine, LTChar


## pdf匹配文本标红框
def pdfMarkedBox(string, name, address="", mode=0, order=0, skew=0, line=0):
    """Find text in a PDF and save a copy with a rectangle annotation around it.

    The PDF is read as a flat, document-ordered list of text blocks
    (pdfminer layout elements for modes 0 and 1, PyMuPDF text blocks for
    mode 2). Every block whose text, with newlines removed, matches
    ``string`` is recorded; one of those matches is selected with ``order``,
    optionally shifted by ``skew`` blocks, and framed in the saved copy.

    :param string:  text to search for
    :param name:    path the annotated PDF is saved to
    :param address: path of the PDF to read
    :param mode:    0 = block text equals ``string`` (pdfminer),
                    1 = block text contains ``string`` (pdfminer),
                    2 = block text contains ``string`` (PyMuPDF blocks);
                    any other value matches nothing
    :param order:   which match to use (index into the list of matches)
    :param skew:    offset, in blocks, added to the matched block position;
                    useful when the target text itself varies but sits at a
                    fixed distance from a stable marker
    :param line:    0 frames only the selected block; otherwise ``line``
                    consecutive blocks starting at the selected one are
                    framed with a single enclosing rectangle (blocks that
                    fall on a later page are left out)
    :return:        None
    :raises IndexError: if ``order`` or the skewed block position is out of range
    :raises ValueError: if ``line`` is negative
    """
    if mode in (0, 1):
        list_text, list_box, list_number = _read_blocks_pdfminer(address)
        print(list_text)
        print(list_box)
        print(list_number)
        print("进行完全匹配" if mode == 0 else "进行包含匹配")
    elif mode == 2:
        list_text, list_box, list_number = _read_blocks_mupdf(address)
    else:
        # Unknown mode: nothing is read, so the "no match" path below is taken.
        list_text, list_box, list_number = [], [], []

    # Indices (into the block lists) of every block that matches ``string``.
    list_index = []
    if mode in (0, 1, 2):
        for index, value in enumerate(list_text):
            if mode == 2:
                print(index, value)
            flat = value.replace("\n", "")
            matched = (string == flat) if mode == 0 else (string in flat)
            if matched:
                list_index.append(index)
        print("匹配数量为:", len(list_index))

    if not list_index:
        print("没有匹配到对应的文本,请检查一下")
        return

    if line < 0:
        raise ValueError(f"line must be >= 0, got {line}")
    try:
        start = list_index[order] + skew
    except IndexError:
        raise IndexError(
            f"order={order} is out of range for {len(list_index)} match(es)"
        ) from None
    # A negative position would silently wrap around to the end of the list.
    if not 0 <= start < len(list_text):
        raise IndexError(
            f"block position {start} (match + skew) is outside 0..{len(list_text) - 1}"
        )

    page_number = list_number[start]
    if line == 0:
        print("读取位置:", start)
        print("读取文本:", list_text[start])
        print("读取坐标:", list_box[start])
        print("读取页码:", list_number[start])
        box = list_box[start]
    else:
        stop = start + line
        print("读取位置:", start, stop)
        print("读取文本:", list_text[start:stop])
        print("读取坐标:", list_box[start:stop])
        print("读取页码:", list_number[start:stop])
        # The rectangle is drawn on one page only, so ignore blocks that
        # spilled onto another page instead of merging unrelated coordinates.
        same_page = [
            b for b, p in zip(list_box[start:stop], list_number[start:stop])
            if p == page_number
        ]
        box = (
            min(b[0] for b in same_page),
            min(b[1] for b in same_page),
            max(b[2] for b in same_page),
            max(b[3] for b in same_page),
        )
        print(box)

    # pdfminer boxes (modes 0/1) need their y axis flipped; PyMuPDF boxes do not.
    _draw_rect_annot(address, page_number, box, name, flip_y=(mode != 2))


def _read_blocks_pdfminer(address):
    """Return (texts, bboxes, page_numbers) of pdfminer text boxes/lines in ``address``."""
    texts, boxes, pages = [], [], []
    with open(address, 'rb') as fp:
        doc = PDFDocument(PDFParser(fp))
        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page_number, page in enumerate(PDFPage.create_pages(doc)):
            interpreter.process_page(page)
            for element in device.get_result():
                if isinstance(element, (LTTextBox, LTTextLine)):
                    texts.append(element.get_text())
                    boxes.append(element.bbox)
                    pages.append(page_number)
    return texts, boxes, pages


def _read_blocks_mupdf(address):
    """Return (texts, bboxes, page_numbers) of PyMuPDF text blocks in ``address``."""
    texts, boxes, pages = [], [], []
    doc = fitz.open(address)
    try:
        for page_num in range(doc.page_count):
            blocks = doc.load_page(page_num).get_text_blocks()
            print(f"Page {page_num + 1} Blocks:")
            for block in blocks:
                print(block)
                # Fields 0-3 are the block rectangle, field 4 is its text.
                texts.append(block[4])
                boxes.append((block[0], block[1], block[2], block[3]))
                pages.append(page_num)
    finally:
        doc.close()
    return texts, boxes, pages


def _draw_rect_annot(address, page_number, box, name, flip_y, margin=5):
    """Add a rectangle annotation around ``box`` on one page and save to ``name``."""
    doc = fitz.open(address)
    try:
        page = doc.load_page(page_number)
        x0, y0, x1, y1 = box
        if flip_y:
            # Incoming box is (left, bottom, right, top) measured upward, so
            # mirror it against the page rectangle's bottom edge value.
            page_bottom = page.rect.y1
            y0, y1 = page_bottom - y1, page_bottom - y0
        page.add_rect_annot((x0 - margin, y0 - margin, x1 + margin, y1 + margin))
        doc.save(name)
    finally:
        # Close the document even when loading, annotating or saving fails.
        doc.close()
    print("文件完成标红框:", name)
相关推荐
nnsix2 分钟前
MVC、MVP、MVVM 架构 笔记
java·开发语言·前端
财经资讯数据_灵砚智能3 分钟前
基于全球经济类多源新闻的NLP情感分析与数据可视化(日间)2026年5月29日
大数据·人工智能·python·信息可视化·自然语言处理·ai编程·灵砚智能
财经资讯数据_灵砚智能7 分钟前
基于全球经济类多源新闻的NLP情感分析与数据可视化(夜间-次晨)2026年5月28日
大数据·人工智能·python·信息可视化·自然语言处理·ai编程·灵砚智能
m沐沐7 分钟前
【机器学习】聚类算法-K-means聚类
人工智能·python·算法·机器学习·pycharm·kmeans·聚类
Smile_2542204188 分钟前
vue3 + ts reactive方式清空表单对象
开发语言·前端·javascript
jjjava2.012 分钟前
Java 多线程核心基础与线程安全
java·开发语言
若鱼文化创意14 分钟前
品牌设计CI规划使用后交付偏差先分项核对验收标准
python·ci/cd
水木流年追梦14 分钟前
大模型入门-大模型优化方法3
人工智能·分布式·python·深度学习·机器学习
私人珍藏库17 分钟前
【PC】[吾爱大神原创工具] PDFImageViewer V1 永久免费的PDF图像查看和导出工具
windows·pdf·工具·软件·多功能
悟乙己19 分钟前
因果推断方法实践:Python实现合成控制法
开发语言·python