企业级-封装Java对卷内PDF利用关键字分页导出标题

提供 PDF 文件(File 入参),对其中需要分页的卷内文件,利用关键字识别分页位置,并转成 XML

使用依赖:itextpdf、pdfbox

1、导入依赖

xml 复制代码
<dependency>
    <groupId>org.apache.pdfbox</groupId>
    <artifactId>pdfbox</artifactId>
    <version>2.0.21</version>
</dependency>

<!-- https://mvnrepository.com/artifact/com.itextpdf/itextpdf -->
<dependency>
    <groupId>com.itextpdf</groupId>
    <artifactId>itextpdf</artifactId>
    <version>5.5.13.3</version>
</dependency>

2、封装代码:PdfPaginationUtil

java 复制代码
package com.gwssi.common.servicesupport.test.generator.service.impl;
import com.gwssi.util.Md5TokenGenerator;
import com.gwssi.util.PDFencodeUtil;
import com.itextpdf.text.pdf.PdfReader;
import com.itextpdf.text.pdf.SimpleBookmark;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;

/**
 * @Auther: fyp
 * @Date: 2024/6/17
 * @Description:
 * @Package: com.gwssi.common.servicesupport.test.generator.service.impl
 * @Version: 1.0
 */
public class PdfPaginationUtil {

    private static final String[] keyWords = new String[]{"1", "2", "3"};
    private static final String[] keyTitleWords = new String[]{"4", "5", "6"};


    public List<Map<String,Object>> transToXml(File file) {
        //1.给定文件
        PdfReader reader = null;
        try {
            InputStream is = new FileInputStream(file);
            reader = new PdfReader(is);
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
        //2.定义一个byte数组,长度为文件的长度
        byte[] pdfData = new byte[(int) file.length()];

        //3.IO流读取文件内容到byte数组
        FileInputStream inputStream = null;
        try {
            inputStream = new FileInputStream(file);
            inputStream.read(pdfData);
        } catch (IOException e) {
            throw new RuntimeException(e);
        } finally {
            if (inputStream != null) {
                try {
                    inputStream.close();
                } catch (IOException e) {
                    throw new RuntimeException(e);
                }
            }
        }

        List<Map<String,Object>> elecFileItemList =new LinkedList<>();
        Map<String,Object> elecFileItemMap= new HashMap<>();
        List<HashMap<String, Object>> list = SimpleBookmark.getBookmark ( reader ) ;
        List<Map<String,Object>>innerfileItemList =new LinkedList<>();
        String elecFileName = file.getName();

        for (int i = 0; i < list.size(); i++) {
            Map<String, Object> innerfileItemMap = new HashMap<>();
            HashMap<String, Object> currentBookmark = list.get(i);
            String currentBookmarkName = PDFencodeUtil.showBookmark(currentBookmark);
            int currentBookmarkPage = PDFencodeUtil.getPageNumbers(currentBookmark);

            int beginPageNum = currentBookmarkPage;
            int endPageNum;

            if (i < list.size() - 1) {
                HashMap<String, Object> nextBookmark = list.get(i + 1);
                int nextBookmarkPage = PDFencodeUtil.getPageNumbers(nextBookmark);
                endPageNum = nextBookmarkPage - 1;
            } else {
                try {
                    endPageNum = PDFencodeUtil.getNumberOfPages(file); // You need to determine the total number of pages in your PDF.
                } catch (IOException e) {
                    throw new RuntimeException(e);
                }
            }

            // 申请书 或 申请表 有卷内文件
            if (currentBookmarkName.contains("某一页需要分页唯一关键字")) {
                try {
                    PDDocument document = PDDocument.load(file);
                    int numPages = document.getNumberOfPages();
                    Map<String, Object> titleAndPageInfo = new HashMap<>();
                    for (int page = beginPageNum; page <= endPageNum; page++) {
                        Map<String, Object> szqmInnerfileItemMap = new HashMap<>();
                        Map<String, Object> keywordPositionMap = new HashMap<>();
                        Map<String, Object> nextKeywordPositionMap = new HashMap<>();
                        PDFTextStripper stripper = new PDFTextStripper() {
                            @Override
                            protected void writeString(String text, List<TextPosition> textPositions) throws IOException {
                                for (TextPosition textPosition : textPositions) {
                                    // 检查当前文本是否包含关键字
                                    for (String keyword : keyWords) {
                                        if (text.contains(keyword)) {
                                            // 获取关键字的x和y坐标
                                            Float x = textPosition.getXDirAdj();
                                            Float y = textPosition.getYDirAdj();
                                            // 将关键字的x和y坐标存储到Map中
                                            Map<String, Float> coordinates = new HashMap<>();
                                            coordinates.put("x", x);
                                            coordinates.put("y", y);
                                            keywordPositionMap.put(keyword, coordinates);
                                        }
                                    }
                                }
                                super.writeString(text, textPositions);
                            }
                        };
                        PDFTextStripper nextStripper = new PDFTextStripper() {
                            @Override
                            protected void writeString(String text, List<TextPosition> textPositions) throws IOException {
                                for (TextPosition textPosition : textPositions) {
                                    // 检查当前文本是否包含关键字
                                    for (String keyword : keyWords) {
                                        if (text.contains(keyword)) {
                                            // 获取关键字的x和y坐标
                                            Float x = textPosition.getXDirAdj();
                                            Float y = textPosition.getYDirAdj();
                                            // 将关键字的x和y坐标存储到Map中
                                            Map<String, Float> coordinates = new HashMap<>();
                                            coordinates.put("x", x);
                                            coordinates.put("y", y);
                                            nextKeywordPositionMap.put(keyword, coordinates);
                                        }
                                    }
                                }
                                super.writeString(text, textPositions);
                            }
                        };
                        // 当前页
                        stripper.setStartPage(page);
                        stripper.setEndPage(page);
                        String text = stripper.getText(document);
                        // 下一页
                        nextStripper.setStartPage(page + 1);
                        nextStripper.setEndPage(page + 1);
                        String nextText = nextStripper.getText(document);
                        int beginPage = page;
                        boolean containTitle = false;
                        boolean nextContainTitle = false;
                        for (int idx = 0;  idx < keyWords.length; idx++) {
                            // 当前页 有关键字 (标题)
                            if (text.contains(keyWords[idx])) {
                                // 记录关键字位置
                                HashMap map = (HashMap) keywordPositionMap.get(keyWords[idx]);
                                Float x = (Float)  map.get("x");
                                Float y = (Float)  map.get("y");
                                if (y < 200) {
                                    System.out.println("Keyword " + keyWords[idx] + " found on page: " + (idx + 1));
                                    titleAndPageInfo.put("title", keyTitleWords[idx]);
                                    titleAndPageInfo.put("beginPage", beginPage);
                                    // 初始化 endPage
                                    titleAndPageInfo.put("endPage", beginPage);
                                    containTitle = true;
                                }

                            }
                            // 下一页 有关键字 (标题)
                            // 到达 申请书 下一页 关键字 找不到,根据是否到 申请书最后一页判断
                            if (nextText.contains(keyWords[idx])) {
                                HashMap map = (HashMap) nextKeywordPositionMap.get(keyWords[idx]);
                                Float x = (Float) map.get("x");
                                Float y = (Float) map.get("y");
                                if (y < 200) {
                                    nextContainTitle = true;
                                }

                            }
                        }
                        // 下一页 存在 关键字,endPage 已获取到
                        if (nextContainTitle) {
                            szqmInnerfileItemMap.put("innerfileTitle", titleAndPageInfo.get("title"));
                            szqmInnerfileItemMap.put("elecFileName", elecFileName);
                            szqmInnerfileItemMap.put("beginPageNum", titleAndPageInfo.get("beginPage"));
                            szqmInnerfileItemMap.put("endPageNum", titleAndPageInfo.get("endPage"));
                            innerfileItemList.add(szqmInnerfileItemMap);
                            // 下一页没有 关键字,且 是 申请书 最后一页
                        } else if (!nextContainTitle && page == endPageNum) {
                            titleAndPageInfo.put("endPage", endPageNum);
                            szqmInnerfileItemMap.put("innerfileTitle", titleAndPageInfo.get("title"));
                            szqmInnerfileItemMap.put("elecFileName", elecFileName);
                            szqmInnerfileItemMap.put("beginPageNum", titleAndPageInfo.get("beginPage"));
                            szqmInnerfileItemMap.put("endPageNum", titleAndPageInfo.get("endPage"));
                            innerfileItemList.add(szqmInnerfileItemMap);
                        } else {
                            titleAndPageInfo.put("endPage", beginPage + 1);

                        }
                    }
                } catch (Exception e) {
                    e.printStackTrace();
                }

            } else {
                innerfileItemMap.put("innerfileTitle", currentBookmarkName);
                innerfileItemMap.put("elecFileName", elecFileName);
                innerfileItemMap.put("beginPageNum", beginPageNum);
                innerfileItemMap.put("endPageNum", endPageNum);
                innerfileItemList.add(innerfileItemMap);
            }



        }

        elecFileItemMap.put("elecFileID", Md5TokenGenerator.generate(file.getPath()));
        elecFileItemMap.put("elecFileName",elecFileName);
        elecFileItemMap.put("elecFileSavePath", "文件下载地址");
        elecFileItemMap.put("elecFileType","szqm");
        elecFileItemMap.put("innerfileItemList",innerfileItemList);

        elecFileItemList.add(elecFileItemMap);

        return elecFileItemList;

    }

}
相关推荐
喵手16 分钟前
Python爬虫实战:旅游数据采集实战 - 携程&去哪儿酒店机票价格监控完整方案(附CSV导出 + SQLite持久化存储)!
爬虫·python·爬虫实战·零基础python爬虫教学·采集结果csv导出·旅游数据采集·携程/去哪儿酒店机票价格监控
Coder_Boy_19 分钟前
技术让开发更轻松的底层矛盾
java·大数据·数据库·人工智能·深度学习
2501_9449347320 分钟前
高职大数据技术专业,CDA和Python认证优先考哪个?
大数据·开发语言·python
helloworldandy27 分钟前
使用Pandas进行数据分析:从数据清洗到可视化
jvm·数据库·python
invicinble37 分钟前
对tomcat的提供的功能与底层拓扑结构与实现机制的理解
java·tomcat
较真的菜鸟1 小时前
使用ASM和agent监控属性变化
java
黎雁·泠崖1 小时前
【魔法森林冒险】5/14 Allen类(三):任务进度与状态管理
java·开发语言
肖永威2 小时前
macOS环境安装/卸载python实践笔记
笔记·python·macos
TechWJ2 小时前
PyPTO编程范式深度解读:让NPU开发像写Python一样简单
开发语言·python·cann·pypto
qq_12498707532 小时前
基于SSM的动物保护系统的设计与实现(源码+论文+部署+安装)
java·数据库·spring boot·毕业设计·ssm·计算机毕业设计