提供 PDF 文件(File)作为入参,对其中需要分页的卷内文件,利用关键字识别分页位置,并转成 XML。
使用依赖:itextpdf、pdfbox
1、导入依赖
xml
<!-- Apache PDFBox: used by the Java code for page text extraction with text positions (PDDocument, PDFTextStripper, TextPosition) -->
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>2.0.21</version>
</dependency>
<!-- https://mvnrepository.com/artifact/com.itextpdf/itextpdf -->
<!-- iText 5: used by the Java code to open the PDF and read its bookmarks (PdfReader, SimpleBookmark) -->
<dependency>
<groupId>com.itextpdf</groupId>
<artifactId>itextpdf</artifactId>
<version>5.5.13.3</version>
</dependency>
2、封装代码:PdfPaginationUtil
java
package com.gwssi.common.servicesupport.test.generator.service.impl;
import com.gwssi.util.Md5TokenGenerator;
import com.gwssi.util.PDFencodeUtil;
import com.itextpdf.text.pdf.PdfReader;
import com.itextpdf.text.pdf.SimpleBookmark;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
/**
* @Auther: fyp
* @Date: 2024/6/17
* @Description:
* @Package: com.gwssi.common.servicesupport.test.generator.service.impl
* @Version: 1.0
*/
public class PdfPaginationUtil {
private static final String[] keyWords = new String[]{"1", "2", "3"};
private static final String[] keyTitleWords = new String[]{"4", "5", "6"};
public List<Map<String,Object>> transToXml(File file) {
//1.给定文件
PdfReader reader = null;
try {
InputStream is = new FileInputStream(file);
reader = new PdfReader(is);
} catch (IOException e) {
throw new RuntimeException(e);
}
//2.定义一个byte数组,长度为文件的长度
byte[] pdfData = new byte[(int) file.length()];
//3.IO流读取文件内容到byte数组
FileInputStream inputStream = null;
try {
inputStream = new FileInputStream(file);
inputStream.read(pdfData);
} catch (IOException e) {
throw new RuntimeException(e);
} finally {
if (inputStream != null) {
try {
inputStream.close();
} catch (IOException e) {
throw new RuntimeException(e);
}
}
}
List<Map<String,Object>> elecFileItemList =new LinkedList<>();
Map<String,Object> elecFileItemMap= new HashMap<>();
List<HashMap<String, Object>> list = SimpleBookmark.getBookmark ( reader ) ;
List<Map<String,Object>>innerfileItemList =new LinkedList<>();
String elecFileName = file.getName();
for (int i = 0; i < list.size(); i++) {
Map<String, Object> innerfileItemMap = new HashMap<>();
HashMap<String, Object> currentBookmark = list.get(i);
String currentBookmarkName = PDFencodeUtil.showBookmark(currentBookmark);
int currentBookmarkPage = PDFencodeUtil.getPageNumbers(currentBookmark);
int beginPageNum = currentBookmarkPage;
int endPageNum;
if (i < list.size() - 1) {
HashMap<String, Object> nextBookmark = list.get(i + 1);
int nextBookmarkPage = PDFencodeUtil.getPageNumbers(nextBookmark);
endPageNum = nextBookmarkPage - 1;
} else {
try {
endPageNum = PDFencodeUtil.getNumberOfPages(file); // You need to determine the total number of pages in your PDF.
} catch (IOException e) {
throw new RuntimeException(e);
}
}
// 申请书 或 申请表 有卷内文件
if (currentBookmarkName.contains("某一页需要分页唯一关键字")) {
try {
PDDocument document = PDDocument.load(file);
int numPages = document.getNumberOfPages();
Map<String, Object> titleAndPageInfo = new HashMap<>();
for (int page = beginPageNum; page <= endPageNum; page++) {
Map<String, Object> szqmInnerfileItemMap = new HashMap<>();
Map<String, Object> keywordPositionMap = new HashMap<>();
Map<String, Object> nextKeywordPositionMap = new HashMap<>();
PDFTextStripper stripper = new PDFTextStripper() {
@Override
protected void writeString(String text, List<TextPosition> textPositions) throws IOException {
for (TextPosition textPosition : textPositions) {
// 检查当前文本是否包含关键字
for (String keyword : keyWords) {
if (text.contains(keyword)) {
// 获取关键字的x和y坐标
Float x = textPosition.getXDirAdj();
Float y = textPosition.getYDirAdj();
// 将关键字的x和y坐标存储到Map中
Map<String, Float> coordinates = new HashMap<>();
coordinates.put("x", x);
coordinates.put("y", y);
keywordPositionMap.put(keyword, coordinates);
}
}
}
super.writeString(text, textPositions);
}
};
PDFTextStripper nextStripper = new PDFTextStripper() {
@Override
protected void writeString(String text, List<TextPosition> textPositions) throws IOException {
for (TextPosition textPosition : textPositions) {
// 检查当前文本是否包含关键字
for (String keyword : keyWords) {
if (text.contains(keyword)) {
// 获取关键字的x和y坐标
Float x = textPosition.getXDirAdj();
Float y = textPosition.getYDirAdj();
// 将关键字的x和y坐标存储到Map中
Map<String, Float> coordinates = new HashMap<>();
coordinates.put("x", x);
coordinates.put("y", y);
nextKeywordPositionMap.put(keyword, coordinates);
}
}
}
super.writeString(text, textPositions);
}
};
// 当前页
stripper.setStartPage(page);
stripper.setEndPage(page);
String text = stripper.getText(document);
// 下一页
nextStripper.setStartPage(page + 1);
nextStripper.setEndPage(page + 1);
String nextText = nextStripper.getText(document);
int beginPage = page;
boolean containTitle = false;
boolean nextContainTitle = false;
for (int idx = 0; idx < keyWords.length; idx++) {
// 当前页 有关键字 (标题)
if (text.contains(keyWords[idx])) {
// 记录关键字位置
HashMap map = (HashMap) keywordPositionMap.get(keyWords[idx]);
Float x = (Float) map.get("x");
Float y = (Float) map.get("y");
if (y < 200) {
System.out.println("Keyword " + keyWords[idx] + " found on page: " + (idx + 1));
titleAndPageInfo.put("title", keyTitleWords[idx]);
titleAndPageInfo.put("beginPage", beginPage);
// 初始化 endPage
titleAndPageInfo.put("endPage", beginPage);
containTitle = true;
}
}
// 下一页 有关键字 (标题)
// 到达 申请书 下一页 关键字 找不到,根据是否到 申请书最后一页判断
if (nextText.contains(keyWords[idx])) {
HashMap map = (HashMap) nextKeywordPositionMap.get(keyWords[idx]);
Float x = (Float) map.get("x");
Float y = (Float) map.get("y");
if (y < 200) {
nextContainTitle = true;
}
}
}
// 下一页 存在 关键字,endPage 已获取到
if (nextContainTitle) {
szqmInnerfileItemMap.put("innerfileTitle", titleAndPageInfo.get("title"));
szqmInnerfileItemMap.put("elecFileName", elecFileName);
szqmInnerfileItemMap.put("beginPageNum", titleAndPageInfo.get("beginPage"));
szqmInnerfileItemMap.put("endPageNum", titleAndPageInfo.get("endPage"));
innerfileItemList.add(szqmInnerfileItemMap);
// 下一页没有 关键字,且 是 申请书 最后一页
} else if (!nextContainTitle && page == endPageNum) {
titleAndPageInfo.put("endPage", endPageNum);
szqmInnerfileItemMap.put("innerfileTitle", titleAndPageInfo.get("title"));
szqmInnerfileItemMap.put("elecFileName", elecFileName);
szqmInnerfileItemMap.put("beginPageNum", titleAndPageInfo.get("beginPage"));
szqmInnerfileItemMap.put("endPageNum", titleAndPageInfo.get("endPage"));
innerfileItemList.add(szqmInnerfileItemMap);
} else {
titleAndPageInfo.put("endPage", beginPage + 1);
}
}
} catch (Exception e) {
e.printStackTrace();
}
} else {
innerfileItemMap.put("innerfileTitle", currentBookmarkName);
innerfileItemMap.put("elecFileName", elecFileName);
innerfileItemMap.put("beginPageNum", beginPageNum);
innerfileItemMap.put("endPageNum", endPageNum);
innerfileItemList.add(innerfileItemMap);
}
}
elecFileItemMap.put("elecFileID", Md5TokenGenerator.generate(file.getPath()));
elecFileItemMap.put("elecFileName",elecFileName);
elecFileItemMap.put("elecFileSavePath", "文件下载地址");
elecFileItemMap.put("elecFileType","szqm");
elecFileItemMap.put("innerfileItemList",innerfileItemList);
elecFileItemList.add(elecFileItemMap);
return elecFileItemList;
}
}