近期因工作需要解析PDF,要把PDF中的文本和表格分离:最终目标是把PDF中的文本内容放在一起、表格内容放在一起,并分别以list的形式存储。解析PDF的技术有很多,经过多次尝试,发现借助Adobe Acrobat可以实现表格和文本的分离(思路:先用Acrobat把PDF转成Word,再用Word另存为HTML,最后用jsoup解析HTML)。
注意事项:
1、下载AdobeAcrobat进行安装
链接:https://pan.baidu.com/s/1Ki2H4gxDaKj5z8Dli3amCg
提取码:ckfx
2、使用时如果报jacob相关的错误,请把jacob-1.18-M1-x64.dll、jacob-1.18-M1-x86.dll这两个文件放到JDK的bin目录下(例如 Java>jdk1.8.0_271>bin)
3、引入依赖:
XML
<dependency>
<groupId>jacob</groupId>
<artifactId>jacob</artifactId>
<version>1.1.8</version>
</dependency>
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.17.2</version>
</dependency>
工具类如下(内附PDF的文件转换方法以及获取纯文本和纯表格的测试方法):
java
package com.ylx.test.util;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.lang.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.stereotype.Component;
import com.jacob.activeX.ActiveXComponent;
import com.jacob.com.ComThread;
import com.jacob.com.Dispatch;
import com.jacob.com.Variant;
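/**
 * Utility for converting PDF/Word documents to HTML through Adobe Acrobat and Microsoft Word
 * COM automation (JACOB), and for extracting the plain text and the table text from the
 * generated HTML with jsoup.
 *
 * Created 2024-04-18 15:53:36.
 */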
@Component
public class PdfToHtmlAdobe {
/**
 * Demo entry point: converts a sample PDF to HTML, then prints the text paragraphs and the
 * tables extracted from the HTML file that the conversion produced.
 *
 * @param args unused
 * @throws Exception if the conversion times out or fails, or if the HTML cannot be read
 */
public static void main(String[] args) throws Exception {
    // Step 1: convert the PDF to HTML.
    String filepath = "E:\\工作\\";
    String htmlpath = "E:\\gongzuo\\zjk\\html_file\\";
    String filename = "cehi";
    Integer fileType = 1; // 1 = PDF, 2 = Word
    String ext = ".pdf";
    filedata(filepath, filename, htmlpath, fileType, ext);

    // filedata writes its result to htmlpath + filename + ".html"; read that same file back
    // instead of a separately hard-coded path that can drift out of sync with the conversion.
    String htmlFilePath = htmlpath + filename + ".html";

    // Step 2: print the plain-text paragraphs.
    List<String> dlList = getDLList(htmlFilePath);
    for (int i = 0; i < dlList.size(); i++) {
        System.out.println("第" + i + "条:" + dlList.get(i));
    }

    // Step 3: print every table.
    List<String> bgList = getAllTable(htmlFilePath);
    for (int i = 0; i < bgList.size(); i++) {
        System.out.println("第" + i + "条:" + bgList.get(i));
    }
}
/**
 * Reads an HTML file and returns its visible text with all table content removed.
 *
 * <p>The file bytes are decoded as GBK.
 *
 * @param htmlFilePath path of the HTML file to read
 * @return the text of the document without any table text
 * @throws IOException if the file cannot be read
 */
public static String getRemoveTableStr(String htmlFilePath) throws IOException {
    String content = new String(Files.readAllBytes(Paths.get(htmlFilePath)), "GBK");
    String withoutTables = removeContentStyle(content);
    Document doc = Jsoup.parse(withoutTables);
    // removeContentStyle is regex based and can leave tables behind (for example nested
    // tables, or markup its pattern does not match), so also drop every remaining <table>
    // element from the parsed DOM before extracting the text.
    doc.select("table").remove();
    return doc.text();
}
/**
 * Collects the text of every {@code <table>} element of an HTML file.
 *
 * <p>Each list entry represents one table: the text of each of its {@code <tr>} rows, every
 * row followed by {@code "\r\n"}. The file is parsed with a {@code null} charset, i.e. jsoup
 * chooses the encoding itself. If reading fails with an {@link IOException}, the stack trace
 * is printed and the (empty) list is returned instead of propagating the error.
 *
 * @param htmlFilePath path of the HTML file to parse
 * @return one string per table, in document order; empty if there are no tables or the file
 *         could not be read
 * @throws IOException declared for callers; I/O failures are currently handled internally
 */
public static List<String> getAllTable(String htmlFilePath) throws IOException {
    List<String> tableTexts = new ArrayList<>();
    File input = new File(htmlFilePath);
    try {
        Document html = Jsoup.parse(input, null);
        Elements allTables = html.select("table");
        for (int t = 0; t < allTables.size(); t++) {
            Elements trs = allTables.get(t).select("tr");
            StringBuilder rowsText = new StringBuilder();
            for (int r = 0; r < trs.size(); r++) {
                // One line per row, terminated with CRLF.
                rowsText.append(trs.get(r).text()).append("\r\n");
            }
            tableTexts.add(rowsText.toString());
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
    return tableTexts;
}
/**
 * Splits the table-free text of an HTML file into paragraphs of ten sentences each, where a
 * sentence ends with the Chinese full stop "。".
 *
 * <p>Every paragraph keeps its sentence terminators. The last paragraph holds whatever text
 * remains and may therefore contain fewer than ten sentences.
 *
 * @param htmlFilePath path of the HTML file to read
 * @return the paragraphs in document order; empty if the document has no text
 * @throws IOException if the file cannot be read
 */
public static List<String> getDLList(String htmlFilePath) throws IOException {
    // Text of the document with all tables stripped out.
    String content = getRemoveTableStr(htmlFilePath);
    return splitBySentenceCount(content, "。", 10);
}

/** Cuts text into consecutive chunks, each ending right after its n-th sentence terminator. */
private static List<String> splitBySentenceCount(String text, String terminator, int sentencesPerChunk) {
    List<String> chunks = new ArrayList<>();
    int start = 0;
    while (start < text.length()) {
        int end = start;
        int found = 0;
        while (found < sentencesPerChunk) {
            // Search from the current position itself so that a terminator in the first
            // characters of a chunk is counted too.
            int hit = text.indexOf(terminator, end);
            if (hit == -1) {
                // Fewer terminators left than requested: the rest is the final chunk.
                end = text.length();
                break;
            }
            // Move past the terminator so it stays inside the chunk.
            end = hit + terminator.length();
            found++;
        }
        chunks.add(text.substring(start, end));
        start = end;
    }
    return chunks;
}
/**
 * Converts a PDF or Word file to HTML on a worker thread, waiting at most ten minutes.
 *
 * <p>For {@code fileType == 1} the file {@code fileDir + filename + ".PDF"} is converted to
 * {@code htmlPath + filename + ".docx"} and then to {@code htmlPath + filename + ".html"}.
 * For {@code fileType == 2} the file {@code fileDir + filename + ext} is converted directly
 * to {@code htmlPath + filename + ".html"}. A conversion step is skipped when its output file
 * already exists.
 *
 * <p>NOTE: the boolean results of {@code PDFtoWord}/{@code wordToHtml} are not checked here.
 *
 * @param fileDir  directory that holds the source file (including the trailing separator)
 * @param filename source file name without extension
 * @param htmlPath output directory (including the trailing separator); created if missing
 * @param fileType 1 for PDF, 2 for Word
 * @param ext      source file extension including the dot; only used when fileType is 2
 * @throws Exception if the conversion task fails, is interrupted, or does not finish in time
 */
public static void filedata(String fileDir, String filename, String htmlPath, Integer fileType, String ext) throws Exception {
    final ExecutorService exec = Executors.newFixedThreadPool(1);
    Callable<String> call = new Callable<String>() {
        @Override
        public String call() throws Exception {
            if (fileType == 1) {
                // Make sure the output directory exists.
                FileUtils.createPath(htmlPath);
                if (!FileUtils.isFileExists(htmlPath + filename + ".docx")) {
                    PDFtoWord(fileDir + filename + ".PDF", htmlPath + filename + ".docx");
                }
                if (!FileUtils.isFileExists(htmlPath + filename + ".html")) {
                    wordToHtml(htmlPath + filename + ".docx", htmlPath + filename + ".html");
                }
            }
            if (fileType == 2) {
                // Make sure the output directory exists.
                FileUtils.createPath(htmlPath);
                if (!FileUtils.isFileExists(htmlPath + filename + ".html")) {
                    wordToHtml(fileDir + filename + ext, htmlPath + filename + ".html");
                }
            }
            return "线程执行完成.";
        }
    };
    Future<String> future = null;
    try {
        future = exec.submit(call);
        // Wait at most 600 seconds (10 minutes) for the conversion.
        String obj = future.get(600, TimeUnit.SECONDS);
        System.out.println("文件转换:" + obj);
    } catch (Exception e) {
        if (future != null) {
            // Do not leave a timed-out task running in the background.
            future.cancel(true);
        }
        if (e instanceof InterruptedException) {
            // Preserve the interrupt status for the caller.
            Thread.currentThread().interrupt();
        }
        try {
            // Kill Acrobat so a hung conversion does not keep the process around.
            Runtime.getRuntime().exec("taskkill /f /im Acrobat.exe");
        } catch (IOException killFailure) {
            // Keep the original failure as the primary error.
            e.addSuppressed(killFailure);
        }
        throw e;
    } finally {
        // shutdownNow also interrupts a still-running worker so its thread can exit.
        exec.shutdownNow();
    }
}
/**
 * Saves a Word document as HTML by automating Microsoft Word through COM (JACOB).
 *
 * <p>The document is always closed without saving and Word is always quit, even when opening
 * or saving fails, so no hidden Word process is left behind.
 *
 * @param inPath path of the Word document to open
 * @param toPath path of the HTML file to write
 * @return {@code true} if the document was saved as HTML, {@code false} if an error occurred
 *         (the stack trace is printed)
 */
public static boolean wordToHtml(String inPath, String toPath) {
    ComThread.InitSTA(); // initialise COM for this thread
    ActiveXComponent axc;
    try {
        // Start Word.
        axc = new ActiveXComponent("Word.Application");
    } catch (RuntimeException e) {
        // Balance InitSTA before propagating the start-up failure.
        ComThread.Release();
        throw e;
    }
    Dispatch doc = null;
    try {
        // Keep Word invisible.
        axc.setProperty("Visible", new Variant(false));
        Dispatch docs = axc.getProperty("Documents").toDispatch();
        // Open the document; the two flags are presumably ConfirmConversions=false and
        // ReadOnly=true of Documents.Open - confirm against the Word object model.
        doc = Dispatch.invoke(
                docs,
                "Open",
                Dispatch.Method,
                new Object[] { inPath, new Variant(false), new Variant(true) },
                new int[1]).toDispatch();
        // Save in HTML format (format code 8).
        Dispatch.invoke(doc, "SaveAs", Dispatch.Method,
                new Object[] { toPath, new Variant(8) }, new int[1]);
        return true;
    } catch (Exception e) {
        e.printStackTrace();
        return false;
    } finally {
        try {
            if (doc != null) {
                // Close without saving changes to the source document.
                Dispatch.call(doc, "Close", new Variant(false));
            }
        } catch (Exception closeFailure) {
            closeFailure.printStackTrace();
        }
        try {
            axc.invoke("Quit", new Variant[] {});
        } catch (Exception quitFailure) {
            quitFailure.printStackTrace();
        }
        ComThread.Release(); // release COM for this thread
    }
}
/**
 * Converts a PDF to a .docx file by automating Adobe Acrobat through COM (JACOB).
 *
 * <p>The PDF is opened with an {@code AcroExch.PDDoc} object and saved through its JavaScript
 * object using the {@code com.adobe.acrobat.docx} conversion id. An opened document is closed
 * exactly once, whether or not the conversion succeeded.
 *
 * @param source path of the PDF file to convert
 * @param target path of the .docx file to write
 * @return {@code true} if the conversion calls completed, {@code false} if an error occurred
 *         (the stack trace is printed)
 */
public static boolean PDFtoWord(String source, String target) {
    ComThread.InitSTA(); // initialise COM for this thread
    ActiveXComponent app = null;
    boolean opened = false;
    try {
        File inPath = new File(source);
        File outPath = new File(target);
        // PDDoc object: the handle used for all PDF operations.
        app = new ActiveXComponent("AcroExch.PDDoc");
        Dispatch pdfObject = app.getObject();
        // Open the PDF file.
        Dispatch.call(pdfObject, "Open", new Variant(inPath.getAbsolutePath()));
        opened = true;
        Variant jsObj = Dispatch.call(pdfObject, "GetJSObject");
        Dispatch.call(jsObj.getDispatch(), "SaveAs", outPath.getPath(), "com.adobe.acrobat.docx");
        return true;
    } catch (Exception e) {
        e.printStackTrace();
        return false;
    } finally {
        if (opened) {
            try {
                // Close the PDF.
                app.invoke("Close", new Variant[] {});
            } catch (Exception closeFailure) {
                closeFailure.printStackTrace();
            }
        }
        ComThread.Release(); // release COM for this thread
    }
}
/**
 * Replacement table used by {@link #replaceChineseDigits(String)}. Order matters: the
 * two-character month names must be replaced before the single character "十".
 * The last entry is the Latin capital letter O, which is sometimes typed instead of a zero.
 */
private static final String[][] CHINESE_DIGIT_REPLACEMENTS = {
    {"十二", "12"}, {"十一", "11"}, {"十", "10"},
    {"一", "1"}, {"二", "2"}, {"三", "3"}, {"四", "4"}, {"五", "5"},
    {"六", "6"}, {"七", "7"}, {"八", "8"}, {"九", "9"},
    {"零", "0"}, {"〇", "0"}, {"○", "0"}, {"O", "0"},
};

/**
 * Converts a date written with Chinese numerals into Arabic digits, e.g.
 * "二〇二四年四月十八日" becomes "2024年4月18日".
 *
 * <p>Only the two characters directly before "年" are used for the year and the result is
 * always prefixed with "20". Everything from "日" onwards is copied unchanged. A day that is
 * already written in ASCII digits is kept as it is.
 *
 * @param chineseDate text containing "年", "月" and "日" in that order, with at least two
 *                    characters before "年"
 * @return the date with year, month and day in Arabic digits
 * @throws StringIndexOutOfBoundsException if one of the markers is missing or misplaced
 */
public static String chineseDate(String chineseDate) {
    int yearPos = chineseDate.indexOf("年");
    int monthPos = chineseDate.indexOf("月");
    int dayPos = chineseDate.indexOf("日");
    String yearDigits = chineseDate.substring(yearPos - 2, yearPos); // last two year digits
    String month = chineseDate.substring(yearPos, monthPos + 1);     // "年" ... "月" inclusive
    String day = chineseDate.substring(monthPos + 1, dayPos);
    String rest = chineseDate.substring(dayPos);                     // "日" and what follows
    String dayNumber = isAsciiDigits(day) ? day : chineseNumber(day);
    return "20" + replaceChineseDigits(yearDigits) + replaceChineseDigits(month) + dayNumber + rest;
}

/** Replaces Chinese numerals (and zero look-alikes) in the text with Arabic digits. */
private static String replaceChineseDigits(String text) {
    String converted = text;
    for (String[] pair : CHINESE_DIGIT_REPLACEMENTS) {
        converted = converted.replace(pair[0], pair[1]);
    }
    return converted;
}

/** Returns true if the text is non-empty and consists only of the characters 0-9. */
private static boolean isAsciiDigits(String text) {
    if (text.isEmpty()) {
        return false;
    }
    for (int i = 0; i < text.length(); i++) {
        char c = text.charAt(i);
        if (c < '0' || c > '9') {
            return false;
        }
    }
    return true;
}

/**
 * Converts a Chinese number to Arabic digits, e.g. "十万九千零六十" becomes "109060".
 *
 * <p>Characters that are neither a digit (一..九) nor a unit (十, 百, 千, 万, 亿) are skipped.
 * An empty string yields "0".
 *
 * @param chineseNumber the number written with Chinese numerals
 * @return the decimal representation of the number
 */
private static String chineseNumber(String chineseNumber) {
    final String digits = "一二三四五六七八九";
    final String units = "十百千万亿";
    final int[] multipliers = {10, 100, 1000, 10000, 100000000};
    int result = 0;
    int temp = 1;             // value of the group currently being built, e.g. 十万
    boolean unitSeen = false; // whether the current group already has a unit applied
    for (int i = 0; i < chineseNumber.length(); i++) {
        char c = chineseNumber.charAt(i);
        int digit = digits.indexOf(c);
        if (digit >= 0) {
            if (unitSeen) {
                // A new digit starts the next group: bank the finished one first.
                result += temp;
                unitSeen = false;
            }
            temp = digit + 1; // position in the digit string + 1 is the digit's value
        } else {
            int unit = units.indexOf(c);
            if (unit >= 0) {
                temp *= multipliers[unit];
                unitSeen = true;
            }
        }
    }
    if (chineseNumber.length() > 0) {
        // Add the group that was still open when the text ended.
        result += temp;
    }
    return String.valueOf(result);
}
/**
 * Reads a local HTML file, decoded as GBK, into a single string.
 *
 * <p>The lines are concatenated without line separators. If the file does not exist or
 * cannot be read, the stack trace is printed and whatever was read so far (possibly the
 * empty string) is returned.
 *
 * @param file the HTML file to read
 * @return the file content with all line breaks removed
 */
public static String toHtmlString(File file) {
    StringBuilder html = new StringBuilder();
    // try-with-resources closes the stream and reader even when reading fails.
    try (FileInputStream in = new FileInputStream(file);
            BufferedReader reader = new BufferedReader(new InputStreamReader(in, "GBK"))) {
        String line;
        // readLine() returns null at end of file; unlike ready(), it never ends the loop
        // early and never appends the text "null".
        while ((line = reader.readLine()) != null) {
            html.append(line);
        }
    } catch (IOException e) {
        // Also covers FileNotFoundException.
        e.printStackTrace();
    }
    return html.toString();
}
/**
 * Returns the second capture group of the first match of a regular expression, with its
 * last character removed.
 *
 * @param line    the text to search
 * @param pattern a regular expression; it must define at least two capture groups
 * @return group 2 of the first match without its final character; the empty string if there
 *         is no match or the group is empty; {@code null} if the group did not participate
 *         in the match
 */
public static String getSpecialStr(String line, String pattern) {
    Matcher matcher = Pattern.compile(pattern).matcher(line);
    String captured = matcher.find() ? matcher.group(2) : "";
    if (!StringUtils.isEmpty(captured)) {
        // Drop the trailing character of the captured text.
        return captured.substring(0, captured.length() - 1);
    }
    return captured;
}
/**
 * Matches one {@code <table ...> ... </table>} block. DOTALL lets "." cross line breaks so
 * that tables spanning several lines are matched; the tag name is matched case-insensitively.
 */
private static final Pattern TABLE_BLOCK =
        Pattern.compile("<table(.*?)</table>", Pattern.DOTALL | Pattern.CASE_INSENSITIVE);

/**
 * Removes every table from an HTML string.
 *
 * <p>The match is non-greedy, so it ends at the first closing tag: for nested tables the
 * remainder of the outer table stays in the result.
 *
 * @param content the HTML text
 * @return the text with each {@code <table>...</table>} block removed
 */
public static String removeContentStyle(String content) {
    return TABLE_BLOCK.matcher(content).replaceAll("");
}
}