PDF 转 HTML:在线预览和查询

方案一:

pdf2htmlEX

package com.realize.controller;

import cn.hutool.http.HttpUtil;
import com.alibaba.fastjson2.JSONObject;
import com.realize.util.MsgUtil;
import com.realize.util.OssUtil;
import com.realize.util.PdfConvertUtil;
import com.realize.util.StreamGobbler;
import lombok.extern.slf4j.Slf4j;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.ModelAttribute;
import org.springframework.web.bind.annotation.PostMapping;
import org.springframework.web.bind.annotation.RestController;

import java.io.*;
import java.nio.charset.Charset;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.List;

@RestController
@Slf4j
public class ParserController {

    @GetMapping("/test")
    public String test() {
        return "test";
    }

//    @PostMapping("/parseHtml")
//    public JSONObject parseHtml(@ModelAttribute("htmlUrl") String htmlUrl) {
//        try (Playwright playwright = Playwright.create()) {
//            Browser browser = playwright.chromium().launch(new BrowserType.LaunchOptions().setHeadless(true));
//            Page page = browser.newPage();
//            String filePath = "/mnt/temp/html/" + RandomUtil.randomString(10) + ".html";
            String filePath = "/Users/sunyechen/IdeaProjects/realize-nacos/bin/" + RandomUtil.randomString(10) + ".html";
//            HttpUtil.downloadFile(htmlUrl, filePath);
//            page.navigate("file:" + filePath);
//            page.evaluate("var imgList=document.getElementsByTagName('img');" +
//                    "for(var i=0;i<imgList.length;i++){" +
//                    "var src=imgList[i].getAttribute('src');" +
//                    "imgList[i].setAttribute('src','https://realizedongmi.oss-cn-shanghai.aliyuncs.com/a-filings/test/'+src);" +
//                    "}");
//            JSONObject result = new JSONObject();
//            result.put("html", page.innerHTML("css=body"));
//            result.put("css", page.innerHTML("css=style"));
//            result.put("txt", page.innerText("css=body").trim().replaceAll("\n", ""));
//            page.close();
//            browser.close();
//            return result;
//        } catch (Exception e) {
//            e.printStackTrace();
//        }
//        return null;
//    }

    @GetMapping("/batchConvertPdf")
    public String batchConvertPdf() {
        String folderName = "/root/pdf";
        File folder = new File(folderName);
        File[] files = folder.listFiles();
        for (int i = 0; i < files.length; i++) {
            String fileName = files[i].getName();
            if (fileName.toLowerCase().endsWith(".pdf")) {
                String exec = " docker run -i --rm -v /root/pdf:/pdf -w /pdf docker.io/pdf2htmlex/pdf2htmlex:0.18.8.rc2-master-20200820-ubuntu-20.04-x86_64 --font-size-multiplier 1 --zoom 1.3 " + fileName;
                try {
//                    Process process = new ProcessBuilder("/bin/sh", "-c", exec).start();
//                    String result = IOUtils.toString(process.getInputStream(), "utf-8");
//                    log.info("Executing Command [result]:{}", result);
//                    Runtime rt = Runtime.getRuntime();
                    log.info("exec:{}", exec);
//                    String[] execArray = new String[]{"/bin/sh", "-c", " docker run -i --rm -v /root/pdf:/pdf -w /pdf docker.io/pdf2htmlex/pdf2htmlex:0.18.8.rc2-master-20200820-ubuntu-20.04-x86_64 --font-size-multiplier 1 --zoom 1.3 ", fileName};
                    Process process = Runtime.getRuntime().exec(exec);
                    StreamGobbler errorGobbler = new StreamGobbler(process.getErrorStream(), "ERROR");
                    // 开启屏幕标准错误流
                    errorGobbler.start();
                    StreamGobbler outGobbler = new StreamGobbler(process.getInputStream(), "OUTPUT");
                    // 开启屏幕标准输出流
                    outGobbler.start();
                    int w = process.waitFor();
                    int v = process.exitValue();
                    if (w == 0 && v == 0) {
                        log.info("转换成功:{}", fileName);
                    } else {
                        log.info("转换失败:{}", fileName);
                    }
                } catch (Exception e) {
                    log.error("{}", e);
                    return null;
                }
            }
        }
        return "ok";
    }

    @PostMapping("/convertAndParsePdf")
    public JSONObject convertAndParsePdf(@ModelAttribute("pdfUrl") String pdfUrl, @ModelAttribute("ossKey") String ossKey) throws Exception {
        log.info("接收到转换请求{},{}", pdfUrl, ossKey);
        String fileFolder = "/mnt_real/pdf/";
        JSONObject result = new JSONObject();

        String tempFileName = ossKey.substring(ossKey.lastIndexOf("/") + 1, ossKey.lastIndexOf("."));
        String pdfFilePath = fileFolder + tempFileName + ".pdf";
        String htmlFilePath = fileFolder + tempFileName + ".html";

        HttpUtil.downloadFile(pdfUrl, pdfFilePath);
        log.info("pdf文件下载成功{},{}", pdfUrl, ossKey);
        //解析pdf正文
//        String pdfText = PdfBoxUtil.getPdfText(pdfFilePath);
//        result.put("pdfText", pdfText);
//        log.info("pdfbox正文解析成功{},{}", pdfUrl, ossKey);
        //解析
        Boolean convertResult = PdfConvertUtil.convertPdf(tempFileName + ".pdf");
        if (convertResult) {
            try {
                List<String> allLines = Files.readAllLines(Paths.get(htmlFilePath), Charset.forName("UTF-8"));
                String content = String.join("\n", allLines);
//                File file = new File(htmlFilePath);
//                BufferedReader reader = new BufferedReader(new FileReader(file));
//                String line = "", oldContent = "";
//                while ((line = reader.readLine()) != null) {
//                    oldContent += line + "\n";
//                }
//                reader.close();
                content = content.replaceAll("github", "zzz").replaceAll("pdf2htmlEX", "tg").replaceAll("<meta charset=\"utf-8\"/>", "<meta charset=\"utf-8\"/><script src=\"https://oss.imvib.com/a-filings/test/test/search.js\" type=\"text/javascript\" charset=\"utf-8\"></script> ");
                File file = new File(htmlFilePath);
                file.delete();
                FileWriter writer = new FileWriter(htmlFilePath);
                writer.write(content);
                writer.close();
                log.info("html文件处理完成{},{}", pdfUrl, ossKey);
                result.put("code", 0);
            } catch (IOException e) {
                e.printStackTrace();
                result.put("code", -1);
            } finally {
                //上传所有文件
                String ossPath = ossKey.substring(0, ossKey.lastIndexOf("/") + 1);
                OssUtil.batchFileUploadOssUrl(fileFolder, ossPath);
                log.info("文件上传成功,完整链接:https://oss.imvib.com/{}", ossKey.replace(".html", ".pdf"));
            }
        } else {
            MsgUtil.sendDingTalkMsg(pdfUrl);
            result.put("code", -1);
        }
        return result;
    }

    private static byte[] readAllBytes(File file) throws IOException {
        try (FileInputStream fileInputStream = new FileInputStream(file)) {
            byte[] buffer = new byte[(int) file.length()];
            fileInputStream.read(buffer);
            return buffer;
        }
    }

    public static void main(String[] args) throws Exception {
//        String htmlFilePath = "/Users/sunyechen/doc/test/矩阵股份:长江证券承销保荐有限公司关于矩阵纵横设计股份有限公司使用募集资金置换预先投入募投项目及已支付发行费用的自筹资金的核查意见.html";
//        File htmlFile = new File(htmlFilePath);
//        Document html = Jsoup.parse(htmlFile);
        Element script = html.select("script").first();
        String sourceScript = script.html();
        script.html(sourceScript + PdfConvertUtil.addScript);
//        FileOutputStream fos = new FileOutputStream(htmlFilePath.replace(".html", "_.html"), false);
//        OutputStreamWriter osw = new OutputStreamWriter(fos, "UTF-8");
//        osw.write(html.outerHtml());
//        osw.close();
//        try (Playwright playwright = Playwright.create()) {
//            Browser browser = playwright.chromium().launch(new BrowserType.LaunchOptions().setHeadless(true));
//            BrowserContext context = browser.newContext(new Browser.NewContextOptions());
//            Page page = browser.newPage();
//            String htmlUrl = "https://realizedongmi.oss-cn-shanghai.aliyuncs.com/a-filings/test/2023-01-03%201%20%E5%8F%91%E8%A1%8C%E4%BA%BA%E5%8F%8A%E4%BF%9D%E8%8D%90%E6%9C%BA%E6%9E%84%E5%85%B3%E4%BA%8E%E4%BA%8C%E8%BD%AE%E5%AE%A1%E6%A0%B8%E9%97%AE%E8%AF%A2%E5%87%BD%E7%9A%84%E5%9B%9E%E5%A4%8D%EF%BC%88%E4%BF%AE%E8%AE%A2%E7%A8%BF%EF%BC%89_%E6%98%93%E7%91%9E%E7%94%9F%E7%89%A9.htm";
//            HttpUtil.downloadFile(htmlUrl, "/Users/sunyechen/IdeaProjects/realize-nacos/bin/1.html");
//            page.navigate("file:/Users/sunyechen/IdeaProjects/realize-nacos/bin/1.html");
//            System.out.println(page.innerHTML("body"));
//        }
//        System.out.println(URLDecoder.decode("https://oss.imvib.com/a-filings%252Foriginal%252F000586%252F2023-04-07+%25E5%25B9%25B4%25E5%25BA%25A6%25E5%2585%25B3%25E8%2581%2594%25E6%2596%25B9%25E8%25B5%2584%25E9%2587%2591%25E5%258D%25A0%25E7%2594%25A8%25E4%25B8%2593%25E9%25A1%25B9%25E5%25AE%25A1%25E8%25AE%25A1%25E6%258A%25A5%25E5%2591%258A.PDF", "UTF-8"));
//        try {
//            File file = new File("/Users/sunyechen/doc/test/矩阵股份:长江证券承销保荐有限公司关于矩阵纵横设计股份有限公司使用募集资金置换预先投入募投项目及已支付发行费用的自筹资金的核查意见.html");
//            BufferedReader reader = new BufferedReader(new FileReader(file));
//            String line = "", oldContent = "";
//            while ((line = reader.readLine()) != null) {
//                oldContent += line + "\n";
//            }
//            reader.close();
//            String newContent = oldContent.replaceAll("<meta charset=\"utf-8\"/>", "<meta charset=\"utf-8\"/><script src=\"https://oss.imvib.com/a-filings/test/test/search.js\" type=\"text/javascript\" charset=\"utf-8\"></script> ");
//            FileWriter writer = new FileWriter(new File("/Users/sunyechen/doc/test/矩阵股份:长江证券承销保荐有限公司关于矩阵纵横设计股份有限公司使用募集资金置换预先投入募投项目及已支付发行费用的自筹资金的核查意见_1.html"));
//            writer.write(newContent);
//            writer.close();
//            System.out.println("File updated successfully.");
//        } catch (IOException e) {
//            e.printStackTrace();
//        }
//        log.info("start");
//        String htmlFilePath = "/Users/sunyechen/sfit/7745c98a5ba34525937bce19519c0b1e.html";
//        try {
//            File file = new File(htmlFilePath);
//            BufferedReader reader = new BufferedReader(new FileReader(file));
//            String line = "", oldContent = "";
//            while ((line = reader.readLine()) != null) {
//                oldContent += line + "\n";
//            }
//            reader.close();
//            String newContent = oldContent.replaceAll("github", "zzz").replaceAll("pdf2htmlEX", "tanqiuhuashigou").replaceAll("<meta charset=\"utf-8\"/>", "<meta charset=\"utf-8\"/><script src=\"https://oss.imvib.com/a-filings/test/test/search.js\" type=\"text/javascript\" charset=\"utf-8\"></script> ");
//            String newHtmlFilePath = htmlFilePath.replace(".html", "_.html");
//            FileWriter writer = new FileWriter(newHtmlFilePath);
//            writer.write(newContent);
//            writer.close();
//            log.info("html文件处理完成{},{}");
//
//        } catch (IOException e) {
//            e.printStackTrace();
//        }
        log.info("start");
//        String ossKey = "ann/688249/2023/4/688249_20230412_9XYK/688249_20230412_9XYK.html";
//        String ossPath = ossKey.substring(0, ossKey.lastIndexOf("/") + 1);
//        OssUtil.batchFileUploadOssUrl("/Users/sunyechen/sfit/test/", ossPath);
//        File[] fileList = new File("/Users/sunyechen/sfit/test/").listFiles();
//        for (int i = 0; i < fileList.length; i++) {
//            OssUtil.fileUploadOssUrl(fileList[i], ossPath + fileList[i].getName());
//            fileList[i].delete();
//        }
        String htmlFilePath = "/Users/sunyechen/sfit/test/600499_20230415_EVH7.html";
        List<String> allLines = Files.readAllLines(Paths.get(htmlFilePath), Charset.forName("UTF-8"));
        String content = String.join("\n", allLines);
        System.out.println(content);
        log.info("start");
        File file = new File(htmlFilePath);
        BufferedReader reader = new BufferedReader(new FileReader(file));
        String line = "", oldContent = "";
        while ((line = reader.readLine()) != null) {
            oldContent += line + "\n";
        }
        reader.close();
        System.out.println(oldContent);
        log.info("end");
    }


}

方案二:

kkFileView-4.0.0

kkFileView - 在线文件预览

方案三:

wkhtmltox-0.12.6-1.centos7.x86_64.rpm

wkhtmltopdf

相关推荐
Nan_Shu_61419 分钟前
学习: Threejs (2)
前端·javascript·学习
G_G#27 分钟前
纯前端js插件实现同一浏览器控制只允许打开一个标签,处理session变更问题
前端·javascript·浏览器标签页通信·只允许一个标签页
@大迁世界43 分钟前
TypeScript 的本质并非类型,而是信任
开发语言·前端·javascript·typescript·ecmascript
GIS之路1 小时前
GDAL 实现矢量裁剪
前端·python·信息可视化
是一个Bug1 小时前
后端开发者视角的前端开发面试题清单(50道)
前端
Amumu121381 小时前
React面向组件编程
开发语言·前端·javascript
持续升级打怪中1 小时前
Vue3 中虚拟滚动与分页加载的实现原理与实践
前端·性能优化
GIS之路1 小时前
GDAL 实现矢量合并
前端
hxjhnct1 小时前
React useContext的缺陷
前端·react.js·前端框架
前端 贾公子2 小时前
从入门到实践:前端 Monorepo 工程化实战(4)
前端