Java 实现 Word、PDF 转 HTML(保留格式)

一、Word 转 HTML

依赖:

XML 复制代码
<!-- Version properties referenced by the ${poi.version} / ${xhtml.version} placeholders below. -->
<properties>
    <poi.version>5.2.3</poi.version>
    <xhtml.version>2.0.4</xhtml.version>
</properties>

<!-- Word to HTML. NOTE(review): presumably the source of the org.apache.poi.hwpf classes
     (HWPFDocument, WordToHtmlConverter) that WordUtil imports for .doc files; confirm. -->
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-scratchpad</artifactId>
    <version>${poi.version}</version>
</dependency>
<!-- Word to HTML. Matches the fr.opensagres.poi.xwpf.converter.xhtml package that WordUtil
     imports (XHTMLConverter, XHTMLOptions, Base64EmbedImgManager) for .docx files. -->
<dependency>
    <groupId>fr.opensagres.xdocreport</groupId>
    <artifactId>fr.opensagres.poi.xwpf.converter.xhtml</artifactId>
    <version>${xhtml.version}</version>
</dependency>
<!-- Office document / table handling for the 2007+ formats. -->
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-ooxml</artifactId>
    <version>${poi.version}</version>
</dependency>
<!-- Office document / table handling for the 2003 formats.
     NOTE(review): WordUtil also imports org.apache.commons.codec.binary.Base64, which is not
     declared in this snippet; presumably it arrives as a transitive dependency. Confirm. -->
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi</artifactId>
    <version>${poi.version}</version>
</dependency>

代码:

java 复制代码
import fr.opensagres.poi.xwpf.converter.xhtml.Base64EmbedImgManager;
import fr.opensagres.poi.xwpf.converter.xhtml.XHTMLConverter;
import fr.opensagres.poi.xwpf.converter.xhtml.XHTMLOptions;
import org.apache.commons.codec.binary.Base64;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.converter.WordToHtmlConverter;
import org.apache.poi.xwpf.usermodel.XWPFDocument;

import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import java.io.*;
import java.net.URL;

public class WordUtil {

    public static String wordToHtml(String fileUrl,String fileSuffix) throws Exception {
        URL url = new URL(fileUrl);
        try (InputStream inputStream = url.openStream()) {
            if(fileSuffix.equals(".docx") || fileSuffix.equals(".DOCX")){
                return word2007ToHtml(inputStream);
            } else if (fileSuffix.equals(".doc") || fileSuffix.equals(".DOC")) {
                return word2003ToHtml(inputStream);
            }else{
                throw new RuntimeException("错误的文件后缀");
            }
        } catch (RuntimeException e) {
            throw new RuntimeException(e.getMessage());
        }
    }

    /**
     * word2007转换成html
     * 对于docx,可以用下面这种方式:
     * @throws Exception
     */
    public static String word2007ToHtml(InputStream inputStream) {
        try (ByteArrayOutputStream htmlStream = new ByteArrayOutputStream();
             XWPFDocument docxDocument = new XWPFDocument(inputStream)) {
            XHTMLOptions options = XHTMLOptions.create();
            // 是否忽略未使用的样式
            options.setIgnoreStylesIfUnused(false);
            // 设置片段模式,<div>标签包裹
            options.setFragment(true);
            // 图片转base64
            options.setImageManager(new Base64EmbedImgManager());
            // 转换htm1
            XHTMLConverter.getInstance().convert(docxDocument, htmlStream, options);
            return htmlStream.toString();
        } catch (Exception e) {
            System.out.println("Word转Html过程出现异常!");
            throw new RuntimeException(e.getMessage());
        }
    }
    /**
     * word2003转换成html
     * 对于doc,可以用下面这种方式:
     * @throws Exception
     */
    public static String word2003ToHtml(InputStream inputStream ) throws Exception {
        try (StringWriter writer = new StringWriter();
             HWPFDocument document = new HWPFDocument(inputStream)) {
            WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument());
            //将图片转成base64的格式
            wordToHtmlConverter.setPicturesManager((bytes, pictureType, s, v, v1) -> "data:image/png;base64," + Base64.encodeBase64String(bytes));
            wordToHtmlConverter.processDocument(document);
            org.w3c.dom.Document htmlDocument = wordToHtmlConverter.getDocument();
            DOMSource domSource = new DOMSource(htmlDocument);
            TransformerFactory factory = TransformerFactory.newInstance();
            Transformer serializer = factory.newTransformer();
            serializer.setOutputProperty(OutputKeys.ENCODING, "utf-8");
            serializer.setOutputProperty(OutputKeys.INDENT, "yes");
            serializer.setOutputProperty(OutputKeys.METHOD, "html");
            serializer.transform(domSource, new StreamResult(writer));
            return writer.toString();
        } catch (Exception e) {
            System.out.println("Word转Html过程出现异常!");
            throw new RuntimeException(e.getMessage());
        }
    }

}

来源博客:Java实现word转html_java word转html-CSDN博客

二、PDF 转 HTML

依赖:

XML 复制代码
        <!-- NOTE(review): none of the dependencies below declares a <version>; presumably the
             versions are managed elsewhere (parent POM or dependencyManagement). Confirm, and
             pick versions whose APIs match the calls made in PDFUtil. -->
        <dependency>
            <groupId>net.sf.cssbox</groupId>
            <artifactId>pdf2dom</artifactId>
        </dependency>

        <dependency>
            <groupId>net.mabboud.fontverter</groupId>
            <artifactId>FontVerter</artifactId>
        </dependency>
        <dependency>
            <groupId>org.reflections</groupId>
            <artifactId>reflections</artifactId>
        </dependency>
        <!-- PDF to text -->
        <dependency>
            <groupId>org.apache.pdfbox</groupId>
            <artifactId>pdfbox</artifactId>
        </dependency>

代码:

java 复制代码
import org.apache.pdfbox.pdmodel.PDDocument;
import org.fit.pdfdom.PDFDomTree;

import java.io.*;
import java.net.URL;

/**
 * Converts PDF documents to HTML using PDFBox for parsing and
 * {@code PDFDomTree} for rendering.
 */
public class PDFUtil {

    /**
     * Downloads a PDF from a URL and converts it to HTML.
     *
     * @param fileUrl URL of the PDF document to read
     * @return the generated HTML
     * @throws IOException if the URL is malformed or cannot be opened, or if the
     *                     conversion fails; for failures inside the conversion
     *                     the original exception is attached as the cause
     */
    public static String pdfToHtml(String fileUrl) throws IOException {
        URL url = new URL(fileUrl);
        try (InputStream inputStream = url.openStream()){
            return pdfToHtml(inputStream);
        }catch (Exception e){
            // Keep the original exception as the cause so its stack trace is not lost.
            throw new IOException(e.getMessage(), e);
        }
    }

    /**
     * Converts a PDF read from a stream to HTML.
     *
     * <p>The HTML is rendered straight into memory. Nothing is written to disk,
     * so concurrent calls cannot overwrite each other's output and no charset
     * round-trip can corrupt non-ASCII text. The returned text is exactly what
     * the renderer wrote, including its own line separators.
     *
     * @param inputStream stream positioned at the start of the PDF content;
     *                    it is not closed by this method
     * @return the generated HTML, never {@code null}
     * @throws IOException if the PDF cannot be parsed or rendered
     */
    public static String pdfToHtml(InputStream inputStream) throws IOException {
        // try-with-resources closes the document even when rendering throws.
        try (PDDocument document = PDDocument.load(inputStream);
             StringWriter htmlWriter = new StringWriter()) {
            new PDFDomTree().writeText(document, htmlWriter);
            return htmlWriter.toString();
        }
    }
}

来源博客:使用Java实现PDF到HTML的转换_java pdf转html-CSDN博客

相关推荐
运维帮手大橙子2 小时前
完整的登陆学生管理系统(配置数据库)
java·前端·数据库·eclipse·intellij-idea
王大锤·2 小时前
基于spring boot的个人博客系统
java·spring boot·后端
sg_knight3 小时前
Spring Cloud Gateway全栈实践:动态路由能力与WebFlux深度整合
java·spring boot·网关·spring·spring cloud·微服务·gateway
JosieBook3 小时前
【IDEA】IntelliJ IDEA 中文官方文档全面介绍与总结
java·ide·intellij-idea
三只蛋黄派3 小时前
Websocket
java
JIngJaneIL3 小时前
专利服务系统平台|个人专利服务系统|基于java和小程序的专利服务系统设计与实现(源码+数据库+文档)
java·数据库·小程序·论文·毕设·专利服务系统平台
崎岖Qiu3 小时前
leetcode1343:大小为K的子数组(定长滑动窗口)
java·算法·leetcode·力扣·滑动窗口
freed_Day3 小时前
Java学习进阶--集合体系结构
java·开发语言·学习
zuozewei4 小时前
高可用改造之构建双活冗余的TDengine时序数据处理架构
java·架构·tdengine
嫩萝卜头儿4 小时前
从零掌握 Java AWT:原理、实战与性能优化
java·开发语言·性能优化