Java 富文本转word

前言：

本文的目的是将传入的富文本内容（HTML 标签、图片）分页导出为 Word 文档。

所使用的库为 docx4j。

一、依赖导入

xml 代码：

        <!-- Rich text (HTML) to Word conversion -->
        <!-- NOTE(review): docx4j is pinned to 6.1.2 while docx4j-ImportXHTML (8.0.0) and
             docx4j-JAXB-ReferenceImpl (8.1.0) below are 8.x releases; confirm that this
             version mix is intended and compatible. -->
        <dependency>
            <groupId>org.docx4j</groupId>
            <artifactId>docx4j</artifactId>
            <version>6.1.2</version>
            <exclusions>
                <exclusion>
                    <artifactId>slf4j-log4j12</artifactId>
                    <groupId>org.slf4j</groupId>
                </exclusion>
                <exclusion>
                    <artifactId>log4j</artifactId>
                    <groupId>log4j</groupId>
                </exclusion>
                <exclusion>
                    <artifactId>commons-io</artifactId>
                    <groupId>commons-io</groupId>
                </exclusion>
                <exclusion>
                    <artifactId>commons-compress</artifactId>
                    <groupId>org.apache.commons</groupId>
                </exclusion>
                <exclusion>
                    <artifactId>guava</artifactId>
                    <groupId>com.google.guava</groupId>
                </exclusion>
                <exclusion>
                    <artifactId>mbassador</artifactId>
                    <groupId>net.engio</groupId>
                </exclusion>
            </exclusions>
        </dependency>
        <!-- XHTML importer used by the converter (XHTMLImporterImpl) -->
        <dependency>
            <groupId>org.docx4j</groupId>
            <artifactId>docx4j-ImportXHTML</artifactId>
            <version>8.0.0</version>
        </dependency>
        <!-- JAXB reference implementation binding; docx4j-core is excluded here -->
        <dependency>
            <groupId>org.docx4j</groupId>
            <artifactId>docx4j-JAXB-ReferenceImpl</artifactId>
            <version>8.1.0</version>
            <exclusions>
                <exclusion>
                    <artifactId>docx4j-core</artifactId>
                    <groupId>org.docx4j</groupId>
                </exclusion>
            </exclusions>
        </dependency>

二、字体文件

将字体文件（如 STSongStd-Light.ttf）放到子项目 resources 下的 static/fonts 目录中，与工具类中的默认字体路径 /static/fonts/STSongStd-Light.ttf 保持一致。

三、工具类

java 代码：

import org.apache.commons.codec.binary.Base64;
import org.apache.commons.lang3.StringUtils;
import sun.misc.BASE64Decoder;

import javax.crypto.Cipher;
import javax.crypto.KeyGenerator;
import javax.crypto.spec.SecretKeySpec;
import java.math.BigInteger;

/**
 * AES helper: encrypts and decrypts strings and transports the ciphertext as Base64 text.
 *
 * <p>Only JDK classes are used for Base64 and charset handling. In particular the
 * JDK-internal {@code sun.misc.BASE64Decoder} is no longer referenced, because it is not
 * available on Java 9 and later.
 *
 * <p>Keys and plaintext are always converted with UTF-8 so results do not depend on the
 * platform default charset.
 */
public class AESEncryptUtils {

    /**
     * Default shared key (16 bytes, i.e. AES-128).
     *
     * <p>NOTE(review): this is a hard-coded secret; consider loading it from configuration.
     */
    public static final String aesKey = "this-is-aescrypt";

    /**
     * Cipher transformation.
     *
     * <p>NOTE(review): ECB mode does not hide repeated plaintext blocks. It is kept because
     * changing the mode would make existing ciphertexts/tokens undecryptable.
     */
    private static final String ALGORITHMSTR = "AES/ECB/PKCS5Padding";

    /** Charset used for keys and plaintext in both directions. */
    private static final java.nio.charset.Charset UTF8 = java.nio.charset.StandardCharsets.UTF_8;

    /** Utility class; not instantiable. */
    private AESEncryptUtils() {
        throw new AssertionError();
    }

    /** Small manual check: prints the Base64 ciphertext of a sample string. */
    public static void main(String[] args) throws Exception {
        System.out.println(AESEncryptUtils.aesEncrypt("html2Pdf", "this-is-aescrypt"));
    }

    /**
     * Encrypts {@code content} with the default {@link #aesKey}.
     *
     * @param content text to encrypt
     * @return Base64 encoded ciphertext
     * @throws Exception if the cipher cannot be initialised or applied
     */
    public static String aesEncryptToString(String content) throws Exception {
        return aesEncrypt(content, aesKey);
    }

    /**
     * Decrypts Base64 ciphertext with the default {@link #aesKey}.
     *
     * @param content Base64 encoded ciphertext
     * @return the plaintext, or {@code null} when {@code content} is null or empty
     * @throws Exception if decoding or decryption fails
     */
    public static String aesDecryptToString(String content) throws Exception {
        return aesDecrypt(content, aesKey);
    }

    /**
     * Renders a byte array as a non-negative number in the given radix.
     *
     * @param bytes magnitude bytes (big-endian)
     * @param radix target radix; values outside {@code Character.MIN_RADIX..MAX_RADIX}
     *              fall back to 10 (behaviour of {@link BigInteger#toString(int)})
     * @return the textual representation
     */
    public static String binary(byte[] bytes, int radix) {
        // signum 1 = treat the bytes as a positive magnitude
        return new BigInteger(1, bytes).toString(radix);
    }

    /**
     * Base64-encodes bytes (single line, standard alphabet).
     *
     * @param bytes bytes to encode
     * @return the Base64 text, or {@code null} when {@code bytes} is null
     */
    public static String base64Encode(byte[] bytes) {
        return bytes == null ? null : java.util.Base64.getEncoder().encodeToString(bytes);
    }

    /**
     * Decodes Base64 text. Line breaks inside the input are tolerated.
     *
     * @param base64Code Base64 text
     * @return the decoded bytes, or {@code null} when the input is null or empty
     * @throws Exception if the input is not valid Base64
     */
    public static byte[] base64Decode(String base64Code) throws Exception {
        if (base64Code == null || base64Code.isEmpty()) {
            return null;
        }
        // MIME decoder ignores line separators, matching the lenient legacy decoder.
        return java.util.Base64.getMimeDecoder().decode(base64Code);
    }

    /**
     * Encrypts text with AES.
     *
     * @param content    text to encrypt (encoded as UTF-8)
     * @param encryptKey key text; its UTF-8 bytes must form a valid AES key length
     * @return raw ciphertext bytes
     * @throws Exception if the key is invalid or the cipher fails
     */
    public static byte[] aesEncryptToBytes(String content, String encryptKey) throws Exception {
        Cipher cipher = Cipher.getInstance(ALGORITHMSTR);
        cipher.init(Cipher.ENCRYPT_MODE, new SecretKeySpec(encryptKey.getBytes(UTF8), "AES"));
        return cipher.doFinal(content.getBytes(UTF8));
    }

    /**
     * Encrypts text with AES and returns the ciphertext as Base64.
     *
     * @param content    text to encrypt
     * @param encryptKey key text
     * @return Base64 encoded ciphertext
     * @throws Exception if the key is invalid or the cipher fails
     */
    public static String aesEncrypt(String content, String encryptKey) throws Exception {
        return base64Encode(aesEncryptToBytes(content, encryptKey));
    }

    /**
     * Decrypts raw AES ciphertext.
     *
     * @param encryptBytes ciphertext bytes
     * @param decryptKey   key text
     * @return the plaintext decoded as UTF-8
     * @throws Exception if the key is invalid or decryption fails
     */
    public static String aesDecryptByBytes(byte[] encryptBytes, String decryptKey) throws Exception {
        Cipher cipher = Cipher.getInstance(ALGORITHMSTR);
        cipher.init(Cipher.DECRYPT_MODE, new SecretKeySpec(decryptKey.getBytes(UTF8), "AES"));
        return new String(cipher.doFinal(encryptBytes), UTF8);
    }

    /**
     * Decrypts Base64 encoded AES ciphertext.
     *
     * @param encryptStr Base64 encoded ciphertext
     * @param decryptKey key text
     * @return the plaintext, or {@code null} when {@code encryptStr} is null or empty
     * @throws Exception if decoding or decryption fails
     */
    public static String aesDecrypt(String encryptStr, String decryptKey) throws Exception {
        if (encryptStr == null || encryptStr.isEmpty()) {
            return null;
        }
        return aesDecryptByBytes(base64Decode(encryptStr), decryptKey);
    }

}

java 代码：

import lombok.Data;
import org.springframework.boot.context.properties.ConfigurationProperties;
import org.springframework.context.annotation.Configuration;

/**
 * Settings bound from configuration keys under the {@code html.convert} prefix.
 * Accessors are generated by Lombok's {@code @Data}.
 */
@Data
@Configuration
@ConfigurationProperties(prefix = "html.convert")
public class HtmlConvertproperties {

    /** Path where generated files are saved. */
    private String fileSavePath;

    /** Path where images converted from ECharts are saved. */
    private String echartsImgSavePath;
}

java 代码：

import org.docx4j.Docx4J;
import org.docx4j.convert.in.xhtml.XHTMLImporterImpl;
import org.docx4j.fonts.IdentityPlusMapper;
import org.docx4j.fonts.Mapper;
import org.docx4j.fonts.PhysicalFont;
import org.docx4j.fonts.PhysicalFonts;
import org.docx4j.jaxb.Context;
import org.docx4j.model.structure.PageSizePaper;
import org.docx4j.openpackaging.packages.WordprocessingMLPackage;
import org.docx4j.wml.RFonts;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Entities;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;
import java.io.*;
import java.net.URL;
import java.net.URLEncoder;

import static cn.aotu.sss.module.sss.util.text.word.HtmlConverter.RemoveTag.*;

/**
 * Converts HTML (fetched from a URL or supplied as a string) into a docx4j
 * {@link WordprocessingMLPackage} and saves it as .docx or .pdf.
 *
 * <p>Keep the pixel area (width x height) of embedded images moderate; very large images
 * can cause an out-of-memory error during conversion.
 *
 * <p>Usage:
 * <pre>{@code
 * new HtmlConverter.Builder()
 *         .saveFilePath("/tmp/reports/")
 *         .builder()
 *         .saveUrlToDocx("http://host/viewReport", "test");
 * }</pre>
 *
 * HtmlConverter
 * @author: huangbing
 * @date: 2020/8/7 2:32 PM
 */
public class HtmlConverter {

    /**
     * Page sizes that can be passed to {@link Builder#pageSize(String)} via {@link #getCode()}.
     */
    public enum PageSize {
        /** Available sizes. */
        LETTER("letter"),
        LEGAL("legal"),
        A3("A3"),
        A4("A4"),
        A5("A5"),
        B4JIS("B4JIS");

        PageSize(String code) {
            this.code = code;
        }

        private final String code;

        public String getCode() {
            return code;
        }
    }

    /**
     * Tag and attribute names touched while cleaning the HTML.
     */
    enum RemoveTag {
        /** Names used by the clean-up step ({@code HREF} is an attribute name). */
        SCRIPT("script"), A("a"), LINK("link"), HREF("href");

        RemoveTag(String code) {
            this.code = code;
        }

        private final String code;

        public String getCode() {
            return code;
        }
    }

    /**
     * Conversion parameters with their defaults.
     */
    private static class Params {

        /** Default font family. */
        private final static String DEFAULT_FONT_FAMILY = "STSongStd-Light";
        /** Default classpath location of the font file. */
        private final static String DEFAULT_FONT_PATH = "/static/fonts/STSongStd-Light.ttf";
        /** Default orientation: portrait. */
        private final static boolean DEFAULT_LAND_SCAPE = false;
        /** Default page size. */
        private final static String DEFAULT_PAGE_SIZE = PageSize.A4.getCode();
        /** Font family. */
        private String fontFamily = DEFAULT_FONT_FAMILY;
        /** Classpath location of the font file. */
        private String fontPath = DEFAULT_FONT_PATH;
        /** Page size. */
        private String pageSize = DEFAULT_PAGE_SIZE;
        /** Whether the page is landscape. */
        private boolean isLandScape = DEFAULT_LAND_SCAPE;
        /** Directory in which generated files are saved. */
        private String saveFilePath = defaultSaveFilePath();

        /**
         * Builds the default output directory: {@code output/} under the classpath root,
         * or under the JVM temp directory when the classpath root cannot be resolved
         * (getResource("/") returns null, e.g. when running from a jar).
         */
        private static String defaultSaveFilePath() {
            URL classpathRoot = HtmlConverter.class.getResource("/");
            String base = classpathRoot != null
                    ? classpathRoot.getPath()
                    : System.getProperty("java.io.tmpdir") + File.separator;
            return base + "output/";
        }

    }

    private static final Logger logger = LoggerFactory.getLogger(HtmlConverter.class);

    private final Builder builder;

    public HtmlConverter(Builder builder) {
        this.builder = builder;
    }

    /**
     * Fluent builder collecting the conversion parameters.
     */
    public static class Builder {

        /** Starts out with every default defined in {@link Params}. */
        private final Params params;

        public Builder() {
            this.params = new Params();
        }

        public Builder fontFamily(String fontFamily) {
            this.params.fontFamily = fontFamily;
            return this;
        }

        public Builder fontPath(String fontPath) {
            this.params.fontPath = fontPath;
            return this;
        }

        public Builder pageSize(String pageSize) {
            this.params.pageSize = pageSize;
            return this;
        }

        public Builder isLandScape(boolean isLandScape) {
            this.params.isLandScape = isLandScape;
            return this;
        }

        public Builder saveFilePath(String saveFilePath) {
            this.params.saveFilePath = saveFilePath;
            return this;
        }

        /**
         * Creates the converter from the collected parameters.
         *
         * @return a converter backed by this builder
         */
        public HtmlConverter builder() {
            return new HtmlConverter(this);
        }

    }

    /**
     * Fetches a page and saves it as .docx.
     *
     * @param url      page address
     * @param fileName output file name without extension
     * @return the written file
     * @throws Exception if fetching, converting or saving fails
     */
    public File saveUrlToDocx(String url, String fileName) throws Exception {
        return saveDocx(url2word(url), fileName);
    }

    /**
     * Fetches a page and saves it as .pdf.
     *
     * @param url      page address
     * @param fileName output file name without extension
     * @return the written file
     * @throws Exception if fetching, converting or saving fails
     */
    public File saveUrlToPdf(String url, String fileName) throws Exception {
        return savePdf(url2word(url), fileName);
    }

    /**
     * Fetches a page and converts it to a {@link WordprocessingMLPackage}.
     *
     * @param url page address
     * @return the converted package
     * @throws Exception if fetching or converting fails
     */
    public WordprocessingMLPackage url2word(String url) throws Exception {
        return xhtml2word(url2xhtml(url));
    }

    /**
     * Saves a {@link WordprocessingMLPackage} as .docx. The output directory is created
     * when it does not exist yet.
     *
     * @param wordMLPackage package to save
     * @param fileName      output file name without extension
     * @return the written file
     * @throws Exception if the directory cannot be created or saving fails
     */
    public File saveDocx(WordprocessingMLPackage wordMLPackage, String fileName) throws Exception {
        File file = prepareOutputFile(fileName, ".docx");
        wordMLPackage.save(file);
        logger.debug("Save to [.docx]: {}", file.getAbsolutePath());
        return file;
    }

    /**
     * Saves a {@link WordprocessingMLPackage} as .pdf. The output directory is created
     * when it does not exist yet.
     *
     * @param wordMLPackage package to save
     * @param fileName      output file name without extension
     * @return the written file
     * @throws Exception if the directory cannot be created or the PDF export fails
     */
    public File savePdf(WordprocessingMLPackage wordMLPackage, String fileName) throws Exception {
        File file = prepareOutputFile(fileName, ".pdf");
        // try-with-resources: the stream is closed even when the export throws
        try (OutputStream os = new FileOutputStream(file)) {
            Docx4J.toPDF(wordMLPackage, os);
            os.flush();
        }
        logger.debug("Save to [.pdf]: {}", file.getAbsolutePath());
        return file;
    }

    /**
     * Resolves the target file and makes sure its parent directory exists.
     */
    private File prepareOutputFile(String fileName, String extension) throws IOException {
        File file = new File(genFilePath(fileName) + extension);
        File dir = file.getAbsoluteFile().getParentFile();
        // second isDirectory() check covers a concurrent creation between the two calls
        if (dir != null && !dir.isDirectory() && !dir.mkdirs() && !dir.isDirectory()) {
            throw new IOException("Cannot create output directory: " + dir);
        }
        return file;
    }

    /**
     * Converts an XHTML {@link Document} to a {@link WordprocessingMLPackage}.
     *
     * @param doc document already switched to XML/XHTML output
     * @return the converted package
     * @throws Exception if the package cannot be created or the import fails
     */
    protected WordprocessingMLPackage xhtml2word(Document doc) throws Exception {
        // PageSize codes such as "letter" are lower case; enum constant lookup needs the
        // upper-case name, so normalise before valueOf.
        String paperName = this.builder.params.pageSize.toUpperCase(java.util.Locale.ROOT);
        WordprocessingMLPackage wordMLPackage = WordprocessingMLPackage.createPackage(
                PageSizePaper.valueOf(paperName), this.builder.params.isLandScape);

        // register the configured (Chinese) font
        configSimSunFont(wordMLPackage);

        XHTMLImporterImpl xhtmlImporter = new XHTMLImporterImpl(wordMLPackage);

        // import the XHTML into the main document part
        wordMLPackage.getMainDocumentPart().getContent().addAll(
                xhtmlImporter.convert(doc.html(), doc.baseUri()));

        return wordMLPackage;
    }

    /**
     * Fetches a page and returns it as a cleaned {@link Document} in XHTML output mode.
     *
     * @param url page address
     * @return the cleaned document
     * @throws Exception if the token cannot be created or the page cannot be fetched
     */
    protected Document url2xhtml(String url) throws Exception {
        // send an Authorization header so the request is not rejected by the server filter
        String token = AESEncryptUtils.aesEncryptToString("html2File");

        Document doc = Jsoup.connect(url).header("Authorization", token).get();

        cleanHtml(doc);
        useXhtmlOutput(doc);
        return doc;
    }

    /**
     * Registers the configured font with the package and makes it the default ASCII font.
     *
     * <p>NOTE(review): {@code getResource} returns null when the font file is missing from
     * the classpath; confirm how {@code PhysicalFonts.addPhysicalFont} reacts to that.
     *
     * @param wordMLPackage package to configure
     * @throws Exception if the font cannot be registered
     */
    protected void configSimSunFont(WordprocessingMLPackage wordMLPackage) throws Exception {
        Mapper fontMapper = new IdentityPlusMapper();
        wordMLPackage.setFontMapper(fontMapper);

        // load the bundled font file (Linux hosts often have no Chinese fonts installed)
        URL simsunUrl = this.getClass().getResource(this.builder.params.fontPath);
        PhysicalFonts.addPhysicalFont(simsunUrl);
        PhysicalFont simsunFont = PhysicalFonts.get(this.builder.params.fontFamily);
        fontMapper.put(this.builder.params.fontFamily, simsunFont);
        // set the document default font
        RFonts rfonts = Context.getWmlObjectFactory().createRFonts();
        rfonts.setAsciiTheme(null);
        rfonts.setAscii(this.builder.params.fontFamily);
        wordMLPackage.getMainDocumentPart().getPropertyResolver()
                .getDocumentDefaultRPr().setRFonts(rfonts);
    }

    /**
     * Converts an HTML string directly to a {@link WordprocessingMLPackage}.
     *
     * @param htmlContent HTML markup
     * @return the converted package
     * @throws Exception if the conversion fails
     */
    public WordprocessingMLPackage htmlString2word(String htmlContent) throws Exception {
        Document doc = Jsoup.parse(htmlContent);
        useXhtmlOutput(doc);
        cleanHtml(doc);
        return xhtml2word(doc);
    }

    /**
     * Switches the document to XML syntax with XHTML entity escaping.
     */
    private void useXhtmlOutput(Document doc) {
        doc.outputSettings()
                .syntax(Document.OutputSettings.Syntax.xml)
                .escapeMode(Entities.EscapeMode.xhtml);
    }

    /**
     * Removes script elements, strips {@code onclick} from anchors and rewrites
     * {@code <link href>} to an absolute URL when one can be resolved.
     */
    private void cleanHtml(Document doc) {
        doc.getElementsByTag(RemoveTag.SCRIPT.getCode()).remove();

        for (Element anchor : doc.getElementsByTag(RemoveTag.A.getCode())) {
            anchor.removeAttr("onclick");
        }

        String hrefAttr = RemoveTag.HREF.getCode();
        for (Element link : doc.getElementsByTag(RemoveTag.LINK.getCode())) {
            String absolute = link.absUrl(hrefAttr);
            // absUrl yields "" when no absolute URL can be built (e.g. a relative href in
            // a document parsed without a base URI); keep the original value in that case
            if (!absolute.isEmpty()) {
                link.attr(hrefAttr, absolute);
            }
        }
    }

    /**
     * Streams a file to the client as a .docx attachment.
     *
     * @param file            file to send
     * @param displayFileName file name shown to the user
     * @param request         current request (not read by this method)
     * @param response        response the file is written to
     * @throws Exception if reading the file or writing the response fails
     */
    public void handleFileDownload(
            File file,
            String displayFileName,
            HttpServletRequest request,
            HttpServletResponse response
    ) throws Exception {
        // URLEncoder emits '+' for spaces; the header needs %20
        String encodedFileName = URLEncoder.encode(displayFileName, "UTF-8")
                .replaceAll("\\+", "%20");

        response.setContentType("application/vnd.openxmlformats-officedocument.wordprocessingml.document");
        response.setHeader("Content-Disposition", "attachment; filename*=UTF-8''" + encodedFileName);
        response.setHeader("Content-Length", String.valueOf(file.length()));

        try (BufferedInputStream bis = new BufferedInputStream(new FileInputStream(file));
             BufferedOutputStream bos = new BufferedOutputStream(response.getOutputStream())) {
            byte[] buffer = new byte[1024 * 8];
            int bytesRead;
            while ((bytesRead = bis.read(buffer)) != -1) {
                bos.write(buffer, 0, bytesRead);
            }
            bos.flush();
        }
    }

    /**
     * Builds the output path (without extension) for a file name. A separator is inserted
     * when the configured directory does not already end with one.
     *
     * @param fileName output file name without extension
     * @return directory plus file name
     */
    protected String genFilePath(String fileName) {
        String dir = this.builder.params.saveFilePath;
        if (dir != null && !dir.isEmpty()
                && !dir.endsWith("/") && !dir.endsWith(File.separator)) {
            dir = dir + File.separator;
        }
        return dir + fileName;
    }

    /** Scratch entry point; see the class documentation for a conversion example. */
    public static void main(String[] args) throws Exception {
        String s = "[img1] [img1] [img1]";
        String s1 = s.replaceAll("\\[img1\\]", "22");
        System.out.println(s1);
    }
}

四、controller

java 代码：

@Autowired
private HtmlConvertproperties htmlConvertproperties; // injected configuration; supplies the file save path

     /**
     * Generates a Word document from HTML rich text and streams it to the client.
     *
     * <p>The document is first written to a temporary file, which is deleted as soon as
     * the download has finished or failed.
     *
     * @param htmlContent rich-text HTML markup, e.g. {@code <p>rich text</p>}
     * @param request     current request
     * @param response    response the .docx is written to
     * @throws Exception if conversion, saving or streaming fails
     */
    @PostMapping("/export")
    @Operation(summary = "导出word")
    @Parameter(name = "htmlContent", description = "富文本内容", required = true)
    public void generateWord(
            @RequestParam("htmlContent") String htmlContent,
            HttpServletRequest request,
            HttpServletResponse response
    ) throws Exception {
        // 1. Build the converter with the configured save path
        HtmlConverter htmlConverter = new HtmlConverter.Builder()
                .saveFilePath(htmlConvertproperties.getFileSavePath())
                .builder();

        // 2. Convert the HTML string into a Word package
        WordprocessingMLPackage wordMLPackage = htmlConverter.htmlString2word(htmlContent);

        // 3. Write a temporary file; the random suffix keeps concurrent requests that
        //    arrive in the same millisecond from sharing (and deleting) one file
        String fileName = "report_" + System.currentTimeMillis() + "_" + java.util.UUID.randomUUID();
        File tempFile = htmlConverter.saveDocx(wordMLPackage, fileName);

        try {
            // 4. Stream the file to the client
            htmlConverter.handleFileDownload(tempFile, "报告.docx", request, response);
        } finally {
            // 5. Delete right away: deleteOnExit() alone keeps every file until the JVM
            //    stops, which never happens in normal operation of a server
            if (!tempFile.delete()) {
                tempFile.deleteOnExit();
            }
        }
    }

五、引用说明

工具类参考了 GitHub 上的项目，但对工具类中的具体逻辑作了修改。

https://github.com/FTOLs/report-demo

Java 富文本转word

前言：

一、依赖导入

二、字体文件

三、工具类

四、controller

五、引用说明

六、测试