Springboot功能模块之使用poi4.1.2 将word转换成html

因为自己的任务需要用到 Word 转 HTML，但是基于 POI 3.1.2 的写法与我项目中使用的 POI 4.1.2 版本冲突，所以尝试用 4.1.2 版本写了一个 Word 转 HTML 的实现。它可以同时支持 doc 和 docx 两种格式，非常好用。本文主要介绍 docx 转 HTML；doc 相对来说比较简单，有兴趣的可以尝试一下。

步骤1：创建项目和依赖

xml 复制代码

  <poi.version>4.1.2</poi.version>
   <!-- Keep poi, poi-ooxml and poi-scratchpad on the same version -->
        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi</artifactId>
            <version>${poi.version}</version>
        </dependency>
        <!-- Legacy binary formats: doc / ppt / xls -->
        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi-scratchpad</artifactId>
            <version>${poi.version}</version>
        </dependency>
        <!-- OOXML formats: docx / pptx / xlsx -->
        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi-ooxml</artifactId>
            <version>${poi.version}</version>
        </dependency>

        <!-- Word conversion (start) -->
        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>ooxml-schemas</artifactId>
            <version>1.4</version>
        </dependency>

        <dependency>
            <groupId>fr.opensagres.xdocreport</groupId>
            <artifactId>fr.opensagres.poi.xwpf.converter.xhtml</artifactId>
            <version>2.0.2</version>
        </dependency>
        <dependency>
            <groupId>fr.opensagres.xdocreport</groupId>
            <artifactId>fr.opensagres.xdocreport.converter.docx.xwpf</artifactId>
            <version>2.0.1</version>
        </dependency>
    <!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.17.2</version>
        </dependency>

步骤2：读取Word文档

使用Apache POI库读取Word文档。对于.docx文件，使用XWPFDocument类；对于.doc文件，使用HWPFDocument类。

Word 2007+（.docx）转 HTML

java 复制代码

 /**
  * Converts an uploaded Word 2007+ ({@code .docx}) file into an HTML fragment.
  *
  * <p>Images are handed to a {@code CustomImageManager}; header/footer pages are
  * omitted and unused styles are kept.
  *
  * @param file the uploaded file; must be non-empty and have a {@code .docx}
  *             name (extension matched case-insensitively)
  * @return the generated HTML fragment
  * @throws IOException      if the upload cannot be read or the conversion fails on I/O
  * @throws RuntimeException if the file is empty, has no name, or is not a {@code .docx} file
  */
 public static String Word2007ToHtml(MultipartFile file) throws IOException{
        if (file.isEmpty() || file.getSize() <= 0) {
            throw new RuntimeException("文件为空,请添加文件");
        }

        // The original filename may be null (client did not send one); treat that
        // as a format error instead of failing with a NullPointerException.
        String originalName = file.getOriginalFilename();
        String suffix = ".docx";
        // regionMatches with ignoreCase=true accepts any casing (.docx, .DOCX, .Docx, ...)
        // and simply returns false when the name is shorter than the suffix.
        boolean isDocx = originalName != null
                && originalName.regionMatches(true, originalName.length() - suffix.length(), suffix, 0, suffix.length());
        if (!isDocx) {
            throw new RuntimeException("文件格式错误,只能输入 MS Office 2007+ files");
        }

        // All three resources are closed automatically, including on failure.
        // I/O errors propagate to the caller (the method already declares IOException)
        // rather than being printed and turned into a null return value.
        try (InputStream input = file.getInputStream();
             XWPFDocument wordDocument = new XWPFDocument(input);
             ByteArrayOutputStream htmlStream = new ByteArrayOutputStream()) {
            XHTMLOptions options = XHTMLOptions.create();
            // Alternative: embed images as base64 via Base64EmbedImgManager.
            // Here every image is passed to the custom manager instead.
            options.setImageManager(new CustomImageManager(staticmediaUploadApi));
            // Emit a fragment rather than a full HTML page.
            options.setFragment(true);
            // Skip header and footer pages.
            options.setOmitHeaderFooterPages(true);
            options.setIgnoreStylesIfUnused(false);

            XHTMLConverter.getInstance().convert(wordDocument, htmlStream, options);
            // NOTE(review): decoded with the platform default charset, as before;
            // confirm this matches the encoding the converter writes with.
            return htmlStream.toString();
        }
    }

Word 2003（.doc）转 HTML

java 复制代码

/**
 * Converts a legacy {@code .doc} file into an HTML file.
 *
 * <p>Embedded pictures are written into the {@code F:\TestFile\images} directory
 * and referenced from the generated HTML by their file path. Any failure is
 * printed to standard error and the method returns normally (best effort).
 *
 * @param inFileName  path of the input {@code .doc} file
 * @param outFileName path of the HTML file to write
 */
public static void docToHtml(String inFileName, String outFileName) {
    // try-with-resources closes the input stream and the Word document even on failure.
    try (FileInputStream source = new FileInputStream(new File(inFileName));
         HWPFDocument wordDocument = new HWPFDocument(source)) {
        // The converter builds the HTML into a fresh, empty DOM document.
        WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(
                DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument());
        // Decide where pictures are stored and what path the HTML refers to.
        wordToHtmlConverter.setPicturesManager(new PicturesManager() {
            @Override
            public String savePicture(byte[] content, PictureType pictureType, String suggestedName, float widthInches, float heightInches) {
                File imgDir = new File("F:\\TestFile\\images");
                if (!imgDir.exists()) { // create the picture directory on first use
                    imgDir.mkdirs();
                }
                // Resolve the picture INSIDE the directory; plain string concatenation
                // would drop the separator and write next to the directory instead.
                File imageFile = new File(imgDir, suggestedName);
                try (OutputStream os = new FileOutputStream(imageFile)) {
                    os.write(content);
                } catch (IOException e) {
                    // Best effort: a picture that cannot be saved must not abort the conversion.
                    e.printStackTrace();
                }
                return imageFile.getPath();
            }
        });
        // Run the conversion and fetch the resulting HTML DOM.
        wordToHtmlConverter.processDocument(wordDocument);
        Document htmlDocument = wordToHtmlConverter.getDocument();
        DOMSource domSource = new DOMSource(htmlDocument);

        // Serialize the DOM as indented UTF-8 HTML.
        Transformer serializer = TransformerFactory.newInstance().newTransformer();
        serializer.setOutputProperty(OutputKeys.ENCODING, "utf-8");
        serializer.setOutputProperty(OutputKeys.INDENT, "yes");
        serializer.setOutputProperty(OutputKeys.METHOD, "html");

        // The output stream is closed as soon as the HTML has been written.
        try (FileOutputStream fos = new FileOutputStream(new File(outFileName))) {
            serializer.transform(domSource, new StreamResult(fos));
        }
    } catch (Exception e) {
        // Kept as best effort: the method has no checked exceptions in its signature.
        e.printStackTrace();
    }
}

步骤3：图片处理

java 复制代码

/**
 * Image manager that uploads every picture found in a Word document through
 * {@link MediaUploadApi} and resolves picture URIs to download URLs of the
 * uploaded media.
 *
 * <p>Instances keep per-document state (the uploaded media ids) and are meant
 * to be created once per conversion.
 */
@Slf4j
public class CustomImageManager extends ImageManager {

    /** Prefix of the download URL; the media id is appended to it. */
    private static final String DOWNLOAD_URL_PREFIX = "https://api.jizhibao.cn.com/file/download/file/";

    private final MediaUploadApi mediaUploadApi;

    /** Media id of each uploaded picture, keyed by the path passed to {@link #extract}. */
    private final java.util.Map<String, String> mediaIdsByPath = new java.util.HashMap<>();

    /** Media id of the most recently uploaded picture; used when a URI has no entry in the map. */
    private String lastMediaId;

    /**
     * Creates a manager that uploads pictures through the given API.
     *
     * @param staticmediaUploadApi the upload API used for every extracted picture
     */
    public CustomImageManager(MediaUploadApi  staticmediaUploadApi) {
        super(new File(""), ""); // the parent's base directory and sub-directory are not used here
        this.mediaUploadApi = staticmediaUploadApi;
    }

    /**
     * Writes the picture to a local file, uploads it and remembers the returned media id.
     *
     * @param imagePath path of the picture as reported by the converter
     * @param imageData raw picture bytes
     * @throws IOException      declared by the overridden method
     * @throws RuntimeException if copying the picture into the upload item fails
     */
    @Override
    public void extract(String imagePath, byte[] imageData) throws IOException {
        // NOTE(review): the local file written here is not removed afterwards — confirm whether it is still needed.
        File file = FileUtil.writeBytes(imageData, imagePath);
        try {
            // NOTE(review): the content type is always "image/png", whatever the real picture format is.
            DiskFileItem item = (DiskFileItem) new DiskFileItemFactory().createItem("file", "image/png", true, file.getName());
            // Close the item's output stream once the bytes are copied; Files.copy leaves it open.
            try (java.io.OutputStream itemOut = item.getOutputStream()) {
                Files.copy(Paths.get(file.getAbsolutePath()), itemOut);
            }
            MultipartFile multipartFile = new CommonsMultipartFile(item);
            MediaGetResponse upload = mediaUploadApi.upload(multipartFile, null, null, null, null, null, null, null, null);
            String mediaId = upload.getMedia().getMediaId();
            this.lastMediaId = mediaId;
            this.mediaIdsByPath.put(imagePath, mediaId);
            log.info("Uploaded image {} as media {}", imagePath, mediaId);
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Resolves a picture URI to the download URL of its uploaded media.
     *
     * <p>The id recorded for {@code uri} by {@link #extract} is used when present
     * (presumably the converter passes the same path to both methods — verify);
     * otherwise the id of the most recently uploaded picture is used.
     *
     * @param uri picture URI as reported by the converter
     * @return download URL of the uploaded picture
     */
    @Override
    public String resolve(String uri) {
        String mediaId = mediaIdsByPath.get(uri);
        if (mediaId == null) {
            mediaId = lastMediaId;
        }
        return DOWNLOAD_URL_PREFIX + mediaId;
    }
}

Springboot功能模块之 使用poi4.1.2 将word转换成html

步骤1：创建项目和依赖

步骤2：读取Word文档

步骤3：图片处理

Springboot功能模块之使用poi4.1.2 将word转换成html