最近做了一个富文本的需求:要求把Word文档的内容转换到富文本编辑器里,文档中的格式、样式、图片等都要保持一致地展示。过程中踩了不少坑——据说docx文档本质上是一个压缩包,这一点我不是特别清楚,但也能理解。下面的代码是自己借鉴参考后拼凑出来的,凑合着看,大佬勿喷。
啥都不说了,看代码吧。其中图片转Base64有两种方式:一种是直接读取图片的原始字节后编码;另一种是先用ImageIO把图片重新编码成PNG,再用JDK 8自带的java.util.Base64编码。后一种生成的结果比较大,同一张图片我实测两者相差200K左右,对大小有要求的可以换着用。
所需jar包的依赖见下面的pom配置:
java
<!-- Keep the versions of poi, poi-ooxml and poi-scratchpad identical -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi</artifactId>
<version>4.1.2</version>
</dependency>
<!-- Handles the legacy binary formats: doc, ppt, xls -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-scratchpad</artifactId>
<version>4.1.2</version>
</dependency>
<!-- Handles the OOXML formats: docx, pptx, xlsx -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
<version>4.1.2</version>
</dependency>
<!-- Provides the XWPF (docx) to XHTML converter classes -->
<dependency>
<groupId>fr.opensagres.xdocreport</groupId>
<artifactId>fr.opensagres.poi.xwpf.converter.xhtml</artifactId>
<version>2.0.2</version>
</dependency>
java
import fr.opensagres.poi.xwpf.converter.core.BasicURIResolver;
import fr.opensagres.poi.xwpf.converter.core.FileImageExtractor;
import fr.opensagres.poi.xwpf.converter.xhtml.XHTMLConverter;
import fr.opensagres.poi.xwpf.converter.xhtml.XHTMLOptions;
import lombok.extern.slf4j.Slf4j;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.converter.WordToHtmlConverter;
import org.apache.poi.hwpf.usermodel.PictureType;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.springframework.web.multipart.MultipartFile;
import org.w3c.dom.Document;
import sun.misc.BASE64Encoder;
import javax.imageio.ImageIO;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import java.awt.image.BufferedImage;
import java.io.*;
import java.nio.file.Files;
import java.util.Base64;
/**
* @author :Xiaoning Fan
* @date :Created in 2023-10-16 下午 3:49
* @description: 上传word文档并转换为html字符串返回,保持样式不变,图片替换为base64
* @version: 1.0
*/
@Slf4j
public class WordToHtmlStringConverter {
/**
* wordToHtml
*
* @return
* @throws IOException
* @throws ParserConfigurationException
* @throws TransformerException
*/
public static String wordToHtml(MultipartFile file) {
// 提取出word文档名称和后缀
String filename = file.getOriginalFilename();
try {
if (filename.endsWith(".docx")) {
// 将上传的文件传入Document转换
return new WordToHtmlStringConverter().docxToHtmlText(file);
} else if (filename.endsWith(".doc")) {
return new WordToHtmlStringConverter().docToHtmlText(file);
} else {
log.error("不支持的文件格式!");
return null;
}
} catch (FileNotFoundException e) {
log.error("文件找不到异常!");
e.printStackTrace();
} catch (IOException e) {
log.error("io转换异常!");
e.printStackTrace();
} catch (Exception e) {
log.error("文件转换异常!");
e.printStackTrace();
}
return null;
}
/**
* 上传Word文档,返回解析后的Html
*/
public static String docToHtmlText(MultipartFile file) throws Exception {
//使用字符数组流获取解析的内容
ByteArrayOutputStream baos = new ByteArrayOutputStream();
OutputStream outStream = new BufferedOutputStream(baos);
try {
//将上传的文件传入Document转换
HWPFDocument wordDocument = new HWPFDocument(file.getInputStream());
Document document = DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument();
WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(document);
//将读取到的图片上传并添加链接地址
wordToHtmlConverter.setPicturesManager((imageStream, pictureType, name, width, height) -> {
try {
//首先要判断图片是否能识别
if (pictureType.equals(PictureType.UNKNOWN)) {
return "[不能识别的图片]";
}
//此处转换图片文件为Base64
return Base64.getEncoder().encodeToString(imageStream).trim();
} catch (Exception e) {
log.info("upload exception", e);
}
return "[图片上传失败]";
});
// word文档转Html文档
wordToHtmlConverter.processDocument(wordDocument);
Document htmlDocument = wordToHtmlConverter.getDocument();
DOMSource domSource = new DOMSource(htmlDocument);
StreamResult streamResult = new StreamResult(outStream);
TransformerFactory factory = TransformerFactory.newInstance();
Transformer serializer = factory.newTransformer();
serializer.setOutputProperty(OutputKeys.ENCODING, "utf-8");
serializer.setOutputProperty(OutputKeys.INDENT, "yes");
serializer.setOutputProperty(OutputKeys.METHOD, "html");
serializer.transform(domSource, streamResult);
String content = baos.toString();
log.info("docToHtmlText--->{}", content);
return content;
} catch (Exception e) {
log.error("docToHtmlText 异常", e);
} finally {
baos.close();
outStream.close();
}
return null;
}
/**
* 上传docx文档,返回解析后的Html
*/
public static String docxToHtmlText(MultipartFile file) throws Exception {
ByteArrayOutputStream htmlStream = new ByteArrayOutputStream();
ByteArrayOutputStream htmlImg = new ByteArrayOutputStream();
String htmlStr = null;
try {
// 将上传的文件传入Document转换
XWPFDocument docxDocument = new XWPFDocument(file.getInputStream());
XHTMLOptions options = XHTMLOptions.create();
// 设置图片存储路径
String path = System.getProperty("java.io.tmpdir");
String firstImagePathStr = path + "/" + System.currentTimeMillis();
options.setExtractor(new FileImageExtractor(new File(firstImagePathStr)));
options.URIResolver(new BasicURIResolver(firstImagePathStr));
// 转换html
docxDocument.createNumbering();
XHTMLConverter.getInstance().convert(docxDocument, htmlStream, options);
htmlStr = htmlStream.toString();
String middleImageDirStr = "/word/media";
String imageDirStr = firstImagePathStr + middleImageDirStr;
File imageDir = new File(imageDirStr);
String[] imageList = imageDir.list();
if (imageList != null) {
for (int i = 0; i < imageList.length; i++) {
try {
String oneImagePathStr = imageDirStr + "/" + imageList[i];
File fileImage = new File(oneImagePathStr);
if (fileImage.exists()) {
log.info("处理图片开始。。。。。。。。");
// 处理图片成为Base64格式
// 读取图片字节数组
InputStream in = new FileInputStream(fileImage);
byte[] data = new byte[in.available()];
in.read(data);
String encode = new BASE64Encoder().encode(data);
log.info("处理图片结束。。。。。。。" + encode);
//修改文档中的图片信息
htmlStr = htmlStr.replace(oneImagePathStr, "data:image/png;base64,"+encode);
/* BufferedImage bi = ImageIO.read(fileImage);// 图片存储大小比较大
ByteArrayOutputStream baos = new ByteArrayOutputStream();
ImageIO.write(bi, "png", baos);
byte[] bytes = baos.toByteArray();
String sd = Base64.getEncoder().encodeToString(bytes).trim();
log.info("处理图片结束。。。。。。。" + sd);
htmlStr = htmlStr.replace(oneImagePathStr, "data:image/png;base64,"+sd);*/
}
} catch (Exception e) {
log.info("upload docxToHtmlText exception", e);
}
}
}
log.info("处理结果:{}", htmlStr);
} catch (Exception e) {
log.error("docxToHtmlText 解析异常", e);
} finally {
if (htmlStream != null) {
htmlStream.close();
}
return htmlStr;
}
}
}
直接引用就行。但有一点一定要注意:如果接口是把HTML字符串直接返回给页面,Controller方法上要加
@ResponseBody(或者直接用@RestController),否则返回的字符串会被当成视图名去解析,那就悲剧了;当然,如果是直接存库,那就无所谓了。
这次就先这样,自娱自乐,手下留情勿喷!!