功能需求
将html字符串保存为.mhtml文件
代码实现
- pom.xml依赖
XML
<dependencies>
<!-- NOTE(review): no <version> is declared here; presumably it is managed by a Spring Boot parent/BOM. Confirm. -->
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-web</artifactId>
</dependency>
<!-- https://mvnrepository.com/artifact/cn.hutool/hutool-all -->
<!-- NOTE(review): presumably the source of the HttpRequest/HttpResponse types used by WikiUtils. Confirm against the imports. -->
<dependency>
<groupId>cn.hutool</groupId>
<artifactId>hutool-all</artifactId>
<version>5.8.43</version>
</dependency>
<!-- Jsoup: parses the HTML and extracts image/stylesheet/script references (required) -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.17.2</version>
</dependency>
<!-- Apache utilities: Base64 encoding of downloaded resources (commons-codec) and IO stream handling (commons-io); required -->
<!-- Source: https://mvnrepository.com/artifact/commons-codec/commons-codec -->
<dependency>
<groupId>commons-codec</groupId>
<artifactId>commons-codec</artifactId>
<version>1.15</version>
<scope>compile</scope>
</dependency>
<!-- Source: https://mvnrepository.com/artifact/commons-io/commons-io -->
<dependency>
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>
<version>2.15.1</version>
<scope>compile</scope>
</dependency>
<!-- Source: https://mvnrepository.com/artifact/org.projectlombok/lombok -->
<!-- Lombok supplies the @Slf4j annotation used by the classes below. -->
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
<version>1.18.38</version>
<scope>compile</scope>
</dependency>
</dependencies>
- 通过访问url获取html字符串内容的工具类
java
@Slf4j
public class WikiUtils {

    /**
     * Fetches the HTML of a wiki (Confluence) page.
     *
     * <p>A 403 or 302 status is reported as an invalid cookie / missing permission;
     * any other non-OK status is reported as a generic failure.
     *
     * @param url    address of the page to fetch
     * @param cookie value sent in the {@code Cookie} request header
     * @return the response body when the request succeeds, otherwise an empty string
     */
    public static String getConfluencePageHtml(String url, String cookie) {
        // The helper lives in this class; the original call site referenced it through
        // an unrelated "HttpClient" name.
        HttpResponse httpResponse = httpGetResponse(url, cookie);
        if (httpResponse.isOk()) {
            String body = httpResponse.body();
            // Callers test the result with isEmpty(), so never hand back null.
            return body == null ? "" : body;
        }
        int status = httpResponse.getStatus();
        if (status == 403 || status == 302) {
            log.error("无效的cookie,无权限访问");
        } else {
            log.error("获取html页面失败");
        }
        return "";
    }

    /**
     * Issues a GET request that carries the given cookie so that the login
     * interception of the target site is bypassed.
     *
     * @param url    address to request
     * @param cookie value sent in the {@code Cookie} request header
     * @return the executed response
     */
    public static HttpResponse httpGetResponse(String url, String cookie) {
        Map<String, String> headers = new HashMap<>();
        headers.put("Cookie", cookie);
        return HttpRequest.get(url).headerMap(headers, true).execute();
    }
}
- Html转换.mhtml核心类
java
@Slf4j
public class Html2MHTCompiler {
/**
 * Derives a page title from the {@code <title>} element of the given HTML.
 *
 * <p>When the title text contains a {@code -} after its first character, only the
 * part before the first {@code -} is kept. The result is trimmed.
 *
 * @param html HTML document text
 * @return the extracted title, or {@code null} when there is no {@code <title>} element
 */
public static String parseTittle(String html) {
    Element title = Jsoup.parse(html).selectFirst("title");
    if (title == null) {
        return null;
    }
    String fullTitle = title.text();
    int dashPos = fullTitle.indexOf("-");
    String wanted = dashPos > 0 ? fullTitle.substring(0, dashPos) : fullTitle;
    return wanted.trim();
}
/**
 * Collects the images, external stylesheets and external scripts referenced by the
 * page and downloads each of them.
 *
 * @param cookie  cookie forwarded to every resource download
 * @param html    HTML document text
 * @param baseUrl root URL used to resolve non-absolute references
 * @return map from the reference exactly as written in the HTML attribute to the
 *         downloaded resource encoded as a Base64 data URL (with MIME header)
 */
public static Map<String, String> parseHtmlPage(String cookie, String html, String baseUrl) {
    Map<String, String> resourceMap = new HashMap<>();
    Document doc = Jsoup.parse(html);
    // One row per resource kind: { CSS query, attribute holding the URL, resource type }.
    String[][] resourceKinds = {
        {"img[src]", "src", "image"},
        {"link[rel=stylesheet][href]", "href", "CSS"},
        {"script[src]", "src", "javascript"},
    };
    for (String[] kind : resourceKinds) {
        for (Element element : doc.select(kind[0])) {
            String reference = element.attr(kind[1]);
            // The same URL is often referenced many times (icons, shared scripts);
            // download it only once.
            if (resourceMap.containsKey(reference)) {
                continue;
            }
            parseResource(cookie, reference, kind[2], baseUrl, resourceMap);
        }
    }
    return resourceMap;
}
/**
 * Strips page chrome that should not end up in the exported document.
 *
 * <p>Removed: the style rules for {@code .ia-splitter-left} and {@code #main} inside
 * {@code <head>} style tags; elements with class
 * {@code "acs-side-bar ia-scrollable-section"} or {@code "ia-splitter-left"}; the
 * elements with id {@code footer}, {@code header}, {@code navigation},
 * {@code comments-section}, {@code page-metadata-banner} and
 * {@code breadcrumb-section}; and the inline {@code margin-left: 285px;} declaration
 * on {@code #main} (other inline styles are kept).
 * {@code likes-and-labels-container} is intentionally left in place.
 *
 * <p>Elements that do not exist in the page are simply skipped.
 *
 * @param html HTML document text
 * @return the cleaned document serialised back to HTML
 */
public static String removeUnwantedElements(String html) {
    Document doc = Jsoup.parse(html);

    removeCssSelectorFromStyleTag(doc, ".ia-splitter-left");
    removeCssSelectorFromStyleTag(doc, "#main");

    // The sidebar carries both classes on one element (compound selector); the
    // descendant form is kept as well so previously matched markup is still removed.
    doc.select(".acs-side-bar.ia-scrollable-section, .acs-side-bar .ia-scrollable-section").remove();
    doc.select(".ia-splitter-left").remove();

    String[] unwantedIds = {
        "footer", "header", "navigation",
        "comments-section", "page-metadata-banner", "breadcrumb-section",
    };
    for (String id : unwantedIds) {
        Element element = doc.getElementById(id);
        // getElementById returns null for a missing id; not every page has every section.
        if (element != null) {
            element.remove();
        }
    }

    Element mainElement = doc.getElementById("main");
    if (mainElement != null && mainElement.hasAttr("style")) {
        String newStyle = mainElement.attr("style").replace("margin-left: 285px;", "").trim();
        if (newStyle.isEmpty()) {
            // Avoid leaving an empty style="" attribute behind.
            mainElement.removeAttr("style");
        } else {
            mainElement.attr("style", newStyle);
        }
    }
    return doc.html();
}
/**
 * Removes every {@code selector { ... }} rule block from the {@code <style>} tags under
 * {@code <head>} and collapses whitespace in the remaining CSS.
 *
 * <p>The selector is matched literally and only when it is not the tail of a longer
 * selector token (for example {@code div#main} or {@code .x.ia-splitter-left} are left
 * untouched). This is a textual match, not a CSS parser: rules whose block contains a
 * nested {@code }} or selector lists are not handled specially.
 *
 * @param doc      parsed document whose style tags are rewritten in place
 * @param selector CSS selector to delete, e.g. {@code .ia-splitter-left} or {@code #main}
 */
private static void removeCssSelectorFromStyleTag(Document doc, String selector) {
    // An empty selector would turn the pattern into "any { ... } block" and wipe all rules.
    if (selector == null || selector.isEmpty()) {
        return;
    }
    // Quote the selector: characters such as '.' are regex metacharacters.
    java.util.regex.Pattern rulePattern = java.util.regex.Pattern.compile(
            "(?<![\\w.#-])" + java.util.regex.Pattern.quote(selector) + "\\s*\\{[^}]*\\}");

    for (Element styleTag : doc.head().select("style")) {
        String cssContent = styleTag.html();
        if (cssContent.isEmpty()) {
            continue;
        }
        String newCssContent = rulePattern.matcher(cssContent).replaceAll("").trim();
        // Collapse every whitespace run (including newlines) into a single space.
        newCssContent = newCssContent.replaceAll("\\s+", " ");
        styleTag.html(newCssContent);
    }
}
/**
 * Downloads one referenced resource (image, CSS or JS) and records it in
 * {@code resourceMap} under the reference exactly as it appears in the HTML.
 *
 * <p>Nothing is recorded when the reference is blank, is already a {@code data:} URI,
 * or the download fails; a failure is logged and the resource is skipped.
 *
 * @param cookie       cookie forwarded to the download
 * @param resourceSrc  reference as written in the HTML attribute
 * @param resourceType resource kind passed on to the downloader
 * @param baseUrl      root URL used to resolve non-absolute references
 * @param resourceMap  receives {@code resourceSrc -> Base64 data URL}
 */
private static void parseResource(String cookie, String resourceSrc, String resourceType, String baseUrl, Map<String, String> resourceMap) {
    // A blank key would later be used as a replace target and corrupt the whole document.
    if (resourceSrc == null || resourceSrc.trim().isEmpty()) {
        return;
    }
    // Already embedded; there is nothing to download.
    if (resourceSrc.trim().regionMatches(true, 0, "data:", 0, 5)) {
        return;
    }
    try {
        String fullResourceUrl = getFullUrl(baseUrl, resourceSrc.trim());
        String base64Resource = downloadResourceToBase64(fullResourceUrl, resourceType, cookie);
        if (base64Resource == null) {
            // Never store null: the embedding step would fail on it.
            log.warn("资源下载失败,跳过该资源:" + resourceSrc);
            return;
        }
        resourceMap.put(resourceSrc, base64Resource);
    } catch (Exception e) {
        log.error("资源解析失败,跳过该资源:" + resourceSrc, e);
    }
}
/**
 * Resolves a resource reference against the site root.
 *
 * <ul>
 *   <li>{@code http://} / {@code https://} references (any letter case) are returned as is;</li>
 *   <li>protocol-relative references ({@code //host/path}) inherit the scheme of
 *       {@code baseUrl}, falling back to {@code https} when it has none;</li>
 *   <li>everything else is appended to {@code baseUrl} with exactly one {@code /}
 *       between the two parts.</li>
 * </ul>
 *
 * @param baseUrl site root, with or without a trailing slash
 * @param src     reference taken from the HTML
 * @return the absolute URL to download
 */
private static String getFullUrl(String baseUrl, String src) {
    String ref = src.trim();
    if (ref.regionMatches(true, 0, "http://", 0, 7) || ref.regionMatches(true, 0, "https://", 0, 8)) {
        return ref;
    }
    String base = baseUrl == null ? "" : baseUrl.trim();
    if (ref.startsWith("//")) {
        int schemeEnd = base.indexOf("://");
        String scheme = schemeEnd > 0 ? base.substring(0, schemeEnd) : "https";
        return scheme + ":" + ref;
    }
    // Drop trailing slashes so that "http://host/" + "/x" does not become "http://host//x".
    while (base.endsWith("/")) {
        base = base.substring(0, base.length() - 1);
    }
    return ref.startsWith("/") ? base + ref : base + "/" + ref;
}
/**
 * Downloads a resource (image, CSS or JS) and returns it as a Base64 data URL.
 *
 * <p>Images are passed through {@code compressImage} with quality 0.7 before encoding;
 * if recompression fails the downloaded bytes are embedded unchanged.
 *
 * @param resourceUrl  absolute URL of the resource
 * @param resourceType resource kind; {@code "image"} (any case) enables recompression
 * @param cookie       value for the {@code Cookie} request header; skipped when null or empty
 * @return {@code data:<mime>;base64,<payload>}, or {@code null} when the server does not
 *         answer with HTTP 200
 * @throws Exception when the connection or the transfer fails
 */
private static String downloadResourceToBase64(String resourceUrl, String resourceType, String cookie) throws Exception {
    HttpURLConnection conn = (HttpURLConnection) new URL(resourceUrl).openConnection();
    try {
        conn.setConnectTimeout(5000);
        conn.setReadTimeout(5000);
        conn.setRequestMethod("GET");
        if (cookie != null && !cookie.isEmpty()) {
            conn.setRequestProperty("Cookie", cookie);
        }
        // Browser-like headers for sites that reject unknown clients.
        conn.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/120.0.0.0");
        conn.setRequestProperty("Connection", "keep-alive");
        conn.setRequestProperty("Accept", "*/*");
        // No Accept-Encoding header: HttpURLConnection does not decode compressed bodies
        // by itself, so advertising gzip/deflate could yield undecoded payloads.

        if (conn.getResponseCode() != 200) {
            return null;
        }

        byte[] resourceBytes;
        try (InputStream raw = conn.getInputStream();
             // Some servers compress regardless of the request; decode gzip if announced.
             InputStream in = "gzip".equalsIgnoreCase(conn.getContentEncoding())
                     ? new java.util.zip.GZIPInputStream(raw)
                     : raw;
             ByteArrayOutputStream out = new ByteArrayOutputStream()) {
            byte[] buffer = new byte[8192];
            int len;
            while ((len = in.read(buffer)) != -1) {
                out.write(buffer, 0, len);
            }
            resourceBytes = out.toByteArray();
        }

        if ("image".equalsIgnoreCase(resourceType) && resourceBytes.length > 0) {
            try {
                resourceBytes = compressImage(resourceBytes, 0.7f); // 0.7 = compression quality
            } catch (Exception ignored) {
                // Recompression is only an optimisation; keep the downloaded bytes.
            }
        }

        String mimeType = conn.getContentType();
        if (mimeType == null || mimeType.trim().isEmpty()) {
            // Without a Content-Type the data URL would read "data:null;...".
            if ("image".equalsIgnoreCase(resourceType)) {
                mimeType = "image/png";
            } else if ("css".equalsIgnoreCase(resourceType)) {
                mimeType = "text/css";
            } else if ("javascript".equalsIgnoreCase(resourceType)) {
                mimeType = "application/javascript";
            } else {
                mimeType = "application/octet-stream";
            }
        }
        // "text/css; charset=UTF-8" -> "text/css;charset=UTF-8": no blanks inside the URL.
        mimeType = mimeType.replaceAll("\\s+", "");

        return "data:" + mimeType + ";base64," + Base64.encodeBase64String(resourceBytes);
    } finally {
        conn.disconnect();
    }
}
/**
 * Re-encodes an image in its own format with the requested compression quality,
 * keeping the pixel dimensions.
 *
 * <p>The original bytes are returned unchanged when the data cannot be decoded, when
 * the image is a GIF (only the first frame would be decoded, so animations would be
 * lost), when no writer exists for the format, when encoding fails, or when the
 * re-encoded result is not smaller than the input.
 *
 * @param imageBytes original image bytes
 * @param quality    compression quality, clamped to 0.1 - 1.0 (higher = better quality,
 *                   larger output); only applied when the format's writer supports it
 * @return the smaller of the re-encoded and the original bytes
 * @throws Exception when the image data cannot be read
 */
private static byte[] compressImage(byte[] imageBytes, float quality) throws Exception {
    if (quality < 0.1f) quality = 0.1f;
    if (quality > 1.0f) quality = 1.0f;

    BufferedImage bufferedImage = ImageIO.read(new ByteArrayInputStream(imageBytes));
    if (bufferedImage == null) {
        return imageBytes; // not a decodable image
    }

    String format = getImageFormat(imageBytes);
    if (format == null) {
        format = "jpeg";
    }
    if ("gif".equalsIgnoreCase(format)) {
        return imageBytes;
    }

    Iterator<javax.imageio.ImageWriter> writers = ImageIO.getImageWritersByFormatName(format);
    if (!writers.hasNext()) {
        return imageBytes;
    }
    javax.imageio.ImageWriter writer = writers.next();
    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    try (MemoryCacheImageOutputStream ios = new MemoryCacheImageOutputStream(baos)) {
        writer.setOutput(ios);
        javax.imageio.ImageWriteParam param = writer.getDefaultWriteParam();
        if (param.canWriteCompressed()) {
            param.setCompressionMode(javax.imageio.ImageWriteParam.MODE_EXPLICIT);
            // A compression type must be selected before the quality can be set.
            if (param.getCompressionType() == null) {
                String[] types = param.getCompressionTypes();
                if (types != null && types.length > 0) {
                    param.setCompressionType(types[0]);
                }
            }
            param.setCompressionQuality(quality);
        }
        writer.write(null, new javax.imageio.IIOImage(bufferedImage, null, null), param);
        // The memory cache only reaches baos once it is flushed.
        ios.flush();
    } catch (Exception ignored) {
        // Best effort: an image that cannot be re-encoded is embedded as downloaded.
        return imageBytes;
    } finally {
        writer.dispose();
    }

    byte[] compressedBytes = baos.toByteArray();
    boolean smaller = compressedBytes.length > 0 && compressedBytes.length < imageBytes.length;
    return smaller ? compressedBytes : imageBytes;
}
/**
 * Detects the actual format of an image from its bytes (not from a file name).
 *
 * @param imageBytes raw image data; may be null or empty
 * @return the format name reported by the first matching {@code ImageReader}
 *         (for example {@code png}), or {@code null} when no reader recognises the data
 * @throws Exception when the data cannot be inspected
 */
private static String getImageFormat(byte[] imageBytes) throws Exception {
    if (imageBytes == null || imageBytes.length == 0) {
        return null;
    }
    // try-with-resources: the image input stream is released even if a reader throws.
    try (ImageInputStream iis = ImageIO.createImageInputStream(new ByteArrayInputStream(imageBytes))) {
        if (iis == null) {
            return null;
        }
        Iterator<ImageReader> readers = ImageIO.getImageReaders(iis);
        if (!readers.hasNext()) {
            return null;
        }
        ImageReader reader = readers.next();
        try {
            return reader.getFormatName();
        } finally {
            reader.dispose();
        }
    }
}
/**
 * Replaces every resource URL in the HTML with its embedded (Base64 data URL) form.
 *
 * <p>Each key is matched both literally and with {@code &} written as {@code &amp;},
 * because serialised HTML escapes ampersands inside attribute values. All keys are
 * replaced in a single pass, longest first, so a URL that is a prefix of another one
 * (or that happens to occur inside already inserted Base64 text) cannot corrupt the
 * result. Entries with a null/empty key or a null value are ignored.
 *
 * @param html      HTML text to rewrite
 * @param resources map of original URL to replacement data URL
 * @return the rewritten HTML; {@code html} itself when there is nothing to replace
 */
public static String embedResources(String html, Map<String, String> resources) {
    if (html == null || html.isEmpty() || resources == null || resources.isEmpty()) {
        return html;
    }

    Map<String, String> replacements = new java.util.HashMap<>();
    for (Map.Entry<String, String> entry : resources.entrySet()) {
        String resourceUrl = entry.getKey();
        String embeddedUrl = entry.getValue();
        if (resourceUrl == null || resourceUrl.isEmpty() || embeddedUrl == null) {
            continue;
        }
        replacements.put(resourceUrl, embeddedUrl);
        replacements.putIfAbsent(resourceUrl.replace("&", "&amp;"), embeddedUrl);
    }
    if (replacements.isEmpty()) {
        return html;
    }

    // Longest alternative first: regex alternation takes the first branch that matches.
    java.util.List<String> targets = new java.util.ArrayList<>(replacements.keySet());
    targets.sort((a, b) -> Integer.compare(b.length(), a.length()));
    StringBuilder alternation = new StringBuilder();
    for (String target : targets) {
        if (alternation.length() > 0) {
            alternation.append('|');
        }
        alternation.append(java.util.regex.Pattern.quote(target));
    }

    java.util.regex.Matcher matcher =
            java.util.regex.Pattern.compile(alternation.toString()).matcher(html);
    StringBuffer embeddedHtml = new StringBuffer(html.length());
    while (matcher.find()) {
        String embeddedUrl = replacements.get(matcher.group());
        matcher.appendReplacement(embeddedHtml, java.util.regex.Matcher.quoteReplacement(embeddedUrl));
    }
    matcher.appendTail(embeddedHtml);
    return embeddedHtml.toString();
}
/**
 * Writes the HTML as a single-part MHTML ({@code multipart/related}) file encoded in UTF-8.
 *
 * <p>Lines of the MIME envelope are terminated with CRLF on every platform, as the MIME
 * format requires. I/O failures are logged, not thrown.
 *
 * @param html     HTML content with all resources already embedded
 * @param filePath destination file path
 */
public static void saveAsMhtml(String html, String filePath) {
    if (html == null) {
        log.error("保存MHTML文件失败:" + filePath);
        return;
    }
    // BufferedWriter.newLine() follows the platform separator; MIME needs CRLF.
    final String crlf = "\r\n";
    try (BufferedWriter writer = new BufferedWriter(
            new OutputStreamWriter(new FileOutputStream(filePath), StandardCharsets.UTF_8)
    )) {
        // MHTML envelope header
        writer.write("MIME-Version: 1.0" + crlf);
        writer.write("Content-Type: multipart/related; boundary=\"boundary\"" + crlf);
        writer.write(crlf);
        // Opening boundary and headers of the HTML part
        writer.write("--boundary" + crlf);
        writer.write("Content-Type: text/html; charset=UTF-8" + crlf);
        writer.write(crlf);
        // The HTML with every resource embedded
        writer.write(html);
        writer.write(crlf);
        writer.write(crlf);
        // Closing boundary (required, otherwise the file is incomplete)
        writer.write("--boundary--");
        writer.flush();
    } catch (IOException e) {
        log.error("保存MHTML文件失败:" + filePath, e);
    }
}
逻辑调用:
- 通过url和cookie免密获取html字符串
- 将html中引用的图片、CSS、JS下载并转成base64字符串,因为.mhtml文件中以外部链接形式引用的资源无法渲染
- 删除html中不需要的布局和内容
- 使用第2步中获取的图片、CSS、JS的base64字符串,替换html字符串中对应的资源链接
- 保存为.mhtml文件
java
// 1. Fetch the page HTML with the login cookie.
String html = WikiUtils.getConfluencePageHtml(link, cookie);
if (html == null || html.isEmpty()) {
    log.error("获取html页面失败");
    return;
}
// 2. Download the images / CSS / JS referenced by the page as Base64 data URLs.
Map<String, String> htmlMap = Html2MHTCompiler.parseHtmlPage(cookie, html, properties.baseURL);
String tittle = Html2MHTCompiler.parseTittle(html);
// The title becomes the file name: fall back when it is missing and replace
// characters that are not allowed in file names.
if (tittle == null || tittle.trim().isEmpty()) {
    tittle = "untitled";
}
tittle = tittle.replaceAll("[\\\\/:*?\"<>|\\p{Cntrl}]", "_").trim();
// 3. Drop layout and content that should not be exported.
String html2 = Html2MHTCompiler.removeUnwantedElements(html);
// 4. Swap the resource links for the embedded data URLs.
String parseHtml = Html2MHTCompiler.embedResources(html2, htmlMap);
// 5. Save the result as an .mhtml file.
Html2MHTCompiler.saveAsMhtml(parseHtml, currentDir + File.separator + tittle + ".mhtml");