使用Java实现HTML转PDF,主要用于将AI返回的HTML文本数据转换为PDF文件,方便用户下载查看。
一、deepSeek-AI提问词
基于以上个人数据,总结个人身体信息,分析个人身体指标信息。再以一个月为维度,详细列举一个月内训练计划,维度详细至每周每天,要求:不可省略表格内容以精简示例,文本结构顺序为标题个人信息,第一步,第二步。最终回答结果以标准的html形式返回结果,不能带有meta标签,字体为STSong-Light,SimSun,html内容禁止使用单标签。
二、表设计
sql
-- Task table for DeepSeek requests: one row per prompt, with its processing
-- status, retry counter, result/reasoning text and the URL of the result file.
CREATE TABLE `p_deep_seek_task` (
  `id` bigint(20) NOT NULL AUTO_INCREMENT COMMENT '任务id',
  `user_id` bigint(20) NOT NULL COMMENT '用户id',
  `status` char(2) NOT NULL DEFAULT '0' COMMENT '任务状态(0:待处理,1:处理中,2:处理成功,3:异常处理失败,4:重试中,5:重试失败)',
  `try_time` int(11) NOT NULL DEFAULT '0' COMMENT '执行次数',
  `result_url` varchar(255) DEFAULT NULL COMMENT '结果文件url',
  `prompt` longtext NOT NULL COMMENT '提问内容',
  `content` longtext COMMENT '结果内容',
  `reasoning_content` longtext COMMENT '思考过程',
  `create_time` datetime DEFAULT NULL COMMENT '创建时间',
  `create_by` bigint(20) DEFAULT NULL COMMENT '创建人',
  `update_by` bigint(20) DEFAULT NULL COMMENT '更新人',
  `update_time` datetime DEFAULT NULL COMMENT '更新时间',
  `task_time` date DEFAULT NULL COMMENT '任务日期',
  `execute_time` datetime DEFAULT NULL COMMENT '执行时间',
  `exception_msg` longtext COMMENT '异常信息',
  -- The trailing comma below was missing, which made the statement unparsable.
  `cost_time` bigint(20) NOT NULL DEFAULT '0' COMMENT '执行耗时(s)',
  PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=16 DEFAULT CHARSET=utf8mb4 COMMENT='deepseek任务';
三、导入Jar包
XML
<!-- DeepSeek Spring Boot starter (supplies the io.github.pigmesh.ai.deepseek classes used by DeepSeekAiClient).
     NOTE(review): no <version> is declared here, so it presumably comes from a parent POM / dependencyManagement - confirm. -->
<dependency>
<groupId>io.github.pig-mesh.ai</groupId>
<artifactId>deepseek-spring-boot-starter</artifactId>
</dependency>
<!-- Flying Saucer PDF renderer (supplies org.xhtmlrenderer.pdf.ITextRenderer used by HtmlToPdfTools). -->
<dependency>
<groupId>org.xhtmlrenderer</groupId>
<artifactId>flying-saucer-pdf</artifactId>
<version>9.1.22</version>
</dependency>
<!-- jsoup HTML parser (supplies the org.jsoup classes used by HtmlFormatter). -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.14.3</version>
</dependency>
四、工具类
ai请求工具
java
package com.company.project.service.client;
import io.github.pigmesh.ai.deepseek.config.DeepSeekProperties;
import io.github.pigmesh.ai.deepseek.core.DeepSeekClient;
import io.github.pigmesh.ai.deepseek.core.chat.ChatCompletionRequest;
import io.github.pigmesh.ai.deepseek.core.chat.ChatCompletionResponse;
import org.springframework.stereotype.Service;
import javax.annotation.Resource;
/**
 * Spring service that sends a prompt to DeepSeek and hands back the completion response.
 *
 * @author reshui
 * Created: 2025/3/31-14:48
 */
@Service
public class DeepSeekAiClient {

    /** Client used to issue chat-completion calls. */
    @Resource
    private DeepSeekClient deepSeekClient;

    /** Configuration properties; the model name is read from here. */
    @Resource
    private DeepSeekProperties deepSeekProperties;

    /**
     * Sends the prompt as a single user message and returns the result of executing the call.
     *
     * @param prompt text used as the user message
     * @return the response returned by executing the chat-completion call
     */
    public ChatCompletionResponse syncChat(String prompt) {
        return deepSeekClient.chatCompletion(buildRequest(prompt)).execute();
    }

    /** Builds the request; the model name comes from configuration so it can differ per channel. */
    private ChatCompletionRequest buildRequest(String prompt) {
        return ChatCompletionRequest.builder()
                .model(deepSeekProperties.getModel())
                .addUserMessage(prompt)
                .build();
    }
}
特定html字符内容过滤工具
java
package com.company.project.service.tools;
import cn.hutool.core.collection.CollUtil;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Cleans up HTML text before it is rendered: strips Markdown code fences and the doctype,
 * removes {@code <meta>} elements, replaces angle brackets inside leaf-element text and
 * normalises {@code <br>} variants to {@code <br/>}.
 *
 * <p>The steps are available both as a fluent chain ({@link #process(String)} followed by
 * {@link #beforeFilter()}, {@link #replaceTags()}, {@link #afterFilter()}, {@link #get()})
 * and as static methods.
 *
 * @author reshui
 * Created: 2025/04/17
 */
public class HtmlFormatter {

    /** Matches one HTML tag (opening, closing or self-closing) together with its attributes. */
    private static final Pattern TAG_PATTERN =
            Pattern.compile("</?\\w+((\\s+\\w+(\\s*=\\s*(?:\".*?\"|'.*?'|[^'\">\\s]+))?)+\\s*|\\s*)/?>");

    /** HTML being transformed by the fluent chain. */
    private String htmlContent;

    private HtmlFormatter(String htmlContent) {
        this.htmlContent = htmlContent;
    }

    /**
     * Entry point of the fluent chain.
     *
     * @param htmlContent HTML text to process
     * @return a new formatter holding {@code htmlContent}
     */
    public static HtmlFormatter process(String htmlContent) {
        return new HtmlFormatter(htmlContent);
    }

    /** Chain step: applies {@link #beforeFilterInterferenceCharacters(String)}. */
    public HtmlFormatter beforeFilter() {
        this.htmlContent = beforeFilterInterferenceCharacters(this.htmlContent);
        return this;
    }

    /** Chain step: applies {@link #replaceContentTag(String)}. */
    public HtmlFormatter replaceTags() {
        this.htmlContent = replaceContentTag(this.htmlContent);
        return this;
    }

    /** Chain step: applies {@link #afterFilterInterferenceCharacters(String)}. */
    public HtmlFormatter afterFilter() {
        this.htmlContent = afterFilterInterferenceCharacters(this.htmlContent);
        return this;
    }

    /** @return the HTML in its current state */
    public String get() {
        return this.htmlContent;
    }

    /**
     * Runs the three steps in order: pre-filter, content replacement, post-filter.
     *
     * @param htmlContent HTML text to process
     * @return the processed HTML
     */
    public static String formatHtml(String htmlContent) {
        String filteredHtml = beforeFilterInterferenceCharacters(htmlContent);
        String replacedHtml = replaceContentTag(filteredHtml);
        return afterFilterInterferenceCharacters(replacedHtml);
    }

    /**
     * Parses the HTML with jsoup, removes {@code <meta>} elements, rewrites leaf-element text
     * below {@code <body>} and serialises the document without pretty printing.
     *
     * @param html HTML text
     * @return the serialised document
     */
    public static String replaceContentTag(String html) {
        Document doc = Jsoup.parse(html);
        removeTag(doc);
        traverse(doc.body());
        doc.outputSettings().prettyPrint(false);
        return doc.html();
    }

    /**
     * Removes every {@code <meta>} element from the document.
     *
     * @param doc parsed jsoup document, modified in place
     */
    public static void removeTag(Document doc) {
        Elements metaElements = doc.getElementsByTag("meta");
        for (Element metaElement : metaElements) {
            metaElement.remove();
        }
    }

    /**
     * Walks the element tree depth-first. For every element without child elements the text is
     * re-written with {@code <} replaced by "小于" and {@code >} replaced by "大于".
     *
     * <p>Elements that hold data nodes (jsoup stores {@code <style>}/{@code <script>} content
     * that way) are left untouched: {@code text()} does not see data nodes, so re-writing the
     * text would silently erase their content.
     *
     * @param element root of the subtree to process, modified in place
     */
    public static void traverse(Element element) {
        Elements children = element.children();
        if (CollUtil.isEmpty(children)) {
            if (!element.dataNodes().isEmpty()) {
                return;
            }
            // NOTE(review): the text is re-set even when nothing was replaced; this also gives
            // void elements such as <br> a child node, which presumably is why the post-filter
            // handles "<br></br>" - confirm before making this conditional.
            String text = element.text()
                    .replace("<", "小于")
                    .replace(">", "大于");
            element.text(text);
            return;
        }
        for (Element child : children) {
            traverse(child);
        }
    }

    /**
     * Post-filter: normalises {@code <br></br>}, {@code <br>} and {@code </br>} to {@code <br/>}.
     *
     * @param html HTML text
     * @return HTML with the line-break variants replaced
     */
    public static String afterFilterInterferenceCharacters(String html) {
        return html
                .replace("<br></br>", "<br/>")
                .replace("<br>", "<br/>")
                .replace("</br>", "<br/>");
    }

    /**
     * Pre-filter: removes Markdown code fences and the HTML5 doctype declaration
     * (only the exact spellings listed below are matched).
     *
     * @param html HTML text
     * @return HTML without the fence markers and doctype
     */
    public static String beforeFilterInterferenceCharacters(String html) {
        return html
                .replace("```html", "")
                .replace("```", "")
                .replace("<!DOCTYPE html>", "")
                .replace("<!doctype html>", "");
    }

    /**
     * Lower-cases every HTML tag found in the string. The whole matched tag is lower-cased,
     * attribute values included; text outside tags is copied unchanged.
     *
     * @param html original HTML string
     * @return the HTML string with all tags lower-cased
     */
    public static String convertTagsToLowerCase(String html) {
        Matcher matcher = TAG_PATTERN.matcher(html);
        StringBuffer result = new StringBuffer(html.length());
        while (matcher.find()) {
            // Locale.ROOT keeps the result independent of the JVM default locale.
            String lowerCaseTag = matcher.group().toLowerCase(java.util.Locale.ROOT);
            // The tag is literal text: quote it so '$' and '\' are not read as group references.
            matcher.appendReplacement(result, Matcher.quoteReplacement(lowerCaseTag));
        }
        matcher.appendTail(result);
        return result.toString();
    }
}
html转pdf工具
java
package com.company.project.service.tools;
import cn.hutool.core.date.DateUtil;
import cn.hutool.core.io.FileUtil;
import com.company.project.common.utils.SpringUtils;
import com.company.project.service.properties.PdfFontProperties;
import lombok.extern.slf4j.Slf4j;
import org.xhtmlrenderer.pdf.ITextRenderer;
import java.io.File;
import java.io.OutputStream;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.Date;
/**
 * Converts HTML text into a PDF file written below the system temp directory.
 *
 * @author reshui
 * Created: 2025/3/31-15:55
 */
@Slf4j
public class HtmlToPdfTools {

    /** Base directory for temporary files ({@code java.io.tmpdir}). */
    private static final String TEMP_FILE_PATH = System.getProperty("java.io.tmpdir");

    /** Directory in which generated PDF files are stored. */
    private static final String PDF_FILE_PATH = TEMP_FILE_PATH + File.separator + "ai_train_pdf";

    /** Timestamp pattern used as the first part of the file name. */
    private static final String TIMESTAMP_FORMAT = "yyyyMMddHHmmss";

    /**
     * Font settings for the PDF renderer, looked up once when this class is initialised.
     * NOTE(review): presumably this requires a running Spring context - confirm.
     */
    public final static PdfFontProperties CONFIG = SpringUtils.getBean(PdfFontProperties.class);

    /**
     * Cleans the HTML with {@link HtmlFormatter} and renders it into a new PDF file.
     *
     * <p>The file name is the current timestamp plus a random suffix, so that two conversions
     * started within the same second do not write to the same file. If rendering fails, the
     * partially written file is deleted before the exception is rethrown.
     *
     * @param htmlContent HTML text to convert
     * @return the generated PDF file
     * @throws Exception if formatting or rendering fails
     */
    public static File convertHtmlToPdfFile(String htmlContent) throws Exception {
        // Format first: a failure here must not leave an empty file behind.
        String resultHtmlContent = HtmlFormatter.process(htmlContent)
                .beforeFilter()
                .replaceTags()
                .afterFilter()
                .get();

        String formatDateTimeStamp = DateUtil.format(new Date(), TIMESTAMP_FORMAT);
        String uniqueSuffix = java.util.UUID.randomUUID().toString().replace("-", "");
        String pdfFilePath = PDF_FILE_PATH + File.separator
                + formatDateTimeStamp + "_" + uniqueSuffix + ".pdf";
        FileUtil.touch(pdfFilePath);
        try {
            generatePdfReport(pdfFilePath, resultHtmlContent);
        } catch (Exception e) {
            try {
                Files.deleteIfExists(Paths.get(pdfFilePath));
            } catch (Exception cleanupFailure) {
                // Keep the rendering failure as the primary error.
                e.addSuppressed(cleanupFailure);
            }
            throw e;
        }
        log.info("pdf文件储存地址:{}", pdfFilePath);
        return new File(pdfFilePath);
    }

    /**
     * Renders the HTML into a PDF at the given path, registering the font described by
     * {@link #CONFIG} with the renderer first.
     *
     * @param outputPath  path of the PDF file to write
     * @param htmlContent HTML text to render
     * @throws Exception if the file cannot be opened or rendering fails
     */
    public static void generatePdfReport(String outputPath, String htmlContent) throws Exception {
        try (OutputStream outputStream = Files.newOutputStream(Paths.get(outputPath))) {
            ITextRenderer renderer = new ITextRenderer();
            renderer.getFontResolver().addFont(
                    CONFIG.getPath(),
                    CONFIG.getEncoding(),
                    CONFIG.getEmbedded()
            );
            renderer.setDocumentFromString(htmlContent);
            renderer.layout();
            renderer.createPDF(outputStream);
        }
    }

    /** Manual smoke test: converts an empty HTML string. */
    public static void main(String[] args) throws Exception {
        String html = "";
        convertHtmlToPdfFile(html);
    }
}
配置文件yml
yaml
# DeepSeek client configuration
deepseek:
  base-url: https://api.deepseek.com/v1
  api-key: xxxxxxxxxxxxx
  model: deepseek-reasoner
  connectTimeout: 60
  readTimeout: 240
  callTimeout: 360
# PDF font configuration (Windows)
pdf:
  font:
    path: c://Windows//Fonts//simsun.ttc
    encoding: Identity-H
    embedded: true
# PDF font configuration (Linux): use this INSTEAD of the Windows block above.
# A YAML document must not define the top-level `pdf` key twice, so only one
# of the two blocks may be active. The path is an example; point it at the
# location where the font file is actually installed.
# pdf:
#   font:
#     path: /usr/share/fonts/simsun.ttc
#     encoding: Identity-H
#     embedded: true