java中使用Jsoup和Itext实现将html转换为PDF

1.在build.gradle中安装所需依赖:

java 复制代码
implementation group: 'com.itextpdf', name: 'itextpdf', version: '5.5.13'
implementation group: 'com.itextpdf.tool', name: 'xmlworker', version: '5.5.13'
implementation group: 'org.jsoup', name: 'jsoup', version: '1.15.3'

2.创建工具类,实现转换方法

java 复制代码
/**
     * convert the html to pdf.
     */
    public void htmlToPdf(String oldFilePath, String newFilePath) throws IOException, com.itextpdf.text.DocumentException {
        Document doc = Jsoup.parse(new File(oldFilePath), "UTF-8");
        // jsoup标准化标签,生成闭合标签
        doc.outputSettings().syntax(org.jsoup.nodes.Document.OutputSettings.Syntax.xml);
        doc.outputSettings().escapeMode(Entities.EscapeMode.xhtml);

        com.itextpdf.text.Document document = new com.itextpdf.text.Document(PageSize.A4, 36, 36, 36, 36);
        PdfWriter pdfWriter = PdfWriter.getInstance(document, new FileOutputStream(newFilePath));
        document.open();
        //html to pdf, base64 image support.
        final TagProcessorFactory tagProcessorFactory = Tags.getHtmlTagProcessorFactory();
        tagProcessorFactory.removeProcessor(HTML.Tag.IMG);
        tagProcessorFactory.addProcessor(new ImageTagRefreshFilter(), HTML.Tag.IMG);
        
        //设置中文字体
        final CssFilesImpl cssFiles = new CssFilesImpl();
        cssFiles.add(XMLWorkerHelper.getInstance().getDefaultCSS());
        final StyleAttrCSSResolver cssResolver = new StyleAttrCSSResolver(cssFiles);
        final HtmlPipelineContext hpc = new HtmlPipelineContext(new CssAppliersImpl(asianFontRefreshFilter));
        hpc.setAcceptUnknown(true).autoBookmark(true).setTagFactory(tagProcessorFactory);
        final HtmlPipeline htmlPipeline = new HtmlPipeline(hpc, new PdfWriterPipeline(document, pdfWriter));
        final Pipeline<?> pipeline = new CssResolverPipeline(cssResolver, htmlPipeline);

        final XMLWorker worker = new XMLWorker(pipeline, true);
        final Charset charset = StandardCharsets.UTF_8;
        final XMLParser xmlParser = new XMLParser(true, worker, charset);
        InputStream inputStream = new ByteArrayInputStream(doc.html().getBytes());
        xmlParser.parse(inputStream, charset);
//        XMLWorkerHelper.getInstance().parseXHtml(pdfWriter, document, inputStream, Charset.forName("UTF-8"));
        document.close();
    }

3.base64过滤类:

java 复制代码
import com.itextpdf.text.Chunk;
import com.itextpdf.text.Element;
import com.itextpdf.text.Image;
import com.itextpdf.text.pdf.codec.Base64;
import com.itextpdf.tool.xml.NoCustomContextException;
import com.itextpdf.tool.xml.Tag;
import com.itextpdf.tool.xml.WorkerContext;
import com.itextpdf.tool.xml.exceptions.RuntimeWorkerException;
import com.itextpdf.tool.xml.html.HTML;
import com.itextpdf.tool.xml.pipeline.html.HtmlPipelineContext;

public class ImageTagRefreshFilter extends com.itextpdf.tool.xml.html.Image {

    /**
     * html to pdf, base64 image support.
     * */

    @Override
    public List<Element> end(final WorkerContext ctx, final Tag tag, final List<Element> currentContent) {
        final Map<String, String> attributes = tag.getAttributes();
        String src = attributes.get(HTML.Attribute.SRC);
        List<Element> elements = new ArrayList<Element>(1);
        if (null != src && src.length() > 0) {
            Image img = null;
            if (src.startsWith("data:image/")) {
                final String base64Data = src.substring(src.indexOf(",") + 1);
                try {
                    img = Image.getInstance(Base64.decode(base64Data));
                } catch (Exception e) {
                    throw new RuntimeException(e);
                }
                if (img != null) {
                    try {
                        final HtmlPipelineContext htmlPipelineContext = getHtmlPipelineContext(ctx);
                        elements.add(getCssAppliers().apply(new Chunk((com.itextpdf.text.Image) getCssAppliers().apply(img, tag, htmlPipelineContext), 0, 0, true), tag,
                                htmlPipelineContext));
                    } catch (NoCustomContextException e) {
                        throw new RuntimeWorkerException(e);
                    }
                }
            }

            if (img == null) {
                elements = super.end(ctx, tag, currentContent);
            }
        }
        return elements;
    }
}

4.字体类代码,window用户可在C:\windows\font\中寻找自己所需字体即可。我这里用的为黑体:

simhei.ttf

java 复制代码
import com.itextpdf.text.Font;
import com.itextpdf.text.pdf.BaseFont;
import com.itextpdf.tool.xml.XMLWorkerFontProvider;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Component;


@Component
public class AsianFontRefreshFilter extends XMLWorkerFontProvider {
    //此处写字体文件的绝对路径
    private String fontPath;

    @Override
    public Font getFont(String fontname, String encoding, float size, final int style) {
        try {
            //字体文件绝对路径
            BaseFont bfChinese = BaseFont.createFont(fontPath, BaseFont.IDENTITY_H, BaseFont.EMBEDDED);
            return new Font(bfChinese, size, style);
        } catch (Exception e) {
            e.printStackTrace();
        }
        return super.getFont(fontname, encoding, size, style);
    }
}

效果如下:

html页面预览:

pdf页面预览:

相关推荐
方也_arkling7 小时前
【Java-Day08】static / final / 枚举
java·开发语言
橙淮7 小时前
Spring Bean作用域与生命周期全解析
java·spring
Chengbei118 小时前
一站式源码安全检测工具、云安全 / APP / 小程序源码敏感信息递归多层目录扫描AK、JWT、手机号、身份证等敏感信息
java·开发语言·安全·web安全·网络安全·系统安全·安全架构
llz_1128 小时前
web-第一次课后作业
java·开发语言·idea
秋98 小时前
Java项目运行5天左右自动宕机:系统性定位与解决方案
java·开发语言·python
小江的记录本8 小时前
【JVM虚拟机】垃圾回收GC:垃圾收集器:CMS:核心原理、回收流程、优缺点、废弃原因(附《思维导图》+《面试高频考点清单》)
java·jvm·后端·python·spring·面试·maven
DIY源码阁8 小时前
JavaSwing学生成绩管理系统 - MySQL版
java·数据库·mysql·eclipse
basketball6169 小时前
C++ NULL 和 nullptr 区别 以及 nullptr 的核心实现
java·开发语言·c++
JAVA面经实录91710 小时前
MyBatis面试题库
java·mybatis
小江的记录本10 小时前
【JVM虚拟机】垃圾回收GC:垃圾回收算法:标记-清除、标记-复制、标记-整理、分代收集(附《思维导图》+《面试高频考点清单》)
java·jvm·后端·python·算法·安全·面试