java 读取pdf文件内容

一、引入 Maven 依赖

xml 复制代码
<dependency>
    <groupId>org.apache.pdfbox</groupId>
    <artifactId>pdfbox</artifactId>
    <version>3.0.0-alpha2</version>
</dependency>

二、代码工具类

java 复制代码
package com.jiayou.peis.utils;

//import com.itextpdf.text.pdf.PdfReader;
//import com.itextpdf.text.pdf.parser.PdfTextExtractor;
//import com.itextpdf.text.pdf.parser.SimpleTextExtractionStrategy;

import com.google.common.collect.Lists;
import com.jiayou.peis.entity.ImageObject;
import org.apache.commons.io.FileUtils;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.pdmodel.common.PDStream;
import org.apache.pdfbox.pdmodel.graphics.PDXObject;
import org.apache.pdfbox.pdmodel.graphics.image.PDImage;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
import org.apache.pdfbox.text.PDFTextStripper;

import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import java.io.*;
import java.util.ArrayList;
import java.util.List;

/**
 * PDF处理
 *
 * @author Bob Ren (Copyright © 2015-2029 贵州家有在线网络有限公司)
 * @version 1.0.0
 * @date 2022-02-07 16:21
 */
public class PdfUtils {
    //    /**
//     * 使用itextpdf提取PDF文本(解析不靠谱)
//     *
//     * @param inputStream
//     * @return
//     * @throws IOException
//     */
//    @Deprecated
//    public static String toText(InputStream inputStream) throws IOException {
//        try {
//            StringBuilder buf = new StringBuilder();
//            PdfReader reader = new PdfReader(inputStream);
//            int pageNum = reader.getNumberOfPages();
//            for(int i=1;i<=pageNum;i++){
//                // 读取第i页的文档内容
//                buf.append(PdfTextExtractor.getTextFromPage(reader, i, new SimpleTextExtractionStrategy()));
//            }
            return buf.toString();
//            return StrUtils.removeReturnChar(buf.toString());
//        } finally {
//            CloseUtils.closeQuietly(inputStream);
//        }
//    }
    public static String text(byte[] data) throws IOException {
        return PdfUtils.text(data, true);
    }
    public static String text(byte[] data, boolean sortByPosition) throws IOException {
        ByteArrayInputStream inputStream = new ByteArrayInputStream(data);
        return PdfUtils.text(inputStream, sortByPosition);
    }
    /**
     * 使用pdfbox提取PDF文本(解析正常,可使用)
     *
     * @param file
     * @return
     * @throws IOException
     */
    public static String text(File file, boolean sortByPosition) throws IOException {
        InputStream inputStream = new FileInputStream(file);
        return PdfUtils.text(inputStream, sortByPosition);
    }
    public static String text(File file) throws IOException {
        return PdfUtils.text(file, true);
    }
    public static String text(InputStream inputStream) throws IOException {
        return text(inputStream, true);
    }
    /**
     * 使用pdfbox提取PDF文本(解析正常,可使用)
     *
     * @param inputStream
     * @return
     * @throws IOException
     */
    public static String text(InputStream inputStream, boolean sortByPosition) throws IOException {
        PDDocument document = null;
        try {
//            document = PDDocument.load(inputStream);
            document = Loader.loadPDF(inputStream);
            PDFTextStripper textStripper = new PDFTextStripper();
            // Get total page count of the PDF document
            int numberOfPages = document.getNumberOfPages();
            //set the first page to be extracted
            textStripper.setStartPage(1);
            // set the last page to be extracted
            textStripper.setEndPage(numberOfPages);
            // 获取文本内容
            textStripper.setSortByPosition(sortByPosition);
            textStripper.setShouldSeparateByBeads(true);
            return StrUtils.removeReturnChar(textStripper.getText(document));
        } finally {
            CloseUtils.closeQuietly(document, inputStream);
        }
    }

    /**
     * 使用pdfbox提取PDF文本(解析正常,可使用)
     *
     * @param file
     * @return
     * @throws IOException
     */
    public static List<ImageObject> images(File file) throws IOException {
        InputStream inputStream = new FileInputStream(file);
        return PdfUtils.images(inputStream);
    }

    public static List<ImageObject> images(byte[] data) throws IOException {
        ByteArrayInputStream inputStream = null;
        try {
            inputStream = new ByteArrayInputStream(data);
            return PdfUtils.images(inputStream);
        } finally {
            CloseUtils.closeQuietly(inputStream);
        }
    }

    /**
     * 使用pdfbox提取PDF图片列表
     *
     * @param inputStream
     * @return
     * @throws IOException
     */
    public static List<ImageObject> images(InputStream inputStream) throws IOException {
        List<ImageObject> imageList = Lists.newArrayList();
        PDDocument document = null;
        try {
//            document = PDDocument.load(inputStream);
            document = Loader.loadPDF(inputStream);
            // get resources for a page
            PDResources pdResources = document.getPage(0).getResources();
            int i = 0;
            for (COSName csName : pdResources.getXObjectNames()) {
//                System.out.println(i+":"+csName);
                PDXObject pdxObject = pdResources.getXObject(csName);
                if (pdxObject instanceof PDImageXObject) {
//                    i++;
                    PDStream pdStream = pdxObject.getStream();
                    PDImageXObject image = new PDImageXObject(pdStream, pdResources);
                    String imageSuffix = imageSuffix(image);
                    // image storage location and image name
                    BufferedImage bufferedImage = image.getImage();
                    ImageObject object = new ImageObject();
                    object.setIndex(i++);
                    object.setImage(bufferedImage);
                    object.setSuffix(imageSuffix);
                    imageList.add(object);
                }
            }
        } finally {
            CloseUtils.closeQuietly(document, inputStream);
        }
        return imageList;
    }

    /**
     * 获取图片后缀
     *
     * @param pdImage
     * @return
     * @throws IOException
     */
    private static String imageSuffix(PDImageXObject pdImage) throws IOException {
        String suffix = pdImage.getSuffix();
        if (suffix == null || "jb2".equals(suffix)) {
            suffix = "png";
        } else if ("jpx".equals(suffix)) {
            // use jp2 suffix for file because jpx not known by windows
            suffix = "jp2";
        }

        if (hasMasks(pdImage)) {
            // TIKA-3040, PDFBOX-4771: can't save ARGB as JPEG
            suffix = "png";
        }
        return suffix;
    }

    private static boolean hasMasks(PDImage pdImage) throws IOException {
        if (pdImage instanceof PDImageXObject) {
            PDImageXObject ximg = (PDImageXObject) pdImage;
            return ximg.getMask() != null || ximg.getSoftMask() != null;
        }
        return false;
    }

    /**
     * 保存图片到指定文件夹
     *
     * @param imageList
     * @param dir
     * @param prefixName
     * @throws IOException
     */
    public static void saveImage(List<ImageObject> imageList, String dir, String prefixName) throws IOException {
        File imgDir = new File(dir);
        FileUtils.forceMkdir(imgDir);
        for(ImageObject image:imageList){
            File imgFile = new File(dir, prefixName+"_"+image.getIndex()+"."+image.getSuffix());
            ImageIO.write(image.getImage(), image.getSuffix(), imgFile);
        }
    }
}
相关推荐
2301_764441336 小时前
Aella Science Dataset Explorer 部署教程笔记
笔记·python·全文检索
派大鑫wink7 小时前
【Java 学习日记】开篇:以日记为舟,渡 Java 进阶之海
java·笔记·程序人生·学习方法
永远都不秃头的程序员(互关)8 小时前
大模型Agent落地实战:从核心原理到工业级任务规划器开发
笔记
TL滕10 小时前
从0开始学算法——第十八天(分治算法)
笔记·学习·算法
算法与双吉汉堡10 小时前
【短链接项目笔记】Day2 用户注册
java·redis·笔记·后端·spring
思成不止于此10 小时前
【MySQL 零基础入门】MySQL 约束精讲(一):基础约束篇
数据库·笔记·sql·学习·mysql
WizLC12 小时前
【JAVA】JVM类加载器知识笔记
java·jvm·笔记
TL滕12 小时前
从0开始学算法——第十八天(分治算法练习)
笔记·学习·算法
لا معنى له13 小时前
学习笔记:卷积神经网络(CNN)
人工智能·笔记·深度学习·神经网络·学习·cnn
蒙奇D索大13 小时前
【数据结构】考研408 | 冲突解决精讲: 拉链法——链式存储的艺术与优化
数据结构·笔记·考研·改行学it