Java 读取 PDF 文件内容

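一、引入 Maven 依赖

注意：下文工具类使用的 org.apache.pdfbox.Loader 是 PDFBox 3.x 才提供的类；如果使用下面的 2.0.25 版本，需要改用 PDDocument.load(inputStream) 来加载文档。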

xml 复制代码
<dependency>
    <groupId>org.apache.pdfbox</groupId>
    <artifactId>pdfbox</artifactId>
    <version>2.0.25</version>
</dependency>

二、代码工具类

java 复制代码
package com.jiayou.peis.utils;

//import com.itextpdf.text.pdf.PdfReader;
//import com.itextpdf.text.pdf.parser.PdfTextExtractor;
//import com.itextpdf.text.pdf.parser.SimpleTextExtractionStrategy;

import com.google.common.collect.Lists;
import com.jiayou.peis.entity.ImageObject;
import org.apache.commons.io.FileUtils;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.pdmodel.common.PDStream;
import org.apache.pdfbox.pdmodel.graphics.PDXObject;
import org.apache.pdfbox.pdmodel.graphics.image.PDImage;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
import org.apache.pdfbox.text.PDFTextStripper;

import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import java.io.*;
import java.util.ArrayList;
import java.util.List;

/**
 * PDF处理
 *
 * @author Bob Ren (Copyright © 2015-2029 贵州家有在线网络有限公司)
 * @version 1.0.0
 * @date 2022-02-07 16:21
 */
public class PdfUtils {
    //    /**
//     * 使用itextpdf提取PDF文本(解析不靠谱)
//     *
//     * @param inputStream
//     * @return
//     * @throws IOException
//     */
//    @Deprecated
//    public static String toText(InputStream inputStream) throws IOException {
//        try {
//            StringBuilder buf = new StringBuilder();
//            PdfReader reader = new PdfReader(inputStream);
//            int pageNum = reader.getNumberOfPages();
//            for(int i=1;i<=pageNum;i++){
//                // 读取第i页的文档内容
//                buf.append(PdfTextExtractor.getTextFromPage(reader, i, new SimpleTextExtractionStrategy()));
//            }
            return buf.toString();
//            return StrUtils.removeReturnChar(buf.toString());
//        } finally {
//            CloseUtils.closeQuietly(inputStream);
//        }
//    }
    public static String text(byte[] data) throws IOException {
        return PdfUtils.text(data, true);
    }
    public static String text(byte[] data, boolean sortByPosition) throws IOException {
        ByteArrayInputStream inputStream = new ByteArrayInputStream(data);
        return PdfUtils.text(inputStream, sortByPosition);
    }
    /**
     * 使用pdfbox提取PDF文本(解析正常,可使用)
     *
     * @param file
     * @return
     * @throws IOException
     */
    public static String text(File file, boolean sortByPosition) throws IOException {
        InputStream inputStream = new FileInputStream(file);
        return PdfUtils.text(inputStream, sortByPosition);
    }
    public static String text(File file) throws IOException {
        return PdfUtils.text(file, true);
    }
    public static String text(InputStream inputStream) throws IOException {
        return text(inputStream, true);
    }
    /**
     * 使用pdfbox提取PDF文本(解析正常,可使用)
     *
     * @param inputStream
     * @return
     * @throws IOException
     */
    public static String text(InputStream inputStream, boolean sortByPosition) throws IOException {
        PDDocument document = null;
        try {
//            document = PDDocument.load(inputStream);
            document = Loader.loadPDF(inputStream);
            PDFTextStripper textStripper = new PDFTextStripper();
            // Get total page count of the PDF document
            int numberOfPages = document.getNumberOfPages();
            //set the first page to be extracted
            textStripper.setStartPage(1);
            // set the last page to be extracted
            textStripper.setEndPage(numberOfPages);
            // 获取文本内容
            textStripper.setSortByPosition(sortByPosition);
            textStripper.setShouldSeparateByBeads(true);
            return StrUtils.removeReturnChar(textStripper.getText(document));
        } finally {
            CloseUtils.closeQuietly(document, inputStream);
        }
    }

    /**
     * 使用pdfbox提取PDF文本(解析正常,可使用)
     *
     * @param file
     * @return
     * @throws IOException
     */
    public static List<ImageObject> images(File file) throws IOException {
        InputStream inputStream = new FileInputStream(file);
        return PdfUtils.images(inputStream);
    }

    public static List<ImageObject> images(byte[] data) throws IOException {
        ByteArrayInputStream inputStream = null;
        try {
            inputStream = new ByteArrayInputStream(data);
            return PdfUtils.images(inputStream);
        } finally {
            CloseUtils.closeQuietly(inputStream);
        }
    }

    /**
     * 使用pdfbox提取PDF图片列表
     *
     * @param inputStream
     * @return
     * @throws IOException
     */
    public static List<ImageObject> images(InputStream inputStream) throws IOException {
        List<ImageObject> imageList = Lists.newArrayList();
        PDDocument document = null;
        try {
//            document = PDDocument.load(inputStream);
            document = Loader.loadPDF(inputStream);
            // get resources for a page
            PDResources pdResources = document.getPage(0).getResources();
            int i = 0;
            for (COSName csName : pdResources.getXObjectNames()) {
//                System.out.println(i+":"+csName);
                PDXObject pdxObject = pdResources.getXObject(csName);
                if (pdxObject instanceof PDImageXObject) {
//                    i++;
                    PDStream pdStream = pdxObject.getStream();
                    PDImageXObject image = new PDImageXObject(pdStream, pdResources);
                    String imageSuffix = imageSuffix(image);
                    // image storage location and image name
                    BufferedImage bufferedImage = image.getImage();
                    ImageObject object = new ImageObject();
                    object.setIndex(i++);
                    object.setImage(bufferedImage);
                    object.setSuffix(imageSuffix);
                    imageList.add(object);
                }
            }
        } finally {
            CloseUtils.closeQuietly(document, inputStream);
        }
        return imageList;
    }

    /**
     * 获取图片后缀
     *
     * @param pdImage
     * @return
     * @throws IOException
     */
    private static String imageSuffix(PDImageXObject pdImage) throws IOException {
        String suffix = pdImage.getSuffix();
        if (suffix == null || "jb2".equals(suffix)) {
            suffix = "png";
        } else if ("jpx".equals(suffix)) {
            // use jp2 suffix for file because jpx not known by windows
            suffix = "jp2";
        }

        if (hasMasks(pdImage)) {
            // TIKA-3040, PDFBOX-4771: can't save ARGB as JPEG
            suffix = "png";
        }
        return suffix;
    }

    private static boolean hasMasks(PDImage pdImage) throws IOException {
        if (pdImage instanceof PDImageXObject) {
            PDImageXObject ximg = (PDImageXObject) pdImage;
            return ximg.getMask() != null || ximg.getSoftMask() != null;
        }
        return false;
    }

    /**
     * 保存图片到指定文件夹
     *
     * @param imageList
     * @param dir
     * @param prefixName
     * @throws IOException
     */
    public static void saveImage(List<ImageObject> imageList, String dir, String prefixName) throws IOException {
        File imgDir = new File(dir);
        FileUtils.forceMkdir(imgDir);
        for(ImageObject image:imageList){
            File imgFile = new File(dir, prefixName+"_"+image.getIndex()+"."+image.getSuffix());
            ImageIO.write(image.getImage(), image.getSuffix(), imgFile);
        }
    }
}
相关推荐
雍凉明月夜13 分钟前
c++ 精学笔记记录Ⅰ
开发语言·c++·笔记
金融小师妹30 分钟前
基于NLP政策信号解析的联邦基金利率预测:美银动态调整12月降息概率至88%,2026年双降路径的强化学习模拟
大数据·人工智能·深度学习·1024程序员节
自不量力的A同学2 小时前
FreeFileSync 14.6 发布
笔记
可可苏饼干2 小时前
ELK(Elastic Stack)日志采集与分析
linux·运维·笔记·elk
s1ckrain3 小时前
数字逻辑笔记—组合逻辑电路
笔记·fpga开发·嵌入式
金融小师妹3 小时前
基于LSTM趋势预测的白银价格突破58美元阈值,年度累计涨幅超100%的强化学习驱动分析
大数据·人工智能·编辑器·1024程序员节
可可苏饼干4 小时前
NoSQL 与 Redis
数据库·redis·笔记·学习·nosql
重生之我在番茄自学网安拯救世界4 小时前
网络安全中级阶段学习笔记(一):DVWA靶场安装配置教程与网络空间搜索语法
笔记·学习·网络安全·靶场·dvwa·fofa·google hack
摇滚侠4 小时前
零基础小白自学 Git_Github 教程,Git 命令行操作2,笔记19
笔记·git·github
TL滕4 小时前
从0开始学算法——第五天(初级排序算法)
数据结构·笔记·学习·算法·排序算法