Java 实现 PDF/Word 文字识别,并调用 OCR 提取其中图片的文字后聚合

java 复制代码
@Tag(name = "pdf/word/图片文字识别")
public class OcrController extends BaseController {

    /** Service that extracts text from uploaded PDF / Word files. */
    @Autowired
    private OcrService ocrService;

    /** Baidu OCR client; injected here but not referenced by the endpoint below. */
    @Autowired
    private BaiduOcrServiceImpl baiduOcrService;

    /**
     * Extracts the text of an uploaded PDF or Word document.
     *
     * @param file the uploaded document, bound from the request parameter named {@code file}
     * @return the value produced by {@link OcrService#recognizeText} for this upload
     */
    @PostMapping("/recognize-text")
    @Operation(summary = "pdf/word识别文字", description = "识别")
    public String recognizeText(@RequestParam("file") MultipartFile file) {
        final String recognizedText = ocrService.recognizeText(file);
        return recognizedText;
    }

}
java 复制代码
package com.jt.console.service.impl;

import com.jt.common.beans.ServiceAssert;
import com.jt.console.service.OcrService;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageTree;
import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.pdmodel.graphics.PDXObject;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.openxml4j.util.ZipSecureFile;
import org.apache.poi.xwpf.usermodel.*;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;
import org.springframework.web.multipart.MultipartFile;

import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URLEncoder;
import java.util.Base64;

import static com.jt.console.service.impl.BaiduOcrServiceImpl.formatOcrResult;

/**
 * Text extraction for uploaded PDF / Word documents, plus OCR of the images embedded in DOCX files.
 *
 * @author chenchao
 * @date 2024/8/12 16:17
 */
@Service
public class OcrServiceImpl implements OcrService {

    /** Content type routed to {@link #extractTextFromPdf}. */
    private static final String MIME_PDF = "application/pdf";

    /** Content type routed to {@link #extractTextFromDocx}. */
    private static final String MIME_DOCX =
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document";

    /** Alternative content type that is also routed to {@link #extractTextFromDocx}. */
    private static final String MIME_OOXML = "application/x-tika-ooxml";

    /** Content type routed to {@link #extractTextFromDoc}. */
    private static final String MIME_DOC = "application/msword";

    @Autowired
    private BaiduOcrServiceImpl baiduOcrService;

    /**
     * Extracts the text content of an uploaded PDF, DOCX or DOC file.
     *
     * <p>Known limitation: tables and formulas may come out garbled.
     *
     * @param file the uploaded file; its declared content type selects the extractor
     * @return the extracted text; failures are reported through {@link ServiceAssert}
     */
    @Override
    public String recognizeText(MultipartFile file) {
        String contentType = file.getContentType();
        if (contentType == null) {
            ServiceAssert.isTrue(false, "文件类型不支持");
            return null;
        }

        boolean isPdf = MIME_PDF.equals(contentType);
        boolean isDocx = MIME_DOCX.equals(contentType) || MIME_OOXML.equals(contentType);
        boolean isDoc = MIME_DOC.equals(contentType);

        // Reject unsupported types BEFORE entering the try block: assuming ServiceAssert.isTrue
        // throws on false, raising it inside the try would be intercepted by the catch-all below
        // and re-reported as the generic "处理文件出错" instead of the specific message.
        if (!isPdf && !isDocx && !isDoc) {
            ServiceAssert.isTrue(false, "不支持的文件类型");
            return null;
        }

        try (InputStream inputStream = file.getInputStream()) {
            if (isPdf) {
                return extractTextFromPdf(inputStream);
            }
            if (isDocx) {
                return extractTextFromDocx(inputStream);
            }
            return extractTextFromDoc(inputStream);
        } catch (Exception e) {
            // Broad catch on purpose: the parsers can fail with unchecked exceptions on malformed files.
            e.printStackTrace();
            ServiceAssert.isTrue(false, "处理文件出错");
            return null;
        }
    }

    /**
     * Extracts the text layer of a PDF.
     *
     * @param inputStream stream positioned at the start of the PDF data
     * @return the text produced by {@link PDFTextStripper} for the whole document
     * @throws IOException if the document cannot be loaded or read
     */
    private String extractTextFromPdf(InputStream inputStream) throws IOException {
        try (PDDocument document = PDDocument.load(inputStream)) {
            // Intended to silence CMap-related warnings.
            // NOTE(review): confirm that PDFBox actually honors this system property.
            System.setProperty("org.apache.pdfbox.logging.SILENT", "true");

            // To also dump the embedded images, call extractImagesFromPdf(document) here.
            return new PDFTextStripper().getText(document);
        }
    }

    /**
     * Extracts the text of a DOCX file: all body paragraphs first, then all tables.
     *
     * <p>Each paragraph is followed by a newline; each table cell by a tab and each table row by
     * a newline.
     *
     * @param inputStream stream positioned at the start of the DOCX data
     * @return the extracted text
     * @throws IOException if the document cannot be read
     */
    private String extractTextFromDocx(InputStream inputStream) throws IOException {
        // Lowers POI's zip-bomb inflate-ratio threshold to 0.001 so that highly compressed documents
        // are accepted. This is a static, process-wide setting and it relaxes the protection
        // (NOTE(review): POI's documented default is 0.01 - confirm the relaxation is intended).
        ZipSecureFile.setMinInflateRatio(0.001);

        StringBuilder text = new StringBuilder();
        try (XWPFDocument document = new XWPFDocument(inputStream)) {
            for (XWPFParagraph paragraph : document.getParagraphs()) {
                text.append(paragraph.getText()).append("\n");
            }

            for (XWPFTable table : document.getTables()) {
                for (XWPFTableRow row : table.getRows()) {
                    for (XWPFTableCell cell : row.getTableCells()) {
                        text.append(cell.getText()).append("\t");
                    }
                    text.append("\n");
                }
            }

            // To also OCR the embedded images, call extractImagesFromDocx(document, ...) here.
        }
        return text.toString();
    }

    /**
     * Extracts the text of a legacy DOC file, one paragraph at a time.
     *
     * @param inputStream stream positioned at the start of the DOC data
     * @return every paragraph returned by {@link WordExtractor#getParagraphText()}, each followed
     *         by a newline
     * @throws IOException if the document cannot be read
     */
    private String extractTextFromDoc(InputStream inputStream) throws IOException {
        StringBuilder text = new StringBuilder();
        try (HWPFDocument document = new HWPFDocument(inputStream)) {
            WordExtractor extractor = new WordExtractor(document);
            for (String paragraph : extractor.getParagraphText()) {
                text.append(paragraph).append("\n");
            }
        }
        return text.toString();
    }

    /**
     * Writes every image XObject referenced directly by a page to {@code image<N>.png}, using a
     * relative path and a counter that runs across the whole document.
     *
     * @param document the opened PDF document
     * @throws IOException if an image cannot be decoded or written
     */
    private void extractImagesFromPdf(PDDocument document) throws IOException {
        PDPageTree pages = document.getPages();
        int imageCounter = 0;
        for (PDPage page : pages) {
            PDResources resources = page.getResources();
            if (resources == null) {
                // A page without a resource dictionary has no XObjects to inspect.
                continue;
            }
            for (COSName xObjectName : resources.getXObjectNames()) {
                PDXObject xObject = resources.getXObject(xObjectName);
                if (!(xObject instanceof PDImageXObject)) {
                    continue;
                }
                BufferedImage bufferedImage = ((PDImageXObject) xObject).getImage();
                File imageFile = new File("image" + (++imageCounter) + ".png");
                try (FileOutputStream fos = new FileOutputStream(imageFile)) {
                    ImageIO.write(bufferedImage, "PNG", fos);
                }
            }
        }
    }

    /**
     * Runs OCR on every picture stored in a DOCX document and aggregates the recognized text.
     *
     * @param document  the opened DOCX document
     * @param urlEncode whether to URL-encode each Base64 image before passing it on;
     *                  {@code BaiduOcrServiceImpl.recognizeImage} inserts the string into a
     *                  form-urlencoded request body as-is
     * @return one line per picture in the form {@code "Image <n>: <recognized text>"}
     * @throws IOException if the OCR request fails
     */
    public String extractImagesFromDocx(XWPFDocument document, boolean urlEncode) throws IOException {
        StringBuilder recognitionResults = new StringBuilder();
        int imageCounter = 0;

        for (XWPFPictureData pictureData : document.getAllPictures()) {
            String base64Image = Base64.getEncoder().encodeToString(pictureData.getData());
            if (urlEncode) {
                base64Image = URLEncoder.encode(base64Image, "utf-8");
            }
            // recognizeImage already returns the formatted plain text. Running formatOcrResult on
            // it a second time would try to parse that plain text as JSON and fail.
            String recognizedText = baiduOcrService.recognizeImage(base64Image);
            recognitionResults.append("Image ").append(++imageCounter).append(": ")
                    .append(recognizedText).append("\n");
        }
        return recognitionResults.toString();
    }

}
java 复制代码
package com.jt.console.service.impl;

import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONObject;
import com.jt.common.beans.ServiceAssert;
import okhttp3.*;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Service;
import org.springframework.web.multipart.MultipartFile;

import java.io.IOException;
import java.net.URLEncoder;
import java.util.Arrays;
import java.util.Base64;
import java.util.List;
import java.util.Locale;

/**
 * Client for the Baidu OCR HTTP API: obtains an access token, submits Base64-encoded images for
 * recognition and flattens the JSON response into plain text.
 */
@Service("baiduOcrServiceImpl")
public class BaiduOcrServiceImpl {

    @Value("${baidu.ocr.apiKey}")
    private String API_KEY;  // client id

    @Value("${baidu.ocr.secretKey}")
    private String SECRET_KEY; // client secret

    /** File extensions (lower case) accepted by {@link #convertToBase64}. */
    private static final List<String> SUPPORTED_FORMATS = Arrays.asList("png", "jpg", "jpeg", "bmp", "gif");

    /** Shared HTTP client used for every request issued by this class. */
    private static final OkHttpClient HTTP_CLIENT = new OkHttpClient().newBuilder().build();

    /**
     * Requests an access token from the Baidu OAuth endpoint using the configured credentials.
     *
     * @return the {@code access_token} value of the response
     * @throws IOException if the HTTP call fails
     */
    private String getAccessToken() throws IOException {
        MediaType mediaType = MediaType.parse("application/x-www-form-urlencoded");
        RequestBody body = RequestBody.create(mediaType, "grant_type=client_credentials&client_id=" + API_KEY
                + "&client_secret=" + SECRET_KEY);
        Request request = new Request.Builder()
                .url("https://aip.baidubce.com/oauth/2.0/token")
                .method("POST", body)
                .addHeader("Content-Type", "application/x-www-form-urlencoded")
                .build();
        // try-with-resources so the response is released on the failure path as well.
        try (Response response = HTTP_CLIENT.newCall(request).execute()) {
            if (!response.isSuccessful()) {
                String errorMessage = "Failed to obtain access token. Status code: " + response.code()
                        + ", Message: " + response.message();
                ServiceAssert.isTrue(false, errorMessage);
            }
            JSONObject jsonObject = JSON.parseObject(response.body().string());
            String accessToken = jsonObject == null ? null : jsonObject.getString("access_token");
            if (accessToken == null || accessToken.isEmpty()) {
                // Fail here rather than sending "access_token=null" to the OCR endpoint.
                ServiceAssert.isTrue(false, "Failed to obtain access token. Response contains no access_token");
            }
            return accessToken;
        }
    }

    /**
     * Sends an image to the {@code accurate_basic} OCR endpoint and returns the recognized text.
     *
     * <p>A new access token is requested on every call.
     *
     * @param base64Image Base64 image data; it is inserted into the form-urlencoded request body
     *                    as-is, so the caller is responsible for any URL-encoding
     * @return the recognized text as produced by {@link #formatOcrResult(String)}
     * @throws IOException if an HTTP call fails
     */
    public String recognizeImage(String base64Image) throws IOException {
        MediaType mediaType = MediaType.parse("application/x-www-form-urlencoded");
        RequestBody body = RequestBody.create(mediaType, "image=" + base64Image + "&detect_direction=false&paragraph=false&probability=false");
        Request request = new Request.Builder()
                .url("https://aip.baidubce.com/rest/2.0/ocr/v1/accurate_basic?access_token=" + getAccessToken())
                .method("POST", body)
                .addHeader("Content-Type", "application/x-www-form-urlencoded")
                .addHeader("Accept", "application/json")
                .build();
        try (Response response = HTTP_CLIENT.newCall(request).execute()) {
            if (!response.isSuccessful()) {
                String errorMessage = "OCR request failed. Status code: " + response.code()
                        + ", Message: " + response.message();
                ServiceAssert.isTrue(false, errorMessage);
            }
            return formatOcrResult(response.body().string());
        }
    }

    /**
     * Validates the extension of an uploaded image and returns its content Base64-encoded.
     *
     * @param file      the uploaded image; its original filename must end in one of
     *                  {@code png, jpg, jpeg, bmp, gif} (case-insensitive)
     * @param urlEncode whether to URL-encode the Base64 string
     * @return the Base64 (optionally URL-encoded) image data
     * @throws IOException if the file content cannot be read
     */
    public String convertToBase64(MultipartFile file, boolean urlEncode) throws IOException {
        String filename = file.getOriginalFilename();
        ServiceAssert.isTrue(filename != null, "文件名为空");

        // Locale.ROOT keeps the lower-casing independent of the JVM default locale.
        String extension = filename.substring(filename.lastIndexOf('.') + 1).toLowerCase(Locale.ROOT);
        ServiceAssert.isTrue(SUPPORTED_FORMATS.contains(extension), "不支持的图片格式: " + extension);

        String base64 = Base64.getEncoder().encodeToString(file.getBytes());
        if (urlEncode) {
            base64 = URLEncoder.encode(base64, "utf-8");
        }
        return base64;
    }

    /**
     * Flattens an OCR JSON response into a single line of text.
     *
     * @param ocrResult the raw JSON response body
     * @return the non-empty {@code words} values of {@code words_result}, joined by single spaces;
     *         an empty string when {@code words_result} is absent or empty
     */
    public static String formatOcrResult(String ocrResult) {
        // JSON.parseObject yields null for blank input, which would otherwise surface below as a
        // NullPointerException reported without any message; reject it with a clear one instead.
        if (ocrResult == null || ocrResult.trim().isEmpty()) {
            ServiceAssert.isTrue(false, "OCR 返回结果为空");
            return "";
        }

        StringBuilder resultText = new StringBuilder();
        try {
            JSONObject jsonObject = JSON.parseObject(ocrResult);
            if (!jsonObject.containsKey("words_result")) {
                // Nothing to extract (e.g. the response carries no recognition result).
                return "";
            }
            var wordsResult = jsonObject.getJSONArray("words_result");
            if (wordsResult == null || wordsResult.isEmpty()) {
                return "";
            }
            for (int i = 0; i < wordsResult.size(); i++) {
                String word = wordsResult.getJSONObject(i).getString("words");
                if (word != null && !word.isEmpty()) {
                    resultText.append(word).append(" ");
                }
            }
        } catch (Exception e) {
            ServiceAssert.isTrue(false, e.getMessage());
        }
        return resultText.toString().trim();
    }
}
相关推荐
吾日三省吾码1 小时前
JVM 性能调优
java
弗拉唐2 小时前
springBoot,mp,ssm整合案例
java·spring boot·mybatis
CodeCraft Studio2 小时前
【实用技能】使用 TX Text Control 创建带有嵌入式附件的 PDF 文档
pdf·asp.net·.net
oi772 小时前
使用itextpdf进行pdf模版填充中文文本时部分字不显示问题
java·服务器
少说多做3433 小时前
Android 不同情况下使用 runOnUiThread
android·java
知兀3 小时前
Java的方法、基本和引用数据类型
java·笔记·黑马程序员
蓝黑20203 小时前
IntelliJ IDEA常用快捷键
java·ide·intellij-idea
Ysjt | 深3 小时前
C++多线程编程入门教程(优质版)
java·开发语言·jvm·c++
shuangrenlong3 小时前
slice介绍slice查看器
java·ubuntu