java
@Tag(name = "pdf/word/图片文字识别")
public class OcrController extends BaseController {

    @Autowired
    private OcrService ocrService;

    /**
     * Extracts the text content of an uploaded PDF or Word document.
     *
     * <p>The upload is handed unchanged to {@code ocrService.recognizeText(file)}; this
     * endpoint adds no validation or post-processing of its own.
     *
     * @param file the uploaded document, bound from the multipart request part named {@code file}
     * @return the value returned by {@code ocrService.recognizeText(file)}
     */
    @PostMapping("/recognize-text")
    @Operation(summary = "pdf/word识别文字", description = "识别")
    public String recognizeText(@RequestParam("file") MultipartFile file) {
        return ocrService.recognizeText(file);
    }
}
java
package com.jt.console.service.impl;
import com.jt.common.beans.ServiceAssert;
import com.jt.console.service.OcrService;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageTree;
import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.pdmodel.graphics.PDXObject;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.openxml4j.util.ZipSecureFile;
import org.apache.poi.xwpf.usermodel.*;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;
import org.springframework.web.multipart.MultipartFile;
import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URLEncoder;
import java.util.Base64;
import static com.jt.console.service.impl.BaiduOcrServiceImpl.formatOcrResult;
/**
 * Text extraction for uploaded PDF and Word documents, plus OCR of the pictures embedded in a
 * DOCX document.
 *
 * @author chenchao
 * @date 2024/8/12 16:17
 */
@Service
public class OcrServiceImpl implements OcrService {

    // java.lang.System.Logger needs no additional import; it replaces the former printStackTrace() calls.
    private static final System.Logger LOGGER = System.getLogger(OcrServiceImpl.class.getName());

    private static final String MIME_PDF = "application/pdf";
    private static final String MIME_DOCX =
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document";
    private static final String MIME_OOXML = "application/x-tika-ooxml";
    private static final String MIME_DOC = "application/msword";

    @Autowired
    private BaiduOcrServiceImpl baiduOcrService;

    /**
     * Extracts the text of an uploaded file, choosing the extractor from the upload's content type.
     *
     * <p>Known limitation (noted by the original author): tables and formulas may come out garbled.
     *
     * <p>Failures are reported through {@code ServiceAssert.isTrue(false, message)}, which
     * presumably throws; the {@code return null} statements only matter if it does not.
     *
     * @param file the uploaded file
     * @return the extracted text, or {@code null} if a failed assertion did not throw
     */
    @Override
    public String recognizeText(MultipartFile file) {
        String contentType = file.getContentType();
        if (contentType == null) {
            ServiceAssert.isTrue(false, "文件类型不支持");
            return null;
        }

        boolean pdf = MIME_PDF.equals(contentType);
        boolean docx = MIME_DOCX.equals(contentType) || MIME_OOXML.equals(contentType);
        boolean doc = MIME_DOC.equals(contentType);
        if (!pdf && !docx && !doc) {
            // Rejected before the try block: inside it, the catch-all below would replace this
            // message with the generic processing error.
            ServiceAssert.isTrue(false, "不支持的文件类型");
            return null;
        }

        InputStream inputStream = null;
        try {
            inputStream = file.getInputStream();
            if (pdf) {
                return extractTextFromPdf(inputStream);
            }
            if (docx) {
                return extractTextFromDocx(inputStream);
            }
            return extractTextFromDoc(inputStream);
        } catch (Exception e) {
            // Deliberately broad: malformed documents surface as unchecked parser exceptions too.
            LOGGER.log(System.Logger.Level.ERROR, "Failed to extract text from uploaded file", e);
            ServiceAssert.isTrue(false, "处理文件出错");
            return null;
        } finally {
            if (inputStream != null) {
                try {
                    inputStream.close();
                } catch (IOException e) {
                    // Best effort: a failed close must not turn a successful extraction into an error.
                    LOGGER.log(System.Logger.Level.WARNING, "Failed to close upload stream", e);
                }
            }
        }
    }

    /**
     * Extracts the text of a PDF document.
     *
     * @param inputStream stream over the PDF bytes; not closed by this method
     * @return the text produced by {@link PDFTextStripper}
     * @throws IOException if the document cannot be loaded or read
     */
    private String extractTextFromPdf(InputStream inputStream) throws IOException {
        try (PDDocument document = PDDocument.load(inputStream)) {
            // JVM-wide property, set again on every call; the original author intended it to
            // silence CMap-related warnings.
            // NOTE(review): confirm that PDFBox actually reads this property.
            System.setProperty("org.apache.pdfbox.logging.SILENT", "true");
            return new PDFTextStripper().getText(document);
        }
    }

    /**
     * Extracts the text of a DOCX document: every body paragraph first, then every table
     * (cells separated by tabs, rows by newlines).
     *
     * @param inputStream stream over the DOCX bytes; not closed by this method
     * @return the extracted text
     * @throws IOException if the document cannot be opened or read
     */
    private String extractTextFromDocx(InputStream inputStream) throws IOException {
        // NOTE(review): this is a JVM-wide POI setting. 0.001 is lower than POI's documented
        // default of 0.01, so it loosens zip-bomb detection rather than tightening it — confirm
        // that accepting more highly compressed uploads is intended.
        ZipSecureFile.setMinInflateRatio(0.001);

        StringBuilder text = new StringBuilder();
        try (XWPFDocument document = new XWPFDocument(inputStream)) {
            for (XWPFParagraph paragraph : document.getParagraphs()) {
                text.append(paragraph.getText()).append("\n");
            }
            for (XWPFTable table : document.getTables()) {
                for (XWPFTableRow row : table.getRows()) {
                    for (XWPFTableCell cell : row.getTableCells()) {
                        text.append(cell.getText()).append("\t");
                    }
                    text.append("\n");
                }
            }
        }
        return text.toString();
    }

    /**
     * Extracts the text of a legacy DOC document, one newline appended after each paragraph.
     *
     * @param inputStream stream over the DOC bytes; not closed by this method
     * @return the extracted text
     * @throws IOException if the document cannot be opened or read
     */
    private String extractTextFromDoc(InputStream inputStream) throws IOException {
        StringBuilder text = new StringBuilder();
        try (HWPFDocument document = new HWPFDocument(inputStream)) {
            WordExtractor extractor = new WordExtractor(document);
            for (String paragraph : extractor.getParagraphText()) {
                text.append(paragraph).append("\n");
            }
        }
        return text.toString();
    }

    /**
     * Writes every image XObject found on the pages of a PDF to {@code image<N>.png}, a path
     * relative to the process working directory, numbering images from 1 across the document.
     *
     * <p>Not called by the text-extraction path in this class.
     *
     * @param document the PDF document to scan
     * @throws IOException if an image cannot be decoded or written
     */
    private void extractImagesFromPdf(PDDocument document) throws IOException {
        int imageCounter = 0;
        for (PDPage page : document.getPages()) {
            PDResources resources = page.getResources();
            if (resources == null) {
                // Skip pages that expose no resources instead of failing with a NullPointerException.
                continue;
            }
            for (COSName xObjectName : resources.getXObjectNames()) {
                PDXObject xObject = resources.getXObject(xObjectName);
                if (!(xObject instanceof PDImageXObject)) {
                    continue;
                }
                BufferedImage bufferedImage = ((PDImageXObject) xObject).getImage();
                File imageFile = new File("image" + (++imageCounter) + ".png");
                try (FileOutputStream fos = new FileOutputStream(imageFile)) {
                    ImageIO.write(bufferedImage, "PNG", fos);
                }
            }
        }
    }

    /**
     * Runs OCR on every picture embedded in a DOCX document.
     *
     * <p>Each picture is Base64-encoded and passed to {@code baiduOcrService.recognizeImage}.
     * That method places the value verbatim into a form-encoded request body, so callers
     * normally want {@code urlEncode = true}.
     *
     * @param document  the DOCX document whose pictures are recognized
     * @param urlEncode whether to URL-encode the Base64 payload before sending it
     * @return one line per picture in the form {@code "Image <n>: <text>"}, numbered from 1
     * @throws IOException if the OCR request fails
     */
    public String extractImagesFromDocx(XWPFDocument document, boolean urlEncode) throws IOException {
        StringBuilder recognitionResults = new StringBuilder();
        int imageCounter = 0;
        for (XWPFPictureData pictureData : document.getAllPictures()) {
            String base64Image = Base64.getEncoder().encodeToString(pictureData.getData());
            if (urlEncode) {
                base64Image = URLEncoder.encode(base64Image, "utf-8");
            }
            // recognizeImage already returns plain text; formatting it a second time would try to
            // parse that text as JSON and fail.
            String recognizedText = baiduOcrService.recognizeImage(base64Image);
            recognitionResults.append("Image ").append(++imageCounter).append(": ")
                    .append(recognizedText).append("\n");
        }
        return recognitionResults.toString();
    }
}
java
package com.jt.console.service.impl;
import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONObject;
import com.jt.common.beans.ServiceAssert;
import okhttp3.*;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Service;
import org.springframework.web.multipart.MultipartFile;
import java.io.IOException;
import java.net.URLEncoder;
import java.util.Base64;
import java.util.List;
import java.util.Arrays;
/**
 * Client for the Baidu OCR HTTP API: obtains an access token, submits a Base64-encoded image and
 * flattens the JSON response into plain text.
 */
@Service("baiduOcrServiceImpl")
public class BaiduOcrServiceImpl {

    private static final String TOKEN_URL = "https://aip.baidubce.com/oauth/2.0/token";
    private static final String OCR_URL =
            "https://aip.baidubce.com/rest/2.0/ocr/v1/accurate_basic?access_token=";
    private static final String FORM_CONTENT_TYPE = "application/x-www-form-urlencoded";

    /** Image file extensions accepted by {@link #convertToBase64}. */
    private static final List<String> SUPPORTED_FORMATS = Arrays.asList("png", "jpg", "jpeg", "bmp", "gif");

    /** Single HTTP client shared by all requests. */
    private static final OkHttpClient HTTP_CLIENT = new OkHttpClient().newBuilder().build();

    /** Client id, injected from {@code baidu.ocr.apiKey}. */
    @Value("${baidu.ocr.apiKey}")
    private String API_KEY;

    /** Client secret, injected from {@code baidu.ocr.secretKey}. */
    @Value("${baidu.ocr.secretKey}")
    private String SECRET_KEY;

    /**
     * Requests an access token using the client-credentials grant.
     *
     * <p>A new token is requested on every call; nothing is cached.
     *
     * @return the {@code access_token} value of the token response
     * @throws IOException if the HTTP call fails
     */
    private String getAccessToken() throws IOException {
        MediaType mediaType = MediaType.parse(FORM_CONTENT_TYPE);
        RequestBody body = RequestBody.create(mediaType,
                "grant_type=client_credentials&client_id=" + API_KEY + "&client_secret=" + SECRET_KEY);
        Request request = new Request.Builder()
                .url(TOKEN_URL)
                .method("POST", body)
                .addHeader("Content-Type", FORM_CONTENT_TYPE)
                .build();
        // try-with-resources releases the response body, which was previously never closed.
        try (Response response = HTTP_CLIENT.newCall(request).execute()) {
            if (!response.isSuccessful()) {
                String errorMessage = "Failed to obtain access token. Status code: " + response.code()
                        + ", Message: " + response.message();
                ServiceAssert.isTrue(false, errorMessage);
            }
            JSONObject jsonObject = JSON.parseObject(response.body().string());
            String accessToken = jsonObject == null ? null : jsonObject.getString("access_token");
            if (accessToken == null || accessToken.isEmpty()) {
                // Report the problem here instead of sending "access_token=null" to the OCR endpoint.
                ServiceAssert.isTrue(false, "Failed to obtain access token: response contained no access_token");
            }
            return accessToken;
        }
    }

    /**
     * Sends an image to the {@code accurate_basic} OCR endpoint and returns the recognized text.
     *
     * @param base64Image the Base64-encoded image; it is inserted verbatim into a form-encoded
     *                    body, so it should already be URL-encoded
     * @return the recognized words joined by single spaces, as produced by {@link #formatOcrResult}
     * @throws IOException if an HTTP call fails
     */
    public String recognizeImage(String base64Image) throws IOException {
        MediaType mediaType = MediaType.parse(FORM_CONTENT_TYPE);
        // "&paragraph" had been corrupted to "¶graph", which folded the rest of the parameter
        // into the value of detect_direction.
        RequestBody body = RequestBody.create(mediaType,
                "image=" + base64Image + "&detect_direction=false&paragraph=false&probability=false");
        Request request = new Request.Builder()
                .url(OCR_URL + getAccessToken())
                .method("POST", body)
                .addHeader("Content-Type", FORM_CONTENT_TYPE)
                .addHeader("Accept", "application/json")
                .build();
        try (Response response = HTTP_CLIENT.newCall(request).execute()) {
            if (!response.isSuccessful()) {
                String errorMessage = "OCR request failed. Status code: " + response.code()
                        + ", Message: " + response.message();
                ServiceAssert.isTrue(false, errorMessage);
            }
            return formatOcrResult(response.body().string());
        }
    }

    /**
     * Encodes an uploaded image as Base64 after checking its file extension against the
     * supported formats.
     *
     * @param file      the uploaded image
     * @param urlEncode whether to URL-encode the Base64 text (UTF-8)
     * @return the encoded image, or {@code null} if a failed assertion did not throw
     * @throws IOException if the upload's bytes cannot be read
     */
    public String convertToBase64(MultipartFile file, boolean urlEncode) throws IOException {
        String filename = file.getOriginalFilename();
        if (filename == null) {
            ServiceAssert.isTrue(false, "文件名为空");
            // Never continue to the substring call below with a null name.
            return null;
        }
        // A name without a dot yields the whole name as its "extension" and is rejected below.
        String extension = filename.substring(filename.lastIndexOf('.') + 1).toLowerCase();
        if (!SUPPORTED_FORMATS.contains(extension)) {
            ServiceAssert.isTrue(false, "不支持的图片格式: " + extension);
        }
        String base64 = Base64.getEncoder().encodeToString(file.getBytes());
        if (urlEncode) {
            base64 = URLEncoder.encode(base64, "utf-8");
        }
        return base64;
    }

    /**
     * Flattens an OCR JSON response into plain text.
     *
     * <p>Collects the non-empty {@code words} value of each entry in {@code words_result} and
     * joins them with single spaces.
     *
     * @param ocrResult the raw JSON response; may be {@code null} or blank
     * @return the joined text, or an empty string when the input is blank or contains no
     *         {@code words_result} entries
     */
    public static String formatOcrResult(String ocrResult) {
        if (ocrResult == null || ocrResult.trim().isEmpty()) {
            return "";
        }
        StringBuilder resultText = new StringBuilder();
        try {
            JSONObject jsonObject = JSON.parseObject(ocrResult);
            if (jsonObject == null) {
                // Guard against a null parse result instead of hitting a NullPointerException.
                return "";
            }
            // A missing or null words_result both come back as null here.
            var wordsResult = jsonObject.getJSONArray("words_result");
            if (wordsResult == null || wordsResult.isEmpty()) {
                return "";
            }
            for (int i = 0; i < wordsResult.size(); i++) {
                JSONObject wordObject = wordsResult.getJSONObject(i);
                if (wordObject == null) {
                    continue;
                }
                String word = wordObject.getString("words");
                if (word != null && !word.isEmpty()) {
                    resultText.append(word).append(" ");
                }
            }
        } catch (Exception e) {
            // Malformed JSON is reported through the project's assertion helper.
            ServiceAssert.isTrue(false, e.getMessage());
        }
        return resultText.toString().trim();
    }
}