Python OCR PDF Extraction

Tesseract Installation

Python code:
#!/usr/bin/python3
# 
# Python OCR PDF Extraction
# https://github.com/tesseract-ocr/tesseract
#
# sudo apt install tesseract-ocr
# sudo apt install libtesseract-dev
# pip install pytesseract PyPDF2 pdfplumber opencv-python pillow
# pip install pdf2image
# sudo apt-get install poppler-utils
# sudo apt-get install tesseract-ocr-chi-sim  # Simplified Chinese
# sudo apt-get install tesseract-ocr-chi-tra  # Traditional Chinese
# tesseract --list-langs

import pytesseract
from pdf2image import convert_from_path
from PyPDF2 import PdfReader
import cv2
import numpy as np
from PIL import Image

# Path to the Tesseract executable that pytesseract should invoke.
# Hard-coded here; update it to match where Tesseract is installed on your system.
pytesseract.pytesseract.tesseract_cmd = '/usr/bin/tesseract'

def preprocess_image(pil_image):
    """
    Binarize a page image with OpenCV so it is easier to OCR.

    The input is normalized to 3-channel RGB, reduced to a single grayscale
    channel, and thresholded with Otsu's method.

    Args:
        pil_image: A PIL image in any mode (RGB, RGBA, L, P, ...). An
            array-like is also accepted: either H x W x 3 RGB data or an
            already-grayscale H x W array.

    Returns:
        A 2-D NumPy array holding the thresholded image (values 0 or 255).
    """
    # Normalize the mode first: cv2.cvtColor with an RGB conversion code
    # rejects grayscale, palette and RGBA inputs.
    if hasattr(pil_image, "convert"):
        pil_image = pil_image.convert("RGB")
    pixels = np.array(pil_image)

    if pixels.ndim == 2:
        # Already a single channel; nothing to convert.
        gray_image = pixels
    else:
        # Go straight from RGB to gray; an intermediate BGR copy is not needed.
        gray_image = cv2.cvtColor(pixels, cv2.COLOR_RGB2GRAY)

    # With THRESH_OTSU the threshold argument is ignored and computed from the
    # histogram, so pass 0 to make that explicit.
    _, thresh_image = cv2.threshold(
        gray_image, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU
    )
    return thresh_image

def extract_text_from_pdf(pdf_path, lang='chi_sim', dpi=200):
    """
    Extract the text of a PDF, falling back to OCR for scanned documents.

    The embedded text layer is read first with PyPDF2. If that yields nothing
    but whitespace, the PDF is assumed to be a scan: each page is rendered to
    an image, preprocessed, and passed to Tesseract.

    Args:
        pdf_path: Path to the PDF file.
        lang: Tesseract language code(s) used for the OCR fallback, e.g.
            'chi_sim', 'eng' or 'chi_sim+eng'. The matching language data
            must be installed (check with `tesseract --list-langs`).
        dpi: Resolution at which pages are rendered for OCR.

    Returns:
        The text of all pages concatenated in page order. No separator is
        inserted between pages.
    """
    # First try the embedded text layer; extract_text() may return None.
    reader = PdfReader(pdf_path)
    text = "".join(page.extract_text() or "" for page in reader.pages)
    if text.strip():
        return text

    # No usable text layer: treat as a scanned PDF and OCR every page.
    page_images = convert_from_path(pdf_path, dpi=dpi)
    ocr_chunks = []
    for page_image in page_images:
        # Binarize for better OCR results, then wrap the array back into a
        # PIL image for pytesseract.
        binarized = Image.fromarray(preprocess_image(page_image))
        ocr_chunks.append(pytesseract.image_to_string(binarized, lang=lang))

    # Keep any whitespace-only prefix from the text layer, as before.
    return text + "".join(ocr_chunks)

# Example usage. Guarded so that importing this module to reuse the functions
# above does not start OCR on a hard-coded file.
if __name__ == "__main__":
    pdf_path = "scan_2025-01-02_09.31.pdf"
    extracted_text = extract_text_from_pdf(pdf_path)
    print(extracted_text)
相关推荐
爱上python的猴子38 分钟前
用python编写一个放烟花的小程序
开发语言·python·pygame
B站计算机毕业设计超人1 小时前
计算机毕业设计PyHive+Hadoop深圳共享单车预测系统 共享单车数据分析可视化大屏 共享单车爬虫 共享单车数据仓库 机器学习 深度学习
大数据·hadoop·python·深度学习·机器学习·数据分析·数据可视化
Edward-tan2 小时前
【玩转全栈】----Django连接MySQL
python·mysql·django
油头少年_w2 小时前
Python数据容器
python
有杨既安然2 小时前
Python爬虫入门指南:从零开始抓取数据
开发语言·爬虫·python·信息可视化·数据分析·excel
Grovvy_Deng2 小时前
使用rust加速python的tgz解压
开发语言·python·rust
Tiandaren2 小时前
医学图像分析工具02:3D Slicer || 医学影像可视化与分析工具 支持第三方插件
c++·人工智能·python·深度学习·3d·开源
EnochChen_3 小时前
PyTorch快速入门教程【小土堆】之Sequential使用和小实战
人工智能·pytorch·python
半夏知半秋3 小时前
python中常用的内置函数介绍
服务器·开发语言·笔记·后端·python·学习
视觉人机器视觉3 小时前
halcon中图像滤波分为空间域和频域两种方法
图像处理·python·计算机视觉