PaddleOCR 截图自动文字识别

春节假期在家无聊,撸了三个小工具:PC截图+编辑/PC录屏(用于meeting录屏)/PC截屏文字识别。因为感觉这三个小工具是工作中常常需要用到的,github上也有很多开源的,不过总有点或多或少的小问题,不利于自己的使用。脚本的编写尽量减少对三方库的使用。

已全部完成,这是其中的一个,后续将三个集成在在一个工具中。

python 复制代码
import tkinter as tk
from tkinter import ttk, messagebox, font, filedialog
from PIL import Image, ImageTk, ImageGrab
import sys
import tempfile
import threading
from pathlib import Path
import ctypes
import logging.handlers
from datetime import datetime

# 最小化控制台窗口
def minimize_console():
    ctypes.windll.user32.ShowWindow(ctypes.windll.kernel32.GetConsoleWindow(), 6)

minimize_console()  # 调用最小化函数

# 获取脚本所在目录路径
def get_script_directory():
    return Path(__file__).parent

# 配置日志文件路径和日志级别
log_file_path = get_script_directory() / 'ocr_errors.log'
logging.basicConfig(
    filename=log_file_path,
    level=logging.DEBUG,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
# 添加日志轮转
handler = logging.handlers.RotatingFileHandler(log_file_path, maxBytes=1024*1024*5, backupCount=3)
logger = logging.getLogger()
logger.addHandler(handler)

# 保存临时图片到磁盘
def save_temp_image(image, suffix='.png'):
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_file:
        image.save(temp_file.name)
        return Path(temp_file.name)

class OCRApp:
    def __init__(self):
        try:
            self.root = tk.Tk()
            self.root.withdraw()

            # 禁用最大化按钮
            self.root.resizable(False, True)

            self.screenshot = None
            self.ocr_model = None  # 延迟初始化
            self.recognized_text = ""
            self.main_frame = None
            self.load_win = None  # 初始化 load_win 为 None

            # 启动后台线程加载OCR模型以优化性能,使run脚本后能马上进入截图状态
            threading.Thread(target=self.load_ocr_model, daemon=True).start()

            # 立即开始截图选择
            self.start_selection()

        except Exception as e:
            self.show_crash_message(f"程序启动失败: {str(e)}")
            sys.exit(1)

    def load_ocr_model(self):
        from paddleocr import PaddleOCR
        try:
            self.ocr_model = PaddleOCR(use_angle_cls=True, show_log=False, lang='ch')
        except Exception as e:
            logger.error(f"OCR模型加载失败: {str(e)}")

    # 开始截图选择区域
    def start_selection(self):
        self.selection_win = tk.Toplevel()
        self.selection_win.attributes("-fullscreen", True)
        self.selection_win.attributes("-alpha", 0.3)

        # 绑定整个窗口的 ESC 键事件
        self.selection_win.bind("<Escape>", self.on_escape)

        self.canvas = tk.Canvas(
            self.selection_win,
            cursor="cross",
            bg="gray30",
            highlightthickness=0
        )
        self.canvas.pack(fill=tk.BOTH, expand=True)

        self.start_x = self.start_y = 0
        self.rect_id = None
        self.crosshair_ids = []

        self.canvas.bind("<Button-1>", self.on_mouse_down)
        self.canvas.bind("<B1-Motion>", self.on_mouse_drag)
        self.canvas.bind("<ButtonRelease-1>", self.on_mouse_up)
        self.canvas.bind("<Motion>", self.on_mouse_move)

        self.escape_label = tk.Label(
            self.selection_win,
            text="按ESC键退出截图",
            fg="yellow",
            bg="gray20",
            font=("Helvetica", 12, "bold")
        )
        self.escape_label.place(x=10, y=10)

        self.update_crosshair(0, 0)

    # 鼠标按下事件处理
    def on_mouse_down(self, event):
        self.start_x = event.x
        self.start_y = event.y
        self.clear_crosshair()
        if self.rect_id:
            self.canvas.delete(self.rect_id)
            self.rect_id = None

    # 鼠标拖动事件处理
    def on_mouse_drag(self, event):
        current_x = event.x
        current_y = event.y

        if self.rect_id:
            self.canvas.coords(self.rect_id, self.start_x, self.start_y, current_x, current_y)
        else:
            self.rect_id = self.canvas.create_rectangle(
                self.start_x, self.start_y,
                current_x, current_y,
                outline="blue", width=2, fill="gray75", tags="rect"
            )

    # 鼠标释放事件处理
    def on_mouse_up(self, event):
        try:
            x1 = min(self.start_x, event.x)
            y1 = min(self.start_y, event.y)
            x2 = max(self.start_x, event.x)
            y2 = max(self.start_y, event.y)

            if (x2 - x1) < 10 or (y2 - y1) < 10:
                raise ValueError("选区过小,请选择更大的区域")
            if (x2 - x1) > self.canvas.winfo_width() or (y2 - y1) > self.canvas.winfo_height():
                raise ValueError("选区过大,请选择更小的区域")

            self.screenshot = ImageGrab.grab(bbox=(x1, y1, x2, y2))
            self.selection_win.destroy()
            self.initialize_ocr_and_process()

        except Exception as e:
            logger.error(f"截图错误: {str(e)}")
            messagebox.showerror("截图错误", str(e))
            self.restart_selection()

    # 初始化OCR引擎并处理截图
    def initialize_ocr_and_process(self):
        try:
            if self.ocr_model is None:
                self.load_win = self.show_loading("OCR模型正在加载中,请稍后...")
                self.root.after(100, self.check_ocr_model)  # 每100毫秒检查一次
            else:
                self.process_ocr()
                self.setup_main_ui()
                self.root.deiconify()

        except Exception as e:
            logger.error(f"OCR初始化失败: {str(e)}")
            if self.load_win:
                self.load_win.destroy()
            self.handle_ocr_init_error(str(e))

    def check_ocr_model(self):
        if self.ocr_model is None:
            self.root.after(100, self.check_ocr_model)  # 每100毫秒检查一次
        else:
            if self.load_win:
                self.load_win.destroy()
            self.process_ocr()
            self.setup_main_ui()
            self.root.deiconify()

    # 执行OCR处理
    def process_ocr(self):
        try:
            temp_image_path = save_temp_image(self.screenshot)
            result = self.ocr_model.ocr(str(temp_image_path), cls=True)
            temp_image_path.unlink()  # 确保临时文件被删除

            # 后处理识别结果,合并同一行的文字
            merged_text = self.merge_lines(result[0])

            self.recognized_text = merged_text
        except Exception as e:
            logger.error(f"OCR处理失败: {str(e)}")
            messagebox.showerror("识别错误", f"OCR处理失败: {str(e)}")
            self.restart_selection()

    # 合并同一行的文字
    def merge_lines(self, ocr_result):
        merged_text = []
        current_line = []
        current_y1 = None
        current_y2 = None
        line_threshold = 5  # 设置行间距阈值,可以根据需要调整
    
        for line in ocr_result:
            # 提取坐标点
            x1, y1 = line[0][0]  # 第一个坐标点
            x2, y2 = line[0][2]  # 第三个坐标点
            text = line[1][0]  # 提取文本
    
            if current_y1 is None or current_y2 is None:
                current_y1 = y1
                current_y2 = y2
                current_line.append(text)
            elif abs(y1 - current_y1) <= line_threshold and abs(y2 - current_y2) <= line_threshold:
                current_line.append(text)
            else:
                merged_text.append(" ".join(current_line))
                current_line = [text]
                current_y1 = y1
                current_y2 = y2
    
        if current_line:
            merged_text.append(" ".join(current_line))
    
        return "\n".join(merged_text)

    # 设置主界面UI
    def setup_main_ui(self):
        if self.main_frame is None:
            self.main_frame = ttk.Frame(self.root, padding=20)
            self.main_frame.grid(row=0, column=0, sticky="nsew")

            self.root.grid_rowconfigure(0, weight=1)
            self.root.grid_columnconfigure(0, weight=1)

            # 使用 PanedWindow 来分割图片框和文本框
            self.paned_window = ttk.PanedWindow(self.main_frame, orient=tk.VERTICAL)
            self.paned_window.grid(row=0, column=0, sticky="nsew")

            # 创建一个 Frame 来包含图片和滚动条
            self.image_frame = ttk.Frame(self.paned_window)
            self.image_frame.pack(fill=tk.BOTH, expand=True)

            # 使用 Canvas 来显示图片并添加滚动条
            self.image_canvas = tk.Canvas(self.image_frame, highlightbackground=self.root.cget("bg"), highlightthickness=0)
            self.image_canvas.pack(side=tk.LEFT, fill=tk.BOTH, expand=True)

            self.image_scrollbar = ttk.Scrollbar(self.image_frame, orient=tk.VERTICAL, command=self.image_canvas.yview)
            self.image_scrollbar.pack(side=tk.RIGHT, fill=tk.Y)
            self.image_canvas.config(yscrollcommand=self.image_scrollbar.set)

            self.image_canvas.bind("<Configure>", self.on_canvas_configure)

            self.image_container = ttk.Frame(self.image_canvas)
            self.image_container_id = self.image_canvas.create_window((0, 0), window=self.image_container, anchor="nw")

            self.img_label = ttk.Label(self.image_container)
            self.img_label.pack(fill=tk.BOTH, expand=True)

            # 定义字体
            custom_font = font.Font(family="Microsoft YaHei", size=9)

            self.text_area = tk.Text(
                self.paned_window,
                wrap=tk.WORD,
                font=custom_font,  # 设置字体
                height=15  # 初始高度设置为15行
            )
            self.text_area.pack(fill=tk.BOTH, expand=True)

            self.paned_window.add(self.image_frame)
            self.paned_window.add(self.text_area)

            btn_frame = ttk.Frame(self.main_frame)
            btn_frame.grid(row=1, column=0, sticky="ew", pady=10)

            # 确保按钮行不会被压缩
            self.main_frame.grid_rowconfigure(0, weight=1)
            self.main_frame.grid_rowconfigure(1, weight=0)

            ttk.Button(
                btn_frame,
                text="重新选择",
                command=self.restart_selection
            ).pack(side=tk.LEFT, padx=5)

            ttk.Button(
                btn_frame,
                text="复制文本",
                command=self.copy_result
            ).pack(side=tk.LEFT, padx=5)

            ttk.Button(
                btn_frame,
                text="保存图片",
                command=self.save_image
            ).pack(side=tk.LEFT, padx=5)

            ttk.Button(
                btn_frame,
                text="退出",
                command=self.safe_exit
            ).pack(side=tk.RIGHT, padx=5)

        # 设置窗口标题
        self.root.title("文字识别@PDM3")

        self.update_image_display()
        self.text_area.delete(1.0, tk.END)
        self.text_area.insert(tk.END, self.recognized_text.strip())
        self.update_text_area_height()  # 更新文本框高度

        # 设置窗口总是最顶层
        self.root.attributes('-topmost', True)

    # 更新图片显示
    def update_image_display(self):
        if self.screenshot:
            photo = ImageTk.PhotoImage(self.screenshot)
            self.img_label.config(image=photo)
            self.img_label.image = photo

            # 获取图片的实际大小
            img_width, img_height = self.screenshot.size

            # 获取屏幕高度
            screen_height = self.root.winfo_screenheight()

            # 计算图片框的最大高度
            max_image_height = screen_height // 2

            # 设置 Canvas 的滚动区域
            self.image_canvas.config(scrollregion=(0, 0, img_width, img_height))

            # 调整 image_canvas 的高度
            if img_height > max_image_height:
                self.image_canvas.config(height=max_image_height)
            else:
                self.image_canvas.config(height=img_height)

    # 配置 Canvas 大小
    def on_canvas_configure(self, event):
        # 更新 Canvas 的滚动区域
        self.image_canvas.config(scrollregion=self.image_canvas.bbox("all"))

    # 显示加载中的窗口
    def show_loading(self, message):
        load_win = tk.Toplevel()
        load_win.title("请稍候")

        frame = ttk.Frame(load_win, padding=20)
        frame.pack()

        ttk.Label(frame, text=message).pack(pady=10)
        progress = ttk.Progressbar(frame, mode='indeterminate')
        progress.pack(pady=5)
        progress.start()

        return load_win

    # 处理OCR初始化错误
    def handle_ocr_init_error(self, error_msg):
        choice = messagebox.askretrycancel(
            "OCR初始化失败",
            f"{error_msg}\n\n是否重试?",
            icon='error'
        )
        if choice:
            threading.Thread(target=self.initialize_ocr_and_process).start()
        else:
            self.safe_exit()

    # 重新开始截图选择
    def restart_selection(self):
        if self.root.winfo_exists():
            self.root.withdraw()
        self.screenshot = None
        self.recognized_text = ""
        self.clear_ui()
        self.start_selection()

    # 清理UI界面
    def clear_ui(self):
        if hasattr(self, 'img_label'):
            self.img_label.config(image='')
            self.img_label.image = None
        if hasattr(self, 'text_area'):
            self.text_area.delete(1.0, tk.END)

    # 复制识别结果到剪贴板
    def copy_result(self):
        self.root.clipboard_clear()
        self.root.clipboard_append(self.recognized_text)
        messagebox.showinfo("成功", "已复制到剪贴板")

    # 安全退出程序
    def safe_exit(self):
        if self.root.winfo_exists():
            self.root.destroy()
        sys.exit(0)

    # 显示程序崩溃错误信息
    def show_crash_message(self, message):
        crash_win = tk.Tk()
        crash_win.withdraw()
        messagebox.showerror("致命错误", message)
        crash_win.destroy()

    # 按下ESC键时退出程序
    def on_escape(self, event):
        self.selection_win.destroy()
        self.safe_exit()

    # 鼠标移动事件处理
    def on_mouse_move(self, event):
        current_x = event.x
        current_y = event.y
        self.update_crosshair(current_x, current_y)

    # 更新十字线位置
    def update_crosshair(self, x, y):
        self.clear_crosshair()
        self.crosshair_ids.append(
            self.canvas.create_line(0, y, self.canvas.winfo_width(), y,
                                   tags="crosshair", fill="yellow", width=2))
        self.crosshair_ids.append(
            self.canvas.create_line(x, 0, x, self.canvas.winfo_height(),
                                    tags="crosshair", fill="yellow", width=2))

    # 清除十字线
    def clear_crosshair(self):
        for crosshair_id in self.crosshair_ids:
            self.canvas.delete(crosshair_id)
        self.crosshair_ids = []

    # 保存图片
    def save_image(self):
        if self.screenshot:
            # 获取用户桌面路径
            desktop_path = Path.home() / 'Desktop'

            # 生成当前日期和时间的字符串
            current_datetime = datetime.now().strftime("%Y%m%d_%H%M%S")
            default_filename = f"screenshot_{current_datetime}.png"

            file_path = filedialog.asksaveasfilename(
                initialdir=desktop_path,  # 设置初始目录为用户桌面
                initialfile=default_filename,  # 设置默认文件名
                defaultextension=".png",
                filetypes=[("PNG files", "*.png"), ("JPEG files", "*.jpg"), ("All files", "*.*")]
            )
            if file_path:
                self.screenshot.save(file_path)
                messagebox.showinfo("保存成功", f"图片已保存到 {file_path}")

    # 更新文本框高度
    def update_text_area_height(self):
        # 计算当前文本行数
        line_count = int(self.text_area.index('end-1c').split('.')[0])
        if line_count > 15:
            self.text_area.config(height=15)  # 如果行数超过15行,固定高度为15行
        else:
            self.text_area.config(height=line_count)  # 否则根据内容调整高度

    # 运行主循环
    def run(self):
        self.root.mainloop()

if __name__ == "__main__":
    app = OCRApp()
    app.run()
相关推荐
红树林073 分钟前
BeautifulSoup 的页面中需要获取某个元素的 xpath 路径
前端·python·网络爬虫·beautifulsoup
jieyu111919 分钟前
Python 实战:Web 漏洞 Python POC 代码及原理详解(1)
python·web安全
QQLOVEYY1 小时前
Python和PyCharm的安装教程
python·pycharm
想名字好难啊竟然不止我一个1 小时前
清除 Pip 缓存, 释放磁盘空间
python·缓存·pip
Eiceblue2 小时前
Python 快速提取扫描件 PDF 中的文本:OCR 实操教程
vscode·python·ocr·1024程序员节
APIshop2 小时前
淘宝/天猫 API 接口深度解析:商品详情获取与按图搜索商品(拍立淘)实战指南
python·1024程序员节
WangYan20222 小时前
ArcGIS Pro与Python下空间数据采集与管理——涵盖矢量、栅格、GPS、点云、多维数据与遥感云平台等
python·arcgis pro·空间数据采集与管理
浔川python社2 小时前
浔川社团总访问量超 13 万:数据见证成长,热爱铸就辉煌
python
AI_56782 小时前
脑科学支持的Python学习法:每天2小时碎片化训练,用‘神经可塑性’打败拖延症“
开发语言·python·学习
合作小小程序员小小店2 小时前
大屏开发,在线歌词舆情分析系统demo,基于python,flask,web,echart,nlp,自然语言数据库mysql。
后端·python·flask·nlp·echarts