PaddleOCR 截图自动文字识别

春节假期在家无聊，撸了三个小工具：PC截图+编辑/PC录屏(用于meeting录屏)/PC截屏文字识别。因为感觉这三个小工具是工作中常常需要用到的，github上也有很多开源的，不过总有点或多或少的小问题，不利于自己的使用。脚本的编写尽量减少对三方库的使用。
已全部完成，这是其中的一个，后续将三个集成在一个工具中。
python 复制代码
import tkinter as tk
from tkinter import ttk, messagebox, font, filedialog
from PIL import Image, ImageTk, ImageGrab
import sys
import tempfile
import threading
from pathlib import Path
import ctypes
import logging.handlers
from datetime import datetime

# 最小化控制台窗口
def minimize_console():
    """Minimize the console window attached to this process, if there is one.

    Does nothing on platforms without the Win32 API (``ctypes.windll`` only
    exists on Windows) or when the process has no console window.
    """
    windll = getattr(ctypes, "windll", None)
    if windll is None:
        return  # Not running on Windows: there is no console window to minimize.

    console_hwnd = windll.kernel32.GetConsoleWindow()
    if console_hwnd:  # A zero handle means no console is attached.
        windll.user32.ShowWindow(console_hwnd, 6)  # 6 == SW_MINIMIZE


minimize_console()  # Executed at import time so the console is minimized on startup.

# 获取脚本所在目录路径
def get_script_directory():
    """Return the directory that contains this script file as a ``Path``."""
    script_path = Path(__file__)
    return script_path.parent

# 配置日志文件路径和日志级别
log_file_path = get_script_directory() / 'ocr_errors.log'
# A single rotating handler owns the log file. Passing ``filename=`` to
# basicConfig *and* adding a RotatingFileHandler for the same path would
# attach two handlers to the root logger: every record would be written twice
# (once without the format string) and the extra open handle can block
# rotation on Windows.
handler = logging.handlers.RotatingFileHandler(
    log_file_path,
    maxBytes=1024*1024*5,
    backupCount=3,
    encoding='utf-8',  # Log messages contain Chinese text; do not depend on the locale codec.
)
logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[handler],
)
logger = logging.getLogger()

# 保存临时图片到磁盘
def save_temp_image(image, suffix='.png'):
    """Write *image* to a new temporary file and return its path.

    Args:
        image: Object exposing ``save(filename)`` (e.g. a PIL image).
        suffix: File-name suffix; it determines the format ``save`` picks.

    Returns:
        ``Path`` of the file that was written. The caller is responsible for
        deleting it.

    Raises:
        Whatever ``image.save`` raises. In that case the temporary file is
        removed before the exception propagates, so nothing is left behind.
    """
    # Only reserve a unique name here; the handle is closed before saving so
    # that ``image.save`` is the sole writer of the file.
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_file:
        temp_path = Path(temp_file.name)

    try:
        image.save(temp_file.name)
    except Exception:
        # delete=False means nobody else will clean this up.
        try:
            temp_path.unlink()
        except FileNotFoundError:
            pass
        raise
    return temp_path

class OCRApp:
    def __init__(self):
        """Create the hidden root window, start loading the OCR model in the
        background and immediately open the screenshot selection overlay.

        Any exception raised while doing so is shown in an error dialog and
        the process exits with status 1.
        """
        try:
            root = tk.Tk()
            self.root = root
            root.withdraw()

            # Width is fixed, height stays resizable (the original author uses
            # this to disable the maximize button).
            root.resizable(False, True)

            # State shared between the selection, OCR and result stages.
            self.load_win = None
            self.main_frame = None
            self.recognized_text = ""
            self.ocr_model = None  # Filled in by the background loader thread.
            self.screenshot = None

            # Load the model off the UI thread so selection can start right away.
            loader = threading.Thread(target=self.load_ocr_model, daemon=True)
            loader.start()

            self.start_selection()

        except Exception as e:
            self.show_crash_message(f"程序启动失败: {str(e)}")
            sys.exit(1)

    def load_ocr_model(self):
        """Import PaddleOCR and build the model, storing it in ``self.ocr_model``.

        Intended to run on a background thread. On failure the exception is
        logged with its traceback, ``self.ocr_model`` is left untouched and a
        description of the problem is recorded in ``self.ocr_load_error`` so a
        failed load can be told apart from one that is still in progress.
        """
        self.ocr_load_error = None
        try:
            # Imported inside the try block: a missing or broken paddleocr
            # installation must be reported like any other load failure
            # instead of silently killing the thread.
            from paddleocr import PaddleOCR
            self.ocr_model = PaddleOCR(use_angle_cls=True, show_log=False, lang='ch')
        except Exception as e:
            self.ocr_load_error = str(e) or type(e).__name__
            logger.exception(f"OCR模型加载失败: {str(e)}")

    # 开始截图选择区域
    def start_selection(self):
        """Open a full-screen, semi-transparent overlay on which the user
        drags out the region to capture."""
        overlay = self.selection_win = tk.Toplevel()
        for option, value in (("-fullscreen", True), ("-alpha", 0.3)):
            overlay.attributes(option, value)

        # ESC anywhere on the overlay aborts the capture.
        overlay.bind("<Escape>", self.on_escape)

        canvas_options = {
            "cursor": "cross",
            "bg": "gray30",
            "highlightthickness": 0,
        }
        canvas = self.canvas = tk.Canvas(overlay, **canvas_options)
        canvas.pack(fill=tk.BOTH, expand=True)

        # Fresh selection state.
        self.start_x = self.start_y = 0
        self.rect_id = None
        self.crosshair_ids = []

        mouse_bindings = (
            ("<Button-1>", self.on_mouse_down),
            ("<B1-Motion>", self.on_mouse_drag),
            ("<ButtonRelease-1>", self.on_mouse_up),
            ("<Motion>", self.on_mouse_move),
        )
        for sequence, callback in mouse_bindings:
            canvas.bind(sequence, callback)

        # Hint shown in the top-left corner of the overlay.
        self.escape_label = tk.Label(
            overlay,
            text="按ESC键退出截图",
            fg="yellow",
            bg="gray20",
            font=("Helvetica", 12, "bold"),
        )
        self.escape_label.place(x=10, y=10)

        self.update_crosshair(0, 0)

    # 鼠标按下事件处理
    def on_mouse_down(self, event):
        self.start_x = event.x
        self.start_y = event.y
        self.clear_crosshair()
        if self.rect_id:
            self.canvas.delete(self.rect_id)
            self.rect_id = None

    # 鼠标拖动事件处理
    def on_mouse_drag(self, event):
        current_x = event.x
        current_y = event.y

        if self.rect_id:
            self.canvas.coords(self.rect_id, self.start_x, self.start_y, current_x, current_y)
        else:
            self.rect_id = self.canvas.create_rectangle(
                self.start_x, self.start_y,
                current_x, current_y,
                outline="blue", width=2, fill="gray75", tags="rect"
            )

    # 鼠标释放事件处理
    def on_mouse_up(self, event):
        """Finish the drag: validate the region, capture it and start OCR.

        The overlay is hidden before the screen is grabbed; otherwise the
        translucent gray overlay and the selection rectangle would be part of
        the captured image. If anything fails the error is logged and shown,
        the current overlay is torn down and a new selection is started.
        """
        try:
            x1, x2 = sorted((self.start_x, event.x))
            y1, y2 = sorted((self.start_y, event.y))
            width, height = x2 - x1, y2 - y1

            if width < 10 or height < 10:
                raise ValueError("选区过小，请选择更大的区域")
            if width > self.canvas.winfo_width() or height > self.canvas.winfo_height():
                raise ValueError("选区过大，请选择更小的区域")

            # Take the overlay off screen and let Tk process the request
            # before grabbing, so the capture shows the real screen content.
            self.selection_win.withdraw()
            self.selection_win.update()

            self.screenshot = ImageGrab.grab(bbox=(x1, y1, x2, y2))
            self.selection_win.destroy()
            self.initialize_ocr_and_process()

        except Exception as e:
            logger.error(f"截图错误: {str(e)}")
            messagebox.showerror("截图错误", str(e))
            # restart_selection() opens a brand-new overlay; destroy the
            # current one first so overlays do not pile up on top of each other.
            if self.selection_win.winfo_exists():
                self.selection_win.destroy()
            self.restart_selection()

    # 初始化OCR引擎并处理截图
    def initialize_ocr_and_process(self):
        try:
            if self.ocr_model is None:
                self.load_win = self.show_loading("OCR模型正在加载中，请稍后...")
                self.root.after(100, self.check_ocr_model)  # 每100毫秒检查一次
            else:
                self.process_ocr()
                self.setup_main_ui()
                self.root.deiconify()

        except Exception as e:
            logger.error(f"OCR初始化失败: {str(e)}")
            if self.load_win:
                self.load_win.destroy()
            self.handle_ocr_init_error(str(e))

    def check_ocr_model(self):
        if self.ocr_model is None:
            self.root.after(100, self.check_ocr_model)  # 每100毫秒检查一次
        else:
            if self.load_win:
                self.load_win.destroy()
            self.process_ocr()
            self.setup_main_ui()
            self.root.deiconify()

    # 执行OCR处理
    def process_ocr(self):
        """Run OCR on ``self.screenshot`` and store the text in ``self.recognized_text``.

        The screenshot is written to a temporary file for the OCR call; that
        file is removed afterwards even when recognition raises. An image in
        which no text is detected yields an empty string.

        On failure the error is logged and shown to the user and a new
        selection is started via ``restart_selection`` (which resets
        ``self.screenshot`` to ``None``).
        """
        try:
            temp_image_path = save_temp_image(self.screenshot)
            try:
                result = self.ocr_model.ocr(str(temp_image_path), cls=True)
            finally:
                # Always clean up, including when ocr() raises.
                try:
                    temp_image_path.unlink()
                except OSError as cleanup_error:
                    # A leftover temp file should not fail the recognition.
                    logger.warning(f"临时文件删除失败: {cleanup_error}")

            # The first entry holds the detections for the single input image;
            # it is None/empty when no text was found.
            detections = result[0] if result else None

            # Post-process: merge fragments that sit on the same text line.
            self.recognized_text = self.merge_lines(detections or [])
        except Exception as e:
            logger.error(f"OCR处理失败: {str(e)}")
            messagebox.showerror("识别错误", f"OCR处理失败: {str(e)}")
            self.restart_selection()

    # 合并同一行的文字
    def merge_lines(self, ocr_result):
        merged_text = []
        current_line = []
        current_y1 = None
        current_y2 = None
        line_threshold = 5  # 设置行间距阈值，可以根据需要调整
    
        for line in ocr_result:
            # 提取坐标点
            x1, y1 = line[0][0]  # 第一个坐标点
            x2, y2 = line[0][2]  # 第三个坐标点
            text = line[1][0]  # 提取文本
    
            if current_y1 is None or current_y2 is None:
                current_y1 = y1
                current_y2 = y2
                current_line.append(text)
            elif abs(y1 - current_y1) <= line_threshold and abs(y2 - current_y2) <= line_threshold:
                current_line.append(text)
            else:
                merged_text.append(" ".join(current_line))
                current_line = [text]
                current_y1 = y1
                current_y2 = y2
    
        if current_line:
            merged_text.append(" ".join(current_line))
    
        return "\n".join(merged_text)

    # 设置主界面UI
    def setup_main_ui(self):
        """Build the result window on first use and fill it with the current result.

        The widgets (image pane, text pane, button row) are created only when
        ``self.main_frame`` is still ``None``. On every call the window title
        is set, the image display is refreshed, the text area is replaced with
        ``self.recognized_text`` (stripped), its height is updated and the
        window is made topmost.
        """
        if self.main_frame is None:
            self.main_frame = ttk.Frame(self.root, padding=20)
            self.main_frame.grid(row=0, column=0, sticky="nsew")

            self.root.grid_rowconfigure(0, weight=1)
            self.root.grid_columnconfigure(0, weight=1)

            # Use a PanedWindow to split the image pane from the text pane
            self.paned_window = ttk.PanedWindow(self.main_frame, orient=tk.VERTICAL)
            self.paned_window.grid(row=0, column=0, sticky="nsew")

            # Frame that holds the image canvas and its scrollbar
            self.image_frame = ttk.Frame(self.paned_window)
            self.image_frame.pack(fill=tk.BOTH, expand=True)

            # Show the image inside a Canvas so it can be scrolled vertically
            self.image_canvas = tk.Canvas(self.image_frame, highlightbackground=self.root.cget("bg"), highlightthickness=0)
            self.image_canvas.pack(side=tk.LEFT, fill=tk.BOTH, expand=True)

            self.image_scrollbar = ttk.Scrollbar(self.image_frame, orient=tk.VERTICAL, command=self.image_canvas.yview)
            self.image_scrollbar.pack(side=tk.RIGHT, fill=tk.Y)
            self.image_canvas.config(yscrollcommand=self.image_scrollbar.set)

            self.image_canvas.bind("<Configure>", self.on_canvas_configure)

            # The label showing the screenshot lives in a frame embedded in the canvas
            self.image_container = ttk.Frame(self.image_canvas)
            self.image_container_id = self.image_canvas.create_window((0, 0), window=self.image_container, anchor="nw")

            self.img_label = ttk.Label(self.image_container)
            self.img_label.pack(fill=tk.BOTH, expand=True)

            # Font used by the text area
            custom_font = font.Font(family="Microsoft YaHei", size=9)

            self.text_area = tk.Text(
                self.paned_window,
                wrap=tk.WORD,
                font=custom_font,  # Apply the font
                height=15  # Initial height: 15 lines
            )
            self.text_area.pack(fill=tk.BOTH, expand=True)

            self.paned_window.add(self.image_frame)
            self.paned_window.add(self.text_area)

            btn_frame = ttk.Frame(self.main_frame)
            btn_frame.grid(row=1, column=0, sticky="ew", pady=10)

            # Make sure the button row is never squeezed: only row 0 stretches
            self.main_frame.grid_rowconfigure(0, weight=1)
            self.main_frame.grid_rowconfigure(1, weight=0)

            ttk.Button(
                btn_frame,
                text="重新选择",
                command=self.restart_selection
            ).pack(side=tk.LEFT, padx=5)

            ttk.Button(
                btn_frame,
                text="复制文本",
                command=self.copy_result
            ).pack(side=tk.LEFT, padx=5)

            ttk.Button(
                btn_frame,
                text="保存图片",
                command=self.save_image
            ).pack(side=tk.LEFT, padx=5)

            ttk.Button(
                btn_frame,
                text="退出",
                command=self.safe_exit
            ).pack(side=tk.RIGHT, padx=5)

        # Set the window title
        self.root.title("文字识别@PDM3")

        self.update_image_display()
        self.text_area.delete(1.0, tk.END)
        self.text_area.insert(tk.END, self.recognized_text.strip())
        self.update_text_area_height()  # Fit the text area height to the content

        # Keep the window above all others
        self.root.attributes('-topmost', True)

    # 更新图片显示
    def update_image_display(self):
        """Show the current screenshot in the image pane and size the canvas.

        The canvas height follows the image height but is capped at half the
        screen height; the scroll region always covers the whole image.
        Does nothing when there is no screenshot.
        """
        if not self.screenshot:
            return

        tk_image = ImageTk.PhotoImage(self.screenshot)
        self.img_label.config(image=tk_image)
        # Keep a reference on the widget so the image is not garbage-collected.
        self.img_label.image = tk_image

        img_width, img_height = self.screenshot.size
        height_cap = self.root.winfo_screenheight() // 2

        canvas = self.image_canvas
        canvas.config(scrollregion=(0, 0, img_width, img_height))
        canvas.config(height=min(img_height, height_cap))

    # 配置 Canvas 大小
    def on_canvas_configure(self, event):
        # 更新 Canvas 的滚动区域
        self.image_canvas.config(scrollregion=self.image_canvas.bbox("all"))

    # 显示加载中的窗口
    def show_loading(self, message):
        """Open a small window showing *message* above a running indeterminate
        progress bar and return that window."""
        load_win = tk.Toplevel()
        load_win.title("请稍候")

        body = ttk.Frame(load_win, padding=20)
        body.pack()

        caption = ttk.Label(body, text=message)
        caption.pack(pady=10)

        spinner = ttk.Progressbar(body, mode='indeterminate')
        spinner.pack(pady=5)
        spinner.start()

        return load_win

    # 处理OCR初始化错误
    def handle_ocr_init_error(self, error_msg):
        """Ask the user whether to retry after an OCR initialization failure.

        Args:
            error_msg: Text describing the failure, shown in the dialog.

        On "retry" the OCR pipeline is re-run from the Tk event loop rather
        than from a new thread, because it creates and updates widgets. If the
        instance carries a recorded model-load failure (``ocr_load_error``)
        and still has no model, the background load is started again first.
        On "cancel" the application exits.
        """
        retry = messagebox.askretrycancel(
            "OCR初始化失败",
            f"{error_msg}\n\n是否重试？",
            icon='error'
        )
        if not retry:
            self.safe_exit()
            return

        if self.ocr_model is None and getattr(self, "ocr_load_error", None) is not None:
            # The previous load attempt has ended in failure: clear the marker
            # here (before the thread starts) and try loading again.
            self.ocr_load_error = None
            threading.Thread(target=self.load_ocr_model, daemon=True).start()

        # Deferred with after() so the retry runs on the UI thread and does
        # not recurse into the caller that reported the error.
        self.root.after(0, self.initialize_ocr_and_process)

    # 重新开始截图选择
    def restart_selection(self):
        if self.root.winfo_exists():
            self.root.withdraw()
        self.screenshot = None
        self.recognized_text = ""
        self.clear_ui()
        self.start_selection()

    # 清理UI界面
    def clear_ui(self):
        if hasattr(self, 'img_label'):
            self.img_label.config(image='')
            self.img_label.image = None
        if hasattr(self, 'text_area'):
            self.text_area.delete(1.0, tk.END)

    # 复制识别结果到剪贴板
    def copy_result(self):
        """Replace the clipboard content with the recognized text and confirm
        with a message box."""
        clipboard_owner = self.root
        clipboard_owner.clipboard_clear()
        clipboard_owner.clipboard_append(self.recognized_text)
        messagebox.showinfo("成功", "已复制到剪贴板")

    # 安全退出程序
    def safe_exit(self):
        if self.root.winfo_exists():
            self.root.destroy()
        sys.exit(0)

    # 显示程序崩溃错误信息
    def show_crash_message(self, message):
        """Show *message* in a fatal-error dialog, using a temporary hidden
        root window that is destroyed afterwards."""
        hidden_root = tk.Tk()
        hidden_root.withdraw()
        messagebox.showerror("致命错误", message)
        hidden_root.destroy()

    # 按下ESC键时退出程序
    def on_escape(self, event):
        self.selection_win.destroy()
        self.safe_exit()

    # 鼠标移动事件处理
    def on_mouse_move(self, event):
        current_x = event.x
        current_y = event.y
        self.update_crosshair(current_x, current_y)

    # 更新十字线位置
    def update_crosshair(self, x, y):
        self.clear_crosshair()
        self.crosshair_ids.append(
            self.canvas.create_line(0, y, self.canvas.winfo_width(), y,
                                   tags="crosshair", fill="yellow", width=2))
        self.crosshair_ids.append(
            self.canvas.create_line(x, 0, x, self.canvas.winfo_height(),
                                    tags="crosshair", fill="yellow", width=2))

    # 清除十字线
    def clear_crosshair(self):
        for crosshair_id in self.crosshair_ids:
            self.canvas.delete(crosshair_id)
        self.crosshair_ids = []

    # 保存图片
    def save_image(self):
        """Ask for a destination and save the current screenshot there.

        The dialog starts on the user's Desktop with a timestamped PNG name.
        When a JPEG file name is chosen, images in a mode other than RGB or L
        are converted to RGB first, since JPEG cannot store an alpha channel.
        Save failures (unknown extension, unwritable path, ...) are logged and
        reported in an error dialog instead of escaping from the Tk callback.
        Does nothing when there is no screenshot or the dialog is cancelled.
        """
        if not self.screenshot:
            return

        # Default location and name: Desktop / screenshot_<date>_<time>.png
        desktop_path = Path.home() / 'Desktop'
        current_datetime = datetime.now().strftime("%Y%m%d_%H%M%S")
        default_filename = f"screenshot_{current_datetime}.png"

        file_path = filedialog.asksaveasfilename(
            initialdir=desktop_path,
            initialfile=default_filename,
            defaultextension=".png",
            filetypes=[("PNG files", "*.png"), ("JPEG files", "*.jpg"), ("All files", "*.*")]
        )
        if not file_path:
            return  # Dialog cancelled.

        try:
            image = self.screenshot
            wants_jpeg = Path(file_path).suffix.lower() in ('.jpg', '.jpeg')
            if wants_jpeg and image.mode not in ('RGB', 'L'):
                image = image.convert('RGB')
            image.save(file_path)
        except (OSError, ValueError) as e:
            logger.error(f"图片保存失败: {e}")
            messagebox.showerror("保存失败", f"图片保存失败: {e}")
            return

        messagebox.showinfo("保存成功", f"图片已保存到 {file_path}")

    # 更新文本框高度
    def update_text_area_height(self):
        # 计算当前文本行数
        line_count = int(self.text_area.index('end-1c').split('.')[0])
        if line_count > 15:
            self.text_area.config(height=15)  # 如果行数超过15行，固定高度为15行
        else:
            self.text_area.config(height=line_count)  # 否则根据内容调整高度

    # 运行主循环
    def run(self):
        self.root.mainloop()

# Script entry point: constructing OCRApp immediately opens the screenshot
# selection overlay; run() then enters the Tk event loop.
if __name__ == "__main__":
    app = OCRApp()
    app.run()