PaddleOCR 截图自动文字识别

春节假期在家无聊,撸了三个小工具:PC截图+编辑/PC录屏(用于meeting录屏)/PC截屏文字识别。因为感觉这三个小工具是工作中常常需要用到的,github上也有很多开源的,不过总有点或多或少的小问题,不利于自己的使用。脚本的编写尽量减少对三方库的使用。

已全部完成,这是其中的一个,后续会将三个工具集成到一个工具中。

python 复制代码
import tkinter as tk
from tkinter import ttk, messagebox, font, filedialog
from PIL import Image, ImageTk, ImageGrab
import sys
import tempfile
import threading
from pathlib import Path
import ctypes
import logging.handlers
from datetime import datetime

# 最小化控制台窗口
def minimize_console():
    """Minimize the console window attached to this process, if there is one.

    Does nothing on non-Windows platforms (``ctypes.windll`` only exists on
    Windows) and when the process has no console window.
    """
    if sys.platform != "win32":
        return

    console_hwnd = ctypes.windll.kernel32.GetConsoleWindow()
    # GetConsoleWindow returns a null handle when no console is attached.
    if console_hwnd:
        ctypes.windll.user32.ShowWindow(console_hwnd, 6)  # 6 == SW_MINIMIZE

minimize_console()  # Run the minimize helper as soon as the module is executed

# 获取脚本所在目录路径
def get_script_directory():
    """Return the directory that contains this source file as a ``Path``."""
    script_path = Path(__file__)
    return script_path.parent

# 配置日志文件路径和日志级别
# Log file lives next to the script.
log_file_path = get_script_directory() / 'ocr_errors.log'

# A single rotating handler (5 MiB per file, 3 backups) is the only handler on
# the root logger. Combining basicConfig(filename=...) with a second handler on
# the same file would write every record twice (once unformatted) and keep a
# second open handle on the file, which can make rotation fail on Windows.
# UTF-8 is set explicitly because the log messages contain non-ASCII text.
handler = logging.handlers.RotatingFileHandler(
    log_file_path,
    maxBytes=1024*1024*5,
    backupCount=3,
    encoding='utf-8',
)
logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[handler],  # basicConfig applies ``format`` to this handler
)
logger = logging.getLogger()

# 保存临时图片到磁盘
def save_temp_image(image, suffix='.png'):
    """Write *image* to a new temporary file and return its path.

    Args:
        image: Object exposing ``save(filename)`` (the caller in this file
            passes a PIL image).
        suffix: File name suffix for the temporary file.

    Returns:
        ``Path`` of the file that was written. The caller is responsible for
        deleting it.

    Raises:
        Whatever ``image.save`` raises; the temporary file is removed first so
        a failed save does not leave a stray file behind.
    """
    # Only reserve a unique name here; the handle is closed before saving so
    # the image writer is the sole opener of the file.
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_file:
        temp_path = Path(temp_file.name)

    try:
        image.save(str(temp_path))
    except Exception:
        if temp_path.exists():
            temp_path.unlink()
        raise
    return temp_path

class OCRApp:
    def __init__(self):
        """Create the hidden root window, start loading the OCR model on a
        background thread, and open the screenshot selection overlay.

        Any exception raised during start-up is shown in an error dialog and
        the process exits with status 1.
        """
        try:
            window = tk.Tk()
            self.root = window
            window.withdraw()

            # Width is fixed, height stays resizable.
            window.resizable(False, True)

            # Application state; the model is filled in later by the loader thread.
            self.load_win = None
            self.main_frame = None
            self.recognized_text = ""
            self.ocr_model = None
            self.screenshot = None

            # Load the model in the background so the selection overlay can
            # appear right away.
            model_loader = threading.Thread(target=self.load_ocr_model, daemon=True)
            model_loader.start()

            self.start_selection()

        except Exception as e:
            self.show_crash_message(f"程序启动失败: {str(e)}")
            sys.exit(1)

    def load_ocr_model(self):
        """Import PaddleOCR and build the recognizer into ``self.ocr_model``.

        ``__init__`` runs this on a background thread. On any failure the
        error is written to the log with its traceback and ``self.ocr_model``
        is left unchanged.
        """
        try:
            # The import is inside the ``try`` so that a missing or broken
            # paddleocr installation is recorded in the log file instead of
            # only surfacing as an unhandled exception in the worker thread.
            from paddleocr import PaddleOCR
            self.ocr_model = PaddleOCR(use_angle_cls=True, show_log=False, lang='ch')
        except Exception as e:
            logger.exception(f"OCR模型加载失败: {str(e)}")

    # 开始截图选择区域
    def start_selection(self):
        """Open a fullscreen, semi-transparent overlay for choosing a region.

        Creates the overlay window, its canvas and the ESC hint label, resets
        the drag state, wires up the mouse/keyboard handlers and draws the
        initial crosshair at (0, 0).
        """
        overlay = tk.Toplevel()
        self.selection_win = overlay
        for option, value in (("-fullscreen", True), ("-alpha", 0.3)):
            overlay.attributes(option, value)

        # ESC anywhere in the overlay window triggers on_escape.
        overlay.bind("<Escape>", self.on_escape)

        # Drag state for the selection rectangle and crosshair lines.
        self.crosshair_ids = []
        self.rect_id = None
        self.start_x = self.start_y = 0

        surface = tk.Canvas(
            overlay,
            cursor="cross",
            bg="gray30",
            highlightthickness=0
        )
        self.canvas = surface
        surface.pack(fill=tk.BOTH, expand=True)

        mouse_handlers = (
            ("<Button-1>", self.on_mouse_down),
            ("<B1-Motion>", self.on_mouse_drag),
            ("<ButtonRelease-1>", self.on_mouse_up),
            ("<Motion>", self.on_mouse_move),
        )
        for sequence, callback in mouse_handlers:
            surface.bind(sequence, callback)

        hint = tk.Label(
            overlay,
            text="按ESC键退出截图",
            fg="yellow",
            bg="gray20",
            font=("Helvetica", 12, "bold")
        )
        self.escape_label = hint
        hint.place(x=10, y=10)

        self.update_crosshair(0, 0)

    # 鼠标按下事件处理
    def on_mouse_down(self, event):
        self.start_x = event.x
        self.start_y = event.y
        self.clear_crosshair()
        if self.rect_id:
            self.canvas.delete(self.rect_id)
            self.rect_id = None

    # 鼠标拖动事件处理
    def on_mouse_drag(self, event):
        current_x = event.x
        current_y = event.y

        if self.rect_id:
            self.canvas.coords(self.rect_id, self.start_x, self.start_y, current_x, current_y)
        else:
            self.rect_id = self.canvas.create_rectangle(
                self.start_x, self.start_y,
                current_x, current_y,
                outline="blue", width=2, fill="gray75", tags="rect"
            )

    # 鼠标释放事件处理
    def on_mouse_up(self, event):
        """Validate the dragged region, capture it and hand it to OCR.

        The region spans the drag origin and the release point. Regions
        smaller than 10 pixels on either side, or larger than the canvas, are
        rejected. On any error the message is logged and shown to the user,
        the current overlay is closed, and a fresh selection is started.
        """
        try:
            x1, x2 = sorted((self.start_x, event.x))
            y1, y2 = sorted((self.start_y, event.y))
            width, height = x2 - x1, y2 - y1

            if width < 10 or height < 10:
                raise ValueError("选区过小,请选择更大的区域")
            if width > self.canvas.winfo_width() or height > self.canvas.winfo_height():
                raise ValueError("选区过大,请选择更小的区域")

            self.screenshot = ImageGrab.grab(bbox=(x1, y1, x2, y2))
            self.selection_win.destroy()
            self.initialize_ocr_and_process()

        except Exception as e:
            logger.error(f"截图错误: {str(e)}")
            messagebox.showerror("截图错误", str(e))
            # restart_selection() opens a brand-new overlay, so the current
            # one must be closed first; otherwise every rejected selection
            # leaves another fullscreen window stacked on the screen.
            try:
                if self.selection_win.winfo_exists():
                    self.selection_win.destroy()
            except tk.TclError:
                # The overlay (or the whole application) is already gone.
                pass
            self.restart_selection()

    # 初始化OCR引擎并处理截图
    def initialize_ocr_and_process(self):
        try:
            if self.ocr_model is None:
                self.load_win = self.show_loading("OCR模型正在加载中,请稍后...")
                self.root.after(100, self.check_ocr_model)  # 每100毫秒检查一次
            else:
                self.process_ocr()
                self.setup_main_ui()
                self.root.deiconify()

        except Exception as e:
            logger.error(f"OCR初始化失败: {str(e)}")
            if self.load_win:
                self.load_win.destroy()
            self.handle_ocr_init_error(str(e))

    def check_ocr_model(self):
        if self.ocr_model is None:
            self.root.after(100, self.check_ocr_model)  # 每100毫秒检查一次
        else:
            if self.load_win:
                self.load_win.destroy()
            self.process_ocr()
            self.setup_main_ui()
            self.root.deiconify()

    # 执行OCR处理
    def process_ocr(self):
        """Run the OCR model on ``self.screenshot`` and store the merged text.

        The screenshot is written to a temporary file, recognized, and the
        result lines are merged into ``self.recognized_text``. A result with
        no detected text yields an empty string. On failure the error is
        logged and shown, and a new selection is started.
        """
        try:
            temp_image_path = save_temp_image(self.screenshot)
            try:
                result = self.ocr_model.ocr(str(temp_image_path), cls=True)
            finally:
                # Remove the temporary file even when recognition raises.
                temp_image_path.unlink()

            # PaddleOCR reports "no text found" as ``[None]``; treat that (and
            # an empty result) as empty text rather than as an error.
            detected_lines = result[0] if result else None
            if detected_lines:
                self.recognized_text = self.merge_lines(detected_lines)
            else:
                self.recognized_text = ""
        except Exception as e:
            logger.error(f"OCR处理失败: {str(e)}")
            messagebox.showerror("识别错误", f"OCR处理失败: {str(e)}")
            self.restart_selection()

    # 合并同一行的文字
    def merge_lines(self, ocr_result, line_threshold=5):
        """Join recognized fragments that sit on the same visual line.

        Args:
            ocr_result: Sequence of entries where ``entry[0]`` is a list of
                ``(x, y)`` corner points and ``entry[1][0]`` is the text.
                ``None`` or an empty sequence is accepted.
                (Assumes corner 0 is the top-left and corner 2 the
                bottom-right of the box — confirm against the OCR backend.)
            line_threshold: Maximum difference, in pixels, between the y
                coordinates of two boxes for them to count as the same line.

        Returns:
            The text with fragments of one line joined by a space and lines
            joined by ``"\\n"``; ``""`` when there is nothing to merge.
        """
        if not ocr_result:
            return ""

        merged_text = []
        current_line = []
        # y coordinates of the first box of the line being collected.
        anchor_top = anchor_bottom = None

        for entry in ocr_result:
            top = entry[0][0][1]
            bottom = entry[0][2][1]
            text = entry[1][0]

            on_current_line = (
                anchor_top is not None
                and abs(top - anchor_top) <= line_threshold
                and abs(bottom - anchor_bottom) <= line_threshold
            )
            if on_current_line:
                current_line.append(text)
                continue

            # Start a new line, flushing the previous one if there was any.
            if current_line:
                merged_text.append(" ".join(current_line))
            current_line = [text]
            anchor_top, anchor_bottom = top, bottom

        merged_text.append(" ".join(current_line))
        return "\n".join(merged_text)

    # 设置主界面UI
    def setup_main_ui(self):
        """Build the main window on first use and fill it with the current result.

        The widgets (image pane, text pane, button row) are created only when
        ``self.main_frame`` is still ``None``. Every call then sets the window
        title, refreshes the displayed screenshot, replaces the text area
        content with ``self.recognized_text`` and marks the window topmost.
        """
        if self.main_frame is None:
            self.main_frame = ttk.Frame(self.root, padding=20)
            self.main_frame.grid(row=0, column=0, sticky="nsew")

            self.root.grid_rowconfigure(0, weight=1)
            self.root.grid_columnconfigure(0, weight=1)

            # A vertical PanedWindow separates the image pane from the text pane.
            self.paned_window = ttk.PanedWindow(self.main_frame, orient=tk.VERTICAL)
            self.paned_window.grid(row=0, column=0, sticky="nsew")

            # Frame holding the image canvas and its scrollbar.
            self.image_frame = ttk.Frame(self.paned_window)
            self.image_frame.pack(fill=tk.BOTH, expand=True)

            # The image is shown inside a Canvas so it can be scrolled vertically.
            self.image_canvas = tk.Canvas(self.image_frame, highlightbackground=self.root.cget("bg"), highlightthickness=0)
            self.image_canvas.pack(side=tk.LEFT, fill=tk.BOTH, expand=True)

            self.image_scrollbar = ttk.Scrollbar(self.image_frame, orient=tk.VERTICAL, command=self.image_canvas.yview)
            self.image_scrollbar.pack(side=tk.RIGHT, fill=tk.Y)
            self.image_canvas.config(yscrollcommand=self.image_scrollbar.set)

            self.image_canvas.bind("<Configure>", self.on_canvas_configure)

            # The label carrying the image lives in a frame embedded in the canvas.
            self.image_container = ttk.Frame(self.image_canvas)
            self.image_container_id = self.image_canvas.create_window((0, 0), window=self.image_container, anchor="nw")

            self.img_label = ttk.Label(self.image_container)
            self.img_label.pack(fill=tk.BOTH, expand=True)

            # Font used by the text area.
            custom_font = font.Font(family="Microsoft YaHei", size=9)

            self.text_area = tk.Text(
                self.paned_window,
                wrap=tk.WORD,
                font=custom_font,  # apply the font defined above
                height=15  # initial height of 15 text lines
            )
            self.text_area.pack(fill=tk.BOTH, expand=True)

            self.paned_window.add(self.image_frame)
            self.paned_window.add(self.text_area)

            btn_frame = ttk.Frame(self.main_frame)
            btn_frame.grid(row=1, column=0, sticky="ew", pady=10)

            # Row 0 (panes) absorbs extra space; row 1 (buttons) keeps its size.
            self.main_frame.grid_rowconfigure(0, weight=1)
            self.main_frame.grid_rowconfigure(1, weight=0)

            ttk.Button(
                btn_frame,
                text="重新选择",
                command=self.restart_selection
            ).pack(side=tk.LEFT, padx=5)

            ttk.Button(
                btn_frame,
                text="复制文本",
                command=self.copy_result
            ).pack(side=tk.LEFT, padx=5)

            ttk.Button(
                btn_frame,
                text="保存图片",
                command=self.save_image
            ).pack(side=tk.LEFT, padx=5)

            ttk.Button(
                btn_frame,
                text="退出",
                command=self.safe_exit
            ).pack(side=tk.RIGHT, padx=5)

        # Window title.
        self.root.title("文字识别@PDM3")

        self.update_image_display()
        self.text_area.delete(1.0, tk.END)
        self.text_area.insert(tk.END, self.recognized_text.strip())
        self.update_text_area_height()  # fit the text area height to its content

        # Keep the window above all others.
        self.root.attributes('-topmost', True)

    # 更新图片显示
    def update_image_display(self):
        if self.screenshot:
            photo = ImageTk.PhotoImage(self.screenshot)
            self.img_label.config(image=photo)
            self.img_label.image = photo

            # 获取图片的实际大小
            img_width, img_height = self.screenshot.size

            # 获取屏幕高度
            screen_height = self.root.winfo_screenheight()

            # 计算图片框的最大高度
            max_image_height = screen_height // 2

            # 设置 Canvas 的滚动区域
            self.image_canvas.config(scrollregion=(0, 0, img_width, img_height))

            # 调整 image_canvas 的高度
            if img_height > max_image_height:
                self.image_canvas.config(height=max_image_height)
            else:
                self.image_canvas.config(height=img_height)

    # 配置 Canvas 大小
    def on_canvas_configure(self, event):
        # 更新 Canvas 的滚动区域
        self.image_canvas.config(scrollregion=self.image_canvas.bbox("all"))

    # 显示加载中的窗口
    def show_loading(self, message):
        """Open a small dialog with *message* and a running indeterminate
        progress bar, and return the dialog window."""
        dialog = tk.Toplevel()
        dialog.title("请稍候")

        body = ttk.Frame(dialog, padding=20)
        body.pack()

        caption = ttk.Label(body, text=message)
        caption.pack(pady=10)

        spinner = ttk.Progressbar(body, mode='indeterminate')
        spinner.pack(pady=5)
        spinner.start()

        return dialog

    # 处理OCR初始化错误
    def handle_ocr_init_error(self, error_msg):
        """Ask the user whether to retry after an OCR initialization failure.

        Args:
            error_msg: Text of the error, shown in the dialog.

        Retry re-runs ``initialize_ocr_and_process``; Cancel exits the program.
        """
        wants_retry = messagebox.askretrycancel(
            "OCR初始化失败",
            f"{error_msg}\n\n是否重试?",
            icon='error'
        )
        if wants_retry:
            # initialize_ocr_and_process() creates and updates Tk widgets, so
            # it has to run on the Tk event loop rather than on a worker
            # thread; schedule it instead of spawning a thread.
            self.root.after(0, self.initialize_ocr_and_process)
        else:
            self.safe_exit()

    # 重新开始截图选择
    def restart_selection(self):
        if self.root.winfo_exists():
            self.root.withdraw()
        self.screenshot = None
        self.recognized_text = ""
        self.clear_ui()
        self.start_selection()

    # 清理UI界面
    def clear_ui(self):
        if hasattr(self, 'img_label'):
            self.img_label.config(image='')
            self.img_label.image = None
        if hasattr(self, 'text_area'):
            self.text_area.delete(1.0, tk.END)

    # 复制识别结果到剪贴板
    def copy_result(self):
        """Replace the clipboard content with ``self.recognized_text`` and confirm."""
        clipboard_owner = self.root
        clipboard_owner.clipboard_clear()
        clipboard_owner.clipboard_append(self.recognized_text)
        messagebox.showinfo("成功", "已复制到剪贴板")

    # 安全退出程序
    def safe_exit(self):
        if self.root.winfo_exists():
            self.root.destroy()
        sys.exit(0)

    # 显示程序崩溃错误信息
    def show_crash_message(self, message):
        """Show *message* in a fatal-error dialog backed by a temporary hidden root."""
        hidden_root = tk.Tk()
        hidden_root.withdraw()
        messagebox.showerror("致命错误", message)
        hidden_root.destroy()

    # 按下ESC键时退出程序
    def on_escape(self, event):
        self.selection_win.destroy()
        self.safe_exit()

    # 鼠标移动事件处理
    def on_mouse_move(self, event):
        current_x = event.x
        current_y = event.y
        self.update_crosshair(current_x, current_y)

    # 更新十字线位置
    def update_crosshair(self, x, y):
        self.clear_crosshair()
        self.crosshair_ids.append(
            self.canvas.create_line(0, y, self.canvas.winfo_width(), y,
                                   tags="crosshair", fill="yellow", width=2))
        self.crosshair_ids.append(
            self.canvas.create_line(x, 0, x, self.canvas.winfo_height(),
                                    tags="crosshair", fill="yellow", width=2))

    # 清除十字线
    def clear_crosshair(self):
        for crosshair_id in self.crosshair_ids:
            self.canvas.delete(crosshair_id)
        self.crosshair_ids = []

    # 保存图片
    def save_image(self):
        if self.screenshot:
            # 获取用户桌面路径
            desktop_path = Path.home() / 'Desktop'

            # 生成当前日期和时间的字符串
            current_datetime = datetime.now().strftime("%Y%m%d_%H%M%S")
            default_filename = f"screenshot_{current_datetime}.png"

            file_path = filedialog.asksaveasfilename(
                initialdir=desktop_path,  # 设置初始目录为用户桌面
                initialfile=default_filename,  # 设置默认文件名
                defaultextension=".png",
                filetypes=[("PNG files", "*.png"), ("JPEG files", "*.jpg"), ("All files", "*.*")]
            )
            if file_path:
                self.screenshot.save(file_path)
                messagebox.showinfo("保存成功", f"图片已保存到 {file_path}")

    # 更新文本框高度
    def update_text_area_height(self):
        # 计算当前文本行数
        line_count = int(self.text_area.index('end-1c').split('.')[0])
        if line_count > 15:
            self.text_area.config(height=15)  # 如果行数超过15行,固定高度为15行
        else:
            self.text_area.config(height=line_count)  # 否则根据内容调整高度

    # 运行主循环
    def run(self):
        self.root.mainloop()

if __name__ == "__main__":
    # Constructing the app opens the selection overlay right away; run() then
    # enters the Tk event loop.
    app = OCRApp()
    app.run()
相关推荐
c***871938 分钟前
Flask:后端框架使用
后端·python·flask
Q_Q5110082852 小时前
python+django/flask的情绪宣泄系统
spring boot·python·pycharm·django·flask·node.js·php
撸码猿2 小时前
《Python AI入门》第9章 让机器读懂文字——NLP基础与情感分析实战
人工智能·python·自然语言处理
二川bro2 小时前
多模态AI开发:Python实现跨模态学习
人工智能·python·学习
2301_764441332 小时前
Python构建输入法应用
开发语言·python·算法
love530love2 小时前
【笔记】ComfUI RIFEInterpolation 节点缺失问题(cupy CUDA 安装)解决方案
人工智能·windows·笔记·python·插件·comfyui
青瓷程序设计2 小时前
昆虫识别系统【最新版】Python+TensorFlow+Vue3+Django+人工智能+深度学习+卷积神经网络算法
人工智能·python·深度学习
秋邱3 小时前
智启未来:AGI 教育融合 × 跨平台联盟 × 个性化空间,重构教育 AI 新范式开篇:一场 “教育 ×AI” 的范式革命
人工智能·python·重构·推荐算法·agi
爱吃泡芙的小白白3 小时前
vscode、anaconda、git、python配置安装(自用)
ide·git·vscode·python·anaconda·学习记录
谷隐凡二3 小时前
Kubernetes主从架构简单解析:基于Python的模拟实现
python·架构·kubernetes