PaddleOCR 截图自动文字识别

春节假期在家无聊,写了三个小工具:PC截图+编辑、PC录屏(用于会议录屏)、PC截屏文字识别。这三个小工具在工作中经常用到,GitHub上也有很多开源实现,不过或多或少都有些小问题,用起来不太顺手。脚本的编写尽量减少对第三方库的依赖。

已全部完成,这是其中的一个,后续会将三个工具集成到一个工具中。

python 复制代码
import tkinter as tk
from tkinter import ttk, messagebox, font, filedialog
from PIL import Image, ImageTk, ImageGrab
import sys
import tempfile
import threading
from pathlib import Path
import ctypes
import logging.handlers
from datetime import datetime

# Minimize the console window that hosts the script (Windows only).
def minimize_console():
    """Minimize the console window attached to this process, if there is one.

    Does nothing on platforms where ``ctypes.windll`` is unavailable, or when
    the process has no console window, so importing the script never fails
    because of this cosmetic step.
    """
    windll = getattr(ctypes, "windll", None)
    if windll is None:
        # Not running on Windows: there is no Win32 console to minimize.
        return

    console_hwnd = windll.kernel32.GetConsoleWindow()
    if console_hwnd:
        # 6 is the Win32 SW_MINIMIZE show command.
        windll.user32.ShowWindow(console_hwnd, 6)


minimize_console()  # Run once at start-up.

# Locate the folder that contains this script.
def get_script_directory():
    """Return the parent directory of this source file as a ``Path``."""
    script_file = Path(__file__)
    return script_file.parent

# Log file lives next to the script.
log_file_path = get_script_directory() / 'ocr_errors.log'

# A single rotating handler owns the log file (5 MiB per file, 3 backups).
# Passing filename= to basicConfig *and* adding a RotatingFileHandler on the
# same path would write every record twice and keep a second handle open on
# the file. UTF-8 is set explicitly because the log messages are Chinese.
handler = logging.handlers.RotatingFileHandler(
    log_file_path,
    maxBytes=1024 * 1024 * 5,
    backupCount=3,
    encoding='utf-8'
)

# basicConfig attaches the handler to the root logger and applies the format.
logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[handler]
)
logger = logging.getLogger()

# Persist an image to a temporary file on disk.
def save_temp_image(image, suffix='.png'):
    """Save *image* to a new temporary file and return its path.

    Args:
        image: Any object exposing ``save(filename)`` (e.g. a PIL image).
        suffix: File-name suffix for the temporary file; defaults to '.png'.

    Returns:
        ``Path`` of the written file. The caller is responsible for deleting it.

    Raises:
        Whatever ``image.save`` raises; in that case the temporary file is
        removed before the exception propagates, so nothing is leaked.
    """
    # Only reserve a unique name here; the handle is closed before writing so
    # the file is never opened twice at the same time.
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_file:
        temp_path = Path(temp_file.name)

    try:
        image.save(str(temp_path))
    except Exception:
        try:
            temp_path.unlink()
        except OSError:
            pass  # Best-effort cleanup; the original error matters more.
        raise
    return temp_path

class OCRApp:
    """Screenshot OCR tool built on Tkinter.

    Flow: a translucent fullscreen overlay lets the user drag a rectangle, the
    region is grabbed with ``ImageGrab``, recognized with a PaddleOCR model
    that is loaded on a background thread, and the image plus the recognized
    text are shown in the main window.

    All Tk calls are made on the main thread; the loader thread only assigns
    ``ocr_model`` / ``ocr_load_error``, which the main thread polls.
    """

    # Interval (ms) between checks for the background model load.
    MODEL_POLL_INTERVAL_MS = 100

    def __init__(self):
        """Create the hidden root window, start loading the model, open the overlay.

        Any start-up failure is reported through ``show_crash_message`` and the
        process exits with status 1.
        """
        try:
            self.root = tk.Tk()
            self.root.withdraw()

            # Lock the window width; only the height stays resizable.
            self.root.resizable(False, True)

            self.screenshot = None
            self.ocr_model = None  # Assigned by the loader thread when ready.
            self.ocr_load_error = None  # Error text if the last load failed.
            self.recognized_text = ""
            self.main_frame = None
            self.load_win = None
            self.selection_win = None

            # Load the model in the background so the selection overlay can
            # appear immediately after the script starts.
            self._start_model_loader()

            self.start_selection()

        except Exception as e:
            self.show_crash_message(f"程序启动失败: {str(e)}")
            sys.exit(1)

    def _start_model_loader(self):
        """Clear any previous load error and start a daemon loader thread."""
        self.ocr_load_error = None
        threading.Thread(target=self.load_ocr_model, daemon=True).start()

    def load_ocr_model(self):
        """Import PaddleOCR and build the model (runs on the loader thread).

        On success ``ocr_model`` is set. On any failure, including a missing
        ``paddleocr`` package, the error is logged and stored in
        ``ocr_load_error`` so the polling UI can stop waiting and report it.
        """
        try:
            # Imported lazily: it is slow and must not delay the overlay.
            from paddleocr import PaddleOCR
            self.ocr_model = PaddleOCR(use_angle_cls=True, show_log=False, lang='ch')
        except Exception as e:
            logger.error(f"OCR模型加载失败: {str(e)}")
            self.ocr_load_error = str(e)

    def _close_selection_window(self):
        """Destroy the selection overlay if one is currently open."""
        if self.selection_win is not None:
            self.selection_win.destroy()
            self.selection_win = None

    def _close_loading(self):
        """Destroy the loading dialog if one is currently open."""
        if self.load_win is not None:
            self.load_win.destroy()
            self.load_win = None

    def start_selection(self):
        """Open the fullscreen translucent overlay used to drag a region."""
        # A previous overlay may still exist (e.g. after a rejected selection);
        # remove it so overlays never stack on top of each other.
        self._close_selection_window()

        self.selection_win = tk.Toplevel()
        self.selection_win.attributes("-fullscreen", True)
        self.selection_win.attributes("-alpha", 0.3)

        # ESC anywhere on the overlay quits the program.
        self.selection_win.bind("<Escape>", self.on_escape)

        self.canvas = tk.Canvas(
            self.selection_win,
            cursor="cross",
            bg="gray30",
            highlightthickness=0
        )
        self.canvas.pack(fill=tk.BOTH, expand=True)

        self.start_x = self.start_y = 0
        self.rect_id = None
        self.crosshair_ids = []

        self.canvas.bind("<Button-1>", self.on_mouse_down)
        self.canvas.bind("<B1-Motion>", self.on_mouse_drag)
        self.canvas.bind("<ButtonRelease-1>", self.on_mouse_up)
        self.canvas.bind("<Motion>", self.on_mouse_move)

        self.escape_label = tk.Label(
            self.selection_win,
            text="按ESC键退出截图",
            fg="yellow",
            bg="gray20",
            font=("Helvetica", 12, "bold")
        )
        self.escape_label.place(x=10, y=10)

        self.update_crosshair(0, 0)

    def on_mouse_down(self, event):
        """Record the drag origin and clear the crosshair and any old rectangle."""
        self.start_x = event.x
        self.start_y = event.y
        self.clear_crosshair()
        if self.rect_id:
            self.canvas.delete(self.rect_id)
            self.rect_id = None

    def on_mouse_drag(self, event):
        """Create the selection rectangle on first drag, then resize it."""
        current_x = event.x
        current_y = event.y

        if self.rect_id:
            self.canvas.coords(self.rect_id, self.start_x, self.start_y, current_x, current_y)
        else:
            self.rect_id = self.canvas.create_rectangle(
                self.start_x, self.start_y,
                current_x, current_y,
                outline="blue", width=2, fill="gray75", tags="rect"
            )

    def on_mouse_up(self, event):
        """Validate the dragged region, grab it, and hand over to OCR.

        A region smaller than 10 px on either side (or larger than the canvas)
        is rejected with an error dialog and the selection starts over.
        """
        try:
            x1 = min(self.start_x, event.x)
            y1 = min(self.start_y, event.y)
            x2 = max(self.start_x, event.x)
            y2 = max(self.start_y, event.y)

            if (x2 - x1) < 10 or (y2 - y1) < 10:
                raise ValueError("选区过小,请选择更大的区域")
            if (x2 - x1) > self.canvas.winfo_width() or (y2 - y1) > self.canvas.winfo_height():
                raise ValueError("选区过大,请选择更小的区域")

            # NOTE(review): the overlay is still mapped while grabbing. Pillow
            # documents that layered windows are skipped by default on
            # Windows; confirm before relying on this on other platforms.
            self.screenshot = ImageGrab.grab(bbox=(x1, y1, x2, y2))
        except Exception as e:
            logger.error(f"截图错误: {str(e)}")
            messagebox.showerror("截图错误", str(e))
            self.restart_selection()
            return

        self._close_selection_window()
        self.initialize_ocr_and_process()

    def initialize_ocr_and_process(self):
        """Run OCR now if the model is ready, otherwise wait for it.

        While the model is still loading a progress dialog is shown and
        ``check_ocr_model`` is polled on the Tk event loop.
        """
        try:
            if self.ocr_model is None:
                self._close_loading()  # Never stack two loading dialogs.
                self.load_win = self.show_loading("OCR模型正在加载中,请稍后...")
                self.root.after(self.MODEL_POLL_INTERVAL_MS, self.check_ocr_model)
            else:
                self._show_result()

        except Exception as e:
            logger.error(f"OCR初始化失败: {str(e)}")
            self._close_loading()
            self.handle_ocr_init_error(str(e))

    def check_ocr_model(self):
        """Poll the loader thread's outcome; runs on the Tk event loop."""
        if self.ocr_model is not None:
            self._close_loading()
            self._show_result()
        elif self.ocr_load_error is not None:
            # The load failed: stop polling and let the user retry or quit
            # instead of spinning behind the loading dialog forever.
            self._close_loading()
            self.handle_ocr_init_error(self.ocr_load_error)
        else:
            self.root.after(self.MODEL_POLL_INTERVAL_MS, self.check_ocr_model)

    def _show_result(self):
        """Run OCR and reveal the main window only when recognition succeeded."""
        if self.process_ocr():
            self.setup_main_ui()
            self.root.deiconify()

    def process_ocr(self):
        """Recognize the current screenshot and store the text.

        Returns:
            True when ``recognized_text`` was updated. False when recognition
            failed; in that case the error has been logged and shown, and a
            new selection has already been started.
        """
        try:
            temp_image_path = save_temp_image(self.screenshot)
            try:
                result = self.ocr_model.ocr(str(temp_image_path), cls=True)
            finally:
                # Remove the temporary file even when recognition raises.
                temp_image_path.unlink()

            # Merge boxes on the same visual line. The first element may be
            # None/empty when no text was found (presumably depends on the
            # PaddleOCR version), which yields an empty string.
            page = result[0] if result else None
            self.recognized_text = self.merge_lines(page)
            return True
        except Exception as e:
            logger.error(f"OCR处理失败: {str(e)}")
            messagebox.showerror("识别错误", f"OCR处理失败: {str(e)}")
            self.restart_selection()
            return False

    def merge_lines(self, ocr_result, line_threshold=5):
        """Join consecutive OCR boxes that sit on the same text line.

        Args:
            ocr_result: Sequence of ``[box, (text, ...)]`` entries where
                ``box[0]`` and ``box[2]`` are ``(x, y)`` corner points.
                ``None`` or an empty sequence is accepted.
            line_threshold: Maximum difference allowed for both corner y
                values, relative to the first box of the current line, for a
                box to count as part of that line.

        Returns:
            Texts of one line joined by spaces, lines joined by newlines.
            An empty string when there is nothing to merge.
        """
        if not ocr_result:
            return ""

        merged_text = []
        current_line = []
        anchor_top = None
        anchor_bottom = None

        for line in ocr_result:
            top = line[0][0][1]  # y of the first corner point
            bottom = line[0][2][1]  # y of the third corner point
            text = line[1][0]

            starts_new_line = (
                anchor_top is None
                or abs(top - anchor_top) > line_threshold
                or abs(bottom - anchor_bottom) > line_threshold
            )
            if starts_new_line:
                if current_line:
                    merged_text.append(" ".join(current_line))
                current_line = [text]
                anchor_top, anchor_bottom = top, bottom
            else:
                current_line.append(text)

        if current_line:
            merged_text.append(" ".join(current_line))

        return "\n".join(merged_text)

    def setup_main_ui(self):
        """Build the main window on first use, then show the current result."""
        if self.main_frame is None:
            self.main_frame = ttk.Frame(self.root, padding=20)
            self.main_frame.grid(row=0, column=0, sticky="nsew")

            self.root.grid_rowconfigure(0, weight=1)
            self.root.grid_columnconfigure(0, weight=1)

            # A vertical paned window splits the image pane from the text pane.
            self.paned_window = ttk.PanedWindow(self.main_frame, orient=tk.VERTICAL)
            self.paned_window.grid(row=0, column=0, sticky="nsew")

            # Frame holding the image canvas and its scrollbar.
            self.image_frame = ttk.Frame(self.paned_window)
            self.image_frame.pack(fill=tk.BOTH, expand=True)

            # The image is shown inside a canvas so it can be scrolled.
            self.image_canvas = tk.Canvas(self.image_frame, highlightbackground=self.root.cget("bg"), highlightthickness=0)
            self.image_canvas.pack(side=tk.LEFT, fill=tk.BOTH, expand=True)

            self.image_scrollbar = ttk.Scrollbar(self.image_frame, orient=tk.VERTICAL, command=self.image_canvas.yview)
            self.image_scrollbar.pack(side=tk.RIGHT, fill=tk.Y)
            self.image_canvas.config(yscrollcommand=self.image_scrollbar.set)

            self.image_canvas.bind("<Configure>", self.on_canvas_configure)

            self.image_container = ttk.Frame(self.image_canvas)
            self.image_container_id = self.image_canvas.create_window((0, 0), window=self.image_container, anchor="nw")

            self.img_label = ttk.Label(self.image_container)
            self.img_label.pack(fill=tk.BOTH, expand=True)

            custom_font = font.Font(family="Microsoft YaHei", size=9)

            self.text_area = tk.Text(
                self.paned_window,
                wrap=tk.WORD,
                font=custom_font,
                height=15  # Initial height in text lines.
            )
            self.text_area.pack(fill=tk.BOTH, expand=True)

            self.paned_window.add(self.image_frame)
            self.paned_window.add(self.text_area)

            btn_frame = ttk.Frame(self.main_frame)
            btn_frame.grid(row=1, column=0, sticky="ew", pady=10)

            # Extra height goes to the panes; the button row keeps its size.
            self.main_frame.grid_rowconfigure(0, weight=1)
            self.main_frame.grid_rowconfigure(1, weight=0)

            ttk.Button(
                btn_frame,
                text="重新选择",
                command=self.restart_selection
            ).pack(side=tk.LEFT, padx=5)

            ttk.Button(
                btn_frame,
                text="复制文本",
                command=self.copy_result
            ).pack(side=tk.LEFT, padx=5)

            ttk.Button(
                btn_frame,
                text="保存图片",
                command=self.save_image
            ).pack(side=tk.LEFT, padx=5)

            ttk.Button(
                btn_frame,
                text="退出",
                command=self.safe_exit
            ).pack(side=tk.RIGHT, padx=5)

        self.root.title("文字识别@PDM3")

        self.update_image_display()
        self.text_area.delete("1.0", tk.END)
        self.text_area.insert(tk.END, self.recognized_text.strip())
        self.update_text_area_height()

        # Keep the result window above other windows.
        self.root.attributes('-topmost', True)

    def update_image_display(self):
        """Show the current screenshot and size the canvas to fit it.

        The canvas height is capped at half the screen height; taller images
        are reached through the scrollbar.
        """
        if self.screenshot is None:
            return

        photo = ImageTk.PhotoImage(self.screenshot)
        self.img_label.config(image=photo)
        # Keep a reference on the widget so the image is not garbage-collected.
        self.img_label.image = photo

        img_width, img_height = self.screenshot.size
        max_image_height = self.root.winfo_screenheight() // 2

        self.image_canvas.config(scrollregion=(0, 0, img_width, img_height))
        self.image_canvas.config(height=min(img_height, max_image_height))

    def on_canvas_configure(self, event):
        """Refresh the scroll region whenever the image canvas is resized."""
        self.image_canvas.config(scrollregion=self.image_canvas.bbox("all"))

    def show_loading(self, message):
        """Open a small window with *message* and an indeterminate progress bar.

        Returns:
            The created ``Toplevel``; the caller destroys it when done.
        """
        load_win = tk.Toplevel()
        load_win.title("请稍候")

        frame = ttk.Frame(load_win, padding=20)
        frame.pack()

        ttk.Label(frame, text=message).pack(pady=10)
        progress = ttk.Progressbar(frame, mode='indeterminate')
        progress.pack(pady=5)
        progress.start()

        return load_win

    def handle_ocr_init_error(self, error_msg):
        """Ask whether to retry after an OCR initialization failure.

        Retry restarts the background model load when no model is available
        and resumes waiting on the Tk main thread; cancel exits the program.
        """
        choice = messagebox.askretrycancel(
            "OCR初始化失败",
            f"{error_msg}\n\n是否重试?",
            icon='error'
        )
        if choice:
            if self.ocr_model is None:
                self._start_model_loader()
            # Tk widgets must only be touched from the main thread, so the
            # retry is driven from here rather than from a worker thread.
            self.initialize_ocr_and_process()
        else:
            self.safe_exit()

    def restart_selection(self):
        """Hide the main window, discard the current result, select again."""
        if self.root.winfo_exists():
            self.root.withdraw()
        self.screenshot = None
        self.recognized_text = ""
        self.clear_ui()
        self.start_selection()

    def clear_ui(self):
        """Blank the image label and text area if they have been created."""
        if hasattr(self, 'img_label'):
            self.img_label.config(image='')
            self.img_label.image = None
        if hasattr(self, 'text_area'):
            self.text_area.delete("1.0", tk.END)

    def copy_result(self):
        """Copy the recognized text to the clipboard and confirm to the user."""
        self.root.clipboard_clear()
        self.root.clipboard_append(self.recognized_text)
        messagebox.showinfo("成功", "已复制到剪贴板")

    def safe_exit(self):
        """Destroy the root window if it still exists, then exit with status 0."""
        try:
            if self.root.winfo_exists():
                self.root.destroy()
        except tk.TclError:
            # The Tk application is already gone; nothing left to destroy.
            pass
        sys.exit(0)

    def show_crash_message(self, message):
        """Log a fatal error and show it in a dialog.

        If no dialog can be created (Tk raises ``TclError``), the logged
        message is the only report.
        """
        logger.error(message)
        try:
            crash_win = tk.Tk()
            crash_win.withdraw()
            messagebox.showerror("致命错误", message)
            crash_win.destroy()
        except tk.TclError:
            pass

    def on_escape(self, event):
        """ESC on the overlay: close it and quit the program."""
        self._close_selection_window()
        self.safe_exit()

    def on_mouse_move(self, event):
        """Move the crosshair to follow the pointer."""
        self.update_crosshair(event.x, event.y)

    def update_crosshair(self, x, y):
        """Redraw the horizontal and vertical guide lines through (x, y)."""
        self.clear_crosshair()
        self.crosshair_ids.append(
            self.canvas.create_line(0, y, self.canvas.winfo_width(), y,
                                    tags="crosshair", fill="yellow", width=2))
        self.crosshair_ids.append(
            self.canvas.create_line(x, 0, x, self.canvas.winfo_height(),
                                    tags="crosshair", fill="yellow", width=2))

    def clear_crosshair(self):
        """Delete the guide lines currently drawn on the overlay canvas."""
        for crosshair_id in self.crosshair_ids:
            self.canvas.delete(crosshair_id)
        self.crosshair_ids = []

    def save_image(self):
        """Ask for a destination and save the current screenshot there.

        The dialog starts on the user's Desktop with a timestamped PNG name.
        A failed save is logged and reported in an error dialog.
        """
        if self.screenshot is None:
            return

        desktop_path = Path.home() / 'Desktop'

        current_datetime = datetime.now().strftime("%Y%m%d_%H%M%S")
        default_filename = f"screenshot_{current_datetime}.png"

        file_path = filedialog.asksaveasfilename(
            initialdir=desktop_path,
            initialfile=default_filename,
            defaultextension=".png",
            filetypes=[("PNG files", "*.png"), ("JPEG files", "*.jpg"), ("All files", "*.*")]
        )
        if not file_path:
            return  # Dialog was cancelled.

        try:
            self.screenshot.save(file_path)
        except (OSError, ValueError) as e:
            # Without this the Tk callback would swallow the error and the
            # user would get no feedback that nothing was written.
            logger.error(f"保存图片失败: {str(e)}")
            messagebox.showerror("保存失败", f"保存图片失败: {str(e)}")
            return
        messagebox.showinfo("保存成功", f"图片已保存到 {file_path}")

    def update_text_area_height(self):
        """Fit the text area to its content, capped at 15 lines."""
        line_count = int(self.text_area.index('end-1c').split('.')[0])
        self.text_area.config(height=min(line_count, 15))

    def run(self):
        """Enter the Tk main loop."""
        self.root.mainloop()

if __name__ == "__main__":
    # Build the application and hand control to its Tk event loop.
    OCRApp().run()
相关推荐
叫我:松哥3 小时前
python案例:基于python 神经网络cnn和LDA主题分析的旅游景点满意度分析
人工智能·python·神经网络·数据挖掘·数据分析·cnn·课程设计
2202_756749694 小时前
01 基于sklearn的机械学习-机械学习的分类、sklearn的安装、sklearn数据集及数据集的划分、特征工程(特征提取与无量纲化、特征降维)
人工智能·python·机器学习·分类·sklearn
王者鳜錸4 小时前
PYTHON从入门到实践-18Django从零开始构建Web应用
前端·python·sqlite
冗量4 小时前
PPT自动化 python-pptx - 8: 文本(text)
python·自动化·powerpoint
超级晒盐人5 小时前
用落霞归雁的思维框架推导少林寺用什么数据库?
java·python·系统架构·学习方法·教育电商
AI_RSER5 小时前
第一篇:【Python-geemap教程(三)上】3D地形渲染与Landsat NDVI计算
开发语言·python·3d·信息可视化·遥感·gee
WSSWWWSSW6 小时前
Python编程基础与实践:Python循环结构基础
开发语言·python
im_AMBER6 小时前
学习日志25 python
开发语言·python·学习
Blossom.1187 小时前
基于深度学习的医学图像分析:使用DeepLabv3+实现医学图像分割
人工智能·python·深度学习·yolo·目标检测·机器学习·迁移学习
花酒锄作田8 小时前
[python]基于动态实例的命令处理设计
python