拖放pdf转化为txt文件多进程多线程合并分词版

pdf是图片,多张图片的pdf文件,需要提取里面的文字,为了加快速度,在转化为图片文件后,采用多线程对图片进行缩放和旋转,采用多进程paddleOCR进行提取图片中的文字,再利用jieba分词判断前后句是否需要去掉换行符。

python 复制代码
# -*- coding: utf-8 -*-
"""
Created on Sun Aug 25 10:42:39 2024

@author: YBK
"""

import tkinter as tk
import windnd
from tkinter.messagebox import showinfo
import os
from PIL import Image
import jieba
import time
import multiprocessing as mp
from paddleocr import PaddleOCR
import subprocess
from pdf2image import convert_from_path
import queue
import threading

def dec_to_36(num):
    """Convert an integer to its base-36 string (digits 0-9 then A-Z).

    Negative numbers are rendered with a leading "-".
    """
    if num < 0:
        return "-" + dec_to_36(abs(num))
    digits = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    encoded = ""
    while True:
        num, rem = divmod(num, 36)  # quotient and remainder per base-36 digit
        encoded = digits[rem] + encoded
        if num == 0:
            return encoded
        
def nowtime_to_str():
    """Encode the current Unix timestamp in base 36 (~6 chars) to keep file names short."""
    return dec_to_36(int(time.time()))

def pdf2pic(path, pic_path, max_workers=4):
    """
    Convert a PDF into per-page PNG images using multiple worker threads.

    Pages are rendered by pdf2image, then each page is (in a thread)
    downscaled so neither side exceeds 2000px, rotated 90 degrees when it is
    wider than tall (landscape -> portrait), and saved as <index:03d>.png
    under *pic_path*.

    Args:
        path: source PDF file path.
        pic_path: directory where the page images are written (created if missing).
        max_workers: number of worker threads; tune to CPU core count.

    Returns:
        int: number of pages processed successfully.
    """
    print("开始转换PDF为图片...")
    images = convert_from_path(path, thread_count=max_workers)  # multithreaded render
    total_images = len(images)
    print(f"共转换了 {total_images} 张图片")

    # Ensure the output directory exists.
    if not os.path.exists(pic_path):
        os.makedirs(pic_path, exist_ok=True)

    def process_image_worker(image_queue, result_queue, worker_id):
        # Drain (index, PIL.Image) tasks until the queue is empty.
        while True:
            try:
                i, image = image_queue.get(timeout=1)
            except queue.Empty:
                # Queue drained: this worker is done.
                break

            try:
                image_path = os.path.join(pic_path, f"{i:03d}.png")
                img = image

                # Downscale when either side exceeds the limit.
                max_size = 2000
                if img.width > max_size or img.height > max_size:
                    scale = min(max_size / img.width, max_size / img.height)
                    new_size = (int(img.width * scale), int(img.height * scale))
                    img = img.resize(new_size, Image.Resampling.LANCZOS)
                    print(f"Worker-{worker_id}: 缩放图片{i}")

                # Rotate landscape pages to portrait.
                # BUG FIX: the old code saved the resized image, then rotated
                # the *original full-size* image and saved it again, silently
                # undoing the downscale whenever a page needed both steps.
                # Resize and rotation now chain in memory and the image is
                # written to disk exactly once.
                if img.width > img.height:
                    img = img.transpose(Image.ROTATE_90)
                    print(f"Worker-{worker_id}: 旋转图片{i}")

                img.save(image_path, "PNG", optimize=True)
                result_queue.put((i, True))

            except Exception as e:
                print(f"Worker-{worker_id}: 处理图片{i}失败 - {e}")
                result_queue.put((i, False))

            finally:
                image_queue.task_done()

    # Task queue feeds the workers; result queue collects per-page status.
    image_queue = queue.Queue()
    result_queue = queue.Queue()

    for i, image in enumerate(images):
        image_queue.put((i, image))

    workers = []
    for worker_id in range(max_workers):
        worker = threading.Thread(
            target=process_image_worker,
            args=(image_queue, result_queue, worker_id),
            daemon=True,
        )
        worker.start()
        workers.append(worker)

    # Block until every queued page has been processed.
    image_queue.join()

    # Collect per-page success flags.
    results = {}
    while not result_queue.empty():
        i, success = result_queue.get()
        results[i] = success

    success_count = sum(1 for success in results.values() if success)

    print(f"\n处理完成!成功处理 {success_count}/{total_images} 张图片")
    return success_count
def get_file_size(file_path):
    """Return the size of the file at *file_path* in bytes."""
    return os.path.getsize(file_path)

def process_single_image(img_path):
    """OCR one image file; return (img_path, result, error_message_or_None).

    The PaddleOCR engine is constructed inside the worker process because the
    engine object cannot be pickled across process boundaries.
    """
    try:
        engine = PaddleOCR(use_angle_cls=True, lang='ch')
        ocr_result = engine.ocr(img_path, cls=True)
        return img_path, ocr_result, None
    except Exception as exc:
        return img_path, None, str(exc)

def process_images_ordered_imap(image_folder, image_prefix="", num_processes=None):
    """
    OCR every matching image in *image_folder*, preserving page order.

    Uses a process pool with imap so results come back in submission order
    while only a small window of images is in flight at once (memory-friendly).

    Args:
        image_folder: directory containing the page images.
        image_prefix: only file names starting with this prefix are processed.
        num_processes: pool size; defaults to min(cpu_count, 8).

    Returns:
        list of (img_path, ocr_result, error) tuples in page order.
    """
    if num_processes is None:
        num_processes = min(mp.cpu_count(), 8)

    valid_ext = ('.png', '.jpg', '.jpeg', '.bmp', '.tiff')
    image_files = [
        name for name in os.listdir(image_folder)
        if name.startswith(image_prefix) and name.lower().endswith(valid_ext)
    ]
    # Order pages by the digits embedded in each file name (0 if none).
    image_files.sort(key=lambda name: int(''.join(filter(str.isdigit, name)) or 0))
    image_paths = [os.path.join(image_folder, name) for name in image_files]

    print(f"找到 {len(image_paths)} 张图片,按顺序处理...")

    results = []
    with mp.Pool(processes=num_processes) as pool:
        # imap keeps ordering; chunksize=2 amortizes task dispatch overhead.
        ordered = pool.imap(process_single_image, image_paths, chunksize=2)
        for idx, (img_path, ocr_result, error) in enumerate(ordered):
            if error:
                print(f"处理图片 {img_path} 失败: {error}")
                results.append((img_path, None, error))
            else:
                print(f"已完成 {idx+1}/{len(image_paths)}: {os.path.basename(img_path)}")
                results.append((img_path, ocr_result, None))

    return results

def should_merge(line1, line2):
    """Decide with jieba whether two OCR lines belong to one sentence.

    Returns True when *line2* should be appended to *line1*, i.e. the line
    break between them is a layout artifact rather than a sentence break.

    Heuristics, in order:
      * empty input                                  -> never merge
      * line1 ends in sentence-final punctuation     -> never merge
      * line1 ends in connecting punctuation         -> always merge
      * otherwise merge if jieba segments a word that
        straddles the boundary between the two lines.
    """
    if not line1 or not line2:
        return False

    # If line1 ends a sentence, do not merge.
    sentence_endings = {'。', '!', '?', '!', '?', '.', ';', ';'}
    if line1[-1] in sentence_endings:
        return False

    # If line1 ends with connecting punctuation, always merge.
    connecting_punctuations = {',', '、', ',', ';', ';'}
    if line1[-1] in connecting_punctuations:
        return True

    # Examine up to 3 characters on each side of the line break.
    tail = line1[-3:]
    head = line2[:3]
    test_text = tail + head
    # BUG FIX: the boundary inside test_text is len(tail).  The old code used
    # min(3, len(line1)) against a fallback of line1 + line2, which misplaced
    # the boundary whenever only one of the two lines was shorter than 3
    # characters.
    boundary = len(tail)

    # Merge if any segmented word straddles the boundary.
    current_pos = 0
    for word in jieba.cut(test_text):
        start = current_pos
        end = current_pos + len(word)
        if start < boundary < end:
            return True
        current_pos = end

    # Default: keep the lines separate.
    return False

def dragged_files(files):
    """Drag-and-drop handler: OCR a dropped PDF into a UTF-8 txt file.

    Exactly one file must be dropped; anything without a .pdf extension is
    ignored.  Pipeline: render PDF pages to images (pdf2pic) -> OCR them in
    parallel (process_images_ordered_imap) -> keep text with confidence
    > 0.7 -> merge line fragments with jieba (should_merge) -> write the txt
    file and open it in Notepad.

    Args:
        files: list of byte-string paths from windnd (GBK-encoded on
               Chinese Windows — TODO confirm for other locales).
    """
    fileurl = ''
    if len(files) > 1:
        showinfo("提示","请拖放一个文件!")
    else:
        # windnd delivers paths as bytes in the ANSI code page (GBK here).
        fileurl = files[0].decode('gbk')
    if fileurl != '' and os.path.splitext(fileurl)[1] == '.pdf':
        pdfpath = fileurl
        # Unique working name: base name + base-36 timestamp suffix.
        # BUG FIX: use splitext instead of str.replace('.pdf', ''), which
        # would also strip a ".pdf" occurring in the middle of the name.
        filename0 = os.path.splitext(os.path.basename(fileurl))[0] + nowtime_to_str()
        pic_path = f'e:\\临时文件夹\\{filename0}\\'
        if not os.path.exists(pic_path):
            os.makedirs(pic_path, exist_ok=True)
        pdf2pic(pdfpath, pic_path, max_workers=4)
        outtxtpath = 'e:\\临时文件夹\\'+filename0+'.txt'

        results = process_images_ordered_imap(
            image_folder=pic_path,
            image_prefix="",
            num_processes=4
        )

        # Collect recognized text lines in page order.
        lines = []
        for img_path, result, error in results:
            if error:
                print(f"\n❌ 识别失败: {os.path.basename(img_path)} - {error}")
            elif result:
                print(f"\n✅ 识别成功: {os.path.basename(img_path)}")
                for idx, line in enumerate(result):
                    if line:
                        for word_info in line:
                            text = word_info[1][0]
                            confidence = word_info[1][1]
                            if confidence > 0.7:
                                lines.append(text)  # newline decided later

        if lines:
            # Greedily merge consecutive fragments that jieba judges to be
            # parts of the same sentence.
            merged_lines = []
            i = 0
            while i < len(lines):
                current_line = lines[i]
                j = i + 1
                while j < len(lines):
                    next_line = lines[j]
                    if should_merge(current_line, next_line):
                        current_line += next_line
                        j += 1
                    else:
                        break
                merged_lines.append(current_line)
                i = j

            # Write one merged line per output line.
            with open(outtxtpath, 'w', encoding='utf-8') as f:
                for line in merged_lines:
                    f.write(line + '\n')

            print(f"\n✅ 文本处理完成: 合并了 {len(lines)} 行到 {len(merged_lines)} 行")

            # Show the result in Notepad (Windows only).
            subprocess.run(['notepad.exe', outtxtpath], check=True)
        else:
            print("\n❌ 没有识别到文本")

 
if __name__ == '__main__':
    # Minimal Tk window acting as a drop target for PDF files.
    root = tk.Tk()
    root.title("拖放PDF文件识别文字")
    root.geometry("300x120")
    windnd.hook_dropfiles(root, func=dragged_files)
    root.mainloop()
相关推荐
humors2214 小时前
pdf工具分享
pdf·工具·程序·网站·转换·处理
冷雨夜中漫步5 小时前
Python快速入门(6)——for/if/while语句
开发语言·经验分享·笔记·python
郝学胜-神的一滴5 小时前
深入解析Python字典的继承关系:从abc模块看设计之美
网络·数据结构·python·程序人生
百锦再5 小时前
Reactive编程入门:Project Reactor 深度指南
前端·javascript·python·react.js·django·前端框架·reactjs
JH30736 小时前
SpringBoot 优雅处理金额格式化:拦截器+自定义注解方案
java·spring boot·spring
喵手7 小时前
Python爬虫实战:旅游数据采集实战 - 携程&去哪儿酒店机票价格监控完整方案(附CSV导出 + SQLite持久化存储)!
爬虫·python·爬虫实战·零基础python爬虫教学·采集结果csv导出·旅游数据采集·携程/去哪儿酒店机票价格监控
Coder_Boy_7 小时前
技术让开发更轻松的底层矛盾
java·大数据·数据库·人工智能·深度学习
2501_944934737 小时前
高职大数据技术专业,CDA和Python认证优先考哪个?
大数据·开发语言·python
helloworldandy7 小时前
使用Pandas进行数据分析:从数据清洗到可视化
jvm·数据库·python
invicinble8 小时前
对tomcat的提供的功能与底层拓扑结构与实现机制的理解
java·tomcat