拖放pdf转化为txt文件多进程多线程合并分词版

pdf是图片,多张图片的pdf文件,需要提取里面的文字,为了加快速度,在转化为图片文件后,采用多线程对图片进行缩放和旋转,采用多进程paddleOCR进行提取图片中的文字,再利用jieba分词判断前后句是否需要去掉换行符。

python 复制代码
# -*- coding: utf-8 -*-
"""
Created on Sun Aug 25 10:42:39 2024

@author: YBK
"""

import tkinter as tk
import windnd
from tkinter.messagebox import showinfo
import os
from PIL import Image
import jieba
import time
import multiprocessing as mp
from paddleocr import PaddleOCR
import subprocess
from pdf2image import convert_from_path
import queue
import threading

def dec_to_36(num):
    """Convert an integer to its base-36 string (digits 0-9 then A-Z).

    Negative numbers are rendered with a leading "-".
    """
    if num < 0:
        return "-" + dec_to_36(abs(num))
    digits = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    encoded = ""
    while True:
        num, rem = divmod(num, 36)  # quotient and remainder per base-36 digit
        encoded = digits[rem] + encoded
        if num == 0:
            return encoded
        
def nowtime_to_str():
    """Encode the current Unix timestamp in base 36 (~6 chars) to keep file names short."""
    return dec_to_36(int(time.time()))

def pdf2pic(path, pic_path, max_workers=4):
    """
    Convert a PDF into per-page PNG images using multiple worker threads.

    Pages are rendered by pdf2image, then each page is (in a thread)
    downscaled so neither side exceeds 2000px, rotated 90 degrees when it is
    wider than tall (landscape -> portrait), and saved as <index:03d>.png
    under *pic_path*.

    Args:
        path: source PDF file path.
        pic_path: directory where the page images are written (created if missing).
        max_workers: number of worker threads; tune to CPU core count.

    Returns:
        int: number of pages processed successfully.
    """
    print("开始转换PDF为图片...")
    images = convert_from_path(path, thread_count=max_workers)  # multithreaded render
    total_images = len(images)
    print(f"共转换了 {total_images} 张图片")

    # Ensure the output directory exists.
    if not os.path.exists(pic_path):
        os.makedirs(pic_path, exist_ok=True)

    def process_image_worker(image_queue, result_queue, worker_id):
        # Drain (index, PIL.Image) tasks until the queue is empty.
        while True:
            try:
                i, image = image_queue.get(timeout=1)
            except queue.Empty:
                # Queue drained: this worker is done.
                break

            try:
                image_path = os.path.join(pic_path, f"{i:03d}.png")
                img = image

                # Downscale when either side exceeds the limit.
                max_size = 2000
                if img.width > max_size or img.height > max_size:
                    scale = min(max_size / img.width, max_size / img.height)
                    new_size = (int(img.width * scale), int(img.height * scale))
                    img = img.resize(new_size, Image.Resampling.LANCZOS)
                    print(f"Worker-{worker_id}: 缩放图片{i}")

                # Rotate landscape pages to portrait.
                # BUG FIX: the old code saved the resized image, then rotated
                # the *original full-size* image and saved it again, silently
                # undoing the downscale whenever a page needed both steps.
                # Resize and rotation now chain in memory and the image is
                # written to disk exactly once.
                if img.width > img.height:
                    img = img.transpose(Image.ROTATE_90)
                    print(f"Worker-{worker_id}: 旋转图片{i}")

                img.save(image_path, "PNG", optimize=True)
                result_queue.put((i, True))

            except Exception as e:
                print(f"Worker-{worker_id}: 处理图片{i}失败 - {e}")
                result_queue.put((i, False))

            finally:
                image_queue.task_done()

    # Task queue feeds the workers; result queue collects per-page status.
    image_queue = queue.Queue()
    result_queue = queue.Queue()

    for i, image in enumerate(images):
        image_queue.put((i, image))

    workers = []
    for worker_id in range(max_workers):
        worker = threading.Thread(
            target=process_image_worker,
            args=(image_queue, result_queue, worker_id),
            daemon=True,
        )
        worker.start()
        workers.append(worker)

    # Block until every queued page has been processed.
    image_queue.join()

    # Collect per-page success flags.
    results = {}
    while not result_queue.empty():
        i, success = result_queue.get()
        results[i] = success

    success_count = sum(1 for success in results.values() if success)

    print(f"\n处理完成!成功处理 {success_count}/{total_images} 张图片")
    return success_count
def get_file_size(file_path):
    """Return the size of the file at *file_path* in bytes."""
    return os.path.getsize(file_path)

def process_single_image(img_path):
    """OCR one image file; return (img_path, result, error_message_or_None).

    The PaddleOCR engine is constructed inside the worker process because the
    engine object cannot be pickled across process boundaries.
    """
    try:
        engine = PaddleOCR(use_angle_cls=True, lang='ch')
        ocr_result = engine.ocr(img_path, cls=True)
        return img_path, ocr_result, None
    except Exception as exc:
        return img_path, None, str(exc)

def process_images_ordered_imap(image_folder, image_prefix="", num_processes=None):
    """
    OCR every matching image in *image_folder*, preserving page order.

    Uses a process pool with imap so results come back in submission order
    while only a small window of images is in flight at once (memory-friendly).

    Args:
        image_folder: directory containing the page images.
        image_prefix: only file names starting with this prefix are processed.
        num_processes: pool size; defaults to min(cpu_count, 8).

    Returns:
        list of (img_path, ocr_result, error) tuples in page order.
    """
    if num_processes is None:
        num_processes = min(mp.cpu_count(), 8)

    valid_ext = ('.png', '.jpg', '.jpeg', '.bmp', '.tiff')
    image_files = [
        name for name in os.listdir(image_folder)
        if name.startswith(image_prefix) and name.lower().endswith(valid_ext)
    ]
    # Order pages by the digits embedded in each file name (0 if none).
    image_files.sort(key=lambda name: int(''.join(filter(str.isdigit, name)) or 0))
    image_paths = [os.path.join(image_folder, name) for name in image_files]

    print(f"找到 {len(image_paths)} 张图片,按顺序处理...")

    results = []
    with mp.Pool(processes=num_processes) as pool:
        # imap keeps ordering; chunksize=2 amortizes task dispatch overhead.
        ordered = pool.imap(process_single_image, image_paths, chunksize=2)
        for idx, (img_path, ocr_result, error) in enumerate(ordered):
            if error:
                print(f"处理图片 {img_path} 失败: {error}")
                results.append((img_path, None, error))
            else:
                print(f"已完成 {idx+1}/{len(image_paths)}: {os.path.basename(img_path)}")
                results.append((img_path, ocr_result, None))

    return results

def should_merge(line1, line2):
    """Decide with jieba whether two OCR lines belong to one sentence.

    Returns True when *line2* should be appended to *line1*, i.e. the line
    break between them is a layout artifact rather than a sentence break.

    Heuristics, in order:
      * empty input                                  -> never merge
      * line1 ends in sentence-final punctuation     -> never merge
      * line1 ends in connecting punctuation         -> always merge
      * otherwise merge if jieba segments a word that
        straddles the boundary between the two lines.
    """
    if not line1 or not line2:
        return False

    # If line1 ends a sentence, do not merge.
    sentence_endings = {'。', '!', '?', '!', '?', '.', ';', ';'}
    if line1[-1] in sentence_endings:
        return False

    # If line1 ends with connecting punctuation, always merge.
    connecting_punctuations = {',', '、', ',', ';', ';'}
    if line1[-1] in connecting_punctuations:
        return True

    # Examine up to 3 characters on each side of the line break.
    tail = line1[-3:]
    head = line2[:3]
    test_text = tail + head
    # BUG FIX: the boundary inside test_text is len(tail).  The old code used
    # min(3, len(line1)) against a fallback of line1 + line2, which misplaced
    # the boundary whenever only one of the two lines was shorter than 3
    # characters.
    boundary = len(tail)

    # Merge if any segmented word straddles the boundary.
    current_pos = 0
    for word in jieba.cut(test_text):
        start = current_pos
        end = current_pos + len(word)
        if start < boundary < end:
            return True
        current_pos = end

    # Default: keep the lines separate.
    return False

def dragged_files(files):
    """Drag-and-drop handler: OCR a dropped PDF into a UTF-8 txt file.

    Exactly one file must be dropped; anything without a .pdf extension is
    ignored.  Pipeline: render PDF pages to images (pdf2pic) -> OCR them in
    parallel (process_images_ordered_imap) -> keep text with confidence
    > 0.7 -> merge line fragments with jieba (should_merge) -> write the txt
    file and open it in Notepad.

    Args:
        files: list of byte-string paths from windnd (GBK-encoded on
               Chinese Windows — TODO confirm for other locales).
    """
    fileurl = ''
    if len(files) > 1:
        showinfo("提示","请拖放一个文件!")
    else:
        # windnd delivers paths as bytes in the ANSI code page (GBK here).
        fileurl = files[0].decode('gbk')
    if fileurl != '' and os.path.splitext(fileurl)[1] == '.pdf':
        pdfpath = fileurl
        # Unique working name: base name + base-36 timestamp suffix.
        # BUG FIX: use splitext instead of str.replace('.pdf', ''), which
        # would also strip a ".pdf" occurring in the middle of the name.
        filename0 = os.path.splitext(os.path.basename(fileurl))[0] + nowtime_to_str()
        pic_path = f'e:\\临时文件夹\\{filename0}\\'
        if not os.path.exists(pic_path):
            os.makedirs(pic_path, exist_ok=True)
        pdf2pic(pdfpath, pic_path, max_workers=4)
        outtxtpath = 'e:\\临时文件夹\\'+filename0+'.txt'

        results = process_images_ordered_imap(
            image_folder=pic_path,
            image_prefix="",
            num_processes=4
        )

        # Collect recognized text lines in page order.
        lines = []
        for img_path, result, error in results:
            if error:
                print(f"\n❌ 识别失败: {os.path.basename(img_path)} - {error}")
            elif result:
                print(f"\n✅ 识别成功: {os.path.basename(img_path)}")
                for idx, line in enumerate(result):
                    if line:
                        for word_info in line:
                            text = word_info[1][0]
                            confidence = word_info[1][1]
                            if confidence > 0.7:
                                lines.append(text)  # newline decided later

        if lines:
            # Greedily merge consecutive fragments that jieba judges to be
            # parts of the same sentence.
            merged_lines = []
            i = 0
            while i < len(lines):
                current_line = lines[i]
                j = i + 1
                while j < len(lines):
                    next_line = lines[j]
                    if should_merge(current_line, next_line):
                        current_line += next_line
                        j += 1
                    else:
                        break
                merged_lines.append(current_line)
                i = j

            # Write one merged line per output line.
            with open(outtxtpath, 'w', encoding='utf-8') as f:
                for line in merged_lines:
                    f.write(line + '\n')

            print(f"\n✅ 文本处理完成: 合并了 {len(lines)} 行到 {len(merged_lines)} 行")

            # Show the result in Notepad (Windows only).
            subprocess.run(['notepad.exe', outtxtpath], check=True)
        else:
            print("\n❌ 没有识别到文本")

 
if __name__ == '__main__':
    # Minimal Tk window acting as a drop target for PDF files.
    root = tk.Tk()
    root.title("拖放PDF文件识别文字")
    root.geometry("300x120")
    windnd.hook_dropfiles(root, func=dragged_files)
    root.mainloop()
相关推荐
humors2214 小时前
pdf工具分享
pdf·工具·程序·网站·转换·处理
冷雨夜中漫步5 小时前
Python快速入门(6)——for/if/while语句
开发语言·经验分享·笔记·python
郝学胜-神的一滴5 小时前
深入解析Python字典的继承关系:从abc模块看设计之美
网络·数据结构·python·程序人生
百锦再5 小时前
Reactive编程入门:Project Reactor 深度指南
前端·javascript·python·react.js·django·前端框架·reactjs
JH30736 小时前
SpringBoot 优雅处理金额格式化:拦截器+自定义注解方案
java·spring boot·spring
喵手7 小时前
Python爬虫实战:旅游数据采集实战 - 携程&去哪儿酒店机票价格监控完整方案(附CSV导出 + SQLite持久化存储)!
爬虫·python·爬虫实战·零基础python爬虫教学·采集结果csv导出·旅游数据采集·携程/去哪儿酒店机票价格监控
Coder_Boy_7 小时前
技术让开发更轻松的底层矛盾
java·大数据·数据库·人工智能·深度学习
2501_944934737 小时前
高职大数据技术专业,CDA和Python认证优先考哪个?
大数据·开发语言·python
helloworldandy7 小时前
使用Pandas进行数据分析:从数据清洗到可视化
jvm·数据库·python
invicinble8 小时前
对tomcat的提供的功能与底层拓扑结构与实现机制的理解
java·tomcat