PDF转Word、图片文字转Word(使用OCR工具)

一、开发环境

PyCharm 2025,Python 解释器 3.11

二、windows本地安装依赖包

(1)Poppler下载地址

https://github.com/oschwartz10612/poppler-windows/releases (在该发布页下载 Release-25.12.0-0.zip。原文给出的 release-assets.githubusercontent.com 直链带有时效签名,已于 2026-01-19 过期,无法继续使用。另外,下文的 ocrUtil.py 使用 PyMuPDF 渲染 PDF 页面,只有在改用 pdf2image 时才需要 Poppler。)

(2)tesseract-ocr windows版下载地址

https://digi.bib.uni-mannheim.de/tesseract/tesseract-ocr-w64-setup-v5.3.0.20221214.exe

(3)语言包下载地址(下载后放在(2)安装的路径tesseract-ocr/tessdata)

https://github.com/tesseract-ocr/tessdata

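三、编写ocrUtil.py(需放在项目的 utils 目录下,即 utils/ocrUtil.py,因为后面的脚本通过 from utils.ocrUtil import OCRConfig, OCRProcessor 导入)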

复制代码
"""
OCR文字识别工具类 - 使用PyMuPDF替代pdf2image
支持图片文字识别和PDF文字提取
提供多引擎支持:Tesseract OCR + 多种PDF处理方案
"""

import pytesseract
import fitz
import os
import io
import re
import logging
import subprocess
import sys
from pathlib import Path
from typing import List, Dict, Optional, Union, Tuple
from dataclasses import dataclass
import json
from enum import Enum

# Configure logging at import time: INFO level with a timestamped format.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
# Module-level logger used by the classes and functions below.
logger = logging.getLogger(__name__)


class ExtractionMethod(Enum):
    """Identifiers for the strategy that produced an extraction result."""

    # Direct text extraction.
    DIRECT = "direct"
    # Page rendered to an image with PyMuPDF, then OCR'd.
    PYMUPDF_OCR = "pymupdf_ocr"
    # Extraction through PyPDF2.
    PYPDF2 = "pypdf2"
    # Extraction through pdfplumber.
    PDFPLUMBER = "pdfplumber"
    # Fallback marker, used for empty/failed results.
    FALLBACK = "fallback"


@dataclass
class ExtractionResult:
    """Outcome of extracting text from one image or one PDF page."""

    # Path of the source file, stored as a string.
    file_path: str
    # File suffix of the source, e.g. '.pdf'.
    file_type: str
    # Extracted text (callers in this module store it stripped).
    text_content: str
    # Confidence score; this module produces values such as 0.0, 0.9, 1.0
    # or an averaged OCR confidence divided by 100.
    confidence: float
    # Page count of the source document (1 for images).
    page_count: int
    # Language string passed to OCR, or a label naming the extraction path.
    language: str
    # One of the ExtractionMethod values, stored as its string value.
    extraction_method: str
    # Free-form extra details (page number, image size, error text, ...).
    metadata: Dict


@dataclass
class OCRConfig:
    """Settings consumed by OCRProcessor.

    The ``None`` defaults are intentional: they avoid a shared mutable list
    default, and OCRProcessor fills in the language list itself.
    """

    # Tesseract language codes; OCRProcessor substitutes ['chi_sim', 'eng'] when None.
    languages: Optional[List[str]] = None
    # Optional path to the tesseract executable, consulted by the availability check.
    tesseract_path: Optional[str] = None
    # Resolution used when rendering PDF pages to images for OCR.
    dpi: int = 200
    # Whether images are grayscaled and contrast/sharpness-enhanced before OCR.
    enable_preprocessing: bool = True
    # NOTE(review): not referenced anywhere else in this module.
    fallback_enabled: bool = True


class OCRProcessor:
    """OCR文字处理器 - 使用PyMuPDF版本"""

    def __init__(self, config: OCRConfig = None):
        """
        初始化OCR处理器

        Args:
            config: OCR配置对象
        """
        self.config = config or OCRConfig()
        if self.config.languages is None:
            self.config.languages = ['chi_sim', 'eng']

        self._dependencies_checked = False
        self._dependencies_available = False
        self._tesseract_available = False

    def check_dependencies(self) -> Tuple[bool, Dict]:
        """
        检查OCR依赖

        Returns:
            Tuple[bool, Dict]: (是否可用, 依赖状态详情)
        """
        if self._dependencies_checked:
            return self._dependencies_available, self._get_dependency_status()

        dependency_status = {
            'pytesseract': False,
            'PIL': False,
            'PyMuPDF': False,
            'PyPDF2': False,
            'pdfplumber': False,
            'tesseract': False
        }

        # 检查Python包依赖
        packages_to_check = [
            ('pytesseract', 'pytesseract'),
            ('PIL', 'PIL'),
            ('PyMuPDF', 'fitz'),  # PyMuPDF的导入名是fitz
            ('PyPDF2', 'PyPDF2'),
            ('pdfplumber', 'pdfplumber')
        ]

        for package_name, import_name in packages_to_check:
            try:
                __import__(import_name)
                dependency_status[package_name] = True
            except ImportError:
                logger.warning(f"缺少依赖包: {package_name}")

        # 检查Tesseract OCR是否可用
        dependency_status['tesseract'] = self._check_tesseract_availability()

        # 判断总体可用性(至少需要基本OCR功能)
        self._dependencies_available = (
                dependency_status['pytesseract'] and
                dependency_status['PIL'] and
                dependency_status['tesseract']
        )

        self._dependencies_checked = True
        return self._dependencies_available, dependency_status

    def _check_tesseract_availability(self) -> bool:
        """
        Determine whether a working Tesseract executable can be reached.

        The executable configured in ``self.config.tesseract_path`` is tried
        first so that an explicit setting is honoured even when another
        Tesseract is on PATH. If it is unset, missing or not working, the
        previous pytesseract command is restored and tried instead.

        Returns:
            bool: True when a Tesseract version could be queried. The result
            is also stored in ``self._tesseract_available``.
        """
        self._tesseract_available = False

        try:
            import pytesseract
        except Exception as e:
            logger.warning(f"Tesseract OCR检查失败: {e}")
            logger.error("Tesseract OCR不可用,OCR功能将无法使用")
            return False

        custom_path = self.config.tesseract_path
        if custom_path and os.path.exists(custom_path):
            previous_cmd = pytesseract.pytesseract.tesseract_cmd
            pytesseract.pytesseract.tesseract_cmd = custom_path
            try:
                if pytesseract.get_tesseract_version():
                    self._tesseract_available = True
                    logger.info(f"自定义Tesseract路径可用: {custom_path}")
                    return True
            except Exception as e:
                logger.warning(f"自定义Tesseract路径检查失败: {e}")
            # The configured executable is unusable: undo the override so the
            # default lookup below is not pointed at a broken binary.
            pytesseract.pytesseract.tesseract_cmd = previous_cmd

        try:
            version = pytesseract.get_tesseract_version()
            if version:
                self._tesseract_available = True
                logger.info(f"Tesseract OCR可用,版本: {version}")
                return True
        except Exception as e:
            logger.warning(f"Tesseract OCR检查失败: {e}")

        logger.error("Tesseract OCR不可用,OCR功能将无法使用")
        return False

    def _get_dependency_status(self) -> Dict:
        """获取依赖状态详情"""
        return {
            'pytesseract': self._check_package_available('pytesseract'),
            'PIL': self._check_package_available('PIL'),
            'PyMuPDF': self._check_package_available('fitz'),
            'PyPDF2': self._check_package_available('PyPDF2'),
            'pdfplumber': self._check_package_available('pdfplumber'),
            'tesseract': self._tesseract_available
        }

    def _check_package_available(self, package_name: str) -> bool:
        """检查Python包是否可用"""
        try:
            __import__(package_name)
            return True
        except ImportError:
            return False

    def extract_from_image(self, image_path: Union[str, Path],
                           language: str = None) -> ExtractionResult:
        """
        Run Tesseract OCR on a single image file.

        Args:
            image_path: path of the image to read
            language: Tesseract language string (e.g. 'chi_sim+eng'); when
                omitted, the configured languages are joined with '+'

        Returns:
            ExtractionResult: recognised text, averaged confidence and image
            metadata (size, mode, engine, whether preprocessing was applied)

        Raises:
            ImportError: the OCR dependencies are not available
            FileNotFoundError: *image_path* does not exist
            Exception: any OCR failure is logged and re-raised unchanged
        """
        deps_ok, _status = self.check_dependencies()
        if not deps_ok:
            raise ImportError("缺少必要的OCR依赖包,请先安装依赖")

        from PIL import Image
        import pytesseract

        image_path = Path(image_path)
        if not image_path.exists():
            raise FileNotFoundError(f"图片文件不存在: {image_path}")

        lang = language or '+'.join(self.config.languages)

        try:
            # The context manager releases the underlying file handle, which
            # Image.open otherwise keeps open.
            with Image.open(image_path) as image:
                # Capture metadata while the image is still open.
                image_size = image.size
                image_mode = image.mode

                if self.config.enable_preprocessing:
                    processed_image = self._preprocess_image(image)
                else:
                    processed_image = image

                text = pytesseract.image_to_string(processed_image, lang=lang)

                # Second pass yields per-word confidences.
                data = pytesseract.image_to_data(processed_image, lang=lang,
                                                 output_type=pytesseract.Output.DICT)

            confidence = self._calculate_confidence(data)

            return ExtractionResult(
                file_path=str(image_path),
                file_type=image_path.suffix.lower(),
                text_content=text.strip(),
                confidence=confidence,
                page_count=1,
                language=lang,
                extraction_method=ExtractionMethod.DIRECT.value,
                metadata={
                    'image_size': image_size,
                    'image_mode': image_mode,
                    'ocr_engine': 'tesseract',
                    'preprocessing': self.config.enable_preprocessing
                }
            )

        except Exception as e:
            logger.error(f"图片OCR提取失败: {e}")
            raise

    def extract_from_pdf(self, pdf_path: Union[str, Path],
                         language: str = None,
                         use_ocr: bool = True) -> List[ExtractionResult]:
        """
        从PDF中提取文字

        Args:
            pdf_path: PDF文件路径
            language: OCR语言
            use_ocr: 是否使用OCR(针对扫描版PDF)

        Returns:
            List[ExtractionResult]: 每页的提取结果列表
        """
        deps_ok, status = self.check_dependencies()
        if not deps_ok:
            raise ImportError("缺少必要的OCR依赖包,请先安装依赖")

        pdf_path = Path(pdf_path)
        if not pdf_path.exists():
            raise FileNotFoundError(f"PDF文件不存在: {pdf_path}")

        lang = language or '+'.join(self.config.languages)

        # 尝试多种提取方案(优先使用PyMuPDF)
        extraction_methods = [
            (self._extract_with_pymupdf_ocr, "PyMuPDF OCR提取"),
            (self._extract_pdf_directly, "直接文本提取"),
            (self._extract_with_pypdf2, "PyPDF2提取"),
            (self._extract_with_pdfplumber, "pdfplumber提取")
        ]

        for method, description in extraction_methods:
            try:
                if method == self._extract_with_pymupdf_ocr and not use_ocr:
                    continue  # 跳过OCR方案

                results = method(pdf_path, lang)
                if results and any(r.text_content.strip() for r in results):
                    logger.info(f"使用{description}方案成功")
                    # print("q",results)
                    return results

            except Exception as e:
                logger.warning(f"{description}方案失败: {e}")

        # 所有方案都失败
        logger.error("所有PDF提取方案都失败")
        return [ExtractionResult(
            file_path=str(pdf_path),
            file_type='.pdf',
            text_content='',
            confidence=0.0,
            page_count=0,
            language=lang,
            extraction_method=ExtractionMethod.FALLBACK.value,
            metadata={'error': '所有提取方案都失败'}
        )]

    def _extract_with_pymupdf_ocr(self, pdf_path: Path, lang: str) -> List[ExtractionResult]:
        """
        Render each PDF page to an image with PyMuPDF and OCR it with Tesseract.

        A page that fails is logged and represented by an empty result so the
        returned list still has one entry per page.

        Args:
            pdf_path: path of the PDF file
            lang: Tesseract language string

        Returns:
            List[ExtractionResult]: one result per page; an empty list when a
            required package cannot be imported or the document cannot be processed
        """
        try:
            import fitz  # PyMuPDF
            from PIL import Image
            import pytesseract

            doc = fitz.open(pdf_path)
            try:
                page_count = len(doc)
                # PDF user space is 72 units per inch; scale to the configured DPI.
                zoom = self.config.dpi / 72
                render_matrix = fitz.Matrix(zoom, zoom)

                results = []
                for page_num in range(page_count):
                    try:
                        page = doc[page_num]

                        # Render the page and hand the PPM bytes to PIL.
                        pix = page.get_pixmap(matrix=render_matrix)
                        image = Image.open(io.BytesIO(pix.tobytes("ppm")))

                        if self.config.enable_preprocessing:
                            processed_image = self._preprocess_image(image)
                        else:
                            processed_image = image

                        text = pytesseract.image_to_string(processed_image, lang=lang)

                        # Second pass yields per-word confidences.
                        data = pytesseract.image_to_data(processed_image, lang=lang,
                                                         output_type=pytesseract.Output.DICT)
                        confidence = self._calculate_confidence(data)

                        results.append(ExtractionResult(
                            file_path=str(pdf_path),
                            file_type='.pdf',
                            text_content=text.strip(),
                            confidence=confidence,
                            page_count=page_count,
                            language=lang,
                            extraction_method=ExtractionMethod.PYMUPDF_OCR.value,
                            metadata={
                                'page_number': page_num + 1,
                                'image_size': image.size,
                                'dpi': self.config.dpi
                            }
                        ))

                    except Exception as e:
                        logger.error(f"PDF第 {page_num + 1} 页OCR失败: {e}")
                        results.append(self._create_empty_result(pdf_path, lang, page_num + 1, page_count, str(e)))

                return results
            finally:
                # Close the document even when an error escapes the page loop.
                doc.close()

        except ImportError:
            logger.warning("PyMuPDF未安装,跳过此方案")
            return []
        except Exception as e:
            logger.error(f"PyMuPDF方案失败: {e}")
            return []

    def _extract_pdf_directly(self, pdf_path: Path, lang: str) -> List[ExtractionResult]:
        """
        Read the embedded text of every page with PyPDF2 (no OCR).

        Args:
            pdf_path: path of the PDF file
            lang: unused here; accepted so all strategies share one signature

        Returns:
            List[ExtractionResult]: one result per page with confidence 1.0,
            or an empty list if anything fails (the failure is logged)
        """
        try:
            import PyPDF2

            collected = []
            with open(pdf_path, 'rb') as stream:
                reader = PyPDF2.PdfReader(stream)
                total_pages = len(reader.pages)

                for index in range(total_pages):
                    # extract_text() may return None; normalise to "".
                    raw_text = reader.pages[index].extract_text() or ""
                    details = {
                        'page_number': index + 1,
                        'text_length': len(raw_text)
                    }
                    collected.append(ExtractionResult(
                        file_path=str(pdf_path),
                        file_type='.pdf',
                        text_content=raw_text.strip(),
                        confidence=1.0,
                        page_count=total_pages,
                        language='direct_extraction',
                        extraction_method=ExtractionMethod.DIRECT.value,
                        metadata=details
                    ))

            return collected

        except Exception as exc:
            logger.warning(f"直接提取PDF文本失败: {exc}")
            return []

    def _extract_with_pypdf2(self, pdf_path: Path, lang: str) -> List[ExtractionResult]:
        """Extract PDF text with PyPDF2 by delegating to ``_extract_pdf_directly``."""
        delegate = self._extract_pdf_directly
        return delegate(pdf_path, lang)

    def _extract_with_pdfplumber(self, pdf_path: Path, lang: str) -> List[ExtractionResult]:
        """
        Read the embedded text of every page with pdfplumber.

        A page that fails is logged and replaced by an empty result.

        Args:
            pdf_path: path of the PDF file
            lang: only used to label the empty result of a failed page

        Returns:
            List[ExtractionResult]: one result per page with confidence 0.9,
            or an empty list when pdfplumber is missing or the file fails to open
        """
        try:
            import pdfplumber

            collected = []
            with pdfplumber.open(pdf_path) as pdf:
                pages = pdf.pages
                total_pages = len(pages)
                for number, page in enumerate(pages, 1):
                    try:
                        # extract_text() may return None; normalise to "".
                        raw_text = page.extract_text() or ""
                        entry = ExtractionResult(
                            file_path=str(pdf_path),
                            file_type='.pdf',
                            text_content=raw_text.strip(),
                            confidence=0.9,
                            page_count=total_pages,
                            language='pdfplumber_extraction',
                            extraction_method=ExtractionMethod.PDFPLUMBER.value,
                            metadata={
                                'page_number': number,
                                'text_length': len(raw_text)
                            }
                        )
                    except Exception as exc:
                        logger.error(f"PDF第 {number} 页pdfplumber提取失败: {exc}")
                        entry = self._create_empty_result(pdf_path, lang, number, total_pages, str(exc))
                    collected.append(entry)

            return collected

        except ImportError:
            logger.warning("pdfplumber未安装,跳过此方案")
            return []
        except Exception as exc:
            logger.error(f"pdfplumber方案失败: {exc}")
            return []

    def _create_empty_result(self, pdf_path: Path, lang: str, page_num: int,
                             total_pages: int, error: str) -> ExtractionResult:
        """Build a zero-confidence, text-less placeholder for a page that failed.

        The page number and the error text are recorded in ``metadata``.
        """
        fields = {
            'file_path': str(pdf_path),
            'file_type': '.pdf',
            'text_content': '',
            'confidence': 0.0,
            'page_count': total_pages,
            'language': lang,
            'extraction_method': ExtractionMethod.FALLBACK.value,
            'metadata': {'page_number': page_num, 'error': error},
        }
        return ExtractionResult(**fields)

    def _preprocess_image(self, image):
        """Return a grayscale copy of *image* with contrast and sharpness boosted.

        Both enhancements use a factor of 1.5 and are applied in that order.
        """
        from PIL import ImageEnhance

        # Convert to 8-bit grayscale unless the image already is.
        prepared = image if image.mode == 'L' else image.convert('L')

        for enhancer_cls in (ImageEnhance.Contrast, ImageEnhance.Sharpness):
            prepared = enhancer_cls(prepared).enhance(1.5)

        return prepared

    def _calculate_confidence(self, data):
        """计算OCR置信度"""
        if not data or 'conf' not in data:
            return 0.0

        confidences = [float(conf) for conf in data['conf'] if conf != '-1']
        if not confidences:
            return 0.0

        return sum(confidences) / len(confidences) / 100.0

    def batch_extract(self, input_dir: Union[str, Path],
                      output_dir: Optional[Union[str, Path]] = None,
                      file_types: List[str] = None) -> Dict:
        """
        批量提取文件内容

        Args:
            input_dir: 输入目录
            output_dir: 输出目录
            file_types: 支持的文件类型

        Returns:
            Dict: 提取结果统计
        """
        input_dir = Path(input_dir)
        output_dir = Path(output_dir) if output_dir else input_dir / 'extracted'
        output_dir.mkdir(parents=True, exist_ok=True)

        file_types = file_types or ['.jpg', '.jpeg', '.png', '.pdf', '.tiff', '.bmp']

        if not input_dir.exists():
            raise FileNotFoundError(f"输入目录不存在: {input_dir}")

        # 收集所有支持的文件
        files = []
        for ext in file_types:
            files.extend(input_dir.glob(f"*{ext}"))
            files.extend(input_dir.glob(f"*{ext.upper()}"))

        if not files:
            logger.warning(f"在目录 {input_dir} 中未找到支持的文件")
            return {"extracted": 0, "failed": 0, "total": 0}

        results = {"extracted": 0, "failed": 0, "total": len(files)}

        for file_path in files:
            try:
                if file_path.suffix.lower() in ['.jpg', '.jpeg', '.png', '.tiff', '.bmp']:
                    # 图片文件
                    result = self.extract_from_image(file_path)
                    self._save_result(result, output_dir)
                    results["extracted"] += 1

                elif file_path.suffix.lower() == '.pdf':
                    # PDF文件
                    pdf_results = self.extract_from_pdf(file_path)
                    self._save_pdf_results(pdf_results, output_dir, file_path)
                    results["extracted"] += 1

                logger.info(f"成功提取: {file_path.name}")

            except Exception as e:
                results["failed"] += 1
                logger.error(f"提取失败 {file_path.name}: {str(e)}")

        return results

    def _save_result(self, result: ExtractionResult, output_dir: Path):
        """保存单个结果"""
        output_file = output_dir / f"{Path(result.file_path).stem}_extracted.txt"

        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(f"文件: {Path(result.file_path).name}\n")
            f.write(f"类型: {result.file_type}\n")
            f.write(f"置信度: {result.confidence:.2f}\n")
            f.write(f"语言: {result.language}\n")
            f.write(f"提取方法: {result.extraction_method}\n")
            f.write("=" * 50 + "\n")
            f.write(result.text_content)

    def _save_pdf_results(self, results: List[ExtractionResult], output_dir: Path, file_path: Path):
        """保存PDF结果"""
        output_file = output_dir / f"{file_path.stem}_extracted.txt"

        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(f"文件: {file_path.name}\n")
            f.write(f"总页数: {len(results)}\n")
            f.write("=" * 50 + "\n\n")

            for i, result in enumerate(results, 1):
                f.write(f"第 {i} 页 (置信度: {result.confidence:.2f}, 方法: {result.extraction_method})\n")
                f.write("-" * 30 + "\n")
                f.write(result.text_content)
                f.write("\n\n")


# 便捷函数
def quick_extract_image(image_path: str, language: str = None) -> str:
    """OCR a single image with a default-configured processor and return its text."""
    return OCRProcessor().extract_from_image(image_path, language).text_content


def quick_extract_pdf(pdf_path: str, language: str = None) -> str:
    """Extract a PDF with a default-configured processor and return its text.

    Results with blank text are skipped; the rest are joined by a blank line.
    """
    page_results = OCRProcessor().extract_from_pdf(pdf_path, language)
    non_blank = (page.text_content for page in page_results if page.text_content.strip())
    return "\n\n".join(non_blank)


if __name__ == "__main__":
    # Build a processor for Simplified Chinese + English at 200 DPI.
    config = OCRConfig(languages=['chi_sim', 'eng'], dpi=200, enable_preprocessing=True)
    processor = OCRProcessor(config)

    # Report which dependencies were found.
    deps_ok, status = processor.check_dependencies()
    print(f"依赖状态: {deps_ok}")
    print(f"依赖详情: {status}")

    # Smoke test on a sample image; the commented line is the PDF variant.
    try:
        # result = quick_extract_pdf("D:/projects/pdf_test/pdf单页.pdf")
        result = quick_extract_image("D:/projects/pdf_test/4.jpg")
        print("提取结果:", result)
    except Exception as e:
        print(f"提取失败: {e}")
#

四、编写pdfToWord.py

复制代码
import os
import logging
from typing import Dict, Any, Optional
from pathlib import Path
import tempfile
from mcp.server.fastmcp import FastMCP
from starlette.middleware import Middleware as ASGIMiddleware
from starlette.middleware.cors import CORSMiddleware
from mcp.server.transport_security import TransportSecuritySettings
import pytesseract
import fitz
import io
import re
import logging
import subprocess
import sys
from pathlib import Path
from typing import List, Dict, Optional, Union, Tuple
from dataclasses import dataclass
import json
from enum import Enum
from utils.ocrUtil import OCRConfig, OCRProcessor

# Configure logging at import time: INFO level with a timestamped format.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Create a scratch directory under the system temp dir at import time.
# NOTE(review): TEMP_DIR is not referenced by the rest of the code shown here.
TEMP_DIR = Path(tempfile.gettempdir()) / "pdf_mcp_temp"
TEMP_DIR.mkdir(exist_ok=True)

def pdf_to_word(pdf_file_path: str, output_file_path: Optional[str] = None) -> Dict[str, Any]:
    """
    Convert a PDF into a Word document containing its extracted text.

    Text comes from ``OCRProcessor.extract_from_pdf``. Each result becomes a
    "第 N 页" heading followed by its text, with a page break between results.
    The original layout is not preserved.

    :param pdf_file_path: path of the PDF to convert
    :param output_file_path: destination .docx path; when omitted,
        ``<pdf stem>_converted.docx`` is written next to the input file
    :return: dict with ``status`` ("success" or "error") and ``message``; on
        success also ``input_file``, ``output_file``, ``file_size``, ``note``
        and ``pages_extracted``
    :raises Exception: errors raised by ``extract_from_pdf`` propagate to the
        caller, because extraction runs before the guarded section
    """
    pdf_path = Path(pdf_file_path)

    config = OCRConfig(
        languages=['chi_sim', 'eng'],
        dpi=200,
        enable_preprocessing=True
    )
    processor = OCRProcessor(config)
    page_results = processor.extract_from_pdf(pdf_file_path)

    try:
        if output_file_path:
            output_path = Path(output_file_path)
        else:
            # with_name keeps the input's directory and also works for a bare
            # file name; joining with "/" would point at the filesystem root.
            output_path = pdf_path.with_name(f"{pdf_path.stem}_converted.docx")

        try:
            from docx import Document
        except ImportError as e:
            return {
                "status": "error",
                "message": f"缺少必要的依赖包: {str(e)}"
            }

        doc = Document()
        doc.add_heading(f"PDF转换结果: {pdf_path.stem}", 0)

        # Distinct loop names keep page_results intact for the summary below.
        for page_index, page_result in enumerate(page_results, 1):
            if page_index > 1:
                doc.add_page_break()
            doc.add_heading(f"第 {page_index} 页", level=1)
            if page_result.text_content.strip():
                doc.add_paragraph().add_run(page_result.text_content)
            else:
                doc.add_paragraph("本页无文本内容")

        doc.save(output_path)

        return {
            "status": "success",
            "message": "PDF转Word转换成功(OCR文本提取模式)",
            "input_file": str(pdf_path),
            "output_file": str(output_path),
            "file_size": output_path.stat().st_size,
            "note": "此转换使用OCR提取文本内容,可能不保留原始格式",
            "pages_extracted": len(page_results)
        }

    except Exception as e:
        logger.error(f"PDF转Word转换失败: {str(e)}")
        return {
            "status": "error",
            "message": f"转换失败: {str(e)}"
        }



if __name__ == "__main__":
    # Build a processor only to report the dependency status.
    config = OCRConfig(languages=['chi_sim', 'eng'], dpi=200, enable_preprocessing=True)
    processor = OCRProcessor(config)

    deps_ok, status = processor.check_dependencies()
    print(f"依赖状态: {deps_ok}")
    print(f"依赖详情: {status}")

    # Smoke test of the conversion.
    # NOTE(review): the sample path is a .jpg although pdf_to_word expects a
    # PDF; confirm the intended test file.
    try:
        result = pdf_to_word("D:/projects/pdf_test/4.jpg")
        print("提取结果:", result)
    except Exception as e:
        print(f"提取失败: {e}")

五、编写imageToWord.py

复制代码
import os
import logging
from typing import Dict, Any, Optional
from pathlib import Path
import tempfile
from mcp.server.fastmcp import FastMCP
from starlette.middleware import Middleware as ASGIMiddleware
from starlette.middleware.cors import CORSMiddleware
from mcp.server.transport_security import TransportSecuritySettings
import pytesseract
import fitz
import io
import re
import logging
import subprocess
import sys
from pathlib import Path
from typing import List, Dict, Optional, Union, Tuple
from dataclasses import dataclass
import json
from enum import Enum
from utils.ocrUtil import OCRConfig, OCRProcessor

# Configure logging at import time: INFO level with a timestamped format.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Create a scratch directory under the system temp dir at import time.
# NOTE(review): TEMP_DIR is not referenced by the rest of the code shown here.
TEMP_DIR = Path(tempfile.gettempdir()) / "pdf_mcp_temp"
TEMP_DIR.mkdir(exist_ok=True)

def image_to_word(pdf_file_path: str, output_file_path: Optional[str] = None) -> Dict[str, Any]:
    """
    Convert an image into a Word document containing its OCR text.

    Text comes from ``OCRProcessor.extract_from_image`` and is written under
    an "图片内容" heading. The original layout is not preserved.

    :param pdf_file_path: path of the image to convert (the parameter name is
        kept for backward compatibility; it is not a PDF path)
    :param output_file_path: destination .docx path; when omitted,
        ``<image stem>_converted.docx`` is written next to the input file
    :return: dict with ``status`` ("success" or "error") and ``message``; on
        success also ``input_file``, ``output_file``, ``file_size`` and
        ``extraction_method``
    :raises Exception: errors raised by ``extract_from_image`` propagate to
        the caller, because extraction runs before the guarded section
    """
    image_path = Path(pdf_file_path)

    config = OCRConfig(
        languages=['chi_sim', 'eng'],
        dpi=200,
        enable_preprocessing=True
    )
    processor = OCRProcessor(config)
    result = processor.extract_from_image(pdf_file_path)

    try:
        if output_file_path:
            output_path = Path(output_file_path)
        else:
            # with_name keeps the input's directory and also works for a bare
            # file name; joining with "/" would point at the filesystem root.
            output_path = image_path.with_name(f"{image_path.stem}_converted.docx")

        try:
            from docx import Document
        except ImportError as e:
            return {
                "status": "error",
                "message": f"缺少必要的依赖包: {str(e)}"
            }

        doc = Document()
        doc.add_heading(f"图片转换结果: {image_path.stem}", 0)
        doc.add_heading("图片内容", level=1)

        # A single image yields a single result, so no loop is needed.
        if result.text_content.strip():
            doc.add_paragraph().add_run(result.text_content)
        else:
            doc.add_paragraph("图片中未检测到文本内容")

        doc.save(output_path)

        return {
            "status": "success",
            "message": "图片转Word转换成功",
            "input_file": str(image_path),
            "output_file": str(output_path),
            "file_size": output_path.stat().st_size,
            "extraction_method": result.extraction_method
        }

    except Exception as e:
        logger.error(f"图片转Word转换失败: {str(e)}")
        return {
            "status": "error",
            "message": f"转换失败: {str(e)}"
        }



if __name__ == "__main__":
    # Build a processor only to report the dependency status.
    config = OCRConfig(languages=['chi_sim', 'eng'], dpi=200, enable_preprocessing=True)
    processor = OCRProcessor(config)

    deps_ok, status = processor.check_dependencies()
    print(f"依赖状态: {deps_ok}")
    print(f"依赖详情: {status}")

    # Smoke test: convert a sample image to a Word document.
    try:
        result = image_to_word("D:/projects/pdf_test/1.jpg")
        print("提取结果:", result)
    except Exception as e:
        print(f"提取失败: {e}")
相关推荐
一城烟雨_2 小时前
vue3实现将HTML导出为pdf,HTML转换为文件流
vue.js·pdf
5134959218 小时前
在Vue.js项目中使用docx和file-saver实现Word文档导出
前端·vue.js·word
熊明才20 小时前
DeepSeek-OCR VLLM 环境配置指南
ocr·vllm
乐迁~1 天前
如何使用html2canvas和jsPDF库来解决PDF导出时分页内容截断问题(下--表格行截断处理)
pdf·js
拆房老料1 天前
实战复盘:自研 Office / PDF 文档处理平台的高坑预警与 AI Agent 时代架构思考
人工智能·架构·pdf·编辑器·开源软件
开开心心就好1 天前
PDF密码移除工具,免费解除打印编辑复制权限
java·网络·windows·websocket·pdf·电脑·excel
缺点内向1 天前
C# 高效统计 Word 文档字数:告别手动,拥抱自动化
c#·自动化·word
兔兔爱学习兔兔爱学习1 天前
创建CUDA11.8环境部署DeepSeek-OCR
ocr
非凡ghost1 天前
批量转双层PDF(可识别各种语言)
windows·学习·pdf·软件需求