PDF转Word、图片文字转Word(使用OCR工具)

一、开发环境

PyCharm 2025,Python 解释器 3.11

二、windows本地安装依赖包

(1)Poppler下载地址

https://github.com/oschwartz10612/poppler-windows/releases (下载 Release-25.12.0-0.zip;原文中的链接是带签名的临时下载地址,已于 2026-01-19 过期,无法再使用)

(2)tesseract-ocr windows版下载地址

https://digi.bib.uni-mannheim.de/tesseract/tesseract-ocr-w64-setup-v5.3.0.20221214.exe

(3)语言包下载地址(下载所需的 .traineddata 文件,例如本文代码用到的 chi_sim.traineddata 和 eng.traineddata,放到(2)的安装目录 tesseract-ocr/tessdata 下)

https://github.com/tesseract-ocr/tessdata

三、编写ocrUtil.py

复制代码
"""
OCR文字识别工具类 - 使用PyMuPDF替代pdf2image
支持图片文字识别和PDF文字提取
提供多引擎支持:Tesseract OCR + 多种PDF处理方案
"""

import pytesseract
import fitz
import os
import io
import re
import logging
import subprocess
import sys
from pathlib import Path
from typing import List, Dict, Optional, Union, Tuple
from dataclasses import dataclass
import json
from enum import Enum

# Configure logging at import time (INFO level, timestamped format) and
# create the logger used throughout this module.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)


class ExtractionMethod(Enum):
    """Identifiers for the strategy that produced an extraction result."""
    DIRECT = "direct"  # direct text extraction
    PYMUPDF_OCR = "pymupdf_ocr"  # render pages to images with PyMuPDF, then OCR
    PYPDF2 = "pypdf2"  # extraction via PyPDF2
    PDFPLUMBER = "pdfplumber"  # extraction via pdfplumber
    FALLBACK = "fallback"  # fallback / placeholder when extraction failed


@dataclass
class ExtractionResult:
    """Outcome of extracting text from one image or one PDF page."""
    file_path: str  # path of the source file, as a string
    file_type: str  # file suffix, e.g. '.pdf' or the image's lower-cased suffix
    text_content: str  # extracted text
    confidence: float  # score assigned by the extractor (nominally 0-1)
    page_count: int  # total pages of the source document (1 for images)
    language: str  # OCR language string, or a marker such as 'direct_extraction'
    extraction_method: str  # one of the ExtractionMethod values
    metadata: Dict  # extractor-specific extras (page number, image size, error, ...)


@dataclass
class OCRConfig:
    """Settings consumed by OCRProcessor."""
    # Tesseract language codes; OCRProcessor substitutes ['chi_sim', 'eng'] for None.
    languages: Optional[List[str]] = None
    # Optional explicit path to the tesseract executable.
    tesseract_path: Optional[str] = None
    # Resolution used when rendering PDF pages to images for OCR.
    dpi: int = 200
    # Whether images are run through the preprocessing step before OCR.
    enable_preprocessing: bool = True
    # NOTE(review): not read anywhere in the visible code — confirm whether it is still needed.
    fallback_enabled: bool = True


class OCRProcessor:
    """OCR文字处理器 - 使用PyMuPDF版本"""

    def __init__(self, config: OCRConfig = None):
        """
        初始化OCR处理器

        Args:
            config: OCR配置对象
        """
        self.config = config or OCRConfig()
        if self.config.languages is None:
            self.config.languages = ['chi_sim', 'eng']

        self._dependencies_checked = False
        self._dependencies_available = False
        self._tesseract_available = False

    def check_dependencies(self) -> Tuple[bool, Dict]:
        """
        检查OCR依赖

        Returns:
            Tuple[bool, Dict]: (是否可用, 依赖状态详情)
        """
        if self._dependencies_checked:
            return self._dependencies_available, self._get_dependency_status()

        dependency_status = {
            'pytesseract': False,
            'PIL': False,
            'PyMuPDF': False,
            'PyPDF2': False,
            'pdfplumber': False,
            'tesseract': False
        }

        # 检查Python包依赖
        packages_to_check = [
            ('pytesseract', 'pytesseract'),
            ('PIL', 'PIL'),
            ('PyMuPDF', 'fitz'),  # PyMuPDF的导入名是fitz
            ('PyPDF2', 'PyPDF2'),
            ('pdfplumber', 'pdfplumber')
        ]

        for package_name, import_name in packages_to_check:
            try:
                __import__(import_name)
                dependency_status[package_name] = True
            except ImportError:
                logger.warning(f"缺少依赖包: {package_name}")

        # 检查Tesseract OCR是否可用
        dependency_status['tesseract'] = self._check_tesseract_availability()

        # 判断总体可用性(至少需要基本OCR功能)
        self._dependencies_available = (
                dependency_status['pytesseract'] and
                dependency_status['PIL'] and
                dependency_status['tesseract']
        )

        self._dependencies_checked = True
        return self._dependencies_available, dependency_status

    def _check_tesseract_availability(self) -> bool:
        """
        Determine whether a working Tesseract executable can be reached.

        The executable pytesseract currently points at is tried first. If
        that fails and ``config.tesseract_path`` names an existing file,
        pytesseract is redirected to it and the check is repeated. The
        outcome is stored in ``self._tesseract_available``.

        Returns:
            bool: True when a Tesseract version could be obtained.
        """
        # Attempt 1: pytesseract's current command.
        try:
            import pytesseract
            detected = pytesseract.get_tesseract_version()
            if detected:
                self._tesseract_available = True
                logger.info(f"Tesseract OCR可用,版本: {detected}")
                return True
        except Exception as e:
            logger.warning(f"Tesseract OCR检查失败: {e}")

        # Attempt 2: the explicitly configured executable, if it exists on disk.
        custom_cmd = self.config.tesseract_path
        if custom_cmd and os.path.exists(custom_cmd):
            try:
                import pytesseract
                pytesseract.pytesseract.tesseract_cmd = custom_cmd
                detected = pytesseract.get_tesseract_version()
                if detected:
                    self._tesseract_available = True
                    logger.info(f"自定义Tesseract路径可用: {self.config.tesseract_path}")
                    return True
            except Exception as e:
                logger.warning(f"自定义Tesseract路径检查失败: {e}")

        self._tesseract_available = False
        logger.error("Tesseract OCR不可用,OCR功能将无法使用")
        return False

    def _get_dependency_status(self) -> Dict:
        """获取依赖状态详情"""
        return {
            'pytesseract': self._check_package_available('pytesseract'),
            'PIL': self._check_package_available('PIL'),
            'PyMuPDF': self._check_package_available('fitz'),
            'PyPDF2': self._check_package_available('PyPDF2'),
            'pdfplumber': self._check_package_available('pdfplumber'),
            'tesseract': self._tesseract_available
        }

    def _check_package_available(self, package_name: str) -> bool:
        """检查Python包是否可用"""
        try:
            __import__(package_name)
            return True
        except ImportError:
            return False

    def extract_from_image(self, image_path: Union[str, Path],
                           language: str = None) -> ExtractionResult:
        """
        Run Tesseract OCR on a single image file.

        Args:
            image_path: Path of the image to read.
            language: Tesseract language string (e.g. ``'chi_sim+eng'``).
                When omitted, the configured languages are joined with ``+``.

        Returns:
            ExtractionResult: The recognised text, its mean confidence and
            image metadata. ``page_count`` is always 1.

        Raises:
            ImportError: If the OCR dependencies are not available.
            FileNotFoundError: If ``image_path`` does not exist.
            Exception: Any error raised while opening or recognising the
                image is logged and re-raised unchanged.
        """
        deps_ok, _ = self.check_dependencies()
        if not deps_ok:
            raise ImportError("缺少必要的OCR依赖包,请先安装依赖")

        from PIL import Image
        import pytesseract

        image_path = Path(image_path)
        if not image_path.exists():
            raise FileNotFoundError(f"图片文件不存在: {image_path}")

        lang = language or '+'.join(self.config.languages)

        try:
            # The context manager releases the underlying file handle even if
            # OCR fails; previously the image was left open.
            with Image.open(image_path) as image:
                # Captured up front so the metadata describes the original
                # image rather than the preprocessed (grayscale) copy.
                image_size = image.size
                image_mode = image.mode

                if self.config.enable_preprocessing:
                    processed_image = self._preprocess_image(image)
                else:
                    processed_image = image

                text = pytesseract.image_to_string(processed_image, lang=lang)

                # A second pass yields per-word data from which the mean
                # confidence is derived.
                data = pytesseract.image_to_data(processed_image, lang=lang,
                                                 output_type=pytesseract.Output.DICT)
                confidence = self._calculate_confidence(data)

            return ExtractionResult(
                file_path=str(image_path),
                file_type=image_path.suffix.lower(),
                text_content=text.strip(),
                confidence=confidence,
                page_count=1,
                language=lang,
                extraction_method=ExtractionMethod.DIRECT.value,
                metadata={
                    'image_size': image_size,
                    'image_mode': image_mode,
                    'ocr_engine': 'tesseract',
                    'preprocessing': self.config.enable_preprocessing
                }
            )

        except Exception as e:
            logger.error(f"图片OCR提取失败: {e}")
            raise

    def extract_from_pdf(self, pdf_path: Union[str, Path],
                         language: str = None,
                         use_ocr: bool = True) -> List[ExtractionResult]:
        """
        从PDF中提取文字

        Args:
            pdf_path: PDF文件路径
            language: OCR语言
            use_ocr: 是否使用OCR(针对扫描版PDF)

        Returns:
            List[ExtractionResult]: 每页的提取结果列表
        """
        deps_ok, status = self.check_dependencies()
        if not deps_ok:
            raise ImportError("缺少必要的OCR依赖包,请先安装依赖")

        pdf_path = Path(pdf_path)
        if not pdf_path.exists():
            raise FileNotFoundError(f"PDF文件不存在: {pdf_path}")

        lang = language or '+'.join(self.config.languages)

        # 尝试多种提取方案(优先使用PyMuPDF)
        extraction_methods = [
            (self._extract_with_pymupdf_ocr, "PyMuPDF OCR提取"),
            (self._extract_pdf_directly, "直接文本提取"),
            (self._extract_with_pypdf2, "PyPDF2提取"),
            (self._extract_with_pdfplumber, "pdfplumber提取")
        ]

        for method, description in extraction_methods:
            try:
                if method == self._extract_with_pymupdf_ocr and not use_ocr:
                    continue  # 跳过OCR方案

                results = method(pdf_path, lang)
                if results and any(r.text_content.strip() for r in results):
                    logger.info(f"使用{description}方案成功")
                    # print("q",results)
                    return results

            except Exception as e:
                logger.warning(f"{description}方案失败: {e}")

        # 所有方案都失败
        logger.error("所有PDF提取方案都失败")
        return [ExtractionResult(
            file_path=str(pdf_path),
            file_type='.pdf',
            text_content='',
            confidence=0.0,
            page_count=0,
            language=lang,
            extraction_method=ExtractionMethod.FALLBACK.value,
            metadata={'error': '所有提取方案都失败'}
        )]

    def _extract_with_pymupdf_ocr(self, pdf_path: Path, lang: str) -> List[ExtractionResult]:
        """
        Render every PDF page to an image with PyMuPDF and OCR it.

        Args:
            pdf_path: Path of the PDF file.
            lang: Tesseract language string.

        Returns:
            List[ExtractionResult]: One entry per page. A page whose OCR
            fails contributes an empty FALLBACK result carrying the error
            text. An empty list is returned when a required package is
            missing or the document cannot be processed at all.
        """
        try:
            import fitz  # PyMuPDF
            from PIL import Image
            import pytesseract

            doc = fitz.open(pdf_path)
            try:
                page_count = len(doc)

                # PDF user space is 72 units per inch, so dpi / 72 is the
                # zoom factor that renders at the configured resolution.
                zoom = self.config.dpi / 72
                render_matrix = fitz.Matrix(zoom, zoom)

                results = []
                for page_num in range(page_count):
                    try:
                        page = doc[page_num]

                        # Rasterise the page and hand the PPM bytes to PIL.
                        pix = page.get_pixmap(matrix=render_matrix)
                        image = Image.open(io.BytesIO(pix.tobytes("ppm")))

                        if self.config.enable_preprocessing:
                            processed_image = self._preprocess_image(image)
                        else:
                            processed_image = image

                        text = pytesseract.image_to_string(processed_image, lang=lang)

                        # Per-word data is needed for the confidence score.
                        data = pytesseract.image_to_data(processed_image, lang=lang,
                                                         output_type=pytesseract.Output.DICT)
                        confidence = self._calculate_confidence(data)

                        results.append(ExtractionResult(
                            file_path=str(pdf_path),
                            file_type='.pdf',
                            text_content=text.strip(),
                            confidence=confidence,
                            page_count=page_count,
                            language=lang,
                            extraction_method=ExtractionMethod.PYMUPDF_OCR.value,
                            metadata={
                                'page_number': page_num + 1,
                                'image_size': image.size,
                                'dpi': self.config.dpi
                            }
                        ))

                    except Exception as e:
                        # One bad page must not abort the whole document.
                        logger.error(f"PDF第 {page_num + 1} 页OCR失败: {e}")
                        results.append(self._create_empty_result(pdf_path, lang, page_num + 1, page_count, str(e)))

                return results
            finally:
                # Always release the document, including on unexpected errors.
                doc.close()

        except ImportError:
            logger.warning("PyMuPDF未安装,跳过此方案")
            return []
        except Exception as e:
            logger.error(f"PyMuPDF方案失败: {e}")
            return []

    def _extract_pdf_directly(self, pdf_path: Path, lang: str) -> List[ExtractionResult]:
        """
        Read the embedded text layer of each page with PyPDF2.

        Args:
            pdf_path: Path of the PDF file.
            lang: Unused here; accepted so that all strategies share one
                signature.

        Returns:
            List[ExtractionResult]: One result per page with confidence 1.0,
            or an empty list if anything goes wrong (the error is logged).
        """
        try:
            import PyPDF2

            with open(pdf_path, 'rb') as stream:
                reader = PyPDF2.PdfReader(stream)
                total_pages = len(reader.pages)

                def build_result(index: int) -> ExtractionResult:
                    # Pages without a text layer may yield None; normalise to "".
                    raw_text = reader.pages[index].extract_text() or ""
                    return ExtractionResult(
                        file_path=str(pdf_path),
                        file_type='.pdf',
                        text_content=raw_text.strip(),
                        confidence=1.0,
                        page_count=total_pages,
                        language='direct_extraction',
                        extraction_method=ExtractionMethod.DIRECT.value,
                        metadata={
                            'page_number': index + 1,
                            'text_length': len(raw_text)
                        }
                    )

                # Must run while the file is still open.
                page_results = [build_result(index) for index in range(total_pages)]

            return page_results

        except Exception as e:
            logger.warning(f"直接提取PDF文本失败: {e}")
            return []

    def _extract_with_pypdf2(self, pdf_path: Path, lang: str) -> List[ExtractionResult]:
        """Extract PDF text with PyPDF2 by delegating to the direct-extraction method."""
        page_results = self._extract_pdf_directly(pdf_path, lang)
        return page_results

    def _extract_with_pdfplumber(self, pdf_path: Path, lang: str) -> List[ExtractionResult]:
        """
        Read the text layer of each page with pdfplumber.

        Args:
            pdf_path: Path of the PDF file.
            lang: Only used to label the placeholder result of a failed page.

        Returns:
            List[ExtractionResult]: One result per page with a fixed
            confidence of 0.9; a page that fails yields an empty FALLBACK
            result. An empty list is returned when pdfplumber is missing or
            the file cannot be processed.
        """
        try:
            import pdfplumber

            with pdfplumber.open(pdf_path) as pdf:
                pages = pdf.pages
                total_pages = len(pages)
                collected = []

                page_number = 0
                for page in pages:
                    page_number += 1
                    try:
                        # extract_text() may return None for empty pages.
                        raw_text = page.extract_text() or ""
                        collected.append(ExtractionResult(
                            file_path=str(pdf_path),
                            file_type='.pdf',
                            text_content=raw_text.strip(),
                            confidence=0.9,
                            page_count=total_pages,
                            language='pdfplumber_extraction',
                            extraction_method=ExtractionMethod.PDFPLUMBER.value,
                            metadata={
                                'page_number': page_number,
                                'text_length': len(raw_text)
                            }
                        ))
                    except Exception as e:
                        logger.error(f"PDF第 {page_number} 页pdfplumber提取失败: {e}")
                        collected.append(
                            self._create_empty_result(pdf_path, lang, page_number, total_pages, str(e))
                        )

                return collected

        except ImportError:
            logger.warning("pdfplumber未安装,跳过此方案")
            return []
        except Exception as e:
            logger.error(f"pdfplumber方案失败: {e}")
            return []

    def _create_empty_result(self, pdf_path: Path, lang: str, page_num: int,
                             total_pages: int, error: str) -> ExtractionResult:
        """
        Build the placeholder result recorded for a page that failed.

        The result has no text, zero confidence, the FALLBACK method, and
        metadata holding the page number and the error message.
        """
        failure_details = {'page_number': page_num, 'error': error}
        placeholder = ExtractionResult(
            file_path=str(pdf_path),
            file_type='.pdf',
            text_content='',
            confidence=0.0,
            page_count=total_pages,
            language=lang,
            extraction_method=ExtractionMethod.FALLBACK.value,
            metadata=failure_details,
        )
        return placeholder

    def _preprocess_image(self, image):
        """
        Prepare an image for OCR.

        The image is converted to grayscale (unless it already is), then
        contrast and sharpness are each enhanced by a factor of 1.5.

        Returns:
            The processed PIL image.
        """
        from PIL import ImageEnhance

        grayscale = image if image.mode == 'L' else image.convert('L')
        contrasted = ImageEnhance.Contrast(grayscale).enhance(1.5)
        return ImageEnhance.Sharpness(contrasted).enhance(1.5)

    def _calculate_confidence(self, data):
        """计算OCR置信度"""
        if not data or 'conf' not in data:
            return 0.0

        confidences = [float(conf) for conf in data['conf'] if conf != '-1']
        if not confidences:
            return 0.0

        return sum(confidences) / len(confidences) / 100.0

    def batch_extract(self, input_dir: Union[str, Path],
                      output_dir: Optional[Union[str, Path]] = None,
                      file_types: List[str] = None) -> Dict:
        """
        批量提取文件内容

        Args:
            input_dir: 输入目录
            output_dir: 输出目录
            file_types: 支持的文件类型

        Returns:
            Dict: 提取结果统计
        """
        input_dir = Path(input_dir)
        output_dir = Path(output_dir) if output_dir else input_dir / 'extracted'
        output_dir.mkdir(parents=True, exist_ok=True)

        file_types = file_types or ['.jpg', '.jpeg', '.png', '.pdf', '.tiff', '.bmp']

        if not input_dir.exists():
            raise FileNotFoundError(f"输入目录不存在: {input_dir}")

        # 收集所有支持的文件
        files = []
        for ext in file_types:
            files.extend(input_dir.glob(f"*{ext}"))
            files.extend(input_dir.glob(f"*{ext.upper()}"))

        if not files:
            logger.warning(f"在目录 {input_dir} 中未找到支持的文件")
            return {"extracted": 0, "failed": 0, "total": 0}

        results = {"extracted": 0, "failed": 0, "total": len(files)}

        for file_path in files:
            try:
                if file_path.suffix.lower() in ['.jpg', '.jpeg', '.png', '.tiff', '.bmp']:
                    # 图片文件
                    result = self.extract_from_image(file_path)
                    self._save_result(result, output_dir)
                    results["extracted"] += 1

                elif file_path.suffix.lower() == '.pdf':
                    # PDF文件
                    pdf_results = self.extract_from_pdf(file_path)
                    self._save_pdf_results(pdf_results, output_dir, file_path)
                    results["extracted"] += 1

                logger.info(f"成功提取: {file_path.name}")

            except Exception as e:
                results["failed"] += 1
                logger.error(f"提取失败 {file_path.name}: {str(e)}")

        return results

    def _save_result(self, result: ExtractionResult, output_dir: Path):
        """保存单个结果"""
        output_file = output_dir / f"{Path(result.file_path).stem}_extracted.txt"

        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(f"文件: {Path(result.file_path).name}\n")
            f.write(f"类型: {result.file_type}\n")
            f.write(f"置信度: {result.confidence:.2f}\n")
            f.write(f"语言: {result.language}\n")
            f.write(f"提取方法: {result.extraction_method}\n")
            f.write("=" * 50 + "\n")
            f.write(result.text_content)

    def _save_pdf_results(self, results: List[ExtractionResult], output_dir: Path, file_path: Path):
        """保存PDF结果"""
        output_file = output_dir / f"{file_path.stem}_extracted.txt"

        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(f"文件: {file_path.name}\n")
            f.write(f"总页数: {len(results)}\n")
            f.write("=" * 50 + "\n\n")

            for i, result in enumerate(results, 1):
                f.write(f"第 {i} 页 (置信度: {result.confidence:.2f}, 方法: {result.extraction_method})\n")
                f.write("-" * 30 + "\n")
                f.write(result.text_content)
                f.write("\n\n")


# 便捷函数
def quick_extract_image(image_path: str, language: str = None) -> str:
    """Return the text recognised in an image, using a default-configured OCRProcessor."""
    return OCRProcessor().extract_from_image(image_path, language).text_content


def quick_extract_pdf(pdf_path: str, language: str = None) -> str:
    """
    Return the text of a PDF, using a default-configured OCRProcessor.

    Pages whose text is blank are skipped; the remaining page texts are
    joined with a blank line between them.
    """
    page_results = OCRProcessor().extract_from_pdf(pdf_path, language)

    non_blank_texts = []
    for page in page_results:
        if page.text_content.strip():
            non_blank_texts.append(page.text_content)

    return "\n\n".join(non_blank_texts)


if __name__ == "__main__":
    # Manual smoke test: build a processor with explicit settings.
    config = OCRConfig(
        languages=['chi_sim', 'eng'],
        dpi=200,
        enable_preprocessing=True
    )

    processor = OCRProcessor(config)

    # Report which dependencies are available.
    deps_ok, status = processor.check_dependencies()
    print(f"依赖状态: {deps_ok}")
    print(f"依赖详情: {status}")

    # Try an extraction against a local sample file (hard-coded developer path).
    try:
        # result = quick_extract_pdf("D:/projects/pdf_test/pdf单页.pdf")
        result = quick_extract_image("D:/projects/pdf_test/4.jpg")
        print("提取结果:", result)
    except Exception as e:
        print(f"提取失败: {e}")
#

四、编写pdfToWord.py

复制代码
import os
import logging
from typing import Dict, Any, Optional
from pathlib import Path
import tempfile
from mcp.server.fastmcp import FastMCP
from starlette.middleware import Middleware as ASGIMiddleware
from starlette.middleware.cors import CORSMiddleware
from mcp.server.transport_security import TransportSecuritySettings
import pytesseract
import fitz
import io
import re
import logging
import subprocess
import sys
from pathlib import Path
from typing import List, Dict, Optional, Union, Tuple
from dataclasses import dataclass
import json
from enum import Enum
from utils.ocrUtil import OCRConfig, OCRProcessor

# Configure logging at import time and create this module's logger.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Scratch directory under the system temp dir; created as an import side effect.
# NOTE(review): TEMP_DIR is not referenced by the code visible here — confirm it is still needed.
TEMP_DIR = Path(tempfile.gettempdir()) / "pdf_mcp_temp"
TEMP_DIR.mkdir(exist_ok=True)

def pdf_to_word(pdf_file_path: str, output_file_path: Optional[str] = None) -> Dict[str, Any]:
    """
    Convert a PDF into a Word document containing its extracted text.

    Text is obtained through ``OCRProcessor.extract_from_pdf``; each page
    becomes a "第 N 页" section in the document. Original layout is not
    preserved.

    :param pdf_file_path: Path of the PDF file.
    :param output_file_path: Target ``.docx`` path. When omitted, the file is
        written next to the PDF as ``<stem>_converted.docx``.
    :return: A status dict. On success it holds ``status``, ``message``,
        ``input_file``, ``output_file``, ``file_size``, ``note`` and
        ``pages_extracted``; on failure ``status`` is ``"error"`` and
        ``message`` explains why.
    :raises ImportError: Propagated from the extraction step when OCR
        dependencies are missing.
    :raises FileNotFoundError: Propagated from the extraction step when the
        PDF does not exist.
    """
    pdf_path = Path(pdf_file_path)

    config = OCRConfig(
        languages=['chi_sim', 'eng'],
        dpi=200,
        enable_preprocessing=True
    )

    processor = OCRProcessor(config)
    # Extraction errors propagate to the caller, exactly as before.
    page_results = processor.extract_from_pdf(pdf_file_path)

    try:
        if output_file_path:
            output_path = Path(output_file_path)
        else:
            # with_name keeps the PDF's own directory. Concatenating
            # dirname + "/" pointed at the filesystem root whenever the
            # input path had no directory component.
            output_path = pdf_path.with_name(f"{pdf_path.stem}_converted.docx")

        try:
            from docx import Document
        except ImportError as e:
            return {
                "status": "error",
                "message": f"缺少必要的依赖包: {str(e)}"
            }

        doc = Document()
        doc.add_heading(f"PDF转换结果: {pdf_path.stem}", 0)

        # A distinct loop variable is essential: reusing the list's own name
        # made the later len() call operate on a single page result and
        # turned every successful conversion into a reported error.
        for page_index, page_result in enumerate(page_results, 1):
            if page_index > 1:  # start every page after the first on a new page
                doc.add_page_break()
            doc.add_heading(f"第 {page_index} 页", level=1)
            if page_result.text_content.strip():
                doc.add_paragraph().add_run(page_result.text_content)
            else:
                doc.add_paragraph("本页无文本内容")

        doc.save(output_path)

        return {
            "status": "success",
            "message": "PDF转Word转换成功(OCR文本提取模式)",
            "input_file": str(pdf_path),
            "output_file": str(output_path),
            "file_size": output_path.stat().st_size,
            "note": "此转换使用OCR提取文本内容,可能不保留原始格式",
            "pages_extracted": len(page_results)
        }

    except Exception as e:
        logger.error(f"PDF转Word转换失败: {str(e)}")
        return {
            "status": "error",
            "message": f"转换失败: {str(e)}"
        }



if __name__ == "__main__":
    # Manual smoke test: build a processor with explicit settings.
    config = OCRConfig(
        languages=['chi_sim', 'eng'],
        dpi=200,
        enable_preprocessing=True
    )

    processor = OCRProcessor(config)

    # Report which dependencies are available.
    deps_ok, status = processor.check_dependencies()
    print(f"依赖状态: {deps_ok}")
    print(f"依赖详情: {status}")

    # Run a conversion against a local sample file (hard-coded developer path).
    # NOTE(review): this passes a .jpg to the PDF converter — confirm the sample path.
    try:
        result = pdf_to_word("D:/projects/pdf_test/4.jpg")
        print("提取结果:", result)
    except Exception as e:
        print(f"提取失败: {e}")

五、编写imageToWord.py

复制代码
import os
import logging
from typing import Dict, Any, Optional
from pathlib import Path
import tempfile
from mcp.server.fastmcp import FastMCP
from starlette.middleware import Middleware as ASGIMiddleware
from starlette.middleware.cors import CORSMiddleware
from mcp.server.transport_security import TransportSecuritySettings
import pytesseract
import fitz
import io
import re
import logging
import subprocess
import sys
from pathlib import Path
from typing import List, Dict, Optional, Union, Tuple
from dataclasses import dataclass
import json
from enum import Enum
from utils.ocrUtil import OCRConfig, OCRProcessor

# Configure logging at import time and create this module's logger.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Scratch directory under the system temp dir; created as an import side effect.
# NOTE(review): TEMP_DIR is not referenced by the code visible here — confirm it is still needed.
TEMP_DIR = Path(tempfile.gettempdir()) / "pdf_mcp_temp"
TEMP_DIR.mkdir(exist_ok=True)

def image_to_word(pdf_file_path: str, output_file_path: Optional[str] = None) -> Dict[str, Any]:
    """
    Convert an image into a Word document containing its OCR text.

    :param pdf_file_path: Path of the IMAGE file. The parameter keeps its
        historical name for backward compatibility with keyword callers.
    :param output_file_path: Target ``.docx`` path. When omitted, the file is
        written next to the image as ``<stem>_converted.docx``.
    :return: A status dict. On success it holds ``status``, ``message``,
        ``input_file``, ``output_file``, ``file_size`` and
        ``extraction_method``; on failure ``status`` is ``"error"`` and
        ``message`` explains why.
    :raises ImportError: Propagated from the OCR step when dependencies are
        missing.
    :raises FileNotFoundError: Propagated from the OCR step when the image
        does not exist.
    """
    image_path = Path(pdf_file_path)

    config = OCRConfig(
        languages=['chi_sim', 'eng'],
        dpi=200,
        enable_preprocessing=True
    )

    processor = OCRProcessor(config)
    # OCR errors propagate to the caller, exactly as before.
    result = processor.extract_from_image(pdf_file_path)

    try:
        if output_file_path:
            output_path = Path(output_file_path)
        else:
            # with_name keeps the image's own directory. Concatenating
            # dirname + "/" pointed at the filesystem root whenever the
            # input path had no directory component.
            output_path = image_path.with_name(f"{image_path.stem}_converted.docx")

        try:
            from docx import Document
        except ImportError as e:
            return {
                "status": "error",
                "message": f"缺少必要的依赖包: {str(e)}"
            }

        doc = Document()
        doc.add_heading(f"图片转换结果: {image_path.stem}", 0)
        doc.add_heading("图片内容", level=1)

        # A single image yields a single result, so no paging is needed.
        if result.text_content.strip():
            doc.add_paragraph().add_run(result.text_content)
        else:
            doc.add_paragraph("图片中未检测到文本内容")

        doc.save(output_path)

        return {
            "status": "success",
            "message": "图片转Word转换成功",
            "input_file": str(image_path),
            "output_file": str(output_path),
            "file_size": output_path.stat().st_size,
            "extraction_method": result.extraction_method
        }

    except Exception as e:
        logger.error(f"图片转Word转换失败: {str(e)}")
        return {
            "status": "error",
            "message": f"转换失败: {str(e)}"
        }



if __name__ == "__main__":
    # Manual smoke test: build a processor with explicit settings.
    config = OCRConfig(
        languages=['chi_sim', 'eng'],
        dpi=200,
        enable_preprocessing=True
    )

    processor = OCRProcessor(config)

    # Report which dependencies are available.
    deps_ok, status = processor.check_dependencies()
    print(f"依赖状态: {deps_ok}")
    print(f"依赖详情: {status}")

    # Run an image-to-Word conversion against a local sample file (hard-coded developer path).
    try:
        result = image_to_word("D:/projects/pdf_test/1.jpg")
        print("提取结果:", result)
    except Exception as e:
        print(f"提取失败: {e}")
相关推荐
2601_961875243 小时前
高考真题word版下载|2025高考全科真题可编辑文档
c#·word·ar·vr·mr·高考·oneflow
2601_9618752412 小时前
高考真题电子版|2025高考全科真题分类PDF
金融·pdf·云计算·azure·七牛云存储·交友·高考
质造者12 小时前
Python 本地 RAG 实战 | Ollama+ChromaDB 实现 PDF 离线智能问答
开发语言·python·pdf·大模型·rag
王莎莎-MinerU12 小时前
从 OCR 到 Context Engineering:用 MinerU 搭一个可复现文档解析评测
人工智能·深度学习·机器学习·pdf·ocr·个人开发
DS随心转小程序14 小时前
AI导出鸭 从 Markdown 草稿到精品 Word 文档的无损之道
人工智能·word·豆包·deepseek·ai导出鸭
AI人工智能+14 小时前
往来港澳通行证识别系统,深度融合计算机视觉与自然语言处理,为“智慧口岸”和“数字政务”提供了强有力的技术支撑
人工智能·深度学习·ocr·往来港澳通行证识别
打小就很皮...14 小时前
基于 Python + LangChain + React 实现智能发票识别与验真系统实战
前端·react.js·langchain·ocr·发票识别
weixin_3077791314 小时前
从切片迷宫到结构化智能:AI Agent解析PDF的完整范式
图像处理·人工智能·python·自动化·ocr
asdzx6715 小时前
使用 C# 轻松为 Word 文档添加数字签名
c#·word
天天代码码天天15 小时前
用 OpenCV 5 DNN 跑 PP-OCR:一个适合新手学习的 C++ 动态库 + C# 可视化测试项目
opencv·ocr·dnn·opencv5·ppocrv6