一、开发环境
PyCharm 2025,Python 解释器 3.11
二、windows本地安装依赖包
(1)Poppler下载地址
(2)tesseract-ocr windows版下载地址
https://digi.bib.uni-mannheim.de/tesseract/tesseract-ocr-w64-setup-v5.3.0.20221214.exe
(3)语言包下载地址(下载后放到第(2)步安装目录下的 tessdata 文件夹,即 tesseract-ocr/tessdata)
https://github.com/tesseract-ocr/tessdata
"""
OCR文字识别工具类 - 使用PyMuPDF替代pdf2image
支持图片文字识别和PDF文字提取
提供多引擎支持:Tesseract OCR + 多种PDF处理方案
"""
import pytesseract
import fitz
import os
import io
import re
import logging
import subprocess
import sys
from pathlib import Path
from typing import List, Dict, Optional, Union, Tuple
from dataclasses import dataclass
import json
from enum import Enum
# Configure process-wide logging once at import time, then create the logger
# shared by everything in this module.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)
class ExtractionMethod(Enum):
    """Identifiers for the strategy that produced an extraction result.

    The string value of a member is what gets stored in
    ``ExtractionResult.extraction_method``.
    """

    # Direct text extraction.
    DIRECT = "direct"
    # Page rendered to an image with PyMuPDF, then OCR.
    PYMUPDF_OCR = "pymupdf_ocr"
    # Extraction with PyPDF2.
    PYPDF2 = "pypdf2"
    # Extraction with pdfplumber.
    PDFPLUMBER = "pdfplumber"
    # Fallback marker (used for empty/failed results).
    FALLBACK = "fallback"
@dataclass
class ExtractionResult:
    """Outcome of extracting text from one image or one PDF page.

    Attributes:
        file_path: Path of the source file, stored as a string.
        file_type: File suffix of the source, e.g. ``'.pdf'``.
        text_content: The extracted text (stripped by the producers).
        confidence: Score between 0.0 and 1.0 assigned by the producer.
        page_count: Total number of pages in the source (1 for images).
        language: OCR language string, or a marker naming the text-layer
            extractor that was used.
        extraction_method: Value of the ``ExtractionMethod`` member used.
        metadata: Free-form extra details (page number, image size, error
            message, ...).
    """

    file_path: str
    file_type: str
    text_content: str
    confidence: float
    page_count: int
    language: str
    extraction_method: str
    metadata: Dict
@dataclass
class OCRConfig:
    """Settings that control how ``OCRProcessor`` runs OCR.

    Attributes:
        languages: Tesseract language codes to use. ``None`` means "not
            configured"; ``OCRProcessor`` then substitutes its own default.
        tesseract_path: Optional path to the Tesseract executable, tried
            when the default lookup fails.
        dpi: Resolution used when rendering PDF pages to images.
        enable_preprocessing: Whether images are grayscaled/enhanced before
            being passed to Tesseract.
        fallback_enabled: Flag reserved for fallback handling.
            NOTE(review): not read anywhere in the visible code.
    """

    # Both fields legitimately default to None, so they are annotated as
    # Optional instead of relying on implicit-Optional.
    languages: Optional[List[str]] = None
    tesseract_path: Optional[str] = None
    dpi: int = 200
    enable_preprocessing: bool = True
    fallback_enabled: bool = True
class OCRProcessor:
"""OCR文字处理器 - 使用PyMuPDF版本"""
def __init__(self, config: OCRConfig = None):
    """Create a processor, filling in defaults for anything not configured.

    Args:
        config: OCR settings. A default ``OCRConfig`` is created when this
            is omitted. If its ``languages`` is ``None`` it is set (on the
            passed object) to ``['chi_sim', 'eng']``.
    """
    # Dependency probing is lazy; these flags are filled in by
    # check_dependencies() on first use.
    self._tesseract_available = False
    self._dependencies_available = False
    self._dependencies_checked = False

    settings = config if config else OCRConfig()
    if settings.languages is None:
        settings.languages = ['chi_sim', 'eng']
    self.config = settings
def check_dependencies(self) -> Tuple[bool, Dict]:
    """Probe the Python packages and the Tesseract binary needed for OCR.

    The overall verdict is computed once and cached on the instance; later
    calls return the cached verdict with a freshly built status report.

    Returns:
        Tuple[bool, Dict]: ``(usable, status)`` where ``usable`` requires
        pytesseract, PIL and a working Tesseract, and ``status`` maps each
        dependency name to whether it was found.
    """
    if self._dependencies_checked:
        return self._dependencies_available, self._get_dependency_status()

    # Report label -> importable module name (PyMuPDF is imported as fitz).
    python_modules = {
        'pytesseract': 'pytesseract',
        'PIL': 'PIL',
        'PyMuPDF': 'fitz',
        'PyPDF2': 'PyPDF2',
        'pdfplumber': 'pdfplumber',
    }
    report = dict.fromkeys(python_modules, False)
    report['tesseract'] = False

    for label, module_name in python_modules.items():
        try:
            __import__(module_name)
        except ImportError:
            logger.warning(f"缺少依赖包: {label}")
        else:
            report[label] = True

    # The Tesseract executable is probed separately from the Python wrapper.
    report['tesseract'] = self._check_tesseract_availability()

    # Only the basic OCR stack is mandatory; the PDF libraries are optional.
    self._dependencies_available = (
        report['pytesseract'] and report['PIL'] and report['tesseract']
    )
    self._dependencies_checked = True
    return self._dependencies_available, report
def _check_tesseract_availability(self) -> bool:
    """Determine whether a working Tesseract executable can be reached.

    The command pytesseract is currently set to use is tried first; if
    that fails and ``config.tesseract_path`` points at an existing file,
    pytesseract is switched to that path and probed again.

    Returns:
        bool: ``True`` when a Tesseract version could be read. The result
        is also stored in ``self._tesseract_available``.
    """
    # Attempt 1: pytesseract's current command.
    try:
        import pytesseract
        detected = pytesseract.get_tesseract_version()
        if detected:
            self._tesseract_available = True
            logger.info(f"Tesseract OCR可用,版本: {detected}")
            return True
    except Exception as exc:
        logger.warning(f"Tesseract OCR检查失败: {exc}")

    # Attempt 2: the user-configured executable, if it exists on disk.
    custom_cmd = self.config.tesseract_path
    if custom_cmd and os.path.exists(custom_cmd):
        try:
            import pytesseract
            pytesseract.pytesseract.tesseract_cmd = custom_cmd
            if pytesseract.get_tesseract_version():
                self._tesseract_available = True
                logger.info(f"自定义Tesseract路径可用: {custom_cmd}")
                return True
        except Exception as exc:
            logger.warning(f"自定义Tesseract路径检查失败: {exc}")

    logger.error("Tesseract OCR不可用,OCR功能将无法使用")
    self._tesseract_available = False
    return False
def _get_dependency_status(self) -> Dict:
"""获取依赖状态详情"""
return {
'pytesseract': self._check_package_available('pytesseract'),
'PIL': self._check_package_available('PIL'),
'PyMuPDF': self._check_package_available('fitz'),
'PyPDF2': self._check_package_available('PyPDF2'),
'pdfplumber': self._check_package_available('pdfplumber'),
'tesseract': self._tesseract_available
}
def _check_package_available(self, package_name: str) -> bool:
"""检查Python包是否可用"""
try:
__import__(package_name)
return True
except ImportError:
return False
def extract_from_image(self, image_path: Union[str, Path],
                       language: str = None) -> ExtractionResult:
    """Run Tesseract OCR on a single image file.

    Args:
        image_path: Path of the image to read.
        language: Tesseract language string. When omitted, the configured
            languages are joined with ``+``.

    Returns:
        ExtractionResult: Recognised text, averaged confidence and image
        metadata. ``page_count`` is always 1.

    Raises:
        ImportError: If the OCR dependencies are not available.
        FileNotFoundError: If ``image_path`` does not exist.
        Exception: Any error raised while opening or recognising the image
            is logged and re-raised unchanged.
    """
    deps_ok, _ = self.check_dependencies()
    if not deps_ok:
        raise ImportError("缺少必要的OCR依赖包,请先安装依赖")

    from PIL import Image
    import pytesseract

    image_path = Path(image_path)
    if not image_path.exists():
        raise FileNotFoundError(f"图片文件不存在: {image_path}")

    lang = language or '+'.join(self.config.languages)

    try:
        # Image.open keeps the underlying file open; the context manager
        # guarantees the handle is released even if OCR raises.
        with Image.open(image_path) as image:
            if self.config.enable_preprocessing:
                processed_image = self._preprocess_image(image)
            else:
                processed_image = image

            text = pytesseract.image_to_string(processed_image, lang=lang)

            # A second pass in "data" mode yields per-box confidences.
            data = pytesseract.image_to_data(
                processed_image, lang=lang,
                output_type=pytesseract.Output.DICT,
            )
            confidence = self._calculate_confidence(data)

            return ExtractionResult(
                file_path=str(image_path),
                file_type=image_path.suffix.lower(),
                text_content=text.strip(),
                confidence=confidence,
                page_count=1,
                language=lang,
                extraction_method=ExtractionMethod.DIRECT.value,
                metadata={
                    'image_size': image.size,
                    'image_mode': image.mode,
                    'ocr_engine': 'tesseract',
                    'preprocessing': self.config.enable_preprocessing
                }
            )
    except Exception as e:
        logger.error(f"图片OCR提取失败: {e}")
        raise
def extract_from_pdf(self, pdf_path: Union[str, Path],
                     language: str = None,
                     use_ocr: bool = True) -> List[ExtractionResult]:
    """Extract text from a PDF, trying several strategies in order.

    Strategies are attempted in this order and the first one that yields
    any non-blank text wins: PyMuPDF render + OCR (only when ``use_ocr``),
    direct text extraction, PyPDF2, pdfplumber.

    Args:
        pdf_path: Path of the PDF file.
        language: Tesseract language string. When omitted, the configured
            languages are joined with ``+``.
        use_ocr: Whether the OCR strategy may be used (for scanned PDFs).

    Returns:
        List[ExtractionResult]: Per-page results of the winning strategy,
        or a single empty ``fallback`` result when every strategy failed.

    Raises:
        ImportError: If ``use_ocr`` is true and the OCR dependencies are
            not available.
        FileNotFoundError: If ``pdf_path`` does not exist.
    """
    # The OCR stack is only needed for the OCR strategy; text-layer
    # extraction (use_ocr=False) must not be blocked by a missing Tesseract.
    if use_ocr:
        deps_ok, _ = self.check_dependencies()
        if not deps_ok:
            raise ImportError("缺少必要的OCR依赖包,请先安装依赖")

    pdf_path = Path(pdf_path)
    if not pdf_path.exists():
        raise FileNotFoundError(f"PDF文件不存在: {pdf_path}")

    lang = language or '+'.join(self.config.languages)

    extraction_methods = []
    if use_ocr:
        extraction_methods.append(
            (self._extract_with_pymupdf_ocr, "PyMuPDF OCR提取"))
    extraction_methods.extend([
        (self._extract_pdf_directly, "直接文本提取"),
        (self._extract_with_pypdf2, "PyPDF2提取"),
        (self._extract_with_pdfplumber, "pdfplumber提取"),
    ])

    for method, description in extraction_methods:
        try:
            results = method(pdf_path, lang)
            # A strategy only counts as successful if some page has text.
            if results and any(r.text_content.strip() for r in results):
                logger.info(f"使用{description}方案成功")
                return results
        except Exception as e:
            logger.warning(f"{description}方案失败: {e}")

    # Nothing produced text: return one explicit empty result.
    logger.error("所有PDF提取方案都失败")
    return [ExtractionResult(
        file_path=str(pdf_path),
        file_type='.pdf',
        text_content='',
        confidence=0.0,
        page_count=0,
        language=lang,
        extraction_method=ExtractionMethod.FALLBACK.value,
        metadata={'error': '所有提取方案都失败'}
    )]
def _extract_with_pymupdf_ocr(self, pdf_path: Path, lang: str) -> List[ExtractionResult]:
    """Render each PDF page to an image with PyMuPDF and OCR it.

    A failure on one page is logged and recorded as an empty result for
    that page; the remaining pages are still processed.

    Args:
        pdf_path: Path of the PDF file.
        lang: Tesseract language string.

    Returns:
        List[ExtractionResult]: One result per page, or an empty list when
        a required package is missing or the document cannot be processed.
    """
    try:
        import fitz  # PyMuPDF
        from PIL import Image
        import pytesseract

        # Scale factor relative to PyMuPDF's 72-dpi default; it is the same
        # for every page, so build the matrix once.
        zoom = self.config.dpi / 72
        render_matrix = fitz.Matrix(zoom, zoom)

        results = []
        # The context manager closes the document even if an unexpected
        # error escapes the page loop.
        with fitz.open(pdf_path) as doc:
            page_count = len(doc)
            for page_num in range(page_count):
                try:
                    page = doc[page_num]
                    pix = page.get_pixmap(matrix=render_matrix)
                    # PPM bytes can be loaded by PIL straight from memory.
                    image = Image.open(io.BytesIO(pix.tobytes("ppm")))

                    if self.config.enable_preprocessing:
                        processed_image = self._preprocess_image(image)
                    else:
                        processed_image = image

                    text = pytesseract.image_to_string(processed_image, lang=lang)
                    # Debug-level logging instead of printing page text.
                    logger.debug("PDF第 %d 页OCR完成,识别到 %d 个字符",
                                 page_num + 1, len(text))

                    data = pytesseract.image_to_data(
                        processed_image, lang=lang,
                        output_type=pytesseract.Output.DICT,
                    )
                    confidence = self._calculate_confidence(data)

                    results.append(ExtractionResult(
                        file_path=str(pdf_path),
                        file_type='.pdf',
                        text_content=text.strip(),
                        confidence=confidence,
                        page_count=page_count,
                        language=lang,
                        extraction_method=ExtractionMethod.PYMUPDF_OCR.value,
                        metadata={
                            'page_number': page_num + 1,
                            'image_size': image.size,
                            'dpi': self.config.dpi
                        }
                    ))
                except Exception as e:
                    logger.error(f"PDF第 {page_num + 1} 页OCR失败: {e}")
                    results.append(self._create_empty_result(
                        pdf_path, lang, page_num + 1, page_count, str(e)))
        return results
    except ImportError:
        logger.warning("PyMuPDF未安装,跳过此方案")
        return []
    except Exception as e:
        logger.error(f"PyMuPDF方案失败: {e}")
        return []
def _extract_pdf_directly(self, pdf_path: Path, lang: str) -> List[ExtractionResult]:
    """Read the PDF's embedded text layer with PyPDF2 (no OCR).

    Args:
        pdf_path: Path of the PDF file.
        lang: Unused here; present so all strategies share one signature.

    Returns:
        List[ExtractionResult]: One result per page with confidence 1.0,
        or an empty list if anything goes wrong (the error is logged).
    """
    try:
        import PyPDF2

        extracted = []
        with open(pdf_path, 'rb') as stream:
            reader = PyPDF2.PdfReader(stream)
            total = len(reader.pages)
            for number in range(1, total + 1):
                # extract_text() may yield None; normalise to "".
                raw = reader.pages[number - 1].extract_text() or ""
                extracted.append(ExtractionResult(
                    file_path=str(pdf_path),
                    file_type='.pdf',
                    text_content=raw.strip(),
                    confidence=1.0,
                    page_count=total,
                    language='direct_extraction',
                    extraction_method=ExtractionMethod.DIRECT.value,
                    metadata={
                        'page_number': number,
                        'text_length': len(raw)
                    }
                ))
        return extracted
    except Exception as e:
        logger.warning(f"直接提取PDF文本失败: {e}")
        return []
def _extract_with_pypdf2(self, pdf_path: Path, lang: str) -> List[ExtractionResult]:
    """Extract PDF text with PyPDF2 by delegating to the direct extractor.

    Args:
        pdf_path: Path of the PDF file.
        lang: Passed through unchanged.

    Returns:
        List[ExtractionResult]: Whatever ``_extract_pdf_directly`` returns.
    """
    # Same implementation as the direct strategy; kept as its own entry
    # point so it can be listed separately in the strategy pipeline.
    page_results = self._extract_pdf_directly(pdf_path, lang)
    return page_results
def _extract_with_pdfplumber(self, pdf_path: Path, lang: str) -> List[ExtractionResult]:
    """Read the PDF's embedded text layer with pdfplumber.

    A failure on one page is logged and recorded as an empty result for
    that page; the remaining pages are still processed.

    Args:
        pdf_path: Path of the PDF file.
        lang: Only used to label empty results for failed pages.

    Returns:
        List[ExtractionResult]: One result per page with confidence 0.9,
        or an empty list when pdfplumber is missing or the file cannot be
        processed.
    """
    try:
        import pdfplumber

        collected = []
        with pdfplumber.open(pdf_path) as pdf:
            total = len(pdf.pages)
            for number, page in enumerate(pdf.pages, 1):
                try:
                    raw = page.extract_text() or ""
                    collected.append(ExtractionResult(
                        file_path=str(pdf_path),
                        file_type='.pdf',
                        text_content=raw.strip(),
                        confidence=0.9,
                        page_count=total,
                        language='pdfplumber_extraction',
                        extraction_method=ExtractionMethod.PDFPLUMBER.value,
                        metadata={
                            'page_number': number,
                            'text_length': len(raw)
                        }
                    ))
                except Exception as e:
                    logger.error(f"PDF第 {number} 页pdfplumber提取失败: {e}")
                    collected.append(
                        self._create_empty_result(pdf_path, lang, number, total, str(e)))
        return collected
    except ImportError:
        logger.warning("pdfplumber未安装,跳过此方案")
        return []
    except Exception as e:
        logger.error(f"pdfplumber方案失败: {e}")
        return []
def _create_empty_result(self, pdf_path: Path, lang: str, page_num: int,
                         total_pages: int, error: str) -> ExtractionResult:
    """Build the placeholder result recorded for a page that failed.

    Args:
        pdf_path: Path of the PDF being processed.
        lang: Language string to record on the result.
        page_num: 1-based number of the failed page.
        total_pages: Total page count of the document.
        error: Error description stored in the metadata.

    Returns:
        ExtractionResult: Empty text, confidence 0.0, method ``fallback``.
    """
    failure_details = {'page_number': page_num, 'error': error}
    return ExtractionResult(
        file_path=str(pdf_path),
        file_type='.pdf',
        text_content='',
        confidence=0.0,
        page_count=total_pages,
        language=lang,
        extraction_method=ExtractionMethod.FALLBACK.value,
        metadata=failure_details,
    )
def _preprocess_image(self, image):
    """Prepare an image for OCR: grayscale, then boost contrast and sharpness.

    Args:
        image: A PIL image.

    Returns:
        A new PIL image in mode ``'L'`` with contrast and sharpness each
        enhanced by a factor of 1.5.
    """
    from PIL import ImageEnhance

    # Skip the conversion when the image is already single-channel grayscale.
    grey = image if image.mode == 'L' else image.convert('L')
    contrasted = ImageEnhance.Contrast(grey).enhance(1.5)
    return ImageEnhance.Sharpness(contrasted).enhance(1.5)
def _calculate_confidence(self, data):
"""计算OCR置信度"""
if not data or 'conf' not in data:
return 0.0
confidences = [float(conf) for conf in data['conf'] if conf != '-1']
if not confidences:
return 0.0
return sum(confidences) / len(confidences) / 100.0
def batch_extract(self, input_dir: Union[str, Path],
                  output_dir: Optional[Union[str, Path]] = None,
                  file_types: List[str] = None) -> Dict:
    """Extract text from every supported file directly inside a directory.

    Each file's text is written to ``<stem>_extracted.txt`` in the output
    directory. A failure on one file is logged and counted; processing
    continues with the next file.

    Args:
        input_dir: Directory to scan (not recursive).
        output_dir: Where result files are written. Defaults to
            ``<input_dir>/extracted``.
        file_types: File-name endings to pick up, matched
            case-insensitively. Defaults to common image types plus
            ``.pdf``.

    Returns:
        Dict: Counters ``{"extracted": int, "failed": int, "total": int}``.

    Raises:
        FileNotFoundError: If ``input_dir`` is not an existing directory.
    """
    input_dir = Path(input_dir)
    # Validate before creating anything: mkdir(parents=True) on the default
    # output directory would otherwise create the missing input directory.
    if not input_dir.is_dir():
        raise FileNotFoundError(f"输入目录不存在: {input_dir}")

    output_dir = Path(output_dir) if output_dir else input_dir / 'extracted'
    output_dir.mkdir(parents=True, exist_ok=True)

    image_suffixes = {'.jpg', '.jpeg', '.png', '.tiff', '.bmp'}
    file_types = file_types or ['.jpg', '.jpeg', '.png', '.pdf', '.tiff', '.bmp']
    wanted_endings = tuple({ext.lower() for ext in file_types})

    # One case-insensitive pass over the directory: no duplicates on
    # case-insensitive filesystems, and mixed-case names like "a.Jpg" match.
    files = sorted(
        entry for entry in input_dir.iterdir()
        if entry.is_file() and entry.name.lower().endswith(wanted_endings)
    )

    if not files:
        logger.warning(f"在目录 {input_dir} 中未找到支持的文件")
        return {"extracted": 0, "failed": 0, "total": 0}

    results = {"extracted": 0, "failed": 0, "total": len(files)}

    for file_path in files:
        suffix = file_path.suffix.lower()
        try:
            if suffix in image_suffixes:
                result = self.extract_from_image(file_path)
                self._save_result(result, output_dir)
            elif suffix == '.pdf':
                pdf_results = self.extract_from_pdf(file_path)
                self._save_pdf_results(pdf_results, output_dir, file_path)
            else:
                # Requested type with no extractor: count it as failed
                # rather than logging a success.
                raise ValueError(f"不支持的文件类型: {suffix}")
        except Exception as e:
            results["failed"] += 1
            logger.error(f"提取失败 {file_path.name}: {str(e)}")
        else:
            results["extracted"] += 1
            logger.info(f"成功提取: {file_path.name}")

    return results
def _save_result(self, result: ExtractionResult, output_dir: Path):
    """Write one extraction result to ``<stem>_extracted.txt``.

    The file starts with a short header (file name, type, confidence,
    language, method), a separator line, then the extracted text.

    Args:
        result: The result to persist.
        output_dir: Existing directory to write into.
    """
    source = Path(result.file_path)
    target = output_dir / f"{source.stem}_extracted.txt"
    header_lines = [
        f"文件: {source.name}\n",
        f"类型: {result.file_type}\n",
        f"置信度: {result.confidence:.2f}\n",
        f"语言: {result.language}\n",
        f"提取方法: {result.extraction_method}\n",
        "=" * 50 + "\n",
    ]
    with open(target, 'w', encoding='utf-8') as handle:
        handle.writelines(header_lines)
        handle.write(result.text_content)
def _save_pdf_results(self, results: List[ExtractionResult], output_dir: Path, file_path: Path):
    """Write all page results of one PDF to ``<stem>_extracted.txt``.

    The file starts with the PDF name and the number of results, followed
    by one section per result (confidence, method, text).

    Args:
        results: Per-page extraction results, in page order.
        output_dir: Existing directory to write into.
        file_path: Path of the source PDF (used for naming).
    """
    target = output_dir / f"{file_path.stem}_extracted.txt"
    with open(target, 'w', encoding='utf-8') as handle:
        handle.write(f"文件: {file_path.name}\n")
        handle.write(f"总页数: {len(results)}\n")
        handle.write("=" * 50 + "\n\n")

        page_no = 0
        for page in results:
            page_no += 1
            handle.write(
                f"第 {page_no} 页 (置信度: {page.confidence:.2f}, 方法: {page.extraction_method})\n"
            )
            handle.write("-" * 30 + "\n")
            handle.write(page.text_content)
            handle.write("\n\n")
# Convenience helpers
def quick_extract_image(image_path: str, language: str = None) -> str:
    """Return the text recognised in an image, using a default processor.

    Args:
        image_path: Path of the image file.
        language: Optional Tesseract language string.

    Returns:
        str: The extracted text.
    """
    return OCRProcessor().extract_from_image(image_path, language).text_content
def quick_extract_pdf(pdf_path: str, language: str = None) -> str:
    """Return the text of a PDF as one string, using a default processor.

    Args:
        pdf_path: Path of the PDF file.
        language: Optional Tesseract language string.

    Returns:
        str: Text of all non-blank pages, separated by blank lines.
    """
    pages = OCRProcessor().extract_from_pdf(pdf_path, language)
    # Pages with only whitespace are skipped.
    return "\n\n".join(
        page.text_content for page in pages if page.text_content.strip()
    )
if __name__ == "__main__":
    # Manual smoke test: report dependency status, then OCR a sample image.
    demo_processor = OCRProcessor(OCRConfig(
        languages=['chi_sim', 'eng'],
        dpi=200,
        enable_preprocessing=True,
    ))

    available, details = demo_processor.check_dependencies()
    print(f"依赖状态: {available}")
    print(f"依赖详情: {details}")

    # Sample run on a local image; use quick_extract_pdf(...) with a PDF
    # path instead to exercise the PDF pipeline.
    try:
        extracted_text = quick_extract_image("D:/projects/pdf_test/4.jpg")
        print("提取结果:", extracted_text)
    except Exception as exc:
        print(f"提取失败: {exc}")
#
import os
import logging
from typing import Dict, Any, Optional
from pathlib import Path
import tempfile
from mcp.server.fastmcp import FastMCP
from starlette.middleware import Middleware as ASGIMiddleware
from starlette.middleware.cors import CORSMiddleware
from mcp.server.transport_security import TransportSecuritySettings
import pytesseract
import fitz
import io
import re
import logging
import subprocess
import sys
from pathlib import Path
from typing import List, Dict, Optional, Union, Tuple
from dataclasses import dataclass
import json
from enum import Enum
from utils.ocrUtil import OCRConfig, OCRProcessor
# Configure logging for this module.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Scratch directory under the system temp dir, created at import time.
TEMP_DIR = Path(tempfile.gettempdir()).joinpath("pdf_mcp_temp")
TEMP_DIR.mkdir(exist_ok=True)
def pdf_to_word(pdf_file_path: str, output_file_path: Optional[str] = None) -> Dict[str, Any]:
    """Convert a PDF into a Word document containing its extracted text.

    The text is obtained through ``OCRProcessor.extract_from_pdf``; each
    page becomes a headed section in the document, separated by page
    breaks. Layout of the original PDF is not preserved.

    Args:
        pdf_file_path: Path of the PDF file.
        output_file_path: Path of the ``.docx`` to write. When omitted,
            ``<stem>_converted.docx`` is written next to the PDF.

    Returns:
        Dict[str, Any]: On success ``status`` is ``"success"`` and the
        dict carries the input/output paths, output size and page count;
        on failure ``status`` is ``"error"`` with a ``message``.

    Raises:
        Exception: Errors raised by the extraction step itself (e.g. a
            missing input file) propagate to the caller.
    """
    pdf_path = Path(pdf_file_path)

    processor = OCRProcessor(OCRConfig(
        languages=['chi_sim', 'eng'],
        dpi=200,
        enable_preprocessing=True
    ))
    page_results = processor.extract_from_pdf(pdf_file_path)

    try:
        if output_file_path:
            output_path = Path(output_file_path)
        else:
            # with_name keeps the PDF's own directory; concatenating
            # dirname + "/" pointed at the filesystem root for bare names.
            output_path = pdf_path.with_name(f"{pdf_path.stem}_converted.docx")

        try:
            from docx import Document
        except ImportError as e:
            return {
                "status": "error",
                "message": f"缺少必要的依赖包: {str(e)}"
            }

        doc = Document()
        doc.add_heading(f"PDF转换结果: {pdf_path.stem}", 0)

        # A distinct loop variable keeps `page_results` intact; it is still
        # needed for the page count below.
        for page_index, page_result in enumerate(page_results, 1):
            if page_index > 1:
                # Every page after the first starts on a new Word page.
                doc.add_page_break()
            doc.add_heading(f"第 {page_index} 页", level=1)

            if page_result.text_content.strip():
                doc.add_paragraph().add_run(page_result.text_content)
            else:
                doc.add_paragraph("本页无文本内容")

        doc.save(output_path)

        return {
            "status": "success",
            "message": "PDF转Word转换成功(OCR文本提取模式)",
            "input_file": str(pdf_path),
            "output_file": str(output_path),
            "file_size": output_path.stat().st_size,
            "note": "此转换使用OCR提取文本内容,可能不保留原始格式",
            "pages_extracted": len(page_results)
        }
    except Exception as e:
        logger.error(f"PDF转Word转换失败: {str(e)}")
        return {
            "status": "error",
            "message": f"转换失败: {str(e)}"
        }
if __name__ == "__main__":
    # Manual smoke test: report dependency status, then run one conversion.
    demo_processor = OCRProcessor(OCRConfig(
        languages=['chi_sim', 'eng'],
        dpi=200,
        enable_preprocessing=True,
    ))

    available, details = demo_processor.check_dependencies()
    print(f"依赖状态: {available}")
    print(f"依赖详情: {details}")

    # NOTE(review): the sample path is a .jpg although pdf_to_word expects
    # a PDF -- confirm this is intended.
    try:
        outcome = pdf_to_word("D:/projects/pdf_test/4.jpg")
        print("提取结果:", outcome)
    except Exception as exc:
        print(f"提取失败: {exc}")
import os
import logging
from typing import Dict, Any, Optional
from pathlib import Path
import tempfile
from mcp.server.fastmcp import FastMCP
from starlette.middleware import Middleware as ASGIMiddleware
from starlette.middleware.cors import CORSMiddleware
from mcp.server.transport_security import TransportSecuritySettings
import pytesseract
import fitz
import io
import re
import logging
import subprocess
import sys
from pathlib import Path
from typing import List, Dict, Optional, Union, Tuple
from dataclasses import dataclass
import json
from enum import Enum
from utils.ocrUtil import OCRConfig, OCRProcessor
# Configure logging for this module.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Scratch directory under the system temp dir, created at import time.
TEMP_DIR = Path(tempfile.gettempdir()).joinpath("pdf_mcp_temp")
TEMP_DIR.mkdir(exist_ok=True)
def image_to_word(pdf_file_path: str, output_file_path: Optional[str] = None) -> Dict[str, Any]:
    """Convert an image into a Word document containing its OCR text.

    The text is obtained through ``OCRProcessor.extract_from_image`` and
    written as a single section of the document.

    Args:
        pdf_file_path: Path of the image file. (The parameter keeps its
            historical name for backward compatibility.)
        output_file_path: Path of the ``.docx`` to write. When omitted,
            ``<stem>_converted.docx`` is written next to the image.

    Returns:
        Dict[str, Any]: On success ``status`` is ``"success"`` and the
        dict carries the input/output paths, output size and extraction
        method; on failure ``status`` is ``"error"`` with a ``message``.

    Raises:
        Exception: Errors raised by the OCR step itself (e.g. a missing
            input file or missing OCR dependencies) propagate to the
            caller.
    """
    image_path = Path(pdf_file_path)

    processor = OCRProcessor(OCRConfig(
        languages=['chi_sim', 'eng'],
        dpi=200,
        enable_preprocessing=True
    ))
    result = processor.extract_from_image(pdf_file_path)

    try:
        if output_file_path:
            output_path = Path(output_file_path)
        else:
            # with_name keeps the image's own directory; concatenating
            # dirname + "/" pointed at the filesystem root for bare names.
            output_path = image_path.with_name(f"{image_path.stem}_converted.docx")

        try:
            from docx import Document
        except ImportError as e:
            return {
                "status": "error",
                "message": f"缺少必要的依赖包: {str(e)}"
            }

        doc = Document()
        doc.add_heading(f"图片转换结果: {image_path.stem}", 0)
        doc.add_heading("图片内容", level=1)

        # A single image yields a single result, so no loop is needed.
        if result.text_content.strip():
            doc.add_paragraph().add_run(result.text_content)
        else:
            doc.add_paragraph("图片中未检测到文本内容")

        doc.save(output_path)

        return {
            "status": "success",
            "message": "图片转Word转换成功",
            "input_file": str(image_path),
            "output_file": str(output_path),
            "file_size": output_path.stat().st_size,
            "extraction_method": result.extraction_method
        }
    except Exception as e:
        logger.error(f"图片转Word转换失败: {str(e)}")
        return {
            "status": "error",
            "message": f"转换失败: {str(e)}"
        }
if __name__ == "__main__":
    # Manual smoke test: report dependency status, then convert one image.
    demo_processor = OCRProcessor(OCRConfig(
        languages=['chi_sim', 'eng'],
        dpi=200,
        enable_preprocessing=True,
    ))

    available, details = demo_processor.check_dependencies()
    print(f"依赖状态: {available}")
    print(f"依赖详情: {details}")

    # Sample conversion of a local image file.
    try:
        outcome = image_to_word("D:/projects/pdf_test/1.jpg")
        print("提取结果:", outcome)
    except Exception as exc:
        print(f"提取失败: {exc}")