本地部署DeepSeek-OCR:打造高效的PDF文字识别服务
告别繁琐的OCR配置,轻松构建企业级文档识别系统
在文档数字化处理中,OCR(光学字符识别)技术扮演着至关重要的角色。今天我要分享的是如何快速部署和优化DeepSeek-OCR,将其封装成高效的API服务,实现PDF文档的智能解析。
为什么选择DeepSeek-OCR?
DeepSeek-OCR作为先进的OCR解决方案,具备以下优势:
- 高精度识别能力
- 支持复杂版式文档
- 开源可定制
- 强大的中文支持
环境准备与部署
1. 项目初始化
首先获取项目代码并创建隔离的Python环境:
bash
git clone https://github.com/deepseek-ai/DeepSeek-OCR.git
conda create --name deepseek-ocr python=3.12
conda activate deepseek-ocr
2. 依赖安装
进入项目目录并安装必要的依赖包:
bash
cd DeepSeek-OCR-main/
pip install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 -i https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple/
pip install vllm==0.8.5 -i https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple/
pip install -r requirements.txt -i https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple/
pip install flash-attn==2.7.3 --no-build-isolation -i https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple/
3. 模型下载
从Hugging Face或ModelScope下载预训练模型,并保存到指定目录。
从单次测试到持续服务
基础测试方法
项目提供了简单的测试脚本,通过配置config.py文件指定模型路径和输入输出路径:
python
# config.py 配置示例
MODEL_PATH = "你的模型路径"
INPUT_PATH = "输入PDF或图片路径"
OUTPUT_PATH = "输出结果路径"
PROMPT = "根据任务选择合适的提示词"
运行测试脚本:
bash
python run_dpsk_ocr_pdf.py
# run_dpsk_ocr_pdf.py 会导入config.py的配置内容,加载模型解析pdf,保存到md文件(你设置的输出目录)。
构建OCR API服务
为了避免每次使用都要重新加载模型和配置参数,我开发了一个基于FastAPI的OCR服务:
核心优势:
- 🚀 一次加载,多次使用 - 模型常驻内存,避免重复加载
- 🔄 并发处理 - 支持多任务并行处理
- 📁 异步处理 - 非阻塞式请求处理
- 🎯 灵活配置 - 支持动态参数调整
服务核心代码结构:
python
"""
@version: python3.9
@author: hcb
@software: PyCharm
@file: deepseek_ocr_pdf_server.py
@time: 2025/10/29 14:53
"""
import os
import fitz
import img2pdf
import io
import re
from tqdm import tqdm
import torch
from concurrent.futures import ThreadPoolExecutor
from fastapi import FastAPI, File, UploadFile, HTTPException
from fastapi.responses import JSONResponse, FileResponse
import uuid
import shutil
from typing import List, Optional
import json
if torch.version.cuda == '11.8':
os.environ["TRITON_PTXAS_PATH"] = "/usr/local/cuda-11.8/bin/ptxas"
os.environ['VLLM_USE_V1'] = '0'
from config import MODEL_PATH, OUTPUT_PATH, PROMPT, SKIP_REPEAT, MAX_CONCURRENCY, NUM_WORKERS, CROP_MODE
import numpy as np
from deepseek_ocr import DeepseekOCRForCausalLM
from vllm.model_executor.models.registry import ModelRegistry
from vllm import LLM, SamplingParams
from process.ngram_norepeat import NoRepeatNGramLogitsProcessor
from process.image_process import DeepseekOCRProcessor
from PIL import Image, ImageDraw, ImageFont # 确保这样导入
# 全局变量
llm = None
sampling_params = None
class Colors:
    """ANSI SGR escape sequences used to colorize console log output."""
    # 31..34 are the standard foreground codes for red/green/yellow/blue.
    RED, GREEN, YELLOW, BLUE = (f'\033[{code}m' for code in range(31, 35))
    RESET = '\033[0m'
def initialize_model():
    """Load the DeepSeek-OCR checkpoint into the module-level vLLM engine.

    Populates the globals `llm` and `sampling_params` consumed by the
    /process_pdf endpoint; intended to be called once at server startup.
    """
    global llm, sampling_params
    print(f'{Colors.GREEN}Initializing Deepseek OCR model...{Colors.RESET}')
    # Register the custom architecture so vLLM can resolve the checkpoint's
    # "DeepseekOCRForCausalLM" architecture string to the local class.
    ModelRegistry.register_model("DeepseekOCRForCausalLM", DeepseekOCRForCausalLM)
    llm = LLM(
        model=MODEL_PATH,
        hf_overrides={"architectures": ["DeepseekOCRForCausalLM"]},
        block_size=256,
        enforce_eager=False,
        trust_remote_code=True,
        max_model_len=8192,
        swap_space=0,
        max_num_seqs=MAX_CONCURRENCY,  # cap on concurrently scheduled sequences
        tensor_parallel_size=1,
        gpu_memory_utilization=0.9,
        disable_mm_preprocessor_cache=True
    )
    # Suppress repeated 20-grams within a 50-token window; the two whitelisted
    # token ids are exempt (presumably layout/special tokens — TODO confirm).
    logits_processors = [
        NoRepeatNGramLogitsProcessor(ngram_size=20, window_size=50, whitelist_token_ids={128821, 128822})]
    sampling_params = SamplingParams(
        temperature=0.0,  # greedy decoding for deterministic OCR output
        max_tokens=8192,
        logits_processors=logits_processors,
        skip_special_tokens=False,
        include_stop_str_in_output=True,
    )
    print(f'{Colors.GREEN}Model initialized successfully!{Colors.RESET}')
def pdf_to_images_high_quality(pdf_path, dpi=144, image_format="PNG"):
    """Rasterize every page of a PDF into an RGB PIL image.

    Args:
        pdf_path: path to the PDF file on disk.
        dpi: render resolution; zoom factor is dpi / 72 (PDF points per inch).
        image_format: kept for interface compatibility only — the original
            code extracted PNG bytes in both branches, so pages are always
            decoded from PNG regardless of this value.

    Returns:
        list of PIL.Image pages, flattened to RGB.
    """
    # Disable PIL's decompression-bomb guard once, instead of per page.
    Image.MAX_IMAGE_PIXELS = None

    images = []
    pdf_document = fitz.open(pdf_path)
    try:
        zoom = dpi / 72.0
        matrix = fitz.Matrix(zoom, zoom)
        for page_num in range(pdf_document.page_count):
            page = pdf_document[page_num]
            pixmap = page.get_pixmap(matrix=matrix, alpha=False)
            img = Image.open(io.BytesIO(pixmap.tobytes("png")))
            # Flatten any alpha channel onto a white background so downstream
            # JPEG/OCR steps always receive plain RGB.
            if img.mode in ('RGBA', 'LA'):
                background = Image.new('RGB', img.size, (255, 255, 255))
                background.paste(img, mask=img.split()[-1] if img.mode == 'RGBA' else None)
                img = background
            images.append(img)
    finally:
        # Close the document even if a page fails to render (was leaked on error).
        pdf_document.close()
    return images
def pil_to_pdf_img2pdf(pil_images, output_path):
    """Assemble a list of PIL images into a single PDF at *output_path*.

    Pages are re-encoded as quality-95 JPEGs first; conversion failures are
    logged rather than raised (best-effort output). No-op on an empty list.
    """
    if not pil_images:
        return

    jpeg_pages = []
    for page in pil_images:
        rgb_page = page if page.mode == 'RGB' else page.convert('RGB')
        buffer = io.BytesIO()
        rgb_page.save(buffer, format='JPEG', quality=95)
        jpeg_pages.append(buffer.getvalue())

    try:
        pdf_bytes = img2pdf.convert(jpeg_pages)
        with open(output_path, "wb") as f:
            f.write(pdf_bytes)
    except Exception as e:
        print(f"Error converting to PDF: {e}")
def re_match(text):
    """Find all <|ref|>..<|/ref|><|det|>..<|/det|> layout tags in *text*.

    Returns:
        (all_matches, image_tags, other_tags) where each entry of
        all_matches is a (full_tag, ref_body, det_body) tuple, and
        image_tags/other_tags hold the full tag strings partitioned by
        whether the ref body is the literal 'image'.
    """
    tag_pattern = r'(<\|ref\|>(.*?)<\|/ref\|><\|det\|>(.*?)<\|/det\|>)'
    all_matches = re.findall(tag_pattern, text, re.DOTALL)
    image_tags = [m[0] for m in all_matches if '<|ref|>image<|/ref|>' in m[0]]
    other_tags = [m[0] for m in all_matches if '<|ref|>image<|/ref|>' not in m[0]]
    return all_matches, image_tags, other_tags
def extract_coordinates_and_label(ref_text, image_width, image_height):
    """Parse one re_match tuple into (label, coordinate_list).

    Args:
        ref_text: (full_tag, label, det_body) tuple from re_match(); the
            det_body is a Python-literal coordinate list emitted by the model.
        image_width: unused; kept for interface compatibility.
        image_height: unused; kept for interface compatibility.

    Returns:
        (label, coordinates) on success, or None when parsing fails.
    """
    import ast  # local import keeps this security fix self-contained

    try:
        label_type = ref_text[1]
        # ast.literal_eval instead of eval: the det body is model output
        # (indirectly user-controlled), so eval would allow code injection.
        cor_list = ast.literal_eval(ref_text[2])
    except Exception as e:
        print(f"Error extracting coordinates: {e}")
        return None
    return (label_type, cor_list)
def process_single_image(image, prompt):
    """Build one vLLM batch entry: the text prompt plus the tokenized image.

    CROP_MODE from config controls whether the processor tiles the image.
    """
    tokenized_image = DeepseekOCRProcessor().tokenize_with_images(
        images=[image], bos=True, eos=True, cropping=CROP_MODE
    )
    return {
        "prompt": prompt,
        "multi_modal_data": {"image": tokenized_image},
    }
# Initialize the FastAPI application.
app = FastAPI(title="Deepseek OCR Service", version="1.0.0")


@app.on_event("startup")
async def startup_event():
    """Load the vLLM model once when the server starts."""
    initialize_model()


@app.get("/")
async def root():
    # Lightweight health-check endpoint.
    return {"message": "Deepseek OCR Service is running", "status": "healthy"}
@app.post("/process_pdf")
async def process_pdf(
    file: UploadFile = File(...),
    prompt: Optional[str] = None,
    skip_repeat: Optional[bool] = None
):
    """Run OCR over an uploaded PDF and return the extracted markdown.

    Args:
        file: uploaded PDF; any other extension is rejected with 400.
        prompt: optional override for the default PROMPT from config.
        skip_repeat: optional override for SKIP_REPEAT from config.

    Returns:
        dict with the task id, status, and the concatenated markdown text.

    Raises:
        HTTPException: 400 for non-PDF uploads, 500 on processing errors.
    """
    try:
        # Reject anything that is not a PDF up front.
        if not file.filename.lower().endswith('.pdf'):
            raise HTTPException(status_code=400, detail="Only PDF files are supported")

        # Per-request working directory keyed by a fresh task id.
        task_id = str(uuid.uuid4())
        task_dir = os.path.join(OUTPUT_PATH, task_id)
        os.makedirs(task_dir, exist_ok=True)
        os.makedirs(os.path.join(task_dir, "images"), exist_ok=True)

        # Persist the upload so fitz can open it from disk.
        input_pdf_path = os.path.join(task_dir, "input.pdf")
        with open(input_pdf_path, "wb") as buffer:
            shutil.copyfileobj(file.file, buffer)

        # Request-level overrides fall back to config defaults.
        current_prompt = prompt if prompt is not None else PROMPT
        current_skip_repeat = skip_repeat if skip_repeat is not None else SKIP_REPEAT

        print(f'{Colors.BLUE}Processing PDF: {file.filename}{Colors.RESET}')

        # Rasterize every page, then pre-tokenize the images in parallel.
        images = pdf_to_images_high_quality(input_pdf_path)
        with ThreadPoolExecutor(max_workers=NUM_WORKERS) as executor:
            batch_inputs = list(tqdm(
                executor.map(lambda img: process_single_image(img, current_prompt), images),
                total=len(images),
                desc="Pre-processed images"
            ))

        # One batched vLLM call over all pages.
        outputs_list = llm.generate(
            batch_inputs,
            sampling_params=sampling_params
        )

        mmd_path = os.path.join(task_dir, "output.mmd")
        contents_det = ''
        contents = ''
        for output, img in zip(outputs_list, images):
            content = output.outputs[0].text
            # NOTE(review): '' is a substring of every string, so this branch
            # is always taken and the skip_repeat path below is unreachable;
            # the original source almost certainly tested a special repeat/end
            # token that was lost when the code was published — TODO restore.
            if '' in content:
                content = content.replace('', '')
            else:
                if current_skip_repeat:
                    continue

            page_num = '\n<--- Page Split --->'
            contents_det += content + f'\n{page_num}\n'

            matches_ref, matches_images, matches_other = re_match(content)
            # NOTE(review): image tags are replaced by a bare newline here —
            # a markdown image link was likely lost in publication too.
            for a_match_image in matches_images:
                content = content.replace(a_match_image, '\n')
            # Strip remaining layout tags and normalize LaTeX / blank runs.
            for a_match_other in matches_other:
                content = content.replace(a_match_other, '').replace('\\coloneqq', ':=').replace(
                    '\\eqqcolon', '=:').replace('\n\n\n\n', '\n\n').replace('\n\n\n', '\n\n')
            contents += content + f'\n{page_num}\n'

        with open(mmd_path, 'w', encoding='utf-8') as afile:
            afile.write(contents)

        return {
            "task_id": task_id,
            "status": "completed",
            "files": {
                "mmd": f"{contents}",
            }
        }
    except HTTPException:
        # Bug fix: deliberate HTTP errors (the 400 above) were previously
        # swallowed by the generic handler and re-raised as 500s.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Processing failed: {str(e)}")
@app.get("/download/{task_id}/{filename:path}")
async def download_file(task_id: str, filename: str):
    """Stream one result file from a task's output directory; 404 if absent."""
    target_path = os.path.join(OUTPUT_PATH, task_id, filename)
    if not os.path.exists(target_path):
        raise HTTPException(status_code=404, detail="File not found")
    # Use only the final path component as the suggested download name.
    download_name = filename.split('/')[-1]
    return FileResponse(
        path=target_path,
        filename=download_name,
        media_type='application/octet-stream'
    )
@app.get("/tasks/{task_id}")
async def get_task_status(task_id: str):
    """List a task's output files as download URLs; 404 for unknown tasks."""
    task_dir = os.path.join(OUTPUT_PATH, task_id)
    if not os.path.exists(task_dir):
        raise HTTPException(status_code=404, detail="Task not found")

    # Map each file (relative to the task dir) to its download endpoint.
    file_links = {}
    for current_dir, _subdirs, names in os.walk(task_dir):
        for name in names:
            relative = os.path.relpath(os.path.join(current_dir, name), task_dir)
            file_links[relative] = f"/download/{task_id}/{relative}"

    return {
        "task_id": task_id,
        "status": "completed" if file_links else "processing",
        "files": file_links
    }
if __name__ == "__main__":
    import uvicorn
    # Serve on all interfaces; port 8006 matches the client example below.
    uvicorn.run(app, host="0.0.0.0", port=8006)
把文件放在config.py同级目录下。
客户端调用示例
服务部署后,可以通过简单的HTTP请求进行调用:
python
"""
@version: python3.9
@author: hcb
@software: PyCharm
@file: parse_pdf.py
@time: 2025/10/29 15:20
"""
import requests
import json
def process_pdf(file_path, prompt=None, skip_repeat=None):
    """Client example: POST a local PDF to the OCR service.

    Args:
        file_path: path to the local PDF file.
        prompt: optional prompt override; the server default is used if None.
        skip_repeat: optional bool; sent as the string 'true'/'false'.

    Returns:
        The parsed JSON response dict on success, None on any failure.
    """
    # NOTE(review): replace {ip} with the actual server address.
    url = "http://{ip}:8006/process_pdf"

    data = {}
    if prompt is not None:
        data['prompt'] = prompt
    if skip_repeat is not None:
        data['skip_repeat'] = str(skip_repeat).lower()

    try:
        print(f"正在处理PDF文件: {file_path}")
        # Bug fix: the original opened the file before the try block (leaking
        # the handle if anything raised before the manual close in finally);
        # a context manager guarantees closure on every path.
        with open(file_path, 'rb') as pdf_file:
            files = {
                'file': ('input.pdf', pdf_file, 'application/pdf')
            }
            response = requests.post(url, files=files, data=data)

        if response.status_code == 200:
            result = response.json()
            print(f"处理完成: {result}")
            return result

        print(f"处理失败: {response.status_code}")
        print(f"错误信息: {response.text}")
        return None
    except Exception as e:
        print(f"请求出错: {e}")
        return None
# Usage example
if __name__ == "__main__":
    # Example 1: basic usage
    pdf_file = "***.pdf"  # replace with your PDF file path
    # Process the PDF via the running service
    result = process_pdf(pdf_file)
性能优化技巧
- 内存管理:通过 `gpu_memory_utilization` 参数优化GPU内存使用
- 并发控制:使用 `ThreadPoolExecutor` 控制预处理线程数
- 批量处理:支持多页PDF的批量推理,提高吞吐量
- 缓存机制:跳过重复内容的处理,节省计算资源
实际应用场景
这个OCR服务特别适合以下场景:
- 📊 企业文档数字化 - 批量处理扫描版PDF
- 🔍 知识库构建 - 从技术文档提取结构化信息
- 📑 学术研究 - 处理论文和参考资料
- 💼 法律文档 - 解析合同和法律文件
总结
通过将DeepSeek-OCR封装成API服务,我们实现了:
- 部署简化:一次部署,长期使用
- 调用便捷:简单的RESTful接口
- 性能稳定:优化的内存和并发管理
- 扩展性强:易于集成到现有系统中
这种服务化的部署方式大大降低了OCR技术的使用门槛,让开发者能够专注于业务逻辑而不是底层技术细节。