github下载地址: https://github.com/deepseek-ai/DeepSeek-OCR-2
按照文档操作安装环境即可
conda create -n deepseek-ocr2 python=3.12.9 -y
conda activate deepseek-ocr2
pip install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 --index-url https://download.pytorch.org/whl/cu118
pip install vllm-0.8.5+cu118-cp38-abi3-manylinux1_x86_64.whl
pip install -r requirements.txt
pip install flash-attn==2.7.3 --no-build-isolation
有一个问题在windows系统下比较难搞:pip install flash-attn==2.7.3 --no-build-isolation
flash-attn 安装比较费劲,我多次尝试未能安装成功,可以选择别人已经弄好whl文件,一定要搭配好自己安装好的环境!!感谢好心人!!!!
flash-attn 可用的windows版本下载whl地址: https://github.com/kingbri1/flash-attention/releases
环境弄好后,就可以运行代码了:
cd DeepSeek-OCR2-master/DeepSeek-OCR2-hf
python run_dpsk_ocr2.py
我运行稍微有些报错,代码我稍做了修改,首次运行,需要梯子下载模型!之后便可以访问本地已经下载好的模型即可。
默认下载地址:C:\Users\用户名\.cache\huggingface\hub\models--deepseek-ai--DeepSeek-OCR-2\snapshots\aaa02xxxxxx 目录下
简版代码:
import os
import torch
from transformers import AutoModel, AutoTokenizer

# Restrict inference to the first GPU.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Local huggingface_hub cache snapshot of deepseek-ai/DeepSeek-OCR-2.
# Replace the user name and snapshot hash with your own (the model is
# downloaded here automatically on first run).
model_name = (
r"C:\Users\用户名\.cache\huggingface\hub\models--deepseek-ai--DeepSeek-OCR-2\snapshots\aaa02f3"
)

# trust_remote_code=True is required: the repo ships custom tokenizer
# and modeling code that AutoTokenizer/AutoModel must be allowed to run.
tokenizer = AutoTokenizer.from_pretrained(
model_name,
trust_remote_code=True
)
# flash_attention_2 requires the flash-attn wheel installed above.
model = AutoModel.from_pretrained(
model_name,
trust_remote_code=True,
torch_dtype=torch.bfloat16,
attn_implementation="flash_attention_2"
).to("cuda").eval()

# "<image>" is the image placeholder; "<|grounding|>" appears to request
# boxed (grounded) output — the run saves result_with_boxes.jpg.
prompt = "<image>\n<|grounding|>Convert the document to markdown."
image_file = r"F:\DeepSeek-OCR2-master\DeepSeek-OCR2-hf\jingdong_images\jingdong_40c4bf6b.jpeg"
output_path = r"F:\DeepSeek-OCR-2-main\DeepSeek-OCR2-master\DeepSeek-OCR2-hf\output"
assert os.path.exists(image_file), f"Image not found: {image_file}"

# infer() comes from the repo's custom modeling code; base_size/image_size
# presumably control page resize and tile size — confirm against the repo.
res = model.infer(
tokenizer,
prompt=prompt,
image_file=image_file,
output_path=output_path,
base_size=1024,
image_size=768,
crop_mode=True,
save_results=True
)
拓展代码,添加读取文件夹和转换为txt
import os
import shutil
import time
import torch
from transformers import AutoModel, AutoTokenizer
class DeepSeekOCR:
    """Wrapper around the DeepSeek-OCR-2 HuggingFace checkpoint.

    Loads the tokenizer and model once, then offers single-image
    (``ocr_image``) and whole-folder (``ocr_folder``) OCR. Each run
    renames the model's fixed ``result.mmd`` / ``result_with_boxes.jpg``
    outputs after the source image and derives a plain-text ``.txt``
    from the markdown.
    """

    # Translation table that deletes markdown punctuation when
    # flattening .mmd output to plain text (one C-level pass
    # instead of five chained .replace() calls).
    _MD_STRIP = str.maketrans("", "", "#*`_>")

    def __init__(
        self,
        model_path: str,
        device: str = "cuda",
        base_size: int = 1024,
        image_size: int = 768,
    ):
        """Load tokenizer and model from *model_path* onto *device*.

        base_size / image_size are forwarded unchanged to the model's
        custom infer() call on every image.
        """
        self.device = device
        self.base_size = base_size
        self.image_size = image_size

        init_start = time.perf_counter()
        print("Initializing DeepSeekOCR...")

        # -------- tokenizer --------
        t0 = time.perf_counter()
        print("Loading tokenizer...")
        # trust_remote_code: the repo ships its own tokenizer/model code.
        self.tokenizer = AutoTokenizer.from_pretrained(
            model_path, trust_remote_code=True
        )
        print(f"Tokenizer loaded in {time.perf_counter() - t0:.2f}s")

        # -------- model --------
        t1 = time.perf_counter()
        print("Loading model...")
        self.model = AutoModel.from_pretrained(
            model_path,
            trust_remote_code=True,
            torch_dtype=torch.bfloat16,
            _attn_implementation="flash_attention_2",
            use_safetensors=True,
        ).to(device).eval()
        print(f"Model loaded in {time.perf_counter() - t1:.2f}s")

        self.prompt = "<image>\n<|grounding|>Convert the document to markdown."
        total_init_cost = time.perf_counter() - init_start
        print(f"Model initialization done. Total init time: {total_init_cost:.2f}s")

    # -----------------------------
    # Internal: single timed inference
    # -----------------------------
    def _infer_once(self, image_path: str, output_dir: str) -> float:
        """Run the model on one image; return wall-clock seconds taken."""
        start = time.perf_counter()
        self.model.infer(
            self.tokenizer,
            prompt=self.prompt,
            image_file=image_path,
            output_path=output_dir,
            base_size=self.base_size,
            image_size=self.image_size,
            crop_mode=True,
            save_results=True,  # disabling this later saves ~3s per image
        )
        return time.perf_counter() - start

    # -----------------------------
    # Internal: rename the model's fixed output names
    # -----------------------------
    def _rename_outputs(self, output_dir: str, base_name: str):
        """Rename result.mmd / result_with_boxes.jpg after the source image.

        Uses os.replace rather than shutil.move: on Windows, moving onto
        an existing destination (a leftover from a previous run of the
        same image) raises FileExistsError; os.replace overwrites it.
        """
        renames = (
            ("result.mmd", f"{base_name}.mmd"),
            ("result_with_boxes.jpg", f"{base_name}_with_boxes.jpg"),
        )
        for src_name, dst_name in renames:
            src = os.path.join(output_dir, src_name)
            if os.path.exists(src):
                os.replace(src, os.path.join(output_dir, dst_name))

    # -----------------------------
    # Public: single-image OCR
    # -----------------------------
    def ocr_image(self, image_path: str, output_dir: str):
        """OCR one image into <name>.mmd / <name>.txt / <name>_with_boxes.jpg.

        Raises FileNotFoundError if *image_path* does not exist (an
        explicit raise, unlike assert, survives ``python -O``).
        """
        if not os.path.exists(image_path):
            raise FileNotFoundError(f"Image not found: {image_path}")
        os.makedirs(output_dir, exist_ok=True)

        image_name = os.path.basename(image_path)
        base_name = os.path.splitext(image_name)[0]

        print(f"Start OCR: {image_name}")
        cost = self._infer_once(image_path, output_dir)
        self._rename_outputs(output_dir, base_name)
        self._mmd_to_txt(os.path.join(output_dir, f"{base_name}.mmd"))
        print(f"OCR done: {image_name} | Time: {cost:.3f}s")

    # -----------------------------
    # Public: batch OCR
    # -----------------------------
    def ocr_folder(
        self,
        image_dir: str,
        output_dir: str,
        exts=(".jpg", ".jpeg", ".png", ".bmp", ".tiff"),
    ):
        """OCR every image in *image_dir* whose extension is in *exts*.

        Failures on individual images are reported and skipped so one
        bad file does not abort the whole batch.
        """
        if not os.path.isdir(image_dir):
            raise NotADirectoryError(f"Dir not found: {image_dir}")
        os.makedirs(output_dir, exist_ok=True)

        # Sorted for a deterministic processing order (os.listdir is arbitrary).
        images = sorted(
            f for f in os.listdir(image_dir) if f.lower().endswith(exts)
        )
        if not images:
            print("No images found.")
            return

        print(f"Found {len(images)} images")
        total_start = time.perf_counter()
        for idx, img in enumerate(images, 1):
            img_path = os.path.join(image_dir, img)
            base_name = os.path.splitext(img)[0]
            print(f"[{idx}/{len(images)}] OCR: {img}")
            try:
                cost = self._infer_once(img_path, output_dir)
                self._rename_outputs(output_dir, base_name)
                self._mmd_to_txt(os.path.join(output_dir, f"{base_name}.mmd"))
                print(f"Done: {img} | Time: {cost:.3f}s")
            except Exception as e:
                # Best-effort batch: report and continue with the next image.
                print(f"Failed: {img}, Reason: {e}")

        total_cost = time.perf_counter() - total_start
        print("-" * 50)
        print(f"Total images: {len(images)}")
        print(f"Total time : {total_cost:.2f}s")
        print(f"Avg per img : {total_cost / len(images):.2f}s")

    # -----------------------------
    # Internal: .mmd -> .txt
    # -----------------------------
    def _mmd_to_txt(self, mmd_path: str):
        """Write a plain-text .txt next to *mmd_path* (no-op if missing).

        Drops image links and table rule lines, deletes markdown
        punctuation (# * ` _ >), and re-strips each line afterwards so
        headings like "# Title" do not keep a leading space once the
        "#" is removed (the original only stripped before removal).
        """
        if not os.path.exists(mmd_path):
            return
        txt_path = os.path.splitext(mmd_path)[0] + ".txt"
        with open(mmd_path, "r", encoding="utf-8") as f:
            content = f.read()

        lines = []
        for line in content.splitlines():
            line = line.strip()
            if line.startswith("![]"):  # embedded image links
                continue
            if set(line) <= {"|", "-", " "}:  # table rules (also drops blanks)
                continue
            lines.append(line.translate(self._MD_STRIP).strip())

        with open(txt_path, "w", encoding="utf-8") as f:
            f.write("\n".join(lines).strip())
if __name__ == "__main__":
    # Pin inference to the first GPU.
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"
    # Local directory holding the downloaded DeepSeek-OCR-2 checkpoint.
    MODEL_PATH = r"F:\DeepSeek-OCR-2-main\DeepSeek-OCR2-master\DeepSeek-OCR2-hf\model"
    OUTPUT_DIR = r"F:\DeepSeek-OCR-2-main\DeepSeek-OCR2-master\DeepSeek-OCR2-hf\output"
    ocr = DeepSeekOCR(MODEL_PATH)
    # Single image
    SINGLE_IMAGE = r"F:\DeepSeek-OCR-2-main\DeepSeek-OCR2-master\DeepSeek-OCR2-hf\zhongtong_images\zhongtong_5a1d2892.jpeg"
    ocr.ocr_image(SINGLE_IMAGE, OUTPUT_DIR)
    # Batch (whole folder) — uncomment to process every image in IMAGE_DIR
    # IMAGE_DIR = r"F:\DeepSeek-OCR-2-main\DeepSeek-OCR2-master\DeepSeek-OCR2-hf\zhongtong_images"
    # ocr.ocr_folder(IMAGE_DIR, OUTPUT_DIR)
    print("All OCR done.")