github下载地址: https://github.com/deepseek-ai/DeepSeek-OCR-2
按照文档操作安装环境即可
conda create -n deepseek-ocr2 python=3.12.9 -y
conda activate deepseek-ocr2
pip install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 --index-url https://download.pytorch.org/whl/cu118
pip install vllm-0.8.5+cu118-cp38-abi3-manylinux1_x86_64.whl
pip install -r requirements.txt
pip install flash-attn==2.7.3 --no-build-isolation
有一个问题在windows系统下比较难搞:pip install flash-attn==2.7.3 --no-build-isolation
flash-attn 安装比较费劲,我多次尝试未能安装成功,可以选择别人已经弄好whl文件,一定要搭配好自己安装好的环境!!感谢好心人!!!!
flash-attn 可用的windows版本下载whl地址: https://github.com/kingbri1/flash-attention/releases
环境弄好后,就可以运行代码了:
cd DeepSeek-OCR2-master/DeepSeek-OCR2-hf
python run_dpsk_ocr2.py
我运行稍微有些报错,代码我稍做了修改,首次运行,需要梯子下载模型!之后便可以访问本地已经下载好的模型即可。
默认下载地址:C:\Users\用户名\.cache\huggingface\hub\models--deepseek-ai--DeepSeek-OCR-2\snapshots\aaa02xxxxxx 目录下
简版代码:
import os
import torch
from transformers import AutoModel, AutoTokenizer

# Restrict inference to the first GPU.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Local huggingface_hub cache snapshot of deepseek-ai/DeepSeek-OCR-2.
# Replace the user name and snapshot hash with your own (the model is
# downloaded here automatically on first run).
model_name = (
r"C:\Users\用户名\.cache\huggingface\hub\models--deepseek-ai--DeepSeek-OCR-2\snapshots\aaa02f3"
)

# trust_remote_code=True is required: the repo ships custom tokenizer
# and modeling code that AutoTokenizer/AutoModel must be allowed to run.
tokenizer = AutoTokenizer.from_pretrained(
model_name,
trust_remote_code=True
)
# flash_attention_2 requires the flash-attn wheel installed above.
model = AutoModel.from_pretrained(
model_name,
trust_remote_code=True,
torch_dtype=torch.bfloat16,
attn_implementation="flash_attention_2"
).to("cuda").eval()

# "<image>" is the image placeholder; "<|grounding|>" appears to request
# boxed (grounded) output — the run saves result_with_boxes.jpg.
prompt = "<image>\n<|grounding|>Convert the document to markdown."
image_file = r"F:\DeepSeek-OCR2-master\DeepSeek-OCR2-hf\jingdong_images\jingdong_40c4bf6b.jpeg"
output_path = r"F:\DeepSeek-OCR-2-main\DeepSeek-OCR2-master\DeepSeek-OCR2-hf\output"
assert os.path.exists(image_file), f"Image not found: {image_file}"

# infer() comes from the repo's custom modeling code; base_size/image_size
# presumably control page resize and tile size — confirm against the repo.
res = model.infer(
tokenizer,
prompt=prompt,
image_file=image_file,
output_path=output_path,
base_size=1024,
image_size=768,
crop_mode=True,
save_results=True
)
拓展代码,添加读取文件夹和转换为txt
import os
import shutil
import time
import torch
from transformers import AutoModel, AutoTokenizer
class DeepSeekOCR:
    """Wrapper around the DeepSeek-OCR-2 HuggingFace checkpoint.

    Loads the tokenizer and model once, then offers single-image
    (``ocr_image``) and whole-folder (``ocr_folder``) OCR. Each run
    renames the model's fixed ``result.mmd`` / ``result_with_boxes.jpg``
    outputs after the source image and derives a plain-text ``.txt``
    from the markdown.
    """

    # Translation table that deletes markdown punctuation when
    # flattening .mmd output to plain text (one C-level pass
    # instead of five chained .replace() calls).
    _MD_STRIP = str.maketrans("", "", "#*`_>")

    def __init__(
        self,
        model_path: str,
        device: str = "cuda",
        base_size: int = 1024,
        image_size: int = 768,
    ):
        """Load tokenizer and model from *model_path* onto *device*.

        base_size / image_size are forwarded unchanged to the model's
        custom infer() call on every image.
        """
        self.device = device
        self.base_size = base_size
        self.image_size = image_size

        init_start = time.perf_counter()
        print("Initializing DeepSeekOCR...")

        # -------- tokenizer --------
        t0 = time.perf_counter()
        print("Loading tokenizer...")
        # trust_remote_code: the repo ships its own tokenizer/model code.
        self.tokenizer = AutoTokenizer.from_pretrained(
            model_path, trust_remote_code=True
        )
        print(f"Tokenizer loaded in {time.perf_counter() - t0:.2f}s")

        # -------- model --------
        t1 = time.perf_counter()
        print("Loading model...")
        self.model = AutoModel.from_pretrained(
            model_path,
            trust_remote_code=True,
            torch_dtype=torch.bfloat16,
            _attn_implementation="flash_attention_2",
            use_safetensors=True,
        ).to(device).eval()
        print(f"Model loaded in {time.perf_counter() - t1:.2f}s")

        self.prompt = "<image>\n<|grounding|>Convert the document to markdown."
        total_init_cost = time.perf_counter() - init_start
        print(f"Model initialization done. Total init time: {total_init_cost:.2f}s")

    # -----------------------------
    # Internal: single timed inference
    # -----------------------------
    def _infer_once(self, image_path: str, output_dir: str) -> float:
        """Run the model on one image; return wall-clock seconds taken."""
        start = time.perf_counter()
        self.model.infer(
            self.tokenizer,
            prompt=self.prompt,
            image_file=image_path,
            output_path=output_dir,
            base_size=self.base_size,
            image_size=self.image_size,
            crop_mode=True,
            save_results=True,  # disabling this later saves ~3s per image
        )
        return time.perf_counter() - start

    # -----------------------------
    # Internal: rename the model's fixed output names
    # -----------------------------
    def _rename_outputs(self, output_dir: str, base_name: str):
        """Rename result.mmd / result_with_boxes.jpg after the source image.

        Uses os.replace rather than shutil.move: on Windows, moving onto
        an existing destination (a leftover from a previous run of the
        same image) raises FileExistsError; os.replace overwrites it.
        """
        renames = (
            ("result.mmd", f"{base_name}.mmd"),
            ("result_with_boxes.jpg", f"{base_name}_with_boxes.jpg"),
        )
        for src_name, dst_name in renames:
            src = os.path.join(output_dir, src_name)
            if os.path.exists(src):
                os.replace(src, os.path.join(output_dir, dst_name))

    # -----------------------------
    # Public: single-image OCR
    # -----------------------------
    def ocr_image(self, image_path: str, output_dir: str):
        """OCR one image into <name>.mmd / <name>.txt / <name>_with_boxes.jpg.

        Raises FileNotFoundError if *image_path* does not exist (an
        explicit raise, unlike assert, survives ``python -O``).
        """
        if not os.path.exists(image_path):
            raise FileNotFoundError(f"Image not found: {image_path}")
        os.makedirs(output_dir, exist_ok=True)

        image_name = os.path.basename(image_path)
        base_name = os.path.splitext(image_name)[0]

        print(f"Start OCR: {image_name}")
        cost = self._infer_once(image_path, output_dir)
        self._rename_outputs(output_dir, base_name)
        self._mmd_to_txt(os.path.join(output_dir, f"{base_name}.mmd"))
        print(f"OCR done: {image_name} | Time: {cost:.3f}s")

    # -----------------------------
    # Public: batch OCR
    # -----------------------------
    def ocr_folder(
        self,
        image_dir: str,
        output_dir: str,
        exts=(".jpg", ".jpeg", ".png", ".bmp", ".tiff"),
    ):
        """OCR every image in *image_dir* whose extension is in *exts*.

        Failures on individual images are reported and skipped so one
        bad file does not abort the whole batch.
        """
        if not os.path.isdir(image_dir):
            raise NotADirectoryError(f"Dir not found: {image_dir}")
        os.makedirs(output_dir, exist_ok=True)

        # Sorted for a deterministic processing order (os.listdir is arbitrary).
        images = sorted(
            f for f in os.listdir(image_dir) if f.lower().endswith(exts)
        )
        if not images:
            print("No images found.")
            return

        print(f"Found {len(images)} images")
        total_start = time.perf_counter()
        for idx, img in enumerate(images, 1):
            img_path = os.path.join(image_dir, img)
            base_name = os.path.splitext(img)[0]
            print(f"[{idx}/{len(images)}] OCR: {img}")
            try:
                cost = self._infer_once(img_path, output_dir)
                self._rename_outputs(output_dir, base_name)
                self._mmd_to_txt(os.path.join(output_dir, f"{base_name}.mmd"))
                print(f"Done: {img} | Time: {cost:.3f}s")
            except Exception as e:
                # Best-effort batch: report and continue with the next image.
                print(f"Failed: {img}, Reason: {e}")

        total_cost = time.perf_counter() - total_start
        print("-" * 50)
        print(f"Total images: {len(images)}")
        print(f"Total time : {total_cost:.2f}s")
        print(f"Avg per img : {total_cost / len(images):.2f}s")

    # -----------------------------
    # Internal: .mmd -> .txt
    # -----------------------------
    def _mmd_to_txt(self, mmd_path: str):
        """Write a plain-text .txt next to *mmd_path* (no-op if missing).

        Drops image links and table rule lines, deletes markdown
        punctuation (# * ` _ >), and re-strips each line afterwards so
        headings like "# Title" do not keep a leading space once the
        "#" is removed (the original only stripped before removal).
        """
        if not os.path.exists(mmd_path):
            return
        txt_path = os.path.splitext(mmd_path)[0] + ".txt"
        with open(mmd_path, "r", encoding="utf-8") as f:
            content = f.read()

        lines = []
        for line in content.splitlines():
            line = line.strip()
            if line.startswith("![]"):  # embedded image links
                continue
            if set(line) <= {"|", "-", " "}:  # table rules (also drops blanks)
                continue
            lines.append(line.translate(self._MD_STRIP).strip())

        with open(txt_path, "w", encoding="utf-8") as f:
            f.write("\n".join(lines).strip())
if __name__ == "__main__":
    # Pin inference to the first GPU.
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"
    # Local directory holding the downloaded DeepSeek-OCR-2 checkpoint.
    MODEL_PATH = r"F:\DeepSeek-OCR-2-main\DeepSeek-OCR2-master\DeepSeek-OCR2-hf\model"
    OUTPUT_DIR = r"F:\DeepSeek-OCR-2-main\DeepSeek-OCR2-master\DeepSeek-OCR2-hf\output"
    ocr = DeepSeekOCR(MODEL_PATH)
    # Single image
    SINGLE_IMAGE = r"F:\DeepSeek-OCR-2-main\DeepSeek-OCR2-master\DeepSeek-OCR2-hf\zhongtong_images\zhongtong_5a1d2892.jpeg"
    ocr.ocr_image(SINGLE_IMAGE, OUTPUT_DIR)
    # Batch (whole folder) — uncomment to process every image in IMAGE_DIR
    # IMAGE_DIR = r"F:\DeepSeek-OCR-2-main\DeepSeek-OCR2-master\DeepSeek-OCR2-hf\zhongtong_images"
    # ocr.ocr_folder(IMAGE_DIR, OUTPUT_DIR)
    print("All OCR done.")