本地运行DeepSeek-OCR-2 识别图片文字

github下载地址: https://github.com/deepseek-ai/DeepSeek-OCR-2

按照文档操作安装环境即可

conda create -n deepseek-ocr2 python=3.12.9 -y

conda activate deepseek-ocr2
pip install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 --index-url https://download.pytorch.org/whl/cu118

pip install vllm-0.8.5+cu118-cp38-abi3-manylinux1_x86_64.whl

pip install -r requirements.txt

pip install flash-attn==2.7.3 --no-build-isolation

有一个步骤在 Windows 系统下比较难搞:pip install flash-attn==2.7.3 --no-build-isolation(flash-attn 需要本地编译,在 Windows 上经常失败)

flash-attn 安装比较费劲,我多次尝试都未能安装成功,可以选择别人已经编译好的 whl 文件,但一定要与自己已装好的环境(Python、CUDA、torch 版本)相匹配!!感谢好心人!!!!

flash-attn 可用的windows版本下载whl地址: https://github.com/kingbri1/flash-attention/releases

环境弄好后,就可以运行代码了:

复制代码
cd DeepSeek-OCR2-master/DeepSeek-OCR2-hf
python run_dpsk_ocr2.py

我运行时稍微有些报错,代码我稍作了修改。首次运行需要梯子下载模型,之后便可以直接访问本地已经下载好的模型。

默认下载地址:C:\Users\用户名\.cache\huggingface\hub\models--deepseek-ai--DeepSeek-OCR-2\snapshots\aaa02xxxxxx 目录下

简版代码:

复制代码
import os
import torch
from transformers import AutoModel, AutoTokenizer

# Pin the process to GPU 0; the model is moved to "cuda" below.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Local huggingface_hub snapshot of deepseek-ai/DeepSeek-OCR-2.
# NOTE(review): machine-specific path — replace the user name and the
# snapshot hash with your own before running.
model_name = (
    r"C:\Users\用户名\.cache\huggingface\hub\models--deepseek-ai--DeepSeek-OCR-2\snapshots\aaa02f3"
)

# trust_remote_code is required: the repo ships custom tokenizer/model classes.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True
)

# bfloat16 + FlashAttention-2 — requires a compatible GPU and the
# flash-attn wheel discussed above.
model = AutoModel.from_pretrained(
    model_name,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2"
).to("cuda").eval()

# Prompt taken from the repo's example; asks for a markdown conversion.
prompt = "<image>\n<|grounding|>Convert the document to markdown."

image_file = r"F:\DeepSeek-OCR2-master\DeepSeek-OCR2-hf\jingdong_images\jingdong_40c4bf6b.jpeg"
output_path = r"F:\DeepSeek-OCR-2-main\DeepSeek-OCR2-master\DeepSeek-OCR2-hf\output"

assert os.path.exists(image_file), f"Image not found: {image_file}"

# infer() comes from the repo's remote code; with save_results=True it
# writes result.mmd / result_with_boxes.jpg into output_path.
res = model.infer(
    tokenizer,
    prompt=prompt,
    image_file=image_file,
    output_path=output_path,
    base_size=1024,
    image_size=768,
    crop_mode=True,
    save_results=True
)

拓展代码,添加读取文件夹和转换为txt

复制代码
import os
import shutil
import time
import torch
from transformers import AutoModel, AutoTokenizer


class DeepSeekOCR:
    """Thin wrapper around the DeepSeek-OCR-2 HF model.

    Loads the tokenizer and model once, then exposes single-image and
    folder-batch OCR.  For each input image the model's fixed output files
    (result.mmd / result_with_boxes.jpg) are renamed to ``<name>.mmd`` /
    ``<name>_with_boxes.jpg`` and a plain-text ``<name>.txt`` is produced.
    """

    # One-pass translation table used by _mmd_to_txt to strip markdown markers.
    _MD_STRIP = str.maketrans("", "", "#*`_>")

    def __init__(
        self,
        model_path: str,
        device: str = "cuda",
        base_size: int = 1024,
        image_size: int = 768,
    ):
        """Load tokenizer and model from *model_path* (a local snapshot dir).

        Args:
            model_path: directory containing the model weights and config.
            device: torch device the model is moved to.
            base_size: base resolution forwarded to ``model.infer``.
            image_size: tile resolution forwarded to ``model.infer``.
        """
        self.device = device
        self.base_size = base_size
        self.image_size = image_size

        init_start = time.perf_counter()
        print("Initializing DeepSeekOCR...")

        # -------- tokenizer --------
        t0 = time.perf_counter()
        print("Loading tokenizer...")
        # trust_remote_code: the repo ships its own tokenizer/model classes.
        self.tokenizer = AutoTokenizer.from_pretrained(
            model_path, trust_remote_code=True
        )
        print(f"Tokenizer loaded in {time.perf_counter() - t0:.2f}s")

        # -------- model --------
        t1 = time.perf_counter()
        print("Loading model...")
        self.model = AutoModel.from_pretrained(
            model_path,
            trust_remote_code=True,
            torch_dtype=torch.bfloat16,
            # Public kwarg (was the private `_attn_implementation`);
            # this is the documented transformers API and matches the
            # simple-version script.
            attn_implementation="flash_attention_2",
            use_safetensors=True,
        ).to(device).eval()
        print(f"Model loaded in {time.perf_counter() - t1:.2f}s")

        self.prompt = "<image>\n<|grounding|>Convert the document to markdown."

        total_init_cost = time.perf_counter() - init_start
        print(f"Model initialization done. Total init time: {total_init_cost:.2f}s")

    # -----------------------------
    # Internal: single inference (timed)
    # -----------------------------
    def _infer_once(self, image_path: str, output_dir: str) -> float:
        """Run one inference and return the wall-clock time in seconds."""
        start = time.perf_counter()

        self.model.infer(
            self.tokenizer,
            prompt=self.prompt,
            image_file=image_path,
            output_path=output_dir,
            base_size=self.base_size,
            image_size=self.image_size,
            crop_mode=True,
            save_results=True,  # disabling later saves roughly 3s per image
        )

        return time.perf_counter() - start

    # -----------------------------
    # Internal: rename fixed-name outputs per image
    # -----------------------------
    def _rename_outputs(self, output_dir: str, base_name: str):
        """Rename the model's fixed output files to ``<base_name>.*``.

        Uses os.replace so that a leftover destination file from a previous
        run is overwritten instead of raising FileExistsError on Windows
        (shutil.move falls back to os.rename semantics there).
        """
        # result.mmd -> <base_name>.mmd
        src_mmd = os.path.join(output_dir, "result.mmd")
        if os.path.exists(src_mmd):
            os.replace(src_mmd, os.path.join(output_dir, f"{base_name}.mmd"))

        # result_with_boxes.jpg -> <base_name>_with_boxes.jpg
        src_box = os.path.join(output_dir, "result_with_boxes.jpg")
        if os.path.exists(src_box):
            os.replace(src_box, os.path.join(output_dir, f"{base_name}_with_boxes.jpg"))

    # -----------------------------
    # Public: single-image OCR
    # -----------------------------
    def ocr_image(self, image_path: str, output_dir: str):
        """OCR one image; writes renamed outputs plus a .txt into *output_dir*."""
        assert os.path.exists(image_path), f"Image not found: {image_path}"
        os.makedirs(output_dir, exist_ok=True)

        image_name = os.path.basename(image_path)
        base_name = os.path.splitext(image_name)[0]

        print(f"Start OCR: {image_name}")

        cost = self._infer_once(image_path, output_dir)
        self._rename_outputs(output_dir, base_name)

        mmd_path = os.path.join(output_dir, f"{base_name}.mmd")
        self._mmd_to_txt(mmd_path)

        print(f"OCR done: {image_name} | Time: {cost:.3f}s")

    # -----------------------------
    # Public: batch OCR over a folder
    # -----------------------------
    def ocr_folder(
        self,
        image_dir: str,
        output_dir: str,
        exts=(".jpg", ".jpeg", ".png", ".bmp", ".tiff"),
    ):
        """OCR every matching image in *image_dir* (non-recursive).

        Args:
            image_dir: folder scanned for images.
            output_dir: folder receiving the per-image outputs.
            exts: accepted file extensions (compared case-insensitively).
        """
        assert os.path.isdir(image_dir), f"Dir not found: {image_dir}"
        os.makedirs(output_dir, exist_ok=True)

        # sorted() makes the processing order deterministic across platforms
        # (os.listdir order is arbitrary).
        images = sorted(
            f for f in os.listdir(image_dir)
            if f.lower().endswith(exts)
        )

        if not images:
            print("No images found.")
            return

        print(f"Found {len(images)} images")
        total_start = time.perf_counter()

        for idx, img in enumerate(images, 1):
            img_path = os.path.join(image_dir, img)
            base_name = os.path.splitext(img)[0]

            print(f"[{idx}/{len(images)}] OCR: {img}")

            try:
                cost = self._infer_once(img_path, output_dir)
                self._rename_outputs(output_dir, base_name)

                mmd_path = os.path.join(output_dir, f"{base_name}.mmd")
                self._mmd_to_txt(mmd_path)
                print(f"Done: {img} | Time: {cost:.3f}s")

            except Exception as e:
                # Best-effort batch: report the failure and continue.
                print(f"Failed: {img}, Reason: {e}")

        total_cost = time.perf_counter() - total_start
        print("-" * 50)
        print(f"Total images: {len(images)}")
        print(f"Total time  : {total_cost:.2f}s")
        print(f"Avg per img : {total_cost / len(images):.2f}s")

    # -----------------------------
    # Internal: mmd -> txt
    # -----------------------------
    def _mmd_to_txt(self, mmd_path: str):
        """Convert a .mmd (markdown) result into a plain-text .txt next to it.

        Strips image references, table ruler rows and markdown markup
        characters.  Blank lines also satisfy the ruler subset test and are
        therefore dropped, so the .txt has no paragraph separation.
        """
        if not os.path.exists(mmd_path):
            return

        txt_path = os.path.splitext(mmd_path)[0] + ".txt"

        with open(mmd_path, "r", encoding="utf-8") as f:
            content = f.read()

        # Simple markdown cleanup (keep plain text only).
        lines = []
        for line in content.splitlines():
            line = line.strip()

            # Skip embedded images and table separator rows such as |---|---|.
            if line.startswith("![]"):
                continue
            if set(line) <= {"|", "-", " "}:
                continue

            # Remove markdown markup characters in a single pass.
            lines.append(line.translate(self._MD_STRIP))

        text = "\n".join(lines).strip()

        with open(txt_path, "w", encoding="utf-8") as f:
            f.write(text)



if __name__ == "__main__":
    # Pin the process to GPU 0 before the model is created.
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"

    # Machine-specific paths: local model snapshot and output directory.
    MODEL_PATH = r"F:\DeepSeek-OCR-2-main\DeepSeek-OCR2-master\DeepSeek-OCR2-hf\model"
    OUTPUT_DIR = r"F:\DeepSeek-OCR-2-main\DeepSeek-OCR2-master\DeepSeek-OCR2-hf\output"

    ocr = DeepSeekOCR(MODEL_PATH)

    # Single image
    SINGLE_IMAGE = r"F:\DeepSeek-OCR-2-main\DeepSeek-OCR2-master\DeepSeek-OCR2-hf\zhongtong_images\zhongtong_5a1d2892.jpeg"
    ocr.ocr_image(SINGLE_IMAGE, OUTPUT_DIR)

    # Batch: uncomment to OCR an entire folder instead.
    # IMAGE_DIR = r"F:\DeepSeek-OCR-2-main\DeepSeek-OCR2-master\DeepSeek-OCR2-hf\zhongtong_images"
    # ocr.ocr_folder(IMAGE_DIR, OUTPUT_DIR)

    print("All OCR done.")
相关推荐
铁蛋AI编程实战5 小时前
DeepSeek-OCR2:开源 OCR 新王者完整部署教程(vLLM+Transformers 双接口 + 动态分辨率 + 文档批量处理)
开源·ocr·vllm
Coovally AI模型快速验证7 小时前
“看起来像世界”≠“真世界”!WorldLens全维度解构自动驾驶世界模型
人工智能·机器学习·计算机视觉·目标跟踪·自动驾驶·ocr
独自归家的兔1 天前
DeepSeek-OCR 2:视觉因果流模型官方论文解读总结
ocr
DisonTangor1 天前
智谱开源基于GLM-V编码器-解码器架构的多模态OCR模型——GLM-OCR
架构·开源·ocr
Elwin Wong1 天前
浅析DeepSeek-OCR v1&v2
人工智能·大模型·llm·ocr·deepseek
一个无名的炼丹师2 天前
多模态RAG系统进阶:从零掌握olmOCR与MinerU的部署与应用
python·大模型·ocr·多模态·rag
SmartBrain2 天前
OCR 模型在医疗场景的选型研究
人工智能·算法·语言模型·架构·aigc·ocr
DisonTangor3 天前
DeepSeek-OCR 2: 视觉因果流
人工智能·开源·aigc·ocr·deepseek
一个处女座的程序猿3 天前
CV之VLM之LLM-OCR:《DeepSeek-OCR 2: Visual Causal Flow》翻译与解读
llm·ocr·cv·vlm