一、环境配置
bash
pip install langchain langchain-openai langchain-community
pip install chromadb # 向量数据库
pip install langchain_huggingface transformers
pip install sentence-transformers
pip install modelscope
pip install torch vllm pillow opencv-python
pip install "mineru-vl-utils[vllm]"
pip install pymupdf
pip install "mineru[all]"
二、实现代码
这是一个基于MinerU工具开发的批量文档智能解析与格式转换自动化脚本,核心作用是将 PDF、图片、Office 等格式的文档,批量转换为结构化的 Markdown 文本和 JSON 数据,适用于文档数字化、RAG 知识库构建、数据提取等场景。
我的文档全部是 PDF,因此目前只用到了 PDF 输入;输出方面也只保留解析得到的 Markdown 文本和 JSON 数据。
python
# Copyright (c) Opendatalab. All rights reserved.
import asyncio
import os
import tempfile
import shutil
from pathlib import Path
import httpx
from mineru.cli import api_client as _api_client
from mineru.cli.common import image_suffixes, office_suffixes, pdf_suffixes
from mineru.utils.guess_suffix_or_lang import guess_suffix_by_path
# Every suffix accepted as input: the PDF, image and Office suffix collections
# exported by mineru.cli.common, merged into a set for membership tests.
SUPPORTED_INPUT_SUFFIXES = set(pdf_suffixes + image_suffixes + office_suffixes)
def collect_input_files(input_path: str | Path) -> list[Path]:
    """Resolve *input_path* into the list of files to be parsed.

    A single file is returned as a one-element list; a directory yields its
    direct children (no recursion) whose guessed suffix is supported, ordered
    by file name.

    Raises:
        FileNotFoundError: the path does not exist.
        ValueError: the file type is unsupported, the path is neither a file
            nor a directory, or the directory holds no supported file.
    """
    root = Path(input_path).expanduser().resolve()
    if not root.exists():
        raise FileNotFoundError(f"Input path does not exist: {root}")

    if root.is_file():
        if guess_suffix_by_path(root) in SUPPORTED_INPUT_SUFFIXES:
            return [root]
        raise ValueError(f"Unsupported input file type: {root.name}")

    if not root.is_dir():
        raise ValueError(f"Input path must be a file or directory: {root}")

    matches: list[Path] = []
    for entry in root.iterdir():
        if not entry.is_file():
            continue
        if guess_suffix_by_path(entry) not in SUPPORTED_INPUT_SUFFIXES:
            continue
        matches.append(entry.resolve())
    matches.sort(key=lambda resolved: resolved.name)

    if not matches:
        raise ValueError(f"No supported files found in directory: {root}")
    return matches
def build_form_data(
    language: str,
    backend: str,
    parse_method: str,
    formula_enable: bool,
    table_enable: bool,
    server_url: str | None,
    start_page_id: int,
    end_page_id: int | None,
) -> dict[str, str | list[str]]:
    """Build the parse-request form data shared by every submitted file.

    The caller's options are forwarded to
    ``_api_client.build_parse_request_form_data``; *language* is wrapped in a
    one-element ``lang_list``. The output selection is fixed by this script:
    ``return_md``, ``return_content_list`` and ``response_format_zip`` are
    True, every other ``return_*`` flag is False.
    """
    # Options chosen by the caller.
    caller_options = dict(
        lang_list=[language],
        backend=backend,
        parse_method=parse_method,
        formula_enable=formula_enable,
        table_enable=table_enable,
        server_url=server_url,
        start_page_id=start_page_id,
        end_page_id=end_page_id,
    )
    # Output selection hard-wired for this script.
    output_selection = dict(
        return_md=True,
        return_middle_json=False,
        return_model_output=False,
        return_content_list=True,
        return_images=False,
        response_format_zip=True,
        return_original_file=False,
    )
    return _api_client.build_parse_request_form_data(
        **caller_options,
        **output_selection,
    )
def format_status_message(status_snapshot: _api_client.TaskStatusSnapshot) -> str:
    """Render a task status, appending the queue position when one is known.

    Returns the snapshot's ``status`` unchanged when ``queued_ahead`` is None,
    otherwise ``"<status> (queued_ahead=<n>)"``.
    """
    ahead = status_snapshot.queued_ahead
    if ahead is not None:
        return f"{status_snapshot.status} (queued_ahead={ahead})"
    return status_snapshot.status
def prepare_local_api_temp_dir() -> None:
    """Point temporary files at ``/tmp`` when the current temp dir is under ``/mnt/``.

    Nothing happens on Windows, when ``/tmp`` does not exist, or when the
    current temp directory is not below ``/mnt/``. Otherwise ``TMPDIR`` is set
    to ``/tmp`` and ``tempfile.tempdir`` is reset to None so that ``tempfile``
    recomputes its directory on next use.
    """
    # NOTE(review): /mnt/ presumably targets mounted host drives (e.g. WSL) - confirm.
    active_tmp = str(Path(tempfile.gettempdir()))
    posix_with_tmp = os.name != "nt" and Path("/tmp").exists()
    if posix_with_tmp and active_tmp.startswith("/mnt/"):
        os.environ["TMPDIR"] = "/tmp"
        tempfile.tempdir = None
def clean_and_classify_extracted_files(extract_dir: Path, md_target_dir: Path, json_target_dir: Path):
    """Sort the files under *extract_dir* into the Markdown and JSON output folders.

    Every file found recursively under *extract_dir* is handled as follows:
    - ``.md`` files are moved to *md_target_dir*;
    - files whose name ends with ``_content_list_v2.json`` are moved to
      *json_target_dir*;
    - every other file is deleted.
    Directories left empty afterwards are removed. Both target directories
    are created when missing.
    """
    for target_dir in (md_target_dir, json_target_dir):
        target_dir.mkdir(parents=True, exist_ok=True)

    # Snapshot the file list first so moves and deletions cannot disturb the walk.
    extracted = [entry for entry in extract_dir.rglob('*') if entry.is_file()]
    for entry in extracted:
        name = entry.name
        if entry.suffix == '.md':
            shutil.move(str(entry), str(md_target_dir / name))
            print(f" 移动 MD: {name}")
            continue
        if name.endswith('_content_list_v2.json'):
            shutil.move(str(entry), str(json_target_dir / name))
            print(f" 移动 JSON: {name}")
            continue
        entry.unlink()
        print(f" 删除多余文件: {name}")

    # Reverse-sorted paths list children before their parents, so a directory
    # is checked only after everything below it has been handled.
    remaining = list(extract_dir.rglob('*'))
    remaining.sort(reverse=True)
    for candidate in remaining:
        if not candidate.is_dir():
            continue
        if any(candidate.iterdir()):
            continue
        candidate.rmdir()
        print(f" 删除空目录: {candidate.relative_to(extract_dir)}")
async def process_single_file(
    file_path: Path,
    output_dir: Path,
    http_client: httpx.AsyncClient,
    base_url: str,
    form_data: dict,
) -> bool:
    """Parse one document through the MinerU API and store its outputs.

    The file is skipped when ``<output_dir>/md/<stem>.md`` already exists.
    Otherwise it is submitted, the task is awaited, the result ZIP is
    downloaded and extracted into ``<output_dir>/_tmp_<stem>``, and the
    extracted files are sorted into ``<output_dir>/md`` and
    ``<output_dir>/json``.

    Args:
        file_path: Document to parse.
        output_dir: Root directory receiving the ``md`` and ``json`` folders.
        http_client: Client used to poll the task and download the result.
        base_url: Base URL of the MinerU API.
        form_data: Parse-request form data sent with the upload.

    Returns:
        True when the file was processed or skipped, False when any step
        raised; the error is printed rather than propagated so a batch can
        continue with the next file.
    """
    file_name = file_path.name
    output_md_path = output_dir / "md" / f"{file_path.stem}.md"
    if output_md_path.exists():
        print(f"✅ 已处理,跳过: {file_name}")
        return True

    print(f"\n======== 开始处理: {file_name} ========")
    temp_extract_dir = output_dir / f"_tmp_{file_path.stem}"
    zip_path = None
    try:
        upload_asset = [_api_client.UploadAsset(path=file_path, upload_name=file_name)]
        submit_response = await _api_client.submit_parse_task(
            base_url=base_url,
            upload_assets=upload_asset,
            form_data=form_data,
        )
        await _api_client.wait_for_task_result(
            client=http_client,
            submit_response=submit_response,
            task_label=file_name,
        )
        zip_path = await _api_client.download_result_zip(
            client=http_client,
            submit_response=submit_response,
            task_label=file_name,
        )

        # Extract into a private directory so outputs of different files never
        # mix. Leftovers of an earlier interrupted run are dropped first;
        # otherwise they would be classified together with the fresh result.
        shutil.rmtree(temp_extract_dir, ignore_errors=True)
        temp_extract_dir.mkdir(parents=True, exist_ok=True)
        _api_client.safe_extract_zip(zip_path, temp_extract_dir)

        clean_and_classify_extracted_files(
            extract_dir=temp_extract_dir,
            md_target_dir=output_dir / "md",
            json_target_dir=output_dir / "json",
        )
        print(f"✅ 处理完成: {file_name}")
        return True
    except Exception as e:
        # Per-file boundary: report the failure and let the batch go on.
        print(f"❌ 处理失败: {file_name}, 错误: {str(e)}")
        return False
    finally:
        # Best-effort cleanup on success AND failure, so a failed file leaves
        # neither the downloaded archive nor a half-extracted directory behind.
        if zip_path is not None:
            try:
                zip_path.unlink(missing_ok=True)
            except OSError as cleanup_error:
                print(f"⚠️ 无法删除临时压缩包: {zip_path}, 错误: {cleanup_error}")
        shutil.rmtree(temp_extract_dir, ignore_errors=True)
async def run_demo(
    input_path: str | Path,
    output_dir: str | Path,
    *,
    api_url: str | None = None,
    backend: str = "hybrid-auto-engine",
    parse_method: str = "auto",
    language: str = "ch",
    formula_enable: bool = True,
    table_enable: bool = True,
    server_url: str | None = None,
    start_page_id: int = 0,
    end_page_id: int | None = None,
) -> None:
    """Parse every supported file under *input_path* into *output_dir*.

    Files are processed one after another through ``process_single_file``.
    When *api_url* is None a local MinerU API server is started for the run
    and stopped afterwards; otherwise the given URL is normalized and used.
    A success/failure summary is printed at the end.
    """
    sources = collect_input_files(input_path)
    destination = Path(output_dir).expanduser().resolve()
    destination.mkdir(parents=True, exist_ok=True)
    request_form = build_form_data(
        language=language,
        backend=backend,
        parse_method=parse_method,
        formula_enable=formula_enable,
        table_enable=table_enable,
        server_url=server_url,
        start_page_id=start_page_id,
        end_page_id=end_page_id,
    )

    managed_server = None
    try:
        client_options = {
            "timeout": _api_client.build_http_timeout(),
            "follow_redirects": True,
        }
        async with httpx.AsyncClient(**client_options) as client:
            if api_url is not None:
                endpoint = _api_client.normalize_base_url(api_url)
            else:
                # No external API given: run our own server for this batch.
                prepare_local_api_temp_dir()
                managed_server = _api_client.LocalAPIServer()
                endpoint = managed_server.start()
                print(f"Started local mineru-api: {endpoint}")
                await _api_client.wait_for_local_api_ready(client, managed_server)

            print(f"API 服务正常,开始批量处理 {len(sources)} 个文件\n")

            # Files are handled strictly one at a time, in list order.
            tally = {True: 0, False: 0}
            for source in sources:
                outcome = await process_single_file(
                    source, destination, client, endpoint, request_form
                )
                tally[bool(outcome)] += 1

            print(f"\n======== 全部任务完成 ========")
            print(f"成功:{tally[True]} 个 | 失败:{tally[False]} 个")
            print(f"输出目录:{destination}/md 和 {destination}/json")
    finally:
        # Stop the server we started, even when the batch aborted.
        if managed_server:
            managed_server.stop()
def main() -> None:
    """Run the batch conversion with the settings hard-coded below.

    Sets ``MINERU_MODEL_SOURCE`` to ``modelscope`` in the process environment
    and then drives ``run_demo`` to completion with ``asyncio.run``.
    """
    # Edit these values to point the script at other data or another backend.
    input_path = "/rag/bs_challenge_financial_14b_dataset/pdf"
    output_dir = "/bs_challenge_financial_14b_dataset/api_output"
    api_url = None  # None: run_demo starts a local mineru-api server itself
    backend = "hybrid-auto-engine"
    parse_method = "auto"
    language = "ch"
    formula_enable = True
    table_enable = True
    server_url = None
    start_page_id = 0
    end_page_id = None

    # Must be set before run_demo starts the local server.
    # NOTE(review): presumably selects the model download source - confirm.
    os.environ['MINERU_MODEL_SOURCE'] = "modelscope"

    asyncio.run(run_demo(
        input_path=input_path,
        output_dir=output_dir,
        api_url=api_url,
        backend=backend,
        parse_method=parse_method,
        language=language,
        formula_enable=formula_enable,
        table_enable=table_enable,
        server_url=server_url,
        start_page_id=start_page_id,
        end_page_id=end_page_id,
    ))
# Run the batch conversion only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()