The complete source code is included at the end of this article.
Preface
When building an enterprise-grade RAG (Retrieval-Augmented Generation) system, PDF parsing is the critical first step of the knowledge-base construction pipeline. Based on the pdf_parsing.py
module from a winning RAG Challenge solution, this article takes a close look at how to implement high-quality PDF parsing, OCR, table extraction, and parallel processing, and offers a complete, practical guide for building an enterprise knowledge base.
1. Module Architecture Overview
1.1 Core Components
The pdf_parsing.py module contains two main classes:
- PDFParser: the main PDF parser, responsible for document conversion and parallel processing
- JsonReportProcessor: the report processor, responsible for assembling parsing results into structured JSON
1.2 Technology Stack
# Core dependencies
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
from docling.datamodel.base_models import ConversionStatus
from docling.datamodel.document import ConversionResult
from tabulate import tabulate  # table formatting
Docling is a high-quality document parsing library developed by IBM. It supports:
- High-accuracy PDF parsing
- OCR text recognition
- Table structure recognition
- Image and formula handling
A minimal conversion sketch follows this list.
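To make that concrete, here is a minimal, hedged sketch of driving a conversion the same way the module does. "sample.pdf" is a placeholder path, and PDFParser is the class defined later in this article; the field names read from the exported dict mirror the ones used throughout the module.
from pathlib import Path
from docling.datamodel.base_models import ConversionStatus

# Assumes the PDFParser class from the pdf_parsing module shown at the end of this article.
parser = PDFParser(output_dir=Path("./parsed_pdfs"))
for conv_res in parser.doc_converter.convert_all(source=[Path("sample.pdf")]):
    if conv_res.status == ConversionStatus.SUCCESS:
        data = conv_res.document.export_to_dict()   # raw Docling output: texts, tables, pictures, ...
        print(data["origin"]["filename"], len(data.get("texts", [])))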
2. A Deep Dive into the PDFParser Class
2.1 Initialization
class PDFParser:
    def __init__(
        self,
        pdf_backend=DoclingParseV2DocumentBackend,
        output_dir: Path = Path("./parsed_pdfs"),
        num_threads: int = None,
        csv_metadata_path: Path = None,
    ):
        self.pdf_backend = pdf_backend
        self.output_dir = output_dir
        self.doc_converter = self._create_document_converter()
        self.num_threads = num_threads
        self.metadata_lookup = {}
        self.debug_data_path = None
        if csv_metadata_path is not None:
            self.metadata_lookup = self._parse_csv_metadata(csv_metadata_path)
        if self.num_threads is not None:
            os.environ["OMP_NUM_THREADS"] = str(self.num_threads)
Key design decisions:
- Configurable backend: different PDF parsing backends can be plugged in
- Metadata management: company names and other metadata are loaded from a CSV file
- Thread control: the OpenMP thread count is set via the OMP_NUM_THREADS environment variable
- Debug support: an optional path for saving raw debug data
2.2 Document Converter Configuration
def _create_document_converter(self) -> "DocumentConverter":
    """Create and configure the document converter."""
    from docling.document_converter import DocumentConverter, FormatOption
    from docling.datamodel.pipeline_options import PdfPipelineOptions, TableFormerMode, EasyOcrOptions
    from docling.datamodel.base_models import InputFormat
    from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline

    # Configure the PDF processing options
    pipeline_options = PdfPipelineOptions()
    pipeline_options.do_ocr = True  # enable OCR
    ocr_options = EasyOcrOptions(lang=['en'], force_full_page_ocr=False)
    pipeline_options.ocr_options = ocr_options
    pipeline_options.do_table_structure = True  # enable table structure recognition
    pipeline_options.table_structure_options.do_cell_matching = True
    pipeline_options.table_structure_options.mode = TableFormerMode.ACCURATE  # accurate mode

    format_options = {
        InputFormat.PDF: FormatOption(
            pipeline_cls=StandardPdfPipeline,
            pipeline_options=pipeline_options,
            backend=self.pdf_backend
        )
    }
    return DocumentConverter(format_options=format_options)
Configuration highlights:
- OCR support: EasyOCR is used for text recognition, configured here for English
- Table recognition: TableFormer is enabled for table structure recognition
- Cell matching: ensures table cells are mapped accurately to the detected structure
- Accurate mode: TableFormerMode.ACCURATE trades speed for recognition quality
2.3 Metadata Parsing
@staticmethod
def _parse_csv_metadata(csv_path: Path) -> dict:
    """Parse the CSV file and build a lookup dictionary keyed by sha1."""
    import csv
    metadata_lookup = {}
    with open(csv_path, 'r', encoding='utf-8') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            # Handle both the old and the new CSV format for the company name
            company_name = row.get('company_name', row.get('name', '')).strip('"')
            metadata_lookup[row['sha1']] = {
                'company_name': company_name
            }
    return metadata_lookup
Design advantages:
- Format compatibility: supports both the old and the new CSV format (the name vs. company_name column)
- Data cleaning: surrounding double quotes are stripped from the company name automatically
- Fast lookup: the SHA1 hash is used as the key, giving O(1) lookups
A sample CSV and its resulting lookup dictionary are shown below.
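As an illustration, assume a metadata CSV with the two columns the parser expects; the file name and all values below are hypothetical.
# metadata.csv (hypothetical contents)
#   sha1,company_name
#   3f8a1c,ACME Corporation
#   9b2d7e,"Globex Ltd"

from pathlib import Path

lookup = PDFParser._parse_csv_metadata(Path("metadata.csv"))
# lookup == {
#     "3f8a1c": {"company_name": "ACME Corporation"},
#     "9b2d7e": {"company_name": "Globex Ltd"},
# }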
3. Parallel Processing
3.1 Parallel Processing Architecture
def parse_and_export_parallel(
    self,
    input_doc_paths: List[Path] = None,
    doc_dir: Path = None,
    optimal_workers: int = 10,
    chunk_size: int = None
):
    """Parse PDF files in parallel using multiple processes."""
    import multiprocessing
    from concurrent.futures import ProcessPoolExecutor, as_completed

    # Collect input paths
    if input_doc_paths is None and doc_dir is not None:
        input_doc_paths = list(doc_dir.glob("*.pdf"))
    total_pdfs = len(input_doc_paths)
    _log.info(f"Starting parallel processing of {total_pdfs} documents")

    cpu_count = multiprocessing.cpu_count()
    # Calculate the optimal number of workers
    if optimal_workers is None:
        optimal_workers = min(cpu_count, total_pdfs)
    if chunk_size is None:
        # Calculate the chunk size (at least 1)
        chunk_size = max(1, total_pdfs // optimal_workers)

    # Split the documents into chunks
    chunks = [
        input_doc_paths[i : i + chunk_size]
        for i in range(0, total_pdfs, chunk_size)
    ]
Advantages of this approach:
- Smart chunking: the chunk size is derived from the CPU core count and the number of documents (a worked example follows this list)
- Load balancing: each worker handles a similar number of documents
- Resource efficiency: avoids spawning more processes than the workload can use
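A quick worked example with hypothetical numbers: 25 PDFs and optimal_workers=10 yield chunk_size = max(1, 25 // 10) = 2, so the paths are split into 13 chunks of at most two documents each.
total_pdfs = 25
optimal_workers = 10

chunk_size = max(1, total_pdfs // optimal_workers)        # -> 2
chunks = [list(range(total_pdfs))[i : i + chunk_size]     # stand-in for the real list of paths
          for i in range(0, total_pdfs, chunk_size)]
print(chunk_size, len(chunks))                            # -> 2 13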
3.2 Process Pool Execution
# Use ProcessPoolExecutor for parallel processing
with ProcessPoolExecutor(max_workers=optimal_workers) as executor:
    # Schedule all tasks
    futures = [
        executor.submit(
            _process_chunk,
            chunk,
            self.pdf_backend,
            self.output_dir,
            self.num_threads,
            self.metadata_lookup,
            self.debug_data_path
        )
        for chunk in chunks
    ]
    # Wait for completion and log the results
    for future in as_completed(futures):
        try:
            result = future.result()
            processed_count += int(result.split()[1])  # extract the number from "Processed X PDFs"
            _log.info(f"{'#'*50}\n{result} ({processed_count}/{total_pdfs} total)\n{'#'*50}")
        except Exception as e:
            _log.error(f"Error processing chunk: {str(e)}")
            raise
Error handling:
- Exception capture: exceptions raised while processing a chunk are caught and logged
- Progress tracking: progress is logged as each chunk completes
- Fail fast: on any chunk error the exception is re-raised and the run stops immediately
Note that the per-chunk document count is parsed back out of the worker's return string, as sketched below.
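Each worker returns the literal string "Processed X PDFs.", and the parent process recovers X with a plain split; a tiny sketch of that parsing step:
result = "Processed 42 PDFs."        # hypothetical value returned by _process_chunk
processed = int(result.split()[1])   # ["Processed", "42", "PDFs."] -> 42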
3.3 The Chunk Worker Function
def _process_chunk(pdf_paths, pdf_backend, output_dir, num_threads, metadata_lookup, debug_data_path):
    """Helper function that processes a chunk of PDFs in a separate process."""
    # Create a new parser instance for this process
    parser = PDFParser(
        pdf_backend=pdf_backend,
        output_dir=output_dir,
        num_threads=num_threads,
        csv_metadata_path=None  # the metadata lookup is passed in directly
    )
    parser.metadata_lookup = metadata_lookup
    parser.debug_data_path = debug_data_path
    parser.parse_and_export(pdf_paths)
    return f"Processed {len(pdf_paths)} PDFs."
Process isolation design:
- Independent instances: each process creates its own parser instance
- Explicit data passing: configuration and metadata are passed as arguments
- State isolation: no mutable state is shared between processes
4. Page Sequence Normalization
4.1 Filling Page Gaps
def _normalize_page_sequence(self, data: dict) -> dict:
    """Ensure page numbers in the content are sequential by filling gaps with empty pages."""
    if 'content' not in data:
        return data

    # Work on a copy of the data
    normalized_data = data.copy()

    # Get the existing page numbers and find the maximum page
    existing_pages = {page['page'] for page in data['content']}
    max_page = max(existing_pages)

    # Template for an empty page
    empty_page_template = {
        "content": [],
        "page_dimensions": {}  # or default dimensions if needed
    }

    # Build a new content array covering every page
    new_content = []
    for page_num in range(1, max_page + 1):
        # Use the existing page or create an empty one
        page_content = next(
            (page for page in data['content'] if page['page'] == page_num),
            {"page": page_num, **empty_page_template}
        )
        new_content.append(page_content)

    normalized_data['content'] = new_content
    return normalized_data
Why normalize:
- Continuity guarantee: page numbers run from 1 to the maximum page without gaps
- Blank-page handling: missing pages are filled with empty placeholders
- Data integrity: the document structure stays intact for downstream processing
A toy example follows this list.
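A toy illustration with hypothetical data: if only pages 1 and 3 carry content, page 2 is inserted as an empty placeholder (parser is assumed to be a PDFParser instance).
data = {"content": [
    {"page": 1, "content": ["<items>"], "page_dimensions": {}},
    {"page": 3, "content": ["<items>"], "page_dimensions": {}},
]}
normalized = parser._normalize_page_sequence(data)
# normalized["content"] now holds pages 1, 2 and 3;
# page 2 is {"page": 2, "content": [], "page_dimensions": {}}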
5. The JsonReportProcessor Class in Detail
5.1 Report Assembly Flow
class JsonReportProcessor:
    def __init__(self, metadata_lookup: dict = None, debug_data_path: Path = None):
        self.metadata_lookup = metadata_lookup or {}
        self.debug_data_path = debug_data_path

    def assemble_report(self, conv_result, normalized_data=None):
        """Assemble the report from normalized data or the raw conversion result."""
        data = normalized_data if normalized_data is not None else conv_result.document.export_to_dict()
        assembled_report = {}
        assembled_report['metainfo'] = self.assemble_metainfo(data)
        assembled_report['content'] = self.assemble_content(data)
        assembled_report['tables'] = self.assemble_tables(conv_result.document.tables, data)
        assembled_report['pictures'] = self.assemble_pictures(data)
        self.debug_data(data)
        return assembled_report
The assembled report has four top-level blocks (a rough sketch of the resulting JSON follows this list):
- metainfo: basic document information and statistics
- content: text content organized by page
- tables: the extracted tables
- pictures: picture entries and their associated text
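For orientation, a single parsed report roughly takes the following shape. Every value below is an illustrative placeholder; the keys mirror the assembly code discussed in the rest of this section.
assembled_report = {
    "metainfo": {"sha1_name": "0a1b2c", "pages_amount": 12, "text_blocks_amount": 240,
                 "tables_amount": 3, "pictures_amount": 1, "company_name": "ACME Corporation"},
    "content": [{"page": 1, "content": [], "page_dimensions": {}}],
    "tables": [{"table_id": 0, "page": 1, "bbox": [0.0, 0.0, 0.0, 0.0],
                "#-rows": 2, "#-cols": 2, "markdown": "| ... |", "html": "<table>...</table>", "json": {}}],
    "pictures": [{"picture_id": 1, "page": 2, "bbox": [0.0, 0.0, 0.0, 0.0], "children": []}],
}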
5.2 Assembling metainfo
def assemble_metainfo(self, data):
    metainfo = {}
    sha1_name = data['origin']['filename'].rsplit('.', 1)[0]
    metainfo['sha1_name'] = sha1_name
    metainfo['pages_amount'] = len(data.get('pages', []))
    metainfo['text_blocks_amount'] = len(data.get('texts', []))
    metainfo['tables_amount'] = len(data.get('tables', []))
    metainfo['pictures_amount'] = len(data.get('pictures', []))
    metainfo['equations_amount'] = len(data.get('equations', []))
    metainfo['footnotes_amount'] = len([t for t in data.get('texts', []) if t.get('label') == 'footnote'])

    # Add CSV metadata if available
    if self.metadata_lookup and sha1_name in self.metadata_lookup:
        csv_meta = self.metadata_lookup[sha1_name]
        metainfo['company_name'] = csv_meta['company_name']
    return metainfo
The collected statistics include:
- Number of pages
- Number of text blocks
- Number of tables
- Number of pictures
- Number of equations
- Number of footnotes
- Company name (from the CSV metadata)
5.3 Content Assembly and Group Expansion
def expand_groups(self, body_children, groups):
    """Expand group references, copying group information onto each child element."""
    expanded_children = []
    for item in body_children:
        if isinstance(item, dict) and '$ref' in item:
            ref = item['$ref']
            ref_type, ref_num = ref.split('/')[-2:]
            ref_num = int(ref_num)
            if ref_type == 'groups':
                group = groups[ref_num]
                group_id = ref_num
                group_name = group.get('name', '')
                group_label = group.get('label', '')
                for child in group['children']:
                    child_copy = child.copy()
                    child_copy['group_id'] = group_id
                    child_copy['group_name'] = group_name
                    child_copy['group_label'] = group_label
                    expanded_children.append(child_copy)
            else:
                expanded_children.append(item)
        else:
            expanded_children.append(item)
    return expanded_children
What group expansion provides:
- Hierarchy preservation: the document's hierarchical structure is kept
- Group propagation: group id, name, and label are copied onto each child element
- Reference resolution: group references in the body are replaced by their children
A hypothetical input/output pair is shown below.
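Docling stores the document body as a list of JSON-pointer style references such as "#/texts/0". A hypothetical input/output pair for expand_groups (with processor being a JsonReportProcessor instance) might look like this:
body_children = [{"$ref": "#/texts/0"}, {"$ref": "#/groups/0"}]
groups = [{"name": "list-1", "label": "list",
           "children": [{"$ref": "#/texts/1"}, {"$ref": "#/texts/2"}]}]

expanded = processor.expand_groups(body_children, groups)
# expanded == [
#     {"$ref": "#/texts/0"},
#     {"$ref": "#/texts/1", "group_id": 0, "group_name": "list-1", "group_label": "list"},
#     {"$ref": "#/texts/2", "group_id": 0, "group_name": "list-1", "group_label": "list"},
# ]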
5.4 Text Reference Processing
def _process_text_reference(self, ref_num, data):
    """Helper method that processes a text reference and creates a content item."""
    text_item = data['texts'][ref_num]
    item_type = text_item['label']
    content_item = {
        'text': text_item.get('text', ''),
        'type': item_type,
        'text_id': ref_num
    }
    # Add the 'orig' field only if it differs from 'text'
    orig_content = text_item.get('orig', '')
    if orig_content != text_item.get('text', ''):
        content_item['orig'] = orig_content
    # Carry over optional fields when present
    if 'enumerated' in text_item:
        content_item['enumerated'] = text_item['enumerated']
    if 'marker' in text_item:
        content_item['marker'] = text_item['marker']
    return content_item
Text handling details:
- Type detection: the text type comes from the Docling label
- Original text preservation: the raw OCR string ('orig') is kept only when it differs from the cleaned text
- Metadata extraction: enumeration flags and list markers are carried over when present
A hypothetical content item is shown below.
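A hypothetical result, assuming text entry 7 is a list item whose OCR output already matched the cleaned text (so 'orig' is omitted):
content_item = {
    "text": "Revenue grew 12% year over year.",
    "type": "list_item",      # taken from the Docling label
    "text_id": 7,
    "marker": "-",            # copied over only because the source item carries it
    "enumerated": False,      # likewise optional
}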
6. Table Processing
6.1 Table Assembly
def assemble_tables(self, tables, data):
    assembled_tables = []
    for i, table in enumerate(tables):
        table_json_obj = table.model_dump()
        table_md = self._table_to_md(table_json_obj)
        table_html = table.export_to_html()

        table_data = data['tables'][i]
        table_page_num = table_data['prov'][0]['page_no']
        table_bbox = table_data['prov'][0]['bbox']
        table_bbox = [
            table_bbox['l'],
            table_bbox['t'],
            table_bbox['r'],
            table_bbox['b']
        ]

        # Get the row and column counts from the table data structure
        nrows = table_data['data']['num_rows']
        ncols = table_data['data']['num_cols']

        ref_num = table_data['self_ref'].split('/')[-1]
        ref_num = int(ref_num)

        table_obj = {
            'table_id': ref_num,
            'page': table_page_num,
            'bbox': table_bbox,
            '#-rows': nrows,
            '#-cols': ncols,
            'markdown': table_md,
            'html': table_html,
            'json': table_json_obj
        }
        assembled_tables.append(table_obj)
    return assembled_tables
Strengths of this design:
- Multi-format output: Markdown, HTML, and JSON are generated for every table
- Position information: the page number and bounding box of each table are preserved
- Structure information: the row and column counts are recorded
6.2 Markdown Table Conversion
def _table_to_md(self, table):
    # Extract text from the grid cells
    table_data = []
    for row in table['data']['grid']:
        table_row = [cell['text'] for cell in row]
        table_data.append(table_row)

    # Check whether the table has a header row
    if len(table_data) > 1 and len(table_data[0]) > 0:
        try:
            md_table = tabulate(
                table_data[1:], headers=table_data[0], tablefmt="github"
            )
        except ValueError:
            md_table = tabulate(
                table_data[1:],
                headers=table_data[0],
                tablefmt="github",
                disable_numparse=True,
            )
    else:
        md_table = tabulate(table_data, tablefmt="github")
    return md_table
Conversion details:
- Header detection: the first grid row is treated as the header whenever the table has more than one row
- Error handling: if tabulate fails while parsing numeric values, the conversion is retried with number parsing disabled
- GitHub flavour: tables are rendered as GitHub-style Markdown
A small example of the generated output follows.
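For a small hypothetical grid whose first row is the header, tabulate produces GitHub-style Markdown:
from tabulate import tabulate

# Hypothetical 3x2 grid extracted from a Docling table (first row is the header)
grid = [["Year", "Revenue"], ["2023", "1,200"], ["2024", "1,450"]]
print(tabulate(grid[1:], headers=grid[0], tablefmt="github", disable_numparse=True))
# Prints a pipe-delimited table along the lines of:
# | Year | Revenue |
# |------|---------|
# | 2023 | 1,200   |
# | 2024 | 1,450   |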
7. Picture Processing
7.1 Picture Assembly
def assemble_pictures(self, data):
    assembled_pictures = []
    for i, picture in enumerate(data['pictures']):
        children_list = self._process_picture_block(picture, data)

        ref_num = picture['self_ref'].split('/')[-1]
        ref_num = int(ref_num)

        picture_page_num = picture['prov'][0]['page_no']
        picture_bbox = picture['prov'][0]['bbox']
        picture_bbox = [
            picture_bbox['l'],
            picture_bbox['t'],
            picture_bbox['r'],
            picture_bbox['b']
        ]

        picture_obj = {
            'picture_id': ref_num,
            'page': picture_page_num,
            'bbox': picture_bbox,
            'children': children_list,
        }
        assembled_pictures.append(picture_obj)
    return assembled_pictures
7.2 Picture Block Processing
def _process_picture_block(self, picture, data):
    children_list = []
    for item in picture['children']:
        if isinstance(item, dict) and '$ref' in item:
            ref = item['$ref']
            ref_type, ref_num = ref.split('/')[-2:]
            ref_num = int(ref_num)
            if ref_type == 'texts':
                content_item = self._process_text_reference(ref_num, data)
                children_list.append(content_item)
    return children_list
Picture handling details:
- Position information: the page and bounding box of each picture are recorded
- Text extraction: text elements attached to the picture (such as captions) are collected as children
- Hierarchy: the association between a picture and its text is preserved
8. Performance Optimization and Best Practices
8.1 Memory Management
def process_documents(self, conv_results: Iterable[ConversionResult]):
    if self.output_dir is not None:
        self.output_dir.mkdir(parents=True, exist_ok=True)
    success_count = 0
    failure_count = 0

    for conv_res in conv_results:
        if conv_res.status == ConversionStatus.SUCCESS:
            success_count += 1
            processor = JsonReportProcessor(metadata_lookup=self.metadata_lookup, debug_data_path=self.debug_data_path)
            # Normalize the document data to ensure sequential pages
            data = conv_res.document.export_to_dict()
            normalized_data = self._normalize_page_sequence(data)

            processed_report = processor.assemble_report(conv_res, normalized_data)
            doc_filename = conv_res.input.file.stem
            if self.output_dir is not None:
                with (self.output_dir / f"{doc_filename}.json").open("w", encoding="utf-8") as fp:
                    json.dump(processed_report, fp, indent=2, ensure_ascii=False)
        else:
            failure_count += 1
            _log.info(f"Document {conv_res.input.file} failed to convert.")

    _log.info(f"Processed {success_count + failure_count} docs, of which {failure_count} failed")
    return success_count, failure_count
Optimization strategies:
- Streaming: documents are processed one at a time, so results never pile up in memory
- Immediate writes: each report is written to disk as soon as it is assembled
- Failure accounting: success and failure counts are tracked and logged
8.2 Debug Support
def debug_data(self, data):
    if self.debug_data_path is None:
        return
    doc_name = data['name']
    path = self.debug_data_path / f"{doc_name}.json"
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("w", encoding="utf-8") as f:
        json.dump(data, f, indent=2, ensure_ascii=False)
Debug features:
- Raw data dump: the raw Docling output is saved alongside the assembled report
- Opt-in: debugging is active only when debug_data_path is set
- UTF-8 encoding: ensure_ascii=False keeps non-ASCII characters (e.g. Chinese) readable
9. Usage Examples and Configuration
9.1 Basic Usage
# Create the PDF parser
parser = PDFParser(
    output_dir=Path("./parsed_pdfs"),
    csv_metadata_path=Path("./metadata.csv"),
    num_threads=4
)
# Sequential processing
parser.parse_and_export(doc_dir=Path("./pdf_reports"))
# Parallel processing
parser.parse_and_export_parallel(
    doc_dir=Path("./pdf_reports"),
    optimal_workers=8,
    chunk_size=2
)
9.2 Advanced Configuration
# Custom backend and configuration
parser = PDFParser(
    pdf_backend=DoclingParseV2DocumentBackend,
    output_dir=Path("./output"),
    num_threads=8,
    csv_metadata_path=Path("./company_metadata.csv")
)
# Enable debug mode
parser.debug_data_path = Path("./debug_data")
10. Enterprise Application Practice
10.1 Processing Corporate Annual Reports
The module is particularly well suited to complex documents such as corporate annual reports:
- Table-heavy content: financial statements are recognized and extracted automatically
- Multi-language support: EasyOCR can handle additional languages (the configuration above enables English only; extend the lang list as needed)
- Structure preservation: the document hierarchy is kept
- Metadata linkage: each document is linked to its company information
10.2 Performance Tuning Suggestions
- Hardware:
  - GPU: accelerates OCR and table-structure recognition
  - Memory: 16 GB or more for large documents
  - Storage: SSDs for fast I/O
- Parameter tuning (a hedged starting-point sketch follows this list):
  - optimal_workers: roughly 1-2x the number of CPU cores
  - chunk_size: adjust to the size of your documents
  - num_threads: adjust to the available memory
- Error handling:
  - Monitor the conversion failure rate
  - Add a retry mechanism for failed documents
  - Keep detailed logs
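As a hedged starting point (not part of the original module), the three knobs can be derived from the machine and the workload; the heuristics below are assumptions to be validated against your own documents and hardware.
import multiprocessing
from pathlib import Path

pdfs = list(Path("./pdf_reports").glob("*.pdf"))
cpu_count = multiprocessing.cpu_count()

optimal_workers = max(1, min(2 * cpu_count, len(pdfs)))   # 1-2x cores, never more than the PDF count
chunk_size = max(1, len(pdfs) // optimal_workers)         # keep every worker busy
num_threads = max(1, cpu_count // optimal_workers)        # OMP threads left to each worker process

# PDFParser comes from the pdf_parsing module listed at the end of this article.
parser = PDFParser(output_dir=Path("./parsed_pdfs"), num_threads=num_threads)
parser.parse_and_export_parallel(doc_dir=Path("./pdf_reports"),
                                 optimal_workers=optimal_workers,
                                 chunk_size=chunk_size)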
11. Summary
The pdf_parsing.py module demonstrates best practices for PDF parsing in a modern RAG system.
Key technical highlights
- High-quality parsing: Docling provides accurate document parsing
- Parallel processing: multi-process execution improves throughput
- Structured output: a standardized JSON format per document
- Metadata management: complete handling of document metadata
- Error handling: robust error handling and logging
Value for enterprise applications
- Knowledge-base construction: high-quality source data for enterprise knowledge bases
- Document digitization: turning paper documents into structured data
- Information extraction: key information is extracted automatically
- Multi-format support: several output formats cover different downstream needs
Technical strengths
- Scalability: supports large-scale document processing
- Configurability: rich configuration options for different scenarios
- Maintainability: a clear code structure and thorough documentation
- Debuggability: built-in debug support simplifies troubleshooting
By understanding this module's design and implementation in depth, we can build a high-quality document parsing capability for enterprise RAG systems, laying a solid foundation for the subsequent embedding, retrieval, and generation stages.
import os
import time
import logging
import re
import json
from tabulate import tabulate
from pathlib import Path
from typing import Iterable, List

# from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
# from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import ConversionStatus
from docling.datamodel.document import ConversionResult

_log = logging.getLogger(__name__)


def _process_chunk(pdf_paths, pdf_backend, output_dir, num_threads, metadata_lookup, debug_data_path):
    """Helper function to process a chunk of PDFs in a separate process."""
    # Create a new parser instance for this process
    parser = PDFParser(
        pdf_backend=pdf_backend,
        output_dir=output_dir,
        num_threads=num_threads,
        csv_metadata_path=None  # Metadata lookup is passed directly
    )
    parser.metadata_lookup = metadata_lookup
    parser.debug_data_path = debug_data_path
    parser.parse_and_export(pdf_paths)
    return f"Processed {len(pdf_paths)} PDFs."


class PDFParser:
    def __init__(
        self,
        pdf_backend=DoclingParseV2DocumentBackend,
        output_dir: Path = Path("./parsed_pdfs"),
        num_threads: int = None,
        csv_metadata_path: Path = None,
    ):
        self.pdf_backend = pdf_backend
        self.output_dir = output_dir
        self.doc_converter = self._create_document_converter()
        self.num_threads = num_threads
        self.metadata_lookup = {}
        self.debug_data_path = None

        if csv_metadata_path is not None:
            self.metadata_lookup = self._parse_csv_metadata(csv_metadata_path)

        if self.num_threads is not None:
            os.environ["OMP_NUM_THREADS"] = str(self.num_threads)

    @staticmethod
    def _parse_csv_metadata(csv_path: Path) -> dict:
        """Parse CSV file and create a lookup dictionary with sha1 as key."""
        import csv
        metadata_lookup = {}

        with open(csv_path, 'r', encoding='utf-8') as csvfile:
            reader = csv.DictReader(csvfile)
            for row in reader:
                # Handle both old and new CSV formats for company name
                company_name = row.get('company_name', row.get('name', '')).strip('"')
                metadata_lookup[row['sha1']] = {
                    'company_name': company_name
                }
        return metadata_lookup

    def _create_document_converter(self) -> "DocumentConverter":  # type: ignore
        """Creates and returns a DocumentConverter with default pipeline options."""
        from docling.document_converter import DocumentConverter, FormatOption
        from docling.datamodel.pipeline_options import PdfPipelineOptions, TableFormerMode, EasyOcrOptions
        from docling.datamodel.base_models import InputFormat
        from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline

        pipeline_options = PdfPipelineOptions()
        pipeline_options.do_ocr = True
        ocr_options = EasyOcrOptions(lang=['en'], force_full_page_ocr=False)
        pipeline_options.ocr_options = ocr_options
        pipeline_options.do_table_structure = True
        pipeline_options.table_structure_options.do_cell_matching = True
        pipeline_options.table_structure_options.mode = TableFormerMode.ACCURATE

        format_options = {
            InputFormat.PDF: FormatOption(
                pipeline_cls=StandardPdfPipeline,
                pipeline_options=pipeline_options,
                backend=self.pdf_backend
            )
        }

        return DocumentConverter(format_options=format_options)

    def convert_documents(self, input_doc_paths: List[Path]) -> Iterable[ConversionResult]:
        conv_results = self.doc_converter.convert_all(source=input_doc_paths)
        return conv_results

    def process_documents(self, conv_results: Iterable[ConversionResult]):
        if self.output_dir is not None:
            self.output_dir.mkdir(parents=True, exist_ok=True)
        success_count = 0
        failure_count = 0

        for conv_res in conv_results:
            if conv_res.status == ConversionStatus.SUCCESS:
                success_count += 1
                processor = JsonReportProcessor(metadata_lookup=self.metadata_lookup, debug_data_path=self.debug_data_path)
                # Normalize the document data to ensure sequential pages
                data = conv_res.document.export_to_dict()
                normalized_data = self._normalize_page_sequence(data)

                processed_report = processor.assemble_report(conv_res, normalized_data)
                doc_filename = conv_res.input.file.stem
                if self.output_dir is not None:
                    with (self.output_dir / f"{doc_filename}.json").open("w", encoding="utf-8") as fp:
                        json.dump(processed_report, fp, indent=2, ensure_ascii=False)
            else:
                failure_count += 1
                _log.info(f"Document {conv_res.input.file} failed to convert.")

        _log.info(f"Processed {success_count + failure_count} docs, of which {failure_count} failed")
        return success_count, failure_count

    def _normalize_page_sequence(self, data: dict) -> dict:
        """Ensure that page numbers in content are sequential by filling gaps with empty pages."""
        if 'content' not in data:
            return data

        # Create a copy of the data to modify
        normalized_data = data.copy()

        # Get existing page numbers and find max page
        existing_pages = {page['page'] for page in data['content']}
        max_page = max(existing_pages)

        # Create template for empty page
        empty_page_template = {
            "content": [],
            "page_dimensions": {}  # or some default dimensions if needed
        }

        # Create new content array with all pages
        new_content = []
        for page_num in range(1, max_page + 1):
            # Find existing page or create empty one
            page_content = next(
                (page for page in data['content'] if page['page'] == page_num),
                {"page": page_num, **empty_page_template}
            )
            new_content.append(page_content)

        normalized_data['content'] = new_content
        return normalized_data

    def parse_and_export(self, input_doc_paths: List[Path] = None, doc_dir: Path = None):
        start_time = time.time()
        if input_doc_paths is None and doc_dir is not None:
            input_doc_paths = list(doc_dir.glob("*.pdf"))
        total_docs = len(input_doc_paths)
        _log.info(f"Starting to process {total_docs} documents")

        conv_results = self.convert_documents(input_doc_paths)
        success_count, failure_count = self.process_documents(conv_results=conv_results)
        elapsed_time = time.time() - start_time

        if failure_count > 0:
            error_message = f"Failed converting {failure_count} out of {total_docs} documents."
            failed_docs = "Paths of failed docs:\n" + '\n'.join(str(path) for path in input_doc_paths)
            _log.error(error_message)
            _log.error(failed_docs)
            raise RuntimeError(error_message)

        _log.info(f"{'#'*50}\nCompleted in {elapsed_time:.2f} seconds. Successfully converted {success_count}/{total_docs} documents.\n{'#'*50}")

    def parse_and_export_parallel(
        self,
        input_doc_paths: List[Path] = None,
        doc_dir: Path = None,
        optimal_workers: int = 10,
        chunk_size: int = None
    ):
        """Parse PDF files in parallel using multiple processes.

        Args:
            input_doc_paths: List of paths to PDF files to process
            doc_dir: Directory containing PDF files (used if input_doc_paths is None)
            optimal_workers: Number of worker processes to use. If None, uses CPU count.
        """
        import multiprocessing
        from concurrent.futures import ProcessPoolExecutor, as_completed

        # Get input paths if not provided
        if input_doc_paths is None and doc_dir is not None:
            input_doc_paths = list(doc_dir.glob("*.pdf"))

        total_pdfs = len(input_doc_paths)
        _log.info(f"Starting parallel processing of {total_pdfs} documents")

        cpu_count = multiprocessing.cpu_count()
        # Calculate optimal workers if not specified
        if optimal_workers is None:
            optimal_workers = min(cpu_count, total_pdfs)
        if chunk_size is None:
            # Calculate chunk size (ensure at least 1)
            chunk_size = max(1, total_pdfs // optimal_workers)

        # Split documents into chunks
        chunks = [
            input_doc_paths[i : i + chunk_size]
            for i in range(0, total_pdfs, chunk_size)
        ]

        start_time = time.time()
        processed_count = 0

        # Use ProcessPoolExecutor for parallel processing
        with ProcessPoolExecutor(max_workers=optimal_workers) as executor:
            # Schedule all tasks
            futures = [
                executor.submit(
                    _process_chunk,
                    chunk,
                    self.pdf_backend,
                    self.output_dir,
                    self.num_threads,
                    self.metadata_lookup,
                    self.debug_data_path
                )
                for chunk in chunks
            ]

            # Wait for completion and log results
            for future in as_completed(futures):
                try:
                    result = future.result()
                    processed_count += int(result.split()[1])  # Extract number from "Processed X PDFs"
                    _log.info(f"{'#'*50}\n{result} ({processed_count}/{total_pdfs} total)\n{'#'*50}")
                except Exception as e:
                    _log.error(f"Error processing chunk: {str(e)}")
                    raise

        elapsed_time = time.time() - start_time
        _log.info(f"Parallel processing completed in {elapsed_time:.2f} seconds.")


class JsonReportProcessor:
    def __init__(self, metadata_lookup: dict = None, debug_data_path: Path = None):
        self.metadata_lookup = metadata_lookup or {}
        self.debug_data_path = debug_data_path

    def assemble_report(self, conv_result, normalized_data=None):
        """Assemble the report using either normalized data or raw conversion result."""
        data = normalized_data if normalized_data is not None else conv_result.document.export_to_dict()
        assembled_report = {}
        assembled_report['metainfo'] = self.assemble_metainfo(data)
        assembled_report['content'] = self.assemble_content(data)
        assembled_report['tables'] = self.assemble_tables(conv_result.document.tables, data)
        assembled_report['pictures'] = self.assemble_pictures(data)
        self.debug_data(data)
        return assembled_report

    def assemble_metainfo(self, data):
        metainfo = {}
        sha1_name = data['origin']['filename'].rsplit('.', 1)[0]
        metainfo['sha1_name'] = sha1_name
        metainfo['pages_amount'] = len(data.get('pages', []))
        metainfo['text_blocks_amount'] = len(data.get('texts', []))
        metainfo['tables_amount'] = len(data.get('tables', []))
        metainfo['pictures_amount'] = len(data.get('pictures', []))
        metainfo['equations_amount'] = len(data.get('equations', []))
        metainfo['footnotes_amount'] = len([t for t in data.get('texts', []) if t.get('label') == 'footnote'])

        # Add CSV metadata if available
        if self.metadata_lookup and sha1_name in self.metadata_lookup:
            csv_meta = self.metadata_lookup[sha1_name]
            metainfo['company_name'] = csv_meta['company_name']
        return metainfo

    def process_table(self, table_data):
        # Implement your table processing logic here
        return 'processed_table_content'

    def debug_data(self, data):
        if self.debug_data_path is None:
            return
        doc_name = data['name']
        path = self.debug_data_path / f"{doc_name}.json"
        path.parent.mkdir(parents=True, exist_ok=True)

        with path.open("w", encoding="utf-8") as f:
            json.dump(data, f, indent=2, ensure_ascii=False)

    def expand_groups(self, body_children, groups):
        expanded_children = []

        for item in body_children:
            if isinstance(item, dict) and '$ref' in item:
                ref = item['$ref']
                ref_type, ref_num = ref.split('/')[-2:]
                ref_num = int(ref_num)

                if ref_type == 'groups':
                    group = groups[ref_num]
                    group_id = ref_num
                    group_name = group.get('name', '')
                    group_label = group.get('label', '')

                    for child in group['children']:
                        child_copy = child.copy()
                        child_copy['group_id'] = group_id
                        child_copy['group_name'] = group_name
                        child_copy['group_label'] = group_label
                        expanded_children.append(child_copy)
                else:
                    expanded_children.append(item)
            else:
                expanded_children.append(item)

        return expanded_children

    def _process_text_reference(self, ref_num, data):
        """Helper method to process text references and create content items.

        Args:
            ref_num (int): Reference number for the text item
            data (dict): Document data dictionary

        Returns:
            dict: Processed content item with text information
        """
        text_item = data['texts'][ref_num]
        item_type = text_item['label']
        content_item = {
            'text': text_item.get('text', ''),
            'type': item_type,
            'text_id': ref_num
        }

        # Add 'orig' field only if it differs from 'text'
        orig_content = text_item.get('orig', '')
        if orig_content != text_item.get('text', ''):
            content_item['orig'] = orig_content

        # Add additional fields if they exist
        if 'enumerated' in text_item:
            content_item['enumerated'] = text_item['enumerated']
        if 'marker' in text_item:
            content_item['marker'] = text_item['marker']

        return content_item

    def assemble_content(self, data):
        pages = {}

        # Expand body children to include group references
        body_children = data['body']['children']
        groups = data.get('groups', [])
        expanded_body_children = self.expand_groups(body_children, groups)

        # Process body content
        for item in expanded_body_children:
            if isinstance(item, dict) and '$ref' in item:
                ref = item['$ref']
                ref_type, ref_num = ref.split('/')[-2:]
                ref_num = int(ref_num)

                if ref_type == 'texts':
                    text_item = data['texts'][ref_num]
                    content_item = self._process_text_reference(ref_num, data)

                    # Add group information if available
                    if 'group_id' in item:
                        content_item['group_id'] = item['group_id']
                        content_item['group_name'] = item['group_name']
                        content_item['group_label'] = item['group_label']

                    # Get page number from prov
                    if 'prov' in text_item and text_item['prov']:
                        page_num = text_item['prov'][0]['page_no']

                        # Initialize page if not exists
                        if page_num not in pages:
                            pages[page_num] = {
                                'page': page_num,
                                'content': [],
                                'page_dimensions': text_item['prov'][0].get('bbox', {})
                            }
                        pages[page_num]['content'].append(content_item)

                elif ref_type == 'tables':
                    table_item = data['tables'][ref_num]
                    content_item = {
                        'type': 'table',
                        'table_id': ref_num
                    }
                    if 'prov' in table_item and table_item['prov']:
                        page_num = table_item['prov'][0]['page_no']
                        if page_num not in pages:
                            pages[page_num] = {
                                'page': page_num,
                                'content': [],
                                'page_dimensions': table_item['prov'][0].get('bbox', {})
                            }
                        pages[page_num]['content'].append(content_item)

                elif ref_type == 'pictures':
                    picture_item = data['pictures'][ref_num]
                    content_item = {
                        'type': 'picture',
                        'picture_id': ref_num
                    }
                    if 'prov' in picture_item and picture_item['prov']:
                        page_num = picture_item['prov'][0]['page_no']
                        if page_num not in pages:
                            pages[page_num] = {
                                'page': page_num,
                                'content': [],
                                'page_dimensions': picture_item['prov'][0].get('bbox', {})
                            }
                        pages[page_num]['content'].append(content_item)

        sorted_pages = [pages[page_num] for page_num in sorted(pages.keys())]
        return sorted_pages

    def assemble_tables(self, tables, data):
        assembled_tables = []
        for i, table in enumerate(tables):
            table_json_obj = table.model_dump()
            table_md = self._table_to_md(table_json_obj)
            table_html = table.export_to_html()

            table_data = data['tables'][i]
            table_page_num = table_data['prov'][0]['page_no']
            table_bbox = table_data['prov'][0]['bbox']
            table_bbox = [
                table_bbox['l'],
                table_bbox['t'],
                table_bbox['r'],
                table_bbox['b']
            ]

            # Get rows and columns from the table data structure
            nrows = table_data['data']['num_rows']
            ncols = table_data['data']['num_cols']

            ref_num = table_data['self_ref'].split('/')[-1]
            ref_num = int(ref_num)

            table_obj = {
                'table_id': ref_num,
                'page': table_page_num,
                'bbox': table_bbox,
                '#-rows': nrows,
                '#-cols': ncols,
                'markdown': table_md,
                'html': table_html,
                'json': table_json_obj
            }
            assembled_tables.append(table_obj)
        return assembled_tables

    def _table_to_md(self, table):
        # Extract text from grid cells
        table_data = []
        for row in table['data']['grid']:
            table_row = [cell['text'] for cell in row]
            table_data.append(table_row)

        # Check if the table has headers
        if len(table_data) > 1 and len(table_data[0]) > 0:
            try:
                md_table = tabulate(
                    table_data[1:], headers=table_data[0], tablefmt="github"
                )
            except ValueError:
                md_table = tabulate(
                    table_data[1:],
                    headers=table_data[0],
                    tablefmt="github",
                    disable_numparse=True,
                )
        else:
            md_table = tabulate(table_data, tablefmt="github")

        return md_table

    def assemble_pictures(self, data):
        assembled_pictures = []
        for i, picture in enumerate(data['pictures']):
            children_list = self._process_picture_block(picture, data)

            ref_num = picture['self_ref'].split('/')[-1]
            ref_num = int(ref_num)

            picture_page_num = picture['prov'][0]['page_no']
            picture_bbox = picture['prov'][0]['bbox']
            picture_bbox = [
                picture_bbox['l'],
                picture_bbox['t'],
                picture_bbox['r'],
                picture_bbox['b']
            ]

            picture_obj = {
                'picture_id': ref_num,
                'page': picture_page_num,
                'bbox': picture_bbox,
                'children': children_list,
            }
            assembled_pictures.append(picture_obj)
        return assembled_pictures

    def _process_picture_block(self, picture, data):
        children_list = []

        for item in picture['children']:
            if isinstance(item, dict) and '$ref' in item:
                ref = item['$ref']
                ref_type, ref_num = ref.split('/')[-2:]
                ref_num = int(ref_num)

                if ref_type == 'texts':
                    content_item = self._process_text_reference(ref_num, data)
                    children_list.append(content_item)

        return children_list
About the author: focused on applying AI technology in enterprise settings, with particular experience in RAG system design and document processing.
Discussion: feel free to share your own PDF-parsing experience and questions in the comments so we can explore best practices for document processing together.