The PDF file is about 98 MB and roughly 19k pages, with one 19-column table per page. It is text-based, not scanned images.
```python
from pathlib import Path
from typing import List, Tuple, Optional
import pandas as pd
from multiprocessing import Pool, cpu_count
import pdfplumber
import time
import PyPDF2
import os
import sys
import platform
import argparse
import logging
from logging.handlers import TimedRotatingFileHandler
# pip install pdfplumber==0.11.9 PyPDF2==3.0.1
# 配置日志
file_handler = TimedRotatingFileHandler(
"out_pdf.log",
when="midnight",
interval=1,
backupCount=7,
encoding="utf-8"
)
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(levelname)s - %(message)s',
datefmt='%Y-%m-%d %H:%M:%S',
handlers=[
file_handler,
logging.StreamHandler()
    ],
)
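# Logging goes to both the console and out_pdf.log; the file rotates at midnight and keeps the last 7 backups.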
logger = logging.getLogger(__name__)
# 定义输出表头(需根据实际情况修改)
OUTPUT_HEADER = [
"Column1", "Column2", "Column3", "Column4", "Column5",
"Column6", "Column7", "Column8", "Column9", "Column10",
"Column11", "Column12", "Column13", "Column14", "Column15",
"Column16", "Column17", "Column18", "Column19"
]
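# Note: the final cleaning step in process_large_pdf_optimized filters on the columns
# "药品代码", "注册名称" and "注册规格", so the real header must contain those names;
# with the placeholder Column1..Column19 names above that step would raise a KeyError.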
# 表格提取配置(针对带线条的标准表格)
TABLE_SETTINGS = {
"vertical_strategy": "lines", # 使用线条检测列
"horizontal_strategy": "lines", # 使用线条检测行
"intersection_tolerance": 5, # 增加容差以处理不完美的线条
"snap_tolerance": 5,
"join_tolerance": 5,
"edge_min_length": 3,
}
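# Optional debugging aid (a minimal sketch, not used by the script itself): when tuning
# TABLE_SETTINGS it helps to visualise what pdfplumber detects on one sample page with its
# built-in table debugger, e.g.:
#
#   with pdfplumber.open("sample_page.pdf") as pdf:   # "sample_page.pdf" is a hypothetical file
#       im = pdf.pages[0].to_image(resolution=150)
#       im.debug_tablefinder(TABLE_SETTINGS)
#       im.save("table_debug.png")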
# ==================== 第1步: PDF拆分工具 ====================
def split_large_pdf(input_pdf: str, output_dir: str, max_size_mb: int = 10):
"""
将大PDF文件拆分成多个小文件
参数:
input_pdf: 输入PDF路径
output_dir: 输出目录
max_size_mb: 每个文件最大大小(MB)
返回:
拆分后的文件列表
"""
logger.info("=" * 70)
logger.info(f"开始拆分PDF: {input_pdf}")
logger.info("=" * 70)
# 创建输出目录
output_path = Path(output_dir)
output_path.mkdir(parents=True, exist_ok=True)
# 获取文件信息
file_size_mb = os.path.getsize(input_pdf) / (1024 * 1024)
logger.info(f"原文件大小: {file_size_mb:.2f} MB")
with open(input_pdf, 'rb') as f:
pdf_reader = PyPDF2.PdfReader(f)
total_pages = len(pdf_reader.pages)
logger.info(f"总页数: {total_pages}")
# 估算每个小文件的页数
pages_per_file = int(total_pages * (max_size_mb / file_size_mb))
pages_per_file = max(100, pages_per_file) # 至少100页
logger.info(f"预计每个文件: {pages_per_file} 页")
# 拆分PDF
pdf_reader = PyPDF2.PdfReader(input_pdf)
split_files = []
file_num = 1
current_page = 0
while current_page < total_pages:
pdf_writer = PyPDF2.PdfWriter()
end_page = min(current_page + pages_per_file, total_pages)
for page_idx in range(current_page, end_page):
pdf_writer.add_page(pdf_reader.pages[page_idx])
output_file = output_path / f"part_{file_num:03d}.pdf"
with open(output_file, "wb") as output:
pdf_writer.write(output)
file_size = os.path.getsize(output_file) / (1024 * 1024)
logger.info(
f"已生成: {output_file.name} (页数: {current_page+1}-{end_page}, 大小: {file_size:.2f} MB)"
)
split_files.append(str(output_file))
current_page = end_page
file_num += 1
logger.info(f"拆分完成!共生成 {len(split_files)} 个文件")
return split_files
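# The parts are written as part_001.pdf, part_002.pdf, ... inside output_dir; step 1 of the
# main flow reuses them (in sorted name order) on a re-run instead of splitting again.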
# ==================== 第2步: 提取表格数据 ====================
def clean_cell_text(text: Optional[str]) -> str:
"""
清洗单元格文本
参数:
text: 原始文本
返回:
清洗后的文本
"""
if text is None or text == "":
return ""
# 转为字符串并清理
text = str(text).strip()
    # 移除所有空白字符(包括换行、制表符),避免单元格跨行产生碎片
    text = ''.join(text.split())
return text
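# Example with a hypothetical cell value: clean_cell_text("AB 01\n23 ") returns "AB0123".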
def validate_data_row(row: List[str], min_first_col_length: int = 8) -> bool:
"""
验证数据行是否有效
参数:
row: 数据行
min_first_col_length: 第一列最小长度
返回:
是否为有效数据行
"""
if not row or not any(row):
return False
# 检查第一列
first_cell = clean_cell_text(row[0])
# 第一列应该至少有指定长度且以字母开头
if len(first_cell) < min_first_col_length:
return False
# 检查前两个字符是否为字母
if not first_cell[:2].isalpha():
return False
return True
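# Hypothetical examples: "AB01230005" (10 characters, starts with two letters) would pass,
# while "123456789" (digits first) or a short code like "AB12" would be rejected.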
def extract_tables_from_page_range(args: Tuple[str, int, int]) -> Tuple[List[List[str]], int, int]:
"""
使用pdfplumber从指定页面范围提取表格数据
参数:
args: (pdf_path, start_page, end_page)
返回:
(提取的数据行列表, 成功页数, 失败页数)
"""
pdf_path, start_page, end_page = args
all_data = []
success_pages = 0
failed_pages = 0
try:
with pdfplumber.open(pdf_path) as pdf:
for page_num in range(start_page - 1, end_page):
if page_num >= len(pdf.pages):
break
current_page = page_num + 1 # 显示用的页码(从1开始)
page = pdf.pages[page_num]
page_extracted = False
try:
# 提取表格
tables = page.extract_tables(table_settings=TABLE_SETTINGS)
if not tables:
logger.warning(f" 页面 {current_page}: 未检测到表格")
failed_pages += 1
continue
# 处理第一个表格(假设每页只有一个主表格)
table = tables[0]
if not table or len(table) <= 2: # 至少要有表头+数据
logger.warning(f" 页面 {current_page}: 表格为空或只有表头")
failed_pages += 1
continue
# 跳过前2行表头
data_rows = table[2:]
# 提取有效数据行
valid_rows = 0
for row in data_rows:
# 验证数据行
if not validate_data_row(row):
continue
# 清洗数据
cleaned_row = [clean_cell_text(cell) for cell in row[:len(OUTPUT_HEADER)]]
# 补齐到目标列数
if len(cleaned_row) < len(OUTPUT_HEADER):
cleaned_row.extend([""] * (len(OUTPUT_HEADER) - len(cleaned_row)))
else:
cleaned_row = cleaned_row[:len(OUTPUT_HEADER)]
all_data.append(cleaned_row)
valid_rows += 1
page_extracted = True
if page_extracted:
success_pages += 1
if valid_rows > 0:
logger.debug(f" 页面 {current_page}: 提取 {valid_rows} 行")
else:
failed_pages += 1
logger.warning(f" 页面 {current_page}: 无有效数据")
except Exception as e:
logger.error(f" 页面 {current_page} 提取失败: {str(e)}")
failed_pages += 1
continue
logger.debug(f" 批次 {start_page}-{end_page}: 成功 {success_pages} 页, 失败 {failed_pages} 页, 提取 {len(all_data)} 行")
except Exception as e:
logger.error(f" 批次 {start_page}-{end_page} 处理出错: {str(e)}")
return all_data, success_pages, failed_pages
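# Each worker re-opens the PDF itself from the (pdf_path, start_page, end_page) tuple it
# receives; pdfplumber page objects are not shared across processes, so only the path and
# page range are sent to the worker pool.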
def process_single_pdf_file(
pdf_file: str,
result_dir: str,
    num_processes: Optional[int] = None,
batch_size: int = 30,
save_checkpoint: bool = False,
checkpoint_interval: int = 10
) -> Tuple[List[List[str]], int, str]:
"""
处理单个PDF文件并保存结果
参数:
pdf_file: PDF文件路径
result_dir: 结果保存目录
num_processes: 进程数,默认为CPU核心数-1
batch_size: 每批处理的页数
save_checkpoint: 是否保存检查点
checkpoint_interval: 每处理多少批次保存一次检查点
返回:
(所有数据行, 成功页数, 中间结果文件路径)
"""
if num_processes is None:
num_processes = max(1, cpu_count() - 1)
pdf_path = Path(pdf_file)
if not pdf_path.exists():
raise FileNotFoundError(f"PDF文件不存在: {pdf_file}")
# 创建结果目录
result_path = Path(result_dir)
result_path.mkdir(parents=True, exist_ok=True)
logger.info(f"开始处理: {pdf_path.name}")
logger.info(f"{'='*60}")
# 获取总页数
try:
with pdfplumber.open(pdf_file) as pdf:
total_pages = len(pdf.pages)
except Exception as e:
logger.error(f"无法打开PDF: {str(e)}")
return [], 0, ""
logger.info(f"总页数: {total_pages}")
logger.info(f"并行进程: {num_processes}")
logger.info(f"批次大小: {batch_size} 页/批")
# 分批处理
page_ranges = []
for start_page in range(1, total_pages + 1, batch_size):
end_page = min(start_page + batch_size - 1, total_pages)
page_ranges.append((pdf_file, start_page, end_page))
logger.info(f"分批数量: {len(page_ranges)} 批")
# 并行处理
all_data = []
total_success = 0 # 总成功页数
total_failed = 0 # 总失败页数
try:
with Pool(processes=num_processes) as pool:
            # imap 按批流式返回结果,这样检查点才能在处理过程中真正落盘
            results = pool.imap(extract_tables_from_page_range, page_ranges)
for batch_idx, (batch_data, success, failed) in enumerate(results, 1):
all_data.extend(batch_data)
total_success += success
total_failed += failed
# 定期保存检查点
if save_checkpoint and batch_idx % checkpoint_interval == 0:
checkpoint_file = result_path / f"{pdf_path.stem}_checkpoint_{batch_idx}.csv"
try:
df_checkpoint = pd.DataFrame(all_data, columns=OUTPUT_HEADER)
df_checkpoint.to_csv(checkpoint_file, index=False, encoding='utf-8-sig')
logger.info(f" 检查点已保存: {checkpoint_file.name}")
except Exception as e:
logger.warning(f" 检查点保存失败: {str(e)}")
except Exception as e:
logger.error(f"并行处理失败: {str(e)}")
return all_data, total_success, ""
# 统计信息
logger.info(f"处理完成统计:")
logger.info(f" 成功页数: {total_success}/{total_pages} ({total_success/total_pages*100:.1f}%)")
logger.info(f" 失败页数: {total_failed}/{total_pages}")
logger.info(f" 提取行数: {len(all_data)}")
# logger.info(f"{'='*60}")
# 保存最终结果
if all_data:
result_csv = result_path / f"{pdf_path.stem}_result.csv"
try:
df = pd.DataFrame(all_data, columns=OUTPUT_HEADER)
df.to_csv(result_csv, index=False, encoding='utf-8-sig')
logger.info(f"✓ 结果已保存: {result_csv}, 行数: {len(df)}")
return all_data, total_success, str(result_csv)
except Exception as e:
logger.error(f"保存CSV失败: {str(e)}")
return all_data, total_success, ""
else:
logger.warning("未提取到任何数据!")
return [], 0, ""
# ==================== 第3步: 主处理流程 ====================
def process_large_pdf_optimized(
input_pdf: str,
output_dir: str,
temp_dir: str = "./temp_split",
result_dir: str = "./result",
split_size_mb: int = 10,
    num_processes: Optional[int] = None,
keep_temp: bool = False,
):
"""
处理大PDF文件
参数:
input_pdf: 输入PDF文件
output_dir: 输出目录(不是文件路径)
temp_dir: 临时拆分目录
result_dir: 中间结果保存目录
split_size_mb: 拆分文件大小(MB)
        num_processes: 进程数(None = CPU核心数-2)
keep_temp: 是否保留临时文件
"""
logger.info(f"temp_dir is {temp_dir}, result dir is {result_dir}")
total_start = time.time()
# 创建输出目录
output_path = Path(output_dir)
output_path.mkdir(parents=True, exist_ok=True)
# 根据输入文件名生成输出文件名
input_filename = Path(input_pdf).stem
output_csv = output_path / f"{input_filename}.csv"
# 检测系统信息
system = platform.system()
cpu_cores = cpu_count()
if num_processes is None:
num_processes = max(1, cpu_cores - 2)
logger.info("=" * 70)
logger.info("进行PDF表格提取")
logger.info("=" * 70)
logger.info("系统信息:")
logger.info(f" 操作系统: {system}, CPU核心数: {cpu_cores}, 使用进程数: {num_processes}")
logger.info(f" 输入文件: {input_pdf}")
logger.info(f" 输出目录: {output_dir}")
logger.info(f" 中间结果目录: {result_dir}")
logger.info(f" 最终输出文件: {output_csv}")
logger.info("=" * 70)
# 步骤1: 拆分PDF
logger.info("【步骤1/3】拆分PDF文件...")
split_start = time.time()
    # 若临时目录中已有拆分好的PDF则直接复用,否则重新拆分
    split_files = []
    os.makedirs(temp_dir, exist_ok=True)
    for filename in sorted(os.listdir(temp_dir)):
        if filename.lower().endswith('.pdf'):
            logger.info(f"  复用{temp_dir}下已有的{filename}")
            split_files.append(os.path.join(temp_dir, filename))
    if len(split_files) == 0:
        split_files = split_large_pdf(input_pdf, temp_dir, split_size_mb)
split_time = time.time() - split_start
logger.info(f"拆分完成,耗时: {split_time:.2f}秒")
# 步骤2: 处理所有小文件
logger.info(f"【步骤2/3】处理 {len(split_files)} 个文件...")
process_start = time.time()
all_data = []
intermediate_files = []
for i, pdf_file in enumerate(split_files, 1):
file_start = time.time()
logger.info(f"{'='*70}")
logger.info(f"【处理文件 {i}/{len(split_files)}】: {Path(pdf_file).name}")
file_data, success_pages, intermediate_file = process_single_pdf_file(pdf_file, result_dir, num_processes)
all_data.extend(file_data)
if intermediate_file:
intermediate_files.append(intermediate_file)
file_time = time.time() - file_start
logger.info(f" ✓ 提取 {len(file_data)} 行({success_pages} 页),耗时 {file_time:.2f}秒")
logger.info(f" ✓ 累计: {len(all_data)} 行\n")
process_time = time.time() - process_start
logger.info("所有文件处理完成!")
logger.info(f" 总提取行数: {len(all_data)}")
logger.info(f" 处理耗时: {process_time:.2f}秒")
logger.info(f" 中间结果文件数: {len(intermediate_files)}")
if len(all_data) == 0:
logger.warning("⚠️ 警告: 未提取到任何数据!")
return None
# 步骤3: 数据清洗和保存
logger.info("【步骤3/3】数据清洗和保存...")
clean_start = time.time()
df = pd.DataFrame(all_data, columns=OUTPUT_HEADER)
# 快速清洗
logger.info(f"原始数据: {len(df)} 行")
# df = df[
# (df["药品代码"].str.len() > 10) &
# (df["药品代码"].str[:2].str.isalpha())
# ]
df = df[df["药品代码"].notna() & (df["药品代码"] != "")]
df = df[df["注册名称"].notna() & (df["注册名称"] != "")]
df = df.drop_duplicates(subset=["药品代码", "注册名称", "注册规格"])
df = df.reset_index(drop=True)
logger.info(f"清洗后: {len(df)} 行")
# 保存最终结果
logger.info(f"保存最终结果到:")
logger.info(f" CSV: {output_csv}")
    df.to_csv(output_csv, index=False, encoding='utf-8-sig')
total_time = time.time() - total_start
clean_time = time.time() - clean_start
# 清理临时文件
if not keep_temp:
logger.info("清理临时拆分文件...")
import shutil
shutil.rmtree(temp_dir)
logger.info("临时目录已删除")
else:
logger.info(f"临时文件保留在: {temp_dir}")
# 总结输出
logger.info("=" * 70)
logger.info("✓ 处理完成!")
logger.info("=" * 70)
logger.info(f"输入文件: {input_pdf}")
logger.info(f"完成时间: {time.strftime('%Y-%m-%d %H:%M:%S')}")
logger.info(f"总耗时: {total_time:.2f}秒")
logger.info(f" - 拆分PDF: {split_time:.2f}秒 ({split_time/total_time*100:.1f}%)")
logger.info(f" - 提取数据: {process_time:.2f}秒 ({process_time/total_time*100:.1f}%)")
logger.info(f" - 清洗保存: {clean_time:.2f}秒 ({clean_time/total_time*100:.1f}%)")
logger.info(f"最终数据: {df.shape}")
logger.info(f"输出文件:")
logger.info(f" - CSV: {output_csv}")
logger.info(f" - 中间结果: {result_dir}")
logger.info(f" 原始行数: {len(all_data)}")
logger.info(f" 清洗后行数: {len(df)}")
logger.info("=" * 70)
return df
def get_optimal_settings(cpu_cores=16):
"""
根据系统自动优化设置
"""
system = platform.system()
settings = {"num_processes": cpu_cores, "split_size_mb": 10}
if system == "Linux":
settings["num_processes"] = min(32, max(6, cpu_cores - 1))
settings["split_size_mb"] = 12
elif system == "Windows":
settings["num_processes"] = max(1, cpu_cores - 1)
settings["split_size_mb"] = 8
elif system == "Darwin": # macOS
settings["num_processes"] = max(1, cpu_cores - 1)
settings["split_size_mb"] = 10
return settings
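# For example, on a 16-core Linux machine this returns
# {"num_processes": 15, "split_size_mb": 12}, since min(32, max(6, 16 - 1)) = 15.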
# ==================== 使用示例 ====================
"""
python pdf_extract.py -i /path/to/data/drug_category_code_data_20251212.pdf -o /path/to/data/result -c 16 --keep-temp
"""
if __name__ == "__main__":
parser = argparse.ArgumentParser(description='将大型PDF表格转换为CSV')
parser.add_argument('-i', '--input', required=True, help='输入的PDF文件路径')
parser.add_argument('-o', '--output', required=True, help='输出目录路径(不是文件)')
parser.add_argument('-c', '--cores', type=int, default=16,
help='使用的CPU核心数 (默认: 16)', metavar='N')
parser.add_argument('--temp-dir', default='./temp_split',
help='临时拆分文件目录 (默认: ./temp_split)')
parser.add_argument('--keep-temp', action='store_true',
help='保留临时拆分文件')
args = parser.parse_args()
if not os.path.isfile(args.input):
print(f"错误: 输入文件 '{args.input}' 不存在")
sys.exit(1)
# 自动优化设置
settings = get_optimal_settings(args.cores)
logger.info(f"当前系统: {platform.system()}")
logger.info(f"优化设置: {settings}")
# 开始处理
df = process_large_pdf_optimized(
input_pdf=args.input,
output_dir=args.output,
temp_dir=args.temp_dir,
result_dir=os.path.join(args.output, "middle"),
split_size_mb=settings["split_size_mb"],
num_processes=settings["num_processes"],
keep_temp=args.keep_temp,
)
if df is not None:
logger.info(" ✓ 全部完成!")
else:
logger.info("⚠️ 提取失败,请检查PDF格式")