本文介绍如何使用 Python 将 PDF 中的表格提取到 Excel 中,包括三个步骤:拆分 PDF、将 PDF 中的表格提取到 Excel、合并多个 Excel 文件。
一、拆分pdf
将一个大的 PDF 按指定的份数拆分为多个小的 PDF(每份页数相同,除不尽的剩余页归入最后一份):
py
# pip install PyPDF2
import os, pdfplumber, PyPDF2
# 分割pdf
def split_pdf(input_pdf_path, num_splits):
    """Split one PDF into ``num_splits`` consecutive parts saved next to it.

    Every part receives ``total_pages // num_splits`` pages; the last part
    additionally takes all remaining pages. When ``num_splits`` is larger
    than the page count, that quotient is 0, so every part except the last
    one is written without pages. Parts are stored in the directory of
    ``input_pdf_path`` as ``<base_name>_part_<i>.pdf`` with ``i`` starting
    at 0.

    Args:
        input_pdf_path: Path of the PDF file to split.
        num_splits: Number of output files to produce; must be at least 1.

    Raises:
        ValueError: If ``num_splits`` is smaller than 1.
    """
    # Reject bad counts before touching the file system: 0 would otherwise
    # surface as a ZeroDivisionError and negative values would write nothing
    # while still reporting success.
    if num_splits < 1:
        raise ValueError(f"num_splits must be at least 1, got {num_splits}")

    # Output files are placed beside the input and named after it.
    base_dir = os.path.dirname(input_pdf_path)
    base_name = os.path.splitext(os.path.basename(input_pdf_path))[0]

    # The reader's pages are accessed while the parts are being written, so
    # the source file stays open for the whole loop and is closed afterwards.
    with open(input_pdf_path, 'rb') as input_pdf:
        pdf_reader = PyPDF2.PdfReader(input_pdf)
        total_pages = len(pdf_reader.pages)
        pages_per_split = total_pages // num_splits

        for i in range(num_splits):
            start_page = i * pages_per_split
            if i == num_splits - 1:
                # The last part absorbs the pages left over by the division.
                end_page = total_pages
            else:
                end_page = start_page + pages_per_split

            pdf_writer = PyPDF2.PdfWriter()
            for page in range(start_page, end_page):
                pdf_writer.add_page(pdf_reader.pages[page])

            output_pdf_path = os.path.join(base_dir, f"{base_name}_part_{i}.pdf")
            with open(output_pdf_path, 'wb') as output_pdf:
                pdf_writer.write(output_pdf)

    print(f"PDF split into {num_splits} parts successfully.")
二、提取pdf中的表格到excel中
将 PDF 中的表格提取到 Excel 表格中(传入的文件名不带 `.pdf` 后缀,结果保存为同名的 `.xlsx` 文件):
py
# pip install pdfplumber
# pip install openpyxl
import os, pdfplumber
from openpyxl import Workbook, load_workbook
# pdf转为xlsx
def analysis_table(pdf_file_name):
    """Copy every table row found in a PDF into a single-sheet workbook.

    ``pdf_file_name`` is given without an extension: ``<pdf_file_name>.pdf``
    is read and ``<pdf_file_name>.xlsx`` is written. The rows of all tables
    on all pages are appended to the workbook's active sheet in the order
    they are extracted, and a progress line is printed after each page.

    Args:
        pdf_file_name: Path of the PDF file without the ``.pdf`` suffix.
    """
    source_path = pdf_file_name + '.pdf'
    target_path = pdf_file_name + ".xlsx"

    book = Workbook()
    worksheet = book.active

    with pdfplumber.open(source_path) as document:
        # Page numbers in the progress output are 1-based.
        for page_no, pdf_page in enumerate(document.pages, start=1):
            for extracted in pdf_page.extract_tables():
                for cells in extracted:
                    worksheet.append(cells)
            print("process page: %d" % page_no)

    book.save(filename=target_path)
三、合并多个excel表格到一个excel中
py
# pip install pdfplumber
# pip install openpyxl
import os, pdfplumber
from openpyxl import Workbook, load_workbook
# 合并excel文件
def merge_excels(files, output_name):
    """Append the rows of several workbooks to the first one and save it.

    Each entry of ``files`` and ``output_name`` is a path without the
    ``.xlsx`` suffix. The first workbook is loaded as the base; for every
    further workbook, the cell values of its active sheet are appended row
    by row to the base workbook's active sheet. Only values are copied
    (``values_only=True``). The result is saved as ``<output_name>.xlsx``.

    Args:
        files: Sequence of workbook paths without the ``.xlsx`` suffix; must
            contain at least one entry.
        output_name: Path of the merged workbook without the ``.xlsx`` suffix.

    Raises:
        ValueError: If ``files`` is empty.
    """
    # Fail with a clear message instead of an IndexError from files[0].
    if not files:
        raise ValueError("files must contain at least one workbook name")

    # The first workbook is the merge target.
    merged_wb = load_workbook(files[0] + '.xlsx')
    merged_sheet = merged_wb.active

    for file in files[1:]:
        source_sheet = load_workbook(file + '.xlsx').active
        for row in source_sheet.iter_rows(values_only=True):
            merged_sheet.append(row)

    # Save once, after every source workbook has been appended.
    merged_wb.save(output_name + '.xlsx')
四、完整代码
完整代码在:使用python实现pdf表格转为excel表格。