我发现 pypdf 和 pypdf2 的作者是同一人:Mathieu Fenniak
pip install pypdf2
pypdf2-3.0.1-py3-none-any.whl (232 kB)
编写 pdf_read_dir.py 如下
python
# -*- coding: utf-8 -*-
""" pypdf2==3.0.1 从PDF中提取目录 """
import os
import sys
from PyPDF2 import PdfReader
# Format of each bookmark (outline) entry:
# {'/Title': 'bookmark title', '/Page': 'target page number', '/Type': 'type'}
# Count how many times the given character occurs
def find_char(str1: str, char: str) -> int:
    """Return how many characters of ``str1`` are equal to ``char``.

    The comparison is done one character at a time, so a ``char`` that is
    empty or longer than one character never matches and the result is 0.

    Args:
        str1: The text to scan.
        char: The single character to count.

    Returns:
        The number of positions in ``str1`` whose character equals ``char``.
    """
    return sum(1 for c in str1 if c == char)
# Accumulated directory text; bookmark_listhandler() appends to it.
directory_str = ''

# First characters that mark a Chinese-numbered sub-heading.
_CN_NUMERALS = ('一', '二', '三', '四', '五', '六', '七', '八', '九', '十')


def _format_outline_title(title):
    """Return the directory text for one stripped, non-empty bookmark title."""
    heading = '\n' + title + '\n'
    if title.startswith("Chapter"):
        return heading
    if title[0:2] in ("序章", "前言") or title.startswith("序"):
        return heading
    if title.startswith("第"):
        # "第…节" (section) is indented; "第…章" and every other "第…" title
        # is emitted as a top-level heading.
        if title.split()[0][-1] == "节":
            return ' ' + title + '\n'
        return heading
    if title[0] in _CN_NUMERALS:
        return ' ' + title + '\n'
    if title[0] in "1234567890":
        # Numbered titles such as "1.2.3": one space of indent per dot.
        return ' ' * title.count('.') + title + '\n'
    return ' ' + title + '\n'


def bookmark_listhandler(list):  # parameter name kept for compatibility
    """Walk a nested outline and append formatted titles to ``directory_str``.

    Each element of ``list`` is either a dict-like bookmark carrying a
    ``'/Title'`` key, or a nested list of child bookmarks, which is handled
    recursively. Titles are stripped of surrounding whitespace; bookmarks
    whose title is empty after stripping are skipped (previously they raised
    ``IndexError`` on ``title[0]``).

    Args:
        list: Nested list of bookmark entries (e.g. ``PdfReader.outline``).

    Returns:
        None. The result is accumulated in the module-level ``directory_str``.
    """
    global directory_str
    for message in list:
        if not isinstance(message, dict):
            # Not a bookmark itself: treat it as a nested list of children.
            bookmark_listhandler(message)
            continue
        title = message['/Title'].strip()
        if not title:
            # Nothing to show for an untitled bookmark.
            continue
        directory_str += _format_outline_title(title)
# main(): command-line entry point; expects exactly one argument, the PDF path.
if len(sys.argv) == 2:
    file1 = sys.argv[1]
else:
    print('usage: python pdf_read_dir.py file.pdf')
    sys.exit(1)

if not os.path.exists(file1):
    print(f"{file1} is not exists.")
    sys.exit(2)

fn, ext = os.path.splitext(file1)
if ext.lower() != '.pdf':
    print("Please specify a valid pdf file")
    sys.exit(3)

with open(file1, 'rb') as f1:
    pdf = PdfReader(f1)
    # Retrieve the document outline (bookmarks); the returned object is a
    # nested list. Done while the file is still open.
    bookmark_listhandler(pdf.outline)

if directory_str:
    # The header line is the PDF's base name without directory or extension.
    # os.path.basename follows the platform's path rules, whereas splitting
    # on '\\' only stripped the directory from backslash-separated paths.
    fname = os.path.basename(fn)
    # The text file is written next to the PDF, same name with ".txt".
    file2 = fn + '.txt'
    with open(file2, 'w', encoding='utf-8') as fp:
        fp.write(fname + '\n')
        fp.write(directory_str)
else:
    print("it no directory.")
运行 python pdf_read_dir.py your_ebook.pdf
生成 your_ebook.txt
由于算法上的差距,本脚本生成结果的正确性始终比不上用 Java 的 PDFBox 读取 PDF 文件内书签所得到的结果。