1.背景
很多时候电子书pdf没有书签目录,阅读起来不方便,于是给它自动加个目录吧
2.步骤
步骤一:使用 DeepSeek(下文简称 ds)获取目录 JSON
将 PDF 的目录页截图并上传到 ds 中,然后输入如下提示词:
根据目录的图片,提取出如下格式的json目录数据: { "title": "第一章 概述", "page": 6, "childBookmark": [ { "title": "1.1.推荐系统简介", "page": 7, "childBookmark":[] }, { "title": "1.2.推荐系统XXX", "page": 9 } ] } 所有目录都要递归的加入到子目录中,直到最后一级

提取结果如下:
[
{
"title": "第1章 概述",
"page": 1,
"childBookmark": [
{
"title": "1.1 推荐系统简介",
"page": 1,
"childBookmark": [
{ "title": "1.1.1 信息超载", "page": 1, "childBookmark": [] },
{ "title": "1.1.2 长尾效应", "page": 2, "childBookmark": [] },
{ "title": "1.1.3 推荐系统的价值", "page": 3, "childBookmark": [] }
]
},
{
"title": "1.2 推荐系统的发展历史",
"page": 4,
"childBookmark": [
{ "title": "1.2.1 典型应用场景", "page": 4, "childBookmark": [] },
{ "title": "1.2.2 电商", "page": 5, "childBookmark": [] },
{ "title": "1.2.3 新闻", "page": 6, "childBookmark": [] },
{ "title": "1.2.4 音乐", "page": 7, "childBookmark": [] }
]
},
{
"title": "1.3 推荐系统框架",
"page": 8,
"childBookmark": [
{ "title": "1.3.1 用户画像", "page": 8, "childBookmark": [] },
{ "title": "1.3.2 项目画像", "page": 9, "childBookmark": [] }
]
},
{
"title": "1.4 推荐算法分类",
"page": 10,
"childBookmark": [
{ "title": "1.4.1 基于算法思想的分类", "page": 10, "childBookmark": [] },
{ "title": "1.4.2 基于应用问题的分类", "page": 11, "childBookmark": [] }
]
}
]
},
{
"title": "第2章 基于邻域的协同过滤",
"page": 16,
"childBookmark": [
{
"title": "2.1 协同过滤简介",
"page": 16,
"childBookmark": [
{ "title": "2.1.1 基本思想", "page": 16, "childBookmark": [] },
{ "title": "2.1.2 算法分类", "page": 17, "childBookmark": [] },
{ "title": "2.1.3 一般流程", "page": 17, "childBookmark": [] }
]
},
{
"title": "2.2 基于用户的协同过滤",
"page": 20,
"childBookmark": [
{ "title": "2.2.1 Top-N 推荐", "page": 20, "childBookmark": [] },
{ "title": "2.2.2 评分预测", "page": 23, "childBookmark": [] }
]
},
{
"title": "2.3 基于项目的协同过滤",
"page": 26,
"childBookmark": [
{ "title": "2.3.1 Top-N 推荐", "page": 26, "childBookmark": [] },
{ "title": "2.3.2 评分预测", "page": 29, "childBookmark": [] }
]
},
{ "title": "2.4 基于距离的相似度度量", "page": 30, "childBookmark": [] },
{ "title": "2.5 邻域的选取", "page": 32, "childBookmark": [] },
{ "title": "2.6 Slope One 算法", "page": 33, "childBookmark": [] },
{
"title": "2.7 基于二部图的协同过滤",
"page": 34,
"childBookmark": [
{ "title": "2.7.1 激活扩散模型", "page": 35, "childBookmark": [] },
{ "title": "2.7.2 物质扩散模型", "page": 37, "childBookmark": [] },
{ "title": "2.7.3 热传导模型", "page": 39, "childBookmark": [] },
{ "title": "2.7.4 基于图扩散的推荐系统", "page": 42, "childBookmark": [] }
]
},
{ "title": "习题", "page": 43, "childBookmark": [] }
]
},
{
"title": "第3章 基于模型的协同过滤",
"page": 44,
"childBookmark": [
{
"title": "3.1 基于关联规则的协同过滤",
"page": 44,
"childBookmark": [
{ "title": "3.1.1 基本概念", "page": 45, "childBookmark": [] },
{ "title": "3.1.2 关联规则度量", "page": 45, "childBookmark": [] },
{ "title": "3.1.3 Apriori 关联规则挖掘算法", "page": 47, "childBookmark": [] },
{ "title": "3.1.4 关联规则的相关分析", "page": 48, "childBookmark": [] },
{ "title": "3.1.5 基于关联规则的推荐系统", "page": 49, "childBookmark": [] }
]
},
{
"title": "3.2 基于矩阵分解的评分预测",
"page": 50,
"childBookmark": [
{ "title": "3.2.1 奇异值分解", "page": 50, "childBookmark": [] },
{ "title": "3.2.2 隐语义模型", "page": 52, "childBookmark": [] },
{ "title": "3.2.3 概率矩阵分解", "page": 57, "childBookmark": [] },
{ "title": "3.2.4 SVD++模型", "page": 61, "childBookmark": [] }
]
},
{
"title": "3.3 基于矩阵分解的 Top-N 推荐",
"page": 62,
"childBookmark": [
{ "title": "3.3.1 基于正样本过采样的矩阵分解", "page": 62, "childBookmark": [] },
{ "title": "3.3.2 基于负样本欠采样的矩阵分解", "page": 63, "childBookmark": [] }
]
},
{ "title": "习题", "page": 64, "childBookmark": [] }
]
},
{
"title": "第4章 基于内容和知识的推荐",
"page": 65,
"childBookmark": [
{ "title": "4.1 基于内容的推荐系统框架", "page": 65, "childBookmark": [] },
{
"title": "4.2 基于词向量空间模型的文本表示",
"page": 67,
"childBookmark": [
{ "title": "4.2.1 词袋模型", "page": 67, "childBookmark": [] },
{ "title": "4.2.2 TF-IDF 模型", "page": 68, "childBookmark": [] },
{ "title": "4.2.3 模型改进", "page": 71, "childBookmark": [] },
{ "title": "4.2.4 向量相似度度量", "page": 71, "childBookmark": [] }
]
},
{
"title": "4.3 基于语义的内容相似度",
"page": 72,
"childBookmark": [
{ "title": "4.3.1 基于本体的文本相似度", "page": 72, "childBookmark": [] },
{ "title": "4.3.2 基于网络知识的文本相似度", "page": 73, "childBookmark": [] },
{ "title": "4.3.3 基于语料库的文本相似度", "page": 75, "childBookmark": [] }
]
},
{
"title": "4.4 基于知识的推荐",
"page": 77,
"childBookmark": [
{ "title": "4.4.1 基于约束的推荐", "page": 78, "childBookmark": [] },
{ "title": "4.4.2 基于效用的推荐", "page": 81, "childBookmark": [] },
{ "title": "4.4.3 基于实例的推荐", "page": 82, "childBookmark": [] },
{ "title": "4.4.4 基于知识库的推荐", "page": 84, "childBookmark": [] }
]
},
{ "title": "习题", "page": 86, "childBookmark": [] }
]
},
{
"title": "第5章 混合推荐系统",
"page": 87,
"childBookmark": [
{ "title": "5.1 混合推荐实例——Netflix 百万美金公开赛", "page": 87, "childBookmark": [] },
{
"title": "5.2 混合/组合推荐的动机",
"page": 89,
"childBookmark": [
{ "title": "5.2.1 实践经验", "page": 89, "childBookmark": [] },
{ "title": "5.2.2 理论依据", "page": 90, "childBookmark": [] }
]
},
{
"title": "5.3 混合/组合方法分类",
"page": 92,
"childBookmark": [
{ "title": "5.3.1 有监督组合和无监督组合", "page": 92, "childBookmark": [] },
{ "title": "5.3.2 基推荐器间依赖关系", "page": 93, "childBookmark": [] }
]
},
{
"title": "5.4 并行式混合推荐",
"page": 94,
"childBookmark": [
{ "title": "5.4.1 加权式混合", "page": 94, "childBookmark": [] },
{ "title": "5.4.2 切换式混合", "page": 95, "childBookmark": [] },
{ "title": "5.4.3 排序混合", "page": 96, "childBookmark": [] }
]
},
{
"title": "5.5 串行式混合推荐",
"page": 97,
"childBookmark": [
{ "title": "5.5.1 级联过滤", "page": 97, "childBookmark": [] },
{ "title": "5.5.2 级联学习", "page": 98, "childBookmark": [] }
]
},
{
"title": "5.6 整体式混合推荐",
"page": 99,
"childBookmark": [
{ "title": "5.6.1 特征组合", "page": 99, "childBookmark": [] },
{ "title": "5.6.2 特征扩充", "page": 100, "childBookmark": [] },
{ "title": "5.6.3 基于图模型的混合", "page": 101, "childBookmark": [] }
]
},
{ "title": "习题", "page": 102, "childBookmark": [] }
]
},
{
"title": "第6章 推荐系统评测",
"page": 103,
"childBookmark": [
{ "title": "6.1 评测视角", "page": 103, "childBookmark": [] },
{
"title": "6.2 实验方法",
"page": 104,
"childBookmark": [
{ "title": "6.2.1 在线实验", "page": 104, "childBookmark": [] },
{ "title": "6.2.2 用户调查", "page": 105, "childBookmark": [] },
{ "title": "6.2.3 离线实验", "page": 106, "childBookmark": [] }
]
},
{
"title": "6.3 评分预测评价指标",
"page": 107,
"childBookmark": [
{ "title": "6.3.1 MAE 和 MSE", "page": 107, "childBookmark": [] },
{ "title": "6.3.2 RMSE、NMAE 和 NRMSE", "page": 108, "childBookmark": [] }
]
},
{
"title": "6.4 Top-N 推荐评价指标",
"page": 109,
"childBookmark": [
{ "title": "6.4.1 分类准确度指标", "page": 109, "childBookmark": [] },
{ "title": "6.4.2 ROC 曲线和 AUC 值", "page": 110, "childBookmark": [] },
{ "title": "6.4.3 基于排序的评价指标", "page": 113, "childBookmark": [] },
{ "title": "6.4.4 其他常用评价指标", "page": 115, "childBookmark": [] }
]
},
{ "title": "6.5 公开实验数据集", "page": 116, "childBookmark": [] },
{ "title": "习题", "page": 118, "childBookmark": [] }
]
},
{
"title": "第7章 基于排序学习的推荐",
"page": 119,
"childBookmark": [
{ "title": "7.1 排序学习模型分类", "page": 119, "childBookmark": [] },
{
"title": "7.2 对级排序学习模型",
"page": 121,
"childBookmark": [
{ "title": "7.2.1 基本框架", "page": 121, "childBookmark": [] },
{ "title": "7.2.2 贝叶斯个性化排序", "page": 123, "childBookmark": [] },
{ "title": "7.2.3 协同对级排序学习", "page": 125, "childBookmark": [] }
]
},
{
"title": "7.3 列表级排序学习模型",
"page": 129,
"childBookmark": [
{ "title": "7.3.1 P-Push CR 算法", "page": 129, "childBookmark": [] },
{ "title": "7.3.2 CofiRank 算法", "page": 131, "childBookmark": [] }
]
},
{ "title": "习题", "page": 133, "childBookmark": [] }
]
},
{
"title": "第8章 基于情境感知的推荐",
"page": 134,
"childBookmark": [
{ "title": "8.1 情境信息的定义", "page": 134, "childBookmark": [] },
{ "title": "8.2 情境信息的获取", "page": 136, "childBookmark": [] },
{
"title": "8.3 基于情境感知的推荐系统框架",
"page": 136,
"childBookmark": [
{ "title": "8.3.1 数据立方体", "page": 137, "childBookmark": [] },
{ "title": "8.3.2 基于树的层次信息表达", "page": 138, "childBookmark": [] }
]
},
{
"title": "8.4 融合情境信息的推荐模型",
"page": 139,
"childBookmark": [
{ "title": "8.4.1 情境预过滤", "page": 140, "childBookmark": [] },
{ "title": "8.4.2 情境后过滤", "page": 141, "childBookmark": [] }
]
},
{
"title": "8.5 情境建模",
"page": 141,
"childBookmark": [
{ "title": "8.5.1 基于邻域的方法", "page": 142, "childBookmark": [] },
{ "title": "8.5.2 基于模型的方法", "page": 143, "childBookmark": [] }
]
},
{ "title": "习题", "page": 146, "childBookmark": [] }
]
},
{
"title": "第9章 基于时空信息的推荐",
"page": 147,
"childBookmark": [
{
"title": "9.1 基于时间信息的推荐",
"page": 147,
"childBookmark": [
{ "title": "9.1.1 最近最热门推荐算法", "page": 148, "childBookmark": [] },
{ "title": "9.1.2 基于时间的项目协同过滤", "page": 148, "childBookmark": [] },
{ "title": "9.1.3 基于时间的用户协同过滤", "page": 149, "childBookmark": [] },
{ "title": "9.1.4 基于会话的推荐", "page": 150, "childBookmark": [] }
]
},
{
"title": "9.2 基于序列感知的推荐",
"page": 151,
"childBookmark": [
{ "title": "9.2.1 基于马尔可夫模型的序列预测", "page": 152, "childBookmark": [] },
{ "title": "9.2.2 基于循环神经网络的序列预测", "page": 153, "childBookmark": [] },
{ "title": "9.2.3 基于注意力机制的序列预测", "page": 157, "childBookmark": [] }
]
},
{
"title": "9.3 基于空间信息的推荐",
"page": 160,
"childBookmark": [
{ "title": "9.3.1 位置信息的获取与推理", "page": 161, "childBookmark": [] },
{ "title": "9.3.2 基于位置信息的推荐", "page": 161, "childBookmark": [] },
{ "title": "9.3.3 融合其他信息的推荐", "page": 163, "childBookmark": [] }
]
},
{ "title": "习题", "page": 163, "childBookmark": [] }
]
},
{
"title": "第10章 基于社交关系的推荐",
"page": 164,
"childBookmark": [
{ "title": "10.1 社交关系数据", "page": 164, "childBookmark": [] },
{
"title": "10.2 基于邻域的社交化推荐",
"page": 166,
"childBookmark": [
{ "title": "10.2.1 基于用户的协同过滤", "page": 166, "childBookmark": [] },
{ "title": "10.2.2 基于图扩散的推荐", "page": 167, "childBookmark": [] }
]
},
{
"title": "10.3 基于模型的社交化推荐",
"page": 169,
"childBookmark": [
{ "title": "10.3.1 基于潜在社交因子学习的推荐", "page": 169, "childBookmark": [] },
{ "title": "10.3.2 基于显式社交关系的推荐", "page": 171, "childBookmark": [] }
]
},
{ "title": "10.4 基于社会曝光的协同过滤", "page": 174, "childBookmark": [] },
{ "title": "习题", "page": 176, "childBookmark": [] }
]
},
{
"title": "第11章 基于异质信息网络的推荐",
"page": 177,
"childBookmark": [
{ "title": "11.1 基本概念", "page": 177, "childBookmark": [] },
{
"title": "11.2 基于邻域的 HIN 推荐算法",
"page": 178,
"childBookmark": [
{ "title": "11.2.1 基于随机游走的相关度度量", "page": 178, "childBookmark": [] },
{ "title": "11.2.2 基于元路径的相关度度量", "page": 179, "childBookmark": [] },
{ "title": "11.2.3 基于元路径和随机游走混合的相关度度量", "page": 183, "childBookmark": [] }
]
},
{
"title": "11.3 基于模型的 HIN 推荐算法",
"page": 183,
"childBookmark": [
{ "title": "11.3.1 两阶段融合模型", "page": 183, "childBookmark": [] },
{ "title": "11.3.2 端到端的学习模型", "page": 188, "childBookmark": [] }
]
},
{ "title": "习题", "page": 191, "childBookmark": [] }
]
},
{
"title": "第12章 基于图神经网络的推荐",
"page": 192,
"childBookmark": [
{
"title": "12.1 图神经网络简介",
"page": 192,
"childBookmark": [
{ "title": "12.1.1 任务分类与定义", "page": 193, "childBookmark": [] },
{ "title": "12.1.2 一般流程与框架", "page": 194, "childBookmark": [] },
{ "title": "12.1.3 采样模块", "page": 195, "childBookmark": [] },
{ "title": "12.1.4 池化模块", "page": 196, "childBookmark": [] },
{ "title": "12.1.5 传播模块", "page": 198, "childBookmark": [] }
]
},
{
"title": "12.2 图神经网络典型算法",
"page": 202,
"childBookmark": [
{ "title": "12.2.1 GCN 算法", "page": 202, "childBookmark": [] },
{ "title": "12.2.2 GraphSAGE 算法", "page": 204, "childBookmark": [] },
{ "title": "12.2.3 GAT 算法", "page": 205, "childBookmark": [] }
]
},
{
"title": "12.3 基于图神经网络的推荐算法",
"page": 207,
"childBookmark": [
{ "title": "12.3.1 基于用户-项目二部图的协同过滤", "page": 207, "childBookmark": [] },
{ "title": "12.3.2 基于知识图谱的推荐", "page": 209, "childBookmark": [] }
]
},
{ "title": "习题", "page": 211, "childBookmark": [] }
]
},
{
"title": "实验1 基于邻域协同过滤的 Top-N 推荐",
"page": 212,
"childBookmark": []
},
{
"title": "实验2 基于矩阵分解的评分预测",
"page": 216,
"childBookmark": []
},
{
"title": "实验3 面向应用的推荐系统实现",
"page": 220,
"childBookmark": []
},
{
"title": "参考文献",
"page": 229,
"childBookmark": []
}
]
步骤二:使用python代码实现目录添加
python
#!/usr/bin/env python3
"""
根据 JSON 格式的目录为 PDF 添加书签
依赖: pip install pymupdf
"""
import json
import sys
from pathlib import Path
def _load_fitz():
"""
加载 PyMuPDF。脚本目录 / 当前目录若在 sys.path 最前,本地的 pymupdf.py 或 pymupdf/
会遮蔽 site-packages,导致 ``from . import extra`` 报 relative import 错。
"""
_here = str(Path(__file__).resolve().parent)
_saved = sys.path.copy()
try:
sys.path[:] = [p for p in sys.path if p not in ("", _here)] + [
p for p in ("", _here) if p in sys.path
]
import pymupdf as m
return m
except ImportError as e:
raise ImportError(
"无法加载 PyMuPDF。请先检查:当前目录或本脚本目录下是否有多余的 "
"pymupdf.py、pymupdf 文件夹或同名项目(会遮蔽 pip 安装的包);"
"然后执行: pip install --force-reinstall pymupdf"
) from e
finally:
sys.path[:] = _saved
# Import PyMuPDF once, at module import time, through the shadow-safe loader;
# the functions in this file reach the library through the name `fitz`.
fitz = _load_fitz()
def parse_toc_from_json(json_str):
    """Parse a JSON table of contents into a flat bookmark list.

    The JSON may be a single entry object or a list of entry objects. Each
    entry carries a ``title``, a ``page`` and, optionally, nested entries
    under either ``children`` or ``childBookmark`` (a list, or a single
    object)::

        [
            {
                "title": "Chapter 1",
                "page": 6,
                "childBookmark": [
                    {"title": "1.1 Introduction", "page": 7},
                    {"title": "1.2 History", "page": 8}
                ]
            },
            {"title": "Chapter 2", "page": 20}
        ]

    Entries that are not JSON objects, that lack a ``title``, or whose
    ``page`` is missing or not an integer value are skipped together with
    their descendants. Integer-valued strings and floats (``"6"``, ``6.0``)
    are accepted and converted to ``int``.

    :param json_str: JSON text describing the table of contents
    :return: ``[[level, title, page], ...]`` in document order, with
        ``level`` starting at 1 for top-level entries (the flat form taken
        by ``set_toc``)
    :raises json.JSONDecodeError: if ``json_str`` is not valid JSON
    :raises ValueError: if the top-level JSON value is neither an object
        nor an array
    """
    data = json.loads(json_str)
    # A single entry object is treated as a one-element list.
    if isinstance(data, dict):
        data = [data]
    if not isinstance(data, list):
        raise ValueError("目录 JSON 的顶层必须是对象或数组")

    toc = []

    def as_page(value):
        """Return *value* as an int page number, or None if it is not one."""
        # bool is a subclass of int; true/false is never a page number.
        if isinstance(value, bool):
            return None
        if isinstance(value, int):
            return value
        if isinstance(value, float):
            return int(value) if value.is_integer() else None
        if isinstance(value, str):
            try:
                return int(value.strip())
            except ValueError:
                return None
        return None

    def traverse(items, level):
        """Append *items* (and, recursively, their children) to ``toc``."""
        for item in items:
            if not isinstance(item, dict):
                continue
            title = item.get("title")
            page = as_page(item.get("page"))
            if title is None or page is None:
                # Skipping the parent also skips its children, so the level
                # sequence never jumps by more than one.
                continue
            toc.append([level, title, page])

            children = item.get("children") or item.get("childBookmark")
            if isinstance(children, dict):
                children = [children]
            if isinstance(children, list) and children:
                traverse(children, level + 1)

    # The first outline entry must be at level 1, so the top level starts there.
    traverse(data, 1)
    return toc
def add_bookmarks_from_json(pdf_path, json_str, offset=0, output_path=None):
    """Write the bookmarks described by a JSON table of contents into a PDF.

    The pages of the input are first copied into a new, empty document and
    the outline is set on that copy; the original author's note is that
    calling ``set_toc`` directly on a source whose xref / page tree is
    malformed can fail with "bad xref".

    :param pdf_path: path of the input PDF
    :param json_str: JSON text accepted by ``parse_toc_from_json``
    :param offset: page offset. The page number handed to ``set_toc`` is
        ``page - 1 + offset``; ``set_toc`` page numbers are 1-based, so
        ``offset`` is the 1-based PDF page on which printed page 1 appears.
        Results outside ``1..page_count`` are clamped with a warning on
        stderr.
    :param output_path: where to save the result; ``None`` overwrites
        ``pdf_path``
    :raises FileNotFoundError: if ``pdf_path`` does not exist
    :raises ValueError: if ``json_str`` is not valid JSON
    """
    pdf_path = Path(pdf_path)
    if not pdf_path.exists():
        raise FileNotFoundError(f"PDF 文件不存在: {pdf_path}")

    try:
        toc_raw = parse_toc_from_json(json_str)
    except json.JSONDecodeError as e:
        # Keep the decoder error as the cause so its position info survives.
        raise ValueError(f"JSON 解析错误: {e}") from e

    if not toc_raw:
        print("警告:未解析到任何书签", file=sys.stderr)
        return

    output_path = pdf_path if output_path is None else Path(output_path)

    src = fitz.open(pdf_path)
    try:
        total_pages = src.page_count

        # Apply the offset and clamp into the 1-based range set_toc expects.
        final_toc = []
        for level, title, page in toc_raw:
            actual_page = (page - 1) + offset
            if actual_page < 1:
                print(
                    f"警告:条目 '{title}' 的页码 {page} 偏移后小于 1,已修正为第 1 页",
                    file=sys.stderr,
                )
                actual_page = 1
            elif actual_page > total_pages:
                print(
                    f"警告:条目 '{title}' 的页码 {page} 超出文档范围,已修正为最后一页",
                    file=sys.stderr,
                )
                actual_page = total_pages
            final_toc.append([level, title, actual_page])

        out = fitz.open()
        try:
            out.insert_pdf(src)
            # The pages now live in `out`; release the source file before
            # saving so that overwriting pdf_path (the default) is not
            # blocked by our own open handle.
            src.close()
            out.set_toc(final_toc)
            out.save(
                output_path,
                garbage=4,
                deflate=True,
            )
        finally:
            out.close()
    finally:
        # src is normally closed above; only close here on an early failure.
        if not src.is_closed:
            src.close()

    print(f"成功添加 {len(final_toc)} 条书签到 {output_path}")
# Reference only: the prompt given to the LLM, together with a screenshot of
# the book's contents page, to obtain the TOC JSON. This bare string literal
# is never used by the code; it is kept verbatim so it can be pasted as-is.
'''
提示词:
根据目录的图片,提取出如下格式的json目录数据:
{
"title": "第一章 概述",
"page": 6,
"childBookmark": [
{
"title": "1.1.推荐系统简介",
"page": 7,
"childBookmark":[]
},
{
"title": "1.2.推荐系统XXX",
"page": 9
}
]
}
所有目录都要递归的加入到子目录中,直到最后一级
'''
def main(argv=None):
    """Command-line entry point: read a TOC JSON file and bookmark a PDF.

    Every input can be given on the command line. When an argument is
    omitted, the value that used to be hard-coded in this example is used,
    so running the script with no arguments behaves as before.

    :param argv: argument list to parse; ``None`` makes argparse read
        ``sys.argv[1:]``
    """
    # Local import keeps this block self-contained.
    import argparse

    parser = argparse.ArgumentParser(
        description="根据 JSON 格式的目录为 PDF 添加书签"
    )
    parser.add_argument(
        "pdf",
        nargs="?",
        default=r"F:\ldp\p1.pdf",
        help="输入的 PDF 文件路径",
    )
    parser.add_argument(
        "-j",
        "--json",
        dest="json_path",
        default="toc.json",
        help="目录 JSON 文件路径(UTF-8 编码)",
    )
    parser.add_argument(
        "-s",
        "--offset",
        type=int,
        default=11,
        help="页码偏移量(实际页码 = 目录页码 - 1 + offset)",
    )
    parser.add_argument(
        "-o",
        "--output",
        default="output0404.pdf",
        help="输出 PDF 路径",
    )
    args = parser.parse_args(argv)

    with open(args.json_path, "r", encoding="utf-8") as f:
        json_str = f.read()

    add_bookmarks_from_json(args.pdf, json_str, args.offset, args.output)


if __name__ == "__main__":
    main()