from flask import Flask, request, jsonify, send_file
from flask_sqlalchemy import SQLAlchemy
from flask_cors import CORS
import os
import fitz
import uuid
from datetime import datetime
import json
app = Flask(__name__)
app.config['SQLALCHEMY_DATABASE_URI'] = 'sqlite:///pdf_tags.db'
app.config['SQLALCHEMY_TRACK_MODIFICATIONS'] = False
app.config['UPLOAD_FOLDER'] = 'uploads'
# Reject request bodies larger than 200 MB.
app.config['MAX_CONTENT_LENGTH'] = 200 * 1024 * 1024

# Enable CORS for every route of the app.
# NOTE(review): no origin restriction is configured here - confirm that is intended.
CORS(app)

# Create the upload directory. exist_ok=True replaces the former
# "check, then create" sequence, which could fail if the directory appeared
# between the two calls (e.g. several workers starting at once).
os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)

db = SQLAlchemy(app)
# Models
class PDF(db.Model):
    """Database record describing one uploaded PDF file."""

    id = db.Column(db.Integer, primary_key=True)
    # Name of the stored file inside UPLOAD_FOLDER (upload_pdf generates "<uuid4>.pdf").
    filename = db.Column(db.String(255), nullable=False)
    # File name as sent by the client; kept for display only.
    original_filename = db.Column(db.String(255), nullable=False)
    # Filled in at insert time with datetime.utcnow (a naive UTC timestamp).
    upload_date = db.Column(db.DateTime, default=datetime.utcnow)
    # Number of pages; written by parse_pdf_structure after the file is opened.
    page_count = db.Column(db.Integer)
    # Tag rows that reference this PDF; the backref exposes Tag.pdf.
    # No cascade is configured, so delete_pdf removes the tags explicitly.
    tags = db.relationship('Tag', backref='pdf', lazy=True)
class Tag(db.Model):
    """One node of a PDF's structure tree, stored as an adjacency list."""

    id = db.Column(db.Integer, primary_key=True)
    # Owning PDF row.
    pdf_id = db.Column(db.Integer, db.ForeignKey('pdf.id'), nullable=False)
    # Structure element type as read from the element ('Unknown' when it has none).
    tag_type = db.Column(db.String(50), nullable=False)
    # Text extracted from the element (may be empty).
    content = db.Column(db.Text)
    # 1-based page number as returned by get_element_page.
    page_number = db.Column(db.Integer, nullable=False)
    # Parent tag; NULL for the root element.
    parent_id = db.Column(db.Integer, db.ForeignKey('tag.id'))
    # Depth in the tree; parse_pdf_structure starts at 0 for the root.
    level = db.Column(db.Integer, nullable=False)
    # Self-referential relationship: Tag.children lists direct children and the
    # backref Tag.parent points back up. remote_side=[id] marks `id` as the
    # "one" side, so this line must stay below the `id` column definition.
    children = db.relationship('Tag', backref=db.backref('parent', remote_side=[id]))
class PageMapping(db.Model):
    """Extracted text of one page of a PDF."""

    id = db.Column(db.Integer, primary_key=True)
    # Owning PDF row.
    pdf_id = db.Column(db.Integer, db.ForeignKey('pdf.id'), nullable=False)
    # 1-based page number (same numbering as Tag.page_number).
    page_number = db.Column(db.Integer, nullable=False)
    # Page text as returned by page.get_text() in parse_pdf_structure.
    content = db.Column(db.Text)
# Create the database tables at import time. db.create_all() is run inside
# an application context because the module has no request context yet.
with app.app_context():
    db.create_all()
# Helper functions
def allowed_file(filename):
    """Return True when *filename* has an extension and that extension is "pdf".

    The comparison is case-insensitive and only looks at the text after the
    last dot.
    """
    _stem, dot, extension = filename.rpartition('.')
    # rpartition yields an empty separator when the name contains no dot.
    return bool(dot) and extension.lower() == 'pdf'
def get_element_page(element, doc):
    """Return the 1-based page number on which a structure element appears.

    Resolution order:
      1. If the element has a ``page`` attribute, use ``element.page.number + 1``
         (the page object's ``number`` is treated as 0-based).
      2. Otherwise look for the element's stripped text inside the stripped
         text of each page of *doc* and return the first page that contains it.

    Any exception raised along the way is printed and swallowed. When nothing
    matches (or an error occurred) the function falls back to page 1.
    """
    try:
        if hasattr(element, 'page'):
            return element.page.number + 1

        needle = element.get_text().strip()
        if needle:
            for page_no in range(1, len(doc) + 1):
                haystack = doc[page_no - 1].get_text().strip()
                if needle in haystack:
                    return page_no
    except Exception as e:
        print(f"Error getting element page: {e}")

    # Nothing found: default to the first page.
    return 1
def save_tag(pdf_id, tag_type, content, page_number, parent_id, level):
    """Insert one Tag row, commit it, and return the new row's id.

    Args:
        pdf_id: Id of the PDF row the tag belongs to.
        tag_type: Structure element type.
        content: Text of the element.
        page_number: Page the element was located on.
        parent_id: Id of the parent tag, or None for a root tag.
        level: Depth of the tag in the tree.

    Returns:
        The primary key assigned to the stored tag.
    """
    fields = {
        "pdf_id": pdf_id,
        "tag_type": tag_type,
        "content": content,
        "page_number": page_number,
        "parent_id": parent_id,
        "level": level,
    }
    record = Tag(**fields)

    session = db.session
    session.add(record)
    # Committing right away makes the generated id available to the caller,
    # which passes it on as parent_id for the element's children.
    session.commit()
    return record.id
def parse_pdf_structure(pdf_path, pdf_id):
    """Parse a PDF's structure tree and store it as Tag / PageMapping rows.

    The page count of the PDF row is updated first, then the structure tree
    is walked depth-first; every element becomes a Tag row and the text of
    each page that is referenced is stored once as a PageMapping row.

    Args:
        pdf_path: Filesystem path of the PDF to open.
        pdf_id: Primary key of the PDF row the extracted data belongs to.

    Returns:
        ``{"success": True, "message": ...}`` when the tree was stored, or
        ``{"error": <message>}`` when the PDF row does not exist, the document
        exposes no structure tree / root element, or parsing raised.
    """
    doc = fitz.open(pdf_path)
    # The outer try/finally guarantees the document is closed even when the
    # record lookup or the first commit fails (previously those ran before the
    # try block and leaked the open file handle).
    try:
        pdf = PDF.query.get(pdf_id)
        if pdf is None:
            return {"error": "PDF record not found"}
        pdf.page_count = len(doc)
        db.session.commit()

        try:
            # Try to access the structure tree using different methods.
            struct_tree = None
            # Method 1: struct_tree attribute
            if hasattr(doc, 'struct_tree'):
                struct_tree = doc.struct_tree
            # Method 2: get_struct_tree method
            elif hasattr(doc, 'get_struct_tree'):
                struct_tree = doc.get_struct_tree()
            # Method 3: StructTreeRoot entry of the catalog
            # NOTE(review): PyMuPDF documents Document.pdf_catalog as a method
            # returning an xref number; if so, this attribute access yields a
            # bound method and the dict check never passes - confirm against
            # the installed PyMuPDF version.
            elif hasattr(doc, 'pdf_catalog'):
                catalog = doc.pdf_catalog
                if isinstance(catalog, dict) and 'StructTreeRoot' in catalog:
                    struct_tree = catalog['StructTreeRoot']

            if not struct_tree:
                return {"error": "PDF does not have a structure tree"}

            root_element = getattr(struct_tree, 'root', None)
            if not root_element:
                return {"error": "PDF structure tree has no root element"}

            def parse_element(element, parent_id=None, level=0):
                """Store *element* and, recursively, all of its children."""
                tag_type = getattr(element, 'type', 'Unknown')
                # Elements without get_text() contribute empty content.
                content = getattr(element, 'get_text', lambda: '')().strip()
                page_number = get_element_page(element, doc)

                tag_id = save_tag(pdf_id, tag_type, content, page_number, parent_id, level)

                # Store the page text only once per (pdf, page).
                existing_mapping = PageMapping.query.filter_by(
                    pdf_id=pdf_id,
                    page_number=page_number
                ).first()
                if not existing_mapping:
                    page = doc[page_number - 1]  # page_number is 1-based
                    page_content = page.get_text()
                    mapping = PageMapping(
                        pdf_id=pdf_id,
                        page_number=page_number,
                        content=page_content
                    )
                    db.session.add(mapping)
                    db.session.commit()

                children = getattr(element, 'children', [])
                for child in children:
                    parse_element(child, tag_id, level + 1)

            parse_element(root_element)
            return {"success": True, "message": "PDF structure parsed successfully"}
        except Exception as e:
            print(f"Error parsing PDF structure: {e}")
            # Discard any half-finished unit of work so the session stays usable.
            # Rows committed for earlier elements are not removed.
            db.session.rollback()
            return {"error": str(e)}
    finally:
        doc.close()
def build_tag_tree(tags):
    """Turn a flat list of tag records into a nested list of dictionaries.

    Each node has the keys ``id``, ``tag_type``, ``content``, ``page_number``
    and ``children``. Tags whose ``parent_id`` is None become roots; tags
    whose parent is not part of *tags* are left out of the result.
    """
    nodes = {}
    for tag in tags:
        nodes[tag.id] = {
            "id": tag.id,
            "tag_type": tag.tag_type,
            "content": tag.content,
            "page_number": tag.page_number,
            "children": [],
        }

    forest = []
    for tag in tags:
        node = nodes[tag.id]
        if tag.parent_id is None:
            forest.append(node)
            continue
        parent_node = nodes.get(tag.parent_id)
        if parent_node is not None:
            parent_node["children"].append(node)
    return forest
# API Routes
@app.route('/api/upload', methods=['POST'])
def upload_pdf():
    """Receive a PDF upload, store it, and parse its structure tree.

    Expects multipart form data with the PDF in the ``file`` field.

    Returns:
        201 with ``{"success", "pdf_id", "message"}`` on success,
        400 when the file is missing, unnamed, not a ``.pdf`` or could not be
        parsed, 500 on any unexpected error.
    """
    if 'file' not in request.files:
        return jsonify({"error": "No file part"}), 400
    file = request.files['file']
    if file.filename == '':
        return jsonify({"error": "No selected file"}), 400
    if not allowed_file(file.filename):
        return jsonify({"error": "Only PDF files are allowed"}), 400

    file_path = None
    record_saved = False
    try:
        # Store under a generated name so client-supplied names never reach the disk.
        unique_filename = str(uuid.uuid4()) + '.pdf'
        file_path = os.path.join(app.config['UPLOAD_FOLDER'], unique_filename)
        file.save(file_path)

        pdf = PDF(
            filename=unique_filename,
            original_filename=file.filename
        )
        db.session.add(pdf)
        db.session.commit()
        record_saved = True

        parse_result = parse_pdf_structure(file_path, pdf.id)
        if 'error' in parse_result:
            # NOTE(review): the PDF row and the stored file are kept when
            # parsing fails - confirm that this is the intended behavior.
            return jsonify(parse_result), 400

        return jsonify({
            "success": True,
            "pdf_id": pdf.id,
            "message": "PDF uploaded and parsed successfully"
        }), 201
    except Exception as e:
        print(f"Error uploading PDF: {e}")
        db.session.rollback()
        # If no database row refers to the file, remove it so that failed
        # uploads do not pile up as orphans in the upload folder.
        if not record_saved and file_path and os.path.exists(file_path):
            try:
                os.remove(file_path)
            except OSError as cleanup_error:
                print(f"Error removing orphaned upload: {cleanup_error}")
        return jsonify({"error": str(e)}), 500
@app.route('/api/pdfs', methods=['GET'])
def get_pdfs():
    """List every uploaded PDF.

    Returns:
        200 with ``{"pdfs": [...]}`` where each entry carries ``id``,
        ``original_filename``, ``upload_date`` (ISO 8601) and ``page_count``;
        500 with ``{"error": ...}`` if the query or serialization fails.
    """
    try:
        payload = []
        for record in PDF.query.all():
            payload.append({
                "id": record.id,
                "original_filename": record.original_filename,
                "upload_date": record.upload_date.isoformat(),
                "page_count": record.page_count,
            })
        return jsonify({"pdfs": payload}), 200
    except Exception as e:
        return jsonify({"error": str(e)}), 500
@app.route('/api/pdf/<int:pdf_id>/tags', methods=['GET'])
def get_pdf_tags(pdf_id):
    """Return the nested tag tree of one PDF.

    Returns:
        200 with ``{"tags": [...]}``, 404 when the PDF does not exist,
        500 with ``{"error": ...}`` on any other failure.
    """
    try:
        if not PDF.query.get(pdf_id):
            return jsonify({"error": "PDF not found"}), 404

        rows = Tag.query.filter_by(pdf_id=pdf_id).all()
        return jsonify({"tags": build_tag_tree(rows)}), 200
    except Exception as e:
        return jsonify({"error": str(e)}), 500
@app.route('/api/pdf/<int:pdf_id>/page/<int:page_number>', methods=['GET'])
def get_pdf_page(pdf_id, page_number):
    """Return the stored text of one page of a PDF.

    Returns:
        200 with ``{"page_number", "content"}``, 404 when either the PDF or
        the page mapping is missing, 500 with ``{"error": ...}`` otherwise.
    """
    try:
        if not PDF.query.get(pdf_id):
            return jsonify({"error": "PDF not found"}), 404

        lookup = PageMapping.query.filter_by(pdf_id=pdf_id, page_number=page_number)
        mapping = lookup.first()
        if mapping:
            # The page is returned as plain text, not as an image.
            body = {"page_number": page_number, "content": mapping.content}
            return jsonify(body), 200
        return jsonify({"error": "Page not found"}), 404
    except Exception as e:
        return jsonify({"error": str(e)}), 500
@app.route('/api/pdf/<int:pdf_id>', methods=['GET'])
def get_pdf_info(pdf_id):
    """Return the metadata of a single PDF.

    Returns:
        200 with ``id``, ``original_filename``, ``upload_date`` (ISO 8601) and
        ``page_count``; 404 when the PDF does not exist; 500 on failure.
    """
    try:
        record = PDF.query.get(pdf_id)
        if record:
            info = {
                "id": record.id,
                "original_filename": record.original_filename,
                "upload_date": record.upload_date.isoformat(),
                "page_count": record.page_count,
            }
            return jsonify(info), 200
        return jsonify({"error": "PDF not found"}), 404
    except Exception as e:
        return jsonify({"error": str(e)}), 500
@app.route('/api/pdf/<int:pdf_id>', methods=['DELETE'])
def delete_pdf(pdf_id):
    """Delete a PDF together with its tags, page mappings and stored file.

    The database rows are removed and committed first; the file is deleted
    only afterwards. Removing the file before the commit (as was done before)
    could leave rows pointing at a file that no longer exists when the commit
    failed.

    Returns:
        200 on success, 404 when the PDF does not exist, 500 with
        ``{"error": ...}`` when the database operation fails.
    """
    try:
        pdf = PDF.query.get(pdf_id)
        if not pdf:
            return jsonify({"error": "PDF not found"}), 404

        # Resolve the path now; the instance is gone after the commit.
        file_path = os.path.join(app.config['UPLOAD_FOLDER'], pdf.filename)

        # Delete associated tags and page mappings, then the PDF row itself.
        Tag.query.filter_by(pdf_id=pdf_id).delete()
        PageMapping.query.filter_by(pdf_id=pdf_id).delete()
        db.session.delete(pdf)
        db.session.commit()
    except Exception as e:
        db.session.rollback()
        return jsonify({"error": str(e)}), 500

    # Best effort: a file that cannot be removed is only an orphan on disk,
    # so it is logged instead of failing the already-committed deletion.
    try:
        if os.path.exists(file_path):
            os.remove(file_path)
    except OSError as e:
        print(f"Error removing PDF file {file_path}: {e}")

    return jsonify({"success": True, "message": "PDF deleted successfully"}), 200
if __name__ == '__main__':
    # Start Flask's built-in server on every network interface, port 5001,
    # with debug mode switched on.
    # NOTE(review): debug=True combined with host='0.0.0.0' looks meant for
    # local development only - confirm it is never used for a deployment.
    app.run(debug=True, host='0.0.0.0', port=5001)
- 后端:Flask + SQLAlchemy
- 接收 PDF 上传
- 解析 PDF,提取「标签树」(章节层级)+ 每个标签对应的页码信息(甚至可以到坐标)
- 把文档、标签、页码关系存到数据库
- 提供接口给前端拿到整个标签树和对应页信息
- 前端:
- 展示标签树(侧边栏)
- 点击某个标签时,让 PDF 查看器跳转到对应页(或具体位置)
下面按步骤帮你拆一下。
一、数据结构 / 表设计(SQLAlchemy 模型)
可以设计几张核心表:
- Document(文档表)
- id
- filename
- file_path(服务器上存放路径)
- uploaded_at
- status(例如:uploaded / parsing / ready / failed)
- Tag(标签/目录节点)
- id
- document_id(外键)
- title(比如「第1章 总则」)
- level(层级:1=一级标题,2=二级标题...)
- parent_id(自关联,形成树)
- page_start(起始页)
- page_end(结束页,可选)
- bbox(可选,JSON 格式:在页面上的坐标,用于高亮)
> 如果只需要「标签 -> 页码」,page_start / page_end 就够用了。
二、上传接口设计(Flask)
接口: POST /api/documents
- 请求:multipart/form-data,字段名比如 file
- 流程:
- 从 request.files['file'] 获取上传的 PDF
- 把 PDF 保存到服务器某个目录(比如 uploads/{uuid}.pdf)
- 在 Document 表中插入一条记录,状态先设为 parsing
- 调用解析函数(同步 or 异步):
- 同步:当前请求里直接解析,返回时标签数据已经准备好
- 异步:丢到任务队列(Celery / RQ),接口先返回 document_id,前端稍后再查解析结果
- 返回:{"document_id": 123}
三、PDF 解析思路(提取标签层级 + 页码)
用 Python 里的 PDF 库,比如:
- 首选:PyMuPDF(fitz),支持:
- 直接读 PDF 的「目录 / 书签(outline/TOC)」信息
- 这些目录天然是带层级 + 对应页的,非常适合你要的「标签」
1. 利用 PDF 自带目录(如果有)
import fitz # PyMuPDF
doc = fitz.open(pdf_path)
toc = doc.get_toc(simple=False)
toc 通常是这样的结构(伪):
[
[1, "第一章 绪论", 1, ...],
[2, "1.1 背景", 2, ...],
[2, "1.2 目的", 5, ...],
[1, "第二章 相关工作", 10, ...],
...
]
- 第一个元素是 level
- 第二个是 title
- 第三个是 page(一般 1-based)
你要做的:
- 遍历 toc,根据 level 构建树结构:
- level=1:顶级标签,parent_id=None
- level=2:父节点是最近一个 level=1 的节点
- 以此类推
- 把每个节点插入到 Tag 表,page_start 设置为 page
- page_end 可以后处理:
- 对于某个标签,它的 page_end = 它之后「同级或更高层级的下一个标签」的 page_start - 1
- 文档最后一个标签的 page_end = 文档总页数
如果 PDF 没有目录/书签,就需要更复杂的「标题识别」逻辑(比如根据字体大小/加粗等),这个可以后面再扩展。
2. 坐标信息(可选)
如果你想前端点击标签时,高亮对应标题在页面中的位置,可以进一步做:
- 对 page_start 页,用 page.get_text("dict") 拿到文本块和坐标
- 在这一页里找到与标签 title 最匹配的那一段文字,然后记录它的 bbox
- 把 bbox 转 JSON 存在 Tag.bbox 字段里
这部分实现会比较细节化,但思路是这样。
四、对前端开放的查询接口设计
1)获取文档列表
- GET /api/documents
- 返回所有文档的 id, filename, status 等,前端可以展示一个文档列表
2)获取某文档的标签树
- GET /api/documents/<document_id>/tags
- 返回结构类似:
[
  {
    "id": 1, "title": "第一章 绪论", "level": 1, "page_start": 1, "page_end": 9,
    "children": [
      { "id": 2, "title": "1.1 背景", "level": 2, "page_start": 2, "page_end": 4 },
      { "id": 3, "title": "1.2 目的", "level": 2, "page_start": 5, "page_end": 9 }
    ]
  },
  { "id": 4, "title": "第二章 相关工作", "level": 1, "page_start": 10, "page_end": 20 }
]
前端拿到之后可以很容易做成树形组件。
3)PDF 内容本身如何展示
这里有两种常见模式:
- 方式 A:直接前端用 pdf.js 打开 PDF URL
- 后端提供静态访问地址,比如:/static/uploads/{uuid}.pdf
- 前端用 pdf.js 或浏览器自带 PDF 查看器打开
- 当用户点击标签时,前端调用 PDF 查看器的 API:
- scrollTo(page_start),滚动到指定页
- 如果有 bbox,可以进一步滚动到更精确位置,并用 overlay 高亮
- 方式 B:后端按页转图片
- 提供 GET /api/documents/<document_id>/pages/<page_number> 返回一张 PNG/JPEG
- 前端自己做翻页 + 滚动
- 点击标签,就直接跳转到对应那一页(滚动到对应图片)
一般推荐 方式 A(pdf.js),后端简单很多,只需要保证 PDF 可以被访问。
五、一个可能的前后端交互流程
- 用户在前端选择 PDF,点击上传
- 前端:POST /api/documents,上传文件
- 后端:
- 保存文件,写入 Document,开始解析
- (同步解析的话)解析完,状态改成 ready
- 返回 {"document_id": 123}
- 前端:
- 根据这个 document_id 拿到 PDF 的访问地址(比如 /static/uploads/<filename>.pdf;注意上传接口只返回了 document_id,所以存储文件名需要由后端在上传响应或文档详情接口中一并返回),在 pdf.js 中展示
- 同时请求 GET /api/documents/123/tags,把返回的树渲染在左侧目录栏
- 用户点击左侧某个标签:
- 前端拿到 page_start(或者更精细的 bbox)
- 调用 pdf.js 的方法,让视图跳转到那一页 / 那个位置
六、后续可以扩展的点
- 支持多种解析方式:优先用 PDF 目录;没有目录则用字体/正则识别标题
- 支持标签编辑:前端允许用户增删改标签和对应页,后端提供 PUT /api/tags/<id>
- 权限控制:Document 增加 owner_id,控制谁能访问
依赖(requirements.txt)
Flask>=3.0.0
Flask-SQLAlchemy>=3.1.1
PyMuPDF>=1.23.0
数据模型(models.py)
from datetime import datetime
from flask_sqlalchemy import SQLAlchemy
# Extension object created without an app; create_app() binds it later
# through db.init_app(app).
db = SQLAlchemy()
class Document(db.Model):
    """One uploaded PDF document."""

    __tablename__ = "documents"
    id = db.Column(db.Integer, primary_key=True)
    # Sanitized original file name (upload_document passes it through secure_filename).
    filename = db.Column(db.String(255), nullable=False)
    # Path of the stored PDF on the server.
    pdf_path = db.Column(db.String(1024), nullable=False)
    created_at = db.Column(db.DateTime, default=datetime.utcnow, nullable=False)
    # Tags of this document; they are deleted together with the document
    # (cascade "all, delete-orphan"). The backref exposes Tag.document.
    tags = db.relationship("Tag", backref="document", lazy=True, cascade="all, delete-orphan")
class Tag(db.Model):
    """
    A parsed "tag / table-of-contents node" (hierarchical tree).

    page_from/page_to: used by the front end to jump to or display a page
    range after a click (most commonly it shows page_from first).
    """
    __tablename__ = "tags"
    id = db.Column(db.Integer, primary_key=True)
    document_id = db.Column(db.Integer, db.ForeignKey("documents.id"), nullable=False)
    title = db.Column(db.String(512), nullable=False)
    level = db.Column(db.Integer, nullable=False)  # 1, 2, 3...
    order_index = db.Column(db.Integer, nullable=False)  # position in the TOC list
    parent_id = db.Column(db.Integer, db.ForeignKey("tags.id"), nullable=True)
    # Self-referential relationship; remote_side=[id] refers to the `id`
    # column above, so this line must stay below it. The backref adds Tag.children.
    parent = db.relationship("Tag", remote_side=[id], backref="children")
    page_from = db.Column(db.Integer, nullable=False)  # 0-based page index
    page_to = db.Column(db.Integer, nullable=True)  # 0-based, may be empty
    created_at = db.Column(db.DateTime, default=datetime.utcnow, nullable=False)
接口与解析逻辑(app.py)
import os
import uuid
from pathlib import Path
import fitz # PyMuPDF
from flask import Flask, jsonify, request, send_file, abort
from werkzeug.utils import secure_filename
from models import db, Document, Tag
# File suffixes accepted by the upload endpoint; compared against the
# lower-cased suffix of the uploaded file name.
ALLOWED_EXT = {".pdf"}
def create_app():
    """Application factory: configure Flask, bind the database, register routes.

    Configuration is read from the environment:
      - DATABASE_URL: SQLAlchemy URI (default ``sqlite:///app.db``)
      - UPLOAD_DIR: where uploaded PDFs are stored (default ``./uploads``)
      - PAGE_CACHE_DIR: where rendered page PNGs are cached (default ``./page_cache``)

    Both directories are created if they are missing.

    Returns:
        The configured Flask application.
    """
    app = Flask(__name__)
    app.config["SQLALCHEMY_DATABASE_URI"] = os.getenv("DATABASE_URL", "sqlite:///app.db")
    app.config["SQLALCHEMY_TRACK_MODIFICATIONS"] = False
    app.config["UPLOAD_DIR"] = os.getenv("UPLOAD_DIR", str(Path.cwd() / "uploads"))
    app.config["PAGE_CACHE_DIR"] = os.getenv("PAGE_CACHE_DIR", str(Path.cwd() / "page_cache"))
    Path(app.config["UPLOAD_DIR"]).mkdir(parents=True, exist_ok=True)
    Path(app.config["PAGE_CACHE_DIR"]).mkdir(parents=True, exist_ok=True)
    db.init_app(app)

    @app.get("/health")
    def health():
        """Liveness probe."""
        return {"ok": True}

    @app.post("/api/documents")
    def upload_document():
        """
        Store an uploaded PDF and the tags parsed from its table of contents.

        multipart/form-data:
          - file: PDF
        Returns: document_id, filename and tags_count (HTTP 201), or a JSON
        error with HTTP 400 when the upload is missing or is not a .pdf.
        """
        if "file" not in request.files:
            return {"error": "missing file"}, 400
        f = request.files["file"]
        if not f.filename:
            return {"error": "empty filename"}, 400
        ext = Path(f.filename).suffix.lower()
        if ext not in ALLOWED_EXT:
            return {"error": "only pdf is allowed"}, 400

        original_name = secure_filename(f.filename)
        # The file is stored under a generated name; the client's name is
        # kept only as metadata.
        doc_uuid = uuid.uuid4().hex
        save_name = f"{doc_uuid}{ext}"
        save_path = str(Path(app.config["UPLOAD_DIR"]) / save_name)
        f.save(save_path)

        try:
            doc = Document(filename=original_name, pdf_path=save_path)
            db.session.add(doc)
            db.session.flush()  # assigns doc.id without committing
            tags = parse_pdf_toc_to_tags(document_id=doc.id, pdf_path=save_path)
            db.session.add_all(tags)
            db.session.commit()
        except Exception:
            # Nothing was committed, so no row refers to the stored file:
            # undo the pending work and remove the file, then let the
            # original error propagate unchanged.
            db.session.rollback()
            Path(save_path).unlink(missing_ok=True)
            raise

        return jsonify({
            "document_id": doc.id,
            "filename": doc.filename,
            "tags_count": len(tags),
        }), 201

    @app.get("/api/documents/<int:document_id>/tags")
    def get_tags_tree(document_id: int):
        """
        Return the tags of a document as a tree the front end can render
        directly. Responds with 404 when the document does not exist.
        """
        doc = Document.query.get_or_404(document_id)
        tags = Tag.query.filter_by(document_id=doc.id).order_by(Tag.order_index.asc()).all()
        return jsonify({
            "document_id": doc.id,
            "filename": doc.filename,
            "tags": build_tag_tree(tags),
        })

    @app.get("/api/tags/<int:tag_id>")
    def get_tag(tag_id: int):
        """
        Return a single tag, including page_from/page_to; meant to be called
        after the user clicks a node of the tree. 404 when the tag is unknown.
        """
        tag = Tag.query.get_or_404(tag_id)
        return jsonify({
            "id": tag.id,
            "document_id": tag.document_id,
            "title": tag.title,
            "level": tag.level,
            "parent_id": tag.parent_id,
            "page_from": tag.page_from,
            "page_to": tag.page_to,
        })

    @app.get("/api/documents/<int:document_id>/pages/<int:page_index>.png")
    def get_page_png(document_id: int, page_index: int):
        """
        Return the PNG of one page (0-based index). Rendered pages are cached
        on disk, so the URL can be used directly as <img src="...">.
        """
        doc = Document.query.get_or_404(document_id)
        png_path = render_page_png_cached(
            pdf_path=doc.pdf_path,
            document_id=document_id,
            page_index=page_index,
            cache_dir=app.config["PAGE_CACHE_DIR"],
        )
        return send_file(png_path, mimetype="image/png")

    return app
def parse_pdf_toc_to_tags(document_id: int, pdf_path: str):
    """
    Read the PDF's table of contents (bookmarks/outline) with PyMuPDF and
    turn it into a list of unsaved Tag objects.

    doc.get_toc(simple=True) returns entries shaped like
    [[level, title, page], ...] where page is 1-based; page_from is stored
    0-based.

    page_to is the page before the next entry of the same or a higher level
    (never less than the tag's own page_from); entries with no such successor
    end on the last page of the document.

    Args:
        document_id: Id of the Document the tags belong to.
        pdf_path: Path of the PDF file to read.

    Returns:
        Tag objects in TOC order with parent links set. An empty list when
        the PDF has no table of contents.
    """
    # The context manager closes the file even if reading the TOC raises.
    with fitz.open(pdf_path) as pdf:
        toc = pdf.get_toc(simple=True) or []
        page_count = pdf.page_count

    last_page = page_count - 1
    tags = []
    # Stack of (level, Tag) for the entries that are still "open", i.e. the
    # chain of ancestors of the current entry. Levels strictly increase from
    # bottom to top.
    open_nodes = []
    for idx, (level, title, page_1_based) in enumerate(toc):
        page_from = max(0, int(page_1_based) - 1)

        # Every entry popped here has level >= the current one, so the current
        # entry is the first later entry of the same or a higher level:
        # exactly the entry that ends its page range.
        while open_nodes and open_nodes[-1][0] >= level:
            _, closed = open_nodes.pop()
            closed.page_to = max(closed.page_from, page_from - 1)

        parent = open_nodes[-1][1] if open_nodes else None
        tag = Tag(
            document_id=document_id,
            title=str(title).strip() or "Untitled",
            level=int(level),
            order_index=idx,
            parent=parent,
            page_from=page_from,
            # Default for entries that are never closed by a later entry.
            page_to=last_page,
        )
        tags.append(tag)
        open_nodes.append((level, tag))
    return tags
def build_tag_tree(tags):
    """
    Convert a flat list of Tag records into a JSON-serializable tree.

    Input: tags sorted by order_index.
    Output: list of root nodes; every node carries id, title, level,
    parent_id, page_from, page_to and a "children" list. A tag without a
    parent_id, or whose parent is not part of the input, becomes a root.
    """
    nodes = {
        t.id: {
            "id": t.id,
            "title": t.title,
            "level": t.level,
            "parent_id": t.parent_id,
            "page_from": t.page_from,
            "page_to": t.page_to,
            "children": [],
        }
        for t in tags
    }

    forest = []
    for t in tags:
        parent_node = nodes.get(t.parent_id) if t.parent_id else None
        siblings = forest if parent_node is None else parent_node["children"]
        siblings.append(nodes[t.id])
    return forest
def render_page_png_cached(pdf_path: str, document_id: int, page_index: int, cache_dir: str):
    """
    Render one PDF page to PNG, using a simple disk cache.

    Cache layout: {cache_dir}/{document_id}/{page_index}.png

    Args:
        pdf_path: Path of the source PDF.
        document_id: Id used as the cache sub-directory name.
        page_index: 0-based page index.
        cache_dir: Root directory of the page cache.

    Returns:
        Path of the PNG file as a string.

    Aborts the request with 400 for a negative index and with 404 when the
    index is beyond the last page.
    """
    if page_index < 0:
        abort(400, description="page_index must be >= 0")

    out_dir = Path(cache_dir) / str(document_id)
    out_dir.mkdir(parents=True, exist_ok=True)
    out_path = out_dir / f"{page_index}.png"
    if out_path.exists():
        return str(out_path)

    # The context manager closes the document on every exit path, including
    # abort() and rendering errors (previously it stayed open on errors).
    with fitz.open(pdf_path) as pdf:
        if page_index >= pdf.page_count:
            abort(404, description="page out of range")
        page = pdf.load_page(page_index)
        # Scale up for sharper output; 2.0-3.0 depending on needs.
        mat = fitz.Matrix(2.0, 2.0)
        pix = page.get_pixmap(matrix=mat, alpha=False)
        pix.save(str(out_path))
    return str(out_path)
if __name__ == "__main__":
    app = create_app()
    # db.create_all() is run inside an application context because the db
    # object is bound to the app only through init_app().
    with app.app_context():
        db.create_all()
    # NOTE(review): debug=True on host 0.0.0.0 looks meant for local
    # development only - confirm it is not used for a deployment.
    app.run(host="0.0.0.0", port=5000, debug=True)
前端交互流程(你要的"过程"落地成接口调用)
- 上传:POST /api/documents(表单上传 PDF)→ 返回 document_id
- 拿目录树:GET /api/documents/{document_id}/tags → 渲染树
- 点击某个标签:GET /api/tags/{tag_id} → 得到 page_from
- 展示页面:<img src="/api/documents/{document_id}/pages/{page_from}.png">
如果你后续要做得更"像 PDF 阅读器"(比如点击标签后滚动到页内位置、文本高亮、同一标签对应多个离散页、标签来自正文语义而不是目录),模型会从 page_from/page_to 扩展为"标签-页-坐标"的关联表(TagPage / TagAnchor)。你要走哪种,我也可以把下一版骨架补上。
方式 B 其实就是把「展示 PDF」这件事,完全交给后端的「按页转图片」接口来做,你前端只负责拿 img 链接(或 <Image>)。
下面我用你前面那套「标签 + 页码」的模型,直接说明方式 B 怎么落地就行。
1. 核心思路
- 数据库:还是只存一份 PDF 文件路径(Document.pdf_path),以及标签到页码的映射(Tag.page_from/page_to)。
- 展示:前端不自己解析 PDF,不用 PDF.js,而是:
- 通过标签接口拿到 document_id + page_from
- 把 <img src="/api/documents/{document_id}/pages/{page_from}.png"> 填上就能看这一页
- 后端职责:
- 接收请求:GET /api/documents/<doc_id>/pages/<page_index>.png
- 用 PyMuPDF 打开该 PDF 的 page_index 页 → 渲染成 PNG → 返回
- 做一个磁盘缓存:同一个 (document_id, page_index) 第二次就直接读 PNG 文件,不再重新渲染。
2. 按页转图片接口(方式 B 核心)
import fitz
from pathlib import Path
from flask import send_file, abort
from models import Document
def render_page_png_cached(pdf_path: str, document_id: int, page_index: int, cache_dir: str) -> str:
    """
    Render one PDF page to PNG and cache it on disk.

    The image is stored as {cache_dir}/{document_id}/{page_index}.png; when
    that file already exists it is returned without rendering again.

    Args:
        pdf_path: Path of the source PDF.
        document_id: Id used as the cache sub-directory name.
        page_index: 0-based page index.
        cache_dir: Root directory of the page cache.

    Returns:
        Path of the PNG file.

    Aborts the request with 400 for a negative index and with 404 when the
    index is beyond the last page.
    """
    if page_index < 0:
        abort(400, description="page_index must be >= 0")

    out_dir = Path(cache_dir) / str(document_id)
    out_dir.mkdir(parents=True, exist_ok=True)
    out_path = out_dir / f"{page_index}.png"
    if out_path.exists():
        return str(out_path)

    # The context manager closes the document on every exit path, including
    # abort() and rendering errors (previously it stayed open on errors).
    with fitz.open(pdf_path) as pdf:
        if page_index >= pdf.page_count:
            abort(404, description="page out of range")
        page = pdf.load_page(page_index)
        mat = fitz.Matrix(2.0, 2.0)  # zoom controls sharpness; tune between 2 and 3
        pix = page.get_pixmap(matrix=mat, alpha=False)
        pix.save(str(out_path))
    return str(out_path)
@app.get("/api/documents/<int:document_id>/pages/<int:page_index>.png")
def get_page_png(document_id: int, page_index: int):
    """
    Return the PNG of one page (0-based index) of a document.

    Responds with 404 when the document does not exist; range errors are
    reported by render_page_png_cached.

    NOTE(review): `app` is not defined in this snippet - presumably the route
    is meant to be registered inside create_app(); confirm where it lives.
    """
    # current_app was used below but never imported in this snippet; a local
    # import keeps the function self-contained.
    from flask import current_app

    doc = Document.query.get_or_404(document_id)
    png_path = render_page_png_cached(
        pdf_path=doc.pdf_path,
        document_id=document_id,
        page_index=page_index,
        cache_dir=current_app.config["PAGE_CACHE_DIR"],
    )
    return send_file(png_path, mimetype="image/png")
3. 与「标签」的配合方式
- 获取标签树:GET /api/documents/{document_id}/tags → 每个节点有 id, title, page_from, page_to
- 前端点击某个标签:
- 取出该标签的 page_from(比如是 5)
- 把阅读器的当前图片地址改成:
- 单页模式:/api/documents/{document_id}/pages/5.png
- 连续滚动模式:预加载 4,5,6 等几页图片
你可以理解为:
标签只负责告诉你「应该看第几页」;如何把「第几页」渲染出来,都是后端按页转 PNG 来做。
4. 方式 B 的一些优化点(简略)
- 懒加载:不要一次把所有页都转图,只在用户看到/即将看到这一页时请求。
- 缓存策略:
- 后端磁盘缓存 (document_id/page_index).png
- 接口加 Cache-Control,交给浏览器 / CDN 继续缓存。
- 清晰度与流量:
- 可以让前端传 ?scale=1.5/2/3 或 ?width=800,后端按参数渲染不同分辨率图片。
- 大文件 / 超长 PDF 再考虑缩略图接口(比如只生成低分辨率预览)。
前端示例:目录树 + 按页图片展示(React 简版)
假设你已经有后端这几个接口:
GET /api/documents/:documentId/tags → 返回标签树(含 id,title,page_from,children)
GET /api/documents/:documentId/pages/:pageIndex.png → 返回对应页的 PNG 图片
下面是一个极简 React 示例,目录在左边,右边展示对应页的图片。
import React, { useEffect, useState } from "react";
// 把后端返回的 tags 树当成这个类型
// {
// id: number;
// title: string;
// level: number;
// parent_id: number | null;
// page_from: number;
// page_to: number | null;
// children: TagNode[];
// }
function PdfWithToc({ documentId }) {
const [tagsTree, setTagsTree] = useState([]);
const [selectedTag, setSelectedTag] = useState(null);
const [loadingTags, setLoadingTags] = useState(false);
useEffect(() => {
async function fetchTags() {
setLoadingTags(true);
try {
const res = await fetch(`/api/documents/${documentId}/tags`);
const data = await res.json();
setTagsTree(data.tags || []); // 后端返回 { document_id, filename, tags }
} finally {
setLoadingTags(false);
}
}
fetchTags();
}, [documentId]);
const handleTagClick = (tag) => {
setSelectedTag(tag);
};
const currentPageIndex =
selectedTag && typeof selectedTag.page_from === "number"
? selectedTag.page_from
: 0;
const currentImgSrc = `/api/documents/${documentId}/pages/${currentPageIndex}.png`;
return (
<div style={{ display: "flex", height: "100vh", fontFamily: "sans-serif" }}>
{/* 左侧:目录树 */}
<div
style={{
width: 280,
borderRight: "1px solid #eee",
padding: "8px 12px",
overflowY: "auto",
}}
>
<h3 style={{ marginTop: 0 }}>目录</h3>
{loadingTags && <div>加载目录中...</div>}
{!loadingTags && tagsTree.length === 0 && <div>暂无目录</div>}
{!loadingTags &&
tagsTree.map((node) => (
<TagNode
key={node.id}
node={node}
onClick={handleTagClick}
activeId={selectedTag?.id}
/>
))}
</div>
{/* 右侧:页面图片 */}
<div
style={{
flex: 1,
display: "flex",
flexDirection: "column",
alignItems: "center",
overflowY: "auto",
}}
>
<div style={{ padding: "8px 0", fontSize: 14, color: "#666" }}>
当前页:{currentPageIndex + 1}
{selectedTag ? `(${selectedTag.title})` : "(默认第一页)"}
</div>
<div
style={{
maxWidth: "100%",
padding: "0 16px 16px",
boxSizing: "border-box",
}}
>
<img
src={currentImgSrc}
alt={`page-${currentPageIndex}`}
style={{
maxWidth: "100%",
boxShadow: "0 0 8px rgba(0,0,0,0.15)",
borderRadius: 4,
}}
/>
</div>
</div>
</div>
);
}
function TagNode({ node, onClick, activeId }) {
const isActive = node.id === activeId;
return (
<div style={{ marginLeft: (node.level - 1) * 12 }}>
<div
onClick={() => onClick(node)}
style={{
cursor: "pointer",
padding: "2px 4px",
borderRadius: 4,
backgroundColor: isActive ? "#e6f4ff" : "transparent",
color: isActive ? "#1677ff" : "#333",
fontSize: 14,
}}
>
{node.title}
{typeof node.page_from === "number" && (
<span style={{ marginLeft: 6, fontSize: 12, color: "#999" }}>
(p.{node.page_from + 1})
</span>
)}
</div>
{Array.isArray(node.children) &&
node.children.map((child) => (
<TagNode
key={child.id}
node={child}
onClick={onClick}
activeId={activeId}
/>
))}
</div>
);
}
export default PdfWithToc;
使用方式(例)
- 在你的路由或页面上:
// 假设你知道当前是 documentId = 123
<PdfWithToc documentId={123} />
这样就把你后端的「方式 B:按页转图片」和之前的「标签层级」串起来了:
点击左侧树节点 → 直接切换右侧 <img> 的页码 → 后端按页返回 PNG。
数据模型设计(base64 存数据库)
- Document:一条上传记录,content 字段存 base64 字符串(整份 PDF)
- Tag:PDF 解析出的标签/目录节点,带层级、顺序、起止页
完整 Flask 后端示例(单文件版,app.py)
import base64
import io
from datetime import datetime
import fitz # PyMuPDF
from flask import Flask, request, jsonify, send_file, abort
from flask_sqlalchemy import SQLAlchemy
from werkzeug.utils import secure_filename
# Extension object created without an app; create_app() binds it through
# db.init_app(app).
db = SQLAlchemy()
class Document(db.Model):
    """One uploaded PDF, stored entirely inside the database."""

    __tablename__ = "documents"
    id = db.Column(db.Integer, primary_key=True)
    filename = db.Column(db.String(255), nullable=False)
    # The whole PDF file, stored here as a base64-encoded string.
    content = db.Column(db.Text, nullable=False)
    created_at = db.Column(db.DateTime, default=datetime.utcnow, nullable=False)
    # Tags of this document; deleted together with it (cascade
    # "all, delete-orphan"). The backref exposes Tag.document.
    tags = db.relationship(
        "Tag", backref="document", lazy=True, cascade="all, delete-orphan"
    )
class Tag(db.Model):
    """A table-of-contents node of a document, with level, order and page range."""

    __tablename__ = "tags"
    id = db.Column(db.Integer, primary_key=True)
    document_id = db.Column(db.Integer, db.ForeignKey("documents.id"), nullable=False)
    title = db.Column(db.String(512), nullable=False)
    level = db.Column(db.Integer, nullable=False)  # 1, 2, 3...
    order_index = db.Column(db.Integer, nullable=False)  # position in the TOC
    parent_id = db.Column(db.Integer, db.ForeignKey("tags.id"), nullable=True)
    # Self-referential relationship; remote_side=[id] refers to the `id`
    # column above, so this line must stay below it. The backref adds Tag.children.
    parent = db.relationship("Tag", remote_side=[id], backref="children")
    page_from = db.Column(db.Integer, nullable=False)  # 0-based
    page_to = db.Column(db.Integer, nullable=True)  # 0-based
    created_at = db.Column(db.DateTime, default=datetime.utcnow, nullable=False)
def create_app():
    """Application factory for the "PDF stored as base64 in the database" variant.

    Configures a SQLite database (``sqlite:///app.db``), binds the db object,
    registers the routes and creates the tables.

    Returns:
        The configured Flask application.
    """
    app = Flask(__name__)
    app.config["SQLALCHEMY_DATABASE_URI"] = "sqlite:///app.db"
    app.config["SQLALCHEMY_TRACK_MODIFICATIONS"] = False
    db.init_app(app)

    @app.get("/health")
    def health():
        """Liveness probe."""
        return {"ok": True}

    @app.post("/api/documents")
    def upload_document():
        """
        Upload a PDF, parse its table of contents into tags and store both.

        form-data:
          - file: the PDF file
        Returns document_id, filename and tags_count (HTTP 201), or a JSON
        error with HTTP 400 when the upload is missing, empty or unreadable.
        """
        if "file" not in request.files:
            return {"error": "missing file"}, 400
        f = request.files["file"]
        if not f.filename:
            return {"error": "empty filename"}, 400

        # secure_filename() returns "" when nothing usable is left of the
        # name; fall back to a generic name instead of storing an empty one.
        filename = secure_filename(f.filename) or "document.pdf"
        file_bytes = f.read()
        if not file_bytes:
            return {"error": "empty file"}, 400

        content_b64 = base64.b64encode(file_bytes).decode("ascii")
        doc = Document(filename=filename, content=content_b64)
        db.session.add(doc)
        db.session.flush()  # assigns doc.id without committing

        try:
            tags = parse_pdf_toc_to_tags_from_bytes(document_id=doc.id, pdf_bytes=file_bytes)
        except Exception:
            # The upload is not checked for being a PDF beforehand, so a file
            # PyMuPDF cannot open ends up here: drop the pending document and
            # report a client error instead of an unhandled 500.
            db.session.rollback()
            return {"error": "invalid or unreadable pdf"}, 400

        db.session.add_all(tags)
        db.session.commit()
        return jsonify(
            {
                "document_id": doc.id,
                "filename": doc.filename,
                "tags_count": len(tags),
            }
        ), 201

    @app.get("/api/documents/<int:document_id>/tags")
    def get_tags_tree(document_id: int):
        """
        Return the tag tree (table of contents) of a document; 404 when the
        document does not exist.
        """
        doc = Document.query.get_or_404(document_id)
        tags = (
            Tag.query.filter_by(document_id=doc.id)
            .order_by(Tag.order_index.asc())
            .all()
        )
        return jsonify(
            {
                "document_id": doc.id,
                "filename": doc.filename,
                "tags": build_tag_tree(tags),
            }
        )

    @app.get("/api/tags/<int:tag_id>")
    def get_tag(tag_id: int):
        """
        Return the details of a single tag (including page_from/page_to);
        404 when the tag does not exist.
        """
        tag = Tag.query.get_or_404(tag_id)
        return jsonify(
            {
                "id": tag.id,
                "document_id": tag.document_id,
                "title": tag.title,
                "level": tag.level,
                "parent_id": tag.parent_id,
                "page_from": tag.page_from,
                "page_to": tag.page_to,
            }
        )

    @app.get("/api/documents/<int:document_id>/pages/<int:page_index>.png")
    def get_page_png(document_id: int, page_index: int):
        """
        Render one page (0-based index) of the stored PDF to PNG and return it.

        For simplicity every request decodes the PDF from the database and
        renders in memory; a production setup can add caching.
        """
        if page_index < 0:
            abort(400, description="page_index must be >= 0")
        doc = Document.query.get_or_404(document_id)
        pdf_bytes = base64.b64decode(doc.content)

        # The context manager closes the document on every exit path,
        # including abort() and rendering errors.
        with fitz.open(stream=pdf_bytes, filetype="pdf") as pdf:
            if page_index >= pdf.page_count:
                abort(404, description="page out of range")
            page = pdf.load_page(page_index)
            mat = fitz.Matrix(2.0, 2.0)  # zoom factor: affects sharpness and size
            pix = page.get_pixmap(matrix=mat, alpha=False)
            img_bytes = pix.tobytes("png")

        return send_file(
            io.BytesIO(img_bytes),
            mimetype="image/png",
            as_attachment=False,
            download_name=f"page-{page_index}.png",
        )

    with app.app_context():
        db.create_all()
    return app
def parse_pdf_toc_to_tags_from_bytes(document_id: int, pdf_bytes: bytes):
    """
    Parse the table of contents of an in-memory PDF into unsaved Tag objects.

    doc.get_toc(simple=True) returns [[level, title, page], ...] with 1-based
    pages; page_from is stored 0-based.

    page_to is the page before the next entry of the same or a higher level
    (never less than the tag's own page_from); entries with no such successor
    end on the last page of the document.

    Args:
        document_id: Id of the Document the tags belong to.
        pdf_bytes: Raw bytes of the PDF.

    Returns:
        Tag objects in TOC order with parent links set. An empty list when
        the PDF has no table of contents.
    """
    # The context manager closes the document even if reading the TOC raises.
    with fitz.open(stream=pdf_bytes, filetype="pdf") as pdf:
        toc = pdf.get_toc(simple=True) or []
        page_count = pdf.page_count

    last_page = page_count - 1
    tags = []
    # Stack of (level, Tag) for the entries that are still "open", i.e. the
    # chain of ancestors of the current entry. Levels strictly increase from
    # bottom to top.
    open_nodes = []
    for idx, (level, title, page_1_based) in enumerate(toc):
        page_from = max(0, int(page_1_based) - 1)

        # Every entry popped here has level >= the current one, so the current
        # entry is the first later entry of the same or a higher level:
        # exactly the entry that ends its page range.
        while open_nodes and open_nodes[-1][0] >= level:
            _, closed = open_nodes.pop()
            closed.page_to = max(closed.page_from, page_from - 1)

        parent = open_nodes[-1][1] if open_nodes else None
        tag = Tag(
            document_id=document_id,
            title=str(title).strip() or "Untitled",
            level=int(level),
            order_index=idx,
            parent=parent,
            page_from=page_from,
            # Default for entries that are never closed by a later entry.
            page_to=last_page,
        )
        tags.append(tag)
        open_nodes.append((level, tag))
    return tags
def build_tag_tree(tags):
    """
    Convert a flat list of Tag records into a JSON-serializable tree.

    Every node carries id, title, level, parent_id, page_from, page_to and a
    "children" list. A tag without a parent_id, or whose parent is not part
    of the input, becomes a root node.
    """
    nodes = {
        t.id: {
            "id": t.id,
            "title": t.title,
            "level": t.level,
            "parent_id": t.parent_id,
            "page_from": t.page_from,
            "page_to": t.page_to,
            "children": [],
        }
        for t in tags
    }

    forest = []
    for t in tags:
        parent_node = nodes.get(t.parent_id) if t.parent_id else None
        siblings = forest if parent_node is None else parent_node["children"]
        siblings.append(nodes[t.id])
    return forest
if __name__ == "__main__":
    # create_app() already creates the tables, so the server can start right away.
    app = create_app()
    # NOTE(review): debug=True on host 0.0.0.0 looks meant for local
    # development only - confirm it is not used for a deployment.
    app.run(host="0.0.0.0", port=5000, debug=True)
文件存放形式说明
- 当前示例的方式:
- 上传的 PDF → file_bytes = file.read()
- base64.b64encode(file_bytes).decode("ascii") → 存到 Document.content(Text 类型)
- 数据库里是一长串 base64 字符串,不再在磁盘上单独保存 PDF 文件。
- 后续使用时:
- 需要操作 PDF(解析目录、渲染页面等)时,用 base64.b64decode(document.content) 得到原始 bytes,再通过 fitz.open(stream=bytes, filetype="pdf") 在内存中打开。
> 从工程角度讲:
> 更推荐 直接存原始二进制(LargeBinary)或存到对象存储/磁盘上记录路径,
> 因为 base64 会多占约 33% 空间、传输也更大。
> 但如果你就是想「所有内容都在一张表里」,这个 base64 方案是可行的。
后续操作流程(从上传到前端展示)
- 上传:
- 前端 POST /api/documents(form-data: file)
- 后端:读取 PDF → 转 base64 存 Document.content → 用 fitz 从 bytes 解析 TOC → 保存 Tag → 返回 document_id
- 展示目录树:
- 前端 GET /api/documents/{document_id}/tags → 拿到树状标签(包含 page_from/page_to)
- 用户点击某个标签:
- 前端取出该标签的 page_from,比如 5
- <img src="/api/documents/{document_id}/pages/5.png">
- 后端按页转图片:
- GET /api/documents/{document_id}/pages/{page_index}.png
- 后端:查出 Document → base64.b64decode(content) → fitz.open → load_page(page_index) → 渲染为 PNG → 通过 send_file 返回