This article shows how to use the fitz module (PyMuPDF) to process PDF documents, and then uses a large language model to answer questions about the PDF content, improving the Q&A experience.
Technical approach
Text-based PDF Q&A
- Use the fitz module to process the PDF document
- Use a large language model to answer questions about the PDF content
Scanned PDF Q&A
- Convert each PDF page to an image
- Use the PaddleOCR model to recognize the text in each image
- Use a large language model to answer questions about the PDF content
PDF document processing
For the details of PDF processing, see the PDF section of: https://blog.csdn.net/zhanghan11366/article/details/137471811?spm=1001.2014.3001.5502
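For readers who do not follow the link, here is a minimal sketch of the fitz basics relied on below (opening a document, reading the page count and metadata, extracting a page's text); the file path is only a placeholder:

import fitz  # PyMuPDF

doc = fitz.open("../data/example.pdf")  # placeholder path
print("page count:", doc.page_count)
print("metadata:", doc.metadata)
# Plain text of the first page
print(doc.load_page(0).get_text()[:200])
doc.close()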
Building on that PDF processing, we will combine it with a large language model (OpenAI) to answer questions about PDF documents.
Text-based PDF
For a text-based PDF, fitz can easily extract the plain text of the document; that text is then handed to a large language model for Q&A (a simplified form of RAG).
Example Python code:
# -*- coding: utf-8 -*-
import os

import openai
import fitz

# Set the proxy for a local VPN
os.environ["http_proxy"] = "http://127.0.0.1:7890"
os.environ["https_proxy"] = "http://127.0.0.1:7890"

openai.api_key = "api key"


def get_pdf_content(pdf_path: str) -> str:
    doc = fitz.open(pdf_path)
    num_pages = doc.page_count
    text_content_list = []
    # Read the text of every page in the PDF
    for page_index in range(num_pages):
        page = doc.load_page(page_index)
        text = page.get_text()
        text_content_list.append(text)
    # Merge the text of all pages
    return ''.join(text_content_list)


def get_answer(pdf_content: str, query: str) -> str:
    # Uses the legacy ChatCompletion interface (openai < 1.0)
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": f"The full text of PDF file is: {pdf_content}"},
            {"role": "user", "content": query}
        ],
        max_tokens=1000
    )
    answer = response['choices'][0]['message']['content']
    return answer


if __name__ == '__main__':
    # Example usage --- make sure to update the PDF path
    pdf_content = get_pdf_content("../data/oppo_n3_flip.pdf")
    queries = [
        "OPPO Find N3 Flip的价格?",
        "蚂蚁集团发布的大模型叫什么?",
        "混元大模型是什么时候发布的?"
    ]
    # Print the answer to every question
    for query in queries:
        answer = get_answer(pdf_content=pdf_content, query=query)
        print(f"query: {query},\nRAG answer: {answer}")
The results are as follows:
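One caveat of this simplified approach: the entire PDF text is placed into a single prompt, so a long document can exceed the model's context window. A minimal, hypothetical safeguard (the character budget below is an arbitrary illustration, not a tuned value) is to cap the text before building the messages; a full RAG pipeline would instead chunk the document and retrieve only the relevant passages.

MAX_PDF_CHARS = 8000  # hypothetical budget; adjust to the model's actual context window

def truncate_pdf_content(pdf_content: str, max_chars: int = MAX_PDF_CHARS) -> str:
    # Keep only the leading portion of the text when the document is too long
    return pdf_content[:max_chars]

# Usage: answer = get_answer(pdf_content=truncate_pdf_content(pdf_content), query=query)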
Scanned PDF
For a scanned PDF (also called an image-based PDF), extracting text directly with fitz is difficult. Instead, each PDF page is first converted to an image and OCR is applied to obtain the text. Naturally, the quality of the extracted text depends mainly on the quality of the scan and the accuracy of the OCR.
The fitz module can convert each PDF page into an image; in this article, PaddlePaddle's PaddleOCR model is used for text recognition.
Run PaddleOCR
hub serving start -m ch_pp-ocrv3
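Note that, in the standard PaddleHub workflow, the ch_pp-ocrv3 module usually has to be installed before it can be served (typically with `hub install ch_pp-ocrv3`; check the PaddleHub documentation for your version). Once running, the service listens on port 8866 by default, which matches the URL used in the code below.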
Convert the PDF to images
def convert_pdf_2_img(pdf_file: str, pages: int) -> None:
    pdf_document = fitz.open(pdf_file)
    # Iterate through each page and convert it to an image
    for page_number in range(pages):
        # Get the page
        page = pdf_document[page_number]
        # Convert the page to an image
        pix = page.get_pixmap()
        # Create a Pillow Image object from the pixmap
        image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
        # Save the image
        image.save(f"../output/book1_{page_number + 1}.png")
    # Close the PDF file
    pdf_document.close()
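A practical note on OCR quality: get_pixmap renders at PyMuPDF's default resolution, which may be too low for dense scans. PyMuPDF accepts a transformation matrix to render at a higher resolution; a sketch of such a variant (the 2x zoom factor is just an illustrative choice) follows:

def convert_pdf_2_img_hires(pdf_file: str, pages: int, zoom: float = 2.0) -> None:
    pdf_document = fitz.open(pdf_file)
    # Render each page at `zoom` times the default resolution before saving
    matrix = fitz.Matrix(zoom, zoom)
    for page_number in range(pages):
        page = pdf_document[page_number]
        pix = page.get_pixmap(matrix=matrix)
        image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
        image.save(f"../output/book1_{page_number + 1}.png")
    pdf_document.close()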
Full code
# -*- coding: utf-8 -*-
import fitz
from PIL import Image
import cv2
import json
import base64
import requests
from openai import OpenAI
import httpx

client = OpenAI(
    base_url="https://api.xty.app/v1",
    api_key="api_key",
    http_client=httpx.Client(
        base_url="https://api.xty.app/v1",
        follow_redirects=True,
    ),
)


def convert_pdf_2_img(pdf_file: str, pages: int) -> None:
    pdf_document = fitz.open(pdf_file)
    # Iterate through each page and convert it to an image
    for page_number in range(pages):
        # Get the page
        page = pdf_document[page_number]
        # Convert the page to an image
        pix = page.get_pixmap()
        # Create a Pillow Image object from the pixmap
        image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
        # Save the image
        image.save(f"../output/book1_{page_number + 1}.png")
    # Close the PDF file
    pdf_document.close()


def cv2_to_base64(image):
    # Encode an OpenCV image as a base64 JPEG string for the OCR service
    data = cv2.imencode('.jpg', image)[1]
    return base64.b64encode(data.tobytes()).decode('utf8')


def image_ocr(image_path):
    # Call the PaddleOCR serving endpoint and join the recognized text lines
    data = {'images': [cv2_to_base64(cv2.imread(image_path))]}
    headers = {"Content-type": "application/json"}
    url = "http://0.0.0.0:8866/predict/ch_pp-ocrv3"
    r = requests.post(url=url, headers=headers, data=json.dumps(data))
    if r.json()["results"]:
        return "\n".join([ocr_record["text"].strip() for ocr_record in r.json()["results"][0]["data"]])
    else:
        return ""


def get_answer(pdf_content: str, query: str) -> str:
    response = client.chat.completions.create(
        model="gpt-4-vision-preview",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": f"The full text of PDF file is: {pdf_content}"},
            {"role": "user", "content": query}
        ],
        max_tokens=1000
    )
    content = response.choices[0].message.content
    return content


if __name__ == '__main__':
    test_pdf_file = "../data/book1.pdf"
    convert_pdf_2_img(pdf_file=test_pdf_file, pages=2)
    # OCR the second rendered page of the PDF
    page_ocr_result = image_ocr("../output/book1_2.png")
    print(f"Recognized text: {page_ocr_result}")
    query1 = "破浪理论的创始人是谁,他的出生年月?"
    predict_answer = get_answer(pdf_content=page_ocr_result, query=query1)
    print("Answer:", predict_answer)
    query2 = "这本书的作者是谁?"
    predict_answer = get_answer(pdf_content=page_ocr_result, query=query2)
    print("Answer:", predict_answer)
The results are as follows:
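The example above only OCRs a single rendered page. If the whole scanned document should be available to the model, a simple extension (a sketch reusing convert_pdf_2_img and image_ocr above; the page count and file-name prefix are assumptions matching the example paths) is:

def ocr_whole_pdf(pdf_file: str, pages: int, image_prefix: str = "../output/book1") -> str:
    # Render every page to an image, OCR each one, then join the recognized text
    convert_pdf_2_img(pdf_file=pdf_file, pages=pages)
    page_texts = []
    for page_number in range(1, pages + 1):
        page_texts.append(image_ocr(f"{image_prefix}_{page_number}.png"))
    return "\n".join(page_texts)

The concatenated text can then be passed to get_answer in the same way as page_ocr_result above.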