PDF文档智能问答

在本文中，将会介绍如何使用fitz模块来处理PDF文档，并在此基础上，使用大模型对PDF文档内容进行智能问答，提升问答效果。

技术逻辑

文字版PDF智能问答

使用fitz模块处理PDF文档
用大模型对PDF文档内容进行智能问答

扫描版PDF

将PDF转化为图片
使用PaddleOCR模型进行文字识别
用大模型对PDF文档内容进行智能问答

PDF文档处理

请参考以下文章中的PDF部分：https://blog.csdn.net/zhanghan11366/article/details/137471811?spm=1001.2014.3001.5502

基于上述的PDF文档处理，我们将结合大模型（OpenAI）对PDF文档内容进行问答。

文字版PDF

文字版PDF可使用fitz轻松获取PDF文档中的纯文字内容，再使用大模型进行问答（简化版RAG）。

示例Python代码如下：

复制代码

# -*- coding: utf-8 -*-

import os
import openai
import fitz

# Send HTTP and HTTPS traffic through the proxy exposed by the local VPN.
os.environ.update(
    {
        "http_proxy": "http://127.0.0.1:7890",
        "https_proxy": "http://127.0.0.1:7890",
    }
)

# Placeholder value: replace it with a real OpenAI API key before running.
openai.api_key = "api key"


def get_pdf_content(pdf_path: str) -> str:
    """Return the plain text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all pages joined without any separator.
    """
    # The context manager closes the document even if text extraction raises;
    # the previous version never closed it at all.
    with fitz.open(pdf_path) as doc:
        return ''.join(
            doc.load_page(page_index).get_text()
            for page_index in range(doc.page_count)
        )

def get_answer(pdf_content: str, query: str) -> str:
    """Ask the chat model a question about the supplied PDF text.

    Args:
        pdf_content: Text of the PDF, sent to the model as context.
        query: The question to answer.

    Returns:
        The message content of the first choice in the response.
    """
    conversation = [
        {"role": "system", "content": "You are a helpful assistant."},
        # The document text goes in its own user message ahead of the question.
        {"role": "user", "content": f"The full text of PDF file is: {pdf_content}"},
        {"role": "user", "content": query},
    ]
    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=conversation,
        max_tokens=1000,
    )
    first_choice = completion['choices'][0]
    return first_choice['message']['content']

if __name__ == '__main__':
    # Example usage --- point the path below at your own PDF.
    document_text = get_pdf_content("../data/oppo_n3_flip.pdf")

    # Ask each question in turn and print its answer.
    for question in (
        "OPPO Find N3 Flip的价格？",
        "蚂蚁集团发布的大模型叫什么？",
        "混元大模型是什么时候发布的？",
    ):
        reply = get_answer(pdf_content=document_text, query=question)
        print(f"query:{question},\n RAG answer:{reply}")

结果如下：

扫描版PDF

而对于扫描版PDF（也称为影印版PDF），直接使用fitz获取其中的文字内容是困难的。因此，需要先将每一页PDF转化为图片，再使用OCR技术识别图片中的文字，这样就能获取扫描版PDF中的文字。当然，获取文本的质量主要取决于PDF文档的质量及OCR识别效果。

可以使用fitz模块将每一页PDF转化为图片，同时，本文中采用PaddlePaddle的PaddleOCR模型进行文字识别。

运行paddleOCR

复制代码

# Start the hub serving process with the ch_pp-ocrv3 module; the Python code
# in this article posts images to its /predict/ch_pp-ocrv3 endpoint.
hub serving start -m ch_pp-ocrv3

PDF转化为图片

复制代码

def convert_pdf_2_img(pdf_file: str, pages: int) -> None:
    """Render the leading pages of a PDF to PNG files.

    Page ``i`` (1-based) is written to ``../output/book1_{i}.png``.

    Args:
        pdf_file: Path of the PDF to render.
        pages: Number of pages to convert, counted from the first page.
            Values larger than the document's page count are capped, so a
            short document no longer fails with an index error part-way
            through the conversion.
    """
    # The context manager closes the document even when rendering or saving
    # raises; the previous explicit close() was skipped on any exception.
    with fitz.open(pdf_file) as pdf_document:
        page_total = min(pages, pdf_document.page_count)
        for page_number in range(page_total):
            # Rasterize the page into a pixmap.
            pix = pdf_document[page_number].get_pixmap()
            # Wrap the raw pixmap samples in a Pillow image.
            # NOTE(review): "RGB" assumes 3-channel samples without alpha,
            # presumably what get_pixmap() yields by default -- confirm.
            image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
            image.save(f"../output/book1_{page_number + 1}.png")

完整代码

复制代码

# -*- coding: utf-8 -*-
import fitz
from PIL import Image
import cv2
import json
import base64
import requests
from openai import OpenAI
import httpx
from pprint import pprint

# The SDK client and its underlying HTTP client are configured with the same
# base URL; redirects are followed automatically.
_http_transport = httpx.Client(
    base_url="https://api.xty.app/v1",
    follow_redirects=True,
)
client = OpenAI(
    base_url="https://api.xty.app/v1",
    # Placeholder value: replace it with a real API key before running.
    api_key="api_key ",
    http_client=_http_transport,
)


def convert_pdf_2_img(pdf_file: str, pages: int) -> None:
    """Render the leading pages of a PDF to PNG files.

    Page ``i`` (1-based) is written to ``../output/book1_{i}.png``.

    Args:
        pdf_file: Path of the PDF to render.
        pages: Number of pages to convert, counted from the first page.
            Values larger than the document's page count are capped, so a
            short document no longer fails with an index error part-way
            through the conversion.
    """
    # The context manager closes the document even when rendering or saving
    # raises; the previous explicit close() was skipped on any exception.
    with fitz.open(pdf_file) as pdf_document:
        page_total = min(pages, pdf_document.page_count)
        for page_number in range(page_total):
            # Rasterize the page into a pixmap.
            pix = pdf_document[page_number].get_pixmap()
            # Wrap the raw pixmap samples in a Pillow image.
            # NOTE(review): "RGB" assumes 3-channel samples without alpha,
            # presumably what get_pixmap() yields by default -- confirm.
            image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
            image.save(f"../output/book1_{page_number + 1}.png")


def cv2_to_base64(image):
    """Encode an OpenCV image as JPEG and return it as a base64 string."""
    # imencode returns a (success flag, encoded buffer) pair; only the buffer
    # is used here.
    _, jpeg_buffer = cv2.imencode('.jpg', image)
    raw_bytes = jpeg_buffer.tobytes()
    return base64.b64encode(raw_bytes).decode('utf8')


def image_ocr(image_path):
    """Run OCR on an image file through the ch_pp-ocrv3 HTTP service.

    Args:
        image_path: Path of the image to recognize.

    Returns:
        The recognized text fragments, stripped and joined with newlines, or
        an empty string when the service reports no results.

    Raises:
        FileNotFoundError: If the image cannot be read from ``image_path``.
        requests.HTTPError: If the service answers with an error status.
    """
    image = cv2.imread(image_path)
    # cv2.imread signals an unreadable file by returning None instead of
    # raising, which previously surfaced later as an obscure encoding error.
    if image is None:
        raise FileNotFoundError(f"Cannot read image: {image_path}")

    data = {'images': [cv2_to_base64(image)]}
    headers = {"Content-type": "application/json"}
    url = "http://0.0.0.0:8866/predict/ch_pp-ocrv3"
    r = requests.post(url=url, headers=headers, data=json.dumps(data))
    # Fail loudly on HTTP errors rather than trying to parse an error body.
    r.raise_for_status()

    # Decode the response body once and reuse it.
    results = r.json()["results"]
    if not results:
        return ""
    return "\n".join(ocr_record["text"].strip() for ocr_record in results[0]["data"])


def get_answer(pdf_content: str, query: str) -> str:
    """Ask the chat model a question about the supplied PDF text.

    Args:
        pdf_content: Text extracted from the PDF, sent to the model as context.
        query: The question to answer.

    Returns:
        The message content of the first choice in the response.
    """
    conversation = [
        {"role": "system", "content": "You are a helpful assistant."},
        # The document text goes in its own user message ahead of the question.
        {"role": "user", "content": f"The full text of PDF file is: {pdf_content}"},
        {"role": "user", "content": query},
    ]
    completion = client.chat.completions.create(
        model="gpt-4-vision-preview",
        messages=conversation,
        max_tokens=1000,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content


if __name__ == '__main__':
    source_pdf = "../data/book1.pdf"
    convert_pdf_2_img(pdf_file=source_pdf, pages=2)

    # OCR the image rendered from the second page.
    recognized_text = image_ocr("../output/book1_2.png")
    print(f"识别文字内容: {recognized_text}")

    # Ask each question against the recognized text and print the reply.
    for question in (
        "破浪理论的创始人是谁，他的出生年月？",
        "这本书的作者是谁？",
    ):
        reply = get_answer(pdf_content=recognized_text, query=question)
        print("回答:", reply)

结果如下：