OpenAI API + LangChain 分析小型 PDF 文档

声明:该版代码在2024.08.23有效。

代码如下:

python 复制代码
from langchain_community.document_loaders import PyPDFLoader
import getpass
import os
from langchain_openai import ChatOpenAI
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

class QA:
    """
    Answer a single question about a PDF using a retrieval-augmented chain.

    The pipeline is: load the PDF, split it into overlapping chunks, embed the
    chunks into a Chroma vector store, retrieve the chunks relevant to the
    question and let a chat model answer from them.

    Attributes:
        question (str): The question to be answered about the PDF.
        pdf_path (str): Path to the PDF file.
        model_name (str): Name of the chat model used to answer. An empty
            value falls back to ``DEFAULT_MODEL``.
        docs (list | None): Loaded PDF documents, or None before load_file().
        vecstore (Chroma | None): Vector store holding the chunk embeddings,
            or None before split_and_store().

    Methods:
        set_environ(): Set environment variables for the OpenAI API.
        load_file(): Load a PDF file using PyPDFLoader.
        split_and_store(): Split the PDF text and store embeddings using Chroma.
        retrieve_pdf(): Retrieve and answer questions based on the PDF content.
        run(): Execute the four steps above in order.
    """

    # Model used when no model name was supplied (the value the original
    # script always used).
    DEFAULT_MODEL = "gpt-4o"
    # Proxy written to OPENAI_PROXY when the variable is not already set.
    DEFAULT_PROXY = 'http://127.0.0.1:20171'

    def __init__(self, question, pdf_path, model_name):
        """
        Initializes the QA object with provided question, PDF path, and model name.

        Parameters:
            question (str): The question to be answered about the PDF.
            pdf_path (str): Path to the PDF file.
            model_name (str): Name of the chat model used for analysis.
        """
        self.question = question
        self.pdf_path = pdf_path
        self.model_name = model_name
        self.docs = None
        self.vecstore = None

    def _resolve_model(self):
        """
        Return the chat model name to use.

        Returns:
            str: ``model_name`` with surrounding whitespace removed, or
            ``DEFAULT_MODEL`` when ``model_name`` is None, empty or blank.
        """
        chosen = (self.model_name or "").strip()
        return chosen or self.DEFAULT_MODEL

    def set_environ(self):
        """
        Sets the environment variables necessary for OpenAI API authentication.

        Values that are already present in the environment are kept. The API
        key is only prompted for when OPENAI_API_KEY is missing or empty, and
        it is read with getpass so the secret is not echoed to the terminal.
        """
        if not os.environ.get('OPENAI_API_KEY'):
            os.environ['OPENAI_API_KEY'] = getpass.getpass("your api:")
        os.environ.setdefault('OPENAI_PROXY', self.DEFAULT_PROXY)

    def load_file(self):
        """
        Loads the PDF file specified by the pdf_path attribute using PyPDFLoader.

        The loaded documents are stored in the ``docs`` attribute.
        """
        loader = PyPDFLoader(self.pdf_path)
        self.docs = loader.load()

    def split_and_store(self):
        """
        Splits the loaded PDF text into chunks and stores their embeddings.

        Chunks are 1000 characters with a 200-character overlap. The resulting
        Chroma store is saved in the ``vecstore`` attribute.

        Raises:
            RuntimeError: If load_file() has not been called yet.
            ValueError: If splitting produced no chunks, i.e. there is no text
                to embed.
        """
        if self.docs is None:
            raise RuntimeError("No documents loaded; call load_file() first.")

        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
        splits = text_splitter.split_documents(self.docs)
        if not splits:
            # Fail here with a clear message rather than handing an empty
            # batch to the vector store.
            raise ValueError(f"No text could be extracted from {self.pdf_path!r}.")

        self.vecstore = Chroma.from_documents(documents=splits, embedding=OpenAIEmbeddings())

    def retrieve_pdf(self):
        """
        Retrieves context from the vector store and generates an answer to the
        input question using a retrieval-augmented generation chain.

        The answer is printed, as before, and also returned so the class can
        be used programmatically.

        Returns:
            str: The answer produced by the chain.

        Raises:
            RuntimeError: If split_and_store() has not been called yet.
        """
        if self.vecstore is None:
            raise RuntimeError("No vector store available; call split_and_store() first.")

        retriever = self.vecstore.as_retriever()
        # Honour the model requested by the caller instead of a fixed model.
        llm = ChatOpenAI(model=self._resolve_model())

        system_prompt = (
            "You are an assistant for question-answering tasks. "
            "Use the following pieces of retrieved context to answer "
            "the question. If you don't know the answer, say that you "
            "don't know. Use three sentences maximum and keep the "
            "answer concise."
            "\n\n"
            "{context}"
        )

        prompt = ChatPromptTemplate.from_messages(
            [
                ("system", system_prompt),
                ("human", "{input}"),
            ]
        )

        question_answer_chain = create_stuff_documents_chain(llm, prompt)
        rag_chain = create_retrieval_chain(retriever, question_answer_chain)

        results = rag_chain.invoke({"input": self.question})
        answer = results['answer']

        print(answer)
        return answer

    def run(self):
        """
        Runs the whole pipeline: environment setup, load, index, answer.

        Returns:
            str: The answer returned by retrieve_pdf().
        """
        self.set_environ()
        self.load_file()
        self.split_and_store()
        return self.retrieve_pdf()

def __main__():
    """
    Interactive entry point for the PDF question-answering script.

    Asks, in order, for the question, the PDF path and the model name, then
    builds a QA object from those three answers and runs its pipeline.
    """
    prompts = (
        "Your question:",
        "Enter the path of the pdf file:",
        "Enter the model name:",
    )
    # The prompts are listed in the positional order QA expects
    # (question, pdf_path, model_name), so the answers unpack directly.
    answers = [input(text) for text in prompts]
    QA(*answers).run()

# Run the interactive entry point only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    __main__()
相关推荐
LitchiCheng1 分钟前
Mujoco 使用 Pinocchio 进行逆动力学及阻抗力矩控制维持当前位置
人工智能·python
殇者知忧24 分钟前
凯斯西储(CWRU)数据集解读与数据读取
python·凯斯西储(cwru)数据集
deephub1 小时前
Scikit-Learn 1.8引入 Array API,支持 PyTorch 与 CuPy 张量的原生 GPU 加速
人工智能·pytorch·python·机器学习·scikit-learn
free-elcmacom1 小时前
机器学习高阶教程<11>当数据开始“折叠”:流形学习与深度神经网络如何发现世界的隐藏维度
人工智能·python·神经网络·学习·算法·机器学习·dnn
月明长歌1 小时前
Java数据结构:PriorityQueue堆与优先级队列:从概念到手写大根堆
java·数据结构·python·leetcode·
波克布林的矩阵6332 小时前
VS code为python文件配置默认模板
python
dhdjjsjs2 小时前
Day44 PythonStudy
python
love530love2 小时前
在 PyCharm 中配置 x64 Native Tools Command Prompt for VS 2022 作为默认终端
ide·人工智能·windows·python·pycharm·prompt·comfyui
柒.梧.2 小时前
CSS 基础样式与盒模型详解:从入门到实战进阶
人工智能·python·tensorflow
free-elcmacom2 小时前
机器学习高阶教程<9>从实验室到生产线:机器学习模型推理与部署优化实战指南
人工智能·python·机器学习