1. RAG from PDF data
```python
import os

from dotenv import load_dotenv
from langchain.chains import RetrievalQA
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.embeddings import FakeEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_openai import ChatOpenAI
from langchain_text_splitters import RecursiveCharacterTextSplitter

load_dotenv()
# Load the PDF file
if __name__ == '__main__':
    file_path = r"E:\my_code\llm_system\resource2_RAG\llama2.pdf"
    loader = PyPDFLoader(file_path)
    docs = loader.load()

    # Split the text into smaller document chunks
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=256,    # maximum number of characters per chunk
        chunk_overlap=50,  # number of overlapping characters between chunks
    )
    split_docs = text_splitter.split_documents(docs)
    # Embed the text. Note that size is not the same thing as chunk_size:
    # size is the dimensionality of each chunk's vector, commonly 768 or 1024.
    # FakeEmbeddings returns random vectors and is only useful for wiring up
    # the pipeline; swap in a real embedding model for meaningful retrieval.
    embedding = FakeEmbeddings(size=768)
    # Encode the document chunks
    embeddings = embedding.embed_documents([doc.page_content for doc in split_docs])

    # Build the vector store with FAISS.from_embeddings,
    # which expects a list of (text, embedding) tuples
    text_embeddings = list(zip([doc.page_content for doc in split_docs], embeddings))
    # The metadata has to be extracted separately
    metadatas = [doc.metadata for doc in split_docs]
    vectordb = FAISS.from_embeddings(text_embeddings, embedding, metadatas=metadatas)
    # Alternatively, use FAISS.from_documents, which calls the embedding
    # object's .embed_documents() method internally to generate the vectors:
    # vectordb = FAISS.from_documents(split_docs, embedding)
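
    # Optional sanity check before wiring up the chain: query the index
    # directly. The query string is just an illustrative example, and with
    # FakeEmbeddings the hits are random, so this is only meaningful once a
    # real embedding model is plugged in.
    hits = vectordb.similarity_search("What is Llama 2?", k=2)
    for hit in hits:
        print(hit.metadata, hit.page_content[:80])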
    index_folder_path = "data/faiss_index"
    index_name = "0"
    # Save the index
    vectordb.save_local(index_folder_path, index_name)
    # Load the index. allow_dangerous_deserialization is required when
    # loading index files produced by an untrusted source.
    vectordb = FAISS.load_local(index_folder_path, embedding, index_name,
                                allow_dangerous_deserialization=True)
    # Create the retriever
    retriever = vectordb.as_retriever(search_kwargs={"k": 2})
    # Create the model. The API key can be hard-coded,
    # but reading it from the environment is safer.
    api_key = os.getenv("DEEPSEEK_API_KEY")
    llm = ChatOpenAI(temperature=0, model_name="deepseek-chat", api_key=api_key,
                     base_url="https://api.deepseek.com/beta")

    # Create the chain
    chain = RetrievalQA.from_chain_type(llm=llm,
                                        chain_type="stuff",
                                        retriever=retriever,
                                        return_source_documents=True)
    # Run the chain
    response = chain.invoke("What is Llama 2?")
    print(response)
```
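
For real retrieval quality, FakeEmbeddings should be replaced with an actual embedding model. A minimal sketch using HuggingFaceBgeEmbeddings (the model name is an assumption; it requires the sentence-transformers package and downloadable weights):

```python
from langchain_community.embeddings import HuggingFaceBgeEmbeddings

# BGE models are usually run with normalized embeddings; any BGE checkpoint
# (or another embedding model entirely) can be substituted here.
embedding = HuggingFaceBgeEmbeddings(
    model_name="BAAI/bge-small-en-v1.5",
    model_kwargs={"device": "cpu"},
    encode_kwargs={"normalize_embeddings": True},
)
```

The rest of the pipeline (from_embeddings / from_documents, save_local, as_retriever) is unchanged.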
2. RAG evaluation
```python
import os

from dotenv import load_dotenv
from langchain.chains import RetrievalQA
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.embeddings import FakeEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_openai import ChatOpenAI
from langchain_text_splitters import RecursiveCharacterTextSplitter

load_dotenv()
# Load the PDF file
if __name__ == '__main__':
    file_path = r"E:\my_code\llm_system\resource2_RAG\llama2.pdf"
    loader = PyPDFLoader(file_path)
    docs = loader.load()

    # Split the text into smaller document chunks
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=256,    # maximum number of characters per chunk
        chunk_overlap=50,  # number of overlapping characters between chunks
    )
    split_docs = text_splitter.split_documents(docs)
    # Embed the text. Note that size is not the same thing as chunk_size:
    # size is the dimensionality of each chunk's vector, commonly 768 or 1024.
    embedding = FakeEmbeddings(size=768)
    # Encode the document chunks
    embeddings = embedding.embed_documents([doc.page_content for doc in split_docs])

    # Build the vector store with FAISS.from_embeddings,
    # which expects a list of (text, embedding) tuples
    text_embeddings = list(zip([doc.page_content for doc in split_docs], embeddings))
    # The metadata has to be extracted separately
    metadatas = [doc.metadata for doc in split_docs]
    vectordb = FAISS.from_embeddings(text_embeddings, embedding, metadatas=metadatas)
    # Alternatively, use FAISS.from_documents, which calls the embedding
    # object's .embed_documents() method internally to generate the vectors:
    # vectordb = FAISS.from_documents(split_docs, embedding)
    index_folder_path = "data/faiss_index"
    index_name = "0"
    # Save the index
    vectordb.save_local(index_folder_path, index_name)
    # Load the index. allow_dangerous_deserialization is required when
    # loading index files produced by an untrusted source.
    vectordb = FAISS.load_local(index_folder_path, embedding, index_name,
                                allow_dangerous_deserialization=True)
    # Create the retriever
    retriever = vectordb.as_retriever(search_kwargs={"k": 2})
    # Create the model. The API key can be hard-coded,
    # but reading it from the environment is safer.
    api_key = os.getenv("DEEPSEEK_API_KEY")
    # Some ragas components read OPENAI_API_KEY from the environment
    os.environ["OPENAI_API_KEY"] = api_key
    llm = ChatOpenAI(temperature=0, model_name="deepseek-chat", api_key=api_key,
                     base_url="https://api.deepseek.com/beta")

    # Create the chain
    chain = RetrievalQA.from_chain_type(llm=llm,
                                        chain_type="stuff",
                                        retriever=retriever,
                                        return_source_documents=True)
    # Run the chain
    # response = chain.invoke("What is Llama 2?")
    # print(response)
    questions = ["How many parameters does Llama 2 have?"]
    ground_truths = ["7B, 13B, and 70B"]
    # Generate answers and contexts
    answers = []
    contexts = []
    for question in questions:
        print(question)
        response = chain.invoke(question)
        print(response['result'], "\n")
        answers.append(response['result'])
        contexts.append([doc.page_content for doc in response['source_documents']])
    from datasets import Dataset

    evaluate_data = {
        "question": questions,
        "answer": answers,
        "contexts": contexts,
        "ground_truth": ground_truths,
    }
    evaluate_dataset = Dataset.from_dict(evaluate_data)
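    # Note: these column names (question / answer / contexts / ground_truth)
    # follow the legacy ragas 0.1.x schema; ragas 0.2+ renamed them to
    # user_input / response / retrieved_contexts / reference.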
    # Evaluate the RAG pipeline
    from ragas import evaluate
    from ragas.llms import LangchainLLMWrapper
    from ragas.metrics import (
        faithfulness,
        answer_relevancy,
        context_recall,
        context_precision,
    )

    # deepseek-chat is used as the evaluator here; a stronger model such as
    # gpt-4o tends to score more reliably but costs more.
    evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model_name="deepseek-chat",
                                                   api_key=api_key,
                                                   base_url="https://api.deepseek.com/beta"))
    metrics = [
        faithfulness,       # is the answer grounded in the retrieved contexts?
        answer_relevancy,   # does the answer actually address the question?
        context_precision,  # are the retrieved contexts relevant to the question?
        context_recall,     # do the contexts cover the ground truth?
    ]
    # Run the evaluation. Note that answer_relevancy depends on the embedding
    # model, so FakeEmbeddings makes that score meaningless; use a real
    # embedding model for trustworthy numbers.
    result = evaluate(
        dataset=evaluate_dataset,
        metrics=metrics,
        llm=evaluator_llm,     # custom evaluator LLM
        embeddings=embedding,  # custom embedding model
    )
    print(result)
```
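
The returned result object aggregates one score per metric; for per-sample inspection it can be converted to a pandas DataFrame (a minimal sketch, assuming a ragas version whose result object exposes to_pandas()):

```python
# One row per evaluated sample, with the dataset columns plus metric scores
df = result.to_pandas()
print(df[["question", "faithfulness", "answer_relevancy",
          "context_precision", "context_recall"]])
```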