"""RAG query-analysis demo: Qwen (DashScope) chat + embeddings with a Chroma vector store, via LangChain."""
# NOTE(review): removed copy-paste artifacts ("python" / "复制代码" — the code-block
# language tag and "copy code" button text) that were not valid Python syntax.
import datetime
import os
from typing import Optional
from xml.dom.minidom import Document  # NOTE(review): wrong Document for this file; intentionally shadowed below

from langchain_chroma import Chroma
from langchain_community.embeddings import DashScopeEmbeddings
from langchain_core.documents import Document  # the Document type the vector store actually returns
from langchain_openai import ChatOpenAI
from langchain_text_splitters import RecursiveCharacterTextSplitter
from openai import BaseModel  # NOTE(review): openai merely re-exports pydantic's BaseModel; shadowed below
from pydantic import BaseModel  # canonical source of BaseModel
from pydantic import Field
# Qwen (Tongyi Qianwen) API key.
# SECURITY FIX: avoid committing a real key — read it from the DASHSCOPE_API_KEY
# environment variable, falling back to the original placeholder so existing
# behavior is unchanged when the variable is not set.
qwen_api_key = os.getenv('DASHSCOPE_API_KEY', 'TripleH')

# Chat model, served through DashScope's OpenAI-compatible endpoint.
# Available models: qwen-turbo, qwen-plus, qwen-max, qwen-max-longcontext.
model = ChatOpenAI(
    model='qwen-turbo',  # swap for qwen-plus / qwen-max as needed
    api_key=qwen_api_key,
    base_url='https://dashscope.aliyuncs.com/compatible-mode/v1'
)

# Embedding model — used both to index documents and to embed queries,
# so it must match the model the persisted store was built with.
embeddings = DashScopeEmbeddings(
    model='text-embedding-v1',  # Qwen's embedding model
    dashscope_api_key=qwen_api_key
)
persist_dir = './chroma_data_dir'  # directory where the Chroma vector store is persisted
# 1. One-off ingestion: fetch the transcripts from the web, then persist them locally.
#    (Left commented out after the first run; the persisted store is reloaded below.)
# A few YouTube video links
# urls = [
# "https://www.youtube.com/watch?v=HAn9vnJy6S4",
# "https://www.youtube.com/watch?v=dA1cHGACXCo",
# "https://www.youtube.com/watch?v=ZcEMLz27sL4",
# "https://www.youtube.com/watch?v=hvAPnpSfSGo",
# ]
# # list accumulating the loaded Documents
# docs = []
# from langchain_community.document_loaders import WebBaseLoader, YoutubeLoader
# for url in urls:
# # each YouTube video yields one Document
# docs.extend(YoutubeLoader.from_youtube_url(url, add_video_info=False).load())
# print(len(docs))
# print(docs[0])
# # attach extra metadata to each doc: the year the video was published
# for doc in docs:
# doc.metadata['publish_year'] = int(
# datetime.datetime.strptime(
# doc.metadata['publish_date'], '%Y-%m-%d %H:%M:%S').strftime('%Y')
# )
# print(docs[0].metadata)
# # transcript content of the first video (first 500 characters)
# print(docs[0].page_content[:500])
# text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=30)
# split_doc = text_splitter.split_documents(docs)
# 2. Persist the vector store
# vectorstore = Chroma.from_documents(split_doc, embeddings, persist_directory=persist_dir) # also writes the store to disk
# Load the previously persisted vector store from disk
vectorstore = Chroma(persist_directory=persist_dir, embedding_function=embeddings)
# Smoke test: returns a list of (Document, score) tuples ordered by similarity
result = vectorstore.similarity_search_with_score('how do I build a RAG agent')
print(result[0])
# NOTE(review): raises KeyError if the stored docs lack 'publish_year' metadata —
# assumes the ingestion step above ran with the metadata-enrichment loop.
print(result[0][0].metadata['publish_year'])
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder, PromptTemplate

# System instructions for the query-analysis step: convert a free-form user
# question into optimized database queries over the tutorial-video index.
system = """You are an expert at converting user questions into database queries. \
You have access to a database of tutorial videos about a software library for building LLM-powered applications. \
Given a question, return a list of database queries optimized to retrieve the most relevant results.
If there are acronyms or words you are not familiar with, do not try to rephrase them."""

# Two-message template: fixed system instructions plus the user's raw question.
_prompt_messages = [
    ("system", system),
    ("human", "{question}"),
]
prompt = ChatPromptTemplate.from_messages(_prompt_messages)
from langchain_core.runnables import RunnableWithMessageHistory, RunnablePassthrough
# pydantic
class Search(BaseModel):
    """Structured retrieval instruction extracted from a user question.

    The LLM fills this schema via ``with_structured_output``; ``retrieval``
    then turns it into a Chroma similarity search with an optional
    metadata filter.
    """
    # Free-text similarity query over the video transcripts.
    # BUG FIX: the original declared ``query: str = Field(None, ...)`` — a
    # ``None`` default on a non-Optional str field (pydantic v2 does not
    # validate defaults, so ``Search()`` would silently hold query=None and
    # later crash similarity_search). Make it required with ``...``, as in
    # the upstream LangChain query-analysis example.
    query: str = Field(..., description='Similarity search query applied to video transcripts.')
    # Optional exact-match filter on the video's publication year.
    publish_year: Optional[int] = Field(None, description='Year video was published')
# Query-analysis chain: wrap the raw question, render the prompt, and constrain
# the LLM's reply to the Search schema via structured output.
chain = {'question' : RunnablePassthrough()} | prompt | model.with_structured_output(Search)
resp1 = chain.invoke('how do i build a RAG agent?')
# print(resp1)
# query='build RAGagent' publish_year=None
resp2 = chain.invoke('videos on RAG published in 2023')
# print(resp2)
# query='RAG' publish_year=2023
# Up to this point the chain only produces the retrieval instruction (a Search);
# the actual vector-store lookup happens in retrieval() below.
def retrieval(search: Search) -> list[Document]:
    """Execute a structured ``Search`` against the Chroma vector store.

    A metadata filter on 'publish_year' is attached only when the LLM
    extracted a year from the question; "$eq" is Chroma's exact-match
    filter operator.
    """
    year_filter = (
        {'publish_year': {"$eq": search.publish_year}}
        if search.publish_year
        else None
    )
    return vectorstore.similarity_search(search.query, filter=year_filter)
# End-to-end pipeline: question -> Search (query analysis) -> filtered retrieval.
new_chain = chain | retrieval
# result = new_chain.invoke('videos on RAG published in 2023')
result = new_chain.invoke('RAG tutorial')
# NOTE(review): assumes every retrieved doc carries 'title' and 'publish_year'
# metadata (set during ingestion) — a missing key raises KeyError here.
print([(doc.metadata['title'], doc.metadata['publish_year']) for doc in result])