youtube工具包介绍
引入 youtube-transcript-api 和 pytube 两个工具包:前者专注于获取 YouTube 的自动字幕并提供方便的 API,后者用于获取视频的元信息。本次利用它们获取视频的元信息和字幕信息,以此来比对相关性。
shell
pip install youtube-transcript-api pytube
python
# Sample YouTube videos to index; each watch URL is built from its video ID.
urls = [
    "https://www.youtube.com/watch?v=" + video_id
    for video_id in (
        "HAn9vnJy6S4",
        "dA1cHGACXCo",
        "ZcEMLz27sL4",
        "hvAPnpSfSGo",
        "EhlPDL4QrWY",
        "mmBo8nlu2j0",
        "rQdibOsL1ps",
        "28lC4fqukoc",
        "es-9MgxB-uc",
        "wLRHwKuKvOE",
        "ObIltMaRJvY",
        "DjuXACWYkkU",
        "o7C9ld6Ln-M",
    )
]
import ssl
# Globally switch the default HTTPS context to one that does NOT verify
# certificates. This is insecure and only acceptable for local experiments.
# Interpreters without ssl._create_unverified_context are left untouched.
if hasattr(ssl, "_create_unverified_context"):
    _create_unverified_https_context = ssl._create_unverified_context
    # Install the unverified SSL context factory as the process-wide default.
    ssl._create_default_https_context = _create_unverified_https_context
    print("警告:已全局禁用 SSL 证书验证。这是不安全的。")
# Load the transcript of every video; the documents returned by each loader
# are accumulated in `docs`.
# add_video_info=True asks the loader to attach video metadata; the code
# below reads `publish_date` from it (per the LangChain YoutubeLoader docs
# this requires the `pytube` package).
docs = []
for url in urls:
    docs.extend(YoutubeLoader.from_youtube_url(url, add_video_info=True).load())
if not docs:
    # Fail with a clear message instead of an IndexError on docs[0] below.
    raise RuntimeError("No transcripts were loaded; check the URLs and network access.")
print(len(docs))
print(docs[0])
# Add extra metadata to every doc: the year the video was published.
for doc in docs:
    doc.metadata['publish_year'] = datetime.datetime.strptime(
        doc.metadata['publish_date'], '%Y-%m-%d %H:%M:%S'
    ).year
向量数据库持久化
指定文档的位置,将文档分割形成向量存储,并将向量存储持久化到磁盘中,此处便是将youtube的内容形成持久化数据,方便后续搜索。
python
# Directory where the vector store is persisted (placeholder path; the value
# must be a string).
persist_dir = '/a/b/c'
# Split the loaded docs into chunks of up to 2000 characters with a
# 30-character overlap before building the vector database.
text_spliter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=30)
split_doc = text_spliter.split_documents(docs)
# Embed the chunks and persist the vector database to disk.
vectorstore = Chroma.from_documents(split_doc, embeddings, persist_directory=persist_dir)
结构化初次接触
pydantic三方包定义了询问问题的结构,主要是视频查询内容及视频的年份,通过对问题的分析,分解为内容和年份。
python
class Search(BaseModel):
    """Search over a database of tutorial videos about a software library."""

    # Free-text query used for the similarity search. Required: the field is
    # annotated `str`, so it must not silently default to None.
    query: str = Field(..., description='Similarity search query applied to video transcripts')
    # Optional: stays None when the question does not mention a year.
    publish_year: Optional[int] = Field(None, description='Year video was published')
比如问的问题为RAG tutorial in 2025,结构体为query=RAG tutorial,publish_year=2025,方便进行相似性查询。
retrieval 的过滤逻辑
search 为上面定义的结构化查询对象。首先设置过滤条件:如果问题中提到了年份,则只检索发布年份与之相等的文档;然后在向量数据库中执行相似性搜索并返回结果。若使用 similarity_search_with_score,返回的分数是距离,数值越小表示越接近查询条件。
python
def retrieval(search: Search) -> List[Document]:
    """Run a similarity search for ``search.query`` on the vector store.

    When ``search.publish_year`` is truthy, the search is restricted to
    documents whose ``publish_year`` metadata equals it; otherwise no
    filter is applied.
    """
    year_filter = (
        {'publish_year': {'$eq': search.publish_year}}
        if search.publish_year
        else None
    )
    return vectorstore.similarity_search(search.query, filter=year_filter)
多chain连接
python
# System instructions: turn the user's question into database queries.
system = (
    "You are an expert at converting user questions into database queries. "
    "You have access to a database of tutorial videos about a software library for building LLM-powered applications. "
    "Given a question, return a list of database queries optimized to retrieve the most relevant results.\n"
    "If there are acronyms or words you are not familiar with, do not try to rephrase them."
)
prompt = ChatPromptTemplate.from_messages([('system', system), ('human', '{question}')])
# Analyse the question and emit it as a structured Search object.
# NOTE(review): with_structured_output is typically provided by chat models;
# confirm that the `model` used here supports it.
chain = (
    {'question': RunnablePassthrough()}
    | prompt
    | model.with_structured_output(Search)
)
# Feed the structured query into the vector-store similarity search.
new_chain = chain | retrieval
完整代码
预处理,将视频信息持久化到磁盘
python
import datetime
from time import sleep
from langchain_chroma import Chroma
from langchain_community.document_loaders import WebBaseLoader, YoutubeLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
# pip install youtube-transcript-api pytube
import os
from langchain_ollama import OllamaLLM,OllamaEmbeddings
from tornado.httpclient import HTTPError
from pytube import YouTube
# KMP_DUPLICATE_LIB_OK works around runtime conflicts between Intel OpenMP
# libraries (e.g. MKL, TBB).
# OLLAMA_HOST / OLLAMA_PORT set the Ollama host and port (optional; not
# needed if they are already set in the environment).
os.environ.update(
    {
        "KMP_DUPLICATE_LIB_OK": "TRUE",
        "OLLAMA_HOST": "127.0.0.1",
        "OLLAMA_PORT": "11434",
    }
)
# Initialise Ollama.
def get_chat_llm() -> OllamaLLM:
    """Return an ``OllamaLLM`` client configured for the ``deepseek-r1:8b`` model."""
    return OllamaLLM(model="deepseek-r1:8b")
# Directory that stores the vector database.
persist_dir = 'chroma_data_dir'
# Embedding model used to vectorise the transcript chunks.
embeddings = OllamaEmbeddings(model="nomic-embed-text")
model = get_chat_llm()
# Sample YouTube videos to index; each watch URL is built from its video ID.
urls = [
    "https://www.youtube.com/watch?v=" + video_id
    for video_id in (
        "HAn9vnJy6S4",
        "dA1cHGACXCo",
        "ZcEMLz27sL4",
        "hvAPnpSfSGo",
        "EhlPDL4QrWY",
        "mmBo8nlu2j0",
        "rQdibOsL1ps",
        "28lC4fqukoc",
        "es-9MgxB-uc",
        "wLRHwKuKvOE",
        "ObIltMaRJvY",
        "DjuXACWYkkU",
        "o7C9ld6Ln-M",
    )
]
import ssl
# Globally switch the default HTTPS context to one that does NOT verify
# certificates. This is insecure and only acceptable for local experiments.
# Interpreters without ssl._create_unverified_context are left untouched.
if hasattr(ssl, "_create_unverified_context"):
    _create_unverified_https_context = ssl._create_unverified_context
    # Install the unverified SSL context factory as the process-wide default.
    ssl._create_default_https_context = _create_unverified_https_context
    print("警告:已全局禁用 SSL 证书验证。这是不安全的。")
# Load the transcript of every video; the documents returned by each loader
# are accumulated in `docs`.
# add_video_info=True asks the loader to attach video metadata; the code
# below reads `publish_date` from it (per the LangChain YoutubeLoader docs
# this requires the `pytube` package).
docs = []
for url in urls:
    docs.extend(YoutubeLoader.from_youtube_url(url, add_video_info=True).load())
if not docs:
    # Fail with a clear message instead of an IndexError on docs[0] below.
    raise RuntimeError("No transcripts were loaded; check the URLs and network access.")
print(len(docs))
print(docs[0])
# Add extra metadata to every doc: the year the video was published.
for doc in docs:
    doc.metadata['publish_year'] = datetime.datetime.strptime(
        doc.metadata['publish_date'], '%Y-%m-%d %H:%M:%S'
    ).year
# Show the enriched metadata and the first 500 characters of the transcript.
print(docs[0].metadata)
print(docs[0].page_content[:500])
# Build the vector database from all docs: split them into chunks of up to
# 2000 characters (30-character overlap), then embed the chunks and persist
# the store to disk.
text_spliter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=30)
split_doc = text_spliter.split_documents(docs)
vectorstore = Chroma.from_documents(
    documents=split_doc,
    embedding=embeddings,
    persist_directory=persist_dir,
)
问答链组合,返回符合条件的视频信息
python
# pip install youtube-transcript-api pytube
import os
from typing import Optional,List
from langchain_chroma import Chroma
from langchain_core.documents import Document
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_ollama import OllamaLLM,OllamaEmbeddings
from pydantic.v1 import BaseModel, Field
# KMP_DUPLICATE_LIB_OK works around runtime conflicts between Intel OpenMP
# libraries (e.g. MKL, TBB).
# OLLAMA_HOST / OLLAMA_PORT set the Ollama host and port (optional; not
# needed if they are already set in the environment).
os.environ.update(
    {
        "KMP_DUPLICATE_LIB_OK": "TRUE",
        "OLLAMA_HOST": "127.0.0.1",
        "OLLAMA_PORT": "11434",
    }
)
# Initialise Ollama.
def get_chat_llm() -> OllamaLLM:
    """Return an ``OllamaLLM`` client configured for the ``deepseek-r1:8b`` model."""
    return OllamaLLM(model="deepseek-r1:8b")
persist_dir = 'chroma_data_dir'
embeddings = OllamaEmbeddings(model="nomic-embed-text")
model = get_chat_llm()
# Load the vector database that was persisted to disk.
vectorstore = Chroma(
    persist_directory=persist_dir,
    embedding_function=embeddings,
)
# Smoke-test similarity retrieval: print the publish year of the top hit.
result = vectorstore.similarity_search_with_score('how do i build a RAG agent?')
first_doc, _first_score = result[0]
print(first_doc.metadata['publish_year'])
# System instructions: turn the user's question into database queries.
system = (
    "You are an expert at converting user questions into database queries. "
    "You have access to a database of tutorial videos about a software library for building LLM-powered applications. "
    "Given a question, return a list of database queries optimized to retrieve the most relevant results.\n"
    "If there are acronyms or words you are not familiar with, do not try to rephrase them."
)
# Two-message chat prompt: the fixed system text plus the user's question.
prompt = ChatPromptTemplate.from_messages([('system', system), ('human', '{question}')])
# pydantic handles validation and conversion of the data.
class Search(BaseModel):
    """Search over a database of tutorial videos about a software library."""

    # Free-text query used for the similarity search. Required: the field is
    # annotated `str`, so it must not silently default to None.
    query: str = Field(..., description='Similarity search query applied to video transcripts')
    # Optional: stays None when the question does not mention a year.
    publish_year: Optional[int] = Field(None, description='Year video was published')
# Analyse the question and emit it as a structured Search object.
# NOTE(review): `model` is an OllamaLLM (a text-completion LLM). In
# langchain-core, with_structured_output appears to be implemented on chat
# models only; confirm it works here, otherwise use a chat model such as
# ChatOllama.
chain = (
    {'question': RunnablePassthrough()}
    | prompt
    | model.with_structured_output(Search)
)
def retrieval(search: Search) -> List[Document]:
    """Run a similarity search for ``search.query`` on the vector store.

    When ``search.publish_year`` is truthy, the search is restricted to
    documents whose ``publish_year`` metadata equals it; otherwise no
    filter is applied.
    """
    year_filter = (
        {'publish_year': {'$eq': search.publish_year}}
        if search.publish_year
        else None
    )
    return vectorstore.similarity_search(search.query, filter=year_filter)
# Pipe the structured query into the vector-store similarity search.
new_chain = chain | retrieval
result = new_chain.invoke('RAG tutorial')
# Print (title, publish_year) for every retrieved document.
titles_and_years = [
    (doc.metadata['title'], doc.metadata['publish_year']) for doc in result
]
print(titles_and_years)
学习指路:【绝对是B站最全最细的LangChain大模型全套教程(AI学习路线Langchain+项目深度实战),七天就能从入门到就业!让你少走99%的弯路!】www.bilibili.com/video/BV1YN...