使用 LangChain 与 Chroma 构建本地向量数据库
python
# import
from langchain_community.document_loaders import TextLoader
from langchain_community.embeddings.sentence_transformer import (
SentenceTransformerEmbeddings,
)
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders.word_document import Docx2txtLoader
import glob
import os
# Directory where the Chroma vector database is persisted on disk.
db_dir = "./db"
# Directory that holds the source documents to ingest.
source_directory = "./docs"
# Glob pattern selecting which files in the source directory are ingested.
file_ext = '*.docx'
# Alternative open-source embedding function (kept for reference, not used):
# embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
# Use a Chinese text-embedding model; the same function must be used for both
# building the database and querying it.
ebd_function = HuggingFaceEmbeddings(model_name="shibing624/text2vec-base-chinese")
def add_files_to_db(filepath: str = "", file_ext: str = ""):
    """Load Word documents, split them into chunks and store them in Chroma.

    Args:
        filepath: Directory to scan for documents. When empty, the
            module-level ``source_directory`` is used.
        file_ext: Glob pattern selecting the files to ingest. When empty,
            ``"*.docx"`` is used, since the loader below only reads Word
            documents.

    Returns:
        None. The vector store is written under the module-level ``db_dir``.
    """
    # Honour the caller's directory; previously the argument was ignored and
    # the global directory was always scanned.
    directory = filepath or source_directory
    # An empty pattern would make glob match the directory itself, which is
    # not a loadable document.
    pattern = file_ext or "*.docx"

    docx_files = glob.glob(os.path.join(directory, pattern))
    if not docx_files:
        # Nothing to ingest: stop before any splitter or store is created.
        print(f"No files matching {pattern!r} found in {directory!r}")
        return

    text_list = []
    for file_name in docx_files:
        print(file_name)
        text_list.extend(Docx2txtLoader(file_name).load())

    # Split the loaded documents into overlapping chunks.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    docs = text_splitter.split_documents(text_list)
    if not docs:
        # Avoid handing the vector store an empty batch.
        print("No text chunks were produced; database left unchanged")
        return

    # Embed the chunks and load them into Chroma.
    db = Chroma.from_documents(docs, ebd_function, persist_directory=db_dir)
    # Save the database to disk.
    # NOTE(review): newer Chroma releases appear to persist automatically and
    # deprecate this call - confirm against the installed version.
    db.persist()
def query_db(db: "Chroma", query: str = ""):
    """Print the top similarity-search hit for *query*, then a separator line.

    Args:
        db: Vector store to search; only its ``similarity_search`` method is
            called.
        query: Query text passed to the similarity search.

    Returns:
        None. Results are written to stdout.
    """
    docs = db.similarity_search(query)
    if docs:
        # Show only the first hit returned by the search.
        print(docs[0].page_content)
    else:
        # A search with no hits (e.g. a store that was never populated) would
        # otherwise raise IndexError on docs[0].
        print(f"No results found for query: {query!r}")
    print("-----------------------------------------")
if __name__ == "__main__":
    # Ingestion only needs to run once; uncomment to (re)build the database.
    # add_files_to_db(source_directory,file_ext)

    # Open the persisted store with the same embedding function used to build it.
    db = Chroma(embedding_function=ebd_function, persist_directory=db_dir)

    # Run each sample question against the store in turn.
    for query in ("怎么治疗骨质疏松症?", "怎么治疗鼻炎?"):
        query_db(db, query)

    # Drop the reference to the store once the queries are done.
    db = None
示例文档(.docx)存放在代码所在目录的 ./docs 子目录下:
./docs/第十六章-感染性疾病.docx
./docs/第八章-骨骼关节和肌肉疾病.docx
./docs/第十九章-耳鼻咽喉疾病.docx