langchain Chroma 构建本地向量数据库

langchain Chroma 构建本地向量数据库

python 复制代码
# import
from langchain_community.document_loaders import TextLoader
from langchain_community.embeddings.sentence_transformer import (
    SentenceTransformerEmbeddings,
)
from langchain_community.embeddings import HuggingFaceEmbeddings 
from langchain_community.vectorstores import Chroma
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders.word_document import Docx2txtLoader

import glob
import os

# Directory where the Chroma database is persisted.
db_dir = "./db"
# Directory that holds the source documents to index.
source_directory = "./docs"
# Glob pattern selecting which files in the source directory are loaded.
file_ext = '*.docx'

# Embedding function shared by the indexing and the querying code.
# Alternative open-source model that was tried first:
# embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
# Use a Chinese text-embedding model (the queries in this script are in Chinese).
ebd_function = HuggingFaceEmbeddings(model_name="shibing624/text2vec-base-chinese")

def add_files_to_db(filepath: str = "", file_ext: str = ""):
    """Load Word documents from a directory, chunk them and store them in Chroma.

    Args:
        filepath: Directory to scan for documents. Falls back to the
            module-level ``source_directory`` when empty.
        file_ext: Glob pattern of the files to load, e.g. ``"*.docx"``.
            Falls back to ``"*.docx"`` when empty, because the files are
            read with ``Docx2txtLoader``.

    Returns:
        None. The embedded chunks are written to the Chroma store located
        at ``db_dir``. Nothing is written when no file or no text is found.
    """
    # Honour the caller's directory; previously this argument was ignored and
    # the global source_directory was always used.
    directory = filepath or source_directory
    # An empty pattern would make glob return the directory itself.
    pattern = file_ext or "*.docx"

    # Sort so the indexing order does not depend on filesystem listing order.
    docx_files = sorted(glob.glob(os.path.join(directory, pattern)))
    if not docx_files:
        print(f"No files matching {pattern!r} found in {directory!r}; nothing to index.")
        return

    text_list = []
    for file_name in docx_files:
        print(file_name)
        text_list.extend(Docx2txtLoader(file_name).load())

    # Split into overlapping chunks so each embedding covers a small passage.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    docs = text_splitter.split_documents(text_list)
    if not docs:
        print(f"No text extracted from the files in {directory!r}; nothing to index.")
        return

    # Embed the chunks and load them into Chroma.
    db = Chroma.from_documents(docs, ebd_function, persist_directory=db_dir)
    # Save the database to disk.
    # NOTE(review): newer Chroma releases persist automatically and deprecate
    # this call - confirm against the installed version.
    db.persist()


def query_db(db: Chroma, query: str = ""):
    """Run a similarity search and print the best-matching chunk.

    Args:
        db: Vector store to search; only its ``similarity_search`` method
            is used.
        query: Natural-language query text.

    Returns:
        None. The text of the first hit (or a notice when there is no hit)
        is printed, followed by a separator line.
    """
    # Query the store.
    docs = db.similarity_search(query)

    # An empty or not-yet-built store yields no hits; indexing docs[0]
    # unconditionally would raise IndexError in that case.
    if docs:
        print(docs[0].page_content)
    else:
        print(f"No matching documents found for query: {query!r}")
    print("-----------------------------------------")


    
 
if __name__ == "__main__":
    # One-time indexing step: uncomment on the first run to build the database.
    # add_files_to_db(source_directory,file_ext)

    # Open the persisted store with the same embedding function used for indexing.
    db = Chroma(embedding_function=ebd_function, persist_directory=db_dir)

    # Run each sample question against the store in turn.
    for query in ("怎么治疗骨质疏松症?", "怎么治疗鼻炎?"):
        query_db(db, query)

    # Drop the reference to the store once the queries are done.
    db = None

示例文档存放在代码所在目录下的 docs 文件夹中:

./docs/第十六章-感染性疾病.docx

./docs/第八章-骨骼关节和肌肉疾病.docx

./docs/第十九章-耳鼻咽喉疾病.docx

相关推荐
momo小菜pa3 分钟前
【MySQL 01】数据库基础
数据库·mysql
码爸21 分钟前
hbase merge工具
大数据·数据库·hbase
hong_zc23 分钟前
JDBC 编程
java·数据库·mysql
Leoysq23 分钟前
Oracle 数据库常用命令与操作指南
数据库·笔记·oracle
数据与人24 分钟前
Can't connect to local MySQL server through socket
数据库
天蓝蓝2352828 分钟前
MySQL数据库的备份与恢复
数据库·mysql·oracle
踏浪逐行29 分钟前
数据库(mysql)常用命令
数据库·mysql
Leoysq1 小时前
Navicate 链接Oracle 提示 Oracle Library is not loaded ,账号密码都正确地址端口也对
数据库·oracle
imc.111 小时前
初识linux(2)
java·linux·数据库
武子康1 小时前
大数据-143 - ClickHouse 集群 SQL 超详细实践记录!
java·大数据·数据库·分布式·sql·clickhouse·flink