langchain chroma 与 chromadb笔记

chromadb可独立使用也可搭配langchain 框架使用。

环境:

python 3.9

langchain=0.2.16

chromadb=0.5.3

chromadb 使用示例

python 复制代码
import chromadb
from chromadb.config import Settings
from chromadb.utils import embedding_functions


# Load the sentence-transformers embedding model from a local directory.
en_embedding_name = "/home/model/peft_prac/all-MiniLM-L6-v2"
# Positional arguments; presumably (model name/path, device,
# normalize_embeddings) -- confirm against the installed chromadb version.
ef = embedding_functions.SentenceTransformerEmbeddingFunction(
    en_embedding_name, "cuda:2", True
)

# Instantiate a chromadb client and add a collection to it.
collection_first = 'coll_1st'
client_test = chromadb.Client()
collection = client_test.create_collection(name=collection_first, embedding_function=ef)


# Add records as three parallel lists: documents, metadatas and ids.
collection.add(
    documents=["it's an apple", "this is a book"],
    metadatas=[{"source": "t4"}, {"source": "t5"}],
    ids=["id4", "id5"])

# Count the items in the collection (the value is only displayed when run
# interactively, since the result is not printed or stored).
collection.count()

# Look up data.
# NOTE(review): get_collection is called without embedding_function, so text
# queries through coll2 may not use `ef` -- confirm and pass
# embedding_function=ef if the same model must be used.
coll2 = client_test.get_collection(collection_first)
print('check_collection',  coll2.peek(1))  # take the first item; embeddings are populated here
print('check_collection',  coll2.get(ids=["id4"]))  # fetch the item with id4; embeddings are not returned here
# Get the collection if it exists, otherwise create it. The original used the
# undefined name `client`; the client created above is `client_test`.
collection = client_test.get_or_create_collection("testname")

# Update data. `collection` now refers to "testname", so this upsert and the
# embedding query below act on that collection. The `...` entries are
# placeholders for further items; replace or remove them before running.
collection.upsert(
    ids=["id4", ...],
    embeddings=[[1.1, 2.3, 3.2], ...],  # optional
    metadatas=[{"chapter": "3", "verse": "16"}, ...],
    documents=["it's a book", ...],
)

# Search by embedding, restricted to items whose metadata matches `where`.

collection.query(
    query_embeddings=[[1.1, 2.3, 3.2]],
    n_results=1,
    where={"style": "style2"}
)

# Search by text (run against the data as it was before the update); the smaller the distance, the closer the meaning.
print('chromadb_search', coll2.query(query_texts="it's a book", n_results=2))
output:
chromadb_search {'ids': [['id5', 'id4']], 'distances': [[0.3473210334777832, 1.2127960920333862]], 'metadatas': [[{'source': 't5'}, {'source': 't4'}]], 'embeddings': None, 'documents': [['this is a book', "it's an apple"]], 'uris': None, 'data': None, 'included': ['metadatas', 'documents', 'distances']}

# Search by text (run against the data after the update). Note: the query text
# is identical to the stored document, and the reported distance is close to 0
# (about 1e-12 in the output below), not 1.
print('chromadb_search', coll2.query(query_texts="it's a book", n_results=1))
output:
chromadb_search {'ids': [['id4']], 'distances': [[1.168771351402198e-12]], 'metadatas': [[{'info': 'new data', 'source': 't4'}]], 'embeddings': None, 'documents': [["it's a book"]], 'uris': None, 'data': None, 'included': ['metadatas', 'documents', 'distances']}

langchain chroma 使用示例

python 复制代码
import chromadb
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain_chroma import Chroma


# Load the embedding model from a local directory.
en_embedding_name = "/home/zmh/peft_prac/all-MiniLM-L6-v2"
embeddings = HuggingFaceEmbeddings(
    model_name=en_embedding_name,
    model_kwargs={"device": "cuda:1"},
)

# chromadb client that backs the vector store. It is created here so this
# snippet runs on its own; the original relied on a `client_test` defined
# outside this snippet.
client_test = chromadb.Client()

# Create the vector store; the db can also be saved locally.
# NOTE(review): when an explicit `client` is passed, `persist_directory` may
# not take effect -- confirm against the installed langchain_chroma version;
# chromadb.PersistentClient(path='db/') is the usual way to get on-disk storage.
collection_test = 'llama2_demo'
db = Chroma(
    client=client_test,
    collection_name=collection_test,
    embedding_function=embeddings,
    persist_directory='db/'
)

# Sample source texts.
student_info = "Alexandra Thompson, a 19-year-old computer science sophomore with a 3.7 GPA, is a member of the programming and chess clubs who enjoys pizza, swimming, and hiking in her free time in hopes of working at a tech company after graduating from the University of Washington."

club_info = "The university chess club provides an outlet for students to come together and enjoy playing the classic strategy game of chess. Members of all skill levels are welcome, from beginners learning the rules to experienced tournament players. The club typically meets a few times per week to play casual games participate in tournaments, analyze famous chess matches, and improve members' skills."

# A stray double quote before "As the flagship" ended the literal early and
# made this line a syntax error; it has been removed.
university_info = "The University of Washington, founded in 1861 in Seattle, is a public research university with over 45,000 students across three campuses in Seattle, Tacoma, and Bothell. As the flagship institution of the six public universities in Washington state, UW encompasses over 500 buildings and 20 million square feet of space, including one of the largest library systems in the world."


# Parallel lists: one metadata dict and one id per source text.
texts_org = [student_info, club_info, university_info]
text_meta = [{"source": 'student_info'},  {"source": 'club_info'},  {"source": 'university_info'}]
text_ids = ['101',  '102',  '103']

# Split the texts into Document objects.
# NOTE(review): the three ids below assume each text yields exactly one chunk
# (each is presumably shorter than chunk_size) -- confirm if the texts change.
text_splitter = CharacterTextSplitter(separator='.', chunk_size=1000, chunk_overlap=0)
texts_doctment = text_splitter.create_documents(texts_org, metadatas=text_meta)
# Add the documents to the vector store under the given ids.
db.add_documents(texts_doctment, ids=text_ids)

# Inspect the data: grab the collection object held by the vector store
# (accessed through the private attribute `_collection`) and print its type,
# name and metadata.
coll = db._collection
print('coll', type(coll), coll.name, coll.metadata)
output:
coll <class 'chromadb.api.models.Collection.Collection'> llama2_demo None
print('sample of db_info',  coll.peek(1)) # fetch the first item
print("collection_info", coll.get()) # fetch the data of the whole collection


# Similarity search: the result is the matching Document objects themselves,
# without distance scores.
res = db.similarity_search("What is the student name?", k=2)
print('res',  res)
output:
res [Document(metadata={'source': 'student_info'}, page_content='Alexandra Thompson, a 19-year-old computer science sophomore with a 3.7 GPA, is a member of the programming and chess clubs who enjoys pizza, swimming, and hiking in her free time in hopes of working at a tech company after graduating from the University of Washington'), Document(metadata={'source': 'club_info'}, page_content="The university chess club provides an outlet for students to come together and enjoy playing the classic strategy game of chess. Members of all skill levels are welcome, from beginners learning the rules to experienced tournament players. The club typically meets a few times per week to play casual games participate in tournaments, analyze famous chess matches, and improve members' skills")]

参考

1 ChromaDB python 使用教程及记录 - 知乎

2 langchain chroma 部分的内容参考了某篇博客,出处已遗忘,待补充

相关推荐
Pandaconda12 分钟前
【Golang 面试题】每日 3 题(三十九)
开发语言·经验分享·笔记·后端·面试·golang·go
l1x1n024 分钟前
No.35 笔记 | Python学习之旅:基础语法与实践作业总结
笔记·python·学习
dal118网工任子仪6 小时前
66,【6】buuctf web [HarekazeCTF2019]Avatar Uploader 1
笔记·学习
羊小猪~~7 小时前
MYSQL学习笔记(四):多表关系、多表查询(交叉连接、内连接、外连接、自连接)、七种JSONS、集合
数据库·笔记·后端·sql·学习·mysql·考研
milk_yan8 小时前
Docker集成onlyoffice实现预览功能
前端·笔记·docker
东京老树根8 小时前
Excel 技巧15 - 在Excel中抠图头像,换背景色(★★)
笔记·学习·excel
Ronin-Lotus9 小时前
嵌入式硬件篇---ADC模拟-数字转换
笔记·stm32·单片机·嵌入式硬件·学习·低代码·模块测试
UQI-LIUWJ10 小时前
LLM笔记:LayerNorm VS RMSNorm
笔记
东京老树根11 小时前
Excel 技巧17 - 如何计算倒计时,并添加该倒计时的数据条(★)
笔记·学习·excel
m0_7482405413 小时前
AutoSar架构学习笔记
笔记·学习·架构