Chroma 向量数据库使用教程

Chroma 教程

基础依赖

复制代码

# Core Chroma library
pip install chromadb
# LangChain integration (newer releases use the unified langchain-chroma package)
pip install langchain langchain-chroma
# Open-source local embedding models (no OpenAI key required)
pip install sentence-transformers
# Only needed when using OpenAI embeddings
pip install langchain-openai

三种运行模式

内存模式（临时，重启丢失）：测试快速调试
本地持久化文件模式（最常用）：数据存在文件夹
服务端模式（远程多程序共享）：用 chroma run 启动后台服务

内存模式

复制代码

import chromadb

# 1. Create a temporary in-memory client (data is lost on restart).
client = chromadb.Client()

# 2. Create a collection (the equivalent of a database table).
#    "hnsw:space" selects the distance metric: cosine / l2 / ip.
collection_config = {"hnsw:space": "cosine"}
collection = client.create_collection(name="demo", metadata=collection_config)

# 3. Insert data; every document must be given its own unique id.
#    The three lists are parallel: position i describes one record.
doc_ids = ["doc1", "doc2", "doc3"]
texts = [
    "LangChain是大模型应用开发框架",
    "Chroma是轻量向量数据库",
    "RAG用向量库做私有知识库问答",
]
metas = [
    {"type": "AI框架"},
    {"type": "向量库"},
    {"type": "RAG技术"},
]
collection.add(documents=texts, ids=doc_ids, metadatas=metas)

# 4. Semantic query: ask for the top-2 most similar documents.
question = "什么向量库适合本地RAG？"
res = collection.query(query_texts=[question], n_results=2)

# Print the matched texts first, then their distances.
for label, field in (("匹配文本：", "documents"), ("相似度距离：", "distances")):
    print(label, res[field])

持久化本地存储模式

复制代码

import chromadb

# Persistent client: data is stored in the ./my_chroma_db folder.
DB_PATH = "./my_chroma_db"
client = chromadb.PersistentClient(path=DB_PATH)

# Open the collection if it already exists, otherwise create it.
coll = client.get_or_create_collection(
    name="knowledge_base",
    metadata={"hnsw:space": "cosine"},
)

# From here on, add/query work exactly as in in-memory mode.
# First batch: (id, document, metadata) triples tagged with
# source / category / level.
tech_records = [
    ("doc1", "什么是向量嵌入embedding？",
     {"source": "向量", "category": "向量", "level": "入门"}),
    ("doc2", "Python基础变量与数据类型讲解",
     {"source": "python手册", "category": "编程", "level": "入门"}),
    ("doc3", "Chroma向量数据库增删改查操作",
     {"source": "向量库", "category": "数据库", "level": "进阶"}),
    ("doc4", "大模型RAG检索增强实现步骤",
     {"source": "AI实战笔记", "category": "大模型", "level": "进阶"}),
    ("doc5", "LangChain调用本地Embedding模型",
     {"source": "开源项目readme", "category": "框架", "level": "高阶"}),
]
coll.add(
    documents=[text for _, text, _ in tech_records],
    ids=[doc_id for doc_id, _, _ in tech_records],
    metadatas=[meta for _, _, meta in tech_records],
)

# Second batch: unrelated topics, tagged with source / tag instead.
misc_records = [
    ("doc9", "春日春风吹绿江边柳树，细雨绵绵润万物",
     {"source": "古诗文", "tag": "文学"}),
    ("doc10", "机器学习监督学习包含回归和分类两大任务",
     {"source": "AI讲义", "tag": "机器学习"}),
    ("doc11", "MySQL索引可以大幅提升查询速度，但写入会变慢",
     {"source": "后端笔记", "tag": "数据库"}),
]
coll.add(
    documents=[text for _, text, _ in misc_records],
    ids=[doc_id for doc_id, _, _ in misc_records],
    metadatas=[meta for _, _, meta in misc_records],
)

# Print every record currently stored in the collection.
print("\n=== 库中全部数据 ===")
all_data = coll.get()
# ids / documents / metadatas are parallel lists; walk them in lockstep.
rows = zip(all_data["ids"], all_data["documents"], all_data["metadatas"])
for record_id, text, metadata in rows:
    print(f"[{record_id}] {text}")
    print(f"      元数据: {metadata}")


def _print_matches(result):
    """Print the matched documents and their distances for one query result."""
    print("匹配文本：", result["documents"])
    print("相似度距离：", result["distances"])


# Plain semantic query.
res = coll.query(query_texts=["大模型框架"], n_results=2)
_print_matches(res)

# Metadata filter: only records whose category is "大模型".
category_filter = {"category": "大模型"}
res_filter = coll.query(
    query_texts=["AI工具"],
    n_results=2,
    where=category_filter,
)
_print_matches(res_filter)

# Combined conditions with $and: source must be one of the listed values
# AND category must be "数据库".
combined_filter = {
    "$and": [
        {"source": {"$in": ["向量库", "AI讲义"]}},
        {"category": "数据库"},
    ]
}
res_complex = coll.query(
    query_texts=["向量技术"],
    n_results=2,
    where=combined_filter,
)
_print_matches(res_complex)


# Replace the stored text of doc2.
coll.update(
    ids=["doc2"],
    documents=["Chroma是零配置轻量级本地向量数据库"],
)
# Update only the metadata of doc2. metadatas is passed as a list with one
# entry per id, matching the list form used for ids/documents in the call above.
coll.update(ids=["doc2"], metadatas=[{"type": "轻量向量库"}])


# Fetch everything in the collection.
all_data = coll.get()

# Fetch a single record by id; `include` controls which fields are returned.
wanted_fields = ["documents", "metadatas", "embeddings"]
one = coll.get(ids=["doc2"], include=wanted_fields)

print("\n=== 按ID查询 doc2 ===")
doc_text = one["documents"][0]
print(f"  文档: {doc_text}")
doc_meta = one["metadatas"][0]
print(f"  元数据: {doc_meta}")
# The length of the embedding vector is its dimensionality.
vector_dim = len(one["embeddings"][0])
print(f"  向量维度: {vector_dim}")


# Pagination via limit/offset: fetch 2 records starting at offset 0.
page = coll.get(limit=2, offset=0)
print("\n=== 分页查询（limit=2, offset=0）===")
page_rows = zip(page["ids"], page["documents"])
for record_id, text in page_rows:
    print(f"  [{record_id}] {text}")

# Delete a single record by id.
coll.delete(ids=["doc4"])

# Bulk delete: remove every record whose metadata matches the filter.
stale_filter = {"type": "轻量向量库"}
coll.delete(where=stale_filter)

# Empty the collection: gather the remaining ids and delete them, skipping
# the delete call when nothing is left.
remaining = coll.get()
all_ids = remaining["ids"]
if all_ids:
    coll.delete(ids=all_ids)

服务端模式

1、终端启动 Chroma 后台服务

复制代码

# Start the Chroma server on localhost:8000 with ./my_chroma_db as its --path
chroma run --host localhost --port 8000 --path ./my_chroma_db

2、客户端远程连接

复制代码

import chromadb

# Address of the Chroma server to connect to (must match the `chroma run`
# --host / --port values).
HOST = "localhost"
PORT = 8000

client = chromadb.HttpClient(host=HOST, port=PORT)
coll = client.get_or_create_collection("knowledge_base")

# Build the banner from HOST/PORT so it always names the server actually used.
print(f"--- 已连接到 Chroma 服务 {HOST}:{PORT}")
print(f"集合: {coll.name}")
print(f"文档数: {coll.count()}")

# List every stored record together with its metadata.
print("\n=== 文档内容 ===")
all_data = coll.get()
for id_, doc, meta in zip(all_data["ids"], all_data["documents"], all_data["metadatas"]):
    print(f"[{id_}] {doc}")
    print(f"      元数据: {meta}")

3、检查端口占用并结束服务进程（Windows 终端）

复制代码

检查端口是否占用
netstat -ano | findstr :8000
  TCP    0.0.0.0:8000           0.0.0.0:0              LISTENING       24840
杀死进程
taskkill /T /F /PID 24840
成功: 已终止 PID 24840 (属于 PID 21360 子进程)的进程。

常见踩坑与最佳实践

持久化不要混用客户端 ：同一个库文件夹只能用PersistentClient，不能混用内存 Client
集合嵌入模型统一：一个 Collection 全程只能用同一种 Embedding，中途换模型会因向量维度不匹配而报错
chunk_size 经验值：中文 700–1200 字符，overlap 100–200，防止上下文断裂
元数据规范 ：存入source、file、category、time方便过滤检索
数据量大时：Chroma 适合十万级以内；百万级建议换 Milvus/Qdrant
重启加载 ：使用 LangChain 的 Chroma 封装时，直接 Chroma(persist_directory=xxx) 会自动读取旧库，不用重复调用 from_documents