https://chatglm.cn/main/alltoolsdetail?t=1783048760261\&lang=zh\&cid=6a4729d82e7bd8bd2e93725f
edge
我发现我仅仅是运行这一部分代码,都会报错:
# scripts/test_chroma_query.py
"""隔离测试:直接查 ChromaDB,复现段错误"""
import chromadb
from chromadb.config import Settings
import sys
# Directory of the persistent Chroma store that the repro script opens.
CHROMA_PATH = "/home/shaolingxuan/project/FlexSearch/search_cache_db/chroma"

# Open the on-disk store; anonymized telemetry is switched off via Settings.
client = chromadb.PersistentClient(
    settings=Settings(anonymized_telemetry=False),
    path=CHROMA_PATH,
)
问题
# List every collection together with its item count.
print("=== 所有 collections ===")
for c in client.list_collections():
    print(f" {c.name}: count={c.count()}")
(flexsearch) shaolingxuan@k8s-master:~/project/FlexSearch/results/GoogleSearch_qwen-3.5-9b_seal0/回溯9-波动性控制/运行代
码$ ls -la /home/shaolingxuan/project/FlexSearch/search_cache_db/chroma/
total 20180
drwxrwxr-x 3 shaolingxuan shaolingxuan 4096 Jul 3 11:34 .
drwxrwxr-x 11 shaolingxuan shaolingxuan 4096 Jul 3 11:25 ..
drwxrwxr-x 2 shaolingxuan shaolingxuan 4096 Jun 24 21:45 4502f8c4-d9c6-4af1-b3c9-573b755c8fb9
-rw-r--r-- 1 shaolingxuan shaolingxuan 20647936 Jul 3 11:34 chroma.sqlite3
(flexsearch) shaolingxuan@k8s-master:~/project/FlexSearch/results/GoogleSearch_qwen-3.5-9b_seal0/回溯9-波动性控制/运行代
码$ du -sh /home/shaolingxuan/project/FlexSearch/search_cache_db/chroma/*
48M /home/shaolingxuan/project/FlexSearch/search_cache_db/chroma/4502f8c4-d9c6-4af1-b3c9-573b755c8fb9
20M /home/shaolingxuan/project/FlexSearch/search_cache_db/chroma/chroma.sqlite3
(flexsearch) shaolingxuan@k8s-master:~/project/FlexSearch/results/GoogleSearch_qwen-3.5-9b_seal0/回溯9-波动性控制/运行代
码$ 请你告诉我该怎么解决
解决
确认:ChromaDB 索引已损坏,100% 是元凶
你的隔离测试连 list_collections() / count() 都段错误,说明 HNSW 索引文件已经物理损坏 ,不是某个 query 的问题,是整个 collection 的索引二进制文件坏了。
好消息是:你的 search_cache.db(SQLite 主库,存着完整的搜索结果)是好的,ChromaDB 只是用来做向量相似匹配的辅助索引。删掉 ChromaDB 目录,从 SQLite 重新生成一遍就行,搜索结果数据一条都不丢。
完整修复步骤(按顺序执行)
第 1、2 步:备份并确认 SQLite 主库完好
bash
# ============================================================
# Step 1: back up the whole cache directory (do not skip this!)
# ============================================================
cp -r /home/shaolingxuan/project/FlexSearch/search_cache_db \
/home/shaolingxuan/project/FlexSearch/search_cache_db.bak.$(date +%Y%m%d_%H%M%S)
# Confirm the timestamped backup directory exists
ls -d /home/shaolingxuan/project/FlexSearch/search_cache_db.bak.*
# ============================================================
# Step 2: verify that the SQLite primary store is intact
# ============================================================
sqlite3 /home/shaolingxuan/project/FlexSearch/search_cache_db/search_cache.db \
"PRAGMA integrity_check;"
# Expected output: ok
# Show how many rows the primary store holds, grouped by engine + embedding model
sqlite3 /home/shaolingxuan/project/FlexSearch/search_cache_db/search_cache.db \
"SELECT search_engine, embedding_model, COUNT(*) as cnt FROM search_cache GROUP BY search_engine, embedding_model;"
第 3 步:删除损坏的 ChromaDB 目录
bash
# Remove the whole chroma directory; search_cache.db sits next to it, not inside it, so it is not touched
rm -rf /home/shaolingxuan/project/FlexSearch/search_cache_db/chroma
# Confirm the directory is gone
ls -la /home/shaolingxuan/project/FlexSearch/search_cache_db/
# The chroma directory should no longer be listed
第 4 步:从 SQLite 主库重建 ChromaDB
把下面的代码保存为 /home/shaolingxuan/project/FlexSearch/scripts/rebuild_chroma.py:
python
#!/usr/bin/env python3
"""
从 SQLite 主库重建 ChromaDB。
- SQLite 主库(search_cache.db)保存着完整的搜索结果,不丢
- ChromaDB 只是向量索引,删了重建即可
- 重建后语义相似匹配功能恢复
"""
import sqlite3
import json
import logging
import re
import sys
from pathlib import Path
from collections import defaultdict
# 把项目根目录加到 path
PROJECT_ROOT = Path("/home/shaolingxuan/project/FlexSearch")
sys.path.insert(0, str(PROJECT_ROOT))
from agent.embedder import VLLMEmbedder
import chromadb
from chromadb.config import Settings
# Emit plain "[LEVEL] message" lines at INFO and above.
logging.basicConfig(format="[%(levelname)s] %(message)s", level=logging.INFO)
logger = logging.getLogger(__name__)

# Cache layout under the project root: the SQLite file and the Chroma directory.
CACHE_DIR = PROJECT_ROOT.joinpath("search_cache_db")
DB_PATH = CACHE_DIR.joinpath("search_cache.db")
CHROMA_PATH = str(CACHE_DIR.joinpath("chroma"))
# ── 工具函数(和 cache.py 里完全一致)──────────────────
def _engine_collection_name(search_engine: str, embedding_model: str = "") -> str:
    """Build the Chroma collection name for a (search engine, model) pair.

    Every character outside ``[a-zA-Z0-9_]`` is replaced by ``_`` and the
    result is lower-cased. The name has the form ``sq_<engine>`` or
    ``sq_<engine>_<model>`` and is truncated to 63 characters.

    NOTE(review): the surrounding file states this must stay identical to the
    helper in cache.py, so the naming scheme is deliberately left unchanged.

    Args:
        search_engine: Name of the search engine.
        embedding_model: Name of the embedding model; empty to omit it.

    Returns:
        The sanitized collection name.
    """
    safe_engine = re.sub(r"[^a-zA-Z0-9_]", "_", search_engine).lower()
    safe_model = re.sub(r"[^a-zA-Z0-9_]", "_", embedding_model).lower()
    parts = ["sq", safe_engine]
    if safe_model:
        parts.append(safe_model)
    name = "_".join(parts)[:63]
    # The "sq_" prefix already guarantees three characters; the fallback is
    # kept only so the function stays line-for-line comparable with cache.py.
    return name if len(name) >= 3 else "sq_default"
# ── 主流程 ─────────────────────────────────────────────
def _log_banner(title, leading_blank=True):
    """Log ``title`` framed by two 60-character ``=`` rules."""
    if leading_blank:
        logger.info("")
    rule = "=" * 60
    logger.info(rule)
    logger.info(title)
    logger.info(rule)


def _load_cache_rows():
    """Read the distinct cache keys from the SQLite primary store.

    Returns:
        A list of dicts with the keys ``query_hash``, ``query_norm``,
        ``search_engine`` and ``embedding_model``; rows whose
        ``embedding_model`` is empty are excluded by the query.

    Exits the process with status 1 when the database file is missing or
    ``PRAGMA integrity_check`` does not report ``ok``.
    """
    if not DB_PATH.is_file():
        # sqlite3.connect() would otherwise create an empty database file and
        # the SELECT below would fail with a confusing "no such table" error.
        logger.error(f"❌ 找不到 SQLite 主库: {DB_PATH}")
        sys.exit(1)
    conn = sqlite3.connect(str(DB_PATH))
    try:
        conn.row_factory = sqlite3.Row
        integrity = conn.execute("PRAGMA integrity_check;").fetchone()[0]
        logger.info(f" integrity_check: {integrity}")
        if integrity != "ok":
            logger.error("❌ SQLite 主库损坏!请从备份恢复。")
            sys.exit(1)
        fetched = conn.execute(
            "SELECT DISTINCT query_hash, query_norm, search_engine, embedding_model "
            "FROM search_cache WHERE embedding_model != ''"
        ).fetchall()
    finally:
        # Runs on every path, including sys.exit() and query errors.
        conn.close()
    return [dict(row) for row in fetched]


def _new_embedder(model_name):
    """Create a ``VLLMEmbedder`` for ``model_name`` on the local endpoint."""
    return VLLMEmbedder(
        base_url="http://localhost:1827/v1",
        model_name=model_name,
    )


def _verify_collection(collection):
    """Smoke-test the collection by querying it with one of its own vectors."""
    peek = collection.peek(limit=1)
    if not peek["ids"]:
        return
    existing = collection.get(ids=[peek["ids"][0]], include=["embeddings"])
    test_result = collection.query(
        query_embeddings=[existing["embeddings"][0]],
        n_results=1,
        include=["distances", "metadatas"],
    )
    logger.info(
        f" ✅ query 测试通过: "
        f"distance={test_result['distances'][0][0]:.4f}"
    )


def _rebuild_collection(client, embedder, engine, model, items, batch_size):
    """Embed ``items`` in batches and upsert them into their collection.

    Args:
        client: Chroma client used to create or open the collection.
        embedder: Embedder whose ``encode`` turns texts into vectors.
        engine: Search engine name shared by all ``items``.
        model: Embedding model name shared by all ``items``.
        items: Row dicts carrying ``query_hash`` and ``query_norm``.
        batch_size: Number of texts sent to the embedder per request.

    Raises:
        RuntimeError: If the embedder returns a different number of vectors
            than texts were submitted.
    """
    # Imported once per collection instead of once per batch.
    import numpy as np

    coll_name = _engine_collection_name(engine, model)
    logger.info(f"\n--- 重建 {coll_name} ({len(items)} 条) ---")
    collection = client.get_or_create_collection(
        name=coll_name,
        metadata={"hnsw:space": "cosine"},
    )
    total = len(items)
    for start in range(0, total, batch_size):
        batch = items[start:start + batch_size]
        texts = [entry["query_norm"] for entry in batch]
        # No prompt on the document side; the original note says this matches
        # cache.py's _embed.
        raw = embedder.encode(texts, normalize_embeddings=True, prompt=None)
        # Accept ndarray or list output, and promote a single 1-D vector to 2-D.
        vectors = np.atleast_2d(np.asarray(raw))
        if len(vectors) != len(batch):
            raise RuntimeError(
                f"embedding 数量 ({len(vectors)}) 与批大小 ({len(batch)}) 不一致"
            )
        collection.upsert(
            ids=[entry["query_hash"] for entry in batch],
            embeddings=vectors.tolist(),
            metadatas=[{
                "query_norm": entry["query_norm"],
                "query_hash": entry["query_hash"],
                "search_engine": engine,
                "embedding_model": model,
            } for entry in batch],
            documents=texts,
        )
        done = min(start + batch_size, total)
        logger.info(f" [{done}/{total}] 已写入")
    count = collection.count()
    logger.info(f" ✅ {coll_name} 重建完成,count={count}")
    if count > 0:
        _verify_collection(collection)


def main():
    """Rebuild every Chroma collection from the SQLite primary store.

    Steps: validate the SQLite file, create and test an embedder, recreate
    the Chroma directory from scratch, then re-embed the cached queries
    grouped by (search engine, embedding model). Each group is embedded with
    its own model rather than with the model of the first row.
    """
    import shutil

    # 1. Validate the primary store and load the cache keys.
    _log_banner("第 1 步:检查 SQLite 主库", leading_blank=False)
    rows = _load_cache_rows()
    logger.info(f" SQLite 主库完好,共 {len(rows)} 条记录待重建")
    if not rows:
        logger.warning("没有数据需要重建,退出。")
        return

    # 2. Create and test an embedder BEFORE anything on disk is deleted, so an
    #    unreachable embedding service cannot leave the cache without an index.
    _log_banner("第 2 步:初始化 vLLM Embedder")
    first_model = rows[0]["embedding_model"]
    logger.info(f" 使用 embedding 模型: {first_model}")
    embedders = {first_model: _new_embedder(first_model)}
    test_vec = embedders[first_model].encode("test", normalize_embeddings=True)
    logger.info(f" Embedder 测试通过,向量维度: {len(test_vec)}")

    # 3. Start from an empty Chroma directory.
    _log_banner("第 3 步:创建新的 ChromaDB")
    chroma_dir = Path(CHROMA_PATH)
    if chroma_dir.exists():
        logger.warning(f" chroma 目录已存在,先删除: {CHROMA_PATH}")
        shutil.rmtree(CHROMA_PATH)
    client = chromadb.PersistentClient(
        path=CHROMA_PATH,
        settings=Settings(anonymized_telemetry=False),
    )

    # 4. Group rows by (engine, model); each group maps to one collection.
    groups = defaultdict(list)
    for row in rows:
        groups[(row["search_engine"], row["embedding_model"])].append(row)
    logger.info(f" 共 {len(groups)} 个 (engine, model) 分组:")
    for (engine, model), items in groups.items():
        logger.info(f" {_engine_collection_name(engine, model)}: {len(items)} 条")

    # 5. Rebuild group by group.
    _log_banner("第 4 步:逐组重建 ChromaDB collection")
    batch_size = 64  # keeps each embedding request small
    for (engine, model), items in groups.items():
        if model not in embedders:
            # Vectors produced by a different model must not be stored under
            # this model's collection, so each model gets its own embedder.
            logger.info(f" 使用 embedding 模型: {model}")
            embedders[model] = _new_embedder(model)
        _rebuild_collection(
            client, embedders[model], engine, model, items, batch_size
        )

    _log_banner("🎉 ChromaDB 重建全部完成!")
    # Final summary of what now exists on disk.
    logger.info("最终 collection 列表:")
    for c in client.list_collections():
        logger.info(f" {c.name}: count={c.count()}")
if __name__ == "__main__":
main()
运行:
bash
cd /home/shaolingxuan/project/FlexSearch
python scripts/rebuild_chroma.py
预计耗时:7273 条数据,batch_size=64,约 114 批,vLLM embedding 每批约 2-3 秒,总共 约 5-8 分钟。
第 5 步:验证不再段错误
bash
python scripts/test_chroma_query.py
如果输出 ✅ 查询成功!distance=0.0000(自己查自己,距离为 0),说明 ChromaDB 已恢复正常。
第 6 步:给 cache.py 加锁,防止再次损坏
ChromaDB 的 PersistentClient 不是线程安全的 。你的 web_browser.py 里 ThreadPoolExecutor 并发 3 个 query 同时调 cache.get() → collection.query(),这就是索引损坏的根源。
在 cache.py 中做以下修改:
python
# ========= cache.py 修改 1:文件顶部加 import =========
import threading
import hashlib
import json
import logging
import re
import sqlite3
import time
import signal # ★ 新增
import faulthandler # ★ 新增
import os # ★ 新增
from pathlib import Path
from typing import Optional
from datetime import datetime
# ... 其他 import 不变 ...
# New: enable faulthandler so that a fatal signal (for example a segfault in
# native code) dumps the Python traceback of every thread to a log file.
_FAULT_LOG = os.path.join(
    "/home/shaolingxuan/project/FlexSearch/search_cache_db",
    "faulthandler.log",
)
os.makedirs(os.path.dirname(_FAULT_LOG), exist_ok=True)
# Kept open and referenced at module level on purpose: faulthandler writes to
# this file's descriptor at crash time, so it must not be closed.
_fault_fp = open(_FAULT_LOG, "a")
# enable() already installs the handlers for SIGSEGV, SIGFPE, SIGABRT, SIGBUS
# and SIGILL. The former faulthandler.register(signal.SIGSEGV, ...) call was
# removed: register() is meant for user signals and raises RuntimeError for
# the fatal signals owned by enable(), which `except Exception: pass` hid.
faulthandler.enable(file=_fault_fp, all_threads=True)
# ========= cache.py 修改 2:SearchCache.__init__ 加 chroma_lock =========
class SearchCache:
def __init__(self, ...):
# ... 原有代码不变 ...
self._write_lock = threading.Lock() # 已有
self._chroma_lock = threading.Lock() # ★ 新增:ChromaDB 读写锁
# ... 后续代码不变 ...
# ========= cache.py 修改 3:_similar_get 加锁 =========
def _similar_get(self, embedding: list, collection) -> Optional[tuple]:
    """Look up the nearest stored query for ``embedding``.

    Args:
        embedding: Query vector passed to ``collection.query``.
        collection: Collection object exposing ``query``.

    Returns:
        ``(similarity, query_hash)`` where ``similarity`` is
        ``1.0 - distance`` of the single nearest hit, if that similarity is
        at least ``self.similarity_threshold``. ``None`` when the collection
        returns no hit, the hit is below the threshold, or the lookup raises.
    """
    try:
        # Only the query call runs under the lock, so concurrent callers are
        # serialised at the Chroma boundary and nowhere else.
        with self._chroma_lock:
            result = collection.query(
                query_embeddings=[embedding],
                n_results=1,
                include=["distances", "metadatas"],
            )
        # Check for an empty result BEFORE indexing into it; otherwise an
        # empty collection raises IndexError and logs a spurious warning.
        if not result["ids"] or not result["ids"][0]:
            return None
        distance = result["distances"][0][0]
        matched_hash = result["metadatas"][0][0]["query_hash"]
        logger.debug("Cache类 _similar_get() collection.query找到的result:%s", result)
        logger.debug("Cache类 _similar_get() distance:%s", distance)
        logger.debug("Cache类 _similar_get() matched_hash:%s", matched_hash)
        similarity = 1.0 - distance
        if similarity >= self.similarity_threshold:
            return similarity, matched_hash
        return None
    except Exception as e:
        # Best-effort lookup: a failing index must degrade to a cache miss.
        logger.warning(f"[SearchCache] ChromaDB 查询异常: {e}", exc_info=True)
        return None
# ========= cache.py 修改 4:set() 里的 upsert 加锁 =========
def set(self, ...):
# ... 前面代码不变 ...
with self._write_lock:
# ... SQLite 写入不变 ...
collection = self._get_collection(search_engine, embedding_model)
embedding = self._embed(query_norm)
# ★ 加锁:upsert 也要串行化,防止并发写损坏 HNSW 索引
with self._chroma_lock:
collection.upsert(
ids=[query_hash],
embeddings=[embedding],
metadatas=[{
"query_norm": query_norm,
"query_hash": query_hash,
"search_engine": search_engine,
"embedding_model": embedding_model,
}],
documents=[query_norm],
)
# ... 后续日志不变 ...
第 7 步(可选但推荐):限制 web_browser 并发数
在 web_browser.py 的 search() 方法中,把并发数从 len(queries) 限制为 2:
python
# Inside the search() method of web_browser.py.
# Before:
#     with ThreadPoolExecutor(max_workers=len(queries)) as executor:
# After:
# Cap the pool at two workers to reduce concurrent cache access, and floor it
# at one because ThreadPoolExecutor raises ValueError for max_workers <= 0
# (which an empty `queries` would otherwise produce).
max_workers = max(1, min(len(queries), 2))
with ThreadPoolExecutor(max_workers=max_workers) as executor:
    futures = {executor.submit(_search_one, q): q for q in queries}
    # ... the rest of the method is unchanged ...
总结:执行顺序
第 1 步:cp -r 备份 ← 10 秒
第 2 步:sqlite3 integrity_check ← 5 秒
第 3 步:rm -rf chroma 目录 ← 1 秒
第 4 步:python scripts/rebuild_chroma.py ← 5-8 分钟
第 5 步:python scripts/test_chroma_query.py ← 10 秒(验证)
第 6 步:修改 cache.py 加 _chroma_lock ← 2 分钟
第 7 步:修改 web_browser.py 限制并发 ← 1 分钟
重跑数据集验证
核心要点:
- ChromaDB 索引损坏不可修复,只能删除重建——但你的数据在 SQLite 主库里,一条都不丢
- 重建后必须加锁(`_chroma_lock`),否则多线程并发会再次损坏索引
- `faulthandler` 能在下次段错误时把各线程的 Python 调用栈写到日志文件,帮你快速定位