混合检索实现:关键词+语义检索的完美结合
前言
单一的检索方式往往无法满足复杂需求。将关键词检索与语义检索结合,可以显著提升检索质量,兼顾精确匹配和语义理解。
我在多个搜索系统中实现过混合检索,今天分享一些实战经验。
混合检索架构
核心架构
python
from typing import List, Dict, Any
import numpy as np
from rank_bm25 import BM25Okapi
class HybridRetriever:
    """Combine vector (semantic) and keyword retrieval with a weighted score.

    Both backends must expose ``search(query_or_embedding, top_k=...)`` and
    return a list of dicts carrying at least ``"id"`` and ``"score"``.
    """

    def __init__(self, vector_db, keyword_db=None):
        """Store the two backends and the default fusion weights.

        Args:
            vector_db: Backend queried with the query embedding.
            keyword_db: Backend queried with the raw query string. A new
                ``BM25Index`` is created only when this is ``None``.
        """
        self.vector_db = vector_db
        # Compare against None explicitly: a valid but falsy backend (for
        # example one that defines __len__ and is still empty) must not be
        # silently swapped for a fresh index.
        self.keyword_db = BM25Index() if keyword_db is None else keyword_db
        self.vector_weight = 0.6
        self.keyword_weight = 0.4

    def set_weights(self, vector_weight, keyword_weight):
        """Set the weights applied to the vector and keyword scores."""
        self.vector_weight = vector_weight
        self.keyword_weight = keyword_weight

    def search(self, query: str, query_embedding: np.ndarray, top_k: int = 10):
        """Run both retrievers, fuse their results and return the best ``top_k``.

        Args:
            query: Raw query text, passed to the keyword backend.
            query_embedding: Query vector, passed to the vector backend.
            top_k: Number of fused results to return.

        Returns:
            List of result dicts sorted by ``"hybrid_score"`` descending.
        """
        # The two backends are queried one after the other; each is asked
        # for 2 * top_k candidates before fusion.
        vector_results = self.vector_db.search(query_embedding, top_k=top_k * 2)
        keyword_results = self.keyword_db.search(query, top_k=top_k * 2)
        combined = self._merge_results(vector_results, keyword_results)
        return self._rerank(combined, query, top_k)

    def _merge_results(self, vector_results, keyword_results):
        """Merge both result lists into one entry per document id.

        Every merged entry has ``"vector_score"`` and ``"keyword_score"``
        (0.0 when the document was not returned by that backend). For a
        document returned by both backends the vector item's fields win, and
        fields present only on the keyword item are filled in.
        """
        merged = {}
        for item in vector_results:
            merged[item["id"]] = {
                **item,
                "vector_score": item["score"],
                "keyword_score": 0.0,
            }
        for item in keyword_results:
            doc_id = item["id"]
            entry = merged.get(doc_id)
            if entry is None:
                merged[doc_id] = {
                    **item,
                    "vector_score": 0.0,
                    "keyword_score": item["score"],
                }
                continue
            entry["keyword_score"] = item["score"]
            # Keep keyword-only fields (e.g. "content") instead of dropping
            # them; existing keys, including the vector "score", are kept.
            for key, value in item.items():
                entry.setdefault(key, value)
        return list(merged.values())

    def _rerank(self, results, query, top_k):
        """Score each entry with the weighted sum and return the top ``top_k``.

        ``results`` is sorted in place. ``query`` is unused here; it is part
        of the signature so overrides can use it.
        """
        for item in results:
            item["hybrid_score"] = (
                self.vector_weight * item["vector_score"]
                + self.keyword_weight * item["keyword_score"]
            )
        results.sort(key=lambda entry: entry["hybrid_score"], reverse=True)
        return results[:top_k]
关键词检索实现
python
import jieba
from collections import defaultdict
class BM25Index:
    """In-memory BM25 keyword index over jieba-tokenized documents."""

    def __init__(self):
        """Create an empty index; the BM25 model is built on first search."""
        self.bm25 = None
        self.documents = []
        self.tokenized_docs = []

    def add_document(self, doc_id: str, content: str, metadata: Dict = None):
        """Add one document and mark the BM25 model as stale.

        Args:
            doc_id: Identifier returned as ``"id"`` in search results.
            content: Text that is tokenized and indexed.
            metadata: Optional dict returned with the hit; defaults to ``{}``.
        """
        # Tokenize first so a tokenizer failure cannot leave `documents`
        # and `tokenized_docs` with different lengths.
        tokens = self._tokenize(content)
        self.documents.append({
            "id": doc_id,
            "content": content,
            "metadata": metadata or {},
        })
        self.tokenized_docs.append(tokens)
        # A model built earlier does not cover this document; drop it so
        # the next search rebuilds over the full corpus.
        self.bm25 = None

    def _tokenize(self, text: str):
        """Split ``text`` into tokens with jieba."""
        return list(jieba.cut(text))

    def build_index(self):
        """(Re)build the BM25 model from all documents added so far.

        With no documents the model is left unset: BM25Okapi divides by the
        corpus size when computing the average document length.
        """
        if not self.tokenized_docs:
            self.bm25 = None
            return
        self.bm25 = BM25Okapi(self.tokenized_docs)

    def search(self, query: str, top_k: int = 10):
        """Return up to ``top_k`` documents ranked by BM25 score.

        Args:
            query: Raw query text; tokenized the same way as documents.
            top_k: Maximum number of hits. Values <= 0 yield ``[]``.

        Returns:
            List of dicts with ``"id"``, ``"content"``, ``"metadata"`` and a
            float ``"score"``, best first. Empty when the index is empty.
        """
        if not self.documents or top_k <= 0:
            return []
        if self.bm25 is None:
            self.build_index()
        scores = self.bm25.get_scores(self._tokenize(query))
        # argsort is ascending; reverse it and keep the first top_k indices.
        top_indices = np.argsort(scores)[::-1][:top_k]
        return [
            {
                "id": self.documents[idx]["id"],
                "content": self.documents[idx]["content"],
                "metadata": self.documents[idx]["metadata"],
                "score": float(scores[idx]),
            }
            for idx in top_indices
        ]
高级融合策略
基于分数归一化的融合
python
class NormalizedHybridRetriever(HybridRetriever):
    """Hybrid retriever that min-max normalizes each backend's scores.

    Normalized scores are stored as ``"norm_vector_score"`` and
    ``"norm_keyword_score"`` and are what ``_rerank`` combines.
    """

    @staticmethod
    def _with_normalized(results, norm_key):
        """Return shallow copies of ``results`` with a min-max score added.

        The normalized value is stored under ``norm_key``. When all scores
        are equal (including a single result) every item gets 1.0. The
        input dicts are not modified.
        """
        if not results:
            return []
        scores = [item["score"] for item in results]
        low, high = min(scores), max(scores)
        if high > low:
            span = high - low
            return [
                {**item, norm_key: (item["score"] - low) / span}
                for item in results
            ]
        return [{**item, norm_key: 1.0} for item in results]

    def _merge_results(self, vector_results, keyword_results):
        """Normalize both result lists, then merge them per document id."""
        vector_results = self._with_normalized(vector_results, "norm_vector_score")
        keyword_results = self._with_normalized(keyword_results, "norm_keyword_score")
        merged = super()._merge_results(vector_results, keyword_results)

        # Re-attach the normalized keyword score explicitly so a document
        # returned by both backends does not fall back to the 0.0 default
        # in _rerank, whatever fields the parent merge chose to copy.
        keyword_norm_by_id = {
            item["id"]: item["norm_keyword_score"] for item in keyword_results
        }
        for entry in merged:
            norm = keyword_norm_by_id.get(entry["id"])
            if norm is not None:
                entry["norm_keyword_score"] = norm
        return merged

    def _rerank(self, results, query, top_k):
        """Rank by the weighted sum of normalized scores; return ``top_k``.

        A missing normalized score counts as 0.0, i.e. the document was not
        returned by that backend. ``results`` is sorted in place.
        """
        for item in results:
            item["hybrid_score"] = (
                self.vector_weight * item.get("norm_vector_score", 0.0)
                + self.keyword_weight * item.get("norm_keyword_score", 0.0)
            )
        results.sort(key=lambda entry: entry["hybrid_score"], reverse=True)
        return results[:top_k]
基于学习的重排序
python
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
class LearnedReranker:
    """Rerank retrieval results with a gradient-boosting classifier."""

    def __init__(self):
        """Create an untrained model and feature scaler."""
        self.model = GradientBoostingClassifier()
        self.scaler = StandardScaler()
        self.is_trained = False

    def extract_features(self, query, doc, vector_score, keyword_score):
        """Build the feature vector for one (query, document) pair.

        Features, in order: vector score, keyword score, query/content
        length ratio (0 for empty content), number of whitespace-separated
        query terms found in the content, and the product of both scores.
        """
        content = doc["content"]
        length_ratio = len(query) / len(content) if content else 0
        # NOTE(review): str.split() only separates on whitespace, so a query
        # without spaces is treated as a single term - confirm this is the
        # intended behaviour for Chinese queries.
        term_hits = sum(1 for term in query.split() if term in content)
        return [
            vector_score,
            keyword_score,
            length_ratio,
            term_hits,
            vector_score * keyword_score,
        ]

    def train(self, queries, docs, labels):
        """Fit the scaler and the classifier.

        Args:
            queries: Sequence of query strings.
            docs: Per query, a dict with parallel sequences under ``"docs"``,
                ``"vector_scores"`` and ``"keyword_scores"``.
            labels: Per query, a sequence of relevance labels aligned with
                that query's ``"docs"``.

        Raises:
            ValueError: If the inputs yield no training samples.
        """
        X = []
        y = []
        for query, doc_candidates, relevance in zip(queries, docs, labels):
            for doc, vec_score, key_score, rel in zip(
                doc_candidates["docs"],
                doc_candidates["vector_scores"],
                doc_candidates["keyword_scores"],
                relevance,
            ):
                X.append(self.extract_features(query, doc, vec_score, key_score))
                y.append(rel)
        if not X:
            raise ValueError("no training samples: queries/docs/labels are empty")
        X = self.scaler.fit_transform(X)
        self.model.fit(X, y)
        self.is_trained = True

    def rerank(self, query, results):
        """Sort ``results`` in place by predicted relevance and return it.

        Untrained models and empty result lists are returned unchanged.
        Each item must carry ``"content"``, ``"vector_score"`` and
        ``"keyword_score"``; a float ``"learned_score"`` is added.
        """
        # Guard the empty case: the scaler cannot transform zero samples.
        if not self.is_trained or not results:
            return results
        X = [
            self.extract_features(
                query, item, item["vector_score"], item["keyword_score"]
            )
            for item in results
        ]
        X = self.scaler.transform(X)
        # NOTE(review): column 1 is the second entry of model.classes_;
        # this presumes binary labels with the relevant class sorted last.
        scores = self.model.predict_proba(X)[:, 1]
        for item, score in zip(results, scores):
            # Store a plain float instead of a NumPy scalar.
            item["learned_score"] = float(score)
        results.sort(key=lambda entry: entry["learned_score"], reverse=True)
        return results
完整检索流程
python
class CompleteSearchSystem:
    """Facade that wires indexing, hybrid retrieval and optional reranking."""

    def __init__(self, embedding_model, vector_db):
        """Build a keyword index and a normalized hybrid retriever.

        The retriever shares ``vector_db`` and the new keyword index. No
        reranker is configured initially (``self.reranker`` is ``None``).
        """
        keyword_index = BM25Index()
        self.embedding_model = embedding_model
        self.vector_db = vector_db
        self.keyword_index = keyword_index
        self.hybrid_retriever = NormalizedHybridRetriever(vector_db, keyword_index)
        self.reranker = None

    def index_document(self, doc_id: str, content: str, metadata: Dict = None):
        """Index one document in the keyword index, then in the vector DB."""
        self.keyword_index.add_document(doc_id, content, metadata)
        vector = self.embedding_model.encode(content)
        self.vector_db.upsert(doc_id, vector, metadata)

    def index_batch(self, documents):
        """Index every document dict (``"id"``, ``"content"``, optional ``"metadata"``)."""
        for entry in documents:
            entry_id = entry["id"]
            text = entry["content"]
            self.index_document(entry_id, text, entry.get("metadata"))

    def search(self, query: str, top_k: int = 10):
        """Embed the query, run hybrid retrieval and rerank when configured.

        Returns the hybrid retriever's results, passed through
        ``self.reranker.rerank`` when a reranker has been set.
        """
        embedded_query = self.embedding_model.encode(query)
        hits = self.hybrid_retriever.search(query, embedded_query, top_k=top_k)
        if not self.reranker:
            return hits
        return self.reranker.rerank(query, hits)
总结
混合检索的核心要点:
- 多检索源:关键词+语义双重保障
- 分数融合:权重可调,灵活适配
- 归一化:BM25 分数没有固定上界,而向量相似度通常落在固定区间内,先归一化才能让两种分数可比较
- 学习重排序:进一步提升质量
关键实践:
- 从简单权重融合开始
- 根据场景调整权重
- 考虑使用归一化分数
- 有数据时加入学习重排序