混合检索实现:关键词+语义检索的完美结合

混合检索实现:关键词+语义检索的完美结合

前言

单一的检索方式往往无法满足复杂需求。将关键词检索与语义检索结合,可以显著提升检索质量,兼顾精确匹配和语义理解。

我在多个搜索系统中实现过混合检索,今天分享一些实战经验。

混合检索架构

核心架构

python 复制代码
from typing import List, Dict, Any
import numpy as np
from rank_bm25 import BM25Okapi

class HybridRetriever:
    """Fuse results from a vector (semantic) backend and a keyword backend.

    Both backends are called as ``search(query, top_k=...)`` and their results
    are read as dicts carrying at least an ``"id"`` and a ``"score"`` key.
    The final ranking is a weighted sum of the two raw scores.
    """

    def __init__(self, vector_db, keyword_db=None):
        """Store the two backends and the default fusion weights.

        Args:
            vector_db: Backend queried with the query embedding.
            keyword_db: Backend queried with the raw query string. When
                omitted, a fresh ``BM25Index`` is created.
        """
        self.vector_db = vector_db
        # Test against None explicitly: a caller-supplied backend may be falsy
        # (e.g. it defines __len__ and is still empty) and must not be
        # silently swapped for a new index.
        self.keyword_db = BM25Index() if keyword_db is None else keyword_db
        self.vector_weight = 0.6
        self.keyword_weight = 0.4

    def set_weights(self, vector_weight, keyword_weight):
        """Replace the weights applied to the vector and keyword scores."""
        self.vector_weight = vector_weight
        self.keyword_weight = keyword_weight

    def search(self, query: str, query_embedding: np.ndarray, top_k: int = 10):
        """Run both searches, merge the candidates and return the best ``top_k``.

        Args:
            query: Raw query text, passed to the keyword backend.
            query_embedding: Query vector, passed to the vector backend.
            top_k: Number of fused results to return.

        Returns:
            Merged result dicts sorted by descending ``"hybrid_score"``.
        """
        # The two lookups run one after the other. Each backend is asked for
        # twice as many candidates as requested so the fusion step has a
        # wider pool to choose from.
        vector_results = self.vector_db.search(query_embedding, top_k=top_k * 2)
        keyword_results = self.keyword_db.search(query, top_k=top_k * 2)

        combined = self._merge_results(vector_results, keyword_results)
        return self._rerank(combined, query, top_k)

    def _merge_results(self, vector_results, keyword_results):
        """Union both result lists by document id.

        Every merged entry carries ``"vector_score"`` and ``"keyword_score"``;
        a score is 0.0 when the document was missing from that backend's list.
        """
        merged = {}

        for item in vector_results:
            merged[item["id"]] = {
                **item,
                "vector_score": item["score"],
                "keyword_score": 0.0,
            }

        for item in keyword_results:
            doc_id = item["id"]
            if doc_id in merged:
                # Already seen in the vector results: only fill in the score.
                merged[doc_id]["keyword_score"] = item["score"]
            else:
                merged[doc_id] = {
                    **item,
                    "vector_score": 0.0,
                    "keyword_score": item["score"],
                }

        return list(merged.values())

    def _rerank(self, results, query, top_k):
        """Attach ``"hybrid_score"`` to each entry and return the top ``top_k``.

        ``results`` is annotated and sorted in place. ``query`` is not used by
        this implementation.
        """
        for item in results:
            item["hybrid_score"] = (
                self.vector_weight * item["vector_score"]
                + self.keyword_weight * item["keyword_score"]
            )

        results.sort(key=lambda entry: entry["hybrid_score"], reverse=True)
        return results[:top_k]

关键词检索实现

python 复制代码
import jieba
from collections import defaultdict

class BM25Index:
    """In-memory BM25 keyword index over documents tokenized with jieba.

    The BM25 model is built lazily on the first search and rebuilt after any
    later ``add_document`` call, so search results always cover every
    document added so far.
    """

    def __init__(self):
        self.bm25 = None          # built lazily; None means "needs (re)build"
        self.documents = []       # stored documents, parallel to tokenized_docs
        self.tokenized_docs = []  # token lists handed to BM25Okapi

    def add_document(self, doc_id: str, content: str, metadata: Dict = None):
        """Add one document and mark the BM25 model as stale.

        Args:
            doc_id: Identifier returned as ``"id"`` in search results.
            content: Text that is tokenized and scored.
            metadata: Optional dict stored with the document; defaults to ``{}``.
        """
        # Tokenize first so a tokenizer failure cannot leave the two parallel
        # lists with different lengths.
        tokens = self._tokenize(content)
        self.documents.append({
            "id": doc_id,
            "content": content,
            "metadata": metadata or {}
        })
        self.tokenized_docs.append(tokens)
        # A model built earlier does not know about this document; drop it so
        # the next search rebuilds over the full corpus.
        self.bm25 = None

    def _tokenize(self, text: str):
        """Split ``text`` into a list of tokens with ``jieba.cut``."""
        return list(jieba.cut(text))

    def build_index(self):
        """(Re)build the BM25 model from all documents added so far.

        With no documents the model is left unset, because BM25Okapi cannot
        be constructed from an empty corpus.
        """
        if not self.tokenized_docs:
            self.bm25 = None
            return
        self.bm25 = BM25Okapi(self.tokenized_docs)

    def search(self, query: str, top_k: int = 10):
        """Return up to ``top_k`` documents ranked by BM25 score.

        Args:
            query: Query text; tokenized the same way as the documents.
            top_k: Maximum number of results. Non-positive values yield ``[]``.

        Returns:
            A list of dicts with ``"id"``, ``"content"``, ``"metadata"`` and a
            float ``"score"``, best match first. Empty when the index holds
            no documents.
        """
        if not self.documents or top_k <= 0:
            return []

        if self.bm25 is None:
            self.build_index()

        scores = self.bm25.get_scores(self._tokenize(query))

        # Indices of the highest-scoring documents, best first.
        top_indices = np.argsort(scores)[::-1][:top_k]

        return [
            {
                "id": self.documents[idx]["id"],
                "content": self.documents[idx]["content"],
                "metadata": self.documents[idx]["metadata"],
                "score": float(scores[idx]),
            }
            for idx in top_indices
        ]

高级融合策略

基于分数归一化的融合

python 复制代码
class NormalizedHybridRetriever(HybridRetriever):
    """Hybrid retriever that min-max normalizes each score list before fusing.

    Vector and keyword scores are each rescaled to [0, 1] within their own
    result list, so the fusion weights act on comparable values.
    """

    @staticmethod
    def _min_max(scores):
        """Rescale ``scores`` to [0, 1]; a list of identical values maps to 1.0."""
        if not scores:
            return []
        low, high = min(scores), max(scores)
        if high > low:
            span = high - low
            return [(score - low) / span for score in scores]
        return [1.0] * len(scores)

    def _merge_results(self, vector_results, keyword_results):
        """Union both result lists by id, carrying raw and normalized scores.

        Every merged entry has ``"vector_score"``, ``"keyword_score"``,
        ``"norm_vector_score"`` and ``"norm_keyword_score"``; a score is 0.0
        when the document was missing from that backend's list. The input
        dicts are not modified.
        """
        vector_norms = self._min_max([item["score"] for item in vector_results])
        keyword_norms = self._min_max([item["score"] for item in keyword_results])

        merged = {}
        for item, norm in zip(vector_results, vector_norms):
            merged[item["id"]] = {
                **item,
                "vector_score": item["score"],
                "keyword_score": 0.0,
                "norm_vector_score": norm,
                "norm_keyword_score": 0.0,
            }

        for item, norm in zip(keyword_results, keyword_norms):
            entry = merged.get(item["id"])
            if entry is None:
                merged[item["id"]] = {
                    **item,
                    "vector_score": 0.0,
                    "keyword_score": item["score"],
                    "norm_vector_score": 0.0,
                    "norm_keyword_score": norm,
                }
            else:
                # Document found by both backends: the normalized keyword
                # score must be recorded here too, otherwise the keyword
                # signal is lost for exactly the documents both backends
                # agree on.
                entry["keyword_score"] = item["score"]
                entry["norm_keyword_score"] = norm

        return list(merged.values())

    def _rerank(self, results, query, top_k):
        """Rank by the weighted sum of normalized scores; return the top ``top_k``.

        ``results`` is annotated with ``"hybrid_score"`` and sorted in place.
        A missing normalized score counts as 0.0. ``query`` is not used.
        """
        for item in results:
            item["hybrid_score"] = (
                self.vector_weight * item.get("norm_vector_score", 0.0)
                + self.keyword_weight * item.get("norm_keyword_score", 0.0)
            )

        results.sort(key=lambda entry: entry["hybrid_score"], reverse=True)
        return results[:top_k]

基于学习的重排序

python 复制代码
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler

class LearnedReranker:
    """Re-rank hybrid search results with a trained gradient-boosting classifier.

    Until ``train`` has been called, ``rerank`` returns its input unchanged.
    """

    def __init__(self):
        self.model = GradientBoostingClassifier()
        self.scaler = StandardScaler()
        self.is_trained = False

    def extract_features(self, query, doc, vector_score, keyword_score):
        """Build the feature vector for one (query, document) pair.

        Args:
            query: Query text.
            doc: Result dict; its ``"content"`` text is used when present.
            vector_score: Score from the vector backend.
            keyword_score: Score from the keyword backend.

        Returns:
            ``[vector_score, keyword_score, query/content length ratio,
            number of whitespace-separated query terms found in the content,
            vector_score * keyword_score]``.
        """
        # Candidates that came only from the vector backend may carry no
        # text; treat missing content as empty instead of raising KeyError.
        content = doc.get("content") or ""
        length_ratio = len(query) / len(content) if content else 0
        term_hits = sum(1 for term in query.split() if term in content)
        return [
            vector_score,
            keyword_score,
            length_ratio,
            term_hits,
            vector_score * keyword_score,
        ]

    def train(self, queries, docs, labels):
        """Fit the scaler and the classifier on labelled candidates.

        Args:
            queries: One query string per training example.
            docs: Per query, a dict with parallel lists under ``"docs"``,
                ``"vector_scores"`` and ``"keyword_scores"``.
            labels: Per query, the relevance labels aligned with its docs.
        """
        X = []
        y = []

        for query, doc_candidates, relevance in zip(queries, docs, labels):
            for doc, vec_score, key_score, rel in zip(
                doc_candidates["docs"],
                doc_candidates["vector_scores"],
                doc_candidates["keyword_scores"],
                relevance
            ):
                X.append(self.extract_features(query, doc, vec_score, key_score))
                y.append(rel)

        X = self.scaler.fit_transform(X)
        self.model.fit(X, y)
        self.is_trained = True

    def rerank(self, query, results):
        """Sort ``results`` in place by predicted relevance and return them.

        Each entry gains a float ``"learned_score"``, taken from column 1 of
        ``predict_proba``. The list is returned untouched when the model is
        untrained or when ``results`` is empty.
        """
        # An empty candidate list has nothing to score, and the scaler
        # rejects empty input, so return early.
        if not self.is_trained or not results:
            return results

        X = [
            self.extract_features(
                query,
                item,
                item["vector_score"],
                item["keyword_score"]
            )
            for item in results
        ]

        X = self.scaler.transform(X)
        scores = self.model.predict_proba(X)[:, 1]

        for item, score in zip(results, scores):
            # Store a plain float so results stay JSON-serializable.
            item["learned_score"] = float(score)

        results.sort(key=lambda entry: entry["learned_score"], reverse=True)

        return results

完整检索流程

python 复制代码
class CompleteSearchSystem:
    """End-to-end search: indexing, hybrid retrieval and optional re-ranking."""

    def __init__(self, embedding_model, vector_db):
        """Wire the embedding model and vector store to a new keyword index."""
        self.embedding_model = embedding_model
        self.vector_db = vector_db
        self.keyword_index = BM25Index()
        self.hybrid_retriever = NormalizedHybridRetriever(vector_db, self.keyword_index)
        # No learned re-ranker by default; assign one to enable that stage.
        self.reranker = None

    def index_document(self, doc_id: str, content: str, metadata: Dict = None):
        """Register one document in the keyword index, then in the vector store."""
        self.keyword_index.add_document(doc_id, content, metadata)
        self.vector_db.upsert(
            doc_id,
            self.embedding_model.encode(content),
            metadata,
        )

    def index_batch(self, documents):
        """Index every document dict (``"id"``, ``"content"``, optional ``"metadata"``)."""
        for entry in documents:
            self.index_document(
                entry["id"],
                entry["content"],
                entry.get("metadata"),
            )

    def search(self, query: str, top_k: int = 10):
        """Embed the query, run hybrid retrieval and re-rank if a re-ranker is set."""
        candidates = self.hybrid_retriever.search(
            query,
            self.embedding_model.encode(query),
            top_k=top_k,
        )

        # The learned re-ranking stage is optional.
        if not self.reranker:
            return candidates
        return self.reranker.rerank(query, candidates)

总结

混合检索的核心要点:

  1. 多检索源:关键词+语义双重保障
  2. 分数融合:权重可调,灵活适配
  3. 归一化:确保不同分数可比较
  4. 学习重排序:进一步提升质量

关键实践:

  • 从简单权重融合开始
  • 根据场景调整权重
  • 考虑使用归一化分数
  • 有数据时加入学习重排序
相关推荐
染指11105 小时前
26.RAG进阶(Advanced RAG)-假设性问题索引
人工智能·windows·agent·rag·advanced rag
闵孚龙5 小时前
动态图机制:为什么 PyTorch 调试起来更舒服
人工智能·pytorch·python
甲维斯6 小时前
还要啥Codex!DeepSeek接入Zcode远程连接!
人工智能
百胜软件@百胜软件6 小时前
百胜软件亮相“AI消费新生活”主题日活动,AI智能运营平台入选市级案例征集
人工智能·生活·零售数字化·数智中台·珠宝行业
专注搞钱7 小时前
GPT-4o写设备Recipe:从3小时到10分钟
数据库·人工智能·gpt·半导体
闻道参看7 小时前
贝芯宠AI灵兽 ELFVET 大模型聚焦临床应用,强化宠物诊疗综合能力
人工智能·宠物
MartinYeung57 小时前
[论文学习]重新思考大型语言模型忘却目标:梯度视角与超越
人工智能·学习·语言模型
财经资讯数据_灵砚智能7 小时前
基于全球经济类多源新闻的NLP情感分析与数据可视化(夜间-次晨)2026年6月14日
大数据·人工智能·python·ai·信息可视化·自然语言处理·灵砚智能
二哈赛车手7 小时前
新人笔记---最终版智能体图片分析完整方案,包括一些总结于经验,以及各种优化点讲解
java·笔记·spring·ai·springboot
m0_380167147 小时前
加密货币价格 API、市场数据 API 与 分析 API 有什么区别?
人工智能·ai·区块链