BM25稀疏检索算法笔记

文章目录

BM25 有什么用？

BM25 在 RAG/AI 里的价值就 4 点：

1、和向量检索互补，构成最强混合检索

2、精准匹配关键词、术语、编号，解决向量漂移

3、轻量、快速、无需 GPU、中文友好（配合分词器使用）

4、可解释，适合合规、专业文档

没有 BM25 的 RAG，在专业文档、中文场景、精准查询上，效果通常会明显弱一截。

bm25

bm25-手动实现示例

代码：

python 复制代码

import math
from collections import Counter


class SimpleBM25:
    """Minimal in-memory BM25 ranker over a fixed list of documents.

    Every document is tokenized once at construction time. Document
    lengths, per-document term frequencies, document frequencies and IDF
    values are precomputed so that ``get_score`` only has to combine them.

    Args:
        documents: Sequence of documents to index.
        k1: Term-frequency saturation parameter.
        b: Length-normalisation strength (0 disables it, 1 applies it fully).
        tokenizer: Optional callable mapping a text to an iterable of
            tokens. Defaults to whitespace splitting (``text.split()``),
            which is only suitable for pre-segmented text; unsegmented
            Chinese needs a real segmenter such as jieba. The same
            callable is applied to queries in ``get_score``.
    """

    def __init__(self, documents, k1=1.5, b=0.75, tokenizer=None):
        self.k1 = k1
        self.b = b
        self.documents = documents
        self.N = len(documents)
        self.tokenizer = (
            tokenizer if tokenizer is not None else self._split_on_whitespace
        )

        # 1. Tokenize every document (list() also materialises generators).
        self.tokenized_docs = [list(self.tokenizer(doc)) for doc in documents]

        # 2. Document lengths and their mean. An empty corpus gets an
        #    average of 0.0 instead of raising ZeroDivisionError.
        self.doc_lengths = [len(doc) for doc in self.tokenized_docs]
        self.avg_dl = sum(self.doc_lengths) / self.N if self.N else 0.0

        # 3. Per-document term frequencies, counted once here so that
        #    queries do not have to recount every document.
        self.term_freqs = [Counter(doc) for doc in self.tokenized_docs]

        # 4. Document frequency: {word: number of documents containing it}.
        self.freq_map = {}
        for term_counts in self.term_freqs:
            for word in term_counts:
                self.freq_map[word] = self.freq_map.get(word, 0) + 1

        # 5. IDF variant ln((N - n + 0.5) / (n + 0.5) + 1); the "+ 1" keeps
        #    the value positive even for words present in most documents.
        self.idf = {
            word: math.log((self.N - count + 0.5) / (count + 0.5) + 1)
            for word, count in self.freq_map.items()
        }

    @staticmethod
    def _split_on_whitespace(text):
        """Default tokenizer: split *text* on runs of whitespace."""
        return text.split()

    def get_score(self, query):
        """Score every indexed document against *query*.

        Args:
            query: Query text; tokenized with the same tokenizer as the
                documents. Repeated query tokens contribute once per
                occurrence.

        Returns:
            A list of floats, one BM25 score per document, in the order
            the documents were given. Documents sharing no token with the
            query score 0.0; an empty corpus yields an empty list.
        """
        query_tokens = list(self.tokenizer(query))
        scores = []

        for term_counts, doc_len in zip(self.term_freqs, self.doc_lengths):
            score = 0.0

            # Empty documents cannot match anything. Skipping them also
            # guarantees avg_dl > 0 below, because a non-empty document
            # makes the average length positive.
            if term_counts:
                # The length normalisation depends only on the document,
                # so compute it once rather than once per query token.
                length_norm = self.k1 * (
                    1 - self.b + self.b * (doc_len / self.avg_dl)
                )

                for token in query_tokens:
                    tf = term_counts.get(token, 0)  # term frequency f(q, D)
                    if not tf:
                        continue

                    idf = self.idf.get(token, 0)
                    score += idf * (tf * (self.k1 + 1) / (tf + length_norm))

            scores.append(score)

        return scores


# --- Sample corpus (tokens are already separated by spaces) ---
docs = [
    "自然语言处理 是 人工智能 的 核心",
    "人工智能 和 机器学习 都 很 重要",
    "深度学习 是 机器学习 的 一种",
    "自然语言处理 需要 大量 数据",
]

# Build the index once, then score a two-term query against every document.
query = "自然语言处理 人工智能"
bm25 = SimpleBM25(docs)
scores = bm25.get_score(query)

divider = "-" * 30

# Per-document scores, in corpus order.
print(f"查询: '{query}'")
print(divider)
for i, score in enumerate(scores):
    print(f"文档 {i + 1}: {docs[i]}", f"得分: {score:.4f}", sep="\n")
print(divider)

# Document indices ordered from best to worst score.
ranked_indices = list(range(len(scores)))
ranked_indices.sort(key=scores.__getitem__, reverse=True)

print("排序后的最佳匹配:")
for idx in ranked_indices:
    print(f"[{scores[idx]:.4f}] {docs[idx]}")

输出结果：

bash 复制代码

查询: '自然语言处理 人工智能'
------------------------------
文档 1: 自然语言处理 是 人工智能 的 核心
得分: 1.3863
文档 2: 人工智能 和 机器学习 都 很 重要
得分: 0.6359
文档 3: 深度学习 是 机器学习 的 一种
得分: 0.0000
文档 4: 自然语言处理 需要 大量 数据
得分: 0.7617
------------------------------
排序后的最佳匹配:
[1.3863] 自然语言处理 是 人工智能 的 核心
[0.7617] 自然语言处理 需要 大量 数据
[0.6359] 人工智能 和 机器学习 都 很 重要
[0.0000] 深度学习 是 机器学习 的 一种

bm25-jieba分词示例

python 复制代码

from rank_bm25 import BM25Okapi
import jieba

# 1. Sample Chinese corpus (unsegmented sentences).
documents = [
    "自然语言处理是人工智能的核心领域，广泛应用于搜索引擎。",
    "机器学习和深度学习是人工智能的重要分支。",
    "搜索引擎需要使用自然语言处理技术来理解用户查询。",
    "今天天气不错，适合出去打球。",  # unrelated document
    "自然语言处理技术在医疗领域的应用也在不断增加。",
]


def tokenize(text):
    """Segment *text* with jieba, dropping whitespace-only tokens.

    jieba yields the whitespace between words as tokens of their own, so
    a query typed with spaces would otherwise carry meaningless " " terms.
    """
    return [token for token in jieba.cut(text) if token.strip()]


# 2. Word segmentation (the key step for Chinese): BM25Okapi expects a
#    list of token lists, one per document.
tokenized_docs = [tokenize(doc) for doc in documents]

# 3. Build the BM25 index.
bm25 = BM25Okapi(tokenized_docs)

# 4. The query must go through the same tokenizer as the documents.
query = "自然语言处理 搜索引擎"
tokenized_query = tokenize(query)

# 5. One score per document, in corpus order.
scores = bm25.get_scores(tokenized_query)


# 6. 【修正点】手动排序并提取 Top N
def get_top_n_documents(scores, documents, n=3):
    """Return the *n* best-scoring documents as ranked triples.

    ``scores`` and ``documents`` are paired positionally. Each result is a
    ``(score, original_index, document)`` tuple; the list is ordered by
    score, highest first, and documents with equal scores keep their
    original relative order.
    """
    triples = []
    for position, (value, text) in enumerate(zip(scores, documents)):
        triples.append((value, position, text))

    # Compare on the score alone; the sort is stable, so ties stay in
    # corpus order.
    triples.sort(key=lambda triple: triple[0], reverse=True)

    return triples[:n]


# Number of hits to keep from the ranking.
top_n = 3
top_results = get_top_n_documents(scores, documents, n=top_n)

# 7. Report the query, then every ranked hit.
print(f"查询: '{query}'")
print(f"分词后查询: {tokenized_query}")
print("-" * 50)

for rank, (score, idx, doc) in enumerate(top_results, 1):
    if not score > 0:
        # A zero score inside the top N means fewer than N documents
        # actually matched the query.
        print(f"Rank {rank}: 无相关文档 (得分 0)")
        continue

    print(f"Rank {rank} (得分: {score:.4f}):")
    print(f"  原文: {doc}")
    print(f"  分词预览: {'/'.join(list(jieba.cut(doc))[:5])}...")
    print()

# Bare index list, for callers that only need the positions of the hits.
top_indices = [hit[1] for hit in top_results]
print(f"Top {top_n} 文档索引列表: {top_indices}")

bm25-idf示例

期待：

特别高频出现的词（如“的”），得分反而低。

而“量子力学”这样的稀有词，得分反而高。

代码：

python 复制代码

from rank_bm25 import BM25Okapi
import math

# Build an extreme corpus to isolate the effect of IDF:
#   documents 1-99: filler made of the word "的" repeated 10 times
#   document 100:   the single rare term "量子力学"
# The comprehension gives every filler document its own list object
# instead of 99 references to one shared list.
documents = [["的"] * 10 for _ in range(99)] + [["量子力学"]]

# NOTE: "的" is deliberately not filtered out, to show how BM25 treats it.
tokenized_docs = documents

bm25 = BM25Okapi(tokenized_docs)

# Case A: query the common word "的".
query_common = ["的"]
scores_common = bm25.get_scores(query_common)
print(f"搜索 '的' (出现在 99 篇文档):")
print(f"  前 5 篇得分: {scores_common[:5]}")  # expected: all very low
print(f"  最后一篇(无'的')得分: {scores_common[-1]}")  # expected: 0

# Case B: query the rare term "量子力学".
query_rare = ["量子力学"]
scores_rare = bm25.get_scores(query_rare)
print(f"\n搜索 '量子力学' (只出现在 1 篇文档):")
# Only the first five scores are printed, so the label says so.
print(f"  前 5 篇得分: {scores_rare[:5]}")  # expected: all 0
print(f"  最后一篇得分: {scores_rare[-1]:.4f}")  # expected: very high

# Compare the rare term's hit with a common-word hit, and report the
# outcome either way rather than staying silent on failure.
if scores_rare[-1] > scores_common[0]:
    print("\n✅ 验证成功：稀有词的得分远高于常见词，这就是 IDF 的作用！")
else:
    print("\n❌ 验证失败：稀有词的得分没有高于常见词，请检查语料或 BM25 参数。")