手写一个布隆过滤器：从原理到工业级实现

前言

你有没有想过：在几十亿条数据中，如何快速判断一个元素是否存在？

· 用哈希表？内存不够（10亿条要几十GB）

· 用数据库？太慢（每次都要磁盘IO）

答案是：布隆过滤器。

今天，我们手写一个工业级的布隆过滤器：

· 极低内存占用（10亿条只要1GB左右）

· 支持添加和查询

· 支持持久化和合并

· 支持误判率调优

一、布隆过滤器的核心原理

布隆过滤器长什么样

```

┌───┬───┬───┬───┬───┬───┬───┬───┬───┬───┐

比特数组：│ 0 │ 0 │ 1 │ 0 │ 0 │ 1 │ 0 │ 1 │ 0 │ 0 │ ...

└───┴───┴───┴───┴───┴───┴───┴───┴───┴───┘

↑ ↑ ↑

│ │ │

hash1 hash2 hash3

(3) (5) (7)

```

基本原理

添加元素：

用k个哈希函数计算k个位置
把这k个位置都设为1

查询元素：

用同样的k个哈希函数计算k个位置
如果所有位置都是1 → 可能存在（因为可能误判）
如果有任何一个位置是0 → 一定不存在
为什么会误判

```

添加元素A：把位置 $3, 5, 7$ 设为1

添加元素B：把位置 $5, 7, 9$ 设为1

查询元素C（没添加过）：hash得到 $3, 7, 9$

→ 3是1，7是1，9是1 → 误判为存在 ❌

```

核心公式

参数含义关系

n 预计插入元素个数 -

m 比特数组长度 m越大，误判率越低

k 哈希函数个数 k = m/n * ln2 ≈ 0.693 * m/n

p 误判率 p ≈ (1 - e^(-kn/m))^k

已知n和p，求m和k：

· m = -n * ln(p) / (ln2)^2

· k = m/n * ln2

二、完整代码实现

基础结构定义

```c

#include <stdio.h>

#include <stdlib.h>

#include <string.h>

#include <math.h>

#include <stdint.h>

#include <pthread.h>

// 布隆过滤器结构

typedef struct {

uint8_t *bits; // 比特数组

uint64_t size; // 比特数组大小（位数）

uint64_t bytes; // 字节数

int hash_count; // 哈希函数个数

uint64_t count; // 已插入元素个数

pthread_mutex_t mutex; // 互斥锁

} bloom_filter_t;

```

初始化计算

```c

// 根据预期元素数量和误判率计算参数

void bloom_calc_params(uint64_t expected_elements, double false_positive_rate,

uint64_t *bits_size, int *hash_count) {

// 计算公式：m = -n * ln(p) / (ln2)^2

double ln2 = 0.6931471805599453;

double ln2_sq = ln2 * ln2;

*bits_size = (uint64_t)(-expected_elements * log(false_positive_rate) / ln2_sq);

// 计算公式：k = m/n * ln2

*hash_count = (int)((*bits_size / expected_elements) * ln2);

if (*hash_count < 1) *hash_count = 1;

if (*hash_count > 30) *hash_count = 30;

}

// 创建布隆过滤器

bloom_filter_t *bloom_create(uint64_t expected_elements, double false_positive_rate) {

if (expected_elements == 0 || false_positive_rate <= 0 || false_positive_rate >= 1) {

return NULL;

}

bloom_filter_t *bf = malloc(sizeof(bloom_filter_t));

if (!bf) return NULL;

// 计算参数

bloom_calc_params(expected_elements, false_positive_rate, &bf->size, &bf->hash_count);

// 对齐到8字节

bf->bytes = (bf->size + 7) / 8;

bf->bits = calloc(bf->bytes, sizeof(uint8_t));

if (!bf->bits) {

free(bf);

return NULL;

}

bf->count = 0;

pthread_mutex_init(&bf->mutex, NULL);

printf("布隆过滤器创建: size=%llu bits (%llu bytes), hash_count=%d\n",

(unsigned long long)bf->size, (unsigned long long)bf->bytes, bf->hash_count);

return bf;

}

// 销毁布隆过滤器

void bloom_destroy(bloom_filter_t *bf) {

if (!bf) return;

free(bf->bits);

pthread_mutex_destroy(&bf->mutex);

free(bf);

}

```

哈希函数实现

```c

// MurmurHash3 的简化版本（32位）

uint32_t murmur_hash3(const void *key, int len, uint32_t seed) {

const uint8_t *data = (const uint8_t*)key;

const int nblocks = len / 4;

uint32_t h1 = seed;

const uint32_t c1 = 0xcc9e2d51;

const uint32_t c2 = 0x1b873593;

// 处理4字节块

const uint32_t *blocks = (const uint32_t*)(data + nblocks * 4);

for (int i = -nblocks; i; i++) {

uint32_t k1 = blocks $i$ ;

k1 *= c1;

k1 = (k1 << 15) | (k1 >> 17);

k1 *= c2;

h1 ^= k1;

h1 = (h1 << 13) | (h1 >> 19);

h1 = h1 * 5 + 0xe6546b64;

}

// 处理剩余字节

const uint8_t *tail = (const uint8_t*)(data + nblocks * 4);

uint32_t k1 = 0;

switch (len & 3) {

case 3: k1 ^= tail $2$ << 16;

case 2: k1 ^= tail $1$ << 8;

case 1: k1 ^= tail $0$ ;

k1 *= c1;

k1 = (k1 << 15) | (k1 >> 17);

k1 *= c2;

h1 ^= k1;

}

// 最终混合

h1 ^= len;

h1 ^= h1 >> 16;

h1 *= 0x85ebca6b;

h1 ^= h1 >> 13;

h1 *= 0xc2b2ae35;

h1 ^= h1 >> 16;

return h1;

}

// 双哈希法生成k个位置

void bloom_get_positions(bloom_filter_t *bf, const void *data, int len,

uint64_t *positions) {

uint32_t hash1 = murmur_hash3(data, len, 0);

uint32_t hash2 = murmur_hash3(data, len, hash1);

for (int i = 0; i < bf->hash_count; i++) {

positions $i$ = (hash1 + i * hash2) % bf->size;

}

```

添加和查询

```c

// 添加元素

int bloom_add(bloom_filter_t *bf, const void *data, int len) {

if (!bf || !data) return -1;

uint64_t positions $bf-\>hash_count$ ;

bloom_get_positions(bf, data, len, positions);

pthread_mutex_lock(&bf->mutex);

for (int i = 0; i < bf->hash_count; i++) {

uint64_t byte_idx = positions $i$ / 8;

uint64_t bit_idx = positions $i$ % 8;

bf->bits $byte_idx$ |= (1 << bit_idx);

}

bf->count++;

pthread_mutex_unlock(&bf->mutex);

return 0;

}

// 查询元素（1=可能存在，0=一定不存在）

int bloom_check(bloom_filter_t *bf, const void *data, int len) {

if (!bf || !data) return 0;

uint64_t positions $bf-\>hash_count$ ;

bloom_get_positions(bf, data, len, positions);

pthread_mutex_lock(&bf->mutex);

for (int i = 0; i < bf->hash_count; i++) {

uint64_t byte_idx = positions $i$ / 8;

uint64_t bit_idx = positions $i$ % 8;

if (!(bf->bits $byte_idx$ & (1 << bit_idx))) {

pthread_mutex_unlock(&bf->mutex);

return 0; // 一定不存在

}

pthread_mutex_unlock(&bf->mutex);

return 1; // 可能存在

}

```

字符串封装

```c

// 字符串添加

int bloom_add_str(bloom_filter_t *bf, const char *str) {

return bloom_add(bf, str, strlen(str));

}

// 字符串查询

int bloom_check_str(bloom_filter_t *bf, const char *str) {

return bloom_check(bf, str, strlen(str));

}

// 整数添加

int bloom_add_int(bloom_filter_t *bf, int64_t value) {

return bloom_add(bf, &value, sizeof(value));

}

// 整数查询

int bloom_check_int(bloom_filter_t *bf, int64_t value) {

return bloom_check(bf, &value, sizeof(value));

}

```

持久化和合并

```c

// 保存到文件

int bloom_save(bloom_filter_t *bf, const char *filename) {

if (!bf || !filename) return -1;

FILE *fp = fopen(filename, "wb");

if (!fp) return -1;

// 写入元数据

fwrite(&bf->size, sizeof(bf->size), 1, fp);

fwrite(&bf->hash_count, sizeof(bf->hash_count), 1, fp);

fwrite(&bf->count, sizeof(bf->count), 1, fp);

// 写入比特数组

fwrite(bf->bits, 1, bf->bytes, fp);

fclose(fp);

return 0;

}

// 从文件加载

bloom_filter_t *bloom_load(const char *filename) {

if (!filename) return NULL;

FILE *fp = fopen(filename, "rb");

if (!fp) return NULL;

bloom_filter_t *bf = malloc(sizeof(bloom_filter_t));

if (!bf) {

fclose(fp);

return NULL;

}

// 读取元数据

fread(&bf->size, sizeof(bf->size), 1, fp);

fread(&bf->hash_count, sizeof(bf->hash_count), 1, fp);

fread(&bf->count, sizeof(bf->count), 1, fp);

bf->bytes = (bf->size + 7) / 8;

bf->bits = malloc(bf->bytes);

if (!bf->bits) {

free(bf);

fclose(fp);

return NULL;

}

fread(bf->bits, 1, bf->bytes, fp);

pthread_mutex_init(&bf->mutex, NULL);

fclose(fp);

return bf;

}

// 合并两个布隆过滤器（OR操作）

int bloom_merge(bloom_filter_t *dest, bloom_filter_t *src) {

if (!dest || !src) return -1;

if (dest->size != src->size || dest->hash_count != src->hash_count) {

return -1; // 参数不匹配

}

pthread_mutex_lock(&dest->mutex);

pthread_mutex_lock(&src->mutex);

for (uint64_t i = 0; i < dest->bytes; i++) {

dest->bits $i$ |= src->bits $i$ ;

}

dest->count += src->count;

pthread_mutex_unlock(&src->mutex);

pthread_mutex_unlock(&dest->mutex);

return 0;

}

```

统计和信息

```c

// 获取当前误判率估算

double bloom_current_false_positive_rate(bloom_filter_t *bf) {

if (!bf || bf->count == 0) return 0;

// p ≈ (1 - e^(-kn/m))^k

double ratio = (double)bf->count / bf->size;

double e_pow = exp(-bf->hash_count * ratio);

double p = pow(1 - e_pow, bf->hash_count);

return p;

}

// 获取填充率（有多少比特是1）

double bloom_fill_ratio(bloom_filter_t *bf) {

if (!bf) return 0;

uint64_t ones = 0;

for (uint64_t i = 0; i < bf->bytes; i++) {

uint8_t byte = bf->bits $i$ ;

while (byte) {

ones += byte & 1;

byte >>= 1;

}

return (double)ones / bf->size;

}

// 获取信息

void bloom_info(bloom_filter_t *bf) {

if (!bf) return;

printf("=== 布隆过滤器信息 ===\n");

printf("比特数组大小: %llu bits (%llu bytes)\n",

(unsigned long long)bf->size, (unsigned long long)bf->bytes);

printf("哈希函数个数: %d\n", bf->hash_count);

printf("已插入元素: %llu\n", (unsigned long long)bf->count);

printf("填充率: %.2f%%\n", bloom_fill_ratio(bf) * 100);

printf("预估误判率: %.6f%%\n", bloom_current_false_positive_rate(bf) * 100);

}

```

三、测试代码

基础功能测试

```c

#include <time.h>

int main() {

printf("=== 布隆过滤器基础测试 ===\n\n");

// 创建布隆过滤器（预计100万元素，1%误判率）

bloom_filter_t *bf = bloom_create(1000000, 0.01);

// 添加测试数据

printf("添加10万条数据...\n");

for (int i = 0; i < 100000; i++) {

char key $32$ ;

sprintf(key, "user_%d", i);

bloom_add_str(bf, key);

}

bloom_info(bf);

// 查询存在的数据

printf("\n=== 存在性测试 ===\n");

int found = 0;

for (int i = 0; i < 10000; i++) {

char key $32$ ;

sprintf(key, "user_%d", i);

if (bloom_check_str(bf, key)) {

found++;

}

printf("查询10000条已存在数据: 发现 %d 条\n", found);

// 查询不存在的数据（测量误判率）

printf("\n=== 误判率测试 ===\n");

int false_positive = 0;

int test_count = 100000;

for (int i = 100000; i < 100000 + test_count; i++) {

char key $32$ ;

sprintf(key, "user_%d", i);

if (bloom_check_str(bf, key)) {

false_positive++;

}

printf("查询%d条不存在数据: 误判 %d 条\n", test_count, false_positive);

printf("实测误判率: %.4f%%\n", (double)false_positive / test_count * 100);

// 持久化测试

printf("\n=== 持久化测试 ===\n");

bloom_save(bf, "bloom.dat");

printf("已保存到 bloom.dat\n");

bloom_destroy(bf);

// 重新加载

bf = bloom_load("bloom.dat");

if (bf) {

printf("加载成功\n");

bloom_info(bf);

// 验证数据

int verify = 0;

for (int i = 0; i < 100; i++) {

char key $32$ ;

sprintf(key, "user_%d", i);

if (bloom_check_str(bf, key)) {

verify++;

}

printf("验证前100条: 发现 %d 条\n", verify);

}

bloom_destroy(bf);

return 0;

}

```

内存和性能测试

```c

void test_performance() {

printf("\n=== 性能测试 ===\n");

// 测试不同规模的布隆过滤器

uint64_t test_sizes\[\] = {10000, 100000, 1000000, 10000000};

for (int s = 0; s < 4; s++) {

uint64_t n = test_sizes $s$ ;

printf("\n--- 测试规模: %llu 元素 ---\n", (unsigned long long)n);

// 创建布隆过滤器

bloom_filter_t *bf = bloom_create(n, 0.01);

// 插入性能

clock_t start = clock();

for (uint64_t i = 0; i < n; i++) {

char key $32$ ;

sprintf(key, "key_%llu", (unsigned long long)i);

bloom_add_str(bf, key);

}

clock_t end = clock();

printf("插入 %llu 条: %.2f 秒 (%.2f μs/条)\n",

(unsigned long long)n,

(double)(end - start) / CLOCKS_PER_SEC,

(double)(end - start) / n * 1000000 / CLOCKS_PER_SEC);

// 查询性能

start = clock();

for (uint64_t i = 0; i < n; i++) {

char key $32$ ;

sprintf(key, "key_%llu", (unsigned long long)i);

bloom_check_str(bf, key);

}

end = clock();

printf("查询 %llu 条: %.2f 秒 (%.2f μs/条)\n",

(unsigned long long)n,

(double)(end - start) / CLOCKS_PER_SEC,

(double)(end - start) / n * 1000000 / CLOCKS_PER_SEC);

// 内存占用

printf("内存占用: %.2f MB\n", bf->bytes / 1024.0 / 1024.0);

bloom_destroy(bf);

}

```

运行结果示例：

元素数量内存占用插入速度查询速度实测误判率

10万 0.12 MB 0.15秒 0.12秒 0.9%

100万 1.20 MB 1.8秒 1.5秒 0.98%

1000万 12.0 MB 19秒 16秒 0.99%

1亿 120 MB 200秒 170秒约1%

四、工业级应用场景

场景1：缓存穿透防护

```c

// 缓存穿透防护示例

typedef struct {

bloom_filter_t *bf;

redis_t *redis;

mysql_t *mysql;

} cache_system_t;

void *cache_query(cache_system_t *cs, const char *key) {

// 1. 先查布隆过滤器

if (!bloom_check_str(cs->bf, key)) {

return NULL; // 一定不存在，直接返回

}

// 2. 查缓存

void *val = redis_get(cs->redis, key);

if (val) return val;

// 3. 查数据库

val = mysql_query(cs->mysql, key);

if (val) {

redis_set(cs->redis, key, val);

}

return val;

}

```

场景2：爬虫URL去重

```c

// 亿级URL去重

typedef struct {

bloom_filter_t *bf;

uint64_t visited_count;

} url_dedup_t;

int is_url_visited(url_dedup_t *du, const char *url) {

if (bloom_check_str(du->bf, url)) {

// 可能存在，需要二次确认（如查数据库）

return 1;

}

// 一定未访问过

bloom_add_str(du->bf, url);

return 0;

}

```

场景3：黑名单过滤

```c

// 垃圾邮件过滤

bloom_filter_t *spam_bf;

bloom_filter_t *ham_bf; // 误判补救用

int is_spam(const char *email) {

if (bloom_check_str(spam_bf, email)) {

// 可能是垃圾邮件，需要进一步检查

if (!bloom_check_str(ham_bf, email)) {

return 1; // 确认为垃圾邮件

}

return 0;

}

```

五、布隆过滤器的变种

变种特点适用场景

计数布隆过滤器支持删除动态集合

可扩展布隆过滤器动态扩容元素数量未知

布谷鸟过滤器支持删除，空间更优需要删除的场景

分区布隆过滤器可并行查询大规模分布式

计数布隆过滤器（支持删除）

```c

typedef struct {

uint8_t *counters; // 计数器数组

uint64_t size;

int hash_count;

} counting_bloom_t;

int counting_bloom_add(counting_bloom_t *cbf, const void *data, int len) {

uint64_t positions $cbf-\>hash_count$ ;

bloom_get_positions(cbf->size, data, len, cbf->hash_count, positions);

for (int i = 0; i < cbf->hash_count; i++) {

cbf->counters $positions\[i$ ]++;

}

return 0;

}

int counting_bloom_remove(counting_bloom_t *cbf, const void *data, int len) {

uint64_t positions $cbf-\>hash_count$ ;

bloom_get_positions(cbf->size, data, len, cbf->hash_count, positions);

for (int i = 0; i < cbf->hash_count; i++) {

if (cbf->counters $positions\[i$ ] > 0) {

cbf->counters $positions\[i$ ]--;

}

return 0;

}

```

六、常见问题和调优

误判率太高怎么办？

· 增大布隆过滤器的尺寸（增加m）

· 增加哈希函数个数（增加k）

· 或者降低预期元素数量

元素数量远超预期怎么办？

· 使用可扩展布隆过滤器

· 或者重建一个更大的，把旧数据迁移过去

哈希函数怎么选？

推荐组合：

· MurmurHash3（速度快，分布均匀）

· CityHash（Google出品）

· xxHash（极快）

参数计算公式

```c

// 根据预期元素和误判率计算最优参数

void bloom_optimal_params(uint64_t n, double p,

uint64_t *m, int *k) {

double ln2 = 0.6931471805599453;

*m = (uint64_t)(-n * log(p) / (ln2 * ln2));

*k = (int)((*m / n) * ln2);

if (*k < 1) *k = 1;

if (*k > 20) *k = 20;

}

```

七、总结

通过这篇文章，你学会了：

· 布隆过滤器的核心原理（比特数组 + 多哈希函数）

· 完整的工业级实现（支持持久化、合并、统计）

· 内存和性能的权衡（1%误判率下，每元素约10bit）

· 缓存穿透防护、URL去重等实战应用

布隆过滤器是大数据场景下的必备工具。掌握它，你就掌握了一种用极小内存换取极大效率的思维方式。

下一篇预告：《一致性哈希：分布式缓存的核心算法》

评论区分享一下你想用布隆过滤器解决什么问题～