从零实现一个分布式搜索引擎:倒排索引与检索

前言

你有没有想过:你在Google搜索框里输入几个字,为什么能在毫秒内从几百亿网页中找到你要的结果?

搜索引擎的核心是倒排索引——一种把关键词映射到包含该词的文档列表的索引结构。

今天我们用C语言从零实现一个分布式搜索引擎:

· 倒排索引(Inverted Index)

· 分词(Tokenizer)

· 布尔检索(Boolean Search)

· 相关性排序(TF-IDF)

· 分布式索引(Sharding)

· 分布式检索(Scatter-Gather)


一、搜索引擎核心原理

  1. 倒排索引结构

```

正排索引(文档 → 词):

文档1: "谷歌搜索" → 谷歌, 搜索

文档2: "搜索引擎" → 搜索, 引擎

文档3: "谷歌地图" → 谷歌, 地图

倒排索引(词 → 文档):

谷歌 → 文档1, 文档3

搜索 → 文档1, 文档2

引擎 → 文档2

地图 → 文档3

```

  2. 检索流程

```

查询 → 分词 → 倒排索引查找 → 文档交集 → 排序 → 结果

```

  3. 分布式架构

```

┌─────────────────────────────────────────────────────────────┐

│ Query Processor │

│ (查询解析 + 结果聚合) │

└─────────────────────────────────────────────────────────────┘

│ │ │

▼ ▼ ▼

┌─────────────────────────────────────────────────────────────┐

│ Index Shards │

│ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ │

│ │ Shard 1 │ │ Shard 2 │ │ Shard 3 │ │

│ │ 倒排索引 │ │ 倒排索引 │ │ 倒排索引 │ │

│ │ (词A-G) │ │ (词H-N) │ │ (词O-Z) │ │

│ └──────────────┘ └──────────────┘ └──────────────┘ │

└─────────────────────────────────────────────────────────────┘

```


二、完整代码实现

  1. 基础数据结构

```c

#include <stdio.h>

#include <stdlib.h>

#include <string.h>

#include <unistd.h>

#include <pthread.h>

#include <time.h>

#include <errno.h>

#include <math.h>

#include <ctype.h>

// Size in bytes of a term buffer (inverted_entry term field and tokenizer output rows).
#define MAX_TERM_LEN 128

// Size in bytes of a document-ID buffer.
#define MAX_DOC_ID 32

// Size in bytes of the document title buffer; the content buffer is ten times this.
#define MAX_FIELD_LEN 256

// Modulus used by route_shard() when hashing a document ID to a shard index.
#define MAX_SHARDS 10

// 倒排列表项

typedef struct posting {

char doc_idMAX_DOC_ID;

int term_freq; // 词频

float tf_idf; // TF-IDF权重

int positions10; // 位置(用于短语查询)

int pos_count;

struct posting *next;

} posting_t;

// 倒排索引项

typedef struct inverted_entry {

char termMAX_TERM_LEN;

posting_t *postings;

int doc_count;

struct inverted_entry *next;

} inverted_entry_t;

// 文档元数据

typedef struct document {

char doc_idMAX_DOC_ID;

char titleMAX_FIELD_LEN;

char contentMAX_FIELD_LEN \* 10;

float length; // 文档长度(用于BM25)

struct document *next;

} document_t;

// One index shard: an inverted index plus the documents that were indexed into it.

typedef struct index_shard {

int shard_id; // identifier assigned when the shard is created

inverted_entry_t *inverted_index; // head of the linked list of term entries

document_t *documents; // head of the linked list of stored documents

int doc_count; // incremented for every document added to this shard

int term_count; // incremented for every new term entry created in this shard

pthread_mutex_t mutex; // locked while the shard is being indexed into or searched

} index_shard_t;

// The search engine: an array of index shards plus runtime state.

typedef struct search_engine {

index_shard_t *shards; // array of shard_count shards, allocated by engine_create()

int shard_count; // number of elements in shards

int running; // set to 1 by engine_create(); not read anywhere in the code shown here

pthread_t search_thread; // not used by the code shown here

} search_engine_t;

// 查询结果

typedef struct search_result {

char doc_idMAX_DOC_ID;

float score;

struct search_result *next;

} search_result_t;

```

  2. 分词器

```c

// 简单分词(按空格和标点分割,转小写)

int tokenize(const char *text, char tokens\[\]MAX_TERM_LEN, int max_tokens) {

int count = 0;

char buffer2048;

strcpy(buffer, text);

char *p = buffer;

while (*p && count < max_tokens) {

// 跳过非字母数字

while (*p && !isalnum(*p)) p++;

if (!*p) break;

char *start = p;

while (*p && isalnum(*p)) p++;

int len = p - start;

if (len > 0 && len < MAX_TERM_LEN) {

// 转小写

for (int i = 0; i < len; i++) {

tokenscounti = tolower(starti);

}

tokenscountlen = '\0';

count++;

}

}

return count;

}

// Stop-word filter.
//
// Returns 1 when word exactly matches (case-sensitively) one of the built-in
// English stop words, 0 otherwise. A NULL word is never a stop word.
// Callers are expected to pass lower-cased tokens.
int is_stop_word(const char *word) {
    // NULL-terminated table so the loop needs no separate length.
    static const char *const stop_words[] = {
        "the", "a", "an", "of", "to", "for", "on", "at",
        "in", "with", "by", "from", "up", "about", "into",
        "through", "during", "including", "without", "and",
        "or", "but", "so", "nor", "yet", "as", NULL
    };

    if (word == NULL) {
        return 0;
    }

    for (size_t i = 0; stop_words[i] != NULL; i++) {
        if (strcmp(word, stop_words[i]) == 0) {
            return 1;
        }
    }
    return 0;
}

```

  3. 倒排索引构建

```c

// Allocates and initialises an empty index shard.
//
//   shard_id - identifier stored in the new shard
//
// Returns a heap-allocated shard owned by the caller, or NULL when the
// allocation or the mutex initialisation fails.
index_shard_t *shard_create(int shard_id) {
    index_shard_t *shard = malloc(sizeof *shard);
    if (shard == NULL) {
        return NULL;
    }

    shard->shard_id = shard_id;
    shard->inverted_index = NULL;
    shard->documents = NULL;
    shard->doc_count = 0;
    shard->term_count = 0;

    if (pthread_mutex_init(&shard->mutex, NULL) != 0) {
        free(shard);
        return NULL;
    }

    return shard;
}

// Looks up the inverted-index entry for term in shard, creating it if absent.
//
// A newly created entry starts with an empty posting list and is pushed at
// the head of the shard's term list; shard->term_count is incremented.
// The function does no locking itself.
//
// Returns the entry, or NULL when an argument is NULL, the term does not fit
// in MAX_TERM_LEN bytes (including the terminator), or allocation fails.
inverted_entry_t *find_or_create_term(index_shard_t *shard, const char *term) {
    if (shard == NULL || term == NULL) {
        return NULL;
    }

    for (inverted_entry_t *it = shard->inverted_index; it != NULL; it = it->next) {
        if (strcmp(it->term, term) == 0) {
            return it;
        }
    }

    // Reject rather than truncate: a truncated key would never match a later
    // lookup of the full term and would create duplicate entries.
    size_t term_len = strlen(term);
    if (term_len >= MAX_TERM_LEN) {
        return NULL;
    }

    inverted_entry_t *entry = malloc(sizeof *entry);
    if (entry == NULL) {
        return NULL;
    }

    memcpy(entry->term, term, term_len + 1);
    entry->postings = NULL;
    entry->doc_count = 0;
    entry->next = shard->inverted_index;
    shard->inverted_index = entry;
    shard->term_count++;

    return entry;
}

// Adds a document to a shard: stores its metadata and indexes its content.
//
//   shard   - target shard; its mutex is taken for the duration of the update
//   doc_id  - document ID; must fit in MAX_DOC_ID bytes including the terminator
//   title   - document title; truncated if longer than the title buffer
//   content - document text; the stored copy is truncated if longer than the
//             content buffer, but tokenisation runs on the full text
//
// At most 1000 tokens are indexed. Stop words are skipped and do not count
// towards the document length.
//
// Returns 0 on success, -1 on invalid arguments or allocation failure. When
// an allocation fails mid-way, the postings added so far remain in the index.
int index_document(index_shard_t *shard, const char *doc_id,
                   const char *title, const char *content) {
    enum { MAX_DOC_TOKENS = 1000 };

    if (shard == NULL || doc_id == NULL || title == NULL || content == NULL) {
        return -1;
    }
    // A truncated ID would never compare equal to the caller's ID afterwards.
    if (strlen(doc_id) >= MAX_DOC_ID) {
        return -1;
    }

    // The token table is ~128 KB; keep it off the stack.
    char (*tokens)[MAX_TERM_LEN] = malloc(MAX_DOC_TOKENS * sizeof *tokens);
    document_t *doc = malloc(sizeof *doc);
    if (tokens == NULL || doc == NULL) {
        free(tokens);
        free(doc);
        return -1;
    }

    // Bounded copies: snprintf always NUL-terminates and never overruns.
    snprintf(doc->doc_id, sizeof doc->doc_id, "%s", doc_id);
    snprintf(doc->title, sizeof doc->title, "%s", title);
    snprintf(doc->content, sizeof doc->content, "%s", content);
    doc->length = 0;

    int token_count = tokenize(content, tokens, MAX_DOC_TOKENS);
    int status = 0;

    pthread_mutex_lock(&shard->mutex);

    // Store the document.
    doc->next = shard->documents;
    shard->documents = doc;
    shard->doc_count++;

    // Update term frequencies and positions.
    for (int i = 0; i < token_count; i++) {
        if (is_stop_word(tokens[i])) {
            continue;
        }

        inverted_entry_t *entry = find_or_create_term(shard, tokens[i]);
        if (entry == NULL) {
            status = -1;
            break;
        }

        // Look for an existing posting of this document under the term.
        posting_t *p = entry->postings;
        while (p != NULL && strcmp(p->doc_id, doc_id) != 0) {
            p = p->next;
        }

        if (p != NULL) {
            int max_positions = (int)(sizeof p->positions / sizeof p->positions[0]);
            p->term_freq++;
            if (p->pos_count < max_positions) {
                p->positions[p->pos_count++] = i;
            }
        } else {
            p = malloc(sizeof *p);
            if (p == NULL) {
                status = -1;
                break;
            }
            snprintf(p->doc_id, sizeof p->doc_id, "%s", doc_id);
            p->term_freq = 1;
            p->tf_idf = 0.0f;
            p->positions[0] = i;
            p->pos_count = 1;
            p->next = entry->postings;
            entry->postings = p;
            entry->doc_count++;
        }

        doc->length++;
    }

    pthread_mutex_unlock(&shard->mutex);
    free(tokens);

    if (status != 0) {
        return status;
    }

    printf("索引 文档 %s 已索引 (分词: %d)\n", doc_id, token_count);
    return 0;
}

```

  4. 检索与排序

```c

// Computes the TF-IDF weight of one posting.
//
//   entry      - index entry of the term (supplies the document frequency)
//   posting    - posting of the term in one document (supplies the term frequency)
//   total_docs - number of documents in the collection
//
// Returns log(1 + term_freq) * log(total_docs / (doc_count + 1)).
float compute_tf_idf(inverted_entry_t *entry, posting_t *posting, int total_docs) {
    // Dampened term frequency.
    const float term_weight = logf(1 + posting->term_freq);

    // Inverse document frequency, with +1 smoothing in the denominator.
    const float doc_ratio = (float)total_docs / (entry->doc_count + 1);

    return term_weight * logf(doc_ratio);
}

// 布尔检索(AND查询)

search_result_t *boolean_and(index_shard_t *shard, char **terms, int term_count) {

if (term_count == 0) return NULL;

// 获取第一个词的结果

inverted_entry_t *entry = shard->inverted_index;

while (entry) {

if (strcmp(entry->term, terms0) == 0) break;

entry = entry->next;

}

if (!entry) return NULL;

// 收集文档ID

int doc_ids1000;

int doc_count = 0;

posting_t *p = entry->postings;

while (p) {

doc_idsdoc_count++ = atoi(p->doc_id);

p = p->next;

}

// 与其他词求交集

for (int i = 1; i < term_count; i++) {

// 获取当前词的文档列表

entry = shard->inverted_index;

while (entry) {

if (strcmp(entry->term, termsi) == 0) break;

entry = entry->next;

}

if (!entry) return NULL;

// 求交集

int new_ids1000;

int new_count = 0;

p = entry->postings;

while (p) {

int doc_id = atoi(p->doc_id);

for (int j = 0; j < doc_count; j++) {

if (doc_idsj == doc_id) {

new_idsnew_count++ = doc_id;

break;

}

}

p = p->next;

}

doc_count = new_count;

for (int j = 0; j < doc_count; j++) {

doc_idsj = new_idsj;

}

}

// 构建结果

search_result_t *results = NULL;

for (int i = 0; i < doc_count; i++) {

search_result_t *res = malloc(sizeof(search_result_t));

snprintf(res->doc_id, sizeof(res->doc_id), "%d", doc_idsi);

res->score = 1.0f; // 简单评分

res->next = results;

results = res;

}

return results;

}

// Scatter-gather search: runs the AND query on every shard and merges the
// per-shard hits into one list.
//
//   engine     - engine whose shards are searched
//   terms      - array of term_count NUL-terminated query terms
//   term_count - number of terms
//
// Scoring is simplified: each shard contributes the score assigned by
// boolean_and(), and scores of hits with the same document ID are summed.
// Each shard's mutex is held only while that shard is being queried.
//
// Returns a list of hits owned by the caller (free node by node), or NULL
// when there are no matches. The list is not sorted.
search_result_t *rank_results(search_engine_t *engine, char **terms, int term_count) {
    if (engine == NULL || engine->shards == NULL) {
        return NULL;
    }

    search_result_t *merged = NULL;

    for (int s = 0; s < engine->shard_count; s++) {
        index_shard_t *shard = &engine->shards[s];

        pthread_mutex_lock(&shard->mutex);
        search_result_t *hits = boolean_and(shard, terms, term_count);
        pthread_mutex_unlock(&shard->mutex);

        // Consume the per-shard list: every node is either spliced into the
        // merged list or freed, so nothing is leaked.
        while (hits != NULL) {
            search_result_t *hit = hits;
            hits = hits->next;

            search_result_t *known = merged;
            while (known != NULL && strcmp(known->doc_id, hit->doc_id) != 0) {
                known = known->next;
            }

            if (known != NULL) {
                known->score += hit->score;
                free(hit);
            } else {
                hit->next = merged;
                merged = hit;
            }
        }
    }

    return merged;
}

```

  5. 分布式搜索

```c

// 创建搜索引擎

search_engine_t *engine_create(int shard_count) {

search_engine_t *engine = malloc(sizeof(search_engine_t));

engine->shard_count = shard_count;

engine->shards = malloc(sizeof(index_shard_t) * shard_count);

engine->running = 1;

for (int i = 0; i < shard_count; i++) {

engine->shardsi = *shard_create(i);

}

printf("引擎 创建完成,分片数: %d\n", shard_count);

return engine;

}

// Shard routing: hashes a document ID to a shard slot.
//
// Uses a multiply-by-31 string hash. Bytes are read as unsigned char so the
// same ID hashes to the same slot regardless of whether plain char is signed
// on the platform.
//
// Returns a value in [0, MAX_SHARDS). A caller whose engine has fewer than
// MAX_SHARDS shards must reduce the result to its own shard count.
int route_shard(const char *doc_id) {
    unsigned int hash = 0;

    for (const unsigned char *p = (const unsigned char *)doc_id; *p != '\0'; p++) {
        hash = hash * 31u + *p;
    }

    return (int)(hash % MAX_SHARDS);
}

// Indexes a document, routing it to a shard by its document ID.
//
// route_shard() yields a slot in [0, MAX_SHARDS); it is reduced modulo the
// engine's actual shard count so the index can never fall outside
// engine->shards.
//
// Returns the result of index_document(), or -1 when engine or doc_id is
// NULL or the engine has no shards.
int engine_index_document(search_engine_t *engine, const char *doc_id,
                          const char *title, const char *content) {
    if (engine == NULL || engine->shards == NULL ||
        engine->shard_count <= 0 || doc_id == NULL) {
        return -1;
    }

    int shard_id = route_shard(doc_id) % engine->shard_count;

    return index_document(&engine->shards[shard_id], doc_id, title, content);
}

// Runs a query against the whole engine.
//
// The query is tokenised (at most 100 tokens), stop words are removed, the
// remaining keywords are searched on all shards, and the hits are sorted by
// descending score.
//
// Returns a list of hits owned by the caller (free node by node), or NULL
// when the arguments are NULL, the query has no keywords, or nothing matches.
search_result_t *engine_search(search_engine_t *engine, const char *query) {
    enum { MAX_QUERY_TOKENS = 100 };

    if (engine == NULL || query == NULL) {
        return NULL;
    }

    // Tokenise.
    char tokens[MAX_QUERY_TOKENS][MAX_TERM_LEN];
    int token_count = tokenize(query, tokens, MAX_QUERY_TOKENS);

    // Drop stop words. rank_results() takes a char ** (array of pointers),
    // which a 2-D char array is not, so collect pointers to the kept rows.
    char *keywords[MAX_QUERY_TOKENS];
    int keyword_count = 0;
    for (int i = 0; i < token_count; i++) {
        if (!is_stop_word(tokens[i])) {
            keywords[keyword_count++] = tokens[i];
        }
    }

    if (keyword_count == 0) {
        return NULL;
    }

    printf("搜索 查询: %s, 关键词: %d\n", query, keyword_count);

    // Scatter-gather over all shards.
    search_result_t *results = rank_results(engine, keywords, keyword_count);

    // Bubble sort by descending score; node payloads are swapped so the
    // links stay untouched.
    int swapped;
    do {
        swapped = 0;
        for (search_result_t *cur = results; cur != NULL && cur->next != NULL;
             cur = cur->next) {
            search_result_t *nxt = cur->next;
            if (cur->score < nxt->score) {
                char tmp_id[MAX_DOC_ID];
                float tmp_score = cur->score;

                memcpy(tmp_id, cur->doc_id, sizeof tmp_id);
                memcpy(cur->doc_id, nxt->doc_id, sizeof cur->doc_id);
                memcpy(nxt->doc_id, tmp_id, sizeof nxt->doc_id);

                cur->score = nxt->score;
                nxt->score = tmp_score;

                swapped = 1;
            }
        }
    } while (swapped);

    return results;
}

```

  6. 测试代码

```c

void test_search_engine() {

printf("=== 分布式搜索引擎测试 ===\n\n");

// 创建搜索引擎

search_engine_t *engine = engine_create(3);

// 索引文档

printf("索引文档...\n");

engine_index_document(engine, "1", "Google Search",

"Google is a search engine. It helps people find information.");

engine_index_document(engine, "2", "Bing Search",

"Bing is a search engine by Microsoft.");

engine_index_document(engine, "3", "Search Engine",

"A search engine is a software system designed to search for information.");

engine_index_document(engine, "4", "Google Maps",

"Google Maps is a web mapping service.");

engine_index_document(engine, "5", "Machine Learning",

"Machine learning is a field of artificial intelligence.");

printf("\n");

// 搜索测试

const char *queries\[\] = {

"search engine",

"Google",

"machine learning",

"map"

};

for (int q = 0; q < 4; q++) {

printf("查询: \"%s\"\n", queriesq);

search_result_t *results = engine_search(engine, queriesq);

int rank = 1;

search_result_t *r = results;

while (r && rank <= 10) {

printf(" %d. 文档 %s (分数: %.4f)\n",

rank++, r->doc_id, r->score);

r = r->next;

}

printf("\n");

// 清理结果

while (results) {

search_result_t *next = results->next;

free(results);

results = next;

}

}

free(engine->shards);

free(engine);

}

// Program entry point: runs the search-engine demo and reports success.
int main(void) {
    test_search_engine();
    return 0;
}

```


三、编译和运行

```bash

gcc -o search_engine search_engine.c -lpthread -lm

./search_engine

```


四、Elasticsearch vs 本实现

| 特性 | 本实现 | Elasticsearch |
| --- | --- | --- |
| 倒排索引 | ✅ | ✅ |
| 分布式 | ✅ | ✅ |
| 分词 | 基础 | 丰富 |
| 相关性排序 | TF-IDF | BM25 |
| 聚合查询 | ❌ | ✅ |
| 高亮 | ❌ | ✅ |


五、总结

通过这篇文章,你学会了:

· 搜索引擎的核心原理(倒排索引)

· 分词和停用词过滤

· 倒排索引构建

· 布尔检索(AND)

· TF-IDF相关性排序

· 分布式索引和搜索

搜索引擎是信息检索的核心技术。掌握它,你就理解了Elasticsearch、Lucene的底层设计。

下一篇预告:《从零实现一个推荐系统:协同过滤与矩阵分解》


评论区分享一下你对搜索引擎的理解~