从零实现一个分布式搜索引擎：倒排索引与检索

前言

你有没有想过：你在Google搜索框里输入几个字，为什么能在毫秒内从几百亿网页中找到你要的结果？

搜索引擎的核心是倒排索引——一种将关键词映射到包含该词的文档列表的索引结构（方向与“文档 → 词”的正排索引相反）。

今天我们用C语言从零实现一个分布式搜索引擎：

· 倒排索引（Inverted Index）

· 分词（Tokenizer）

· 布尔检索（Boolean Search）

· 相关性排序（TF-IDF）

· 分布式索引（Sharding）

· 分布式检索（Scatter-Gather）

一、搜索引擎核心原理

倒排索引结构

```

正排索引（文档 → 词）：

文档1: "谷歌搜索" → [谷歌, 搜索]

文档2: "搜索引擎" → [搜索, 引擎]

文档3: "谷歌地图" → [谷歌, 地图]

倒排索引（词 → 文档）：

谷歌 → 文档1, 文档3

搜索 → 文档1, 文档2

引擎 → 文档2

地图 → 文档3

```

检索流程

```

查询 → 分词 → 倒排索引查找 → 文档交集 → 排序 → 结果

```

分布式架构

```

┌─────────────────────────────────────────────────────────────┐

│ Query Processor │

│ (查询解析 + 结果聚合) │

└─────────────────────────────────────────────────────────────┘

│ │ │

▼ ▼ ▼

┌─────────────────────────────────────────────────────────────┐

│ Index Shards │

│ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ │

│ │ Shard 1 │ │ Shard 2 │ │ Shard 3 │ │

│ │ 倒排索引 │ │ 倒排索引 │ │ 倒排索引 │ │

│ │ (词A-G) │ │ (词H-N) │ │ (词O-Z) │ │

│ └──────────────┘ └──────────────┘ └──────────────┘ │

└─────────────────────────────────────────────────────────────┘

```

二、完整代码实现

基础数据结构

```c

#include <stdio.h>

#include <stdlib.h>

#include <string.h>

#include <unistd.h>

#include <pthread.h>

#include <time.h>

#include <errno.h>

#include <math.h>

#include <ctype.h>

/* Capacity limits shared by the whole engine. */
#define MAX_TERM_LEN 128   /* bytes per term, including the terminating NUL */
#define MAX_DOC_ID 32      /* bytes per document ID, including the NUL */
#define MAX_FIELD_LEN 256  /* bytes per title; content is 10x this size */
#define MAX_SHARDS 10      /* modulus used by route_shard() */
#define MAX_POSITIONS 10   /* token positions remembered per posting */

/* One posting: a single (term, document) pair in a term's posting list. */
typedef struct posting {
    char doc_id[MAX_DOC_ID];
    int term_freq;                 /* occurrences of the term in the document */
    float tf_idf;                  /* TF-IDF weight */
    int positions[MAX_POSITIONS];  /* token positions (for phrase queries) */
    int pos_count;                 /* number of valid entries in positions[] */
    struct posting *next;
} posting_t;

/* One inverted-index entry: a term plus the list of documents containing it. */
typedef struct inverted_entry {
    char term[MAX_TERM_LEN];
    posting_t *postings;           /* singly linked posting list */
    int doc_count;                 /* number of postings in the list */
    struct inverted_entry *next;
} inverted_entry_t;

/* Stored copy of an indexed document. */
typedef struct document {
    char doc_id[MAX_DOC_ID];
    char title[MAX_FIELD_LEN];
    char content[MAX_FIELD_LEN * 10];
    float length;                  /* document length (intended for BM25) */
    struct document *next;
} document_t;

/* One index shard: its own inverted index and document list behind a mutex. */
typedef struct index_shard {
    int shard_id;
    inverted_entry_t *inverted_index;
    document_t *documents;
    int doc_count;
    int term_count;
    pthread_mutex_t mutex;
} index_shard_t;

/* The engine: an array of shard_count shards. */
typedef struct search_engine {
    index_shard_t *shards;
    int shard_count;
    int running;
    pthread_t search_thread;
} search_engine_t;

/* One node of a result list: a document ID and its score. */
typedef struct search_result {
    char doc_id[MAX_DOC_ID];
    float score;
    struct search_result *next;
} search_result_t;

```

分词器

```c

// 简单分词（按空格和标点分割，转小写）

int tokenize(const char *text, char tokens\[\] $MAX_TERM_LEN$ , int max_tokens) {

int count = 0;

char buffer $2048$ ;

strcpy(buffer, text);

char *p = buffer;

while (*p && count < max_tokens) {

// 跳过非字母数字

while (*p && !isalnum(*p)) p++;

if (!*p) break;

char *start = p;

while (*p && isalnum(*p)) p++;

int len = p - start;

if (len > 0 && len < MAX_TERM_LEN) {

// 转小写

for (int i = 0; i < len; i++) {

tokens $count$ $i$ = tolower(start $i$ );

}

tokens $count$ $len$ = '\0';

count++;

}

return count;

}

/*
 * Stop-word filter.
 *
 * Returns 1 when word is exactly (case-sensitively) one of the built-in
 * English stop words, 0 otherwise. A NULL word is treated as "not a stop
 * word".
 */
int is_stop_word(const char *word) {
    static const char *const stop_words[] = {
        "the", "a", "an", "of", "to", "for", "on", "at",
        "in", "with", "by", "from", "up", "about", "into",
        "through", "during", "including", "without", "and",
        "or", "but", "so", "nor", "yet", "as"
    };

    if (word == NULL) {
        return 0;
    }

    for (size_t i = 0; i < sizeof stop_words / sizeof stop_words[0]; i++) {
        if (strcmp(word, stop_words[i]) == 0) {
            return 1;
        }
    }
    return 0;
}

```

倒排索引构建

```c

/*
 * Allocate and initialise an empty index shard.
 *
 * The shard starts with no terms and no documents, and its mutex is
 * initialised with default attributes.
 *
 * Returns the new shard, or NULL if memory allocation or mutex
 * initialisation fails. The caller owns the returned shard.
 */
index_shard_t *shard_create(int shard_id) {
    index_shard_t *shard = malloc(sizeof *shard);
    if (shard == NULL) {
        return NULL;
    }

    shard->shard_id = shard_id;
    shard->inverted_index = NULL;
    shard->documents = NULL;
    shard->doc_count = 0;
    shard->term_count = 0;

    if (pthread_mutex_init(&shard->mutex, NULL) != 0) {
        free(shard);
        return NULL;
    }
    return shard;
}

/*
 * Look up the inverted-index entry for term in shard, creating it if needed.
 *
 * A new entry starts with an empty posting list, is pushed onto the front of
 * the shard's entry list, and bumps shard->term_count.
 *
 * This function does not take shard->mutex itself; index_document() calls it
 * while holding the lock.
 *
 * Returns the entry, or NULL if an argument is NULL, the term does not fit
 * in MAX_TERM_LEN bytes (it is rejected rather than truncated, because a
 * truncated key could never be matched by the full term again), or memory
 * allocation fails.
 */
inverted_entry_t *find_or_create_term(index_shard_t *shard, const char *term) {
    if (shard == NULL || term == NULL) {
        return NULL;
    }

    for (inverted_entry_t *it = shard->inverted_index; it != NULL; it = it->next) {
        if (strcmp(it->term, term) == 0) {
            return it;
        }
    }

    size_t term_len = strlen(term);
    if (term_len >= MAX_TERM_LEN) {
        return NULL;
    }

    inverted_entry_t *entry = malloc(sizeof *entry);
    if (entry == NULL) {
        return NULL;
    }

    memcpy(entry->term, term, term_len + 1); /* includes the NUL */
    entry->postings = NULL;
    entry->doc_count = 0;
    entry->next = shard->inverted_index;
    shard->inverted_index = entry;
    shard->term_count++;

    return entry;
}

/*
 * Add one document to a shard: store a copy of it and index its content.
 *
 * The content is tokenized (at most 1000 tokens), stop words are dropped,
 * and for every remaining token the term's posting for this document is
 * created or updated: term_freq is incremented and the token position is
 * recorded until the posting's positions[] array is full. doc->length ends
 * up as the number of indexed (non-stop-word) tokens.
 *
 * The stored title and content are truncated to the size of their fields;
 * the full content is still tokenized. A doc_id that does not fit in
 * MAX_DOC_ID bytes is rejected, because a truncated ID could collide with
 * another document.
 *
 * Returns 0 on success, -1 on invalid arguments or allocation failure. If
 * an allocation fails part-way through, the document stays stored and the
 * tokens processed so far stay indexed.
 */
int index_document(index_shard_t *shard, const char *doc_id,
                   const char *title, const char *content) {
    enum { MAX_DOC_TOKENS = 1000 };

    if (shard == NULL || doc_id == NULL || title == NULL || content == NULL) {
        return -1;
    }
    if (strlen(doc_id) >= MAX_DOC_ID) {
        return -1;
    }

    /* MAX_DOC_TOKENS * MAX_TERM_LEN bytes is too large to keep on the stack. */
    char (*tokens)[MAX_TERM_LEN] = malloc(sizeof *tokens * MAX_DOC_TOKENS);
    document_t *doc = malloc(sizeof *doc);
    if (tokens == NULL || doc == NULL) {
        free(tokens);
        free(doc);
        return -1;
    }

    snprintf(doc->doc_id, sizeof doc->doc_id, "%s", doc_id);
    snprintf(doc->title, sizeof doc->title, "%s", title);
    snprintf(doc->content, sizeof doc->content, "%s", content);
    doc->length = 0;

    /* Tokenizing touches no shared state, so do it before taking the lock. */
    int token_count = tokenize(content, tokens, MAX_DOC_TOKENS);
    int rc = 0;

    pthread_mutex_lock(&shard->mutex);

    /* Store the document. */
    doc->next = shard->documents;
    shard->documents = doc;
    shard->doc_count++;

    for (int i = 0; i < token_count; i++) {
        if (is_stop_word(tokens[i])) {
            continue;
        }

        inverted_entry_t *entry = find_or_create_term(shard, tokens[i]);
        if (entry == NULL) {
            rc = -1;
            break;
        }

        /* Find this document's posting for the term, if it already exists. */
        posting_t *p = entry->postings;
        while (p != NULL && strcmp(p->doc_id, doc->doc_id) != 0) {
            p = p->next;
        }

        if (p != NULL) {
            p->term_freq++;
            if (p->pos_count < (int)(sizeof p->positions / sizeof p->positions[0])) {
                p->positions[p->pos_count++] = i;
            }
        } else {
            p = malloc(sizeof *p);
            if (p == NULL) {
                rc = -1;
                break;
            }
            snprintf(p->doc_id, sizeof p->doc_id, "%s", doc->doc_id);
            p->term_freq = 1;
            p->tf_idf = 0.0f; /* not computed at index time */
            p->positions[0] = i;
            p->pos_count = 1;
            p->next = entry->postings;
            entry->postings = p;
            entry->doc_count++;
        }

        doc->length++;
    }

    pthread_mutex_unlock(&shard->mutex);
    free(tokens);

    if (rc == 0) {
        printf("[索引] 文档 %s 已索引 (分词: %d)\n", doc_id, token_count);
    }
    return rc;
}

```

检索与排序

```c

/*
 * TF-IDF weight of one posting.
 *
 *   TF  = ln(1 + term_freq)
 *   IDF = ln((1 + N) / (1 + df)) + 1      (smoothed)
 *
 * where N is total_docs and df is entry->doc_count, clamped to [0, N].
 * The smoothed IDF is always >= 1, so a term that occurs in every document
 * still contributes a positive weight instead of a negative one, and an
 * empty corpus cannot produce ln(0).
 *
 * Returns 0 when an argument is NULL, total_docs is not positive, or the
 * posting's term_freq is not positive.
 */
float compute_tf_idf(inverted_entry_t *entry, posting_t *posting, int total_docs) {
    if (entry == NULL || posting == NULL || total_docs <= 0 ||
        posting->term_freq <= 0) {
        return 0.0f;
    }

    int df = entry->doc_count;
    if (df < 0) {
        df = 0;
    } else if (df > total_docs) {
        df = total_docs;
    }

    float tf = logf(1.0f + (float)posting->term_freq);
    float idf = logf((1.0f + (float)total_docs) / (1.0f + (float)df)) + 1.0f;

    return tf * idf;
}

// 布尔检索（AND查询）

search_result_t *boolean_and(index_shard_t *shard, char **terms, int term_count) {

if (term_count == 0) return NULL;

// 获取第一个词的结果

inverted_entry_t *entry = shard->inverted_index;

while (entry) {

if (strcmp(entry->term, terms $0$ ) == 0) break;

entry = entry->next;

}

if (!entry) return NULL;

// 收集文档ID

int doc_ids $1000$ ;

int doc_count = 0;

posting_t *p = entry->postings;

while (p) {

doc_ids $doc_count++$ = atoi(p->doc_id);

p = p->next;

}

// 与其他词求交集

for (int i = 1; i < term_count; i++) {

// 获取当前词的文档列表

entry = shard->inverted_index;

while (entry) {

if (strcmp(entry->term, terms $i$ ) == 0) break;

entry = entry->next;

}

if (!entry) return NULL;

// 求交集

int new_ids $1000$ ;

int new_count = 0;

p = entry->postings;

while (p) {

int doc_id = atoi(p->doc_id);

for (int j = 0; j < doc_count; j++) {

if (doc_ids $j$ == doc_id) {

new_ids $new_count++$ = doc_id;

break;

}

p = p->next;

}

doc_count = new_count;

for (int j = 0; j < doc_count; j++) {

doc_ids $j$ = new_ids $j$ ;

}

// 构建结果

search_result_t *results = NULL;

for (int i = 0; i < doc_count; i++) {

search_result_t *res = malloc(sizeof(search_result_t));

snprintf(res->doc_id, sizeof(res->doc_id), "%d", doc_ids $i$ );

res->score = 1.0f; // 简单评分

res->next = results;

results = res;

}

return results;

}

/*
 * Run the query on every shard and merge the per-shard results.
 *
 * Each shard is queried with boolean_and() while its mutex is held. The
 * returned nodes are then moved into the combined list: a node whose doc_id
 * is already present adds its score to the existing node and is freed, any
 * other node is pushed onto the front of the combined list. No per-shard
 * node is leaked or copied.
 *
 * Returns the combined, unsorted list (NULL if nothing matched). The caller
 * owns the list and must free each node.
 */
search_result_t *rank_results(search_engine_t *engine, char **terms, int term_count) {
    if (engine == NULL || engine->shards == NULL) {
        return NULL;
    }

    search_result_t *all_results = NULL;

    for (int s = 0; s < engine->shard_count; s++) {
        index_shard_t *shard = &engine->shards[s];

        pthread_mutex_lock(&shard->mutex);
        /* Simplified scoring: boolean AND with a flat per-match score. */
        search_result_t *results = boolean_and(shard, terms, term_count);
        pthread_mutex_unlock(&shard->mutex);

        /* The result nodes are private heap copies, so merging needs no lock. */
        while (results != NULL) {
            search_result_t *node = results;
            results = results->next;

            search_result_t *existing = all_results;
            while (existing != NULL &&
                   strcmp(existing->doc_id, node->doc_id) != 0) {
                existing = existing->next;
            }

            if (existing != NULL) {
                existing->score += node->score;
                free(node);
            } else {
                node->next = all_results;
                all_results = node;
            }
        }
    }

    return all_results;
}

```

分布式搜索

```c

// 创建搜索引擎

search_engine_t *engine_create(int shard_count) {

search_engine_t *engine = malloc(sizeof(search_engine_t));

engine->shard_count = shard_count;

engine->shards = malloc(sizeof(index_shard_t) * shard_count);

engine->running = 1;

for (int i = 0; i < shard_count; i++) {

engine->shards $i$ = *shard_create(i);

}

printf(" $引擎$ 创建完成，分片数: %d\n", shard_count);

return engine;

}

/*
 * Map a document ID to a shard slot.
 *
 * Folds the bytes of doc_id into a multiply-by-31 hash and reduces it
 * modulo MAX_SHARDS, so the result is always in [0, MAX_SHARDS). The
 * function has no knowledge of how many shards an engine actually has;
 * callers must bring the value into their own range.
 */
int route_shard(const char *doc_id) {
    unsigned int hash = 0;

    for (const char *cursor = doc_id; *cursor != '\0'; ++cursor) {
        hash = hash * 31 + *cursor;
    }

    return hash % MAX_SHARDS;
}

/*
 * Index a document, routing it to one of the engine's shards.
 *
 * route_shard() returns a slot in [0, MAX_SHARDS), which can exceed the
 * number of shards the engine was created with. The slot is therefore
 * reduced modulo engine->shard_count before it is used as an array index.
 * (When shard_count does not divide MAX_SHARDS the lower shards receive
 * slightly more documents.)
 *
 * Returns the result of index_document(), or -1 if the engine has no
 * shards or doc_id is NULL.
 */
int engine_index_document(search_engine_t *engine, const char *doc_id,
                          const char *title, const char *content) {
    if (engine == NULL || engine->shards == NULL ||
        engine->shard_count <= 0 || doc_id == NULL) {
        return -1;
    }

    int shard_id = route_shard(doc_id) % engine->shard_count;
    index_shard_t *shard = &engine->shards[shard_id];

    return index_document(shard, doc_id, title, content);
}

/*
 * Run a free-text query against the whole engine.
 *
 * The query is tokenized (at most 100 tokens), stop words are removed, and
 * the remaining terms are passed to rank_results(). The merged result list
 * is then sorted by descending score.
 *
 * Returns the sorted result list, or NULL if the arguments are NULL, the
 * query contains no searchable term, or nothing matches. The caller owns
 * the list and must free each node.
 */
search_result_t *engine_search(search_engine_t *engine, const char *query) {
    enum { MAX_QUERY_TERMS = 100 };

    if (engine == NULL || query == NULL) {
        return NULL;
    }

    char tokens[MAX_QUERY_TERMS][MAX_TERM_LEN];
    int token_count = tokenize(query, tokens, MAX_QUERY_TERMS);

    /*
     * rank_results() takes a char ** (array of pointers). A 2-D char array
     * does not convert to that type, so collect pointers to the rows that
     * survive stop-word filtering.
     */
    char *terms[MAX_QUERY_TERMS];
    int term_count = 0;
    for (int i = 0; i < token_count; i++) {
        if (!is_stop_word(tokens[i])) {
            terms[term_count++] = tokens[i];
        }
    }

    if (term_count == 0) {
        return NULL;
    }

    printf("[搜索] 查询: %s, 关键词: %d\n", query, term_count);

    search_result_t *results = rank_results(engine, terms, term_count);

    /* Bubble sort by descending score; payloads are swapped, links stay put. */
    int swapped = (results != NULL);
    while (swapped) {
        swapped = 0;
        for (search_result_t *cur = results; cur->next != NULL; cur = cur->next) {
            search_result_t *nxt = cur->next;
            if (cur->score < nxt->score) {
                char tmp_id[MAX_DOC_ID];
                float tmp_score = cur->score;

                memcpy(tmp_id, cur->doc_id, sizeof tmp_id);
                memcpy(cur->doc_id, nxt->doc_id, sizeof cur->doc_id);
                memcpy(nxt->doc_id, tmp_id, sizeof nxt->doc_id);

                cur->score = nxt->score;
                nxt->score = tmp_score;
                swapped = 1;
            }
        }
    }

    return results;
}

```

测试代码

```c

/* Free everything reachable from the engine: postings, terms, documents,
 * shard mutexes, the shard array and the engine itself. */
static void test_destroy_engine(search_engine_t *engine) {
    if (engine == NULL) {
        return;
    }

    for (int s = 0; s < engine->shard_count; s++) {
        index_shard_t *shard = &engine->shards[s];

        inverted_entry_t *entry = shard->inverted_index;
        while (entry != NULL) {
            posting_t *p = entry->postings;
            while (p != NULL) {
                posting_t *next_p = p->next;
                free(p);
                p = next_p;
            }
            inverted_entry_t *next_entry = entry->next;
            free(entry);
            entry = next_entry;
        }

        document_t *doc = shard->documents;
        while (doc != NULL) {
            document_t *next_doc = doc->next;
            free(doc);
            doc = next_doc;
        }

        pthread_mutex_destroy(&shard->mutex);
    }

    free(engine->shards);
    free(engine);
}

/*
 * End-to-end demo: build a 3-shard engine, index five documents, run four
 * queries, print up to ten results for each, and release all memory.
 *
 * The engine is destroyed once, after the query loop has finished.
 */
void test_search_engine(void) {
    printf("=== 分布式搜索引擎测试 ===\n\n");

    /* Create the engine. */
    search_engine_t *engine = engine_create(3);
    if (engine == NULL) {
        fprintf(stderr, "引擎创建失败\n");
        return;
    }

    /* Index the documents. */
    printf("索引文档...\n");

    engine_index_document(engine, "1", "Google Search",
        "Google is a search engine. It helps people find information.");
    engine_index_document(engine, "2", "Bing Search",
        "Bing is a search engine by Microsoft.");
    engine_index_document(engine, "3", "Search Engine",
        "A search engine is a software system designed to search for information.");
    engine_index_document(engine, "4", "Google Maps",
        "Google Maps is a web mapping service.");
    engine_index_document(engine, "5", "Machine Learning",
        "Machine learning is a field of artificial intelligence.");

    printf("\n");

    /* Run the queries. */
    const char *queries[] = {
        "search engine",
        "Google",
        "machine learning",
        "map"
    };

    for (size_t q = 0; q < sizeof queries / sizeof queries[0]; q++) {
        printf("查询: \"%s\"\n", queries[q]);

        search_result_t *results = engine_search(engine, queries[q]);

        int rank = 1;
        for (search_result_t *r = results; r != NULL && rank <= 10; r = r->next) {
            printf(" %d. 文档 %s (分数: %.4f)\n", rank++, r->doc_id, r->score);
        }
        printf("\n");

        /* Free this query's results. */
        while (results != NULL) {
            search_result_t *next = results->next;
            free(results);
            results = next;
        }
    }

    test_destroy_engine(engine);
}

/* Program entry point: run the search-engine demo and report success. */
int main(void) {
    test_search_engine();
    return EXIT_SUCCESS;
}

```

三、编译和运行

```bash

gcc -o search_engine search_engine.c -lpthread -lm

./search_engine

```

四、Elasticsearch vs 本实现

特性 | 本实现 | Elasticsearch

倒排索引 | ✅ | ✅

分布式 | ✅ | ✅

分词 | 基础 | 丰富