前言
你有没有想过:你在Google搜索框里输入几个字,为什么能在毫秒内从几百亿网页中找到你要的结果?
搜索引擎的核心是倒排索引——一种把关键词映射到包含该词的文档列表的索引结构(与"文档 → 词"的正排索引方向相反)。
今天我们用C语言从零实现一个分布式搜索引擎:
· 倒排索引(Inverted Index)
· 分词(Tokenizer)
· 布尔检索(Boolean Search)
· 相关性排序(TF-IDF)
· 分布式索引(Sharding)
· 分布式检索(Scatter-Gather)
一、搜索引擎核心原理
- 倒排索引结构
```
正排索引(文档 → 词):
文档1: "谷歌搜索" → 谷歌, 搜索
文档2: "搜索引擎" → 搜索, 引擎
文档3: "谷歌地图" → 谷歌, 地图
倒排索引(词 → 文档):
谷歌 → 文档1, 文档3
搜索 → 文档1, 文档2
引擎 → 文档2
地图 → 文档3
```
- 检索流程
```
查询 → 分词 → 倒排索引查找 → 文档交集 → 排序 → 结果
```
- 分布式架构
```
┌─────────────────────────────────────────────────────────────┐
│ Query Processor │
│ (查询解析 + 结果聚合) │
└─────────────────────────────────────────────────────────────┘
│ │ │
▼ ▼ ▼
┌─────────────────────────────────────────────────────────────┐
│ Index Shards │
│ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ │
│ │ Shard 1 │ │ Shard 2 │ │ Shard 3 │ │
│ │ 倒排索引 │ │ 倒排索引 │ │ 倒排索引 │ │
│ │ (词A-G) │ │ (词H-N) │ │ (词O-Z) │ │
│ └──────────────┘ └──────────────┘ └──────────────┘ │
└─────────────────────────────────────────────────────────────┘
```
二、完整代码实现
- 基础数据结构
```c
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <pthread.h>
#include <time.h>
#include <errno.h>
#include <math.h>
#include <ctype.h>
/* Compile-time limits shared by the index structures below. */
#define MAX_TERM_LEN 128   /* size of a term buffer, terminating NUL included */
#define MAX_DOC_ID 32      /* size of a document-ID buffer, terminating NUL included */
#define MAX_FIELD_LEN 256  /* size of a title buffer; content gets 10x this */
#define MAX_SHARDS 10      /* modulus applied by route_shard() */

/* Posting: one document's occurrences of a single term. */
typedef struct posting {
    char doc_id[MAX_DOC_ID];
    int term_freq;          /* occurrences of the term in this document */
    float tf_idf;           /* TF-IDF weight */
    int positions[10];      /* token positions (for phrase queries) */
    int pos_count;          /* number of valid entries in positions */
    struct posting *next;
} posting_t;

/* Inverted-index entry: a term plus the list of postings for it. */
typedef struct inverted_entry {
    char term[MAX_TERM_LEN];
    posting_t *postings;
    int doc_count;          /* number of postings in the list */
    struct inverted_entry *next;
} inverted_entry_t;

/* Document metadata. */
typedef struct document {
    char doc_id[MAX_DOC_ID];
    char title[MAX_FIELD_LEN];
    char content[MAX_FIELD_LEN * 10];
    float length;           /* document length (for BM25) */
    struct document *next;
} document_t;

/* Index shard: an inverted index and document list guarded by one mutex. */
typedef struct index_shard {
    int shard_id;
    inverted_entry_t *inverted_index;
    document_t *documents;
    int doc_count;
    int term_count;
    pthread_mutex_t mutex;
} index_shard_t;

/* Search engine: an array of shard_count shards. */
typedef struct search_engine {
    index_shard_t *shards;
    int shard_count;
    int running;
    pthread_t search_thread;
} search_engine_t;

/* Query result: one node of a singly linked result list. */
typedef struct search_result {
    char doc_id[MAX_DOC_ID];
    float score;
    struct search_result *next;
} search_result_t;
```
- 分词器
```c
// 简单分词(按空格和标点分割,转小写)
int tokenize(const char *text, char tokens\[\]MAX_TERM_LEN, int max_tokens) {
int count = 0;
char buffer2048;
strcpy(buffer, text);
char *p = buffer;
while (*p && count < max_tokens) {
// 跳过非字母数字
while (*p && !isalnum(*p)) p++;
if (!*p) break;
char *start = p;
while (*p && isalnum(*p)) p++;
int len = p - start;
if (len > 0 && len < MAX_TERM_LEN) {
// 转小写
for (int i = 0; i < len; i++) {
tokenscounti = tolower(starti);
}
tokenscountlen = '\0';
count++;
}
}
return count;
}
/*
 * Stop-word filter.
 *
 * Returns 1 when word exactly matches (case-sensitively) one of the built-in
 * English stop words, and 0 otherwise, including when word is NULL.
 */
int is_stop_word(const char *word) {
    /* NULL-terminated so the loop below needs no separate element count. */
    static const char *const stop_words[] = {
        "the", "a", "an", "of", "to", "for", "on", "at",
        "in", "with", "by", "from", "up", "about", "into",
        "through", "during", "including", "without", "and",
        "or", "but", "so", "nor", "yet", "as", NULL
    };

    if (word == NULL) {
        return 0;
    }
    for (size_t i = 0; stop_words[i] != NULL; i++) {
        if (strcmp(word, stop_words[i]) == 0) {
            return 1;
        }
    }
    return 0;
}
```
- 倒排索引构建
```c
/*
 * Create an index shard.
 *
 * Allocates a shard with the given id, an empty inverted index, an empty
 * document list, zeroed counters and an initialized mutex.
 *
 * Returns the new shard, or NULL when the allocation or the mutex
 * initialization fails.
 */
index_shard_t *shard_create(int shard_id) {
    index_shard_t *shard = malloc(sizeof *shard);
    if (shard == NULL) {
        return NULL;
    }

    shard->shard_id = shard_id;
    shard->inverted_index = NULL;
    shard->documents = NULL;
    shard->doc_count = 0;
    shard->term_count = 0;

    if (pthread_mutex_init(&shard->mutex, NULL) != 0) {
        free(shard);
        return NULL;
    }
    return shard;
}
/*
 * Find the inverted-index entry for term, creating it if it does not exist.
 *
 * The term is first clipped to what an entry can store (MAX_TERM_LEN - 1
 * characters), and that clipped key is used both for the lookup and for the
 * new entry, so an over-long term cannot overflow the entry's buffer.
 * A new entry is pushed at the head of the shard's list and bumps
 * shard->term_count.
 *
 * This function does no locking itself.
 *
 * Returns the entry, or NULL when a new entry cannot be allocated.
 */
inverted_entry_t *find_or_create_term(index_shard_t *shard, const char *term) {
    char key[MAX_TERM_LEN];
    snprintf(key, sizeof key, "%s", term);

    for (inverted_entry_t *entry = shard->inverted_index; entry != NULL;
         entry = entry->next) {
        if (strcmp(entry->term, key) == 0) {
            return entry;
        }
    }

    /* Not found: create a new entry. */
    inverted_entry_t *entry = malloc(sizeof *entry);
    if (entry == NULL) {
        return NULL;
    }
    memcpy(entry->term, key, sizeof entry->term);
    entry->postings = NULL;
    entry->doc_count = 0;
    entry->next = shard->inverted_index;
    shard->inverted_index = entry;
    shard->term_count++;
    return entry;
}
/*
 * Add a document to a shard's index.
 *
 * Stores a copy of doc_id, title and content (each clipped to the size of its
 * field in document_t), tokenizes the stored content (at most 1000 tokens),
 * and for every non-stop-word token updates or creates the posting that links
 * the term to this document. doc->length ends up as the number of indexed
 * tokens.
 *
 * The shard mutex is held while the shard's lists are modified.
 *
 * Returns 0 on success and -1 when memory runs out. If that happens partway
 * through, the document stays registered with whatever terms were already
 * indexed.
 */
int index_document(index_shard_t *shard, const char *doc_id,
                   const char *title, const char *content) {
    enum { MAX_DOC_TOKENS = 1000 };
    int status = 0;

    /* The token table is about 125 KiB, so keep it off the thread stack. */
    char (*tokens)[MAX_TERM_LEN] = malloc(MAX_DOC_TOKENS * sizeof *tokens);
    document_t *doc = malloc(sizeof *doc);
    if (tokens == NULL || doc == NULL) {
        free(tokens);
        free(doc);
        return -1;
    }

    /* Bounded copies: over-long input is clipped instead of overflowing. */
    snprintf(doc->doc_id, sizeof doc->doc_id, "%s", doc_id);
    snprintf(doc->title, sizeof doc->title, "%s", title);
    snprintf(doc->content, sizeof doc->content, "%s", content);
    doc->length = 0;

    /* Tokenize the stored text so the index matches what was kept. */
    int token_count = tokenize(doc->content, tokens, MAX_DOC_TOKENS);

    pthread_mutex_lock(&shard->mutex);

    /* Register the document. */
    doc->next = shard->documents;
    shard->documents = doc;
    shard->doc_count++;

    /* Count term frequencies. */
    for (int i = 0; i < token_count; i++) {
        if (is_stop_word(tokens[i])) {
            continue;
        }

        inverted_entry_t *entry = find_or_create_term(shard, tokens[i]);
        if (entry == NULL) {
            status = -1;
            break;
        }

        /* Look for an existing posting of this document under the term. */
        posting_t *p = entry->postings;
        while (p != NULL && strcmp(p->doc_id, doc->doc_id) != 0) {
            p = p->next;
        }

        if (p != NULL) {
            p->term_freq++;
            int capacity = (int)(sizeof p->positions / sizeof p->positions[0]);
            if (p->pos_count < capacity) {
                p->positions[p->pos_count++] = i;
            }
        } else {
            /* First occurrence in this document: create the posting. */
            p = malloc(sizeof *p);
            if (p == NULL) {
                status = -1;
                break;
            }
            snprintf(p->doc_id, sizeof p->doc_id, "%s", doc->doc_id);
            p->term_freq = 1;
            p->tf_idf = 0.0f;   /* was left uninitialized before */
            p->pos_count = 1;
            p->positions[0] = i;
            p->next = entry->postings;
            entry->postings = p;
            entry->doc_count++;
        }
        doc->length++;
    }

    pthread_mutex_unlock(&shard->mutex);
    free(tokens);

    if (status != 0) {
        return status;
    }
    printf("索引 文档 %s 已索引 (分词: %d)\n", doc_id, token_count);
    return 0;
}
```
- 检索与排序
```c
/*
 * Compute the TF-IDF weight of one posting.
 *
 *   tf  = ln(1 + term_freq)
 *   idf = ln(total_docs / (doc_count + 1))
 *
 * Because the denominator is doc_count + 1, the idf factor (and therefore the
 * result) is zero or negative whenever doc_count + 1 >= total_docs.
 */
float compute_tf_idf(inverted_entry_t *entry, posting_t *posting, int total_docs) {
    const float occurrences = (float)(1 + posting->term_freq);
    const float docs_with_term = (float)(entry->doc_count + 1);

    const float term_weight = logf(occurrences);
    const float rarity = logf((float)total_docs / docs_with_term);

    return term_weight * rarity;
}
// 布尔检索(AND查询)
search_result_t *boolean_and(index_shard_t *shard, char **terms, int term_count) {
if (term_count == 0) return NULL;
// 获取第一个词的结果
inverted_entry_t *entry = shard->inverted_index;
while (entry) {
if (strcmp(entry->term, terms0) == 0) break;
entry = entry->next;
}
if (!entry) return NULL;
// 收集文档ID
int doc_ids1000;
int doc_count = 0;
posting_t *p = entry->postings;
while (p) {
doc_idsdoc_count++ = atoi(p->doc_id);
p = p->next;
}
// 与其他词求交集
for (int i = 1; i < term_count; i++) {
// 获取当前词的文档列表
entry = shard->inverted_index;
while (entry) {
if (strcmp(entry->term, termsi) == 0) break;
entry = entry->next;
}
if (!entry) return NULL;
// 求交集
int new_ids1000;
int new_count = 0;
p = entry->postings;
while (p) {
int doc_id = atoi(p->doc_id);
for (int j = 0; j < doc_count; j++) {
if (doc_idsj == doc_id) {
new_idsnew_count++ = doc_id;
break;
}
}
p = p->next;
}
doc_count = new_count;
for (int j = 0; j < doc_count; j++) {
doc_idsj = new_idsj;
}
}
// 构建结果
search_result_t *results = NULL;
for (int i = 0; i < doc_count; i++) {
search_result_t *res = malloc(sizeof(search_result_t));
snprintf(res->doc_id, sizeof(res->doc_id), "%d", doc_idsi);
res->score = 1.0f; // 简单评分
res->next = results;
results = res;
}
return results;
}
/*
 * Gather results from every shard into one list.
 *
 * Runs boolean_and() on each shard while holding that shard's mutex, then
 * merges the shard's result nodes into the combined list: a node whose doc_id
 * is already present adds its score to the existing node and is freed;
 * otherwise the node itself is moved into the combined list. Nothing
 * returned by boolean_and() is leaked.
 *
 * Scores are whatever boolean_and() assigns; no TF-IDF weighting is applied
 * here. The returned list is unsorted and owned by the caller.
 */
search_result_t *rank_results(search_engine_t *engine, char **terms, int term_count) {
    search_result_t *all_results = NULL;

    for (int s = 0; s < engine->shard_count; s++) {
        index_shard_t *shard = &engine->shards[s];

        /* The returned nodes are private copies, so the lock is only needed for the query. */
        pthread_mutex_lock(&shard->mutex);
        search_result_t *hit = boolean_and(shard, terms, term_count);
        pthread_mutex_unlock(&shard->mutex);

        while (hit != NULL) {
            search_result_t *next = hit->next;

            search_result_t *existing = all_results;
            while (existing != NULL && strcmp(existing->doc_id, hit->doc_id) != 0) {
                existing = existing->next;
            }

            if (existing != NULL) {
                existing->score += hit->score;
                free(hit);
            } else {
                hit->next = all_results;
                all_results = hit;
            }
            hit = next;
        }
    }
    return all_results;
}
```
- 分布式搜索
```c
// 创建搜索引擎
search_engine_t *engine_create(int shard_count) {
search_engine_t *engine = malloc(sizeof(search_engine_t));
engine->shard_count = shard_count;
engine->shards = malloc(sizeof(index_shard_t) * shard_count);
engine->running = 1;
for (int i = 0; i < shard_count; i++) {
engine->shardsi = *shard_create(i);
}
printf("引擎 创建完成,分片数: %d\n", shard_count);
return engine;
}
/*
 * Shard routing by document-ID hash.
 *
 * Folds the bytes of doc_id into an unsigned base-31 polynomial hash and
 * returns that hash modulo MAX_SHARDS.
 */
int route_shard(const char *doc_id) {
    unsigned int hash = 0;

    for (const char *cursor = doc_id; *cursor != '\0'; cursor++) {
        hash = hash * 31 + *cursor;
    }
    return hash % MAX_SHARDS;
}
/*
 * Index a document, routing it to one of the engine's shards.
 *
 * route_shard() yields a value in [0, MAX_SHARDS), which can exceed the number
 * of shards this engine actually has, so the value is reduced modulo
 * engine->shard_count before it is used as an array index.
 *
 * Returns the result of index_document(), or -1 when engine is NULL or has no
 * shards.
 */
int engine_index_document(search_engine_t *engine, const char *doc_id,
                          const char *title, const char *content) {
    if (engine == NULL || engine->shard_count <= 0) {
        return -1;
    }

    int shard_id = route_shard(doc_id) % engine->shard_count;
    index_shard_t *shard = &engine->shards[shard_id];
    return index_document(shard, doc_id, title, content);
}
/*
 * Search all shards for documents matching query.
 *
 * Tokenizes the query (at most 100 tokens), drops stop words, runs
 * rank_results() with the remaining terms and sorts the returned list by
 * descending score.
 *
 * Returns the sorted result list, or NULL when the query has no usable terms
 * or nothing matched. The caller frees the list.
 */
search_result_t *engine_search(search_engine_t *engine, const char *query) {
    enum { MAX_QUERY_TOKENS = 100 };

    /* Tokenize. */
    char tokens[MAX_QUERY_TOKENS][MAX_TERM_LEN];
    int token_count = tokenize(query, tokens, MAX_QUERY_TOKENS);

    /*
     * Filter stop words. rank_results() takes a char **, i.e. an array of
     * pointers; a two-dimensional char array does not convert to that, so
     * collect pointers to the surviving tokens instead.
     */
    char *terms[MAX_QUERY_TOKENS];
    int filtered_count = 0;
    for (int i = 0; i < token_count; i++) {
        if (!is_stop_word(tokens[i])) {
            terms[filtered_count++] = tokens[i];
        }
    }
    if (filtered_count == 0) {
        return NULL;
    }

    printf("搜索 查询: %s, 关键词: %d\n", query, filtered_count);

    /* Run the distributed search. */
    search_result_t *results = rank_results(engine, terms, filtered_count);

    /* Bubble sort by descending score; node payloads are swapped, links stay put. */
    int swapped = (results != NULL);
    while (swapped) {
        swapped = 0;
        for (search_result_t *cur = results; cur->next != NULL; cur = cur->next) {
            search_result_t *nxt = cur->next;
            if (cur->score < nxt->score) {
                char tmp_id[MAX_DOC_ID];
                float tmp_score = cur->score;

                memcpy(tmp_id, cur->doc_id, sizeof tmp_id);
                memcpy(cur->doc_id, nxt->doc_id, sizeof cur->doc_id);
                memcpy(nxt->doc_id, tmp_id, sizeof nxt->doc_id);

                cur->score = nxt->score;
                nxt->score = tmp_score;
                swapped = 1;
            }
        }
    }
    return results;
}
```
- 测试代码
```c
void test_search_engine() {
printf("=== 分布式搜索引擎测试 ===\n\n");
// 创建搜索引擎
search_engine_t *engine = engine_create(3);
// 索引文档
printf("索引文档...\n");
engine_index_document(engine, "1", "Google Search",
"Google is a search engine. It helps people find information.");
engine_index_document(engine, "2", "Bing Search",
"Bing is a search engine by Microsoft.");
engine_index_document(engine, "3", "Search Engine",
"A search engine is a software system designed to search for information.");
engine_index_document(engine, "4", "Google Maps",
"Google Maps is a web mapping service.");
engine_index_document(engine, "5", "Machine Learning",
"Machine learning is a field of artificial intelligence.");
printf("\n");
// 搜索测试
const char *queries\[\] = {
"search engine",
"Google",
"machine learning",
"map"
};
for (int q = 0; q < 4; q++) {
printf("查询: \"%s\"\n", queriesq);
search_result_t *results = engine_search(engine, queriesq);
int rank = 1;
search_result_t *r = results;
while (r && rank <= 10) {
printf(" %d. 文档 %s (分数: %.4f)\n",
rank++, r->doc_id, r->score);
r = r->next;
}
printf("\n");
// 清理结果
while (results) {
search_result_t *next = results->next;
free(results);
results = next;
}
}
free(engine->shards);
free(engine);
}
/* Program entry point: run the search-engine demo and exit successfully. */
int main(void)
{
    test_search_engine();
    return EXIT_SUCCESS;
}
```
三、编译和运行
```bash
gcc -o search_engine search_engine.c -lpthread -lm
./search_engine
```
四、Elasticsearch vs 本实现
| 特性 | 本实现 | Elasticsearch |
| --- | --- | --- |
| 倒排索引 | ✅ | ✅ |
| 分布式 | ✅ | ✅ |
| 分词 | 基础 | 丰富 |
| 相关性排序 | TF-IDF | BM25 |
| 聚合查询 | ❌ | ✅ |
| 高亮 | ❌ | ✅ |
五、总结
通过这篇文章,你学会了:
· 搜索引擎的核心原理(倒排索引)
· 分词和停用词过滤
· 倒排索引构建
· 布尔检索(本文实现了 AND 查询;OR 查询可按同样思路扩展)
· TF-IDF相关性排序
· 分布式索引和搜索
搜索引擎是信息检索的核心技术。掌握它,你就理解了Elasticsearch、Lucene的底层设计。
下一篇预告:《从零实现一个推荐系统:协同过滤与矩阵分解》
评论区分享一下你对搜索引擎的理解~