从零实现一个Raft共识算法:选举与日志复制

前言

上一篇文章我们实现了Paxos,但Paxos确实难理解。Raft是Paxos的"简化版",专为工程实现设计,分为三个子问题:

  1. Leader选举:选出一个领导者

  2. 日志复制:Leader接收日志,复制到Follower

  3. 安全性:保证日志一致性

今天我们用C语言从零实现Raft核心算法:

· 节点状态切换(Follower → Candidate → Leader)

· Leader选举(心跳 + 随机超时)

· 日志条目(索引 + 任期 + 命令)

· 日志复制(AppendEntries RPC)

· 持久化存储


一、Raft核心原理

  1. 节点状态机

```
                       超时,发起选举
  ┌──────────┐ ──────────────────────────▶ ┌───────────┐ ──┐
  │ Follower │                             │ Candidate │   │ 选举超时(重新发起选举)
  └──────────┘ ◀────────────────────────── └───────────┘ ◀─┘
    ▲      ▲          发现更高任期               │
    │      │                                     │ 收到多数派投票
    │      │                                     ▼
    │      │          发现更高任期          ┌───────────┐
    │      └─────────────────────────────── │  Leader   │
    │                                       └───────────┘
    └── 心跳(来自 Leader,重置选举超时)
```

  2. 核心概念

| 概念 | 说明 |
| --- | --- |
| Term | 任期,单调递增 |
| Log | 日志条目序列,存储命令 |
| Commit Index | 已提交的日志索引 |
| AppendEntries | Leader向Follower同步日志 |
| RequestVote | Candidate请求投票 |


二、完整代码实现

  1. 基础数据结构

```c

#include <stdio.h>

#include <stdlib.h>

#include <string.h>

#include <unistd.h>

#include <pthread.h>

#include <time.h>

#include <errno.h>

#include <signal.h>

// Capacity limits: cluster size, per-node log length, command text length.
#define MAX_NODES 10

#define MAX_LOG_ENTRIES 1000

#define MAX_CMD_LEN 256

// Role a node currently plays.
typedef enum {
    STATE_FOLLOWER = 0,
    STATE_CANDIDATE = 1,
    STATE_LEADER = 2
} node_state_t;

// One log entry: the term it was created in, its position, and the command.
typedef struct log_entry {
    int term;
    int index;
    char command[MAX_CMD_LEN];  // NUL-terminated command text
} log_entry_t;

// Raft节点

typedef struct raft_node {

int node_id;

int current_term;

int voted_for;

node_state_t state;

// 日志

log_entry_t logMAX_LOG_ENTRIES;

int log_count;

int commit_index;

int last_applied;

// Leader状态

int next_indexMAX_NODES;

int match_indexMAX_NODES;

// 集群

struct raft_node **peers;

int peer_count;

// 定时器

time_t election_timeout;

time_t last_heartbeat;

int heartbeat_interval;

// 同步

pthread_mutex_t mutex;

pthread_t rpc_thread;

pthread_t election_thread;

int running;

char log_file256;

} raft_node_t;

// RPC messages

// Request of the AppendEntries RPC.
typedef struct append_entries_args {
    int term;              // sender's term
    int leader_id;
    int prev_log_index;    // log position just before `entries`
    int prev_log_term;     // term the receiver must hold at prev_log_index
    log_entry_t *entries;  // entry_count entries to append
    int entry_count;
    int leader_commit;     // sender's commit index
} append_entries_args_t;

// Response of the AppendEntries RPC.
typedef struct append_entries_reply {
    int term;         // receiver's term
    int success;      // 1 when the entries were accepted, 0 otherwise
    int match_index;  // receiver's log length after a successful append
} append_entries_reply_t;

// Request of the RequestVote RPC.
typedef struct request_vote_args {
    int term;            // candidate's term
    int candidate_id;
    int last_log_index;  // position of the candidate's last log entry
    int last_log_term;   // term of the candidate's last log entry
} request_vote_args_t;

// Response of the RequestVote RPC.
typedef struct request_vote_reply {
    int term;          // voter's term
    int vote_granted;  // non-zero when the vote was given
} request_vote_reply_t;

```

  2. 持久化

```c

// Persist the node's current term and vote to "<log_file>_raft.dat".
//
// The file holds two decimal integers, one per line: current_term, then
// voted_for. The function stays best-effort (it returns void), but failures
// are reported on stderr instead of being dropped silently.
void raft_persist(raft_node_t *node) {
    char filename[256];
    snprintf(filename, sizeof(filename), "%s_raft.dat", node->log_file);

    FILE *fp = fopen(filename, "w");
    if (!fp) {
        perror(filename);
        return;
    }

    int write_failed =
        fprintf(fp, "%d\n%d\n", node->current_term, node->voted_for) < 0;

    // fclose() flushes the stream, so its result decides whether the data
    // actually reached the file.
    if (fclose(fp) != 0 || write_failed) {
        fprintf(stderr, "raft_persist: failed to write %s\n", filename);
    }
}

// Load the persisted term and vote from "<log_file>_raft.dat".
//
// node->current_term and node->voted_for are overwritten only when both
// values parse; a missing or malformed file leaves the node unchanged.
void raft_load(raft_node_t *node) {
    char filename[256];
    snprintf(filename, sizeof(filename), "%s_raft.dat", node->log_file);

    FILE *fp = fopen(filename, "r");
    if (!fp) return;  // nothing persisted yet

    // Parse into temporaries so a truncated file cannot leave the node with
    // a new term but a stale vote.
    int term = 0;
    int vote = -1;
    if (fscanf(fp, "%d\n%d\n", &term, &vote) == 2) {
        node->current_term = term;
        node->voted_for = vote;
    }

    fclose(fp);
}

// Persist the whole log to "<log_file>_log.dat" (simplified).
//
// The file is rewritten from scratch on every call, one "term|index|command"
// line per entry. Failures are reported on stderr; the function returns
// normally either way.
// NOTE(review): a command containing '|' or a newline is written unescaped.
void raft_persist_log(raft_node_t *node) {
    char filename[256];
    snprintf(filename, sizeof(filename), "%s_log.dat", node->log_file);

    FILE *fp = fopen(filename, "w");
    if (!fp) {
        perror(filename);
        return;
    }

    int write_failed = 0;
    for (int i = 0; i < node->log_count; i++) {
        const log_entry_t *entry = &node->log[i];
        if (fprintf(fp, "%d|%d|%s\n",
                    entry->term, entry->index, entry->command) < 0) {
            write_failed = 1;
            break;
        }
    }

    // fclose() flushes, so a failure here also means the log was not saved.
    if (fclose(fp) != 0 || write_failed) {
        fprintf(stderr, "raft_persist_log: failed to write %s\n", filename);
    }
}

```

  3. 节点初始化

```c

// 创建Raft节点

raft_node_t *raft_create(int node_id, const char *log_file) {

raft_node_t *node = malloc(sizeof(raft_node_t));

memset(node, 0, sizeof(raft_node_t));

node->node_id = node_id;

node->state = STATE_FOLLOWER;

node->current_term = 0;

node->voted_for = -1;

node->running = 1;

node->heartbeat_interval = 100; // 毫秒

node->log_count = 0;

node->commit_index = 0;

node->last_applied = 0;

strcpy(node->log_file, log_file);

// 初始化日志(从索引1开始)

for (int i = 0; i < MAX_LOG_ENTRIES; i++) {

node->logi.term = 0;

node->logi.index = i;

}

pthread_mutex_init(&node->mutex, NULL);

// 加载持久化数据

raft_load(node);

// 设置随机选举超时(150-300ms)

node->election_timeout = time(NULL) + (150 + rand() % 150) / 1000;

printf("Raft 节点 %d 启动,任期 %d,状态 Follower\n",

node->node_id, node->current_term);

return node;

}

// Register `peer` as a cluster member known to `node`.
//
// The call is ignored when either pointer is NULL or when MAX_NODES peers
// are already registered.
void raft_add_peer(raft_node_t *node, raft_node_t *peer) {
    if (!node || !peer) return;
    if (node->peer_count >= MAX_NODES) return;

    node->peers[node->peer_count] = peer;
    node->peer_count++;
}

```

  4. 选举超时

```c

// Calendar time in milliseconds (TIME_UTC base).
static long long raft_now_ms(void) {
    struct timespec ts;
    timespec_get(&ts, TIME_UTC);
    return (long long)ts.tv_sec * 1000 + ts.tv_nsec / 1000000;
}

// Re-arm the election timer with a fresh random timeout of 150-299 ms.
//
// election_timeout holds an absolute deadline in milliseconds. Millisecond
// resolution is required: in whole seconds, (150 + rand() % 150) / 1000 is
// integer division and always 0, which makes the timer expire at once.
// NOTE(review): assumes time_t is an integer type of at least 64 bits.
void raft_reset_election_timeout(raft_node_t *node) {
    node->election_timeout = (time_t)(raft_now_ms() + 150 + rand() % 150);
}

// Return non-zero once the election deadline has been reached.
int raft_is_election_timeout(raft_node_t *node) {
    return raft_now_ms() >= (long long)node->election_timeout;
}

// 选举线程

void *raft_election_thread(void *arg) {

raft_node_t *node = (raft_node_t*)arg;

while (node->running) {

usleep(10000); // 10ms

pthread_mutex_lock(&node->mutex);

if (node->state == STATE_FOLLOWER || node->state == STATE_CANDIDATE) {

if (raft_is_election_timeout(node)) {

// 发起选举

printf("Raft 节点 %d 选举超时,发起选举\n", node->node_id);

// 成为Candidate

node->state = STATE_CANDIDATE;

node->current_term++;

node->voted_for = node->node_id;

raft_reset_election_timeout(node);

raft_persist(node);

// 请求投票

int votes = 1;

for (int i = 0; i < node->peer_count; i++) {

// 实际应通过网络发送RequestVote RPC

// 这里简化,模拟投票

if (i % 2 == 0) votes++; // 模拟多数派

}

if (votes > node->peer_count / 2) {

printf("Raft 节点 %d 当选 Leader (任期 %d)\n",

node->node_id, node->current_term);

node->state = STATE_LEADER;

// 初始化Leader状态

for (int i = 0; i < node->peer_count; i++) {

node->next_indexi = node->log_count;

node->match_indexi = 0;

}

}

}

}

pthread_mutex_unlock(&node->mutex);

}

return NULL;

}

```

  5. 日志复制

```c

// Append one entry to the node's log and persist the log.
//
// term     term to record with the entry
// command  NUL-terminated command text, shorter than MAX_CMD_LEN
//
// Returns 0 on success. Returns -1, leaving the log untouched, when the log
// is full, command is NULL, or command does not fit in an entry; an
// over-long command is rejected rather than truncated so that no node ever
// stores a different command than the one submitted.
// Does not take node->mutex itself.
int raft_append_log(raft_node_t *node, int term, const char *command) {
    if (!command) return -1;
    if (node->log_count >= MAX_LOG_ENTRIES) return -1;
    if (strlen(command) >= MAX_CMD_LEN) return -1;

    log_entry_t *slot = &node->log[node->log_count];
    slot->term = term;
    slot->index = node->log_count;
    strcpy(slot->command, command);  // length verified above

    node->log_count++;
    raft_persist_log(node);

    return 0;
}

// Leader side of log replication (simulated).
//
// A real implementation would, for each peer i, build an AppendEntries
// request whose previous-entry position is next_index[i] - 1 and send it
// over the network. This simplified version just marks every peer as fully
// caught up with the leader's log. Does nothing unless the node is leader.
void raft_leader_append_entries(raft_node_t *node) {
    pthread_mutex_lock(&node->mutex);

    // The role is checked under the lock so it cannot change between the
    // check and the update.
    if (node->state == STATE_LEADER) {
        for (int i = 0; i < node->peer_count; i++) {
            node->match_index[i] = node->log_count;
            node->next_index[i] = node->log_count;
        }
    }

    pthread_mutex_unlock(&node->mutex);
}

// Handle an incoming AppendEntries RPC on `node`.
//
// Returns a reply carrying the node's term, success = 1 when the request was
// accepted, and match_index = number of leading log entries known to agree
// with the sender (0 on rejection).
//
// The request is rejected when the sender's term is older than ours, when
// our log has no entry at prev_log_index with term prev_log_term, or when
// entries is NULL while entry_count > 0. prev_log_index == -1 means "no
// previous entry" and always matches, so an empty log can take its first
// entries.
append_entries_reply_t raft_handle_append_entries(raft_node_t *node,
                                                  append_entries_args_t *args) {
    append_entries_reply_t reply;
    reply.success = 0;
    reply.match_index = 0;

    pthread_mutex_lock(&node->mutex);

    // Stale sender: reject and report our newer term.
    if (args->term < node->current_term) {
        reply.term = node->current_term;
        pthread_mutex_unlock(&node->mutex);
        return reply;
    }

    if (args->term > node->current_term) {
        // Newer term: adopt it and forget the vote of the old term.
        node->current_term = args->term;
        node->state = STATE_FOLLOWER;
        node->voted_for = -1;
        raft_persist(node);
    } else if (node->state == STATE_CANDIDATE) {
        // Same term: a leader already exists, so abandon our candidacy.
        node->state = STATE_FOLLOWER;
    }

    // Read under the lock, after any term update, so the reply is current.
    reply.term = node->current_term;

    raft_reset_election_timeout(node);

    int prev = args->prev_log_index;
    int prev_matches =
        prev == -1 ||
        (prev >= 0 && prev < node->log_count &&
         node->log[prev].term == args->prev_log_term);
    int entries_usable = args->entry_count <= 0 || args->entries != NULL;

    if (prev_matches && entries_usable) {
        int matched = prev + 1;  // entries known to agree with the sender
        reply.success = 1;

        for (int i = 0; i < args->entry_count; i++) {
            const log_entry_t *incoming = &args->entries[i];
            int slot = prev + 1 + i;

            if (slot < node->log_count) {
                if (node->log[slot].term == incoming->term) {
                    // Already stored; keep it so a repeated or delayed
                    // request cannot discard entries that follow it.
                    matched++;
                    continue;
                }
                // Conflict: drop this entry and everything after it.
                node->log_count = slot;
            }

            if (raft_append_log(node, incoming->term, incoming->command) != 0) {
                reply.success = 0;  // log full or command rejected
                break;
            }
            matched++;
        }

        reply.match_index = matched;

        // Never advance the commit point beyond what is known to match.
        if (reply.success && args->leader_commit > node->commit_index) {
            node->commit_index =
                args->leader_commit < matched ? args->leader_commit : matched;
        }
    }

    pthread_mutex_unlock(&node->mutex);

    return reply;
}

```

  6. 心跳

```c

// Send a heartbeat (an empty AppendEntries) from a leader to all its peers.
//
// Simulated: instead of sending an RPC, the peers are updated directly. A
// peer in an older term adopts the leader's term and becomes a follower; a
// non-leader peer in the same term has its election timer reset, which is
// what keeps followers from starting new elections. Does nothing when the
// node is not leader.
//
// The node's own lock is released before any peer lock is taken, so two
// nodes heartbeating each other cannot deadlock on each other's mutex.
// The caller must NOT hold node->mutex.
void raft_send_heartbeat(raft_node_t *node) {
    raft_node_t *targets[MAX_NODES];
    int target_count = 0;
    int leader_term = 0;
    int due = 0;

    pthread_mutex_lock(&node->mutex);

    time_t now = time(NULL);

    // NOTE(review): heartbeat_interval is in milliseconds but the clock here
    // counts seconds; interval / 1000 is 0 for the default 100 ms, so a
    // leader sends on every call.
    if (node->state == STATE_LEADER &&
        now - node->last_heartbeat >= node->heartbeat_interval / 1000) {
        printf("Raft Leader %d 发送心跳 (任期 %d)\n",
               node->node_id, node->current_term);

        node->last_heartbeat = now;

        // Snapshot what the peer updates need while we still hold our lock.
        due = 1;
        leader_term = node->current_term;
        for (int i = 0; i < node->peer_count && i < MAX_NODES; i++) {
            targets[target_count++] = node->peers[i];
        }
    }

    pthread_mutex_unlock(&node->mutex);

    if (!due) return;

    for (int i = 0; i < target_count; i++) {
        raft_node_t *peer = targets[i];
        if (!peer) continue;

        pthread_mutex_lock(&peer->mutex);

        if (peer->current_term < leader_term) {
            peer->current_term = leader_term;
            peer->state = STATE_FOLLOWER;
            peer->voted_for = -1;
            raft_persist(peer);
            raft_reset_election_timeout(peer);
        } else if (peer->current_term == leader_term &&
                   peer->state != STATE_LEADER) {
            peer->state = STATE_FOLLOWER;
            raft_reset_election_timeout(peer);
        }

        pthread_mutex_unlock(&peer->mutex);
    }
}

// Heartbeat thread: every 50 ms give the node a chance to send a heartbeat.
//
// arg  the raft_node_t to drive
// Always returns NULL; the loop ends once node->running is 0.
void *raft_heartbeat_thread(void *arg) {
    raft_node_t *node = (raft_node_t*)arg;

    while (node->running) {
        usleep(50000); // 50ms

        // raft_send_heartbeat() locks node->mutex itself and returns at once
        // unless the node is leader. Taking the lock here too would relock a
        // non-recursive mutex from the same thread and hang.
        raft_send_heartbeat(node);
    }

    return NULL;
}

```

  7. RPC线程

```c

// RPC thread (stands in for the network layer).
//
// Every 100 ms it takes the node lock and, for a follower, compares the time
// since last_heartbeat against 2 seconds. The branch is a placeholder with no
// effect; starting an election is left to the election thread.
//
// arg  the raft_node_t to drive
// Always returns NULL; the loop ends once the node's running flag is 0.
void *raft_rpc_thread(void *arg) {
    raft_node_t *self = (raft_node_t*)arg;

    for (;;) {
        if (!self->running) break;

        usleep(100000);

        pthread_mutex_lock(&self->mutex);

        if (self->state == STATE_FOLLOWER &&
            time(NULL) - self->last_heartbeat > 2) {
            // Leader presumed gone; intentionally empty.
        }

        pthread_mutex_unlock(&self->mutex);
    }

    return NULL;
}

```

  8. 启动Raft集群

```c

// Start the worker threads of every node, wait one second, then print each
// node that is leader at that moment.
//
// nodes  array of `count` node pointers
// count  number of nodes
//
// A thread that fails to start is reported on stderr; start-up continues.
void raft_start_cluster(raft_node_t **nodes, int count) {
    printf("\n=== Raft集群启动 ===\n");

    // Start the election, RPC and heartbeat thread of each node.
    for (int i = 0; i < count; i++) {
        int rc;

        rc = pthread_create(&nodes[i]->election_thread, NULL,
                            raft_election_thread, nodes[i]);
        if (rc != 0) {
            fprintf(stderr, "node %d: election thread: %s\n", i, strerror(rc));
        }

        rc = pthread_create(&nodes[i]->rpc_thread, NULL,
                            raft_rpc_thread, nodes[i]);
        if (rc != 0) {
            fprintf(stderr, "node %d: rpc thread: %s\n", i, strerror(rc));
        }

        rc = pthread_create(&nodes[i]->heartbeat_thread, NULL,
                            raft_heartbeat_thread, nodes[i]);
        if (rc != 0) {
            fprintf(stderr, "node %d: heartbeat thread: %s\n", i, strerror(rc));
        }
    }

    sleep(1);

    // The worker threads are running now, so node state is read under the
    // node's lock.
    for (int i = 0; i < count; i++) {
        pthread_mutex_lock(&nodes[i]->mutex);
        int is_leader = nodes[i]->state == STATE_LEADER;
        int id = nodes[i]->node_id;
        int term = nodes[i]->current_term;
        pthread_mutex_unlock(&nodes[i]->mutex);

        if (is_leader) {
            printf("✅ Leader: 节点 %d (任期 %d)\n", id, term);
        }
    }
}

```

  9. 测试代码

```c

void test_raft() {

printf("=== Raft共识算法测试 ===\n\n");

srand(time(NULL));

// 创建5个节点

raft_node_t *nodes5;

for (int i = 0; i < 5; i++) {

char log_file64;

sprintf(log_file, "raft_%d", i);

nodesi = raft_create(i, log_file);

}

// 连接节点

for (int i = 0; i < 5; i++) {

for (int j = 0; j < 5; j++) {

if (i != j) {

raft_add_peer(nodesi, nodesj);

}

}

}

// 启动集群

raft_start_cluster(nodes, 5);

// 运行10秒

printf("\n运行10秒...\n");

sleep(10);

// 打印最终状态

printf("\n=== 集群状态 ===\n");

for (int i = 0; i < 5; i++) {

const char *state_str\[\] = {"Follower", "Candidate", "Leader"};

printf("节点 %d: %s, 任期 %d, 日志数 %d\n",

nodesi->node_id, state_strnodes\[i->state],

nodesi->current_term, nodesi->log_count);

}

// 清理

for (int i = 0; i < 5; i++) {

nodesi->running = 0;

pthread_join(nodesi->election_thread, NULL);

pthread_join(nodesi->rpc_thread, NULL);

pthread_join(nodesi->heartbeat_thread, NULL);

free(nodesi);

}

printf("\n测试完成\n");

}

// Program entry point: run the Raft demo and report success.
int main(void)
{
    test_raft();
    return EXIT_SUCCESS;
}

```


三、编译和运行

```bash

gcc -o raft raft.c -lpthread

./raft

```


四、Raft vs Paxos

| 特性 | Paxos | Raft |
| --- | --- | --- |
| 理解难度 | 高 | 中 |
| 实现复杂度 | 高 | 中 |
| Leader选举 | 隐式 | 显式 |
| 日志复制 | 复杂 | 清晰 |
| 安全性证明 | 难 | 较易 |
| 工程应用 | 较少 | 广泛 |


五、总结

通过这篇文章,你学会了:

· Raft的核心组件(选举、日志复制、安全性)

· 节点状态转换(Follower → Candidate → Leader)

· 选举超时和随机化

· AppendEntries RPC

· 日志一致性和提交

· 持久化存储

Raft是分布式一致性的工程标准。掌握它,你就理解了etcd、Consul、TiDB的核心设计。

下一篇预告:《从零实现一个分布式文件系统:GFS的核心设计》


评论区分享一下你对Raft的理解~