前言
上一篇文章我们实现了Paxos,但Paxos确实难理解。Raft是Paxos的"简化版",专为工程实现设计,分为三个子问题:
- Leader选举:选出一个领导者
- 日志复制:Leader接收日志,复制到Follower
- 安全性:保证日志一致性
今天我们用C语言从零实现Raft核心算法:
· 节点状态切换(Follower → Candidate → Leader)
· Leader选举(心跳 + 随机超时)
· 日志条目(索引 + 任期 + 命令)
· 日志复制(AppendEntries RPC)
· 持久化存储
一、Raft核心原理
- 节点状态机
```
                超时,发起选举
┌──────────┐ ────────────────▶ ┌───────────┐ ──┐
│ Follower │                   │ Candidate │   │ 选举超时,重新选举
└──────────┘ ◀──────────────── └───────────┘ ◀─┘
      ▲         发现更高任期          │
      │                              │ 收到多数派投票
      │                              ▼
      │       发现更高任期        ┌────────┐
      └───────────────────────── │ Leader │
                                 └────────┘
Leader ──心跳──▶ Follower(Follower 收到心跳后重置选举超时)
```
- 核心概念
| 概念 | 说明 |
| --- | --- |
| Term | 任期,单调递增 |
| Log | 日志条目序列,存储命令 |
| Commit Index | 已提交的日志索引 |
| AppendEntries | Leader向Follower同步日志 |
| RequestVote | Candidate请求投票 |
二、完整代码实现
- 基础数据结构
```c
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <pthread.h>
#include <time.h>
#include <errno.h>
#include <signal.h>
#define MAX_NODES 10          /* capacity of the per-node peer tables */
#define MAX_LOG_ENTRIES 1000  /* capacity of the in-memory log */
#define MAX_CMD_LEN 256       /* size of a command buffer, including the NUL */

/* Role a node currently plays in the cluster. */
typedef enum {
    STATE_FOLLOWER = 0,
    STATE_CANDIDATE = 1,
    STATE_LEADER = 2
} node_state_t;

/* One record of the replicated log. */
typedef struct log_entry {
    int term;                   /* term in which the entry was created */
    int index;                  /* position of the entry in the log array */
    char command[MAX_CMD_LEN];  /* NUL-terminated command text */
} log_entry_t;

/*
 * Complete state of one Raft node.
 *
 * All array members are fixed-size and embedded in the struct, so a node can
 * be created with a single allocation and released with a single free().
 */
typedef struct raft_node {
    int node_id;
    int current_term;
    int voted_for;              /* node id voted for in current_term, -1 = none */
    node_state_t state;

    /* Log */
    log_entry_t log[MAX_LOG_ENTRIES];
    int log_count;              /* number of valid entries in log[] */
    int commit_index;
    int last_applied;

    /* Leader-only bookkeeping, indexed by position in peers[] */
    int next_index[MAX_NODES];
    int match_index[MAX_NODES];

    /*
     * Cluster membership. Stored inline (not as a separately allocated
     * pointer array) so that peers[i] is usable as soon as the node exists.
     */
    struct raft_node *peers[MAX_NODES];
    int peer_count;             /* number of valid slots in peers[] */

    /* Timers (time_t has one-second resolution) */
    time_t election_timeout;    /* absolute deadline compared against time(NULL) */
    time_t last_heartbeat;      /* time(NULL) when the last heartbeat was sent */
    int heartbeat_interval;     /* milliseconds */

    /* Synchronisation and worker threads */
    pthread_mutex_t mutex;      /* guards the mutable fields above */
    pthread_t rpc_thread;
    pthread_t election_thread;
    pthread_t heartbeat_thread;
    int running;                /* worker threads loop while this is non-zero */

    char log_file[256];         /* path prefix for the persistence files */
} raft_node_t;
// RPC messages
// Arguments of the AppendEntries request handled by raft_handle_append_entries().
typedef struct append_entries_args {
int term; // sender's term; requests with a term below the receiver's are rejected
int leader_id; // not read by any code visible in this file
int prev_log_index; // index of the receiver's log entry that must match prev_log_term
int prev_log_term; // term expected at prev_log_index
log_entry_t *entries; // array of entry_count entries to append after prev_log_index
int entry_count; // number of elements in entries
int leader_commit; // receiver raises its commit_index towards this value on success
} append_entries_args_t;
// Reply produced by raft_handle_append_entries().
typedef struct append_entries_reply {
int term; // receiver's current_term as seen by the handler
int success; // 1 when the log matched and the entries were accepted, otherwise 0
int match_index; // set to the receiver's log_count on success, 0 otherwise
} append_entries_reply_t;
// Arguments of a RequestVote request.
// NOTE(review): no code visible in this file constructs or reads this struct
// (votes are simulated in the election thread); field meanings below are
// presumed from the names — confirm when the RPC is implemented.
typedef struct request_vote_args {
int term; // presumably the candidate's term
int candidate_id; // presumably the node_id of the candidate
int last_log_index; // presumably the index of the candidate's last log entry
int last_log_term; // presumably the term of the candidate's last log entry
} request_vote_args_t;
// Reply to a RequestVote request.
// NOTE(review): unused by the code visible in this file; semantics presumed
// from the field names — confirm when the RPC is implemented.
typedef struct request_vote_reply {
int term; // presumably the voter's current term
int vote_granted; // presumably non-zero when the vote was granted
} request_vote_reply_t;
```
- 持久化
```c
/*
 * Persist the node's current term and vote.
 *
 * Writes "<current_term>\n<voted_for>\n" to "<log_file>_raft.dat",
 * replacing any previous content. Failures are reported on stderr; the
 * function has no return value, so callers cannot react to them.
 */
void raft_persist(raft_node_t *node) {
    /* Room for the longest possible log_file prefix plus the suffix. */
    char filename[256 + sizeof "_raft.dat"];
    int len = snprintf(filename, sizeof(filename), "%s_raft.dat", node->log_file);
    if (len < 0 || (size_t)len >= sizeof(filename)) {
        return; /* path would be truncated: never write to the wrong file */
    }

    FILE *fp = fopen(filename, "w");
    if (!fp) {
        perror(filename);
        return;
    }
    if (fprintf(fp, "%d\n%d\n", node->current_term, node->voted_for) < 0) {
        perror(filename);
    }
    /* fclose() flushes; a failure here means the state did not reach disk. */
    if (fclose(fp) != 0) {
        perror(filename);
    }
}
/*
 * Load the persisted term and vote from "<log_file>_raft.dat".
 *
 * The node is left untouched when the file does not exist or does not
 * contain two integers, so a truncated file can never produce a
 * half-updated (term, vote) pair.
 */
void raft_load(raft_node_t *node) {
    char filename[256 + sizeof "_raft.dat"];
    int len = snprintf(filename, sizeof(filename), "%s_raft.dat", node->log_file);
    if (len < 0 || (size_t)len >= sizeof(filename)) {
        return;
    }

    FILE *fp = fopen(filename, "r");
    if (!fp) {
        return; /* nothing persisted yet: keep the caller's defaults */
    }

    int term = 0;
    int voted_for = -1;
    /* Commit the values only when both were parsed successfully. */
    if (fscanf(fp, "%d %d", &term, &voted_for) == 2 && term >= 0) {
        node->current_term = term;
        node->voted_for = voted_for;
    }
    fclose(fp);
}
/*
 * Persist the whole log (simplified).
 *
 * Rewrites "<log_file>_log.dat" from scratch with one "term|index|command"
 * line per entry in node->log[0 .. log_count-1].
 */
void raft_persist_log(raft_node_t *node) {
    char filename[256 + sizeof "_log.dat"];
    int len = snprintf(filename, sizeof(filename), "%s_log.dat", node->log_file);
    if (len < 0 || (size_t)len >= sizeof(filename)) {
        return; /* path would be truncated */
    }

    FILE *fp = fopen(filename, "w");
    if (!fp) {
        perror(filename);
        return;
    }
    for (int i = 0; i < node->log_count; i++) {
        const log_entry_t *entry = &node->log[i];
        fprintf(fp, "%d|%d|%s\n", entry->term, entry->index, entry->command);
    }
    if (fclose(fp) != 0) {
        perror(filename);
    }
}
```
- 节点初始化
```c
// 创建Raft节点
raft_node_t *raft_create(int node_id, const char *log_file) {
raft_node_t *node = malloc(sizeof(raft_node_t));
memset(node, 0, sizeof(raft_node_t));
node->node_id = node_id;
node->state = STATE_FOLLOWER;
node->current_term = 0;
node->voted_for = -1;
node->running = 1;
node->heartbeat_interval = 100; // 毫秒
node->log_count = 0;
node->commit_index = 0;
node->last_applied = 0;
strcpy(node->log_file, log_file);
// 初始化日志(从索引1开始)
for (int i = 0; i < MAX_LOG_ENTRIES; i++) {
node->logi.term = 0;
node->logi.index = i;
}
pthread_mutex_init(&node->mutex, NULL);
// 加载持久化数据
raft_load(node);
// 设置随机选举超时(150-300ms)
node->election_timeout = time(NULL) + (150 + rand() % 150) / 1000;
printf("Raft 节点 %d 启动,任期 %d,状态 Follower\n",
node->node_id, node->current_term);
return node;
}
/*
 * Register `peer` as a member of `node`'s cluster.
 *
 * The peer is stored in the next free slot of node->peers. The call is a
 * no-op when either argument is NULL or when MAX_NODES peers are already
 * registered. The peer is referenced, not copied or owned.
 */
void raft_add_peer(raft_node_t *node, raft_node_t *peer) {
    if (!node || !peer) {
        return;
    }
    if (node->peer_count < MAX_NODES) {
        node->peers[node->peer_count++] = peer;
    }
}
```
- 选举超时
```c
/*
 * Push the node's election deadline into the future.
 *
 * A random timeout of 150-300 ms is drawn and added to time(NULL). Because
 * the deadline is stored in whole seconds, the timeout is rounded UP to the
 * next second; plain integer division by 1000 would always give 0 and the
 * deadline would already be expired when this function returns.
 */
void raft_reset_election_timeout(raft_node_t *node) {
    int timeout_ms = 150 + rand() % 150;
    node->election_timeout = time(NULL) + (timeout_ms + 999) / 1000;
}
/*
 * Report whether the node's election deadline has been reached.
 * Returns 1 once time(NULL) is at or past node->election_timeout, else 0.
 */
int raft_is_election_timeout(raft_node_t *node) {
    time_t now = time(NULL);
    if (now < node->election_timeout) {
        return 0;
    }
    return 1;
}
// 选举线程
void *raft_election_thread(void *arg) {
raft_node_t *node = (raft_node_t*)arg;
while (node->running) {
usleep(10000); // 10ms
pthread_mutex_lock(&node->mutex);
if (node->state == STATE_FOLLOWER || node->state == STATE_CANDIDATE) {
if (raft_is_election_timeout(node)) {
// 发起选举
printf("Raft 节点 %d 选举超时,发起选举\n", node->node_id);
// 成为Candidate
node->state = STATE_CANDIDATE;
node->current_term++;
node->voted_for = node->node_id;
raft_reset_election_timeout(node);
raft_persist(node);
// 请求投票
int votes = 1;
for (int i = 0; i < node->peer_count; i++) {
// 实际应通过网络发送RequestVote RPC
// 这里简化,模拟投票
if (i % 2 == 0) votes++; // 模拟多数派
}
if (votes > node->peer_count / 2) {
printf("Raft 节点 %d 当选 Leader (任期 %d)\n",
node->node_id, node->current_term);
node->state = STATE_LEADER;
// 初始化Leader状态
for (int i = 0; i < node->peer_count; i++) {
node->next_indexi = node->log_count;
node->match_indexi = 0;
}
}
}
}
pthread_mutex_unlock(&node->mutex);
}
return NULL;
}
```
- 日志复制
```c
/*
 * Append one entry to the node's log and persist the log.
 *
 * term    - term to record in the new entry.
 * command - NUL-terminated command text; must fit in MAX_CMD_LEN bytes
 *           including the terminator.
 *
 * Returns 0 on success, -1 when command is NULL or too long, or when the
 * log already holds MAX_LOG_ENTRIES entries. The log is unchanged on failure.
 * The function does not lock node->mutex itself.
 */
int raft_append_log(raft_node_t *node, int term, const char *command) {
    if (!command) return -1;
    if (node->log_count >= MAX_LOG_ENTRIES) return -1;

    /* Reject instead of overflowing the fixed-size command buffer. */
    size_t len = strlen(command);
    if (len >= MAX_CMD_LEN) return -1;

    log_entry_t *entry = &node->log[node->log_count];
    entry->term = term;
    entry->index = node->log_count;
    memcpy(entry->command, command, len + 1); /* includes the NUL */
    node->log_count++;

    raft_persist_log(node);
    return 0;
}
/*
 * Leader-side log replication (simulated).
 *
 * A real implementation would build an AppendEntries request per follower
 * (using next_index[i] - 1 as the previous log index) and send it over the
 * network. This demo skips the RPC and simply records every peer as fully
 * caught up. Does nothing when the node is not the leader.
 *
 * Locks node->mutex internally; call it without that mutex held.
 */
void raft_leader_append_entries(raft_node_t *node) {
    pthread_mutex_lock(&node->mutex);
    /* Check the role under the lock so it cannot change mid-update. */
    if (node->state == STATE_LEADER) {
        for (int i = 0; i < node->peer_count; i++) {
            node->match_index[i] = node->log_count;
            node->next_index[i] = node->log_count;
        }
    }
    pthread_mutex_unlock(&node->mutex);
}
/*
 * Handle an AppendEntries request on the receiving node.
 *
 * node - receiver; its mutex is taken for the duration of the call.
 * args - request; args->entries must point to args->entry_count entries
 *        (it may be NULL when entry_count is 0, i.e. a heartbeat).
 *
 * Returns a reply whose `term` is the receiver's term after processing,
 * `success` is 1 when the entries were accepted, and `match_index` is the
 * number of leading log entries known to match the sender
 * (prev_log_index + 1 + entry_count) on success, 0 otherwise.
 */
append_entries_reply_t raft_handle_append_entries(raft_node_t *node,
                                                  append_entries_args_t *args) {
    append_entries_reply_t reply;
    reply.term = 0;
    reply.success = 0;
    reply.match_index = 0;

    pthread_mutex_lock(&node->mutex);

    /* Stale sender: reject and report our (newer) term. */
    if (args->term < node->current_term) {
        reply.term = node->current_term;
        pthread_mutex_unlock(&node->mutex);
        return reply;
    }

    if (args->term > node->current_term) {
        /* Newer term: adopt it and forget the vote of the old term. */
        node->current_term = args->term;
        node->state = STATE_FOLLOWER;
        node->voted_for = -1;
        raft_persist(node);
    } else if (node->state == STATE_CANDIDATE) {
        /* A leader exists for our own term: abandon the election. */
        node->state = STATE_FOLLOWER;
    }
    reply.term = node->current_term;

    /* The sender is a live leader, so postpone our own election. */
    raft_reset_election_timeout(node);

    /*
     * Consistency check. prev_log_index == -1 means "the entries start at
     * the very beginning of the log" and always matches; otherwise the
     * entry at prev_log_index must exist and carry prev_log_term.
     */
    int prev = args->prev_log_index;
    int prev_matches =
        (prev == -1) ||
        (prev >= 0 && prev < node->log_count &&
         node->log[prev].term == args->prev_log_term);
    int entries_valid = (args->entry_count <= 0) || (args->entries != NULL);

    if (prev_matches && entries_valid) {
        int stored_all = 1;
        for (int i = 0; i < args->entry_count; i++) {
            int pos = prev + 1 + i;
            if (pos < node->log_count) {
                if (node->log[pos].term == args->entries[i].term) {
                    continue; /* already have this entry: keep it */
                }
                /* Conflict: drop this entry and everything after it. */
                node->log_count = pos;
            }
            if (raft_append_log(node, args->entries[i].term,
                                args->entries[i].command) != 0) {
                stored_all = 0; /* log full or bad command */
                break;
            }
        }

        if (stored_all) {
            int matched = prev + 1 + (args->entry_count > 0 ? args->entry_count : 0);
            reply.success = 1;
            reply.match_index = matched;
            /* Never commit past the entries verified against the leader. */
            if (args->leader_commit > node->commit_index) {
                node->commit_index = args->leader_commit < matched ?
                                     args->leader_commit : matched;
            }
        }
    }

    pthread_mutex_unlock(&node->mutex);
    return reply;
}
```
- 心跳
```c
/*
 * Send a heartbeat (an empty AppendEntries) to every peer.
 *
 * A real implementation sends the RPC over the network; this demo updates
 * the peer structures directly. Does nothing when the node is not the
 * leader or when the heartbeat interval has not elapsed.
 *
 * Locking: node->mutex is taken internally, so the caller must NOT hold it.
 * The node's own lock is released before any peer lock is taken; holding
 * both at once lets two nodes that heartbeat each other deadlock.
 */
void raft_send_heartbeat(raft_node_t *node) {
    raft_node_t *targets[MAX_NODES];
    int target_count = 0;
    int leader_term = 0;

    /* Phase 1: under our own lock, decide whether to send and snapshot peers. */
    pthread_mutex_lock(&node->mutex);
    if (node->state == STATE_LEADER) {
        time_t now = time(NULL);
        /*
         * heartbeat_interval is in ms but time_t counts seconds, so any
         * interval below 1000 ms becomes 0 and a heartbeat goes out on
         * every call; the caller's sleep sets the effective rate.
         */
        if (now - node->last_heartbeat >= node->heartbeat_interval / 1000) {
            printf("Raft Leader %d 发送心跳 (任期 %d)\n",
                   node->node_id, node->current_term);
            node->last_heartbeat = now;
            leader_term = node->current_term;
            target_count = node->peer_count < MAX_NODES ? node->peer_count
                                                        : MAX_NODES;
            for (int i = 0; i < target_count; i++) {
                targets[i] = node->peers[i];
            }
        }
    }
    pthread_mutex_unlock(&node->mutex);

    /* Phase 2: deliver to each peer, holding only that peer's lock. */
    int highest_term = leader_term;
    for (int i = 0; i < target_count; i++) {
        raft_node_t *peer = targets[i];
        pthread_mutex_lock(&peer->mutex);
        if (peer->current_term > leader_term) {
            /* Peer is ahead of us: remember it, we must step down. */
            if (peer->current_term > highest_term) {
                highest_term = peer->current_term;
            }
        } else if (peer->current_term < leader_term) {
            peer->current_term = leader_term;
            peer->state = STATE_FOLLOWER;
            peer->voted_for = -1;
            raft_persist(peer);
            raft_reset_election_timeout(peer);
        } else if (peer->state != STATE_LEADER) {
            /*
             * Same term: the heartbeat must still postpone the peer's
             * election, otherwise followers time out despite a live leader.
             */
            peer->state = STATE_FOLLOWER;
            raft_reset_election_timeout(peer);
        }
        pthread_mutex_unlock(&peer->mutex);
    }

    /* Phase 3: a peer reported a newer term, so we are no longer leader. */
    if (highest_term > leader_term) {
        pthread_mutex_lock(&node->mutex);
        if (node->current_term < highest_term) {
            node->current_term = highest_term;
            node->state = STATE_FOLLOWER;
            node->voted_for = -1;
            raft_persist(node);
            raft_reset_election_timeout(node);
        }
        pthread_mutex_unlock(&node->mutex);
    }
}
/*
 * Heartbeat thread: every 50 ms asks the node to send a heartbeat.
 *
 * arg is the raft_node_t this thread serves. raft_send_heartbeat() checks
 * for the leader role and locks node->mutex itself, so the mutex is
 * deliberately NOT taken here: locking it around the call would re-lock a
 * non-recursive mutex and deadlock the thread on its first iteration.
 *
 * Runs until node->running becomes 0; always returns NULL.
 */
void *raft_heartbeat_thread(void *arg) {
    raft_node_t *node = (raft_node_t*)arg;
    while (node->running) {
        usleep(50000); /* 50 ms */
        raft_send_heartbeat(node);
    }
    return NULL;
}
```
- RPC线程
```c
/*
 * RPC thread (network stand-in).
 *
 * Wakes every 100 ms, takes the node's mutex and inspects the follower
 * state. The stale-heartbeat branch is an intentional placeholder: leader
 * failure is acted upon by the election thread, not here.
 * Loops until the node's running flag is cleared; always returns NULL.
 */
void *raft_rpc_thread(void *arg) {
    raft_node_t *self = (raft_node_t*)arg;
    for (;;) {
        if (!self->running) {
            break;
        }
        usleep(100000);
        pthread_mutex_lock(&self->mutex);
        int is_follower = (self->state == STATE_FOLLOWER);
        if (is_follower && time(NULL) - self->last_heartbeat > 2) {
            /* Leader presumed dead; the election thread handles it. */
        }
        pthread_mutex_unlock(&self->mutex);
    }
    return NULL;
}
```
- 启动Raft集群
```c
/*
 * Start the worker threads of every node and report the leaders found
 * after one second.
 *
 * nodes - array of `count` node pointers created with raft_create().
 * Each node gets an election, an RPC and a heartbeat thread. A thread that
 * cannot be created is reported on stderr.
 */
void raft_start_cluster(raft_node_t **nodes, int count) {
    printf("\n=== Raft集群启动 ===\n");

    /* Launch the three worker threads of each node. */
    for (int i = 0; i < count; i++) {
        raft_node_t *n = nodes[i];
        int rc_election = pthread_create(&n->election_thread, NULL,
                                         raft_election_thread, n);
        int rc_rpc = pthread_create(&n->rpc_thread, NULL,
                                    raft_rpc_thread, n);
        int rc_heartbeat = pthread_create(&n->heartbeat_thread, NULL,
                                          raft_heartbeat_thread, n);
        if (rc_election != 0 || rc_rpc != 0 || rc_heartbeat != 0) {
            fprintf(stderr, "Raft 节点 %d 线程创建失败\n", n->node_id);
        }
    }

    sleep(1); /* give the cluster time to elect a leader */

    /* Report leaders; read each node under its lock, the threads are live. */
    for (int i = 0; i < count; i++) {
        raft_node_t *n = nodes[i];
        pthread_mutex_lock(&n->mutex);
        if (n->state == STATE_LEADER) {
            printf("✅ Leader: 节点 %d (任期 %d)\n",
                   n->node_id, n->current_term);
        }
        pthread_mutex_unlock(&n->mutex);
    }
}
```
- 测试代码
```c
void test_raft() {
printf("=== Raft共识算法测试 ===\n\n");
srand(time(NULL));
// 创建5个节点
raft_node_t *nodes5;
for (int i = 0; i < 5; i++) {
char log_file64;
sprintf(log_file, "raft_%d", i);
nodesi = raft_create(i, log_file);
}
// 连接节点
for (int i = 0; i < 5; i++) {
for (int j = 0; j < 5; j++) {
if (i != j) {
raft_add_peer(nodesi, nodesj);
}
}
}
// 启动集群
raft_start_cluster(nodes, 5);
// 运行10秒
printf("\n运行10秒...\n");
sleep(10);
// 打印最终状态
printf("\n=== 集群状态 ===\n");
for (int i = 0; i < 5; i++) {
const char *state_str\[\] = {"Follower", "Candidate", "Leader"};
printf("节点 %d: %s, 任期 %d, 日志数 %d\n",
nodesi->node_id, state_strnodes\[i->state],
nodesi->current_term, nodesi->log_count);
}
// 清理
for (int i = 0; i < 5; i++) {
nodesi->running = 0;
pthread_join(nodesi->election_thread, NULL);
pthread_join(nodesi->rpc_thread, NULL);
pthread_join(nodesi->heartbeat_thread, NULL);
free(nodesi);
}
printf("\n测试完成\n");
}
/* Program entry point: runs the Raft demo and reports success. */
int main() {
    test_raft();
    return EXIT_SUCCESS;
}
```
三、编译和运行
```bash
gcc -o raft raft.c -lpthread
./raft
```
四、Raft vs Paxos
| 特性 | Paxos | Raft |
| --- | --- | --- |
| 理解难度 | 高 | 中 |
| 实现复杂度 | 高 | 中 |
| Leader选举 | 隐式 | 显式 |
| 日志复制 | 复杂 | 清晰 |
| 安全性证明 | 难 | 较易 |
| 工程应用 | 较少 | 广泛 |
五、总结
通过这篇文章,你学会了:
· Raft的核心组件(选举、日志复制、安全性)
· 节点状态转换(Follower → Candidate → Leader)
· 选举超时和随机化
· AppendEntries RPC
· 日志一致性和提交
· 持久化存储
Raft是分布式一致性的工程标准。掌握它,你就理解了etcd、Consul、TiDB的核心设计。
下一篇预告:《从零实现一个分布式文件系统:GFS的核心设计》
评论区分享一下你对Raft的理解~