MIT 6.824 Lab 2A：Raft 实现历程

实验原址：mit 6.824 Lab2 Raft

中文翻译：mit 6.824 Lab2 翻译

Raft论文： In Search of an Understandable Consensus Algorithm

Raft论文翻译：(zhuanlan.zhihu.com/p/524885008)

介绍

在Raft论文当中，作者不仅为我们进行了详细的理论分析，还提供了详细的设计思路，以及一份2000多行的C++实现代码。在本实验中，我会严格按照论文Figure 2的描述，在实验提供的骨架代码中编写程序。

倘若在实现中，始终无法调试出Bug在何处，不妨再仔细阅读论文，仔细思考写的代码是否符合论文中的描述。

实验目标

实现Raft共识算法的第一部分：选举与心跳。在Raft算法当中，第一部分就是领导者选举。为了维护数据的一致性，Raft采用简单粗暴的架构方式：主从架构。如果不是很熟悉这些，可以看看这篇文章 Raft共识算法PartA：Leader Election And HeartBeat

初始化

按照论文中的State描述，将变量写出来。其中的log[]中元素是一个结构体。包含了command与Term。

go 复制代码

// LogEntry is one element of a peer's log: a command together with the
// term it is associated with.
type LogEntry struct {
	Command interface{} // opaque command payload
	Term    int         // term attached to this entry
}
// Raft is a Go object implementing a single Raft peer.
type Raft struct {
	mu        sync.Mutex          // Lock to protect shared access to this peer's state
	peers     []*labrpc.ClientEnd // RPC end points of all peers
	persister *Persister          // Object to hold this peer's persisted state
	me        int                 // this peer's index into peers[]
	dead      int32               // set by Kill()

	// State a Raft server must maintain (labs 2A, 2B, 2C); see the
	// paper's Figure 2 for the authoritative description.
	currentTerm int        // latest term
	votedFor    int        // peer this server gave its vote to (None when no vote was given)
	logs        []LogEntry // log entries; Make seeds index 0 with a placeholder entry

	// Figure 2 "all servers" section.
	commitIndex int
	lastApplied int

	// Figure 2 "leaders" section.
	nextIndex  []int
	matchIndex []int

	// Additional bookkeeping.
	state        int       // one of Follower, Candidate, Leader
	voteNum      int       // votes collected while campaigning
	stamp        time.Time // timestamp the election timeout is measured from
	nextBeatTime time.Time // earliest time at which the next heartbeat round is sent

	// Channel used to talk to the layer above Raft.
	applyCh chan ApplyMsg

	snapshot          []byte // snapshot
	lastIncludedTerm  int    // term of the last index covered by the snapshot
	lastIncludedIndex int    // last index covered by the snapshot
}
// Roles a peer can hold; stored in Raft.state.
const (
	Follower = iota
	Candidate
	Leader
)

// None is the votedFor value meaning "no vote given".
const None = -1

// Timing parameters. HeartBeatInterval and the election bounds are
// multiplied by time.Millisecond where they are used.
const (
	HeartBeatInterval = 100
	MaxElectionTime   = 300
	MinElectionTime   = 150
	ApplyInterval     = 10 // NOTE(review): presumably milliseconds as well; its use is not shown here
)

// Make builds a Raft peer, restores whatever the persister holds, and
// launches its background goroutines before returning.
//
// peers are the RPC end points of every server (this one included, at
// index me), persister supplies the saved snapshot and Raft state, and
// applyCh is stored for communication with the layer above.
func Make(peers []*labrpc.ClientEnd, me int,
	persister *Persister, applyCh chan ApplyMsg) *Raft {
	peerCount := len(peers)

	// Every peer starts out as a follower in term 0 that has not voted.
	rf := &Raft{
		peers:       peers,
		persister:   persister,
		me:          me,
		state:       Follower,
		voteNum:     0,
		votedFor:    None,
		currentTerm: 0,
		matchIndex:  make([]int, peerCount),
		stamp:       time.Now(),
		// Index 0 holds a placeholder entry with term 0 and no command.
		logs:    []LogEntry{{Term: 0, Command: nil}},
		applyCh: applyCh,
	}

	// Restore state persisted before a crash: snapshot first, then the
	// Raft state.
	rf.readSnapshot(persister.ReadSnapshot())
	rf.readPersist(persister.ReadRaftState())
	rf.nextIndex = make([]int, peerCount)

	// Background workers: the election ticker and the apply loop.
	go rf.ticker()
	go rf.appliyToState()

	return rf
}

上面包含了后续Lab的字段，可以先不用管。

接下来是领导者选举最重要的RPC，RequestVote。定义请求参数与回复参数。依旧与论文描述一致。

go 复制代码

// RequestVoteArgs is the request payload of the RequestVote RPC
// (labs 2A, 2B).
type RequestVoteArgs struct {
	Term         int
	CandidateId  int
	LastLogIndex int // index of candidate's last log entry
	LastLogTerm  int // term of candidate's last log entry
}

// RequestVoteReply is the response payload of the RequestVote RPC (lab 2A).
type RequestVoteReply struct {
	Term        int
	VoteGranted bool
}

注意点

//All Servers： If RPC request or response contains term T > currentTerm: set currentTerm = T, convert to follower

对于所有的RPC请求或者回复，只要Args或Reply中的Term比自己的currentTerm大，一律将自己转变成Follower，并把currentTerm更新为这个更大的Term。
对于所有的RPC回复，都需要判断是不是Old RPC，倘若发现是Old RPC，一律丢弃不处理。

go 复制代码

// The RPC was sent in a term other than the current one (an old RPC):
// release the lock and drop it without further processing.
  rf.mu.Lock()
	if args.Term != rf.currentTerm {
		rf.mu.Unlock()
		return
	}

一把大锁保平安。锁的粒度可以很大。
在发送RPC的时候，一定不要持有锁。

定时器

在Raft中，初始化，所有的Server都是Follower，在Follower心跳超时的时候，会进行Leader选举。在Ticker中，会不断检查stamp时间戳，来判断是否心跳过期。

易错点

有效的RPC请求，才可以更新心跳时间。

什么是有效的心跳？

收到了有效的求票请求，并且自己把票投了出去。
收到了有效的AE RPC。
收到了有效的快照RPC（较后Lab中）。

心跳超时的选择: 为了解决选票一直被平分问题，必须采取随机超时。

选举时间 >> 心跳时间

go 复制代码

// ticker runs until the peer is killed and starts an election whenever
// this peer is not the leader and rf.stamp is older than the current
// randomized timeout.
//
// Each round sleeps for a freshly drawn timeout and then, on the next
// pass, compares rf.stamp against that same timeout.
func (rf *Raft) ticker() {
	// randomTimeout draws a timeout in milliseconds from
	// [MinElectionTime, MinElectionTime+MaxElectionTime).
	randomTimeout := func() int {
		return rand.Intn(MaxElectionTime) + MinElectionTime
	}

	timeoutMs := randomTimeout()
	DPrintf("server %v init timeout  %v", rf.me, timeoutMs)

	for !rf.killed() {
		window := time.Duration(timeoutMs) * time.Millisecond

		rf.mu.Lock()
		// Has more than the timeout elapsed since the last stamp?
		expired := time.Now().After(rf.stamp.Add(window))
		if expired && rf.state != Leader {
			DPrintf("%v 时间过期, 上一次stamp %v", rf.me, rf.stamp)
			go rf.election()
		}
		rf.mu.Unlock()

		// Pick a new random timeout and sleep for exactly that long.
		timeoutMs = randomTimeout()
		time.Sleep(time.Duration(timeoutMs) * time.Millisecond)
	}
}

Follower选举发送求票信息

If election timeout elapses without receiving AE RPC from currentLeader or granting vote to candidate：convert to candidate

go 复制代码

// election turns this peer into a candidate for the next term, votes for
// itself, persists the change, and then asks every other peer for its
// vote in parallel. The RPCs are issued after the lock is released.
func (rf *Raft) election() {
	rf.mu.Lock()
	DPrintf("Server %v Start Election", rf.me)

	rf.currentTerm++
	rf.state = Candidate
	rf.votedFor = rf.me
	rf.voteNum = 1
	// Voting for oneself also resets the election timestamp.
	rf.stamp = time.Now()
	// State changed, so persist it.
	rf.persist()

	lastSlot := len(rf.logs) - 1
	request := &RequestVoteArgs{
		Term:        rf.currentTerm,
		CandidateId: rf.me,
		// Log indices are global and ever-increasing, so the slice
		// position has to be converted.
		LastLogIndex: rf.ToVirtualIndex(lastSlot),
		LastLogTerm:  rf.logs[lastSlot].Term,
	}
	rf.mu.Unlock()

	// Solicit votes from all other peers concurrently; every goroutine
	// gets its own reply object.
	for peer := range rf.peers {
		if peer == rf.me {
			continue
		}
		go rf.collctVote(peer, request, &RequestVoteReply{})
	}
}
// collctVote sends one RequestVote RPC to server and processes the reply.
//
// The RPC is issued without holding rf.mu. Once the reply arrives it is
// dropped if the RPC failed or if the peer's term has moved on since
// args was built. A reply carrying a larger term makes this peer step
// down to follower. Otherwise a granted vote is counted, and on reaching
// a strict majority the peer becomes leader, reinitialises nextIndex and
// matchIndex, and starts the heartbeat loop.
func (rf *Raft) collctVote(server int, args *RequestVoteArgs, reply *RequestVoteReply) {
	if ok := rf.sendRequestVote(server, args, reply); !ok {
		return
	}

	rf.mu.Lock()
	defer rf.mu.Unlock()

	// Reply to a request issued in an earlier term: ignore it.
	if args.Term != rf.currentTerm {
		return
	}

	// The responder knows a newer term: adopt it and step down.
	if reply.Term > rf.currentTerm {
		rf.currentTerm = reply.Term
		rf.state = Follower
		rf.votedFor = None
		rf.voteNum = 0
		rf.persist()
		return
	}

	// Only a candidate may turn into a leader, and only granted votes count.
	if !reply.VoteGranted || rf.state != Candidate {
		return
	}

	rf.voteNum++
	if rf.voteNum <= len(rf.peers)/2 {
		return
	}

	rf.state = Leader
	DPrintf("server %v become leader Term %v logs %v", rf.me, rf.currentTerm, rf.logs)

	// Figure 2: both leader arrays are reinitialised after every election.
	// nextIndex starts at the leader's last log index + 1.
	next := rf.ToVirtualIndex(len(rf.logs))
	for i := range rf.nextIndex {
		rf.nextIndex[i] = next
	}
	// matchIndex starts at 0. Values left over from an earlier leadership
	// may describe entries that followers have since overwritten, and
	// would let this leader count replicas that do not exist.
	for i := range rf.matchIndex {
		rf.matchIndex[i] = 0
	}

	go rf.cycleAppendEntries()
}

处理求票RPC

此处有一些易错点：

无论何时，只要收到的Term更大，投票机会就会刷新；随后必须判断自己的票是否还在，votedFor就是用在此处的。
一个Candidate想要获得票成为Leader，那他必须拥有目前为止最新的日志。并且还没有投过票。

什么是最新的日志？

If votedFor is null or candidateId, and candidate's log is at least as up-to-date as receiver's log, grant vote

当最后一个日志的Term相同时，谁的日志长谁就更新，谁就获得选票；当最后一个日志的Term不相同时，Term大的更新。

go 复制代码

// HandlerRequestVote is the receiver side of the RequestVote RPC
// (labs 2A, 2B).
//
// A request from an older term is rejected. A request from a newer term
// first makes this peer adopt that term as a follower with no vote cast.
// The vote is then granted when this peer has not voted for someone else
// and the candidate's log is at least as up to date as its own: a larger
// last term wins, and on equal last terms the larger last index wins.
// reply.Term is always set to this peer's current term.
func (rf *Raft) HandlerRequestVote(args *RequestVoteArgs, reply *RequestVoteReply) {
	rf.mu.Lock()
	defer rf.mu.Unlock()

	DPrintf("Server %v 收到 Server %v 求票信息", rf.me, args.CandidateId)

	if args.Term < rf.currentTerm {
		DPrintf("Server %v  任期小 求票失败", args.CandidateId)
		reply.VoteGranted = false
		reply.Term = rf.currentTerm
		return
	}

	// All Servers: if an RPC request or response contains term
	// T > currentTerm, set currentTerm = T and convert to follower.
	if args.Term > rf.currentTerm {
		// A new term begins, so the chance to vote is renewed.
		DPrintf("存在更大的Term %v Server %v 在选举", args.Term, args.CandidateId)
		rf.votedFor = None
		rf.state = Follower
		rf.voteNum = 0
		rf.currentTerm = args.Term
		rf.persist()
	}

	grant := false
	if rf.votedFor == None || rf.votedFor == args.CandidateId {
		myLastTerm := rf.logs[len(rf.logs)-1].Term
		switch {
		case args.LastLogTerm == myLastTerm:
			// Same last term: the log reaching the higher index wins.
			grant = args.LastLogIndex >= rf.ToVirtualIndex(len(rf.logs)-1)
		case args.LastLogTerm > myLastTerm:
			grant = true
		}
	}

	if !grant {
		DPrintf("Server %v 拒绝向 %v 投票", rf.me, args.CandidateId)
		reply.VoteGranted = false
		reply.Term = rf.currentTerm
		return
	}

	rf.currentTerm = args.Term
	// The voter stays (or becomes) a follower.
	rf.state = Follower
	rf.voteNum = 0
	rf.votedFor = args.CandidateId
	reply.VoteGranted = true
	reply.Term = rf.currentTerm
	rf.persist()
	// Granting a vote counts as valid traffic, so refresh the timestamp.
	rf.stamp = time.Now()
}

周期心跳函数

注意选取心跳的时间间隔。注意不要写死心跳间隔，后续可能需要立即触发心跳的功能。

这边选择的是100毫秒一次，也即1秒10次心跳。

go 复制代码

// cycleAppendEntries is the periodic heartbeat loop run while this peer
// is leader. It exits once the peer is killed or is no longer leader.
//
// Whenever the current time passes rf.nextBeatTime, every other peer is
// sent either an InstallSnapshot (when the entry preceding its nextIndex
// lies inside the snapshot) or an AppendEntries carrying all log entries
// from nextIndex onward (possibly none). nextBeatTime is then pushed
// HeartBeatInterval milliseconds into the future. Because the deadline
// lives in rf.nextBeatTime rather than in a fixed sleep, a round can be
// triggered early by moving it back.
func (rf *Raft) cycleAppendEntries() {
	// Back-off between deadline checks. Without it the loop spins on
	// rf.mu for the whole heartbeat interval. It is kept short so that an
	// early trigger via nextBeatTime is still noticed promptly.
	const pollInterval = time.Millisecond

	// nextBeatTime is shared state, so it is initialised under the lock.
	rf.mu.Lock()
	rf.nextBeatTime = time.Now()
	rf.mu.Unlock()

	for !rf.killed() {
		rf.mu.Lock()
		// Stop sending heartbeats as soon as this peer is not the leader.
		if rf.state != Leader {
			rf.mu.Unlock()
			return
		}
		// Not due yet: release the lock and wait a little.
		if !time.Now().After(rf.nextBeatTime) {
			rf.mu.Unlock()
			time.Sleep(pollInterval)
			continue
		}

		for i := range rf.peers {
			if i == rf.me {
				continue
			}
			args := AppendEntriesArgs{
				LeaderId:     rf.me,
				Term:         rf.currentTerm,
				LeaderCommit: rf.commitIndex,
				PrevLogIndex: rf.nextIndex[i] - 1,
			}

			// The entry to send from already lives in the snapshot:
			// ship the snapshot instead of log entries.
			if args.PrevLogIndex < rf.lastIncludedIndex {
				go rf.SendInstallSnapshot(i)
				continue
			}

			// If last log index >= nextIndex for a follower, send
			// AppendEntries with log entries starting at nextIndex;
			// otherwise send an empty heartbeat.
			if rf.ToVirtualIndex(len(rf.logs)-1) > args.PrevLogIndex {
				args.Entries = rf.logs[rf.ToRealIndex(args.PrevLogIndex+1):]
				DPrintf("Server %v Send AE Args %v To %v", rf.me, args, i)
			} else {
				args.Entries = make([]LogEntry, 0)
			}
			args.PrevLogTerm = rf.logs[rf.ToRealIndex(args.PrevLogIndex)].Term
			go rf.SendAppendEntries(i, &args, &AppendEntriesReply{})
		}

		rf.nextBeatTime = time.Now().Add(time.Duration(HeartBeatInterval) * time.Millisecond)
		rf.mu.Unlock()
	}
}

// SendAppendEntries issues one AppendEntries RPC to server and handles the
// reply. The RPC itself is sent before rf.mu is acquired. Only the first
// two checks are shown here; the rest of the body is elided in this excerpt.
func(rf*Raft) SendAppendEntries(server int, args *AppendEntriesArgs, reply *AppendEntriesReply){
	ok := rf.sendAppendEntries(server,args,reply)
	if !ok {
		return
	}
	rf.mu.Lock()
	// The RPC was not sent in the current term: discard the reply.
	if rf.currentTerm != args.Term {
		DPrintf("Old RPC")
		rf.mu.Unlock()
		return
	}
	
	// All Servers: if an RPC request or response contains term T > currentTerm,
	// set currentTerm = T and convert to follower.
	if reply.Term > rf.currentTerm{
		rf.currentTerm = reply.Term
		rf.state = Follower
		rf.votedFor = None
		rf.voteNum = 0
		rf.persist()
		rf.mu.Unlock()
		return
	}
	.....

处理心跳函数

go 复制代码

// HandlerAppendEntries is the receiver side of the AppendEntries RPC. Part
// of the body is elided in this excerpt (the "...." lines); the visible
// part rejects requests from an older term, refreshes the election
// timestamp, adopts a larger term, and acknowledges the request when this
// peer is a follower.
func (rf*Raft) HandlerAppendEntries(args*AppendEntriesArgs, reply * AppendEntriesReply){
	rf.mu.Lock()
	 
	// 1. Reply false if term < currentTerm
	if args.Term < rf.currentTerm {
		DPrintf("Old Leader %v ",args.LeaderId)
		reply.Success = false
		reply.Term = rf.currentTerm
		rf.mu.Unlock()
		return
	}
	// All Servers: if an RPC request or response contains term T > currentTerm,
	// set currentTerm = T and convert to follower.
	// A valid vote or a valid AppendEntries must refresh the timestamp.
	rf.stamp = time.Now()
	if args.Term > rf.currentTerm {
		rf.currentTerm = args.Term
		rf.state = Follower
		rf.voteNum = 0
		rf.votedFor = None
		rf.persist() 
	}
	
	
	DPrintf("Server %v Term：%v\n",rf.me,rf.currentTerm)
....
....
	// Only a follower accepts this RPC.
	// NOTE(review): in the visible code a peer that is not a Follower here
	// (e.g. a Candidate whose term equals args.Term) falls through without
	// filling in reply or stepping down; the elided section may handle
	// that case — confirm.
	if rf.state == Follower {
		reply.Success = true
		reply.Term = rf.currentTerm
		rf.mu.Unlock()
		return
	}
	rf.mu.Unlock()
}

关于测试点

测试程序中，会调用GetState函数，来获取IsLeader，Term。调用过程中记得加锁。

go 复制代码

// GetState returns currentTerm and whether this server believes it is
// the leader (lab 2A). Both values are read while holding rf.mu.
func (rf *Raft) GetState() (int, bool) {
	rf.mu.Lock()
	defer rf.mu.Unlock()
	return rf.currentTerm, rf.state == Leader
}

在2A中，有三个测试。在测试过程中，需要注意是否有Warn警告。个人建议，还是采用脚本的方式，将日志写入文件中。多线程环境下如何debug也是一大难点。

shell 复制代码

rm outA
go test -run 2A > outA

例如：

yaml 复制代码

warning: term changed even though there were no failures //检查是否正确发送Or接收心跳

测试结果：

shell 复制代码

Test (2A): initial election ...
  ... Passed --   3.0  3   60   16270    0
Test (2A): election after network failure ...
  ... Passed --   5.0  3  150   27808    0
Test (2A): multiple elections ...
  ... Passed --   5.8  7  600  119184    0
PASS
ok  	6.5840/raft	13.832s