1.绪论
DLedger的选举主要由DLedgerLeaderElector这个组件来实现,接下来我们将仔细剖析这个组件。
2. 选举组件-DLedgerLeaderElector
2.1 组成
java
/**
 * Leader-election component (excerpt showing only its fields).
 * The fields fall into three groups: collaborators (config, member state, RPC service),
 * heartbeat bookkeeping used on the receiving/leader side, and vote bookkeeping used
 * while this node is a candidate.
 */
public class DLedgerLeaderElector {
private static Logger logger = LoggerFactory.getLogger(DLedgerLeaderElector.class);
private Random random = new Random();
//DLedger configuration
private DLedgerConfig dLedgerConfig;
//State/metadata of this node
private final MemberState memberState;
//Component used to send network requests to peers
private DLedgerRpcService dLedgerRpcService;
//as a server handler
//record the last leader state
//Heartbeat timestamps (ms); -1 means "not set yet"
private volatile long lastLeaderHeartBeatTime = -1;
private volatile long lastSendHeartBeatTime = -1;
private volatile long lastSuccHeartBeatTime = -1;
//Heartbeat interval in ms (initially 2000)
private int heartBeatTimeIntervalMs = 2000;
//Multiplier of the heartbeat interval used for heartbeat timeouts (initially 3)
private int maxHeartBeatLeak = 3;
//as a client
//Timestamp (ms) of the next vote request; -1 means "not set yet"
private long nextTimeToRequestVote = -1;
//Whether the term must be increased immediately
private volatile boolean needIncreaseTermImmediately = false;
//Vote interval bounds in ms
//NOTE(review): presumably the bounds of a randomized delay between vote rounds - confirm where they are read
private int minVoteIntervalMs = 300;
private int maxVoteIntervalMs = 1000;
private List<RoleChangeHandler> roleChangeHandlers = new ArrayList<>();
//Outcome of the previous vote round (e.g. passed, or another round is needed)
private VoteResponse.ParseResult lastParseResult = VoteResponse.ParseResult.WAIT_TO_REVOTE;
private long lastVoteCost = 0L;
//Worker that maintains the role state of this node
private StateMaintainer stateMaintainer = new StateMaintainer("StateMaintainer", logger);
//Task related to taking over leadership
//NOTE(review): TakeLeadershipTask is not shown here; presumably an asynchronous leadership-transfer task - confirm
private final TakeLeadershipTask takeLeadershipTask = new TakeLeadershipTask();
}
2.2 leader与follower的选举步骤
在某个节点当选leader之后,会定时向follower发送心跳包,巩固自己的leader地位。当follower超过一定时间未收到leader的心跳包时,便会把自己的状态改为candidate,并开始选举。
2.2.1.StateMaintainer总览
java
/**
 * Worker of the enclosing elector: each doWork() pass refreshes the timing settings
 * and runs the role-specific maintenance routine, then sleeps 10 ms.
 * NOTE(review): doWork() is presumably invoked in a loop by ShutdownAbleThread - confirm.
 */
public class StateMaintainer extends ShutdownAbleThread {
public StateMaintainer(String name, Logger logger) {
super(name, logger);
}
@Override
public void doWork() {
try {
//Only maintain state when leader election is enabled in the configuration
if (DLedgerLeaderElector.this.dLedgerConfig.isEnableLeaderElector()) {
//Refresh the timing settings (heartbeat interval, min/max vote interval, ...) from the config
DLedgerLeaderElector.this.refreshIntervals(dLedgerConfig);
//Run the maintenance routine for the current role
DLedgerLeaderElector.this.maintainState();
}
sleep(10);
} catch (Throwable t) {
//Any failure is logged and swallowed so that one bad pass does not propagate out of doWork()
DLedgerLeaderElector.logger.error("Error in heartbeat", t);
}
}
}
java
/**
 * Dispatches to the maintenance routine that matches the node's current role.
 * A node that is neither leader nor follower is handled as a candidate.
 */
private void maintainState() throws Exception {
    if (memberState.isLeader()) {
        maintainAsLeader();
        return;
    }
    if (memberState.isFollower()) {
        maintainAsFollower();
        return;
    }
    maintainAsCandidate();
}
可以看出StateMaintainer是一个线程,并且一直在循环,根据不同角色做出不同的处理。
2.2.2. leader发送心跳包
java
/**
 * Leader-side maintenance step: starts a new heartbeat round once more than
 * {@code heartBeatTimeIntervalMs} has elapsed since the previous round was sent.
 */
private void maintainAsLeader() throws Exception {
    // The next heartbeat round is not due yet.
    if (DLedgerUtils.elapsed(lastSendHeartBeatTime) <= heartBeatTimeIntervalMs) {
        return;
    }
    final long heartbeatTerm;
    final String heartbeatLeaderId;
    synchronized (memberState) {
        // Re-check the role under the lock; a node that is no longer leader stops sending.
        if (!memberState.isLeader()) {
            return;
        }
        // Snapshot term and leader id under the lock and stamp the send time.
        heartbeatTerm = memberState.currTerm();
        heartbeatLeaderId = memberState.getLeaderId();
        lastSendHeartBeatTime = System.currentTimeMillis();
    }
    // The heartbeats themselves are sent outside the lock.
    sendHeartbeats(heartbeatTerm, heartbeatLeaderId);
}
java
/**
 * Sends one heartbeat round to every peer except this node, waits up to one heartbeat
 * interval for the replies, and then either records a successful round or decides
 * whether this node has to step down to candidate.
 *
 * @param term     term carried in the heartbeat requests
 * @param leaderId leader id carried in the heartbeat requests
 * @throws Exception if waiting on the latch is interrupted or building/sending a request fails
 */
private void sendHeartbeats(long term, String leaderId) throws Exception {
//Number of responses handled (incremented in the finally block below); starts at 1, presumably to count this node itself
final AtomicInteger allNum = new AtomicInteger(1);
//Number of SUCCESS responses; starts at 1, presumably to count this node itself
final AtomicInteger succNum = new AtomicInteger(1);
//Number of TERM_NOT_READY responses (peers that are not ready yet, not failures)
final AtomicInteger notReadyNum = new AtomicInteger(0);
//Term reported by a peer that answered EXPIRED_TERM; stays -1 if there was none
final AtomicLong maxTerm = new AtomicLong(-1);
//Set once any peer answers INCONSISTENT_LEADER
final AtomicBoolean inconsistLeader = new AtomicBoolean(false);
//Released when a quorum is reached or when all responses have been handled
final CountDownLatch beatLatch = new CountDownLatch(1);
long startHeartbeatTimeMs = System.currentTimeMillis();
//Iterate over all peers in the peer map
for (String id : memberState.getPeerMap().keySet()) {
//Do not send a heartbeat to this node itself
if (memberState.getSelfId().equals(id)) {
continue;
}
//Build the heartbeat request
HeartBeatRequest heartBeatRequest = new HeartBeatRequest();
heartBeatRequest.setGroup(memberState.getGroup());
heartBeatRequest.setLocalId(memberState.getSelfId());
//Target peer id, leader id and the term of this round
heartBeatRequest.setRemoteId(id);
heartBeatRequest.setLeaderId(leaderId);
heartBeatRequest.setTerm(term);
//Send the heartbeat; the reply is handled asynchronously in the callback
CompletableFuture<HeartBeatResponse> future = dLedgerRpcService.heartBeat(heartBeatRequest);
future.whenComplete((HeartBeatResponse x, Throwable ex) -> {
try {
if (ex != null) {
//The call failed: mark the peer as not alive, then rethrow so the catch below logs it
memberState.getPeersLiveTable().put(id, Boolean.FALSE);
throw ex;
}
switch (DLedgerResponseCode.valueOf(x.getCode())) {
//Success: count it
case SUCCESS:
succNum.incrementAndGet();
break;
//The peer is already in a newer term (e.g. this leader was disconnected while the others elected a new leader); remember the peer's term
//NOTE(review): set() overwrites instead of keeping the maximum, so with several EXPIRED_TERM replies the last one wins
case EXPIRED_TERM:
maxTerm.set(x.getTerm());
break;
//Defensive case that should not occur normally: the peer reports a different leader, i.e. more than one leader exists
case INCONSISTENT_LEADER:
inconsistLeader.compareAndSet(false, true);
break;
//The peer is not ready for this term yet
case TERM_NOT_READY:
notReadyNum.incrementAndGet();
break;
default:
break;
}
//A NETWORK_ERROR code marks the peer as not alive; any other code marks it as alive
if (x.getCode() == DLedgerResponseCode.NETWORK_ERROR.getCode())
memberState.getPeersLiveTable().put(id, Boolean.FALSE);
else
memberState.getPeersLiveTable().put(id, Boolean.TRUE);
//Release the waiting thread early once SUCCESS, or SUCCESS plus TERM_NOT_READY, reaches a quorum
if (memberState.isQuorum(succNum.get())
|| memberState.isQuorum(succNum.get() + notReadyNum.get())) {
beatLatch.countDown();
}
} catch (Throwable t) {
logger.error("heartbeat response failed", t);
} finally {
//Count every handled response, failed or not, and release the latch once the count equals peerSize()
allNum.incrementAndGet();
if (allNum.get() == memberState.peerSize()) {
beatLatch.countDown();
}
}
});
}
//Wait for the latch for at most one heartbeat interval
beatLatch.await(heartBeatTimeIntervalMs, TimeUnit.MILLISECONDS);
//A quorum of SUCCESS responses makes the round successful: record the time
if (memberState.isQuorum(succNum.get())) {
lastSuccHeartBeatTime = System.currentTimeMillis();
} else {
logger.info("[{}] Parse heartbeat responses in cost={} term={} allNum={} succNum={} notReadyNum={} inconsistLeader={} maxTerm={} peerSize={} lastSuccHeartBeatTime={}",
memberState.getSelfId(), DLedgerUtils.elapsed(startHeartbeatTimeMs), term, allNum.get(), succNum.get(), notReadyNum.get(), inconsistLeader.get(), maxTerm.get(), memberState.peerSize(), new Timestamp(lastSuccHeartBeatTime));
//Quorum only when not-ready peers are included: reset the send time, presumably so the next round is sent without waiting
if (memberState.isQuorum(succNum.get() + notReadyNum.get())) {
lastSendHeartBeatTime = -1;
} else if (maxTerm.get() > term) {
//A peer reported a larger term: step down to candidate with that term
changeRoleToCandidate(maxTerm.get());
//(next branch) another leader was reported: step down to candidate in the same term
} else if (inconsistLeader.get()) {
changeRoleToCandidate(term);
//(next branch) no successful round for longer than maxHeartBeatLeak * heartBeatTimeIntervalMs: step down and re-elect
} else if (DLedgerUtils.elapsed(lastSuccHeartBeatTime) > maxHeartBeatLeak * heartBeatTimeIntervalMs) {
changeRoleToCandidate(term);
}
}
}
可以看出,其实就是leader封装心跳包,每2s发送给follower;如果遇到如下3种情况,会更新自己为candidate重新选举:
1.如果已经有其他节点的term大于自己,便更改自己为candidate:这说明自己曾经掉线重连,此时集群中已经有新的leader。
2.如果发生脑裂,设置自己为候选人,该场景一般不会发生,做兜底。
3.如果距离上一次心跳成功(得到半数以上节点确认)已超过 maxHeartBeatLeak * heartBeatTimeIntervalMs(默认 3 * 2s = 6s),便会更新自己为candidate重新选举。
2.2.3.follower接受心跳包
其实就是根据leader发送过来的心跳包做逻辑判断,如果自己没有leader,便更新自己的leader为心跳包中的leader。如果心跳包无异常,刷新心跳时钟。
java
/**
 * Handles a heartbeat request on the receiving node.
 * A matching term and leader is answered on a lock-free fast path; every other case is
 * re-evaluated while holding the lock on memberState.
 *
 * @param request heartbeat request carrying the sender's term and leader id
 * @return an already-completed future holding the response (default response on success,
 *         otherwise a response with an error code)
 */
public CompletableFuture<HeartBeatResponse> handleHeartBeat(HeartBeatRequest request) throws Exception {
//Reject a leader id that is not a known member of the group
if (!memberState.isPeerMember(request.getLeaderId())) {
logger.warn("[BUG] [HandleHeartBeat] remoteId={} is an unknown member", request.getLeaderId());
return CompletableFuture.completedFuture(new HeartBeatResponse().term(memberState.currTerm()).code(DLedgerResponseCode.UNKNOWN_MEMBER.getCode()));
}
//Reject a heartbeat whose leader id is this node itself
if (memberState.getSelfId().equals(request.getLeaderId())) {
logger.warn("[BUG] [HandleHeartBeat] selfId={} but remoteId={}", memberState.getSelfId(), request.getLeaderId());
return CompletableFuture.completedFuture(new HeartBeatResponse().term(memberState.currTerm()).code(DLedgerResponseCode.UNEXPECTED_MEMBER.getCode()));
}
//A request term smaller than the current term is answered with EXPIRED_TERM
if (request.getTerm() < memberState.currTerm()) {
return CompletableFuture.completedFuture(new HeartBeatResponse().term(memberState.currTerm()).code(DLedgerResponseCode.EXPIRED_TERM.getCode()));
} else if (request.getTerm() == memberState.currTerm()) {
if (request.getLeaderId().equals(memberState.getLeaderId())) {
//Fast path: same term and same leader, so record the time this heartbeat was received
lastLeaderHeartBeatTime = System.currentTimeMillis();
return CompletableFuture.completedFuture(new HeartBeatResponse());
}
}
//abnormal case
//hold the lock to get the latest term and leaderId
synchronized (memberState) {
//Re-check under the lock: a smaller request term is answered with EXPIRED_TERM
if (request.getTerm() < memberState.currTerm()) {
return CompletableFuture.completedFuture(new HeartBeatResponse().term(memberState.currTerm()).code(DLedgerResponseCode.EXPIRED_TERM.getCode()));
} else if (request.getTerm() == memberState.currTerm()) {
//No leader known yet: become a follower of the leader named in the heartbeat
if (memberState.getLeaderId() == null) {
changeRoleToFollower(request.getTerm(), request.getLeaderId());
return CompletableFuture.completedFuture(new HeartBeatResponse());
} else if (request.getLeaderId().equals(memberState.getLeaderId())) {
//Same leader: record the time this heartbeat was received
lastLeaderHeartBeatTime = System.currentTimeMillis();
return CompletableFuture.completedFuture(new HeartBeatResponse());
} else {
//this should not happen, but if happened
logger.error("[{}][BUG] currTerm {} has leader {}, but received leader {}", memberState.getSelfId(), memberState.currTerm(), memberState.getLeaderId(), request.getLeaderId());
return CompletableFuture.completedFuture(new HeartBeatResponse().code(DLedgerResponseCode.INCONSISTENT_LEADER.getCode()));
}
} else {
//To make it simple, for larger term, do not change to follower immediately
//first change to candidate, and notify the state-maintainer thread
changeRoleToCandidate(request.getTerm());
needIncreaseTermImmediately = true;
//TODO notify
return CompletableFuture.completedFuture(new HeartBeatResponse().code(DLedgerResponseCode.TERM_NOT_READY.getCode()));
}
}
}
2.2.4.follower心跳时钟超过最大心跳时间,更改自己为candidate
follower接收心跳包;如果超过 maxHeartBeatLeak * heartBeatTimeIntervalMs(默认 3 * 2000ms = 6s)仍未收到心跳包,便会切换自己为candidate进行选举。
java
/**
 * Follower-side maintenance step: turns this node into a candidate when no leader
 * heartbeat has been recorded for too long.
 * A cheap unlocked check (two heartbeat intervals) gates the locked check
 * ({@code maxHeartBeatLeak} heartbeat intervals) that actually triggers the role change.
 */
private void maintainAsFollower() {
    // Cheap pre-check without the lock: the last heartbeat is still recent enough.
    if (DLedgerUtils.elapsed(lastLeaderHeartBeatTime) <= 2 * heartBeatTimeIntervalMs) {
        return;
    }
    synchronized (memberState) {
        // Still a follower, and the heartbeat timeout has really expired?
        boolean heartBeatTimedOut = memberState.isFollower()
            && DLedgerUtils.elapsed(lastLeaderHeartBeatTime) > maxHeartBeatLeak * heartBeatTimeIntervalMs;
        if (!heartBeatTimedOut) {
            return;
        }
        logger.info("[{}][HeartBeatTimeOut] lastLeaderHeartBeatTime: {} heartBeatTimeIntervalMs: {} lastLeader={}", memberState.getSelfId(), new Timestamp(lastLeaderHeartBeatTime), heartBeatTimeIntervalMs, memberState.getLeaderId());
        // Become a candidate in the current term.
        changeRoleToCandidate(memberState.currTerm());
    }
}
java
/**
 * Switches this node to the candidate role for the given term.
 * The request is skipped (and logged) when the given term is older than the current term.
 *
 * @param term term in which this node should become a candidate
 */
public void changeRoleToCandidate(long term) {
    synchronized (memberState) {
        // A stale term must not override the current one.
        if (term < memberState.currTerm()) {
            logger.info("[{}] skip to be candidate in term: {}, but currTerm: {}", memberState.getSelfId(), term, memberState.currTerm());
            return;
        }
        // Change the role first, then run the role-change handling.
        memberState.changeToCandidate(term);
        handleRoleChange(term, MemberState.Role.CANDIDATE);
        logger.info("[{}] [ChangeRoleToCandidate] from term: {} and currTerm: {}", memberState.getSelfId(), term, memberState.currTerm());
    }
}
其实就是改变了自己的状态为candidate,并且调用RoleChangeHandler这个钩子函数,RoleChangeHandler其实是提供给客户端的一个钩子,客户端可以实现它来做角色变更后的一些后置处理。
2.2.5. 节点检测到自己角色切换为candidate,开始选举
StateMaintainer是一个无限循环,当检测到自己的角色切换为candidate过后,会调用maintainAsCandidate方法。
java
/**
 * Dispatches to the maintenance routine that matches the node's current role.
 * A node that is neither leader nor follower is handled as a candidate.
 */
private void maintainState() throws Exception {
    if (memberState.isLeader()) {
        maintainAsLeader();
        return;
    }
    if (memberState.isFollower()) {
        maintainAsFollower();
        return;
    }
    maintainAsCandidate();
}
java
/**
 * Candidate-side maintenance step: runs one vote round.
 * It picks the term to vote in, requests votes, tallies the responses while waiting on a
 * latch, derives a parse result from the tallies, and becomes leader when the round passed.
 */
private void maintainAsCandidate() throws Exception {
//for candidate
//Nothing to do while the next vote time has not been reached and no immediate term increase is requested
if (System.currentTimeMillis() < nextTimeToRequestVote && !needIncreaseTermImmediately) {
return;
}
//Term to vote in, plus end term and end index of the local ledger
long term;
long ledgerEndTerm;
long ledgerEndIndex;
//Only a candidate runs a vote round (checked again under the lock below)
if (!memberState.isCandidate()) {
return;
}
synchronized (memberState) {
if (!memberState.isCandidate()) {
return;
}
//Move to the next term when the previous round ended in WAIT_TO_VOTE_NEXT or a term increase was requested
if (lastParseResult == VoteResponse.ParseResult.WAIT_TO_VOTE_NEXT || needIncreaseTermImmediately) {
long prevTerm = memberState.currTerm();
//Advance to the next term
term = memberState.nextTerm();
logger.info("{}_[INCREASE_TERM] from {} to {}", memberState.getSelfId(), prevTerm, term);
lastParseResult = VoteResponse.ParseResult.WAIT_TO_REVOTE;
} else {
//Otherwise vote again in the current term
term = memberState.currTerm();
}
//Snapshot the end index and end term of the local ledger
ledgerEndIndex = memberState.getLedgerEndIndex();
ledgerEndTerm = memberState.getLedgerEndTerm();
}
//A requested term increase is consumed here: schedule the next vote and return without voting in this pass
if (needIncreaseTermImmediately) {
nextTimeToRequestVote = getNextTimeToRequestVote();
needIncreaseTermImmediately = false;
return;
}
long startVoteTimeMs = System.currentTimeMillis();
//Futures for the vote responses
final List<CompletableFuture<VoteResponse>> quorumVoteResponses = voteForQuorumResponses(term, ledgerEndTerm, ledgerEndIndex);
//Largest term seen in the group so far; starts at the term of this round
final AtomicLong knownMaxTermInGroup = new AtomicLong(term);
//Number of vote responses handled, failed or not
final AtomicInteger allNum = new AtomicInteger(0);
//Number of responses whose vote result is not UNKNOWN
final AtomicInteger validNum = new AtomicInteger(0);
//Number of ACCEPT responses
final AtomicInteger acceptedNum = new AtomicInteger(0);
//Number of REJECT_TERM_NOT_READY responses
final AtomicInteger notReadyTermNum = new AtomicInteger(0);
//Number of REJECT_EXPIRED_LEDGER_TERM / REJECT_SMALL_LEDGER_END_INDEX responses (peers counted as having a bigger ledger)
final AtomicInteger biggerLedgerNum = new AtomicInteger(0);
//Whether any peer answered REJECT_ALREADY_HAS_LEADER (a flag, not a count)
final AtomicBoolean alreadyHasLeader = new AtomicBoolean(false);
//Released when the outcome is already decided or when all responses have been handled
CountDownLatch voteLatch = new CountDownLatch(1);
for (CompletableFuture<VoteResponse> future : quorumVoteResponses) {
future.whenComplete((VoteResponse x, Throwable ex) -> {
try {
if (ex != null) {
//Rethrow so the catch below logs the failure
throw ex;
}
logger.info("[{}][GetVoteResponse] {}", memberState.getSelfId(), JSON.toJSONString(x));
if (x.getVoteResult() != VoteResponse.RESULT.UNKNOWN) {
validNum.incrementAndGet();
}
synchronized (knownMaxTermInGroup) {
//Tally the response by its vote result
switch (x.getVoteResult()) {
case ACCEPT:
acceptedNum.incrementAndGet();
break;
case REJECT_ALREADY_HAS_LEADER:
alreadyHasLeader.compareAndSet(false, true);
break;
case REJECT_TERM_SMALL_THAN_LEDGER:
case REJECT_EXPIRED_VOTE_TERM:
//Keep the largest term reported by any peer
if (x.getTerm() > knownMaxTermInGroup.get()) {
knownMaxTermInGroup.set(x.getTerm());
}
break;
case REJECT_EXPIRED_LEDGER_TERM:
case REJECT_SMALL_LEDGER_END_INDEX:
biggerLedgerNum.incrementAndGet();
break;
case REJECT_TERM_NOT_READY:
notReadyTermNum.incrementAndGet();
break;
case REJECT_ALREADY_VOTED:
case REJECT_TAKING_LEADERSHIP:
default:
break;
}
}
//Stop waiting as soon as some peer already has a leader, a quorum accepted, or accepted plus not-ready reaches a quorum
if (alreadyHasLeader.get()
|| memberState.isQuorum(acceptedNum.get())
|| memberState.isQuorum(acceptedNum.get() + notReadyTermNum.get())) {
voteLatch.countDown();
}
} catch (Throwable t) {
logger.error("vote response failed", t);
} finally {
//Count every handled response and release the latch once the count equals peerSize()
allNum.incrementAndGet();
if (allNum.get() == memberState.peerSize()) {
voteLatch.countDown();
}
}
});
}
try {
//Wait for the latch for at most 2000 ms plus a random extra below maxVoteIntervalMs
voteLatch.await(2000 + random.nextInt(maxVoteIntervalMs), TimeUnit.MILLISECONDS);
} catch (Throwable ignore) {
}
lastVoteCost = DLedgerUtils.elapsed(startVoteTimeMs);
VoteResponse.ParseResult parseResult;
//Important: derive the outcome of this round; the branches are checked in this priority order
if (knownMaxTermInGroup.get() > term) {
//A peer reported a larger term: adopt it as candidate and vote in the next term later
parseResult = VoteResponse.ParseResult.WAIT_TO_VOTE_NEXT;
nextTimeToRequestVote = getNextTimeToRequestVote();
changeRoleToCandidate(knownMaxTermInGroup.get());
} else if (alreadyHasLeader.get()) {
//Some peer already has a leader: revote later, delayed by an extra heartBeatTimeIntervalMs * maxHeartBeatLeak
parseResult = VoteResponse.ParseResult.WAIT_TO_REVOTE;
nextTimeToRequestVote = getNextTimeToRequestVote() + heartBeatTimeIntervalMs * maxHeartBeatLeak;
} else if (!memberState.isQuorum(validNum.get())) {
//The valid responses do not reach a quorum: revote later
parseResult = VoteResponse.ParseResult.WAIT_TO_REVOTE;
nextTimeToRequestVote = getNextTimeToRequestVote();
} else if (!memberState.isQuorum(validNum.get() - biggerLedgerNum.get())) {
//Without the bigger-ledger peers the valid responses do not reach a quorum: revote later, delayed by an extra maxVoteIntervalMs
parseResult = VoteResponse.ParseResult.WAIT_TO_REVOTE;
nextTimeToRequestVote = getNextTimeToRequestVote() + maxVoteIntervalMs;
} else if (memberState.isQuorum(acceptedNum.get())) {
//A quorum accepted: the vote passed
parseResult = VoteResponse.ParseResult.PASSED;
} else if (memberState.isQuorum(acceptedNum.get() + notReadyTermNum.get())) {
//Accepted plus not-ready reaches a quorum: revote immediately
parseResult = VoteResponse.ParseResult.REVOTE_IMMEDIATELY;
} else {
//Anything else: wait and vote in the next term
parseResult = VoteResponse.ParseResult.WAIT_TO_VOTE_NEXT;
nextTimeToRequestVote = getNextTimeToRequestVote();
}
lastParseResult = parseResult;
logger.info("[{}] [PARSE_VOTE_RESULT] cost={} term={} memberNum={} allNum={} acceptedNum={} notReadyTermNum={} biggerLedgerNum={} alreadyHasLeader={} maxTerm={} result={}",
memberState.getSelfId(), lastVoteCost, term, memberState.peerSize(), allNum, acceptedNum, notReadyTermNum, biggerLedgerNum, alreadyHasLeader, knownMaxTermInGroup.get(), parseResult);
//The vote passed: change this node's role to leader
if (parseResult == VoteResponse.ParseResult.PASSED) {
logger.info("[{}] [VOTE_RESULT] has been elected to be the leader in term {}", memberState.getSelfId(), term);
changeRoleToLeader(term);
}
}
2.2.6.如果选举成功
如果超过半数以上follower支持自己便选举成功。改变自己的角色为leader。
java
/**
 * Switches this node to the leader role for the given term.
 * The request is skipped (and logged as a warning) unless the given term equals the
 * current term.
 *
 * @param term term in which this node won the vote
 */
public void changeRoleToLeader(long term) {
    synchronized (memberState) {
        // Only the term that is still current may produce a leader.
        if (memberState.currTerm() != term) {
            logger.warn("[{}] skip to be the leader in term: {}, but currTerm is: {}", memberState.getSelfId(), term, memberState.currTerm());
            return;
        }
        memberState.changeToLeader(term);
        // Reset the heartbeat send time for the new leader.
        lastSendHeartBeatTime = -1;
        // Run the role-change handling for the new LEADER role.
        handleRoleChange(term, MemberState.Role.LEADER);
        logger.info("[{}] [ChangeRoleToLeader] from term: {} and currTerm: {}", memberState.getSelfId(), term, memberState.currTerm());
    }
}
至此leader便选举完成。然后开始给follower发送心跳包和一致性检查。开启新leader的纪元。