本文详细分析一下zookeeper的数据存储。

ZKDatabase

维护zookeeper服务器内存数据库，包括session、dataTree和committedlog数据，从磁盘读取日志和快照后启动。

关键字段

java 复制代码

// 数据节点树
protected DataTree dataTree;
protected ConcurrentHashMap<Long, Integer> sessionsWithTimeouts;
protected FileTxnSnapLog snapLog; // 用于操作底层数据文件
// committedLog中第一条和最后一条数据的zxid
protected long minCommittedLog, maxCommittedLog;
// committedLog最大容量，默认500
public int commitLogCount;
// 维护最后提交的请求集，可用于快速follower同步
protected Queue<Proposal> committedLog = new ArrayDeque<>();

protected ReentrantReadWriteLock logLock = new ReentrantReadWriteLock();
private volatile boolean initialized = false;

// txnlog计数
private AtomicInteger txnCount = new AtomicInteger(0);

构造方法

java 复制代码

public ZKDatabase(FileTxnSnapLog snapLog) {
    dataTree = createDataTree();
    sessionsWithTimeouts = new ConcurrentHashMap<>();
    this.snapLog = snapLog;

    // 初始化snapshotSizeFactor默认0.33
    // 初始化commitLogCount默认500
}

public DataTree createDataTree() {
    return new DataTree();
}

创建DataTree对象：创建/zookeeper/quota、/zookeeper/config节点，创建dataWatches和childWatches对象(使用WatchManager实现类)。

主要方法

java 复制代码

// 返回committedLog集
public synchronized Collection<Proposal> getCommittedLog();
// 返回dataTree.lastProcessedZxid的值
public long getDataTreeLastProcessedZxid();
// 返回dataTree.getSessions()集
public Collection<Long> getSessions();
// 返回sessionsWithTimeouts的size
public long getSessionCount();
// 从磁盘加载dataTree并把txnLog加载到committedLog中
public long loadDataBase() throws IOException;
// 从磁盘加载txnLog到committedLog中
public long fastForwardDataBase() throws IOException;
// 使用addCommittedProposal方法添加committedLog
private void addCommittedProposal(TxnHeader hdr, Record txn, TxnDigest digest);
// 添加committedLog
public void addCommittedProposal(Request request);
// 从txnLog加载Proposal
public Iterator<Proposal> getProposalsFromTxnLog(long startZxid, long sizeLimit);
// 使用dataTree.removeCnxn(cnxn)
public void removeCnxn(ServerCnxn cnxn);
// 使用dataTree.killSession(sessionId, zxid)
public void killSession(long sessionId, long zxid);
// 使用dataTree.dumpEphemerals(pwriter)
public void dumpEphemerals(PrintWriter pwriter);
// 使用dataTree.getEphemerals()
public Map<Long, Set<String>> getEphemerals();
// 使用dataTree.getNodeCount()
public int getNodeCount();
// 使用dataTree.getEphemerals(sessionId)
public Set<String> getEphemerals(long sessionId);
// 给dataTree.lastProcessedZxid赋值
public void setlastProcessedZxid(long zxid);
// 使用dataTree.processTxn(hdr, txn, digest)
public ProcessTxnResult processTxn(TxnHeader hdr, Record txn, TxnDigest digest);
// 使用dataTree.statNode(path, serverCnxn)
public Stat statNode(String path, ServerCnxn serverCnxn) throws KeeperException.NoNodeException;
// 使用dataTree.getNode(path)
public DataNode getNode(String path);
// 使用dataTree.getData(path, stat, watcher)
public byte[] getData(String path, Stat stat, Watcher watcher) throws KeeperException.NoNodeException;
// 使用dataTree.setWatches方法实现
public void setWatches(long relativeZxid, List<String> dataWatches,
                       List<String> existWatches, List<String> childWatches,
                       List<String> persistentWatches, List<String> persistentRecursiveWatches,
                       Watcher watcher);
// 使用dataTree.addWatch(basePath, watcher, mode)
public void addWatch(String basePath, Watcher watcher, int mode);
// 使用dataTree.getChildren(path, stat, watcher)
public List<String> getChildren(
    String path, Stat stat, Watcher watcher) throws KeeperException.NoNodeException;
// 使用dataTree.getAllChildrenNumber(path)
public int getAllChildrenNumber(String path) throws KeeperException.NoNodeException;
// Truncate the ZKDatabase to the specified zxid
public boolean truncateLog(long zxid) throws IOException;
// Deserialize a snapshot from an input archive
public void deserializeSnapshot(InputArchive ia) throws IOException;
// Deserialize a snapshot that contains FileHeader from an input archive
// It is used by the admin restore command
public void deserializeSnapshot(final InputArchive ia, final CheckedInputStream is) throws IOException;
// Serialize the snapshot
public void serializeSnapshot(OutputArchive oa) throws IOException, InterruptedException;
// 使用snapLog.append(si)保存数据，txnCount++
public boolean append(Request si) throws IOException;
// 使用snapLog.rollLog()滚动底层txnLog
public void rollLog() throws IOException;
// 使用snapLog.commit()提交底层txnLog
public void commit() throws IOException;
// 初始化/zookeeper/config数据，集群启动时已介绍
public synchronized void initConfigInZKDatabase(QuorumVerifier qv);
// 使用dataTree.containsWatcher(path, type, watcher)
public boolean containsWatcher(String path, WatcherType type, Watcher watcher);
// 使用dataTree.removeWatch(path, type, watcher)
public boolean removeWatch(String path, WatcherType type, Watcher watcher);

loadDataBase方法

从磁盘加载dataTree并把txnLog加载到committedLog中：

java 复制代码

public long loadDataBase() throws IOException {
    long startTime = Time.currentElapsedTime();
    // 1. 从snapshot加载dataTree
    // 2. 使用fastForwardFromEdits方法从txnLog加载dataTree和committedlog
    long zxid = snapLog.restore(dataTree, sessionsWithTimeouts, commitProposalPlaybackListener);
    initialized = true;
    // 略
    return zxid;
}

fastForwardDataBase方法

从txnLog加载dataTree和committedlog集：

java 复制代码

public long fastForwardDataBase() throws IOException {
    // 会通过commitProposalPlaybackListener调用addCommittedProposal添加committedlog
    long zxid = snapLog.fastForwardFromEdits(
        dataTree, sessionsWithTimeouts, commitProposalPlaybackListener);
    initialized = true;
    return zxid;
}

addCommittedProposal方法

java 复制代码

private void addCommittedProposal(TxnHeader hdr, Record txn, TxnDigest digest) {
    Request r = new Request(0, hdr.getCxid(), hdr.getType(), hdr, txn, hdr.getZxid());
    r.setTxnDigest(digest);
    addCommittedProposal(r);
}

public void addCommittedProposal(Request request) {
    WriteLock wl = logLock.writeLock();
    try {
        wl.lock();
        if (committedLog.size() > commitLogCount) {
            committedLog.remove();
            minCommittedLog = committedLog.peek().packet.getZxid();
        }
        if (committedLog.isEmpty()) {
            minCommittedLog = request.zxid;
            maxCommittedLog = request.zxid;
        }
        byte[] data = request.getSerializeData();
        QuorumPacket pp = new QuorumPacket(Leader.PROPOSAL, request.zxid, data, null);
        Proposal p = new Proposal();
        p.packet = pp;
        p.request = request;
        committedLog.add(p);
        maxCommittedLog = p.packet.getZxid();
    } finally {
        wl.unlock();
    }
}

getProposalsFromTxnLog方法

从txnlog获取Proposal，只填充packet字段：

java 复制代码

public Iterator<Proposal> getProposalsFromTxnLog(long startZxid, long sizeLimit) {
    if (sizeLimit < 0) {
        return TxnLogProposalIterator.EMPTY_ITERATOR;
    }

    TxnIterator itr = null;
    try {
        // 从txnLog文件读取数据
        // 底层通过FileTxnIterator类读取文件流实现
        itr = snapLog.readTxnLog(startZxid, false);

        // If we cannot guarantee that this is strictly the starting txn
        // after a given zxid, we should fail.
        if ((itr.getHeader() != null) && (itr.getHeader().getZxid() > startZxid)) {
            itr.close();
            return TxnLogProposalIterator.EMPTY_ITERATOR;
        }

        if (sizeLimit > 0) {
            long txnSize = itr.getStorageSize();
            if (txnSize > sizeLimit) {
                itr.close();
                return TxnLogProposalIterator.EMPTY_ITERATOR;
            }
        }
    } catch (IOException e) {
        itr.close();
        return TxnLogProposalIterator.EMPTY_ITERATOR;
    }
    return new TxnLogProposalIterator(itr);
}

truncateLog方法

把txnlog数据truncate到指定的zxid位置，然后重新加载DataTree数据：

java 复制代码

public boolean truncateLog(long zxid) throws IOException {
    clear();

    // truncate the log
    boolean truncated = snapLog.truncateLog(zxid);

    if (!truncated) {
        return false;
    }

    loadDataBase();
    return true;
}

deserializeSnapshot方法

java 复制代码

public void deserializeSnapshot(InputArchive ia) throws IOException {
    clear();
    SerializeUtils.deserializeSnapshot(getDataTree(), ia, getSessionWithTimeOuts());
    initialized = true;
}

public void deserializeSnapshot(final InputArchive ia, final CheckedInputStream is) throws IOException {
    clear();

    // deserialize data tree
    final DataTree dataTree = getDataTree();
    FileSnap.deserialize(dataTree, getSessionWithTimeOuts(), ia);
    SnapStream.checkSealIntegrity(is, ia);

    // deserialize digest and check integrity
    if (dataTree.deserializeZxidDigest(ia, 0)) {
        SnapStream.checkSealIntegrity(is, ia);
    }

    // deserialize lastProcessedZxid and check integrity
    if (dataTree.deserializeLastProcessedZxid(ia)) {
        SnapStream.checkSealIntegrity(is, ia);
    }

    // compare the digest to find inconsistency
    if (dataTree.getDigestFromLoadedSnapshot() != null) {
        dataTree.compareSnapshotDigests(dataTree.lastProcessedZxid);
    }

    initialized = true;
}

serializeSnapshot方法

java 复制代码

public void serializeSnapshot(OutputArchive oa) throws IOException, InterruptedException {
    SerializeUtils.serializeSnapshot(getDataTree(), oa, getSessionWithTimeOuts());
}

DataTree

维护树状结构，没有任何网络或客户端连接代码，因此可以以独立的方式进行测试。

维护两个并行的数据结构：一个从完整路径映射到DataNodes的哈希表和一个DataNodes树，对路径的所有访问都是通过哈希表进行的，只有在序列化到磁盘时才遍历DataNodes树。

关键字段

java 复制代码

// This map provides a fast lookup to the data nodes
private final NodeHashMap nodes;
// Watcher
private IWatchManager dataWatches;
private IWatchManager childWatches;
// cached total size of paths and data for all DataNodes
private final AtomicLong nodeDataSize = new AtomicLong(0);
// This hashtable lists the paths of the ephemeral nodes of a session
private final Map<Long, HashSet<String>> ephemerals = new ConcurrentHashMap<>();
// This set contains the paths of all container nodes
private final Set<String> containers = Collections.newSetFromMap(new ConcurrentHashMap<>());
// This set contains the paths of all ttl nodes
private final Set<String> ttls = Collections.newSetFromMap(new ConcurrentHashMap<>());
// This is a pointer to the root of the DataTree
private DataNode root = new DataNode(new byte[0], -1L, new StatPersisted());
// create a /zookeeper filesystem that is the proc filesystem of zookeeper
private final DataNode procDataNode = new DataNode(new byte[0], -1L, new StatPersisted());
// create a /zookeeper/quota node for maintaining quota properties for zookeeper
private final DataNode quotaDataNode = new DataNode(new byte[0], -1L, new StatPersisted());
// 最新被处理的zxid
public volatile long lastProcessedZxid = 0;

NodeHashMap - NodeHashMapImpl实现类使用ConcurrentHashMap保存path -> DataNode数据
IWatchManager和Watcher - 监听器管理
DataNode - 封装树节点信息，包括data、children、stat等

构造方法

java 复制代码

DataTree(DigestCalculator digestCalculator) {
    this.digestCalculator = digestCalculator;
    nodes = new NodeHashMapImpl(digestCalculator);

    // rather than fight it, let root have an alias
    nodes.put("", root); // "" -> root
    nodes.putWithoutDigest(rootZookeeper, root); // "/" -> root

    // add the proc node and quota node
    root.addChild(procChildZookeeper); // 添加zookeeper子节点
    nodes.put(procZookeeper, procDataNode); // "/zookeeper" -> procDataNode

    procDataNode.addChild(quotaChildZookeeper); // 添加quota子节点
    nodes.put(quotaZookeeper, quotaDataNode); // "/zookeeper/quota" -> quotaDataNode

    addConfigNode(); // 添加/zookeeper/config节点

    nodeDataSize.set(approximateDataSize());
    try {
        // 使用WatchManager实现类
        dataWatches = WatchManagerFactory.createWatchManager();
        childWatches = WatchManagerFactory.createWatchManager();
    } catch (Exception e) {}
}

public void addConfigNode() {
    DataNode zookeeperZnode = nodes.get(procZookeeper); // 找到/zookeeper节点
    if (zookeeperZnode != null) {
        zookeeperZnode.addChild(configChildZookeeper); // 添加config子节点
    }

    nodes.put(configZookeeper, new DataNode(new byte[0], -1L, new StatPersisted()));
    try {
        // Reconfig node is access controlled by default (ZOOKEEPER-2014).
        setACL(configZookeeper, ZooDefs.Ids.READ_ACL_UNSAFE, -1);
    } catch (NoNodeException e) {}
}

主要方法

java 复制代码

// Add a new node to the DataTree
public void createNode(final String path, byte[] data, List<ACL> acl, long ephemeralOwner,
                       int parentCVersion, long zxid, long time, Stat outputStat);
// Remove path from the DataTree
public void deleteNode(String path, long zxid);
// 为节点设置数据
public Stat setData(String path, byte[] data, int version, long zxid, long time);
// 1. 获取path的data
// 2. 如果watcher不为null则addWatch
public byte[] getData(String path, Stat stat, Watcher watcher);
// 使用node.copyStat(stat)保存stat数据
public Stat statNode(String path, Watcher watcher);

// 1. copyStat到stat中
// 2. addWatch
// 3. getChildren
public List<String> getChildren(String path, Stat stat, Watcher watcher);

// 设置、获取权限
public Stat setACL(String path, List<ACL> acl, int version);
public List<ACL> getACL(String path, Stat stat);
public List<ACL> getACL(DataNode node);
// 添加Watcher
public void addWatch(String basePath, Watcher watcher, int mode);

// 处理事务请求
public ProcessTxnResult processTxn(TxnHeader header, Record txn, boolean isSubTxn);

// 杀会话，使用deleteNodes删除paths2DeleteLocal和paths2DeleteInTxn集
void killSession(
    long session, long zxid, Set<String> paths2DeleteLocal, List<String> paths2DeleteInTxn);
// 遍历paths2Delete调用deleteNode方法删除节点
void deleteNodes(long session, long zxid, Iterable<String> paths2Delete);
// 递归方式获取path下面的总节点数和总字节数
private void getCounts(String path, Counts counts);

// 序列化
void serializeNode(OutputArchive oa, StringBuilder path);
public void serializeNodeData(OutputArchive oa, String path, DataNode node);
public void serializeAcls(OutputArchive oa);
public void serializeNodes(OutputArchive oa);
public void serialize(OutputArchive oa, String tag);
// 反序列化
public void deserialize(InputArchive ia, String tag);
// 从dataWatches和childWatches移除watcher
public void removeCnxn(Watcher watcher);
// 触发或addWatch
public void setWatches(long relativeZxid, List<String> dataWatches,
                       List<String> existWatches, List<String> childWatches,
                       List<String> persistentWatches, List<String> persistentRecursiveWatches,
                       Watcher watcher);
// 为path设置新的cversion和zxid
public void setCversionPzxid(String path, int newCversion, long zxid);
// Add the digest to the historical list, and update the latest zxid digest
private void logZxidDigest(long zxid, long digest);
// 序列化、反序列化lastProcessedZxidDigest
public boolean serializeZxidDigest(OutputArchive oa);
public boolean deserializeZxidDigest(InputArchive ia, long startZxidOfSnapshot);
// 序列化、反序列化lastProcessedZxid
public boolean serializeLastProcessedZxid(final OutputArchive oa);
public boolean deserializeLastProcessedZxid(final InputArchive ia);
// Compares the actual tree's digest with that in the snapshot.
// Resets digestFromLoadedSnapshot after comparison.
public void compareSnapshotDigests(long zxid);
// Compares the digest of the tree with the digest present in transaction digest.
// If there is any error, logs and alerts the watchers.
public boolean compareDigest(TxnHeader header, Record txn, TxnDigest digest);

createNode方法

processTxn中会使用该方法创建节点：

java 复制代码

public void createNode(final String path, byte[] data, List<ACL> acl,
                       long ephemeralOwner, int parentCVersion, long zxid,
                       long time, Stat outputStat) throws NoNodeException, NodeExistsException {
    int lastSlash = path.lastIndexOf('/');
    String parentName = path.substring(0, lastSlash);
    String childName = path.substring(lastSlash + 1);
    StatPersisted stat = createStat(zxid, time, ephemeralOwner); // Create a node stat
    DataNode parent = nodes.get(parentName); // 父节点需要存在
    synchronized (parent) {
        Long acls = aclCache.convertAcls(acl);

        Set<String> children = parent.getChildren(); // path节点不能存在

        nodes.preChange(parentName, parent); // 执行removeDigest
        if (parentCVersion == -1) {
            parentCVersion = parent.stat.getCversion();
            parentCVersion++; // childVersion递增
        }

        if (parentCVersion > parent.stat.getCversion()) {
            parent.stat.setCversion(parentCVersion); // 父节点的childVersion
            parent.stat.setPzxid(zxid); // 父节点processZxid
        }
        DataNode child = new DataNode(data, acls, stat);
        parent.addChild(childName); // 添加节点
        nodes.postChange(parentName, parent);
        nodeDataSize.addAndGet(getNodeSize(path, child.data));
        nodes.put(path, child); // 维护NodeHashMap
        EphemeralType ephemeralType = EphemeralType.get(ephemeralOwner);// 通常是VOID|NORMAL
        if (ephemeralType == EphemeralType.CONTAINER) {
            containers.add(path);
        } else if (ephemeralType == EphemeralType.TTL) {
            ttls.add(path);
        } else if (ephemeralOwner != 0) {
            // 维护临时节点
            HashSet<String> list = ephemerals.computeIfAbsent(ephemeralOwner, k -> new HashSet<>());
            synchronized (list) {
                list.add(path);
            }
        }
        if (outputStat != null) {
            child.copyStat(outputStat); // 把权限保存到outputStat中
        }
    }

    // 略

    // 触发监听器
    dataWatches.triggerWatch(path, Event.EventType.NodeCreated, zxid);
    childWatches.triggerWatch(
        parentName.equals("") ? "/" : parentName, Event.EventType.NodeChildrenChanged, zxid);
}

deleteNode方法

java 复制代码

public void deleteNode(String path, long zxid) throws NoNodeException {
    int lastSlash = path.lastIndexOf('/');
    String parentName = path.substring(0, lastSlash);
    String childName = path.substring(lastSlash + 1);

    DataNode parent = nodes.get(parentName); // 父节点要存在
    synchronized (parent) {
        nodes.preChange(parentName, parent);
        parent.removeChild(childName); // 移除子节点
        if (zxid > parent.stat.getPzxid()) {
            parent.stat.setPzxid(zxid);
        }
        nodes.postChange(parentName, parent);
    }

    DataNode node = nodes.get(path); // 节点要存在
    nodes.remove(path); // 从NodeHashMap移除
    synchronized (node) {
        aclCache.removeUsage(node.acl);
        nodeDataSize.addAndGet(-getNodeSize(path, node.data));
    }

    synchronized (parent) {
        long owner = node.stat.getEphemeralOwner();
        EphemeralType ephemeralType = EphemeralType.get(owner);
        if (ephemeralType == EphemeralType.CONTAINER) {
            containers.remove(path);
        } else if (ephemeralType == EphemeralType.TTL) {
            ttls.remove(path);
        } else if (owner != 0) { // 移除临时节点
            Set<String> nodes = ephemerals.get(owner);
            if (nodes != null) {
                synchronized (nodes) {
                    nodes.remove(path);
                }
            }
        }
    }

    // 略

    // 触发监听器
    WatcherOrBitSet processed = dataWatches.triggerWatch(path, EventType.NodeDeleted, zxid);
    childWatches.triggerWatch(path, EventType.NodeDeleted, zxid, processed);
    childWatches.triggerWatch(
        "".equals(parentName) ? "/" : parentName, EventType.NodeChildrenChanged, zxid);
}

setData方法

java 复制代码

public Stat setData(String path, byte[] data, int version, long zxid, long time) throws NoNodeException {
    Stat s = new Stat();
    DataNode n = nodes.get(path); // 节点要存在
    byte[] lastData;
    synchronized (n) {
        lastData = n.data;
        nodes.preChange(path, n);
        n.data = data; // data赋值
        n.stat.setMtime(time); // 修改时间
        n.stat.setMzxid(zxid); // 修改的zxid
        n.stat.setVersion(version); // 版本
        n.copyStat(s); // 保存stat
        nodes.postChange(path, n);
    }

    // 略
    // 触发监听器
    dataWatches.triggerWatch(path, EventType.NodeDataChanged, zxid);
    return s;
}

setAcl等acl方法

java 复制代码

public Stat setACL(String path, List<ACL> acl, int version) throws NoNodeException {
    DataNode n = nodes.get(path);
    synchronized (n) {
        Stat stat = new Stat();
        aclCache.removeUsage(n.acl);
        nodes.preChange(path, n);
        n.stat.setAversion(version); // access时间
        n.acl = aclCache.convertAcls(acl); // 设置权限
        n.copyStat(stat);
        nodes.postChange(path, n);
        return stat;
    }
}

public List<ACL> getACL(String path, Stat stat) throws NoNodeException {
    DataNode n = nodes.get(path);
    synchronized (n) {
        if (stat != null) {
            n.copyStat(stat);
        }
        return new ArrayList<>(aclCache.convertLong(n.acl));
    }
}

public List<ACL> getACL(DataNode node) {
    synchronized (node) {
        return aclCache.convertLong(node.acl);
    }
}

addWatch方法

java 复制代码

public void addWatch(String basePath, Watcher watcher, int mode) {
    WatcherMode watcherMode = WatcherMode.fromZooDef(mode); // PERSISTENT_RECURSIVE or PERSISTENT
    dataWatches.addWatch(basePath, watcher, watcherMode); // 只给节点添加Watcher
    if (watcherMode != WatcherMode.PERSISTENT_RECURSIVE) {
        childWatches.addWatch(basePath, watcher, watcherMode); // 递归添加Watcher
    }
}

processTxn方法

java 复制代码

public ProcessTxnResult processTxn(TxnHeader header, Record txn, boolean isSubTxn) {
    ProcessTxnResult rc = new ProcessTxnResult();

    try {
        rc.clientId = header.getClientId();
        rc.cxid = header.getCxid();
        rc.zxid = header.getZxid();
        rc.type = header.getType();
        rc.err = 0;
        rc.multiResult = null;
        switch (header.getType()) {
        case OpCode.create: // 创建节点
            CreateTxn createTxn = (CreateTxn) txn;
            rc.path = createTxn.getPath();
            createNode(createTxn.getPath(), createTxn.getData(), createTxn.getAcl(),
                createTxn.getEphemeral() ? header.getClientId() : 0, createTxn.getParentCVersion(),
                header.getZxid(), header.getTime(), null);
            break;
        case OpCode.create2: // 创建节点并保存stat
            CreateTxn create2Txn = (CreateTxn) txn;
            rc.path = create2Txn.getPath();
            Stat stat = new Stat();
            createNode(create2Txn.getPath(), create2Txn.getData(), create2Txn.getAcl(),
                create2Txn.getEphemeral() ? header.getClientId() : 0, create2Txn.getParentCVersion(),
                header.getZxid(), header.getTime(), stat);
            rc.stat = stat;
            break;
        case OpCode.createTTL:
            CreateTTLTxn createTtlTxn = (CreateTTLTxn) txn;
            rc.path = createTtlTxn.getPath();
            stat = new Stat();
            createNode(createTtlTxn.getPath(), createTtlTxn.getData(), createTtlTxn.getAcl(),
                EphemeralType.TTL.toEphemeralOwner(createTtlTxn.getTtl()), // ttl
                createTtlTxn.getParentCVersion(), header.getZxid(), header.getTime(), stat);
            rc.stat = stat;
            break;
        case OpCode.createContainer:
            CreateContainerTxn createContainerTxn = (CreateContainerTxn) txn;
            rc.path = createContainerTxn.getPath();
            stat = new Stat();
            createNode(createContainerTxn.getPath(), createContainerTxn.getData(),
                createContainerTxn.getAcl(), EphemeralType.CONTAINER_EPHEMERAL_OWNER,
                createContainerTxn.getParentCVersion(), header.getZxid(), header.getTime(), stat);
            rc.stat = stat;
            break;
        case OpCode.delete:
        case OpCode.deleteContainer:
            DeleteTxn deleteTxn = (DeleteTxn) txn;
            rc.path = deleteTxn.getPath();
            deleteNode(deleteTxn.getPath(), header.getZxid()); // 删除节点
            break;
        case OpCode.reconfig:
        case OpCode.setData: // 设置节点数据
            SetDataTxn setDataTxn = (SetDataTxn) txn;
            rc.path = setDataTxn.getPath();
            rc.stat = setData(setDataTxn.getPath(), setDataTxn.getData(), setDataTxn.getVersion(),
                header.getZxid(), header.getTime());
            break;
        case OpCode.setACL: // 设置ACL
            SetACLTxn setACLTxn = (SetACLTxn) txn;
            rc.path = setACLTxn.getPath();
            rc.stat = setACL(setACLTxn.getPath(), setACLTxn.getAcl(), setACLTxn.getVersion());
            break;
        case OpCode.closeSession: // 关闭session
            long sessionId = header.getClientId();
            if (txn != null) {
                killSession(sessionId, header.getZxid(), ephemerals.remove(sessionId),
                        ((CloseSessionTxn) txn).getPaths2Delete());
            } else {
                killSession(sessionId, header.getZxid());
            }
            break;
        case OpCode.error:
            ErrorTxn errTxn = (ErrorTxn) txn;
            rc.err = errTxn.getErr();
            break;
        case OpCode.check:
            CheckVersionTxn checkTxn = (CheckVersionTxn) txn;
            rc.path = checkTxn.getPath();
            break;
        case OpCode.multi:
            // 遍历处理每一个Txn
            break;
        }
    } catch (KeeperException e) {
        rc.err = e.code().intValue();
    } catch (IOException e) {}

    //
    if (header.getType() == OpCode.create && rc.err == Code.NODEEXISTS.intValue()) {
        int lastSlash = rc.path.lastIndexOf('/');
        String parentName = rc.path.substring(0, lastSlash);
        CreateTxn cTxn = (CreateTxn) txn;
        try {
            setCversionPzxid(parentName, cTxn.getParentCVersion(), header.getZxid());
        } catch (NoNodeException e) {
            rc.err = e.code().intValue();
        }
    }
    //
    if (!isSubTxn) {
        if (rc.zxid > lastProcessedZxid) {
            lastProcessedZxid = rc.zxid; // 设置最新lastProcessedZxid
        }

        // 略
    }

    return rc;
}

serialize相关方法

java 复制代码

void serializeNode(OutputArchive oa, StringBuilder path) throws IOException {
    String pathString = path.toString();
    DataNode node = getNode(pathString); // 查找节点
    String[] children;
    DataNode nodeCopy;
    synchronized (node) {
        StatPersisted statCopy = new StatPersisted();
        copyStatPersisted(node.stat, statCopy);
        // we do not need to make a copy of node.data because the contents are never changed
        nodeCopy = new DataNode(node.data, node.acl, statCopy);
        children = node.getChildren().toArray(new String[0]);
    }
    serializeNodeData(oa, pathString, nodeCopy); // 把节点写入到oa中
    path.append('/');
    int off = path.length();
    // 遍历子节点，将子节点写入oa中
    for (String child : children) {
        path.delete(off, Integer.MAX_VALUE);
        path.append(child);
        serializeNode(oa, path);
    }
}

// visible for test
public void serializeNodeData(OutputArchive oa, String path, DataNode node) throws IOException {
    oa.writeString(path, "path");
    oa.writeRecord(node, "node");
}

public void serializeAcls(OutputArchive oa) throws IOException {
    aclCache.serialize(oa);
}

// 序列化整个NodeHashMap对象
public void serializeNodes(OutputArchive oa) throws IOException {
    serializeNode(oa, new StringBuilder());
    // / marks end of stream
    // we need to check if clear had been called in between the snapshot.
    if (root != null) {
        oa.writeString("/", "path");
    }
}

// 完整序列化
public void serialize(OutputArchive oa, String tag) throws IOException {
    serializeAcls(oa);
    serializeNodes(oa);
}

deserialize相关方法

java 复制代码

public void deserialize(InputArchive ia, String tag) throws IOException {
    aclCache.deserialize(ia);
    nodes.clear();
    pTrie.clear();
    nodeDataSize.set(0);
    String path = ia.readString("path");
    while (!"/".equals(path)) {
        DataNode node = new DataNode();
        ia.readRecord(node, "node");
        nodes.put(path, node);
        synchronized (node) {
            aclCache.addUsage(node.acl);
        }
        int lastSlash = path.lastIndexOf('/');
        if (lastSlash == -1) {
            root = node;
        } else {
            String parentPath = path.substring(0, lastSlash);
            DataNode parent = nodes.get(parentPath);
            if (parent == null) {
                throw new IOException(
                        "Invalid Datatree, unable to find parent " + parentPath + " of path " + path);
            }
            parent.addChild(path.substring(lastSlash + 1));
            long owner = node.stat.getEphemeralOwner();
            EphemeralType ephemeralType = EphemeralType.get(owner);
            if (ephemeralType == EphemeralType.CONTAINER) {
                containers.add(path);
            } else if (ephemeralType == EphemeralType.TTL) {
                ttls.add(path);
            } else if (owner != 0) {
                HashSet<String> list = ephemerals.computeIfAbsent(owner, k -> new HashSet<>());
                list.add(path);
            }
        }
        path = ia.readString("path");
    }
    // have counted digest for root node with "", ignore here to avoid counting twice for root node
    nodes.putWithoutDigest("/", root);

    nodeDataSize.set(approximateDataSize());

    // we are done with deserializing the datatree update the quotas - create path trie
    // and also update the stat nodes
    setupQuota();

    aclCache.purgeUnused();
}

FileTxnSnapLog

操作TxnLog和SnapShot的入口类。

构造方法会创建dataDir和snapDir目录，判断数据目录可写，创建txnLog和snapLog对象访问数据文件。

主要方法

java 复制代码

// 从snapshots和transaction logs加载数据库
public long restore(DataTree dt, Map<Long, Integer> sessions, PlayBackListener listener);

// fast forward the server database to have the latest transactions in it
// This is the same as restore, but only reads from the transaction logs and not restores from a snapshot
public long fastForwardFromEdits(DataTree dt, Map<Long, Integer> sessions, PlayBackListener listener);

// 使用txnLog.read(zxid, fastForward)方法从指定zxid加载TxnIterator
public TxnIterator readTxnLog(long zxid, boolean fastForward);

// process the transaction on the datatree
public void processTransaction(TxnHeader hdr, DataTree dt, Map<Long, Integer> sessions, Record txn);

// 使用txnLog.getLastLoggedZxid()方法获取last logged zxid
public long getLastLoggedZxid();

// 把datatree和sessions保存到snapshot中
public File save(
    DataTree dataTree, ConcurrentHashMap<Long, Integer> sessionsWithTimeouts, boolean syncSnap);

// 把txnLog truncate到指定的zxid
public boolean truncateLog(long zxid);

// 使用snaplog.findMostRecentSnapshot()方法加载最近snapshot文件
public File findMostRecentSnapshot();
// 使用snaplog.findNRecentSnapshots(n)方法加载n个最近snapshot文件
public List<File> findNRecentSnapshots(int n);
// 使用snaplog.findNValidSnapshots(n)方法加载n个合法snapshot文件
public List<File> findNValidSnapshots(int n);

// 获取快照文件，可能包含比给定zxid更新的事务。
// 包括起始zxid大于给定zxid的日志，以及起始zxid小于给定zxid的最新事务日志。
// 后一个日志文件可能包含超出给定zxid的事务。
public File[] getSnapshotLogs(long zxid);

// 使用txnLog.append(si)追加数据
public boolean append(Request si);
// txnLog.commit()提交数据
public void commit();

restore方法

从snapshot加载dataTree数据
从txnlog加载dataTree和committedlog数据
如果没有加载到dataTree数据，将空的dataTree数据保存到snapshot.0文件中

fastForwardFromEdits方法

从txnlog加载dataTree和committedlog数据。

processTransaction方法

java 复制代码

public void processTransaction(TxnHeader hdr, DataTree dt, Map<Long, Integer> sessions,
        Record txn) throws KeeperException.NoNodeException {
    ProcessTxnResult rc;
    switch (hdr.getType()) {
    case OpCode.createSession:
        sessions.put(hdr.getClientId(), ((CreateSessionTxn) txn).getTimeOut());
        // give dataTree a chance to sync its lastProcessedZxid
        rc = dt.processTxn(hdr, txn);
        break;
    case OpCode.closeSession:
        sessions.remove(hdr.getClientId());
        rc = dt.processTxn(hdr, txn);
        break;
    default:
        rc = dt.processTxn(hdr, txn);
    }
}

save方法

java 复制代码

public File save(DataTree dataTree, ConcurrentHashMap<Long, Integer> sessionsWithTimeouts,
        boolean syncSnap) throws IOException {
    long lastZxid = dataTree.lastProcessedZxid;
    // 文件名snapshot.${lastZxid}
    File snapshotFile = new File(snapDir, Util.makeSnapshotName(lastZxid));
    try {
        snapLog.serialize(dataTree, sessionsWithTimeouts, snapshotFile, syncSnap);
        return snapshotFile;
    } catch (IOException e) {
        throw e;
    }
}

truncateLog方法

java 复制代码

public boolean truncateLog(long zxid) {
    try {
        // close the existing txnLog and snapLog
        close();

        // truncate it
        try (FileTxnLog truncLog = new FileTxnLog(dataDir)) {
            boolean truncated = truncLog.truncate(zxid);

            // re-open the txnLog and snapLog
            // I'd rather just close/reopen this object itself, however that
            // would have a big impact outside ZKDatabase as there are other
            // objects holding a reference to this object.
            txnLog = new FileTxnLog(dataDir);
            snapLog = new FileSnap(snapDir);

            return truncated;
        }
    } catch (IOException e) {
        return false;
    }
}

TxnLog接口和FileTxnLog实现类

txnlog

使用文件保存所有的事务操作，客户端的写操作会先写入txnlog文件，在follower达到quorum状态后提交到dataTree中，在ZKDatabase启动阶段，如果txnlog的zxid大于snapshot的zxid时，会加载txnlog文件数据回放事务，提交到dataTree中。

TxnLog接口

Interface for reading transaction logs.

java 复制代码

public interface TxnLog extends Closeable {

    // Setter for ServerStats to monitor fsync threshold exceed
    void setServerStats(ServerStats serverStats);

    // roll the current log being appended to
    void rollLog() throws IOException;

    // Append a request to the transaction log with a digset
    boolean append(Request request) throws IOException;

    // Start reading the transaction logs from a given zxid
    TxnIterator read(long zxid) throws IOException;

    // the last zxid of the logged transactions
    long getLastLoggedZxid() throws IOException;

    // truncate the log to get in sync with the leader
    boolean truncate(long zxid) throws IOException;

    // the dbid for this transaction log
    long getDbId() throws IOException;

    // commit the transaction and make sure they are persisted
    void commit() throws IOException;

    // return transaction log's elapsed sync time in milliseconds
    long getTxnLogSyncElapsedTime();

    void close() throws IOException;
    void setTotalLogSize(long size);
    long getTotalLogSize();
}

FileTxnLog实现类

复制代码

This class implements the TxnLog interface. It provides api's to access the txnlogs and add entries to it.
The format of a Transactional log is as follows:

   LogFile:
       FileHeader TxnList ZeroPad

   FileHeader: {
       magic 4bytes (ZKLG)
       version 4bytes
       dbid 8bytes
   }

   TxnList:
       Txn || Txn TxnList

   Txn:
       checksum Txnlen TxnHeader Record 0x42

   checksum: 8bytes Adler32 is currently used
     calculated across payload -- Txnlen, TxnHeader, Record and 0x42

   Txnlen:
       len 4bytes

   TxnHeader: {
       sessionid 8bytes
       cxid 4bytes
       zxid 8bytes
       time 8bytes
       type 4bytes
   }

   Record:
       See Jute definition file for details on the various record types

   ZeroPad:
       0 padded to EOF (filled during preallocation stage)

FileTxnLog主要方法实现

java 复制代码

public synchronized boolean append(Request request) throws IOException {
    TxnHeader hdr = request.getHdr();
    if (hdr == null) { // 不是事务请求
        return false;
    }
    if (hdr.getZxid() <= lastZxidSeen) {
        LOG.warn("...");
    } else {
        lastZxidSeen = hdr.getZxid();
    }
    if (logStream == null) {
        // 创建新log.${hdr.zxid}文件
        logFileWrite = new File(logDir, Util.makeLogName(hdr.getZxid()));
        fos = new FileOutputStream(logFileWrite);
        logStream = new BufferedOutputStream(fos);
        oa = BinaryOutputArchive.getArchive(logStream);
        FileHeader fhdr = new FileHeader(TXNLOG_MAGIC, VERSION, dbId); // 文件头
        long dataSize = oa.getDataSize();
        fhdr.serialize(oa, "fileheader"); // 写文件头
        logStream.flush();
        // 文件偏移量
        filePosition += oa.getDataSize() - dataSize;
        filePadding.setCurrentSize(filePosition);
        streamsToFlush.add(fos);
    }
    fileSize = filePadding.padFile(fos.getChannel(), filePosition);
    byte[] buf = request.getSerializeData();
    long dataSize = oa.getDataSize();
    Checksum crc = makeChecksumAlgorithm();
    crc.update(buf, 0, buf.length);
    oa.writeLong(crc.getValue(), "txnEntryCRC"); // checksum
    Util.writeTxnBytes(oa, buf); // 写len, hdr, txn, digest, 0x42
    unFlushedSize += oa.getDataSize() - dataSize; // 计算未flush字节数
    return true;
}

public long getLastLoggedZxid() {
    File[] files = getLogFiles(logDir.listFiles(), 0);
    long maxLog = files.length > 0 ?
        Util.getZxidFromName(files[files.length - 1].getName(), LOG_FILE_PREFIX) : -1;
    // 最新的log文件的后缀作为zxid
    long zxid = maxLog;
    // 从文件解析最新zxid
    try (FileTxnLog txn = new FileTxnLog(logDir); TxnIterator itr = txn.read(maxLog)) {
        while (true) {
            if (!itr.next()) {
                break;
            }
            TxnHeader hdr = itr.getHeader();
            zxid = hdr.getZxid();
        }
    } catch (IOException e) {
    }
    return zxid;
}

public synchronized void rollLog() throws IOException {
    if (logStream != null) {
        this.logStream.flush(); // 把当前文件刷写出去
        prevLogsRunningTotal += getCurrentLogSize();
        this.logStream = null; // 重置相关变量，后续append时会创建新的文件
        oa = null;
        fileSize = 0;
        filePosition = 0;
        unFlushedSize = 0;
    }
}

public synchronized void commit() throws IOException {
    if (logStream != null) {
        logStream.flush(); // 刷写文件
        filePosition += unFlushedSize;
        // If we have written more than we have previously preallocated,
        // we should override the fileSize by filePosition.
        if (filePosition > fileSize) {
            fileSize = filePosition;
        }
        unFlushedSize = 0;
    }
    for (FileOutputStream log : streamsToFlush) {
        log.flush(); // 刷写文件
        if (forceSync) {
            long startSyncNS = System.nanoTime();

            FileChannel channel = log.getChannel();
            channel.force(false);

            // 略
        }
    }
    // 关闭文件流
    while (streamsToFlush.size() > 1) {
        streamsToFlush.poll().close();
    }

    // Roll the log file if we exceed the size limit
    if (txnLogSizeLimit > 0) { // 默认-1分支进不来
        long logSize = getCurrentLogSize();
        if (logSize > txnLogSizeLimit) {
            rollLog();
        }
    }
}

// FileTxnIterator封装logFile和输入流对象，可以按照协议从文件流读取txnLog数据
public TxnIterator read(long zxid) throws IOException {
    return read(zxid, true);
}
public TxnIterator read(long zxid, boolean fastForward) throws IOException {
    return new FileTxnIterator(logDir, zxid, fastForward);
}

// 将log文件truncate到指定zxid位置
public boolean truncate(long zxid) throws IOException {
    try (FileTxnIterator itr = new FileTxnIterator(this.logDir, zxid)) {
        PositionInputStream input = itr.inputStream;
        if (input == null) {
            throw new IOException("No log files found to truncate");
        }
        long pos = input.getPosition();
        // now, truncate at the current position
        RandomAccessFile raf = new RandomAccessFile(itr.logFile, "rw");
        raf.setLength(pos);
        raf.close(); // 把最小的文件truncate到指定zxid位置
        while (itr.goToNextLog()) { // 删除所有>zxid的log文件
            if (!itr.logFile.delete()) {
            }
        }
    }
    return true;
}

private static FileHeader readHeader(File file) throws IOException {
    InputStream is = null;
    try {
        is = new BufferedInputStream(new FileInputStream(file));
        InputArchive ia = BinaryInputArchive.getArchive(is);
        FileHeader hdr = new FileHeader();
        hdr.deserialize(ia, "fileheader"); // 反序列化
        return hdr;
    } finally {
        // is.close();
    }
}

FileTxnIterator类

this class implements the txnlog iterator interface which is used for reading the transaction logs.

内部使用List保存着比指定zxid大或者含有指定zxid数据的log文件，初始化阶段会定位到参数zxid指定的位置，这样在后续访问时就可以从参数指定的zxid开始读取数据了。

java 复制代码

public FileTxnIterator(File logDir, long zxid, boolean fastForward) throws IOException {
    this.logDir = logDir;
    this.zxid = zxid;
    init();

    if (fastForward && hdr != null) {
        while (hdr.getZxid() < zxid) { // 这里将数据移动到zxid位置
            if (!next()) {
                break;
            }
        }
    }
}

void init() throws IOException {
    storedFiles = new ArrayList<>();
    // 倒序查找log文件
    List<File> files = Util.sortDataDir(
        FileTxnLog.getLogFiles(logDir.listFiles(), 0),
        LOG_FILE_PREFIX,
        false);
    for (File f : files) {
        if (Util.getZxidFromName(f.getName(), LOG_FILE_PREFIX) >= zxid) {
            storedFiles.add(f);
        } else if (Util.getZxidFromName(f.getName(), LOG_FILE_PREFIX) < zxid) {
            // add the last logfile that is less than the zxid
            storedFiles.add(f);
            break;
        }
    }
    goToNextLog(); // 定位到下一个文件
    next(); // 定位到下一个log数据
}

SnapShot接口和FileSnap实现类

SnapShot接口

snapshot interface for the persistence layer. implement this interface for implementing snapshots.

java 复制代码

public interface SnapShot {

    // deserialize a data tree from the last valid snapshot and return the last zxid that was deserialized
    long deserialize(DataTree dt, Map<Long, Integer> sessions) throws IOException;

    // persist the datatree and the sessions into a persistence storage
    void serialize(DataTree dt, Map<Long, Integer> sessions, File name, boolean fsync) throws IOException;

    // find the most recent snapshot file
    File findMostRecentSnapshot() throws IOException;

    // get information of the last saved/restored snapshot
    SnapshotInfo getLastSnapshotInfo();

    // free resources from this snapshot immediately
    void close() throws IOException;
}

FileSnap实现类

负责存储、序列化和反序列化正确的快照。并提供对快照的访问：

java 复制代码

public long deserialize(DataTree dt, Map<Long, Integer> sessions) throws IOException {
    // 在snapDir下查找合法的快照文件，倒序，所以最新的在前面
    List<File> snapList = findNValidSnapshots(100);
    File snap = null;
    long snapZxid = -1;
    boolean foundValid = false;
    for (int i = 0, snapListSize = snapList.size(); i < snapListSize; i++) {
        snap = snapList.get(i);
        snapZxid = Util.getZxidFromName(snap.getName(), SNAPSHOT_FILE_PREFIX);
        try (CheckedInputStream snapIS = SnapStream.getInputStream(snap)) {
            InputArchive ia = BinaryInputArchive.getArchive(snapIS);
            deserialize(dt, sessions, ia); // 将数据反序列到dt
            SnapStream.checkSealIntegrity(snapIS, ia);

            // Deserializing the zxid digest from the input
            // stream and update the digestFromLoadedSnapshot.
            // 格式: zxid digestVersion digest
            if (dt.deserializeZxidDigest(ia, snapZxid)) {
                SnapStream.checkSealIntegrity(snapIS, ia);
            }

            // deserialize lastProcessedZxid and check inconsistency
            // 读lastZxid字段得到
            if (dt.deserializeLastProcessedZxid(ia)) {
                SnapStream.checkSealIntegrity(snapIS, ia);
            }

            foundValid = true;
            break;
        } catch (IOException e) {}
    }
    // 验证foundValid
    // 上次处理到的zxid
    dt.lastProcessedZxid = snapZxid;
    lastSnapshotInfo = new SnapshotInfo(dt.lastProcessedZxid, snap.lastModified() / 1000);

    // compare the digest if this is not a fuzzy snapshot, we want to compare and find inconsistent asap
    if (dt.getDigestFromLoadedSnapshot() != null) {
        dt.compareSnapshotDigests(dt.lastProcessedZxid);
    }
    return dt.lastProcessedZxid;
}

public static void deserialize(
        DataTree dt, Map<Long, Integer> sessions, InputArchive ia) throws IOException {
    FileHeader header = new FileHeader(); // magic, version, dbid
    header.deserialize(ia, "fileheader"); // 解析文件头并验证magic
    if (header.getMagic() != SNAP_MAGIC) {
        throw new IOException("mismatching magic headers");
    }
    // 反序列化
    // 会话:
    //   Count Session(s)
    //   Session {id, timeout}
    // 节点:
    //   AclCache PathNode(s)
    //   PathNode {path, node}
    //   node {data, acl, stat}
    SerializeUtils.deserializeSnapshot(dt, ia, sessions);
}

protected List<File> findNValidSnapshots(int n) {
    // 在snapDir下查找快照文件，倒序，最新的在前面
    List<File> files = Util.sortDataDir(snapDir.listFiles(), SNAPSHOT_FILE_PREFIX, false);
    int count = 0;
    List<File> list = new ArrayList<>();
    for (File f : files) {
        try {
            if (SnapStream.isValidSnapshot(f)) { // 验证文件合法
                list.add(f);
                count++;
                if (count == n) {
                    break;
                }
            }
        } catch (IOException e) {}
    }
    return list;
}

public List<File> findNRecentSnapshots(int n) throws IOException {
    // 在snapDir下查找快照文件，倒序，最新的在前面
    List<File> files = Util.sortDataDir(snapDir.listFiles(), SNAPSHOT_FILE_PREFIX, false);
    int count = 0;
    List<File> list = new ArrayList<>();
    for (File f : files) {
        if (count == n) {
            break;
        }
        if (Util.getZxidFromName(f.getName(), SNAPSHOT_FILE_PREFIX) != -1) {
            count++;
            list.add(f);
        }
    }
    return list;
}

protected void serialize(
        DataTree dt, Map<Long, Integer> sessions, OutputArchive oa, FileHeader header) throws IOException {
    // 验证header!=null
    header.serialize(oa, "fileheader"); // 序列化文件头
    SerializeUtils.serializeSnapshot(dt, oa, sessions); // 序列化dataTree
}

public synchronized void serialize(
        DataTree dt, Map<Long, Integer> sessions, File snapShot, boolean fsync) throws IOException {
    if (!close) {
        try (CheckedOutputStream snapOS = SnapStream.getOutputStream(snapShot, fsync)) {
            OutputArchive oa = BinaryOutputArchive.getArchive(snapOS);
            FileHeader header = new FileHeader(SNAP_MAGIC, VERSION, dbId);
            serialize(dt, sessions, oa, header);
            SnapStream.sealStream(snapOS, oa);

            // 序列化digest
            if (dt.serializeZxidDigest(oa)) {
                SnapStream.sealStream(snapOS, oa);
            }

            // 序列化lastProcessZxid
            if (dt.serializeLastProcessedZxid(oa)) {
                SnapStream.sealStream(snapOS, oa);
            }

            lastSnapshotInfo = new SnapshotInfo(
                Util.getZxidFromName(snapShot.getName(), SNAPSHOT_FILE_PREFIX),
                snapShot.lastModified() / 1000);
        }
    } else {
        throw new IOException("FileSnap has already been closed");
    }
}

DatadirCleanupManager

启动周期任务

清理过期文件，保留最新的snapRetainCount个snapshot文件和对应的txnlog文件，将其余过期的文件删除掉。

purgeInterval参数指定执行周期(小时)，默认0不开启清理功能。

java 复制代码

public void start() {
    if (PurgeTaskStatus.STARTED == purgeTaskStatus) {
        return;
    }
    // Don't schedule the purge task with zero or negative purge interval.
    if (purgeInterval <= 0) {
        return;
    }

    timer = new Timer("PurgeTask", true);
    TimerTask task = new PurgeTask(dataLogDir, snapDir, snapRetainCount);
    timer.scheduleAtFixedRate(task, 0, TimeUnit.HOURS.toMillis(purgeInterval));

    purgeTaskStatus = PurgeTaskStatus.STARTED;
}

PurgeTask

java 复制代码

public void run() {
    try {
        PurgeTxnLog.purge(logsDir, snapsDir, snapRetainCount);
    } catch (Exception e) {}
}

PurgeTxnLog.purge方法：

java 复制代码

public static void purge(File dataDir, File snapDir, int num) throws IOException {
    if (num < 3) {
        throw new IllegalArgumentException(COUNT_ERR_MSG);
    }

    FileTxnSnapLog txnLog = new FileTxnSnapLog(dataDir, snapDir);

    // 倒序查找最新的num个snapshot文件
    List<File> snaps = txnLog.findNValidSnapshots(num);
    int numSnaps = snaps.size();
    if (numSnaps > 0) {
        // 删除掉zxid比snaps小的txnlog和snapshot文件
        purgeOlderSnapshots(txnLog, snaps.get(numSnaps - 1));
    }
}

ContainerManager

负责清理container节点，只能有leader管理。启动后，定期检查cversion>0且没有子级的container节点和ttl节点。尝试删除节点，删除的结果并不重要。如果提议失败或容器节点不为空，则没有任何危害。

zookeeper源码(05)数据存储

ZKDatabase

关键字段

构造方法

主要方法

loadDataBase方法

fastForwardDataBase方法

addCommittedProposal方法

getProposalsFromTxnLog方法

truncateLog方法

deserializeSnapshot方法

serializeSnapshot方法

DataTree

关键字段

构造方法

主要方法

createNode方法

deleteNode方法

setData方法

setAcl等acl方法

addWatch方法

processTxn方法

serialize相关方法

deserialize相关方法

FileTxnSnapLog

主要方法

restore方法

fastForwardFromEdits方法

processTransaction方法

save方法

truncateLog方法

TxnLog接口和FileTxnLog实现类

txnlog

TxnLog接口

FileTxnLog实现类

FileTxnLog主要方法实现

FileTxnIterator类

SnapShot接口和FileSnap实现类

SnapShot接口

FileSnap实现类

DatadirCleanupManager

启动周期任务

PurgeTask

ContainerManager