Flume-WAL日志机制源码分析

一、WAL

WAL 的全称是 Write-Ahead Logging,中文称预写式日志,是一种保证数据安全写入的机制,用于记录变更操作。其做法是先写日志,然后再把数据写入磁盘,以此保证数据的安全性。WAL 在关系型数据库中非常常见,MySQL 中的 Redo Log 采用的就是 WAL 机制。

二、Put

java 复制代码
// org.apache.flume.channel.file.Log.java
/**
 * Appends a Put record for {@code event} to the write-ahead log and returns
 * the pointer produced by the log file writer that stored it.
 *
 * @param transactionID id of the transaction the event belongs to
 * @param event         event whose headers and body are wrapped in the record
 * @return pointer to the written record
 * @throws IOException if the usable space does not exceed the required space,
 *                     or if the write (including the retry after a roll) fails
 */
FlumeEventPointer put(long transactionID, Event event)
      throws IOException {
    Preconditions.checkState(open, "Log is closed");
    FlumeEvent wrappedEvent = new FlumeEvent(event.getHeaders(), event.getBody());
    // The WAL records four kinds of operation: Put, Take, Commit and Rollback.
    // Every record takes the next value handed out by WriteOrderOracle.
    Put putRecord = new Put(transactionID, WriteOrderOracle.next(), wrappedEvent);
    ByteBuffer buffer = TransactionEventRecord.toByteBuffer(putRecord);
    // Select the data file (for example log-1) that receives this record.
    int logFileIndex = nextLogWriter(transactionID);
    long available = logFiles.get(logFileIndex).getUsableSpace();
    long needed = minimumRequiredSpace + buffer.limit();
    if (needed >= available) {
      throw new IOException("Usable space exhausted, only " + available +
          " bytes remaining, required " + needed + " bytes");
    }
    boolean written = false;
    try {
      FlumeEventPointer pointer;
      try {
        // Hand the record to the file writer. This only issues a channel
        // write(); the bytes are not forced to the physical disk here.
        // Forcing happens through FileChannel.force(), see Writer.sync().
        pointer = logFiles.get(logFileIndex).put(buffer);
      } catch (LogFileRetryableIOException retryable) {
        if (!open) {
          throw retryable;
        }
        // Roll this slot to a new file and retry the write exactly once.
        roll(logFileIndex, buffer);
        pointer = logFiles.get(logFileIndex).put(buffer);
      }
      written = true;
      return pointer;
    } finally {
      // Any failure while the log is still open rolls the file.
      if (!written && open) {
        roll(logFileIndex);
      }
    }
  }
java 复制代码
//org.apache.flume.channel.file.LogFile.Writer
/**
 * Writes a Put record to this log file, encrypting it first when an
 * encryptor is configured.
 *
 * @param buffer serialized record; its backing array is what gets encrypted
 * @return pointer built from the pair returned by {@code write}
 * @throws IOException if the underlying write fails
 */
synchronized FlumeEventPointer put(ByteBuffer buffer) throws IOException {
      ByteBuffer payload = buffer;
      if (encryptor != null) {
        byte[] encrypted = encryptor.encrypt(buffer.array());
        payload = ByteBuffer.wrap(encrypted);
      }
      // Append the record through the shared write path.
      Pair<Integer, Integer> location = write(payload);
      return new FlumeEventPointer(location.getLeft(), location.getRight());
    }

write(buffer) 是 Writer 内部共用的写入方法(private),put、take、commit、rollback 操作都通过该方法进行持久化。

java 复制代码
/**
 * Appends one record to this log file. The record is a one-byte
 * {@code OP_RECORD} marker, then the payload length as an int, then the
 * payload itself.
 *
 * @param buffer payload to append; its limit is used as the payload length
 * @return pair of (log file id, offset at which the record starts)
 * @throws LogFileRetryableIOException if the file is closed or the record
 *         would push the file past {@code maxFileSize}
 * @throws IOException if the channel write fails
 */
private Pair<Integer, Integer> write(ByteBuffer buffer)
        throws IOException {
      if (!isOpen()) {
        throw new LogFileRetryableIOException("File closed " + file);
      }
      long startPosition = position();
      int payloadSize = buffer.limit();
      long projectedEnd = startPosition + (long) payloadSize;
      if (projectedEnd > maxFileSize) {
        throw new LogFileRetryableIOException(projectedEnd + " > " +
            maxFileSize);
      }
      // Offsets are stored as int; a negative value means the cast overflowed.
      int offset = (int) startPosition;
      Preconditions.checkState(offset >= 0, String.valueOf(offset));
      // Record layout: OP_RECORD marker + length prefix + payload.
      int recordLength = 1 + (int) Serialization.SIZE_OF_INT + payloadSize;
      usableSpace.decrement(recordLength);
      preallocate(recordLength);
      ByteBuffer record = ByteBuffer.allocate(recordLength);
      // One byte that marks the start of a WAL record.
      record.put(OP_RECORD);
      writeDelimitedBuffer(record, buffer);
      record.rewind();
      // Channel write only; nothing is forced to the physical disk here.
      int bytesWritten = getFileChannel().write(record);
      Preconditions.checkState(bytesWritten == record.limit());
      return Pair.of(getLogFileID(), offset);
    }
java 复制代码
/**
 * Writes {@code buffer} into {@code output} with a length prefix.
 *
 * @param output destination; receives a 4-byte int followed by the payload
 * @param buffer payload; its limit is written as the length, then its
 *               remaining bytes are copied
 * @throws IOException declared by the signature; nothing in this body throws it
 */
protected static void writeDelimitedBuffer(ByteBuffer output, ByteBuffer buffer)
      throws IOException {
    // Length prefix (int, 4 bytes) first, payload bytes immediately after.
    final int payloadLength = buffer.limit();
    output.putInt(payloadLength).put(buffer);
  }

三、Take

java 复制代码
/**
 * Appends a Take record to the write-ahead log. The record references a
 * previously written event by the offset and file id held in {@code pointer}.
 *
 * @param transactionID id of the transaction performing the take
 * @param pointer       pointer whose offset and file id go into the record
 * @throws IOException if the usable space does not exceed the required space,
 *                     or if the write (including the retry after a roll) fails
 */
void take(long transactionID, FlumeEventPointer pointer)
      throws IOException {
    Preconditions.checkState(open, "Log is closed");
    // Build the Take record with the next value from WriteOrderOracle.
    Take takeRecord = new Take(transactionID, WriteOrderOracle.next(),
        pointer.getOffset(), pointer.getFileID());
    ByteBuffer buffer = TransactionEventRecord.toByteBuffer(takeRecord);
    int logFileIndex = nextLogWriter(transactionID);
    long available = logFiles.get(logFileIndex).getUsableSpace();
    long needed = minimumRequiredSpace + buffer.limit();
    if (needed >= available) {
      throw new IOException("Usable space exhausted, only " + available +
          " bytes remaining, required " + needed + " bytes");
    }
    boolean written = false;
    try {
      try {
        // Hand the record to the selected log file writer.
        logFiles.get(logFileIndex).take(buffer);
      } catch (LogFileRetryableIOException retryable) {
        if (!open) {
          throw retryable;
        }
        // Roll this slot to a new file and retry the write exactly once.
        roll(logFileIndex, buffer);
        logFiles.get(logFileIndex).take(buffer);
      }
      written = true;
    } finally {
      // Any failure while the log is still open rolls the file.
      if (!written && open) {
        roll(logFileIndex);
      }
    }
  }

四、Commit

java 复制代码
//org.apache.flume.channel.file.Log
//传入事务ID,以及type用于区分take还是put的提交
/**
 * Appends a Commit record for the given transaction and then syncs the log
 * file it was written to.
 *
 * @param transactionID id of the transaction being committed
 * @param type          value stored in the record; presumably tells a take
 *                      commit from a put commit (confirm against callers)
 * @throws IOException if the usable space does not exceed the required space,
 *                     or if the write or sync (including the retry after a
 *                     roll) fails
 */
private void commit(long transactionID, short type) throws IOException {
    Preconditions.checkState(open, "Log is closed");
    // Build the Commit record with the next value from WriteOrderOracle.
    Commit commitRecord = new Commit(transactionID, WriteOrderOracle.next(), type);
    ByteBuffer buffer = TransactionEventRecord.toByteBuffer(commitRecord);
    int logFileIndex = nextLogWriter(transactionID);
    long available = logFiles.get(logFileIndex).getUsableSpace();
    long needed = minimumRequiredSpace + buffer.limit();
    if (needed >= available) {
      throw new IOException("Usable space exhausted, only " + available +
          " bytes remaining, required " + needed + " bytes");
    }
    boolean committed = false;
    try {
      try {
        LogFile.Writer writer = logFiles.get(logFileIndex);
        // Write the record, then sync. Writer.sync() only forces the channel
        // when lastSyncPosition is behind lastCommitPosition, so commits that
        // arrive together can be covered by a single fsync.
        writer.commit(buffer);
        writer.sync();
      } catch (LogFileRetryableIOException retryable) {
        if (!open) {
          throw retryable;
        }
        // Roll this slot to a new file and retry commit + sync exactly once.
        roll(logFileIndex, buffer);
        LogFile.Writer rolledWriter = logFiles.get(logFileIndex);
        rolledWriter.commit(buffer);
        rolledWriter.sync();
      }
      committed = true;
    } finally {
      // Any failure while the log is still open rolls the file.
      if (!committed && open) {
        roll(logFileIndex);
      }
    }
  }
java 复制代码
/**
 * Writes a Commit record to this log file, encrypting it first when an
 * encryptor is configured, and records the new commit position.
 *
 * @param buffer serialized record; its backing array is what gets encrypted
 * @throws IOException if the underlying write fails
 */
synchronized void commit(ByteBuffer buffer) throws IOException {
      ByteBuffer payload = buffer;
      if (encryptor != null) {
        byte[] encrypted = encryptor.encrypt(buffer.array());
        payload = ByteBuffer.wrap(encrypted);
      }
      // Append the record through the shared write path.
      write(payload);
      // Flag unsynced data and remember where this commit ended; sync() reads both.
      dirty = true;
      lastCommitPosition = position();
    }
java 复制代码
//org.apache.flume.channel.file.LogFile.Writer
/**
 * Forces written data to disk when there is a commit that has not been
 * synced yet.
 *
 * Returns without doing anything when per-transaction fsync is off and
 * nothing is dirty, or when the last sync position is not behind the last
 * commit position.
 *
 * @throws LogFileRetryableIOException if the file is closed
 * @throws IOException if forcing the channel fails
 */
synchronized void sync() throws IOException {
      boolean nothingPending = !(fsyncPerTransaction || dirty);
      if (nothingPending) {
        if (LOG.isDebugEnabled()) {
          LOG.debug(
              "No events written to file, " + getFile().toString() +
                  " in last " + fsyncInterval + " or since last commit.");
        }
        return;
      }
      if (!isOpen()) {
        throw new LogFileRetryableIOException("File closed " + file);
      }
      if (lastSyncPosition >= lastCommitPosition) {
        // Every commit so far is already covered by an earlier sync.
        return;
      }
      // Force the channel's content to the storage device (metadata not forced).
      getFileChannel().force(false);
      lastSyncPosition = position();
      syncCount++;
      dirty = false;
    }

需要注意的是,除了每次提交事务时进行一次fsync()之外,当fsyncPerTransaction配置为false时,还会启动一个定时任务周期性地执行fsync()。

java 复制代码
//org.apache.flume.channel.file.LogFile.Writer
/**
 * Opens {@code file} for reading and writing and, when per-transaction
 * fsync is disabled, starts a single-threaded scheduler that calls
 * {@code sync()} periodically.
 *
 * @param file                       log file to open in "rw" mode
 * @param logFileID                  id assigned to this log file
 * @param maxFileSize                requested size limit, capped at
 *                                   {@code FileChannelConfiguration.DEFAULT_MAX_FILE_SIZE}
 * @param encryptor                  encryptor for records, may be null
 * @param usableSpaceRefreshInterval passed to {@code CachedFSUsableSpace}
 * @param fsyncPerTransaction        when false, the periodic sync is scheduled
 * @param fsyncInterval              initial delay and delay between periodic
 *                                   syncs, in seconds
 * @throws IOException if the file cannot be opened
 */
Writer(File file, int logFileID, long maxFileSize,
           CipherProvider.Encryptor encryptor, long usableSpaceRefreshInterval,
           boolean fsyncPerTransaction, int fsyncInterval) throws IOException {
      this.file = file;
      this.logFileID = logFileID;
      this.maxFileSize = Math.min(maxFileSize,
          FileChannelConfiguration.DEFAULT_MAX_FILE_SIZE);
      this.encryptor = encryptor;
      writeFileHandle = new RandomAccessFile(file, "rw");
      writeFileChannel = writeFileHandle.getChannel();
      this.fsyncPerTransaction = fsyncPerTransaction;
      this.fsyncInterval = fsyncInterval;
      if (fsyncPerTransaction) {
        // No background task is created when every transaction syncs.
        syncExecutor = null;
      } else {
        LOG.info("Sync interval = " + fsyncInterval);
        syncExecutor = Executors.newSingleThreadScheduledExecutor();
        Runnable periodicSync = new Runnable() {
          @Override
          public void run() {
            try {
              // Periodic fsync, run every fsyncInterval seconds.
              sync();
            } catch (Throwable ex) {
              // Log and return normally so the exception does not cancel
              // later scheduled runs.
              LOG.error("Data file, " + getFile().toString() + " could not " +
                  "be synced to disk due to an error.", ex);
            }
          }
        };
        syncExecutor.scheduleWithFixedDelay(
            periodicSync, fsyncInterval, fsyncInterval, TimeUnit.SECONDS);
      }
      usableSpace = new CachedFSUsableSpace(file, usableSpaceRefreshInterval);
      LOG.info("Opened " + file);
      open = true;
    }

五、Rollback

java 复制代码
/**
 * Appends a Rollback record for the given transaction to the write-ahead log.
 *
 * @param transactionID id of the transaction being rolled back
 * @throws IOException if the usable space does not exceed the required space,
 *                     or if the write (including the retry after a roll) fails
 */
void rollback(long transactionID) throws IOException {
    Preconditions.checkState(open, "Log is closed");

    if (LOGGER.isDebugEnabled()) {
      LOGGER.debug("Rolling back " + transactionID);
    }
    // Build the Rollback record with the next value from WriteOrderOracle.
    Rollback rollbackRecord = new Rollback(transactionID, WriteOrderOracle.next());
    ByteBuffer buffer = TransactionEventRecord.toByteBuffer(rollbackRecord);
    int logFileIndex = nextLogWriter(transactionID);
    long available = logFiles.get(logFileIndex).getUsableSpace();
    long needed = minimumRequiredSpace + buffer.limit();
    if (needed >= available) {
      throw new IOException("Usable space exhausted, only " + available +
          " bytes remaining, required " + needed + " bytes");
    }
    boolean written = false;
    try {
      try {
        // Hand the record to the selected log file writer.
        logFiles.get(logFileIndex).rollback(buffer);
      } catch (LogFileRetryableIOException retryable) {
        if (!open) {
          throw retryable;
        }
        // Roll this slot to a new file and retry the write exactly once.
        roll(logFileIndex, buffer);
        logFiles.get(logFileIndex).rollback(buffer);
      }
      written = true;
    } finally {
      // Any failure while the log is still open rolls the file.
      if (!written && open) {
        roll(logFileIndex);
      }
    }
  }

六、读取WAL

java 复制代码
/**
 * Reads the next WAL record from the current position of the log file.
 *
 * Operation marker bytes are read one at a time: an {@code OP_RECORD}
 * marker leads to the record body being parsed by {@code doNext},
 * {@code OP_NOOP} entries are skipped, and {@code OP_EOF} or any other
 * marker ends the read.
 *
 * @return the next record, or {@code null} when an EOF marker or unknown
 *         marker is found, when the offset is at or past the file length,
 *         or when an {@code EOFException} is raised while reading
 * @throws IOException any other I/O failure, wrapped with the file's
 *         canonical path and the current offset
 * @throws CorruptEventException declared by the signature; presumably
 *         propagated from {@code doNext}
 */
public LogRecord next() throws IOException, CorruptEventException {
      int offset = -1;
      try {
        // Per the accompanying write-up, replay reads log-x.meta to find the
        // position recorded at the last checkpoint and seeks there before
        // calling this method, so reading resumes from the checkpoint. That
        // seek is not visible in this method.
        long position = fileChannel.position();
        // Exceeding the threshold is only logged; reading continues.
        if (position > FileChannelConfiguration.DEFAULT_MAX_FILE_SIZE) {
          LOG.info("File position exceeds the threshold: "
              + FileChannelConfiguration.DEFAULT_MAX_FILE_SIZE
              + ", position: " + position);
        }
        // Narrowing cast: the check below rejects a negative result.
        offset = (int) position;
        Preconditions.checkState(offset >= 0);
        while (offset < fileHandle.length()) {
          // Read one byte: the operation marker.
          byte operation = fileHandle.readByte();
          // OP_RECORD marks a WAL record; leave the loop to parse it.
          // offset still points at the marker byte itself.
          if (operation == OP_RECORD) {
            break;
          } else if (operation == OP_EOF) {
            LOG.info("Encountered EOF at " + offset + " in " + file);
            return null;
          } else if (operation == OP_NOOP) {
            LOG.info("No op event found in file: " + file.toString() +
                " at " + offset + ". Skipping event.");
            // Skip the no-op entry, starting just past its marker byte, then
            // continue from wherever the file pointer ended up.
            skipRecord(fileHandle, offset + 1);
            offset = (int) fileHandle.getFilePointer();
            continue;
          } else {
            LOG.error("Encountered non op-record at " + offset + " " +
                Integer.toHexString(operation) + " in " + file);
            return null;
          }
        }
        // Reached when the loop ran off the end of the file, for example
        // after skipping trailing no-op entries.
        if (offset >= fileHandle.length()) {
          return null;
        }
        // The marker was OP_RECORD: go on to read the record body.
        return doNext(offset);
      } catch (EOFException e) {
        // A read past the end of the file is reported as "no more records".
        return null;
      } catch (IOException e) {
        throw new IOException("Unable to read next Transaction from log file " +
            file.getCanonicalPath() + " at offset " + offset, e);
      }
    }
java 复制代码
/**
 * Reads, optionally decrypts and deserializes the record body that follows
 * an operation marker.
 *
 * @param offset offset of the record, stored in the returned LogRecord
 * @return the parsed record, or {@code null} when the data is corrupt or
 *         cannot be decrypted and per-transaction fsync is disabled
 * @throws IOException if reading from the file fails
 * @throws CorruptEventException if the data is corrupt and per-transaction
 *         fsync is enabled
 * @throws DecryptionFailureException if decryption fails and per-transaction
 *         fsync is enabled
 */
LogRecord doNext(int offset) throws IOException, CorruptEventException,
        DecryptionFailureException {
      TransactionEventRecord event;
      try {
        // Read the length-prefixed payload into memory.
        byte[] raw = readDelimitedBuffer(getFileHandle());
        byte[] plain = (decryptor == null) ? raw : decryptor.decrypt(raw);
        event = TransactionEventRecord.fromByteArray(plain);
      } catch (CorruptEventException corrupt) {
        LOGGER.warn("Corrupt file found. File id: log-" + this.getLogFileID(),
            corrupt);
        if (fsyncPerTransaction) {
          throw corrupt;
        }
        // Return null so that replay handler thinks all events in this file
        // have been taken.
        return null;
      } catch (DecryptionFailureException undecryptable) {
        if (fsyncPerTransaction) {
          throw undecryptable;
        }
        LOGGER.warn("Could not decrypt even read from channel. Skipping " +
            "event.", undecryptable);
        return null;
      }
      return new LogRecord(getLogFileID(), offset, event);
    }
java 复制代码
/**
 * Reads one length-prefixed payload from the file's current position.
 *
 * @param fileHandle file positioned at the 4-byte length prefix
 * @return the payload bytes
 * @throws IOException if reading the length prefix fails
 * @throws CorruptEventException if the length is negative or the file ends
 *         before the whole payload has been read
 */
protected static byte[] readDelimitedBuffer(RandomAccessFile fileHandle)
      throws IOException, CorruptEventException {
    // The first four bytes are an int holding the payload size.
    final int payloadLength = fileHandle.readInt();
    if (payloadLength < 0) {
      throw new CorruptEventException("Length of event is: " + payloadLength +
          ". Event must have length >= 0. Possible corruption of data or partial fsync.");
    }
    final byte[] payload = new byte[payloadLength];
    try {
      // Fill the whole array; a short file raises EOFException.
      fileHandle.readFully(payload);
    } catch (EOFException truncated) {
      throw new CorruptEventException("Remaining data in file less than " +
                                      "expected size of event.", truncated);
    }
    return payload;
  }

七、讨论

值得一提的是,当source往channel put一批数据时,如果事务还没有提交就发生了断电或进程crash,此时数据还没来得及fsync到磁盘,那么就存在丢失数据的风险。

相关推荐
zmd-zk1 小时前
kafka+zookeeper的搭建
大数据·分布式·zookeeper·中间件·kafka
激流丶1 小时前
【Kafka 实战】如何解决Kafka Topic数量过多带来的性能问题?
java·大数据·kafka·topic
测试界的酸菜鱼1 小时前
Python 大数据展示屏实例
大数据·开发语言·python
时差9531 小时前
【面试题】Hive 查询:如何查找用户连续三天登录的记录
大数据·数据库·hive·sql·面试·database
Mephisto.java1 小时前
【大数据学习 | kafka高级部分】kafka中的选举机制
大数据·学习·kafka
Mephisto.java1 小时前
【大数据学习 | kafka高级部分】kafka的优化参数整理
大数据·sql·oracle·kafka·json·database
道可云1 小时前
道可云人工智能&元宇宙每日资讯|2024国际虚拟现实创新大会将在青岛举办
大数据·人工智能·3d·机器人·ar·vr
成都古河云2 小时前
智慧场馆:安全、节能与智能化管理的未来
大数据·运维·人工智能·安全·智慧城市
软工菜鸡2 小时前
预训练语言模型BERT——PaddleNLP中的预训练模型
大数据·人工智能·深度学习·算法·语言模型·自然语言处理·bert
武子康3 小时前
大数据-212 数据挖掘 机器学习理论 - 无监督学习算法 KMeans 基本原理 簇内误差平方和
大数据·人工智能·学习·算法·机器学习·数据挖掘