Flink Connector Write Path to Iceberg: A Source Code Walkthrough

  // Add the writer operator; its parallelism is configurable.
  SingleOutputStreamOperator<WriteResult> writerStream =
      appendWriter(distributeStream, flinkRowType, equalityFieldIds);

  // Add the committer operator; its parallelism is fixed at 1.
  SingleOutputStreamOperator<Void> committerStream = appendCommitter(writerStream);

  // Add the dummy sink.
  return appendDummySink(committerStream);
}
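
For orientation, this operator chain is what gets built when a job hands a DataStream<RowData> to FlinkSink and calls append(). A minimal usage sketch; the warehouse path and the input stream are placeholders, not taken from the analyzed code:

import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.table.data.RowData;
import org.apache.iceberg.flink.TableLoader;
import org.apache.iceberg.flink.sink.FlinkSink;

public class IcebergSinkExample {
  // Attach the Iceberg sink to an existing RowData stream; append() builds the
  // writer -> committer -> dummy-sink chain shown above.
  static void writeToIceberg(DataStream<RowData> rows) {
    TableLoader tableLoader =
        TableLoader.fromHadoopTable("hdfs://namenode:8020/warehouse/db/sample_table");

    FlinkSink.forRowData(rows)
        .tableLoader(tableLoader)
        .upsert(false)   // insert-only; true switches the writers to upsert mode
        .append();
  }
}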


### The appendWriter method

private SingleOutputStreamOperator<WriteResult> appendWriter(
    DataStream<RowData> input, RowType flinkRowType, List<Integer> equalityFieldIds) {
  ...
  if (flinkWriteConf.upsertMode()) {
    if (!table.spec().isUnpartitioned()) {
      // In upsert mode, every partition key column must be part of the equality fields.
      for (PartitionField partitionField : table.spec().fields()) {
        Preconditions.checkState(
            equalityFieldIds.contains(partitionField.sourceId()),
            "In UPSERT mode, partition field '%s' should be included in equality fields: '%s'",
            partitionField,
            equalityFieldColumns);
      }
    }
  }

  // Create the IcebergStreamWriter.
  IcebergStreamWriter<RowData> streamWriter =
      createStreamWriter(table, flinkWriteConf, flinkRowType, equalityFieldIds);

  // Determine the writer parallelism; if not configured, use the input stream's parallelism.
  int parallelism =
      flinkWriteConf.writeParallelism() == null
          ? input.getParallelism()
          : flinkWriteConf.writeParallelism();
  ...
  return writerStream;
}
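
The check above ties equality fields to the partition source columns by field ID. As a hedged illustration (the helper below is hypothetical, not part of FlinkSink), equality column names are resolved to IDs through the table schema, which is what equalityFieldIds ultimately holds:

import java.util.List;
import java.util.stream.Collectors;
import org.apache.iceberg.Table;

// Hypothetical helper: resolve equality column names (e.g. the primary key "id")
// to Iceberg field IDs; appendWriter compares these against partitionField.sourceId().
static List<Integer> toEqualityFieldIds(Table table, List<String> equalityColumns) {
  return equalityColumns.stream()
      .map(column -> table.schema().findField(column).fieldId())
      .collect(Collectors.toList());
}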


### The createStreamWriter method

static IcebergStreamWriter<RowData> createStreamWriter(
    Table table,
    FlinkWriteConf flinkWriteConf,
    RowType flinkRowType,
    List<Integer> equalityFieldIds) {
  ...
  Table serializableTable = SerializableTable.copyOf(table);
  FileFormat format = flinkWriteConf.dataFileFormat();

  // Create the TaskWriterFactory, which builds writers that match the table schema.
  TaskWriterFactory<RowData> taskWriterFactory =
      new RowDataTaskWriterFactory(
          serializableTable,
          flinkRowType,
          flinkWriteConf.targetDataFileSize(),
          format,
          writeProperties(table, format, flinkWriteConf),
          equalityFieldIds,
          flinkWriteConf.upsertMode());

  // Create the IcebergStreamWriter.
  return new IcebergStreamWriter<>(table.name(), taskWriterFactory);
}
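
flinkWriteConf.dataFileFormat() and targetDataFileSize() fall back to table properties when no Flink write option is set. A hedged sketch of configuring those table-level defaults; the property keys follow Iceberg's documented table properties, so verify them against your version:

import org.apache.iceberg.Table;

// Hedged sketch: set the defaults that FlinkWriteConf falls back to.
static void configureWriteDefaults(Table table) {
  table.updateProperties()
      .set("write.format.default", "parquet")            // default data file format
      .set("write.target-file-size-bytes", "134217728")  // target data file size (~128 MB)
      .commit();
}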

### The IcebergStreamWriter class


This class is an internal Flink OneInputStreamOperator, so it carries the usual characteristics of a Flink operator.

class IcebergStreamWriter<T> extends AbstractStreamOperator<WriteResult>
    implements OneInputStreamOperator<T, WriteResult>, BoundedOneInput {

  ...

  @Override
  public void open() {
    ...
    // Initialize the writer metrics.
    this.writerMetrics = new IcebergStreamWriterMetrics(super.metrics, fullTableName);

    // Initialize the taskWriterFactory that is used to create writers.
    this.taskWriterFactory.initialize(subTaskId, attemptId);

    // Create the writer. There are four flavors, chosen by whether the Iceberg
    // table is partitioned and whether upsert mode is enabled:
    //   UnpartitionedWriter            : unpartitioned, insert only
    //   RowDataPartitionedFanoutWriter : partitioned,   insert only
    //   UnpartitionedDeltaWriter       : unpartitioned, upsert
    //   PartitionedDeltaWriter         : partitioned,   upsert
    this.writer = taskWriterFactory.create();
  }

  @Override
  public void processElement(StreamRecord<T> element) throws Exception {
    // Write the incoming record.
    writer.write(element.getValue());
  }

  ...

  // Emit the data files written so far to the committer, which commits them in one place.
  private void flush() throws IOException {
    if (writer == null) {
      return;
    }

    long startNano = System.nanoTime();
    WriteResult result = writer.complete();
    writerMetrics.updateFlushResult(result);
    output.collect(new StreamRecord<>(result));
    writerMetrics.flushDuration(TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - startNano));

    writer = null;
  }
}
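
The four writer flavors listed in open() are chosen inside RowDataTaskWriterFactory.create(). A simplified, hedged sketch of just that decision; the real factory also wires up schemas, appender factories and output file factories:

// Simplified selection logic, for illustration only.
final class WriterSelection {
  // Returns the TaskWriter implementation the factory would pick, based only on
  // whether the table is partitioned and whether equality fields (upsert) are set.
  static String select(boolean partitioned, boolean upsert) {
    if (!upsert) {
      // Insert-only paths.
      return partitioned ? "RowDataPartitionedFanoutWriter" : "UnpartitionedWriter";
    }
    // Upsert paths, which can also emit delete files.
    return partitioned ? "PartitionedDeltaWriter" : "UnpartitionedDeltaWriter";
  }
}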

### The IcebergFilesCommitter class

class IcebergFilesCommitter extends AbstractStreamOperator<Void>
    implements OneInputStreamOperator<WriteResult, Void>, BoundedOneInput {

  ...

  @Override
  public void initializeState(StateInitializationContext context) throws Exception {
    // Maximum number of consecutive empty commits.
    // A commit (and therefore a snapshot) is only produced after this many consecutive
    // checkpoints without data, which avoids generating empty snapshots.
    maxContinuousEmptyCommits =
        PropertyUtil.propertyAsInt(table.properties(), MAX_CONTINUOUS_EMPTY_COMMITS, 10);

    // Create the OutputFileFactory for manifest files.
    this.manifestOutputFileFactory =
        FlinkManifestUtil.createOutputFileFactory(
            table, flinkJobId, operatorUniqueId, subTaskId, attemptId);

    if (context.isRestored()) {
      ...
      // Restore the data files that have not yet been committed from state.
      NavigableMap<Long, byte[]> uncommittedDataFiles =
          Maps.newTreeMap(checkpointsState.get().iterator().next())
              .tailMap(maxCommittedCheckpointId, false);

      if (!uncommittedDataFiles.isEmpty()) {
        // Commit all uncommitted data files from the old Flink job to the Iceberg table.
        long maxUncommittedCheckpointId = uncommittedDataFiles.lastKey();

        // If uncommitted files exist, commit them now.
        commitUpToCheckpoint(
            uncommittedDataFiles, restoredFlinkJobId, operatorUniqueId, maxUncommittedCheckpointId);
      }
    }
  }
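
From the table side, this threshold can be tuned through the table property behind MAX_CONTINUOUS_EMPTY_COMMITS. A hedged sketch; the key shown is the constant's value in recent Iceberg releases, so verify it for your version:

import org.apache.iceberg.Table;

// Hedged sketch: only create a snapshot after 20 consecutive empty checkpoints.
static void tuneEmptyCommits(Table table) {
  table.updateProperties()
      .set("flink.max-continuous-empty-commits", "20")
      .commit();
}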

  @Override
  public void snapshotState(StateSnapshotContext context) throws Exception {
    // Write the data files completed for this checkpointId into a manifest file
    // and record it in dataFilesPerCheckpoint.
    dataFilesPerCheckpoint.put(checkpointId, writeToManifest(checkpointId));

    // Reset the snapshot state to the latest state.
    checkpointsState.clear();
    checkpointsState.add(dataFilesPerCheckpoint);

    jobIdState.clear();
    jobIdState.add(flinkJobId);

    // Clear the local buffer for the current checkpoint.
    writeResultsOfCurrentCkpt.clear();
    committerMetrics.checkpointDuration(
        TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - startNano));
  }

  @Override
  public void notifyCheckpointComplete(long checkpointId) throws Exception {
    ...
    if (checkpointId > maxCommittedCheckpointId) {
      LOG.info("Checkpoint {} completed. Attempting commit.", checkpointId);

      // The checkpoint has completed, so commit its data files to the table.
      commitUpToCheckpoint(dataFilesPerCheckpoint, flinkJobId, operatorUniqueId, checkpointId);
      this.maxCommittedCheckpointId = checkpointId;
    } else {
      LOG.info(
          "Skipping committing checkpoint {}. {} is already committed.",
          checkpointId,
          maxCommittedCheckpointId);
    }
  }
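
Since commits are only triggered from notifyCheckpointComplete, the job must have checkpointing enabled, and the checkpoint interval effectively becomes the commit (snapshot) interval. A minimal sketch:

import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

// Commits are driven by checkpoints; the one-minute interval is illustrative.
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.enableCheckpointing(60_000L);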

  private void commitUpToCheckpoint(
      NavigableMap<Long, byte[]> deltaManifestsMap,
      String newFlinkJobId,
      String operatorId,
      long checkpointId)
      throws IOException {
    // Collect the serialized manifests that are pending commit, up to and including checkpointId.
    NavigableMap<Long, byte[]> pendingMap = deltaManifestsMap.headMap(checkpointId, true);

    List<ManifestFile> manifests = Lists.newArrayList();
    NavigableMap<Long, WriteResult> pendingResults = Maps.newTreeMap();
    for (Map.Entry<Long, byte[]> e : pendingMap.entrySet()) {
      // Skip checkpoints whose manifest is empty.
      if (Arrays.equals(EMPTY_MANIFEST_DATA, e.getValue())) {
        continue;
      }

      DeltaManifests deltaManifests =
          SimpleVersionedSerialization.readVersionAndDeSerialize(
              DeltaManifestsSerializer.INSTANCE, e.getValue());
      pendingResults.put(
          e.getKey(),
          FlinkManifestUtil.readCompletedFiles(deltaManifests, table.io(), table.specs()));
      manifests.addAll(deltaManifests.manifests());
    }

    // Summarize what is about to be committed: record counts and data file sizes.
    CommitSummary summary = new CommitSummary(pendingResults);
    // Commit the pending results.
    commitPendingResult(pendingResults, summary, newFlinkJobId, operatorId, checkpointId);
    committerMetrics.updateCommitSummary(summary);
    pendingMap.clear();
    // Delete the manifest files that have now been committed.
    deleteCommittedManifests(manifests, newFlinkJobId, checkpointId);
  }

  private void commitPendingResult(
      NavigableMap<Long, WriteResult> pendingResults,
      CommitSummary summary,
      String newFlinkJobId,
      String operatorId,
      long checkpointId) {
    ...
    continuousEmptyCheckpoints = totalFiles == 0 ? continuousEmptyCheckpoints + 1 : 0;

    // Commit when there are data files, or when the number of consecutive empty
    // checkpoints has reached the configured threshold.
    if (totalFiles != 0 || continuousEmptyCheckpoints % maxContinuousEmptyCommits == 0) {
      if (replacePartitions) {
        // Overwrite commit, via newReplacePartitions().
        replacePartitions(pendingResults, summary, newFlinkJobId, operatorId, checkpointId);
      } else {
        // Regular commit, via newAppend() / newRowDelta().
        commitDeltaTxn(pendingResults, summary, newFlinkJobId, operatorId, checkpointId);
      }
      continuousEmptyCheckpoints = 0;
    }
    ...
  }

...

  private void replacePartitions(
      NavigableMap<Long, WriteResult> pendingResults,
      CommitSummary summary,
      String newFlinkJobId,
      String operatorId,
      long checkpointId) {
    Preconditions.checkState(
        summary.deleteFilesCount() == 0, "Cannot overwrite partitions with delete files.");

    // Commit via newReplacePartitions().
    ReplacePartitions dynamicOverwrite = table.newReplacePartitions().scanManifestsWith(workerPool);

    for (WriteResult result : pendingResults.values()) {
      Preconditions.checkState(
          result.referencedDataFiles().length == 0, "Should have no referenced data files.");
      Arrays.stream(result.dataFiles()).forEach(dynamicOverwrite::addFile);
    }
    ...
  }

  private void commitDeltaTxn(
      NavigableMap<Long, WriteResult> pendingResults,
      CommitSummary summary,
      String newFlinkJobId,
      String operatorId,
      long checkpointId) {
    if (summary.deleteFilesCount() == 0) {
      // To be compatible with Iceberg format V1.
      AppendFiles appendFiles = table.newAppend().scanManifestsWith(workerPool);

      for (WriteResult result : pendingResults.values()) {
        Preconditions.checkState(
            result.referencedDataFiles().length == 0,
            "Should have no referenced data files for append.");
        Arrays.stream(result.dataFiles()).forEach(appendFiles::appendFile);
      }

      commitOperation(appendFiles, summary, "append", newFlinkJobId, operatorId, checkpointId);
    } else {
      // To be compatible with Iceberg format V2.
      for (Map.Entry<Long, WriteResult> e : pendingResults.entrySet()) {
        // We don't commit the merged result in a single transaction because for the sequential
        // transactions txn1 and txn2, the equality-delete files of txn2 must be applied to the
        // data files from txn1. Committing the merged result would break the delete semantics.
        WriteResult result = e.getValue();

        RowDelta rowDelta = table.newRowDelta().scanManifestsWith(workerPool);
        // Add the data files and the delete files separately.
        Arrays.stream(result.dataFiles()).forEach(rowDelta::addRows);
        Arrays.stream(result.deleteFiles()).forEach(rowDelta::addDeletes);
        commitOperation(rowDelta, summary, "rowDelta", newFlinkJobId, operatorId, e.getKey());
      }
    }
  }
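
The newRowDelta() branch relies on Iceberg format v2 delete files, so upsert writes need a v2 table. A hedged Flink SQL sketch of creating one, assuming an Iceberg catalog named ice_catalog is already registered; catalog, database, table and column names are placeholders:

import org.apache.flink.table.api.EnvironmentSettings;
import org.apache.flink.table.api.TableEnvironment;

// Hedged sketch: a format-v2 table with a primary key and upsert enabled,
// so that commitDeltaTxn takes the newRowDelta() branch.
TableEnvironment tEnv = TableEnvironment.create(EnvironmentSettings.inStreamingMode());
tEnv.executeSql(
    "CREATE TABLE ice_catalog.db.orders ("
        + "  id BIGINT,"
        + "  amount DOUBLE,"
        + "  PRIMARY KEY (id) NOT ENFORCED"
        + ") WITH ("
        + "  'format-version' = '2',"
        + "  'write.upsert.enabled' = 'true'"
        + ")");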

  private void commitOperation(
      SnapshotUpdate<?> operation,
      CommitSummary summary,
      String description,
      String newFlinkJobId,
      String operatorId,
      long checkpointId) {
    ...
    // Perform the actual commit.
    operation.commit(); // abort is automatically called if this fails.
    ...
    committerMetrics.commitDuration(durationMs);
  }
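
Each successful operation.commit() produces a new table snapshot whose summary carries the commit metadata written by the committer (for example a flink.max-committed-checkpoint-id entry in recent releases; the exact keys are version dependent). A small sketch for inspecting them:

import org.apache.iceberg.Snapshot;
import org.apache.iceberg.Table;

// List the snapshots produced by the committer, with their operation and summary.
static void printSnapshots(Table table) {
  for (Snapshot snapshot : table.snapshots()) {
    System.out.printf("%d %s %s%n",
        snapshot.snapshotId(), snapshot.operation(), snapshot.summary());
  }
}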

  @Override
  public void processElement(StreamRecord<WriteResult> element) throws Exception {
    final WriteResult value = element.getValue();
    if ("DDL".equalsIgnoreCase(value.getType())) {
      this.writeResultsOfCurrentCkpt.add(element.getValue());
      doCommit();
    } else {
      this.writeResultsOfCurrentCkpt.add(element.getValue());
    }
  }

  ...
}
