Source Code Walkthrough: How the Flink Connector Writes to Iceberg

The sink is assembled by chaining three operators:

```java
  // Add the writer operator; it runs at the configured write parallelism.
  SingleOutputStreamOperator<WriteResult> writerStream =
      appendWriter(distributeStream, flinkRowType, equalityFieldIds);

  // Add the committer operator; its parallelism is fixed at 1.
  SingleOutputStreamOperator<Void> committerStream = appendCommitter(writerStream);

  // Add the final (dummy) sink.
  return appendDummySink(committerStream);
}
```
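For context, this operator chain is roughly what gets assembled when the sink is built from application code via `FlinkSink`. Below is a minimal usage sketch; the table path, column name, and parallelism are made-up placeholders, not values from the walkthrough.

```java
import java.util.Collections;

import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.table.data.RowData;
import org.apache.iceberg.flink.TableLoader;
import org.apache.iceberg.flink.sink.FlinkSink;

public class IcebergSinkExample {

  // Wires a RowData stream into the writer -> committer -> dummy-sink chain shown above.
  public static void buildSink(DataStream<RowData> rows) {
    // Hypothetical Hadoop-catalog table location.
    TableLoader tableLoader = TableLoader.fromHadoopTable("hdfs://nn:8020/warehouse/db/tbl");

    FlinkSink.forRowData(rows)
        .tableLoader(tableLoader)
        .equalityFieldColumns(Collections.singletonList("id")) // required for upsert mode
        .upsert(true)        // write equality deletes instead of plain appends
        .writeParallelism(4) // otherwise the writer inherits the input parallelism
        .append();           // builds appendWriter -> appendCommitter -> appendDummySink
  }
}
```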


### The appendWriter method

```java
private SingleOutputStreamOperator<WriteResult> appendWriter(
    DataStream<RowData> input, RowType flinkRowType, List<Integer> equalityFieldIds) {
  ....
  if (flinkWriteConf.upsertMode()) {
    if (!table.spec().isUnpartitioned()) {
      // In upsert mode, every partition key must be covered by equalityFieldIds.
      for (PartitionField partitionField : table.spec().fields()) {
        Preconditions.checkState(
            equalityFieldIds.contains(partitionField.sourceId()),
            "In UPSERT mode, partition field '%s' should be included in equality fields: '%s'",
            partitionField,
            equalityFieldColumns);
      }
    }
  }
  // Create the IcebergStreamWriter.
  IcebergStreamWriter<RowData> streamWriter =
      createStreamWriter(table, flinkWriteConf, flinkRowType, equalityFieldIds);

  // Determine the writer parallelism; fall back to the input stream's parallelism if unset.
  int parallelism =
      flinkWriteConf.writeParallelism() == null
          ? input.getParallelism()
          : flinkWriteConf.writeParallelism();
  ....
  return writerStream;
}
```
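A concrete illustration of that precondition, under an assumed schema (the table layout and column names are made up for this example): a table partitioned by a transform of `id` can only run in upsert mode if `id` is one of the equality fields, because each partition field's source column must be covered.

```java
import org.apache.iceberg.PartitionField;
import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.Schema;
import org.apache.iceberg.types.Types;

public class UpsertPartitionCheckExample {

  public static void main(String[] args) {
    // Hypothetical schema: 'id' is the primary key, 'name' is a payload column.
    Schema schema =
        new Schema(
            Types.NestedField.required(1, "id", Types.LongType.get()),
            Types.NestedField.optional(2, "name", Types.StringType.get()));

    // Partitioned on the 'id' column through a bucket transform.
    PartitionSpec spec = PartitionSpec.builderFor(schema).bucket("id", 16).build();

    // The writer's precondition: every partition field's source column id must appear in
    // equalityFieldIds. Here equalityFieldIds = {1} ('id') passes; {2} ('name') would fail.
    for (PartitionField field : spec.fields()) {
      System.out.println("partition source column id = " + field.sourceId());
    }
  }
}
```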


### The createStreamWriter method

```java
static IcebergStreamWriter<RowData> createStreamWriter(
    Table table,
    FlinkWriteConf flinkWriteConf,
    RowType flinkRowType,
    List<Integer> equalityFieldIds) {
  ...
  Table serializableTable = SerializableTable.copyOf(table);
  FileFormat format = flinkWriteConf.dataFileFormat();
  // Create the TaskWriterFactory, which builds writers matching the table schema.
  TaskWriterFactory<RowData> taskWriterFactory =
      new RowDataTaskWriterFactory(
          serializableTable,
          flinkRowType,
          flinkWriteConf.targetDataFileSize(),
          format,
          writeProperties(table, format, flinkWriteConf),
          equalityFieldIds,
          flinkWriteConf.upsertMode());
  // Instantiate the IcebergStreamWriter.
  return new IcebergStreamWriter<>(table.name(), taskWriterFactory);
}
```
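The file format and target file size passed to the factory come from the Flink write options, with Iceberg table properties as the fallback. A hedged sketch of setting the table-property side of those knobs (the property keys are standard Iceberg table properties; the `Table` handle is assumed to exist):

```java
import org.apache.iceberg.Table;

public class WritePropertiesExample {

  // Adjusts the table properties that feed flinkWriteConf.dataFileFormat()
  // and flinkWriteConf.targetDataFileSize() when no Flink write option overrides them.
  public static void tuneWriteProperties(Table table) {
    table
        .updateProperties()
        .set("write.format.default", "parquet")                                  // data file format
        .set("write.target-file-size-bytes", String.valueOf(256L * 1024 * 1024)) // ~256 MB files
        .commit();
  }
}
```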

### The IcebergStreamWriter class


This class is a Flink `OneInputStreamOperator`, so it has all the characteristics of a regular Flink operator.

```java
class IcebergStreamWriter<T> extends AbstractStreamOperator<WriteResult>
    implements OneInputStreamOperator<T, WriteResult>, BoundedOneInput {
  ...
  @Override
  public void open() {
    ...
    // Initialize the writer metrics.
    this.writerMetrics = new IcebergStreamWriterMetrics(super.metrics, fullTableName);

    // Initialize the taskWriterFactory that creates the actual writers.
    this.taskWriterFactory.initialize(subTaskId, attemptId);

    // Create the writer. There are four variants, chosen by whether the Iceberg table
    // is partitioned and whether upsert mode is enabled:
    //   UnpartitionedWriter            : unpartitioned, insert only
    //   RowDataPartitionedFanoutWriter : partitioned,   insert only
    //   UnpartitionedDeltaWriter       : unpartitioned, upsert
    //   PartitionedDeltaWriter         : partitioned,   upsert
    this.writer = taskWriterFactory.create();
  }

  @Override
  public void processElement(StreamRecord<T> element) throws Exception {
    // Write the incoming record.
    writer.write(element.getValue());
  }

  ...

  // Hand the data files written so far down to the committer for a unified commit.
  private void flush() throws IOException {
    if (writer == null) {
      return;
    }

    long startNano = System.nanoTime();
    WriteResult result = writer.complete();
    writerMetrics.updateFlushResult(result);
    output.collect(new StreamRecord<>(result));
    writerMetrics.flushDuration(TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - startNano));

    writer = null;
  }
}
```
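To make that four-way choice concrete, here is a tiny sketch that mirrors the decision as described in the comments (the real selection lives in `RowDataTaskWriterFactory.create()` and also takes into account whether equality field ids are configured; this just maps the two flags to the writer class names):

```java
public class WriterSelectionSketch {

  // Maps (partitioned, upsert) to the writer variant named in the comments above.
  static String selectWriter(boolean partitioned, boolean upsert) {
    if (upsert) {
      return partitioned ? "PartitionedDeltaWriter" : "UnpartitionedDeltaWriter";
    }
    return partitioned ? "RowDataPartitionedFanoutWriter" : "UnpartitionedWriter";
  }

  public static void main(String[] args) {
    System.out.println(selectWriter(false, false)); // UnpartitionedWriter
    System.out.println(selectWriter(true, false));  // RowDataPartitionedFanoutWriter
    System.out.println(selectWriter(false, true));  // UnpartitionedDeltaWriter
    System.out.println(selectWriter(true, true));   // PartitionedDeltaWriter
  }
}
```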

### The IcebergFilesCommitter class

```java
class IcebergFilesCommitter extends AbstractStreamOperator<Void>
    implements OneInputStreamOperator<WriteResult, Void>, BoundedOneInput {
  ...
  @Override
  public void initializeState(StateInitializationContext context) throws Exception {
    // Maximum number of consecutive empty commits.
    // A commit (and hence a new snapshot) is only triggered after this many consecutive
    // checkpoints without data, which keeps empty snapshots from piling up.
    maxContinuousEmptyCommits =
        PropertyUtil.propertyAsInt(table.properties(), MAX_CONTINUOUS_EMPTY_COMMITS, 10);

    // Create the OutputFileFactory used to write manifest files.
    this.manifestOutputFileFactory =
        FlinkManifestUtil.createOutputFileFactory(
            table, flinkJobId, operatorUniqueId, subTaskId, attemptId);

    if (context.isRestored()) {
      ...
      // Restore the data files that were written but not yet committed from state.
      NavigableMap<Long, byte[]> uncommittedDataFiles =
          Maps.newTreeMap(checkpointsState.get().iterator().next())
              .tailMap(maxCommittedCheckpointId, false);

      if (!uncommittedDataFiles.isEmpty()) {
        // Commit all uncommitted data files from the old Flink job to the Iceberg table.
        long maxUncommittedCheckpointId = uncommittedDataFiles.lastKey();

        // If uncommitted files exist, commit them now.
        commitUpToCheckpoint(
            uncommittedDataFiles, restoredFlinkJobId, operatorUniqueId, maxUncommittedCheckpointId);
      }
    }
  }
```
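The empty-commit threshold read in `initializeState` comes from a table property (default 10). If I read the `MAX_CONTINUOUS_EMPTY_COMMITS` constant correctly, its key is `flink.max-continuous-empty-commits`; a hedged sketch of overriding it on the table:

```java
import org.apache.iceberg.Table;

public class EmptyCommitThresholdExample {

  // Overrides MAX_CONTINUOUS_EMPTY_COMMITS so that an empty snapshot is produced at most
  // once every 50 data-free checkpoints instead of the default 10.
  public static void relaxEmptyCommits(Table table) {
    table
        .updateProperties()
        .set("flink.max-continuous-empty-commits", "50") // property key assumed from the constant
        .commit();
  }
}
```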

```java
  @Override
  public void snapshotState(StateSnapshotContext context) throws Exception {
    ...
    // Turn the data files completed for this checkpointId into a manifest file
    // and record it in dataFilesPerCheckpoint.
    dataFilesPerCheckpoint.put(checkpointId, writeToManifest(checkpointId));

    // Reset the snapshot state to the latest state.
    checkpointsState.clear();
    // Store the map into operator state.
    checkpointsState.add(dataFilesPerCheckpoint);

    jobIdState.clear();
    jobIdState.add(flinkJobId);

    // Clear the local buffer for the current checkpoint.
    writeResultsOfCurrentCkpt.clear();
    committerMetrics.checkpointDuration(
        TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - startNano));
  }

  @Override
  public void notifyCheckpointComplete(long checkpointId) throws Exception {
    ...
    if (checkpointId > maxCommittedCheckpointId) {
      LOG.info("Checkpoint {} completed. Attempting commit.", checkpointId);

      // The checkpoint has completed, so commit its data.
      commitUpToCheckpoint(dataFilesPerCheckpoint, flinkJobId, operatorUniqueId, checkpointId);
      this.maxCommittedCheckpointId = checkpointId;
    } else {
      LOG.info(
          "Skipping committing checkpoint {}. {} is already committed.",
          checkpointId,
          maxCommittedCheckpointId);
    }
  }

  private void commitUpToCheckpoint(
      NavigableMap<Long, byte[]> deltaManifestsMap,
      String newFlinkJobId,
      String operatorId,
      long checkpointId)
      throws IOException {
    // Select the manifests waiting to be committed: everything up to and including checkpointId.
    NavigableMap<Long, byte[]> pendingMap = deltaManifestsMap.headMap(checkpointId, true);

    List<ManifestFile> manifests = Lists.newArrayList();
    NavigableMap<Long, WriteResult> pendingResults = Maps.newTreeMap();
    for (Map.Entry<Long, byte[]> e : pendingMap.entrySet()) {
      // Skip empty manifests.
      if (Arrays.equals(EMPTY_MANIFEST_DATA, e.getValue())) {
        continue;
      }

      DeltaManifests deltaManifests =
          SimpleVersionedSerialization.readVersionAndDeSerialize(
              DeltaManifestsSerializer.INSTANCE, e.getValue());
      pendingResults.put(
          e.getKey(),
          FlinkManifestUtil.readCompletedFiles(deltaManifests, table.io(), table.specs()));
      manifests.addAll(deltaManifests.manifests());
    }

    // Summarize record counts and file sizes of the files pending commit.
    CommitSummary summary = new CommitSummary(pendingResults);
    // Commit the pending data.
    commitPendingResult(pendingResults, summary, newFlinkJobId, operatorId, checkpointId);
    committerMetrics.updateCommitSummary(summary);
    pendingMap.clear();
    // Clean up the manifest files that have now been committed.
    deleteCommittedManifests(manifests, newFlinkJobId, checkpointId);
  }
```
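The committer leans on `NavigableMap` slicing for both restore and commit: `tailMap(maxCommittedCheckpointId, false)` picks up everything strictly newer than the last committed checkpoint, while `headMap(checkpointId, true)` picks up everything up to and including the checkpoint being committed. A tiny self-contained illustration (the checkpoint ids and payloads are made up):

```java
import java.util.NavigableMap;
import java.util.TreeMap;

public class CheckpointMapSlicing {
  public static void main(String[] args) {
    // checkpointId -> serialized manifest bytes (placeholders here).
    NavigableMap<Long, byte[]> dataFilesPerCheckpoint = new TreeMap<>();
    dataFilesPerCheckpoint.put(3L, new byte[] {3});
    dataFilesPerCheckpoint.put(4L, new byte[] {4});
    dataFilesPerCheckpoint.put(5L, new byte[] {5});

    long maxCommittedCheckpointId = 3L;

    // Restore path: everything strictly after the last committed checkpoint.
    NavigableMap<Long, byte[]> uncommitted =
        dataFilesPerCheckpoint.tailMap(maxCommittedCheckpointId, false);
    System.out.println(uncommitted.keySet()); // [4, 5]

    // Commit path: everything up to and including the checkpoint being committed.
    NavigableMap<Long, byte[]> pending = dataFilesPerCheckpoint.headMap(5L, true);
    System.out.println(pending.keySet()); // [3, 4, 5]
  }
}
```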

```java
  private void commitPendingResult(
      NavigableMap<Long, WriteResult> pendingResults,
      CommitSummary summary,
      String newFlinkJobId,
      String operatorId,
      long checkpointId) {
    ...
    continuousEmptyCheckpoints = totalFiles == 0 ? continuousEmptyCheckpoints + 1 : 0;

    // Commit when there are data files, or when the number of consecutive empty
    // checkpoints reaches the configured threshold.
    if (totalFiles != 0 || continuousEmptyCheckpoints % maxContinuousEmptyCommits == 0) {
      if (replacePartitions) {
        // Overwrite commit, using newReplacePartitions().
        replacePartitions(pendingResults, summary, newFlinkJobId, operatorId, checkpointId);
      } else {
        // Regular commit, using newAppend() (or newRowDelta() for v2 tables).
        commitDeltaTxn(pendingResults, summary, newFlinkJobId, operatorId, checkpointId);
      }
      continuousEmptyCheckpoints = 0;
      ...
    }
    ...
  }

  private void replacePartitions(
      NavigableMap<Long, WriteResult> pendingResults,
      CommitSummary summary,
      String newFlinkJobId,
      String operatorId,
      long checkpointId) {
    Preconditions.checkState(
        summary.deleteFilesCount() == 0, "Cannot overwrite partitions with delete files.");

    // Commit via newReplacePartitions().
    ReplacePartitions dynamicOverwrite = table.newReplacePartitions().scanManifestsWith(workerPool);
    for (WriteResult result : pendingResults.values()) {
      Preconditions.checkState(
          result.referencedDataFiles().length == 0, "Should have no referenced data files.");
      Arrays.stream(result.dataFiles()).forEach(dynamicOverwrite::addFile);
    }
    ...
  }

  private void commitDeltaTxn(
      NavigableMap<Long, WriteResult> pendingResults,
      CommitSummary summary,
      String newFlinkJobId,
      String operatorId,
      long checkpointId) {
    if (summary.deleteFilesCount() == 0) {
      // To be compatible with iceberg format V1.
      AppendFiles appendFiles = table.newAppend().scanManifestsWith(workerPool);
      for (WriteResult result : pendingResults.values()) {
        Preconditions.checkState(
            result.referencedDataFiles().length == 0,
            "Should have no referenced data files for append.");
        Arrays.stream(result.dataFiles()).forEach(appendFiles::appendFile);
      }
      commitOperation(appendFiles, summary, "append", newFlinkJobId, operatorId, checkpointId);
    } else {
      // To be compatible with iceberg format V2.
      for (Map.Entry<Long, WriteResult> e : pendingResults.entrySet()) {
        // We don't commit the merged result in a single transaction because, for sequential
        // transactions txn1 and txn2, the equality-delete files of txn2 must be applied to the
        // data files from txn1. Committing the merged result would lead to incorrect delete
        // semantics.
        WriteResult result = e.getValue();

        RowDelta rowDelta = table.newRowDelta().scanManifestsWith(workerPool);
        // Add the data files and the delete files separately.
        Arrays.stream(result.dataFiles()).forEach(rowDelta::addRows);
        Arrays.stream(result.deleteFiles()).forEach(rowDelta::addDeletes);
        commitOperation(rowDelta, summary, "rowDelta", newFlinkJobId, operatorId, e.getKey());
      }
    }
  }
```
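Outside of the Flink operator, the same two commit paths can be exercised directly against the Iceberg `Table` API, which is a handy way to reason about what the committer produces. A hedged sketch (the `DataFile`/`DeleteFile` instances are assumed to come from completed writers, as in `WriteResult` above):

```java
import org.apache.iceberg.AppendFiles;
import org.apache.iceberg.DataFile;
import org.apache.iceberg.DeleteFile;
import org.apache.iceberg.RowDelta;
import org.apache.iceberg.Table;

public class CommitPathsSketch {

  // V1-style path: data files only, committed as a plain append snapshot.
  static void appendOnly(Table table, DataFile dataFile) {
    AppendFiles append = table.newAppend();
    append.appendFile(dataFile);
    append.commit();
  }

  // V2-style path: data files plus delete files committed as one row-delta snapshot.
  // The committer issues one such commit per checkpoint so that later equality deletes
  // still apply to the data files of earlier checkpoints.
  static void rowDelta(Table table, DataFile dataFile, DeleteFile deleteFile) {
    RowDelta delta = table.newRowDelta();
    delta.addRows(dataFile);
    delta.addDeletes(deleteFile);
    delta.commit();
  }
}
```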

```java
  private void commitOperation(
      SnapshotUpdate<?> operation,
      CommitSummary summary,
      String description,
      String newFlinkJobId,
      String operatorId,
      long checkpointId) {
    ...
    // Perform the commit.
    operation.commit(); // abort is automatically called if this fails.
    ...
    committerMetrics.commitDuration(durationMs);
  }

  @Override
  public void processElement(StreamRecord<WriteResult> element) throws Exception {
    final WriteResult value = element.getValue();
    if ("DDL".equalsIgnoreCase(value.getType())) {
      // DDL results are buffered and then committed immediately.
      this.writeResultsOfCurrentCkpt.add(element.getValue());
      doCommit();
    } else {
      // Regular write results are buffered until the next checkpoint.
      this.writeResultsOfCurrentCkpt.add(element.getValue());
    }
  }

  ...
}
```
