Flink CDC系列之:Kafka Debezium JSON 序列化器的实现DebeziumJsonSerializationSchema
这是一个 Debezium JSON 序列化器的实现,负责将 Flink CDC 事件转换为标准的 Debezium JSON 格式。
类概述
java
public class DebeziumJsonSerializationSchema implements SerializationSchema<Event>
这个类将 Flink CDC 的内部事件格式转换为符合 Debezium 标准的 JSON 格式。
核心常量定义
操作类型常量
java
private static final StringData OP_INSERT = StringData.fromString("c"); // insert
private static final StringData OP_DELETE = StringData.fromString("d"); // delete
private static final StringData OP_UPDATE = StringData.fromString("u"); // update
Debezium 操作代码:
- "c":Create/Insert
- "d":Delete
- "u":Update
核心属性
java
private final Map<TableId, TableSchemaInfo> jsonSerializers; // per-table schema + serializer cache
private transient GenericRowData reuseGenericRowData; // reusable top-level row (avoids per-record allocation)
private transient GenericRowData payloadGenericRowData; // payload sub-row [before, after, op, source]
// JSON formatting options
private final TimestampFormat timestampFormat;
private final JsonFormatOptions.MapNullKeyMode mapNullKeyMode;
private final String mapNullKeyLiteral;
private final boolean encodeDecimalAsPlainNumber;
private final boolean ignoreNullFields;
private final boolean isIncludedDebeziumSchema; // whether to emit the Debezium "schema" envelope
private final ZoneId zoneId;
private Map<TableId, String> schemaMap = new HashMap<>(); // cached Debezium JSON schema strings per table
JsonConverter jsonConverter; // Kafka Connect JSON converter (created only in schema-included mode)
构造函数
java
/**
 * Creates a Debezium JSON serialization schema.
 *
 * @param timestampFormat format used when rendering timestamp values
 * @param mapNullKeyMode strategy for handling null keys in map-typed fields
 * @param mapNullKeyLiteral literal substituted for null map keys (when mode requires one)
 * @param zoneId session time zone used for temporal conversions
 * @param encodeDecimalAsPlainNumber whether decimals are written as plain numbers
 * @param ignoreNullFields whether null fields are omitted from the JSON output
 * @param isIncludedDebeziumSchema whether to emit the Debezium "schema" envelope
 */
public DebeziumJsonSerializationSchema(
    TimestampFormat timestampFormat,
    JsonFormatOptions.MapNullKeyMode mapNullKeyMode,
    String mapNullKeyLiteral,
    ZoneId zoneId,
    boolean encodeDecimalAsPlainNumber,
    boolean ignoreNullFields,
    boolean isIncludedDebeziumSchema) {
    // Initialize all configuration parameters (assignments elided in this excerpt).
}
核心方法详解
open() - 初始化方法
java
/**
 * Initializes the reusable row containers and, in schema-included mode, the Kafka
 * Connect JSON converter used to render the schema string.
 */
@Override
public void open(InitializationContext context) {
    if (isIncludedDebeziumSchema) {
        // Schema-included mode: top-level row has 2 fields [schema, payload].
        reuseGenericRowData = new GenericRowData(2);
        payloadGenericRowData = new GenericRowData(4); // [before, after, op, source]
        reuseGenericRowData.setField(PAYLOAD.getPosition(), payloadGenericRowData);
        // Configure the Kafka Connect JSON converter (value-side conversion).
        this.jsonConverter = new JsonConverter();
        final HashMap<String, Object> configs = new HashMap<>(2);
        configs.put(ConverterConfig.TYPE_CONFIG, ConverterType.VALUE.getName());
        jsonConverter.configure(configs);
    } else {
        // Schema-less mode: top-level row is the payload itself [before, after, op, source].
        reuseGenericRowData = new GenericRowData(4);
    }
    this.context = context;
}
serialize() - 主序列化方法
java
@Override
public byte[] serialize(Event event) {
if (event instanceof SchemaChangeEvent) {
handleSchemaChangeEvent((SchemaChangeEvent) event); // 处理Schema变更
return null;
}
DataChangeEvent dataChangeEvent = (DataChangeEvent) event;
return serializeDataChangeEvent(dataChangeEvent); // 处理数据变更
}
Schema 变更处理
java
/**
 * Handles a schema change: resolves the table's new schema, caches its Debezium
 * JSON schema string (schema-included mode only), and (re)builds the per-table
 * row serializer.
 *
 * <p>Fix: the original body referenced an undefined variable {@code event};
 * the parameter is named {@code schemaChangeEvent}.
 */
private void handleSchemaChangeEvent(SchemaChangeEvent schemaChangeEvent) {
    Schema schema;
    if (schemaChangeEvent instanceof CreateTableEvent) {
        // New table: take the complete schema from the create event.
        CreateTableEvent createTableEvent = (CreateTableEvent) schemaChangeEvent;
        schema = createTableEvent.getSchema();
    } else {
        // Existing table: apply the change on top of the previously cached schema.
        schema = SchemaUtils.applySchemaChangeEvent(
                jsonSerializers.get(schemaChangeEvent.tableId()).getSchema(),
                schemaChangeEvent);
    }
    // Cache the Debezium JSON schema string when the schema envelope is emitted.
    if (isIncludedDebeziumSchema) {
        schemaMap.put(schemaChangeEvent.tableId(), convertSchemaToDebeziumSchema(schema));
    }
    // Build a row serializer matching the new schema's row type.
    LogicalType rowType = DataTypeUtils.toFlinkDataType(schema.toRowDataType()).getLogicalType();
    DebeziumJsonRowDataSerializationSchema jsonSerializer =
            new DebeziumJsonRowDataSerializationSchema(
                    createJsonRowType(fromLogicalToDataType(rowType), isIncludedDebeziumSchema),
                    timestampFormat,
                    mapNullKeyMode,
                    mapNullKeyLiteral,
                    encodeDecimalAsPlainNumber,
                    ignoreNullFields,
                    isIncludedDebeziumSchema);
    // Initialize and cache the serializer for subsequent data change events.
    jsonSerializer.open(context);
    jsonSerializers.put(schemaChangeEvent.tableId(),
            new TableSchemaInfo(schemaChangeEvent.tableId(), schema, jsonSerializer, zoneId));
}
数据变更事件处理
java
/**
 * Serializes a data change event by populating the reusable row(s) for the
 * operation type and delegating to the table's cached row serializer.
 */
private byte[] serializeDataChangeEvent(DataChangeEvent dataChangeEvent) {
    // Target row: the payload sub-row when a schema envelope is emitted,
    // otherwise the top-level row itself.
    GenericRowData targetRow =
            isIncludedDebeziumSchema ? payloadGenericRowData : reuseGenericRowData;
    // Populate [before, after, op, source] according to the operation type.
    switch (dataChangeEvent.op()) {
        case INSERT:
            convertInsertEventToRowData(dataChangeEvent, targetRow);
            break;
        case DELETE:
            convertDeleteEventToRowData(dataChangeEvent, targetRow);
            break;
        case UPDATE:
        case REPLACE:
            convertUpdateEventToRowData(dataChangeEvent, targetRow);
            break;
        default:
            throw new UnsupportedOperationException("Unsupported operation");
    }
    // Attach the cached Debezium JSON schema string in schema-included mode.
    if (isIncludedDebeziumSchema) {
        reuseGenericRowData.setField(
                SCHEMA.getPosition(),
                StringData.fromString(schemaMap.get(dataChangeEvent.tableId())));
    }
    // Serialize via the per-table row serializer created on schema change.
    return jsonSerializers
            .get(dataChangeEvent.tableId())
            .getSerializationSchema()
            .serialize(reuseGenericRowData);
}
事件转换方法
Insert 事件转换
java
/** Fills the row for an INSERT: no before image, after image, op "c", source struct. */
private void convertInsertEventToRowData(
        DataChangeEvent dataChangeEvent, GenericRowData genericRowData) {
    TableId tableId = dataChangeEvent.tableId();
    TableSchemaInfo schemaInfo = jsonSerializers.get(tableId);
    // Inserts carry only an "after" image; "before" is explicitly null.
    genericRowData.setField(BEFORE.getPosition(), null);
    genericRowData.setField(
            AFTER.getPosition(),
            schemaInfo.getRowDataFromRecordData(dataChangeEvent.after(), false));
    genericRowData.setField(OPERATION.getPosition(), OP_INSERT);
    genericRowData.setField(
            SOURCE.getPosition(),
            GenericRowData.of(
                    StringData.fromString(tableId.getSchemaName()),
                    StringData.fromString(tableId.getTableName())));
}
Delete 事件转换
java
/**
 * Fills the row for a DELETE: before image, no after image, op "d", source struct.
 *
 * <p>Fix: the source field was never assigned. Because the row object is reused
 * across records, leaving it untouched would carry over the previous event's
 * source — set it on every event, consistent with the insert/update converters.
 */
private void convertDeleteEventToRowData(
        DataChangeEvent dataChangeEvent, GenericRowData genericRowData) {
    genericRowData.setField(
            BEFORE.getPosition(),
            jsonSerializers.get(dataChangeEvent.tableId())
                    .getRowDataFromRecordData(dataChangeEvent.before(), false));
    genericRowData.setField(AFTER.getPosition(), null); // deletes have no after image
    genericRowData.setField(OPERATION.getPosition(), OP_DELETE);
    genericRowData.setField(
            SOURCE.getPosition(),
            GenericRowData.of(
                    StringData.fromString(dataChangeEvent.tableId().getSchemaName()),
                    StringData.fromString(dataChangeEvent.tableId().getTableName())));
}
Update 事件转换
java
/** Fills the row for an UPDATE/REPLACE: before + after images, op "u", source struct. */
private void convertUpdateEventToRowData(
        DataChangeEvent dataChangeEvent, GenericRowData genericRowData) {
    TableId tableId = dataChangeEvent.tableId();
    TableSchemaInfo schemaInfo = jsonSerializers.get(tableId);
    // Updates carry both images: the row as it was and the row as it is now.
    genericRowData.setField(
            BEFORE.getPosition(),
            schemaInfo.getRowDataFromRecordData(dataChangeEvent.before(), false));
    genericRowData.setField(
            AFTER.getPosition(),
            schemaInfo.getRowDataFromRecordData(dataChangeEvent.after(), false));
    genericRowData.setField(OPERATION.getPosition(), OP_UPDATE);
    genericRowData.setField(
            SOURCE.getPosition(),
            GenericRowData.of(
                    StringData.fromString(tableId.getSchemaName()),
                    StringData.fromString(tableId.getTableName())));
}
Schema 转换核心方法
convertSchemaToDebeziumSchema() - 转换为Debezium Schema
java
/**
 * Converts a CDC table schema into a Debezium-style Connect schema and renders it
 * as a JSON schema string via the Kafka Connect {@code JsonConverter}.
 *
 * <p>Fixes: the result of {@code schemaBuilder.build()} was discarded and the
 * mutable builder was serialized instead — serialize the built, immutable schema.
 * {@code optional()} was also redundantly re-applied to the before/after builders
 * on every loop iteration; it is applied once, outside the loop.
 *
 * @param schema the CDC table schema to convert
 * @return the Debezium JSON schema string for the before/after envelope
 */
public String convertSchemaToDebeziumSchema(Schema schema) {
    List<Column> columns = schema.getColumns();
    SchemaBuilder schemaBuilder = SchemaBuilder.struct();
    SchemaBuilder beforeBuilder = SchemaBuilder.struct();
    SchemaBuilder afterBuilder = SchemaBuilder.struct();
    // Map each column to its Debezium field type in both images.
    for (Column column : columns) {
        SchemaBuilder field = convertCDCDataTypeToDebeziumDataType(column);
        beforeBuilder.field(column.getName(), field);
        afterBuilder.field(column.getName(), field);
    }
    // before/after are optional: inserts have no before image, deletes no after.
    beforeBuilder.optional();
    afterBuilder.optional();
    schemaBuilder.field("before", beforeBuilder);
    schemaBuilder.field("after", afterBuilder);
    // Serialize the finalized (immutable) schema, not the mutable builder.
    return jsonConverter.asJsonSchema(schemaBuilder.build()).toString();
}
convertCDCDataTypeToDebeziumDataType() - 数据类型映射
这是最复杂的方法,将 Flink CDC 类型映射到 Debezium 类型:
java
/**
 * Maps a CDC column type to the corresponding Kafka Connect / Debezium schema
 * builder, carrying over nullability, default value, and comment.
 */
private static SchemaBuilder convertCDCDataTypeToDebeziumDataType(Column column) {
    org.apache.flink.cdc.common.types.DataType columnType = column.getType();
    final SchemaBuilder field;
    switch (columnType.getTypeRoot()) {
        case TINYINT:
        case SMALLINT:
            field = SchemaBuilder.int16(); // 16-bit integer
            break;
        case INTEGER:
            field = SchemaBuilder.int32(); // 32-bit integer
            break;
        case BIGINT:
            field = SchemaBuilder.int64(); // 64-bit integer
            break;
        case DECIMAL:
            final int decimalPrecision = ((DecimalType) columnType).getPrecision();
            final int decimalScale = ((DecimalType) columnType).getScale();
            // Connect Decimal logical type; precision is carried as a parameter.
            field = Decimal.builder(decimalScale)
                .parameter("connect.decimal.precision", String.valueOf(decimalPrecision));
            break;
        case BOOLEAN:
            field = SchemaBuilder.bool(); // boolean
            break;
        case FLOAT:
            field = SchemaBuilder.float32(); // 32-bit float
            break;
        case DOUBLE:
            field = SchemaBuilder.float64(); // 64-bit float
            break;
        case DATE:
            field = SchemaBuilder.int32().name(Date.SCHEMA_NAME).version(1); // date as epoch days
            break;
        case TIME_WITHOUT_TIME_ZONE:
            field = SchemaBuilder.int64().name(MicroTime.SCHEMA_NAME).version(1); // time in microseconds
            break;
        case TIMESTAMP_WITHOUT_TIME_ZONE:
        case TIMESTAMP_WITH_TIME_ZONE:
            // NOTE(review): this cast assumes TIMESTAMP_WITH_TIME_ZONE is also backed by
            // TimestampType; if it is a distinct type (e.g. ZonedTimestampType) this throws
            // ClassCastException — confirm against the CDC type hierarchy.
            int timestampPrecision = ((TimestampType) columnType).getPrecision();
            if (timestampPrecision > 3) {
                field = SchemaBuilder.int64().name(MicroTimestamp.SCHEMA_NAME).version(1); // microsecond timestamp
            } else {
                field = SchemaBuilder.int64().name(Timestamp.SCHEMA_NAME).version(1); // millisecond timestamp
            }
            break;
        case TIMESTAMP_WITH_LOCAL_TIME_ZONE:
            field = SchemaBuilder.string().name(ZonedTimestamp.SCHEMA_NAME).version(1); // ISO string with zone
            break;
        case BINARY:
        case VARBINARY:
            // NOTE(review): Bits is Debezium's bit-string logical type; plain bytes might be
            // expected for BINARY/VARBINARY — confirm intended mapping.
            field = SchemaBuilder.bytes().name(Bits.LOGICAL_NAME)
                .parameter(Bits.LENGTH_FIELD,
                    Integer.toString(DataTypes.getLength(columnType).orElse(0)))
                .version(1);
            break;
        case CHAR:
        case VARCHAR:
        default:
            field = SchemaBuilder.string(); // string (also the fallback for unmapped types)
    }
    // Carry over nullability.
    if (columnType.isNullable()) {
        field.optional();
    } else {
        field.required();
    }
    // Carry over default value and comment.
    // NOTE(review): this passes the default-value *expression* string as the Connect
    // default; for non-string field types the value may not match the schema type — verify.
    if (column.getDefaultValueExpression() != null) {
        field.defaultValue(column.getDefaultValueExpression());
    }
    if (column.getComment() != null) {
        field.doc(column.getComment());
    }
    return field;
}
createJsonRowType() - 创建JSON行类型
java
/**
 * Builds the Flink row type of the serialized JSON: either the bare payload
 * [before, after, op, source], or [schema, payload] when the Debezium schema
 * envelope is included.
 */
private static RowType createJsonRowType(DataType databaseSchema, boolean isIncludedDebeziumSchema) {
    // Debezium source struct identifying the originating database and table.
    DataType sourceRowType =
            DataTypes.ROW(
                    DataTypes.FIELD(DATABASE.getFieldName(), DataTypes.STRING()),
                    DataTypes.FIELD(TABLE.getFieldName(), DataTypes.STRING()));
    // Payload envelope: [before, after, op, source].
    DataType payloadRowType =
            DataTypes.ROW(
                    DataTypes.FIELD(BEFORE.getFieldName(), databaseSchema),
                    DataTypes.FIELD(AFTER.getFieldName(), databaseSchema),
                    DataTypes.FIELD(OPERATION.getFieldName(), DataTypes.STRING()),
                    DataTypes.FIELD(SOURCE.getFieldName(), sourceRowType));
    if (!isIncludedDebeziumSchema) {
        return (RowType) payloadRowType.getLogicalType();
    }
    // Schema-included mode wraps the payload alongside a JSON-schema string field.
    return (RowType)
            DataTypes.ROW(
                    DataTypes.FIELD(SCHEMA.getFieldName(), DataTypes.STRING()),
                    DataTypes.FIELD(PAYLOAD.getFieldName(), payloadRowType))
            .getLogicalType();
}
生成的 JSON 格式
包含 Schema 的完整格式
json
{
"schema": {
"type": "struct",
"fields": [
{
"type": "struct",
"fields": [
{"type": "int32", "optional": false, "field": "id"},
{"type": "string", "optional": true, "field": "name"}
],
"optional": true,
"name": "before",
"field": "before"
},
{
"type": "struct",
"fields": [
{"type": "int32", "optional": false, "field": "id"},
{"type": "string", "optional": true, "field": "name"}
],
"optional": true,
"name": "after",
"field": "after"
}
],
"optional": false,
"name": "envelope"
},
"payload": {
"before": null,
"after": {
"id": 1001,
"name": "张三"
},
"op": "c",
"source": {
"db": "inventory",
"table": "users"
}
}
}
不包含 Schema 的简化格式
json
{
"before": null,
"after": {
"id": 1001,
"name": "张三"
},
"op": "c",
"source": {
"db": "inventory",
"table": "users"
}
}
设计特点
双重模式支持
- 包含Schema模式:完整的Debezium格式,包含表结构信息
- 不包含Schema模式:简化的payload格式,减少数据量
类型系统映射
- 将 Flink CDC 类型系统完整映射到 Debezium 类型系统,确保数据类型兼容性。
性能优化
- 使用可重用的 GenericRowData 对象减少内存分配
- 预编译 Schema 信息避免重复计算
- 缓存序列化器提升处理效率
总结:这个 DebeziumJsonSerializationSchema 是一个功能完整的序列化器,它:
- ✅ 生成标准Debezium格式:符合Debezium JSON规范
- ✅ 支持完整类型映射:将Flink CDC类型转换为Debezium类型
- ✅ 灵活的Schema包含:支持包含或不包含表结构信息
- ✅ 高性能设计:重用对象和缓存优化性能
- ✅ 完整的CDC支持:支持Insert、Update、Delete所有操作类型
- ✅ 生产就绪:包含完整的错误处理和资源管理
这使得 Flink CDC 能够生成与原生 Debezium 完全兼容的 JSON 数据,便于与现有的 Debezium 生态集成。