SeaTunnel扩展Transform插件,自定义转换插件

代码结构

在seatunnel-transforms-v2模块中新建一个包,并在该包下新建XXXTransform、XXXTransformConfig、XXXTransformFactory三个类

自定义转换插件功能说明

这是个适配KafkaSource的转换插件,接收到的原文格式为:

{"path":"xxx.log.gz","code":"011","cont":"{\"ID\":\"1\",\"NAME\":\"zhangsan\",\"TABLE\":\"USER\",\"create_time\":\"20230904\"}","timestamp":"20230823160246"}

需要转换为只保留cont里面的数据,即:

{"create_time":"20230904","NAME":"zhangsan","TABLE":"USER","ID":"1"}

任务配置文件

env {

  # You can set engine configuration here STREAMING BATCH

  execution.parallelism = 1

  job.mode = "STREAMING"



  #execution.checkpoint.data-uri = "hdfs://localhost:9000/checkpoint"

}



source {

  # This is a example source plugin **only for test and demonstrate the feature source plugin**

   Kafka {

            bootstrap.servers = "xxxxx:9092"

            topic = "test_in2"

            consumer.group = "167321237613"

            format="text"

            result_table_name="kafka"

        }

}



transform {

    ExtractFromCJ {

    source_table_name="kafka"

    result_table_name="kafka1"

    schema = {

        fields {

                NAME = "string"

                TABLE = "string"

                create_time = "string"

                ID="string"

            }

        }

    }

}



sink {

  kafka {

      source_table_name="kafka1"

      topic = "test_out2"

      bootstrap.servers = "xxxx:9092"

      kafka.request.timeout.ms = 60000

      semantics = EXACTLY_ONCE

  }

}

代码说明

XXXTransformConfig代码,这个类主要用来保存transform的配置项

package org.apache.seatunnel.transform.extract;



import lombok.Getter;

import lombok.Setter;

import org.apache.seatunnel.api.configuration.Option;

import org.apache.seatunnel.api.configuration.Options;

import org.apache.seatunnel.api.configuration.ReadonlyConfig;



import java.io.Serializable;

import java.util.Map;



/**
 * Serializable holder for the options of the {@code ExtractFromCJ} transform.
 *
 * <p>The Lombok annotations generate the getter and setter for {@link #fieldColumns}.
 */
@Getter
@Setter
public class ExtractFromCJTransformConfig implements Serializable {

    /** Map-typed option read from the {@code schema.fields} key; declared without a default. */
    public static final Option<Map<String, String>> SCHEMA =
            Options.key("schema.fields")
                    .mapType()
                    .noDefaultValue()
                    .withDescription(
                            "Specify the field mapping relationship between input and output");

    /** Value of {@link #SCHEMA} as read from the plugin options. */
    private Map<String, String> fieldColumns;

    /**
     * Creates a config instance and fills it from the given plugin options.
     *
     * @param config plugin options to read {@link #SCHEMA} from
     * @return a new config whose {@code fieldColumns} is whatever {@code config.get(SCHEMA)} yields
     */
    public static ExtractFromCJTransformConfig of(ReadonlyConfig config) {
        ExtractFromCJTransformConfig result = new ExtractFromCJTransformConfig();
        result.setFieldColumns(config.get(SCHEMA));
        return result;
    }
}

XXXTransformFactory说明,工厂类,主要用来初始化具体的转换类

package org.apache.seatunnel.transform.extract;



import com.google.auto.service.AutoService;

import org.apache.seatunnel.api.configuration.ReadonlyConfig;

import org.apache.seatunnel.api.configuration.util.OptionRule;

import org.apache.seatunnel.api.table.catalog.CatalogTable;

import org.apache.seatunnel.api.table.connector.TableTransform;

import org.apache.seatunnel.api.table.factory.Factory;

import org.apache.seatunnel.api.table.factory.TableFactoryContext;

import org.apache.seatunnel.api.table.factory.TableTransformFactory;



/**
 * Factory for the {@code ExtractFromCJ} transform, registered as a {@link Factory} service
 * through {@code @AutoService}.
 */
@AutoService(Factory.class)
public class ExtractFromCJTransformFactory implements TableTransformFactory {

    /** @return the identifier under which this factory is looked up */
    @Override
    public String factoryIdentifier() {
        return "ExtractFromCJ";
    }

    /** @return a rule that declares {@link ExtractFromCJTransformConfig#SCHEMA} as optional */
    @Override
    public OptionRule optionRule() {
        return OptionRule.builder()
                .optional(ExtractFromCJTransformConfig.SCHEMA)
                .build();
    }

    /**
     * Reads the catalog table and options from the context up front and returns a
     * {@link TableTransform} that builds a new {@link ExtractFromCJTransform} from them each time
     * it is asked for one.
     *
     * @param context factory context supplying the input catalog table and the plugin options
     * @return a supplier-style {@link TableTransform} for {@link ExtractFromCJTransform}
     */
    @Override
    public TableTransform createTransform(TableFactoryContext context) {
        final CatalogTable inputTable = context.getCatalogTable();
        final ExtractFromCJTransformConfig transformConfig =
                ExtractFromCJTransformConfig.of(context.getOptions());
        return () -> new ExtractFromCJTransform(transformConfig, inputTable);
    }
}

XXXTransform,具体的转换类,主要用于对source数据的处理,还有数据结构类型的保存

package org.apache.seatunnel.transform.extract;



import cn.hutool.core.collection.CollUtil;

import cn.hutool.json.JSONObject;

import cn.hutool.json.JSONUtil;

import com.google.auto.service.AutoService;

import lombok.NoArgsConstructor;

import lombok.NonNull;

import lombok.extern.slf4j.Slf4j;

import org.apache.seatunnel.api.configuration.ReadonlyConfig;

import org.apache.seatunnel.api.configuration.util.ConfigValidator;

import org.apache.seatunnel.api.table.catalog.CatalogTable;

import org.apache.seatunnel.api.table.catalog.CatalogTableUtil;

import org.apache.seatunnel.api.table.catalog.Column;

import org.apache.seatunnel.api.table.catalog.ConstraintKey;

import org.apache.seatunnel.api.table.catalog.PhysicalColumn;

import org.apache.seatunnel.api.table.catalog.PrimaryKey;

import org.apache.seatunnel.api.table.catalog.TableIdentifier;

import org.apache.seatunnel.api.table.catalog.TableSchema;

import org.apache.seatunnel.api.table.type.SeaTunnelDataType;

import org.apache.seatunnel.api.table.type.SeaTunnelRow;

import org.apache.seatunnel.api.table.type.SeaTunnelRowType;

import org.apache.seatunnel.api.transform.SeaTunnelTransform;

import org.apache.seatunnel.shade.com.typesafe.config.Config;

import org.apache.seatunnel.transform.common.AbstractCatalogSupportTransform;



import java.util.ArrayList;

import java.util.List;

import java.util.stream.Collectors;



/**
 * Transform that replaces every incoming row with the data found under the {@code cont} key of
 * the JSON text held in the row's first field.
 *
 * <p>When {@code schema.fields} is configured, one output field is produced per configured key
 * (in the iteration order of the configured map) and each value is read from {@code cont} as a
 * string. When it is not configured, the output row has a single field containing {@code cont}
 * serialized back to a JSON string.
 *
 * <p>Rows whose first field is missing, is not a JSON object, or has no non-empty {@code cont}
 * entry are emitted with an empty field array.
 */
@AutoService(SeaTunnelTransform.class)
@NoArgsConstructor
@Slf4j
public class ExtractFromCJTransform extends AbstractCatalogSupportTransform {

    private ExtractFromCJTransformConfig config;

    // NOTE(review): never assigned or read in this class; presumably it shadows a field of the
    // same name in a parent class. Kept to leave the class's protected surface unchanged.
    protected SeaTunnelRowType inputRowType;

    /** @return the plugin name, identical to the factory identifier */
    @Override
    public String getPluginName() {
        return "ExtractFromCJ";
    }

    /**
     * Creates the transform from an already-parsed config and the input catalog table.
     *
     * @param config parsed transform options
     * @param catalogTable catalog table of the upstream data
     */
    public ExtractFromCJTransform(
            @NonNull ExtractFromCJTransformConfig config, @NonNull CatalogTable catalogTable) {
        super(catalogTable);
        this.config = config;
    }

    /**
     * Validates the raw plugin config against the factory's option rule and stores the parsed
     * result.
     *
     * @param pluginConfig raw plugin configuration
     */
    @Override
    protected void setConfig(Config pluginConfig) {
        ReadonlyConfig readonlyConfig = ReadonlyConfig.fromConfig(pluginConfig);
        ConfigValidator.of(readonlyConfig)
                .validate(new ExtractFromCJTransformFactory().optionRule());
        this.config = ExtractFromCJTransformConfig.of(readonlyConfig);
    }

    /**
     * Describes the row type produced by {@link #transformRow(SeaTunnelRow)}.
     *
     * <p>Without configured fields the input type is returned unchanged. With configured fields
     * the type is built from them, so that it matches both the rows actually emitted and the
     * columns returned by {@link #transformTableSchema()}.
     *
     * @param inputRowType row type of the upstream data
     * @return the output row type
     */
    @Override
    protected SeaTunnelRowType transformRowType(SeaTunnelRowType inputRowType) {
        if (CollUtil.isEmpty(this.config.getFieldColumns())) {
            return inputRowType;
        }
        int fieldCount = this.config.getFieldColumns().size();
        String[] fieldNames = new String[fieldCount];
        SeaTunnelDataType<?>[] fieldTypes = new SeaTunnelDataType<?>[fieldCount];
        int index = 0;
        for (String key : this.config.getFieldColumns().keySet()) {
            fieldNames[index] = key;
            fieldTypes[index] =
                    CatalogTableUtil.parseDataType(this.config.getFieldColumns().get(key));
            index++;
        }
        return new SeaTunnelRowType(fieldNames, fieldTypes);
    }

    /**
     * Builds the output row from the first field of the input row, carrying over the row kind and
     * table id.
     *
     * @param inputRow upstream row; its first field is expected to hold the JSON text
     * @return a new row holding the extracted values (possibly with zero fields, see class doc)
     */
    @Override
    protected SeaTunnelRow transformRow(SeaTunnelRow inputRow) {
        Object[] fields = inputRow.getFields();
        Object content = (fields == null || fields.length == 0) ? null : fields[0];

        SeaTunnelRow outputRow = new SeaTunnelRow(extractValues(content));
        outputRow.setRowKind(inputRow.getRowKind());
        outputRow.setTableId(inputRow.getTableId());
        return outputRow;
    }

    /**
     * Extracts the output values from the raw message content.
     *
     * @param content first field of the input row, may be {@code null}
     * @return the output field values, or an empty array when nothing can be extracted
     */
    private Object[] extractValues(Object content) {
        if (content == null) {
            log.debug("Skipping extraction: first field of the row is null");
            return new Object[0];
        }
        String data = content.toString();
        // Only a JSON object can carry a "cont" key; a JSON array would make parseObj fail.
        if (!JSONUtil.isJsonObj(data)) {
            log.debug("Skipping extraction: content is not a JSON object");
            return new Object[0];
        }
        // getJSONObject yields null when the key is absent, so guard before calling isEmpty().
        JSONObject cont = JSONUtil.parseObj(data).getJSONObject("cont");
        if (cont == null || cont.isEmpty()) {
            log.debug("Skipping extraction: \"cont\" is missing or empty");
            return new Object[0];
        }
        if (CollUtil.isEmpty(this.config.getFieldColumns())) {
            // No field list configured: pass the whole "cont" object on as one JSON string.
            return new Object[] {JSONUtil.toJsonStr(cont)};
        }
        Object[] values = new Object[this.config.getFieldColumns().size()];
        int index = 0;
        for (String key : this.config.getFieldColumns().keySet()) {
            // Values are always read as strings, regardless of the configured type name.
            values[index++] = cont.getStr(key);
        }
        return values;
    }

    /**
     * Builds the output table schema: the primary key and constraint keys are copied from the
     * input table; the columns are the input columns when no fields are configured, otherwise one
     * column per configured field.
     *
     * @return the output table schema
     */
    @Override
    protected TableSchema transformTableSchema() {
        TableSchema inputSchema = inputCatalogTable.getTableSchema();

        List<ConstraintKey> outputConstraintKeys =
                inputSchema.getConstraintKeys().stream()
                        .map(ConstraintKey::copy)
                        .collect(Collectors.toList());
        PrimaryKey copiedPrimaryKey =
                inputSchema.getPrimaryKey() == null ? null : inputSchema.getPrimaryKey().copy();

        List<Column> outputColumns;
        if (CollUtil.isEmpty(this.config.getFieldColumns())) {
            outputColumns = inputSchema.getColumns();
        } else {
            outputColumns = new ArrayList<>();
            for (String key : this.config.getFieldColumns().keySet()) {
                SeaTunnelDataType<?> dataType =
                        CatalogTableUtil.parseDataType(this.config.getFieldColumns().get(key));
                // NOTE(review): trailing arguments presumably mean length 0, nullable, no default
                // value, no comment — confirm against PhysicalColumn.of.
                outputColumns.add(PhysicalColumn.of(key, dataType, 0, true, null, null));
            }
        }

        return TableSchema.builder()
                .primaryKey(copiedPrimaryKey)
                .columns(outputColumns)
                .constraintKey(outputConstraintKeys)
                .build();
    }

    /** @return a copy of the input table's identifier; the transform does not rename the table */
    @Override
    protected TableIdentifier transformTableIdentifier() {
        return inputCatalogTable.getTableId().copy();
    }
}

文中的转换类继承的是AbstractCatalogSupportTransform,SeaTunnel还提供了SingleFieldOutputTransform和MultipleFieldOutputTransform,分别对应单字段输出和多字段输出的数据处理,具体扩展时可根据需求继承对应的类

执行结果

来源消息

结果消息

以上就是对转换插件的扩展分享,有需求的小伙伴可以参考,也欢迎大家一起评论沟通~

相关推荐
chordful3 分钟前
Leetcode热题100-32 最长有效括号
c++·算法·leetcode·动态规划
_OLi_11 分钟前
力扣 LeetCode 459. 重复的子字符串(Day4:字符串)
算法·leetcode·职场和发展·kmp
Romanticroom18 分钟前
计算机23级数据结构上机实验(第3-4周)
数据结构·算法
白藏y18 分钟前
数据结构——归并排序
数据结构·算法·排序算法
ahadee30 分钟前
蓝桥杯每日真题 - 第12天
c++·vscode·算法·蓝桥杯
zhentiya43 分钟前
微积分第五版课后习题答案详解PDF电子版 赵树嫄
算法·pdf
luky!1 小时前
算法--解决熄灯问题
python·算法
鸽鸽程序猿1 小时前
【算法】【优选算法】二分查找算法(下)
java·算法·二分查找算法
_OLi_1 小时前
力扣 LeetCode 150. 逆波兰表达式求值(Day5:栈与队列)
算法·leetcode·职场和发展
远望清一色1 小时前
基于MATLAB身份证号码识别
开发语言·图像处理·算法·matlab