An Example Analysis of FULL OUTER JOIN in FlinkSQL (a Pitfall)

Flink version: 1.14

I recently had a real-time data development requirement that called for a FULL OUTER JOIN. The desired behavior: a row is emitted whenever data arrives on either side; when the left table already holds a row, an arriving right-side row should join against it and be emitted (and symmetrically, when the right table already holds a row, an arriving left-side row should join and be emitted). In practice this ran into problems; a minimal sketch of the intended semantics follows the list below.

The FlinkSQL topics involved include:

  • FlinkSQL FULL OUTER JOIN
  • FlinkSQL Temporal Joins (Lookup Join)
  • FlinkSQL deduplication
  • FlinkSQL upsert-kafka
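As a minimal sketch of the semantics I wanted (hypothetical two-column tables `l(code, lv)` and `r(code, rv)`; an illustration only, not the production job):

```sql
-- Suppose rows arrive in this order: first l('JD001', ...), then r('JD001', ...).
SELECT COALESCE(l.code, r.code) AS code, l.lv, r.rv
FROM l FULL OUTER JOIN r ON l.code = r.code;
-- Intended changelog:
--   +I ('JD001', lv, NULL)   -- left row arrives; right side padded with NULL
--   -D ('JD001', lv, NULL)   -- the padded row is retracted when the right row arrives
--   +I ('JD001', lv, rv)     -- the joined row is emitted
```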

FlinkSQL demo

```sql
CREATE TABLE waybill_extend_kafka (
     mid bigint,
     db string,
     sch string,
     tab string,
     opt string,
     ts bigint,
     ddl string,
     err string,
     src map<string,string>,
     cur map<string,string>,
     cus map<string,string>,
     _proc  as proctime()
) WITH (
  'connector' = 'kafka',
  'topic' = 't1',
  'properties.bootstrap.servers' = 'xx.xx.xx.xx:9092',
  'properties.group.id' = 'g1',
  'scan.startup.mode' = 'earliest-offset',  --group-offsets/earliest-offset/latest-offset
  'format' = 'json'
);
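-- A hypothetical example of the CDC-style message this table expects (assumed
-- shape, not taken from the original article): `src` carries the row image
-- before the change, `cur` the row image after it, and `opt` the operation type:
--   {"mid":1,"db":"d1","tab":"waybill_extend","opt":"INSERT","ts":1660000000000,
--    "cur":{"waybill_code":"JD001","data_key":"firstOrder","create_time":"2022-08-09 10:00:00"}}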

CREATE TABLE package_state_kafka (
     mid bigint,
     db string,
     sch string,
     tab string,
     opt string,
     ts bigint,
     ddl string,
     err string,
     src map<string,string>,
     cur map<string,string>,
     cus map<string,string>,
     _proc  as proctime()
) WITH (
  'connector' = 'kafka',
  'topic' = 't2',
  'properties.bootstrap.servers' = 'xx.xx.xx.xx:9092',
  'properties.group.id' = 'g1',
  'scan.startup.mode' = 'earliest-offset',  --group-offsets/earliest-offset/latest-offset
  'format' = 'json'
);

CREATE TABLE es_dim(
    id                STRING,
    ts                STRING,
    waybill_code      STRING,
    pin               STRING,
    operater_ts       STRING,
    operater_type     STRING,
    is_enable         STRING,
    batch_no          STRING,
    activity_key      STRING,
    p_type            STRING,
    p_name            STRING,
    version            STRING,
    update_time            STRING
)
with (
    'connector' = 'elasticsearch-6',
    'index' = 'es_dim',
    'document-type' = 'es_dim',
    'hosts' = 'http://xxx:9200',
    'format' = 'json'
);

CREATE TABLE es_sink(
     waybill_code      STRING
    ,first_order       STRING -- new customer = 1, returning customer = 0
    ,extend_update_time            STRING
    ,state       STRING -- delivered = 150
    ,package_update_time            STRING
    ,pin              STRING
    ,coupon_use_time      STRING
    ,operater_type    STRING
    ,is_enable        STRING
    ,batch_no         STRING
    ,update_time         STRING
    ,PRIMARY KEY (waybill_code) NOT ENFORCED
)
with (
    'connector' = 'elasticsearch-6',
    'index' = 'es_sink',
    'document-type' = 'es_sink',
    'hosts' = 'http://xxx:9200',
    'format' = 'json',
    'filter.null-value'='true',
    'sink.bulk-flush.max-actions' = '1000',
    'sink.bulk-flush.max-size' = '10mb'
);

CREATE TABLE kafka_sink (
     waybill_code      STRING
    ,first_order       STRING 
    ,extend_update_time            STRING
    ,state       STRING -- delivered = 150
    ,package_update_time            STRING
    ,pin              STRING
    ,coupon_use_time      STRING
    ,operater_type    STRING
    ,is_enable        STRING
    ,batch_no         STRING
    ,update_time         STRING
    ,PRIMARY KEY (waybill_code) NOT ENFORCED -- note: upsert-kafka requires the primary key to be defined in the DDL
) WITH (
  'connector' = 'upsert-kafka',
  'topic' = 't3',
  'properties.bootstrap.servers' = 'xx.xx.xx.xx:9092',
  'key.format' = 'json',
  'value.format' = 'json'
);
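-- Note on upsert-kafka semantics (general Flink behavior): records are keyed by
-- the PRIMARY KEY columns; each update for a key is written as a new value, and
-- a delete/retraction is written as a Kafka tombstone (null value), so
-- downstream consumers always see the latest state per waybill_code.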

-- new customer
CREATE view  waybill_extend_temp as
select
    IF(cur['waybill_code'] IS NOT NULL , cur['waybill_code'], src ['waybill_code'])  AS waybill_code,
    IF(cur['data_key'] IS NOT NULL , cur['data_key'], src ['data_key'])  AS data_key,
    IF(cur['create_time'] IS NOT NULL , cur['create_time'], src ['create_time'])  AS create_time,
    opt,
    _proc
FROM waybill_extend_kafka
where UPPER(opt) = 'DELETE' OR UPPER(opt) = 'INSERT';

CREATE view  waybill_extend_temp_handle as
SELECT
    waybill_code,
    case when UPPER(opt) = 'INSERT'  then '1'
         when UPPER(opt) = 'DELETE'  then '0'
            end as first_order,
    create_time,
    _proc
from waybill_extend_temp
where data_key = 'firstOrder';

-- delivery state
CREATE view package_state_temp as
select
    IF(cur['WAYBILL_CODE'] IS NOT NULL , cur['WAYBILL_CODE'], src ['WAYBILL_CODE'])  AS waybill_code,
    IF(cur['STATE'] IS NOT NULL , cur['STATE'], src ['STATE'])  AS state,
    IF(cur['CREATE_TIME'] IS NOT NULL , cur['CREATE_TIME'], src ['CREATE_TIME'])  AS create_time,
    opt,
    _proc
FROM package_state_kafka
where UPPER(opt) = 'INSERT';

CREATE view package_state_temp_handle as
SELECT
    waybill_code,
    max(state) as state,
    min(create_time) as package_update_time,
    proctime() as _proc
from package_state_temp
where state = '150'
group by waybill_code;
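-- Note (general Flink behavior): this GROUP BY aggregation produces an updating
-- (retract) stream: each new input for a waybill_code retracts the previous
-- aggregate and emits a new one, and this updating stream is what feeds the
-- FULL OUTER JOIN below.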

--full join
-- Flink 1.14 note: FULL OUTER JOIN in FlinkSQL here only emits the left and right rows separately; the intermediate state is not joined and emitted, which in this streaming scenario amounts to a UNION ALL
CREATE view waybill_extend_package_state  as
SELECT
    COALESCE(a.waybill_code, b.waybill_code) as waybill_code,
    a.first_order,
    a.create_time as extend_update_time,
    b.state,
    b.package_update_time,
    COALESCE(a._proc, b._proc) as _proc
from waybill_extend_temp_handle as a
FULL OUTER JOIN package_state_temp_handle b
on a.waybill_code=b.waybill_code;

--result
CREATE VIEW res_view AS
SELECT
     a.waybill_code
    ,a.first_order
    ,a.extend_update_time
    ,a.state
    ,a.package_update_time
    ,b.pin
    ,b.operater_ts
    ,b.operater_type
    ,b.is_enable
    ,b.batch_no
    ,CAST(CAST(a._proc AS TIMESTAMP(3)) AS STRING) as update_time
    ,row_number() over(partition by a.waybill_code order by b.operater_ts desc) as rn 
from waybill_extend_package_state as a
JOIN es_dim FOR SYSTEM_TIME AS OF a._proc as b
on a.waybill_code=b.waybill_code;
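-- Two notes on res_view (general Flink behavior, stated as my understanding):
-- 1) FOR SYSTEM_TIME AS OF a._proc is a lookup join: each incoming row probes
--    es_dim at its processing time; rows with no match are dropped (inner join),
--    and later changes to es_dim are not applied retroactively.
-- 2) row_number() with the rn = 1 filter below keeps one row per waybill_code.
--    Because the ORDER BY column is a regular column rather than a time
--    attribute, Flink runs this as a general Top-N query, not the specialized
--    deduplication operator.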

INSERT INTO es_sink
SELECT
     waybill_code
    ,first_order
    ,extend_update_time
    ,state
    ,package_update_time
    ,pin
    ,operater_ts
    ,operater_type
    ,is_enable
    ,batch_no
    ,update_time
FROM res_view
where rn =1;

INSERT INTO kafka_sink
SELECT
     waybill_code
    ,first_order
    ,extend_update_time
    ,state
    ,package_update_time
    ,pin
    ,operater_ts 
    ,operater_type
    ,is_enable
    ,batch_no
    ,update_time
FROM res_view
where rn =1;
```
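To isolate the join behavior, a minimal self-contained repro along these lines can help (a sketch; table and column names are made up and are not from the production job):

```sql
-- Running this in the SQL client with
--   SET 'sql-client.execution.result-mode' = 'changelog';
-- shows exactly which rows the join emits and retracts. With bounded VALUES
-- input the final joined row appears directly; swapping in unbounded Kafka
-- sources would exercise the streaming behavior discussed in this article.
CREATE TEMPORARY VIEW l AS SELECT * FROM (VALUES ('JD001', 'left')) AS t(code, lv);
CREATE TEMPORARY VIEW r AS SELECT * FROM (VALUES ('JD001', 'right')) AS t(code, rv);

SELECT COALESCE(l.code, r.code) AS code, l.lv, r.rv
FROM l FULL OUTER JOIN r ON l.code = r.code;
```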

es_sink mapping:

```bash
POST es_sink/es_sink/_mapping
{
    "es_sink": {
        "properties": {
            "waybill_code": {
                "type": "keyword"
            },
            "pin": {
                "type": "keyword"
            },
            "operater_ts": {
                "type": "date",
                "format": "yyyy-MM-dd HH:mm:ss.SSS||yyyy-MM-dd HH:mm:ss||yyyy-MM-dd||epoch_millis"
            },
            "operater_type": {
                "type": "keyword"
            },
            "is_enable": {
                "type": "keyword"
            },
            "batch_no": {
                "type": "keyword"
            },
            "first_order": {
                "type": "keyword"
            },
            "extend_update_time": {
                "type": "date",
                "format": "yyyy-MM-dd HH:mm:ss.SSS||yyyy-MM-dd HH:mm:ss||yyyy-MM-dd||epoch_millis"
            },
            "state": {
                "type": "keyword"
            },
            "package_update_time": {
                "type": "date",
                "format": "yyyy-MM-dd HH:mm:ss.SSS||yyyy-MM-dd HH:mm:ss||yyyy-MM-dd||epoch_millis"
            },
            "update_time": {
                "type": "date",
                "format": "yyyy-MM-dd HH:mm:ss.SSS||yyyy-MM-dd HH:mm:ss||yyyy-MM-dd||epoch_millis"
            }
        }
    }
}
```

Result analysis

Fetching data from es_sink and kafka_sink gives the same result; a sample of the results is shown below:

The results show, however, that FULL OUTER JOIN in FlinkSQL only emitted the left-side and right-side rows separately; the intermediate state (the Flink UI shows that the FULL OUTER JOIN operator does keep this state) was never joined and emitted downstream. In this streaming scenario it behaved like a UNION ALL. Whether this is a FlinkSQL bug is unclear.
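To make the observation concrete, the emitted rows looked roughly like this (illustrative values, not actual job output):

```sql
-- Observed (UNION ALL-like): each side emitted alone; the joined row and the
-- retraction of the padded rows (the intended changelog sketched earlier) never appeared.
--   +I ('JD001', first_order, extend_update_time, NULL, NULL, ...)
--   +I ('JD001', NULL, NULL, state, package_update_time, ...)
```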

The FULL OUTER JOIN state data is shown below:

This analysis applies only to Flink 1.14; I have not verified the behavior on other versions.
