54、Flink 使用 CoGroup 实现 left/right Join 代码示例

1、概述

1)left join 实现

Java 代码:
// Left outer join: emit one row for every (left, right) pair whose f2 values are equal;
// a left row without any such pair is emitted once with "" and 0L in the right-side slots.
for (Tuple3<String, String, Long> l : left) {
    int matches = 0;

    for (Tuple3<String, String, Long> r : right) {
        if (!Objects.equals(l.f2, r.f2)) {
            continue;
        }
        matches++;
        collector.collect(new Tuple5<>(l.f0, l.f1, r.f1, l.f2, r.f2));
    }

    // No right row shared this left row's f2: pad the right-side fields.
    if (matches == 0) {
        collector.collect(new Tuple5<>(l.f0, l.f1, "", l.f2, 0L));
    }
}

2)right join 实现

Java 代码:
                        // Right outer join: emit one row for every (left, right) pair whose f2 values
                        // are equal; a right row without any such pair is emitted once with "" and 0L
                        // in the left-side slots.
                        for (Tuple3<String, String, Long> r : right) {
                            int matches = 0;

                            for (Tuple3<String, String, Long> l : left) {
                                if (!Objects.equals(l.f2, r.f2)) {
                                    continue;
                                }
                                matches++;
                                collector.collect(new Tuple5<>(l.f0, l.f1, r.f1, l.f2, r.f2));
                            }

                            // No left row shared this right row's f2: pad the left-side fields.
                            if (matches == 0) {
                                collector.collect(new Tuple5<>(r.f0, "", r.f1, 0L, r.f2));
                            }
                        }

3)inner join 实现

Java 代码:
                       // Inner join: only (left, right) pairs whose f2 values are equal are emitted;
                       // rows without a partner on the other side produce no output.
                        for (Tuple3<String, String, Long> l : left) {
                            for (Tuple3<String, String, Long> r : right) {
                                if (!Objects.equals(l.f2, r.f2)) {
                                    continue;
                                }
                                collector.collect(new Tuple5<>(l.f0, l.f1, r.f1, l.f2, r.f2));
                            }
                        }

2、完整代码示例

Java 代码:
import org.apache.flink.api.common.eventtime.SerializableTimestampAssigner;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.common.functions.RichCoGroupFunction;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.api.java.tuple.Tuple5;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows;
import org.apache.flink.util.Collector;

import java.time.Duration;
import java.util.Objects;

/**
 * Demonstrates left, right and inner joins of two socket text streams with Flink's
 * windowed {@code coGroup} operator.
 *
 * <p>Both inputs are read from {@code localhost} (ports 8888 and 9999). Every line is parsed as
 * {@code f0,f1,f2} where {@code f2} is an epoch-millisecond timestamp that is also used as the
 * event time. The streams are co-grouped on {@code f1} in 5-second tumbling event-time windows,
 * and inside each group rows are paired when their {@code f2} values are equal.
 */
public class _05_CoGroupInnerOuterJoin {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        // Parallelism is capped for local testing; in production, idle sources would need to be
        // handled as well so that watermarks keep advancing.
        env.setParallelism(2);
        env.disableOperatorChaining();

        SingleOutputStreamOperator<Tuple3<String, String, Long>> watermarkLeft = readEvents(env, 8888);
        SingleOutputStreamOperator<Tuple3<String, String, Long>> watermarkRight = readEvents(env, 9999);

        /*
         * Left-join test data (send the batches in the numbered order).
         *
         * left-1
         *
         * a,1,1718089200000
         * b,2,1718089200000
         * c,3,1718089200000
         *
         * right-2
         *
         * a,1,1718089200000
         * b,2,1718089200000
         * c,3,1718089200000
         *
         * left-3
         *
         * a,4,1718089202000
         * b,5,1718089202000
         * c,6,1718089202000
         *
         * right-4
         *
         * a,1,1718089202000
         * b,2,1718089202000
         * c,3,1718089202000
         *
         * left-right-5 (sent to BOTH sockets; these timestamps lie past the end of the first
         * 5-second window, which presumably is what advances the watermark and fires it)
         *
         * a,1,1718089205001
         * b,2,1718089205001
         * c,3,1718089205001
         *
         * Expected output:
         *
         * 1> (a,1,1,1718089200000,1718089200000)
         * 1> (b,2,2,1718089200000,1718089200000)
         * 1> (c,3,3,1718089200000,1718089200000)
         * 1> (a,4,,1718089202000,0)
         * 2> (b,5,,1718089202000,0)
         * 1> (c,6,,1718089202000,0)
         */
        // No keyBy is applied before coGroup: where()/equalTo() define the grouping key (f1),
        // so an additional keyBy on another field would only add a shuffle.
        watermarkLeft
                .coGroup(watermarkRight)
                .where(e -> e.f1)
                .equalTo(e -> e.f1)
                .window(TumblingEventTimeWindows.of(Duration.ofSeconds(5)))
                .apply(new RichCoGroupFunction<Tuple3<String, String, Long>, Tuple3<String, String, Long>, Tuple5<String, String, String, Long, Long>>() {
                    @Override
                    public void coGroup(Iterable<Tuple3<String, String, Long>> left, Iterable<Tuple3<String, String, Long>> right, Collector<Tuple5<String, String, String, Long, Long>> collector) throws Exception {
                        // Replace with rightJoin(...) or innerJoin(...) to try the other join types.
                        leftJoin(left, right, collector);
                    }
                }).print();

        env.execute();
    }

    /**
     * Builds the source pipeline for one input: socket text stream, CSV parsing, then event-time
     * timestamps and watermarks.
     *
     * <p>A line with fewer than three comma-separated fields, or whose third field is not a
     * valid {@code long}, makes the map function throw.
     *
     * @param env  execution environment the source is registered on
     * @param port port on {@code localhost} to read text lines from
     * @return stream of {@code (f0, f1, f2)} tuples using {@code f2} as the event timestamp
     */
    private static SingleOutputStreamOperator<Tuple3<String, String, Long>> readEvents(StreamExecutionEnvironment env, int port) {
        DataStreamSource<String> input = env.socketTextStream("localhost", port);

        // An anonymous class (rather than a lambda) keeps the generic Tuple3 type visible to Flink.
        SingleOutputStreamOperator<Tuple3<String, String, Long>> parsed = input.map(new MapFunction<String, Tuple3<String, String, Long>>() {
            @Override
            public Tuple3<String, String, Long> map(String line) throws Exception {
                String[] fields = line.split(",");
                return new Tuple3<>(fields[0], fields[1], Long.parseLong(fields[2]));
            }
        });

        // Event time needs both a watermark strategy and a timestamp assigner; out-of-orderness
        // is bounded at 0 seconds here.
        return parsed.assignTimestampsAndWatermarks(WatermarkStrategy.<Tuple3<String, String, Long>>forBoundedOutOfOrderness(Duration.ofSeconds(0))
                .withTimestampAssigner(new SerializableTimestampAssigner<Tuple3<String, String, Long>>() {
                    @Override
                    public long extractTimestamp(Tuple3<String, String, Long> event, long recordTimestamp) {
                        return event.f2;
                    }
                }));
    }

    /**
     * Left outer join: emits {@code (left.f0, left.f1, right.f1, left.f2, right.f2)} for every
     * pair with equal {@code f2}; a left row without a partner is emitted once with {@code ""}
     * and {@code 0L} in the right-side slots.
     *
     * <p>{@code right} is iterated once per left row, so it must support repeated iteration.
     */
    private static void leftJoin(Iterable<Tuple3<String, String, Long>> left, Iterable<Tuple3<String, String, Long>> right, Collector<Tuple5<String, String, String, Long, Long>> collector) {
        for (Tuple3<String, String, Long> leftTuple : left) {
            boolean isJoin = false;

            for (Tuple3<String, String, Long> rightTuple : right) {
                if (Objects.equals(leftTuple.f2, rightTuple.f2)) {
                    isJoin = true;
                    collector.collect(new Tuple5<>(leftTuple.f0, leftTuple.f1, rightTuple.f1, leftTuple.f2, rightTuple.f2));
                }
            }

            if (!isJoin) {
                collector.collect(new Tuple5<>(leftTuple.f0, leftTuple.f1, "", leftTuple.f2, 0L));
            }
        }
    }

    /**
     * Right outer join: emits {@code (left.f0, left.f1, right.f1, left.f2, right.f2)} for every
     * pair with equal {@code f2}; a right row without a partner is emitted once with {@code ""}
     * and {@code 0L} in the left-side slots.
     *
     * <p>{@code left} is iterated once per right row, so it must support repeated iteration.
     */
    private static void rightJoin(Iterable<Tuple3<String, String, Long>> left, Iterable<Tuple3<String, String, Long>> right, Collector<Tuple5<String, String, String, Long, Long>> collector) {
        for (Tuple3<String, String, Long> rightTuple : right) {
            boolean isJoin = false;

            for (Tuple3<String, String, Long> leftTuple : left) {
                if (Objects.equals(leftTuple.f2, rightTuple.f2)) {
                    isJoin = true;
                    collector.collect(new Tuple5<>(leftTuple.f0, leftTuple.f1, rightTuple.f1, leftTuple.f2, rightTuple.f2));
                }
            }

            if (!isJoin) {
                collector.collect(new Tuple5<>(rightTuple.f0, "", rightTuple.f1, 0L, rightTuple.f2));
            }
        }
    }

    /**
     * Inner join: emits {@code (left.f0, left.f1, right.f1, left.f2, right.f2)} only for pairs
     * with equal {@code f2}; rows without a partner produce no output.
     *
     * <p>{@code right} is iterated once per left row, so it must support repeated iteration.
     */
    private static void innerJoin(Iterable<Tuple3<String, String, Long>> left, Iterable<Tuple3<String, String, Long>> right, Collector<Tuple5<String, String, String, Long, Long>> collector) {
        for (Tuple3<String, String, Long> leftTuple : left) {
            for (Tuple3<String, String, Long> rightTuple : right) {
                if (Objects.equals(leftTuple.f2, rightTuple.f2)) {
                    collector.collect(new Tuple5<>(leftTuple.f0, leftTuple.f1, rightTuple.f1, leftTuple.f2, rightTuple.f2));
                }
            }
        }
    }
}

3、测试用例

测试输入与预期输出:
left-join 测试数据
         
left-1

a,1,1718089200000
b,2,1718089200000
c,3,1718089200000

right-2

a,1,1718089200000
b,2,1718089200000
c,3,1718089200000

left-3

a,4,1718089202000
b,5,1718089202000
c,6,1718089202000

right-4

a,1,1718089202000
b,2,1718089202000
c,3,1718089202000

left-right-5

a,1,1718089205001
b,2,1718089205001
c,3,1718089205001

1> (a,1,1,1718089200000,1718089200000)
1> (b,2,2,1718089200000,1718089200000)
1> (c,3,3,1718089200000,1718089200000)
1> (a,4,,1718089202000,0)
2> (b,5,,1718089202000,0)
1> (c,6,,1718089202000,0)
相关推荐
阿里云大数据AI技术1 天前
StarRocks x Fluss x Paimon湖流一体方案:构建秒级响应、湖流一体的实时数据引擎
大数据·人工智能
Databend1 天前
Agent 轨迹分析与归因的数据工程实践
大数据·数据库·agent
喵个咪1 天前
Go Wind UBA 拆解系列 - 架构总览:三服务、数据流与契约优先
大数据·后端·go
喵个咪1 天前
Go Wind UBA 拆解系列 - 多租户与安全:两套隔离机制的边界
大数据·后端·go
喵个咪1 天前
Go Wind UBA 拆解系列 - OLAP 与 SQL 硬核:25 个分析模型怎么落地
大数据·后端·go
喵个咪1 天前
Go Wind UBA 拆解系列 - SDK 与采集层:从浏览器到 Kafka
大数据·后端·go
一条鱼丶1 天前
深入理解 Flink Watermark——流数据处理中的乱序问题解决方案
flink
QCC产品中心1 天前
MiniMax Agent 接入实测:企业查询、股权穿透与 UBO 识别(附 Prompt 模板)
大数据·mcp·金融/非金融
大大大大晴天1 天前
Flink SQL 从编写到提交运行的全过程解析
flink
SelectDB2 天前
Apache Doris Python UDF:让 SQL 直接调用 Python 生态,支撑 Agent 时代复杂业务逻辑
大数据·数据库·python