54. Flink: Implementing left/right join with CoGroup (code examples)

1. Overview

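A CoGroup function receives, for each key and each window, all elements of both inputs as two Iterables, and it is also called when one of the two sides is empty; that is what makes left/right (outer) join semantics possible. The three loops in 1) to 3) are meant to sit inside the coGroup() method of a wiring like the sketch below. This is a minimal sketch, not part of the original example: leftStream and rightStream stand for the two watermarked streams built in section 2, the key and window match the full example, and the imports are the ones from section 2 plus org.apache.flink.api.common.functions.CoGroupFunction.

java
leftStream.coGroup(rightStream)
        .where(t -> t.f1)
        .equalTo(t -> t.f1)
        .window(TumblingEventTimeWindows.of(Duration.ofSeconds(5)))
        .apply(new CoGroupFunction<Tuple3<String, String, Long>, Tuple3<String, String, Long>, Tuple5<String, String, String, Long, Long>>() {
            @Override
            public void coGroup(Iterable<Tuple3<String, String, Long>> left,
                                Iterable<Tuple3<String, String, Long>> right,
                                Collector<Tuple5<String, String, String, Long, Long>> collector) {
                // place one of the loops from 1) to 3) here
            }
        })
        .print();
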
1) Left join implementation

java
// left join: emit every (left, right) pair whose timestamps match; if a left element has no match, emit it with an empty right value and timestamp 0
for (Tuple3<String, String, Long> leftTuple : left) {
    boolean isJoin = false;

    for (Tuple3<String, String, Long> rightTuple : right) {
        if (Objects.equals(leftTuple.f2, rightTuple.f2)) {
            isJoin = true;
            collector.collect(new Tuple5<>(leftTuple.f0, leftTuple.f1, rightTuple.f1, leftTuple.f2, rightTuple.f2));
        }
    }

    if (!isJoin) {
        collector.collect(new Tuple5<>(leftTuple.f0, leftTuple.f1, "", leftTuple.f2, 0L));
    }
}

2) Right join implementation

java
// right join: emit every (left, right) pair whose timestamps match; if a right element has no match, emit it with an empty left value and timestamp 0
for (Tuple3<String, String, Long> rightTuple : right) {
    boolean isJoin = false;

    for (Tuple3<String, String, Long> leftTuple : left) {
        if (Objects.equals(leftTuple.f2, rightTuple.f2)) {
            isJoin = true;
            collector.collect(new Tuple5<>(leftTuple.f0, leftTuple.f1, rightTuple.f1, leftTuple.f2, rightTuple.f2));
        }
    }

    if (!isJoin) {
        collector.collect(new Tuple5<>(rightTuple.f0, "", rightTuple.f1, 0L, rightTuple.f2));
    }
}

3) Inner join implementation

java
// inner join: emit only the (left, right) pairs whose timestamps match
for (Tuple3<String, String, Long> leftTuple : left) {
    for (Tuple3<String, String, Long> rightTuple : right) {
        if (Objects.equals(leftTuple.f2, rightTuple.f2)) {
            collector.collect(new Tuple5<>(leftTuple.f0, leftTuple.f1, rightTuple.f1, leftTuple.f2, rightTuple.f2));
        }
    }
}

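For the inner-join case only, the same output can also be produced with the DataStream join() transformation instead of coGroup(). The sketch below is a minimal variant, not part of the original example; it assumes the same watermarkLeft/watermarkRight streams, key f1 and 5-second window as the full example in section 2, plus an import of org.apache.flink.api.common.functions.FlatJoinFunction, and keeps the timestamp-equality check by using a FlatJoinFunction:

java
watermarkLeft.join(watermarkRight)
        .where(t -> t.f1)
        .equalTo(t -> t.f1)
        .window(TumblingEventTimeWindows.of(Duration.ofSeconds(5)))
        .apply(new FlatJoinFunction<Tuple3<String, String, Long>, Tuple3<String, String, Long>, Tuple5<String, String, String, Long, Long>>() {
            @Override
            public void join(Tuple3<String, String, Long> l, Tuple3<String, String, Long> r,
                             Collector<Tuple5<String, String, String, Long, Long>> out) {
                // join() already pairs elements with equal keys (f1); keep the extra timestamp check from the coGroup version
                if (Objects.equals(l.f2, r.f2)) {
                    out.collect(new Tuple5<>(l.f0, l.f1, r.f1, l.f2, r.f2));
                }
            }
        })
        .print();

join() only supports inner joins, which is why the left and right variants above have to be written with coGroup().
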
2. Complete code example

java
import org.apache.flink.api.common.eventtime.SerializableTimestampAssigner;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.common.functions.RichCoGroupFunction;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.api.java.tuple.Tuple5;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows;
import org.apache.flink.util.Collector;

import java.time.Duration;
import java.util.Objects;

public class _05_CoGroupInnerOuterJoin {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        // Parallelism is limited here for testing; in production, idle sources have to be handled (e.g., WatermarkStrategy.withIdleness), otherwise watermarks may stall
        env.setParallelism(2);
        env.disableOperatorChaining();

        DataStreamSource<String> inputLeft = env.socketTextStream("localhost", 8888);

        // An event-time job needs a watermark strategy and a timestamp assigner
        SingleOutputStreamOperator<Tuple3<String, String, Long>> mapLeft = inputLeft.map(new MapFunction<String, Tuple3<String, String, Long>>() {
            @Override
            public Tuple3<String, String, Long> map(String input) throws Exception {
                String[] fields = input.split(",");
                return new Tuple3<>(fields[0], fields[1], Long.parseLong(fields[2]));
            }
        });

        SingleOutputStreamOperator<Tuple3<String, String, Long>> watermarkLeft = mapLeft.assignTimestampsAndWatermarks(WatermarkStrategy.<Tuple3<String, String, Long>>forBoundedOutOfOrderness(Duration.ofSeconds(0))
                .withTimestampAssigner(new SerializableTimestampAssigner<Tuple3<String, String, Long>>() {
                    @Override
                    public long extractTimestamp(Tuple3<String, String, Long> input, long l) {
                        return input.f2;
                    }
                }));

        DataStreamSource<String> inputRight = env.socketTextStream("localhost", 9999);

        // An event-time job needs a watermark strategy and a timestamp assigner
        SingleOutputStreamOperator<Tuple3<String, String, Long>> mapRight = inputRight.map(new MapFunction<String, Tuple3<String, String, Long>>() {
            @Override
            public Tuple3<String, String, Long> map(String input) throws Exception {
                String[] fields = input.split(",");
                return new Tuple3<>(fields[0], fields[1], Long.parseLong(fields[2]));
            }
        });

        SingleOutputStreamOperator<Tuple3<String, String, Long>> watermarkRight = mapRight.assignTimestampsAndWatermarks(WatermarkStrategy.<Tuple3<String, String, Long>>forBoundedOutOfOrderness(Duration.ofSeconds(0))
                .withTimestampAssigner(new SerializableTimestampAssigner<Tuple3<String, String, Long>>() {
                    @Override
                    public long extractTimestamp(Tuple3<String, String, Long> input, long l) {
                        return input.f2;
                    }
                }));

        /**
         * left join test data
         *
         * left-1
         *
         * a,1,1718089200000
         * b,2,1718089200000
         * c,3,1718089200000
         *
         * right-2
         *
         * a,1,1718089200000
         * b,2,1718089200000
         * c,3,1718089200000
         *
         * left-3
         *
         * a,4,1718089202000
         * b,5,1718089202000
         * c,6,1718089202000
         *
         * right-4
         *
         * a,1,1718089202000
         * b,2,1718089202000
         * c,3,1718089202000
         *
         * left-right-5
         *
         * a,1,1718089205001
         * b,2,1718089205001
         * c,3,1718089205001
         *
         * 1> (a,1,1,1718089200000,1718089200000)
         * 1> (b,2,2,1718089200000,1718089200000)
         * 1> (c,3,3,1718089200000,1718089200000)
         * 1> (a,4,,1718089202000,0)
         * 2> (b,5,,1718089202000,0)
         * 1> (c,6,,1718089202000,0)
         */
        // Note: the coGroup key is defined by where()/equalTo() on field f1; the keyBy(f0) calls do not change the join key
        watermarkLeft.keyBy(e -> e.f0)
                .coGroup(watermarkRight.keyBy(e -> e.f0))
                .where(e -> e.f1)
                .equalTo(e -> e.f1)
                .window(TumblingEventTimeWindows.of(Duration.ofSeconds(5)))
                .apply(new RichCoGroupFunction<Tuple3<String, String, Long>, Tuple3<String, String, Long>, Tuple5<String, String, String, Long, Long>>() {
                    @Override
                    public void coGroup(Iterable<Tuple3<String, String, Long>> left, Iterable<Tuple3<String, String, Long>> right, Collector<Tuple5<String, String, String, Long, Long>> collector) throws Exception {
                        // left join
                        for (Tuple3<String, String, Long> leftTuple : left) {
                            boolean isJoin = false;

                            for (Tuple3<String, String, Long> rightTuple : right) {
                                if (Objects.equals(leftTuple.f2, rightTuple.f2)) {
                                    isJoin = true;
                                    collector.collect(new Tuple5<>(leftTuple.f0, leftTuple.f1, rightTuple.f1, leftTuple.f2, rightTuple.f2));
                                }
                            }

                            if (!isJoin) {
                                collector.collect(new Tuple5<>(leftTuple.f0, leftTuple.f1, "", leftTuple.f2, 0L));
                            }
                        }

                        // right join
//                        for (Tuple3<String, String, Long> rightTuple : right) {
//                            boolean isJoin = false;
//
//                            for (Tuple3<String, String, Long> leftTuple : left) {
//                                if (Objects.equals(leftTuple.f2, rightTuple.f2)) {
//                                    isJoin = true;
//                                    collector.collect(new Tuple5<>(leftTuple.f0, leftTuple.f1, rightTuple.f1, leftTuple.f2, rightTuple.f2));
//                                }
//                            }
//
//                            if (!isJoin) {
//                                collector.collect(new Tuple5<>(rightTuple.f0, "", rightTuple.f1, 0L, rightTuple.f2));
//                            }
//                        }

//                        // inner join
//                        for (Tuple3<String, String, Long> leftTuple : left) {
//                            for (Tuple3<String, String, Long> rightTuple : right) {
//                                if (Objects.equals(leftTuple.f2, rightTuple.f2)) {
//                                    collector.collect(new Tuple5<>(leftTuple.f0, leftTuple.f1, rightTuple.f1, leftTuple.f2, rightTuple.f2));
//                                }
//                            }
//                        }
                    }
                }).print();

        env.execute();
    }
}

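To run the complete example, start two socket servers before submitting the job, for instance nc -lk 8888 for the left stream and nc -lk 9999 for the right stream, then paste the test data from section 3 into the matching terminal.
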
3. Test cases

text
left join test data
         
left-1

a,1,1718089200000
b,2,1718089200000
c,3,1718089200000

right-2

a,1,1718089200000
b,2,1718089200000
c,3,1718089200000

left-3

a,4,1718089202000
b,5,1718089202000
c,6,1718089202000

right-4

a,1,1718089202000
b,2,1718089202000
c,3,1718089202000

left-right-5

a,1,1718089205001
b,2,1718089205001
c,3,1718089205001

1> (a,1,1,1718089200000,1718089200000)
1> (b,2,2,1718089200000,1718089200000)
1> (c,3,3,1718089200000,1718089200000)
1> (a,4,,1718089202000,0)
2> (b,5,,1718089202000,0)
1> (c,6,,1718089202000,0)
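
The last three records (timestamp 1718089205001) fall into the next window; they are sent only so that the watermark passes the end of the [1718089200000, 1718089205000) window and that window fires. In the output, the rows with an empty third field and timestamp 0 are the left elements whose join key (field f1 = 4, 5, 6) has no counterpart on the right side within the window.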