54、Flink 使用 CoGroup 实现 left/right Join 代码示例

1、概述

1)left join 实现

bash 复制代码
for (Tuple3<String, String, Long> leftTuple : left) {
                            boolean isJoin = false;

                            for (Tuple3<String, String, Long> rightTuple : right) {
                                if (Objects.equals(leftTuple.f2, rightTuple.f2)) {
                                    isJoin = true;
                                    collector.collect(new Tuple5<>(leftTuple.f0, leftTuple.f1, rightTuple.f1, leftTuple.f2, rightTuple.f2));
                                }
                            }

                            if (!isJoin) {
                                collector.collect(new Tuple5<>(leftTuple.f0, leftTuple.f1, "", leftTuple.f2, 0L));
                            }
                        }

2、right join 实现

bash 复制代码
                        for (Tuple3<String, String, Long> rightTuple : right) {
                            boolean isJoin = false;

                            for (Tuple3<String, String, Long> leftTuple : left) {
                                if (Objects.equals(leftTuple.f2, rightTuple.f2)) {
                                    isJoin = true;
                                    collector.collect(new Tuple5<>(leftTuple.f0, leftTuple.f1, rightTuple.f1, leftTuple.f2, rightTuple.f2));
                                }
                            }

                            if (!isJoin) {
                                collector.collect(new Tuple5<>(rightTuple.f0, "", rightTuple.f1, 0L, rightTuple.f2));
                            }
                        }

3、Inner Join 实现

bash 复制代码
                       // inner join
                        for (Tuple3<String, String, Long> leftTuple : left) {
                            for (Tuple3<String, String, Long> rightTuple : right) {
                                if (Objects.equals(leftTuple.f2, rightTuple.f2)) {
                                    collector.collect(new Tuple5<>(leftTuple.f0, leftTuple.f1, rightTuple.f1, leftTuple.f2, rightTuple.f2));
                                }
                            }
                        }

2、完整代码示例

bash 复制代码
import org.apache.flink.api.common.eventtime.SerializableTimestampAssigner;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.common.functions.RichCoGroupFunction;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.api.java.tuple.Tuple5;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows;
import org.apache.flink.util.Collector;

import java.time.Duration;
import java.util.Objects;

public class _05_CoGroupInnerOuterJoin {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        // 测试时限制了分区数,生产中需要设置空闲数据源
        env.setParallelism(2);
        env.disableOperatorChaining();

        DataStreamSource<String> inputLeft = env.socketTextStream("localhost", 8888);

        // 事件时间需要设置水位线策略和时间戳
        SingleOutputStreamOperator<Tuple3<String, String, Long>> mapLeft = inputLeft.map(new MapFunction<String, Tuple3<String, String, Long>>() {
            @Override
            public Tuple3<String, String, Long> map(String input) throws Exception {
                String[] fields = input.split(",");
                return new Tuple3<>(fields[0], fields[1], Long.parseLong(fields[2]));
            }
        });

        SingleOutputStreamOperator<Tuple3<String, String, Long>> watermarkLeft = mapLeft.assignTimestampsAndWatermarks(WatermarkStrategy.<Tuple3<String, String, Long>>forBoundedOutOfOrderness(Duration.ofSeconds(0))
                .withTimestampAssigner(new SerializableTimestampAssigner<Tuple3<String, String, Long>>() {
                    @Override
                    public long extractTimestamp(Tuple3<String, String, Long> input, long l) {
                        return input.f2;
                    }
                }));

        DataStreamSource<String> inputRight = env.socketTextStream("localhost", 9999);

        // 事件时间需要设置水位线策略和时间戳
        SingleOutputStreamOperator<Tuple3<String, String, Long>> mapRight = inputRight.map(new MapFunction<String, Tuple3<String, String, Long>>() {
            @Override
            public Tuple3<String, String, Long> map(String input) throws Exception {
                String[] fields = input.split(",");
                return new Tuple3<>(fields[0], fields[1], Long.parseLong(fields[2]));
            }
        });

        SingleOutputStreamOperator<Tuple3<String, String, Long>> watermarkRight = mapRight.assignTimestampsAndWatermarks(WatermarkStrategy.<Tuple3<String, String, Long>>forBoundedOutOfOrderness(Duration.ofSeconds(0))
                .withTimestampAssigner(new SerializableTimestampAssigner<Tuple3<String, String, Long>>() {
                    @Override
                    public long extractTimestamp(Tuple3<String, String, Long> input, long l) {
                        return input.f2;
                    }
                }));

        /**
         * left-join 测试数据
         *
         * left-1
         *
         * a,1,1718089200000
         * b,2,1718089200000
         * c,3,1718089200000
         *
         * right-2
         *
         * a,1,1718089200000
         * b,2,1718089200000
         * c,3,1718089200000
         *
         * left-3
         *
         * a,4,1718089202000
         * b,5,1718089202000
         * c,6,1718089202000
         *
         * right-4
         *
         * a,1,1718089202000
         * b,2,1718089202000
         * c,3,1718089202000
         *
         * left-right-5
         *
         * a,1,1718089205001
         * b,2,1718089205001
         * c,3,1718089205001
         *
         * 1> (a,1,1,1718089200000,1718089200000)
         * 1> (b,2,2,1718089200000,1718089200000)
         * 1> (c,3,3,1718089200000,1718089200000)
         * 1> (a,4,,1718089202000,0)
         * 2> (b,5,,1718089202000,0)
         * 1> (c,6,,1718089202000,0)
         */
        watermarkLeft.keyBy(e -> e.f0)
                .coGroup(watermarkRight.keyBy(e -> e.f0))
                .where(e -> e.f1)
                .equalTo(e -> e.f1)
                .window(TumblingEventTimeWindows.of(Duration.ofSeconds(5)))
                .apply(new RichCoGroupFunction<Tuple3<String, String, Long>, Tuple3<String, String, Long>, Tuple5<String, String, String, Long, Long>>() {
                    @Override
                    public void coGroup(Iterable<Tuple3<String, String, Long>> left, Iterable<Tuple3<String, String, Long>> right, Collector<Tuple5<String, String, String, Long, Long>> collector) throws Exception {
                        // left join
                        for (Tuple3<String, String, Long> leftTuple : left) {
                            boolean isJoin = false;

                            for (Tuple3<String, String, Long> rightTuple : right) {
                                if (Objects.equals(leftTuple.f2, rightTuple.f2)) {
                                    isJoin = true;
                                    collector.collect(new Tuple5<>(leftTuple.f0, leftTuple.f1, rightTuple.f1, leftTuple.f2, rightTuple.f2));
                                }
                            }

                            if (!isJoin) {
                                collector.collect(new Tuple5<>(leftTuple.f0, leftTuple.f1, "", leftTuple.f2, 0L));
                            }
                        }

                        // right join
//                        for (Tuple3<String, String, Long> rightTuple : right) {
//                            boolean isJoin = false;
//
//                            for (Tuple3<String, String, Long> leftTuple : left) {
//                                if (Objects.equals(leftTuple.f2, rightTuple.f2)) {
//                                    isJoin = true;
//                                    collector.collect(new Tuple5<>(leftTuple.f0, leftTuple.f1, rightTuple.f1, leftTuple.f2, rightTuple.f2));
//                                }
//                            }
//
//                            if (!isJoin) {
//                                collector.collect(new Tuple5<>(rightTuple.f0, "", rightTuple.f1, 0L, rightTuple.f2));
//                            }
//                        }

//                        // inner join
//                        for (Tuple3<String, String, Long> leftTuple : left) {
//                            for (Tuple3<String, String, Long> rightTuple : right) {
//                                if (Objects.equals(leftTuple.f2, rightTuple.f2)) {
//                                    collector.collect(new Tuple5<>(leftTuple.f0, leftTuple.f1, rightTuple.f1, leftTuple.f2, rightTuple.f2));
//                                }
//                            }
//                        }
                    }
                }).print();

        env.execute();
    }
}

3、测试用例

bash 复制代码
left-join 测试数据
         
left-1

a,1,1718089200000
b,2,1718089200000
c,3,1718089200000

right-2

a,1,1718089200000
b,2,1718089200000
c,3,1718089200000

left-3

a,4,1718089202000
b,5,1718089202000
c,6,1718089202000

right-4

a,1,1718089202000
b,2,1718089202000
c,3,1718089202000

left-right-5

a,1,1718089205001
b,2,1718089205001
c,3,1718089205001

1> (a,1,1,1718089200000,1718089200000)
1> (b,2,2,1718089200000,1718089200000)
1> (c,3,3,1718089200000,1718089200000)
1> (a,4,,1718089202000,0)
2> (b,5,,1718089202000,0)
1> (c,6,,1718089202000,0)
相关推荐
隐于花海,等待花开17 小时前
1.CONCAT / CONCAT_WS 函数深度解析
大数据·hive
CORNERSTONE36517 小时前
生产管理六要素(PQCDSM)
大数据·运维·人工智能·生产管理
qyr678917 小时前
全球AI服务器DAC线缆市场发展趋势与未来趋势展望
大数据·人工智能·数据分析·汽车·ai服务器·ai服务器dac线缆
阳光普照世界和平17 小时前
软件工程 3.0:大模型驱动的研发新范式,重塑软件全生命周期
大数据
小付爱coding17 小时前
Claude Code 设计哲学深度解析:从 Prompt 到 Harness 的 Agent 工程实践
大数据·elasticsearch·prompt
智能化咨询17 小时前
(200页PPT)DG1005企业IT战略规划架构设计方案(附下载方式)
大数据·人工智能
飞鸟恋上鱼17 小时前
基于Spark的短视频推荐系统设计与实现
大数据·分布式·spark
juniperhan17 小时前
Flink 系列第13篇:Flink 生产环境中的并行度与资源配置
java·大数据·数据仓库·分布式·flink
AllData公司负责人17 小时前
AllData数据中台通过开源项目AirFlow建设离线开发IDE,打造大数据离线调度引擎
大数据·python·资源管理·数据中台·airflow·离线调度·离线开发
Francek Chen17 小时前
【IoTDB】工业物联网时序数据库优选:Apache IoTDB的显著优势
大数据·数据库·物联网·时序数据库·iotdb