Loading Dimension Data in Flink

1. Why load dimension data?

When building a real-time data warehouse, fact data alone is not enough: we also need dimension data to give those facts concrete meaning. With only fact data, we would just see values changing over time without knowing what they actually represent. Dimension data must therefore be loaded alongside the facts.

2. Ways to load dimension data

This section presents two common ways to load dimension data.

Approach 1: Cached file (distributed cache)
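In this approach, StreamExecutionEnvironment.registerCachedFile(path, name) ships a file to every TaskManager, and each parallel task opens its local copy through getRuntimeContext().getDistributedCache().getFile(name), typically in the open() method of a rich function.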

The district.txt file, stored in the resources directory:

txt
1   nanjing
2   suzhou
3   changzhou
4   xuzhou

Main code

java
package recovery;

import modules.env.Environments;
import org.apache.flink.api.common.JobExecutionResult;
import org.apache.flink.api.common.RuntimeExecutionMode;
import org.apache.flink.api.common.eventtime.TimestampAssigner;
import org.apache.flink.api.common.eventtime.TimestampAssignerSupplier;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.functions.RichMapFunction;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.core.execution.JobClient;
import org.apache.flink.core.execution.JobListener;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.source.SourceFunction;
import scala.Tuple3;

import javax.annotation.Nullable;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
import java.util.Objects;
import java.util.concurrent.TimeUnit;

/**
 * Registering and reading a cached (distributed-cache) file
 */
public class TestCache {
    public static void main(String[] args) throws Exception {
        // Create the execution environment
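        // NOTE: Environments is the author's own helper class (not shown here); judging
        // by the method names it configures checkpointing, restart attempts and the
        // state backend on a plain StreamExecutionEnvironment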
        StreamExecutionEnvironment see = new Environments()
                .build()
                .enableCheckpoint("file:///D:/phase/flink_state_backend", 3, 1, 1)
                .enableRetries(3, 1)
                .enableStateBackend("hashmap", true, false)
                .finish(RuntimeExecutionMode.STREAMING, 1, 3);

        // 1. Register the cached file
        String path = Thread.currentThread()
                .getContextClassLoader()
                .getResource("district.txt").getPath(); // locate district.txt on the classpath
        see.registerCachedFile(path, "district"); // register it with the environment under the name "district"

        // 2. Register a job listener
        see.registerJobListener(new JobListener() {
            @Override
            public void onJobSubmitted(@Nullable JobClient jobClient, @Nullable Throwable throwable) {
                // Called when the job is submitted:
                // on success jobClient is non-null, on failure throwable is non-null
                if (Objects.nonNull(jobClient)) {
                    // print the job ID
                    System.out.println(jobClient.getJobID().toString());
                    // print the job status
                    try {
                        System.err.println(jobClient.getJobStatus().get(10, TimeUnit.SECONDS).name());
                    } catch (Exception e) {
                        System.err.println(e.getMessage());
                    }
                } else if (Objects.nonNull(throwable)) {
                    // submission failed
                    System.err.println(throwable.getMessage());
                }
            }

            @Override
            public void onJobExecuted(@Nullable JobExecutionResult jobExecutionResult, @Nullable Throwable throwable) {
                // Called when the job finishes:
                // on success jobExecutionResult is non-null, on failure throwable is non-null
                if (Objects.nonNull(jobExecutionResult)) {
                    System.out.println(jobExecutionResult);
                } else if (Objects.nonNull(throwable)) {
                    System.err.println(throwable.getMessage());
                }
            }
        });

        // 3. Data: (ID, temperature, timestamp)
        // Build the watermark strategy; timestamps here are monotonically increasing,
        // so forMonotonousTimestamps() is sufficient
        TimestampAssignerSupplier<Tuple3> supplier = new TimestampAssignerSupplier<Tuple3>() {
            @Override
            public TimestampAssigner<Tuple3> createTimestampAssigner(Context context) {
                return (element,recordTimestamp) -> (Long) element._3();
            }
        };
        WatermarkStrategy<Tuple3> watermark = WatermarkStrategy
                .<Tuple3>forMonotonousTimestamps()
                .withTimestampAssigner(supplier);
        // The fact stream
        see.fromCollection(Arrays.asList(
                new Tuple3(1,34,System.currentTimeMillis()),
                new Tuple3(2,36,System.currentTimeMillis()+1000),
                new Tuple3(1,35,System.currentTimeMillis()+2000),
                new Tuple3(3,32,System.currentTimeMillis()+3000),
                new Tuple3(2,33,System.currentTimeMillis()+4000)
        ))
                // 4. Replace each record's ID with the city name from the cached file (join on ID)
                .setParallelism(1)
                .assignTimestampsAndWatermarks(watermark)
                .map(new RichMapFunction<Tuple3, Tuple3>() {
                    Map<Integer,String> idName = new HashMap<>(); // lookup map, one per parallel instance

                    // Initialize resources
                    @Override
                    public void open(Configuration parameters) throws Exception {
                        // Read the cached file from the distributed cache
                        File district = getRuntimeContext().getDistributedCache().getFile("district");
                        try (BufferedReader br = new BufferedReader(new FileReader(district))) { // try-with-resources closes the reader automatically
                            String line;
                            while (Objects.nonNull(line = br.readLine())) {
                                String[] s = line.split("\\s+");
                                idName.put(Integer.valueOf(s[0]),s[1]);
                            }
                        } catch (Exception ex) {
                            ex.printStackTrace();
                        }
                    }
                    @Override
                    public Tuple3 map(Tuple3 value) throws Exception {
                        return new Tuple3(idName.get(value._1()),value._2(),value._3());
                    }
                    // Release resources
                    @Override
                    public void close() throws Exception {
                        idName.clear();
                    }
                }).print();

        see.execute("cache-test");
    }
}

Output

txt
(nanjing,34,1727094791401)
(suzhou,36,1727094792401)
(nanjing,35,1727094793401)
(changzhou,32,1727094794401)
(suzhou,33,1727094795401)
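The Environments class used above is the author's own wrapper and is not part of Flink. For reference, a minimal, self-contained sketch of the same distributed-cache pattern on a plain StreamExecutionEnvironment might look like the following; the class name, file path and sample data are illustrative only.

java
import org.apache.flink.api.common.functions.RichMapFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.util.HashMap;
import java.util.Map;

public class MinimalCacheExample {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        // Ship the dimension file to every TaskManager under the name "district"
        env.registerCachedFile("/path/to/district.txt", "district");

        env.fromElements(1, 2, 3)
                .map(new RichMapFunction<Integer, Tuple2<Integer, String>>() {
                    private final Map<Integer, String> idName = new HashMap<>();

                    @Override
                    public void open(Configuration parameters) throws Exception {
                        // Each parallel instance reads its local copy once, in open()
                        File f = getRuntimeContext().getDistributedCache().getFile("district");
                        try (BufferedReader br = new BufferedReader(new FileReader(f))) {
                            String line;
                            while ((line = br.readLine()) != null) {
                                String[] parts = line.split("\\s+");
                                idName.put(Integer.valueOf(parts[0]), parts[1]);
                            }
                        }
                    }

                    @Override
                    public Tuple2<Integer, String> map(Integer id) {
                        return Tuple2.of(id, idName.getOrDefault(id, "unknown"));
                    }
                })
                .print();

        env.execute("minimal-cache-example");
    }
}

Because the file is read once per task in open(), this approach suits dimension data that is small and static; a changed file is only picked up after the job restarts.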

Approach 2: Broadcast variables
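Here the dimension data is itself a stream: broadcast(descriptor) sends every record of that stream to all parallel instances of the downstream operator, which store it in broadcast MapState described by a MapStateDescriptor. The fact stream is then connected to the broadcast stream and the join happens inside a BroadcastProcessFunction.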

Main code

java
package recovery;

import modules.env.Environments;
import modules.time.Timer;
import org.apache.flink.api.common.RuntimeExecutionMode;
import org.apache.flink.api.common.eventtime.SerializableTimestampAssigner;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.state.MapStateDescriptor;
import org.apache.flink.streaming.api.datastream.BroadcastStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.KeyedProcessFunction;
import org.apache.flink.streaming.api.functions.co.BroadcastProcessFunction;
import org.apache.flink.streaming.api.functions.sink.SinkFunction;
import org.apache.flink.streaming.api.functions.windowing.ProcessWindowFunction;
import org.apache.flink.streaming.api.windowing.assigners.WindowStagger;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
import org.apache.flink.util.Collector;
import scala.Tuple2;
import scala.Tuple3;

import java.util.Arrays;
import java.util.Iterator;
import java.util.concurrent.TimeUnit;

/**
 * Sending and reading broadcast variables
 * via a connected stream (connect)
 */
public class TestBroadcastConnect {
    public static void main(String[] args) throws Exception {
        // 1. Create the execution environment
        StreamExecutionEnvironment see = new Environments()
                .build()
                .enableCheckpoint("file:///D:/phase/flink_state_backend", 3, 1, 1)
                .enableRetries(3, 1)
                .enableStateBackend("hashmap", true, false)
                .finish(RuntimeExecutionMode.STREAMING, 1, 3);

        // 2. Broadcast variables
        MapStateDescriptor desc1 = new MapStateDescriptor("idCity", Integer.class, String.class); // state descriptor: ID -> city
        BroadcastStream<Tuple2> broadcastStream = see.fromCollection(Arrays.asList(
                // the dimension records to broadcast
                new Tuple2(1, "nanjing"),
                new Tuple2(2, "suzhou"),
                new Tuple2(3, "wuxi")
        )).broadcast(desc1); // broadcast stream

        // 3. Data: (ID, temperature, timestamp)
        see.fromCollection(Arrays.asList(
                        new Tuple3(1,34,System.currentTimeMillis()),
                        new Tuple3(2,36,System.currentTimeMillis()+1000),
                        new Tuple3(1,35,System.currentTimeMillis()+2000),
                        new Tuple3(3,32,System.currentTimeMillis()+3000),
                        new Tuple3(2,33,System.currentTimeMillis()+4000)
                ))
                .setParallelism(1)
                .assignTimestampsAndWatermarks(
                        WatermarkStrategy
                                .<Tuple3>forMonotonousTimestamps()
                                .withTimestampAssigner(
                                        (SerializableTimestampAssigner<Tuple3>) (element,recordTimestamp) -> (Long) element._3()
                                )
                )
                // 4. Connect the fact stream with the broadcast stream (yields a broadcast connected stream)
                .connect(broadcastStream)
                // 5. Replace each record's ID with the city name from the broadcast state (join on ID)
                .process(new BroadcastProcessFunction<Tuple3, Tuple2, Tuple3>() {
                    @Override
                    public void processElement(Tuple3 value, BroadcastProcessFunction<Tuple3, Tuple2, Tuple3>.ReadOnlyContext ctx, Collector<Tuple3> out) throws Exception {
                        Object v = ctx.getBroadcastState(desc1).get(value._1()); // read from broadcast state
                        out.collect(new Tuple3(v,value._2(),value._3()));
                    }
                    @Override
                    public void processBroadcastElement(Tuple2 value, BroadcastProcessFunction<Tuple3, Tuple2, Tuple3>.Context ctx, Collector<Tuple3> out) throws Exception {
                        ctx.getBroadcastState(desc1).put(value._1,value._2); // write into broadcast state
                    }
                })
                // 6. Business logic: average temperature per city
                .keyBy(t3->t3._1().toString())
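                // Timer.tumbling is the author's own helper (not shown); presumably it
                // builds a 5-second tumbling event-time window assigner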
                .window(Timer.tumbling(5,0,TimeUnit.SECONDS ,WindowStagger.NATURAL))
                .process(new ProcessWindowFunction<Tuple3, Tuple2, String, TimeWindow>() {
                    @Override
                    public void process(String city, ProcessWindowFunction<Tuple3, Tuple2, String, TimeWindow>.Context context, Iterable<Tuple3> elements, Collector<Tuple2> out) throws Exception {
                        float avg = 0.0f;
                        int count = 0;
                        Iterator<Tuple3> it = elements.iterator();
                        while(it.hasNext()){
                            count++;
                            avg += (Integer) it.next()._2();
                        }
                        avg /= count;
                        // emit the average temperature downstream
                        out.collect(new Tuple2(city,avg));
                    }
                })
                // equivalent to a print() sink
                .addSink(new SinkFunction<Tuple2>() {
                    @Override
                    public void invoke(Tuple2 value, Context context) throws Exception {
                        System.out.println(value);
                    }
                });

        see.execute("broadcast-connect");
    }
}

Output

txt
(nanjing,34.5)
(suzhou,36.0)
(wuxi,32.0)
(suzhou,33.0)
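The two examples above use raw scala.Tuple2/scala.Tuple3 types in Java code, which compiles but discards generic type information. For comparison, here is a minimal sketch of the same broadcast pattern with Flink's own typed tuples and a typed MapStateDescriptor; the class name and sample data are illustrative only.

java
import org.apache.flink.api.common.state.MapStateDescriptor;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.BroadcastStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.co.BroadcastProcessFunction;
import org.apache.flink.util.Collector;

public class MinimalBroadcastExample {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        // Typed descriptor: key = district ID, value = city name
        MapStateDescriptor<Integer, String> desc =
                new MapStateDescriptor<>("idCity", Types.INT, Types.STRING);

        // Dimension stream, broadcast to every parallel instance
        BroadcastStream<Tuple2<Integer, String>> dim = env
                .fromElements(Tuple2.of(1, "nanjing"), Tuple2.of(2, "suzhou"))
                .broadcast(desc);

        // Fact stream: (district ID, temperature)
        env.fromElements(Tuple2.of(1, 34), Tuple2.of(2, 36))
                .connect(dim)
                .process(new BroadcastProcessFunction<Tuple2<Integer, Integer>, Tuple2<Integer, String>, Tuple2<String, Integer>>() {
                    @Override
                    public void processElement(Tuple2<Integer, Integer> fact, ReadOnlyContext ctx,
                                               Collector<Tuple2<String, Integer>> out) throws Exception {
                        // The broadcast record may not have arrived yet, so the lookup can miss
                        String city = ctx.getBroadcastState(desc).get(fact.f0);
                        out.collect(Tuple2.of(city == null ? "unknown" : city, fact.f1));
                    }

                    @Override
                    public void processBroadcastElement(Tuple2<Integer, String> dimRecord, Context ctx,
                                                        Collector<Tuple2<String, Integer>> out) throws Exception {
                        ctx.getBroadcastState(desc).put(dimRecord.f0, dimRecord.f1);
                    }
                })
                .print();

        env.execute("minimal-broadcast-example");
    }
}

Note that Flink gives no ordering guarantee between the fact stream and the broadcast stream: a fact record can be processed before its matching dimension record arrives, in which case the lookup returns null. The sketch falls back to "unknown"; production jobs often buffer such records in state or pre-load the dimension data instead.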