Flink异步io关联Hbase

主程序

csharp 复制代码
    public static void main(String[] args) throws Exception {
        //1.获取流执行环境
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);
        SimpleDateFormat formatter = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");

        //设置动态参数
        ParameterTool propertiesargs = ParameterTool.fromArgs(args);
        String fileName = propertiesargs.get("CephConfPath");
        //从hdfs获取动态参数配置文件
        org.apache.hadoop.conf.Configuration conf = new org.apache.hadoop.conf.Configuration();
        FileSystem fs = FileSystem.get(URI.create(fileName), conf);
        fs.open(new org.apache.hadoop.fs.Path(fileName));
        ParameterTool propertiesFile = ParameterTool.fromPropertiesFile(fs.open(new org.apache.hadoop.fs.Path(fileName)).getWrappedStream());
        // 注册给环境变量(HBASE使用)
        env.getConfig().setGlobalJobParameters(propertiesFile);
        new CephConfig(propertiesFile);

        //2.设置CK&状态后端
        env.setStateBackend(new FsStateBackend(FSSTATEBACKEND));
        env.enableCheckpointing(10000);// 每 ** ms 开始一次 checkpoint
        env.getCheckpointConfig().setCheckpointingMode(CheckpointingMode.EXACTLY_ONCE);// 设置模式为精确一次
        env.getCheckpointConfig().setCheckpointTimeout(100000);// Checkpoint 必须在** ms内完成,否则就会被抛弃
        env.getCheckpointConfig().setMaxConcurrentCheckpoints(2);// 同一时间只允许一个 checkpoint 进行
        env.getCheckpointConfig().setMinPauseBetweenCheckpoints(3000);// 确认 checkpoints 之间的时间会进行 ** ms
        env.getCheckpointConfig().setTolerableCheckpointFailureNumber(5);
        env.setRestartStrategy(RestartStrategies.fixedDelayRestart(3, Time.of(10,TimeUnit.SECONDS)));//重启策略:重启3次,间隔10s

        //3.从kafka中读取日志信息,将将每行数据转换为JavaBean对象 主流
        DataStreamSource<String> dataStream = env.addSource(KafkaUtils.getKafkaSource(KAFKA_SOURCE_TOPIC, KAFKA_SOURCE_GROUP));
        ............
        //8.读取HBase中user表,进行维度关联
        SingleOutputStreamOperator<CephAccessRecord> record = AsyncDataStream.unorderedWait(
                validDS,
                new DimAsyncFunction<CephAccessRecord>() {
                    @Override
                    public String getKey(CephAccessRecord record) {
                        return record.access_key;
                    }
                },
                60, TimeUnit.SECONDS);
        BucketAssigner<String, String> assigner = new DateTimeBucketAssigner<>("yyyy-MM-dd", ZoneId.of("Asia/Shanghai"));
        StreamingFileSink<String> fileSink = StreamingFileSink.<String>forRowFormat(
                new Path(HDFS_FILE_PATH),
                new SimpleStringEncoder<>("UTF-8"))
                .withRollingPolicy(
                        DefaultRollingPolicy.builder()
                                .withRolloverInterval(TimeUnit.DAYS.toMillis(1))//至少包含 20 分钟的数据
                                .withInactivityInterval(TimeUnit.DAYS.toMillis(1 ))//最近 20 分钟没有收到新的数据
                                .withMaxPartSize(1024 * 1024 * 1024)//文件大小已达到 1 GB
                                .build())
                .withBucketAssigner(assigner)
                .build();

        // 将record-->过滤上传数据-->转换成jsonstring-->写入到hdfs
//        allDataDS.filter(log->log.event_type.equals("upload")).map(line->JSON.toJSONString(line)).addSink(fileSink);
        dataStream.map(line->JSON.toJSONString(line)).addSink(fileSink);

        //10.流环境执行
        env.execute();

异步关联程序

java 复制代码
package com.data.ceph.function;

import org.apache.commons.beanutils.BeanUtils;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.functions.async.ResultFuture;
import org.apache.flink.streaming.api.functions.async.RichAsyncFunction;
import org.apache.hadoop.hbase.*;
import org.apache.hadoop.hbase.client.*;
import org.apache.hadoop.hbase.security.User;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.security.UserGroupInformation;

import java.util.Collections;
import java.util.Map;

public abstract class DimAsyncFunction<T> extends RichAsyncFunction<T, T> implements DimAsyncJoinFunction<T> {

    private org.apache.hadoop.hbase.client.Connection connection = null;
    private ResultScanner rs = null;
    private Table table = null;

    @Override
    public void open(Configuration parameters) throws Exception {
        //不启用安全认证
        System.setProperty("zookeeper.sasl.client", "false");
        Map<String, String> stringStringMap = getRuntimeContext().getExecutionConfig().getGlobalJobParameters().toMap();
        String hbase = stringStringMap.get("hbase_zookeeper_quorum");
        org.apache.hadoop.conf.Configuration hconf = HBaseConfiguration.create();
        hconf.set(HConstants.ZOOKEEPER_QUORUM, "172.16.23.37,172.16.23.38,172.16.23.39");
//        hconf.set(HConstants.ZOOKEEPER_QUORUM, hbase);
        hconf.set(HConstants.ZOOKEEPER_CLIENT_PORT, "2181");
        hconf.set(HConstants.ZOOKEEPER_ZNODE_PARENT, "/hbase");

        //指定用户名为hbase的用户去访问hbase服务
        UserGroupInformation userGroupInformation = UserGroupInformation.createRemoteUser("hive");
        connection = ConnectionFactory.createConnection(hconf, User.create(userGroupInformation));
        table = connection.getTable(TableName.valueOf("cloud:user_info"));
    }


    @Override
    public void asyncInvoke(T input, ResultFuture<T> resultFuture) throws Exception {
        Get get = new Get(Bytes.toBytes(getKey(input)));
        Result rs = table.get(get);
        for (Cell cell : rs.rawCells()) {
            String column = Bytes.toString(cell.getQualifierArray(), cell.getQualifierOffset(), cell.getQualifierLength());
            String value = Bytes.toString(cell.getValueArray(), cell.getValueOffset(), cell.getValueLength());
            BeanUtils.setProperty(input, column, value);
        }
        resultFuture.complete(Collections.singletonList(input));
    }
    @Override
    public void close() throws Exception {
        if (rs != null) rs.close();
        if (table != null) table.close();
        if (connection != null) connection.close();
    }
    @Override
    public void timeout(T input, ResultFuture<T> resultFuture) throws Exception {
        System.out.println("TimeOut:" + input);
    }
}
相关推荐
字节跳动数据平台24 分钟前
代码量减少 70%、GPU 利用率达 95%:火山引擎多模态数据湖如何释放模思智能的算法生产力
大数据
得物技术2 小时前
深入剖析Spark UI界面:参数与界面详解|得物技术
大数据·后端·spark
大大大大晴天3 小时前
Flink生产问题排障-HBase NotServingRegionException
flink·hbase
武子康3 小时前
大数据-238 离线数仓 - 广告业务 Hive分析实战:ADS 点击率、购买率与 Top100 排名避坑
大数据·后端·apache hive
武子康1 天前
大数据-237 离线数仓 - Hive 广告业务实战:ODS→DWD 事件解析、广告明细与转化分析落地
大数据·后端·apache hive
大大大大晴天1 天前
Flink生产问题排障-Kryo serializer scala extensions are not available
大数据·flink
武子康3 天前
大数据-236 离线数仓 - 会员指标验证、DataX 导出与广告业务 ODS/DWD/ADS 全流程
大数据·后端·apache hive
武子康4 天前
大数据-235 离线数仓 - 实战:Flume+HDFS+Hive 搭建 ODS/DWD/DWS/ADS 会员分析链路
大数据·后端·apache hive
DianSan_ERP5 天前
电商API接口全链路监控:构建坚不可摧的线上运维防线
大数据·运维·网络·人工智能·git·servlet
够快云库5 天前
能源行业非结构化数据治理实战:从数据沼泽到智能资产
大数据·人工智能·机器学习·企业文件安全