flinkSql中累计窗口CUMULATE

eventTime

复制代码
package com.bigdata.day08;


import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;


public class _05_flinkSql_Cumulate_eventTime {
    /**
     * 累积窗口 + eventTime
     * 1 分钟 每十秒计算一次 3秒水印
     * 数据格式
     * {"username":"zs","price":20,"event_time":"2023-07-18 12:12:43.000"}
     * {"username":"zs","price":20,"event_time":"2023-07-18 12:12:53.000"}
     * {"username":"zs","price":20,"event_time":"2023-07-18 12:13:03.000"}
     * {"username":"zs","price":20,"event_time":"2023-07-18 12:13:13.000"}
     */

    public static void main(String[] args) throws Exception {

        //1. env-准备环境
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);
        StreamTableEnvironment tenv = StreamTableEnvironment.create(env);

        //2. 创建表
        tenv.executeSql("CREATE TABLE table1 (\n" +
                "  `username` String,\n" +
                "  `price` int,\n" +
                "  `event_time` TIMESTAMP(3),\n" +
                "   watermark for event_time as event_time - interval '3' second\n" +
                ") WITH (\n" +
                "  'connector' = 'kafka',\n" +
                "  'topic' = 'topic1',\n" +
                "  'properties.bootstrap.servers' = 'bigdata01:9092,bigdata02:9092,bigdata03:9092',\n" +
                "  'properties.group.id' = 'testGroup1',\n" +
                "  'scan.startup.mode' = 'latest-offset',\n" +
                "  'format' = 'json'\n" +
                ")");
        //3. 通过sql语句统计结果

        tenv.executeSql("select \n" +
                "   window_start,\n" +
                "   window_end,\n" +
                "   username,\n" +
                "   count(1) zongNum,\n" +
                "   sum(price) totalMoney \n" +
                "   from table(CUMULATE(TABLE table1, DESCRIPTOR(event_time), INTERVAL '10' second ,INTERVAL '60' second))\n" +
                "group by window_start,window_end,username").print();
        //4. sink-数据输出



        //5. execute-执行
        env.execute();
    }
}

processTime

复制代码
package com.bigdata.day08;


import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;


public class _06_flinkSql_Cumulate_processTime {
    /**
     * 累积窗口 + processTime
     * 1 分钟 每十秒计算一次
     * 数据格式
     * {"username":"zs","price":20}
     * {"username":"lisi","price":15}
     * {"username":"lisi","price":20}
     * {"username":"zs","price":20}
     * {"username":"zs","price":20}
     * {"username":"zs","price":20}
     * {"username":"zs","price":20}
     */

    public static void main(String[] args) throws Exception {

        //1. env-准备环境
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);
        StreamTableEnvironment tenv = StreamTableEnvironment.create(env);

        //2. 创建表
        tenv.executeSql("CREATE TABLE table1 (\n" +
                "  `username` String,\n" +
                "  `price` int,\n" +
                "  `event_time` as proctime()\n" +
                ") WITH (\n" +
                "  'connector' = 'kafka',\n" +
                "  'topic' = 'topic1',\n" +
                "  'properties.bootstrap.servers' = 'bigdata01:9092,bigdata02:9092,bigdata03:9092',\n" +
                "  'properties.group.id' = 'testGroup1',\n" +
                "  'scan.startup.mode' = 'latest-offset',\n" +
                "  'format' = 'json'\n" +
                ")");
        //3. 通过sql语句统计结果

        tenv.executeSql("select \n" +
                "   window_start,\n" +
                "   window_end,\n" +
                "   username,\n" +
                "   count(1) zongNum,\n" +
                "   sum(price) totalMoney \n" +
                "   from table(CUMULATE(TABLE table1, DESCRIPTOR(event_time), INTERVAL '10' second ,INTERVAL '60' second))\n" +
                "group by window_start,window_end,username").print();
        //4. sink-数据输出



        //5. execute-执行
        env.execute();
    }
}

topN案例

复制代码
需求:在每个分钟内找出点击量最多的Top 3网页。 

滚动窗口(1分钟)+eventTime+3秒水印

hive sql

with t1 as (
        select page_id,sum(clicks)  totalSum  
                from  table1
                        group by page_id
), t2 as(
        select page_id,totalSum,
         row_number() over ( order by totalSum desc) px 
                from t1 
) select  * from t2 where px <=3


flink sql

with t1 as (
        select window_start,window_end,page_id,sum(clicks)  totalSum  
                from table(tumble(table table1,DESCRIPTOR(event_time), INTERVAL '60' second )) 
                        group by window_start,window_end,page_id
), t2 as(
        select window_start,window_end,page_id,totalSum,
        row_number() over (partition by window_start,window_end order by totalSum desc) px 
                from t1 
) select  * from t2 where px <=3


* 数据格式
{"ts": "2023-09-05 12:00:10", "page_id": 1, "clicks": 100}
{"ts": "2023-09-05 12:00:20", "page_id": 2, "clicks": 90}
{"ts": "2023-09-05 12:00:30", "page_id": 3, "clicks": 110}
{"ts": "2023-09-05 12:00:40", "page_id": 4, "clicks": 23}
{"ts": "2023-09-05 12:00:50", "page_id": 5, "clicks": 456}
{"ts": "2023-09-05 12:00:55", "page_id": 5, "clicks": 456}
// 触发数据
{"ts": "2023-09-05 12:01:03", "page_id": 5, "clicks": 456}

package com.bigdata.day08;


import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;


public class _07_flinkSql_topN {


    public static void main(String[] args) throws Exception {

        //1. env-准备环境
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);
        StreamTableEnvironment tenv = StreamTableEnvironment.create(env);

        //2. 创建表

        //3. 通过sql语句统计结果

        tenv.executeSql("CREATE TABLE table1 (\n" +
                "    `page_id` INT,\n" +
                "    `clicks` INT,\n" +
                "  `ts` TIMESTAMP(3) ,\n" +
                "   watermark for ts as ts - interval '3' second \n" +
                ") WITH (\n" +
                "  'connector' = 'kafka',\n" +
                "  'topic' = 'topic1',\n" +
                "  'properties.bootstrap.servers' = 'bigdata01:9092,bigdata02:9092,bigdata03:9092',\n" +
                "  'properties.group.id' = 'testGroup1',\n" +
                "  'scan.startup.mode' = 'latest-offset',\n" +
                "  'format' = 'json'\n" +
                ")");

        tenv.executeSql("with t1 as (\n" +
                "\tselect window_start,window_end,page_id,sum(clicks)  totalSum  from table(tumble(table table1,DESCRIPTOR(ts), INTERVAL '60' second )) group by window_start,window_end,page_id\n" +
                "), t2 as(\n" +
                "\tselect window_start,window_end,page_id,totalSum,row_number() over (partition by window_start,window_end order by totalSum desc) px from t1 \n" +
                ") select  * from t2 where px <=3").print();
        //4. sink-数据输出


        //5. execute-执行
        env.execute();
    }
}
相关推荐
Loving_enjoy33 分钟前
基于Hadoop的明星社交媒体影响力数据挖掘平台:设计与实现
大数据·hadoop·数据挖掘
浮尘笔记40 分钟前
go-zero使用elasticsearch踩坑记:时间存储和展示问题
大数据·elasticsearch·golang·go
碳基学AI2 小时前
哈尔滨工业大学DeepSeek公开课:探索大模型原理、技术与应用从GPT到DeepSeek|附视频与讲义免费下载方法
大数据·人工智能·python·gpt·算法·语言模型·集成学习
一个天蝎座 白勺 程序猿3 小时前
大数据(4.6)Hive执行引擎选型终极指南:MapReduce/Tez/Spark性能实测×万亿级数据资源配置公式
大数据·hive·mapreduce
HelpHelp同学4 小时前
信息混乱难查找?三步搭建高效帮助中心解决难题
大数据·人工智能·知识库管理系统
TDengine (老段)10 小时前
TDengine 中的关联查询
大数据·javascript·网络·物联网·时序数据库·tdengine·iotdb
直裾14 小时前
Mapreduce的使用
大数据·数据库·mapreduce
麻芝汤圆16 小时前
使用 MapReduce 进行高效数据清洗:从理论到实践
大数据·linux·服务器·网络·数据库·windows·mapreduce
树莓集团17 小时前
树莓集团海南落子:自贸港布局的底层逻辑
大数据
不剪发的Tony老师17 小时前
Hue:一个大数据查询工具
大数据