Flink学习连载文章13--FlinkSQL高级部分

eventTime

测试数据如下:

复制代码
{"username":"zs","price":20,"event_time":"2023-07-17 10:10:10"}
{"username":"zs","price":15,"event_time":"2023-07-17 10:10:30"}
{"username":"zs","price":20,"event_time":"2023-07-17 10:10:40"}
{"username":"zs","price":20,"event_time":"2023-07-17 10:11:03"}
{"username":"zs","price":20,"event_time":"2023-07-17 10:11:04"}
{"username":"zs","price":20,"event_time":"2023-07-17 10:12:04"}
{"username":"zs","price":20,"event_time":"2023-07-17 11:12:04"}
{"username":"zs","price":20,"event_time":"2023-07-17 11:12:04"}
{"username":"zs","price":20,"event_time":"2023-07-17 12:12:04"}
{"username":"zs","price":20,"event_time":"2023-07-18 12:12:04"}

需求:每隔1分钟统计这1分钟的每个用户的总消费金额和消费次数

需要用到滚动窗口

编写好sql:

复制代码
-- Kafka-backed source table used by the event-time window examples.
-- event_time is declared as the event-time attribute; its watermark trails
-- the observed timestamps by 3 seconds.
CREATE TABLE table1 (
  `username`   STRING,
  `price`      INT,
  `event_time` TIMESTAMP(3),
  WATERMARK FOR event_time AS event_time - INTERVAL '3' SECOND
) WITH (
  'connector'                    = 'kafka',
  'topic'                        = 'topic1',
  'properties.bootstrap.servers' = 'bigdata01:9092',
  'properties.group.id'          = 'g1',
  'scan.startup.mode'            = 'latest-offset',
  'format'                       = 'json'
);

编写sql:
-- For every 1-minute tumbling window, report each user's number of purchases
-- (zongNum) and total amount spent (totalMoney).
SELECT
    window_start,
    window_end,
    username,
    COUNT(1)   zongNum,
    SUM(price) totalMoney
FROM TABLE(
    TUMBLE(TABLE table1, DESCRIPTOR(event_time), INTERVAL '60' SECOND)
)
GROUP BY
    window_start,
    window_end,
    username;

分享一个错误:

Exception in thread "main" org.apache.flink.table.api.ValidationException: SQL validation failed. The window function TUMBLE(TABLE table_name, DESCRIPTOR(timecol), datetime interval) requires the timecol is a time attribute type, but is VARCHAR(2147483647).

at org.apache.flink.table.planner.calcite.FlinkPlannerImpl.orgapacheflinktableplannercalciteFlinkPlannerImpl$$validate(FlinkPlannerImpl.scala:156)

at org.apache.flink.table.planner.calcite.FlinkPlannerImpl.validate(FlinkPlannerImpl.scala:107)

说明创建窗口的时候,使用的字段不是时间字段,需要写成时间字段TIMESTAMP(3),使用了eventtime需要添加水印,否则报错。

需求:按照滚动窗口和EventTime进行统计,每隔1分钟统计每个人的消费总额是多少

复制代码
package com.bigdata.day08;

import org.apache.flink.api.common.RuntimeExecutionMode;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;

/**
 * Event-time tumbling-window demo: for every 1-minute window, prints each
 * user's purchase count and total spend read from the Kafka topic {@code topic1}.
 *
 * <p>Program: FlinkDemo. Created: 2023-11-28 14:12:28.
 *
 * @author 闫哥
 */
public class _03EventTimeGunDongWindowDemo {

    /**
     * Registers the Kafka source table and prints the windowed aggregation.
     *
     * @param args command-line arguments (unused)
     * @throws Exception kept in the signature for compatibility with the original demo
     */
    public static void main(String[] args) throws Exception {

        // 1. Prepare the streaming environment and the table environment built on it.
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);
        StreamTableEnvironment tenv = StreamTableEnvironment.create(env);

        // 2. Create the source table. event_time must be a TIMESTAMP column with a
        //    watermark so that it can serve as the window's time attribute.
        String createTableSql =
                "CREATE TABLE table1 (\n" +
                "  `username` String,\n" +
                "  `price` int,\n" +
                "  `event_time` TIMESTAMP(3),\n" +
                "   watermark for event_time as event_time - interval '3' second\n" +
                ") WITH (\n" +
                "  'connector' = 'kafka',\n" +
                "  'topic' = 'topic1',\n" +
                "  'properties.bootstrap.servers' = 'bigdata01:9092',\n" +
                "  'properties.group.id' = 'testGroup1',\n" +
                "  'scan.startup.mode' = 'group-offsets',\n" +
                "  'format' = 'json'\n" +
                ")";
        tenv.executeSql(createTableSql);

        // 3. Aggregate per user over 1-minute tumbling windows and print the result.
        String windowQuerySql =
                "select \n" +
                "   window_start,\n" +
                "   window_end,\n" +
                "   username,\n" +
                "   count(1) zongNum,\n" +
                "   sum(price) totalMoney \n" +
                "   from table(TUMBLE(TABLE table1, DESCRIPTOR(event_time), INTERVAL '60' second))\n" +
                "group by window_start,window_end,username";
        tenv.executeSql(windowQuerySql).print();

        // No env.execute() here: executeSql(...) already submits the job, and no
        // DataStream operators were added to env, so env.execute() would fail with
        // "No operators defined in streaming topology. Cannot execute."
    }
}

统计结果如下:

测试一下滑动窗口,每隔10秒钟,计算前1分钟的数据:

复制代码
package com.bigdata.day08;

import org.apache.flink.api.common.RuntimeExecutionMode;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;

/**
 * Event-time sliding-window (HOP) demo: every 10 seconds, prints each user's
 * purchase count and total spend over the preceding 1 minute.
 *
 * <p>Program: FlinkDemo. Created: 2023-11-28 14:12:28.
 *
 * <p>NOTE(review): this class name is also used by the tumbling-window demo in
 * the same package; rename one of them before compiling both together.
 *
 * @author 闫哥
 */
public class _03EventTimeGunDongWindowDemo {

    /**
     * Registers the Kafka source table and prints the sliding-window aggregation.
     *
     * @param args command-line arguments (unused)
     * @throws Exception kept in the signature for compatibility with the original demo
     */
    public static void main(String[] args) throws Exception {

        // 1. Prepare the streaming environment and the table environment built on it.
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);
        StreamTableEnvironment tenv = StreamTableEnvironment.create(env);

        // 2. Create the source table with event_time as the watermarked time attribute.
        String createTableSql =
                "CREATE TABLE table1 (\n" +
                "  `username` String,\n" +
                "  `price` int,\n" +
                "  `event_time` TIMESTAMP(3),\n" +
                "   watermark for event_time as event_time - interval '3' second\n" +
                ") WITH (\n" +
                "  'connector' = 'kafka',\n" +
                "  'topic' = 'topic1',\n" +
                "  'properties.bootstrap.servers' = 'bigdata01:9092',\n" +
                "  'properties.group.id' = 'testGroup1',\n" +
                "  'scan.startup.mode' = 'group-offsets',\n" +
                "  'format' = 'json'\n" +
                ")";
        tenv.executeSql(createTableSql);

        // 3. HOP takes the slide (10 s) first and the window size (60 s) second.
        String windowQuerySql =
                "select \n" +
                "   window_start,\n" +
                "   window_end,\n" +
                "   username,\n" +
                "   count(1) zongNum,\n" +
                "   sum(price) totalMoney \n" +
                "   from table(HOP(TABLE table1, DESCRIPTOR(event_time), INTERVAL '10' second,INTERVAL '60' second))\n" +
                "group by window_start,window_end,username";
        tenv.executeSql(windowQuerySql).print();

        // No env.execute() here: executeSql(...) already submits the job, and no
        // DataStream operators were added to env, so env.execute() would fail with
        // "No operators defined in streaming topology. Cannot execute."
    }
}

结果如图所示:

再测试一下累积窗口(CUMULATE):步长为1小时,最大窗口为1天,即每隔1小时输出一次从窗口起点累计到当前的统计结果:

复制代码
package com.bigdata.day08;

import org.apache.flink.api.common.RuntimeExecutionMode;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;

/**
 * Event-time cumulative-window (CUMULATE) demo: with a 1-hour step and a 1-day
 * maximum window size, prints each user's purchase count and total spend
 * accumulated since the start of the current window.
 *
 * <p>Program: FlinkDemo. Created: 2023-11-28 14:12:28.
 *
 * <p>NOTE(review): this class name is also used by the tumbling-window demo in
 * the same package; rename one of them before compiling both together.
 *
 * @author 闫哥
 */
public class _03EventTimeGunDongWindowDemo {

    /**
     * Registers the Kafka source table and prints the cumulative-window aggregation.
     *
     * @param args command-line arguments (unused)
     * @throws Exception kept in the signature for compatibility with the original demo
     */
    public static void main(String[] args) throws Exception {

        // 1. Prepare the streaming environment and the table environment built on it.
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);
        StreamTableEnvironment tenv = StreamTableEnvironment.create(env);

        // 2. Create the source table with event_time as the watermarked time attribute.
        String createTableSql =
                "CREATE TABLE table1 (\n" +
                "  `username` String,\n" +
                "  `price` int,\n" +
                "  `event_time` TIMESTAMP(3),\n" +
                "   watermark for event_time as event_time - interval '3' second\n" +
                ") WITH (\n" +
                "  'connector' = 'kafka',\n" +
                "  'topic' = 'topic1',\n" +
                "  'properties.bootstrap.servers' = 'bigdata01:9092',\n" +
                "  'properties.group.id' = 'testGroup1',\n" +
                "  'scan.startup.mode' = 'group-offsets',\n" +
                "  'format' = 'json'\n" +
                ")";
        tenv.executeSql(createTableSql);

        // 3. CUMULATE takes the step (1 hour) first and the maximum size (1 day) second.
        String windowQuerySql =
                "select \n" +
                "   window_start,\n" +
                "   window_end,\n" +
                "   username,\n" +
                "   count(1) zongNum,\n" +
                "   sum(price) totalMoney \n" +
                "   from table(CUMULATE(TABLE table1, DESCRIPTOR(event_time), INTERVAL '1' hours,INTERVAL '1' days))\n" +
                "group by window_start,window_end,username";
        tenv.executeSql(windowQuerySql).print();

        // No env.execute() here: executeSql(...) already submits the job, and no
        // DataStream operators were added to env, so env.execute() would fail with
        // "No operators defined in streaming topology. Cannot execute."
    }
}

累积窗口演示效果:

processTime

测试数据:

复制代码
{"username":"zs","price":20}
{"username":"lisi","price":15}
{"username":"lisi","price":20}
{"username":"zs","price":20}
{"username":"zs","price":20}
{"username":"zs","price":20}
{"username":"zs","price":20}

/**
 * 滚动窗口大小1分钟(基于处理时间,不需要设置水印延迟)
 *
 * {"username":"zs","price":20}
 * {"username":"lisi","price":15}
 * {"username":"lisi","price":20}
 * {"username":"zs","price":20}
 * {"username":"zs","price":20}
 * {"username":"zs","price":20}
 * {"username":"zs","price":20}
 *
 */
package com.bigdata.day08;

import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;

/**
 * Processing-time tumbling-window demo: for every 1-minute window, prints each
 * user's purchase count and total spend read from the Kafka topic {@code topic1}.
 *
 * <p>Program: FlinkDemo. Created: 2023-11-28 14:12:28.
 *
 * @author 闫哥
 */
public class _04ProcessingTimeGunDongWindowDemo {

    /**
     * Registers the Kafka source table and prints the windowed aggregation.
     *
     * @param args command-line arguments (unused)
     * @throws Exception kept in the signature for compatibility with the original demo
     */
    public static void main(String[] args) throws Exception {

        // 1. Prepare the streaming environment and the table environment built on it.
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);
        StreamTableEnvironment tenv = StreamTableEnvironment.create(env);

        // 2. Create the source table. event_time is a computed proctime() column,
        //    so the table declares no watermark.
        String createTableSql =
                "CREATE TABLE table1 (\n" +
                "  `username` String,\n" +
                "  `price` int,\n" +
                "  `event_time` as proctime()\n" +
                ") WITH (\n" +
                "  'connector' = 'kafka',\n" +
                "  'topic' = 'topic1',\n" +
                "  'properties.bootstrap.servers' = 'bigdata01:9092',\n" +
                "  'properties.group.id' = 'testGroup1',\n" +
                "  'scan.startup.mode' = 'group-offsets',\n" +
                "  'format' = 'json'\n" +
                ")";
        tenv.executeSql(createTableSql);

        // 3. Aggregate per user over 1-minute tumbling windows and print the result.
        String windowQuerySql =
                "select \n" +
                "   window_start,\n" +
                "   window_end,\n" +
                "   username,\n" +
                "   count(1) zongNum,\n" +
                "   sum(price) totalMoney \n" +
                "   from table(TUMBLE(TABLE table1, DESCRIPTOR(event_time), INTERVAL '60' second ))\n" +
                "group by window_start,window_end,username";
        tenv.executeSql(windowQuerySql).print();

        // No env.execute() here: executeSql(...) already submits the job, and no
        // DataStream operators were added to env, so env.execute() would fail with
        // "No operators defined in streaming topology. Cannot execute."
    }
}

计算结果:

结果需要等1分钟,才能显示出来,不要着急!

窗口分为滚动和滑动,时间分为事件时间和处理时间,两两组合,4个案例。

以下是滑动窗口+处理时间:

复制代码
package com.bigdata.sql;

import org.apache.flink.api.common.RuntimeExecutionMode;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;

/**
 * Processing-time sliding-window (HOP) demo: every 10 seconds, prints each
 * user's purchase count and total spend over the preceding 1 minute.
 *
 * <p>Program: FlinkDemo. Created: 2024-11-29 14:28:19.
 *
 * @author 闫哥
 */
public class _04_FlinkSQLProcessTime_HOP {

    /**
     * Registers the Kafka source table and prints the sliding-window aggregation.
     *
     * @param args command-line arguments (unused)
     * @throws Exception kept in the signature for compatibility with the original demo
     */
    public static void main(String[] args) throws Exception {

        // 1. Prepare the streaming environment, then derive the table environment from it.
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setRuntimeMode(RuntimeExecutionMode.AUTOMATIC);
        StreamTableEnvironment tEnv = StreamTableEnvironment.create(env);

        // 2. Create the source table. event_time is a computed proctime() column,
        //    so the table declares no watermark.
        String createTableSql =
                "CREATE TABLE table1 (\n" +
                "  `username` string,\n" +
                "  `price` int,\n" +
                "  `event_time` as proctime() \n" +
                ") WITH (\n" +
                "  'connector' = 'kafka',\n" +
                "  'topic' = 'topic1',\n" +
                "  'properties.bootstrap.servers' = 'bigdata01:9092',\n" +
                "  'properties.group.id' = 'g1',\n" +
                "  'scan.startup.mode' = 'latest-offset',\n" +
                "  'format' = 'json'\n" +
                ")";
        tEnv.executeSql(createTableSql);

        // 3. HOP takes the slide (10 s) first and the window size (60 s) second.
        //    The statement passed to executeSql must not end with ';'.
        String windowQuerySql =
                "select \n" +
                "   window_start,\n" +
                "   window_end,\n" +
                "   username,\n" +
                "   count(1) zongNum,\n" +
                "   sum(price) totalMoney \n" +
                "   from table(HOP(TABLE table1, DESCRIPTOR(event_time),INTERVAL '10' second, INTERVAL '60' second))\n" +
                "group by window_start,window_end,username";
        tEnv.executeSql(windowQuerySql).print();

        // No env.execute() here: executeSql(...) already submits the job, and no
        // DataStream operators were added to env, so env.execute() would fail with
        // "No operators defined in streaming topology. Cannot execute."
    }
}

测试时假如你的控制台不出数据,触发不了,请进行如下操作:

1、重新创建一个新的 topic,分区数为 1

2、kafka 对接的 server,写全 bigdata01:9092,bigdata02:9092,bigdata03:9092

二、窗口TopN(不是新的技术)

需求:在每个小时内找出点击量最多的Top 3网页。

复制代码
测试数据
{"ts": "2023-09-05 12:00:00", "page_id": 1, "clicks": 100}
{"ts": "2023-09-05 12:01:00", "page_id": 2, "clicks": 90}
{"ts": "2023-09-05 12:10:00", "page_id": 3, "clicks": 110}
{"ts": "2023-09-05 12:20:00", "page_id": 4, "clicks": 23}
{"ts": "2023-09-05 12:30:00", "page_id": 5, "clicks": 456}
{"ts": "2023-09-05 13:10:00", "page_id": 5, "clicks": 456}

假如没有每隔1小时的需求,仅仅是统计点击量最多的Top 3网页,SQL 写法如下:
-- Global Top 3 pages by total clicks, without any time window:
-- sum clicks per page, rank pages by that sum, keep ranks 1 to 3.
SELECT *
FROM (
    SELECT
        page_id,
        totalSum,
        ROW_NUMBER() OVER (ORDER BY totalSum DESC) px
    FROM (
        SELECT
            page_id,
            SUM(clicks) totalSum
        FROM kafka_page_clicks
        GROUP BY page_id
    )
)
WHERE px <= 3;

根据以上代码,添加滚动窗口的写法:

复制代码
-- Total clicks per page within each 1-hour tumbling window.
SELECT
    window_start,
    window_end,
    page_id,
    SUM(clicks) totalSum
FROM TABLE(
    TUMBLE(TABLE kafka_page_clicks, DESCRIPTOR(ts), INTERVAL '1' HOUR)
)
GROUP BY
    window_start,
    window_end,
    page_id;


在这个基础之上添加排名的写法:
-- Top 3 pages by clicks within each 1-hour tumbling window.
-- Innermost query: clicks per page per window.
-- Middle query: rank the pages inside each window by those clicks.
-- Outer query: keep ranks 1 to 3.
SELECT
    window_start,
    window_end,
    page_id,
    pm
FROM (
    SELECT
        window_start,
        window_end,
        page_id,
        ROW_NUMBER() OVER (
            PARTITION BY window_start, window_end
            ORDER BY totalSum DESC
        ) pm
    FROM (
        SELECT
            window_start,
            window_end,
            page_id,
            SUM(clicks) totalSum
        FROM TABLE(
            TUMBLE(TABLE kafka_page_clicks, DESCRIPTOR(ts), INTERVAL '1' HOUR)
        )
        GROUP BY
            window_start,
            window_end,
            page_id
    ) per_page
) ranked
WHERE pm <= 3;

编写建表语句:

复制代码
{"ts": "2023-09-05 12:00:00", "page_id": 1, "clicks": 100}

-- Kafka-backed source table of page-click events.
-- ts is the event-time attribute; its watermark trails the observed
-- timestamps by 3 seconds.
CREATE TABLE kafka_page_clicks (
  `ts`      TIMESTAMP(3),
  `page_id` INT,
  `clicks`  INT,
  WATERMARK FOR ts AS ts - INTERVAL '3' SECOND
) WITH (
  'connector'                    = 'kafka',
  'topic'                        = 'topic1',
  'properties.bootstrap.servers' = 'bigdata01:9092',
  'properties.group.id'          = 'g1',
  'scan.startup.mode'            = 'latest-offset',
  'format'                       = 'json'
)

package com.bigdata.day08;

import org.apache.flink.api.common.RuntimeExecutionMode;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;

/**
 * Window Top-N demo: for each 1-hour tumbling window, prints the three pages
 * with the highest total click count read from the Kafka topic {@code topic1}.
 *
 * <p>Program: FlinkDemo. Created: 2023-11-28 15:23:46.
 *
 * @author 闫哥
 */
public class _05TopNDemo {

    /**
     * Registers the Kafka source table and prints the per-window Top 3 pages.
     *
     * @param args command-line arguments (unused)
     * @throws Exception kept in the signature for compatibility with the original demo
     */
    public static void main(String[] args) throws Exception {

        // 1. Prepare the streaming environment and the table environment built on it.
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        // IDE tip: Ctrl+Y deletes the current line, Ctrl+D duplicates it.
        StreamTableEnvironment tenv = StreamTableEnvironment.create(env);

        // 2. Create the source table.
        //    ts carries a full date-time value, so it must be declared as TIMESTAMP;
        //    otherwise the WATERMARK clause is rejected. Because event time is used,
        //    a WATERMARK must be declared.
        //    'properties.group.id' is set explicitly, as in the other demos:
        //    'group-offsets' startup reads the committed offsets of a consumer group.
        String createTableSql =
                "CREATE TABLE kafka_page_clicks (" +
                "    `ts` TIMESTAMP(3),\n" +
                "    page_id INT,\n" +
                "    clicks INT,\n" +
                "  WATERMARK FOR ts AS ts - INTERVAL '10' SECOND \n" +
                ") WITH (\n" +
                "    'connector' = 'kafka',\n" +
                "    'topic' = 'topic1',\n" +
                "    'properties.bootstrap.servers' = 'bigdata01:9092',\n" +
                "    'properties.group.id' = 'g1',\n" +
                "   'scan.startup.mode' = 'group-offsets',\n" +
                "    'format' = 'json'\n" +
                ")";
        tenv.executeSql(createTableSql);

        // 3. Innermost query sums clicks per page per window, the middle query ranks
        //    pages inside each window, and the outer query keeps ranks 1 to 3.
        String topNSql =
                "select \n" +
                "   window_start,\n" +
                "   window_end,\n" +
                "   page_id,\n" +
                "   pm\n" +
                "  from   (\n" +
                "select \n" +
                "    window_start,\n" +
                "    window_end,\n" +
                "    page_id,\n" +
                "    row_number() over(partition by window_start,window_end order by totalSum desc ) pm\n" +
                "  from (\n" +
                "select \n" +
                "    window_start,\n" +
                "    window_end,\n" +
                "    page_id,\n" +
                "    sum(clicks) totalSum  \n" +
                "    from \n" +
                "   table ( \n" +
                "     tumble( table kafka_page_clicks, descriptor(ts), INTERVAL '1' HOUR ) \n" +
                "         ) \n" +
                "    group by window_start,window_end,page_id ) t2 ) t1  where pm <= 3";
        tenv.executeSql(topNSql).print();

        // No env.execute() here: executeSql(...) already submits the job, and no
        // DataStream operators were added to env, so env.execute() would fail with
        // "No operators defined in streaming topology. Cannot execute."
    }
}

最后的运行结果如下:

相关推荐
Loving_enjoy20 分钟前
基于Hadoop的明星社交媒体影响力数据挖掘平台:设计与实现
大数据·hadoop·数据挖掘
浮尘笔记26 分钟前
go-zero使用elasticsearch踩坑记:时间存储和展示问题
大数据·elasticsearch·golang·go
碳基学AI2 小时前
哈尔滨工业大学DeepSeek公开课:探索大模型原理、技术与应用从GPT到DeepSeek|附视频与讲义免费下载方法
大数据·人工智能·python·gpt·算法·语言模型·集成学习
一个天蝎座 白勺 程序猿3 小时前
大数据(4.6)Hive执行引擎选型终极指南:MapReduce/Tez/Spark性能实测×万亿级数据资源配置公式
大数据·hive·mapreduce
HelpHelp同学3 小时前
信息混乱难查找?三步搭建高效帮助中心解决难题
大数据·人工智能·知识库管理系统
TDengine (老段)9 小时前
TDengine 中的关联查询
大数据·javascript·网络·物联网·时序数据库·tdengine·iotdb
直裾14 小时前
Mapreduce的使用
大数据·数据库·mapreduce
麻芝汤圆16 小时前
使用 MapReduce 进行高效数据清洗:从理论到实践
大数据·linux·服务器·网络·数据库·windows·mapreduce
树莓集团16 小时前
树莓集团海南落子:自贸港布局的底层逻辑
大数据
不剪发的Tony老师16 小时前
Hue:一个大数据查询工具
大数据