Flink SQL in Practice: Reading Kafka Data into MySQL

Use case requirements

Using Flink SQL with DDL statements, read user behavior data from Kafka, process it in real time, group it by time, compute PV (page views) and UV (unique visitors), and write the results to MySQL.

1. Message format in Kafka
  • The data is JSON-encoded, in the following format:
json
{"user_id":1101,"item_id":1875,"category_id":456876,"behavior":"pv","ts":"2017-12-13T11:27:50Z"}
{"user_id":1101,"item_id":1875,"category_id":456876,"behavior":"pv","ts":"2017-12-13T11:27:51Z"}
{"user_id":1102,"item_id":1876,"category_id":456876,"behavior":"pv","ts":"2017-12-13T11:27:54Z"}
{"user_id":1103,"item_id":1877,"category_id":456876,"behavior":"pv","ts":"2017-12-13T11:27:55Z"}
{"user_id":1104,"item_id":1878,"category_id":456876,"behavior":"pv","ts":"2017-12-13T11:28:01Z"}
2. Create a Maven project and add the required dependencies
xml
<dependency>
    <groupId>org.scala-lang</groupId>
    <artifactId>scala-library</artifactId>
    <version>2.11.8</version>
</dependency> 

<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-streaming-scala_2.11</artifactId>
    <version>1.9.2</version>
</dependency>

<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-connector-kafka_2.11</artifactId>
    <version>1.9.2</version>
</dependency>

<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-table-api-scala-bridge_2.11</artifactId>
    <version>1.9.2</version>
</dependency>

<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-table-planner-blink_2.11</artifactId>
    <version>1.9.2</version>
</dependency>

<!-- https://mvnrepository.com/artifact/org.apache.flink/flink-json -->
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-json</artifactId>
    <version>1.9.2</version>
</dependency>

<!-- https://mvnrepository.com/artifact/org.apache.flink/flink-jdbc -->
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-jdbc_2.11</artifactId>
    <version>1.9.2</version>
</dependency>

<dependency>
    <groupId>mysql</groupId>
    <artifactId>mysql-connector-java</artifactId>
    <version>5.1.38</version>
</dependency>
3. Code development
  • Driver main class FlinkSqlConnectKafka

    scala
    package com.kaikeba.connector.kafka
    
    import org.apache.flink.streaming.api.scala.{DataStream, StreamExecutionEnvironment}
    import org.apache.flink.table.api.{EnvironmentSettings, Table}
    import org.apache.flink.table.api.scala.StreamTableEnvironment
    import org.apache.flink.types.Row
    import org.apache.log4j.{Level, Logger}
    import org.apache.flink.api.scala._
    
    //todo: Use Flink SQL DDL to read data from Kafka, process it in real time, and write the results to a MySQL table
    
    object FlinkSqlConnectKafka {
    
      Logger.getLogger("org").setLevel(Level.ERROR)
    
      def main(args: Array[String]): Unit = {
    
        //todo: 1. Build the stream execution environment
        val streamEnv: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
        streamEnv.setParallelism(1)
    
        //todo: 2. Build the TableEnvironment with the Blink planner
        val settings: EnvironmentSettings = EnvironmentSettings.newInstance().useBlinkPlanner().build()
        val tableEnvironment: StreamTableEnvironment = StreamTableEnvironment.create(streamEnv, settings)
    
    
        //todo: 3. Register the Kafka source table via DDL
        tableEnvironment.sqlUpdate(SqlContext.kafkaSourceSql)
    
        //todo: 4. Run a query against the Kafka source table
        val result: Table = tableEnvironment.sqlQuery("select * from kafkaTable")
    
        //todo: 5. Convert the Table into an append-only DataStream for debugging output
        val dataStream: DataStream[Row] = tableEnvironment.toAppendStream[Row](result)
    
        dataStream.print("stream result:")
    
    
        //todo: 6. Register the MySQL sink table via DDL
        tableEnvironment.sqlUpdate(SqlContext.mysqlSinkSql)
    
    
        //todo: 7. Compute the metrics and write them into the MySQL table
        tableEnvironment.sqlUpdate(SqlContext.pvuvSql)
    
    
        //todo: 8. Launch the job
        tableEnvironment.execute("FlinkSqlConnectKafka")
      }
    }
  • SQL wrapper class SqlContext

    scala
    package com.kaikeba.connector.kafka
    
    //todo: SQL statements used by the job
    object SqlContext {
    
      //todo: DDL that connects to Kafka and defines the source table
      lazy val kafkaSourceSql =
        """
          |create table kafkaTable
          |(
          |    user_id BIGINT,
          |    item_id BIGINT,
          |    category_id BIGINT,
          |    behavior STRING,
          |    ts TIMESTAMP(3)
          |) with (
          |    'connector.type' = 'kafka',                                                 -- use the kafka connector
          |    'connector.version' = 'universal',                                          -- kafka version; universal supports 0.11 and above
          |    'connector.topic' = 'user_behavior',                                        -- kafka topic
          |    'connector.properties.0.key' = 'group.id',                                  -- kafka consumer group.id property name
          |    'connector.properties.0.value' = 'testGroup',                               -- kafka consumer group.id property value
          |    'connector.properties.1.key' = 'bootstrap.servers',                         -- kafka broker list property name
          |    'connector.properties.1.value' = 'node01:9092,node02:9092,node03:9092',     -- kafka broker list property value
          |    'connector.startup-mode' = 'latest-offset',                                 -- start reading from the latest offset
          |    'format.type' = 'json',                                                     -- source data is json
          |    'format.derive-schema' = 'true',                                            -- derive the json parsing rules from the DDL schema
          |    'update-mode' = 'append')                                                   -- only append mode is supported
        """.stripMargin
    
    
      //todo: DDL for the MySQL sink table
      lazy val mysqlSinkSql  =
        """
          |CREATE TABLE pvuv(
          |    dt VARCHAR,
          |    pv BIGINT,
          |    uv BIGINT
          |) WITH (
          |    'connector.type' = 'jdbc',                                 -- use the jdbc connector
          |    'connector.url' = 'jdbc:mysql://node03:3306/flink',        -- mysql jdbc url
          |    'connector.table' = 'pvuv',                                -- table name
          |    'connector.username' = 'root',                             -- username
          |    'connector.password' = '123456',                           -- password
          |    'connector.write.flush.max-rows' = '1'                     -- rows buffered before a flush; default is 5000, set to 1 for this demo
          |)
        """.stripMargin
    
    
    
      //todo: analytics SQL that computes pv and uv per hour
      lazy val pvuvSql  =
        """
          |INSERT INTO pvuv
          |SELECT
          |  DATE_FORMAT(ts, 'yyyy-MM-dd HH:00') dt,
          |  COUNT(*) AS pv,
          |  COUNT(DISTINCT user_id) AS uv
          |FROM kafkaTable
          |GROUP BY DATE_FORMAT(ts, 'yyyy-MM-dd HH:00')
        """.stripMargin
    
    }
4. How to run
  • 1. Create a topic named user_behavior

    shell
    kafka-topics.sh --create --topic user_behavior --partitions 3 --replication-factor 2 --zookeeper node01:2181,node02:2181,node03:2181
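
    A quick sanity check (a sketch, assuming the same ZooKeeper quorum as in the create command): describe the topic to confirm its partition and replica counts.

    shell
    kafka-topics.sh --describe --topic user_behavior --zookeeper node01:2181,node02:2181,node03:2181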
  • 2. Create the pvuv table in the flink database in MySQL

    sql
    CREATE DATABASE /*!32312 IF NOT EXISTS*/`flink` /*!40100 DEFAULT CHARACTER SET utf8 */;
    
    USE `flink`;
    
    /*Table structure for table `pvuv` */
    
    DROP TABLE IF EXISTS `pvuv`;
    
    CREATE TABLE `pvuv` (
      `dt` varchar(20) NOT NULL,
      `pv` bigint(20) DEFAULT NULL,
      `uv` bigint(20) DEFAULT NULL,
      PRIMARY KEY (`dt`)
    ) ENGINE=InnoDB DEFAULT CHARSET=utf8;
  • 3. Start the driver main class FlinkSqlConnectKafka

  • 4. Use the kafka-console-producer.sh command to simulate incoming data

    shell
    kafka-console-producer.sh --topic user_behavior --broker-list node01:9092,node02:9092,node03:9092
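
    Once the producer prompt appears, paste a few of the sample JSON records from section 1, one record per line, for example:

    json
    {"user_id":1101,"item_id":1875,"category_id":456876,"behavior":"pv","ts":"2017-12-13T11:27:50Z"}
    {"user_id":1102,"item_id":1876,"category_id":456876,"behavior":"pv","ts":"2017-12-13T11:27:54Z"}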
  • 5. Check the result data in the pvuv table in MySQL, e.g. with the query below
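
    A minimal way to check (a sketch; database and table names as created in step 2):

    sql
    USE flink;
    SELECT dt, pv, uv FROM pvuv ORDER BY dt;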
