Flink-SQL计算中的维表连接使用举例

Flink-SQL中,事实表(又称为流表)和维表连接的计算规则与两条流连接不同:两条流连接必须有时间属性和窗口的约束,否则状态会无限膨胀;而维表一般是静态或缓慢变化的,Flink不会把维表全部加载进状态,而是在连接时按关联键查询维表的最新值,也可以设置缓存来优化查询延迟。下面是连接维表的使用示例:

SQL 示例:
-- Kafka source table: campaign hourly report stream.
-- Business fields arrive as a JSON string in `data` and are exposed as
-- computed columns via JSON_VALUE (every extracted value is a STRING,
-- cast downstream where a numeric type is needed).
CREATE TEMPORARY TABLE kafka_campaign_hour_report (
  `data` STRING,
  `hour_id` AS JSON_VALUE(`data`,'$.hour_id'),
  `biz_code` AS JSON_VALUE(`data`,'$.bizCode'),
  `campaign_id` AS JSON_VALUE(`data`,'$.campaign_id'),
  `ad_pv` AS JSON_VALUE(`data`,'$.ad_pv'),
  `click` AS JSON_VALUE(`data`,'$.click'),
  `charge` AS JSON_VALUE(`data`,'$.charge'),
  `date` VARCHAR(20),
  `hour` VARCHAR(20),
  `brandId` VARCHAR(64),
  `accountId` VARCHAR(64),
  `isBatchEnd` INT,
  -- FIX: Kafka offsets are 64-bit. The Flink Kafka connector declares the
  -- 'offset' metadata column as BIGINT, so the original INT declaration
  -- would fail DDL validation (and could overflow on long-lived topics).
  `offset` BIGINT NOT NULL METADATA VIRTUAL,
  `my_part` BIGINT NOT NULL METADATA FROM 'partition',
  -- NOTE(review): Flink documents the 'timestamp' metadata key as
  -- TIMESTAMP_LTZ(3); plain TIMESTAMP(3) is accepted by some
  -- distributions -- confirm against your runtime version.
  `my_time` TIMESTAMP(3) METADATA FROM 'timestamp',
  `my_date` AS CAST(`my_time` AS DATE)
) WITH (
  'connector' = 'kafka',
  'properties.bootstrap.servers' = 'kafka_server1:9092,kafka_server2:9092,kafka_server3:9092',
  'properties.group.id' = 'flink_group',
  'topic' = 'campaign_hour_report',
  -- Start from the newest records on a fresh submission (no backfill).
  'scan.startup.mode' = 'latest-offset',
  'format' = 'json'
);

-- MySQL dimension table: campaign metadata, used as the lookup (build)
-- side of the temporal join below. `id` is the lookup key.
CREATE TEMPORARY TABLE mysql_campaign_info (
  `id` BIGINT,
  `brand_id` VARCHAR(32) COMMENT '品牌id',
  `campaign_id` BIGINT COMMENT '计划ID',
  `campaign_name` VARCHAR(128) COMMENT '计划名称',
  `campaign_type` BIGINT COMMENT '类型',
  `day_budget` VARCHAR(128) COMMENT '日预算',
  `unii_create_time` TIMESTAMP(3) COMMENT '创建时间',
  `unii_update_time` TIMESTAMP(3) COMMENT '更新时间',
  PRIMARY KEY(`id`) NOT ENFORCED
) WITH (
  -- NOTE(review): 'server-id' is a MySQL CDC source option while the
  -- lookup.cache.* options belong to lookup sources; this mix suggests a
  -- vendor connector (e.g. a managed-Flink 'mysql' connector supporting
  -- both modes) -- confirm it matches your deployment.
  'connector' = 'mysql', 
  'hostname' = 'mysql_host',
  'port' = '3306',
  'username' = 'mysql_user',
  'password' = 'password',
  'database-name' = 'db_name',
  'table-name' = 'campaign_info',
  'server-id' = '101000-101004',
  -- LRU cache of up to 200k rows cuts point lookups against MySQL;
  -- cached entries may lag the database until refreshed.
  'lookup.cache.strategy' = 'LRU',
  'lookup.cache.max-rows'='200000'
);

-- Result (sink) table: hourly plan report written to MySQL.
-- FIX: the original PRIMARY KEY listed columns (`log_date`, `platform_id`,
-- `cycle_type`, `cycle_value`, `level_*`) that do not exist in this schema,
-- so the DDL could not validate. The key now matches the dedup key used
-- upstream (stat_date, stat_hour, plan_id), so upserts overwrite the
-- intended row.
CREATE TEMPORARY TABLE mysql_plan_report_hour (
  `stat_date` DATE COMMENT '日期',
  `stat_hour` VARCHAR(20) COMMENT '小时',
  `brand_id` VARCHAR(32) COMMENT '品牌ID',
  `account_id` VARCHAR(64) COMMENT '账户ID',
  `biz_code` VARCHAR(64) COMMENT '业务类型',
  `plan_id` VARCHAR(64) COMMENT '计划ID',
  `plan_name` VARCHAR(255) COMMENT '计划名称',
  `budget` DECIMAL(20,2) COMMENT '预算',
  `cost` DECIMAL(20,2) COMMENT '花费',
  `show_num` BIGINT COMMENT '曝光量',
  `click_num` BIGINT COMMENT '点击量',
  -- NOTE(review): upstream my_time is TIMESTAMP(3); Flink does not
  -- implicitly cast TIMESTAMP to VARCHAR on INSERT -- an explicit CAST
  -- may be required in the sink query. Confirm on your runtime.
  `my_time` VARCHAR(20) COMMENT '时间',
  PRIMARY KEY (`stat_date`,`stat_hour`,`plan_id`) NOT ENFORCED
) WITH (
  'connector' = 'mysql',
  'hostname' = 'host_name',
  'port' = '3306',
  'username' = 'mysql_user',
  'password' = 'password',
  'database-name' = 'db_name',
  'table-name' = 'ads_plan_report_hour'
);


-- Parse the raw Kafka stream: cast the JSON-extracted string fields to
-- typed columns and keep only today's in-progress batches.
CREATE TEMPORARY VIEW view_plan_report_orig AS
SELECT
  -- Report date: shift back one hour so a record stamped 00:00 is
  -- attributed to the previous day's last hour.
  TO_DATE(TIMESTAMPADD(HOUR,-1,TO_TIMESTAMP(CONCAT(`date`,' ',`hour`,':00:00')))) AS stat_date,
  -- Zero-pad to two digits ('7' -> '07') for a stable sort/partition key.
  LPAD(`hour_id`,2,'0') AS stat_hour,
  `brandId` AS brand_id,
  `accountId` AS account_id,
  `biz_code` AS biz_code,
  CAST(`campaign_id` AS BIGINT) AS plan_id,
  `isBatchEnd` AS batch_end,
  CAST(`charge` AS DECIMAL(20,5)) AS cost,
  CAST(`ad_pv` AS INT) AS show_num,
  CAST(`click` AS INT) AS click_num,
  -- Reformat 'yyyy-MM-dd' into compact 'yyyyMMdd'.
  CONCAT(SUBSTR(`date`,1,4),SUBSTR(`date`,6,2),SUBSTR(`date`,9,2)) AS batch_date,
  `hour` AS batch_hour,
  my_time
FROM kafka_campaign_hour_report
-- Keep only today's records (lexicographic compare is safe for
-- 'yyyy-MM-dd' strings) whose batch is still open (isBatchEnd = 0).
WHERE `date`>=DATE_FORMAT(LOCALTIMESTAMP,'yyyy-MM-dd') AND `isBatchEnd`=0;

-- Enrich the stream with campaign attributes via a lookup (temporal)
-- join: each stream row queries the dimension table's value as of
-- processing time, so the dim table is never fully held in Flink state.
CREATE TEMPORARY VIEW view_plan_report_info AS
SELECT
  t.stat_date,
  t.stat_hour,
  t.brand_id,
  t.account_id,
  t.biz_code,
  CAST(t.plan_id AS VARCHAR(64)) AS plan_id,
  a.campaign_name AS plan_name,
  CAST(a.day_budget AS DECIMAL(20,2)) AS budget,
  t.batch_end,
  t.cost,
  t.show_num,
  t.click_num,
  t.my_time
FROM view_plan_report_orig t 
  -- NOTE(review): inline FOR SYSTEM_TIME AS OF PROCTIME() is a vendor
  -- extension; open-source Flink expects a processing-time attribute
  -- column on the probe side (e.g. t.proc_time) -- confirm your runtime.
  -- Inner join: rows whose plan_id has no dim match are dropped.
  JOIN mysql_campaign_info FOR SYSTEM_TIME AS OF PROCTIME() AS a ON t.plan_id=a.id;

-- Deduplicate: keep only the latest record (by Kafka record timestamp
-- my_time) per (stat_date, stat_hour, plan_id).
CREATE TEMPORARY VIEW view_plan_report_filter AS
SELECT
  stat_date,
  stat_hour,
  brand_id,
  account_id,
  biz_code,
  plan_id,
  plan_name,
  budget,
  cost,
  show_num,
  click_num,
  my_time
FROM
  (
    -- Standard ROW_NUMBER dedup pattern: rn = 1 marks the newest row
    -- per key; Flink recognizes this as a deduplication query.
    SELECT *,ROW_NUMBER() OVER(PARTITION BY stat_date,stat_hour,plan_id ORDER BY my_time DESC) AS rn
    FROM view_plan_report_info
  ) t
WHERE rn=1;


-- Sink begin: group all INSERTs into a single statement set so they are
-- optimized and submitted as one Flink job.
BEGIN STATEMENT SET;

-- Upsert the deduplicated hourly rows into the MySQL result table.
INSERT INTO mysql_plan_report_hour
SELECT
  stat_date,
  stat_hour,
  brand_id,
  account_id,
  biz_code,
  plan_id,
  plan_name,
  budget,
  cost,
  show_num,
  click_num,
  my_time
FROM view_plan_report_filter;

END;
-- Sink end

可根据维表的数据量设置缓存机制,比如采用LRU(Least Recently Used,最近最少使用)策略或者ALL(全部加载到缓存)方式缓存维表数据,使用缓存后性能更好,Flink会周期性地(由 lookup.cache.ttl 控制)更新缓存。连接时读取维表当前时刻的快照数据;如果join后维表数据更新,不会触发已连接的ID重新join计算,即更新后的维表数据不会对已经关联且不再更新的事实表数据产生影响。

相关推荐
莳花微语1 小时前
记录一次OGG进程abended,报错OGG-01431、OGG-01003、OGG-01151、OGG-01296问题的处理
数据库·sql·mysql
萧曵 丶2 小时前
MySQL三大日志系统浅谈
数据库·sql·mysql
麦聪聊数据4 小时前
MySQL 性能调优:从EXPLAIN到JSON索引优化
数据库·sql·mysql·安全·json
CappuccinoRose4 小时前
流计算概述
python·flink·流计算·数据流·pyflink
yumgpkpm4 小时前
AI评判:信创替代对Cloudera CDH CDP Hadoop大数据平台有何影响?
大数据·hive·oracle·flink·kafka·hbase·cloudera
小四的快乐生活5 小时前
大数据SQL诊断(采集、分析、优化方案)
大数据·数据库·sql
l1t6 小时前
DeepSeek辅助编写的利用唯一可选数求解数独SQL
数据库·sql·算法·postgresql
樱花味的小奶牛7 小时前
DECLARE CURSOR 才允许使用 FOR UPDATE 子句
数据库·sql
问今域中7 小时前
Spring Security登录认证
数据库·sql·oracle
Hello.Reader8 小时前
PyFlink 向量化 UDF(Vectorized UDF)Arrow 批传输原理、pandas 标量/聚合函数、配置与内存陷阱、五种写法一网打尽
python·flink·pandas