Flink-SQL计算中的维表连接使用举例

Flink-SQL中,事实表(又称为流表)和维表连接的计算规则与两条流连接是不同的,两条流连接必须有时间属性和窗口的约束,否则状态会无限膨胀。维表一般是静态或缓慢变化的,Flink不会把维表全部加载进状态,而是连接时查询当前ID的最新值,也可设置缓存进行延迟优化等。连接维表的使用举例:

SQL 示例代码:
-- Kafka source table (fact stream): hourly campaign report.
-- The record carries a nested JSON payload in `data`; the computed columns
-- re-extract fields from it with JSON_VALUE (which returns STRING, or NULL
-- when the JSON path is absent). The remaining physical columns are parsed
-- directly from the top-level JSON record by the 'json' format.
CREATE TEMPORARY TABLE kafka_campaign_hour_report (
  `data` STRING,
  `hour_id` AS JSON_VALUE(`data`,'$.hour_id'),
  `biz_code` AS JSON_VALUE(`data`,'$.bizCode'),
  `campaign_id` AS JSON_VALUE(`data`,'$.campaign_id'),
  `ad_pv` AS JSON_VALUE(`data`,'$.ad_pv'),
  `click` AS JSON_VALUE(`data`,'$.click'),
  `charge` AS JSON_VALUE(`data`,'$.charge'),
  `date` VARCHAR(20),
  `hour` VARCHAR(20),
  `brandId` VARCHAR(64),
  `accountId` VARCHAR(64),
  `isBatchEnd` INT,
  -- Read-only (VIRTUAL) Kafka metadata: record offset within the partition.
  `offset` INT NOT NULL METADATA VIRTUAL,
  -- NOTE(review): open-source Flink declares the Kafka 'partition' metadata as
  -- INT NOT NULL; BIGINT here may rely on engine-specific casting — confirm
  -- against the target Flink distribution.
  `my_part` BIGINT NOT NULL METADATA FROM 'partition',
  -- Kafka record timestamp; used downstream as the dedup ordering column.
  `my_time` TIMESTAMP(3) METADATA FROM 'timestamp',
  `my_date` AS CAST(`my_time` AS DATE)
) WITH (
  'connector' = 'kafka',
  'properties.bootstrap.servers' = 'kafka_server1:9092,kafka_server2:9092,kafka_server3:9092',
  'properties.group.id' = 'flink_group',
  'topic' = 'campaign_hour_report',
  -- Start from the newest offsets on (re)start: historical records are skipped.
  'scan.startup.mode' = 'latest-offset',
  'format' = 'json'
);

-- MySQL dimension table: campaign info, used as the lookup side of the
-- temporal join below.
-- NOTE(review): the 'mysql' connector with both a 'server-id' range (a CDC
-- option) and 'lookup.*' options looks like Alibaba VVP's unified MySQL
-- connector, not the open-source 'jdbc' connector — confirm the target engine.
CREATE TEMPORARY TABLE mysql_campaign_info (
  `id` BIGINT,
  `brand_id` VARCHAR(32) COMMENT '品牌id',
  `campaign_id` BIGINT COMMENT '计划ID',
  `campaign_name` VARCHAR(128) COMMENT '计划名称',
  `campaign_type` BIGINT COMMENT '类型',
  `day_budget` VARCHAR(128) COMMENT '日预算',
  `unii_create_time` TIMESTAMP(3) COMMENT '创建时间',
  `unii_update_time` TIMESTAMP(3) COMMENT '更新时间',
  -- NOT ENFORCED: Flink does not validate uniqueness; it trusts the source.
  PRIMARY KEY(`id`) NOT ENFORCED
) WITH (
  'connector' = 'mysql', 
  'hostname' = 'mysql_host',
  'port' = '3306',
  'username' = 'mysql_user',
  'password' = 'password',
  'database-name' = 'db_name',
  'table-name' = 'campaign_info',
  'server-id' = '101000-101004',
  -- Lookup cache for the dimension join: LRU keeps the most recently used
  -- keys, capped at 200k rows; entries are refreshed periodically by the
  -- connector's cache TTL setting (not overridden here — default applies).
  'lookup.cache.strategy' = 'LRU',
  'lookup.cache.max-rows'='200000'
);

-- Result (sink) table: hourly plan report, upserted into MySQL.
-- BUG FIX: the original PRIMARY KEY listed columns that do not exist in this
-- table (`log_date`, `platform_id`, `cycle_type`, `cycle_value`, `level_*`),
-- so the DDL fails validation. The upsert key must match the dedup key used
-- downstream — ROW_NUMBER() OVER (PARTITION BY stat_date, stat_hour, plan_id)
-- — so that each deduplicated row updates its own slot in MySQL.
CREATE TEMPORARY TABLE mysql_plan_report_hour (
  `stat_date` DATE COMMENT '日期',
  `stat_hour` VARCHAR(20) COMMENT '小时',
  `brand_id` VARCHAR(32) COMMENT '品牌ID',
  `account_id` VARCHAR(64) COMMENT '账户ID',
  `biz_code` VARCHAR(64) COMMENT '业务类型',
  `plan_id` VARCHAR(64) COMMENT '计划ID',
  `plan_name` VARCHAR(255) COMMENT '计划名称',
  `budget` DECIMAL(20,2) COMMENT '预算',
  `cost` DECIMAL(20,2) COMMENT '花费',
  `show_num` BIGINT COMMENT '曝光量',
  `click_num` BIGINT COMMENT '点击量',
  -- NOTE(review): the pipeline writes a TIMESTAMP(3) into this VARCHAR(20)
  -- column; verify the engine performs that cast implicitly on INSERT.
  `my_time` VARCHAR(20) COMMENT '时间',
  -- One row per plan per reporting hour; NOT ENFORCED (Flink trusts the key).
  PRIMARY KEY (`stat_date`,`stat_hour`,`plan_id`) NOT ENFORCED
) WITH (
  'connector' = 'mysql',
  'hostname' = 'host_name',
  'port' = '3306',
  'username' = 'mysql_user',
  'password' = 'password',
  'database-name' = 'db_name',
  'table-name' = 'ads_plan_report_hour'
);


-- Parse/normalize the raw Kafka stream into typed reporting columns.
CREATE TEMPORARY VIEW view_plan_report_orig AS
SELECT
  -- Report date derived from the batch timestamp shifted back one hour
  -- (presumably an hour-ending convention, so the 00:00 batch belongs to the
  -- previous day — TODO confirm with the producer of `date`/`hour`).
  TO_DATE(TIMESTAMPADD(HOUR,-1,TO_TIMESTAMP(CONCAT(`date`,' ',`hour`,':00:00')))) AS stat_date,
  -- Zero-pad to two digits ('7' -> '07'); note this uses the JSON `hour_id`
  -- field, not the physical `hour` column used for stat_date above.
  LPAD(`hour_id`,2,'0') AS stat_hour,
  `brandId` AS brand_id,
  `accountId` AS account_id,
  `biz_code` AS biz_code,
  CAST(`campaign_id` AS BIGINT) AS plan_id,
  `isBatchEnd` AS batch_end,
  CAST(`charge` AS DECIMAL(20,5)) AS cost,
  CAST(`ad_pv` AS INT) AS show_num,
  CAST(`click` AS INT) AS click_num,
  -- 'yyyy-MM-dd' -> 'yyyyMMdd'.
  CONCAT(SUBSTR(`date`,1,4),SUBSTR(`date`,6,2),SUBSTR(`date`,9,2)) AS batch_date,
  `hour` AS batch_hour,
  my_time
FROM kafka_campaign_hour_report
-- Keep only today's records (lexicographic compare is safe because both sides
-- are 'yyyy-MM-dd') and, presumably, only non-final batches (isBatchEnd=0 —
-- TODO confirm the flag's semantics with the upstream producer).
WHERE `date`>=DATE_FORMAT(LOCALTIMESTAMP,'yyyy-MM-dd') AND `isBatchEnd`=0;

-- Enrich the fact stream with campaign attributes via a lookup (temporal)
-- join: each fact row queries the dim table's value as of processing time,
-- so later dim updates do not retrigger already-joined rows.
-- NOTE(review): `FOR SYSTEM_TIME AS OF PROCTIME()` is Alibaba-VVP syntax;
-- open-source Flink requires a processing-time attribute column of the left
-- table (`FOR SYSTEM_TIME AS OF t.proc_time`) — confirm the target engine.
CREATE TEMPORARY VIEW view_plan_report_info AS
SELECT
  t.stat_date,
  t.stat_hour,
  t.brand_id,
  t.account_id,
  t.biz_code,
  CAST(t.plan_id AS VARCHAR(64)) AS plan_id,
  a.campaign_name AS plan_name,
  CAST(a.day_budget AS DECIMAL(20,2)) AS budget,
  t.batch_end,
  t.cost,
  t.show_num,
  t.click_num,
  t.my_time
-- Inner join: fact rows with no matching dim row are DROPPED (use LEFT JOIN
-- to keep them). NOTE(review): the join key is the surrogate `a.id`, not
-- `a.campaign_id` — verify this is intentional.
FROM view_plan_report_orig t 
  JOIN mysql_campaign_info FOR SYSTEM_TIME AS OF PROCTIME() AS a ON t.plan_id=a.id;

-- Deduplicate: keep only the latest record (by Kafka record timestamp
-- `my_time`) per (stat_date, stat_hour, plan_id). Flink recognizes the
-- ROW_NUMBER() ... WHERE rn=1 pattern as its built-in Deduplicate operator,
-- which emits retractions/updates rather than keeping full window state.
CREATE TEMPORARY VIEW view_plan_report_filter AS
SELECT
  stat_date,
  stat_hour,
  brand_id,
  account_id,
  biz_code,
  plan_id,
  plan_name,
  budget,
  cost,
  show_num,
  click_num,
  my_time
FROM
  (
    SELECT *,ROW_NUMBER() OVER(PARTITION BY stat_date,stat_hour,plan_id ORDER BY my_time DESC) AS rn
    FROM view_plan_report_info
  ) t
WHERE rn=1;


-- Sink section: BEGIN STATEMENT SET groups all INSERTs into one job graph
-- (only one INSERT here, but the wrapper keeps the script extensible).
BEGIN STATEMENT SET;

-- Upsert the deduplicated hourly report into MySQL. Column order must match
-- the sink DDL positionally. NOTE(review): `my_time` is TIMESTAMP(3) here but
-- VARCHAR(20) in the sink — confirm the engine casts this implicitly.
INSERT INTO mysql_plan_report_hour
SELECT
  stat_date,
  stat_hour,
  brand_id,
  account_id,
  biz_code,
  plan_id,
  plan_name,
  budget,
  cost,
  show_num,
  click_num,
  my_time
FROM view_plan_report_filter;

END;
-- End of sink section.

可根据维表的数据量设置缓存机制,比如采用 LRU(Least Recently Used,最近最少使用)策略或者 ALL(全量加载到缓存)方式缓存维表数据,使用缓存后性能更好,Flink 会周期性地(由缓存 TTL 参数控制)更新缓存。连接时读取的是维表当前时刻的快照数据:join 之后维表数据发生更新,并不会触发已连接记录重新进行 join 计算,即更新后的维表数据不会影响已经关联完成且自身不再更新的事实表数据。

相关推荐
大大大大晴天8 小时前
Flink生产问题排障-Kryo serializer scala extensions are not available
大数据·flink
tryCbest4 天前
数据库SQL学习
数据库·sql
cowboy2584 天前
mysql5.7及以下版本查询所有后代值(包括本身)
数据库·sql
努力的lpp4 天前
SQL 报错注入
数据库·sql·web安全·网络安全·sql注入
麦聪聊数据4 天前
统一 Web SQL 平台如何收编企业内部的“野生数据看板”?
数据库·sql·低代码·微服务·架构
山峰哥4 天前
吃透 SQL 优化:告别慢查询,解锁数据库高性能
服务器·数据库·sql·oracle·性能优化·编辑器
yumgpkpm4 天前
AI视频生成:Wan 2.2(阿里通义万相)在华为昇腾下的部署?
人工智能·hadoop·elasticsearch·zookeeper·flink·kafka·cloudera
轩情吖4 天前
MySQL初识
android·数据库·sql·mysql·adb·存储引擎
james的分享4 天前
大数据领域核心 SQL 优化框架Apache Calcite介绍
大数据·sql·apache·calcite