Flink-SQL计算中的维表连接使用举例

Flink-SQL中,事实表(又称为流表)和维表连接的计算规则与两条流连接是不同的:两条流连接必须有时间属性和窗口的约束,否则状态会无限膨胀。维表一般是静态或缓慢变化的,Flink不会把维表全部加载进状态,而是在连接时按关联键查询维表中该ID的最新值;也可以为维表设置缓存,以降低查询延迟、减少对外部存储的访问压力。下面是连接维表的使用举例:

SQL 示例代码:
-- Kafka source table: hourly campaign plan report (the fact/stream table).
-- The raw payload arrives as a JSON string in `data`; the computed columns
-- below extract individual fields with JSON_VALUE, which always returns
-- STRING — hence the CASTs applied downstream.
CREATE TEMPORARY TABLE kafka_campaign_hour_report (
  `data` STRING,
  `hour_id` AS JSON_VALUE(`data`,'$.hour_id'),
  `biz_code` AS JSON_VALUE(`data`,'$.bizCode'),
  `campaign_id` AS JSON_VALUE(`data`,'$.campaign_id'),
  `ad_pv` AS JSON_VALUE(`data`,'$.ad_pv'),
  `click` AS JSON_VALUE(`data`,'$.click'),
  `charge` AS JSON_VALUE(`data`,'$.charge'),
  `date` VARCHAR(20),
  `hour` VARCHAR(20),
  `brandId` VARCHAR(64),
  `accountId` VARCHAR(64),
  `isBatchEnd` INT,
  -- Kafka metadata columns: record offset, source partition and record
  -- timestamp. `my_time` is later used as the dedup ordering column.
  `offset` INT NOT NULL METADATA VIRTUAL,
  `my_part` BIGINT NOT NULL METADATA FROM 'partition',
  `my_time` TIMESTAMP(3) METADATA FROM 'timestamp',
  `my_date` AS CAST(`my_time` AS DATE)
) WITH (
  'connector' = 'kafka',
  'properties.bootstrap.servers' = 'kafka_server1:9092,kafka_server2:9092,kafka_server3:9092',
  'properties.group.id' = 'flink_group',
  'topic' = 'campaign_hour_report',
  -- Start from the newest offsets: no backfill of historical records.
  'scan.startup.mode' = 'latest-offset',
  'format' = 'json'
);

-- MySQL dimension table: campaign dimension. Used below as a lookup
-- (temporal) table — rows are fetched per key at join time instead of
-- being fully loaded into Flink state.
CREATE TEMPORARY TABLE mysql_campaign_info (
  `id` BIGINT,
  `brand_id` VARCHAR(32) COMMENT '品牌id',
  `campaign_id` BIGINT COMMENT '计划ID',
  `campaign_name` VARCHAR(128) COMMENT '计划名称',
  `campaign_type` BIGINT COMMENT '类型',
  `day_budget` VARCHAR(128) COMMENT '日预算',
  `unii_create_time` TIMESTAMP(3) COMMENT '创建时间',
  `unii_update_time` TIMESTAMP(3) COMMENT '更新时间',
  PRIMARY KEY(`id`) NOT ENFORCED
) WITH (
  'connector' = 'mysql', 
  'hostname' = 'mysql_host',
  'port' = '3306',
  'username' = 'mysql_user',
  'password' = 'password',
  'database-name' = 'db_name',
  'table-name' = 'campaign_info',
  -- NOTE(review): 'server-id' is a CDC-style option; confirm the deployed
  -- 'mysql' connector variant actually accepts this together with the
  -- lookup options below.
  'server-id' = '101000-101004',
  -- Cache up to 200k rows with LRU eviction to cut per-record round trips
  -- to MySQL; refresh behavior is governed by the connector's cache TTL.
  'lookup.cache.strategy' = 'LRU',
  'lookup.cache.max-rows'='200000'
);

-- MySQL sink (result) table: hourly plan report.
CREATE TEMPORARY TABLE mysql_plan_report_hour (
  `stat_date` DATE COMMENT '日期',
  `stat_hour` VARCHAR(20) COMMENT '小时',
  `brand_id` VARCHAR(32) COMMENT '品牌ID',
  `account_id` VARCHAR(64) COMMENT '账户ID',
  `biz_code` VARCHAR(64) COMMENT '业务类型',
  `plan_id` VARCHAR(64) COMMENT '计划ID',
  `plan_name` VARCHAR(255) COMMENT '计划名称',
  `budget` DECIMAL(20,2) COMMENT '预算',
  `cost` DECIMAL(20,2) COMMENT '花费',
  `show_num` BIGINT COMMENT '曝光量',
  `click_num` BIGINT COMMENT '点击量',
  `my_time` VARCHAR(20) COMMENT '时间',
  -- Upsert key. FIX: the original PRIMARY KEY referenced columns that do
  -- not exist in this table (`log_date`, `platform_id`, `cycle_type`,
  -- `level_1st_id`, ... — apparently pasted from another table's DDL),
  -- which fails DDL validation. The key now matches the dedup granularity
  -- used upstream (PARTITION BY stat_date, stat_hour, plan_id), so each
  -- (date, hour, plan) row is upserted in place.
  PRIMARY KEY (`stat_date`,`stat_hour`,`plan_id`) NOT ENFORCED
) WITH (
  'connector' = 'mysql',
  'hostname' = 'host_name',
  'port' = '3306',
  'username' = 'mysql_user',
  'password' = 'password',
  'database-name' = 'db_name',
  'table-name' = 'ads_plan_report_hour'
);


-- Parse the raw Kafka stream: cast the JSON-extracted string fields to
-- typed columns, and keep only today's in-progress batch records.
CREATE TEMPORARY VIEW view_plan_report_orig AS
SELECT
  -- Attribute the record to the previous hour (e.g. a '10:00' batch reports
  -- on the 09:00-10:00 window). NOTE(review): confirm this off-by-one-hour
  -- convention against the producer of the `date`/`hour` fields.
  TO_DATE(TIMESTAMPADD(HOUR,-1,TO_TIMESTAMP(CONCAT(`date`,' ',`hour`,':00:00')))) AS stat_date,
  LPAD(`hour_id`,2,'0') AS stat_hour,  -- zero-pad to 2 chars, e.g. '9' -> '09'
  `brandId` AS brand_id,
  `accountId` AS account_id,
  `biz_code` AS biz_code,
  CAST(`campaign_id` AS BIGINT) AS plan_id,
  `isBatchEnd` AS batch_end,
  CAST(`charge` AS DECIMAL(20,5)) AS cost,
  CAST(`ad_pv` AS INT) AS show_num,
  CAST(`click` AS INT) AS click_num,
  -- Reformat `date` from 'yyyy-MM-dd' to 'yyyyMMdd'.
  CONCAT(SUBSTR(`date`,1,4),SUBSTR(`date`,6,2),SUBSTR(`date`,9,2)) AS batch_date,
  `hour` AS batch_hour,
  my_time
FROM kafka_campaign_hour_report
-- Only records for the current day, and only non-final batches
-- (isBatchEnd = 0); final-batch markers are dropped.
WHERE `date`>=DATE_FORMAT(LOCALTIMESTAMP,'yyyy-MM-dd') AND `isBatchEnd`=0;

-- Enrich the parsed stream with campaign attributes via a processing-time
-- lookup join against the MySQL dimension table: each incoming record
-- fetches the dimension row current as of its own processing time.
CREATE TEMPORARY VIEW view_plan_report_info AS
SELECT
  rpt.stat_date,
  rpt.stat_hour,
  rpt.brand_id,
  rpt.account_id,
  rpt.biz_code,
  CAST(rpt.plan_id AS VARCHAR(64)) AS plan_id,
  dim.campaign_name AS plan_name,
  CAST(dim.day_budget AS DECIMAL(20,2)) AS budget,
  rpt.batch_end,
  rpt.cost,
  rpt.show_num,
  rpt.click_num,
  rpt.my_time
FROM view_plan_report_orig AS rpt
  JOIN mysql_campaign_info FOR SYSTEM_TIME AS OF PROCTIME() AS dim
    ON rpt.plan_id = dim.id;

-- Deduplicate: keep only the most recent record per (stat_date, stat_hour,
-- plan_id), ordered by the Kafka record timestamp `my_time`.
CREATE TEMPORARY VIEW view_plan_report_filter AS
SELECT
  stat_date,
  stat_hour,
  brand_id,
  account_id,
  biz_code,
  plan_id,
  plan_name,
  budget,
  cost,
  show_num,
  click_num,
  my_time
FROM (
  SELECT
    *,
    ROW_NUMBER() OVER (
      PARTITION BY stat_date, stat_hour, plan_id
      ORDER BY my_time DESC
    ) AS row_num
  FROM view_plan_report_info
) ranked
WHERE row_num = 1;


-- Sink start: group all INSERTs into a single job via a statement set.
BEGIN STATEMENT SET;

-- Write the deduplicated result into MySQL. FIX: the explicit column list
-- makes the column mapping by name rather than by position, so the INSERT
-- does not silently misalign if either schema changes order.
INSERT INTO mysql_plan_report_hour
  (`stat_date`,`stat_hour`,`brand_id`,`account_id`,`biz_code`,`plan_id`,
   `plan_name`,`budget`,`cost`,`show_num`,`click_num`,`my_time`)
SELECT
  stat_date,
  stat_hour,
  brand_id,
  account_id,
  biz_code,
  plan_id,
  plan_name,
  budget,
  cost,
  show_num,
  click_num,
  my_time
FROM view_plan_report_filter;

END;
-- Sink end

可根据维表的数据量设置缓存机制,比如采用LRU(Least Recently Used,最近最少使用)策略或者ALL(全部加载到缓存)方式缓存维表数据。使用缓存后性能更好,Flink会周期性地(由 lookup.cache.ttl 参数控制)更新缓存。连接时读取的是维表当前时刻的快照数据;join 之后维表数据再更新,并不会触发已连接记录重新计算,即更新后的维表数据不会影响已经关联过、且自身不再更新的事实表数据。

相关推荐
Hello.Reader5 小时前
用 Flink Table API 打造实时交易看板从 Kafka 到 MySQL 再到 Grafana
mysql·flink·kafka
学习2年半5 小时前
sql题目
数据库·sql
还是奇怪5 小时前
SQL 注入攻防:绕过注释符过滤的N种方法
数据库·sql·web安全
编程充电站pro6 小时前
聚合函数陷阱:AVG 和 GROUP BY 搭配使用注意点
数据库·sql
cookqq7 小时前
MongoDB源码delete分析观察者getOpObserver()->onDelete
数据库·sql·mongodb·nosql
TDengine (老段)7 小时前
TDengine 聚合函数 HISTOGRAM 用户手册
大数据·数据库·sql·物联网·时序数据库·iot·tdengine
DokiDoki之父8 小时前
JDBC入门
java·sql·mysql
麦聪聊数据8 小时前
企业级数据库管理实战(七):SQL 到 API,让数据库成为团队的数据服务
数据库·sql·低代码·数据服务
小志开发21 小时前
SQL从入门到起飞:完整数据库操作练习
数据库·sql·学习·oracle·sqlserver·navicat