目录
[1.1 首日装载代码详解](#1.1 首日装载代码详解)
[1.2 每日装载代码详解](#1.2 每日装载代码详解)
[3.1 首日装载(简单直接)](#3.1 首日装载(简单直接))
[3.2 每日装载(Full Outer Join实现)](#3.2 每日装载(Full Outer Join实现))
[3.3 每日装载(Union All实现)](#3.3 每日装载(Union All实现))
[4.1 字段命名规范](#4.1 字段命名规范)
[4.2 数据完整性保障](#4.2 数据完整性保障)
[4.3 性能优化技巧](#4.3 性能优化技巧)
一、最近1日汇总表的核心代码解析
1.1 首日装载代码详解
sql
-- First-day (initial) load of dws_trade_user_sku_order_1d.
-- Aggregates every partition of dwd_trade_order_detail_inc by (dt, user_id, sku_id)
-- and routes each day to its own partition through dynamic partitioning.

-- Every partition column of this insert is dynamic, which Hive's strict
-- dynamic-partition mode rejects; switch the session to nonstrict first.
SET hive.exec.dynamic.partition.mode = nonstrict;
-- Workaround: vectorized execution is turned off because it can hit a Hive bug
-- when processing certain data types.
SET hive.vectorized.execution.enabled = false;

INSERT OVERWRITE TABLE dws_trade_user_sku_order_1d PARTITION (dt)
SELECT
    od.user_id,
    -- Read the key from the fact side: sku.id is NULL whenever the LEFT JOIN
    -- finds no matching dimension row, which would lose the SKU identity.
    od.sku_id,
    sku.sku_name,
    -- Category hierarchy
    sku.category1_id,
    sku.category1_name,
    sku.category2_id,
    sku.category2_name,
    sku.category3_id,
    sku.category3_name,
    -- Brand
    sku.tm_id,
    sku.tm_name,
    -- Aggregated metrics
    od.order_count_1d,
    od.order_num_1d,
    od.order_original_amount_1d,
    od.activity_reduce_amount_1d,
    od.coupon_reduce_amount_1d,
    od.order_total_amount_1d,
    -- Dynamic partition column; it is the last column of the select list.
    od.dt
FROM (
    -- od: order-detail facts aggregated per day, user and SKU.
    -- No WHERE clause on purpose: the first-day load processes all history.
    SELECT
        dt,
        user_id,
        sku_id,
        COUNT(*)                             AS order_count_1d,            -- one per detail row
        SUM(sku_num)                         AS order_num_1d,              -- items ordered
        SUM(split_original_amount)           AS order_original_amount_1d,
        SUM(NVL(split_activity_amount, 0.0)) AS activity_reduce_amount_1d, -- NULL counts as 0
        SUM(NVL(split_coupon_amount, 0.0))   AS coupon_reduce_amount_1d,   -- NULL counts as 0
        SUM(split_total_amount)              AS order_total_amount_1d
    FROM dwd_trade_order_detail_inc
    GROUP BY dt, user_id, sku_id
) od
LEFT JOIN (
    -- sku: SKU dimension attributes taken from the first-day snapshot.
    SELECT
        id,
        sku_name,
        category1_id,
        category1_name,
        category2_id,
        category2_name,
        category3_id,
        category3_name,
        tm_id,
        tm_name
    FROM dim_sku_full
    WHERE dt = '2022-06-08'
) sku
    ON od.sku_id = sku.id;

-- Re-enable vectorized execution for the statements that follow.
SET hive.vectorized.execution.enabled = true;
关键点解析:
- `nvl(split_activity_amount,0.0)`:处理可能为null的优惠金额
- `group by dt,user_id,sku_id`:三个维度的组合确保统计粒度唯一性
- 动态分区:`partition(dt)`会根据SELECT中的dt字段自动分区
1.2 每日装载代码详解
sql
-- Daily load of dws_trade_user_sku_order_1d: rebuilds one fixed partition from
-- the order-detail partition of the same day.
INSERT OVERWRITE TABLE dws_trade_user_sku_order_1d PARTITION (dt = '2022-06-09')
SELECT
    od.user_id,
    -- Read the key from the fact side: sku.id is NULL whenever the LEFT JOIN
    -- finds no matching dimension row.
    od.sku_id,
    -- ... dimension columns (sku_name through tm_name) omitted for brevity
    od.order_count,            -- aliases drop the _1d suffix used in the first-day load
    od.order_num,
    od.order_original_amount,
    od.activity_reduce_amount,
    od.coupon_reduce_amount,
    od.order_total_amount
FROM (
    -- od: the day's order details aggregated per user and SKU.
    SELECT
        user_id,
        sku_id,
        COUNT(*)                             AS order_count,
        SUM(sku_num)                         AS order_num,
        SUM(split_original_amount)           AS order_original_amount,
        SUM(NVL(split_activity_amount, 0.0)) AS activity_reduce_amount,
        SUM(NVL(split_coupon_amount, 0.0))   AS coupon_reduce_amount,
        SUM(split_total_amount)              AS order_total_amount
    FROM dwd_trade_order_detail_inc
    WHERE dt = '2022-06-09'      -- key difference: only the current day's partition
    GROUP BY user_id, sku_id     -- key difference: dt is constant, so it is not grouped
) od
LEFT JOIN (
    -- sku: SKU dimension attributes from the same day's snapshot partition.
    SELECT
        id,
        sku_name,
        category1_id,
        category1_name,
        category2_id,
        category2_name,
        category3_id,
        category3_name,
        tm_id,
        tm_name
    FROM dim_sku_full
    WHERE dt = '2022-06-09'
) sku
    ON od.sku_id = sku.id;
每日与首日的主要区别:
- WHERE条件:`where dt='2022-06-09'` vs 无WHERE条件
- 分组字段:`group by user_id,sku_id` vs `group by dt,user_id,sku_id`
- 分区写入:固定分区`dt='2022-06-09'` vs 动态分区`partition(dt)`
二、最近N日汇总表的核心代码解析
sql
-- Builds the 7-day and 30-day metrics of one partition in a single pass over
-- the 1-day summary table.
INSERT OVERWRITE TABLE dws_trade_user_sku_order_nd PARTITION (dt = '2022-06-08')
SELECT
    user_id,
    sku_id,
    sku_name,
    -- ... remaining dimension columns omitted for brevity
    -- 7-day metrics: rows dated before (2022-06-08 minus 6 days) contribute 0.
    SUM(IF(dt >= DATE_ADD('2022-06-08', -6), order_count_1d, 0))            AS order_count_7d,
    SUM(IF(dt >= DATE_ADD('2022-06-08', -6), order_num_1d, 0))              AS order_num_7d,
    SUM(IF(dt >= DATE_ADD('2022-06-08', -6), order_original_amount_1d, 0))  AS order_original_amount_7d,
    SUM(IF(dt >= DATE_ADD('2022-06-08', -6), activity_reduce_amount_1d, 0)) AS activity_reduce_amount_7d,
    SUM(IF(dt >= DATE_ADD('2022-06-08', -6), coupon_reduce_amount_1d, 0))   AS coupon_reduce_amount_7d,
    SUM(IF(dt >= DATE_ADD('2022-06-08', -6), order_total_amount_1d, 0))     AS order_total_amount_7d,
    -- 30-day metrics: the WHERE clause already restricts rows to the window.
    SUM(order_count_1d)            AS order_count_30d,
    SUM(order_num_1d)              AS order_num_30d,
    SUM(order_original_amount_1d)  AS order_original_amount_30d,
    SUM(activity_reduce_amount_1d) AS activity_reduce_amount_30d,
    SUM(coupon_reduce_amount_1d)   AS coupon_reduce_amount_30d,
    SUM(order_total_amount_1d)     AS order_total_amount_30d
FROM dws_trade_user_sku_order_1d
-- 30-day window: the target day and the 29 days before it.
WHERE dt >= DATE_ADD('2022-06-08', -29)
  -- Upper bound: without it, partitions later than the target day (for example
  -- when this partition is rebuilt afterwards) would leak into both windows.
  AND dt <= '2022-06-08'
-- One output row per user/SKU plus the dimension attributes carried along.
GROUP BY
    user_id, sku_id, sku_name,
    category1_id, category1_name,
    category2_id, category2_name,
    category3_id, category3_name,
    tm_id, tm_name;
代码精妙之处:
- 条件聚合:`sum(if(dt>=date_add('2022-06-08',-6),order_count_1d,0))`
  - 当日期在最近7天内时,累加order_count_1d
  - 否则累加0
  - 最终得到最近7日总和
- 日期计算:
  - `date_add('2022-06-08',-6)`计算6天前的日期,即最近7日窗口(含当日)的起始日
  - `date_add('2022-06-08',-29)`计算29天前的日期,即最近30日窗口(含当日)的起始日
- 数据复用:直接从`dws_trade_user_sku_order_1d`取数
  - 避免重复计算DWD层数据
  - 提升计算效率
三、历史至今汇总表的核心代码解析
3.1 首日装载(简单直接)
sql
-- First-day load of the to-date table: a full aggregation of the per-user
-- 1-day summary table into the 2022-06-08 partition.
INSERT OVERWRITE TABLE dws_trade_user_order_td PARTITION (dt = '2022-06-08')
SELECT
    user_id,
    MIN(dt)                        AS order_date_first,        -- earliest day with an order
    MAX(dt)                        AS order_date_last,         -- latest day with an order
    SUM(order_count_1d)            AS order_count_td,
    SUM(order_num_1d)              AS order_num_td,
    SUM(order_original_amount_1d)  AS original_amount_td,
    SUM(activity_reduce_amount_1d) AS activity_reduce_amount_td,
    SUM(coupon_reduce_amount_1d)   AS coupon_reduce_amount_td,
    SUM(order_total_amount_1d)     AS total_amount_td
FROM dws_trade_user_order_1d
-- Upper bound: keeps the snapshot limited to the partition date, so a rerun
-- after later daily partitions exist does not count them here and then again
-- in the daily incremental load.
WHERE dt <= '2022-06-08'
GROUP BY user_id;  -- user is the only grouping key
3.2 每日装载(Full Outer Join实现)
sql
-- Approach 1 (recommended by the article): FULL OUTER JOIN of yesterday's
-- to-date snapshot with today's 1-day summary.
INSERT OVERWRITE TABLE dws_trade_user_order_td PARTITION (dt = '2022-06-09')
SELECT
    -- At least one side of the join carries the user id.
    NVL(old.user_id, new.user_id),
    -- First order date: keep the historical value when the user already
    -- existed; otherwise today is the first order.
    IF(old.user_id IS NOT NULL, old.order_date_first, '2022-06-09'),
    -- Last order date: today when the user ordered today; otherwise keep the
    -- historical value.
    IF(new.user_id IS NOT NULL, '2022-06-09', old.order_date_last),
    -- Running totals: yesterday's cumulative value plus today's increment,
    -- with the missing side counted as 0.
    NVL(old.order_count_td, 0)            + NVL(new.order_count_1d, 0),
    NVL(old.order_num_td, 0)              + NVL(new.order_num_1d, 0),
    NVL(old.original_amount_td, 0)        + NVL(new.order_original_amount_1d, 0),
    NVL(old.activity_reduce_amount_td, 0) + NVL(new.activity_reduce_amount_1d, 0),
    NVL(old.coupon_reduce_amount_td, 0)   + NVL(new.coupon_reduce_amount_1d, 0),
    NVL(old.total_amount_td, 0)           + NVL(new.order_total_amount_1d, 0)
FROM (
    -- old: cumulative figures stored in the previous day's partition.
    SELECT
        user_id,
        order_date_first,
        order_date_last,
        order_count_td,
        order_num_td,
        original_amount_td,
        activity_reduce_amount_td,
        coupon_reduce_amount_td,
        total_amount_td
    FROM dws_trade_user_order_td
    WHERE dt = DATE_ADD('2022-06-09', -1)
) old
-- A full outer join keeps users that appear on only one side.
FULL OUTER JOIN (
    -- new: today's per-user increments.
    SELECT
        user_id,
        order_count_1d,
        order_num_1d,
        order_original_amount_1d,
        activity_reduce_amount_1d,
        coupon_reduce_amount_1d,
        order_total_amount_1d
    FROM dws_trade_user_order_1d
    WHERE dt = '2022-06-09'
) new
    ON old.user_id = new.user_id;
关键逻辑解析:
- 用户覆盖:
  - 老用户今日有下单:`old`和`new`都有记录
  - 老用户今日无下单:只有`old`有记录
  - 新用户今日首单:只有`new`有记录
- nvl函数使用:`nvl(old.order_count_td, 0) + nvl(new.order_count_1d, 0)`
  - 处理NULL值,确保计算正确
  - 历史无记录时为0,今日无新增时为0
3.3 每日装载(Union All实现)
sql
-- Approach 2: stack yesterday's snapshot and today's increments with
-- UNION ALL, then collapse them to one row per user.
INSERT OVERWRITE TABLE dws_trade_user_order_td PARTITION (dt = '2022-06-09')
SELECT
    user_id,
    MIN(order_date_first)          AS order_date_first,  -- earliest first-order date
    MAX(order_date_last)           AS order_date_last,   -- latest last-order date
    SUM(order_count_td)            AS order_count_td,
    SUM(order_num_td)              AS order_num_td,
    SUM(original_amount_td)        AS original_amount_td,
    SUM(activity_reduce_amount_td) AS activity_reduce_amount_td,
    SUM(coupon_reduce_amount_td)   AS coupon_reduce_amount_td,
    SUM(total_amount_td)           AS total_amount_td
FROM (
    -- Cumulative figures from the previous day's partition.
    SELECT
        user_id,
        order_date_first,
        order_date_last,
        order_count_td,
        order_num_td,
        original_amount_td,
        activity_reduce_amount_td,
        coupon_reduce_amount_td,
        total_amount_td
    FROM dws_trade_user_order_td
    WHERE dt = DATE_ADD('2022-06-09', -1)

    UNION ALL

    -- Today's increments reshaped to the same column layout; today stands in
    -- for both the first and the last order date.
    SELECT
        user_id,
        '2022-06-09' AS order_date_first,
        '2022-06-09' AS order_date_last,
        order_count_1d,
        order_num_1d,
        order_original_amount_1d,
        activity_reduce_amount_1d,
        coupon_reduce_amount_1d,
        order_total_amount_1d
    FROM dws_trade_user_order_1d
    WHERE dt = '2022-06-09'
) t1
GROUP BY user_id;  -- re-aggregate the stacked rows
两种方法对比:
| 方面 | Full Outer Join | Union All |
|---|---|---|
| 逻辑清晰度 | 高,字段处理明确 | 中等,需要格式转换 |
| 执行性能 | 通常较好 | 需要额外聚合 |
| 可读性 | 条件判断直观 | 结构简单 |
| 适用场景 | 关联逻辑复杂时 | 逻辑简单时 |
四、代码设计中的隐藏智慧
4.1 字段命名规范
- `_1d`:最近1日指标
- `_7d`:最近7日指标
- `_30d`:最近30日指标
- `_td`:历史至今指标

统一的后缀让表字段含义一目了然
4.2 数据完整性保障
sql
-- NULL handling that recurs in the load statements of this article
-- (expression fragments, not standalone statements).
-- Replace a NULL discount amount with 0.0 before it is summed.
sum(nvl(split_activity_amount,0.0))
-- Count a missing side of the join as 0 before adding the two values.
nvl(old.order_count_td, 0) + nvl(new.order_count_1d, 0)
这些处理确保即使数据缺失,计算结果也不会出错。
4.3 性能优化技巧
- 分区裁剪:`where dt='2022-06-09'`让Hive只读取特定分区
- 中间结果复用:N日表复用1日表,历史至今表也复用1日表
- 矢量化优化:通过开关控制,平衡兼容性和性能
五、完整代码
sql
-- 1-day summary table: one row per user and SKU per day, partitioned by dt.
DROP TABLE IF EXISTS dws_trade_user_sku_order_1d;

CREATE EXTERNAL TABLE dws_trade_user_sku_order_1d
(
    -- Keys and SKU dimension attributes
    `user_id`                   STRING         COMMENT '用户ID',
    `sku_id`                    STRING         COMMENT 'SKU_ID',
    `sku_name`                  STRING         COMMENT 'SKU名称',
    `category1_id`              STRING         COMMENT '一级品类ID',
    `category1_name`            STRING         COMMENT '一级品类名称',
    `category2_id`              STRING         COMMENT '二级品类ID',
    `category2_name`            STRING         COMMENT '二级品类名称',
    `category3_id`              STRING         COMMENT '三级品类ID',
    `category3_name`            STRING         COMMENT '三级品类名称',
    `tm_id`                     STRING         COMMENT '品牌ID',
    `tm_name`                   STRING         COMMENT '品牌名称',
    -- Daily metrics
    `order_count_1d`            BIGINT         COMMENT '最近1日下单次数',
    `order_num_1d`              BIGINT         COMMENT '最近1日下单件数',
    `order_original_amount_1d`  DECIMAL(16, 2) COMMENT '最近1日下单原始金额',
    `activity_reduce_amount_1d` DECIMAL(16, 2) COMMENT '最近1日活动优惠金额',
    `coupon_reduce_amount_1d`   DECIMAL(16, 2) COMMENT '最近1日优惠券优惠金额',
    `order_total_amount_1d`     DECIMAL(16, 2) COMMENT '最近1日下单最终金额'
)
    COMMENT '交易域用户商品粒度订单最近1日汇总表'
    PARTITIONED BY (`dt` STRING)
    STORED AS ORC
    LOCATION '/warehouse/gmall/dws/dws_trade_user_sku_order_1d'
    TBLPROPERTIES ('orc.compress' = 'snappy');
-- First-day load of dws_trade_user_sku_order_1d: aggregates all history in
-- dwd_trade_order_detail_inc and writes one partition per order date.

-- The insert below uses only dynamic partition columns, so nonstrict mode is set.
SET hive.exec.dynamic.partition.mode = nonstrict;
-- Hive bug workaround: processing certain data types can raise errors with
-- vectorized execution, so it is disabled for this statement.
SET hive.vectorized.execution.enabled = false;

INSERT OVERWRITE TABLE dws_trade_user_sku_order_1d PARTITION (dt)
SELECT
    od.user_id,
    -- Read the key from the fact side: sku.id is NULL whenever the LEFT JOIN
    -- finds no matching dimension row, which would lose the SKU identity.
    od.sku_id,
    sku.sku_name,
    sku.category1_id,
    sku.category1_name,
    sku.category2_id,
    sku.category2_name,
    sku.category3_id,
    sku.category3_name,
    sku.tm_id,
    sku.tm_name,
    od.order_count_1d,
    od.order_num_1d,
    od.order_original_amount_1d,
    od.activity_reduce_amount_1d,
    od.coupon_reduce_amount_1d,
    od.order_total_amount_1d,
    -- Dynamic partition column; it is the last column of the select list.
    od.dt
FROM (
    -- od: order-detail facts aggregated per day, user and SKU (no date filter).
    SELECT
        dt,
        user_id,
        sku_id,
        COUNT(*)                             AS order_count_1d,
        SUM(sku_num)                         AS order_num_1d,
        SUM(split_original_amount)           AS order_original_amount_1d,
        SUM(NVL(split_activity_amount, 0.0)) AS activity_reduce_amount_1d,
        SUM(NVL(split_coupon_amount, 0.0))   AS coupon_reduce_amount_1d,
        SUM(split_total_amount)              AS order_total_amount_1d
    FROM dwd_trade_order_detail_inc
    GROUP BY dt, user_id, sku_id
) od
LEFT JOIN (
    -- sku: SKU dimension attributes from the 2022-06-08 snapshot.
    SELECT
        id,
        sku_name,
        category1_id,
        category1_name,
        category2_id,
        category2_name,
        category3_id,
        category3_name,
        tm_id,
        tm_name
    FROM dim_sku_full
    WHERE dt = '2022-06-08'
) sku
    ON od.sku_id = sku.id;

-- Vectorized execution improves performance and should be on whenever the
-- bug mentioned above is not triggered.
SET hive.vectorized.execution.enabled = true;
-- Daily load of dws_trade_user_sku_order_1d: rebuilds the 2022-06-09 partition
-- from that day's order details.

-- Same workaround as in the first-day load: vectorized execution off while
-- this statement runs.
SET hive.vectorized.execution.enabled = false;

INSERT OVERWRITE TABLE dws_trade_user_sku_order_1d PARTITION (dt = '2022-06-09')
SELECT
    od.user_id,
    -- Read the key from the fact side: sku.id is NULL whenever the LEFT JOIN
    -- finds no matching dimension row.
    od.sku_id,
    sku.sku_name,
    sku.category1_id,
    sku.category1_name,
    sku.category2_id,
    sku.category2_name,
    sku.category3_id,
    sku.category3_name,
    sku.tm_id,
    sku.tm_name,
    od.order_count,
    od.order_num,
    od.order_original_amount,
    od.activity_reduce_amount,
    od.coupon_reduce_amount,
    od.order_total_amount
FROM (
    -- od: the day's order details aggregated per user and SKU.
    SELECT
        user_id,
        sku_id,
        COUNT(*)                             AS order_count,
        SUM(sku_num)                         AS order_num,
        SUM(split_original_amount)           AS order_original_amount,
        SUM(NVL(split_activity_amount, 0.0)) AS activity_reduce_amount,
        SUM(NVL(split_coupon_amount, 0.0))   AS coupon_reduce_amount,
        SUM(split_total_amount)              AS order_total_amount
    FROM dwd_trade_order_detail_inc
    WHERE dt = '2022-06-09'
    GROUP BY user_id, sku_id
) od
LEFT JOIN (
    -- sku: SKU dimension attributes from the same day's snapshot.
    SELECT
        id,
        sku_name,
        category1_id,
        category1_name,
        category2_id,
        category2_name,
        category3_id,
        category3_name,
        tm_id,
        tm_name
    FROM dim_sku_full
    WHERE dt = '2022-06-09'
) sku
    ON od.sku_id = sku.id;

SET hive.vectorized.execution.enabled = true;
-- N-day summary table: 7-day and 30-day metrics per user and SKU,
-- partitioned by dt.
DROP TABLE IF EXISTS dws_trade_user_sku_order_nd;

CREATE EXTERNAL TABLE dws_trade_user_sku_order_nd
(
    -- Keys and SKU dimension attributes
    `user_id`                    STRING         COMMENT '用户ID',
    `sku_id`                     STRING         COMMENT 'SKU_ID',
    `sku_name`                   STRING         COMMENT 'SKU名称',
    `category1_id`               STRING         COMMENT '一级品类ID',
    `category1_name`             STRING         COMMENT '一级品类名称',
    `category2_id`               STRING         COMMENT '二级品类ID',
    `category2_name`             STRING         COMMENT '二级品类名称',
    `category3_id`               STRING         COMMENT '三级品类ID',
    `category3_name`             STRING         COMMENT '三级品类名称',
    `tm_id`                      STRING         COMMENT '品牌ID',
    `tm_name`                    STRING         COMMENT '品牌名称',
    -- 7-day metrics
    -- order_count_7d holds a SUM of BIGINT counts, so it is BIGINT like
    -- order_count_30d (it was declared STRING before).
    `order_count_7d`             BIGINT         COMMENT '最近7日下单次数',
    `order_num_7d`               BIGINT         COMMENT '最近7日下单件数',
    `order_original_amount_7d`   DECIMAL(16, 2) COMMENT '最近7日下单原始金额',
    `activity_reduce_amount_7d`  DECIMAL(16, 2) COMMENT '最近7日活动优惠金额',
    `coupon_reduce_amount_7d`    DECIMAL(16, 2) COMMENT '最近7日优惠券优惠金额',
    `order_total_amount_7d`      DECIMAL(16, 2) COMMENT '最近7日下单最终金额',
    -- 30-day metrics
    `order_count_30d`            BIGINT         COMMENT '最近30日下单次数',
    `order_num_30d`              BIGINT         COMMENT '最近30日下单件数',
    `order_original_amount_30d`  DECIMAL(16, 2) COMMENT '最近30日下单原始金额',
    `activity_reduce_amount_30d` DECIMAL(16, 2) COMMENT '最近30日活动优惠金额',
    `coupon_reduce_amount_30d`   DECIMAL(16, 2) COMMENT '最近30日优惠券优惠金额',
    `order_total_amount_30d`     DECIMAL(16, 2) COMMENT '最近30日下单最终金额'
)
    COMMENT '交易域用户商品粒度订单最近n日汇总表'
    PARTITIONED BY (`dt` STRING)
    STORED AS ORC
    LOCATION '/warehouse/gmall/dws/dws_trade_user_sku_order_nd'
    TBLPROPERTIES ('orc.compress' = 'snappy');
-- Load of the N-day summary for 2022-06-08: 7-day and 30-day metrics computed
-- in one pass over the 1-day summary table.
INSERT OVERWRITE TABLE dws_trade_user_sku_order_nd PARTITION (dt = '2022-06-08')
SELECT
    user_id,
    sku_id,
    sku_name,
    category1_id,
    category1_name,
    category2_id,
    category2_name,
    category3_id,
    category3_name,
    tm_id,
    tm_name,
    -- 7-day metrics: rows dated before (2022-06-08 minus 6 days) contribute 0.
    SUM(IF(dt >= DATE_ADD('2022-06-08', -6), order_count_1d, 0))            AS order_count_7d,
    SUM(IF(dt >= DATE_ADD('2022-06-08', -6), order_num_1d, 0))              AS order_num_7d,
    SUM(IF(dt >= DATE_ADD('2022-06-08', -6), order_original_amount_1d, 0))  AS order_original_amount_7d,
    SUM(IF(dt >= DATE_ADD('2022-06-08', -6), activity_reduce_amount_1d, 0)) AS activity_reduce_amount_7d,
    SUM(IF(dt >= DATE_ADD('2022-06-08', -6), coupon_reduce_amount_1d, 0))   AS coupon_reduce_amount_7d,
    SUM(IF(dt >= DATE_ADD('2022-06-08', -6), order_total_amount_1d, 0))     AS order_total_amount_7d,
    -- 30-day metrics: the WHERE clause already restricts rows to the window.
    SUM(order_count_1d)            AS order_count_30d,
    SUM(order_num_1d)              AS order_num_30d,
    SUM(order_original_amount_1d)  AS order_original_amount_30d,
    SUM(activity_reduce_amount_1d) AS activity_reduce_amount_30d,
    SUM(coupon_reduce_amount_1d)   AS coupon_reduce_amount_30d,
    SUM(order_total_amount_1d)     AS order_total_amount_30d
FROM dws_trade_user_sku_order_1d
-- 30-day window: the target day and the 29 days before it.
WHERE dt >= DATE_ADD('2022-06-08', -29)
  -- Upper bound: the 1-day table may already hold partitions after the target
  -- day (this script loads 2022-06-09 earlier); without the bound they would
  -- be added to both the 7-day and the 30-day sums.
  AND dt <= '2022-06-08'
GROUP BY
    user_id, sku_id, sku_name,
    category1_id, category1_name,
    category2_id, category2_name,
    category3_id, category3_name,
    tm_id, tm_name;
-- To-date (cumulative) summary table: one row per user, partitioned by dt.
DROP TABLE IF EXISTS dws_trade_user_order_td;

CREATE EXTERNAL TABLE dws_trade_user_order_td
(
    `user_id`                   STRING         COMMENT '用户ID',
    -- First and last order dates
    `order_date_first`          STRING         COMMENT '历史至今首次下单日期',
    `order_date_last`           STRING         COMMENT '历史至今末次下单日期',
    -- Cumulative metrics
    `order_count_td`            BIGINT         COMMENT '历史至今下单次数',
    `order_num_td`              BIGINT         COMMENT '历史至今购买商品件数',
    `original_amount_td`        DECIMAL(16, 2) COMMENT '历史至今下单原始金额',
    `activity_reduce_amount_td` DECIMAL(16, 2) COMMENT '历史至今下单活动优惠金额',
    `coupon_reduce_amount_td`   DECIMAL(16, 2) COMMENT '历史至今下单优惠券优惠金额',
    `total_amount_td`           DECIMAL(16, 2) COMMENT '历史至今下单最终金额'
)
    COMMENT '交易域用户粒度订单历史至今汇总表'
    PARTITIONED BY (`dt` STRING)
    STORED AS ORC
    LOCATION '/warehouse/gmall/dws/dws_trade_user_order_td'
    TBLPROPERTIES ('orc.compress' = 'snappy');
-- First-day load of the to-date table: a full aggregation of the per-user
-- 1-day summary table into the 2022-06-08 partition.
INSERT OVERWRITE TABLE dws_trade_user_order_td PARTITION (dt = '2022-06-08')
SELECT
    user_id,
    MIN(dt)                        AS order_date_first,   -- earliest day with an order
    MAX(dt)                        AS order_date_last,    -- latest day with an order
    SUM(order_count_1d)            AS order_count_td,
    SUM(order_num_1d)              AS order_num_td,
    SUM(order_original_amount_1d)  AS original_amount_td,
    SUM(activity_reduce_amount_1d) AS activity_reduce_amount_td,
    SUM(coupon_reduce_amount_1d)   AS coupon_reduce_amount_td,
    SUM(order_total_amount_1d)     AS total_amount_td
FROM dws_trade_user_order_1d
-- Upper bound: keeps the snapshot limited to the partition date, so a rerun
-- after later daily partitions exist does not count them here and then again
-- in the daily incremental load.
WHERE dt <= '2022-06-08'
GROUP BY user_id;
-- Daily load of the to-date table, FULL OUTER JOIN implementation:
-- yesterday's cumulative snapshot merged with today's per-user increments.
INSERT OVERWRITE TABLE dws_trade_user_order_td PARTITION (dt = '2022-06-09')
SELECT
    -- At least one side of the join carries the user id.
    NVL(old.user_id, new.user_id),
    -- Existing user: keep the stored first-order date; new user: today.
    IF(old.user_id IS NOT NULL, old.order_date_first, '2022-06-09'),
    -- Ordered today: today; otherwise keep the stored last-order date.
    IF(new.user_id IS NOT NULL, '2022-06-09', old.order_date_last),
    -- Cumulative value plus today's increment; a missing side counts as 0.
    NVL(old.order_count_td, 0)            + NVL(new.order_count_1d, 0),
    NVL(old.order_num_td, 0)              + NVL(new.order_num_1d, 0),
    NVL(old.original_amount_td, 0)        + NVL(new.order_original_amount_1d, 0),
    NVL(old.activity_reduce_amount_td, 0) + NVL(new.activity_reduce_amount_1d, 0),
    NVL(old.coupon_reduce_amount_td, 0)   + NVL(new.coupon_reduce_amount_1d, 0),
    NVL(old.total_amount_td, 0)           + NVL(new.order_total_amount_1d, 0)
FROM (
    -- old: the previous day's cumulative partition.
    SELECT
        user_id,
        order_date_first,
        order_date_last,
        order_count_td,
        order_num_td,
        original_amount_td,
        activity_reduce_amount_td,
        coupon_reduce_amount_td,
        total_amount_td
    FROM dws_trade_user_order_td
    WHERE dt = DATE_ADD('2022-06-09', -1)
) old
FULL OUTER JOIN (
    -- new: today's per-user increments.
    SELECT
        user_id,
        order_count_1d,
        order_num_1d,
        order_original_amount_1d,
        activity_reduce_amount_1d,
        coupon_reduce_amount_1d,
        order_total_amount_1d
    FROM dws_trade_user_order_1d
    WHERE dt = '2022-06-09'
) new
    ON old.user_id = new.user_id;
-- Daily load of the to-date table, UNION ALL implementation.
-- It overwrites the same dt = '2022-06-09' partition as the FULL OUTER JOIN
-- statement, so the two are alternatives for producing that partition.
INSERT OVERWRITE TABLE dws_trade_user_order_td PARTITION (dt = '2022-06-09')
SELECT
    user_id,
    MIN(order_date_first)          AS order_date_first,
    MAX(order_date_last)           AS order_date_last,
    SUM(order_count_td)            AS order_count_td,
    SUM(order_num_td)              AS order_num_td,
    SUM(original_amount_td)        AS original_amount_td,
    SUM(activity_reduce_amount_td) AS activity_reduce_amount_td,
    SUM(coupon_reduce_amount_td)   AS coupon_reduce_amount_td,
    SUM(total_amount_td)           AS total_amount_td
FROM (
    -- Cumulative figures from the previous day's partition.
    SELECT
        user_id,
        order_date_first,
        order_date_last,
        order_count_td,
        order_num_td,
        original_amount_td,
        activity_reduce_amount_td,
        coupon_reduce_amount_td,
        total_amount_td
    FROM dws_trade_user_order_td
    WHERE dt = DATE_ADD('2022-06-09', -1)

    UNION ALL

    -- Today's increments in the same column layout; today stands in for both
    -- the first and the last order date.
    SELECT
        user_id,
        '2022-06-09' AS order_date_first,
        '2022-06-09' AS order_date_last,
        order_count_1d,
        order_num_1d,
        order_original_amount_1d,
        activity_reduce_amount_1d,
        coupon_reduce_amount_1d,
        order_total_amount_1d
    FROM dws_trade_user_order_1d
    WHERE dt = '2022-06-09'
) t1
GROUP BY user_id;
六、总结
这三段代码从简单到复杂,展现了数据仓库DWS层设计的精髓:
- 最近1日表:基础构建块,采用增量更新策略
- 最近N日表:复用思想,巧用条件聚合
- 历史至今表:增量累加,两种实现方案对比

每一行代码都蕴含着对业务需求、数据特性和系统性能的深刻理解。在实际开发中,理解这些代码背后的设计思想,比记住具体语法更重要。