目录
一、完整的建表及数据装载代码(含注释)
sql
-- ---------- 1. Create the user dimension zipper table ----------
-- One row per version of a user; start_date / end_date bound the period in which
-- that version is valid. The table is partitioned by dt.
drop table if exists dim_user_zip;
create external table dim_user_zip
(
    `id`           string comment '用户ID',
    `name`         string comment '用户姓名',
    `phone_num`    string comment '手机号码',
    `email`        string comment '邮箱',
    `user_level`   string comment '用户等级',
    `birthday`     string comment '生日',
    `gender`       string comment '性别',
    `create_time`  string comment '创建时间',
    `operate_time` string comment '操作时间',
    `start_date`   string comment '开始日期',
    `end_date`     string comment '结束日期'
)
    comment '用户维度表'
    partitioned by (`dt` string)
    stored as orc
    location '/warehouse/gmall/dim/dim_user_zip/'
    tblproperties ('orc.compress' = 'snappy');
-- ---------- 2. First-day load (one-off initialisation) ----------
-- Copies every user from the bootstrap snapshot into the zipper table. All rows
-- are current versions, so they all go to the dt = '9999-12-31' partition.
insert overwrite table dim_user_zip partition (dt = '9999-12-31')
select data.id,
       -- Name masking: keep the first character, append '*' (张三 -> 张*).
       concat(substr(data.name, 1, 1), '*') name,
       -- Phone: accept only 11-digit mainland-China mobile numbers; keep the first
       -- 3 digits and append '*' (13812345678 -> 138*), the same mask the daily
       -- load writes. Anything that fails the pattern becomes NULL.
       -- The backslash is doubled because Hive un-escapes string literals before
       -- the regex engine sees them: a single '\d' would arrive as a plain 'd'.
       if(data.phone_num regexp
          '^(13[0-9]|14[01456879]|15[0-35-9]|16[2567]|17[0-8]|18[0-9]|19[0-35-9])\\d{8}$',
          concat(substr(data.phone_num, 1, 3), '*'), null) phone_num,
       -- Email: the whole value must match (anchored with ^ and $); the local part
       -- is replaced by '*' and the '@' is kept (zhangsan@example.com -> *@example.com).
       -- Invalid addresses become NULL.
       if(data.email regexp '^[a-zA-Z0-9_-]+@[a-zA-Z0-9_-]+(\\.[a-zA-Z0-9_-]+)+$',
          concat('*@', split(data.email, '@')[1]), null) email,
       data.user_level,
       data.birthday,
       data.gender,
       data.create_time,
       data.operate_time,
       -- Zipper bookkeeping: valid from the load day, open-ended.
       '2022-06-08' start_date,
       '9999-12-31' end_date
from ods_user_info_inc
where dt = '2022-06-08'            -- ODS partition of the first day
  and type = 'bootstrap-insert';   -- only rows from the initial full snapshot
-- ---------- 3. Daily load (incremental) ----------
-- Merges the change records found in the 2022-06-09 ODS partition into the zipper table.
-- The target partition is computed per row, so dynamic partitioning must be non-strict.
set hive.exec.dynamic.partition.mode=nonstrict;
with latest_change as (
    -- One candidate row per user: rn = 1 marks the change record with the largest ts.
    select data.id,
           data.name,
           data.phone_num,
           data.email,
           data.user_level,
           data.birthday,
           data.gender,
           data.create_time,
           data.operate_time,
           row_number() over (partition by data.id order by ts desc) rn
    from ods_user_info_inc
    where dt = '2022-06-09'
),
merged as (
    -- Current versions already stored in the zipper table ...
    select id,
           name,
           phone_num,
           email,
           user_level,
           birthday,
           gender,
           create_time,
           operate_time,
           start_date,
           end_date
    from dim_user_zip
    where dt = '9999-12-31'
    union
    -- ... plus the masked, newest change per user, valid from the load date onwards.
    select id,
           concat(substr(name, 1, 1), '*') name,
           if(phone_num regexp
              '^(13[0-9]|14[01456879]|15[0-35-9]|16[2567]|17[0-8]|18[0-9]|19[0-35-9])\\d{8}$',
              concat(substr(phone_num, 1, 3), '*'), null) phone_num,
           if(email regexp '^[a-zA-Z0-9_-]+@[a-zA-Z0-9_-]+(\\.[a-zA-Z0-9_-]+)+$',
              concat('*@', split(email, '@')[1]), null) email,
           user_level,
           birthday,
           gender,
           create_time,
           operate_time,
           '2022-06-09' start_date,
           '9999-12-31' end_date
    from latest_change
    where rn = 1
),
ranked as (
    -- Per user, rn = 1 is the row with the newest start_date and rn = 2 the one it replaces.
    select id,
           name,
           phone_num,
           email,
           user_level,
           birthday,
           gender,
           create_time,
           operate_time,
           start_date,
           end_date,
           row_number() over (partition by id order by start_date desc) rn
    from merged
)
insert overwrite table dim_user_zip partition (dt)
select id,
       name,
       phone_num,
       email,
       user_level,
       birthday,
       gender,
       create_time,
       operate_time,
       start_date,
       -- A replaced row (rn = 2) is closed on the day before the load date;
       -- every other row keeps its end_date.
       if(rn = 2, date_sub('2022-06-09', 1), end_date) end_date,
       -- rn = 1 rows go to the '9999-12-31' partition; the rest go to the
       -- partition of the day before the load date ('2022-06-08').
       if(rn = 1, '9999-12-31', date_sub('2022-06-09', 1)) dt
from ranked;
二、拉链表设计思路
- 什么是拉链表?
拉链表就像一条可以拉伸的链子,记录了每条数据从开始到结束的有效时间范围。它有两个关键字段:
start_date:这条记录开始生效的日期
end_date:这条记录结束有效的日期
- 分区设计(为什么有两个分区?)
sql
-- 分区字段:dt
-- 1. '9999-12-31'分区:保存当前最新的、正在使用的数据
-- 2. 普通日期分区(如'2022-06-08'):保存已经过期的历史数据
为什么要这样设计?
查询最新数据快:要查用户最新信息,直接查dt='9999-12-31'分区
查询历史数据方便:要查某天的用户信息,用 start_date <= 该日期 and end_date >= 该日期 过滤即可(注意:普通日期分区里只存放在该日失效的旧记录,并不是当天的全量快照)
数据管理清晰:新老数据分开存放,不混在一起
- 数据流向理解
首日(数据仓库第一次建表):
把所有用户的最新数据都放进9999-12-31分区
这时候start_date是当天日期,end_date是'9999-12-31'(表示一直有效)
每日更新:
新增用户:直接插入到9999-12-31分区
修改用户:比如小明的等级从1变成2
把原来9999-12-31分区里小明的旧记录拿出来
修改它的end_date为昨天(表示到昨天为止有效)
把这条旧记录放入昨天的分区(如dt='2022-06-08')
把小明的新记录(等级2)放入9999-12-31分区
三、代码详解(结合例子讲解)
- 创建表语句解读
sql
-- Column names are quoted with backticks (single quotes would make them string
-- literals and the statement would not parse), and the statement ends only
-- after TBLPROPERTIES.
CREATE EXTERNAL TABLE dim_user_zip
(
    `id`           STRING COMMENT '用户ID',     -- unique user identifier, e.g. "user_001"
    `name`         STRING COMMENT '用户姓名',   -- user name
    `phone_num`    STRING COMMENT '手机号',     -- mobile phone number
    `email`        STRING COMMENT '邮箱',       -- email address
    `user_level`   STRING COMMENT '用户等级',   -- user level
    `birthday`     STRING COMMENT '生日',       -- birthday
    `gender`       STRING COMMENT '性别',       -- gender
    `create_time`  STRING COMMENT '创建时间',   -- creation time
    `operate_time` STRING COMMENT '操作时间',   -- last operation time
    `start_date`   STRING COMMENT '开始日期',   -- first day on which this version is valid
    `end_date`     STRING COMMENT '结束日期'    -- last day on which this version is valid
) COMMENT '用户维度表'
-- partition by the dt column (each partition is stored in its own directory)
PARTITIONED BY (`dt` STRING)
-- store the data as ORC files
STORED AS ORC
-- directory that holds the table data
LOCATION '/warehouse/gmall/dim/dim_user_zip/'
-- compress the ORC files with snappy
TBLPROPERTIES ('orc.compress' = 'snappy');
- 重要函数讲解
(1)数据脱敏函数(保护用户隐私)
sql
-- 1. Name masking: keep the first character and append '*'
--    e.g. "张三" -> "张*"
concat(substr(data.name, 1, 1), '*')
-- 2. Phone masking: keep the first 3 digits and append '*' (the mask used by the load scripts)
--    e.g. "13812345678" -> "138*"
concat(substr(data.phone_num, 1, 3), '*')
-- 3. Email masking: replace the part before '@' with '*'. split() drops the '@',
--    so it has to be added back in the prefix.
--    e.g. "zhangsan@example.com" -> "*@example.com"
concat('*@', split(data.email, '@')[1])
(2)正则表达式函数(数据验证)
sql
-- Phone check: 11 digits that match the mainland-China mobile prefixes.
-- regexp is the regular-expression match operator. Backslashes are doubled because
-- Hive un-escapes the string literal first; a single '\d' would reach the regex
-- engine as a plain 'd'.
data.phone_num regexp '^(13[0-9]|14[01456879]|15[0-35-9]|16[2567]|17[0-8]|18[0-9]|19[0-35-9])\\d{8}$'
-- Email check: anchored at both ends so the whole value has to match.
data.email regexp '^[a-zA-Z0-9_-]+@[a-zA-Z0-9_-]+(\\.[a-zA-Z0-9_-]+)+$'
(3)窗口函数(处理排名)
sql
-- row_number() over (...): numbers the rows inside each group, starting at 1
-- partition by id: one group per user id
-- order by start_date desc: newest start_date first, so that row gets number 1
row_number() over (partition by id order by start_date desc) rn
作用:给每个用户的数据编号,最新的是1,次新的是2,依此类推。
- 首日装载代码讲解(第一次导入数据)
sql
-- Write into the 9999-12-31 partition of dim_user_zip
insert overwrite table dim_user_zip partition (dt = '9999-12-31')
select data.id,                                    -- user id
       -- name masking: keep the first character, append '*'
       concat(substr(data.name, 1, 1), '*') name,
       -- phone: validate first, mask when valid, otherwise NULL.
       -- '\\d' is required: Hive un-escapes the literal, so '\d' would become 'd'.
       if(data.phone_num regexp
          '^(13[0-9]|14[01456879]|15[0-35-9]|16[2567]|17[0-8]|18[0-9]|19[0-35-9])\\d{8}$',
          concat(substr(data.phone_num, 1, 3), '*'), null) phone_num,
       -- email: validate the whole value, mask when valid ('*@' keeps the '@'), otherwise NULL
       if(data.email regexp '^[a-zA-Z0-9_-]+@[a-zA-Z0-9_-]+(\\.[a-zA-Z0-9_-]+)+$',
          concat('*@', split(data.email, '@')[1]), null) email,
       data.user_level,                            -- user level
       data.birthday,                              -- birthday
       data.gender,                                -- gender
       data.create_time,                           -- creation time
       data.operate_time,                          -- last operation time
       '2022-06-08' start_date,                    -- valid from the first load day
       '9999-12-31' end_date                       -- far-future date marks the current version
-- read from the raw (ODS) table
from ods_user_info_inc
where dt = '2022-06-08'                            -- the first day's partition
  and type = 'bootstrap-insert';                   -- only rows from the initial full snapshot
- 每日装载代码讲解(每天更新数据)
这是最关键的部分,我们用一个具体例子来理解:
场景:2022年6月9日,用户"user_001"的手机号从"13812345678"改为"13987654321"
sql
-- Dynamic partitioning must be non-strict because the target partition is computed per row.
set hive.exec.dynamic.partition.mode=nonstrict;
-- Rewrite the affected partitions of dim_user_zip
insert overwrite table dim_user_zip partition (dt)
select id, name, phone_num, email, user_level, birthday, gender,
       create_time, operate_time, start_date,
       -- end_date:
       --   rn = 2 (replaced version): closed on the day before the load date (2022-06-08)
       --   rn = 1 (newest version):   keeps its end_date
       if(rn = 2, date_sub('2022-06-09', 1), end_date) end_date,
       -- partition dt:
       --   rn = 1 stays in the '9999-12-31' partition
       --   rn = 2 moves to the partition of the day before the load date ('2022-06-08')
       if(rn = 1, '9999-12-31', date_sub('2022-06-09', 1)) dt
from (
    -- Step 2: number each user's rows, newest start_date first (1 = newest, 2 = replaced)
    select id, name, phone_num, email, user_level, birthday, gender,
           create_time, operate_time, start_date, end_date,
           row_number() over (partition by id order by start_date desc) rn
    from (
        -- Step 1: merge the stored current versions with today's changes
        -- 1.1 current versions from the zipper table
        select id, name, phone_num, email, user_level, birthday, gender,
               create_time, operate_time, start_date, end_date
        from dim_user_zip
        where dt = '9999-12-31'
        -- union (with de-duplication) collapses identical rows, e.g. when the load
        -- is run again for the same day and today's version is already stored
        union
        -- 1.2 today's changes, masked the same way as in the first-day load
        select id,
               concat(substr(name, 1, 1), '*') name,
               if(phone_num regexp
                  '^(13[0-9]|14[01456879]|15[0-35-9]|16[2567]|17[0-8]|18[0-9]|19[0-35-9])\\d{8}$',
                  concat(substr(phone_num, 1, 3), '*'), null) phone_num,
               if(email regexp '^[a-zA-Z0-9_-]+@[a-zA-Z0-9_-]+(\\.[a-zA-Z0-9_-]+)+$',
                  concat('*@', split(email, '@')[1]), null) email,
               user_level, birthday, gender, create_time, operate_time,
               '2022-06-09' start_date,   -- the new version is valid from the load date
               '9999-12-31' end_date      -- and is open-ended
        from (
            -- The user columns live inside the `data` struct of the ODS table.
            -- A user may change several times in one day; keep only the change with
            -- the largest ts so that each user contributes at most one new version.
            select data.id,
                   data.name,
                   data.phone_num,
                   data.email,
                   data.user_level,
                   data.birthday,
                   data.gender,
                   data.create_time,
                   data.operate_time,
                   row_number() over (partition by data.id order by ts desc) rn
            from ods_user_info_inc
            where dt = '2022-06-09'                 -- today's change records
              and type in ('insert', 'update')      -- inserts and updates only
        ) t1
        where rn = 1
    ) t2
) t3;
四、为什么不用JOIN?
这是一个很好的问题!很多人会想:为什么不直接用JOIN把新旧数据连起来呢?
JOIN方式的缺点:
数据膨胀:如果一个用户有10次变更,JOIN会产生10条关联记录
查询复杂:要查最新数据需要复杂的子查询
性能差:每天全表JOIN,数据量大时非常慢
拉链表的优势:
查询简单:
sql
-- Current version of every user: read the '9999-12-31' partition
select * from dim_user_zip where dt = '9999-12-31';
-- State of every user as of a given day: pick the version whose validity range
-- covers that day. (A dated partition on its own only holds the versions that
-- were closed on that day, not a full snapshot.)
select * from dim_user_zip where start_date <= '2022-06-08' and end_date >= '2022-06-08';
历史追踪容易:
sql
-- All stored versions of user "user_001", ordered by the date each version started
select * from dim_user_zip where id = 'user_001' order by start_date;
存储高效:每个用户每天只有一条有效记录
五、这样设计的好处总结
保存完整历史:用户信息的每次变化都被记录下来
查询性能高:按分区查询,速度快
数据管理方便:新数据和老数据分开存放
节省存储空间:相比每天全量快照,拉链表更节省空间
支持时间旅行:可以查询任意时间点的用户状态
六、数据装载脚本
1.首日全量脚本
sql
cd /home/atguigu/bin
vim ods_to_dim_init.sh
添加下面的内容:
sql
#!/bin/bash
# First-day ODS -> DIM load.
# Usage: ods_to_dim_init.sh <table_name|all> <date>
APP=gmall
# The load date (second argument) is mandatory for the first-day load.
# A missing date is an error, so report it on stderr and exit non-zero.
if [ -n "$2" ]; then
    do_date=$2
else
    echo "请传入日期参数" >&2
    exit 1
fi
# First-day load of the user zipper table: every bootstrap row of $do_date is
# masked and written to the dt = '9999-12-31' partition as a current version.
# Regex backslashes are written four times on purpose: bash halves them inside
# double quotes and Hive halves them again when it un-escapes the string
# literal, so the regex engine finally sees \d and \. ; \$ keeps the end anchor
# a literal dollar sign for bash.
dim_user_zip="
insert overwrite table ${APP}.dim_user_zip partition (dt = '9999-12-31')
select data.id,
       concat(substr(data.name, 1, 1), '*') name,
       if(data.phone_num regexp '^(13[0-9]|14[01456879]|15[0-35-9]|16[2567]|17[0-8]|18[0-9]|19[0-35-9])\\\\d{8}\$',
          concat(substr(data.phone_num, 1, 3), '*'), null) phone_num,
       if(data.email regexp '^[a-zA-Z0-9_-]+@[a-zA-Z0-9_-]+(\\\\.[a-zA-Z0-9_-]+)+\$',
          concat('*@', split(data.email, '@')[1]), null) email,
       data.user_level,
       data.birthday,
       data.gender,
       data.create_time,
       data.operate_time,
       '$do_date' start_date,
       '9999-12-31' end_date
from ${APP}.ods_user_info_inc
where dt = '$do_date'
  and type = 'bootstrap-insert';
"
# SQL for the SKU dimension: takes the $do_date partition of the SKU table and
# left-joins SPU, category levels 3/2/1, trademark, and the per-SKU collections
# of platform attributes and sale attributes, then overwrites partition
# dt='$do_date' of dim_sku_full.
dim_sku_full="
with
sku as
(
select
id,
price,
sku_name,
sku_desc,
weight,
is_sale,
spu_id,
category3_id,
tm_id,
create_time
from ${APP}.ods_sku_info_full
where dt='$do_date'
),
spu as
(
select
id,
spu_name
from ${APP}.ods_spu_info_full
where dt='$do_date'
),
c3 as
(
select
id,
name,
category2_id
from ${APP}.ods_base_category3_full
where dt='$do_date'
),
c2 as
(
select
id,
name,
category1_id
from ${APP}.ods_base_category2_full
where dt='$do_date'
),
c1 as
(
select
id,
name
from ${APP}.ods_base_category1_full
where dt='$do_date'
),
tm as
(
select
id,
tm_name
from ${APP}.ods_base_trademark_full
where dt='$do_date'
),
attr as
(
select
sku_id,
collect_set(named_struct('attr_id',attr_id,'value_id',value_id,'attr_name',attr_name,'value_name',value_name)) attrs
from ${APP}.ods_sku_attr_value_full
where dt='$do_date'
group by sku_id
),
sale_attr as
(
select
sku_id,
collect_set(named_struct('sale_attr_id',sale_attr_id,'sale_attr_value_id',sale_attr_value_id,'sale_attr_name',sale_attr_name,'sale_attr_value_name',sale_attr_value_name)) sale_attrs
from ${APP}.ods_sku_sale_attr_value_full
where dt='$do_date'
group by sku_id
)
insert overwrite table ${APP}.dim_sku_full partition(dt='$do_date')
select
sku.id,
sku.price,
sku.sku_name,
sku.sku_desc,
sku.weight,
sku.is_sale,
sku.spu_id,
spu.spu_name,
sku.category3_id,
c3.name,
c3.category2_id,
c2.name,
c2.category1_id,
c1.name,
sku.tm_id,
tm.tm_name,
attr.attrs,
sale_attr.sale_attrs,
sku.create_time
from sku
left join spu on sku.spu_id=spu.id
left join c3 on sku.category3_id=c3.id
left join c2 on c3.category2_id=c2.id
left join c1 on c2.category1_id=c1.id
left join tm on sku.tm_id=tm.id
left join attr on sku.id=attr.sku_id
left join sale_attr on sku.id=sale_attr.sku_id;
"
# SQL for the province dimension: left-joins the $do_date province rows to the
# region rows on region_id and overwrites partition dt='$do_date' of
# dim_province_full.
dim_province_full="
insert overwrite table ${APP}.dim_province_full partition(dt='$do_date')
select
province.id,
province.name,
province.area_code,
province.iso_code,
province.iso_3166_2,
region_id,
region_name
from
(
select
id,
name,
region_id,
area_code,
iso_code,
iso_3166_2
from ${APP}.ods_base_province_full
where dt='$do_date'
)province
left join
(
select
id,
region_name
from ${APP}.ods_base_region_full
where dt='$do_date'
)region
on province.region_id=region.id;
"
# SQL for the coupon dimension: left-joins the $do_date coupon rows to the
# dictionary table twice (parent_code '32' for coupon_type, '33' for
# range_type) to pick up the display names, builds a benefit_rule text per
# coupon_type, and overwrites partition dt='$do_date' of dim_coupon_full.
dim_coupon_full="
insert overwrite table ${APP}.dim_coupon_full partition(dt='$do_date')
select
id,
coupon_name,
coupon_type,
coupon_dic.dic_name,
condition_amount,
condition_num,
activity_id,
benefit_amount,
benefit_discount,
case coupon_type
when '3201' then concat('满',condition_amount,'元减',benefit_amount,'元')
when '3202' then concat('满',condition_num,'件打', benefit_discount,' 折')
when '3203' then concat('减',benefit_amount,'元')
end benefit_rule,
create_time,
range_type,
range_dic.dic_name,
limit_num,
taken_count,
start_time,
end_time,
operate_time,
expire_time
from
(
select
id,
coupon_name,
coupon_type,
condition_amount,
condition_num,
activity_id,
benefit_amount,
benefit_discount,
create_time,
range_type,
limit_num,
taken_count,
start_time,
end_time,
operate_time,
expire_time
from ${APP}.ods_coupon_info_full
where dt='$do_date'
)ci
left join
(
select
dic_code,
dic_name
from ${APP}.ods_base_dic_full
where dt='$do_date'
and parent_code='32'
)coupon_dic
on ci.coupon_type=coupon_dic.dic_code
left join
(
select
dic_code,
dic_name
from ${APP}.ods_base_dic_full
where dt='$do_date'
and parent_code='33'
)range_dic
on ci.range_type=range_dic.dic_code;
"
# SQL for the activity dimension: starts from the $do_date activity rules,
# left-joins the activity info on activity_id and the dictionary rows with
# parent_code '31' on activity_type, builds a benefit_rule text per
# activity_type, and overwrites partition dt='$do_date' of dim_activity_full.
dim_activity_full="
insert overwrite table ${APP}.dim_activity_full partition(dt='$do_date')
select
rule.id,
info.id,
activity_name,
rule.activity_type,
dic.dic_name,
activity_desc,
start_time,
end_time,
create_time,
condition_amount,
condition_num,
benefit_amount,
benefit_discount,
case rule.activity_type
when '3101' then concat('满',condition_amount,'元减',benefit_amount,'元')
when '3102' then concat('满',condition_num,'件打', benefit_discount,' 折')
when '3103' then concat('打', benefit_discount,'折')
end benefit_rule,
benefit_level
from
(
select
id,
activity_id,
activity_type,
condition_amount,
condition_num,
benefit_amount,
benefit_discount,
benefit_level
from ${APP}.ods_activity_rule_full
where dt='$do_date'
)rule
left join
(
select
id,
activity_name,
activity_type,
activity_desc,
start_time,
end_time,
create_time
from ${APP}.ods_activity_info_full
where dt='$do_date'
)info
on rule.activity_id=info.id
left join
(
select
dic_code,
dic_name
from ${APP}.ods_base_dic_full
where dt='$do_date'
and parent_code='31'
)dic
on rule.activity_type=dic.dic_code;
"
# SQL for the promotion-position dimension: copies the listed columns of the
# $do_date ODS partition into partition dt='$do_date' of dim_promotion_pos_full.
dim_promotion_pos_full="
insert overwrite table ${APP}.dim_promotion_pos_full partition(dt='$do_date')
select
id,
pos_location,
pos_type,
promotion_type,
create_time,
operate_time
from ${APP}.ods_promotion_pos_full
where dt='$do_date';
"
# SQL for the promotion-referrer dimension: copies the listed columns of the
# $do_date ODS partition into partition dt='$do_date' of dim_promotion_refer_full.
dim_promotion_refer_full="
insert overwrite table ${APP}.dim_promotion_refer_full partition(dt='$do_date')
select
id,
refer_name,
create_time,
operate_time
from ${APP}.ods_promotion_refer_full
where dt='$do_date';
"
# Dispatch on the first argument: run the statement of one table, or all of
# them in a single hive session for "all". An unknown name is reported and the
# script exits non-zero instead of silently doing nothing.
case "$1" in
    "dim_user_zip")
        hive -e "$dim_user_zip"
        ;;
    "dim_sku_full")
        hive -e "$dim_sku_full"
        ;;
    "dim_province_full")
        hive -e "$dim_province_full"
        ;;
    "dim_coupon_full")
        hive -e "$dim_coupon_full"
        ;;
    "dim_activity_full")
        hive -e "$dim_activity_full"
        ;;
    "dim_promotion_pos_full")
        hive -e "$dim_promotion_pos_full"
        ;;
    "dim_promotion_refer_full")
        hive -e "$dim_promotion_refer_full"
        ;;
    "all")
        hive -e "$dim_user_zip$dim_sku_full$dim_province_full$dim_coupon_full$dim_activity_full$dim_promotion_refer_full$dim_promotion_pos_full"
        ;;
    *)
        echo "Usage: $0 {dim_user_zip|dim_sku_full|dim_province_full|dim_coupon_full|dim_activity_full|dim_promotion_pos_full|dim_promotion_refer_full|all} <date>" >&2
        exit 1
        ;;
esac
添加权限
sql
chmod +x ods_to_dim_init.sh
执行脚本的命令
sql
ods_to_dim_init.sh all 2022-06-08
2.每日增量脚本
sql
cd /home/atguigu/bin
vim ods_to_dim.sh
添加下面的内容:
sql
#!/bin/bash
# Daily ODS -> DIM load.
APP=gmall
# Load date: the second argument when it is given and non-empty,
# otherwise yesterday's date in YYYY-MM-DD form.
do_date=${2:-$(date -d "-1 day" +%F)}
# Daily load of the user zipper table for $do_date:
#   t1 - newest change record per user (largest ts) from the day's ODS partition
#   t2 - stored current versions (dt = '9999-12-31') unioned with the masked new versions
#   t3 - per user, rn = 1 is the newest start_date and rn = 2 the version it replaces
# rn = 1 rows stay in the '9999-12-31' partition; rn = 2 rows are closed on the
# day before $do_date and written to that day's partition.
# Regex backslashes are written four times on purpose: bash halves them inside
# double quotes and Hive halves them again when it un-escapes the string
# literal, so the regex engine finally sees \d and \. ; \$ keeps the end anchor
# a literal dollar sign for bash.
dim_user_zip="
set hive.exec.dynamic.partition.mode=nonstrict;
insert overwrite table ${APP}.dim_user_zip partition (dt)
select id,
       name,
       phone_num,
       email,
       user_level,
       birthday,
       gender,
       create_time,
       operate_time,
       start_date,
       if(rn = 2, date_sub('$do_date', 1), end_date) end_date,
       if(rn = 1, '9999-12-31', date_sub('$do_date', 1)) dt
from (
    select id,
           name,
           phone_num,
           email,
           user_level,
           birthday,
           gender,
           create_time,
           operate_time,
           start_date,
           end_date,
           row_number() over (partition by id order by start_date desc) rn
    from (
        select id,
               name,
               phone_num,
               email,
               user_level,
               birthday,
               gender,
               create_time,
               operate_time,
               start_date,
               end_date
        from ${APP}.dim_user_zip
        where dt = '9999-12-31'
        union
        select id,
               concat(substr(name, 1, 1), '*') name,
               if(phone_num regexp
                  '^(13[0-9]|14[01456879]|15[0-35-9]|16[2567]|17[0-8]|18[0-9]|19[0-35-9])\\\\d{8}\$',
                  concat(substr(phone_num, 1, 3), '*'), null) phone_num,
               if(email regexp '^[a-zA-Z0-9_-]+@[a-zA-Z0-9_-]+(\\\\.[a-zA-Z0-9_-]+)+\$',
                  concat('*@', split(email, '@')[1]), null) email,
               user_level,
               birthday,
               gender,
               create_time,
               operate_time,
               '$do_date' start_date,
               '9999-12-31' end_date
        from (
            select data.id,
                   data.name,
                   data.phone_num,
                   data.email,
                   data.user_level,
                   data.birthday,
                   data.gender,
                   data.create_time,
                   data.operate_time,
                   row_number() over (partition by data.id order by ts desc) rn
            from ${APP}.ods_user_info_inc
            where dt = '$do_date'
        ) t1
        where rn = 1
    ) t2
) t3;
"
# SQL for the SKU dimension: takes the $do_date partition of the SKU table and
# left-joins SPU, category levels 3/2/1, trademark, and the per-SKU collections
# of platform attributes and sale attributes, then overwrites partition
# dt='$do_date' of dim_sku_full.
dim_sku_full="
with
sku as
(
select
id,
price,
sku_name,
sku_desc,
weight,
is_sale,
spu_id,
category3_id,
tm_id,
create_time
from ${APP}.ods_sku_info_full
where dt='$do_date'
),
spu as
(
select
id,
spu_name
from ${APP}.ods_spu_info_full
where dt='$do_date'
),
c3 as
(
select
id,
name,
category2_id
from ${APP}.ods_base_category3_full
where dt='$do_date'
),
c2 as
(
select
id,
name,
category1_id
from ${APP}.ods_base_category2_full
where dt='$do_date'
),
c1 as
(
select
id,
name
from ${APP}.ods_base_category1_full
where dt='$do_date'
),
tm as
(
select
id,
tm_name
from ${APP}.ods_base_trademark_full
where dt='$do_date'
),
attr as
(
select
sku_id,
collect_set(named_struct('attr_id',attr_id,'value_id',value_id,'attr_name',attr_name,'value_name',value_name)) attrs
from ${APP}.ods_sku_attr_value_full
where dt='$do_date'
group by sku_id
),
sale_attr as
(
select
sku_id,
collect_set(named_struct('sale_attr_id',sale_attr_id,'sale_attr_value_id',sale_attr_value_id,'sale_attr_name',sale_attr_name,'sale_attr_value_name',sale_attr_value_name)) sale_attrs
from ${APP}.ods_sku_sale_attr_value_full
where dt='$do_date'
group by sku_id
)
insert overwrite table ${APP}.dim_sku_full partition(dt='$do_date')
select
sku.id,
sku.price,
sku.sku_name,
sku.sku_desc,
sku.weight,
sku.is_sale,
sku.spu_id,
spu.spu_name,
sku.category3_id,
c3.name,
c3.category2_id,
c2.name,
c2.category1_id,
c1.name,
sku.tm_id,
tm.tm_name,
attr.attrs,
sale_attr.sale_attrs,
sku.create_time
from sku
left join spu on sku.spu_id=spu.id
left join c3 on sku.category3_id=c3.id
left join c2 on c3.category2_id=c2.id
left join c1 on c2.category1_id=c1.id
left join tm on sku.tm_id=tm.id
left join attr on sku.id=attr.sku_id
left join sale_attr on sku.id=sale_attr.sku_id;
"
# SQL for the province dimension: left-joins the $do_date province rows to the
# region rows on region_id and overwrites partition dt='$do_date' of
# dim_province_full.
dim_province_full="
insert overwrite table ${APP}.dim_province_full partition(dt='$do_date')
select
province.id,
province.name,
province.area_code,
province.iso_code,
province.iso_3166_2,
region_id,
region_name
from
(
select
id,
name,
region_id,
area_code,
iso_code,
iso_3166_2
from ${APP}.ods_base_province_full
where dt='$do_date'
)province
left join
(
select
id,
region_name
from ${APP}.ods_base_region_full
where dt='$do_date'
)region
on province.region_id=region.id;
"
# SQL for the coupon dimension: left-joins the $do_date coupon rows to the
# dictionary table twice (parent_code '32' for coupon_type, '33' for
# range_type) to pick up the display names, builds a benefit_rule text per
# coupon_type, and overwrites partition dt='$do_date' of dim_coupon_full.
dim_coupon_full="
insert overwrite table ${APP}.dim_coupon_full partition(dt='$do_date')
select
id,
coupon_name,
coupon_type,
coupon_dic.dic_name,
condition_amount,
condition_num,
activity_id,
benefit_amount,
benefit_discount,
case coupon_type
when '3201' then concat('满',condition_amount,'元减',benefit_amount,'元')
when '3202' then concat('满',condition_num,'件打', benefit_discount,' 折')
when '3203' then concat('减',benefit_amount,'元')
end benefit_rule,
create_time,
range_type,
range_dic.dic_name,
limit_num,
taken_count,
start_time,
end_time,
operate_time,
expire_time
from
(
select
id,
coupon_name,
coupon_type,
condition_amount,
condition_num,
activity_id,
benefit_amount,
benefit_discount,
create_time,
range_type,
limit_num,
taken_count,
start_time,
end_time,
operate_time,
expire_time
from ${APP}.ods_coupon_info_full
where dt='$do_date'
)ci
left join
(
select
dic_code,
dic_name
from ${APP}.ods_base_dic_full
where dt='$do_date'
and parent_code='32'
)coupon_dic
on ci.coupon_type=coupon_dic.dic_code
left join
(
select
dic_code,
dic_name
from ${APP}.ods_base_dic_full
where dt='$do_date'
and parent_code='33'
)range_dic
on ci.range_type=range_dic.dic_code;
"
# SQL for the activity dimension: starts from the $do_date activity rules,
# left-joins the activity info on activity_id and the dictionary rows with
# parent_code '31' on activity_type, builds a benefit_rule text per
# activity_type, and overwrites partition dt='$do_date' of dim_activity_full.
dim_activity_full="
insert overwrite table ${APP}.dim_activity_full partition(dt='$do_date')
select
rule.id,
info.id,
activity_name,
rule.activity_type,
dic.dic_name,
activity_desc,
start_time,
end_time,
create_time,
condition_amount,
condition_num,
benefit_amount,
benefit_discount,
case rule.activity_type
when '3101' then concat('满',condition_amount,'元减',benefit_amount,'元')
when '3102' then concat('满',condition_num,'件打', benefit_discount,' 折')
when '3103' then concat('打', benefit_discount,'折')
end benefit_rule,
benefit_level
from
(
select
id,
activity_id,
activity_type,
condition_amount,
condition_num,
benefit_amount,
benefit_discount,
benefit_level
from ${APP}.ods_activity_rule_full
where dt='$do_date'
)rule
left join
(
select
id,
activity_name,
activity_type,
activity_desc,
start_time,
end_time,
create_time
from ${APP}.ods_activity_info_full
where dt='$do_date'
)info
on rule.activity_id=info.id
left join
(
select
dic_code,
dic_name
from ${APP}.ods_base_dic_full
where dt='$do_date'
and parent_code='31'
)dic
on rule.activity_type=dic.dic_code;
"
# SQL for the promotion-position dimension: copies the listed columns of the
# $do_date ODS partition into partition dt='$do_date' of dim_promotion_pos_full.
dim_promotion_pos_full="
insert overwrite table ${APP}.dim_promotion_pos_full partition(dt='$do_date')
select
id,
pos_location,
pos_type,
promotion_type,
create_time,
operate_time
from ${APP}.ods_promotion_pos_full
where dt='$do_date';
"
# SQL for the promotion-referrer dimension: copies the listed columns of the
# $do_date ODS partition into partition dt='$do_date' of dim_promotion_refer_full.
dim_promotion_refer_full="
insert overwrite table ${APP}.dim_promotion_refer_full partition(dt='$do_date')
select
id,
refer_name,
create_time,
operate_time
from ${APP}.ods_promotion_refer_full
where dt='$do_date';
"
# Dispatch on the first argument: run the statement of one table, or all of
# them in a single hive session for "all". An unknown name is reported and the
# script exits non-zero instead of silently doing nothing.
case "$1" in
    "dim_user_zip")
        hive -e "$dim_user_zip"
        ;;
    "dim_sku_full")
        hive -e "$dim_sku_full"
        ;;
    "dim_province_full")
        hive -e "$dim_province_full"
        ;;
    "dim_coupon_full")
        hive -e "$dim_coupon_full"
        ;;
    "dim_activity_full")
        hive -e "$dim_activity_full"
        ;;
    "dim_promotion_pos_full")
        hive -e "$dim_promotion_pos_full"
        ;;
    "dim_promotion_refer_full")
        hive -e "$dim_promotion_refer_full"
        ;;
    "all")
        hive -e "$dim_user_zip$dim_sku_full$dim_province_full$dim_coupon_full$dim_activity_full$dim_promotion_refer_full$dim_promotion_pos_full"
        ;;
    *)
        echo "Usage: $0 {dim_user_zip|dim_sku_full|dim_province_full|dim_coupon_full|dim_activity_full|dim_promotion_pos_full|dim_promotion_refer_full|all} [date]" >&2
        exit 1
        ;;
esac
添加权限
sql
chmod +x ods_to_dim.sh
脚本执行的命令(以6月9号为例)
sql
ods_to_dim.sh all 2022-06-09