26-学习笔记尚硅谷数仓搭建-DIM层特殊的维度表——用户维度表的建表、分析及DIM层数据装载脚本

目录

一、完整的建表及数据装载代码(含注释)

二、拉链表设计思路

三、代码详解(结合例子讲解)

四、为什么不用JOIN?

五、这样设计的好处总结

六、数据装载脚本


一、完整的建表及数据装载代码(含注释)

sql 复制代码
-- ---------- 1. Create the user dimension zipper table ----------
-- External ORC table (snappy-compressed) partitioned by `dt`.
-- `start_date` / `end_date` delimit the validity period of each row version.
drop table if exists dim_user_zip;

create external table dim_user_zip (
    `id` string comment '用户ID',
    `name` string comment '用户姓名',
    `phone_num` string comment '手机号码',
    `email` string comment '邮箱',
    `user_level` string comment '用户等级',
    `birthday` string comment '生日',
    `gender` string comment '性别',
    `create_time` string comment '创建时间',
    `operate_time` string comment '操作时间',
    `start_date` string comment '开始日期',
    `end_date` string comment '结束日期'
)
comment '用户维度表'
partitioned by (`dt` string)
stored as orc
location '/warehouse/gmall/dim/dim_user_zip/'
tblproperties ('orc.compress' = 'snappy');

-- ---------- 2. First-day load (one-off initialisation) ----------
-- Loads the bootstrap rows of ods_user_info_inc into the '9999-12-31' partition;
-- every row is written as currently valid (end_date = '9999-12-31').
insert overwrite table dim_user_zip partition (dt = '9999-12-31')
select
    data.id,  -- user id, copied as-is

    -- Name masking: keep the first character and append '*' (e.g. 张三 -> 张*).
    concat(substr(data.name, 1, 1), '*') name,

    -- Phone number: validate, then mask.
    -- The digit class is written '\\d' because Hive unescapes string literals
    -- before compiling the regex; '\d' would reach the regex engine as the letter 'd'.
    --   valid   -> first 3 digits followed by '*' (13812345678 -> 138*),
    --              the same mask the daily load writes
    --   invalid -> NULL
    if(data.phone_num regexp '^(13[0-9]|14[01456879]|15[0-35-9]|16[2567]|17[0-8]|18[0-9]|19[0-35-9])\\d{8}$',
        concat(substr(data.phone_num, 1, 3), '*'),
        null) phone_num,

    -- Email: validate the whole value (anchored with ^ ... $), then mask.
    --   valid   -> local part replaced by '*', '@' and domain kept
    --              (zhangsan@example.com -> *@example.com)
    --   invalid -> NULL
    if(data.email regexp '^[a-zA-Z0-9_-]+@[a-zA-Z0-9_-]+(\\.[a-zA-Z0-9_-]+)+$',
        concat('*@', split(data.email, '@')[1]),
        null) email,

    data.user_level,      -- copied as-is
    data.birthday,        -- copied as-is
    data.gender,          -- copied as-is
    data.create_time,     -- copied as-is
    data.operate_time,    -- copied as-is

    -- Zipper-table validity columns:
    '2022-06-08' start_date,  -- valid from the first load day
    '9999-12-31' end_date     -- far-future date marks the row as currently valid

-- Source: ODS incremental user table
from ods_user_info_inc
where dt = '2022-06-08'                -- partition of the first load day
  and type = 'bootstrap-insert';       -- only the bootstrap (full initial) rows

-- ---------- 3. Daily load (incremental, run once per day) ----------
-- Merges the rows currently in the '9999-12-31' partition with the day's
-- change records and rewrites the affected partitions.

-- Non-strict mode allows every partition value to come from the query itself.
set hive.exec.dynamic.partition.mode=nonstrict;

with current_rows as (
    -- Versions that are valid before this run.
    select
        id,
        name,
        phone_num,
        email,
        user_level,
        birthday,
        gender,
        create_time,
        operate_time,
        start_date,
        end_date
    from dim_user_zip
    where dt = '9999-12-31'
),
daily_changes as (
    -- All change records of the day, numbered per user with the largest ts first.
    select
        data.id,
        data.name,
        data.phone_num,
        data.email,
        data.user_level,
        data.birthday,
        data.gender,
        data.create_time,
        data.operate_time,
        row_number() over (partition by data.id order by ts desc) as change_seq
    from ods_user_info_inc
    where dt = '2022-06-09'
),
new_rows as (
    -- Last change per user, masked the same way as the stored rows.
    select
        id,
        concat(substr(name, 1, 1), '*') as name,
        if(phone_num regexp
           '^(13[0-9]|14[01456879]|15[0-35-9]|16[2567]|17[0-8]|18[0-9]|19[0-35-9])\\d{8}$',
           concat(substr(phone_num, 1, 3), '*'),
           null) as phone_num,
        if(email regexp '^[a-zA-Z0-9_-]+@[a-zA-Z0-9_-]+(\\.[a-zA-Z0-9_-]+)+$',
           concat('*@', split(email, '@')[1]),
           null) as email,
        user_level,
        birthday,
        gender,
        create_time,
        operate_time,
        '2022-06-09' as start_date,
        '9999-12-31' as end_date
    from daily_changes
    where change_seq = 1
),
merged as (
    -- UNION (not UNION ALL) also drops rows that are exact duplicates of each other.
    select
        id, name, phone_num, email, user_level, birthday, gender,
        create_time, operate_time, start_date, end_date
    from current_rows
    union
    select
        id, name, phone_num, email, user_level, birthday, gender,
        create_time, operate_time, start_date, end_date
    from new_rows
),
versioned as (
    -- rn = 1: newest version of a user; rn = 2: the version it supersedes.
    select
        id,
        name,
        phone_num,
        email,
        user_level,
        birthday,
        gender,
        create_time,
        operate_time,
        start_date,
        end_date,
        row_number() over (partition by id order by start_date desc) as rn
    from merged
)
insert overwrite table dim_user_zip partition (dt)
select
    id,
    name,
    phone_num,
    email,
    user_level,
    birthday,
    gender,
    create_time,
    operate_time,
    start_date,
    -- Superseded versions are closed on the day before the load date.
    if(rn = 2, date_sub('2022-06-09', 1), end_date) as end_date,
    -- Newest versions stay in '9999-12-31'; all others go to the previous day's partition.
    if(rn = 1, '9999-12-31', date_sub('2022-06-09', 1)) as dt
from versioned;

二、拉链表设计思路

  1. 什么是拉链表?

拉链表就像一条可以拉伸的链子,记录了每条数据从开始到结束的有效时间范围。它有两个关键字段:

start_date:这条记录开始生效的日期

end_date:这条记录结束有效的日期

  2. 分区设计(为什么有两类分区?)

sql

-- 分区字段:dt

-- 1. '9999-12-31'分区:保存当前最新的、正在使用的数据

-- 2. 普通日期分区(如'2022-06-08'):保存已经过期的历史数据

为什么要这样设计?

查询最新数据快:要查用户最新信息,直接查dt='9999-12-31'分区

查询历史数据方便:某个日期的分区(如dt='2022-06-08')保存的是在这一天失效(end_date为该日期)的记录;要还原某一天的用户状态,用 start_date <= 该日期 and end_date >= 该日期 过滤即可

数据管理清晰:新老数据分开存放,不混在一起

  3. 数据流向理解

首日(数据仓库第一次建表):

把所有用户的最新数据都放进9999-12-31分区

这时候start_date是当天日期,end_date是'9999-12-31'(表示一直有效)

每日更新:

新增用户:直接插入到9999-12-31分区

修改用户:比如小明的等级从1变成2

把原来9999-12-31分区里小明的旧记录拿出来

修改它的end_date为昨天(表示到昨天为止有效)

把这条旧记录放入昨天的分区(如dt='2022-06-08')

把小明的新记录(等级2)放入9999-12-31分区

三、代码详解(结合例子讲解)

  1. 创建表语句解读

sql

-- User dimension zipper table.
-- Column names are quoted with backticks (single quotes would make them string
-- literals), and the statement ends only after TBLPROPERTIES so that the
-- partitioning, storage format and location clauses belong to the CREATE.
CREATE EXTERNAL TABLE dim_user_zip
(
    `id`           STRING COMMENT '用户ID',    -- unique user identifier, e.g. "user_001"
    `name`         STRING COMMENT '用户姓名',  -- user name
    `phone_num`    STRING COMMENT '手机号',    -- user phone number
    `email`        STRING COMMENT '邮箱',      -- user email
    `user_level`   STRING COMMENT '用户等级',  -- e.g. "VIP1", "普通会员"
    `birthday`     STRING COMMENT '生日',      -- user birthday
    `gender`       STRING COMMENT '性别',      -- male / female
    `create_time`  STRING COMMENT '创建时间',  -- registration time
    `operate_time` STRING COMMENT '操作时间',  -- last operation time
    `start_date`   STRING COMMENT '开始日期',  -- first day this row version is valid
    `end_date`     STRING COMMENT '结束日期'   -- last day this row version is valid
) COMMENT '用户维度表'
    -- Partitioned by dt, like putting files into different folders
    PARTITIONED BY (`dt` STRING)
    -- Stored as ORC (an efficient columnar storage format)
    STORED AS ORC
    -- Where the data files live
    LOCATION '/warehouse/gmall/dim/dim_user_zip/'
    -- Compress with snappy to save storage space
    TBLPROPERTIES ('orc.compress' = 'snappy');

  2. 重要函数讲解

(1)数据脱敏函数(保护用户隐私)

sql

-- 1. Name masking: keep the first character, replace the rest with '*'
--    e.g. "张三" -> "张*"
concat(substr(data.name, 1, 1), '*')

-- 2. Phone masking: keep the first 3 digits, replace the rest with '*'
--    e.g. "13812345678" -> "138*"
concat(substr(data.phone_num, 1, 3), '*')

-- 3. Email masking: keep '@' and the domain, replace the local part with '*'
--    e.g. "zhangsan@example.com" -> "*@example.com"
--    ('*@' is needed because split(...)[1] does not contain the '@' itself)
concat('*@', split(data.email, '@')[1])

(2)正则表达式函数(数据验证)

sql

-- Phone validation: 11 digits with a valid Chinese mobile prefix.
-- `regexp` is the regular-expression match operator.
-- Hive unescapes string literals before compiling the regex, so the digit class
-- must be written '\\d'; a single-backslash '\d' would match the letter 'd'.
data.phone_num regexp '^(13[0-9]|14[01456879]|15[0-35-9]|16[2567]|17[0-8]|18[0-9]|19[0-35-9])\\d{8}$'

-- Email validation: the whole value must match (anchored with ^ and $);
-- '\\.' is a literal dot for the same escaping reason as above.
data.email regexp '^[a-zA-Z0-9_-]+@[a-zA-Z0-9_-]+(\\.[a-zA-Z0-9_-]+)+$'

(3)窗口函数(处理排名)

sql

-- Numbers the rows of each user (partition by id), newest start_date first:
-- the row with the latest start_date gets rn = 1, the next one rn = 2, and so on.
row_number() over (
    partition by id
    order by start_date desc
) as rn

作用:给每个用户的数据编号,最新的是1,次新的是2,依此类推。

  3. 首日装载代码讲解(第一次导入数据)

sql

-- Write into the '9999-12-31' partition of dim_user_zip.
insert overwrite table dim_user_zip partition (dt = '9999-12-31')
select
    data.id, -- user id

    -- Name masking: keep the first character, replace the rest with '*'
    concat(substr(data.name, 1, 1), '*') name,

    -- Phone: validate first; mask when valid, NULL otherwise.
    -- '\\d' (double backslash) is required: Hive unescapes the string literal,
    -- and a single-backslash '\d' would match the letter 'd'.
    if(data.phone_num regexp '^(13[0-9]|14[01456879]|15[0-35-9]|16[2567]|17[0-8]|18[0-9]|19[0-35-9])\\d{8}$',
       concat(substr(data.phone_num, 1, 3), '*'),
       null) phone_num,

    -- Email: validate the whole value first; mask when valid, NULL otherwise.
    -- The mask keeps '@' and the domain: zhangsan@example.com -> *@example.com
    if(data.email regexp '^[a-zA-Z0-9_-]+@[a-zA-Z0-9_-]+(\\.[a-zA-Z0-9_-]+)+$',
       concat('*@', split(data.email, '@')[1]),
       null) email,

    data.user_level,   -- user level
    data.birthday,     -- birthday
    data.gender,       -- gender
    data.create_time,  -- creation time
    data.operate_time, -- operation time

    '2022-06-08' start_date, -- valid from the first load day
    '9999-12-31' end_date    -- far-future date marks the row as currently valid

-- Read from the ODS incremental user table
from ods_user_info_inc
where dt = '2022-06-08'          -- partition of the first load day
  and type = 'bootstrap-insert'; -- only the bootstrap (full initial) rows

  4. 每日装载代码讲解(每天更新数据)

这是最关键的部分,我们用一个具体例子来理解:

场景:2022年6月9日,用户"user_001"的手机号从"13812345678"改为"13987654321"

sql

-- Non-strict mode allows every partition value to come from the query (dynamic partitioning).
set hive.exec.dynamic.partition.mode=nonstrict;

-- Rewrite the affected partitions of dim_user_zip.
insert overwrite table dim_user_zip partition (dt)
select
    id, name, phone_num, email, user_level, birthday, gender,
    create_time, operate_time, start_date,

    -- end_date:
    --   rn = 2 (superseded version) -> closed on the previous day (2022-06-08)
    --   rn = 1 (latest version)     -> keeps its end_date
    if(rn = 2, date_sub('2022-06-09', 1), end_date) end_date,

    -- dt (target partition):
    --   rn = 1 (latest version)     -> '9999-12-31'
    --   rn = 2 (superseded version) -> previous day's partition ('2022-06-08')
    if(rn = 1, '9999-12-31', date_sub('2022-06-09', 1)) dt
from (
    -- Step 2: number each user's versions, newest start_date first
    -- (latest version = 1, superseded version = 2).
    select
        id, name, phone_num, email, user_level, birthday, gender,
        create_time, operate_time, start_date, end_date,
        row_number() over (partition by id order by start_date desc) rn
    from (
        -- Step 1: merge current and new versions.
        -- 1.1 Versions currently valid in the zipper table.
        select
            id, name, phone_num, email, user_level, birthday, gender,
            create_time, operate_time, start_date, end_date
        from dim_user_zip
        where dt = '9999-12-31'

        -- UNION (as in the executed statement) also drops exact duplicate rows.
        union

        -- 1.2 New versions built from the day's change records.
        select
            id,
            concat(substr(name, 1, 1), '*') name, -- masked
            -- '\\d' / '\\.' need the double backslash: Hive unescapes string literals first.
            if(phone_num regexp '^(13[0-9]|14[01456879]|15[0-35-9]|16[2567]|17[0-8]|18[0-9]|19[0-35-9])\\d{8}$',
               concat(substr(phone_num, 1, 3), '*'),
               null) phone_num, -- masked
            if(email regexp '^[a-zA-Z0-9_-]+@[a-zA-Z0-9_-]+(\\.[a-zA-Z0-9_-]+)+$',
               concat('*@', split(email, '@')[1]),
               null) email, -- masked
            user_level, birthday, gender, create_time, operate_time,
            '2022-06-09' start_date, -- new versions start on the load day
            '9999-12-31' end_date    -- and are open-ended
        from (
            -- The user fields live in the `data` struct of the ODS table.
            -- A user changed several times in one day has several records;
            -- only the one with the largest ts is kept, so each user contributes
            -- at most one new version.
            select
                data.id,
                data.name,
                data.phone_num,
                data.email,
                data.user_level,
                data.birthday,
                data.gender,
                data.create_time,
                data.operate_time,
                row_number() over (partition by data.id order by ts desc) rn
            from ods_user_info_inc
            where dt = '2022-06-09'              -- the day's records
              and type in ('insert', 'update')   -- inserted or updated users
        ) t1
        where rn = 1
    ) t2
) t3;

四、为什么不用JOIN?

这是一个很好的问题!很多人会想:为什么不直接用JOIN把新旧数据连起来呢?

JOIN方式的缺点:

数据膨胀:如果一个用户有10次变更,JOIN会产生10条关联记录

查询复杂:要查最新数据需要复杂的子查询

性能差:每天全表JOIN,数据量大时非常慢

拉链表的优势:

查询简单:

sql

-- Latest state of every user: the currently valid versions are all in '9999-12-31'.
select * from dim_user_zip where dt = '9999-12-31';

-- State of every user as of a given day (here 2022-06-08): pick the versions
-- whose validity period covers that day. Filtering on dt = '2022-06-08' alone
-- would only return the versions that expired on that day.
select *
from dim_user_zip
where start_date <= '2022-06-08'
  and end_date >= '2022-06-08';

历史追踪容易:

sql

-- Every stored version of user "user_001", oldest start_date first
select *
from dim_user_zip
where id = 'user_001'
order by start_date asc;

存储高效:每个用户每天只有一条有效记录

五、这样设计的好处总结

保存完整历史:用户信息的每次变化都被记录下来

查询性能高:按分区查询,速度快

数据管理方便:新数据和老数据分开存放

节省存储空间:相比每天全量快照,拉链表更节省空间

支持时间旅行:可以查询任意时间点的用户状态

六、数据装载脚本

1.首日全量脚本

sql 复制代码
cd /home/atguigu/bin
vim ods_to_dim_init.sh 

添加下面的内容:

sql 复制代码
#!/bin/bash
# First-day (initial) load of the DIM layer tables.
# Usage: ods_to_dim_init.sh <table_name|all> <date>
#   $1  name of the table to load, or "all"
#   $2  load date, required; substituted into the SQL as $do_date

APP=gmall

# The load date is mandatory: report the problem on stderr and exit non-zero
# so that a calling scheduler can detect the failure.
if [ -n "$2" ]; then
    do_date=$2
else
    echo "请传入日期参数" >&2
    exit 1
fi

# SQL for the first-day load of dim_user_zip: masks name/phone/email of the
# bootstrap rows and writes them to the '9999-12-31' partition.
# Backslashes are unescaped twice before the regex is compiled: bash turns
# "\\\\" inside double quotes into "\\", and Hive turns "\\" inside a string
# literal into "\". Hence "\\\\d" / "\\\\." here end up as \d / \. in the regex.
dim_user_zip="
insert overwrite table ${APP}.dim_user_zip partition (dt = '9999-12-31')
select data.id,
       concat(substr(data.name, 1, 1), '*')                name,
       if(data.phone_num regexp '^(13[0-9]|14[01456879]|15[0-35-9]|16[2567]|17[0-8]|18[0-9]|19[0-35-9])\\\\d{8}$',
          concat(substr(data.phone_num, 1, 3), '*'), null) phone_num,
       if(data.email regexp '^[a-zA-Z0-9_-]+@[a-zA-Z0-9_-]+(\\\\.[a-zA-Z0-9_-]+)+$',
          concat('*@', split(data.email, '@')[1]), null)   email,
       data.user_level,
       data.birthday,
       data.gender,
       data.create_time,
       data.operate_time,
       '$do_date'                                        start_date,
       '9999-12-31'                                        end_date
from ${APP}.ods_user_info_inc
where dt = '$do_date'
  and type = 'bootstrap-insert';
"

# SQL for dim_sku_full: takes the $do_date partition of the SKU table and
# left-joins SPU, category level 3/2/1, trademark, and the per-SKU collected
# attribute / sale-attribute structs, then overwrites partition dt='$do_date'.
dim_sku_full="
with
sku as
(
    select
        id,
        price,
        sku_name,
        sku_desc,
        weight,
        is_sale,
        spu_id,
        category3_id,
        tm_id,
        create_time
    from ${APP}.ods_sku_info_full
    where dt='$do_date'
),
spu as
(
    select
        id,
        spu_name
    from ${APP}.ods_spu_info_full
    where dt='$do_date'
),
c3 as
(
    select
        id,
        name,
        category2_id
    from ${APP}.ods_base_category3_full
    where dt='$do_date'
),
c2 as
(
    select
        id,
        name,
        category1_id
    from ${APP}.ods_base_category2_full
    where dt='$do_date'
),
c1 as
(
    select
        id,
        name
    from ${APP}.ods_base_category1_full
    where dt='$do_date'
),
tm as
(
    select
        id,
        tm_name
    from ${APP}.ods_base_trademark_full
    where dt='$do_date'
),
attr as
(
    select
        sku_id,
        collect_set(named_struct('attr_id',attr_id,'value_id',value_id,'attr_name',attr_name,'value_name',value_name)) attrs
    from ${APP}.ods_sku_attr_value_full
    where dt='$do_date'
    group by sku_id
),
sale_attr as
(
    select
        sku_id,
        collect_set(named_struct('sale_attr_id',sale_attr_id,'sale_attr_value_id',sale_attr_value_id,'sale_attr_name',sale_attr_name,'sale_attr_value_name',sale_attr_value_name)) sale_attrs
    from ${APP}.ods_sku_sale_attr_value_full
    where dt='$do_date'
    group by sku_id
)
insert overwrite table ${APP}.dim_sku_full partition(dt='$do_date')
select
    sku.id,
    sku.price,
    sku.sku_name,
    sku.sku_desc,
    sku.weight,
    sku.is_sale,
    sku.spu_id,
    spu.spu_name,
    sku.category3_id,
    c3.name,
    c3.category2_id,
    c2.name,
    c2.category1_id,
    c1.name,
    sku.tm_id,
    tm.tm_name,
    attr.attrs,
    sale_attr.sale_attrs,
    sku.create_time
from sku
left join spu on sku.spu_id=spu.id
left join c3 on sku.category3_id=c3.id
left join c2 on c3.category2_id=c2.id
left join c1 on c2.category1_id=c1.id
left join tm on sku.tm_id=tm.id
left join attr on sku.id=attr.sku_id
left join sale_attr on sku.id=sale_attr.sku_id;
"

# SQL for dim_province_full: the $do_date province rows left-joined to the
# $do_date region rows (for region_name); overwrites partition dt='$do_date'.
dim_province_full="
insert overwrite table ${APP}.dim_province_full partition(dt='$do_date')
select
    province.id,
    province.name,
    province.area_code,
    province.iso_code,
    province.iso_3166_2,
    region_id,
    region_name
from
(
    select
        id,
        name,
        region_id,
        area_code,
        iso_code,
        iso_3166_2
    from ${APP}.ods_base_province_full
    where dt='$do_date'
)province
left join
(
    select
        id,
        region_name
    from ${APP}.ods_base_region_full
    where dt='$do_date'
)region
on province.region_id=region.id;
"

# SQL for dim_coupon_full: the $do_date coupon rows left-joined twice to the
# dictionary table (parent_code '32' matched on coupon_type, parent_code '33'
# matched on range_type) to pick up dic_name; benefit_rule is a text assembled
# per coupon_type. Overwrites partition dt='$do_date'.
dim_coupon_full="
insert overwrite table ${APP}.dim_coupon_full partition(dt='$do_date')
select
    id,
    coupon_name,
    coupon_type,
    coupon_dic.dic_name,
    condition_amount,
    condition_num,
    activity_id,
    benefit_amount,
    benefit_discount,
    case coupon_type
        when '3201' then concat('满',condition_amount,'元减',benefit_amount,'元')
        when '3202' then concat('满',condition_num,'件打', benefit_discount,' 折')
        when '3203' then concat('减',benefit_amount,'元')
    end benefit_rule,
    create_time,
    range_type,
    range_dic.dic_name,
    limit_num,
    taken_count,
    start_time,
    end_time,
    operate_time,
    expire_time
from
(
    select
        id,
        coupon_name,
        coupon_type,
        condition_amount,
        condition_num,
        activity_id,
        benefit_amount,
        benefit_discount,
        create_time,
        range_type,
        limit_num,
        taken_count,
        start_time,
        end_time,
        operate_time,
        expire_time
    from ${APP}.ods_coupon_info_full
    where dt='$do_date'
)ci
left join
(
    select
        dic_code,
        dic_name
    from ${APP}.ods_base_dic_full
    where dt='$do_date'
    and parent_code='32'
)coupon_dic
on ci.coupon_type=coupon_dic.dic_code
left join
(
    select
        dic_code,
        dic_name
    from ${APP}.ods_base_dic_full
    where dt='$do_date'
    and parent_code='33'
)range_dic
on ci.range_type=range_dic.dic_code;
"

# SQL for dim_activity_full: the $do_date activity rule rows left-joined to the
# activity info rows (on activity_id) and to the dictionary table
# (parent_code '31', matched on activity_type); benefit_rule is a text
# assembled per activity_type. Overwrites partition dt='$do_date'.
dim_activity_full="
insert overwrite table ${APP}.dim_activity_full partition(dt='$do_date')
select
    rule.id,
    info.id,
    activity_name,
    rule.activity_type,
    dic.dic_name,
    activity_desc,
    start_time,
    end_time,
    create_time,
    condition_amount,
    condition_num,
    benefit_amount,
    benefit_discount,
    case rule.activity_type
        when '3101' then concat('满',condition_amount,'元减',benefit_amount,'元')
        when '3102' then concat('满',condition_num,'件打', benefit_discount,' 折')
        when '3103' then concat('打', benefit_discount,'折')
    end benefit_rule,
    benefit_level
from
(
    select
        id,
        activity_id,
        activity_type,
        condition_amount,
        condition_num,
        benefit_amount,
        benefit_discount,
        benefit_level
    from ${APP}.ods_activity_rule_full
    where dt='$do_date'
)rule
left join
(
    select
        id,
        activity_name,
        activity_type,
        activity_desc,
        start_time,
        end_time,
        create_time
    from ${APP}.ods_activity_info_full
    where dt='$do_date'
)info
on rule.activity_id=info.id
left join
(
    select
        dic_code,
        dic_name
    from ${APP}.ods_base_dic_full
    where dt='$do_date'
    and parent_code='31'
)dic
on rule.activity_type=dic.dic_code;
"

# SQL for dim_promotion_pos_full: copies the listed columns of the $do_date
# partition of ods_promotion_pos_full into partition dt='$do_date'.
dim_promotion_pos_full="
insert overwrite table ${APP}.dim_promotion_pos_full partition(dt='$do_date')
select
    id,           
    pos_location,
    pos_type,
    promotion_type,
    create_time,
    operate_time   
from ${APP}.ods_promotion_pos_full 
where dt='$do_date';
"

# SQL for dim_promotion_refer_full: copies the listed columns of the $do_date
# partition of ods_promotion_refer_full into partition dt='$do_date'.
dim_promotion_refer_full="
insert overwrite table ${APP}.dim_promotion_refer_full partition(dt='$do_date')
select
    id, 
    refer_name,
    create_time,
    operate_time
from ${APP}.ods_promotion_refer_full 
where dt='$do_date';
"


# Dispatch on the table name given as $1. "all" passes every statement to a
# single hive invocation. An unknown name is reported on stderr and the script
# exits non-zero instead of silently doing nothing.
case $1 in
"dim_user_zip")
    hive -e "$dim_user_zip"
;;
"dim_sku_full")
    hive -e "$dim_sku_full"
;;
"dim_province_full")
    hive -e "$dim_province_full"
;;
"dim_coupon_full")
    hive -e "$dim_coupon_full"
;;
"dim_activity_full")
    hive -e "$dim_activity_full"
;;
"dim_promotion_pos_full")
    hive -e "$dim_promotion_pos_full"
;;
"dim_promotion_refer_full")
    hive -e "$dim_promotion_refer_full"
;;

"all")
    hive -e "$dim_user_zip$dim_sku_full$dim_province_full$dim_coupon_full$dim_activity_full$dim_promotion_refer_full$dim_promotion_pos_full"
;;
*)
    echo "未知的表名: '$1'" >&2
    exit 1
;;
esac

添加权限

sql 复制代码
chmod +x ods_to_dim_init.sh

执行脚本的命令

sql 复制代码
ods_to_dim_init.sh all 2022-06-08

2.每日增量脚本

sql 复制代码
cd /home/atguigu/bin
vim ods_to_dim.sh

添加下面的内容:

sql 复制代码
#!/bin/bash
# Daily load of the DIM layer tables.

APP=gmall

# Load date: the second argument when it is non-empty, otherwise yesterday
# formatted as YYYY-MM-DD.
do_date=${2:-$(date -d "-1 day" +%F)}

# SQL for the daily load of dim_user_zip: merges the rows currently in the
# '9999-12-31' partition with the last change record per user of $do_date,
# closes superseded versions on the previous day and rewrites the partitions.
# Backslashes are unescaped twice before the regex is compiled: bash turns
# "\\\\" inside double quotes into "\\", and Hive turns "\\" inside a string
# literal into "\". Hence "\\\\d" / "\\\\." here end up as \d / \. in the regex.
dim_user_zip="
set hive.exec.dynamic.partition.mode=nonstrict;
insert overwrite table ${APP}.dim_user_zip partition (dt)
select id,
       name,
       phone_num,
       email,
       user_level,
       birthday,
       gender,
       create_time,
       operate_time,
       start_date,
       if(rn = 2, date_sub('$do_date', 1), end_date)     end_date,
       if(rn = 1, '9999-12-31', date_sub('$do_date', 1)) dt
from (
         select id,
                name,
                phone_num,
                email,
                user_level,
                birthday,
                gender,
                create_time,
                operate_time,
                start_date,
                end_date,
                row_number() over (partition by id order by start_date desc) rn
         from (
                  select id,
                         name,
                         phone_num,
                         email,
                         user_level,
                         birthday,
                         gender,
                         create_time,
                         operate_time,
                         start_date,
                         end_date
                  from ${APP}.dim_user_zip
                  where dt = '9999-12-31'
                  union
                  select id,
                         concat(substr(name, 1, 1), '*')                name,
                         if(phone_num regexp
                            '^(13[0-9]|14[01456879]|15[0-35-9]|16[2567]|17[0-8]|18[0-9]|19[0-35-9])\\\\d{8}$',
                            concat(substr(phone_num, 1, 3), '*'), null) phone_num,
                         if(email regexp '^[a-zA-Z0-9_-]+@[a-zA-Z0-9_-]+(\\\\.[a-zA-Z0-9_-]+)+$',
                            concat('*@', split(email, '@')[1]), null)   email,
                         user_level,
                         birthday,
                         gender,
                         create_time,
                         operate_time,
                         '$do_date'                                   start_date,
                         '9999-12-31'                                   end_date
                  from (
                           select data.id,
                                  data.name,
                                  data.phone_num,
                                  data.email,
                                  data.user_level,
                                  data.birthday,
                                  data.gender,
                                  data.create_time,
                                  data.operate_time,
                                  row_number() over (partition by data.id order by ts desc) rn
                           from ${APP}.ods_user_info_inc
                           where dt = '$do_date'
                       ) t1
                  where rn = 1
              ) t2
     ) t3;
"

# SQL for dim_sku_full: takes the $do_date partition of the SKU table and
# left-joins SPU, category level 3/2/1, trademark, and the per-SKU collected
# attribute / sale-attribute structs, then overwrites partition dt='$do_date'.
dim_sku_full="
with
sku as
(
    select
        id,
        price,
        sku_name,
        sku_desc,
        weight,
        is_sale,
        spu_id,
        category3_id,
        tm_id,
        create_time
    from ${APP}.ods_sku_info_full
    where dt='$do_date'
),
spu as
(
    select
        id,
        spu_name
    from ${APP}.ods_spu_info_full
    where dt='$do_date'
),
c3 as
(
    select
        id,
        name,
        category2_id
    from ${APP}.ods_base_category3_full
    where dt='$do_date'
),
c2 as
(
    select
        id,
        name,
        category1_id
    from ${APP}.ods_base_category2_full
    where dt='$do_date'
),
c1 as
(
    select
        id,
        name
    from ${APP}.ods_base_category1_full
    where dt='$do_date'
),
tm as
(
    select
        id,
        tm_name
    from ${APP}.ods_base_trademark_full
    where dt='$do_date'
),
attr as
(
    select
        sku_id,
        collect_set(named_struct('attr_id',attr_id,'value_id',value_id,'attr_name',attr_name,'value_name',value_name)) attrs
    from ${APP}.ods_sku_attr_value_full
    where dt='$do_date'
    group by sku_id
),
sale_attr as
(
    select
        sku_id,
        collect_set(named_struct('sale_attr_id',sale_attr_id,'sale_attr_value_id',sale_attr_value_id,'sale_attr_name',sale_attr_name,'sale_attr_value_name',sale_attr_value_name)) sale_attrs
    from ${APP}.ods_sku_sale_attr_value_full
    where dt='$do_date'
    group by sku_id
)
insert overwrite table ${APP}.dim_sku_full partition(dt='$do_date')
select
    sku.id,
    sku.price,
    sku.sku_name,
    sku.sku_desc,
    sku.weight,
    sku.is_sale,
    sku.spu_id,
    spu.spu_name,
    sku.category3_id,
    c3.name,
    c3.category2_id,
    c2.name,
    c2.category1_id,
    c1.name,
    sku.tm_id,
    tm.tm_name,
    attr.attrs,
    sale_attr.sale_attrs,
    sku.create_time
from sku
left join spu on sku.spu_id=spu.id
left join c3 on sku.category3_id=c3.id
left join c2 on c3.category2_id=c2.id
left join c1 on c2.category1_id=c1.id
left join tm on sku.tm_id=tm.id
left join attr on sku.id=attr.sku_id
left join sale_attr on sku.id=sale_attr.sku_id;
"

# SQL for dim_province_full: the $do_date province rows left-joined to the
# $do_date region rows (for region_name); overwrites partition dt='$do_date'.
dim_province_full="
insert overwrite table ${APP}.dim_province_full partition(dt='$do_date')
select
    province.id,
    province.name,
    province.area_code,
    province.iso_code,
    province.iso_3166_2,
    region_id,
    region_name
from
(
    select
        id,
        name,
        region_id,
        area_code,
        iso_code,
        iso_3166_2
    from ${APP}.ods_base_province_full
    where dt='$do_date'
)province
left join
(
    select
        id,
        region_name
    from ${APP}.ods_base_region_full
    where dt='$do_date'
)region
on province.region_id=region.id;
"

# SQL for dim_coupon_full: the $do_date coupon rows left-joined twice to the
# dictionary table (parent_code '32' matched on coupon_type, parent_code '33'
# matched on range_type) to pick up dic_name; benefit_rule is a text assembled
# per coupon_type. Overwrites partition dt='$do_date'.
dim_coupon_full="
insert overwrite table ${APP}.dim_coupon_full partition(dt='$do_date')
select
    id,
    coupon_name,
    coupon_type,
    coupon_dic.dic_name,
    condition_amount,
    condition_num,
    activity_id,
    benefit_amount,
    benefit_discount,
    case coupon_type
        when '3201' then concat('满',condition_amount,'元减',benefit_amount,'元')
        when '3202' then concat('满',condition_num,'件打', benefit_discount,' 折')
        when '3203' then concat('减',benefit_amount,'元')
    end benefit_rule,
    create_time,
    range_type,
    range_dic.dic_name,
    limit_num,
    taken_count,
    start_time,
    end_time,
    operate_time,
    expire_time
from
(
    select
        id,
        coupon_name,
        coupon_type,
        condition_amount,
        condition_num,
        activity_id,
        benefit_amount,
        benefit_discount,
        create_time,
        range_type,
        limit_num,
        taken_count,
        start_time,
        end_time,
        operate_time,
        expire_time
    from ${APP}.ods_coupon_info_full
    where dt='$do_date'
)ci
left join
(
    select
        dic_code,
        dic_name
    from ${APP}.ods_base_dic_full
    where dt='$do_date'
    and parent_code='32'
)coupon_dic
on ci.coupon_type=coupon_dic.dic_code
left join
(
    select
        dic_code,
        dic_name
    from ${APP}.ods_base_dic_full
    where dt='$do_date'
    and parent_code='33'
)range_dic
on ci.range_type=range_dic.dic_code;
"

# SQL for dim_activity_full: the $do_date activity rule rows left-joined to the
# activity info rows (on activity_id) and to the dictionary table
# (parent_code '31', matched on activity_type); benefit_rule is a text
# assembled per activity_type. Overwrites partition dt='$do_date'.
dim_activity_full="
insert overwrite table ${APP}.dim_activity_full partition(dt='$do_date')
select
    rule.id,
    info.id,
    activity_name,
    rule.activity_type,
    dic.dic_name,
    activity_desc,
    start_time,
    end_time,
    create_time,
    condition_amount,
    condition_num,
    benefit_amount,
    benefit_discount,
    case rule.activity_type
        when '3101' then concat('满',condition_amount,'元减',benefit_amount,'元')
        when '3102' then concat('满',condition_num,'件打', benefit_discount,' 折')
        when '3103' then concat('打', benefit_discount,'折')
    end benefit_rule,
    benefit_level
from
(
    select
        id,
        activity_id,
        activity_type,
        condition_amount,
        condition_num,
        benefit_amount,
        benefit_discount,
        benefit_level
    from ${APP}.ods_activity_rule_full
    where dt='$do_date'
)rule
left join
(
    select
        id,
        activity_name,
        activity_type,
        activity_desc,
        start_time,
        end_time,
        create_time
    from ${APP}.ods_activity_info_full
    where dt='$do_date'
)info
on rule.activity_id=info.id
left join
(
    select
        dic_code,
        dic_name
    from ${APP}.ods_base_dic_full
    where dt='$do_date'
    and parent_code='31'
)dic
on rule.activity_type=dic.dic_code;
"


# SQL for dim_promotion_pos_full: copies the listed columns of the $do_date
# partition of ods_promotion_pos_full into partition dt='$do_date'.
dim_promotion_pos_full="
insert overwrite table ${APP}.dim_promotion_pos_full partition(dt='$do_date')
select
    id,            
    pos_location,
    pos_type,
    promotion_type,
    create_time,
    operate_time
from ${APP}.ods_promotion_pos_full 
where dt='$do_date';
"

# SQL for dim_promotion_refer_full: copies the listed columns of the $do_date
# partition of ods_promotion_refer_full into partition dt='$do_date'.
dim_promotion_refer_full="
insert overwrite table ${APP}.dim_promotion_refer_full partition(dt='$do_date')
select
    id, 
    refer_name,
    create_time,
    operate_time
from ${APP}.ods_promotion_refer_full 
where dt='$do_date';
"


# Dispatch on the table name given as $1. "all" passes every statement to a
# single hive invocation. An unknown name is reported on stderr and the script
# exits non-zero instead of silently doing nothing.
case $1 in
"dim_user_zip")
    hive -e "$dim_user_zip"
;;
"dim_sku_full")
    hive -e "$dim_sku_full"
;;
"dim_province_full")
    hive -e "$dim_province_full"
;;
"dim_coupon_full")
    hive -e "$dim_coupon_full"
;;
"dim_activity_full")
    hive -e "$dim_activity_full"
;;
"dim_promotion_pos_full")
    hive -e "$dim_promotion_pos_full"
;;
"dim_promotion_refer_full")
    hive -e "$dim_promotion_refer_full"
;;

"all")
    hive -e "$dim_user_zip$dim_sku_full$dim_province_full$dim_coupon_full$dim_activity_full$dim_promotion_refer_full$dim_promotion_pos_full"
;;
*)
    echo "未知的表名: '$1'" >&2
    exit 1
;;
esac

添加权限

sql 复制代码
chmod +x ods_to_dim.sh

脚本执行的命令(以6月9号为例)

sql 复制代码
ods_to_dim.sh all 2022-06-09
相关推荐
lxl13073 小时前
学习C++(5)运算符重载+赋值运算符重载
学习
ruxshui3 小时前
个人笔记: 星环Inceptor/hive普通分区表与范围分区表核心技术总结
hive·hadoop·笔记
慾玄3 小时前
渗透笔记总结
笔记
AutumnorLiuu4 小时前
C++并发编程学习(一)——线程基础
开发语言·c++·学习
CS创新实验室4 小时前
关于 Moltbot 的学习总结笔记
笔记·学习·clawdbot·molbot
峥嵘life4 小时前
Android EDLA CTS、GTS等各项测试命令汇总
android·学习·elasticsearch
千谦阙听4 小时前
数据结构入门:栈与队列
数据结构·学习·visual studio
.小墨迹4 小时前
C++学习——C++中`memcpy`和**赋值拷贝**的核心区别
java·linux·开发语言·c++·学习·算法·机器学习
望忆4 小时前
关于《Cold & Warm Net: Addressing Cold-Start Usersin Recommender Systems》
学习
笔画人生4 小时前
教培机构用蓝耘MaaS做“个性化学习计划”:从学情分析到方案生成的完整链路
python·学习