26-学习笔记尚硅谷数仓搭建-DIM层特殊的维度表——用户维度表的建表、分析及DIM层数据装载脚本

一、完整的建表及数据装载代码（含注释）

sql 复制代码

-- ---------- 1. Create the user dimension zipper table ----------
-- A zipper table keeps one row per version of a user; start_date / end_date give
-- the inclusive validity range of that version.
-- Partition convention used by the load statements in this file:
--   dt = '9999-12-31' -> versions that are currently valid
--   dt = <a date>     -> versions whose end_date is that date (expired history)
-- NOTE: the table is EXTERNAL, so DROP TABLE removes only the metastore entry and
-- leaves the files under LOCATION in place.
DROP TABLE IF EXISTS dim_user_zip;
CREATE EXTERNAL TABLE dim_user_zip
(
    `id`           STRING COMMENT '用户ID',
    `name`         STRING COMMENT '用户姓名',
    `phone_num`    STRING COMMENT '手机号码',
    `email`        STRING COMMENT '邮箱',
    `user_level`   STRING COMMENT '用户等级',
    `birthday`     STRING COMMENT '生日',
    `gender`       STRING COMMENT '性别',
    `create_time`  STRING COMMENT '创建时间',
    `operate_time` STRING COMMENT '操作时间',
    `start_date`   STRING COMMENT '开始日期',
    `end_date`     STRING COMMENT '结束日期'
) COMMENT '用户维度表'
    PARTITIONED BY (`dt` STRING)
    STORED AS ORC
    LOCATION '/warehouse/gmall/dim/dim_user_zip/'
    TBLPROPERTIES ('orc.compress' = 'snappy');

-- ---------- 2. First-day load (one-time initialization) ----------
-- Loads every user from the bootstrap snapshot into the zipper table. All rows are
-- current on the first day, so they are written to partition dt = '9999-12-31'
-- with end_date = '9999-12-31'.
insert overwrite table dim_user_zip partition (dt = '9999-12-31')
select
    data.id,  -- user id, copied as-is

    -- Name masking: keep only the first character, e.g. 张三 -> 张*
    concat(substr(data.name, 1, 1), '*') name,

    -- Phone number: validate, then mask.
    --   valid   -> first 3 digits followed by '*' (13812345678 -> 138*), the same
    --              format the daily load produces
    --   invalid -> NULL
    -- The backslash is doubled because Hive un-escapes string literals before the
    -- regex engine sees them: a literal '\d' would arrive as the plain letter d and
    -- no phone number would ever match.
    if(data.phone_num regexp '^(13[0-9]|14[01456879]|15[0-35-9]|16[2567]|17[0-8]|18[0-9]|19[0-35-9])\\d{8}$',
        concat(substr(data.phone_num, 1, 3), '*'),
        null) phone_num,

    -- Email: validate, then mask.
    --   valid   -> local part replaced by '*', '@' and domain kept
    --              (zhangsan@example.com -> *@example.com)
    --   invalid -> NULL
    -- The pattern is anchored at both ends so the whole value must be an address;
    -- split() drops the '@', so it is added back explicitly.
    if(data.email regexp '^[a-zA-Z0-9_-]+@[a-zA-Z0-9_-]+(\\.[a-zA-Z0-9_-]+)+$',
        concat('*@', split(data.email, '@')[1]),
        null) email,

    data.user_level,      -- copied as-is
    data.birthday,        -- copied as-is
    data.gender,          -- copied as-is
    data.create_time,     -- copied as-is
    data.operate_time,    -- copied as-is

    -- Zipper-table validity range:
    '2022-06-08' start_date,  -- first day of the warehouse, so valid from this day
    '9999-12-31' end_date     -- far-future sentinel meaning "still valid"

-- Source: ODS incremental user table
from ods_user_info_inc
where dt = '2022-06-08'                -- partition of the first day
  and type = 'bootstrap-insert';       -- only rows produced by the full bootstrap

-- ---------- 3. Daily load (incremental zipper update) ----------
-- Runs once per load date (2022-06-09 in this example) and folds that day's user
-- changes into the zipper table.

-- The target partition is computed per row, so non-strict dynamic partitioning
-- is required.
set hive.exec.dynamic.partition.mode=nonstrict;

with latest_change as (
    -- Every change event of the load date, ranked newest-first per user by ts.
    select data.id,
           data.name,
           data.phone_num,
           data.email,
           data.user_level,
           data.birthday,
           data.gender,
           data.create_time,
           data.operate_time,
           row_number() over (partition by data.id order by ts desc) as change_rank
    from ods_user_info_inc
    where dt = '2022-06-09'
),
candidate_versions as (
    -- Versions that are valid so far ...
    select id,
           name,
           phone_num,
           email,
           user_level,
           birthday,
           gender,
           create_time,
           operate_time,
           start_date,
           end_date
    from dim_user_zip
    where dt = '9999-12-31'
    -- ... merged with the masked final state of each user changed on the load date.
    -- UNION (not UNION ALL) drops exact duplicate rows; presumably kept so that a
    -- re-run of the same day does not produce the new version twice.
    union
    select id,
           concat(substr(name, 1, 1), '*')              as name,
           if(phone_num regexp
              '^(13[0-9]|14[01456879]|15[0-35-9]|16[2567]|17[0-8]|18[0-9]|19[0-35-9])\\d{8}$',
              concat(substr(phone_num, 1, 3), '*'),
              null)                                     as phone_num,
           if(email regexp '^[a-zA-Z0-9_-]+@[a-zA-Z0-9_-]+(\\.[a-zA-Z0-9_-]+)+$',
              concat('*@', split(email, '@')[1]),
              null)                                     as email,
           user_level,
           birthday,
           gender,
           create_time,
           operate_time,
           '2022-06-09'                                 as start_date,
           '9999-12-31'                                 as end_date
    from latest_change
    where change_rank = 1
),
ranked_versions as (
    -- Per user: rn = 1 is the newest version, rn = 2 (if present) is the version
    -- that was current until the load date.
    select id,
           name,
           phone_num,
           email,
           user_level,
           birthday,
           gender,
           create_time,
           operate_time,
           start_date,
           end_date,
           row_number() over (partition by id order by start_date desc) as rn
    from candidate_versions
)
insert overwrite table dim_user_zip partition (dt)
select id,
       name,
       phone_num,
       email,
       user_level,
       birthday,
       gender,
       create_time,
       operate_time,
       start_date,
       -- superseded version (rn = 2): close it on the day before the load date;
       -- newest version (rn = 1): keep its end_date ('9999-12-31')
       if(rn = 2, date_sub('2022-06-09', 1), end_date)     as end_date,
       -- newest version goes to the '9999-12-31' partition, the superseded one to
       -- the partition of the day before the load date ('2022-06-08')
       if(rn = 1, '9999-12-31', date_sub('2022-06-09', 1)) as dt
from ranked_versions;

二、拉链表设计思路

什么是拉链表？

拉链表就像一条可以拉伸的链子，记录了每条数据从开始到结束的有效时间范围。它有两个关键字段：

start_date：这条记录开始生效的日期

end_date：这条记录结束有效的日期

分区设计（为什么有两个分区？）

sql

-- 分区字段：dt

-- 1. '9999-12-31'分区：保存当前最新的、正在使用的数据

-- 2. 普通日期分区（如'2022-06-08'）：保存已经过期的历史数据

为什么要这样设计？

查询最新数据快：要查用户最新信息，直接查dt='9999-12-31'分区

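查询历史数据方便：普通日期分区（如dt='2022-06-08'）里只存放在当天失效的旧记录；要还原某一天的完整用户状态，需要按 start_date <= 该日期 且 end_date >= 该日期 过滤（本表中 end_date 与分区字段 dt 取值相同，因此可以用 dt >= 该日期 做分区裁剪）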

数据管理清晰：新老数据分开存放，不混在一起

数据流向理解

首日（数据仓库第一次建表）：

把所有用户的最新数据都放进9999-12-31分区

这时候start_date是当天日期，end_date是'9999-12-31'（表示一直有效）

每日更新：

新增用户：直接插入到9999-12-31分区

修改用户：比如小明的等级从1变成2

把原来9999-12-31分区里小明的旧记录拿出来

修改它的end_date为昨天（表示到昨天为止有效）

把这条旧记录放入昨天的分区（如dt='2022-06-08'）

把小明的新记录（等级2）放入9999-12-31分区

三、代码详解（结合例子讲解）

创建表语句解读

sql

-- Annotated copy of the table definition.
-- Column names are quoted with backticks (single quotes would make them string
-- literals and the statement would not parse), and the statement ends only after
-- TBLPROPERTIES.
CREATE EXTERNAL TABLE dim_user_zip
(
    `id`           STRING COMMENT '用户ID',     -- unique user identifier, e.g. "user_001"
    `name`         STRING COMMENT '用户姓名',   -- user name (written masked by the loads)
    `phone_num`    STRING COMMENT '手机号码',   -- phone number (written masked by the loads)
    `email`        STRING COMMENT '邮箱',       -- email address (written masked by the loads)
    `user_level`   STRING COMMENT '用户等级',   -- e.g. "VIP1", "普通会员"
    `birthday`     STRING COMMENT '生日',       -- date of birth
    `gender`       STRING COMMENT '性别',       -- gender
    `create_time`  STRING COMMENT '创建时间',   -- registration time
    `operate_time` STRING COMMENT '操作时间',   -- time of the last modification
    `start_date`   STRING COMMENT '开始日期',   -- first day this version is valid
    `end_date`     STRING COMMENT '结束日期'    -- last day this version is valid
) COMMENT '用户维度表'
    -- partitioned by dt, like sorting files into separate folders
    PARTITIONED BY (`dt` STRING)
    -- stored as ORC, a columnar storage format
    STORED AS ORC
    -- where the data files live
    LOCATION '/warehouse/gmall/dim/dim_user_zip/'
    -- compress ORC data with snappy to save space
    TBLPROPERTIES ('orc.compress' = 'snappy');

重要函数讲解

（1）数据脱敏函数（保护用户隐私）

sql

-- 1. Name masking: keep the first character, replace the rest with '*'
--    e.g. "张三" -> "张*"
concat(substr(data.name, 1, 1), '*')

-- 2. Phone masking: keep the first 3 digits, replace the rest with '*'
--    e.g. "13812345678" -> "138*"  (same format as the load scripts)
concat(substr(data.phone_num, 1, 3), '*')

-- 3. Email masking: replace the local part with '*', keep '@' and the domain
--    e.g. "zhangsan@example.com" -> "*@example.com"
--    split() removes the '@', so it has to be added back in the prefix.
concat('*@', split(data.email, '@')[1])

（2）正则表达式函数（数据验证）

sql

-- Phone validation: 11 digits with a valid mainland-China mobile prefix.
-- regexp is the regular-expression match operator.
-- Hive un-escapes string literals first, so the regex \d must be written '\\d';
-- a single backslash would leave the plain letter d.
data.phone_num regexp '^(13[0-9]|14[01456879]|15[0-35-9]|16[2567]|17[0-8]|18[0-9]|19[0-35-9])\\d{8}$'

-- Email validation: anchored at both ends so the whole value must be an address;
-- '\\.' matches a literal dot.
data.email regexp '^[a-zA-Z0-9_-]+@[a-zA-Z0-9_-]+(\\.[a-zA-Z0-9_-]+)+$'

（3）窗口函数（处理排名）

sql

-- row_number() over: numbers the rows inside each group, starting at 1

-- partition by id: one group per user id

-- order by start_date desc: newest start_date first, so the latest version gets 1

row_number() over (partition by id order by start_date desc) rn

作用：给每个用户的数据编号，最新的是1，次新的是2，依此类推。

首日装载代码讲解（第一次导入数据）

sql

-- Write into the '9999-12-31' (current) partition of dim_user_zip
insert overwrite table dim_user_zip partition (dt = '9999-12-31')
select
    data.id,                                      -- user id

    -- name masking: keep the first character, e.g. 张三 -> 张*
    concat(substr(data.name, 1, 1), '*') name,

    -- phone: validate first; mask if valid (138*), otherwise NULL.
    -- '\\d' is required because Hive un-escapes the literal before the regex runs.
    if(data.phone_num regexp '^(13[0-9]|14[01456879]|15[0-35-9]|16[2567]|17[0-8]|18[0-9]|19[0-35-9])\\d{8}$',
        concat(substr(data.phone_num, 1, 3), '*'),
        null) phone_num,

    -- email: validate first; mask if valid (*@example.com), otherwise NULL
    if(data.email regexp '^[a-zA-Z0-9_-]+@[a-zA-Z0-9_-]+(\\.[a-zA-Z0-9_-]+)+$',
        concat('*@', split(data.email, '@')[1]),
        null) email,

    data.user_level,                              -- user level
    data.birthday,                                -- birthday
    data.gender,                                  -- gender
    data.create_time,                             -- creation time
    data.operate_time,                            -- last operation time

    '2022-06-08' start_date,                      -- valid from the first day
    '9999-12-31' end_date                         -- far-future sentinel: still valid

-- read from the ODS incremental user table
from ods_user_info_inc
where dt = '2022-06-08'                           -- partition of the first day
  and type = 'bootstrap-insert';                  -- only rows from the full bootstrap

每日装载代码讲解（每天更新数据）

这是最关键的部分，我们用一个具体例子来理解：

场景：2022年6月9日，用户"user_001"的手机号从"13812345678"改为"13987654321"

sql

-- Allow the partition value to be computed per row (dynamic partitioning)
set hive.exec.dynamic.partition.mode=nonstrict;

-- Rewrite the affected partitions of dim_user_zip
insert overwrite table dim_user_zip partition (dt)
select id, name, phone_num, email, user_level, birthday, gender,
       create_time, operate_time, start_date,
       -- Step 3a: end date
       --   rn = 2 (superseded version) -> the day before the load date (2022-06-08)
       --   rn = 1 (newest version)     -> keep its end_date ('9999-12-31')
       if(rn = 2, date_sub('2022-06-09', 1), end_date)     end_date,
       -- Step 3b: target partition
       --   rn = 1 -> '9999-12-31' (current) partition
       --   rn = 2 -> partition of the day before the load date ('2022-06-08')
       if(rn = 1, '9999-12-31', date_sub('2022-06-09', 1)) dt
from (
         -- Step 2: number the versions of each user, newest start_date first.
         -- Because step 1 keeps at most one new row per user, rn is 1 or 2 only.
         select id, name, phone_num, email, user_level, birthday, gender,
                create_time, operate_time, start_date, end_date,
                row_number() over (partition by id order by start_date desc) rn
         from (
                  -- Step 1.1: versions that are current so far (old data)
                  select id, name, phone_num, email, user_level, birthday, gender,
                         create_time, operate_time, start_date, end_date
                  from dim_user_zip
                  where dt = '9999-12-31'
                  -- UNION (not UNION ALL) removes exact duplicate rows
                  union
                  -- Step 1.2: the final state of every user changed on the load
                  -- date (new data), masked the same way as the first-day load
                  select id,
                         concat(substr(name, 1, 1), '*')                name,
                         if(phone_num regexp
                            '^(13[0-9]|14[01456879]|15[0-35-9]|16[2567]|17[0-8]|18[0-9]|19[0-35-9])\\d{8}$',
                            concat(substr(phone_num, 1, 3), '*'), null) phone_num,
                         if(email regexp '^[a-zA-Z0-9_-]+@[a-zA-Z0-9_-]+(\\.[a-zA-Z0-9_-]+)+$',
                            concat('*@', split(email, '@')[1]), null)   email,
                         user_level, birthday, gender, create_time, operate_time,
                         '2022-06-09'                                   start_date, -- valid from the load date
                         '9999-12-31'                                   end_date    -- still valid
                  from (
                           -- A user may change several times in one day; rank the
                           -- change events by ts so only the last one is kept.
                           select data.id,
                                  data.name,
                                  data.phone_num,
                                  data.email,
                                  data.user_level,
                                  data.birthday,
                                  data.gender,
                                  data.create_time,
                                  data.operate_time,
                                  row_number() over (partition by data.id order by ts desc) rn
                           from ods_user_info_inc
                           where dt = '2022-06-09'
                       ) t1
                  where rn = 1
              ) t2
     ) t3;

四、为什么不用JOIN？

这是一个很好的问题！很多人会想：为什么不直接用JOIN把新旧数据连起来呢？

JOIN方式的缺点：

数据膨胀：如果一个用户有10次变更，JOIN会产生10条关联记录

查询复杂：要查最新数据需要复杂的子查询

性能差：每天全表JOIN，数据量大时非常慢

拉链表的优势：

查询简单：

sql

-- Latest state of every user: read only the "current" partition
select * from dim_user_zip where dt = '9999-12-31';

-- State of every user as of a given day (here 2022-06-08): pick the version whose
-- validity range covers that day. Partition dt = '2022-06-08' alone holds only the
-- versions that EXPIRED on that day, so it cannot answer this question.
-- dt carries the same value as end_date in this table, so dt >= ... prunes partitions.
select *
from dim_user_zip
where dt >= '2022-06-08'
  and start_date <= '2022-06-08'
  and end_date >= '2022-06-08';

历史追踪容易：

sql

-- Full change history of user "user_001", oldest version first (scans all partitions)

select * from dim_user_zip where id = 'user_001' order by start_date;

存储高效：只有用户信息发生变化时才会新增一条记录，未变化的用户不会每天重复存储；任意一天每个用户只有一条有效记录

五、这样设计的好处总结

保存完整历史：用户信息的每次变化都被记录下来

查询性能高：按分区查询，速度快

数据管理方便：新数据和老数据分开存放

节省存储空间：相比每天全量快照，拉链表更节省空间

支持时间旅行：可以查询任意时间点的用户状态

六、数据装载脚本

1.首日全量脚本

sql 复制代码

cd /home/atguigu/bin
vim ods_to_dim_init.sh

添加下面的内容：

bash 复制代码

#!/bin/bash
# First-day (initial) ODS -> DIM load.
# Usage: ods_to_dim_init.sh <table_name|all> <date>
#   $1  name of the DIM table to load, or "all"
#   $2  load date, substituted into the SQL below as '$do_date' (mandatory)

# Hive database that every table name below is prefixed with.
APP=gmall

# The initial load has no sensible default date. Report the problem on stderr and
# exit with a non-zero status so a scheduler or calling script sees the failure
# (a bare `exit` would return the status of echo, i.e. 0).
if [ -n "$2" ]; then
    do_date=$2
else
    echo "请传入日期参数" >&2
    exit 1
fi

# SQL for the first-day load of the user zipper table: every 'bootstrap-insert' row
# of ods_user_info_inc for $do_date is masked and written to partition
# dt = '9999-12-31' with the validity range [$do_date, 9999-12-31].
#
# Escaping: this text is un-escaped twice before the regex engine sees it --
# first by bash inside double quotes (\\ -> \), then by the Hive string literal
# (\\ -> \). A regex \d therefore has to be written \\\\d here; with only two
# backslashes Hive receives '\d', reduces it to the letter d, and no phone number
# matches. \$ keeps the end-of-line anchor a literal dollar sign for the shell.
dim_user_zip="
insert overwrite table ${APP}.dim_user_zip partition (dt = '9999-12-31')
select data.id,
       concat(substr(data.name, 1, 1), '*')                name,
       if(data.phone_num regexp '^(13[0-9]|14[01456879]|15[0-35-9]|16[2567]|17[0-8]|18[0-9]|19[0-35-9])\\\\d{8}\$',
          concat(substr(data.phone_num, 1, 3), '*'), null) phone_num,
       if(data.email regexp '^[a-zA-Z0-9_-]+@[a-zA-Z0-9_-]+(\\\\.[a-zA-Z0-9_-]+)+\$',
          concat('*@', split(data.email, '@')[1]), null)   email,
       data.user_level,
       data.birthday,
       data.gender,
       data.create_time,
       data.operate_time,
       '$do_date'                                          start_date,
       '9999-12-31'                                        end_date
from ${APP}.ods_user_info_inc
where dt = '$do_date'
  and type = 'bootstrap-insert';
"

# SQL for the dim_sku_full load. Overwrites partition dt='$do_date' of
# ${APP}.dim_sku_full with one row per SKU from ods_sku_info_full, left-joined to:
#   spu        -> spu_name
#   c3/c2/c1   -> category names, walking level 3 -> 2 -> 1
#   tm         -> trademark name
#   attr       -> per-SKU set of platform attribute structs (attrs)
#   sale_attr  -> per-SKU set of sale attribute structs (sale_attrs)
# Every source is read from its dt='$do_date' partition. The INSERT has no column
# list, so the select list maps to the target columns by position.
# The text is only assembled here; the case statement at the end of the script
# runs it with `hive -e`.
dim_sku_full="
with
sku as
(
    select
        id,
        price,
        sku_name,
        sku_desc,
        weight,
        is_sale,
        spu_id,
        category3_id,
        tm_id,
        create_time
    from ${APP}.ods_sku_info_full
    where dt='$do_date'
),
spu as
(
    select
        id,
        spu_name
    from ${APP}.ods_spu_info_full
    where dt='$do_date'
),
c3 as
(
    select
        id,
        name,
        category2_id
    from ${APP}.ods_base_category3_full
    where dt='$do_date'
),
c2 as
(
    select
        id,
        name,
        category1_id
    from ${APP}.ods_base_category2_full
    where dt='$do_date'
),
c1 as
(
    select
        id,
        name
    from ${APP}.ods_base_category1_full
    where dt='$do_date'
),
tm as
(
    select
        id,
        tm_name
    from ${APP}.ods_base_trademark_full
    where dt='$do_date'
),
attr as
(
    select
        sku_id,
        collect_set(named_struct('attr_id',attr_id,'value_id',value_id,'attr_name',attr_name,'value_name',value_name)) attrs
    from ${APP}.ods_sku_attr_value_full
    where dt='$do_date'
    group by sku_id
),
sale_attr as
(
    select
        sku_id,
        collect_set(named_struct('sale_attr_id',sale_attr_id,'sale_attr_value_id',sale_attr_value_id,'sale_attr_name',sale_attr_name,'sale_attr_value_name',sale_attr_value_name)) sale_attrs
    from ${APP}.ods_sku_sale_attr_value_full
    where dt='$do_date'
    group by sku_id
)
insert overwrite table ${APP}.dim_sku_full partition(dt='$do_date')
select
    sku.id,
    sku.price,
    sku.sku_name,
    sku.sku_desc,
    sku.weight,
    sku.is_sale,
    sku.spu_id,
    spu.spu_name,
    sku.category3_id,
    c3.name,
    c3.category2_id,
    c2.name,
    c2.category1_id,
    c1.name,
    sku.tm_id,
    tm.tm_name,
    attr.attrs,
    sale_attr.sale_attrs,
    sku.create_time
from sku
left join spu on sku.spu_id=spu.id
left join c3 on sku.category3_id=c3.id
left join c2 on c3.category2_id=c2.id
left join c1 on c2.category1_id=c1.id
left join tm on sku.tm_id=tm.id
left join attr on sku.id=attr.sku_id
left join sale_attr on sku.id=sale_attr.sku_id;
"

# SQL for the dim_province_full load. Overwrites partition dt='$do_date' of
# ${APP}.dim_province_full with one row per province from ods_base_province_full,
# left-joined to ods_base_region_full on region_id to add region_name.
# Both sources are read from their dt='$do_date' partition; the select list maps
# to the target columns by position.
dim_province_full="
insert overwrite table ${APP}.dim_province_full partition(dt='$do_date')
select
    province.id,
    province.name,
    province.area_code,
    province.iso_code,
    province.iso_3166_2,
    region_id,
    region_name
from
(
    select
        id,
        name,
        region_id,
        area_code,
        iso_code,
        iso_3166_2
    from ${APP}.ods_base_province_full
    where dt='$do_date'
)province
left join
(
    select
        id,
        region_name
    from ${APP}.ods_base_region_full
    where dt='$do_date'
)region
on province.region_id=region.id;
"

# SQL for the dim_coupon_full load. Overwrites partition dt='$do_date' of
# ${APP}.dim_coupon_full with the coupons from ods_coupon_info_full (alias ci).
# The coupon_type and range_type codes are resolved to names through
# ods_base_dic_full, restricted to parent_code '32' (coupon_dic) and '33'
# (range_dic) respectively. benefit_rule is a display text built from the
# thresholds and benefits, chosen by coupon_type ('3201' / '3202' / '3203');
# any other coupon_type yields NULL because the CASE has no ELSE branch.
# NOTE(review): the '3202' branch appends ' 折' with a leading space while the
# other branches have none -- confirm whether that space is intended.
dim_coupon_full="
insert overwrite table ${APP}.dim_coupon_full partition(dt='$do_date')
select
    id,
    coupon_name,
    coupon_type,
    coupon_dic.dic_name,
    condition_amount,
    condition_num,
    activity_id,
    benefit_amount,
    benefit_discount,
    case coupon_type
        when '3201' then concat('满',condition_amount,'元减',benefit_amount,'元')
        when '3202' then concat('满',condition_num,'件打', benefit_discount,' 折')
        when '3203' then concat('减',benefit_amount,'元')
    end benefit_rule,
    create_time,
    range_type,
    range_dic.dic_name,
    limit_num,
    taken_count,
    start_time,
    end_time,
    operate_time,
    expire_time
from
(
    select
        id,
        coupon_name,
        coupon_type,
        condition_amount,
        condition_num,
        activity_id,
        benefit_amount,
        benefit_discount,
        create_time,
        range_type,
        limit_num,
        taken_count,
        start_time,
        end_time,
        operate_time,
        expire_time
    from ${APP}.ods_coupon_info_full
    where dt='$do_date'
)ci
left join
(
    select
        dic_code,
        dic_name
    from ${APP}.ods_base_dic_full
    where dt='$do_date'
    and parent_code='32'
)coupon_dic
on ci.coupon_type=coupon_dic.dic_code
left join
(
    select
        dic_code,
        dic_name
    from ${APP}.ods_base_dic_full
    where dt='$do_date'
    and parent_code='33'
)range_dic
on ci.range_type=range_dic.dic_code;
"

# SQL for the dim_activity_full load. Overwrites partition dt='$do_date' of
# ${APP}.dim_activity_full, driven by ods_activity_rule_full (alias rule), so the
# output has one row per activity rule. Each rule is left-joined to its activity
# in ods_activity_info_full (alias info) and to ods_base_dic_full with
# parent_code '31' (alias dic) to resolve activity_type to a name.
# benefit_rule is a display text chosen by rule.activity_type
# ('3101' / '3102' / '3103'); any other type yields NULL (no ELSE branch).
# NOTE(review): the '3102' branch appends ' 折' with a leading space while the
# '3103' branch uses '折' -- confirm whether that space is intended.
dim_activity_full="
insert overwrite table ${APP}.dim_activity_full partition(dt='$do_date')
select
    rule.id,
    info.id,
    activity_name,
    rule.activity_type,
    dic.dic_name,
    activity_desc,
    start_time,
    end_time,
    create_time,
    condition_amount,
    condition_num,
    benefit_amount,
    benefit_discount,
    case rule.activity_type
        when '3101' then concat('满',condition_amount,'元减',benefit_amount,'元')
        when '3102' then concat('满',condition_num,'件打', benefit_discount,' 折')
        when '3103' then concat('打', benefit_discount,'折')
    end benefit_rule,
    benefit_level
from
(
    select
        id,
        activity_id,
        activity_type,
        condition_amount,
        condition_num,
        benefit_amount,
        benefit_discount,
        benefit_level
    from ${APP}.ods_activity_rule_full
    where dt='$do_date'
)rule
left join
(
    select
        id,
        activity_name,
        activity_type,
        activity_desc,
        start_time,
        end_time,
        create_time
    from ${APP}.ods_activity_info_full
    where dt='$do_date'
)info
on rule.activity_id=info.id
left join
(
    select
        dic_code,
        dic_name
    from ${APP}.ods_base_dic_full
    where dt='$do_date'
    and parent_code='31'
)dic
on rule.activity_type=dic.dic_code;
"

# SQL for the dim_promotion_pos_full load: copies the listed columns of
# ods_promotion_pos_full for dt='$do_date' into the same-dated partition of
# ${APP}.dim_promotion_pos_full, with no joins or transformations.
dim_promotion_pos_full="
insert overwrite table ${APP}.dim_promotion_pos_full partition(dt='$do_date')
select
    id,           
    pos_location,
    pos_type,
    promotion_type,
    create_time,
    operate_time   
from ${APP}.ods_promotion_pos_full 
where dt='$do_date';
"

# SQL for the dim_promotion_refer_full load: copies the listed columns of
# ods_promotion_refer_full for dt='$do_date' into the same-dated partition of
# ${APP}.dim_promotion_refer_full, with no joins or transformations.
dim_promotion_refer_full="
insert overwrite table ${APP}.dim_promotion_refer_full partition(dt='$do_date')
select
    id, 
    refer_name,
    create_time,
    operate_time
from ${APP}.ods_promotion_refer_full 
where dt='$do_date';
"


# Dispatch on the table name given as $1: run the SQL of that one table, or of
# every table in a single hive session for "all". Each SQL text ends with ';', so
# plain concatenation yields a valid multi-statement script.
case "$1" in
"dim_user_zip")
    hive -e "$dim_user_zip"
    ;;
"dim_sku_full")
    hive -e "$dim_sku_full"
    ;;
"dim_province_full")
    hive -e "$dim_province_full"
    ;;
"dim_coupon_full")
    hive -e "$dim_coupon_full"
    ;;
"dim_activity_full")
    hive -e "$dim_activity_full"
    ;;
"dim_promotion_pos_full")
    hive -e "$dim_promotion_pos_full"
    ;;
"dim_promotion_refer_full")
    hive -e "$dim_promotion_refer_full"
    ;;
"all")
    hive -e "$dim_user_zip$dim_sku_full$dim_province_full$dim_coupon_full$dim_activity_full$dim_promotion_refer_full$dim_promotion_pos_full"
    ;;
*)
    # A mistyped table name used to fall through silently with exit status 0;
    # fail loudly so the caller notices that nothing was loaded.
    echo "Unknown table name '$1': expected dim_user_zip, dim_sku_full, dim_province_full, dim_coupon_full, dim_activity_full, dim_promotion_pos_full, dim_promotion_refer_full or all" >&2
    exit 1
    ;;
esac

添加权限

sql 复制代码

chmod +x ods_to_dim_init.sh

执行脚本的命令

sql 复制代码

ods_to_dim_init.sh all 2022-06-08

2.每日增量脚本

sql 复制代码

cd /home/atguigu/bin
vim ods_to_dim.sh

添加下面的内容：

bash 复制代码

#!/bin/bash

APP=gmall

# Load date: the second argument when one is given (non-empty); otherwise
# yesterday's date formatted as YYYY-MM-DD.
do_date=${2:-$(date -d "-1 day" +%F)}

# SQL for the daily zipper-table update of dim_user_zip:
#   1. take the currently valid versions (partition dt = '9999-12-31');
#   2. UNION them with the masked final state of every user that has change events
#      in ods_user_info_inc for $do_date (last event per user by ts);
#   3. per user, the newest version (rn = 1) stays in partition '9999-12-31'; the
#      superseded one (rn = 2) gets end_date = $do_date - 1 and is written to the
#      partition of that date.
# Dynamic partitioning must be non-strict because dt is computed per row.
#
# Escaping: this text is un-escaped twice before the regex engine sees it --
# first by bash inside double quotes (\\ -> \), then by the Hive string literal
# (\\ -> \). A regex \d therefore has to be written \\\\d here; with only two
# backslashes Hive receives '\d', reduces it to the letter d, and no phone number
# matches. \$ keeps the end-of-line anchor a literal dollar sign for the shell.
dim_user_zip="
set hive.exec.dynamic.partition.mode=nonstrict;
insert overwrite table ${APP}.dim_user_zip partition (dt)
select id,
       name,
       phone_num,
       email,
       user_level,
       birthday,
       gender,
       create_time,
       operate_time,
       start_date,
       if(rn = 2, date_sub('$do_date', 1), end_date)     end_date,
       if(rn = 1, '9999-12-31', date_sub('$do_date', 1)) dt
from (
         select id,
                name,
                phone_num,
                email,
                user_level,
                birthday,
                gender,
                create_time,
                operate_time,
                start_date,
                end_date,
                row_number() over (partition by id order by start_date desc) rn
         from (
                  select id,
                         name,
                         phone_num,
                         email,
                         user_level,
                         birthday,
                         gender,
                         create_time,
                         operate_time,
                         start_date,
                         end_date
                  from ${APP}.dim_user_zip
                  where dt = '9999-12-31'
                  union
                  select id,
                         concat(substr(name, 1, 1), '*')                name,
                         if(phone_num regexp
                            '^(13[0-9]|14[01456879]|15[0-35-9]|16[2567]|17[0-8]|18[0-9]|19[0-35-9])\\\\d{8}\$',
                            concat(substr(phone_num, 1, 3), '*'), null) phone_num,
                         if(email regexp '^[a-zA-Z0-9_-]+@[a-zA-Z0-9_-]+(\\\\.[a-zA-Z0-9_-]+)+\$',
                            concat('*@', split(email, '@')[1]), null)   email,
                         user_level,
                         birthday,
                         gender,
                         create_time,
                         operate_time,
                         '$do_date'                                     start_date,
                         '9999-12-31'                                   end_date
                  from (
                           select data.id,
                                  data.name,
                                  data.phone_num,
                                  data.email,
                                  data.user_level,
                                  data.birthday,
                                  data.gender,
                                  data.create_time,
                                  data.operate_time,
                                  row_number() over (partition by data.id order by ts desc) rn
                           from ${APP}.ods_user_info_inc
                           where dt = '$do_date'
                       ) t1
                  where rn = 1
              ) t2
     ) t3;
"

# SQL for the dim_sku_full load. Overwrites partition dt='$do_date' of
# ${APP}.dim_sku_full with one row per SKU from ods_sku_info_full, left-joined to:
#   spu        -> spu_name
#   c3/c2/c1   -> category names, walking level 3 -> 2 -> 1
#   tm         -> trademark name
#   attr       -> per-SKU set of platform attribute structs (attrs)
#   sale_attr  -> per-SKU set of sale attribute structs (sale_attrs)
# Every source is read from its dt='$do_date' partition. The INSERT has no column
# list, so the select list maps to the target columns by position.
# The text is only assembled here; the case statement at the end of the script
# runs it with `hive -e`.
dim_sku_full="
with
sku as
(
    select
        id,
        price,
        sku_name,
        sku_desc,
        weight,
        is_sale,
        spu_id,
        category3_id,
        tm_id,
        create_time
    from ${APP}.ods_sku_info_full
    where dt='$do_date'
),
spu as
(
    select
        id,
        spu_name
    from ${APP}.ods_spu_info_full
    where dt='$do_date'
),
c3 as
(
    select
        id,
        name,
        category2_id
    from ${APP}.ods_base_category3_full
    where dt='$do_date'
),
c2 as
(
    select
        id,
        name,
        category1_id
    from ${APP}.ods_base_category2_full
    where dt='$do_date'
),
c1 as
(
    select
        id,
        name
    from ${APP}.ods_base_category1_full
    where dt='$do_date'
),
tm as
(
    select
        id,
        tm_name
    from ${APP}.ods_base_trademark_full
    where dt='$do_date'
),
attr as
(
    select
        sku_id,
        collect_set(named_struct('attr_id',attr_id,'value_id',value_id,'attr_name',attr_name,'value_name',value_name)) attrs
    from ${APP}.ods_sku_attr_value_full
    where dt='$do_date'
    group by sku_id
),
sale_attr as
(
    select
        sku_id,
        collect_set(named_struct('sale_attr_id',sale_attr_id,'sale_attr_value_id',sale_attr_value_id,'sale_attr_name',sale_attr_name,'sale_attr_value_name',sale_attr_value_name)) sale_attrs
    from ${APP}.ods_sku_sale_attr_value_full
    where dt='$do_date'
    group by sku_id
)
insert overwrite table ${APP}.dim_sku_full partition(dt='$do_date')
select
    sku.id,
    sku.price,
    sku.sku_name,
    sku.sku_desc,
    sku.weight,
    sku.is_sale,
    sku.spu_id,
    spu.spu_name,
    sku.category3_id,
    c3.name,
    c3.category2_id,
    c2.name,
    c2.category1_id,
    c1.name,
    sku.tm_id,
    tm.tm_name,
    attr.attrs,
    sale_attr.sale_attrs,
    sku.create_time
from sku
left join spu on sku.spu_id=spu.id
left join c3 on sku.category3_id=c3.id
left join c2 on c3.category2_id=c2.id
left join c1 on c2.category1_id=c1.id
left join tm on sku.tm_id=tm.id
left join attr on sku.id=attr.sku_id
left join sale_attr on sku.id=sale_attr.sku_id;
"

# SQL for the dim_province_full load. Overwrites partition dt='$do_date' of
# ${APP}.dim_province_full with one row per province from ods_base_province_full,
# left-joined to ods_base_region_full on region_id to add region_name.
# Both sources are read from their dt='$do_date' partition; the select list maps
# to the target columns by position.
dim_province_full="
insert overwrite table ${APP}.dim_province_full partition(dt='$do_date')
select
    province.id,
    province.name,
    province.area_code,
    province.iso_code,
    province.iso_3166_2,
    region_id,
    region_name
from
(
    select
        id,
        name,
        region_id,
        area_code,
        iso_code,
        iso_3166_2
    from ${APP}.ods_base_province_full
    where dt='$do_date'
)province
left join
(
    select
        id,
        region_name
    from ${APP}.ods_base_region_full
    where dt='$do_date'
)region
on province.region_id=region.id;
"

# SQL for the dim_coupon_full load. Overwrites partition dt='$do_date' of
# ${APP}.dim_coupon_full with the coupons from ods_coupon_info_full (alias ci).
# The coupon_type and range_type codes are resolved to names through
# ods_base_dic_full, restricted to parent_code '32' (coupon_dic) and '33'
# (range_dic) respectively. benefit_rule is a display text built from the
# thresholds and benefits, chosen by coupon_type ('3201' / '3202' / '3203');
# any other coupon_type yields NULL because the CASE has no ELSE branch.
# NOTE(review): the '3202' branch appends ' 折' with a leading space while the
# other branches have none -- confirm whether that space is intended.
dim_coupon_full="
insert overwrite table ${APP}.dim_coupon_full partition(dt='$do_date')
select
    id,
    coupon_name,
    coupon_type,
    coupon_dic.dic_name,
    condition_amount,
    condition_num,
    activity_id,
    benefit_amount,
    benefit_discount,
    case coupon_type
        when '3201' then concat('满',condition_amount,'元减',benefit_amount,'元')
        when '3202' then concat('满',condition_num,'件打', benefit_discount,' 折')
        when '3203' then concat('减',benefit_amount,'元')
    end benefit_rule,
    create_time,
    range_type,
    range_dic.dic_name,
    limit_num,
    taken_count,
    start_time,
    end_time,
    operate_time,
    expire_time
from
(
    select
        id,
        coupon_name,
        coupon_type,
        condition_amount,
        condition_num,
        activity_id,
        benefit_amount,
        benefit_discount,
        create_time,
        range_type,
        limit_num,
        taken_count,
        start_time,
        end_time,
        operate_time,
        expire_time
    from ${APP}.ods_coupon_info_full
    where dt='$do_date'
)ci
left join
(
    select
        dic_code,
        dic_name
    from ${APP}.ods_base_dic_full
    where dt='$do_date'
    and parent_code='32'
)coupon_dic
on ci.coupon_type=coupon_dic.dic_code
left join
(
    select
        dic_code,
        dic_name
    from ${APP}.ods_base_dic_full
    where dt='$do_date'
    and parent_code='33'
)range_dic
on ci.range_type=range_dic.dic_code;
"

# SQL for the dim_activity_full load. Overwrites partition dt='$do_date' of
# ${APP}.dim_activity_full, driven by ods_activity_rule_full (alias rule), so the
# output has one row per activity rule. Each rule is left-joined to its activity
# in ods_activity_info_full (alias info) and to ods_base_dic_full with
# parent_code '31' (alias dic) to resolve activity_type to a name.
# benefit_rule is a display text chosen by rule.activity_type
# ('3101' / '3102' / '3103'); any other type yields NULL (no ELSE branch).
# NOTE(review): the '3102' branch appends ' 折' with a leading space while the
# '3103' branch uses '折' -- confirm whether that space is intended.
dim_activity_full="
insert overwrite table ${APP}.dim_activity_full partition(dt='$do_date')
select
    rule.id,
    info.id,
    activity_name,
    rule.activity_type,
    dic.dic_name,
    activity_desc,
    start_time,
    end_time,
    create_time,
    condition_amount,
    condition_num,
    benefit_amount,
    benefit_discount,
    case rule.activity_type
        when '3101' then concat('满',condition_amount,'元减',benefit_amount,'元')
        when '3102' then concat('满',condition_num,'件打', benefit_discount,' 折')
        when '3103' then concat('打', benefit_discount,'折')
    end benefit_rule,
    benefit_level
from
(
    select
        id,
        activity_id,
        activity_type,
        condition_amount,
        condition_num,
        benefit_amount,
        benefit_discount,
        benefit_level
    from ${APP}.ods_activity_rule_full
    where dt='$do_date'
)rule
left join
(
    select
        id,
        activity_name,
        activity_type,
        activity_desc,
        start_time,
        end_time,
        create_time
    from ${APP}.ods_activity_info_full
    where dt='$do_date'
)info
on rule.activity_id=info.id
left join
(
    select
        dic_code,
        dic_name
    from ${APP}.ods_base_dic_full
    where dt='$do_date'
    and parent_code='31'
)dic
on rule.activity_type=dic.dic_code;
"


# SQL for the dim_promotion_pos_full load: copies the listed columns of
# ods_promotion_pos_full for dt='$do_date' into the same-dated partition of
# ${APP}.dim_promotion_pos_full, with no joins or transformations.
dim_promotion_pos_full="
insert overwrite table ${APP}.dim_promotion_pos_full partition(dt='$do_date')
select
    id,            
    pos_location,
    pos_type,
    promotion_type,
    create_time,
    operate_time
from ${APP}.ods_promotion_pos_full 
where dt='$do_date';
"

# SQL for the dim_promotion_refer_full load: copies the listed columns of
# ods_promotion_refer_full for dt='$do_date' into the same-dated partition of
# ${APP}.dim_promotion_refer_full, with no joins or transformations.
dim_promotion_refer_full="
insert overwrite table ${APP}.dim_promotion_refer_full partition(dt='$do_date')
select
    id, 
    refer_name,
    create_time,
    operate_time
from ${APP}.ods_promotion_refer_full 
where dt='$do_date';
"


# Dispatch on the table name given as $1: run the SQL of that one table, or of
# every table in a single hive session for "all". Each SQL text ends with ';', so
# plain concatenation yields a valid multi-statement script.
case "$1" in
"dim_user_zip")
    hive -e "$dim_user_zip"
    ;;
"dim_sku_full")
    hive -e "$dim_sku_full"
    ;;
"dim_province_full")
    hive -e "$dim_province_full"
    ;;
"dim_coupon_full")
    hive -e "$dim_coupon_full"
    ;;
"dim_activity_full")
    hive -e "$dim_activity_full"
    ;;
"dim_promotion_pos_full")
    hive -e "$dim_promotion_pos_full"
    ;;
"dim_promotion_refer_full")
    hive -e "$dim_promotion_refer_full"
    ;;
"all")
    hive -e "$dim_user_zip$dim_sku_full$dim_province_full$dim_coupon_full$dim_activity_full$dim_promotion_refer_full$dim_promotion_pos_full"
    ;;
*)
    # A mistyped or missing table name used to fall through silently with exit
    # status 0; fail loudly so the scheduler notices that nothing was loaded.
    echo "Unknown table name '$1': expected dim_user_zip, dim_sku_full, dim_province_full, dim_coupon_full, dim_activity_full, dim_promotion_pos_full, dim_promotion_refer_full or all" >&2
    exit 1
    ;;
esac

添加权限

sql 复制代码

chmod +x ods_to_dim.sh

脚本执行的命令（以6月9号为例）

sql 复制代码

ods_to_dim.sh all 2022-06-09