数仓搭建(hive):DM搭建(数据集市层)

DM层 数据集市层 (Data Mart)

粒度上卷(Roll-up):

指的是沿着维度层次向上聚合汇总数据,从细粒度到粗粒度观察数据的操作。

示例

数仓的上一层DWS的是按日汇总

DM层基于DWS层主题日宽表上卷统计出按年,月,周的数据 >>用DWS层的宽表连接DWD层的时间维度表

创建DM层 : 建数据库>>建表

复制代码
CREATE DATABASE if NOT EXISTS DM;

建表: 表结构和DWS层的表结构几乎一致, 只多了关于日期的维度字段

建表sql

复制代码
CREATE TABLE dm.dm_sale(
   date_time string COMMENT '统计日期,不能用来分组统计' ,--记录哪一天干活
   time_type string COMMENT '统计时间维度:year、month、week、date(就是天day)',
   year_code string COMMENT '年code',
   year_month string COMMENT '年月',
   month_code string COMMENT '月份编码', 
   day_month_num string COMMENT '一月第几天', 
   dim_date_id string COMMENT '日期',
   year_week_name_cn string COMMENT '年中第几周',

   group_type string COMMENT '分组类型:store,trade_area,city,brand,min_class,mid_class,max_class,all',
   city_id string COMMENT '城市id',
   city_name string COMMENT '城市name',
   trade_area_id string COMMENT '商圈id',
   trade_area_name string COMMENT '商圈名称',
   store_id string COMMENT '店铺的id',
   store_name string COMMENT '店铺名称',
   brand_id string COMMENT '品牌id',
   brand_name string COMMENT '品牌名称',
   max_class_id string COMMENT '商品大类id',
   max_class_name string COMMENT '大类名称',
   mid_class_id string COMMENT '中类id', 
   mid_class_name string COMMENT '中类名称',
   min_class_id string COMMENT '小类id', 
   min_class_name string COMMENT '小类名称',
   --   =======统计=======
   sale_amt DECIMAL(38,2) COMMENT '销售收入',
   plat_amt DECIMAL(38,2) COMMENT '平台收入',
   deliver_sale_amt DECIMAL(38,2) COMMENT '配送成交额',
   mini_app_sale_amt DECIMAL(38,2) COMMENT '小程序成交额',
   android_sale_amt DECIMAL(38,2) COMMENT '安卓APP成交额',
   ios_sale_amt DECIMAL(38,2) COMMENT '苹果APP成交额',
   pcweb_sale_amt DECIMAL(38,2) COMMENT 'PC商城成交额',
   order_cnt BIGINT COMMENT '成交单量',
   eva_order_cnt BIGINT COMMENT '参评单量comment=>cmt',
   bad_eva_order_cnt BIGINT COMMENT '差评单量negtive-comment=>ncmt',
   deliver_order_cnt BIGINT COMMENT '配送单量',
   refund_order_cnt BIGINT COMMENT '退款单量',
   miniapp_order_cnt BIGINT COMMENT '小程序成交单量',
   android_order_cnt BIGINT COMMENT '安卓APP订单量',
   ios_order_cnt BIGINT COMMENT '苹果APP订单量',
   pcweb_order_cnt BIGINT COMMENT 'PC商城成交单量'
)
COMMENT '销售主题宽表' 
ROW format delimited fields terminated BY '\t' 
stored AS orc tblproperties ('orc.compress' = 'SNAPPY');

插入数据sql

复制代码
WITH TEMP AS (
    SELECT
        D.year_code,
        D.year_month,
        D.month_code,
        D.day_month_num,
        D.dim_date_id,
        D.year_week_name_cn,
        city_id, city_name, trade_area_id, trade_area_name, store_id, store_name, brand_id, brand_name, max_class_id, max_class_name, mid_class_id, mid_class_name, min_class_id, min_class_name, sale_amt, plat_amt, deliver_sale_amt, mini_app_sale_amt, android_sale_amt, ios_sale_amt, pcweb_sale_amt, order_cnt, eva_order_cnt, bad_eva_order_cnt, deliver_order_cnt, refund_order_cnt, miniapp_order_cnt, android_order_cnt, ios_order_cnt, pcweb_order_cnt, dt
    FROM DWS.DWS_SALE_DAYCOUNT S
    INNER JOIN DWD.DIM_DATE D
    ON S.dt = D.date_code
)
INSERT overwrite table dm.dm_sale
SELECT
    CURRENT_DATE AS DATE_TIME,
    CASE
        WHEN dim_date_id IS NOT NULL THEN 'DATE'
        WHEN year_week_name_cn IS NOT NULL THEN 'WEEK'
        WHEN month_code IS NOT NULL THEN 'MONTH'
        WHEN year_code IS NOT NULL THEN 'YEAR'
    END AS TIME_TYPE,
    year_code,
    year_month,
    month_code,
    day_month_num,
    dim_date_id,
    year_week_name_cn,
    CASE
        WHEN T.store_id IS NOT NULL THEN '店铺'
        WHEN T.trade_area_id IS NOT NULL THEN '商圈'
        WHEN T.city_id IS NOT NULL THEN '城市'
        WHEN T.min_class_id IS NOT NULL THEN '小类'
        WHEN T.mid_class_id IS NOT NULL THEN '中类'
        WHEN T.max_class_id IS NOT NULL THEN '大类'
        WHEN T.brand_id IS NOT NULL THEN '品牌'
        ELSE '日期'
    END AS GROUP_TYPE,
    city_id,
    city_name,
    trade_area_ID,
    trade_area_name,
    store_id,
    store_name,
    brand_id,
    brand_name,
    max_class_id,
    max_class_name,
    mid_class_id,
    mid_class_name,
    min_class_id,
    min_class_name,
    SUM(sale_amt),
    SUM(plat_amt),
    SUM(deliver_sale_amt),
    SUM(mini_app_sale_amt),
    SUM(android_sale_amt),
    SUM(ios_sale_amt),
    SUM(pcweb_sale_amt),
    SUM(order_cnt),
    SUM(eva_order_cnt),
    SUM(bad_eva_order_cnt),
    SUM(deliver_order_cnt),
    SUM(refund_order_cnt),
    SUM(miniapp_order_cnt),
    SUM(android_order_cnt),
    SUM(ios_order_cnt),
    SUM(pcweb_order_cnt)
FROM TEMP T
GROUP BY
    -- 所有 GROUPING SETS 中出现的列都要包含在 GROUP BY 中
    day_month_num,
    dim_date_id,
    city_id, city_name, trade_area_id, trade_area_name, store_id, store_name, brand_id, brand_name, max_class_id, max_class_name, mid_class_id, mid_class_name, min_class_id, min_class_name,
    year_code,
    year_month,
    month_code,
    year_week_name_cn
GROUPING SETS (
    (day_month_num, dim_date_id),
    (day_month_num, dim_date_id, city_id, city_name),
    (day_month_num, dim_date_id, city_id, city_name, trade_area_id, trade_area_name),
    (day_month_num, dim_date_id, city_id, city_name, trade_area_id, trade_area_name, store_id, store_name),
    (day_month_num, dim_date_id, brand_id, brand_name),
    (day_month_num, dim_date_id, max_class_id, max_class_name),
    (day_month_num, dim_date_id, max_class_id, max_class_name, mid_class_name, mid_class_id),
    (day_month_num, dim_date_id, max_class_id, max_class_name, mid_class_name, mid_class_id, min_class_id, min_class_name),
    (year_week_name_cn),
    (year_week_name_cn, city_id, city_name),
    (year_week_name_cn, city_id, city_name, trade_area_id, trade_area_name),
    (year_week_name_cn, city_id, city_name, trade_area_id, trade_area_name, store_id, store_name),
    (year_week_name_cn, brand_id, brand_name),
    (year_week_name_cn, max_class_id, max_class_name),
    (year_week_name_cn, max_class_id, max_class_name, mid_class_name, mid_class_id),
    (year_week_name_cn, max_class_id, max_class_name, mid_class_name, mid_class_id, min_class_id, min_class_name),
    (year_month, month_code),
    (year_month, month_code, city_id, city_name),
    (year_month, month_code, city_id, city_name, trade_area_id, trade_area_name),
    (year_month, month_code, city_id, city_name, trade_area_id, trade_area_name, store_id, store_name),
    (year_month, month_code, brand_id, brand_name),
    (year_month, month_code, max_class_id, max_class_name),
    (year_month, month_code, max_class_id, max_class_name, mid_class_name, mid_class_id),
    (year_month, month_code, max_class_id, max_class_name, mid_class_name, mid_class_id, min_class_id, min_class_name),
    (year_code),
    (year_code, city_id, city_name),
    (year_code, city_id, city_name, trade_area_id, trade_area_name),
    (year_code, city_id, city_name, trade_area_id, trade_area_name, store_id, store_name),
    (year_code, brand_id, brand_name),
    (year_code, max_class_id, max_class_name),
    (year_code, max_class_id, max_class_name, mid_class_name, mid_class_id),
    (year_code, max_class_id, max_class_name, mid_class_name, mid_class_id, min_class_id, min_class_name)
);

插入sql分析

查询DWS层的宽表>>确认连接字段dt的数据格式

查询时间维度表>>找到和DWS层的宽表的连接字段数据格式一样的字段>>查找新维度的相应字段

复制代码
select * from DWD.DIM_DATE

在with as临时表里面把用DWS层的宽表连接DWD层的时间维度表, 内连接,连接字段dt(日)

在临时表查询语句中把目标表新增的时间维度的字段添加进去

目标表

临时表

INSERT overwrite table dm.dm_sale 是hive中全量插入的语法

在查询语句中把目标表新增的列实现

group_type string COMMENT '分组类型:store,trade_area,city,brand,min_class,mid_class,max_class,all',

枚举类型>>case when

group by 分组后面跟除了指标字段及 group_type 的所有字段(维度字段)

用grouping sets 写出需要的维度组合

相关推荐
武汉格发Gofartlic1 小时前
FEKO许可证的安全与合规性
大数据·运维·安全
姬激薄2 小时前
HDFS概述
大数据·hadoop·hdfs
依年南台2 小时前
克隆虚拟机组成集群
大数据·hadoop
依年南台2 小时前
搭建大数据学习的平台
大数据·学习
张人玉4 小时前
数据可视化大屏——物流大数据服务平台(二)
大数据·信息可视化
Leo.yuan4 小时前
数据分析怎么做?高效的数据分析方法有哪些?
大数据·数据库·信息可视化·数据挖掘·数据分析
杜清卿6 小时前
Spark处理过程-转换算子和行动算子
大数据·分布式·spark
小冻梨!!!6 小时前
Spark,在shell中运行RDD程序
大数据·javascript·spark
IvanCodes7 小时前
五、Hive表类型、分区及数据加载
大数据·数据仓库·hive
小白的白是白痴的白7 小时前
Spark基础介绍
大数据·分布式·spark