注:参考文章:
0 需求分析
t_order表结构如图:

要求:t_order表扫描次数不超过2次的前提下,统计所有在2018年1月下过单且在2月份没有下过单的用户,该用户在3月份的下单情况:

1 数据准备
sql
CREATE TABLE t_order (
oid int ,
uid int ,
otime string,
oamount int
)
ROW format delimited FIELDS TERMINATED BY ",";
load data local inpath "/opt/module/hive_data/t_order.txt" into table t_order;
2 数据分析
完整代码为:
sql
with tmp as (
select
oid,
uid,
otime,
date_format(otime, 'yyyy-MM') as dt,
oamount,
---计算rk的目的是为了获取记录中的第一条
row_number() over (partition by uid,date_format(otime, 'yyyy-MM') order by otime) rk,
--- 计算cnt的目的是为了获取记录中的最后一条
count(*) over (partition by uid,date_format(otime, 'yyyy-MM')) cnt
from t_order
order by uid
)
select
uid,
--每个用户一月份的订单数
sum(if(dt = '2018-01', 1, 0)) as m1_count,
--每个用户二月份的订单数
sum(if(dt = '2018-02', 1, 0)) as m2_count,
--每个用户三月份的订单数(当月订单金额超过10元的订单个数)
sum(if(dt = '2018-03' and oamount > 10, 1, 0)) m3_count,
--当月(3月份)首次下单的金额
sum(if(dt = '2018-03' and rk = 1, oamount, 0)) m3_first_amount,
--当月(3月份)末次下单的金额(rk =cnt小技巧)
sum(if(dt = '2018-03' and rk = cnt, oamount, 0)) m3_last_amount
from tmp
group by uid
--将下单记录转化成下单次数判断
having m1_count >0 and m2_count=0;
最终的输出结果为:
上述代码解析:
**step1:**用date_format函数进行日期格式化,row_number() over() 获得排名rk, count(*)over()获得统计值cnt
sql
select
oid,
uid,
otime,
date_format(otime, 'yyyy-MM') as dt,
oamount,
---计算rk的目的是为了获取记录中的第一条
row_number() over (partition by uid,date_format(otime, 'yyyy-MM') order by otime) rk,
--- 计算cnt的目的是为了获取记录中的最后一条
count(*) over (partition by uid,date_format(otime, 'yyyy-MM')) cnt
from t_order
step2:
- 获取当月订单金额超过10元的订单个数 :sum(if(条件, 1, 0)) 或者 sum( case when 条件 then 1 else 0 end );
- 获取当月首次下单金额:rk=1
- 获取当月末次下单金额:rk=cnt (每个分组的记录数cnt 同时也等于分组内,最后一条记录数的排序值rk)
sql
with tmp as (
select
oid,
uid,
otime,
date_format(otime, 'yyyy-MM') as dt,
oamount,
---计算rk的目的是为了获取记录中的第一条
row_number() over (partition by uid,date_format(otime, 'yyyy-MM') order by otime) rk,
--- 计算cnt的目的是为了获取记录中的最后一条
count(*) over (partition by uid,date_format(otime, 'yyyy-MM')) cnt
from t_order
order by uid
)
select
uid,
--每个用户一月份的订单数
sum(if(dt = '2018-01', 1, 0)) as m1_count,
--每个用户二月份的订单数
sum(if(dt = '2018-02', 1, 0)) as m2_count,
--每个用户三月份的订单数(当月订单金额超过10元的订单个数)
sum(if(dt = '2018-03' and oamount > 10, 1, 0)) m3_count,
--当月(3月份)首次下单的金额
sum(if(dt = '2018-03' and rk = 1, oamount, 0)) m3_first_amount,
--当月(3月份)末次下单的金额(rk =cnt小技巧)
sum(if(dt = '2018-03' and rk = cnt, oamount, 0)) m3_last_amount
from tmp
group by uid
having m1_count >0 and m2_count=0;
3 小结
本案例用到的知识点:
- sum(if()) 有条件累加;
- row_number() over(partition by ....order by ..) 排序,求分组topN
- count(*) over(partition by ...) 分组统计记录数。每组的记录数同时也是最后一条记录的排序值。
- 将下单记录转化成下单次数判断 m1_count >0 and m2_count=0;