注:参考文章:
0 需求描述
现有一张订单表shop_order ,含有字段shop_id,order_id,order_time,order_amt, 需要统计过去至少连续3天销售金额连续增长的商户shop_id。
1 数据准备
sql
create table shop_order(
shop_id int,
order_amt int,
order_time string
)
row format delimited fields terminated by '\t';
load data local inpath "/opt/module/hive_data/shop_order.txt" into table shop_order;
2 数据分析
完整的代码如下:
sql
with tmp as (
select
shop_id,
to_date(order_time) as dt,
sum(order_amt) as amt
from shop_order
group by shop_id, to_date(order_time)
)
select
shop_id
from (select *,
-- 判断日期是否连续
date_sub(dt, row_number() over (partition by shop_id order by dt )) as order_date_diff
from (
select
shop_id,
dt,
amt,
--判断销售额是否增长
-- 当前行的销售金额与上一行的销售金额之间的差值 order_amt_diff
amt - lag(amt, 1, 0) over (partition by shop_id order by dt) as order_amt_diff
from tmp
) t1
-- 差值大于0的代表销售额增长
where order_amt_diff > 0
) t2
group by shop_id, order_date_diff
having count(1) >=3;
输出结果为 shop_id 为2
上述代码分析:
**step1:**求出每家商户销售金额连续增长的记录
sql
with tmp as (
select
shop_id,
to_date(order_time) as dt,
sum(order_amt) as amt
from shop_order
group by shop_id, to_date(order_time)
)
select *
from (
select
shop_id,
dt,
amt,
--判断销售额是否增长
-- 当前行的销售金额与上一行的销售金额之间的差值 order_amt_diff
amt - lag(amt, 1, 0) over (partition by shop_id order by dt) as order_amt_diff
from tmp
) t1
-- 差值大于0的代表销售额增长
where order_amt_diff > 0
step2: 求出每家商户至少连续3天销售金额连续增长**,在step1的基础上,还要求dt是连续的**
sql
with tmp as (
select
shop_id,
to_date(order_time) as dt,
sum(order_amt) as amt
from shop_order
group by shop_id, to_date(order_time)
)
select *,
-- 判断日期是否连续
date_sub(dt, row_number() over (partition by shop_id order by dt )) as order_date_diff
from (
select
shop_id,
dt,
amt,
--判断销售额是否增长
-- 当前行的销售金额与上一行的销售金额之间的差值 order_amt_diff
amt - lag(amt, 1, 0) over (partition by shop_id order by dt) as order_amt_diff
from tmp
) t1
-- 差值大于0的代表销售额增长
where order_amt_diff > 0
**step3:**对商户shop_id以及日期差值order_date_diff这两个字段分组,求出最终结果
sql
with tmp as (
select
shop_id,
to_date(order_time) as dt,
sum(order_amt) as amt
from shop_order
group by shop_id, to_date(order_time)
)
select
shop_id
from (select *,
-- 判断日期是否连续
date_sub(dt, row_number() over (partition by shop_id order by dt )) as order_date_diff
from (
select
shop_id,
dt,
amt,
--判断销售额是否增长
-- 当前行的销售金额与上一行的销售金额之间的差值 order_amt_diff
amt - lag(amt, 1, 0) over (partition by shop_id order by dt) as order_amt_diff --判断是否增长
from tmp
) t1
-- 差值大于0的代表销售额增长
where order_amt_diff > 0
) t2
group by shop_id, order_date_diff
having count(1) >=3;
3 小结
date_sub(日期减少函数)
- 语法:date_sub(string startdate,int days)
- 返回值:string
- 说明:返回 开始日期startdate 减去days天后的日期
- 举例:select date_sub('2024-02-01',3) --->2024-01-29
lag
- 语法:lag(column,n,default) over(partition by ....order by....)
- 说明:取得column列前边的第n行数据,如果存在则返回,如果不存在,返回默认值default
针对【日期连续 】等类型的题型,一般处理思路:先计算date_sub(dt, row_number() over (partition by shop_id order by dt )) as dt_diff ,再对 dt_diff 分组,求count()值
针对【xx连续增长 】等类型的题型,一般处理思路:利用前后函数lag或者lead 往前/往后取一行,计算两者的差值diff ,再利用 if( diff >0,1,0) as flag 等条件判断函数 进行打标签,基于标签再进行后续的分组计算.......