注:参考文章:
0 需求描述


1 数据准备
sql
-- Guest stay records: one row per guest per room stay.
-- in_time / out_time hold 'yyyy-MM-dd' strings, which sort chronologically as text.
create table if not exists table23
(
    user_id  int    comment '用户id',
    room_num string comment '房间号',
    in_time  string comment '入住时间',
    out_time string comment '离店时间'
)
    comment '旅客入住离店表';

-- Reload the sample data; `insert overwrite` replaces whatever the table held before.
-- Value order: user_id, room_num, in_time, out_time.
insert overwrite table table23
values
    (7,  '2004', '2021-03-05', '2021-03-07'),
    (23, '2010', '2021-03-05', '2021-03-06'),
    (7,  '1003', '2021-03-07', '2021-03-08'),
    (8,  '2014', '2021-03-07', '2021-03-08'),
    (14, '3001', '2021-03-07', '2021-03-10'),
    (18, '3002', '2021-03-08', '2021-03-10'),
    (23, '3020', '2021-03-08', '2021-03-09'),
    (25, '2006', '2021-03-09', '2021-03-12');
2 数据分析
需求:求出每个时间段,有客人在住的房间数量。
如果只考虑一人一房,可以借助于【直播间同时在线人数】统计的思路,相关sql逻辑指路:
sql
-- Occupied-room count per time slice, assuming exactly one guest per room.
with occupancy_event as (
    -- Each stay contributes +1 at check-in and -1 at check-out.
    select
        in_time as `time`,
        1 as flag
    from table23
    union all
    select
        out_time as `time`,
        -1 as flag
    from table23
),
running_total as (
    -- No explicit frame: the default RANGE frame makes all events that share
    -- a `time` value receive the same running total.
    select
        `time`,
        sum(flag) over (order by `time`) as acc_cnt
    from occupancy_event
),
time_slice as (
    -- Collapse to one row per time point, then pair it with the next time point.
    select
        `time` as start_time,
        lead(`time`) over (order by `time`) as end_time,
        acc_cnt
    from running_total
    group by `time`, acc_cnt
)
select
    start_time,
    end_time,
    acc_cnt
from time_slice
-- The last time point has no successor, so it does not open a slice.
where end_time is not null;
上述代码需要考虑一个问题:如果有多个人共住一房间 ,今天退了一个人,明天又退了一个人,后天的时候才退完,虽然这期间一直有人在退,但房间还是有人住的,这种情况是不是也算【有客人在住的房间】? 如果考虑上述情况,需要对累加的状态进行调整,此时需要考虑每个房间中截止当前时间的人数情况。
第一步 :先求出每个房间截至当前时间 的人数累计值 ,作为状态判断辅助条件
sql
-- Step 1: number of guests present in each room as of every event time.
-- Output: one row per (time, room_num) with the cumulative guest count user_cnt.
with room_event as (
    -- Guests arriving (+) and leaving (-) per room and time point.
    select
        in_time as `time`,
        room_num,
        count(user_id) as user_cnt
    from table23
    group by in_time, room_num
    union all
    select
        out_time as `time`,
        room_num,
        -1 * count(user_id) as user_cnt
    from table23
    group by out_time, room_num
),
room_delta as (
    -- Merge arrivals and departures that hit the same room at the same time
    -- into a single net change; otherwise that (time, room) pair would come
    -- out twice with the same cumulative value.
    select
        `time`,
        room_num,
        sum(user_cnt) as net_cnt
    from room_event
    group by `time`, room_num
)
select
    `time`,
    room_num,
    sum(net_cnt) over (partition by room_num order by `time`) as user_cnt
from room_delta;
第二步 :基于累计的每个房间人数进行判断:如果房间有人就标记1,没有人时候就标记为-1。代码为:case when user_cnt > 0 时标记1,否则标记-1
sql
-- Step 2: turn each room's cumulative guest count into an occupancy flag.
-- The flag marks a CHANGE of state, because it is meant to be summed later:
--   +1  room goes from empty to occupied
--   -1  room goes from occupied to empty
--    0  room stays in the same state (e.g. one of several guests leaves)
-- Flagging the state itself (1 while occupied, -1 otherwise) on every event row
-- would count a shared room once per event.
with room_event as (
    -- Guests arriving (+) and leaving (-) per room and time point.
    select
        in_time as `time`,
        room_num,
        count(user_id) as user_cnt
    from table23
    group by in_time, room_num
    union all
    select
        out_time as `time`,
        room_num,
        -1 * count(user_id) as user_cnt
    from table23
    group by out_time, room_num
),
room_delta as (
    -- One net change per (time, room), so same-instant in/out is not duplicated.
    select
        `time`,
        room_num,
        sum(user_cnt) as net_cnt
    from room_event
    group by `time`, room_num
),
room_occupancy as (
    -- Guests present in the room as of each time point.
    select
        `time`,
        room_num,
        sum(net_cnt) over (partition by room_num order by `time`) as user_cnt
    from room_delta
),
room_state as (
    -- Guest count before this event; a room with no earlier event was empty.
    select
        `time`,
        room_num,
        user_cnt,
        coalesce(lag(user_cnt) over (partition by room_num order by `time`), 0) as prev_cnt
    from room_occupancy
)
select
    `time`,
    room_num,
    user_cnt,
    case
        when prev_cnt <= 0 and user_cnt > 0 then 1
        when prev_cnt > 0 and user_cnt <= 0 then -1
        else 0
    end as flag
from room_state;

第三步: 基于第二步的结果,计算截止当前时间点的有人入住的房间数量 acc_cnt,SQL如下:
sql
-- Step 3: running number of occupied rooms (acc_cnt) as of each event time.
-- flag marks a change of room state (+1 empty -> occupied, -1 occupied -> empty,
-- 0 unchanged), so its running sum equals the number of occupied rooms even when
-- several guests share a room or arrive/leave at different times.
with room_event as (
    -- Guests arriving (+) and leaving (-) per room and time point.
    select
        in_time as `time`,
        room_num,
        count(user_id) as user_cnt
    from table23
    group by in_time, room_num
    union all
    select
        out_time as `time`,
        room_num,
        -1 * count(user_id) as user_cnt
    from table23
    group by out_time, room_num
),
room_delta as (
    -- One net change per (time, room), so same-instant in/out is not duplicated.
    select
        `time`,
        room_num,
        sum(user_cnt) as net_cnt
    from room_event
    group by `time`, room_num
),
room_occupancy as (
    -- Guests present in the room as of each time point.
    select
        `time`,
        room_num,
        sum(net_cnt) over (partition by room_num order by `time`) as user_cnt
    from room_delta
),
room_state as (
    -- Guest count before this event; a room with no earlier event was empty.
    select
        `time`,
        room_num,
        user_cnt,
        coalesce(lag(user_cnt) over (partition by room_num order by `time`), 0) as prev_cnt
    from room_occupancy
),
room_flag as (
    select
        `time`,
        room_num,
        user_cnt,
        case
            when prev_cnt <= 0 and user_cnt > 0 then 1
            when prev_cnt > 0 and user_cnt <= 0 then -1
            else 0
        end as flag
    from room_state
)
select
    `time`,
    room_num,
    user_cnt,
    flag,
    -- Default RANGE frame: every row sharing a `time` gets the same total.
    sum(flag) over (order by `time`) as acc_cnt
from room_flag;
第四步: 基于第三步的结果,对时间time 和截止当前时间点的有人入住的房间数量acc_cnt 这两个字段进行去重,SQL如下:
sql
-- Step 4: one row per time point with the number of occupied rooms (acc_cnt).
-- flag marks a change of room state (+1 empty -> occupied, -1 occupied -> empty,
-- 0 unchanged), so a shared room is counted once no matter how many guest
-- events it has.
with room_event as (
    -- Guests arriving (+) and leaving (-) per room and time point.
    select
        in_time as `time`,
        room_num,
        count(user_id) as user_cnt
    from table23
    group by in_time, room_num
    union all
    select
        out_time as `time`,
        room_num,
        -1 * count(user_id) as user_cnt
    from table23
    group by out_time, room_num
),
room_delta as (
    -- One net change per (time, room), so same-instant in/out is not duplicated.
    select
        `time`,
        room_num,
        sum(user_cnt) as net_cnt
    from room_event
    group by `time`, room_num
),
room_occupancy as (
    -- Guests present in the room as of each time point.
    select
        `time`,
        room_num,
        sum(net_cnt) over (partition by room_num order by `time`) as user_cnt
    from room_delta
),
room_state as (
    -- Guest count before this event; a room with no earlier event was empty.
    select
        `time`,
        room_num,
        user_cnt,
        coalesce(lag(user_cnt) over (partition by room_num order by `time`), 0) as prev_cnt
    from room_occupancy
),
room_flag as (
    select
        `time`,
        case
            when prev_cnt <= 0 and user_cnt > 0 then 1
            when prev_cnt > 0 and user_cnt <= 0 then -1
            else 0
        end as flag
    from room_state
),
room_running as (
    -- Default RANGE frame: every row sharing a `time` gets the same total.
    select
        `time`,
        sum(flag) over (order by `time`) as acc_cnt
    from room_flag
)
select
    `time`,
    acc_cnt
from room_running
-- All rows of one time point carry the same acc_cnt, so this keeps one row per time.
group by `time`, acc_cnt;

第五步: 基于第四步的结果,通过lead函数 (对time字段往后偏移一行)求出当前数据的结束时间end_time,SQL如下:
sql
-- Step 5: pair every time point with the next one to form [start_time, end_time)
-- slices, each carrying the number of occupied rooms (acc_cnt).
-- flag marks a change of room state (+1 empty -> occupied, -1 occupied -> empty,
-- 0 unchanged), so a shared room is counted once no matter how many guest
-- events it has.
with room_event as (
    -- Guests arriving (+) and leaving (-) per room and time point.
    select
        in_time as `time`,
        room_num,
        count(user_id) as user_cnt
    from table23
    group by in_time, room_num
    union all
    select
        out_time as `time`,
        room_num,
        -1 * count(user_id) as user_cnt
    from table23
    group by out_time, room_num
),
room_delta as (
    -- One net change per (time, room), so same-instant in/out is not duplicated.
    select
        `time`,
        room_num,
        sum(user_cnt) as net_cnt
    from room_event
    group by `time`, room_num
),
room_occupancy as (
    -- Guests present in the room as of each time point.
    select
        `time`,
        room_num,
        sum(net_cnt) over (partition by room_num order by `time`) as user_cnt
    from room_delta
),
room_state as (
    -- Guest count before this event; a room with no earlier event was empty.
    select
        `time`,
        room_num,
        user_cnt,
        coalesce(lag(user_cnt) over (partition by room_num order by `time`), 0) as prev_cnt
    from room_occupancy
),
room_flag as (
    select
        `time`,
        case
            when prev_cnt <= 0 and user_cnt > 0 then 1
            when prev_cnt > 0 and user_cnt <= 0 then -1
            else 0
        end as flag
    from room_state
),
room_running as (
    -- Default RANGE frame: every row sharing a `time` gets the same total.
    select
        `time`,
        sum(flag) over (order by `time`) as acc_cnt
    from room_flag
),
time_point as (
    -- One row per time point.
    select
        `time`,
        acc_cnt
    from room_running
    group by `time`, acc_cnt
)
select
    `time` as start_time,
    -- end_time is null for the last time point, which has no successor.
    lead(`time`, 1) over (order by `time`) as end_time,
    acc_cnt
from time_point;

:基于第五步的结果,过滤掉end_time 是null的数据,SQL如下:
sql
-- Final query: number of rooms with at least one guest in every time slice.
-- Output: start_time, end_time, acc_cnt (occupied rooms during the slice).
-- flag marks a change of room state (+1 empty -> occupied, -1 occupied -> empty,
-- 0 unchanged). Summing a per-event state flag instead would count a shared
-- room once per guest event.
with room_event as (
    -- Guests arriving (+) and leaving (-) per room and time point.
    select
        in_time as `time`,
        room_num,
        count(user_id) as user_cnt
    from table23
    group by in_time, room_num
    union all
    select
        out_time as `time`,
        room_num,
        -1 * count(user_id) as user_cnt
    from table23
    group by out_time, room_num
),
room_delta as (
    -- One net change per (time, room), so same-instant in/out is not duplicated.
    select
        `time`,
        room_num,
        sum(user_cnt) as net_cnt
    from room_event
    group by `time`, room_num
),
room_occupancy as (
    -- Guests present in the room as of each time point.
    select
        `time`,
        room_num,
        sum(net_cnt) over (partition by room_num order by `time`) as user_cnt
    from room_delta
),
room_state as (
    -- Guest count before this event; a room with no earlier event was empty.
    select
        `time`,
        room_num,
        user_cnt,
        coalesce(lag(user_cnt) over (partition by room_num order by `time`), 0) as prev_cnt
    from room_occupancy
),
room_flag as (
    select
        `time`,
        case
            when prev_cnt <= 0 and user_cnt > 0 then 1
            when prev_cnt > 0 and user_cnt <= 0 then -1
            else 0
        end as flag
    from room_state
),
room_running as (
    -- Default RANGE frame: every row sharing a `time` gets the same total.
    select
        `time`,
        sum(flag) over (order by `time`) as acc_cnt
    from room_flag
),
time_point as (
    -- One row per time point.
    select
        `time`,
        acc_cnt
    from room_running
    group by `time`, acc_cnt
),
time_slice as (
    select
        `time` as start_time,
        lead(`time`, 1) over (order by `time`) as end_time,
        acc_cnt
    from time_point
)
select
    start_time,
    end_time,
    acc_cnt
from time_slice
-- The last time point has no successor, so it does not open a slice.
where end_time is not null;
3 小结
针对【每个时间段的直播同时在线人数】 【每个时间段有客人在住的房间数量】这种类型的题目,本质是对(截至)当前时间点的状态统计。 这种问题常见的解决思路是:对当前时间点的状态打标记flag ,之后基于标记flag 做开窗计算(结合窗口函数)或聚合计算**。**