目录
[1 groupby(大表分组-局部聚合+全局聚合)](#1 groupby(大表分组-局部聚合+全局聚合))
[2 join(大中表Join - 加salt + 小表膨胀)](#2 join(大中表Join - 加salt + 小表膨胀))
[3 双大表Join - 抽样取倾斜key+BroadJoin](#3 双大表Join - 抽样取倾斜key+BroadJoin)
[4 小结](#4 小结)
1 groupby(大表分组-局部聚合+全局聚合)
示例1:
sql
select label,sum(cnt) as all from
(
select rd,label,sum(1) as cnt from
(
select id,label,round(rand(),2) as rd,value from tmp1
) as tmp
group by rd,label
) as tmp
group by label;
示例2:
sql
select
split(new_source,'\\_')[0] as source
,sum(cnt) as cnt
from
(select
concat(source,'_', rand()*100) as new_source
,count(1) as cnt
from test_table
where day ='2022-01-01'
group by
concat(source,'_', rand()*100)
)tt
group by
split(new_source,'\\_')[0]
2 join(大中表Join - 加salt + 小表膨胀)
示例1:
sql
select label,sum(value) as all from
(
select rd,label,sum(value) as cnt from
(
select tmp1.rd as rd,tmp1.label as label,tmp1.value*tmp2.value as value
from
(
select id,round(rand(),1) as rd,label,value from tmp1
) as tmp1
join
(
select id,rd,label,value from tmp2
lateral view explode(split('0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9',',')) mytable as rd
) as tmp2
on tmp1.rd = tmp2.rd and tmp1.label = tmp2.label
) as tmp1
group by rd,label
) as tmp1
group by label;
示例2:
sql
select
source
,source_name
,sum(cnt) as cnt
from
(select
t1.source
,new_source
,nvl(source_name,'未知') as source_name
,count(imei) as cnt
from
(select
imei
,source
,concat(cast(rand()*10 as int ),'_',source ) as new_source
from test_table_1
where day ='2022-01-01'
) t1
inner join
(
select
source_name
,concat(preflix,'_',source) as new_source
from test_table_1
where day ='2022-01-01'
lateral view explode(split('0,1,2,3,4,5,6,7,8,9,10',','))b as preflix
) t2
on t1.new_source =t2.new_source
group by
t1.source
,new_source
,nvl(source_name,'未知')
) tta
group by
source
,source_name
3 双大表Join - 抽样取倾斜key+BroadJoin
sql
##优化前:
create table test.tmp_table_test_all as
select
imei
,lable_id
,nvl(label_name,'未知')
from tmp_table_1 t1
left join
(select
lable_id
,label_name
from tmp_table_2
where day ='2024-01-01') t2
on t1.lable_id =t2.lable_id
where t1.day ='2024-01-01'
;
## 优化后 :
create table test.tmp_table_test_all_new as
with tmp_table_test_1 as
(select
lable_id
,count(1) as cnt
from tmp_table_1 t1
tablesample(5 percent) --抽样取5%的数据,减少table scan的量
group by lable_id
order by cnt desc
limit 100
)
select
imei
,lable_id
,nvl(label_name,'未知') as label_name
from tmp_table_1 t1
left join tmp_table_test_1 t2
on t1.lable_id =t2.lable_id
left join
(select
lable_id
,label_name
from tmp_table_2
where day ='2024-01-01') t3
on t1.lable_id =t3.lable_id
where t1.day ='2024-01-01' and t2.lable_id is null
union all
select
imei
,lable_id
,nvl(label_name,'未知') as label_name
from tmp_table_1 t1
inner join
(select
lable_id
from tmp_table_test_1 t1
left join tmp_table_2 t2
on t1.lable_id =t2.lable_id
where t2.day ='2024-01-01') t3
on t1.lable_id =t3.lable_id
where t1.day ='2024-01-01'
;
4 小结
本文总结了Hive中数据倾斜的常见处理方法。主要包含:
如果您觉得本文还不错,对你有帮助,那么不妨可以关注一下我的数字化建设实践之路专栏,这里的内容会更精彩。
[专栏 原价99,现在活动价59.9,按照阶梯式增长,还差5个人上升到69.9,最终恢复到原价]( "专栏 原价99,现在活动价59.9,按照阶梯式增长,还差5个人上升到69.9,最终恢复到原价")。
专栏优势:
(1)一次收费持续更新。
(2)实战中总结的SQL技巧,帮助SQLBOY 在SQL语言上有质的飞越,无论你应对业务难题及面试都会游刃有余【全网唯一讲SQL实战技巧,方法独特】
(3)实战中数仓建模技巧总结,让你认识不一样的数仓。【数据建模+业务建模,不一样的认知体系】(如果只懂数据建模而不懂业务建模,数仓体系认知是不全面的)
(4)数字化建设当中遇到难题解决思路及问题思考。
我的 专栏具体链接如下: