hive分区
在hdfs上创建文件夹
为什么分区
1、提高查询效率,分区不用全表扫描;常见用天区分
语法
partitioned by
效率很慢
切换引擎,spark
语法
string数据,需要添加单''
sql
-- Load a local file into a table partition; the file ends up in that
-- partition's directory on HDFS.
-- part1 has a single partition key (dt).
load data local inpath '/home/hivedata/user11.txt' into table part1 partition (dt='2026-03-19');
-- part3 has three partition keys (year, month, day), so the partition spec
-- names all three. (part1 only has dt and would reject this spec.)
load data local inpath '/home/hivedata/user11.txt' into table part3 partition (year='2026', month='12', day='31');
sql
-- Create a partitioned table. dt is the partition key declared in
-- `partitioned by`; it is not listed among the regular columns.
-- Rows are read from comma-delimited text.
-- NOTE(review): `gae` looks like a typo for `age` — confirm before renaming.
create table part1 (
    id   int,
    name string,
    gae  int
)
partitioned by (dt string)
row format delimited
fields terminated by ',';
-- Multi-level partitioned table: one partition key each for year, month and
-- day. Rows are read from comma-delimited text.
create table part3 (id int, name string, gae int)
partitioned by (
    year  string,
    month string,
    day   string
)
row format delimited fields terminated by ',';
字段名称是否区分大小写
不区分
分区的增删改查
sql
-- Three ways to read the two partitions 20260823 and 20260824 of part1.
-- 1) union of two single-partition queries; this form is converted into a
--    MapReduce job.
select * from part1 where dt = '20260823'
union
select * from part1 where dt = '20260824';
-- 2) or-ed partition predicates.
select * from part1 where dt = '20260823' or dt = '20260824';
-- 3) in-list; the values must be separated by a comma.
select * from part1 where dt in ('20260823', '20260824');
-- List the partitions that currently exist for part1.
show partitions part1;
-- Add a new partition to part1.
alter table part1 add partition (dt = '20260423');
-- Inspect the table design: columns and partition keys.
desc part1;
-- Same, in the more detailed `formatted` layout.
desc formatted part1;
分区关联数据的三种方式
sql
-- Attach data to a partition by uploading the file first and repairing the
-- table metadata afterwards.
create table part5 (
    id   int,
    name string,
    gae  int
)
partitioned by (year string, month string, day string)
row format delimited
fields terminated by ',';
-- Create the partition directory, then upload the file straight into it.
-- This bypasses Hive, so the metastore does not know the partition yet.
dfs -mkdir -p /user/hive/warehouse/yhdb.db/part5/year=2023/month=08/day=26;
dfs -put /home/hivedata/user1.txt /user/hive/warehouse/yhdb.db/part5/year=2023/month=08/day=26;
-- Repair step: register the partition directories that exist on HDFS but are
-- missing from the metastore. Without it the uploaded rows are not queryable.
msck repair table part5;
-- Alternative: `load data` places the file and registers the partition in a
-- single statement, so no repair is needed.
load data local inpath '/home/hivedata/user.txt' into table
part5 partition (year='2026', month='03', day='21');
分区的种类
静态分区:先创建,再加载数据
动态分区:直接加载数据,根据数据动态创建分区
混合分区:有静态,还有动态,某个字段指定;
sql
-- Session settings for dynamic partitioning.
-- Turn dynamic partitioning on.
set hive.exec.dynamic.partition=true;
-- Switch the dynamic-partition mode to nonstrict.
-- NOTE(review): nonstrict presumably allows every partition column to be
-- dynamic, while strict requires at least one static one — confirm against
-- the Hive configuration docs.
set hive.exec.dynamic.partition.mode=nonstrict;
hive函数
sql
-- List the available functions (the keyword is plural: `functions`).
show functions;
-- Show the description of one function; `show` does not take a function
-- name this way, `desc function` does.
desc function abs;
sql
-- abs: absolute value.
select abs(-7);
-- Date functions:
--   current_date(), current_timestamp(), unix_timestamp();
-- lower()            convert to lower case
-- upper()            convert to upper case
-- length()           length of a string
-- concat()           concatenate strings
-- concat_ws()        concatenate with a separator, e.g. '-'
-- substr(), 3        take a substring, e.g. starting at position 3
-- replace("123124", "2", ",")   replaces every "2" with ","  ->  1,31,4
-- split("", "")      split a string by a delimiter
-- size()             number of elements in an array
-- round()            round to an integer
-- case when          conditional expression
-- get_json_object()  extract a value from a JSON string
-- parse_url()        extract a part of a URL
-- if(1=1, 1, 2)      inline conditional; returns 1 here because 1=1 is true
-- pmod(3, 2) = 1     positive modulo
窗口函数
sql
-- over() defines the window. With nothing inside it, the aggregate is
-- computed over all rows of the result, and the total is repeated on every
-- detail row. The alias is an identifier, so it is quoted with backticks,
-- not single quotes.
select *, count(1) over () as `订单量` from t_order;
-- The window only sees rows that pass the where clause, so this counts the
-- orders of 2018-01.
select *, count(1) over () as `订单量` from t_order where substr(orderdate, 1, 7) = '2018-01';
-- Grouping inside the window: distribute by name, substr(orderdate, 1, 7)
-- gives the total cost per customer per year-month on every detail row.
select *, sum(cost) over (distribute by name, substr(orderdate, 1, 7)) from t_order;
-- Per-customer detail rows: adding sort by turns the sum into a running
-- total within each group, accumulated in descending orderdate order.
select *, sum(cost) over (distribute by name, substr(orderdate, 1, 7) sort by orderdate desc) from t_order;
-- Same shape written with partition by / order by.
-- NOTE(review): month(orderdate) groups by month number only, so the same
-- month of different years falls into one group — confirm this is intended.
select *, sum(cost) over (partition by name, month(orderdate) order by orderdate desc) from t_order;
#