聚合,取值函数 排序函数 over(partition by 分组字段 order by 字段 row between 起始行 and 结束行)
复制代码
/*创建部门表*/
CREATE TABLE dept
(
deptno INT PRIMARY KEY,
dname VARCHAR(50) comment '部门名称',
loc VARCHAR(50) comment '工作地点'
);
/*创建雇员表*/
CREATE TABLE emp
(
empno INT PRIMARY KEY,
ename VARCHAR(50),
job VARCHAR(50) comment '职位',
mgr INT comment '上级',
hiredate DATE comment '入职时间',
sal DECIMAL(7, 2) comment '薪资',
COMM DECIMAL(7, 2) comment '奖金',
deptno INT comment '所属部门id'
);
/*创建工资等级表*/
CREATE TABLE salgrade
(
grade INT PRIMARY KEY,
losal INT comment '最低薪资',
hisal INT comment '最高薪资'
);
/*插入dept表数据*/
INSERT INTO dept
VALUES (10, '教研部', '北京');
INSERT INTO dept
VALUES (20, '学工部', '上海');
INSERT INTO dept
VALUES (30, '销售部', '广州');
INSERT INTO dept
VALUES (40, '财务部', '武汉');
/*插入emp表数据*/
INSERT INTO emp
VALUES (1009, '曾阿牛', '董事长', NULL, '2001-11-17', 50000, NULL, 10);
INSERT INTO emp
VALUES (1004, '刘备', '经理', 1009, '2001-04-02', 29750, NULL, 20);
INSERT INTO emp
VALUES (1006, '关羽', '经理', 1009, '2001-05-01', 28500, NULL, 30);
INSERT INTO emp
VALUES (1007, '张飞', '经理', 1009, '2001-09-01', 24500, NULL, 10);
INSERT INTO emp
VALUES (1008, '诸葛亮', '分析师', 1004, '2007-04-19', 30000, NULL, 20);
INSERT INTO emp
VALUES (1013, '庞统', '分析师', 1004, '2001-12-03', 30000, NULL, 20);
INSERT INTO emp
VALUES (1002, '黛绮丝', '销售员', 1006, '2001-02-20', 16000, 3000, 30);
INSERT INTO emp
VALUES (1003, '殷天正', '销售员', 1006, '2001-02-22', 12500, 5000, 30);
INSERT INTO emp
VALUES (1005, '谢逊', '销售员', 1006, '2001-09-28', 12500, 14000, 30);
INSERT INTO emp
VALUES (1010, '韦一笑', '销售员', 1006, '2001-09-08', 15000, 0, 30);
INSERT INTO emp
VALUES (1012, '程普', '文员', 1006, '2001-12-03', 9500, NULL, 30);
INSERT INTO emp
VALUES (1014, '黄盖', '文员', 1007, '2002-01-23', 13000, NULL, 10);
INSERT INTO emp
VALUES (1011, '周泰', '文员', 1008, '2007-05-23', 11000, NULL, 20);
INSERT INTO emp
VALUES (1001, '甘宁', '文员', 1013, '2000-12-17', 8000, NULL, 20);
/*插入salgrade表数据*/
INSERT INTO salgrade
VALUES (1, 7000, 12000);
INSERT INTO salgrade
VALUES (2, 12010, 14000);
INSERT INTO salgrade
VALUES (3, 14010, 20000);
INSERT INTO salgrade
VALUES (4, 20010, 30000);
INSERT INTO salgrade
VALUES (5, 30010, 99990);
窗口计算范围的指定
需要使用rows 进行指定 计算行数
范围的确认:
默认情况下没有指定partition by 的字段,范围是全表,如果指定了partition by ,范围是分组内的范围
可以通过rows指定计算行的范围大小 row between 起始行 and 结束行
指定计算行范围后,只对范围内的数据进行计算
指定范围关键字
复制代码
向上无限制: unbounded preceding 向上的行数没有限制
向上指定行数: 行数 preceding
当前行: current row
向下指定行数: 行数 following
向下无限制: unbounded following
不同关键字可以组合成一个范围
between 起始行范围 and 结束行范围
between 2 preceding and 1 following 范围查找是以当前行为基准 计算四行
between current row and 2 following 计算3行
复制代码
select *,count(ename) over(rows between unbounded preceding and current row ) cnt from emp;
select *,count(ename) over(rows between 2 preceding and 2 following ) cnt from emp;
-- 范围顺序要注意,一般起始行写向上查找,结束行写向下查找
# select *,count(ename) over(rows between 2 following and 2 preceding ) cnt from emp;
select *,sum(sal) over(rows between 2 following and 3 following ) cnt from emp;
order by的计算范围说明
复制代码
使用了order by 后会自带计算范围统计数据
rows between unbounded preceding and current row
复制代码
-- 统计每天的用户访问量,及截止当天的总用户访问量
-- 2019-02-11 6 6
-- 2019-02-12 1 7
select distinct dt,count(user_id) over(partition by dt order by dt) cnt ,count(user_id) over(order by dt) as ct from test5;
over中的 order by 是计算数据时,先排序在计算数据、
from 后的 order by 对计算后的结果排序
复制代码
select *,count(user_id) over(partition by dt order by dt) as ct from test5 order by ct
二、CTE语法
CTE语法类似子查询,可以将一个select语句计算的结果当成一个新的临时表使用
复制代码
-- 子查询,将子查询的结果当做表使用
select empno,ename from (
select * from emp) t1;
复制代码
-- 基本用法
with 临时表名 as(查询语句)
select * from 临时表名
-- 多个计算结果保存
with tb1 as(查询语句),,
tb2 as(查询语句 select * from tb1),
tb3 as(查询语句)
.....
select * from tb3 join tb2
复制代码
with tb1 as(select * from emp)
select ename,sal from tb1;
将如下子查询改为cte语法实现
复制代码
SELECT t2.shop, t2.user_id, t2.cnt
FROM (SELECT t1.*,
row_number() over (partition BY t1.shop ORDER BY t1.cnt DESC) rk
FROM (SELECT user_id, shop, count(*) AS cnt
FROM test2
GROUP BY user_id, shop) t1) t2
WHERE rk <= 3;
复制代码
-- CTE语法可以方便代码阅读,将多个计算步骤拆分
with tb1 as(
SELECT user_id, shop, count(*) AS cnt
FROM test2
GROUP BY user_id, shop
),
tb2 as(
SELECT tb1.*,
row_number() over (partition BY tb1.shop ORDER BY tb1.cnt DESC) rk
FROM tb1
)
select * from tb2 where rk <=3;
create table tb_user(
id int,
name string,
hobby string
)row format delimited fields terminated by ',';
select id,name,split(hobby,'-') as hobby from tb_user;
-- explode不能直接和其他字段出现在select中
select explode(split(hobby,'-')) as hobby from tb_user;
-- 使用侧视图的方法和其他字段一起展示
-- lateral view 爆炸函数 表名 as 字段名
select id,name,new_hobby from tb_user lateral view explode(split(hobby,'-')) tb1 as new_hobby;
-- 不能简单使用join进行关联数据
select * from tb_user join (select explode(split(hobby,'-')) as hobby from tb_user) tb1;
collect方法
将一列数据中的多行数据合并成一行
复制代码
-- collect_list 合并后不会去重
select collect_list(name) from tb_user;
-- collect_list 合并会对数据进行去重
select collect_set(name) from tb_user;
用户访问数据
复制代码
create table tb_visit(
id int,
name string,
url string
)row format delimited fields terminated by ',';
select * from tb_visit;
select collect_set(name) from tb_visit;
-- 统计不同用户访问了哪些网址
select name,collect_set(url) from tb_visit group by name;
四、随机抽样
从海量数据中随机抽取部分样本数据进行计算得到的结果趋势和整体趋势一致
格式
复制代码
SELECT ... FROM tbl TABLESAMPLE(BUCKET x OUT OF y ON(colname | rand()))
y表示将表数据随机划分成y份(y个桶)
x表示从y里面随机抽取x份数据作为取样
colname表示随机的依据基于某个列的值
rand()表示随机的依据基于整行
随机抽样的原理,是先将数据进行分桶,在从多个分桶中抽取数据
复制代码
create table tb_stu(
id int,
name string,
age int,
gender int,
dt string
)row format delimited fields terminated by ',';
-- 指定字段进行分桶抽样
select * from tb_stu tablesample (bucket 2 out of 30 on name);
-- 随机抽取
with tb1 as (select *
from tb_stu tablesample (bucket 2 out of 20 on rand()))
select gender,count(*)
from tb1 group by gender;
select *,INPUT__FILE__NAME from brand;
select * from brand where INPUT__FILE__NAME='hdfs://node1:8020/user/hive/warehouse/pydata.db/brand/000001_0';
select *,BLOCK__OFFSET__INSIDE__FILE from tb_stu;
SET hive.exec.rowoffset=true;
select *,ROW__OFFSET__INSIDE__BLOCK from tb_stu;
六、快速建表
基于已经存在的表创建新的表,对原始表复制一个新的表
like语法
将原始表的元数据(也就是表的名字字段等信息复制一份),不会复制行数据
创建之后是一个空表
复制代码
create table 新的表名 like 原始表名
as语法
会将原始数据表的内容全部复制一份到新表中
复制代码
create table 新的表名 as select * from 原始表
复制代码
select * from tb_user;
create table tb_user_new like tb_user;
select * from tb_user_new;
create table tb_user_new_new as select * from tb_user;
select * from tb_user_new_new;
-- 将计算的sql语句保存在视图中
create view sum_view as select sum(if(name is not null,1,0) ) from tb_user;
-- 当查询视图时,就会自动执行视图中的sql语句
select * from sum_view;
mysql中也是可以使用视图
复制代码
-- (1)修改表字段注解和表注解
use hive3;
alter table COLUMNS_V2 modify column COMMENT varchar(256) character set utf8;
alter table TABLE_PARAMS modify column PARAM_VALUE varchar(4000) character set utf8;
-- (2)修改分区字段注解
alter table PARTITION_PARAMS modify column PARAM_VALUE varchar(4000) character set utf8 ;
alter table PARTITION_KEYS modify column PKEY_COMMENT varchar(4000) character set utf8;
-- (3)修改索引注解
alter table INDEX_PARAMS modify column PARAM_VALUE varchar(4000) character set utf8;