查询响应
hive从3.0版本开始移除了索引。由于客户端存在抽样机制,查询结果超过五十行时,oracle需要在查询语句外层添加select count(*) from (query语句),并与原query语句的耗时对比,才能获取到准确的查询时间。详见附录关于抽样的说明。
查询时,oracle客户端是SQL Developer,hive客户端是DBeaver。
单表
不限制字段数量,使用 * 查询所有字段
等值查询
单条查询 code或者pk
oracle
select * from t_od_bd_stordoc WHERE code='1020002';
SELECT * FROM t_od_org_orgs WHERE pk_org = '1001A1100000000LX48O';
SELECT * FROM t_od_bd_material WHERE code = '302550991000019';
SELECT * FROM t_od_ic_flow WHERE pk_flow = '1001A11000000003KBHH';
SELECT * FROM t_od_ic_flow_10 WHERE pk_flow = '1001A11000000003KBHH';
SELECT * FROM t_od_ic_flow_100 WHERE pk_flow = '1001A11000000003KBHH';
create unique index uninx_pk_flow on t_od_ic_flow (pk_flow);
create index inx_pk_flow_10 on t_od_ic_flow_10 (pk_flow);
create index inx_pk_flow_100 on t_od_ic_flow_100 (pk_flow);
drop index uninx_pk_flow;
drop index inx_pk_flow_10;
drop index inx_pk_flow_100;
hive
select * from ht_od_bd_stordoc WHERE code='1020002';
SELECT * FROM ht_od_org_orgs WHERE pk_org = '1001A1100000000LX48O';
SELECT * FROM ht_od_bd_material WHERE code = '302550991000019';
SELECT * FROM ht_od_ic_flow WHERE pk_flow = '1001A11000000003KBHH';
|-----------------|----------|------|---------|------|--------|
| 表名 | 返回量 | 无索引 || 用索引 ||
| 表名 | 返回量 | hive | oracle | hive | oracle |
| XXX_BD_STORDOC | 行1列45 | 3s | 0.012s | | |
| XXX_ORG_ORGS | 行1列130 | 2s | 0.015s | | |
| XXX_BD_MATERIAL | 行1列86 | 6s | 0.037s | | |
| XXX_IC_FLOW | 行1列166 | 8s | 0.534s | | 0.043s |
| XXX_IC_FLOW_10 | 行10列166 | | 11.261s | | 0.047s |
| XXX_IC_FLOW_100 | 行100列166 | | 53.61s | | 0.044s |
结论:当前硬件配置下,表行数在百万以内,oracle即使没有索引,响应速度依然能够很快。百万、千万记录的表,走索引的查询和不走索引的查询,响应速度差别巨大。
hive表行数上万后,响应时间过长。
模糊查询
少量查询 code模糊查询
oracle
select * from t_od_bd_stordoc WHERE code LIKE '10%'
SELECT * FROM t_od_org_orgs WHERE code like '100%'
SELECT * FROM t_od_bd_material WHERE code like '30255099%'
SELECT * FROM t_od_ic_flow WHERE vbillcode like 'CR202308%'
SELECT * FROM t_od_ic_flow_10 WHERE vbillcode like 'CR202308%'
SELECT * FROM t_od_ic_flow_100 WHERE vbillcode like 'CR202308%'
CREATE INDEX inx_code ON T_OD_BD_MATERIAL (CODE);
CREATE INDEX inx_vbillcode ON T_OD_IC_FLOW (vbillcode );
CREATE INDEX inx_vbillcode_10 ON T_OD_IC_FLOW_10 (vbillcode );
CREATE INDEX inx_vbillcode_100 ON T_OD_IC_FLOW_100 (vbillcode );
drop index inx_vbillcode;
drop index inx_vbillcode_10;
drop index inx_vbillcode_100;
drop index inx_code;
hive
select * from ht_od_bd_stordoc WHERE code LIKE '10%'
SELECT * FROM ht_od_org_orgs WHERE code like '100%'
SELECT * FROM ht_od_bd_material WHERE code like '30255099%'
SELECT * FROM ht_od_ic_flow WHERE vbillcode like 'CR202308%'
|-----------------|-------------|------|----------|------|--------|
| 表名 | 返回量 | 无索引 || 用索引 ||
| 表名 | 返回量 | hive | oracle | hive | oracle |
| XXX_BD_STORDOC | 行3列45 | 4s | 0.005s | | |
| XXX_ORG_ORGS | 行13列130 | 8s | 0.024s | | |
| XXX_BD_MATERIAL | 行191列86 | 10s | 0.120s | | 0.064s |
| XXX_IC_FLOW | 行1834列166 | 4s | 0.682s | | 0.147s |
| XXX_IC_FLOW_10 | 行18340列166 | | 10.754s | | 0.029s |
| XXX_IC_FLOW_100 | 行183400列166 | | 102.334s | | 0.226s |
结论:类似等值查询。
范围查询
少量查询 code范围查询
oracle
select * from t_od_bd_stordoc WHERE code >= 'DK01' AND code <= 'DK11'
SELECT * FROM t_od_org_orgs WHERE code >= '10100' AND code <= '10200'
SELECT * FROM t_od_bd_material WHERE code >= '201515201000003' AND code <= '201515901000003'
SELECT * FROM t_od_ic_flow WHERE vbillcode >= 'CR2023080100000001' AND vbillcode <= 'CR2023083000000001'
SELECT * FROM t_od_ic_flow_10 WHERE vbillcode >= 'CR2023080100000001' AND vbillcode <= 'CR2023083000000001'
SELECT * FROM t_od_ic_flow_100 WHERE vbillcode >= 'CR2023080100000001' AND vbillcode <= 'CR2023083000000001'
hive
select * from ht_od_bd_stordoc WHERE code >= 'DK01' AND code <= 'DK11'
SELECT * FROM ht_od_org_orgs WHERE code >= '10100' AND code <= '10200'
SELECT * FROM ht_od_bd_material WHERE code >= '201515201000003' AND code <= '201515901000003'
SELECT * FROM ht_od_ic_flow WHERE vbillcode >= 'CR2023080100000001' AND vbillcode <= 'CR2023083000000001'
|-----------------|-------------|---------|---------|------|---------------|
| 表名 | 返回量 | 无索引 || 用索引 ||
| 表名 | 返回量 | hive | oracle | hive | oracle |
| XXX_BD_STORDOC | 行6列45 | 4s | 0.009s | | |
| XXX_ORG_ORGS | 行69列130 | 6s | 0.021s | | |
| XXX_BD_MATERIAL | 行999列86 | 13.115s | 0.193s | | 0.022s |
| XXX_IC_FLOW | 行1668列166 | 6s | 0.625s | | 0.232s |
| XXX_IC_FLOW_10 | 行16680列166 | | 10.065s | | 0.148s/0.011s |
| XXX_IC_FLOW_100 | 行166800列166 | | 90.398s | | 0.145s/0.045s |
单表2
根据分析类查询的特点,一般仪表板select 3-5个字段足矣(多维查询则需要更多字段),所以这里统计查询3-5个字段的情况。
等值查询
单条查询 code或者pk
oracle
select pk_stordoc,code,name from t_od_bd_stordoc WHERE code='1020002';
SELECT pk_org,code,name FROM t_od_org_orgs WHERE pk_org = '1001A1100000000LX48O';
SELECT pk_material,code,name FROM t_od_bd_material WHERE code = '302550991000019';
SELECT cmaterialoid,cwarehouseid,dbilldate,ninnum,ncostmny FROM t_od_ic_flow WHERE pk_flow = '1001A11000000003KBHH';
SELECT cmaterialoid,cwarehouseid,dbilldate,ninnum,ncostmny FROM t_od_ic_flow_10 WHERE pk_flow = '1001A11000000003KBHH';
SELECT cmaterialoid,cwarehouseid,dbilldate,ninnum,ncostmny FROM t_od_ic_flow_100 WHERE pk_flow = '1001A11000000003KBHH';
create unique index uninx_pk_flow on t_od_ic_flow (pk_flow);
create index inx_pk_flow_10 on t_od_ic_flow_10 (pk_flow);
create index inx_pk_flow_100 on t_od_ic_flow_100 (pk_flow);
drop index uninx_pk_flow;
drop index inx_pk_flow_10;
drop index inx_pk_flow_100;
hive
select pk_stordoc,code,name from ht_od_bd_stordoc WHERE code='1020002';
SELECT pk_org,code,name FROM ht_od_org_orgs WHERE pk_org = '1001A1100000000LX48O';
SELECT pk_material,code,name FROM ht_od_bd_material WHERE code = '302550991000019';
SELECT cmaterialoid,cwarehouseid,dbilldate,ninnum,ncostmny FROM ht_od_ic_flow WHERE pk_flow = '1001A11000000003KBHH';
|-----------------|----------|---------------|----------|------|--------|
| 表名 | 返回量 | 无索引 || 用索引 ||
| 表名 | 返回量 | hive | oracle | hive | oracle |
| XXX_BD_STORDOC | 行1列3 | 0.982s/0.264s | 0.117s | | |
| XXX_ORG_ORGS | 行1列3 | 1s/0.272s | 0.032s | | |
| XXX_BD_MATERIAL | 行1列3 | 1s/0.855s | 0.337s | | 0.02s |
| XXX_IC_FLOW | 行1列5 | 7s/6s | 2.816s | | 0.035s |
| XXX_IC_FLOW_10 | 行10列5 | | 17.308s | | 0.024s |
| XXX_IC_FLOW_100 | 行100列5 | | 108.944s | | 0.042s |
结论:当前硬件配置下,表行数在百万以内,oracle即使没有索引,响应速度依然能够很快。百万、千万记录的表,走索引的查询和不走索引的查询,响应速度差别巨大。
hive表行数上万后,响应时间过长。
模糊查询
少量查询 code模糊查询
oracle
select pk_stordoc,code,name from t_od_bd_stordoc WHERE code LIKE '10%'
SELECT pk_org,code,name FROM t_od_org_orgs WHERE code like '100%'
SELECT pk_material,code,name FROM t_od_bd_material WHERE code like '30255099%'
SELECT cmaterialoid,cwarehouseid,dbilldate,ninnum,ncostmny FROM t_od_ic_flow WHERE vbillcode like 'CR202308%'
SELECT cmaterialoid,cwarehouseid,dbilldate,ninnum,ncostmny FROM t_od_ic_flow_10 WHERE vbillcode like 'CR202308%'
SELECT cmaterialoid,cwarehouseid,dbilldate,ninnum,ncostmny FROM t_od_ic_flow_100 WHERE vbillcode like 'CR202308%'
CREATE INDEX inx_code ON T_OD_BD_MATERIAL (CODE);
CREATE INDEX inx_vbillcode ON T_OD_IC_FLOW (vbillcode );
select count(*) cc from (SELECT * FROM t_od_ic_flow_10 WHERE vbillcode like 'CR202308%')
select count(*) cc from (SELECT * FROM t_od_ic_flow_100 WHERE vbillcode like 'CR202308%')
hive
select * from ht_od_bd_stordoc WHERE code LIKE '10%'
SELECT * FROM ht_od_org_orgs WHERE code like '100%'
SELECT * FROM ht_od_bd_material WHERE code like '30255099%'
SELECT * FROM ht_od_ic_flow WHERE vbillcode like 'CR202308%'
SELECT * FROM ht_od_ic_flow_10 WHERE vbillcode like 'CR202308%'
SELECT * FROM ht_od_ic_flow_100 WHERE vbillcode like 'CR202308%'
select count(*) cc from (SELECT * FROM ht_od_ic_flow_10 WHERE vbillcode like 'CR202308%')
select count(*) cc from (SELECT * FROM ht_od_ic_flow_100 WHERE vbillcode like 'CR202308%')
|-----------------|-------------|------|----------|------|--------|
| 表名 | 返回量 | 无索引 || 用索引 ||
| 表名 | 返回量 | hive | oracle | hive | oracle |
| XXX_BD_STORDOC | 行3列3 | | 0.016s | | |
| XXX_ORG_ORGS | 行13列3 | | 0.022s | | |
| XXX_BD_MATERIAL | 行191列3 | | 0.13s | | 0.018s |
| XXX_IC_FLOW | 行1834列5 | | 0.901s | | 0.031s |
| XXX_IC_FLOW_10 | 行18340列5 | | 11.723s | | 0.049s |
| XXX_IC_FLOW_100 | 行183400列5 | | 117.365s | | 0.29s |
结论:类似等值查询。
范围查询
少量查询 code范围查询
oracle
drop index inx_code
drop index inx_vbillcode
select pk_stordoc,code,name from t_od_bd_stordoc WHERE code >= 'DK01' AND code <= 'DK11'
SELECT pk_org,code,name FROM t_od_org_orgs WHERE code >= '10100' AND code <= '10200'
SELECT pk_material,code,name FROM t_od_bd_material WHERE code >= '201515201000003' AND code <= '201515901000003'
SELECT cmaterialoid,cwarehouseid,dbilldate,ninnum,ncostmny FROM t_od_ic_flow WHERE vbillcode >= 'CR2023080100000001' AND vbillcode <= 'CR2023083000000001'
SELECT cmaterialoid,cwarehouseid,dbilldate,ninnum,ncostmny FROM t_od_ic_flow_10 WHERE vbillcode >= 'CR2023080100000001' AND vbillcode <= 'CR2023083000000001'
SELECT cmaterialoid,cwarehouseid,dbilldate,ninnum,ncostmny FROM t_od_ic_flow_100 WHERE vbillcode >= 'CR2023080100000001' AND vbillcode <= 'CR2023083000000001'
CREATE INDEX inx_code ON T_OD_BD_MATERIAL (CODE);
CREATE INDEX inx_vbillcode ON T_OD_IC_FLOW (vbillcode );
hive
select pk_stordoc,code,name from ht_od_bd_stordoc WHERE code >= 'DK01' AND code <= 'DK11'
SELECT pk_org,code,name FROM ht_od_org_orgs WHERE code >= '10100' AND code <= '10200'
SELECT pk_material,code,name FROM ht_od_bd_material WHERE code >= '201515201000003' AND code <= '201515901000003'
SELECT cmaterialoid,cwarehouseid,dbilldate,ninnum,ncostmny FROM ht_od_ic_flow WHERE vbillcode >= 'CR2023080100000001' AND vbillcode <= 'CR2023083000000001'
|-----------------|-------------|-----------|---------|------|--------|
| 表名 | 返回量 | 无索引 || 用索引 ||
| 表名 | 返回量 | hive | oracle | hive | oracle |
| XXX_BD_STORDOC | 行6列3 | 1s/0.139s | 0.043s | | |
| XXX_ORG_ORGS | 行69列3 | 1s/0.343s | 0.039s | | |
| XXX_BD_MATERIAL | 行999列3 | 1s/0.502s | 0.091s | | 0.031s |
| XXX_IC_FLOW | 行1668列5 | 4s/3s | 0.907s | | 0.031s |
| XXX_IC_FLOW_10 | 行16680列5 | | 9.196s | | 0.029s |
| XXX_IC_FLOW_100 | 行166800列5 | | 91.147s | | 0.08s |
聚合查询
oracle
drop index inx_code
drop index inx_vbillcode
select count(*) from t_od_bd_stordoc WHERE code >= 'DK01' AND code <= 'DK11'
SELECT count(*) FROM t_od_org_orgs WHERE code >= '10100' AND code <= '10200'
SELECT count(*) FROM t_od_bd_material WHERE code >= '201515201000003' AND code <= '201515901000003'
SELECT sum(ncostmny),sum(ninnum) FROM t_od_ic_flow WHERE vbillcode >= 'CR2023080100000001' AND vbillcode <= 'CR2023083000000001'
SELECT sum(ncostmny),sum(ninnum) FROM t_od_ic_flow_10 WHERE vbillcode >= 'CR2023080100000001' AND vbillcode <= 'CR2023083000000001'
SELECT sum(ncostmny),sum(ninnum) FROM t_od_ic_flow_100 WHERE vbillcode >= 'CR2023080100000001' AND vbillcode <= 'CR2023083000000001'
CREATE INDEX inx_vbillcode ON T_OD_IC_FLOW (vbillcode );
hive
select count(*) from ht_od_bd_stordoc WHERE code >= 'DK01' AND code <= 'DK11'
SELECT count(*) FROM ht_od_org_orgs WHERE code >= '10100' AND code <= '10200'
SELECT count(*) FROM ht_od_bd_material WHERE code >= '201515201000003' AND code <= '201515901000003'
SELECT sum(ncostmny),sum(ninnum) FROM ht_od_ic_flow WHERE vbillcode >= 'CR2023080100000001' AND vbillcode <= 'CR2023083000000001'
|-----------------|------|------|---------|------|--------|
| 表名 | 返回量 | 无索引 || 用索引 ||
| 表名 | 返回量 | hive | oracle | hive | oracle |
| XXX_BD_STORDOC | 行1列1 | | 0.038s | | |
| XXX_ORG_ORGS | 行1列1 | | 0.024s | | |
| XXX_BD_MATERIAL | 行1列1 | | 0.118s | | 0.032s |
| XXX_IC_FLOW | 行1列2 | | 0.401s | | 0.059s |
| XXX_IC_FLOW_10 | 行1列2 | | 9.371s | | 0.454s |
| XXX_IC_FLOW_100 | 行1列2 | | 91.001s | | 58.89s |
结论:hive查询ht_od_ic_flow时,有时会因内存崩溃而报错,详见hive聚合查询异常。oracle数据量达到千万级后,IO读写成为瓶颈,所以即使加索引,耗时依然很长,甚至会超过不加索引时的耗时。同一会话多次执行同一sql时会命中缓存,耗时大幅降低,千万级的表也能降低至1.62s;即使清除缓存,同样的sql语句耗时依然在1.xxxs至3.xxxs之间,应该还有其他缓存机制。将SELECT sum(ncostmny),sum(ninnum) FROM ht_od_ic_flow_100 WHERE vbillcode >= 'CR2023080100000001' AND vbillcode <= 'CR2023083000000001'中vbillcode的起始值改为CR202308xx00000001,速度依然很快;再往前调两三天(0731、0730、0729),速度依然很快,维持在1.xxxs,至多不超过10s。
分析查询
oracle
drop index inx_vbillcode
SELECT pk_org,cmaterialoid,pk_flow,ninnum
,sum(ninnum) OVER (PARTITION BY cmaterialoid )
,CASE WHEN sum(ninnum) OVER (PARTITION BY cmaterialoid )!=0
THEN ninnum / sum(ninnum) OVER (PARTITION BY cmaterialoid )
END pp
FROM t_od_ic_flow WHERE vbillcode >= 'CR2023080100000001'
AND vbillcode <= 'CR2023083000000001'
SELECT pk_org,cmaterialoid,pk_flow,ninnum
,sum(ninnum) OVER (PARTITION BY cmaterialoid )
,CASE WHEN sum(ninnum) OVER (PARTITION BY cmaterialoid )!=0
THEN ninnum / sum(ninnum) OVER (PARTITION BY cmaterialoid )
END pp
FROM t_od_ic_flow_10 WHERE vbillcode >= 'CR2023080100000001'
AND vbillcode <= 'CR2023083000000001'
SELECT pk_org,cmaterialoid,pk_flow,ninnum
,sum(ninnum) OVER (PARTITION BY cmaterialoid )
,CASE WHEN sum(ninnum) OVER (PARTITION BY cmaterialoid )!=0
THEN ninnum / sum(ninnum) OVER (PARTITION BY cmaterialoid )
END pp
FROM t_od_ic_flow_100 WHERE vbillcode >= 'CR2023080100000001'
AND vbillcode <= 'CR2023083000000001'
CREATE INDEX inx_vbillcode ON T_OD_IC_FLOW (vbillcode );
hive
SELECT PK_ORG,CMATERIALOID,PK_FLOW,NINNUM,SUM(NINNUM) OVER (PARTITION BY CMATERIALOID )
,CASE WHEN SUM(NINNUM) OVER (PARTITION BY CMATERIALOID )!=0
THEN NINNUM / SUM(NINNUM) OVER (PARTITION BY CMATERIALOID )
END pp
FROM ht_od_ic_flow WHERE VBILLCODE >= 'CR2023080100000001'
AND VBILLCODE <= 'CR2023083000000001'
|-----------------|-----------|------|---------|------|---------|
| 表名 | 返回量 | 无索引 || 用索引 ||
| 表名 | 返回量 | hive | oracle | hive | oracle |
| XXX_IC_FLOW | 行1668列6 | | 0.375s | | 0.261s |
| XXX_IC_FLOW_10 | 行16680列6 | | 9.016s | | 6.53s |
| XXX_IC_FLOW_100 | 行166800列6 | | 91.337s | | 66.474s |
多表
多表连接查询
分析查询
oracle
drop index inx_vbillcode
SELECT t1.pk_org,t2.code orgcode,t2.name orgname
,t1.cmaterialoid,t3.code materialcode,t3.name materialname
,t1.pk_flow,t1.ninnum
,sum(t1.ninnum) OVER (PARTITION BY t1.cmaterialoid )
,CASE WHEN sum(t1.ninnum) OVER (PARTITION BY t1.cmaterialoid )!=0
THEN t1.ninnum / sum(t1.ninnum) OVER (PARTITION BY t1.cmaterialoid )
END pp
FROM t_od_ic_flow t1
inner JOIN t_od_org_orgs t2 ON t1.pk_org = t2.pk_org
inner JOIN t_od_bd_material t3 ON t1.cmaterialoid = t3.pk_material
WHERE vbillcode >= 'CR2023080100000001'
AND vbillcode <= 'CR2023083000000001'
CREATE INDEX inx_vbillcode ON T_OD_IC_FLOW (vbillcode );
CREATE INDEX inx_vbillcode_10 ON T_OD_IC_FLOW_10 (vbillcode );
CREATE INDEX inx_vbillcode_100 ON T_OD_IC_FLOW_100 (vbillcode );
hive
SELECT t1.pk_org,t2.code orgcode,t2.name orgname
,t1.cmaterialoid,t3.code materialcode,t3.name materialname
,t1.pk_flow,t1.ninnum
,sum(t1.ninnum) OVER (PARTITION BY t1.cmaterialoid )
,CASE WHEN sum(t1.ninnum) OVER (PARTITION BY t1.cmaterialoid )!=0
THEN t1.ninnum / sum(t1.ninnum) OVER (PARTITION BY t1.cmaterialoid )
END pp
FROM ht_od_ic_flow t1
inner JOIN ht_od_org_orgs t2 ON t1.pk_org = t2.pk_org
inner JOIN ht_od_bd_material t3 ON t1.cmaterialoid = t3.pk_material
WHERE vbillcode >= 'CR2023080100000001'
AND vbillcode <= 'CR2023083000000001'
|-----------------|------------|------|---------|------|---------|
| 表名 | 返回量 | 无索引 || 用索引 ||
| 表名 | 返回量 | hive | oracle | hive | oracle |
| XXX_IC_FLOW | 行1668列10 | | 0.555s | | 0.266s |
| XXX_IC_FLOW_10 | 行16680列10 | | 9.358s | | 0.338s |
| XXX_IC_FLOW_100 | 行166800列10 | | 92.048s | | 92.019s |
结论:oracle数据达到千万级时,有无索引,响应时间几乎没有差别,此时瓶颈应该在硬件配置上。