查询响应
Hive从3.0版本开始移除了索引。由于客户端抽样机制,返回结果超过五十行时,Oracle要在查询语句外层添加select count(*) from (query语句),再与原query语句的耗时对比,才能获取到准确的查询时间。详见附录关于抽样的说明。
查询时,Oracle客户端是SQL Developer,Hive客户端是DBeaver。
单表
不限制字段数量,使用 * 查询所有字段
等值查询
单条查询 code或者pk
oracle
select * from t_od_bd_stordoc WHERE code='1020002';
SELECT * FROM t_od_org_orgs WHERE pk_org = '1001A1100000000LX48O';
SELECT * FROM t_od_bd_material WHERE code = '302550991000019';
SELECT * FROM t_od_ic_flow WHERE pk_flow = '1001A11000000003KBHH';
SELECT * FROM t_od_ic_flow_10 WHERE pk_flow = '1001A11000000003KBHH';
SELECT * FROM t_od_ic_flow_100 WHERE pk_flow = '1001A11000000003KBHH';
create unique index uninx_pk_flow on t_od_ic_flow (pk_flow);
create index inx_pk_flow_10 on t_od_ic_flow_10 (pk_flow);
create index inx_pk_flow_100 on t_od_ic_flow_100 (pk_flow);
drop index uninx_pk_flow;
drop index inx_pk_flow_10;
drop index inx_pk_flow_100;
hive
select * from ht_od_bd_stordoc WHERE code='1020002';
SELECT * FROM ht_od_org_orgs WHERE pk_org = '1001A1100000000LX48O';
SELECT * FROM ht_od_bd_material WHERE code = '302550991000019';
SELECT * FROM ht_od_ic_flow WHERE pk_flow = '1001A11000000003KBHH';
|-----------------|----------|------|---------|------|--------|
| 表名              | 返回量      | 无索引           || 用索引          ||
| 表名              | 返回量      | hive | oracle  | hive | oracle |
| XXX_BD_STORDOC  | 行1列45    | 3s   | 0.012s  |      |        |
| XXX_ORG_ORGS    | 行1列130   | 2s   | 0.015s  |      |        |
| XXX_BD_MATERIAL | 行1列86    | 6s   | 0.037s  |      |        |
| XXX_IC_FLOW     | 行1列166   | 8s   | 0.534s  |      | 0.043s |
| XXX_IC_FLOW_10  | 行10列166  |      | 11.261s |      | 0.047s |
| XXX_IC_FLOW_100 | 行100列166 |      | 53.61s  |      | 0.044s |
结论:当前硬件配置下,表行数在百万以内,oracle即使没有索引,响应速度依然能够很快。百万、千万记录的表,走索引的查询和不走索引的查询,响应速度差别巨大。
Hive表行数上万后,响应时间过长。
模糊查询
少量查询 code模糊查询
oracle
select * from t_od_bd_stordoc WHERE code LIKE '10%'
SELECT * FROM t_od_org_orgs WHERE code like '100%'
SELECT * FROM t_od_bd_material WHERE code like '30255099%'
SELECT * FROM t_od_ic_flow WHERE vbillcode like 'CR202308%'
SELECT * FROM t_od_ic_flow_10 WHERE vbillcode like 'CR202308%'
SELECT * FROM t_od_ic_flow_100 WHERE vbillcode like 'CR202308%'
CREATE INDEX inx_code ON T_OD_BD_MATERIAL (CODE);
CREATE INDEX inx_vbillcode ON T_OD_IC_FLOW (vbillcode );
CREATE INDEX inx_vbillcode_10 ON T_OD_IC_FLOW_10 (vbillcode );
CREATE INDEX inx_vbillcode_100 ON T_OD_IC_FLOW_100 (vbillcode );
drop index inx_vbillcode;
drop index inx_vbillcode_10;
drop index inx_vbillcode_100;
drop index inx_code;
hive
select * from ht_od_bd_stordoc WHERE code LIKE '10%'
SELECT * FROM ht_od_org_orgs WHERE code like '100%'
SELECT * FROM ht_od_bd_material WHERE code like '30255099%'
SELECT * FROM ht_od_ic_flow WHERE vbillcode like 'CR202308%'
|-----------------|-------------|------|----------|------|--------|
| 表名              | 返回量         | 无索引            || 用索引          ||
| 表名              | 返回量         | hive | oracle   | hive | oracle |
| XXX_BD_STORDOC  | 行3列45       | 4s   | 0.005s   |      |        |
| XXX_ORG_ORGS    | 行13列130     | 8s   | 0.024s   |      |        |
| XXX_BD_MATERIAL | 行191列86     | 10s  | 0.120s   |      | 0.064s |
| XXX_IC_FLOW     | 行1834列166   | 4s   | 0.682s   |      | 0.147s |
| XXX_IC_FLOW_10  | 行18340列166  |      | 10.754s  |      | 0.029s |
| XXX_IC_FLOW_100 | 行183400列166 |      | 102.334s |      | 0.226s |
结论:类似等值查询。
范围查询
少量查询 code范围查询
oracle
select * from t_od_bd_stordoc WHERE code >= 'DK01' AND code <= 'DK11'
SELECT * FROM t_od_org_orgs WHERE code >= '10100' AND code <= '10200'
SELECT * FROM t_od_bd_material WHERE code >= '201515201000003' AND code <= '201515901000003'
SELECT * FROM t_od_ic_flow WHERE vbillcode >= 'CR2023080100000001' AND vbillcode <= 'CR2023083000000001'
SELECT * FROM t_od_ic_flow_10 WHERE vbillcode >= 'CR2023080100000001' AND vbillcode <= 'CR2023083000000001'
SELECT * FROM t_od_ic_flow_100 WHERE vbillcode >= 'CR2023080100000001' AND vbillcode <= 'CR2023083000000001'
hive
select * from ht_od_bd_stordoc WHERE code >= 'DK01' AND code <= 'DK11'
SELECT * FROM ht_od_org_orgs WHERE code >= '10100' AND code <= '10200'
SELECT * FROM ht_od_bd_material WHERE code >= '201515201000003' AND code <= '201515901000003'
SELECT * FROM ht_od_ic_flow WHERE vbillcode >= 'CR2023080100000001' AND vbillcode <= 'CR2023083000000001'
|-----------------|-------------|---------|---------|------|---------------|
| 表名              | 返回量         | 无索引              || 用索引                 ||
| 表名              | 返回量         | hive    | oracle  | hive | oracle        |
| XXX_BD_STORDOC  | 行6列45       | 4s      | 0.009s  |      |               |
| XXX_ORG_ORGS    | 行69列130     | 6s      | 0.021s  |      |               |
| XXX_BD_MATERIAL | 行999列86     | 13.115s | 0.193s  |      | 0.022s        |
| XXX_IC_FLOW     | 行1668列166   | 6s      | 0.625s  |      | 0.232s        |
| XXX_IC_FLOW_10  | 行16680列166  |         | 10.065s |      | 0.148s/0.011s |
| XXX_IC_FLOW_100 | 行166800列166 |         | 90.398s |      | 0.145s/0.045s |
单表2
根据分析类查询的特点,一般仪表板select 3-5个字段足矣(多维查询则需要更多字段),所以这里统计查询3-5个字段的情况。
等值查询
单条查询 code或者pk
oracle
select pk_stordoc,code,name from t_od_bd_stordoc WHERE code='1020002';
SELECT pk_org,code,name FROM t_od_org_orgs WHERE pk_org = '1001A1100000000LX48O';
SELECT pk_material,code,name FROM t_od_bd_material WHERE code = '302550991000019';
SELECT cmaterialoid,cwarehouseid,dbilldate,ninnum,ncostmny FROM t_od_ic_flow WHERE pk_flow = '1001A11000000003KBHH';
SELECT cmaterialoid,cwarehouseid,dbilldate,ninnum,ncostmny FROM t_od_ic_flow_10 WHERE pk_flow = '1001A11000000003KBHH';
SELECT cmaterialoid,cwarehouseid,dbilldate,ninnum,ncostmny FROM t_od_ic_flow_100 WHERE pk_flow = '1001A11000000003KBHH';
create unique index uninx_pk_flow on t_od_ic_flow (pk_flow);
create index inx_pk_flow_10 on t_od_ic_flow_10 (pk_flow);
create index inx_pk_flow_100 on t_od_ic_flow_100 (pk_flow);
drop index uninx_pk_flow;
drop index inx_pk_flow_10;
drop index inx_pk_flow_100;
hive
select pk_org,code,name from ht_od_bd_stordoc WHERE code='1020002';
SELECT * FROM ht_od_org_orgs WHERE pk_org = '1001A1100000000LX48O';
SELECT * FROM ht_od_bd_material WHERE code = '302550991000019';
SELECT * FROM ht_od_ic_flow WHERE pk_flow = '1001A11000000003KBHH';
|-----------------|----------|---------------|----------|------|--------|
| 表名              | 返回量      | 无索引                     || 用索引          ||
| 表名              | 返回量      | hive          | oracle   | hive | oracle |
| XXX_BD_STORDOC  | 行1列45    | 0.982s/0.264s | 0.117s   |      |        |
| XXX_ORG_ORGS    | 行1列130   | 1s/0.272s     | 0.032s   |      |        |
| XXX_BD_MATERIAL | 行1列86    | 1s/0.855s     | 0.337s   |      | 0.02s  |
| XXX_IC_FLOW     | 行1列166   | 7s/6s         | 2.816s   |      | 0.035s |
| XXX_IC_FLOW_10  | 行10列166  |               | 17.308s  |      | 0.024s |
| XXX_IC_FLOW_100 | 行100列166 |               | 108.944s |      | 0.042s |
结论:当前硬件配置下,表行数在百万以内,oracle即使没有索引,响应速度依然能够很快。百万、千万记录的表,走索引的查询和不走索引的查询,响应速度差别巨大。
Hive表行数上万后,响应时间过长。
模糊查询
少量查询 code模糊查询
oracle
select pk_stordoc,code,name from t_od_bd_stordoc WHERE code LIKE '10%'
SELECT pk_org,code,name FROM t_od_org_orgs WHERE code like '100%'
SELECT pk_material,code,name FROM t_od_bd_material WHERE code like '30255099%'
SELECT cmaterialoid,cwarehouseid,dbilldate,ninnum,ncostmny FROM t_od_ic_flow WHERE vbillcode like 'CR202308%'
SELECT cmaterialoid,cwarehouseid,dbilldate,ninnum,ncostmny FROM t_od_ic_flow_10 WHERE vbillcode like 'CR202308%'
SELECT cmaterialoid,cwarehouseid,dbilldate,ninnum,ncostmny FROM t_od_ic_flow_100 WHERE vbillcode like 'CR202308%'
CREATE INDEX inx_code ON T_OD_BD_MATERIAL (CODE);
CREATE INDEX inx_vbillcode ON T_OD_IC_FLOW (vbillcode );
select count(*) cc from (SELECT * FROM t_od_ic_flow_10 WHERE vbillcode like 'CR202308%')
select count(*) cc from (SELECT * FROM t_od_ic_flow_100 WHERE vbillcode like 'CR202308%')
hive
select * from ht_od_bd_stordoc WHERE code LIKE '10%'
SELECT * FROM ht_od_org_orgs WHERE code like '100%'
SELECT * FROM ht_od_bd_material WHERE code like '30255099%'
SELECT * FROM ht_od_ic_flow WHERE vbillcode like 'CR202308%'
SELECT * FROM ht_od_ic_flow_10 WHERE vbillcode like 'CR202308%'
SELECT * FROM ht_od_ic_flow_100 WHERE vbillcode like 'CR202308%'
select count(*) cc from (SELECT * FROM ht_od_ic_flow_10 WHERE vbillcode like 'CR202308%')
select count(*) cc from (SELECT * FROM ht_od_ic_flow_100 WHERE vbillcode like 'CR202308%')
|-----------------|-------------|------|----------|------|--------|
| 表名              | 返回量         | 无索引            || 用索引          ||
| 表名              | 返回量         | hive | oracle   | hive | oracle |
| XXX_BD_STORDOC  | 行3列45       |      | 0.016s   |      |        |
| XXX_ORG_ORGS    | 行13列130     |      | 0.022s   |      |        |
| XXX_BD_MATERIAL | 行191列86     |      | 0.13s    |      | 0.018s |
| XXX_IC_FLOW     | 行1834列166   |      | 0.901s   |      | 0.031s |
| XXX_IC_FLOW_10  | 行18340列166  |      | 11.723s  |      | 0.049s |
| XXX_IC_FLOW_100 | 行183400列166 |      | 117.365s |      | 0.29s  |
结论:类似等值查询。
范围查询
少量查询 code范围查询
oracle
drop index inx_code
drop index inx_vbillcode
select pk_stordoc,code,name from t_od_bd_stordoc WHERE code >= 'DK01' AND code <= 'DK11'
SELECT pk_org,code,name FROM t_od_org_orgs WHERE code >= '10100' AND code <= '10200'
SELECT pk_material,code,name FROM t_od_bd_material WHERE code >= '201515201000003' AND code <= '201515901000003'
SELECT cmaterialoid,cwarehouseid,dbilldate,ninnum,ncostmny FROM t_od_ic_flow WHERE vbillcode >= 'CR2023080100000001' AND vbillcode <= 'CR2023083000000001'
SELECT cmaterialoid,cwarehouseid,dbilldate,ninnum,ncostmny FROM t_od_ic_flow_10 WHERE vbillcode >= 'CR2023080100000001' AND vbillcode <= 'CR2023083000000001'
SELECT cmaterialoid,cwarehouseid,dbilldate,ninnum,ncostmny FROM t_od_ic_flow_100 WHERE vbillcode >= 'CR2023080100000001' AND vbillcode <= 'CR2023083000000001'
CREATE INDEX inx_code ON T_OD_BD_MATERIAL (CODE);
CREATE INDEX inx_vbillcode ON T_OD_IC_FLOW (vbillcode );
hive
select pk_stordoc,code,name from ht_od_bd_stordoc WHERE code >= 'DK01' AND code <= 'DK11'
SELECT pk_org,code,name FROM ht_od_org_orgs WHERE code >= '10100' AND code <= '10200'
SELECT pk_material,code,name FROM ht_od_bd_material WHERE code >= '201515201000003' AND code <= '201515901000003'
SELECT cmaterialoid,cwarehouseid,dbilldate,ninnum,ncostmny FROM ht_od_ic_flow WHERE vbillcode >= 'CR2023080100000001' AND vbillcode <= 'CR2023083000000001'
|-----------------|-------------|-----------|---------|------|--------|
| 表名              | 返回量         | 无索引                || 用索引          ||
| 表名              | 返回量         | hive      | oracle  | hive | oracle |
| XXX_BD_STORDOC  | 行6列45       | 1s/0.139s | 0.043s  |      |        |
| XXX_ORG_ORGS    | 行69列130     | 1s/0.343s | 0.039s  |      |        |
| XXX_BD_MATERIAL | 行999列86     | 1s/0.502s | 0.091s  |      | 0.031s |
| XXX_IC_FLOW     | 行1668列166   | 4s/3s     | 0.907s  |      | 0.031s |
| XXX_IC_FLOW_10  | 行16680列166  |           | 9.196s  |      | 0.029s |
| XXX_IC_FLOW_100 | 行166800列166 |           | 91.147s |      | 0.08s  |
聚合查询
oracle
drop index inx_code
drop index inx_vbillcode
select count(*) from t_od_bd_stordoc WHERE code >= 'DK01' AND code <= 'DK11'
SELECT count(*) FROM t_od_org_orgs WHERE code >= '10100' AND code <= '10200'
SELECT count(*) FROM t_od_bd_material WHERE code >= '201515201000003' AND code <= '201515901000003'
SELECT sum(ncostmny),sum(ninnum) FROM t_od_ic_flow WHERE vbillcode >= 'CR2023080100000001' AND vbillcode <= 'CR2023083000000001'
SELECT sum(ncostmny),sum(ninnum) FROM t_od_ic_flow_10 WHERE vbillcode >= 'CR2023080100000001' AND vbillcode <= 'CR2023083000000001'
SELECT sum(ncostmny),sum(ninnum) FROM t_od_ic_flow_100 WHERE vbillcode >= 'CR2023080100000001' AND vbillcode <= 'CR2023083000000001'
CREATE INDEX inx_vbillcode ON T_OD_IC_FLOW (vbillcode );
hive
select count(*) from ht_od_bd_stordoc WHERE code >= 'DK01' AND code <= 'DK11'
SELECT count(*) FROM ht_od_org_orgs WHERE code >= '10100' AND code <= '10200'
SELECT count(*) FROM ht_od_bd_material WHERE code >= '201515201000003' AND code <= '201515901000003'
SELECT sum(ncostmny),sum(ninnum) FROM ht_od_ic_flow WHERE vbillcode >= 'CR2023080100000001' AND vbillcode <= 'CR2023083000000001'
|-----------------|------|------|---------|------|--------|
| 表名              | 返回量  | 无索引           || 用索引          ||
| 表名              | 返回量  | hive | oracle  | hive | oracle |
| XXX_BD_STORDOC  | 行1列1 |      | 0.038s  |      |        |
| XXX_ORG_ORGS    | 行1列1 |      | 0.024s  |      |        |
| XXX_BD_MATERIAL | 行1列1 |      | 0.118s  |      | 0.032s |
| XXX_IC_FLOW     | 行1列1 |      | 0.401s  |      | 0.059s |
| XXX_IC_FLOW_10  | 行1列1 |      | 9.371s  |      | 0.454s |
| XXX_IC_FLOW_100 | 行1列1 |      | 91.001s |      | 58.89s |
结论:查询ht_od_ic_flow时,有时会因内存问题崩溃而报错,详见“hive聚合查询异常”。Oracle数据量达到千万级后,IO读写成为瓶颈,所以即使加索引,耗时依然很长,甚至可能超过不加索引时的耗时。同一会话多次执行同一SQL时会命中缓存,耗时大幅降低,千万级的表也能降至1.62s;即使清除缓存,同样的SQL语句耗时依然在1.xxxs至3.xxxs之间,应该还有其他缓存机制。将SELECT sum(ncostmny),sum(ninnum) FROM ht_od_ic_flow_100 WHERE vbillcode >= 'CR2023080100000001' AND vbillcode <= 'CR2023083000000001'中vbillcode的起始值改为CR202308xx00000001,速度依然很快;再往前调两三天(0731、0730、0729),速度依然很快,维持在1.xxxs,至多不超过10s。
分析查询
oracle
SELECT pk_org,cmaterialoid,pk_flow,ninnum
,sum(ninnum) OVER (PARTITION BY cmaterialoid )
,CASE WHEN sum(ninnum) OVER (PARTITION BY cmaterialoid )!=0
THEN ninnum / sum(ninnum) OVER (PARTITION BY cmaterialoid )
END pp
FROM t_od_ic_flow WHERE vbillcode >= 'CR2023080100000001'
AND vbillcode <= 'CR2023083000000001'
SELECT pk_org,cmaterialoid,pk_flow,ninnum
,sum(ninnum) OVER (PARTITION BY cmaterialoid )
,CASE WHEN sum(ninnum) OVER (PARTITION BY cmaterialoid )!=0
THEN ninnum / sum(ninnum) OVER (PARTITION BY cmaterialoid )
END pp
FROM t_od_ic_flow_10 WHERE vbillcode >= 'CR2023080100000001'
AND vbillcode <= 'CR2023083000000001'
SELECT pk_org,cmaterialoid,pk_flow,ninnum
,sum(ninnum) OVER (PARTITION BY cmaterialoid )
,CASE WHEN sum(ninnum) OVER (PARTITION BY cmaterialoid )!=0
THEN ninnum / sum(ninnum) OVER (PARTITION BY cmaterialoid )
END pp
FROM t_od_ic_flow_100 WHERE vbillcode >= 'CR2023080100000001'
AND vbillcode <= 'CR2023083000000001'
CREATE INDEX inx_vbillcode ON T_OD_IC_FLOW (vbillcode );
hive
SELECT PK_ORG,CMATERIALOID,PK_FLOW,NINNUM,SUM(NINNUM) OVER (PARTITION BY CMATERIALOID )
,CASE WHEN SUM(NINNUM) OVER (PARTITION BY CMATERIALOID )!=0
THEN NINNUM / SUM(NINNUM) OVER (PARTITION BY CMATERIALOID )
END pp
FROM ht_od_ic_flow WHERE VBILLCODE >= 'CR2023080100000001'
AND VBILLCODE <= 'CR2023083000000001'
|-----------------|-----------|------|---------|------|---------|
| 表名              | 返回量       | 无索引           || 用索引           ||
| 表名              | 返回量       | hive | oracle  | hive | oracle  |
| XXX_IC_FLOW     | 行1668列6   |      | 0.375s  |      | 0.261s  |
| XXX_IC_FLOW_10  | 行16680列6  |      | 9.016s  |      | 6.53s   |
| XXX_IC_FLOW_100 | 行166800列6 |      | 91.337s |      | 66.474s |
多表
多表连接查询
分析查询
oracle
drop index inx_vbillcode
SELECT t1.pk_org,t2.code orgcode,t2.name orgname
,t1.cmaterialoid,t3.code materialcode,t3.name materialname
,t1.pk_flow,t1.ninnum
,sum(t1.ninnum) OVER (PARTITION BY t1.cmaterialoid )
,CASE WHEN sum(t1.ninnum) OVER (PARTITION BY t1.cmaterialoid )!=0
THEN t1.ninnum / sum(t1.ninnum) OVER (PARTITION BY t1.cmaterialoid )
END pp
FROM t_od_ic_flow t1
inner JOIN t_od_org_orgs t2 ON t1.pk_org = t2.pk_org
inner JOIN t_od_bd_material t3 ON t1.cmaterialoid = t3.pk_material
WHERE vbillcode >= 'CR2023080100000001'
AND vbillcode <= 'CR2023083000000001'
CREATE INDEX inx_vbillcode ON T_OD_IC_FLOW (vbillcode );
CREATE INDEX inx_vbillcode_10 ON T_OD_IC_FLOW_10 (vbillcode );
CREATE INDEX inx_vbillcode_100 ON T_OD_IC_FLOW_100 (vbillcode );
hive
SELECT t1.pk_org,t2.code orgcode,t2.name orgname
,t1.cmaterialoid,t3.code materialcode,t3.name materialname
,t1.pk_flow,t1.ninnum
,sum(t1.ninnum) OVER (PARTITION BY t1.cmaterialoid )
,CASE WHEN sum(t1.ninnum) OVER (PARTITION BY t1.cmaterialoid )!=0
THEN t1.ninnum / sum(t1.ninnum) OVER (PARTITION BY t1.cmaterialoid )
END pp
FROM ht_od_ic_flow t1
inner JOIN ht_od_org_orgs t2 ON t1.pk_org = t2.pk_org
inner JOIN ht_od_bd_material t3 ON t1.cmaterialoid = t3.pk_material
WHERE vbillcode >= 'CR2023080100000001'
AND vbillcode <= 'CR2023083000000001'
|-----------------|------------|------|---------|------|---------|
| 表名              | 返回量        | 无索引           || 用索引           ||
| 表名              | 返回量        | hive | oracle  | hive | oracle  |
| XXX_IC_FLOW     | 行1668列10   |      | 0.555s  |      | 0.266s  |
| XXX_IC_FLOW_10  | 行16680列10  |      | 9.358s  |      | 0.338s  |
| XXX_IC_FLOW_100 | 行166800列10 |      | 92.048s |      | 92.019s |
Oracle数据量达到千万级时,有无索引,响应时间几乎没有差别,此时瓶颈应该在硬件配置上。