hive 小文件分析

1、获取fsimage文件:

hdfs dfsadmin -fetchImage /data/xy/

2、从二进制文件解析:

hdfs oiv -i /data/xy/fsimage_0000000019891608958 -t /data/xy/tmpdir -o /data/xy/out -p Delimited -delimiter ","

3、创建hive表

create database if not exists hdfsinfo;

use hdfsinfo;

CREATE TABLE fsimage_info_csv(

path string,

replication int,

modificationtime string,

accesstime string,

preferredblocksize bigint,

blockscount int,

filesize bigint,

nsquota string,

dsquota string,

permission string,

username string,

groupname string)

ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'

WITH SERDEPROPERTIES ('field.delim'=',', 'serialization.format'=',')

STORED AS INPUTFORMAT 'org.apache.hadoop.mapred.TextInputFormat';

4、存储HDFS元数据加载进hive中

hdfs dfs -put /data/xy/out /user/hive/warehouse/hdfsinfo.db/fsimage_info_csv/

hdfs dfs -ls /user/hive/warehouse/hdfsinfo.db/fsimage_info_csv/

Hive: MSCK REPAIR TABLE hdfsinfo.fsimage_info_csv;

select * from hdfsinfo.fsimage_info_csv limit 5;

5、统计叶子目录下小文件数据量(4194304 H字节,即<4M)

SELECT

dir_path ,

COUNT(*) AS small_file_num,

modificationtime,

accesstime

FROM

( SELECT

modificationtime,

accesstime,

relative_size,

dir_path

FROM

(

SELECT

(CASE filesize < 4194304 WHEN TRUE THEN 'small' ELSE 'large' END) AS relative_size,

modificationtime,

accesstime,

split(

substr(

concat_ws('/', split(PATH, '/')),

1,

length(concat_ws('/', split(PATH, '/'))) - length(last_element) - 1

),

',')[0] as dir_path

FROM (

SELECT

modificationtime,

accesstime,

filesize,

PATH,

split(PATH, '/')[size(split(PATH, '/')) - 1] as last_element

FROM hdfsinfo.fsimage_info_csv

) t0 ) t1

WHERE

relative_size='small') t2

GROUP BY

dir_path,modificationtime,accesstime

ORDER BY

small_file_num desc

limit 500;

5、统计叶子目录下小文件数据量(4194304 H字节,即<4M)

SELECT

dir_path,

COUNT(*) AS small_file_num

FROM

( SELECT

relative_size,

dir_path

FROM

(

SELECT

(CASE filesize < 41943040 WHEN TRUE THEN 'small' ELSE 'large' END) AS relative_size,

split(

substr(

concat_ws('/', split(PATH, '/')),

1,

length(concat_ws('/', split(PATH, '/'))) - length(last_element) - 1

),

',')[0] as dir_path

FROM (

SELECT

filesize,

PATH,

split(PATH, '/')[size(split(PATH, '/')) - 1] as last_element

FROM hdfsinfo.fsimage_info_csv

WHERE

permission not LIKE 'd%'

) t0 ) t1

WHERE

relative_size='small') t2

GROUP BY

dir_path

ORDER BY

small_file_num desc

limit 50000;

相关推荐
yumgpkpm15 小时前
CMP(类Cloudera CDP 7.3 404版华为Kunpeng)与其他大数据平台对比
大数据·hive·hadoop·elasticsearch·kafka·hbase·cloudera
陈辛chenxin19 小时前
【大数据技术06】大数据技术
大数据·hadoop·分布式·python·信息可视化
yumgpkpm20 小时前
Hadoop在AI时代如何实现生态协同? CMP 7.13(或类 Cloudera CDP7.3 的 CMP 7.13 平台,如华为鲲鹏 ARM 版)
大数据·hadoop·elasticsearch·zookeeper·kafka·hbase·cloudera
piepis1 天前
Doris Docker 完整部署指南
数据仓库·docker·doris·容器部署
qqxhb1 天前
系统架构设计师备考第68天——大数据处理架构
大数据·hadoop·flink·spark·系统架构·lambda·kappa
yumgpkpm2 天前
Hadoop大数据平台在中国AI时代的后续发展趋势研究CMP(类Cloudera CDP 7.3 404版华为鲲鹏Kunpeng)
大数据·hive·hadoop·python·zookeeper·oracle·cloudera
凯子坚持 c2 天前
基于VMware与CentOS 7的Hadoop集群部署全景指南
linux·hadoop·centos
KANGBboy3 天前
ES 总结
hive·elasticsearch
FeelTouch Labs3 天前
数据仓库和数据集市之ODS、CDM、ADS、DWD、DWS
数据仓库
大数据CLUB3 天前
酒店预订数据分析及预测可视化
大数据·hadoop·分布式·数据挖掘·数据分析·spark·mapreduce