hive 小文件分析

1、获取fsimage文件:

# Download the NameNode's fsimage file into the local directory /data/xy/.
hdfs dfsadmin -fetchImage /data/xy/

2、从二进制文件解析:

# Convert the binary fsimage into delimited text with the Offline Image Viewer:
#   -i  input fsimage file (the numeric suffix differs per download)
#   -t  temporary directory used while generating the Delimited output
#   -o  output file
#   -p  Delimited processor, with "," as the field delimiter
# NOTE(review): an HDFS path that itself contains "," would be split across several
# columns of a comma-delimited table — confirm no such paths exist in this cluster.
hdfs oiv -i /data/xy/fsimage_0000000019891608958 -t /data/xy/tmpdir -o /data/xy/out -p Delimited -delimiter ","

3、创建hive表

-- Database and table that expose the comma-delimited `hdfs oiv -p Delimited` output to Hive.
CREATE DATABASE IF NOT EXISTS hdfsinfo;

USE hdfsinfo;

-- One row per fsimage entry; columns are declared in the order listed below and read
-- positionally from comma-separated text.
-- NOTE(review): some Hadoop versions write a header line as the first row of the
-- Delimited output — if present it shows up as a data row here; confirm and, if needed,
-- strip it before loading or set TBLPROPERTIES ('skip.header.line.count'='1').
CREATE TABLE IF NOT EXISTS fsimage_info_csv (
    path               STRING,
    replication        INT,
    modificationtime   STRING,
    accesstime         STRING,
    preferredblocksize BIGINT,
    blockscount        INT,
    filesize           BIGINT,
    nsquota            STRING,
    dsquota            STRING,
    permission         STRING,
    username           STRING,
    groupname          STRING
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
WITH SERDEPROPERTIES ('field.delim'=',', 'serialization.format'=',')
-- Hive requires INPUTFORMAT and OUTPUTFORMAT to be given together; the original
-- statement omitted OUTPUTFORMAT and therefore did not parse.
STORED AS
    INPUTFORMAT 'org.apache.hadoop.mapred.TextInputFormat'
    OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat';

4、将解析出的HDFS元数据文件加载进hive表中

# Copy the local file /data/xy/out into the HDFS directory of hdfsinfo.fsimage_info_csv
# (path assumes the warehouse root is /user/hive/warehouse — TODO confirm for this cluster).
hdfs dfs -put /data/xy/out /user/hive/warehouse/hdfsinfo.db/fsimage_info_csv/

# List the table directory to verify the upload.
hdfs dfs -ls /user/hive/warehouse/hdfsinfo.db/fsimage_info_csv/

-- Run the following in Hive.
-- MSCK REPAIR TABLE registers partition directories in the metastore. The table DDL in
-- this document declares no partitions, so this step should be unnecessary; it is kept
-- from the original procedure. NOTE(review): confirm it can be dropped.
MSCK REPAIR TABLE hdfsinfo.fsimage_info_csv;

-- Sanity check: show a few loaded rows.
SELECT * FROM hdfsinfo.fsimage_info_csv LIMIT 5;

5、统计叶子目录下小文件数量(小于 4194304 字节,即 <4M),并带出修改时间和访问时间

-- Count small files (filesize < 4194304 bytes = 4 MiB) per parent directory.
-- Rows are grouped by dir_path together with modificationtime and accesstime, so
-- small_file_num is the number of small files in a directory that share both timestamps.
-- Returns the 500 groups with the highest counts.
WITH file_entry AS (
    SELECT
        modificationtime,
        accesstime,
        filesize,
        path,
        -- Last path component, i.e. the file name.
        split(path, '/')[size(split(path, '/')) - 1] AS last_element
    FROM hdfsinfo.fsimage_info_csv
    -- Skip directory entries (permission string starts with 'd'); otherwise every
    -- directory would be counted as a small file of its parent.
    WHERE permission NOT LIKE 'd%'
),

small_file AS (
    SELECT
        modificationtime,
        accesstime,
        -- Parent directory: the path with the trailing '/<last_element>' cut off.
        -- NOTE(review): the final split on ',' keeps only the text before the first
        -- comma; its purpose is not evident from this file — confirm it is still needed.
        split(
            substr(
                concat_ws('/', split(path, '/')),
                1,
                length(concat_ws('/', split(path, '/'))) - length(last_element) - 1
            ),
            ','
        )[0] AS dir_path
    FROM file_entry
    WHERE filesize < 4194304
)

SELECT
    dir_path,
    COUNT(*) AS small_file_num,
    modificationtime,
    accesstime
FROM small_file
GROUP BY
    dir_path,
    modificationtime,
    accesstime
ORDER BY small_file_num DESC
LIMIT 500;

6、统计叶子目录下小文件数量(小于 41943040 字节,即 <40M),仅按目录汇总

-- Count small files per parent directory, directories excluded.
-- "Small" here means filesize < 41943040 bytes (40 MiB).
-- Returns the 50000 directories with the most small files.
WITH file_entry AS (
    SELECT
        filesize,
        path,
        -- Last path component, i.e. the file name.
        split(path, '/')[size(split(path, '/')) - 1] AS last_element
    FROM hdfsinfo.fsimage_info_csv
    -- Keep only non-directory entries (permission string does not start with 'd').
    WHERE permission NOT LIKE 'd%'
),

small_file AS (
    SELECT
        -- Parent directory: the path with the trailing '/<last_element>' cut off.
        -- NOTE(review): the final split on ',' keeps only the text before the first
        -- comma; its purpose is not evident from this file — confirm it is still needed.
        split(
            substr(
                concat_ws('/', split(path, '/')),
                1,
                length(concat_ws('/', split(path, '/'))) - length(last_element) - 1
            ),
            ','
        )[0] AS dir_path
    FROM file_entry
    WHERE filesize < 41943040
)

SELECT
    dir_path,
    COUNT(*) AS small_file_num
FROM small_file
GROUP BY dir_path
ORDER BY small_file_num DESC
LIMIT 50000;

相关推荐
qiuyepiaoling6 小时前
数仓设计基础
数据仓库
兔子宇航员03017 小时前
HIVE SQL 中 NULL 值在 JOIN 和 GROUP BY 中的致命陷阱与解决方案
hive·hadoop·sql
段一凡-华北理工大学10 小时前
工业领域的Hadoop架构学习~系列文章02:HDFS架构深度剖析
大数据·人工智能·hadoop·学习·架构·高炉炼铁
Irene199111 小时前
Oracle(字符集分为服务端和客户端) 和 Hive(依赖 MySQL(或 PostgreSQL)存储元数据)字符集编码格式查询,中文乱码处理
hive·sql·oracle
段一凡-华北理工大学12 小时前
工业领域的Hadoop架构学习~系列文章03:MapReduce编程模型深度解读
大数据·人工智能·hadoop·学习·架构·高炉炼铁·高炉智能化
兔子宇航员030112 小时前
HiveSQL 中 NULL 与空字符串的区别与注意事项
数据库·数据仓库·sql
无关868813 小时前
StarRocks 存算分离 + Spark + Hive Metastore + MinIO 数据湖搭建全流程
大数据·hive·spark
小欣加油1 天前
Hadoop开发环境搭建
大数据·数据库·hadoop
段一凡-华北理工大学2 天前
工业领域的Hadoop架构学习~系列文章01:Hadoop与工业4.0深度融合
大数据·hadoop·学习·架构·知识图谱·高炉炼铁·工业智能体
宽海智能仓储物流2 天前
从状态检查到数据备份:仓储PLC控制器保养周期与实操清单
大数据·数据仓库·自动化