# 匯出namenode的後設資料檔案,並將資料轉成csv格式,逗號分割欄位 (export the namenode fsimage and convert it to comma-delimited csv)
# Pull the current fsimage from the active namenode into the local working dir.
hdfs dfsadmin -fetchImage ./
# Parse the image locally with the Offline Image Viewer into comma-delimited text.
# FIX: `hdfs oiv` takes no -Xmx option — JVM heap must be set via the environment.
# The image here was ~30 GB, so a 30 GB heap is used for the parse.
# NOTE(review): comma-delimited output is ambiguous if any HDFS path contains a
# comma; consider a rarer delimiter (e.g. "|") if your paths may contain commas.
HADOOP_OPTS="-Xmx30720m" hdfs oiv -i fsimage_0000000000243832876 -o fsimage.csv -p Delimited -delimiter ","
# 建立hive表
-- Hive table over the Delimited oiv output: one row per inode in the fsimage.
-- IF NOT EXISTS makes the DDL idempotent so the runbook can be re-run safely.
-- NOTE(review): nsquota/dsquota are numeric in the oiv output but kept as
-- string here to preserve the original schema — confirm before changing.
CREATE TABLE IF NOT EXISTS temp_dev_db.fsimage_info_csv (
    path               string,   -- full HDFS path of the file or directory
    replication        int,
    modificationtime   string,
    accesstime         string,
    preferredblocksize bigint,
    blockscount        int,
    filesize           bigint,   -- bytes
    nsquota            string,
    dsquota            string,
    permission         string,
    username           string,
    groupname          string
)
ROW FORMAT SERDE
    'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
WITH SERDEPROPERTIES (
    'field.delim'=',',
    'serialization.format'=',')
STORED AS INPUTFORMAT
    'org.apache.hadoop.mapred.TextInputFormat'
OUTPUTFORMAT
    'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat';
# Upload the parsed csv into the Hive table's directory on HDFS so the table
# can read it.
# NOTE(review): hdfs://ns/xxxxxx is a placeholder — replace it with the actual
# warehouse location of temp_dev_db.fsimage_info_csv (check with
# `DESCRIBE FORMATTED temp_dev_db.fsimage_info_csv`).
hdfs dfs -put fsimage.csv hdfs://ns/xxxxxx
# 統計小於10MB的檔案個數,根據路徑分組
-- Count files smaller than 10 MB, grouped by the first five path levels.
-- FIX: the original statement was truncated (stray trailing "|", no WHERE
-- filter, no GROUP BY) and could not parse; the missing clauses implied by
-- the comment above are restored here.
SELECT
    concat('/', split(path, '/')[1],
           '/', split(path, '/')[2],
           '/', split(path, '/')[3],
           '/', split(path, '/')[4],
           '/', split(path, '/')[5]) AS dir_path,
    count(1) AS small_file_num
FROM temp_dev_db.fsimage_info_csv
WHERE filesize < 10 * 1024 * 1024      -- "small file" threshold: 10 MB
  AND blockscount > 0                  -- NOTE(review): excludes directories (0 blocks) — confirm intent
-- Hive requires the full expression (not the alias) in GROUP BY.
GROUP BY
    concat('/', split(path, '/')[1],
           '/', split(path, '/')[2],
           '/', split(path, '/')[3],
           '/', split(path, '/')[4],
           '/', split(path, '/')[5])
ORDER BY small_file_num DESC;          -- worst directories first, for triage
# 將結果匯出本地開始治理小檔案問題