Hadoop: using CombineFileInputFormat to merge small files and reduce the number of map tasks
// The key type consumed by the mapper: identifies a record by (file name, offset).
package hgs.combinefileinputformat.test;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;

public class CombineFileKey implements WritableComparable<CombineFileKey> {

    private String fileName;
    private long offset;

    public String getFileName() {
        return fileName;
    }

    public void setFileName(String fileName) {
        this.fileName = fileName;
    }

    public long getOffset() {
        return offset;
    }

    public void setOffset(long offset) {
        this.offset = offset;
    }

    @Override
    public void readFields(DataInput input) throws IOException {
        this.fileName = Text.readString(input);
        this.offset = input.readLong();
    }

    @Override
    public void write(DataOutput output) throws IOException {
        Text.writeString(output, fileName);
        output.writeLong(offset);
    }

    @Override
    public int compareTo(CombineFileKey obj) {
        int f = this.fileName.compareTo(obj.fileName);
        if (f == 0)
            return (int) Math.signum((double) (this.offset - obj.offset));
        return f;
    }

    @Override
    public int hashCode() {
        // Adapted from http://www.idryman.org/blog/2013/09/22/process-small-files-on-hadoop-using-combinefileinputformat-1/
        final int prime = 31;
        int result = 1;
        result = prime * result + ((fileName == null) ? 0 : fileName.hashCode());
        result = prime * result + (int) (offset ^ (offset >>> 32));
        return result;
    }

    @Override
    public boolean equals(Object o) {
        if (o instanceof CombineFileKey)
            return this.compareTo((CombineFileKey) o) == 0;
        return false;
    }
}
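Since CombineFileKey is a custom Writable, a quick way to sanity-check its serialization is to round-trip it through Hadoop's in-memory buffers. The standalone class below is a hypothetical illustration, not part of the original post; the file name and offset it uses are made up, and it relies only on DataOutputBuffer and DataInputBuffer from org.apache.hadoop.io.

// Hypothetical standalone check (not part of the original job): round-trip a
// CombineFileKey through Hadoop's in-memory DataOutputBuffer/DataInputBuffer.
package hgs.combinefileinputformat.test;

import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;

public class CombineFileKeyRoundTrip {
    public static void main(String[] args) throws Exception {
        CombineFileKey original = new CombineFileKey();
        original.setFileName("part-00000");   // example values only
        original.setOffset(128L);

        // Serialize the key into an in-memory buffer.
        DataOutputBuffer out = new DataOutputBuffer();
        original.write(out);

        // Deserialize into a fresh instance.
        DataInputBuffer in = new DataInputBuffer();
        in.reset(out.getData(), out.getLength());
        CombineFileKey copy = new CombineFileKey();
        copy.readFields(in);

        // Should print true and 0: equals() delegates to compareTo().
        System.out.println(original.equals(copy));
        System.out.println(original.compareTo(copy));
    }
}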
// The record reader: reads the lines of one file chunk inside a combined split.
package hgs.combinefileinputformat.test;

import java.io.IOException;

import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.CombineFileSplit;
import org.apache.hadoop.util.LineReader;

public class CombineFileReader extends RecordReader<CombineFileKey, Text> {

    private long startOffset;        // start offset of this chunk
    private long end;                // end offset of this chunk
    private long position;           // current position
    private FileSystem fs;
    private Path path;
    private CombineFileKey key;
    private Text value;
    private FSDataInputStream input;
    private LineReader reader;

    // CombineFileRecordReader instantiates this class via reflection, once per file
    // chunk in the CombineFileSplit, which is why this exact constructor signature
    // (split, context, chunk index) is required.
    public CombineFileReader(CombineFileSplit split, TaskAttemptContext context, Integer index)
            throws IOException {
        // Initialize path, fs, startOffset and end for the chunk at this index.
        this.path = split.getPath(index);
        this.fs = this.path.getFileSystem(context.getConfiguration());
        this.startOffset = split.getOffset(index);
        this.end = split.getLength(index) + this.startOffset;

        // A non-zero start offset means this chunk may begin in the middle of a line.
        boolean skipFirstLine = false;

        // Open the file.
        this.input = fs.open(this.path);

        if (this.startOffset != 0) {
            skipFirstLine = true;
            --(this.startOffset);
            // Seek to the position where reading starts.
            this.input.seek(this.startOffset);
        }

        // Initialize the line reader.
        this.reader = new LineReader(input);

        if (skipFirstLine) {
            // Skip the (possibly partial) first line and re-establish startOffset at the
            // beginning of the next complete line; the skipped bytes belong to the line
            // already consumed by the reader of the previous chunk.
            this.startOffset += this.reader.readLine(new Text(), 0,
                    (int) Math.min((long) Integer.MAX_VALUE, this.end - this.startOffset));
        }
        this.position = this.startOffset;
    }

    @Override
    public void close() throws IOException {
        // Release the line reader and the underlying stream it wraps.
        if (reader != null) {
            reader.close();
            reader = null;
        }
    }

    @Override
    public void initialize(InputSplit split, TaskAttemptContext context)
            throws IOException, InterruptedException {
        // All initialization is done in the constructor.
    }

    // Return the current key.
    @Override
    public CombineFileKey getCurrentKey() throws IOException, InterruptedException {
        return key;
    }

    // Return the current value.
    @Override
    public Text getCurrentValue() throws IOException, InterruptedException {
        return value;
    }

    // Progress of this reader through its chunk, as a float in [0, 1].
    @Override
    public float getProgress() throws IOException, InterruptedException {
        if (this.startOffset == this.end) {
            return 0.0f;
        } else {
            return Math.min(1.0f,
                    (this.position - this.startOffset) / (float) (this.end - this.startOffset));
        }
    }

    // Advance to the next key/value pair; returns false when the chunk is exhausted.
    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
        // Lazily initialize the key and value.
        if (this.key == null) {
            this.key = new CombineFileKey();
            this.key.setFileName(this.path.getName());
        }
        this.key.setOffset(this.position);
        if (this.value == null) {
            this.value = new Text();
        }

        // Read one line; newSize == 0 means this chunk has been fully consumed.
        int newSize = 0;
        if (this.position < this.end) {
            newSize = reader.readLine(this.value);
            position += newSize;
        }

        // No more data: clear the key and value.
        if (newSize == 0) {
            this.key = null;
            this.value = null;
            return false;
        } else {
            return true;
        }
    }
}
// The custom input format: packs whole small files into combined splits.
package hgs.combinefileinputformat.test;

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.CombineFileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.CombineFileRecordReader;
import org.apache.hadoop.mapreduce.lib.input.CombineFileSplit;

public class CustCombineInputFormat extends CombineFileInputFormat<CombineFileKey, Text> {

    public CustCombineInputFormat() {
        super();
        // Maximum size of a combined split: 64 MB.
        this.setMaxSplitSize(67108864);
    }

    @Override
    public RecordReader<CombineFileKey, Text> createRecordReader(InputSplit split, TaskAttemptContext context)
            throws IOException {
        return new CombineFileRecordReader<CombineFileKey, Text>(
                (CombineFileSplit) split, context, CombineFileReader.class);
    }

    @Override
    protected boolean isSplitable(JobContext context, Path file) {
        // Never split an individual file; whole files are packed into combined splits.
        return false;
    }
}

// The driver class.
package hgs.test;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import hgs.combinefileinputformat.test.CustCombineInputFormat;

public class LetterCountDriver {

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        //conf.set("mapreduce.map.log.level", "INFO");
        //conf.set("mapreduce.reduce.log.level", "INFO");
        Job job = Job.getInstance(conf, "LetterCount");
        job.setJarByClass(hgs.test.LetterCountDriver.class);
        // Mapper and reducer.
        job.setMapperClass(LetterCountMapper.class);
        job.setReducerClass(LetterReducer.class);
        // Output types.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // Pass "1" as the first argument to use the custom combine input format;
        // otherwise the default TextInputFormat is used.
        if (args[0].equals("1"))
            job.setInputFormatClass(CustCombineInputFormat.class);
        // Input and output DIRECTORIES (not files).
        FileInputFormat.setInputPaths(job, new Path("/words"));
        FileOutputFormat.setOutputPath(job, new Path("/result"));
        if (!job.waitForCompletion(true))
            return;
    }
}
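The driver references LetterCountMapper and LetterReducer, which are not shown in the post. The following is a minimal hypothetical sketch of what they might look like, assuming a simple per-letter count; the mapper's input key type is declared as Object so the same class works both with the default TextInputFormat (LongWritable keys) and with CustCombineInputFormat (CombineFileKey keys).

// Hypothetical sketch of the mapper and reducer the driver expects (not included
// in the original post). Assumes a per-letter count over each input line.
package hgs.test;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;

public class LetterCountMapper extends Mapper<Object, Text, Text, IntWritable> {

    private static final IntWritable ONE = new IntWritable(1);
    private final Text letter = new Text();

    @Override
    protected void map(Object key, Text value, Context context)
            throws IOException, InterruptedException {
        // Emit (letter, 1) for every letter character in the line.
        for (char c : value.toString().toCharArray()) {
            if (Character.isLetter(c)) {
                letter.set(String.valueOf(c));
                context.write(letter, ONE);
            }
        }
    }
}

// LetterReducer would normally live in its own source file; shown here for brevity.
class LetterReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

    private final IntWritable result = new IntWritable();

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        // Sum the counts emitted for this letter.
        int sum = 0;
        for (IntWritable v : values) {
            sum += v.get();
        }
        result.set(sum);
        context.write(key, result);
    }
}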
The input files in HDFS (under /words):

Run result without the custom CustCombineInputFormat: with the default TextInputFormat, every small file under /words gets its own split, so the job launches one map task per file.

Run result with the custom CustCombineInputFormat: many small files are packed into combined splits of at most 64 MB, so the job launches far fewer map tasks.
From the ITPUB blog, link: http://blog.itpub.net/31506529/viewspace-2217548/. Please credit the source when reprinting; otherwise legal liability may be pursued.