A MapReduce Program Example, Details Decide Success or Failure (7): Custom Key and RecordReader

Posted by self_control on 2016-05-30
The previous post demonstrated how to use CombineFileInputFormat to reduce the number of map tasks launched when a job has many small input files. In that post's custom MyCombineFileInputFormat, the MyRecordReader simply delegated to LineRecordReader, but much more can be done at that point.
This experiment uses a custom RecordReader to produce custom keys and values from each split.

Custom MyKey
A custom key must implement the WritableComparable interface.


package wordcount;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.WritableComparable;

public class MyKey implements WritableComparable<MyKey> {

    private char c;

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeChar(c);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        c = in.readChar();
    }

    @Override
    public int compareTo(MyKey key) {
        if (c == key.c)
            return 0;
        else if (c > key.c)
            return 1;
        else
            return -1;
    }

    public char getC() {
        return c;
    }

    public void setC(char c) {
        this.c = c;
    }
}
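In this example MyKey only serves as the map input key, so it never goes through the shuffle and the default HashPartitioner never sees it. If you later emit MyKey from the mapper, it would be advisable to also override hashCode() and equals() so that partitioning and grouping stay consistent with compareTo(). A minimal sketch of those extra methods (my addition, not part of the original class):

    // Only needed if MyKey is ever used as a map output key.
    @Override
    public int hashCode() {
        return c;                          // consistent with equals() and compareTo()
    }

    @Override
    public boolean equals(Object o) {
        return o instanceof MyKey && ((MyKey) o).c == c;
    }

    @Override
    public String toString() {
        return String.valueOf(c);          // readable form if the key is ever written out as text
    }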

Custom CombinedFilesInputFormat and custom RecordReader


package wordcount;

import java.io.IOException;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.CombineFileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.CombineFileRecordReader;
import org.apache.hadoop.mapreduce.lib.input.CombineFileSplit;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.input.LineRecordReader;
import org.apache.log4j.Logger;

public class MyCombinedFilesInputFormat extends CombineFileInputFormat<MyKey, IntWritable> {

    @SuppressWarnings({ "unchecked", "rawtypes" })
    @Override
    public RecordReader<MyKey, IntWritable> createRecordReader(InputSplit split, TaskAttemptContext context) throws IOException {
        return new CombineFileRecordReader((CombineFileSplit) split, context, MyCombinedFilesRecordReader.class);
    }

    public static class MyCombinedFilesRecordReader extends RecordReader<MyKey, IntWritable> {
        private int index;                     // which file of the CombineFileSplit this reader handles
        private LineRecordReader reader;       // delegate that reads lines from that file

        private String tValue;                 // current line of text
        private int pos = 0;                   // current character position within tValue

        private MyKey key = new MyKey();
        private IntWritable value = new IntWritable(1);

        Logger log = Logger.getLogger(MyCombinedFilesRecordReader.class);

        public MyCombinedFilesRecordReader(CombineFileSplit split, TaskAttemptContext context, Integer index) {
            this.index = index;
            reader = new LineRecordReader();
        }

        @Override
        public void initialize(InputSplit split, TaskAttemptContext context)
                throws IOException, InterruptedException {
            // Build a FileSplit for the index-th file of the combined split and hand it to the LineRecordReader.
            CombineFileSplit cfsplit = (CombineFileSplit) split;
            FileSplit fileSplit = new FileSplit(cfsplit.getPath(index),
                                                cfsplit.getOffset(index),
                                                cfsplit.getLength(index),
                                                cfsplit.getLocations());
            reader.initialize(fileSplit, context);
        }

        @Override
        public boolean nextKeyValue() throws IOException, InterruptedException {
            // Emit one record per counted character: advance within the current line,
            // or fetch the next line from the underlying LineRecordReader.
            if (StringUtils.isEmpty(tValue) || pos >= tValue.length() - 1) {
                if (reader.nextKeyValue()) {
                    pos = 0;
                    this.tValue = reader.getCurrentValue().toString();
                    // Skip blank lines and lines whose first character is not counted.
                    if (tValue.isEmpty() || !isCounted(tValue.charAt(pos))) {
                        return nextKeyValue();
                    }
                    return true;
                }
                else {
                    return false;
                }
            }
            else {
                pos++;
                if (isCounted(tValue.charAt(pos))) {
                    return true;
                }
                else {
                    return nextKeyValue();
                }
            }
        }

        // The original range check 'A' <= ch <= 'z' also admits a few punctuation
        // characters between 'Z' and 'a'; it is kept unchanged here.
        private boolean isCounted(char ch) {
            return ch <= 'z' && ch >= 'A';
        }

        @Override
        public MyKey getCurrentKey() throws IOException, InterruptedException {
            key.setC(tValue.charAt(pos));
            return key;
        }

        @Override
        public IntWritable getCurrentValue() throws IOException, InterruptedException {
            return value;                      // every emitted character counts as 1
        }

        @Override
        public float getProgress() throws IOException, InterruptedException {
            return reader.getProgress();
        }

        @Override
        public void close() throws IOException {
            reader.close();
        }
    }
}
Mapper code
With MyRecordReader handing the mapper a custom key (and an IntWritable count as the value), the map function becomes much simpler.


public static class MyWordCountMapper extends
        Mapper<MyKey, IntWritable, Text, IntWritable> {
    Text mKey = new Text();
    IntWritable mValue = new IntWritable(1);

    @Override
    protected void map(MyKey key, IntWritable value, Context context)
            throws IOException, InterruptedException {
        // The key already carries a single character; convert it to Text and emit a count of 1.
        mKey.set(String.valueOf(key.getC()));
        context.write(mKey, mValue);
    }
}
For later reference, the full driver code is also included.


package wordcount;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.log4j.Logger;

public class MyWordCountJob extends Configured implements Tool {
    Logger log = Logger.getLogger(MyWordCountJob.class);

    public static class MyWordCountMapper extends
            Mapper<MyKey, IntWritable, Text, IntWritable> {
        Text mKey = new Text();
        IntWritable mValue = new IntWritable(1);

        @Override
        protected void map(MyKey key, IntWritable value, Context context)
                throws IOException, InterruptedException {
            mKey.set(String.valueOf(key.getC()));
            context.write(mKey, mValue);
        }
    }

    public static class MyWordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        Text rkey = new Text();
        IntWritable rvalue = new IntWritable(1);

        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            // Sum the counts for each character.
            int n = 0;
            for (IntWritable value : values) {
                n += value.get();
            }
            rvalue.set(n);
            context.write(key, rvalue);
        }
    }

    @Override
    public int run(String[] args) throws Exception {
        // Validate the parameters.
        if (args.length != 2) {
            return -1;
        }

        Job job = Job.getInstance(getConf(), "MyWordCountJob");
        job.setJarByClass(MyWordCountJob.class);

        Path inPath = new Path(args[0]);
        Path outPath = new Path(args[1]);

        // Remove any previous output so the job can be rerun.
        outPath.getFileSystem(getConf()).delete(outPath, true);
        TextInputFormat.setInputPaths(job, inPath);
        TextOutputFormat.setOutputPath(job, outPath);

        job.setMapperClass(MyWordCountJob.MyWordCountMapper.class);
        job.setReducerClass(MyWordCountJob.MyWordCountReducer.class);

        job.setInputFormatClass(MyCombinedFilesInputFormat.class);
        MyCombinedFilesInputFormat.setMaxInputSplitSize(job, 1024 * 1024 * 64);
        job.setOutputFormatClass(TextOutputFormat.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) {
        int result = 0;
        try {
            result = ToolRunner.run(new Configuration(), new MyWordCountJob(), args);
        } catch (Exception e) {
            e.printStackTrace();
        }
        System.exit(result);
    }
}
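After the classes are packaged into a jar, the job can be launched in the usual way; the jar file name below is just a placeholder, and the two arguments are the input and output paths checked in run():

    hadoop jar wordcount.jar wordcount.MyWordCountJob /path/to/input /path/to/output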

The benefits of a custom MyRecordReader do not stop there. A later post on TotalOrderPartitioner will show that, for word-frequency counting, a custom RecordReader is in fact necessary when TotalOrderPartitioner is used.
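For orientation, a typical TotalOrderPartitioner setup in a driver looks roughly like the sketch below; the partition-file path and sampler parameters are illustrative assumptions, not values taken from this series. The relevant point is that InputSampler samples keys directly from the InputFormat, so the input key type has to match the map output key type, which is why the custom RecordReader matters there.

    import org.apache.hadoop.mapreduce.lib.partition.InputSampler;
    import org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner;

    // Rough sketch: inside run(), after the map output key/value classes are set.
    job.setPartitionerClass(TotalOrderPartitioner.class);
    TotalOrderPartitioner.setPartitionFile(job.getConfiguration(),
            new Path("/tmp/wordcount_partitions"));                            // hypothetical partition-file location
    InputSampler.RandomSampler<Text, IntWritable> sampler =
            new InputSampler.RandomSampler<Text, IntWritable>(0.1, 1000, 10);  // illustrative sampling parameters
    InputSampler.writePartitionFile(job, sampler);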

From the ITPUB blog, link: http://blog.itpub.net/30066956/viewspace-2109264/. Please credit the source when reposting; otherwise legal responsibility will be pursued.
