MapReduce Source Code Analysis: Input Splitting and the Processing Flow

Posted by 破棉襖 on 2014-08-12

InputFormat (using TextInputFormat as the example)

FileInputFormat has three important methods:

            1) isSplitable
            2) getSplits
            3) createRecordReader

1. The isSplitable method:
        This method returns a boolean that decides whether the input file will be split into multiple pieces.
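As a reference point, TextInputFormat's override of isSplitable looks roughly like the sketch below (based on the Hadoop 2.x source; details may vary by version): an uncompressed file is always splittable, a compressed file only if its codec supports splitting.

  @Override
  protected boolean isSplitable(JobContext context, Path file) {
    // Pick a compression codec from the file name; plain text files have no codec.
    final CompressionCodec codec =
      new CompressionCodecFactory(context.getConfiguration()).getCodec(file);
    if (null == codec) {
      return true;                                       // uncompressed: always splittable
    }
    return codec instanceof SplittableCompressionCodec;  // e.g. bzip2 can be split, gzip cannot
  }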
  
2. The getSplits method:
        This method returns a List<InputSplit>. If isSplitable returns true, the list contains the file cut into splits; otherwise the whole file is returned as a single, unsplit entry.
        InputSplit is an abstract class; FileSplit extends it and carries the following fields:
            private Path file;         // path of the file
            private long start;        // offset in the file where this split begins
            private long length;       // size of this split in bytes
            private String[] hosts;    // hosts that store this block
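The splitSize used in the source below is derived from the block size and the configured minimum/maximum split sizes. As a rough sketch (following the Hadoop 2.x FileInputFormat; exact names may differ across versions):

  // splitSize = max(minSize, min(maxSize, blockSize))
  protected long computeSplitSize(long blockSize, long minSize, long maxSize) {
    return Math.max(minSize, Math.min(maxSize, blockSize));
  }

  // minSize/maxSize come from mapreduce.input.fileinputformat.split.minsize / .maxsize,
  // which can also be set through the FileInputFormat helpers, for example:
  //   FileInputFormat.setMinInputSplitSize(job, 1L);
  //   FileInputFormat.setMaxInputSplitSize(job, 64L * 1024 * 1024);   // cap splits at 64 MB

By default neither limit is set, so the split size is simply the HDFS block size.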

     Source of the getSplits method:
  /**
   * Generate the list of files and make them into FileSplits.
   * @param job the job context
   * @throws IOException
   */
  public List<InputSplit> getSplits(JobContext job) throws IOException {
    long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job));
    long maxSize = getMaxSplitSize(job);

    // generate splits
    List<InputSplit> splits = new ArrayList<InputSplit>();
    List<FileStatus> files = listStatus(job);    // walks the input directories and their subdirectories and collects the file metadata into a List
    for (FileStatus file: files) {               // iterate over that List and turn each file into one or more splits
      Path path = file.getPath();
      long length = file.getLen();
      if (length != 0) {
        BlockLocation[] blkLocations;
        if (file instanceof LocatedFileStatus) {
          blkLocations = ((LocatedFileStatus) file).getBlockLocations();
        } else {
          FileSystem fs = path.getFileSystem(job.getConfiguration());
          blkLocations = fs.getFileBlockLocations(file, 0, length);
        }
        if (isSplitable(job, path)) {            // the file may be split
          long blockSize = file.getBlockSize();
          long splitSize = computeSplitSize(blockSize, minSize, maxSize);   // compute the split size

          long bytesRemaining = length;
          // SPLIT_SLOP is 1.1: once remaining bytes / splitSize <= 1.1 the loop stops, so the
          // remainder is not split further even if it is somewhat larger than the split size.
          while (((double) bytesRemaining)/splitSize > SPLIT_SLOP) {
            int blkIndex = getBlockIndex(blkLocations, length-bytesRemaining);
            splits.add(makeSplit(path, length-bytesRemaining, splitSize,
                       blkLocations[blkIndex].getHosts()));
            // makeSplit simply builds a FileSplit:
            //   protected FileSplit makeSplit(Path file, long start, long length, String[] hosts) {
            //     return new FileSplit(file, start, length, hosts);
            //   }
            bytesRemaining -= splitSize;         // subtract the bytes already assigned to a split
          }

          if (bytesRemaining != 0) {
            int blkIndex = getBlockIndex(blkLocations, length-bytesRemaining);
            splits.add(makeSplit(path, length-bytesRemaining, bytesRemaining,
                       blkLocations[blkIndex].getHosts()));
          }
        } else { // not splitable: the whole file becomes one split, starting at 0 with the full file length
          splits.add(makeSplit(path, 0, length, blkLocations[0].getHosts()));
        }
      } else {
        //Create empty hosts array for zero length files
        splits.add(makeSplit(path, 0, length, new String[0]));
      }
    }
    // Save the number of input files for metrics/loadgen
    job.getConfiguration().setLong(NUM_INPUT_FILES, files.size());
    LOG.debug("Total # of splits: " + splits.size());
    return splits;
  }
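To make the SPLIT_SLOP logic concrete, assume a 128 MB block size, so splitSize is 128 MB. A 135 MB file gives 135/128 ≈ 1.05 ≤ 1.1, so the loop body never runs and the whole file becomes a single 135 MB split even though it exceeds the block size. A 260 MB file gives 260/128 ≈ 2.03 > 1.1, so the first split covers the first 128 MB; the remaining 132 MB gives 132/128 ≈ 1.03 ≤ 1.1 and becomes the second and final split.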
3. The createRecordReader method:
      This method returns a RecordReader object. Taking LineRecordReader, which supports splits, as the example, the reader has four important methods:
        1) initialize
        2) nextKeyValue
        3) getCurrentKey
        4) getCurrentValue
     1. The initialize method:
  public void initialize(InputSplit genericSplit,
                         TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    this.maxLineLength = job.getInt(MAX_LINE_LENGTH, Integer.MAX_VALUE);
    start = split.getStart();                    // start offset of this split
    end = start + split.getLength();             // end offset of this split (start + length)
    final Path file = split.getPath();

    // open the file and seek to the start of the split
    final FileSystem fs = file.getFileSystem(job);
    fileIn = fs.open(file);

    CompressionCodec codec = new CompressionCodecFactory(job).getCodec(file);   // pick the codec from the file suffix; non-null only when the input file is compressed
    if (null!=codec) {                           // compressed input
      isCompressedInput = true;
      decompressor = CodecPool.getDecompressor(codec);
      if (codec instanceof SplittableCompressionCodec) {
        final SplitCompressionInputStream cIn =
          ((SplittableCompressionCodec)codec).createInputStream(
            fileIn, decompressor, start, end,
            SplittableCompressionCodec.READ_MODE.BYBLOCK);
        if (null == this.recordDelimiterBytes) { // recordDelimiterBytes is the custom record delimiter; a line ends when it is encountered
          in = new LineReader(cIn, job);         // LineReader wraps an InputStream
        } else {
          in = new LineReader(cIn, job, this.recordDelimiterBytes);
        }

        start = cIn.getAdjustedStart();
        end = cIn.getAdjustedEnd();
        filePosition = cIn;
      } else {
        if (null == this.recordDelimiterBytes) {
          in = new LineReader(codec.createInputStream(fileIn, decompressor),
              job);
        } else {
          in = new LineReader(codec.createInputStream(fileIn,
              decompressor), job, this.recordDelimiterBytes);
        }
        filePosition = fileIn;
      }
    } else {
      fileIn.seek(start);
      if (null == this.recordDelimiterBytes) {
        in = new LineReader(fileIn, job);
      } else {
        in = new LineReader(fileIn, job, this.recordDelimiterBytes);
      }

      filePosition = fileIn;
    }
    // If this is not the first split, we always throw away first record
    // because we always (except the last split) read one extra line in
    // next() method.
    if (start != 0) {
      start += in.readLine(new Text(), 0, maxBytesToConsume(start));
    }
    this.pos = start;
  }
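The final readLine call is what handles lines that cross a split boundary. As a worked example, suppose the boundary between split 1 and split 2 falls at byte 100 and one line occupies bytes 90 to 110. Split 1's reader keeps reading as long as its position is still <= 100 (see nextKeyValue below), so it emits the whole 90-110 line even though it ends past the boundary. Split 2's reader starts at byte 100, in the middle of that line, so the readLine above discards the partial record and moves start to byte 111, where the next full line begins. Each line is therefore processed exactly once.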
  2. The nextKeyValue method:
 
  public boolean nextKeyValue() throws IOException {
    if (key == null) {
      key = new LongWritable();
    }
    key.set(pos);                                // pos starts at the split's start offset; the key is the byte offset of the line in the file, not a line number
    if (value == null) {
      value = new Text();
    }
    int newSize = 0;
    // We always read one extra line, which lies outside the upper
    // split limit i.e. (end - 1)
    while (getFilePosition() <= end) {
      newSize = in.readLine(value, maxLineLength,            // read the next line into value
          Math.max(maxBytesToConsume(pos), maxLineLength));
      pos += newSize;
      if (newSize < maxLineLength) {
        break;
      }

      // line too long. try again
      LOG.info("Skipped line of size " + newSize + " at pos " +
               (pos - newSize));
    }
    if (newSize == 0) {
      key = null;
      value = null;
      return false;
    } else {
      return true;
    }
  }
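Note again that the key is the byte offset of the line within the file, not a line number. For a file whose content is "hello\nworld\n", the reader produces (0, "hello") and then (6, "world"), since "hello" plus its newline occupies bytes 0 through 5.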
 3. getCurrentKey and getCurrentValue simply return the current key and value.

Finally, take a look at the Mapper's run method:
  public void run(Context context) throws IOException, InterruptedException {
    setup(context);
    try {
      while (context.nextKeyValue()) {
        map(context.getCurrentKey(), context.getCurrentValue(), context);
      }
    } finally {
      cleanup(context);
    }
  }
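For completeness, here is a minimal sketch of a Mapper that consumes the (offset, line) pairs produced by TextInputFormat and LineRecordReader. WordCountMapper is only an illustrative name, not part of the Hadoop source:

  import java.io.IOException;
  import org.apache.hadoop.io.IntWritable;
  import org.apache.hadoop.io.LongWritable;
  import org.apache.hadoop.io.Text;
  import org.apache.hadoop.mapreduce.Mapper;

  public class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    private final static IntWritable ONE = new IntWritable(1);
    private final Text word = new Text();

    @Override
    protected void map(LongWritable key, Text value, Context context)
        throws IOException, InterruptedException {
      // key is the byte offset of the line, value is the line itself
      for (String token : value.toString().split("\\s+")) {
        if (!token.isEmpty()) {
          word.set(token);
          context.write(word, ONE);           // emit (word, 1) for every token
        }
      }
    }
  }

Mapper.run calls this map method once per (key, value) pair delivered by the RecordReader, which closes the loop from input splits to map invocations.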



This may not be complete; it is mainly a personal reference note.



       




From the ITPUB blog: http://blog.itpub.net/29754888/viewspace-1249907/ (please credit the source when reposting).
