InputFormat（程式碼以TextInputFormat為例）：

FileInputFormat中有三個重要方法：
            1).isSplitable
            2).getSplits
            3).createRecordReader

一.isSplitable方法：
該方法返回值為boolean型別，用來判斷輸入檔案是否可以進行分片。

二.getSplits方法：
該方法返回值為List<InputSplit>。如果isSplitable返回值為true，則該方法返回的是檔案切分之後的各個分片；否則每個檔案整體作為一個分片返回。
InputSplit是一個抽象類，FileSplit繼承該類，FileSplit有以下屬性：
              private Path file; // path of the file this split belongs to
              private long start; // offset within the file at which this split begins
           private long length;      // size of this split in bytes (not necessarily the block size)
            private String[] hosts; // hosts reported for the block at the split's start (see getSplits)

getSplits方法原始碼：

/**

   * Generate the list of files and make them into FileSplits.

   * @param job the job context

   * @throws IOException

   */

  public List<InputSplit> getSplits(JobContext job) throws IOException {
    // Lower bound on the split size: the larger of the format's own minimum
    // and the job's configured minimum. The upper bound comes from the job.
    final long smallestSplit = Math.max(getFormatMinSplitSize(), getMinSplitSize(job));
    final long largestSplit = getMaxSplitSize(job);

    final List<InputSplit> result = new ArrayList<InputSplit>();
    // File metadata for the job's input, as returned by listStatus.
    final List<FileStatus> inputs = listStatus(job);

    for (FileStatus status : inputs) {
      final Path filePath = status.getPath();
      final long fileLen = status.getLen();

      if (fileLen == 0) {
        // Zero-length file: emit a single empty split with no hosts.
        result.add(makeSplit(filePath, 0, fileLen, new String[0]));
        continue;
      }

      // Reuse block locations already attached to the status when available;
      // otherwise ask the file system for them.
      final BlockLocation[] locations;
      if (status instanceof LocatedFileStatus) {
        locations = ((LocatedFileStatus) status).getBlockLocations();
      } else {
        locations = filePath.getFileSystem(job.getConfiguration())
            .getFileBlockLocations(status, 0, fileLen);
      }

      if (!isSplitable(job, filePath)) {
        // Not splittable: the whole file becomes one split starting at
        // offset 0, using the hosts of the first block.
        result.add(makeSplit(filePath, 0, fileLen, locations[0].getHosts()));
        continue;
      }

      final long splitSize =
          computeSplitSize(status.getBlockSize(), smallestSplit, largestSplit);

      // Carve off full-size splits while the unassigned remainder exceeds
      // SPLIT_SLOP times the split size. A remainder at or below that
      // threshold is not cut again, so the last split may be somewhat larger
      // than splitSize. SPLIT_SLOP is defined outside this excerpt (1.1 per
      // the original annotation).
      // Per the original note, makeSplit(file, start, length, hosts) simply
      // returns new FileSplit(file, start, length, hosts).
      long offset = 0;
      while (((double) (fileLen - offset)) / splitSize > SPLIT_SLOP) {
        final int blockIdx = getBlockIndex(locations, offset);
        result.add(makeSplit(filePath, offset, splitSize, locations[blockIdx].getHosts()));
        offset += splitSize;
      }

      // Whatever is left over forms the final split of this file.
      final long tail = fileLen - offset;
      if (tail != 0) {
        final int blockIdx = getBlockIndex(locations, offset);
        result.add(makeSplit(filePath, offset, tail, locations[blockIdx].getHosts()));
      }
    }

    // Save the number of input files for metrics/loadgen
    job.getConfiguration().setLong(NUM_INPUT_FILES, inputs.size());
    LOG.debug("Total # of splits: " + result.size());
    return result;
  }

三.createRecordReader方法：
該方法返回一個RecordReader物件。以支援分片的LineRecordReader為例：
該物件有四個重要方法：
1).initialize
2).nextKeyValue
3).getCurrentKey
4).getCurrentValue

1.initialize方法：

/**
 * Prepares this reader for one split: opens the split's file, wraps it in a
 * LineReader (through a decompressing stream when a codec is found for the
 * file), and positions the reader at the first record this split owns.
 *
 * @param genericSplit the split to read; it is cast to FileSplit
 * @param context task context that supplies the Configuration
 * @throws IOException raised by the underlying file, codec or reader calls
 */
public void initialize(InputSplit genericSplit,
    TaskAttemptContext context) throws IOException {
  final FileSplit fileSplit = (FileSplit) genericSplit;
  final Configuration conf = context.getConfiguration();
  this.maxLineLength = conf.getInt(MAX_LINE_LENGTH, Integer.MAX_VALUE);

  // Range covered by this split: [start, end).
  start = fileSplit.getStart();
  end = start + fileSplit.getLength();

  // Open the file that the split refers to.
  final Path path = fileSplit.getPath();
  fileIn = path.getFileSystem(conf).open(path);

  // Look up a codec for the file path; a null result is treated as
  // uncompressed input. (Hadoop documents the match as being on the
  // filename suffix; see CompressionCodecFactory.)
  final CompressionCodec codec = new CompressionCodecFactory(conf).getCodec(path);

  // In every branch below, a null recordDelimiterBytes means no custom
  // record delimiter was supplied, so the two-argument LineReader
  // constructor is used.
  if (codec == null) {
    // Uncompressed input: jump straight to the beginning of the split.
    fileIn.seek(start);
    in = (this.recordDelimiterBytes == null)
        ? new LineReader(fileIn, conf)
        : new LineReader(fileIn, conf, this.recordDelimiterBytes);
    filePosition = fileIn;
  } else {
    isCompressedInput = true;
    decompressor = CodecPool.getDecompressor(codec);
    if (codec instanceof SplittableCompressionCodec) {
      final SplitCompressionInputStream splitStream =
          ((SplittableCompressionCodec) codec).createInputStream(
              fileIn, decompressor, start, end,
              SplittableCompressionCodec.READ_MODE.BYBLOCK);
      in = (this.recordDelimiterBytes == null)
          ? new LineReader(splitStream, conf)
          : new LineReader(splitStream, conf, this.recordDelimiterBytes);
      // The codec may move the split boundaries; adopt its adjusted values.
      start = splitStream.getAdjustedStart();
      end = splitStream.getAdjustedEnd();
      filePosition = splitStream;
    } else {
      // Codec without split support: decompress from the raw file stream.
      in = (this.recordDelimiterBytes == null)
          ? new LineReader(codec.createInputStream(fileIn, decompressor), conf)
          : new LineReader(codec.createInputStream(fileIn, decompressor), conf,
              this.recordDelimiterBytes);
      filePosition = fileIn;
    }
  }

  // If this is not the first split, we always throw away the first record,
  // because every split (except the last) reads one extra line in
  // nextKeyValue().
  if (start != 0) {
    final int discarded = in.readLine(new Text(), 0, maxBytesToConsume(start));
    start += discarded;
  }
  this.pos = start;
}

2.nextKeyValue方法：

/**
 * Advances to the next record of the split.
 *
 * The key is set to the current position and a line is read into the value.
 * Lines for which readLine reports maxLineLength bytes or more are logged and
 * skipped.
 *
 * @return false, after resetting key and value to null, when the last read
 *         consumed zero bytes or no read was attempted because the file
 *         position is already past end; true otherwise
 * @throws IOException raised by the underlying reader
 */
public boolean nextKeyValue() throws IOException {
  if (key == null) {
    key = new LongWritable();
  }
  // pos is a file position (initialize() seeds it from the split's start
  // offset), so the key is an offset into the file rather than a line number.
  key.set(pos);

  if (value == null) {
    value = new Text();
  }

  int bytesRead = 0;
  // We always read one extra line, which lies outside the upper
  // split limit i.e. (end - 1)
  while (getFilePosition() <= end) {
    final int readLimit = Math.max(maxBytesToConsume(pos), maxLineLength);
    bytesRead = in.readLine(value, maxLineLength, readLimit); // fills value
    pos += bytesRead;
    if (bytesRead >= maxLineLength) {
      // line too long. try again
      LOG.info("Skipped line of size " + bytesRead + " at pos " +
          (pos - bytesRead));
      continue;
    }
    break;
  }

  if (bytesRead != 0) {
    return true;
  }
  // Nothing was consumed: no more records for this split.
  key = null;
  value = null;
  return false;
}

3.getCurrentKey 和 getCurrentValue：分別返回 nextKeyValue 最近一次讀取到的 key 和 value。

看一下Mapper的run方法：

/**
 * Drives the task: calls setup once, then map for every key/value pair the
 * context yields, and finally cleanup. Because cleanup sits in a finally
 * block, it also runs when the loop exits with an exception.
 *
 * @param context supplies the records and is passed on to setup, map and cleanup
 * @throws IOException declared; propagated from the calls made here
 * @throws InterruptedException declared; propagated from the calls made here
 */
public void run(Context context) throws IOException, InterruptedException {
  setup(context);
  try {
    boolean hasRecord = context.nextKeyValue();
    while (hasRecord) {
      map(context.getCurrentKey(), context.getCurrentValue(), context);
      hasRecord = context.nextKeyValue();
    }
  } finally {
    cleanup(context);
  }
}

可能不是很全面，主要目的為個人備忘

Mapreduce原始碼分析分片、處理流程

相關文章