MapReduce and Large Numbers of Small Files

Posted by 破棉襖 on 2014-07-23

We know that the number of map tasks is normally determined by the number of input splits, so when the input consists of a huge number of small files, a huge number of map tasks may be launched, each doing only a tiny amount of work.
What we want instead is for a single map task to handle several small files. My first thought was to write my own InputFormat, RecordReader and FileSplit, but it turns out Hadoop already
provides CombineFileSplit and CombineFileInputFormat for exactly this purpose. (The split logic of the stock CombineFileInputFormat can cause data skew; more on that later.)


CombineFileInputFormat works roughly like this: it packs the metadata of multiple input files (the small files) into a single CombineFileSplit. Since small files are single-block files in HDFS (one file, one block), a CombineFileSplit describes a group of file blocks, holding each file's start offset, length, block locations and other metadata.

(Most of the code below is taken from other people's blog posts; I am only recording it here as a memo after digesting it. My thanks to the original authors.)

1. First, let's look at the source of CombineFileSplit (just five simple fields):
    import java.io.DataInput;
    import java.io.DataOutput;
    import java.io.IOException;

    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.io.Writable;
    import org.apache.hadoop.mapreduce.InputSplit;

    public class CombineFileSplit extends InputSplit implements Writable {

      private Path[] paths;        // the small files packed into this split
      private long[] startoffset;  // start offset within each file (0 when a whole small file is read)
      private long[] lengths;      // length to read from each file
      private String[] locations;  // hosts where the split's data resides
      private long totLength;      // sum of all lengths

      /**
       * default constructor
       */
      public CombineFileSplit() {}

      public CombineFileSplit(Path[] files, long[] start,
                              long[] lengths, String[] locations) {
        initSplit(files, start, lengths, locations);
      }

      public CombineFileSplit(Path[] files, long[] lengths) {
        long[] startoffset = new long[files.length];
        for (int i = 0; i < startoffset.length; i++) {
          startoffset[i] = 0;
        }
        String[] locations = new String[files.length];
        for (int i = 0; i < locations.length; i++) {
          locations[i] = "";
        }
        initSplit(files, startoffset, lengths, locations);
      }

      private void initSplit(Path[] files, long[] start,
                             long[] lengths, String[] locations) {
        this.startoffset = start;
        this.lengths = lengths;
        this.paths = files;
        this.totLength = 0;
        this.locations = locations;
        for(long length : lengths) {
          totLength += length;
        }
      }

      /**
       * Copy constructor
       */
      public CombineFileSplit(CombineFileSplit old) throws IOException {
        this(old.getPaths(), old.getStartOffsets(),
             old.getLengths(), old.getLocations());
      }

      public long getLength() {
        return totLength;
      }

      /** Returns an array containing the start offsets of the files in the split*/
      public long[] getStartOffsets() {
        return startoffset;
      }

      /** Returns an array containing the lengths of the files in the split*/
      public long[] getLengths() {
        return lengths;
      }

      /** Returns the start offset of the i<sup>th</sup> Path */
      public long getOffset(int i) {
        return startoffset[i];
      }

      /** Returns the length of the i<sup>th</sup> Path */
      public long getLength(int i) {
        return lengths[i];
      }

      /** Returns the number of Paths in the split */
      public int getNumPaths() {
        return paths.length;
      }

      /** Returns the i<sup>th</sup> Path */
      public Path getPath(int i) {
        return paths[i];
      }

      /** Returns all the Paths in the split */
      public Path[] getPaths() {
        return paths;
      }

      /** Returns all the Paths where this input-split resides */
      public String[] getLocations() throws IOException {
        return locations;
      }

      public void readFields(DataInput in) throws IOException {
        totLength = in.readLong();
        int arrLength = in.readInt();
        lengths = new long[arrLength];
        for(int i=0; i<arrLength;i++) {
          lengths[i] = in.readLong();
        }
        int filesLength = in.readInt();
        paths = new Path[filesLength];
        for(int i=0; i<filesLength;i++) {
          paths[i] = new Path(Text.readString(in));
        }
        arrLength = in.readInt();
        startoffset = new long[arrLength];
        for(int i=0; i<arrLength;i++) {
          startoffset[i] = in.readLong();
        }
      }

      public void write(DataOutput out) throws IOException {
        out.writeLong(totLength);
        out.writeInt(lengths.length);
        for(long length : lengths) {
          out.writeLong(length);
        }
        out.writeInt(paths.length);
        for(Path p : paths) {
          Text.writeString(out, p.toString());
        }
        out.writeInt(startoffset.length);
        for(long length : startoffset) {
          out.writeLong(length);
        }
      }

      @Override
      public String toString() {
        StringBuffer sb = new StringBuffer();
        for (int i = 0; i < paths.length; i++) {
          if (i == 0 ) {
            sb.append("Paths:");
          }
          sb.append(paths[i].toUri().getPath() + ":" + startoffset[i] +
                    "+" + lengths[i]);
          if (i < paths.length -1) {
            sb.append(",");
          }
        }
        if (locations != null) {
          String locs = "";
          StringBuffer locsb = new StringBuffer();
          for (int i = 0; i < locations.length; i++) {
            locsb.append(locations[i] + ":");
          }
          locs = locsb.toString();
          sb.append(" Locations:" + locs + "; ");
        }
        return sb.toString();
      }
    }
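
To make the structure concrete, here is a minimal sketch (not from the original post; the file names and sizes are made up) that packs two small files into one CombineFileSplit and then queries its metadata:

    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.mapreduce.lib.input.CombineFileSplit;

    public class CombineFileSplitDemo {
        public static void main(String[] args) throws Exception {
            // Two hypothetical small files; only their metadata goes into the split.
            Path[] files   = { new Path("/tmp/a.txt"), new Path("/tmp/b.txt") };
            long[] lengths = { 1024L, 2048L };

            // The two-argument constructor sets every start offset to 0,
            // i.e. each file is read from its beginning.
            CombineFileSplit split = new CombineFileSplit(files, lengths);

            System.out.println(split.getNumPaths()); // 2
            System.out.println(split.getLength());   // 3072, the sum of all file lengths
            System.out.println(split.getPath(1));    // /tmp/b.txt
        }
    }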

2. Next we need to implement a RecordReader; it is really just a wrapper around LineRecordReader:
    import java.io.IOException;

    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.InputSplit;
    import org.apache.hadoop.mapreduce.RecordReader;
    import org.apache.hadoop.mapreduce.TaskAttemptContext;
    import org.apache.hadoop.mapreduce.lib.input.CombineFileSplit;
    import org.apache.hadoop.mapreduce.lib.input.FileSplit;
    import org.apache.hadoop.mapreduce.lib.input.LineRecordReader;

    public class CombineSmallfileRecordReader extends RecordReader<LongWritable, Text> {

        private CombineFileSplit combineFileSplit;
        private LineRecordReader lineRecordReader = new LineRecordReader();
        private Path[] paths;
        private int totalLength;
        private int currentIndex;
        private float currentProgress = 0;
        private LongWritable currentKey;
        private Text currentValue = new Text();

        public CombineSmallfileRecordReader(CombineFileSplit combineFileSplit, TaskAttemptContext context, Integer index) throws IOException {
            super();
            this.combineFileSplit = combineFileSplit;
            this.currentIndex = index; // index of the small-file block this reader handles within the CombineFileSplit
        }

        @Override
        public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
            this.combineFileSplit = (CombineFileSplit) split;
            // To read one small-file block of the CombineFileSplit with a LineRecordReader,
            // we first wrap it in a FileSplit and then initialize the LineRecordReader with it.
            FileSplit fileSplit = new FileSplit(combineFileSplit.getPath(currentIndex), combineFileSplit.getOffset(currentIndex), combineFileSplit.getLength(currentIndex), combineFileSplit.getLocations());
            lineRecordReader.initialize(fileSplit, context);

            this.paths = combineFileSplit.getPaths();
            totalLength = paths.length;
            context.getConfiguration().set("map.input.file.name", combineFileSplit.getPath(currentIndex).getName());
        }

        @Override
        public LongWritable getCurrentKey() throws IOException, InterruptedException {
            currentKey = lineRecordReader.getCurrentKey();
            return currentKey;
        }

        @Override
        public Text getCurrentValue() throws IOException, InterruptedException {
            currentValue = lineRecordReader.getCurrentValue();
            return currentValue;
        }

        @Override
        public boolean nextKeyValue() throws IOException, InterruptedException {
            if (currentIndex >= 0 && currentIndex < totalLength) {
                return lineRecordReader.nextKeyValue();
            } else {
                return false;
            }
        }

        @Override
        public float getProgress() throws IOException {
            if (currentIndex >= 0 && currentIndex < totalLength) {
                currentProgress = (float) currentIndex / totalLength;
                return currentProgress;
            }
            return currentProgress;
        }

        @Override
        public void close() throws IOException {
            lineRecordReader.close();
        }
    }
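
Note that initialize() above also stores the name of the small file currently being read under the "map.input.file.name" configuration key, so a mapper can tell which file each record came from. Here is a minimal sketch of such a mapper (the class name and the emit-file-name-per-line behaviour are my own illustration, not part of the original post):

    import java.io.IOException;

    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Mapper;

    public class SmallFileAwareMapper extends Mapper<LongWritable, Text, Text, Text> {

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // Read back the file name that CombineSmallfileRecordReader.initialize() put
            // into the task configuration; it is updated as the reader moves on to the
            // next small file in the CombineFileSplit.
            String fileName = context.getConfiguration().get("map.input.file.name", "unknown");
            // Emit <file name, line> so the output records where each line came from.
            context.write(new Text(fileName), value);
        }
    }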

3. Then implement a subclass of CombineFileInputFormat. CombineFileRecordReader instantiates the reader class it is given (here CombineSmallfileRecordReader) via reflection, once per file in the split, which is why the reader's constructor above must take exactly (CombineFileSplit, TaskAttemptContext, Integer):
    import java.io.IOException;

    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.InputSplit;
    import org.apache.hadoop.mapreduce.RecordReader;
    import org.apache.hadoop.mapreduce.TaskAttemptContext;
    import org.apache.hadoop.mapreduce.lib.input.CombineFileInputFormat;
    import org.apache.hadoop.mapreduce.lib.input.CombineFileRecordReader;
    import org.apache.hadoop.mapreduce.lib.input.CombineFileSplit;

    public class CombineSmallfileInputFormat extends CombineFileInputFormat<LongWritable, Text> {

        @Override
        public RecordReader<LongWritable, Text> createRecordReader(InputSplit split, TaskAttemptContext context) throws IOException {

            CombineFileSplit combineFileSplit = (CombineFileSplit) split;
            // CombineFileRecordReader walks the files in the split, creating one
            // CombineSmallfileRecordReader per file.
            CombineFileRecordReader<LongWritable, Text> recordReader = new CombineFileRecordReader<LongWritable, Text>(combineFileSplit, context, CombineSmallfileRecordReader.class);
            try {
                recordReader.initialize(combineFileSplit, context);
            } catch (InterruptedException e) {
                throw new RuntimeException("Error initializing CombineSmallfileRecordReader.", e);
            }
            return recordReader;
        }

    }

4. At this point the small-file input problem is solved, but let's look at how CombineFileInputFormat actually builds its splits.

        CombineFileInputFormat uses three parameters to control how splits are formed: maxSplitSize, minSizeNode and minSizeRack:
        1. If maxSplitSize ("mapreduce.input.fileinputformat.split.maxsize") is set, blocks on the same node are combined, and a new split is emitted each time the accumulated size exceeds maxSplitSize. If it is not set, the node's blocks are only collected and no split is formed yet.
        2. If minSizeNode ("mapreduce.input.fileinputformat.split.minsize.per.node") is set, the blocks left over from step 1 on that node are combined into one split if their total exceeds minSizeNode; otherwise they are handed up to be combined with the blocks of the same rack.
        3. Every node is processed in the same way; then all the blocks of the whole rack are processed by the rule in step 1. For whatever remains, if minSizeRack ("mapreduce.input.fileinputformat.split.minsize.per.rack") is set and the total exceeds minSizeRack, it all becomes one split; otherwise these blocks are kept back, waiting to be combined with the leftover blocks from all racks.

      Every rack is processed by steps 1-3; the leftovers from all racks are then combined once more by the rule in step 1, and whatever still remains becomes one final split.

From this logic we can conclude:

If only maxSplitSize is set (e.g. job.getConfiguration().set("mapreduce.input.fileinputformat.split.maxsize", "33554432")), then essentially every split is filled up to maxSplitSize (32 MB in this example).

If none of maxSplitSize, minSizeNode and minSizeRack is set, the entire input ends up in a single split!
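
For reference, a driver-side sketch of setting these three parameters through the configuration keys quoted above (the 128 MB and 64 MB values are placeholders to be tuned per job):

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.mapreduce.Job;

    public class CombineSplitSizeConfig {
        public static void main(String[] args) throws Exception {
            Configuration conf = new Configuration();
            // Blocks on one node are packed into splits of at most 128 MB.
            conf.setLong("mapreduce.input.fileinputformat.split.maxsize", 128L * 1024 * 1024);
            // Node-level leftovers below 64 MB are pushed up to the rack level.
            conf.setLong("mapreduce.input.fileinputformat.split.minsize.per.node", 64L * 1024 * 1024);
            // Rack-level leftovers below 64 MB are pushed up to the global leftover pool.
            conf.setLong("mapreduce.input.fileinputformat.split.minsize.per.rack", 64L * 1024 * 1024);

            Job job = Job.getInstance(conf, "combine-small-files");
            job.setInputFormatClass(CombineSmallfileInputFormat.class);
            // ... remaining job setup (mapper, reducer, input/output paths) as usual ...
        }
    }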

The split mechanism CombineFileInputFormat provides is genuinely complex, and it can produce data skew. So let's override getSplits ourselves with something simple and practical:

    import java.io.IOException;
    import java.util.ArrayList;
    import java.util.List;

    import org.apache.commons.logging.Log;
    import org.apache.commons.logging.LogFactory;
    import org.apache.hadoop.fs.FileStatus;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.mapreduce.InputSplit;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.JobContext;
    import org.apache.hadoop.mapreduce.lib.input.CombineFileSplit;

    // Assumption: the original post omits the class declaration; extending the
    // CombineSmallfileInputFormat defined above lets us reuse its createRecordReader().
    public class MultiFileInputFormat extends CombineSmallfileInputFormat {

        private static final Log LOG = LogFactory.getLog(MultiFileInputFormat.class);
        public static final String CONFNAME_INPUT_SPLIT_MAX_NUM = "multifileinputformat.max_split_num";
        public static final Integer DEFAULT_MAX_SPLIT_NUM = 50;

        public static void setMaxInputSplitNum(Job job, Integer maxSplitNum) {
            job.getConfiguration().setInt(CONFNAME_INPUT_SPLIT_MAX_NUM, maxSplitNum);
        }

        @Override
        public List<InputSplit> getSplits(JobContext job) throws IOException {
            // get all the files in the input path
            List<FileStatus> stats = listStatus(job);
            List<InputSplit> splits = new ArrayList<InputSplit>();
            if (stats.size() == 0) {
                return splits;
            }
            // compute the average split length
            long totalLen = 0;
            for (FileStatus stat : stats) {
                totalLen += stat.getLen();
            }
            int maxSplitNum = job.getConfiguration().getInt(CONFNAME_INPUT_SPLIT_MAX_NUM, DEFAULT_MAX_SPLIT_NUM);
            int expectSplitNum = maxSplitNum < stats.size() ? maxSplitNum : stats.size();
            long averageLen = totalLen / expectSplitNum;
            LOG.info("Prepare InputSplit : averageLen(" + averageLen + ") totalLen(" + totalLen
                    + ") expectSplitNum(" + expectSplitNum + ") ");
            // build the input splits
            List<Path> pathLst = new ArrayList<Path>();
            List<Long> offsetLst = new ArrayList<Long>();
            List<Long> lengthLst = new ArrayList<Long>();
            long currentLen = 0;
            for (int i = 0; i < stats.size(); i++) {
                FileStatus stat = stats.get(i);
                pathLst.add(stat.getPath());
                offsetLst.add(0L);
                lengthLst.add(stat.getLen());
                currentLen += stat.getLen();
                if (splits.size() < expectSplitNum - 1 && currentLen > averageLen) {
                    Path[] pathArray = new Path[pathLst.size()];
                    CombineFileSplit thissplit = new CombineFileSplit(pathLst.toArray(pathArray),
                        getLongArray(offsetLst), getLongArray(lengthLst), new String[0]);
                    LOG.info("combineFileSplit(" + splits.size() + ") fileNum(" + pathLst.size()
                            + ") length(" + currentLen + ")");
                    splits.add(thissplit);

                    pathLst.clear();
                    offsetLst.clear();
                    lengthLst.clear();
                    currentLen = 0;
                }
            }
            if (pathLst.size() > 0) {
                Path[] pathArray = new Path[pathLst.size()];
                CombineFileSplit thissplit =
                        new CombineFileSplit(pathLst.toArray(pathArray), getLongArray(offsetLst),
                                getLongArray(lengthLst), new String[0]);
                LOG.info("combineFileSplit(" + splits.size() + ") fileNum(" + pathLst.size()
                        + ") length(" + currentLen + ")");
                splits.add(thissplit);
            }
            return splits;
        }

        private long[] getLongArray(List<Long> lst) {
            long[] rst = new long[lst.size()];
            for (int i = 0; i < lst.size(); i++) {
                rst[i] = lst.get(i);
            }
            return rst;
        }
    }
This way the number of map tasks can be controlled precisely through a single parameter, and the data is distributed evenly across the maps.
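
To close the loop, here is a hypothetical end-to-end driver sketch (the class name, the split cap of 20, and the pass-through identity mapper/reducer are my own choices for illustration) that wires the custom format into a job:

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

    public class SmallFilesJobDriver {
        public static void main(String[] args) throws Exception {
            Configuration conf = new Configuration();
            Job job = Job.getInstance(conf, "combine-small-files");
            job.setJarByClass(SmallFilesJobDriver.class);

            // Read the many small files through the custom format and cap the
            // number of splits (and therefore map tasks) at 20.
            job.setInputFormatClass(MultiFileInputFormat.class);
            MultiFileInputFormat.setMaxInputSplitNum(job, 20);

            // No mapper/reducer set: the default identity classes simply pass the
            // <offset, line> records through, which is enough to observe the splits.
            job.setOutputKeyClass(LongWritable.class);
            job.setOutputValueClass(Text.class);

            FileInputFormat.addInputPath(job, new Path(args[0]));
            FileOutputFormat.setOutputPath(job, new Path(args[1]));

            System.exit(job.waitForCompletion(true) ? 0 : 1);
        }
    }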


From the "ITPUB blog"; original link: http://blog.itpub.net/29754888/viewspace-1225105/. Please credit the source if reposting.
