MapReduce —— MapTask階段原始碼分析(Output環節)

Lau發表於2021-06-11

Dream car 鎮樓 ~ !

MapReduce —— MapTask階段原始碼分析(Output環節)

接上一節Input環節,接下來分析 output環節。程式碼在runNewMapper()方法中:

/**
 * Excerpt of runNewMapper(), trimmed to the part that chooses the map-side
 * output writer. Dotted lines mark code that the article left out.
 *
 * NOTE(review): the NewOutputCollector constructor and write(K, V) are pasted
 * inline between the marker comments for reading convenience; presumably they
 * are members of the NewOutputCollector class, not statements of this method.
 */
private <INKEY,INVALUE,OUTKEY,OUTVALUE>
  void runNewMapper(final JobConf job,final TaskSplitIndex splitIndex,
  final TaskUmbilicalProtocol umbilical,TaskReporter reporter) {
                          .......
       // Per the article, this writer also becomes part of the mapper's context
       // (that code is elided here), so output emitted in map() ends up in output.write()
      org.apache.hadoop.mapreduce.RecordWriter output = null;
       // Keep the value 0 in mind: zero reduce tasks selects the direct collector
    if (job.getNumReduceTasks() == 0) {  // branch on the number of reduce tasks
      output = 
        new NewDirectOutputCollector(taskContext, job, umbilical, reporter);
    } else {    // > 0
        // Create the collector; its constructor (below) shows that output is partitioned
      output = new NewOutputCollector(taskContext, job, umbilical, reporter);
    }
                                 
//  -----------new NewOutputCollector() begin ------------------
    NewOutputCollector(org.apache.hadoop.mapreduce.JobContext jobContext,
                       JobConf job,
                       TaskUmbilicalProtocol umbilical,
                       TaskReporter reporter
                       ) throws IOException, ClassNotFoundException {
        // 1. Create the underlying collector; createSortingCollector is covered in the next section
      collector = createSortingCollector(job, reporter);
        
        // 2. One partition per reduce task
        // Reminder (article): a partition can hold several groups; records with the same key form one group
      partitions = jobContext.getNumReduceTasks();
        
      if (partitions > 1) {
        partitioner = (org.apache.hadoop.mapreduce.Partitioner<K,V>)
            // Usual pattern: instantiate by reflection the partitioner class reported by the job context
            // Per the article, the default partitioner is hash-modulo, so equal keys land in the same partition
          ReflectionUtils.newInstance(jobContext.getPartitionerClass(), job);
          
      } else {  // not more than one partition: every record goes to the same partition
        partitioner = new org.apache.hadoop.mapreduce.Partitioner<K,V>() {
           // Returns partitions - 1, which is 0 when there is exactly one reduce task
          public int getPartition(K key, V value, int numPartitions) {
            return partitions - 1;
          }
        };
      }
    }
//  -----------new NewOutputCollector()  end ------------------
   
                                 
//  -----------write(K key, V value) begin ------------------
     // Every record is handed to the collector as a (key, value, partition) triple
    public void write(K key, V value) throws IOException, InterruptedException {
      collector.collect(key, value,
                        partitioner.getPartition(key, value, partitions));
//  -----------write(K key, V value) end --------------------
        
                             ..............                          

  }

createSortingCollector(job, reporter)方法進去:

/**
 * Instantiates the map-side output collector and initializes it.
 *
 * <p>The implementation class is looked up in the job configuration under
 * {@code JobContext.MAP_OUTPUT_COLLECTOR_CLASS_ATTR}, with
 * {@code MapOutputBuffer} passed as the default. The instance is created by
 * reflection and must be initialized before use, so {@code init} is called
 * here with a context built from this task, the job and the reporter.
 *
 * @param job      job configuration used for the class lookup and instantiation
 * @param reporter reporter handed to the collector through its context
 * @return the initialized collector
 * @throws IOException            declared by this method; the body's calls may raise it
 * @throws ClassNotFoundException declared by this method; the body's calls may raise it
 */
private <KEY, VALUE> MapOutputCollector<KEY, VALUE>
          createSortingCollector(JobConf job, TaskReporter reporter)
    throws IOException, ClassNotFoundException {

    // Step 1: resolve which collector implementation to use
    // (user-configured class if present, otherwise MapOutputBuffer).
    final Class<? extends MapOutputCollector> collectorClass =
        job.getClass(JobContext.MAP_OUTPUT_COLLECTOR_CLASS_ATTR,
                     MapOutputBuffer.class, MapOutputCollector.class);

    // Step 2: create the instance reflectively.
    @SuppressWarnings("unchecked")
    final MapOutputCollector<KEY, VALUE> sortingCollector =
        (MapOutputCollector<KEY, VALUE>)
            ReflectionUtils.newInstance(collectorClass, job);

    // Step 3: initialize it with a context describing this task.
    sortingCollector.init(new MapOutputCollector.Context(this, job, reporter));
    return sortingCollector;
  }

重頭戲了,進入初始化環節:collector.init(context) ,刪除非核心程式碼,清清爽爽開開心心讀原始碼 ~

    /**
     * Initializes the collector from the given context: reads job settings,
     * allocates the in-memory buffer, creates the sorter and comparator, and
     * starts the spill thread. Dotted lines mark code that the article removed.
     *
     * @param context supplies the job configuration, the reporter and the map task
     */
    public void init(MapOutputCollector.Context context)  {
      // 0. Plain field wiring from the context
      job = context.getJobConf();
      reporter = context.getReporter();
      mapTask = context.getMapTask();
      mapOutputFile = mapTask.getMapOutputFile();
      sortPhase = mapTask.getSortPhase();
      spilledRecordsCounter = reporter.getCounter(TaskCounter.SPILLED_RECORDS);
      partitions = job.getNumReduceTasks();
      rfs = ((LocalFileSystem)FileSystem.getLocal(job)).getRaw();

      // 1. Spill threshold, default 0.8 of the buffer; the remaining 0.2 stays usable
      final float spillper =
        job.getFloat(JobContext.MAP_SORT_SPILL_PERCENT, (float)0.8);
        
        // 2. Buffer size in megabytes, default 100
      final int sortmb = job.getInt(JobContext.IO_SORT_MB, 100);
      indexCacheMemoryLimit = job.getInt(JobContext.INDEX_CACHE_MEMORY_LIMIT,
                                         INDEX_CACHE_MEMORY_LIMIT_DEFAULT);
        
        // 3. Sorter: the class configured under "map.sort.class", QuickSort by default
        // Sorting boils down to comparing keys, which is why a comparator is obtained further down
      sorter = ReflectionUtils.newInstance(job.getClass("map.sort.class",
            QuickSort.class, IndexedSorter.class), job);
        
      //-------------------- set-up of what the article calls the circular buffer ---------------
      // megabytes -> bytes (shift by 20 multiplies by 1024 * 1024)
      // NOTE(review): no range check on sortmb is visible in this excerpt; a large value would overflow the int
      int maxMemUsage = sortmb << 20;
      // round down to a multiple of METASIZE
      maxMemUsage -= maxMemUsage % METASIZE;
      kvbuffer = new byte[maxMemUsage];
      bufvoid = kvbuffer.length;
      // int-typed view over the same backing array, in native byte order
      kvmeta = ByteBuffer.wrap(kvbuffer)
         .order(ByteOrder.nativeOrder())
         .asIntBuffer();
      // setEquator is not shown here; presumably it sets equator and kvindex, which are read just below
      setEquator(0);
      bufstart = bufend = bufindex = equator;
      kvstart = kvend = kvindex;

      // number of NMETA-int entries that fit into the int view
      maxRec = kvmeta.capacity() / NMETA;
      // byte count corresponding to the spill threshold
      softLimit = (int)(kvbuffer.length * spillper);
      bufferRemaining = softLimit;
       //--------------------------------------------------------------------

      // k/v serialization
      // 4. Comparator used for sorting, obtained from the job configuration.
      // Per the article: a user-defined one is used if set, otherwise the key type's own comparator
      comparator = job.getOutputKeyComparator();
        .............

      // output counters
       .............

      // compression of the map output
         ............
            
      // combiner: per the article, merges records with equal keys on the map side to cut
      // the data reducers must fetch; it may run one or more times (details in a later article)
        .............
            
      // 5. Spill thread
      // Per the article: once the buffer is 80% full its contents are written to disk while
      // the map thread keeps writing into the rest of the buffer, and the two sides avoid
      // colliding because data and metadata are written in opposite directions.
      spillInProgress = false;
      minSpillsForCombine = job.getInt(JobContext.MAP_COMBINE_MIN_SPILLS, 3);
      spillThread.setDaemon(true);
      spillThread.setName("SpillThread");
      spillLock.lock();
      try {
        spillThread.start();
        // block until the spill thread has set spillThreadRunning and signalled spillDone
        while (!spillThreadRunning) {
          spillDone.await();
        }
      } catch (InterruptedException e) {
        // NOTE(review): the interrupt is silently dropped in this trimmed excerpt;
        // confirm against the full source whether it is rethrown there.
      } finally {
        spillLock.unlock();
      }
    }

後邊原始碼也沒必要一行行看了,直接文字總結描述了

MapOutputBuffer:

map 輸出的K-V會被序列化成位元組陣列,計算出分割槽號,最終是三元組<k,v,p>

buffer 是map過程使用到的環形緩衝區:

  • 本質是位元組陣列;
  • 赤道:兩端分別存放K-V,索引;
  • 索引:對K-V的索引,固定長度16B,4個int:分割槽號P,K的偏移量,V的偏移量,V的資料長度;
  • 緩衝區中的資料量達到閾值(預設 80%)時,喚醒溢寫執行緒(該執行緒在 init() 中就已啟動,平時處於等待狀態);
  • 快速排序 80%的資料,同時Map輸出的執行緒向緩衝區的剩餘部分寫入;
  • 快速排序的過程,比較的是key,但是移動的是索引;
  • 溢寫時只要按排序後的索引順序依次讀取K-V,寫出的資料就是有序的;

注意:排序是兩級排序(先按分割槽號,再按key),不要與MapReduce中對value排序的「二次排序」混淆:

  • 分割槽有序:reduce拉取資料是按照分割槽拉取;
  • 分割槽內key 有序:因為reduce計算是按照分組計算;

調優:在溢寫過程中會發生combiner

  • 其實就是一個 map 裡的reduce,按照組進行統計;
  • 發生時間點:排序之後相同的key放在一起了,開始combiner,然後溢寫;
  • minSpillsForCombine = job.getInt(JobContext.MAP_COMBINE_MIN_SPILLS, 3):map輸出過程中buffer會溢寫出多個小檔案,map結束時這些小檔案總會被合併成一個輸出檔案,避免檔案的碎片化【小檔案問題,後邊還會提及】;當溢寫檔案的個數達到 minSpillsForCombine(預設3個)且配置了combiner時,合併過程中會再執行一次combiner

附 溢寫執行緒相關原始碼:

/**
 * Background thread that repeatedly waits for a spill request and then runs
 * sortAndSpill(). Coordination with the requesting side goes through
 * spillLock, the spillDone / spillReady conditions and the
 * spillInProgress / spillThreadRunning flags.
 */
protected class SpillThread extends Thread {
      @Override
      public void run() {
        spillLock.lock();
        // announce that the thread is up; init() waits for this flag
        spillThreadRunning = true;
        try {
          while (true) {
            // wake anyone waiting on spillDone (start-up, or completion of the previous spill)
            spillDone.signal();
            // sleep until a spill has been requested
            while (!spillInProgress) {
              spillReady.await();
            }
            try {
              // release the lock for the duration of the sort-and-spill work
              spillLock.unlock();
                // sort the buffered records and write them to a spill file
              sortAndSpill();
            } catch (Throwable t) {
              // remember the failure in a field instead of letting it end the thread
              sortSpillException = t;
            } finally {
              // re-acquire the lock before touching the shared buffer indices
              spillLock.lock();
              if (bufend < bufstart) {
                bufvoid = kvbuffer.length;
              }
              // advance the start markers to the end markers of the range just handled
              kvstart = kvend;
              bufstart = bufend;
              spillInProgress = false;
            }
          }
        } catch (InterruptedException e) {
          // restore the interrupt status and leave the loop
          Thread.currentThread().interrupt();
        } finally {
          spillLock.unlock();
          spillThreadRunning = false;
        }
      }
    }

sortAndSpill()

/**
 * Sorts the metadata entries of the range being spilled and writes the
 * records to a new spill file, one partition after another, optionally
 * through the combiner.
 *
 * NOTE(review): the article cuts this excerpt off before the end of the
 * method; the remaining statements and closing braces are not shown.
 */
private void sortAndSpill() throws IOException, ClassNotFoundException,
                                       InterruptedException {
      //approximate the length of the output file to be the length of the
      //buffer + header lengths for the partitions
      final long size = (bufend >= bufstart
          ? bufend - bufstart
          : (bufvoid - bufend) + bufstart) +
                  partitions * APPROX_HEADER_LENGTH;
      FSDataOutputStream out = null;
      try {
        // create spill file
        final SpillRecord spillRec = new SpillRecord(partitions);
        final Path filename =
            mapOutputFile.getSpillFileForWrite(numSpills, size);
        out = rfs.create(filename);

        // entry-index range [mstart, mend) to sort; when kvstart < kvend the
        // upper bound is extended by kvmeta.capacity() before dividing
        final int mstart = kvend / NMETA;
        final int mend = 1 + // kvend is a valid record
          (kvstart >= kvend
          ? kvstart
          : kvmeta.capacity() + kvstart) / NMETA;
        sorter.sort(MapOutputBuffer.this, mstart, mend, reporter);
        int spindex = mstart;
        final IndexRecord rec = new IndexRecord();
        final InMemValBytes value = new InMemValBytes();
        // one writer per partition; spindex only moves forward, so this relies on the
        // sorted entries being grouped by partition (presumably ensured by the sort's comparison, not shown)
        for (int i = 0; i < partitions; ++i) {
          IFile.Writer<K, V> writer = null;
          try {
            long segmentStart = out.getPos();
            writer = new Writer<K, V>(job, out, keyClass, valClass, codec,
                                      spilledRecordsCounter);
              // the combiner, when one is configured, is applied in the else-branch below
            if (combinerRunner == null) {
              // spill directly
              DataInputBuffer key = new DataInputBuffer();
              while (spindex < mend &&
                  kvmeta.get(offsetFor(spindex % maxRec) + PARTITION) == i) {
                final int kvoff = offsetFor(spindex % maxRec);
                int keystart = kvmeta.get(kvoff + KEYSTART);
                int valstart = kvmeta.get(kvoff + VALSTART);
                // the key occupies the bytes from keystart up to valstart
                key.reset(kvbuffer, keystart, valstart - keystart);
                getVBytesForOffset(kvoff, value);
                writer.append(key, value);
                ++spindex;
              }
            } else {
              int spstart = spindex;
              // advance spindex past every entry that belongs to partition i
              while (spindex < mend &&
                  kvmeta.get(offsetFor(spindex % maxRec)
                            + PARTITION) == i) {
                ++spindex;
              }
              // Note: we would like to avoid the combiner if we've fewer
              // than some threshold of records for a partition
              if (spstart != spindex) {
                combineCollector.setWriter(writer);
                RawKeyValueIterator kvIter =
                  new MRResultIterator(spstart, spindex);
                combinerRunner.combine(kvIter, combineCollector);
              }
            }

相關文章