Picking up from the previous section on the Input phase, we now analyze the Output phase. The code lives in the runNewMapper() method:
private <INKEY,INVALUE,OUTKEY,OUTVALUE>
void runNewMapper(final JobConf job,final TaskSplitIndex splitIndex,
final TaskUmbilicalProtocol umbilical,TaskReporter reporter) {
.......
// This output object is wrapped into the mapper's context, so output emitted from the map() method goes through output.write()
org.apache.hadoop.mapreduce.RecordWriter output = null;
// remember this value: 0
if (job.getNumReduceTasks() == 0) { // check the number of reduce tasks
output =
new NewDirectOutputCollector(taskContext, job, umbilical, reporter);
} else { // > 0
// Create a Collector object [the constructor source shows the output has to be partitioned]
output = new NewOutputCollector(taskContext, job, umbilical, reporter);
}
// -----------new NewOutputCollector() begin ------------------
NewOutputCollector(org.apache.hadoop.mapreduce.JobContext jobContext,
JobConf job,
TaskUmbilicalProtocol umbilical,
TaskReporter reporter
) throws IOException, ClassNotFoundException {
// 1. Assignment. Skip the details for now; covered in the next part
collector = createSortingCollector(job, reporter);
// 2. There are as many partitions as there are reduce tasks
// Recall: one partition may contain several groups; records sharing the same key form one group
partitions = jobContext.getNumReduceTasks();
if (partitions > 1) {
partitioner = (org.apache.hadoop.mapreduce.Partitioner<K,V>)
// Common pattern: create the instance via reflection; a user-defined partitioner overrides the default
// The default partitioning is a simple hash-modulo, which keeps identical keys in the same partition
ReflectionUtils.newInstance(jobContext.getPartitionerClass(), job);
} else { // numReduceTasks == 1: every group goes into the single partition
partitioner = new org.apache.hadoop.mapreduce.Partitioner<K,V>() {
// returns the partition number; here it is always 0
public int getPartition(K key, V value, int numPartitions) {
return partitions - 1;
}
};
}
}
// -----------new NewOutputCollector() end ------------------
// -----------write(K key, V value) begin ------------------
// output writes out a (k, v, p) triple: key, value and partition number
public void write(K key, V value) throws IOException, InterruptedException {
collector.collect(key, value,
partitioner.getPartition(key, value, partitions));
}
// -----------write(K key, V value) end --------------------
..............
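The hash-modulo default mentioned in the comments is what org.apache.hadoop.mapreduce.lib.partition.HashPartitioner does, essentially (key.hashCode() & Integer.MAX_VALUE) % numReduceTasks; a custom partitioner only has to override getPartition(). A minimal sketch (the first-character scheme below is purely illustrative, not from the Hadoop source):
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

// Illustrative custom partitioner: route keys by their first character.
public class FirstCharPartitioner extends Partitioner<Text, IntWritable> {
  @Override
  public int getPartition(Text key, IntWritable value, int numPartitions) {
    // same shape as the default: mask off the sign bit, then modulo the partition count
    int h = key.getLength() == 0 ? 0 : key.charAt(0);
    return (h & Integer.MAX_VALUE) % numPartitions;
  }
}
// registered with: job.setPartitionerClass(FirstCharPartitioner.class);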
Step into the createSortingCollector(job, reporter) method:
private <KEY, VALUE> MapOutputCollector<KEY, VALUE>
createSortingCollector(JobConf job, TaskReporter reporter)
throws IOException, ClassNotFoundException {
// create the collector instance via reflection
MapOutputCollector<KEY, VALUE> collector
= (MapOutputCollector<KEY, VALUE>)
// Common pattern: if there is no user-defined collector, fall back to the default
ReflectionUtils.newInstance(
job.getClass(JobContext.MAP_OUTPUT_COLLECTOR_CLASS_ATTR,
// MapOutputBuffer is the real star here; more on it below.
MapOutputBuffer.class, MapOutputCollector.class), job);
MapOutputCollector.Context context =
new MapOutputCollector.Context(this, job, reporter);
// What actually gets initialized is the MapOutputBuffer; it must be initialized before use.
// Important method, analyzed in the next part
collector.init(context);
return collector;
}
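The job.getClass(attr, default, interface) plus ReflectionUtils.newInstance(...) combination is a recurring Hadoop idiom: read a class name from the configuration, fall back to a default, and instantiate it reflectively (setConf() is injected if the class is Configurable). A minimal sketch of the same idiom outside MapTask; Plugin, DefaultPlugin and the "my.plugin.class" key are made-up names for illustration only:
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.ReflectionUtils;

public class PluginFactory {
  // Made-up plugin interface and default, standing in for MapOutputCollector / MapOutputBuffer.
  public interface Plugin { void run(); }
  public static class DefaultPlugin implements Plugin {
    public void run() { System.out.println("default plugin"); }
  }

  public static Plugin create(Configuration conf) {
    // read "my.plugin.class" from the config, defaulting to DefaultPlugin,
    // and require that whatever is configured implements Plugin
    Class<? extends Plugin> clazz =
        conf.getClass("my.plugin.class", DefaultPlugin.class, Plugin.class);
    return ReflectionUtils.newInstance(clazz, conf); // injects the conf if Configurable
  }
}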
Now for the main event, the initialization step: collector.init(context). Non-core code has been removed so the source reads cleanly.
public void init(MapOutputCollector.Context context) {
// 0. Plain assignments; skim through
job = context.getJobConf();
reporter = context.getReporter();
mapTask = context.getMapTask();
mapOutputFile = mapTask.getMapOutputFile();
sortPhase = mapTask.getSortPhase();
spilledRecordsCounter = reporter.getCounter(TaskCounter.SPILLED_RECORDS);
partitions = job.getNumReduceTasks();
rfs = ((LocalFileSystem)FileSystem.getLocal(job)).getRaw();
// 1. Spill threshold: 0.8; the remaining 0.2 of the buffer stays writable during the spill
final float spillper =
job.getFloat(JobContext.MAP_SORT_SPILL_PERCENT, (float)0.8);
// 2. Default buffer size: 100 MB
final int sortmb = job.getInt(JobContext.IO_SORT_MB, 100);
indexCacheMemoryLimit = job.getInt(JobContext.INDEX_CACHE_MEMORY_LIMIT,
INDEX_CACHE_MEMORY_LIMIT_DEFAULT);
// 3. Sorter: the default quick sort is used unless a custom one is configured
// Sorting boils down to comparison (lexicographic or numeric), so the sorter relies on a [comparator], discussed below
sorter = ReflectionUtils.newInstance(job.getClass("map.sort.class",
QuickSort.class, IndexedSorter.class), job);
//-------------------- This is the famous circular (ring) buffer, a genuinely clever design ---------------
int maxMemUsage = sortmb << 20;
maxMemUsage -= maxMemUsage % METASIZE;
kvbuffer = new byte[maxMemUsage];
bufvoid = kvbuffer.length;
kvmeta = ByteBuffer.wrap(kvbuffer)
.order(ByteOrder.nativeOrder())
.asIntBuffer();
setEquator(0);
bufstart = bufend = bufindex = equator;
kvstart = kvend = kvindex;
maxRec = kvmeta.capacity() / NMETA;
softLimit = (int)(kvbuffer.length * spillper);
bufferRemaining = softLimit;
//--------------------------------------------------------------------
// k/v serialization
// 4. Get the [comparator] used for sorting; if none is customized, the default is used.
// Key types are Hadoop's serializable wrapper classes, each of which ships with its own comparator
comparator = job.getOutputKeyComparator();
.............
// output counters
.............
// compression: output data compression
............
// combiner: merges values for the same key on the map side, cutting down the data reducers have to fetch; a tuning hook
// informally a "mini reduce" that runs one or more times on the map side; its source is covered in a later article
.............
// 5. Spill thread
// Once the circular buffer reaches 80% usage, its contents are written out to disk
// At that point the buffer is shared: one thread spills to disk while the map thread keeps writing into it
// How do the reader and writer avoid colliding? By writing new data in the opposite direction (from the equator)
spillInProgress = false;
minSpillsForCombine = job.getInt(JobContext.MAP_COMBINE_MIN_SPILLS, 3);
spillThread.setDaemon(true);
spillThread.setName("SpillThread");
spillLock.lock();
try {
spillThread.start();
while (!spillThreadRunning) {
spillDone.await();
}
} catch (InterruptedException e) {
} finally {
spillLock.unlock();
}
}
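To make the sizing concrete, here is the arithmetic the block above performs with the default io.sort.mb = 100 (NMETA = 4 ints per record and METASIZE = 16 bytes are the constants defined in MapOutputBuffer); a standalone sketch:
public class BufferSizing {
  public static void main(String[] args) {
    final int NMETA = 4;                             // ints of metadata per record
    final int METASIZE = NMETA * 4;                  // 16 bytes of metadata per record
    int sortmb = 100;                                // io.sort.mb default
    int maxMemUsage = sortmb << 20;                  // 104,857,600 bytes (100 MB)
    maxMemUsage -= maxMemUsage % METASIZE;           // already a multiple of 16, unchanged
    int softLimit = (int) (maxMemUsage * 0.8f);      // ~83,886,080 bytes: the spill threshold
    int maxRec = (maxMemUsage / 4) / NMETA;          // 6,553,600 metadata slots (kvmeta is an IntBuffer)
    System.out.printf("buffer=%d softLimit=%d maxRec=%d%n", maxMemUsage, softLimit, maxRec);
  }
}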
There is no need to read the remaining source line by line; a textual summary follows.
MapOutputBuffer:
The K-V pairs emitted by map are serialized into byte arrays and a partition number is computed, yielding a <k, v, p> triple.
The buffer is the circular buffer used during the map phase:
- at its core it is a byte array;
- equator: key-value data and index entries grow outward from it in opposite directions;
- index: metadata for each K-V record, a fixed 16 bytes made of 4 ints: partition number P, key offset, value offset, value length (see the sketch after this list);
- when the buffer fills to the 80% threshold, the spill thread is started;
- that 80% of the data is quick-sorted while the map output thread keeps writing into the remaining space;
- the quick sort compares keys but moves only the index entries;
- at spill time only the sorted index is needed, so the spilled data comes out ordered;
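The 16-byte index entry above corresponds to the four int slots MapOutputBuffer reserves per record (VALSTART, KEYSTART, PARTITION, VALLEN). A rough sketch of how one entry is filled in, modeled on what collect() does but simplified (no equator or wrap-around bookkeeping):
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.IntBuffer;

public class MetaEntrySketch {
  // the same slot layout as MapOutputBuffer's constants
  static final int VALSTART = 0, KEYSTART = 1, PARTITION = 2, VALLEN = 3, NMETA = 4;

  public static void main(String[] args) {
    byte[] kvbuffer = new byte[1 << 20];                 // toy ring buffer
    IntBuffer kvmeta = ByteBuffer.wrap(kvbuffer)
        .order(ByteOrder.nativeOrder()).asIntBuffer();   // int view over the same bytes

    // pretend one record was serialized at these byte offsets
    int kvindex = kvmeta.capacity() - NMETA;             // metadata grows from the far end
    int partition = 1, keystart = 0, valstart = 12, vallen = 30;

    kvmeta.put(kvindex + PARTITION, partition);          // which reducer gets the record
    kvmeta.put(kvindex + KEYSTART, keystart);            // where the key bytes begin
    kvmeta.put(kvindex + VALSTART, valstart);            // where the value bytes begin
    kvmeta.put(kvindex + VALLEN, vallen);                // how long the value is
  }
}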
Note that the sort is a two-level sort (a small sketch follows this list):
- ordered by partition: reducers fetch data partition by partition;
- ordered by key within each partition: reduce processes records group by group;
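Conceptually the quick sort orders the metadata entries by (partition, key); a tiny sketch of that two-level comparison (the names below are illustrative, not the actual MapOutputBuffer.compare()):
import java.util.Arrays;
import java.util.Comparator;

public class TwoLevelSort {
  static class Meta {                     // stand-in for one 16-byte index entry
    final int partition; final String key;
    Meta(int partition, String key) { this.partition = partition; this.key = key; }
    public String toString() { return "(" + partition + "," + key + ")"; }
  }

  public static void main(String[] args) {
    Meta[] entries = {
        new Meta(1, "banana"), new Meta(0, "cherry"),
        new Meta(0, "apple"),  new Meta(1, "apple")
    };
    // partition first (reducers fetch per partition), then key (reduce works group by group)
    Arrays.sort(entries, Comparator.<Meta>comparingInt(m -> m.partition)
                                   .thenComparing((Meta m) -> m.key));
    System.out.println(Arrays.toString(entries)); // [(0,apple), (0,cherry), (1,apple), (1,banana)]
  }
}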
Tuning: the combiner can run during the spill
- it is effectively a reduce inside the map, aggregating per group;
- timing: after sorting, identical keys sit next to each other, the combiner runs, then the data is spilled;
minSpillsForCombine = job.getInt(JobContext.MAP_COMBINE_MIN_SPILLS, 3): by the time the map finishes its output, the buffer will have spilled several small files; the map merges them into a single output file to avoid fragmentation, and if there are at least 3 spill files the combiner runs again during that merge [the small-files problem comes up again later]. A configuration sketch follows below.
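As a rough illustration of the tuning knobs mentioned here (the Hadoop 2.x property names below are assumed; check your version's MRJobConfig), a job might be configured like this:
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.reduce.IntSumReducer;

public class SpillTuningSketch {
  public static Job configure() throws Exception {
    Configuration conf = new Configuration();
    conf.setInt("mapreduce.task.io.sort.mb", 200);           // bigger ring buffer (default 100)
    conf.setFloat("mapreduce.map.sort.spill.percent", 0.8f); // spill threshold (default 0.8)
    conf.setInt("mapreduce.map.combine.minspills", 3);       // min spills before combining on merge (default 3)
    Job job = Job.getInstance(conf, "spill-tuning-demo");
    // the "mini reduce": IntSumReducer fits a word-count style job
    job.setCombinerClass(IntSumReducer.class);
    return job;
  }
}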
Appendix: source related to the spill thread:
protected class SpillThread extends Thread {
@Override
public void run() {
spillLock.lock();
spillThreadRunning = true;
try {
while (true) {
spillDone.signal();
while (!spillInProgress) {
spillReady.await();
}
try {
spillLock.unlock();
// sorting and spilling happen in this call
sortAndSpill();
} catch (Throwable t) {
sortSpillException = t;
} finally {
spillLock.lock();
if (bufend < bufstart) {
bufvoid = kvbuffer.length;
}
kvstart = kvend;
bufstart = bufend;
spillInProgress = false;
}
}
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
} finally {
spillLock.unlock();
spillThreadRunning = false;
}
}
}
The sortAndSpill() method:
private void sortAndSpill() throws IOException, ClassNotFoundException,
InterruptedException {
//approximate the length of the output file to be the length of the
//buffer + header lengths for the partitions
final long size = (bufend >= bufstart
? bufend - bufstart
: (bufvoid - bufend) + bufstart) +
partitions * APPROX_HEADER_LENGTH;
FSDataOutputStream out = null;
try {
// create spill file
final SpillRecord spillRec = new SpillRecord(partitions);
final Path filename =
mapOutputFile.getSpillFileForWrite(numSpills, size);
out = rfs.create(filename);
final int mstart = kvend / NMETA;
final int mend = 1 + // kvend is a valid record
(kvstart >= kvend
? kvstart
: kvmeta.capacity() + kvstart) / NMETA;
sorter.sort(MapOutputBuffer.this, mstart, mend, reporter);
int spindex = mstart;
final IndexRecord rec = new IndexRecord();
final InMemValBytes value = new InMemValBytes();
for (int i = 0; i < partitions; ++i) {
IFile.Writer<K, V> writer = null;
try {
long segmentStart = out.getPos();
writer = new Writer<K, V>(job, out, keyClass, valClass, codec,
spilledRecordsCounter);
// the combiner is invoked in the else branch below when one is configured
if (combinerRunner == null) {
// spill directly
DataInputBuffer key = new DataInputBuffer();
while (spindex < mend &&
kvmeta.get(offsetFor(spindex % maxRec) + PARTITION) == i) {
final int kvoff = offsetFor(spindex % maxRec);
int keystart = kvmeta.get(kvoff + KEYSTART);
int valstart = kvmeta.get(kvoff + VALSTART);
key.reset(kvbuffer, keystart, valstart - keystart);
getVBytesForOffset(kvoff, value);
writer.append(key, value);
++spindex;
}
} else {
int spstart = spindex;
while (spindex < mend &&
kvmeta.get(offsetFor(spindex % maxRec)
+ PARTITION) == i) {
++spindex;
}
// Note: we would like to avoid the combiner if we've fewer
// than some threshold of records for a partition
if (spstart != spindex) {
combineCollector.setWriter(writer);
RawKeyValueIterator kvIter =
new MRResultIterator(spstart, spindex);
combinerRunner.combine(kvIter, combineCollector);
}
}