MapReduce with Large Numbers of Small Files
We know that the number of map tasks is generally determined by the number of input splits. When the input consists of a large number of small files, this can produce a large number of map tasks, each of which has very little work to do.
So we want one map task to process several small files. I first planned to write my own InputFormat, RecordReader and FileSplit, but then learned that Hadoop already provides CombineFileInputFormat together with CombineFileSplit and CombineFileRecordReader. (The split logic of the provided CombineFileInputFormat can lead to data skew, which is discussed later.)
The rough idea behind CombineFileInputFormat is that it packs the metadata of multiple input files (the small files) into a single CombineFileSplit. Because each small file occupies a single HDFS block (one file, one block), a CombineFileSplit describes a group of file blocks, including each file's start offset, length and block locations.
(Most of the code below is taken from other people's blog posts; I am only recording it here for my own reference after digesting it. Thanks to the original authors.)
1. First, the source of CombineFileSplit (just five simple fields):
```java
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.InputSplit;

public class CombineFileSplit extends InputSplit implements Writable {

  private Path[] paths;
  private long[] startoffset;
  private long[] lengths;
  private String[] locations;
  private long totLength;

  /** default constructor */
  public CombineFileSplit() {}

  public CombineFileSplit(Path[] files, long[] start,
                          long[] lengths, String[] locations) {
    initSplit(files, start, lengths, locations);
  }

  public CombineFileSplit(Path[] files, long[] lengths) {
    long[] startoffset = new long[files.length];
    for (int i = 0; i < startoffset.length; i++) {
      startoffset[i] = 0;
    }
    String[] locations = new String[files.length];
    for (int i = 0; i < locations.length; i++) {
      locations[i] = "";
    }
    initSplit(files, startoffset, lengths, locations);
  }

  private void initSplit(Path[] files, long[] start,
                         long[] lengths, String[] locations) {
    this.startoffset = start;
    this.lengths = lengths;
    this.paths = files;
    this.totLength = 0;
    this.locations = locations;
    for (long length : lengths) {
      totLength += length;
    }
  }

  /** Copy constructor */
  public CombineFileSplit(CombineFileSplit old) throws IOException {
    this(old.getPaths(), old.getStartOffsets(),
         old.getLengths(), old.getLocations());
  }

  public long getLength() {
    return totLength;
  }

  /** Returns an array containing the start offsets of the files in the split */
  public long[] getStartOffsets() {
    return startoffset;
  }

  /** Returns an array containing the lengths of the files in the split */
  public long[] getLengths() {
    return lengths;
  }

  /** Returns the start offset of the i<sup>th</sup> Path */
  public long getOffset(int i) {
    return startoffset[i];
  }

  /** Returns the length of the i<sup>th</sup> Path */
  public long getLength(int i) {
    return lengths[i];
  }

  /** Returns the number of Paths in the split */
  public int getNumPaths() {
    return paths.length;
  }

  /** Returns the i<sup>th</sup> Path */
  public Path getPath(int i) {
    return paths[i];
  }

  /** Returns all the Paths in the split */
  public Path[] getPaths() {
    return paths;
  }

  /** Returns all the Paths where this input-split resides */
  public String[] getLocations() throws IOException {
    return locations;
  }

  public void readFields(DataInput in) throws IOException {
    totLength = in.readLong();
    int arrLength = in.readInt();
    lengths = new long[arrLength];
    for (int i = 0; i < arrLength; i++) {
      lengths[i] = in.readLong();
    }
    int filesLength = in.readInt();
    paths = new Path[filesLength];
    for (int i = 0; i < filesLength; i++) {
      paths[i] = new Path(Text.readString(in));
    }
    arrLength = in.readInt();
    startoffset = new long[arrLength];
    for (int i = 0; i < arrLength; i++) {
      startoffset[i] = in.readLong();
    }
  }

  public void write(DataOutput out) throws IOException {
    out.writeLong(totLength);
    out.writeInt(lengths.length);
    for (long length : lengths) {
      out.writeLong(length);
    }
    out.writeInt(paths.length);
    for (Path p : paths) {
      Text.writeString(out, p.toString());
    }
    out.writeInt(startoffset.length);
    for (long length : startoffset) {
      out.writeLong(length);
    }
  }

  @Override
  public String toString() {
    StringBuffer sb = new StringBuffer();
    for (int i = 0; i < paths.length; i++) {
      if (i == 0) {
        sb.append("Paths:");
      }
      sb.append(paths[i].toUri().getPath() + ":" + startoffset[i] +
                "+" + lengths[i]);
      if (i < paths.length - 1) {
        sb.append(",");
      }
    }
    if (locations != null) {
      String locs = "";
      StringBuffer locsb = new StringBuffer();
      for (int i = 0; i < locations.length; i++) {
        locsb.append(locations[i] + ":");
      }
      locs = locsb.toString();
      sb.append(" Locations:" + locs + "; ");
    }
    return sb.toString();
  }
}
```
2. Next we implement a RecordReader, which is really just a wrapper around LineRecordReader:
```java
import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.CombineFileSplit;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.input.LineRecordReader;

public class CombineSmallfileRecordReader extends RecordReader<LongWritable, Text> {

  private CombineFileSplit combineFileSplit;
  private LineRecordReader lineRecordReader = new LineRecordReader();
  private Path[] paths;
  private int totalLength;
  private int currentIndex;
  private float currentProgress = 0;
  private LongWritable currentKey;
  private Text currentValue = new Text();

  public CombineSmallfileRecordReader(CombineFileSplit combineFileSplit, TaskAttemptContext context, Integer index) throws IOException {
    super();
    this.combineFileSplit = combineFileSplit;
    this.currentIndex = index; // index of the small-file block this reader handles within the CombineFileSplit
  }

  @Override
  public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
    this.combineFileSplit = (CombineFileSplit) split;
    // To read one small-file block of the CombineFileSplit with LineRecordReader,
    // we first have to wrap it in a FileSplit.
    FileSplit fileSplit = new FileSplit(combineFileSplit.getPath(currentIndex), combineFileSplit.getOffset(currentIndex), combineFileSplit.getLength(currentIndex), combineFileSplit.getLocations());
    lineRecordReader.initialize(fileSplit, context);

    this.paths = combineFileSplit.getPaths();
    totalLength = paths.length;
    context.getConfiguration().set("map.input.file.name", combineFileSplit.getPath(currentIndex).getName());
  }

  @Override
  public LongWritable getCurrentKey() throws IOException, InterruptedException {
    currentKey = lineRecordReader.getCurrentKey();
    return currentKey;
  }

  @Override
  public Text getCurrentValue() throws IOException, InterruptedException {
    currentValue = lineRecordReader.getCurrentValue();
    return currentValue;
  }

  @Override
  public boolean nextKeyValue() throws IOException, InterruptedException {
    if (currentIndex >= 0 && currentIndex < totalLength) {
      return lineRecordReader.nextKeyValue();
    } else {
      return false;
    }
  }

  @Override
  public float getProgress() throws IOException {
    if (currentIndex >= 0 && currentIndex < totalLength) {
      currentProgress = (float) currentIndex / totalLength;
      return currentProgress;
    }
    return currentProgress;
  }

  @Override
  public void close() throws IOException {
    lineRecordReader.close();
  }
}
```
3. Then implement a subclass of CombineFileInputFormat:
```java
import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.CombineFileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.CombineFileRecordReader;
import org.apache.hadoop.mapreduce.lib.input.CombineFileSplit;

public class CombineSmallfileInputFormat extends CombineFileInputFormat<LongWritable, Text> {

  @Override
  public RecordReader<LongWritable, Text> createRecordReader(InputSplit split, TaskAttemptContext context) throws IOException {

    CombineFileSplit combineFileSplit = (CombineFileSplit) split;
    // CombineFileRecordReader creates one CombineSmallfileRecordReader per file in the split.
    CombineFileRecordReader<LongWritable, Text> recordReader =
        new CombineFileRecordReader<LongWritable, Text>(combineFileSplit, context, CombineSmallfileRecordReader.class);
    try {
      recordReader.initialize(combineFileSplit, context);
    } catch (InterruptedException e) {
      throw new RuntimeException("Error initializing CombineSmallfileRecordReader.", e);
    }
    return recordReader;
  }
}
```
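To tie the three classes together, a job driver only needs to select CombineSmallfileInputFormat and, typically, cap the split size (otherwise, as discussed in the next step, all input may end up in one split). The following is a minimal sketch, not from the original post; the driver class name, the identity Mapper, and the 32 MB limit are illustrative assumptions:

```java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class CombineSmallfileDriver {  // hypothetical driver class
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf, "combine-small-files");
    job.setJarByClass(CombineSmallfileDriver.class);

    // Let many small files share one map task.
    job.setInputFormatClass(CombineSmallfileInputFormat.class);
    // Cap each combined split at 32 MB (assumed value; see the split-logic discussion below).
    job.getConfiguration().setLong("mapreduce.input.fileinputformat.split.maxsize", 33554432L);

    job.setMapperClass(Mapper.class);  // identity mapper as a placeholder
    job.setNumReduceTasks(0);
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(Text.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}
```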
4. At this point the small-file input problem is solved, but let's look at how CombineFileInputFormat actually computes its splits.
CombineFileInputFormat controls splitting with three parameters: maxSplitSize, minSizeNode and minSizeRack:
1. If maxSplitSize ("mapreduce.input.fileinputformat.split.maxsize") is specified, blocks on the same node are combined, and a new split is generated every time the accumulated size exceeds maxSplitSize. If it is not specified, the node's blocks are only gathered and no split is produced at this stage.
2. If minSizeNode ("mapreduce.input.fileinputformat.split.minsize.per.node") is specified, the blocks left over from step 1 are combined: if their total size exceeds minSizeNode they all become one split; otherwise they are merged with the blocks of other nodes in the same rack.
3. Every node is processed in the same way; then all blocks pooled at the rack level are processed as in step 1. For what remains, if minSizeRack ("mapreduce.input.fileinputformat.split.minsize.per.rack") is specified and the remainder exceeds minSizeRack, it all becomes one split; otherwise these blocks are held back to be combined with the leftovers from all racks.
Every rack is processed with steps 1-3; everything still left over is then pooled and processed once more with the logic of step 1, and whatever remains after that becomes a single split. (A short configuration sketch for these three properties follows this list.)
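For reference, these three knobs are ordinary configuration properties and can be set before submitting the job. The following is a small sketch; the helper class name and the 128 MB / 64 MB values are illustrative assumptions, not recommendations:

```java
import org.apache.hadoop.conf.Configuration;

public class CombineSplitSizeConfig {  // hypothetical helper
  /** Sets the three split-size properties described above (illustrative values). */
  public static void configure(Configuration conf) {
    conf.setLong("mapreduce.input.fileinputformat.split.maxsize", 128L * 1024 * 1024);          // maxSplitSize
    conf.setLong("mapreduce.input.fileinputformat.split.minsize.per.node", 64L * 1024 * 1024);  // minSizeNode
    conf.setLong("mapreduce.input.fileinputformat.split.minsize.per.rack", 64L * 1024 * 1024);  // minSizeRack
  }
}
```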
From this logic we can see:
If only maxSplitSize is set (e.g. job.getConfiguration().set("mapreduce.input.fileinputformat.split.maxsize", "33554432")), then essentially every split is filled up to maxSplitSize.
If none of maxSplitSize, minSizeNode and minSizeRack is set, all of the input is merged into a single split!
This split logic is also related to the data-skew issue mentioned at the beginning: the size of each combined split depends on how the files happen to be laid out across nodes and racks, so splits can end up very uneven. A simple workaround is to override getSplits() and distribute the input files over at most a configurable number of splits of roughly equal total size, as the following MultiFileInputFormat does:
```java
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.lib.input.CombineFileSplit;

// The original post only shows the method bodies; the class declaration below is an
// assumption: extending CombineSmallfileInputFormat (step 3) lets it inherit createRecordReader.
public class MultiFileInputFormat extends CombineSmallfileInputFormat {

  private static final Log LOG = LogFactory.getLog(MultiFileInputFormat.class);
  public static final String CONFNAME_INPUT_SPLIT_MAX_NUM = "multifileinputformat.max_split_num";
  public static final Integer DEFAULT_MAX_SPLIT_NUM = 50;

  public static void setMaxInputSplitNum(Job job, Integer maxSplitNum) {
    job.getConfiguration().setInt(CONFNAME_INPUT_SPLIT_MAX_NUM, maxSplitNum);
  }

  @Override
  public List<InputSplit> getSplits(JobContext job) throws IOException {
    // get all the files in the input path
    List<FileStatus> stats = listStatus(job);
    List<InputSplit> splits = new ArrayList<InputSplit>();
    if (stats.size() == 0) {
      return splits;
    }
    // compute the average split length
    long totalLen = 0;
    for (FileStatus stat : stats) {
      totalLen += stat.getLen();
    }
    int maxSplitNum = job.getConfiguration().getInt(CONFNAME_INPUT_SPLIT_MAX_NUM, DEFAULT_MAX_SPLIT_NUM);
    int expectSplitNum = maxSplitNum < stats.size() ? maxSplitNum : stats.size();
    long averageLen = totalLen / expectSplitNum;
    LOG.info("Prepare InputSplit : averageLen(" + averageLen + ") totalLen(" + totalLen
        + ") expectSplitNum(" + expectSplitNum + ") ");
    // build the input splits
    List<Path> pathLst = new ArrayList<Path>();
    List<Long> offsetLst = new ArrayList<Long>();
    List<Long> lengthLst = new ArrayList<Long>();
    long currentLen = 0;
    for (int i = 0; i < stats.size(); i++) {
      FileStatus stat = stats.get(i);
      pathLst.add(stat.getPath());
      offsetLst.add(0L);
      lengthLst.add(stat.getLen());
      currentLen += stat.getLen();
      // close the current split once it exceeds the average length
      if (splits.size() < expectSplitNum - 1 && currentLen > averageLen) {
        Path[] pathArray = new Path[pathLst.size()];
        CombineFileSplit thissplit = new CombineFileSplit(pathLst.toArray(pathArray),
            getLongArray(offsetLst), getLongArray(lengthLst), new String[0]);
        LOG.info("combineFileSplit(" + splits.size() + ") fileNum(" + pathLst.size()
            + ") length(" + currentLen + ")");
        splits.add(thissplit);

        pathLst.clear();
        offsetLst.clear();
        lengthLst.clear();
        currentLen = 0;
      }
    }
    // put whatever is left into a final split
    if (pathLst.size() > 0) {
      Path[] pathArray = new Path[pathLst.size()];
      CombineFileSplit thissplit =
          new CombineFileSplit(pathLst.toArray(pathArray), getLongArray(offsetLst),
              getLongArray(lengthLst), new String[0]);
      LOG.info("combineFileSplit(" + splits.size() + ") fileNum(" + pathLst.size()
          + ") length(" + currentLen + ")");
      splits.add(thissplit);
    }
    return splits;
  }

  private long[] getLongArray(List<Long> lst) {
    long[] rst = new long[lst.size()];
    for (int i = 0; i < lst.size(); i++) {
      rst[i] = lst.get(i);
    }
    return rst;
  }
}
```
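Using this class (under the assumption above that it extends CombineSmallfileInputFormat, so createRecordReader is inherited) only requires swapping the input format in the driver and, optionally, bounding the number of splits. A minimal sketch, with an assumed helper class name and an arbitrary limit of 100:

```java
import org.apache.hadoop.mapreduce.Job;

public class MultiFileInputFormatUsage {  // hypothetical snippet holder
  /** Switches a job to the skew-aware input format and caps the number of splits. */
  public static void apply(Job job) {
    job.setInputFormatClass(MultiFileInputFormat.class);
    MultiFileInputFormat.setMaxInputSplitNum(job, 100);  // at most ~100 map tasks, files spread evenly by size
  }
}
```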
Source: ITPUB blog, http://blog.itpub.net/29754888/viewspace-1225105/ (please credit the source when reposting).