MapReduce and Large Numbers of Small Files
We know that the number of Map tasks is generally determined by the number of input splits, so when the input consists of a large number of small files, a job can end up with a large number of Map tasks, each doing very little work.
We therefore want a single Map task to handle several small files. My first thought was to write my own InputFormat, RecordReader, and FileSplit, but it turns out Hadoop already provides CombineFileInputFormat and CombineFileSplit for exactly this purpose. (The split logic that ships with CombineFileInputFormat can lead to data skew; more on that later.)
The rough idea of CombineFileInputFormat is that it packs the metadata of multiple input files (the small files) into a single CombineFileSplit. Since each small file occupies a single HDFS block (one file, one block), a CombineFileSplit describes a group of file blocks, holding each file's start offset, length, and block locations.
(Most of the code below is adapted from other people's blog posts; I am only recording it here as a personal reference after working through it. My thanks to the original authors.)
1. First, a look at the source of CombineFileSplit (just five simple fields):
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.InputSplit;

public class CombineFileSplit extends InputSplit implements Writable {

  private Path[] paths;
  private long[] startoffset;
  private long[] lengths;
  private String[] locations;
  private long totLength;

  /**
   * default constructor
   */
  public CombineFileSplit() {}

  public CombineFileSplit(Path[] files, long[] start,
                          long[] lengths, String[] locations) {
    initSplit(files, start, lengths, locations);
  }

  public CombineFileSplit(Path[] files, long[] lengths) {
    long[] startoffset = new long[files.length];
    for (int i = 0; i < startoffset.length; i++) {
      startoffset[i] = 0;
    }
    String[] locations = new String[files.length];
    for (int i = 0; i < locations.length; i++) {
      locations[i] = "";
    }
    initSplit(files, startoffset, lengths, locations);
  }

  private void initSplit(Path[] files, long[] start,
                         long[] lengths, String[] locations) {
    this.startoffset = start;
    this.lengths = lengths;
    this.paths = files;
    this.totLength = 0;
    this.locations = locations;
    for (long length : lengths) {
      totLength += length;
    }
  }

  /**
   * Copy constructor
   */
  public CombineFileSplit(CombineFileSplit old) throws IOException {
    this(old.getPaths(), old.getStartOffsets(),
         old.getLengths(), old.getLocations());
  }

  public long getLength() {
    return totLength;
  }

  /** Returns an array containing the start offsets of the files in the split */
  public long[] getStartOffsets() {
    return startoffset;
  }

  /** Returns an array containing the lengths of the files in the split */
  public long[] getLengths() {
    return lengths;
  }

  /** Returns the start offset of the i<sup>th</sup> Path */
  public long getOffset(int i) {
    return startoffset[i];
  }

  /** Returns the length of the i<sup>th</sup> Path */
  public long getLength(int i) {
    return lengths[i];
  }

  /** Returns the number of Paths in the split */
  public int getNumPaths() {
    return paths.length;
  }

  /** Returns the i<sup>th</sup> Path */
  public Path getPath(int i) {
    return paths[i];
  }

  /** Returns all the Paths in the split */
  public Path[] getPaths() {
    return paths;
  }

  /** Returns all the Paths where this input-split resides */
  public String[] getLocations() throws IOException {
    return locations;
  }

  public void readFields(DataInput in) throws IOException {
    totLength = in.readLong();
    int arrLength = in.readInt();
    lengths = new long[arrLength];
    for (int i = 0; i < arrLength; i++) {
      lengths[i] = in.readLong();
    }
    int filesLength = in.readInt();
    paths = new Path[filesLength];
    for (int i = 0; i < filesLength; i++) {
      paths[i] = new Path(Text.readString(in));
    }
    arrLength = in.readInt();
    startoffset = new long[arrLength];
    for (int i = 0; i < arrLength; i++) {
      startoffset[i] = in.readLong();
    }
  }

  public void write(DataOutput out) throws IOException {
    out.writeLong(totLength);
    out.writeInt(lengths.length);
    for (long length : lengths) {
      out.writeLong(length);
    }
    out.writeInt(paths.length);
    for (Path p : paths) {
      Text.writeString(out, p.toString());
    }
    out.writeInt(startoffset.length);
    for (long length : startoffset) {
      out.writeLong(length);
    }
  }

  @Override
  public String toString() {
    StringBuffer sb = new StringBuffer();
    for (int i = 0; i < paths.length; i++) {
      if (i == 0) {
        sb.append("Paths:");
      }
      sb.append(paths[i].toUri().getPath() + ":" + startoffset[i] +
                "+" + lengths[i]);
      if (i < paths.length - 1) {
        sb.append(",");
      }
    }
    if (locations != null) {
      String locs = "";
      StringBuffer locsb = new StringBuffer();
      for (int i = 0; i < locations.length; i++) {
        locsb.append(locations[i] + ":");
      }
      locs = locsb.toString();
      sb.append(" Locations:" + locs + "; ");
    }
    return sb.toString();
  }
}
2. Next we implement a RecordReader, which is essentially a wrapper around LineRecordReader:
import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.CombineFileSplit;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.input.LineRecordReader;

public class CombineSmallfileRecordReader extends RecordReader<LongWritable, Text> {

  private CombineFileSplit combineFileSplit;
  private LineRecordReader lineRecordReader = new LineRecordReader();
  private Path[] paths;
  private int totalLength;
  private int currentIndex;
  private float currentProgress = 0;
  private LongWritable currentKey;
  private Text currentValue = new Text();

  public CombineSmallfileRecordReader(CombineFileSplit combineFileSplit, TaskAttemptContext context, Integer index) throws IOException {
    super();
    this.combineFileSplit = combineFileSplit;
    this.currentIndex = index; // index of the small-file block this reader handles within the CombineFileSplit
  }

  @Override
  public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
    this.combineFileSplit = (CombineFileSplit) split;
    // To read one small-file block of the CombineFileSplit with LineRecordReader,
    // we first have to wrap that block in a FileSplit.
    FileSplit fileSplit = new FileSplit(combineFileSplit.getPath(currentIndex), combineFileSplit.getOffset(currentIndex), combineFileSplit.getLength(currentIndex), combineFileSplit.getLocations());
    lineRecordReader.initialize(fileSplit, context);

    this.paths = combineFileSplit.getPaths();
    totalLength = paths.length;
    context.getConfiguration().set("map.input.file.name", combineFileSplit.getPath(currentIndex).getName());
  }

  @Override
  public LongWritable getCurrentKey() throws IOException, InterruptedException {
    currentKey = lineRecordReader.getCurrentKey();
    return currentKey;
  }

  @Override
  public Text getCurrentValue() throws IOException, InterruptedException {
    currentValue = lineRecordReader.getCurrentValue();
    return currentValue;
  }

  @Override
  public boolean nextKeyValue() throws IOException, InterruptedException {
    if (currentIndex >= 0 && currentIndex < totalLength) {
      return lineRecordReader.nextKeyValue();
    } else {
      return false;
    }
  }

  @Override
  public float getProgress() throws IOException {
    if (currentIndex >= 0 && currentIndex < totalLength) {
      currentProgress = (float) currentIndex / totalLength;
      return currentProgress;
    }
    return currentProgress;
  }

  @Override
  public void close() throws IOException {
    lineRecordReader.close();
  }
}
3. Then implement a subclass of CombineFileInputFormat:
import java.io.IOException;

import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.CombineFileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.CombineFileRecordReader;
import org.apache.hadoop.mapreduce.lib.input.CombineFileSplit;

public class CombineSmallfileInputFormat extends CombineFileInputFormat<LongWritable, Text> {

  @Override
  public RecordReader<LongWritable, Text> createRecordReader(InputSplit split, TaskAttemptContext context) throws IOException {

    CombineFileSplit combineFileSplit = (CombineFileSplit) split;
    // CombineFileRecordReader creates one CombineSmallfileRecordReader per file in the split
    CombineFileRecordReader<LongWritable, Text> recordReader = new CombineFileRecordReader<LongWritable, Text>(combineFileSplit, context, CombineSmallfileRecordReader.class);
    try {
      recordReader.initialize(combineFileSplit, context);
    } catch (InterruptedException e) {
      throw new RuntimeException("Error initializing CombineSmallfileRecordReader.", e);
    }
    return recordReader;
  }
}
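With the input format and record reader in place, wiring them into a job only requires setting the input format class and a maxSplitSize. Below is a minimal, map-only driver sketch; the class name, paths, the 32 MB cap, and the Hadoop 2.x-style Job.getInstance are my own illustrative assumptions, not part of the original post:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class CombineSmallfileDriver {

  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    // cap each combined split at 32 MB (same property discussed in section 4 below)
    conf.setLong("mapreduce.input.fileinputformat.split.maxsize", 32 * 1024 * 1024);

    Job job = Job.getInstance(conf, "combine-smallfile-demo");
    job.setJarByClass(CombineSmallfileDriver.class);

    // read many small files through the combining input format instead of TextInputFormat
    job.setInputFormatClass(CombineSmallfileInputFormat.class);

    // map-only pass-through using the default identity Mapper, just to show the combining effect
    job.setNumReduceTasks(0);
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(Text.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));   // directory full of small files
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}

With this setup the number of map tasks should track the number of combined splits rather than the number of input files.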
4. At this point the small-file input problem itself is solved, but let's look at how CombineFileInputFormat actually builds its splits.
CombineFileInputFormat controls splitting with three parameters: maxSplitSize, minSizeNode, and minSizeRack.
1. If maxSplitSize ("mapreduce.input.fileinputformat.split.maxsize") is set, blocks on the same node are combined, and whenever the accumulated size exceeds maxSplitSize a new split is started. If it is not set, the blocks on each node are only collected for now and no split is generated yet.
2. If minSizeNode ("mapreduce.input.fileinputformat.split.minsize.per.node") is set, the blocks left over from step 1 on a node are merged; if their total exceeds minSizeNode they all become one split, otherwise they are handed on to be merged with blocks from the same rack.
3. Every node is processed the same way, and then all remaining blocks of a rack are processed as in step 1. For what is still left over, if minSizeRack ("mapreduce.input.fileinputformat.split.minsize.per.rack") is set and the total exceeds minSizeRack, it all becomes one split; otherwise these blocks are kept back and pooled with the leftovers from all racks.
After every rack has gone through steps 1-3, the leftovers from all racks are pooled and processed as in step 1 once more; whatever remains after that becomes one final split.
From this logic we can conclude:
If only maxSplitSize is set (e.g. job.getConfiguration().set("mapreduce.input.fileinputformat.split.maxsize", "33554432")), essentially every split is filled up to maxSplitSize.
If none of maxSplitSize, minSizeNode, and minSizeRack are set, the entire input ends up as a single split!
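For reference, all three thresholds can be set on the job configuration in one place. A minimal sketch; the 32 MB / 16 MB / 32 MB values are illustrative choices of mine, not from the original post:

// illustrative values: 32 MB max per split, 16 MB per-node minimum, 32 MB per-rack minimum
job.getConfiguration().setLong("mapreduce.input.fileinputformat.split.maxsize", 32 * 1024 * 1024);
job.getConfiguration().setLong("mapreduce.input.fileinputformat.split.minsize.per.node", 16 * 1024 * 1024);
job.getConfiguration().setLong("mapreduce.input.fileinputformat.split.minsize.per.rack", 32 * 1024 * 1024);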
These rules pack blocks greedily per node and per rack, so the resulting splits can be quite uneven in size depending on how the small files are distributed across the cluster; this is the data-skew issue mentioned at the beginning. One way around it is to override getSplits and spread the files evenly over a capped number of splits. The code below is the core of such a MultiFileInputFormat (only the fields and the getSplits logic are shown; they belong inside a CombineFileInputFormat subclass):

private static final Log LOG = LogFactory.getLog(MultiFileInputFormat.class);
public static final String CONFNAME_INPUT_SPLIT_MAX_NUM = "multifileinputformat.max_split_num";
public static final Integer DEFAULT_MAX_SPLIT_NUM = 50;

public static void setMaxInputSplitNum(Job job, Integer maxSplitNum) {
  job.getConfiguration().setInt(CONFNAME_INPUT_SPLIT_MAX_NUM, maxSplitNum);
}

@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
  // get all the files in the input path
  List<FileStatus> stats = listStatus(job);
  List<InputSplit> splits = new ArrayList<InputSplit>();
  if (stats.size() == 0) {
    return splits;
  }
  // compute the average split length
  long totalLen = 0;
  for (FileStatus stat : stats) {
    totalLen += stat.getLen();
  }
  int maxSplitNum = job.getConfiguration().getInt(CONFNAME_INPUT_SPLIT_MAX_NUM, DEFAULT_MAX_SPLIT_NUM);
  int expectSplitNum = maxSplitNum < stats.size() ? maxSplitNum : stats.size();
  long averageLen = totalLen / expectSplitNum;
  LOG.info("Prepare InputSplit : averageLen(" + averageLen + ") totalLen(" + totalLen
      + ") expectSplitNum(" + expectSplitNum + ") ");
  // build the input splits
  List<Path> pathLst = new ArrayList<Path>();
  List<Long> offsetLst = new ArrayList<Long>();
  List<Long> lengthLst = new ArrayList<Long>();
  long currentLen = 0;
  for (int i = 0; i < stats.size(); i++) {
    FileStatus stat = stats.get(i);
    pathLst.add(stat.getPath());
    offsetLst.add(0L);
    lengthLst.add(stat.getLen());
    currentLen += stat.getLen();
    if (splits.size() < expectSplitNum - 1 && currentLen > averageLen) {
      Path[] pathArray = new Path[pathLst.size()];
      CombineFileSplit thissplit = new CombineFileSplit(pathLst.toArray(pathArray),
          getLongArray(offsetLst), getLongArray(lengthLst), new String[0]);
      LOG.info("combineFileSplit(" + splits.size() + ") fileNum(" + pathLst.size()
          + ") length(" + currentLen + ")");
      splits.add(thissplit);

      pathLst.clear();
      offsetLst.clear();
      lengthLst.clear();
      currentLen = 0;
    }
  }
  if (pathLst.size() > 0) {
    Path[] pathArray = new Path[pathLst.size()];
    CombineFileSplit thissplit =
        new CombineFileSplit(pathLst.toArray(pathArray), getLongArray(offsetLst),
            getLongArray(lengthLst), new String[0]);
    LOG.info("combineFileSplit(" + splits.size() + ") fileNum(" + pathLst.size()
        + ") length(" + currentLen + ")");
    splits.add(thissplit);
  }
  return splits;
}

private long[] getLongArray(List<Long> lst) {
  long[] rst = new long[lst.size()];
  for (int i = 0; i < lst.size(); i++) {
    rst[i] = lst.get(i);
  }
  return rst;
}
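Usage mirrors the earlier input format, except that the cap on the number of splits is set through the helper defined above. A short sketch; the value 100 is purely illustrative, and it assumes the class has been completed with a createRecordReader (for example the CombineFileRecordReader/CombineSmallfileRecordReader combination from step 3):

// assumes MultiFileInputFormat also implements createRecordReader as in step 3
job.setInputFormatClass(MultiFileInputFormat.class);
MultiFileInputFormat.setMaxInputSplitNum(job, 100); // at most ~100 combined splits / map tasks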
Source: ITPUB blog, http://blog.itpub.net/29754888/viewspace-1225105/. Please credit the source when reposting; otherwise legal liability may be pursued.