1. A job = map + reduce.
2. The output of map is the input of reduce.
3. All inputs and outputs are <Key, Value> pairs; there are 4 pairs in total.
4. K2 = K3; V3 is a collection, and each element of that collection is a V2.
5. All data types must be Hadoop's own data types (see the sketch after this list):
int    ---> IntWritable
long   ---> LongWritable
String ---> Text
null   ---> NullWritable
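A minimal runnable sketch of these wrappers (the class name WritableDemo and the sample values are illustrative, not from the original post):
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;

public class WritableDemo {
    public static void main(String[] args) {
        IntWritable one = new IntWritable(1);        // int    ---> IntWritable
        LongWritable offset = new LongWritable(0L);  // long   ---> LongWritable
        Text word = new Text("hello");               // String ---> Text
        NullWritable none = NullWritable.get();      // null   ---> NullWritable (a singleton, hence get())
        System.out.println(word + "\t" + one.get() + "\t" + offset.get() + "\t" + none);
    }
}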
Submitter driver class:
package com.etc;
import org.apache.commons.io.FileUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.File;
import java.io.IOException;

public class JobSubmitter {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Job job = Job.getInstance(new Configuration());
        job.setJarByClass(JobSubmitter.class);
        job.setMapperClass(WordCountMapper.class);
        job.setReducerClass(WordCountReduce.class);
        // map output types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        // reduce output types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // the output directory must not already exist, so delete it first
        File file = new File("F:\\wordcountfengze");
        if (file.exists()) {
            FileUtils.deleteDirectory(file);
        }
        FileInputFormat.setInputPaths(job, new Path("F:\\wordcountwangcc"));
        FileOutputFormat.setOutputPath(job, new Path("F:\\wordcountfengze"));
        job.setNumReduceTasks(1);
        boolean success = job.waitForCompletion(true);
        System.out.println(success);
    }
}
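The java.io.File deletion above only works while the job runs against the local file system. A hedged alternative sketch, going through Hadoop's FileSystem API so the same cleanup also works for HDFS paths (the class and method names here are illustrative, not from the original post):
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class OutputCleaner {
    // delete the output directory if it exists, via the file system the path belongs to
    public static void deleteIfExists(Configuration conf, String dir) throws IOException {
        Path output = new Path(dir);
        FileSystem fs = output.getFileSystem(conf);
        if (fs.exists(output)) {
            fs.delete(output, true); // true = delete recursively
        }
    }
}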
Mapper class 1:
package com.etc;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

/**
 * KEYIN:    the type of the key the map task reads, the byte offset of a line (Long).
 * VALUEIN:  the type of the value the map task reads, the content of a line (String).
 * KEYOUT:   the key type of the kv pairs the user-defined map method returns; in the
 *           wordcount logic we return the word (String).
 * VALUEOUT: the value type of the kv pairs the user-defined map method returns; in the
 *           wordcount logic we return an integer (Integer).
 * However, in MapReduce the data produced by map must be transferred to reduce, which requires
 * serialization and deserialization, and the JDK's native serialization produces rather redundant
 * data, making transfers during a MapReduce run inefficient.
 * Hadoop therefore ships its own serialization mechanism, so every type transferred in MapReduce
 * must implement Hadoop's serialization interface.
 * For the common JDK types Long, String, Integer, Float, etc., Hadoop provides wrappers that
 * implement its serialization interface: LongWritable, Text, IntWritable, FloatWritable.
 */
public class WordcountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // split the line into words
        String line = value.toString();
        String[] words = line.split("\t"); // split on tab
        List<String> listStri = new ArrayList<String>(Arrays.asList(words)); // String array to ArrayList
        // walk the list and drop blank tokens; decrement i after a removal
        // so the element shifted into slot i is not skipped
        for (int i = 0; i < listStri.size(); i++) {
            if (listStri.get(i).trim().isEmpty()) {
                listStri.remove(i);
                i--;
            }
        }
        for (String word : listStri) {
            context.write(new Text(word), new IntWritable(1));
        }
    }
}
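Since the cleanup pass above exists only to drop blank tokens, a simpler variant (a sketch; the class name is illustrative) splits on the whitespace regex "\\s+" instead, which produces no blank tokens in the first place:
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class WhitespaceWordcountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    private final Text word = new Text();
    private final IntWritable one = new IntWritable(1);

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        for (String token : value.toString().trim().split("\\s+")) {
            if (!token.isEmpty()) {       // trim() of an empty line still yields ""
                word.set(token);
                context.write(word, one); // reuse Writables to cut object churn
            }
        }
    }
}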
Mapper class 2:
package com.etc;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;

public class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String line = value.toString();
        String[] split = line.split(",");
        for (String s : split) {
            // key point: every occurrence is emitted with a count of exactly 1
            context.write(new Text(s), new IntWritable(1));
        }
    }
}
Reducer class:
package com.etc;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;

// generics: <k3, v3, k4, v4> = <Text, IntWritable, Text, IntWritable>
public class WordCountReduce extends Reducer<Text, IntWritable, Text, IntWritable> {
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        // sum all the 1s the mappers emitted for this word
        int count = 0;
        for (IntWritable value : values) {
            count = count + value.get();
        }
        context.write(key, new IntWritable(count)); // key is already a Text, no need to rewrap it
    }
}
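An optional tweak the post does not make: because word counting is associative and commutative, the same reducer class can also run as a map-side combiner, pre-aggregating the (word, 1) pairs before the shuffle. In JobSubmitter this would be a single extra line, a sketch:
// assumption: placed next to setReducerClass in JobSubmitter above
job.setCombinerClass(WordCountReduce.class);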
Dynamic counters:
The Context object's getCounter method takes two String parameters, the group name and the counter name:
public Counter getCounter(String groupName, String counterName)
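A minimal sketch of a dynamic counter inside a mapper (the group and counter names "WordCount"/"BlankTokens" are made up for illustration): every blank token bumps the counter, which Hadoop aggregates across all tasks and prints in the job summary.
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class CountingMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        for (String token : value.toString().split(",")) {
            if (token.trim().isEmpty()) {
                // dynamic counter: group "WordCount", counter "BlankTokens"
                context.getCounter("WordCount", "BlankTokens").increment(1);
            } else {
                context.write(new Text(token), new IntWritable(1));
            }
        }
    }
}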