In Hadoop 0.x the MapReduce classes generally live in the mapred package, while in Hadoop 1.x they live in the mapreduce package. The following program is a word-count job written entirely against the old (mapred) API.
package old;

import java.io.IOException;
import java.net.URI;
import java.util.Iterator;

import mapreduce.WordCountApp;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;

/**
 * In Hadoop 1.x the classes are generally in the mapreduce package;
 * in Hadoop 0.x they are generally in the mapred package.
 */
public class OldAPP {
    static final String INPUT_PATH = "hdfs://hadoop:9000/hello";
    static final String OUT_PATH = "hdfs://hadoop:9000/out";

    /**
     * Changes compared to the new API:
     * 1. Use JobConf instead of Job.
     * 2. Classes come from the mapred package instead of mapreduce.
     * 3. Submit the job with JobClient.runJob(job) instead of job.waitForCompletion(true).
     */
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        final FileSystem fileSystem = FileSystem.get(new URI(INPUT_PATH), conf);
        final Path outPath = new Path(OUT_PATH);
        if (fileSystem.exists(outPath)) {
            fileSystem.delete(outPath, true);
        }

        final JobConf job = new JobConf(conf, WordCountApp.class);

        // 1.1 Specify where the input files are read from
        FileInputFormat.setInputPaths(job, INPUT_PATH);
        // Specify how the input files are formatted: each line is parsed into a key/value pair
        //job.setInputFormatClass(TextInputFormat.class);
        // 1.2 Specify the custom map class
        job.setMapperClass(MyMapper.class);
        // The <k,v> types emitted by map; can be omitted when <k3,v3> has the same types as <k2,v2>
        //job.setMapOutputKeyClass(Text.class);
        //job.setMapOutputValueClass(LongWritable.class);
        // 1.3 Partitioning
        //job.setPartitionerClass(HashPartitioner.class);
        // Run a single reduce task
        //job.setNumReduceTasks(1);
        // 1.4 TODO sorting, grouping
        // 1.5 TODO combining
        // 2.2 Specify the custom reduce class
        job.setReducerClass(MyReducer.class);
        // Specify the reduce output types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);
        // 2.3 Specify where the output is written
        FileOutputFormat.setOutputPath(job, outPath);
        // Specify the output format class
        //job.setOutputFormatClass(TextOutputFormat.class);
        // Submit the job to the JobTracker
        JobClient.runJob(job);
    }

    /**
     * New API: extends Mapper
     * Old API: extends MapReduceBase implements Mapper
     */
    static class MyMapper extends MapReduceBase implements Mapper<LongWritable, Text, Text, LongWritable> {
        @Override
        public void map(LongWritable k1, Text v1,
                OutputCollector<Text, LongWritable> collector, Reporter reporter)
                throws IOException {
            final String[] splited = v1.toString().split("\t");
            for (String word : splited) {
                collector.collect(new Text(word), new LongWritable(1));
            }
        }
    }

    static class MyReducer extends MapReduceBase implements Reducer<Text, LongWritable, Text, LongWritable> {
        @Override
        public void reduce(Text k2, Iterator<LongWritable> v2s,
                OutputCollector<Text, LongWritable> collector, Reporter reporter)
                throws IOException {
            long times = 0L;
            while (v2s.hasNext()) {
                final long temp = v2s.next().get();
                times += temp;
            }
            collector.collect(k2, new LongWritable(times));
        }
    }
}
Code 1.1
1. Differences in the custom Mapper class
In the new API, a custom mapper extends the class org.apache.hadoop.mapreduce.Mapper<KEYIN, VALUEIN, KEYOUT, VALUEOUT>. In the old API, it extends org.apache.hadoop.mapred.MapReduceBase and implements the interface org.apache.hadoop.mapred.Mapper<K1, V1, K2, V2>. In the new API, the third parameter of the overridden map method is a Context; in the old API, the third and fourth parameters are an OutputCollector and a Reporter. The new API's Context merges the functionality of those two classes into one object, which is simpler to use. In both cases the map function is called once for each input key/value pair. A custom Mapper class written with the old API is shown in Code 1.2.
/**
 * New API: extends Mapper
 * Old API: extends MapReduceBase implements Mapper
 */
static class MyMapper extends MapReduceBase implements Mapper<LongWritable, Text, Text, LongWritable> {
    @Override
    public void map(LongWritable k1, Text v1,
            OutputCollector<Text, LongWritable> collector, Reporter reporter)
            throws IOException {
        final String[] splited = v1.toString().split("\t");
        for (String word : splited) {
            collector.collect(new Text(word), new LongWritable(1));
        }
    }
}
Code 1.2
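For comparison, here is a minimal sketch of what the same word-count Mapper could look like under the new API. It is not part of the original project; it only assumes the org.apache.hadoop.mapreduce.Mapper base class described above.

// Sketch only: a new-API counterpart of MyMapper (illustrative, not from the original code).
static class MyMapper extends org.apache.hadoop.mapreduce.Mapper<LongWritable, Text, Text, LongWritable> {
    @Override
    protected void map(LongWritable k1, Text v1, Context context)
            throws IOException, InterruptedException {
        // The single Context parameter replaces both OutputCollector and Reporter.
        for (String word : v1.toString().split("\t")) {
            context.write(new Text(word), new LongWritable(1));
        }
    }
}

Here context.write(...) takes the place of collector.collect(...), and the progress/counter features that the old Reporter provided are available from the same Context object.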
2. Differences in the custom Reducer class
In the new API, a custom reducer extends the class org.apache.hadoop.mapreduce.Reducer<KEYIN, VALUEIN, KEYOUT, VALUEOUT>. In the old API, it extends org.apache.hadoop.mapred.MapReduceBase and implements the interface org.apache.hadoop.mapred.Reducer<K1, V1, K2, V2>. In the new API, the second parameter of the overridden reduce method is a java.lang.Iterable<VALUEIN>; in the old API it is a java.util.Iterator<V2>. The former can be traversed with an enhanced for loop, while the latter can only be traversed with a while loop. In the new API, the third parameter of the overridden reduce method is a Context; in the old API, the third and fourth parameters are an OutputCollector and a Reporter. As with the Mapper, the new API's Context merges the functionality of those two classes, which is simpler to use. A custom Reducer class written with the old API is shown in Code 2.1.
static class MyReducer extends MapReduceBase implements Reducer<Text, LongWritable, Text, LongWritable> {
    @Override
    public void reduce(Text k2, Iterator<LongWritable> v2s,
            OutputCollector<Text, LongWritable> collector, Reporter reporter)
            throws IOException {
        long times = 0L;
        while (v2s.hasNext()) {
            final long temp = v2s.next().get();
            times += temp;
        }
        collector.collect(k2, new LongWritable(times));
    }
}
Code 2.1
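Again for comparison, a minimal sketch of the equivalent Reducer under the new API, showing the Iterable parameter and the enhanced for loop. This is illustrative only and not taken from the original project.

// Sketch only: a new-API counterpart of MyReducer (illustrative, not from the original code).
static class MyReducer extends org.apache.hadoop.mapreduce.Reducer<Text, LongWritable, Text, LongWritable> {
    @Override
    protected void reduce(Text k2, Iterable<LongWritable> v2s, Context context)
            throws IOException, InterruptedException {
        long times = 0L;
        // Iterable<VALUEIN> allows an enhanced for loop instead of while (iterator.hasNext()).
        for (LongWritable v : v2s) {
            times += v.get();
        }
        context.write(k2, new LongWritable(times));
    }
}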
3. Differences in the driver code (the main method)
In the new API, the driver code is built around the org.apache.hadoop.mapreduce.Job class, which manages all the job settings; the job is then submitted to the JobTracker by calling its waitForCompletion(boolean) method. In the old API, the driver code is built around org.apache.hadoop.mapred.JobConf, constructed via JobConf(Configuration, Class), which manages the settings; the job is submitted through the runJob(JobConf) method of org.apache.hadoop.mapred.JobClient. In other words, the new API merges the roles of JobConf and JobClient into one class, which is more convenient to call.
The method names on JobConf are almost identical to those on Job; what differs is the types of the arguments they accept. The setXXX(...) methods of the new API's Job class require classes from org.apache.hadoop.mapreduce and its subpackages, whereas the setXXX(...) methods of the old API's JobConf class require classes from org.apache.hadoop.mapred and its subpackages. A driver main method written with the old API is shown in Code 3.1.
package old;

import java.io.IOException;
import java.net.URI;
import java.util.Iterator;

import mapreduce.WordCountApp;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.mapred.lib.HashPartitioner;

/**
 * In Hadoop 1.x the classes are generally in the mapreduce package;
 * in Hadoop 0.x they are generally in the mapred package.
 */
public class OldAPP {
    static final String INPUT_PATH = "hdfs://hadoop:9000/hello";
    static final String OUT_PATH = "hdfs://hadoop:9000/out";

    /**
     * Changes compared to the new API:
     * 1. Use JobConf instead of Job.
     * 2. Classes come from the mapred package instead of mapreduce.
     * 3. Submit the job with JobClient.runJob(job) instead of job.waitForCompletion(true).
     */
    public static void main(String[] args) throws Exception {

        Configuration conf = new Configuration();
        final FileSystem fileSystem = FileSystem.get(new URI(INPUT_PATH), conf);
        final Path outPath = new Path(OUT_PATH);
        if (fileSystem.exists(outPath)) {
            fileSystem.delete(outPath, true);
        }

        final JobConf job = new JobConf(conf, WordCountApp.class);

        FileInputFormat.setInputPaths(job, INPUT_PATH);    // 1.1 Specify where the input files are read from
        job.setMapperClass(MyMapper.class);                // 1.2 Specify the custom map class
        job.setMapOutputKeyClass(Text.class);              // The <k,v> types emitted by map; can be omitted when <k3,v3> matches <k2,v2>
        job.setMapOutputValueClass(LongWritable.class);
        job.setPartitionerClass(HashPartitioner.class);    // 1.3 Partitioning
        job.setNumReduceTasks(1);                          // Run a single reduce task
        job.setReducerClass(MyReducer.class);              // 2.2 Specify the custom reduce class
        job.setOutputKeyClass(Text.class);                 // Specify the reduce output types
        job.setOutputValueClass(LongWritable.class);
        FileOutputFormat.setOutputPath(job, outPath);      // 2.3 Specify where the output is written
        JobClient.runJob(job);                             // Submit the job to the JobTracker
    }

    /**
     * New API: extends Mapper
     * Old API: extends MapReduceBase implements Mapper
     */
    static class MyMapper extends MapReduceBase implements Mapper<LongWritable, Text, Text, LongWritable> {
        @Override
        public void map(LongWritable k1, Text v1,
                OutputCollector<Text, LongWritable> collector, Reporter reporter)
                throws IOException {
            final String[] splited = v1.toString().split("\t");
            for (String word : splited) {
                collector.collect(new Text(word), new LongWritable(1));
            }
        }
    }

    static class MyReducer extends MapReduceBase implements Reducer<Text, LongWritable, Text, LongWritable> {
        @Override
        public void reduce(Text k2, Iterator<LongWritable> v2s,
                OutputCollector<Text, LongWritable> collector, Reporter reporter)
                throws IOException {
            long times = 0L;
            while (v2s.hasNext()) {
                final long temp = v2s.next().get();
                times += temp;
            }
            collector.collect(k2, new LongWritable(times));
        }
    }
}
Code 3.1
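For comparison, a minimal sketch of the driver logic under the new API. This is an illustrative assumption, not the original project's code: it assumes the Hadoop 1.x Job(Configuration, String) constructor, the FileInputFormat/FileOutputFormat classes from org.apache.hadoop.mapreduce.lib.input and .lib.output, and reuses WordCountApp as the job class.

// Sketch only: a new-API driver main body (illustrative, not from the original project).
Configuration conf = new Configuration();
FileSystem fileSystem = FileSystem.get(new URI(INPUT_PATH), conf);
Path outPath = new Path(OUT_PATH);
if (fileSystem.exists(outPath)) {
    fileSystem.delete(outPath, true);
}

Job job = new Job(conf, WordCountApp.class.getSimpleName());   // org.apache.hadoop.mapreduce.Job
job.setJarByClass(WordCountApp.class);

FileInputFormat.setInputPaths(job, INPUT_PATH);                // org.apache.hadoop.mapreduce.lib.input.FileInputFormat
job.setMapperClass(MyMapper.class);
job.setReducerClass(MyReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(LongWritable.class);
FileOutputFormat.setOutputPath(job, outPath);                  // org.apache.hadoop.mapreduce.lib.output.FileOutputFormat
job.waitForCompletion(true);                                   // submit and wait, instead of JobClient.runJob(job)

Note that setJarByClass(...) on Job plays roughly the role that the second argument of the old JobConf(Configuration, Class) constructor played, and that the setXXX(...) calls now expect the mapper/reducer classes written against the org.apache.hadoop.mapreduce API.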