Hadoop Diary Day 15 --- Comparing the Old and New MapReduce APIs

Posted by sunddenly on 2014-09-29
  I am working with Hadoop 1.1.2, while many companies still run Hadoop 0.2x, so the Hadoop materials available cover a mix of versions. To broaden my knowledge, I compared the new and old MapReduce APIs.
  In Hadoop 1.x the classes generally live in the mapreduce package.
  In Hadoop 0.x the classes generally live in the mapred package.
We again use word count as the example for this study; the code is shown below as Code 1.1:
package old;

import java.io.IOException;
import java.net.URI;
import java.util.Iterator;

import mapreduce.WordCountApp;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
/**
 * In Hadoop 1.x the classes generally live in the mapreduce package.
 * In Hadoop 0.x the classes generally live in the mapred package.
 */
public class OldAPP {
    static final String INPUT_PATH = "hdfs://hadoop:9000/hello";
    static final String OUT_PATH = "hdfs://hadoop:9000/out";
    /**
     * Changes compared with the new API:
     * 1. Use JobConf instead of Job.
     * 2. Classes come from the mapred package instead of mapreduce.
     * 3. Submit the job with JobClient.runJob(job) instead of job.waitForCompletion(true).
     */
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        final FileSystem fileSystem = FileSystem.get(new URI(INPUT_PATH), conf);
        final Path outPath = new Path(OUT_PATH);
        if (fileSystem.exists(outPath)) {
            fileSystem.delete(outPath, true);
        }

        final JobConf job = new JobConf(conf, WordCountApp.class);
        // 1.1 specify where the input files are located
        FileInputFormat.setInputPaths(job, INPUT_PATH);
        // specify how the input files are parsed: each line becomes one key/value pair
        //job.setInputFormat(TextInputFormat.class);

        // 1.2 specify the custom map class
        job.setMapperClass(MyMapper.class);
        // map output <k,v> types; can be omitted if <k3,v3> has the same types as <k2,v2>
        //job.setMapOutputKeyClass(Text.class);
        //job.setMapOutputValueClass(LongWritable.class);

        // 1.3 partitioning
        //job.setPartitionerClass(HashPartitioner.class);
        // run a single reduce task
        //job.setNumReduceTasks(1);

        // 1.4 TODO sorting and grouping

        // 1.5 TODO combiner

        // 2.2 specify the custom reduce class
        job.setReducerClass(MyReducer.class);
        // specify the reduce output types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);

        // 2.3 specify where the output is written
        FileOutputFormat.setOutputPath(job, outPath);
        // specify the output format class
        //job.setOutputFormat(TextOutputFormat.class);

        // submit the job to the JobTracker for execution
        JobClient.runJob(job);
    }

    /**
     * New API: extends Mapper
     * Old API: extends MapReduceBase implements Mapper
     */
    static class MyMapper extends MapReduceBase implements Mapper<LongWritable, Text, Text, LongWritable> {
        @Override
        public void map(LongWritable k1, Text v1,
                OutputCollector<Text, LongWritable> collector, Reporter reporter)
                throws IOException {
            final String[] splited = v1.toString().split("\t");
            for (String word : splited) {
                collector.collect(new Text(word), new LongWritable(1));
            }
        }
    }

    static class MyReducer extends MapReduceBase implements Reducer<Text, LongWritable, Text, LongWritable> {
        @Override
        public void reduce(Text k2, Iterator<LongWritable> v2s,
                OutputCollector<Text, LongWritable> collector, Reporter reporter)
                throws IOException {
            long times = 0L;
            while (v2s.hasNext()) {
                final long temp = v2s.next().get();
                times += temp;
            }
            collector.collect(k2, new LongWritable(times));
        }
    }
}

Code 1.1

1. Differences in the custom Mapper class

  In the new API, you extend the class org.apache.hadoop.mapreduce.Mapper<KEYIN, VALUEIN, KEYOUT, VALUEOUT>. In the old API, you extend the class org.apache.hadoop.mapred.MapReduceBase and implement the interface org.apache.hadoop.mapred.Mapper<K1, V1, K2, V2>. In the new API, the third parameter of the overridden map method is a Context; in the old API, the third and fourth parameters are an OutputCollector and a Reporter, respectively. The new API's Context merges the functionality of those two classes into one, so it is simpler to use. The custom Mapper class written against the old API is shown in Code 1.2; the input is parsed into key/value pairs, and the map function is called once for each pair.

    /**
     * New API: extends Mapper
     * Old API: extends MapReduceBase implements Mapper
     */
    static class MyMapper extends MapReduceBase implements Mapper<LongWritable, Text, Text, LongWritable> {
        @Override
        public void map(LongWritable k1, Text v1,
                OutputCollector<Text, LongWritable> collector, Reporter reporter)
                throws IOException {
            final String[] splited = v1.toString().split("\t");
            for (String word : splited) {
                collector.collect(new Text(word), new LongWritable(1));
            }
        }
    }

Code 1.2
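  For comparison, here is a minimal sketch of the same Mapper written against the new API. It is a fragment analogous to Code 1.2 (the import changes to org.apache.hadoop.mapreduce.Mapper), and the single Context parameter replaces both OutputCollector and Reporter.

    /**
     * New API: extend org.apache.hadoop.mapreduce.Mapper directly.
     */
    static class MyMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
        @Override
        protected void map(LongWritable k1, Text v1, Context context)
                throws IOException, InterruptedException {
            final String[] splited = v1.toString().split("\t");
            for (String word : splited) {
                // Context replaces both OutputCollector and Reporter
                context.write(new Text(word), new LongWritable(1));
            }
        }
    }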

2. Differences in the custom Reducer class

  In the new API, you extend the class org.apache.hadoop.mapreduce.Reducer<KEYIN, VALUEIN, KEYOUT, VALUEOUT>. In the old API, you extend org.apache.hadoop.mapred.MapReduceBase and implement the interface org.apache.hadoop.mapred.Reducer<K1, V1, K2, V2>. In the new API, the second parameter of the overridden reduce method is a java.lang.Iterable<VALUEIN>, while in the old API it is a java.util.Iterator<V2>; the former can be processed with an enhanced for loop, whereas the latter requires a while loop. In the new API, the third parameter of the overridden reduce method is a Context; in the old API, the third and fourth parameters are an OutputCollector and a Reporter, respectively. The new API's Context merges the functionality of those two classes into one, so it is simpler to use. The custom Reducer class written against the old API is shown in Code 2.1.

    static class MyReducer extends MapReduceBase implements Reducer<Text, LongWritable, Text, LongWritable> {
        @Override
        public void reduce(Text k2, Iterator<LongWritable> v2s,
                OutputCollector<Text, LongWritable> collector, Reporter reporter)
                throws IOException {
            long times = 0L;
            while (v2s.hasNext()) {
                final long temp = v2s.next().get();
                times += temp;
            }
            collector.collect(k2, new LongWritable(times));
        }
    }

Code 2.1
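  For comparison, here is a minimal sketch of the equivalent Reducer against the new API (a fragment analogous to Code 2.1, importing org.apache.hadoop.mapreduce.Reducer). Because the values arrive as an Iterable rather than an Iterator, an enhanced for loop can be used:

    /**
     * New API: extend org.apache.hadoop.mapreduce.Reducer directly.
     */
    static class MyReducer extends Reducer<Text, LongWritable, Text, LongWritable> {
        @Override
        protected void reduce(Text k2, Iterable<LongWritable> v2s, Context context)
                throws IOException, InterruptedException {
            long times = 0L;
            // Iterable<LongWritable> allows an enhanced for loop
            for (LongWritable count : v2s) {
                times += count.get();
            }
            context.write(k2, new LongWritable(times));
        }
    }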

3. Differences in the driver code (the main method)

  In the new API, the driver code is built mainly around the org.apache.hadoop.mapreduce.Job class: that class manages all of the configuration, and the job is submitted to the JobTracker by calling its waitForCompletion(boolean) method. In the old API, the driver code is built mainly around org.apache.hadoop.mapred.JobConf, created via the constructor JobConf(Configuration, Class), which manages the configuration; the job is submitted through the runJob(JobConf) method of the org.apache.hadoop.mapred.JobClient class. In other words, the new API merges the roles of JobConf and JobClient, which makes it more convenient to use.

  The method names on JobConf are almost identical to those on Job; what differs is the types of the arguments they take. In the new API, the setXXX(…) methods of Job require classes from org.apache.hadoop.mapreduce and its subpackages, whereas in the old API the setXXX(…) methods of JobConf require classes from org.apache.hadoop.mapred and its subpackages. The driver main method written against the old API is shown in Code 3.1.

package old;

import java.io.IOException;
import java.net.URI;
import java.util.Iterator;

import mapreduce.WordCountApp;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.mapred.lib.HashPartitioner;
/**
 * In Hadoop 1.x the classes generally live in the mapreduce package.
 * In Hadoop 0.x the classes generally live in the mapred package.
 */
public class OldAPP {
    static final String INPUT_PATH = "hdfs://hadoop:9000/hello";
    static final String OUT_PATH = "hdfs://hadoop:9000/out";
    /**
     * Changes compared with the new API:
     * 1. Use JobConf instead of Job.
     * 2. Classes come from the mapred package instead of mapreduce.
     * 3. Submit the job with JobClient.runJob(job) instead of job.waitForCompletion(true).
     */
    public static void main(String[] args) throws Exception {

        Configuration conf = new Configuration();
        final FileSystem fileSystem = FileSystem.get(new URI(INPUT_PATH), conf);
        final Path outPath = new Path(OUT_PATH);
        if (fileSystem.exists(outPath)) {
            fileSystem.delete(outPath, true);
        }

        final JobConf job = new JobConf(conf, WordCountApp.class);

        FileInputFormat.setInputPaths(job, INPUT_PATH);// 1.1 specify where the input files are located
        job.setMapperClass(MyMapper.class);// 1.2 specify the custom map class
        job.setMapOutputKeyClass(Text.class);// map output <k,v> types; can be omitted if <k3,v3> has the same types as <k2,v2>
        job.setMapOutputValueClass(LongWritable.class);
        job.setPartitionerClass(HashPartitioner.class);// 1.3 partitioning
        job.setNumReduceTasks(1);// run a single reduce task
        job.setReducerClass(MyReducer.class);// 2.2 specify the custom reduce class
        job.setOutputKeyClass(Text.class);// specify the reduce output types
        job.setOutputValueClass(LongWritable.class);
        FileOutputFormat.setOutputPath(job, outPath);// 2.3 specify where the output is written
        JobClient.runJob(job);// submit the job to the JobTracker for execution
    }

    /**
     * New API: extends Mapper
     * Old API: extends MapReduceBase implements Mapper
     */
    static class MyMapper extends MapReduceBase implements Mapper<LongWritable, Text, Text, LongWritable> {
        @Override
        public void map(LongWritable k1, Text v1,
                OutputCollector<Text, LongWritable> collector, Reporter reporter)
                throws IOException {
            final String[] splited = v1.toString().split("\t");
            for (String word : splited) {
                collector.collect(new Text(word), new LongWritable(1));
            }
        }
    }

    static class MyReducer extends MapReduceBase implements Reducer<Text, LongWritable, Text, LongWritable> {
        @Override
        public void reduce(Text k2, Iterator<LongWritable> v2s,
                OutputCollector<Text, LongWritable> collector, Reporter reporter)
                throws IOException {
            long times = 0L;
            while (v2s.hasNext()) {
                final long temp = v2s.next().get();
                times += temp;
            }
            collector.collect(k2, new LongWritable(times));
        }
    }
}

Code 3.1
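  For comparison, here is a minimal sketch of the same driver written against the new API. The class name NewAPP is a placeholder for illustration (not the author's WordCountApp), and it assumes the new-API MyMapper and MyReducer from the sketches above are defined as static nested classes of NewAPP. On Hadoop 1.x the Job object is created with new Job(Configuration, String) and the job is submitted with waitForCompletion(true):

package mapreduce;

import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class NewAPP {
    static final String INPUT_PATH = "hdfs://hadoop:9000/hello";
    static final String OUT_PATH = "hdfs://hadoop:9000/out";

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        final FileSystem fileSystem = FileSystem.get(new URI(INPUT_PATH), conf);
        final Path outPath = new Path(OUT_PATH);
        if (fileSystem.exists(outPath)) {
            fileSystem.delete(outPath, true);
        }

        // New API: all configuration is managed through the Job class (mapreduce package)
        final Job job = new Job(conf, NewAPP.class.getSimpleName());
        job.setJarByClass(NewAPP.class);

        FileInputFormat.setInputPaths(job, INPUT_PATH);// where the input files are located
        job.setMapperClass(MyMapper.class);// new-API Mapper (see the sketch above)
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);
        job.setReducerClass(MyReducer.class);// new-API Reducer (see the sketch above)
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);
        FileOutputFormat.setOutputPath(job, outPath);// where the output is written

        // New API: submit the job and wait for completion, instead of JobClient.runJob(job)
        job.waitForCompletion(true);
    }

    // The new-API MyMapper and MyReducer from the sketches above are assumed
    // to be defined here as static nested classes.
}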
