Implementing data deduplication with MapReduce

Posted by 林六天 on 2014-07-07
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class Dedpu extends Configured implements Tool {
    /**
     * Data deduplication.
     *
     * Sample input:
     *   2006-6-9  a
     *   2006-6-10 b
     *   2006-6-9  a
     * Expected output:
     *   2006-6-9  a
     *   2006-6-10 b
     *
     * Design:
     * Map phase: emit <date, value> pairs.
     * Reduce phase: receive <date, list<value>>, drop duplicate values, and write the result.
     *
     * The code below uses an even simpler variant: the whole input line is the key
     * and the value is empty, so the framework's grouping by key already collapses
     * duplicate lines. A sketch of the <date, value> variant follows the listing.
     **/
    public static class Map extends Mapper<LongWritable, Text, Text, Text> {
        // Emit the whole line as the key; the value carries no information.
        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String line = value.toString();
            context.write(new Text(line), new Text(""));
        }
    }

    public static class Reduce extends Reducer<Text, Text, Text, Text> {
        // All copies of a line arrive under the same key, so writing the key
        // exactly once removes the duplicates.
        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            context.write(key, new Text(""));
        }
    }

    public int run(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = new Job(conf, "Data Deduplication");
        job.setJarByClass(Dedpu.class);

        job.setMapperClass(Map.class);
        // The reducer is idempotent, so it can safely double as the combiner.
        job.setCombinerClass(Reduce.class);
        job.setReducerClass(Reduce.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);

        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        boolean success = job.waitForCompletion(true);
        return success ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        int ret = ToolRunner.run(new Dedpu(), args);
        System.exit(ret);
    }
}
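The commented-out code in the original listing sketches the <date, value> variant described in the design notes: the mapper splits each line into a date key and a data value, and the reducer keeps each value only once per key. A minimal cleaned-up sketch of that variant, assuming java.util.HashSet and java.util.StringTokenizer are imported and using the illustrative class names TokenMap and TokenReduce (a HashSet replaces the original ArrayList.contains check):

public static class TokenMap extends Mapper<LongWritable, Text, Text, Text> {
    public void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // Split a record such as "2006-6-9 a" into a date key and a data value.
        StringTokenizer tokenizer = new StringTokenizer(value.toString());
        if (tokenizer.countTokens() >= 2) {
            String date = tokenizer.nextToken();
            String data = tokenizer.nextToken();
            context.write(new Text(date), new Text(data));
        }
    }
}

public static class TokenReduce extends Reducer<Text, Text, Text, Text> {
    public void reduce(Text key, Iterable<Text> values, Context context)
            throws IOException, InterruptedException {
        // Keep each value only once per date key.
        HashSet<String> seen = new HashSet<String>();
        for (Text val : values) {
            if (seen.add(val.toString())) {
                context.write(key, new Text(val.toString()));
            }
        }
    }
}

Either variant is packaged into a jar and run in the usual way, for example hadoop jar dedup.jar Dedpu <input path> <output path>, where the jar name and the two HDFS paths are placeholders.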

 
