Hadoop--Map/Reduce實現多表連結

林六天發表於2014-07-07

用MR實現多表連線的原理與單表連線相同,甚至比單表連線還要簡單。

在map階段只需要區分每一行資料屬於左表還是右表(可以根據檔案的名稱來區分;下面的程式碼則是根據每一行的格式來判斷),並使用關聯的欄位作為key2。

在reduce階段,將values中的值分別儲存到左表list和右表list中,再對左表list和右表list做笛卡爾積即可。

 1 import java.io.*;
 2 import java.util.*;
 3 
 4 import org.apache.hadoop.io.*;
 5 import org.apache.hadoop.util.*;
 6 import org.apache.hadoop.fs.Path;
 7 import org.apache.hadoop.mapreduce.*;
 8 import org.apache.hadoop.mapreduce.lib.input.*;
 9 import org.apache.hadoop.mapreduce.lib.output.*;
10 import org.apache.hadoop.conf.*;
11 import org.apache.hadoop.util.Tool;
12 public class MTjoin extends Configured implements Tool {
13     /*
14      * 多表連結,與單錶連結思路類似。將關聯列作為map的key值,用數字區分左表和右表。在Reduce階段對兩個表進行笛卡爾積
15      * */
16     public static class Map extends Mapper<LongWritable,Text,Text,Text>{
17         public void map(LongWritable key,Text value,Context context)throws IOException,InterruptedException{
18             String line=value.toString();
19             int linelen=line.length();
20             //去除檔案首行
21             if(line.indexOf("factoryname")==-1&&line.indexOf("addressID")==-1)
22             {
23                 //處理factory資料
24                 if(line.charAt(linelen-2)==' ')
25                 {
26                     String facstr="1"+line.substring(0, linelen-2);
27                     String addrestr=String.valueOf(line.charAt(linelen-1));
28                     context.write(new Text(addrestr), new Text(facstr));
29                 }else{
30                     String addreidstr=String.valueOf(line.charAt(0));
31                     String addrenastr="2"+line.substring(1);
32                     context.write(new Text(addreidstr), new Text(addrenastr));
33                 }
34                                 
35             }
36         }
37         
38     }
39     
40     public static class Reduce extends Reducer<Text,Text,Text,Text>{
41         public void reduce(Text key,Iterable<Text> values,Context context)throws IOException, InterruptedException{
42             ArrayList<String> facarr=new ArrayList<String>();
43             ArrayList<String> addarr=new ArrayList<String>();
44             for(Text var:values){
45                 if(var.toString().charAt(0)=='1')
46                 {
47                     facarr.add(var.toString().substring(1));
48                 }else if(var.toString().charAt(0)=='2')
49                 {
50                     addarr.add(var.toString().substring(1));
51                 }
52                 
53             }
54             if(facarr.size()!=0&&addarr.size()!=0)
55             {
56                 for(int i=0;i<facarr.size();i++)
57                 {
58                     context.write(new Text(facarr.get(i)), new Text(addarr.get(0)));
59                 }
60                 
61             }
62         }
63     }
64     @Override
65     public int run(String[] args) throws Exception {
66         // TODO Auto-generated method stub
67         Configuration conf=new Configuration();
68         Job job=new Job(conf,"MTjoin");
69         job.setJarByClass(MTjoin.class);
70         
71         job.setOutputKeyClass(Text.class);
72         job.setOutputValueClass(Text.class);
73         
74         job.setMapperClass(Map.class);
75         job.setReducerClass(Reduce.class);
76         
77         job.setInputFormatClass(TextInputFormat.class);
78         job.setOutputFormatClass(TextOutputFormat.class);
79         
80         FileInputFormat.setInputPaths(job, new Path(args[0]));
81         FileOutputFormat.setOutputPath(job, new Path(args[1]));
82         
83         boolean success=job.waitForCompletion(true);
84         return success?0:1;
85     }
86     public static void main(String[] args)throws Exception{
87         int ret=ToolRunner.run(new MTjoin(), args);
88         System.exit(ret);
89     }
90 
91 }

 

相關文章