A MapReduce program is, at heart, a piece of simple logic built on the framework's own behavior. Its most important property is that all values sharing the same key are merged into a single group; the second is that keys are sorted during the shuffle phase.
Writing an MR program is largely a matter of exploiting these grouping and sorting properties.
A single-table join (self-join) relies on the grouping property.
First, the test data:
child parent
Tom Lucy
Tom Jack
Lucy Marry
Lucy Ben
Jack Alice
Jack Jesse
And the expected result:
grandchild grandparent
Tom Marry
Tom Ben
Tom Alice
Tom Jesse
How it works:
The trick is to key each record by the person in the middle of the chain. Every (child, parent) record is emitted twice: once keyed by the parent and once keyed by the child. For any person X, the value list that reaches the reducer then contains both X's children (candidate grandchildren) and X's parents (candidate grandparents); a Cartesian product between the two sides yields the final result.
In a single-table join the left and right tables are both the table itself, so we tag each value to tell the two roles apart: c# marks a value from the left table (a child) and p# marks a value from the right table (a parent).
Map output:
context.write(" Lucy", " C#Tom") context.write(" Jack", " C#Tom") context.write(" Marry", " C#Lucy") context.write(" Alice", " C#Jack") ......
context.write(" Tom", " P#Lucy") context.write(" Tom", " P#Jack") context.write(" Lucy", " P#Marry") context.write(" Jack", " P#Alice") ......
<" Lucy" , {" C#Tom", " P#Marry", " P#Ben"}> <" Jack" , {" C#Tom", " P#Alice", " P#Jesse"}> <" Marry" , { " C#Lucy"}> <" Alice" , { " C#Jack"}> <" Tom" , {" P#Lucy"," P#Jack"}>
Reduce output:
context.write("Tom", "Marry")    context.write("Tom", "Ben")    context.write("Tom", "Alice")    context.write("Tom", "Jesse")
Only keys whose value list contains both c# and p# entries produce output; Marry, Alice, and Tom have values on one side only, so they contribute nothing.
The full code:
package cn.genekang.hadoop.test;

import java.io.IOException;
import java.util.ArrayList;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

// Single-table join: each input line is child<TAB>parent,
// e.g. Tom<TAB>Lucy, Tom<TAB>Jack, Lucy<TAB>Marry, ...
public class STjoin {

    public static class StjoinMap extends
            Mapper<LongWritable, Text, Text, Text> {
        private Text kText = new Text();
        private Text vText = new Text();

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String[] lineSplit = value.toString().split("\t");
            // Skip the header line "child parent".
            if (lineSplit[0].equals("child")) {
                return;
            }
            // c# marks the left table (a child), p# marks the right table (a parent).
            // Left table: key by the parent, value is the tagged child.
            kText.set(lineSplit[1]);
            vText.set("c#" + lineSplit[0]);
            context.write(kText, vText);
            // Right table: key by the child, value is the tagged parent.
            kText.set(lineSplit[0]);
            vText.set("p#" + lineSplit[1]);
            context.write(kText, vText);
        }
    }

    public static class StjoinReduce extends Reducer<Text, Text, Text, Text> {
        private Text kText = new Text();
        private Text vText = new Text();

        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            // For the person in "key": cList holds their children (candidate
            // grandchildren), pList holds their parents (candidate grandparents).
            ArrayList<String> cList = new ArrayList<String>();
            ArrayList<String> pList = new ArrayList<String>();
            for (Text v : values) {
                if (v.toString().startsWith("c#")) {
                    cList.add(v.toString().substring(2));
                } else if (v.toString().startsWith("p#")) {
                    pList.add(v.toString().substring(2));
                }
            }
            // Cartesian product; keys with values on only one side emit nothing.
            for (String c : cList) {
                for (String p : pList) {
                    kText.set(c);
                    vText.set(p);
                    context.write(kText, vText);
                }
            }
        }
    }

    public static void main(String[] args) throws IOException,
            ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(STjoin.class);
        job.setMapperClass(StjoinMap.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setReducerClass(StjoinReduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
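Assuming the class is packaged as STjoin.jar and the test file has been uploaded to HDFS (the jar name and both paths below are placeholders), the job can be run and checked like this:

hadoop jar STjoin.jar cn.genekang.hadoop.test.STjoin /user/hadoop/stjoin/in /user/hadoop/stjoin/out
hdfs dfs -cat /user/hadoop/stjoin/out/part-r-00000

The output contains the four grandchild-grandparent pairs listed above, though possibly in a different order, since reducers see keys in sorted order (Jack before Lucy).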