1. 例項描述
單表關聯這個例項要求從給出的資料中尋找出所關心的資料,它是對原始資料所包含資訊的挖掘。
例項中給出child-parent 表, 求出grandchild-grandparent表。
輸入資料 file01:
child parent
Tom Lucy
Tom Jack
Jone Lucy
Jone Jack
Lucy Marry
Lucy Ben
Jack Alice
Jack Jesse
Terry Alice
Terry Jesse
Philip Terry
Philip Alma
Mark Terry
Mark Alma
希望輸出為:
grandchild grandparent
Tom Alice
Tom Jesse
Jone Alice
Jone Jesse
Tom Marry
Tom Ben
Jone Marry
Jone Ben
Philip Alice
Philip Jesse
Mark Alice
Mark Jesse
2. 設計思路
1. 在map階段,將原資料進行分割,將parent作為map輸出的key值,child作為map輸出的value值,這樣形成左表。
2. 同時在map階段過程中,將child作為map輸出的key值,parent作為map輸出的value值,這樣形成右表。
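3. 在reduce階段,連接左表的parent列和右表的child列:對同一個key,左表的value是該key的child(即grandchild),右表的value是該key的parent(即grandparent),將兩者做笛卡兒積即可得到grandchild-grandparent表。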
3. 具體實現
package tablerelation;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Single-table self-join: derives a grandchild-grandparent table from a
 * child-parent table.
 *
 * <p>Every input row {@code child parent} is emitted twice by the mapper, once
 * keyed by the parent (the "left" table) and once keyed by the child (the
 * "right" table). The reducer therefore sees, for each person, both that
 * person's children and that person's parents, and writes every
 * (child, parent) combination as a (grandchild, grandparent) pair.
 *
 * @author Amei
 */
public class SingleTableRelation {

    /**
     * Number of header rows written by reducers running in this JVM.
     *
     * <p>Kept as a public static field for backward compatibility; the reducer
     * no longer uses it to decide whether the header must be written.
     */
    public static int time = 0;

    /** Prefix of a mapper value that carries a child of the key (left table). */
    private static final String LEFT_PREFIX = "1+";

    /** Prefix of a mapper value that carries a parent of the key (right table). */
    private static final String RIGHT_PREFIX = "2+";

    /**
     * Emits each child-parent row twice so that the parent column of the left
     * table can be joined with the child column of the right table.
     *
     * @author Amei
     */
    public static class Map extends Mapper<LongWritable, Text, Text, Text> {

        /**
         * Splits one input line on whitespace into {@code child parent} and
         * writes the left-table and right-table records for it.
         *
         * <p>Lines with fewer than two tokens (for example blank lines) and
         * the header line whose first token is {@code child} are skipped.
         *
         * @param key     input key supplied by the framework (unused)
         * @param value   one line of the child-parent table
         * @param context sink for the tagged output records
         */
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            StringTokenizer tokenizer = new StringTokenizer(value.toString());
            // Guard against blank or truncated lines: nextToken() would
            // otherwise throw NoSuchElementException and fail the task.
            if (tokenizer.countTokens() < 2) {
                return;
            }
            String child = tokenizer.nextToken();
            String parent = tokenizer.nextToken();

            // Skip the "child parent" header row of the input table.
            if ("child".equals(child)) {
                return;
            }

            // Left table: key = parent, value = tagged child.
            context.write(new Text(parent), new Text(LEFT_PREFIX + child));
            // Right table: key = child, value = tagged parent.
            context.write(new Text(child), new Text(RIGHT_PREFIX + parent));
        }
    }

    /**
     * Joins the two tagged record kinds that share a key and writes the
     * resulting grandchild-grandparent pairs.
     */
    public static class Reduce extends Reducer<Text, Text, Text, Text> {

        /**
         * Whether this reducer instance has already written the header row.
         * Held per instance so that one Reduce instance does not suppress the
         * header of another instance living in the same JVM.
         */
        private boolean headerWritten = false;

        /**
         * Collects the children and parents of {@code key} and writes their
         * cross product; nothing is written for the key when either side is
         * empty. The header row is written on the first call.
         *
         * @param key    the person acting as the join key
         * @param values tagged records: {@code 1+<child>} or {@code 2+<parent>}
         * @param output sink for the header and the joined pairs
         */
        @Override
        protected void reduce(Text key, Iterable<Text> values, Context output)
                throws IOException, InterruptedException {
            // Write the table header once, before the first data row.
            if (!headerWritten) {
                output.write(new Text("grandchild"), new Text("grandparent"));
                headerWritten = true;
                time++;
            }

            List<String> grandchilds = new ArrayList<>();
            List<String> grandparents = new ArrayList<>();

            for (Text val : values) {
                String record = val.toString();
                // The first character is the table tag, the second is the
                // '+' separator, and the name starts at index 2.
                String name = record.substring(2);
                if (record.charAt(0) == '1') {
                    // A child of the key is a grandchild of the key's parents.
                    grandchilds.add(name);
                } else {
                    // A parent of the key is a grandparent of the key's children.
                    grandparents.add(name);
                }
            }

            // Cross product; the loops do nothing when either list is empty.
            for (String grandchild : grandchilds) {
                for (String grandparent : grandparents) {
                    output.write(new Text(grandchild), new Text(grandparent));
                }
            }
        }
    }

    /**
     * Configures and runs the job against fixed HDFS input and output paths,
     * exiting with status 0 on success and 1 on failure.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args)
            throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        // NOTE(review): this constructor is deprecated in Hadoop 2.x in favour
        // of Job.getInstance(conf, name); kept because the Hadoop version this
        // file targets is not visible here.
        Job job = new Job(conf, "single tale relation");
        job.setJarByClass(SingleTableRelation.class);
        job.setMapperClass(Map.class);
        job.setReducerClass(Reduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job, new Path("/user/hadoop_admin/singletalein"));
        FileOutputFormat.setOutputPath(job, new Path("/user/hadoop_admin/singletableout"));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}