MapReduce應用案例--單表關聯

Amei1314發表於2016-04-11

1. 例項描述

  單表關聯這個例項要求從給出的資料中尋找出所關心的資料,它是對原始資料所包含資訊的挖掘。

  例項中給出child-parent 表, 求出grandchild-grandparent表。

  輸入資料 file01:

 

child        parent
Tom          Lucy
Tom          Jack
Jone         Lucy
Jone         Jack
Lucy         Marry
Lucy         Ben
Jack         Alice
Jack         Jesse
Terry        Alice
Terry        Jesse
Philip       Terry
Philip       Alma
Mark         Terry
Mark         Alma

  希望輸出為:

 

grandchild    grandparent
Tom    Alice
Tom    Jesse
Jone    Alice
Jone    Jesse
Tom    Marry
Tom    Ben
Jone    Marry
Jone    Ben
Philip    Alice
Philip    Jesse
Mark    Alice
Mark    Jesse

2. 設計思路

  1. 在map階段,將原資料進行分割,將parent作為map輸出的key值,child作為map輸出的value值,這樣形成左表。

  2. 同時在map階段過程中,將child作為map輸出的key值,parent作為map輸出的value值,這樣形成右表。

  3. 連線左表的parent列和右表的child列:在reduce階段,同一個key下來自左表的child(孫輩)與來自右表的parent(祖輩)做笛卡兒積,即得到grandchild-grandparent表。

3. 具體實現

  

package tablerelation;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * 
 * @author Amei 單錶連結,求grandchild grandparent表
 */

public class SingleTableRelation {
    public static int time = 0;

    /**
     * 
     * @author Amei 左表的paren 和 右表的 child 做連結
     */
    public static class Map extends Mapper<LongWritable, Text, Text, Text> {
        protected void map(LongWritable key, Text value, Context context)
                throws java.io.IOException, InterruptedException {
       // 左右表的標識
int relation; StringTokenizer tokenizer = new StringTokenizer(value.toString()); String child = tokenizer.nextToken(); String parent = tokenizer.nextToken(); if (child.compareTo("child") != 0) { // 左表 relation = 1; context.write(new Text(parent), new Text(relation + "+" + child)); // 右表 relation = 2; context.write(new Text(child), new Text(relation + "+" + parent)); } }; } public static class Reduce extends Reducer<Text, Text, Text, Text> { protected void reduce(Text key, Iterable<Text> values, Reducer<Text, Text, Text, Text>.Context output) throws java.io.IOException, InterruptedException { int grandchildnum = 0; int grandparentnum = 0; List<String> grandchilds = new ArrayList<>(); List<String> grandparents = new ArrayList<>(); /** 輸出表頭 */ if (time == 0) { output.write(new Text("grandchild"), new Text("grandparent")); time++; } for (Text val : values) { String record = val.toString(); char relation = record.charAt(0); // 取出此時key所對應的child if (relation == '1') { String child = record.substring(2); grandchilds.add(child); grandchildnum++; } // 取出此時key所對應的parent else { String parent = record.substring(2); grandparents.add(parent); grandparentnum++; } } if (grandchildnum != 0 && grandparentnum != 0) { for (int i = 0; i < grandchildnum; i++) for (int j = 0; j < grandparentnum; j++) output.write(new Text(grandchilds.get(i)), new Text( grandparents.get(j))); } } } public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException { Configuration conf = new Configuration(); Job job = new Job(conf,"single tale relation"); job.setJarByClass(SingleTableRelation.class); job.setMapperClass(Map.class); job.setReducerClass(Reduce.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); FileInputFormat.addInputPath(job, new Path("/user/hadoop_admin/singletalein")); FileOutputFormat.setOutputPath(job, new Path("/user/hadoop_admin/singletableout")); System.exit((job.waitForCompletion(true) ? 0 : 1)); } }

 

相關文章