Hadoop-Map/Reduce之單表連線的實現

林六天發表於2014-10-30
MapReduce程式就是根據其特性對資料進行一個簡單的邏輯處理,其中最為重要的一個特性就是根據key值將value值進行合併,其次就是在shuffle階段有排序。
遇到一個MR程式就是要巧妙利用合併、排序的特性。
單表關聯就是利用了合併的原理。
先上測試資料
child    parent
Tom    Lucy
Tom    Jack
Lucy    Marry
Lucy    Ben
Jack    Alice
Jack    Jesse
 
結果資料
grandchild    grandparent
Tom    Marry
Tom    Ben
Tom    Alice
Tom    Jesse
 
原理說明:
從要求中我們很容易想到:把每筆記錄分別以parent和child作為key各輸出一次,這樣對於同一個人(中間一代),他的孩子(grandchild)和他的父母(grandparent)就會被合併到同一個valueList中。對valueList中的孩子與父母做一個笛卡爾積就能夠得到最終結果。
單表連線中,左表和右表都是自身,我們用c#區分左表,用p#區分右表
map\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\
context.write(" Lucy", " C#Tom")        context.write(" Jack", " C#Tom")    context.write(" Marry", " C#Lucy")   context.write(" Alice", " C#Jack")    ......
context.write(" Tom", " P#Lucy")        context.write(" Tom", " P#Jack")    context.write(" Lucy", " P#Marry")   context.write(" Jack", " P#Alice")    ......
 
<" Lucy" , {" C#Tom", " P#Marry", " P#Ben"}>  <" Jack" , {" C#Tom", " P#Alice", " P#Jesse"}>     <" Marry" , { " C#Lucy"}>    <" Alice" , { " C#Jack"}>     <" Tom" , {" P#Lucy"," P#Jack"}>
Reduce\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\
context.write(" Tom", " Marry")    context.write(" Tom", " Ben")        context.write(" Tom", " Alice")    context.write(" Tom", " Jesse")
 
程式碼奉上
 
package cn.genekang.hadoop.test;

import java.io.IOException;
import java.util.ArrayList;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class STjoin {
    /*
     * Single-table (self) join: derives grandchild/grandparent pairs from a
     * table of child/parent relations.
     *
     * Sample input, one tab-separated relation per line (header row optional):
     *   child   parent
     *   Tom     Lucy
     *   Tom     Jack
     *   Lucy    Marry
     *   Lucy    Ben
     *   Jack    Alice
     *   Jack    Jesse
     *
     * Output for the sample (grandchild, grandparent):
     *   Tom Marry / Tom Ben / Tom Alice / Tom Jesse
     */

    /** Prefix of a map output value that names a child of the key person (left table). */
    private static final String CHILD_TAG = "c#";
    /** Prefix of a map output value that names a parent of the key person (right table). */
    private static final String PARENT_TAG = "p#";
    /** Column names of the optional header row, which must not be joined as data. */
    private static final String HEADER_CHILD = "child";
    private static final String HEADER_PARENT = "parent";

    /**
     * Emits every child/parent relation twice so that, for each person, the
     * reducer receives both that person's children and that person's parents
     * under the same key.
     */
    public static class StjoinMap extends
            Mapper<LongWritable, Text, Text, Text> {

        // Reused for every record; set() overwrites the previous content.
        private Text kText = new Text();
        private Text vText = new Text();

        /**
         * Splits one input line on a tab into (child, parent) and writes
         * (parent, "c#" + child) and (child, "p#" + parent).
         *
         * Lines with fewer than two fields, with an empty field, or equal to
         * the "child\tparent" header row are skipped.
         *
         * @param key     input key supplied by the framework (unused)
         * @param value   one line of the relation table
         * @param context sink for the tagged pairs
         */
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String[] lineSplit = value.toString().split("\t");
            if (lineSplit.length < 2) {
                // Blank or malformed line: nothing to join, and indexing
                // lineSplit[1] would throw.
                return;
            }
            String child = lineSplit[0];
            String parent = lineSplit[1];
            if (child.isEmpty() || parent.isEmpty()) {
                return;
            }
            if (HEADER_CHILD.equals(child) && HEADER_PARENT.equals(parent)) {
                // Header row, not a relation.
                return;
            }

            // Left table: keyed by the parent, the value is one of its children.
            kText.set(parent);
            vText.set(CHILD_TAG + child);
            context.write(kText, vText);

            // Right table: keyed by the child, the value is one of its parents.
            kText.set(child);
            vText.set(PARENT_TAG + parent);
            context.write(kText, vText);
        }

    }

    /**
     * For each person, pairs every child of that person with every parent of
     * that person, yielding (grandchild, grandparent) rows.
     */
    public static class StjoinReduce extends Reducer<Text, Text, Text, Text> {
        // Reused for every output row; set() overwrites the previous content.
        private Text kText = new Text();
        private Text vText = new Text();

        /**
         * Separates the tagged values into children and parents of the key
         * person and writes their Cartesian product.
         *
         * @param key     the person in the middle generation
         * @param values  tagged names produced by {@link StjoinMap}
         * @param context sink for (grandchild, grandparent) pairs
         */
        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            ArrayList<String> cList = new ArrayList<String>();
            ArrayList<String> pList = new ArrayList<String>();
            for (Text v : values) {
                String tagged = v.toString();
                // Match the tag only as a prefix: a name may itself contain
                // "c#" or "p#".
                if (tagged.startsWith(CHILD_TAG)) {
                    cList.add(tagged.substring(CHILD_TAG.length()));
                } else if (tagged.startsWith(PARENT_TAG)) {
                    pList.add(tagged.substring(PARENT_TAG.length()));
                }
            }

            // If either list is empty the nested loops write nothing, so
            // people without both a child and a parent produce no rows.
            for (String c : cList) {
                for (String p : pList) {
                    kText.set(c);
                    vText.set(p);
                    context.write(kText, vText);
                }
            }
        }

    }

    /**
     * Configures and runs the job.
     *
     * @param args args[0] is the input path, args[1] is the output path
     */
    public static void main(String[] args) throws IOException,
            ClassNotFoundException, InterruptedException {
        if (args.length < 2) {
            System.err.println("Usage: STjoin <input path> <output path>");
            System.exit(2);
        }

        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);

        job.setJarByClass(STjoin.class);

        job.setMapperClass(StjoinMap.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        job.setReducerClass(StjoinReduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }

}

 

相關文章