hadoop的mapreduce串聯執行
import java.io.IOException; import java.util.Iterator; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.Mapper; import org.apache.hadoop.mapreduce.Reducer; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.jobcontrol.ControlledJob; import org.apache.hadoop.mapreduce.lib.jobcontrol.JobControl; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; public class PickMain { private static final Log LOG = LogFactory.getLog(PickMain.class); public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException { /* * Configuration conf = new Configuration(); Job job1 = Job.getInstance(conf); job1.setJarByClass(PickMain.class); job1.setMapperClass(FindMapper.class); job1.setReducerClass(FindReducer.class); job1.setOutputKeyClass(Text.class); job1.setOutputValueClass(Text.class); FileInputFormat.addInputPath(job1, new Path(args[0])); FileOutputFormat.setOutputPath(job1, new Path(args[1])); boolean flag1 = job1.waitForCompletion(true); //下面這種方法也可以實現串聯執行job if(flag1) { Job job2 = Job.getInstance(conf); job2.setJarByClass(PickMain.class); job2.setMapperClass(SecondFindMapper.class); job2.setReducerClass(SecondFindReducer.class); job2.setOutputKeyClass(Text.class); job2.setOutputValueClass(Text.class); FileInputFormat.addInputPath(job2, new Path(args[1])); FileOutputFormat.setOutputPath(job2, new Path(args[2])); boolean flag2 = job2.waitForCompletion(true); System.out.println(flag2?0:1); if(flag2) { LOG.info("The job is done!"); System.exit(0); }else { LOG.info("The Second job is wrong!"); System.exit(1); } }else { LOG.info("The firt job is Running Wrong job break!"); System.exit(1); } */ //下面透過使用ContolledJob和JobControl來實現提交多個作業 Configuration conf = new Configuration(); Job job1 = Job.getInstance(conf); job1.setJarByClass(PickMain.class); job1.setMapperClass(FindMapper.class); job1.setReducerClass(FindReducer.class); job1.setOutputKeyClass(Text.class); job1.setOutputValueClass(Text.class); FileInputFormat.addInputPath(job1, new Path(args[0])); FileOutputFormat.setOutputPath(job1, new Path(args[1])); Configuration conf2 = new Configuration(); Job job2 = Job.getInstance(conf2); job2.setJarByClass(PickMain.class); job2.setMapperClass(SecondFindMapper.class); job2.setReducerClass(SecondFindReducer.class); job2.setOutputKeyClass(Text.class); job2.setOutputValueClass(Text.class); FileInputFormat.addInputPath(job2, new Path(args[1])); FileOutputFormat.setOutputPath(job2, new Path(args[2])); //建立ControlledJob對job進行包裝 ControlledJob cjob1 = new ControlledJob(conf); ControlledJob cjob2 = new ControlledJob(conf2); cjob1.setJob(job1); cjob2.setJob(job2); //設定依賴關係,這個時候只有等到job1執行完成後job2才會執行 cjob2.addDependingJob(cjob1); //JobControl該類相當於一個job控制器,它是一個執行緒,需要透過執行緒啟動 JobControl jc = new JobControl("my_jobcontrol"); jc.addJob(cjob1); jc.addJob(cjob2); Thread th = new Thread(jc); th.start(); //等到所有的job都執行完成後在退出 while(!jc.allFinished()) { Thread.sleep(5000); } System.exit(0); } } class FindMapper extends Mapper<LongWritable, Text, Text, Text>{ Text m1 = new Text(); Text m2 = new Text(); @Override protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, Text>.Context context) throws IOException, InterruptedException { String line = value.toString(); String[] tmp1 = line.split(":"); String outval = tmp1[0]; String[] outkeys = tmp1[1].split(","); for(int i = 0 ; i<outkeys.length;i++) { m1.set(outkeys[i]);m2.set(outval); context.write(m1,m2); } } } class FindReducer extends Reducer<Text, Text, Text, NullWritable>{ StringBuilder sb = new StringBuilder(); NullWritable nul = NullWritable.get(); Text outval = new Text(); String spector = ":"; @Override protected void reduce(Text txt, Iterable<Text> txtiter, Reducer<Text, Text, Text, NullWritable>.Context context) throws IOException, InterruptedException { sb.delete(0, sb.length()); sb.append(txt.toString()); Iterator<Text> it = txtiter.iterator(); while(it.hasNext()) { sb.append(spector+it.next().toString()); } outval.set(sb.toString()); context.write(outval, nul); } } class SecondFindMapper extends Mapper<LongWritable, Text, Text, Text>{ Text keyout = new Text(); Text valueout = new Text(); @Override protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, Text>.Context context) throws IOException, InterruptedException { String[] fs = value.toString().split(":"); valueout.set(fs[0]); if(fs.length>0) { for(int i = 1;i<fs.length-1;i++) { for(int j = i+1;j<fs.length;j++) { if((int)fs[i].toCharArray()[0]>(int)fs[j].toCharArray()[0]) { keyout.set(fs[j]+"-"+fs[i]); }else { keyout.set(fs[i]+"-"+fs[j]); } context.write(keyout, valueout); } } } } } class SecondFindReducer extends Reducer<Text, Text, Text, Text>{ StringBuilder sb = new StringBuilder(); Text outvalue = new Text(); @Override protected void reduce(Text key, Iterable<Text> iter, Reducer<Text, Text, Text, Text>.Context context) throws IOException, InterruptedException { sb.delete(0, sb.length()); Iterator<Text> it = iter.iterator(); if(it.hasNext()) { sb.append(it.next().toString()); } while(it.hasNext()) { sb.append(","+it.next().toString()); } outvalue.set(sb.toString()); context.write(key, outvalue); } }
來自 “ ITPUB部落格 ” ,連結:http://blog.itpub.net/31506529/viewspace-2213390/,如需轉載,請註明出處,否則將追究法律責任。
相關文章
- 在Docker容器中使用Hadoop執行Python MapReduce作業DockerHadoopPython
- MapReduce執行流程
- MapReduce的執行流程概述
- MapReduce程式執行流程
- Hadoop學習——MapReduceHadoop
- hadoop_MapReduce yarnHadoopYarn
- Hadoop(十四)MapReduce概述Hadoop
- MapReduce 示例:減少 Hadoop MapReduce 中的側連線Hadoop
- 使用MapReduce執行WordCount案例
- 第一章:Hadoop生態系統及執行MapReduce任務介紹!Hadoop
- MapReduce 執行全過程解析
- Hadoop面試題之MapReduceHadoop面試題
- Hadoop 專欄 - MapReduce 入門Hadoop
- Hadoop的mapreduce出現問題,報錯The auxService:mapreduce_shuffle does not existHadoopUX
- Hadoop(三)通過C#/python實現Hadoop MapReduceHadoopC#Python
- Hadoop-叢集執行Hadoop
- hadoop(二)—hadoop配置、執行錯誤總結Hadoop
- 30串聯所有單詞的子串
- 從分治演算法到 Hadoop MapReduce演算法Hadoop
- Hadoop學習(二)——MapReduce\Yarn架構HadoopYarn架構
- Hadoop(十九)MapReduce OutputFormat 資料壓縮HadoopORM
- Hadoop面試題總結(三)——MapReduceHadoop面試題
- MapReduce如何作為Yarn應用程式執行?Yarn
- 搭建本地執行Hadoop環境Hadoop
- Hadoop之MapReduce2架構設計Hadoop架構
- 談談Hadoop MapReduce和Spark MR實現HadoopSpark
- hadoop archive合併小檔案並進行mapreduce來減少map的數量HadoopHive
- Hadoop系列,執行jar檔案命令HadoopJAR
- hadoop之mapreduce.input.fileinputformat.split.minsize引數HadoopORM
- Hadoop 學習系列(四)之 MapReduce 原理講解Hadoop
- Hadoop之MapReduce2基礎梳理及案例Hadoop
- IDEA本地執行hadoop程式成功,叢集執行找不到自定義的Mapper類IdeaHadoopAPP
- [leetcode 30 串聯所有單詞的子串 10ms]LeetCode
- Hadoop框架:MapReduce基本原理和入門案例Hadoop框架
- Hadoop 三劍客之 —— 分散式計算框架 MapReduceHadoop分散式框架
- Hadoop學習第四天--MapReduce提交過程Hadoop
- 2. TeraSort在Hadoop分散式叢集中的執行Hadoop分散式
- 大資料學習開發技術:MapReduce執行原理大資料