MapReduce 實現 搜尋指數統計和找到人氣王

菜鳥級的IT之路發表於2018-04-02

專案介紹

本專案我們使用明星搜尋指數資料,分別統計出搜尋指數最高的男明星和女明星。

思路分析

基於專案的需求,我們通過以下幾步完成:

1、編寫 Mapper 類,按需求將資料集解析為 key=gender、value=name+hotIndex,然後輸出。

2、編寫 Combiner 類,合併 Mapper 輸出結果,然後輸出給 Reducer

3、編寫 Partitioner 類,按性別,將結果指定給不同的 Reduce 執行。

4、編寫 Reducer 類,分別統計出男、女明星的最高搜尋指數。

5、編寫 run 方法執行 MapReduce 任務。

資料格式

程式碼


package com.mapreduce;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;

import org.apache.hadoop.conf.Configured;

import org.apache.hadoop.fs.FileSystem;

import org.apache.hadoop.fs.Path;

import org.apache.hadoop.io.Text;

import org.apache.hadoop.mapreduce.Job;

import org.apache.hadoop.mapreduce.Mapper;

import org.apache.hadoop.mapreduce.Partitioner;

import org.apache.hadoop.mapreduce.Reducer;

import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import org.apache.hadoop.util.Tool;

import org.apache.hadoop.util.ToolRunner;

/**
 * MapReduce job that reports, for each gender, the star with the highest search index.
 *
 * <p>Each input line is expected to be tab-separated as {@code name \t gender \t hotIndex}.
 * The mapper keys every record by gender, the combiner and reducer keep only the record with
 * the largest index per gender, and the partitioner routes genders to reducers.
 */
public class Star extends Configured implements Tool {

    /** Input path used by {@link #main} when no command-line arguments are given. */
    private static final String DEFAULT_INPUT = "hdfs://cdh001:9000/actor/actor.txt";

    /** Output directory used by {@link #main} when no command-line arguments are given. */
    private static final String DEFAULT_OUTPUT = "hdfs://cdh001:9000/actor/out/";

    /**
     * Parses one star record per call.
     *
     * <p>Input: key = whatever the input format supplies (unused), value = one text line.
     * Output: key = gender, value = {@code name \t hotIndex}.
     */
    public static class ActorMapper extends Mapper<Object, Text, Text, Text> {

        /** Counter group/name used to report lines that were skipped. */
        private static final String COUNTER_GROUP = "Star";
        private static final String MALFORMED_COUNTER = "MALFORMED_RECORDS";

        // Reused for every record so that map() does not allocate two Text objects per line.
        private final Text outKey = new Text();
        private final Text outValue = new Text();

        /**
         * Emits {@code (gender, name \t hotIndex)} for a well-formed line.
         *
         * <p>Lines with fewer than three tab-separated fields, or whose third field is not an
         * integer, are counted and skipped instead of failing the task; the combiner and
         * reducer parse that field with {@link Integer#parseInt(String)}.
         *
         * @param key     input key (not used)
         * @param value   one line of input: {@code name \t gender \t hotIndex}
         * @param context task context used to emit the output pair and update the counter
         */
        @Override
        public void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            String[] tokens = value.toString().split("\t");
            if (tokens.length < 3) {
                context.getCounter(COUNTER_GROUP, MALFORMED_COUNTER).increment(1);
                return;
            }

            String gender = tokens[1].trim();
            // Trim so that stray whitespace (e.g. a trailing '\r') does not break parsing later.
            String hotIndex = tokens[2].trim();
            try {
                Integer.parseInt(hotIndex);
            } catch (NumberFormatException e) {
                context.getCounter(COUNTER_GROUP, MALFORMED_COUNTER).increment(1);
                return;
            }

            outKey.set(gender);
            outValue.set(tokens[0] + "\t" + hotIndex);
            context.write(outKey, outValue);
        }
    }

    /**
     * Chooses the partition from the gender key: "male" maps to 0, "female" to 1 and anything
     * else to 2, each taken modulo the number of reduce tasks.
     */
    public static class ActorPartitioner extends Partitioner<Text, Text> {

        /**
         * @param key            gender emitted by the mapper
         * @param value          {@code name \t hotIndex} (not used)
         * @param numReduceTasks number of reduce tasks; 0 always yields partition 0
         * @return the partition index for {@code key}
         */
        @Override
        public int getPartition(Text key, Text value, int numReduceTasks) {
            final String gender = key.toString();

            // Guard against a modulo-by-zero below.
            if (numReduceTasks == 0) {
                return 0;
            }

            final int slot;
            if ("male".equals(gender)) {
                slot = 0;
            } else if ("female".equals(gender)) {
                slot = 1;
            } else {
                slot = 2;
            }
            return slot % numReduceTasks;
        }
    }

    /**
     * Combiner that collapses the values it receives for a gender into the single
     * {@code name \t hotIndex} pair with the largest index.
     */
    public static class ActorCombiner extends Reducer<Text, Text, Text, Text> {

        private final Text text = new Text();

        /**
         * @param key     gender
         * @param values  {@code name \t hotIndex} pairs for that gender
         * @param context task context used to emit {@code (gender, name \t maxHotIndex)}
         */
        @Override
        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            // null means "nothing seen yet"; this keeps a record whose index is
            // Integer.MIN_VALUE from being reported with an empty name.
            String bestName = null;
            int maxHotIndex = Integer.MIN_VALUE;

            for (Text val : values) {
                String[] valTokens = val.toString().split("\t");
                int hotIndex = Integer.parseInt(valTokens[1]);
                if (bestName == null || hotIndex > maxHotIndex) {
                    bestName = valTokens[0];
                    maxHotIndex = hotIndex;
                }
            }

            if (bestName == null) {
                return; // no values, nothing to emit
            }
            text.set(bestName + "\t" + maxHotIndex);
            context.write(key, text);
        }
    }

    /**
     * Reducer that finds the highest search index for each gender.
     *
     * <p>Input: key = gender, value = {@code name \t hotIndex}.
     * Output: key = name, value = {@code gender \t maxHotIndex}.
     */
    public static class ActorReducer extends Reducer<Text, Text, Text, Text> {

        private final Text outKey = new Text();
        private final Text outValue = new Text();

        /**
         * @param key     gender
         * @param values  {@code name \t hotIndex} pairs for that gender
         * @param context task context used to emit {@code (name, gender \t maxHotIndex)}
         */
        @Override
        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            // null means "nothing seen yet"; see ActorCombiner for the rationale.
            String bestName = null;
            int maxHotIndex = Integer.MIN_VALUE;

            // Scan all values of this key and remember the record with the largest index.
            for (Text val : values) {
                String[] valTokens = val.toString().split("\t");
                int hotIndex = Integer.parseInt(valTokens[1]);
                if (bestName == null || hotIndex > maxHotIndex) {
                    bestName = valTokens[0];
                    maxHotIndex = hotIndex;
                }
            }

            if (bestName == null) {
                return; // no values, nothing to emit
            }
            outKey.set(bestName);
            outValue.set(key + "\t" + maxHotIndex);
            context.write(outKey, outValue);
        }
    }

    /**
     * Configures and runs the job.
     *
     * <p>Any existing path at the output location is deleted recursively before the job is
     * submitted.
     *
     * @param args {@code args[0]} = input path, {@code args[1]} = output path
     * @return 0 if the job succeeded, 1 if it failed, 2 if the arguments are missing
     * @throws Exception if the file system or the job submission fails
     */
    @Override
    public int run(String[] args) throws Exception {
        if (args == null || args.length < 2) {
            System.err.println("Usage: Star <input path> <output path>");
            return 2;
        }

        // Use the configuration given to this Tool (main() passes one through ToolRunner);
        // fall back to a fresh one only when run() is invoked without it being set.
        Configuration conf = getConf();
        if (conf == null) {
            conf = new Configuration();
        }

        Path inputPath = new Path(args[0]);
        Path outputPath = new Path(args[1]);

        // Remove stale output (directory or file) so the job can write to outputPath.
        FileSystem hdfs = outputPath.getFileSystem(conf);
        if (hdfs.exists(outputPath)) {
            hdfs.delete(outputPath, true);
        }

        Job job = Job.getInstance(conf, "star");
        job.setJarByClass(Star.class);

        job.setNumReduceTasks(2);
        job.setPartitionerClass(ActorPartitioner.class);

        job.setMapperClass(ActorMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        job.setCombinerClass(ActorCombiner.class);

        job.setReducerClass(ActorReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        FileInputFormat.addInputPath(job, inputPath);
        FileOutputFormat.setOutputPath(job, outputPath);

        // Propagate the job outcome instead of always reporting success.
        return job.waitForCompletion(true) ? 0 : 1;
    }

    /**
     * Entry point. Runs the job through {@link ToolRunner} and exits with its return code.
     *
     * @param args optional {@code <input path> <output path>}; when no arguments are given
     *             the built-in default HDFS paths are used
     * @throws Exception if the job cannot be run
     */
    public static void main(String[] args) throws Exception {
        String[] jobArgs = (args == null || args.length == 0)
                ? new String[] {DEFAULT_INPUT, DEFAULT_OUTPUT}
                : args;
        int ec = ToolRunner.run(new Configuration(), new Star(), jobArgs);
        System.exit(ec);
    }
}

程式碼分析

Map

 

每次呼叫 map(Object key, Text value, Context context) 解析一行資料。每行資料儲存在 value 引數值中。然後根據 '\t' 分隔符,解析出明星姓名、性別和搜尋指數。

 

public static class ActorMapper extends Mapper< Object, Text, Text, Text> {

public void map(Object key, Text value, Context context) throws IOException, InterruptedExcept{

 //value=name+gender+hotIndex

String[] tokens = value.toString().split("\t"); String gender = tokens[1].trim();//性別

 

String nameHotIndex = tokens[0] + "\t" + tokens[2];//名稱和搜尋指數 context.write(new Text(gender), new Text(nameHotIndex));

}

}

map()函式期望的輸出結果Map = {key = gender, value = name+hotIndex}

 

Combiner

 

map 端的輸出結果,先進行一次合併,減少資料的網路傳輸。

public static class ActorCombiner extends Reducer< Text, Text, Text, Text> {

private Text text = new Text();

@Override

public void reduce(Text key, Iterable< Text> values, Context context) throws IOException, InterruptedException{

int maxHotIndex = Integer.MIN_VALUE;

int hotIndex = 0;

String name="";

for (Text val : values) {

String[] valTokens = val.toString().split("\\t"); hotIndex = Integer.parseInt(valTokens[1]); if(hotIndex>maxHotIndex){

 

name = valTokens[0]; maxHotIndex = hotIndex;

}

}

text.set(name+"\t"+maxHotIndex); context.write(key, text);

}

}

Partitioner

 

根據明星性別對資料進行分割槽,將 Mapper 的輸出結果按性別分配到不同的 reduce 上。

/**

 *  Partitioner 根據sex選擇分割槽

 */

public static class ActorPartitioner extends Partitioner<Text, Text> {  

        @Override

        public int getPartition(Text key, Text value, int numReduceTasks) {

        

            String sex = key.toString();//按性別分割槽

            

            // 預設指定分割槽 0

            if(numReduceTasks==0)

             return 0;

            

            //性別為male 選擇分割槽0

            if(sex.equals("male"))             

                return 0;

            //性別為female 選擇分割槽1

            if(sex.equals("female"))

             return 1 % numReduceTasks;

            //其他性別 選擇分割槽2

            else

                return 2 % numReduceTasks;

           

        }

}

Reduce

 

呼叫 reduce(key, Iterable<Text> values, context) 方法來處理每個 key 對應的 values 集合。我們在 values 集合中,計算出明星的最大搜尋指數。

/**

 *

 *  Reducer 統計男、女明星最高搜尋指數

 * @input key=gender  value=name+hotIndex

 * @output key=name value=gender+hotIndex(max)

 */

public static class ActorReducer extends Reducer<Text, Text, Text, Text> {

 

@Override

public void reduce(Text key, Iterable<Text> values, Context context)throws IOException, InterruptedException {

 

int maxHotIndex = Integer.MIN_VALUE;

 

String name = " ";

int hotIndex = 0;

// 根據key,迭代 values 集合,求出最高搜尋指數

for (Text val : values) {

String[] valTokens = val.toString().split("\\t");

hotIndex = Integer.parseInt(valTokens[1]);

if (hotIndex > maxHotIndex) {

name = valTokens[0];

maxHotIndex = hotIndex;

}

}

context.write(new Text(name), new Text( key + "\t"+ maxHotIndex));

}

}

reduce()函式期望的輸出結果Reduce = {key = name, value = gender+max(hotIndex)}

 

Run 驅動方法

 

run 方法中,設定任務執行各種資訊。

/**

 *  任務驅動方法

 * @param args

 * @return

 * @throws Exception

 */

@Override

public int run(String[] args) throws Exception {

// TODO Auto-generated method stub

Configuration conf = new Configuration();//讀取配置檔案

Path mypath = new Path(args[1]);

FileSystem hdfs = mypath.getFileSystem(conf);

if (hdfs.isDirectory(mypath)) {

hdfs.delete(mypath, true);

}

 

Job job = new Job(conf, "star");//新建一個任務

job.setJarByClass(Star.class);//主類

job.setNumReduceTasks(2);//reduce的個數設定為2

job.setPartitionerClass(ActorPartitioner.class);//設定Partitioner

job.setMapperClass(ActorMapper.class);//Mapper

job.setMapOutputKeyClass(Text.class);//map 輸出key型別

job.setMapOutputValueClass(Text.class);//map 輸出value型別

job.setCombinerClass(ActorCombiner.class);//設定Combiner

job.setReducerClass(ActorReducer.class);//Reducer

job.setOutputKeyClass(Text.class);//輸出結果 key型別

job.setOutputValueClass(Text.class);//輸出結果 value型別

FileInputFormat.addInputPath(job, new Path(args[0]));// 輸入路徑

FileOutputFormat.setOutputPath(job, new Path(args[1]));// 輸出路徑

job.waitForCompletion(true);//提交任務

return 0;

}

編譯和執行 MapReduce作業

 

1、在 IntelliJ IDEA 環境下,將專案編譯並打包為 star.jar,使用 SSH 將 star.jar 上傳至 hadoop 的 /home/hadoop/actor 目錄下。

 

2、使用cd /home/hadoop/actor切換到當前目錄,通過命令列執行Hadoop作業

3、執行命令:hadoop jar star.jar com.mapreduce.Star

 


相關文章