Hadoop 2.6: Implementing Matrix Multiplication with MapReduce, Part 1: Matrix Transpose

Posted by 土豆拍死馬鈴薯 on 2017-10-10

Project repository: https://github.com/tudoupaisimalingshu/hadoop_matrix

Matrix multiplication

I. Theoretical background
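For an M×N matrix A and an N×P matrix B, the product C = A·B is an M×P matrix whose entry in row i, column j is the dot product of row i of A with column j of B:

C[i][j] = A[i][1]*B[1][j] + A[i][2]*B[2][j] + ... + A[i][N]*B[N][j]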



II. How to implement it in a program? A[M][N] * B[N][P]

import java.util.Arrays;

public class Matrix {

	public static void main(String[] args) {
		int[][] matrix1 = {
				{1,2,-2,0},
				{3,3,4,-3},
				{-2,0,2,3},
				{5,3,-1,2},
				{-4,2,0,2}};//left matrix, 5*4
		int[][] matrix2 = {
				{0,3,-1,2,-3},
				{1,3,5,-2,-1},
				{0,1,4,-1,2},
				{-2,2,-1,1,2}};//right matrix, 4*5
		int[][] matrix3 = new int[5][5];//result matrix, 5*5
		for(int i=0;i<5;i++)//compute each row of the result matrix
		{
			int[] row = matrix1[i];//row i of the left matrix
			System.out.println("row=" + Arrays.toString(row));
			for(int j=0;j<5;j++)//compute each column of the result matrix
			{
				int[] line = new int[4];//column j of the right matrix
				for(int k=0;k<4;k++)
				{
					line[k] = matrix2[k][j];
				}//the column vector is vertical, so its elements must be gathered in a loop
				System.out.println("line=" + Arrays.toString(line));
				int result_i_j = 0;//dot product of the two vectors
				for(int m=0;m<4;m++)
				{
					result_i_j += row[m] * line[m];//accumulate the products
				}
				System.out.println("result_i_j=" + result_i_j);
				System.out.println("--------------------");
				matrix3[i][j] = result_i_j;//store the value at the corresponding position of the result
			}
		}
		
		//print the result matrix
		for(int i=0;i<5;i++)
		{
			for(int j=0;j<5;j++)
			{
				System.out.print(matrix3[i][j] + "\t");
			}
			System.out.println();
		}
	}
}


Output:

row=[1, 2, -2, 0]
line=[0, 1, 0, -2]
result_i_j=2
--------------------
line=[3, 3, 1, 2]
result_i_j=7
--------------------
line=[-1, 5, 4, -1]
result_i_j=1
--------------------
line=[2, -2, -1, 1]
result_i_j=0
--------------------
line=[-3, -1, 2, 2]
result_i_j=-9
--------------------
row=[3, 3, 4, -3]
line=[0, 1, 0, -2]
result_i_j=9
--------------------
line=[3, 3, 1, 2]
result_i_j=16
--------------------
line=[-1, 5, 4, -1]
result_i_j=31
--------------------
line=[2, -2, -1, 1]
result_i_j=-7
--------------------
line=[-3, -1, 2, 2]
result_i_j=-10
--------------------
row=[-2, 0, 2, 3]
line=[0, 1, 0, -2]
result_i_j=-6
--------------------
line=[3, 3, 1, 2]
result_i_j=2
--------------------
line=[-1, 5, 4, -1]
result_i_j=7
--------------------
line=[2, -2, -1, 1]
result_i_j=-3
--------------------
line=[-3, -1, 2, 2]
result_i_j=16
--------------------
row=[5, 3, -1, 2]
line=[0, 1, 0, -2]
result_i_j=-1
--------------------
line=[3, 3, 1, 2]
result_i_j=27
--------------------
line=[-1, 5, 4, -1]
result_i_j=4
--------------------
line=[2, -2, -1, 1]
result_i_j=7
--------------------
line=[-3, -1, 2, 2]
result_i_j=-16
--------------------
row=[-4, 2, 0, 2]
line=[0, 1, 0, -2]
result_i_j=-2
--------------------
line=[3, 3, 1, 2]
result_i_j=-2
--------------------
line=[-1, 5, 4, -1]
result_i_j=12
--------------------
line=[2, -2, -1, 1]
result_i_j=-10
--------------------
line=[-3, -1, 2, 2]
result_i_j=14
--------------------
2	7	1	0	-9	
9	16	31	-7	-10	
-6	2	7	-3	16	
-1	27	4	7	-16	
-2	-2	12	-10	14	


III. Problems with the conventional program

1. It cannot run in parallel; execution always proceeds one iteration at a time, as dictated by the loop conditions.

2. If the matrices are too large to fit in memory, they have to live in files. The left matrix is unproblematic: each iteration reads one row into memory, and the next iteration reads the next row. But as the program above shows, the right matrix is consumed column by column, which means traversing all rows and taking one element from each just to assemble a single column vector; for a large file this is far too slow. The sketch below illustrates the asymmetry.
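A minimal sketch of the two access patterns (the plain tab-separated layout and the method names here are illustrative assumptions, not the storage format introduced later in this post):

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;

public class FileAccessSketch {
	//Row i of the left matrix: one sequential pass that stops at line i.
	static int[] readRow(String path, int i) throws IOException {
		try (BufferedReader in = new BufferedReader(new FileReader(path))) {
			String line = null;
			for (int r = 0; r <= i; r++)
				line = in.readLine();
			String[] parts = line.split("\t");
			int[] row = new int[parts.length];
			for (int k = 0; k < parts.length; k++)
				row[k] = Integer.parseInt(parts[k]);
			return row;
		}
	}

	//Column j of the right matrix: every line of the file must be touched,
	//and this repeats once for every column of the result.
	static int[] readColumn(String path, int j, int rows) throws IOException {
		int[] col = new int[rows];
		try (BufferedReader in = new BufferedReader(new FileReader(path))) {
			for (int r = 0; r < rows; r++)
				col[r] = Integer.parseInt(in.readLine().split("\t")[j]);
		}
		return col;
	}
}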

 

IV. Solution

1. For problem 1, introduce Hadoop, a framework for parallel execution whose Map and Reduce tasks run concurrently.

2. For problem 2, transpose the right matrix so that its column vectors become row vectors; a small in-memory illustration follows.
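A quick in-memory illustration of the idea (the distributed version is the subject of section V): once B is transposed into Bt, C[i][j] is the dot product of row i of A and row j of Bt, so both operands can be consumed strictly row by row.

public class TransposeDemo {
	static int[][] transpose(int[][] b) {
		int[][] bt = new int[b[0].length][b.length];
		for (int r = 0; r < b.length; r++)
			for (int c = 0; c < b[0].length; c++)
				bt[c][r] = b[r][c];//element (r,c) moves to (c,r)
		return bt;
	}

	//C[i][j] = dot(rowOfA, rowOfBt): two row reads, no column assembly.
	static int dot(int[] x, int[] y) {
		int s = 0;
		for (int k = 0; k < x.length; k++)
			s += x[k] * y[k];
		return s;
	}
}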



V. Matrix multiplication with Hadoop MapReduce

1. Storage format of the matrices
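Reconstructed from the Mapper's comments in the code below: each matrix is stored one row per line, as the row index, a tab, then comma-separated columnIndex_value pairs. The right matrix from section II, for example, becomes:

1	1_0,2_3,3_-1,4_2,5_-3
2	1_1,2_3,3_5,4_-2,5_-1
3	1_0,2_1,3_4,4_-1,5_2
4	1_-2,2_2,3_-1,4_1,5_2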



Why write all the columns of a row on a single line?

The matrix file may be very large, in which case HDFS splits it into chunks. If all the columns of a row were not written together, elements belonging to the same row could end up in different splits, and extra time and space would later be spent locating and stitching them back together; in other words, an additional reduce step would be needed just to reassemble rows.

 

Why tag every element in a row with its column index?

Because Hadoop runs in parallel, nothing guarantees that the elements of a row arrive in order when the map output is merged in the reduce phase, so each element must carry its own index. Since the row number is unique, the row number plus the explicit column number is enough to keep the computation correct under parallel processing. The helper sketched below shows how tagged pairs can be put back in order.
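For instance, a hypothetical helper (not part of the post's code) can rebuild an ordered row from the tagged pairs no matter how the shuffle ordered them:

//Rebuild an ordered row of length n from tagged "index_value" pairs
//such as "3_0,1_0,4_-2,2_1", regardless of their arrival order.
static int[] toOrderedRow(String tagged, int n) {
	int[] row = new int[n];
	for (String pair : tagged.split(",")) {
		String[] kv = pair.split("_");
		row[Integer.parseInt(kv[0]) - 1] = Integer.parseInt(kv[1]);//tags are 1-based
	}
	return row;
}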

 

 

 

2. MapReduce implementation of the matrix transpose


package hadoop;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;

public class Step1 {
	public static class Mapper1 extends Mapper<LongWritable,Text,Text,Text>
	{
		private Text outKey = new Text();
		private Text outValue = new Text();
		
		/*
			Matrix to transpose:
			0	3	-1	2	-3
			1	3	5	-2	-1
			0	1	4	-1	2
			-2	2	-1	1	2
		*/
		/*
			Target matrix:
			0	1	1	-2
			3	3	1	2
			-1	5	4	-1
			2	-2	-1	1
			-3	-1	2	2
		*/
		//For each input line, taking the first row as the example:
		//key : the byte offset of the line in the file (unused here)
		//value : "1	1_0,2_3,3_-1,4_2,5_-3"
		@Override
		protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, Text>.Context context)
				throws IOException, InterruptedException {
			String[] rowAndLine = value.toString().split("\t");
			//rowAndLine : {"1","1_0,2_3,3_-1,4_2,5_-3"}
			String row = rowAndLine[0];
			//row : "1"
			String[] lines = rowAndLine[1].split(",");
			//rowAndLine[1] : "1_0,2_3,3_-1,4_2,5_-3"
			//lines : {"1_0","2_3","3_-1","4_2","5_-3"}
			for(String line : lines)//for each column entry, e.g. line : "1_0"
			{
				String column = line.split("_")[0];
				//column : 1
				String valueStr = line.split("_")[1];
				//valueStr : 0
				outKey.set(column);
				//the column index becomes the row index
				outValue.set(row + "_" + valueStr);
				//the row index becomes the column index
				context.write(outKey, outValue);
				//emits (1,"1_0")
			}
			//when the loop finishes, {"1_0","2_3","3_-1","4_2","5_-3"} has produced
			//(1,"1_0") row 1, column 1_0    (2,"1_3") row 2, column 1_3    (3,"1_-1") (4,"1_2") (5,"1_-3")
			/*
			Target transposed matrix:
			0	1	1	-2
			3	3	1	2
			-1	5	4	-1
			2	-2	-1	1
			-3	-1	2	2
			*/
			//which corresponds exactly to the first column of the transposed matrix
		}
		/*
			All map calls together produce:
			("1","1_0")	("2","1_3")	("3","1_-1")	("4","1_2")	("5","1_-3")
			("1","2_1")	("2","2_3")	("3","2_5")	("4","2_-2")	("5","2_-1")
			("1","3_0")	("2","3_1")	("3","3_4")	("4","3_-1")	("5","3_2")
			("1","4_-2")	("2","4_2")	("3","4_-1")	("4","4_1")	("5","4_2")
		*/

	}
	

	/*
		The reduce task merges all key-value pairs produced by the map phase
		into the storage representation of the transposed matrix.
		Values with the same key are grouped into one collection, e.g.:
		key : "1"
		values : {"3_0","1_0","4_-2","2_1"}
		Note: this is why the column indices are tagged; the order of the
		values is not necessarily the original column order.
	*/
	
	public static class Reducer1 extends Reducer<Text,Text,Text,Text>
	{
		private Text outKey = new Text();
		private Text outValue = new Text();
		
		@Override
		protected void reduce(Text key, Iterable<Text> values, Context context)
				throws IOException, InterruptedException {
			
			StringBuilder sb = new StringBuilder();
			for(Text text : values)
			{
				sb.append(text + ",");
			}
			//sb : "3_0,1_0,4_-2,2_1,"
			//note the trailing comma
			String line = "";
			if(sb.toString().endsWith(","))
			{
				line = sb.substring(0,sb.length()-1);
			}
			//trailing comma removed
			//line : "3_0,1_0,4_-2,2_1"
			outKey.set(key);
			outValue.set(line);
			//("1","3_0,1_0,4_-2,2_1")
			context.write(outKey, outValue);
		}
		
	}
	
	private static final String INPATH = "input/matrix.txt";//input file path
	private static final String OUTPATH = "output/step1";//output path
	private static final String HDFS = "hdfs://pc1:9000";//HDFS URI
	
	public void run() throws IOException, ClassNotFoundException, InterruptedException {
		Configuration conf = new Configuration();
		//String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
		//String[] otherArgs = {"hdfs://pc1:9000/input/chenjie.txt","hdfs://pc1:9000/output/out4"};
		String[] otherArgs = {"input/matrix.txt","hdfs://pc1:9000/output/step1"};
		//the input and output HDFS paths are configured here
		if (otherArgs.length != 2) {
			System.err.println("Usage: step1 <in> <out>");
			System.exit(2);
		}
		//conf.set("fs.defaultFS",HDFS);
		Job job = Job.getInstance(conf, "step1");//create the job and set its name
		job.setJarByClass(Step1.class);
		job.setMapperClass(Mapper1.class);//set the Mapper class for the job
		job.setReducerClass(Reducer1.class);//set the Reducer class for the job

		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(Text.class);

		job.setOutputKeyClass(Text.class);//set the output key type
		job.setOutputValueClass(Text.class);//set the output value type

		job.setOutputFormatClass(SequenceFileOutputFormat.class);
		FileInputFormat.addInputPath(job, new Path(otherArgs[0]));//set the input path for the job

		FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));//set the output path for the job
		System.exit(job.waitForCompletion(true) ? 0 : 1);
		
		/*Alternative setup kept from the original post; it also clears an existing output directory:
		Configuration conf = new Configuration();
		conf.set("fs.defaultFS",HDFS);
		Job job = Job.getInstance(conf,"step1");
		job.setJarByClass(Step1.class);
		job.setMapperClass(Mapper1.class);
		job.setReducerClass(Reducer1.class);
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(Text.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(Text.class);
		FileSystem fs = FileSystem.get(conf);
		Path inPath = new Path(INPATH);
		if(fs.exists(inPath))
		{
			//FileInputFormat.addInputPath(conf, inPath);
		}
		Path outPath = new Path(OUTPATH);
		if(fs.exists(outPath))
		{
			fs.delete(outPath, true);
		}*/
		
	}
	
	public static void main(String[] args)
	{
		try {
			new Step1().run();
		} catch (ClassNotFoundException | IOException | InterruptedException e) {
			e.printStackTrace();
		}
	}
	
}
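Building and launching are not covered in the post; assuming the class is packaged into a jar (the jar name below is hypothetical), the job would be submitted along these lines. Note that main() ignores its command-line arguments, since the input and output paths are hardcoded in run():

hadoop jar matrix.jar hadoop.Step1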


Run result:




Use hadoop fs -text <file path> to view the transposed result:
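Because the job writes SequenceFile output, hadoop fs -text (unlike hadoop fs -cat) decodes it into readable text. Assuming a single reducer, the result would be viewed with something like the following (the part file name depends on the reducer count):

hadoop fs -text /output/step1/part-r-00000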



Then move on to the matrix multiplication itself (covered in the next post).
