1. Overview
Today we continue with a few more Transformation operators:
- mapPartitionsWithIndex
- repartition
- coalesce
- groupByKey
- zip
- zipWithIndex
2. Details
- mapPartitionsWithIndex
Similar to mapPartitions, except that the function also receives the index of the partition it is processing.
Java code:
```java
package com.spark.spark.transformations;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function2;

public class Operator_mapPartitionWithIndex {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf();
        conf.setMaster("local").setAppName("mapPartitionWithIndex");
        JavaSparkContext sc = new JavaSparkContext(conf);
        List<String> names = Arrays.asList("zhangsan1", "zhangsan2", "zhangsan3", "zhangsan4");
        /**
         * The second argument sets the parallelism, which is also the number of partitions
         * of the RDD; as a rule of thumb it is set to 2~3 times the number of cores.
         */
        JavaRDD<String> parallelize = sc.parallelize(names, 3);
        JavaRDD<String> mapPartitionsWithIndex = parallelize.mapPartitionsWithIndex(
                new Function2<Integer, Iterator<String>, Iterator<String>>() {

            private static final long serialVersionUID = 1L;

            @Override
            public Iterator<String> call(Integer index, Iterator<String> iter) throws Exception {
                List<String> list = new ArrayList<String>();
                while (iter.hasNext()) {
                    String s = iter.next();
                    list.add(s + "~");
                    System.out.println("partition id is " + index + ", value is " + s);
                }
                return list.iterator();
            }
        }, true);
        mapPartitionsWithIndex.collect();
        sc.stop();
    }
}
```
scala程式碼:
import org.apache.spark.SparkConf import org.apache.spark.SparkContext import scala.collection.mutable.ListBuffer object Operator_mapPartitionsWithIndex { def main(args: Array[String]): Unit = { val conf = new SparkConf() conf.setMaster("local").setAppName("mapPartitionsWithIndex") val sc = new SparkContext(conf) val rdd = sc.makeRDD(List("a","b","c"),3) rdd.mapPartitionsWithIndex((index,iter)=>{ val list = ListBuffer[String]() while(iter.hasNext){ val v = iter.next() list.+(v) println("index = "+index+" , value = "+v) } list.iterator }, true).foreach(println) sc.stop(); } }
Code explanation:
Result:
- coalesce
coalesce is usually used to reduce the number of partitions. Its second argument controls whether the repartitioning produces a shuffle:
true produces a shuffle, false does not; the default is false.
If the partition count passed to coalesce is larger than the source RDD's partition count, the call has no effect when the second argument is false; with the second argument set to true, the effect is the same as repartition, i.e. repartition(numPartitions) = coalesce(numPartitions, true), as the short sketch below illustrates.
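A minimal Scala sketch of this behaviour (a local run with illustrative names; it is not part of the original examples) that only compares the resulting partition counts:

```scala
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext

// Compare the partition counts produced by coalesce and repartition on a 3-partition RDD.
object CoalesceVsRepartition {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local").setAppName("coalesceVsRepartition")
    val sc = new SparkContext(conf)
    val rdd = sc.makeRDD(1 to 12, 3)                   // 3 partitions

    println(rdd.coalesce(2, false).partitions.length)  // 2: shrinking needs no shuffle
    println(rdd.coalesce(6, false).partitions.length)  // 3: growing without a shuffle has no effect
    println(rdd.coalesce(6, true).partitions.length)   // 6: growing works once a shuffle is allowed
    println(rdd.repartition(6).partitions.length)      // 6: same result as coalesce(6, true)

    sc.stop()
  }
}
```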
Java code:
```java
package com.spark.spark.transformations;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function2;

/**
 * coalesce reduces the number of partitions.
 * The second argument controls whether the repartitioning produces a shuffle:
 * true produces a shuffle, false does not; the default is false.
 * If the target partition count is larger than the source RDD's and the second
 * argument is false (no shuffle), the call has no effect.
 * With the second argument set to true the effect is the same as repartition,
 * i.e. coalesce(numPartitions, true) = repartition(numPartitions).
 *
 * @author root
 */
public class Operator_coalesce {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf();
        conf.setMaster("local").setAppName("coalesce");
        JavaSparkContext sc = new JavaSparkContext(conf);
        List<String> list = Arrays.asList(
                "love1", "love2", "love3",
                "love4", "love5", "love6",
                "love7", "love8", "love9",
                "love10", "love11", "love12"
        );
        JavaRDD<String> rdd1 = sc.parallelize(list, 3);
        JavaRDD<String> rdd2 = rdd1.mapPartitionsWithIndex(
                new Function2<Integer, Iterator<String>, Iterator<String>>() {

            private static final long serialVersionUID = 1L;

            @Override
            public Iterator<String> call(Integer partitionId, Iterator<String> iter) throws Exception {
                List<String> list = new ArrayList<String>();
                while (iter.hasNext()) {
                    list.add("rdd1 partition index: [" + partitionId + "], value: " + iter.next());
                }
                return list.iterator();
            }
        }, true);
        JavaRDD<String> coalesceRDD = rdd2.coalesce(2, false); // no shuffle
        //JavaRDD<String> coalesceRDD = rdd2.coalesce(2, true); // produces a shuffle
        //JavaRDD<String> coalesceRDD = rdd2.coalesce(4, false); // target partition count larger than the source RDD's without a shuffle: no effect
        //System.out.println("coalesceRDD partitions length = " + coalesceRDD.partitions().size());
        //JavaRDD<String> coalesceRDD = rdd2.coalesce(5, true); // target partition count larger than the source RDD's with a shuffle: equivalent to repartition
        //JavaRDD<String> coalesceRDD = rdd2.repartition(4);
        JavaRDD<String> result = coalesceRDD.mapPartitionsWithIndex(
                new Function2<Integer, Iterator<String>, Iterator<String>>() {

            private static final long serialVersionUID = 1L;

            @Override
            public Iterator<String> call(Integer partitionId, Iterator<String> iter) throws Exception {
                List<String> list = new ArrayList<String>();
                while (iter.hasNext()) {
                    list.add("coalesceRDD partition index: [" + partitionId + "], value: " + iter.next());
                }
                return list.iterator();
            }
        }, true);
        for (String s : result.collect()) {
            System.out.println(s);
        }
        sc.stop();
    }
}
```
Scala code:
```scala
package com.bjsxt.spark.transformations

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext

import scala.collection.mutable.ListBuffer

object Operator_coalesce {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
    conf.setMaster("local").setAppName("coalesce")
    val sc = new SparkContext(conf)
    val rdd1 = sc.makeRDD(List(1, 2, 3, 4, 5, 6, 7), 3)
    val rdd2 = rdd1.mapPartitionsWithIndex((partitionIndex, iter) => {
      val list = new ListBuffer[String]()
      while (iter.hasNext) {
        list += "rdd1 partitionIndex : " + partitionIndex + ", value : " + iter.next()
      }
      list.iterator
    })
    rdd2.foreach(println)
    val rdd3 = rdd2.coalesce(2, false) // reduce to 2 partitions without a shuffle
    val result = rdd3.mapPartitionsWithIndex((partitionIndex, iter) => {
      val list = ListBuffer[String]()
      while (iter.hasNext) {
        list += "coalesce partitionIndex : " + partitionIndex + ", value : " + iter.next()
      }
      list.iterator
    })
    result.foreach(println)
    sc.stop()
  }
}
```
Code explanation:
`JavaRDD<String> coalesceRDD = rdd2.coalesce(2, true); // produces a shuffle`
![](https://i.iter01.com/images/b0936a3d99d2a37bbdf04e927825255d4ca36a642297355cebad38e5bef791c5.png)
Code result:
![](https://i.iter01.com/images/79829ad1fb88b145addc22518a5885c2306ba8dd94c5c9c47f4a859e7e5c63e3.png)
`JavaRDD<String> coalesceRDD = rdd2.coalesce(2, false); // no shuffle`
Code explanation:
Code result:
`JavaRDD<String> coalesceRDD = rdd2.coalesce(4, false); // target partition count larger than the source RDD's without a shuffle: no effect`
Code result:
![](https://i.iter01.com/images/eef6f1a980b755d7f74b7cd3e4c47fc7c9c4478ef9d64f04a37fee4ac3870a9c.png)
`JavaRDD<String> coalesceRDD = rdd2.coalesce(4, true); // target partition count larger than the source RDD's with a shuffle: equivalent to repartition`
Code result:
- repartition
Increases or decreases the number of partitions and always produces a shuffle: repartition(numPartitions) is equivalent to coalesce(numPartitions, true). To merge partitions without a shuffle, use coalesce with the shuffle flag set to false instead.
Java code:
```java
package com.spark.spark.transformations;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function2;

/**
 * repartition
 * Increases or decreases the number of partitions and always produces a shuffle
 * (it is coalesce(numPartitions, shuffle = true)).
 *
 * @author root
 */
public class Operator_repartition {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf();
        conf.setMaster("local").setAppName("repartition");
        JavaSparkContext sc = new JavaSparkContext(conf);
        List<String> list = Arrays.asList(
                "love1", "love2", "love3",
                "love4", "love5", "love6",
                "love7", "love8", "love9",
                "love10", "love11", "love12"
        );
        JavaRDD<String> rdd1 = sc.parallelize(list, 3);
        JavaRDD<String> rdd2 = rdd1.mapPartitionsWithIndex(
                new Function2<Integer, Iterator<String>, Iterator<String>>() {

            private static final long serialVersionUID = 1L;

            @Override
            public Iterator<String> call(Integer partitionId, Iterator<String> iter) throws Exception {
                List<String> list = new ArrayList<String>();
                while (iter.hasNext()) {
                    list.add("rdd1 partition index: [" + partitionId + "], value: " + iter.next());
                }
                return list.iterator();
            }
        }, true);
        //JavaRDD<String> repartitionRDD = rdd2.repartition(1);
        JavaRDD<String> repartitionRDD = rdd2.repartition(2);
        //JavaRDD<String> repartitionRDD = rdd2.repartition(6);
        JavaRDD<String> result = repartitionRDD.mapPartitionsWithIndex(
                new Function2<Integer, Iterator<String>, Iterator<String>>() {

            private static final long serialVersionUID = 1L;

            @Override
            public Iterator<String> call(Integer partitionId, Iterator<String> iter) throws Exception {
                List<String> list = new ArrayList<String>();
                while (iter.hasNext()) {
                    list.add("repartitionRDD partition index: [" + partitionId + "], value: " + iter.next());
                }
                return list.iterator();
            }
        }, true);
        for (String s : result.collect()) {
            System.out.println(s);
        }
        sc.stop();
    }
}
```
Scala code:
```scala
package com.bjsxt.spark.transformations

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext

import scala.collection.mutable.ListBuffer

object Operator_repartition {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
    conf.setMaster("local").setAppName("repartition")
    val sc = new SparkContext(conf)
    val rdd1 = sc.makeRDD(List(1, 2, 3, 4, 5, 6, 7), 3)
    val rdd2 = rdd1.mapPartitionsWithIndex((partitionIndex, iter) => {
      val list = new ListBuffer[String]()
      while (iter.hasNext) {
        list += "rdd1 partitionIndex : " + partitionIndex + ", value : " + iter.next()
      }
      list.iterator
    })
    rdd2.foreach(println)
    val rdd3 = rdd2.repartition(4)
    val result = rdd3.mapPartitionsWithIndex((partitionIndex, iter) => {
      val list = ListBuffer[String]()
      while (iter.hasNext) {
        list += "repartition partitionIndex : " + partitionIndex + ", value : " + iter.next()
      }
      list.iterator
    })
    result.foreach(println)
    sc.stop()
  }
}
```
Code explanation:
`JavaRDD<String> repartitionRDD = rdd2.repartition(2);`
![](https://i.iter01.com/images/b0936a3d99d2a37bbdf04e927825255d4ca36a642297355cebad38e5bef791c5.png)
Code result:
![](https://i.iter01.com/images/74e7fa83cf45c6a1e2fa9d718485ad5565350c470e78b5901aa3cd21d84573ce.png)
`JavaRDD<String> repartitionRDD = rdd2.repartition(1); // merge everything into a single partition (still goes through a shuffle)`
Code result:
- groupByKey (a transformation operator; be careful to distinguish it from reduceByKey)
Operates on a K,V-format RDD and groups the values by key: applied to (K, V) pairs, it returns (K, Iterable<V>). A short sketch of the difference from reduceByKey follows.
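To make the contrast with reduceByKey concrete, here is a minimal Scala sketch (local run, illustrative data; not part of the original examples): groupByKey only collects the values per key, while reduceByKey also folds them with the supplied function.

```scala
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext

// groupByKey groups the values of each key; reduceByKey aggregates them with a function.
object GroupByKeyVsReduceByKey {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local").setAppName("groupByKeyVsReduceByKey")
    val sc = new SparkContext(conf)
    val pairs = sc.makeRDD(List(("a", 1), ("a", 2), ("b", 3), ("b", 4)))

    pairs.groupByKey().foreach(println)       // e.g. (a,CompactBuffer(1, 2)) and (b,CompactBuffer(3, 4))
    pairs.reduceByKey(_ + _).foreach(println) // e.g. (a,3) and (b,7)

    sc.stop()
  }
}
```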
Java code:
```java
package com.spark.spark.transformations;

import java.util.Arrays;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.VoidFunction;

import scala.Tuple2;

public class Operator_groupByKey {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf();
        conf.setMaster("local").setAppName("groupByKey");
        JavaSparkContext sc = new JavaSparkContext(conf);
        JavaPairRDD<String, Integer> parallelizePairs = sc.parallelizePairs(Arrays.asList(
                new Tuple2<String, Integer>("a", 1),
                new Tuple2<String, Integer>("a", 2),
                new Tuple2<String, Integer>("b", 3),
                new Tuple2<String, Integer>("c", 4),
                new Tuple2<String, Integer>("d", 5),
                new Tuple2<String, Integer>("d", 6)
        ));
        JavaPairRDD<String, Iterable<Integer>> groupByKey = parallelizePairs.groupByKey();
        groupByKey.foreach(new VoidFunction<Tuple2<String, Iterable<Integer>>>() {

            private static final long serialVersionUID = 1L;

            @Override
            public void call(Tuple2<String, Iterable<Integer>> t) throws Exception {
                System.out.println(t);
            }
        });
    }
}
```
Scala code:
```scala
package com.bjsxt.spark.transformations

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext

object Operator_groupByKey {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
    conf.setMaster("local").setAppName("groupByKey")
    val sc = new SparkContext(conf)
    val rdd1 = sc.makeRDD(Array(
      (1, "a"),
      (1, "b"),
      (2, "c"),
      (3, "d")
    ))
    val result = rdd1.groupByKey()
    result.foreach(println)
    sc.stop()
  }
}
```
Code result:
- zip
Pairs up the elements of two RDDs (KV or non-KV format) by position into a single KV-format RDD. The two RDDs must contain the same number of elements; more precisely, they must have the same number of partitions and the same number of elements in each partition (see the sketch below).
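As a minimal Scala sketch of this constraint (local run, illustrative data; not part of the original examples), both RDDs below are built with the same partition count and the same number of elements, and the failing case is only hinted at in a comment:

```scala
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext

// zip pairs the elements of the two RDDs by position into (K, V) tuples.
object ZipSketch {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local").setAppName("zipSketch")
    val sc = new SparkContext(conf)

    val names  = sc.makeRDD(List("zhangsan", "lisi", "wangwu"), 2)
    val scores = sc.makeRDD(List(100, 200, 300), 2)

    names.zip(scores).foreach(println) // (zhangsan,100), (lisi,200), (wangwu,300)

    // names.zip(sc.makeRDD(List(100, 200, 300, 400), 2)) would fail:
    // zip requires the same number of elements in every partition.

    sc.stop()
  }
}
```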
Java code:
```java
package com.spark.spark.transformations;

import java.util.Arrays;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.VoidFunction;

import scala.Tuple2;

public class Operator_zip {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf();
        conf.setMaster("local").setAppName("zip");
        JavaSparkContext sc = new JavaSparkContext(conf);
        JavaRDD<String> nameRDD = sc.parallelize(Arrays.asList("zhangsan", "lisi", "wangwu"));
        JavaRDD<Integer> scoreRDD = sc.parallelize(Arrays.asList(100, 200, 300));
        //JavaRDD<Integer> scoreRDD = sc.parallelize(Arrays.asList(100, 200, 300, 400));
        JavaPairRDD<String, Integer> zip = nameRDD.zip(scoreRDD);
        zip.foreach(new VoidFunction<Tuple2<String, Integer>>() {

            private static final long serialVersionUID = 1L;

            @Override
            public void call(Tuple2<String, Integer> tuple) throws Exception {
                System.out.println("tuple --- " + tuple);
            }
        });
        //JavaPairRDD<String, String> parallelizePairs = sc.parallelizePairs(Arrays.asList(
        //        new Tuple2<String, String>("a", "aaa"),
        //        new Tuple2<String, String>("b", "bbb"),
        //        new Tuple2<String, String>("c", "ccc")
        //));
        //JavaPairRDD<String, String> parallelizePairs1 = sc.parallelizePairs(Arrays.asList(
        //        new Tuple2<String, String>("1", "111"),
        //        new Tuple2<String, String>("2", "222"),
        //        new Tuple2<String, String>("3", "333")
        //));
        //JavaPairRDD<Tuple2<String, String>, Tuple2<String, String>> result = parallelizePairs.zip(parallelizePairs1);
        sc.stop();
    }
}
```
Scala code:
```scala
package com.bjsxt.spark.transformations

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext

/**
 * Combines the elements of two RDDs (KV or non-KV format) into one KV-format RDD;
 * the two RDDs must contain the same number of elements.
 */
object Operator_zip {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
    conf.setAppName("zip").setMaster("local")
    val sc = new SparkContext(conf)
    val nameRDD = sc.makeRDD(Array("zhangsan", "lisi", "wangwu"))
    val scoreRDD = sc.parallelize(Array(1, 2, 3))
    val result = nameRDD.zip(scoreRDD)
    result.foreach(println)
    sc.stop()
  }
}
```
Result:
- zipWithIndex
Combines each element of the RDD with that element's index in the RDD (starting at 0) into a (K, V) pair.
Java code:
```java
package com.spark.spark.transformations;

import java.util.Arrays;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.VoidFunction;

import scala.Tuple2;

/**
 * zipWithIndex combines each element of the RDD with its index in the RDD
 * (starting at 0) into a (K, V) pair.
 *
 * @author root
 */
public class Operator_zipWithIndex {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf();
        conf.setMaster("local").setAppName("zipWithIndex");
        JavaSparkContext sc = new JavaSparkContext(conf);
        JavaRDD<String> nameRDD = sc.parallelize(Arrays.asList("zhangsan", "lisi", "wangwu"));
        JavaPairRDD<String, Long> zipWithIndex = nameRDD.zipWithIndex();
        zipWithIndex.foreach(new VoidFunction<Tuple2<String, Long>>() {

            private static final long serialVersionUID = 1L;

            @Override
            public void call(Tuple2<String, Long> t) throws Exception {
                System.out.println("t ---- " + t);
            }
        });
        //JavaPairRDD<String, String> parallelizePairs = sc.parallelizePairs(Arrays.asList(
        //        new Tuple2<String, String>("a", "aaa"),
        //        new Tuple2<String, String>("b", "bbb"),
        //        new Tuple2<String, String>("c", "ccc")
        //));
        //JavaPairRDD<Tuple2<String, String>, Long> zipWithIndex2 = parallelizePairs.zipWithIndex();
        //zipWithIndex2.foreach(new VoidFunction<Tuple2<Tuple2<String, String>, Long>>() {
        //
        //    private static final long serialVersionUID = 1L;
        //
        //    @Override
        //    public void call(Tuple2<Tuple2<String, String>, Long> t) throws Exception {
        //        System.out.println(" t ----" + t);
        //    }
        //});
        sc.stop();
    }
}
```
Scala code:
```scala
package com.bjsxt.spark.transformations

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext

/**
 * Combines each element of the RDD with that element's index in the RDD
 * (starting at 0) into a (K, V) pair.
 */
object zipWithIndex {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
    conf.setMaster("local").setAppName("zipWithIndex")
    val sc = new SparkContext(conf)
    val rdd1 = sc.makeRDD(Array((1, "a"), (2, "b"), (3, "c")))
    val result = rdd1.zipWithIndex()
    result.foreach(println)
    sc.stop()
  }
}
```
Code results:
Java result:
Scala result: