aggregateByKey與aggregate類似,都會進行兩次聚合(先在各分割槽內聚合,再在分割槽之間合併)。不同的是:aggregate把每個分割槽內的所有元素聚合成一個值;aggregateByKey則在每個分割槽內按key分別聚合,再將各分割槽中相同key的結果合併
def aggregateByKey[U: ClassTag](zeroValue: U, partitioner: Partitioner)(seqOp: (U, V) => U, combOp: (U, U) => U): RDD[(K, U)]
def aggregateByKey[U: ClassTag](zeroValue: U, numPartitions: Int)(seqOp: (U, V) => U, combOp: (U, U) => U): RDD[(K, U)]
def aggregateByKey[U: ClassTag](zeroValue: U)(seqOp: (U, V) => U, combOp: (U, U) => U): RDD[(K, U)]
//資料被分為兩個分割槽
//分割槽1:(1,3),(1,2)
//分割槽2:(1, 4),(2,3),(2,4)
scala> var data = sc.parallelize(List((1,3),(1,2),(1, 4),(2,3),(2,4)),2)
data: org.apache.spark.rdd.RDD[(Int, Int)] = ParallelCollectionRDD[7] at parallelize at <console>:24

//每個分割槽中按key聚合
scala> def InnerCom(a:Int, b:Int) : Int ={
     | println("InnerCom: " + a + " :" + b)
     | math.max(a,b)
     | }
InnerCom: (a: Int, b: Int)Int

//分割槽間的聚合
scala> def PartitionCom(a:Int, b:Int) : Int ={
     | println("PartitionCom: " + a + " :" + b)
     | a + b
     | }
PartitionCom: (a: Int, b: Int)Int

//初始值(zeroValue)為2,在每個分割槽內對每個key各使用一次
//第一個分割槽中只有一個key,兩個元素
//聚合後結果為(1,3)
//第二個分割槽中兩個key,1、2
//聚合後結果為(1,4)、(2,4)
//二次聚合後結果為(1,7)(2,4)
scala> data.aggregateByKey(2)(InnerCom, PartitionCom).collect
InnerCom: 2 :3
InnerCom: 3 :2
InnerCom: 2 :4
InnerCom: 2 :3
InnerCom: 3 :4
PartitionCom: 3 :4
res: Array[(Int, Int)] = Array((2,4), (1,7))