RDD的詳解
RDD:彈性分散式資料集,是Spark中最基本的資料抽象,用來表示分散式集合,支援分散式操作!
RDD的建立
RDD中的資料可以來源於2個地方:本地集合或外部資料來源
RDD操作
分類
轉換運算元
Map
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
/**
 * Word-count demo that counts comma-separated words in `spark/data/words.txt`
 * in three different ways and prints every result:
 *
 *  1. `groupBy` followed by `map` (produces "word,count" strings)
 *  2. `map` to `(word, 1)` followed by `reduceByKey`
 *  3. `map` to `(word, 1)` followed by `groupByKey` and `mapValues`
 */
object Demo03Map {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf()
    conf.setAppName("Demo03Map").setMaster("local")
    val sc: SparkContext = new SparkContext(conf)
    try {
      // Read the input file; each element is one line of text.
      val linesRDD: RDD[String] = sc.textFile("spark/data/words.txt")
      // Flatten: every comma-separated token becomes its own element.
      val flatRDD: RDD[String] = linesRDD.flatMap(_.split(","))

      // Approach 1: group identical words, then count the size of each group.
      val groupRDD: RDD[(String, Iterable[String])] = flatRDD.groupBy(w => w)
      val wordsRDD: RDD[String] = groupRDD.map { case (word, occurrences) =>
        word + "," + occurrences.size
      }

      // Both remaining approaches start from the same (word, 1) pairs.
      val pairRDD: RDD[(String, Int)] = flatRDD.map((_, 1))

      // Approach 2: sum the ones per key.
      val words1: RDD[(String, Int)] = pairRDD.reduceByKey(_ + _)

      // Approach 3: group the ones per key, then count them.
      val words2: RDD[(String, Iterable[Int])] = pairRDD.groupByKey()
      val wordSum: RDD[(String, Int)] = words2.mapValues(_.size)

      // Output, in the same order as before: approach 3, approach 1, approach 2.
      wordSum.foreach(println)
      wordsRDD.foreach(println)
      words1.foreach(println)
    } finally {
      // Always shut the context down, even if a job above fails.
      sc.stop()
    }
  }
}
flatMap(資料扁平化處理)
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
/**
 * Demonstrates `flatMap`: each input string is split on commas and the
 * resulting words are flattened into a single RDD, which is then printed.
 */
object Demo04FlatMap {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setAppName("Demo04FlatMap").setMaster("local")
    val sc: SparkContext = new SparkContext(conf)
    try {
      val linesRDD: RDD[String] = sc.parallelize(List("java,scala,python", "map,java,scala"))
      // Flatten: one output element per comma-separated word.
      val flatRDD: RDD[String] = linesRDD.flatMap(line => line.split(","))
      flatRDD.foreach(println)
    } finally {
      // Always shut the context down, even if the job fails.
      sc.stop()
    }
  }
}
mapPartitions
map和mapPartitions區別
1)map:每次處理一條資料
2)mapPartitions:每次處理一個分割槽資料
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
/**
 * Demonstrates `mapPartitions`: the supplied function receives an iterator over
 * one partition's lines, so the "map partition" message is printed each time the
 * function is invoked for a partition rather than once per line.
 *
 * The file is read with a requested minimum of 3 partitions, and the second
 * comma-separated field of every line is printed.
 */
object Demo05MapPartition {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setAppName("Demo05MapPartition").setMaster("local")
    val sc: SparkContext = new SparkContext(conf)
    try {
      val stuRDD: RDD[String] = sc.textFile("spark/data/words.txt", 3)
      stuRDD
        .mapPartitions { partitionLines =>
          // `partitionLines` is an Iterator[String] over one partition, not an RDD.
          println("map partition")
          // NOTE(review): index 1 assumes every line has at least two fields - confirm against the data.
          partitionLines.map(line => line.split(",")(1))
        }
        .foreach(println)
    } finally {
      // Always shut the context down, even if the job fails.
      sc.stop()
    }
  }
}
filter 過濾
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
/**
 * Demonstrates the `filter` transformation: keeps only the odd numbers
 * from a small in-memory list and prints them.
 */
object Demo06Filter {
  def main(args: Array[String]): Unit = {
    // The app name now matches this object (it was copy-pasted as "Demo05MapPartition").
    val conf: SparkConf = new SparkConf().setAppName("Demo06Filter").setMaster("local")
    val sc: SparkContext = new SparkContext(conf)
    try {
      val linesRDD: RDD[Int] = sc.parallelize(List(1, 2, 3, 4, 5))
      // filter is a transformation; elements for which the predicate is true are kept.
      linesRDD
        .filter(n => n % 2 == 1)
        .foreach(println)
    } finally {
      // Always shut the context down, even if the job fails.
      sc.stop()
    }
  }
}
sample 取樣
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD
/**
 * Demonstrates the `sample` transformation on `spark/data/students.txt`.
 *
 * `sample` takes:
 *  - `withReplacement`: whether a drawn element is put back before the next draw;
 *    `true` means the same element may appear more than once in the sample.
 *  - `fraction`: a Double giving how much to draw, e.g. 0.3 for roughly 30%.
 */
object Demo07Sample {
  def main(args: Array[String]): Unit = {
    // The app name now matches this object (it was copy-pasted as "Demo05MapPartition").
    val conf: SparkConf = new SparkConf().setAppName("Demo07Sample").setMaster("local")
    val sc: SparkContext = new SparkContext(conf)
    try {
      val stuRDD: RDD[String] = sc.textFile("spark/data/students.txt", 3)
      // Both arguments are named so the call reads unambiguously.
      stuRDD
        .sample(withReplacement = true, fraction = 0.1)
        .foreach(println)
    } finally {
      // Always shut the context down, even if the job fails.
      sc.stop()
    }
  }
}
union 將相同結構的資料連線到一起
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
/**
 * Demonstrates `union`: concatenates two RDDs of the same element type.
 *
 * Prints the partition count of the first input, the partition count of the
 * unioned RDD, and then every element of the union.
 */
object Demo08Union {
  def main(args: Array[String]): Unit = {
    // The app name now matches this object (it was copy-pasted as "Demo05MapPartition").
    val conf: SparkConf = new SparkConf().setAppName("Demo08Union").setMaster("local")
    val sc: SparkContext = new SparkContext(conf)
    try {
      val lineRDD1: RDD[String] = sc.parallelize(List("java,scala", "data,python"))
      val lineRDD2: RDD[String] = sc.parallelize(List("spark,scala", "java,python"))
      println(lineRDD1.getNumPartitions)

      // union joins two RDDs with the same structure into one.
      val unionRDD: RDD[String] = lineRDD1.union(lineRDD2)
      println(unionRDD.getNumPartitions)
      unionRDD.foreach(println)
    } finally {
      // Always shut the context down, even if the job fails.
      sc.stop()
    }
  }
}
mapPartitionsWithIndex
// mapPartitionsWithIndex is also a transformation operator.
// While each partition is processed, the function also receives that partition's index,
// so it can choose which partitions to act on.
stuRDD
  .mapPartitionsWithIndex { (partitionId, partitionLines) =>
    println("當前遍歷的分割槽:" + partitionId)
    // Process the data partition by partition: keep the second comma-separated field.
    partitionLines.map(_.split(",")(1))
  }
  .foreach(println)
join 將資料按照相同key進行關聯(資料必須是(K,V))
import java.io
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
/**
 * Demonstrates `join` and `leftOuterJoin` on key-value RDDs keyed by a student id.
 *
 *  - The inner join of names and ages is printed twice, once via tuple accessors
 *    ("id,name,age") and once via pattern matching ("id*name*age").
 *  - The left outer join of names and genders is printed as "id*name*gender",
 *    with "_" standing in when no gender is associated with the id.
 */
object Demo09Join {
  def main(args: Array[String]): Unit = {
    // The app name now matches this object (it was copy-pasted as "Demo05MapPartition").
    val conf: SparkConf = new SparkConf().setAppName("Demo09Join").setMaster("local")
    val sc: SparkContext = new SparkContext(conf)
    try {
      // Build the key-value RDDs.
      val tuple2RDD1: RDD[(String, String)] = sc.parallelize(List(("001", "張三"), "002" -> "小紅", "003" -> "小明"))
      val tuple2RDD2: RDD[(String, Int)] = sc.parallelize(List(("001", 20), "002" -> 22, "003" -> 21))
      val tuple2RDD3: RDD[(String, String)] = sc.parallelize(List(("001", "男"), "002" -> "女"))

      // Inner join on the key.
      val joinRDD: RDD[(String, (String, Int))] = tuple2RDD1.join(tuple2RDD2)

      // First style: positional tuple accessors.
      joinRDD.map(kv => {
        val i: String = kv._1
        val j: String = kv._2._1
        val k: Int = kv._2._2
        i + "," + j + "," + k
      }).foreach(println)

      // Second style: destructure with a pattern match.
      joinRDD.map {
        case (id: String, (name: String, age: Int)) => id + "*" + name + "*" + age
      }.foreach(println)

      // Left outer join: every left-side key is kept; the right side is an Option.
      val leftJoinRDD: RDD[(String, (String, Option[String]))] = tuple2RDD1.leftOuterJoin(tuple2RDD3)
      leftJoinRDD
        .map { case (id, (name, gender)) =>
          // "_" marks ids that have no matching gender record.
          id + "*" + name + "*" + gender.getOrElse("_")
        }
        // An action is required here: without one the lazy map above never runs.
        .foreach(println)
    } finally {
      // Always shut the context down, even if a job fails.
      sc.stop()
    }
  }
}
groupByKey 將kv格式的資料按照key進行分組
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
/**
 * Counts students per class from `spark/data/students.txt` in two ways and
 * prints "class,count" for each:
 *
 *  1. `groupBy` with a function that extracts the grouping field
 *  2. `map` to `(class, line)` pairs followed by `groupByKey`
 *
 * The class is taken from the fifth comma-separated field (index 4) of each line.
 */
object Demo10GroupByKey {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setAppName("Demo10GroupByKey").setMaster("local")
    val sc: SparkContext = new SparkContext(conf)
    try {
      val linesRDD: RDD[String] = sc.textFile("spark/data/students.txt")

      // groupBy: group by a field chosen by the supplied function.
      linesRDD
        .groupBy(line => line.split(",")(4))
        .map { case (clazz, students) => clazz + "," + students.size }
        .foreach(println)

      // groupByKey: needs a key-value RDD, so key each line by its class first.
      val linesMap: RDD[(String, String)] = linesRDD.map(line => (line.split(",")(4), line))
      linesMap
        .groupByKey()
        .map { case (clazz, students) => clazz + "," + students.size }
        .foreach(println)
    } finally {
      // Always shut the context down, even if a job fails.
      sc.stop()
    }
  }
}
ReduceByKey
reduceByKey 需要接收一個聚合函式
首先會對資料按key分組 然後在組內進行聚合(一般是加和,也可以是Max、Min之類的操作)
相當於 MR 中的combiner
可以在Map端進行預聚合,減少shuffle過程需要傳輸的資料量,以此提高效率
相對於groupByKey來說,效率更高,但功能更弱
冪等操作
y = f(x) = f(y) = f(f(x))
reduceByKey與groupByKey的區別
reduceByKey:具有預聚合操作
groupByKey:沒有預聚合
在不影響業務邏輯的前提下,優先採用reduceByKey。
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
/**
 * Counts students per class from `spark/data/students.txt`, first with
 * `groupByKey` and then with `reduceByKey`, printing both results.
 *
 * The class is taken from the fifth comma-separated field (index 4) of each line.
 */
object Demo11ReduceByKey {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setAppName("Demo11ReduceByKey").setMaster("local")
    val sc: SparkContext = new SparkContext(conf)
    try {
      val linesRDD: RDD[String] = sc.textFile("spark/data/students.txt")

      // Count per class with groupByKey: group whole lines, then take the group size.
      linesRDD
        .map(line => (line.split(",")(4), line))
        .groupByKey()
        // Separate class and count with a comma; they were previously glued together.
        .map { case (clazz, students) => clazz + "," + students.size }
        .foreach(println)

      /**
       * reduceByKey takes an aggregation function.
       * Data is grouped by key and then aggregated within each group
       * (usually a sum, but Max/Min-style operations work too).
       * It plays the role of the combiner in MapReduce: values can be
       * pre-aggregated on the map side, which reduces the amount of data
       * sent through the shuffle and so improves efficiency.
       * Compared with groupByKey it is more efficient but less flexible.
       */
      linesRDD
        .map(line => (line.split(",")(4), 1))
        .reduceByKey(_ + _)
        .foreach(println)
    } finally {
      // Always shut the context down, even if a job fails.
      sc.stop()
    }
  }
}
sort 排序,預設升序
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD
/**
 * Demonstrates `sortBy` and `sortByKey` on `spark/data/students.txt`,
 * printing the first ten lines in descending order of age each time.
 *
 *  - `sortBy` is a transformation; it sorts by the value the function returns,
 *    ascending by default.
 *  - `sortByKey` is a transformation on key-value RDDs; it sorts by key,
 *    ascending by default.
 *
 * Age is read from the third comma-separated field (index 2) and parsed as an
 * Int so that the ordering is numeric rather than lexicographic.
 */
object Demo12Sort {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setAppName("Demo12Sort").setMaster("local")
    val sc: SparkContext = new SparkContext(conf)
    try {
      val linesRDD: RDD[String] = sc.textFile("spark/data/students.txt")

      // NOTE(review): assumes field 2 is always a valid integer - confirm against the data.
      linesRDD
        .sortBy(line => line.split(",")(2).toInt, ascending = false) // descending by age
        .take(10) // take is an action: it returns the first ten elements to the driver
        .foreach(println)

      // Same ordering via sortByKey: key each line by its numeric age first.
      val mapRDD: RDD[(Int, String)] = linesRDD.map(line => (line.split(",")(2).toInt, line))
      mapRDD
        .sortByKey(ascending = false)
        .take(10)
        .foreach(println)
    } finally {
      // Always shut the context down, even if a job fails.
      sc.stop()
    }
  }
}
mapValues
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
object Demo13MapValue {
def main(args: Array[String]): Unit = {
/**
* mapValues 轉換運算元
* 需要作用在K—V格式的RDD上
* 傳入一個函式f
* 將RDD的每一條資料的value傳給函式f,key保持不變
* 資料規模也不會改變
*/
val conf: SparkConf = new SparkConf().setAppName("Demo13MapValue").setMaster("local")
val sc: SparkContext = new SparkContext(conf)
val linesRDD: RDD[(String, Int)] = sc.parallelize(List(("zs", 10), ("zzw", 34), ("lm", 18)))
linesRDD.mapValues(lines=>lines*2)
.foreach(println)
}