SparkSQL Manual

Posted by calong on 2021-03-26

Section 1

# Read the JSON file
scala> val df = spark.read.json("E:/project/python/data/user.json")
# Show the data
scala> df.show

+---+------+---+
|age|  name|sex|
+---+------+---+
| 23|Calong||
| 21|  Test||
| 22|   Cat||
+---+------+---+
# Create a session-scoped temporary view from the DataFrame
scala> df.createOrReplaceTempView("user")
# Create a global temporary view from the DataFrame
scala> df.createOrReplaceGlobalTempView("user")
# Query data from the temporary view
scala> spark.sql("select * from user").show

+---+------+---+
|age|  name|sex|
+---+------+---+
| 23|Calong||
| 21|  Test||
| 22|   Cat||
+---+------+---+
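The global temporary view created above is registered under Spark's reserved `global_temp` database and stays visible to other sessions of the same application. A minimal sketch of querying it, using the same `spark` session as above (output omitted):

# Query the global temporary view through the global_temp database
scala> spark.sql("select * from global_temp.user").show
# The same view is still visible from a new session of the application
scala> spark.newSession().sql("select * from global_temp.user").show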
# Print the schema information
scala> df.printSchema
root
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)
 |-- sex: string (nullable = true)
# Select a single column
scala> df.select("age").show
+---+
|age|
+---+
| 23|
| 21|
| 22|
+---+
# Query with a column expression
scala> df.select($"age" + 1).show
+---------+
|(age + 1)|
+---------+
|       24|
|       22|
|       23|
+---------+
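The derived column is named `(age + 1)` by default; if a friendlier header is wanted, the standard Column method `alias` can rename it. A small sketch, output omitted:

# Rename the computed column with alias
scala> df.select(($"age" + 1).alias("agePlusOne")).show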
# Filter query
scala> df.filter($"age" > 22).show
+---+------+---+
|age|  name|sex|
+---+------+---+
| 23|Calong||
+---+------+---+
# Group-by query
scala> df.groupBy($"sex").count.show
+---+-----+
|sex|count|
+---+-----+
||    2|
||    1|
+---+-----+
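Beyond `count`, grouped data can be aggregated with the functions in `org.apache.spark.sql.functions`. A minimal sketch against the same `df` (output omitted):

# Several aggregations per group
scala> import org.apache.spark.sql.functions._
scala> df.groupBy($"sex").agg(count("*").alias("cnt"), avg($"age").alias("avgAge")).show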

A DataFrame carries no compile-time type information: each row is a generic Row whose fields are described only by the runtime schema, whereas a Dataset is strongly typed.
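To make that concrete, here is a small sketch of pulling fields out of a DataFrame row: the lookups return generic values that are cast at runtime, with no compile-time check that `name` exists or is a String.

# DataFrame rows are generic Row objects; field access is checked only at runtime
scala> val firstRow = df.first()
scala> val name = firstRow.getAs[String]("name")
scala> val age = firstRow.getAs[Long]("age")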

# Convert an RDD to a DataFrame
scala> val rdd = sc.makeRDD(List((1, "One"), (2, "Two"), (3, "Three"), (4, "Four")))
rdd: org.apache.spark.rdd.RDD[(Int, String)] = ParallelCollectionRDD[47] at makeRDD at <console>:24

scala> val df = rdd.toDF("num", "eng")
df: org.apache.spark.sql.DataFrame = [num: int, eng: string]

scala> df.show
+---+-----+
|num|  eng|
+---+-----+
|  1|  One|
|  2|  Two|
|  3|Three|
|  4| Four|
+---+-----+
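Note that `toDF` (and `toDS` below) come from implicit encoders that spark-shell imports automatically. In a standalone application you have to import them from your own SparkSession; a minimal sketch, assuming a local session (the object name, app name and master are placeholders):

import org.apache.spark.sql.SparkSession

object RddToDf {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[*]")     // placeholder master for a local run
      .appName("rdd-to-df")   // hypothetical application name
      .getOrCreate()

    // Brings toDF / toDS and the $"col" syntax into scope
    import spark.implicits._

    val df = spark.sparkContext
      .makeRDD(List((1, "One"), (2, "Two")))
      .toDF("num", "eng")
    df.show()

    spark.stop()
  }
}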
# Create a Dataset from a case class
scala> case class Person(name: String, age: Int)
defined class Person

scala> val list = List(Person("Test", 18), Person("Calong", 20), Person("Mike", 21))
list: List[Person] = List(Person(Test,18), Person(Calong,20), Person(Mike,21))

scala> val ds = list.toDS
ds: org.apache.spark.sql.Dataset[Person] = [name: string, age: int]

scala> ds.show
+------+---+
|  name|age|
+------+---+
|  Test| 18|
|Calong| 20|
|  Mike| 21|
+------+---+
# Convert a DataFrame to a Dataset
scala> val df = spark.read.json("E:/project/python/data/user.json")
df: org.apache.spark.sql.DataFrame = [age: bigint, name: string ... 1 more field]

scala> case class User(name: String, age: BigInt, sex: String)
defined class User

scala> val ds = df.as[User]
ds: org.apache.spark.sql.Dataset[User] = [age: bigint, name: string ... 1 more field]

scala> ds.show
+---+------+---+
|age|  name|sex|
+---+------+---+
| 23|Calong||
| 21|  Test||
| 22|   Cat||
+---+------+---+
# Convert a Dataset to a DataFrame
scala> val df = ds.toDF
df: org.apache.spark.sql.DataFrame = [age: bigint, name: string ... 1 more field]

scala> df.show
+---+------+---+
|age|  name|sex|
+---+------+---+
| 23|Calong||
| 21|  Test||
| 22|   Cat||
+---+------+---+
# Convert an RDD to a Dataset
scala> val rdd = sc.makeRDD(List(Person("Test", 18), Person("Calong", 20), Person("Mike", 21)))
rdd: org.apache.spark.rdd.RDD[Person] = ParallelCollectionRDD[12] at makeRDD at <console>:26

scala> rdd.toDS
res3: org.apache.spark.sql.Dataset[Person] = [name: string, age: int]

scala> res3.show
+------+---+
|  name|age|
+------+---+
|  Test| 18|
|Calong| 20|
|  Mike| 21|
+------+---+
# Convert a Dataset to an RDD
scala> val rdd = res3.rdd
rdd: org.apache.spark.rdd.RDD[Person] = MapPartitionsRDD[18] at rdd at <console>:25
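After the conversion the elements are plain `Person` objects again, so ordinary RDD transformations apply. A small sketch (output omitted):

# Ordinary RDD operations on the recovered Person objects
scala> rdd.map(p => p.name.toUpperCase).collect().foreach(println)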
This work is published under a CC licence; reposts must credit the author and link to the original article.