package com.spark.sparksql.save;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.hive.HiveContext;
public class RowNumberWindowFunction {
public static void main(String[] args) {
SparkConf conf = new SparkConf().setAppName("RowNumberWindowFunction")
JavaSparkContext sc = new JavaSparkContext(conf)
HiveContext hiveContext = new HiveContext(sc.sc())
// 建立銷售額表,sales表
hiveContext.sql("DROP TABLE IF EXISTS sales")
hiveContext.sql("CREATE TABLE IF NOT EXISTS sales ("
+ "product STRING," + "category STRING," + "revenue BIGINT)")
hiveContext.sql("LOAD DATA LOCAL INPATH '/usr/hadoopsoft/spark-1.6.0-bin-hadoop2.4/project/sales.txt' INTO TABLE sales")
// 先說明一下,row_number()開窗函式,它的作用是什麼?
// 其實,就是給每個分組的資料,按照其排序順序,打上一個分組內的行號!!!!
// 比如說,有一個分組date=20160706,裡面看有3資料,11211,11212,11213
// 那麼對這個分組的每一行使用row_number()開窗函式以後,這個三行會打上一個組內的行號!!!
// 行號是從1開始遞增!!! 比如最後結果就是 11211 1, 11212 2, 11213 3
DataFrame top3SalesDF = hiveContext.sql("SELECT product, category, revenue "
+ "FROM ("
+ "SELECT "
+ "product, "
+ "category , "
+ "revenue, "
+ "row_number() OVER (PARTITION BY category ORDER BY revenue DESC) rank "
+ "FROM sales "
+ ") tmp_sales "
+ "WHERE rank <= 3")
// 將每組排名前3的資料,儲存到一個表中
hiveContext.sql("DROP TABLE IF EXISTS top3_sales")
top3SalesDF.saveAsTable("top3_sales")
sc.close()
}
}