SparkSQL --- Window Functions (Java)

Published by Star-Technology on 2017-08-11
package com.spark.sparksql.save;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.hive.HiveContext;

/*
 * Window function (row_number) example
 */

public class RowNumberWindowFunction {

    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("RowNumberWindowFunction");
        JavaSparkContext sc = new JavaSparkContext(conf);
        HiveContext hiveContext = new HiveContext(sc.sc());

        // Create the sales (revenue) table
        hiveContext.sql("DROP TABLE IF EXISTS sales");
        hiveContext.sql("CREATE TABLE IF NOT EXISTS sales ("
                + "product STRING," + "category STRING," + "revenue BIGINT)");
        hiveContext.sql("LOAD DATA LOCAL INPATH '/usr/hadoopsoft/spark-1.6.0-bin-hadoop2.4/project/sales.txt' INTO TABLE sales");

        // First, what does the row_number() window function actually do?
        // Within each partition, it assigns every row a row number that follows
        // the partition's sort order.
        // For example, suppose a partition date=20160706 contains 3 rows:
        // 11211, 11212, 11213.
        // After applying row_number() to that partition, each of the three rows
        // is given a row number within the group, starting at 1 and increasing
        // by 1, so the result is: 11211 1, 11212 2, 11213 3.

        DataFrame top3SalesDF = hiveContext.sql("SELECT product, category, revenue "
                + "FROM ("
                + "SELECT "
                + "product, "
                + "category , "
                + "revenue, "
                + "row_number() OVER (PARTITION BY category ORDER BY revenue DESC) rank "
                + "FROM sales "
                + ") tmp_sales "
                + "WHERE rank <= 3");

        // Save the top 3 rows of each group into a table
        hiveContext.sql("DROP TABLE IF EXISTS top3_sales");
        top3SalesDF.write().saveAsTable("top3_sales");

        sc.close();
    }
}
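
For reference, the same "top 3 per category" result can also be expressed with the DataFrame window API that Spark 1.6 ships alongside the SQL syntax (org.apache.spark.sql.expressions.Window plus org.apache.spark.sql.functions.row_number()). The following is only a minimal sketch of that alternative, assuming the sales table created by the program above already exists in the Hive metastore; the class name RowNumberWindowFunctionDF is made up for illustration.

package com.spark.sparksql.save;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.expressions.Window;
import org.apache.spark.sql.expressions.WindowSpec;
import org.apache.spark.sql.hive.HiveContext;

import static org.apache.spark.sql.functions.col;
import static org.apache.spark.sql.functions.row_number;

public class RowNumberWindowFunctionDF {

    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("RowNumberWindowFunctionDF");
        JavaSparkContext sc = new JavaSparkContext(conf);
        HiveContext hiveContext = new HiveContext(sc.sc());

        // One window partition per category, rows ordered by revenue descending.
        WindowSpec byCategory = Window.partitionBy("category")
                .orderBy(col("revenue").desc());

        // Assign a per-partition row number, keep only the first three rows of
        // each category, and drop the helper column again.
        DataFrame top3SalesDF = hiveContext.table("sales")
                .withColumn("rank", row_number().over(byCategory))
                .filter(col("rank").leq(3))
                .select("product", "category", "revenue");

        // Same as the SQL version: overwrite any previous result table.
        hiveContext.sql("DROP TABLE IF EXISTS top3_sales");
        top3SalesDF.write().saveAsTable("top3_sales");

        sc.close();
    }
}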
