Hive的一些常用的高階開發
內容
1.開窗函式
2.行轉列,列轉行,多行轉一行,一行轉多行
3.分組: 增強型group
4.排序
5.關聯
本次的內容: 內容1 和內容2,採用的是示例資料以及對應的實現。資料可以直接放在Hive中執行。可以直觀的觀察資料,進而對函式以及相應的功能有所熟悉。
對於不同的場景的資料計算,瞭解SQL的基本語法以及一些高階用法,在這些基礎上組合相應的功能。這些都是一些工程上的應用,多練習的。通過構建資料集來驗證的方式,是可以自己來確認一些似是而非的語法。對於Hive底層原理和程式碼的瞭解也是途徑之一。構建資料集驗證與通過原理去分析瞭解的方法可以相互配合使用。最終的目標之一就是更好的實現業務分析目標。-- over() 子句 有order by, 分割槽內排序後一個個疊-- windows子句 WINDOW子句(靈活控制視窗的子集)
--
WITH table_1 AS(
SELECT '1' AS mem_id , 10 AS score, '2020-08-07 09:08:10' AS createtime
UNION ALL
SELECT '1' AS mem_id , 2 AS score , '2020-08-08 09:08:10' AS createtime
UNION ALL
SELECT '1' AS mem_id , 3 AS score , '2020-08-09 09:08:10' AS createtime
UNION ALL
SELECT '2' AS mem_id , 6 AS score, '2020-08-07 09:08:10' AS createtime
UNION ALL
SELECT '3' AS mem_id , 7 AS score , '2020-08-07 09:08:10' AS createtime
UNION ALL
SELECT '3' AS mem_id , 9 AS score , '2020-08-09 09:08:10' AS createtime
)
SELECT
mem_id
, score
, SUM(score) OVER(PARTITION BY mem_id ) AS pv1
, SUM(score) OVER(PARTITION BY mem_id ORDER BY createtime) AS pv1 -- 預設為從起點到當前行
, SUM(score) OVER(PARTITION BY mem_id ORDER BY createtime ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW ) AS pv2 --從起點到當前行,結果同pv1
, SUM(score) OVER(PARTITION BY mem_id ORDER BY createtime ROWS BETWEEN 3 PRECEDING AND CURRENT ROW ) AS pv3 --當前行+往前3行
, SUM(score) OVER(PARTITION BY mem_id ORDER BY createtime ROWS BETWEEN 3 PRECEDING AND 1 FOLLOWING ) AS pv4 --當前行+往前3行+往後1行
, SUM(score) OVER(PARTITION BY mem_id ORDER BY createtime ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) AS pv5 -- ---當前行+往後所有行
FROM table_1
;WITH table_1 AS(
SELECT '1' AS mem_id , 10 AS score, '2020-08-07 09:08:10' AS createtime
UNION ALL
SELECT '1' AS mem_id , 5 AS score , '2020-08-08 09:08:10' AS createtime
UNION ALL
SELECT '1' AS mem_id , 2 AS score , '2020-08-08 09:08:10' AS createtime
UNION ALL
SELECT '1' AS mem_id , 3 AS score , '2020-08-09 09:08:10' AS createtime
)
SELECT
mem_id
, score
, createtime
, MAX(score) OVER(PARTITION BY mem_id ) AS pv1 -- 分組的
, MAX(score) OVER(PARTITION BY mem_id ORDER BY createtime) AS pv1 -- 預設為從起點到當前行
, MAX(score) OVER(PARTITION BY mem_id ORDER BY createtime ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS pv2 --從起點到當前行,結果同pv1
, MAX(score) OVER(PARTITION BY mem_id ORDER BY createtime ROWS BETWEEN 3 PRECEDING AND CURRENT ROW) AS pv3 --當前行+往前3行
, MAX(score) OVER(PARTITION BY mem_id ORDER BY createtime ROWS BETWEEN 3 PRECEDING AND 1 FOLLOWING) AS pv4 --當前行+往前3行+往後1行
, MAX(score) OVER(PARTITION BY mem_id ORDER BY createtime ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) AS pv5 ---當前行+往後所有行
FROM table_1
;-- 1.排序開窗函式
-- row_number() :從1開始,按照順序,生成分組內記錄的序列,row_number()的值不會存在重複 1 2 3 4
-- dense_rank() :生成資料項在分組中的排名,排名相等會在名次中不會留下空位 1 2 2 3
-- rank() :生成資料項在分組中的排名,排名相等會在名次中留下空位 1 2 2 4WITH table_1 AS(
SELECT '1' AS mem_id , 10 AS score, '2020-08-07 09:08:10' AS createtime
UNION ALL
SELECT '1' AS mem_id , 5 AS score , '2020-08-08 09:08:10' AS createtime
UNION ALL
SELECT '1' AS mem_id , 2 AS score , '2020-08-08 09:08:10' AS createtime
UNION ALL
SELECT '1' AS mem_id , 3 AS score , '2020-08-09 09:08:10' AS createtime
)
SELECT
mem_id
, score
, createtime
, ROW_NUMBER() OVER(PARTITION BY mem_id ORDER BY createtime ) AS ROW_NUMBER_pv1 -- 1 2 3 4
, DENSE_RANK() OVER(PARTITION BY mem_id ORDER BY createtime ) AS DENSE_RANK_pv2 -- 1 2 2 3
, RANK() OVER(PARTITION BY mem_id ORDER BY createtime ) AS RANK_pv3 -- 1 2 2 4
, ROW_NUMBER() OVER(PARTITION BY mem_id ORDER BY createtime desc) AS ROW_NUMBER_desc_pv1 -- 1 2 3 4
, DENSE_RANK() OVER(PARTITION BY mem_id ORDER BY createtime desc) AS DENSE_RANK_desc_pv2 -- 1 2 2 3
, RANK() OVER(PARTITION BY mem_id ORDER BY createtime desc) AS RANK_desc_pv3 -- 1 2 2 4
FROM table_1
ORDER BY createtime
;
-- ntile(10) over ( partition by t1.grp_cd order by t1.pay_amt desc ) Monyrank
WITH table_1 AS(
SELECT '1' AS mem_id , 10 AS score, '2020-08-07 09:08:10' AS createtime
UNION ALL
SELECT '1' AS mem_id , 5 AS score , '2020-08-08 09:08:10' AS createtime
UNION ALL
SELECT '1' AS mem_id , 2 AS score , '2020-08-08 09:08:10' AS createtime
UNION ALL
SELECT '1' AS mem_id , 3 AS score , '2020-08-09 09:08:10' AS createtime
UNION ALL
SELECT '2' AS mem_id , 6 AS score , '2020-08-09 09:08:10' AS createtime
UNION ALL
SELECT '1' AS mem_id , 7 AS score , '2020-08-09 09:08:10' AS createtime
)
SELECT
mem_id
, score
, createtime
, ntile(3) OVER(PARTITION BY mem_id ORDER BY createtime ) AS ntile_pv1
, ntile(3) OVER(PARTITION BY score ORDER BY createtime ) AS ntile_pv2
, ntile(3) OVER(PARTITION BY mem_id ORDER BY score ) AS ntile_pv3
FROM table_1
;-- datediff(from_unixtime(unix_timestamp('${hivevar:statis_date}','yyyyMMdd'),'yyyy-MM-dd'), from_unixtime(unix_timestamp(statis_date,'yyyyMMdd'),'yyyy-MM-dd') ) as date_flag
-- pow(2, date_flag ) AS data_flag
-- conv(CAST(SUM(data_flag ) AS int),10,2) AS continuity_flag
-- locate('0',REVERSE(continuity_flag)) AS continuity_locate,
-- length(continuity_flag) AS continuity_len
-- CASE WHEN continuity_locate= 0 then continuity_len ELSE continuity_locate-1 END AS con_pv_day,
-- 序列函式不支援WINDOW子句
CUME_DIST -- 小於等於當前值的行數/分組內總行數 -- 統計小於等於當前薪水的人數,所佔總人數的比例
PERCENT_RANK -- 分組內當前行的RANK值-1/分組內總行數-1WITH table_1 AS(
SELECT '1' AS mem_id , 10 AS score, '2020-08-07 09:08:10' AS createtime
UNION ALL
SELECT '1' AS mem_id , 5 AS score , '2020-08-08 09:08:10' AS createtime
UNION ALL
SELECT '1' AS mem_id , 2 AS score , '2020-08-08 09:08:10' AS createtime
UNION ALL
SELECT '1' AS mem_id , 3 AS score , '2020-08-09 09:08:10' AS createtime
)
SELECT
mem_id
, score
, createtime
, ROW_NUMBER() OVER(PARTITION BY mem_id ORDER BY createtime ) AS ROW_NUMBER_pv1 -- 1 2 3 4
, RANK() OVER(PARTITION BY mem_id ORDER BY createtime ) AS RANK_pv3 -- 1 2 2 4
, CUME_DIST() OVER(PARTITION BY mem_id ORDER BY createtime ) AS ROW_NUMBER_desc_pv1 -- 小於等於當前值的行數/分組內總行數
, PERCENT_RANK() OVER(PARTITION BY mem_id ORDER BY createtime ) AS DENSE_RANK_desc_pv2 -- 分組內當前行的RANK值-1/分組內總行數-1
FROM table_1
ORDER BY createtime
;--1.LEAD(col,n,DEFAULT) 用於統計視窗內往下第n行值-- 第一個引數為列名,第二個引數為往下第n行(可選,預設為1,不可為負數),第三個引數為預設值(當往下第n行為NULL時候,取預設值,如不指定,則為--2.LAG(col,n,DEFAULT) 用於統計視窗內往上第n行值-- 第一個引數為列名,第二個引數為往上第n行(可選,預設為1,不可為負數),第三個引數為預設值(當往上第n行為NULL時候,取預設值,如不指定,則為NULL)WITH table_1 AS(
SELECT '1' AS mem_id , 10 AS score, '2020-08-07 09:08:10' AS createtime
UNION ALL
SELECT '1' AS mem_id , 2 AS score , '2020-08-08 09:08:10' AS createtime
UNION ALL
SELECT '1' AS mem_id , 3 AS score , '2020-08-09 09:08:10' AS createtime
UNION ALL
SELECT '2' AS mem_id , 6 AS score, '2020-08-07 09:08:10' AS createtime
UNION ALL
SELECT '3' AS mem_id , 7 AS score , '2020-08-07 09:09:10' AS createtime
UNION ALL
SELECT '3' AS mem_id , 9 AS score , '2020-08-09 09:08:10' AS createtime
)
select mem_id
,createtime
,score
,lead(score,2) over (partition by mem_id order by createtime) as lead_2_pv
,lead(score,1) over (partition by mem_id order by createtime) as lead_1_pv -
,lead(score,1,-9999) over (partition by mem_id order by createtime) as lead_1_null_pv
,LAG (score,1,-9999) over (partition by mem_id order by createtime) as lag_1_pv -- 統計視窗內往上第n行值
FROM table_1
;
WITH table_1 AS(
SELECT '1' AS mem_id , 10 AS score, '2020-08-07 09:08:10' AS createtime
UNION ALL
SELECT '1' AS mem_id , 2 AS score , '2020-08-08 09:08:10' AS createtime
UNION ALL
SELECT '1' AS mem_id , 3 AS score , '2020-08-09 09:08:10' AS createtime
UNION ALL
SELECT '2' AS mem_id , 6 AS score, '2020-08-07 09:08:10' AS createtime
UNION ALL
SELECT '3' AS mem_id , 7 AS score , '2020-08-07 09:09:10' AS createtime
UNION ALL
SELECT '3' AS mem_id , 9 AS score , '2020-08-09 09:08:10' AS createtime
)
select
mem_id
,createtime
,score
,LAG (score,2,-9999) over (partition by mem_id order by createtime) as lag_1_pv
,LAG (score,1,-9999) over (partition by mem_id order by createtime desc) as lag_1_desc_pv
,LAG (score,1) over (partition by mem_id order by createtime desc) as lag_1_desc_pv
,LAG (createtime,1,-9999) over (partition by mem_id order by createtime) as lag_1_pv
,LAG (createtime,1,-9999) over (partition by mem_id order by createtime desc) as lag_1_desc_pv -- -- 統計視窗內往上第n行值
FROM table_1
order by mem_id,createtime
;
-- FIRST_VALUE取分組內排序後,截止到當前行,第一個值,
-- 需要兩個引數。第一個引數是您想要第一個值的列,第二個(可選)引數必須是false預設為布林值的布林值。如果設定為true,則跳過空值。
-- LAST_VALUE取分組內排序後,截止到當前行,最後一個值,
-- 需要兩個引數。第一個引數是您想要第一個值的列,第二個(可選)引數必須是false預設為布林值的布林值。如果設定為true,則跳過空值。
WITH table_1 AS(
SELECT '1' AS mem_id , 10 AS score, '2020-08-07 09:08:10' AS createtime
UNION ALL
SELECT '1' AS mem_id , 2 AS score , '2020-08-08 09:08:10' AS createtime
UNION ALL
SELECT '1' AS mem_id , 3 AS score , '2020-08-09 09:08:10' AS createtime
UNION ALL
SELECT '2' AS mem_id , 6 AS score, '2020-08-07 09:08:10' AS createtime
UNION ALL
SELECT '3' AS mem_id , 7 AS score , '2020-08-07 09:09:10' AS createtime
UNION ALL
SELECT '3' AS mem_id , 9 AS score , '2020-08-09 09:08:10' AS createtime
)
SELECT
mem_id
, score
, createtime
, ROW_NUMBER() OVER(PARTITION BY mem_id ORDER BY createtime ) AS ROW_NUMBER_pv1 -- 1 2 3 4
, DENSE_RANK() OVER(PARTITION BY mem_id ORDER BY createtime ) AS DENSE_RANK_pv2 -- 1 2 2 3
, RANK() OVER(PARTITION BY mem_id ORDER BY createtime ) AS RANK_pv3 -- 1 2 2 4
, FIRST_VALUE(score) OVER(partition by mem_id order by createtime) as first1
, LAST_VALUE(score) OVER(partition by mem_id order by createtime) as last1 -- 分組內排序後,截止到當前行,最後一個值
FROM table_1
ORDER BY createti-- 多行轉一行 hive collect_set 結果順序不一致
--- concat_ws、collect_set
WITH table_1 AS(
SELECT '1' AS mem_id , 10 AS score, '2020-08-07 09:08:10' AS createtime, '10ab' AS topic_id
UNION ALL
SELECT '1' AS mem_id , 2 AS score , '2020-08-08 09:08:10' AS createtime, '2hb' AS topic_id
UNION ALL
SELECT '1' AS mem_id , 3 AS score , '2020-08-09 09:08:10' AS createtime, '3fg' AS topic_id
UNION ALL
SELECT '2' AS mem_id , 6 AS score, '2020-08-07 09:08:10' AS createtime, '6sf' AS topic_id
UNION ALL
SELECT '3' AS mem_id , 7 AS score , '2020-08-07 09:09:10' AS createtime, '7dr' AS topic_id
UNION ALL
SELECT '3' AS mem_id , 9 AS score , '2020-08-09 09:08:10' AS createtime, '9ng' AS topic_id
)
SELECT
mem_id
, concat_ws(',',collect_list(score)) as order_value
, concat_ws(',',sort_array(collect_list(score))) as order_value
, collect_list( concat_ws(':',lpad(cast(score as string),5,'0'),cast(topic_id as string)) ) AS demo1
, sort_array( collect_list( concat_ws(':',lpad(cast(score as string),5,'0'),cast(topic_id as string)) )) AS demo2
,concat_ws(',', sort_array( collect_list( concat_ws(':',lpad(cast(score as string),5,'0'),cast(topic_id as string))) ) ) AS demo3
, regexp_replace(
concat_ws(',',
sort_array(
collect_list(
concat_ws(':',lpad(cast(score as string),5,'0'),cast(topic_id as string))) ) ),'\\d+\:','') AS data
FROM table_1
group by mem_id
-- 一行轉多行 select explode(map_col) as (may_key_col, may_value_col) from table_name
-- posexplode 相比在 explode 之上,將一列資料轉為多行之後,還會輸出資料的下標
WITH table_1 AS(
select "1" AS class_id,split('Test400|Test531|Test536','\\|') AS stu_id, split('60|30|90','\\|') AS score
UNION ALL
select "2" AS class_id,split('Test400|Test531|Test536','\\|') AS stu_id, split('70|60|70','\\|') AS score
UNION ALL
select "3" AS class_id,split('Test500|Test521|Test536','\\|') AS stu_id, split('70|60|70','\\|') AS score
)
SELECT class_id,stu_id,examples_id1
FROM table_1
LATERAL VIEW explode(stu_id) examples as examples_id1
;-- 一行轉多行 兩列的匹配 --
WITH table_1 AS(
select "1" AS class_id,split('Test400|Test531|Test536','\\|') AS stu_id, split('60|30|90','\\|') AS score
UNION ALL
select "2" AS class_id,split('Test400|Test531|Test536','\\|') AS stu_id, split('70|60|70','\\|') AS score
UNION ALL
select "3" AS class_id,split('Test500|Test521|Test536','\\|') AS stu_id, split('70|60|70','\\|') AS score
)
SELECT class_id,sn_name,sn_score
FROM table_1
lateral view posexplode(stu_id ) sn as sn_index ,sn_name
lateral view posexplode(score ) sc as sc_index ,sn_score
WHERE sc_index = sn_index;
-- 行轉列
WITH table_1 AS(
SELECT '1' AS mem_id , "開心" AS tagtype, '2020-08-07 09:08:10' AS createtime, '10ab' AS topic_id
UNION ALL
SELECT '1' AS mem_id , "開心" AS tagtype , '2020-08-08 09:08:10' AS createtime, '2hb' AS topic_id
UNION ALL
SELECT '1' AS mem_id , "有趣" AS tagtype , '2020-08-09 09:08:10' AS createtime, '3fg' AS topic_id
UNION ALL
SELECT '2' AS mem_id , "有趣" AS tagtype, '2020-08-07 09:08:10' AS createtime, '6sf' AS topic_id
UNION ALL
SELECT '3' AS mem_id , "開心" AS tagtype , '2020-08-07 09:09:10' AS createtime, '7dr' AS topic_id
UNION ALL
SELECT '3' AS mem_id , "抗壓" AS tagtype , '2020-08-09 09:08:10' AS createtime, '9ng' AS topic_id
)
select mem_id
,case when tagtype='有趣' then "1" else '0' end as import_fun
,case when tagtype='開心' then "1" else '0' end as import_status
,case when tagtype='抗壓' then "1" else '0' end as import_chara
,createtime
,topic_id
from table_1
;
---縱表變橫表
-- 欄位 userid flag
'張三' AS userid,'收藏' AS flag
'張三' AS userid,'購買' AS flag
'張三' AS userid,'點選' AS flag
'李四' AS userid,'點選' AS flag
'李四' AS userid,'收藏' AS flag
--結果資料
userid collction purchase click
'張三','1','1','1'
'李四','1','0','1'
--解決方案
-- 使用兩種解決方案--使用union之後max
WITH t1 AS (
SELECT '張三' AS userid,'收藏' AS flag
UNION ALL
SELECT '張三' AS userid,'購買' AS flag
UNION ALL
SELECT '張三' AS userid,'點選' AS flag
UNION ALL
SELECT '李四' AS userid,'點選' AS flag
UNION ALL
SELECT '李四' AS userid,'收藏' AS flag
)
SELECT tt1.userid
, MAX(tt1.collction) AS collction
, MAX(tt1.purchase) AS purchase
, MAX(tt1.click) AS click
FROM
(SELECT
t1.userid, '1' AS collction ,'0' AS purchase, '0' AS click
FROM t1
WHERE t1.flag='收藏'
UNION ALL
select
t1.userid, '0' AS collction ,'1' AS purchase, '0' AS click
FROM t1
WHERE t1.flag='購買'
UNION ALL
select
t1.userid, '0' AS collction ,'0' AS purchase, '1' AS click
FROM t1
WHERE t1.flag='點選')tt1
GROUP BY tt1.userid
;
--使用left join的方式
WITH t1 AS (
SELECT '張三' AS userid,'收藏' AS flag
UNION ALL
SELECT '張三' AS userid,'購買' AS flag
UNION ALL
SELECT '張三' AS userid,'點選' AS flag
UNION ALL
SELECT '李四' AS userid,'點選' AS flag
UNION ALL
SELECT '李四' AS userid,'收藏' AS flag
)
SELECT
tt1.userid,
CASE WHEN tt2.userid IS NOT NULL then'1' ELSE '0'END AS collction,
CASE WHEN tt3.userid IS NOT NULL then'1' ELSE '0'END AS purchase,
CASE WHEN tt4.userid IS NOT NULL then'1' ELSE '0'END AS click
FROM (SELECT DISTINCT t1.userid FROM t1)tt1
LEFT JOIN (SELECT DISTINCT t1.userid FROM t1 WHERE t1.flag = '收藏')tt2
ON tt1.userid = tt2.userid
LEFT JOIN (SELECT DISTINCT t1.userid FROM t1 WHERE t1.flag = '購買')tt3
ON tt1.userid = tt3.userid
LEFT JOIN (SELECT DISTINCT t1.userid FROM t1 WHERE t1.flag = '點選')tt4
ON tt1.userid = tt4.userid--多列轉一列 橫表變縱表,列轉行
WITH table_1 AS (
SELECT "張三" AS userid, '1' AS collction ,'1' AS purchase, '1' AS click
UNION ALL
"張三" AS userid, '0' AS collction ,'1' AS purchase, '1' AS click)
SELECT
from
(SELECT userid,'收藏' AS flag FROM table_1 WHERE collction='1'
UNION ALL
SELECT userid,'購買' AS flag FROM table_1 WHERE purchase='1'
UNION ALL
SELECT userid,'點選' AS flag FROM table_1 WHERE click='1')t1
本次分享主要是涉及開窗函式以及行列的一些開發內容。後續的一些內容,也是應用開發中的一些比較常見的要注意和區分的點。
本示例參考了一些網上的資料和書本的內容,由於來源未做標記,如有侵刪。