在使用Apache DolphinScheduler排程執行復雜的HiveSQL時,HQL包含多種海豚無法正確識別的符號,怎麼辦?本文提供了可行的思路和方法,供使用者參考。
一、目的
在Hive中完成複雜JSON,既有物件還有陣列而且陣列中包含陣列的解析後,原本以為沒啥問題了,結果在DolphinScheduler中排程又出現了大問題,搞了一天。試了很多種方法,死了無數腦細胞,才解決了這個問題!
二、HiveSQL
insert overwrite table hurys_dc_dwd.dwd_json_statistics partition(day)
select
t1.device_no,
source_device_type,
sn,
model,
create_time,
cycle,
get_json_object(coil_list,'$.laneNo') lane_no,
get_json_object(coil_list,'$.laneType') lane_type,
section_no,
get_json_object(coil_list,'$.coilNo') coil_no,
get_json_object(coil_list,'$.volumeSum') volume_sum,
get_json_object(coil_list,'$.volumePerson') volume_person,
get_json_object(coil_list,'$.volumeCarNon') volume_car_non,
get_json_object(coil_list,'$.volumeCarSmall') volume_car_small,
get_json_object(coil_list,'$.volumeCarMiddle') volume_car_middle,
get_json_object(coil_list,'$.volumeCarBig') volume_car_big,
get_json_object(coil_list,'$.speedAvg') speed_avg,
get_json_object(coil_list,'$.speed85') speed_85,
get_json_object(coil_list,'$.timeOccupancy') time_occupancy,
get_json_object(coil_list,'$.averageHeadway') average_headway,
get_json_object(coil_list,'$.averageGap') average_gap,
substr(create_time,1,10) day
from (select
get_json_object(statistics_json,'$.deviceNo') device_no,
get_json_object(statistics_json,'$.sourceDeviceType') source_device_type,
get_json_object(statistics_json,'$.sn') sn,
get_json_object(statistics_json,'$.model') model,
get_json_object(statistics_json,'$.createTime') create_time ,
get_json_object(statistics_json,'$.data.cycle') cycle,
get_json_object(replace(replace(section_list,':{',':[{'),'}}','}]}'),'$.sectionNo') section_no,
section_list
from hurys_dc_ods.ods_statistics
lateral view explode(split(replace(replace(replace(get_json_object(statistics_json,'$.data.sectionList'),
'[',''),']',''),'},{"sectionNo"','}|{"sectionNo"'),"\\|")) tf as section_list
where day='2024-07-18' -- date_sub(current_date(), 1) -- '2024-07-18' --
) as t1
lateral view explode(split(replace(replace(replace(get_json_object(replace(replace(section_list,
':{',':[{'),'}}','}]}'),'$.coilList'),'[',''),']',''),'},','}|'),"\\|")) tf1 as coil_list
where substr(create_time,1,10) = '2024-07-18' --date_sub(current_date(), 1) --'2024-07-17'
;
三、原先海豚的任務排程方式
在shell指令碼里新增HiveSQL語句
#! /bin/bash
source /etc/profile
nowdate=`date --date='0 days ago' "+%Y%m%d"`
yesdate=`date -d yesterday +%Y-%m-%d`
hive -e "
use hurys_dc_dwd;
set hive.vectorized.execution.enabled=false;
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
set hive.exec.max.dynamic.partitions.pernode=1000;
set hive.exec.max.dynamic.partitions=1500;
with t1 as(
select
get_json_object(statistics_json,'$.deviceNo') device_no,
get_json_object(statistics_json,'$.sourceDeviceType') source_device_type,
get_json_object(statistics_json,'$.sn') sn,
get_json_object(statistics_json,'$.model') model,
get_json_object(statistics_json,'$.createTime') create_time ,
get_json_object(statistics_json,'$.data.cycle') cycle,
get_json_object(replace(replace(section_list,':{',':[{'),'}}','}]}'),'$.sectionNo') section_no,
section_list
from hurys_dc_ods.ods_statistics
lateral view explode(split(replace(replace(replace(get_json_object(statistics_json,'$.data.sectionList'),'[',''),']',''),'},{"sectionNo"','}|{"sectionNo"'),"\\\\|")) tf as section_list
where day='$yesdate'
)
insert overwrite table hurys_dc_dwd.dwd_json_statistics partition(day)
select
t1.device_no,
source_device_type,
sn,
model,
substr(create_time,1,19) create_time ,
cycle,
get_json_object(coil_list,'$.laneNo') lane_no,
get_json_object(coil_list,'$.laneType') lane_type,
section_no,
get_json_object(coil_list,'$.coilNo') coil_no,
get_json_object(coil_list,'$.volumeSum') volume_sum,
get_json_object(coil_list,'$.volumePerson') volume_person,
get_json_object(coil_list,'$.volumeCarNon') volume_car_non,
get_json_object(coil_list,'$.volumeCarSmall') volume_car_small,
get_json_object(coil_list,'$.volumeCarMiddle') volume_car_middle,
get_json_object(coil_list,'$.volumeCarBig') volume_car_big,
get_json_object(coil_list,'$.speedAvg') speed_avg,
get_json_object(coil_list,'$.speed85') speed_85,
get_json_object(coil_list,'$.timeOccupancy') time_occupancy,
get_json_object(coil_list,'$.averageHeadway') average_headway,
get_json_object(coil_list,'$.averageGap') average_gap,
substr(create_time,1,10) day
from t1
lateral view explode(split(replace(replace(replace(get_json_object(replace(replace(section_list,':{',':[{'),'}}','}]}'),'$.coilList'),'[',''),']',''),'},','}|'),"\\\\|")) tf1 as coil_list
where substr(create_time,1,10) ='$yesdate'
"
四、原先方式報錯日誌
DolphinScheduler無法正確識別HiveSQL裡解析複雜JSON的多種符號。
五、解決方式
把HiveSQL放在一個SQL檔案裡,然後在指令碼里是執行Hive的sourceSQL檔案。
1 SQL檔案
--使用hurys_dc_ods資料庫
use hurys_dc_dwd;
--hive調優(必須先執行調優語句,否則部分複雜SQL執行會有問題)
set hive.vectorized.execution.enabled=false;
--開啟動態分割槽功能(預設 true,開啟)
set hive.exec.dynamic.partition=true;
--設定為非嚴格模式 nonstrict 模式表示允許所有的分割槽欄位都可以使用動態分割槽
set hive.exec.dynamic.partition.mode=nonstrict;
--在每個執行 MR 的節點上,最大可以建立多少個動態分割槽
set hive.exec.max.dynamic.partitions.pernode=1000;
--在所有執行 MR 的節點上,最大一共可以建立多少個動態分割槽。預設 1000
set hive.exec.max.dynamic.partitions=1500;
insert overwrite table hurys_dc_dwd.dwd_json_statistics partition(day)
select
t1.device_no,
source_device_type,
sn,
model,
create_time,
cycle,
get_json_object(coil_list,'$.laneNo') lane_no,
get_json_object(coil_list,'$.laneType') lane_type,
section_no,
get_json_object(coil_list,'$.coilNo') coil_no,
get_json_object(coil_list,'$.volumeSum') volume_sum,
get_json_object(coil_list,'$.volumePerson') volume_person,
get_json_object(coil_list,'$.volumeCarNon') volume_car_non,
get_json_object(coil_list,'$.volumeCarSmall') volume_car_small,
get_json_object(coil_list,'$.volumeCarMiddle') volume_car_middle,
get_json_object(coil_list,'$.volumeCarBig') volume_car_big,
get_json_object(coil_list,'$.speedAvg') speed_avg,
get_json_object(coil_list,'$.speed85') speed_85,
get_json_object(coil_list,'$.timeOccupancy') time_occupancy,
get_json_object(coil_list,'$.averageHeadway') average_headway,
get_json_object(coil_list,'$.averageGap') average_gap,
substr(create_time,1,10) day
from (select
get_json_object(statistics_json,'$.deviceNo') device_no,
get_json_object(statistics_json,'$.sourceDeviceType') source_device_type,
get_json_object(statistics_json,'$.sn') sn,
get_json_object(statistics_json,'$.model') model,
get_json_object(statistics_json,'$.createTime') create_time ,
get_json_object(statistics_json,'$.data.cycle') cycle,
get_json_object(replace(replace(section_list,':{',':[{'),'}}','}]}'),'$.sectionNo') section_no,
section_list
from hurys_dc_ods.ods_statistics
lateral view explode(split(replace(replace(replace(get_json_object(statistics_json,'$.data.sectionList'),'[',''),']',''),'},{"sectionNo"','}|{"sectionNo"'),"\\|")) tf as section_list
where day= date_sub(current_date(), 1)
) as t1
lateral view explode(split(replace(replace(replace(get_json_object(replace(replace(section_list,':{',':[{'),'}}','}]}'),'$.coilList'),'[',''),']',''),'},','}|'),"\\|")) tf1 as coil_list
where substr(create_time,1,10) = date_sub(current_date(), 1)
;
2 海豚任務執行指令碼
#! /bin/bash
source /etc/profile
nowdate=`date --date='0 days ago' "+%Y-%m-%d"`
yesdate=`date -d yesterday +%Y-%m-%d`
hive -e "
source dwd_json_statistics.sql
"
3 執行任務,驗證結果
終於解決了!以後碰到類似排程器識別不了SQL裡符號的問題,可以用這個方法,把SQL放在SQL檔案裡,然後在指令碼里執行這個SQL檔案,這樣就能規避這類問題了。
轉載自天地風雷水火山澤
原文連結:https://blog.csdn.net/tiantang2renjian/article/details/140605840
本文由 白鯨開源 提供釋出支援!