使用hive增量更新

哇塞兒發表於2021-02-22

原文網址 : https://www.cnblogs.com/wasaier/p/14431527.html

Hive

參考文末文章，加上自己的理解。

1、增量更新

有一個 base_table 表存放的是 12 月 15 日及其之前的所有資料，當 12 月 16 日的資料產生後，存入 incremental_table 表的當日分割槽中。

現在需要，將 incremental_table 表的新增資料合併到 base_table 表中。

那麼，就有兩種情況：

（1）保留歷史資料

通過將主表建成拉鍊表實現：

將 歷史資料中修改了的資料 union 當日新增的資料，

再 insert overwrite 到 base_table 表。

這樣的話，就會存在重複的資料，保留了歷史資料。

（2）不保留了歷史資料

方法1：

先將 base_table 表和 incremental_table 表 left join，將 base_table 表中沒有修改的資料插入到 base_table 表，

再將 incremental_table 表中的增量資料（最新資料）插入到 base_table 表。

方法2：

將 base_table 表和 incremental_table 表 union all ，再取更新時間最新的記錄。

這樣，就不會存在重複的資料，但是沒有了歷史資料。

2、對第一種情況

通過將主表建成拉鍊表實現

2.1、準備工作

（1）建表

-- 存放產生的每日增量資料，按天分割槽
create table incremental_table (
    id string,
    name string,
    addr string
) comment '增量表'
partitioned by (dt string)
row format delimited fields terminated by ','
stored as textfile;

-- 存放更新後的資料
create table base_table (
    id string,
    name string,
    addr string,
    start_date string,
    end_date string
) comment '主表'
row format delimited fields terminated by ','
stored as textfile;

（2）資料

incre0.txt：匯入主錶的歷史資料

（模擬主表已有資料）

1,lijie,chongqing,20191020,99991231
2,zhangshan,sz,20191020,99991231
3,lisi,shanghai,20191020,99991231
4,wangwu,usa,20191020,99991231

incre1.txt：匯入增量表的 20191020 新增資料

1,lijie,chongqing
2,zhangshan,sz
3,lisi,shanghai
4,wangwu,usa

incre2.txt：匯入增量表的 20191021 新增資料

1,lijie,chengdu      # 地址變了
2,zhangshan,huoxing  # 地址變了
4,wangwu,lalalala    # 地址變了
5,xinzeng,hehe       # 新增資料

（3）匯入資料

-- 將 incre0.txt 匯入主表中，表示主表已經有資料了，
-- 現在需要更新主表裡的資料
load data local inpath '/root/data/incre0.txt' overwrite into table base_table;

hive> select * from base_table;
OK
1       lijie   chongqing       20191020        99991231
2       zhangshan       sz      20191020        99991231
3       lisi    shanghai        20191020        99991231
4       wangwu  usa     20191020        99991231

-- 將 incre1.txt 和 incre2.txt 分別匯入增量表中的相應分割槽中
load data local inpath '/root/data/incre1.txt' overwrite into table incremental_table partition (dt='20191020');

load data local inpath '/root/data/incre2.txt' overwrite into table incremental_table partition (dt='20191021');

hive> select * from incremental_table;
OK
1       lijie   chongqing       20191020
2       zhangshan       sz      20191020
3       lisi    shanghai        20191020
4       wangwu  usa     20191020
1       lijie   chengdu 20191021
2       zhangshan       huoxing 20191021
4       wangwu  lalalala        20191021
5       xinzeng hehe    20191021

2.2、更新資料

-- 將 歷史資料中修改了的資料 union 當日新增的資料，
-- 再 insert overwrite 到 base_table 表。
-- 也可以使用 hive 的 merge into 語法，但從 Hive 2.2 版本才開始可用，且只能在支援 ACID 的表上執行。
insert overwrite table base_table
select * from
(
    select a.id,  -- 更新歷史資料中修改了的資料
           a.name,
           a.addr,
           a.start_date,
           case 
                when a.end_date='99991231' and b.id is not null then '20191020'  -- 更新了end_date
                else a.end_date
           end as end_date
    from base_table as a
    left join (select * from incremental_table where dt='20191021') as b 
    on a.id=b.id
union 
    select c.id,   -- 新增當日新增的資料
           c.name,
           c.addr,
           '20191021' as start_date,
           '99991231' as end_date
    from incremental_table c
    where c.dt='20191021'
) as t;

hive> select * from base_table;
OK
1       lijie   chengdu 20191021        99991231
1       lijie   chongqing       20191020        20191020
2       zhangshan       huoxing 20191021        99991231
2       zhangshan       sz      20191020        20191020
3       lisi    shanghai        20191020        99991231
4       wangwu  lalalala        20191021        99991231
4       wangwu  usa     20191020        20191020
5       xinzeng hehe    20191021        99991231

3、對第二種情況

3.1、準備工作

（1）建表

create table incremental_table (
    id string,
    name string,
    addr string
) comment '增量表'
partitioned by (dt string)
row format delimited fields terminated by ','
stored as textfile;

create table base_table (
    id string,
    name string,
    addr string
) comment '主表'
partitioned by (dt string)
row format delimited fields terminated by ','
stored as textfile;

（2）資料

源資料incre0.txt

1,lijie,chongqing
2,zhangshan,sz
3,lisi,shanghai
4,wangwu,usa

增量資料incre1.txt

1,lijie,chengdu      # 地址變了
2,zhangshan,huoxing  # 地址變了
4,wangwu,lalalala    # 地址變了
5,xinzeng,hehe       # 新增資料

（3）匯入資料

-- 將 incre0.txt 匯入主表中
load data local inpath '/root/data/incre0.txt' overwrite into table base_table partition (dt='20191020');

hive> select * from base_table;
OK
1       lijie   chongqing       20191020
2       zhangshan       sz      20191020
3       lisi    shanghai        20191020
4       wangwu  usa     20191020

-- 將 incre0.txt 和 incre1.txt 匯入增量表中
load data local inpath '/root/data/incre0.txt' overwrite into table incremental_table partition (dt='20191020');

load data local inpath '/root/data/incre1.txt' overwrite into table incremental_table partition (dt='20191021');

hive> select * from incremental_table;
OK
1       lijie   chongqing       20191020
2       zhangshan       sz      20191020
3       lisi    shanghai        20191020
4       wangwu  usa     20191020
1       lijie   chengdu 20191021
2       zhangshan       huoxing 20191021
4       wangwu  lalalala        20191021
5       xinzeng hehe    20191021

3.2、方法1

先將 base_table 表和 incremental_table 表 left join，將 base_table 表中沒有修改的資料插入到 base_table 表，

再將 incremental_table 表中的增量資料插入到 base_table 表。

hive> select * from base_table;
OK
1       lijie   chongqing       20191020
2       zhangshan       sz      20191020
3       lisi    shanghai        20191020
4       wangwu  usa     20191020

hive> select * from incremental_table;
OK
1       lijie   chongqing       20191020
2       zhangshan       sz      20191020
3       lisi    shanghai        20191020
4       wangwu  usa     20191020
1       lijie   chengdu 20191021
2       zhangshan       huoxing 20191021
4       wangwu  lalalala        20191021
5       xinzeng hehe    20191021

insert overwrite table base_table
    select a.id,  -- 插入 base_table 表中沒有修改的資料
           a.name,
           a.addr,
           a.dt
    from base_table a
    left join (select * from incremental_table where dt='20191021') b
    on a.id=b.id
    where b.id is null 
    union
    select c.id,   -- 插入 incremental_table 表中的增量資料，即最新資料
           c.name,
           c.addr,
           c.dt
    from (select * from incremental_table where dt='20191021') c;

hive> select * from base_table;
OK
3       lisi    shanghai        20191020
1       lijie   chengdu 20191021
2       zhangshan       huoxing 20191021
4       wangwu  lalalala        20191021
5       xinzeng hehe    20191021

3.3、方法2

將 base_table 表和 incremental_table 表 union all ，再取更新時間最新的記錄。

【可以通過視窗函式編一個序號，也可以使用 hive 的預定義屬性最近更新時間欄位】

hive> select * from base_table;
OK
1       lijie   chongqing       20191020
2       zhangshan       sz      20191020
3       lisi    shanghai        20191020
4       wangwu  usa     20191020

hive> select * from incremental_table;
OK
1       lijie   chongqing       20191020
2       zhangshan       sz      20191020
3       lisi    shanghai        20191020
4       wangwu  usa     20191020
1       lijie   chengdu 20191021
2       zhangshan       huoxing 20191021
4       wangwu  lalalala        20191021
5       xinzeng hehe    20191021

insert overwrite table base_table
select b.id,b.name,b.addr,b.dt 
from 
(
    select a.*,
           row_number() over(distribute by a.id sort by a.dt desc) as rn
    from 
    (
        select id,name,addr,dt from base_table
        union all  -- 這裡是 union all
        select id,name,addr,dt from incremental_table where dt='20191021'
    ) a
) b
where b.rn=1;

hive> select * from base_table;
OK
3       lisi    shanghai        20191020
1       lijie   chengdu 20191021
2       zhangshan       huoxing 20191021
4       wangwu  lalalala        20191021
5       xinzeng hehe    20191021

參考地址：

https://www.cnblogs.com/lxbmaomao/p/9821128.html

https://blog.csdn.net/qq_20641565/article/details/52763663

https://blog.csdn.net/qq_20641565/article/details/53164155?utm_medium=distribute.pc_relevant_t0.none-task-blog-BlogCommendFromMachineLearnPai2-1.control&depth_1-utm_source=distribute.pc_relevant_t0.none-task-blog-BlogCommendFromMachineLearnPai2-1.control

https://blog.csdn.net/ZhouyuanLinli/article/details/86638454?utm_medium=distribute.pc_relevant.none-task-blog-BlogCommendFromMachineLearnPai2-2.control&depth_1-utm_source=distribute.pc_relevant.none-task-blog-BlogCommendFromMachineLearnPai2-2.control