一、背景
mysql資料入湖後,有同事需要實時抽取iceberg v2表,想透過iceberg做分鐘級實時數倉。目前flink社群版暫不支援讀取v2表,騰訊內部版本已支援。
目前只能用Oceanus內建connector,支援flink1.13版本。讀和寫都需要用iceberg-1.1去處理,因為定製版iceberg在寫入時對iceberg格式內容做了增強,增強後才支援流式讀取。
流式讀取做加工時,因iceberg不支援,無法使用sum函式,具體內容見圖三。
流式寫入v2表
-- Sink table for streaming writes into an Iceberg v2 table.
-- Uses the 'iceberg-1.1' connector; per the accompanying notes, writes must
-- go through this connector so that the table can later be read as a stream.
CREATE TABLE `iceberg_sink` (
    `id`     INT,
    `name`   CHAR(50),
    `age`    INT,
    `weight` DOUBLE,
    -- Key declared on id; NOT ENFORCED means the engine does not validate it.
    PRIMARY KEY (`id`) NOT ENFORCED
) WITH (
    'connector' = 'iceberg-1.1',
    'incremental.sequence.mode'='true',
    -- Iceberg table format v2.
    'format-version' = '2',
    -- Upsert writes are switched on for this table.
    'write.upsert.enabled' = 'true'
    -- other properties ...
);
-- Continuously copy every column of the upstream source into the Iceberg sink.
INSERT INTO iceberg_sink
SELECT *
FROM xxx_source;
流式讀取v2表
-- Source table for streaming reads from an Iceberg v2 table.
-- Uses the same 'iceberg-1.1' connector and table options as the writer side.
CREATE TABLE `iceberg_source`(
`id` int,
`name` char(50),
`age` int,
`weight` double,
PRIMARY KEY (id) NOT ENFORCED
) WITH (
'connector' = 'iceberg-1.1',
'incremental.sequence.mode'='true',
'format-version' = '2',
-- Fix: the trailing comma below was missing, which made the WITH clause invalid.
'write.upsert.enabled' = 'true',
'connector.iceberg.starting-strategy' = 'TABLE_SCAN_THEN_INCREMENTAL'
-- connector.iceberg.starting-strategy has 4 options:
-- (1) TABLE_SCAN_THEN_INCREMENTAL: do a full table scan first, then read incrementally
-- (2) INCREMENTAL_FROM_LATEST_SNAPSHOT: read incrementally starting from the latest snapshot (inclusive)
-- (3) INCREMENTAL_FROM_EARLIEST_SNAPSHOT: read incrementally starting from the earliest snapshot (inclusive)
-- (4) INCREMENTAL_FROM_SNAPSHOT_TIMESTAMP: read incrementally starting from the snapshot at the given timestamp (inclusive).
--     If the timestamp falls between two snapshots, reading starts from the snapshot after the timestamp.
-- other properties ...
);
-- Stream rows from the Iceberg source into the downstream sink.
-- Columns are listed explicitly (the four declared on iceberg_source) so the
-- statement does not silently change if the source schema is extended.
INSERT INTO xxx_sink
SELECT
    `id`,
    `name`,
    `age`,
    `weight`
FROM iceberg_source;