PostgreSQL,SQLServer邏輯增量(通過邏輯標記update,delete)同步到Greenplum,PostgreSQL

create table t_src (  
  id int primary key,   
  info text not null,   
  is_del boolean default null,   -- 刪除標記，NULL表示未刪除，非空表示已刪除  
  mod_time timestamp not null default clock_timestamp()  -- 插入、刪除、修改的時間戳  
);

2、建立索引，加速同步

create index idx_t_src_1 on t_src(mod_time);

3、建立觸發器函式，更新、刪除資料時，更新時間戳，同時修改刪除標記位

當被刪除的記錄重新被插入時，把刪除標記改成未刪除。

create or replace function tg1_t_src() returns trigger as $$  
declare  
begin  
  NEW.mod_time := clock_timestamp();  
  select case when OLD.is_del is null and NEW.is_del = true then true else null end into NEW.is_del;     -- 如果以前這個ID被刪除過，則插入，並將is_del重新置為未刪除   
  return NEW;  
end;  
$$ language plpgsql strict;

4、建立觸發器，更新時觸發

create trigger tg1 before update on t_src for each row execute procedure tg1_t_src();

5、建立規則，當刪除記錄時，使用UPDATE代替DELETE

create rule r1 as on delete to t_src do instead update t_src set is_del=true where t_src.id=OLD.id and t_src.is_del is null;     -- 未標記為刪除的記錄is_del=null，標記為刪除.

6、檢視插入、更新、刪除是否符合預期

postgres=# insert into t_src values (1, md5(random()::text)) on conflict (id) do update set info=excluded.info;  
INSERT 0 1  
postgres=# select * from t_src where id=1;  
 id |               info               | is_del |          mod_time            
----+----------------------------------+--------+----------------------------  
  1 | 56c21963342997fd8bf80a5b542abde9 |        | 2018-05-12 08:54:19.393532  
(1 row)  
  
postgres=# insert into t_src values (1, md5(random()::text)) on conflict (id) do update set info=excluded.info;  
INSERT 0 1  
postgres=# select * from t_src where id=1;  
 id |               info               | is_del |          mod_time            
----+----------------------------------+--------+----------------------------  
  1 | 5bca407559081d6cfc1154fd0f17b6a9 |        | 2018-05-12 08:54:23.465005  
(1 row)  
  
  
postgres=# delete from t_src where id=1;  
DELETE 0  
postgres=# select * from t_src;  
 id |               info               | is_del |          mod_time            
----+----------------------------------+--------+----------------------------  
  1 | 5bca407559081d6cfc1154fd0f17b6a9 | t      | 2018-05-12 08:54:43.158809  
(1 row)

7、建立壓測指令碼

vi test.sql  
  
set id1 random(1,10000000)  
set id2 random(1,20000000)  
insert into t_src values (:id1, md5(random()::text)) on conflict (id) do update set info=excluded.info;  
delete from t_src where id=:id2;

8、壓測，高壓插入、更新、刪除動作，每秒處理13.8萬行。

pgbench -M prepared -n -r -P 1 -f ./test.sql -c 64 -j 64 -T 120  
  
  
transaction type: ./test.sql  
scaling factor: 1  
query mode: prepared  
number of clients: 64  
number of threads: 64  
duration: 120 s  
number of transactions actually processed: 16634225  
latency average = 0.462 ms  
latency stddev = 0.506 ms  
tps = 138437.925513 (including connections establishing)  
tps = 138445.093708 (excluding connections establishing)  
statement latencies in milliseconds:  
         0.002  set id1 random(1,10000000)  
         0.000  set id2 random(1,20000000)  
         0.298  insert into t_src values (:id1, md5(random()::text)) on conflict (id) do update set info=excluded.info;  
         0.163  delete from t_src where id=:id2;

目標端

1、建立目標表

create table t_dst (  
  id int primary key,   
  info text not null,   
  is_del boolean default null,   
  mod_time timestamp not null default clock_timestamp()  
);

2、建立索引

create index idx_t_dst_1 on t_dst(mod_time);

資料同步

1、源端資料增量同步到目標端

(在同一DB中模擬，後面有例子講源和目標在不同叢集的DEMO)

do language plpgsql $$  
declare  
  pos timestamp;           -- 位點  
  pre interval := `10 s`;  -- 緩衝10秒，防止空洞，根據業務層設定  
begin  
  -- 已同步位點  
  select max(mod_time) into pos from t_dst;  
    
  -- NULL表示目標端沒有資料  
  if pos is null then  
    -- 緩衝上限   
    pos := now()::timestamp;  
    insert into t_dst select * from t_src where mod_time < (pos - pre) on conflict (id) do update set   -- on conflict 合併insert,update  
      info=excluded.info, is_del=excluded.is_del, mod_time=excluded.mod_time  
      where t_dst.info is distinct from excluded.info or  
      t_dst.is_del is distinct from excluded.is_del or  
      t_dst.mod_time is distinct from excluded.mod_time;  
    return;  
  end if;  
  
  -- 同步超過位點的資料  
  insert into t_dst select * from t_src where mod_time > pos and mod_time < (now()::timestamp - pre) on conflict (id) do update set   
    info=excluded.info, is_del=excluded.is_del, mod_time=excluded.mod_time  
    where t_dst.info is distinct from excluded.info or  
    t_dst.is_del is distinct from excluded.is_del or  
    t_dst.mod_time is distinct from excluded.mod_time;  
  return;  
end;  
$$;

2、一邊壓測，一邊呼叫以上過程同步，最後達到一致性狀態，檢查一致性的SQL如下：

select sum(hashtext((t.*)::text)),count(*),sum(case is_del when true then 0 else 1 end) from t_src t;  
  
      sum       |  count  |   sum     
----------------+---------+---------  
 -2967631712018 | 8299587 | 6199359  
(1 row)  
  
  
select sum(hashtext((t.*)::text)),count(*),sum(case is_del when true then 0 else 1 end) from t_dst t;  
  
      sum       |  count  |   sum     
----------------+---------+---------  
 -2967631712018 | 8299587 | 6199359  
(1 row)

二、源PostgreSQL, 目標Greenplum, 增量複製delete,insert,update

第一種方法，DELETE使用邏輯標記，所以實際上資料並沒有刪除。

還有一種方法，可以把DELETE的記錄，MOVE到另一張表。

方法與一差不多，只是把rule改一下，改成INSERT到其他表。

略

三、源PostgreSQL, 目標Greenplum, 增量複製insert,update

上游沒有del，或者說不需要捕獲DEL操作 (DEL操作，人為在上下游同時執行SQL來刪除)

源端

1、建立表

create table tbl_src (  
  id int primary key,   
  info text not null,   
  mod_time timestamp not null default clock_timestamp()  -- update時間戳  
);

2、建立索引

create index idx_tbl_src_1 on tbl_src(mod_time);

3、建立觸發器函式，更新時，自動更新時間戳欄位

create or replace function tg1_tbl_src() returns trigger as $$  
declare  
begin  
  NEW.mod_time := clock_timestamp();  
  return NEW;  
end;  
$$ language plpgsql strict;

4、建立觸發器

create trigger tg1 before update on tbl_src for each row execute procedure tg1_tbl_src();

5、壓測

vi test.sql  
set id1 random(1,10000000)  
insert into tbl_src values (:id1, md5(random()::text)) on conflict (id) do update set info=excluded.info;  
  
  
pgbench -M prepared -n -r -P 1 -f ./test.sql -c 64 -j 64 -T 120

目標端

1、建立目標表

create table tbl_dst (  
  id int primary key,   
  info text not null,   
  mod_time timestamp not null default clock_timestamp()  
);

2、建立索引

create index idx_tbl_dst_1 on tbl_dst(mod_time);

資料同步

在同一個例項中模擬。

1、呼叫過程，同步資料。

do language plpgsql $$  
declare  
  pos timestamp;  -- 位點  
  pre interval := `10 s`;  -- 緩衝10秒，防止空洞，根據業務層設定  
begin  
  select max(mod_time) into pos from tbl_dst;  
  if pos is null then  
    pos := now()::timestamp;  
    insert into tbl_dst select * from tbl_src where mod_time < (pos - pre) on conflict (id) do update set   
      info=excluded.info, mod_time=excluded.mod_time  
      where tbl_dst.info is distinct from excluded.info or  
      tbl_dst.mod_time is distinct from excluded.mod_time;  
    return;  
  end if;  
  
  insert into tbl_dst select * from tbl_src where mod_time > pos and mod_time < (now()::timestamp - pre) on conflict (id) do update set   
    info=excluded.info, mod_time=excluded.mod_time  
    where tbl_dst.info is distinct from excluded.info or  
    tbl_dst.mod_time is distinct from excluded.mod_time;  
  return;  
end;  
$$;

2、一邊壓測，一邊呼叫以上過程同步資料，最後檢查一致性

select sum(hashtext((t.*)::text)),count(*) from tbl_src t;  
  
      sum      |  count    
---------------+---------  
 2739329060132 | 6725930  
(1 row)  
  
select sum(hashtext((t.*)::text)),count(*) from tbl_dst t;  
  
      sum      |  count    
---------------+---------  
 2739329060132 | 6725930  
(1 row)

四、源PostgreSQL, 目標Greenplum, 增量複製insert,update

同樣，不包括DELETE的複製。只複製insert和update。

由於greenplum 沒有 insert into on conflict 的功能，所以需要採用臨時表，分步實現MERGE。

同步方法

1. 目標端，HDB PG, 獲取max(mod_time)

2. 源端，PG, 拉取增量

3. 目標端，增量資料，匯入HDB PG 臨時表

4. 目標端，HDB PG ，DELETE from 目標表 using 臨時表

5. 目標端，HDB PG ，insert into 目標表 select * from 臨時表

6. 目標端，清空臨時表

目標端

1、建立臨時表

create table tbl_dst_tmp (  
  id int primary key,   
  info text not null,   
  mod_time timestamp not null default clock_timestamp()  
);

2、建立資料同步的指令碼

vi imp.sh  
  
#!/bin/bash  
  
# 1. HDB PG, 獲取max(mod_time)   
MOD_TIME=`PGPASSWORD="pwd1234" psql -h 127.0.0.1 -p 4000 -U postgres postgres -q -t -A -c "select coalesce(max(mod_time),`4714-11-24 00:00:00 BC`::timestamp) from tbl_dst"`  
  
# 2. PG, 拉取增量  
# 3. 增量資料，匯入HDB PG 臨時表  
# 使用 linux 管道同步上下游  
PGPASSWORD="pwd" psql -h 10.31.124.69 -p 4000 -U postgres postgres -c "copy (select * from tbl_src where mod_time > `$MOD_TIME`::timestamp and mod_time < (now()-`10 sec`::interval) ) to stdout" | PGPASSWORD="pwd1234" psql -h 127.0.0.1 -p 4000 -U postgres postgres -c "copy tbl_dst_tmp from stdin"  
  
# 4. HDB PG ，DELETE from 目標表 using 臨時表   
# 5. HDB PG ，insert into 目標表 select * from 臨時表   
# 6. 清空臨時表   
# 放在一個事務中，並且對臨時表加鎖保護。  
PGPASSWORD="pwd1234" psql -h 127.0.0.1 -p 4000 -U postgres postgres <<EOF  
begin;  
lock table tbl_dst_tmp in ACCESS EXCLUSIVE mode;  
delete from tbl_dst using tbl_dst_tmp where tbl_dst.id=tbl_dst_tmp.id;  
insert into tbl_dst select * from tbl_dst_tmp;  
truncate tbl_dst_tmp;  
end;  
EOF  
  
  
chmod 700 imp.sh

3、一邊壓測，一邊同步

COPY 6725930  
BEGIN  
LOCK TABLE  
DELETE 0  
INSERT 0 6725930  
TRUNCATE TABLE  
COMMIT  
  
  
COPY 0  
BEGIN  
LOCK TABLE  
DELETE 0  
INSERT 0 0  
TRUNCATE TABLE  
COMMIT  
  
  
COPY 78423  
BEGIN  
LOCK TABLE  
DELETE 52796  
INSERT 0 78423  
TRUNCATE TABLE  
COMMIT  
  
  
COPY 486817  
BEGIN  
LOCK TABLE  
DELETE 327603  
INSERT 0 486817  
TRUNCATE TABLE  
COMMIT  
  
  
COPY 2019059  
BEGIN  
LOCK TABLE  
DELETE 1381561  
INSERT 0 2019059  
TRUNCATE TABLE  
COMMIT

COPY 864687  
Timing is on.  
BEGIN  
Time: 0.149 ms  
LOCK TABLE  
Time: 0.629 ms  
DELETE 652117  
Time: 11029.147 ms (00:11.029)  
INSERT 0 864687  
Time: 10180.576 ms (00:10.181)  
TRUNCATE TABLE  
Time: 4.729 ms  
COMMIT  
Time: 53.555 ms  
  
  
  
COPY 2235200  
Timing is on.  
BEGIN  
Time: 0.178 ms  
LOCK TABLE  
Time: 0.702 ms  
DELETE 1719572  
Time: 18621.210 ms (00:18.621)  
INSERT 0 2235200  
Time: 27716.155 ms (00:27.716)  
TRUNCATE TABLE  
Time: 63.408 ms  
COMMIT  
Time: 81.915 ms  



COPY 5448790
Timing is on.
BEGIN
Time: 0.141 ms
LOCK TABLE
Time: 0.552 ms
DELETE 4486067
Time: 15297.884 ms (00:15.298)
INSERT 0 5448790
Time: 35654.723 ms (00:35.655)
TRUNCATE TABLE
Time: 1.860 ms
COMMIT
Time: 125.503 ms

4、檢查資料一致性

postgres=# select sum(hashtext((t.*)::text)),count(*) from tbl_dst t;  
      sum       |  count    
----------------+---------  
 -1351270286348 | 7548269  
(1 row)  
  
postgres=# select sum(hashtext((t.*)::text)),count(*) from tbl_src t;  
      sum       |  count    
----------------+---------  
 -1351270286348 | 7548269  
(1 row)

五、源SQL Server, 目標Greenplum, 增量複製insert,update

SQL Server有一個時間戳型別timestamp，會自動記錄資料插入，更新的時間，所以不需要用觸發器來實現標記。

SQL Server的timestamp，每個表只能建一個，並且在表上絕對唯一。8個位元組，是一個相對時間，可以與bigint互相轉換。

目標端可以使用bigint來儲存SQL Server的timestamp型別。

其他的方法與章節四類似，注意標記位的讀取、比較時需要轉換一下(timestamp, bigint)。 BIGINT 最小值為 (-9223372036854775808)::int8 。

DEMO1，位點從HDB PG讀取。

1、HDB PG, 建立臨時表

create table tbl_dst (  
  id int primary key,   
  info text not null,   
  mod_time int8 not null   -- 使用int8代替時間戳，對應ms sql的timestamp
); 

-- 臨時表

create table tbl_dst_tmp (  
  id int primary key,   
  info text not null,   
  mod_time int8 not null   -- 使用int8代替時間戳，對應ms sql的timestamp
);

2、資料同步的流程（MS SQL -> HDB PG，無法封裝到LINUX SHELL中，可能需要ETL程式介入。流程大致如下）

# 1. HDB PG, 獲取max(mod_time)   
MOD_TIME=`PGPASSWORD="pwd1234" psql -h 127.0.0.1 -p 4000 -U postgres postgres -q -t -A -c "select coalesce(max(mod_time),(-9223372036854775808)::int8) from tbl_dst"`  
  
# 2. MS SQL, 拉取增量  
select id, info, convert(bigint, mod_time) from tbl_src where convert(bigint,mod_time) > $MOD_TIME;

# 3. 增量資料，匯入HDB PG 臨時表  

insert into tbl_dst_tmp ...從步驟2來的資料...;

## 可能的話，還是用COPY更快，"copy tbl_dst_tmp from stdin"  
  
# 4. HDB PG ，DELETE from 目標表 using 臨時表   
# 5. HDB PG ，insert into 目標表 select * from 臨時表   
# 6. 清空臨時表   

# 放在一個事務中，並且對臨時表加鎖保護。  
PGPASSWORD="pwd1234" psql -h 127.0.0.1 -p 4000 -U postgres postgres <<EOF  
begin;  
lock table tbl_dst_tmp in ACCESS EXCLUSIVE mode;  
delete from tbl_dst using tbl_dst_tmp where tbl_dst.id=tbl_dst_tmp.id;  
insert into tbl_dst select * from tbl_dst_tmp;  
truncate tbl_dst_tmp;  
end;  
EOF

DEMO2, 位點記錄在MS SQL裡面，而不是從HDB PG讀取。

MS SQL, 建立任務表

create table tbl_job (
  tbl_name name primary key,
  job_time datetime,
  offset_val_low bigint,
  offset_val_up bigint
)

從任務表選擇上一次的最低位點

select offset_val_up into v1 from tbl_job where tbl_name=`tbl_src`; 

如果上一次沒有同步，則說明是第一次同步

if v1 is null then v1 := -1; end if; 

選擇本次同步的最大位點

select max(convert(bigint, mod_time)) into v2 from tbl_src; 
  
更新任務表

insert or update into tbl_job values (`tbl_src`, now(), v1, v2 ); 

匯出本次的增量資料
  
select * from tbl_src where convert(bigint, mod_time) > v1 and convert(bigint, mod_time) <= v2;

或者你還可以試試其他ETL軟體：

http://www.symmetricds.org/about/overview

https://github.com/pivotalguru/outsourcer 專業的sql server,oracle同步到greenplum的軟體

商業軟體：

https://dbconvert.com/mssql/postgresql/

https://www.convert-in.com/mss2pgs.htm#

xDB

https://www.enterprisedb.com/docs/en/6.2/repguide/EDB_Postgres_Replication_Server_Users_Guide.1.10.html#pID0E0HSK0HA