PostgreSQL刪除表中重複資料

T1YSL發表於2022-01-19

一、測試資料建立

postgres=# create table test_chongfu(id int not null default null,name char(50) default null,class int  default null);
CREATE TABLE
postgres=# insert into test_chongfu values(1,'AA',1);
INSERT 0 1
postgres=# insert into test_chongfu values(2,'AB',8);
INSERT 0 1
postgres=# insert into test_chongfu values(3,'AB',8);
INSERT 0 1
postgres=# insert into test_chongfu values(4,'BB',5);
INSERT 0 1
postgres=# insert into test_chongfu values(5,'BD',7);
INSERT 0 1
postgres=# insert into test_chongfu values(6,'BD',7);
INSERT 0 1
postgres=# select * from test_chongfu;
 id |                        name                        | class
----+----------------------------------------------------+-------
  1 | AA                                                 |     1
  2 | AB                                                 |     8
  3 | AB                                                 |     8
  4 | BB                                                 |     5
  5 | BD                                                 |     7
  6 | BD                                                 |     7
(6 rows)
postgres=# alter table test_chongfu add constraint pk1 primary key ("id");
ALTER TABLE
postgres=# \d test_chongfu
                 Table "public.test_chongfu"
 Column |     Type      | Collation | Nullable |   Default
--------+---------------+-----------+----------+--------------
 id     | integer       |           | not null |
 name   | character(50) |           |          | NULL::bpchar
 class  | integer       |           |          |
Indexes:
    "pk1" PRIMARY KEY, btree (id)

二、存在唯一標識情況

如果刪除重複資料的表裡有除ctid外的唯一標識,可以利用這一列,我這裡為id列。

postgres=# select * from test_chongfu;
 id |                        name                        | class
----+----------------------------------------------------+-------
  1 | AA                                                 |     1
  2 | AB                                                 |     8
  3 | AB                                                 |     8
  4 | BB                                                 |     5
  5 | BD                                                 |     7
  6 | BD                                                 |     7
(6 rows)
postgres=# select * from test_chongfu where id not in (select min(id)  from test_chongfu group by name,class);
 id |                        name                        | class
----+----------------------------------------------------+-------
  3 | AB                                                 |     8
  6 | BD                                                 |     7
(2 rows)

透過id列,檢視根據name和class兩列匹配的重複列。執行如下語句刪除重複資料。

postgres=# delete from test_chongfu where id not in (select min(id)  from test_chongfu group by name,class);
DELETE 2
postgres=# select * from test_chongfu;
 id |                        name                        | class
----+----------------------------------------------------+-------
  1 | AA                                                 |     1
  2 | AB                                                 |     8
  4 | BB                                                 |     5
  5 | BD                                                 |     7
(4 rows)

三、不存在唯一標識情況

恢復初始測試環境。如果表裡不存在唯一標識列,可以透過ctid來進行。ctid類似於oracle的rowid,但是形式不同。

postgres=# select * from test_chongfu;
 id |                        name                        | class
----+----------------------------------------------------+-------
  1 | AA                                                 |     1
  2 | AB                                                 |     8
  3 | AB                                                 |     8
  4 | BB                                                 |     5
  5 | BD                                                 |     7
  6 | BD                                                 |     7
(6 rows)
postgres=# select * from test_chongfu where ctid not in( select min(ctid) from test_chongfu group by name,class);
 id |                        name                        | class
----+----------------------------------------------------+-------
  3 | AB                                                 |     8
  6 | BD                                                 |     7
(2 rows)
postgres=# delete from test_chongfu where ctid not in( select min(ctid) from test_chongfu group by name,class);
DELETE 2
postgres=# select * from test_chongfu;
 id |                        name                        | class
----+----------------------------------------------------+-------
  1 | AA                                                 |     1
  2 | AB                                                 |     8
  4 | BB                                                 |     5
  5 | BD                                                 |     7
(4 rows)

四、not in最佳化

in本來效率就很低,not in的效率更加低,如果資料量很大的情況,可能執行時間會特別長。因此可以嘗試把查詢到的唯一資料重新插入到一張新表裡,然後把原來舊錶刪掉,最後修改去重資料後的新表名為舊錶名。

postgres=# create table tab_new as select * from test_chongfu where ctid in(select min(ctid) from test_chongfu group by name,class);
SELECT 4
postgres=# select * from tab_new;
 id |                        name                        | class
----+----------------------------------------------------+-------
  1 | AA                                                 |     1
  2 | AB                                                 |     8
  4 | BB                                                 |     5
  5 | BD                                                 |     7
(4 rows)


來自 “ ITPUB部落格 ” ,連結:http://blog.itpub.net/69990629/viewspace-2853046/,如需轉載,請註明出處,否則將追究法律責任。

相關文章