PostgreSQL家譜、族譜類應用實踐-圖式關係儲存與搜尋

德哥發表於2018-05-06

原文網址 : https://flycode.co/archives/185770

SQL

背景

最近《最強大腦》節目的國際PK賽中，來自谷歌的一位國際選手展示了他在谷歌時做的一套系統，把三國人物關係整理並展示成了一張大圖，屬於非常典型的圖式應用。

PostgreSQL非常適合於這類場景，有著豐富的SQL介面和良好的效能。下面這些都是PG在圖式搜尋方面的應用：

《小微貸款、天使投資(風控助手)業務資料庫設計(圖式搜尋圖譜分析) – 阿里雲RDS PostgreSQL, HybridDB for PostgreSQL最佳實踐》

《金融風控、公安刑偵、社會關係、人脈分析等需求分析與資料庫實現 – PostgreSQL圖資料庫場景應用》

《PostgreSQL 圖式搜尋(graph search)實踐 – 百億級圖譜，毫秒響應》

《PostgreSQL 實踐 – 內容社群(如論壇)圖式搜尋應用》

今天這篇文件與之類似，來自一位社群朋友的問題，如何儲存家族關係，並快速提取N級資訊。是不是和我之前寫的社交類使用者關係，風控類企業關係相似呢？

設計表結構

1、個人資訊表，描述個人的詳細資訊、出生年月、住址、城市、等等。因為無法列舉全個人的資訊，所以我們可以用JSON來擴充套件這個等等，是不是很爽呢。

create table tbl_p_detail  -- 個人資訊  
(  
  id int primary key,    -- 人物ID  
  info jsonb,            -- 人物描述  
  crt_time timestamp     -- 建立時間  
);

2、關係的後設資料表，例如父親，母親，丈夫，妻子，兒子，女兒，養女，繼父，乾爹，乾女兒，等等。

create table tbl_er_desc  -- 關係描述  
(  
  id int2 primary key,    -- 關係ID  
  info text  -- 描述  
);

3、關係表，這裡面為了保證查詢的準確性（或者說簡化查詢語句），我們使用雙向冗餘儲存，例如父親、兒子一對，存兩條。

create table tbl_er     -- id1是id2的誰      
(  
  c1 int references tbl_p_detail(id),    
  c2 int references tbl_p_detail(id),    
  prop int2[],             -- 可能存在多種關係，我們使用陣列儲存。這個就是邊，當然我們也可以用JSON來儲存邊。請參考我寫的另一篇文件   
  crt_time timestamp,  
  check (c1<>c2),  
  unique (c1,c2)  
  -- FOREIGN KEY (EACH ELEMENT OF prop) REFERENCES tbl_er_desc(id)  -- 陣列外來鍵, PG 11會支援，很不錯  
);

4、建立索引加速

create index idx_tbl_er_c1 on tbl_er(c1);  
create index idx_tbl_er_c2 on tbl_er(c2);

5、寫入一些測試資料

insert into tbl_p_detail select generate_series(1,10000);

insert into tbl_er values (1,2,array[10],now());  -- 比如 1是2的父親  
insert into tbl_er values (2,1,array[9],now());   -- 比如 2是1的女兒  
  
insert into tbl_er values (1,3,array[10],now());  -- 比如 1是3的父親  
insert into tbl_er values (3,1,array[9],now());   -- 比如 3是1的女兒  
  
insert into tbl_er values (5,2,array[11],now());  -- 比如 5是2的母親  
insert into tbl_er values (2,5,array[9],now());   -- 比如 2是5的女兒  
  
insert into tbl_er values (5,3,array[11],now());  -- 比如 5是3的母親  
insert into tbl_er values (3,5,array[9],now());   -- 比如 3是5的女兒  
  
  
insert into tbl_er values (4,1,array[10],now());  -- 比如 4是1的父親  
insert into tbl_er values (1,4,array[8],now());   -- 比如 1是4的兒子  
  
insert into tbl_er values (6,5,array[10],now());  -- 比如 6是5的父親  
insert into tbl_er values (5,6,array[9],now());   -- 比如 5是6的女兒  
  
insert into tbl_er values (7,1,array[11],now());  -- 比如 7是1的母親  
insert into tbl_er values (1,7,array[8],now());   -- 比如 1是7的兒子  
  
insert into tbl_er values (8,5,array[11],now());  -- 比如 8是5的母親  
insert into tbl_er values (5,8,array[9],now());   -- 比如 5是8的女兒

定義搜尋函式

1、搜尋某個使用者的N層關係資料，以及每一層的限制記錄數。（通常家族資料不會那麼恐怖，所以限制層級即可，每一層輸出所有也無所謂）

create or replace function graph_search1(      
  IN i_root int,                       -- 根據哪個節點開始搜        
  IN i_depth int  default 99999,       -- 搜尋層級、深度限制      
  IN i_limit int8 default 2000000000,  -- 限制每一層返回的記錄數      
  OUT o_path int[],                    -- 輸出：路徑, ID 組成的陣列      
  OUT o_point1 int,                    -- 輸出：點1 ID      
  OUT o_point2 int,                    -- 輸出：點2 ID      
  OUT o_link_prop int2[],              -- 輸出：當前兩點之間的連線屬性      
  OUT o_link_prop_all text,            -- 輸出：從開始到當前點的連線屬性      
  OUT o_depth int                      -- 輸出：當前深度、層級      
) returns setof record as $$      
declare      
  sql text;      
begin      
sql := format($_$      
WITH RECURSIVE search_graph(        
  c1,     -- 點1        
  c2,     -- 點2        
  prop,   -- 當前邊的屬性      
  all_prop,  -- all 邊的屬性  
  depth,  -- 當前深度，從1開始         
  path    -- 路徑，陣列儲存         
) AS (        
        select c1,c2,prop,all_prop,depth,path from (        
        SELECT                               -- ROOT節點查詢        
          g.c1,                              -- 點1        
          g.c2,                              -- 點2        
          g.prop,                            -- 邊的屬性        
	  g.prop::text as all_prop,          -- all 邊的屬性  
          1 depth,                           -- 初始深度=1        
          ARRAY[g.c1, g.c2] path             -- 初始路徑        
        FROM tbl_er AS g         
        WHERE         
          c1 = %s                            -- ROOT節點=?        
          limit %s                           -- 每個層級限制多少條？        
        ) t        
      UNION ALL        
        select c1,c2,prop,all_prop,depth,path from (        
        SELECT                               -- 遞迴子句         
          g.c1,                              -- 點1        
          g.c2,                              -- 點2        
          g.prop,                            -- 邊的屬性     
	  sg.all_prop || g.prop::text as all_prop,    -- all 邊的屬性  
          sg.depth + 1 depth,                   -- 深度+1        
          sg.path || g.c2 path                 -- 路徑中加入新的點        
        FROM tbl_er AS g, search_graph AS sg    -- 迴圈 INNER JOIN        
        WHERE         
          g.c1 = sg.c2                       -- 遞迴JOIN條件        
          AND (g.c2 <> ALL(sg.path))                      -- 防止迴圈     , 是否迴圈，判斷新點是否已經在之前的路徑中   
          AND sg.depth <= %s                 -- 搜尋深度=?          
          limit %s                           -- 每個層級限制多少條？       
        ) t        
)        
SELECT path as o_path, c1 as o_point1, c2 as o_point2, prop as o_link_prop, all_prop as o_link_prop_all, depth as o_depth      
FROM search_graph;                           -- 查詢遞迴表，可以加LIMIT輸出，也可以使用遊標       
$_$, i_root, i_limit, i_depth, i_limit      
);      
      
return query execute sql;      
      
end;      
$$ language plpgsql strict;

例子

postgres=# select * from graph_search1(1);  
  o_path   | o_point1 | o_point2 | o_link_prop | o_link_prop_all | o_depth   
-----------+----------+----------+-------------+-----------------+---------  
 {1,2}     |        1 |        2 | {10}        | {10}            |       1  
 {1,3}     |        1 |        3 | {10}        | {10}            |       1  
 {1,4}     |        1 |        4 | {8}         | {8}             |       1  
 {1,7}     |        1 |        7 | {8}         | {8}             |       1  
 {1,2,5}   |        2 |        5 | {9}         | {10}{9}         |       2  
 {1,3,5}   |        3 |        5 | {9}         | {10}{9}         |       2  
 {1,2,5,8} |        5 |        8 | {9}         | {10}{9}{9}      |       3  
 {1,2,5,6} |        5 |        6 | {9}         | {10}{9}{9}      |       3  
 {1,2,5,3} |        5 |        3 | {11}        | {10}{9}{11}     |       3  
 {1,3,5,8} |        5 |        8 | {9}         | {10}{9}{9}      |       3  
 {1,3,5,6} |        5 |        6 | {9}         | {10}{9}{9}      |       3  
 {1,3,5,2} |        5 |        2 | {11}        | {10}{9}{11}     |       3  
(12 rows)  
  
Time: 1.120 ms

2、定義類似的搜尋函式，但是返回遊標。（家族圖譜關係沒有那麼多，所以不需要遊標返回，如果是社交類圖譜，資料量大，建議用遊標返回。）

create or replace function graph_search2(      
  IN i_root int,                       -- 根據哪個節點開始搜        
  IN i_res name,                       -- 遊標名      
  IN i_depth int  default 99999,       -- 搜尋層級、深度限制      
  IN i_limit int8 default 2000000000   -- 限制每一層返回的記錄數      
) returns refcursor as $$      
declare      
  sql text;      
  res refcursor := i_res;      
begin      
sql := format($_$      
WITH RECURSIVE search_graph(        
  c1,     -- 點1        
  c2,     -- 點2        
  prop,   -- 當前邊的屬性     
  all_prop,  -- all 邊的屬性   
  depth,  -- 當前深度，從1開始         
  path   -- 路徑，陣列儲存         
) AS (        
        select c1,c2,prop,all_prop,depth,path from (        
        SELECT                               -- ROOT節點查詢        
          g.c1,                              -- 點1        
          g.c2,                              -- 點2        
          g.prop,                            -- 邊的屬性      
	  g.prop::text as all_prop,          -- all 邊的屬性	    
          1 depth,                           -- 初始深度=1        
          ARRAY[g.c1, g.c2] path             -- 初始路徑    
        FROM tbl_er AS g         
        WHERE         
          c1 = %s                            -- ROOT節點=?        
          limit %s                           -- 每個層級限制多少條？        
        ) t        
      UNION ALL        
        select c1,c2,prop,all_prop,depth,path from (        
        SELECT                               -- 遞迴子句         
          g.c1,                              -- 點1        
          g.c2,                              -- 點2        
          g.prop,                            -- 邊的屬性        
	  sg.all_prop || g.prop::text as all_prop,    -- all 邊的屬性  
          sg.depth + 1 depth,                -- 深度+1        
          sg.path || g.c2 path                 -- 路徑中加入新的點          
        FROM tbl_er AS g, search_graph AS sg      -- 迴圈 INNER JOIN        
        WHERE         
          g.c1 = sg.c2                       -- 遞迴JOIN條件        
          AND (g.c2 <> ALL(sg.path))         -- 防止迴圈 , 是否迴圈，判斷新點是否已經在之前的路徑中        
          AND sg.depth <= %s                 -- 搜尋深度=?          
          limit %s                           -- 每個層級限制多少條？                   
        ) t        
)        
SELECT path as o_path, c1 as o_point1, c2 as o_point2, prop as o_link_prop, all_prop as o_link_prop_all, depth as o_depth      
FROM search_graph;                           -- 查詢遞迴表，可以加LIMIT輸出，也可以使用遊標       
$_$, i_root, i_limit, i_depth, i_limit      
);      
      
open res for execute sql;      
return res;      
      
end;      
$$ language plpgsql strict;

使用舉例

postgres=# begin;  
BEGIN  
Time: 0.096 ms  
postgres=# select * from graph_search2(1,`a`);  
 graph_search2   
---------------  
 a  
(1 row)  
  
Time: 1.110 ms  
postgres=# fetch 10 from a;  
  o_path   | o_point1 | o_point2 | o_link_prop | o_link_prop_all | o_depth   
-----------+----------+----------+-------------+-----------------+---------  
 {1,2}     |        1 |        2 | {10}        | {10}            |       1  
 {1,3}     |        1 |        3 | {10}        | {10}            |       1  
 {1,4}     |        1 |        4 | {8}         | {8}             |       1  
 {1,7}     |        1 |        7 | {8}         | {8}             |       1  
 {1,2,5}   |        2 |        5 | {9}         | {10}{9}         |       2  
 {1,3,5}   |        3 |        5 | {9}         | {10}{9}         |       2  
 {1,2,5,8} |        5 |        8 | {9}         | {10}{9}{9}      |       3  
 {1,2,5,6} |        5 |        6 | {9}         | {10}{9}{9}      |       3  
 {1,2,5,3} |        5 |        3 | {11}        | {10}{9}{11}     |       3  
 {1,3,5,8} |        5 |        8 | {9}         | {10}{9}{9}      |       3  
(10 rows)  
  
Time: 0.256 ms  
  
postgres=# fetch 10 from a;  
  o_path   | o_point1 | o_point2 | o_link_prop | o_link_prop_all | o_depth   
-----------+----------+----------+-------------+-----------------+---------  
 {1,3,5,6} |        5 |        6 | {9}         | {10}{9}{9}      |       3  
 {1,3,5,2} |        5 |        2 | {11}        | {10}{9}{11}     |       3  
(2 rows)  
  
Time: 0.103 ms

3、定義兩點的最短路徑，比如搜尋張三與李四的關係。當關系超過N級時，不返還，以免出現長時間搜尋。（當然，我們也可以定義語句超時，當執行時間超過N秒後退出）

create or replace function graph_search3(      
  IN i_p1 int,                       -- 節點1        
  IN i_p2 int,                       -- 節點2        
  IN i_depth int  default 99999,     -- 搜尋層級、深度限制        
  OUT o_path int[],                    -- 輸出：路徑, ID 組成的陣列      
  OUT o_link_prop text,                -- 輸出：當前兩點之間的連線屬性      
  OUT o_depth int                      -- 輸出：當前深度、層級      
) returns record as $$      
declare      
  sql text;      
begin      
sql := format($_$      
WITH RECURSIVE search_graph(        
  c1,   -- 點1        
  c2,   -- 點2        
  prop, -- 邊的屬性        
  depth, -- 深度，從1開始        
  path  -- 路徑，陣列儲存            
) AS (        
        SELECT    -- ROOT節點查詢        
          g.c1,   -- 點1        
          g.c2,   -- 點2        
          g.prop::text,   -- 邊的屬性        
          1 depth,        -- 初始深度=1        
          ARRAY[g.c1, g.c2] path             -- 初始路徑        
        FROM tbl_er AS g         
        WHERE         
          c1 = %s         -- ROOT節點=?      --（最短路徑的起點）        
      UNION ALL        
        SELECT     -- 遞迴子句        
          g.c1,    -- 點1        
          g.c2,    -- 點2        
          sg.prop::text || g.prop::text,          -- 邊的屬性        
          sg.depth + 1 as depth,    -- 深度+1        
          sg.path || g.c2 path                 -- 路徑中加入新的點      
        FROM tbl_er AS g, search_graph AS sg   -- 迴圈 INNER JOIN        
        WHERE         
          g.c1 = sg.c2         -- 遞迴JOIN條件        
          AND (g.c2 <> ALL(sg.path))        -- 防止迴圈 , 是否迴圈，判斷新點是否已經在之前的路徑中          
          AND sg.depth <= %s    -- 搜尋深度=?        
    
)        
SELECT      
  path as o_path,    
  prop as o_link_prop,    
  depth as o_depth    
FROM search_graph        
  where c2 = %s   -- 最短路徑的終點        
  limit 1         -- 查詢遞迴表，可以加LIMIT輸出，也可以使用遊標        
$_$, i_p1, i_depth, i_p2);    
    
execute sql into o_path,o_link_prop,o_depth;    
return;    
end;    
$$ language plpgsql strict;