本節介紹了插入資料時與WAL相關的處理邏輯，主要的函式是heap_insert。

一、資料結構

型別與巨集定義
包括Pointer/Page/XLogRecPtr等型別定義,以及XLOG_HEAP_INSERT/PageGetLSN/PageSetLSN等巨集

typedef char* Pointer;//generic byte pointer
typedef Pointer Page;//a page is referred to through a Pointer (i.e. a char *)

#define XLOG_HEAP_INSERT   0x00

/*
 * Pointer to a location in the XLOG.  These pointers are 64 bits wide,
 * because we don't want them ever to overflow.
 */
typedef uint64 XLogRecPtr;

/*
 * Additional macros for access to page headers. (Beware multiple evaluation
 * of the arguments!)
 *
 * PageGetLSN reads, and PageSetLSN stores, the pd_lsn field of the page
 * header; "page" is cast to PageHeader to reach that field.
 */
#define PageGetLSN(page) \
    PageXLogRecPtrGet(((PageHeader) (page))->pd_lsn)
#define PageSetLSN(page, lsn) \
    PageXLogRecPtrSet(((PageHeader) (page))->pd_lsn, lsn)

xl_heap_insert
插入時需要獲知的資訊結構

/*
 * xl_heap_insert/xl_heap_multi_insert flag values, 8 bits are available.
 */
/* PD_ALL_VISIBLE was cleared */
#define XLH_INSERT_ALL_VISIBLE_CLEARED          (1<<0)
/* NOTE(review): not set by heap_insert; presumably marks the last record of a multi-insert -- confirm */
#define XLH_INSERT_LAST_IN_MULTI                (1<<1)
/* set by heap_insert when the HEAP_INSERT_SPECULATIVE option is given */
#define XLH_INSERT_IS_SPECULATIVE               (1<<2)
/* set by heap_insert when the relation is logically logged */
#define XLH_INSERT_CONTAINS_NEW_TUPLE           (1<<3)

/* This is what we need to know about insert */
//main data of a heap-insert WAL record, as filled in by heap_insert
typedef struct xl_heap_insert
{
    //item offset number of the tuple within its page
    OffsetNumber offnum;        /* inserted tuple's offset */
    uint8       flags;          //XLH_INSERT_* flag bits

    /* xl_heap_header & TUPLE DATA in backup block 0 */
    //i.e. heap_insert registers them as data of block reference 0, not as part of this struct
} xl_heap_insert;
//size through the end of the last field (flags); offsetof is used so any trailing struct padding is not counted
#define SizeOfHeapInsert    (offsetof(xl_heap_insert, flags) + sizeof(uint8))

xl_heap_header
PG不會在WAL中儲存插入/更新元組的全部固定部分(HeapTupleHeaderData),
xl_heap_header只包含其中必須寫入WAL的欄位(t_infomask2/t_infomask/t_hoff).

/*
 * We don't store the whole fixed part (HeapTupleHeaderData) of an inserted
 * or updated tuple in WAL; we can save a few bytes by reconstructing the
 * fields that are available elsewhere in the WAL record, or perhaps just
 * plain needn't be reconstructed.  These are the fields we must store.
 * NOTE: t_hoff could be recomputed, but we may as well store it because
 * it will come for free due to alignment considerations.
 */
typedef struct xl_heap_header
{
    uint16      t_infomask2;//copied from the tuple header's t_infomask2
    uint16      t_infomask;//copied from the tuple header's t_infomask
    uint8       t_hoff;//copied from the tuple header's t_hoff
} xl_heap_header;
//size through the end of the last field (t_hoff)
#define SizeOfHeapHeader    (offsetof(xl_heap_header, t_hoff) + sizeof(uint8))

二、原始碼解讀

heap_insert的主要邏輯是插入元組到堆中,其中存在對WAL(XLog)進行處理的部分.

/*
 *  heap_insert     - insert tuple into a heap
 *
 * The new tuple is stamped with current transaction ID and the specified
 * command ID.
 *
 * If the HEAP_INSERT_SKIP_WAL option is specified, the new tuple is not
 * logged in WAL, even for a non-temp relation.  Safe usage of this behavior
 * requires that we arrange that all new tuples go into new pages not
 * containing any tuples from other transactions, and that the relation gets
 * fsync'd before commit.  (See also heap_sync() comments)
 *
 * The HEAP_INSERT_SKIP_FSM option is passed directly to
 * RelationGetBufferForTuple, which see for more info.
 *
 * HEAP_INSERT_FROZEN should only be specified for inserts into
 * relfilenodes created during the current subtransaction and when
 * there are no prior snapshots or pre-existing portals open.
 * This causes rows to be frozen, which is an MVCC violation and
 * requires explicit options chosen by user.
 *
 * HEAP_INSERT_SPECULATIVE is used on so-called "speculative insertions",
 * which can be backed out afterwards without aborting the whole transaction.
 * Other sessions can wait for the speculative insertion to be confirmed,
 * turning it into a regular tuple, or aborted, as if it never existed.
 * Speculatively inserted tuples behave as "value locks" of short duration,
 * used to implement INSERT .. ON CONFLICT.
 *
 * Note that most of these options will be applied when inserting into the
 * heap's TOAST table, too, if the tuple requires any out-of-line data.  Only
 * HEAP_INSERT_SPECULATIVE is explicitly ignored, as the toast data does not
 * partake in speculative insertion.
 *
 * The BulkInsertState object (if any; bistate can be NULL for default
 * behavior) is also just passed through to RelationGetBufferForTuple.
 *
 * The return value is the OID assigned to the tuple (either here or by the
 * caller), or InvalidOid if no OID.  The header fields of *tup are updated
 * to match the stored tuple; in particular tup->t_self receives the actual
 * TID where the tuple was stored.  But note that any toasting of fields
 * within the tuple data is NOT reflected into *tup.
 */
/*
 * Parameters:
 *     relation - the relation (table) to insert into
 *     tup      - the heap tuple (header plus data), i.e. the row
 *     cid      - command ID
 *     options  - HEAP_INSERT_* option bits
 *     bistate  - bulk-insert state; may be NULL
 * Returns:
 *     the OID of the inserted tuple, HeapTupleGetOid(tup) -- not the OID of
 *     the table
 */
Oid
heap_insert(Relation relation, HeapTuple tup, CommandId cid,
            int options, BulkInsertState bistate)
{
    TransactionId xid = GetCurrentTransactionId();//current transaction id
    HeapTuple   heaptup;//the tuple actually stored; may be a private copy of tup
    Buffer      buffer;//buffer holding the target heap page
    Buffer      vmbuffer = InvalidBuffer;//visibility-map buffer, if one gets pinned
    bool        all_visible_cleared = false;//set when PD_ALL_VISIBLE is cleared below

    /*
     * Fill in tuple header fields, assign an OID, and toast the tuple if
     * necessary.
     *
     * Note: below this point, heaptup is the data we actually intend to store
     * into the relation; tup is the caller's original untoasted data.
     */
    //prepare the tuple for insertion (see the comment above)
    heaptup = heap_prepare_insert(relation, tup, xid, cid, options);

    /*
     * Find buffer to insert this tuple into.  If the page is all visible,
     * this will also pin the requisite visibility map page.
     */
    //obtain the target buffer; vmbuffer is passed by address so the callee can fill it in
    buffer = RelationGetBufferForTuple(relation, heaptup->t_len,
                                       InvalidBuffer, options, bistate,
                                       &vmbuffer, NULL);

    /*
     * We're about to do the actual insert -- but check for conflict first, to
     * avoid possibly having to roll back work we've just done.
     *
     * This is safe without a recheck as long as there is no possibility of
     * another process scanning the page between this check and the insert
     * being visible to the scan (i.e., an exclusive buffer content lock is
     * continuously held from this point until the tuple insert is visible).
     *
     * For a heap insert, we only need to check for table-level SSI locks. Our
     * new tuple can't possibly conflict with existing tuple locks, and heap
     * page locks are only consolidated versions of tuple locks; they do not
     * lock "gaps" as index page locks do.  So we don't need to specify a
     * buffer when making the call, which makes for a faster check.
     */
    //check for serializable (SSI) conflicts
    CheckForSerializableConflictIn(relation, NULL, InvalidBuffer);

    /* NO EREPORT(ERROR) from here till changes are logged */
    //enter the critical section
    START_CRIT_SECTION();
    //place the tuple on the page; the last argument tells whether this is a speculative insertion
    RelationPutHeapTuple(relation, buffer, heaptup,
                         (options & HEAP_INSERT_SPECULATIVE) != 0);
    //if the page was marked all-visible
    if (PageIsAllVisible(BufferGetPage(buffer)))
    {
        //clear the flag on the page and in the visibility map, and remember that for the WAL record
        all_visible_cleared = true;
        PageClearAllVisible(BufferGetPage(buffer));
        visibilitymap_clear(relation,
                            ItemPointerGetBlockNumber(&(heaptup->t_self)),
                            vmbuffer, VISIBILITYMAP_VALID_BITS);
    }

    /*
     * XXX Should we set PageSetPrunable on this page ?
     *
     * The inserting transaction may eventually abort thus making this tuple
     * DEAD and hence available for pruning. Though we don't want to optimize
     * for aborts, if no other tuple in this page is UPDATEd/DELETEd, the
     * aborted tuple will never be pruned until next vacuum is triggered.
     *
     * If you do add PageSetPrunable here, add it in heap_xlog_insert too.
     */
    //mark the buffer dirty
    MarkBufferDirty(buffer);

    /* XLOG stuff */
    //write a WAL record unless WAL is explicitly skipped or the relation does not need it
    if (!(options & HEAP_INSERT_SKIP_WAL) && RelationNeedsWAL(relation))
    {
        xl_heap_insert xlrec;
        xl_heap_header xlhdr;
        XLogRecPtr  recptr;//uint64
        Page        page = BufferGetPage(buffer);//page held in the buffer
        uint8       info = XLOG_HEAP_INSERT;//XLOG_HEAP_INSERT -> 0x00
        int         bufflags = 0;

        /*
         * If this is a catalog, we need to transmit combocids to properly
         * decode, so log that as well.
         */
        if (RelationIsAccessibleInLogicalDecoding(relation))
            log_heap_new_cid(relation, heaptup);

        /*
         * If this is the single and first tuple on page, we can reinit the
         * page instead of restoring the whole thing.  Set flag, and hide
         * buffer references from XLogInsert.
         */
        if (ItemPointerGetOffsetNumber(&(heaptup->t_self)) == FirstOffsetNumber &&
            PageGetMaxOffsetNumber(page) == FirstOffsetNumber)
        {
            info |= XLOG_HEAP_INIT_PAGE;
            bufflags |= REGBUF_WILL_INIT;
        }
        //offset number of the new item within the page
        xlrec.offnum = ItemPointerGetOffsetNumber(&heaptup->t_self);
        xlrec.flags = 0;//start with no flags set
        if (all_visible_cleared)
            xlrec.flags |= XLH_INSERT_ALL_VISIBLE_CLEARED;
        if (options & HEAP_INSERT_SPECULATIVE)
            xlrec.flags |= XLH_INSERT_IS_SPECULATIVE;
        Assert(ItemPointerGetBlockNumber(&heaptup->t_self) == BufferGetBlockNumber(buffer));


        /*
         * For logical decoding, we need the tuple even if we're doing a full
         * page write, so make sure it's included even if we take a full-page
         * image. (XXX We could alternatively store a pointer into the FPW).
         *
         * NOTE(review): the gdb session later in this document shows this
         * condition ending in "&&" (heapam.c:2559), so the traced build
         * appears to test a further condition that this listing omits --
         * confirm against the actual source.
         */
        if (RelationIsLogicallyLogged(relation))
        {
            xlrec.flags |= XLH_INSERT_CONTAINS_NEW_TUPLE;
            bufflags |= REGBUF_KEEP_DATA;
        }

        XLogBeginInsert();//begin assembling the WAL record
        XLogRegisterData((char *) &xlrec, SizeOfHeapInsert);//register xlrec as record data
        //copy the tuple-header fields that must be logged
        xlhdr.t_infomask2 = heaptup->t_data->t_infomask2;
        xlhdr.t_infomask = heaptup->t_data->t_infomask;
        xlhdr.t_hoff = heaptup->t_data->t_hoff;

        /*
         * note we mark xlhdr as belonging to buffer; if XLogInsert decides to
         * write the whole page to the xlog, we don't need to store
         * xl_heap_header in the xlog.
         */
        XLogRegisterBuffer(0, buffer, REGBUF_STANDARD | bufflags);//register the buffer as block 0
        XLogRegisterBufData(0, (char *) &xlhdr, SizeOfHeapHeader);//tuple header fields
        /* PG73FORMAT: write bitmap [+ padding] [+ oid] + data */
        XLogRegisterBufData(0,
                            (char *) heaptup->t_data + SizeofHeapTupleHeader,
                            heaptup->t_len - SizeofHeapTupleHeader);//tuple contents after the fixed header

        /* filtering by origin on a row level is much more efficient */
        //hence request that the record include the origin (XLOG_INCLUDE_ORIGIN)
        XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN);
        //insert the WAL record; recptr receives the returned XLogRecPtr
        recptr = XLogInsert(RM_HEAP_ID, info);
        //stamp the page with that LSN
        PageSetLSN(page, recptr);
    }
    //leave the critical section
    END_CRIT_SECTION();
    //unlock and release the heap buffer; release the visibility-map buffer if one was pinned
    UnlockReleaseBuffer(buffer);
    if (vmbuffer != InvalidBuffer)
        ReleaseBuffer(vmbuffer);

    /*
     * If tuple is cachable, mark it for invalidation from the caches in case
     * we abort.  Note it is OK to do this after releasing the buffer, because
     * the heaptup data structure is all in local memory, not in the shared
     * buffer.
     */
    //register cache invalidation for the tuple
    CacheInvalidateHeapTuple(relation, heaptup, NULL);

    /* Note: speculative insertions are counted too, even if aborted later */
    //count one inserted tuple in the statistics
    pgstat_count_heap_insert(relation, 1);

    /*
     * If heaptup is a private copy, release it.  Don't forget to copy t_self
     * back to the caller's image, too.
     */
    if (heaptup != tup)
    {
        tup->t_self = heaptup->t_self;
        heap_freetuple(heaptup);
    }

    return HeapTupleGetOid(tup);
}

三、跟蹤分析

測試指令碼如下


-- Hash-partitioned table with six partitions (modulus 6, remainders 0-5)
drop table if exists t_wal_partition;
create table t_wal_partition (c1 int not null,c2  varchar(40),c3 varchar(40)) partition by hash(c1);
create table t_wal_partition_1 partition of t_wal_partition for values with (modulus 6,remainder 0);
create table t_wal_partition_2 partition of t_wal_partition for values with (modulus 6,remainder 1);
create table t_wal_partition_3 partition of t_wal_partition for values with (modulus 6,remainder 2);
create table t_wal_partition_4 partition of t_wal_partition for values with (modulus 6,remainder 3);
create table t_wal_partition_5 partition of t_wal_partition for values with (modulus 6,remainder 4);
create table t_wal_partition_6 partition of t_wal_partition for values with (modulus 6,remainder 5);

-- Insert one row through the parent table; it is routed to a partition by the hash of c1
-- NOTE(review): 'HAHS0' below looks like a typo for 'HASH0'; it does not affect the trace
-- delete from t_wal_partition where c1 = 0;
insert into t_wal_partition(c1,c2,c3) VALUES(0,'HASH0','HAHS0');

啟動gdb,設定斷點,進入heap_insert

(gdb) b heap_insert
Breakpoint 1 at 0x4df4d1: file heapam.c, line 2449.
(gdb) c
Continuing.

Breakpoint 1, heap_insert (relation=0x7f9c470a8bd8, tup=0x2908850, cid=0, options=0, bistate=0x0) at heapam.c:2449
2449        TransactionId xid = GetCurrentTransactionId();

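構造Heap Tuple資料/獲取相應的Buffer(102號)
(注意:下面輸出中t_tableOid = 1247,對應系統表pg_type,可見此次斷點命中的是對系統表的插入(應為執行create table時觸發),而非對t_wal_partition的插入;heap_insert中與WAL相關的處理流程相同.)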

2449        TransactionId xid = GetCurrentTransactionId();
(gdb) n
2452        Buffer      vmbuffer = InvalidBuffer;
(gdb) 
2453        bool        all_visible_cleared = false;
(gdb) 
2462        heaptup = heap_prepare_insert(relation, tup, xid, cid, options);
(gdb) 
2468        buffer = RelationGetBufferForTuple(relation, heaptup->t_len,
(gdb) 
2487        CheckForSerializableConflictIn(relation, NULL, InvalidBuffer);
(gdb) p *heaptup
$1 = {t_len = 172, t_self = {ip_blkid = {bi_hi = 65535, bi_lo = 65535}, ip_posid = 0}, t_tableOid = 1247, 
  t_data = 0x2908868}
(gdb) p buffer
$2 = 102

插入到Buffer中,標記Buffer為Dirty

(gdb) n
2490        START_CRIT_SECTION();
(gdb) 
2493                             (options & HEAP_INSERT_SPECULATIVE) != 0);
(gdb) 
2492        RelationPutHeapTuple(relation, buffer, heaptup,
(gdb) 
2495        if (PageIsAllVisible(BufferGetPage(buffer)))
(gdb) 
2515        MarkBufferDirty(buffer);
(gdb) 
2518        if (!(options & HEAP_INSERT_SKIP_WAL) && RelationNeedsWAL(relation))
(gdb)

進入WAL處理部分,獲取page等資訊
(注意:Page即char *,p *page輸出的1只是頁面的第一個位元組(位於頁頭pd_lsn中),並非頁號;如需檢視頁號,可執行p BufferGetBlockNumber(buffer))

(gdb) n
2523            Page        page = BufferGetPage(buffer);
(gdb) 
2524            uint8       info = XLOG_HEAP_INSERT;
(gdb) 
2525            int         bufflags = 0;
(gdb) 
2531            if (RelationIsAccessibleInLogicalDecoding(relation))
(gdb) p page
$3 = (Page) 0x7f9c1a505380 "\001"
(gdb) p *page
$4 = 1 '\001'
(gdb) p info
$5 = 0 '\000'

設定xl_heap_insert結構體
其中偏移=34,標記位為0x0

(gdb) n
2539            if (ItemPointerGetOffsetNumber(&(heaptup->t_self)) == FirstOffsetNumber &&
(gdb) 
2546            xlrec.offnum = ItemPointerGetOffsetNumber(&heaptup->t_self);
(gdb) 
2547            xlrec.flags = 0;
(gdb) 
2548            if (all_visible_cleared)
(gdb) 
2550            if (options & HEAP_INSERT_SPECULATIVE)
(gdb) 
2552            Assert(ItemPointerGetBlockNumber(&heaptup->t_self) == BufferGetBlockNumber(buffer));
(gdb) 
2559            if (RelationIsLogicallyLogged(relation) &&
(gdb) 
(gdb) p xlrec
$6 = {offnum = 34, flags = 0 '\000'}

開始插入WAL,並註冊相關資料

(gdb) 
2566            XLogBeginInsert();
(gdb) n
2567            XLogRegisterData((char *) &xlrec, SizeOfHeapInsert);
(gdb) 
2569            xlhdr.t_infomask2 = heaptup->t_data->t_infomask2;
(gdb) n
2570            xlhdr.t_infomask = heaptup->t_data->t_infomask;
(gdb) n
2571            xlhdr.t_hoff = heaptup->t_data->t_hoff;
(gdb) 
2578            XLogRegisterBuffer(0, buffer, REGBUF_STANDARD | bufflags);
(gdb) 
2579            XLogRegisterBufData(0, (char *) &xlhdr, SizeOfHeapHeader);
(gdb) 
2583                                heaptup->t_len - SizeofHeapTupleHeader);
(gdb) 
2581            XLogRegisterBufData(0,
(gdb) 
2582                                (char *) heaptup->t_data + SizeofHeapTupleHeader,
(gdb) 
2581            XLogRegisterBufData(0,
(gdb) 
2586            XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN);
(gdb) 
2588            recptr = XLogInsert(RM_HEAP_ID, info);
(gdb) 
(gdb) p xlhdr
$8 = {t_infomask2 = 30, t_infomask = 2057, t_hoff = 32 ' '}

呼叫XLogInsert,插入WAL,並設定LSN

(gdb) n
2590            PageSetLSN(page, recptr);
(gdb) p recptr
$9 = 5411089336
(gdb) p info
$10 = 0 '\000'
(gdb)

執行其他後續操作,完成函式呼叫

(gdb) n
2593        END_CRIT_SECTION();
(gdb) 
2595        UnlockReleaseBuffer(buffer);
(gdb) 
2596        if (vmbuffer != InvalidBuffer)
(gdb) 
2605        CacheInvalidateHeapTuple(relation, heaptup, NULL);
(gdb) 
2608        pgstat_count_heap_insert(relation, 1);
(gdb) 
2614        if (heaptup != tup)
(gdb) 
2620        return HeapTupleGetOid(tup);
(gdb) 
2621    }
(gdb)

函式XLogBeginInsert/XLogRegisterData/XLogRegisterBuffer/XLogRegisterBufData/XLogInsert下一節再行介紹.

四、參考資料

Write Ahead Logging — WAL
PostgreSQL 原始碼解讀（4）- 插入資料#3（heap_insert）
PgSQL · 特性分析 · 資料庫崩潰恢復（上）
PgSQL · 特性分析 · 資料庫崩潰恢復（下）
PgSQL · 特性分析 · Write-Ahead Logging機制淺析
PostgreSQL WAL Buffers, Clog Buffers Deep Dive

PostgreSQL 原始碼解讀（104）- WAL#1（Insert & WAL-heap_i...

一、資料結構

二、原始碼解讀

三、跟蹤分析

四、參考資料

相關文章