PostgreSQL 原始碼解讀(89)- 查詢語句#74(SeqNext函式#2)

husthxd發表於2018-11-22

本節是SeqNext函式介紹的第二部分,主要介紹了SeqNext->heap_getnext函式的實現邏輯。

一、資料結構

TupleTableSlot
Tuple Table Slot,用於儲存元組相關資訊

/*
 * base tuple table slot type
 *
 * A tuple table slot holds tuple-related information.
 *
 * Each FIELDNO_* macro below carries the zero-based position of the member
 * declared immediately after it, so the member order must not be changed
 * without updating the macros.
 */
typedef struct TupleTableSlot
{
    NodeTag     type;           /* node tag */
#define FIELDNO_TUPLETABLESLOT_FLAGS 1
    uint16      tts_flags;      /* Boolean states */
#define FIELDNO_TUPLETABLESLOT_NVALID 2
    AttrNumber  tts_nvalid;     /* # of valid values in tts_values */
    const TupleTableSlotOps *const tts_ops; /* implementation of slot */
#define FIELDNO_TUPLETABLESLOT_TUPLEDESCRIPTOR 4
    TupleDesc   tts_tupleDescriptor;    /* slot's tuple descriptor */
#define FIELDNO_TUPLETABLESLOT_VALUES 5
    Datum      *tts_values;     /* current per-attribute values */
#define FIELDNO_TUPLETABLESLOT_ISNULL 6
    bool       *tts_isnull;     /* current per-attribute isnull flags */
    MemoryContext tts_mcxt;     /* slot itself is in this context */
} TupleTableSlot;


/*
 * Tuple descriptor: the attribute count and type identity of a tuple,
 * followed by one FormData_pg_attribute entry per attribute.  TupleDesc is a
 * pointer to this struct.
 */
typedef struct tupleDesc
{
    int         natts;          /* number of attributes in the tuple */
    Oid         tdtypeid;       /* composite type ID for tuple type */
    int32       tdtypmod;       /* typmod for tuple type */
    int         tdrefcount;     /* reference count, or -1 if not counting */
    TupleConstr *constr;        /* constraints, or NULL if none */
    /* attrs[N] is the description of Attribute Number N+1 */
    FormData_pg_attribute attrs[FLEXIBLE_ARRAY_MEMBER];
}  *TupleDesc;

HeapTuple
HeapTupleData是一個位於記憶體中、用來指向元組的資料結構
HeapTuple是指向HeapTupleData的指標

/*
 * HeapTupleData is an in-memory data structure that points to a tuple.
 *
 * There are several ways in which this data structure is used:
 *
 * * Pointer to a tuple in a disk buffer: t_data points directly into the
 *   buffer (which the code had better be holding a pin on, but this is not
 *   reflected in HeapTupleData itself).
 *
 * * Pointer to nothing: t_data is NULL.  This is used as a failure indication
 *   in some functions.
 *
 * * Part of a palloc'd tuple: the HeapTupleData itself and the tuple
 *   form a single palloc'd chunk.  t_data points to the memory location
 *   immediately following the HeapTupleData struct (at offset HEAPTUPLESIZE).
 *   This is the output format of heap_form_tuple and related routines.
 *
 * * Separately allocated tuple: t_data points to a palloc'd chunk that
 *   is not adjacent to the HeapTupleData.  (This case is deprecated since
 *   it's difficult to tell apart from case #1.  It should be used only in
 *   limited contexts where the code knows that case #1 will never apply.)
 *
 * * Separately allocated minimal tuple: t_data points MINIMAL_TUPLE_OFFSET
 *   bytes before the start of a MinimalTuple.  As with the previous case,
 *   this can't be told apart from case #1 by inspection; code setting up
 *   or destroying this representation has to know what it's doing.
 *
 * t_len should always be valid, except in the pointer-to-nothing case.
 * t_self and t_tableOid should be valid if the HeapTupleData points to
 * a disk buffer, or if it represents a copy of a tuple on disk.  They
 * should be explicitly set invalid in manufactured tuples.
 *
 * FIELDNO_HEAPTUPLEDATA_DATA is the zero-based position of t_data within
 * the struct; keep it in step with the member order.
 */
typedef struct HeapTupleData
{
    uint32      t_len;          /* length of *t_data */
    ItemPointerData t_self;     /* SelfItemPointer */
    Oid         t_tableOid;     /* table the tuple came from */
#define FIELDNO_HEAPTUPLEDATA_DATA 3
    HeapTupleHeader t_data;     /* -> tuple header and data */
} HeapTupleData;

/* HeapTuple is a pointer to a HeapTupleData */
typedef HeapTupleData *HeapTuple;

/* size of HeapTupleData rounded up by MAXALIGN */
#define HEAPTUPLESIZE   MAXALIGN(sizeof(HeapTupleData))


HeapScanDesc
HeapScanDesc是指向HeapScanDescData結構體的指標

/*
 * Descriptor for a heap scan: the scan parameters, the state fixed when the
 * scan is initialized, and the current position (block, buffer, tuple).
 * HeapScanDesc is a pointer to this struct.
 */
typedef struct HeapScanDescData
{
    /* scan parameters */
    Relation    rs_rd;          /* heap relation descriptor */
    Snapshot    rs_snapshot;    /* snapshot to see */
    int         rs_nkeys;       /* number of scan keys */
    ScanKey     rs_key;         /* array of scan key descriptors */
    bool        rs_bitmapscan;  /* true if this is really a bitmap scan */
    bool        rs_samplescan;  /* true if this is really a sample scan */
    bool        rs_pageatatime; /* verify visibility page-at-a-time? */
    bool        rs_allow_strat; /* allow or disallow use of access strategy */
    bool        rs_allow_sync;  /* allow or disallow use of syncscan */
    bool        rs_temp_snap;   /* unregister snapshot at scan end? */

    /* state set up at initscan time */
    BlockNumber rs_nblocks;     /* total number of blocks in rel */
    BlockNumber rs_startblock;  /* block # to start at */
    BlockNumber rs_numblocks;   /* max number of blocks to scan */
    /* rs_numblocks is usually InvalidBlockNumber, meaning "scan whole rel" */
    
    BufferAccessStrategy rs_strategy;   /* access strategy for reads */
    bool        rs_syncscan;    /* report location to syncscan logic? */

    /* scan current state */
    bool        rs_inited;      /* false = scan not init'd yet */
    HeapTupleData rs_ctup;      /* current tuple in scan, if any */
    BlockNumber rs_cblock;      /* current block # in scan, if any */
    Buffer      rs_cbuf;        /* current buffer in scan, if any */
    /* NB: if rs_cbuf is not InvalidBuffer, we hold a pin on that buffer */

    ParallelHeapScanDesc rs_parallel;   /* parallel scan information */

    /* these fields only used in page-at-a-time mode and for bitmap scans */
    int         rs_cindex;      /* current tuple's index in vistuples */
    int         rs_ntuples;     /* number of visible tuples on page */
    OffsetNumber rs_vistuples[MaxHeapTuplesPerPage];    /* their offsets */
} HeapScanDescData;

/* struct definitions appear in relscan.h */
typedef struct HeapScanDescData *HeapScanDesc;

ScanState
ScanState針對表示底層關係(relation)掃描的節點型別,對PlanState進行了擴充。

/* ----------------
 *   ScanState information
 *
 *      ScanState extends PlanState for node types that represent
 *      scans of an underlying relation.  It can also be used for nodes
 *      that scan the output of an underlying plan node --- in that case,
 *      only ScanTupleSlot is actually useful, and it refers to the tuple
 *      retrieved from the subplan.
 *
 *      currentRelation    relation being scanned (NULL if none)
 *      currentScanDesc    current scan descriptor for scan (NULL if none)
 *      ScanTupleSlot      pointer to slot in tuple table holding scan tuple
 * ----------------
 */
typedef struct ScanState
{
    PlanState   ps;             /* its first field is NodeTag */
    Relation    ss_currentRelation; /* relation being scanned (NULL if none) */
    HeapScanDesc ss_currentScanDesc;    /* current scan descriptor (NULL if none) */
    TupleTableSlot *ss_ScanTupleSlot;   /* slot in tuple table holding scan tuple */
} ScanState;

/* ----------------
 *   SeqScanState information
 *
 *      Embeds a ScanState as its first member and adds the size of the
 *      parallel heap scan descriptor.
 * ----------------
 */
typedef struct SeqScanState
{
    ScanState   ss;             /* its first field is NodeTag */
    Size        pscan_len;      /* size of parallel heap scan descriptor */
} SeqScanState;

二、原始碼解讀

heap_getnext函式從資料表中獲取下一個tuple。若ScanDesc->rs_pageatatime為T(true),則呼叫heapgettup_pagemode函式,以page-at-a-time模式提取元組;否則呼叫heapgettup函式,以常規模式提取。


/*
 * heap_getnext - return the next tuple of a heap scan, or NULL at end of scan
 *
 * The fetch is delegated to heapgettup_pagemode() when the scan runs in
 * page-at-a-time mode and to heapgettup() otherwise; both leave their result
 * in scan->rs_ctup, with t_data == NULL meaning "no more tuples".  The
 * returned pointer refers to scan->rs_ctup itself, not to a copy.
 */
HeapTuple
heap_getnext(HeapScanDesc scan, ScanDirection direction)
{
    /* Note: no locking manipulations needed */
    HEAPDEBUG_1;                /* heap_getnext( info ) */

    /* pick the fetch routine that matches the scan mode */
    if (!scan->rs_pageatatime)
        heapgettup(scan, direction, scan->rs_nkeys, scan->rs_key);
    else
        heapgettup_pagemode(scan, direction,
                            scan->rs_nkeys, scan->rs_key);

    if (scan->rs_ctup.t_data != NULL)
    {
        /*
         * We have a new current scan tuple: count the fetch against the
         * relation and hand back the scan's own tuple slot.
         */
        HEAPDEBUG_3;            /* heap_getnext returning tuple */

        pgstat_count_heap_getnext(scan->rs_rd);

        return &(scan->rs_ctup);
    }

    /* nothing left to return */
    HEAPDEBUG_2;                /* heap_getnext returning EOS */
    return NULL;
}


/* ----------------
 *      heapgettup_pagemode - fetch next heap tuple in page-at-a-time mode
 *
 *      Same API as heapgettup, but used in page-at-a-time mode
 *
 * The internal logic is much the same as heapgettup's too, but there are some
 * differences: we do not take the buffer content lock (that only needs to
 * happen inside heapgetpage), and we iterate through just the tuples listed
 * in rs_vistuples[] rather than all tuples on the page.  Notice that
 * lineindex is 0-based, where the corresponding loop variable lineoff in
 * heapgettup is 1-based.
 *
 * The result is left in scan->rs_ctup; t_data is set to NULL when there is
 * no tuple to return.
 * ----------------
 */
static void
heapgettup_pagemode(HeapScanDesc scan, /* scan descriptor */
                    ScanDirection dir, /* scan direction */
                    int nkeys, /* number of scan keys */
                    ScanKey key) /* scan keys; NULL skips key testing */
{
    HeapTuple   tuple = &(scan->rs_ctup); /* result tuple (scan->rs_ctup is a HeapTupleData) */
    bool        backward = ScanDirectionIsBackward(dir); /* true for a backward scan */
    BlockNumber page; /* block number being scanned */
    bool        finished; /* set when no pages remain */
    Page        dp; /* page held in scan->rs_cbuf */
    int         lines; /* visible tuples on the page (rs_ntuples) */
    int         lineindex; /* 0-based index into rs_vistuples[] */
    OffsetNumber lineoff; /* item offset read from rs_vistuples[] */
    int         linesleft; /* visible tuples still to examine on this page */
    ItemId      lpp; /* item id of the current tuple */

    /*
     * calculate next starting lineindex, given scan direction
     */
    if (ScanDirectionIsForward(dir))
    {
        /* forward scan */
        if (!scan->rs_inited)
        {
            /* first call for this scan */
            /*
             * return null immediately if relation is empty
             */
            if (scan->rs_nblocks == 0 || scan->rs_numblocks == 0)
            {
                Assert(!BufferIsValid(scan->rs_cbuf));
                tuple->t_data = NULL;
                return;
            }
            /* parallel scan? */
            if (scan->rs_parallel != NULL)
            {
                /* initialize the parallel scan's start block, then claim a page */
                heap_parallelscan_startblock_init(scan);

                page = heap_parallelscan_nextpage(scan);

                /* Other processes might have already finished the scan. */
                if (page == InvalidBlockNumber)
                {
                    Assert(!BufferIsValid(scan->rs_cbuf));
                    tuple->t_data = NULL;
                    return;
                }
            }
            else
                page = scan->rs_startblock; /* first page */
            /* read the page into scan->rs_cbuf */
            heapgetpage(scan, page);
            /* start at the first visible tuple */
            lineindex = 0;
            /* mark the scan as initialized */
            scan->rs_inited = true;
        }
        else
        {
            /* continue from previously returned page/tuple */
            page = scan->rs_cblock; /* current page */
            lineindex = scan->rs_cindex + 1; /* entry after the one last returned */
        }
        /* get the page that lives in the current buffer */
        dp = BufferGetPage(scan->rs_cbuf);
        /* check whether the snapshot is too old for this page */
        TestForOldSnapshot(scan->rs_snapshot, scan->rs_rd, dp);
        lines = scan->rs_ntuples;
        /* page and lineindex now reference the next visible tid */
        linesleft = lines - lineindex;
    }
    else if (backward)
    {
        /* backward scan */
        /* backward parallel scan not supported */
        Assert(scan->rs_parallel == NULL);

        if (!scan->rs_inited)
        {
            /*
             * return null immediately if relation is empty
             */
            if (scan->rs_nblocks == 0 || scan->rs_numblocks == 0)
            {
                Assert(!BufferIsValid(scan->rs_cbuf));
                tuple->t_data = NULL;
                return;
            }

            /*
             * Disable reporting to syncscan logic in a backwards scan; it's
             * not very likely anyone else is doing the same thing at the same
             * time, and much more likely that we'll just bollix things for
             * forward scanners.
             */
            scan->rs_syncscan = false;
            /*
             * start from last page of the scan: the block just before
             * rs_startblock, or the relation's final block when the scan
             * starts at block 0
             */
            if (scan->rs_startblock > 0)
                page = scan->rs_startblock - 1;
            else
                page = scan->rs_nblocks - 1;
            /* read the page into scan->rs_cbuf */
            heapgetpage(scan, page);
        }
        else
        {
            /* continue from previously returned page/tuple */
            page = scan->rs_cblock; /* current page */
        }
        /* get the page that lives in the current buffer */
        dp = BufferGetPage(scan->rs_cbuf);
        /* check whether the snapshot is too old for this page */
        TestForOldSnapshot(scan->rs_snapshot, scan->rs_rd, dp);
        /* number of visible tuples on the page */
        lines = scan->rs_ntuples;

        if (!scan->rs_inited)
        {
            /* first call: begin at the last visible tuple of the page */
            lineindex = lines - 1;
            scan->rs_inited = true;
        }
        else
        {
            /* entry before the one last returned */
            lineindex = scan->rs_cindex - 1;
        }
        /* page and lineindex now reference the previous visible tid */

        linesleft = lineindex + 1;
    }
    else
    {
        /* neither forward nor backward */
        /*
         * ``no movement'' scan direction: refetch prior tuple
         */
        if (!scan->rs_inited)
        {
            Assert(!BufferIsValid(scan->rs_cbuf));
            tuple->t_data = NULL;
            return;
        }

        /* re-read the tuple's page if it is not the current one */
        page = ItemPointerGetBlockNumber(&(tuple->t_self));
        if (page != scan->rs_cblock)
            heapgetpage(scan, page);

        /* Since the tuple was previously fetched, needn't lock page here */
        dp = BufferGetPage(scan->rs_cbuf);
        TestForOldSnapshot(scan->rs_snapshot, scan->rs_rd, dp);
        lineoff = ItemPointerGetOffsetNumber(&(tuple->t_self));
        lpp = PageGetItemId(dp, lineoff);
        Assert(ItemIdIsNormal(lpp));

        tuple->t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
        tuple->t_len = ItemIdGetLength(lpp);

        /* check that rs_cindex is in sync */
        Assert(scan->rs_cindex < scan->rs_ntuples);
        Assert(lineoff == scan->rs_vistuples[scan->rs_cindex]);

        return;
    }

    /*
     * advance the scan until we find a qualifying tuple or run out of stuff
     * to scan
     */
    for (;;)
    {
        while (linesleft > 0) /* visible tuples remain on this page */
        {
            /* item offset of the next visible tuple */
            lineoff = scan->rs_vistuples[lineindex];
            /* its item id */
            lpp = PageGetItemId(dp, lineoff);
            Assert(ItemIdIsNormal(lpp));
            /* point the result at the tuple inside the buffer page */
            tuple->t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
            /* tuple length */
            tuple->t_len = ItemIdGetLength(lpp);
            /* record the tuple's own tid (block number, offset) */
            ItemPointerSet(&(tuple->t_self), page, lineoff);

            /*
             * if current tuple qualifies, return it.
             */
            if (key != NULL)
            {
                /* scan keys were supplied: test the tuple against them */
                bool        valid;

                HeapKeyTest(tuple, RelationGetDescr(scan->rs_rd),
                            nkeys, key, valid);
                if (valid)
                {
                    /* match: remember the position and return */
                    scan->rs_cindex = lineindex;
                    return;
                }
            }
            else
            {
                /* no scan keys: every visible tuple qualifies */
                scan->rs_cindex = lineindex;
                return;
            }

            /*
             * otherwise move to the next item on the page
             */
            --linesleft; /* one fewer tuple left on this page */
            if (backward)
                --lineindex; /* backward: previous entry */
            else
                ++lineindex; /* forward: next entry */
        }

        /*
         * if we get here, it means we've exhausted the items on this page and
         * it's time to move to the next.
         */
        if (backward) /* backward scan */
        {
            /*
             * Done when the page just processed is rs_startblock, or when a
             * limited rs_numblocks count reaches zero.  Otherwise step back
             * one block, wrapping from block 0 to the last block.
             */
            finished = (page == scan->rs_startblock) ||
                (scan->rs_numblocks != InvalidBlockNumber ? --scan->rs_numblocks == 0 : false);
            if (page == 0)
                page = scan->rs_nblocks;
            page--;
        }
        else if (scan->rs_parallel != NULL)
        {
            /* parallel scan: claim the next page, if any is left */
            page = heap_parallelscan_nextpage(scan);
            finished = (page == InvalidBlockNumber);
        }
        else
        {
            /*
             * forward scan: step to the next block, wrapping to block 0 past
             * the end.  Done when we arrive back at rs_startblock, or when a
             * limited rs_numblocks count reaches zero.
             */
            page++;
            if (page >= scan->rs_nblocks)
                page = 0;
            finished = (page == scan->rs_startblock) ||
                (scan->rs_numblocks != InvalidBlockNumber ? --scan->rs_numblocks == 0 : false);

            /*
             * Report our new scan position for synchronization purposes. We
             * don't do that when moving backwards, however. That would just
             * mess up any other forward-moving scanners.
             *
             * Note: we do this before checking for end of scan so that the
             * final state of the position hint is back at the start of the
             * rel.  That's not strictly necessary, but otherwise when you run
             * the same query multiple times the starting position would shift
             * a little bit backwards on every invocation, which is confusing.
             * We don't guarantee any specific ordering in general, though.
             */
            if (scan->rs_syncscan)
                ss_report_location(scan->rs_rd, page);
        }

        /*
         * return NULL if we've exhausted all the pages
         */
        if (finished)
        {
            /* drop the buffer pin and reset the scan position */
            if (BufferIsValid(scan->rs_cbuf))
                ReleaseBuffer(scan->rs_cbuf);
            scan->rs_cbuf = InvalidBuffer;
            scan->rs_cblock = InvalidBlockNumber;
            tuple->t_data = NULL;
            scan->rs_inited = false;
            return;
        }
        /* read the next page and set up to walk it from the proper end */
        heapgetpage(scan, page);

        dp = BufferGetPage(scan->rs_cbuf);
        TestForOldSnapshot(scan->rs_snapshot, scan->rs_rd, dp);
        lines = scan->rs_ntuples;
        linesleft = lines;
        if (backward)
            lineindex = lines - 1;
        else
            lineindex = 0; /* lineindex is 0-based */
    }
}



/* ----------------
 *      heapgettup - fetch next heap tuple
 *
 *      Initialize the scan if not already done; then advance to the next
 *      tuple as indicated by "dir"; return the next tuple in scan->rs_ctup,
 *      or set scan->rs_ctup.t_data = NULL if no more tuples.
 *
 * dir == NoMovementScanDirection means "re-fetch the tuple indicated
 * by scan->rs_ctup".
 *
 * Note: the reason nkeys/key are passed separately, even though they are
 * kept in the scan descriptor, is that the caller may not want us to check
 * the scankeys.
 *
 * Note: when we fall off the end of the scan in either direction, we
 * reset rs_inited.  This means that a further request with the same
 * scan direction will restart the scan, which is a bit odd, but a
 * request with the opposite scan direction will start a fresh scan
 * in the proper direction.  The latter is required behavior for cursors,
 * while the former case is generally undefined behavior in Postgres
 * so we don't care too much.
 * ----------------
 */
static void
heapgettup(HeapScanDesc scan, /* scan descriptor */
           ScanDirection dir, /* scan direction */
           int nkeys, /* number of scan keys */
           ScanKey key) /* scan keys; NULL skips key testing */
{
    HeapTuple   tuple = &(scan->rs_ctup); /* result tuple */
    Snapshot    snapshot = scan->rs_snapshot; /* snapshot used for visibility tests */
    bool        backward = ScanDirectionIsBackward(dir); /* true for a backward scan */
    BlockNumber page;
    bool        finished;
    Page        dp;
    int         lines;
    OffsetNumber lineoff;
    int         linesleft;
    ItemId      lpp;

    /*
     * calculate next starting lineoff, given scan direction
     */
    if (ScanDirectionIsForward(dir))
    {
        /* forward scan; same structure as in heapgettup_pagemode */
        if (!scan->rs_inited)
        {
            /*
             * return null immediately if relation is empty
             */
            if (scan->rs_nblocks == 0 || scan->rs_numblocks == 0)
            {
                Assert(!BufferIsValid(scan->rs_cbuf));
                tuple->t_data = NULL;
                return;
            }
            if (scan->rs_parallel != NULL)
            {
                heap_parallelscan_startblock_init(scan);

                page = heap_parallelscan_nextpage(scan);

                /* Other processes might have already finished the scan. */
                if (page == InvalidBlockNumber)
                {
                    Assert(!BufferIsValid(scan->rs_cbuf));
                    tuple->t_data = NULL;
                    return;
                }
            }
            else
                page = scan->rs_startblock; /* first page */
            heapgetpage(scan, page);
            lineoff = FirstOffsetNumber;    /* first offnum */
            scan->rs_inited = true;
        }
        else
        {
            /* continue from previously returned page/tuple */
            page = scan->rs_cblock; /* current page */
            lineoff =           /* next offnum */
                OffsetNumberNext(ItemPointerGetOffsetNumber(&(tuple->t_self)));
        }

        /* unlike page-at-a-time mode, the buffer is share-locked here */
        LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);

        dp = BufferGetPage(scan->rs_cbuf);
        TestForOldSnapshot(snapshot, scan->rs_rd, dp);
        lines = PageGetMaxOffsetNumber(dp);
        /* page and lineoff now reference the physically next tid */

        /* lineoff is 1-based: items lineoff..lines remain */
        linesleft = lines - lineoff + 1;
    }
    else if (backward)
    {
        /* backward scan; same structure as in heapgettup_pagemode */
        /* backward parallel scan not supported */
        Assert(scan->rs_parallel == NULL);

        if (!scan->rs_inited)
        {
            /*
             * return null immediately if relation is empty
             */
            if (scan->rs_nblocks == 0 || scan->rs_numblocks == 0)
            {
                Assert(!BufferIsValid(scan->rs_cbuf));
                tuple->t_data = NULL;
                return;
            }

            /*
             * Disable reporting to syncscan logic in a backwards scan; it's
             * not very likely anyone else is doing the same thing at the same
             * time, and much more likely that we'll just bollix things for
             * forward scanners.
             */
            scan->rs_syncscan = false;
            /* start from last page of the scan */
            if (scan->rs_startblock > 0)
                page = scan->rs_startblock - 1;
            else
                page = scan->rs_nblocks - 1;
            heapgetpage(scan, page);
        }
        else
        {
            /* continue from previously returned page/tuple */
            page = scan->rs_cblock; /* current page */
        }
        /*
         * Take a share lock on the buffer; this differs from page-at-a-time
         * mode, which does not lock here.
         */
        LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);

        dp = BufferGetPage(scan->rs_cbuf);
        TestForOldSnapshot(snapshot, scan->rs_rd, dp);
        /* highest item offset on the page */
        lines = PageGetMaxOffsetNumber(dp);

        if (!scan->rs_inited)
        {
            lineoff = lines;    /* final offnum */
            scan->rs_inited = true;
        }
        else
        {
            lineoff =           /* previous offnum */
                OffsetNumberPrev(ItemPointerGetOffsetNumber(&(tuple->t_self)));
        }
        /* page and lineoff now reference the physically previous tid */

        /* lineoff is 1-based: items lineoff down to 1 remain */
        linesleft = lineoff;
    }
    else
    {
        /*
         * ``no movement'' scan direction: refetch prior tuple
         */
        if (!scan->rs_inited)
        {
            Assert(!BufferIsValid(scan->rs_cbuf));
            tuple->t_data = NULL;
            return;
        }

        /* re-read the tuple's page if it is not the current one */
        page = ItemPointerGetBlockNumber(&(tuple->t_self));
        if (page != scan->rs_cblock)
            heapgetpage(scan, page);

        /* Since the tuple was previously fetched, needn't lock page here */
        dp = BufferGetPage(scan->rs_cbuf);
        TestForOldSnapshot(snapshot, scan->rs_rd, dp);
        lineoff = ItemPointerGetOffsetNumber(&(tuple->t_self));
        lpp = PageGetItemId(dp, lineoff);
        Assert(ItemIdIsNormal(lpp));

        tuple->t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
        tuple->t_len = ItemIdGetLength(lpp);

        return;
    }

    /*
     * advance the scan until we find a qualifying tuple or run out of stuff
     * to scan
     */
    lpp = PageGetItemId(dp, lineoff);
    for (;;)
    {
        while (linesleft > 0)
        {
            /* only normal item ids are examined; others are stepped over */
            if (ItemIdIsNormal(lpp))
            {
                bool        valid;

                tuple->t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
                tuple->t_len = ItemIdGetLength(lpp);
                ItemPointerSet(&(tuple->t_self), page, lineoff);

                /*
                 * if current tuple qualifies, return it.
                 */
                /* visibility test against the snapshot */
                valid = HeapTupleSatisfiesVisibility(tuple,
                                                     snapshot,
                                                     scan->rs_cbuf);
                /* check for a serializable conflict */
                CheckForSerializableConflictOut(valid, scan->rs_rd, tuple,
                                                scan->rs_cbuf, snapshot);

                /* visible tuples are then tested against the scan keys, if any */
                if (valid && key != NULL)
                    HeapKeyTest(tuple, RelationGetDescr(scan->rs_rd),
                                nkeys, key, valid);

                if (valid)
                {
                    /* release the content lock before returning the tuple */
                    LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);
                    return;
                }
            }

            /*
             * otherwise move to the next item on the page
             */
            --linesleft; /* one fewer item left on this page */
            if (backward)
            {
                --lpp;          /* move back in this page's ItemId array */
                --lineoff;
            }
            else
            {
                ++lpp;          /* move forward in this page's ItemId array */
                ++lineoff;
            }
        }

        /*
         * if we get here, it means we've exhausted the items on this page and
         * it's time to move to the next.
         */
        /* release the content lock on the page we are leaving */
        LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);

        /*
         * advance to next/prior page and detect end of scan
         */
        if (backward)
        {
            finished = (page == scan->rs_startblock) ||
                (scan->rs_numblocks != InvalidBlockNumber ? --scan->rs_numblocks == 0 : false);
            if (page == 0)
                page = scan->rs_nblocks;
            page--;
        }
        else if (scan->rs_parallel != NULL)
        {
            page = heap_parallelscan_nextpage(scan);
            finished = (page == InvalidBlockNumber);
        }
        else
        {
            page++;
            if (page >= scan->rs_nblocks)
                page = 0;
            finished = (page == scan->rs_startblock) ||
                (scan->rs_numblocks != InvalidBlockNumber ? --scan->rs_numblocks == 0 : false);

            /*
             * Report our new scan position for synchronization purposes. We
             * don't do that when moving backwards, however. That would just
             * mess up any other forward-moving scanners.
             *
             * Note: we do this before checking for end of scan so that the
             * final state of the position hint is back at the start of the
             * rel.  That's not strictly necessary, but otherwise when you run
             * the same query multiple times the starting position would shift
             * a little bit backwards on every invocation, which is confusing.
             * We don't guarantee any specific ordering in general, though.
             */
            if (scan->rs_syncscan)
                ss_report_location(scan->rs_rd, page);
        }

        /*
         * return NULL if we've exhausted all the pages
         */
        if (finished)
        {
            if (BufferIsValid(scan->rs_cbuf))
                ReleaseBuffer(scan->rs_cbuf);
            scan->rs_cbuf = InvalidBuffer;
            scan->rs_cblock = InvalidBlockNumber;
            tuple->t_data = NULL;
            scan->rs_inited = false;
            return;
        }

        heapgetpage(scan, page);
        /* share-lock the newly read buffer before examining it */
        LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);

        dp = BufferGetPage(scan->rs_cbuf);
        TestForOldSnapshot(snapshot, scan->rs_rd, dp);
        lines = PageGetMaxOffsetNumber((Page) dp);
        linesleft = lines;
        if (backward)
        {
            lineoff = lines;
            lpp = PageGetItemId(dp, lines);
        }
        else
        {
            lineoff = FirstOffsetNumber;
            lpp = PageGetItemId(dp, FirstOffsetNumber);
        }
    }
}


//--------------------------------------------------- heapgetpage

/*
 * heapgetpage - subroutine for heapgettup()
 *
 * Reads and pins the requested page of the relation being scanned, leaving
 * the buffer in scan->rs_cbuf and the block number in scan->rs_cblock.  A
 * buffer still held from an earlier call is released first.
 *
 * In page-at-a-time mode (scan->rs_pageatatime) it goes on to work out
 * which tuples on the page are visible to the scan's snapshot; their offset
 * numbers are stored in scan->rs_vistuples[] and their count in
 * scan->rs_ntuples.
 */
void
heapgetpage(HeapScanDesc scan, BlockNumber page)
{
    Buffer      buf;
    Snapshot    snap;
    Page        pg;
    int         maxoff;
    int         nvisible = 0;
    OffsetNumber offnum;
    ItemId      itemid;
    bool        page_all_visible;

    Assert(page < scan->rs_nblocks);

    /* Drop the buffer left over from the previous page, if there is one. */
    if (BufferIsValid(scan->rs_cbuf))
    {
        ReleaseBuffer(scan->rs_cbuf);
        scan->rs_cbuf = InvalidBuffer;
    }

    /*
     * Check for interrupts at least once per page.  Checks made at higher
     * code levels cannot stop a seqscan that runs into many pages' worth of
     * consecutive dead tuples.
     */
    CHECK_FOR_INTERRUPTS();

    /* Read the page using the scan's selected strategy. */
    buf = ReadBufferExtended(scan->rs_rd, MAIN_FORKNUM, page,
                             RBM_NORMAL, scan->rs_strategy);
    scan->rs_cbuf = buf;
    scan->rs_cblock = page;

    /* Outside page-at-a-time mode there is nothing more to do here. */
    if (!scan->rs_pageatatime)
        return;

    snap = scan->rs_snapshot;

    /* Prune and repair fragmentation for the whole page, if possible. */
    heap_page_prune_opt(scan->rs_rd, buf);

    /*
     * The buffer content must be share-locked while tuple visibility is
     * examined.  Afterwards, the tuples found to be visible are guaranteed
     * good for as long as the buffer pin is held.
     */
    LockBuffer(buf, BUFFER_LOCK_SHARE);

    pg = BufferGetPage(buf);
    TestForOldSnapshot(snap, scan->rs_rd, pg);
    maxoff = PageGetMaxOffsetNumber(pg);

    /*
     * When the page's all-visible flag says every tuple is visible to
     * everyone, the per-tuple visibility tests can be skipped.
     *
     * The shortcut is not taken for snapshots taken during recovery.  In
     * hot standby, a tuple already visible to all transactions on the
     * master may still be invisible to a read-only transaction on the
     * standby.  Index-only scans cope with that because they rely solely
     * on the visibility map, whose SET operation is WAL-logged together
     * with a cut-off XID; the standby waits for (or aborts) transactions
     * that might not see some tuple on the page.  A heap scan, however,
     * looks at the page-level PD_ALL_VISIBLE flag, and it is not certain
     * that flag can be trusted the same way: it might get propagated
     * without being explicitly WAL-logged, e.g. via a full page write.
     * Until that is proven safe, each tuple is checked the hard way.
     */
    page_all_visible = PageIsAllVisible(pg) && !snap->takenDuringRecovery;

    /* Walk every line pointer on the page and collect the visible tuples. */
    itemid = PageGetItemId(pg, FirstOffsetNumber);
    for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum++, itemid++)
    {
        HeapTupleData tup;
        bool        visible;

        /* Only normal line pointers are examined. */
        if (!ItemIdIsNormal(itemid))
            continue;

        /* Build a transient tuple header describing this item. */
        tup.t_data = (HeapTupleHeader) PageGetItem((Page) pg, itemid);
        tup.t_len = ItemIdGetLength(itemid);
        tup.t_tableOid = RelationGetRelid(scan->rs_rd);
        ItemPointerSet(&(tup.t_self), page, offnum);

        visible = page_all_visible
            ? true
            : HeapTupleSatisfiesVisibility(&tup, snap, buf);

        CheckForSerializableConflictOut(visible, scan->rs_rd, &tup,
                                        buf, snap);

        if (visible)
            scan->rs_vistuples[nvisible++] = offnum;
    }

    /* Visibility has been decided; the content lock can be released. */
    LockBuffer(buf, BUFFER_LOCK_UNLOCK);

    Assert(nvisible <= MaxHeapTuplesPerPage);
    scan->rs_ntuples = nvisible;
}

//--------------------------------------------------- PageGetMaxOffsetNumber

/*
 * PageGetMaxOffsetNumber
 *      Yields the highest offset number in use on the given page.
 *      Offset numbers are 1-based, so the value is also the number
 *      of items on the page.
 *
 *      NOTE: for a page that is not initialized (pd_lower == 0) the
 *      result must be zero to ensure sane behavior.  Guaranteeing that
 *      means the argument may be evaluated twice.
 */
#define PageGetMaxOffsetNumber(page) \
    (((PageHeader) (page))->pd_lower > SizeOfPageHeaderData ? \
     ((((PageHeader) (page))->pd_lower - SizeOfPageHeaderData) \
      / sizeof(ItemIdData)) : \
     0)


//--------------------------------------------------- TestForOldSnapshot
/*
 * Check whether the given snapshot is too old to have safely read the given
 * page from the given table.  If so, a "snapshot too old" error is thrown
 * (by TestForOldSnapshot_impl).
 *
 * This test generally needs to be performed after every BufferGetPage() call
 * that is executed as part of a scan.  It is not needed for calls made in
 * order to modify the page (for example, to find the right place to insert a
 * new index tuple, or for vacuuming).  It may also be omitted where calls to
 * lower-level functions will already have performed the test.
 *
 * A NULL snapshot argument is allowed and causes a fast return without
 * error; this supports call sites that can be reached from either scans or
 * index modification code.
 *
 * For best performance, the tests that are fastest and/or most likely to
 * exclude a page from old-snapshot testing are kept near the front.
 */
static inline void
TestForOldSnapshot(Snapshot snapshot, Relation relation, Page page)
{
    Assert(relation != NULL);

    /* A negative threshold skips the check entirely. */
    if (old_snapshot_threshold < 0)
        return;

    /* No snapshot supplied: fast return without error. */
    if (snapshot == NULL)
        return;

    /* Only MVCC and TOAST snapshots are subject to the test. */
    if (snapshot->satisfies != HeapTupleSatisfiesMVCC &&
        snapshot->satisfies != HeapTupleSatisfiesToast)
        return;

    /* Without a valid snapshot LSN there is nothing to compare against. */
    if (XLogRecPtrIsInvalid(snapshot->lsn))
        return;

    /* Run the slower check only if the page LSN is past the snapshot's. */
    if (PageGetLSN(page) > snapshot->lsn)
        TestForOldSnapshot_impl(snapshot, relation);
}

//--------------------------------------------------- TestForOldSnapshot_impl

/*
 * Implement the slower/larger portions of TestForOldSnapshot.
 *
 * The smaller/faster portions are placed inline in TestForOldSnapshot, but
 * the entire set of logic is too big for that.
 *
 * Raises ERRCODE_SNAPSHOT_TOO_OLD when the relation allows early pruning
 * and the snapshot was taken before the current old-snapshot threshold
 * timestamp.
 */
void
TestForOldSnapshot_impl(Snapshot snapshot, Relation relation)
{
    /* Relations that do not allow early pruning can never hit the error. */
    if (!RelationAllowsEarlyPruning(relation))
        return;

    if (snapshot->whenTaken < GetOldSnapshotThresholdTimestamp())
        ereport(ERROR,
                (errcode(ERRCODE_SNAPSHOT_TOO_OLD),
                 errmsg("snapshot too old")));
}


//--------------------------------------------------- HeapKeyTest

/*
 *      HeapKeyTest
 *
 *      Test a heap tuple to see if it satisfies a scan key.
 *
 *      Sets (result) to true when every one of the nkeys scan keys is
 *      satisfied by the tuple, and to false as soon as one is not: a key
 *      flagged SK_ISNULL, a NULL attribute value, or a comparison function
 *      that returns false all fail the test.
 */
#define HeapKeyTest(tuple, tupdesc, nkeys, keys, result) \
do \
{ \
    /* Suffixed private names keep clear of the caller's own variables */ \
    int         hkt_remaining_ = (nkeys); \
    ScanKey     hkt_key_ = (keys); \
 \
    (result) = true; /* until a key fails */ \
    while (hkt_remaining_-- != 0) \
    { \
        Datum   hkt_attr_; \
        bool    hkt_attr_null_; \
        Datum   hkt_cmp_; \
 \
        /* a key marked SK_ISNULL fails the test */ \
        if (hkt_key_->sk_flags & SK_ISNULL) \
        { \
            (result) = false; \
            break; \
        } \
 \
        hkt_attr_ = heap_getattr((tuple), \
                                 hkt_key_->sk_attno, \
                                 (tupdesc), \
                                 &hkt_attr_null_); \
 \
        /* a NULL attribute value fails the test */ \
        if (hkt_attr_null_) \
        { \
            (result) = false; \
            break; \
        } \
 \
        hkt_cmp_ = FunctionCall2Coll(&hkt_key_->sk_func, \
                                     hkt_key_->sk_collation, \
                                     hkt_attr_, hkt_key_->sk_argument); \
 \
        if (!DatumGetBool(hkt_cmp_)) \
        { \
            (result) = false; \
            break; \
        } \
 \
        hkt_key_++; \
    } \
} while (0)

三、跟蹤分析

測試指令碼如下

testdb=# explain select dw.*,grjf.grbh,grjf.xm,grjf.ny,grjf.je 
testdb-# from t_dwxx dw,lateral (select gr.grbh,gr.xm,jf.ny,jf.je 
testdb(#                         from t_grxx gr inner join t_jfxx jf 
testdb(#                                        on gr.dwbh = dw.dwbh 
testdb(#                                           and gr.grbh = jf.grbh) grjf
testdb-# order by dw.dwbh;
                                        QUERY PLAN                                        
------------------------------------------------------------------------------------------
 Sort  (cost=20070.93..20320.93 rows=100000 width=47)
   Sort Key: dw.dwbh
   ->  Hash Join  (cost=3754.00..8689.61 rows=100000 width=47)
         Hash Cond: ((gr.dwbh)::text = (dw.dwbh)::text)
         ->  Hash Join  (cost=3465.00..8138.00 rows=100000 width=31)
               Hash Cond: ((jf.grbh)::text = (gr.grbh)::text)
               ->  Seq Scan on t_jfxx jf  (cost=0.00..1637.00 rows=100000 width=20)
               ->  Hash  (cost=1726.00..1726.00 rows=100000 width=16)
                     ->  Seq Scan on t_grxx gr  (cost=0.00..1726.00 rows=100000 width=16)
         ->  Hash  (cost=164.00..164.00 rows=10000 width=20)
               ->  Seq Scan on t_dwxx dw  (cost=0.00..164.00 rows=10000 width=20)
(11 rows)

啟動gdb,設定斷點,進入heap_getnext

(gdb) b heap_getnext
Breakpoint 1 at 0x4de01f: file heapam.c, line 1841.
(gdb) c
Continuing.

Breakpoint 1, heap_getnext (scan=0x2aadc18, direction=ForwardScanDirection) at heapam.c:1841
1841        if (scan->rs_pageatatime)

檢視輸入引數,注意rs_pageatatime = true,使用page-at-a-time模式查詢

(gdb) p *scan
$1 = {rs_rd = 0x7efdb8f2dfd8, rs_snapshot = 0x2a2a6d0, rs_nkeys = 0, rs_key = 0x0, rs_bitmapscan = false, 
  rs_samplescan = false, rs_pageatatime = true, rs_allow_strat = true, rs_allow_sync = true, rs_temp_snap = false, 
  rs_nblocks = 726, rs_startblock = 0, rs_numblocks = 4294967295, rs_strategy = 0x0, rs_syncscan = false, 
  rs_inited = false, rs_ctup = {t_len = 2139062143, t_self = {ip_blkid = {bi_hi = 65535, bi_lo = 65535}, ip_posid = 0}, 
    t_tableOid = 16742, t_data = 0x0}, rs_cblock = 4294967295, rs_cbuf = 0, rs_parallel = 0x0, rs_cindex = 2139062143, 
  rs_ntuples = 2139062143, rs_vistuples = {32639 <repeats 291 times>}}

進入heapgettup_pagemode函式

(gdb) n
1842            heapgettup_pagemode(scan, direction,
(gdb) step
heapgettup_pagemode (scan=0x2aadc18, dir=ForwardScanDirection, nkeys=0, key=0x0) at heapam.c:794
794     HeapTuple   tuple = &(scan->rs_ctup);
(gdb) 

heapgettup_pagemode->變數賦值,注意tuple指向scan->rs_ctup,但其內容尚未初始化(如t_len = 2139062143、t_data = 0x0);此時scan->rs_inited = false

794     HeapTuple   tuple = &(scan->rs_ctup);
(gdb) n
795     bool        backward = ScanDirectionIsBackward(dir);
(gdb) p *tuple
$2 = {t_len = 2139062143, t_self = {ip_blkid = {bi_hi = 65535, bi_lo = 65535}, ip_posid = 0}, t_tableOid = 16742, 
  t_data = 0x0}
(gdb) n
808     if (ScanDirectionIsForward(dir))
(gdb) p scan->rs_inited
$3 = false

heapgettup_pagemode->非並行掃描,page = scan->rs_startblock(即page = 0)

(gdb) n
815             if (scan->rs_nblocks == 0 || scan->rs_numblocks == 0)
(gdb) n
821             if (scan->rs_parallel != NULL)
(gdb) 
836                 page = scan->rs_startblock; /* first page */
(gdb) 

進入heapgetpage

(gdb) n
837             heapgetpage(scan, page);
(gdb) step
heapgetpage (scan=0x2aadc18, page=0) at heapam.c:362
362     Assert(page < scan->rs_nblocks);

heapgetpage->檢查驗證&讀取page

362     Assert(page < scan->rs_nblocks);
(gdb) n
365     if (BufferIsValid(scan->rs_cbuf))
(gdb) p scan->rs_cbuf
$4 = 0
(gdb) n
376     CHECK_FOR_INTERRUPTS();
(gdb) 
379     scan->rs_cbuf = ReadBufferExtended(scan->rs_rd, MAIN_FORKNUM, page,
(gdb) 
381     scan->rs_cblock = page;
(gdb) 
383     if (!scan->rs_pageatatime)

heapgetpage->rs_cbuf為346/rs_cblock為0

(gdb) p scan->rs_cbuf
$5 = 346
(gdb) p scan->rs_cblock
$6 = 0

heapgetpage->page-at-a-time模式讀取,變數賦值,鎖定緩衝區

(gdb) n
386     buffer = scan->rs_cbuf;
(gdb) n
386     buffer = scan->rs_cbuf;
(gdb) 
387     snapshot = scan->rs_snapshot;
(gdb) 
392     heap_page_prune_opt(scan->rs_rd, buffer);
(gdb) 
399     LockBuffer(buffer, BUFFER_LOCK_SHARE);
(gdb) p buffer
$7 = 346

heapgetpage->獲取page,檢查快照是否過舊,獲取行數

(gdb) n
401     dp = BufferGetPage(buffer);
(gdb) 
402     TestForOldSnapshot(snapshot, scan->rs_rd, dp);
(gdb) p dp
$8 = (Page) 0x7efda4b7ac00 "\001"
(gdb) n
403     lines = PageGetMaxOffsetNumber(dp);
(gdb) 
404     ntup = 0;
(gdb) p lines
$11 = 158

heapgetpage->驗證可見性

(gdb) n
426     all_visible = PageIsAllVisible(dp) && !snapshot->takenDuringRecovery;
(gdb) 
428     for (lineoff = FirstOffsetNumber, lpp = PageGetItemId(dp, lineoff);
(gdb) p all_visible
$12 = false
(gdb) n
429          lineoff <= lines;
(gdb) 
428     for (lineoff = FirstOffsetNumber, lpp = PageGetItemId(dp, lineoff);
(gdb) p lineoff
$13 = 1
(gdb) n
432         if (ItemIdIsNormal(lpp))
(gdb) 
437             loctup.t_tableOid = RelationGetRelid(scan->rs_rd);
(gdb) 
438             loctup.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
(gdb) 
439             loctup.t_len = ItemIdGetLength(lpp);
(gdb) 
440             ItemPointerSet(&(loctup.t_self), page, lineoff);
(gdb) 
442             if (all_visible)
(gdb) 
445                 valid = HeapTupleSatisfiesVisibility(&loctup, snapshot, buffer);
(gdb) 
447             CheckForSerializableConflictOut(valid, scan->rs_rd, &loctup,
(gdb) 
450             if (valid)
(gdb) 
451                 scan->rs_vistuples[ntup++] = lineoff;
(gdb) 
430          lineoff++, lpp++)
(gdb) 
...

heapgettup_pagemode->退出heapgetpage,回到heapgettup_pagemode,初始化lineindex為0,設定rs_inited為true

(gdb) finish
Run till exit from #0  heapgetpage (scan=0x2aadc18, page=0) at heapam.c:430
heapgettup_pagemode (scan=0x2aadc18, dir=ForwardScanDirection, nkeys=0, key=0x0) at heapam.c:838
838             lineindex = 0;
(gdb) n
839             scan->rs_inited = true;
(gdb) 
848         dp = BufferGetPage(scan->rs_cbuf);

heapgettup_pagemode->獲取page,驗證快照是否過舊

(gdb) n
849         TestForOldSnapshot(scan->rs_snapshot, scan->rs_rd, dp);
(gdb) p dp
$18 = (Page) 0x7efda4b7ac00 "\001"

heapgettup_pagemode->計算Item數,開始迴圈

(gdb) n
850         lines = scan->rs_ntuples;
(gdb) 
853         linesleft = lines - lineindex;
(gdb) 
948         while (linesleft > 0)
(gdb) p lines
$19 = 158
(gdb) p linesleft
$20 = 158
(gdb) 

heapgettup_pagemode->獲取Item偏移(lineoff)和ItemId

(gdb) p lineoff
$21 = 1
(gdb) p lpp
$22 = (ItemId) 0x7efda4b7ac18
(gdb) p *lpp
$23 = {lp_off = 8152, lp_flags = 1, lp_len = 40}

heapgettup_pagemode->給tuple中的變數賦值,ItemPointer是ItemPointerData結構體指標

(gdb) n
954             tuple->t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
(gdb) 
955             tuple->t_len = ItemIdGetLength(lpp);
(gdb) 
956             ItemPointerSet(&(tuple->t_self), page, lineoff);
(gdb) 
961             if (key != NULL)
(gdb) p *tuple->t_data
$26 = {t_choice = {t_heap = {t_xmin = 862, t_xmax = 0, t_field3 = {t_cid = 0, t_xvac = 0}}, t_datum = {datum_len_ = 862, 
      datum_typmod = 0, datum_typeid = 0}}, t_ctid = {ip_blkid = {bi_hi = 0, bi_lo = 0}, ip_posid = 1}, t_infomask2 = 5, 
  t_infomask = 2306, t_hoff = 24 '\030', t_bits = 0x7efda4b7cbef ""}
(gdb) p tuple->t_len
$28 = 40
(gdb) p tuple->t_self
$29 = {ip_blkid = {bi_hi = 0, bi_lo = 0}, ip_posid = 1}

heapgettup_pagemode->設定scan->rs_cindex,返回

(gdb) n
975                 scan->rs_cindex = lineindex;
(gdb) n
976                 return;
(gdb) p scan->rs_cindex 
$30 = 0

回到heap_getnext

(gdb) 
heap_getnext (scan=0x2aadc18, direction=ForwardScanDirection) at heapam.c:1847
1847        if (scan->rs_ctup.t_data == NULL)

返回獲得的tuple

1847        if (scan->rs_ctup.t_data == NULL)
(gdb) n
1859        pgstat_count_heap_getnext(scan->rs_rd);
(gdb) 
1861        return &(scan->rs_ctup);
(gdb) p scan->rs_ctup
$31 = {t_len = 40, t_self = {ip_blkid = {bi_hi = 0, bi_lo = 0}, ip_posid = 1}, t_tableOid = 16742, t_data = 0x7efda4b7cbd8}

結束第一次呼叫,再次進入該函式

(gdb) c
Continuing.

Breakpoint 1, heap_getnext (scan=0x2aadc18, direction=ForwardScanDirection) at heapam.c:1841
1841        if (scan->rs_pageatatime)
(gdb) n
1842            heapgettup_pagemode(scan, direction,
(gdb) step
heapgettup_pagemode (scan=0x2aadc18, dir=ForwardScanDirection, nkeys=0, key=0x0) at heapam.c:794
794     HeapTuple   tuple = &(scan->rs_ctup);
(gdb) n
795     bool        backward = ScanDirectionIsBackward(dir);
(gdb) 
808     if (ScanDirectionIsForward(dir))
(gdb) 
810         if (!scan->rs_inited)
(gdb) 
844             page = scan->rs_cblock; /* current page */

檢視輸入引數scan,與上一次有所不同,儲存了上一次呼叫返回的一些資訊,如rs_vistuples等

(gdb) p *scan
$32 = {rs_rd = 0x7efdb8f2dfd8, rs_snapshot = 0x2a2a6d0, rs_nkeys = 0, rs_key = 0x0, rs_bitmapscan = false, 
  rs_samplescan = false, rs_pageatatime = true, rs_allow_strat = true, rs_allow_sync = true, rs_temp_snap = false, 
  rs_nblocks = 726, rs_startblock = 0, rs_numblocks = 4294967295, rs_strategy = 0x0, rs_syncscan = false, rs_inited = true, 
  rs_ctup = {t_len = 40, t_self = {ip_blkid = {bi_hi = 0, bi_lo = 0}, ip_posid = 1}, t_tableOid = 16742, 
    t_data = 0x7efda4b7cbd8}, rs_cblock = 0, rs_cbuf = 346, rs_parallel = 0x0, rs_cindex = 0, rs_ntuples = 158, 
  rs_vistuples = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 
    29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 
    59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 
    89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 
    115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 
    139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 
    32639 <repeats 133 times>}}

DONE!

四、參考資料

PostgreSQL Page頁結構解析(1)-基礎
PostgreSQL Page頁結構解析(2)- 頁頭和行資料指標
PostgreSQL Page頁結構解析(3)- 行資料

來自 “ ITPUB部落格 ” ,連結:http://blog.itpub.net/6906/viewspace-2374801/,如需轉載,請註明出處,否則將追究法律責任。

相關文章