瀚高資料庫

環境

文件用途

詳細資訊

環境

系統平臺： Linux x86-64 Red Hat Enterprise Linux 7

版本： 14

文件用途

瞭解儲存管理器

詳細資訊

0. 相關資料型別

開啟的每一個段用如下結構表示，pg 中有 MdfdVec 陣列並且記錄了這個陣列的長度。

typedef struct _MdfdVec{File        mdfd_vfd;                      /* fd number in fd.c's pool */ 虛擬檔案描述符表的下標
BlockNumber mdfd_segno;        /* segment number, from 0 */ 段號，從0開始計數
} MdfdVec;

1. smgrinit

smgr 是 storage manager 的縮寫，即儲存管理器，它位於磁碟管理器（md，magnetic disk）之上，對下層操作進行一定程度的封裝。後端程式啟動時會做初始化操作。

由於歷史原因，只保留對磁碟（檔案系統支援的儲存體，用磁碟代表）的操作，NSmgr 為 1。

初始化只是為磁碟管理器建立了一個記憶體上下文 MdCxt（即申請了一塊記憶體）。

/*
 * smgrinit() -- Initialize all storage managers listed in smgrsw[].
 *
 * Invokes the smgr_init callback of each of the NSmgr entries (when the
 * callback is set) and then registers smgrshutdown to run at process exit.
 */
void
smgrinit(void)
{
    int         i;

    for (i = 0; i < NSmgr; i++)
    {
        if (smgrsw[i].smgr_init)
            smgrsw[i].smgr_init();
    }

    /* register the shutdown proc */
    on_proc_exit(smgrshutdown, 0);
}

/*
 * mdinit() -- Initialize private state for the magnetic disk manager.
 *
 * Creates the "MdSmgr" memory context as a child of TopMemoryContext and
 * stores it in MdCxt.
 */
void
mdinit(void)
{
    MdCxt = AllocSetContextCreate(TopMemoryContext,
                                  "MdSmgr",
                                  ALLOCSET_DEFAULT_SIZES);
}

2. smgropen

不是真正開啟磁碟檔案，只是從 hash 表中查詢一個項，該項快取了當前關係已開啟的檔案描述符，以及各 fork 已開啟的段的數目。

hash 表或者相應的 entry 沒有就建立。

第一次初始化該項把表的各種型別開啟的段的數目全都初始化為 0。

關於該 hash 操作可看另一篇動態 hash 的 support。

/*
 * smgropen() -- Look up the SMgrRelation for (rlocator, backend), making a
 *               new hash entry when none exists yet.
 *
 * No file is opened here; only the bookkeeping entry is returned.
 */
SMgrRelation
smgropen(RelFileLocator rlocator, BackendId backend)
{
    RelFileLocatorBackend key;
    SMgrRelation entry;
    bool        was_present;
    int         fork;

    /* Build the hash table lazily, the first time we get here. */
    if (SMgrRelationHash == NULL)
    {
        HASHCTL     hashctl;

        hashctl.keysize = sizeof(RelFileLocatorBackend);
        hashctl.entrysize = sizeof(SMgrRelationData);
        SMgrRelationHash = hash_create("smgr relation table", 400,
                                       &hashctl, HASH_ELEM | HASH_BLOBS);
        dlist_init(&unowned_relns);
    }

    /* Find the entry for this key, inserting it if necessary. */
    key.locator = rlocator;
    key.backend = backend;
    entry = (SMgrRelation) hash_search(SMgrRelationHash,
                                       &key,
                                       HASH_ENTER,
                                       &was_present);

    /* An existing entry is already fully initialized. */
    if (was_present)
        return entry;

    /* hash_search already filled in the lookup key; set up the rest. */
    entry->smgr_owner = NULL;
    entry->smgr_targblock = InvalidBlockNumber;
    for (fork = 0; fork <= MAX_FORKNUM; fork++)
        entry->smgr_cached_nblocks[fork] = InvalidBlockNumber;
    entry->smgr_which = 0;      /* we only have md.c at present */

    /* let the chosen storage manager do its own initialization */
    smgrsw[entry->smgr_which].smgr_open(entry);

    /* a fresh entry has no owner, so track it in the unowned list */
    dlist_push_tail(&unowned_relns, &entry->node);

    return entry;
}

/*
 * mdopen() -- Mark every fork of the relation as having no open segments.
 */
void
mdopen(SMgrRelation reln)
{
    int         fork = 0;

    while (fork <= MAX_FORKNUM)
    {
        reln->md_num_open_segs[fork] = 0;
        fork++;
    }
}

3. smgrread

smgrread 從非易失性儲存上根據儲存體型別，檔案型別，塊號讀取指定的表中指定的塊，根據本塊內容初始化 shared buffer 中快取頁。

底層會呼叫 md.c (magnetic disk) 相關函式，檔名有誤導性，因為凡是檔案系統支援的儲存介質操作都可以由 md.c 中函式完成該操作，不單單是磁碟。

/*
 * smgrread() -- Fetch one block of a relation fork into the caller's buffer.
 *
 * The work is delegated to the smgr_read callback of the storage manager
 * selected by reln->smgr_which.  The buffer manager calls this to
 * instantiate pages in the shared buffer cache; every storage manager
 * returns pages in the format that POSTGRES expects.
 */
void
smgrread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
         char *buffer)
{
    (*smgrsw[reln->smgr_which].smgr_read) (reln, forknum, blocknum, buffer);
}

3.1 mdread

從一個關係中讀取指定塊號的塊，塊大小預設是 8KB。比較關鍵的函式是 _mdfd_getseg()，它根據指定的引數獲得該塊所在的段，一段預設是 1GB。

/*
 * mdread() -- Read the specified block from a relation.
 *
 * reln/forknum identify the relation fork, blocknum the block to read, and
 * buffer receives BLCKSZ bytes.  Raises an ERROR on a read failure, and on a
 * short read unless zero_damaged_pages is set or we are InRecovery, in which
 * case the buffer is zero-filled instead.
 */
void
mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
       char *buffer)
{
    off_t       seekpos;
    int         nbytes;
    MdfdVec    *v;

    /*
     * Get the MdfdVec (virtual file descriptor plus segment number) of the
     * segment that holds blocknum.
     */
    v = _mdfd_getseg(reln, forknum, blocknum, false,
                     EXTENSION_FAIL | EXTENSION_CREATE_RECOVERY);

    /* Byte offset of the block inside its segment file. */
    seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));

    /* The offset must stay within a single segment. */
    Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);

    /* Read BLCKSZ bytes at that offset into buffer. */
    nbytes = FileRead(v->mdfd_vfd, buffer, BLCKSZ, seekpos,
                      WAIT_EVENT_DATA_FILE_READ);

    /*
     * nbytes tells us whether the read failed outright or came up short
     * (damaged/missing block), which is then judged against recovery mode.
     */
    if (nbytes != BLCKSZ)
    {
        if (nbytes < 0)
            ereport(ERROR,
                    (errcode_for_file_access(),
                     errmsg("could not read block %u in file \"%s\": %m",
                            blocknum, FilePathName(v->mdfd_vfd))));

        /*
         * Short read: we are at or past EOF, or we read a partial block at
         * EOF.  Normally this is an error; upper levels should never try to
         * read a nonexistent block.  However, if zero_damaged_pages is ON or
         * we are InRecovery, we should instead return zeroes without
         * complaining.  This allows, for example, the case of trying to
         * update a block that was later truncated away.
         */
        if (zero_damaged_pages || InRecovery)
            MemSet(buffer, 0, BLCKSZ);
        else
            ereport(ERROR,
                    (errcode(ERRCODE_DATA_CORRUPTED),
                     errmsg("could not read block %u in file \"%s\": read only %d of %d bytes",
                            blocknum, FilePathName(v->mdfd_vfd),
                            nbytes, BLCKSZ)));
    }
}

3.2 _mdfd_getseg()

從 relation 中找一個 segment，這個 segment 包含了 blocknum 指定的 block。

返回值：要根據 behavior 判斷，可能為 NULL，可能是指向 MdfdVec 的指標，中間可能會建立一個新的段。

/*
 *    _mdfd_getseg() -- Find the segment of the relation holding the specified block.
 *
 *  If the segment doesn't exist, we ereport, return NULL, or create the
 *  segment, according to "behavior".
 *  Note: skipFsync is only used in the EXTENSION_CREATE case.
 *
 *  Returns a pointer to the MdfdVec of the target segment, or NULL when
 *  "behavior" permits that outcome.
 */
static MdfdVec *
_mdfd_getseg(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
             bool skipFsync, int behavior)
{
    MdfdVec    *v;
    BlockNumber targetseg;
    BlockNumber nextsegno;

    /*
     * Some way to handle non-existent segments needs to be specified.  The
     * segment computed from blkno may not exist, which is not always an
     * error (e.g. during recovery).  The options are: report an error,
     * create the segment, or return NULL.  EXTENSION_DONT_OPEN means that a
     * segment that has not been opened yet is not opened here.  Callers pick
     * the behavior that suits their situation.
     */
    Assert(behavior &
           (EXTENSION_FAIL | EXTENSION_CREATE | EXTENSION_RETURN_NULL |
            EXTENSION_DONT_OPEN));

    /*
     * targetseg is the zero-based number of the segment file that contains
     * blkno.  (The original notes quote the defaults as 1GB segments of 8KB
     * blocks, i.e. RELSEG_SIZE = 2^30 / 2^13 = 131072.)
     */
    targetseg = blkno / ((BlockNumber) RELSEG_SIZE);

    /*
     * md_num_open_segs[forknum] is the number of segments currently open
     * for this fork (main data file, fsm, vm, ...).  For instance, a value
     * of 5 means segments 0..4 are open.  If the target is among them we are
     * done.
     */
    if (targetseg < reln->md_num_open_segs[forknum])
    {
        v = &reln->md_seg_fds[forknum][targetseg];
        return v;
    }

    /* The caller only wants the segment if we already had it open. */
    if (behavior & EXTENSION_DONT_OPEN)
        return NULL;

    /*
     * Segments are opened contiguously (0, 1, 2, ... with no gaps).  Start
     * from the last segment already open, or open the first segment of the
     * fork when none is open yet.
     */
    if (reln->md_num_open_segs[forknum] > 0)
        v = &reln->md_seg_fds[forknum][reln->md_num_open_segs[forknum] - 1];
    else
    {
        v = mdopenfork(reln, forknum, behavior);
        if (!v)
            return NULL;
    }

    /* nextsegno is the next segment that has not been opened yet. */
    for (nextsegno = reln->md_num_open_segs[forknum];
         nextsegno <= targetseg; nextsegno++)
    {
        /* number of blocks in the segment file that v currently refers to */
        BlockNumber nblocks = _mdnblocks(reln, forknum, v);
        int         flags = 0;

        /* segments must be opened in consecutive order */
        Assert(nextsegno == v->mdfd_segno + 1);

        /* a single segment file may never hold more than RELSEG_SIZE blocks */
        if (nblocks > ((BlockNumber) RELSEG_SIZE))
            elog(FATAL, "segment too big");

        if ((behavior & EXTENSION_CREATE) ||
            (InRecovery && (behavior & EXTENSION_CREATE_RECOVERY)))
        {
            /*
             * We may create the next segment.  If the previous one is not
             * full yet, first write a zero-filled block at its last block
             * position, then let the open below create the next file.
             */
            if (nblocks < ((BlockNumber) RELSEG_SIZE))
            {
                char       *zerobuf = palloc0(BLCKSZ);

                mdextend(reln, forknum,
                         nextsegno * ((BlockNumber) RELSEG_SIZE) - 1,
                         zerobuf, skipFsync);
                pfree(zerobuf);
            }
            flags = O_CREAT;
        }
        else if (!(behavior & EXTENSION_DONT_CHECK_SIZE) &&
                 nblocks < ((BlockNumber) RELSEG_SIZE))
        {
            /*
             * The previous segment is not full, so the next one should not
             * exist.  Per the original notes, EXTENSION_DONT_CHECK_SIZE is
             * set by the checkpointer to bypass this check, since mdnblocks
             * assumes every segment but the last one is full.
             */
            if (behavior & EXTENSION_RETURN_NULL)
            {
                errno = ENOENT;
                return NULL;
            }

            ereport(ERROR,
                    (errcode_for_file_access(),
                     errmsg("could not open file \"%s\" (target block %u): previous segment is only %u blocks",
                            _mdfd_segpath(reln, forknum, nextsegno),
                            blkno, nblocks)));
        }

        /* open the next segment */
        v = _mdfd_openseg(reln, forknum, nextsegno, flags);

        if (v == NULL)
        {
            if ((behavior & EXTENSION_RETURN_NULL) &&
                FILE_POSSIBLY_DELETED(errno))
                return NULL;
            ereport(ERROR,
                    (errcode_for_file_access(),
                     errmsg("could not open file \"%s\" (target block %u): %m",
                            _mdfd_segpath(reln, forknum, nextsegno),
                            blkno)));
        }
    }

    return v;
}

3.3 _mdfd_openseg

開啟一個段，更新 md_num_open_segs 和 md_seg_fds 兩個成員變數，並返回 MdfdVec 型別的指標。

/*
 * Open the specified segment of the relation,
 * and make a MdfdVec object for it.  Returns NULL on failure.
 *
 * oflags is OR-ed into the open flags (O_RDWR | PG_BINARY).  On success the
 * fork's md_seg_fds array is grown by one entry and md_num_open_segs is
 * updated through _fdvec_resize().
 */
static MdfdVec *
_mdfd_openseg(SMgrRelation reln, ForkNumber forknum, BlockNumber segno,
              int oflags)
{
    MdfdVec    *v;
    File        fd;
    char       *fullpath;

    /*
     * Build the segment's path, e.g. "base/dbOid/relNumber" or
     * "pg_tblspc/PG_16_202208251/dbOid/relNumber.3".
     */
    fullpath = _mdfd_segpath(reln, forknum, segno);

    /* open the file, obtaining a virtual file descriptor */
    fd = PathNameOpenFile(fullpath, O_RDWR | PG_BINARY | oflags);

    /* the path string is no longer needed */
    pfree(fullpath);

    if (fd < 0)
        return NULL;

    /*
     * Segments are always opened in order from lowest to highest, so we must
     * be adding a new one at the end.
     */
    Assert(segno == reln->md_num_open_segs[forknum]);

    _fdvec_resize(reln, forknum, segno + 1);

    /* fill the entry */
    v = &reln->md_seg_fds[forknum][segno];
    v->mdfd_vfd = fd;
    v->mdfd_segno = segno;

    Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));

    /* all done */
    return v;
}

3.4 _mdfd_segpath

比較簡單，記一下即可：段號 segno 大於 0 時，在關係檔案路徑後追加「.segno」；否則直接使用原路徑。

/*
 * Build the file name of one segment of a relation fork.
 *
 * Segment 0 uses the plain relation path; any later segment gets ".<segno>"
 * appended.  The returned string is palloc'd and owned by the caller.
 */
static char *
_mdfd_segpath(SMgrRelation reln, ForkNumber forknum, BlockNumber segno)
{
    char       *base = relpath(reln->smgr_rlocator, forknum);
    char       *result;

    /* the first segment has no numeric suffix */
    if (segno <= 0)
        return base;

    result = psprintf("%s.%u", base, segno);
    pfree(base);

    return result;
}

3.5 _fdvec_resize

對 SMgrRelationData 中的兩個成員變數做調整。

/*
 *    _fdvec_resize() -- Set the size of a fork's open-segments array to nseg
 *
 * nseg == 0 releases the array (if one is in use); otherwise the array is
 * freshly allocated in MdCxt when the fork had no open segments, or resized
 * with repalloc.  md_num_open_segs[forknum] is set to nseg in every case.
 */
static void
_fdvec_resize(SMgrRelation reln, ForkNumber forknum, int nseg)
{
    if (nseg != 0)
    {
        if (reln->md_num_open_segs[forknum] == 0)
        {
            /* nothing allocated yet: take a fresh array from MdCxt */
            reln->md_seg_fds[forknum] =
                MemoryContextAlloc(MdCxt, sizeof(MdfdVec) * nseg);
        }
        else
        {
            /* an array exists already: resize it in place */
            reln->md_seg_fds[forknum] =
                repalloc(reln->md_seg_fds[forknum], sizeof(MdfdVec) * nseg);
        }
    }
    else if (reln->md_num_open_segs[forknum] > 0)
    {
        /* shrinking to zero: give the array back */
        pfree(reln->md_seg_fds[forknum]);
        reln->md_seg_fds[forknum] = NULL;
    }

    reln->md_num_open_segs[forknum] = nseg;
}

3.6 _mdnblocks

獲取單個檔案佔用的 block 數，用的整除沒有把 partial block 計算在內。

/*
 * Get number of blocks present in a single disk file
 *
 * seg identifies the segment file through its virtual file descriptor.
 * Raises an ERROR if the file size cannot be determined.
 */
static BlockNumber
_mdnblocks(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
{
    off_t       len;

    /* size of the file in bytes, from its start up to EOF */
    len = FileSize(seg->mdfd_vfd);
    if (len < 0)
        ereport(ERROR,
                (errcode_for_file_access(),
                 errmsg("could not seek to end of file \"%s\": %m",
                        FilePathName(seg->mdfd_vfd))));

    /* note that this calculation will ignore any partial block at EOF */
    return (BlockNumber) (len / BLCKSZ);
}

4. mdextend

在指定的關係中增加 1 個 block。

/*
 *    mdextend() -- Add a block to the specified relation.
 *
 *        The semantics are nearly the same as mdwrite(): write at the
 *        specified position.  However, this is to be used for the case of
 *        extending a relation (i.e., blocknum is at or beyond the current
 *        EOF).  Note that we assume writing a block beyond current EOF
 *        causes intervening file space to become filled with zeroes.
 *
 *        Raises an ERROR when blocknum is InvalidBlockNumber, when the write
 *        fails, or when fewer than BLCKSZ bytes were written.
 */
void
mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
         char *buffer, bool skipFsync)
{
    off_t       seekpos;
    int         nbytes;
    MdfdVec    *v;

    /*
     * This assert is too expensive to have on normally ...  (Per the
     * original notes: counting all blocks of a relation means opening every
     * segment and seeking to the end of each file.)
     */
#ifdef CHECK_WRITE_VS_EXTEND
    Assert(blocknum >= mdnblocks(reln, forknum));
#endif

    /*
     * If a relation manages to grow to 2^32-1 blocks, refuse to extend it any
     * more --- we mustn't create a block whose number actually is
     * InvalidBlockNumber.  (Note that this failure should be unreachable
     * because of upstream checks in bufmgr.c.)
     */
    if (blocknum == InvalidBlockNumber)
        ereport(ERROR,
                (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
                 errmsg("cannot extend file \"%s\" beyond %u blocks",
                        relpath(reln->smgr_rlocator, forknum),
                        InvalidBlockNumber)));

    /* get (creating it if needed) the segment that will hold blocknum */
    v = _mdfd_getseg(reln, forknum, blocknum, skipFsync, EXTENSION_CREATE);

    /* byte offset of the block inside its segment file */
    seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));

    Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);

    /*
     * Write the block.  The write can fail (negative result) or come up
     * short; the original notes name signal interruption, lack of disk space
     * and resource limits as possible causes.
     */
    if ((nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ, seekpos, WAIT_EVENT_DATA_FILE_EXTEND)) != BLCKSZ)
    {
        if (nbytes < 0)
            ereport(ERROR,
                    (errcode_for_file_access(),
                     errmsg("could not extend file \"%s\": %m",
                            FilePathName(v->mdfd_vfd)),
                     errhint("Check free disk space.")));

        /* short write: complain appropriately */
        ereport(ERROR,
                (errcode(ERRCODE_DISK_FULL),
                 errmsg("could not extend file \"%s\": wrote only %d of %d bytes at block %u",
                        FilePathName(v->mdfd_vfd),
                        nbytes, BLCKSZ, blocknum),
                 errhint("Check free disk space.")));
    }

    if (!skipFsync && !SmgrIsTemp(reln))
        register_dirty_segment(reln, forknum, v);

    Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));
}

5. mdwrite

/*
 *    mdwrite() -- Write the supplied block at the appropriate location.
 *
 *        This is to be used only for updating already-existing blocks of a
 *        relation (ie, those before the current EOF).  To extend a relation,
 *        use mdextend().
 *
 *        Raises an ERROR when the write fails or when fewer than BLCKSZ
 *        bytes were written.
 */
void
mdwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
        char *buffer, bool skipFsync)
{
    off_t       seekpos;
    int         nbytes;
    MdfdVec    *v;

    /* This assert is too expensive to have on normally ... */
#ifdef CHECK_WRITE_VS_EXTEND
    Assert(blocknum < mdnblocks(reln, forknum));
#endif

    /* locate the segment that holds blocknum */
    v = _mdfd_getseg(reln, forknum, blocknum, skipFsync,
                     EXTENSION_FAIL | EXTENSION_CREATE_RECOVERY);

    /* byte offset of the block inside its segment file */
    seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));

    Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);

    nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ, seekpos,
                       WAIT_EVENT_DATA_FILE_WRITE);

    if (nbytes != BLCKSZ)
    {
        if (nbytes < 0)
            ereport(ERROR,
                    (errcode_for_file_access(),
                     errmsg("could not write block %u in file \"%s\": %m",
                            blocknum, FilePathName(v->mdfd_vfd))));

        /* short write: complain appropriately */
        ereport(ERROR,
                (errcode(ERRCODE_DISK_FULL),
                 errmsg("could not write block %u in file \"%s\": wrote only %d of %d bytes",
                        blocknum,
                        FilePathName(v->mdfd_vfd),
                        nbytes, BLCKSZ),
                 errhint("Check free disk space.")));
    }

    if (!skipFsync && !SmgrIsTemp(reln))
        register_dirty_segment(reln, forknum, v);
}

pg從磁碟讀取檔案