PostgreSQL Source Code Reading (91) - Query Statements #76 (The ExecHashJoin Function, Part 2)
This is the second part of the walkthrough of ExecHashJoin. It covers the implementation logic of the supporting functions that ExecHashJoin relies on, including ExecHashTableCreate and ExecChooseHashTableSize.
I. Data Structures
Plan
All plan nodes "derive" from the Plan structure by having it as their first field. This guarantees that everything keeps working when a node is cast to a Plan (node pointers are frequently cast to Plan * when passed around generically in the executor).
/* ----------------
 * Plan node
 *
 * All plan nodes "derive" from the Plan structure by having the
 * Plan structure as the first field. This ensures that everything works
 * when nodes are cast to Plan's. (node pointers are frequently cast to Plan*
 * when passed around generically in the executor)
 *
 * We never actually instantiate any Plan nodes; this is just the common
 * abstract superclass for all Plan-type nodes.
 * ----------------
 */
typedef struct Plan
{
	NodeTag		type;			/* node type */

	/*
	 * estimated execution costs for plan (see costsize.c for more info)
	 */
	Cost		startup_cost;	/* cost expended before fetching any tuples */
	Cost		total_cost;		/* total cost (assuming all tuples fetched) */

	/*
	 * planner's estimate of result size of this plan step
	 */
	double		plan_rows;		/* number of rows plan is expected to emit */
	int			plan_width;		/* average row width in bytes */

	/*
	 * information needed for parallel query
	 */
	bool		parallel_aware; /* engage parallel-aware logic? */
	bool		parallel_safe;	/* OK to use as part of parallel plan? */

	/*
	 * Common structural data for all Plan types.
	 */
	int			plan_node_id;	/* unique across entire final plan tree */
	List	   *targetlist;		/* target list to be computed at this node */
	List	   *qual;			/* implicitly-ANDed qual conditions */
	struct Plan *lefttree;		/* input plan tree(s) */
	struct Plan *righttree;
	List	   *initPlan;		/* Init Plan nodes (un-correlated expr
								 * subselects) */

	/*
	 * Information for management of parameter-change-driven rescanning
	 *
	 * extParam includes the paramIDs of all external PARAM_EXEC params
	 * affecting this plan node or its children. setParam params from the
	 * node's initPlans are not included, but their extParams are.
	 *
	 * allParam includes all the extParam paramIDs, plus the IDs of local
	 * params that affect the node (i.e., the setParams of its initplans).
	 * These are _all_ the PARAM_EXEC params that affect this node.
	 */
	Bitmapset  *extParam;
	Bitmapset  *allParam;
} Plan;
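Because every concrete plan node embeds Plan as its first member, a pointer to any of them can be treated as a Plan *. Below is a minimal sketch of the pattern (illustrative code, not from the PostgreSQL source; the helper name is hypothetical):

/*
 * Sketch only: since Plan is the first field of every concrete plan node,
 * a generic pointer can be narrowed to Plan * to read the common fields.
 */
static double
plan_estimated_rows(void *anyPlanNode)
{
	Plan	   *plan = (Plan *) anyPlanNode;	/* safe: Plan is the first field */

	return plan->plan_rows;		/* common field, valid for Hash, SeqScan, etc. */
}

The gdb session in section III shows the inverse direction: state->ps.plan is fetched as a generic Plan * and then narrowed to Hash *.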
JoinState
The common superclass for the state nodes of Hash, NestLoop, and Merge joins.
/* ----------------
 * JoinState information
 *
 * Superclass for state nodes of join plans.
 * ----------------
 */
typedef struct JoinState
{
	PlanState	ps;				/* its first field is the PlanState superclass */
	JoinType	jointype;		/* type of the join */
	bool		single_match;	/* True if we should skip to next outer tuple
								 * after finding one inner match */
	ExprState  *joinqual;		/* JOIN quals (in addition to ps.qual) */
} JoinState;
HashJoinState
The run-time state structure for Hash Join.
/* these structs are defined in executor/hashjoin.h: */
typedef struct HashJoinTupleData *HashJoinTuple;
typedef struct HashJoinTableData *HashJoinTable;

typedef struct HashJoinState
{
	JoinState	js;				/* its first field is NodeTag */
	ExprState  *hashclauses;	/* the hash join conditions */
	List	   *hj_OuterHashKeys;	/* list of ExprState nodes */
	List	   *hj_InnerHashKeys;	/* list of ExprState nodes */
	List	   *hj_HashOperators;	/* list of operator OIDs */
	HashJoinTable hj_HashTable; /* the hash table */
	uint32		hj_CurHashValue;	/* current hash value */
	int			hj_CurBucketNo; /* current regular bucket number */
	int			hj_CurSkewBucketNo; /* current skew bucket number */
	HashJoinTuple hj_CurTuple;	/* current tuple */
	TupleTableSlot *hj_OuterTupleSlot;	/* outer relation slot */
	TupleTableSlot *hj_HashTupleSlot;	/* hash tuple slot */
	TupleTableSlot *hj_NullOuterTupleSlot;	/* dummy outer slot for outer joins */
	TupleTableSlot *hj_NullInnerTupleSlot;	/* dummy inner slot for outer joins */
	TupleTableSlot *hj_FirstOuterTupleSlot;
	int			hj_JoinState;	/* current state of the join state machine */
	bool		hj_MatchedOuter;	/* has the current outer tuple found a match? */
	bool		hj_OuterNotEmpty;	/* is the outer relation non-empty? */
} HashJoinState;
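hj_JoinState records where the executor currently is in the hash join state machine. For reference, its possible values are the HJ_* phase constants defined in nodeHashjoin.c:

/* values taken by hj_JoinState, from nodeHashjoin.c */
#define HJ_BUILD_HASHTABLE		1
#define HJ_NEED_NEW_OUTER		2
#define HJ_SCAN_BUCKET			3
#define HJ_FILL_OUTER_TUPLE		4
#define HJ_FILL_INNER_TUPLES	5
#define HJ_NEED_NEW_BATCH		6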
HashJoinTable
The hash table data structure.
typedef struct HashJoinTableData
{
	int			nbuckets;		/* # buckets in the in-memory hash table */
	int			log2_nbuckets;	/* its log2 (nbuckets must be a power of 2) */
	int			nbuckets_original;	/* # buckets when starting the first hash */
	int			nbuckets_optimal;	/* optimal # buckets (per batch) */
	int			log2_nbuckets_optimal;	/* log2(nbuckets_optimal) */

	/* buckets[i] is head of list of tuples in i'th in-memory bucket */
	union
	{
		/* unshared array is per-batch storage, as are all the tuples */
		struct HashJoinTupleData **unshared;
		/* shared array is per-query DSA area, as are all the tuples */
		dsa_pointer_atomic *shared;
	}			buckets;

	bool		keepNulls;		/* true to store unmatchable NULL tuples */
	bool		skewEnabled;	/* are we using skew optimization? */
	HashSkewBucket **skewBucket;	/* hashtable of skew buckets */
	int			skewBucketLen;	/* size of skewBucket array (a power of 2!) */
	int			nSkewBuckets;	/* number of active skew buckets */
	int		   *skewBucketNums; /* array indexes of active skew buckets */
	int			nbatch;			/* number of batches */
	int			curbatch;		/* current batch #; 0 during 1st pass */
	int			nbatch_original;	/* nbatch when we started inner scan */
	int			nbatch_outstart;	/* nbatch when we started outer scan */
	bool		growEnabled;	/* flag to shut off nbatch increases */
	double		totalTuples;	/* # tuples obtained from inner plan */
	double		partialTuples;	/* # tuples obtained from inner plan by me */
	double		skewTuples;		/* # tuples inserted into skew hashtable */

	/*
	 * These arrays are allocated for the life of the hash join, but only if
	 * nbatch > 1. A file is opened only when we first write a tuple into it
	 * (otherwise its pointer remains NULL). Note that the zero'th array
	 * elements never get used, since we will process rather than dump out any
	 * tuples of batch zero.
	 */
	BufFile   **innerBatchFile; /* buffered virtual temp file per batch */
	BufFile   **outerBatchFile; /* buffered virtual temp file per batch */

	/*
	 * Info about the datatype-specific hash functions for the datatypes being
	 * hashed. These are arrays of the same length as the number of hash join
	 * clauses (hash keys).
	 */
	FmgrInfo   *outer_hashfunctions;	/* lookup data for hash functions */
	FmgrInfo   *inner_hashfunctions;	/* lookup data for hash functions */
	bool	   *hashStrict;		/* is each hash join operator strict? */

	Size		spaceUsed;		/* memory space currently used by tuples */
	Size		spaceAllowed;	/* upper limit for space used */
	Size		spacePeak;		/* peak space used */
	Size		spaceUsedSkew;	/* skew hash table's current space usage */
	Size		spaceAllowedSkew;	/* upper limit for skew hashtable */

	MemoryContext hashCxt;		/* context for whole-hash-join storage */
	MemoryContext batchCxt;		/* context for this-batch-only storage */

	/* used for dense allocation of tuples (into linked chunks) */
	HashMemoryChunk chunks;		/* one list for the whole batch */

	/* Shared and private state for Parallel Hash. */
	HashMemoryChunk current_chunk;	/* this backend's current chunk */
	dsa_area   *area;			/* DSA area to allocate memory from */
	ParallelHashJoinState *parallel_state;	/* shared parallel execution state */
	ParallelHashJoinBatchAccessor *batches; /* per-batch accessors for parallel hash */
	dsa_pointer current_chunk_shared;	/* shared pointer to the current chunk */
} HashJoinTableData;

typedef struct HashJoinTableData *HashJoinTable;
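In the non-parallel case, buckets.unshared[i] is the head of a singly linked chain of HashJoinTuple entries, linked through the next.unshared field of HashJoinTupleData (shown in the next structure). A hedged sketch of how a probe walks one chain, loosely modeled on the logic of ExecScanHashBucket (the helper name is hypothetical):

/*
 * Sketch only (not from the source): scan one in-memory bucket for entries
 * whose hash value matches, the way ExecScanHashBucket walks a chain. The
 * hash-clause re-check that the real code performs is omitted.
 */
static int
count_hash_matches(HashJoinTable hashtable, int bucketno, uint32 hashvalue)
{
	HashJoinTuple hashTuple = hashtable->buckets.unshared[bucketno];
	int			nmatch = 0;

	while (hashTuple != NULL)
	{
		if (hashTuple->hashvalue == hashvalue)
			nmatch++;			/* real code would evaluate the join quals here */
		hashTuple = hashTuple->next.unshared;	/* follow the bucket chain */
	}
	return nmatch;
}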
HashJoinTupleData
The hash join tuple data.
/* ----------------------------------------------------------------
 * hash-join hash table structures
 *
 * Each active hashjoin has a HashJoinTable control block, which is
 * palloc'd in the executor's per-query context. All other storage needed
 * for the hashjoin is kept in private memory contexts, two for each hashjoin.
 * This makes it easy and fast to release the storage when we don't need it
 * anymore. (Exception: data associated with the temp files lives in the
 * per-query context too, since we always call buffile.c in that context.)
 *
 * The hashtable contexts are made children of the per-query context, ensuring
 * that they will be discarded at end of statement even if the join is
 * aborted early by an error. (Likewise, any temporary files we make will
 * be cleaned up by the virtual file manager in event of an error.)
 *
 * Storage that should live through the entire join is allocated from the
 * "hashCxt", while storage that is only wanted for the current batch is
 * allocated in the "batchCxt". By resetting the batchCxt at the end of
 * each batch, we free all the per-batch storage reliably and without tedium.
 *
 * During first scan of inner relation, we get its tuples from executor.
 * If nbatch > 1 then tuples that don't belong in first batch get saved
 * into inner-batch temp files. The same statements apply for the
 * first scan of the outer relation, except we write tuples to outer-batch
 * temp files. After finishing the first scan, we do the following for
 * each remaining batch:
 * 1. Read tuples from inner batch file, load into hash buckets.
 * 2. Read tuples from outer batch file, match to hash buckets and output.
 *
 * It is possible to increase nbatch on the fly if the in-memory hash table
 * gets too big. The hash-value-to-batch computation is arranged so that this
 * can only cause a tuple to go into a later batch than previously thought,
 * never into an earlier batch. When we increase nbatch, we rescan the hash
 * table and dump out any tuples that are now of a later batch to the correct
 * inner batch file. Subsequently, while reading either inner or outer batch
 * files, we might find tuples that no longer belong to the current batch;
 * if so, we just dump them out to the correct batch file.
 * ----------------------------------------------------------------
 */

/* these are in nodes/execnodes.h: */
/* typedef struct HashJoinTupleData *HashJoinTuple; */
/* typedef struct HashJoinTableData *HashJoinTable; */

typedef struct HashJoinTupleData
{
	/* link to next tuple in same bucket */
	union
	{
		struct HashJoinTupleData *unshared;
		dsa_pointer shared;
	}			next;
	uint32		hashvalue;		/* tuple's hash code */
	/* Tuple data, in MinimalTuple format, follows on a MAXALIGN boundary */
} HashJoinTupleData;

#define HJTUPLE_OVERHEAD  MAXALIGN(sizeof(HashJoinTupleData))
#define HJTUPLE_MINTUPLE(hjtup)  \
	((MinimalTuple) ((char *) (hjtup) + HJTUPLE_OVERHEAD))
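The two macros encode the entry layout: a HashJoinTupleData header padded to a MAXALIGN boundary, followed immediately by the tuple body in MinimalTuple format. A short illustration (sketch only, with a hypothetical helper name):

/*
 * Sketch: recover the MinimalTuple stored right behind the padded header,
 * and the total entry size, which is what spaceUsed accounts for.
 */
static Size
hash_entry_size(HashJoinTuple hjtup)
{
	MinimalTuple mintup = HJTUPLE_MINTUPLE(hjtup);

	return HJTUPLE_OVERHEAD + mintup->t_len;
}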
II. Source Code Walkthrough
ExecHashTableCreate
The ExecHashTableCreate function creates the empty hash table structure that a hash join will use.
/*----------------------------------------------------------------------------------------------------
 HJ_BUILD_HASHTABLE phase
-----------------------------------------------------------------------------------------------------*/

/* ----------------
 * these are defined to avoid confusion problems with "left"
 * and "right" and "inner" and "outer". The convention is that
 * the "left" plan is the "outer" plan and the "right" plan is
 * the inner plan, but these make the code more readable.
 * ----------------
 */
#define innerPlan(node)			(((Plan *)(node))->righttree)
#define outerPlan(node)			(((Plan *)(node))->lefttree)

/* ----------------------------------------------------------------
 * ExecHashTableCreate
 *
 * create an empty hashtable data structure for hashjoin.
 * ----------------------------------------------------------------
 */
HashJoinTable
ExecHashTableCreate(HashState *state, List *hashOperators, bool keepNulls)
{
	Hash	   *node;
	HashJoinTable hashtable;
	Plan	   *outerNode;
	size_t		space_allowed;
	int			nbuckets;
	int			nbatch;
	double		rows;
	int			num_skew_mcvs;
	int			log2_nbuckets;
	int			nkeys;
	int			i;
	ListCell   *ho;
	MemoryContext oldcxt;

	/*
	 * Get information about the size of the relation to be hashed (it's the
	 * "outer" subtree of this node, but the inner relation of the hashjoin).
	 * Compute the appropriate size of the hash table.
	 */
	node = (Hash *) state->ps.plan; /* the Hash plan node */
	outerNode = outerPlan(node);	/* its outer subplan, i.e. the hashjoin's
									 * inner relation */

	/*
	 * If this is shared hash table with a partial plan, then we can't use
	 * outerNode->plan_rows to estimate its size. We need an estimate of the
	 * total number of rows across all copies of the partial plan.
	 */
	rows = node->plan.parallel_aware ? node->rows_total : outerNode->plan_rows;

	ExecChooseHashTableSize(rows, outerNode->plan_width,
							OidIsValid(node->skewTable),
							state->parallel_state != NULL,
							state->parallel_state != NULL ?
							state->parallel_state->nparticipants - 1 : 0,
							&space_allowed,
							&nbuckets, &nbatch, &num_skew_mcvs);

	/* nbuckets must be a power of 2 */
	log2_nbuckets = my_log2(nbuckets);
	Assert(nbuckets == (1 << log2_nbuckets));

	/*
	 * Initialize the hash table control block.
	 *
	 * The hashtable control block is just palloc'd from the executor's
	 * per-query memory context. Everything else should be kept inside the
	 * subsidiary hashCxt or batchCxt.
	 */
	hashtable = (HashJoinTable) palloc(sizeof(HashJoinTableData));
	hashtable->nbuckets = nbuckets;
	hashtable->nbuckets_original = nbuckets;
	hashtable->nbuckets_optimal = nbuckets;
	hashtable->log2_nbuckets = log2_nbuckets;
	hashtable->log2_nbuckets_optimal = log2_nbuckets;
	hashtable->buckets.unshared = NULL;
	hashtable->keepNulls = keepNulls;
	hashtable->skewEnabled = false;
	hashtable->skewBucket = NULL;
	hashtable->skewBucketLen = 0;
	hashtable->nSkewBuckets = 0;
	hashtable->skewBucketNums = NULL;
	hashtable->nbatch = nbatch;
	hashtable->curbatch = 0;
	hashtable->nbatch_original = nbatch;
	hashtable->nbatch_outstart = nbatch;
	hashtable->growEnabled = true;
	hashtable->totalTuples = 0;
	hashtable->partialTuples = 0;
	hashtable->skewTuples = 0;
	hashtable->innerBatchFile = NULL;
	hashtable->outerBatchFile = NULL;
	hashtable->spaceUsed = 0;
	hashtable->spacePeak = 0;
	hashtable->spaceAllowed = space_allowed;
	hashtable->spaceUsedSkew = 0;
	hashtable->spaceAllowedSkew =
		hashtable->spaceAllowed * SKEW_WORK_MEM_PERCENT / 100;
	hashtable->chunks = NULL;
	hashtable->current_chunk = NULL;
	hashtable->parallel_state = state->parallel_state;
	hashtable->area = state->ps.state->es_query_dsa;
	hashtable->batches = NULL;

#ifdef HJDEBUG
	printf("Hashjoin %p: initial nbatch = %d, nbuckets = %d\n",
		   hashtable, nbatch, nbuckets);
#endif
	/*
	 * Create temporary memory contexts in which to keep the hashtable working
	 * storage. See notes in executor/hashjoin.h.
	 */
	hashtable->hashCxt = AllocSetContextCreate(CurrentMemoryContext,
											   "HashTableContext",
											   ALLOCSET_DEFAULT_SIZES);
	hashtable->batchCxt = AllocSetContextCreate(hashtable->hashCxt,
												"HashBatchContext",
												ALLOCSET_DEFAULT_SIZES);

	/* Allocate data that will live for the life of the hashjoin */
	oldcxt = MemoryContextSwitchTo(hashtable->hashCxt);

	/*
	 * Get info about the hash functions to be used for each hash key. Also
	 * remember whether the join operators are strict.
	 */
	nkeys = list_length(hashOperators);
	hashtable->outer_hashfunctions =
		(FmgrInfo *) palloc(nkeys * sizeof(FmgrInfo));	/* hash functions for the
														 * outer relation */
	hashtable->inner_hashfunctions =
		(FmgrInfo *) palloc(nkeys * sizeof(FmgrInfo));	/* hash functions for the
														 * inner relation */
	hashtable->hashStrict = (bool *) palloc(nkeys * sizeof(bool));	/* strictness flags */
	i = 0;
	foreach(ho, hashOperators)
	{
		Oid			hashop = lfirst_oid(ho);
		Oid			left_hashfn;
		Oid			right_hashfn;

		/*
		 * Look up the OIDs of the standard hash functions compatible with
		 * the operator, for its LHS and/or RHS datatypes.
		 */
		if (!get_op_hash_functions(hashop, &left_hashfn, &right_hashfn))
			elog(ERROR, "could not find hash function for hash operator %u",
				 hashop);
		fmgr_info(left_hashfn, &hashtable->outer_hashfunctions[i]);
		fmgr_info(right_hashfn, &hashtable->inner_hashfunctions[i]);
		hashtable->hashStrict[i] = op_strict(hashop);
		i++;
	}

	if (nbatch > 1 && hashtable->parallel_state == NULL)
	{
		/*
		 * allocate and initialize the file arrays in hashCxt (not needed for
		 * parallel case which uses shared tuplestores instead of raw files)
		 */
		hashtable->innerBatchFile = (BufFile **)
			palloc0(nbatch * sizeof(BufFile *));	/* buffers each batch's inner tuples */
		hashtable->outerBatchFile = (BufFile **)
			palloc0(nbatch * sizeof(BufFile *));	/* buffers each batch's outer tuples */
		/* The files will not be opened until needed... */
		/* ... but make sure we have temp tablespaces established for them */
		PrepareTempTablespaces();
	}

	MemoryContextSwitchTo(oldcxt);	/* switch back to the original context */
	if (hashtable->parallel_state)	/* parallel hash */
	{
		ParallelHashJoinState *pstate = hashtable->parallel_state;
		Barrier    *build_barrier;

		/*
		 * Attach to the build barrier. The corresponding detach operation is
		 * in ExecHashTableDetach. Note that we won't attach to the
		 * batch_barrier for batch 0 yet. We'll attach later and start it out
		 * in PHJ_BATCH_PROBING phase, because batch 0 is allocated up front
		 * and then loaded while hashing (the standard hybrid hash join
		 * algorithm), and we'll coordinate that using build_barrier.
		 */
		build_barrier = &pstate->build_barrier;
		BarrierAttach(build_barrier);

		/*
		 * So far we have no idea whether there are any other participants,
		 * and if so, what phase they are working on. The only thing we care
		 * about at this point is whether someone has already created the
		 * SharedHashJoinBatch objects and the hash table for batch 0. One
		 * backend will be elected to do that now if necessary.
		 */
		if (BarrierPhase(build_barrier) == PHJ_BUILD_ELECTING &&
			BarrierArriveAndWait(build_barrier, WAIT_EVENT_HASH_BUILD_ELECTING))
		{
			pstate->nbatch = nbatch;
			pstate->space_allowed = space_allowed;
			pstate->growth = PHJ_GROWTH_OK;

			/* Set up the shared state for coordinating batches. */
			ExecParallelHashJoinSetUpBatches(hashtable, nbatch);

			/*
			 * Allocate batch 0's hash table up front so we can load it
			 * directly while hashing.
			 */
			pstate->nbuckets = nbuckets;
			ExecParallelHashTableAlloc(hashtable, 0);
		}

		/*
		 * The next Parallel Hash synchronization point is in
		 * MultiExecParallelHash(), which will progress it all the way to
		 * PHJ_BUILD_DONE. The caller must not return control from this
		 * executor node between now and then.
		 */
	}
	else						/* non-parallel hash */
	{
		/*
		 * Prepare context for the first-scan space allocations; allocate the
		 * hashbucket array therein, and set each bucket "empty".
		 */
		MemoryContextSwitchTo(hashtable->batchCxt);

		hashtable->buckets.unshared = (HashJoinTuple *)
			palloc0(nbuckets * sizeof(HashJoinTuple));

		/*
		 * Set up for skew optimization, if possible and there's a need for
		 * more than one batch. (In a one-batch join, there's no point in
		 * it.)
		 */
		if (nbatch > 1)
			ExecHashBuildSkewHash(hashtable, node, num_skew_mcvs);

		MemoryContextSwitchTo(oldcxt);
	}

	return hashtable;
}
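For context, the HJ_BUILD_HASHTABLE arm of ExecHashJoinImpl in nodeHashjoin.c calls this function roughly as follows (abridged from the source as best reconstructed here; the gdb session at the end of this article stops at exactly this assignment):

	/* Abridged caller, from the HJ_BUILD_HASHTABLE phase of ExecHashJoinImpl */
	hashtable = ExecHashTableCreate(hashState,
									node->hj_HashOperators,
									HJ_FILL_INNER(node));	/* keepNulls for right/full joins */
	node->hj_HashTable = hashtable;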
/*
 * This routine fills a FmgrInfo struct, given the OID
 * of the function to be called.
 *
 * The caller's CurrentMemoryContext is used as the fn_mcxt of the info
 * struct; this means that any subsidiary data attached to the info struct
 * (either by fmgr_info itself, or later on by a function call handler)
 * will be allocated in that context. The caller must ensure that this
 * context is at least as long-lived as the info struct itself. This is
 * not a problem in typical cases where the info struct is on the stack or
 * in freshly-palloc'd space. However, if one intends to store an info
 * struct in a long-lived table, it's better to use fmgr_info_cxt.
 */
void
fmgr_info(Oid functionId, FmgrInfo *finfo)
{
	fmgr_info_cxt_security(functionId, finfo, CurrentMemoryContext, false);
}
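Once an FmgrInfo has been filled, the hash function can be invoked cheaply for every tuple through the fmgr call macros; this is what ExecHashGetHashValue later does with the arrays set up above. A minimal sketch, assuming the int4 equality operator (Int4EqualOperator) as the hash operator and a hypothetical helper name:

#include "postgres.h"
#include "catalog/pg_operator_d.h"	/* Int4EqualOperator */
#include "fmgr.h"
#include "utils/lsyscache.h"

static uint32
hash_one_int4(int32 value)
{
	Oid			left_hashfn;
	Oid			right_hashfn;
	FmgrInfo	hashfn;

	/* look up the standard hash function paired with int4 "=" */
	if (!get_op_hash_functions(Int4EqualOperator, &left_hashfn, &right_hashfn))
		elog(ERROR, "no hash function for operator");

	fmgr_info(left_hashfn, &hashfn);	/* fill the FmgrInfo once... */

	/* ...then call the function once per tuple */
	return DatumGetUInt32(FunctionCall1(&hashfn, Int32GetDatum(value)));
}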
ExecChooseHashTableSize
The ExecChooseHashTableSize function computes an appropriate hash table size given the estimated size of the relation to be hashed (number of rows and average row width).
/*
 * Compute appropriate size for hashtable given the estimated size of the
 * relation to be hashed (number of rows and average row width).
 *
 * This is exported so that the planner's costsize.c can use it.
 */

/* Target bucket loading (tuples per bucket) */
#define NTUP_PER_BUCKET			1
void
ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew,
						bool try_combined_work_mem,
						int parallel_workers,
						size_t *space_allowed,
						int *numbuckets,
						int *numbatches,
						int *num_skew_mcvs)
{
	int			tupsize;		/* estimated tuple size */
	double		inner_rel_bytes;	/* estimated size of the inner relation */
	long		bucket_bytes;	/* size of the bucket array */
	long		hash_table_bytes;	/* in-memory hash table size */
	long		skew_table_bytes;	/* skew table size */
	long		max_pointers;	/* maximum number of pointers */
	long		mppow2;			/* max_pointers rounded to a power of 2 */
	int			nbatch = 1;		/* number of batches */
	int			nbuckets;		/* number of buckets */
	double		dbuckets;		/* nbuckets as a double, before clamping */

	/* Force a plausible relation size (1000.0) if no info */
	if (ntuples <= 0.0)
		ntuples = 1000.0;
	/*
	 * Estimate tupsize based on footprint of tuple in hashtable... note this
	 * does not allow for any palloc overhead. The manipulations of spaceUsed
	 * don't count palloc overhead either.
	 */
	tupsize = HJTUPLE_OVERHEAD +
		MAXALIGN(SizeofMinimalTupleHeader) +
		MAXALIGN(tupwidth);
	inner_rel_bytes = ntuples * tupsize;

	/*
	 * Target in-memory hashtable size is work_mem kilobytes.
	 */
	hash_table_bytes = work_mem * 1024L;

	/*
	 * Parallel Hash tries to use the combined work_mem of all workers to
	 * avoid the need to batch. If that won't work, it falls back to work_mem
	 * per worker and tries to process batches in parallel.
	 */
	if (try_combined_work_mem)
		hash_table_bytes += hash_table_bytes * parallel_workers;

	*space_allowed = hash_table_bytes;
	/*
	 * If skew optimization is possible, estimate the number of skew buckets
	 * that will fit in the memory allowed, and decrement the assumed space
	 * available for the main hash table accordingly.
	 *
	 * We make the optimistic assumption that each skew bucket will contain
	 * one inner-relation tuple. If that turns out to be low, we will recover
	 * at runtime by reducing the number of skew buckets.
	 *
	 * hashtable->skewBucket will have up to 8 times as many HashSkewBucket
	 * pointers as the number of MCVs we allow, since ExecHashBuildSkewHash
	 * will round up to the next power of 2 and then multiply by 4 to reduce
	 * collisions.
	 */
	if (useskew)
	{
		skew_table_bytes = hash_table_bytes * SKEW_WORK_MEM_PERCENT / 100;

		/*----------
		 * Divisor is:
		 * size of a hash tuple +
		 * worst-case size of skewBucket[] per MCV +
		 * size of skewBucketNums[] entry +
		 * size of skew bucket struct itself
		 *----------
		 */
		*num_skew_mcvs = skew_table_bytes / (tupsize +
											 (8 * sizeof(HashSkewBucket *)) +
											 sizeof(int) +
											 SKEW_BUCKET_OVERHEAD);
		if (*num_skew_mcvs > 0)
			hash_table_bytes -= skew_table_bytes;
	}
	else
		*num_skew_mcvs = 0;		/* no skew optimization */
	/*
	 * Set nbuckets to achieve an average bucket load of NTUP_PER_BUCKET when
	 * memory is filled, assuming a single batch; but limit the value so that
	 * the pointer arrays we'll try to allocate do not exceed work_mem nor
	 * MaxAllocSize.
	 *
	 * Note that both nbuckets and nbatch must be powers of 2 to make
	 * ExecHashGetBucketAndBatch fast.
	 */
	max_pointers = *space_allowed / sizeof(HashJoinTuple);
	max_pointers = Min(max_pointers, MaxAllocSize / sizeof(HashJoinTuple));
	/* If max_pointers isn't a power of 2, must round it down to one */
	mppow2 = 1L << my_log2(max_pointers);
	if (max_pointers != mppow2)
		max_pointers = mppow2 / 2;

	/* Also ensure we avoid integer overflow in nbatch and nbuckets */
	/* (this step is redundant given the current value of MaxAllocSize) */
	max_pointers = Min(max_pointers, INT_MAX / 2);

	dbuckets = ceil(ntuples / NTUP_PER_BUCKET);
	dbuckets = Min(dbuckets, max_pointers);
	nbuckets = (int) dbuckets;
	/* don't let nbuckets be really small, though ... */
	nbuckets = Max(nbuckets, 1024);
	/* ... and force it to be a power of 2. */
	nbuckets = 1 << my_log2(nbuckets);
	/*
	 * If there's not enough space to store the projected number of tuples and
	 * the required bucket headers, we will need multiple batches.
	 */
	bucket_bytes = sizeof(HashJoinTuple) * nbuckets;
	if (inner_rel_bytes + bucket_bytes > hash_table_bytes)
	{
		/* We'll need multiple batches */
		long		lbuckets;
		double		dbatch;
		int			minbatch;
		long		bucket_size;

		/*
		 * If Parallel Hash with combined work_mem would still need multiple
		 * batches, we'll have to fall back to regular work_mem budget.
		 */
		if (try_combined_work_mem)
		{
			ExecChooseHashTableSize(ntuples, tupwidth, useskew,
									false, parallel_workers,
									space_allowed,
									numbuckets,
									numbatches,
									num_skew_mcvs);
			return;
		}

		/*
		 * Estimate the number of buckets we'll want to have when work_mem is
		 * entirely full. Each bucket will contain a bucket pointer plus
		 * NTUP_PER_BUCKET tuples, whose projected size already includes
		 * overhead for the hash code, pointer to the next tuple, etc.
		 */
		bucket_size = (tupsize * NTUP_PER_BUCKET + sizeof(HashJoinTuple));
		lbuckets = 1L << my_log2(hash_table_bytes / bucket_size);
		lbuckets = Min(lbuckets, max_pointers);
		nbuckets = (int) lbuckets;
		nbuckets = 1 << my_log2(nbuckets);
		bucket_bytes = nbuckets * sizeof(HashJoinTuple);

		/*
		 * Buckets are simple pointers to hashjoin tuples, while tupsize
		 * includes the pointer, hash code, and MinimalTupleData. So buckets
		 * should never really exceed 25% of work_mem (even for
		 * NTUP_PER_BUCKET=1); except maybe for work_mem values that are not
		 * 2^N bytes, where we might get more because of doubling. So let's
		 * look for 50% here.
		 */
		Assert(bucket_bytes <= hash_table_bytes / 2);

		/* Calculate required number of batches. */
		dbatch = ceil(inner_rel_bytes / (hash_table_bytes - bucket_bytes));
		dbatch = Min(dbatch, max_pointers);
		minbatch = (int) dbatch;
		nbatch = 2;
		while (nbatch < minbatch)
			nbatch <<= 1;
	}

	Assert(nbuckets > 0);
	Assert(nbatch > 0);

	*numbuckets = nbuckets;
	*numbatches = nbatch;
}
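To make the arithmetic concrete, the standalone sketch below (hypothetical code, not part of PostgreSQL) redoes the single-batch sizing for the case traced in the next section: ntuples=10000, tupwidth=20, work_mem=16MB, on a 64-bit build where MAXALIGN rounds up to a multiple of 8. The two header sizes are assumptions for such a build:

#include <stdio.h>

#define MY_MAXALIGN(x) (((x) + 7) & ~7)	/* 8-byte alignment, 64-bit build */

int
main(void)
{
	/* assumed values for a 64-bit PostgreSQL 11 build */
	int			hjtuple_overhead = MY_MAXALIGN(12); /* assuming sizeof(HashJoinTupleData) == 12 */
	int			minimal_header = MY_MAXALIGN(15);	/* assuming SizeofMinimalTupleHeader == 15 */
	double		ntuples = 10000.0;
	int			tupwidth = 20;
	long		hash_table_bytes = 16384 * 1024L;	/* work_mem = 16MB */

	int			tupsize = hjtuple_overhead + minimal_header + MY_MAXALIGN(tupwidth);
	double		inner_rel_bytes = ntuples * tupsize;

	/* NTUP_PER_BUCKET=1, clamped below at 1024, rounded up to a power of 2 */
	int			nbuckets = 16384;	/* 1 << my_log2(10000) */
	long		bucket_bytes = (long) sizeof(void *) * nbuckets;

	printf("tupsize=%d inner_rel_bytes=%.0f bucket_bytes=%ld\n",
		   tupsize, inner_rel_bytes, bucket_bytes);	/* 56, 560000, 131072 */
	printf("single batch? %s\n",
		   inner_rel_bytes + bucket_bytes <= hash_table_bytes ? "yes" : "no");
	return 0;
}

With these inputs the program prints tupsize=56, inner_rel_bytes=560000, and bucket_bytes=131072; since 560000 + 131072 fits comfortably in 16MB, nbatch stays at 1. (The real code first deducts the skew-table share from hash_table_bytes; the conclusion is the same here.) These values match what the debugger shows below.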
III. Tracing and Analysis
The test script is as follows:
testdb=# set enable_nestloop=false;
SET
testdb=# set enable_mergejoin=false;
SET
testdb=# explain verbose select dw.*,grjf.grbh,grjf.xm,grjf.ny,grjf.je
testdb-# from t_dwxx dw,lateral (select gr.grbh,gr.xm,jf.ny,jf.je
testdb(# from t_grxx gr inner join t_jfxx jf
testdb(# on gr.dwbh = dw.dwbh
testdb(# and gr.grbh = jf.grbh) grjf
testdb-# order by dw.dwbh;
QUERY PLAN
-----------------------------------------------------------------------------------------------
Sort (cost=14828.83..15078.46 rows=99850 width=47)
Output: dw.dwmc, dw.dwbh, dw.dwdz, gr.grbh, gr.xm, jf.ny, jf.je
Sort Key: dw.dwbh
-> Hash Join (cost=3176.00..6537.55 rows=99850 width=47)
Output: dw.dwmc, dw.dwbh, dw.dwdz, gr.grbh, gr.xm, jf.ny, jf.je
Hash Cond: ((gr.grbh)::text = (jf.grbh)::text)
-> Hash Join (cost=289.00..2277.61 rows=99850 width=32)
Output: dw.dwmc, dw.dwbh, dw.dwdz, gr.grbh, gr.xm
Inner Unique: true
Hash Cond: ((gr.dwbh)::text = (dw.dwbh)::text)
-> Seq Scan on public.t_grxx gr (cost=0.00..1726.00 rows=100000 width=16)
Output: gr.dwbh, gr.grbh, gr.xm, gr.xb, gr.nl
-> Hash (cost=164.00..164.00 rows=10000 width=20)
Output: dw.dwmc, dw.dwbh, dw.dwdz
-> Seq Scan on public.t_dwxx dw (cost=0.00..164.00 rows=10000 width=20)
Output: dw.dwmc, dw.dwbh, dw.dwdz
-> Hash (cost=1637.00..1637.00 rows=100000 width=20)
Output: jf.ny, jf.je, jf.grbh
-> Seq Scan on public.t_jfxx jf (cost=0.00..1637.00 rows=100000 width=20)
Output: jf.ny, jf.je, jf.grbh
(20 rows)
Start gdb, set a breakpoint, and stop in ExecHashTableCreate:
(gdb) b ExecHashTableCreate
Breakpoint 1 at 0x6fc75d: file nodeHash.c, line 449.
(gdb) c
Continuing.
Breakpoint 1, ExecHashTableCreate (state=0x1e3cbc8, hashOperators=0x1e59890, keepNulls=false) at nodeHash.c:449
449 node = (Hash *) state->ps.plan;
Fetch the relevant information:
449 node = (Hash *) state->ps.plan;
(gdb) n
450 outerNode = outerPlan(node);
(gdb)
457 rows = node->plan.parallel_aware ? node->rows_total : outerNode->plan_rows;
(gdb)
462 state->parallel_state != NULL ?
(gdb)
459 ExecChooseHashTableSize(rows, outerNode->plan_width,
(gdb)
Inspect the Hash node. Its outer node is a SeqScan, and the inner relation (the one the hash table is built from) is estimated at 10000 rows:
(gdb) p *node
$1 = {plan = {type = T_Hash, startup_cost = 164, total_cost = 164, plan_rows = 10000, plan_width = 20,
parallel_aware = false, parallel_safe = true, plan_node_id = 4, targetlist = 0x1e4bf90, qual = 0x0,
lefttree = 0x1e493e8, righttree = 0x0, initPlan = 0x0, extParam = 0x0, allParam = 0x0}, skewTable = 16977,
skewColumn = 1, skewInherit = false, rows_total = 0}
(gdb) p *outerNode
$2 = {type = T_SeqScan, startup_cost = 0, total_cost = 164, plan_rows = 10000, plan_width = 20, parallel_aware = false,
parallel_safe = true, plan_node_id = 5, targetlist = 0x1e492b0, qual = 0x0, lefttree = 0x0, righttree = 0x0,
initPlan = 0x0, extParam = 0x0, allParam = 0x0}
(gdb) p rows
$3 = 10000
(gdb)
Step into ExecChooseHashTableSize:
(gdb) step
ExecChooseHashTableSize (ntuples=10000, tupwidth=20, useskew=true, try_combined_work_mem=false, parallel_workers=0,
space_allowed=0x7ffdcf148540, numbuckets=0x7ffdcf14853c, numbatches=0x7ffdcf148538, num_skew_mcvs=0x7ffdcf148534)
at nodeHash.c:677
677 int nbatch = 1;
ExecChooseHashTableSize: compute the tuple size (56 bytes), the inner relation size (about 560KB), and the hash table space (16MB):
(gdb) n
682 if (ntuples <= 0.0)
(gdb)
690 tupsize = HJTUPLE_OVERHEAD +
(gdb)
693 inner_rel_bytes = ntuples * tupsize;
(gdb)
698 hash_table_bytes = work_mem * 1024L;
(gdb)
705 if (try_combined_work_mem)
(gdb) p tupsize
$4 = 56
(gdb) p inner_rel_bytes
$5 = 560000
(gdb) p hash_table_bytes
$6 = 16777216
ExecChooseHashTableSize: skew optimization is used; its space is carved out of the hash table budget:
(gdb) n
708 *space_allowed = hash_table_bytes;
(gdb)
724 if (useskew)
(gdb)
726 skew_table_bytes = hash_table_bytes * SKEW_WORK_MEM_PERCENT / 100;
(gdb) p useskew
$8 = true
(gdb) p hash_table_bytes
$9 = 16441672
(gdb) p skew_table_bytes
$10 = 335544
(gdb) p num_skew_mcvs
$11 = (int *) 0x7ffdcf148534
(gdb) p *num_skew_mcvs
$12 = 2396
(gdb)
ExecChooseHashTableSize: compute the maximum number of pointers (2097152):
(gdb) n
756 max_pointers = Min(max_pointers, MaxAllocSize / sizeof(HashJoinTuple));
(gdb)
758 mppow2 = 1L << my_log2(max_pointers);
(gdb) n
759 if (max_pointers != mppow2)
(gdb) p max_pointers
$13 = 2097152
(gdb) p mppow2
$15 = 2097152
ExecChooseHashTableSize: compute the number of hash buckets:
(gdb) n
764 max_pointers = Min(max_pointers, INT_MAX / 2);
(gdb)
766 dbuckets = ceil(ntuples / NTUP_PER_BUCKET);
(gdb)
767 dbuckets = Min(dbuckets, max_pointers);
(gdb)
768 nbuckets = (int) dbuckets;
(gdb)
770 nbuckets = Max(nbuckets, 1024);
(gdb)
772 nbuckets = 1 << my_log2(nbuckets);
(gdb)
778 bucket_bytes = sizeof(HashJoinTuple) * nbuckets;
(gdb) n
779 if (inner_rel_bytes + bucket_bytes > hash_table_bytes)
(gdb)
834 Assert(nbuckets > 0);
(gdb) p dbuckets
$16 = 10000
(gdb) p nbuckets
$17 = 16384
(gdb) p bucket_bytes
$18 = 131072
ExecChooseHashTableSize: only one batch is needed; store the results and return:
835 Assert(nbatch > 0);
(gdb)
837 *numbuckets = nbuckets;
(gdb)
838 *numbatches = nbatch;
(gdb)
839 }
(gdb)
(gdb)
ExecHashTableCreate (state=0x1e3cbc8, hashOperators=0x1e59890, keepNulls=false) at nodeHash.c:468
468 log2_nbuckets = my_log2(nbuckets);
Initialize the hash table:
468 log2_nbuckets = my_log2(nbuckets);
(gdb) p nbuckets
$19 = 16384
(gdb) n
469 Assert(nbuckets == (1 << log2_nbuckets));
(gdb)
478 hashtable = (HashJoinTable) palloc(sizeof(HashJoinTableData));
(gdb)
479 hashtable->nbuckets = nbuckets;
...
Allocate the memory contexts:
...
(gdb)
522 hashtable->hashCxt = AllocSetContextCreate(CurrentMemoryContext,
(gdb)
526 hashtable->batchCxt = AllocSetContextCreate(hashtable->hashCxt,
(gdb)
532 oldcxt = MemoryContextSwitchTo(hashtable->hashCxt);
(gdb)
Switch the memory context and initialize the hash functions:
(gdb)
532 oldcxt = MemoryContextSwitchTo(hashtable->hashCxt);
(gdb) n
538 nkeys = list_length(hashOperators);
(gdb)
540 (FmgrInfo *) palloc(nkeys * sizeof(FmgrInfo));
(gdb) p nkeys
$20 = 1
(gdb) n
539 hashtable->outer_hashfunctions =
(gdb)
542 (FmgrInfo *) palloc(nkeys * sizeof(FmgrInfo));
(gdb)
541 hashtable->inner_hashfunctions =
(gdb)
543 hashtable->hashStrict = (bool *) palloc(nkeys * sizeof(bool));
(gdb)
544 i = 0;
Initialize the hash operators:
(gdb) n
545 foreach(ho, hashOperators)
(gdb)
547 Oid hashop = lfirst_oid(ho);
(gdb)
551 if (!get_op_hash_functions(hashop, &left_hashfn, &right_hashfn))
(gdb)
554 fmgr_info(left_hashfn, &hashtable->outer_hashfunctions[i]);
(gdb)
555 fmgr_info(right_hashfn, &hashtable->inner_hashfunctions[i]);
(gdb)
556 hashtable->hashStrict[i] = op_strict(hashop);
(gdb)
557 i++;
(gdb)
545 foreach(ho, hashOperators)
(gdb) p *hashtable->hashStrict
$21 = true
(gdb) n
560 if (nbatch > 1 && hashtable->parallel_state == NULL)
Allocate memory for the hash bucket array:
(gdb) n
575 MemoryContextSwitchTo(oldcxt);
(gdb)
577 if (hashtable->parallel_state)
(gdb)
631 MemoryContextSwitchTo(hashtable->batchCxt);
(gdb)
634 palloc0(nbuckets * sizeof(HashJoinTuple));
(gdb)
633 hashtable->buckets.unshared = (HashJoinTuple *)
(gdb) p nbuckets
$23 = 16384
Construction complete; return the hash table:
(gdb) n
641 if (nbatch > 1)
(gdb)
644 MemoryContextSwitchTo(oldcxt);
(gdb)
647 return hashtable;
(gdb)
648 }
(gdb)
ExecHashJoinImpl (pstate=0x1e3c048, parallel=false) at nodeHashjoin.c:282
282 node->hj_HashTable = hashtable;
(gdb)
DONE!
IV. References
Hash Joins: Past, Present and Future/PGCon 2017
A Look at How Postgres Executes a Tiny Join - Part 1
A Look at How Postgres Executes a Tiny Join - Part 2
Assignment 2 Symmetric Hash Join
Source: "ITPUB Blog", http://blog.itpub.net/6906/viewspace-2374799/. Please credit the source when reposting.