PostgreSQL 原始碼解讀(97)- 查詢語句#79(ExecHashJoin函式#5-H...
本節是ExecHashJoin函式介紹的第五部分,主要介紹了ExecHashJoin中依賴的其他函式的實現邏輯,這些函式在HJ_NEED_NEW_BATCH階段中使用,主要的函式是ExecHashJoinNewBatch。
一、資料結構
JoinState
Hash/NestLoop/Merge Join的基類
/* ----------------
* JoinState information
*
* Superclass for state nodes of join plans.
* Hash/NestLoop/Merge Join的基類
* ----------------
*/
typedef struct JoinState
{
PlanState ps;//基類PlanState
JoinType jointype;//連線型別
//在找到一個匹配inner tuple的時候,如需要跳轉到下一個outer tuple,則該值為T
bool single_match; /* True if we should skip to next outer tuple
* after finding one inner match */
//連線條件表示式(除了ps.qual)
ExprState *joinqual; /* JOIN quals (in addition to ps.qual) */
} JoinState;
HashJoinState
Hash Join執行期狀態結構體
/* these structs are defined in executor/hashjoin.h: */
typedef struct HashJoinTupleData *HashJoinTuple;
typedef struct HashJoinTableData *HashJoinTable;
typedef struct HashJoinState
{
JoinState js; /* 基類;its first field is NodeTag */
ExprState *hashclauses;//hash連線條件
List *hj_OuterHashKeys; /* 外表條件連結串列;list of ExprState nodes */
List *hj_InnerHashKeys; /* 內表連線條件;list of ExprState nodes */
List *hj_HashOperators; /* 運算子OIDs連結串列;list of operator OIDs */
HashJoinTable hj_HashTable;//Hash表
uint32 hj_CurHashValue;//當前的Hash值
int hj_CurBucketNo;//當前的bucket編號
int hj_CurSkewBucketNo;//行傾斜bucket編號
HashJoinTuple hj_CurTuple;//當前元組
TupleTableSlot *hj_OuterTupleSlot;//outer relation slot
TupleTableSlot *hj_HashTupleSlot;//Hash tuple slot
TupleTableSlot *hj_NullOuterTupleSlot;//用於外連線的outer虛擬slot
TupleTableSlot *hj_NullInnerTupleSlot;//用於外連線的inner虛擬slot
TupleTableSlot *hj_FirstOuterTupleSlot;//
int hj_JoinState;//JoinState狀態
bool hj_MatchedOuter;//是否匹配
bool hj_OuterNotEmpty;//outer relation是否為空
} HashJoinState;
HashJoinTable
Hash表資料結構
typedef struct HashJoinTableData
{
int nbuckets; /* 記憶體中的hash桶數;# buckets in the in-memory hash table */
int log2_nbuckets; /* 2的對數(nbuckets必須是2的冪);its log2 (nbuckets must be a power of 2) */
int nbuckets_original; /* 首次hash時的桶數;# buckets when starting the first hash */
int nbuckets_optimal; /* 最佳化後的桶數(每個批次);optimal # buckets (per batch) */
int log2_nbuckets_optimal; /* 2的對數;log2(nbuckets_optimal) */
/* buckets[i] is head of list of tuples in i'th in-memory bucket */
//bucket [i]是記憶體中第i個桶中的元組連結串列的head item
union
{
/* unshared array is per-batch storage, as are all the tuples */
//未共享陣列是按批處理儲存的,所有元組均如此
struct HashJoinTupleData **unshared;
/* shared array is per-query DSA area, as are all the tuples */
//共享陣列是每個查詢的DSA區域,所有元組均如此
dsa_pointer_atomic *shared;
} buckets;
bool keepNulls; /*如不匹配則儲存NULL元組,該值為T;true to store unmatchable NULL tuples */
bool skewEnabled; /*是否使用傾斜最佳化?;are we using skew optimization? */
HashSkewBucket **skewBucket; /* 傾斜的hash表桶數;hashtable of skew buckets */
int skewBucketLen; /* skewBucket陣列大小;size of skewBucket array (a power of 2!) */
int nSkewBuckets; /* 活動的傾斜桶數;number of active skew buckets */
int *skewBucketNums; /* 活動傾斜桶陣列索引;array indexes of active skew buckets */
int nbatch; /* 批次數;number of batches */
int curbatch; /* 當前批次,第一輪為0;current batch #; 0 during 1st pass */
int nbatch_original; /* 在開始inner掃描時的批次;nbatch when we started inner scan */
int nbatch_outstart; /* 在開始outer掃描時的批次;nbatch when we started outer scan */
bool growEnabled; /* 關閉nbatch增加的標記;flag to shut off nbatch increases */
double totalTuples; /* 從inner plan獲得的元組數;# tuples obtained from inner plan */
double partialTuples; /* 透過hashjoin獲得的inner元組數;# tuples obtained from inner plan by me */
double skewTuples; /* 傾斜元組數;# tuples inserted into skew tuples */
/*
* These arrays are allocated for the life of the hash join, but only if
* nbatch > 1. A file is opened only when we first write a tuple into it
* (otherwise its pointer remains NULL). Note that the zero'th array
* elements never get used, since we will process rather than dump out any
* tuples of batch zero.
* 這些陣列在雜湊連線的生命週期內分配,但僅當nbatch > 1時分配。
* 只有當第一次將元組寫入檔案時,檔案才會開啟(否則它的指標將保持NULL)。
* 注意,第0個陣列元素永遠不會被使用,因為批次0的元組永遠不會轉儲.
*/
BufFile **innerBatchFile; /* 每個批次的inner虛擬臨時檔案快取;buffered virtual temp file per batch */
BufFile **outerBatchFile; /* 每個批次的outer虛擬臨時檔案快取;buffered virtual temp file per batch */
/*
* Info about the datatype-specific hash functions for the datatypes being
* hashed. These are arrays of the same length as the number of hash join
* clauses (hash keys).
* 有關正在雜湊的資料型別的特定於資料型別的雜湊函式的資訊。
* 這些陣列的長度與雜湊連線子句(雜湊鍵)的數量相同。
*/
FmgrInfo *outer_hashfunctions; /* outer hash函式FmgrInfo結構體;lookup data for hash functions */
FmgrInfo *inner_hashfunctions; /* inner hash函式FmgrInfo結構體;lookup data for hash functions */
bool *hashStrict; /* 每個hash運算子是嚴格?is each hash join operator strict? */
Size spaceUsed; /* 元組使用的當前記憶體空間大小;memory space currently used by tuples */
Size spaceAllowed; /* 空間使用上限;upper limit for space used */
Size spacePeak; /* 峰值的空間使用;peak space used */
Size spaceUsedSkew; /* 傾斜雜湊表的當前空間使用情況;skew hash table's current space usage */
Size spaceAllowedSkew; /* 傾斜雜湊表的使用上限;upper limit for skew hashtable */
MemoryContext hashCxt; /* 整個雜湊連線儲存的上下文;context for whole-hash-join storage */
MemoryContext batchCxt; /* 該批次儲存的上下文;context for this-batch-only storage */
/* used for dense allocation of tuples (into linked chunks) */
//用於密集分配元組(到連結塊中)
HashMemoryChunk chunks; /* 整個批次使用一個連結串列;one list for the whole batch */
/* Shared and private state for Parallel Hash. */
//並行hash使用的共享和私有狀態
HashMemoryChunk current_chunk; /* 後臺程式的當前chunk;this backend's current chunk */
dsa_area *area; /* 用於分配記憶體的DSA區域;DSA area to allocate memory from */
ParallelHashJoinState *parallel_state;//並行執行狀態
ParallelHashJoinBatchAccessor *batches;//並行訪問器
dsa_pointer current_chunk_shared;//當前chunk的開始指標
} HashJoinTableData;
typedef struct HashJoinTableData *HashJoinTable;
HashJoinTupleData
Hash連線元組資料
/* ----------------------------------------------------------------
* hash-join hash table structures
*
* Each active hashjoin has a HashJoinTable control block, which is
* palloc'd in the executor's per-query context. All other storage needed
* for the hashjoin is kept in private memory contexts, two for each hashjoin.
* This makes it easy and fast to release the storage when we don't need it
* anymore. (Exception: data associated with the temp files lives in the
* per-query context too, since we always call buffile.c in that context.)
* 每個活動的hashjoin都有一個可雜湊的控制塊,它在執行程式的每個查詢上下文中都是透過palloc分配的。
* hashjoin所需的所有其他儲存都儲存在私有記憶體上下文中,每個hashjoin有兩個。
* 當不再需要它的時候,這使得釋放它變得簡單和快速。
* (例外:與臨時檔案相關的資料也存在於每個查詢上下文中,因為在這種情況下總是呼叫buffile.c。)
*
* The hashtable contexts are made children of the per-query context, ensuring
* that they will be discarded at end of statement even if the join is
* aborted early by an error. (Likewise, any temporary files we make will
* be cleaned up by the virtual file manager in event of an error.)
* hashtable上下文是每個查詢上下文的子上下文,確保在語句結束時丟棄它們,即使連線因錯誤而提前中止。
* (同樣,如果出現錯誤,虛擬檔案管理器將清理建立的任何臨時檔案。)
*
* Storage that should live through the entire join is allocated from the
* "hashCxt", while storage that is only wanted for the current batch is
* allocated in the "batchCxt". By resetting the batchCxt at the end of
* each batch, we free all the per-batch storage reliably and without tedium.
* 透過整個連線的儲存空間應從“hashCxt”分配,而只需要當前批處理的儲存空間在“batchCxt”中分配。
* 透過在每個批處理結束時重置batchCxt,可以可靠地釋放每個批處理的所有儲存,而不會感到單調乏味。
*
* During first scan of inner relation, we get its tuples from executor.
* If nbatch > 1 then tuples that don't belong in first batch get saved
* into inner-batch temp files. The same statements apply for the
* first scan of the outer relation, except we write tuples to outer-batch
* temp files. After finishing the first scan, we do the following for
* each remaining batch:
* 1. Read tuples from inner batch file, load into hash buckets.
* 2. Read tuples from outer batch file, match to hash buckets and output.
* 在內部關係的第一次掃描中,從執行者那裡得到了它的元組。
* 如果nbatch > 1,那麼不屬於第一批的元組將儲存到批內臨時檔案中。
* 相同的語句適用於外關係的第一次掃描,但是我們將元組寫入外部批處理臨時檔案。
* 完成第一次掃描後,我們對每批剩餘的元組做如下處理:
* 1.從內部批處理檔案讀取元組,載入到雜湊桶中。
* 2.從外部批處理檔案讀取元組,匹配雜湊桶和輸出。
*
* It is possible to increase nbatch on the fly if the in-memory hash table
* gets too big. The hash-value-to-batch computation is arranged so that this
* can only cause a tuple to go into a later batch than previously thought,
* never into an earlier batch. When we increase nbatch, we rescan the hash
* table and dump out any tuples that are now of a later batch to the correct
* inner batch file. Subsequently, while reading either inner or outer batch
* files, we might find tuples that no longer belong to the current batch;
* if so, we just dump them out to the correct batch file.
* 如果記憶體中的雜湊表太大,可以動態增加nbatch。
* 雜湊值到批處理的計算是這樣安排的:
* 這隻會導致元組進入比以前認為的更晚的批處理,而不會進入更早的批處理。
* 當增加nbatch時,重新掃描雜湊表,並將現在屬於後面批處理的任何元組轉儲到正確的內部批處理檔案。
* 隨後,在讀取內部或外部批處理檔案時,可能會發現不再屬於當前批處理的元組;
* 如果是這樣,只需將它們轉儲到正確的批處理檔案即可。
* ----------------------------------------------------------------
*/
/* these are in nodes/execnodes.h: */
/* typedef struct HashJoinTupleData *HashJoinTuple; */
/* typedef struct HashJoinTableData *HashJoinTable; */
typedef struct HashJoinTupleData
{
/* link to next tuple in same bucket */
//link同一個桶中的下一個元組
union
{
struct HashJoinTupleData *unshared;
dsa_pointer shared;
} next;
uint32 hashvalue; /* 元組的hash值;tuple's hash code */
/* Tuple data, in MinimalTuple format, follows on a MAXALIGN boundary */
} HashJoinTupleData;
#define HJTUPLE_OVERHEAD MAXALIGN(sizeof(HashJoinTupleData))
#define HJTUPLE_MINTUPLE(hjtup) \
((MinimalTuple) ((char *) (hjtup) + HJTUPLE_OVERHEAD))
二、原始碼解讀
ExecHashJoinNewBatch
切換到新的hashjoin批次,如成功,則返回T;已完成,返回F
/*----------------------------------------------------------------------------------------------------
HJ_FILL_OUTER_TUPLE 階段
----------------------------------------------------------------------------------------------------*/
//參見ExecHashJoin
/*----------------------------------------------------------------------------------------------------
HJ_FILL_INNER_TUPLES 階段
----------------------------------------------------------------------------------------------------*/
//參見ExecHashJoin
/*----------------------------------------------------------------------------------------------------
HJ_NEED_NEW_BATCH 階段
----------------------------------------------------------------------------------------------------*/
/*
* ExecHashJoinNewBatch
* switch to a new hashjoin batch
* 切換到新的hashjoin批次
*
* Returns true if successful, false if there are no more batches.
* 如成功,則返回T;已完成,返回F
*/
static bool
ExecHashJoinNewBatch(HashJoinState *hjstate)
{
HashJoinTable hashtable = hjstate->hj_HashTable;//Hash表
int nbatch;//批次數
int curbatch;//當前批次
BufFile *innerFile;//inner relation快取檔案
TupleTableSlot *slot;//slot
uint32 hashvalue;//hash值
nbatch = hashtable->nbatch;
curbatch = hashtable->curbatch;
if (curbatch > 0)
{
/*
* We no longer need the previous outer batch file; close it right
* away to free disk space.
* 不再需要以前的外批處理檔案;關閉它以釋放磁碟空間。
*/
if (hashtable->outerBatchFile[curbatch])
BufFileClose(hashtable->outerBatchFile[curbatch]);
hashtable->outerBatchFile[curbatch] = NULL;
}
else /* curbatch ==0,剛剛完成了第一個批次;we just finished the first batch */
{
/*
* Reset some of the skew optimization state variables, since we no
* longer need to consider skew tuples after the first batch. The
* memory context reset we are about to do will release the skew
* hashtable itself.
* 重置一些傾斜最佳化狀態變數,因為在第一批之後我們不再需要考慮傾斜元組。
* 我們將要進行的記憶體上下文重置將釋放傾斜散連結串列本身。
*/
hashtable->skewEnabled = false;
hashtable->skewBucket = NULL;
hashtable->skewBucketNums = NULL;
hashtable->nSkewBuckets = 0;
hashtable->spaceUsedSkew = 0;
}
/*
* We can always skip over any batches that are completely empty on both
* sides. We can sometimes skip over batches that are empty on only one
* side, but there are exceptions:
* 可以跳過任何兩邊都是空的批次。有時我們可以跳過只在一側為空的批處理,但也有例外:
*
* 1. In a left/full outer join, we have to process outer batches even if
* the inner batch is empty. Similarly, in a right/full outer join, we
* have to process inner batches even if the outer batch is empty.
* 1、在左/全外連線中,即使內部批是空的,我們也必須處理外批資料。
* 類似地,在右/完整外部連線中,即使外批資料為空,也必須處理內批資料。
*
* 2. If we have increased nbatch since the initial estimate, we have to
* scan inner batches since they might contain tuples that need to be
* reassigned to later inner batches.
* 2、如果在初始估算之後增加了nbatch,必須掃描內部批處理,
* 因為它們可能包含需要重新分配到後面的內部批處理的元組。
*
* 3. Similarly, if we have increased nbatch since starting the outer
* scan, we have to rescan outer batches in case they contain tuples that
* need to be reassigned.
* 3、類似地,如果在開始外部掃描之後增加了nbatch,必須重新掃描外部批處理,
* 以防它們包含需要重新分配的元組。
*/
curbatch++;
while (curbatch < nbatch &&
(hashtable->outerBatchFile[curbatch] == NULL ||
hashtable->innerBatchFile[curbatch] == NULL))
{
if (hashtable->outerBatchFile[curbatch] &&
HJ_FILL_OUTER(hjstate))
break; /* 符合規則1,需要處理;must process due to rule 1 */
if (hashtable->innerBatchFile[curbatch] &&
HJ_FILL_INNER(hjstate))
break; /* 符合規則1,需要處理;must process due to rule 1 */
if (hashtable->innerBatchFile[curbatch] &&
nbatch != hashtable->nbatch_original)
break; /* 符合規則2,需要處理;must process due to rule 2 */
if (hashtable->outerBatchFile[curbatch] &&
nbatch != hashtable->nbatch_outstart)
break; /* 符合規則3,需要處理;must process due to rule 3 */
/* We can ignore this batch. */
/* Release associated temp files right away. */
//均不符合規則1-3,則可以忽略這個批次了
//釋放臨時檔案
if (hashtable->innerBatchFile[curbatch])
BufFileClose(hashtable->innerBatchFile[curbatch]);
hashtable->innerBatchFile[curbatch] = NULL;
if (hashtable->outerBatchFile[curbatch])
BufFileClose(hashtable->outerBatchFile[curbatch]);
hashtable->outerBatchFile[curbatch] = NULL;
curbatch++;//下一個批次
}
if (curbatch >= nbatch)
return false; /* 已完成處理所有批次;no more batches */
hashtable->curbatch = curbatch;//下一個批次
/*
* Reload the hash table with the new inner batch (which could be empty)
* 使用新的內部批處理資料(有可能是空的)重新載入雜湊表
*/
ExecHashTableReset(hashtable);//重置Hash表
//inner relation檔案
innerFile = hashtable->innerBatchFile[curbatch];
//批次檔案不為NULL
if (innerFile != NULL)
{
if (BufFileSeek(innerFile, 0, 0L, SEEK_SET))//掃描innerFile,不成功,則報錯
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not rewind hash-join temporary file: %m")));
while ((slot = ExecHashJoinGetSavedTuple(hjstate,
innerFile,
&hashvalue,
hjstate->hj_HashTupleSlot)))//
{
/*
* NOTE: some tuples may be sent to future batches. Also, it is
* possible for hashtable->nbatch to be increased here!
* 注意:一些元組可能被髮送到未來的批次中。
* 另外,這裡也可以增加hashtable->nbatch !
*/
ExecHashTableInsert(hashtable, slot, hashvalue);
}
/*
* after we build the hash table, the inner batch file is no longer
* needed
* 構建雜湊表之後,內部批處理臨時檔案就不再需要了,關閉之
*/
BufFileClose(innerFile);
hashtable->innerBatchFile[curbatch] = NULL;
}
/*
* Rewind outer batch file (if present), so that we can start reading it.
*/
if (hashtable->outerBatchFile[curbatch] != NULL)
{
if (BufFileSeek(hashtable->outerBatchFile[curbatch], 0, 0L, SEEK_SET))
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not rewind hash-join temporary file: %m")));
}
return true;
}
/*
* ExecHashJoinGetSavedTuple
* read the next tuple from a batch file. Return NULL if no more.
* 從批次檔案中讀取下一個元組,如無則返回NULL
*
* On success, *hashvalue is set to the tuple's hash value, and the tuple
* itself is stored in the given slot.
* 如成功,*hashvalue引數設定為元組的Hash值,元組儲存在給定的slot中
*/
static TupleTableSlot *
ExecHashJoinGetSavedTuple(HashJoinState *hjstate,
BufFile *file,
uint32 *hashvalue,
TupleTableSlot *tupleSlot)
{
uint32 header[2];
size_t nread;
MinimalTuple tuple;
/*
* We check for interrupts here because this is typically taken as an
* alternative code path to an ExecProcNode() call, which would include
* such a check.
* 在這裡檢查中斷,因為這通常被作為ExecProcNode()呼叫的替代程式碼路徑,通常包含這樣的檢查。
*/
CHECK_FOR_INTERRUPTS();
/*
* Since both the hash value and the MinimalTuple length word are uint32,
* we can read them both in one BufFileRead() call without any type
* cheating.
* 因為雜湊值和最小長度字都是uint32,所以可以在一個BufFileRead()呼叫中讀取它們,
* 而不需要任何型別的cheating。
*/
nread = BufFileRead(file, (void *) header, sizeof(header));//讀取檔案
if (nread == 0) /* end of file */
{
//已讀取完畢,返回NULL
ExecClearTuple(tupleSlot);
return NULL;
}
if (nread != sizeof(header))//讀取的大小不等於header的大小,報錯
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not read from hash-join temporary file: %m")));
//hash值
*hashvalue = header[0];
//tuple,分配的記憶體大小為MinimalTuple結構體大小
tuple = (MinimalTuple) palloc(header[1]);
//元組大小
tuple->t_len = header[1];
//讀取檔案
nread = BufFileRead(file,
(void *) ((char *) tuple + sizeof(uint32)),
header[1] - sizeof(uint32));
//讀取有誤,報錯
if (nread != header[1] - sizeof(uint32))
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not read from hash-join temporary file: %m")));
//儲存到slot中
ExecForceStoreMinimalTuple(tuple, tupleSlot, true);
return tupleSlot;//返回slot
}
/*
* ExecHashTableInsert
* insert a tuple into the hash table depending on the hash value
* it may just go to a temp file for later batches
* 根據雜湊值向雜湊表中插入一個tuple,它可能只是轉到一個臨時檔案中以供以後的批處理
*
* Note: the passed TupleTableSlot may contain a regular, minimal, or virtual
* tuple; the minimal case in particular is certain to happen while reloading
* tuples from batch files. We could save some cycles in the regular-tuple
* case by not forcing the slot contents into minimal form; not clear if it's
* worth the messiness required.
* 注意:傳遞的TupleTableSlot可能包含一個常規、最小或虛擬元組;
* 在從批處理檔案中重新載入元組時,肯定會出現最小的情況。
* 如為常規元組,可以透過不強制slot內容變成最小形式來節省一些處理時間;
* 但不清楚這樣的混亂是否值得。
*/
void
ExecHashTableInsert(HashJoinTable hashtable,
TupleTableSlot *slot,
uint32 hashvalue)
{
bool shouldFree;//是否釋放資源
MinimalTuple tuple = ExecFetchSlotMinimalTuple(slot, &shouldFree);//獲取一個MinimalTuple
int bucketno;//桶號
int batchno;//批次號
ExecHashGetBucketAndBatch(hashtable, hashvalue,
&bucketno, &batchno);//獲取桶號和批次號
/*
* decide whether to put the tuple in the hash table or a temp file
* 判斷是否放到hash表中還是放到臨時檔案中
*/
if (batchno == hashtable->curbatch)
{
//批次號==hash表的批次號,放到hash表中
/*
* put the tuple in hash table
* 把元組放到hash表中
*/
HashJoinTuple hashTuple;//hash tuple
int hashTupleSize;//大小
double ntuples = (hashtable->totalTuples - hashtable->skewTuples);//常規元組數量
/* Create the HashJoinTuple */
//建立HashJoinTuple
hashTupleSize = HJTUPLE_OVERHEAD + tuple->t_len;//大小
hashTuple = (HashJoinTuple) dense_alloc(hashtable, hashTupleSize);//分配儲存空間
//hash值
hashTuple->hashvalue = hashvalue;
//複製資料
memcpy(HJTUPLE_MINTUPLE(hashTuple), tuple, tuple->t_len);
/*
* We always reset the tuple-matched flag on insertion. This is okay
* even when reloading a tuple from a batch file, since the tuple
* could not possibly have been matched to an outer tuple before it
* went into the batch file.
* 我們總是在插入時重置元組匹配的標誌。
* 即使在從批處理檔案中重新載入元組時,這也是可以的,
* 因為在元組進入批處理檔案之前,它不可能與外部元組匹配。
*/
HeapTupleHeaderClearMatch(HJTUPLE_MINTUPLE(hashTuple));
/* Push it onto the front of the bucket's list */
//
hashTuple->next.unshared = hashtable->buckets.unshared[bucketno];
hashtable->buckets.unshared[bucketno] = hashTuple;
/*
* Increase the (optimal) number of buckets if we just exceeded the
* NTUP_PER_BUCKET threshold, but only when there's still a single
* batch.
* 如果剛剛超過了NTUP_PER_BUCKET閾值,但是隻有在仍然有單個批處理時,
* 才增加桶的(最佳化後)數量。
*/
if (hashtable->nbatch == 1 &&
ntuples > (hashtable->nbuckets_optimal * NTUP_PER_BUCKET))
{
//只有1個批次而且元組數大於桶數*每桶的元組數
/* Guard against integer overflow and alloc size overflow */
//確保整數不要溢位
if (hashtable->nbuckets_optimal <= INT_MAX / 2 &&
hashtable->nbuckets_optimal * 2 <= MaxAllocSize / sizeof(HashJoinTuple))
{
hashtable->nbuckets_optimal *= 2;
hashtable->log2_nbuckets_optimal += 1;
}
}
/* Account for space used, and back off if we've used too much */
//宣告使用的儲存空間,如果使用太多,需要回退
hashtable->spaceUsed += hashTupleSize;
if (hashtable->spaceUsed > hashtable->spacePeak)
hashtable->spacePeak = hashtable->spaceUsed;//超出峰值,則跳轉
if (hashtable->spaceUsed +
hashtable->nbuckets_optimal * sizeof(HashJoinTuple)
> hashtable->spaceAllowed)
ExecHashIncreaseNumBatches(hashtable);//超出允許的空間,則增加批次
}
else
{
//不在這個批次中
/*
* put the tuple into a temp file for later batches
* 放在臨時檔案中以便後續處理(減少重複掃描)
*/
Assert(batchno > hashtable->curbatch);
ExecHashJoinSaveTuple(tuple,
hashvalue,
&hashtable->innerBatchFile[batchno]);//儲存tuple到臨時檔案中
}
if (shouldFree)//如需要釋放空間,則處理之
heap_free_minimal_tuple(tuple);
}
三、跟蹤分析
設定work_mem為較低的值(1MB),便於手工產生批次
testdb=# set work_mem='1MB';
SET
testdb=# show work_mem;
work_mem
----------
1MB
(1 row)
測試指令碼如下
testdb=# set enable_nestloop=false;
SET
testdb=# set enable_mergejoin=false;
SET
testdb=# explain verbose select dw.*,grjf.grbh,grjf.xm,grjf.ny,grjf.je
testdb-# from t_dwxx dw,lateral (select gr.grbh,gr.xm,jf.ny,jf.je
testdb(# from t_grxx gr inner join t_jfxx jf
testdb(# on gr.dwbh = dw.dwbh
testdb(# and gr.grbh = jf.grbh) grjf
testdb-# order by dw.dwbh;
QUERY PLAN
-----------------------------------------------------------------------------------------------
Sort (cost=14828.83..15078.46 rows=99850 width=47)
Output: dw.dwmc, dw.dwbh, dw.dwdz, gr.grbh, gr.xm, jf.ny, jf.je
Sort Key: dw.dwbh
-> Hash Join (cost=3176.00..6537.55 rows=99850 width=47)
Output: dw.dwmc, dw.dwbh, dw.dwdz, gr.grbh, gr.xm, jf.ny, jf.je
Hash Cond: ((gr.grbh)::text = (jf.grbh)::text)
-> Hash Join (cost=289.00..2277.61 rows=99850 width=32)
Output: dw.dwmc, dw.dwbh, dw.dwdz, gr.grbh, gr.xm
Inner Unique: true
Hash Cond: ((gr.dwbh)::text = (dw.dwbh)::text)
-> Seq Scan on public.t_grxx gr (cost=0.00..1726.00 rows=100000 width=16)
Output: gr.dwbh, gr.grbh, gr.xm, gr.xb, gr.nl
-> Hash (cost=164.00..164.00 rows=10000 width=20)
Output: dw.dwmc, dw.dwbh, dw.dwdz
-> Seq Scan on public.t_dwxx dw (cost=0.00..164.00 rows=10000 width=20)
Output: dw.dwmc, dw.dwbh, dw.dwdz
-> Hash (cost=1637.00..1637.00 rows=100000 width=20)
Output: jf.ny, jf.je, jf.grbh
-> Seq Scan on public.t_jfxx jf (cost=0.00..1637.00 rows=100000 width=20)
Output: jf.ny, jf.je, jf.grbh
(20 rows)
啟動gdb,設定斷點,進入ExecHashJoinNewBatch
(gdb) b ExecHashJoinNewBatch
Breakpoint 1 at 0x7031f5: file nodeHashjoin.c, line 943.
(gdb) c
Continuing.
Breakpoint 1, ExecHashJoinNewBatch (hjstate=0x1c40738) at nodeHashjoin.c:943
943 HashJoinTable hashtable = hjstate->hj_HashTable;
獲取批次數(8個批次)和當前批次(0,第一個批次)
(gdb) n
950 nbatch = hashtable->nbatch;
(gdb)
951 curbatch = hashtable->curbatch;
(gdb)
953 if (curbatch > 0)
(gdb) p nbatch
$5 = 8
(gdb) p curbatch
$6 = 0
curbatch ==0,剛剛完成了第一個批次,重置傾斜最佳化的相關狀態變數
(gdb) n
971 hashtable->skewEnabled = false;
(gdb)
972 hashtable->skewBucket = NULL;
(gdb)
973 hashtable->skewBucketNums = NULL;
(gdb)
974 hashtable->nSkewBuckets = 0;
(gdb)
975 hashtable->spaceUsedSkew = 0;
(gdb)
995 curbatch++;
外表為空或內表為空時的最佳化,本次呼叫均不為空
(gdb) n
996 while (curbatch < nbatch &&
(gdb)
997 (hashtable->outerBatchFile[curbatch] == NULL ||
(gdb) p hashtable->outerBatchFile[curbatch]
$7 = (BufFile *) 0x1d85290
(gdb) p hashtable->outerBatchFile[curbatch]
$8 = (BufFile *) 0x1d85290
設定當前批次,重建Hash表
(gdb)
1023 if (curbatch >= nbatch)
(gdb)
1026 hashtable->curbatch = curbatch;
(gdb)
1031 ExecHashTableReset(hashtable);
獲取inner relation批次臨時檔案
(gdb)
1033 innerFile = hashtable->innerBatchFile[curbatch];
(gdb)
1035 if (innerFile != NULL)
(gdb) p innerFile
$9 = (BufFile *) 0x1cc0540
臨時檔案不為NULL,讀取檔案
(gdb) n
1037 if (BufFileSeek(innerFile, 0, 0L, SEEK_SET))
(gdb)
1042 while ((slot = ExecHashJoinGetSavedTuple(hjstate,
進入函式ExecHashJoinGetSavedTuple
(gdb) step
ExecHashJoinGetSavedTuple (hjstate=0x1c40fd8, file=0x1cc0540, hashvalue=0x7ffeace60824, tupleSlot=0x1c4cc20)
at nodeHashjoin.c:1259
1259 CHECK_FOR_INTERRUPTS();
(gdb)
ExecHashJoinGetSavedTuple->讀取頭部8個位元組(header,型別為void *,在64 bit的機器上,大小8個位元組)
gdb) n
1266 nread = BufFileRead(file, (void *) header, sizeof(header));
(gdb)
1267 if (nread == 0) /* end of file */
(gdb) p nread
$10 = 8
(gdb) n
1272 if (nread != sizeof(header))
(gdb)
ExecHashJoinGetSavedTuple->獲取Hash值(1978434688)
(gdb)
1276 *hashvalue = header[0];
(gdb) n
1277 tuple = (MinimalTuple) palloc(header[1]);
(gdb) p *hashvalue
$11 = 1978434688
ExecHashJoinGetSavedTuple->獲取tuple&元組長度
(gdb) n
1278 tuple->t_len = header[1];
(gdb)
1281 header[1] - sizeof(uint32));
(gdb) p tuple->t_len
$16 = 24
(gdb) p *tuple
$17 = {t_len = 24, mt_padding = "\177\177\177\177\177\177", t_infomask2 = 32639, t_infomask = 32639, t_hoff = 127 '\177',
t_bits = 0x1c5202f "\177\177\177\177\177\177\177\177\177~\177\177\177\177\177\177\177"}
(gdb)
ExecHashJoinGetSavedTuple->根據大小讀取檔案獲取元組
(gdb) n
1279 nread = BufFileRead(file,
(gdb)
1282 if (nread != header[1] - sizeof(uint32))
(gdb) p header[1]
$18 = 24
(gdb) p sizeof(uint32)
$19 = 4
(gdb) p *tuple
$20 = {t_len = 24, mt_padding = "\000\000\000\000\000", t_infomask2 = 3, t_infomask = 2, t_hoff = 24 '\030',
t_bits = 0x1c5202f ""}
ExecHashJoinGetSavedTuple->儲存到slot中,完成呼叫
(gdb) n
1286 return ExecStoreMinimalTuple(tuple, tupleSlot, true);
(gdb)
1287 }
(gdb)
ExecHashJoinNewBatch (hjstate=0x1c40fd8) at nodeHashjoin.c:1051
1051 ExecHashTableInsert(hashtable, slot, hashvalue);
插入到Hash表中
(gdb)
1051 ExecHashTableInsert(hashtable, slot, hashvalue);
進入ExecHashTableInsert
(gdb) step
ExecHashTableInsert (hashtable=0x1c6e1c0, slot=0x1c4cc20, hashvalue=3757101760) at nodeHash.c:1593
1593 MinimalTuple tuple = ExecFetchSlotMinimalTuple(slot);
(gdb)
ExecHashTableInsert->獲取批次號和hash桶號
(gdb) n
1597 ExecHashGetBucketAndBatch(hashtable, hashvalue,
(gdb)
1603 if (batchno == hashtable->curbatch)
(gdb) p batchno
$21 = 1
(gdb) p bucketno
$22 = 21184
(gdb)
(gdb) p hashtable->curbatch
$23 = 1
ExecHashTableInsert->批次號與Hash表中的批次號一致,把元組放到Hash表中
常規元組數量=100000
(gdb) n
1610 double ntuples = (hashtable->totalTuples - hashtable->skewTuples);
(gdb) n
1613 hashTupleSize = HJTUPLE_OVERHEAD + tuple->t_len;
(gdb) p ntuples
$24 = 100000
ExecHashTableInsert->建立HashJoinTuple,重置元組匹配標記
(gdb) n
1614 hashTuple = (HashJoinTuple) dense_alloc(hashtable, hashTupleSize);
(gdb)
1616 hashTuple->hashvalue = hashvalue;
(gdb)
1617 memcpy(HJTUPLE_MINTUPLE(hashTuple), tuple, tuple->t_len);
(gdb)
1625 HeapTupleHeaderClearMatch(HJTUPLE_MINTUPLE(hashTuple));
(gdb)
ExecHashTableInsert->元組放在Hash表桶連結串列的前面
(gdb) n
1628 hashTuple->next.unshared = hashtable->buckets.unshared[bucketno];
(gdb)
1629 hashtable->buckets.unshared[bucketno] = hashTuple;
(gdb)
1636 if (hashtable->nbatch == 1 &&
(gdb)
ExecHashTableInsert->調整或記錄Hash表記憶體使用的峰值並返回,回到ExecHashJoinNewBatch
(gdb)
1649 hashtable->spaceUsed += hashTupleSize;
(gdb)
...
(gdb)
1667 }
(gdb) n
ExecHashJoinNewBatch (hjstate=0x1c40fd8) at nodeHashjoin.c:1042
1042 while ((slot = ExecHashJoinGetSavedTuple(hjstate,
迴圈插入到Hash表中
1042 while ((slot = ExecHashJoinGetSavedTuple(hjstate,
(gdb) n
1051 ExecHashTableInsert(hashtable, slot, hashvalue);
...
DONE!
四、參考資料
Hash Joins: Past, Present and Future/PGCon 2017
A Look at How Postgres Executes a Tiny Join - Part 1
A Look at How Postgres Executes a Tiny Join - Part 2
Assignment 2 Symmetric Hash Join
來自 “ ITPUB部落格 ” ,連結:http://blog.itpub.net/6906/viewspace-2374793/,如需轉載,請註明出處,否則將追究法律責任。
相關文章
- PostgreSQL 原始碼解讀(90)- 查詢語句#75(ExecHashJoin函式#1)SQL原始碼函式
- PostgreSQL 原始碼解讀(91)- 查詢語句#76(ExecHashJoin函式#2)SQL原始碼函式
- PostgreSQL 原始碼解讀(93)- 查詢語句#77(ExecHashJoin函式#3)SQL原始碼函式
- PostgreSQL 原始碼解讀(95)- 查詢語句#78(ExecHashJoin函式#4-H...SQL原始碼函式
- PostgreSQL 原始碼解讀(79)- 查詢語句#64(create_plan函式#3-Se...SQL原始碼函式
- PostgreSQL 原始碼解讀(83)- 查詢語句#68(PortalStart函式)SQL原始碼函式
- PostgreSQL 原始碼解讀(88)- 查詢語句#73(SeqNext函式#1)SQL原始碼函式
- PostgreSQL 原始碼解讀(89)- 查詢語句#74(SeqNext函式#2)SQL原始碼函式
- PostgreSQL 原始碼解讀(46)- 查詢語句#31(query_planner函式#7)SQL原始碼函式
- PostgreSQL 原始碼解讀(47)- 查詢語句#32(query_planner函式#8)SQL原始碼函式
- PostgreSQL 原始碼解讀(48)- 查詢語句#33(query_planner函式#9)SQL原始碼函式
- PostgreSQL 原始碼解讀(41)- 查詢語句#26(query_planner函式#4)SQL原始碼函式
- PostgreSQL 原始碼解讀(40)- 查詢語句#25(query_planner函式#3)SQL原始碼函式
- PostgreSQL 原始碼解讀(43)- 查詢語句#28(query_planner函式#5)SQL原始碼函式
- PostgreSQL 原始碼解讀(45)- 查詢語句#30(query_planner函式#6)SQL原始碼函式
- PostgreSQL 原始碼解讀(38)- 查詢語句#23(query_planner函式#1)SQL原始碼函式
- PostgreSQL 原始碼解讀(39)- 查詢語句#24(query_planner函式#2)SQL原始碼函式
- PostgreSQL 原始碼解讀(73)- 查詢語句#58(grouping_planner函式...SQL原始碼函式
- PostgreSQL 原始碼解讀(181)- 查詢#97(聚合函式#2-ExecInitAgg)SQL原始碼函式
- PostgreSQL 原始碼解讀(82)- 查詢語句#67(PortalXXX系列函式)SQL原始碼函式
- PostgreSQL 原始碼解讀(24)- 查詢語句#9(查詢重寫)SQL原始碼
- PostgreSQL 原始碼解讀(78)- 查詢語句#63(create_plan函式#2-cr...SQL原始碼函式
- PostgreSQL 原始碼解讀(80)- 查詢語句#65(create_plan函式#4-Jo...SQL原始碼函式
- PostgreSQL 原始碼解讀(65)- 查詢語句#50(make_one_rel函式#15-...SQL原始碼函式
- PostgreSQL 原始碼解讀(62)- 查詢語句#47(make_one_rel函式#12-...SQL原始碼函式
- PostgreSQL 原始碼解讀(63)- 查詢語句#48(make_one_rel函式#13-...SQL原始碼函式
- PostgreSQL 原始碼解讀(64)- 查詢語句#49(make_one_rel函式#14-...SQL原始碼函式
- PostgreSQL 原始碼解讀(60)- 查詢語句#45(make_one_rel函式#10-...SQL原始碼函式
- PostgreSQL 原始碼解讀(61)- 查詢語句#46(make_one_rel函式#11-...SQL原始碼函式
- PostgreSQL 原始碼解讀(69)- 查詢語句#54(make_one_rel函式#19-...SQL原始碼函式
- PostgreSQL 原始碼解讀(70)- 查詢語句#55(make_one_rel函式#20-...SQL原始碼函式
- PostgreSQL 原始碼解讀(66)- 查詢語句#51(make_one_rel函式#16-...SQL原始碼函式
- PostgreSQL 原始碼解讀(67)- 查詢語句#52(make_one_rel函式#17-...SQL原始碼函式
- PostgreSQL 原始碼解讀(68)- 查詢語句#53(make_one_rel函式#18-...SQL原始碼函式
- PostgreSQL 原始碼解讀(71)- 查詢語句#56(make_one_rel函式#21-...SQL原始碼函式
- PostgreSQL 原始碼解讀(72)- 查詢語句#57(make_one_rel函式#22-...SQL原始碼函式
- PostgreSQL 原始碼解讀(20)- 查詢語句#5(查詢樹Query詳解)SQL原始碼
- PostgreSQL 原始碼解讀(18)- 查詢語句#3(SQL Parse)SQL原始碼