PostgreSQL 原始碼解讀（52）- 查詢語句#37（make_one_rel函式#2-估算關係大小）

weixin_34208283發表於2018-09-20

原文網址 : https://blog.csdn.net/weixin_34208283/article/details/86984306

先前的章節已介紹了函式query_planner中子函式make_one_rel的主實現邏輯,本節繼續介紹make_one_rel函式中的set_base_rel_sizes函式及其子函式。

make_one_rel原始碼:

 RelOptInfo *
 make_one_rel(PlannerInfo *root, List *joinlist)
 {
     RelOptInfo *rel;
     Index       rti;
 
     /*
      * Construct the all_baserels Relids set.
      */
     root->all_baserels = NULL;
     for (rti = 1; rti < root->simple_rel_array_size; rti++)//遍歷RelOptInfo
     {
         RelOptInfo *brel = root->simple_rel_array[rti];
 
         /* there may be empty slots corresponding to non-baserel RTEs */
         if (brel == NULL)
             continue;
 
         Assert(brel->relid == rti); /* sanity check on array */
 
         /* ignore RTEs that are "other rels" */
         if (brel->reloptkind != RELOPT_BASEREL)
             continue;
 
         root->all_baserels = bms_add_member(root->all_baserels, brel->relid);//新增到all_baserels遍歷中
     }
 
     /* Mark base rels as to whether we care about fast-start plans */
     //設定RelOptInfo的consider_param_startup變數,是否考量fast-start plans
     set_base_rel_consider_startup(root);
 
     /*
      * Compute size estimates and consider_parallel flags for each base rel,
      * then generate access paths.
      */
     set_base_rel_sizes(root);//估算Relation的Size並且設定consider_parallel標記
     set_base_rel_pathlists(root);//生成Relation的掃描(訪問)路徑
 
     /*
      * Generate access paths for the entire join tree.
      * 通過動態規劃或遺傳演算法生成連線路徑 
      */
     rel = make_rel_from_joinlist(root, joinlist);
 
     /*
      * The result should join all and only the query's base rels.
      */
     Assert(bms_equal(rel->relids, root->all_baserels));
   //返回最上層的RelOptInfo
     return rel;
 }

一、資料結構

RelOptInfo

 typedef struct RelOptInfo
 {
     NodeTag     type;//節點標識
 
     RelOptKind  reloptkind;//RelOpt型別
 
     /* all relations included in this RelOptInfo */
     Relids      relids;         /*Relids(rtindex)集合 set of base relids (rangetable indexes) */
 
     /* size estimates generated by planner */
     double      rows;           /*結果元組的估算數量 estimated number of result tuples */
 
     /* per-relation planner control flags */
     bool        consider_startup;   /*是否考慮啟動成本?是,需要保留啟動成本低的路徑 keep cheap-startup-cost paths? */
     bool        consider_param_startup; /*是否考慮引數化?的路徑 ditto, for parameterized paths? */
     bool        consider_parallel;  /*是否考慮並行處理路徑 consider parallel paths? */
 
     /* default result targetlist for Paths scanning this relation */
     struct PathTarget *reltarget;   /*掃描該Relation時預設的結果 list of Vars/Exprs, cost, width */
 
     /* materialization information */
     List       *pathlist;       /*訪問路徑連結串列 Path structures */
     List       *ppilist;        /*路徑連結串列中使用引數化路徑進行 ParamPathInfos used in pathlist */
     List       *partial_pathlist;   /* partial Paths */
     struct Path *cheapest_startup_path;//代價最低的啟動路徑
     struct Path *cheapest_total_path;//代價最低的整體路徑
     struct Path *cheapest_unique_path;//代價最低的獲取唯一值的路徑
     List       *cheapest_parameterized_paths;//代價最低的引數化?路徑連結串列
 
     /* parameterization information needed for both base rels and join rels */
     /* (see also lateral_vars and lateral_referencers) */
     Relids      direct_lateral_relids;  /*使用lateral語法,需依賴的Relids rels directly laterally referenced */
     Relids      lateral_relids; /* minimum parameterization of rel */
 
     /* information about a base rel (not set for join rels!) */
     //reloptkind=RELOPT_BASEREL時使用的資料結構
     Index       relid;          /* Relation ID */
     Oid         reltablespace;  /* 表空間 containing tablespace */
     RTEKind     rtekind;        /* 基表?子查詢?還是函式等等?RELATION, SUBQUERY, FUNCTION, etc */
     AttrNumber  min_attr;       /* 最小的屬性編號 smallest attrno of rel (often <0) */
     AttrNumber  max_attr;       /* 最大的屬性編號 largest attrno of rel */
     Relids     *attr_needed;    /* 陣列 array indexed [min_attr .. max_attr] */
     int32      *attr_widths;    /* 屬性寬度 array indexed [min_attr .. max_attr] */
     List       *lateral_vars;   /* 關係依賴的Vars/PHVs LATERAL Vars and PHVs referenced by rel */
     Relids      lateral_referencers;    /*依賴該關係的Relids rels that reference me laterally */
     List       *indexlist;      /* 該關係的IndexOptInfo連結串列 list of IndexOptInfo */
     List       *statlist;       /* 統計資訊連結串列 list of StatisticExtInfo */
     BlockNumber pages;          /* 塊數 size estimates derived from pg_class */
     double      tuples;         /* 元組數 */
     double      allvisfrac;     /* ? */
     PlannerInfo *subroot;       /* 如為子查詢,儲存子查詢的root if subquery */
     List       *subplan_params; /* 如為子查詢,儲存子查詢的引數 if subquery */
     int         rel_parallel_workers;   /* 並行執行,需要多少個workers? wanted number of parallel workers */
 
     /* Information about foreign tables and foreign joins */
     //FWD相關資訊
     Oid         serverid;       /* identifies server for the table or join */
     Oid         userid;         /* identifies user to check access as */
     bool        useridiscurrent;    /* join is only valid for current user */
     /* use "struct FdwRoutine" to avoid including fdwapi.h here */
     struct FdwRoutine *fdwroutine;
     void       *fdw_private;
 
     /* cache space for remembering if we have proven this relation unique */
     //已知的,可保證唯一的Relids連結串列
     List       *unique_for_rels;    /* known unique for these other relid
                                      * set(s) */
     List       *non_unique_for_rels;    /* 已知的,不唯一的Relids連結串列 known not unique for these set(s) */
 
     /* used by various scans and joins: */
     List       *baserestrictinfo;   /* 如為基本關係,儲存約束條件 RestrictInfo structures (if base rel) */
     QualCost    baserestrictcost;   /* 解析約束表示式的成本? cost of evaluating the above */
     Index       baserestrict_min_security;  /* 最低安全等級 min security_level found in
                                              * baserestrictinfo */
     List       *joininfo;       /* 連線語句的約束條件資訊 RestrictInfo structures for join clauses
                                  * involving this rel */
     bool        has_eclass_joins;   /* 是否存在等價類連線? T means joininfo is incomplete */
 
     /* used by partitionwise joins: */
     bool        consider_partitionwise_join;    /* 分割槽? consider partitionwise
                                                  * join paths? (if
                                                  * partitioned rel) */
     Relids      top_parent_relids;  /* Relids of topmost parents (if "other"
                                      * rel) */
 
     /* used for partitioned relations */
     //分割槽表使用
     PartitionScheme part_scheme;    /* 分割槽的schema Partitioning scheme. */
     int         nparts;         /* 分割槽數 number of partitions */
     struct PartitionBoundInfoData *boundinfo;   /* 分割槽邊界資訊 Partition bounds */
     List       *partition_qual; /* 分割槽約束 partition constraint */
     struct RelOptInfo **part_rels;  /* 分割槽的RelOptInfo陣列 Array of RelOptInfos of partitions,
                                      * stored in the same order of bounds */
     List      **partexprs;      /* 非空分割槽鍵表示式 Non-nullable partition key expressions. */
     List      **nullable_partexprs; /* 可為空的分割槽鍵表示式 Nullable partition key expressions. */
     List       *partitioned_child_rels; /* RT Indexes連結串列 List of RT indexes. */
 } RelOptInfo;

二、原始碼解讀

make_one_rel函式呼叫了set_base_rel_sizes,該函式的主要實現邏輯通過呼叫set_rel_size實現.現重點考察函式set_rel_size中對基礎關係進行估算的處理邏輯,即函式set_plain_rel_size的實現邏輯.
set_plain_rel_size

 /*
  * set_plain_rel_size
  *    Set size estimates for a plain relation (no subquery, no inheritance)
  */
 static void
 set_plain_rel_size(PlannerInfo *root, RelOptInfo *rel, RangeTblEntry *rte)
 {
     /*
      * Test any partial indexes of rel for applicability.  We must do this
      * first since partial unique indexes can affect size estimates.
      */
     check_index_predicates(root, rel);//驗證部分(條件)索引的可用性
 
     /* Mark rel with estimated output rows, width, etc */
     set_baserel_size_estimates(root, rel);//標記rel的輸出行數/行寬等資訊
 }

set_plain_rel_size->check_index_predicates
如果部分(條件)索引的謂詞與查詢語句相匹配,則predOK設定為true
比如:
資料表t1(c1 int,c2 varchar(40),...),如存在索引idx_t1_partial_c1,條件為where c1 > 100
查詢條件為where c1 > 100(是否支援>200?),那麼該謂詞與查詢條件相匹配

//----------------------------------------- check_index_predicates
 /*
  * check_index_predicates
  *      Set the predicate-derived IndexOptInfo fields for each index
  *      of the specified relation.
  *
  * predOK is set true if the index is partial and its predicate is satisfied
  * for this query, ie the query's WHERE clauses imply the predicate.
  * 如果部分(條件)索引的謂詞與查詢語句相匹配,則predOK設定為true
  * 比如:
  *    資料表t1(c1 int,c2 varchar(40),...),如存在索引idx_t1_partial_c1,條件為where c1 > 100
  *    查詢條件為where c1 > 100(是否支援>200?),那麼該謂詞與查詢條件相匹配
  *
  * indrestrictinfo is set to the relation's baserestrictinfo list less any
  * conditions that are implied by the index's predicate.  (Obviously, for a
  * non-partial index, this is the same as baserestrictinfo.)  Such conditions
  * can be dropped from the plan when using the index, in certain cases.
  * 
  * indrestrictinfo會加入到rel的baserestrictinfo連結串列中,減少了索引謂詞所隱含的限制條件.
  *
  * At one time it was possible for this to get re-run after adding more
  * restrictions to the rel, thus possibly letting us prove more indexes OK.
  * That doesn't happen any more (at least not in the core code's usage),
  * but this code still supports it in case extensions want to mess with the
  * baserestrictinfo list.  We assume that adding more restrictions can't make
  * an index not predOK.  We must recompute indrestrictinfo each time, though,
  * to make sure any newly-added restrictions get into it if needed.
  */

 void
 check_index_predicates(PlannerInfo *root, RelOptInfo *rel)
 {
     List       *clauselist;//條件連結串列
     bool        have_partial;//是否包含部分索引
     bool        is_target_rel;//目標rel?
     Relids      otherrels;//Relids
     ListCell   *lc;//臨時變數
 
     /* Indexes are available only on base or "other" member relations. */
     Assert(IS_SIMPLE_REL(rel));//rel必須是基礎關係
 
     /*
      * Initialize the indrestrictinfo lists to be identical to
      * baserestrictinfo, and check whether there are any partial indexes.  If
      * not, this is all we need to do.
      */
     have_partial = false;
     foreach(lc, rel->indexlist)//遍歷index
     {
         IndexOptInfo *index = (IndexOptInfo *) lfirst(lc);
 
         index->indrestrictinfo = rel->baserestrictinfo;//設定索引約束條件
         if (index->indpred)
             have_partial = true;//存在部分索引
     }
     if (!have_partial)
         return;
 
     /*
      * Construct a list of clauses that we can assume true for the purpose of
      * proving the index(es) usable.  Restriction clauses for the rel are
      * always usable, and so are any join clauses that are "movable to" this
      * rel.  Also, we can consider any EC-derivable join clauses (which must
      * be "movable to" this rel, by definition).
      */
     clauselist = list_copy(rel->baserestrictinfo);//條件語句初始化
 
     /* Scan the rel's join clauses */
     foreach(lc, rel->joininfo)//遍歷連線條件
     {
         RestrictInfo *rinfo = (RestrictInfo *) lfirst(lc);//條件
 
         /* Check if clause can be moved to this rel */
         if (!join_clause_is_movable_to(rinfo, rel))//條件是否可以下推到Rel中
             continue;//不可用,下一個條件
 
         clauselist = lappend(clauselist, rinfo);//可以,則新增到條件語句連結串列中
     }
 
     /*
      * Add on any equivalence-derivable join clauses.  Computing the correct
      * relid sets for generate_join_implied_equalities is slightly tricky
      * because the rel could be a child rel rather than a true baserel, and in
      * that case we must remove its parents' relid(s) from all_baserels.
      */
     if (rel->reloptkind == RELOPT_OTHER_MEMBER_REL)
         otherrels = bms_difference(root->all_baserels,
                                    find_childrel_parents(root, rel));//
     else
         otherrels = bms_difference(root->all_baserels, rel->relids);//獲取與rel無關的其他rels
 
     if (!bms_is_empty(otherrels))
         clauselist =
             list_concat(clauselist,
                         generate_join_implied_equalities(root,
                                                          bms_union(rel->relids,
                                                                    otherrels),
                                                          otherrels,
                                                          rel));//新增到條件語句
 
     /*
      * Normally we remove quals that are implied by a partial index's
      * predicate from indrestrictinfo, indicating that they need not be
      * checked explicitly by an indexscan plan using this index.  However, if
      * the rel is a target relation of UPDATE/DELETE/SELECT FOR UPDATE, we
      * cannot remove such quals from the plan, because they need to be in the
      * plan so that they will be properly rechecked by EvalPlanQual testing.
      * Some day we might want to remove such quals from the main plan anyway
      * and pass them through to EvalPlanQual via a side channel; but for now,
      * we just don't remove implied quals at all for target relations.
      */
     is_target_rel = (rel->relid == root->parse->resultRelation ||
                      get_plan_rowmark(root->rowMarks, rel->relid) != NULL);
 
     /*
      * Now try to prove each index predicate true, and compute the
      * indrestrictinfo lists for partial indexes.  Note that we compute the
      * indrestrictinfo list even for non-predOK indexes; this might seem
      * wasteful, but we may be able to use such indexes in OR clauses, cf
      * generate_bitmap_or_paths().
      */
     foreach(lc, rel->indexlist)//遍歷index
     {
         IndexOptInfo *index = (IndexOptInfo *) lfirst(lc);
         ListCell   *lcr;
 
         if (index->indpred == NIL)
             continue;           /* ignore non-partial indexes here */
 
         if (!index->predOK)     /* don't repeat work if already proven OK */
             index->predOK = predicate_implied_by(index->indpred, clauselist,
                                                  false);//設定predOK引數
 
         /* If rel is an update target, leave indrestrictinfo as set above */
         if (is_target_rel)
             continue;
 
         /* Else compute indrestrictinfo as the non-implied quals */
         index->indrestrictinfo = NIL;
         foreach(lcr, rel->baserestrictinfo)
         {
             RestrictInfo *rinfo = (RestrictInfo *) lfirst(lcr);
 
             /* predicate_implied_by() assumes first arg is immutable */
             if (contain_mutable_functions((Node *) rinfo->clause) ||
                 !predicate_implied_by(list_make1(rinfo->clause),
                                       index->indpred, false))
                 index->indrestrictinfo = lappend(index->indrestrictinfo, rinfo);//
         }
     }
 }

set_plain_rel_size->set_baserel_size_estimates
函式註釋:

//----------------------------------------- set_baserel_size_estimates
 /*
  * set_baserel_size_estimates
  *      Set the size estimates for the given base relation.
  *      估算base rel的估算大小
  *
  * The rel's targetlist and restrictinfo list must have been constructed
  * already, and rel->tuples must be set.
  *   rel的targetlist和限制條件連結串列已構建,並且tuples已獲取.
  *
  * We set the following fields of the rel node:
  * 通過該方法,設定以下3個變數
  *  rows: the estimated number of output tuples (after applying
  *        restriction clauses).
  *        應用限制條件後,估算得出的輸出元組數目
  *  width: the estimated average output tuple width in bytes.
  *        以位元組為單位輸出估算的平均元組大小
  *  baserestrictcost: estimated cost of evaluating baserestrictinfo clauses.
  *        解析限制條件的估算成本
  */

原始碼:

 void
 set_baserel_size_estimates(PlannerInfo *root, RelOptInfo *rel)
 {
     double      nrows;
 
     /* Should only be applied to base relations */
     Assert(rel->relid > 0);
 
     nrows = rel->tuples *
         clauselist_selectivity(root,
                                rel->baserestrictinfo,
                                0,
                                JOIN_INNER,
                                NULL);//元組總數*選擇率
 
     rel->rows = clamp_row_est(nrows);
 
     cost_qual_eval(&rel->baserestrictcost, rel->baserestrictinfo, root);
 
     set_rel_width(root, rel);
 }

//----------------------------------------- clauselist_selectivity

 /*
  * clauselist_selectivity -
  *    Compute the selectivity of an implicitly-ANDed list of boolean
  *    expression clauses.  The list can be empty, in which case 1.0
  *    must be returned.  List elements may be either RestrictInfos
  *    or bare expression clauses --- the former is preferred since
  *    it allows caching of results.
  *
  *    計算布林表示式條件(隱式AND條件語句連結串列儲存)的選擇性.如果連結串列為空,
  *    則返回1.0.連結串列中的元素可以是RestrictInfo結構體,也可以是裸條件表示式
  *    (前者因為允許快取,因此是首選)
  *
  * See clause_selectivity() for the meaning of the additional parameters.
  *
  * Our basic approach is to take the product of the selectivities of the
  * subclauses.  However, that's only right if the subclauses have independent
  * probabilities, and in reality they are often NOT independent.  So,
  * we want to be smarter where we can.
  *
  * 採取的基本方法是subclauses的選擇性乘積。採取這種做法的前臺是子項具有獨立的概率.
  * 但實際上它們往往不是獨立的,因此,需要在能做到的地方變得更好。 
  *
  * If the clauses taken together refer to just one relation, we'll try to
  * apply selectivity estimates using any extended statistics for that rel.
  * Currently we only have (soft) functional dependencies, so apply these in as
  * many cases as possible, and fall back on normal estimates for remaining
  * clauses.
  *
  * 如果在一起的子句只涉及一個關係，將嘗試應用關係上的擴充套件統計進行選擇性的估算。
  *
  * We also recognize "range queries", such as "x > 34 AND x < 42".  Clauses
  * are recognized as possible range query components if they are restriction
  * opclauses whose operators have scalarltsel or a related function as their
  * restriction selectivity estimator.  We pair up clauses of this form that
  * refer to the same variable.  An unpairable clause of this kind is simply
  * multiplied into the selectivity product in the normal way.  But when we
  * find a pair, we know that the selectivities represent the relative
  * positions of the low and high bounds within the column's range, so instead
  * of figuring the selectivity as hisel * losel, we can figure it as hisel +
  * losel - 1.  (To visualize this, see that hisel is the fraction of the range
  * below the high bound, while losel is the fraction above the low bound; so
  * hisel can be interpreted directly as a 0..1 value but we need to convert
  * losel to 1-losel before interpreting it as a value.  Then the available
  * range is 1-losel to hisel.  However, this calculation double-excludes
  * nulls, so really we need hisel + losel + null_frac - 1.)
  *
  * 優化器還可以識別範圍查詢,比如x > 34 AND x < 42,這類範圍查詢不能簡單的把x > 34
  * 的選擇率乘以x < 42的選擇率,為方便起見,假定x < 42的選擇率為hisel,x < 34的選擇率為losel,
  * 那麼計算公式應該為hisel - (1 - losel),即hisel + losel -1,考慮NULL值,則範圍查詢的選擇率
  * 為hisel + losel + null_frac - 1
  *
  * If either selectivity is exactly DEFAULT_INEQ_SEL, we forget this equation
  * and instead use DEFAULT_RANGE_INEQ_SEL.  The same applies if the equation
  * yields an impossible (negative) result.
  *
  * 如果任意一個選擇性都恰好是DEFAULT_INEQ_SEL，那麼我們將忘記這個等式，
  * 而使用DEFAULT_RANGE_INEQ_SEL。這種情況同樣適用於如果等式產生了一個不可能的(負的)結果。
  *
  * A free side-effect is that we can recognize redundant inequalities such
  * as "x < 4 AND x < 5"; only the tighter constraint will be counted.
  *
  * 我們可以識別冗餘的不等式,比如x < 4 AND x <5,只有嚴格的約束條件才會計算在內
  *
  * Of course this is all very dependent on the behavior of the inequality
  * selectivity functions; perhaps some day we can generalize the approach.
  * 
  * 這完全取決於不等選擇性函式的行為,也許有一天我們可以推廣這種方法.
  *
  */
 Selectivity
 clauselist_selectivity(PlannerInfo *root,
                        List *clauses,
                        int varRelid,
                        JoinType jointype,
                        SpecialJoinInfo *sjinfo)
 {
     Selectivity s1 = 1.0;//預設返回值
     RelOptInfo *rel;//
     Bitmapset  *estimatedclauses = NULL;//點陣圖集合
     RangeQueryClause *rqlist = NULL;//範圍查詢語句
     ListCell   *l;//臨時變數
     int         listidx;
 
     /*
      * If there's exactly one clause, just go directly to
      * clause_selectivity(). None of what we might do below is relevant.
      */
     if (list_length(clauses) == 1)
         return clause_selectivity(root, (Node *) linitial(clauses),
                                   varRelid, jointype, sjinfo);//單個條件
 
     /*
      * Determine if these clauses reference a single relation.  If so, and if
      * it has extended statistics, try to apply those.
      */
     //如果條件連結串列中的元素依賴的rel有且只有一個,則返回此rel
     rel = find_single_rel_for_clauses(root, clauses);
     //應用dependencies_clauselist_selectivity中的可用的條件進行選擇率估算
     if (rel && rel->rtekind == RTE_RELATION && rel->statlist != NIL)
     {
         /*
          * Perform selectivity estimations on any clauses found applicable by
          * dependencies_clauselist_selectivity.  'estimatedclauses' will be
          * filled with the 0-based list positions of clauses used that way, so
          * that we can ignore them below.
          */
         s1 *= dependencies_clauselist_selectivity(root, clauses, varRelid,
                                                   jointype, sjinfo, rel,
                                                   &estimatedclauses);
 
         /*
          * This would be the place to apply any other types of extended
          * statistics selectivity estimations for remaining clauses.
          */
     }
 
     /*
      * Apply normal selectivity estimates for remaining clauses. We'll be
      * careful to skip any clauses which were already estimated above.
      * 剩下的條件語句,應用常規的選擇率估算.
      *
      * Anything that doesn't look like a potential rangequery clause gets
      * multiplied into s1 and forgotten. Anything that does gets inserted into
      * an rqlist entry.
      * 非範圍條件語句乘上s1後被丟棄,範圍條件語句則加入到rqlist連結串列中.
      */
     listidx = -1;
     foreach(l, clauses)//遍歷
     {
         Node       *clause = (Node *) lfirst(l);//連結串列中的元素
         RestrictInfo *rinfo;
         Selectivity s2;
 
         listidx++;
 
         /*
          * Skip this clause if it's already been estimated by some other
          * statistics above.
          */
         if (bms_is_member(listidx, estimatedclauses))//跳過已處理的條件
             continue;
 
         /* Always compute the selectivity using clause_selectivity */
         s2 = clause_selectivity(root, clause, varRelid, jointype, sjinfo);//獲取條件選擇率
 
         /*
          * Check for being passed a RestrictInfo.
          *
          * If it's a pseudoconstant RestrictInfo, then s2 is either 1.0 or
          * 0.0; just use that rather than looking for range pairs.
          */
         if (IsA(clause, RestrictInfo))//條件語句是RestrictInfo型別
         {
             rinfo = (RestrictInfo *) clause;
             if (rinfo->pseudoconstant)//常量
             {
                 s1 = s1 * s2;//直接相乘
                 continue;
             }
             clause = (Node *) rinfo->clause;//條件表示式
         }
         else
             rinfo = NULL;//不是RestrictInfo型別,rinfo設定為NULL
 
         /*
          * See if it looks like a restriction clause with a pseudoconstant on
          * one side.  (Anything more complicated than that might not behave in
          * the simple way we are expecting.)  Most of the tests here can be
          * done more efficiently with rinfo than without.
          */
         if (is_opclause(clause) && list_length(((OpExpr *) clause)->args) == 2)//OpExpr
         {
             OpExpr     *expr = (OpExpr *) clause;//條件語句
             bool        varonleft = true;
             bool        ok;
 
             if (rinfo)//rinfo中的條件語句
             {
                 ok = (bms_membership(rinfo->clause_relids) == BMS_SINGLETON) &&
                     (is_pseudo_constant_clause_relids(lsecond(expr->args),
                                                       rinfo->right_relids) ||
                      (varonleft = false,
                       is_pseudo_constant_clause_relids(linitial(expr->args),
                                                        rinfo->left_relids)));
             }
             else//裸條件語句
             {
                 ok = (NumRelids(clause) == 1) &&
                     (is_pseudo_constant_clause(lsecond(expr->args)) ||
                      (varonleft = false,
                       is_pseudo_constant_clause(linitial(expr->args))));
             }
 
             if (ok)//校驗通過
             {
                 /*
                  * If it's not a "<"/"<="/">"/">=" operator, just merge the
                  * selectivity in generically.  But if it's the right oprrest,
                  * add the clause to rqlist for later processing.
                  */
                 switch (get_oprrest(expr->opno))
                 {
                     case F_SCALARLTSEL:
                     case F_SCALARLESEL:
                         addRangeClause(&rqlist, clause,
                                        varonleft, true, s2);//範圍條件
                         break;
                     case F_SCALARGTSEL:
                     case F_SCALARGESEL:
                         addRangeClause(&rqlist, clause,
                                        varonleft, false, s2);//範圍條件
                         break;
                     default:
                         /* Just merge the selectivity in generically */
                         s1 = s1 * s2;//直接相乘
                         break;
                 }
                 continue;       /* drop to loop bottom */
             }
         }
 
         /* Not the right form, so treat it generically. */
         s1 = s1 * s2;//直接相乘
     }
 
     /*
      * Now scan the rangequery pair list.
      */
     while (rqlist != NULL)//處理範圍條件
     {
         RangeQueryClause *rqnext;
 
         if (rqlist->have_lobound && rqlist->have_hibound)//存在上下限
         {
             /* Successfully matched a pair of range clauses */
             Selectivity s2;//選擇率
 
             /*
              * Exact equality to the default value probably means the
              * selectivity function punted.  This is not airtight but should
              * be good enough.
              */
             if (rqlist->hibound == DEFAULT_INEQ_SEL ||
                 rqlist->lobound == DEFAULT_INEQ_SEL)//預設值
             {
                 s2 = DEFAULT_RANGE_INEQ_SEL;//預設為DEFAULT_RANGE_INEQ_SEL
             }
             else
             {
                 s2 = rqlist->hibound + rqlist->lobound - 1.0;//計算公式在註釋已解釋
 
                 /* Adjust for double-exclusion of NULLs */
                 s2 += nulltestsel(root, IS_NULL, rqlist->var,
                                   varRelid, jointype, sjinfo);//NULL值
 
                 /*
                  * A zero or slightly negative s2 should be converted into a
                  * small positive value; we probably are dealing with a very
                  * tight range and got a bogus result due to roundoff errors.
                  * However, if s2 is very negative, then we probably have
                  * default selectivity estimates on one or both sides of the
                  * range that we failed to recognize above for some reason.
                  */
                 if (s2 <= 0.0)//小於0?
                 {
                     if (s2 < -0.01)
                     {
                         /*
                          * No data available --- use a default estimate that
                          * is small, but not real small.
                          */
                         s2 = DEFAULT_RANGE_INEQ_SEL;//小於﹣1%,預設值
                     }
                     else
                     {
                         /*
                          * It's just roundoff error; use a small positive
                          * value
                          */
                         s2 = 1.0e-10;,//否則設定為1的﹣10次方
                     }
                 }
             }
             /* Merge in the selectivity of the pair of clauses */
             s1 *= s2;//直接相乘
         }
         else//只有其中一個限制
         {
             /* Only found one of a pair, merge it in generically */
             if (rqlist->have_lobound)
                 s1 *= rqlist->lobound;//下限
             else
                 s1 *= rqlist->hibound;//上限
         }
         /* release storage and advance */
         rqnext = rqlist->next;//釋放資源
         pfree(rqlist);
         rqlist = rqnext;
     }
 
     return s1;//返回選擇率
 }

 /*
  * clause_selectivity -
  *    Compute the selectivity of a general boolean expression clause.
  *    計算布林表示式條件語句的選擇率
  *
  * The clause can be either a RestrictInfo or a plain expression.  If it's
  * a RestrictInfo, we try to cache the selectivity for possible re-use,
  * so passing RestrictInfos is preferred.
  *   clause可以是RestrictInfo結構體或者是常規的表示式.如果是RestrictInfo,
  *   那麼我們會嘗試快取結果已便於重用.
  *
  * varRelid is either 0 or a rangetable index.
  *    varRelid是0或者是RTE編號
  *
  * When varRelid is not 0, only variables belonging to that relation are
  * considered in computing selectivity; other vars are treated as constants
  * of unknown values.  This is appropriate for estimating the selectivity of
  * a join clause that is being used as a restriction clause in a scan of a
  * nestloop join's inner relation --- varRelid should then be the ID of the
  * inner relation.
  *    如果varRelid不為0,只有屬於該Rel的變數才會考慮計算選擇率,其他變數作為未知常量處理
  *    這種情況常見於巢狀迴圈內表的選擇率估算(varRelid是內部的RTE)
  *
  * When varRelid is 0, all variables are treated as variables.  This
  * is appropriate for ordinary join clauses and restriction clauses.
  *    如果varRelid為0,所有的變數都需要考慮,用於常規的連線條件和限制條件估算
  *
  *
  * jointype is the join type, if the clause is a join clause.  Pass JOIN_INNER
  * if the clause isn't a join clause.
  *    如果條件是連線條件,那麼jointype是連線型別,如果不是連線調整,則該引數為JOIN_INNER
  *
  * sjinfo is NULL for a non-join clause, otherwise it provides additional
  * context information about the join being performed.  There are some
  * special cases:
  *  1. For a special (not INNER) join, sjinfo is always a member of
  *     root->join_info_list.非INNER_JOIN,sjinfo通常是oot->join_info_list的一個元素
  *  2. For an INNER join, sjinfo is just a transient struct, and only the
  *     relids and jointype fields in it can be trusted.INNER_JOIN,sjinfo通常是臨時的結構
  * It is possible for jointype to be different from sjinfo->jointype.//jointype可能跟sjinfo中的jointype不同
  * This indicates we are considering a variant join: either with
  * the LHS and RHS switched, or with one input unique-ified.這意味著連線有可能會變換
  *
  * Note: when passing nonzero varRelid, it's normally appropriate to set
  * jointype == JOIN_INNER, sjinfo == NULL, even if the clause is really a
  * join clause; because we aren't treating it as a join clause.
  */
 Selectivity
 clause_selectivity(PlannerInfo *root,
                    Node *clause,
                    int varRelid,
                    JoinType jointype,
                    SpecialJoinInfo *sjinfo)
 {
     Selectivity s1 = 0.5;       /* 預設值,default for any unhandled clause type */
     RestrictInfo *rinfo = NULL;
     bool        cacheable = false;
 
     if (clause == NULL)         /* can this still happen? */
         return s1;
 
     if (IsA(clause, RestrictInfo))//RestrictInfo
     {
         rinfo = (RestrictInfo *) clause;
 
         /*
          * If the clause is marked pseudoconstant, then it will be used as a
          * gating qual and should not affect selectivity estimates; hence
          * return 1.0.  The only exception is that a constant FALSE may be
          * taken as having selectivity 0.0, since it will surely mean no rows
          * out of the plan.  This case is simple enough that we need not
          * bother caching the result.
          */
         if (rinfo->pseudoconstant)//pseudoconstant,不影響選擇率
         {
             if (!IsA(rinfo->clause, Const))
                 return (Selectivity) 1.0;//返回1.0
         }
 
         /*
          * If the clause is marked redundant, always return 1.0.
          */
         if (rinfo->norm_selec > 1)
             return (Selectivity) 1.0;//返回1.0
 
         /*
          * If possible, cache the result of the selectivity calculation for
          * the clause.  We can cache if varRelid is zero or the clause
          * contains only vars of that relid --- otherwise varRelid will affect
          * the result, so mustn't cache.  Outer join quals might be examined
          * with either their join's actual jointype or JOIN_INNER, so we need
          * two cache variables to remember both cases.  Note: we assume the
          * result won't change if we are switching the input relations or
          * considering a unique-ified case, so we only need one cache variable
          * for all non-JOIN_INNER cases.
          */
         if (varRelid == 0 ||
             bms_is_subset_singleton(rinfo->clause_relids, varRelid))//varRelid為0
         {
             /* Cacheable --- do we already have the result? */
             if (jointype == JOIN_INNER)
             {
                 if (rinfo->norm_selec >= 0)
                     return rinfo->norm_selec;
             }
             else
             {
                 if (rinfo->outer_selec >= 0)
                     return rinfo->outer_selec;
             }
             cacheable = true;
         }
 
         /*
          * Proceed with examination of contained clause.  If the clause is an
          * OR-clause, we want to look at the variant with sub-RestrictInfos,
          * so that per-subclause selectivities can be cached.
          */
         if (rinfo->orclause)
             clause = (Node *) rinfo->orclause;
         else
             clause = (Node *) rinfo->clause;
     }
 
     if (IsA(clause, Var))//Var
     {
         Var        *var = (Var *) clause;
 
         /*
          * We probably shouldn't ever see an uplevel Var here, but if we do,
          * return the default selectivity...
          */
         if (var->varlevelsup == 0 &&
             (varRelid == 0 || varRelid == (int) var->varno))
         {
             /* Use the restriction selectivity function for a bool Var */
             s1 = boolvarsel(root, (Node *) var, varRelid);//bool Var選擇率
         }
     }
     else if (IsA(clause, Const))//常量
     {
         /* bool constant is pretty easy... */
         Const      *con = (Const *) clause;
 
         s1 = con->constisnull ? 0.0 :
             DatumGetBool(con->constvalue) ? 1.0 : 0.0;//常量有效則為1.0,否則為0.0
     }
     else if (IsA(clause, Param))//引數
     {
         /* see if we can replace the Param */
         Node       *subst = estimate_expression_value(root, clause);
 
         if (IsA(subst, Const))
         {
             /* bool constant is pretty easy... */
             Const      *con = (Const *) subst;
 
             s1 = con->constisnull ? 0.0 :
                 DatumGetBool(con->constvalue) ? 1.0 : 0.0;
         }
         else
         {
             /* XXX any way to do better than default? */
         }
     }
     else if (not_clause(clause))//反選
     {
         /* inverse of the selectivity of the underlying clause */
         s1 = 1.0 - clause_selectivity(root,
                                       (Node *) get_notclausearg((Expr *) clause),
                                       varRelid,
                                       jointype,
                                       sjinfo);
     }
     else if (and_clause(clause))//AND語句
     {
         /* share code with clauselist_selectivity() */
         s1 = clauselist_selectivity(root,
                                     ((BoolExpr *) clause)->args,
                                     varRelid,
                                     jointype,
                                     sjinfo);//遞迴呼叫
     }
     else if (or_clause(clause))//OR語句
     {
         /*
          * Selectivities for an OR clause are computed as s1+s2 - s1*s2 to
          * account for the probable overlap of selected tuple sets.
          *
          * XXX is this too conservative?
          */
         ListCell   *arg;
 
         s1 = 0.0;
         foreach(arg, ((BoolExpr *) clause)->args)
         {
             Selectivity s2 = clause_selectivity(root,
                                                 (Node *) lfirst(arg),
                                                 varRelid,
                                                 jointype,
                                                 sjinfo);//遞迴呼叫
 
             s1 = s1 + s2 - s1 * s2;
         }
     }
     else if (is_opclause(clause) || IsA(clause, DistinctExpr))
     {
         OpExpr     *opclause = (OpExpr *) clause;
         Oid         opno = opclause->opno;
 
         if (treat_as_join_clause(clause, rinfo, varRelid, sjinfo))
         {
             /* Estimate selectivity for a join clause. */
             s1 = join_selectivity(root, opno,
                                   opclause->args,
                                   opclause->inputcollid,
                                   jointype,
                                   sjinfo);//連線條件
         }
         else
         {
             /* Estimate selectivity for a restriction clause. */
             s1 = restriction_selectivity(root, opno,
                                          opclause->args,
                                          opclause->inputcollid,
                                          varRelid);//限制條件
         }
 
         /*
          * DistinctExpr has the same representation as OpExpr, but the
          * contained operator is "=" not "<>", so we must negate the result.
          * This estimation method doesn't give the right behavior for nulls,
          * but it's better than doing nothing.
          */
         if (IsA(clause, DistinctExpr))
             s1 = 1.0 - s1;//Distinct?
     }
     else if (IsA(clause, ScalarArrayOpExpr))//陣列
     {
         /* Use node specific selectivity calculation function */
         s1 = scalararraysel(root,
                             (ScalarArrayOpExpr *) clause,
                             treat_as_join_clause(clause, rinfo,
                                                  varRelid, sjinfo),
                             varRelid,
                             jointype,
                             sjinfo);
     }
     else if (IsA(clause, RowCompareExpr))//行對比
     {
         /* Use node specific selectivity calculation function */
         s1 = rowcomparesel(root,
                            (RowCompareExpr *) clause,
                            varRelid,
                            jointype,
                            sjinfo);
     }
     else if (IsA(clause, NullTest))//NullTest
     {
         /* Use node specific selectivity calculation function */
         s1 = nulltestsel(root,
                          ((NullTest *) clause)->nulltesttype,
                          (Node *) ((NullTest *) clause)->arg,
                          varRelid,
                          jointype,
                          sjinfo);
     }
     else if (IsA(clause, BooleanTest))//BooleanTest
     {
         /* Use node specific selectivity calculation function */
         s1 = booltestsel(root,
                          ((BooleanTest *) clause)->booltesttype,
                          (Node *) ((BooleanTest *) clause)->arg,
                          varRelid,
                          jointype,
                          sjinfo);
     }
     else if (IsA(clause, CurrentOfExpr))//CurrentOfExpr
     {
         /* CURRENT OF selects at most one row of its table */
         CurrentOfExpr *cexpr = (CurrentOfExpr *) clause;
         RelOptInfo *crel = find_base_rel(root, cexpr->cvarno);
 
         if (crel->tuples > 0)
             s1 = 1.0 / crel->tuples;
     }
     else if (IsA(clause, RelabelType))//RelabelType
     {
         /* Not sure this case is needed, but it can't hurt */
         s1 = clause_selectivity(root,
                                 (Node *) ((RelabelType *) clause)->arg,
                                 varRelid,
                                 jointype,
                                 sjinfo);
     }
     else if (IsA(clause, CoerceToDomain))//CoerceToDomain
     {
         /* Not sure this case is needed, but it can't hurt */
         s1 = clause_selectivity(root,
                                 (Node *) ((CoerceToDomain *) clause)->arg,
                                 varRelid,
                                 jointype,
                                 sjinfo);
     }
     else
     {
         /*
          * For anything else, see if we can consider it as a boolean variable.
          * This only works if it's an immutable expression in Vars of a single
          * relation; but there's no point in us checking that here because
          * boolvarsel() will do it internally, and return a suitable default
          * selectivity if not.
          */
         s1 = boolvarsel(root, clause, varRelid);//預設為bool Var
     }
 
     /* Cache the result if possible */
     if (cacheable)//快取?
     {
         if (jointype == JOIN_INNER)
             rinfo->norm_selec = s1;
         else
             rinfo->outer_selec = s1;
     }
 
 #ifdef SELECTIVITY_DEBUG
     elog(DEBUG4, "clause_selectivity: s1 %f", s1);
 #endif                          /* SELECTIVITY_DEBUG */
 
     return s1;
 }

 /*
  * find_single_rel_for_clauses
  *      Examine each clause in 'clauses' and determine if all clauses
  *      reference only a single relation.  If so return that relation,
  *      otherwise return NULL.
  *
  *      如果條件連結串列中的元素依賴的rel有且只有一個,則返回此rel
  */
 static RelOptInfo *
 find_single_rel_for_clauses(PlannerInfo *root, List *clauses)
 {
     int         lastrelid = 0;
     ListCell   *l;
 
     foreach(l, clauses)
     {
         RestrictInfo *rinfo = (RestrictInfo *) lfirst(l);
         int         relid;
 
         /*
          * If we have a list of bare clauses rather than RestrictInfos, we
          * could pull out their relids the hard way with pull_varnos().
          * However, currently the extended-stats machinery won't do anything
          * with non-RestrictInfo clauses anyway, so there's no point in
          * spending extra cycles; just fail if that's what we have.
          */
         if (!IsA(rinfo, RestrictInfo))
             return NULL;
 
         if (bms_is_empty(rinfo->clause_relids))
             continue;           /* we can ignore variable-free clauses */
         if (!bms_get_singleton_member(rinfo->clause_relids, &relid))
             return NULL;        /* multiple relations in this clause */
         if (lastrelid == 0)
             lastrelid = relid;  /* first clause referencing a relation */
         else if (relid != lastrelid)
             return NULL;        /* relation not same as last one */
     }
 
     if (lastrelid != 0)
         return find_base_rel(root, lastrelid);
 
     return NULL;                /* no clauses */
 }

//---------------------------------------------- clamp_row_est
 /*
  * clamp_row_est
  *      Force a row-count estimate to a sane value.
  *      返回合法值
  */
 double
 clamp_row_est(double nrows)
 {
     /*
      * Force estimate to be at least one row, to make explain output look
      * better and to avoid possible divide-by-zero when interpolating costs.
      * Make it an integer, too.
      */
     if (nrows <= 1.0)
         nrows = 1.0;//小於1,返回1
     else
         nrows = rint(nrows);//整型
 
     return nrows;
 }
 
//---------------------------------------------- cost_qual_eval

 /*
  * cost_qual_eval
  *      Estimate the CPU costs of evaluating a WHERE clause.
  *      The input can be either an implicitly-ANDed list of boolean
  *      expressions, or a list of RestrictInfo nodes.  (The latter is
  *      preferred since it allows caching of the results.)
  *      The result includes both a one-time (startup) component,
  *      and a per-evaluation component.
  */
 void
 cost_qual_eval(QualCost *cost, List *quals, PlannerInfo *root)
 {
     cost_qual_eval_context context;
     ListCell   *l;
 
     context.root = root;
     context.total.startup = 0;
     context.total.per_tuple = 0;
 
     /* We don't charge any cost for the implicit ANDing at top level ... */
 
     foreach(l, quals)
     {
         Node       *qual = (Node *) lfirst(l);
 
         cost_qual_eval_walker(qual, &context);
     }
 
     *cost = context.total;
 }


//---------------------------------------------- set_rel_width

 /*
  * set_rel_width
  *      Set the estimated output width of a base relation.
  *
  * The estimated output width is the sum of the per-attribute width estimates
  * for the actually-referenced columns, plus any PHVs or other expressions
  * that have to be calculated at this relation.  This is the amount of data
  * we'd need to pass upwards in case of a sort, hash, etc.
  *
  * This function also sets reltarget->cost, so it's a bit misnamed now.
  *
  * NB: this works best on plain relations because it prefers to look at
  * real Vars.  For subqueries, set_subquery_size_estimates will already have
  * copied up whatever per-column estimates were made within the subquery,
  * and for other types of rels there isn't much we can do anyway.  We fall
  * back on (fairly stupid) datatype-based width estimates if we can't get
  * any better number.
  *
  * The per-attribute width estimates are cached for possible re-use while
  * building join relations or post-scan/join pathtargets.
  */
 static void
 set_rel_width(PlannerInfo *root, RelOptInfo *rel)
 {
     Oid         reloid = planner_rt_fetch(rel->relid, root)->relid;
     int32       tuple_width = 0;
     bool        have_wholerow_var = false;
     ListCell   *lc;
 
     /* Vars are assumed to have cost zero, but other exprs do not */
     rel->reltarget->cost.startup = 0;
     rel->reltarget->cost.per_tuple = 0;
 
     foreach(lc, rel->reltarget->exprs)
     {
         Node       *node = (Node *) lfirst(lc);
 
         /*
          * Ordinarily, a Var in a rel's targetlist must belong to that rel;
          * but there are corner cases involving LATERAL references where that
          * isn't so.  If the Var has the wrong varno, fall through to the
          * generic case (it doesn't seem worth the trouble to be any smarter).
          */
         if (IsA(node, Var) &&
             ((Var *) node)->varno == rel->relid)
         {
             Var        *var = (Var *) node;
             int         ndx;
             int32       item_width;
 
             Assert(var->varattno >= rel->min_attr);
             Assert(var->varattno <= rel->max_attr);
 
             ndx = var->varattno - rel->min_attr;
 
             /*
              * If it's a whole-row Var, we'll deal with it below after we have
              * already cached as many attr widths as possible.
              */
             if (var->varattno == 0)
             {
                 have_wholerow_var = true;
                 continue;
             }
 
             /*
              * The width may have been cached already (especially if it's a
              * subquery), so don't duplicate effort.
              */
             if (rel->attr_widths[ndx] > 0)
             {
                 tuple_width += rel->attr_widths[ndx];
                 continue;
             }
 
             /* Try to get column width from statistics */
             if (reloid != InvalidOid && var->varattno > 0)
             {
                 item_width = get_attavgwidth(reloid, var->varattno);
                 if (item_width > 0)
                 {
                     rel->attr_widths[ndx] = item_width;
                     tuple_width += item_width;
                     continue;
                 }
             }
 
             /*
              * Not a plain relation, or can't find statistics for it. Estimate
              * using just the type info.
              */
             item_width = get_typavgwidth(var->vartype, var->vartypmod);
             Assert(item_width > 0);
             rel->attr_widths[ndx] = item_width;
             tuple_width += item_width;
         }
         else if (IsA(node, PlaceHolderVar))
         {
             /*
              * We will need to evaluate the PHV's contained expression while
              * scanning this rel, so be sure to include it in reltarget->cost.
              */
             PlaceHolderVar *phv = (PlaceHolderVar *) node;
             PlaceHolderInfo *phinfo = find_placeholder_info(root, phv, false);
             QualCost    cost;
 
             tuple_width += phinfo->ph_width;
             cost_qual_eval_node(&cost, (Node *) phv->phexpr, root);
             rel->reltarget->cost.startup += cost.startup;
             rel->reltarget->cost.per_tuple += cost.per_tuple;
         }
         else
         {
             /*
              * We could be looking at an expression pulled up from a subquery,
              * or a ROW() representing a whole-row child Var, etc.  Do what we
              * can using the expression type information.
              */
             int32       item_width;
             QualCost    cost;
 
             item_width = get_typavgwidth(exprType(node), exprTypmod(node));
             Assert(item_width > 0);
             tuple_width += item_width;
             /* Not entirely clear if we need to account for cost, but do so */
             cost_qual_eval_node(&cost, node, root);
             rel->reltarget->cost.startup += cost.startup;
             rel->reltarget->cost.per_tuple += cost.per_tuple;
         }
     }
 
     /*
      * If we have a whole-row reference, estimate its width as the sum of
      * per-column widths plus heap tuple header overhead.
      */
     if (have_wholerow_var)
     {
         int32       wholerow_width = MAXALIGN(SizeofHeapTupleHeader);
 
         if (reloid != InvalidOid)
         {
             /* Real relation, so estimate true tuple width */
             wholerow_width += get_relation_data_width(reloid,
                                                       rel->attr_widths - rel->min_attr);
         }
         else
         {
             /* Do what we can with info for a phony rel */
             AttrNumber  i;
 
             for (i = 1; i <= rel->max_attr; i++)
                 wholerow_width += rel->attr_widths[i - rel->min_attr];
         }
 
         rel->attr_widths[0 - rel->min_attr] = wholerow_width;
 
         /*
          * Include the whole-row Var as part of the output tuple.  Yes, that
          * really is what happens at runtime.
          */
         tuple_width += wholerow_width;
     }
 
     Assert(tuple_width >= 0);
     rel->reltarget->width = tuple_width;
 }

三、跟蹤分析

測試指令碼:
單位資訊表,插入100,000行資料,dwbh為主鍵,同時建立函式索引和部分索引

drop table if exists t_dwxx;
create table t_dwxx(dwmc varchar(100),dwbh varchar(20),dwdz varchar(100));
alter table t_dwxx add primary key(dwbh);
create index idx_dwxx_expr on t_dwxx(trim(dwmc));
create index idx_dwxx_predicate_dwbh on t_dwxx(dwbh) where dwbh > '50000';

truncate table t_dwxx;
insert into t_dwxx(dwmc,dwbh,dwdz) 
select 'DWMC'||generate_series(1,100000),generate_series(1,100000)||'','DWDZ'||generate_series(1,100000);

個人資訊表,插入5,000,000行資料,在grbh和dwbh上建立索引

drop table if exists t_grxx;
create table t_grxx(dwbh varchar(10),grbh varchar(10),xm varchar(20),xb varchar(10),nl int);

insert into t_grxx(dwbh,grbh,xm,xb,nl) 
select generate_series(1,5000000)/50||'',generate_series(1,5000000),'XM'||generate_series(1,5000000),
(case when (floor(random()*2)=0) then '男' else '女' end),floor(random() * 100 + 1)::int;

create index idx_t_grxx_grbh on t_grxx(grbh);
create index idx_t_dwxx_grbh on t_grxx(dwbh);

個人繳費資訊表,在grbh上建立索引,多次插入5,000,000行資料

drop table if exists t_jfxx;
create table t_jfxx(grbh varchar(10),ny varchar(10),je float);
-- 根據實際情況,多次執行以下SQL
insert into t_jfxx(grbh,ny,je) 
select generate_series(1,5000000),
       to_char(now()::timestamp - (floor(random()*240+1)||' month')::interval,'yyyymm'),
       floor(random()*10000+1);

create index idx_t_jfxx_grbh on t_jfxx(grbh);

執行簡單的查詢SQL:

select t1.* from t_dwxx t1 where dwbh > '60000' and dwbh < '70000' and dwbh < '65000';

執行計劃如下:

testdb=# explain (analyze true,verbose) select t1.* from t_dwxx t1 where dwbh > '60000' and dwbh < '70000' and dwbh < '65000';
                                                               QUERY PLAN                                                    
           
-----------------------------------------------------------------------------------------------------------------------------
-----------
 Bitmap Heap Scan on public.t_dwxx t1  (cost=134.19..956.12 rows=5482 width=23) (actual time=1.484..2.744 rows=5554 loops=1)
   Output: dwmc, dwbh, dwdz
   Recheck Cond: (((t1.dwbh)::text > '60000'::text) AND ((t1.dwbh)::text < '70000'::text) AND ((t1.dwbh)::text < '65000'::tex
t))
   Heap Blocks: exact=45
   ->  Bitmap Index Scan on idx_dwxx_predicate_dwbh  (cost=0.00..132.81 rows=5482 width=0) (actual time=1.467..1.467 rows=555
4 loops=1)
         Index Cond: (((t1.dwbh)::text > '60000'::text) AND ((t1.dwbh)::text < '70000'::text) AND ((t1.dwbh)::text < '65000':
:text))
 Planning Time: 0.204 ms
 Execution Time: 3.288 ms

啟動gdb跟蹤分析:

(gdb) b set_baserel_size_estimates
Breakpoint 1 at 0x747bf5: file costsize.c, line 4302.
(gdb) c
Continuing.

Breakpoint 1, set_baserel_size_estimates (root=0x2686fa8, rel=0x26873b8) at costsize.c:4302
4302    nrows = rel->tuples *

進入函式clauselist_selectivity:

(gdb) step
clauselist_selectivity (root=0x2686fa8, clauses=0x271f600, varRelid=0, jointype=JOIN_INNER, sjinfo=0x0) at clausesel.c:105
105   Selectivity s1 = 1.0;
...
124   rel = find_single_rel_for_clauses(root, clauses);
(gdb) 
125   if (rel && rel->rtekind == RTE_RELATION && rel->statlist != NIL)
#與限制條件相關的rel(t_dwxx)
(gdb) p *rel
$1 = {type = T_RelOptInfo, reloptkind = RELOPT_BASEREL, relids = 0x2687728, rows = 0, consider_startup = false, 
  consider_param_startup = false, consider_parallel = true, reltarget = 0x271e228, pathlist = 0x0, ppilist = 0x0, 
  partial_pathlist = 0x0, cheapest_startup_path = 0x0, cheapest_total_path = 0x0, cheapest_unique_path = 0x0, 
  cheapest_parameterized_paths = 0x0, direct_lateral_relids = 0x0, lateral_relids = 0x0, relid = 1, reltablespace = 0, 
  rtekind = RTE_RELATION, min_attr = -7, max_attr = 3, attr_needed = 0x271e278, attr_widths = 0x271e308, 
  lateral_vars = 0x0, lateral_referencers = 0x0, indexlist = 0x271e700, statlist = 0x0, pages = 726, tuples = 100000, 
  allvisfrac = 0, subroot = 0x0, subplan_params = 0x0, rel_parallel_workers = -1, serverid = 0, userid = 0, 
  useridiscurrent = false, fdwroutine = 0x0, fdw_private = 0x0, unique_for_rels = 0x0, non_unique_for_rels = 0x0, 
  baserestrictinfo = 0x271f600, baserestrictcost = {startup = 0, per_tuple = 0}, baserestrict_min_security = 0, 
  joininfo = 0x0, has_eclass_joins = false, top_parent_relids = 0x0, part_scheme = 0x0, nparts = 0, boundinfo = 0x0, 
  partition_qual = 0x0, part_rels = 0x0, partexprs = 0x0, nullable_partexprs = 0x0, partitioned_child_rels = 0x0}

開始迴圈處理:

152   foreach(l, clauses)
...

第一個條件語句,呼叫clause_selectivity後,選擇率為0.44...

168     s2 = clause_selectivity(root, clause, varRelid, jointype, sjinfo);
(gdb) 
176     if (IsA(clause, RestrictInfo))
(gdb) p s2
$2 = 0.44045086705202319

新增到範圍條件語句中:

...
225         switch (get_oprrest(expr->opno))
(gdb) 
234             addRangeClause(&rqlist, clause,
(gdb) 
236             break;

第二個條件語句,呼叫clause_selectivity後,選擇率為0.66...,,同樣會新增到範圍條件語句中:

168     s2 = clause_selectivity(root, clause, varRelid, jointype, sjinfo);
(gdb) 
176     if (IsA(clause, RestrictInfo))
(gdb) p s2
$3 = 0.66904390539053915
...
225         switch (get_oprrest(expr->opno))
(gdb) 
229             addRangeClause(&rqlist, clause,

第三個條件語句,呼叫clause_selectivity後,選擇率為0.61...,,同樣會新增到範圍條件語句中:

168     s2 = clause_selectivity(root, clause, varRelid, jointype, sjinfo);
(gdb) 
176     if (IsA(clause, RestrictInfo))
(gdb) p s2
$4 = 0.61437297872340435
...
225         switch (get_oprrest(expr->opno))
(gdb) 
229             addRangeClause(&rqlist, clause,

結束迴圈,開始處理範圍條件語句:

253   while (rqlist != NULL)
(gdb) n
#既有上限,也有下限
(gdb) p *rqlist
$7 = {next = 0x0, var = 0x271dba8, have_lobound = true, have_hibound = true, lobound = 0.44045086705202319, 
  hibound = 0.61437297872340435}
...
#計算公式註釋已作介紹
(gdb) n
274         s2 = rqlist->hibound + rqlist->lobound - 1.0;
(gdb) 
277         s2 += nulltestsel(root, IS_NULL, rqlist->var,
#最終結果  
(gdb) 
325   return s1;
(gdb) p s1
$11 = 0.054823845775427538
...

回到主函式:

(gdb) 
set_baserel_size_estimates (root=0x2686fa8, rel=0x26873b8) at costsize.c:4302
4302    nrows = rel->tuples *
(gdb) 
4309    rel->rows = clamp_row_est(nrows);
(gdb) 
4311    cost_qual_eval(&rel->baserestrictcost, rel->baserestrictinfo, root);
(gdb) 
4313    set_rel_width(root, rel);
(gdb) p rel->rows
$12 = 5482

結果為5482,執行計劃中的rows=5482就是這麼來的.

四、參考資料

allpaths.c

PostgreSQL 原始碼解讀（52）- 查詢語句#37（make_one_rel函式#2-估算關係大小）

一、資料結構

二、原始碼解讀

三、跟蹤分析

四、參考資料

相關文章