balance_dirty_pages_ratelimited詳細分析

yooooooo發表於2024-11-22

balance_dirty_pages_ratelimited分析

/*
 * Per-task dirty-page rate limiting entry point, called after a task
 * dirties pages of @mapping.  Cheap in the common case; only falls
 * through to the expensive balance_dirty_pages() once the task has
 * dirtied enough pages.
 */
void balance_dirty_pages_ratelimited(struct address_space *mapping)
{
    struct inode *inode = mapping->host;
    /* inode_to_bdi() maps the inode to its backing_dev_info (BDI), the
     * structure describing the underlying device's writeback behavior. */
    struct backing_dev_info *bdi = inode_to_bdi(inode);
    /* A bdi_writeback represents a writeback context whose flusher
     * asynchronously writes dirty pages (modified but not yet on disk). */
    struct bdi_writeback *wb = NULL;
    int ratelimit;
    int *p;
    /* Devices flagged BDI_CAP_NO_ACCT_DIRTY opt out of dirty-page
     * accounting entirely; there is nothing to throttle for them. */
    if (!bdi_cap_account_dirty(bdi))
        return;

    /* If cgroup writeback is enabled for this inode ... */
    if (inode_cgwb_enabled(inode))
        /* ... look up or create the bdi_writeback associated with the
         * current task's cgroup on this BDI. */
        wb = wb_get_create_current(bdi, GFP_KERNEL);
    if (!wb)
        /* Fall back to the BDI's embedded default writeback context. */
        wb = &bdi->wb;
    /* How many pages this task may dirty before it must call
     * balance_dirty_pages() ("current" is the running task). */
    ratelimit = current->nr_dirtied_pause;
    /* wb->dirty_exceeded means this device is over its dirty limit. */
    if (wb->dirty_exceeded)
        /* Clamp the allowance so an over-limit device throttles sooner:
         * 32 >> (PAGE_SHIFT - 10) is 8 pages with 4K pages. */
        ratelimit = min(ratelimit, 32 >> (PAGE_SHIFT - 10));
    /* Disable preemption while touching per-CPU counters below. */
    preempt_disable();
    /*
     * This prevents one CPU to accumulate too many dirtied pages without
     * calling into balance_dirty_pages(), which can happen when there are
     * 1000+ tasks, all of them start dirtying pages at exactly the same
     * time, hence all honoured too large initial task->nr_dirtied_pause.
     */
    /* Per-CPU counter of pages dirtied on this CPU since the last reset. */
    p =  this_cpu_ptr(&bdp_ratelimits);
    /* Task already at its limit: reset the per-CPU counter, since
     * balance_dirty_pages() will run below anyway. */
    if (unlikely(current->nr_dirtied >= ratelimit))
        *p = 0;
    /* Otherwise, if this CPU as a whole dirtied more than the per-CPU
     * budget (ratelimit_pages) ... */
    else if (unlikely(*p >= ratelimit_pages)) {
        /* ... reset the per-CPU counter and force ratelimit to 0 so the
         * current task is throttled immediately. */
        *p = 0;
        ratelimit = 0;
    }
    /*
     * Pick up the dirtied pages by the exited tasks. This avoids lots of
     * short-lived tasks (eg. gcc invocations in a kernel build) escaping
     * the dirty throttling and livelock other long-run dirtiers.
     */
    /* Per-CPU pool of dirty pages "leaked" by tasks that exited before
     * being charged for them. */
    p = this_cpu_ptr(&dirty_throttle_leaks);
    /* Absorb leaked pages into this task's count, but never push the
     * task past its ratelimit.  (In practice *p is usually 0.) */
    if (*p > 0 && current->nr_dirtied < ratelimit) {
        unsigned long nr_pages_dirtied;
        /* Take the smaller of the leaked pool and the task's remaining
         * headroom (ratelimit - nr_dirtied). */
        nr_pages_dirtied = min(*p, ratelimit - current->nr_dirtied);
        /* Deduct the absorbed pages from the leak pool ... */
        *p -= nr_pages_dirtied;
        /* ... and charge them to the current task. */
        current->nr_dirtied += nr_pages_dirtied;
    }
    /* This keeps leaked dirty pages from accumulating unaccounted. */
    /* Re-enable preemption. */
    preempt_enable();

    /* Over the limit: do the real throttling work. */
    if (unlikely(current->nr_dirtied >= ratelimit))
        balance_dirty_pages(wb, current->nr_dirtied);

    wb_put(wb);
}
  • 當前 task 的髒頁數量大於或等於 ratelimit
(unlikely(current->nr_dirtied >= ratelimit))
  • 髒頁洩漏的補償後,依然超過限制
  • 系統嘗試從 dirty_throttle_leaks(洩漏的髒頁池)中補償當前程序的髒頁計數。
  • 補償後,current->nr_dirtied 達到或超過 ratelimit。
p = this_cpu_ptr(&dirty_throttle_leaks);
if (*p > 0 && current->nr_dirtied < ratelimit) {
    unsigned long nr_pages_dirtied;
    nr_pages_dirtied = min(*p, ratelimit - current->nr_dirtied);
    *p -= nr_pages_dirtied;
    current->nr_dirtied += nr_pages_dirtied;
}
if (unlikely(current->nr_dirtied >= ratelimit))
    balance_dirty_pages(wb, current->nr_dirtied);
  • 如果當前裝置的髒頁數超限,則會降低 ratelimit 的值
if (wb->dirty_exceeded)
    ratelimit = min(ratelimit, 32 >> (PAGE_SHIFT - 10));

更低的 ratelimit 會更容易觸發 balance_dirty_pages。

  • 每個 CPU 的髒頁生成計數超過限制時,會將 ratelimit 設定為 0,觸發限制:
p = this_cpu_ptr(&bdp_ratelimits);
if (unlikely(*p >= ratelimit_pages)) {
    *p = 0;
    ratelimit = 0;
}

強制阻止髒頁生成,隨後觸發 balance_dirty_pages。

balance_dirty_pages函式

/*
 * GDTC_INIT seeds a dirty_throttle_control for the global domain: it
 * records the target wb and points wb_completions at that wb's completion
 * counter.  GDTC_INIT_NO_WB and MDTC_INIT expand to nothing here because
 * CONFIG_CGROUP_WRITEBACK is not enabled in this configuration.
 */
#define GDTC_INIT(__wb)     .wb = (__wb),                           \
                .wb_completions = &(__wb)->completions
#define GDTC_INIT_NO_WB
#define MDTC_INIT(__wb, __gdtc)

/* consolidated parameters for balance_dirty_pages() and its subroutines */
struct dirty_throttle_control {
#ifdef CONFIG_CGROUP_WRITEBACK
    struct wb_domain    *dom;
    struct dirty_throttle_control *gdtc;    /* only set in memcg dtc's */
#endif
    struct bdi_writeback    *wb;
    /* per-CPU writeback-completion event counter for this wb */
    struct fprop_local_percpu *wb_completions;

    unsigned long       avail;      /* dirtyable */
    unsigned long       dirty;      /* file_dirty + write + nfs */
    unsigned long       thresh;     /* dirty threshold */
    unsigned long       bg_thresh;  /* dirty background threshold */

    unsigned long       wb_dirty;   /* per-wb counterparts */
    unsigned long       wb_thresh;
    unsigned long       wb_bg_thresh;

    unsigned long       pos_ratio;
};
/*
 * Stubbed memcg-domain check: with CONFIG_CGROUP_WRITEBACK disabled there
 * is never a valid memcg dirty-throttle control, so this reports false
 * regardless of @dtc.
 */
static bool mdtc_valid(struct dirty_throttle_control *dtc)
{
    (void)dtc;  /* unused in the !CONFIG_CGROUP_WRITEBACK build */
    return false;
}
/*
 * The heart of dirty throttling: estimate global (and optionally memcg)
 * dirty state, compute how long the current task should sleep, and sleep,
 * repeating until the task is allowed to continue dirtying pages.
 */
static void balance_dirty_pages(struct bdi_writeback *wb,
                unsigned long pages_dirtied)
{
    /* Global-domain control, seeded from @wb (wb->completions becomes
     * gdtc_stor.wb_completions via GDTC_INIT). */
    struct dirty_throttle_control gdtc_stor = { GDTC_INIT(wb) };
    /* Memcg-domain control; empty-initialized without CONFIG_CGROUP_WRITEBACK. */
    struct dirty_throttle_control mdtc_stor = { MDTC_INIT(wb, &gdtc_stor) };
    /* Pointer to the global-domain control. */
    struct dirty_throttle_control * const gdtc = &gdtc_stor;
    /* Pointer to the memcg control, or NULL when memcg writeback is not
     * in effect (mdtc_valid() is always false in this configuration). */
    struct dirty_throttle_control * const mdtc = mdtc_valid(&mdtc_stor) ?
                             &mdtc_stor : NULL;
    struct dirty_throttle_control *sdtc;
    unsigned long nr_reclaimable;   /* = file_dirty + unstable_nfs */
    long period;
    long pause;
    long max_pause;
    long min_pause;
    int nr_dirtied_pause;
    bool dirty_exceeded = false;
    unsigned long task_ratelimit;
    unsigned long dirty_ratelimit;
    struct backing_dev_info *bdi = wb->bdi;
    /* BDI_CAP_STRICTLIMIT: throttle strictly against the per-wb counters
     * (used e.g. by FUSE) rather than only the global ones. */
    bool strictlimit = bdi->capabilities & BDI_CAP_STRICTLIMIT;
    unsigned long start_time = jiffies;

    for (;;) {
        /* Snapshot the current time for the pause calculations below. */
        unsigned long now = jiffies;
        unsigned long dirty, thresh, bg_thresh;
        unsigned long m_dirty = 0;  /* stop bogus uninit warnings */
        unsigned long m_thresh = 0;
        unsigned long m_bg_thresh = 0;

        /*
         * Unstable writes are a feature of certain networked
         * filesystems (i.e. NFS) in which data may have been
         * written to the server's write cache, but has not yet
         * been flushed to permanent storage.
         */
        /* Total reclaimable dirty pages across all NUMA nodes:
         * ordinary file dirty pages plus NFS "unstable" pages. */
        nr_reclaimable = global_node_page_state(NR_FILE_DIRTY) +
                    global_node_page_state(NR_UNSTABLE_NFS);
        /* Pages that may legally become dirty. */
        gdtc->avail = global_dirtyable_memory();
        /* Actual dirty total = reclaimable dirty + pages under writeback. */
        gdtc->dirty = nr_reclaimable + global_node_page_state(NR_WRITEBACK);
        /* Fill in gdtc->thresh / gdtc->bg_thresh for the global domain. */
        domain_dirty_limits(gdtc);
        /* With strictlimit, decisions are based on the per-wb counters
         * (wb_dirty / wb_thresh) instead of the global ones. */
        if (unlikely(strictlimit)) {
            wb_dirty_limits(gdtc);

            dirty = gdtc->wb_dirty;
            thresh = gdtc->wb_thresh;
            bg_thresh = gdtc->wb_bg_thresh;
        } else {
            /* Use the global totals; thresh/bg_thresh were just set by
             * domain_dirty_limits(). */
            dirty = gdtc->dirty;
            thresh = gdtc->thresh;
            bg_thresh = gdtc->bg_thresh;
        }
        /* mdtc is NULL here (no CONFIG_CGROUP_WRITEBACK); skipped. */
        if (mdtc) {
            unsigned long filepages, headroom, writeback;

            /*
             * If @wb belongs to !root memcg, repeat the same
             * basic calculations for the memcg domain.
             */
            mem_cgroup_wb_stats(wb, &filepages, &headroom,
                        &mdtc->dirty, &writeback);
            mdtc->dirty += writeback;
            mdtc_calc_avail(mdtc, filepages, headroom);

            domain_dirty_limits(mdtc);

            if (unlikely(strictlimit)) {
                wb_dirty_limits(mdtc);
                m_dirty = mdtc->wb_dirty;
                m_thresh = mdtc->wb_thresh;
                m_bg_thresh = mdtc->wb_bg_thresh;
            } else {
                m_dirty = mdtc->dirty;
                m_thresh = mdtc->thresh;
                m_bg_thresh = mdtc->bg_thresh;
            }
        }

        /*
         * Throttle it only when the background writeback cannot
         * catch-up. This avoids (excessively) small writeouts
         * when the wb limits are ramping up in case of !strictlimit.
         *
         * In strictlimit case make decision based on the wb counters
         * and limits. Small writeouts when the wb limits are ramping
         * up are the price we consciously pay for strictlimit-ing.
         *
         * If memcg domain is in effect, @dirty should be under
         * both global and memcg freerun ceilings.
         */
        /* Freerun check: dirty_freerun_ceiling() is the midpoint
         * (thresh + bg_thresh) / 2.  While dirty pages stay below it
         * (for both the global and, if present, memcg domain), no
         * throttling is needed and we can leave immediately. */
        if (dirty <= dirty_freerun_ceiling(thresh, bg_thresh) &&
            (!mdtc ||
             m_dirty <= dirty_freerun_ceiling(m_thresh, m_bg_thresh))) {
            /* How many more pages the task may dirty before checking
             * again: large when far below the threshold, 1 when at or
             * above it (see dirty_poll_interval()). */
            unsigned long intv = dirty_poll_interval(dirty, thresh);
            /* Memcg interval starts at "no limit" (ULONG_MAX). */
            unsigned long m_intv = ULONG_MAX;
            /* Record when this task was last balanced ... */
            current->dirty_paused_when = now;
            /* ... and reset its dirtied-page counter. */
            current->nr_dirtied = 0;
            /* Skipped: mdtc is NULL in this configuration. */
            if (mdtc)
                m_intv = dirty_poll_interval(m_dirty, m_thresh);
            /* Next pause point is the smaller of the global and memcg
             * intervals (here simply the global one). */
            current->nr_dirtied_pause = min(intv, m_intv);
            /* Within the freerun region: no throttling, leave the loop. */
            break;
        }
        /* Past the freerun ceiling: make sure background writeback is
         * running so dirty pages start draining. */
        if (unlikely(!writeback_in_progress(wb)))
            wb_start_background_writeback(wb);

        /*
         * Calculate global domain's pos_ratio and select the
         * global dtc by default.
         */
        /* Without strictlimit the per-wb limits were not computed above;
         * compute wb_thresh/wb_bg_thresh/wb_dirty now. */
        if (!strictlimit)
            wb_dirty_limits(gdtc);

        /* dirty_exceeded: this wb is over its own threshold AND either
         * the global total is over its threshold or strictlimit is set.
         * When true, dirtiers on this device must be slowed down. */
        dirty_exceeded = (gdtc->wb_dirty > gdtc->wb_thresh) &&
            ((gdtc->dirty > gdtc->thresh) || strictlimit);
        /* Compute pos_ratio, the feedback factor that scales the task's
         * allowed dirtying rate based on how far dirty is from setpoint. */
        wb_position_ratio(gdtc);
        /* Select the global control as the one to throttle against. */
        sdtc = gdtc;
        /* Skipped: mdtc is NULL in this configuration. */
        if (mdtc) {
            /*
             * If memcg domain is in effect, calculate its
             * pos_ratio.  @wb should satisfy constraints from
             * both global and memcg domains.  Choose the one
             * w/ lower pos_ratio.
             */
            if (!strictlimit)
                wb_dirty_limits(mdtc);

            dirty_exceeded |= (mdtc->wb_dirty > mdtc->wb_thresh) &&
                ((mdtc->dirty > mdtc->thresh) || strictlimit);

            wb_position_ratio(mdtc);
            if (mdtc->pos_ratio < gdtc->pos_ratio)
                sdtc = mdtc;
        }

        /* Latch the over-limit state on the wb so the fast path in
         * balance_dirty_pages_ratelimited() throttles sooner. */
        if (dirty_exceeded && !wb->dirty_exceeded)
            wb->dirty_exceeded = 1;

        /* Refresh the wb's bandwidth estimates at most once per
         * BANDWIDTH_INTERVAL. */
        if (time_is_before_jiffies(wb->bw_time_stamp +
                       BANDWIDTH_INTERVAL)) {
            spin_lock(&wb->list_lock);
            __wb_update_bandwidth(gdtc, mdtc, start_time, true);
            spin_unlock(&wb->list_lock);
        }

        /* throttle according to the chosen dtc */
        dirty_ratelimit = wb->dirty_ratelimit;
        /* Scale the wb's base ratelimit by pos_ratio (fixed point). */
        task_ratelimit = ((u64)dirty_ratelimit * sdtc->pos_ratio) >>
                            RATELIMIT_CALC_SHIFT;
        max_pause = wb_max_pause(wb, sdtc->wb_dirty);
        min_pause = wb_min_pause(wb, max_pause,
                     task_ratelimit, dirty_ratelimit,
                     &nr_dirtied_pause);

        /* Zero rate: sleep the maximum pause unconditionally. */
        if (unlikely(task_ratelimit == 0)) {
            period = max_pause;
            pause = max_pause;
            goto pause;
        }
        /* Ideal sleep: time needed to "pay for" pages_dirtied at
         * task_ratelimit pages/second, minus time already elapsed since
         * the task was last balanced. */
        period = HZ * pages_dirtied / task_ratelimit;
        pause = period;
        if (current->dirty_paused_when)
            pause -= now - current->dirty_paused_when;
        /*
         * For less than 1s think time (ext3/4 may block the dirtier
         * for up to 800ms from time to time on 1-HDD; so does xfs,
         * however at much less frequency), try to compensate it in
         * future periods by updating the virtual time; otherwise just
         * do a reset, as it may be a light dirtier.
         */
        if (pause < min_pause) {
            trace_balance_dirty_pages(wb,
                          sdtc->thresh,
                          sdtc->bg_thresh,
                          sdtc->dirty,
                          sdtc->wb_thresh,
                          sdtc->wb_dirty,
                          dirty_ratelimit,
                          task_ratelimit,
                          pages_dirtied,
                          period,
                          min(pause, 0L),
                          start_time);
            if (pause < -HZ) {
                current->dirty_paused_when = now;
                current->nr_dirtied = 0;
            } else if (period) {
                current->dirty_paused_when += period;
                current->nr_dirtied = 0;
            } else if (current->nr_dirtied_pause <= pages_dirtied)
                current->nr_dirtied_pause += pages_dirtied;
            break;
        }
        if (unlikely(pause > max_pause)) {
            /* for occasional dropped task_ratelimit */
            now += min(pause - max_pause, max_pause);
            pause = max_pause;
        }

pause:
        trace_balance_dirty_pages(wb,
                      sdtc->thresh,
                      sdtc->bg_thresh,
                      sdtc->dirty,
                      sdtc->wb_thresh,
                      sdtc->wb_dirty,
                      dirty_ratelimit,
                      task_ratelimit,
                      pages_dirtied,
                      period,
                      pause,
                      start_time);
        /* Actually sleep for the computed pause (killable). */
        __set_current_state(TASK_KILLABLE);
        wb->dirty_sleep = now;
        io_schedule_timeout(pause);

        current->dirty_paused_when = now + pause;
        current->nr_dirtied = 0;
        current->nr_dirtied_pause = nr_dirtied_pause;

        /*
         * This is typically equal to (dirty < thresh) and can also
         * keep "1000+ dd on a slow USB stick" under control.
         */
        if (task_ratelimit)
            break;

        /*
         * In the case of an unresponding NFS server and the NFS dirty
         * pages exceeds dirty_thresh, give the other good wb's a pipe
         * to go through, so that tasks on them still remain responsive.
         *
         * In theory 1 page is enough to keep the consumer-producer
         * pipe going: the flusher cleans 1 page => the task dirties 1
         * more page. However wb_dirty has accounting errors.  So use
         * the larger and more IO friendly wb_stat_error.
         */
        if (sdtc->wb_dirty <= wb_stat_error())
            break;

        if (fatal_signal_pending(current))
            break;
    }

    /* Clear the latched over-limit flag once we are back under limit. */
    if (!dirty_exceeded && wb->dirty_exceeded)
        wb->dirty_exceeded = 0;

    if (writeback_in_progress(wb))
        return;

    /*
     * In laptop mode, we wait until hitting the higher threshold before
     * starting background writeout, and then write out all the way down
     * to the lower threshold.  So slow writers cause minimal disk activity.
     *
     * In normal mode, we start background writeout at the lower
     * background_thresh, to keep the amount of dirty memory low.
     */
    if (laptop_mode)
        return;

    if (nr_reclaimable > gdtc->bg_thresh)
        wb_start_background_writeback(wb);
}

其中有幾個函式

domain_dirty_limits

這個函式的主要目的是計算並設定髒頁閾值,以控制系統何時將記憶體中修改但未寫入磁碟的資料(髒頁)寫入磁碟。這個閾值決定了系統在觸發後臺寫回程序之前,允許記憶體中存在多少髒頁。

/*
 * Return the global counterpart of a memcg dirty-throttle control.
 * Without CONFIG_CGROUP_WRITEBACK there are no memcg domains, so the
 * answer is always NULL.
 */
static struct dirty_throttle_control *mdtc_gdtc(struct dirty_throttle_control *mdtc)
{
    (void)mdtc;  /* no memcg writeback domains in this configuration */
    return NULL;
}

詳情解釋:

/*
 * Compute the foreground (thresh) and background (bg_thresh) dirty-page
 * thresholds for a domain, from either the absolute sysctl byte values
 * (vm.dirty_bytes / vm.dirty_background_bytes) or the percentage sysctls
 * (vm.dirty_ratio / vm.dirty_background_ratio) applied to dtc->avail.
 */
static void domain_dirty_limits(struct dirty_throttle_control *dtc)
{
    /* Number of dirtyable pages in this domain. */
    const unsigned long available_memory = dtc->avail;
    /* Global counterpart; NULL here since mdtc_gdtc() always returns NULL
     * without CONFIG_CGROUP_WRITEBACK. */
    struct dirty_throttle_control *gdtc = mdtc_gdtc(dtc);
    /*
     * vm_dirty_bytes (/proc/sys/vm/dirty_bytes) is an absolute byte limit
     * on dirty memory; when it is non-zero it takes precedence over the
     * percentage sysctl vm_dirty_ratio (only one of the pair is in
     * effect at a time).  A value of 0 means "use the ratio instead".
     * Setting it too high risks large bursts of unwritten data (and data
     * loss on crash); too low causes excessively frequent writeback.
     */
    unsigned long bytes = vm_dirty_bytes;
    unsigned long bg_bytes = dirty_background_bytes;
    /* convert ratios to per-PAGE_SIZE for higher precision */
    /* e.g. vm_dirty_ratio == 10 (10%) gives ratio == PAGE_SIZE / 10. */
    unsigned long ratio = (vm_dirty_ratio * PAGE_SIZE) / 100;
    unsigned long bg_ratio = (dirty_background_ratio * PAGE_SIZE) / 100;
    unsigned long thresh;
    unsigned long bg_thresh;
    struct task_struct *tsk;

    /* gdtc is !NULL iff @dtc is for memcg domain */
    /* Not taken in this configuration. */
    if (gdtc) {
        unsigned long global_avail = gdtc->avail;

        /*
         * The byte settings can't be applied directly to memcg
         * domains.  Convert them to ratios by scaling against
         * globally available memory.  As the ratios are in
         * per-PAGE_SIZE, they can be obtained by dividing bytes by
         * number of pages.
         */
        if (bytes)
            ratio = min(DIV_ROUND_UP(bytes, global_avail),
                    PAGE_SIZE);
        if (bg_bytes)
            bg_ratio = min(DIV_ROUND_UP(bg_bytes, global_avail),
                       PAGE_SIZE);
        bytes = bg_bytes = 0;
    }
    /* Prefer the absolute byte value for the foreground threshold ... */
    if (bytes)
        thresh = DIV_ROUND_UP(bytes, PAGE_SIZE);
    else
        /* ... otherwise derive it from the ratio and available memory. */
        thresh = (ratio * available_memory) / PAGE_SIZE;

    if (bg_bytes)
        bg_thresh = DIV_ROUND_UP(bg_bytes, PAGE_SIZE);
    else
        bg_thresh = (bg_ratio * available_memory) / PAGE_SIZE;

    /* Keep the background threshold strictly below the foreground one. */
    if (bg_thresh >= thresh)
        bg_thresh = thresh / 2;
    tsk = current;
    /* Realtime and less-throttled tasks get extra headroom: */
    if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) {
        /* boost both thresholds by 25% plus 1/32 of the global dirty
         * limit, so such tasks are less likely to be blocked. */
        bg_thresh += bg_thresh / 4 + global_wb_domain.dirty_limit / 32;
        thresh += thresh / 4 + global_wb_domain.dirty_limit / 32;
    }
    /* Publish the computed thresholds into the control structure. */
    dtc->thresh = thresh;
    dtc->bg_thresh = bg_thresh;

    /* we should eventually report the domain in the TP */
    if (!gdtc)
        trace_global_dirty_state(bg_thresh, thresh);
}

dirty_freerun_ceiling:

/*
 * Midpoint of the foreground and background dirty thresholds.  While the
 * dirty page count stays below this "freerun ceiling", tasks may keep
 * dirtying pages without being throttled.
 */
static unsigned long dirty_freerun_ceiling(unsigned long thresh,
                       unsigned long bg_thresh)
{
    unsigned long combined = thresh + bg_thresh;

    return combined / 2;
}

dirty_poll_interval

/*
 * Number of pages a task may dirty before it must re-check the dirty
 * limits.  The farther @dirty sits below @thresh the longer the interval
 * (roughly the square root of the gap, computed as 2^(log2(gap)/2));
 * once at or above the threshold, re-check after every single page.
 */
static unsigned long dirty_poll_interval(unsigned long dirty,
                     unsigned long thresh)
{
    if (dirty >= thresh)
        return 1;

    return 1UL << (ilog2(thresh - dirty) >> 1);
}

這裡,ilog2(thresh - dirty) 會計算 thresh - dirty 的對數值,然後右移一位。這個操作用於調整間隔時間,使得髒頁接近閾值時,寫回間隔較短,而當髒頁離閾值較遠時,寫回間隔較長。這有助於減少頻繁的髒頁寫回,提升系統效率。

如果 thresh 大於 dirty,即當前髒頁數量尚未達到閾值,dirty_poll_interval 會依據兩者的差值計算一個較長的輪詢間隔:差值越大,間隔越長。

如果 thresh 小於或等於 dirty,即髒頁數量已達到或超過閾值,dirty_poll_interval 會返回最小的輪詢間隔 1。這意味著每產生一頁髒頁就會重新檢查一次限制。

wb_start_background_writeback

/*
 * Kick the per-wb flusher so it starts background writeback of dirty
 * pages; the trace hook records the wakeup for debugging/monitoring.
 */
void wb_start_background_writeback(struct bdi_writeback *wb)
{
    /*
     * We just wake up the flusher thread. It will perform background
     * writeback as soon as there is no other work to do.
     */
    trace_writeback_wake_background(wb);
    wb_wakeup(wb);
}

wb_start_background_writeback 函式的作用是啟動後臺寫回操作,將後臺寫回執行緒喚醒,開始執行髒頁的寫回工作。

詳細解析

  1. 喚醒後臺寫回執行緒
  2. wb_wakeup(wb);:該函式呼叫喚醒後臺的寫回執行緒。具體來說,它會通知寫回執行緒開始工作,通常後臺寫回執行緒會在沒有其他任務時進行髒頁資料的寫入操作。
  3. 跟蹤寫回操作的啟動
  4. trace_writeback_wake_background(wb);:該行程式碼透過 trace_writeback_wake_background 函式跟蹤後臺寫回的啟動過程。通常,這種跟蹤機制用於日誌記錄或效能監控,幫助開發者瞭解何時啟動了後臺寫回。

目的:

這個函式的主要目的是確保後臺寫回操作的啟動。後臺寫回執行緒負責將記憶體中的髒頁(修改過但尚未寫入磁碟的頁)定期寫回磁碟,以釋放記憶體並保持資料一致性。透過呼叫 wb_wakeup,後臺寫回執行緒會在沒有其他任務時啟動,執行這些寫回操作,避免記憶體被過多髒頁佔用。

總結來說,wb_start_background_writeback 是一個用於管理後臺寫回過程的函式,確保髒頁的寫回工作被有效排程。

writeback_in_progress

該函式的作用是檢查後臺寫回操作是否正在進行。

/* Test the WB_writeback_running bit in wb->state: true while a writeback
 * pass is currently active on this wb. */
static inline bool writeback_in_progress(struct bdi_writeback *wb)
{
    return test_bit(WB_writeback_running, &wb->state);
}

test_bit(WB_writeback_running, &wb->state)

  • 這個函式使用 test_bit 來檢查 wb->state 點陣圖中的 WB_writeback_running 標誌位。如果該位為 1,表示當前有寫回操作正在進行,函式返回 true。
  • test_bit 是一個用於檢查點陣圖中特定位是否被設定的函式。如果指定的位被設定為 1,它會返回 true,否則返回 false。

WB_writeback_running

  • WB_writeback_running 是一個標誌,通常用於指示是否有寫回操作正在進行。這個標誌通常在後臺寫回操作啟動時被設定,並在寫回操作完成後清除

wb_dirty_limits

wb_dirty_limits 函式的作用是計算並更新與後臺寫回相關的髒頁閾值 (wb_thresh 和 wb_bg_thresh),並統計該寫回裝置目前的髒頁數量 (wb_dirty,包含可回收髒頁與正在寫回的頁)。

/*
 * Fill in the per-wb dirty limits: wb_thresh, wb_bg_thresh and the wb's
 * current dirty count wb_dirty (reclaimable + under-writeback pages).
 */
static inline void wb_dirty_limits(struct dirty_throttle_control *dtc)
{
    /* The writeback context this control refers to. */
    struct bdi_writeback *wb = dtc->wb;
    /* Reclaimable (not yet under writeback) dirty pages on this wb. */
    unsigned long wb_reclaimable;

    /*
     * wb_thresh is not treated as some limiting factor as
     * dirty_thresh, due to reasons
     * - in JBOD setup, wb_thresh can fluctuate a lot
     * - in a system with HDD and USB key, the USB key may somehow
     *   go into state (wb_dirty >> wb_thresh) either because
     *   wb_dirty starts high, or because wb_thresh drops low.
     *   In this case we don't want to hard throttle the USB key
     *   dirtiers for 100 seconds until wb_dirty drops under
     *   wb_thresh. Instead the auxiliary wb control line in
     *   wb_position_ratio() will let the dirtier task progress
     *   at some rate <= (write_bw / 2) for bringing down wb_dirty.
     */
    /* This wb's share of the domain threshold. */
    dtc->wb_thresh = __wb_calc_thresh(dtc);
    /* Background threshold scaled by the same bg:fg ratio as the domain
     * thresholds; 0 when the domain threshold itself is 0. */
    dtc->wb_bg_thresh = dtc->thresh ?
        div_u64((u64)dtc->wb_thresh * dtc->bg_thresh, dtc->thresh) : 0;

    /*
     * In order to avoid the stacked BDI deadlock we need
     * to ensure we accurately count the 'dirty' pages when
     * the threshold is low.
     *
     * Otherwise it would be possible to get thresh+n pages
     * reported dirty, even though there are thresh-m pages
     * actually dirty; with m+n sitting in the percpu
     * deltas.
     */
    /* Low threshold: use the exact (slower) per-CPU sum to avoid the
     * per-CPU counter error dominating the comparison. */
    if (dtc->wb_thresh < 2 * wb_stat_error()) {
        wb_reclaimable = wb_stat_sum(wb, WB_RECLAIMABLE);
        dtc->wb_dirty = wb_reclaimable + wb_stat_sum(wb, WB_WRITEBACK);
    } else {
        wb_reclaimable = wb_stat(wb, WB_RECLAIMABLE);
        dtc->wb_dirty = wb_reclaimable + wb_stat(wb, WB_WRITEBACK);
    }
}

更新 wb_thresh wb_bg_thresh

  • dtc->wb_thresh = __wb_calc_thresh(dtc);:透過呼叫 __wb_calc_thresh 函式計算並設定 wb_thresh(寫回閾值)。這個閾值表示何時應該啟動寫回操作。具體的計算方式依賴於裝置和寫回控制的狀態。
  • dtc->wb_bg_thresh = dtc->thresh ? div_u64((u64)dtc->wb_thresh * dtc->bg_thresh, dtc->thresh) : 0;:計算 wb_bg_thresh(後臺寫回閾值),如果 dtc->thresh 非零,使用給定的比率來調整前臺和後臺的寫回閾值。否則,將 wb_bg_thresh 設定為 0。

避免 BDI 堆疊死鎖

  • 該函式中提到的 "BDI 堆疊死鎖"(stacked BDI deadlock)是指當寫回閾值非常低時,可能會因為在髒頁統計時沒有正確同步,導致系統錯誤地報告超過閾值的髒頁數量。為了避免這個問題,如果 dtc->wb_thresh 小於一個較小的閾值(2 * wb_stat_error()),它會透過 wb_stat_sum 函式更精確地計算出 wb_dirty(即正在寫回的髒頁數)。

計算 wb_dirty

  • 如果 wb_thresh 較低,則使用 wb_stat_sum 函式計算 WB_RECLAIMABLE 和 WB_WRITEBACK 狀態的髒頁數,並將其合併為 wb_dirty。
  • 否則,直接使用 wb_stat 函式計算當前的 WB_RECLAIMABLE 和 WB_WRITEBACK 髒頁數,並將其加總為 wb_dirty。

__wb_calc_thresh

/*
 * Compute this wb's share (wb_thresh) of the domain dirty threshold,
 * proportional to its recent writeback-completion fraction and clamped
 * by the device's configured min/max ratios.
 */
static unsigned long __wb_calc_thresh(struct dirty_throttle_control *dtc)
{
	/* The writeback domain this control belongs to. */
	struct wb_domain *dom = dtc_dom(dtc);
	/* Domain-wide dirty threshold to apportion. */
	unsigned long thresh = dtc->thresh;
	u64 wb_thresh;
	long numerator, denominator;
	unsigned long wb_min_ratio, wb_max_ratio;

	/*
	 * Calculate this BDI's share of the thresh ratio.
	 */
	/* numerator/denominator = this wb's fraction of recent writeback
	 * completions across the whole domain. */
	fprop_fraction_percpu(&dom->completions, dtc->wb_completions,
			      &numerator, &denominator);

	/* Reserve the globally-committed minimum shares (bdi_min_ratio is
	 * the sum of all devices' min_ratio settings, in percent). */
	wb_thresh = (thresh * (100 - bdi_min_ratio)) / 100;
	/* Scale the remainder by this wb's completion fraction. */
	wb_thresh *= numerator;
	do_div(wb_thresh, denominator);
	
	/* Fetch this device's own min/max percentage bounds. */
	wb_min_max_ratio(dtc->wb, &wb_min_ratio, &wb_max_ratio);
	/* Guarantee at least the device's minimum share ... */
	wb_thresh += (thresh * wb_min_ratio) / 100;
	/* ... and cap it at the device's maximum share. */
	if (wb_thresh > (thresh * wb_max_ratio) / 100)
		wb_thresh = thresh * wb_max_ratio / 100;
	/* Final per-wb threshold. */
	return wb_thresh;
}

該函式的主要作用就是計算wb_thresh

wb_min_max_ratio

/* Report the device's configured minimum and maximum shares (in percent)
 * of the dirty threshold through *minp and *maxp. */
static void wb_min_max_ratio(struct bdi_writeback *wb,
			     unsigned long *minp, unsigned long *maxp)
{
	*minp = wb->bdi->min_ratio;
	*maxp = wb->bdi->max_ratio;
}
  • 將寫回裝置的最小髒頁比例賦值給 minp 指向的變數。
  • 將寫回裝置的最大髒頁比例賦值給 maxp 指向的變數。

bdi_min_ratio

/* Sum of all writeback devices' minimum dirty-threshold shares, in percent. */
static unsigned int bdi_min_ratio;

/*
 * Set a BDI's minimum dirty-threshold share (percent).  Fails with
 * -EINVAL when the new minimum exceeds the device's maximum, or when the
 * sum of all devices' minimums would reach 100%.
 */
int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio)
{
	int ret = 0;

	spin_lock_bh(&bdi_lock);
	if (min_ratio > bdi->max_ratio) {
		ret = -EINVAL;
	} else {
		/* Convert to the delta relative to the device's current minimum. */
		min_ratio -= bdi->min_ratio;
		/* Only accept if the global sum stays below 100% ... */
		if (bdi_min_ratio + min_ratio < 100) {
			/* ... then apply the delta to both the global sum and
			 * the device's own minimum. */
			bdi_min_ratio += min_ratio;
			bdi->min_ratio += min_ratio;
		} else {
			ret = -EINVAL;
		}
	}
	spin_unlock_bh(&bdi_lock);

	return ret;
}

bdi_set_min_ratio 的作用是設定特定寫回裝置(BDI)的最小髒頁比例。

總結來說,這個函式確保每個寫回裝置的最小髒頁比例在合理範圍內,並且總和不超過 100%

wb_position_ratio

/*
 * Compute dtc->pos_ratio: a fixed-point scaling factor (the unit value
 * is 1 << RATELIMIT_CALC_SHIFT) applied to a task's dirty throttle
 * rate.  It rises above 1 when dirty pages sit below their target
 * balance point (allow dirtying faster) and falls below 1 when above
 * it (throttle harder), steering the dirty page count toward the
 * setpoint.
 */
static void wb_position_ratio(struct dirty_throttle_control *dtc)
{
    struct bdi_writeback *wb = dtc->wb;
    /* Smoothed estimate of this device's write bandwidth. */
    unsigned long write_bw = wb->avg_write_bandwidth;
    /* Below freerun, tasks may dirty pages without any throttling. */
    unsigned long freerun = dirty_freerun_ceiling(dtc->thresh, dtc->bg_thresh);
    /* Hard ceiling on dirty pages; at/above it throttling is total. */
    unsigned long limit = hard_dirty_limit(dtc_dom(dtc), dtc->thresh);
    unsigned long wb_thresh = dtc->wb_thresh;
    unsigned long x_intercept;
    unsigned long setpoint;     /* dirty pages' target balance point */
    unsigned long wb_setpoint;
    unsigned long span;
    long long pos_ratio;        /* for scaling up/down the rate limit */
    long x;

    dtc->pos_ratio = 0;

    /* At or over the hard limit: leave pos_ratio 0 (full stop). */
    if (unlikely(dtc->dirty >= limit))
        return;

    /*
     * global setpoint
     *
     * See comment for pos_ratio_polynom().
     */
    /* Aim for the midpoint between the freerun and hard limits. */
    setpoint = (freerun + limit) / 2;
    pos_ratio = pos_ratio_polynom(setpoint, dtc->dirty, limit);

    /*
     * The strictlimit feature is a tool preventing mistrusted filesystems
     * from growing a large number of dirty pages before throttling. For
     * such filesystems balance_dirty_pages always checks wb counters
     * against wb limits. Even if global "nr_dirty" is under "freerun".
     * This is especially important for fuse which sets bdi->max_ratio to
     * 1% by default. Without strictlimit feature, fuse writeback may
     * consume arbitrary amount of RAM because it is accounted in
     * NR_WRITEBACK_TEMP which is not involved in calculating "nr_dirty".
     *
     * Here, in wb_position_ratio(), we calculate pos_ratio based on
     * two values: wb_dirty and wb_thresh. Let's consider an example:
     * total amount of RAM is 16GB, bdi->max_ratio is equal to 1%, global
     * limits are set by default to 10% and 20% (background and throttle).
     * Then wb_thresh is 1% of 20% of 16GB. This amounts to ~8K pages.
     * wb_calc_thresh(wb, bg_thresh) is about ~4K pages. wb_setpoint is
     * about ~6K pages (as the average of background and throttle wb
     * limits). The 3rd order polynomial will provide positive feedback if
     * wb_dirty is under wb_setpoint and vice versa.
     *
     * Note, that we cannot use global counters in these calculations
     * because we want to throttle process writing to a strictlimit wb
     * much earlier than global "freerun" is reached (~23MB vs. ~2.3GB
     * in the example above).
     */
    if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) {
        long long wb_pos_ratio;

        /* Almost no wb dirty pages: boost the rate, capped at 2x. */
        if (dtc->wb_dirty < 8) {
            dtc->pos_ratio = min_t(long long, pos_ratio * 2,
                       2 << RATELIMIT_CALC_SHIFT);
            return;
        }

        /* wb over its own threshold: keep pos_ratio 0 (hard stop). */
        if (dtc->wb_dirty >= wb_thresh)
            return;

        wb_setpoint = dirty_freerun_ceiling(wb_thresh,
                            dtc->wb_bg_thresh);

        /* Degenerate setpoint: no sane ratio can be computed. */
        if (wb_setpoint == 0 || wb_setpoint == wb_thresh)
            return;

        wb_pos_ratio = pos_ratio_polynom(wb_setpoint, dtc->wb_dirty,
                         wb_thresh);

        /*
         * Typically, for strictlimit case, wb_setpoint << setpoint
         * and pos_ratio >> wb_pos_ratio. In the other words global
         * state ("dirty") is not limiting factor and we have to
         * make decision based on wb counters. But there is an
         * important case when global pos_ratio should get precedence:
         * global limits are exceeded (e.g. due to activities on other
         * wb's) while given strictlimit wb is below limit.
         *
         * "pos_ratio * wb_pos_ratio" would work for the case above,
         * but it would look too non-natural for the case of all
         * activity in the system coming from a single strictlimit wb
         * with bdi->max_ratio == 100%.
         *
         * Note that min() below somewhat changes the dynamics of the
         * control system. Normally, pos_ratio value can be well over 3
         * (when globally we are at freerun and wb is well below wb
         * setpoint). Now the maximum pos_ratio in the same situation
         * is 2. We might want to tweak this if we observe the control
         * system is too slow to adapt.
         */
        dtc->pos_ratio = min(pos_ratio, wb_pos_ratio);
        return;
    }

    /*
     * We have computed basic pos_ratio above based on global situation. If
     * the wb is over/under its share of dirty pages, we want to scale
     * pos_ratio further down/up. That is done by the following mechanism.
     */

    /*
     * wb setpoint
     *
     *        f(wb_dirty) := 1.0 + k * (wb_dirty - wb_setpoint)
     *
     *                        x_intercept - wb_dirty
     *                     := --------------------------
     *                        x_intercept - wb_setpoint
     *
     * The main wb control line is a linear function that subjects to
     *
     * (1) f(wb_setpoint) = 1.0
     * (2) k = - 1 / (8 * write_bw)  (in single wb case)
     *     or equally: x_intercept = wb_setpoint + 8 * write_bw
     *
     * For single wb case, the dirty pages are observed to fluctuate
     * regularly within range
     *        [wb_setpoint - write_bw/2, wb_setpoint + write_bw/2]
     * for various filesystems, where (2) can yield in a reasonable 12.5%
     * fluctuation range for pos_ratio.
     *
     * For JBOD case, wb_thresh (not wb_dirty!) could fluctuate up to its
     * own size, so move the slope over accordingly and choose a slope that
     * yields 100% pos_ratio fluctuation on suddenly doubled wb_thresh.
     */
    if (unlikely(wb_thresh > dtc->thresh))
        wb_thresh = dtc->thresh;
    /*
     * It's very possible that wb_thresh is close to 0 not because the
     * device is slow, but that it has remained inactive for long time.
     * Honour such devices a reasonable good (hopefully IO efficient)
     * threshold, so that the occasional writes won't be blocked and active
     * writes can rampup the threshold quickly.
     */
    wb_thresh = max(wb_thresh, (limit - dtc->dirty) / 8);
    /*
     * scale global setpoint to wb's:
     *  wb_setpoint = setpoint * wb_thresh / thresh
     */
    /* "| 1" guards against a zero divisor when thresh is 0. */
    x = div_u64((u64)wb_thresh << 16, dtc->thresh | 1);
    wb_setpoint = setpoint * (u64)x >> 16;
    /*
     * Use span=(8*write_bw) in single wb case as indicated by
     * (thresh - wb_thresh ~= 0) and transit to wb_thresh in JBOD case.
     *
     *        wb_thresh                    thresh - wb_thresh
     * span = --------- * (8 * write_bw) + ------------------ * wb_thresh
     *         thresh                           thresh
     */
    span = (dtc->thresh - wb_thresh + 8 * write_bw) * (u64)x >> 16;
    x_intercept = wb_setpoint + span;

    /*
     * Inside the control line: scale pos_ratio down linearly as
     * wb_dirty approaches x_intercept; far beyond it, quarter it.
     */
    if (dtc->wb_dirty < x_intercept - span / 4) {
        pos_ratio = div64_u64(pos_ratio * (x_intercept - dtc->wb_dirty),
                      (x_intercept - wb_setpoint) | 1);
    } else
        pos_ratio /= 4;

    /*
     * wb reserve area, safeguard against dirty pool underrun and disk idle
     * It may push the desired control point of global dirty pages higher
     * than setpoint.
     */
    x_intercept = wb_thresh / 2;
    if (dtc->wb_dirty < x_intercept) {
        /* Boost inversely with wb_dirty, capped at 8x near zero. */
        if (dtc->wb_dirty > x_intercept / 8)
            pos_ratio = div_u64(pos_ratio * x_intercept,
                        dtc->wb_dirty);
        else
            pos_ratio *= 8;
    }

    dtc->pos_ratio = pos_ratio;
}

這段程式碼的主要功能是根據系統的寫回狀態(dirty pages),動態調整寫回任務的速率限制(pos_ratio),以實現髒頁寫回的平衡控制。具體來說,這段程式碼是針對不同情況下的髒頁寫回進行精細控制,尤其是在存在嚴格限制(strictlimit)和多個寫回裝置(例如JBOD配置)時,如何動態調整寫回速率,以避免過度的髒頁積累和系統資源耗盡。

初始化變數

  • write_bw:獲取當前裝置的平均寫頻寬。
  • freerun:計算髒頁自由流動的閾值。
  • limit:計算硬性髒頁限制(hard_dirty_limit)。
  • wb_thresh:從 dtc 中獲取寫回閾值。

計算 setpoint pos_ratio

  • setpoint 是目標髒頁平衡點(即想要達到的髒頁數量),它是 freerun 和 limit 的中間值。
  • pos_ratio 是位置比例,它是基於全域性髒頁數 (dtc->dirty) 和髒頁限制 (limit) 計算的,用來調節寫回速率。

處理 strictlimit 情況

  • 如果裝置支援 BDI_CAP_STRICTLIMIT(即嚴格限制),則在 dtc->wb_dirty(裝置的髒頁數)小於設定的閾值時,調整 pos_ratio,以避免髒頁積累過多,特別是對於不可信的檔案系統(如 FUSE 檔案系統),防止髒頁過多而不觸發寫回操作。
  • 在 strictlimit 情況下,pos_ratio 是透過與 wb_setpoint(寫回閾值)相關的值來進一步調整的。如果 wb_dirty 數量過少,pos_ratio 被加倍;如果超過了 wb_thresh,則停止調節。

處理非 strictlimit 情況

  • 對於普通情況(非 strictlimit),在全域性計算出的 pos_ratio 基礎上,再依據該裝置自身的髒頁數 (wb_dirty) 與其應得份額的偏差沿線性控制線進一步縮放:裝置髒頁超出其份額時調低,低於份額時調高;當 wb_dirty 超過 x_intercept - span/4 時,pos_ratio 直接除以 4。
  • 如果 wb_thresh 大於全域性閾值(dtc->thresh),則將 wb_thresh 調整為全域性閾值。
  • 根據裝置的寫頻寬和髒頁數量,計算一個新的 wb_setpoint 和 span,並使用這些值來動態調整 pos_ratio,使其更加平衡地反映系統當前的髒頁狀態。

動態調整 pos_ratio

  • 如果髒頁數接近設定的寫回閾值,則會根據裝置的寫頻寬和髒頁數進行比例縮放。具體來說,pos_ratio 會根據裝置的髒頁數與目標平衡點之間的差距進行調整,以便平滑地控制寫回速率。
  • 此外,如果裝置的髒頁數低於某個安全值(如 x_intercept),則進一步調整 pos_ratio,以避免寫回速率過低,保證系統的髒頁寫回操作不會完全停止。

__wb_update_bandwidth

/*
 * Refresh the wb's write-bandwidth estimate and, when
 * @update_ratelimit is set, the domain and per-wb dirty ratelimits.
 * Updates are rate-limited to once per BANDWIDTH_INTERVAL (~200ms).
 * Caller must hold wb->list_lock (enforced by lockdep below).
 */
static void __wb_update_bandwidth(struct dirty_throttle_control *gdtc,
                  struct dirty_throttle_control *mdtc,
                  unsigned long start_time,
                  bool update_ratelimit)
{
    struct bdi_writeback *wb = gdtc->wb;
    unsigned long now = jiffies;
    /* Jiffies elapsed since the last bandwidth update. */
    unsigned long elapsed = now - wb->bw_time_stamp;
    unsigned long dirtied;
    unsigned long written;

    lockdep_assert_held(&wb->list_lock);

    /*
     * rate-limit, only update once every 200ms.
     */
    if (elapsed < BANDWIDTH_INTERVAL)
        return;

    /* Cumulative page counters (approximate per-cpu reads). */
    dirtied = percpu_counter_read(&wb->stat[WB_DIRTIED]);
    written = percpu_counter_read(&wb->stat[WB_WRITTEN]);

    /*
     * Skip quiet periods when disk bandwidth is under-utilized.
     * (at least 1s idle time between two flusher runs)
     */
    if (elapsed > HZ && time_before(wb->bw_time_stamp, start_time))
        goto snapshot;

    if (update_ratelimit) {
        domain_update_bandwidth(gdtc, now);
        wb_update_dirty_ratelimit(gdtc, dirtied, elapsed);

        /*
         * @mdtc is always NULL if !CGROUP_WRITEBACK but the
         * compiler has no way to figure that out.  Help it.
         */
        if (IS_ENABLED(CONFIG_CGROUP_WRITEBACK) && mdtc) {
            domain_update_bandwidth(mdtc, now);
            wb_update_dirty_ratelimit(mdtc, dirtied, elapsed);
        }
    }
    wb_update_write_bandwidth(wb, elapsed, written);

snapshot:
    /* Record the baseline for the next bandwidth interval. */
    wb->dirtied_stamp = dirtied;
    wb->written_stamp = written;
    wb->bw_time_stamp = now;
}

__wb_update_bandwidth 函式的作用是更新與磁碟寫回頻寬相關的統計資訊,並進行速率限制管理。它主要負責根據過去的時間段,更新寫回的頻寬、髒頁寫回速率以及相關的時間戳等資料。以下是對函式各部分功能的詳細解釋:

  1. 獲取當前時間和計算時間間隔
unsigned long now = jiffies;
unsigned long elapsed = now - wb->bw_time_stamp;
  • now 獲取當前的系統時間(以 jiffies 為單位),elapsed 計算自上次更新以來的時間間隔。
  • 檢查時間間隔是否符合更新條件
if (elapsed < BANDWIDTH_INTERVAL)
    return;
  • 如果自上次更新以來的時間小於設定的 BANDWIDTH_INTERVAL(通常為 200ms),則直接返回,不更新頻寬資訊。這樣可以避免頻繁更新,減少效能開銷。
  • 讀取髒頁和寫入的資料
dirtied = percpu_counter_read(&wb->stat[WB_DIRTIED]);
written = percpu_counter_read(&wb->stat[WB_WRITTEN]);
  • 從 wb->stat 讀取當前裝置的髒頁(WB_DIRTIED)和已寫入的資料(WB_WRITTEN)。
  • 處理安靜期(磁碟空閒期)
if (elapsed > HZ && time_before(wb->bw_time_stamp, start_time))
    goto snapshot;
  • 如果距上次更新已超過 1 秒(elapsed &gt; HZ),且上次更新頻寬的時間戳 wb-&gt;bw_time_stamp 早於本次寫回的開始時間 start_time(代表兩次 flusher 執行之間磁碟處於安靜/空閒期),則跳過頻寬計算,直接跳到 snapshot 標籤只更新快照(不更新頻寬資料)。
  • 更新速率限制(如果需要)
if (update_ratelimit) {
    domain_update_bandwidth(gdtc, now);
    wb_update_dirty_ratelimit(gdtc, dirtied, elapsed);
    if (IS_ENABLED(CONFIG_CGROUP_WRITEBACK) && mdtc) {
        domain_update_bandwidth(mdtc, now);
        wb_update_dirty_ratelimit(mdtc, dirtied, elapsed);
    }
}
  • 如果

update_ratelimit

為真,更新寫回的速率限制:

  • 呼叫 domain_update_bandwidth 更新頻寬資訊。
  • 呼叫 wb_update_dirty_ratelimit 根據髒頁數和時間間隔更新髒頁寫回速率。
  • 如果啟用了 CGROUP_WRITEBACK 配置且 mdtc 不為空,則也更新 cgroup 的頻寬和速率。

6. 更新寫入頻寬資訊

wb_update_write_bandwidth(wb, elapsed, written);
  • 呼叫 wb_update_write_bandwidth 更新磁碟的寫入頻寬統計。

7. 儲存當前髒頁和已寫入資料的時間戳

snapshot:
wb->dirtied_stamp = dirtied;
wb->written_stamp = written;
wb->bw_time_stamp = now;
  • 將當前的髒頁數、已寫入資料量和時間戳儲存到 wb 結構中,以便下次更新時使用。

總結:

__wb_update_bandwidth 函式負責在一定時間間隔後,更新磁碟裝置的頻寬統計資訊,並計算和更新髒頁寫回的速率。它確保在磁碟空閒時跳過更新,並且根據配置選擇是否更新速率限制(例如,針對 cgroup 的寫回)。該函式透過時間戳和頻寬資訊來控制寫回速率,避免頻繁更新頻寬資訊。

相關文章