linux記憶體管理(六)- 核心新struct - folio

半山随笔發表於2024-06-11

folio大概是5.16引入的,看起來像是page的封裝,這裡有一篇講解folio很好的部落格,論好名字的重要性: Linux核心page到folio的變遷-CSDN部落格

/*
 * struct folio - as listed here, three unions laid out one after another.
 *
 * Each union overlays a group of named fields with one complete
 * struct page (page, __page_1 and __page_2 respectively).
 * The "private:" / "public:" marker comments inside the body are kept
 * verbatim from the original listing.
 */
struct folio {
    /* private: don't document the anon union */
    /* First slot: the named fields below share storage with `page`. */
    union {
        struct {
    /* public: */
            unsigned long flags;
            union {
                struct list_head lru;
    /* private: avoid cluttering the output */
                struct {
                    void *__filler;
    /* public: */
                    unsigned int mlock_count;
    /* private: */
                };
    /* public: */
            };
            struct address_space *mapping;
            pgoff_t index;
            /* `private` and `swap` are alternatives sharing one slot. */
            union {
                void *private;
                swp_entry_t swap;
            };
            atomic_t _mapcount;
            atomic_t _refcount;
#ifdef CONFIG_MEMCG
            unsigned long memcg_data;
#endif
#if defined(WANT_PAGE_VIRTUAL)
            void *virtual;
#endif
#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
            int _last_cpupid;
#endif
    /* private: the union with struct page is transitional */
        };
        struct page page;
    };
    /*
     * Second slot: shares storage with `__page_1`.
     * The folio-based compound_order() shown in this file returns the
     * low 8 bits of _flags_1.
     */
    union {
        struct {
            unsigned long _flags_1;
            unsigned long _head_1;
            unsigned long _folio_avail;
    /* public: */
            atomic_t _entire_mapcount;
            atomic_t _nr_pages_mapped;
            atomic_t _pincount;
#ifdef CONFIG_64BIT
            unsigned int _folio_nr_pages;
#endif
    /* private: the union with struct page is transitional */
        };
        struct page __page_1;
    };
    /*
     * Third slot: shares storage with `__page_2`. Two alternative field
     * groups are overlaid here: the _hugetlb_* pointers, or
     * _deferred_list.
     */
    union {
        struct {
            unsigned long _flags_2;
            unsigned long _head_2;
    /* public: */
            void *_hugetlb_subpool;
            void *_hugetlb_cgroup;
            void *_hugetlb_cgroup_rsvd;
            void *_hugetlb_hwpoison;
    /* private: the union with struct page is transitional */
        };
        struct {
            unsigned long _flags_2a;
            unsigned long _head_2a;
    /* public: */
            struct list_head _deferred_list;
    /* private: the union with struct page is transitional */
        };
        struct page __page_2;
    };
};

簡單來看它似乎是三個page結構的組合,其中第一個union的結構與page結構幾乎一致。引入folio是為了解決長久以來page語義混亂的問題。page除了可以代表單頁,也可以代表連續多個頁面,甚至大頁。page在核心中應用廣泛,這種語義上的混亂增加了寫程式碼和理解程式碼的難度,屬於人為增加的混亂。folio代表一個或多個page,本身就可以表達page所有的語義。在新的核心程式碼中folio在很多場合完成了對page的替代,但是page依然存在。

比如compound_order的實現。在folio之前是這樣的。

/*
 * compound_order - pre-folio variant.
 *
 * Returns the compound_order value stored in the page that follows @page
 * (page[1]) when PageHead(@page) is true, and 0 otherwise.
 */
static inline unsigned int compound_order(struct page *page)
{
    if (PageHead(page))
        return page[1].compound_order;

    /* Not a head page: report order 0. */
    return 0;
}

先檢查page是不是單頁,如果是單頁直接返回0;對於複合頁,order儲存在後一個page(即page[1])的compound_order成員中。也就是說,單個page結構表示不了多頁,但是folio可以。

/*
 * compound_order - folio-based variant.
 *
 * Reinterprets @page as a struct folio. When PG_head is set in the folio's
 * flags, the result is the low 8 bits of _flags_1; otherwise it is 0.
 */
static inline unsigned int compound_order(struct page *page)
{
    struct folio *as_folio = (struct folio *)page;

    if (test_bit(PG_head, &as_folio->flags))
        return as_folio->_flags_1 & 0xff;

    /* PG_head clear: report order 0. */
    return 0;
}

新的程式碼中首先將page強轉為folio,判斷其是否為複合頁;如果是複合頁,order儲存在_flags_1的低8位中。看起來並沒有簡化,反而更複雜一點,但是可以在一個folio結構中解決問題,不再依賴於tail page。

看一下page結構

/*
 * struct page - layout as shown in this listing, in order:
 *   1. flags;
 *   2. a five-word union whose alternatives are per-use views of the same
 *      storage (page cache / anonymous pages, page_pool, tail pages of a
 *      compound page, ZONE_DEVICE pages, rcu_head);
 *   3. a 4-byte union of _mapcount and page_type;
 *   4. _refcount;
 *   5. fields that exist only under specific configuration macros
 *      (CONFIG_MEMCG, WANT_PAGE_VIRTUAL, LAST_CPUPID_NOT_IN_PAGE_FLAGS,
 *      CONFIG_KMSAN).
 */
struct page {
    unsigned long flags;        /* Atomic flags, some possibly
                     * updated asynchronously */
    /*
     * Five words (20/40 bytes) are available in this union.
     * WARNING: bit 0 of the first word is used for PageTail(). That
     * means the other users of this union MUST NOT use the bit to
     * avoid collision and false-positive PageTail().
     */
    union {
        struct {    /* Page cache and anonymous pages */
            /**
             * @lru: Pageout list, eg. active_list protected by
             * lruvec->lru_lock.  Sometimes used as a generic list
             * by the page owner.
             */
            union {
                struct list_head lru;

                /* Or, for the Unevictable "LRU list" slot */
                struct {
                    /* Always even, to negate PageTail */
                    void *__filler;
                    /* Count page's or folio's mlocks */
                    unsigned int mlock_count;
                };

                /* Or, free page */
                struct list_head buddy_list;
                struct list_head pcp_list;
            };
            /* See page-flags.h for PAGE_MAPPING_FLAGS */
            struct address_space *mapping;
            union {
                pgoff_t index;        /* Our offset within mapping. */
                unsigned long share;    /* share count for fsdax */
            };
            /**
             * @private: Mapping-private opaque data.
             * Usually used for buffer_heads if PagePrivate.
             * Used for swp_entry_t if PageSwapCache.
             * Indicates order in the buddy system if PageBuddy.
             */
            unsigned long private;
        };
        struct {    /* page_pool used by netstack */
            /**
             * @pp_magic: magic value to avoid recycling non
             * page_pool allocated pages.
             */
            unsigned long pp_magic;
            struct page_pool *pp;
            unsigned long _pp_mapping_pad;
            unsigned long dma_addr;
            atomic_long_t pp_ref_count;
        };
        struct {    /* Tail pages of compound page */
            unsigned long compound_head;    /* Bit zero is set */
        };
        struct {    /* ZONE_DEVICE pages */
            /** @pgmap: Points to the hosting device page map. */
            struct dev_pagemap *pgmap;
            void *zone_device_data;
            /*
             * ZONE_DEVICE private pages are counted as being
             * mapped so the next 3 words hold the mapping, index,
             * and private fields from the source anonymous or
             * page cache page while the page is migrated to device
             * private memory.
             * ZONE_DEVICE MEMORY_DEVICE_FS_DAX pages also
             * use the mapping, index, and private fields when
             * pmem backed DAX files are mapped.
             */
        };

        /** @rcu_head: You can use this to free a page by RCU. */
        struct rcu_head rcu_head;
    };

    union {        /* This union is 4 bytes in size. */
        /*
         * If the page can be mapped to userspace, encodes the number
         * of times this page is referenced by a page table.
         */
        atomic_t _mapcount;

        /*
         * If the page is neither PageSlab nor mappable to userspace,
         * the value stored here may help determine what this page
         * is used for.  See page-flags.h for a list of page types
         * which are currently stored here.
         */
        unsigned int page_type;
    };

    /* Usage count. *DO NOT USE DIRECTLY*. See page_ref.h */
    atomic_t _refcount;

#ifdef CONFIG_MEMCG
    unsigned long memcg_data;
#endif

    /*
     * On machines where all RAM is mapped into kernel address space,
     * we can simply calculate the virtual address. On machines with
     * highmem some memory is mapped into kernel virtual memory
     * dynamically, so we need a place to store that address.
     * Note that this field could be 16 bits on x86 ... ;)
     *
     * Architectures with slow multiplication can define
     * WANT_PAGE_VIRTUAL in asm/page.h
     */
#if defined(WANT_PAGE_VIRTUAL)
    void *virtual;            /* Kernel virtual address (NULL if
                       not kmapped, ie. highmem) */
#endif /* WANT_PAGE_VIRTUAL */

#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
    int _last_cpupid;
#endif

#ifdef CONFIG_KMSAN
    /*
     * KMSAN metadata for this page:
     *  - shadow page: every bit indicates whether the corresponding
     *    bit of the original page is initialized (0) or not (1);
     *  - origin page: every 4 bytes contain an id of the stack trace
     *    where the uninitialized value was created.
     */
    struct page *kmsan_shadow;
    struct page *kmsan_origin;
#endif
/*
 * NOTE(review): _struct_page_alignment is not defined in this listing;
 * presumably a macro supplying an alignment attribute - confirm against
 * the header this was copied from.
 */
} _struct_page_alignment;

1. flags

/*
 * enum pageflags - excerpt only; the trailing "..." marks enumerators
 * omitted from this listing.
 *
 * No explicit values are assigned, so the enumerators count up from 0 in
 * declaration order. That places PG_head at 6 and PG_waiters at 7, which
 * is what the position notes on those two entries require.
 */
enum pageflags {
    PG_locked,        /* Page is locked. Don't touch. */
    PG_writeback,        /* Page is under writeback */
    PG_referenced,
    PG_uptodate,
    PG_dirty,
    PG_lru,
    PG_head,        /* Must be in bit 6 */
    PG_waiters,        /* Page has waiters, check its waitqueue. Must be bit #7 and in the same byte as "PG_locked" */
    PG_active,
    PG_workingset,
    PG_error,
    PG_slab,
    PG_owner_priv_1,    /* Owner use. If pagecache, fs may use*/
    PG_arch_1,
    PG_reserved,
    PG_private,        /* If pagecache, has fs-private data */
    PG_private_2,        /* If pagecache, has fs aux data */
    PG_mappedtodisk,    /* Has blocks allocated on-disk */
    PG_reclaim,        /* To be reclaimed asap */
    PG_swapbacked,        /* Page is backed by RAM/swap */
    PG_unevictable,
...

flags由四部分構成:|node|zone|last_cpupid|flags|

2. mapping

最低兩個bits可以用來判斷是否為匿名對映或ksm對映。對於匿名對映,mapping指向anon_vma;對於file對映,則指向address_space結構。

/*
 * Flag bits carried in the two lowest bits of the mapping pointer value.
 * folio_mapping() in this file masks folio->mapping with
 * PAGE_MAPPING_FLAGS and returns NULL when any of these bits is set.
 *
 * PAGE_MAPPING_KSM and PAGE_MAPPING_FLAGS expand to the same value (0x3):
 * the former names the "both bits set" state, the latter is the mask
 * covering both bits.
 */
#define PAGE_MAPPING_ANON    0x1
#define PAGE_MAPPING_MOVABLE    0x2
#define PAGE_MAPPING_KSM    (PAGE_MAPPING_ANON | PAGE_MAPPING_MOVABLE)
#define PAGE_MAPPING_FLAGS    (PAGE_MAPPING_ANON | PAGE_MAPPING_MOVABLE)

3. _refcount

表示頁面在核心中的引用次數。大於0代表正在使用。

/*
 * get_page - thin wrapper around folio_get().
 *
 * Resolves @page to its folio with page_folio() and passes that folio to
 * folio_get().
 */
static inline void get_page(struct page *page)
{
    struct folio *folio = page_folio(page);

    folio_get(folio);
}

/*
 * folio_get - increment the reference count of @folio.
 *
 * First passes folio_ref_zero_or_close_to_overflow(folio) to
 * VM_BUG_ON_FOLIO(), then performs the increment through folio_ref_inc().
 */
static inline void folio_get(struct folio *folio)
{
    /* Sanity check before touching the count. */
    VM_BUG_ON_FOLIO(folio_ref_zero_or_close_to_overflow(folio), folio);
    folio_ref_inc(folio);
}

/*
 * folio_ref_inc - forward to page_ref_inc() on the struct page embedded
 * in @folio (its `page` member).
 */
static inline void folio_ref_inc(struct folio *folio)
{
    struct page *pg = &folio->page;

    page_ref_inc(pg);
}

/*
 * page_ref_inc - atomically increment page->_refcount.
 *
 * After the increment, if page_ref_tracepoint_active(page_ref_mod)
 * evaluates true, __page_ref_mod(page, 1) is called as well.
 */
static inline void page_ref_inc(struct page *page)
{
    atomic_inc(&page->_refcount);
    /* The second argument matches the increment of 1 applied above. */
    if (page_ref_tracepoint_active(page_ref_mod))
        __page_ref_mod(page, 1);
}

folio讓page操作變得非常繁瑣,這樣真的好嗎?

分配記憶體時_refcount + 1, 加入lru連結串列時+1等。

4. _mapcount

表示這個頁面被程序對映的次數,用做反向對映。-1代表沒有頁表對映。

page相關的API

/*
 * page_zone - return a pointer to the zone entry for @page.
 *
 * Selects the node data via NODE_DATA(page_to_nid(page)) and steps
 * page_zonenum(page) entries into that node's node_zones array.
 */
static inline struct zone *page_zone(const struct page *page)
{
        return NODE_DATA(page_to_nid(page))->node_zones + page_zonenum(page);
}
/*
 * page_zone_id - extract a field from page->flags.
 *
 * Shifts flags right by ZONEID_PGSHIFT and keeps the bits selected by
 * ZONEID_MASK.
 */
static inline int page_zone_id(struct page *page)
{
        unsigned long shifted = page->flags >> ZONEID_PGSHIFT;

        return shifted & ZONEID_MASK;
}

mapping相關

/*
 * page_mapping - thin wrapper around folio_mapping().
 *
 * Resolves @page to its folio with page_folio() and returns whatever
 * folio_mapping() yields for that folio.
 */
struct address_space *page_mapping(struct page *page)
{
    struct folio *folio = page_folio(page);

    return folio_mapping(folio);
}

/*
 * folio_mapping - return the address_space associated with @folio, or NULL.
 *
 * Results, in the order they are checked:
 *   - NULL when folio_test_slab(folio) is true;
 *   - swap_address_space(folio->swap) when folio_test_swapcache(folio) is
 *     true;
 *   - NULL when folio->mapping has any PAGE_MAPPING_FLAGS bit set
 *     (e.g. anonymous or KSM pages);
 *   - folio->mapping otherwise.
 */
struct address_space *folio_mapping(struct folio *folio)
{
    struct address_space *mapping;

    /* This happens if someone calls flush_dcache_page on slab page */
    if (unlikely(folio_test_slab(folio)))
        return NULL;

    if (unlikely(folio_test_swapcache(folio)))
        return swap_address_space(folio->swap);

    mapping = folio->mapping;

    /* Flag bits set in the pointer value: report NULL instead. */
    return ((unsigned long)mapping & PAGE_MAPPING_FLAGS) ? NULL : mapping;
}

page_mapped

/*
 * page_mapped - report whether @page is mapped.
 *
 * When PageCompound(@page) is false, the answer is whether
 * page->_mapcount is >= 0. Otherwise the question is delegated to
 * folio_large_is_mapped() on the page's folio.
 */
static inline bool page_mapped(struct page *page)
{
    bool not_compound = !PageCompound(page);

    if (likely(not_compound))
        return atomic_read(&page->_mapcount) >= 0;

    return folio_large_is_mapped(page_folio(page));
}

對於普通頁面只需判斷_mapcount值。

相關文章