linux記憶體管理(二)- vmalloc

半山随笔發表於2024-06-11

個人筆記,謹慎觀看.

先看看vmalloc是怎麼實現的。它能在不連續的實體記憶體之上建立連續的虛擬記憶體對映。可參考這篇部落格:《Linux記憶體管理 (6)vmalloc》- ArnoldLu - 部落格園 (cnblogs.com)。

呼叫鏈:vmalloc -> __vmalloc_node -> __vmalloc_node_range

/*
 * Thin wrapper around __vmalloc_node_range(): forwards @size, @align,
 * @gfp_mask, @node and @caller unchanged, and fixes the remaining
 * arguments to the address range VMALLOC_START..VMALLOC_END, the page
 * protection PAGE_KERNEL and vm_flags 0.
 *
 * Returns whatever __vmalloc_node_range() returns.
 */
void *__vmalloc_node(unsigned long size, unsigned long align,
                gfp_t gfp_mask, int node, const void *caller)
{
    return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END,
                gfp_mask, PAGE_KERNEL, 0, node, caller);
}

 * Map them into contiguous kernel virtual space, using a pagetable
 * protection of @prot.
 *
 * Return: the address of the area or %NULL on failure
 */
void *__vmalloc_node_range(unsigned long size, unsigned long align,
            unsigned long start, unsigned long end, gfp_t gfp_mask,
            pgprot_t prot, unsigned long vm_flags, int node,
            const void *caller)
{
    if ((size >> PAGE_SHIFT) > totalram_pages()) {
        warn_alloc(gfp_mask, NULL,
            "vmalloc error: size %lu, exceeds total pages",
            real_size);
        return NULL;
    }

        size_per_node = size;
        if (node == NUMA_NO_NODE)
            size_per_node /= num_online_nodes();

//分配並初始化一個vm_struct area
= __get_vm_area_node(real_size, align, shift, VM_ALLOC | VM_UNINITIALIZED | vm_flags, start, end, node, gfp_mask, caller); /* Allocate physical pages and map them into vmalloc space. */ ret = __vmalloc_area_node(area, gfp_mask, prot, shift, node); if (!ret) goto fail; return area->addr; ... }

totalram_pages()回傳系統總可用的記憶體頁數(由一個全域性計數器維護)。__get_vm_area_node分配一個vm_struct並初始化,這個結構描述了要分配的vmalloc區域。

/*
 * Allocate a vm_struct and a vmap_area for a request of @size bytes in the
 * address window @start..@end, link the two together with
 * setup_vmalloc_vm(), and return the vm_struct.
 *
 * Must not be called from interrupt context (BUG_ON(in_interrupt())).
 *
 * NOTE(review): this excerpt shows neither the declarations of `area` and
 * `va` nor any failure checks on the two allocations.
 */
static struct vm_struct *__get_vm_area_node(unsigned long size,
        unsigned long align, unsigned long shift, unsigned long flags,
        unsigned long start, unsigned long end, int node,
        gfp_t gfp_mask, const void *caller)
{
    BUG_ON(in_interrupt());
    /* Align size to 1 << shift (the page size being used). */
    size = ALIGN(size, 1ul << shift);
    /* Allocate the vm_struct. */
    area = kzalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node);
    /* Unless VM_NO_GUARD is set, add PAGE_SIZE (presumably a guard page). */
    if (!(flags & VM_NO_GUARD))
        size += PAGE_SIZE;
    /* Allocate a vmap_area for the (possibly enlarged) size. */
    va = alloc_vmap_area(size, align, start, end, node, gfp_mask, 0);
    /* Attach the vmap_area to the vm_struct. */
    setup_vmalloc_vm(area, va, flags, caller);
    return area;
}

這裡涉及到兩個結構體:vm_struct 和 vmap_area。

/*
 * Descriptor of one vmalloc area.  The field notes below are limited to
 * what the functions quoted in this article demonstrate.
 */
struct vm_struct {
    struct vm_struct    *next;
    /* Address handed back to the caller by __vmalloc_node_range(). */
    void            *addr;
    unsigned long        size;
    unsigned long        flags;
    /* Array of page pointers, allocated and filled in __vmalloc_area_node(). */
    struct page        **pages;
#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC
    /* Only present when CONFIG_HAVE_ARCH_HUGE_VMALLOC is defined. */
    unsigned int        page_order;
#endif
    /* Set from the return value of vm_area_alloc_pages(). */
    unsigned int        nr_pages;
    phys_addr_t        phys_addr;
    /* Passed through from the allocation entry points. */
    const void        *caller;
};

描述vmalloc區域。

/*
 * Describes the virtual address range (va_start, va_end) of an area and
 * carries the rbtree node and list entry used to keep areas sorted by
 * address.  The union member in use depends on which tree the object is
 * currently in.
 */
struct vmap_area {
    unsigned long va_start;
    unsigned long va_end;

    struct rb_node rb_node;         /* address sorted rbtree */
    struct list_head list;          /* address sorted list */

    /*
     * The following two variables can be packed, because
     * a vmap_area object can be either:
     *    1) in "free" tree (root is free_vmap_area_root)
     *    2) or "busy" tree (root is vmap_area_root)
     */
    union {
        unsigned long subtree_max_size; /* in "free" tree */
        struct vm_struct *vm;           /* in "busy" tree */
    };
    unsigned long flags; /* mark type of vm_map_ram area */
};

vmap_area也用來描述同一個vmalloc區域,主要記錄區域的位址範圍,並且連結到全域性的rbtree上。alloc_vmap_area會找到當前地址最低的一個空閒區域。

__vmalloc_area_node是核心函式,分配實體記憶體,建立對映。

/*
 * Core of vmalloc: allocate the physical pages backing @area and map them
 * at the area's virtual address range.
 *
 * @area:       vm_struct describing the virtual range to populate
 * @gfp_mask:   allocation flags; __GFP_NOFAIL makes the mapping step retry
 * @prot:       page protection used for the mapping
 * @page_shift: shift of the page size used for the mapping
 * @node:       NUMA node passed to the allocators
 *
 * Returns area->addr.
 *
 * NOTE(review): this excerpt shows no failure checks after the allocations
 * or after the mapping loop.
 */
static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
                 pgprot_t prot, unsigned int page_shift,
                 int node)
{
    const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
    bool nofail = gfp_mask & __GFP_NOFAIL;
    unsigned long addr = (unsigned long)area->addr;
    unsigned long size = get_vm_area_size(area);
    unsigned long array_size;
    unsigned int nr_small_pages = size >> PAGE_SHIFT;
    unsigned int page_order;
    unsigned int flags;
    int ret;

    /* Size of the memory needed to hold the page pointers. */
    array_size = (unsigned long)nr_small_pages * sizeof(struct page *);

    /*
     * The memory for area->pages may itself be allocated recursively via
     * __vmalloc_node(); pages holds the array of page pointers.
     */
    /* Please note that the recursion is strictly bounded. */
    if (array_size > PAGE_SIZE) {
        area->pages = __vmalloc_node(array_size, 1, nested_gfp, node,
                    area->caller);
    } else {
        area->pages = kmalloc_node(array_size, nested_gfp, node);
    }

    /*
     * Author's note: when CONFIG_HAVE_ARCH_HUGE_VMALLOC is not enabled the
     * order is 0.
     */
    set_vm_area_page_order(area, page_shift - PAGE_SHIFT);
    page_order = vm_area_page_order(area);

    /*
     * Allocate the pages.  Physically non-contiguous pages are wanted, so
     * in most cases one page is obtained at a time, yielding nr_pages
     * non-contiguous pages.
     */
    area->nr_pages = vm_area_alloc_pages(gfp_mask | __GFP_NOWARN,
        node, page_order, nr_small_pages, area->pages);

    /*
     * nr_vmalloc_pages is presumably the global counter of pages allocated
     * through vmalloc.
     */
    atomic_long_add(area->nr_pages, &nr_vmalloc_pages);

    do {
        /*
         * The pages are allocated; now set up the mapping.  When failure
         * is not allowed (nofail), keep retrying until it succeeds.
         */
        ret = vmap_pages_range(addr, addr + size, prot, area->pages,
            page_shift);
        if (nofail && (ret < 0))
            schedule_timeout_uninterruptible(1);
    } while (nofail && (ret < 0));

    return area->addr;
}
/*
 * Map @pages at the virtual range starting at @addr and ending at @end,
 * then flush the cache for that range.
 *
 * Returns the result of vmap_pages_range_noflush(); the flush is performed
 * regardless of that result.
 */
static int vmap_pages_range(unsigned long addr, unsigned long end,
        pgprot_t prot, struct page **pages, unsigned int page_shift)
{
    int err;

    /* Map the scattered physical pages at contiguous virtual addresses. */
    err = vmap_pages_range_noflush(addr, end, prot, pages, page_shift);
    /* The pages are mapped; flush the cache for the range. */
    flush_cache_vmap(addr, end);
    return err;
}
/*
 * Run the KMSAN hook kmsan_vmap_pages_range_noflush() first; if it returns
 * non-zero, return that value.  Otherwise hand the same arguments to
 * __vmap_pages_range_noflush() and return its result.
 */
int vmap_pages_range_noflush(unsigned long addr, unsigned long end,
        pgprot_t prot, struct page **pages, unsigned int page_shift)
{
    int ret = kmsan_vmap_pages_range_noflush(addr, end, prot, pages,
                         page_shift);

    if (ret)
        return ret;
    return __vmap_pages_range_noflush(addr, end, prot, pages, page_shift);
}

忽略kmsan相關的操作,直接看看__vmap_pages_range_noflush.

/*
 * Map the pages in @pages one mapping unit at a time, starting at @addr.
 * Each iteration maps 1 << page_shift bytes and advances the index into
 * @pages by 1 << (page_shift - PAGE_SHIFT).
 *
 * Returns 0 on success, or the first non-zero value returned by
 * vmap_range_noflush().
 */
int __vmap_pages_range_noflush(unsigned long addr, unsigned long end,
        pgprot_t prot, struct page **pages, unsigned int page_shift)
{
    unsigned int i, nr = (end - addr) >> PAGE_SHIFT;
...
    for (i = 0; i < nr; i += 1U << (page_shift - PAGE_SHIFT)) {
        int err;

        /* Map a single page (of size 1 << page_shift) per iteration. */
        err = vmap_range_noflush(addr, addr + (1UL << page_shift),
                    page_to_phys(pages[i]), prot,
                    page_shift);
        if (err)
            return err;

        addr += 1UL << page_shift;
    }

    return 0;
}
/*
 * Walk the kernel page-global-directory entries covering @addr up to @end
 * and call vmap_p4d_range() for each one, advancing @phys_addr by the same
 * amount as @addr.  Stops at the first error.
 *
 * Returns the last value returned by vmap_p4d_range().
 *
 * NOTE(review): the lines marked with the author's elision markers, which
 * include the local declarations, are not part of this excerpt.
 */
static int vmap_range_noflush(unsigned long addr, unsigned long end,
            phys_addr_t phys_addr, pgprot_t prot,
            unsigned int max_page_shift)
{
。。。
    start = addr;
    pgd = pgd_offset_k(addr);
    do {
        next = pgd_addr_end(addr, end);
        /* Finally, the familiar page-table construction logic. */
        err = vmap_p4d_range(pgd, addr, next, phys_addr, prot,
                    max_page_shift, &mask);
        if (err)
            break;
    } while (pgd++, phys_addr += (next - addr), addr = next, addr != end);
。。。

    return err;
}

從程式碼可以看到,vmalloc分配的頁面是虛擬地址連續而物理頁面不連續的,分配邏輯複雜,只能按page分配。因此相對於可以分配連續物理頁面和小記憶體的kmalloc,vmalloc比較耗時,只適用於較大的記憶體。

相關文章