個人筆記,謹慎觀看.
先看看vmalloc是怎麼實現的。它能在非連續的實體記憶體之上建立連續的虛擬記憶體對映。這裡有一篇可供參考的部落格:《Linux記憶體管理 (6) vmalloc》- ArnoldLu - 部落格園 (cnblogs.com)。
呼叫鏈:vmalloc -> __vmalloc_node -> __vmalloc_node_range
void *__vmalloc_node(unsigned long size, unsigned long align, gfp_t gfp_mask, int node, const void *caller) { return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END, gfp_mask, PAGE_KERNEL, 0, node, caller); } * Map them into contiguous kernel virtual space, using a pagetable * protection of @prot. * * Return: the address of the area or %NULL on failure */ void *__vmalloc_node_range(unsigned long size, unsigned long align, unsigned long start, unsigned long end, gfp_t gfp_mask, pgprot_t prot, unsigned long vm_flags, int node, const void *caller) { if ((size >> PAGE_SHIFT) > totalram_pages()) { warn_alloc(gfp_mask, NULL, "vmalloc error: size %lu, exceeds total pages", real_size); return NULL; } size_per_node = size; if (node == NUMA_NO_NODE) size_per_node /= num_online_nodes();
//分配並初始化一個vm_struct area = __get_vm_area_node(real_size, align, shift, VM_ALLOC | VM_UNINITIALIZED | vm_flags, start, end, node, gfp_mask, caller); /* Allocate physical pages and map them into vmalloc space. */ ret = __vmalloc_area_node(area, gfp_mask, prot, shift, node); if (!ret) goto fail; return area->addr; ... }
totalram_pages是一個儲存系統總可用記憶體頁的全域性變數。__get_vm_area_node分配一個vm_struct並初始化,這個結構描述了要分配的vmalloc。
/*
 * __get_vm_area_node (abridged excerpt)
 *
 * Allocates a vm_struct, obtains a vmap_area for the requested size within
 * [start, end), and ties the two together via setup_vmalloc_vm().
 *
 * Returns the vm_struct. Error handling is not shown in this excerpt.
 */
static struct vm_struct *__get_vm_area_node(unsigned long size,
        unsigned long align, unsigned long shift, unsigned long flags,
        unsigned long start, unsigned long end, int node,
        gfp_t gfp_mask, const void *caller)
{
    /* NOTE: the declarations of `area` and `va` are not shown in this excerpt. */

    BUG_ON(in_interrupt());

    /* Align the size to the page size given by shift (1 << shift bytes). */
    size = ALIGN(size, 1ul << shift);

    /* Allocate a vm_struct. */
    area = kzalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node);

    /* Unless VM_NO_GUARD is set, extend the range by one extra page. */
    if (!(flags & VM_NO_GUARD))
        size += PAGE_SIZE;

    /* Allocate a vmap_area. */
    va = alloc_vmap_area(size, align, start, end, node, gfp_mask, 0);

    /* Attach the vmap_area to the vm_struct. */
    setup_vmalloc_vm(area, va, flags, caller);

    return area;
}
這裡涉及到倆結構體。vm_struct, vmap_area.
/*
 * Descriptor of one vmalloc area (per the surrounding notes).
 * Field notes below are limited to what the quoted functions show.
 */
struct vm_struct {
    struct vm_struct *next;
    void *addr;                 /* address handed back to the vmalloc caller */
    unsigned long size;
    unsigned long flags;
    struct page **pages;        /* array of page pointers backing the area */
#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC
    unsigned int page_order;    /* only present with CONFIG_HAVE_ARCH_HUGE_VMALLOC */
#endif
    unsigned int nr_pages;      /* set from vm_area_alloc_pages()'s return value */
    phys_addr_t phys_addr;
    const void *caller;
};
描述vmalloc區域。
/*
 * Describes the virtual address range [va_start, va_end) of a vmap area
 * and links it into an address-sorted rbtree and list.
 */
struct vmap_area {
    unsigned long va_start;
    unsigned long va_end;

    struct rb_node rb_node;         /* address sorted rbtree */
    struct list_head list;          /* address sorted list */

    /*
     * The following two variables can be packed, because
     * a vmap_area object can be either:
     *    1) in "free" tree (root is free_vmap_area_root)
     *    2) or "busy" tree (root is vmap_area_root)
     */
    union {
        unsigned long subtree_max_size; /* in "free" tree */
        struct vm_struct *vm;           /* in "busy" tree */
    };
    unsigned long flags; /* mark type of vm_map_ram area */
};
也用來描述vmalloc的那個區域,主要描述區域的範圍,並且連結到一個全域性rbtree上。alloc_vmap_area會找到當前地址最低的一個空閒區域。
__vmalloc_area_node是核心函式,分配實體記憶體,建立對映。
/*
 * __vmalloc_area_node (abridged excerpt)
 *
 * Core of vmalloc: allocates the array of page pointers, allocates the
 * physical pages, and maps them at area->addr.
 *
 * Returns area->addr. Failure paths are not shown in this excerpt.
 */
static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
                                 pgprot_t prot, unsigned int page_shift,
                                 int node)
{
    const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
    bool nofail = gfp_mask & __GFP_NOFAIL;
    unsigned long addr = (unsigned long)area->addr;
    unsigned long size = get_vm_area_size(area);
    unsigned long array_size;
    unsigned int nr_small_pages = size >> PAGE_SHIFT;
    unsigned int page_order;
    unsigned int flags;
    int ret;

    /* Bytes needed to hold one struct page pointer per small page. */
    array_size = (unsigned long)nr_small_pages * sizeof(struct page *);

    /*
     * area->pages holds the array of page pointers. When that array is
     * larger than one page it is itself allocated through the vmalloc
     * path, i.e. recursively.
     */
    /* Please note that the recursion is strictly bounded. */
    if (array_size > PAGE_SIZE) {
        area->pages = __vmalloc_node(array_size, 1, nested_gfp, node,
                                     area->caller);
    } else {
        area->pages = kmalloc_node(array_size, nested_gfp, node);
    }

    /*
     * Author's note: without CONFIG_HAVE_ARCH_HUGE_VMALLOC the order is 0.
     */
    set_vm_area_page_order(area, page_shift - PAGE_SHIFT);
    page_order = vm_area_page_order(area);

    /*
     * Allocate the physical pages. They need not be contiguous, so in the
     * common case pages are obtained one at a time, yielding nr_pages
     * separate pages.
     */
    area->nr_pages = vm_area_alloc_pages(gfp_mask | __GFP_NOWARN,
                                         node, page_order,
                                         nr_small_pages, area->pages);

    /*
     * nr_vmalloc_pages is presumably the global count of pages allocated
     * through vmalloc.
     */
    atomic_long_add(area->nr_pages, &nr_vmalloc_pages);

    /*
     * Pages are allocated; now create the mapping. With __GFP_NOFAIL the
     * mapping is retried until it succeeds.
     */
    do {
        ret = vmap_pages_range(addr, addr + size, prot, area->pages,
                               page_shift);
        if (nofail && (ret < 0))
            schedule_timeout_uninterruptible(1);
    } while (nofail && (ret < 0));

    return area->addr;
}
/*
 * vmap_pages_range - map @pages into the virtual range [addr, end), then
 * flush the cache for that range.
 *
 * Returns the result of vmap_pages_range_noflush(). The flush is performed
 * regardless of that result.
 */
static int vmap_pages_range(unsigned long addr, unsigned long end,
                            pgprot_t prot, struct page **pages,
                            unsigned int page_shift)
{
    int err;

    /* Map the scattered physical pages into the contiguous virtual range. */
    err = vmap_pages_range_noflush(addr, end, prot, pages, page_shift);

    /* The mapping step is done; flush the cache for the range. */
    flush_cache_vmap(addr, end);

    return err;
}
/*
 * vmap_pages_range_noflush - run the KMSAN hook, then do the real mapping.
 *
 * If kmsan_vmap_pages_range_noflush() returns non-zero, that value is
 * returned and no mapping is attempted. Otherwise the result of
 * __vmap_pages_range_noflush() is returned.
 */
int vmap_pages_range_noflush(unsigned long addr, unsigned long end,
                             pgprot_t prot, struct page **pages,
                             unsigned int page_shift)
{
    int ret = kmsan_vmap_pages_range_noflush(addr, end, prot, pages,
                                             page_shift);

    if (ret)
        return ret;
    return __vmap_pages_range_noflush(addr, end, prot, pages, page_shift);
}
忽略kmsan相關的操作,直接看看__vmap_pages_range_noflush.
/*
 * __vmap_pages_range_noflush (abridged excerpt)
 *
 * Walks [addr, end) in steps of (1 << page_shift) bytes and maps
 * each step to the physical address of the corresponding entry in @pages.
 *
 * Returns 0 on success, or the first non-zero error from
 * vmap_range_noflush().
 */
int __vmap_pages_range_noflush(unsigned long addr, unsigned long end,
                               pgprot_t prot, struct page **pages,
                               unsigned int page_shift)
{
    unsigned int i, nr = (end - addr) >> PAGE_SHIFT;

    /* ... (lines elided in this excerpt) ... */

    /* i counts small pages, so it advances by the number of them per step. */
    for (i = 0; i < nr; i += 1U << (page_shift - PAGE_SHIFT)) {
        int err;

        /* Map a single page (of size 1 << page_shift) per iteration. */
        err = vmap_range_noflush(addr, addr + (1UL << page_shift),
                                 page_to_phys(pages[i]), prot,
                                 page_shift);
        if (err)
            return err;

        addr += 1UL << page_shift;
    }

    return 0;
}
/*
 * vmap_range_noflush (abridged excerpt)
 *
 * Walks the kernel page tables from the PGD level for [addr, end),
 * delegating each PGD-sized chunk to vmap_p4d_range() and advancing
 * phys_addr by the same amount as addr.
 *
 * Returns the last value produced by vmap_p4d_range(); the loop stops at
 * the first non-zero one.
 */
static int vmap_range_noflush(unsigned long addr, unsigned long end,
                              phys_addr_t phys_addr, pgprot_t prot,
                              unsigned int max_page_shift)
{
    /* ... (declarations and preliminary lines elided in this excerpt) ... */

    start = addr;
    pgd = pgd_offset_k(addr);
    do {
        next = pgd_addr_end(addr, end);

        /* The familiar page-table construction logic starts here. */
        err = vmap_p4d_range(pgd, addr, next, phys_addr, prot,
                             max_page_shift, &mask);
        if (err)
            break;
    } while (pgd++, phys_addr += (next - addr), addr = next, addr != end);

    /* ... (lines elided in this excerpt) ... */

    return err;
}
從程式碼可以看到,vmalloc分配的頁面虛擬地址連續而物理頁面不連續;分配邏輯複雜,而且只能按page分配。因此,相對於可以分配連續物理頁面和小塊記憶體的kmalloc,vmalloc比較耗時,只適合用於較大記憶體的分配。