This post covers only physical memory management in the Linux kernel. I have read plenty of material explaining memory management, but whenever I tried to summarize it myself I never knew where to start, so let's begin with the actual physical memory allocation interfaces.
kmalloc allocates physically contiguous memory and does not zero it. How large a single allocation can be is bounded by KMALLOC_MAX_SIZE (see kmalloc_sizes.h below). By default it allocates from low memory, i.e. ZONE_NORMAL; with GFP_DMA it can draw from ZONE_DMA, but it never hands out ZONE_HIGHMEM pages. In the usual configuration physical memory is divided into three zones: DMA, NORMAL and HIGHMEM.
kmalloc is built on top of the slab allocator: it allocates from a kmem_cache, and the cache in turn hands out objects (objs) from its slabs.
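As a quick illustration of the interface (this snippet is mine and not part of the kernel code analyzed below; the example_* names are made up), a typical caller looks like this:

#include <linux/slab.h>

static char *buf;

static int example_alloc(void)
{
	/* 128 physically contiguous bytes; NOT zeroed - use kzalloc() if zeroing is needed */
	buf = kmalloc(128, GFP_KERNEL);
	if (!buf)
		return -ENOMEM;
	return 0;
}

static void example_free(void)
{
	kfree(buf);
	buf = NULL;
}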
Before we start analyzing the kmalloc function, a quick overview of the Linux kernel's physical memory allocation APIs:
__get_free_pages() calls alloc_pages(); it cannot allocate from ZONE_HIGHMEM and it returns 2^order physically contiguous pages as a kernel virtual address. Its single-page form is __get_free_page(), and get_zeroed_page() additionally zeroes the returned page. To allocate from the DMA zone there is __get_dma_pages(), which is also just a wrapper around __get_free_pages().
Finally, the lowest-level interface, alloc_pages(), can allocate from any zone, provided the corresponding flags are set.
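A sketch of how these page-level interfaces are typically used (my own example, not taken from the sources analyzed below):

#include <linux/gfp.h>

static int example_pages(void)
{
	unsigned long addr;

	/* 2^2 = 4 contiguous low-memory pages, returned as a kernel virtual address */
	addr = __get_free_pages(GFP_KERNEL, 2);
	if (!addr)
		return -ENOMEM;
	free_pages(addr, 2);

	/* a single page whose contents are zero-filled */
	addr = get_zeroed_page(GFP_KERNEL);
	if (!addr)
		return -ENOMEM;
	free_page(addr);

	return 0;
}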
Reference kernel: linux 3.18.13
Reference books: Linux Kernel Development, Linux Device Drivers, and 《深入理解linux裝置驅動核心機制》.
Now, on to kmalloc. (The GFP flags used at allocation time are not discussed here; see the references for details.)
Let's start with the header file:
#include <linux/slab.h>
Which implementation is picked is decided in slab.h:
#ifdef CONFIG_SLUB
#include <linux/slub_def.h>
#elif defined(CONFIG_SLOB)
#include <linux/slob_def.h>
#else
#include <linux/slab_def.h>
#endif
Here we follow the classic SLAB configuration, i.e. #include <linux/slab_def.h>, which defines kmalloc() as follows:
static __always_inline void *kmalloc(size_t size, gfp_t flags)
{
	struct kmem_cache *cachep;
	void *ret;

	if (__builtin_constant_p(size)) {
		int i = 0;

		if (!size)
			return ZERO_SIZE_PTR;

#define CACHE(x) \
		if (size <= x) \
			goto found; \
		else \
			i++;
#include <linux/kmalloc_sizes.h>	/* find which size range the request falls in; sizes start at 32 and keep doubling, i is incremented at each step */
#undef CACHE
		return NULL;
found:
#ifdef CONFIG_ZONE_DMA
		if (flags & GFP_DMA)
			cachep = malloc_sizes[i].cs_dmacachep;	/* with CONFIG_ZONE_DMA and GFP_DMA set, allocate from the DMA cache; malloc_sizes[] is initialized in slab.c and is worth a closer look */
		else
#endif
			cachep = malloc_sizes[i].cs_cachep;	/* allocate from the matching general cache, wasting as little space as possible */

		ret = kmem_cache_alloc_trace(cachep, flags, size);

		return ret;
	}
	return __kmalloc(size, flags);
}
For completeness, here is kmalloc_sizes.h:
#if (PAGE_SIZE == 4096)
	CACHE(32)
#endif
	CACHE(64)
#if L1_CACHE_BYTES < 64
	CACHE(96)
#endif
	CACHE(128)
#if L1_CACHE_BYTES < 128
	CACHE(192)
#endif
	CACHE(256)
	CACHE(512)
	CACHE(1024)
	CACHE(2048)
	CACHE(4096)
	CACHE(8192)
	CACHE(16384)
	CACHE(32768)
	CACHE(65536)
	CACHE(131072)
#if KMALLOC_MAX_SIZE >= 262144
	CACHE(262144)
#endif
#if KMALLOC_MAX_SIZE >= 524288
	CACHE(524288)
#endif
#if KMALLOC_MAX_SIZE >= 1048576
	CACHE(1048576)
#endif
#if KMALLOC_MAX_SIZE >= 2097152
	CACHE(2097152)
#endif
#if KMALLOC_MAX_SIZE >= 4194304
	CACHE(4194304)
#endif
#if KMALLOC_MAX_SIZE >= 8388608
	CACHE(8388608)
#endif
#if KMALLOC_MAX_SIZE >= 16777216
	CACHE(16777216)
#endif
#if KMALLOC_MAX_SIZE >= 33554432
	CACHE(33554432)
#endif
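To see how the CACHE(x) expansion picks a size class, here is a toy userspace re-implementation of mine (the class table is simplified and assumed; it omits the conditional 96/192-byte caches): the first class with x >= size wins, so kmalloc(100) is served from the 128-byte cache.

#include <stdio.h>

static const unsigned int classes[] = {
	32, 64, 128, 256, 512, 1024, 2048, 4096
};

static unsigned int size_class(unsigned int size)
{
	unsigned int i;

	for (i = 0; i < sizeof(classes) / sizeof(classes[0]); i++)
		if (size <= classes[i])
			return classes[i];	/* first class large enough */
	return 0;				/* larger requests: handled by the bigger caches above */
}

int main(void)
{
	printf("kmalloc(100) -> %u-byte cache\n", size_class(100));	/* 128 */
	printf("kmalloc(513) -> %u-byte cache\n", size_class(513));	/* 1024 */
	return 0;
}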
The beginning of the function deserves a few remarks:
__builtin_constant_p is a GCC built-in that tests whether a value is a compile-time constant: it returns 1 if it is and 0 otherwise. Its typical use is manual compile-time optimization inside macros. So when size is a compile-time constant, kmalloc walks the size table inline and picks a cache directly; only when size is not a constant does it fall back to __kmalloc(size, flags).
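A small userspace sketch of my own showing what __builtin_constant_p() evaluates to (the result for the variable case depends on the optimization level):

#include <stdio.h>

int main(void)
{
	int n = 100;

	/* a literal is a compile-time constant: prints 1 */
	printf("%d\n", __builtin_constant_p(128));
	/* a variable is usually not: typically prints 0 (may be 1 if the
	 * compiler propagates the constant at higher optimization levels) */
	printf("%d\n", __builtin_constant_p(n));
	return 0;
}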
Having found which general-purpose cache the requested size falls into, kmalloc then calls kmem_cache_alloc_trace():
#ifdef CONFIG_TRACING
extern void *kmem_cache_alloc_trace(struct kmem_cache *, gfp_t, size_t);
#else
static __always_inline void *
kmem_cache_alloc_trace(struct kmem_cache *cachep, gfp_t flags, size_t size)
{
	return kmem_cache_alloc(cachep, flags);
}
#endif
As the #else branch shows, without CONFIG_TRACING this simply forwards to kmem_cache_alloc(), so let's look at that:
/**
 * kmem_cache_alloc - Allocate an object
 * @cachep: The cache to allocate from.
 * @flags: See kmalloc().
 *
 * Allocate an object from this cache.  The flags are only relevant
 * if the cache has no available objects.
 */
void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
{
	void *ret = slab_alloc(cachep, flags, _RET_IP_);

	trace_kmem_cache_alloc(_RET_IP_, ret,	/* used for tracing/debugging */
			       cachep->object_size, cachep->size, flags);

	return ret;
}
The actual allocation is done by slab_alloc():
static __always_inline void *
slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller)
{
	unsigned long save_flags;
	void *objp;

	flags &= gfp_allowed_mask;	/* see the comment on gfp_allowed_mask in gfp.h, quoted here: */
	/*
	 * gfp_allowed_mask is set to GFP_BOOT_MASK during early boot to restrict
	 * what GFP flags are used before interrupts are enabled. Once interrupts
	 * are enabled, it is set to __GFP_BITS_MASK while the system is running.
	 * During hibernation, it is used by PM to avoid I/O during memory
	 * allocation while devices are suspended.
	 *
	 *	extern gfp_t gfp_allowed_mask;
	 */

	lockdep_trace_alloc(flags);	/* debugging only */

	if (slab_should_failslab(cachep, flags))
		return NULL;

	cachep = memcg_kmem_get_cache(cachep, flags);

	cache_alloc_debugcheck_before(cachep, flags);
	local_irq_save(save_flags);
	objp = __do_cache_alloc(cachep, flags);
	local_irq_restore(save_flags);
	objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller);
	kmemleak_alloc_recursive(objp, cachep->object_size, 1, cachep->flags,
				 flags);
	prefetchw(objp);

	if (likely(objp))
		kmemcheck_slab_alloc(cachep, flags, objp, cachep->object_size);

	if (unlikely((flags & __GFP_ZERO) && objp))
		memset(objp, 0, cachep->object_size);

	return objp;
}
slab_alloc() calls objp = __do_cache_alloc(cachep, flags), which, besides a few flag and debug checks, goes on to call ____cache_alloc(cachep, flags).
This is the common allocation path (there are NUMA and UMA variants; Linux takes the UMA path unless NUMA is configured):
static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
{
	void *objp;
	struct array_cache *ac;
	bool force_refill = false;

	check_irq_off();

	ac = cpu_cache_get(cachep);
	if (likely(ac->avail)) {
		ac->touched = 1;
		objp = ac_get_obj(cachep, ac, flags, false);

		/*
		 * Allow for the possibility all avail objects are not allowed
		 * by the current flags
		 */
		if (objp) {
			STATS_INC_ALLOCHIT(cachep);
			goto out;
		}
		force_refill = true;
	}

	STATS_INC_ALLOCMISS(cachep);
	objp = cache_alloc_refill(cachep, flags, force_refill);
	/*
	 * the 'ac' may be updated by cache_alloc_refill(),
	 * and kmemleak_erase() requires its correct value.
	 */
	ac = cpu_cache_get(cachep);

out:
	/*
	 * To avoid a false negative, if an object that is in one of the
	 * per-CPU caches is leaked, we need to make sure kmemleak doesn't
	 * treat the array pointers as a reference to the object.
	 */
	if (objp)
		kmemleak_erase(&ac->entry[ac->avail]);
	return objp;
}
Here we assume this is the first allocation from the cache. Based on how malloc_sizes[] is initialized in kmem_cache_init(), the kmalloc_cache pointer returned to kmalloc points to a cache that was set up by the following function:
static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
{
	if (slab_state >= FULL)
		return enable_cpucache(cachep, gfp);

	if (slab_state == DOWN) {
		/*
		 * Note: Creation of first cache (kmem_cache).
		 * The setup_list3s is taken care
		 * of by the caller of __kmem_cache_create
		 */
		cachep->array[smp_processor_id()] = &initarray_generic.cache;
		slab_state = PARTIAL;
	} else if (slab_state == PARTIAL) {
		/*
		 * Note: the second kmem_cache_create must create the cache
		 * that's used by kmalloc(24), otherwise the creation of
		 * further caches will BUG().
		 */
		cachep->array[smp_processor_id()] = &initarray_generic.cache;

		/*
		 * If the cache that's used by kmalloc(sizeof(kmem_list3)) is
		 * the second cache, then we need to set up all its list3s,
		 * otherwise the creation of further caches will BUG().
		 */
		set_up_list3s(cachep, SIZE_AC);
		if (INDEX_AC == INDEX_L3)
			slab_state = PARTIAL_L3;
		else
			slab_state = PARTIAL_ARRAYCACHE;
	} else {
		/* Remaining boot caches */
		cachep->array[smp_processor_id()] =
			kmalloc(sizeof(struct arraycache_init), gfp);

		if (slab_state == PARTIAL_ARRAYCACHE) {
			set_up_list3s(cachep, SIZE_L3);
			slab_state = PARTIAL_L3;
		} else {
			int node;
			for_each_online_node(node) {
				cachep->nodelists[node] =
				    kmalloc_node(sizeof(struct kmem_list3),
						gfp, node);
				BUG_ON(!cachep->nodelists[node]);
				kmem_list3_init(cachep->nodelists[node]);
			}
		}
	}
	cachep->nodelists[numa_mem_id()]->next_reap =
			jiffies + REAPTIMEOUT_LIST3 +
			((unsigned long)cachep) % REAPTIMEOUT_LIST3;

	cpu_cache_get(cachep)->avail = 0;
	cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES;	/* BOOT_CPUCACHE_ENTRIES == 1 */
	cpu_cache_get(cachep)->batchcount = 1;
	cpu_cache_get(cachep)->touched = 0;
	cachep->batchcount = 1;
	cachep->limit = BOOT_CPUCACHE_ENTRIES;
	return 0;
}
Whatever the per-CPU array (cachep->array) ends up pointing to, its avail, limit, batchcount and touched fields are always initialized at the end of setup_cpu_cache().
Back in ____cache_alloc(): if the per-CPU array has no available object (ac->avail == 0), cache_alloc_refill() is called; if it does have one, the object pointer is returned directly from the array.
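Conceptually, the per-CPU array_cache is a LIFO stack of object pointers; the following toy sketch of mine (simplified, not kernel code) captures the fast path that ____cache_alloc() takes when ac->avail is non-zero:

#include <stddef.h>

struct toy_array_cache {
	unsigned int avail;		/* number of cached object pointers */
	unsigned int limit;		/* capacity of entry[] */
	void *entry[16];		/* most recently freed objects sit on top */
};

static void *toy_fast_alloc(struct toy_array_cache *ac)
{
	if (ac->avail)
		return ac->entry[--ac->avail];	/* hit: pop the top pointer */
	return NULL;				/* miss: the real code calls cache_alloc_refill() */
}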
static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags,
							bool force_refill)
{
	int batchcount;
	struct kmem_list3 *l3;
	struct array_cache *ac;
	int node;

	check_irq_off();
	node = numa_mem_id();
	if (unlikely(force_refill))
		goto force_grow;
retry:
	ac = cpu_cache_get(cachep);
	batchcount = ac->batchcount;
	if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
		/*
		 * If there was little recent activity on this cache, then
		 * perform only a partial refill.  Otherwise we could generate
		 * refill bouncing.
		 */
		batchcount = BATCHREFILL_LIMIT;
	}
	l3 = cachep->nodelists[node];

	BUG_ON(ac->avail > 0 || !l3);
	spin_lock(&l3->list_lock);

	/* See if we can refill from the shared array */
	if (l3->shared && transfer_objects(ac, l3->shared, batchcount)) {
		l3->shared->touched = 1;
		goto alloc_done;
	}

	while (batchcount > 0) {
		struct list_head *entry;
		struct slab *slabp;
		/* Get slab alloc is to come from. */
		entry = l3->slabs_partial.next;
		if (entry == &l3->slabs_partial) {
			l3->free_touched = 1;
			entry = l3->slabs_free.next;
			if (entry == &l3->slabs_free)
				goto must_grow;
		}

		slabp = list_entry(entry, struct slab, list);
		check_slabp(cachep, slabp);
		check_spinlock_acquired(cachep);

		/*
		 * The slab was either on partial or free list so
		 * there must be at least one object available for
		 * allocation.
		 */
		BUG_ON(slabp->inuse >= cachep->num);

		while (slabp->inuse < cachep->num && batchcount--) {
			STATS_INC_ALLOCED(cachep);
			STATS_INC_ACTIVE(cachep);
			STATS_SET_HIGH(cachep);

			ac_put_obj(cachep, ac, slab_get_obj(cachep, slabp,
								node));
		}
		check_slabp(cachep, slabp);

		/* move slabp to correct slabp list: */
		list_del(&slabp->list);
		if (slabp->free == BUFCTL_END)
			list_add(&slabp->list, &l3->slabs_full);
		else
			list_add(&slabp->list, &l3->slabs_partial);
	}

must_grow:
	l3->free_objects -= ac->avail;
alloc_done:
	spin_unlock(&l3->list_lock);

	if (unlikely(!ac->avail)) {
		int x;
force_grow:
		x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL);	/* cache_grow() returns 1 on success */

		/* cache_grow can reenable interrupts, then ac could change. */
		ac = cpu_cache_get(cachep);
		node = numa_mem_id();

		/* no objects in sight? abort */
		if (!x && (ac->avail == 0 || force_refill))
			return NULL;

		if (!ac->avail)		/* objects refilled by interrupt? */
			goto retry;
	}
	ac->touched = 1;

	return ac_get_obj(cachep, ac, flags, force_refill);
}
Since this is the first use, all the slab lists on the nodelist are empty, so we hit must_grow and call cache_grow().
cache_grow() first computes the slab colour offset, then calls kmem_getpages() to allocate pages, the count being determined by cachep->gfporder; kmem_getpages() returns the virtual address of the allocated pages.
/*
 * Grow (by 1) the number of slabs within a cache.  This is called by
 * kmem_cache_alloc() when there are no active objs left in a cache.
 */
static int cache_grow(struct kmem_cache *cachep,
		gfp_t flags, int nodeid, void *objp)
{
	struct slab *slabp;
	size_t offset;
	gfp_t local_flags;
	struct kmem_list3 *l3;

	/*
	 * Be lazy and only check for valid flags here,  keeping it out of the
	 * critical path in kmem_cache_alloc().
	 */
	BUG_ON(flags & GFP_SLAB_BUG_MASK);
	local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);

	/* Take the l3 list lock to change the colour_next on this node */
	check_irq_off();
	l3 = cachep->nodelists[nodeid];
	spin_lock(&l3->list_lock);

	/* Get colour for the slab, and cal the next value. */
	offset = l3->colour_next;		/* default 0 */
	l3->colour_next++;
	if (l3->colour_next >= cachep->colour)
		l3->colour_next = 0;
	spin_unlock(&l3->list_lock);

	offset *= cachep->colour_off;		/* the first time, offset is 0 */
	if (local_flags & __GFP_WAIT)
		local_irq_enable();

	/*
	 * The test for missing atomic flag is performed here, rather than
	 * the more obvious place, simply to reduce the critical path length
	 * in kmem_cache_alloc(). If a caller is seriously mis-behaving they
	 * will eventually be caught here (where it matters).
	 */
	kmem_flagcheck(cachep, flags);

	/*
	 * Get mem for the objs.  Attempt to allocate a physical page from
	 * 'nodeid'.
	 */
	if (!objp)
		objp = kmem_getpages(cachep, local_flags, nodeid);
	if (!objp)
		goto failed;

	/* Get slab management. */
	slabp = alloc_slabmgmt(cachep, objp, offset,
			local_flags & ~GFP_CONSTRAINT_MASK, nodeid);
	if (!slabp)
		goto opps1;

	slab_map_pages(cachep, slabp, objp);

	cache_init_objs(cachep, slabp);

	if (local_flags & __GFP_WAIT)
		local_irq_disable();
	check_irq_off();
	spin_lock(&l3->list_lock);

	/* Make slab active. */
	list_add_tail(&slabp->list, &(l3->slabs_free));	/* add the newly created slab to the node's slabs_free list */
	STATS_INC_GROWN(cachep);
	l3->free_objects += cachep->num;	/* account for the new free objects: each slab holds cachep->num objs */
	spin_unlock(&l3->list_lock);
	return 1;
opps1:
	kmem_freepages(cachep, objp);
failed:
	if (local_flags & __GFP_WAIT)
		local_irq_disable();
	return 0;
}
Slab colouring is about the hardware cache: by giving each slab a slightly different starting offset (a cache-line issue), it tries to avoid conflict misses and improve hit rates. See 《深入理解計算機系統》 (Computer Systems: A Programmer's Perspective) for the background.
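To make the colouring arithmetic concrete, here is an illustrative calculation of mine (the colour_off and colour values are assumed, not taken from a real cache): successive slabs shift their first object by one cache line each, wrapping around exactly as cache_grow() does with l3->colour_next.

#include <stdio.h>

int main(void)
{
	unsigned int colour_off = 32;	/* assumed L1 cache line size */
	unsigned int colour = 4;	/* assumed: left_over / colour_off */
	unsigned int colour_next = 0;
	int slab;

	for (slab = 0; slab < 6; slab++) {
		unsigned int offset = colour_next * colour_off;

		printf("slab %d: first object at offset %u\n", slab, offset);
		if (++colour_next >= colour)
			colour_next = 0;	/* wrap, as cache_grow() does */
	}
	return 0;
}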
The concrete steps are in alloc_slabmgmt():
/*
 * Get the memory for a slab management obj.
 * For a slab cache when the slab descriptor is off-slab, slab descriptors
 * always come from malloc_sizes caches.  The slab descriptor cannot
 * come from the same cache which is getting created because,
 * when we are searching for an appropriate cache for these
 * descriptors in kmem_cache_create, we search through the malloc_sizes array.
 * If we are creating a malloc_sizes cache here it would not be visible to
 * kmem_find_general_cachep till the initialization is complete.
 * Hence we cannot have slabp_cache same as the original cache.
 */
static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp,
				   int colour_off, gfp_t local_flags,
				   int nodeid)
{
	struct slab *slabp;

	if (OFF_SLAB(cachep)) {
		/*
		 * On the OFF_SLAB question: CFLGS_OFF_SLAB is decided in
		 * __kmem_cache_create():
		 *
		 *	// Determine if the slab management is 'on' or 'off' slab.
		 *	// (bootstrapping cannot cope with offslab caches so don't do
		 *	// it too early on. Always use on-slab management when
		 *	// SLAB_NOLEAKTRACE to avoid recursive calls into kmemleak)
		 *	if ((size >= (PAGE_SIZE >> 3)) && !slab_early_init &&
		 *	    !(flags & SLAB_NOLEAKTRACE))
		 *		// Size is large, assume best to place the slab
		 *		// management obj off-slab (should allow better
		 *		// packing of objs).
		 *		flags |= CFLGS_OFF_SLAB;
		 */
		/* Slab management obj is off-slab. */
		slabp = kmem_cache_alloc_node(cachep->slabp_cache,
					      local_flags, nodeid);
		/*
		 * If the first object in the slab is leaked (it's allocated
		 * but no one has a reference to it), we want to make sure
		 * kmemleak does not treat the ->s_mem pointer as a reference
		 * to the object. Otherwise we will not report the leak.
		 */
		kmemleak_scan_area(&slabp->list, sizeof(struct list_head),
				   local_flags);
		if (!slabp)
			return NULL;
	} else {
		slabp = objp + colour_off;
		/*
		 * In __kmem_cache_create():
		 *	cachep->colour_off = cache_line_size();
		 *	(cache.h: #define cache_line_size() L1_CACHE_BYTES, typically 32 bytes)
		 *	cachep->colour = left_over / cachep->colour_off;
		 */
		colour_off += cachep->slab_size;
	}
	slabp->inuse = 0;			/* num of objs active in slab */
	slabp->colouroff = colour_off;		/* offset of the first obj relative to the page address */
	slabp->s_mem = objp + colour_off;	/* address of the first obj */
	slabp->nodeid = nodeid;
	slabp->free = 0;
	return slabp;
}
Now let's look at another important operation, cache_init_objs():
static void cache_init_objs(struct kmem_cache *cachep,
			    struct slab *slabp)
{
	int i;

	for (i = 0; i < cachep->num; i++) {
		void *objp = index_to_obj(cachep, slabp, i);
#if DEBUG
		/* need to poison the objs? */
		if (cachep->flags & SLAB_POISON)
			poison_obj(cachep, objp, POISON_FREE);
		if (cachep->flags & SLAB_STORE_USER)
			*dbg_userword(cachep, objp) = NULL;

		if (cachep->flags & SLAB_RED_ZONE) {
			*dbg_redzone1(cachep, objp) = RED_INACTIVE;
			*dbg_redzone2(cachep, objp) = RED_INACTIVE;
		}
		/*
		 * Constructors are not allowed to allocate memory from the same
		 * cache which they are a constructor for.  Otherwise, deadlock.
		 * They must also be threaded.
		 */
		if (cachep->ctor && !(cachep->flags & SLAB_POISON))
			cachep->ctor(objp + obj_offset(cachep));

		if (cachep->flags & SLAB_RED_ZONE) {
			if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
				slab_error(cachep, "constructor overwrote the"
					   " end of an object");
			if (*dbg_redzone1(cachep, objp) != RED_INACTIVE)
				slab_error(cachep, "constructor overwrote the"
					   " start of an object");
		}
		if ((cachep->size % PAGE_SIZE) == 0 &&
			    OFF_SLAB(cachep) && cachep->flags & SLAB_POISON)
			kernel_map_pages(virt_to_page(objp),
					 cachep->size / PAGE_SIZE, 0);
#else
		if (cachep->ctor)
			cachep->ctor(objp);	/* initialize the object with its constructor, if any */
#endif
		slab_bufctl(slabp)[i] = i + 1;	/* init the bufctl array to 1, 2, 3, ...; the last entry becomes BUFCTL_END below */
	}
	slab_bufctl(slabp)[i - 1] = BUFCTL_END;
}
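After cache_init_objs() runs, slab_bufctl(slabp)[i] holds the index of the next free object, so with slabp->free starting at 0 the free objects form the chain 0 -> 1 -> 2 -> ... -> num-1 -> BUFCTL_END. Below is a toy sketch of mine of how an object index is taken off that list (in the kernel the real work is done by slab_get_obj() in slab.c):

#define TOY_BUFCTL_END	((unsigned int)~0U)	/* stand-in for BUFCTL_END */

/* pop the first free object index and advance the free-list head */
static unsigned int toy_get_free_obj(unsigned int *bufctl, unsigned int *free)
{
	unsigned int idx = *free;

	if (idx != TOY_BUFCTL_END)
		*free = bufctl[idx];	/* the next free index becomes the new head */
	return idx;
}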