I had brushed against cache-related code here and there before, and it always felt a bit mysterious; the whole point of these caches, of course, is to make allocating and freeing memory more efficient. When you look at meminfo or slabinfo, do you really have a clear picture of the memory machinery behind them?
Reference kernel: Linux 3.8.13.
Let's start with the function that calls the initialization:
```c
/*
 * Set up kernel memory allocators
 */
static void __init mm_init(void)
{
	/*
	 * page_cgroup requires contiguous pages,
	 * bigger than MAX_ORDER unless SPARSEMEM.
	 */
	page_cgroup_init_flatmem();
	mem_init();
	kmem_cache_init();
	percpu_init_late();
	pgtable_cache_init();
	vmalloc_init();
}
```
This function is called from start_kernel(). Below we look at kmem_cache_init(), the initialization of the default slab allocator.
```c
/*
 * Initialisation.  Called after the page allocator have been initialised and
 * before smp_init().
 */
void __init kmem_cache_init(void)
{
	struct cache_sizes *sizes;
	struct cache_names *names;
	int i;

	kmem_cache = &kmem_cache_boot;
	setup_nodelists_pointer(kmem_cache);
	/*
	 * As for why this pointer needs setting up, I found a patch that
	 * explains it:
	 *
	 * From 3c58346525d82625e68e24f071804c2dc057b6f4 Mon Sep 17 00:00:00 2001
	 * From: Christoph Lameter <cl@linux.com>
	 * Date: Wed, 28 Nov 2012 16:23:01 +0000
	 * Subject: [PATCH] slab: Simplify bootstrap
	 *
	 * The nodelists field in kmem_cache is pointing to the first unused
	 * object in the array field when bootstrap is complete.
	 *
	 * A problem with the current approach is that the statically sized
	 * kmem_cache structure use on boot can only contain NR_CPUS entries.
	 * If the number of nodes plus the number of cpus is greater then we
	 * would overwrite memory following the kmem_cache_boot definition.
	 *
	 * Increase the size of the array field to ensure that also the node
	 * pointers fit into the array field. Once we do that we no longer
	 * need the kmem_cache_nodelists array and we can then also use that
	 * structure elsewhere.
	 *
	 * Acked-by: Glauber Costa <glommer@parallels.com>
	 * Signed-off-by: Christoph Lameter <cl@linux.com>
	 * Signed-off-by: Pekka Enberg <penberg@kernel.org>
	 */

	if (num_possible_nodes() == 1)
		use_alien_caches = 0;

	for (i = 0; i < NUM_INIT_LISTS; i++)
		kmem_list3_init(&initkmem_list3[i]);

	set_up_list3s(kmem_cache, CACHE_CACHE);

	/*
	 * Fragmentation resistance on low memory - only use bigger
	 * page orders on machines with more than 32MB of memory if
	 * not overridden on the command line.
	 */
	if (!slab_max_order_set && totalram_pages > (32 << 20) >> PAGE_SHIFT)
		slab_max_order = SLAB_MAX_ORDER_HI;

	/* Bootstrap is tricky, because several objects are allocated
	 * from caches that do not exist yet:
	 * 1) initialize the kmem_cache cache: it contains the struct
	 *    kmem_cache structures of all caches, except kmem_cache itself:
	 *    kmem_cache is statically allocated.
	 *    Initially an __init data area is used for the head array and the
	 *    kmem_list3 structures, it's replaced with a kmalloc allocated
	 *    array at the end of the bootstrap.
	 * 2) Create the first kmalloc cache.
	 *    The struct kmem_cache for the new cache is allocated normally.
	 *    An __init data area is used for the head array.
	 * 3) Create the remaining kmalloc caches, with minimally sized
	 *    head arrays.
	 * 4) Replace the __init data head arrays for kmem_cache and the first
	 *    kmalloc cache with kmalloc allocated arrays.
	 * 5) Replace the __init data for kmem_list3 for kmem_cache and
	 *    the other cache's with kmalloc allocated memory.
	 * 6) Resize the head arrays of the kmalloc caches to their final sizes.
	 */

	/* 1) create the kmem_cache */

	/*
	 * struct kmem_cache size depends on nr_node_ids & nr_cpu_ids
	 */
	create_boot_cache(kmem_cache, "kmem_cache",
		offsetof(struct kmem_cache, array[nr_cpu_ids]) +
				  nr_node_ids * sizeof(struct kmem_list3 *),
				  SLAB_HWCACHE_ALIGN);
	/* after creating kmem_cache, add it to the global slab_caches list */
	list_add(&kmem_cache->list, &slab_caches);

	/* 2+3) create the kmalloc caches */
	sizes = malloc_sizes;
	names = cache_names;

	/*
	 * Initialize the caches that provide memory for the array cache and the
	 * kmem_list3 structures first.  Without this, further allocations will
	 * bug.
	 */

	sizes[INDEX_AC].cs_cachep = create_kmalloc_cache(names[INDEX_AC].name,
					sizes[INDEX_AC].cs_size, ARCH_KMALLOC_FLAGS);

	if (INDEX_AC != INDEX_L3)
		sizes[INDEX_L3].cs_cachep =
			create_kmalloc_cache(names[INDEX_L3].name,
				sizes[INDEX_L3].cs_size, ARCH_KMALLOC_FLAGS);

	slab_early_init = 0;

	while (sizes->cs_size != ULONG_MAX) {
		/*
		 * For performance, all the general caches are L1 aligned.
		 * This should be particularly beneficial on SMP boxes, as it
		 * eliminates "false sharing".
		 * Note for systems short on memory removing the alignment will
		 * allow tighter packing of the smaller caches.
		 */
		if (!sizes->cs_cachep)
			sizes->cs_cachep = create_kmalloc_cache(names->name,
					sizes->cs_size, ARCH_KMALLOC_FLAGS);

#ifdef CONFIG_ZONE_DMA
		sizes->cs_dmacachep = create_kmalloc_cache(
			names->name_dma, sizes->cs_size,
			SLAB_CACHE_DMA|ARCH_KMALLOC_FLAGS);
#endif
		sizes++;
		names++;
	}
	/* 4) Replace the bootstrap head arrays */
	{
		struct array_cache *ptr;

		ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT);

		memcpy(ptr, cpu_cache_get(kmem_cache),
		       sizeof(struct arraycache_init));
		/*
		 * Do not assume that spinlocks can be initialized via memcpy:
		 */
		spin_lock_init(&ptr->lock);

		kmem_cache->array[smp_processor_id()] = ptr;

		ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT);

		BUG_ON(cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep)
		       != &initarray_generic.cache);
		memcpy(ptr, cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep),
		       sizeof(struct arraycache_init));
		/*
		 * Do not assume that spinlocks can be initialized via memcpy:
		 */
		spin_lock_init(&ptr->lock);

		malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] =
		    ptr;
	}
	/* 5) Replace the bootstrap kmem_list3's */
	{
		int nid;

		for_each_online_node(nid) {
			init_list(kmem_cache, &initkmem_list3[CACHE_CACHE + nid], nid);

			init_list(malloc_sizes[INDEX_AC].cs_cachep,
				  &initkmem_list3[SIZE_AC + nid], nid);

			if (INDEX_AC != INDEX_L3) {
				init_list(malloc_sizes[INDEX_L3].cs_cachep,
					  &initkmem_list3[SIZE_L3 + nid], nid);
			}
		}
	}

	slab_state = UP;
}
```
The first line assigns a global pointer variable, which is what the first cache (kmem_cache) is built through. It is declared in mm/slab_common.c:

```c
struct kmem_cache *kmem_cache;
```

Every cache that gets created is linked onto the global list LIST_HEAD(slab_caches); you can inspect them all with cat /proc/slabinfo.
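As a quick illustration of that list in action, here is a minimal, hypothetical module sketch (the "foo_cache" name and struct are made up): any cache created with kmem_cache_create() ends up on slab_caches and shows up as a row in /proc/slabinfo under the name given here.

```c
#include <linux/module.h>
#include <linux/slab.h>

struct foo {
	int a, b;
};

static struct kmem_cache *foo_cachep;

static int __init foo_init(void)
{
	/* creates a cache; internally it is linked onto slab_caches */
	foo_cachep = kmem_cache_create("foo_cache", sizeof(struct foo),
				       0, SLAB_HWCACHE_ALIGN, NULL);
	if (!foo_cachep)
		return -ENOMEM;
	return 0;
}

static void __exit foo_exit(void)
{
	kmem_cache_destroy(foo_cachep);	/* unlinks it from slab_caches */
}

module_init(foo_init);
module_exit(foo_exit);
MODULE_LICENSE("GPL");
```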
It is worth looking at struct kmem_cache here, defined in slab_def.h:
```c
struct kmem_cache {
/* 1) Cache tunables. Protected by cache_chain_mutex */
	unsigned int batchcount;
	unsigned int limit;
	unsigned int shared;

	unsigned int size;
	u32 reciprocal_buffer_size;
/* 2) touched by every alloc & free from the backend */

	unsigned int flags;		/* constant flags */
	unsigned int num;		/* # of objs per slab */

/* 3) cache_grow/shrink */
	/* order of pgs per slab (2^n) */
	unsigned int gfporder;

	/* force GFP flags, e.g. GFP_DMA */
	gfp_t allocflags;

	size_t colour;			/* cache colouring range */
	unsigned int colour_off;	/* colour offset */
	struct kmem_cache *slabp_cache;
	unsigned int slab_size;

	/* constructor func */
	void (*ctor)(void *obj);

/* 4) cache creation/removal */
	const char *name;
	struct list_head list;
	int refcount;
	int object_size;
	int align;

/* 5) statistics */
#ifdef CONFIG_DEBUG_SLAB
	unsigned long num_active;
	unsigned long num_allocations;
	unsigned long high_mark;
	unsigned long grown;
	unsigned long reaped;
	unsigned long errors;
	unsigned long max_freeable;
	unsigned long node_allocs;
	unsigned long node_frees;
	unsigned long node_overflow;
	atomic_t allochit;
	atomic_t allocmiss;
	atomic_t freehit;
	atomic_t freemiss;

	/*
	 * If debugging is enabled, then the allocator can add additional
	 * fields and/or padding to every object. size contains the total
	 * object size including these internal fields, the following two
	 * variables contain the offset to the user object and its size.
	 */
	int obj_offset;
#endif /* CONFIG_DEBUG_SLAB */
#ifdef CONFIG_MEMCG_KMEM
	struct memcg_cache_params *memcg_params;
#endif

/* 6) per-cpu/per-node data, touched during every alloc/free */
	/*
	 * We put array[] at the end of kmem_cache, because we want to size
	 * this array to nr_cpu_ids slots instead of NR_CPUS
	 * (see kmem_cache_init())
	 * We still use [NR_CPUS] and not [1] or [0] because cache_cache
	 * is statically defined, so we reserve the max number of cpus.
	 *
	 * We also need to guarantee that the list is able to accomodate a
	 * pointer for each node since "nodelists" uses the remainder of
	 * available pointers.
	 */
	struct kmem_list3 **nodelists;
	struct array_cache *array[NR_CPUS + MAX_NUMNODES];
	/*
	 * Do not add fields after array[]
	 */
};
```
Several of the key fields in this structure were already covered in the earlier kmalloc post.
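One non-obvious field is reciprocal_buffer_size: slab.c precomputes reciprocal_value(size) so that mapping an object pointer back to its index inside the slab never needs a hardware division on the free path. The helper in slab.c looks roughly like this in 3.8 (quoted from memory, so treat it as a sketch):

```c
/* mm/slab.c (3.8, roughly): map an object's byte offset within the slab
 * back to its index using a multiply instead of a divide */
static inline unsigned int obj_to_index(const struct kmem_cache *cache,
					const struct slab *slab, void *obj)
{
	u32 offset = (obj - slab->s_mem);	/* s_mem = first object */

	return reciprocal_divide(offset, cache->reciprocal_buffer_size);
}
```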
kmem_cache_boot, meanwhile, is:
```c
/* internal cache of cache description objs */
static struct kmem_cache kmem_cache_boot = {
	.batchcount = 1,
	.limit = BOOT_CPUCACHE_ENTRIES,	/* BOOT_CPUCACHE_ENTRIES is 1 by default */
	.shared = 1,
	.size = sizeof(struct kmem_cache),
	.name = "kmem_cache",
};
```
The comment already explains its role clearly.
As for setup_nodelists_pointer, it makes the nodelists field of struct kmem_cache point into the tail of the array[] field (just past the nr_cpu_ids per-cpu slots), so the per-node pointers can be reached through a plain pointer.
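The helper itself is tiny; in 3.8 it reads roughly:

```c
static void setup_nodelists_pointer(struct kmem_cache *cachep)
{
	/* nodelists lives in the tail of array[]: the first nr_cpu_ids
	 * slots hold per-cpu array_cache pointers, the remainder holds
	 * the per-node kmem_list3 pointers */
	cachep->nodelists = (struct kmem_list3 **)&cachep->array[nr_cpu_ids];
}
```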
On a UMA (uniform memory access) system there is only a single node.
initkmem_list3 is a static global in slab.c:

```c
/*
 * Need this for bootstrapping a per node allocator.
 */
static struct kmem_list3 __initdata initkmem_list3[NUM_INIT_LISTS];
```
kmem_list3_init initializes a node's three slab lists: slabs_full, slabs_partial and slabs_free. Why these need initializing comes down to how a cache is organized, as the structure sketched below shows.
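For reference, struct kmem_list3, the per-node bookkeeping that holds those three lists, looks roughly like this in 3.8 (quoted from memory):

```c
/*
 * The slab lists for all objects.
 */
struct kmem_list3 {
	struct list_head slabs_partial;	/* partial list first, better asm code */
	struct list_head slabs_full;
	struct list_head slabs_free;
	unsigned long free_objects;
	unsigned int free_limit;
	unsigned int colour_next;	/* per-node cache colouring cursor */
	spinlock_t list_lock;
	struct array_cache *shared;	/* shared per node */
	struct array_cache **alien;	/* on other nodes */
	unsigned long next_reap;	/* updated without locking */
	int free_touched;		/* updated without locking */
};
```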
CACHE_CACHE is defined as 0 near the top of the file.
```c
/*
 * For setting up all the kmem_list3s for cache whose buffer_size is same as
 * size of kmem_list3.
 */
static void __init set_up_list3s(struct kmem_cache *cachep, int index)
{
	int node;

	for_each_online_node(node) {
		cachep->nodelists[node] = &initkmem_list3[index + node];
		cachep->nodelists[node]->next_reap = jiffies +
		    REAPTIMEOUT_LIST3 +
		    ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
	}
}
```
Next the real cache-creation work begins, and the code comments spell out the bootstrap steps:
```c
	/* Bootstrap is tricky, because several objects are allocated
	 * from caches that do not exist yet:
	 * 1) initialize the kmem_cache cache: it contains the struct
	 *    kmem_cache structures of all caches, except kmem_cache itself:
	 *    kmem_cache is statically allocated.
	 *    Initially an __init data area is used for the head array and the
	 *    kmem_list3 structures, it's replaced with a kmalloc allocated
	 *    array at the end of the bootstrap.
	 * 2) Create the first kmalloc cache.
	 *    The struct kmem_cache for the new cache is allocated normally.
	 *    An __init data area is used for the head array.
	 * 3) Create the remaining kmalloc caches, with minimally sized
	 *    head arrays.
	 * 4) Replace the __init data head arrays for kmem_cache and the first
	 *    kmalloc cache with kmalloc allocated arrays.
	 * 5) Replace the __init data for kmem_list3 for kmem_cache and
	 *    the other cache's with kmalloc allocated memory.
	 * 6) Resize the head arrays of the kmalloc caches to their final sizes.
	 */

	/* 1) create the kmem_cache */

	/*
	 * struct kmem_cache size depends on nr_node_ids & nr_cpu_ids
	 */
	create_boot_cache(kmem_cache, "kmem_cache",
		offsetof(struct kmem_cache, array[nr_cpu_ids]) +
				  nr_node_ids * sizeof(struct kmem_list3 *),
				  SLAB_HWCACHE_ALIGN);
	list_add(&kmem_cache->list, &slab_caches);
```
This creates the first cache, named "kmem_cache"; note that the kmem_cache pointer variable already points at the static kmem_cache_boot.
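The size argument deserves a second look: only the slots actually needed are counted into the boot cache's object size. With purely hypothetical numbers (4 possible CPUs, 1 node, 8-byte pointers), the computation comes out like this:

```c
/* Hypothetical sizing (4 CPUs, 1 node, 8-byte pointers):
 *
 *   offsetof(struct kmem_cache, array[nr_cpu_ids])
 *       = fixed header + 4 * 8 bytes of per-cpu array_cache pointers
 *   + nr_node_ids * sizeof(struct kmem_list3 *)
 *       = + 1 * 8 bytes for the node pointer that nodelists points at
 *
 * i.e. far less than the static worst case of NR_CPUS + MAX_NUMNODES
 * slots that struct kmem_cache reserves for kmem_cache_boot.
 */
size_t boot_size = offsetof(struct kmem_cache, array[nr_cpu_ids]) +
		   nr_node_ids * sizeof(struct kmem_list3 *);
```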
Now let's look at the create_boot_cache function:
```c
#ifndef CONFIG_SLOB
/* Create a cache during boot when no slab services are available yet */
void __init create_boot_cache(struct kmem_cache *s, const char *name, size_t size,
		unsigned long flags)
{
	int err;

	s->name = name;
	s->size = s->object_size = size;
	s->align = calculate_alignment(flags, ARCH_KMALLOC_MINALIGN, size);
	err = __kmem_cache_create(s, flags);

	if (err)
		panic("Creation of kmalloc slab %s size=%zd failed. Reason %d\n",
					name, size, err);

	s->refcount = -1;	/* Exempt from merging for now */
}

struct kmem_cache *__init create_kmalloc_cache(const char *name, size_t size,
				unsigned long flags)
{
	struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);

	if (!s)
		panic("Out of memory when creating slab %s\n", name);

	create_boot_cache(s, name, size, flags);
	list_add(&s->list, &slab_caches);
	s->refcount = 1;
	return s;
}
#endif /* !CONFIG_SLOB */
```
It in turn calls __kmem_cache_create, which is the key function here:
```c
/**
 * __kmem_cache_create - Create a cache.
 * @cachep: cache management descriptor
 * @flags: SLAB flags
 *
 * Returns a ptr to the cache on success, NULL on failure.
 * Cannot be called within a int, but can be interrupted.
 * The @ctor is run when new pages are allocated by the cache.
 *
 * The flags are
 *
 * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5)
 * to catch references to uninitialised memory.
 *
 * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check
 * for buffer overruns.
 *
 * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
 * cacheline.  This can be beneficial if you're counting cycles as closely
 * as davem.
 */
int
__kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
{
	size_t left_over, slab_size, ralign;
	gfp_t gfp;
	int err;
	size_t size = cachep->size;

#if DEBUG
#if FORCED_DEBUG
	/*
	 * Enable redzoning and last user accounting, except for caches with
	 * large objects, if the increased size would increase the object size
	 * above the next power of two: caches with object sizes just above a
	 * power of two have a significant amount of internal fragmentation.
	 */
	if (size < 4096 || fls(size - 1) == fls(size-1 + REDZONE_ALIGN +
						2 * sizeof(unsigned long long)))
		flags |= SLAB_RED_ZONE | SLAB_STORE_USER;
	if (!(flags & SLAB_DESTROY_BY_RCU))
		flags |= SLAB_POISON;
#endif
	if (flags & SLAB_DESTROY_BY_RCU)
		BUG_ON(flags & SLAB_POISON);
#endif

	/*
	 * Check that size is in terms of words.  This is needed to avoid
	 * unaligned accesses for some archs when redzoning is used, and makes
	 * sure any on-slab bufctl's are also correctly aligned.
	 */
	if (size & (BYTES_PER_WORD - 1)) {
		size += (BYTES_PER_WORD - 1);
		size &= ~(BYTES_PER_WORD - 1);	/* round size up to a word (4-byte) boundary */
	}

	/*
	 * Redzoning and user store require word alignment or possibly larger.
	 * Note this will be overridden by architecture or caller mandated
	 * alignment if either is greater than BYTES_PER_WORD.
	 */
	if (flags & SLAB_STORE_USER)
		ralign = BYTES_PER_WORD;

	if (flags & SLAB_RED_ZONE) {
		ralign = REDZONE_ALIGN;
		/* If redzoning, ensure that the second redzone is suitably
		 * aligned, by adjusting the object size accordingly. */
		size += REDZONE_ALIGN - 1;
		size &= ~(REDZONE_ALIGN - 1);
	}

	/* 3) caller mandated alignment */
	if (ralign < cachep->align) {
		ralign = cachep->align;
	}
	/* disable debug if necessary */
	if (ralign > __alignof__(unsigned long long))
		flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
	/*
	 * 4) Store it.
	 */
	cachep->align = ralign;

	if (slab_is_available())
		gfp = GFP_KERNEL;
	else
		gfp = GFP_NOWAIT;
	/*
	 * Annotation: slab_is_available() just checks slab_state, which at
	 * this point nobody has initialized yet, so it is still DOWN:
	 *
	 *	enum slab_state {
	 *		DOWN,			(no slab functionality yet)
	 *		PARTIAL,		(SLUB: kmem_cache_node available)
	 *		PARTIAL_ARRAYCACHE,	(SLAB: kmalloc size for arraycache available)
	 *		PARTIAL_L3,		(SLAB: kmalloc size for l3 struct available)
	 *		UP,			(slab caches usable but not all extras yet)
	 *		FULL			(everything is working)
	 *	};
	 *
	 * and GFP_NOWAIT is defined as:
	 *	#define GFP_NOWAIT	(GFP_ATOMIC & ~__GFP_HIGH)
	 */

	setup_nodelists_pointer(cachep);
#if DEBUG

	/*
	 * Both debugging options require word-alignment which is calculated
	 * into align above.
	 */
	if (flags & SLAB_RED_ZONE) {
		/* add space for red zone words */
		cachep->obj_offset += sizeof(unsigned long long);
		size += 2 * sizeof(unsigned long long);
	}
	if (flags & SLAB_STORE_USER) {
		/* user store requires one word storage behind the end of
		 * the real object. But if the second red zone needs to be
		 * aligned to 64 bits, we must allow that much space.
		 */
		if (flags & SLAB_RED_ZONE)
			size += REDZONE_ALIGN;
		else
			size += BYTES_PER_WORD;
	}
#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC)
	if (size >= malloc_sizes[INDEX_L3 + 1].cs_size
	    && cachep->object_size > cache_line_size()
	    && ALIGN(size, cachep->align) < PAGE_SIZE) {
		cachep->obj_offset += PAGE_SIZE - ALIGN(size, cachep->align);
		size = PAGE_SIZE;
	}
#endif
#endif

	/*
	 * Determine if the slab management is 'on' or 'off' slab.
	 * (bootstrapping cannot cope with offslab caches so don't do
	 * it too early on. Always use on-slab management when
	 * SLAB_NOLEAKTRACE to avoid recursive calls into kmemleak)
	 *
	 * Annotation: this decides whether the slab management data lives on
	 * the slab's own pages. Off-slab needs size >= PAGE_SIZE/8, i.e.
	 * 512/1024 bytes for the default 4K/8K page; slab_early_init is 1
	 * while kmem_cache itself is being created and is only reset to 0
	 * once the general caches are built; and the first call passes
	 * flags = SLAB_HWCACHE_ALIGN.
	 */
	if ((size >= (PAGE_SIZE >> 3)) && !slab_early_init &&
	    !(flags & SLAB_NOLEAKTRACE))
		/*
		 * Size is large, assume best to place the slab management obj
		 * off-slab (should allow better packing of objs).
		 */
		flags |= CFLGS_OFF_SLAB;

	size = ALIGN(size, cachep->align);

	/* from the object size, work out how many pages one slab spans,
	 * how many objects it holds, and the space left over after the
	 * management data; simple and easy to follow */
	left_over = calculate_slab_order(cachep, size, cachep->align, flags);

	if (!cachep->num)
		return -E2BIG;

	slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t)
			  + sizeof(struct slab), cachep->align);

	/*
	 * If the slab has been placed off-slab, and we have enough space then
	 * move it on-slab. This is at the expense of any extra colouring.
	 */
	if (flags & CFLGS_OFF_SLAB && left_over >= slab_size) {
		flags &= ~CFLGS_OFF_SLAB;
		left_over -= slab_size;
	}

	if (flags & CFLGS_OFF_SLAB) {
		/* really off slab. No need for manual alignment */
		slab_size =
		    cachep->num * sizeof(kmem_bufctl_t) + sizeof(struct slab);

#ifdef CONFIG_PAGE_POISONING
		/* If we're going to use the generic kernel_map_pages()
		 * poisoning, then it's going to smash the contents of
		 * the redzone and userword anyhow, so switch them off.
		 */
		if (size % PAGE_SIZE == 0 && flags & SLAB_POISON)
			flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
#endif
	}

	cachep->colour_off = cache_line_size();	/* e.g. 32 bytes */
	/* Offset must be a multiple of the alignment. */
	if (cachep->colour_off < cachep->align)
		cachep->colour_off = cachep->align;
	cachep->colour = left_over / cachep->colour_off;	/* slab colouring setup */
	cachep->slab_size = slab_size;
	cachep->flags = flags;
	cachep->allocflags = 0;
	if (CONFIG_ZONE_DMA_FLAG && (flags & SLAB_CACHE_DMA))
		cachep->allocflags |= GFP_DMA;
	cachep->size = size;
	cachep->reciprocal_buffer_size = reciprocal_value(size);

	if (flags & CFLGS_OFF_SLAB) {
		cachep->slabp_cache = kmem_find_general_cachep(slab_size, 0u);
		/*
		 * This is a possibility for one of the malloc_sizes caches.
		 * But since we go off slab only for object size greater than
		 * PAGE_SIZE/8, and malloc_sizes gets created in ascending order,
		 * this should not happen at all.
		 * But leave a BUG_ON for some lucky dude.
		 */
		BUG_ON(ZERO_OR_NULL_PTR(cachep->slabp_cache));
	}

	err = setup_cpu_cache(cachep, gfp);
	if (err) {
		__kmem_cache_shutdown(cachep);
		return err;
	}

	if (flags & SLAB_DEBUG_OBJECTS) {
		/*
		 * Would deadlock through slab_destroy()->call_rcu()->
		 * debug_object_activate()->kmem_cache_alloc().
		 */
		WARN_ON_ONCE(flags & SLAB_DESTROY_BY_RCU);

		slab_set_debugobj_lock_classes(cachep);
	} else if (!OFF_SLAB(cachep) && !(flags & SLAB_DESTROY_BY_RCU))
		on_slab_lock_classes(cachep);

	return 0;
}
```
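To make the colouring fields concrete, here is a paraphrased sketch (not the literal kernel code) of how cache_grow() consumes colour and colour_off. With hypothetical numbers left_over = 192 and cache_line_size() = 64, colour is 3, so successive slabs start their objects at offsets 0, 64 and 128, spreading them across different cache lines and sets.

```c
/* Paraphrased from the logic in cache_grow(): each new slab of this
 * cache gets the next colour in round-robin order. */
static unsigned int next_colour_offset(struct kmem_cache *cachep,
				       struct kmem_list3 *l3)
{
	unsigned int offset = l3->colour_next;	/* per-node colour cursor */

	if (++l3->colour_next >= cachep->colour)
		l3->colour_next = 0;		/* wrap after 'colour' slabs */

	return offset * cachep->colour_off;	/* bytes of padding before the first object */
}
```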
Buried in it is one particularly interesting and important function: it gives away exactly how a slab lays out and manages its objects.
```c
/**
 * calculate_slab_order - calculate size (page order) of slabs
 * @cachep: pointer to the cache that is being created
 * @size: size of objects to be created in this cache.
 * @align: required alignment for the objects.
 * @flags: slab allocation flags
 *
 * Also calculates the number of objects per slab.
 *
 * This could be made much more intelligent.  For now, try to avoid using
 * high order pages for slabs.  When the gfp() functions are more friendly
 * towards high-order requests, this should be changed.
 */
static size_t calculate_slab_order(struct kmem_cache *cachep,
			size_t size, size_t align, unsigned long flags)
{
	unsigned long offslab_limit;
	size_t left_over = 0;
	int gfporder;

	for (gfporder = 0; gfporder <= KMALLOC_MAX_ORDER; gfporder++) {
		unsigned int num;
		size_t remainder;

		/* depending on off-slab vs on-slab, cache_estimate() works
		 * out how many objects fit in 2^gfporder pages once the
		 * management data is subtracted; worth a careful read */
		cache_estimate(gfporder, size, align, flags, &remainder, &num);
		if (!num)	/* a slab must hold at least one object */
			continue;

		if (flags & CFLGS_OFF_SLAB) {
			/*
			 * Max number of objs-per-slab for caches which
			 * use off-slab slabs. Needed to avoid a possible
			 * looping condition in cache_grow().
			 */
			offslab_limit = size - sizeof(struct slab);
			offslab_limit /= sizeof(kmem_bufctl_t);

			if (num > offslab_limit)
				break;
		}

		/* Found something acceptable - save it away */
		cachep->num = num;
		cachep->gfporder = gfporder;
		left_over = remainder;

		/*
		 * A VFS-reclaimable slab tends to have most allocations
		 * as GFP_NOFS and we really don't want to have to be allocating
		 * higher-order pages when we are unable to shrink dcache.
		 */
		if (flags & SLAB_RECLAIM_ACCOUNT)
			break;

		/*
		 * Large number of objects is good, but very large slabs are
		 * currently bad for the gfp()s.
		 */
		if (gfporder >= slab_max_order)
			break;

		/*
		 * Acceptable internal fragmentation?
		 */
		if (left_over * 8 <= (PAGE_SIZE << gfporder))
			break;
	}
	return left_over;
}
```
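To see what cache_estimate() actually yields, plug in some hypothetical numbers for an on-slab cache (the struct sizes below are illustrative, not exact):

```c
/* Hypothetical on-slab estimate: gfporder = 0 (one 4096-byte page),
 * object size = 128, sizeof(struct slab) = 32, sizeof(kmem_bufctl_t) = 4:
 *
 *   nr_objs   = (4096 - 32) / (128 + 4)  = 30 objects
 *   mgmt_size = 32 + 30 * 4              = 152 bytes (before alignment)
 *   remainder = 4096 - 30*128 - 152      = 104 bytes, returned as
 *                                          left_over and later spent
 *                                          on slab colouring
 */
```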
With the initialization and setup above done, the final call to setup_cpu_cache completes the creation of a cache. Next come steps 2 and 3:
```c
	/* 2+3) create the kmalloc caches */
	sizes = malloc_sizes;
	names = cache_names;

	/*
	 * Initialize the caches that provide memory for the array cache and the
	 * kmem_list3 structures first.  Without this, further allocations will
	 * bug.
	 */

	/* create the cache whose object size is sizeof(struct arraycache_init) */
	sizes[INDEX_AC].cs_cachep = create_kmalloc_cache(names[INDEX_AC].name,
					sizes[INDEX_AC].cs_size, ARCH_KMALLOC_FLAGS);

	/* create the cache whose object size is sizeof(struct kmem_list3) */
	if (INDEX_AC != INDEX_L3)
		sizes[INDEX_L3].cs_cachep =
			create_kmalloc_cache(names[INDEX_L3].name,
				sizes[INDEX_L3].cs_size, ARCH_KMALLOC_FLAGS);

	slab_early_init = 0;

	/* create the general caches, driven by malloc_sizes and cache_names */
	while (sizes->cs_size != ULONG_MAX) {
		/*
		 * For performance, all the general caches are L1 aligned.
		 * This should be particularly beneficial on SMP boxes, as it
		 * eliminates "false sharing".
		 * Note for systems short on memory removing the alignment will
		 * allow tighter packing of the smaller caches.
		 */
		if (!sizes->cs_cachep)
			sizes->cs_cachep = create_kmalloc_cache(names->name,
					sizes->cs_size, ARCH_KMALLOC_FLAGS);

#ifdef CONFIG_ZONE_DMA
		sizes->cs_dmacachep = create_kmalloc_cache(
			names->name_dma, sizes->cs_size,
			SLAB_CACHE_DMA|ARCH_KMALLOC_FLAGS);
#endif
		sizes++;
		names++;
	}
```
A few words on cache_names and malloc_sizes:
```c
/*
 * These are the default caches for kmalloc. Custom caches can have other sizes.
 */
struct cache_sizes malloc_sizes[] = {
#define CACHE(x) { .cs_size = (x) },
#include <linux/kmalloc_sizes.h>
	CACHE(ULONG_MAX)
#undef CACHE
};
```
I won't expand on kmalloc_sizes.h here.
```c
/* Must match cache_sizes above. Out of line to keep cache footprint low. */
struct cache_names {
	char *name;
	char *name_dma;
};

static struct cache_names __initdata cache_names[] = {
#define CACHE(x) { .name = "size-" #x, .name_dma = "size-" #x "(DMA)" },
#include <linux/kmalloc_sizes.h>
	{NULL,}
#undef CACHE
};
```
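Because kmalloc_sizes.h is included under two different CACHE() definitions, the preprocessor produces two parallel tables. Roughly (the exact sizes depend on PAGE_SIZE and config options; these entries are illustrative):

```c
struct cache_sizes malloc_sizes[] = {
	{ .cs_size = 32 },
	{ .cs_size = 64 },
	{ .cs_size = 128 },
	/* ... */
	{ .cs_size = ULONG_MAX },	/* terminator tested by the while loop above */
};

static struct cache_names __initdata cache_names[] = {
	{ .name = "size-32",  .name_dma = "size-32(DMA)"  },
	{ .name = "size-64",  .name_dma = "size-64(DMA)"  },
	{ .name = "size-128", .name_dma = "size-128(DMA)" },
	/* ... */
	{ NULL, },
};
```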
create_kmalloc_cache is really just a wrapper around create_boot_cache; the loop runs through the kernel's predefined general-purpose sizes and creates each cache. After that we reach steps 4 and 5:
```c
	/* 4) Replace the bootstrap head arrays */
	{
		struct array_cache *ptr;

		ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT);

		memcpy(ptr, cpu_cache_get(kmem_cache),
		       sizeof(struct arraycache_init));
		/*
		 * Do not assume that spinlocks can be initialized via memcpy:
		 */
		spin_lock_init(&ptr->lock);

		kmem_cache->array[smp_processor_id()] = ptr;

		ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT);

		BUG_ON(cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep)
		       != &initarray_generic.cache);
		memcpy(ptr, cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep),
		       sizeof(struct arraycache_init));
		/*
		 * Do not assume that spinlocks can be initialized via memcpy:
		 */
		spin_lock_init(&ptr->lock);

		malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] =
		    ptr;
	}
```
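The bootstrap heads being swapped out here are instances of struct arraycache_init; for reference, in 3.8 the structures look roughly like this (quoted from memory):

```c
struct array_cache {
	unsigned int avail;		/* objects currently cached */
	unsigned int limit;		/* flush threshold */
	unsigned int batchcount;	/* refill/drain granularity */
	unsigned int touched;		/* set on alloc, checked by the reaper */
	spinlock_t lock;
	void *entry[];			/* LIFO stack of object pointers */
};

/* bootstrap variant: a static array_cache with room for one entry
 * (BOOT_CPUCACHE_ENTRIES is 1) */
struct arraycache_init {
	struct array_cache cache;
	void *entries[BOOT_CPUCACHE_ENTRIES];
};
```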
```c
	/* 5) Replace the bootstrap kmem_list3's */
	{
		int nid;

		for_each_online_node(nid) {
			init_list(kmem_cache, &initkmem_list3[CACHE_CACHE + nid], nid);

			init_list(malloc_sizes[INDEX_AC].cs_cachep,
				  &initkmem_list3[SIZE_AC + nid], nid);

			if (INDEX_AC != INDEX_L3) {
				init_list(malloc_sizes[INDEX_L3].cs_cachep,
					  &initkmem_list3[SIZE_L3 + nid], nid);
			}
		}
	}

	slab_state = UP;
```
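init_list() itself is short: it kmallocs a permanent kmem_list3 on the right node, copies the bootstrap one into it, and repoints the cache at it. In 3.8 it reads roughly:

```c
/* mm/slab.c (3.8, roughly) */
static void __init init_list(struct kmem_cache *cachep, struct kmem_list3 *list,
			     int nodeid)
{
	struct kmem_list3 *ptr;

	ptr = kmalloc_node(sizeof(struct kmem_list3), GFP_NOWAIT, nodeid);
	BUG_ON(!ptr);

	memcpy(ptr, list, sizeof(struct kmem_list3));
	/*
	 * Do not assume that spinlocks can be initialized via memcpy:
	 */
	spin_lock_init(&ptr->list_lock);

	MAKE_ALL_LISTS(cachep, ptr, nodeid);	/* re-link the three list heads */
	cachep->nodelists[nodeid] = ptr;
}
```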
Finally slab_state is set to UP, meaning the slab allocator is ready for normal use. Most of the above is code; the concrete allocation path was already covered in the kmalloc post. The goal here was simply to pin down what a cache really is and how it gets initialized.
After kmem_cache_init there is one more step, the kmem_cache_init_late function.
It mainly calls enable_cpucache and registers a CPU notifier chain:
```c
	/*
	 * Register a cpu startup notifier callback that initializes
	 * cpu_cache_get for all new cpus
	 */
	register_cpu_notifier(&cpucache_notifier);
```
Remember the point that puzzled us earlier when we analyzed batchcount?
```c
/* Called with slab_mutex held always */
static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
{
	int err;
	int limit = 0;
	int shared = 0;
	int batchcount = 0;

	if (!is_root_cache(cachep)) {
		struct kmem_cache *root = memcg_root_cache(cachep);

		limit = root->limit;
		shared = root->shared;
		batchcount = root->batchcount;
	}

	if (limit && shared && batchcount)
		goto skip_setup;
	/*
	 * The head array serves three purposes:
	 * - create a LIFO ordering, i.e. return objects that are cache-warm
	 * - reduce the number of spinlock operations.
	 * - reduce the number of linked list operations on the slab and
	 *   bufctl chains: array operations are cheaper.
	 * The numbers are guessed, we should auto-tune as described by
	 * Bonwick.
	 */
	if (cachep->size > 131072)
		limit = 1;
	else if (cachep->size > PAGE_SIZE)
		limit = 8;
	else if (cachep->size > 1024)
		limit = 24;
	else if (cachep->size > 256)
		limit = 54;
	else
		limit = 120;

	/*
	 * CPU bound tasks (e.g. network routing) can exhibit cpu bound
	 * allocation behaviour: Most allocs on one cpu, most free operations
	 * on another cpu. For these cases, an efficient object passing between
	 * cpus is necessary. This is provided by a shared array. The array
	 * replaces Bonwick's magazine layer.
	 * On uniprocessor, it's functionally equivalent (but less efficient)
	 * to a larger limit. Thus disabled by default.
	 */
	shared = 0;
	if (cachep->size <= PAGE_SIZE && num_possible_cpus() > 1)
		shared = 8;

#if DEBUG
	/*
	 * With debugging enabled, large batchcount lead to excessively long
	 * periods with disabled local interrupts. Limit the batchcount
	 */
	if (limit > 32)
		limit = 32;
#endif
	batchcount = (limit + 1) / 2;
skip_setup:
	err = do_tune_cpucache(cachep, limit, batchcount, shared, gfp);
	if (err)
		printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n",
		       cachep->name, -err);
	return err;
}
```
It derives limit from the object size, then computes batchcount from limit.
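Plugging a few hypothetical object sizes into the heuristic above (assuming PAGE_SIZE = 4096, non-DEBUG) shows how the numbers play out:

```c
/* object size    limit    batchcount = (limit + 1) / 2
 *         32      120      60
 *        512       54      27
 *       2048       24      12
 *       8192        8       4
 *     200000        1       1
 */
```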
This is only a small beginning; memory management is vast and deep, and real understanding comes from analyzing concrete problems as they come up.