In the previous article I gave a brief account of how kmalloc allocates memory under the slab allocator. While reading cache_alloc_refill, though, I was still puzzled by part of its logic.
```c
static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags,
							bool force_refill)
{
	int batchcount;
	struct kmem_list3 *l3;
	struct array_cache *ac;
	int node;

	check_irq_off();
	node = numa_mem_id();
	if (unlikely(force_refill))
		goto force_grow;
retry:
	ac = cpu_cache_get(cachep);
	batchcount = ac->batchcount;
	if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
		/*
		 * If there was little recent activity on this cache, then
		 * perform only a partial refill.  Otherwise we could generate
		 * refill bouncing.
		 */
		batchcount = BATCHREFILL_LIMIT;
	}
	l3 = cachep->nodelists[node];

	BUG_ON(ac->avail > 0 || !l3);
	spin_lock(&l3->list_lock);

	/* See if we can refill from the shared array */
	if (l3->shared && transfer_objects(ac, l3->shared, batchcount)) {
		l3->shared->touched = 1;
		goto alloc_done;
	}

	while (batchcount > 0) {
		struct list_head *entry;
		struct slab *slabp;
		/* Get slab alloc is to come from. */
		entry = l3->slabs_partial.next;
		if (entry == &l3->slabs_partial) {
			l3->free_touched = 1;
			entry = l3->slabs_free.next;
			if (entry == &l3->slabs_free)
				goto must_grow;
		}

		slabp = list_entry(entry, struct slab, list);
		check_slabp(cachep, slabp);
		check_spinlock_acquired(cachep);

		/*
		 * The slab was either on partial or free list so
		 * there must be at least one object available for
		 * allocation.
		 */
		BUG_ON(slabp->inuse >= cachep->num);

		while (slabp->inuse < cachep->num && batchcount--) {
			STATS_INC_ALLOCED(cachep);
			STATS_INC_ACTIVE(cachep);
			STATS_SET_HIGH(cachep);

			ac_put_obj(cachep, ac, slab_get_obj(cachep, slabp,
									node));
		}
		check_slabp(cachep, slabp);

		/* move slabp to correct slabp list: */
		list_del(&slabp->list);
		if (slabp->free == BUFCTL_END)
			list_add(&slabp->list, &l3->slabs_full);
		else
			list_add(&slabp->list, &l3->slabs_partial);
	}

must_grow:
	l3->free_objects -= ac->avail;
alloc_done:
	spin_unlock(&l3->list_lock);

	if (unlikely(!ac->avail)) {
		int x;
force_grow:
		x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL);

		/* cache_grow can reenable interrupts, then ac could change. */
		ac = cpu_cache_get(cachep);
		node = numa_mem_id();

		/* no objects in sight? abort */
		if (!x && (ac->avail == 0 || force_refill))
			return NULL;

		if (!ac->avail)		/* objects refilled by interrupt? */
			goto retry;
	}
	ac->touched = 1;

	return ac_get_obj(cachep, ac, flags, force_refill);
}
```
My question concerns the line `batchcount = ac->batchcount;`. During default initialization, i.e. in kmem_cache_init, every system cache ends up going through __kmem_cache_create, and setup_cpu_cache there contains this code:
```c
	cpu_cache_get(cachep)->avail = 0;
	cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES;
	cpu_cache_get(cachep)->batchcount = 1;
	cpu_cache_get(cachep)->touched = 0;
	cachep->batchcount = 1;
	cachep->limit = BOOT_CPUCACHE_ENTRIES;
	return 0;
```
Can I therefore assume that ac->batchcount is simply 1 (BOOT_CPUCACHE_ENTRIES is likewise 1)? Then ac_put_obj would place only a single obj into the array on each refill. If that happened every time, then in __cache_alloc:
```c
	ac = cpu_cache_get(cachep);
	if (likely(ac->avail)) {
		ac->touched = 1;
		objp = ac_get_obj(cachep, ac, flags, false);

		/*
		 * Allow for the possibility all avail objects are not allowed
		 * by the current flags
		 */
		if (objp) {
			STATS_INC_ALLOCHIT(cachep);
			goto out;
		}
		force_refill = true;
	}
```
what would be the point of this fast path? With batchcount equal to 1, each refill puts a single obj into the array, raising avail from 0 to 1; but as soon as one obj is taken, avail drops back to 0, so virtually every allocation would fall into the slow path. That would obviously be very inefficient.
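To see just how bad that would be, here is a minimal user-space sketch (my own model, not kernel code; ac_put_obj/ac_get_obj are reduced to stack pushes and pops) comparing the boot-time batchcount of 1 with the kind of tuned value we will meet shortly:

```c
#include <stdio.h>

/* Simplified stand-in for struct array_cache: a LIFO stack of object
 * pointers plus the tunables discussed above. Hypothetical model code. */
struct model_ac {
	int avail;
	int limit;
	int batchcount;
	void *entry[120];
};

static int refills;	/* counts how often we fall into the slow path */

/* Models cache_alloc_refill: pull batchcount objects off the slab lists. */
static void refill(struct model_ac *ac)
{
	refills++;
	for (int i = 0; i < ac->batchcount && ac->avail < ac->limit; i++)
		ac->entry[ac->avail++] = (void *)0xdeadbeef;	/* fake object */
}

/* Models __cache_alloc: hit the per-CPU array first, refill on a miss. */
static void *model_alloc(struct model_ac *ac)
{
	if (!ac->avail)
		refill(ac);
	return ac->entry[--ac->avail];
}

int main(void)
{
	struct model_ac boot  = { .limit = 1,   .batchcount = 1 };	/* setup_cpu_cache values */
	struct model_ac tuned = { .limit = 120, .batchcount = 60 };	/* tuned values, see below */

	refills = 0;
	for (int i = 0; i < 1000; i++) model_alloc(&boot);
	printf("batchcount=1:  %d refills for 1000 allocs\n", refills);	/* 1000 */

	refills = 0;
	for (int i = 0; i < 1000; i++) model_alloc(&tuned);
	printf("batchcount=60: %d refills for 1000 allocs\n", refills);	/* 17 */
	return 0;
}
```

With batchcount 1 every single allocation pays for a refill; with batchcount 60 only about one in sixty does.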
It turned out I simply had not read all of the code. Consider the following function, which runs after kmem_cache_init has finished:
```c
void __init kmem_cache_init_late(void)
{
	struct kmem_cache *cachep;

	slab_state = UP;

	/* 6) resize the head arrays to their final sizes */
	mutex_lock(&slab_mutex);
	list_for_each_entry(cachep, &slab_caches, list)
		if (enable_cpucache(cachep, GFP_NOWAIT))
			BUG();
	mutex_unlock(&slab_mutex);

	/* Annotate slab for lockdep -- annotate the malloc caches */
	init_lock_keys();

	/* Done! */
	slab_state = FULL;

	/*
	 * Register a cpu startup notifier callback that initializes
	 * cpu_cache_get for all new cpus
	 */
	register_cpu_notifier(&cpucache_notifier);

#ifdef CONFIG_NUMA
	/*
	 * Register a memory hotplug callback that initializes and frees
	 * nodelists.
	 */
	hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI);
#endif

	/*
	 * The reap timers are started later, with a module init call: That part
	 * of the kernel is not yet operational.
	 */
}
```
This function walks the slab_caches list and calls enable_cpucache(cachep, GFP_NOWAIT) on every cache!
```c
/* Called with slab_mutex held always */
static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
{
	int err;
	int limit = 0;
	int shared = 0;
	int batchcount = 0;

	if (!is_root_cache(cachep)) {
		struct kmem_cache *root = memcg_root_cache(cachep);
		limit = root->limit;
		shared = root->shared;
		batchcount = root->batchcount;
	}

	if (limit && shared && batchcount)
		goto skip_setup;
	/*
	 * The head array serves three purposes:
	 * - create a LIFO ordering, i.e. return objects that are cache-warm
	 * - reduce the number of spinlock operations.
	 * - reduce the number of linked list operations on the slab and
	 *   bufctl chains: array operations are cheaper.
	 * The numbers are guessed, we should auto-tune as described by
	 * Bonwick.
	 */
	if (cachep->size > 131072)	/* objects larger than 128 KB: limit is 1 */
		limit = 1;
	else if (cachep->size > PAGE_SIZE)
		limit = 8;
	else if (cachep->size > 1024)
		limit = 24;
	else if (cachep->size > 256)
		limit = 54;
	else
		limit = 120;

	/*
	 * CPU bound tasks (e.g. network routing) can exhibit cpu bound
	 * allocation behaviour: Most allocs on one cpu, most free operations
	 * on another cpu. For these cases, an efficient object passing between
	 * cpus is necessary. This is provided by a shared array. The array
	 * replaces Bonwick's magazine layer.
	 * On uniprocessor, it's functionally equivalent (but less efficient)
	 * to a larger limit. Thus disabled by default.
	 */
	shared = 0;
	if (cachep->size <= PAGE_SIZE && num_possible_cpus() > 1)	/* on SMP shared is 8, on UP it stays 0 */
		shared = 8;

#if DEBUG
	/*
	 * With debugging enabled, large batchcount lead to excessively long
	 * periods with disabled local interrupts. Limit the batchcount
	 */
	if (limit > 32)
		limit = 32;
#endif
	batchcount = (limit + 1) / 2;
skip_setup:
	err = do_tune_cpucache(cachep, limit, batchcount, shared, gfp);	/* write the tuned values into the cache */
	if (err)
		printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n",
		       cachep->name, -err);
	return err;
}
```
So here limit, shared, and batchcount finally get their real initial values.
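Boiled down, the policy is: pick limit purely from the object size, derive batchcount as half of it, and enable an 8-entry shared array only on SMP for objects no larger than a page. A condensed restatement of just the sizing heuristic (my own paraphrase of the code above, assuming PAGE_SIZE is 4096 and DEBUG is off):

```c
/* Paraphrase of the sizing heuristic in enable_cpucache() above
 * (assumes PAGE_SIZE == 4096 and DEBUG disabled). */
static int pick_limit(unsigned long size)
{
	if (size > 131072)	/* > 128 KB: keep almost no per-CPU stock */
		return 1;
	if (size > 4096)	/* > PAGE_SIZE */
		return 8;
	if (size > 1024)
		return 24;
	if (size > 256)
		return 54;
	return 120;		/* small objects: large per-CPU stock */
}

/* batchcount is then (pick_limit(size) + 1) / 2. */
```

For a 1024-byte object this yields limit 54 and batchcount 27, exactly the `tunables 54 27 8` we will see in /proc/slabinfo below.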
```c
static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
				int batchcount, int shared, gfp_t gfp)
{
	int ret;
	struct kmem_cache *c = NULL;
	int i = 0;

	ret = __do_tune_cpucache(cachep, limit, batchcount, shared, gfp);	/* tune the cache that was passed in */

	if (slab_state < FULL)
		return ret;

	if ((ret < 0) || !is_root_cache(cachep))
		return ret;

	VM_BUG_ON(!mutex_is_locked(&slab_mutex));
	for_each_memcg_cache_index(i) {
		c = cache_from_memcg(cachep, i);
		if (c)
			/* return value determined by the parent cache only */
			__do_tune_cpucache(c, limit, batchcount, shared, gfp);
	}

	return ret;
}
```
The actual work is done in:
```c
/* Always called with the slab_mutex held */
static int __do_tune_cpucache(struct kmem_cache *cachep, int limit,
				int batchcount, int shared, gfp_t gfp)
{
	struct ccupdate_struct *new;
	/*
	 * For reference, the structure used here:
	 *
	 *	struct ccupdate_struct {
	 *		struct kmem_cache *cachep;
	 *		struct array_cache *new[0];
	 *	};
	 */
	int i;

	new = kzalloc(sizeof(*new) + nr_cpu_ids * sizeof(struct array_cache *),
		      gfp);	/* new is freed before returning; it only acts as a staging area */
	if (!new)
		return -ENOMEM;

	for_each_online_cpu(i) {
		new->new[i] = alloc_arraycache(cpu_to_mem(i), limit,
						batchcount, gfp);
		if (!new->new[i]) {
			for (i--; i >= 0; i--)
				kfree(new->new[i]);
			kfree(new);
			return -ENOMEM;
		}
	}
	new->cachep = cachep;

	on_each_cpu(do_ccupdate_local, (void *)new, 1);	/* key point: run do_ccupdate_local on every CPU */

	check_irq_on();
	cachep->batchcount = batchcount;
	cachep->limit = limit;
	cachep->shared = shared;

	for_each_online_cpu(i) {
		struct array_cache *ccold = new->new[i];
		if (!ccold)
			continue;
		spin_lock_irq(&cachep->nodelists[cpu_to_mem(i)]->list_lock);
		free_block(cachep, ccold->entry, ccold->avail, cpu_to_mem(i));
		spin_unlock_irq(&cachep->nodelists[cpu_to_mem(i)]->list_lock);
		kfree(ccold);
	}
	kfree(new);
	return alloc_kmemlist(cachep, gfp);
}
```
Let's see what do_ccupdate_local does:
```c
static void do_ccupdate_local(void *info)
{
	struct ccupdate_struct *new = info;
	struct array_cache *old;

	check_irq_off();
	old = cpu_cache_get(new->cachep);

	/*
	 * new->cachep already points at our cache, so this redirects the
	 * cache's per-CPU array to the freshly allocated one ...
	 */
	new->cachep->array[smp_processor_id()] = new->new[smp_processor_id()];
	/* ... and hands the old array back through the same slot. */
	new->new[smp_processor_id()] = old;
}
```

The new arrays themselves were initialized when they were allocated, by alloc_arraycache in the previous function:

```c
static struct array_cache *alloc_arraycache(int node, int entries,
					    int batchcount, gfp_t gfp)
{
	int memsize = sizeof(void *) * entries + sizeof(struct array_cache);
	struct array_cache *nc = NULL;

	nc = kmalloc_node(memsize, gfp, node);
	/*
	 * The array_cache structures contain pointers to free object.
	 * However, when such objects are allocated or transferred to another
	 * cache the pointers are not cleared and they could be counted as
	 * valid references during a kmemleak scan. Therefore, kmemleak must
	 * not scan such objects.
	 */
	kmemleak_no_scan(nc);
	if (nc) {
		nc->avail = 0;
		nc->limit = entries;
		nc->batchcount = batchcount;	/* here ac->batchcount gets its real value */
		nc->touched = 0;
		spin_lock_init(&nc->lock);
	}
	return nc;
}
```
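The detail worth pausing on is the lockless swap: on_each_cpu() runs do_ccupdate_local on every CPU with local interrupts off (hence the check_irq_off()), so each CPU replaces its own array pointer without taking any lock, and the old array travels back to the initiator through the same new->new[cpu] slot, where free_block() drains it and kfree() releases it. A minimal sketch of that exchange (hypothetical model types, not the kernel's):

```c
#include <stdio.h>

#define NCPU 4	/* assumed CPU count for this model */

/* Hypothetical miniatures of the structures in mm/slab.c. */
struct mini_array { int batchcount; };

struct mini_cache { struct mini_array *array[NCPU]; };

struct mini_update {
	struct mini_cache *cachep;
	struct mini_array *new[NCPU];
};

/*
 * Models do_ccupdate_local(): runs on each CPU in turn (the kernel uses
 * on_each_cpu() with IRQs off, so the exchange needs no lock). Publish
 * the new array and keep the old one in the same slot so the initiator
 * can drain and free it afterwards.
 */
static void swap_on_cpu(struct mini_update *u, int cpu)
{
	struct mini_array *old = u->cachep->array[cpu];
	u->cachep->array[cpu] = u->new[cpu];
	u->new[cpu] = old;	/* handed back for free_block()/kfree() */
}

int main(void)
{
	struct mini_array boot[NCPU], tuned[NCPU];
	struct mini_cache cache;
	struct mini_update upd = { .cachep = &cache };

	for (int i = 0; i < NCPU; i++) {
		boot[i].batchcount = 1;		/* setup_cpu_cache() value */
		tuned[i].batchcount = 60;	/* enable_cpucache() value */
		cache.array[i] = &boot[i];
		upd.new[i] = &tuned[i];
	}

	for (int cpu = 0; cpu < NCPU; cpu++)	/* stands in for on_each_cpu() */
		swap_on_cpu(&upd, cpu);

	/* The cache now sees batchcount 60; upd.new[] holds the old arrays. */
	printf("cpu0 batchcount=%d, old=%d\n",
	       cache.array[0]->batchcount, upd.new[0]->batchcount);
	return 0;
}
```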
This closes the loop back to cache_alloc_refill: from this point on, cpu_cache_get returns the tuned array, so ac->batchcount is no longer 1.
We can check this against the slab information of a real kernel running the slab allocator. Note how the tunables columns (limit, batchcount, sharedfactor) match the heuristic above; with CONFIG_SLAB these values can even be changed at runtime by writing "name limit batchcount shared" to /proc/slabinfo:
```
cat /proc/slabinfo
slabinfo - version: 2.1
# name            <active_objs> <num_objs> <objsize> <objperslab> <pagesperslab> : tunables <limit> <batchcount> <sharedfactor> : slabdata <active_slabs> <num_slabs> <sharedavail>
nf_conntrack_expect      0      0    152   26    1 : tunables  120   60    8 : slabdata      0      0      0
nf_conntrack_8050c5f0    2     26    296   13    1 : tunables   54   27    8 : slabdata      2      2      0
bridge_fdb_cache         4     78     48   78    1 : tunables  120   60    8 : slabdata      1      1      0
fib6_nodes              12    113     32  113    1 : tunables  120   60    8 : slabdata      1      1      0
ip6_dst_cache           25     57    208   19    1 : tunables  120   60    8 : slabdata      3      3      0
ip6_mrt_cache            0      0    112   35    1 : tunables  120   60    8 : slabdata      0      0      0
RAWv6                    8     15    720    5    1 : tunables   54   27    8 : slabdata      3      3      0
UDPLITEv6                0      0    688   11    2 : tunables   54   27    8 : slabdata      0      0      0
UDPv6                    3     22    688   11    2 : tunables   54   27    8 : slabdata      2      2      0
tw_sock_TCPv6            0      0    144   27    1 : tunables  120   60    8 : slabdata      0      0      0
request_sock_TCPv6       0      0    112   35    1 : tunables  120   60    8 : slabdata      0      0      0
TCPv6                    5      6   1328    3    1 : tunables   24   12    8 : slabdata      2      2      0
ubi_wl_entry_slab      463    580     24  145    1 : tunables  120   60    8 : slabdata      4      4      0
sd_ext_cdb               2    113     32  113    1 : tunables  120   60    8 : slabdata      1      1      0
fuse_request             0      0    384   10    1 : tunables   54   27    8 : slabdata      0      0      0
fuse_inode               0      0    416    9    1 : tunables   54   27    8 : slabdata      0      0      0
jffs2_inode_cache       15    145     24  145    1 : tunables  120   60    8 : slabdata      1      1      0
jffs2_node_frag        130    290     24  145    1 : tunables  120   60    8 : slabdata      2      2      0
uid_cache                0      0     48   78    1 : tunables  120   60    8 : slabdata      0      0      0
UNIX                    24     32    480    8    1 : tunables   54   27    8 : slabdata      4      4      0
ip_mrt_cache             0      0     96   40    1 : tunables  120   60    8 : slabdata      0      0      0
UDP-Lite                 0      0    560    7    1 : tunables   54   27    8 : slabdata      0      0      0
tcp_bind_bucket          6    113     32  113    1 : tunables  120   60    8 : slabdata      1      1      0
inet_peer_cache          8     24    160   24    1 : tunables  120   60    8 : slabdata      1      1      0
ip_fib_trie              7    113     32  113    1 : tunables  120   60    8 : slabdata      1      1      0
ip_fib_alias             8    145     24  145    1 : tunables  120   60    8 : slabdata      1      1      0
ip_dst_cache             6     27    144   27    1 : tunables  120   60    8 : slabdata      1      1      0
PING                     0      0    528    7    1 : tunables   54   27    8 : slabdata      0      0      0
RAW                      4      7    544    7    1 : tunables   54   27    8 : slabdata      1      1      0
UDP                     13     14    560    7    1 : tunables   54   27    8 : slabdata      2      2      0
tw_sock_TCP              0      0    112   35    1 : tunables  120   60    8 : slabdata      0      0      0
request_sock_TCP         0      0     80   48    1 : tunables  120   60    8 : slabdata      0      0      0
TCP                      1      6   1184    6    2 : tunables   24   12    8 : slabdata      1      1      0
......
size-2048(DMA)           0      0   2048    2    1 : tunables   24   12    8 : slabdata      0      0      0
size-2048              192    192   2048    2    1 : tunables   24   12    8 : slabdata     96     96      0
size-1024(DMA)           0      0   1024    4    1 : tunables   54   27    8 : slabdata      0      0      0
size-1024              215    216   1024    4    1 : tunables   54   27    8 : slabdata     54     54      0
size-512(DMA)            0      0    512    8    1 : tunables   54   27    8 : slabdata      0      0      0
size-512               601    624    512    8    1 : tunables   54   27    8 : slabdata     78     78      0
size-256(DMA)            0      0    256   15    1 : tunables  120   60    8 : slabdata      0      0      0
size-256              1234   1245    256   15    1 : tunables  120   60    8 : slabdata     83     83      0
size-192(DMA)            0      0    256   15    1 : tunables  120   60    8 : slabdata      0      0      0
size-192               287    300    256   15    1 : tunables  120   60    8 : slabdata     20     20      0
size-128(DMA)            0      0    128   30    1 : tunables  120   60    8 : slabdata      0      0      0
size-128              1890   1890    128   30    1 : tunables  120   60    8 : slabdata     63     63      0
size-96(DMA)             0      0    128   30    1 : tunables  120   60    8 : slabdata      0      0      0
size-96                930    930    128   30    1 : tunables  120   60    8 : slabdata     31     31      0
size-64(DMA)             0      0    128   30    1 : tunables  120   60    8 : slabdata      0      0      0
size-32(DMA)             0      0    128   30    1 : tunables  120   60    8 : slabdata      0      0      0
size-64               1577   1650    128   30    1 : tunables  120   60    8 : slabdata     55     55      0
size-32               6213   6300    128   30    1 : tunables  120   60    8 : slabdata    210    210      0
kmem_cache             150    160     96   40    1 : tunables  120   60    8 : slabdata      4      4      0
```
You may notice that on an Ubuntu system the limit and batchcount columns show 0; that is because it uses the slub allocator, where slub.c defines the function as empty:
```c
void __init kmem_cache_init_late(void)
{
}
```
As an aside, here is a brief rundown of the differences between slab, slub, and slob (for the actual implementations, see mm/slab.c, mm/slub.c, and mm/slob.c in the kernel sources):
slab is the foundation on which both slub and slob build.

SLOB targets embedded systems, mainly those with very limited memory, say 32MB or less; it pays little attention to large SMP machines, although there have been some small improvements on that front recently.

The SLUB allocator was written to replace the slab code. By dropping slab's many queues and their associated overhead and simplifying the slab structure, SLUB promises better performance and better system scalability while keeping the existing slab allocator interface.
With all that said, let's close with a diagram that gives a simple picture of the slab mechanism: