This post covers only physical memory management in the Linux kernel. I have read plenty of material explaining memory management, but whenever I tried to summarize it myself I never knew where to start, so let's begin with the actual physical memory allocation interfaces.
kmalloc allocates physically contiguous memory and does not zero it. How large a single allocation can be is bounded by KMALLOC_MAX_SIZE (see kmalloc_sizes.h below). By default it allocates from low memory, i.e. ZONE_NORMAL; with GFP_DMA it can draw from ZONE_DMA, but it never hands out ZONE_HIGHMEM pages. In the usual configuration physical memory is divided into three zones: DMA, NORMAL and HIGHMEM.
kmalloc is built on top of the slab allocator: it allocates from a kmem_cache, and the cache in turn hands out objects (objs) from its slabs.
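As a quick illustration of the interface (this snippet is mine and not part of the kernel code analyzed below; the example_* names are made up), a typical caller looks like this:

#include <linux/slab.h>

static char *buf;

static int example_alloc(void)
{
	/* 128 physically contiguous bytes; NOT zeroed - use kzalloc() if zeroing is needed */
	buf = kmalloc(128, GFP_KERNEL);
	if (!buf)
		return -ENOMEM;
	return 0;
}

static void example_free(void)
{
	kfree(buf);
	buf = NULL;
}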
Before we start analyzing the kmalloc function, a quick overview of the Linux kernel's physical memory allocation APIs:
__get_free_pages() calls alloc_pages(); it cannot allocate from ZONE_HIGHMEM and it returns 2^order physically contiguous pages as a kernel virtual address. Its single-page form is __get_free_page(), and get_zeroed_page() additionally zeroes the returned page. To allocate from the DMA zone there is __get_dma_pages(), which is also just a wrapper around __get_free_pages().
Finally, the lowest-level interface, alloc_pages(), can allocate from any zone, provided the corresponding flags are set.
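A sketch of how these page-level interfaces are typically used (my own example, not taken from the sources analyzed below):

#include <linux/gfp.h>

static int example_pages(void)
{
	unsigned long addr;

	/* 2^2 = 4 contiguous low-memory pages, returned as a kernel virtual address */
	addr = __get_free_pages(GFP_KERNEL, 2);
	if (!addr)
		return -ENOMEM;
	free_pages(addr, 2);

	/* a single page whose contents are zero-filled */
	addr = get_zeroed_page(GFP_KERNEL);
	if (!addr)
		return -ENOMEM;
	free_page(addr);

	return 0;
}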
Reference kernel: linux 3.18.13
Reference books: Linux Kernel Development, Linux Device Drivers, and 《深入理解linux裝置驅動核心機制》.
Now, on to kmalloc. (The GFP flags used at allocation time are not discussed here; see the references for details.)
Let's start with the header file:
#include <linux/slab.h>
Which implementation is picked is decided in slab.h:
#ifdef CONFIG_SLUB
#include <linux/slub_def.h>
#elif defined(CONFIG_SLOB)
#include <linux/slob_def.h>
#else
#include <linux/slab_def.h>
#endif
Here we follow the classic SLAB configuration, i.e. #include <linux/slab_def.h>, which defines kmalloc() as follows:
static __always_inline void *kmalloc(size_t size, gfp_t flags)
{
	struct kmem_cache *cachep;
	void *ret;

	if (__builtin_constant_p(size)) {
		int i = 0;

		if (!size)
			return ZERO_SIZE_PTR;

#define CACHE(x) \
		if (size <= x) \
			goto found; \
		else \
			i++;
#include <linux/kmalloc_sizes.h>	/* find which size range the request falls in; sizes start at 32 and keep doubling, i is incremented at each step */
#undef CACHE
		return NULL;
found:
#ifdef CONFIG_ZONE_DMA
		if (flags & GFP_DMA)
			cachep = malloc_sizes[i].cs_dmacachep;	/* with CONFIG_ZONE_DMA and GFP_DMA set, allocate from the DMA cache; malloc_sizes[] is initialized in slab.c and is worth a closer look */
		else
#endif
			cachep = malloc_sizes[i].cs_cachep;	/* allocate from the matching general cache, wasting as little space as possible */

		ret = kmem_cache_alloc_trace(cachep, flags, size);

		return ret;
	}
	return __kmalloc(size, flags);
}
For completeness, here is kmalloc_sizes.h:
#if (PAGE_SIZE == 4096)
	CACHE(32)
#endif
	CACHE(64)
#if L1_CACHE_BYTES < 64
	CACHE(96)
#endif
	CACHE(128)
#if L1_CACHE_BYTES < 128
	CACHE(192)
#endif
	CACHE(256)
	CACHE(512)
	CACHE(1024)
	CACHE(2048)
	CACHE(4096)
	CACHE(8192)
	CACHE(16384)
	CACHE(32768)
	CACHE(65536)
	CACHE(131072)
#if KMALLOC_MAX_SIZE >= 262144
	CACHE(262144)
#endif
#if KMALLOC_MAX_SIZE >= 524288
	CACHE(524288)
#endif
#if KMALLOC_MAX_SIZE >= 1048576
	CACHE(1048576)
#endif
#if KMALLOC_MAX_SIZE >= 2097152
	CACHE(2097152)
#endif
#if KMALLOC_MAX_SIZE >= 4194304
	CACHE(4194304)
#endif
#if KMALLOC_MAX_SIZE >= 8388608
	CACHE(8388608)
#endif
#if KMALLOC_MAX_SIZE >= 16777216
	CACHE(16777216)
#endif
#if KMALLOC_MAX_SIZE >= 33554432
	CACHE(33554432)
#endif
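To see how the CACHE(x) expansion picks a size class, here is a toy userspace re-implementation of mine (the class table is simplified and assumed; it omits the conditional 96/192-byte caches): the first class with x >= size wins, so kmalloc(100) is served from the 128-byte cache.

#include <stdio.h>

static const unsigned int classes[] = {
	32, 64, 128, 256, 512, 1024, 2048, 4096
};

static unsigned int size_class(unsigned int size)
{
	unsigned int i;

	for (i = 0; i < sizeof(classes) / sizeof(classes[0]); i++)
		if (size <= classes[i])
			return classes[i];	/* first class large enough */
	return 0;				/* larger requests: handled by the bigger caches above */
}

int main(void)
{
	printf("kmalloc(100) -> %u-byte cache\n", size_class(100));	/* 128 */
	printf("kmalloc(513) -> %u-byte cache\n", size_class(513));	/* 1024 */
	return 0;
}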
The beginning of the function deserves a few remarks:
__builtin_constant_p is a GCC built-in that tests whether a value is a compile-time constant: it returns 1 if it is and 0 otherwise. Its typical use is manual compile-time optimization inside macros. So when size is a compile-time constant, kmalloc walks the size table inline and picks a cache directly; only when size is not a constant does it fall back to __kmalloc(size, flags).
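A small userspace sketch of my own showing what __builtin_constant_p() evaluates to (the result for the variable case depends on the optimization level):

#include <stdio.h>

int main(void)
{
	int n = 100;

	/* a literal is a compile-time constant: prints 1 */
	printf("%d\n", __builtin_constant_p(128));
	/* a variable is usually not: typically prints 0 (may be 1 if the
	 * compiler propagates the constant at higher optimization levels) */
	printf("%d\n", __builtin_constant_p(n));
	return 0;
}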
Having found which general-purpose cache the requested size falls into, kmalloc then calls kmem_cache_alloc_trace():
#ifdef CONFIG_TRACING
extern void *kmem_cache_alloc_trace(struct kmem_cache *, gfp_t, size_t);
#else
static __always_inline void *
kmem_cache_alloc_trace(struct kmem_cache *cachep, gfp_t flags, size_t size)
{
	return kmem_cache_alloc(cachep, flags);
}
#endif
As the #else branch shows, without CONFIG_TRACING this simply forwards to kmem_cache_alloc(), so let's look at that:
/**
 * kmem_cache_alloc - Allocate an object
 * @cachep: The cache to allocate from.
 * @flags: See kmalloc().
 *
 * Allocate an object from this cache.  The flags are only relevant
 * if the cache has no available objects.
 */
void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
{
	void *ret = slab_alloc(cachep, flags, _RET_IP_);

	trace_kmem_cache_alloc(_RET_IP_, ret,	/* used for tracing/debugging */
			       cachep->object_size, cachep->size, flags);

	return ret;
}
The actual allocation is done by slab_alloc():
static __always_inline void *
slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller)
{
	unsigned long save_flags;
	void *objp;

	flags &= gfp_allowed_mask;	/* see the comment on gfp_allowed_mask in gfp.h, quoted here: */
	/*
	 * gfp_allowed_mask is set to GFP_BOOT_MASK during early boot to restrict
	 * what GFP flags are used before interrupts are enabled. Once interrupts
	 * are enabled, it is set to __GFP_BITS_MASK while the system is running.
	 * During hibernation, it is used by PM to avoid I/O during memory
	 * allocation while devices are suspended.
	 *
	 *	extern gfp_t gfp_allowed_mask;
	 */

	lockdep_trace_alloc(flags);	/* debugging only */

	if (slab_should_failslab(cachep, flags))
		return NULL;

	cachep = memcg_kmem_get_cache(cachep, flags);

	cache_alloc_debugcheck_before(cachep, flags);
	local_irq_save(save_flags);
	objp = __do_cache_alloc(cachep, flags);
	local_irq_restore(save_flags);
	objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller);
	kmemleak_alloc_recursive(objp, cachep->object_size, 1, cachep->flags,
				 flags);
	prefetchw(objp);

	if (likely(objp))
		kmemcheck_slab_alloc(cachep, flags, objp, cachep->object_size);

	if (unlikely((flags & __GFP_ZERO) && objp))
		memset(objp, 0, cachep->object_size);

	return objp;
}
slab_alloc() calls objp = __do_cache_alloc(cachep, flags), which, besides a few flag and debug checks, goes on to call ____cache_alloc(cachep, flags).
This is the common allocation path (there are NUMA and UMA variants; Linux takes the UMA path unless NUMA is configured):
static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
{
	void *objp;
	struct array_cache *ac;
	bool force_refill = false;

	check_irq_off();

	ac = cpu_cache_get(cachep);
	if (likely(ac->avail)) {
		ac->touched = 1;
		objp = ac_get_obj(cachep, ac, flags, false);

		/*
		 * Allow for the possibility all avail objects are not allowed
		 * by the current flags
		 */
		if (objp) {
			STATS_INC_ALLOCHIT(cachep);
			goto out;
		}
		force_refill = true;
	}

	STATS_INC_ALLOCMISS(cachep);
	objp = cache_alloc_refill(cachep, flags, force_refill);
	/*
	 * the 'ac' may be updated by cache_alloc_refill(),
	 * and kmemleak_erase() requires its correct value.
	 */
	ac = cpu_cache_get(cachep);

out:
	/*
	 * To avoid a false negative, if an object that is in one of the
	 * per-CPU caches is leaked, we need to make sure kmemleak doesn't
	 * treat the array pointers as a reference to the object.
	 */
	if (objp)
		kmemleak_erase(&ac->entry[ac->avail]);
	return objp;
}
Here we assume this is the first allocation from the cache. Based on how malloc_sizes[] is initialized in kmem_cache_init(), the kmalloc_cache pointer returned to kmalloc points to a cache that was set up by the following function:
static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
{
	if (slab_state >= FULL)
		return enable_cpucache(cachep, gfp);

	if (slab_state == DOWN) {
		/*
		 * Note: Creation of first cache (kmem_cache).
		 * The setup_list3s is taken care
		 * of by the caller of __kmem_cache_create
		 */
		cachep->array[smp_processor_id()] = &initarray_generic.cache;
		slab_state = PARTIAL;
	} else if (slab_state == PARTIAL) {
		/*
		 * Note: the second kmem_cache_create must create the cache
		 * that's used by kmalloc(24), otherwise the creation of
		 * further caches will BUG().
		 */
		cachep->array[smp_processor_id()] = &initarray_generic.cache;

		/*
		 * If the cache that's used by kmalloc(sizeof(kmem_list3)) is
		 * the second cache, then we need to set up all its list3s,
		 * otherwise the creation of further caches will BUG().
		 */
		set_up_list3s(cachep, SIZE_AC);
		if (INDEX_AC == INDEX_L3)
			slab_state = PARTIAL_L3;
		else
			slab_state = PARTIAL_ARRAYCACHE;
	} else {
		/* Remaining boot caches */
		cachep->array[smp_processor_id()] =
			kmalloc(sizeof(struct arraycache_init), gfp);

		if (slab_state == PARTIAL_ARRAYCACHE) {
			set_up_list3s(cachep, SIZE_L3);
			slab_state = PARTIAL_L3;
		} else {
			int node;
			for_each_online_node(node) {
				cachep->nodelists[node] =
				    kmalloc_node(sizeof(struct kmem_list3),
						gfp, node);
				BUG_ON(!cachep->nodelists[node]);
				kmem_list3_init(cachep->nodelists[node]);
			}
		}
	}
	cachep->nodelists[numa_mem_id()]->next_reap =
			jiffies + REAPTIMEOUT_LIST3 +
			((unsigned long)cachep) % REAPTIMEOUT_LIST3;

	cpu_cache_get(cachep)->avail = 0;
	cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES;	/* BOOT_CPUCACHE_ENTRIES == 1 */
	cpu_cache_get(cachep)->batchcount = 1;
	cpu_cache_get(cachep)->touched = 0;
	cachep->batchcount = 1;
	cachep->limit = BOOT_CPUCACHE_ENTRIES;
	return 0;
}
Whatever the per-CPU array (cachep->array) ends up pointing to, its avail, limit, batchcount and touched fields are always initialized at the end of setup_cpu_cache().
Back in ____cache_alloc(): if the per-CPU array has no available object (ac->avail == 0), cache_alloc_refill() is called; if it does have one, the object pointer is returned directly from the array.
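Conceptually, the per-CPU array_cache is a LIFO stack of object pointers; the following toy sketch of mine (simplified, not kernel code) captures the fast path that ____cache_alloc() takes when ac->avail is non-zero:

#include <stddef.h>

struct toy_array_cache {
	unsigned int avail;		/* number of cached object pointers */
	unsigned int limit;		/* capacity of entry[] */
	void *entry[16];		/* most recently freed objects sit on top */
};

static void *toy_fast_alloc(struct toy_array_cache *ac)
{
	if (ac->avail)
		return ac->entry[--ac->avail];	/* hit: pop the top pointer */
	return NULL;				/* miss: the real code calls cache_alloc_refill() */
}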
static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags,
							bool force_refill)
{
	int batchcount;
	struct kmem_list3 *l3;
	struct array_cache *ac;
	int node;

	check_irq_off();
	node = numa_mem_id();
	if (unlikely(force_refill))
		goto force_grow;
retry:
	ac = cpu_cache_get(cachep);
	batchcount = ac->batchcount;
	if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
		/*
		 * If there was little recent activity on this cache, then
		 * perform only a partial refill.  Otherwise we could generate
		 * refill bouncing.
		 */
		batchcount = BATCHREFILL_LIMIT;
	}
	l3 = cachep->nodelists[node];

	BUG_ON(ac->avail > 0 || !l3);
	spin_lock(&l3->list_lock);

	/* See if we can refill from the shared array */
	if (l3->shared && transfer_objects(ac, l3->shared, batchcount)) {
		l3->shared->touched = 1;
		goto alloc_done;
	}

	while (batchcount > 0) {
		struct list_head *entry;
		struct slab *slabp;
		/* Get slab alloc is to come from. */
		entry = l3->slabs_partial.next;
		if (entry == &l3->slabs_partial) {
			l3->free_touched = 1;
			entry = l3->slabs_free.next;
			if (entry == &l3->slabs_free)
				goto must_grow;
		}

		slabp = list_entry(entry, struct slab, list);
		check_slabp(cachep, slabp);
		check_spinlock_acquired(cachep);

		/*
		 * The slab was either on partial or free list so
		 * there must be at least one object available for
		 * allocation.
		 */
		BUG_ON(slabp->inuse >= cachep->num);

		while (slabp->inuse < cachep->num && batchcount--) {
			STATS_INC_ALLOCED(cachep);
			STATS_INC_ACTIVE(cachep);
			STATS_SET_HIGH(cachep);

			ac_put_obj(cachep, ac, slab_get_obj(cachep, slabp,
								node));
		}
		check_slabp(cachep, slabp);

		/* move slabp to correct slabp list: */
		list_del(&slabp->list);
		if (slabp->free == BUFCTL_END)
			list_add(&slabp->list, &l3->slabs_full);
		else
			list_add(&slabp->list, &l3->slabs_partial);
	}

must_grow:
	l3->free_objects -= ac->avail;
alloc_done:
	spin_unlock(&l3->list_lock);

	if (unlikely(!ac->avail)) {
		int x;
force_grow:
		x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL);	/* cache_grow() returns 1 on success */

		/* cache_grow can reenable interrupts, then ac could change. */
		ac = cpu_cache_get(cachep);
		node = numa_mem_id();

		/* no objects in sight? abort */
		if (!x && (ac->avail == 0 || force_refill))
			return NULL;

		if (!ac->avail)		/* objects refilled by interrupt? */
			goto retry;
	}
	ac->touched = 1;

	return ac_get_obj(cachep, ac, flags, force_refill);
}
Since this is the first use, all the slab lists on the nodelist are empty, so we hit must_grow and call cache_grow().
cache_grow() first computes the slab colour offset, then calls kmem_getpages() to allocate pages, the count being determined by cachep->gfporder; kmem_getpages() returns the virtual address of the allocated pages.
/*
 * Grow (by 1) the number of slabs within a cache.  This is called by
 * kmem_cache_alloc() when there are no active objs left in a cache.
 */
static int cache_grow(struct kmem_cache *cachep,
		gfp_t flags, int nodeid, void *objp)
{
	struct slab *slabp;
	size_t offset;
	gfp_t local_flags;
	struct kmem_list3 *l3;

	/*
	 * Be lazy and only check for valid flags here,  keeping it out of the
	 * critical path in kmem_cache_alloc().
	 */
	BUG_ON(flags & GFP_SLAB_BUG_MASK);
	local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);

	/* Take the l3 list lock to change the colour_next on this node */
	check_irq_off();
	l3 = cachep->nodelists[nodeid];
	spin_lock(&l3->list_lock);

	/* Get colour for the slab, and cal the next value. */
	offset = l3->colour_next;		/* default 0 */
	l3->colour_next++;
	if (l3->colour_next >= cachep->colour)
		l3->colour_next = 0;
	spin_unlock(&l3->list_lock);

	offset *= cachep->colour_off;		/* the first time, offset is 0 */
	if (local_flags & __GFP_WAIT)
		local_irq_enable();

	/*
	 * The test for missing atomic flag is performed here, rather than
	 * the more obvious place, simply to reduce the critical path length
	 * in kmem_cache_alloc(). If a caller is seriously mis-behaving they
	 * will eventually be caught here (where it matters).
	 */
	kmem_flagcheck(cachep, flags);

	/*
	 * Get mem for the objs.  Attempt to allocate a physical page from
	 * 'nodeid'.
	 */
	if (!objp)
		objp = kmem_getpages(cachep, local_flags, nodeid);
	if (!objp)
		goto failed;

	/* Get slab management. */
	slabp = alloc_slabmgmt(cachep, objp, offset,
			local_flags & ~GFP_CONSTRAINT_MASK, nodeid);
	if (!slabp)
		goto opps1;

	slab_map_pages(cachep, slabp, objp);

	cache_init_objs(cachep, slabp);

	if (local_flags & __GFP_WAIT)
		local_irq_disable();
	check_irq_off();
	spin_lock(&l3->list_lock);

	/* Make slab active. */
	list_add_tail(&slabp->list, &(l3->slabs_free));	/* add the newly created slab to the node's slabs_free list */
	STATS_INC_GROWN(cachep);
	l3->free_objects += cachep->num;	/* account for the new free objects: each slab holds cachep->num objs */
	spin_unlock(&l3->list_lock);
	return 1;
opps1:
	kmem_freepages(cachep, objp);
failed:
	if (local_flags & __GFP_WAIT)
		local_irq_disable();
	return 0;
}
Slab colouring is about the hardware cache: by giving each slab a slightly different starting offset (a cache-line issue), it tries to avoid conflict misses and improve hit rates. See 《深入理解計算機系統》 (Computer Systems: A Programmer's Perspective) for the background.
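To make the colouring arithmetic concrete, here is an illustrative calculation of mine (the colour_off and colour values are assumed, not taken from a real cache): successive slabs shift their first object by one cache line each, wrapping around exactly as cache_grow() does with l3->colour_next.

#include <stdio.h>

int main(void)
{
	unsigned int colour_off = 32;	/* assumed L1 cache line size */
	unsigned int colour = 4;	/* assumed: left_over / colour_off */
	unsigned int colour_next = 0;
	int slab;

	for (slab = 0; slab < 6; slab++) {
		unsigned int offset = colour_next * colour_off;

		printf("slab %d: first object at offset %u\n", slab, offset);
		if (++colour_next >= colour)
			colour_next = 0;	/* wrap, as cache_grow() does */
	}
	return 0;
}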
The concrete steps are in alloc_slabmgmt():
/*
 * Get the memory for a slab management obj.
 * For a slab cache when the slab descriptor is off-slab, slab descriptors
 * always come from malloc_sizes caches.  The slab descriptor cannot
 * come from the same cache which is getting created because,
 * when we are searching for an appropriate cache for these
 * descriptors in kmem_cache_create, we search through the malloc_sizes array.
 * If we are creating a malloc_sizes cache here it would not be visible to
 * kmem_find_general_cachep till the initialization is complete.
 * Hence we cannot have slabp_cache same as the original cache.
 */
static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp,
				   int colour_off, gfp_t local_flags,
				   int nodeid)
{
	struct slab *slabp;

	if (OFF_SLAB(cachep)) {
		/*
		 * On the OFF_SLAB question: CFLGS_OFF_SLAB is decided in
		 * __kmem_cache_create():
		 *
		 *	// Determine if the slab management is 'on' or 'off' slab.
		 *	// (bootstrapping cannot cope with offslab caches so don't do
		 *	// it too early on. Always use on-slab management when
		 *	// SLAB_NOLEAKTRACE to avoid recursive calls into kmemleak)
		 *	if ((size >= (PAGE_SIZE >> 3)) && !slab_early_init &&
		 *	    !(flags & SLAB_NOLEAKTRACE))
		 *		// Size is large, assume best to place the slab
		 *		// management obj off-slab (should allow better
		 *		// packing of objs).
		 *		flags |= CFLGS_OFF_SLAB;
		 */
		/* Slab management obj is off-slab. */
		slabp = kmem_cache_alloc_node(cachep->slabp_cache,
					      local_flags, nodeid);
		/*
		 * If the first object in the slab is leaked (it's allocated
		 * but no one has a reference to it), we want to make sure
		 * kmemleak does not treat the ->s_mem pointer as a reference
		 * to the object. Otherwise we will not report the leak.
		 */
		kmemleak_scan_area(&slabp->list, sizeof(struct list_head),
				   local_flags);
		if (!slabp)
			return NULL;
	} else {
		slabp = objp + colour_off;
		/*
		 * In __kmem_cache_create():
		 *	cachep->colour_off = cache_line_size();
		 *	(cache.h: #define cache_line_size() L1_CACHE_BYTES, typically 32 bytes)
		 *	cachep->colour = left_over / cachep->colour_off;
		 */
		colour_off += cachep->slab_size;
	}
	slabp->inuse = 0;			/* num of objs active in slab */
	slabp->colouroff = colour_off;		/* offset of the first obj relative to the page address */
	slabp->s_mem = objp + colour_off;	/* address of the first obj */
	slabp->nodeid = nodeid;
	slabp->free = 0;
	return slabp;
}
Now let's look at another important operation, cache_init_objs():
static void cache_init_objs(struct kmem_cache *cachep,
			    struct slab *slabp)
{
	int i;

	for (i = 0; i < cachep->num; i++) {
		void *objp = index_to_obj(cachep, slabp, i);
#if DEBUG
		/* need to poison the objs? */
		if (cachep->flags & SLAB_POISON)
			poison_obj(cachep, objp, POISON_FREE);
		if (cachep->flags & SLAB_STORE_USER)
			*dbg_userword(cachep, objp) = NULL;

		if (cachep->flags & SLAB_RED_ZONE) {
			*dbg_redzone1(cachep, objp) = RED_INACTIVE;
			*dbg_redzone2(cachep, objp) = RED_INACTIVE;
		}
		/*
		 * Constructors are not allowed to allocate memory from the same
		 * cache which they are a constructor for.  Otherwise, deadlock.
		 * They must also be threaded.
		 */
		if (cachep->ctor && !(cachep->flags & SLAB_POISON))
			cachep->ctor(objp + obj_offset(cachep));

		if (cachep->flags & SLAB_RED_ZONE) {
			if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
				slab_error(cachep, "constructor overwrote the"
					   " end of an object");
			if (*dbg_redzone1(cachep, objp) != RED_INACTIVE)
				slab_error(cachep, "constructor overwrote the"
					   " start of an object");
		}
		if ((cachep->size % PAGE_SIZE) == 0 &&
			    OFF_SLAB(cachep) && cachep->flags & SLAB_POISON)
			kernel_map_pages(virt_to_page(objp),
					 cachep->size / PAGE_SIZE, 0);
#else
		if (cachep->ctor)
			cachep->ctor(objp);	/* initialize the object with its constructor, if any */
#endif
		slab_bufctl(slabp)[i] = i + 1;	/* init the bufctl array to 1, 2, 3, ...; the last entry becomes BUFCTL_END below */
	}
	slab_bufctl(slabp)[i - 1] = BUFCTL_END;
}
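After cache_init_objs() runs, slab_bufctl(slabp)[i] holds the index of the next free object, so with slabp->free starting at 0 the free objects form the chain 0 -> 1 -> 2 -> ... -> num-1 -> BUFCTL_END. Below is a toy sketch of mine of how an object index is taken off that list (in the kernel the real work is done by slab_get_obj() in slab.c):

#define TOY_BUFCTL_END	((unsigned int)~0U)	/* stand-in for BUFCTL_END */

/* pop the first free object index and advance the free-list head */
static unsigned int toy_get_free_obj(unsigned int *bufctl, unsigned int *free)
{
	unsigned int idx = *free;

	if (idx != TOY_BUFCTL_END)
		*free = bufctl[idx];	/* the next free index becomes the new head */
	return idx;
}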