kernel: kmalloc

1 源码

kmalloc 申请不超过8k则使用slab，再大就走伙伴系统

/**
 * kmalloc - allocate memory
 * @size: how many bytes of memory are required.
 * @flags: the type of memory to allocate.
 *
 * kmalloc is the normal method of allocating memory
 * for objects smaller than page size in the kernel.
 *
 * The @flags argument may be one of the GFP flags defined at
 * include/linux/gfp.h and described at
 * :ref:`Documentation/core-api/mm-api.rst <mm-api-gfp-flags>`
 *
 * The recommended usage of the @flags is described at
 * :ref:`Documentation/core-api/memory-allocation.rst <memory-allocation>`
 *
 * Below is a brief outline of the most useful GFP flags
 *
 * %GFP_KERNEL
 *	Allocate normal kernel ram. May sleep.
 *
 * %GFP_NOWAIT
 *	Allocation will not sleep.
 *
 * %GFP_ATOMIC
 *	Allocation will not sleep.  May use emergency pools.
 *
 * %GFP_HIGHUSER
 *	Allocate memory from high memory on behalf of user.
 *
 * Also it is possible to set different flags by OR'ing
 * in one or more of the following additional @flags:
 *
 * %__GFP_HIGH
 *	This allocation has high priority and may use emergency pools.
 *
 * %__GFP_NOFAIL
 *	Indicate that this allocation is in no way allowed to fail
 *	(think twice before using).
 *
 * %__GFP_NORETRY
 *	If memory is not immediately available,
 *	then give up at once.
 *
 * %__GFP_NOWARN
 *	If allocation fails, don't issue any warnings.
 *
 * %__GFP_RETRY_MAYFAIL
 *	Try really hard to succeed the allocation but fail
 *	eventually.
 *
 * This inline wrapper only dispatches; the three exits below hand the
 * request to kmalloc_large(), kmem_cache_alloc_trace() or __kmalloc().
 */
static __always_inline void *kmalloc(size_t size, gfp_t flags)
{
	if (__builtin_constant_p(size)) {
#ifndef CONFIG_SLOB
		unsigned int index;
#endif
		/* Branch 1: larger than the biggest kmalloc cache. */
		if (size > KMALLOC_MAX_CACHE_SIZE)
			return kmalloc_large(size, flags);
#ifndef CONFIG_SLOB
		/* Branch 2: map the constant size to a cache index. */
		index = kmalloc_index(size);

		/* Index 0 is what kmalloc_index() returns for size == 0. */
		if (!index)
			return ZERO_SIZE_PTR;

		return kmem_cache_alloc_trace(
				kmalloc_caches[kmalloc_type(flags)][index],
				flags, size);
#endif
	}
	/*
	 * Branch 3: size is not a compile-time constant, or CONFIG_SLOB
	 * is defined and the size did not take branch 1.
	 */
	return __kmalloc(size, flags);
}

kmalloc()并不是真正实现内存分配的地方，在此处仅仅是进行了一些判断，并形成了3个分支：
第一个分支，是size大于KMALLOC_MAX_CACHE_SIZE就会调用kmalloc_large(size, flags)来继续完成分配，其中这个MAX的值是(1UL << (12 + 1))，即8KB。当使用kmalloc申请大于8K的内存时，会进入这个分支，专门用于处理大块内存（以page为粒度）的申请。这个分支的内容与buddy system有关：
将会经kmalloc_large()->kmalloc_order_trace()->kmalloc_order()->alloc_pages()，最终通过Buddy伙伴算法申请所需内存
第二个分支，要求未定义CONFIG_SLOB（即代码中的“#ifndef CONFIG_SLOB”）。CONFIG_SLOB是一个编译时配置的选项，这里我们就默认这个选项没有被定义。在此基础上，如果size没有超过KMALLOC_MAX_CACHE_SIZE，那就先调用kmalloc_index(size)计算出对应的index：

/*
 * Figure out which kmalloc slab an allocation of a certain size
 * belongs to.
 * 0 = zero alloc
 * 1 =  65 .. 96 bytes
 * 2 = 129 .. 192 bytes
 * n = 2^(n-1)+1 .. 2^n
 *
 * Sizes above 64 MiB hit BUG().
 */
static __always_inline unsigned int kmalloc_index(size_t size)
{
	if (!size)
		return 0;

	if (size <= KMALLOC_MIN_SIZE)
		return KMALLOC_SHIFT_LOW;

	/* The two non-power-of-two caches (96 and 192 bytes). */
	if (KMALLOC_MIN_SIZE <= 32 && size > 64 && size <= 96)
		return 1;
	if (KMALLOC_MIN_SIZE <= 64 && size > 128 && size <= 192)
		return 2;

	/* Power-of-two caches: the first bound that fits wins. */
	if (size <=          8) return 3;
	if (size <=         16) return 4;
	if (size <=         32) return 5;
	if (size <=         64) return 6;
	if (size <=        128) return 7;
	if (size <=        256) return 8;
	if (size <=        512) return 9;
	if (size <=       1024) return 10;
	if (size <=   2 * 1024) return 11;
	if (size <=   4 * 1024) return 12;
	if (size <=   8 * 1024) return 13;
	if (size <=  16 * 1024) return 14;
	if (size <=  32 * 1024) return 15;
	if (size <=  64 * 1024) return 16;
	if (size <= 128 * 1024) return 17;
	if (size <= 256 * 1024) return 18;
	if (size <= 512 * 1024) return 19;
	if (size <= 1024 * 1024) return 20;
	if (size <=  2 * 1024 * 1024) return 21;
	if (size <=  4 * 1024 * 1024) return 22;
	if (size <=  8 * 1024 * 1024) return 23;
	if (size <=  16 * 1024 * 1024) return 24;
	if (size <=  32 * 1024 * 1024) return 25;
	if (size <=  64 * 1024 * 1024) return 26;
	BUG();

	/* Will never be reached. Needed because the compiler may complain */
	return -1;
}

之后则根据获得的index，调用:

kmem_cache_alloc_trace(kmalloc_caches[kmalloc_type(flags)][index], flags, size);

值得注意的是，这里将index作为一个二维数组kmalloc_caches的第二个序号，而第一个则是一个由flag确定的类别。类比用户态的bin，猜想这个kmalloc_caches数组，用于维护一个类似bin的池子，这个池子的分类标准除了size以外，还有一个由flag确定的类别，每一个类别中，都有不同size对应的池子，不妨瞥一眼这些池子对于flag的分类标准吧：

/*
 * Pick the kmalloc cache type (first index into kmalloc_caches[][]) from
 * the GFP flags of an allocation.
 *
 * With CONFIG_ZONE_DMA the result is KMALLOC_NORMAL, KMALLOC_DMA or
 * KMALLOC_RECLAIM; without it only __GFP_RECLAIMABLE is examined and the
 * result is KMALLOC_RECLAIM or KMALLOC_NORMAL.
 */
static __always_inline enum kmalloc_cache_type kmalloc_type(gfp_t flags)
{
#ifdef CONFIG_ZONE_DMA
	/*
	 * The most common case is KMALLOC_NORMAL, so test for it
	 * with a single branch for both flags.
	 */
	if (likely((flags & (__GFP_DMA | __GFP_RECLAIMABLE)) == 0))
		return KMALLOC_NORMAL;

	/*
	 * At least one of the flags has to be set. If both are, __GFP_DMA
	 * is more important.
	 */
	return flags & __GFP_DMA ? KMALLOC_DMA : KMALLOC_RECLAIM;
#else
	return flags & __GFP_RECLAIMABLE ? KMALLOC_RECLAIM : KMALLOC_NORMAL;
#endif
}

对于这些分类，我暂时没打算细究。进入kmem_cache_alloc_trace()之后，使用slab_alloc()分配地址。

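至于第三个分支，即最后的__kmalloc(size, flags)：当size不是编译期常量（__builtin_constant_p(size)为假），或者定义了CONFIG_SLOB且size没有超过KMALLOC_MAX_CACHE_SIZE时，都会执行到这里，在此就先不细究了。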

以上仅仅是宏观地看看kmalloc的实现。可以发现，针对大size的请求，最后我们会使用alloc_pages()完成请求，小size则使用slab_alloc()完成，这两个申请分别对应伙伴系统和slab分配机制。我们先从大的，也就是alloc_pages()这边开始吧。

2 buddy system相关

先大致浏览一下这部分的源码：

/*
 * Large-allocation path used by kmalloc() for sizes above
 * KMALLOC_MAX_CACHE_SIZE: turn the byte count into a page order with
 * get_order() and forward the request to kmalloc_order_trace().
 */
static __always_inline void *kmalloc_large(size_t size, gfp_t flags)
{
	return kmalloc_order_trace(size, flags, get_order(size));
}

其中，需要先用get_order()计算一个order，然后再使用order去申请内存。get_order()是一个看上去有点乱的宏定义，好在注释很清楚（doge）：

/**
 * get_order - Determine the allocation order of a memory size
 * @size: The size for which to get the order
 *
 * Determine the allocation order of a particular sized block of memory.  This
 * is on a logarithmic scale, where:
 *
 *	0 -> 2^0 * PAGE_SIZE and below
 *	1 -> 2^1 * PAGE_SIZE to 2^0 * PAGE_SIZE + 1
 *	2 -> 2^2 * PAGE_SIZE to 2^1 * PAGE_SIZE + 1
 *	3 -> 2^3 * PAGE_SIZE to 2^2 * PAGE_SIZE + 1
 *	4 -> 2^4 * PAGE_SIZE to 2^3 * PAGE_SIZE + 1
 *	...
 *
 * The order returned is used to find the smallest allocation granule required
 * to hold an object of the specified size.
 *
 * The result is undefined if the size is 0.
 *
 * This function may be used to initialise variables with compile time
 * evaluations of constants.
 *
 * Note that the argument is evaluated more than once, so it must not have
 * side effects. Non-constant arguments are passed to __get_order().
 */
#define get_order(n)						\
(								\
	__builtin_constant_p(n) ? (				\
		((n) == 0UL) ? BITS_PER_LONG - PAGE_SHIFT :	\
		(((n) < (1UL << PAGE_SHIFT)) ? 0 :		\
		 ilog2((n) - 1) - PAGE_SHIFT + 1)		\
	) :							\
	__get_order(n)						\
)

继续深入，其主干在于kmalloc_order()：

/*
 * Tracing wrapper around kmalloc_order(): performs the allocation, then
 * emits a trace_kmalloc() event carrying the caller address, the returned
 * pointer, the requested size, PAGE_SIZE << order and the flags.
 *
 * Returns whatever kmalloc_order() returned.
 */
void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order)
{
	void *ptr;

	ptr = kmalloc_order(size, flags, order);
	trace_kmalloc(_RET_IP_, ptr, size, PAGE_SIZE << order, flags);

	return ptr;
}

终于找到了这个大块内存分配的重要函数，alloc_pages()：

/*
 * Allocate 2^order pages with alloc_pages() and return their address,
 * or NULL when alloc_pages() returns no page.
 *
 * __GFP_COMP is OR'ed into @flags before the allocation, and the updated
 * flags are also what the KASAN and kmemleak hooks receive.
 */
void *kmalloc_order(size_t size, gfp_t flags, unsigned int order)
{
	struct page *pg;
	void *addr = NULL;

	flags |= __GFP_COMP;
	pg = alloc_pages(flags, order);
	if (pg)
		addr = page_address(pg);

	addr = kasan_kmalloc_large(addr, size, flags);
	/* As addr might get tagged, call kmemleak hook after KASAN. */
	kmemleak_alloc(addr, size, 1, flags);

	return addr;
}

alloc_pages()再往里面的代码不全贴出来了，实在是太多了…

alloc_pages()的分配的实现，靠的是__alloc_pages_nodemask(gfp_mask, order, preferred_nid, NULL);

该函数的分配实现，又靠的是page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac);

再进一步，靠的是page = rmqueue(ac->preferred_zoneref->zone, zone, order, gfp_mask, alloc_flags, ac->migratetype);

终于，在这个rmqueue()函数，我可以看出一些buddy system的逻辑了：

/*
 * Allocate a page from the given zone. Use pcplists for order-0 allocations.
 *
 * Order-0 requests go to rmqueue_pcplist(). Larger requests take
 * zone->lock with interrupts saved, optionally try the MIGRATE_HIGHATOMIC
 * list first (ALLOC_HARDER), and otherwise use __rmqueue().
 *
 * Returns the page on success, NULL on failure.
 */
static inline
struct page *rmqueue(struct zone *preferred_zone,
			struct zone *zone, unsigned int order,
			gfp_t gfp_flags, unsigned int alloc_flags,
			int migratetype)
{
	unsigned long flags;
	struct page *page;

	if (likely(order == 0)) {
		/* Use pcplists for order-0 allocations. */
		page = rmqueue_pcplist(preferred_zone, zone, order,
				gfp_flags, migratetype, alloc_flags);
		goto out;
	}

	/*
	 * We most definitely don't want callers attempting to
	 * allocate greater than order-1 page units with __GFP_NOFAIL.
	 */
	WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
	spin_lock_irqsave(&zone->lock, flags);

	do {
		page = NULL;
		if (alloc_flags & ALLOC_HARDER) {
			page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
			if (page)
				trace_mm_page_alloc_zone_locked(page, order, migratetype);
		}
		/* Regular path; __rmqueue() may fall back to other migrate types. */
		if (!page)
			page = __rmqueue(zone, order, migratetype, alloc_flags);
		/* Try again while check_new_pages() returns nonzero for the block. */
	} while (page && check_new_pages(page, order));
	spin_unlock(&zone->lock);
	if (!page)
		goto failed;
	__mod_zone_freepage_state(zone, -(1 << order),
				  get_pcppage_migratetype(page));

	__count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
	zone_statistics(preferred_zone, zone);	/* update zone statistics */
	local_irq_restore(flags);

out:
	/* Separate test+clear to avoid unnecessary atomics */
	if (test_bit(ZONE_BOOSTED_WATERMARK, &zone->flags)) {
		clear_bit(ZONE_BOOSTED_WATERMARK, &zone->flags);
		wakeup_kswapd(zone, 0, 0, zone_idx(zone));
	}

	VM_BUG_ON_PAGE(page && bad_range(zone, page), page);
	return page;

failed:
	local_irq_restore(flags);
	return NULL;
}

buddy system分配内存的源头，在于zone结构体的free_area域：

struct zone {/* Read-mostly fields */...struct per_cpu_pageset __percpu *pageset; //针对单个cpu的冷热页.../* free areas of different sizes */struct free_area    free_area[MAX_ORDER];...
}____cacheline_internodealigned_in_smp;

zone是一个非常复杂的结构体，负责管理内存分配，其中也包含了提供给buddy system的内存池，也就是free_area：

/*
 * Free-block bookkeeping for a single order: one list per migrate type,
 * plus a count of free blocks.
 */
struct free_area {
	/* one free list per migrate type */
	struct list_head	free_list[MIGRATE_TYPES];
	/* number of free blocks at this order */
	unsigned long		nr_free;
};

可以看出，每一个不同的order，都会对应一个free_area，然后同一个order，同一个free_area下，再有每一个不同的 MIGRATE_TYPES ，都会对应一个free_list，即一个链表，我们可以视为一个内存池，这个链表具体是指向struct page的lru域。

也就是说，同一个链表上的内存区域，都是order(即对应的size)相同，迁移类型也相同的。

zone中还维护了一个pageset 域，用于管理per cpu的冷热页：

/*
 * Per-CPU page lists. Defined before struct per_cpu_pageset, which embeds
 * it by value and therefore needs the complete type.
 */
struct per_cpu_pages {
	int count;		/* number of pages in the list */
	int high;		/* high watermark, emptying needed */
	int batch;		/* chunk size for buddy add/remove */

	/* Lists of pages, one per migrate type stored on the pcp-lists */
	struct list_head lists[MIGRATE_PCPTYPES];
};

/*
 * Per-CPU, per-zone state: the page lists above plus optional statistics
 * fields that exist only with CONFIG_NUMA / CONFIG_SMP.
 */
struct per_cpu_pageset {
	struct per_cpu_pages pcp;
#ifdef CONFIG_NUMA
	s8 expire;
	u16 vm_numa_stat_diff[NR_VM_NUMA_STAT_ITEMS];
#endif
#ifdef CONFIG_SMP
	s8 stat_threshold;
	s8 vm_stat_diff[NR_VM_ZONE_STAT_ITEMS];
#endif
};

所谓的冷热页，是指：
冷页表示该空闲页已经不在高速缓存中了(一般是指L2 Cache)，热页表示该空闲页仍然在高速缓存中。冷热页是针对于每CPU的，每个zone中，都会针对于所有的CPU初始化一个冷热页的per-cpu-pageset，即pcp。
可以类比用户态中的fastbin或者tcachebin，将处在高速缓存中的页单独拎出来，优先分配，从而提高了工作效率。
pageset中的冷热页链表元素数量是有限制的，由per_cpu_pages的high成员控制，毕竟如果热页太多，实际上最早加进来的页已经不热了。
当一次释放1个page的时候，会优先将这个page放到pcp链表中，这和fastbin也是类似的。如果释放超过一个page，则会使用正常的buddy算法。

此时回看rmqueue()函数，可以看出，在order为0，即申请一个页的时候，会调用针对pcp链表的申请，与此吻合。

接着就是针对order>=1的情况，调用__rmqueue_smallest()，将会在zone中寻找能够满足order要求的，存在的，最小的对应内存块。即如果order对应链表里的内存不足，则到order+1处继续寻找，如此往复，直到找到为止：

/** Go through the free lists for the given migratetype and remove* the smallest available page from the freelists*/
static __always_inline
struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,int migratetype)
{unsigned int current_order;struct free_area *area;struct page *page;/* Find a page of the appropriate size in the preferred list */for (current_order = order; current_order < MAX_ORDER; ++current_order) {area = &(zone->free_area[current_order]);page = list_first_entry_or_null(&area->free_list[migratetype],struct page, lru);if (!page)continue;list_del(&page->lru);rmv_page_order(page);area->nr_free--;expand(zone, page, order, current_order, area, migratetype);//将根据buddy算法 调整更新后的free_areaset_pcppage_migratetype(page, migratetype);return page;}return NULL;
}

如果rmqueue()中针对MIGRATE_HIGHATOMIC的__rmqueue_smallest()没有被调用，或者分配失败了，则会进入__rmqueue()函数。该函数会先以请求的migratetype再调用一次__rmqueue_smallest()；如果当前migratetype所对应的池子里面没有满足要求的，才会从别的迁移类型中偷取page，即一次fallback的分配，偷取成功后回到retry处重新尝试。

/*
 * Do the hard work of removing an element from the buddy allocator.
 * Call me with the zone->lock already held.
 *
 * Tries the requested migratetype first. If that yields nothing, a
 * MIGRATE_MOVABLE request tries __rmqueue_cma_fallback(); if there is still
 * no page and __rmqueue_fallback() reports success, the whole attempt is
 * repeated. The trace event fires once, with the final result.
 */
static __always_inline struct page *
__rmqueue(struct zone *zone, unsigned int order, int migratetype,
						unsigned int alloc_flags)
{
	struct page *page;

	for (;;) {
		page = __rmqueue_smallest(zone, order, migratetype);
		if (likely(page))
			break;

		if (migratetype == MIGRATE_MOVABLE)
			page = __rmqueue_cma_fallback(zone, order);
		if (page)
			break;

		/* Nothing left to steal from: give up with page == NULL. */
		if (!__rmqueue_fallback(zone, order, migratetype, alloc_flags))
			break;
	}

	trace_mm_page_alloc_zone_locked(page, order, migratetype);
	return page;
}

kernel: kmalloc相关推荐

linux kernel内存映射实例分析
作者:JHJ(jianghuijun211@gmail.com) 日期:2012/08/24 欢迎转载,请注明出处引子现在android智能手机市场异常火热,硬件升级非常迅猛,arm cortex ...
Linux Kernel Coding Style
2019独角兽企业重金招聘Python工程师标准>>> Linux kernel coding styleThis is a short document describing th ...
[内存管理] linux kernel内存映射实例分析
作者:JHJ(jianghuijun211@gmail.com ) 日期:2012/08/24 欢迎转载,请注明出处引子现在android智能手机市场异常火热,硬件升级非常迅猛,arm corte ...
0ctf 2017 kernel pwn knote write up
UAF due to using hlist_add_behind() without checking. There is a pair locker(mutex_lock) at delete_n ...
【kernel 中内存分配那点事】
首先呢作为车载bsp开发人员,写大量的内核代码是不现实的事情,多数都是修修改改,但是要有内核代码阅读浏览理解的能力,毕竟linux kernel 还是很nb 的,所有技术人员深入研究内核代码是必须的, ...
kmalloc/kfree,vmalloc/vfree函数用法和区别
kmalloc/kfree,vmalloc/vfree函数用法和区别 1.kmalloc 1>kmalloc内存分配和malloc相似,除非被阻塞否则他执行的速度非常快,而且不对获得空间清零. ...
Linux-3.14.12内存管理笔记【kmalloc与kfree实现】【转】
本文转载自:http://blog.chinaunix.net/uid-26859697-id-5573776.html kmalloc()是基于slab/slob/slub分配分配算法上实现的,不少 ...
【译】Writing a Simple Linux Kernel Module
掌握 Golden Ring-0 Linux为应用程序提供了强大而广泛的API,但有时这还不够. 与一块硬件交互或执行需要访问系统中特权信息的操作需要内核模块. Linux内核模块是一段编译的二进制代 ...
nand ubi -4 kernel和mtd
tiny6410 linux2.6.38 1.nand驱动 nand是作为平台设备,在板子文件mach-mini6410.c调用,系统启动时自动加载进内核 static struct platform ...

kernel: kmalloc

1 源码

2 buddy system相关

kernel: kmalloc相关推荐

最新文章

热门文章