（转）linux内存管理之伙伴系统（内存分配）

2024-04-07 11:16:20

一、Linux伙伴系统分配器

伙伴系统分配器大体上分为两类。__get_free_pages()类函数返回分配的第一个页面的线性地址；alloc_pages()类函数返回页面描述符地址。不管以哪种函数进行分配，最终会调用alloc_pages()进行分配页面。

为清楚了解其分配制度，先给个伙伴系统数据的存储框图

也就是说，每个order对应一个free_area结构，free_area再按不同的迁移类型，分别用链表来存储这些内存块。

二、主分配函数

下面我们来看这个函数(在UMA模式下)

[cpp] view plaincopyprint?

/*
 * alloc_pages - allocate pages of the given order.
 * Forwards to alloc_pages_node(), passing the node id reported by
 * numa_node_id() together with the caller's gfp_mask and order.
 */
#define alloc_pages(gfp_mask, order) \
alloc_pages_node(numa_node_id(), gfp_mask, order)

[cpp] view plaincopyprint?

/*
 * alloc_pages_node - allocate pages of the given order from node @nid.
 * @nid:      node to allocate from; a negative value means "unknown" and
 *            is replaced by the node returned by numa_node_id()
 * @gfp_mask: allocation flags, also used to pick the node's zonelist
 * @order:    allocation order
 *
 * Returns whatever __alloc_pages() returns for the selected zonelist.
 */
static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask,
						unsigned int order)
{
	/* Unknown node is current node */
	if (nid < 0)
		nid = numa_node_id();

	return __alloc_pages(gfp_mask, order, node_zonelist(nid, gfp_mask));
}

[cpp] view plaincopyprint?

/*
 * __alloc_pages - allocate pages without a nodemask restriction.
 *
 * Hands the three arguments unchanged to __alloc_pages_nodemask() and
 * passes NULL as the nodemask; the result is returned as-is.
 */
static inline struct page *
__alloc_pages(gfp_t gfp_mask, unsigned int order,
		struct zonelist *zonelist)
{
	struct page *page;

	page = __alloc_pages_nodemask(gfp_mask, order, zonelist, NULL);
	return page;
}

上层分配函数__alloc_pages_nodemask()

[cpp] view plaincopyprint?

/*
 * This is the 'heart' of the zoned buddy allocator.
 *
 * Top-level allocation routine.  It first tries get_page_from_freelist()
 * with the ALLOC_WMARK_LOW|ALLOC_CPUSET flags and, if that yields nothing,
 * falls back to __alloc_pages_slowpath().
 *
 * Returns the allocated page, or NULL when the request is rejected up
 * front or both attempts fail.
 */
struct page *
__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
			struct zonelist *zonelist, nodemask_t *nodemask)
{
	struct page *page;
	struct zone *preferred_zone;
	/* Flags used for the first (fast) allocation attempt */
	const int first_try_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET;
	/*
	 * Both values below are derived from the caller's flags before
	 * gfp_allowed_mask is applied to them.
	 */
	enum zone_type high_zoneidx = gfp_zone(gfp_mask);
	/* Convert GFP flags to their corresponding migrate type */
	int migratetype = allocflags_to_migratetype(gfp_mask);

	gfp_mask &= gfp_allowed_mask;

	/* Debugging aid */
	lockdep_trace_alloc(gfp_mask);

	/* With __GFP_WAIT set the caller may have to wait and reschedule */
	might_sleep_if(gfp_mask & __GFP_WAIT);

	/*
	 * Two early exits, evaluated in this order:
	 *  - should_fail_alloc_page() asks for the allocation to fail (the
	 *    original notes remark that the matching config macro is not
	 *    set in the build under discussion);
	 *  - check the zones suitable for the gfp_mask contain at least one
	 *    valid zone. It's possible to have an empty zonelist as a result
	 *    of GFP_THISNODE and a memoryless node.
	 */
	if (should_fail_alloc_page(gfp_mask, order) ||
	    unlikely(!zonelist->_zonerefs->zone))
		return NULL;

	/* The preferred zone is used for statistics later */
	first_zones_zonelist(zonelist, high_zoneidx, nodemask, &preferred_zone);
	if (!preferred_zone)
		return NULL;

	/*
	 * First allocation attempt: the normal path, served from the
	 * per-cpu page lists and the buddy free lists.
	 */
	page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
			zonelist, high_zoneidx, first_try_flags,
			preferred_zone, migratetype);

	/* Nothing found: take the slow path, which may wait and reclaim */
	if (unlikely(!page)) {
		page = __alloc_pages_slowpath(gfp_mask, order,
				zonelist, high_zoneidx, nodemask,
				preferred_zone, migratetype);
	}

	/* Debugging aid (tracepoint) */
	trace_mm_page_alloc(page, order, gfp_mask, migratetype);

	return page;
}

三、从pcp和伙伴系统中正常的分配内存空间

函数get_page_from_freelist()

[cpp] view plaincopyprint?

/*
 * get_page_from_freelist goes through the zonelist trying to allocate
 * a page.
 *
 * Every zone visited by for_each_zone_zonelist_nodemask() is checked
 * against the cpuset constraint (when ALLOC_CPUSET is set) and, unless
 * ALLOC_NO_WATERMARKS is set, against the watermark selected by
 * @alloc_flags.  The first zone that passes is handed to
 * buffered_rmqueue().
 *
 * Returns the allocated page, or NULL if no zone satisfied the request.
 */
static struct page *
get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
		struct zonelist *zonelist, int high_zoneidx, int alloc_flags,
		struct zone *preferred_zone, int migratetype)
{
	struct zoneref *z;
	struct page *page = NULL;
	int classzone_idx;
	struct zone *zone;
	nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
	int zlc_active = 0;		/* set if using zonelist_cache */
	int did_zlc_setup = 0;		/* just call zlc_setup() one time */

	/* Index of the preferred zone, passed to the watermark checks */
	classzone_idx = zone_idx(preferred_zone);
zonelist_scan:
	/*
	 * Scan zonelist, looking for a zone with enough free.
	 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
	 */
	for_each_zone_zonelist_nodemask(zone, z, zonelist,
						high_zoneidx, nodemask) {
		/*
		 * Zonelist-cache shortcut.  Per the original notes this
		 * condition never holds on a UMA build.
		 */
		if (NUMA_BUILD && zlc_active &&
			!zlc_zone_worth_trying(zonelist, z, allowednodes))
				continue;
		if ((alloc_flags & ALLOC_CPUSET) &&
			!cpuset_zone_allowed_softwall(zone, gfp_mask))
				goto try_next_zone;

		/*
		 * Compile-time check that ALLOC_NO_WATERMARKS is not below
		 * NR_WMARK -- presumably so the flag bit can never be taken
		 * for a watermark index in the lookup below.
		 */
		BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);

		/* The watermarks have to be honoured */
		if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
			unsigned long mark;
			int ret;

			/* Pick the watermark selected by the allocation flags */
			mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];

			/* Watermark is fine: allocate from this zone */
			if (zone_watermark_ok(zone, order, mark,
				    classzone_idx, alloc_flags))
				goto try_this_zone;

			/* Below the watermark and zone reclaim is disabled */
			if (zone_reclaim_mode == 0)
				goto this_zone_full;

			/*
			 * Per the original notes, on a UMA build
			 * zone_reclaim() simply returns 0.
			 */
			ret = zone_reclaim(zone, gfp_mask, order);
			switch (ret) {
			case ZONE_RECLAIM_NOSCAN:
				/* did not scan */
				goto try_next_zone;
			case ZONE_RECLAIM_FULL:
				/* scanned but unreclaimable */
				goto this_zone_full;
			default:
				/* did we reclaim enough */
				if (!zone_watermark_ok(zone, order, mark,
						classzone_idx, alloc_flags))
					goto this_zone_full;
			}
		}

try_this_zone:
		/*
		 * This zone is acceptable: buffered_rmqueue() tries the
		 * per-cpu list first and the buddy free lists otherwise.
		 */
		page = buffered_rmqueue(preferred_zone, zone, order,
						gfp_mask, migratetype);
		if (page)
			break;
this_zone_full:
		/* NUMA_BUILD is 0 on UMA according to the original notes */
		if (NUMA_BUILD)
			zlc_mark_zone_full(zonelist, z);
try_next_zone:
		if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) {
			/*
			 * we do zlc_setup after the first zone is tried but only
			 * if there are multiple nodes make it worthwhile
			 */
			allowednodes = zlc_setup(zonelist, alloc_flags);
			zlc_active = 1;
			did_zlc_setup = 1;
		}
	}

	if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) {
		/* Disable zlc cache for second zonelist scan */
		zlc_active = 0;
		goto zonelist_scan;
	}
	return page;
}

主分配函数

[cpp] view plaincopyprint?

/*
 * Really, prep_compound_page() should be called from __rmqueue_bulk(). But
 * we cheat by calling it from here, in the order > 0 path. Saves a branch
 * or two.
 *
 * Order-0 requests are served from the per-cpu page list of @zone, which
 * is refilled from the buddy lists through rmqueue_bulk() when it is
 * empty.  Any other order goes straight to __rmqueue() under zone->lock.
 *
 * Returns the page, or NULL when nothing could be allocated.  If
 * prep_new_page() rejects the page, the whole allocation is retried.
 */
static inline
struct page *buffered_rmqueue(struct zone *preferred_zone,
			struct zone *zone, int order, gfp_t gfp_flags,
			int migratetype)
{
	unsigned long flags;
	struct page *page;
	/* Non-zero when the caller passed __GFP_COLD */
	int cold = !!(gfp_flags & __GFP_COLD);
	int cpu;

again:
	cpu  = get_cpu();
	if (likely(order == 0)) {
		/* Single page: use the per-cpu page lists (pcp) */
		struct per_cpu_pages *pcp;
		struct list_head *list;

		/* Locate this cpu's pcp for the zone and the list of our type */
		pcp = &zone_pcp(zone, cpu)->pcp;
		list = &pcp->lists[migratetype];

		/*
		 * Interrupts must be off here: per the original notes, memory
		 * reclaim may send IPIs forcing every cpu to drain its pcp
		 * list, and interrupt handlers may allocate single pages too.
		 */
		local_irq_save(flags);
		if (list_empty(list)) {
			/*
			 * The pcp list is empty: refill it with up to
			 * pcp->batch pages taken from the buddy lists.
			 */
			pcp->count += rmqueue_bulk(zone, 0,
					pcp->batch, list,
					migratetype, cold);
			/* Still empty: the allocation failed */
			if (unlikely(list_empty(list)))
				goto failed;
		}

		/*
		 * Cold requests take the last entry of the list.  Otherwise
		 * take the first entry, which per the original notes is the
		 * page most recently freed to the pcp list and therefore the
		 * most likely to still be hot in the hardware cache.
		 */
		if (cold)
			page = list_entry(list->prev, struct page, lru);
		else
			page = list_entry(list->next, struct page, lru);

		/* Unlink from the pcp list and account for it */
		list_del(&page->lru);
		pcp->count--;
	} else {
		/* order != 0: bypass the pcp lists and go to the buddy lists */
		if (unlikely(gfp_flags & __GFP_NOFAIL)) {
			/*
			 * __GFP_NOFAIL is not to be used in new code.
			 *
			 * All __GFP_NOFAIL callers should be fixed so that they
			 * properly detect and handle allocation failures.
			 *
			 * We most definitely don't want callers attempting to
			 * allocate greater than order-1 page units with
			 * __GFP_NOFAIL.
			 */
			WARN_ON_ONCE(order > 1);
		}
		/* Disable interrupts and take the zone lock */
		spin_lock_irqsave(&zone->lock, flags);
		/* Take a block of the requested order and type */
		page = __rmqueue(zone, order, migratetype);
		/*
		 * Account 1 << order pages as no longer free.
		 * NOTE(review): this runs before the NULL check below, so the
		 * counter is decremented even when __rmqueue() returned NULL
		 * -- confirm whether that is intended.
		 */
		__mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order));
		/*
		 * Only the spinlock is dropped here; interrupts stay off
		 * until the statistics below have been updated.
		 */
		spin_unlock(&zone->lock);
		if (!page)
			goto failed;
	}

	/* Event counters / statistics */
	__count_zone_vm_events(PGALLOC, zone, 1 << order);
	zone_statistics(preferred_zone, zone);
	local_irq_restore(flags);
	put_cpu();

	VM_BUG_ON(bad_range(zone, page));
	/*
	 * Sanity-check and prepare the page.  A non-zero return means the
	 * page is unusable, so start over and try to get another one.
	 */
	if (prep_new_page(page, order, gfp_flags))
		goto again;
	return page;

failed:
	local_irq_restore(flags);
	put_cpu();
	return NULL;
}

3.1 pcp缓存补充

rmqueue_bulk()函数用于从伙伴系统中获得batch个页面来补充pcp缓存，batch为一次补充的页面数。

[cpp] view plaincopyprint?

/*
* Obtain a specified number of elements from the buddy allocator, all under
* a single hold of the lock, for efficiency. Add them to the supplied list.
* Returns the number of new pages which were placed at *list.
*/
/*该函数返回的是1<
处理中调用，其他地方没看到，order为0
也就是说返回的是页面数，加入的链表为
对应调用pcp的链表*/
static int rmqueue_bulk(struct zone *zone, unsigned int order,
unsigned long count, struct list_head *list,
int migratetype, int cold)
{
int i;
spin_lock(&zone->lock);/* 上层函数已经关了中断，这里需要操作管理区，获取管理区的自旋锁 */
for (i = 0; i /* 重复指定的次数，从伙伴系统中分配页面*/
/* 从伙伴系统中取出页面 */
struct page *page = __rmqueue(zone, order, migratetype);
if (unlikely(page == NULL))/*分配失败*/
break;
/*
* Split buddy pages returned by expand() are received here
* in physical page order. The page is added to the callers and
* list and the list head then moves forward. From the callers
* perspective, the linked list is ordered by page number in
* some conditions. This is useful for IO devices that can
* merge IO requests if the physical pages are ordered
* properly.
*/
if (likely(cold == 0))/*根据调用者的要求，将页面放到每CPU缓存链表的头部或者尾部*/
list_add(&page->lru, list);
else
list_add_tail(&page->lru, list);
set_page_private(page, migratetype);/*设置private属性为页面的迁移类型*/
list = &page->lru;
}
/*递减管理区的空闲页面计数*/
__mod_zone_page_state(zone, NR_FREE_PAGES, -(i <
spin_unlock(&zone->lock);/*释放管理区的子璇锁*/
return i;
}

3.2 从伙伴系统中取出页面

__rmqueue()函数

[cpp] view plaincopyprint?

/*
 * Do the hard work of removing an element from the buddy allocator.
 * Call me with the zone->lock already held.
 *
 * Three sources are tried in turn: the free lists of the requested
 * migrate type, then the fallback types, and finally MIGRATE_RESERVE.
 * Returns the page, or NULL if all of them came up empty.
 */
static struct page *__rmqueue(struct zone *zone, unsigned int order,
						int migratetype)
{
	struct page *page;

	for (;;) {
		/*
		 * Preferred path: smallest suitable block of the current
		 * migrate type, searching from @order upwards.
		 */
		page = __rmqueue_smallest(zone, order, migratetype);

		/*
		 * Fall back only when nothing was found and we are not
		 * already drawing from the reserve.  The reserve is the last
		 * resort; if it is empty too, the zone has nothing left.
		 */
		if (unlikely(!page) && migratetype != MIGRATE_RESERVE) {
			/* Search the fallback types, largest order first */
			page = __rmqueue_fallback(zone, order, migratetype);

			/*
			 * Use MIGRATE_RESERVE rather than fail an allocation.
			 * Looping back keeps a single call site for the
			 * inline __rmqueue_smallest().
			 */
			if (!page) {
				migratetype = MIGRATE_RESERVE;
				continue;
			}
		}
		break;
	}

	/* Debugging aid (tracepoint) */
	trace_mm_page_alloc_zone_locked(page, order, migratetype);
	return page;
}

3.2.1 从指定的迁移类型链表中分配页面

从指定order开始从小到大遍历，优先从指定的迁移类型链表中分配页面，对应的函数为__rmqueue_smallest(zone, order, migratetype)。

[cpp] view plaincopyprint?

/*
* Go through the free lists for the given migratetype and remove
* the smallest available page from the freelists
*/
/*从给定的order开始，从小到大遍历；
找到后返回页面基址，合并分割后的空间*/
static inline
struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
int migratetype)
{
unsigned int current_order;
struct free_area * area;
struct page *page;
/* Find a page of the appropriate size in the preferred list */
for (current_order = order; current_order
area = &(zone->free_area[current_order]);/*得到指定order的area*/
/*如果area指定类型的伙伴系统链表为空*/
if (list_empty(&area->free_list[migratetype]))
continue;/*查找下一个order*/
/*对应的链表不空，得到链表中数据*/
page = list_entry(area->free_list[migratetype].next,
struct page, lru);
list_del(&page->lru);/*从伙伴系统中删除；*/
rmv_page_order(page);/*移除page中order的变量*/
area->nr_free--;/*空闲块数减一*/
/*拆分、合并*/
expand(zone, page, order, current_order, area, migratetype);
return page;
}
return NULL;
}

伙伴系统内存块拆分和合并

看一个辅助函数expand()，它用于把从伙伴系统中取出的大内存块逐级拆分，并把拆分后多余的部分重新放回伙伴系统的空闲链表中

[cpp] view plaincopyprint?

/*
* The order of subdivision here is critical for the IO subsystem.
* Please do not alter this order without good reasons and regression
* testing. Specifically, as large blocks of memory are subdivided,
* the order in which smaller blocks are delivered depends on the order
* they're subdivided in this function. This is the primary factor
* influencing the order in which pages are delivered to the IO
* subsystem according to empirical testing, and this is also justified
* by considering the behavior of a buddy system containing a single
* large block of memory acted on by a series of small allocations.
* This behavior is a critical factor in sglist merging's success.
*
* -- wli
*/
/*此函数主要用于下面这种情况:
分配函数从high中分割出去了low大小的内存；
然后要将high留下的内存块合并放到伙伴系统中；*/
static inline void expand(struct zone *zone, struct page *page,
int low, int high, struct free_area *area,
int migratetype)
{
unsigned long size = 1 <
while (high > low) {/*因为去掉了low的大小，所以最后肯定剩下的
是low的大小(2的指数运算)*/
area--;/*减一到order减一的area*/
high--;/*order减一*/
size >>= 1;/*大小除以2*/
VM_BUG_ON(bad_range(zone, &page[size]));
/*加到指定的伙伴系统中*/
list_add(&page[size].lru, &area->free_list[migratetype]);
area->nr_free++;/*空闲块加一*/
set_page_order(&page[size], high);/*设置相关order*/
}
}

3.2.2 从备用链表中分配页面

[cpp] view plaincopyprint?

/* Remove an element from the buddy allocator from the fallback list */
static inline struct page *
__rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
{
struct free_area * area;
int current_order;
struct page *page;
int migratetype, i;
/* Find the largest possible block of pages in the other list */
/* 从最高阶搜索,这样可以尽量的将其他迁移列表中的大块分割,避免形成过多的碎片 */
for (current_order = MAX_ORDER-1; current_order >= order;
--current_order) {
for (i = 0; i
/*回调到下一个migratetype*/
migratetype = fallbacks[start_migratetype][i];
/* MIGRATE_RESERVE handled later if necessary */
/* 本函数不处理MIGRATE_RESERVE类型的迁移链表,如果本函数返回NULL,
则上层函数直接从MIGRATE_RESERVE中分配 */
if (migratetype == MIGRATE_RESERVE)
continue;/*访问下一个类型*/
area = &(zone->free_area[current_order]);
/*如果指定order和类型的链表为空*/
if (list_empty(&area->free_list[migratetype]))
continue;/*访问下一个类型*/
/*得到指定类型和order的页面基址*/
page = list_entry(area->free_list[migratetype].next,
struct page, lru);
area->nr_free--;/*空闲块数减一*/
/*
* If breaking a large block of pages, move all free
* pages to the preferred allocation list. If falling
* back for a reclaimable kernel allocation, be more
* agressive about taking ownership of free pages
*/
if (unlikely(current_order >= (pageblock_order >> 1)) ||/* 要分割的页面是一个大页面,则将整个页面全部迁移到当前迁移类型的链表中,
这样可以避免过多的碎片 */
start_migratetype == MIGRATE_RECLAIMABLE ||/* 目前分配的是可回收页面,这类页面有突发的特点,将页面全部迁移到可回收链表中,
可以避免将其他迁移链表分割成太多的碎片 */
page_group_by_mobility_disabled) {/* 指定了迁移策略,总是将被分割的页面迁移 */
unsigned long pages;
/*移动到先前类型的伙伴系统中*/
pages = move_freepages_block(zone, page,
start_migratetype);
/* Claim the whole block if over half of it is free */
/* pages是移动的页面数,如果可移动的页面数量较多,
则将整个大内存块的迁移类型修改 */
if (pages >= (1 <
page_group_by_mobility_disabled)
/*设置页面标示*/
set_pageblock_migratetype(page,
start_migratetype);
migratetype = start_migratetype;
}
/* Remove the page from the freelists */
list_del(&page->lru);
rmv_page_order(page);
/* Take ownership for orders >= pageblock_order */
if (current_order >= pageblock_order)//大于pageblock_order的部分设置相应标示
/*这个不太可能，因为pageblock_order为10*/
change_pageblock_range(page, current_order,
start_migratetype);
/*拆分和合并*/
expand(zone, page, order, current_order, area, migratetype);
trace_mm_page_alloc_extfrag(page, order, current_order,
start_migratetype, migratetype);
return page;
}
}
return NULL;
}

备用链表

[cpp] view plaincopyprint?

/*
 * This array describes the order lists are fallen back to when
 * the free lists for the desirable migrate type are depleted
 */
/*
 * Row = the migrate type whose free lists are empty; the entries of the
 * row name the types to fall back to, in the order they are tried.
 */
static int fallbacks[MIGRATE_TYPES][MIGRATE_TYPES-1] = {
[MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },
[MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },
[MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
[MIGRATE_RESERVE] = { MIGRATE_RESERVE, MIGRATE_RESERVE, MIGRATE_RESERVE }, /* Never used */
};

移动到指定类型的伙伴系统中

[cpp] view plaincopyprint?

/*
 * move_freepages_block - move the free pages of the pageblock containing
 * @page to the free lists of @migratetype.
 *
 * This is move_freepages() applied to a range aligned on
 * pageblock_nr_pages.  Returns the value of move_freepages(), or 0
 * without moving anything when the block's last pfn lies at or beyond the
 * end of @zone.
 */
static int move_freepages_block(struct zone *zone, struct page *page,
				int migratetype)
{
	unsigned long start_pfn, end_pfn;
	struct page *start_page, *end_page;

	/*
	 * Round the pfn down to a multiple of pageblock_nr_pages (the mask
	 * assumes a power of two) and derive both ends of the block.
	 */
	start_pfn = page_to_pfn(page);
	start_pfn = start_pfn & ~(pageblock_nr_pages-1);
	start_page = pfn_to_page(start_pfn);
	end_page = start_page + pageblock_nr_pages - 1;
	end_pfn = start_pfn + pageblock_nr_pages - 1;

	/* Do not cross zone boundaries */
	if (start_pfn < zone->zone_start_pfn)
		start_page = page;
	/* Upper boundary check */
	if (end_pfn >= zone->zone_start_pfn + zone->spanned_pages)
		return 0;

	return move_freepages(zone, start_page, end_page, migratetype);
}

[cpp] view plaincopyprint?

/*
 * Move the free pages in a range to the free lists of the requested type.
 * Note that start_page and end_pages are not aligned on a pageblock
 * boundary. If alignment is required, use move_freepages_block()
 *
 * Every free buddy block found in [@start_page, @end_page] is unlinked
 * from its current free list and added to the list of the same order for
 * @migratetype.  Returns the number of pages moved.
 */
static int move_freepages(struct zone *zone,
			  struct page *start_page, struct page *end_page,
			  int migratetype)
{
	struct page *page;
	unsigned long order;
	int pages_moved = 0;

#ifndef CONFIG_HOLES_IN_ZONE
	/*
	 * page_zone is not safe to call in this context when
	 * CONFIG_HOLES_IN_ZONE is set. This bug check is probably redundant
	 * anyway as we check zone boundaries in move_freepages_block().
	 * Remove at a later date when no bug reports exist related to
	 * grouping pages by mobility
	 */
	BUG_ON(page_zone(start_page) != page_zone(end_page));
#endif

	for (page = start_page; page <= end_page;) {
		/* Make sure we are not inadvertently changing nodes */
		VM_BUG_ON(page_to_nid(page) != zone_to_nid(zone));

		/* Skip pfns that are not valid */
		if (!pfn_valid_within(page_to_pfn(page))) {
			page++;
			continue;
		}

		/* Skip pages that are not the head of a free buddy block */
		if (!PageBuddy(page)) {
			page++;
			continue;
		}

		order = page_order(page);
		/*
		 * Unlink the block from its current free list.  This moves
		 * the whole buddy block headed by this page, not one page.
		 */
		list_del(&page->lru);
		/* Add it to the list of the same order for the new type */
		list_add(&page->lru,
			&zone->free_area[order].free_list[migratetype]);
		page += 1 << order;		/* step past the whole block */
		pages_moved += 1 << order;	/* count its pages as moved */
	}

	return pages_moved;
}

四、慢速分配，允许等待和回收

[cpp] view plaincopyprint?

/*
 * __alloc_pages_slowpath - slow allocation path, entered by
 * __alloc_pages_nodemask() when its first get_page_from_freelist()
 * attempt fails.
 *
 * In order it: wakes kswapd, retries the free lists with the flags from
 * gfp_to_alloc_flags(), allocates ignoring watermarks where the context
 * allows it, and then -- only if the caller may wait -- tries direct
 * reclaim and finally the OOM killer, looping while retrying still makes
 * sense.
 *
 * Returns the allocated page, or NULL on failure.  A failure prints a
 * rate-limited warning unless __GFP_NOWARN is set.
 */
static inline struct page *
__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
	struct zonelist *zonelist, enum zone_type high_zoneidx,
	nodemask_t *nodemask, struct zone *preferred_zone,
	int migratetype)
{
	const gfp_t wait = gfp_mask & __GFP_WAIT;
	struct page *page = NULL;
	int alloc_flags;
	unsigned long pages_reclaimed = 0;
	unsigned long did_some_progress;
	struct task_struct *p = current;

	/*
	 * In the slowpath, we sanity check order to avoid ever trying to
	 * reclaim >= MAX_ORDER areas which will never succeed. Callers may
	 * be using allocators in order of preference for an area that is
	 * too large.
	 */
	if (order >= MAX_ORDER) {
		WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN));
		return NULL;
	}

	/*
	 * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and
	 * __GFP_NOWARN set) should not cause reclaim since the subsystem
	 * (f.e. slab) using GFP_THISNODE may choose to trigger reclaim
	 * using a larger set of nodes after it has established that the
	 * allowed per node queues are empty and that nodes are
	 * over allocated.
	 *
	 * In other words: with GFP_THISNODE no reclaim is attempted here;
	 * the caller is expected to retry with other flags after a failure.
	 */
	if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
		goto nopage;

restart:
	/* Kick the kswapd threads so that background reclaim gets going */
	wake_all_kswapd(order, zonelist, high_zoneidx);

	/*
	 * OK, we're below the kswapd watermark and have kicked background
	 * reclaim. Now things get more complex, so set up alloc_flags according
	 * to how we want to proceed.
	 */
	alloc_flags = gfp_to_alloc_flags(gfp_mask);

	/*
	 * Try the free lists once more before reclaiming anything.  Per the
	 * original notes these flags select a lower watermark than the fast
	 * path did.  ALLOC_NO_WATERMARKS is masked out for this attempt
	 * whether or not it was set.
	 */
	/* This is the last chance, in general, before the goto nopage. */
	page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
			high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS,
			preferred_zone, migratetype);
	if (page)
		goto got_pg;

rebalance:
	/*
	 * Allocate without watermarks if the context allows.  Per the
	 * original notes this covers e.g. the reclaim path itself and
	 * tasks that are being killed.
	 */
	if (alloc_flags & ALLOC_NO_WATERMARKS) {
		page = __alloc_pages_high_priority(gfp_mask, order,
				zonelist, high_zoneidx, nodemask,
				preferred_zone, migratetype);
		/* Got memory while ignoring the watermarks */
		if (page)
			goto got_pg;
	}

	/* Atomic allocations - we can't balance anything */
	if (!wait)
		goto nopage;

	/*
	 * Avoid recursion of direct reclaim: a task that is itself doing
	 * reclaim (PF_MEMALLOC) must not enter reclaim again.
	 */
	if (p->flags & PF_MEMALLOC)
		goto nopage;

	/*
	 * Avoid allocations with no watermarks from looping endlessly.
	 * A task being killed (TIF_MEMDIE) gets NULL here instead of
	 * looping -- unless __GFP_NOFAIL is set, in which case we keep
	 * trying.
	 */
	if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))
		goto nopage;

	/* Try direct reclaim and then allocating */
	page = __alloc_pages_direct_reclaim(gfp_mask, order,
					zonelist, high_zoneidx,
					nodemask,
					alloc_flags, preferred_zone,
					migratetype, &did_some_progress);
	/* Reclaim freed enough to satisfy the request */
	if (page)
		goto got_pg;

	/*
	 * If we failed to make any progress reclaiming, then we are
	 * running out of options and have to consider going OOM
	 */
	if (!did_some_progress) {
		/*
		 * Only when filesystem operations are allowed (__GFP_FS)
		 * and retrying is allowed.  The original notes guess that
		 * __GFP_FS is needed because the OOM path may kill tasks or
		 * panic, which can involve file operations -- unverified.
		 */
		if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
			/* The OOM killer is disabled: report failure */
			if (oom_killer_disabled)
				goto nopage;

			/* Try again after possibly killing another task */
			page = __alloc_pages_may_oom(gfp_mask, order,
					zonelist, high_zoneidx,
					nodemask, preferred_zone,
					migratetype);
			if (page)
				goto got_pg;

			/*
			 * The OOM killer does not trigger for high-order
			 * ~__GFP_NOFAIL allocations so if no progress is being
			 * made, there are no other options and retrying is
			 * unlikely to help.
			 */
			if (order > PAGE_ALLOC_COSTLY_ORDER &&
						!(gfp_mask & __GFP_NOFAIL))
				goto nopage;

			goto restart;
		}
	}

	/*
	 * Check if we should retry the allocation: reclaim made some
	 * progress, so decide whether another round is worthwhile.
	 */
	pages_reclaimed += did_some_progress;
	if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) {
		/* Wait for some write requests to complete then retry */
		congestion_wait(BLK_RW_ASYNC, HZ/50);
		goto rebalance;
	}

nopage:
	/* The allocation failed: print a rate-limited warning */
	if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) {
		printk(KERN_WARNING "%s: page allocation failure."
			" order:%d, mode:0x%x\n",
			p->comm, order, gfp_mask);
		dump_stack();
		show_mem();
	}
	return page;
got_pg:
	/* Success: notify kmemcheck when it is enabled */
	if (kmemcheck_enabled)
		kmemcheck_pagealloc_alloc(page, order, gfp_mask);
	return page;
}

总结：Linux伙伴系统主要分配流程为

正常分配(或叫快速分配)流程：

1，如果分配的是单个页面，考虑从per CPU缓存中分配空间，如果缓存中没有页面，从伙伴系统中提取页面做补充。

2，分配多个页面时，从指定类型中分配，如果指定类型中没有足够的页面，从备用类型链表中分配。最后会试探保留类型链表。

慢速(允许等待和页面回收)分配：

3，当上面两种分配方案都不能满足要求时，考虑在进行页面回收、杀死进程等操作后再试。

（转）linux内存管理之伙伴系统（内存分配）相关推荐

linux进程管理内存管理,Linux专业知识四:Linux系统进程管理及查看内存
本文主讲Linux专业知识之Linux系统进程管理及查看内存的情况,以Redhat RHEL7操作系统为例. 一.进程程序与进程:程序是静态的(文件),进程是动态的(运行的程序). 进程和线程:一个 ...
【Linux 内核内存管理】优化内存屏障 ③ ( 编译器屏障 | 禁止 / 开启内核抢占与方法保护临界区 | preempt_disable 禁止内核抢占源码 | 开启内核抢占源码 )
文章目录一.禁止 / 开启内核抢占与方法保护临界区二.编译器优化屏障三.preempt_disable 禁止内核抢占源码四.preempt_enable 开启内核抢占源码一.禁止 / ...
操作系统的内存管理机制（连续分配管理、页式、段式、段页式、快表、二级页表）
来源:https://www.bilibili.com/video/BV1YE411D7nH 操作系统的内存管理机制(连续分配管理.页式.段式.段页式.快表.二级页表) 内存被分为系统区和用户区,系统 ...
属性与内存管理(属性与内存管理都是相互关联的)
<span style="font-size:18px;"> 属性与内存管理(属性与内存管理都是相互关联的)第一部分一,属性:属性是OC2.0之后出来的新语法,用来取代 ...
JVM自动内存管理机制——Java内存区域（下）
一.虚拟机参数配置在上一篇<Java自动内存管理机制--Java内存区域(上)>中介绍了有关的基础知识,这一篇主要是通过一些示例来了解有关虚拟机参数的配置. 1.Java堆参数设置 a) ...
95-290-050-源码-内存管理-堆外内存与堆内内存概述
2.概述 Flink的内存管理器管理着用于排序.散列和缓存所需的内存.内存以相等大小的(Segments)表示,称为内存页.操作器通过请求多个内存页来分配内存.在Flink中,内存又分为堆内存和非 ...
Java内存管理：Java内存区域 JVM运行时数据区
Java内存管理:Java内存区域 JVM运行时数据区在前面的一些文章了解到javac编译的大体过程.Class文件结构.以及JVM字节码指令. 下面我们详细了解Java内存区域:先说明JVM规范定 ...
C++：内存管理：C++内存管理详解
C++语言内存管理是指:对系统的分配.创建.使用这一系列操作.在内存管理中,由于是操作系统内存,使用不当会造成很麻烦的后果.本文将从系统内存的分配.创建出发,并且结合例子来说明内存管理不当会造成的结果 ...
[JAVA]第二篇（内存管理，HashMap内存泄漏解决办法）
网上看到一个关于内存泄漏处理的例子,原网址:http://www.jb51.net/article/49428.htm,下面笔者将具体分析下这篇文章中的代码,并从中学习JAVA的内存管理. (Begi ...
Unity 之 Mono内存管理与泄漏 — 内存是手游的硬伤(转)
WeTest导读内存是游戏的硬伤,如果没有做好内存的管理问题,游戏极有可能会出现卡顿,闪退等影响用户体验的现象.本文介绍了在腾讯游戏在Unity游戏开发过程中常见的Mono内存管理问题,并介绍了一系 ...

最新文章

热门文章