现在看看内存规整迁移再分配函数,__alloc_pages_direct_compact,函数在mm/page_alloc.c文件中:

/* Try memory compaction for high-order allocations before reclaim */
static struct page *
__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
		unsigned int alloc_flags, const struct alloc_context *ac,
		enum compact_priority prio, enum compact_result *compact_result)
{
	struct page *page;
	unsigned int noreclaim_flag;

	if (!order)	/* order-0 requests need no compaction */
		return NULL;

	noreclaim_flag = memalloc_noreclaim_save();
	/* Run direct compaction to try to satisfy the high-order request */
	*compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac,
									prio);
	memalloc_noreclaim_restore(noreclaim_flag);

	if (*compact_result <= COMPACT_INACTIVE)
		return NULL;

	/*
	 * At least in one zone compaction wasn't deferred or skipped, so let's
	 * count a compaction stall
	 */
	count_vm_event(COMPACTSTALL);

	/* Retry the page allocation now that compaction has run */
	page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);

	if (page) {
		struct zone *zone = page_zone(page);

		zone->compact_blockskip_flush = false;
		compaction_defer_reset(zone, order, true);
		count_vm_event(COMPACTSUCCESS);
		return page;
	}

	/*
	 * It's bad if compaction run occurs and fails. The most likely reason
	 * is that pages exist, but not enough to satisfy watermarks.
	 */
	count_vm_event(COMPACTFAIL);

	cond_resched();

	return NULL;
}

__alloc_pages_direct_compact首先执行规整操作,然后进行页面分配。页面分配就好说了,就是我们很熟悉的get_page_from_freelist函数,规整操作是调用try_to_compact_pages完成的。linux系统进行内存规整有两种方式,一种是上面没有详细讲解的异步规整,通过唤醒kcompactd内核线程在后台规整页面,另一种就是try_to_compact_pages函数实现的同步规整(直接规整)。try_to_compact_pages函数位于mm/compaction.c文件中:

/**
 * try_to_compact_pages - Direct compact to satisfy a high-order allocation
 * @gfp_mask: The GFP mask of the current allocation
 * @order: The order of the current allocation
 * @alloc_flags: The allocation flags of the current allocation
 * @ac: The context of current allocation
 * @prio: Determines how hard direct compaction should try to succeed
 *
 * This is the main entry point for direct page compaction.
 */
enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
		unsigned int alloc_flags, const struct alloc_context *ac,
		enum compact_priority prio)
{
	int may_perform_io = gfp_mask & __GFP_IO;
	struct zoneref *z;
	struct zone *zone;
	enum compact_result rc = COMPACT_SKIPPED;

	/*
	 * Check if the GFP flags allow compaction - GFP_NOIO is really
	 * tricky context because the migration might require IO
	 */
	if (!may_perform_io)
		return COMPACT_SKIPPED;

	trace_mm_compaction_try_to_compact_pages(order, gfp_mask, prio);

	/* Compact each zone in the zonelist allowed by the nodemask */
	for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
								ac->nodemask) {
		enum compact_result status;

		/*
		 * Unless this is the lowest (hardest-trying) priority, skip
		 * zones where compaction has recently been deferred because
		 * it kept failing.
		 */
		if (prio > MIN_COMPACT_PRIORITY
					&& compaction_deferred(zone, order)) {
			rc = max_t(enum compact_result, COMPACT_DEFERRED, rc);
			continue;
		}

		/* Compact this particular zone */
		status = compact_zone_order(zone, order, gfp_mask, prio,
					alloc_flags, ac_classzone_idx(ac));
		rc = max(status, rc);

		/* The allocation should succeed, stop compacting */
		if (status == COMPACT_SUCCESS) {
			/*
			 * We think the allocation will succeed in this zone,
			 * but it is not certain, hence the false. The caller
			 * will repeat this with true if allocation indeed
			 * succeeds in this zone.
			 */
			compaction_defer_reset(zone, order, false);

			break;
		}

		if (prio != COMPACT_PRIO_ASYNC && (status == COMPACT_COMPLETE ||
					status == COMPACT_PARTIAL_SKIPPED))
			/*
			 * We think that allocation won't succeed in this zone
			 * so we defer compaction there. If it ends up
			 * succeeding after all, it will be reset.
			 */
			defer_compaction(zone, order);

		/*
		 * We might have stopped compacting due to need_resched() in
		 * async compaction, or due to a fatal signal detected. In that
		 * case do not try further zones
		 */
		if ((prio == COMPACT_PRIO_ASYNC && need_resched())
					|| fatal_signal_pending(current))
			break;
	}

	return rc;
}

try_to_compact_pages会遍历每一个zone,然后针对每一个zone调用compact_zone_order进行内存规整,compact_zone_order:

static enum compact_result compact_zone_order(struct zone *zone, int order,
		gfp_t gfp_mask, enum compact_priority prio,
		unsigned int alloc_flags, int classzone_idx)
{
	enum compact_result ret;
	struct compact_control cc = {
		.nr_freepages = 0,
		.nr_migratepages = 0,
		.total_migrate_scanned = 0,
		.total_free_scanned = 0,
		.order = order,			/* allocation order we compact for */
		.gfp_mask = gfp_mask,		/* GFP mask of the allocation */
		.zone = zone,
		/* compaction mode: async, or light sync for higher priorities */
		.mode = (prio == COMPACT_PRIO_ASYNC) ?
					MIGRATE_ASYNC : MIGRATE_SYNC_LIGHT,
		.alloc_flags = alloc_flags,
		.classzone_idx = classzone_idx,
		.direct_compaction = true,
		.whole_zone = (prio == MIN_COMPACT_PRIORITY),
		.ignore_skip_hint = (prio == MIN_COMPACT_PRIORITY),
		.ignore_block_suitable = (prio == MIN_COMPACT_PRIORITY)
	};
	INIT_LIST_HEAD(&cc.freepages);		/* free pages used as migration targets */
	INIT_LIST_HEAD(&cc.migratepages);	/* pages selected for migration */

	ret = compact_zone(zone, &cc);		/* run compaction on this zone */

	VM_BUG_ON(!list_empty(&cc.freepages));
	VM_BUG_ON(!list_empty(&cc.migratepages));

	return ret;
}

struct compact_control数据结构记录了被迁移的页面,以及规整过程中迁移到的页面列表,compact_zone_order主要将参数填入struct compact_control结构体,然后和zone一起作为参数传递给compact_zone,compact_zone:

static enum compact_result compact_zone(struct zone *zone, struct compact_control *cc)
{
	enum compact_result ret;
	unsigned long start_pfn = zone->zone_start_pfn;
	unsigned long end_pfn = zone_end_pfn(zone);
	const bool sync = cc->mode != MIGRATE_ASYNC;

	cc->migratetype = gfpflags_to_migratetype(cc->gfp_mask);
	/*
	 * Check the zone watermarks to decide whether compaction should run;
	 * COMPACT_CONTINUE means compaction may proceed.
	 */
	ret = compaction_suitable(zone, cc->order, cc->alloc_flags,
							cc->classzone_idx);
	/* Compaction is likely to fail */
	if (ret == COMPACT_SUCCESS || ret == COMPACT_SKIPPED)
		return ret;

	/* huh, compaction_suitable is returning something unexpected */
	VM_BUG_ON(ret != COMPACT_CONTINUE);

	/*
	 * Clear pageblock skip if there were failures recently and compaction
	 * is about to be retried after being deferred.
	 */
	if (compaction_restarting(zone, cc->order))
		__reset_isolation_suitable(zone);

	/*
	 * Setup to move all movable pages to the end of the zone. Used cached
	 * information on where the scanners should start (unless we explicitly
	 * want to compact the whole zone), but check that it is initialised
	 * by ensuring the values are within zone boundaries.
	 */
	if (cc->whole_zone) {
		cc->migrate_pfn = start_pfn;
		cc->free_pfn = pageblock_start_pfn(end_pfn - 1);
	} else {
		/* migrate scanner: scans from the zone start for pages to move */
		cc->migrate_pfn = zone->compact_cached_migrate_pfn[sync];
		/* free scanner: scans from the zone end for free target pages */
		cc->free_pfn = zone->compact_cached_free_pfn;
		if (cc->free_pfn < start_pfn || cc->free_pfn >= end_pfn) {
			cc->free_pfn = pageblock_start_pfn(end_pfn - 1);
			zone->compact_cached_free_pfn = cc->free_pfn;
		}
		/* clamp free_pfn and migrate_pfn to the zone boundaries */
		if (cc->migrate_pfn < start_pfn || cc->migrate_pfn >= end_pfn) {
			cc->migrate_pfn = start_pfn;
			zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn;
			zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn;
		}

		if (cc->migrate_pfn == start_pfn)
			cc->whole_zone = true;
	}

	cc->last_migrated_pfn = 0;

	trace_mm_compaction_begin(start_pfn, cc->migrate_pfn,
				cc->free_pfn, end_pfn, sync);

	migrate_prep_local();

	/*
	 * Main loop: isolate migratable pages scanning up from the zone start
	 * and move them into free pages found scanning down from the zone
	 * end, until compact_finished() reports compaction is done.
	 */
	while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) {
		int err;

		/* scan for migratable pages and isolate them from their LRU lists */
		switch (isolate_migratepages(zone, cc)) {
		case ISOLATE_ABORT:	/* isolation aborted (contention) */
			ret = COMPACT_CONTENDED;
			/* return the already-isolated pages */
			putback_movable_pages(&cc->migratepages);
			cc->nr_migratepages = 0;
			goto out;
		case ISOLATE_NONE:	/* nothing suitable found */
			/*
			 * We haven't isolated and migrated anything, but
			 * there might still be unflushed migrations from
			 * previous cc->order aligned block.
			 */
			goto check_drain;
		case ISOLATE_SUCCESS:	/* pages isolated, proceed to migrate */
			;
		}

		/*
		 * migrate_pages() is the core migration routine: it takes
		 * pages off cc->migratepages and tries to migrate each one.
		 */
		err = migrate_pages(&cc->migratepages, compaction_alloc,
				compaction_free, (unsigned long)cc, cc->mode,
				MR_COMPACTION);

		trace_mm_compaction_migratepages(cc->nr_migratepages, err,
							&cc->migratepages);

		/* All pages were either migrated or will be released */
		cc->nr_migratepages = 0;
		/* pages that failed to migrate go back to their LRU lists */
		if (err) {
			putback_movable_pages(&cc->migratepages);
			/*
			 * migrate_pages() may return -ENOMEM when scanners meet
			 * and we want compact_finished() to detect it
			 */
			if (err == -ENOMEM && !compact_scanners_met(cc)) {
				ret = COMPACT_CONTENDED;
				goto out;
			}
			/*
			 * We failed to migrate at least one page in the current
			 * order-aligned block, so skip the rest of it.
			 */
			if (cc->direct_compaction &&
						(cc->mode == MIGRATE_ASYNC)) {
				cc->migrate_pfn = block_end_pfn(
						cc->migrate_pfn - 1, cc->order);
				/* Draining pcplists is useless in this case */
				cc->last_migrated_pfn = 0;
			}
		}

check_drain:
		/*
		 * Has the migration scanner moved away from the previous
		 * cc->order aligned block where we migrated from? If yes,
		 * flush the pages that were freed, so that they can merge and
		 * compact_finished() can detect immediately if allocation
		 * would succeed.
		 */
		if (cc->order > 0 && cc->last_migrated_pfn) {
			int cpu;
			unsigned long current_block_start =
				block_start_pfn(cc->migrate_pfn, cc->order);

			if (cc->last_migrated_pfn < current_block_start) {
				cpu = get_cpu();
				lru_add_drain_cpu(cpu);
				drain_local_pages(zone);
				put_cpu();
				/* No more flushing until we migrate again */
				cc->last_migrated_pfn = 0;
			}
		}
	}

out:
	/*
	 * Release free pages and update where the free scanner should restart,
	 * so we don't leave any returned pages behind in the next attempt.
	 */
	if (cc->nr_freepages > 0) {
		unsigned long free_pfn = release_freepages(&cc->freepages);

		cc->nr_freepages = 0;
		VM_BUG_ON(free_pfn == 0);
		/* The cached pfn is always the first in a pageblock */
		free_pfn = pageblock_start_pfn(free_pfn);
		/*
		 * Only go back, not forward. The cached pfn might have been
		 * already reset to zone end in compact_finished()
		 */
		if (free_pfn > zone->compact_cached_free_pfn)
			zone->compact_cached_free_pfn = free_pfn;
	}

	count_compact_events(COMPACTMIGRATE_SCANNED, cc->total_migrate_scanned);
	count_compact_events(COMPACTFREE_SCANNED, cc->total_free_scanned);

	trace_mm_compaction_end(start_pfn, cc->migrate_pfn,
				cc->free_pfn, end_pfn, sync, ret);

	return ret;
}

compact_zone主要流程如下图:

这里要分析3个函数:

  1. compaction_suitable 判断是否需要进行内存规整
  2. compact_finished 判断是否可以停止内存规整
  3. migrate_pages 页面迁移核心函数

先看compaction_suitable 函数如何判断是否需要进行内存规整:

/**
 * compaction_suitable: Is this suitable to run compaction on this zone now?
 * Returns
 *   COMPACT_SKIPPED  - If there are too few free pages for compaction
 *   COMPACT_SUCCESS  - If the allocation would succeed without compaction
 *   COMPACT_CONTINUE - If compaction should run now
 */
enum compact_result compaction_suitable(struct zone *zone, int order,
					unsigned int alloc_flags,
					int classzone_idx)
{
	enum compact_result ret;
	int fragindex;

	/* the main suitability decision is made by __compaction_suitable() */
	ret = __compaction_suitable(zone, order, alloc_flags, classzone_idx,
				    zone_page_state(zone, NR_FREE_PAGES));
	/*
	 * fragmentation index determines if allocation failures are due to
	 * low memory or external fragmentation
	 *
	 * index of -1000 would imply allocations might succeed depending on
	 * watermarks, but we already failed the high-order watermark check
	 * index towards 0 implies failure is due to lack of memory
	 * index towards 1000 implies failure is due to fragmentation
	 *
	 * Only compact if a failure would be due to fragmentation. Also
	 * ignore fragindex for non-costly orders where the alternative to
	 * a successful reclaim/compaction is OOM. Fragindex and the
	 * vm.extfrag_threshold sysctl is meant as a heuristic to prevent
	 * excessive compaction for costly orders, but it should not be at the
	 * expense of system stability.
	 */
	if (ret == COMPACT_CONTINUE && (order > PAGE_ALLOC_COSTLY_ORDER)) {
		fragindex = fragmentation_index(zone, order);
		if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold)
			ret = COMPACT_NOT_SUITABLE_ZONE;
	}

	trace_mm_compaction_suitable(zone, order, ret);
	if (ret == COMPACT_NOT_SUITABLE_ZONE)
		ret = COMPACT_SKIPPED;

	return ret;
}
static enum compact_result __compaction_suitable(struct zone *zone, int order,
					unsigned int alloc_flags,
					int classzone_idx,
					unsigned long wmark_target)
{
	unsigned long watermark;

	if (is_via_compact_memory(order))
		return COMPACT_CONTINUE;

	watermark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
	/*
	 * If watermarks for high-order allocation are already met, there
	 * should be no need for compaction at all.
	 */
	if (zone_watermark_ok(zone, order, watermark, classzone_idx,
								alloc_flags))
		return COMPACT_SUCCESS;

	/*
	 * Watermarks for order-0 must be met for compaction to be able to
	 * isolate free pages for migration targets. This means that the
	 * watermark and alloc_flags have to match, or be more pessimistic than
	 * the check in __isolate_free_page(). We don't use the direct
	 * compactor's alloc_flags, as they are not relevant for freepage
	 * isolation. We however do use the direct compactor's classzone_idx to
	 * skip over zones where lowmem reserves would prevent allocation even
	 * if compaction succeeds.
	 * For costly orders, we require low watermark instead of min for
	 * compaction to proceed to increase its chances.
	 * ALLOC_CMA is used, as pages in CMA pageblocks are considered
	 * suitable migration targets
	 */
	watermark = (order > PAGE_ALLOC_COSTLY_ORDER) ?
				low_wmark_pages(zone) : min_wmark_pages(zone);
	watermark += compact_gap(order);	/* adds a gap of 2 << order pages */
	/*
	 * If even the raised watermark cannot be met, the zone has too few
	 * free pages for compaction to find targets - skip this zone.
	 */
	if (!__zone_watermark_ok(zone, 0, watermark, classzone_idx,
						ALLOC_CMA, wmark_target))
		return COMPACT_SKIPPED;

	return COMPACT_CONTINUE;
}

compaction_suitable 主要通过__compaction_suitable来判断是否需要进行内存规整,__compaction_suitable则通过zone的水位来判断是否应该进行内存规整:如果当前水位已经满足本次order分配需求,则不需要规整;如果空闲页面连加上compact_gap(order)余量之后的水位都达不到,说明zone中空闲页面太少,规整也难以成功,跳过该zone;只有介于两者之间时才进行内存规整。

然后看看compact_finished 如何判断是否可以停止内存规整:

static enum compact_result compact_finished(struct zone *zone,
			struct compact_control *cc)
{
	int ret;

	ret = __compact_finished(zone, cc);	/* the real completion check */
	trace_mm_compaction_finished(zone, cc->order, ret);	/* trace the result */
	if (ret == COMPACT_NO_SUITABLE_PAGE)
		ret = COMPACT_CONTINUE;

	return ret;
}

static enum compact_result __compact_finished(struct zone *zone,
			struct compact_control *cc)
{
	unsigned int order;
	const int migratetype = cc->migratetype;

	if (cc->contended || fatal_signal_pending(current))
		return COMPACT_CONTENDED;

	/*
	 * Compaction run completes if the migrate and free scanner meet.
	 * The migrate scanner moves up from the zone start and the free
	 * scanner moves down from the zone end; when they meet, stop.
	 */
	if (compact_scanners_met(cc)) {
		/* Let the next compaction start anew. */
		reset_cached_positions(zone);

		/*
		 * Mark that the PG_migrate_skip information should be cleared
		 * by kswapd when it goes to sleep. kcompactd does not set the
		 * flag itself as the decision to be clear should be directly
		 * based on an allocation request.
		 */
		if (cc->direct_compaction)
			zone->compact_blockskip_flush = true;

		if (cc->whole_zone)
			return COMPACT_COMPLETE;
		else
			return COMPACT_PARTIAL_SKIPPED;
	}

	/* order == -1 means compaction was explicitly requested; keep going */
	if (is_via_compact_memory(cc->order))
		return COMPACT_CONTINUE;

	if (cc->finishing_block) {
		/*
		 * We have finished the pageblock, but better check again that
		 * we really succeeded.
		 */
		if (IS_ALIGNED(cc->migrate_pfn, pageblock_nr_pages))
			cc->finishing_block = false;
		else
			return COMPACT_CONTINUE;
	}

	/* Direct compactor: Is a suitable page free? */
	for (order = cc->order; order < MAX_ORDER; order++) {
		struct free_area *area = &zone->free_area[order];
		bool can_steal;

		/* Job done if page is free of the right migratetype */
		if (!list_empty(&area->free_list[migratetype]))
			return COMPACT_SUCCESS;

#ifdef CONFIG_CMA
		/* MIGRATE_MOVABLE can fallback on MIGRATE_CMA */
		if (migratetype == MIGRATE_MOVABLE &&
			!list_empty(&area->free_list[MIGRATE_CMA]))
			return COMPACT_SUCCESS;
#endif
		/*
		 * Job done if allocation would steal freepages from
		 * other migratetype buddy lists.
		 */
		if (find_suitable_fallback(area, order, migratetype,
						true, &can_steal) != -1) {

			/* movable pages are OK in any pageblock */
			if (migratetype == MIGRATE_MOVABLE)
				return COMPACT_SUCCESS;

			/*
			 * We are stealing for a non-movable allocation. Make
			 * sure we finish compacting the current pageblock
			 * first so it is as free as possible and we won't
			 * have to steal another one soon. This only applies
			 * to sync compaction, as async compaction operates
			 * on pageblocks of the same migratetype.
			 */
			if (cc->mode == MIGRATE_ASYNC ||
					IS_ALIGNED(cc->migrate_pfn,
							pageblock_nr_pages)) {
				return COMPACT_SUCCESS;
			}

			cc->finishing_block = true;
			return COMPACT_CONTINUE;
		}
	}

	return COMPACT_NO_SUITABLE_PAGE;
}

compact_finished主要通过__compact_finished来判断内存规整是否完成,__compact_finished判断内存规整流程是否可以结束,结束的条件有两个:一是cc->migrate_pfn和cc->free_pfn两个扫描指针相遇;二是遍历zone的free_area空闲链表,找到了能满足order分配需求的空闲页面(即规整已经达到目的)。

最后要看看migrate_pages 这个页面迁移核心函数如何进行页面迁移,首先知道migrate_pages 调用参数,

migrate_pages(&cc->migratepages, compaction_alloc,compaction_free, (unsigned long)cc, cc->mode,MR_COMPACTION);

migrate_pages 要从cc->migratepages中摘取页,然后尝试去迁移。第二个参数compaction_alloc用于为迁移分配目的页面(从cc->freepages取),第三个参数compaction_free用于在迁移失败时把目的页面归还到cc->freepages。
所以,先看看compaction_alloc和compaction_free函数,函数位于mm/compaction.c文件:

static struct page *compaction_alloc(struct page *migratepage,
					unsigned long data)
{
	struct compact_control *cc = (struct compact_control *)data;
	struct page *freepage;

	/*
	 * Isolate free pages if necessary, and if we are not aborting due to
	 * contention.
	 */
	if (list_empty(&cc->freepages)) {
		if (!cc->contended)
			isolate_freepages(cc);	/* scan for free pages usable as migration targets */

		if (list_empty(&cc->freepages))	/* no target page could be found */
			return NULL;
	}

	/* take the first free page off cc->freepages as the migration target */
	freepage = list_entry(cc->freepages.next, struct page, lru);
	list_del(&freepage->lru);	/* remove it from cc->freepages */
	cc->nr_freepages--;

	return freepage;
}

static void compaction_free(struct page *page, unsigned long data)
{
	struct compact_control *cc = (struct compact_control *)data;

	/* migration failed: return the target page to cc->freepages */
	list_add(&page->lru, &cc->freepages);
	cc->nr_freepages++;
}

compaction_alloc从zone的末尾开始查找空闲页面,并把空闲页面添加到cc->freepages链表中,然后从cc->freepages中摘除页面,返回给migrate_pages作为迁移使用。compaction_free是规整失败的处理函数,将空闲页面返回给cc->freepages。
最后看看migrate_pages做了什么:

int migrate_pages(struct list_head *from, new_page_t get_new_page,
		free_page_t put_new_page, unsigned long private,
		enum migrate_mode mode, int reason)
{
	int retry = 1;
	int nr_failed = 0;
	int nr_succeeded = 0;
	int pass = 0;
	struct page *page;
	struct page *page2;
	int swapwrite = current->flags & PF_SWAPWRITE;
	int rc;

	if (!swapwrite)
		current->flags |= PF_SWAPWRITE;

	/*
	 * Up to 10 passes: take each page off 'from' and try to migrate it
	 * with unmap_and_move(); MIGRATEPAGE_SUCCESS means it migrated.
	 */
	for(pass = 0; pass < 10 && retry; pass++) {
		retry = 0;

		list_for_each_entry_safe(page, page2, from, lru) {
retry:
			cond_resched();

			if (PageHuge(page))
				rc = unmap_and_move_huge_page(get_new_page,
						put_new_page, private, page,
						pass > 2, mode, reason);
			else
				/* try to migrate 'page' to a newly allocated page */
				rc = unmap_and_move(get_new_page, put_new_page,
						private, page, pass > 2, mode,
						reason);

			switch(rc) {
			case -ENOMEM:
				/*
				 * THP migration might be unsupported or the
				 * allocation could've failed so we should
				 * retry on the same page with the THP split
				 * to base pages.
				 *
				 * Head page is retried immediately and tail
				 * pages are added to the tail of the list so
				 * we encounter them after the rest of the list
				 * is processed.
				 */
				if (PageTransHuge(page) && !PageHuge(page)) {
					lock_page(page);
					rc = split_huge_page_to_list(page, from);
					unlock_page(page);
					if (!rc) {
						list_safe_reset_next(page, page2, lru);
						goto retry;
					}
				}
				nr_failed++;
				goto out;
			case -EAGAIN:
				retry++;
				break;
			case MIGRATEPAGE_SUCCESS:
				nr_succeeded++;
				break;
			default:
				/*
				 * Permanent failure (-EBUSY, -ENOSYS, etc.):
				 * unlike -EAGAIN case, the failed page is
				 * removed from migration page list and not
				 * retried in the next outer loop.
				 */
				nr_failed++;
				break;
			}
		}
	}
	nr_failed += retry;
	rc = nr_failed;
out:
	if (nr_succeeded)
		count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded);
	if (nr_failed)
		count_vm_events(PGMIGRATE_FAIL, nr_failed);
	trace_mm_migrate_pages(nr_succeeded, nr_failed, mode, reason);

	if (!swapwrite)
		current->flags &= ~PF_SWAPWRITE;

	return rc;
}

migrate_pages主要调用unmap_and_move来尝试迁移页面page到新分配的页面newpage,下面看看unmap_and_move函数:

static ICE_noinline int unmap_and_move(new_page_t get_new_page,
				   free_page_t put_new_page,
				   unsigned long private, struct page *page,
				   int force, enum migrate_mode mode,
				   enum migrate_reason reason)
{
	int rc = MIGRATEPAGE_SUCCESS;
	struct page *newpage;

	if (!thp_migration_supported() && PageTransHuge(page))
		return -ENOMEM;

	/* get_new_page is the caller-supplied allocator, e.g. compaction_alloc() */
	newpage = get_new_page(page, private);
	if (!newpage)
		return -ENOMEM;

	if (page_count(page) == 1) {
		/* page was freed from under us. So we are done. */
		ClearPageActive(page);
		ClearPageUnevictable(page);
		if (unlikely(__PageMovable(page))) {
			lock_page(page);
			if (!PageMovable(page))
				__ClearPageIsolated(page);
			unlock_page(page);
		}
		if (put_new_page)
			put_new_page(newpage, private);
		else
			put_page(newpage);
		goto out;
	}

	/* the real work: try to migrate 'page' to the freshly allocated 'newpage' */
	rc = __unmap_and_move(page, newpage, force, mode);
	if (rc == MIGRATEPAGE_SUCCESS)
		set_page_owner_migrate_reason(newpage, reason);

out:
	if (rc != -EAGAIN) {
		/*
		 * A page that has been migrated has all references
		 * removed and will be freed. A page that has not been
		 * migrated will have kepts its references and be
		 * restored.
		 */
		list_del(&page->lru);

		/*
		 * Compaction can migrate also non-LRU pages which are
		 * not accounted to NR_ISOLATED_*. They can be recognized
		 * as __PageMovable
		 */
		if (likely(!__PageMovable(page)))
			mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
					page_is_file_cache(page), -hpage_nr_pages(page));
	}

	/*
	 * If migration is successful, releases reference grabbed during
	 * isolation. Otherwise, restore the page to right list unless
	 * we want to retry.
	 */
	if (rc == MIGRATEPAGE_SUCCESS) {
		put_page(page);
		if (reason == MR_MEMORY_FAILURE) {
			/*
			 * Set PG_HWPoison on just freed page
			 * intentionally. Although it's rather weird,
			 * it's how HWPoison flag works at the moment.
			 */
			if (set_hwpoison_free_buddy_page(page))
				num_poisoned_pages_inc();
		}
	} else {
		if (rc != -EAGAIN) {
			if (likely(!__PageMovable(page))) {
				putback_lru_page(page);
				goto put_new;
			}

			lock_page(page);
			if (PageMovable(page))
				putback_movable_page(page);
			else
				__ClearPageIsolated(page);
			unlock_page(page);
			put_page(page);
		}
put_new:
		if (put_new_page)
			put_new_page(newpage, private);
		else
			put_page(newpage);
	}

	return rc;
}

static int __unmap_and_move(struct page *page, struct page *newpage,
				int force, enum migrate_mode mode)
{
	int rc = -EAGAIN;
	int page_was_mapped = 0;
	struct anon_vma *anon_vma = NULL;
	bool is_lru = !__PageMovable(page);

	if (!trylock_page(page)) {	/* try to lock the old page */
		/* lock failed: skip the page unless forced synchronous migration */
		if (!force || mode == MIGRATE_ASYNC)
			goto out;

		/*
		 * It's not safe for direct compaction to call lock_page.
		 * For example, during page readahead pages are added locked
		 * to the LRU. Later, when the IO completes the pages are
		 * marked uptodate and unlocked. However, the queueing
		 * could be merging multiple pages for one bio (e.g.
		 * mpage_readpages). If an allocation happens for the
		 * second or third page, the process can end up locking
		 * the same page twice and deadlocking. Rather than
		 * trying to be clever about what pages can be locked,
		 * avoid the use of lock_page for direct compaction
		 * altogether.
		 */
		if (current->flags & PF_MEMALLOC)
			goto out;

		lock_page(page);
	}

	if (PageWriteback(page)) {
		/*
		 * Only in the case of a full synchronous migration is it
		 * necessary to wait for PageWriteback. In the async case,
		 * the retry loop is too short and in the sync-light case,
		 * the overhead of stalling is too much
		 */
		switch (mode) {
		case MIGRATE_SYNC:
		case MIGRATE_SYNC_NO_COPY:
			break;
		default:
			rc = -EBUSY;
			goto out_unlock;
		}
		if (!force)
			goto out_unlock;
		wait_on_page_writeback(page);
	}

	/*
	 * By try_to_unmap(), page->mapcount goes down to 0 here. In this case,
	 * we cannot notice that anon_vma is freed while we migrates a page.
	 * This get_anon_vma() delays freeing anon_vma pointer until the end
	 * of migration. File cache pages are no problem because of page_lock()
	 * File Caches may use write_page() or lock_page() in migration, then,
	 * just care Anon page here.
	 *
	 * Only page_get_anon_vma() understands the subtleties of
	 * getting a hold on an anon_vma from outside one of its mms.
	 * But if we cannot get anon_vma, then we won't need it anyway,
	 * because that implies that the anon page is no longer mapped
	 * (and cannot be remapped so long as we hold the page lock).
	 */
	if (PageAnon(page) && !PageKsm(page))
		anon_vma = page_get_anon_vma(page);

	/*
	 * Block others from accessing the new page when we get around to
	 * establishing additional references. We are usually the only one
	 * holding a reference to newpage at this point. We used to have a BUG
	 * here if trylock_page(newpage) fails, but would like to allow for
	 * cases where there might be a race with the previous use of newpage.
	 * This is much like races on refcount of oldpage: just don't BUG().
	 */
	if (unlikely(!trylock_page(newpage)))
		goto out_unlock;

	if (unlikely(!is_lru)) {
		rc = move_to_new_page(newpage, page, mode);
		goto out_unlock_both;
	}

	/*
	 * Corner case handling:
	 * 1. When a new swap-cache page is read into, it is added to the LRU
	 * and treated as swapcache but it has no rmap yet.
	 * Calling try_to_unmap() against a page->mapping==NULL page will
	 * trigger a BUG.  So handle it here.
	 * 2. An orphaned page (see truncate_complete_page) might have
	 * fs-private metadata. The page can be picked up due to memory
	 * offlining.  Everywhere else except page reclaim, the page is
	 * invisible to the vm, so the page can not be migrated.  So try to
	 * free the metadata, so the page can be freed.
	 */
	if (!page->mapping) {
		VM_BUG_ON_PAGE(PageAnon(page), page);
		if (page_has_private(page)) {
			try_to_free_buffers(page);
			goto out_unlock_both;
		}
	} else if (page_mapped(page)) {
		/* page has pte mappings: unmap them all, establishing migration ptes */
		VM_BUG_ON_PAGE(PageAnon(page) && !PageKsm(page) && !anon_vma,
				page);
		try_to_unmap(page,
			TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
		page_was_mapped = 1;
	}

	/* all mappings removed: move the page contents to 'newpage' */
	if (!page_mapped(page))
		rc = move_to_new_page(newpage, page, mode);

	if (page_was_mapped)
		remove_migration_ptes(page,
			rc == MIGRATEPAGE_SUCCESS ? newpage : page, false);

out_unlock_both:
	unlock_page(newpage);
out_unlock:
	/* Drop an anon_vma reference if we took one */
	if (anon_vma)
		put_anon_vma(anon_vma);
	unlock_page(page);
out:
	/*
	 * If migration is successful, decrease refcount of the newpage
	 * which will not free the page because new page owner increased
	 * refcounter. As well, if it is LRU page, add the page to LRU
	 * list in here. Use the old state of the isolated source page to
	 * determine if we migrated a LRU page. newpage was already unlocked
	 * and possibly modified by its owner - don't rely on the page
	 * state.
	 */
	if (rc == MIGRATEPAGE_SUCCESS) {
		if (unlikely(!is_lru))
			put_page(newpage);
		else
			putback_lru_page(newpage);
	}

	return rc;
}

unmap_and_move主要调用了__unmap_and_move,__unmap_and_move工作如下图:

其中move_to_new_page是主要将page内容迁移到newpage的函数,move_to_new_page:

static int move_to_new_page(struct page *newpage, struct page *page,
				enum migrate_mode mode)
{
	struct address_space *mapping;
	int rc = -EAGAIN;
	bool is_lru = !__PageMovable(page);

	VM_BUG_ON_PAGE(!PageLocked(page), page);
	VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);

	/*
	 * Look up the page's address_space: NULL for slab or anonymous
	 * pages, swap_address_space() for swapcache pages, otherwise the
	 * page cache's page->mapping.
	 */
	mapping = page_mapping(page);

	if (likely(is_lru)) {
		if (!mapping)
			/* slab or anonymous page: plain copy via migrate_page() */
			rc = migrate_page(mapping, newpage, page, mode);
		else if (mapping->a_ops->migratepage)
			/*
			 * Most pages have a mapping and most filesystems
			 * provide a migratepage callback. Anonymous pages
			 * are part of swap space which also has its own
			 * migratepage callback. This is the most common path
			 * for page migration.
			 */
			rc = mapping->a_ops->migratepage(mapping, newpage,
							page, mode);
		else
			/* mapping without a migratepage callback: generic fallback */
			rc = fallback_migrate_page(mapping, newpage,
							page, mode);
	} else {
		/*
		 * In case of non-lru page, it could be released after
		 * isolation step. In that case, we shouldn't try migration.
		 */
		VM_BUG_ON_PAGE(!PageIsolated(page), page);
		if (!PageMovable(page)) {
			rc = MIGRATEPAGE_SUCCESS;
			__ClearPageIsolated(page);
			goto out;
		}

		rc = mapping->a_ops->migratepage(mapping, newpage,
						page, mode);
		WARN_ON_ONCE(rc == MIGRATEPAGE_SUCCESS &&
			!PageIsolated(page));
	}

	/*
	 * When successful, old pagecache page->mapping must be cleared before
	 * page is freed; but stats require that PageAnon be left as PageAnon.
	 */
	if (rc == MIGRATEPAGE_SUCCESS) {
		if (__PageMovable(page)) {
			VM_BUG_ON_PAGE(!PageIsolated(page), page);

			/*
			 * We clear PG_movable under page_lock so any compactor
			 * cannot try to migrate this page.
			 */
			__ClearPageIsolated(page);
		}

		/*
		 * Anonymous and movable page->mapping will be cleard by
		 * free_pages_prepare so don't reset it here for keeping
		 * the type to work PageAnon, for example.
		 */
		if (!PageMappingFlags(page))
			page->mapping = NULL;

		if (unlikely(is_zone_device_page(newpage))) {
			if (is_device_public_page(newpage))
				flush_dcache_page(newpage);
		} else
			flush_dcache_page(newpage);
	}
out:
	return rc;
}

move_to_new_page函数首先通过page_mapping获取页面的address_space:如果mapping为空(匿名页面或slab页面)就调用migrate_page进行迁移;如果mapping存在且定义了migratepage回调(常见的文件页和交换缓存)就回调migratepage函数进行迁移;如果mapping存在但没有提供migratepage回调,则调用fallback_migrate_page用通用方式完成迁移。现在看看migrate_page函数:

int migrate_page(struct address_space *mapping,
		struct page *newpage, struct page *page,
		enum migrate_mode mode)
{
	int rc;

	BUG_ON(PageWriteback(page));	/* Writeback must be complete */

	/* for anonymous pages (mapping == NULL) this effectively just succeeds */
	rc = migrate_page_move_mapping(mapping, newpage, page, NULL, mode, 0);

	if (rc != MIGRATEPAGE_SUCCESS)
		return rc;

	if (mode != MIGRATE_SYNC_NO_COPY)
		migrate_page_copy(newpage, page);	/* copy contents of page into newpage */
	else
		migrate_page_states(newpage, page);
	return MIGRATEPAGE_SUCCESS;
}

void migrate_page_copy(struct page *newpage, struct page *page)
{
	if (PageHuge(page) || PageTransHuge(page))
		copy_huge_page(newpage, page);
	else
		copy_highpage(newpage, page);

	migrate_page_states(newpage, page);
}
EXPORT_SYMBOL(migrate_page_copy);

回调migratepage函数目前无法确定,后期补上,而fallback_migrate_page现在看看:

static int fallback_migrate_page(struct address_space *mapping,
	struct page *newpage, struct page *page, enum migrate_mode mode)
{
	if (PageDirty(page)) {	/* dirty page */
		/* Only writeback pages in full synchronous migration */
		switch (mode) {
		case MIGRATE_SYNC:
		case MIGRATE_SYNC_NO_COPY:
			break;
		default:
			return -EBUSY;
		}
		return writeout(mapping, page);	/* write the page back instead of migrating */
	}

	/*
	 * Buffers may be managed in a filesystem specific way.
	 * We must have no buffers or drop them.
	 */
	/* page holds private data that cannot be released: report failure */
	if (page_has_private(page) &&
	    !try_to_release_page(page, GFP_KERNEL))
		return -EAGAIN;

	return migrate_page(mapping, newpage, page, mode);
}

fallback_migrate_page主要做了三件事:1.如果文件页是脏页则回写后返回,2.如果文件页持有私有数据且没办法释放则返回错误,3.最后调用migrate_page进行迁移。

linux内存管理(十三)-内存规整过程分析相关推荐

  1. 【Linux 内核 内存管理】内存管理架构 ④ ( 内存分配系统调用过程 | 用户层 malloc free | 系统调用层 brk mmap | 内核层 kmalloc | 内存管理流程 )

    文章目录 一.内存分配系统调用过程 ( 用户层 | 系统调用 | 内核层 ) 二.内存管理流程 一.内存分配系统调用过程 ( 用户层 | 系统调用 | 内核层 ) " 堆内存 " ...

  2. Linux内存管理之内存管理单元(MMU)(二)

    Linux内存管理之内存管理单元(二) 1.1.什么是MMU 在CPU内部,有一个专门的硬件单元来负责这个虚拟页面到物理页面的转换,它被被称为内存管理单元(Memory Management Unit ...

  3. Linux内存管理:内存寻址之分段机制与分页机制

    目录 Linux 内存寻址之分段机制 前言 分段到底是怎么回事? 实模式的诞生(16位处理器及寻址) 保护模式的诞生(32位处理器及寻址) IA32的内存寻址机制 寻址硬件 IA32的三种地址 MMU ...

  4. Linux内存管理:内存分配:slab分配器

    <linux内核之slob.slab.slub> <Linux内核:kmalloc()和SLOB.SLAB.SLUB内存分配器> <Linux内存管理:内存分配:slab ...

  5. Linux内存管理:内存描述之高端内存

    <Linux内存管理:内存描述之内存节点node> <Linux内存管理:内存描述之内存区域zone> <Linux内存管理:内存描述之内存页面page> < ...

  6. Linux内存管理:内存描述之内存页面page

    <Linux内存管理:内存描述之内存节点node> <Linux内存管理:内存描述之内存区域zone> <Linux内存管理:内存描述之内存页面page> 目录 1 ...

  7. Linux内存管理:内存描述之内存节点node

    <Linux内存管理:内存描述之内存区域zone> <Linux内存管理:内存描述之内存节点node> 目录 1 前景回顾 1.1 UMA和NUMA两种模型 1.2 (N)UM ...

  8. 【Linux 内核 内存管理】内存映射相关数据结构 ③ ( vm_area_struct 结构体成员分析 | shared 成员 | anon_vma_chain 成员 | anon_vma 成员 )

    文章目录 一.vm_area_struct 结构体成员分析 1.shared 成员 2.anon_vma_chain 成员 3.anon_vma 成员 二.vm_area_struct 结构体完整源码 ...

  9. C++ 内存管理中内存泄漏问题产生原因以及解决方法

    C++内存管理中内存泄露(memory leak)一般指的是程序在申请内存后,无法释放已经申请的内存空间,内存泄露的积累往往会导致内存溢出. 一.内存分配方式 通常内存分配方式有以下三种: (1)从静 ...

  10. C++内存管理__内存管理(栈、堆(new/delete)、自由存储区(malloc/freee)、全局/静态存储区、常量区)!堆栈内存管理方式的区别

    内存管理是C++最令人切齿痛恨的问题,也是C++最有争议的问题,C++高手从中获得了更好的性能,更大的自由,C++菜鸟的收获则是一遍一遍的检查代码和对C++的痛恨,但内存管理在C++中无处不在,内存泄 ...

最新文章

  1. 安卓开发|自定义监听器的三个步骤
  2. 使用jenkins实现监控嵌入式设备稳定性之二----脚本部分
  3. MongoDB集群分片部署指南
  4. 数据分析模型和工具_数据分析师工具包:模型
  5. 宝塔linux忘记密码,宝塔忘记登录入口了怎么解决 宝塔面板密码忘记了怎么办
  6. webpack 3 零基础入门教程 #6 - 使用 loader 处理 CSS 和 Sass
  7. oracle 存储过程给另一个用户的权限问题
  8. python nose框架_Python测试框架nose的介绍
  9. arduino 步进电机驱动库_Arduino基础入门篇27—步进电机驱动库的使用
  10. 怎么做好客户关系管理培训PPT课件?
  11. 最受商户关注的十大进销存软件,真实功能大测评
  12. 自底向上语法分析LR(1)
  13. 笔记本连接显示器后没有声音_win7系统电脑连接HDMI显示器后没声音的处理方法...
  14. 【计算机视觉】张正友相机标定Calibration原理过程结果
  15. 哪些东西买了之后,会让人因生活质量和幸福感提升而感觉相见恨晚?
  16. 最通俗易懂的短链接原理讲解
  17. 化合物分子 ogb、dgl生成图网络及GNN模型训练;pgl图框架
  18. python中------decode解码出现的0xca问题解决方法
  19. 微信朋友圈里,微信群里的早报新闻简报都是哪里来的呢?
  20. 读书笔记—别让情绪毁了你(插图精读本)

热门文章

  1. Excel VBA 入门(零)
  2. 使用 vue 一套功能 UI 全面 的 后台管理系统
  3. 一周试用yii开发一个带各种该有功能的web程序(二)
  4. 在WPF中创建可换肤的用户界面
  5. 鼎利软件测试终端刷机,世纪鼎利pioneer连接移动平台进行volte测试操作说明.doc...
  6. tcpdf html 支持css吗,TCPDF 5.1 发布,增加对CSS的支持
  7. 计算机问题求解需要研究的要素,关于计算机音乐制作的空间构成要素的研究
  8. html ajax输出表格中,使用Ajax来渲染HTML表格
  9. mysql 可逆编码,简单明白彻底解决 MySQL 中文编码问题
  10. R语言实现混频数据分析实例----midas回归预测