关键词:warn_alloc()、__GFP_XXX、order、CMA等等。

在内存申请的时候经常会遇到类似“ xxx: page allocation failure: order:10...”类型的问题,这是warn_alloc()的输出。

warn_alloc()被如下函数调用:__alloc_pages_slowpath()、__vmalloc_area_node()、__vmalloc_node_range()。

下面分三部分了解这种问题的来龙去脉:

  • 什么情况会导致warn_alloc()?
  • warn_alloc()都做了哪些事情?
  • 结合实际问题分析问题原因。

1.触发warn_alloc()情况

要了解什么情况下会导致warn_alloc(),就需要分析它在何种情况下会被调用。

__alloc_pages_slowpath()表示页面申请进入了slowpath,那相对就有fastpath。

从__alloc_pages_nodemask()中可知,这个fastpath就是get_page_from_freelist()。__alloc_pages_slowpath()是fastpath分配失败之后分配页面的后备选择。

static inline struct page *
__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,struct alloc_context *ac)
{bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM;struct page *page = NULL;unsigned int alloc_flags;unsigned long did_some_progress;enum compact_priority compact_priority;enum compact_result compact_result;int compaction_retries;int no_progress_loops;unsigned long alloc_start = jiffies;unsigned int stall_timeout = 10 * HZ;unsigned int cpuset_mems_cookie;    if (order >= MAX_ORDER) {WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN));return NULL;}    if (WARN_ON_ONCE((gfp_mask & (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)) ==(__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)))gfp_mask &= ~__GFP_ATOMIC;retry_cpuset:compaction_retries = 0;no_progress_loops = 0;compact_priority = DEF_COMPACT_PRIORITY;cpuset_mems_cookie = read_mems_allowed_begin();
ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,ac->high_zoneidx, ac->nodemask);if (!ac->preferred_zoneref->zone)------------------------------------------------找不到合适的zone,进入nopage处理。goto nopage;    alloc_flags = gfp_to_alloc_flags(gfp_mask);if (gfp_mask & __GFP_KSWAPD_RECLAIM)wake_all_kswapds(order, ac);
page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);if (page)goto got_pg;    if (can_direct_reclaim && order > PAGE_ALLOC_COSTLY_ORDER &&!gfp_pfmemalloc_allowed(gfp_mask)) {-----------------------------------------在设置了__GFP_DIRECT_RECLAIM、order大于3(也即分配超过8页内存),并且gfp_pfmemalloc_allowed()返回false(调用者无权使用紧急预留内存)的时候。page = __alloc_pages_direct_compact(gfp_mask, order,alloc_flags, ac,INIT_COMPACT_PRIORITY,&compact_result);---------------------------------------------页面较大情况下,先走直接内存规整(direct compaction)来获取内存。if (page)goto got_pg;        if (gfp_mask & __GFP_NORETRY) {----------------------------------------------不做重试的情况。
if (compact_result == COMPACT_DEFERRED)----------------------------------compaction不成功,进入nopage处理。goto nopage;
compact_priority = INIT_COMPACT_PRIORITY;}}retry:/* Ensure kswapd doesn't accidentally go to sleep as long as we loop */if (gfp_mask & __GFP_KSWAPD_RECLAIM)wake_all_kswapds(order, ac);-------------------------------------------------唤醒kswapd内核线程,让其处于工作状态。if (gfp_pfmemalloc_allowed(gfp_mask))alloc_flags = ALLOC_NO_WATERMARKS;    if (!(alloc_flags & ALLOC_CPUSET) || (alloc_flags & ALLOC_NO_WATERMARKS)) {ac->zonelist = node_zonelist(numa_node_id(), gfp_mask);ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,ac->high_zoneidx, ac->nodemask);}/* Attempt with potentially adjusted zonelist and alloc_flags */page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);-----------------申请内存分配,成功则返回struct page地址。if (page)goto got_pg;/* Caller is not willing to reclaim, we can't balance anything */if (!can_direct_reclaim) {-------------------------------------------------------既不能内存规整direct compact,也无法从freelist获取内存的情况,进入nopage流程。
WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL);goto nopage;}/* Avoid recursion of direct reclaim */if (current->flags & PF_MEMALLOC) {
if (WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) {cond_resched();goto retry;}goto nopage;}/* Avoid allocations with no watermarks from looping endlessly */if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))goto nopage;/* Try direct reclaim and then allocating */page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac,&did_some_progress);if (page)goto got_pg;/* Try direct compaction and then allocating */page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac,compact_priority, &compact_result);if (page)goto got_pg;/* Do not loop if specifically requested */if (gfp_mask & __GFP_NORETRY)--------------------------------------------------------------强调不允许循环重试情况。goto nopage;/** Do not retry costly high order allocations unless they are* __GFP_REPEAT*/if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_REPEAT))-------------------------针对高order情况,并且不允许__GFP_REPEAT的情况,进入nopage流程。goto nopage;/* Make sure we know about allocations which stall for too long */if (time_after(jiffies, alloc_start + stall_timeout)) {------------------------------------内存分配持续时间超过stall_timeout,初始为10秒,后面以10秒递增报警。warn_alloc(gfp_mask,"page allocation stalls for %ums, order:%u",jiffies_to_msecs(jiffies-alloc_start), order);stall_timeout += 10 * HZ;}if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags,did_some_progress > 0, &no_progress_loops))goto retry;    if (did_some_progress > 0 &&should_compact_retry(ac, order, alloc_flags,compact_result, &compact_priority,&compaction_retries))goto retry;    if (read_mems_allowed_retry(cpuset_mems_cookie))goto retry_cpuset;/* Reclaim has failed us, start killing things */page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress);----------------------分配页面,并且判断是否需要启动OOM killer,did_some_progress会导致retry。如果order大于3(PAGE_ALLOC_COSTLY_ORDER)则不会进入OOM。if (page)goto got_pg;/* Retry as long as the OOM killer is making progress */if (did_some_progress) {no_progress_loops = 0;goto retry;}nopage:
if (read_mems_allowed_retry(cpuset_mems_cookie))goto retry_cpuset;----------------------------------------------------------------------进入retry_cpuset循环处理。warn_alloc(gfp_mask,"page allocation failure: order:%u", order);----------------------------------------无法满足分配order大小页面。
got_pg:return page;
}

下面两个函数都是vmalloc相关,__vmalloc_area_node()在分配失败之后进入fail,调用warn_alloc()输出log。

static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,pgprot_t prot, int node)
{struct page **pages;unsigned int nr_pages, array_size, i;const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;const gfp_t alloc_mask = gfp_mask | __GFP_NOWARN;nr_pages = get_vm_area_size(area) >> PAGE_SHIFT;array_size = (nr_pages * sizeof(struct page *));area->nr_pages = nr_pages;/* Please note that the recursion is strictly bounded. */if (array_size > PAGE_SIZE) {pages = __vmalloc_node(array_size, 1, nested_gfp|__GFP_HIGHMEM,PAGE_KERNEL, node, area->caller);} else {pages = kmalloc_node(array_size, nested_gfp, node);}area->pages = pages;if (!area->pages) {remove_vm_area(area->addr);kfree(area);return NULL;}for (i = 0; i < area->nr_pages; i++) {struct page *page;if (node == NUMA_NO_NODE)page = alloc_page(alloc_mask);elsepage = alloc_pages_node(node, alloc_mask, 0);if (unlikely(!page)) {/* Successfully allocated i pages, free them in __vunmap() */area->nr_pages = i;goto fail;}area->pages[i] = page;if (gfpflags_allow_blocking(gfp_mask))cond_resched();}if (map_vm_area(area, prot, pages))goto fail;return area->addr;fail:warn_alloc(gfp_mask,"vmalloc: allocation failure, allocated %ld of %ld bytes",(area->nr_pages*PAGE_SIZE), area->size);vfree(area->addr);return NULL;
}

void *__vmalloc_node_range(unsigned long size, unsigned long align,unsigned long start, unsigned long end, gfp_t gfp_mask,pgprot_t prot, unsigned long vm_flags, int node,const void *caller)
{struct vm_struct *area;void *addr;unsigned long real_size = size;size = PAGE_ALIGN(size);if (!size || (size >> PAGE_SHIFT) > totalram_pages)goto fail;area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNINITIALIZED |vm_flags, start, end, node, gfp_mask, caller);if (!area)goto fail;addr = __vmalloc_area_node(area, gfp_mask, prot, node);if (!addr)return NULL;    clear_vm_uninitialized_flag(area);    kmemleak_alloc(addr, real_size, 2, gfp_mask);return addr;fail:warn_alloc(gfp_mask,"vmalloc: allocation failure: %lu bytes", real_size);return NULL;
}

2. warn_alloc()解析

warn_alloc()首先显示相关进程和内存分配gfp_mask信息,然后打印栈信息,最后通过show_mem()显示内存信息。

void warn_alloc(gfp_t gfp_mask, const char *fmt, ...)
{unsigned int filter = SHOW_MEM_FILTER_NODES;struct va_format vaf;va_list args;if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) ||debug_guardpage_minorder() > 0)return;    if (!(gfp_mask & __GFP_NOMEMALLOC))if (test_thread_flag(TIF_MEMDIE) ||(current->flags & (PF_MEMALLOC | PF_EXITING)))filter &= ~SHOW_MEM_FILTER_NODES;if (in_interrupt() || !(gfp_mask & __GFP_DIRECT_RECLAIM))filter &= ~SHOW_MEM_FILTER_NODES;pr_warn("%s: ", current->comm);------------------------------------显示对应进程名称。va_start(args, fmt);vaf.fmt = fmt;vaf.va = &args;pr_cont("%pV", &vaf);va_end(args);------------------------------------------------------显示warn_alloc()传入的参数。pr_cont(", mode:%#x(%pGg)\n", gfp_mask, &gfp_mask);----------------显示gfp_mask。dump_stack();------------------------------------------------------显示栈信息。if (!should_suppress_show_mem())show_mem(filter);----------------------------------------------显示内存信息,这里是重点。
}

show_mem()显示详细的内存信息。

void show_mem(unsigned int filter)
{pg_data_t *pgdat;unsigned long total = 0, reserved = 0, highmem = 0;printk("Mem-Info:\n");show_free_areas(filter);for_each_online_pgdat(pgdat) {unsigned long flags;int zoneid;pgdat_resize_lock(pgdat, &flags);for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {struct zone *zone = &pgdat->node_zones[zoneid];if (!populated_zone(zone))continue;total += zone->present_pages;reserved += zone->present_pages - zone->managed_pages;if (is_highmem_idx(zoneid))highmem += zone->present_pages;}pgdat_resize_unlock(pgdat, &flags);}printk("%lu pages RAM\n", total);-------------------------------整个平台的页面统计信息:所有页面数、reserved、cma等等。printk("%lu pages HighMem/MovableOnly\n", highmem);printk("%lu pages reserved\n", reserved);
#ifdef CONFIG_CMAprintk("%lu pages cma reserved\n", totalcma_pages);
#endif
#ifdef CONFIG_QUICKLISTprintk("%lu pages in pagetable cache\n",quicklist_total_size());
#endif
#ifdef CONFIG_MEMORY_FAILUREprintk("%lu pages hwpoisoned\n", atomic_long_read(&num_poisoned_pages));
#endif
}

show_free_areas()从所有node、不同node、不同zone、同一zone下不同order分别显示空闲页面信息。

void show_free_areas(unsigned int filter)
{unsigned long free_pcp = 0;int cpu;struct zone *zone;pg_data_t *pgdat;for_each_populated_zone(zone) {if (skip_free_areas_node(filter, zone_to_nid(zone)))continue;for_each_online_cpu(cpu)free_pcp += per_cpu_ptr(zone->pageset, cpu)->pcp.count;}printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n"-----------------显示所有node的统计信息。" active_file:%lu inactive_file:%lu isolated_file:%lu\n"" unevictable:%lu dirty:%lu writeback:%lu unstable:%lu\n"" slab_reclaimable:%lu slab_unreclaimable:%lu\n"" mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n"" free:%lu free_pcp:%lu free_cma:%lu\n",global_node_page_state(NR_ACTIVE_ANON),global_node_page_state(NR_INACTIVE_ANON),global_node_page_state(NR_ISOLATED_ANON),global_node_page_state(NR_ACTIVE_FILE),global_node_page_state(NR_INACTIVE_FILE),global_node_page_state(NR_ISOLATED_FILE),global_node_page_state(NR_UNEVICTABLE),global_node_page_state(NR_FILE_DIRTY),global_node_page_state(NR_WRITEBACK),global_node_page_state(NR_UNSTABLE_NFS),global_page_state(NR_SLAB_RECLAIMABLE),global_page_state(NR_SLAB_UNRECLAIMABLE),global_node_page_state(NR_FILE_MAPPED),global_node_page_state(NR_SHMEM),global_page_state(NR_PAGETABLE),global_page_state(NR_BOUNCE),global_page_state(NR_FREE_PAGES),free_pcp,global_page_state(NR_FREE_CMA_PAGES));for_each_online_pgdat(pgdat) {-------------------------------------------------分别显示不同node的统计信息。printk("Node %d"" active_anon:%lukB"" inactive_anon:%lukB"" active_file:%lukB"" inactive_file:%lukB"" unevictable:%lukB"" isolated(anon):%lukB"" isolated(file):%lukB"" mapped:%lukB"" dirty:%lukB"" writeback:%lukB"" shmem:%lukB"
#ifdef CONFIG_TRANSPARENT_HUGEPAGE" shmem_thp: %lukB"" shmem_pmdmapped: %lukB"" anon_thp: %lukB"
#endif" writeback_tmp:%lukB"" unstable:%lukB"" pages_scanned:%lu"" all_unreclaimable? %s""\n",pgdat->node_id,K(node_page_state(pgdat, NR_ACTIVE_ANON)),K(node_page_state(pgdat, NR_INACTIVE_ANON)),K(node_page_state(pgdat, NR_ACTIVE_FILE)),K(node_page_state(pgdat, NR_INACTIVE_FILE)),K(node_page_state(pgdat, NR_UNEVICTABLE)),K(node_page_state(pgdat, NR_ISOLATED_ANON)),K(node_page_state(pgdat, NR_ISOLATED_FILE)),K(node_page_state(pgdat, NR_FILE_MAPPED)),K(node_page_state(pgdat, NR_FILE_DIRTY)),K(node_page_state(pgdat, NR_WRITEBACK)),K(node_page_state(pgdat, NR_SHMEM)),
#ifdef CONFIG_TRANSPARENT_HUGEPAGEK(node_page_state(pgdat, NR_SHMEM_THPS) * HPAGE_PMD_NR),K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED)* HPAGE_PMD_NR),K(node_page_state(pgdat, NR_ANON_THPS) * HPAGE_PMD_NR),
#endifK(node_page_state(pgdat, NR_WRITEBACK_TEMP)),K(node_page_state(pgdat, NR_UNSTABLE_NFS)),node_page_state(pgdat, NR_PAGES_SCANNED),!pgdat_reclaimable(pgdat) ? "yes" : "no");}for_each_populated_zone(zone) {----------------------------------------------分别显示所有zone的统计信息。int i;if (skip_free_areas_node(filter, zone_to_nid(zone)))continue;free_pcp = 0;for_each_online_cpu(cpu)free_pcp += per_cpu_ptr(zone->pageset, cpu)->pcp.count;show_node(zone);printk(KERN_CONT"%s"" free:%lukB"" min:%lukB"" low:%lukB"" high:%lukB"" active_anon:%lukB"" inactive_anon:%lukB"" active_file:%lukB"" inactive_file:%lukB"" unevictable:%lukB"" writepending:%lukB"" present:%lukB"" managed:%lukB"" mlocked:%lukB"" slab_reclaimable:%lukB"" slab_unreclaimable:%lukB"" kernel_stack:%lukB"" pagetables:%lukB"" bounce:%lukB"" free_pcp:%lukB"" local_pcp:%ukB"" free_cma:%lukB""\n",zone->name,K(zone_page_state(zone, NR_FREE_PAGES)),K(min_wmark_pages(zone)),K(low_wmark_pages(zone)),K(high_wmark_pages(zone)),K(zone_page_state(zone, NR_ZONE_ACTIVE_ANON)),K(zone_page_state(zone, NR_ZONE_INACTIVE_ANON)),K(zone_page_state(zone, NR_ZONE_ACTIVE_FILE)),K(zone_page_state(zone, NR_ZONE_INACTIVE_FILE)),K(zone_page_state(zone, NR_ZONE_UNEVICTABLE)),K(zone_page_state(zone, NR_ZONE_WRITE_PENDING)),K(zone->present_pages),K(zone->managed_pages),K(zone_page_state(zone, NR_MLOCK)),K(zone_page_state(zone, NR_SLAB_RECLAIMABLE)),K(zone_page_state(zone, NR_SLAB_UNRECLAIMABLE)),zone_page_state(zone, NR_KERNEL_STACK_KB),K(zone_page_state(zone, NR_PAGETABLE)),K(zone_page_state(zone, NR_BOUNCE)),K(free_pcp),K(this_cpu_read(zone->pageset->pcp.count)),K(zone_page_state(zone, NR_FREE_CMA_PAGES)));printk("lowmem_reserve[]:");for (i = 0; i < MAX_NR_ZONES; i++)printk(KERN_CONT " %ld", zone->lowmem_reserve[i]);printk(KERN_CONT "\n");}for_each_populated_zone(zone) {-------------------------------------------显示所有zone下不同order空闲数目统计信息。unsigned int order;unsigned long nr[MAX_ORDER], flags, total = 0;unsigned char types[MAX_ORDER];if 
(skip_free_areas_node(filter, zone_to_nid(zone)))continue;show_node(zone);printk(KERN_CONT "%s: ", zone->name);spin_lock_irqsave(&zone->lock, flags);for (order = 0; order < MAX_ORDER; order++) {-------------------------遍历当前zone的不同order,不同order区域数目存在nr[]中,total是总的页面数目。struct free_area *area = &zone->free_area[order];int type;nr[order] = area->nr_free;total += nr[order] << order;types[order] = 0;for (type = 0; type < MIGRATE_TYPES; type++) {if (!list_empty(&area->free_list[type]))types[order] |= 1 << type;--------------------------------记录order区域中页面类型。}}spin_unlock_irqrestore(&zone->lock, flags);for (order = 0; order < MAX_ORDER; order++) {printk(KERN_CONT "%lu*%lukB ",nr[order], K(1UL) << order);-------------------------------输出不同order区域数量和区域大小。if (nr[order])show_migration_types(types[order]);---------------------------输出页面类型。}printk(KERN_CONT "= %lukB\n", K(total));------------------------------显示总大小。}hugetlb_show_meminfo();---------------------------------------------------显示huge page统计信息。printk("%ld total pagecache pages\n", global_node_page_state(NR_FILE_PAGES));---总的文件缓存页面数量。show_swap_cache_info();----------------------------------------------------显示swap cache统计信息。
}

不同的页面有不同的属性,在warn_alloc()输出的字母对应了页面的属性。主要有M、U、E、C。

static void show_migration_types(unsigned char type)
{static const char types[MIGRATE_TYPES] = {[MIGRATE_UNMOVABLE]    = 'U',--------------------------不可移动。[MIGRATE_MOVABLE]    = 'M',----------------------------可移动。[MIGRATE_RECLAIMABLE]    = 'E',------------------------可回收。[MIGRATE_HIGHATOMIC]    = 'H',-------------------------等同于MIGRATE_PCPTYPES。
#ifdef CONFIG_CMA[MIGRATE_CMA]        = 'C',----------------------------CMA区域页面。
#endif
#ifdef CONFIG_MEMORY_ISOLATION[MIGRATE_ISOLATE]    = 'I',
#endif};char tmp[MIGRATE_TYPES + 1];char *p = tmp;int i;for (i = 0; i < MIGRATE_TYPES; i++) {if (type & (1 << i))*p++ = types[i];}*p = '\0';printk(KERN_CONT "(%s) ", tmp);
}

经过上面的分析,基本上明白每一行的输出的来源。具体每个字段表示的内存含义,还需要结合代码阅读。

3. 实例解析

下面结合实际问题log输出来分析问题,进而解决问题。

下面log的第一行表示进程xxxx在分配order为10(即2^10=1024个连续物理页面,页大小4kB时共4MB)的内存时失败,mode表示内存分配的页模式,具体在include/linux/gfp.h中定义。

内存碎片会导致page分配失败,即使还有很多空闲page。当order=0时,表示系统当前已经完全OOM。

[ 2161.623563] xxxx: page allocation failure: order:10, mode:0x2084020(GFP_ATOMIC|__GFP_COMP)-----------------warn_alloc(),从这里可以知道是哪个进程页面分配失败,并且有对应的gfp_mask。
[ 2161.632085] CPU: 0 PID: 179 Comm: AiApp Not tainted 4.9.56 #53---------------------------------------------dump_stack(),栈信息指出了更详细的调用路径。
[ 2161.637947]
Call Trace:
[<802f63f2>] dump_stack+0x1e/0x3c
[<800f6cf4>] warn_alloc+0x100/0x148
[<800f709c>] __alloc_pages_nodemask+0x2bc/0xb5c
[<801120fe>] kmalloc_order+0x26/0x48
[<80112158>] kmalloc_order_trace+0x38/0x98
[<8012c5d8>] __kmalloc+0xf4/0x12c
[<8048ac78>] alloc_ep_req+0x5c/0x98
[<8048f232>] source_sink_recv+0x2a/0xe0
[<8048f35e>] usb_sourcesink_bulk_read+0x76/0x1c8
[<8048f770>] usb_sourcesink_read+0xfc/0x2c8
[<80134d58>] __vfs_read+0x30/0x108
[<80135c14>] vfs_read+0x94/0x128
[<80136d12>] SyS_read+0x52/0xd4
[<8004a246>] csky_systemcall+0x96/0xe0
[ 2161.689204] Mem-Info:--------------------------------------------------------------show_mem()
[ 2161.691518] active_anon:3268 inactive_anon:2 isolated_anon:0-----------------------所有node统计信息。
[ 2161.691518]  active_file:1271 inactive_file:89286 isolated_file:0
[ 2161.691518]  unevictable:0 dirty:343 writeback:0 unstable:0
[ 2161.691518]  slab_reclaimable:2019 slab_unreclaimable:644
[ 2161.691518]  mapped:4282 shmem:4 pagetables:59 bounce:0
[ 2161.691518]  free:62086 free_pcp:199 free_cma:60234--------------------------------------------------------------------------------------只有一个node,输出node 0统计信息。
[ 2161.724334] Node 0 active_anon:13072kB inactive_anon:8kB active_file:5084kB inactive_file:357144kB unevictable:0kB isolated(anon):0kB isolated(file):0kB mapped:17128kB dirty:1372kB writeback:0kB shmem:16kB writeback_tmp:0kB unstable:0kB pages_scanned:0 all_unreclaimable? no--------------------------------------------------------------------------------------输出Normal zone统计信息。
[ 2161.748626] Normal free:248344kB min:2444kB low:3052kB high:3660kB active_anon:13072kB inactive_anon:8kB active_file:5084kB inactive_file:357144kB unevictable:0kB writepending:1372kB present:1048572kB managed:734568kB mlocked:0kB slab_reclaimable:8076kB slab_unreclaimable:2576kB kernel_stack:608kB pagetables:236kB bounce:0kB free_pcp:796kB local_pcp:796kB free_cma:240936kB
[ 2161.781670] lowmem_reserve[]: 0 0 0---------------------------------------------------------------------------------------输出Normal zone下不同order的空闲情况,包括其中页面属性。
[ 2161.785225] Normal: 4*4kB (UEC) 3*8kB (EC) 3*16kB (UEC) 2*32kB (UE) 2*64kB (UE) 2*128kB (UE) 2*256kB (EC) 1*512kB (E) 3*1024kB (UEC) 3*2048kB (UEC) 58*4096kB (C) = 248344kB
90573 total pagecache pages---------------------------------------------------------------------------------------整个平台页面统计信息。
[ 2161.803526] 262143 pages RAM
[ 2161.806410] 0 pages HighMem/MovableOnly
[ 2161.810264] 78501 pages reserved
[ 2161.813509] 90112 pages cma reserved

从stack信息可以得知,alloc_ep_req()是分配内存的起点。

struct usb_request *alloc_ep_req(struct usb_ep *ep, size_t len)
{struct usb_request      *req;req = usb_ep_alloc_request(ep, GFP_ATOMIC);if (req) {req->length = usb_endpoint_dir_out(ep->desc) ?usb_ep_align(ep, len) : len;req->buf = kmalloc(req->length, GFP_ATOMIC);if (!req->buf) {usb_ep_free_request(ep, req);req = NULL;}}return req;
}

3.1 GFP_ATOMIC和__GFP_COMP:页面分配标志

从代码可知此时gfp_mask为GFP_ATOMIC,这种情况是不允许__GFP_DIRECT_RECLAIM页面直接回收的。

#define GFP_ATOMIC    (__GFP_HIGH|__GFP_ATOMIC|__GFP_KSWAPD_RECLAIM)
#define __GFP_HIGH    ((__force gfp_t)___GFP_HIGH)----------------------------------------------表示更高优先级。
#define __GFP_ATOMIC    ((__force gfp_t)___GFP_ATOMIC)------------------------------------------表示调用者不可以回收页面或者睡眠,并且是高优先级。典型的应用是中断处理中。
#define __GFP_KSWAPD_RECLAIM    ((__force gfp_t)___GFP_KSWAPD_RECLAIM) /* kswapd can wake */----在内存分配的时候,主动唤醒kswapd线程。
#define __GFP_COMP    ((__force gfp_t)___GFP_COMP)----------------------------------------------复合页标志位,表示将两个或多个页面看成一个页面。

GFP位掩码定义如下:

#define ___GFP_DMA        0x01u
#define ___GFP_HIGHMEM        0x02u
#define ___GFP_DMA32        0x04u
#define ___GFP_MOVABLE        0x08u
#define ___GFP_RECLAIMABLE    0x10u
#define ___GFP_HIGH        0x20u
#define ___GFP_IO        0x40u
#define ___GFP_FS        0x80u
#define ___GFP_COLD        0x100u
#define ___GFP_NOWARN        0x200u
#define ___GFP_REPEAT        0x400u
#define ___GFP_NOFAIL        0x800u
#define ___GFP_NORETRY        0x1000u
#define ___GFP_MEMALLOC        0x2000u
#define ___GFP_COMP        0x4000u
#define ___GFP_ZERO        0x8000u
#define ___GFP_NOMEMALLOC    0x10000u
#define ___GFP_HARDWALL        0x20000u
#define ___GFP_THISNODE        0x40000u
#define ___GFP_ATOMIC        0x80000u
#define ___GFP_ACCOUNT        0x100000u
#define ___GFP_NOTRACK        0x200000u
#define ___GFP_DIRECT_RECLAIM    0x400000u
#define ___GFP_OTHER_NODE    0x800000u
#define ___GFP_WRITE        0x1000000u
#define ___GFP_KSWAPD_RECLAIM    0x2000000u

3.2 gfp和migrate转换,进而alloc_flags:为什么不能使用CMA区域?

gfp_mask决定了申请页面的migratetype,然后在CMA存在的情况下根据migratetype决定是否可用CMA区域。

static inline unsigned int
gfp_to_alloc_flags(gfp_t gfp_mask)
{unsigned int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;/* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH);    alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH);------------------------------__GFP_HIGH到ALLOC_HIGH转换。if (gfp_mask & __GFP_ATOMIC) {
if (!(gfp_mask & __GFP_NOMEMALLOC))alloc_flags |= ALLOC_HARDER;
alloc_flags &= ~ALLOC_CPUSET;} else if (unlikely(rt_task(current)) && !in_interrupt())alloc_flags |= ALLOC_HARDER;#ifdef CONFIG_CMAif (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)---------------------------将gfp_mask转换到migratetype,判断是否是MIGRATE_MOVABLE。如果是,则可以在CMA区域中分配。也就是说必须gfp_flags中包含__GFP_MOVABLE才可以在CMA中分配。alloc_flags |= ALLOC_CMA;
#endifreturn alloc_flags;
}#define GFP_MOVABLE_MASK (__GFP_RECLAIMABLE|__GFP_MOVABLE)------------------------------___GFP_MOVABLE为0x08,___GFP_RECLAIMABLE为0x10。
#define GFP_MOVABLE_SHIFT 3static inline int gfpflags_to_migratetype(const gfp_t gfp_flags)
{VM_WARN_ON((gfp_flags & GFP_MOVABLE_MASK) == GFP_MOVABLE_MASK);BUILD_BUG_ON((1UL << GFP_MOVABLE_SHIFT) != ___GFP_MOVABLE);BUILD_BUG_ON((___GFP_MOVABLE >> GFP_MOVABLE_SHIFT) != MIGRATE_MOVABLE);if (unlikely(page_group_by_mobility_disabled))return MIGRATE_UNMOVABLE;/* Group based on mobility */return (gfp_flags & GFP_MOVABLE_MASK) >> GFP_MOVABLE_SHIFT;--------------------------这里面只会与__GFP_RECLAIMABLE|__GFP_MOVABLE,然后右移3bit,就将___GFP_MOVABLE转换到MIGRATE_MOVABLE,将__GFP_RECLAIMABLE转换到MIGRATE_RECLAIMABLE。
}

由此次申请的gfp_mask可知其中没有__GFP_MOVABLE,所以alloc_flags不会包括ALLOC_CMA。反之,如果要复用CMA进行内存申请,需要在gfp_mask中包括__GFP_MOVABLE。

从Normal区域空闲页面可以看出,有58个4MB空闲,但是属于CMA区域。所以申请不成功。

3.3 问题的根源

结合warn_alloc()和实例归纳如下:

1. 虽然存在很多空闲内存,但是alloc_ep_req()无法使用

alloc_ep_req()申请内存的gfp_mask为GFP_ATOMIC|__GFP_COMP。

由于不具备__GFP_MOVABLE,所以即使存在很多空闲4MB连续页面,也无法使用,因为这些4MB页面都是CMA的。

[ 2161.785225] Normal: 4*4kB (UEC) 3*8kB (EC) 3*16kB (UEC) 2*32kB (UE) 2*64kB (UE) 2*128kB (UE) 2*256kB (EC) 1*512kB (E) 3*1024kB (UEC) 3*2048kB (UEC) 58*4096kB (C) = 248344kB-----光4MB的CMA空闲块就达到了232MB(58*4096kB=237568kB),其余各order合计只有约10.5MB(10776kB)。

2. 为什么剩下的内存绝大部分是CMA?

从Normal区域空闲页面情况看,绝大部分都是CMA的。但是初始化的时候存在很多其他类型的页面。

通过cat /proc/pagetypeinfo查看前后对比,可以发现Movable类型的页面基本被申请完。

所以这里怀疑是内存泄漏,通过下面脚本跟踪MemFree。

while true; do cat /proc/meminfo | grep MemFree; sleep 10; done

发现MemFree在不停地下降,降到260M左右的时候出现warn_alloc()。

所以问题的根源在内存泄漏。

3. 如何降低内存碎片?

对于内存碎片,可以通过内存规整(compaction)来缓解。请参考《Linux内存管理 (16)内存规整》

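4. 调整/proc/sys/vm/min_free_kbytes

适当增大min_free_kbytes可以抬高各zone的min/low/high水位,使kswapd更早开始回收,为GFP_ATOMIC这类不能直接回收的分配多保留一些空闲内存;但它不能解决内存泄漏,对order:10这样的大块连续内存申请帮助也有限。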

转载于:https://www.cnblogs.com/arnoldlu/p/10691034.html

warn_alloc():page allocation failure问题分析相关推荐

  1. 一次Linux线上系统page allocation failure问题处理实战记录

    作者:arstercz 来源(阅读原文可直达):https://blog.arstercz.com/ 问题说明 近期一台主机报以下 kernel 警告信息: Apr 28 05:30:51 cztes ...

  2. Linux page allocation failure 的问题处理 - zone_reclaim_mode

    标签 PostgreSQL , Linux , page allocation failure , 内存 背景 Linux内核分配失败,现象: 内存使用一定量后,HANG. dmesg中可能会有类似这 ...

  3. kernel: swapper: page allocation failure. order:1, mode:0x20

    场景:领导电话通知,我们的主站宕机了,到家后从另外一台机器上ssh一直处于等待状态,开始怀疑机器的负载比较高, 后查看监控机器,发现网卡.cpu.nginx连接数.....通通都没有数据了,显然不是负 ...

  4. Java中GC (Allocation Failure)日志分析实战

    概述 博主在最近使用spring batch的过程当中遇到了内存容量耗尽程序崩溃的问题,于是决定将此次的内存问题分析通过本篇博客记录下来. 在分析gc日实例志之前,我们先通过一条<深入理解jav ...

  5. 频繁GC (Allocation Failure)及young gc时间过长分析

    序 本文主要分析一个频繁GC (Allocation Failure)及young gc时间过长的case. 症状 gc throughput percent逐步下降,从一般的99.96%逐步下降,跌 ...

  6. MIT 6.S081 lab 5:lazy page allocation

    1 Lab lab 5就是去实现xv6 book 4.6中写的 Lazy page allocation 有个问题:page fault的trap是如何出现的? 1.1 Eliminate alloc ...

  7. GC(Allocation Failure)

    日前查看某个程序的日志,发现一直在报GC相关的信息,不确定这样的信息是代表正确还是不正确,所以正好借此机会再复习下GC相关的内容: 以其中一行为例来解读下日志信息: [GC (Allocation F ...

  8. CMS 触发GC(Allocation Failure)解析之标梵信息

    针对GC中发生的"Allocation Failure",源码描述为:Allocation Failure is a cause of GC cycle to kick in. & ...

  9. GC (Allocation Failure) 那些事

    GC (Allocation Failure) 那些事 平常写Spark程序,经常看到 GC(Allocation Failure) 这个日志,大概查了查意思是是jvm在执行垃圾回收,一般情况下不影响 ...

最新文章

  1. Java 过一下基础
  2. 计算机密码行业专题研究:网络安全最大弹性领域
  3. 二元偏导数存在的条件_高等数学入门——高阶偏导数的概念和计算
  4. 多态Poly中的向上/下转型 Upcast/Downcast
  5. 笨办法学 Python · 续 练习 6:`find`
  6. jwt判断token是否过期_4spring-security5整合jwt做登录、权限验证,全网最全!!!可用...
  7. 是时候了解原码、反码和补码
  8. mybatis两个内置参数
  9. 金针工具箱5.0安装版(多功能软件快捷工具)hh852作品
  10. 测试面试/笔试题 大集合
  11. 在计算机检索中 有哪些方法能缩小,使用“或OR”运算将同义词连接起来可以缩小检索。()...
  12. TMS320C64x DSP L1 L2 Cache架构(1)——C64x Cache Architecture
  13. centos7 关闭自动yum更新
  14. 单链表的逆置(递归和非递归)
  15. 小米10获取root权限_2020年小米红米Miflash新版刷机救砖恢复去除ROOT权限教程
  16. Python写的我的世界源码
  17. 15种方法活力一整天
  18. Android 悬浮窗日志工具
  19. Android安全攻防战,反编译与混淆技术完全解析(下)
  20. html最多显示两行,css 实现两行或多行文本溢出显示省略号(...)

热门文章

  1. 东南亚移动支付兼并洗牌
  2. 怎么在html中居中添加视频,HTML中的居中方法
  3. PTA数据结构练习题——旅游规划
  4. WEB前端学习-合并单元格
  5. sqlserver broker远端端点证书认证
  6. oracle查询最近十天日期,ORACLE中距离某日期最近的记录的查询
  7. java.sql.SQLException: Access denied for user ‘‘@‘localhost‘ (using password: YES)问题
  8. 51单片机——秒表(定时器实现)
  9. 如何使用码云高校版布置小组作业? | 码云高校版最佳实践
  10. android textview 用html设置字体