hugepages初始化入口函数hugepage_info_init。hugepages目录sys_dir_path指向"/sys/kernel/mm/hugepages",目前此目录下包含两个子目录,分别对应1G和2M页面大小的hugepages:

$ ls /sys/kernel/mm/hugepages -l
total 0
drwxr-xr-x 2 root root 0 7月   7 13:51 hugepages-1048576kB
drwxr-xr-x 2 root root 0 7月   7 15:08 hugepages-2048kB

遍历sys_dir_path的子目录,解析页面大小保存到全局hugepage_info数组中。

/*
 * Scan sys_dir_path for entries named "hugepages-<size>" and record one
 * struct hugepage_info per page size in internal_conf->hugepage_info[].
 *
 * For each size, get_hugepage_dir() is asked for a mount point:
 *  - without a mount point the size is skipped, except that builds with
 *    MAP_HUGE_SHIFT keep it when internal_conf->in_memory is set;
 *  - with a mount point the directory is opened, locked with
 *    flock(LOCK_EX), and its existing files are either counted for reuse
 *    (inspect_hugedir) or removed (clear_hugedir), depending on
 *    internal_conf->hugepage_file.unlink_existing.
 *
 * The scan is aborted, and -1 returned, when locking or inspecting a
 * directory fails, or when another "hugepages-" entry is met after
 * MAX_HUGEPAGE_SIZES sizes have already been recorded.
 *
 * Returns 0 if at least one recorded size has a non-zero page count summed
 * over all NUMA nodes, -1 otherwise.
 *
 * NOTE(review): this excerpt omits the declarations of dir and dirent, and
 * the function body continues in the code fragments further down.
 */
static int
hugepage_info_init(void)
{   const char dirent_start_text[] = "hugepages-";const size_t dirent_start_len = sizeof(dirent_start_text) - 1;unsigned int i, num_sizes = 0;struct internal_config *internal_conf = eal_get_internal_configuration();dir = opendir(sys_dir_path);for (dirent = readdir(dir); dirent != NULL; dirent = readdir(dir)) {struct hugepage_info *hpi;if (strncmp(dirent->d_name, dirent_start_text,dirent_start_len) != 0)continue;if (num_sizes >= MAX_HUGEPAGE_SIZES) break;hpi = &internal_conf->hugepage_info[num_sizes];hpi->hugepage_sz = rte_str_to_size(&dirent->d_name[dirent_start_len]);

检查当前页面大小的hugepage是否已经进行了mount操作,将mount目录保存到hugedir中。如果还没有执行mount,使用get_num_hugepages获取可用的页面数量。

        /* first, check if we have a mountpoint */if (get_hugepage_dir(hpi->hugepage_sz,hpi->hugedir, sizeof(hpi->hugedir)) < 0) {uint32_t num_pages;num_pages = get_num_hugepages(dirent->d_name,hpi->hugepage_sz, 0);if (num_pages > 0)RTE_LOG(NOTICE, EAL,"%" PRIu32 " hugepages of size ""%" PRIu64 " reserved, but no mounted ""hugetlbfs found for that size\n",num_pages, hpi->hugepage_sz);/* if we have kernel support for reserving hugepages* through mmap, and we're in in-memory mode, treat this* page size as valid. we cannot be in legacy mode at* this point because we've checked this earlier in the* init process.*/
#ifdef MAP_HUGE_SHIFTif (internal_conf->in_memory) {RTE_LOG(DEBUG, EAL, "In-memory mode enabled, ""hugepages of size %" PRIu64 " bytes ""will be allocated anonymously\n",hpi->hugepage_sz);calc_num_pages(hpi, dirent, 0);num_sizes++;}
#endifcontinue;}

否则,如果页面已经进行了mount操作,锁定mount的挂载目录,由inspect_hugedir检查目录中可重用的空间大小,根据页面大小,获得可重用的页面数量。

        /* try to obtain a writelock */hpi->lock_descriptor = open(hpi->hugedir, O_RDONLY);/* if blocking lock failed */if (flock(hpi->lock_descriptor, LOCK_EX) == -1) {RTE_LOG(CRIT, EAL,"Failed to lock hugepage directory!\n");break;}/* Check for existing hugepage files and either remove them* or count how many of them can be reused.*/reusable_pages = 0;if (!internal_conf->hugepage_file.unlink_existing) {reusable_bytes = 0;if (inspect_hugedir(hpi->hugedir, &reusable_bytes) < 0)break;RTE_ASSERT(reusable_bytes % hpi->hugepage_sz == 0);reusable_pages = reusable_bytes / hpi->hugepage_sz;} else if (clear_hugedir(hpi->hugedir) < 0) {break;}calc_num_pages(hpi, dirent, reusable_pages);num_sizes++;}closedir(dir);

变量num_hugepage_sizes记录有多少种可用的页面大小,qsort按照页面大小将hugepage_info进行排序。最后,检查是否有可用的页面(num_pages大于0)。

    /* something went wrong, and we broke from the for loop above */if (dirent != NULL)return -1;internal_conf->num_hugepage_sizes = num_sizes;/* sort the page directory entries by size, largest to smallest */qsort(&internal_conf->hugepage_info[0], num_sizes,sizeof(internal_conf->hugepage_info[0]), compare_hpi);/* now we have all info, check we have at least one valid size */for (i = 0; i < num_sizes; i++) {/* pages may no longer all be on socket 0, so check all */unsigned int j, num_pages = 0;struct hugepage_info *hpi = &internal_conf->hugepage_info[i];for (j = 0; j < RTE_MAX_NUMA_NODES; j++)num_pages += hpi->num_pages[j];if (num_pages > 0)return 0;}/* no valid hugepage mounts available, return error */return -1;

获取hugetlbfs挂载点

查看PROC文件mounts可看到所有挂载的文件系统,如下。

$ cat /proc/mounts | grep hugetlbfs
hugetlbfs /dev/hugepages hugetlbfs rw,relatime,pagesize=2M 0 0

文件mounts中内容的格式如proc_mount_fieldnames中的定义,分隔符为空格。

/*
 * Find the hugetlbfs directory to use for pages of size hugepage_sz by
 * scanning /proc/mounts, and copy it into hugedir (bounded by len through
 * strlcpy).
 *
 * Only lines whose FSTYPE field starts with "hugetlbfs" are considered. A
 * mount whose options contain no "pagesize=" is compared against the default
 * size from get_default_hp_size(), which is cached in a function-local
 * static. When internal_conf->hugepage_dir is set, only mount points that
 * are a string prefix of it are accepted, the longest one is kept, and
 * hugepage_dir itself (not the mount point) is what gets returned.
 *
 * Returns 0 when a directory was found, -1 otherwise, including the case
 * where the configured hugepage_dir cannot be stat()ed. Failure to open
 * /proc/mounts ends in rte_panic().
 *
 * NOTE(review): this excerpt omits the declarations of st and buf, and the
 * function body continues in the code fragments further down.
 */
static int
get_hugepage_dir(uint64_t hugepage_sz, char *hugedir, int len)
{enum proc_mount_fieldnames {DEVICE = 0,MOUNTPT,FSTYPE,OPTIONS,_FIELDNAME_MAX};static uint64_t default_size = 0;const char proc_mounts[] = "/proc/mounts";const char hugetlbfs_str[] = "hugetlbfs";const size_t htlbfs_str_len = sizeof(hugetlbfs_str) - 1;const char pagesize_opt[] = "pagesize=";const size_t pagesize_opt_len = sizeof(pagesize_opt) - 1;const char split_tok = ' ';char *splitstr[_FIELDNAME_MAX];char found[PATH_MAX] = "";const struct internal_config *internal_conf =eal_get_internal_configuration();/* If the specified dir doesn't exist, we can't match it.*/if (internal_conf->hugepage_dir != NULL &&stat(internal_conf->hugepage_dir, &st) != 0) {return -1;}

默认的hugepage页面大小可在/proc/meminfo中查看(get_default_hp_size)。打开/proc/mounts文件,逐行遍历,寻找文件系统类型(FSTYPE)字段等于hugetlbfs的行。在OPTIONS字段中查找"pagesize="字符串。

    FILE *fd = fopen(proc_mounts, "r");if (fd == NULL)rte_panic("Cannot open %s\n", proc_mounts);if (default_size == 0)default_size = get_default_hp_size();while (fgets(buf, sizeof(buf), fd)){const char *pagesz_str;if (rte_strsplit(buf, sizeof(buf), splitstr, _FIELDNAME_MAX,split_tok) != _FIELDNAME_MAX) {RTE_LOG(ERR, EAL, "Error parsing %s\n", proc_mounts);break; /* return NULL */}if (strncmp(splitstr[FSTYPE], hugetlbfs_str, htlbfs_str_len) != 0)continue;pagesz_str = strstr(splitstr[OPTIONS], pagesize_opt);

如果没有找到"pagesize=",则使用默认的页面大小,检查其是否与参数hugepage_sz相等;否则,检查pagesize指定的页面大小是否与参数hugepage_sz相等。相等的话,表明找到了匹配的挂载项。

        /* if no explicit page size, the default page size is compared */if (pagesz_str == NULL) {if (hugepage_sz != default_size)continue;}/* there is an explicit page size, so check it */else {uint64_t pagesz = rte_str_to_size(&pagesz_str[pagesize_opt_len]);if (pagesz != hugepage_sz)continue;}

如果DPDK没有指定--huge-dir参数,将当前遍历行中的MOUNTPT挂载点字段拷贝到found中,并结束遍历。否则,如果指定了--huge-dir,而当前遍历行的挂载点不是所指定目录的前缀,继续遍历下一行。

最后,如果存在两个满足条件的挂载点,使用最长匹配的那一个。

        /* If no --huge-dir option has been given, we're done.*/if (internal_conf->hugepage_dir == NULL) {strlcpy(found, splitstr[MOUNTPT], len);break;}/* Ignore any mount that doesn't contain the --huge-dir* directory.*/if (strncmp(internal_conf->hugepage_dir, splitstr[MOUNTPT],strlen(splitstr[MOUNTPT])) != 0) {continue;}/* We found a match, but only prefer it if it's a longer match* (so /mnt/1 is preferred over /mnt for matching /mnt/1/2)).*/if (strlen(splitstr[MOUNTPT]) > strlen(found))strlcpy(found, splitstr[MOUNTPT], len);} /* end while fgets */fclose(fd);

在找到匹配挂载点的前提下,优先使用--huge-dir指定的目录,其次使用以上找到的挂载点目录。

    if (found[0] != '\0') {/* If needed, return the requested dir, not the mount point. */strlcpy(hugedir, internal_conf->hugepage_dir != NULL ?internal_conf->hugepage_dir : found, len);return 0;}return -1;

获取页面数量(传统方式)

根据以下2M大小页面配置,获取可用的页面数量。

$ cat /sys/kernel/mm/hugepages/hugepages-2048kB/free_hugepages
1003
$ cat /sys/kernel/mm/hugepages/hugepages-2048kB/nr_overcommit_hugepages
0
$ cat /sys/kernel/mm/hugepages/hugepages-2048kB/resv_hugepages
0
$ cat /sys/kernel/mm/hugepages/hugepages-2048kB/surplus_hugepages
0

开始假定free_hugepages中的页面都是可用的。首先,检查保留页面数量resv_hugepages,减去保留页面数量,即为可用的页面。

/*
 * Compute how many hugepages are usable for the page size whose sysfs
 * directory name is subdir, from the values read with get_hp_sysfs_value().
 *
 * The result is
 *   max(free_hugepages - resv_hugepages, 0)
 *   + max(nr_overcommit_hugepages - surplus_hugepages, 0)
 *   + reusable_pages
 * with each addition saturating to UINT32_MAX on wrap-around and the final
 * value capped at UINT32_MAX.
 *
 * Returns 0 when resv_hugepages or free_hugepages cannot be read. Unreadable
 * overcommit or surplus values are treated as 0. sz is used only to print
 * the page size (in kB) in the warning message.
 *
 * NOTE(review): the warning is emitted when the free and overcommit counts
 * are both 0 and reusable_pages is non-zero; confirm that this is the
 * intended condition rather than reusable_pages == 0.
 * The function body continues in the code fragments further down.
 */
static uint32_t
get_num_hugepages(const char *subdir, size_t sz, unsigned int reusable_pages)
{   unsigned long resv_pages, num_pages, over_pages, surplus_pages;const char *nr_hp_file = "free_hugepages";const char *nr_rsvd_file = "resv_hugepages";const char *nr_over_file = "nr_overcommit_hugepages";const char *nr_splus_file = "surplus_hugepages";/* first, check how many reserved pages kernel reports */if (get_hp_sysfs_value(subdir, nr_rsvd_file, &resv_pages) < 0)return 0;if (get_hp_sysfs_value(subdir, nr_hp_file, &num_pages) < 0)return 0;if (get_hp_sysfs_value(subdir, nr_over_file, &over_pages) < 0)over_pages = 0;if (get_hp_sysfs_value(subdir, nr_splus_file, &surplus_pages) < 0)surplus_pages = 0;/* adjust num_pages */if (num_pages >= resv_pages)num_pages -= resv_pages;else if (resv_pages)num_pages = 0;

可超限的页面数量减去过剩页面数量,等于可用的超限页面数量。

    if (over_pages >= surplus_pages)over_pages -= surplus_pages;elseover_pages = 0;if (num_pages == 0 && over_pages == 0 && reusable_pages)RTE_LOG(WARNING, EAL, "No available %zu kB hugepages reported\n",sz >> 10);

可用页面数量加上可超限使用的页面数量,如果结果溢出,使用uint32的最大值。最后,加上可重用的页面数量,即为最终的可用页面数量,不超过无符号32位的最大值。

    num_pages += over_pages;if (num_pages < over_pages) /* overflow */num_pages = UINT32_MAX;num_pages += reusable_pages;if (num_pages < reusable_pages) /* overflow */num_pages = UINT32_MAX;/* we want to return a uint32_t and more than this looks suspicious* anyway ... */if (num_pages > UINT32_MAX)num_pages = UINT32_MAX;return num_pages;

计算可重用页面

在hugepage已经mount的情况下,遍历挂载点目录(walk_hugedir),由回调函数inspect_hugedir_cb计算可重用文件所占用的空间。

/*
 * walk_hugedir() callback: add the size of the file referred to by
 * whd->file_fd to the uint64_t counter that whd->user_data points to.
 * If fstat() fails the file is left out of the total and a debug message
 * is logged.
 */
static void
inspect_hugedir_cb(const struct walk_hugedir_data *whd)
{
	uint64_t *total_size = whd->user_data;
	struct stat st;

	if (fstat(whd->file_fd, &st) < 0) {
		/* terminate the line like every other log message in this file */
		RTE_LOG(DEBUG, EAL, "%s(): stat(\"%s\") failed: %s\n",
			__func__, whd->file_name, strerror(errno));
	} else {
		(*total_size) += st.st_size;
	}
}

/*
 * Count the total size in bytes of all files in the directory
 * not mapped by other DPDK process.
 *
 * The sizes are added to *total_size, so the caller is expected to
 * initialise it. Returns the result of walk_hugedir().
 */
static int
inspect_hugedir(const char *hugedir, uint64_t *total_size)
{
	return walk_hugedir(hugedir, inspect_hugedir_cb, total_size);
}

遍历挂载点目录,查找符合"map_"命名规则的文件,如果成功锁定此文件,即认为其为可重用的文件,调用回调cb计算其空间大小。

/*
 * Search the hugepage directory for whatever hugepage files there are.
 * Check if the file is in use by another DPDK process.
 * If not, execute a callback on it.
 *
 * Directory entries that fnmatch() reports as not matching "*map_*" are
 * skipped, as are entries that cannot be opened read-only relative to the
 * directory descriptor. A file counts as unused when a non-blocking
 * exclusive flock() on it succeeds; cb is then called with a
 * struct walk_hugedir_data carrying the directory fd, the file fd, the
 * entry name and user_data. The file descriptor is closed after each entry,
 * whether or not the callback ran.
 *
 * NOTE(review): this excerpt omits the declarations of dir, dir_fd, dirent,
 * fd and lck_result, shows no check of the opendir() result, and ends before
 * the loop and function are closed, so the return value is not visible here.
 */
static int
walk_hugedir(const char *hugedir, walk_hugedir_t *cb, void *user_data)
{const char filter[] = "*map_*"; /* matches hugepage files */dir = opendir(hugedir);dir_fd = dirfd(dir);dirent = readdir(dir);while (dirent != NULL) {/* skip files that don't match the hugepage pattern */if (fnmatch(filter, dirent->d_name, 0) > 0) {dirent = readdir(dir);continue;}/* try and lock the file */fd = openat(dir_fd, dirent->d_name, O_RDONLY);/* skip to next file */if (fd == -1) {dirent = readdir(dir);continue;}/* non-blocking lock */lck_result = flock(fd, LOCK_EX | LOCK_NB);/* if lock succeeds, execute callback */if (lck_result != -1)cb(&(struct walk_hugedir_data){.dir_fd = dir_fd,.file_fd = fd,.file_name = dirent->d_name,.user_data = user_data,});close (fd);dirent = readdir(dir);

删除不再使用页面

遍历mount挂载点目录,对于不再使用的文件,由clear_hugedir_cb回调函数执行删除操作。

/*
 * walk_hugedir() callback: unlink the file named whd->file_name relative to
 * the directory descriptor whd->dir_fd. The result of unlinkat() is not
 * checked.
 */
static void
clear_hugedir_cb(const struct walk_hugedir_data *whd)
{
	unlinkat(whd->dir_fd, whd->file_name, 0);
}

/*
 * Remove hugepage files not used by other DPDK processes from a directory.
 *
 * No user data is passed to the callback. Returns the result of
 * walk_hugedir().
 */
static int
clear_hugedir(const char *hugedir)
{
	return walk_hugedir(hugedir, clear_hugedir_cb, NULL);
}

可用页面计算

/*
 * Fill hpi->num_pages[] for the page size whose sysfs directory entry is
 * dirent.
 *
 * Unless internal_conf->legacy_mem is set or reusable_pages is non-zero,
 * per-socket counts are read with get_num_hugepages_on_node() and stored
 * under each socket id. If that produces no pages in total (or was not
 * attempted), get_num_hugepages() is used instead and the whole count is
 * stored in num_pages[0]; when RTE_ARCH_64 is not defined that count is
 * limited to 1 GB worth of pages.
 *
 * NOTE(review): this excerpt omits the declaration of i, and the function
 * body continues in the code fragments further down.
 */
static void
calc_num_pages(struct hugepage_info *hpi, struct dirent *dirent,unsigned int reusable_pages)
{uint64_t total_pages = 0;const struct internal_config *internal_conf =eal_get_internal_configuration();/** first, try to put all hugepages into relevant sockets, but* if first attempts fails, fall back to collecting all pages* in one socket and sorting them later*/total_pages = 0;

首先,使用numa节点获取可用的页面数量。total_pages保存所有节点页面数量的总和。

    /** We also don't want to do this for legacy init.* When there are hugepage files to reuse it is unknown* what NUMA node the pages are on.* This could be determined by mapping,* but it is precisely what hugepage file reuse is trying to avoid.*/if (!internal_conf->legacy_mem && reusable_pages == 0)for (i = 0; i < rte_socket_count(); i++) {int socket = rte_socket_id_by_idx(i);unsigned int num_pages =get_num_hugepages_on_node(dirent->d_name, socket, hpi->hugepage_sz);hpi->num_pages[socket] = num_pages;total_pages += num_pages;}

其次,如果以上没有得到任何页面,以下采用传统的方式获取可用页面,此时,所有可用的页面数量保存在了num_pages数组的0索引位置,之后根据NUMA节点再进行分配。

    /* we failed to sort memory from the get go, so fall* back to old way*/if (total_pages == 0) {hpi->num_pages[0] = get_num_hugepages(dirent->d_name,hpi->hugepage_sz, reusable_pages);#ifndef RTE_ARCH_64/* for 32-bit systems, limit number of hugepages to* 1GB per page size */hpi->num_pages[0] = RTE_MIN(hpi->num_pages[0],RTE_PGSIZE_1G / hpi->hugepage_sz);
#endif

计算NUMA可用页面(新方式)

目录sys_pages_numa_dir_path[]定义为"/sys/devices/system/node",例如,对于node0有如下的目录结构:

$ ls -l /sys/devices/system/node/node0/hugepages/
total 0
drwxr-xr-x 2 root root 0 7月   7 13:51 hugepages-1048576kB
drwxr-xr-x 2 root root 0 7月   7 13:51 hugepages-2048kB
$
$ cat /sys/devices/system/node/node0/hugepages/hugepages-2048kB/free_hugepages
1003
$ cat /sys/devices/system/node/node0/hugepages/hugepages-2048kB/nr_hugepages
1024
$ cat /sys/devices/system/node/node0/hugepages/hugepages-2048kB/surplus_hugepages
0

对应于socketdir目录,根据页面大小子目录(subdir),获取free_hugepages的大小。

/*
 * Read the free_hugepages value for page-size directory subdir under
 * "<sys_pages_numa_dir_path>/node<socket>/hugepages".
 *
 * Returns 0 when the node's hugepages directory cannot be opened or the
 * value cannot be parsed by eal_parse_sysfs_value(); otherwise the value
 * read, capped at UINT32_MAX. A warning is logged when the value is 0.
 * sz is used only to print the page size (in kB) in that warning.
 *
 * NOTE(review): this excerpt omits the declarations of socketpath,
 * socketdir and path, and the function body continues in the code fragment
 * further down.
 */
static uint32_t
get_num_hugepages_on_node(const char *subdir, unsigned int socket, size_t sz)
{unsigned long num_pages = 0;const char *nr_hp_file = "free_hugepages";snprintf(socketpath, sizeof(socketpath), "%s/node%u/hugepages",sys_pages_numa_dir_path, socket);socketdir = opendir(socketpath);if (socketdir) {/* Keep calm and carry on */closedir(socketdir);} else {/* Can't find socket dir, so ignore it */return 0;}snprintf(path, sizeof(path), "%s/%s/%s",socketpath, subdir, nr_hp_file);if (eal_parse_sysfs_value(path, &num_pages) < 0)return 0;

确保可用页面的数量不大于uint32表示的最大值。

    if (num_pages == 0)RTE_LOG(WARNING, EAL, "No free %zu kB hugepages reported on node %u\n",sz >> 10, socket);/* we want to return a uint32_t and more than this looks suspicious* anyway ...*/if (num_pages > UINT32_MAX)num_pages = UINT32_MAX;return num_pages;

DPDK初始化hugepages相关推荐

  1. Linux平台上DPDK入门指南

    目录 1. 简介 1.1. 文档地图 2. 系统要求 2.1. X86 上预先设置 BIOS 2.2. 编译DPDK 2.3. 运行DPDK应用程序 2.3.1. 系统软件 2.3.2. 在 Linu ...

  2. dpdk:vfio-pci模式下iommu(N+Y)-Huge配置-numa配置

    DPDK大内存页Hugepages配置: Hugepages是DPDK用于提升性能的重要手段. 通过使用Hugepages,可以降低内存页数,减少TLB页表数量,增加TLB hit率. 在Linux上 ...

  3. vpp与dpdk的关系

    vpp与dpdk的关系 dpdk作为vpp的一个插件,主要用来从网卡收发包. vpp使用如下命令参数初始化dpdk eal. EAL init args: -c a -n 1 --in-memory ...

  4. linux报文高速捕获技术对比--napi/libpcap/afpacket/pfring/dpdk/xdp

    1. 传统linux网络协议栈流程和性能分析 Linux网络协议栈是处理网络数据包的典型系统,它包含了从物理层直到应用层的全过程. 数据包到达网卡设备. 网卡设备依据配置进行DMA操作.(第1次拷贝: ...

  5. DPDK内存篇(二): 深入学习 IOVA

    Anatoly Burakov:英特尔软件工程师,目前在维护DPDK中的VFIO和内存子系统. "文章转载自DPDK与SPDK开源社区公众号" 目录 引言 环境抽象层(EAL)参数 ...

  6. FD.io VPP:vlib buffer pool(vlib_buffer) 内存初始化

    Table of Contents vlib buffer创建过程 vlib_buffer相关内存初始化 1.函数一开始就查询numa的个数 2.遍历numa节点来初始化 3.查询系统大页大小. 4. ...

  7. Linux网络报文捕获/抓包技术对比:napi、libpcap、afpacket、PF_RING、PACKET_MMAP、DPDK、XDP(eXpress Data Path)

    Table of Contents 1.传统linux网络协议栈流程和性能分析 协议栈的主要问题 针对单个数据包级别的资源分配和释放 流量的串行访问 从驱动到用户态的数据拷贝 内核到用户空间的上下文切 ...

  8. DPDK PMD( Poll Mode Driver)轮询模式驱动程序

    DPDK PMD( Poll Mode Driver)轮询模式驱动程序 目录 Mellanox PMDs 轮询模式驱动程序 要求和假设 设计原则 逻辑核心,内存和NIC队列关系 设备标识,所有权和配置 ...

  9. DPU网络开发SDK——DPDK(九)

    rte_eal_remote_launch() 在过去的几篇文章中,我们重点分析了DPDK初始化过程rte_eal_init()的主要流程,了解了其内存分配,primary和secondary之间如何 ...

  10. DPDK 学习笔记(一)

    本文为笔者阅读其他网络资料结合自身所感,如有冒犯,请联系笔者. 目录 1.概述 2.dpdk的突破 2.1 UIO (用户空间的 I/O 技术) 2.2 内存池技术 2.3 大页内存管理 2.4 无锁 ...

最新文章

  1. python读取csv文件并修改指定内容-pandas读取CSV文件时查看修改各列的数据类型格式...
  2. ubuntu下配置eclipse环境
  3. product sales data determination in Opportunity item
  4. DOxygen for C++使用说明——Markdown支持
  5. PyTorch 1.0 中文官方教程:Autograd:自动求导
  6. 八皇后(N皇后)问题
  7. 【免费毕设】ASP.NET基于.NET的城市公交查询系统的实现与设计(源代码+lunwen)
  8. 计算机网络物理防护,计算机网络的物理安全
  9. 2018深圳杯数学建模A题
  10. java 本地文件上传到服务器,java本地文件上传到远程服务器
  11. 利用服务器在家远程登录办公室电脑
  12. 记一次公司被勒索病毒攻击事迹,上上下下咬牙切齿
  13. java的datasource_JAVA创建DataSource
  14. BMFont 制作字体时,无法导入图片
  15. 用计算机写作文的好处,第7课 用计算机写作文教案
  16. 洛谷 P1357 花园
  17. 采用keras深度学习框架搭建卷积神经网络模型实现垃圾分类,基于树莓派上进行实时视频流的垃圾识别源代码
  18. Adobe带你解锁办会新技能
  19. 把我给另外一个朋友的炒股劝告发给你一遍,希望你可以得到帮助!
  20. 小妹想学习BI,不知从何下手

热门文章

  1. Kaggle:入门赛Tatanic(泰坦尼克号)84.21%带你冲进前2%
  2. 红魔5S游戏手机与努比亚watch闪耀ChinaJoy
  3. 团队价值观五个字_一个优秀的团队应该具有的价值观
  4. 机器学习:决策树的划分依据
  5. 【开源多媒体编辑软件工程】VirtualDub
  6. 推荐10个免费的html,10个免费的HTML在线编辑工具
  7. 抖音SEO,抖音排名优化,抖音排名规则
  8. 服务器自动压缩access数据库代码,Access数据库体积过大问题的解决方法
  9. 薅羊毛算副业吗?薅羊毛到底是怎么赚钱的?
  10. 苹果safari浏览器video视频无法播放