DPDK初始化hugepages
hugepages初始化入口函数hugepage_info_init。hugepages目录sys_dir_path指向"/sys/kernel/mm/hugepages",目前此目录下包含两个子目录,分别对应1G和2M页面大小的hugepages:
$ ls /sys/kernel/mm/hugepages -l
total 0
drwxr-xr-x 2 root root 0 7月 7 13:51 hugepages-1048576kB
drwxr-xr-x 2 root root 0 7月 7 15:08 hugepages-2048kB
遍历sys_dir_path的子目录,解析页面大小保存到全局hugepage_info数组中。
static int
hugepage_info_init(void)
{ const char dirent_start_text[] = "hugepages-";const size_t dirent_start_len = sizeof(dirent_start_text) - 1;unsigned int i, num_sizes = 0;struct internal_config *internal_conf = eal_get_internal_configuration();dir = opendir(sys_dir_path);for (dirent = readdir(dir); dirent != NULL; dirent = readdir(dir)) {struct hugepage_info *hpi;if (strncmp(dirent->d_name, dirent_start_text,dirent_start_len) != 0)continue;if (num_sizes >= MAX_HUGEPAGE_SIZES) break;hpi = &internal_conf->hugepage_info[num_sizes];hpi->hugepage_sz = rte_str_to_size(&dirent->d_name[dirent_start_len]);
检查当前页面大小的hugepage是否已经进行了mount操作,将mount目录保存到hugedir中。如果还没有执行mount,使用get_num_hugepages获取可用的页面数量。
/* first, check if we have a mountpoint */if (get_hugepage_dir(hpi->hugepage_sz,hpi->hugedir, sizeof(hpi->hugedir)) < 0) {uint32_t num_pages;num_pages = get_num_hugepages(dirent->d_name,hpi->hugepage_sz, 0);if (num_pages > 0)RTE_LOG(NOTICE, EAL,"%" PRIu32 " hugepages of size ""%" PRIu64 " reserved, but no mounted ""hugetlbfs found for that size\n",num_pages, hpi->hugepage_sz);/* if we have kernel support for reserving hugepages* through mmap, and we're in in-memory mode, treat this* page size as valid. we cannot be in legacy mode at* this point because we've checked this earlier in the* init process.*/
#ifdef MAP_HUGE_SHIFTif (internal_conf->in_memory) {RTE_LOG(DEBUG, EAL, "In-memory mode enabled, ""hugepages of size %" PRIu64 " bytes ""will be allocated anonymously\n",hpi->hugepage_sz);calc_num_pages(hpi, dirent, 0);num_sizes++;}
#endifcontinue;}
否则,如果页面已经进行了mount操作,锁定mount的挂载目录,由inspect_hugedir检查目录中可重用的空间大小,根据页面大小,获得可重用的页面数量。
/* try to obtain a writelock */hpi->lock_descriptor = open(hpi->hugedir, O_RDONLY);/* if blocking lock failed */if (flock(hpi->lock_descriptor, LOCK_EX) == -1) {RTE_LOG(CRIT, EAL,"Failed to lock hugepage directory!\n");break;}/* Check for existing hugepage files and either remove them* or count how many of them can be reused.*/reusable_pages = 0;if (!internal_conf->hugepage_file.unlink_existing) {reusable_bytes = 0;if (inspect_hugedir(hpi->hugedir, &reusable_bytes) < 0)break;RTE_ASSERT(reusable_bytes % hpi->hugepage_sz == 0);reusable_pages = reusable_bytes / hpi->hugepage_sz;} else if (clear_hugedir(hpi->hugedir) < 0) {break;}calc_num_pages(hpi, dirent, reusable_pages);num_sizes++;}closedir(dir);
变量num_hugepage_sizes记录有多少种可用的页面大小,qsort按照页面大小将hugepage_info进行排序。最后,检查是否有可用的页面(num_pages大于0)。
/* something went wrong, and we broke from the for loop above */if (dirent != NULL)return -1;internal_conf->num_hugepage_sizes = num_sizes;/* sort the page directory entries by size, largest to smallest */qsort(&internal_conf->hugepage_info[0], num_sizes,sizeof(internal_conf->hugepage_info[0]), compare_hpi);/* now we have all info, check we have at least one valid size */for (i = 0; i < num_sizes; i++) {/* pages may no longer all be on socket 0, so check all */unsigned int j, num_pages = 0;struct hugepage_info *hpi = &internal_conf->hugepage_info[i];for (j = 0; j < RTE_MAX_NUMA_NODES; j++)num_pages += hpi->num_pages[j];if (num_pages > 0)return 0;}/* no valid hugepage mounts available, return error */return -1;
获取hugetlbfs挂载点
查看PROC文件mounts可看到所有挂载的文件系统,如下。
$ cat /proc/mounts | grep hugetlbfs
hugetlbfs /dev/hugepages hugetlbfs rw,relatime,pagesize=2M 0 0
文件mounts中内容的格式如proc_mount_fieldnames中的定义,分隔符为空格。
static int
get_hugepage_dir(uint64_t hugepage_sz, char *hugedir, int len)
{enum proc_mount_fieldnames {DEVICE = 0,MOUNTPT,FSTYPE,OPTIONS,_FIELDNAME_MAX};static uint64_t default_size = 0;const char proc_mounts[] = "/proc/mounts";const char hugetlbfs_str[] = "hugetlbfs";const size_t htlbfs_str_len = sizeof(hugetlbfs_str) - 1;const char pagesize_opt[] = "pagesize=";const size_t pagesize_opt_len = sizeof(pagesize_opt) - 1;const char split_tok = ' ';char *splitstr[_FIELDNAME_MAX];char found[PATH_MAX] = "";const struct internal_config *internal_conf =eal_get_internal_configuration();/* If the specified dir doesn't exist, we can't match it.*/if (internal_conf->hugepage_dir != NULL &&stat(internal_conf->hugepage_dir, &st) != 0) {return -1;}
默认的hugepage页面大小可在/proc/meminfo中查看(get_default_hp_size)。打开/proc/mounts文件,逐行遍历,寻找文件系统类型(FSTYPE)字段等于hugetlbfs的行。在OPTIONS字段中查找"pagesize="字符串。
FILE *fd = fopen(proc_mounts, "r");if (fd == NULL)rte_panic("Cannot open %s\n", proc_mounts);if (default_size == 0)default_size = get_default_hp_size();while (fgets(buf, sizeof(buf), fd)){const char *pagesz_str;if (rte_strsplit(buf, sizeof(buf), splitstr, _FIELDNAME_MAX,split_tok) != _FIELDNAME_MAX) {RTE_LOG(ERR, EAL, "Error parsing %s\n", proc_mounts);break; /* return NULL */}if (strncmp(splitstr[FSTYPE], hugetlbfs_str, htlbfs_str_len) != 0)continue;pagesz_str = strstr(splitstr[OPTIONS], pagesize_opt);
如果没有找到"pagesize=",则使用默认的页面大小,检查其是否与参数hugepage_sz相等;否则,检查pagesize指定的页面大小是否与参数hugepage_sz相等。相等则表明找到了匹配的挂载点,不相等则继续遍历下一行。
/* if no explicit page size, the default page size is compared */if (pagesz_str == NULL) {if (hugepage_sz != default_size)continue;}/* there is an explicit page size, so check it */else {uint64_t pagesz = rte_str_to_size(&pagesz_str[pagesize_opt_len]);if (pagesz != hugepage_sz)continue;}
如果DPDK没有指定--huge-dir参数,将当前遍历行中的MOUNTPT挂载点字段拷贝到found中,并结束遍历。否则,如果指定了--huge-dir,而当前遍历行的挂载点不是所指定目录的路径前缀,则继续遍历下一行。
最后,如果存在两个满足条件的挂载点,使用最长匹配的那一个。
/* If no --huge-dir option has been given, we're done.*/if (internal_conf->hugepage_dir == NULL) {strlcpy(found, splitstr[MOUNTPT], len);break;}/* Ignore any mount that doesn't contain the --huge-dir* directory.*/if (strncmp(internal_conf->hugepage_dir, splitstr[MOUNTPT],strlen(splitstr[MOUNTPT])) != 0) {continue;}/* We found a match, but only prefer it if it's a longer match* (so /mnt/1 is preferred over /mnt for matching /mnt/1/2)).*/if (strlen(splitstr[MOUNTPT]) > strlen(found))strlcpy(found, splitstr[MOUNTPT], len);} /* end while fgets */fclose(fd);
如果找到了匹配的挂载点(found非空),优先返回--huge-dir指定的目录,其次返回以上找到的挂载点目录;如果没有找到,返回-1。
if (found[0] != '\0') {/* If needed, return the requested dir, not the mount point. */strlcpy(hugedir, internal_conf->hugepage_dir != NULL ?internal_conf->hugepage_dir : found, len);return 0;}return -1;
获取页面数量(传统方式)
根据以下2M大小页面配置,获取可用的页面数量。
$ cat /sys/kernel/mm/hugepages/hugepages-2048kB/free_hugepages
1003
$ cat /sys/kernel/mm/hugepages/hugepages-2048kB/nr_overcommit_hugepages
0
$ cat /sys/kernel/mm/hugepages/hugepages-2048kB/resv_hugepages
0
$ cat /sys/kernel/mm/hugepages/hugepages-2048kB/surplus_hugepages
0
开始假定free_hugepages中的页面都是可用的。首先,检查保留页面数量resv_hugepages,减去保留页面数量,即为可用的页面。
static uint32_t
get_num_hugepages(const char *subdir, size_t sz, unsigned int reusable_pages)
{ unsigned long resv_pages, num_pages, over_pages, surplus_pages;const char *nr_hp_file = "free_hugepages";const char *nr_rsvd_file = "resv_hugepages";const char *nr_over_file = "nr_overcommit_hugepages";const char *nr_splus_file = "surplus_hugepages";/* first, check how many reserved pages kernel reports */if (get_hp_sysfs_value(subdir, nr_rsvd_file, &resv_pages) < 0)return 0;if (get_hp_sysfs_value(subdir, nr_hp_file, &num_pages) < 0)return 0;if (get_hp_sysfs_value(subdir, nr_over_file, &over_pages) < 0)over_pages = 0;if (get_hp_sysfs_value(subdir, nr_splus_file, &surplus_pages) < 0)surplus_pages = 0;/* adjust num_pages */if (num_pages >= resv_pages)num_pages -= resv_pages;else if (resv_pages)num_pages = 0;
可超限的页面数量减去过剩页面数量,等于可用的超限页面数量。
if (over_pages >= surplus_pages)over_pages -= surplus_pages;elseover_pages = 0;if (num_pages == 0 && over_pages == 0 && reusable_pages)RTE_LOG(WARNING, EAL, "No available %zu kB hugepages reported\n",sz >> 10);
可用页面数量加上可超限使用的页面数量,如果结果溢出,使用uint32的最大值。最后,加上可重用的页面数量,即为最终的可用页面数量,不超过无符号32位的最大值。
num_pages += over_pages;if (num_pages < over_pages) /* overflow */num_pages = UINT32_MAX;num_pages += reusable_pages;if (num_pages < reusable_pages) /* overflow */num_pages = UINT32_MAX;/* we want to return a uint32_t and more than this looks suspicious* anyway ... */if (num_pages > UINT32_MAX)num_pages = UINT32_MAX;return num_pages;
计算可重用页面
在hugepage已经mount的情况下,遍历挂载点目录(walk_hugedir),由回调函数inspect_hugedir_cb累加可重用文件所占用的空间大小(字节数)。
/*
 * walk_hugedir() callback: add the size of one hugepage file to a running
 * total.
 *
 * whd->user_data must point to a uint64_t accumulator; the size reported by
 * fstat() for whd->file_fd is added to it. If fstat() fails the file is
 * skipped and a debug message is logged, leaving the total unchanged.
 */
static void
inspect_hugedir_cb(const struct walk_hugedir_data *whd)
{
	uint64_t *total_size = whd->user_data;
	struct stat st;

	if (fstat(whd->file_fd, &st) < 0) {
		/* best effort: a file that cannot be stat'ed is not counted */
		RTE_LOG(DEBUG, EAL, "%s(): stat(\"%s\") failed: %s\n",
				__func__, whd->file_name, strerror(errno));
		return;
	}

	*total_size += st.st_size;
}

/* Count the total size in bytes of all files in the directory
 * not mapped by other DPDK process.
 */
/*
 * hugedir:    path of the hugepage directory to scan.
 * total_size: accumulator that inspect_hugedir_cb() adds file sizes to;
 *             it is only added to here, so the caller must initialise it.
 *
 * Returns the result of walk_hugedir(); the caller treats a negative
 * value as failure.
 */
static int
inspect_hugedir(const char *hugedir, uint64_t *total_size)
{
	return walk_hugedir(hugedir, inspect_hugedir_cb, total_size);
}
遍历挂载点目录,查找符合"map_"命名规则的文件,如果成功锁定此文件,即认为其为可重用的文件,调用回调cb计算其空间大小。
/* Search the hugepage directory for whatever hugepage files there are.* Check if the file is in use by another DPDK process.* If not, execute a callback on it.*/
static int
walk_hugedir(const char *hugedir, walk_hugedir_t *cb, void *user_data)
{const char filter[] = "*map_*"; /* matches hugepage files */dir = opendir(hugedir);dir_fd = dirfd(dir);dirent = readdir(dir);while (dirent != NULL) {/* skip files that don't match the hugepage pattern */if (fnmatch(filter, dirent->d_name, 0) > 0) {dirent = readdir(dir);continue;}/* try and lock the file */fd = openat(dir_fd, dirent->d_name, O_RDONLY);/* skip to next file */if (fd == -1) {dirent = readdir(dir);continue;}/* non-blocking lock */lck_result = flock(fd, LOCK_EX | LOCK_NB);/* if lock succeeds, execute callback */if (lck_result != -1)cb(&(struct walk_hugedir_data){.dir_fd = dir_fd,.file_fd = fd,.file_name = dirent->d_name,.user_data = user_data,});close (fd);dirent = readdir(dir);
删除不再使用页面
遍历mount挂载点目录,对于不再使用的文件,由clear_hugedir_cb回调函数执行删除操作。
/*
 * walk_hugedir() callback: unlink the file named whd->file_name, resolved
 * relative to the directory descriptor whd->dir_fd.
 */
static void
clear_hugedir_cb(const struct walk_hugedir_data *whd)
{
	/* Best effort: the result of unlinkat() is intentionally not checked. */
	(void)unlinkat(whd->dir_fd,
			whd->file_name,
			0);
}

/* Remove hugepage files not used by other DPDK processes from a directory. */
/*
 * hugedir: path of the hugepage directory to clean.
 *
 * Runs clear_hugedir_cb() through walk_hugedir() with no user data and
 * returns walk_hugedir()'s result; the caller treats a negative value as
 * failure.
 */
static int
clear_hugedir(const char *hugedir)
{
	return walk_hugedir(hugedir, clear_hugedir_cb, NULL);
}
可用页面计算
static void
calc_num_pages(struct hugepage_info *hpi, struct dirent *dirent,unsigned int reusable_pages)
{uint64_t total_pages = 0;const struct internal_config *internal_conf =eal_get_internal_configuration();/** first, try to put all hugepages into relevant sockets, but* if first attempts fails, fall back to collecting all pages* in one socket and sorting them later*/total_pages = 0;
首先,使用numa节点获取可用的页面数量。total_pages保存所有节点页面数量的总和。
/** We also don't want to do this for legacy init.* When there are hugepage files to reuse it is unknown* what NUMA node the pages are on.* This could be determined by mapping,* but it is precisely what hugepage file reuse is trying to avoid.*/if (!internal_conf->legacy_mem && reusable_pages == 0)for (i = 0; i < rte_socket_count(); i++) {int socket = rte_socket_id_by_idx(i);unsigned int num_pages =get_num_hugepages_on_node(dirent->d_name, socket, hpi->hugepage_sz);hpi->num_pages[socket] = num_pages;total_pages += num_pages;}
其次,如果以上没有得到任何页面,以下采用传统的方式获取可用页面,此时,所有可用的页面数量保存在了num_pages数组的0索引位置,之后根据NUMA节点再进行分配。
/* we failed to sort memory from the get go, so fall* back to old way*/if (total_pages == 0) {hpi->num_pages[0] = get_num_hugepages(dirent->d_name,hpi->hugepage_sz, reusable_pages);#ifndef RTE_ARCH_64/* for 32-bit systems, limit number of hugepages to* 1GB per page size */hpi->num_pages[0] = RTE_MIN(hpi->num_pages[0],RTE_PGSIZE_1G / hpi->hugepage_sz);
#endif
计算NUMA可用页面(新方式)
目录sys_pages_numa_dir_path[]定义为"/sys/devices/system/node",例如,对于node0有如下的目录结构:
$ ls -l /sys/devices/system/node/node0/hugepages/
total 0
drwxr-xr-x 2 root root 0 7月 7 13:51 hugepages-1048576kB
drwxr-xr-x 2 root root 0 7月 7 13:51 hugepages-2048kB
$
$ cat /sys/devices/system/node/node0/hugepages/hugepages-2048kB/free_hugepages
1003
$ cat /sys/devices/system/node/node0/hugepages/hugepages-2048kB/nr_hugepages
1024
$ cat /sys/devices/system/node/node0/hugepages/hugepages-2048kB/surplus_hugepages
0
对应于socketdir目录,根据页面大小子目录(subdir),获取free_hugepages的大小。
static uint32_t
get_num_hugepages_on_node(const char *subdir, unsigned int socket, size_t sz)
{unsigned long num_pages = 0;const char *nr_hp_file = "free_hugepages";snprintf(socketpath, sizeof(socketpath), "%s/node%u/hugepages",sys_pages_numa_dir_path, socket);socketdir = opendir(socketpath);if (socketdir) {/* Keep calm and carry on */closedir(socketdir);} else {/* Can't find socket dir, so ignore it */return 0;}snprintf(path, sizeof(path), "%s/%s/%s",socketpath, subdir, nr_hp_file);if (eal_parse_sysfs_value(path, &num_pages) < 0)return 0;
确保可用页面的数量不大于uint32所能表示的最大值。
if (num_pages == 0)RTE_LOG(WARNING, EAL, "No free %zu kB hugepages reported on node %u\n",sz >> 10, socket);/* we want to return a uint32_t and more than this looks suspicious* anyway ...*/if (num_pages > UINT32_MAX)num_pages = UINT32_MAX;return num_pages;
DPDK初始化hugepages相关推荐
- Linux平台上DPDK入门指南
目录 1. 简介 1.1. 文档地图 2. 系统要求 2.1. X86 上预先设置 BIOS 2.2. 编译DPDK 2.3. 运行DPDK应用程序 2.3.1. 系统软件 2.3.2. 在 Linu ...
- dpdk:vfio-pci模式下iommu(N+Y)-Huge配置-numa配置
DPDK大内存页Hugepages配置: Hugepages是DPDK用于提升性能的重要手段. 通过使用Hugepages,可以降低内存页数,减少TLB页表数量,增加TLB hit率. 在Linux上 ...
- vpp与dpdk的关系
vpp与dpdk的关系 dpdk作为vpp的一个插件,主要用来从网卡收发包. vpp使用如下命令参数初始化dpdk eal. EAL init args: -c a -n 1 --in-memory ...
- linux报文高速捕获技术对比--napi/libpcap/afpacket/pfring/dpdk/xdp
1. 传统linux网络协议栈流程和性能分析 Linux网络协议栈是处理网络数据包的典型系统,它包含了从物理层直到应用层的全过程. 数据包到达网卡设备. 网卡设备依据配置进行DMA操作.(第1次拷贝: ...
- DPDK内存篇(二): 深入学习 IOVA
Anatoly Burakov:英特尔软件工程师,目前在维护DPDK中的VFIO和内存子系统. "文章转载自DPDK与SPDK开源社区公众号" 目录 引言 环境抽象层(EAL)参数 ...
- FD.io VPP:vlib buffer pool(vlib_buffer) 内存初始化
Table of Contents vlib buffer创建过程 vlib_buffer相关内存初始化 1.函数一开始就查询numa的个数 2.遍历numa节点来初始化 3.查询系统大页大小. 4. ...
- Linux网络报文捕获/抓包技术对比:napi、libpcap、afpacket、PF_RING、PACKET_MMAP、DPDK、XDP(eXpress Data Path)
Table of Contents 1.传统linux网络协议栈流程和性能分析 协议栈的主要问题 针对单个数据包级别的资源分配和释放 流量的串行访问 从驱动到用户态的数据拷贝 内核到用户空间的上下文切 ...
- DPDK PMD( Poll Mode Driver)轮询模式驱动程序
DPDK PMD( Poll Mode Driver)轮询模式驱动程序 目录 Mellanox PMDs 轮询模式驱动程序 要求和假设 设计原则 逻辑核心,内存和NIC队列关系 设备标识,所有权和配置 ...
- DPU网络开发SDK——DPDK(九)
rte_eal_remote_launch() 在过去的几篇文章中,我们重点分析了DPDK初始化过程rte_eal_init()的主要流程,了解了其内存分配,primary和secondary之间如何 ...
- DPDK 学习笔记(一)
本文为笔者阅读其他网络资料结合自身所感,如有冒犯,请联系笔者. 目录 1.概述 2.dpdk的突破 2.1 UIO (用户空间的 I/O 技术) 2.2 内存池技术 2.3 大页内存管理 2.4 无锁 ...
最新文章
- python读取csv文件并修改指定内容-pandas读取CSV文件时查看修改各列的数据类型格式...
- ubuntu下配置eclipse环境
- product sales data determination in Opportunity item
- DOxygen for C++使用说明——Markdown支持
- PyTorch 1.0 中文官方教程:Autograd:自动求导
- 八皇后(N皇后)问题
- 【免费毕设】ASP.NET基于.NET的城市公交查询系统的实现与设计(源代码+lunwen)
- 计算机网络物理防护,计算机网络的物理安全
- 2018深圳杯数学建模A题
- java 本地文件上传到服务器,java本地文件上传到远程服务器
- 利用服务器在家远程登录办公室电脑
- 记一次公司被勒索病毒攻击事迹,上上下下咬牙切齿
- java的datasource_JAVA创建DataSource
- BMFont 制作字体时,无法导入图片
- 用计算机写作文的好处,第7课 用计算机写作文教案
- 洛谷 P1357 花园
- 采用keras深度学习框架搭建卷积神经网络模型实现垃圾分类,基于树莓派上进行实时视频流的垃圾识别源代码
- Adobe带你解锁办会新技能
- 把我给另外一个朋友的炒股劝告发给你一遍,希望你可以得到帮助!
- 小妹想学习BI,不知从何下手
热门文章
- Kaggle:入门赛Tatanic(泰坦尼克号)84.21%带你冲进前2%
- 红魔5S游戏手机与努比亚watch闪耀ChinaJoy
- 团队价值观五个字_一个优秀的团队应该具有的价值观
- 机器学习:决策树的划分依据
- 【开源多媒体编辑软件工程】VirtualDub
- 推荐10个免费的html,10个免费的HTML在线编辑工具
- 抖音SEO,抖音排名优化,抖音排名规则
- 服务器自动压缩access数据库代码,Access数据库体积过大问题的解决方法
- 薅羊毛算副业吗?薅羊毛到底是怎么赚钱的?
- 苹果safari浏览器video视频无法播放