NVIDIA NCCL Source Code Study (4) - The Graph Building Process
The previous post analyzed how NCCL performs topology detection of the machine's PCI system, producing its result in XML format. Next, NCCL builds a graph from that XML so that path search can be performed on it later.
At the end of ncclTopoGetSystem, ncclTopoGetSystemFromXml is executed to convert the XML representation into a graph.
ncclResult_t ncclTopoGetSystemFromXml(struct ncclXml* xml, struct ncclTopoSystem** topoSystem) {
  NCCLCHECK(ncclCalloc(topoSystem, 1));
  struct ncclXmlNode* topNode;
  NCCLCHECK(xmlFindTag(xml, "system", &topNode));
  for (int s=0; s<topNode->nSubs; s++) {
    struct ncclXmlNode* node = topNode->subs[s];
    if (strcmp(node->name, "cpu") == 0) NCCLCHECK(ncclTopoAddCpu(node, *topoSystem));
  }
  NCCLCHECK(ncclTopoAddNvLinks(topNode, *topoSystem, NULL));
  NCCLCHECK(ncclTopoConnectCpus(*topoSystem));
  NCCLCHECK(ncclTopoSortSystem(*topoSystem));
  return ncclSuccess;
}
The root "system" node is fetched from the XML, and each "cpu" child node is then handled by ncclTopoAddCpu to build that part of the graph; one cpu node here actually corresponds to one NUMA node.
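Before going into ncclTopoAddCpu (shown next), it helps to know how the graph is represented in memory. Below is a simplified sketch of the data structures, reduced to the fields used by the code in this post; the real definitions live in graph/topo.h, contain more members (e.g. per-node path information used in later posts), and may differ slightly across NCCL versions.

#define _GNU_SOURCE
#include <sched.h>    // cpu_set_t
#include <stdint.h>

#define NCCL_TOPO_MAX_LINKS  32    // illustrative bounds; the real values are in graph/topo.h
#define NCCL_TOPO_MAX_NODES  256
#define NCCL_TOPO_NODE_TYPES 6     // GPU, PCI, NVS, CPU, NIC, NET

struct ncclTopoLink {
  int type;                        // LINK_LOC / LINK_NVL / LINK_PCI / LINK_NET / ...
  float width;                     // bandwidth of this edge, in GB/s
  struct ncclTopoNode* remNode;    // node at the other end of the edge
};

struct ncclTopoNode {
  int type;                        // GPU / PCI / NVS / CPU / NIC / NET
  int64_t id;                      // busId for PCI devices, numaid for CPUs, dev for NET nodes, ...
  union {                          // type-specific payload
    struct { int dev, rank, cudaCompCap, gdrSupport; } gpu;
    struct { int arch, vendor, model; cpu_set_t affinity; } cpu;
    struct { uint64_t asic; float width; int port, gdrSupport, collSupport, maxChannels; } net;
  };
  int nlinks;                                      // number of outgoing edges
  struct ncclTopoLink links[NCCL_TOPO_MAX_LINKS];  // edges, kept sorted by width, descending
};

struct ncclTopoNodeSet {
  int count;
  struct ncclTopoNode nodes[NCCL_TOPO_MAX_NODES];
};

struct ncclTopoSystem {
  struct ncclTopoNodeSet nodes[NCCL_TOPO_NODE_TYPES];  // all nodes of the graph, grouped by type
};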
ncclResult_t ncclTopoAddCpu(struct ncclXmlNode* xmlCpu, struct ncclTopoSystem* system) {
  int numaId;
  NCCLCHECK(xmlGetAttrInt(xmlCpu, "numaid", &numaId));
  struct ncclTopoNode* cpu;
  NCCLCHECK(ncclTopoCreateNode(system, &cpu, CPU, numaId));
  const char* str;
  NCCLCHECK(xmlGetAttr(xmlCpu, "affinity", &str));
  if (str != NULL) {
    NCCLCHECK(ncclStrToCpuset(str, &cpu->cpu.affinity));
  }
  NCCLCHECK(xmlGetAttrStr(xmlCpu, "arch", &str));
  NCCLCHECK(kvConvertToInt(str, &cpu->cpu.arch, kvDictCpuArch));
  if (cpu->cpu.arch == NCCL_TOPO_CPU_ARCH_X86) {
    NCCLCHECK(xmlGetAttrStr(xmlCpu, "vendor", &str));
    NCCLCHECK(kvConvertToInt(str, &cpu->cpu.vendor, kvDictCpuVendor));
    if (cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_INTEL) {
      int familyId, modelId;
      NCCLCHECK(xmlGetAttrInt(xmlCpu, "familyid", &familyId));
      NCCLCHECK(xmlGetAttrInt(xmlCpu, "modelid", &modelId));
      cpu->cpu.model = (familyId == 6 && modelId >= 0x55) ? NCCL_TOPO_CPU_TYPE_SKL : NCCL_TOPO_CPU_INTEL_BDW;
    }
  }
  for (int s=0; s<xmlCpu->nSubs; s++) {
    struct ncclXmlNode* node = xmlCpu->subs[s];
    if (strcmp(node->name, "pci") == 0) NCCLCHECK(ncclTopoAddPci(node, system, cpu));
    if (strcmp(node->name, "nic") == 0) {
      struct ncclTopoNode* nic = NULL;
      NCCLCHECK(ncclTopoGetNode(system, &nic, NIC, 0));
      if (nic == NULL) {
        NCCLCHECK(ncclTopoCreateNode(system, &nic, NIC, 0));
        NCCLCHECK(ncclTopoConnectNodes(cpu, nic, LINK_PCI, LOC_WIDTH));
        NCCLCHECK(ncclTopoConnectNodes(nic, cpu, LINK_PCI, LOC_WIDTH));
      }
      NCCLCHECK(ncclTopoAddNic(node, system, nic));
    }
  }
  return ncclSuccess;
}
ncclTopoAddCpu first creates a CPU node whose id is the numaid, then sets the CPU's affinity (the set of cores belonging to this NUMA node) and records vendor, architecture and model information.
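The helpers used here, ncclTopoCreateNode and ncclTopoGetNode, essentially append to and search the per-type node arrays of ncclTopoSystem. A reduced sketch of the idea (not the exact NCCL code; the real ncclTopoCreateNode also bounds-checks the array and initializes some type-specific fields):

ncclResult_t ncclTopoGetNode(struct ncclTopoSystem* system, struct ncclTopoNode** node, int type, int64_t id) {
  *node = NULL;
  for (int i = 0; i < system->nodes[type].count; i++) {
    if (system->nodes[type].nodes[i].id == id) {
      *node = system->nodes[type].nodes + i;   // found an existing node with this type/id
      return ncclSuccess;
    }
  }
  return ncclSuccess;   // not found: *node stays NULL and the caller creates the node
}

ncclResult_t ncclTopoCreateNode(struct ncclTopoSystem* system, struct ncclTopoNode** node, int type, int64_t id) {
  struct ncclTopoNode* n = system->nodes[type].nodes + system->nodes[type].count++;
  n->type = type;
  n->id = id;
  *node = n;
  return ncclSuccess;
}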
It then iterates over the children of the cpu xml node and dispatches on their names; for a "pci" child, ncclTopoAddPci is executed:
ncclResult_t ncclTopoAddPci(struct ncclXmlNode* xmlPci, struct ncclTopoSystem* system, struct ncclTopoNode* parent) {
  const char* str;

  int type;
  NCCLCHECK(xmlGetAttrStr(xmlPci, "class", &str));
  NCCLCHECK(kvConvertToInt(str, &type, kvDictPciClass));

  int64_t busId;
  NCCLCHECK(xmlGetAttrStr(xmlPci, "busid", &str));
  NCCLCHECK(busIdToInt64(str, &busId));

  struct ncclTopoNode* node = NULL;
  if (type == GPU) {
    struct ncclXmlNode* xmlGpu;
    NCCLCHECK(xmlGetSub(xmlPci, "gpu", &xmlGpu));
    if (xmlGpu == NULL) return ncclSuccess;
    int index;
    NCCLCHECK(xmlGetAttrIndex(xmlGpu, "rank", &index));
    if (index == -1) return ncclSuccess;
    NCCLCHECK(ncclTopoCreateNode(system, &node, type, busId));
    NCCLCHECK(ncclTopoAddGpu(xmlGpu, system, node));
  }
  if (type == NIC) {
    struct ncclXmlNode* xmlNic;
    NCCLCHECK(xmlGetSub(xmlPci, "nic", &xmlNic));
    if (xmlNic == NULL) return ncclSuccess;

    // Ignore sub device ID and merge multi-port NICs into one PCI device.
    busId &= 0xfffffffffffffff0;

    struct ncclTopoNode* nicNode = NULL;
    NCCLCHECK(ncclTopoGetNode(system, &nicNode, type, busId));
    if (nicNode == NULL) {
      NCCLCHECK(ncclTopoCreateNode(system, &nicNode, type, busId));
      node = nicNode; // Connect it to parent later on
    }
    NCCLCHECK(ncclTopoAddNic(xmlNic, system, nicNode));
  } else if (type == PCI) {
    NCCLCHECK(ncclTopoCreateNode(system, &node, type, busId));
    for (int s=0; s<xmlPci->nSubs; s++) {
      struct ncclXmlNode* xmlSubPci = xmlPci->subs[s];
      NCCLCHECK(ncclTopoAddPci(xmlSubPci, system, node));
    }
  }

  if (node) {
    int width, speed;
    NCCLCHECK(xmlGetAttrInt(xmlPci, "link_width", &width));
    NCCLCHECK(xmlGetAttrStr(xmlPci, "link_speed", &str));

    // Manage cases where speed was not indicated in /sys
    if (width == 0) width = 16;
    NCCLCHECK(kvConvertToInt(str, &speed, kvDictPciGen)); // Values in 100Mbps, per lane (we want GB/s in the end)

    NCCLCHECK(ncclTopoConnectNodes(node, parent, LINK_PCI, width*speed/80.0));
    NCCLCHECK(ncclTopoConnectNodes(parent, node, LINK_PCI, width*speed/80.0));
  }
  return ncclSuccess;
}
It first reads the PCI device's class (giving the node type) and its busId. If the type is PCI, a PCI node is created and ncclTopoAddPci recurses into each child, continuing until it reaches a NIC or GPU xml node.
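As a quick sanity check of the width*speed/80.0 expression at the end of ncclTopoAddPci: kvDictPciGen maps link_speed strings to a per-lane value in units of 100 Mb/s (see the comment in the code above), and 100 Mb/s is 1/80 GB/s, hence the division by 80. With assumed values for a PCIe Gen3 x16 slot:

#include <stdio.h>

int main(void) {
  // Illustrative values: a Gen3 x16 link, assuming its link_speed maps to 60
  // (60 * 100 Mb/s = 6 Gb/s usable per lane).
  int width = 16;   // link_width
  int speed = 60;   // per-lane speed in units of 100 Mb/s
  printf("PCI edge width = %.1f GB/s\n", width * speed / 80.0);   // prints 12.0
  return 0;
}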
If a NIC is encountered, a NIC node is created (multi-port NICs are merged into a single PCI device by masking the low bits of the busId) and ncclTopoAddNic is executed. That function iterates over the xml net children of the xml nic, creates a NET node for each one with the dev attribute as its id, and sets attributes such as speed, port and gdr; the speed attribute is in Mb/s, so for example a 100 Gb/s NIC with speed=100000 ends up with net.width = 100000/8000 = 12.5 GB/s.
ncclResult_t ncclTopoAddNet(struct ncclXmlNode* xmlNet, struct ncclTopoSystem* system, struct ncclTopoNode* nic) {
  int dev;
  NCCLCHECK(xmlGetAttrInt(xmlNet, "dev", &dev));

  struct ncclTopoNode* net;
  NCCLCHECK(ncclTopoCreateNode(system, &net, NET, dev));
  const char* str;
  NCCLCHECK(xmlGetAttr(xmlNet, "guid", &str));
  if (str) sscanf(str, "0x%lx", &net->net.asic);
  else net->net.asic = dev;

  ncclDebugNoWarn = NCCL_GRAPH;
  int mbps;
  if (xmlGetAttrInt(xmlNet, "speed", &mbps) != ncclSuccess) mbps = 0;
  if (mbps <= 0) mbps = 10000; // Some NICs define speed = -1
  net->net.width = mbps / 8000.0;
  if (xmlGetAttrInt(xmlNet, "port", &net->net.port) != ncclSuccess) net->net.port = 0;
  if (xmlGetAttrInt(xmlNet, "gdr", &net->net.gdrSupport) != ncclSuccess) net->net.gdrSupport = 0;
  if (xmlGetAttrInt(xmlNet, "maxconn", &net->net.maxChannels) != ncclSuccess) net->net.maxChannels = MAXCHANNELS;
  if (xmlGetAttrInt(xmlNet, "coll", &net->net.collSupport) != ncclSuccess) net->net.collSupport = 0;
  ncclDebugNoWarn = 0;

  NCCLCHECK(ncclTopoConnectNodes(nic, net, LINK_NET, net->net.width));
  NCCLCHECK(ncclTopoConnectNodes(net, nic, LINK_NET, net->net.width));
  return ncclSuccess;
}

ncclResult_t ncclTopoAddNic(struct ncclXmlNode* xmlNic, struct ncclTopoSystem* system, struct ncclTopoNode* nic) {
  for (int s=0; s<xmlNic->nSubs; s++) {
    struct ncclXmlNode* xmlNet = xmlNic->subs[s];
    if (strcmp(xmlNet->name, "net") != 0) continue;
    int index;
    NCCLCHECK(xmlGetAttrIndex(xmlNet, "dev", &index));
    if (index == -1) continue;
    NCCLCHECK(ncclTopoAddNet(xmlNet, system, nic));
  }
  return ncclSuccess;
}
ncclTopoAddNet then creates the forward and reverse edges between the NET node and the NIC node via ncclTopoConnectNodes, which sets the edge type, accumulates bandwidth onto the edge (calling it again for the same remote node and link type adds to the existing edge's width), and keeps the node's edge list sorted by bandwidth in descending order.
ncclResult_t ncclTopoConnectNodes(struct ncclTopoNode* node, struct ncclTopoNode* remNode, int type, float width) {
  // Aggregate links into higher width for NVLink
  struct ncclTopoLink* link;
  for (link = node->links; link->remNode; link++) {
    if (link->remNode == remNode && link->type == type) break;
  }
  if (link->remNode == NULL) node->nlinks++;
  link->type = type;
  link->remNode = remNode;
  link->width += width;

  // Sort links in BW descending order
  struct ncclTopoLink linkSave;
  memcpy(&linkSave, link, sizeof(struct ncclTopoLink));
  while (link != node->links) {
    if ((link-1)->width >= linkSave.width) break;
    memcpy(link, link-1, sizeof(struct ncclTopoLink));
    link--;
  }
  memcpy(link, &linkSave, sizeof(struct ncclTopoLink));
  return ncclSuccess;
}
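To make the aggregation and ordering concrete, here is a small standalone imitation of that logic; the structs, link-type values and widths below are made up for illustration, only the behavior mirrors ncclTopoConnectNodes. Repeated calls for the same remote node and link type add their widths onto a single edge, and the edge list stays sorted by width in descending order.

#include <stdio.h>

#define LINK_NVL 1
#define LINK_PCI 2
#define MAX_LINKS 8

struct Link { int type; float width; int remNode; };   // remNode is just an id in this toy version
struct Node { int nlinks; struct Link links[MAX_LINKS]; };

static void connectNodes(struct Node* node, int remNode, int type, float width) {
  int l;
  // Reuse an existing edge to the same remote node with the same type (aggregation)...
  for (l = 0; l < node->nlinks; l++)
    if (node->links[l].remNode == remNode && node->links[l].type == type) break;
  // ...or append a new edge.
  if (l == node->nlinks) {
    node->nlinks++;
    node->links[l].type = type;
    node->links[l].remNode = remNode;
    node->links[l].width = 0;
  }
  node->links[l].width += width;
  // Move the updated edge left until the list is sorted by width, descending.
  struct Link save = node->links[l];
  while (l > 0 && node->links[l-1].width < save.width) { node->links[l] = node->links[l-1]; l--; }
  node->links[l] = save;
}

int main(void) {
  struct Node gpu0 = { 0 };
  connectNodes(&gpu0, /*remNode=*/100, LINK_PCI, 12.0);   // uplink to a PCI switch
  connectNodes(&gpu0, /*remNode=*/1,   LINK_NVL, 21.0);   // one NVLink to GPU 1
  connectNodes(&gpu0, /*remNode=*/1,   LINK_NVL, 21.0);   // a second NVLink to GPU 1: widths accumulate
  for (int l = 0; l < gpu0.nlinks; l++)
    printf("edge to %3d  type=%d  width=%.1f GB/s\n", gpu0.links[l].remNode, gpu0.links[l].type, gpu0.links[l].width);
  // The aggregated 42.0 GB/s NVLink edge is printed before the 12.0 GB/s PCI edge.
  return 0;
}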
That completes the NIC handling. Back in ncclTopoAddPci, if the device is a GPU, a GPU node is created and ncclTopoAddGpu fills in its rank, dev, gdr and other attributes. At the end of ncclTopoAddPci, ncclTopoConnectNodes creates the bidirectional PCI edge between the current node and its parent, with the width computed from link_width and link_speed as shown above.
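ncclTopoAddGpu itself is not shown above; it simply copies the attributes mentioned here from the xml gpu node onto the GPU node, roughly as below (reproduced from memory, so field and attribute names may differ slightly between NCCL versions):

ncclResult_t ncclTopoAddGpu(struct ncclXmlNode* xmlGpu, struct ncclTopoSystem* system, struct ncclTopoNode* gpu) {
  NCCLCHECK(xmlGetAttrInt(xmlGpu, "sm", &gpu->gpu.cudaCompCap));   // compute capability, used below to pick the NVLink speed
  NCCLCHECK(xmlGetAttrInt(xmlGpu, "rank", &gpu->gpu.rank));
  NCCLCHECK(xmlGetAttrInt(xmlGpu, "dev", &gpu->gpu.dev));
  NCCLCHECK(xmlGetAttrInt(xmlGpu, "gdr", &gpu->gpu.gdrSupport));
  // NVLinks are not handled here; they are added in a second pass by ncclTopoAddNvLinks.
  return ncclSuccess;
}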
At this point the subgraph under each NUMA node is complete. Next, NVLink and QPI edges are added to connect these pieces together; let's look at NVLink first.
ncclResult_t ncclTopoAddNvLinks(struct ncclXmlNode* node, struct ncclTopoSystem* system, const char* parentBusId) {
  if (strcmp(node->name, "nvlink") == 0) {
    struct ncclTopoNode* gpu = NULL;
    int64_t pBusId;
    NCCLCHECK(busIdToInt64(parentBusId, &pBusId));
    NCCLCHECK(ncclTopoGetNode(system, &gpu, GPU, pBusId));
    if (gpu == NULL) {
      WARN("Add NVLink error : could not find GPU %lx\n", pBusId);
      return ncclInternalError;
    }
    int count;
    NCCLCHECK(xmlGetAttrInt(node, "count", &count));
    const char* targetClass;
    NCCLCHECK(xmlGetAttrStr(node, "tclass", &targetClass));
    int targetType;
    NCCLCHECK(kvConvertToInt(targetClass, &targetType, kvDictPciClass));
    struct ncclTopoNode* remote = NULL;
    if (targetType == GPU) {
      // NVL P2P connection to another GPU
      const char* target;
      NCCLCHECK(xmlGetAttrStr(node, "target", &target));
      int64_t busId;
      NCCLCHECK(busIdToInt64(target, &busId));
      NCCLCHECK(ncclTopoGetNode(system, &remote, GPU, busId));
    } else if (targetType == CPU) {
      // NVL connection to the local CPU
      NCCLCHECK(findLocalCpu(gpu, &remote));
    } else {
      if (system->nodes[NVS].count == 0) {
        NCCLCHECK(ncclTopoCreateNode(system, &remote, NVS, 0));
      } else {
        remote = system->nodes[NVS].nodes;
      }
    }
    if (remote) {
      int nvlSpeed = gpu->gpu.cudaCompCap == 60 ? PASCAL_NVLINK_WIDTH : VOLTA_NVLINK_WIDTH;
      NCCLCHECK(ncclTopoConnectNodes(gpu, remote, LINK_NVL, count*nvlSpeed));
      if (remote->type != GPU) {
        NCCLCHECK(ncclTopoConnectNodes(remote, gpu, LINK_NVL, count*nvlSpeed));
      }
    }
  } else {
    const char* busId;
    NCCLCHECK(xmlGetAttr(node, "busid", &busId));
    for (int s=0; s<node->nSubs; s++) {
      NCCLCHECK(ncclTopoAddNvLinks(node->subs[s], system, busId ? busId : parentBusId));
    }
  }
  return ncclSuccess;
}
The recursion starts from the root xml node and walks down until an nvlink xml node is found; the nvlink's parent busid identifies the GPU node that owns the link. The tclass attribute gives the class of the peer PCI device: if it is a GPU or a CPU, the corresponding graph node is looked up directly (the CPU case uses findLocalCpu to locate the GPU's local NUMA node); if it is an NVSwitch, an NVS node is created first (or reused if one already exists). Edges are then created between the GPU and the peer: for a CPU or NVSwitch peer the edge is added in both directions, while for a GPU peer only the gpu-to-remote direction is added here, since the reverse edge is added when the peer GPU's own nvlink entries are processed. After all NVLinks are added, ncclTopoConnectCpus connects every pair of CPU nodes.
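ncclTopoConnectCpus is straightforward: every ordered pair of CPU (NUMA) nodes gets a direct edge modelling the inter-socket interconnect. A simplified sketch (not the exact NCCL code; the real implementation derives the edge width from the CPU arch/vendor/model, and the constant names below are assumptions):

ncclResult_t ncclTopoConnectCpus(struct ncclTopoSystem* system) {
  for (int n = 0; n < system->nodes[CPU].count; n++) {
    for (int p = 0; p < system->nodes[CPU].count; p++) {
      if (n == p) continue;
      float width = QPI_WIDTH;   // placeholder inter-socket (QPI/UPI) bandwidth estimate
      NCCLCHECK(ncclTopoConnectNodes(system->nodes[CPU].nodes + n, system->nodes[CPU].nodes + p, LINK_SYS, width));
    }
  }
  return ncclSuccess;
}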
Finally, to make the later channel search easier, ncclTopoSort recursively reorders every PCI node's edges into the order: NVLink, downstream PCI, upstream PCI, QPI. Because the edges were already sorted by bandwidth while they were added, NVLink is guaranteed to be at the front and QPI at the end, so the only thing left to fix is the ordering of the PCI edges in between: the upstream edge (the one pointing back to the parent) is moved behind the downstream ones.
static ncclResult_t ncclTopoSort(struct ncclTopoNode* node, struct ncclTopoNode* upNode) {
  // Shift all links to have upLink as last link
  if (upNode) {
    int l=0;
    while (node->links[l].remNode != upNode) l++;
    struct ncclTopoLink upLink;
    memcpy(&upLink, node->links+l, sizeof(struct ncclTopoLink));
    while (node->links[l+1].remNode) {
      memcpy(node->links+l, node->links+l+1, sizeof(struct ncclTopoLink));
      l++;
    }
    memcpy(node->links+l, &upLink, sizeof(struct ncclTopoLink));
  }

  // Recursively sort the PCI tree
  for (int l=0; l<node->nlinks; l++) {
    struct ncclTopoLink* link = node->links+l;
    if (link->type == LINK_PCI && link->remNode != upNode) NCCLCHECK(ncclTopoSort(link->remNode, node));
  }
  return ncclSuccess;
}
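The entry point called from ncclTopoGetSystemFromXml is ncclTopoSortSystem, which just starts this recursion from every CPU (NUMA) node, since the CPUs are the roots of the PCI trees; roughly:

ncclResult_t ncclTopoSortSystem(struct ncclTopoSystem* system) {
  for (int n = 0; n < system->nodes[CPU].count; n++)
    NCCLCHECK(ncclTopoSort(system->nodes[CPU].nodes + n, NULL));
  return ncclSuccess;
}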
This completes the whole graph building process. To summarize: the XML produced by topology detection is inconvenient for path search, so this post converted the PCI system described by that XML into a graph, on which the path computation of the next post can operate.