Last time we looked at how NCCL analyzes the topology of the machine's PCI system and produces the result in XML format. Next, NCCL builds a graph from this XML so that it can later run path searches over it.

At the end of ncclTopoGetSystem, ncclTopoGetSystemFromXml is called to convert the XML representation into a graph.

ncclResult_t ncclTopoGetSystemFromXml(struct ncclXml* xml, struct ncclTopoSystem** topoSystem) {
  NCCLCHECK(ncclCalloc(topoSystem, 1));
  struct ncclXmlNode* topNode;
  NCCLCHECK(xmlFindTag(xml, "system", &topNode));
  for (int s=0; s<topNode->nSubs; s++) {
    struct ncclXmlNode* node = topNode->subs[s];
    if (strcmp(node->name, "cpu") == 0) NCCLCHECK(ncclTopoAddCpu(node, *topoSystem));
  }
  NCCLCHECK(ncclTopoAddNvLinks(topNode, *topoSystem, NULL));
  NCCLCHECK(ncclTopoConnectCpus(*topoSystem));
  NCCLCHECK(ncclTopoSortSystem(*topoSystem));
  return ncclSuccess;
}
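
For reference, the graph these functions build is stored in ncclTopoSystem: one array of nodes per node type (GPU, PCI, NVS, CPU, NIC, NET), each node holding an adjacency list of weighted links. The sketch below is paraphrased from the NCCL 2.7-era graph/topo.h and trimmed; exact field names and constants may differ in other versions.

// Paraphrased sketch of the graph data structures (fields trimmed; see graph/topo.h)
struct ncclTopoLink {
  int type;                     // LINK_LOC / LINK_NVL / LINK_PCI / LINK_SYS / LINK_NET
  float width;                  // bandwidth in GB/s
  struct ncclTopoNode* remNode; // the node on the other end of the edge
};

struct ncclTopoNode {
  int type;                     // GPU / PCI / NVS / CPU / NIC / NET
  int64_t id;                   // busId for PCI devices, numaid for CPUs, dev for NET
  // ... type-specific union with gpu/cpu/net fields (rank, affinity, asic, ...)
  int nlinks;
  struct ncclTopoLink links[NCCL_TOPO_MAX_LINKS]; // edges, kept sorted by width
};

struct ncclTopoNodeSet {
  int count;
  struct ncclTopoNode nodes[NCCL_TOPO_MAX_NODES];
};

struct ncclTopoSystem {
  struct ncclTopoNodeSet nodes[NCCL_TOPO_NODE_TYPES]; // one node set per type
  float maxWidth;
};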

It fetches the root node "system" from the XML, then walks its "cpu" children and builds the graph for each of them via ncclTopoAddCpu; a "cpu" here is actually a NUMA node.

ncclResult_t ncclTopoAddCpu(struct ncclXmlNode* xmlCpu, struct ncclTopoSystem* system) {
  int numaId;
  NCCLCHECK(xmlGetAttrInt(xmlCpu, "numaid", &numaId));
  struct ncclTopoNode* cpu;
  NCCLCHECK(ncclTopoCreateNode(system, &cpu, CPU, numaId));
  const char* str;
  NCCLCHECK(xmlGetAttr(xmlCpu, "affinity", &str));
  if (str != NULL) {
    NCCLCHECK(ncclStrToCpuset(str, &cpu->cpu.affinity));
  }
  NCCLCHECK(xmlGetAttrStr(xmlCpu, "arch", &str));
  NCCLCHECK(kvConvertToInt(str, &cpu->cpu.arch, kvDictCpuArch));
  if (cpu->cpu.arch == NCCL_TOPO_CPU_ARCH_X86) {
    NCCLCHECK(xmlGetAttrStr(xmlCpu, "vendor", &str));
    NCCLCHECK(kvConvertToInt(str, &cpu->cpu.vendor, kvDictCpuVendor));
    if (cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_INTEL) {
      int familyId, modelId;
      NCCLCHECK(xmlGetAttrInt(xmlCpu, "familyid", &familyId));
      NCCLCHECK(xmlGetAttrInt(xmlCpu, "modelid", &modelId));
      cpu->cpu.model = (familyId == 6 && modelId >= 0x55) ? NCCL_TOPO_CPU_TYPE_SKL : NCCL_TOPO_CPU_INTEL_BDW;
    }
  }
  for (int s=0; s<xmlCpu->nSubs; s++) {
    struct ncclXmlNode* node = xmlCpu->subs[s];
    if (strcmp(node->name, "pci") == 0) NCCLCHECK(ncclTopoAddPci(node, system, cpu));
    if (strcmp(node->name, "nic") == 0) {
      struct ncclTopoNode* nic = NULL;
      NCCLCHECK(ncclTopoGetNode(system, &nic, NIC, 0));
      if (nic == NULL) {
        NCCLCHECK(ncclTopoCreateNode(system, &nic, NIC, 0));
        NCCLCHECK(ncclTopoConnectNodes(cpu, nic, LINK_PCI, LOC_WIDTH));
        NCCLCHECK(ncclTopoConnectNodes(nic, cpu, LINK_PCI, LOC_WIDTH));
      }
      NCCLCHECK(ncclTopoAddNic(node, system, nic));
    }
  }
  return ncclSuccess;
}

ncclTopoAddCpu creates a CPU node whose id is the numaid, sets the CPU affinity (the set of cores belonging to this NUMA node), and fills in the vendor and other CPU information.
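
Here and in the functions below, nodes are created through ncclTopoCreateNode, which is not listed in this post. Roughly, it takes the next free slot in the per-type node array and records the type and id; for GPU nodes it also adds a loopback link to the node itself. A paraphrased sketch (details may differ by NCCL version):

// Paraphrased sketch: allocate a node of the given type with the given id
ncclResult_t ncclTopoCreateNode(struct ncclTopoSystem* system, struct ncclTopoNode** node, int type, uint64_t id) {
  if (system->nodes[type].count == NCCL_TOPO_MAX_NODES) {
    WARN("Error : tried to create too many nodes of type %d", type);
    return ncclInternalError;
  }
  struct ncclTopoNode* n = system->nodes[type].nodes + system->nodes[type].count++;
  n->type = type;
  n->id = id;
  if (type == GPU) {
    // GPUs get a local (loopback) link to themselves
    n->nlinks = 1;
    n->links[0].type = LINK_LOC;
    n->links[0].remNode = n;
    n->links[0].width = LOC_WIDTH;
  }
  *node = n;
  return ncclSuccess;
}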

It then walks the children of the cpu xml node and dispatches on their type; for a PCI node it calls ncclTopoAddPci.

ncclResult_t ncclTopoAddPci(struct ncclXmlNode* xmlPci, struct ncclTopoSystem* system, struct ncclTopoNode* parent) {
  const char* str;

  int type;
  NCCLCHECK(xmlGetAttrStr(xmlPci, "class", &str));
  NCCLCHECK(kvConvertToInt(str, &type, kvDictPciClass));

  int64_t busId;
  NCCLCHECK(xmlGetAttrStr(xmlPci, "busid", &str));
  NCCLCHECK(busIdToInt64(str, &busId));

  struct ncclTopoNode* node = NULL;
  if (type == GPU) {
    struct ncclXmlNode* xmlGpu;
    NCCLCHECK(xmlGetSub(xmlPci, "gpu", &xmlGpu));
    if (xmlGpu == NULL) return ncclSuccess;
    int index;
    NCCLCHECK(xmlGetAttrIndex(xmlGpu, "rank", &index));
    if (index == -1) return ncclSuccess;
    NCCLCHECK(ncclTopoCreateNode(system, &node, type, busId));
    NCCLCHECK(ncclTopoAddGpu(xmlGpu, system, node));
  }
  if (type == NIC) {
    struct ncclXmlNode* xmlNic;
    NCCLCHECK(xmlGetSub(xmlPci, "nic", &xmlNic));
    if (xmlNic == NULL) return ncclSuccess;

    // Ignore sub device ID and merge multi-port NICs into one PCI device.
    busId &= 0xfffffffffffffff0;
    struct ncclTopoNode* nicNode = NULL;
    NCCLCHECK(ncclTopoGetNode(system, &nicNode, type, busId));
    if (nicNode == NULL) {
      NCCLCHECK(ncclTopoCreateNode(system, &nicNode, type, busId));
      node = nicNode; // Connect it to parent later on
    }
    NCCLCHECK(ncclTopoAddNic(xmlNic, system, nicNode));
  } else if (type == PCI) {
    NCCLCHECK(ncclTopoCreateNode(system, &node, type, busId));
    for (int s=0; s<xmlPci->nSubs; s++) {
      struct ncclXmlNode* xmlSubPci = xmlPci->subs[s];
      NCCLCHECK(ncclTopoAddPci(xmlSubPci, system, node));
    }
  }

  if (node) {
    int width, speed;
    NCCLCHECK(xmlGetAttrInt(xmlPci, "link_width", &width));
    NCCLCHECK(xmlGetAttrStr(xmlPci, "link_speed", &str));

    // Manage cases where speed was not indicated in /sys
    if (width == 0) width = 16;
    NCCLCHECK(kvConvertToInt(str, &speed, kvDictPciGen)); // Values in 100Mbps, per lane (we want GB/s in the end)

    NCCLCHECK(ncclTopoConnectNodes(node, parent, LINK_PCI, width*speed/80.0));
    NCCLCHECK(ncclTopoConnectNodes(parent, node, LINK_PCI, width*speed/80.0));
  }
  return ncclSuccess;
}

The function first reads the PCI device's class (type) and busId, then branches on the type: if it is a plain PCI bridge, a PCI node is created and ncclTopoAddPci recurses into its children until it reaches a NIC or GPU xml node.

If a NIC is encountered, a NIC node is created and ncclTopoAddNic is called. This walks the net entries under the nic xml node and creates a NET node for each of them, with dev as its id, and then sets attributes such as speed, port and gdr.

ncclResult_t ncclTopoAddNet(struct ncclXmlNode* xmlNet, struct ncclTopoSystem* system, struct ncclTopoNode* nic) {
  int dev;
  NCCLCHECK(xmlGetAttrInt(xmlNet, "dev", &dev));

  struct ncclTopoNode* net;
  NCCLCHECK(ncclTopoCreateNode(system, &net, NET, dev));
  const char* str;
  NCCLCHECK(xmlGetAttr(xmlNet, "guid", &str));
  if (str) sscanf(str, "0x%lx", &net->net.asic);
  else net->net.asic = dev;

  ncclDebugNoWarn = NCCL_GRAPH;
  int mbps;
  if (xmlGetAttrInt(xmlNet, "speed", &mbps) != ncclSuccess) mbps = 0;
  if (mbps <= 0) mbps = 10000; // Some NICs define speed = -1
  net->net.width = mbps / 8000.0;
  if (xmlGetAttrInt(xmlNet, "port", &net->net.port) != ncclSuccess) net->net.port = 0;
  if (xmlGetAttrInt(xmlNet, "gdr", &net->net.gdrSupport) != ncclSuccess) net->net.gdrSupport = 0;
  if (xmlGetAttrInt(xmlNet, "maxconn", &net->net.maxChannels) != ncclSuccess) net->net.maxChannels = MAXCHANNELS;
  if (xmlGetAttrInt(xmlNet, "coll", &net->net.collSupport) != ncclSuccess) net->net.collSupport = 0;
  ncclDebugNoWarn = 0;

  NCCLCHECK(ncclTopoConnectNodes(nic, net, LINK_NET, net->net.width));
  NCCLCHECK(ncclTopoConnectNodes(net, nic, LINK_NET, net->net.width));
  return ncclSuccess;
}

ncclResult_t ncclTopoAddNic(struct ncclXmlNode* xmlNic, struct ncclTopoSystem* system, struct ncclTopoNode* nic) {
  for (int s=0; s<xmlNic->nSubs; s++) {
    struct ncclXmlNode* xmlNet = xmlNic->subs[s];
    if (strcmp(xmlNet->name, "net") != 0) continue;
    int index;
    NCCLCHECK(xmlGetAttrIndex(xmlNet, "dev", &index));
    if (index == -1) continue;
    NCCLCHECK(ncclTopoAddNet(xmlNet, system, nic));
  }
  return ncclSuccess;
}

ncclTopoAddNet then creates a forward and a reverse edge between the NET node and the NIC node; ncclTopoConnectNodes sets the edge type, accumulates the bandwidth on the edge, and keeps the node's edge list sorted by bandwidth in descending order.

ncclResult_t ncclTopoConnectNodes(struct ncclTopoNode* node, struct ncclTopoNode* remNode, int type, float width) {
  // Aggregate links into higher width for NVLink
  struct ncclTopoLink* link;
  for (link = node->links; link->remNode; link++) {
    if (link->remNode == remNode && link->type == type) break;
  }
  if (link->remNode == NULL) node->nlinks++;
  link->type = type;
  link->remNode = remNode;
  link->width += width;

  // Sort links in BW descending order
  struct ncclTopoLink linkSave;
  memcpy(&linkSave, link, sizeof(struct ncclTopoLink));
  while (link != node->links) {
    if ((link-1)->width >= linkSave.width) break;
    memcpy(link, link-1, sizeof(struct ncclTopoLink));
    link--;
  }
  memcpy(link, &linkSave, sizeof(struct ncclTopoLink));
  return ncclSuccess;
}

At this point the NIC has been added. Back in ncclTopoAddPci, if the device is a GPU, a GPU node is created and its rank, dev, gdr and other attributes are set. Finally, ncclTopoConnectNodes creates bidirectional edges between the newly created node and its parent, with bandwidth width*speed/80.0 (speed is expressed in units of 100 Mbps per lane, so multiplying by the lane count and dividing by 80 yields GB/s).
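
ncclTopoAddGpu itself is not listed in this post; roughly, it just copies the GPU attributes from the xml node onto the GPU graph node. A paraphrased sketch (attribute and field names follow the NCCL source I recall and may vary slightly by version):

// Paraphrased sketch: copy GPU attributes from the xml node into the graph node
ncclResult_t ncclTopoAddGpu(struct ncclXmlNode* xmlGpu, struct ncclTopoSystem* system, struct ncclTopoNode* gpu) {
  NCCLCHECK(xmlGetAttrInt(xmlGpu, "sm", &gpu->gpu.cudaCompCap)); // compute capability, used later to pick NVLink speed
  NCCLCHECK(xmlGetAttrInt(xmlGpu, "rank", &gpu->gpu.rank));
  NCCLCHECK(xmlGetAttrInt(xmlGpu, "dev", &gpu->gpu.dev));
  NCCLCHECK(xmlGetAttrInt(xmlGpu, "gdr", &gpu->gpu.gdrSupport));
  // NVLinks are not handled here; they are added in a second pass by ncclTopoAddNvLinks
  return ncclSuccess;
}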

This completes the subgraph under each NUMA node. Next, NVLink and QPI edges are added to connect everything together; let's look at NVLink first.

ncclResult_t ncclTopoAddNvLinks(struct ncclXmlNode* node, struct ncclTopoSystem* system, const char* parentBusId) {
  if (strcmp(node->name, "nvlink") == 0) {
    struct ncclTopoNode* gpu = NULL;
    int64_t pBusId;
    NCCLCHECK(busIdToInt64(parentBusId, &pBusId));
    NCCLCHECK(ncclTopoGetNode(system, &gpu, GPU, pBusId));
    if (gpu == NULL) {
      WARN("Add NVLink error : could not find GPU %lx\n", pBusId);
      return ncclInternalError;
    }
    int count;
    NCCLCHECK(xmlGetAttrInt(node, "count", &count));
    const char* targetClass;
    NCCLCHECK(xmlGetAttrStr(node, "tclass", &targetClass));
    int targetType;
    NCCLCHECK(kvConvertToInt(targetClass, &targetType, kvDictPciClass));
    struct ncclTopoNode* remote = NULL;
    if (targetType == GPU) {
      // NVL P2P connection to another GPU
      const char* target;
      NCCLCHECK(xmlGetAttrStr(node, "target", &target));
      int64_t busId;
      NCCLCHECK(busIdToInt64(target, &busId));
      NCCLCHECK(ncclTopoGetNode(system, &remote, GPU, busId));
    } else if (targetType == CPU) {
      // NVL connection to the local CPU
      NCCLCHECK(findLocalCpu(gpu, &remote));
    } else {
      if (system->nodes[NVS].count == 0) {
        NCCLCHECK(ncclTopoCreateNode(system, &remote, NVS, 0));
      } else {
        remote = system->nodes[NVS].nodes;
      }
    }
    if (remote) {
      int nvlSpeed = gpu->gpu.cudaCompCap == 60 ? PASCAL_NVLINK_WIDTH : VOLTA_NVLINK_WIDTH;
      NCCLCHECK(ncclTopoConnectNodes(gpu, remote, LINK_NVL, count*nvlSpeed));
      if (remote->type != GPU) {
        NCCLCHECK(ncclTopoConnectNodes(remote, gpu, LINK_NVL, count*nvlSpeed));
      }
    }
  } else {
    const char* busId;
    NCCLCHECK(xmlGetAttr(node, "busid", &busId));
    for (int s=0; s<node->nSubs; s++) {
      NCCLCHECK(ncclTopoAddNvLinks(node->subs[s], system, busId ? busId : parentBusId));
    }
  }
  return ncclSuccess;
}

The function recurses down from the root node until it reaches an nvlink xml node, then looks up the nvlink's parent, i.e. the GPU node. The peer PCI device type is read from tclass: if the peer is a GPU or a CPU, the corresponding node is looked up directly; if it is an NVSwitch, an NVS node is created first (or reused if one already exists). An edge from the current GPU to the peer is then created with bandwidth count*nvlSpeed; for non-GPU peers the reverse edge is added here as well, while GPU-to-GPU reverse edges are added when the peer GPU's own nvlink entries are processed. Afterwards, ncclTopoConnectCpus connects every pair of CPU nodes.
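
ncclTopoConnectCpus is also not listed above; it simply adds an inter-socket edge between every ordered pair of CPU nodes. A paraphrased sketch, where the bandwidth is assumed to be a QPI-class constant (the real code derives it from the CPU arch/vendor/model, and the exact link type and constant names may differ by NCCL version):

// Paraphrased sketch: connect every pair of NUMA/CPU nodes with an inter-socket link
ncclResult_t ncclTopoConnectCpus(struct ncclTopoSystem* system) {
  for (int n=0; n<system->nodes[CPU].count; n++) {
    for (int p=0; p<system->nodes[CPU].count; p++) {
      if (n == p) continue;
      // QPI_WIDTH stands for the inter-CPU bandwidth the real code computes per platform
      NCCLCHECK(ncclTopoConnectNodes(system->nodes[CPU].nodes+n, system->nodes[CPU].nodes+p,
                                     LINK_SYS, QPI_WIDTH));
    }
  }
  return ncclSuccess;
}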

Finally, to make the later channel search easier, ncclTopoSort (invoked via ncclTopoSortSystem, sketched after the code below) recursively reorders each PCI node's edges into the order: NVLink, downstream PCI links, upstream PCI link, QPI. Since edges were already sorted by bandwidth as they were created, NVLink is guaranteed to come first and QPI last, so only the PCI links in the middle need adjusting — which is done by moving the upstream link to the end of the list.

static ncclResult_t ncclTopoSort(struct ncclTopoNode* node, struct ncclTopoNode* upNode) {
  // Shift all links to have upLink as last link
  if (upNode) {
    int l=0;
    while (node->links[l].remNode != upNode) l++;
    struct ncclTopoLink upLink;
    memcpy(&upLink, node->links+l, sizeof(struct ncclTopoLink));
    while (node->links[l+1].remNode) {
      memcpy(node->links+l, node->links+l+1, sizeof(struct ncclTopoLink));
      l++;
    }
    memcpy(node->links+l, &upLink, sizeof(struct ncclTopoLink));
  }

  // Recursively sort the PCI tree
  for (int l=0; l<node->nlinks; l++) {
    struct ncclTopoLink* link = node->links+l;
    if (link->type == LINK_PCI && link->remNode != upNode) NCCLCHECK(ncclTopoSort(link->remNode, node));
  }
  return ncclSuccess;
}
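
The entry point ncclTopoSortSystem is a thin wrapper that starts this recursive sort from each CPU (NUMA) node, i.e. the root of its local PCI tree. Roughly (paraphrased sketch):

// Paraphrased sketch: start the recursive sort from every CPU node (the PCI tree roots)
ncclResult_t ncclTopoSortSystem(struct ncclTopoSystem* system) {
  for (int n=0; n<system->nodes[CPU].count; n++) {
    NCCLCHECK(ncclTopoSort(system->nodes[CPU].nodes+n, NULL));
  }
  return ncclSuccess;
}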

This completes the whole graph construction process. To sum up: because the XML produced by the topology analysis is not convenient for the subsequent path search, this article built a graph of the PCI system from that XML.
