OVS DPDK与QEMU之间如何通过vhost user协议通信 vhost user协议的控制和数据通道
netdev_dpdk_vhost_construct定义在文件openvswitch-2.9.2/lib/netdev-dpdk.c1058 static int 1059 netdev_dpdk_vhost_construct(struct netdev *netdev) 1060 { 1061 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev); 1062 const char *name = netdev->name; 1063 int err; 1064 1065 /* 'name' is appended to 'vhost_sock_dir' and used to create a socket in 1066 * the file system. '/' or '\' would traverse directories, so they're not 1067 * acceptable in 'name'. */ 1068 if (strchr(name, '/') || strchr(name, '\\')) { 1069 VLOG_ERR("\"%s\" is not a valid name for a vhost-user port. " 1070 "A valid name must not include '/' or '\\'", 1071 name); 1072 return EINVAL; 1073 } 1074 1075 ovs_mutex_lock(&dpdk_mutex); 1076 /* Take the name of the vhost-user port and append it to the location where 1077 * the socket is to be created, then register the socket. 1078 */ 1079 snprintf(dev->vhost_id, sizeof dev->vhost_id, "%s/%s", 1080 dpdk_get_vhost_sock_dir(), name); 1081 1082 dev->vhost_driver_flags &= ~RTE_VHOST_USER_CLIENT; 1083 err = rte_vhost_driver_register(dev->vhost_id, dev->vhost_driver_flags); 1084 if (err) { 1085 VLOG_ERR("vhost-user socket device setup failure for socket %s\n", 1086 dev->vhost_id); 1087 goto out; 1088 } else { 1089 fatal_signal_add_file_to_unlink(dev->vhost_id); 1090 VLOG_INFO("Socket %s created for vhost-user port %s\n", 1091 dev->vhost_id, name); 1092 } 1093 1094 err = rte_vhost_driver_callback_register(dev->vhost_id, 1095 &virtio_net_device_ops); 1096 if (err) { 1097 VLOG_ERR("rte_vhost_driver_callback_register failed for vhost user " 1098 "port: %s\n", name); 1099 goto out; 1100 } 1101 1102 err = rte_vhost_driver_disable_features(dev->vhost_id, 1103 1ULL << VIRTIO_NET_F_HOST_TSO4 1104 | 1ULL << VIRTIO_NET_F_HOST_TSO6 1105 | 1ULL << VIRTIO_NET_F_CSUM); 1106 if (err) { 1107 VLOG_ERR("rte_vhost_driver_disable_features failed for vhost user " 1108 "port: %s\n", name); 1109 goto out; 1110 } 1111 1112 err = rte_vhost_driver_start(dev->vhost_id); 1113 if (err) { 1114 
VLOG_ERR("rte_vhost_driver_start failed for vhost user " 1115 "port: %s\n", name); 1116 goto out; 1117 } 1118 1119 err = vhost_common_construct(netdev); 1120 if (err) { 1121 VLOG_ERR("vhost_common_construct failed for vhost user " 1122 "port: %s\n", name); 1123 } 1124 1125 out: 1126 ovs_mutex_unlock(&dpdk_mutex); 1127 VLOG_WARN_ONCE("dpdkvhostuser ports are considered deprecated; " 1128 "please migrate to dpdkvhostuserclient ports."); 1129 return err; 1130 } netdev_dpdk_vhost_construct函数调用rte_vhost_driver_register。以下代码均定义在dpdk-18.08/lib/librte_vhost/socket.c798 /*799 * Register a new vhost-user socket; here we could act as server800 * (the default case), or client (when RTE_VHOST_USER_CLIENT) flag801 * is set.802 */803 int804 rte_vhost_driver_register(const char *path, uint64_t flags)805 {867 if ((flags & RTE_VHOST_USER_CLIENT) != 0) {868 vsocket->reconnect = !(flags & RTE_VHOST_USER_NO_RECONNECT);869 if (vsocket->reconnect && reconn_tid == 0) {870 if (vhost_user_reconnect_init() != 0)871 goto out_mutex;872 }873 } else {874 vsocket->is_server = true;875 }876 ret = create_unix_socket(vsocket);877 if (ret < 0) {878 goto out_mutex;879 } netdev_dpdk_vhost_construct函数调用rte_vhost_driver_start。定义在dpdk-18.08/lib/librte_vhost/socket.c1023 int 1024 rte_vhost_driver_start(const char *path) 1025 {1059 if (vsocket->is_server) 1060 return vhost_user_start_server(vsocket); 1061 else 1062 return vhost_user_start_client(vsocket); 1063 } vhost_user_create_server调用vhost_user_server_new_connection:以下的3个函数调用vhost_user_add_connection:266 /* call back when there is new vhost-user connection from client */267 static void268 vhost_user_server_new_connection(int fd, void *dat, int *remove __rte_unused)269 { (...)424 static void *425 vhost_user_client_reconnect(void *arg __rte_unused)426 { (...)494 static int495 vhost_user_start_client(struct vhost_user_socket *vsocket)496 { (...)194 static void195 vhost_user_add_connection(int fd, struct vhost_user_socket *vsocket)196 { 
vhost_user_add_connection接下来执行vhost_user_read_cb函数,其又调用vhost_user_msg_handler函数处理接收到的消息。280 static void281 vhost_user_read_cb(int connfd, void *dat, int *remove)282 {283 struct vhost_user_connection *conn = dat;284 struct vhost_user_socket *vsocket = conn->vsocket;285 int ret;286 287 ret = vhost_user_msg_handler(conn->vid, connfd);288 if (ret < 0) {289 close(connfd);290 *remove = 1;291 vhost_destroy_device(conn->vid);292 293 if (vsocket->notify_ops->destroy_connection)294 vsocket->notify_ops->destroy_connection(conn->vid);295 296 pthread_mutex_lock(&vsocket->conn_mutex);297 TAILQ_REMOVE(&vsocket->conn_list, conn, next);298 pthread_mutex_unlock(&vsocket->conn_mutex);299 300 free(conn);301 302 if (vsocket->reconnect) {303 create_unix_socket(vsocket);304 vhost_user_start_client(vsocket);305 }306 }307 } dpdk-18.08/lib/librte_vhost/vhost_user.c1548 int 1549 vhost_user_msg_handler(int vid, int fd) 1550 { 1551 struct virtio_net *dev; 1552 struct VhostUserMsg msg; 1553 struct rte_vdpa_device *vdpa_dev; 1554 int did = -1; 1555 int ret; 1556 int unlock_required = 0; 1557 uint32_t skip_master = 0; 1558 1559 dev = get_device(vid); 1560 if (dev == NULL) 1561 return -1; 1562 1563 if (!dev->notify_ops) { 1564 dev->notify_ops = vhost_driver_callback_get(dev->ifname); 1565 if (!dev->notify_ops) { 1566 RTE_LOG(ERR, VHOST_CONFIG, 1567 "failed to get callback ops for driver %s\n", 1568 dev->ifname); 1569 return -1; 1570 } 1571 } 1572 1573 ret = read_vhost_message(fd, &msg); 1574 if (ret <= 0 || msg.request.master >= VHOST_USER_MAX) { 1575 if (ret < 0) 1576 RTE_LOG(ERR, VHOST_CONFIG, 1577 "vhost read message failed\n"); 1578 else if (ret == 0) 1579 RTE_LOG(INFO, VHOST_CONFIG, 1580 "vhost peer closed\n"); 1581 else 1582 RTE_LOG(ERR, VHOST_CONFIG, 1583 "vhost read incorrect message\n"); 1584 1585 return -1; 1586 } 1587 1588 ret = 0; 1589 if (msg.request.master != VHOST_USER_IOTLB_MSG) 1590 RTE_LOG(INFO, VHOST_CONFIG, "read message %s\n", 1591 vhost_message_str[msg.request.master]); 1592 
else 1593 RTE_LOG(DEBUG, VHOST_CONFIG, "read message %s\n", 1594 vhost_message_str[msg.request.master]); 1595 1596 ret = vhost_user_check_and_alloc_queue_pair(dev, &msg); 1597 if (ret < 0) { 1598 RTE_LOG(ERR, VHOST_CONFIG, 1599 "failed to alloc queue\n"); 1600 return -1; 1601 } 1602 1603 /* 1604 * Note: we don't lock all queues on VHOST_USER_GET_VRING_BASE 1605 * and VHOST_USER_RESET_OWNER, since it is sent when virtio stops 1606 * and device is destroyed. destroy_device waits for queues to be 1607 * inactive, so it is safe. Otherwise taking the access_lock 1608 * would cause a dead lock. 1609 */ 1610 switch (msg.request.master) { 1611 case VHOST_USER_SET_FEATURES: 1612 case VHOST_USER_SET_PROTOCOL_FEATURES: 1613 case VHOST_USER_SET_OWNER: 1614 case VHOST_USER_SET_MEM_TABLE: 1615 case VHOST_USER_SET_LOG_BASE: 1616 case VHOST_USER_SET_LOG_FD: 1617 case VHOST_USER_SET_VRING_NUM: 1618 case VHOST_USER_SET_VRING_ADDR: 1619 case VHOST_USER_SET_VRING_BASE: 1620 case VHOST_USER_SET_VRING_KICK: 1621 case VHOST_USER_SET_VRING_CALL: 1622 case VHOST_USER_SET_VRING_ERR: 1623 case VHOST_USER_SET_VRING_ENABLE: 1624 case VHOST_USER_SEND_RARP: 1625 case VHOST_USER_NET_SET_MTU: 1626 case VHOST_USER_SET_SLAVE_REQ_FD: 1627 vhost_user_lock_all_queue_pairs(dev); 1628 unlock_required = 1; 1629 break; 1630 default: 1631 break; 1632 1633 } 1634 1635 if (dev->extern_ops.pre_msg_handle) { 1636 uint32_t need_reply; 1637 1638 ret = (*dev->extern_ops.pre_msg_handle)(dev->vid, 1639 (void *)&msg, &need_reply, &skip_master); 1640 if (ret < 0) 1641 goto skip_to_reply; 1642 1643 if (need_reply) 1644 send_vhost_reply(fd, &msg); 1645 1646 if (skip_master) 1647 goto skip_to_post_handle; 1648 } 1649 1650 switch (msg.request.master) { 1651 case VHOST_USER_GET_FEATURES: 1652 msg.payload.u64 = vhost_user_get_features(dev); 1653 msg.size = sizeof(msg.payload.u64); 1654 send_vhost_reply(fd, &msg); 1655 break; 1656 case VHOST_USER_SET_FEATURES: 1657 ret = vhost_user_set_features(dev, msg.payload.u64); 1658 if 
(ret) 1659 return -1; 1660 break; 1661 1662 case VHOST_USER_GET_PROTOCOL_FEATURES: 1663 vhost_user_get_protocol_features(dev, &msg); 1664 send_vhost_reply(fd, &msg); 1665 break; 1666 case VHOST_USER_SET_PROTOCOL_FEATURES: 1667 vhost_user_set_protocol_features(dev, msg.payload.u64); 1668 break; 1669 1670 case VHOST_USER_SET_OWNER: 1671 vhost_user_set_owner(); 1672 break; 1673 case VHOST_USER_RESET_OWNER: 1674 vhost_user_reset_owner(dev); 1675 break; 1676 1677 case VHOST_USER_SET_MEM_TABLE: 1678 ret = vhost_user_set_mem_table(&dev, &msg); 1679 break; 1680 1681 case VHOST_USER_SET_LOG_BASE: 1682 vhost_user_set_log_base(dev, &msg); 1683 1684 /* it needs a reply */ 1685 msg.size = sizeof(msg.payload.u64); 1686 send_vhost_reply(fd, &msg); 1687 break; 1688 case VHOST_USER_SET_LOG_FD: 1689 close(msg.fds[0]); 1690 RTE_LOG(INFO, VHOST_CONFIG, "not implemented.\n"); 1691 break; 1692 1693 case VHOST_USER_SET_VRING_NUM: 1694 vhost_user_set_vring_num(dev, &msg); 1695 break; 1696 case VHOST_USER_SET_VRING_ADDR: 1697 vhost_user_set_vring_addr(&dev, &msg); 1698 break; 1699 case VHOST_USER_SET_VRING_BASE: 1700 vhost_user_set_vring_base(dev, &msg); 1701 break; 1702 1703 case VHOST_USER_GET_VRING_BASE: 1704 vhost_user_get_vring_base(dev, &msg); 1705 msg.size = sizeof(msg.payload.state); 1706 send_vhost_reply(fd, &msg); 1707 break; 1708 1709 case VHOST_USER_SET_VRING_KICK: 1710 vhost_user_set_vring_kick(&dev, &msg); 1711 break; 1712 case VHOST_USER_SET_VRING_CALL: 1713 vhost_user_set_vring_call(dev, &msg); 1714 break; 1715 1716 case VHOST_USER_SET_VRING_ERR: 1717 if (!(msg.payload.u64 & VHOST_USER_VRING_NOFD_MASK)) 1718 close(msg.fds[0]); 1719 RTE_LOG(INFO, VHOST_CONFIG, "not implemented\n"); 1720 break; 1721 1722 case VHOST_USER_GET_QUEUE_NUM: 1723 msg.payload.u64 = (uint64_t)vhost_user_get_queue_num(dev); 1724 msg.size = sizeof(msg.payload.u64); 1725 send_vhost_reply(fd, &msg); 1726 break; 1727 1728 case VHOST_USER_SET_VRING_ENABLE: 1729 vhost_user_set_vring_enable(dev, &msg); 1730 
break; 1731 case VHOST_USER_SEND_RARP: 1732 vhost_user_send_rarp(dev, &msg); 1733 break; 1734 1735 case VHOST_USER_NET_SET_MTU: 1736 ret = vhost_user_net_set_mtu(dev, &msg); 1737 break; 1738 1739 case VHOST_USER_SET_SLAVE_REQ_FD: 1740 ret = vhost_user_set_req_fd(dev, &msg); 1741 break; 1742 1743 case VHOST_USER_IOTLB_MSG: 1744 ret = vhost_user_iotlb_msg(&dev, &msg); 1745 break; 1746 1747 default: 1748 ret = -1; 1749 break; 1750 } 1751 1752 skip_to_post_handle: 1753 if (dev->extern_ops.post_msg_handle) { 1754 uint32_t need_reply; 1755 1756 ret = (*dev->extern_ops.post_msg_handle)( 1757 dev->vid, (void *)&msg, &need_reply); 1758 if (ret < 0) 1759 goto skip_to_reply; 1760 1761 if (need_reply) 1762 send_vhost_reply(fd, &msg); 1763 } 1764 1765 skip_to_reply: 1766 if (unlock_required) 1767 vhost_user_unlock_all_queue_pairs(dev); 1768 1769 if (msg.flags & VHOST_USER_NEED_REPLY) { 1770 msg.payload.u64 = !!ret; 1771 msg.size = sizeof(msg.payload.u64); 1772 send_vhost_reply(fd, &msg); 1773 } 1774 1775 if (!(dev->flags & VIRTIO_DEV_RUNNING) && virtio_is_ready(dev)) { 1776 dev->flags |= VIRTIO_DEV_READY; 1777 1778 if (!(dev->flags & VIRTIO_DEV_RUNNING)) { 1779 if (dev->dequeue_zero_copy) { 1780 RTE_LOG(INFO, VHOST_CONFIG, 1781 "dequeue zero copy is enabled\n"); 1782 } 1783 1784 if (dev->notify_ops->new_device(dev->vid) == 0) 1785 dev->flags |= VIRTIO_DEV_RUNNING; 1786 } 1787 } 1788 1789 did = dev->vdpa_dev_id; 1790 vdpa_dev = rte_vdpa_get_device(did); 1791 if (vdpa_dev && virtio_is_ready(dev) && 1792 !(dev->flags & VIRTIO_DEV_VDPA_CONFIGURED) && 1793 msg.request.master == VHOST_USER_SET_VRING_ENABLE) { 1794 if (vdpa_dev->ops->dev_conf) 1795 vdpa_dev->ops->dev_conf(dev->vid); 1796 dev->flags |= VIRTIO_DEV_VDPA_CONFIGURED; 1797 if (vhost_user_host_notifier_ctrl(dev->vid, true) != 0) { 1798 RTE_LOG(INFO, VHOST_CONFIG, 1799 "(%d) software relay is used for vDPA, performance may be low.\n", 1800 dev->vid); 1801 } 1802 } 1803 1804 return 0; 1805 } virtio告知DPDK共享内存的virtio queues内存地址 
DPDK使用函数vhost_user_set_vring_addr将virtio的描述符、已用环和可用环地址转化为DPDK自身的地址空间。dpdk-18.08/lib/librte_vhost/vhost_user.c607 /*608 * The virtio device sends us the desc, used and avail ring addresses.609 * This function then converts these to our address space.610 */611 static int612 vhost_user_set_vring_addr(struct virtio_net **pdev, VhostUserMsg *msg)613 { 614 struct vhost_virtqueue *vq;615 struct vhost_vring_addr *addr = &msg->payload.addr;616 struct virtio_net *dev = *pdev;617 618 if (dev->mem == NULL)619 return -1;620 621 /* addr->index refers to the queue index. The txq 1, rxq is 0. */622 vq = dev->virtqueue[msg->payload.addr.index];623 624 /*625 * Rings addresses should not be interpreted as long as the ring is not626 * started and enabled627 */628 memcpy(&vq->ring_addrs, addr, sizeof(*addr));629 630 vring_invalidate(dev, vq);631 632 if (vq->enabled && (dev->features &633 (1ULL << VHOST_USER_F_PROTOCOL_FEATURES))) {634 dev = translate_ring_addresses(dev, msg->payload.addr.index);635 if (!dev)636 return -1;637 638 *pdev = dev;639 }640 641 return 0;642 } 只有在通过控制通道vhu套接口接收到VHOST_USER_SET_VRING_ADDR类型消息时,设置内存地址。dpdk-18.08/lib/librte_vhost/vhost_user.c1548 int 1549 vhost_user_msg_handler(int vid, int fd) 1550 { (...) 1650 switch (msg.request.master) { (...) 1696 case VHOST_USER_SET_VRING_ADDR: 1697 vhost_user_set_vring_addr(&dev, &msg); 1698 break; 实际上,QEMU中有一个与DPDK的消息处理函数类型的处理函数。 qemu-3.0.0/contrib/libvhost-user/libvhost-user.c1218 static bool 1219 vu_process_message(VuDev *dev, VhostUserMsg *vmsg) 1220 { (...) 1244 switch (vmsg->request) { (...) 
1265 case VHOST_USER_SET_VRING_ADDR: 1266 return vu_set_vring_addr_exec(dev, vmsg); (...)显然,QEMU中需要有函数通过UNIX套接口发送内存地址信息到DPDK中。 qemu-3.0.0/hw/virtio/vhost-user.c588 static int vhost_user_set_vring_addr(struct vhost_dev *dev,589 struct vhost_vring_addr *addr)590 { 591 VhostUserMsg msg = {592 .hdr.request = VHOST_USER_SET_VRING_ADDR,593 .hdr.flags = VHOST_USER_VERSION,594 .payload.addr = *addr,595 .hdr.size = sizeof(msg.payload.addr),596 };597 598 if (vhost_user_write(dev, &msg, NULL, 0) < 0) {599 return -1;600 }601 602 return 0;603 } OVS DPDK发送数据包到客户机与发送丢包 OVS DPDK中向客户机发送数据包的函数为__netdev_dpdk_vhost_send,位于文件openvswitch-2.9.2/lib/netdev-dpdk.c。OVS发送程序,在空间用完后,仍会尝试发送VHOST_ENQ_RETRY_NUM (默认8)次。如果在第一次尝试发送中,没有任何数据包发送成功(无数据包写入共享内存的环中),或者超过了VHOST_ENQ_RETRY_NUM宏限定的次数,剩余的数据包将被丢弃(批量发送最大可由32个数据包组成)。2072 do { 2073 int vhost_qid = qid * VIRTIO_QNUM + VIRTIO_RXQ; 2074 unsigned int tx_pkts; 2075 2076 tx_pkts = rte_vhost_enqueue_burst(vid, vhost_qid, cur_pkts, cnt); 2077 if (OVS_LIKELY(tx_pkts)) { 2078 /* Packets have been sent.*/ 2079 cnt -= tx_pkts; 2080 /* Prepare for possible retry.*/ 2081 cur_pkts = &cur_pkts[tx_pkts]; 2082 } else { 2083 /* No packets sent - do not retry.*/ 2084 break; 2085 } 2086 } while (cnt && (retries++ <= VHOST_ENQ_RETRY_NUM)); 2087 (...) 
2094 2095 out: 2096 for (i = 0; i < total_pkts - dropped; i++) { 2097 dp_packet_delete(pkts[i]); 2098 } 客户机接收中断处理 当OVS DPDK将新的数据包填入virtio环中时,有以下两种情形:客户机没有在轮询其队列,需要告知其新数据包的到达; 客户机正在轮询队列,不需要告知新数据包的到达。 如果客户机使用Linux内核网络协议栈,内核中负责接收报文的NAPI机制混合使用中断和轮询模式。客户机OS开始工作在中断模式,一直到第一个中断进来。此时,CPU快速响应中断,调度内核软中断ksoftirqd线程处理,同时禁止后续中断。ksoftirqd运行时,尝试处理尽可能多的数据包,但是不能超出netdev_budget限定的数量。如果队列中还有更多的数据包,ksoftirqd线程将重新调度自身,继续处理数据包,直到没有可用的数据包为止。此过程中一直是轮询处理,中断处于关闭状态。处理完数据包之后,ksoftirqd线程停止轮询,重新打开中断,等待下一个数据包到来的中断发生。当客户机轮询时,CPU的caches高速缓存利用率非常高,避免了额外的延时。宿主机和客户机中合适的进程在运行,进一步降低了延时。另外的,宿主机发送中断IRQ到客户机时,需要对UNIX套接口写操作(系统调用),非常耗时,增加了额外的延时和开销。作为NFV应用的一部分,客户机中运行DPDK的优势在于其PMD驱动处理流量的方式。PMD驱动工作在轮询模式,关闭了系统中断,OVS DPDK不再需要给客户机发送中断通知。OVS DPDK节省了写UNIX套接口的操作,不在需要执行内核系统调用。OVS DPDK得以一直运行在用户空间,客户机也可以省去处理由控制通道而来的中断,快速运行。
(免费订阅,永久学习)学习地址: Dpdk/网络协议栈/vpp/OvS/DDos/NFV/虚拟化/高性能专家-学习视频教程-腾讯课堂
更多DPDK相关学习资料有需要的可以自行报名学习,免费订阅,永久学习,或点击这里加qun免费
领取,关注我持续更新哦! ! 如果没有设置VRING_AVAIL_F_NO_INTERRUPT标志,表明客户机可以接收中断。到客户机的中断通过callfd和操作系统的eventfd组件实现。 客户机的OS可以启用或禁用中断。当客户机禁用virtio接口的中断时,virtio-net驱动通过宏VRING_AVAIL_F_NO_INTERRUPT实现。此宏在DPDK和QEMU中都有定义: [root@overcloud-0 SOURCES]# grep VRING_AVAIL_F_NO_INTERRUPT -R | grep def dpdk-18.08/drivers/net/virtio/virtio_ring.h:#define VRING_AVAIL_F_NO_INTERRUPT 1 dpdk-18.08/drivers/crypto/virtio/virtio_ring.h:#define VRING_AVAIL_F_NO_INTERRUPT 1 [root@overcloud-0 qemu]# grep AVAIL_F_NO_INTERRUPT -R -i | grep def qemu-3.0.0/include/standard-headers/linux/virtio_ring.h:#define VRING_AVAIL_F_NO_INTERRUPT 1 qemu-3.0.0/roms/seabios/src/hw/virtio-ring.h:#define VRING_AVAIL_F_NO_INTERRUPT 1 qemu-3.0.0/roms/ipxe/src/include/ipxe/virtio-ring.h:#define VRING_AVAIL_F_NO_INTERRUPT 1 qemu-3.0.0/roms/seabios-hppa/src/hw/virtio-ring.h:#define VRING_AVAIL_F_NO_INTERRUPT 1 qemu-3.0.0/roms/SLOF/lib/libvirtio/virtio.h:#define VRING_AVAIL_F_NO_INTERRUPT 1 一旦vq->avail->flags中的VRING_AVAIL_F_NO_INTERRUPT标志位设置,指示DPDK不要发送中断到客户机。dpdk-18.08/lib/librte_vhost/vhost.h 666 static __rte_always_inline void 667 vhost_vring_call_split(struct virtio_net *dev, struct vhost_virtqueue *vq) 668 { 669 /* Flush used->idx update before we read avail->flags. */ 670 rte_smp_mb(); 671 672 /* Don't kick guest if we don't reach index specified by guest. */ 673 if (dev->features & (1ULL << VIRTIO_RING_F_EVENT_IDX)) { 674 uint16_t old = vq->signalled_used; 675 uint16_t new = vq->last_used_idx; 676 677 VHOST_LOG_DEBUG(VHOST_DATA, "%s: used_event_idx=%d, old=%d, new=%d\n", 678 __func__, 679 vhost_used_event(vq), 680 old, new); 681 if (vhost_need_event(vhost_used_event(vq), new, old) 682 && (vq->callfd >= 0)) { 683 vq->signalled_used = vq->last_used_idx; 684 eventfd_write(vq->callfd, (eventfd_t) 1); 685 } 686 } else { 687 /* Kick the guest if necessary. 
*/ 688 if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT) 689 && (vq->callfd >= 0)) 690 eventfd_write(vq->callfd, (eventfd_t)1); 691 } 692 } 如前所述,PMD驱动不需要执行写UNIX套接口的系统调用了。 OVS DPDK发送数据包到客户机-代码详情 2044 static void 2045 __netdev_dpdk_vhost_send(struct netdev *netdev, int qid, 2046 struct dp_packet **pkts, int cnt) 2047 { 2048 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev); 2049 struct rte_mbuf **cur_pkts = (struct rte_mbuf **) pkts; 2050 unsigned int total_pkts = cnt; 2051 unsigned int dropped = 0; 2052 int i, retries = 0; 2053 int vid = netdev_dpdk_get_vid(dev); 2054 2055 qid = dev->tx_q[qid % netdev->n_txq].map; 2056 2057 if (OVS_UNLIKELY(vid < 0 || !dev->vhost_reconfigured || qid < 0 2058 || !(dev->flags & NETDEV_UP))) { 2059 rte_spinlock_lock(&dev->stats_lock); 2060 dev->stats.tx_dropped+= cnt; 2061 rte_spinlock_unlock(&dev->stats_lock); 2062 goto out; 2063 } 2064 2065 rte_spinlock_lock(&dev->tx_q[qid].tx_lock); 2066 2067 cnt = netdev_dpdk_filter_packet_len(dev, cur_pkts, cnt); 2068 /* Check has QoS has been configured for the netdev */ 2069 cnt = netdev_dpdk_qos_run(dev, cur_pkts, cnt, true); 2070 dropped = total_pkts - cnt; 2071 2072 do { 2073 int vhost_qid = qid * VIRTIO_QNUM + VIRTIO_RXQ; 2074 unsigned int tx_pkts; 2075 2076 tx_pkts = rte_vhost_enqueue_burst(vid, vhost_qid, cur_pkts, cnt); 2077 if (OVS_LIKELY(tx_pkts)) { 2078 /* Packets have been sent.*/ 2079 cnt -= tx_pkts; 2080 /* Prepare for possible retry.*/ 2081 cur_pkts = &cur_pkts[tx_pkts]; 2082 } else { 2083 /* No packets sent - do not retry.*/ 2084 break; 2085 } 2086 } while (cnt && (retries++ <= VHOST_ENQ_RETRY_NUM)); 2087 2088 rte_spinlock_unlock(&dev->tx_q[qid].tx_lock); 2089 2090 rte_spinlock_lock(&dev->stats_lock); 2091 netdev_dpdk_vhost_update_tx_counters(&dev->stats, pkts, total_pkts, 2092 cnt + dropped); 2093 rte_spinlock_unlock(&dev->stats_lock); 2094 2095 out: 2096 for (i = 0; i < total_pkts - dropped; i++) { 2097 dp_packet_delete(pkts[i]); 2098 } 2099 } 
rte_vhost_enqueue_burst函数来自于DPDK的vhost库。 [root@overcloud-0 src]# grep rte_vhost_enqueue_burst dpdk-16.08/ -R dpdk-18.08/examples/vhost/main.c: ret = rte_vhost_enqueue_burst(dst_vdev->vid, VIRTIO_RXQ, &m, 1); dpdk-18.08/examples/vhost/main.c: enqueue_count = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ, dpdk-18.08/examples/tep_termination/vxlan_setup.c: ret = rte_vhost_enqueue_burst(vid, VIRTIO_RXQ, pkts_valid, count); dpdk-18.08/tags:rte_vhost_enqueue_burst lib/librte_vhost/virtio_net.c /^rte_vhost_enqueue_burst(int vid, uint16_t queue_id,$/;" f dpdk-18.08/lib/librte_vhost/rte_vhost_version.map: rte_vhost_enqueue_burst; dpdk-18.08/lib/librte_vhost/rte_vhost.h:uint16_t rte_vhost_enqueue_burst(int vid, uint16_t queue_id, dpdk-18.08/lib/librte_vhost/virtio_net.c:rte_vhost_enqueue_burst(int vid, uint16_t queue_id, dpdk-18.08/drivers/net/vhost/rte_eth_vhost.c: nb_pkts = rte_vhost_enqueue_burst(r->vid, r->virtqueue_id, dpdk-18.08/doc/guides/prog_guide/vhost_lib.rst:* ``rte_vhost_enqueue_burst(vid, queue_id, pkts, count)`` dpdk-18.08/doc/guides/rel_notes/release_16_07.rst:* The function ``rte_vhost_enqueue_burst`` no longer supports concurrent enqueuing dpdk-18.08/lib/librte_vhost/rte_vhost.h 492 /** 493 * This function adds buffers to the virtio devices RX virtqueue. Buffers can 494 * be received from the physical port or from another virtual device. A packet 495 * count is returned to indicate the number of packets that were successfully 496 * added to the RX queue. 
497 * @param vid 498 * vhost device ID 499 * @param queue_id 500 * virtio queue index in mq case 501 * @param pkts 502 * array to contain packets to be enqueued 503 * @param count 504 * packets num to be enqueued 505 * @return 506 * num of packets enqueued 507 */ 508 uint16_t rte_vhost_enqueue_burst(int vid, uint16_t queue_id, 509 struct rte_mbuf **pkts, uint16_t count); dpdk-18.08/lib/librte_vhost/virtio_net.c 932 uint16_t 933 rte_vhost_enqueue_burst(int vid, uint16_t queue_id, 934 struct rte_mbuf **pkts, uint16_t count) 935 { 936 struct virtio_net *dev = get_device(vid); 937 938 if (!dev) 939 return 0; 940 941 if (unlikely(!(dev->flags & VIRTIO_DEV_BUILTIN_VIRTIO_NET))) { 942 RTE_LOG(ERR, VHOST_DATA, 943 "(%d) %s: built-in vhost net backend is disabled.\n", 944 dev->vid, __func__); 945 return 0; 946 } 947 948 return virtio_dev_rx(dev, queue_id, pkts, count); 949 } virtio_dev_rx_packed函数和virtio_dev_rx_split函数都将数据包发送到客户机,并根据设置决定是否发送中断通知(write系统调用)。 dpdk-18.08/lib/librte_vhost/virtio_net.c 886 static __rte_always_inline uint32_t 887 virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id, 888 struct rte_mbuf **pkts, uint32_t count) 889 { 890 struct vhost_virtqueue *vq; (...) 917 if (vq_is_packed(dev)) 918 count = virtio_dev_rx_packed(dev, vq, pkts, count); 919 else 920 count = virtio_dev_rx_split(dev, vq, pkts, count); (...) 在virtio_dev_rx函数中: 913 count = RTE_MIN((uint32_t)MAX_PKT_BURST, count); 914 if (count == 0) 915 goto out; 发送数据包数量设置为MAX_PKT_BURST宏与空闲项数量(count)两者中的较小值。 最后,根据发送的数据包数量增加已用索引的值。 833 static __rte_always_inline uint32_t 834 virtio_dev_rx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, 835 struct rte_mbuf **pkts, uint32_t count) 836 { (...) 841 for (pkt_idx = 0; pkt_idx < count; pkt_idx++) { (...) 
862 if (copy_mbuf_to_desc(dev, vq, pkts[pkt_idx], 863 buf_vec, nr_vec, 864 num_buffers) < 0) { 865 vq->shadow_used_idx -= num_buffers; 866 break; 867 } 868 869 vq->last_avail_idx += nr_descs; 870 if (vq->last_avail_idx >= vq->size) { 871 vq->last_avail_idx -= vq->size; 872 vq->avail_wrap_counter ^= 1; 873 } 781 static __rte_always_inline uint32_t 782 virtio_dev_rx_split(struct virtio_net *dev, struct vhost_virtqueue *vq, 783 struct rte_mbuf **pkts, uint32_t count) 784 { 793 for (pkt_idx = 0; pkt_idx < count; pkt_idx++) { 794 uint32_t pkt_len = pkts[pkt_idx]->pkt_len + dev->vhost_hlen; 795 uint16_t nr_vec = 0; (...) 813 if (copy_mbuf_to_desc(dev, vq, pkts[pkt_idx], 814 buf_vec, nr_vec, 815 num_buffers) < 0) { 816 vq->shadow_used_idx -= num_buffers; 817 break; 818 } 819 820 vq->last_avail_idx += num_buffers; 821 } 数据包通过函数copy_mbuf_to_desc拷贝到客户机的内存中。最后,根据配置决定是否发送中断通知,参见函数vhost_vring_call_split和vhost_vring_call_packed。 666 static __rte_always_inline void 667 vhost_vring_call_split(struct virtio_net *dev, struct vhost_virtqueue *vq) 668 { (...) 672 /* Don't kick guest if we don't reach index specified by guest. */ 673 if (dev->features & (1ULL << VIRTIO_RING_F_EVENT_IDX)) { 674 uint16_t old = vq->signalled_used; 675 uint16_t new = vq->last_used_idx; (...) 681 if (vhost_need_event(vhost_used_event(vq), new, old) 682 && (vq->callfd >= 0)) { 683 vq->signalled_used = vq->last_used_idx; 684 eventfd_write(vq->callfd, (eventfd_t) 1); 685 } 686 } else { 687 /* Kick the guest if necessary. */ 688 if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT) 689 && (vq->callfd >= 0)) 690 eventfd_write(vq->callfd, (eventfd_t)1); 691 } 692 } 694 static __rte_always_inline void 695 vhost_vring_call_packed(struct virtio_net *dev, struct vhost_virtqueue *vq) 696 { (...) 703 if (!(dev->features & (1ULL << VIRTIO_RING_F_EVENT_IDX))) { 704 if (vq->driver_event->flags != 705 VRING_EVENT_F_DISABLE) 706 kick = true; 707 goto kick; 708 } (...) 
740 kick: 741 if (kick) 742 eventfd_write(vq->callfd, (eventfd_t)1); 743 }
A detailed view of the vhost user protocol and its implementation in OVS DPDK, qemu and virtio-net
所有的控制信息通过UNIX套接口(控制通道)交互。包括为进行直接内存访问而交换的内存映射信息,以及当数据填入virtio队列后需要触发的kick事件和中断信息。在Neutron中此UNIX套接口命名为vhuxxxxxxxx-xx;
数据通道事实上由内存直接访问实现。客户机中的virtio-net驱动分配一部分内存用于virtio的队列。virtio标准定义了此队列的结构。QEMU通过控制通道将此部分内存的地址共享给OVS DPDK。DPDK自身映射一个相同标准的virtio队列结构到此内存上,藉此来读写客户机巨页内存中的virtio队列。直接内存访问的实现需要在OVS DPDK和QEMU之间使用巨页内存。如果QEMU设置正确,但是没有配置巨页内存,OVS DPDK将不能访问QEMU的内存,二者也就不能交换数据报文。如果用户忘记了请求客户机巨页内存,nova将通过元数据通知用户。
当OVS DPDK向客户机发送数据包时,这些数据包在OVS DPDK的统计里面显示为接口vhuxxxxxxxx-xx的发送Tx流量。在客户机中,显示为接收Rx流量。
当客户机向OVS DPDK发送数据包时,这些数据包在客户机中显示为发送Tx流量,而在OVS DPDK中显示为接口vhuxxxxxxxx-xx的接收Rx流量。
客户机并没有硬件的统计计数。ethtool工具的-s选项未实现。所有的底层统计计数只能使用OVS的命令显示(ovs-vsctl get interface vhuxxxxxxxx-xx statistics),因此显示的数据都是基于OVS DPDK的视角。
虽然数据包可通过共享内存传输,但是还需要一种方法告知对端数据包已经拷贝到virtio队列中。通过vhost user套接口vhuxxxxxxxx-xx实现的控制通道可用来完成通知(kicking)对方的功能。通知必然有代价。首先,需要一个写套接口的系统调用;之后对端需要处理一个中断操作。所以,接收双方都会在控制通道上消耗时间。
为避免控制通道的通知消耗,OpenvSwitch和QEMU都可以设置特殊标志以告知对方其不愿接收中断。尽管如此,只有在采用临时或者固定查询virtio队列方式时才能使用不接收中断的功能。
为客户机的性能考虑其本身可采用DPDK处理数据包。尽管Linux内核采用轮询处理和中断相结合的NAPI机制,但是产生的中断数量仍然很多。OVS DPDK以非常高的速率发送数据包到客户机。同时,QEMU的virtio队列的收发缓存数被限制在了默认的256与最大1024之间。结果,客户机必须以非常快的速度处理数据包。理想的实现就是使用DPDK的PMD驱动不停的轮询客户机端口进行数据包处理。
vhost user协议标准
参见QEMU代码库中文档:https://github.com/qemu/qemu/blob/master/docs/interop/vhost-user.txt
Vhost-user协议 ===================Copyright (c) 2014 Virtual Open Systems Sarl.This work is licensed under the terms of the GNU GPL, version 2 or later. See the COPYING file in the top-level directory. ===================此协议旨在补充实现在Linux内核中的vhost的ioctl接口。实现了与同一宿主机中的用户进程交互建立virtqueue队列的控制平面。通过UNIX套接口消息中的附加数据字段来共享文件描述符。协议定义了通信的两端:主和从。主是要共享其virtqueues队列的进程,即QEMU。从为virtqueues队列的消费者。当前实现中QEMU作为主,从为运行在用户空间的软件交换机,如Snabbswitch。主和从在通信时都可以作为客户端(主动连接)或者服务端(监听)。
vhost user协议由两方组成:
- 主方 - QEMU
- 从方 - Open vSwitch或者其它软件交换机
vhost user各方都可运行在2种模式下:
- vhostuser-client - QEMU作为服务端,软件交换机作为客户端
- vhostuser - 软件交换机作为服务端,QEMU作为客户端。
vhost user实现基于内核的vhost架构,将所有特性实现在用户空间。
当QEMU客户机启动时,它将所有的客户机内存分配为共享的巨页内存。其操作系统的半虚拟化驱动virtio将保留这些巨页内存的一部分用作virtio环形缓存。这样OVS DPDK将可以直接读写客户机的virtio环形缓存。OVS DPDK和QEMU可通过此保留的内存空间交换网络数据包。
用户空间进程接收到客户机预先分配的共享内存文件描述符后,可直接存取与之关联的客户机内存空间中的vrings环结构。 (http://www.virtualopensystems.com/en/solutions/guides/snabbswitch-qemu/).
参见以下的VM虚拟机,模式为vhostuser:
$ /usr/libexec/qemu-kvm -name guest=instance-00000028,debug-threads=on -S -object secret,id=masterKey0,format=raw,file=/var/lib/libvirt/qemu/domain-58-instance-00000028/master-key.aes -machine pc-i440fx-rhel7.4.0,accel=kvm,usb=off,dump-guest-core=off -cpu Skylake-Client,ss=on,hypervisor=on,tsc_adjust=on,pdpe1gb=on,mpx=off,xsavec=off,xgetbv1=off -m 2048 -realtime mlock=off -smp 8,sockets=4,cores=1,threads=2 -object memory-backend-file,id=ram-node0,prealloc=yes,mem-path=/dev/hugepages/libvirt/qemu/58-instance-00000028,share=yes,size=1073741824,host-nodes=0,policy=bind -numa node,nodeid=0,cpus=0-3,memdev=ram-node0 -object memory-backend-file,id=ram-node1,prealloc=yes,mem-path=/dev/hugepages/libvirt/qemu/58-instance-00000028,share=yes,size=1073741824,host-nodes=1,policy=bind -numa node,nodeid=1,cpus=4-7,memdev=ram-node1 -uuid 48888226-7b6b-415c-bcf7-b278ba0bca62 -smbios type=1,manufacturer=Red Hat,product=OpenStack Compute,version=14.1.0-3.el7ost,serial=3d5e138a-8193-41e4-ac95-de9bfc1a3ef1,uuid=48888226-7b6b-415c-bcf7-b278ba0bca62,family=Virtual Machine -no-user-config -nodefaults -chardev socket,id=charmonitor,path=/var/lib/libvirt/qemu/domain-58-instance-00000028/monitor.sock,server,nowait -mon chardev=charmonitor,id=monitor,mode=control -rtc base=utc,driftfix=slew -global kvm-pit.lost_tick_policy=delay \ -no-hpet -no-shutdown -boot strict=on -device piix3-usb-uhci,id=usb,bus=pci.0,addr=0x1.0x2 -drive file=/var/lib/nova/instances/48888226-7b6b-415c-bcf7-b278ba0bca62/disk,format=qcow2,if=none,id=drive-virtio-disk0,cache=none -device virtio-blk-pci,scsi=off,bus=pci.0,addr=0x4,drive=drive-virtio-disk0,id=virtio-disk0,bootindex=1 -chardev socket,id=charnet0,path=/var/run/openvswitch/vhuc26fd3c6-4b -netdev vhost-user,chardev=charnet0,queues=8,id=hostnet0 \ -device virtio-net-pci,mq=on,vectors=18,netdev=hostnet0,id=net0,mac=fa:16:3e:52:30:73,bus=pci.0,addr=0x3 -add-fd set=0,fd=33 -chardev file,id=charserial0,path=/dev/fdset/0,append=on \ -device 
isa-serial,chardev=charserial0,id=serial0 -chardev pty,id=charserial1 \ -device isa-serial,chardev=charserial1,id=serial1 \ -device usb-tablet,id=input0,bus=usb.0,port=1 -vnc 172.16.2.10:1 -k en-us \ -device cirrus-vga,id=video0,bus=pci.0,addr=0x2 \
指定QEMU从巨页池中分配内存,并设置为共享内存。
-object memory-backend-file,id=ram-node0,prealloc=yes,mem-path=/dev/hugepages/libvirt/qemu/58-instance-00000028,share=yes,size=1073741824,host-nodes=0,policy=bind -numa node,nodeid=0,cpus=0-3,memdev=ram-node0 \ -object memory-backend-file,id=ram-node1,prealloc=yes,mem-path=/dev/hugepages/libvirt/qemu/58-instance-00000028,share=yes,size=1073741824,host-nodes=1,policy=bind
尽管如此,简单的拷贝数据包到对方的缓存中还不足够。另外,vhost user协议使用一个UNIX套接口(vhu[a-f0-9-])处理vswitch和QEMU之间的通信,包括在初始化过程中,和数据包拷贝到共享内存的virtio环中需要通知对方时。所以两者的交互包括基于控制通道(vhu)的创建操作和通知机制,与拷贝数据包的数据通道(直接内存访问)。
所述virtio机制要能工作,我们需要建立一个接口来初始化共享内存区域和交换event事件描述符。UNIX套接口提供的API接口可实现此要求。此套接口可用于初始化用户空间virtio传输(vhost-user),特别是:* 初始化时确定Vrings,并且放入两个进程间的共享内存中; * 使用eventfd映射到Vring事件。这样就可与QEMU/KVM中的实现相兼容,KVM可以关联客户机系统中virtio_pci驱动所触发事件与宿主机的eventfd(ioventfd和irqfd)文件描述符。在两个进程间共享文件描述符与在一个进程和内核之间共享不相同。前者需要在UNIX套接口的sendmsg系统调用中设置SCM_RIGHTS标志。
(http://www.virtualopensystems.com/en/solutions/guides/snabbswitch-qemu/)
vhostuser模式下,OVS创建vhu套接口,QEMU主动进行连接。vhostuser client模式下,QEMU创建vhu套接口,OVS进行连接。
在上面创建的vhostuser模式客户机实例中,指示QEMU连接一个类型为vhost-user的netdev到套接口/var/run/openvswitch/vhuc26fd3c6-4b:
-chardev socket,id=charnet0,path=/var/run/openvswitch/vhuc26fd3c6-4b \ -netdev vhost-user,chardev=charnet0,queues=8,id=hostnet0 \ -device virtio-net-pci,mq=on,vectors=18, \ netdev=hostnet0,id=net0,mac=fa:16:3e:52:30:73,bus=pci.0,addr=0x3
使用lsof命令显示此套接口为OVS所创建:
[root@overcloud-0 ~]# lsof -nn | grep vhuc26fd3c6-4b | awk '{print $1}' | uniq
当一方拷贝一个数据报文到共享内存的virtio环中时,另一方有两种选择:
- 轮询队列(例如 Linux 内核的 NAPI 机制或 DPDK 的 PMD 驱动),不需要通知就可取得新的数据报文;
- 不轮询队列,则必须得到新报文到达的通知。
针对第二种情况,可通过独立的vhu套接口控制通道发送通知到客户机。通过交换eventfd文件描述符数据,控制通道可在用户空间实现中断。套接口的写操作要求系统调用,必将引起PMDs花费时间在内核空间。客户机可通过设置VRING_AVAIL_F_NO_INTERRUPT标志关闭控制通道中断通知。否则,当Open vSwitch往virtio环中填入新数据包时,将发送中断通知到客户机。
详情可参见此博客文章:http://blog.vmsplice.net/2011/09/qemu-internals-vhost-architecture.html
用户空间的vhost接口vhost架构的一个惊人的特性是其并没有绑定在KVM上。其仅是一个用户空间接口并不依赖于KVM内核模块。这意味着其它的用户空间程序, 比如libpcap,如果要获得高性能I/O接口,理论上也可以使用vhost设备。当客户机通知宿主机其已在virtqueue中填入了数据时,需要通知vhost的工作进程有数据要进行处理(对于内核的virtio-net驱动, vhost工作进程为一个内核线程,名称为vhost-$pid,其中pid为QEMU的进程号)。既然vhost不依赖于KVM内核模块,二者就不能直接通信。 所以vhost实例创建了一个eventfd文件描述符,提供给vhost工作进程去监听。KVM内核模块的ioeventfd特性可将一个eventfd文件 描述符关联到一个特殊的客户机I/O操作上。QEMU用户空间在硬件寄存器VIRTIO_PCI_QUEUE_NOTIFY的I/O访问上注册了virtqueue的通知ioeventfd。 当客户机写VIRTIO_PCI_QUEUE_NOTIFY寄存器时将会发送virtqueue队列通知,vhost工作进程将接收到KVM内核模块通过ioeventfd发来的通知。在vhost工作进程需要发送中断到客户机的反向路径上使用相同的方式。vhost通过写一个“call”文件描述符去通知客户机。 KVM内核模块的另一个特性irqfd中断描述符可使eventfd出发客户机中断。QEMU用户空间为virtio的PCI设备中断注册了一个irqfd文件描述符, 并将此irqfd交于vhost实例。vhost工作进程即可通过此“call”文件描述符去中断客户机。最终,vhost实例仅了解到客户机的内存映射、kick通知eventfd文件描述符和call中断文件描述符。更多细节,参考Linux内核中内核相关代码:drivers/vhost/vhost.c - 通用vhost驱动代码drivers/vhost/net.c - vhost-net网络设备驱动代码virt/kvm/eventfd.c - ioeventfd事件和irqfd中断文件描述符实现QEMU初始化vhost实例的用户空间代码:hw/vhost.c - 通用vhost初始化代码hw/vhost_net.c - vhost-net网络设备初始化代码
数据通道-直接内存访问
virtqueue的内存映射
virtio官方标准定义了virtqueue的结构。
2.4 Virtqueues:virtio设备的批量数据传输机制命名为virtqueue虚拟队列。每个设备可以有多个virtqueues,也可以没有virtqueue队列。16位的队列大小参数指定了队列内成员的数量,也限定了队列的总大小。每个virtqueue队列由三个部分组成:Descriptor Table - 描述符表;Available Ring - 可用环;Used Ring - 已用环。
http://docs.oasis-open.org/virtio/virtio/v1.0/virtio-v1.0.html
virtio标准精确地定义了描述符表、可用环和已用环的结构。例如,可用环的定义:
2.4.6 virtqueue可用环结构

struct virtq_avail {
#define VIRTQ_AVAIL_F_NO_INTERRUPT 1
        le16 flags;
        le16 idx;
        le16 ring[ /* Queue Size */ ];
        le16 used_event; /* Only if VIRTIO_F_EVENT_IDX */
};

驱动程序使用可用环提供发送缓存给设备。其中每个环项指向一个描述符链的开头。可用环只能由驱动程序写,由设备读。idx成员指示驱动程序将下一个描述符入口项放在了ring成员的哪个位置(不超过队列长度)。其从0开始增加。 传统的标准[Virtio PCI Draft]将此结构定义为vring_avail,将宏定义命名为VRING_AVAIL_F_NO_INTERRUPT,但是本质结构都还是相同的。
http://docs.oasis-open.org/virtio/virtio/v1.0/virtio-v1.0.html
DPDK的virtio标准实现代码,其也是使用传统virtio标准中的结构定义:
dpdk-18.08/drivers/net/virtio/virtio_ring.h
48 /* The Host uses this in used->flags to advise the Guest: don't kick me49 * when you add a buffer. It's unreliable, so it's simply an50 * optimization. Guest will still kick if it's out of buffers. */51 #define VRING_USED_F_NO_NOTIFY 152 /* The Guest uses this in avail->flags to advise the Host: don't53 * interrupt me when you consume a buffer. It's unreliable, so it's54 * simply an optimization. */55 #define VRING_AVAIL_F_NO_INTERRUPT 15657 /* VirtIO ring descriptors: 16 bytes.58 * These can chain together via "next". */59 struct vring_desc {60 uint64_t addr; /* Address (guest-physical). */61 uint32_t len; /* Length. */62 uint16_t flags; /* The flags as indicated above. */63 uint16_t next; /* We chain unused descriptors via this. */64 };6566 struct vring_avail {67 uint16_t flags;68 uint16_t idx;69 uint16_t ring[0];70 };7172 /* id is a 16bit index. uint32_t is used here for ids for padding reasons. */73 struct vring_used_elem {74 /* Index of start of used descriptor chain. */75 uint32_t id;76 /* Total length of the descriptor chain which was written to. */77 uint32_t len;78 };7980 struct vring_used {81 uint16_t flags;82 volatile uint16_t idx;83 struct vring_used_elem ring[0];84 };8586 struct vring {87 unsigned int num;88 struct vring_desc *desc;89 struct vring_avail *avail;90 struct vring_used *used;91 };
dpdk-18.08/lib/librte_vhost/vhost.h
90 /**91 * Structure contains variables relevant to RX/TX virtqueues.92 */93 struct vhost_virtqueue {94 union {95 struct vring_desc *desc;96 struct vring_packed_desc *desc_packed;97 };98 union {99 struct vring_avail *avail; 100 struct vring_packed_desc_event *driver_event; 101 }; 102 union { 103 struct vring_used *used; 104 struct vring_packed_desc_event *device_event; 105 }; 106 uint32_t size; 107 108 uint16_t last_avail_idx; 109 uint16_t last_used_idx; 110 /* Last used index we notify to front end. */ 111 uint16_t signalled_used; 112 bool signalled_used_valid; 113 #define VIRTIO_INVALID_EVENTFD (-1) 114 #define VIRTIO_UNINITIALIZED_EVENTFD (-2) 115 116 /* Backend value to determine if device should started/stopped */ 117 int backend; 118 int enabled; 119 int access_ok; 120 rte_spinlock_t access_lock; 121 122 /* Used to notify the guest (trigger interrupt) */ 123 int callfd; 124 /* Currently unused as polling mode is enabled */ 125 int kickfd; 126 127 /* Physical address of used ring, for logging */ 128 uint64_t log_guest_addr; 129 130 uint16_t nr_zmbuf; 131 uint16_t zmbuf_size; 132 uint16_t last_zmbuf_idx; 133 struct zcopy_mbuf *zmbufs; 134 struct zcopy_mbuf_list zmbuf_list; 135 136 union { 137 struct vring_used_elem *shadow_used_split; 138 struct vring_used_elem_packed *shadow_used_packed; 139 }; 140 uint16_t shadow_used_idx; 141 struct vhost_vring_addr ring_addrs; 142 143 struct batch_copy_elem *batch_copy_elems; 144 uint16_t batch_copy_nb_elems; 145 bool used_wrap_counter; 146 bool avail_wrap_counter; 147 148 struct log_cache_entry log_cache[VHOST_LOG_CACHE_NR]; 149 uint16_t log_cache_nb_elem; 150 151 rte_rwlock_t iotlb_lock; 152 rte_rwlock_t iotlb_pending_lock; 153 struct rte_mempool *iotlb_pool; 154 TAILQ_HEAD(, vhost_iotlb_entry) iotlb_list; 155 int iotlb_cache_nr; 156 TAILQ_HEAD(, vhost_iotlb_entry) iotlb_pending_list; 157 } __rte_cache_aligned;
内存映射完成之后,DPDK就可像客户机的virtio-net驱动一样直接操作其共享内存中的同一结构了。
控制通道-UNIX套接口
QEMU与DPDK通过vhost user套接口交换消息。
DPDK与QEMU的通信遵照标准的vhost-user协议。
消息类型如下:
dpdk-18.08/lib/librte_vhost/vhost_user.h
typedef enum VhostUserRequest {28 VHOST_USER_NONE = 0,29 VHOST_USER_GET_FEATURES = 1,30 VHOST_USER_SET_FEATURES = 2,31 VHOST_USER_SET_OWNER = 3,32 VHOST_USER_RESET_OWNER = 4,33 VHOST_USER_SET_MEM_TABLE = 5,34 VHOST_USER_SET_LOG_BASE = 6,35 VHOST_USER_SET_LOG_FD = 7,36 VHOST_USER_SET_VRING_NUM = 8,37 VHOST_USER_SET_VRING_ADDR = 9,38 VHOST_USER_SET_VRING_BASE = 10,39 VHOST_USER_GET_VRING_BASE = 11,40 VHOST_USER_SET_VRING_KICK = 12,41 VHOST_USER_SET_VRING_CALL = 13,42 VHOST_USER_SET_VRING_ERR = 14,43 VHOST_USER_GET_PROTOCOL_FEATURES = 15,44 VHOST_USER_SET_PROTOCOL_FEATURES = 16,45 VHOST_USER_GET_QUEUE_NUM = 17,46 VHOST_USER_SET_VRING_ENABLE = 18,47 VHOST_USER_SEND_RARP = 19,48 VHOST_USER_NET_SET_MTU = 20,49 VHOST_USER_SET_SLAVE_REQ_FD = 21,50 VHOST_USER_IOTLB_MSG = 22,51 VHOST_USER_CRYPTO_CREATE_SESS = 26,52 VHOST_USER_CRYPTO_CLOSE_SESS = 27,53 VHOST_USER_MAX = 2854 } VhostUserRequest;
更详细的有关消息类型的信息参见QEMU源代码中的文件:
https://github.com/qemu/qemu/blob/master/docs/interop/vhost-user.txt
DPDK使用如下函数处理接收到的消息:
dpdk-18.08/lib/librte_vhost/vhost_user.c
1548 int 1549 vhost_user_msg_handler(int vid, int fd) 1550 {
还有dpdk-18.08/lib/librte_vhost/vhost_user.c:1406 /* return bytes# of read on success or negative val on failure. */ 1407 static int 1408 read_vhost_message(int sockfd, struct VhostUserMsg *msg) 1409 { DPDK向外发送消息使用如下函数 dpdk-18.08/lib/librte_vhost/vhost_user.c1436 static int 1437 send_vhost_message(int sockfd, struct VhostUserMsg *msg, int *fds, int fd_num) 1438 { 1439 if (!msg) 1440 return 0; 1441 1442 return send_fd_message(sockfd, (char *)msg, 1443 VHOST_USER_HDR_SIZE + msg->size, fds, fd_num); 1444 } QEMU与之相对应的接收函数为: qemu-3.0.0/contrib/libvhost-user/libvhost-user.c1218 static bool 1219 vu_process_message(VuDev *dev, VhostUserMsg *vmsg) QEMU对应的消息发送函数: qemu-3.0.0/hw/virtio/vhost-user.c297 /* most non-init callers ignore the error */ 298 static int vhost_user_write(struct vhost_dev *dev, VhostUserMsg *msg, 299 int *fds, int fd_num) 300 {
vu_process_message(VuDev *dev, VhostUserMsg *vmsg) {int do_reply = 0;printf("vu_process_message \n");/* Print out generic part of the request. */DPRINT("================ Vhost user message ================\n");DPRINT("Request: %s (%d)\n", vu_request_to_string(vmsg->request),vmsg->request);DPRINT("Flags: 0x%x\n", vmsg->flags);DPRINT("Size: %d\n", vmsg->size);if (vmsg->fd_num) {int i;DPRINT("Fds:");for (i = 0; i < vmsg->fd_num; i++) {DPRINT(" %d", vmsg->fds[i]);}DPRINT("\n");}if (dev->iface->process_msg &&dev->iface->process_msg(dev, vmsg, &do_reply)) {return do_reply;}switch (vmsg->request) {case VHOST_USER_GET_FEATURES:return vu_get_features_exec(dev, vmsg);case VHOST_USER_SET_FEATURES:return vu_set_features_exec(dev, vmsg);case VHOST_USER_GET_PROTOCOL_FEATURES:return vu_get_protocol_features_exec(dev, vmsg);case VHOST_USER_SET_PROTOCOL_FEATURES:return vu_set_protocol_features_exec(dev, vmsg);case VHOST_USER_SET_OWNER:return vu_set_owner_exec(dev, vmsg);case VHOST_USER_RESET_OWNER:return vu_reset_device_exec(dev, vmsg);case VHOST_USER_SET_MEM_TABLE:return vu_set_mem_table_exec(dev, vmsg);case VHOST_USER_SET_LOG_BASE:return vu_set_log_base_exec(dev, vmsg);case VHOST_USER_SET_LOG_FD:return vu_set_log_fd_exec(dev, vmsg);case VHOST_USER_SET_VRING_NUM:return vu_set_vring_num_exec(dev, vmsg);case VHOST_USER_SET_VRING_ADDR:return vu_set_vring_addr_exec(dev, vmsg);case VHOST_USER_SET_VRING_BASE:return vu_set_vring_base_exec(dev, vmsg);case VHOST_USER_GET_VRING_BASE:return vu_get_vring_base_exec(dev, vmsg); case VHOST_USER_SET_VRING_KICK:return vu_set_vring_kick_exec(dev, vmsg);case VHOST_USER_SET_VRING_CALL:return vu_set_vring_call_exec(dev, vmsg);case VHOST_USER_SET_VRING_ERR:return vu_set_vring_err_exec(dev, vmsg);case VHOST_USER_GET_QUEUE_NUM:return vu_get_queue_num_exec(dev, vmsg);case VHOST_USER_SET_VRING_ENABLE:return vu_set_vring_enable_exec(dev, vmsg);case VHOST_USER_SET_SLAVE_REQ_FD:return vu_set_slave_req_fd(dev, vmsg);case VHOST_USER_GET_CONFIG:
DPDK UNIX套接口的注册和消息交互
neutron控制Open vSwitch创建一个名称为vhuxxxxxxxx-xx的接口。在OVS内部,此名称保存在netdev结构体的成员name中(netdev->name)。
当创建vhost user接口时,Open vSwitch控制DPDK注册一个新的vhost-user UNIX套接口。套接口的路径为vhost_sock_dir加netdev->name,保存在设备的dev->vhost_id中。
通过设置RTE_VHOST_USER_CLIENT标志,OVS可请求创建vhost user套接口的客户端模式。
OVS函数netdev_dpdk_vhost_construct调用DPDK的rte_vhost_driver_register函数,其又调用vhost_user_create_server或者vhost_user_create_client函数创建套接口。默认使用前者创建服务端模式的套接口,如果设置了RTE_VHOST_USER_CLIENT标志,创建客户端模式套接口。
相关的函数调用关系如下:
OVSnetdev_dpdk_vhost_construct(struct netdev *netdev)|| DPDK Vrte_vhost_driver_register(const char *path, uint64_t flags)|Vcreate_unix_socket(struct vhost_user_socket *vsocket)|OVSnetdev_dpdk_vhost_construct(struct netdev *netdev)| DPDK Vrte_vhost_driver_start(const char *path)) ----------------------------------------------- | |V | vhost_user_start_server |(struct vhost_user_socket *vsocket) || |V V vhost_user_server_new_connection vhost_user_start_client vhost_user_client_reconnect (int fd, void *dat, int *remove __rte_unused) (struct vhost_user_socket *vsocket) (void *arg __rte_unused)| | |V V V--------------------------------------------------------------------------------------------------|Vvhost_user_add_connection(int fd, struct vhost_user_socket *vsocket)|Vvhost_user_read_cb(int connfd, void *dat, int *remove)|Vvhost_user_msg_handler
netdev_dpdk_vhost_construct定义在文件openvswitch-2.9.2/lib/netdev-dpdk.c
1058 static int 1059 netdev_dpdk_vhost_construct(struct netdev *netdev) 1060 { 1061 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev); 1062 const char *name = netdev->name; 1063 int err; 1064 1065 /* 'name' is appended to 'vhost_sock_dir' and used to create a socket in 1066 * the file system. '/' or '\' would traverse directories, so they're not 1067 * acceptable in 'name'. */ 1068 if (strchr(name, '/') || strchr(name, '\\')) { 1069 VLOG_ERR("\"%s\" is not a valid name for a vhost-user port. " 1070 "A valid name must not include '/' or '\\'", 1071 name); 1072 return EINVAL; 1073 } 1074 1075 ovs_mutex_lock(&dpdk_mutex); 1076 /* Take the name of the vhost-user port and append it to the location where 1077 * the socket is to be created, then register the socket. 1078 */ 1079 snprintf(dev->vhost_id, sizeof dev->vhost_id, "%s/%s", 1080 dpdk_get_vhost_sock_dir(), name); 1081 1082 dev->vhost_driver_flags &= ~RTE_VHOST_USER_CLIENT; 1083 err = rte_vhost_driver_register(dev->vhost_id, dev->vhost_driver_flags); 1084 if (err) { 1085 VLOG_ERR("vhost-user socket device setup failure for socket %s\n", 1086 dev->vhost_id); 1087 goto out; 1088 } else { 1089 fatal_signal_add_file_to_unlink(dev->vhost_id); 1090 VLOG_INFO("Socket %s created for vhost-user port %s\n", 1091 dev->vhost_id, name); 1092 } 1093 1094 err = rte_vhost_driver_callback_register(dev->vhost_id, 1095 &virtio_net_device_ops); 1096 if (err) { 1097 VLOG_ERR("rte_vhost_driver_callback_register failed for vhost user " 1098 "port: %s\n", name); 1099 goto out; 1100 } 1101 1102 err = rte_vhost_driver_disable_features(dev->vhost_id, 1103 1ULL << VIRTIO_NET_F_HOST_TSO4 1104 | 1ULL << VIRTIO_NET_F_HOST_TSO6 1105 | 1ULL << VIRTIO_NET_F_CSUM); 1106 if (err) { 1107 VLOG_ERR("rte_vhost_driver_disable_features failed for vhost user " 1108 "port: %s\n", name); 1109 goto out; 1110 } 1111 1112 err = rte_vhost_driver_start(dev->vhost_id); 1113 if (err) { 1114 VLOG_ERR("rte_vhost_driver_start failed for vhost user " 1115 "port: %s\n", 
name); 1116 goto out; 1117 } 1118 1119 err = vhost_common_construct(netdev); 1120 if (err) { 1121 VLOG_ERR("vhost_common_construct failed for vhost user " 1122 "port: %s\n", name); 1123 } 1124 1125 out: 1126 ovs_mutex_unlock(&dpdk_mutex); 1127 VLOG_WARN_ONCE("dpdkvhostuser ports are considered deprecated; " 1128 "please migrate to dpdkvhostuserclient ports."); 1129 return err; 1130 }
原文链接:https://www.cnblogs.com/dream397/p/13952664.html
OVS DPDK与QEMU之间如何通过vhost user协议通信 vhost user协议的控制和数据通道相关推荐
- OVS DPDK vhost-user详解(十三)
vhost user协议的控制和数据通道 所有的控制信息通过UNIX套接口(控制通道)交互.包括为进行直接内存访问而交换的内存映射信息,以及当数据填入virtio队列后需要触发的kick事件和中断信息 ...
- OVS DPDK vs OVS Deep Dive(十六)
背景 OvS(Open vSwitch)是云计算平台的重要连接组件,为虚拟机提供网络连,被各大云平台,基础设施供应商广泛使用,比如OpenStack, OpenNebula.vSwitch–Virtu ...
- OVS DPDK vhost-user详解(十二)
在软件实现的网络I/O半虚拟化中,vhost-user在性能.灵活性和兼容性等方面达到了近乎完美的权衡.虽然它的提出已经过了四年多,也已经有了越来越多的新特性加入,但是万变不离其宗,那么今天就从整个v ...
- OVS+DPDK Datapath 包分类技术
本文主体内容译于[DPDK社区文档],但并没有逐字翻译,在原文的基础上进行了一些调整,增加了对TSS分类器的详细阐述. 1. 概览 本文描述了OVS+DPDK中的包分类器(datapath class ...
- 网络上的计算机之间又是如何交换信息的。(TCP/IP协议、IPX/SPX协议、NetBEUI协议)
TCP/IP协议.IPX/SPX协议.NetBEUI协议 网络上的计算机之间又是如何交换信息的呢?就像我们说话用某种语言一样,在网络上的各台计算机之间也有一种语言,这就是网络协议,不同的计算机之间必须 ...
- OVS DPDK vhost-user详解(十一)
这篇文章是对vhost-user / virtio-pmd架构的深入技术研究,该架构针对基于DPDK的高性能用户空间网络,面向所有有兴趣了解这些基本细节的架构师和开发人员. 介绍 本文将在读者熟悉vh ...
- OVS DPDK vhost-user搭建全过程(四十四)
系统:ubuntu14.04.1 系统:centos7 内核:4.4.0 qemu : >=2.7 Install DPDK 1. Download DPDK cd /usr/src/ wget ...
- ovs+dpdk场景下的tx checksum offload
一.checksum: tcp checksum包括三部分: 1).伪头部校验和: 伪头部包括: 源ip .宿ip. 协议号.tcp 长度,主要用于校验是正确的目的机器接收到数据包 2).tcp ...
- 【Apache】 配置 (http协议的) vhost
前言 Apache 2.4.39 phpStudy 8.1.1.2 tomcat 9.0 的项目 准备 启用代理模块. 在 httpd.conf 配置文件中加载 Http 反向代理用到的模块 Load ...
最新文章
- 反转链表:输入一个链表的头结点,反转该链表并输出反转后的链表的头结点。...
- [react] 说出几点你认为的React实践
- linux shell rman删除归档_我们一起学一学渗透测试——黑客应该掌握的Linux基础
- 如何产生cpk图形_在评估或选型SMT设备的时候,“印刷机、贴片机Cp、Cpk是什么意思...
- 有钱人的学习能力,穷人该得好好模仿
- Linux 搭建SVN server
- 锋利的JQuery —— 事件和动画
- 存储器容量的扩展 —— 位/字扩展
- python爬微博数据合法吗_爬取新浪微博数据(python)
- 如何书写IT行业的个人简历
- 浅谈JAVA工作流的优雅实现方式
- matlab中global
- UMTS语音通话问题定位分析
- ftp服务器无法上传文件,ftp无法上传文件的原因
- 神经管理学是什么样的学科?
- C++游戏——小胎大乱斗
- 马云收购士兰微_根本停不下来!又一家国内半导体公司将被吞并!
- oracle gbk ebcdic,文件编码 ANSI、GBK、GB2312、MS936、MS932、SJIS、Windows-31 、EUC-JP 、EBCDIC 等等之间的区别与联系...
- Facade(外观)
- 拉钩教育高薪训练营学习笔记
热门文章
- 8个数据分析方法,指导营销策略
- linux 下GPRS模块使用sim900打电话发短信
- 南京邮电考研计算机科学大纲,南京邮电大学|811《数据结构》考研大纲
- 无线测试软件市场份额,USBType C的应用方向及市场规模 - 关于USB Type C技术、应用和产业链的最强解读...
- laradock php扩展,laradock中使用xhprof、xhgui
- 普通瀑布流(错落有致的排列,水平不对齐,左右对齐,下一排的图片的第一张位于上一排高度最小的照片的下面)
- 为回馈广大投资者对imToken的支持与厚爱,项目方决定向投资者免费发放空投
- Docker10: docker compose
- 智能家居新品-华尔兹智能语音面板,支持按键+触屏+语音控制交互的智能面板
- PHP敏感词汇过滤SDK(树形结构遍历命中违禁词)