赞
踩
在网络IO的半虚拟化中,vhost-user是目前最优的解决方案。在DPDK中,同样也采用了这种方式。vhost-user是为了解决内核态数据操作复杂的问题而提出的一种解决方式,通过用用户态进程替代内核进程来实现数据交互的最少化。在vhost-user的应用场景中,虚拟化的容器支持是一个重点方向。起初的virtio-user就是为了支持容器内部与DPDK通信的,后来也发展到虚拟设备间的通信。
DPDK与Kernel的通信也叫做“exception path”,通常来说,这种通信方式主要有几种:
1、KNI,是目前DPDK中用户使用的主要方案。即通过虚拟网络接口,利用队列和DPDK应用交换数据,但其内核模块没有合入Linux内核主线(即没有upstream),需要单独维护
2、Tun/Tap或者pcap PMD,需要内核切换,效率差
3、Flow Bifurcation,虚拟多张网卡,依赖硬件,不灵活
4、virtio-user和vhost-net,这是比较好的一种实现机制。
virtio-user在DPDK和虚拟场景下的应用还是非常多的。virtio-user虚拟出的设备和真实的设备在上层看没有区别,这个非常重要。
下面看一下在DPDK中相关的数据结构定义:
/*
 * Per-virtqueue bookkeeping. struct virtio_user_dev keeps one of these per
 * queue in its packed_queues[] array.
 */
struct virtio_user_queue {
	uint16_t used_idx;
	bool avail_wrap_counter;
	bool used_wrap_counter;
};

/*
 * State of one virtio-user device: backend file descriptors, feature masks,
 * vring storage and the backend operations table.
 */
struct virtio_user_dev {
	/* for vhost_user backend */
	int		vhostfd;
	int		listenfd;   /* listening fd */
	bool		is_server;  /* server or client mode */

	/* for vhost_kernel backend */
	char		*ifname;
	int		*vhostfds;  /* allocated in virtio_user_dev_setup(), max_queue_pairs entries */
	int		*tapfds;    /* allocated in virtio_user_dev_setup(), max_queue_pairs entries */

	/* for both vhost_user and vhost_kernel */
	int		callfds[VIRTIO_MAX_VIRTQUEUES];
	int		kickfds[VIRTIO_MAX_VIRTQUEUES];
	int		mac_specified;
	uint32_t	max_queue_pairs;
	uint32_t	queue_pairs;
	uint32_t	queue_size;
	uint64_t	features; /* the negotiated features with driver,
				   * and will be sync with device
				   */
	uint64_t	device_features; /* supported features by device */
	uint64_t	frontend_features; /* enabled frontend features */
	uint64_t	unsupported_features; /* unsupported features mask */
	uint8_t		status;
	uint16_t	net_status;
	uint16_t	port_id;
	uint8_t		mac_addr[RTE_ETHER_ADDR_LEN];
	char		path[PATH_MAX]; /* filled from the path argument of virtio_user_dev_init() */
	/* Split and packed ring descriptions share the same storage. */
	union {
		struct vring		vrings[VIRTIO_MAX_VIRTQUEUES];
		struct vring_packed	packed_vrings[VIRTIO_MAX_VIRTQUEUES];
	};
	struct virtio_user_queue packed_queues[VIRTIO_MAX_VIRTQUEUES];
	bool		qp_enabled[VIRTIO_MAX_VIRTQUEUE_PAIRS];

	/* Backend operations; chosen in virtio_user_dev_setup(). */
	struct virtio_user_backend_ops *ops;
	/* Initialized in virtio_user_dev_init(); held while the device is started. */
	pthread_mutex_t	mutex;
	/* Set to true at the end of a successful virtio_user_start_device(). */
	bool		started;
};
除了虚拟设备本身之外,它主要涉及与VHOST以及相关数据队列的操作,而那些数据结构在前面已经基本都介绍过了。
其实在前面说了,virtio-user在虚拟环境中应用非常广泛,在virtio-user所在的文件夹(drivers/net/virtio)下可以看到,其实最主要的就是那几个文件:
int virtio_user_dev_init(struct virtio_user_dev *dev, char *path, int queues, int cq, int queue_size, const char *mac, char **ifname, int server, int mrg_rxbuf, int in_order, int packed_vq) { pthread_mutex_init(&dev->mutex, NULL); strlcpy(dev->path, path, PATH_MAX); dev->started = 0; dev->max_queue_pairs = queues; dev->queue_pairs = 1; /* mq disabled by default */ dev->queue_size = queue_size; dev->is_server = server; dev->mac_specified = 0; dev->frontend_features = 0; dev->unsupported_features = ~VIRTIO_USER_SUPPORTED_FEATURES; parse_mac(dev, mac); if (*ifname) { dev->ifname = *ifname; *ifname = NULL; } if (virtio_user_dev_setup(dev) < 0) { PMD_INIT_LOG(ERR, "backend set up fails"); return -1; } if (!dev->is_server) { if (dev->ops->send_request(dev, VHOST_USER_SET_OWNER, NULL) < 0) { PMD_INIT_LOG(ERR, "set_owner fails: %s", strerror(errno)); return -1; } if (dev->ops->send_request(dev, VHOST_USER_GET_FEATURES, &dev->device_features) < 0) { PMD_INIT_LOG(ERR, "get_features failed: %s", strerror(errno)); return -1; } } else { /* We just pretend vhost-user can support all these features. * Note that this could be problematic that if some feature is * negotiated but not supported by the vhost-user which comes * later. 
*/ dev->device_features = VIRTIO_USER_SUPPORTED_FEATURES; } if (!mrg_rxbuf) dev->unsupported_features |= (1ull << VIRTIO_NET_F_MRG_RXBUF); if (!in_order) dev->unsupported_features |= (1ull << VIRTIO_F_IN_ORDER); if (!packed_vq) dev->unsupported_features |= (1ull << VIRTIO_F_RING_PACKED); if (dev->mac_specified) dev->frontend_features |= (1ull << VIRTIO_NET_F_MAC); else dev->unsupported_features |= (1ull << VIRTIO_NET_F_MAC); if (cq) { /* device does not really need to know anything about CQ, * so if necessary, we just claim to support CQ */ dev->frontend_features |= (1ull << VIRTIO_NET_F_CTRL_VQ); } else { dev->unsupported_features |= (1ull << VIRTIO_NET_F_CTRL_VQ); /* Also disable features that depend on VIRTIO_NET_F_CTRL_VQ */ dev->unsupported_features |= (1ull << VIRTIO_NET_F_CTRL_RX); dev->unsupported_features |= (1ull << VIRTIO_NET_F_CTRL_VLAN); dev->unsupported_features |= (1ull << VIRTIO_NET_F_GUEST_ANNOUNCE); dev->unsupported_features |= (1ull << VIRTIO_NET_F_MQ); dev->unsupported_features |= (1ull << VIRTIO_NET_F_CTRL_MAC_ADDR); } /* The backend will not report this feature, we add it explicitly */ if (is_vhost_user_by_type(dev->path)) dev->frontend_features |= (1ull << VIRTIO_NET_F_STATUS); /* * Device features = * (frontend_features | backend_features) & ~unsupported_features; */ dev->device_features |= dev->frontend_features; dev->device_features &= ~dev->unsupported_features; if (rte_mem_event_callback_register(VIRTIO_USER_MEM_EVENT_CLB_NAME, virtio_user_mem_event_cb, dev)) { if (rte_errno != ENOTSUP) { PMD_INIT_LOG(ERR, "Failed to register mem event" " callback\n"); return -1; } } return 0; }
先是对设备的初始化,然后进行Setup:
static int virtio_user_dev_setup(struct virtio_user_dev *dev) { uint32_t q; dev->vhostfd = -1; dev->vhostfds = NULL; dev->tapfds = NULL; if (dev->is_server) { if (access(dev->path, F_OK) == 0 && !is_vhost_user_by_type(dev->path)) { PMD_DRV_LOG(ERR, "Server mode doesn't support vhost-kernel!"); return -1; } dev->ops = &virtio_ops_user; } else { if (is_vhost_user_by_type(dev->path)) { dev->ops = &virtio_ops_user; } else { dev->ops = &virtio_ops_kernel; dev->vhostfds = malloc(dev->max_queue_pairs * sizeof(int)); dev->tapfds = malloc(dev->max_queue_pairs * sizeof(int)); if (!dev->vhostfds || !dev->tapfds) { PMD_INIT_LOG(ERR, "Failed to malloc"); return -1; } for (q = 0; q < dev->max_queue_pairs; ++q) { dev->vhostfds[q] = -1; dev->tapfds[q] = -1; } } } if (dev->ops->setup(dev) < 0) return -1; if (virtio_user_dev_init_notify(dev) < 0) return -1; if (virtio_user_fill_intr_handle(dev) < 0) return -1; return 0; }
然后在处理用户状态时可以启动:
//drivers/net/virtio/virtio_user_ethdev.c static void virtio_user_set_status(struct virtio_hw *hw, uint8_t status) { struct virtio_user_dev *dev = virtio_user_get_dev(hw); if (status & VIRTIO_CONFIG_STATUS_DRIVER_OK) virtio_user_start_device(dev); else if (status == VIRTIO_CONFIG_STATUS_RESET) virtio_user_reset(hw); dev->status = status; } int virtio_user_start_device(struct virtio_user_dev *dev) { uint64_t features; int ret; /* * XXX workaround! * * We need to make sure that the locks will be * taken in the correct order to avoid deadlocks. * * Before releasing this lock, this thread should * not trigger any memory hotplug events. * * This is a temporary workaround, and should be * replaced when we get proper supports from the * memory subsystem in the future. */ rte_mcfg_mem_read_lock(); pthread_mutex_lock(&dev->mutex); if (is_vhost_user_by_type(dev->path) && dev->vhostfd < 0) goto error; /* Step 0: tell vhost to create queues */ if (virtio_user_queue_setup(dev, virtio_user_create_queue) < 0) goto error; /* Step 1: set features */ features = dev->features; /* Strip VIRTIO_NET_F_MAC, as MAC address is handled in vdev init */ features &= ~(1ull << VIRTIO_NET_F_MAC); /* Strip VIRTIO_NET_F_CTRL_VQ, as devices do not really need to know */ features &= ~(1ull << VIRTIO_NET_F_CTRL_VQ); features &= ~(1ull << VIRTIO_NET_F_STATUS); ret = dev->ops->send_request(dev, VHOST_USER_SET_FEATURES, &features); if (ret < 0) goto error; PMD_DRV_LOG(INFO, "set features: %" PRIx64, features); /* Step 2: share memory regions */ ret = dev->ops->send_request(dev, VHOST_USER_SET_MEM_TABLE, NULL); if (ret < 0) goto error; /* Step 3: kick queues */ if (virtio_user_queue_setup(dev, virtio_user_kick_queue) < 0) goto error; /* Step 4: enable queues * we enable the 1st queue pair by default. 
*/ dev->ops->enable_qp(dev, 0, 1); dev->started = true; pthread_mutex_unlock(&dev->mutex); rte_mcfg_mem_read_unlock(); return 0; error: pthread_mutex_unlock(&dev->mutex); rte_mcfg_mem_read_unlock(); /* TODO: free resource here or caller to check */ return -1; }
这里其实会调用send_request(dev, VHOST_USER_SET_MEM_TABLE, NULL)来传递内存数据(ops中设置),如果后端为vhost-user时,即为vhost_user_sock。
//drivers/net/virtio/virtio-user static int vhost_user_sock(struct virtio_user_dev *dev, enum vhost_user_request req, void *arg) { struct vhost_user_msg msg; struct vhost_vring_file *file = 0; int need_reply = 0; int fds[VHOST_MEMORY_MAX_NREGIONS]; int fd_num = 0; int len; int vhostfd = dev->vhostfd; RTE_SET_USED(m); PMD_DRV_LOG(INFO, "%s", vhost_msg_strings[req]); if (dev->is_server && vhostfd < 0) return -1; msg.request = req; msg.flags = VHOST_USER_VERSION; msg.size = 0; switch (req) { case VHOST_USER_GET_FEATURES: need_reply = 1; break; case VHOST_USER_SET_FEATURES: case VHOST_USER_SET_LOG_BASE: msg.payload.u64 = *((__u64 *)arg); msg.size = sizeof(m.payload.u64); break; case VHOST_USER_SET_OWNER: case VHOST_USER_RESET_OWNER: break; case VHOST_USER_SET_MEM_TABLE: if (prepare_vhost_memory_user(&msg, fds) < 0) return -1; fd_num = msg.payload.memory.nregions; msg.size = sizeof(m.payload.memory.nregions); msg.size += sizeof(m.payload.memory.padding); msg.size += fd_num * sizeof(struct vhost_memory_region); break; case VHOST_USER_SET_LOG_FD: fds[fd_num++] = *((int *)arg); break; case VHOST_USER_SET_VRING_NUM: case VHOST_USER_SET_VRING_BASE: case VHOST_USER_SET_VRING_ENABLE: memcpy(&msg.payload.state, arg, sizeof(msg.payload.state)); msg.size = sizeof(m.payload.state); break; case VHOST_USER_GET_VRING_BASE: memcpy(&msg.payload.state, arg, sizeof(msg.payload.state)); msg.size = sizeof(m.payload.state); need_reply = 1; break; case VHOST_USER_SET_VRING_ADDR: memcpy(&msg.payload.addr, arg, sizeof(msg.payload.addr)); msg.size = sizeof(m.payload.addr); break; case VHOST_USER_SET_VRING_KICK: case VHOST_USER_SET_VRING_CALL: case VHOST_USER_SET_VRING_ERR: file = arg; msg.payload.u64 = file->index & VHOST_USER_VRING_IDX_MASK; msg.size = sizeof(m.payload.u64); if (file->fd > 0) fds[fd_num++] = file->fd; else msg.payload.u64 |= VHOST_USER_VRING_NOFD_MASK; break; default: PMD_DRV_LOG(ERR, "trying to send unhandled msg type"); return -1; } len = VHOST_USER_HDR_SIZE + msg.size; if 
(vhost_user_write(vhostfd, &msg, len, fds, fd_num) < 0) { PMD_DRV_LOG(ERR, "%s failed: %s", vhost_msg_strings[req], strerror(errno)); return -1; } if (need_reply) { if (vhost_user_read(vhostfd, &msg) < 0) { PMD_DRV_LOG(ERR, "Received msg failed: %s", strerror(errno)); return -1; } if (req != msg.request) { PMD_DRV_LOG(ERR, "Received unexpected msg type"); return -1; } switch (req) { case VHOST_USER_GET_FEATURES: if (msg.size != sizeof(m.payload.u64)) { PMD_DRV_LOG(ERR, "Received bad msg size"); return -1; } *((__u64 *)arg) = msg.payload.u64; break; case VHOST_USER_GET_VRING_BASE: if (msg.size != sizeof(m.payload.state)) { PMD_DRV_LOG(ERR, "Received bad msg size"); return -1; } memcpy(arg, &msg.payload.state, sizeof(struct vhost_vring_state)); break; default: PMD_DRV_LOG(ERR, "Received unexpected msg type"); return -1; } } return 0; }
找到相关的VHOST_USER_SET_MEM_TABLE选项设置,就可以看到数据的准备过程,从调用函数就可以一路深入进去,明白整个过程。这里就不再做介绍。
通过上面的分析可以看出,virtio-user既可以实现虚拟机前后端的通信,也可以实现不同设备间的通信,还可以实现与内核间的通信。所以一种新的技术被提出后,会不断地推动应用向前发展,反过来,应用的发展又不断要求前者提供更好的支持。互相促进,就会形成一个新的应用场景并有可能爆发。
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。