vhost is an alternative virtio backend design: it bypasses qemu on the data path, cutting the context-switch overhead between qemu and the kernel, which is especially noticeable for network I/O. vhost currently has two implementations, an in-kernel one and a userspace one; this article focuses on the in-kernel vhost.
The vhost kernel module only handles the data plane; the control plane is still left to qemu. The main vhost data structures are:
```c
struct vhost_dev {
    MemoryListener memory_listener; /* callbacks invoked on changes to the guest physical address space */
    struct vhost_memory *mem;
    int n_mem_sections;
    MemoryRegionSection *mem_sections;
    struct vhost_virtqueue *vqs;    /* array of vhost_virtqueues, with its length below */
    int nvqs;
    /* the first virtqueue which would be used by this vhost dev */
    int vq_index;
    unsigned long long features;          /* features supported by the vhost device */
    unsigned long long acked_features;    /* features acked by the guest */
    unsigned long long backend_features;  /* features supported by the backend, e.g. the tap device */
    bool started;
    bool log_enabled;
    vhost_log_chunk_t *log;
    unsigned long long log_size;
    Error *migration_blocker;
    bool force;
    bool memory_changed;
    hwaddr mem_changed_start_addr;
    hwaddr mem_changed_end_addr;
    const VhostOps *vhost_ops; /* VhostOps has separate implementations for kernel and userspace vhost; the kernel one boils down to ioctls */
    void *opaque;
};

struct vhost_virtqueue {
    int kick;
    int call;
    void *desc;
    void *avail;
    void *used;
    int num;
    unsigned long long used_phys;
    unsigned used_size;
    void *ring;
    unsigned long long ring_phys;
    unsigned ring_size;
    EventNotifier masked_notifier;
};
```
The vhost memory layout is likewise described by an array of vhost_memory_region entries:
```c
struct vhost_memory_region {
    __u64 guest_phys_addr;
    __u64 memory_size; /* bytes */
    __u64 userspace_addr;
    __u64 flags_padding; /* No flags are currently specified. */
};

/* All region addresses and sizes must be 4K aligned. */
#define VHOST_PAGE_SIZE 0x1000

struct vhost_memory {
    __u32 nregions;
    __u32 padding;
    struct vhost_memory_region regions[0];
};
```
The vhost control plane is driven by qemu, which issues ioctls against the vhost_xxx kernel module, e.g.
```c
long vhost_dev_ioctl(struct vhost_dev *d, unsigned int ioctl, unsigned long arg)
{
    void __user *argp = (void __user *)arg;
    struct file *eventfp, *filep = NULL;
    struct eventfd_ctx *ctx = NULL;
    u64 p;
    long r;
    int i, fd;

    /* If you are not the owner, you can become one */
    if (ioctl == VHOST_SET_OWNER) {
        r = vhost_dev_set_owner(d);
        goto done;
    }

    /* You must be the owner to do anything else */
    r = vhost_dev_check_owner(d);
    if (r)
        goto done;

    switch (ioctl) {
    case VHOST_SET_MEM_TABLE:
        r = vhost_set_memory(d, argp);
        break;
    ...
    default:
        r = vhost_set_vring(d, ioctl, argp);
        break;
    }
done:
    return r;
}
```
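To make the control path concrete, here is a minimal userspace sketch (not taken from qemu) of what a vhost client does with these ioctls: open /dev/vhost-net, claim ownership, and install a one-region memory table. The ioctls and structs come from linux/vhost.h; the guest-memory address and size values are illustrative placeholders.

```c
#include <fcntl.h>
#include <stdint.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/vhost.h>

/* Sketch only: error handling trimmed, guest memory layout is a placeholder. */
static int vhost_control_plane_example(void *guest_ram_hva, uint64_t guest_ram_size)
{
    int vhost_fd = open("/dev/vhost-net", O_RDWR);
    if (vhost_fd < 0)
        return -1;

    /* VHOST_SET_OWNER: bind this process to the device; the kernel spawns the vhost worker */
    if (ioctl(vhost_fd, VHOST_SET_OWNER, NULL) < 0)
        return -1;

    /* VHOST_SET_MEM_TABLE: describe guest physical memory, here as a single region */
    struct vhost_memory *mem = calloc(1, sizeof(*mem) + sizeof(struct vhost_memory_region));
    if (!mem)
        return -1;
    mem->nregions = 1;
    mem->regions[0].guest_phys_addr = 0;               /* GPA of the region (placeholder) */
    mem->regions[0].memory_size     = guest_ram_size;
    mem->regions[0].userspace_addr  = (uint64_t)(uintptr_t)guest_ram_hva; /* HVA backing that GPA range */
    if (ioctl(vhost_fd, VHOST_SET_MEM_TABLE, mem) < 0) {
        free(mem);
        return -1;
    }
    free(mem);
    return vhost_fd; /* subsequent VHOST_SET_VRING_* ioctls go to this fd */
}
```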
VHOST_SET_OWNER associates the qemu process of the current guest with a vhost kernel worker thread:
```c
/* VHOST_SET_OWNER */
/* Caller should have device mutex */
static long vhost_dev_set_owner(struct vhost_dev *dev)
{
    struct task_struct *worker;
    int err;
    /* Is there an owner already? */
    if (dev->mm) {
        err = -EBUSY;
        goto err_mm;
    }
    /* No owner, become one */
    dev->mm = get_task_mm(current); /* grab the qemu process's mm_struct, i.e. the address space holding guest memory */
    worker = kthread_create(vhost_worker, dev, "vhost-%d", current->pid); /* create the vhost worker thread */
    if (IS_ERR(worker)) {
        err = PTR_ERR(worker);
        goto err_worker;
    }

    dev->worker = worker;
    wake_up_process(worker); /* avoid contributing to loadavg */

    err = vhost_attach_cgroups(dev);
    if (err)
        goto err_cgroup;

    err = vhost_dev_alloc_iovecs(dev); /* allocate the iovec arrays for each vhost_virtqueue */
    if (err)
        goto err_cgroup;

    return 0;
err_cgroup:
    kthread_stop(worker);
    dev->worker = NULL;
err_worker:
    if (dev->mm)
        mmput(dev->mm);
    dev->mm = NULL;
err_mm:
    return err;
}
```
VHOST_SET_MEM_TABLE initializes the vhost_memory member of vhost_dev:
```c
static long vhost_set_memory(struct vhost_dev *d, struct vhost_memory __user *m)
{
    struct vhost_memory mem, *newmem, *oldmem;
    unsigned long size = offsetof(struct vhost_memory, regions);
    if (copy_from_user(&mem, m, size))
        return -EFAULT;
    if (mem.padding)
        return -EOPNOTSUPP;
    if (mem.nregions > VHOST_MEMORY_MAX_NREGIONS)
        return -E2BIG;
    newmem = kmalloc(size + mem.nregions * sizeof *m->regions, GFP_KERNEL); /* room for nregions vhost_memory_region entries */
    if (!newmem)
        return -ENOMEM;

    memcpy(newmem, &mem, size);
    if (copy_from_user(newmem->regions, m->regions,
                       mem.nregions * sizeof *m->regions)) {
        kfree(newmem);
        return -EFAULT;
    }

    if (!memory_access_ok(d, newmem, vhost_has_feature(d, VHOST_F_LOG_ALL))) {
        kfree(newmem);
        return -EFAULT;
    }
    oldmem = d->memory;
    rcu_assign_pointer(d->memory, newmem);
    synchronize_rcu();
    kfree(oldmem);
    return 0;
}
```
VHOST_GET_FEATURES and VHOST_SET_FEATURES read and write the feature bits supported by vhost; currently only the vhost_net module uses them:
```c
enum {
    VHOST_FEATURES = (1ULL << VIRTIO_F_NOTIFY_ON_EMPTY) |
                     (1ULL << VIRTIO_RING_F_INDIRECT_DESC) |
                     (1ULL << VIRTIO_RING_F_EVENT_IDX) |
                     (1ULL << VHOST_F_LOG_ALL) |
                     (1ULL << VHOST_NET_F_VIRTIO_NET_HDR) |
                     (1ULL << VIRTIO_NET_F_MRG_RXBUF),
};

static long vhost_net_ioctl(struct file *f, unsigned int ioctl,
                            unsigned long arg)
{
    ....
    case VHOST_GET_FEATURES:
        features = VHOST_FEATURES;
        if (copy_to_user(featurep, &features, sizeof features))
            return -EFAULT;
        return 0;
    case VHOST_SET_FEATURES:
        if (copy_from_user(&features, featurep, sizeof features))
            return -EFAULT;
        if (features & ~VHOST_FEATURES)
            return -EOPNOTSUPP;
        return vhost_net_set_features(n, features);
    ....
}
```
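On the qemu side this is just a pair of ioctls. A hedged sketch of the feature handshake against an already-owned vhost fd (the `wanted` mask is an illustrative placeholder):

```c
#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/vhost.h>

/* Sketch: read the backend's feature bits, keep only what the frontend wants, write them back. */
static int negotiate_vhost_features(int vhost_fd, uint64_t wanted)
{
    uint64_t features;

    if (ioctl(vhost_fd, VHOST_GET_FEATURES, &features) < 0)
        return -1;
    features &= wanted; /* intersect backend and frontend feature sets */
    return ioctl(vhost_fd, VHOST_SET_FEATURES, &features);
}
```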
VHOST_SET_VRING_CALL installs the irqfd used to inject interrupts into the guest.
VHOST_SET_VRING_KICK installs the ioeventfd through which vhost receives guest notifications.
```c
case VHOST_SET_VRING_KICK:
    if (copy_from_user(&f, argp, sizeof f)) {
        r = -EFAULT;
        break;
    }
    eventfp = f.fd == -1 ? NULL : eventfd_fget(f.fd);
    if (IS_ERR(eventfp)) {
        r = PTR_ERR(eventfp);
        break;
    }
    if (eventfp != vq->kick) { /* eventfp differs from vq->kick: stop polling the old vq->kick and start polling eventfp */
        pollstop = filep = vq->kick;
        pollstart = vq->kick = eventfp;
    } else
        filep = eventfp; /* same file: no stop & start needed */
    break;
case VHOST_SET_VRING_CALL:
    if (copy_from_user(&f, argp, sizeof f)) {
        r = -EFAULT;
        break;
    }
    eventfp = f.fd == -1 ? NULL : eventfd_fget(f.fd);
    if (IS_ERR(eventfp)) {
        r = PTR_ERR(eventfp);
        break;
    }
    if (eventfp != vq->call) { /* eventfp differs from vq->call: switch to the new call eventfd */
        filep = vq->call;
        ctx = vq->call_ctx;
        vq->call = eventfp;
        vq->call_ctx = eventfp ?
            eventfd_ctx_fileget(eventfp) : NULL;
    } else
        filep = eventfp;
    break;

if (pollstop && vq->handle_kick)
    vhost_poll_stop(&vq->poll);

if (ctx)
    eventfd_ctx_put(ctx); /* after pollstop, drop the previously held ctx */
if (filep)
    fput(filep); /* after pollstop, drop the previously held filep */

if (pollstart && vq->handle_kick)
    vhost_poll_start(&vq->poll, vq->kick);

mutex_unlock(&vq->mutex);

if (pollstop && vq->handle_kick)
    vhost_poll_flush(&vq->poll);
return r;
```
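The userspace half of this is small. A sketch, assuming an already-initialized vhost fd: create two eventfds per queue and hand them to the kernel with VHOST_SET_VRING_KICK / VHOST_SET_VRING_CALL. The same fds are what qemu later registers with KVM as the ioeventfd (kick) and irqfd (call).

```c
#include <sys/eventfd.h>
#include <sys/ioctl.h>
#include <linux/vhost.h>

/* Sketch: wire up the kick/call eventfds for one virtqueue. */
static int setup_vring_eventfds(int vhost_fd, unsigned int queue_index,
                                int *kick_fd, int *call_fd)
{
    struct vhost_vring_file file;

    *kick_fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC); /* guest -> vhost notification */
    *call_fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC); /* vhost -> guest interrupt */
    if (*kick_fd < 0 || *call_fd < 0)
        return -1;

    file.index = queue_index;
    file.fd = *kick_fd;
    if (ioctl(vhost_fd, VHOST_SET_VRING_KICK, &file) < 0) /* vhost starts polling this fd */
        return -1;

    file.fd = *call_fd;
    if (ioctl(vhost_fd, VHOST_SET_VRING_CALL, &file) < 0) /* vhost signals this fd after filling the used ring */
        return -1;

    return 0; /* kick_fd -> KVM_IOEVENTFD, call_fd -> KVM_IRQFD */
}
```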
Now for the vhost data path. vhost and the kvm module talk to each other through eventfds: the guest-to-host kick event is delivered via an ioeventfd, and the host-to-guest call event via an irqfd.
Host-to-guest direction
The host first processes the used ring; then, if KVM_IRQFD was set up successfully, the kvm module injects the interrupt into the guest through the irqfd. qemu programs the irqfd in the kvm module via virtio_pci_set_guest_notifiers -> kvm_virtio_pci_vector_use -> kvm_virtio_pci_irqfd_use -> kvm_irqchip_add_irqfd_notifier -> kvm_irqchip_assign_irqfd, which ultimately calls kvm_vm_ioctl, passing a write fd and an optional read fd.
```c
static int kvm_virtio_pci_vector_use(VirtIOPCIProxy *proxy, int nvqs)
{
    PCIDevice *dev = &proxy->pci_dev;
    VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus);
    VirtioDeviceClass *k = VIRTIO_DEVICE_GET_CLASS(vdev);
    unsigned int vector;
    int ret, queue_no;
    MSIMessage msg;

    for (queue_no = 0; queue_no < nvqs; queue_no++) {
        if (!virtio_queue_get_num(vdev, queue_no)) {
            break;
        }
        vector = virtio_queue_vector(vdev, queue_no);
        if (vector >= msix_nr_vectors_allocated(dev)) {
            continue;
        }
        msg = msix_get_message(dev, vector);
        ret = kvm_virtio_pci_vq_vector_use(proxy, queue_no, vector, msg);
        if (ret < 0) {
            goto undo;
        }
        /* If guest supports masking, set up irqfd now.
         * Otherwise, delay until unmasked in the frontend.
         */
        if (k->guest_notifier_mask) {
            ret = kvm_virtio_pci_irqfd_use(proxy, queue_no, vector);
            if (ret < 0) {
                kvm_virtio_pci_vq_vector_release(proxy, vector);
                goto undo;
            }
        }
    }
    return 0;

undo:
    while (--queue_no >= 0) {
        vector = virtio_queue_vector(vdev, queue_no);
        if (vector >= msix_nr_vectors_allocated(dev)) {
            continue;
        }
        if (k->guest_notifier_mask) {
            kvm_virtio_pci_irqfd_release(proxy, queue_no, vector);
        }
        kvm_virtio_pci_vq_vector_release(proxy, vector);
    }
    return ret;
}
```
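The last step of that call chain is the KVM_IRQFD vm ioctl. As a rough, hedged illustration of what it boils down to (the gsi value is a placeholder; in practice it comes from the MSI routing entry qemu programs for the MSI-X vector):

```c
#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Sketch: attach an eventfd to a guest interrupt line (GSI) via KVM_IRQFD. */
static int assign_irqfd(int vm_fd, int call_fd, unsigned int gsi)
{
    struct kvm_irqfd irqfd;

    memset(&irqfd, 0, sizeof(irqfd));
    irqfd.fd    = call_fd; /* vhost writes this eventfd when the guest needs an interrupt */
    irqfd.gsi   = gsi;     /* routed to the virtqueue's MSI-X vector */
    irqfd.flags = 0;       /* KVM_IRQFD_FLAG_DEASSIGN would remove the binding instead */
    return ioctl(vm_fd, KVM_IRQFD, &irqfd);
}
```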
If no irqfd is configured, the guest notifier fd instead wakes the qemu process waiting on that fd; it enters the registered handler virtio_queue_guest_notifier_read, which calls virtio_irq and finally virtio_pci_notify:
```c
static void virtio_queue_guest_notifier_read(EventNotifier *n)
{
    VirtQueue *vq = container_of(n, VirtQueue, guest_notifier);
    if (event_notifier_test_and_clear(n)) {
        virtio_irq(vq);
    }
}

void virtio_irq(VirtQueue *vq)
{
    trace_virtio_irq(vq);
    vq->vdev->isr |= 0x01;
    virtio_notify_vector(vq->vdev, vq->vector);
}

static void virtio_notify_vector(VirtIODevice *vdev, uint16_t vector)
{
    BusState *qbus = qdev_get_parent_bus(DEVICE(vdev));
    VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);

    if (k->notify) {
        k->notify(qbus->parent, vector);
    }
}

static void virtio_pci_notify(DeviceState *d, uint16_t vector)
{
    VirtIOPCIProxy *proxy = to_virtio_pci_proxy_fast(d);

    if (msix_enabled(&proxy->pci_dev))
        msix_notify(&proxy->pci_dev, vector);
    else {
        VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus);
        pci_set_irq(&proxy->pci_dev, vdev->isr & 1);
    }
}
```
The whole flow is shown in the figure below (taken from http://royluo.org/2014/08/22/vhost/).
Guest-to-host direction: the guest kicks the host by writing the virtio device's queue-notify register (a PIO/MMIO write into the device's BAR), which causes a VMEXIT; kvm intercepts it and turns it into a notification on the registered fd.
```c
kvm_init:
    memory_listener_register(&kvm_memory_listener, &address_space_memory);
    memory_listener_register(&kvm_io_listener, &address_space_io);

static MemoryListener kvm_memory_listener = {
    .region_add = kvm_region_add,
    .region_del = kvm_region_del,
    .log_start = kvm_log_start,
    .log_stop = kvm_log_stop,
    .log_sync = kvm_log_sync,
    .log_global_start = kvm_log_global_start,
    .log_global_stop = kvm_log_global_stop,
    .eventfd_add = kvm_mem_ioeventfd_add,
    .eventfd_del = kvm_mem_ioeventfd_del,
    .coalesced_mmio_add = kvm_coalesce_mmio_region,
    .coalesced_mmio_del = kvm_uncoalesce_mmio_region,
    .priority = 10,
};

static MemoryListener kvm_io_listener = {
    .eventfd_add = kvm_io_ioeventfd_add,
    .eventfd_del = kvm_io_ioeventfd_del,
    .priority = 10,
};

static void kvm_io_ioeventfd_add(MemoryListener *listener,
                                 MemoryRegionSection *section,
                                 bool match_data, uint64_t data,
                                 EventNotifier *e)
{
    int fd = event_notifier_get_fd(e);
    int r;

    r = kvm_set_ioeventfd_pio(fd, section->offset_within_address_space,
                              data, true, int128_get64(section->size),
                              match_data);
    if (r < 0) {
        fprintf(stderr, "%s: error adding ioeventfd: %s\n",
                __func__, strerror(-r));
        abort();
    }
}
```
kvm_io_ioeventfd_add ends up in kvm_set_ioeventfd_pio, which calls kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &kick) and thereby enters kvm.ko:
```c
static int kvm_set_ioeventfd_pio(int fd, uint16_t addr, uint16_t val,
                                 bool assign, uint32_t size, bool datamatch)
{
    struct kvm_ioeventfd kick = {
        .datamatch = datamatch ? val : 0,
        .addr = addr,
        .flags = KVM_IOEVENTFD_FLAG_PIO,
        .len = size,
        .fd = fd,
    };
    int r;
    if (!kvm_enabled()) {
        return -ENOSYS;
    }
    if (datamatch) {
        kick.flags |= KVM_IOEVENTFD_FLAG_DATAMATCH;
    }
    if (!assign) {
        kick.flags |= KVM_IOEVENTFD_FLAG_DEASSIGN;
    }
    r = kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &kick);
    if (r < 0) {
        return r;
    }
    return 0;
}
```
The KVM_IOEVENTFD ioctl is handled by kvm's kvm_ioeventfd function, which dispatches to kvm_assign_ioeventfd or kvm_deassign_ioeventfd:
```c
int
kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
{
    if (args->flags & KVM_IOEVENTFD_FLAG_DEASSIGN)
        return kvm_deassign_ioeventfd(kvm, args);

    return kvm_assign_ioeventfd(kvm, args);
}

static int
kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
{
    int pio = args->flags & KVM_IOEVENTFD_FLAG_PIO;
    enum kvm_bus bus_idx = pio ? KVM_PIO_BUS : KVM_MMIO_BUS;
    struct _ioeventfd *p;        /* ioeventfd: translate a PIO/MMIO memory write to an eventfd signal. */
    struct eventfd_ctx *eventfd; /* mostly wait_queue_head_t */
    int ret;

    /* must be natural-word sized */
    switch (args->len) {
    case 1:
    case 2:
    case 4:
    case 8:
        break;
    default:
        return -EINVAL;
    }

    /* check for range overflow */
    if (args->addr + args->len < args->addr)
        return -EINVAL;

    /* check for extra flags that we don't understand */
    if (args->flags & ~KVM_IOEVENTFD_VALID_FLAG_MASK)
        return -EINVAL;

    eventfd = eventfd_ctx_fdget(args->fd); /* file->private_data */
    if (IS_ERR(eventfd))
        return PTR_ERR(eventfd);

    p = kzalloc(sizeof(*p), GFP_KERNEL); /* allocate an _ioeventfd and tie the address, length and eventfd_ctx to it */
    if (!p) {
        ret = -ENOMEM;
        goto fail;
    }

    INIT_LIST_HEAD(&p->list);
    p->addr = args->addr;
    p->length = args->len;
    p->eventfd = eventfd;

    /* The datamatch feature is optional, otherwise this is a wildcard */
    if (args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH)
        p->datamatch = args->datamatch;
    else
        p->wildcard = true;

    mutex_lock(&kvm->slots_lock);

    /* Verify that there isn't a match already */
    if (ioeventfd_check_collision(kvm, p)) {
        ret = -EEXIST;
        goto unlock_fail;
    }

    kvm_iodevice_init(&p->dev, &ioeventfd_ops);

    ret = kvm_io_bus_register_dev(kvm, bus_idx, &p->dev); /* register the device on kvm's PIO or MMIO bus */
    if (ret < 0)
        goto unlock_fail;

    list_add_tail(&p->list, &kvm->ioeventfds); /* add it to kvm.ko's list of ioeventfds */

    mutex_unlock(&kvm->slots_lock);

    return 0;

unlock_fail:
    mutex_unlock(&kvm->slots_lock);

fail:
    kfree(p);
    eventfd_ctx_put(eventfd);

    return ret;
}
```
kvm_assign_ioeventfd thus registers a PIO/MMIO address range together with an fd; a VMEXIT caused by a write to that range is turned, inside kvm.ko, into an event notification on the fd:
```c
static const struct kvm_io_device_ops ioeventfd_ops = {
    .write      = ioeventfd_write,
    .destructor = ioeventfd_destructor,
};

/* MMIO/PIO writes trigger an event if the addr/val match */
static int
ioeventfd_write(struct kvm_io_device *this, gpa_t addr, int len,
                const void *val)
{
    struct _ioeventfd *p = to_ioeventfd(this);

    if (!ioeventfd_in_range(p, addr, len, val))
        return -EOPNOTSUPP;

    eventfd_signal(p->eventfd, 1);
    return 0;
}
```
Finally the event notification, delivered by eventfd_signal, wakes up the vhost thread; the overall flow is shown in the figure below.
The vhost control plane and data plane are illustrated in the figure below.
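Before moving on, the kernel wake-up path is worth spelling out: each vhost_poll registers itself on the eventfd's wait queue, so eventfd_signal ends up calling its wait-queue callback, which queues a vhost_work item for the worker kthread created by VHOST_SET_OWNER. The code below is a heavily simplified paraphrase of the drivers/vhost/vhost.c of this era, not the verbatim source; the helper vhost_dev_pick_work is invented here purely for readability.

```c
/* Simplified paraphrase of the vhost wake-up path (not verbatim kernel code). */

static int vhost_poll_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key)
{
    struct vhost_poll *poll = container_of(wait, struct vhost_poll, wait);

    if (!((unsigned long)key & poll->mask))   /* not the event (e.g. POLLIN) we were waiting for */
        return 0;
    vhost_poll_queue(poll);                   /* -> vhost_work_queue(poll->dev, &poll->work) */
    return 0;
}

static int vhost_worker(void *data)           /* the "vhost-<pid>" kthread */
{
    struct vhost_dev *dev = data;
    struct vhost_work *work;

    for (;;) {
        work = vhost_dev_pick_work(dev);      /* invented helper: pop the next item off dev->work_list */
        if (work)
            work->fn(work);                   /* e.g. handle_tx_kick, handle_rx_net */
        else
            schedule();                       /* sleep until vhost_work_queue() wakes us */
    }
    return 0;
}
```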
Finally, using vhost-net as an example, let's look at how the vhost network path is initialized and how packets are transmitted and received.
qemu selects a vhost backend when the network device is created, via `-netdev tap,...,vhost=on`. net_init_tap then calls net_init_tap_one for each vhost queue to initialize it; the actual initialization work is done by vhost_net_init.
```c
typedef struct VhostNetOptions {
    VhostBackendType backend_type; /* vhost kernel or userspace */
    NetClientState *net_backend;   /* TAPState device */
    void *opaque;                  /* the vhostfd used for ioctls, from /dev/vhost-net */
    bool force;
} VhostNetOptions;

static int net_init_tap_one(const NetdevTapOptions *tap, NetClientState *peer,
                            const char *model, const char *name,
                            const char *ifname, const char *script,
                            const char *downscript, const char *vhostfdname,
                            int vnet_hdr, int fd)
{
    ...
    if (tap->has_vhost ? tap->vhost :
        vhostfdname || (tap->has_vhostforce && tap->vhostforce)) {
        VhostNetOptions options;

        options.backend_type = VHOST_BACKEND_TYPE_KERNEL;
        options.net_backend = &s->nc;
        options.force = tap->has_vhostforce && tap->vhostforce;

        if ((tap->has_vhostfd || tap->has_vhostfds)) {
            vhostfd = monitor_handle_fd_param(cur_mon, vhostfdname);
            if (vhostfd == -1) {
                return -1;
            }
        } else {
            vhostfd = open("/dev/vhost-net", O_RDWR); /* open /dev/vhost-net for ioctl usage */
            if (vhostfd < 0) {
                error_report("tap: open vhost char device failed: %s",
                             strerror(errno));
                return -1;
            }
        }
        qemu_set_cloexec(vhostfd);
        options.opaque = (void *)(uintptr_t)vhostfd;
        s->vhost_net = vhost_net_init(&options); /* initialize struct vhost_net */
        if (!s->vhost_net) {
            error_report("vhost-net requested but could not be initialized");
            return -1;
        }
    }
    ...
}

struct vhost_net {
    struct vhost_dev dev;
    struct vhost_virtqueue vqs[2];
    int backend;
    NetClientState *nc;
};

struct vhost_net *vhost_net_init(VhostNetOptions *options)
{
    int r;
    bool backend_kernel = options->backend_type == VHOST_BACKEND_TYPE_KERNEL;
    struct vhost_net *net = g_malloc(sizeof *net);

    if (!options->net_backend) {
        fprintf(stderr, "vhost-net requires net backend to be setup\n");
        goto fail;
    }

    if (backend_kernel) {
        r = vhost_net_get_fd(options->net_backend);
        if (r < 0) {
            goto fail;
        }
        net->dev.backend_features = qemu_has_vnet_hdr(options->net_backend)
            ? 0 : (1 << VHOST_NET_F_VIRTIO_NET_HDR);
        net->backend = r; /* backend is set to the fd of the NetClientState (the tap fd) */
    } else {
        net->dev.backend_features = 0;
        net->backend = -1;
    }
    net->nc = options->net_backend; /* nc points to the NetClientState */

    net->dev.nvqs = 2;       /* one TX queue and one RX queue */
    net->dev.vqs = net->vqs; /* vhost_dev and vhost_net share the same vhost_virtqueue array */

    r = vhost_dev_init(&net->dev, options->opaque,
                       options->backend_type, options->force); /* initialize vhost_dev; the VHOST_SET_OWNER ioctl issued here creates the vhost kthread */
    if (r < 0) {
        goto fail;
    }
    if (!qemu_has_vnet_hdr_len(options->net_backend,
                               sizeof(struct virtio_net_hdr_mrg_rxbuf))) {
        net->dev.features &= ~(1 << VIRTIO_NET_F_MRG_RXBUF);
    }
    if (backend_kernel) {
        if (~net->dev.features & net->dev.backend_features) {
            fprintf(stderr, "vhost lacks feature mask %" PRIu64
                    " for backend\n",
                    (uint64_t)(~net->dev.features & net->dev.backend_features));
            vhost_dev_cleanup(&net->dev);
            goto fail;
        }
    }
    /* Set sane init value. Override when guest acks. */
    vhost_net_ack_features(net, 0);
    return net;
fail:
    g_free(net);
    return NULL;
}
```
Once the guest has booted, qemu configures the corresponding vhost state. virtio_net_set_status enables and disables the virtio-net device and its queues; it calls vhost_net_start to bring the vhost queues up and vhost_net_stop to shut them down.
```c
int vhost_net_start(VirtIODevice *dev, NetClientState *ncs,
                    int total_queues)
{
    BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(dev)));
    VirtioBusState *vbus = VIRTIO_BUS(qbus);
    VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(vbus);
    int r, e, i;

    if (!vhost_net_device_endian_ok(dev)) {
        error_report("vhost-net does not support cross-endian");
        r = -ENOSYS;
        goto err;
    }

    if (!k->set_guest_notifiers) {
        error_report("binding does not support guest notifiers");
        r = -ENOSYS;
        goto err;
    }

    for (i = 0; i < total_queues; i++) {
        vhost_net_set_vq_index(get_vhost_net(ncs[i].peer), i * 2);
    }
    /* calls virtio_pci_set_guest_notifiers to set up the irqfds etc.;
     * qemu reaches this point even when vhost is not enabled */
    r = k->set_guest_notifiers(qbus->parent, total_queues * 2, true);
    if (r < 0) {
        error_report("Error binding guest notifier: %d", -r);
        goto err;
    }
    /* with a multiqueue tun/tap device there are several NetClientStates, one per tap queue,
     * and each NetClientState has its own vhost_net structure */
    for (i = 0; i < total_queues; i++) {
        r = vhost_net_start_one(get_vhost_net(ncs[i].peer), dev); /* call vhost_net_start_one for each queue */

        if (r < 0) {
            goto err_start;
        }
    }

    return 0;

err_start:
    while (--i >= 0) {
        vhost_net_stop_one(get_vhost_net(ncs[i].peer), dev);
    }
    e = k->set_guest_notifiers(qbus->parent, total_queues * 2, false);
    if (e < 0) {
        fprintf(stderr, "vhost guest notifier cleanup failed: %d\n", e);
        fflush(stderr);
    }
err:
    return r;
}
```
```c
static int vhost_net_start_one(struct vhost_net *net,
                               VirtIODevice *dev)
{
    struct vhost_vring_file file = { };
    int r;

    if (net->dev.started) {
        return 0;
    }

    net->dev.nvqs = 2; /* vqs holds one TX virtqueue and one RX virtqueue */
    net->dev.vqs = net->vqs;
    /* enables the vhost ioeventfds via the virtio-pci host notifiers */
    r = vhost_dev_enable_notifiers(&net->dev, dev); /* stop handling guest I/O notifications in qemu; vhost takes over */
    if (r < 0) {
        goto fail_notifiers;
    }

    r = vhost_dev_start(&net->dev, dev);
    if (r < 0) {
        goto fail_start;
    }

    if (net->nc->info->poll) {
        net->nc->info->poll(net->nc, false);
    }

    if (net->nc->info->type == NET_CLIENT_OPTIONS_KIND_TAP) {
        qemu_set_fd_handler(net->backend, NULL, NULL, NULL);
        file.fd = net->backend;
        for (file.index = 0; file.index < net->dev.nvqs; ++file.index) {
            const VhostOps *vhost_ops = net->dev.vhost_ops;
            r = vhost_ops->vhost_call(&net->dev, VHOST_NET_SET_BACKEND,
                                      &file);
            if (r < 0) {
                r = -errno;
                goto fail;
            }
        }
    }
    return 0;
fail:
    file.fd = -1;
    if (net->nc->info->type == NET_CLIENT_OPTIONS_KIND_TAP) {
        while (file.index-- > 0) {
            const VhostOps *vhost_ops = net->dev.vhost_ops;
            int r = vhost_ops->vhost_call(&net->dev, VHOST_NET_SET_BACKEND,
                                          &file);
            assert(r >= 0);
        }
    }
    if (net->nc->info->poll) {
        net->nc->info->poll(net->nc, true);
    }
    vhost_dev_stop(&net->dev, dev);
fail_start:
    vhost_dev_disable_notifiers(&net->dev, dev);
fail_notifiers:
    return r;
}
```
The relationships between the vhost-net data structures are shown in the figure below.
Now let's look at the kernel's definition of vhost_net, e.g.
```c
static const struct file_operations vhost_net_fops = {
    .owner          = THIS_MODULE,
    .release        = vhost_net_release,
    .unlocked_ioctl = vhost_net_ioctl,
#ifdef CONFIG_COMPAT
    .compat_ioctl   = vhost_net_compat_ioctl,
#endif
    .open           = vhost_net_open,
};

static struct miscdevice vhost_net_misc = {
    MISC_DYNAMIC_MINOR,
    "vhost-net",
    &vhost_net_fops,
};

enum {
    VHOST_NET_VQ_RX = 0,
    VHOST_NET_VQ_TX = 1,
    VHOST_NET_VQ_MAX = 2,
};

enum vhost_net_poll_state {
    VHOST_NET_POLL_DISABLED = 0,
    VHOST_NET_POLL_STARTED = 1,
    VHOST_NET_POLL_STOPPED = 2,
};

struct vhost_net {
    struct vhost_dev dev;
    struct vhost_virtqueue vqs[VHOST_NET_VQ_MAX]; /* vhost's virtqueue wrappers; their handle_kick callbacks are woken via the ioeventfd */
    struct vhost_poll poll[VHOST_NET_VQ_MAX];     /* two vhost_poll structures for the socket I/O of the NetClientState backend */
    /* Tells us whether we are polling a socket for TX.
     * We only do this when socket buffer fills up.
     * Protected by tx vq lock. */
    enum vhost_net_poll_state tx_poll_state;
};

static int vhost_net_open(struct inode *inode, struct file *f)
{
    struct vhost_net *n = kmalloc(sizeof *n, GFP_KERNEL);
    struct vhost_dev *dev;
    int r;

    if (!n)
        return -ENOMEM;

    dev = &n->dev;
    n->vqs[VHOST_NET_VQ_TX].handle_kick = handle_tx_kick; /* callback for the TX virtqueue's kick fd */
    n->vqs[VHOST_NET_VQ_RX].handle_kick = handle_rx_kick; /* callback for the RX virtqueue's kick fd */
    r = vhost_dev_init(dev, n->vqs, VHOST_NET_VQ_MAX);
    if (r < 0) {
        kfree(n);
        return r;
    }

    vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, POLLOUT, dev); /* initialize vhost_net's TX vhost_poll */
    vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, POLLIN, dev);  /* initialize vhost_net's RX vhost_poll */
    n->tx_poll_state = VHOST_NET_POLL_DISABLED;

    f->private_data = n;

    return 0;
}
```
handle_tx_kick/handle_rx_kick are implemented exactly like handle_tx_net/handle_rx_net, so why two separate pairs of functions? The code analysis below gives the answer, but here is the spoiler: handle_tx_kick/handle_rx_kick are the callbacks that wait on the kick fd of the TX/RX queue, while handle_tx_net/handle_rx_net are the callbacks that wait on vhost_net's TX/RX poll, i.e. the backend socket. For both TX and RX, the packet path is a two-stage process, e.g.
For TX, the virtqueue's kick fd fires first; the vring buffers are then handed over and finally sent through the NetClientState's socket fd. The socket, however, may run out of send buffer space, or the per-round send quota may be exhausted, in which case vhost has to poll and wait on the socket fd. RX is symmetrical: stage one waits on the socket fd, stage two waits on the virtqueue kick fd.
When the guest transmits a packet, the ioeventfd fires the vhost_virtqueue's kick fd; the POLLIN event causes vhost_poll_wakeup to run, which wakes the vhost worker thread, and the worker invokes the registered handle_kick function, i.e. handle_tx_kick:
```c
static void handle_tx_kick(struct vhost_work *work)
{
    struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue,
                                              poll.work);
    struct vhost_net *net = container_of(vq->dev, struct vhost_net, dev);

    handle_tx(net);
}

static void handle_tx(struct vhost_net *net)
{
    struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_TX];
    unsigned out, in, s;
    int head;
    struct msghdr msg = {
        .msg_name = NULL,
        .msg_namelen = 0,
        .msg_control = NULL,
        .msg_controllen = 0,
        .msg_iov = vq->iov,
        .msg_flags = MSG_DONTWAIT,
    };
    size_t len, total_len = 0;
    int err, wmem;
    size_t hdr_size;
    struct vhost_ubuf_ref *uninitialized_var(ubufs);
    bool zcopy;
    struct socket *sock = rcu_dereference(vq->private_data); /* the NetClientState's socket is kept in the vhost_virtqueue as private_data */
    if (!sock)
        return;

    wmem = atomic_read(&sock->sk->sk_wmem_alloc);
    if (wmem >= sock->sk->sk_sndbuf) { /* the socket's allocated write memory already exceeds the send buffer */
        mutex_lock(&vq->mutex);
        tx_poll_start(net, sock); /* cannot send now: start polling and wait on the socket */
        mutex_unlock(&vq->mutex);
        return;
    }

    mutex_lock(&vq->mutex);
    vhost_disable_notify(&net->dev, vq); /* disable virtqueue notifications via the VRING_USED_F_NO_NOTIFY flag */

    if (wmem < sock->sk->sk_sndbuf / 2)
        tx_poll_stop(net);
    hdr_size = vq->vhost_hlen;
    zcopy = vq->ubufs;

    for (;;) {
        /* Release DMAs done buffers first */
        if (zcopy)
            vhost_zerocopy_signal_used(vq);

        head = vhost_get_vq_desc(&net->dev, vq, vq->iov, /* starting at last_avail_idx, fetch the available descriptors */
                                 ARRAY_SIZE(vq->iov),
                                 &out, &in,
                                 NULL, NULL);
        /* On error, stop handling until the next kick. */
        if (unlikely(head < 0))
            break;
        /* Nothing new? Wait for eventfd to tell us they refilled. */
        if (head == vq->num) { /* vq->avail_idx == vq->last_avail_idx: the frontend has queued no new buffers */
            int num_pends;

            wmem = atomic_read(&sock->sk->sk_wmem_alloc);
            if (wmem >= sock->sk->sk_sndbuf * 3 / 4) {
                tx_poll_start(net, sock);
                set_bit(SOCK_ASYNC_NOSPACE, &sock->flags);
                break;
            }
            /* If more outstanding DMAs, queue the work.
             * Handle upend_idx wrap around
             */
            num_pends = likely(vq->upend_idx >= vq->done_idx) ?
                        (vq->upend_idx - vq->done_idx) :
                        (vq->upend_idx + UIO_MAXIOV - vq->done_idx);
            if (unlikely(num_pends > VHOST_MAX_PEND)) {
                tx_poll_start(net, sock);
                set_bit(SOCK_ASYNC_NOSPACE, &sock->flags);
                break;
            }
            if (unlikely(vhost_enable_notify(&net->dev, vq))) { /* re-enable guest notifications via vhost_enable_notify */
                vhost_disable_notify(&net->dev, vq); /* it returned true, i.e. avail_idx changed in the meantime, so disable again and continue */
                continue;
            }
            break;
        }
        if (in) { /* TX descriptors should all be "out" */
            vq_err(vq, "Unexpected descriptor format for TX: "
                   "out %d, int %d\n", out, in);
            break;
        }
        /* Skip header. TODO: support TSO. */
        s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, out); /* hdr_size bytes of VNET_HDR metadata, with no actual packet payload */
        msg.msg_iovlen = out;
        len = iov_length(vq->iov, out);
        /* Sanity check */
        if (!len) {
            vq_err(vq, "Unexpected header len for TX: "
                   "%zd expected %zd\n",
                   iov_length(vq->hdr, s), hdr_size);
            break;
        }
        /* use msg_control to pass vhost zerocopy ubuf info to skb */
        if (zcopy) {
            vq->heads[vq->upend_idx].id = head;
            if (len < VHOST_GOODCOPY_LEN) {
                /* copy don't need to wait for DMA done */
                vq->heads[vq->upend_idx].len =
                    VHOST_DMA_DONE_LEN;
                msg.msg_control = NULL;
                msg.msg_controllen = 0;
                ubufs = NULL;
            } else {
                struct ubuf_info *ubuf = &vq->ubuf_info[head];

                vq->heads[vq->upend_idx].len = len;
                ubuf->callback = vhost_zerocopy_callback;
                ubuf->arg = vq->ubufs;
                ubuf->desc = vq->upend_idx;
                msg.msg_control = ubuf;
                msg.msg_controllen = sizeof(ubuf);
                ubufs = vq->ubufs;
                kref_get(&ubufs->kref);
            }
            vq->upend_idx = (vq->upend_idx + 1) % UIO_MAXIOV;
        }
        /* TODO: Check specific error and bomb out unless ENOBUFS? */
        err = sock->ops->sendmsg(NULL, sock, &msg, len);
        if (unlikely(err < 0)) {
            if (zcopy) {
                if (ubufs)
                    vhost_ubuf_put(ubufs);
                vq->upend_idx = ((unsigned)vq->upend_idx - 1) %
                                UIO_MAXIOV;
            }
            vhost_discard_vq_desc(vq, 1); /* send failed: roll back last_avail_idx */
            if (err == -EAGAIN || err == -ENOBUFS)
                tx_poll_start(net, sock); /* wait on vhost_net->poll (the socket) and retry later */
            break;
        }
        if (err != len)
            pr_debug("Truncated TX packet: "
                     " len %d != %zd\n", err, len);
        if (!zcopy)
            vhost_add_used_and_signal(&net->dev, vq, head, 0); /* update the virtqueue used ring, e.g. used_elem, last_used_idx */
        else
            vhost_zerocopy_signal_used(vq);
        total_len += len;
        if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
            vhost_poll_queue(&vq->poll); /* quota exceeded: requeue the work and wait to be scheduled again */
            break;
        }
    }

    mutex_unlock(&vq->mutex);
}
```
On the receive side, vhost first waits on the NetClientState's socket, via
`vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, POLLIN, dev)`:
```c
static void handle_rx_net(struct vhost_work *work)
{
    struct vhost_net *net = container_of(work, struct vhost_net,
                                         poll[VHOST_NET_VQ_RX].work);
    handle_rx(net);
}

static void handle_rx(struct vhost_net *net)
{
    struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_RX];
    unsigned uninitialized_var(in), log;
    struct vhost_log *vq_log;
    struct msghdr msg = {
        .msg_name = NULL,
        .msg_namelen = 0,
        .msg_control = NULL, /* FIXME: get and handle RX aux data. */
        .msg_controllen = 0,
        .msg_iov = vq->iov,
        .msg_flags = MSG_DONTWAIT,
    };

    struct virtio_net_hdr_mrg_rxbuf hdr = {
        .hdr.flags = 0,
        .hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE
    };

    size_t total_len = 0;
    int err, headcount, mergeable;
    size_t vhost_hlen, sock_hlen;
    size_t vhost_len, sock_len;

    struct socket *sock = rcu_dereference(vq->private_data);

    if (!sock)
        return;

    mutex_lock(&vq->mutex);
    vhost_disable_notify(&net->dev, vq); /* disable the virtqueue event-notify mechanism */
    vhost_hlen = vq->vhost_hlen;
    sock_hlen = vq->sock_hlen;

    vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ?
             vq->log : NULL;
    mergeable = vhost_has_feature(&net->dev, VIRTIO_NET_F_MRG_RXBUF);

    while ((sock_len = peek_head_len(sock->sk))) { /* length of the next pending packet */
        sock_len += sock_hlen;
        vhost_len = sock_len + vhost_hlen;
        headcount = get_rx_bufs(vq, vq->heads, vhost_len,            /* get_rx_bufs grabs several avail descriptors from the virtqueue */
                                &in, vq_log, &log,                   /* until the gathered iovecs are large enough for the packet; */
                                likely(mergeable) ? UIO_MAXIOV : 1); /* roughly equivalent to calling vhost_get_vq_desc several times */

        /* On error, stop handling until the next kick. */
        if (unlikely(headcount < 0))
            break;
        /* OK, now we need to know about added descriptors. */
        if (!headcount) {
            if (unlikely(vhost_enable_notify(&net->dev, vq))) {
                /* They have slipped one in as we were
                 * doing that: check again. */
                vhost_disable_notify(&net->dev, vq);
                continue;
            }
            /* Nothing new? Wait for eventfd to tell us
             * they refilled. */
            break;
        }
        /* We don't need to be notified again. */
        if (unlikely((vhost_hlen)))
            /* Skip header. TODO: support TSO. */
            move_iovec_hdr(vq->iov, vq->hdr, vhost_hlen, in);
        else
            /* Copy the header for use in VIRTIO_NET_F_MRG_RXBUF:
             * needed because sendmsg can modify msg_iov. */
            copy_iovec_hdr(vq->iov, vq->hdr, sock_hlen, in);
        msg.msg_iovlen = in;
        err = sock->ops->recvmsg(NULL, sock, &msg,
                                 sock_len, MSG_DONTWAIT | MSG_TRUNC); /* the packet is received into vq->iov, i.e. the guest buffers */
        /* Userspace might have consumed the packet meanwhile:
         * it's not supposed to do this usually, but might be hard
         * to prevent. Discard data we got (if any) and keep going. */
        if (unlikely(err != sock_len)) {
            pr_debug("Discarded rx packet: "
                     " len %d, expected %zd\n", err, sock_len);
            vhost_discard_vq_desc(vq, headcount); /* roll back the descriptors we consumed */
            continue;
        }
        if (unlikely(vhost_hlen) &&
            memcpy_toiovecend(vq->hdr, (unsigned char *)&hdr, 0,
                              vhost_hlen)) {
            vq_err(vq, "Unable to write vnet_hdr at addr %p\n",
                   vq->iov->iov_base);
            break;
        }
        /* TODO: Should check and handle checksum. */
        if (likely(mergeable) &&
            memcpy_toiovecend(vq->hdr, (unsigned char *)&headcount,
                              offsetof(typeof(hdr), num_buffers),
                              sizeof hdr.num_buffers)) {
            vq_err(vq, "Failed num_buffers write");
            vhost_discard_vq_desc(vq, headcount);
            break;
        }
        vhost_add_used_and_signal_n(&net->dev, vq, vq->heads,
                                    headcount); /* add several vring_used_elems and notify the frontend */
        if (unlikely(vq_log))
            vhost_log_write(vq, vq_log, log, vhost_len);
        total_len += vhost_len;
        if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
            vhost_poll_queue(&vq->poll); /* quota exceeded: requeue and wait; note this queues the vq's poll, so the next run enters via handle_rx_kick */
            break;
        }
    }

    mutex_unlock(&vq->mutex);
}
```