
vhost, a virtio backend: the virtio data flow

vhost is an alternative virtio backend scheme that bypasses qemu on the data path, eliminating the context-switch overhead between qemu and the kernel; the gain is especially noticeable for network IO. vhost currently has two implementations, kernel-mode and user-mode; this article focuses on the kernel-mode vhost.

The vhost kernel module handles only the data plane; the control plane remains in qemu. The main vhost data structures are as follows:

struct vhost_dev {
    MemoryListener memory_listener;      /* set of callbacks invoked on physical-memory operations */
    struct vhost_memory *mem;
    int n_mem_sections;
    MemoryRegionSection *mem_sections;
    struct vhost_virtqueue *vqs;         /* array of vhost_virtqueues and its size (nvqs) */
    int nvqs;
    /* the first virtqueue which would be used by this vhost dev */
    int vq_index;
    unsigned long long features;         /* features supported by the vhost device */
    unsigned long long acked_features;   /* features acked by the guest */
    unsigned long long backend_features; /* features supported by the backend, e.g. a tap device */
    bool started;
    bool log_enabled;
    vhost_log_chunk_t *log;
    unsigned long long log_size;
    Error *migration_blocker;
    bool force;
    bool memory_changed;
    hwaddr mem_changed_start_addr;
    hwaddr mem_changed_end_addr;
    const VhostOps *vhost_ops;           /* VhostOps differ for the kernel and user vhost variants; the kernel one ends in ioctl calls */
    void *opaque;
};

struct vhost_virtqueue {
    int kick;
    int call;
    void *desc;
    void *avail;
    void *used;
    int num;
    unsigned long long used_phys;
    unsigned used_size;
    void *ring;
    unsigned long long ring_phys;
    unsigned ring_size;
    EventNotifier masked_notifier;
};
The vhost memory layout is likewise described by a set of vhost_memory_region entries:

struct vhost_memory_region {
    __u64 guest_phys_addr;
    __u64 memory_size;   /* bytes */
    __u64 userspace_addr;
    __u64 flags_padding; /* No flags are currently specified. */
};

/* All region addresses and sizes must be 4K aligned. */
#define VHOST_PAGE_SIZE 0x1000

struct vhost_memory {
    __u32 nregions;
    __u32 padding;
    struct vhost_memory_region regions[0];
};
The vhost control plane is driven by qemu, which issues ioctls against the vhost_xxx kernel module.
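Seen from userspace, the whole control-plane setup is just a handful of ioctls on the vhost fd. The sketch below shows the typical order, based on the vhost uapi in <linux/vhost.h>; error handling and the struct contents are elided, and this is not qemu's literal code:

#include <fcntl.h>
#include <sys/ioctl.h>
#include <linux/vhost.h>

/* Sketch: the control-plane ioctl sequence qemu drives against /dev/vhost-net. */
void setup_vhost(struct vhost_memory *mem, struct vhost_vring_state *state,
                 struct vhost_vring_addr *addr, struct vhost_vring_file *kick,
                 struct vhost_vring_file *call, struct vhost_vring_file *backend)
{
    int vhost_fd = open("/dev/vhost-net", O_RDWR);

    ioctl(vhost_fd, VHOST_SET_OWNER);                /* bind this process, spawn the vhost worker */
    ioctl(vhost_fd, VHOST_SET_MEM_TABLE, mem);       /* guest memory layout (struct vhost_memory) */
    ioctl(vhost_fd, VHOST_SET_VRING_NUM, state);     /* ring size, once per virtqueue */
    ioctl(vhost_fd, VHOST_SET_VRING_ADDR, addr);     /* desc/avail/used ring addresses */
    ioctl(vhost_fd, VHOST_SET_VRING_KICK, kick);     /* guest->host eventfd */
    ioctl(vhost_fd, VHOST_SET_VRING_CALL, call);     /* host->guest eventfd */
    ioctl(vhost_fd, VHOST_NET_SET_BACKEND, backend); /* attach the tap fd (vhost-net specific) */
}

In the kernel, these requests land in vhost_dev_ioctl, e.g.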

long vhost_dev_ioctl(struct vhost_dev *d, unsigned int ioctl, unsigned long arg)
{
    void __user *argp = (void __user *)arg;
    struct file *eventfp, *filep = NULL;
    struct eventfd_ctx *ctx = NULL;
    u64 p;
    long r;
    int i, fd;

    /* If you are not the owner, you can become one */
    if (ioctl == VHOST_SET_OWNER) {
        r = vhost_dev_set_owner(d);
        goto done;
    }

    /* You must be the owner to do anything else */
    r = vhost_dev_check_owner(d);
    if (r)
        goto done;

    switch (ioctl) {
    case VHOST_SET_MEM_TABLE:
        r = vhost_set_memory(d, argp);
        break;
    ...
    default:
        r = vhost_set_vring(d, ioctl, argp);
        break;
    }
done:
    return r;
}

VHOST_SET_OWNER ties the qemu process of the current guest to a vhost kernel thread:

/* Caller should have device mutex */
static long vhost_dev_set_owner(struct vhost_dev *dev)
{
    struct task_struct *worker;
    int err;

    /* Is there an owner already? */
    if (dev->mm) {
        err = -EBUSY;
        goto err_mm;
    }

    /* No owner, become one */
    dev->mm = get_task_mm(current); /* grab the qemu process's mm_struct, i.e. the guest's memory layout */
    worker = kthread_create(vhost_worker, dev, "vhost-%d", current->pid); /* create the vhost worker thread */
    if (IS_ERR(worker)) {
        err = PTR_ERR(worker);
        goto err_worker;
    }

    dev->worker = worker;
    wake_up_process(worker); /* avoid contributing to loadavg */

    err = vhost_attach_cgroups(dev);
    if (err)
        goto err_cgroup;

    err = vhost_dev_alloc_iovecs(dev); /* allocate the iovec arrays for each vhost_virtqueue */
    if (err)
        goto err_cgroup;

    return 0;
err_cgroup:
    kthread_stop(worker);
    dev->worker = NULL;
err_worker:
    if (dev->mm)
        mmput(dev->mm);
    dev->mm = NULL;
err_mm:
    return err;
}
VHOST_SET_MEM_TABLE initializes the vhost_memory member of the vhost_dev:
static long vhost_set_memory(struct vhost_dev *d, struct vhost_memory __user *m)
{
    struct vhost_memory mem, *newmem, *oldmem;
    unsigned long size = offsetof(struct vhost_memory, regions);

    if (copy_from_user(&mem, m, size))
        return -EFAULT;
    if (mem.padding)
        return -EOPNOTSUPP;
    if (mem.nregions > VHOST_MEMORY_MAX_NREGIONS)
        return -E2BIG;

    newmem = kmalloc(size + mem.nregions * sizeof *m->regions, GFP_KERNEL); /* allocate the array of vhost_memory_regions */
    if (!newmem)
        return -ENOMEM;

    memcpy(newmem, &mem, size);
    if (copy_from_user(newmem->regions, m->regions,
                       mem.nregions * sizeof *m->regions)) {
        kfree(newmem);
        return -EFAULT;
    }

    if (!memory_access_ok(d, newmem, vhost_has_feature(d, VHOST_F_LOG_ALL))) {
        kfree(newmem);
        return -EFAULT;
    }
    oldmem = d->memory;
    rcu_assign_pointer(d->memory, newmem);
    synchronize_rcu();
    kfree(oldmem);
    return 0;
}
VHOST_GET_FEATURES and VHOST_SET_FEATURES read and write the features vhost supports; currently only the vhost_net module uses them:
enum {
    VHOST_FEATURES = (1ULL << VIRTIO_F_NOTIFY_ON_EMPTY) |
                     (1ULL << VIRTIO_RING_F_INDIRECT_DESC) |
                     (1ULL << VIRTIO_RING_F_EVENT_IDX) |
                     (1ULL << VHOST_F_LOG_ALL) |
                     (1ULL << VHOST_NET_F_VIRTIO_NET_HDR) |
                     (1ULL << VIRTIO_NET_F_MRG_RXBUF),
};

static long vhost_net_ioctl(struct file *f, unsigned int ioctl,
                            unsigned long arg)
{
    ....
    case VHOST_GET_FEATURES:
        features = VHOST_FEATURES;
        if (copy_to_user(featurep, &features, sizeof features))
            return -EFAULT;
        return 0;
    case VHOST_SET_FEATURES:
        if (copy_from_user(&features, featurep, sizeof features))
            return -EFAULT;
        if (features & ~VHOST_FEATURES)
            return -EOPNOTSUPP;
        return vhost_net_set_features(n, features);
    ....
}
VHOST_SET_VRING_CALL sets up the irqfd used to inject interrupts into the guest.

VHOST_SET_VRING_KICK sets up the ioeventfd that receives guest notifications:

case VHOST_SET_VRING_KICK:
    if (copy_from_user(&f, argp, sizeof f)) {
        r = -EFAULT;
        break;
    }
    eventfp = f.fd == -1 ? NULL : eventfd_fget(f.fd);
    if (IS_ERR(eventfp)) {
        r = PTR_ERR(eventfp);
        break;
    }
    if (eventfp != vq->kick) { /* a new eventfp: stop polling the old vq->kick and start polling eventfp */
        pollstop = filep = vq->kick;
        pollstart = vq->kick = eventfp;
    } else
        filep = eventfp; /* same fd: no stop & start needed */
    break;
case VHOST_SET_VRING_CALL:
    if (copy_from_user(&f, argp, sizeof f)) {
        r = -EFAULT;
        break;
    }
    eventfp = f.fd == -1 ? NULL : eventfd_fget(f.fd);
    if (IS_ERR(eventfp)) {
        r = PTR_ERR(eventfp);
        break;
    }
    if (eventfp != vq->call) { /* a new eventfp: replace vq->call and its eventfd_ctx */
        filep = vq->call;
        ctx = vq->call_ctx;
        vq->call = eventfp;
        vq->call_ctx = eventfp ?
            eventfd_ctx_fileget(eventfp) : NULL;
    } else
        filep = eventfp;
    break;
...
if (pollstop && vq->handle_kick)
    vhost_poll_stop(&vq->poll);
if (ctx)
    eventfd_ctx_put(ctx); /* after pollstop, drop the ctx we replaced */
if (filep)
    fput(filep); /* after pollstop, drop the file we replaced */
if (pollstart && vq->handle_kick)
    vhost_poll_start(&vq->poll, vq->kick);
mutex_unlock(&vq->mutex);
if (pollstop && vq->handle_kick)
    vhost_poll_flush(&vq->poll);
return r;
Now for the vhost data path. vhost and the kvm module talk to each other through eventfds: the kick event in the guest-to-host direction is carried by an ioeventfd, and the call event in the host-to-guest direction is carried by an irqfd.
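Before diving in, here is the eventfd primitive in isolation, as a minimal userspace sketch (not qemu code): a write increments the counter and wakes whoever is blocked in read/poll on the other side, which is exactly the semantics both kick and call rely on.

#include <sys/eventfd.h>
#include <stdint.h>
#include <unistd.h>

int main(void)
{
    int efd = eventfd(0, 0);   /* counter starts at 0 */
    uint64_t v = 1;

    write(efd, &v, sizeof(v)); /* the "kick": add 1 to the counter, wake readers */
    read(efd, &v, sizeof(v));  /* the "wait": returns the accumulated count and resets it */
    close(efd);
    return 0;
}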

Host-to-guest direction

The host first fills the used ring; then, if KVM_IRQFD was set up successfully, the kvm module injects the interrupt into the guest via the irqfd. qemu configures kvm's irqfd through virtio_pci_set_guest_notifiers -> kvm_virtio_pci_vector_use -> kvm_virtio_pci_irqfd_use -> kvm_irqchip_add_irqfd_notifier -> kvm_irqchip_assign_irqfd, which ends in a kvm_vm_ioctl passing in the write fd and, optionally, a read fd.
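kvm_irqchip_assign_irqfd itself boils down to a KVM_IRQFD ioctl on the VM fd, roughly as sketched below; struct kvm_irqfd comes from the kvm uapi, while notifier, virq and assign stand in for the caller's state, so this is the shape of the call rather than qemu's literal code:

/* Sketch: pair an eventfd with a guest interrupt (gsi). Once registered,
 * a write to the fd injects the interrupt without a round trip to qemu. */
struct kvm_irqfd irqfd = {
    .fd  = event_notifier_get_fd(notifier), /* vhost signals this fd (the "call" fd) */
    .gsi = virq,                            /* interrupt number to inject */
};

if (!assign)
    irqfd.flags |= KVM_IRQFD_FLAG_DEASSIGN; /* the same ioctl tears the mapping down */

kvm_vm_ioctl(kvm_state, KVM_IRQFD, &irqfd);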

static int kvm_virtio_pci_vector_use(VirtIOPCIProxy *proxy, int nvqs)
{
    PCIDevice *dev = &proxy->pci_dev;
    VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus);
    VirtioDeviceClass *k = VIRTIO_DEVICE_GET_CLASS(vdev);
    unsigned int vector;
    int ret, queue_no;
    MSIMessage msg;

    for (queue_no = 0; queue_no < nvqs; queue_no++) {
        if (!virtio_queue_get_num(vdev, queue_no)) {
            break;
        }
        vector = virtio_queue_vector(vdev, queue_no);
        if (vector >= msix_nr_vectors_allocated(dev)) {
            continue;
        }
        msg = msix_get_message(dev, vector);
        ret = kvm_virtio_pci_vq_vector_use(proxy, queue_no, vector, msg);
        if (ret < 0) {
            goto undo;
        }
        /* If guest supports masking, set up irqfd now.
         * Otherwise, delay until unmasked in the frontend.
         */
        if (k->guest_notifier_mask) {
            ret = kvm_virtio_pci_irqfd_use(proxy, queue_no, vector);
            if (ret < 0) {
                kvm_virtio_pci_vq_vector_release(proxy, vector);
                goto undo;
            }
        }
    }
    return 0;

undo:
    while (--queue_no >= 0) {
        vector = virtio_queue_vector(vdev, queue_no);
        if (vector >= msix_nr_vectors_allocated(dev)) {
            continue;
        }
        if (k->guest_notifier_mask) {
            kvm_virtio_pci_irqfd_release(proxy, queue_no, vector);
        }
        kvm_virtio_pci_vq_vector_release(proxy, vector);
    }
    return ret;
}
If no irqfd was set up, the guest-notifier fd instead wakes the qemu process waiting on it, entering the registered handler virtio_queue_guest_notifier_read, which calls virtio_irq and eventually virtio_pci_notify:

static void virtio_queue_guest_notifier_read(EventNotifier *n)
{
    VirtQueue *vq = container_of(n, VirtQueue, guest_notifier);
    if (event_notifier_test_and_clear(n)) {
        virtio_irq(vq);
    }
}

void virtio_irq(VirtQueue *vq)
{
    trace_virtio_irq(vq);
    vq->vdev->isr |= 0x01;
    virtio_notify_vector(vq->vdev, vq->vector);
}

static void virtio_notify_vector(VirtIODevice *vdev, uint16_t vector)
{
    BusState *qbus = qdev_get_parent_bus(DEVICE(vdev));
    VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);

    if (k->notify) {
        k->notify(qbus->parent, vector);
    }
}

static void virtio_pci_notify(DeviceState *d, uint16_t vector)
{
    VirtIOPCIProxy *proxy = to_virtio_pci_proxy_fast(d);

    if (msix_enabled(&proxy->pci_dev))
        msix_notify(&proxy->pci_dev, vector);
    else {
        VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus);
        pci_set_irq(&proxy->pci_dev, vdev->isr & 1);
    }
}
The whole sequence is shown in the figure (taken from http://royluo.org/2014/08/22/vhost/).

Guest-to-host direction

The guest kicks by writing to the device's PCI space; this triggers a VMEXIT, which kvm intercepts and converts into a notification on the registered fd.
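For reference, the guest half of the kick is tiny: in the legacy virtio-pci driver it is just a 16-bit PIO write of the queue index to the notify register, along these lines (a sketch after the Linux guest driver; the exact structure layout varies by kernel version):

/* Guest side of the kick (legacy virtio-pci): write the queue index to
 * VIRTIO_PCI_QUEUE_NOTIFY. This PIO access causes the VMEXIT, which kvm
 * matches against the registered ioeventfd and turns into an
 * eventfd_signal instead of a trip out to qemu. */
static void vp_notify(struct virtqueue *vq)
{
    struct virtio_pci_device *vp_dev = to_vp_device(vq->vdev);

    iowrite16(vq->index, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_NOTIFY);
}

On the qemu side, the fd for that address range is registered with kvm through the MemoryListeners set up in kvm_init: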

kvm_init:
    memory_listener_register(&kvm_memory_listener, &address_space_memory);
    memory_listener_register(&kvm_io_listener, &address_space_io);

static MemoryListener kvm_memory_listener = {
    .region_add = kvm_region_add,
    .region_del = kvm_region_del,
    .log_start = kvm_log_start,
    .log_stop = kvm_log_stop,
    .log_sync = kvm_log_sync,
    .log_global_start = kvm_log_global_start,
    .log_global_stop = kvm_log_global_stop,
    .eventfd_add = kvm_mem_ioeventfd_add,
    .eventfd_del = kvm_mem_ioeventfd_del,
    .coalesced_mmio_add = kvm_coalesce_mmio_region,
    .coalesced_mmio_del = kvm_uncoalesce_mmio_region,
    .priority = 10,
};

static MemoryListener kvm_io_listener = {
    .eventfd_add = kvm_io_ioeventfd_add,
    .eventfd_del = kvm_io_ioeventfd_del,
    .priority = 10,
};

static void kvm_io_ioeventfd_add(MemoryListener *listener,
                                 MemoryRegionSection *section,
                                 bool match_data, uint64_t data,
                                 EventNotifier *e)
{
    int fd = event_notifier_get_fd(e);
    int r;

    r = kvm_set_ioeventfd_pio(fd, section->offset_within_address_space,
                              data, true, int128_get64(section->size),
                              match_data);
    if (r < 0) {
        fprintf(stderr, "%s: error adding ioeventfd: %s\n",
                __func__, strerror(-r));
        abort();
    }
}
kvm_io_ioeventfd_add ends up calling kvm_set_ioeventfd_pio, which issues kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &kick) to enter kvm.ko:

static int kvm_set_ioeventfd_pio(int fd, uint16_t addr, uint16_t val,
                                 bool assign, uint32_t size, bool datamatch)
{
    struct kvm_ioeventfd kick = {
        .datamatch = datamatch ? val : 0,
        .addr = addr,
        .flags = KVM_IOEVENTFD_FLAG_PIO,
        .len = size,
        .fd = fd,
    };
    int r;

    if (!kvm_enabled()) {
        return -ENOSYS;
    }
    if (datamatch) {
        kick.flags |= KVM_IOEVENTFD_FLAG_DATAMATCH;
    }
    if (!assign) {
        kick.flags |= KVM_IOEVENTFD_FLAG_DEASSIGN;
    }
    r = kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &kick);
    if (r < 0) {
        return r;
    }
    return 0;
}
The KVM_IOEVENTFD ioctl lands in kvm's kvm_ioeventfd function, which dispatches to kvm_assign_ioeventfd or kvm_deassign_ioeventfd:

int
kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
{
    if (args->flags & KVM_IOEVENTFD_FLAG_DEASSIGN)
        return kvm_deassign_ioeventfd(kvm, args);

    return kvm_assign_ioeventfd(kvm, args);
}

static int
kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
{
    int pio = args->flags & KVM_IOEVENTFD_FLAG_PIO;
    enum kvm_bus bus_idx = pio ? KVM_PIO_BUS : KVM_MMIO_BUS;
    struct _ioeventfd *p;        /* ioeventfd: translate a PIO/MMIO memory write to an eventfd signal. */
    struct eventfd_ctx *eventfd; /* mostly wait_queue_head_t */
    int ret;

    /* must be natural-word sized */
    switch (args->len) {
    case 1:
    case 2:
    case 4:
    case 8:
        break;
    default:
        return -EINVAL;
    }

    /* check for range overflow */
    if (args->addr + args->len < args->addr)
        return -EINVAL;

    /* check for extra flags that we don't understand */
    if (args->flags & ~KVM_IOEVENTFD_VALID_FLAG_MASK)
        return -EINVAL;

    eventfd = eventfd_ctx_fdget(args->fd); /* file->private_data */
    if (IS_ERR(eventfd))
        return PTR_ERR(eventfd);

    p = kzalloc(sizeof(*p), GFP_KERNEL); /* allocate an _ioeventfd and tie the address, length and eventfd_ctx together */
    if (!p) {
        ret = -ENOMEM;
        goto fail;
    }

    INIT_LIST_HEAD(&p->list);
    p->addr = args->addr;
    p->length = args->len;
    p->eventfd = eventfd;

    /* The datamatch feature is optional, otherwise this is a wildcard */
    if (args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH)
        p->datamatch = args->datamatch;
    else
        p->wildcard = true;

    mutex_lock(&kvm->slots_lock);

    /* Verify that there isn't a match already */
    if (ioeventfd_check_collision(kvm, p)) {
        ret = -EEXIST;
        goto unlock_fail;
    }

    kvm_iodevice_init(&p->dev, &ioeventfd_ops);

    ret = kvm_io_bus_register_dev(kvm, bus_idx, &p->dev); /* register on kvm's pio or mmio bus */
    if (ret < 0)
        goto unlock_fail;

    list_add_tail(&p->list, &kvm->ioeventfds); /* add to kvm.ko's ioeventfds list */

    mutex_unlock(&kvm->slots_lock);

    return 0;

unlock_fail:
    mutex_unlock(&kvm->slots_lock);
fail:
    kfree(p);
    eventfd_ctx_put(eventfd);

    return ret;
}
In kvm_assign_ioeventfd, a pio/mmio address range is registered together with an fd; from then on, a VMEXIT caused by an access to that range is converted inside kvm.ko into an event notification on the fd:
static const struct kvm_io_device_ops ioeventfd_ops = {
    .write      = ioeventfd_write,
    .destructor = ioeventfd_destructor,
};

/* MMIO/PIO writes trigger an event if the addr/val match */
static int
ioeventfd_write(struct kvm_io_device *this, gpa_t addr, int len,
                const void *val)
{
    struct _ioeventfd *p = to_ioeventfd(this);

    if (!ioeventfd_in_range(p, addr, len, val))
        return -EOPNOTSUPP;

    eventfd_signal(p->eventfd, 1);
    return 0;
}
In the end, eventfd_signal delivers the notification and wakes up the vhost worker thread; the overall flow is shown in the figure below.
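The waking itself goes through vhost_poll: the poll callback runs in the context of whoever signalled the eventfd and merely queues work for the vhost worker thread. Roughly, from drivers/vhost/vhost.c of the same 3.x era:

/* Runs in the signaller's context when the kick fd becomes readable;
 * it just hands the virtqueue's work item to the vhost worker. */
static int vhost_poll_wakeup(wait_queue_t *wait, unsigned mode, int sync,
                             void *key)
{
    struct vhost_poll *poll = container_of(wait, struct vhost_poll, wait);

    if (!((unsigned long)key & poll->mask))
        return 0;

    vhost_poll_queue(poll); /* -> vhost_work_queue(): add to dev->work_list, wake_up_process(dev->worker) */
    return 0;
}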



vhost's control plane and data plane are shown in the figure below.





Finally, let's take vhost-net as an example and walk through vhost network initialization and the packet TX/RX paths.

When qemu is started with -netdev tap,...,vhost=on, the network device's backend is vhost-based: net_init_tap calls net_init_tap_one for every queue to set vhost up, and the actual initialization is done in vhost_net_init.
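An illustrative invocation looks like this (the netdev/device ids are arbitrary names, not required values):

qemu-system-x86_64 ... \
    -netdev tap,id=hostnet0,vhost=on \
    -device virtio-net-pci,netdev=hostnet0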

typedef struct VhostNetOptions {
    VhostBackendType backend_type; /* vhost kernel or userspace */
    NetClientState *net_backend;   /* TAPState device */
    void *opaque;                  /* ioctl vhostfd, /dev/vhost-net */
    bool force;
} VhostNetOptions;

static int net_init_tap_one(const NetdevTapOptions *tap, NetClientState *peer,
                            const char *model, const char *name,
                            const char *ifname, const char *script,
                            const char *downscript, const char *vhostfdname,
                            int vnet_hdr, int fd)
{
    ...
    if (tap->has_vhost ? tap->vhost :
        vhostfdname || (tap->has_vhostforce && tap->vhostforce)) {
        VhostNetOptions options;

        options.backend_type = VHOST_BACKEND_TYPE_KERNEL;
        options.net_backend = &s->nc;
        options.force = tap->has_vhostforce && tap->vhostforce;

        if ((tap->has_vhostfd || tap->has_vhostfds)) {
            vhostfd = monitor_handle_fd_param(cur_mon, vhostfdname);
            if (vhostfd == -1) {
                return -1;
            }
        } else {
            vhostfd = open("/dev/vhost-net", O_RDWR); /* open /dev/vhost-net for ioctl usage */
            if (vhostfd < 0) {
                error_report("tap: open vhost char device failed: %s",
                             strerror(errno));
                return -1;
            }
        }
        qemu_set_cloexec(vhostfd);
        options.opaque = (void *)(uintptr_t)vhostfd;

        s->vhost_net = vhost_net_init(&options); /* initialize struct vhost_net */
        if (!s->vhost_net) {
            error_report("vhost-net requested but could not be initialized");
            return -1;
        }
    }
    ...
}

struct vhost_net {
    struct vhost_dev dev;
    struct vhost_virtqueue vqs[2];
    int backend;
    NetClientState *nc;
};

struct vhost_net *vhost_net_init(VhostNetOptions *options)
{
    int r;
    bool backend_kernel = options->backend_type == VHOST_BACKEND_TYPE_KERNEL;
    struct vhost_net *net = g_malloc(sizeof *net);

    if (!options->net_backend) {
        fprintf(stderr, "vhost-net requires net backend to be setup\n");
        goto fail;
    }

    if (backend_kernel) {
        r = vhost_net_get_fd(options->net_backend);
        if (r < 0) {
            goto fail;
        }
        net->dev.backend_features = qemu_has_vnet_hdr(options->net_backend)
            ? 0 : (1 << VHOST_NET_F_VIRTIO_NET_HDR);
        net->backend = r; /* backend is the fd of the NetClientState */
    } else {
        net->dev.backend_features = 0;
        net->backend = -1;
    }
    net->nc = options->net_backend; /* nc is the NetClientState */

    net->dev.nvqs = 2;       /* a TX queue and an RX queue */
    net->dev.vqs = net->vqs; /* vhost_dev and vhost_net share the vhost_virtqueues */

    r = vhost_dev_init(&net->dev, options->opaque,
                       options->backend_type, options->force); /* initialize vhost_dev; the VHOST_SET_OWNER ioctl in here creates the vhost kthread */
    if (r < 0) {
        goto fail;
    }
    if (!qemu_has_vnet_hdr_len(options->net_backend,
                               sizeof(struct virtio_net_hdr_mrg_rxbuf))) {
        net->dev.features &= ~(1 << VIRTIO_NET_F_MRG_RXBUF);
    }
    if (backend_kernel) {
        if (~net->dev.features & net->dev.backend_features) {
            fprintf(stderr, "vhost lacks feature mask %" PRIu64
                    " for backend\n",
                    (uint64_t)(~net->dev.features & net->dev.backend_features));
            vhost_dev_cleanup(&net->dev);
            goto fail;
        }
    }

    /* Set sane init value. Override when guest acks. */
    vhost_net_ack_features(net, 0);
    return net;

fail:
    g_free(net);
    return NULL;
}
Once the guest has booted, qemu configures vhost accordingly. virtio_net_set_status enables or disables the virtio-net device and its queues; it calls vhost_net_start to bring the vhost queues up and vhost_net_stop to tear them down.

int vhost_net_start(VirtIODevice *dev, NetClientState *ncs,
                    int total_queues)
{
    BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(dev)));
    VirtioBusState *vbus = VIRTIO_BUS(qbus);
    VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(vbus);
    int r, e, i;

    if (!vhost_net_device_endian_ok(dev)) {
        error_report("vhost-net does not support cross-endian");
        r = -ENOSYS;
        goto err;
    }

    if (!k->set_guest_notifiers) {
        error_report("binding does not support guest notifiers");
        r = -ENOSYS;
        goto err;
    }

    for (i = 0; i < total_queues; i++) {
        vhost_net_set_vq_index(get_vhost_net(ncs[i].peer), i * 2);
    }

    /* calls virtio_pci_set_guest_notifiers to configure the irqfds etc.;
     * qemu goes through here even when vhost is not enabled */
    r = k->set_guest_notifiers(qbus->parent, total_queues * 2, true);
    if (r < 0) {
        error_report("Error binding guest notifier: %d", -r);
        goto err;
    }

    /* with a multiqueue tun there are several NetClientStates, one per tap
     * queue, and each NetClientState has its own vhost_net structure */
    for (i = 0; i < total_queues; i++) {
        r = vhost_net_start_one(get_vhost_net(ncs[i].peer), dev); /* call vhost_net_start_one for each queue */
        if (r < 0) {
            goto err_start;
        }
    }

    return 0;

err_start:
    while (--i >= 0) {
        vhost_net_stop_one(get_vhost_net(ncs[i].peer), dev);
    }
    e = k->set_guest_notifiers(qbus->parent, total_queues * 2, false);
    if (e < 0) {
        fprintf(stderr, "vhost guest notifier cleanup failed: %d\n", e);
        fflush(stderr);
    }
err:
    return r;
}
static int vhost_net_start_one(struct vhost_net *net,
                               VirtIODevice *dev)
{
    struct vhost_vring_file file = { };
    int r;

    if (net->dev.started) {
        return 0;
    }

    net->dev.nvqs = 2; /* vqs hold one TX virtqueue and one RX virtqueue */
    net->dev.vqs = net->vqs;

    r = vhost_dev_enable_notifiers(&net->dev, dev); /* enable the vhost ioeventfds: stop handling the guest's IO notifications in qemu and let vhost handle them instead */
    if (r < 0) {
        goto fail_notifiers;
    }

    r = vhost_dev_start(&net->dev, dev);
    if (r < 0) {
        goto fail_start;
    }

    if (net->nc->info->poll) {
        net->nc->info->poll(net->nc, false);
    }

    if (net->nc->info->type == NET_CLIENT_OPTIONS_KIND_TAP) {
        qemu_set_fd_handler(net->backend, NULL, NULL, NULL);
        file.fd = net->backend;
        for (file.index = 0; file.index < net->dev.nvqs; ++file.index) {
            const VhostOps *vhost_ops = net->dev.vhost_ops;
            r = vhost_ops->vhost_call(&net->dev, VHOST_NET_SET_BACKEND,
                                      &file);
            if (r < 0) {
                r = -errno;
                goto fail;
            }
        }
    }
    return 0;

fail:
    file.fd = -1;
    if (net->nc->info->type == NET_CLIENT_OPTIONS_KIND_TAP) {
        while (file.index-- > 0) {
            const VhostOps *vhost_ops = net->dev.vhost_ops;
            int r = vhost_ops->vhost_call(&net->dev, VHOST_NET_SET_BACKEND,
                                          &file);
            assert(r >= 0);
        }
    }
    if (net->nc->info->poll) {
        net->nc->info->poll(net->nc, true);
    }
    vhost_dev_stop(&net->dev, dev);
fail_start:
    vhost_dev_disable_notifiers(&net->dev, dev);
fail_notifiers:
    return r;
}
The relationships between the vhost-net data structures are shown in the figure below.

Let's look at the kernel's definition of vhost_net, e.g.

static const struct file_operations vhost_net_fops = {
    .owner          = THIS_MODULE,
    .release        = vhost_net_release,
    .unlocked_ioctl = vhost_net_ioctl,
#ifdef CONFIG_COMPAT
    .compat_ioctl   = vhost_net_compat_ioctl,
#endif
    .open           = vhost_net_open,
};

static struct miscdevice vhost_net_misc = {
    MISC_DYNAMIC_MINOR,
    "vhost-net",
    &vhost_net_fops,
};

enum {
    VHOST_NET_VQ_RX = 0,
    VHOST_NET_VQ_TX = 1,
    VHOST_NET_VQ_MAX = 2,
};

enum vhost_net_poll_state {
    VHOST_NET_POLL_DISABLED = 0,
    VHOST_NET_POLL_STARTED = 1,
    VHOST_NET_POLL_STOPPED = 2,
};

struct vhost_net {
    struct vhost_dev dev;
    struct vhost_virtqueue vqs[VHOST_NET_VQ_MAX]; /* vhost's virtqueue wrappers; their handle_kick callbacks are woken by the ioeventfd */
    struct vhost_poll poll[VHOST_NET_VQ_MAX];     /* two vhost_poll structures for the NetClientState's socket IO */
    /* Tells us whether we are polling a socket for TX.
     * We only do this when socket buffer fills up.
     * Protected by tx vq lock. */
    enum vhost_net_poll_state tx_poll_state;
};

static int vhost_net_open(struct inode *inode, struct file *f)
{
    struct vhost_net *n = kmalloc(sizeof *n, GFP_KERNEL);
    struct vhost_dev *dev;
    int r;

    if (!n)
        return -ENOMEM;

    dev = &n->dev;
    n->vqs[VHOST_NET_VQ_TX].handle_kick = handle_tx_kick; /* callback for the TX virtqueue's kick fd */
    n->vqs[VHOST_NET_VQ_RX].handle_kick = handle_rx_kick; /* callback for the RX virtqueue's kick fd */
    r = vhost_dev_init(dev, n->vqs, VHOST_NET_VQ_MAX);
    if (r < 0) {
        kfree(n);
        return r;
    }

    vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, POLLOUT, dev); /* initialize vhost_net's TX vhost_poll */
    vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, POLLIN, dev);  /* initialize vhost_net's RX vhost_poll */
    n->tx_poll_state = VHOST_NET_POLL_DISABLED;

    f->private_data = n;

    return 0;
}

handle_tx_kick/handle_rx_kick are implemented exactly like handle_tx_net/handle_rx_net, so why two pairs of functions? The code below answers this, but here is the short version: handle_tx_kick/handle_rx_kick are the callbacks that block on the TX/RX queue's kick fd, while handle_tx_net/handle_rx_net are the callbacks that block on vhost_net's TX/RX poll. For both TX and RX, a packet's journey is a two-stage process, e.g.

For TX, the virtqueue's kick fd fires first; then the vring buffers are handed over and finally sent out through the NetClientState's socket fd. The socket may run out of send buffer, or this round's quota may be exhausted, in which case vhost has to block polling on the socket fd. RX is symmetric: one stage blocks on the socket fd, the other on the virtqueue's kick fd.
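The piece gluing an fd to a handler in both stages is struct vhost_poll: each vhost_virtqueue embeds one for its kick fd (whose work runs handle_tx_kick/handle_rx_kick), and vhost_net keeps two more for the socket fd (whose work runs handle_tx_net/handle_rx_net). Roughly, from drivers/vhost/vhost.h of that era:

struct vhost_poll {
    poll_table         table;
    wait_queue_head_t *wqh;  /* the wait queue of the polled fd */
    wait_queue_t       wait; /* our entry on it; fires vhost_poll_wakeup */
    struct vhost_work  work; /* runs the registered handler in the vhost worker */
    unsigned long      mask; /* POLLIN or POLLOUT */
    struct vhost_dev  *dev;
};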


As mentioned earlier, in vhost_virtqueue_start qemu takes the rfd of the VirtQueue's host_notifier, registers it with kvm.ko as an ioeventfd (so that kvm.ko will later signal it through eventfd_signal), and hands it to the vhost module through VHOST_SET_VRING_KICK. vhost associates the fd with vhost_virtqueue->kick and eventually calls vhost_poll_start to block polling on it.

When the guest sends a packet, the ioeventfd fires the vhost_virtqueue's kick fd. The POLLIN event causes vhost_poll_wakeup to run, which wakes the vhost worker thread; the thread then invokes the registered handle_kick function, i.e. handle_tx_kick:

static void handle_tx_kick(struct vhost_work *work)
{
    struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue,
                                              poll.work);
    struct vhost_net *net = container_of(vq->dev, struct vhost_net, dev);

    handle_tx(net);
}

static void handle_tx(struct vhost_net *net)
{
    struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_TX];
    unsigned out, in, s;
    int head;
    struct msghdr msg = {
        .msg_name = NULL,
        .msg_namelen = 0,
        .msg_control = NULL,
        .msg_controllen = 0,
        .msg_iov = vq->iov,
        .msg_flags = MSG_DONTWAIT,
    };
    size_t len, total_len = 0;
    int err, wmem;
    size_t hdr_size;
    struct vhost_ubuf_ref *uninitialized_var(ubufs);
    bool zcopy;

    struct socket *sock = rcu_dereference(vq->private_data); /* the NetClientState's socket is stashed in the vhost_virtqueue as private_data */
    if (!sock)
        return;

    wmem = atomic_read(&sock->sk->sk_wmem_alloc);
    if (wmem >= sock->sk->sk_sndbuf) { /* socket write memory already exceeds the send buffer */
        mutex_lock(&vq->mutex);
        tx_poll_start(net, sock); /* cannot send right now; block waiting on the socket */
        mutex_unlock(&vq->mutex);
        return;
    }

    mutex_lock(&vq->mutex);
    vhost_disable_notify(&net->dev, vq); /* disable virtqueue notifications via the VRING_USED_F_NO_NOTIFY flag */

    if (wmem < sock->sk->sk_sndbuf / 2)
        tx_poll_stop(net);
    hdr_size = vq->vhost_hlen;
    zcopy = vq->ubufs;

    for (;;) {
        /* Release DMAs done buffers first */
        if (zcopy)
            vhost_zerocopy_signal_used(vq);

        head = vhost_get_vq_desc(&net->dev, vq, vq->iov, /* copy avail descriptor contents starting at last_avail_idx */
                                 ARRAY_SIZE(vq->iov),
                                 &out, &in,
                                 NULL, NULL);
        /* On error, stop handling until the next kick. */
        if (unlikely(head < 0))
            break;
        /* Nothing new? Wait for eventfd to tell us they refilled. */
        if (head == vq->num) { /* vq->avail_idx == vq->last_avail_idx: no new buffers from the frontend */
            int num_pends;

            wmem = atomic_read(&sock->sk->sk_wmem_alloc);
            if (wmem >= sock->sk->sk_sndbuf * 3 / 4) {
                tx_poll_start(net, sock);
                set_bit(SOCK_ASYNC_NOSPACE, &sock->flags);
                break;
            }
            /* If more outstanding DMAs, queue the work.
             * Handle upend_idx wrap around
             */
            num_pends = likely(vq->upend_idx >= vq->done_idx) ?
                        (vq->upend_idx - vq->done_idx) :
                        (vq->upend_idx + UIO_MAXIOV - vq->done_idx);
            if (unlikely(num_pends > VHOST_MAX_PEND)) {
                tx_poll_start(net, sock);
                set_bit(SOCK_ASYNC_NOSPACE, &sock->flags);
                break;
            }
            if (unlikely(vhost_enable_notify(&net->dev, vq))) { /* re-enable the event notify flag */
                vhost_disable_notify(&net->dev, vq); /* vhost_enable_notify returning true means avail_idx changed meanwhile, so continue */
                continue;
            }
            break;
        }
        if (in) { /* TX descriptors should all be "out" */
            vq_err(vq, "Unexpected descriptor format for TX: "
                   "out %d, int %d\n", out, in);
            break;
        }
        /* Skip header. TODO: support TSO. */
        s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, out); /* hdr_size covers the VNET_HDR metadata, which carries no packet payload */
        msg.msg_iovlen = out;
        len = iov_length(vq->iov, out);
        /* Sanity check */
        if (!len) {
            vq_err(vq, "Unexpected header len for TX: "
                   "%zd expected %zd\n",
                   iov_length(vq->hdr, s), hdr_size);
            break;
        }
        /* use msg_control to pass vhost zerocopy ubuf info to skb */
        if (zcopy) {
            vq->heads[vq->upend_idx].id = head;
            if (len < VHOST_GOODCOPY_LEN) {
                /* copy don't need to wait for DMA done */
                vq->heads[vq->upend_idx].len =
                    VHOST_DMA_DONE_LEN;
                msg.msg_control = NULL;
                msg.msg_controllen = 0;
                ubufs = NULL;
            } else {
                struct ubuf_info *ubuf = &vq->ubuf_info[head];

                vq->heads[vq->upend_idx].len = len;
                ubuf->callback = vhost_zerocopy_callback;
                ubuf->arg = vq->ubufs;
                ubuf->desc = vq->upend_idx;
                msg.msg_control = ubuf;
                msg.msg_controllen = sizeof(ubuf);
                ubufs = vq->ubufs;
                kref_get(&ubufs->kref);
            }
            vq->upend_idx = (vq->upend_idx + 1) % UIO_MAXIOV;
        }
        /* TODO: Check specific error and bomb out unless ENOBUFS? */
        err = sock->ops->sendmsg(NULL, sock, &msg, len);
        if (unlikely(err < 0)) {
            if (zcopy) {
                if (ubufs)
                    vhost_ubuf_put(ubufs);
                vq->upend_idx = ((unsigned)vq->upend_idx - 1) %
                                UIO_MAXIOV;
            }
            vhost_discard_vq_desc(vq, 1); /* send failed: roll back last_avail_idx */
            if (err == -EAGAIN || err == -ENOBUFS)
                tx_poll_start(net, sock); /* block on vhost_net->poll and retry the send later */
            break;
        }
        if (err != len)
            pr_debug("Truncated TX packet: "
                     " len %d != %zd\n", err, len);
        if (!zcopy)
            vhost_add_used_and_signal(&net->dev, vq, head, 0); /* update the used ring side of the virtqueue, e.g. used_elem and last_used_idx */
        else
            vhost_zerocopy_signal_used(vq);
        total_len += len;
        if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
            vhost_poll_queue(&vq->poll); /* quota exhausted: requeue the work and wait to be scheduled again */
            break;
        }
    }

    mutex_unlock(&vq->mutex);
}
On the receive path, vhost first blocks on the NetClientState's socket, registered earlier via

vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, POLLIN, dev)

static void handle_rx_net(struct vhost_work *work)
{
    struct vhost_net *net = container_of(work, struct vhost_net,
                                         poll[VHOST_NET_VQ_RX].work);

    handle_rx(net);
}

static void handle_rx(struct vhost_net *net)
{
    struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_RX];
    unsigned uninitialized_var(in), log;
    struct vhost_log *vq_log;
    struct msghdr msg = {
        .msg_name = NULL,
        .msg_namelen = 0,
        .msg_control = NULL, /* FIXME: get and handle RX aux data. */
        .msg_controllen = 0,
        .msg_iov = vq->iov,
        .msg_flags = MSG_DONTWAIT,
    };
    struct virtio_net_hdr_mrg_rxbuf hdr = {
        .hdr.flags = 0,
        .hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE
    };
    size_t total_len = 0;
    int err, headcount, mergeable;
    size_t vhost_hlen, sock_hlen;
    size_t vhost_len, sock_len;

    struct socket *sock = rcu_dereference(vq->private_data);
    if (!sock)
        return;

    mutex_lock(&vq->mutex);
    vhost_disable_notify(&net->dev, vq); /* disable the virtqueue event-notify mechanism */
    vhost_hlen = vq->vhost_hlen;
    sock_hlen = vq->sock_hlen;

    vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ?
             vq->log : NULL;
    mergeable = vhost_has_feature(&net->dev, VIRTIO_NET_F_MRG_RXBUF);

    while ((sock_len = peek_head_len(sock->sk))) { /* length of the next packet */
        sock_len += sock_hlen;
        vhost_len = sock_len + vhost_hlen;
        headcount = get_rx_bufs(vq, vq->heads, vhost_len, /* grab avail descriptors from the virtqueue until */
                                &in, vq_log, &log,        /* their iovecs can hold one whole packet; in effect */
                                likely(mergeable) ? UIO_MAXIOV : 1); /* several vhost_get_vq_desc calls in a row */
        /* On error, stop handling until the next kick. */
        if (unlikely(headcount < 0))
            break;
        /* OK, now we need to know about added descriptors. */
        if (!headcount) {
            if (unlikely(vhost_enable_notify(&net->dev, vq))) {
                /* They have slipped one in as we were
                 * doing that: check again. */
                vhost_disable_notify(&net->dev, vq);
                continue;
            }
            /* Nothing new? Wait for eventfd to tell us
             * they refilled. */
            break;
        }
        /* We don't need to be notified again. */
        if (unlikely((vhost_hlen)))
            /* Skip header. TODO: support TSO. */
            move_iovec_hdr(vq->iov, vq->hdr, vhost_hlen, in);
        else
            /* Copy the header for use in VIRTIO_NET_F_MRG_RXBUF:
             * needed because sendmsg can modify msg_iov. */
            copy_iovec_hdr(vq->iov, vq->hdr, sock_hlen, in);
        msg.msg_iovlen = in;
        err = sock->ops->recvmsg(NULL, sock, &msg,
                                 sock_len, MSG_DONTWAIT | MSG_TRUNC); /* the packet is received into virtqueue->iov */
        /* Userspace might have consumed the packet meanwhile:
         * it's not supposed to do this usually, but might be hard
         * to prevent. Discard data we got (if any) and keep going. */
        if (unlikely(err != sock_len)) {
            pr_debug("Discarded rx packet: "
                     " len %d, expected %zd\n", err, sock_len);
            vhost_discard_vq_desc(vq, headcount); /* roll back the consumed avail descriptors (last_avail_idx) */
            continue;
        }
        if (unlikely(vhost_hlen) &&
            memcpy_toiovecend(vq->hdr, (unsigned char *)&hdr, 0,
                              vhost_hlen)) {
            vq_err(vq, "Unable to write vnet_hdr at addr %p\n",
                   vq->iov->iov_base);
            break;
        }
        /* TODO: Should check and handle checksum. */
        if (likely(mergeable) &&
            memcpy_toiovecend(vq->hdr, (unsigned char *)&headcount,
                              offsetof(typeof(hdr), num_buffers),
                              sizeof hdr.num_buffers)) {
            vq_err(vq, "Failed num_buffers write");
            vhost_discard_vq_desc(vq, headcount);
            break;
        }
        vhost_add_used_and_signal_n(&net->dev, vq, vq->heads,
                                    headcount); /* add the vring_used_elems and notify the frontend */
        if (unlikely(vq_log))
            vhost_log_write(vq, vq_log, log, vhost_len);
        total_len += vhost_len;
        if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
            vhost_poll_queue(&vq->poll); /* quota exhausted: requeue and wait; note this queues the vq's poll, so the next round runs handle_rx_kick */
            break;
        }
    }

    mutex_unlock(&vq->mutex);
}
