/*
 * vhost shadow virtqueue
 *
 * SPDX-FileCopyrightText: Red Hat, Inc. 2021
 * SPDX-FileContributor: Author: Eugenio Pérez <eperezma@redhat.com>
 *
 * SPDX-License-Identifier: GPL-2.0-or-later
 */

#include "qemu/osdep.h"
#include "hw/virtio/vhost-shadow-virtqueue.h"

#include "qemu/error-report.h"
#include "qapi/error.h"
#include "qemu/main-loop.h"
#include "qemu/log.h"
#include "qemu/memalign.h"
#include "linux-headers/linux/vhost.h"
/**
 * Validate the transport device features that both the guest can use with the
 * SVQ and the SVQ can use with the device.
 *
 * @features: The features
 * @errp: Error pointer
 */
bool vhost_svq_valid_features(uint64_t features, Error **errp)
{
    bool ok = true;
    uint64_t svq_features = features;

    for (uint64_t b = VIRTIO_TRANSPORT_F_START; b <= VIRTIO_TRANSPORT_F_END;
         ++b) {
        switch (b) {
        case VIRTIO_F_ANY_LAYOUT:
            continue;

        case VIRTIO_F_ACCESS_PLATFORM:
            /* SVQ trusts in the host's IOMMU to translate addresses */
        case VIRTIO_F_VERSION_1:
            /* SVQ trusts that the guest vring is little endian */
            if (!(svq_features & BIT_ULL(b))) {
                svq_features |= BIT_ULL(b);
                ok = false;
            }
            continue;

        default:
            if (svq_features & BIT_ULL(b)) {
                svq_features &= ~BIT_ULL(b);
                ok = false;
            }
        }
    }

    if (!ok) {
        error_setg(errp, "SVQ Invalid device feature flags, offer: 0x%"PRIx64
                         ", ok: 0x%"PRIx64, features, svq_features);
    }
    return ok;
}
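/*
 * Usage sketch (not part of this file): a vhost backend would typically
 * validate the feature set it is about to ack, e.g.
 *
 *     Error *local_err = NULL;
 *     if (!vhost_svq_valid_features(dev->host_features, &local_err)) {
 *         error_report_err(local_err);
 *     }
 *
 * Names such as dev->host_features are illustrative only.
 */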
/**
 * Number of descriptors that the SVQ can make available from the guest.
 *
 * @svq: The svq
 */
static uint16_t vhost_svq_available_slots(const VhostShadowVirtqueue *svq)
{
    return svq->vring.num - (svq->shadow_avail_idx - svq->shadow_used_idx);
}
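/*
 * Note: shadow_avail_idx and shadow_used_idx are free-running 16-bit indexes,
 * so their difference is the number of descriptors currently in flight even
 * across wrap-around, and the subtraction above yields the remaining free
 * slots.
 */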
/**
 * Translate addresses between the qemu's virtual address and the SVQ IOVA
 *
 * @svq: Shadow VirtQueue
 * @addrs: Destination to store the translated IOVA addresses
 * @iovec: Source qemu's VA addresses
 * @num: Length of iovec and minimum length of addrs
 */
static bool vhost_svq_translate_addr(const VhostShadowVirtqueue *svq,
                                     hwaddr *addrs, const struct iovec *iovec,
                                     size_t num)
{
    if (num == 0) {
        return true;
    }

    for (size_t i = 0; i < num; ++i) {
        DMAMap needle = {
            .translated_addr = (hwaddr)(uintptr_t)iovec[i].iov_base,
            .size = iovec[i].iov_len,
        };
        Int128 needle_last, map_last;
        size_t off;

        const DMAMap *map = vhost_iova_tree_find_iova(svq->iova_tree, &needle);
        /*
         * Map cannot be NULL since iova map contains all guest space and
         * qemu already has a physical address mapped
         */
        if (unlikely(!map)) {
            qemu_log_mask(LOG_GUEST_ERROR,
                          "Invalid address 0x%"HWADDR_PRIx" given by guest",
                          needle.translated_addr);
            return false;
        }

        off = needle.translated_addr - map->translated_addr;
        addrs[i] = map->iova + off;

        needle_last = int128_add(int128_make64(needle.translated_addr),
                                 int128_make64(iovec[i].iov_len));
        map_last = int128_make64(map->translated_addr + map->size);
        if (unlikely(int128_gt(needle_last, map_last))) {
            qemu_log_mask(LOG_GUEST_ERROR,
                          "Guest buffer expands over iova range");
            return false;
        }
    }

    return true;
}
static void vhost_vring_write_descs(VhostShadowVirtqueue *svq, hwaddr *sg,
                                    const struct iovec *iovec, size_t num,
                                    bool more_descs, bool write)
{
    uint16_t i = svq->free_head, last = svq->free_head;
    unsigned n;
    uint16_t flags = write ? cpu_to_le16(VRING_DESC_F_WRITE) : 0;
    vring_desc_t *descs = svq->vring.desc;

    if (num == 0) {
        return;
    }

    for (n = 0; n < num; n++) {
        if (more_descs || (n + 1 < num)) {
            descs[i].flags = flags | cpu_to_le16(VRING_DESC_F_NEXT);
        } else {
            descs[i].flags = flags;
        }
        descs[i].addr = cpu_to_le64(sg[n]);
        descs[i].len = cpu_to_le32(iovec[n].iov_len);

        last = i;
        i = cpu_to_le16(descs[i].next);
    }

    svq->free_head = le16_to_cpu(descs[last].next);
}
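/*
 * Note: the shadow descriptor table is kept as a free list threaded through
 * desc[].next (initialised in vhost_svq_start), so consuming a chain is just
 * advancing free_head to the descriptor after the last one written.
 */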
static bool vhost_svq_add_split(VhostShadowVirtqueue *svq,
                                VirtQueueElement *elem, unsigned *head)
{
    unsigned avail_idx;
    vring_avail_t *avail = svq->vring.avail;
    bool ok;
    g_autofree hwaddr *sgs = g_new(hwaddr, MAX(elem->out_num, elem->in_num));

    *head = svq->free_head;

    /* We need some descriptors here */
    if (unlikely(!elem->out_num && !elem->in_num)) {
        qemu_log_mask(LOG_GUEST_ERROR,
                      "Guest provided element with no descriptors");
        return false;
    }

    ok = vhost_svq_translate_addr(svq, sgs, elem->out_sg, elem->out_num);
    if (unlikely(!ok)) {
        return false;
    }
    vhost_vring_write_descs(svq, sgs, elem->out_sg, elem->out_num,
                            elem->in_num > 0, false);

    ok = vhost_svq_translate_addr(svq, sgs, elem->in_sg, elem->in_num);
    if (unlikely(!ok)) {
        return false;
    }
    vhost_vring_write_descs(svq, sgs, elem->in_sg, elem->in_num, false, true);

    /*
     * Put the entry in the available array (but don't update avail->idx until
     * they do sync).
     */
    avail_idx = svq->shadow_avail_idx & (svq->vring.num - 1);
    avail->ring[avail_idx] = cpu_to_le16(*head);
    svq->shadow_avail_idx++;

    /* Update the avail index after writing the descriptor */
    smp_wmb();
    avail->idx = cpu_to_le16(svq->shadow_avail_idx);

    return true;
}
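/*
 * Note: the write barrier above ensures the device observes the fully
 * written descriptors and avail ring entry before it observes the new
 * avail->idx, following the split virtqueue publication rules.
 */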
static bool vhost_svq_add(VhostShadowVirtqueue *svq, VirtQueueElement *elem)
{
    unsigned qemu_head;
    bool ok = vhost_svq_add_split(svq, elem, &qemu_head);
    if (unlikely(!ok)) {
        return false;
    }

    svq->ring_id_maps[qemu_head] = elem;
    return true;
}
static void vhost_svq_kick(VhostShadowVirtqueue *svq)
{
    /*
     * We need to expose the available array entries before checking the used
     * flags
     */
    smp_mb();
    if (svq->vring.used->flags & VRING_USED_F_NO_NOTIFY) {
        return;
    }

    event_notifier_set(&svq->hdev_kick);
}
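/*
 * Note: VRING_USED_F_NO_NOTIFY is the device's way of asking the driver (the
 * SVQ here) to skip kicks; the barrier above orders the avail ring
 * publication against the read of that flag.
 */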
/**
 * Forward available buffers.
 *
 * @svq: Shadow VirtQueue
 *
 * Note that this function does not guarantee that all guest's available
 * buffers are available to the device in SVQ avail ring. The guest may have
 * exposed a GPA / GIOVA contiguous buffer, but it may not be contiguous in
 * qemu vaddr.
 *
 * If that happens, guest's kick notifications will be disabled until the
 * device uses some buffers.
 */
static void vhost_handle_guest_kick(VhostShadowVirtqueue *svq)
{
    /* Clear event notifier */
    event_notifier_test_and_clear(&svq->svq_kick);

    /* Forward to the device as many available buffers as possible */
    do {
        virtio_queue_set_notification(svq->vq, false);

        while (true) {
            VirtQueueElement *elem;
            bool ok;

            if (svq->next_guest_avail_elem) {
                elem = g_steal_pointer(&svq->next_guest_avail_elem);
            } else {
                elem = virtqueue_pop(svq->vq, sizeof(*elem));
            }

            if (!elem) {
                break;
            }

            if (elem->out_num + elem->in_num >
                vhost_svq_available_slots(svq)) {
                /*
                 * This condition is possible since a contiguous buffer in GPA
                 * does not imply a contiguous buffer in qemu's VA
                 * scatter-gather segments. If that happens, the buffer exposed
                 * to the device needs to be a chain of descriptors at this
                 * moment.
                 *
                 * SVQ cannot hold more available buffers if we are here:
                 * queue the current guest descriptor and ignore further kicks
                 * until some elements are used.
                 */
                svq->next_guest_avail_elem = elem;
                return;
            }

            ok = vhost_svq_add(svq, elem);
            if (unlikely(!ok)) {
                /* VQ is broken, just return and ignore any other kicks */
                return;
            }
            vhost_svq_kick(svq);
        }

        virtio_queue_set_notification(svq->vq, true);
    } while (!virtio_queue_empty(svq->vq));
}
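/*
 * Note: guest notifications are disabled while the inner loop drains the
 * virtqueue and re-enabled afterwards; the outer do/while re-checks the
 * queue so a buffer made available in that window is not missed.
 */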
/**
 * Handle guest's kick.
 *
 * @n: guest kick event notifier, the one that guest set to notify svq.
 */
static void vhost_handle_guest_kick_notifier(EventNotifier *n)
{
    VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue, svq_kick);
    event_notifier_test_and_clear(n);
    vhost_handle_guest_kick(svq);
}
static bool vhost_svq_more_used(VhostShadowVirtqueue *svq)
{
    if (svq->last_used_idx != svq->shadow_used_idx) {
        return true;
    }

    svq->shadow_used_idx = cpu_to_le16(svq->vring.used->idx);

    return svq->last_used_idx != svq->shadow_used_idx;
}
/**
 * Enable vhost device calls after disabling them.
 *
 * @svq: The svq
 *
 * It returns false if there are pending used buffers from the vhost device,
 * avoiding the possible races between SVQ checking for more work and enabling
 * callbacks. True if SVQ used vring has no more pending buffers.
 */
static bool vhost_svq_enable_notification(VhostShadowVirtqueue *svq)
{
    svq->vring.avail->flags &= ~cpu_to_le16(VRING_AVAIL_F_NO_INTERRUPT);
    /* Make sure the flag is written before the read of used_idx */
    smp_mb();
    return !vhost_svq_more_used(svq);
}
static void vhost_svq_disable_notification(VhostShadowVirtqueue *svq)
{
    svq->vring.avail->flags |= cpu_to_le16(VRING_AVAIL_F_NO_INTERRUPT);
}
static VirtQueueElement *vhost_svq_get_buf(VhostShadowVirtqueue *svq,
                                           uint32_t *len)
{
    vring_desc_t *descs = svq->vring.desc;
    const vring_used_t *used = svq->vring.used;
    vring_used_elem_t used_elem;
    uint16_t last_used;

    if (!vhost_svq_more_used(svq)) {
        return NULL;
    }

    /* Only get used array entries after they have been exposed by dev */
    smp_rmb();
    last_used = svq->last_used_idx & (svq->vring.num - 1);
    used_elem.id = le32_to_cpu(used->ring[last_used].id);
    used_elem.len = le32_to_cpu(used->ring[last_used].len);

    svq->last_used_idx++;
    if (unlikely(used_elem.id >= svq->vring.num)) {
        qemu_log_mask(LOG_GUEST_ERROR, "Device %s says index %u is used",
                      svq->vdev->name, used_elem.id);
        return NULL;
    }

    if (unlikely(!svq->ring_id_maps[used_elem.id])) {
        qemu_log_mask(LOG_GUEST_ERROR,
            "Device %s says index %u is used, but it was not available",
            svq->vdev->name, used_elem.id);
        return NULL;
    }

    descs[used_elem.id].next = svq->free_head;
    svq->free_head = used_elem.id;

    *len = used_elem.len;
    return g_steal_pointer(&svq->ring_id_maps[used_elem.id]);
}
static void vhost_svq_flush(VhostShadowVirtqueue *svq,
                            bool check_for_avail_queue)
{
    VirtQueue *vq = svq->vq;

    /* Forward as many used buffers as possible. */
    do {
        unsigned i = 0;

        vhost_svq_disable_notification(svq);
        while (true) {
            uint32_t len;
            g_autofree VirtQueueElement *elem = vhost_svq_get_buf(svq, &len);
            if (!elem) {
                break;
            }

            if (unlikely(i >= svq->vring.num)) {
                qemu_log_mask(LOG_GUEST_ERROR,
                        "More than %u used buffers obtained in a %u size SVQ",
                        i, svq->vring.num);
                virtqueue_fill(vq, elem, len, i);
                virtqueue_flush(vq, i);
                return;
            }
            virtqueue_fill(vq, elem, len, i++);
        }

        virtqueue_flush(vq, i);
        event_notifier_set(&svq->svq_call);

        if (check_for_avail_queue && svq->next_guest_avail_elem) {
            /*
             * Avail ring was full when vhost_svq_flush was called, so it's a
             * good moment to make more descriptors available if possible.
             */
            vhost_handle_guest_kick(svq);
        }
    } while (!vhost_svq_enable_notification(svq));
}
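/*
 * Note: used buffers are drained with device notifications disabled, and the
 * outer loop only exits once vhost_svq_enable_notification() confirms no used
 * entry slipped in while notifications were off.
 */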
/**
 * Forward used buffers.
 *
 * @n: hdev call event notifier, the one that device set to notify svq.
 *
 * Note that we are not making any buffers available in the loop, so there is
 * no way that it runs more than virtqueue size times.
 */
static void vhost_svq_handle_call(EventNotifier *n)
{
    VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue,
                                             hdev_call);
    event_notifier_test_and_clear(n);
    vhost_svq_flush(svq, true);
}
/**
 * Set the call notifier for the SVQ to call the guest
 *
 * @svq: Shadow virtqueue
 * @call_fd: call notifier
 *
 * Called on BQL context.
 */
void vhost_svq_set_svq_call_fd(VhostShadowVirtqueue *svq, int call_fd)
{
    if (call_fd == VHOST_FILE_UNBIND) {
        /*
         * Fail event_notifier_set if called while handling a device call.
         *
         * SVQ still needs device notifications, since it needs to keep
         * forwarding used buffers even with the unbind.
         */
        memset(&svq->svq_call, 0, sizeof(svq->svq_call));
    } else {
        event_notifier_init_fd(&svq->svq_call, call_fd);
    }
}
/**
 * Get the shadow vq vring address.
 * @svq: Shadow virtqueue
 * @addr: Destination to store address
 */
void vhost_svq_get_vring_addr(const VhostShadowVirtqueue *svq,
                              struct vhost_vring_addr *addr)
{
    addr->desc_user_addr = (uint64_t)(uintptr_t)svq->vring.desc;
    addr->avail_user_addr = (uint64_t)(uintptr_t)svq->vring.avail;
    addr->used_user_addr = (uint64_t)(uintptr_t)svq->vring.used;
}
size_t vhost_svq_driver_area_size(const VhostShadowVirtqueue *svq)
{
    size_t desc_size = sizeof(vring_desc_t) * svq->vring.num;
    size_t avail_size = offsetof(vring_avail_t, ring) +
                        sizeof(uint16_t) * svq->vring.num;

    return ROUND_UP(desc_size + avail_size, qemu_real_host_page_size);
}
size_t vhost_svq_device_area_size(const VhostShadowVirtqueue *svq)
{
    size_t used_size = offsetof(vring_used_t, ring) +
                       sizeof(vring_used_elem_t) * svq->vring.num;
    return ROUND_UP(used_size, qemu_real_host_page_size);
}
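/*
 * Worked example (assuming the standard split-ring layouts, a 256-entry ring
 * and a 4KiB host page): the driver area needs 256 * 16 bytes of descriptors
 * plus 4 + 256 * 2 bytes of avail ring = 4612 bytes, rounded up to 8192; the
 * device area needs 4 + 256 * 8 = 2052 bytes, rounded up to 4096.
 */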
/**
 * Set a new file descriptor for the guest to kick the SVQ and notify for avail
 *
 * @svq: The svq
 * @svq_kick_fd: The svq kick fd
 *
 * Note that the SVQ will never close the old file descriptor.
 */
void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd)
{
    EventNotifier *svq_kick = &svq->svq_kick;
    bool poll_stop = VHOST_FILE_UNBIND != event_notifier_get_fd(svq_kick);
    bool poll_start = svq_kick_fd != VHOST_FILE_UNBIND;

    if (poll_stop) {
        event_notifier_set_handler(svq_kick, NULL);
    }

    /*
     * event_notifier_set_handler already checks for guest's notifications if
     * they arrive at the new file descriptor in the switch, so there is no
     * need to explicitly check for them.
     */
    if (poll_start) {
        event_notifier_init_fd(svq_kick, svq_kick_fd);
        event_notifier_set(svq_kick);
        event_notifier_set_handler(svq_kick, vhost_handle_guest_kick_notifier);
    }
}
/**
 * Start the shadow virtqueue operation.
 *
 * @svq: Shadow Virtqueue
 * @vdev: VirtIO device
 * @vq: Virtqueue to shadow
 */
void vhost_svq_start(VhostShadowVirtqueue *svq, VirtIODevice *vdev,
                     VirtQueue *vq)
{
    size_t desc_size, driver_size, device_size;

    svq->next_guest_avail_elem = NULL;
    svq->shadow_avail_idx = 0;
    svq->shadow_used_idx = 0;
    svq->last_used_idx = 0;
    svq->vdev = vdev;
    svq->vq = vq;

    svq->vring.num = virtio_queue_get_num(vdev, virtio_get_queue_index(vq));
    driver_size = vhost_svq_driver_area_size(svq);
    device_size = vhost_svq_device_area_size(svq);
    svq->vring.desc = qemu_memalign(qemu_real_host_page_size, driver_size);
    desc_size = sizeof(vring_desc_t) * svq->vring.num;
    svq->vring.avail = (void *)((char *)svq->vring.desc + desc_size);
    memset(svq->vring.desc, 0, driver_size);
    svq->vring.used = qemu_memalign(qemu_real_host_page_size, device_size);
    memset(svq->vring.used, 0, device_size);
    svq->ring_id_maps = g_new0(VirtQueueElement *, svq->vring.num);
    for (unsigned i = 0; i < svq->vring.num - 1; i++) {
        svq->vring.desc[i].next = cpu_to_le16(i + 1);
    }
}
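/*
 * Note: the driver area is a single page-aligned allocation holding the
 * descriptor table immediately followed by the avail ring, while the used
 * ring lives in its own page-aligned device area; this matches the layout
 * reported by vhost_svq_get_vring_addr().
 */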
/**
 * Stop the shadow virtqueue operation.
 * @svq: Shadow Virtqueue
 */
void vhost_svq_stop(VhostShadowVirtqueue *svq)
{
    event_notifier_set_handler(&svq->svq_kick, NULL);
    g_autofree VirtQueueElement *next_avail_elem = NULL;

    if (!svq->vq) {
        return;
    }

    /* Send all pending used descriptors to guest */
    vhost_svq_flush(svq, false);

    for (unsigned i = 0; i < svq->vring.num; ++i) {
        g_autofree VirtQueueElement *elem = NULL;
        elem = g_steal_pointer(&svq->ring_id_maps[i]);
        if (elem) {
            virtqueue_detach_element(svq->vq, elem, 0);
        }
    }

    next_avail_elem = g_steal_pointer(&svq->next_guest_avail_elem);
    if (next_avail_elem) {
        virtqueue_detach_element(svq->vq, next_avail_elem, 0);
    }
    svq->vq = NULL;
    g_free(svq->ring_id_maps);
    qemu_vfree(svq->vring.desc);
    qemu_vfree(svq->vring.used);
}
/**
 * Creates vhost shadow virtqueue, and instructs the vhost device to use the
 * shadow methods and file descriptors.
 *
 * @iova_tree: Tree to perform descriptor translations
 *
 * Returns the new virtqueue or NULL.
 *
 * In case of error, the reason is reported through error_report.
 */
VhostShadowVirtqueue *vhost_svq_new(VhostIOVATree *iova_tree)
{
    g_autofree VhostShadowVirtqueue *svq = g_new0(VhostShadowVirtqueue, 1);
    int r;

    r = event_notifier_init(&svq->hdev_kick, 0);
    if (r != 0) {
        error_report("Couldn't create kick event notifier: %s (%d)",
                     g_strerror(errno), errno);
        goto err_init_hdev_kick;
    }

    r = event_notifier_init(&svq->hdev_call, 0);
    if (r != 0) {
        error_report("Couldn't create call event notifier: %s (%d)",
                     g_strerror(errno), errno);
        goto err_init_hdev_call;
    }

    event_notifier_init_fd(&svq->svq_kick, VHOST_FILE_UNBIND);
    event_notifier_set_handler(&svq->hdev_call, vhost_svq_handle_call);
    svq->iova_tree = iova_tree;
    return g_steal_pointer(&svq);

err_init_hdev_call:
    event_notifier_cleanup(&svq->hdev_kick);

err_init_hdev_kick:
    return NULL;
}
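/*
 * Usage sketch (not part of this file): a caller such as a vhost backend
 * would create the SVQ, wire up the guest file descriptors and start
 * shadowing, roughly:
 *
 *     VhostShadowVirtqueue *svq = vhost_svq_new(iova_tree);
 *     vhost_svq_set_svq_kick_fd(svq, guest_kick_fd);
 *     vhost_svq_set_svq_call_fd(svq, guest_call_fd);
 *     vhost_svq_start(svq, vdev, vq);
 *     ...
 *     vhost_svq_stop(svq);
 *     vhost_svq_free(svq);
 *
 * The fd and pointer names are illustrative only.
 */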
/**
 * Free the resources of the shadow virtqueue.
 *
 * @pvq: gpointer to SVQ so it can be used by autofree functions.
 */
void vhost_svq_free(gpointer pvq)
{
    VhostShadowVirtqueue *vq = pvq;
    vhost_svq_stop(vq);
    event_notifier_cleanup(&vq->hdev_kick);
    event_notifier_set_handler(&vq->hdev_call, NULL);
    event_notifier_cleanup(&vq->hdev_call);
    g_free(vq);
}