hw/virtio/vhost.c

   1 /*
   2  * vhost support
   3  *
   4  * Copyright Red Hat, Inc. 2010
   5  *
   6  * Authors:
   7  *  Michael S. Tsirkin <mst@redhat.com>
   8  *
   9  * This work is licensed under the terms of the GNU GPL, version 2.  See
  10  * the COPYING file in the top-level directory.
  11  *
  12  * Contributions after 2012-01-13 are licensed under the terms of the
  13  * GNU GPL, version 2 or (at your option) any later version.
  14  */
  15
  16 #include "qemu/osdep.h"
  17 #include "qapi/error.h"
  18 #include "hw/virtio/vhost.h"
  19 #include "qemu/atomic.h"
  20 #include "qemu/range.h"
  21 #include "qemu/error-report.h"
  22 #include "qemu/memfd.h"
  23 #include "qemu/log.h"
  24 #include "standard-headers/linux/vhost_types.h"
  25 #include "hw/virtio/virtio-bus.h"
  26 #include "hw/mem/memory-device.h"
  27 #include "migration/blocker.h"
  28 #include "migration/qemu-file-types.h"
  29 #include "sysemu/dma.h"
  30 #include "trace.h"
  31
/*
 * Debug reporting for failed backend operations.
 * Kept enabled until the disconnected-backend handling stabilizes.
 */
#define _VHOST_DEBUG 1

#ifdef _VHOST_DEBUG
/*
 * Report a failed vhost operation through error_report().
 *
 * @retval is negated before being passed to strerror() and printed, so it
 * is expected to be a negative errno value. @fmt and the optional variadic
 * arguments form the message that precedes ": <strerror> (<errno>)".
 */
#define VHOST_OPS_DEBUG(retval, fmt, ...) \
    do { \
        error_report(fmt ": %s (%d)", ## __VA_ARGS__, \
                     strerror(-retval), -retval); \
    } while (0)
#else
/* Debugging disabled: the macro expands to an empty statement. */
#define VHOST_OPS_DEBUG(retval, fmt, ...) \
    do { } while (0)
#endif
  45
/*
 * Dirty-log state, kept separately for every backend type (the arrays are
 * indexed by VhostBackendType):
 *  - vhost_log:      the current log allocated from the heap;
 *  - vhost_log_shm:  the current log allocated through qemu_memfd_alloc();
 *  - vhost_log_devs: devices registered for logging; the list head is the
 *                    device that scans the memory regions.
 */
static struct vhost_log *vhost_log[VHOST_BACKEND_TYPE_MAX];
static struct vhost_log *vhost_log_shm[VHOST_BACKEND_TYPE_MAX];
static QLIST_HEAD(, vhost_dev) vhost_log_devs[VHOST_BACKEND_TYPE_MAX];

/* Memslots used by backends that support private memslots (without an fd). */
static unsigned int used_memslots;

/* Memslots used by backends that only support shared memslots (with an fd). */
static unsigned int used_shared_memslots;

/* All vhost devices consulted when computing memslot limits. */
static QLIST_HEAD(, vhost_dev) vhost_devices =
    QLIST_HEAD_INITIALIZER(vhost_devices);
  58
  59 unsigned int vhost_get_max_memslots(void)
  60 {
  61     unsigned int max = UINT_MAX;
  62     struct vhost_dev *hdev;
  63
  64     QLIST_FOREACH(hdev, &vhost_devices, entry) {
  65         max = MIN(max, hdev->vhost_ops->vhost_backend_memslots_limit(hdev));
  66     }
  67     return max;
  68 }
  69
  70 unsigned int vhost_get_free_memslots(void)
  71 {
  72     unsigned int free = UINT_MAX;
  73     struct vhost_dev *hdev;
  74
  75     QLIST_FOREACH(hdev, &vhost_devices, entry) {
  76         unsigned int r = hdev->vhost_ops->vhost_backend_memslots_limit(hdev);
  77         unsigned int cur_free;
  78
  79         if (hdev->vhost_ops->vhost_backend_no_private_memslots &&
  80             hdev->vhost_ops->vhost_backend_no_private_memslots(hdev)) {
  81             cur_free = r - used_shared_memslots;
  82         } else {
  83             cur_free = r - used_memslots;
  84         }
  85         free = MIN(free, cur_free);
  86     }
  87     return free;
  88 }
  89
  90 static void vhost_dev_sync_region(struct vhost_dev *dev,
  91                                   MemoryRegionSection *section,
  92                                   uint64_t mfirst, uint64_t mlast,
  93                                   uint64_t rfirst, uint64_t rlast)
  94 {
  95     vhost_log_chunk_t *dev_log = dev->log->log;
  96
  97     uint64_t start = MAX(mfirst, rfirst);
  98     uint64_t end = MIN(mlast, rlast);
  99     vhost_log_chunk_t *from = dev_log + start / VHOST_LOG_CHUNK;
 100     vhost_log_chunk_t *to = dev_log + end / VHOST_LOG_CHUNK + 1;
 101     uint64_t addr = QEMU_ALIGN_DOWN(start, VHOST_LOG_CHUNK);
 102
 103     if (end < start) {
 104         return;
 105     }
 106     assert(end / VHOST_LOG_CHUNK < dev->log_size);
 107     assert(start / VHOST_LOG_CHUNK < dev->log_size);
 108
 109     for (;from < to; ++from) {
 110         vhost_log_chunk_t log;
 111         /* We first check with non-atomic: much cheaper,
 112          * and we expect non-dirty to be the common case. */
 113         if (!*from) {
 114             addr += VHOST_LOG_CHUNK;
 115             continue;
 116         }
 117         /* Data must be read atomically. We don't really need barrier semantics
 118          * but it's easier to use atomic_* than roll our own. */
 119         log = qatomic_xchg(from, 0);
 120         while (log) {
 121             int bit = ctzl(log);
 122             hwaddr page_addr;
 123             hwaddr section_offset;
 124             hwaddr mr_offset;
 125             page_addr = addr + bit * VHOST_LOG_PAGE;
 126             section_offset = page_addr - section->offset_within_address_space;
 127             mr_offset = section_offset + section->offset_within_region;
 128             memory_region_set_dirty(section->mr, mr_offset, VHOST_LOG_PAGE);
 129             log &= ~(0x1ull << bit);
 130         }
 131         addr += VHOST_LOG_CHUNK;
 132     }
 133 }
 134
 135 bool vhost_dev_has_iommu(struct vhost_dev *dev)
 136 {
 137     VirtIODevice *vdev = dev->vdev;
 138
 139     /*
 140      * For vhost, VIRTIO_F_IOMMU_PLATFORM means the backend support
 141      * incremental memory mapping API via IOTLB API. For platform that
 142      * does not have IOMMU, there's no need to enable this feature
 143      * which may cause unnecessary IOTLB miss/update transactions.
 144      */
 145     if (vdev) {
 146         return virtio_bus_device_iommu_enabled(vdev) &&
 147             virtio_host_has_feature(vdev, VIRTIO_F_IOMMU_PLATFORM);
 148     } else {
 149         return false;
 150     }
 151 }
 152
 153 static inline bool vhost_dev_should_log(struct vhost_dev *dev)
 154 {
 155     assert(dev->vhost_ops);
 156     assert(dev->vhost_ops->backend_type > VHOST_BACKEND_TYPE_NONE);
 157     assert(dev->vhost_ops->backend_type < VHOST_BACKEND_TYPE_MAX);
 158
 159     return dev == QLIST_FIRST(&vhost_log_devs[dev->vhost_ops->backend_type]);
 160 }
 161
 162 static inline void vhost_dev_elect_mem_logger(struct vhost_dev *hdev, bool add)
 163 {
 164     VhostBackendType backend_type;
 165
 166     assert(hdev->vhost_ops);
 167
 168     backend_type = hdev->vhost_ops->backend_type;
 169     assert(backend_type > VHOST_BACKEND_TYPE_NONE);
 170     assert(backend_type < VHOST_BACKEND_TYPE_MAX);
 171
 172     if (add && !QLIST_IS_INSERTED(hdev, logdev_entry)) {
 173         if (QLIST_EMPTY(&vhost_log_devs[backend_type])) {
 174             QLIST_INSERT_HEAD(&vhost_log_devs[backend_type],
 175                               hdev, logdev_entry);
 176         } else {
 177             /*
 178              * The first vhost_device in the list is selected as the shared
 179              * logger to scan memory sections. Put new entry next to the head
 180              * to avoid inadvertent change to the underlying logger device.
 181              * This is done in order to get better cache locality and to avoid
 182              * performance churn on the hot path for log scanning. Even when
 183              * new devices come and go quickly, it wouldn't end up changing
 184              * the active leading logger device at all.
 185              */
 186             QLIST_INSERT_AFTER(QLIST_FIRST(&vhost_log_devs[backend_type]),
 187                                hdev, logdev_entry);
 188         }
 189     } else if (!add && QLIST_IS_INSERTED(hdev, logdev_entry)) {
 190         QLIST_REMOVE(hdev, logdev_entry);
 191     }
 192 }
 193
 194 static int vhost_sync_dirty_bitmap(struct vhost_dev *dev,
 195                                    MemoryRegionSection *section,
 196                                    hwaddr first,
 197                                    hwaddr last)
 198 {
 199     int i;
 200     hwaddr start_addr;
 201     hwaddr end_addr;
 202
 203     if (!dev->log_enabled || !dev->started) {
 204         return 0;
 205     }
 206     start_addr = section->offset_within_address_space;
 207     end_addr = range_get_last(start_addr, int128_get64(section->size));
 208     start_addr = MAX(first, start_addr);
 209     end_addr = MIN(last, end_addr);
 210
 211     if (vhost_dev_should_log(dev)) {
 212         for (i = 0; i < dev->mem->nregions; ++i) {
 213             struct vhost_memory_region *reg = dev->mem->regions + i;
 214             vhost_dev_sync_region(dev, section, start_addr, end_addr,
 215                                   reg->guest_phys_addr,
 216                                   range_get_last(reg->guest_phys_addr,
 217                                                  reg->memory_size));
 218         }
 219     }
 220     for (i = 0; i < dev->nvqs; ++i) {
 221         struct vhost_virtqueue *vq = dev->vqs + i;
 222
 223         if (!vq->used_phys && !vq->used_size) {
 224             continue;
 225         }
 226
 227         if (vhost_dev_has_iommu(dev)) {
 228             IOMMUTLBEntry iotlb;
 229             hwaddr used_phys = vq->used_phys, used_size = vq->used_size;
 230             hwaddr phys, s, offset;
 231
 232             while (used_size) {
 233                 rcu_read_lock();
 234                 iotlb = address_space_get_iotlb_entry(dev->vdev->dma_as,
 235                                                       used_phys,
 236                                                       true,
 237                                                       MEMTXATTRS_UNSPECIFIED);
 238                 rcu_read_unlock();
 239
 240                 if (!iotlb.target_as) {
 241                     qemu_log_mask(LOG_GUEST_ERROR, "translation "
 242                                   "failure for used_iova %"PRIx64"\n",
 243                                   used_phys);
 244                     return -EINVAL;
 245                 }
 246
 247                 offset = used_phys & iotlb.addr_mask;
 248                 phys = iotlb.translated_addr + offset;
 249
 250                 /*
 251                  * Distance from start of used ring until last byte of
 252                  * IOMMU page.
 253                  */
 254                 s = iotlb.addr_mask - offset;
 255                 /*
 256                  * Size of used ring, or of the part of it until end
 257                  * of IOMMU page. To avoid zero result, do the adding
 258                  * outside of MIN().
 259                  */
 260                 s = MIN(s, used_size - 1) + 1;
 261
 262                 vhost_dev_sync_region(dev, section, start_addr, end_addr, phys,
 263                                       range_get_last(phys, s));
 264                 used_size -= s;
 265                 used_phys += s;
 266             }
 267         } else {
 268             vhost_dev_sync_region(dev, section, start_addr,
 269                                   end_addr, vq->used_phys,
 270                                   range_get_last(vq->used_phys, vq->used_size));
 271         }
 272     }
 273     return 0;
 274 }
 275
 276 static void vhost_log_sync(MemoryListener *listener,
 277                           MemoryRegionSection *section)
 278 {
 279     struct vhost_dev *dev = container_of(listener, struct vhost_dev,
 280                                          memory_listener);
 281     vhost_sync_dirty_bitmap(dev, section, 0x0, ~0x0ULL);
 282 }
 283
 284 static void vhost_log_sync_range(struct vhost_dev *dev,
 285                                  hwaddr first, hwaddr last)
 286 {
 287     int i;
 288     /* FIXME: this is N^2 in number of sections */
 289     for (i = 0; i < dev->n_mem_sections; ++i) {
 290         MemoryRegionSection *section = &dev->mem_sections[i];
 291         vhost_sync_dirty_bitmap(dev, section, first, last);
 292     }
 293 }
 294
 295 static uint64_t vhost_get_log_size(struct vhost_dev *dev)
 296 {
 297     uint64_t log_size = 0;
 298     int i;
 299     for (i = 0; i < dev->mem->nregions; ++i) {
 300         struct vhost_memory_region *reg = dev->mem->regions + i;
 301         uint64_t last = range_get_last(reg->guest_phys_addr,
 302                                        reg->memory_size);
 303         log_size = MAX(log_size, last / VHOST_LOG_CHUNK + 1);
 304     }
 305     return log_size;
 306 }
 307
 308 static int vhost_set_backend_type(struct vhost_dev *dev,
 309                                   VhostBackendType backend_type)
 310 {
 311     int r = 0;
 312
 313     switch (backend_type) {
 314 #ifdef CONFIG_VHOST_KERNEL
 315     case VHOST_BACKEND_TYPE_KERNEL:
 316         dev->vhost_ops = &kernel_ops;
 317         break;
 318 #endif
 319 #ifdef CONFIG_VHOST_USER
 320     case VHOST_BACKEND_TYPE_USER:
 321         dev->vhost_ops = &user_ops;
 322         break;
 323 #endif
 324 #ifdef CONFIG_VHOST_VDPA
 325     case VHOST_BACKEND_TYPE_VDPA:
 326         dev->vhost_ops = &vdpa_ops;
 327         break;
 328 #endif
 329     default:
 330         error_report("Unknown vhost backend type");
 331         r = -1;
 332     }
 333
 334     if (r == 0) {
 335         assert(dev->vhost_ops->backend_type == backend_type);
 336     }
 337
 338     return r;
 339 }
 340
 341 static struct vhost_log *vhost_log_alloc(uint64_t size, bool share)
 342 {
 343     Error *err = NULL;
 344     struct vhost_log *log;
 345     uint64_t logsize = size * sizeof(*(log->log));
 346     int fd = -1;
 347
 348     log = g_new0(struct vhost_log, 1);
 349     if (share) {
 350         log->log = qemu_memfd_alloc("vhost-log", logsize,
 351                                     F_SEAL_GROW | F_SEAL_SHRINK | F_SEAL_SEAL,
 352                                     &fd, &err);
 353         if (err) {
 354             error_report_err(err);
 355             g_free(log);
 356             return NULL;
 357         }
 358         memset(log->log, 0, logsize);
 359     } else {
 360         log->log = g_malloc0(logsize);
 361     }
 362
 363     log->size = size;
 364     log->refcnt = 1;
 365     log->fd = fd;
 366
 367     return log;
 368 }
 369
 370 static struct vhost_log *vhost_log_get(VhostBackendType backend_type,
 371                                        uint64_t size, bool share)
 372 {
 373     struct vhost_log *log;
 374
 375     assert(backend_type > VHOST_BACKEND_TYPE_NONE);
 376     assert(backend_type < VHOST_BACKEND_TYPE_MAX);
 377
 378     log = share ? vhost_log_shm[backend_type] : vhost_log[backend_type];
 379
 380     if (!log || log->size != size) {
 381         log = vhost_log_alloc(size, share);
 382         if (share) {
 383             vhost_log_shm[backend_type] = log;
 384         } else {
 385             vhost_log[backend_type] = log;
 386         }
 387     } else {
 388         ++log->refcnt;
 389     }
 390
 391     return log;
 392 }
 393
 394 static void vhost_log_put(struct vhost_dev *dev, bool sync)
 395 {
 396     struct vhost_log *log = dev->log;
 397     VhostBackendType backend_type;
 398
 399     if (!log) {
 400         return;
 401     }
 402
 403     assert(dev->vhost_ops);
 404     backend_type = dev->vhost_ops->backend_type;
 405
 406     if (backend_type == VHOST_BACKEND_TYPE_NONE ||
 407         backend_type >= VHOST_BACKEND_TYPE_MAX) {
 408         return;
 409     }
 410
 411     --log->refcnt;
 412     if (log->refcnt == 0) {
 413         /* Sync only the range covered by the old log */
 414         if (dev->log_size && sync) {
 415             vhost_log_sync_range(dev, 0, dev->log_size * VHOST_LOG_CHUNK - 1);
 416         }
 417
 418         if (vhost_log[backend_type] == log) {
 419             g_free(log->log);
 420             vhost_log[backend_type] = NULL;
 421         } else if (vhost_log_shm[backend_type] == log) {
 422             qemu_memfd_free(log->log, log->size * sizeof(*(log->log)),
 423                             log->fd);
 424             vhost_log_shm[backend_type] = NULL;
 425         }
 426
 427         g_free(log);
 428     }
 429
 430     vhost_dev_elect_mem_logger(dev, false);
 431     dev->log = NULL;
 432     dev->log_size = 0;
 433 }
 434
 435 static bool vhost_dev_log_is_shared(struct vhost_dev *dev)
 436 {
 437     return dev->vhost_ops->vhost_requires_shm_log &&
 438            dev->vhost_ops->vhost_requires_shm_log(dev);
 439 }
 440
 441 static inline void vhost_dev_log_resize(struct vhost_dev *dev, uint64_t size)
 442 {
 443     struct vhost_log *log = vhost_log_get(dev->vhost_ops->backend_type,
 444                                           size, vhost_dev_log_is_shared(dev));
 445     uint64_t log_base = (uintptr_t)log->log;
 446     int r;
 447
 448     /* inform backend of log switching, this must be done before
 449        releasing the current log, to ensure no logging is lost */
 450     r = dev->vhost_ops->vhost_set_log_base(dev, log_base, log);
 451     if (r < 0) {
 452         VHOST_OPS_DEBUG(r, "vhost_set_log_base failed");
 453     }
 454
 455     vhost_log_put(dev, true);
 456     dev->log = log;
 457     dev->log_size = size;
 458 }
 459
 460 static void *vhost_memory_map(struct vhost_dev *dev, hwaddr addr,
 461                               hwaddr *plen, bool is_write)
 462 {
 463     if (!vhost_dev_has_iommu(dev)) {
 464         return cpu_physical_memory_map(addr, plen, is_write);
 465     } else {
 466         return (void *)(uintptr_t)addr;
 467     }
 468 }
 469
 470 static void vhost_memory_unmap(struct vhost_dev *dev, void *buffer,
 471                                hwaddr len, int is_write,
 472                                hwaddr access_len)
 473 {
 474     if (!vhost_dev_has_iommu(dev)) {
 475         cpu_physical_memory_unmap(buffer, len, is_write, access_len);
 476     }
 477 }
 478
 479 static int vhost_verify_ring_part_mapping(void *ring_hva,
 480                                           uint64_t ring_gpa,
 481                                           uint64_t ring_size,
 482                                           void *reg_hva,
 483                                           uint64_t reg_gpa,
 484                                           uint64_t reg_size)
 485 {
 486     uint64_t hva_ring_offset;
 487     uint64_t ring_last = range_get_last(ring_gpa, ring_size);
 488     uint64_t reg_last = range_get_last(reg_gpa, reg_size);
 489
 490     if (ring_last < reg_gpa || ring_gpa > reg_last) {
 491         return 0;
 492     }
 493     /* check that whole ring's is mapped */
 494     if (ring_last > reg_last) {
 495         return -ENOMEM;
 496     }
 497     /* check that ring's MemoryRegion wasn't replaced */
 498     hva_ring_offset = ring_gpa - reg_gpa;
 499     if (ring_hva != reg_hva + hva_ring_offset) {
 500         return -EBUSY;
 501     }
 502
 503     return 0;
 504 }
 505
 506 static int vhost_verify_ring_mappings(struct vhost_dev *dev,
 507                                       void *reg_hva,
 508                                       uint64_t reg_gpa,
 509                                       uint64_t reg_size)
 510 {
 511     int i, j;
 512     int r = 0;
 513     const char *part_name[] = {
 514         "descriptor table",
 515         "available ring",
 516         "used ring"
 517     };
 518
 519     if (vhost_dev_has_iommu(dev)) {
 520         return 0;
 521     }
 522
 523     for (i = 0; i < dev->nvqs; ++i) {
 524         struct vhost_virtqueue *vq = dev->vqs + i;
 525
 526         if (vq->desc_phys == 0) {
 527             continue;
 528         }
 529
 530         j = 0;
 531         r = vhost_verify_ring_part_mapping(
 532                 vq->desc, vq->desc_phys, vq->desc_size,
 533                 reg_hva, reg_gpa, reg_size);
 534         if (r) {
 535             break;
 536         }
 537
 538         j++;
 539         r = vhost_verify_ring_part_mapping(
 540                 vq->avail, vq->avail_phys, vq->avail_size,
 541                 reg_hva, reg_gpa, reg_size);
 542         if (r) {
 543             break;
 544         }
 545
 546         j++;
 547         r = vhost_verify_ring_part_mapping(
 548                 vq->used, vq->used_phys, vq->used_size,
 549                 reg_hva, reg_gpa, reg_size);
 550         if (r) {
 551             break;
 552         }
 553     }
 554
 555     if (r == -ENOMEM) {
 556         error_report("Unable to map %s for ring %d", part_name[j], i);
 557     } else if (r == -EBUSY) {
 558         error_report("%s relocated for ring %d", part_name[j], i);
 559     }
 560     return r;
 561 }
 562
 563 /*
 564  * vhost_section: identify sections needed for vhost access
 565  *
 566  * We only care about RAM sections here (where virtqueue and guest
 567  * internals accessed by virtio might live).
 568  */
 569 static bool vhost_section(struct vhost_dev *dev, MemoryRegionSection *section)
 570 {
 571     MemoryRegion *mr = section->mr;
 572
 573     if (memory_region_is_ram(mr) && !memory_region_is_rom(mr)) {
 574         uint8_t dirty_mask = memory_region_get_dirty_log_mask(mr);
 575         uint8_t handled_dirty;
 576
 577         /*
 578          * Kernel based vhost doesn't handle any block which is doing
 579          * dirty-tracking other than migration for which it has
 580          * specific logging support. However for TCG the kernel never
 581          * gets involved anyway so we can also ignore it's
 582          * self-modiying code detection flags. However a vhost-user
 583          * client could still confuse a TCG guest if it re-writes
 584          * executable memory that has already been translated.
 585          */
 586         handled_dirty = (1 << DIRTY_MEMORY_MIGRATION) |
 587             (1 << DIRTY_MEMORY_CODE);
 588
 589         if (dirty_mask & ~handled_dirty) {
 590             trace_vhost_reject_section(mr->name, 1);
 591             return false;
 592         }
 593
 594         /*
 595          * Some backends (like vhost-user) can only handle memory regions
 596          * that have an fd (can be mapped into a different process). Filter
 597          * the ones without an fd out, if requested.
 598          *
 599          * TODO: we might have to limit to MAP_SHARED as well.
 600          */
 601         if (memory_region_get_fd(section->mr) < 0 &&
 602             dev->vhost_ops->vhost_backend_no_private_memslots &&
 603             dev->vhost_ops->vhost_backend_no_private_memslots(dev)) {
 604             trace_vhost_reject_section(mr->name, 2);
 605             return false;
 606         }
 607
 608         trace_vhost_section(mr->name);
 609         return true;
 610     } else {
 611         trace_vhost_reject_section(mr->name, 3);
 612         return false;
 613     }
 614 }
 615
 616 static void vhost_begin(MemoryListener *listener)
 617 {
 618     struct vhost_dev *dev = container_of(listener, struct vhost_dev,
 619                                          memory_listener);
 620     dev->tmp_sections = NULL;
 621     dev->n_tmp_sections = 0;
 622 }
 623
/*
 * MemoryListener commit hook: make the section list collected during the
 * transaction (tmp_sections) the device's current list.
 *
 * If the new list differs from the previous one, dev->mem is rebuilt from
 * it and the matching memslot usage counter is updated. For a started
 * device the ring mappings are then verified against every region (a
 * failure aborts), the new table is pushed to the backend, and the dirty
 * log is resized around that update when logging is enabled.
 *
 * In all cases the references held on the previous section list are
 * dropped and the list is freed before returning.
 */
static void vhost_commit(MemoryListener *listener)
{
    struct vhost_dev *dev = container_of(listener, struct vhost_dev,
                                         memory_listener);
    MemoryRegionSection *old_sections;
    int n_old_sections;
    uint64_t log_size;
    size_t regions_size;
    int r;
    int i;
    bool changed = false;

    /* Note we can be called before the device is started, but then
     * starting the device calls set_mem_table, so we need to have
     * built the data structures.
     */
    old_sections = dev->mem_sections;
    n_old_sections = dev->n_mem_sections;
    dev->mem_sections = dev->tmp_sections;
    dev->n_mem_sections = dev->n_tmp_sections;

    if (dev->n_mem_sections != n_old_sections) {
        changed = true;
    } else {
        /* Same size, lets check the contents */
        for (i = 0; i < n_old_sections; i++) {
            if (!MemoryRegionSection_eq(&old_sections[i],
                                        &dev->mem_sections[i])) {
                changed = true;
                break;
            }
        }
    }

    trace_vhost_commit(dev->started, changed);
    if (!changed) {
        goto out;
    }

    /* Rebuild the regions list from the new sections list */
    regions_size = offsetof(struct vhost_memory, regions) +
                       dev->n_mem_sections * sizeof dev->mem->regions[0];
    dev->mem = g_realloc(dev->mem, regions_size);
    dev->mem->nregions = dev->n_mem_sections;

    /* Record the slot usage in the counter matching this backend's kind. */
    if (dev->vhost_ops->vhost_backend_no_private_memslots &&
        dev->vhost_ops->vhost_backend_no_private_memslots(dev)) {
        used_shared_memslots = dev->mem->nregions;
    } else {
        used_memslots = dev->mem->nregions;
    }

    /* One vhost memory region per section, in the same order. */
    for (i = 0; i < dev->n_mem_sections; i++) {
        struct vhost_memory_region *cur_vmr = dev->mem->regions + i;
        struct MemoryRegionSection *mrs = dev->mem_sections + i;

        cur_vmr->guest_phys_addr = mrs->offset_within_address_space;
        cur_vmr->memory_size     = int128_get64(mrs->size);
        cur_vmr->userspace_addr  =
            (uintptr_t)memory_region_get_ram_ptr(mrs->mr) +
            mrs->offset_within_region;
        cur_vmr->flags_padding   = 0;
    }

    /* A stopped device only needs the data structures built above. */
    if (!dev->started) {
        goto out;
    }

    /* The rings of a running device must still be mapped where expected. */
    for (i = 0; i < dev->mem->nregions; i++) {
        if (vhost_verify_ring_mappings(dev,
                       (void *)(uintptr_t)dev->mem->regions[i].userspace_addr,
                       dev->mem->regions[i].guest_phys_addr,
                       dev->mem->regions[i].memory_size)) {
            error_report("Verify ring failure on region %d", i);
            abort();
        }
    }

    /* Without logging there is no log to resize: just push the table. */
    if (!dev->log_enabled) {
        r = dev->vhost_ops->vhost_set_mem_table(dev, dev->mem);
        if (r < 0) {
            VHOST_OPS_DEBUG(r, "vhost_set_mem_table failed");
        }
        goto out;
    }
    log_size = vhost_get_log_size(dev);
    /* We allocate some extra room in the log,
     * to reduce the number of reallocations.
     * NOTE(review): the divisor below is the size of *dev->log (the log
     * descriptor), not of a log chunk - confirm the intended unit. */
#define VHOST_LOG_BUFFER (0x1000 / sizeof *dev->log)
    /* To log more, must increase log size before table update. */
    if (dev->log_size < log_size) {
        vhost_dev_log_resize(dev, log_size + VHOST_LOG_BUFFER);
    }
    r = dev->vhost_ops->vhost_set_mem_table(dev, dev->mem);
    if (r < 0) {
        VHOST_OPS_DEBUG(r, "vhost_set_mem_table failed");
    }
    /* To log less, can only decrease log size after table update. */
    if (dev->log_size > log_size + VHOST_LOG_BUFFER) {
        vhost_dev_log_resize(dev, log_size);
    }

out:
    /* Deref the old list of sections, this must happen _after_ the
     * vhost_set_mem_table to ensure the client isn't still using the
     * section we're about to unref.
     */
    while (n_old_sections--) {
        memory_region_unref(old_sections[n_old_sections].mr);
    }
    g_free(old_sections);
    return;
}
 737
/* Adds the section data to the tmp_section structure.
 * It relies on the listener calling us in memory address order
 * and for each region (via the _add and _nop methods) to
 * join neighbours.
 *
 * For vhost-user backends the section is first widened to the page size
 * of its RAM block. A section that continues the previously added one
 * (same MemoryRegion, matching host addresses, neither marked
 * unmergeable) is merged into it; otherwise it is appended as a new entry
 * and a reference is taken on its MemoryRegion. Inconsistent overlaps are
 * reported and the section is dropped.
 */
static void vhost_region_add_section(struct vhost_dev *dev,
                                     MemoryRegionSection *section)
{
    bool need_add = true;
    uint64_t mrs_size = int128_get64(section->size);
    uint64_t mrs_gpa = section->offset_within_address_space;
    uintptr_t mrs_host = (uintptr_t)memory_region_get_ram_ptr(section->mr) +
                         section->offset_within_region;
    RAMBlock *mrs_rb = section->mr->ram_block;

    trace_vhost_region_add_section(section->mr->name, mrs_gpa, mrs_size,
                                   mrs_host);

    if (dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_USER) {
        /* Round the section to its page size */
        /* First align the start down to a page boundary */
        size_t mrs_page = qemu_ram_pagesize(mrs_rb);
        uint64_t alignage = mrs_host & (mrs_page - 1);
        if (alignage) {
            mrs_host -= alignage;
            mrs_size += alignage;
            mrs_gpa  -= alignage;
        }
        /* Now align the size up to a page boundary */
        alignage = mrs_size & (mrs_page - 1);
        if (alignage) {
            mrs_size += mrs_page - alignage;
        }
        trace_vhost_region_add_section_aligned(section->mr->name, mrs_gpa,
                                               mrs_size, mrs_host);
    }

    if (dev->n_tmp_sections && !section->unmergeable) {
        /* Since we already have at least one section, lets see if
         * this extends it; since we're scanning in order, we only
         * have to look at the last one, and the FlatView that calls
         * us shouldn't have overlaps.
         */
        MemoryRegionSection *prev_sec = dev->tmp_sections +
                                               (dev->n_tmp_sections - 1);
        uint64_t prev_gpa_start = prev_sec->offset_within_address_space;
        uint64_t prev_size = int128_get64(prev_sec->size);
        uint64_t prev_gpa_end   = range_get_last(prev_gpa_start, prev_size);
        uint64_t prev_host_start =
                        (uintptr_t)memory_region_get_ram_ptr(prev_sec->mr) +
                        prev_sec->offset_within_region;
        uint64_t prev_host_end   = range_get_last(prev_host_start, prev_size);

        /* True when the new section touches or overlaps the previous one. */
        if (mrs_gpa <= (prev_gpa_end + 1)) {
            /* OK, looks like overlapping/intersecting - it's possible that
             * the rounding to page sizes has made them overlap, but they should
             * match up in the same RAMBlock if they do.
             */
            if (mrs_gpa < prev_gpa_start) {
                error_report("%s:Section '%s' rounded to %"PRIx64
                             " prior to previous '%s' %"PRIx64,
                             __func__, section->mr->name, mrs_gpa,
                             prev_sec->mr->name, prev_gpa_start);
                /* A way to cleanly fail here would be better */
                return;
            }
            /* Offset from the start of the previous GPA to this GPA */
            size_t offset = mrs_gpa - prev_gpa_start;

            if (prev_host_start + offset == mrs_host &&
                section->mr == prev_sec->mr && !prev_sec->unmergeable) {
                /*
                 * NOTE(review): prev_host_end presumably is the last byte
                 * of the previous section (inclusive) while
                 * mrs_host + mrs_size is one past the end of the new one;
                 * confirm the mixed bounds are intended.
                 */
                uint64_t max_end = MAX(prev_host_end, mrs_host + mrs_size);
                need_add = false;
                /* Grow the previous entry to cover both sections. */
                prev_sec->offset_within_address_space =
                    MIN(prev_gpa_start, mrs_gpa);
                prev_sec->offset_within_region =
                    MIN(prev_host_start, mrs_host) -
                    (uintptr_t)memory_region_get_ram_ptr(prev_sec->mr);
                prev_sec->size = int128_make64(max_end - MIN(prev_host_start,
                                               mrs_host));
                trace_vhost_region_add_section_merge(section->mr->name,
                                        int128_get64(prev_sec->size),
                                        prev_sec->offset_within_address_space,
                                        prev_sec->offset_within_region);
            } else {
                /* adjoining regions are fine, but overlapping ones with
                 * different blocks/offsets shouldn't happen
                 */
                if (mrs_gpa != prev_gpa_end + 1) {
                    error_report("%s: Overlapping but not coherent sections "
                                 "at %"PRIx64,
                                 __func__, mrs_gpa);
                    return;
                }
            }
        }
    }

    if (need_add) {
        ++dev->n_tmp_sections;
        dev->tmp_sections = g_renew(MemoryRegionSection, dev->tmp_sections,
                                    dev->n_tmp_sections);
        dev->tmp_sections[dev->n_tmp_sections - 1] = *section;
        /* The flatview isn't stable and we don't use it, making it NULL
         * means we can memcmp the list.
         */
        dev->tmp_sections[dev->n_tmp_sections - 1].fv = NULL;
        /* The stored copy keeps the MemoryRegion alive. */
        memory_region_ref(section->mr);
    }
}
 848
 849 /* Used for both add and nop callbacks */
 850 static void vhost_region_addnop(MemoryListener *listener,
 851                                 MemoryRegionSection *section)
 852 {
 853     struct vhost_dev *dev = container_of(listener, struct vhost_dev,
 854                                          memory_listener);
 855
 856     if (!vhost_section(dev, section)) {
 857         return;
 858     }
 859     vhost_region_add_section(dev, section);
 860 }
 861
 862 static void vhost_iommu_unmap_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
 863 {
 864     struct vhost_iommu *iommu = container_of(n, struct vhost_iommu, n);
 865     struct vhost_dev *hdev = iommu->hdev;
 866     hwaddr iova = iotlb->iova + iommu->iommu_offset;
 867
 868     if (vhost_backend_invalidate_device_iotlb(hdev, iova,
 869                                               iotlb->addr_mask + 1)) {
 870         error_report("Fail to invalidate device iotlb");
 871     }
 872 }
 873
 874 static void vhost_iommu_region_add(MemoryListener *listener,
 875                                    MemoryRegionSection *section)
 876 {
 877     struct vhost_dev *dev = container_of(listener, struct vhost_dev,
 878                                          iommu_listener);
 879     struct vhost_iommu *iommu;
 880     Int128 end;
 881     int iommu_idx;
 882     IOMMUMemoryRegion *iommu_mr;
 883
 884     if (!memory_region_is_iommu(section->mr)) {
 885         return;
 886     }
 887
 888     iommu_mr = IOMMU_MEMORY_REGION(section->mr);
 889
 890     iommu = g_malloc0(sizeof(*iommu));
 891     end = int128_add(int128_make64(section->offset_within_region),
 892                      section->size);
 893     end = int128_sub(end, int128_one());
 894     iommu_idx = memory_region_iommu_attrs_to_index(iommu_mr,
 895                                                    MEMTXATTRS_UNSPECIFIED);
 896     iommu_notifier_init(&iommu->n, vhost_iommu_unmap_notify,
 897                         dev->vdev->device_iotlb_enabled ?
 898                             IOMMU_NOTIFIER_DEVIOTLB_UNMAP :
 899                             IOMMU_NOTIFIER_UNMAP,
 900                         section->offset_within_region,
 901                         int128_get64(end),
 902                         iommu_idx);
 903     iommu->mr = section->mr;
 904     iommu->iommu_offset = section->offset_within_address_space -
 905                           section->offset_within_region;
 906     iommu->hdev = dev;
 907     memory_region_register_iommu_notifier(section->mr, &iommu->n,
 908                                           &error_fatal);
 909     QLIST_INSERT_HEAD(&dev->iommu_list, iommu, iommu_next);
 910     /* TODO: can replay help performance here? */
 911 }
 912
 913 static void vhost_iommu_region_del(MemoryListener *listener,
 914                                    MemoryRegionSection *section)
 915 {
 916     struct vhost_dev *dev = container_of(listener, struct vhost_dev,
 917                                          iommu_listener);
 918     struct vhost_iommu *iommu;
 919
 920     if (!memory_region_is_iommu(section->mr)) {
 921         return;
 922     }
 923
 924     QLIST_FOREACH(iommu, &dev->iommu_list, iommu_next) {
 925         if (iommu->mr == section->mr &&
 926             iommu->n.start == section->offset_within_region) {
 927             memory_region_unregister_iommu_notifier(iommu->mr,
 928                                                     &iommu->n);
 929             QLIST_REMOVE(iommu, iommu_next);
 930             g_free(iommu);
 931             break;
 932         }
 933     }
 934 }
 935
 936 void vhost_toggle_device_iotlb(VirtIODevice *vdev)
 937 {
 938     VirtioDeviceClass *vdc = VIRTIO_DEVICE_GET_CLASS(vdev);
 939     struct vhost_dev *dev;
 940     struct vhost_iommu *iommu;
 941
 942     if (vdev->vhost_started) {
 943         dev = vdc->get_vhost(vdev);
 944     } else {
 945         return;
 946     }
 947
 948     QLIST_FOREACH(iommu, &dev->iommu_list, iommu_next) {
 949         memory_region_unregister_iommu_notifier(iommu->mr, &iommu->n);
 950         iommu->n.notifier_flags = vdev->device_iotlb_enabled ?
 951                 IOMMU_NOTIFIER_DEVIOTLB_UNMAP : IOMMU_NOTIFIER_UNMAP;
 952         memory_region_register_iommu_notifier(iommu->mr, &iommu->n,
 953                                               &error_fatal);
 954     }
 955 }
 956
 957 static int vhost_virtqueue_set_addr(struct vhost_dev *dev,
 958                                     struct vhost_virtqueue *vq,
 959                                     unsigned idx, bool enable_log)
 960 {
 961     struct vhost_vring_addr addr;
 962     int r;
 963     memset(&addr, 0, sizeof(struct vhost_vring_addr));
 964
 965     if (dev->vhost_ops->vhost_vq_get_addr) {
 966         r = dev->vhost_ops->vhost_vq_get_addr(dev, &addr, vq);
 967         if (r < 0) {
 968             VHOST_OPS_DEBUG(r, "vhost_vq_get_addr failed");
 969             return r;
 970         }
 971     } else {
 972         addr.desc_user_addr = (uint64_t)(unsigned long)vq->desc;
 973         addr.avail_user_addr = (uint64_t)(unsigned long)vq->avail;
 974         addr.used_user_addr = (uint64_t)(unsigned long)vq->used;
 975     }
 976     addr.index = idx;
 977     addr.log_guest_addr = vq->used_phys;
 978     addr.flags = enable_log ? (1 << VHOST_VRING_F_LOG) : 0;
 979     r = dev->vhost_ops->vhost_set_vring_addr(dev, &addr);
 980     if (r < 0) {
 981         VHOST_OPS_DEBUG(r, "vhost_set_vring_addr failed");
 982     }
 983     return r;
 984 }
 985
 986 static int vhost_dev_set_features(struct vhost_dev *dev,
 987                                   bool enable_log)
 988 {
 989     uint64_t features = dev->acked_features;
 990     int r;
 991     if (enable_log) {
 992         features |= 0x1ULL << VHOST_F_LOG_ALL;
 993     }
 994     if (!vhost_dev_has_iommu(dev)) {
 995         features &= ~(0x1ULL << VIRTIO_F_IOMMU_PLATFORM);
 996     }
 997     if (dev->vhost_ops->vhost_force_iommu) {
 998         if (dev->vhost_ops->vhost_force_iommu(dev) == true) {
 999             features |= 0x1ULL << VIRTIO_F_IOMMU_PLATFORM;
1000        }
1001     }
1002     r = dev->vhost_ops->vhost_set_features(dev, features);
1003     if (r < 0) {
1004         VHOST_OPS_DEBUG(r, "vhost_set_features failed");
1005         goto out;
1006     }
1007     if (dev->vhost_ops->vhost_set_backend_cap) {
1008         r = dev->vhost_ops->vhost_set_backend_cap(dev);
1009         if (r < 0) {
1010             VHOST_OPS_DEBUG(r, "vhost_set_backend_cap failed");
1011             goto out;
1012         }
1013     }
1014
1015 out:
1016     return r;
1017 }
1018
1019 static int vhost_dev_set_log(struct vhost_dev *dev, bool enable_log)
1020 {
1021     int r, i, idx;
1022     hwaddr addr;
1023
1024     r = vhost_dev_set_features(dev, enable_log);
1025     if (r < 0) {
1026         goto err_features;
1027     }
1028     for (i = 0; i < dev->nvqs; ++i) {
1029         idx = dev->vhost_ops->vhost_get_vq_index(dev, dev->vq_index + i);
1030         addr = virtio_queue_get_desc_addr(dev->vdev, idx);
1031         if (!addr) {
1032             /*
1033              * The queue might not be ready for start. If this
1034              * is the case there is no reason to continue the process.
1035              * The similar logic is used by the vhost_virtqueue_start()
1036              * routine.
1037              */
1038             continue;
1039         }
1040         r = vhost_virtqueue_set_addr(dev, dev->vqs + i, idx,
1041                                      enable_log);
1042         if (r < 0) {
1043             goto err_vq;
1044         }
1045     }
1046
1047     /*
1048      * At log start we select our vhost_device logger that will scan the
1049      * memory sections and skip for the others. This is possible because
1050      * the log is shared amongst all vhost devices for a given type of
1051      * backend.
1052      */
1053     vhost_dev_elect_mem_logger(dev, enable_log);
1054
1055     return 0;
1056 err_vq:
1057     for (; i >= 0; --i) {
1058         idx = dev->vhost_ops->vhost_get_vq_index(dev, dev->vq_index + i);
1059         addr = virtio_queue_get_desc_addr(dev->vdev, idx);
1060         if (!addr) {
1061             continue;
1062         }
1063         vhost_virtqueue_set_addr(dev, dev->vqs + i, idx,
1064                                  dev->log_enabled);
1065     }
1066     vhost_dev_set_features(dev, dev->log_enabled);
1067 err_features:
1068     return r;
1069 }
1070
1071 static int vhost_migration_log(MemoryListener *listener, bool enable)
1072 {
1073     struct vhost_dev *dev = container_of(listener, struct vhost_dev,
1074                                          memory_listener);
1075     int r;
1076     if (enable == dev->log_enabled) {
1077         return 0;
1078     }
1079     if (!dev->started) {
1080         dev->log_enabled = enable;
1081         return 0;
1082     }
1083
1084     r = 0;
1085     if (!enable) {
1086         r = vhost_dev_set_log(dev, false);
1087         if (r < 0) {
1088             goto check_dev_state;
1089         }
1090         vhost_log_put(dev, false);
1091     } else {
1092         vhost_dev_log_resize(dev, vhost_get_log_size(dev));
1093         r = vhost_dev_set_log(dev, true);
1094         if (r < 0) {
1095             goto check_dev_state;
1096         }
1097     }
1098
1099 check_dev_state:
1100     dev->log_enabled = enable;
1101     /*
1102      * vhost-user-* devices could change their state during log
1103      * initialization due to disconnect. So check dev state after
1104      * vhost communication.
1105      */
1106     if (!dev->started) {
1107         /*
1108          * Since device is in the stopped state, it is okay for
1109          * migration. Return success.
1110          */
1111         r = 0;
1112     }
1113     if (r) {
1114         /* An error occurred. */
1115         dev->log_enabled = false;
1116     }
1117
1118     return r;
1119 }
1120
1121 static bool vhost_log_global_start(MemoryListener *listener, Error **errp)
1122 {
1123     int r;
1124
1125     r = vhost_migration_log(listener, true);
1126     if (r < 0) {
1127         abort();
1128     }
1129     return true;
1130 }
1131
1132 static void vhost_log_global_stop(MemoryListener *listener)
1133 {
1134     int r;
1135
1136     r = vhost_migration_log(listener, false);
1137     if (r < 0) {
1138         abort();
1139     }
1140 }
1141
1142 static void vhost_log_start(MemoryListener *listener,
1143                             MemoryRegionSection *section,
1144                             int old, int new)
1145 {
1146     /* FIXME: implement */
1147 }
1148
1149 static void vhost_log_stop(MemoryListener *listener,
1150                            MemoryRegionSection *section,
1151                            int old, int new)
1152 {
1153     /* FIXME: implement */
1154 }
1155
1156 /* The vhost driver natively knows how to handle the vrings of non
1157  * cross-endian legacy devices and modern devices. Only legacy devices
1158  * exposed to a bi-endian guest may require the vhost driver to use a
1159  * specific endianness.
1160  */
1161 static inline bool vhost_needs_vring_endian(VirtIODevice *vdev)
1162 {
1163     if (virtio_vdev_has_feature(vdev, VIRTIO_F_VERSION_1)) {
1164         return false;
1165     }
1166 #if HOST_BIG_ENDIAN
1167     return vdev->device_endian == VIRTIO_DEVICE_ENDIAN_LITTLE;
1168 #else
1169     return vdev->device_endian == VIRTIO_DEVICE_ENDIAN_BIG;
1170 #endif
1171 }
1172
1173 static int vhost_virtqueue_set_vring_endian_legacy(struct vhost_dev *dev,
1174                                                    bool is_big_endian,
1175                                                    int vhost_vq_index)
1176 {
1177     int r;
1178     struct vhost_vring_state s = {
1179         .index = vhost_vq_index,
1180         .num = is_big_endian
1181     };
1182
1183     r = dev->vhost_ops->vhost_set_vring_endian(dev, &s);
1184     if (r < 0) {
1185         VHOST_OPS_DEBUG(r, "vhost_set_vring_endian failed");
1186     }
1187     return r;
1188 }
1189
1190 static int vhost_memory_region_lookup(struct vhost_dev *hdev,
1191                                       uint64_t gpa, uint64_t *uaddr,
1192                                       uint64_t *len)
1193 {
1194     int i;
1195
1196     for (i = 0; i < hdev->mem->nregions; i++) {
1197         struct vhost_memory_region *reg = hdev->mem->regions + i;
1198
1199         if (gpa >= reg->guest_phys_addr &&
1200             reg->guest_phys_addr + reg->memory_size > gpa) {
1201             *uaddr = reg->userspace_addr + gpa - reg->guest_phys_addr;
1202             *len = reg->guest_phys_addr + reg->memory_size - gpa;
1203             return 0;
1204         }
1205     }
1206
1207     return -EFAULT;
1208 }
1209
1210 int vhost_device_iotlb_miss(struct vhost_dev *dev, uint64_t iova, int write)
1211 {
1212     IOMMUTLBEntry iotlb;
1213     uint64_t uaddr, len;
1214     int ret = -EFAULT;
1215
1216     RCU_READ_LOCK_GUARD();
1217
1218     trace_vhost_iotlb_miss(dev, 1);
1219
1220     iotlb = address_space_get_iotlb_entry(dev->vdev->dma_as,
1221                                           iova, write,
1222                                           MEMTXATTRS_UNSPECIFIED);
1223     if (iotlb.target_as != NULL) {
1224         ret = vhost_memory_region_lookup(dev, iotlb.translated_addr,
1225                                          &uaddr, &len);
1226         if (ret) {
1227             trace_vhost_iotlb_miss(dev, 3);
1228             error_report("Fail to lookup the translated address "
1229                          "%"PRIx64, iotlb.translated_addr);
1230             goto out;
1231         }
1232
1233         len = MIN(iotlb.addr_mask + 1, len);
1234         iova = iova & ~iotlb.addr_mask;
1235
1236         ret = vhost_backend_update_device_iotlb(dev, iova, uaddr,
1237                                                 len, iotlb.perm);
1238         if (ret) {
1239             trace_vhost_iotlb_miss(dev, 4);
1240             error_report("Fail to update device iotlb");
1241             goto out;
1242         }
1243     }
1244
1245     trace_vhost_iotlb_miss(dev, 2);
1246
1247 out:
1248     return ret;
1249 }
1250
1251 int vhost_virtqueue_start(struct vhost_dev *dev,
1252                           struct VirtIODevice *vdev,
1253                           struct vhost_virtqueue *vq,
1254                           unsigned idx)
1255 {
1256     BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
1257     VirtioBusState *vbus = VIRTIO_BUS(qbus);
1258     VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(vbus);
1259     hwaddr s, l, a;
1260     int r;
1261     int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, idx);
1262     struct vhost_vring_file file = {
1263         .index = vhost_vq_index
1264     };
1265     struct vhost_vring_state state = {
1266         .index = vhost_vq_index
1267     };
1268     struct VirtQueue *vvq = virtio_get_queue(vdev, idx);
1269
1270     a = virtio_queue_get_desc_addr(vdev, idx);
1271     if (a == 0) {
1272         /* Queue might not be ready for start */
1273         return 0;
1274     }
1275
1276     vq->num = state.num = virtio_queue_get_num(vdev, idx);
1277     r = dev->vhost_ops->vhost_set_vring_num(dev, &state);
1278     if (r) {
1279         VHOST_OPS_DEBUG(r, "vhost_set_vring_num failed");
1280         return r;
1281     }
1282
1283     state.num = virtio_queue_get_last_avail_idx(vdev, idx);
1284     r = dev->vhost_ops->vhost_set_vring_base(dev, &state);
1285     if (r) {
1286         VHOST_OPS_DEBUG(r, "vhost_set_vring_base failed");
1287         return r;
1288     }
1289
1290     if (vhost_needs_vring_endian(vdev)) {
1291         r = vhost_virtqueue_set_vring_endian_legacy(dev,
1292                                                     virtio_is_big_endian(vdev),
1293                                                     vhost_vq_index);
1294         if (r) {
1295             return r;
1296         }
1297     }
1298
1299     vq->desc_size = s = l = virtio_queue_get_desc_size(vdev, idx);
1300     vq->desc_phys = a;
1301     vq->desc = vhost_memory_map(dev, a, &l, false);
1302     if (!vq->desc || l != s) {
1303         r = -ENOMEM;
1304         goto fail_alloc_desc;
1305     }
1306     vq->avail_size = s = l = virtio_queue_get_avail_size(vdev, idx);
1307     vq->avail_phys = a = virtio_queue_get_avail_addr(vdev, idx);
1308     vq->avail = vhost_memory_map(dev, a, &l, false);
1309     if (!vq->avail || l != s) {
1310         r = -ENOMEM;
1311         goto fail_alloc_avail;
1312     }
1313     vq->used_size = s = l = virtio_queue_get_used_size(vdev, idx);
1314     vq->used_phys = a = virtio_queue_get_used_addr(vdev, idx);
1315     vq->used = vhost_memory_map(dev, a, &l, true);
1316     if (!vq->used || l != s) {
1317         r = -ENOMEM;
1318         goto fail_alloc_used;
1319     }
1320
1321     r = vhost_virtqueue_set_addr(dev, vq, vhost_vq_index, dev->log_enabled);
1322     if (r < 0) {
1323         goto fail_alloc;
1324     }
1325
1326     file.fd = event_notifier_get_fd(virtio_queue_get_host_notifier(vvq));
1327     r = dev->vhost_ops->vhost_set_vring_kick(dev, &file);
1328     if (r) {
1329         VHOST_OPS_DEBUG(r, "vhost_set_vring_kick failed");
1330         goto fail_kick;
1331     }
1332
1333     /* Clear and discard previous events if any. */
1334     event_notifier_test_and_clear(&vq->masked_notifier);
1335
1336     /* Init vring in unmasked state, unless guest_notifier_mask
1337      * will do it later.
1338      */
1339     if (!vdev->use_guest_notifier_mask) {
1340         /* TODO: check and handle errors. */
1341         vhost_virtqueue_mask(dev, vdev, idx, false);
1342     }
1343
1344     if (k->query_guest_notifiers &&
1345         k->query_guest_notifiers(qbus->parent) &&
1346         virtio_queue_vector(vdev, idx) == VIRTIO_NO_VECTOR) {
1347         file.fd = -1;
1348         r = dev->vhost_ops->vhost_set_vring_call(dev, &file);
1349         if (r) {
1350             goto fail_vector;
1351         }
1352     }
1353
1354     return 0;
1355
1356 fail_vector:
1357 fail_kick:
1358 fail_alloc:
1359     vhost_memory_unmap(dev, vq->used, virtio_queue_get_used_size(vdev, idx),
1360                        0, 0);
1361 fail_alloc_used:
1362     vhost_memory_unmap(dev, vq->avail, virtio_queue_get_avail_size(vdev, idx),
1363                        0, 0);
1364 fail_alloc_avail:
1365     vhost_memory_unmap(dev, vq->desc, virtio_queue_get_desc_size(vdev, idx),
1366                        0, 0);
1367 fail_alloc_desc:
1368     return r;
1369 }
1370
1371 void vhost_virtqueue_stop(struct vhost_dev *dev,
1372                           struct VirtIODevice *vdev,
1373                           struct vhost_virtqueue *vq,
1374                           unsigned idx)
1375 {
1376     int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, idx);
1377     struct vhost_vring_state state = {
1378         .index = vhost_vq_index,
1379     };
1380     int r;
1381
1382     if (virtio_queue_get_desc_addr(vdev, idx) == 0) {
1383         /* Don't stop the virtqueue which might have not been started */
1384         return;
1385     }
1386
1387     r = dev->vhost_ops->vhost_get_vring_base(dev, &state);
1388     if (r < 0) {
1389         VHOST_OPS_DEBUG(r, "vhost VQ %u ring restore failed: %d", idx, r);
1390         /* Connection to the backend is broken, so let's sync internal
1391          * last avail idx to the device used idx.
1392          */
1393         virtio_queue_restore_last_avail_idx(vdev, idx);
1394     } else {
1395         virtio_queue_set_last_avail_idx(vdev, idx, state.num);
1396     }
1397     virtio_queue_invalidate_signalled_used(vdev, idx);
1398     virtio_queue_update_used_idx(vdev, idx);
1399
1400     /* In the cross-endian case, we need to reset the vring endianness to
1401      * native as legacy devices expect so by default.
1402      */
1403     if (vhost_needs_vring_endian(vdev)) {
1404         vhost_virtqueue_set_vring_endian_legacy(dev,
1405                                                 !virtio_is_big_endian(vdev),
1406                                                 vhost_vq_index);
1407     }
1408
1409     vhost_memory_unmap(dev, vq->used, virtio_queue_get_used_size(vdev, idx),
1410                        1, virtio_queue_get_used_size(vdev, idx));
1411     vhost_memory_unmap(dev, vq->avail, virtio_queue_get_avail_size(vdev, idx),
1412                        0, virtio_queue_get_avail_size(vdev, idx));
1413     vhost_memory_unmap(dev, vq->desc, virtio_queue_get_desc_size(vdev, idx),
1414                        0, virtio_queue_get_desc_size(vdev, idx));
1415 }
1416
1417 static int vhost_virtqueue_set_busyloop_timeout(struct vhost_dev *dev,
1418                                                 int n, uint32_t timeout)
1419 {
1420     int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, n);
1421     struct vhost_vring_state state = {
1422         .index = vhost_vq_index,
1423         .num = timeout,
1424     };
1425     int r;
1426
1427     if (!dev->vhost_ops->vhost_set_vring_busyloop_timeout) {
1428         return -EINVAL;
1429     }
1430
1431     r = dev->vhost_ops->vhost_set_vring_busyloop_timeout(dev, &state);
1432     if (r) {
1433         VHOST_OPS_DEBUG(r, "vhost_set_vring_busyloop_timeout failed");
1434         return r;
1435     }
1436
1437     return 0;
1438 }
1439
1440 static void vhost_virtqueue_error_notifier(EventNotifier *n)
1441 {
1442     struct vhost_virtqueue *vq = container_of(n, struct vhost_virtqueue,
1443                                               error_notifier);
1444     struct vhost_dev *dev = vq->dev;
1445     int index = vq - dev->vqs;
1446
1447     if (event_notifier_test_and_clear(n) && dev->vdev) {
1448         VHOST_OPS_DEBUG(-EINVAL,  "vhost vring error in virtqueue %d",
1449                         dev->vq_index + index);
1450     }
1451 }
1452
1453 static int vhost_virtqueue_init(struct vhost_dev *dev,
1454                                 struct vhost_virtqueue *vq, int n)
1455 {
1456     int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, n);
1457     struct vhost_vring_file file = {
1458         .index = vhost_vq_index,
1459     };
1460     int r = event_notifier_init(&vq->masked_notifier, 0);
1461     if (r < 0) {
1462         return r;
1463     }
1464
1465     file.fd = event_notifier_get_wfd(&vq->masked_notifier);
1466     r = dev->vhost_ops->vhost_set_vring_call(dev, &file);
1467     if (r) {
1468         VHOST_OPS_DEBUG(r, "vhost_set_vring_call failed");
1469         goto fail_call;
1470     }
1471
1472     vq->dev = dev;
1473
1474     if (dev->vhost_ops->vhost_set_vring_err) {
1475         r = event_notifier_init(&vq->error_notifier, 0);
1476         if (r < 0) {
1477             goto fail_call;
1478         }
1479
1480         file.fd = event_notifier_get_fd(&vq->error_notifier);
1481         r = dev->vhost_ops->vhost_set_vring_err(dev, &file);
1482         if (r) {
1483             VHOST_OPS_DEBUG(r, "vhost_set_vring_err failed");
1484             goto fail_err;
1485         }
1486
1487         event_notifier_set_handler(&vq->error_notifier,
1488                                    vhost_virtqueue_error_notifier);
1489     }
1490
1491     return 0;
1492
1493 fail_err:
1494     event_notifier_cleanup(&vq->error_notifier);
1495 fail_call:
1496     event_notifier_cleanup(&vq->masked_notifier);
1497     return r;
1498 }
1499
1500 static void vhost_virtqueue_cleanup(struct vhost_virtqueue *vq)
1501 {
1502     event_notifier_cleanup(&vq->masked_notifier);
1503     if (vq->dev->vhost_ops->vhost_set_vring_err) {
1504         event_notifier_set_handler(&vq->error_notifier, NULL);
1505         event_notifier_cleanup(&vq->error_notifier);
1506     }
1507 }
1508
1509 int vhost_dev_init(struct vhost_dev *hdev, void *opaque,
1510                    VhostBackendType backend_type, uint32_t busyloop_timeout,
1511                    Error **errp)
1512 {
1513     unsigned int used, reserved, limit;
1514     uint64_t features;
1515     int i, r, n_initialized_vqs = 0;
1516
1517     hdev->vdev = NULL;
1518     hdev->migration_blocker = NULL;
1519
1520     r = vhost_set_backend_type(hdev, backend_type);
1521     assert(r >= 0);
1522
1523     r = hdev->vhost_ops->vhost_backend_init(hdev, opaque, errp);
1524     if (r < 0) {
1525         goto fail;
1526     }
1527
1528     r = hdev->vhost_ops->vhost_set_owner(hdev);
1529     if (r < 0) {
1530         error_setg_errno(errp, -r, "vhost_set_owner failed");
1531         goto fail;
1532     }
1533
1534     r = hdev->vhost_ops->vhost_get_features(hdev, &features);
1535     if (r < 0) {
1536         error_setg_errno(errp, -r, "vhost_get_features failed");
1537         goto fail;
1538     }
1539
1540     limit = hdev->vhost_ops->vhost_backend_memslots_limit(hdev);
1541     if (limit < MEMORY_DEVICES_SAFE_MAX_MEMSLOTS &&
1542         memory_devices_memslot_auto_decision_active()) {
1543         error_setg(errp, "some memory device (like virtio-mem)"
1544             " decided how many memory slots to use based on the overall"
1545             " number of memory slots; this vhost backend would further"
1546             " restricts the overall number of memory slots");
1547         error_append_hint(errp, "Try plugging this vhost backend before"
1548             " plugging such memory devices.\n");
1549         r = -EINVAL;
1550         goto fail;
1551     }
1552
1553     for (i = 0; i < hdev->nvqs; ++i, ++n_initialized_vqs) {
1554         r = vhost_virtqueue_init(hdev, hdev->vqs + i, hdev->vq_index + i);
1555         if (r < 0) {
1556             error_setg_errno(errp, -r, "Failed to initialize virtqueue %d", i);
1557             goto fail;
1558         }
1559     }
1560
1561     if (busyloop_timeout) {
1562         for (i = 0; i < hdev->nvqs; ++i) {
1563             r = vhost_virtqueue_set_busyloop_timeout(hdev, hdev->vq_index + i,
1564                                                      busyloop_timeout);
1565             if (r < 0) {
1566                 error_setg_errno(errp, -r, "Failed to set busyloop timeout");
1567                 goto fail_busyloop;
1568             }
1569         }
1570     }
1571
1572     hdev->features = features;
1573
1574     hdev->memory_listener = (MemoryListener) {
1575         .name = "vhost",
1576         .begin = vhost_begin,
1577         .commit = vhost_commit,
1578         .region_add = vhost_region_addnop,
1579         .region_nop = vhost_region_addnop,
1580         .log_start = vhost_log_start,
1581         .log_stop = vhost_log_stop,
1582         .log_sync = vhost_log_sync,
1583         .log_global_start = vhost_log_global_start,
1584         .log_global_stop = vhost_log_global_stop,
1585         .priority = MEMORY_LISTENER_PRIORITY_DEV_BACKEND
1586     };
1587
1588     hdev->iommu_listener = (MemoryListener) {
1589         .name = "vhost-iommu",
1590         .region_add = vhost_iommu_region_add,
1591         .region_del = vhost_iommu_region_del,
1592     };
1593
1594     if (hdev->migration_blocker == NULL) {
1595         if (!(hdev->features & (0x1ULL << VHOST_F_LOG_ALL))) {
1596             error_setg(&hdev->migration_blocker,
1597                        "Migration disabled: vhost lacks VHOST_F_LOG_ALL feature.");
1598         } else if (vhost_dev_log_is_shared(hdev) && !qemu_memfd_alloc_check()) {
1599             error_setg(&hdev->migration_blocker,
1600                        "Migration disabled: failed to allocate shared memory");
1601         }
1602     }
1603
1604     if (hdev->migration_blocker != NULL) {
1605         r = migrate_add_blocker_normal(&hdev->migration_blocker, errp);
1606         if (r < 0) {
1607             goto fail_busyloop;
1608         }
1609     }
1610
1611     hdev->mem = g_malloc0(offsetof(struct vhost_memory, regions));
1612     hdev->n_mem_sections = 0;
1613     hdev->mem_sections = NULL;
1614     hdev->log = NULL;
1615     hdev->log_size = 0;
1616     hdev->log_enabled = false;
1617     hdev->started = false;
1618     memory_listener_register(&hdev->memory_listener, &address_space_memory);
1619     QLIST_INSERT_HEAD(&vhost_devices, hdev, entry);
1620
1621     /*
1622      * The listener we registered properly updated the corresponding counter.
1623      * So we can trust that these values are accurate.
1624      */
1625     if (hdev->vhost_ops->vhost_backend_no_private_memslots &&
1626         hdev->vhost_ops->vhost_backend_no_private_memslots(hdev)) {
1627         used = used_shared_memslots;
1628     } else {
1629         used = used_memslots;
1630     }
1631     /*
1632      * We assume that all reserved memslots actually require a real memslot
1633      * in our vhost backend. This might not be true, for example, if the
1634      * memslot would be ROM. If ever relevant, we can optimize for that --
1635      * but we'll need additional information about the reservations.
1636      */
1637     reserved = memory_devices_get_reserved_memslots();
1638     if (used + reserved > limit) {
1639         error_setg(errp, "vhost backend memory slots limit (%d) is less"
1640                    " than current number of used (%d) and reserved (%d)"
1641                    " memory slots for memory devices.", limit, used, reserved);
1642         r = -EINVAL;
1643         goto fail_busyloop;
1644     }
1645
1646     return 0;
1647
1648 fail_busyloop:
1649     if (busyloop_timeout) {
1650         while (--i >= 0) {
1651             vhost_virtqueue_set_busyloop_timeout(hdev, hdev->vq_index + i, 0);
1652         }
1653     }
1654 fail:
1655     hdev->nvqs = n_initialized_vqs;
1656     vhost_dev_cleanup(hdev);
1657     return r;
1658 }
1659
1660 void vhost_dev_cleanup(struct vhost_dev *hdev)
1661 {
1662     int i;
1663
1664     trace_vhost_dev_cleanup(hdev);
1665
1666     for (i = 0; i < hdev->nvqs; ++i) {
1667         vhost_virtqueue_cleanup(hdev->vqs + i);
1668     }
1669     if (hdev->mem) {
1670         /* those are only safe after successful init */
1671         memory_listener_unregister(&hdev->memory_listener);
1672         QLIST_REMOVE(hdev, entry);
1673     }
1674     migrate_del_blocker(&hdev->migration_blocker);
1675     g_free(hdev->mem);
1676     g_free(hdev->mem_sections);
1677     if (hdev->vhost_ops) {
1678         hdev->vhost_ops->vhost_backend_cleanup(hdev);
1679     }
1680     assert(!hdev->log);
1681
1682     memset(hdev, 0, sizeof(struct vhost_dev));
1683 }
1684
1685 void vhost_dev_disable_notifiers_nvqs(struct vhost_dev *hdev,
1686                                       VirtIODevice *vdev,
1687                                       unsigned int nvqs)
1688 {
1689     BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
1690     int i, r;
1691
1692     /*
1693      * Batch all the host notifiers in a single transaction to avoid
1694      * quadratic time complexity in address_space_update_ioeventfds().
1695      */
1696     memory_region_transaction_begin();
1697
1698     for (i = 0; i < nvqs; ++i) {
1699         r = virtio_bus_set_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + i,
1700                                          false);
1701         if (r < 0) {
1702             error_report("vhost VQ %d notifier cleanup failed: %d", i, -r);
1703         }
1704         assert(r >= 0);
1705     }
1706
1707     /*
1708      * The transaction expects the ioeventfds to be open when it
1709      * commits. Do it now, before the cleanup loop.
1710      */
1711     memory_region_transaction_commit();
1712
1713     for (i = 0; i < nvqs; ++i) {
1714         virtio_bus_cleanup_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + i);
1715     }
1716     virtio_device_release_ioeventfd(vdev);
1717 }
1718
1719 /* Stop processing guest IO notifications in qemu.
1720  * Start processing them in vhost in kernel.
1721  */
1722 int vhost_dev_enable_notifiers(struct vhost_dev *hdev, VirtIODevice *vdev)
1723 {
1724     BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
1725     int i, r;
1726
1727     /* We will pass the notifiers to the kernel, make sure that QEMU
1728      * doesn't interfere.
1729      */
1730     r = virtio_device_grab_ioeventfd(vdev);
1731     if (r < 0) {
1732         error_report("binding does not support host notifiers");
1733         return r;
1734     }
1735
1736     /*
1737      * Batch all the host notifiers in a single transaction to avoid
1738      * quadratic time complexity in address_space_update_ioeventfds().
1739      */
1740     memory_region_transaction_begin();
1741
1742     for (i = 0; i < hdev->nvqs; ++i) {
1743         r = virtio_bus_set_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + i,
1744                                          true);
1745         if (r < 0) {
1746             error_report("vhost VQ %d notifier binding failed: %d", i, -r);
1747             memory_region_transaction_commit();
1748             vhost_dev_disable_notifiers_nvqs(hdev, vdev, i);
1749             return r;
1750         }
1751     }
1752
1753     memory_region_transaction_commit();
1754
1755     return 0;
1756 }
1757
/*
 * Stop processing guest IO notifications in vhost.
 * Start processing them in qemu.
 *
 * This might actually run the qemu handlers right away,
 * so virtio in qemu must be completely set up when this is called.
 *
 * Covers all hdev->nvqs virtqueues by delegating to
 * vhost_dev_disable_notifiers_nvqs().
 */
void vhost_dev_disable_notifiers(struct vhost_dev *hdev, VirtIODevice *vdev)
{
    vhost_dev_disable_notifiers_nvqs(hdev, vdev, hdev->nvqs);
}
1767
1768 /* Test and clear event pending status.
1769  * Should be called after unmask to avoid losing events.
1770  */
1771 bool vhost_virtqueue_pending(struct vhost_dev *hdev, int n)
1772 {
1773     struct vhost_virtqueue *vq = hdev->vqs + n - hdev->vq_index;
1774     assert(n >= hdev->vq_index && n < hdev->vq_index + hdev->nvqs);
1775     return event_notifier_test_and_clear(&vq->masked_notifier);
1776 }
1777
1778 /* Mask/unmask events from this vq. */
1779 void vhost_virtqueue_mask(struct vhost_dev *hdev, VirtIODevice *vdev, int n,
1780                          bool mask)
1781 {
1782     struct VirtQueue *vvq = virtio_get_queue(vdev, n);
1783     int r, index = n - hdev->vq_index;
1784     struct vhost_vring_file file;
1785
1786     /* should only be called after backend is connected */
1787     assert(hdev->vhost_ops);
1788
1789     if (mask) {
1790         assert(vdev->use_guest_notifier_mask);
1791         file.fd = event_notifier_get_wfd(&hdev->vqs[index].masked_notifier);
1792     } else {
1793         file.fd = event_notifier_get_wfd(virtio_queue_get_guest_notifier(vvq));
1794     }
1795
1796     file.index = hdev->vhost_ops->vhost_get_vq_index(hdev, n);
1797     r = hdev->vhost_ops->vhost_set_vring_call(hdev, &file);
1798     if (r < 0) {
1799         error_report("vhost_set_vring_call failed %d", -r);
1800     }
1801 }
1802
1803 bool vhost_config_pending(struct vhost_dev *hdev)
1804 {
1805     assert(hdev->vhost_ops);
1806     if ((hdev->started == false) ||
1807         (hdev->vhost_ops->vhost_set_config_call == NULL)) {
1808         return false;
1809     }
1810
1811     EventNotifier *notifier =
1812         &hdev->vqs[VHOST_QUEUE_NUM_CONFIG_INR].masked_config_notifier;
1813     return event_notifier_test_and_clear(notifier);
1814 }
1815
1816 void vhost_config_mask(struct vhost_dev *hdev, VirtIODevice *vdev, bool mask)
1817 {
1818     int fd;
1819     int r;
1820     EventNotifier *notifier =
1821         &hdev->vqs[VHOST_QUEUE_NUM_CONFIG_INR].masked_config_notifier;
1822     EventNotifier *config_notifier = &vdev->config_notifier;
1823     assert(hdev->vhost_ops);
1824
1825     if ((hdev->started == false) ||
1826         (hdev->vhost_ops->vhost_set_config_call == NULL)) {
1827         return;
1828     }
1829     if (mask) {
1830         assert(vdev->use_guest_notifier_mask);
1831         fd = event_notifier_get_fd(notifier);
1832     } else {
1833         fd = event_notifier_get_fd(config_notifier);
1834     }
1835     r = hdev->vhost_ops->vhost_set_config_call(hdev, fd);
1836     if (r < 0) {
1837         error_report("vhost_set_config_call failed %d", -r);
1838     }
1839 }
1840
1841 static void vhost_stop_config_intr(struct vhost_dev *dev)
1842 {
1843     int fd = -1;
1844     assert(dev->vhost_ops);
1845     if (dev->vhost_ops->vhost_set_config_call) {
1846         dev->vhost_ops->vhost_set_config_call(dev, fd);
1847     }
1848 }
1849
1850 static void vhost_start_config_intr(struct vhost_dev *dev)
1851 {
1852     int r;
1853
1854     assert(dev->vhost_ops);
1855     int fd = event_notifier_get_fd(&dev->vdev->config_notifier);
1856     if (dev->vhost_ops->vhost_set_config_call) {
1857         r = dev->vhost_ops->vhost_set_config_call(dev, fd);
1858         if (!r) {
1859             event_notifier_set(&dev->vdev->config_notifier);
1860         }
1861     }
1862 }
1863
1864 uint64_t vhost_get_features(struct vhost_dev *hdev, const int *feature_bits,
1865                             uint64_t features)
1866 {
1867     const int *bit = feature_bits;
1868     while (*bit != VHOST_INVALID_FEATURE_BIT) {
1869         uint64_t bit_mask = (1ULL << *bit);
1870         if (!(hdev->features & bit_mask)) {
1871             features &= ~bit_mask;
1872         }
1873         bit++;
1874     }
1875     return features;
1876 }
1877
1878 void vhost_ack_features(struct vhost_dev *hdev, const int *feature_bits,
1879                         uint64_t features)
1880 {
1881     const int *bit = feature_bits;
1882     while (*bit != VHOST_INVALID_FEATURE_BIT) {
1883         uint64_t bit_mask = (1ULL << *bit);
1884         if (features & bit_mask) {
1885             hdev->acked_features |= bit_mask;
1886         }
1887         bit++;
1888     }
1889 }
1890
1891 int vhost_dev_get_config(struct vhost_dev *hdev, uint8_t *config,
1892                          uint32_t config_len, Error **errp)
1893 {
1894     assert(hdev->vhost_ops);
1895
1896     if (hdev->vhost_ops->vhost_get_config) {
1897         return hdev->vhost_ops->vhost_get_config(hdev, config, config_len,
1898                                                  errp);
1899     }
1900
1901     error_setg(errp, "vhost_get_config not implemented");
1902     return -ENOSYS;
1903 }
1904
1905 int vhost_dev_set_config(struct vhost_dev *hdev, const uint8_t *data,
1906                          uint32_t offset, uint32_t size, uint32_t flags)
1907 {
1908     assert(hdev->vhost_ops);
1909
1910     if (hdev->vhost_ops->vhost_set_config) {
1911         return hdev->vhost_ops->vhost_set_config(hdev, data, offset,
1912                                                  size, flags);
1913     }
1914
1915     return -ENOSYS;
1916 }
1917
/*
 * Install @ops as the config ops of @hdev.  The pointer is stored as is,
 * not copied.
 */
void vhost_dev_set_config_notifier(struct vhost_dev *hdev,
                                   const VhostDevConfigOps *ops)
{
    hdev->config_ops = ops;
}
1923
1924 void vhost_dev_free_inflight(struct vhost_inflight *inflight)
1925 {
1926     if (inflight && inflight->addr) {
1927         qemu_memfd_free(inflight->addr, inflight->size, inflight->fd);
1928         inflight->addr = NULL;
1929         inflight->fd = -1;
1930     }
1931 }
1932
1933 int vhost_dev_prepare_inflight(struct vhost_dev *hdev, VirtIODevice *vdev)
1934 {
1935     int r;
1936
1937     if (hdev->vhost_ops->vhost_get_inflight_fd == NULL ||
1938         hdev->vhost_ops->vhost_set_inflight_fd == NULL) {
1939         return 0;
1940     }
1941
1942     hdev->vdev = vdev;
1943
1944     r = vhost_dev_set_features(hdev, hdev->log_enabled);
1945     if (r < 0) {
1946         VHOST_OPS_DEBUG(r, "vhost_dev_prepare_inflight failed");
1947         return r;
1948     }
1949
1950     return 0;
1951 }
1952
1953 int vhost_dev_set_inflight(struct vhost_dev *dev,
1954                            struct vhost_inflight *inflight)
1955 {
1956     int r;
1957
1958     if (dev->vhost_ops->vhost_set_inflight_fd && inflight->addr) {
1959         r = dev->vhost_ops->vhost_set_inflight_fd(dev, inflight);
1960         if (r) {
1961             VHOST_OPS_DEBUG(r, "vhost_set_inflight_fd failed");
1962             return r;
1963         }
1964     }
1965
1966     return 0;
1967 }
1968
1969 int vhost_dev_get_inflight(struct vhost_dev *dev, uint16_t queue_size,
1970                            struct vhost_inflight *inflight)
1971 {
1972     int r;
1973
1974     if (dev->vhost_ops->vhost_get_inflight_fd) {
1975         r = dev->vhost_ops->vhost_get_inflight_fd(dev, queue_size, inflight);
1976         if (r) {
1977             VHOST_OPS_DEBUG(r, "vhost_get_inflight_fd failed");
1978             return r;
1979         }
1980     }
1981
1982     return 0;
1983 }
1984
1985 static int vhost_dev_set_vring_enable(struct vhost_dev *hdev, int enable)
1986 {
1987     if (!hdev->vhost_ops->vhost_set_vring_enable) {
1988         return 0;
1989     }
1990
1991     /*
1992      * For vhost-user devices, if VHOST_USER_F_PROTOCOL_FEATURES has not
1993      * been negotiated, the rings start directly in the enabled state, and
1994      * .vhost_set_vring_enable callback will fail since
1995      * VHOST_USER_SET_VRING_ENABLE is not supported.
1996      */
1997     if (hdev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_USER &&
1998         !virtio_has_feature(hdev->backend_features,
1999                             VHOST_USER_F_PROTOCOL_FEATURES)) {
2000         return 0;
2001     }
2002
2003     return hdev->vhost_ops->vhost_set_vring_enable(hdev, enable);
2004 }
2005
2006 /*
2007  * Host notifiers must be enabled at this point.
2008  *
2009  * If @vrings is true, this function will enable all vrings before starting the
2010  * device. If it is false, the vring initialization is left to be done by the
2011  * caller.
2012  */
2013 int vhost_dev_start(struct vhost_dev *hdev, VirtIODevice *vdev, bool vrings)
2014 {
2015     int i, r;
2016
2017     /* should only be called after backend is connected */
2018     assert(hdev->vhost_ops);
2019
2020     trace_vhost_dev_start(hdev, vdev->name, vrings);
2021
2022     vdev->vhost_started = true;
2023     hdev->started = true;
2024     hdev->vdev = vdev;
2025
2026     r = vhost_dev_set_features(hdev, hdev->log_enabled);
2027     if (r < 0) {
2028         goto fail_features;
2029     }
2030
2031     if (vhost_dev_has_iommu(hdev)) {
2032         memory_listener_register(&hdev->iommu_listener, vdev->dma_as);
2033     }
2034
2035     r = hdev->vhost_ops->vhost_set_mem_table(hdev, hdev->mem);
2036     if (r < 0) {
2037         VHOST_OPS_DEBUG(r, "vhost_set_mem_table failed");
2038         goto fail_mem;
2039     }
2040     for (i = 0; i < hdev->nvqs; ++i) {
2041         r = vhost_virtqueue_start(hdev,
2042                                   vdev,
2043                                   hdev->vqs + i,
2044                                   hdev->vq_index + i);
2045         if (r < 0) {
2046             goto fail_vq;
2047         }
2048     }
2049
2050     r = event_notifier_init(
2051         &hdev->vqs[VHOST_QUEUE_NUM_CONFIG_INR].masked_config_notifier, 0);
2052     if (r < 0) {
2053         VHOST_OPS_DEBUG(r, "event_notifier_init failed");
2054         goto fail_vq;
2055     }
2056     event_notifier_test_and_clear(
2057         &hdev->vqs[VHOST_QUEUE_NUM_CONFIG_INR].masked_config_notifier);
2058     if (!vdev->use_guest_notifier_mask) {
2059         vhost_config_mask(hdev, vdev, true);
2060     }
2061     if (hdev->log_enabled) {
2062         uint64_t log_base;
2063
2064         hdev->log_size = vhost_get_log_size(hdev);
2065         hdev->log = vhost_log_get(hdev->vhost_ops->backend_type,
2066                                   hdev->log_size,
2067                                   vhost_dev_log_is_shared(hdev));
2068         log_base = (uintptr_t)hdev->log->log;
2069         r = hdev->vhost_ops->vhost_set_log_base(hdev,
2070                                                 hdev->log_size ? log_base : 0,
2071                                                 hdev->log);
2072         if (r < 0) {
2073             VHOST_OPS_DEBUG(r, "vhost_set_log_base failed");
2074             goto fail_log;
2075         }
2076         vhost_dev_elect_mem_logger(hdev, true);
2077     }
2078     if (vrings) {
2079         r = vhost_dev_set_vring_enable(hdev, true);
2080         if (r) {
2081             goto fail_log;
2082         }
2083     }
2084     if (hdev->vhost_ops->vhost_dev_start) {
2085         r = hdev->vhost_ops->vhost_dev_start(hdev, true);
2086         if (r) {
2087             goto fail_start;
2088         }
2089     }
2090     if (vhost_dev_has_iommu(hdev) &&
2091         hdev->vhost_ops->vhost_set_iotlb_callback) {
2092             hdev->vhost_ops->vhost_set_iotlb_callback(hdev, true);
2093
2094         /* Update used ring information for IOTLB to work correctly,
2095          * vhost-kernel code requires for this.*/
2096         for (i = 0; i < hdev->nvqs; ++i) {
2097             struct vhost_virtqueue *vq = hdev->vqs + i;
2098             vhost_device_iotlb_miss(hdev, vq->used_phys, true);
2099         }
2100     }
2101     vhost_start_config_intr(hdev);
2102     return 0;
2103 fail_start:
2104     if (vrings) {
2105         vhost_dev_set_vring_enable(hdev, false);
2106     }
2107 fail_log:
2108     vhost_log_put(hdev, false);
2109 fail_vq:
2110     while (--i >= 0) {
2111         vhost_virtqueue_stop(hdev,
2112                              vdev,
2113                              hdev->vqs + i,
2114                              hdev->vq_index + i);
2115     }
2116
2117 fail_mem:
2118     if (vhost_dev_has_iommu(hdev)) {
2119         memory_listener_unregister(&hdev->iommu_listener);
2120     }
2121 fail_features:
2122     vdev->vhost_started = false;
2123     hdev->started = false;
2124     return r;
2125 }
2126
/*
 * Stop the vhost backend for @vdev.
 *
 * Host notifiers must be enabled at this point.
 *
 * If @vrings is true, the vrings are disabled after the backend's start hook
 * has been called with "false".  On return hdev->started and
 * vdev->vhost_started are false and hdev->vdev is NULL.
 */
void vhost_dev_stop(struct vhost_dev *hdev, VirtIODevice *vdev, bool vrings)
{
    int i;

    /* should only be called after backend is connected */
    assert(hdev->vhost_ops);
    /* Clear both config notifiers, then tear down the masked one. */
    event_notifier_test_and_clear(
        &hdev->vqs[VHOST_QUEUE_NUM_CONFIG_INR].masked_config_notifier);
    event_notifier_test_and_clear(&vdev->config_notifier);
    event_notifier_cleanup(
        &hdev->vqs[VHOST_QUEUE_NUM_CONFIG_INR].masked_config_notifier);

    trace_vhost_dev_stop(hdev, vdev->name, vrings);

    /* The backend's start hook doubles as its stop hook (started=false). */
    if (hdev->vhost_ops->vhost_dev_start) {
        hdev->vhost_ops->vhost_dev_start(hdev, false);
    }
    if (vrings) {
        vhost_dev_set_vring_enable(hdev, false);
    }
    for (i = 0; i < hdev->nvqs; ++i) {
        vhost_virtqueue_stop(hdev,
                             vdev,
                             hdev->vqs + i,
                             hdev->vq_index + i);
    }
    if (hdev->vhost_ops->vhost_reset_status) {
        hdev->vhost_ops->vhost_reset_status(hdev);
    }

    if (vhost_dev_has_iommu(hdev)) {
        if (hdev->vhost_ops->vhost_set_iotlb_callback) {
            hdev->vhost_ops->vhost_set_iotlb_callback(hdev, false);
        }
        memory_listener_unregister(&hdev->iommu_listener);
    }
    vhost_stop_config_intr(hdev);
    vhost_log_put(hdev, true);
    hdev->started = false;
    vdev->vhost_started = false;
    hdev->vdev = NULL;
}
2170
2171 int vhost_net_set_backend(struct vhost_dev *hdev,
2172                           struct vhost_vring_file *file)
2173 {
2174     if (hdev->vhost_ops->vhost_net_set_backend) {
2175         return hdev->vhost_ops->vhost_net_set_backend(hdev, file);
2176     }
2177
2178     return -ENOSYS;
2179 }
2180
2181 int vhost_reset_device(struct vhost_dev *hdev)
2182 {
2183     if (hdev->vhost_ops->vhost_reset_device) {
2184         return hdev->vhost_ops->vhost_reset_device(hdev);
2185     }
2186
2187     return -ENOSYS;
2188 }
2189
2190 bool vhost_supports_device_state(struct vhost_dev *dev)
2191 {
2192     if (dev->vhost_ops->vhost_supports_device_state) {
2193         return dev->vhost_ops->vhost_supports_device_state(dev);
2194     }
2195
2196     return false;
2197 }
2198
2199 int vhost_set_device_state_fd(struct vhost_dev *dev,
2200                               VhostDeviceStateDirection direction,
2201                               VhostDeviceStatePhase phase,
2202                               int fd,
2203                               int *reply_fd,
2204                               Error **errp)
2205 {
2206     if (dev->vhost_ops->vhost_set_device_state_fd) {
2207         return dev->vhost_ops->vhost_set_device_state_fd(dev, direction, phase,
2208                                                          fd, reply_fd, errp);
2209     }
2210
2211     error_setg(errp,
2212                "vhost transport does not support migration state transfer");
2213     return -ENOSYS;
2214 }
2215
2216 int vhost_check_device_state(struct vhost_dev *dev, Error **errp)
2217 {
2218     if (dev->vhost_ops->vhost_check_device_state) {
2219         return dev->vhost_ops->vhost_check_device_state(dev, errp);
2220     }
2221
2222     error_setg(errp,
2223                "vhost transport does not support migration state transfer");
2224     return -ENOSYS;
2225 }
2226
/*
 * Read the back-end's internal state through a pipe and write it to the
 * migration stream @f.
 *
 * The state is written as a sequence of chunks, each prefixed by its length
 * as a big-endian 32-bit value; a zero length marks the end.  Chunks are at
 * most 1 MiB.  The device must be stopped (asserted).
 *
 * Returns 0 on success; on failure a negative error is returned and @errp
 * is set.
 */
int vhost_save_backend_state(struct vhost_dev *dev, QEMUFile *f, Error **errp)
{
    ERRP_GUARD();
    /* Maximum chunk size in which to transfer the state */
    const size_t chunk_size = 1 * 1024 * 1024;
    g_autofree void *transfer_buf = NULL;
    g_autoptr(GError) g_err = NULL;
    int pipe_fds[2], read_fd = -1, write_fd = -1, reply_fd = -1;
    int ret;

    /* [0] for reading (our end), [1] for writing (back-end's end) */
    if (!g_unix_open_pipe(pipe_fds, FD_CLOEXEC, &g_err)) {
        error_setg(errp, "Failed to set up state transfer pipe: %s",
                   g_err->message);
        ret = -EINVAL;
        goto fail;
    }

    read_fd = pipe_fds[0];
    write_fd = pipe_fds[1];

    /*
     * VHOST_TRANSFER_STATE_PHASE_STOPPED means the device must be stopped.
     * Ideally, it is suspended, but SUSPEND/RESUME currently do not exist for
     * vhost-user, so just check that it is stopped at all.
     */
    assert(!dev->started);

    /* Transfer ownership of write_fd to the back-end */
    ret = vhost_set_device_state_fd(dev,
                                    VHOST_TRANSFER_STATE_DIRECTION_SAVE,
                                    VHOST_TRANSFER_STATE_PHASE_STOPPED,
                                    write_fd,
                                    &reply_fd,
                                    errp);
    if (ret < 0) {
        error_prepend(errp, "Failed to initiate state transfer: ");
        goto fail;
    }

    /* If the back-end wishes to use a different pipe, switch over */
    if (reply_fd >= 0) {
        close(read_fd);
        read_fd = reply_fd;
    }

    transfer_buf = g_malloc(chunk_size);

    /* Copy chunk by chunk until the back-end closes its end (read() == 0). */
    while (true) {
        ssize_t read_ret;

        read_ret = RETRY_ON_EINTR(read(read_fd, transfer_buf, chunk_size));
        if (read_ret < 0) {
            ret = -errno;
            error_setg_errno(errp, -ret, "Failed to receive state");
            goto fail;
        }

        assert(read_ret <= chunk_size);
        /* The length prefix is written even for EOF: 0 is the terminator. */
        qemu_put_be32(f, read_ret);

        if (read_ret == 0) {
            /* EOF */
            break;
        }

        qemu_put_buffer(f, transfer_buf, read_ret);
    }

    /*
     * Back-end will not really care, but be clean and close our end of the pipe
     * before inquiring the back-end about whether transfer was successful
     */
    close(read_fd);
    read_fd = -1;

    /* Also, verify that the device is still stopped */
    assert(!dev->started);

    ret = vhost_check_device_state(dev, errp);
    if (ret < 0) {
        goto fail;
    }

    ret = 0;
fail:
    /* Reached on success too; read_fd is -1 by then unless we bailed out. */
    if (read_fd >= 0) {
        close(read_fd);
    }

    return ret;
}
2319
/*
 * Read back-end state from the migration stream @f and feed it to the
 * back-end through a pipe.
 *
 * The stream is expected to hold a sequence of chunks, each prefixed by its
 * length as a big-endian 32-bit value, terminated by a zero length.  The
 * device must be stopped (asserted).
 *
 * Returns 0 on success; on failure a negative error is returned and @errp
 * is set.
 */
int vhost_load_backend_state(struct vhost_dev *dev, QEMUFile *f, Error **errp)
{
    ERRP_GUARD();
    size_t transfer_buf_size = 0;
    g_autofree void *transfer_buf = NULL;
    g_autoptr(GError) g_err = NULL;
    int pipe_fds[2], read_fd = -1, write_fd = -1, reply_fd = -1;
    int ret;

    /* [0] for reading (back-end's end), [1] for writing (our end) */
    if (!g_unix_open_pipe(pipe_fds, FD_CLOEXEC, &g_err)) {
        error_setg(errp, "Failed to set up state transfer pipe: %s",
                   g_err->message);
        ret = -EINVAL;
        goto fail;
    }

    read_fd = pipe_fds[0];
    write_fd = pipe_fds[1];

    /*
     * VHOST_TRANSFER_STATE_PHASE_STOPPED means the device must be stopped.
     * Ideally, it is suspended, but SUSPEND/RESUME currently do not exist for
     * vhost-user, so just check that it is stopped at all.
     */
    assert(!dev->started);

    /* Transfer ownership of read_fd to the back-end */
    ret = vhost_set_device_state_fd(dev,
                                    VHOST_TRANSFER_STATE_DIRECTION_LOAD,
                                    VHOST_TRANSFER_STATE_PHASE_STOPPED,
                                    read_fd,
                                    &reply_fd,
                                    errp);
    if (ret < 0) {
        error_prepend(errp, "Failed to initiate state transfer: ");
        goto fail;
    }

    /* If the back-end wishes to use a different pipe, switch over */
    if (reply_fd >= 0) {
        close(write_fd);
        write_fd = reply_fd;
    }

    while (true) {
        size_t this_chunk_size = qemu_get_be32(f);
        ssize_t write_ret;
        const uint8_t *transfer_pointer;

        if (this_chunk_size == 0) {
            /* End of state */
            break;
        }

        /* Grow the bounce buffer on demand; it is never shrunk. */
        if (transfer_buf_size < this_chunk_size) {
            transfer_buf = g_realloc(transfer_buf, this_chunk_size);
            transfer_buf_size = this_chunk_size;
        }

        if (qemu_get_buffer(f, transfer_buf, this_chunk_size) <
                this_chunk_size)
        {
            error_setg(errp, "Failed to read state");
            ret = -EINVAL;
            goto fail;
        }

        /* write() may be short, so loop until the whole chunk is sent. */
        transfer_pointer = transfer_buf;
        while (this_chunk_size > 0) {
            write_ret = RETRY_ON_EINTR(
                write(write_fd, transfer_pointer, this_chunk_size)
            );
            if (write_ret < 0) {
                ret = -errno;
                error_setg_errno(errp, -ret, "Failed to send state");
                goto fail;
            } else if (write_ret == 0) {
                error_setg(errp, "Failed to send state: Connection is closed");
                ret = -ECONNRESET;
                goto fail;
            }

            assert(write_ret <= this_chunk_size);
            this_chunk_size -= write_ret;
            transfer_pointer += write_ret;
        }
    }

    /*
     * Close our end, thus ending transfer, before inquiring the back-end about
     * whether transfer was successful
     */
    close(write_fd);
    write_fd = -1;

    /* Also, verify that the device is still stopped */
    assert(!dev->started);

    ret = vhost_check_device_state(dev, errp);
    if (ret < 0) {
        goto fail;
    }

    ret = 0;
fail:
    /* Reached on success too; write_fd is -1 by then unless we bailed out. */
    if (write_fd >= 0) {
        close(write_fd);
    }

    return ret;
}