/*
 * Copyright Red Hat, Inc. 2010
 *
 * Authors:
 *  Michael S. Tsirkin <mst@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2. See
 * the COPYING file in the top-level directory.
 *
 * Contributions after 2012-01-13 are licensed under the terms of the
 * GNU GPL, version 2 or (at your option) any later version.
 */

#include "qemu/osdep.h"
#include "qapi/error.h"
#include "hw/virtio/vhost.h"
#include "qemu/atomic.h"
#include "qemu/range.h"
#include "qemu/error-report.h"
#include "qemu/memfd.h"
#include "qemu/log.h"
#include "standard-headers/linux/vhost_types.h"
#include "hw/virtio/virtio-bus.h"
#include "hw/mem/memory-device.h"
#include "migration/blocker.h"
#include "migration/qemu-file-types.h"
#include "sysemu/dma.h"
#include "trace.h"

/* enabled until disconnected backend stabilizes */
#define _VHOST_DEBUG 1

#ifdef _VHOST_DEBUG
#define VHOST_OPS_DEBUG(retval, fmt, ...) \
    do { \
        error_report(fmt ": %s (%d)", ## __VA_ARGS__, \
                     strerror(-retval), -retval); \
    } while (0)
#else
#define VHOST_OPS_DEBUG(retval, fmt, ...) \
    do { } while (0)
#endif

static struct vhost_log *vhost_log[VHOST_BACKEND_TYPE_MAX];
static struct vhost_log *vhost_log_shm[VHOST_BACKEND_TYPE_MAX];
static QLIST_HEAD(, vhost_dev) vhost_log_devs[VHOST_BACKEND_TYPE_MAX];

/* Memslots used by backends that support private memslots (without an fd). */
static unsigned int used_memslots;

/* Memslots used by backends that only support shared memslots (with an fd). */
static unsigned int used_shared_memslots;

static QLIST_HEAD(, vhost_dev) vhost_devices =
    QLIST_HEAD_INITIALIZER(vhost_devices);
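
/*
 * Memslot accounting helpers: report, across all registered vhost devices,
 * the smallest backend memslot limit and how many memslots are still free
 * under that limit, so callers can tell whether another memory slot can be
 * plugged.
 */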
unsigned int vhost_get_max_memslots(void)
{
    unsigned int max = UINT_MAX;
    struct vhost_dev *hdev;

    QLIST_FOREACH(hdev, &vhost_devices, entry) {
        max = MIN(max, hdev->vhost_ops->vhost_backend_memslots_limit(hdev));
    }
    return max;
}

unsigned int vhost_get_free_memslots(void)
{
    unsigned int free = UINT_MAX;
    struct vhost_dev *hdev;

    QLIST_FOREACH(hdev, &vhost_devices, entry) {
        unsigned int r = hdev->vhost_ops->vhost_backend_memslots_limit(hdev);
        unsigned int cur_free;

        if (hdev->vhost_ops->vhost_backend_no_private_memslots &&
            hdev->vhost_ops->vhost_backend_no_private_memslots(hdev)) {
            cur_free = r - used_shared_memslots;
        } else {
            cur_free = r - used_memslots;
        }
        free = MIN(free, cur_free);
    }
    return free;
}
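
/*
 * Sync one memory section with the backend's dirty log. The log is an array
 * of vhost_log_chunk_t, each bit covering VHOST_LOG_PAGE bytes of guest
 * memory; dirty bits are consumed (cleared) as they are forwarded to
 * memory_region_set_dirty().
 */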
static void vhost_dev_sync_region(struct vhost_dev *dev,
                                  MemoryRegionSection *section,
                                  uint64_t mfirst, uint64_t mlast,
                                  uint64_t rfirst, uint64_t rlast)
{
    vhost_log_chunk_t *dev_log = dev->log->log;

    uint64_t start = MAX(mfirst, rfirst);
    uint64_t end = MIN(mlast, rlast);
    vhost_log_chunk_t *from = dev_log + start / VHOST_LOG_CHUNK;
    vhost_log_chunk_t *to = dev_log + end / VHOST_LOG_CHUNK + 1;
    uint64_t addr = QEMU_ALIGN_DOWN(start, VHOST_LOG_CHUNK);

    if (end < start) {
        return;
    }
    assert(end / VHOST_LOG_CHUNK < dev->log_size);
    assert(start / VHOST_LOG_CHUNK < dev->log_size);

    for (;from < to; ++from) {
        vhost_log_chunk_t log;
        /* We first check with non-atomic: much cheaper,
         * and we expect non-dirty to be the common case. */
        if (!*from) {
            addr += VHOST_LOG_CHUNK;
            continue;
        }
        /* Data must be read atomically. We don't really need barrier semantics
         * but it's easier to use atomic_* than roll our own. */
        log = qatomic_xchg(from, 0);
        while (log) {
            int bit = ctzl(log);
            hwaddr page_addr;
            hwaddr section_offset;
            hwaddr mr_offset;
            page_addr = addr + bit * VHOST_LOG_PAGE;
            section_offset = page_addr - section->offset_within_address_space;
            mr_offset = section_offset + section->offset_within_region;
            memory_region_set_dirty(section->mr, mr_offset, VHOST_LOG_PAGE);
            log &= ~(0x1ull << bit);
        }
        addr += VHOST_LOG_CHUNK;
    }
}

bool vhost_dev_has_iommu(struct vhost_dev *dev)
{
    VirtIODevice *vdev = dev->vdev;

    /*
     * For vhost, VIRTIO_F_IOMMU_PLATFORM means the backend supports an
     * incremental memory mapping API via the IOTLB API. On platforms that
     * do not have an IOMMU, there is no need to enable this feature,
     * which may cause unnecessary IOTLB miss/update transactions.
     */
    if (vdev) {
        return virtio_bus_device_iommu_enabled(vdev) &&
            virtio_host_has_feature(vdev, VIRTIO_F_IOMMU_PLATFORM);
    } else {
        return false;
    }
}

static inline bool vhost_dev_should_log(struct vhost_dev *dev)
{
    assert(dev->vhost_ops);
    assert(dev->vhost_ops->backend_type > VHOST_BACKEND_TYPE_NONE);
    assert(dev->vhost_ops->backend_type < VHOST_BACKEND_TYPE_MAX);

    return dev == QLIST_FIRST(&vhost_log_devs[dev->vhost_ops->backend_type]);
}

static inline void vhost_dev_elect_mem_logger(struct vhost_dev *hdev, bool add)
{
    VhostBackendType backend_type;

    assert(hdev->vhost_ops);

    backend_type = hdev->vhost_ops->backend_type;
    assert(backend_type > VHOST_BACKEND_TYPE_NONE);
    assert(backend_type < VHOST_BACKEND_TYPE_MAX);

    if (add && !QLIST_IS_INSERTED(hdev, logdev_entry)) {
        if (QLIST_EMPTY(&vhost_log_devs[backend_type])) {
            QLIST_INSERT_HEAD(&vhost_log_devs[backend_type],
                              hdev, logdev_entry);
        } else {
            /*
             * The first vhost_device in the list is selected as the shared
             * logger to scan memory sections. Put new entry next to the head
             * to avoid inadvertent change to the underlying logger device.
             * This is done in order to get better cache locality and to avoid
             * performance churn on the hot path for log scanning. Even when
             * new devices come and go quickly, it wouldn't end up changing
             * the active leading logger device at all.
             */
            QLIST_INSERT_AFTER(QLIST_FIRST(&vhost_log_devs[backend_type]),
                               hdev, logdev_entry);
        }
    } else if (!add && QLIST_IS_INSERTED(hdev, logdev_entry)) {
        QLIST_REMOVE(hdev, logdev_entry);
    }
}
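
/*
 * Walk the dirty log for one section: only the elected logger device scans
 * the memory regions, but every device syncs its own used rings, translating
 * through the IOMMU when one is in use.
 */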
static int vhost_sync_dirty_bitmap(struct vhost_dev *dev,
                                   MemoryRegionSection *section,
                                   hwaddr first,
                                   hwaddr last)
{
    int i;
    hwaddr start_addr;
    hwaddr end_addr;

    if (!dev->log_enabled || !dev->started) {
        return 0;
    }

    start_addr = section->offset_within_address_space;
    end_addr = range_get_last(start_addr, int128_get64(section->size));
    start_addr = MAX(first, start_addr);
    end_addr = MIN(last, end_addr);

    if (vhost_dev_should_log(dev)) {
        for (i = 0; i < dev->mem->nregions; ++i) {
            struct vhost_memory_region *reg = dev->mem->regions + i;
            vhost_dev_sync_region(dev, section, start_addr, end_addr,
                                  reg->guest_phys_addr,
                                  range_get_last(reg->guest_phys_addr,
                                                 reg->memory_size));
        }
    }
    for (i = 0; i < dev->nvqs; ++i) {
        struct vhost_virtqueue *vq = dev->vqs + i;

        if (!vq->used_phys && !vq->used_size) {
            continue;
        }

        if (vhost_dev_has_iommu(dev)) {
            IOMMUTLBEntry iotlb;
            hwaddr used_phys = vq->used_phys, used_size = vq->used_size;
            hwaddr phys, s, offset;

            while (used_size) {
                rcu_read_lock();
                iotlb = address_space_get_iotlb_entry(dev->vdev->dma_as,
                                                      used_phys,
                                                      true,
                                                      MEMTXATTRS_UNSPECIFIED);
                rcu_read_unlock();

                if (!iotlb.target_as) {
                    qemu_log_mask(LOG_GUEST_ERROR, "translation "
                                  "failure for used_iova %"PRIx64"\n",
                                  used_phys);
                    return -EINVAL;
                }

                offset = used_phys & iotlb.addr_mask;
                phys = iotlb.translated_addr + offset;

                /*
                 * Distance from start of used ring until last byte of
                 * IOMMU page.
                 */
                s = iotlb.addr_mask - offset;
                /*
                 * Size of used ring, or of the part of it until end
                 * of IOMMU page. To avoid zero result, do the adding
                 * outside of MIN().
                 */
                s = MIN(s, used_size - 1) + 1;

                vhost_dev_sync_region(dev, section, start_addr, end_addr, phys,
                                      range_get_last(phys, s));
                used_size -= s;
                used_phys += s;
            }
        } else {
            vhost_dev_sync_region(dev, section, start_addr,
                                  end_addr, vq->used_phys,
                                  range_get_last(vq->used_phys, vq->used_size));
        }
    }
    return 0;
}
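
/* MemoryListener log_sync hook: sync the whole guest address range. */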
static void vhost_log_sync(MemoryListener *listener,
                           MemoryRegionSection *section)
{
    struct vhost_dev *dev = container_of(listener, struct vhost_dev,
                                         memory_listener);
    vhost_sync_dirty_bitmap(dev, section, 0x0, ~0x0ULL);
}

static void vhost_log_sync_range(struct vhost_dev *dev,
                                 hwaddr first, hwaddr last)
{
    int i;
    /* FIXME: this is N^2 in number of sections */
    for (i = 0; i < dev->n_mem_sections; ++i) {
        MemoryRegionSection *section = &dev->mem_sections[i];
        vhost_sync_dirty_bitmap(dev, section, first, last);
    }
}

static uint64_t vhost_get_log_size(struct vhost_dev *dev)
{
    uint64_t log_size = 0;
    int i;

    for (i = 0; i < dev->mem->nregions; ++i) {
        struct vhost_memory_region *reg = dev->mem->regions + i;
        uint64_t last = range_get_last(reg->guest_phys_addr,
                                       reg->memory_size);
        log_size = MAX(log_size, last / VHOST_LOG_CHUNK + 1);
    }
    return log_size;
}

static int vhost_set_backend_type(struct vhost_dev *dev,
                                  VhostBackendType backend_type)
{
    int r = 0;

    switch (backend_type) {
#ifdef CONFIG_VHOST_KERNEL
    case VHOST_BACKEND_TYPE_KERNEL:
        dev->vhost_ops = &kernel_ops;
        break;
#endif
#ifdef CONFIG_VHOST_USER
    case VHOST_BACKEND_TYPE_USER:
        dev->vhost_ops = &user_ops;
        break;
#endif
#ifdef CONFIG_VHOST_VDPA
    case VHOST_BACKEND_TYPE_VDPA:
        dev->vhost_ops = &vdpa_ops;
        break;
#endif
    default:
        error_report("Unknown vhost backend type");
        r = -1;
    }

    if (r == 0) {
        assert(dev->vhost_ops->backend_type == backend_type);
    }

    return r;
}
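
/*
 * Allocate a dirty log of @size chunks. With @share the log lives in a
 * sealed memfd so an external backend process can map it; otherwise plain
 * heap memory is used.
 */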
static struct vhost_log *vhost_log_alloc(uint64_t size, bool share)
{
    Error *err = NULL;
    struct vhost_log *log;
    uint64_t logsize = size * sizeof(*(log->log));
    int fd = -1;

    log = g_new0(struct vhost_log, 1);
    if (share) {
        log->log = qemu_memfd_alloc("vhost-log", logsize,
                                    F_SEAL_GROW | F_SEAL_SHRINK | F_SEAL_SEAL,
                                    &fd, &err);
        if (err) {
            error_report_err(err);
            g_free(log);
            return NULL;
        }
        memset(log->log, 0, logsize);
    } else {
        log->log = g_malloc0(logsize);
    }

    log->size = size;
    log->refcnt = 1;
    log->fd = fd;

    return log;
}

static struct vhost_log *vhost_log_get(VhostBackendType backend_type,
                                       uint64_t size, bool share)
{
    struct vhost_log *log;

    assert(backend_type > VHOST_BACKEND_TYPE_NONE);
    assert(backend_type < VHOST_BACKEND_TYPE_MAX);

    log = share ? vhost_log_shm[backend_type] : vhost_log[backend_type];

    if (!log || log->size != size) {
        log = vhost_log_alloc(size, share);
        if (share) {
            vhost_log_shm[backend_type] = log;
        } else {
            vhost_log[backend_type] = log;
        }
    } else {
        ++log->refcnt;
    }

    return log;
}

static void vhost_log_put(struct vhost_dev *dev, bool sync)
{
    struct vhost_log *log = dev->log;
    VhostBackendType backend_type;

    if (!log) {
        return;
    }

    assert(dev->vhost_ops);
    backend_type = dev->vhost_ops->backend_type;

    if (backend_type == VHOST_BACKEND_TYPE_NONE ||
        backend_type >= VHOST_BACKEND_TYPE_MAX) {
        return;
    }

    --log->refcnt;
    if (log->refcnt == 0) {
        /* Sync only the range covered by the old log */
        if (dev->log_size && sync) {
            vhost_log_sync_range(dev, 0, dev->log_size * VHOST_LOG_CHUNK - 1);
        }

        if (vhost_log[backend_type] == log) {
            g_free(log->log);
            vhost_log[backend_type] = NULL;
        } else if (vhost_log_shm[backend_type] == log) {
            qemu_memfd_free(log->log, log->size * sizeof(*(log->log)),
                            log->fd);
            vhost_log_shm[backend_type] = NULL;
        }

        g_free(log);
    }

    vhost_dev_elect_mem_logger(dev, false);
    dev->log = NULL;
    dev->log_size = 0;
}

static bool vhost_dev_log_is_shared(struct vhost_dev *dev)
{
    return dev->vhost_ops->vhost_requires_shm_log &&
           dev->vhost_ops->vhost_requires_shm_log(dev);
}

static inline void vhost_dev_log_resize(struct vhost_dev *dev, uint64_t size)
{
    struct vhost_log *log = vhost_log_get(dev->vhost_ops->backend_type,
                                          size, vhost_dev_log_is_shared(dev));
    uint64_t log_base = (uintptr_t)log->log;
    int r;

    /* inform backend of log switching, this must be done before
       releasing the current log, to ensure no logging is lost */
    r = dev->vhost_ops->vhost_set_log_base(dev, log_base, log);
    if (r < 0) {
        VHOST_OPS_DEBUG(r, "vhost_set_log_base failed");
    }

    vhost_log_put(dev, true);
    dev->log = log;
    dev->log_size = size;
}
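
/*
 * With an IOMMU the ring addresses handed to us are IOVAs and are passed
 * through unmapped; the backend translates them later via the IOTLB
 * mechanism. Without an IOMMU they are guest physical addresses and are
 * mapped/unmapped here.
 */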
static void *vhost_memory_map(struct vhost_dev *dev, hwaddr addr,
                              hwaddr *plen, bool is_write)
{
    if (!vhost_dev_has_iommu(dev)) {
        return cpu_physical_memory_map(addr, plen, is_write);
    } else {
        return (void *)(uintptr_t)addr;
    }
}

static void vhost_memory_unmap(struct vhost_dev *dev, void *buffer,
                               hwaddr len, int is_write,
                               hwaddr access_len)
{
    if (!vhost_dev_has_iommu(dev)) {
        cpu_physical_memory_unmap(buffer, len, is_write, access_len);
    }
}

static int vhost_verify_ring_part_mapping(void *ring_hva,
                                          uint64_t ring_gpa,
                                          uint64_t ring_size,
                                          void *reg_hva,
                                          uint64_t reg_gpa,
                                          uint64_t reg_size)
{
    uint64_t hva_ring_offset;
    uint64_t ring_last = range_get_last(ring_gpa, ring_size);
    uint64_t reg_last = range_get_last(reg_gpa, reg_size);

    if (ring_last < reg_gpa || ring_gpa > reg_last) {
        return 0;
    }
    /* check that the whole ring is mapped */
    if (ring_last > reg_last) {
        return -ENOMEM;
    }
    /* check that ring's MemoryRegion wasn't replaced */
    hva_ring_offset = ring_gpa - reg_gpa;
    if (ring_hva != reg_hva + hva_ring_offset) {
        return -EBUSY;
    }

    return 0;
}

static int vhost_verify_ring_mappings(struct vhost_dev *dev,
                                      void *reg_hva,
                                      uint64_t reg_gpa,
                                      uint64_t reg_size)
{
    int i, j;
    int r = 0;
    const char *part_name[] = {
        "descriptor table",
        "available ring",
        "used ring"
    };

    if (vhost_dev_has_iommu(dev)) {
        return 0;
    }

    for (i = 0; i < dev->nvqs; ++i) {
        struct vhost_virtqueue *vq = dev->vqs + i;

        if (vq->desc_phys == 0) {
            continue;
        }

        j = 0;
        r = vhost_verify_ring_part_mapping(
                vq->desc, vq->desc_phys, vq->desc_size,
                reg_hva, reg_gpa, reg_size);
        if (r) {
            break;
        }

        j++;
        r = vhost_verify_ring_part_mapping(
                vq->avail, vq->avail_phys, vq->avail_size,
                reg_hva, reg_gpa, reg_size);
        if (r) {
            break;
        }

        j++;
        r = vhost_verify_ring_part_mapping(
                vq->used, vq->used_phys, vq->used_size,
                reg_hva, reg_gpa, reg_size);
        if (r) {
            break;
        }
    }

    if (r == -ENOMEM) {
        error_report("Unable to map %s for ring %d", part_name[j], i);
    } else if (r == -EBUSY) {
        error_report("%s relocated for ring %d", part_name[j], i);
    }
    return r;
}

/*
 * vhost_section: identify sections needed for vhost access
 *
 * We only care about RAM sections here (where virtqueue and guest
 * internals accessed by virtio might live).
 */
static bool vhost_section(struct vhost_dev *dev, MemoryRegionSection *section)
{
    MemoryRegion *mr = section->mr;

    if (memory_region_is_ram(mr) && !memory_region_is_rom(mr)) {
        uint8_t dirty_mask = memory_region_get_dirty_log_mask(mr);
        uint8_t handled_dirty;

        /*
         * Kernel based vhost doesn't handle any block which is doing
         * dirty-tracking other than migration for which it has
         * specific logging support. However for TCG the kernel never
         * gets involved anyway so we can also ignore its
         * self-modifying code detection flags. However a vhost-user
         * client could still confuse a TCG guest if it re-writes
         * executable memory that has already been translated.
         */
        handled_dirty = (1 << DIRTY_MEMORY_MIGRATION) |
            (1 << DIRTY_MEMORY_CODE);

        if (dirty_mask & ~handled_dirty) {
            trace_vhost_reject_section(mr->name, 1);
            return false;
        }

        /*
         * Some backends (like vhost-user) can only handle memory regions
         * that have an fd (can be mapped into a different process). Filter
         * the ones without an fd out, if requested.
         *
         * TODO: we might have to limit to MAP_SHARED as well.
         */
        if (memory_region_get_fd(section->mr) < 0 &&
            dev->vhost_ops->vhost_backend_no_private_memslots &&
            dev->vhost_ops->vhost_backend_no_private_memslots(dev)) {
            trace_vhost_reject_section(mr->name, 2);
            return false;
        }

        trace_vhost_section(mr->name);
        return true;
    } else {
        trace_vhost_reject_section(mr->name, 3);
        return false;
    }
}

static void vhost_begin(MemoryListener *listener)
{
    struct vhost_dev *dev = container_of(listener, struct vhost_dev,
                                         memory_listener);
    dev->tmp_sections = NULL;
    dev->n_tmp_sections = 0;
}
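
/*
 * Commit the section list built up by vhost_begin()/vhost_region_addnop():
 * detect whether anything changed, rebuild the vhost_memory region table,
 * resize the dirty log if needed and push the new table to the backend.
 */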
static void vhost_commit(MemoryListener *listener)
{
    struct vhost_dev *dev = container_of(listener, struct vhost_dev,
                                         memory_listener);
    MemoryRegionSection *old_sections;
    int n_old_sections;
    uint64_t log_size;
    size_t regions_size;
    int r;
    int i;
    bool changed = false;

    /* Note we can be called before the device is started, but then
     * starting the device calls set_mem_table, so we need to have
     * built the data structures.
     */
    old_sections = dev->mem_sections;
    n_old_sections = dev->n_mem_sections;
    dev->mem_sections = dev->tmp_sections;
    dev->n_mem_sections = dev->n_tmp_sections;

    if (dev->n_mem_sections != n_old_sections) {
        changed = true;
    } else {
        /* Same size, lets check the contents */
        for (i = 0; i < n_old_sections; i++) {
            if (!MemoryRegionSection_eq(&old_sections[i],
                                        &dev->mem_sections[i])) {
                changed = true;
                break;
            }
        }
    }

    trace_vhost_commit(dev->started, changed);
    if (!changed) {
        goto out;
    }

    /* Rebuild the regions list from the new sections list */
    regions_size = offsetof(struct vhost_memory, regions) +
                       dev->n_mem_sections * sizeof dev->mem->regions[0];
    dev->mem = g_realloc(dev->mem, regions_size);
    dev->mem->nregions = dev->n_mem_sections;

    if (dev->vhost_ops->vhost_backend_no_private_memslots &&
        dev->vhost_ops->vhost_backend_no_private_memslots(dev)) {
        used_shared_memslots = dev->mem->nregions;
    } else {
        used_memslots = dev->mem->nregions;
    }

    for (i = 0; i < dev->n_mem_sections; i++) {
        struct vhost_memory_region *cur_vmr = dev->mem->regions + i;
        struct MemoryRegionSection *mrs = dev->mem_sections + i;

        cur_vmr->guest_phys_addr = mrs->offset_within_address_space;
        cur_vmr->memory_size     = int128_get64(mrs->size);
        cur_vmr->userspace_addr  =
            (uintptr_t)memory_region_get_ram_ptr(mrs->mr) +
            mrs->offset_within_region;
        cur_vmr->flags_padding   = 0;
    }

    if (!dev->started) {
        goto out;
    }

    for (i = 0; i < dev->mem->nregions; i++) {
        if (vhost_verify_ring_mappings(dev,
                       (void *)(uintptr_t)dev->mem->regions[i].userspace_addr,
                       dev->mem->regions[i].guest_phys_addr,
                       dev->mem->regions[i].memory_size)) {
            error_report("Verify ring failure on region %d", i);
            abort();
        }
    }

    if (!dev->log_enabled) {
        r = dev->vhost_ops->vhost_set_mem_table(dev, dev->mem);
        if (r < 0) {
            VHOST_OPS_DEBUG(r, "vhost_set_mem_table failed");
        }
        goto out;
    }
    log_size = vhost_get_log_size(dev);
    /* We allocate an extra 4K bytes to log,
     * to reduce the number of reallocations. */
#define VHOST_LOG_BUFFER (0x1000 / sizeof *dev->log)
    /* To log more, must increase log size before table update. */
    if (dev->log_size < log_size) {
        vhost_dev_log_resize(dev, log_size + VHOST_LOG_BUFFER);
    }
    r = dev->vhost_ops->vhost_set_mem_table(dev, dev->mem);
    if (r < 0) {
        VHOST_OPS_DEBUG(r, "vhost_set_mem_table failed");
    }
    /* To log less, can only decrease log size after table update. */
    if (dev->log_size > log_size + VHOST_LOG_BUFFER) {
        vhost_dev_log_resize(dev, log_size);
    }

out:
    /* Deref the old list of sections, this must happen _after_ the
     * vhost_set_mem_table to ensure the client isn't still using the
     * section we're about to unref.
     */
    while (n_old_sections--) {
        memory_region_unref(old_sections[n_old_sections].mr);
    }
    g_free(old_sections);
}

/* Adds the section data to the tmp_section structure.
 * It relies on the listener calling us in memory address order
 * and for each region (via the _add and _nop methods) to
 * join neighbours.
 */
static void vhost_region_add_section(struct vhost_dev *dev,
                                     MemoryRegionSection *section)
{
    bool need_add = true;
    uint64_t mrs_size = int128_get64(section->size);
    uint64_t mrs_gpa = section->offset_within_address_space;
    uintptr_t mrs_host = (uintptr_t)memory_region_get_ram_ptr(section->mr) +
                         section->offset_within_region;
    RAMBlock *mrs_rb = section->mr->ram_block;

    trace_vhost_region_add_section(section->mr->name, mrs_gpa, mrs_size,
                                   mrs_host);

    if (dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_USER) {
        /* Round the section to its page size */
        /* First align the start down to a page boundary */
        size_t mrs_page = qemu_ram_pagesize(mrs_rb);
        uint64_t alignage = mrs_host & (mrs_page - 1);
        if (alignage) {
            mrs_host -= alignage;
            mrs_size += alignage;
            mrs_gpa  -= alignage;
        }
        /* Now align the size up to a page boundary */
        alignage = mrs_size & (mrs_page - 1);
        if (alignage) {
            mrs_size += mrs_page - alignage;
        }
        trace_vhost_region_add_section_aligned(section->mr->name, mrs_gpa,
                                               mrs_size, mrs_host);
    }

    if (dev->n_tmp_sections && !section->unmergeable) {
        /* Since we already have at least one section, lets see if
         * this extends it; since we're scanning in order, we only
         * have to look at the last one, and the FlatView that calls
         * us shouldn't have overlaps.
         */
        MemoryRegionSection *prev_sec = dev->tmp_sections +
                                        (dev->n_tmp_sections - 1);
        uint64_t prev_gpa_start = prev_sec->offset_within_address_space;
        uint64_t prev_size = int128_get64(prev_sec->size);
        uint64_t prev_gpa_end   = range_get_last(prev_gpa_start, prev_size);
        uint64_t prev_host_start =
            (uintptr_t)memory_region_get_ram_ptr(prev_sec->mr) +
            prev_sec->offset_within_region;
        uint64_t prev_host_end   = range_get_last(prev_host_start, prev_size);

        if (mrs_gpa <= (prev_gpa_end + 1)) {
            /* OK, looks like overlapping/intersecting - it's possible that
             * the rounding to page sizes has made them overlap, but they should
             * match up in the same RAMBlock if they do.
             */
            if (mrs_gpa < prev_gpa_start) {
                error_report("%s:Section '%s' rounded to %"PRIx64
                             " prior to previous '%s' %"PRIx64,
                             __func__, section->mr->name, mrs_gpa,
                             prev_sec->mr->name, prev_gpa_start);
                /* A way to cleanly fail here would be better */
                return;
            }
            /* Offset from the start of the previous GPA to this GPA */
            size_t offset = mrs_gpa - prev_gpa_start;

            if (prev_host_start + offset == mrs_host &&
                section->mr == prev_sec->mr && !prev_sec->unmergeable) {
                uint64_t max_end = MAX(prev_host_end, mrs_host + mrs_size);
                need_add = false;
                prev_sec->offset_within_address_space =
                    MIN(prev_gpa_start, mrs_gpa);
                prev_sec->offset_within_region =
                    MIN(prev_host_start, mrs_host) -
                    (uintptr_t)memory_region_get_ram_ptr(prev_sec->mr);
                prev_sec->size = int128_make64(max_end - MIN(prev_host_start,
                                                             mrs_host));
                trace_vhost_region_add_section_merge(section->mr->name,
                                        int128_get64(prev_sec->size),
                                        prev_sec->offset_within_address_space,
                                        prev_sec->offset_within_region);
            } else {
                /* adjoining regions are fine, but overlapping ones with
                 * different blocks/offsets shouldn't happen
                 */
                if (mrs_gpa != prev_gpa_end + 1) {
                    error_report("%s: Overlapping but not coherent sections "
                                 "at %"PRIx64,
                                 __func__, mrs_gpa);
                    return;
                }
            }
        }
    }

    if (need_add) {
        ++dev->n_tmp_sections;
        dev->tmp_sections = g_renew(MemoryRegionSection, dev->tmp_sections,
                                    dev->n_tmp_sections);
        dev->tmp_sections[dev->n_tmp_sections - 1] = *section;
        /* The flatview isn't stable and we don't use it, making it NULL
         * means we can memcmp the list.
         */
        dev->tmp_sections[dev->n_tmp_sections - 1].fv = NULL;
        memory_region_ref(section->mr);
    }
}

/* Used for both add and nop callbacks */
static void vhost_region_addnop(MemoryListener *listener,
                                MemoryRegionSection *section)
{
    struct vhost_dev *dev = container_of(listener, struct vhost_dev,
                                         memory_listener);

    if (!vhost_section(dev, section)) {
        return;
    }
    vhost_region_add_section(dev, section);
}
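
/* IOMMU notifier: forward guest (device) IOTLB invalidations to the backend. */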
static void vhost_iommu_unmap_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
{
    struct vhost_iommu *iommu = container_of(n, struct vhost_iommu, n);
    struct vhost_dev *hdev = iommu->hdev;
    hwaddr iova = iotlb->iova + iommu->iommu_offset;

    if (vhost_backend_invalidate_device_iotlb(hdev, iova,
                                              iotlb->addr_mask + 1)) {
        error_report("Failed to invalidate device iotlb");
    }
}

static void vhost_iommu_region_add(MemoryListener *listener,
                                   MemoryRegionSection *section)
{
    struct vhost_dev *dev = container_of(listener, struct vhost_dev,
                                         iommu_listener);
    struct vhost_iommu *iommu;
    Int128 end;
    int iommu_idx;
    IOMMUMemoryRegion *iommu_mr;

    if (!memory_region_is_iommu(section->mr)) {
        return;
    }

    iommu_mr = IOMMU_MEMORY_REGION(section->mr);

    iommu = g_malloc0(sizeof(*iommu));
    end = int128_add(int128_make64(section->offset_within_region),
                     section->size);
    end = int128_sub(end, int128_one());
    iommu_idx = memory_region_iommu_attrs_to_index(iommu_mr,
                                                   MEMTXATTRS_UNSPECIFIED);
    iommu_notifier_init(&iommu->n, vhost_iommu_unmap_notify,
                        dev->vdev->device_iotlb_enabled ?
                            IOMMU_NOTIFIER_DEVIOTLB_UNMAP :
                            IOMMU_NOTIFIER_UNMAP,
                        section->offset_within_region,
                        int128_get64(end),
                        iommu_idx);
    iommu->mr = section->mr;
    iommu->iommu_offset = section->offset_within_address_space -
                          section->offset_within_region;
    iommu->hdev = dev;
    memory_region_register_iommu_notifier(section->mr, &iommu->n,
                                          &error_fatal);
    QLIST_INSERT_HEAD(&dev->iommu_list, iommu, iommu_next);
    /* TODO: can replay help performance here? */
}

static void vhost_iommu_region_del(MemoryListener *listener,
                                   MemoryRegionSection *section)
{
    struct vhost_dev *dev = container_of(listener, struct vhost_dev,
                                         iommu_listener);
    struct vhost_iommu *iommu;

    if (!memory_region_is_iommu(section->mr)) {
        return;
    }

    QLIST_FOREACH(iommu, &dev->iommu_list, iommu_next) {
        if (iommu->mr == section->mr &&
            iommu->n.start == section->offset_within_region) {
            memory_region_unregister_iommu_notifier(iommu->mr,
                                                    &iommu->n);
            QLIST_REMOVE(iommu, iommu_next);
            g_free(iommu);
            break;
        }
    }
}

void vhost_toggle_device_iotlb(VirtIODevice *vdev)
{
    VirtioDeviceClass *vdc = VIRTIO_DEVICE_GET_CLASS(vdev);
    struct vhost_dev *dev;
    struct vhost_iommu *iommu;

    if (vdev->vhost_started) {
        dev = vdc->get_vhost(vdev);
    } else {
        return;
    }

    QLIST_FOREACH(iommu, &dev->iommu_list, iommu_next) {
        memory_region_unregister_iommu_notifier(iommu->mr, &iommu->n);
        iommu->n.notifier_flags = vdev->device_iotlb_enabled ?
                IOMMU_NOTIFIER_DEVIOTLB_UNMAP : IOMMU_NOTIFIER_UNMAP;
        memory_region_register_iommu_notifier(iommu->mr, &iommu->n,
                                              &error_fatal);
    }
}

static int vhost_virtqueue_set_addr(struct vhost_dev *dev,
                                    struct vhost_virtqueue *vq,
                                    unsigned idx, bool enable_log)
{
    struct vhost_vring_addr addr;
    int r;
    memset(&addr, 0, sizeof(struct vhost_vring_addr));

    if (dev->vhost_ops->vhost_vq_get_addr) {
        r = dev->vhost_ops->vhost_vq_get_addr(dev, &addr, vq);
        if (r < 0) {
            VHOST_OPS_DEBUG(r, "vhost_vq_get_addr failed");
            return r;
        }
    } else {
        addr.desc_user_addr = (uint64_t)(unsigned long)vq->desc;
        addr.avail_user_addr = (uint64_t)(unsigned long)vq->avail;
        addr.used_user_addr = (uint64_t)(unsigned long)vq->used;
    }
    addr.index = idx;
    addr.log_guest_addr = vq->used_phys;
    addr.flags = enable_log ? (1 << VHOST_VRING_F_LOG) : 0;
    r = dev->vhost_ops->vhost_set_vring_addr(dev, &addr);
    if (r < 0) {
        VHOST_OPS_DEBUG(r, "vhost_set_vring_addr failed");
    }
    return r;
}
*dev
,
989 uint64_t features
= dev
->acked_features
;
992 features
|= 0x1ULL
<< VHOST_F_LOG_ALL
;
994 if (!vhost_dev_has_iommu(dev
)) {
995 features
&= ~(0x1ULL
<< VIRTIO_F_IOMMU_PLATFORM
);
997 if (dev
->vhost_ops
->vhost_force_iommu
) {
998 if (dev
->vhost_ops
->vhost_force_iommu(dev
) == true) {
999 features
|= 0x1ULL
<< VIRTIO_F_IOMMU_PLATFORM
;
1002 r
= dev
->vhost_ops
->vhost_set_features(dev
, features
);
1004 VHOST_OPS_DEBUG(r
, "vhost_set_features failed");
1007 if (dev
->vhost_ops
->vhost_set_backend_cap
) {
1008 r
= dev
->vhost_ops
->vhost_set_backend_cap(dev
);
1010 VHOST_OPS_DEBUG(r
, "vhost_set_backend_cap failed");
1019 static int vhost_dev_set_log(struct vhost_dev
*dev
, bool enable_log
)
1024 r
= vhost_dev_set_features(dev
, enable_log
);
1028 for (i
= 0; i
< dev
->nvqs
; ++i
) {
1029 idx
= dev
->vhost_ops
->vhost_get_vq_index(dev
, dev
->vq_index
+ i
);
1030 addr
= virtio_queue_get_desc_addr(dev
->vdev
, idx
);
1033 * The queue might not be ready for start. If this
1034 * is the case there is no reason to continue the process.
1035 * The similar logic is used by the vhost_virtqueue_start()
1040 r
= vhost_virtqueue_set_addr(dev
, dev
->vqs
+ i
, idx
,
1048 * At log start we select our vhost_device logger that will scan the
1049 * memory sections and skip for the others. This is possible because
1050 * the log is shared amongst all vhost devices for a given type of
1053 vhost_dev_elect_mem_logger(dev
, enable_log
);
1057 for (; i
>= 0; --i
) {
1058 idx
= dev
->vhost_ops
->vhost_get_vq_index(dev
, dev
->vq_index
+ i
);
1059 addr
= virtio_queue_get_desc_addr(dev
->vdev
, idx
);
1063 vhost_virtqueue_set_addr(dev
, dev
->vqs
+ i
, idx
,
1066 vhost_dev_set_features(dev
, dev
->log_enabled
);
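
/*
 * Called from the log_global_start/stop listener callbacks when migration
 * toggles dirty logging for the whole address space.
 */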
static int vhost_migration_log(MemoryListener *listener, bool enable)
{
    struct vhost_dev *dev = container_of(listener, struct vhost_dev,
                                         memory_listener);
    int r;
    if (enable == dev->log_enabled) {
        return 0;
    }
    if (!dev->started) {
        dev->log_enabled = enable;
        return 0;
    }

    r = 0;
    if (!enable) {
        r = vhost_dev_set_log(dev, false);
        if (r < 0) {
            goto check_dev_state;
        }
        vhost_log_put(dev, false);
    } else {
        vhost_dev_log_resize(dev, vhost_get_log_size(dev));
        r = vhost_dev_set_log(dev, true);
        if (r < 0) {
            goto check_dev_state;
        }
    }

check_dev_state:
    dev->log_enabled = enable;
    /*
     * vhost-user-* devices could change their state during log
     * initialization due to disconnect. So check dev state after
     * vhost communication.
     */
    if (!dev->started) {
        /*
         * Since device is in the stopped state, it is okay for
         * migration. Return success.
         */
        r = 0;
    }
    if (r) {
        /* An error occurred. */
        dev->log_enabled = false;
    }

    return r;
}

static bool vhost_log_global_start(MemoryListener *listener, Error **errp)
{
    int r;

    r = vhost_migration_log(listener, true);
    if (r < 0) {
        abort();
    }
    return true;
}

static void vhost_log_global_stop(MemoryListener *listener)
{
    int r;

    r = vhost_migration_log(listener, false);
    if (r < 0) {
        abort();
    }
}

static void vhost_log_start(MemoryListener *listener,
                            MemoryRegionSection *section,
                            int old, int new)
{
    /* FIXME: implement */
}

static void vhost_log_stop(MemoryListener *listener,
                           MemoryRegionSection *section,
                           int old, int new)
{
    /* FIXME: implement */
}

/* The vhost driver natively knows how to handle the vrings of non
 * cross-endian legacy devices and modern devices. Only legacy devices
 * exposed to a bi-endian guest may require the vhost driver to use a
 * specific endianness.
 */
static inline bool vhost_needs_vring_endian(VirtIODevice *vdev)
{
    if (virtio_vdev_has_feature(vdev, VIRTIO_F_VERSION_1)) {
        return false;
    }
#if HOST_BIG_ENDIAN
    return vdev->device_endian == VIRTIO_DEVICE_ENDIAN_LITTLE;
#else
    return vdev->device_endian == VIRTIO_DEVICE_ENDIAN_BIG;
#endif
}

static int vhost_virtqueue_set_vring_endian_legacy(struct vhost_dev *dev,
                                                   bool is_big_endian,
                                                   int vhost_vq_index)
{
    int r;
    struct vhost_vring_state s = {
        .index = vhost_vq_index,
        .num = is_big_endian
    };

    r = dev->vhost_ops->vhost_set_vring_endian(dev, &s);
    if (r < 0) {
        VHOST_OPS_DEBUG(r, "vhost_set_vring_endian failed");
    }
    return r;
}

static int vhost_memory_region_lookup(struct vhost_dev *hdev,
                                      uint64_t gpa, uint64_t *uaddr,
                                      uint64_t *len)
{
    int i;

    for (i = 0; i < hdev->mem->nregions; i++) {
        struct vhost_memory_region *reg = hdev->mem->regions + i;

        if (gpa >= reg->guest_phys_addr &&
            reg->guest_phys_addr + reg->memory_size > gpa) {
            *uaddr = reg->userspace_addr + gpa - reg->guest_phys_addr;
            *len = reg->guest_phys_addr + reg->memory_size - gpa;
            return 0;
        }
    }

    return -EFAULT;
}

int vhost_device_iotlb_miss(struct vhost_dev *dev, uint64_t iova, int write)
{
    IOMMUTLBEntry iotlb;
    uint64_t uaddr, len;
    int ret = -EFAULT;

    RCU_READ_LOCK_GUARD();

    trace_vhost_iotlb_miss(dev, 1);

    iotlb = address_space_get_iotlb_entry(dev->vdev->dma_as,
                                          iova, write,
                                          MEMTXATTRS_UNSPECIFIED);
    if (iotlb.target_as != NULL) {
        ret = vhost_memory_region_lookup(dev, iotlb.translated_addr,
                                         &uaddr, &len);
        if (ret) {
            trace_vhost_iotlb_miss(dev, 3);
            error_report("Failed to look up the translated address "
                         "%"PRIx64, iotlb.translated_addr);
            goto out;
        }

        len = MIN(iotlb.addr_mask + 1, len);
        iova = iova & ~iotlb.addr_mask;

        ret = vhost_backend_update_device_iotlb(dev, iova, uaddr,
                                                len, iotlb.perm);
        if (ret) {
            trace_vhost_iotlb_miss(dev, 4);
            error_report("Failed to update device iotlb");
            goto out;
        }
    }

    trace_vhost_iotlb_miss(dev, 2);

out:
    return ret;
}
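
/*
 * Bring one virtqueue up in the backend: program ring size, base index and
 * endianness, map the descriptor/avail/used rings, set their addresses and
 * finally wire up the kick (host notifier) and call eventfds.
 */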
int vhost_virtqueue_start(struct vhost_dev *dev,
                          struct VirtIODevice *vdev,
                          struct vhost_virtqueue *vq,
                          unsigned idx)
{
    BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
    VirtioBusState *vbus = VIRTIO_BUS(qbus);
    VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(vbus);
    hwaddr s, l, a;
    int r;
    int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, idx);
    struct vhost_vring_file file = {
        .index = vhost_vq_index
    };
    struct vhost_vring_state state = {
        .index = vhost_vq_index
    };
    struct VirtQueue *vvq = virtio_get_queue(vdev, idx);

    a = virtio_queue_get_desc_addr(vdev, idx);
    if (a == 0) {
        /* Queue might not be ready for start */
        return 0;
    }

    vq->num = state.num = virtio_queue_get_num(vdev, idx);
    r = dev->vhost_ops->vhost_set_vring_num(dev, &state);
    if (r) {
        VHOST_OPS_DEBUG(r, "vhost_set_vring_num failed");
        return r;
    }

    state.num = virtio_queue_get_last_avail_idx(vdev, idx);
    r = dev->vhost_ops->vhost_set_vring_base(dev, &state);
    if (r) {
        VHOST_OPS_DEBUG(r, "vhost_set_vring_base failed");
        return r;
    }

    if (vhost_needs_vring_endian(vdev)) {
        r = vhost_virtqueue_set_vring_endian_legacy(dev,
                                                    virtio_is_big_endian(vdev),
                                                    vhost_vq_index);
        if (r) {
            return r;
        }
    }

    vq->desc_size = s = l = virtio_queue_get_desc_size(vdev, idx);
    vq->desc_phys = a;
    vq->desc = vhost_memory_map(dev, a, &l, false);
    if (!vq->desc || l != s) {
        r = -ENOMEM;
        goto fail_alloc_desc;
    }
    vq->avail_size = s = l = virtio_queue_get_avail_size(vdev, idx);
    vq->avail_phys = a = virtio_queue_get_avail_addr(vdev, idx);
    vq->avail = vhost_memory_map(dev, a, &l, false);
    if (!vq->avail || l != s) {
        r = -ENOMEM;
        goto fail_alloc_avail;
    }
    vq->used_size = s = l = virtio_queue_get_used_size(vdev, idx);
    vq->used_phys = a = virtio_queue_get_used_addr(vdev, idx);
    vq->used = vhost_memory_map(dev, a, &l, true);
    if (!vq->used || l != s) {
        r = -ENOMEM;
        goto fail_alloc_used;
    }

    r = vhost_virtqueue_set_addr(dev, vq, vhost_vq_index, dev->log_enabled);
    if (r < 0) {
        goto fail_alloc;
    }

    file.fd = event_notifier_get_fd(virtio_queue_get_host_notifier(vvq));
    r = dev->vhost_ops->vhost_set_vring_kick(dev, &file);
    if (r) {
        VHOST_OPS_DEBUG(r, "vhost_set_vring_kick failed");
        goto fail_kick;
    }

    /* Clear and discard previous events if any. */
    event_notifier_test_and_clear(&vq->masked_notifier);

    /* Init vring in unmasked state, unless guest_notifier_mask
     * will do it later.
     */
    if (!vdev->use_guest_notifier_mask) {
        /* TODO: check and handle errors. */
        vhost_virtqueue_mask(dev, vdev, idx, false);
    }

    if (k->query_guest_notifiers &&
        k->query_guest_notifiers(qbus->parent) &&
        virtio_queue_vector(vdev, idx) == VIRTIO_NO_VECTOR) {
        file.fd = -1;
        r = dev->vhost_ops->vhost_set_vring_call(dev, &file);
        if (r) {
            goto fail_vector;
        }
    }

    return 0;

fail_vector:
fail_kick:
fail_alloc:
    vhost_memory_unmap(dev, vq->used, virtio_queue_get_used_size(vdev, idx),
                       0, 0);
fail_alloc_used:
    vhost_memory_unmap(dev, vq->avail, virtio_queue_get_avail_size(vdev, idx),
                       0, 0);
fail_alloc_avail:
    vhost_memory_unmap(dev, vq->desc, virtio_queue_get_desc_size(vdev, idx),
                       0, 0);
fail_alloc_desc:
    return r;
}

*dev
,
1372 struct VirtIODevice
*vdev
,
1373 struct vhost_virtqueue
*vq
,
1376 int vhost_vq_index
= dev
->vhost_ops
->vhost_get_vq_index(dev
, idx
);
1377 struct vhost_vring_state state
= {
1378 .index
= vhost_vq_index
,
1382 if (virtio_queue_get_desc_addr(vdev
, idx
) == 0) {
1383 /* Don't stop the virtqueue which might have not been started */
1387 r
= dev
->vhost_ops
->vhost_get_vring_base(dev
, &state
);
1389 VHOST_OPS_DEBUG(r
, "vhost VQ %u ring restore failed: %d", idx
, r
);
1390 /* Connection to the backend is broken, so let's sync internal
1391 * last avail idx to the device used idx.
1393 virtio_queue_restore_last_avail_idx(vdev
, idx
);
1395 virtio_queue_set_last_avail_idx(vdev
, idx
, state
.num
);
1397 virtio_queue_invalidate_signalled_used(vdev
, idx
);
1398 virtio_queue_update_used_idx(vdev
, idx
);
1400 /* In the cross-endian case, we need to reset the vring endianness to
1401 * native as legacy devices expect so by default.
1403 if (vhost_needs_vring_endian(vdev
)) {
1404 vhost_virtqueue_set_vring_endian_legacy(dev
,
1405 !virtio_is_big_endian(vdev
),
1409 vhost_memory_unmap(dev
, vq
->used
, virtio_queue_get_used_size(vdev
, idx
),
1410 1, virtio_queue_get_used_size(vdev
, idx
));
1411 vhost_memory_unmap(dev
, vq
->avail
, virtio_queue_get_avail_size(vdev
, idx
),
1412 0, virtio_queue_get_avail_size(vdev
, idx
));
1413 vhost_memory_unmap(dev
, vq
->desc
, virtio_queue_get_desc_size(vdev
, idx
),
1414 0, virtio_queue_get_desc_size(vdev
, idx
));
static int vhost_virtqueue_set_busyloop_timeout(struct vhost_dev *dev,
                                                int n, uint32_t timeout)
{
    int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, n);
    struct vhost_vring_state state = {
        .index = vhost_vq_index,
        .num = timeout,
    };
    int r;

    if (!dev->vhost_ops->vhost_set_vring_busyloop_timeout) {
        return -EINVAL;
    }

    r = dev->vhost_ops->vhost_set_vring_busyloop_timeout(dev, &state);
    if (r) {
        VHOST_OPS_DEBUG(r, "vhost_set_vring_busyloop_timeout failed");
        return r;
    }

    return 0;
}

*n
)
1442 struct vhost_virtqueue
*vq
= container_of(n
, struct vhost_virtqueue
,
1444 struct vhost_dev
*dev
= vq
->dev
;
1445 int index
= vq
- dev
->vqs
;
1447 if (event_notifier_test_and_clear(n
) && dev
->vdev
) {
1448 VHOST_OPS_DEBUG(-EINVAL
, "vhost vring error in virtqueue %d",
1449 dev
->vq_index
+ index
);
static int vhost_virtqueue_init(struct vhost_dev *dev,
                                struct vhost_virtqueue *vq, int n)
{
    int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, n);
    struct vhost_vring_file file = {
        .index = vhost_vq_index,
    };
    int r = event_notifier_init(&vq->masked_notifier, 0);
    if (r < 0) {
        return r;
    }

    file.fd = event_notifier_get_wfd(&vq->masked_notifier);
    r = dev->vhost_ops->vhost_set_vring_call(dev, &file);
    if (r) {
        VHOST_OPS_DEBUG(r, "vhost_set_vring_call failed");
        goto fail_call;
    }

    vq->dev = dev;

    if (dev->vhost_ops->vhost_set_vring_err) {
        r = event_notifier_init(&vq->error_notifier, 0);
        if (r < 0) {
            goto fail_call;
        }

        file.fd = event_notifier_get_fd(&vq->error_notifier);
        r = dev->vhost_ops->vhost_set_vring_err(dev, &file);
        if (r) {
            VHOST_OPS_DEBUG(r, "vhost_set_vring_err failed");
            goto fail_err;
        }

        event_notifier_set_handler(&vq->error_notifier,
                                   vhost_virtqueue_error_notifier);
    }

    return 0;

fail_err:
    event_notifier_cleanup(&vq->error_notifier);
fail_call:
    event_notifier_cleanup(&vq->masked_notifier);
    return r;
}

static void vhost_virtqueue_cleanup(struct vhost_virtqueue *vq)
{
    event_notifier_cleanup(&vq->masked_notifier);
    if (vq->dev->vhost_ops->vhost_set_vring_err) {
        event_notifier_set_handler(&vq->error_notifier, NULL);
        event_notifier_cleanup(&vq->error_notifier);
    }
}
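
/*
 * Initialise a vhost device: pick the backend ops, take ownership, query
 * features and memslot limits, set up the virtqueues and register the
 * memory listener that keeps the backend's memory table up to date.
 */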
int vhost_dev_init(struct vhost_dev *hdev, void *opaque,
                   VhostBackendType backend_type, uint32_t busyloop_timeout,
                   Error **errp)
{
    unsigned int used, reserved, limit;
    uint64_t features;
    int i, r, n_initialized_vqs = 0;

    hdev->vdev = NULL;
    hdev->migration_blocker = NULL;

    r = vhost_set_backend_type(hdev, backend_type);
    assert(r >= 0);

    r = hdev->vhost_ops->vhost_backend_init(hdev, opaque, errp);
    if (r < 0) {
        goto fail;
    }

    r = hdev->vhost_ops->vhost_set_owner(hdev);
    if (r < 0) {
        error_setg_errno(errp, -r, "vhost_set_owner failed");
        goto fail;
    }

    r = hdev->vhost_ops->vhost_get_features(hdev, &features);
    if (r < 0) {
        error_setg_errno(errp, -r, "vhost_get_features failed");
        goto fail;
    }

    limit = hdev->vhost_ops->vhost_backend_memslots_limit(hdev);
    if (limit < MEMORY_DEVICES_SAFE_MAX_MEMSLOTS &&
        memory_devices_memslot_auto_decision_active()) {
        error_setg(errp, "some memory device (like virtio-mem)"
                   " decided how many memory slots to use based on the overall"
                   " number of memory slots; this vhost backend would further"
                   " restrict the overall number of memory slots");
        error_append_hint(errp, "Try plugging this vhost backend before"
                          " plugging such memory devices.\n");
        r = -EINVAL;
        goto fail;
    }

    for (i = 0; i < hdev->nvqs; ++i, ++n_initialized_vqs) {
        r = vhost_virtqueue_init(hdev, hdev->vqs + i, hdev->vq_index + i);
        if (r < 0) {
            error_setg_errno(errp, -r, "Failed to initialize virtqueue %d", i);
            goto fail;
        }
    }

    if (busyloop_timeout) {
        for (i = 0; i < hdev->nvqs; ++i) {
            r = vhost_virtqueue_set_busyloop_timeout(hdev, hdev->vq_index + i,
                                                     busyloop_timeout);
            if (r < 0) {
                error_setg_errno(errp, -r, "Failed to set busyloop timeout");
                goto fail_busyloop;
            }
        }
    }

    hdev->features = features;

    hdev->memory_listener = (MemoryListener) {
        .name = "vhost",
        .begin = vhost_begin,
        .commit = vhost_commit,
        .region_add = vhost_region_addnop,
        .region_nop = vhost_region_addnop,
        .log_start = vhost_log_start,
        .log_stop = vhost_log_stop,
        .log_sync = vhost_log_sync,
        .log_global_start = vhost_log_global_start,
        .log_global_stop = vhost_log_global_stop,
        .priority = MEMORY_LISTENER_PRIORITY_DEV_BACKEND
    };

    hdev->iommu_listener = (MemoryListener) {
        .name = "vhost-iommu",
        .region_add = vhost_iommu_region_add,
        .region_del = vhost_iommu_region_del,
    };

    if (hdev->migration_blocker == NULL) {
        if (!(hdev->features & (0x1ULL << VHOST_F_LOG_ALL))) {
            error_setg(&hdev->migration_blocker,
                       "Migration disabled: vhost lacks VHOST_F_LOG_ALL feature.");
        } else if (vhost_dev_log_is_shared(hdev) && !qemu_memfd_alloc_check()) {
            error_setg(&hdev->migration_blocker,
                       "Migration disabled: failed to allocate shared memory");
        }
    }

    if (hdev->migration_blocker != NULL) {
        r = migrate_add_blocker_normal(&hdev->migration_blocker, errp);
        if (r < 0) {
            goto fail_busyloop;
        }
    }

    hdev->mem = g_malloc0(offsetof(struct vhost_memory, regions));
    hdev->n_mem_sections = 0;
    hdev->mem_sections = NULL;
    hdev->log = NULL;
    hdev->log_size = 0;
    hdev->log_enabled = false;
    hdev->started = false;
    memory_listener_register(&hdev->memory_listener, &address_space_memory);
    QLIST_INSERT_HEAD(&vhost_devices, hdev, entry);

    /*
     * The listener we registered properly updated the corresponding counter.
     * So we can trust that these values are accurate.
     */
    if (hdev->vhost_ops->vhost_backend_no_private_memslots &&
        hdev->vhost_ops->vhost_backend_no_private_memslots(hdev)) {
        used = used_shared_memslots;
    } else {
        used = used_memslots;
    }
    /*
     * We assume that all reserved memslots actually require a real memslot
     * in our vhost backend. This might not be true, for example, if the
     * memslot would be ROM. If ever relevant, we can optimize for that --
     * but we'll need additional information about the reservations.
     */
    reserved = memory_devices_get_reserved_memslots();
    if (used + reserved > limit) {
        error_setg(errp, "vhost backend memory slots limit (%d) is less"
                   " than current number of used (%d) and reserved (%d)"
                   " memory slots for memory devices.", limit, used, reserved);
        r = -EINVAL;
        goto fail_busyloop;
    }
    return 0;

fail_busyloop:
    if (busyloop_timeout) {
        while (--i >= 0) {
            vhost_virtqueue_set_busyloop_timeout(hdev, hdev->vq_index + i, 0);
        }
    }
fail:
    hdev->nvqs = n_initialized_vqs;
    vhost_dev_cleanup(hdev);
    return r;
}

void vhost_dev_cleanup(struct vhost_dev *hdev)
{
    int i;

    trace_vhost_dev_cleanup(hdev);

    for (i = 0; i < hdev->nvqs; ++i) {
        vhost_virtqueue_cleanup(hdev->vqs + i);
    }
    if (hdev->mem) {
        /* those are only safe after successful init */
        memory_listener_unregister(&hdev->memory_listener);
        QLIST_REMOVE(hdev, entry);
    }
    migrate_del_blocker(&hdev->migration_blocker);
    g_free(hdev->mem);
    g_free(hdev->mem_sections);
    if (hdev->vhost_ops) {
        hdev->vhost_ops->vhost_backend_cleanup(hdev);
    }
    assert(!hdev->log);

    memset(hdev, 0, sizeof(struct vhost_dev));
}

void vhost_dev_disable_notifiers_nvqs(struct vhost_dev *hdev,
                                      VirtIODevice *vdev,
                                      unsigned int nvqs)
{
    BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
    int i, r;

    /*
     * Batch all the host notifiers in a single transaction to avoid
     * quadratic time complexity in address_space_update_ioeventfds().
     */
    memory_region_transaction_begin();

    for (i = 0; i < nvqs; ++i) {
        r = virtio_bus_set_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + i,
                                         false);
        if (r < 0) {
            error_report("vhost VQ %d notifier cleanup failed: %d", i, -r);
        }
        assert(r >= 0);
    }

    /*
     * The transaction expects the ioeventfds to be open when it
     * commits. Do it now, before the cleanup loop.
     */
    memory_region_transaction_commit();

    for (i = 0; i < nvqs; ++i) {
        virtio_bus_cleanup_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + i);
    }
    virtio_device_release_ioeventfd(vdev);
}

/* Stop processing guest IO notifications in qemu.
 * Start processing them in vhost in kernel.
 */
int vhost_dev_enable_notifiers(struct vhost_dev *hdev, VirtIODevice *vdev)
{
    BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
    int i, r;

    /* We will pass the notifiers to the kernel, make sure that QEMU
     * doesn't interfere.
     */
    r = virtio_device_grab_ioeventfd(vdev);
    if (r < 0) {
        error_report("binding does not support host notifiers");
        return r;
    }

    /*
     * Batch all the host notifiers in a single transaction to avoid
     * quadratic time complexity in address_space_update_ioeventfds().
     */
    memory_region_transaction_begin();

    for (i = 0; i < hdev->nvqs; ++i) {
        r = virtio_bus_set_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + i,
                                         true);
        if (r < 0) {
            error_report("vhost VQ %d notifier binding failed: %d", i, -r);
            memory_region_transaction_commit();
            vhost_dev_disable_notifiers_nvqs(hdev, vdev, i);
            return r;
        }
    }

    memory_region_transaction_commit();

    return 0;
}

/* Stop processing guest IO notifications in vhost.
 * Start processing them in qemu.
 * This might actually run the qemu handlers right away,
 * so virtio in qemu must be completely setup when this is called.
 */
void vhost_dev_disable_notifiers(struct vhost_dev *hdev, VirtIODevice *vdev)
{
    vhost_dev_disable_notifiers_nvqs(hdev, vdev, hdev->nvqs);
}

/* Test and clear event pending status.
 * Should be called after unmask to avoid losing events.
 */
bool vhost_virtqueue_pending(struct vhost_dev *hdev, int n)
{
    struct vhost_virtqueue *vq = hdev->vqs + n - hdev->vq_index;
    assert(n >= hdev->vq_index && n < hdev->vq_index + hdev->nvqs);
    return event_notifier_test_and_clear(&vq->masked_notifier);
}

/* Mask/unmask events from this vq. */
void vhost_virtqueue_mask(struct vhost_dev *hdev, VirtIODevice *vdev, int n,
                          bool mask)
{
    struct VirtQueue *vvq = virtio_get_queue(vdev, n);
    int r, index = n - hdev->vq_index;
    struct vhost_vring_file file;

    /* should only be called after backend is connected */
    assert(hdev->vhost_ops);

    if (mask) {
        assert(vdev->use_guest_notifier_mask);
        file.fd = event_notifier_get_wfd(&hdev->vqs[index].masked_notifier);
    } else {
        file.fd = event_notifier_get_wfd(virtio_queue_get_guest_notifier(vvq));
    }

    file.index = hdev->vhost_ops->vhost_get_vq_index(hdev, n);
    r = hdev->vhost_ops->vhost_set_vring_call(hdev, &file);
    if (r < 0) {
        error_report("vhost_set_vring_call failed %d", -r);
    }
}

bool vhost_config_pending(struct vhost_dev *hdev)
{
    assert(hdev->vhost_ops);
    if ((hdev->started == false) ||
        (hdev->vhost_ops->vhost_set_config_call == NULL)) {
        return false;
    }

    EventNotifier *notifier =
        &hdev->vqs[VHOST_QUEUE_NUM_CONFIG_INR].masked_config_notifier;
    return event_notifier_test_and_clear(notifier);
}

void vhost_config_mask(struct vhost_dev *hdev, VirtIODevice *vdev, bool mask)
{
    int fd;
    int r;
    EventNotifier *notifier =
        &hdev->vqs[VHOST_QUEUE_NUM_CONFIG_INR].masked_config_notifier;
    EventNotifier *config_notifier = &vdev->config_notifier;
    assert(hdev->vhost_ops);

    if ((hdev->started == false) ||
        (hdev->vhost_ops->vhost_set_config_call == NULL)) {
        return;
    }
    if (mask) {
        assert(vdev->use_guest_notifier_mask);
        fd = event_notifier_get_fd(notifier);
    } else {
        fd = event_notifier_get_fd(config_notifier);
    }
    r = hdev->vhost_ops->vhost_set_config_call(hdev, fd);
    if (r < 0) {
        error_report("vhost_set_config_call failed %d", -r);
    }
}

static void vhost_stop_config_intr(struct vhost_dev *dev)
{
    int fd = -1;
    assert(dev->vhost_ops);
    if (dev->vhost_ops->vhost_set_config_call) {
        dev->vhost_ops->vhost_set_config_call(dev, fd);
    }
}

static void vhost_start_config_intr(struct vhost_dev *dev)
{
    int r;

    assert(dev->vhost_ops);
    int fd = event_notifier_get_fd(&dev->vdev->config_notifier);
    if (dev->vhost_ops->vhost_set_config_call) {
        r = dev->vhost_ops->vhost_set_config_call(dev, fd);
        if (!r) {
            event_notifier_set(&dev->vdev->config_notifier);
        }
    }
}

uint64_t vhost_get_features(struct vhost_dev *hdev, const int *feature_bits,
                            uint64_t features)
{
    const int *bit = feature_bits;
    while (*bit != VHOST_INVALID_FEATURE_BIT) {
        uint64_t bit_mask = (1ULL << *bit);
        if (!(hdev->features & bit_mask)) {
            features &= ~bit_mask;
        }
        bit++;
    }
    return features;
}

void vhost_ack_features(struct vhost_dev *hdev, const int *feature_bits,
                        uint64_t features)
{
    const int *bit = feature_bits;
    while (*bit != VHOST_INVALID_FEATURE_BIT) {
        uint64_t bit_mask = (1ULL << *bit);
        if (features & bit_mask) {
            hdev->acked_features |= bit_mask;
        }
        bit++;
    }
}

int vhost_dev_get_config(struct vhost_dev *hdev, uint8_t *config,
                         uint32_t config_len, Error **errp)
{
    assert(hdev->vhost_ops);

    if (hdev->vhost_ops->vhost_get_config) {
        return hdev->vhost_ops->vhost_get_config(hdev, config, config_len,
                                                 errp);
    }

    error_setg(errp, "vhost_get_config not implemented");
    return -ENOSYS;
}

int vhost_dev_set_config(struct vhost_dev *hdev, const uint8_t *data,
                         uint32_t offset, uint32_t size, uint32_t flags)
{
    assert(hdev->vhost_ops);

    if (hdev->vhost_ops->vhost_set_config) {
        return hdev->vhost_ops->vhost_set_config(hdev, data, offset,
                                                 size, flags);
    }

    return -ENOSYS;
}

void vhost_dev_set_config_notifier(struct vhost_dev *hdev,
                                   const VhostDevConfigOps *ops)
{
    hdev->config_ops = ops;
}

void vhost_dev_free_inflight(struct vhost_inflight *inflight)
{
    if (inflight && inflight->addr) {
        qemu_memfd_free(inflight->addr, inflight->size, inflight->fd);
        inflight->addr = NULL;
        inflight->fd = -1;
    }
}

int vhost_dev_prepare_inflight(struct vhost_dev *hdev, VirtIODevice *vdev)
{
    int r;

    if (hdev->vhost_ops->vhost_get_inflight_fd == NULL ||
        hdev->vhost_ops->vhost_set_inflight_fd == NULL) {
        return 0;
    }

    hdev->vdev = vdev;

    r = vhost_dev_set_features(hdev, hdev->log_enabled);
    if (r < 0) {
        VHOST_OPS_DEBUG(r, "vhost_dev_prepare_inflight failed");
        return r;
    }

    return 0;
}

*dev
,
1954 struct vhost_inflight
*inflight
)
1958 if (dev
->vhost_ops
->vhost_set_inflight_fd
&& inflight
->addr
) {
1959 r
= dev
->vhost_ops
->vhost_set_inflight_fd(dev
, inflight
);
1961 VHOST_OPS_DEBUG(r
, "vhost_set_inflight_fd failed");
1969 int vhost_dev_get_inflight(struct vhost_dev
*dev
, uint16_t queue_size
,
1970 struct vhost_inflight
*inflight
)
1974 if (dev
->vhost_ops
->vhost_get_inflight_fd
) {
1975 r
= dev
->vhost_ops
->vhost_get_inflight_fd(dev
, queue_size
, inflight
);
1977 VHOST_OPS_DEBUG(r
, "vhost_get_inflight_fd failed");
static int vhost_dev_set_vring_enable(struct vhost_dev *hdev, int enable)
{
    if (!hdev->vhost_ops->vhost_set_vring_enable) {
        return 0;
    }

    /*
     * For vhost-user devices, if VHOST_USER_F_PROTOCOL_FEATURES has not
     * been negotiated, the rings start directly in the enabled state, and
     * .vhost_set_vring_enable callback will fail since
     * VHOST_USER_SET_VRING_ENABLE is not supported.
     */
    if (hdev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_USER &&
        !virtio_has_feature(hdev->backend_features,
                            VHOST_USER_F_PROTOCOL_FEATURES)) {
        return 0;
    }

    return hdev->vhost_ops->vhost_set_vring_enable(hdev, enable);
}

/*
 * Host notifiers must be enabled at this point.
 *
 * If @vrings is true, this function will enable all vrings before starting the
 * device. If it is false, the vring initialization is left to be done by the
 * caller.
 */
int vhost_dev_start(struct vhost_dev *hdev, VirtIODevice *vdev, bool vrings)
{
    int i, r;

    /* should only be called after backend is connected */
    assert(hdev->vhost_ops);

    trace_vhost_dev_start(hdev, vdev->name, vrings);

    vdev->vhost_started = true;
    hdev->started = true;
    hdev->vdev = vdev;

    r = vhost_dev_set_features(hdev, hdev->log_enabled);
    if (r < 0) {
        goto fail_features;
    }

    if (vhost_dev_has_iommu(hdev)) {
        memory_listener_register(&hdev->iommu_listener, vdev->dma_as);
    }

    r = hdev->vhost_ops->vhost_set_mem_table(hdev, hdev->mem);
    if (r < 0) {
        VHOST_OPS_DEBUG(r, "vhost_set_mem_table failed");
        goto fail_mem;
    }
    for (i = 0; i < hdev->nvqs; ++i) {
        r = vhost_virtqueue_start(hdev,
                                  vdev,
                                  hdev->vqs + i,
                                  hdev->vq_index + i);
        if (r < 0) {
            goto fail_vq;
        }
    }

    r = event_notifier_init(
        &hdev->vqs[VHOST_QUEUE_NUM_CONFIG_INR].masked_config_notifier, 0);
    if (r < 0) {
        VHOST_OPS_DEBUG(r, "event_notifier_init failed");
        goto fail_vq;
    }
    event_notifier_test_and_clear(
        &hdev->vqs[VHOST_QUEUE_NUM_CONFIG_INR].masked_config_notifier);
    if (!vdev->use_guest_notifier_mask) {
        vhost_config_mask(hdev, vdev, true);
    }
    if (hdev->log_enabled) {
        uint64_t log_base;

        hdev->log_size = vhost_get_log_size(hdev);
        hdev->log = vhost_log_get(hdev->vhost_ops->backend_type,
                                  hdev->log_size,
                                  vhost_dev_log_is_shared(hdev));
        log_base = (uintptr_t)hdev->log->log;
        r = hdev->vhost_ops->vhost_set_log_base(hdev,
                                                hdev->log_size ? log_base : 0,
                                                hdev->log);
        if (r < 0) {
            VHOST_OPS_DEBUG(r, "vhost_set_log_base failed");
            goto fail_log;
        }
        vhost_dev_elect_mem_logger(hdev, true);
    }
    if (vrings) {
        r = vhost_dev_set_vring_enable(hdev, true);
        if (r) {
            goto fail_log;
        }
    }
    if (hdev->vhost_ops->vhost_dev_start) {
        r = hdev->vhost_ops->vhost_dev_start(hdev, true);
        if (r) {
            goto fail_start;
        }
    }
    if (vhost_dev_has_iommu(hdev) &&
        hdev->vhost_ops->vhost_set_iotlb_callback) {
        hdev->vhost_ops->vhost_set_iotlb_callback(hdev, true);

        /* Update used ring information for IOTLB to work correctly,
         * vhost-kernel requires this. */
        for (i = 0; i < hdev->nvqs; ++i) {
            struct vhost_virtqueue *vq = hdev->vqs + i;
            vhost_device_iotlb_miss(hdev, vq->used_phys, true);
        }
    }
    vhost_start_config_intr(hdev);
    return 0;
fail_start:
    if (vrings) {
        vhost_dev_set_vring_enable(hdev, false);
    }
fail_log:
    vhost_log_put(hdev, false);
fail_vq:
    while (--i >= 0) {
        vhost_virtqueue_stop(hdev,
                             vdev,
                             hdev->vqs + i,
                             hdev->vq_index + i);
    }

fail_mem:
    if (vhost_dev_has_iommu(hdev)) {
        memory_listener_unregister(&hdev->iommu_listener);
    }
fail_features:
    vdev->vhost_started = false;
    hdev->started = false;
    return r;
}

2128 void vhost_dev_stop(struct vhost_dev
*hdev
, VirtIODevice
*vdev
, bool vrings
)
2132 /* should only be called after backend is connected */
2133 assert(hdev
->vhost_ops
);
2134 event_notifier_test_and_clear(
2135 &hdev
->vqs
[VHOST_QUEUE_NUM_CONFIG_INR
].masked_config_notifier
);
2136 event_notifier_test_and_clear(&vdev
->config_notifier
);
2137 event_notifier_cleanup(
2138 &hdev
->vqs
[VHOST_QUEUE_NUM_CONFIG_INR
].masked_config_notifier
);
2140 trace_vhost_dev_stop(hdev
, vdev
->name
, vrings
);
2142 if (hdev
->vhost_ops
->vhost_dev_start
) {
2143 hdev
->vhost_ops
->vhost_dev_start(hdev
, false);
2146 vhost_dev_set_vring_enable(hdev
, false);
2148 for (i
= 0; i
< hdev
->nvqs
; ++i
) {
2149 vhost_virtqueue_stop(hdev
,
2152 hdev
->vq_index
+ i
);
2154 if (hdev
->vhost_ops
->vhost_reset_status
) {
2155 hdev
->vhost_ops
->vhost_reset_status(hdev
);
2158 if (vhost_dev_has_iommu(hdev
)) {
2159 if (hdev
->vhost_ops
->vhost_set_iotlb_callback
) {
2160 hdev
->vhost_ops
->vhost_set_iotlb_callback(hdev
, false);
2162 memory_listener_unregister(&hdev
->iommu_listener
);
2164 vhost_stop_config_intr(hdev
);
2165 vhost_log_put(hdev
, true);
2166 hdev
->started
= false;
2167 vdev
->vhost_started
= false;
int vhost_net_set_backend(struct vhost_dev *hdev,
                          struct vhost_vring_file *file)
{
    if (hdev->vhost_ops->vhost_net_set_backend) {
        return hdev->vhost_ops->vhost_net_set_backend(hdev, file);
    }

    return -ENOSYS;
}

int vhost_reset_device(struct vhost_dev *hdev)
{
    if (hdev->vhost_ops->vhost_reset_device) {
        return hdev->vhost_ops->vhost_reset_device(hdev);
    }

    return -ENOSYS;
}

bool vhost_supports_device_state(struct vhost_dev *dev)
{
    if (dev->vhost_ops->vhost_supports_device_state) {
        return dev->vhost_ops->vhost_supports_device_state(dev);
    }

    return false;
}

int vhost_set_device_state_fd(struct vhost_dev *dev,
                              VhostDeviceStateDirection direction,
                              VhostDeviceStatePhase phase,
                              int fd,
                              int *reply_fd,
                              Error **errp)
{
    if (dev->vhost_ops->vhost_set_device_state_fd) {
        return dev->vhost_ops->vhost_set_device_state_fd(dev, direction, phase,
                                                         fd, reply_fd, errp);
    }

    error_setg(errp,
               "vhost transport does not support migration state transfer");
    return -ENOSYS;
}

int vhost_check_device_state(struct vhost_dev *dev, Error **errp)
{
    if (dev->vhost_ops->vhost_check_device_state) {
        return dev->vhost_ops->vhost_check_device_state(dev, errp);
    }

    error_setg(errp,
               "vhost transport does not support migration state transfer");
    return -ENOSYS;
}
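
/*
 * Stream the back-end's internal state into @f as a sequence of chunks,
 * each prefixed by a 32-bit big-endian length; a zero length marks the end
 * of the state.
 */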
int vhost_save_backend_state(struct vhost_dev *dev, QEMUFile *f, Error **errp)
{
    /* Maximum chunk size in which to transfer the state */
    const size_t chunk_size = 1 * 1024 * 1024;
    g_autofree void *transfer_buf = NULL;
    g_autoptr(GError) g_err = NULL;
    int pipe_fds[2], read_fd = -1, write_fd = -1, reply_fd = -1;
    int ret;

    /* [0] for reading (our end), [1] for writing (back-end's end) */
    if (!g_unix_open_pipe(pipe_fds, FD_CLOEXEC, &g_err)) {
        error_setg(errp, "Failed to set up state transfer pipe: %s",
                   g_err->message);
        ret = -EINVAL;
        goto fail;
    }

    read_fd = pipe_fds[0];
    write_fd = pipe_fds[1];

    /*
     * VHOST_TRANSFER_STATE_PHASE_STOPPED means the device must be stopped.
     * Ideally, it is suspended, but SUSPEND/RESUME currently do not exist for
     * vhost-user, so just check that it is stopped at all.
     */
    assert(!dev->started);

    /* Transfer ownership of write_fd to the back-end */
    ret = vhost_set_device_state_fd(dev,
                                    VHOST_TRANSFER_STATE_DIRECTION_SAVE,
                                    VHOST_TRANSFER_STATE_PHASE_STOPPED,
                                    write_fd,
                                    &reply_fd,
                                    errp);
    if (ret < 0) {
        error_prepend(errp, "Failed to initiate state transfer: ");
        goto fail;
    }

    /* If the back-end wishes to use a different pipe, switch over */
    if (reply_fd >= 0) {
        close(read_fd);
        read_fd = reply_fd;
    }

    transfer_buf = g_malloc(chunk_size);

    while (true) {
        ssize_t read_ret;

        read_ret = RETRY_ON_EINTR(read(read_fd, transfer_buf, chunk_size));
        if (read_ret < 0) {
            ret = -errno;
            error_setg_errno(errp, -ret, "Failed to receive state");
            goto fail;
        }

        assert(read_ret <= chunk_size);
        qemu_put_be32(f, read_ret);

        if (read_ret == 0) {
            /* EOF */
            break;
        }

        qemu_put_buffer(f, transfer_buf, read_ret);
    }

    /*
     * Back-end will not really care, but be clean and close our end of the
     * pipe before inquiring the back-end about whether transfer was successful
     */
    close(read_fd);
    read_fd = -1;

    /* Also, verify that the device is still stopped */
    assert(!dev->started);

    ret = vhost_check_device_state(dev, errp);
    if (ret < 0) {
        goto fail;
    }

    ret = 0;
fail:
    if (read_fd >= 0) {
        close(read_fd);
    }

    return ret;
}
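
/*
 * Counterpart of vhost_save_backend_state(): read the length-prefixed chunks
 * back from @f and feed them to the back-end over a pipe.
 */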
int vhost_load_backend_state(struct vhost_dev *dev, QEMUFile *f, Error **errp)
{
    size_t transfer_buf_size = 0;
    g_autofree void *transfer_buf = NULL;
    g_autoptr(GError) g_err = NULL;
    int pipe_fds[2], read_fd = -1, write_fd = -1, reply_fd = -1;
    int ret;

    /* [0] for reading (back-end's end), [1] for writing (our end) */
    if (!g_unix_open_pipe(pipe_fds, FD_CLOEXEC, &g_err)) {
        error_setg(errp, "Failed to set up state transfer pipe: %s",
                   g_err->message);
        ret = -EINVAL;
        goto fail;
    }

    read_fd = pipe_fds[0];
    write_fd = pipe_fds[1];

    /*
     * VHOST_TRANSFER_STATE_PHASE_STOPPED means the device must be stopped.
     * Ideally, it is suspended, but SUSPEND/RESUME currently do not exist for
     * vhost-user, so just check that it is stopped at all.
     */
    assert(!dev->started);

    /* Transfer ownership of read_fd to the back-end */
    ret = vhost_set_device_state_fd(dev,
                                    VHOST_TRANSFER_STATE_DIRECTION_LOAD,
                                    VHOST_TRANSFER_STATE_PHASE_STOPPED,
                                    read_fd,
                                    &reply_fd,
                                    errp);
    if (ret < 0) {
        error_prepend(errp, "Failed to initiate state transfer: ");
        goto fail;
    }

    /* If the back-end wishes to use a different pipe, switch over */
    if (reply_fd >= 0) {
        close(write_fd);
        write_fd = reply_fd;
    }

    while (true) {
        size_t this_chunk_size = qemu_get_be32(f);
        ssize_t write_ret;
        const uint8_t *transfer_pointer;

        if (this_chunk_size == 0) {
            /* End of state */
            break;
        }

        if (transfer_buf_size < this_chunk_size) {
            transfer_buf = g_realloc(transfer_buf, this_chunk_size);
            transfer_buf_size = this_chunk_size;
        }

        if (qemu_get_buffer(f, transfer_buf, this_chunk_size) <
            this_chunk_size)
        {
            error_setg(errp, "Failed to read state");
            ret = -EINVAL;
            goto fail;
        }

        transfer_pointer = transfer_buf;
        while (this_chunk_size > 0) {
            write_ret = RETRY_ON_EINTR(
                write(write_fd, transfer_pointer, this_chunk_size)
            );
            if (write_ret < 0) {
                ret = -errno;
                error_setg_errno(errp, -ret, "Failed to send state");
                goto fail;
            } else if (write_ret == 0) {
                error_setg(errp, "Failed to send state: Connection is closed");
                ret = -ECONNRESET;
                goto fail;
            }

            assert(write_ret <= this_chunk_size);
            this_chunk_size -= write_ret;
            transfer_pointer += write_ret;
        }
    }

    /*
     * Close our end, thus ending transfer, before inquiring the back-end about
     * whether transfer was successful
     */
    close(write_fd);
    write_fd = -1;

    /* Also, verify that the device is still stopped */
    assert(!dev->started);

    ret = vhost_check_device_state(dev, errp);
    if (ret < 0) {
        goto fail;
    }

    ret = 0;
fail:
    if (write_fd >= 0) {
        close(write_fd);
    }

    return ret;
}