/*
 * Copyright(c) 2017-2018 Intel Corporation.
 * Copyright(c) 2020 Red Hat, Inc.
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 */

#include "qemu/osdep.h"
#include <linux/vhost.h>
#include <linux/vfio.h>
#include <sys/eventfd.h>
#include <sys/ioctl.h>
#include "hw/virtio/vhost.h"
#include "hw/virtio/vhost-backend.h"
#include "hw/virtio/virtio-net.h"
#include "hw/virtio/vhost-shadow-virtqueue.h"
#include "hw/virtio/vhost-vdpa.h"
#include "exec/address-spaces.h"
#include "migration/blocker.h"
#include "qemu/cutils.h"
#include "qemu/main-loop.h"
#include "qapi/error.h"
#include "trace.h"

/*
 * Return one past the end of the section. Be careful with uint64_t
 * conversions!
 */
static Int128 vhost_vdpa_section_end(const MemoryRegionSection *section)
{
    Int128 llend = int128_make64(section->offset_within_address_space);
    llend = int128_add(llend, section->size);
    llend = int128_and(llend, int128_exts64(TARGET_PAGE_MASK));

    return llend;
}

static bool vhost_vdpa_listener_skipped_section(MemoryRegionSection *section,
                                                uint64_t iova_min,
                                                uint64_t iova_max)
{
    Int128 llend;

    if ((!memory_region_is_ram(section->mr) &&
         !memory_region_is_iommu(section->mr)) ||
        memory_region_is_protected(section->mr) ||
        /* vhost-vDPA doesn't allow MMIO to be mapped */
        memory_region_is_ram_device(section->mr)) {
        return true;
    }

    if (section->offset_within_address_space < iova_min) {
        error_report("RAM section out of device range (min=0x%" PRIx64
                     ", addr=0x%" HWADDR_PRIx ")",
                     iova_min, section->offset_within_address_space);
        return true;
    }

    llend = vhost_vdpa_section_end(section);
    if (int128_gt(llend, int128_make64(iova_max))) {
        error_report("RAM section out of device range (max=0x%" PRIx64
                     ", end addr=0x%" PRIx64 ")",
                     iova_max, int128_get64(llend));
        return true;
    }

    return false;
}

/*
 * The caller must set asid = 0 if the device does not support asid.
 * This is not an ABI break since it is set to 0 by the initializer anyway.
 */
int vhost_vdpa_dma_map(struct vhost_vdpa *v, uint32_t asid, hwaddr iova,
                       hwaddr size, void *vaddr, bool readonly)
{
    struct vhost_msg_v2 msg = {};
    int fd = v->device_fd;
    int ret = 0;

    msg.type = v->msg_type;
    msg.asid = asid;
    msg.iotlb.iova = iova;
    msg.iotlb.size = size;
    msg.iotlb.uaddr = (uint64_t)(uintptr_t)vaddr;
    msg.iotlb.perm = readonly ? VHOST_ACCESS_RO : VHOST_ACCESS_RW;
    msg.iotlb.type = VHOST_IOTLB_UPDATE;

    trace_vhost_vdpa_dma_map(v, fd, msg.type, msg.asid, msg.iotlb.iova,
                             msg.iotlb.size, msg.iotlb.uaddr, msg.iotlb.perm,
                             msg.iotlb.type);

    if (write(fd, &msg, sizeof(msg)) != sizeof(msg)) {
        error_report("failed to write, fd=%d, errno=%d (%s)",
                     fd, errno, strerror(errno));
        return -EIO;
    }

    return ret;
}

/*
 * The caller must set asid = 0 if the device does not support asid.
 * This is not an ABI break since it is set to 0 by the initializer anyway.
 */
int vhost_vdpa_dma_unmap(struct vhost_vdpa *v, uint32_t asid, hwaddr iova,
                         hwaddr size)
{
    struct vhost_msg_v2 msg = {};
    int fd = v->device_fd;
    int ret = 0;

    msg.type = v->msg_type;
    msg.asid = asid;
    msg.iotlb.iova = iova;
    msg.iotlb.size = size;
    msg.iotlb.type = VHOST_IOTLB_INVALIDATE;

    trace_vhost_vdpa_dma_unmap(v, fd, msg.type, msg.asid, msg.iotlb.iova,
                               msg.iotlb.size, msg.iotlb.type);

    if (write(fd, &msg, sizeof(msg)) != sizeof(msg)) {
        error_report("failed to write, fd=%d, errno=%d (%s)",
                     fd, errno, strerror(errno));
        return -EIO;
    }

    return ret;
}

static void vhost_vdpa_listener_begin_batch(struct vhost_vdpa *v)
{
    int fd = v->device_fd;
    struct vhost_msg_v2 msg = {
        .type = v->msg_type,
        .iotlb.type = VHOST_IOTLB_BATCH_BEGIN,
    };

    trace_vhost_vdpa_listener_begin_batch(v, fd, msg.type, msg.iotlb.type);
    if (write(fd, &msg, sizeof(msg)) != sizeof(msg)) {
        error_report("failed to write, fd=%d, errno=%d (%s)",
                     fd, errno, strerror(errno));
    }
}

static void vhost_vdpa_iotlb_batch_begin_once(struct vhost_vdpa *v)
{
    if (v->dev->backend_cap & (0x1ULL << VHOST_BACKEND_F_IOTLB_BATCH) &&
        !v->iotlb_batch_begin_sent) {
        vhost_vdpa_listener_begin_batch(v);
    }

    v->iotlb_batch_begin_sent = true;
}
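
/*
 * The listener's commit callback closes the batch opened by
 * vhost_vdpa_iotlb_batch_begin_once(): it sends VHOST_IOTLB_BATCH_END so the
 * kernel applies all map/unmap updates queued during the memory transaction,
 * then clears iotlb_batch_begin_sent for the next transaction.
 */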

static void vhost_vdpa_listener_commit(MemoryListener *listener)
{
    struct vhost_vdpa *v = container_of(listener, struct vhost_vdpa, listener);
    struct vhost_dev *dev = v->dev;
    struct vhost_msg_v2 msg = {};
    int fd = v->device_fd;

    if (!(dev->backend_cap & (0x1ULL << VHOST_BACKEND_F_IOTLB_BATCH))) {
        return;
    }

    if (!v->iotlb_batch_begin_sent) {
        return;
    }

    msg.type = v->msg_type;
    msg.iotlb.type = VHOST_IOTLB_BATCH_END;

    trace_vhost_vdpa_listener_commit(v, fd, msg.type, msg.iotlb.type);
    if (write(fd, &msg, sizeof(msg)) != sizeof(msg)) {
        error_report("failed to write, fd=%d, errno=%d (%s)",
                     fd, errno, strerror(errno));
    }

    v->iotlb_batch_begin_sent = false;
}
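
/*
 * region_add translates a RAM section into a host virtual address and installs
 * a device IOTLB mapping for it. When shadow virtqueues translate guest
 * addresses (v->shadow_data), the IOVA is allocated from the IOVA tree rather
 * than reusing the guest physical address.
 */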

static void vhost_vdpa_listener_region_add(MemoryListener *listener,
                                           MemoryRegionSection *section)
{
    DMAMap mem_region = {};
    struct vhost_vdpa *v = container_of(listener, struct vhost_vdpa, listener);
    hwaddr iova;
    Int128 llend, llsize;
    void *vaddr;
    int ret;

    if (vhost_vdpa_listener_skipped_section(section, v->iova_range.first,
                                            v->iova_range.last)) {
        return;
    }

    if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) !=
                 (section->offset_within_region & ~TARGET_PAGE_MASK))) {
        error_report("%s received unaligned region", __func__);
        return;
    }

    iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
    llend = vhost_vdpa_section_end(section);
    if (int128_ge(int128_make64(iova), llend)) {
        return;
    }

    memory_region_ref(section->mr);

    /* Here we assume that memory_region_is_ram(section->mr) == true */

    vaddr = memory_region_get_ram_ptr(section->mr) +
            section->offset_within_region +
            (iova - section->offset_within_address_space);

    trace_vhost_vdpa_listener_region_add(v, iova, int128_get64(llend),
                                         vaddr, section->readonly);

    llsize = int128_sub(llend, int128_make64(iova));
    if (v->shadow_data) {
        int r;

        mem_region.translated_addr = (hwaddr)(uintptr_t)vaddr;
        mem_region.size = int128_get64(llsize) - 1;
        mem_region.perm = IOMMU_ACCESS_FLAG(true, section->readonly);

        r = vhost_iova_tree_map_alloc(v->iova_tree, &mem_region);
        if (unlikely(r != IOVA_OK)) {
            error_report("Can't allocate a mapping (%d)", r);
            goto fail;
        }

        iova = mem_region.iova;
    }

    vhost_vdpa_iotlb_batch_begin_once(v);
    ret = vhost_vdpa_dma_map(v, VHOST_VDPA_GUEST_PA_ASID, iova,
                             int128_get64(llsize), vaddr, section->readonly);
    if (ret) {
        error_report("vhost vdpa map fail!");
        goto fail_map;
    }

    return;

fail_map:
    if (v->shadow_data) {
        vhost_iova_tree_remove(v->iova_tree, mem_region);
    }

fail:
    /*
     * On the initfn path, store the first error in the container so we
     * can gracefully fail. At runtime, there's not much we can do other
     * than throw a hardware error.
     */
    error_report("vhost-vdpa: DMA mapping failed, unable to continue");
    return;
}
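
/*
 * region_del mirrors region_add: when shadow virtqueues are in use, the IOVA
 * is looked up (and removed) in the IOVA tree first, then the corresponding
 * range is invalidated in the device IOTLB.
 */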

static void vhost_vdpa_listener_region_del(MemoryListener *listener,
                                           MemoryRegionSection *section)
{
    struct vhost_vdpa *v = container_of(listener, struct vhost_vdpa, listener);
    hwaddr iova;
    Int128 llend, llsize;
    int ret;

    if (vhost_vdpa_listener_skipped_section(section, v->iova_range.first,
                                            v->iova_range.last)) {
        return;
    }

    if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) !=
                 (section->offset_within_region & ~TARGET_PAGE_MASK))) {
        error_report("%s received unaligned region", __func__);
        return;
    }

    iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
    llend = vhost_vdpa_section_end(section);

    trace_vhost_vdpa_listener_region_del(v, iova, int128_get64(llend));

    if (int128_ge(int128_make64(iova), llend)) {
        return;
    }

    llsize = int128_sub(llend, int128_make64(iova));

    if (v->shadow_data) {
        const DMAMap *result;
        const void *vaddr = memory_region_get_ram_ptr(section->mr) +
            section->offset_within_region +
            (iova - section->offset_within_address_space);
        DMAMap mem_region = {
            .translated_addr = (hwaddr)(uintptr_t)vaddr,
            .size = int128_get64(llsize) - 1,
        };

        result = vhost_iova_tree_find_iova(v->iova_tree, &mem_region);
        if (!result) {
            /* The memory listener map wasn't mapped */
            return;
        }
        iova = result->iova;
        vhost_iova_tree_remove(v->iova_tree, *result);
    }
    vhost_vdpa_iotlb_batch_begin_once(v);
    ret = vhost_vdpa_dma_unmap(v, VHOST_VDPA_GUEST_PA_ASID, iova,
                               int128_get64(llsize));
    if (ret) {
        error_report("vhost_vdpa dma unmap error!");
    }

    memory_region_unref(section->mr);
}

/*
 * The IOTLB API is used by vhost-vdpa, which requires incremental updating
 * of the mapping, so we cannot use the generic vhost memory listener, which
 * depends on addnop().
 */
static const MemoryListener vhost_vdpa_memory_listener = {
    .name = "vhost-vdpa",
    .commit = vhost_vdpa_listener_commit,
    .region_add = vhost_vdpa_listener_region_add,
    .region_del = vhost_vdpa_listener_region_del,
};

static int vhost_vdpa_call(struct vhost_dev *dev, unsigned long int request,
                           void *arg)
{
    struct vhost_vdpa *v = dev->opaque;
    int fd = v->device_fd;
    int ret;

    assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA);

    ret = ioctl(fd, request, arg);
    return ret < 0 ? -errno : ret;
}
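
/*
 * Read-modify-write of the device status byte: read the current status, OR in
 * the requested bits, write it back, then re-read to check that the device
 * actually accepted them.
 */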

static int vhost_vdpa_add_status(struct vhost_dev *dev, uint8_t status)
{
    uint8_t s;
    int ret;

    trace_vhost_vdpa_add_status(dev, status);
    ret = vhost_vdpa_call(dev, VHOST_VDPA_GET_STATUS, &s);
    if (ret < 0) {
        return ret;
    }

    s |= status;

    ret = vhost_vdpa_call(dev, VHOST_VDPA_SET_STATUS, &s);
    if (ret < 0) {
        return ret;
    }

    ret = vhost_vdpa_call(dev, VHOST_VDPA_GET_STATUS, &s);
    if (ret < 0) {
        return ret;
    }

    if (!(s & status)) {
        return -EIO;
    }

    return 0;
}

int vhost_vdpa_get_iova_range(int fd, struct vhost_vdpa_iova_range *iova_range)
{
    int ret = ioctl(fd, VHOST_VDPA_GET_IOVA_RANGE, iova_range);

    return ret < 0 ? -errno : 0;
}

/*
 * This function is for requests that only need to be applied once. Typically
 * such a request occurs at the beginning of operation, before the queues are
 * set up. It should not be used for requests that must wait until all queues
 * are set, which would need to check dev->vq_index_end instead.
 */
static bool vhost_vdpa_first_dev(struct vhost_dev *dev)
{
    struct vhost_vdpa *v = dev->opaque;

    return v->index == 0;
}

static int vhost_vdpa_get_dev_features(struct vhost_dev *dev,
                                       uint64_t *features)
{
    int ret;

    ret = vhost_vdpa_call(dev, VHOST_GET_FEATURES, features);
    trace_vhost_vdpa_get_features(dev, *features);

    return ret;
}

static void vhost_vdpa_init_svq(struct vhost_dev *hdev, struct vhost_vdpa *v)
{
    g_autoptr(GPtrArray) shadow_vqs = NULL;

    shadow_vqs = g_ptr_array_new_full(hdev->nvqs, vhost_svq_free);
    for (unsigned n = 0; n < hdev->nvqs; ++n) {
        VhostShadowVirtqueue *svq;

        svq = vhost_svq_new(v->shadow_vq_ops, v->shadow_vq_ops_opaque);
        g_ptr_array_add(shadow_vqs, svq);
    }

    v->shadow_vqs = g_steal_pointer(&shadow_vqs);
}

static int vhost_vdpa_init(struct vhost_dev *dev, void *opaque, Error **errp)
{
    struct vhost_vdpa *v;
    int ret;

    assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA);
    trace_vhost_vdpa_init(dev, opaque);

    v = opaque;
    v->dev = dev;
    dev->opaque = opaque;
    v->listener = vhost_vdpa_memory_listener;
    v->msg_type = VHOST_IOTLB_MSG_V2;
    vhost_vdpa_init_svq(dev, v);

    error_propagate(&dev->migration_blocker, v->migration_blocker);
    if (!vhost_vdpa_first_dev(dev)) {
        return 0;
    }

    /*
     * If dev->shadow_vqs_enabled is set at initialization, the device has
     * been started with x-svq=on, so don't block migration.
     */
    if (dev->migration_blocker == NULL && !v->shadow_vqs_enabled) {
        /* We don't have dev->features yet */
        uint64_t features;

        ret = vhost_vdpa_get_dev_features(dev, &features);
        if (unlikely(ret)) {
            error_setg_errno(errp, -ret, "Could not get device features");
            return ret;
        }
        vhost_svq_valid_features(features, &dev->migration_blocker);
    }

    /*
     * Similar to VFIO, we end up pinning all guest memory and have to
     * disable discarding of RAM.
     */
    ret = ram_block_discard_disable(true);
    if (ret) {
        error_report("Cannot disable discarding of RAM");
        return ret;
    }

    vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE |
                               VIRTIO_CONFIG_S_DRIVER);

    return 0;
}
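
/*
 * Host notifiers: each virtqueue doorbell page is mmap()ed from the vhost-vdpa
 * device fd and exposed to the guest as a ram-device memory region, so guest
 * kicks reach the device without a trip through QEMU.
 */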

static void vhost_vdpa_host_notifier_uninit(struct vhost_dev *dev,
                                            int queue_index)
{
    size_t page_size = qemu_real_host_page_size();
    struct vhost_vdpa *v = dev->opaque;
    VirtIODevice *vdev = dev->vdev;
    VhostVDPAHostNotifier *n;

    n = &v->notifier[queue_index];

    if (n->addr) {
        virtio_queue_set_host_notifier_mr(vdev, queue_index, &n->mr, false);
        object_unparent(OBJECT(&n->mr));
        munmap(n->addr, page_size);
        n->addr = NULL;
    }
}

static int vhost_vdpa_host_notifier_init(struct vhost_dev *dev, int queue_index)
{
    size_t page_size = qemu_real_host_page_size();
    struct vhost_vdpa *v = dev->opaque;
    VirtIODevice *vdev = dev->vdev;
    VhostVDPAHostNotifier *n;
    int fd = v->device_fd;
    void *addr;
    char *name;

    vhost_vdpa_host_notifier_uninit(dev, queue_index);

    n = &v->notifier[queue_index];

    addr = mmap(NULL, page_size, PROT_WRITE, MAP_SHARED, fd,
                queue_index * page_size);
    if (addr == MAP_FAILED) {
        goto err;
    }

    name = g_strdup_printf("vhost-vdpa/host-notifier@%p mmaps[%d]",
                           v, queue_index);
    memory_region_init_ram_device_ptr(&n->mr, OBJECT(vdev), name,
                                      page_size, addr);
    g_free(name);

    if (virtio_queue_set_host_notifier_mr(vdev, queue_index, &n->mr, true)) {
        object_unparent(OBJECT(&n->mr));
        munmap(addr, page_size);
        goto err;
    }
    n->addr = addr;

    return 0;

err:
    return -1;
}

static void vhost_vdpa_host_notifiers_uninit(struct vhost_dev *dev, int n)
{
    int i;

    /*
     * Pack all the changes to the memory regions in a single transaction to
     * avoid multiple updates of the address space topology.
     */
    memory_region_transaction_begin();

    for (i = dev->vq_index; i < dev->vq_index + n; i++) {
        vhost_vdpa_host_notifier_uninit(dev, i);
    }

    memory_region_transaction_commit();
}

static void vhost_vdpa_host_notifiers_init(struct vhost_dev *dev)
{
    struct vhost_vdpa *v = dev->opaque;
    int i;

    if (v->shadow_vqs_enabled) {
        /* FIXME SVQ is not compatible with host notifiers mr */
        return;
    }

    /*
     * Pack all the changes to the memory regions in a single transaction to
     * avoid multiple updates of the address space topology.
     */
    memory_region_transaction_begin();

    for (i = dev->vq_index; i < dev->vq_index + dev->nvqs; i++) {
        if (vhost_vdpa_host_notifier_init(dev, i)) {
            vhost_vdpa_host_notifiers_uninit(dev, i - dev->vq_index);
            break;
        }
    }

    memory_region_transaction_commit();
}

static void vhost_vdpa_svq_cleanup(struct vhost_dev *dev)
{
    struct vhost_vdpa *v = dev->opaque;
    size_t idx;

    for (idx = 0; idx < v->shadow_vqs->len; ++idx) {
        vhost_svq_stop(g_ptr_array_index(v->shadow_vqs, idx));
    }
    g_ptr_array_free(v->shadow_vqs, true);
}

static int vhost_vdpa_cleanup(struct vhost_dev *dev)
{
    struct vhost_vdpa *v;

    assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA);
    v = dev->opaque;
    trace_vhost_vdpa_cleanup(dev, v);
    if (vhost_vdpa_first_dev(dev)) {
        ram_block_discard_disable(false);
    }

    vhost_vdpa_host_notifiers_uninit(dev, dev->nvqs);
    memory_listener_unregister(&v->listener);
    vhost_vdpa_svq_cleanup(dev);

    dev->opaque = NULL;

    return 0;
}

static int vhost_vdpa_memslots_limit(struct vhost_dev *dev)
{
    trace_vhost_vdpa_memslots_limit(dev, INT_MAX);
    return INT_MAX;
}

static int vhost_vdpa_set_mem_table(struct vhost_dev *dev,
                                    struct vhost_memory *mem)
{
    if (!vhost_vdpa_first_dev(dev)) {
        return 0;
    }

    trace_vhost_vdpa_set_mem_table(dev, mem->nregions, mem->padding);
    if (trace_event_get_state_backends(TRACE_VHOST_VDPA_SET_MEM_TABLE) &&
        trace_event_get_state_backends(TRACE_VHOST_VDPA_DUMP_REGIONS)) {
        int i;

        for (i = 0; i < mem->nregions; i++) {
            trace_vhost_vdpa_dump_regions(dev, i,
                                          mem->regions[i].guest_phys_addr,
                                          mem->regions[i].memory_size,
                                          mem->regions[i].userspace_addr,
                                          mem->regions[i].flags_padding);
        }
    }

    return 0;
}

static int vhost_vdpa_set_features(struct vhost_dev *dev,
                                   uint64_t features)
{
    struct vhost_vdpa *v = dev->opaque;
    int ret;

    if (!vhost_vdpa_first_dev(dev)) {
        return 0;
    }

    if (v->shadow_vqs_enabled) {
        if ((v->acked_features ^ features) == BIT_ULL(VHOST_F_LOG_ALL)) {
            /*
             * QEMU is just trying to enable or disable logging. SVQ handles
             * this separately, so no need to forward this.
             */
            v->acked_features = features;
            return 0;
        }

        v->acked_features = features;

        /* We must not ack _F_LOG if SVQ is enabled */
        features &= ~BIT_ULL(VHOST_F_LOG_ALL);
    }

    trace_vhost_vdpa_set_features(dev, features);
    ret = vhost_vdpa_call(dev, VHOST_SET_FEATURES, &features);
    if (ret) {
        return ret;
    }

    return vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_FEATURES_OK);
}
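
/*
 * Backend capabilities are vhost-vdpa features (IOTLB message format,
 * batching, ASID, suspend), not virtio features; only those advertised by the
 * kernel and known to QEMU are acked.
 */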

static int vhost_vdpa_set_backend_cap(struct vhost_dev *dev)
{
    uint64_t features;
    uint64_t f = 0x1ULL << VHOST_BACKEND_F_IOTLB_MSG_V2 |
        0x1ULL << VHOST_BACKEND_F_IOTLB_BATCH |
        0x1ULL << VHOST_BACKEND_F_IOTLB_ASID |
        0x1ULL << VHOST_BACKEND_F_SUSPEND;
    int r;

    if (vhost_vdpa_call(dev, VHOST_GET_BACKEND_FEATURES, &features)) {
        return -EFAULT;
    }

    features &= f;

    if (vhost_vdpa_first_dev(dev)) {
        r = vhost_vdpa_call(dev, VHOST_SET_BACKEND_FEATURES, &features);
        if (r) {
            return -EFAULT;
        }
    }

    dev->backend_cap = features;

    return 0;
}

static int vhost_vdpa_get_device_id(struct vhost_dev *dev,
                                    uint32_t *device_id)
{
    int ret;

    ret = vhost_vdpa_call(dev, VHOST_VDPA_GET_DEVICE_ID, device_id);
    trace_vhost_vdpa_get_device_id(dev, *device_id);
    return ret;
}

static int vhost_vdpa_reset_device(struct vhost_dev *dev)
{
    struct vhost_vdpa *v = dev->opaque;
    uint8_t status = 0;
    int ret;

    ret = vhost_vdpa_call(dev, VHOST_VDPA_SET_STATUS, &status);
    trace_vhost_vdpa_reset_device(dev, status);
    v->suspended = false;
    return ret;
}

static int vhost_vdpa_get_vq_index(struct vhost_dev *dev, int idx)
{
    assert(idx >= dev->vq_index && idx < dev->vq_index + dev->nvqs);

    trace_vhost_vdpa_get_vq_index(dev, idx, idx);
    return idx;
}

static int vhost_vdpa_set_vring_ready(struct vhost_dev *dev)
{
    int i;

    trace_vhost_vdpa_set_vring_ready(dev);
    for (i = 0; i < dev->nvqs; ++i) {
        struct vhost_vring_state state = {
            .index = dev->vq_index + i,
            .num = 1,
        };
        vhost_vdpa_call(dev, VHOST_VDPA_SET_VRING_ENABLE, &state);
    }
    return 0;
}

static int vhost_vdpa_set_config_call(struct vhost_dev *dev,
                                      int fd)
{
    trace_vhost_vdpa_set_config_call(dev, fd);
    return vhost_vdpa_call(dev, VHOST_VDPA_SET_CONFIG_CALL, &fd);
}

static void vhost_vdpa_dump_config(struct vhost_dev *dev, const uint8_t *config,
                                   uint32_t config_len)
{
    int b, len;
    char line[QEMU_HEXDUMP_LINE_LEN];

    for (b = 0; b < config_len; b += 16) {
        len = config_len - b;
        qemu_hexdump_line(line, b, config, len, false);
        trace_vhost_vdpa_dump_config(dev, line);
    }
}

static int vhost_vdpa_set_config(struct vhost_dev *dev, const uint8_t *data,
                                 uint32_t offset, uint32_t size,
                                 uint32_t flags)
{
    struct vhost_vdpa_config *config;
    int ret;
    unsigned long config_size = offsetof(struct vhost_vdpa_config, buf);

    trace_vhost_vdpa_set_config(dev, offset, size, flags);
    config = g_malloc(size + config_size);
    config->off = offset;
    config->len = size;
    memcpy(config->buf, data, size);
    if (trace_event_get_state_backends(TRACE_VHOST_VDPA_SET_CONFIG) &&
        trace_event_get_state_backends(TRACE_VHOST_VDPA_DUMP_CONFIG)) {
        vhost_vdpa_dump_config(dev, data, size);
    }
    ret = vhost_vdpa_call(dev, VHOST_VDPA_SET_CONFIG, config);
    g_free(config);
    return ret;
}

static int vhost_vdpa_get_config(struct vhost_dev *dev, uint8_t *config,
                                 uint32_t config_len, Error **errp)
{
    struct vhost_vdpa_config *v_config;
    unsigned long config_size = offsetof(struct vhost_vdpa_config, buf);
    int ret;

    trace_vhost_vdpa_get_config(dev, config, config_len);
    v_config = g_malloc(config_len + config_size);
    v_config->len = config_len;
    v_config->off = 0;
    ret = vhost_vdpa_call(dev, VHOST_VDPA_GET_CONFIG, v_config);
    memcpy(config, v_config->buf, config_len);
    g_free(v_config);
    if (trace_event_get_state_backends(TRACE_VHOST_VDPA_GET_CONFIG) &&
        trace_event_get_state_backends(TRACE_VHOST_VDPA_DUMP_CONFIG)) {
        vhost_vdpa_dump_config(dev, config, config_len);
    }
    return ret;
}

static int vhost_vdpa_set_dev_vring_base(struct vhost_dev *dev,
                                         struct vhost_vring_state *ring)
{
    trace_vhost_vdpa_set_vring_base(dev, ring->index, ring->num);
    return vhost_vdpa_call(dev, VHOST_SET_VRING_BASE, ring);
}

static int vhost_vdpa_set_vring_dev_kick(struct vhost_dev *dev,
                                         struct vhost_vring_file *file)
{
    trace_vhost_vdpa_set_vring_kick(dev, file->index, file->fd);
    return vhost_vdpa_call(dev, VHOST_SET_VRING_KICK, file);
}

static int vhost_vdpa_set_vring_dev_call(struct vhost_dev *dev,
                                         struct vhost_vring_file *file)
{
    trace_vhost_vdpa_set_vring_call(dev, file->index, file->fd);
    return vhost_vdpa_call(dev, VHOST_SET_VRING_CALL, file);
}

static int vhost_vdpa_set_vring_dev_addr(struct vhost_dev *dev,
                                         struct vhost_vring_addr *addr)
{
    trace_vhost_vdpa_set_vring_addr(dev, addr->index, addr->flags,
                                    addr->desc_user_addr, addr->used_user_addr,
                                    addr->avail_user_addr,
                                    addr->log_guest_addr);

    return vhost_vdpa_call(dev, VHOST_SET_VRING_ADDR, addr);
}

/**
 * Set the shadow virtqueue descriptors to the device
 *
 * @dev: The vhost device model
 * @svq: The shadow virtqueue
 * @idx: The index of the virtqueue in the vhost device
 * @errp: Error pointer
 *
 * Note that this function does not rewind the kick file descriptor if it
 * cannot set the call one.
 */
static int vhost_vdpa_svq_set_fds(struct vhost_dev *dev,
                                  VhostShadowVirtqueue *svq, unsigned idx,
                                  Error **errp)
{
    struct vhost_vring_file file = {
        .index = dev->vq_index + idx,
    };
    const EventNotifier *event_notifier = &svq->hdev_kick;
    int r;

    r = event_notifier_init(&svq->hdev_kick, 0);
    if (r != 0) {
        error_setg_errno(errp, -r, "Couldn't create kick event notifier");
        goto err_init_hdev_kick;
    }

    r = event_notifier_init(&svq->hdev_call, 0);
    if (r != 0) {
        error_setg_errno(errp, -r, "Couldn't create call event notifier");
        goto err_init_hdev_call;
    }

    file.fd = event_notifier_get_fd(event_notifier);
    r = vhost_vdpa_set_vring_dev_kick(dev, &file);
    if (unlikely(r != 0)) {
        error_setg_errno(errp, -r, "Can't set device kick fd");
        goto err_init_set_dev_fd;
    }

    event_notifier = &svq->hdev_call;
    file.fd = event_notifier_get_fd(event_notifier);
    r = vhost_vdpa_set_vring_dev_call(dev, &file);
    if (unlikely(r != 0)) {
        error_setg_errno(errp, -r, "Can't set device call fd");
        goto err_init_set_dev_fd;
    }

    return 0;

err_init_set_dev_fd:
    event_notifier_set_handler(&svq->hdev_call, NULL);

err_init_hdev_call:
    event_notifier_cleanup(&svq->hdev_kick);

err_init_hdev_kick:
    return r;
}

/**
 * Unmap a SVQ area in the device
 */
static void vhost_vdpa_svq_unmap_ring(struct vhost_vdpa *v, hwaddr addr)
{
    const DMAMap needle = {
        .translated_addr = addr,
    };
    const DMAMap *result = vhost_iova_tree_find_iova(v->iova_tree, &needle);
    hwaddr size;
    int r;

    if (unlikely(!result)) {
        error_report("Unable to find SVQ address to unmap");
        return;
    }

    size = ROUND_UP(result->size, qemu_real_host_page_size());
    r = vhost_vdpa_dma_unmap(v, v->address_space_id, result->iova, size);
    if (unlikely(r < 0)) {
        error_report("Unable to unmap SVQ vring: %s (%d)", g_strerror(-r), -r);
        return;
    }

    vhost_iova_tree_remove(v->iova_tree, *result);
}

static void vhost_vdpa_svq_unmap_rings(struct vhost_dev *dev,
                                       const VhostShadowVirtqueue *svq)
{
    struct vhost_vdpa *v = dev->opaque;
    struct vhost_vring_addr svq_addr;

    vhost_svq_get_vring_addr(svq, &svq_addr);

    vhost_vdpa_svq_unmap_ring(v, svq_addr.desc_user_addr);

    vhost_vdpa_svq_unmap_ring(v, svq_addr.used_user_addr);
}

/**
 * Map the SVQ area in the device
 *
 * @v: Vhost-vdpa device
 * @needle: The area for which to allocate an iova and map
 * @errp: Error pointer
 */
static bool vhost_vdpa_svq_map_ring(struct vhost_vdpa *v, DMAMap *needle,
                                    Error **errp)
{
    int r;

    r = vhost_iova_tree_map_alloc(v->iova_tree, needle);
    if (unlikely(r != IOVA_OK)) {
        error_setg(errp, "Cannot allocate iova (%d)", r);
        return false;
    }

    r = vhost_vdpa_dma_map(v, v->address_space_id, needle->iova,
                           needle->size + 1,
                           (void *)(uintptr_t)needle->translated_addr,
                           needle->perm == IOMMU_RO);
    if (unlikely(r != 0)) {
        error_setg_errno(errp, -r, "Cannot map region to device");
        vhost_iova_tree_remove(v->iova_tree, *needle);
        return false;
    }

    return true;
}

/**
 * Map the shadow virtqueue rings in the device
 *
 * @dev: The vhost device
 * @svq: The shadow virtqueue
 * @addr: Assigned IOVA addresses
 * @errp: Error pointer
 */
static bool vhost_vdpa_svq_map_rings(struct vhost_dev *dev,
                                     const VhostShadowVirtqueue *svq,
                                     struct vhost_vring_addr *addr,
                                     Error **errp)
{
    ERRP_GUARD();
    DMAMap device_region, driver_region;
    struct vhost_vring_addr svq_addr;
    struct vhost_vdpa *v = dev->opaque;
    size_t device_size = vhost_svq_device_area_size(svq);
    size_t driver_size = vhost_svq_driver_area_size(svq);
    size_t avail_offset;
    bool ok;

    vhost_svq_get_vring_addr(svq, &svq_addr);

    driver_region = (DMAMap) {
        .translated_addr = svq_addr.desc_user_addr,
        .size = driver_size - 1,
        .perm = IOMMU_RO,
    };
    ok = vhost_vdpa_svq_map_ring(v, &driver_region, errp);
    if (unlikely(!ok)) {
        error_prepend(errp, "Cannot create vq driver region: ");
        return false;
    }
    addr->desc_user_addr = driver_region.iova;
    avail_offset = svq_addr.avail_user_addr - svq_addr.desc_user_addr;
    addr->avail_user_addr = driver_region.iova + avail_offset;

    device_region = (DMAMap) {
        .translated_addr = svq_addr.used_user_addr,
        .size = device_size - 1,
        .perm = IOMMU_RW,
    };
    ok = vhost_vdpa_svq_map_ring(v, &device_region, errp);
    if (unlikely(!ok)) {
        error_prepend(errp, "Cannot create vq device region: ");
        vhost_vdpa_svq_unmap_ring(v, driver_region.translated_addr);
        return false;
    }
    addr->used_user_addr = device_region.iova;

    return true;
}

static bool vhost_vdpa_svq_setup(struct vhost_dev *dev,
                                 VhostShadowVirtqueue *svq, unsigned idx,
                                 Error **errp)
{
    uint16_t vq_index = dev->vq_index + idx;
    struct vhost_vring_state s = {
        .index = vq_index,
    };
    int r;

    r = vhost_vdpa_set_dev_vring_base(dev, &s);
    if (unlikely(r)) {
        error_setg_errno(errp, -r, "Cannot set vring base");
        return false;
    }

    r = vhost_vdpa_svq_set_fds(dev, svq, idx, errp);
    return r == 0;
}

static bool vhost_vdpa_svqs_start(struct vhost_dev *dev)
{
    struct vhost_vdpa *v = dev->opaque;
    Error *err = NULL;
    unsigned i;

    if (!v->shadow_vqs_enabled) {
        return true;
    }

    for (i = 0; i < v->shadow_vqs->len; ++i) {
        VirtQueue *vq = virtio_get_queue(dev->vdev, dev->vq_index + i);
        VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i);
        struct vhost_vring_addr addr = {
            .index = dev->vq_index + i,
        };
        int r;
        bool ok = vhost_vdpa_svq_setup(dev, svq, i, &err);
        if (unlikely(!ok)) {
            goto err;
        }

        vhost_svq_start(svq, dev->vdev, vq, v->iova_tree);
        ok = vhost_vdpa_svq_map_rings(dev, svq, &addr, &err);
        if (unlikely(!ok)) {
            goto err_map;
        }

        /* Override vring GPA set by vhost subsystem */
        r = vhost_vdpa_set_vring_dev_addr(dev, &addr);
        if (unlikely(r != 0)) {
            error_setg_errno(&err, -r, "Cannot set device address");
            goto err_unmap;
        }
    }

    return true;

err_unmap:
    vhost_vdpa_svq_unmap_rings(dev, g_ptr_array_index(v->shadow_vqs, i));

err_map:
    vhost_svq_stop(g_ptr_array_index(v->shadow_vqs, i));

err:
    error_reportf_err(err, "Cannot setup SVQ %u: ", i);
    for (unsigned j = 0; j < i; ++j) {
        VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, j);
        vhost_vdpa_svq_unmap_rings(dev, svq);
        vhost_svq_stop(svq);
    }

    return false;
}
*dev
)
1102 struct vhost_vdpa
*v
= dev
->opaque
;
1104 if (!v
->shadow_vqs_enabled
) {
1108 for (unsigned i
= 0; i
< v
->shadow_vqs
->len
; ++i
) {
1109 VhostShadowVirtqueue
*svq
= g_ptr_array_index(v
->shadow_vqs
, i
);
1111 vhost_svq_stop(svq
);
1112 vhost_vdpa_svq_unmap_rings(dev
, svq
);
1114 event_notifier_cleanup(&svq
->hdev_kick
);
1115 event_notifier_cleanup(&svq
->hdev_call
);
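
/*
 * Prefer VHOST_VDPA_SUSPEND when the backend advertises it; otherwise fall
 * back to a full device reset so the rings stop processing before state is
 * read back.
 */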

static void vhost_vdpa_suspend(struct vhost_dev *dev)
{
    struct vhost_vdpa *v = dev->opaque;
    int r;

    if (!vhost_vdpa_first_dev(dev)) {
        return;
    }

    if (dev->backend_cap & BIT_ULL(VHOST_BACKEND_F_SUSPEND)) {
        trace_vhost_vdpa_suspend(dev);
        r = ioctl(v->device_fd, VHOST_VDPA_SUSPEND);
        if (unlikely(r)) {
            error_report("Cannot suspend: %s(%d)", g_strerror(errno), errno);
        } else {
            v->suspended = true;
            return;
        }
    }

    vhost_vdpa_reset_device(dev);
}

static int vhost_vdpa_dev_start(struct vhost_dev *dev, bool started)
{
    struct vhost_vdpa *v = dev->opaque;
    bool ok;
    trace_vhost_vdpa_dev_start(dev, started);

    if (started) {
        vhost_vdpa_host_notifiers_init(dev);
        ok = vhost_vdpa_svqs_start(dev);
        if (unlikely(!ok)) {
            return -1;
        }
        vhost_vdpa_set_vring_ready(dev);
    } else {
        vhost_vdpa_suspend(dev);
        vhost_vdpa_svqs_stop(dev);
        vhost_vdpa_host_notifiers_uninit(dev, dev->nvqs);
    }

    if (dev->vq_index + dev->nvqs != dev->vq_index_end) {
        return 0;
    }

    if (started) {
        memory_listener_register(&v->listener, &address_space_memory);
        return vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_DRIVER_OK);
    }

    return 0;
}
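
/*
 * Only the last vhost device of the group (the one reaching vq_index_end)
 * resets the status and unregisters the memory listener.
 */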

static void vhost_vdpa_reset_status(struct vhost_dev *dev)
{
    struct vhost_vdpa *v = dev->opaque;

    if (dev->vq_index + dev->nvqs != dev->vq_index_end) {
        return;
    }

    vhost_vdpa_reset_device(dev);
    vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE |
                               VIRTIO_CONFIG_S_DRIVER);
    memory_listener_unregister(&v->listener);
}

static int vhost_vdpa_set_log_base(struct vhost_dev *dev, uint64_t base,
                                   struct vhost_log *log)
{
    struct vhost_vdpa *v = dev->opaque;

    if (v->shadow_vqs_enabled || !vhost_vdpa_first_dev(dev)) {
        return 0;
    }

    trace_vhost_vdpa_set_log_base(dev, base, log->size, log->refcnt, log->fd,
                                  log->log);
    return vhost_vdpa_call(dev, VHOST_SET_LOG_BASE, &base);
}

static int vhost_vdpa_set_vring_addr(struct vhost_dev *dev,
                                     struct vhost_vring_addr *addr)
{
    struct vhost_vdpa *v = dev->opaque;

    if (v->shadow_vqs_enabled) {
        /*
         * Device vring addr was set at device start. SVQ base is handled by
         * it.
         */
        return 0;
    }

    return vhost_vdpa_set_vring_dev_addr(dev, addr);
}

static int vhost_vdpa_set_vring_num(struct vhost_dev *dev,
                                    struct vhost_vring_state *ring)
{
    trace_vhost_vdpa_set_vring_num(dev, ring->index, ring->num);
    return vhost_vdpa_call(dev, VHOST_SET_VRING_NUM, ring);
}

static int vhost_vdpa_set_vring_base(struct vhost_dev *dev,
                                     struct vhost_vring_state *ring)
{
    struct vhost_vdpa *v = dev->opaque;

    if (v->shadow_vqs_enabled) {
        /*
         * Device vring base was set at device start. SVQ base is handled by
         * it.
         */
        return 0;
    }

    return vhost_vdpa_set_dev_vring_base(dev, ring);
}

static int vhost_vdpa_get_vring_base(struct vhost_dev *dev,
                                     struct vhost_vring_state *ring)
{
    struct vhost_vdpa *v = dev->opaque;
    int ret;

    if (v->shadow_vqs_enabled) {
        ring->num = virtio_queue_get_last_avail_idx(dev->vdev, ring->index);
        return 0;
    }

    if (!v->suspended) {
        /*
         * Cannot trust in value returned by device, let vhost recover used
         * idx from guest.
         */
        return -1;
    }

    ret = vhost_vdpa_call(dev, VHOST_GET_VRING_BASE, ring);
    trace_vhost_vdpa_get_vring_base(dev, ring->index, ring->num);
    return ret;
}

static int vhost_vdpa_set_vring_kick(struct vhost_dev *dev,
                                     struct vhost_vring_file *file)
{
    struct vhost_vdpa *v = dev->opaque;
    int vdpa_idx = file->index - dev->vq_index;

    if (v->shadow_vqs_enabled) {
        VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, vdpa_idx);

        vhost_svq_set_svq_kick_fd(svq, file->fd);
        return 0;
    }

    return vhost_vdpa_set_vring_dev_kick(dev, file);
}

static int vhost_vdpa_set_vring_call(struct vhost_dev *dev,
                                     struct vhost_vring_file *file)
{
    struct vhost_vdpa *v = dev->opaque;
    int vdpa_idx = file->index - dev->vq_index;
    VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, vdpa_idx);

    /* Remember last call fd because we can switch to SVQ anytime. */
    vhost_svq_set_svq_call_fd(svq, file->fd);
    if (v->shadow_vqs_enabled) {
        return 0;
    }

    return vhost_vdpa_set_vring_dev_call(dev, file);
}

static int vhost_vdpa_get_features(struct vhost_dev *dev,
                                   uint64_t *features)
{
    int ret = vhost_vdpa_get_dev_features(dev, features);

    if (ret == 0) {
        /* Add SVQ logging capabilities */
        *features |= BIT_ULL(VHOST_F_LOG_ALL);
    }

    return ret;
}

static int vhost_vdpa_set_owner(struct vhost_dev *dev)
{
    if (!vhost_vdpa_first_dev(dev)) {
        return 0;
    }

    trace_vhost_vdpa_set_owner(dev);
    return vhost_vdpa_call(dev, VHOST_SET_OWNER, NULL);
}

static int vhost_vdpa_vq_get_addr(struct vhost_dev *dev,
                                  struct vhost_vring_addr *addr,
                                  struct vhost_virtqueue *vq)
{
    assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA);
    addr->desc_user_addr = (uint64_t)(unsigned long)vq->desc_phys;
    addr->avail_user_addr = (uint64_t)(unsigned long)vq->avail_phys;
    addr->used_user_addr = (uint64_t)(unsigned long)vq->used_phys;
    trace_vhost_vdpa_vq_get_addr(dev, vq, addr->desc_user_addr,
                                 addr->avail_user_addr, addr->used_user_addr);
    return 0;
}

static bool vhost_vdpa_force_iommu(struct vhost_dev *dev)
{
    return true;
}
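
/* Backend ops table registered with the generic vhost layer */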

const VhostOps vdpa_ops = {
    .backend_type = VHOST_BACKEND_TYPE_VDPA,
    .vhost_backend_init = vhost_vdpa_init,
    .vhost_backend_cleanup = vhost_vdpa_cleanup,
    .vhost_set_log_base = vhost_vdpa_set_log_base,
    .vhost_set_vring_addr = vhost_vdpa_set_vring_addr,
    .vhost_set_vring_num = vhost_vdpa_set_vring_num,
    .vhost_set_vring_base = vhost_vdpa_set_vring_base,
    .vhost_get_vring_base = vhost_vdpa_get_vring_base,
    .vhost_set_vring_kick = vhost_vdpa_set_vring_kick,
    .vhost_set_vring_call = vhost_vdpa_set_vring_call,
    .vhost_get_features = vhost_vdpa_get_features,
    .vhost_set_backend_cap = vhost_vdpa_set_backend_cap,
    .vhost_set_owner = vhost_vdpa_set_owner,
    .vhost_set_vring_endian = NULL,
    .vhost_backend_memslots_limit = vhost_vdpa_memslots_limit,
    .vhost_set_mem_table = vhost_vdpa_set_mem_table,
    .vhost_set_features = vhost_vdpa_set_features,
    .vhost_reset_device = vhost_vdpa_reset_device,
    .vhost_get_vq_index = vhost_vdpa_get_vq_index,
    .vhost_get_config = vhost_vdpa_get_config,
    .vhost_set_config = vhost_vdpa_set_config,
    .vhost_requires_shm_log = NULL,
    .vhost_migration_done = NULL,
    .vhost_backend_can_merge = NULL,
    .vhost_net_set_mtu = NULL,
    .vhost_set_iotlb_callback = NULL,
    .vhost_send_device_iotlb_msg = NULL,
    .vhost_dev_start = vhost_vdpa_dev_start,
    .vhost_get_device_id = vhost_vdpa_get_device_id,
    .vhost_vq_get_addr = vhost_vdpa_vq_get_addr,
    .vhost_force_iommu = vhost_vdpa_force_iommu,
    .vhost_set_config_call = vhost_vdpa_set_config_call,
    .vhost_reset_status = vhost_vdpa_reset_status,
};