/*
 * vhost shadow virtqueue
 *
 * SPDX-FileCopyrightText: Red Hat, Inc. 2021
 * SPDX-FileContributor: Author: Eugenio Pérez <eperezma@redhat.com>
 *
 * SPDX-License-Identifier: GPL-2.0-or-later
 */

#include "qemu/osdep.h"
#include "hw/virtio/vhost-shadow-virtqueue.h"

#include "qemu/error-report.h"
#include "qapi/error.h"
#include "qemu/main-loop.h"
#include "qemu/log.h"
#include "qemu/memalign.h"
#include "linux-headers/linux/vhost.h"
/**
 * Validate the transport device features that both the guest can use with the
 * SVQ and the SVQ can use with the device.
 *
 * @features: The features
 * @errp: Error pointer
 */
bool vhost_svq_valid_features(uint64_t features, Error **errp)
{
    bool ok = true;
    uint64_t svq_features = features;

    for (uint64_t b = VIRTIO_TRANSPORT_F_START; b <= VIRTIO_TRANSPORT_F_END;
         ++b) {
        switch (b) {
        case VIRTIO_F_ANY_LAYOUT:
            continue;

        case VIRTIO_F_ACCESS_PLATFORM:
            /* SVQ trusts in the host's IOMMU to translate addresses */
        case VIRTIO_F_VERSION_1:
            /* SVQ trusts that the guest vring is little endian */
            if (!(svq_features & BIT_ULL(b))) {
                svq_features |= BIT_ULL(b);
                ok = false;
            }
            continue;

        default:
            if (svq_features & BIT_ULL(b)) {
                svq_features &= ~BIT_ULL(b);
                ok = false;
            }
        }
    }

    if (!ok) {
        error_setg(errp, "SVQ Invalid device feature flags, offer: 0x%"PRIx64
                         ", ok: 0x%"PRIx64, features, svq_features);
    }
    return ok;
}
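/*
 * Usage sketch (not part of this file): a vhost backend would typically
 * validate the feature set it is about to ack, e.g.
 *
 *     Error *local_err = NULL;
 *     if (!vhost_svq_valid_features(dev->host_features, &local_err)) {
 *         error_report_err(local_err);
 *     }
 *
 * Names such as dev->host_features are illustrative only.
 */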
/**
 * Number of descriptors that the SVQ can make available from the guest.
 *
 * @svq: The svq
 */
static uint16_t vhost_svq_available_slots(const VhostShadowVirtqueue *svq)
{
    return svq->vring.num - (svq->shadow_avail_idx - svq->shadow_used_idx);
}
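/*
 * Note: shadow_avail_idx and shadow_used_idx are free-running 16-bit indexes,
 * so their difference is the number of descriptors currently in flight even
 * across wrap-around, and the subtraction above yields the remaining free
 * slots.
 */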
/**
 * Translate addresses between the qemu's virtual address and the SVQ IOVA
 *
 * @svq: Shadow VirtQueue
 * @addrs: Destination to store the translated IOVA addresses
 * @iovec: Source qemu's VA addresses
 * @num: Length of iovec and minimum length of addrs
 */
static bool vhost_svq_translate_addr(const VhostShadowVirtqueue *svq,
                                     hwaddr *addrs, const struct iovec *iovec,
                                     size_t num)
{
    if (num == 0) {
        return true;
    }

    for (size_t i = 0; i < num; ++i) {
        DMAMap needle = {
            .translated_addr = (hwaddr)(uintptr_t)iovec[i].iov_base,
            .size = iovec[i].iov_len,
        };
        Int128 needle_last, map_last;
        size_t off;

        const DMAMap *map = vhost_iova_tree_find_iova(svq->iova_tree, &needle);
        /*
         * Map cannot be NULL since iova map contains all guest space and
         * qemu already has a physical address mapped
         */
        if (unlikely(!map)) {
            qemu_log_mask(LOG_GUEST_ERROR,
                          "Invalid address 0x%"HWADDR_PRIx" given by guest",
                          needle.translated_addr);
            return false;
        }

        off = needle.translated_addr - map->translated_addr;
        addrs[i] = map->iova + off;

        needle_last = int128_add(int128_make64(needle.translated_addr),
                                 int128_make64(iovec[i].iov_len));
        map_last = int128_make64(map->translated_addr + map->size);
        if (unlikely(int128_gt(needle_last, map_last))) {
            qemu_log_mask(LOG_GUEST_ERROR,
                          "Guest buffer expands over iova range");
            return false;
        }
    }

    return true;
}
static void vhost_vring_write_descs(VhostShadowVirtqueue *svq, hwaddr *sg,
                                    const struct iovec *iovec, size_t num,
                                    bool more_descs, bool write)
{
    uint16_t i = svq->free_head, last = svq->free_head;
    unsigned n;
    uint16_t flags = write ? cpu_to_le16(VRING_DESC_F_WRITE) : 0;
    vring_desc_t *descs = svq->vring.desc;

    if (num == 0) {
        return;
    }

    for (n = 0; n < num; n++) {
        if (more_descs || (n + 1 < num)) {
            descs[i].flags = flags | cpu_to_le16(VRING_DESC_F_NEXT);
        } else {
            descs[i].flags = flags;
        }
        descs[i].addr = cpu_to_le64(sg[n]);
        descs[i].len = cpu_to_le32(iovec[n].iov_len);

        last = i;
        i = cpu_to_le16(descs[i].next);
    }

    svq->free_head = le16_to_cpu(descs[last].next);
}
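/*
 * Note: the shadow descriptor table is kept as a free list threaded through
 * desc[].next (initialised in vhost_svq_start), so consuming a chain is just
 * advancing free_head to the descriptor after the last one written.
 */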
static bool vhost_svq_add_split(VhostShadowVirtqueue *svq,
                                VirtQueueElement *elem, unsigned *head)
{
    unsigned avail_idx;
    vring_avail_t *avail = svq->vring.avail;
    bool ok;
    g_autofree hwaddr *sgs = g_new(hwaddr, MAX(elem->out_num, elem->in_num));

    *head = svq->free_head;

    /* We need some descriptors here */
    if (unlikely(!elem->out_num && !elem->in_num)) {
        qemu_log_mask(LOG_GUEST_ERROR,
                      "Guest provided element with no descriptors");
        return false;
    }

    ok = vhost_svq_translate_addr(svq, sgs, elem->out_sg, elem->out_num);
    if (unlikely(!ok)) {
        return false;
    }
    vhost_vring_write_descs(svq, sgs, elem->out_sg, elem->out_num,
                            elem->in_num > 0, false);

    ok = vhost_svq_translate_addr(svq, sgs, elem->in_sg, elem->in_num);
    if (unlikely(!ok)) {
        return false;
    }
    vhost_vring_write_descs(svq, sgs, elem->in_sg, elem->in_num, false, true);

    /*
     * Put the entry in the available array (but don't update avail->idx until
     * they do sync).
     */
    avail_idx = svq->shadow_avail_idx & (svq->vring.num - 1);
    avail->ring[avail_idx] = cpu_to_le16(*head);
    svq->shadow_avail_idx++;

    /* Update the avail index after writing the descriptor */
    smp_wmb();
    avail->idx = cpu_to_le16(svq->shadow_avail_idx);

    return true;
}
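/*
 * Note: the write barrier above ensures the device observes the fully
 * written descriptors and avail ring entry before it observes the new
 * avail->idx, following the split virtqueue publication rules.
 */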
static bool vhost_svq_add(VhostShadowVirtqueue *svq, VirtQueueElement *elem)
{
    unsigned qemu_head;
    bool ok = vhost_svq_add_split(svq, elem, &qemu_head);
    if (unlikely(!ok)) {
        return false;
    }

    svq->ring_id_maps[qemu_head] = elem;
    return true;
}
static void vhost_svq_kick(VhostShadowVirtqueue *svq)
{
    /*
     * We need to expose the available array entries before checking the used
     * flags
     */
    smp_mb();
    if (svq->vring.used->flags & VRING_USED_F_NO_NOTIFY) {
        return;
    }

    event_notifier_set(&svq->hdev_kick);
}
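/*
 * Note: VRING_USED_F_NO_NOTIFY is the device's way of asking the driver (the
 * SVQ here) to skip kicks; the barrier above orders the avail ring
 * publication against the read of that flag.
 */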
/**
 * Forward available buffers.
 *
 * @svq: Shadow VirtQueue
 *
 * Note that this function does not guarantee that all guest's available
 * buffers are available to the device in SVQ avail ring. The guest may have
 * exposed a GPA / GIOVA contiguous buffer, but it may not be contiguous in
 * qemu vaddr.
 *
 * If that happens, guest's kick notifications will be disabled until the
 * device uses some buffers.
 */
static void vhost_handle_guest_kick(VhostShadowVirtqueue *svq)
{
    /* Clear event notifier */
    event_notifier_test_and_clear(&svq->svq_kick);

    /* Forward to the device as many available buffers as possible */
    do {
        virtio_queue_set_notification(svq->vq, false);

        while (true) {
            VirtQueueElement *elem;
            bool ok;

            if (svq->next_guest_avail_elem) {
                elem = g_steal_pointer(&svq->next_guest_avail_elem);
            } else {
                elem = virtqueue_pop(svq->vq, sizeof(*elem));
            }

            if (!elem) {
                break;
            }

            if (elem->out_num + elem->in_num >
                vhost_svq_available_slots(svq)) {
                /*
                 * This condition is possible since a contiguous buffer in GPA
                 * does not imply a contiguous buffer in qemu's VA
                 * scatter-gather segments. If that happens, the buffer exposed
                 * to the device needs to be a chain of descriptors at this
                 * moment.
                 *
                 * SVQ cannot hold more available buffers if we are here:
                 * queue the current guest descriptor and ignore further kicks
                 * until some elements are used.
                 */
                svq->next_guest_avail_elem = elem;
                return;
            }

            ok = vhost_svq_add(svq, elem);
            if (unlikely(!ok)) {
                /* VQ is broken, just return and ignore any other kicks */
                return;
            }
            vhost_svq_kick(svq);
        }

        virtio_queue_set_notification(svq->vq, true);
    } while (!virtio_queue_empty(svq->vq));
}
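/*
 * Note: guest notifications are disabled while the inner loop drains the
 * virtqueue and re-enabled afterwards; the outer do/while re-checks the
 * queue so a buffer made available in that window is not missed.
 */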
/**
 * Handle guest's kick.
 *
 * @n: guest kick event notifier, the one that guest set to notify svq.
 */
static void vhost_handle_guest_kick_notifier(EventNotifier *n)
{
    VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue, svq_kick);
    event_notifier_test_and_clear(n);
    vhost_handle_guest_kick(svq);
}
static bool vhost_svq_more_used(VhostShadowVirtqueue *svq)
{
    if (svq->last_used_idx != svq->shadow_used_idx) {
        return true;
    }

    svq->shadow_used_idx = cpu_to_le16(svq->vring.used->idx);

    return svq->last_used_idx != svq->shadow_used_idx;
}
/**
 * Enable vhost device calls after disabling them.
 *
 * @svq: The svq
 *
 * It returns false if there are pending used buffers from the vhost device,
 * avoiding the possible races between SVQ checking for more work and enabling
 * callbacks. True if SVQ used vring has no more pending buffers.
 */
static bool vhost_svq_enable_notification(VhostShadowVirtqueue *svq)
{
    svq->vring.avail->flags &= ~cpu_to_le16(VRING_AVAIL_F_NO_INTERRUPT);
    /* Make sure the flag is written before the read of used_idx */
    smp_mb();
    return !vhost_svq_more_used(svq);
}
static void vhost_svq_disable_notification(VhostShadowVirtqueue *svq)
{
    svq->vring.avail->flags |= cpu_to_le16(VRING_AVAIL_F_NO_INTERRUPT);
}
static VirtQueueElement *vhost_svq_get_buf(VhostShadowVirtqueue *svq,
                                           uint32_t *len)
{
    vring_desc_t *descs = svq->vring.desc;
    const vring_used_t *used = svq->vring.used;
    vring_used_elem_t used_elem;
    uint16_t last_used;

    if (!vhost_svq_more_used(svq)) {
        return NULL;
    }

    /* Only get used array entries after they have been exposed by dev */
    smp_rmb();
    last_used = svq->last_used_idx & (svq->vring.num - 1);
    used_elem.id = le32_to_cpu(used->ring[last_used].id);
    used_elem.len = le32_to_cpu(used->ring[last_used].len);

    svq->last_used_idx++;
    if (unlikely(used_elem.id >= svq->vring.num)) {
        qemu_log_mask(LOG_GUEST_ERROR, "Device %s says index %u is used",
                      svq->vdev->name, used_elem.id);
        return NULL;
    }

    if (unlikely(!svq->ring_id_maps[used_elem.id])) {
        qemu_log_mask(LOG_GUEST_ERROR,
            "Device %s says index %u is used, but it was not available",
            svq->vdev->name, used_elem.id);
        return NULL;
    }

    descs[used_elem.id].next = svq->free_head;
    svq->free_head = used_elem.id;

    *len = used_elem.len;
    return g_steal_pointer(&svq->ring_id_maps[used_elem.id]);
}
static void vhost_svq_flush(VhostShadowVirtqueue *svq,
                            bool check_for_avail_queue)
{
    VirtQueue *vq = svq->vq;

    /* Forward as many used buffers as possible. */
    do {
        unsigned i = 0;

        vhost_svq_disable_notification(svq);
        while (true) {
            uint32_t len;
            g_autofree VirtQueueElement *elem = vhost_svq_get_buf(svq, &len);
            if (!elem) {
                break;
            }

            if (unlikely(i >= svq->vring.num)) {
                qemu_log_mask(LOG_GUEST_ERROR,
                        "More than %u used buffers obtained in a %u size SVQ",
                        i, svq->vring.num);
                virtqueue_fill(vq, elem, len, i);
                virtqueue_flush(vq, i);
                return;
            }
            virtqueue_fill(vq, elem, len, i++);
        }

        virtqueue_flush(vq, i);
        event_notifier_set(&svq->svq_call);

        if (check_for_avail_queue && svq->next_guest_avail_elem) {
            /*
             * Avail ring was full when vhost_svq_flush was called, so it's a
             * good moment to make more descriptors available if possible.
             */
            vhost_handle_guest_kick(svq);
        }
    } while (!vhost_svq_enable_notification(svq));
}
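/*
 * Note: used buffers are drained with device notifications disabled, and the
 * outer loop only exits once vhost_svq_enable_notification() confirms no used
 * entry slipped in while notifications were off.
 */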
/**
 * Forward used buffers.
 *
 * @n: hdev call event notifier, the one that device set to notify svq.
 *
 * Note that we are not making any buffers available in the loop, so there is
 * no way that it runs more than virtqueue size times.
 */
static void vhost_svq_handle_call(EventNotifier *n)
{
    VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue,
                                             hdev_call);
    event_notifier_test_and_clear(n);
    vhost_svq_flush(svq, true);
}
/**
 * Set the call notifier for the SVQ to call the guest
 *
 * @svq: Shadow virtqueue
 * @call_fd: call notifier
 *
 * Called on BQL context.
 */
void vhost_svq_set_svq_call_fd(VhostShadowVirtqueue *svq, int call_fd)
{
    if (call_fd == VHOST_FILE_UNBIND) {
        /*
         * Fail event_notifier_set if called while handling a device call.
         *
         * SVQ still needs device notifications, since it needs to keep
         * forwarding used buffers even with the unbind.
         */
        memset(&svq->svq_call, 0, sizeof(svq->svq_call));
    } else {
        event_notifier_init_fd(&svq->svq_call, call_fd);
    }
}
/**
 * Get the shadow vq vring address.
 * @svq: Shadow virtqueue
 * @addr: Destination to store address
 */
void vhost_svq_get_vring_addr(const VhostShadowVirtqueue *svq,
                              struct vhost_vring_addr *addr)
{
    addr->desc_user_addr = (uint64_t)(uintptr_t)svq->vring.desc;
    addr->avail_user_addr = (uint64_t)(uintptr_t)svq->vring.avail;
    addr->used_user_addr = (uint64_t)(uintptr_t)svq->vring.used;
}
size_t vhost_svq_driver_area_size(const VhostShadowVirtqueue *svq)
{
    size_t desc_size = sizeof(vring_desc_t) * svq->vring.num;
    size_t avail_size = offsetof(vring_avail_t, ring) +
                        sizeof(uint16_t) * svq->vring.num;

    return ROUND_UP(desc_size + avail_size, qemu_real_host_page_size);
}
size_t vhost_svq_device_area_size(const VhostShadowVirtqueue *svq)
{
    size_t used_size = offsetof(vring_used_t, ring) +
                       sizeof(vring_used_elem_t) * svq->vring.num;
    return ROUND_UP(used_size, qemu_real_host_page_size);
}
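/*
 * Worked example (assuming the standard split-ring layouts, a 256-entry ring
 * and a 4KiB host page): the driver area needs 256 * 16 bytes of descriptors
 * plus 4 + 256 * 2 bytes of avail ring = 4612 bytes, rounded up to 8192; the
 * device area needs 4 + 256 * 8 = 2052 bytes, rounded up to 4096.
 */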
/**
 * Set a new file descriptor for the guest to kick the SVQ and notify for avail
 *
 * @svq: The svq
 * @svq_kick_fd: The svq kick fd
 *
 * Note that the SVQ will never close the old file descriptor.
 */
void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd)
{
    EventNotifier *svq_kick = &svq->svq_kick;
    bool poll_stop = VHOST_FILE_UNBIND != event_notifier_get_fd(svq_kick);
    bool poll_start = svq_kick_fd != VHOST_FILE_UNBIND;

    if (poll_stop) {
        event_notifier_set_handler(svq_kick, NULL);
    }

    /*
     * event_notifier_set_handler already checks for guest's notifications if
     * they arrive at the new file descriptor in the switch, so there is no
     * need to explicitly check for them.
     */
    if (poll_start) {
        event_notifier_init_fd(svq_kick, svq_kick_fd);
        event_notifier_set(svq_kick);
        event_notifier_set_handler(svq_kick, vhost_handle_guest_kick_notifier);
    }
}
/**
 * Start the shadow virtqueue operation.
 *
 * @svq: Shadow Virtqueue
 * @vdev: VirtIO device
 * @vq: Virtqueue to shadow
 */
void vhost_svq_start(VhostShadowVirtqueue *svq, VirtIODevice *vdev,
                     VirtQueue *vq)
{
    size_t desc_size, driver_size, device_size;

    svq->next_guest_avail_elem = NULL;
    svq->shadow_avail_idx = 0;
    svq->shadow_used_idx = 0;
    svq->last_used_idx = 0;
    svq->vdev = vdev;
    svq->vq = vq;

    svq->vring.num = virtio_queue_get_num(vdev, virtio_get_queue_index(vq));
    driver_size = vhost_svq_driver_area_size(svq);
    device_size = vhost_svq_device_area_size(svq);
    svq->vring.desc = qemu_memalign(qemu_real_host_page_size, driver_size);
    desc_size = sizeof(vring_desc_t) * svq->vring.num;
    svq->vring.avail = (void *)((char *)svq->vring.desc + desc_size);
    memset(svq->vring.desc, 0, driver_size);
    svq->vring.used = qemu_memalign(qemu_real_host_page_size, device_size);
    memset(svq->vring.used, 0, device_size);
    svq->ring_id_maps = g_new0(VirtQueueElement *, svq->vring.num);
    for (unsigned i = 0; i < svq->vring.num - 1; i++) {
        svq->vring.desc[i].next = cpu_to_le16(i + 1);
    }
}
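/*
 * Note: the driver area is a single page-aligned allocation holding the
 * descriptor table immediately followed by the avail ring, while the used
 * ring lives in its own page-aligned device area; this matches the layout
 * reported by vhost_svq_get_vring_addr().
 */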
/**
 * Stop the shadow virtqueue operation.
 * @svq: Shadow Virtqueue
 */
void vhost_svq_stop(VhostShadowVirtqueue *svq)
{
    event_notifier_set_handler(&svq->svq_kick, NULL);
    g_autofree VirtQueueElement *next_avail_elem = NULL;

    if (!svq->vq) {
        return;
    }

    /* Send all pending used descriptors to guest */
    vhost_svq_flush(svq, false);

    for (unsigned i = 0; i < svq->vring.num; ++i) {
        g_autofree VirtQueueElement *elem = NULL;
        elem = g_steal_pointer(&svq->ring_id_maps[i]);
        if (elem) {
            virtqueue_detach_element(svq->vq, elem, 0);
        }
    }

    next_avail_elem = g_steal_pointer(&svq->next_guest_avail_elem);
    if (next_avail_elem) {
        virtqueue_detach_element(svq->vq, next_avail_elem, 0);
    }
    svq->vq = NULL;
    g_free(svq->ring_id_maps);
    qemu_vfree(svq->vring.desc);
    qemu_vfree(svq->vring.used);
}
/**
 * Creates vhost shadow virtqueue, and instructs the vhost device to use the
 * shadow methods and file descriptors.
 *
 * @iova_tree: Tree to perform descriptor translations
 *
 * Returns the new virtqueue or NULL.
 *
 * In case of error, the reason is reported through error_report.
 */
VhostShadowVirtqueue *vhost_svq_new(VhostIOVATree *iova_tree)
{
    g_autofree VhostShadowVirtqueue *svq = g_new0(VhostShadowVirtqueue, 1);
    int r;

    r = event_notifier_init(&svq->hdev_kick, 0);
    if (r != 0) {
        error_report("Couldn't create kick event notifier: %s (%d)",
                     g_strerror(errno), errno);
        goto err_init_hdev_kick;
    }

    r = event_notifier_init(&svq->hdev_call, 0);
    if (r != 0) {
        error_report("Couldn't create call event notifier: %s (%d)",
                     g_strerror(errno), errno);
        goto err_init_hdev_call;
    }

    event_notifier_init_fd(&svq->svq_kick, VHOST_FILE_UNBIND);
    event_notifier_set_handler(&svq->hdev_call, vhost_svq_handle_call);
    svq->iova_tree = iova_tree;
    return g_steal_pointer(&svq);

err_init_hdev_call:
    event_notifier_cleanup(&svq->hdev_kick);

err_init_hdev_kick:
    return NULL;
}
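/*
 * Usage sketch (not part of this file): a caller such as a vhost backend
 * would create the SVQ, wire up the guest file descriptors and start
 * shadowing, roughly:
 *
 *     VhostShadowVirtqueue *svq = vhost_svq_new(iova_tree);
 *     vhost_svq_set_svq_kick_fd(svq, guest_kick_fd);
 *     vhost_svq_set_svq_call_fd(svq, guest_call_fd);
 *     vhost_svq_start(svq, vdev, vq);
 *     ...
 *     vhost_svq_stop(svq);
 *     vhost_svq_free(svq);
 *
 * The fd and pointer names are illustrative only.
 */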
/**
 * Free the resources of the shadow virtqueue.
 *
 * @pvq: gpointer to SVQ so it can be used by autofree functions.
 */
void vhost_svq_free(gpointer pvq)
{
    VhostShadowVirtqueue *vq = pvq;
    vhost_svq_stop(vq);
    event_notifier_cleanup(&vq->hdev_kick);
    event_notifier_set_handler(&vq->hdev_call, NULL);
    event_notifier_cleanup(&vq->hdev_call);
    g_free(vq);
}