1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
5 * Copyright Red Hat, Inc.
8 * Stefan Hajnoczi <stefanha@redhat.com>
11 #include "qemu/osdep.h"
13 #include "block/block_int.h"
14 #include "exec/memory.h"
15 #include "exec/cpu-common.h" /* for qemu_ram_get_fd() */
16 #include "qemu/defer-call.h"
17 #include "qapi/error.h"
18 #include "qemu/error-report.h"
19 #include "qapi/qmp/qdict.h"
20 #include "qemu/module.h"
21 #include "sysemu/block-backend.h"
22 #include "exec/memory.h" /* for ram_block_discard_disable() */
24 #include "block/block-io.h"
27 * Allocated bounce buffers are kept in a list sorted by buffer address.
29 typedef struct BlkioBounceBuf
{
30 QLIST_ENTRY(BlkioBounceBuf
) next
;
32 /* The bounce buffer */
38 * libblkio is not thread-safe so this lock protects ->blkio and
43 struct blkioq
*blkioq
; /* make this multi-queue in the future... */
47 * Polling fetches the next completion into this field.
49 * No lock is necessary since only one thread calls aio_poll() and invokes
50 * fd and poll handlers.
52 struct blkio_completion poll_completion
;
55 * Protects ->bounce_pool, ->bounce_bufs, ->bounce_available.
57 * Lock ordering: ->bounce_lock before ->blkio_lock.
61 /* Bounce buffer pool */
62 struct blkio_mem_region bounce_pool
;
64 /* Sorted list of allocated bounce buffers */
65 QLIST_HEAD(, BlkioBounceBuf
) bounce_bufs
;
67 /* Queue for coroutines waiting for bounce buffer space */
68 CoQueue bounce_available
;
70 /* The value of the "mem-region-alignment" property */
71 uint64_t mem_region_alignment
;
73 /* Can we skip adding/deleting blkio_mem_regions? */
74 bool needs_mem_regions
;
76 /* Are file descriptors necessary for blkio_mem_regions? */
77 bool needs_mem_region_fd
;
79 /* Are madvise(MADV_DONTNEED)-style operations unavailable? */
80 bool may_pin_mem_regions
;
83 /* Called with s->bounce_lock held */
84 static int blkio_resize_bounce_pool(BDRVBlkioState
*s
, int64_t bytes
)
86 /* There can be no allocated bounce buffers during resize */
87 assert(QLIST_EMPTY(&s
->bounce_bufs
));
89 /* Pad size to reduce frequency of resize calls */
92 /* Align the pool size to avoid blkio_alloc_mem_region() failure */
93 bytes
= QEMU_ALIGN_UP(bytes
, s
->mem_region_alignment
);
95 WITH_QEMU_LOCK_GUARD(&s
->blkio_lock
) {
98 if (s
->bounce_pool
.addr
) {
99 blkio_unmap_mem_region(s
->blkio
, &s
->bounce_pool
);
100 blkio_free_mem_region(s
->blkio
, &s
->bounce_pool
);
101 memset(&s
->bounce_pool
, 0, sizeof(s
->bounce_pool
));
104 /* Automatically freed when s->blkio is destroyed */
105 ret
= blkio_alloc_mem_region(s
->blkio
, &s
->bounce_pool
, bytes
);
110 ret
= blkio_map_mem_region(s
->blkio
, &s
->bounce_pool
);
112 blkio_free_mem_region(s
->blkio
, &s
->bounce_pool
);
113 memset(&s
->bounce_pool
, 0, sizeof(s
->bounce_pool
));
121 /* Called with s->bounce_lock held */
123 blkio_do_alloc_bounce_buffer(BDRVBlkioState
*s
, BlkioBounceBuf
*bounce
,
126 void *addr
= s
->bounce_pool
.addr
;
127 BlkioBounceBuf
*cur
= NULL
;
128 BlkioBounceBuf
*prev
= NULL
;
132 * This is just a linear search over the holes between requests. An
133 * efficient allocator would be nice.
135 QLIST_FOREACH(cur
, &s
->bounce_bufs
, next
) {
136 space
= cur
->buf
.iov_base
- addr
;
137 if (bytes
<= space
) {
138 QLIST_INSERT_BEFORE(cur
, bounce
, next
);
139 bounce
->buf
.iov_base
= addr
;
140 bounce
->buf
.iov_len
= bytes
;
144 addr
= cur
->buf
.iov_base
+ cur
->buf
.iov_len
;
148 /* Is there space after the last request? */
149 space
= s
->bounce_pool
.addr
+ s
->bounce_pool
.len
- addr
;
154 QLIST_INSERT_AFTER(prev
, bounce
, next
);
156 QLIST_INSERT_HEAD(&s
->bounce_bufs
, bounce
, next
);
158 bounce
->buf
.iov_base
= addr
;
159 bounce
->buf
.iov_len
= bytes
;
163 static int coroutine_fn
164 blkio_alloc_bounce_buffer(BDRVBlkioState
*s
, BlkioBounceBuf
*bounce
,
168 * Ensure fairness: first time around we join the back of the queue,
169 * subsequently we join the front so we don't lose our place.
171 CoQueueWaitFlags wait_flags
= 0;
173 QEMU_LOCK_GUARD(&s
->bounce_lock
);
175 /* Ensure fairness: don't even try if other requests are already waiting */
176 if (!qemu_co_queue_empty(&s
->bounce_available
)) {
177 qemu_co_queue_wait_flags(&s
->bounce_available
, &s
->bounce_lock
,
179 wait_flags
= CO_QUEUE_WAIT_FRONT
;
183 if (blkio_do_alloc_bounce_buffer(s
, bounce
, bytes
)) {
184 /* Kick the next queued request since there may be space */
185 qemu_co_queue_next(&s
->bounce_available
);
190 * If there are no in-flight requests then the pool was simply too
193 if (QLIST_EMPTY(&s
->bounce_bufs
)) {
197 ret
= blkio_resize_bounce_pool(s
, bytes
);
199 /* Kick the next queued request since that may fail too */
200 qemu_co_queue_next(&s
->bounce_available
);
204 ok
= blkio_do_alloc_bounce_buffer(s
, bounce
, bytes
);
205 assert(ok
); /* must have space this time */
209 qemu_co_queue_wait_flags(&s
->bounce_available
, &s
->bounce_lock
,
211 wait_flags
= CO_QUEUE_WAIT_FRONT
;
215 static void coroutine_fn
blkio_free_bounce_buffer(BDRVBlkioState
*s
,
216 BlkioBounceBuf
*bounce
)
218 QEMU_LOCK_GUARD(&s
->bounce_lock
);
220 QLIST_REMOVE(bounce
, next
);
222 /* Wake up waiting coroutines since space may now be available */
223 qemu_co_queue_next(&s
->bounce_available
);
226 /* For async to .bdrv_co_*() conversion */
228 Coroutine
*coroutine
;
232 static void blkio_completion_fd_read(void *opaque
)
234 BlockDriverState
*bs
= opaque
;
235 BDRVBlkioState
*s
= bs
->opaque
;
239 /* Polling may have already fetched a completion */
240 if (s
->poll_completion
.user_data
!= NULL
) {
241 BlkioCoData
*cod
= s
->poll_completion
.user_data
;
242 cod
->ret
= s
->poll_completion
.ret
;
244 /* Clear it in case aio_co_wake() enters a nested event loop */
245 s
->poll_completion
.user_data
= NULL
;
247 aio_co_wake(cod
->coroutine
);
250 /* Reset completion fd status */
251 ret
= read(s
->completion_fd
, &val
, sizeof(val
));
253 /* Ignore errors, there's nothing we can do */
257 * Reading one completion at a time makes nested event loop re-entrancy
258 * simple. Change this loop to get multiple completions in one go if it
259 * becomes a performance bottleneck.
262 struct blkio_completion completion
;
264 WITH_QEMU_LOCK_GUARD(&s
->blkio_lock
) {
265 ret
= blkioq_do_io(s
->blkioq
, &completion
, 0, 1, NULL
);
271 BlkioCoData
*cod
= completion
.user_data
;
272 cod
->ret
= completion
.ret
;
273 aio_co_wake(cod
->coroutine
);
277 static bool blkio_completion_fd_poll(void *opaque
)
279 BlockDriverState
*bs
= opaque
;
280 BDRVBlkioState
*s
= bs
->opaque
;
283 /* Just in case we already fetched a completion */
284 if (s
->poll_completion
.user_data
!= NULL
) {
288 WITH_QEMU_LOCK_GUARD(&s
->blkio_lock
) {
289 ret
= blkioq_do_io(s
->blkioq
, &s
->poll_completion
, 0, 1, NULL
);
/*
 * Handler registered next to blkio_completion_fd_poll() in
 * blkio_attach_aio_context().
 *
 * @opaque: the BlockDriverState passed at registration time
 *
 * All completion processing lives in blkio_completion_fd_read(), so this
 * simply forwards to it with the same opaque pointer.
 */
static void blkio_completion_fd_poll_ready(void *opaque)
{
    blkio_completion_fd_read(opaque);
}
299 static void blkio_attach_aio_context(BlockDriverState
*bs
,
300 AioContext
*new_context
)
302 BDRVBlkioState
*s
= bs
->opaque
;
304 aio_set_fd_handler(new_context
, s
->completion_fd
,
305 blkio_completion_fd_read
, NULL
,
306 blkio_completion_fd_poll
,
307 blkio_completion_fd_poll_ready
, bs
);
310 static void blkio_detach_aio_context(BlockDriverState
*bs
)
312 BDRVBlkioState
*s
= bs
->opaque
;
314 aio_set_fd_handler(bdrv_get_aio_context(bs
), s
->completion_fd
, NULL
, NULL
,
319 * Called by defer_call_end() or immediately if not in a deferred section.
320 * Called without blkio_lock.
322 static void blkio_deferred_fn(void *opaque
)
324 BDRVBlkioState
*s
= opaque
;
326 WITH_QEMU_LOCK_GUARD(&s
->blkio_lock
) {
327 blkioq_do_io(s
->blkioq
, NULL
, 0, 0, NULL
);
332 * Schedule I/O submission after enqueuing a new request. Called without
335 static void blkio_submit_io(BlockDriverState
*bs
)
337 BDRVBlkioState
*s
= bs
->opaque
;
339 defer_call(blkio_deferred_fn
, s
);
342 static int coroutine_fn
343 blkio_co_pdiscard(BlockDriverState
*bs
, int64_t offset
, int64_t bytes
)
345 BDRVBlkioState
*s
= bs
->opaque
;
347 .coroutine
= qemu_coroutine_self(),
350 WITH_QEMU_LOCK_GUARD(&s
->blkio_lock
) {
351 blkioq_discard(s
->blkioq
, offset
, bytes
, &cod
, 0);
355 qemu_coroutine_yield();
359 static int coroutine_fn
360 blkio_co_preadv(BlockDriverState
*bs
, int64_t offset
, int64_t bytes
,
361 QEMUIOVector
*qiov
, BdrvRequestFlags flags
)
364 .coroutine
= qemu_coroutine_self(),
366 BDRVBlkioState
*s
= bs
->opaque
;
367 bool use_bounce_buffer
=
368 s
->needs_mem_regions
&& !(flags
& BDRV_REQ_REGISTERED_BUF
);
369 BlkioBounceBuf bounce
;
370 struct iovec
*iov
= qiov
->iov
;
371 int iovcnt
= qiov
->niov
;
373 if (use_bounce_buffer
) {
374 int ret
= blkio_alloc_bounce_buffer(s
, &bounce
, bytes
);
383 WITH_QEMU_LOCK_GUARD(&s
->blkio_lock
) {
384 blkioq_readv(s
->blkioq
, offset
, iov
, iovcnt
, &cod
, 0);
388 qemu_coroutine_yield();
390 if (use_bounce_buffer
) {
392 qemu_iovec_from_buf(qiov
, 0,
397 blkio_free_bounce_buffer(s
, &bounce
);
403 static int coroutine_fn
blkio_co_pwritev(BlockDriverState
*bs
, int64_t offset
,
404 int64_t bytes
, QEMUIOVector
*qiov
, BdrvRequestFlags flags
)
406 uint32_t blkio_flags
= (flags
& BDRV_REQ_FUA
) ? BLKIO_REQ_FUA
: 0;
408 .coroutine
= qemu_coroutine_self(),
410 BDRVBlkioState
*s
= bs
->opaque
;
411 bool use_bounce_buffer
=
412 s
->needs_mem_regions
&& !(flags
& BDRV_REQ_REGISTERED_BUF
);
413 BlkioBounceBuf bounce
;
414 struct iovec
*iov
= qiov
->iov
;
415 int iovcnt
= qiov
->niov
;
417 if (use_bounce_buffer
) {
418 int ret
= blkio_alloc_bounce_buffer(s
, &bounce
, bytes
);
423 qemu_iovec_to_buf(qiov
, 0, bounce
.buf
.iov_base
, bytes
);
428 WITH_QEMU_LOCK_GUARD(&s
->blkio_lock
) {
429 blkioq_writev(s
->blkioq
, offset
, iov
, iovcnt
, &cod
, blkio_flags
);
433 qemu_coroutine_yield();
435 if (use_bounce_buffer
) {
436 blkio_free_bounce_buffer(s
, &bounce
);
442 static int coroutine_fn
blkio_co_flush(BlockDriverState
*bs
)
444 BDRVBlkioState
*s
= bs
->opaque
;
446 .coroutine
= qemu_coroutine_self(),
449 WITH_QEMU_LOCK_GUARD(&s
->blkio_lock
) {
450 blkioq_flush(s
->blkioq
, &cod
, 0);
454 qemu_coroutine_yield();
458 static int coroutine_fn
blkio_co_pwrite_zeroes(BlockDriverState
*bs
,
459 int64_t offset
, int64_t bytes
, BdrvRequestFlags flags
)
461 BDRVBlkioState
*s
= bs
->opaque
;
463 .coroutine
= qemu_coroutine_self(),
465 uint32_t blkio_flags
= 0;
467 if (flags
& BDRV_REQ_FUA
) {
468 blkio_flags
|= BLKIO_REQ_FUA
;
470 if (!(flags
& BDRV_REQ_MAY_UNMAP
)) {
471 blkio_flags
|= BLKIO_REQ_NO_UNMAP
;
473 if (flags
& BDRV_REQ_NO_FALLBACK
) {
474 blkio_flags
|= BLKIO_REQ_NO_FALLBACK
;
477 WITH_QEMU_LOCK_GUARD(&s
->blkio_lock
) {
478 blkioq_write_zeroes(s
->blkioq
, offset
, bytes
, &cod
, blkio_flags
);
482 qemu_coroutine_yield();
490 } BlkioMemRegionResult
;
493 * Produce a struct blkio_mem_region for a given address and size.
495 * This function produces identical results when called multiple times with the
496 * same arguments. This property is necessary because blkio_unmap_mem_region()
497 * must receive the same struct blkio_mem_region field values that were passed
498 * to blkio_map_mem_region().
500 static BlkioMemRegionResult
501 blkio_mem_region_from_host(BlockDriverState
*bs
,
502 void *host
, size_t size
,
503 struct blkio_mem_region
*region
,
506 BDRVBlkioState
*s
= bs
->opaque
;
508 ram_addr_t fd_offset
= 0;
510 if (((uintptr_t)host
| size
) % s
->mem_region_alignment
) {
511 error_setg(errp
, "unaligned buf %p with size %zu", host
, size
);
515 /* Attempt to find the fd for the underlying memory */
516 if (s
->needs_mem_region_fd
) {
522 * bdrv_register_buf() is called with the BQL held so mr lives at least
523 * until this function returns.
525 ram_block
= qemu_ram_block_from_host(host
, false, &fd_offset
);
527 fd
= qemu_ram_get_fd(ram_block
);
531 * Ideally every RAMBlock would have an fd. pc-bios and other
532 * things don't. Luckily they are usually not I/O buffers and we
533 * can just ignore them.
538 /* Make sure the fd covers the entire range */
539 end_block
= qemu_ram_block_from_host(host
+ size
- 1, false, &offset
);
540 if (ram_block
!= end_block
) {
541 error_setg(errp
, "registered buffer at %p with size %zu extends "
542 "beyond RAMBlock", host
, size
);
547 *region
= (struct blkio_mem_region
){
551 .fd_offset
= fd_offset
,
556 static bool blkio_register_buf(BlockDriverState
*bs
, void *host
, size_t size
,
559 BDRVBlkioState
*s
= bs
->opaque
;
560 struct blkio_mem_region region
;
561 BlkioMemRegionResult region_result
;
565 * Mapping memory regions conflicts with RAM discard (virtio-mem) when
566 * there is pinning, so only do it when necessary.
568 if (!s
->needs_mem_regions
&& s
->may_pin_mem_regions
) {
572 region_result
= blkio_mem_region_from_host(bs
, host
, size
, ®ion
, errp
);
573 if (region_result
== BMRR_SKIP
) {
575 } else if (region_result
!= BMRR_OK
) {
579 WITH_QEMU_LOCK_GUARD(&s
->blkio_lock
) {
580 ret
= blkio_map_mem_region(s
->blkio
, ®ion
);
584 error_setg(errp
, "Failed to add blkio mem region %p with size %zu: %s",
585 host
, size
, blkio_get_error_msg());
591 static void blkio_unregister_buf(BlockDriverState
*bs
, void *host
, size_t size
)
593 BDRVBlkioState
*s
= bs
->opaque
;
594 struct blkio_mem_region region
;
596 /* See blkio_register_buf() */
597 if (!s
->needs_mem_regions
&& s
->may_pin_mem_regions
) {
601 if (blkio_mem_region_from_host(bs
, host
, size
, ®ion
, NULL
) != BMRR_OK
) {
605 WITH_QEMU_LOCK_GUARD(&s
->blkio_lock
) {
606 blkio_unmap_mem_region(s
->blkio
, ®ion
);
610 static int blkio_io_uring_connect(BlockDriverState
*bs
, QDict
*options
,
611 int flags
, Error
**errp
)
613 const char *filename
= qdict_get_str(options
, "filename");
614 BDRVBlkioState
*s
= bs
->opaque
;
617 ret
= blkio_set_str(s
->blkio
, "path", filename
);
618 qdict_del(options
, "filename");
620 error_setg_errno(errp
, -ret
, "failed to set path: %s",
621 blkio_get_error_msg());
625 if (flags
& BDRV_O_NOCACHE
) {
626 ret
= blkio_set_bool(s
->blkio
, "direct", true);
628 error_setg_errno(errp
, -ret
, "failed to set direct: %s",
629 blkio_get_error_msg());
634 ret
= blkio_connect(s
->blkio
);
636 error_setg_errno(errp
, -ret
, "blkio_connect failed: %s",
637 blkio_get_error_msg());
644 static int blkio_nvme_io_uring_connect(BlockDriverState
*bs
, QDict
*options
,
645 int flags
, Error
**errp
)
647 const char *path
= qdict_get_try_str(options
, "path");
648 BDRVBlkioState
*s
= bs
->opaque
;
652 error_setg(errp
, "missing 'path' option");
656 ret
= blkio_set_str(s
->blkio
, "path", path
);
657 qdict_del(options
, "path");
659 error_setg_errno(errp
, -ret
, "failed to set path: %s",
660 blkio_get_error_msg());
664 if (!(flags
& BDRV_O_NOCACHE
)) {
665 error_setg(errp
, "cache.direct=off is not supported");
669 ret
= blkio_connect(s
->blkio
);
671 error_setg_errno(errp
, -ret
, "blkio_connect failed: %s",
672 blkio_get_error_msg());
679 static int blkio_virtio_blk_connect(BlockDriverState
*bs
, QDict
*options
,
680 int flags
, Error
**errp
)
682 const char *path
= qdict_get_try_str(options
, "path");
683 BDRVBlkioState
*s
= bs
->opaque
;
684 bool fd_supported
= false;
688 error_setg(errp
, "missing 'path' option");
692 if (!(flags
& BDRV_O_NOCACHE
)) {
693 error_setg(errp
, "cache.direct=off is not supported");
697 if (blkio_set_int(s
->blkio
, "fd", -1) == 0) {
702 * If the libblkio driver supports fd passing, let's always use qemu_open()
703 * to open the `path`, so we can handle fd passing from the management
704 * layer through the "/dev/fdset/N" special path.
708 * `path` can contain the path of a character device
709 * (e.g. /dev/vhost-vdpa-0 or /dev/vfio/vfio) or a unix socket.
711 * So, we should always open it with O_RDWR flag, also if BDRV_O_RDWR
712 * is not set in the open flags, because the exchange of IOCTL commands
713 * for example will fail.
715 * In order to open the device read-only, we are using the `read-only`
716 * property of the libblkio driver in blkio_open().
718 fd
= qemu_open(path
, O_RDWR
, NULL
);
721 * qemu_open() can fail if the user specifies a path that is not
722 * a file or device, for example in the case of Unix Domain Socket
723 * for the virtio-blk-vhost-user driver. In such cases let's have
724 * libblkio open the path directly.
726 fd_supported
= false;
728 ret
= blkio_set_int(s
->blkio
, "fd", fd
);
730 fd_supported
= false;
738 ret
= blkio_set_str(s
->blkio
, "path", path
);
740 error_setg_errno(errp
, -ret
, "failed to set path: %s",
741 blkio_get_error_msg());
746 ret
= blkio_connect(s
->blkio
);
747 if (ret
< 0 && fd
>= 0) {
748 /* Failed to give the FD to libblkio, close it */
754 * Before https://gitlab.com/libblkio/libblkio/-/merge_requests/208
755 * (libblkio <= v1.3.0), setting the `fd` property is not enough to check
756 * whether the driver supports the `fd` property or not. In that case,
757 * blkio_connect() will fail with -EINVAL.
758 * So let's try calling blkio_connect() again by directly setting `path`
759 * to cover this scenario.
761 if (fd_supported
&& ret
== -EINVAL
) {
763 * We need to clear the `fd` property we set previously by setting
766 ret
= blkio_set_int(s
->blkio
, "fd", -1);
768 error_setg_errno(errp
, -ret
, "failed to set fd: %s",
769 blkio_get_error_msg());
773 ret
= blkio_set_str(s
->blkio
, "path", path
);
775 error_setg_errno(errp
, -ret
, "failed to set path: %s",
776 blkio_get_error_msg());
780 ret
= blkio_connect(s
->blkio
);
784 error_setg_errno(errp
, -ret
, "blkio_connect failed: %s",
785 blkio_get_error_msg());
789 qdict_del(options
, "path");
794 static int blkio_open(BlockDriverState
*bs
, QDict
*options
, int flags
,
797 const char *blkio_driver
= bs
->drv
->protocol_name
;
798 BDRVBlkioState
*s
= bs
->opaque
;
801 ret
= blkio_create(blkio_driver
, &s
->blkio
);
803 error_setg_errno(errp
, -ret
, "blkio_create failed: %s",
804 blkio_get_error_msg());
808 if (!(flags
& BDRV_O_RDWR
)) {
809 ret
= blkio_set_bool(s
->blkio
, "read-only", true);
811 error_setg_errno(errp
, -ret
, "failed to set read-only: %s",
812 blkio_get_error_msg());
813 blkio_destroy(&s
->blkio
);
818 if (strcmp(blkio_driver
, "io_uring") == 0) {
819 ret
= blkio_io_uring_connect(bs
, options
, flags
, errp
);
820 } else if (strcmp(blkio_driver
, "nvme-io_uring") == 0) {
821 ret
= blkio_nvme_io_uring_connect(bs
, options
, flags
, errp
);
822 } else if (strcmp(blkio_driver
, "virtio-blk-vfio-pci") == 0) {
823 ret
= blkio_virtio_blk_connect(bs
, options
, flags
, errp
);
824 } else if (strcmp(blkio_driver
, "virtio-blk-vhost-user") == 0) {
825 ret
= blkio_virtio_blk_connect(bs
, options
, flags
, errp
);
826 } else if (strcmp(blkio_driver
, "virtio-blk-vhost-vdpa") == 0) {
827 ret
= blkio_virtio_blk_connect(bs
, options
, flags
, errp
);
829 g_assert_not_reached();
832 blkio_destroy(&s
->blkio
);
836 ret
= blkio_get_bool(s
->blkio
,
838 &s
->needs_mem_regions
);
840 error_setg_errno(errp
, -ret
,
841 "failed to get needs-mem-regions: %s",
842 blkio_get_error_msg());
843 blkio_destroy(&s
->blkio
);
847 ret
= blkio_get_bool(s
->blkio
,
848 "needs-mem-region-fd",
849 &s
->needs_mem_region_fd
);
851 error_setg_errno(errp
, -ret
,
852 "failed to get needs-mem-region-fd: %s",
853 blkio_get_error_msg());
854 blkio_destroy(&s
->blkio
);
858 ret
= blkio_get_uint64(s
->blkio
,
859 "mem-region-alignment",
860 &s
->mem_region_alignment
);
862 error_setg_errno(errp
, -ret
,
863 "failed to get mem-region-alignment: %s",
864 blkio_get_error_msg());
865 blkio_destroy(&s
->blkio
);
869 ret
= blkio_get_bool(s
->blkio
,
870 "may-pin-mem-regions",
871 &s
->may_pin_mem_regions
);
873 /* Be conservative (assume pinning) if the property is not supported */
874 s
->may_pin_mem_regions
= s
->needs_mem_regions
;
878 * Notify if libblkio drivers pin memory and prevent features like
879 * virtio-mem from working.
881 if (s
->may_pin_mem_regions
) {
882 ret
= ram_block_discard_disable(true);
884 error_setg_errno(errp
, -ret
, "ram_block_discard_disable() failed");
885 blkio_destroy(&s
->blkio
);
890 ret
= blkio_start(s
->blkio
);
892 error_setg_errno(errp
, -ret
, "blkio_start failed: %s",
893 blkio_get_error_msg());
894 blkio_destroy(&s
->blkio
);
895 if (s
->may_pin_mem_regions
) {
896 ram_block_discard_disable(false);
901 bs
->supported_write_flags
= BDRV_REQ_FUA
| BDRV_REQ_REGISTERED_BUF
;
902 bs
->supported_zero_flags
= BDRV_REQ_MAY_UNMAP
| BDRV_REQ_NO_FALLBACK
;
903 #ifdef CONFIG_BLKIO_WRITE_ZEROS_FUA
904 bs
->supported_zero_flags
|= BDRV_REQ_FUA
;
907 qemu_mutex_init(&s
->blkio_lock
);
908 qemu_co_mutex_init(&s
->bounce_lock
);
909 qemu_co_queue_init(&s
->bounce_available
);
910 QLIST_INIT(&s
->bounce_bufs
);
911 s
->blkioq
= blkio_get_queue(s
->blkio
, 0);
912 s
->completion_fd
= blkioq_get_completion_fd(s
->blkioq
);
913 blkioq_set_completion_fd_enabled(s
->blkioq
, true);
915 blkio_attach_aio_context(bs
, bdrv_get_aio_context(bs
));
919 static void blkio_close(BlockDriverState
*bs
)
921 BDRVBlkioState
*s
= bs
->opaque
;
923 /* There is no destroy() API for s->bounce_lock */
925 qemu_mutex_destroy(&s
->blkio_lock
);
926 blkio_detach_aio_context(bs
);
927 blkio_destroy(&s
->blkio
);
929 if (s
->may_pin_mem_regions
) {
930 ram_block_discard_disable(false);
934 static int64_t coroutine_fn
blkio_co_getlength(BlockDriverState
*bs
)
936 BDRVBlkioState
*s
= bs
->opaque
;
940 WITH_QEMU_LOCK_GUARD(&s
->blkio_lock
) {
941 ret
= blkio_get_uint64(s
->blkio
, "capacity", &capacity
);
950 static int coroutine_fn
blkio_truncate(BlockDriverState
*bs
, int64_t offset
,
951 bool exact
, PreallocMode prealloc
,
952 BdrvRequestFlags flags
, Error
**errp
)
954 int64_t current_length
;
956 if (prealloc
!= PREALLOC_MODE_OFF
) {
957 error_setg(errp
, "Unsupported preallocation mode '%s'",
958 PreallocMode_str(prealloc
));
962 current_length
= blkio_co_getlength(bs
);
964 if (offset
> current_length
) {
965 error_setg(errp
, "Cannot grow device");
967 } else if (exact
&& offset
!= current_length
) {
968 error_setg(errp
, "Cannot resize device");
975 static int coroutine_fn
976 blkio_co_get_info(BlockDriverState
*bs
, BlockDriverInfo
*bdi
)
981 static void blkio_refresh_limits(BlockDriverState
*bs
, Error
**errp
)
983 BDRVBlkioState
*s
= bs
->opaque
;
984 QEMU_LOCK_GUARD(&s
->blkio_lock
);
988 ret
= blkio_get_int(s
->blkio
, "request-alignment", &value
);
990 error_setg_errno(errp
, -ret
, "failed to get \"request-alignment\": %s",
991 blkio_get_error_msg());
994 bs
->bl
.request_alignment
= value
;
995 if (bs
->bl
.request_alignment
< 1 ||
996 bs
->bl
.request_alignment
>= INT_MAX
||
997 !is_power_of_2(bs
->bl
.request_alignment
)) {
998 error_setg(errp
, "invalid \"request-alignment\" value %" PRIu32
", "
999 "must be a power of 2 less than INT_MAX",
1000 bs
->bl
.request_alignment
);
1004 ret
= blkio_get_int(s
->blkio
, "optimal-io-size", &value
);
1006 error_setg_errno(errp
, -ret
, "failed to get \"optimal-io-size\": %s",
1007 blkio_get_error_msg());
1010 bs
->bl
.opt_transfer
= value
;
1011 if (bs
->bl
.opt_transfer
> INT_MAX
||
1012 (bs
->bl
.opt_transfer
% bs
->bl
.request_alignment
)) {
1013 error_setg(errp
, "invalid \"optimal-io-size\" value %" PRIu32
", must "
1014 "be a multiple of %" PRIu32
, bs
->bl
.opt_transfer
,
1015 bs
->bl
.request_alignment
);
1019 ret
= blkio_get_int(s
->blkio
, "max-transfer", &value
);
1021 error_setg_errno(errp
, -ret
, "failed to get \"max-transfer\": %s",
1022 blkio_get_error_msg());
1025 bs
->bl
.max_transfer
= value
;
1026 if ((bs
->bl
.max_transfer
% bs
->bl
.request_alignment
) ||
1027 (bs
->bl
.opt_transfer
&& (bs
->bl
.max_transfer
% bs
->bl
.opt_transfer
))) {
1028 error_setg(errp
, "invalid \"max-transfer\" value %" PRIu32
", must be "
1029 "a multiple of %" PRIu32
" and %" PRIu32
" (if non-zero)",
1030 bs
->bl
.max_transfer
, bs
->bl
.request_alignment
,
1031 bs
->bl
.opt_transfer
);
1035 ret
= blkio_get_int(s
->blkio
, "buf-alignment", &value
);
1037 error_setg_errno(errp
, -ret
, "failed to get \"buf-alignment\": %s",
1038 blkio_get_error_msg());
1042 error_setg(errp
, "invalid \"buf-alignment\" value %d, must be "
1046 bs
->bl
.min_mem_alignment
= value
;
1048 ret
= blkio_get_int(s
->blkio
, "optimal-buf-alignment", &value
);
1050 error_setg_errno(errp
, -ret
,
1051 "failed to get \"optimal-buf-alignment\": %s",
1052 blkio_get_error_msg());
1056 error_setg(errp
, "invalid \"optimal-buf-alignment\" value %d, "
1057 "must be positive", value
);
1060 bs
->bl
.opt_mem_alignment
= value
;
1062 ret
= blkio_get_int(s
->blkio
, "max-segments", &value
);
1064 error_setg_errno(errp
, -ret
, "failed to get \"max-segments\": %s",
1065 blkio_get_error_msg());
1069 error_setg(errp
, "invalid \"max-segments\" value %d, must be positive",
1073 bs
->bl
.max_iov
= value
;
1078 * Missing libblkio APIs:
1080 * - co_invalidate_cache
/*
 * BlockDriver fields shared by every libblkio-backed driver, for use inside a
 * BlockDriver initializer.
 *
 * .format_name and .protocol_name are deliberately left out because
 * module_block.py does not parse macros in the source code; each driver must
 * spell them out itself.
 *
 * Fields are grouped as: instance lifecycle, AioContext handling, buffer
 * registration, metadata queries, then the I/O entry points.
 */
#define BLKIO_DRIVER_COMMON \
    .instance_size           = sizeof(BDRVBlkioState), \
    .bdrv_open               = blkio_open, \
    .bdrv_close              = blkio_close, \
    .bdrv_attach_aio_context = blkio_attach_aio_context, \
    .bdrv_detach_aio_context = blkio_detach_aio_context, \
    .bdrv_register_buf       = blkio_register_buf, \
    .bdrv_unregister_buf     = blkio_unregister_buf, \
    .bdrv_co_getlength       = blkio_co_getlength, \
    .bdrv_co_truncate        = blkio_truncate, \
    .bdrv_co_get_info        = blkio_co_get_info, \
    .bdrv_refresh_limits     = blkio_refresh_limits, \
    .bdrv_co_preadv          = blkio_co_preadv, \
    .bdrv_co_pwritev         = blkio_co_pwritev, \
    .bdrv_co_pwrite_zeroes   = blkio_co_pwrite_zeroes, \
    .bdrv_co_pdiscard        = blkio_co_pdiscard, \
    .bdrv_co_flush_to_disk   = blkio_co_flush,
1110 * Use the same .format_name and .protocol_name as the libblkio driver name for
1114 static BlockDriver bdrv_io_uring
= {
1115 .format_name
= "io_uring",
1116 .protocol_name
= "io_uring",
1117 .bdrv_needs_filename
= true,
1121 static BlockDriver bdrv_nvme_io_uring
= {
1122 .format_name
= "nvme-io_uring",
1123 .protocol_name
= "nvme-io_uring",
1127 static BlockDriver bdrv_virtio_blk_vfio_pci
= {
1128 .format_name
= "virtio-blk-vfio-pci",
1129 .protocol_name
= "virtio-blk-vfio-pci",
1133 static BlockDriver bdrv_virtio_blk_vhost_user
= {
1134 .format_name
= "virtio-blk-vhost-user",
1135 .protocol_name
= "virtio-blk-vhost-user",
1139 static BlockDriver bdrv_virtio_blk_vhost_vdpa
= {
1140 .format_name
= "virtio-blk-vhost-vdpa",
1141 .protocol_name
= "virtio-blk-vhost-vdpa",
1145 static void bdrv_blkio_init(void)
1147 bdrv_register(&bdrv_io_uring
);
1148 bdrv_register(&bdrv_nvme_io_uring
);
1149 bdrv_register(&bdrv_virtio_blk_vfio_pci
);
1150 bdrv_register(&bdrv_virtio_blk_vhost_user
);
1151 bdrv_register(&bdrv_virtio_blk_vhost_vdpa
);
1154 block_init(bdrv_blkio_init
);