/*
 * copy-before-write filter driver
 *
 * The driver performs Copy-Before-Write (CBW) operation: it is injected above
 * some node, and before each write it copies _old_ data to the target node.
 *
 * Copyright (c) 2018-2021 Virtuozzo International GmbH.
 *
 * Sementsov-Ogievskiy Vladimir <vsementsov@virtuozzo.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */
26 #include "qemu/osdep.h"
27 #include "qapi/qmp/qjson.h"
29 #include "sysemu/block-backend.h"
30 #include "qemu/cutils.h"
31 #include "qapi/error.h"
32 #include "block/block_int.h"
33 #include "block/qdict.h"
34 #include "block/block-copy.h"
35 #include "block/dirty-bitmap.h"
37 #include "block/copy-before-write.h"
38 #include "block/reqlist.h"
40 #include "qapi/qapi-visit-block-core.h"
42 typedef struct BDRVCopyBeforeWriteState
{
45 OnCbwError on_cbw_error
;
46 uint64_t cbw_timeout_ns
;
50 * @lock: protects access to @access_bitmap, @done_bitmap and
56 * @access_bitmap: represents areas allowed for reading by fleecing user.
57 * Reading from non-dirty areas leads to -EACCES.
59 BdrvDirtyBitmap
*access_bitmap
;
62 * @done_bitmap: represents areas that was successfully copied to @target by
63 * copy-before-write operations.
65 BdrvDirtyBitmap
*done_bitmap
;
68 * @frozen_read_reqs: current read requests for fleecing user in bs->file
69 * node. These areas must not be rewritten by guest. There can be multiple
70 * overlapping read requests.
72 BlockReqList frozen_read_reqs
;
75 * @snapshot_error is normally zero. But on first copy-before-write failure
76 * when @on_cbw_error == ON_CBW_ERROR_BREAK_SNAPSHOT, @snapshot_error takes
77 * value of this error (<0). After that all in-flight and further
78 * snapshot-API requests will fail with that error.
81 } BDRVCopyBeforeWriteState
;
83 static int coroutine_fn GRAPH_RDLOCK
84 cbw_co_preadv(BlockDriverState
*bs
, int64_t offset
, int64_t bytes
,
85 QEMUIOVector
*qiov
, BdrvRequestFlags flags
)
87 return bdrv_co_preadv(bs
->file
, offset
, bytes
, qiov
, flags
);
90 static void block_copy_cb(void *opaque
)
92 BlockDriverState
*bs
= opaque
;
94 bdrv_dec_in_flight(bs
);
98 * Do copy-before-write operation.
100 * On failure guest request must be failed too.
102 * On success, we also wait for all in-flight fleecing read requests in source
103 * node, and it's guaranteed that after cbw_do_copy_before_write() successful
104 * return there are no such requests and they will never appear.
106 static coroutine_fn
int cbw_do_copy_before_write(BlockDriverState
*bs
,
107 uint64_t offset
, uint64_t bytes
, BdrvRequestFlags flags
)
109 BDRVCopyBeforeWriteState
*s
= bs
->opaque
;
112 int64_t cluster_size
= block_copy_cluster_size(s
->bcs
);
114 if (flags
& BDRV_REQ_WRITE_UNCHANGED
) {
118 if (s
->snapshot_error
) {
122 off
= QEMU_ALIGN_DOWN(offset
, cluster_size
);
123 end
= QEMU_ALIGN_UP(offset
+ bytes
, cluster_size
);
126 * Increase in_flight, so that in case of timed-out block-copy, the
127 * remaining background block_copy() request (which can't be immediately
128 * cancelled by timeout) is presented in bs->in_flight. This way we are
129 * sure that on bs close() we'll previously wait for all timed-out but yet
130 * running block_copy calls.
132 bdrv_inc_in_flight(bs
);
133 ret
= block_copy(s
->bcs
, off
, end
- off
, true, s
->cbw_timeout_ns
,
135 if (ret
< 0 && s
->on_cbw_error
== ON_CBW_ERROR_BREAK_GUEST_WRITE
) {
139 WITH_QEMU_LOCK_GUARD(&s
->lock
) {
141 assert(s
->on_cbw_error
== ON_CBW_ERROR_BREAK_SNAPSHOT
);
142 if (!s
->snapshot_error
) {
143 s
->snapshot_error
= ret
;
146 bdrv_set_dirty_bitmap(s
->done_bitmap
, off
, end
- off
);
148 reqlist_wait_all(&s
->frozen_read_reqs
, off
, end
- off
, &s
->lock
);
154 static int coroutine_fn GRAPH_RDLOCK
155 cbw_co_pdiscard(BlockDriverState
*bs
, int64_t offset
, int64_t bytes
)
157 int ret
= cbw_do_copy_before_write(bs
, offset
, bytes
, 0);
162 return bdrv_co_pdiscard(bs
->file
, offset
, bytes
);
165 static int coroutine_fn GRAPH_RDLOCK
166 cbw_co_pwrite_zeroes(BlockDriverState
*bs
, int64_t offset
, int64_t bytes
,
167 BdrvRequestFlags flags
)
169 int ret
= cbw_do_copy_before_write(bs
, offset
, bytes
, flags
);
174 return bdrv_co_pwrite_zeroes(bs
->file
, offset
, bytes
, flags
);
177 static coroutine_fn GRAPH_RDLOCK
178 int cbw_co_pwritev(BlockDriverState
*bs
, int64_t offset
, int64_t bytes
,
179 QEMUIOVector
*qiov
, BdrvRequestFlags flags
)
181 int ret
= cbw_do_copy_before_write(bs
, offset
, bytes
, flags
);
186 return bdrv_co_pwritev(bs
->file
, offset
, bytes
, qiov
, flags
);
189 static int coroutine_fn GRAPH_RDLOCK
cbw_co_flush(BlockDriverState
*bs
)
195 return bdrv_co_flush(bs
->file
->bs
);
199 * If @offset not accessible - return NULL.
201 * Otherwise, set @pnum to some bytes that accessible from @file (@file is set
202 * to bs->file or to s->target). Return newly allocated BlockReq object that
203 * should be than passed to cbw_snapshot_read_unlock().
205 * It's guaranteed that guest writes will not interact in the region until
206 * cbw_snapshot_read_unlock() called.
208 static BlockReq
* coroutine_fn GRAPH_RDLOCK
209 cbw_snapshot_read_lock(BlockDriverState
*bs
, int64_t offset
, int64_t bytes
,
210 int64_t *pnum
, BdrvChild
**file
)
212 BDRVCopyBeforeWriteState
*s
= bs
->opaque
;
213 BlockReq
*req
= g_new(BlockReq
, 1);
216 QEMU_LOCK_GUARD(&s
->lock
);
218 if (s
->snapshot_error
) {
223 if (bdrv_dirty_bitmap_next_zero(s
->access_bitmap
, offset
, bytes
) != -1) {
228 done
= bdrv_dirty_bitmap_status(s
->done_bitmap
, offset
, bytes
, pnum
);
231 * Special invalid BlockReq, that is handled in
232 * cbw_snapshot_read_unlock(). We don't need to lock something to read
235 *req
= (BlockReq
) {.offset
= -1, .bytes
= -1};
238 reqlist_init_req(&s
->frozen_read_reqs
, req
, offset
, bytes
);
245 static coroutine_fn
void
246 cbw_snapshot_read_unlock(BlockDriverState
*bs
, BlockReq
*req
)
248 BDRVCopyBeforeWriteState
*s
= bs
->opaque
;
250 if (req
->offset
== -1 && req
->bytes
== -1) {
255 QEMU_LOCK_GUARD(&s
->lock
);
257 reqlist_remove_req(req
);
261 static int coroutine_fn GRAPH_RDLOCK
262 cbw_co_preadv_snapshot(BlockDriverState
*bs
, int64_t offset
, int64_t bytes
,
263 QEMUIOVector
*qiov
, size_t qiov_offset
)
269 /* TODO: upgrade to async loop using AioTask */
273 req
= cbw_snapshot_read_lock(bs
, offset
, bytes
, &cur_bytes
, &file
);
278 ret
= bdrv_co_preadv_part(file
, offset
, cur_bytes
,
279 qiov
, qiov_offset
, 0);
280 cbw_snapshot_read_unlock(bs
, req
);
287 qiov_offset
+= cur_bytes
;
293 static int coroutine_fn GRAPH_RDLOCK
294 cbw_co_snapshot_block_status(BlockDriverState
*bs
,
295 bool want_zero
, int64_t offset
, int64_t bytes
,
296 int64_t *pnum
, int64_t *map
,
297 BlockDriverState
**file
)
299 BDRVCopyBeforeWriteState
*s
= bs
->opaque
;
305 req
= cbw_snapshot_read_lock(bs
, offset
, bytes
, &cur_bytes
, &child
);
310 ret
= bdrv_co_block_status(child
->bs
, offset
, cur_bytes
, pnum
, map
, file
);
311 if (child
== s
->target
) {
313 * We refer to s->target only for areas that we've written to it.
314 * And we can not report unallocated blocks in s->target: this will
315 * break generic block-status-above logic, that will go to
316 * copy-before-write filtered child in this case.
318 assert(ret
& BDRV_BLOCK_ALLOCATED
);
321 cbw_snapshot_read_unlock(bs
, req
);
326 static int coroutine_fn GRAPH_RDLOCK
327 cbw_co_pdiscard_snapshot(BlockDriverState
*bs
, int64_t offset
, int64_t bytes
)
329 BDRVCopyBeforeWriteState
*s
= bs
->opaque
;
330 uint32_t cluster_size
= block_copy_cluster_size(s
->bcs
);
331 int64_t aligned_offset
= QEMU_ALIGN_UP(offset
, cluster_size
);
332 int64_t aligned_end
= QEMU_ALIGN_DOWN(offset
+ bytes
, cluster_size
);
333 int64_t aligned_bytes
;
335 if (aligned_end
<= aligned_offset
) {
338 aligned_bytes
= aligned_end
- aligned_offset
;
340 WITH_QEMU_LOCK_GUARD(&s
->lock
) {
341 bdrv_reset_dirty_bitmap(s
->access_bitmap
, aligned_offset
,
345 block_copy_reset(s
->bcs
, aligned_offset
, aligned_bytes
);
347 return bdrv_co_pdiscard(s
->target
, aligned_offset
, aligned_bytes
);
350 static void GRAPH_RDLOCK
cbw_refresh_filename(BlockDriverState
*bs
)
352 pstrcpy(bs
->exact_filename
, sizeof(bs
->exact_filename
),
353 bs
->file
->bs
->filename
);
356 static void GRAPH_RDLOCK
357 cbw_child_perm(BlockDriverState
*bs
, BdrvChild
*c
, BdrvChildRole role
,
358 BlockReopenQueue
*reopen_queue
,
359 uint64_t perm
, uint64_t shared
,
360 uint64_t *nperm
, uint64_t *nshared
)
362 BDRVCopyBeforeWriteState
*s
= bs
->opaque
;
364 if (!(role
& BDRV_CHILD_FILTERED
)) {
368 * Share write to target (child_file), to not interfere
369 * with guest writes to its disk which may be in target backing chain.
370 * Can't resize during a backup block job because we check the size
373 *nshared
= BLK_PERM_ALL
& ~BLK_PERM_RESIZE
;
374 *nperm
= BLK_PERM_WRITE
;
377 bdrv_default_perms(bs
, c
, role
, reopen_queue
,
378 perm
, shared
, nperm
, nshared
);
380 if (!QLIST_EMPTY(&bs
->parents
)) {
382 * Note, that source child may be shared with backup job. Backup job
383 * does create own blk parent on copy-before-write node, so this
384 * works even if source node does not have any parents before backup
387 *nperm
= *nperm
| BLK_PERM_CONSISTENT_READ
;
388 if (s
->discard_source
) {
389 *nperm
= *nperm
| BLK_PERM_WRITE
;
392 *nshared
&= ~(BLK_PERM_WRITE
| BLK_PERM_RESIZE
);
397 static BlockdevOptions
*cbw_parse_options(QDict
*options
, Error
**errp
)
399 BlockdevOptions
*opts
= NULL
;
402 qdict_put_str(options
, "driver", "copy-before-write");
404 v
= qobject_input_visitor_new_flat_confused(options
, errp
);
409 visit_type_BlockdevOptions(v
, NULL
, &opts
, errp
);
415 * Delete options which we are going to parse through BlockdevOptions
416 * object for original options.
418 qdict_extract_subqdict(options
, NULL
, "bitmap");
419 qdict_del(options
, "on-cbw-error");
420 qdict_del(options
, "cbw-timeout");
421 qdict_del(options
, "min-cluster-size");
425 qdict_del(options
, "driver");
430 static int cbw_open(BlockDriverState
*bs
, QDict
*options
, int flags
,
434 BDRVCopyBeforeWriteState
*s
= bs
->opaque
;
435 BdrvDirtyBitmap
*bitmap
= NULL
;
436 int64_t cluster_size
;
437 g_autoptr(BlockdevOptions
) full_opts
= NULL
;
438 BlockdevOptionsCbw
*opts
;
441 full_opts
= cbw_parse_options(options
, errp
);
445 assert(full_opts
->driver
== BLOCKDEV_DRIVER_COPY_BEFORE_WRITE
);
446 opts
= &full_opts
->u
.copy_before_write
;
448 ret
= bdrv_open_file_child(NULL
, options
, "file", bs
, errp
);
453 s
->target
= bdrv_open_child(NULL
, options
, "target", bs
, &child_of_bds
,
454 BDRV_CHILD_DATA
, false, errp
);
459 GRAPH_RDLOCK_GUARD_MAINLOOP();
462 bitmap
= block_dirty_bitmap_lookup(opts
->bitmap
->node
,
463 opts
->bitmap
->name
, NULL
, errp
);
468 s
->on_cbw_error
= opts
->has_on_cbw_error
? opts
->on_cbw_error
:
469 ON_CBW_ERROR_BREAK_GUEST_WRITE
;
470 s
->cbw_timeout_ns
= opts
->has_cbw_timeout
?
471 opts
->cbw_timeout
* NANOSECONDS_PER_SECOND
: 0;
473 bs
->total_sectors
= bs
->file
->bs
->total_sectors
;
474 bs
->supported_write_flags
= BDRV_REQ_WRITE_UNCHANGED
|
475 (BDRV_REQ_FUA
& bs
->file
->bs
->supported_write_flags
);
476 bs
->supported_zero_flags
= BDRV_REQ_WRITE_UNCHANGED
|
477 ((BDRV_REQ_FUA
| BDRV_REQ_MAY_UNMAP
| BDRV_REQ_NO_FALLBACK
) &
478 bs
->file
->bs
->supported_zero_flags
);
480 s
->discard_source
= flags
& BDRV_O_CBW_DISCARD_SOURCE
;
482 s
->bcs
= block_copy_state_new(bs
->file
, s
->target
, bs
, bitmap
,
483 flags
& BDRV_O_CBW_DISCARD_SOURCE
,
484 opts
->min_cluster_size
, errp
);
486 error_prepend(errp
, "Cannot create block-copy-state: ");
490 cluster_size
= block_copy_cluster_size(s
->bcs
);
492 s
->done_bitmap
= bdrv_create_dirty_bitmap(bs
, cluster_size
, NULL
, errp
);
493 if (!s
->done_bitmap
) {
496 bdrv_disable_dirty_bitmap(s
->done_bitmap
);
498 /* s->access_bitmap starts equal to bcs bitmap */
499 s
->access_bitmap
= bdrv_create_dirty_bitmap(bs
, cluster_size
, NULL
, errp
);
500 if (!s
->access_bitmap
) {
503 bdrv_disable_dirty_bitmap(s
->access_bitmap
);
504 bdrv_dirty_bitmap_merge_internal(s
->access_bitmap
,
505 block_copy_dirty_bitmap(s
->bcs
), NULL
,
508 qemu_co_mutex_init(&s
->lock
);
509 QLIST_INIT(&s
->frozen_read_reqs
);
513 static void cbw_close(BlockDriverState
*bs
)
515 BDRVCopyBeforeWriteState
*s
= bs
->opaque
;
517 bdrv_release_dirty_bitmap(s
->access_bitmap
);
518 bdrv_release_dirty_bitmap(s
->done_bitmap
);
520 block_copy_state_free(s
->bcs
);
524 static BlockDriver bdrv_cbw_filter
= {
525 .format_name
= "copy-before-write",
526 .instance_size
= sizeof(BDRVCopyBeforeWriteState
),
528 .bdrv_open
= cbw_open
,
529 .bdrv_close
= cbw_close
,
531 .bdrv_co_preadv
= cbw_co_preadv
,
532 .bdrv_co_pwritev
= cbw_co_pwritev
,
533 .bdrv_co_pwrite_zeroes
= cbw_co_pwrite_zeroes
,
534 .bdrv_co_pdiscard
= cbw_co_pdiscard
,
535 .bdrv_co_flush
= cbw_co_flush
,
537 .bdrv_co_preadv_snapshot
= cbw_co_preadv_snapshot
,
538 .bdrv_co_pdiscard_snapshot
= cbw_co_pdiscard_snapshot
,
539 .bdrv_co_snapshot_block_status
= cbw_co_snapshot_block_status
,
541 .bdrv_refresh_filename
= cbw_refresh_filename
,
543 .bdrv_child_perm
= cbw_child_perm
,
548 BlockDriverState
*bdrv_cbw_append(BlockDriverState
*source
,
549 BlockDriverState
*target
,
550 const char *filter_node_name
,
552 uint64_t min_cluster_size
,
553 BlockCopyState
**bcs
,
556 BDRVCopyBeforeWriteState
*state
;
557 BlockDriverState
*top
;
559 int flags
= BDRV_O_RDWR
| (discard_source
? BDRV_O_CBW_DISCARD_SOURCE
: 0);
561 assert(source
->total_sectors
== target
->total_sectors
);
565 qdict_put_str(opts
, "driver", "copy-before-write");
566 if (filter_node_name
) {
567 qdict_put_str(opts
, "node-name", filter_node_name
);
569 qdict_put_str(opts
, "file", bdrv_get_node_name(source
));
570 qdict_put_str(opts
, "target", bdrv_get_node_name(target
));
572 if (min_cluster_size
> INT64_MAX
) {
573 error_setg(errp
, "min-cluster-size too large: %" PRIu64
" > %" PRIi64
,
574 min_cluster_size
, INT64_MAX
);
578 qdict_put_int(opts
, "min-cluster-size", (int64_t)min_cluster_size
);
580 top
= bdrv_insert_node(source
, opts
, flags
, errp
);
591 void bdrv_cbw_drop(BlockDriverState
*bs
)
594 bdrv_drop_filter(bs
, &error_abort
);
598 static void cbw_init(void)
600 bdrv_register(&bdrv_cbw_filter
);
603 block_init(cbw_init
);