1 // SPDX-License-Identifier: GPL-2.0
3 * Copyright (C) STRATO AG 2012. All rights reserved.
6 #include <linux/sched.h>
8 #include <linux/slab.h>
9 #include <linux/blkdev.h>
10 #include <linux/kthread.h>
11 #include <linux/math64.h>
14 #include "extent_map.h"
16 #include "transaction.h"
17 #include "print-tree.h"
19 #include "async-thread.h"
20 #include "check-integrity.h"
21 #include "rcu-string.h"
22 #include "dev-replace.h"
27 * Device replace overview
30 * To copy all extents (both new and on-disk) from source device to target
31 * device, while still keeping the filesystem read-write.
34 * There are two main methods involved:
38 * All new writes will be written to both target and source devices, so even
39 * if replace gets canceled, the source device still contains up-to-date data.
41 * Location: handle_ops_on_dev_replace() from __btrfs_map_block()
42 * Start: btrfs_dev_replace_start()
43 * End: btrfs_dev_replace_finishing()
44 * Content: Latest data/metadata
46 * - Copy existing extents
48 * This happens by re-using scrub facility, as scrub also iterates through
49 * existing extents from commit root.
51 * Location: scrub_write_block_to_dev_replace() from
52 * scrub_block_complete()
53 * Content: Data/meta from commit root.
55 * Due to the content difference, we need to avoid nocow write when dev-replace
56 * is happening. This is done by marking the block group read-only and waiting
59 * After replace is done, the finishing part is done by swapping the target and
62 * Location: btrfs_dev_replace_update_device_in_mapping_tree() from
63 * btrfs_dev_replace_finishing()
66 static int btrfs_dev_replace_finishing(struct btrfs_fs_info
*fs_info
,
68 static int btrfs_dev_replace_kthread(void *data
);
70 int btrfs_init_dev_replace(struct btrfs_fs_info
*fs_info
)
73 struct btrfs_root
*dev_root
= fs_info
->dev_root
;
74 struct btrfs_dev_replace
*dev_replace
= &fs_info
->dev_replace
;
75 struct extent_buffer
*eb
;
78 struct btrfs_path
*path
= NULL
;
80 struct btrfs_dev_replace_item
*ptr
;
83 path
= btrfs_alloc_path();
90 key
.type
= BTRFS_DEV_REPLACE_KEY
;
92 ret
= btrfs_search_slot(NULL
, dev_root
, &key
, path
, 0, 0);
94 no_valid_dev_replace_entry_found
:
96 * We don't have a replace item or it's corrupted. If there is
97 * a replace target, fail the mount.
99 if (btrfs_find_device(fs_info
->fs_devices
,
100 BTRFS_DEV_REPLACE_DEVID
, NULL
, NULL
)) {
102 "found replace target device without a valid replace item");
107 dev_replace
->replace_state
=
108 BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED
;
109 dev_replace
->cont_reading_from_srcdev_mode
=
110 BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_ALWAYS
;
111 dev_replace
->time_started
= 0;
112 dev_replace
->time_stopped
= 0;
113 atomic64_set(&dev_replace
->num_write_errors
, 0);
114 atomic64_set(&dev_replace
->num_uncorrectable_read_errors
, 0);
115 dev_replace
->cursor_left
= 0;
116 dev_replace
->committed_cursor_left
= 0;
117 dev_replace
->cursor_left_last_write_of_item
= 0;
118 dev_replace
->cursor_right
= 0;
119 dev_replace
->srcdev
= NULL
;
120 dev_replace
->tgtdev
= NULL
;
121 dev_replace
->is_valid
= 0;
122 dev_replace
->item_needs_writeback
= 0;
125 slot
= path
->slots
[0];
127 item_size
= btrfs_item_size_nr(eb
, slot
);
128 ptr
= btrfs_item_ptr(eb
, slot
, struct btrfs_dev_replace_item
);
130 if (item_size
!= sizeof(struct btrfs_dev_replace_item
)) {
132 "dev_replace entry found has unexpected size, ignore entry");
133 goto no_valid_dev_replace_entry_found
;
136 src_devid
= btrfs_dev_replace_src_devid(eb
, ptr
);
137 dev_replace
->cont_reading_from_srcdev_mode
=
138 btrfs_dev_replace_cont_reading_from_srcdev_mode(eb
, ptr
);
139 dev_replace
->replace_state
= btrfs_dev_replace_replace_state(eb
, ptr
);
140 dev_replace
->time_started
= btrfs_dev_replace_time_started(eb
, ptr
);
141 dev_replace
->time_stopped
=
142 btrfs_dev_replace_time_stopped(eb
, ptr
);
143 atomic64_set(&dev_replace
->num_write_errors
,
144 btrfs_dev_replace_num_write_errors(eb
, ptr
));
145 atomic64_set(&dev_replace
->num_uncorrectable_read_errors
,
146 btrfs_dev_replace_num_uncorrectable_read_errors(eb
, ptr
));
147 dev_replace
->cursor_left
= btrfs_dev_replace_cursor_left(eb
, ptr
);
148 dev_replace
->committed_cursor_left
= dev_replace
->cursor_left
;
149 dev_replace
->cursor_left_last_write_of_item
= dev_replace
->cursor_left
;
150 dev_replace
->cursor_right
= btrfs_dev_replace_cursor_right(eb
, ptr
);
151 dev_replace
->is_valid
= 1;
153 dev_replace
->item_needs_writeback
= 0;
154 switch (dev_replace
->replace_state
) {
155 case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED
:
156 case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED
:
157 case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED
:
159 * We don't have an active replace item but if there is a
160 * replace target, fail the mount.
162 if (btrfs_find_device(fs_info
->fs_devices
,
163 BTRFS_DEV_REPLACE_DEVID
, NULL
, NULL
)) {
165 "replace devid present without an active replace item");
168 dev_replace
->srcdev
= NULL
;
169 dev_replace
->tgtdev
= NULL
;
172 case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED
:
173 case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED
:
174 dev_replace
->srcdev
= btrfs_find_device(fs_info
->fs_devices
,
175 src_devid
, NULL
, NULL
);
176 dev_replace
->tgtdev
= btrfs_find_device(fs_info
->fs_devices
,
177 BTRFS_DEV_REPLACE_DEVID
,
180 * allow 'btrfs dev replace_cancel' if src/tgt device is
183 if (!dev_replace
->srcdev
&&
184 !btrfs_test_opt(fs_info
, DEGRADED
)) {
187 "cannot mount because device replace operation is ongoing and");
189 "srcdev (devid %llu) is missing, need to run 'btrfs dev scan'?",
192 if (!dev_replace
->tgtdev
&&
193 !btrfs_test_opt(fs_info
, DEGRADED
)) {
196 "cannot mount because device replace operation is ongoing and");
198 "tgtdev (devid %llu) is missing, need to run 'btrfs dev scan'?",
199 BTRFS_DEV_REPLACE_DEVID
);
201 if (dev_replace
->tgtdev
) {
202 if (dev_replace
->srcdev
) {
203 dev_replace
->tgtdev
->total_bytes
=
204 dev_replace
->srcdev
->total_bytes
;
205 dev_replace
->tgtdev
->disk_total_bytes
=
206 dev_replace
->srcdev
->disk_total_bytes
;
207 dev_replace
->tgtdev
->commit_total_bytes
=
208 dev_replace
->srcdev
->commit_total_bytes
;
209 dev_replace
->tgtdev
->bytes_used
=
210 dev_replace
->srcdev
->bytes_used
;
211 dev_replace
->tgtdev
->commit_bytes_used
=
212 dev_replace
->srcdev
->commit_bytes_used
;
214 set_bit(BTRFS_DEV_STATE_REPLACE_TGT
,
215 &dev_replace
->tgtdev
->dev_state
);
217 WARN_ON(fs_info
->fs_devices
->rw_devices
== 0);
218 dev_replace
->tgtdev
->io_width
= fs_info
->sectorsize
;
219 dev_replace
->tgtdev
->io_align
= fs_info
->sectorsize
;
220 dev_replace
->tgtdev
->sector_size
= fs_info
->sectorsize
;
221 dev_replace
->tgtdev
->fs_info
= fs_info
;
222 set_bit(BTRFS_DEV_STATE_IN_FS_METADATA
,
223 &dev_replace
->tgtdev
->dev_state
);
229 btrfs_free_path(path
);
234 * Initialize a new device for device replace target from a given source dev
237 * Return 0 and new device in @device_out, otherwise return < 0
239 static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info
*fs_info
,
240 const char *device_path
,
241 struct btrfs_device
*srcdev
,
242 struct btrfs_device
**device_out
)
244 struct btrfs_device
*device
;
245 struct block_device
*bdev
;
246 struct rcu_string
*name
;
247 u64 devid
= BTRFS_DEV_REPLACE_DEVID
;
251 if (srcdev
->fs_devices
->seeding
) {
252 btrfs_err(fs_info
, "the filesystem is a seed filesystem!");
256 bdev
= blkdev_get_by_path(device_path
, FMODE_WRITE
| FMODE_EXCL
,
257 fs_info
->bdev_holder
);
259 btrfs_err(fs_info
, "target device %s is invalid!", device_path
);
260 return PTR_ERR(bdev
);
263 if (!btrfs_check_device_zone_type(fs_info
, bdev
)) {
265 "dev-replace: zoned type of target device mismatch with filesystem");
272 list_for_each_entry(device
, &fs_info
->fs_devices
->devices
, dev_list
) {
273 if (device
->bdev
== bdev
) {
275 "target device is in the filesystem!");
282 if (i_size_read(bdev
->bd_inode
) <
283 btrfs_device_get_total_bytes(srcdev
)) {
285 "target device is smaller than source device!");
291 device
= btrfs_alloc_device(NULL
, &devid
, NULL
);
292 if (IS_ERR(device
)) {
293 ret
= PTR_ERR(device
);
297 name
= rcu_string_strdup(device_path
, GFP_KERNEL
);
299 btrfs_free_device(device
);
303 rcu_assign_pointer(device
->name
, name
);
305 set_bit(BTRFS_DEV_STATE_WRITEABLE
, &device
->dev_state
);
306 device
->generation
= 0;
307 device
->io_width
= fs_info
->sectorsize
;
308 device
->io_align
= fs_info
->sectorsize
;
309 device
->sector_size
= fs_info
->sectorsize
;
310 device
->total_bytes
= btrfs_device_get_total_bytes(srcdev
);
311 device
->disk_total_bytes
= btrfs_device_get_disk_total_bytes(srcdev
);
312 device
->bytes_used
= btrfs_device_get_bytes_used(srcdev
);
313 device
->commit_total_bytes
= srcdev
->commit_total_bytes
;
314 device
->commit_bytes_used
= device
->bytes_used
;
315 device
->fs_info
= fs_info
;
317 set_bit(BTRFS_DEV_STATE_IN_FS_METADATA
, &device
->dev_state
);
318 set_bit(BTRFS_DEV_STATE_REPLACE_TGT
, &device
->dev_state
);
319 device
->mode
= FMODE_EXCL
;
320 device
->dev_stats_valid
= 1;
321 set_blocksize(device
->bdev
, BTRFS_BDEV_BLOCKSIZE
);
322 device
->fs_devices
= fs_info
->fs_devices
;
324 ret
= btrfs_get_dev_zone_info(device
);
328 mutex_lock(&fs_info
->fs_devices
->device_list_mutex
);
329 list_add(&device
->dev_list
, &fs_info
->fs_devices
->devices
);
330 fs_info
->fs_devices
->num_devices
++;
331 fs_info
->fs_devices
->open_devices
++;
332 mutex_unlock(&fs_info
->fs_devices
->device_list_mutex
);
334 *device_out
= device
;
338 blkdev_put(bdev
, FMODE_EXCL
);
343 * called from commit_transaction. Writes changed device replace state to
346 int btrfs_run_dev_replace(struct btrfs_trans_handle
*trans
)
348 struct btrfs_fs_info
*fs_info
= trans
->fs_info
;
350 struct btrfs_root
*dev_root
= fs_info
->dev_root
;
351 struct btrfs_path
*path
;
352 struct btrfs_key key
;
353 struct extent_buffer
*eb
;
354 struct btrfs_dev_replace_item
*ptr
;
355 struct btrfs_dev_replace
*dev_replace
= &fs_info
->dev_replace
;
357 down_read(&dev_replace
->rwsem
);
358 if (!dev_replace
->is_valid
||
359 !dev_replace
->item_needs_writeback
) {
360 up_read(&dev_replace
->rwsem
);
363 up_read(&dev_replace
->rwsem
);
366 key
.type
= BTRFS_DEV_REPLACE_KEY
;
369 path
= btrfs_alloc_path();
374 ret
= btrfs_search_slot(trans
, dev_root
, &key
, path
, -1, 1);
377 "error %d while searching for dev_replace item!",
383 btrfs_item_size_nr(path
->nodes
[0], path
->slots
[0]) < sizeof(*ptr
)) {
385 * need to delete old one and insert a new one.
386 * Since no attempt is made to recover any old state, if the
387 * dev_replace state is 'running', the data on the target
389 * It would be possible to recover the state: just make sure
390 * that the beginning of the item is never changed and always
391 * contains all the essential information. Then read this
392 * minimal set of information and use it as a base for the
395 ret
= btrfs_del_item(trans
, dev_root
, path
);
398 "delete too small dev_replace item failed %d!",
406 /* need to insert a new item */
407 btrfs_release_path(path
);
408 ret
= btrfs_insert_empty_item(trans
, dev_root
, path
,
412 "insert dev_replace item failed %d!", ret
);
418 ptr
= btrfs_item_ptr(eb
, path
->slots
[0],
419 struct btrfs_dev_replace_item
);
421 down_write(&dev_replace
->rwsem
);
422 if (dev_replace
->srcdev
)
423 btrfs_set_dev_replace_src_devid(eb
, ptr
,
424 dev_replace
->srcdev
->devid
);
426 btrfs_set_dev_replace_src_devid(eb
, ptr
, (u64
)-1);
427 btrfs_set_dev_replace_cont_reading_from_srcdev_mode(eb
, ptr
,
428 dev_replace
->cont_reading_from_srcdev_mode
);
429 btrfs_set_dev_replace_replace_state(eb
, ptr
,
430 dev_replace
->replace_state
);
431 btrfs_set_dev_replace_time_started(eb
, ptr
, dev_replace
->time_started
);
432 btrfs_set_dev_replace_time_stopped(eb
, ptr
, dev_replace
->time_stopped
);
433 btrfs_set_dev_replace_num_write_errors(eb
, ptr
,
434 atomic64_read(&dev_replace
->num_write_errors
));
435 btrfs_set_dev_replace_num_uncorrectable_read_errors(eb
, ptr
,
436 atomic64_read(&dev_replace
->num_uncorrectable_read_errors
));
437 dev_replace
->cursor_left_last_write_of_item
=
438 dev_replace
->cursor_left
;
439 btrfs_set_dev_replace_cursor_left(eb
, ptr
,
440 dev_replace
->cursor_left_last_write_of_item
);
441 btrfs_set_dev_replace_cursor_right(eb
, ptr
,
442 dev_replace
->cursor_right
);
443 dev_replace
->item_needs_writeback
= 0;
444 up_write(&dev_replace
->rwsem
);
446 btrfs_mark_buffer_dirty(eb
);
449 btrfs_free_path(path
);
454 static char* btrfs_dev_name(struct btrfs_device
*device
)
456 if (!device
|| test_bit(BTRFS_DEV_STATE_MISSING
, &device
->dev_state
))
457 return "<missing disk>";
459 return rcu_str_deref(device
->name
);
462 static int btrfs_dev_replace_start(struct btrfs_fs_info
*fs_info
,
463 const char *tgtdev_name
, u64 srcdevid
, const char *srcdev_name
,
466 struct btrfs_root
*root
= fs_info
->dev_root
;
467 struct btrfs_trans_handle
*trans
;
468 struct btrfs_dev_replace
*dev_replace
= &fs_info
->dev_replace
;
470 struct btrfs_device
*tgt_device
= NULL
;
471 struct btrfs_device
*src_device
= NULL
;
473 src_device
= btrfs_find_device_by_devspec(fs_info
, srcdevid
,
475 if (IS_ERR(src_device
))
476 return PTR_ERR(src_device
);
478 if (btrfs_pinned_by_swapfile(fs_info
, src_device
)) {
479 btrfs_warn_in_rcu(fs_info
,
480 "cannot replace device %s (devid %llu) due to active swapfile",
481 btrfs_dev_name(src_device
), src_device
->devid
);
486 * Here we commit the transaction to make sure commit_total_bytes
487 * of all the devices are updated.
489 trans
= btrfs_attach_transaction(root
);
490 if (!IS_ERR(trans
)) {
491 ret
= btrfs_commit_transaction(trans
);
494 } else if (PTR_ERR(trans
) != -ENOENT
) {
495 return PTR_ERR(trans
);
498 ret
= btrfs_init_dev_replace_tgtdev(fs_info
, tgtdev_name
,
499 src_device
, &tgt_device
);
503 down_write(&dev_replace
->rwsem
);
504 switch (dev_replace
->replace_state
) {
505 case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED
:
506 case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED
:
507 case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED
:
509 case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED
:
510 case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED
:
512 ret
= BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED
;
513 up_write(&dev_replace
->rwsem
);
517 dev_replace
->cont_reading_from_srcdev_mode
= read_src
;
518 dev_replace
->srcdev
= src_device
;
519 dev_replace
->tgtdev
= tgt_device
;
521 btrfs_info_in_rcu(fs_info
,
522 "dev_replace from %s (devid %llu) to %s started",
523 btrfs_dev_name(src_device
),
525 rcu_str_deref(tgt_device
->name
));
528 * from now on, the writes to the srcdev are all duplicated to
529 * go to the tgtdev as well (refer to btrfs_map_block()).
531 dev_replace
->replace_state
= BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED
;
532 dev_replace
->time_started
= ktime_get_real_seconds();
533 dev_replace
->cursor_left
= 0;
534 dev_replace
->committed_cursor_left
= 0;
535 dev_replace
->cursor_left_last_write_of_item
= 0;
536 dev_replace
->cursor_right
= 0;
537 dev_replace
->is_valid
= 1;
538 dev_replace
->item_needs_writeback
= 1;
539 atomic64_set(&dev_replace
->num_write_errors
, 0);
540 atomic64_set(&dev_replace
->num_uncorrectable_read_errors
, 0);
541 up_write(&dev_replace
->rwsem
);
543 ret
= btrfs_sysfs_add_device(tgt_device
);
545 btrfs_err(fs_info
, "kobj add dev failed %d", ret
);
547 btrfs_wait_ordered_roots(fs_info
, U64_MAX
, 0, (u64
)-1);
549 /* Commit dev_replace state and reserve 1 item for it. */
550 trans
= btrfs_start_transaction(root
, 1);
552 ret
= PTR_ERR(trans
);
553 down_write(&dev_replace
->rwsem
);
554 dev_replace
->replace_state
=
555 BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED
;
556 dev_replace
->srcdev
= NULL
;
557 dev_replace
->tgtdev
= NULL
;
558 up_write(&dev_replace
->rwsem
);
562 ret
= btrfs_commit_transaction(trans
);
565 /* the disk copy procedure reuses the scrub code */
566 ret
= btrfs_scrub_dev(fs_info
, src_device
->devid
, 0,
567 btrfs_device_get_total_bytes(src_device
),
568 &dev_replace
->scrub_progress
, 0, 1);
570 ret
= btrfs_dev_replace_finishing(fs_info
, ret
);
571 if (ret
== -EINPROGRESS
)
572 ret
= BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS
;
577 btrfs_destroy_dev_replace_tgtdev(tgt_device
);
581 int btrfs_dev_replace_by_ioctl(struct btrfs_fs_info
*fs_info
,
582 struct btrfs_ioctl_dev_replace_args
*args
)
586 switch (args
->start
.cont_reading_from_srcdev_mode
) {
587 case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS
:
588 case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID
:
594 if ((args
->start
.srcdevid
== 0 && args
->start
.srcdev_name
[0] == '\0') ||
595 args
->start
.tgtdev_name
[0] == '\0')
598 ret
= btrfs_dev_replace_start(fs_info
, args
->start
.tgtdev_name
,
599 args
->start
.srcdevid
,
600 args
->start
.srcdev_name
,
601 args
->start
.cont_reading_from_srcdev_mode
);
603 /* don't warn if EINPROGRESS, someone else might be running scrub */
604 if (ret
== BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS
||
605 ret
== BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR
)
612 * blocked until all in-flight bios operations are finished.
614 static void btrfs_rm_dev_replace_blocked(struct btrfs_fs_info
*fs_info
)
616 set_bit(BTRFS_FS_STATE_DEV_REPLACING
, &fs_info
->fs_state
);
617 wait_event(fs_info
->dev_replace
.replace_wait
, !percpu_counter_sum(
618 &fs_info
->dev_replace
.bio_counter
));
622 * we have removed target device, it is safe to allow new bios request.
624 static void btrfs_rm_dev_replace_unblocked(struct btrfs_fs_info
*fs_info
)
626 clear_bit(BTRFS_FS_STATE_DEV_REPLACING
, &fs_info
->fs_state
);
627 wake_up(&fs_info
->dev_replace
.replace_wait
);
631 * When finishing the device replace, before swapping the source device with the
632 * target device we must update the chunk allocation state in the target device,
633 * as it is empty because replace works by directly copying the chunks and not
634 * through the normal chunk allocation path.
636 static int btrfs_set_target_alloc_state(struct btrfs_device
*srcdev
,
637 struct btrfs_device
*tgtdev
)
639 struct extent_state
*cached_state
= NULL
;
645 lockdep_assert_held(&srcdev
->fs_info
->chunk_mutex
);
647 while (!find_first_extent_bit(&srcdev
->alloc_state
, start
,
648 &found_start
, &found_end
,
649 CHUNK_ALLOCATED
, &cached_state
)) {
650 ret
= set_extent_bits(&tgtdev
->alloc_state
, found_start
,
651 found_end
, CHUNK_ALLOCATED
);
654 start
= found_end
+ 1;
657 free_extent_state(cached_state
);
661 static void btrfs_dev_replace_update_device_in_mapping_tree(
662 struct btrfs_fs_info
*fs_info
,
663 struct btrfs_device
*srcdev
,
664 struct btrfs_device
*tgtdev
)
666 struct extent_map_tree
*em_tree
= &fs_info
->mapping_tree
;
667 struct extent_map
*em
;
668 struct map_lookup
*map
;
672 write_lock(&em_tree
->lock
);
674 em
= lookup_extent_mapping(em_tree
, start
, (u64
)-1);
677 map
= em
->map_lookup
;
678 for (i
= 0; i
< map
->num_stripes
; i
++)
679 if (srcdev
== map
->stripes
[i
].dev
)
680 map
->stripes
[i
].dev
= tgtdev
;
681 start
= em
->start
+ em
->len
;
684 write_unlock(&em_tree
->lock
);
687 static int btrfs_dev_replace_finishing(struct btrfs_fs_info
*fs_info
,
690 struct btrfs_dev_replace
*dev_replace
= &fs_info
->dev_replace
;
691 struct btrfs_device
*tgt_device
;
692 struct btrfs_device
*src_device
;
693 struct btrfs_root
*root
= fs_info
->tree_root
;
694 u8 uuid_tmp
[BTRFS_UUID_SIZE
];
695 struct btrfs_trans_handle
*trans
;
698 /* don't allow cancel or unmount to disturb the finishing procedure */
699 mutex_lock(&dev_replace
->lock_finishing_cancel_unmount
);
701 down_read(&dev_replace
->rwsem
);
702 /* was the operation canceled, or is it finished? */
703 if (dev_replace
->replace_state
!=
704 BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED
) {
705 up_read(&dev_replace
->rwsem
);
706 mutex_unlock(&dev_replace
->lock_finishing_cancel_unmount
);
710 tgt_device
= dev_replace
->tgtdev
;
711 src_device
= dev_replace
->srcdev
;
712 up_read(&dev_replace
->rwsem
);
715 * flush all outstanding I/O and inode extent mappings before the
716 * copy operation is declared as being finished
718 ret
= btrfs_start_delalloc_roots(fs_info
, U64_MAX
, false);
720 mutex_unlock(&dev_replace
->lock_finishing_cancel_unmount
);
723 btrfs_wait_ordered_roots(fs_info
, U64_MAX
, 0, (u64
)-1);
726 btrfs_reada_remove_dev(src_device
);
729 * We have to use this loop approach because at this point src_device
730 * has to be available for transaction commit to complete, yet new
731 * chunks shouldn't be allocated on the device.
734 trans
= btrfs_start_transaction(root
, 0);
736 btrfs_reada_undo_remove_dev(src_device
);
737 mutex_unlock(&dev_replace
->lock_finishing_cancel_unmount
);
738 return PTR_ERR(trans
);
740 ret
= btrfs_commit_transaction(trans
);
743 /* Prevent write_all_supers() during the finishing procedure */
744 mutex_lock(&fs_info
->fs_devices
->device_list_mutex
);
745 /* Prevent new chunks being allocated on the source device */
746 mutex_lock(&fs_info
->chunk_mutex
);
748 if (!list_empty(&src_device
->post_commit_list
)) {
749 mutex_unlock(&fs_info
->fs_devices
->device_list_mutex
);
750 mutex_unlock(&fs_info
->chunk_mutex
);
756 down_write(&dev_replace
->rwsem
);
757 dev_replace
->replace_state
=
758 scrub_ret
? BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED
759 : BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED
;
760 dev_replace
->tgtdev
= NULL
;
761 dev_replace
->srcdev
= NULL
;
762 dev_replace
->time_stopped
= ktime_get_real_seconds();
763 dev_replace
->item_needs_writeback
= 1;
766 * Update allocation state in the new device and replace the old device
767 * with the new one in the mapping tree.
770 scrub_ret
= btrfs_set_target_alloc_state(src_device
, tgt_device
);
773 btrfs_dev_replace_update_device_in_mapping_tree(fs_info
,
777 if (scrub_ret
!= -ECANCELED
)
778 btrfs_err_in_rcu(fs_info
,
779 "btrfs_scrub_dev(%s, %llu, %s) failed %d",
780 btrfs_dev_name(src_device
),
782 rcu_str_deref(tgt_device
->name
), scrub_ret
);
784 up_write(&dev_replace
->rwsem
);
785 mutex_unlock(&fs_info
->chunk_mutex
);
786 mutex_unlock(&fs_info
->fs_devices
->device_list_mutex
);
787 btrfs_reada_undo_remove_dev(src_device
);
788 btrfs_rm_dev_replace_blocked(fs_info
);
790 btrfs_destroy_dev_replace_tgtdev(tgt_device
);
791 btrfs_rm_dev_replace_unblocked(fs_info
);
792 mutex_unlock(&dev_replace
->lock_finishing_cancel_unmount
);
797 btrfs_info_in_rcu(fs_info
,
798 "dev_replace from %s (devid %llu) to %s finished",
799 btrfs_dev_name(src_device
),
801 rcu_str_deref(tgt_device
->name
));
802 clear_bit(BTRFS_DEV_STATE_REPLACE_TGT
, &tgt_device
->dev_state
);
803 tgt_device
->devid
= src_device
->devid
;
804 src_device
->devid
= BTRFS_DEV_REPLACE_DEVID
;
805 memcpy(uuid_tmp
, tgt_device
->uuid
, sizeof(uuid_tmp
));
806 memcpy(tgt_device
->uuid
, src_device
->uuid
, sizeof(tgt_device
->uuid
));
807 memcpy(src_device
->uuid
, uuid_tmp
, sizeof(src_device
->uuid
));
808 btrfs_device_set_total_bytes(tgt_device
, src_device
->total_bytes
);
809 btrfs_device_set_disk_total_bytes(tgt_device
,
810 src_device
->disk_total_bytes
);
811 btrfs_device_set_bytes_used(tgt_device
, src_device
->bytes_used
);
812 tgt_device
->commit_bytes_used
= src_device
->bytes_used
;
814 btrfs_assign_next_active_device(src_device
, tgt_device
);
816 list_add(&tgt_device
->dev_alloc_list
, &fs_info
->fs_devices
->alloc_list
);
817 fs_info
->fs_devices
->rw_devices
++;
819 up_write(&dev_replace
->rwsem
);
820 btrfs_rm_dev_replace_blocked(fs_info
);
822 btrfs_rm_dev_replace_remove_srcdev(src_device
);
824 btrfs_rm_dev_replace_unblocked(fs_info
);
827 * Increment dev_stats_ccnt so that btrfs_run_dev_stats() will
828 * update on-disk dev stats value during commit transaction
830 atomic_inc(&tgt_device
->dev_stats_ccnt
);
833 * this is again a consistent state where no dev_replace procedure
834 * is running, the target device is part of the filesystem, the
835 * source device is not part of the filesystem anymore and its 1st
836 * superblock is scratched out so that it is no longer marked to
837 * belong to this filesystem.
839 mutex_unlock(&fs_info
->chunk_mutex
);
840 mutex_unlock(&fs_info
->fs_devices
->device_list_mutex
);
842 /* replace the sysfs entry */
843 btrfs_sysfs_remove_device(src_device
);
844 btrfs_sysfs_update_devid(tgt_device
);
845 if (test_bit(BTRFS_DEV_STATE_WRITEABLE
, &src_device
->dev_state
))
846 btrfs_scratch_superblocks(fs_info
, src_device
->bdev
,
847 src_device
->name
->str
);
849 /* write back the superblocks */
850 trans
= btrfs_start_transaction(root
, 0);
852 btrfs_commit_transaction(trans
);
854 mutex_unlock(&dev_replace
->lock_finishing_cancel_unmount
);
856 btrfs_rm_dev_replace_free_srcdev(src_device
);
862 * Read progress of device replace status according to the state and last
863 * stored position. The value format is the same as for
864 * btrfs_dev_replace::progress_1000
866 static u64
btrfs_dev_replace_progress(struct btrfs_fs_info
*fs_info
)
868 struct btrfs_dev_replace
*dev_replace
= &fs_info
->dev_replace
;
871 switch (dev_replace
->replace_state
) {
872 case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED
:
873 case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED
:
876 case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED
:
879 case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED
:
880 case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED
:
881 ret
= div64_u64(dev_replace
->cursor_left
,
882 div_u64(btrfs_device_get_total_bytes(
883 dev_replace
->srcdev
), 1000));
890 void btrfs_dev_replace_status(struct btrfs_fs_info
*fs_info
,
891 struct btrfs_ioctl_dev_replace_args
*args
)
893 struct btrfs_dev_replace
*dev_replace
= &fs_info
->dev_replace
;
895 down_read(&dev_replace
->rwsem
);
896 /* even if !dev_replace_is_valid, the values are good enough for
897 * the replace_status ioctl */
898 args
->result
= BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR
;
899 args
->status
.replace_state
= dev_replace
->replace_state
;
900 args
->status
.time_started
= dev_replace
->time_started
;
901 args
->status
.time_stopped
= dev_replace
->time_stopped
;
902 args
->status
.num_write_errors
=
903 atomic64_read(&dev_replace
->num_write_errors
);
904 args
->status
.num_uncorrectable_read_errors
=
905 atomic64_read(&dev_replace
->num_uncorrectable_read_errors
);
906 args
->status
.progress_1000
= btrfs_dev_replace_progress(fs_info
);
907 up_read(&dev_replace
->rwsem
);
910 int btrfs_dev_replace_cancel(struct btrfs_fs_info
*fs_info
)
912 struct btrfs_dev_replace
*dev_replace
= &fs_info
->dev_replace
;
913 struct btrfs_device
*tgt_device
= NULL
;
914 struct btrfs_device
*src_device
= NULL
;
915 struct btrfs_trans_handle
*trans
;
916 struct btrfs_root
*root
= fs_info
->tree_root
;
920 if (sb_rdonly(fs_info
->sb
))
923 mutex_lock(&dev_replace
->lock_finishing_cancel_unmount
);
924 down_write(&dev_replace
->rwsem
);
925 switch (dev_replace
->replace_state
) {
926 case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED
:
927 case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED
:
928 case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED
:
929 result
= BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED
;
930 up_write(&dev_replace
->rwsem
);
932 case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED
:
933 tgt_device
= dev_replace
->tgtdev
;
934 src_device
= dev_replace
->srcdev
;
935 up_write(&dev_replace
->rwsem
);
936 ret
= btrfs_scrub_cancel(fs_info
);
938 result
= BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED
;
940 result
= BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR
;
942 * btrfs_dev_replace_finishing() will handle the
945 btrfs_info_in_rcu(fs_info
,
946 "dev_replace from %s (devid %llu) to %s canceled",
947 btrfs_dev_name(src_device
), src_device
->devid
,
948 btrfs_dev_name(tgt_device
));
951 case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED
:
953 * Scrub doing the replace isn't running so we need to do the
954 * cleanup step of btrfs_dev_replace_finishing() here
956 result
= BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR
;
957 tgt_device
= dev_replace
->tgtdev
;
958 src_device
= dev_replace
->srcdev
;
959 dev_replace
->tgtdev
= NULL
;
960 dev_replace
->srcdev
= NULL
;
961 dev_replace
->replace_state
=
962 BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED
;
963 dev_replace
->time_stopped
= ktime_get_real_seconds();
964 dev_replace
->item_needs_writeback
= 1;
966 up_write(&dev_replace
->rwsem
);
968 /* Scrub for replace must not be running in suspended state */
969 ret
= btrfs_scrub_cancel(fs_info
);
970 ASSERT(ret
!= -ENOTCONN
);
972 trans
= btrfs_start_transaction(root
, 0);
974 mutex_unlock(&dev_replace
->lock_finishing_cancel_unmount
);
975 return PTR_ERR(trans
);
977 ret
= btrfs_commit_transaction(trans
);
980 btrfs_info_in_rcu(fs_info
,
981 "suspended dev_replace from %s (devid %llu) to %s canceled",
982 btrfs_dev_name(src_device
), src_device
->devid
,
983 btrfs_dev_name(tgt_device
));
986 btrfs_destroy_dev_replace_tgtdev(tgt_device
);
989 up_write(&dev_replace
->rwsem
);
993 mutex_unlock(&dev_replace
->lock_finishing_cancel_unmount
);
997 void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info
*fs_info
)
999 struct btrfs_dev_replace
*dev_replace
= &fs_info
->dev_replace
;
1001 mutex_lock(&dev_replace
->lock_finishing_cancel_unmount
);
1002 down_write(&dev_replace
->rwsem
);
1004 switch (dev_replace
->replace_state
) {
1005 case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED
:
1006 case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED
:
1007 case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED
:
1008 case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED
:
1010 case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED
:
1011 dev_replace
->replace_state
=
1012 BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED
;
1013 dev_replace
->time_stopped
= ktime_get_real_seconds();
1014 dev_replace
->item_needs_writeback
= 1;
1015 btrfs_info(fs_info
, "suspending dev_replace for unmount");
1019 up_write(&dev_replace
->rwsem
);
1020 mutex_unlock(&dev_replace
->lock_finishing_cancel_unmount
);
1023 /* resume dev_replace procedure that was interrupted by unmount */
1024 int btrfs_resume_dev_replace_async(struct btrfs_fs_info
*fs_info
)
1026 struct task_struct
*task
;
1027 struct btrfs_dev_replace
*dev_replace
= &fs_info
->dev_replace
;
1029 down_write(&dev_replace
->rwsem
);
1031 switch (dev_replace
->replace_state
) {
1032 case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED
:
1033 case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED
:
1034 case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED
:
1035 up_write(&dev_replace
->rwsem
);
1037 case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED
:
1039 case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED
:
1040 dev_replace
->replace_state
=
1041 BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED
;
1044 if (!dev_replace
->tgtdev
|| !dev_replace
->tgtdev
->bdev
) {
1046 "cannot continue dev_replace, tgtdev is missing");
1048 "you may cancel the operation after 'mount -o degraded'");
1049 dev_replace
->replace_state
=
1050 BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED
;
1051 up_write(&dev_replace
->rwsem
);
1054 up_write(&dev_replace
->rwsem
);
1057 * This could collide with a paused balance, but the exclusive op logic
1058 * should never allow both to start and pause. We don't want to allow
1059 * dev-replace to start anyway.
1061 if (!btrfs_exclop_start(fs_info
, BTRFS_EXCLOP_DEV_REPLACE
)) {
1062 down_write(&dev_replace
->rwsem
);
1063 dev_replace
->replace_state
=
1064 BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED
;
1065 up_write(&dev_replace
->rwsem
);
1067 "cannot resume dev-replace, other exclusive operation running");
1071 task
= kthread_run(btrfs_dev_replace_kthread
, fs_info
, "btrfs-devrepl");
1072 return PTR_ERR_OR_ZERO(task
);
1075 static int btrfs_dev_replace_kthread(void *data
)
1077 struct btrfs_fs_info
*fs_info
= data
;
1078 struct btrfs_dev_replace
*dev_replace
= &fs_info
->dev_replace
;
1082 progress
= btrfs_dev_replace_progress(fs_info
);
1083 progress
= div_u64(progress
, 10);
1084 btrfs_info_in_rcu(fs_info
,
1085 "continuing dev_replace from %s (devid %llu) to target %s @%u%%",
1086 btrfs_dev_name(dev_replace
->srcdev
),
1087 dev_replace
->srcdev
->devid
,
1088 btrfs_dev_name(dev_replace
->tgtdev
),
1089 (unsigned int)progress
);
1091 ret
= btrfs_scrub_dev(fs_info
, dev_replace
->srcdev
->devid
,
1092 dev_replace
->committed_cursor_left
,
1093 btrfs_device_get_total_bytes(dev_replace
->srcdev
),
1094 &dev_replace
->scrub_progress
, 0, 1);
1095 ret
= btrfs_dev_replace_finishing(fs_info
, ret
);
1096 WARN_ON(ret
&& ret
!= -ECANCELED
);
1098 btrfs_exclop_finish(fs_info
);
1102 int __pure
btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace
*dev_replace
)
1104 if (!dev_replace
->is_valid
)
1107 switch (dev_replace
->replace_state
) {
1108 case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED
:
1109 case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED
:
1110 case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED
:
1112 case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED
:
1113 case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED
:
1115 * return true even if tgtdev is missing (this is
1116 * something that can happen if the dev_replace
1117 * procedure is suspended by an umount and then
1118 * the tgtdev is missing (or "btrfs dev scan") was
1119 * not called and the filesystem is remounted
1120 * in degraded state. This does not stop the
1121 * dev_replace procedure. It needs to be canceled
1122 * manually if the cancellation is wanted.
1129 void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info
*fs_info
)
1131 percpu_counter_inc(&fs_info
->dev_replace
.bio_counter
);
1134 void btrfs_bio_counter_sub(struct btrfs_fs_info
*fs_info
, s64 amount
)
1136 percpu_counter_sub(&fs_info
->dev_replace
.bio_counter
, amount
);
1137 cond_wake_up_nomb(&fs_info
->dev_replace
.replace_wait
);
1140 void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info
*fs_info
)
1143 percpu_counter_inc(&fs_info
->dev_replace
.bio_counter
);
1144 if (likely(!test_bit(BTRFS_FS_STATE_DEV_REPLACING
,
1145 &fs_info
->fs_state
)))
1148 btrfs_bio_counter_dec(fs_info
);
1149 wait_event(fs_info
->dev_replace
.replace_wait
,
1150 !test_bit(BTRFS_FS_STATE_DEV_REPLACING
,
1151 &fs_info
->fs_state
));