// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2011, 2012 STRATO. All rights reserved.
 */

#include <linux/blkdev.h>
#include <linux/ratelimit.h>
#include <linux/sched/mm.h>
#include <crypto/hash.h>
#include "ordered-data.h"
#include "transaction.h"
#include "extent_io.h"
#include "dev-replace.h"
#include "block-group.h"
#include "accessors.h"
#include "file-item.h"
#include "raid-stripe-tree.h"
/*
 * This is only the first step towards a full-featured scrub. It reads all
 * extents and super blocks and verifies their checksums. In case a bad
 * checksum is found or the extent cannot be read, good data will be written
 * back if any can be found.
 *
 * Future enhancements:
 *  - In case an unrepairable extent is encountered, track which files are
 *    affected and report them
 *  - track and record media errors, throw out bad devices
 *  - add a mode to also read unallocated space
 */
/*
 * The following value only influences the performance.
 *
 * This determines how many stripes would be submitted in one go,
 * which is 512KiB (BTRFS_STRIPE_LEN * SCRUB_STRIPES_PER_GROUP).
 */
#define SCRUB_STRIPES_PER_GROUP		8

/*
 * How many groups we have for each sctx.
 *
 * This would be 8M per device, the same value as the old scrub in-flight bios
 * size limit.
 */
#define SCRUB_GROUPS_PER_SCTX		16

#define SCRUB_TOTAL_STRIPES		(SCRUB_GROUPS_PER_SCTX * SCRUB_STRIPES_PER_GROUP)
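
/*
 * In total that is SCRUB_TOTAL_STRIPES = 16 * 8 = 128 stripes per sctx, i.e.
 * 128 * BTRFS_STRIPE_LEN (64KiB) = 8MiB of stripes kept in flight per device.
 */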
/*
 * The following value times PAGE_SIZE needs to be large enough to match the
 * largest node/leaf/sector size that shall be supported.
 */
#define SCRUB_MAX_SECTORS_PER_BLOCK	(BTRFS_MAX_METADATA_BLOCKSIZE / SZ_4K)
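
/*
 * With BTRFS_MAX_METADATA_BLOCKSIZE being 64KiB this evaluates to 16 sectors
 * per block, i.e. the largest supported nodesize with the smallest (4K)
 * sectorsize.
 */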
/* Represent one sector and its needed info to verify the content. */
struct scrub_sector_verification {
	bool is_metadata;

	union {
		/*
		 * Csum pointer for data csum verification. Should point to a
		 * sector csum inside scrub_stripe::csums.
		 *
		 * NULL if this data sector has no csum.
		 */
		u8 *csum;

		/*
		 * Extra info for metadata verification. All sectors inside a
		 * tree block share the same generation.
		 */
		u64 generation;
	};
};
enum scrub_stripe_flags {
	/* Set when @mirror_num, @dev, @physical and @logical are set. */
	SCRUB_STRIPE_FLAG_INITIALIZED,

	/* Set when the read-repair is finished. */
	SCRUB_STRIPE_FLAG_REPAIR_DONE,

	/*
	 * Set for data stripes if it's triggered from P/Q stripe.
	 *
	 * During such scrub, we should not report errors in data stripes, nor
	 * update the accounting.
	 */
	SCRUB_STRIPE_FLAG_NO_REPORT,
};
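
/*
 * These flags are stored as individual bits in scrub_stripe::state and are
 * manipulated with set_bit()/test_bit(), so a stripe can be marked
 * initialized, repair-done and no-report independently.
 */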
#define SCRUB_STRIPE_PAGES		(BTRFS_STRIPE_LEN / PAGE_SIZE)

/*
 * Represent one contiguous range with a length of BTRFS_STRIPE_LEN.
 */
struct scrub_stripe {
	struct scrub_ctx *sctx;
	struct btrfs_block_group *bg;

	struct page *pages[SCRUB_STRIPE_PAGES];
	struct scrub_sector_verification *sectors;

	struct btrfs_device *dev;
	u64 logical;
	u64 physical;

	u16 mirror_num;

	/* Should be BTRFS_STRIPE_LEN / sectorsize. */
	u16 nr_sectors;

	/*
	 * How many data/meta extents are in this stripe. Only for scrub status
	 * reporting purposes.
	 */
	u16 nr_data_extents;
	u16 nr_meta_extents;

	atomic_t pending_io;
	wait_queue_head_t io_wait;
	wait_queue_head_t repair_wait;

	/*
	 * Indicate the states of the stripe. Bits are defined in
	 * scrub_stripe_flags enum.
	 */
	unsigned long state;

	/* Indicate which sectors are covered by extent items. */
	unsigned long extent_sector_bitmap;

	/*
	 * The errors hit during the initial read of the stripe.
	 *
	 * Would be utilized for error reporting and repair.
	 *
	 * The remaining init_nr_* records the number of errors hit, only used
	 * by error reporting.
	 */
	unsigned long init_error_bitmap;
	unsigned int init_nr_io_errors;
	unsigned int init_nr_csum_errors;
	unsigned int init_nr_meta_errors;

	/*
	 * The following error bitmaps are all for the current status.
	 * Every time we submit a new read, these bitmaps may be updated.
	 *
	 * error_bitmap = io_error_bitmap | csum_error_bitmap | meta_error_bitmap;
	 *
	 * IO and csum errors can happen for both metadata and data.
	 */
	unsigned long error_bitmap;
	unsigned long io_error_bitmap;
	unsigned long csum_error_bitmap;
	unsigned long meta_error_bitmap;

	/* For writeback (repair or replace) error reporting. */
	unsigned long write_error_bitmap;

	/* Writeback can be concurrent, thus we need to protect the bitmap. */
	spinlock_t write_error_lock;

	/*
	 * Checksum for the whole stripe if this stripe is inside a data block
	 * group.
	 */
	u8 *csums;

	struct work_struct work;
};
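
/*
 * Note that nr_sectors is at most BTRFS_STRIPE_LEN / 4K = 16, thus all of the
 * per-sector bitmaps above fit into a single unsigned long (see also the
 * BITS_PER_LONG assertion done when looking up data csums).
 */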
struct scrub_ctx {
	struct scrub_stripe	stripes[SCRUB_TOTAL_STRIPES];
	struct scrub_stripe	*raid56_data_stripes;
	struct btrfs_fs_info	*fs_info;
	struct btrfs_path	extent_path;
	struct btrfs_path	csum_path;

	/* State of IO submission throttling affecting the associated device. */
	ktime_t			throttle_deadline;

	struct mutex		wr_lock;
	struct btrfs_device	*wr_tgtdev;

	struct btrfs_scrub_progress stat;
	spinlock_t		stat_lock;

	/*
	 * Use a ref counter to avoid use-after-free issues. Scrub workers
	 * decrement bios_in_flight and workers_pending and then do a wakeup
	 * on the list_wait wait queue. We must ensure the main scrub task
	 * doesn't free the scrub context before or while the workers are
	 * doing the wakeup() call.
	 */
	refcount_t		refs;
};
struct scrub_warning {
	struct btrfs_path	*path;
	u64			extent_item_size;
	const char		*errstr;
	u64			physical;
	u64			logical;
	struct btrfs_device	*dev;
};
static void release_scrub_stripe(struct scrub_stripe *stripe)
{
	if (!stripe)
		return;

	for (int i = 0; i < SCRUB_STRIPE_PAGES; i++) {
		if (stripe->pages[i])
			__free_page(stripe->pages[i]);
		stripe->pages[i] = NULL;
	}
	kfree(stripe->sectors);
	kfree(stripe->csums);
	stripe->sectors = NULL;
	stripe->csums = NULL;
}
static int init_scrub_stripe(struct btrfs_fs_info *fs_info,
			     struct scrub_stripe *stripe)
{
	int ret;

	memset(stripe, 0, sizeof(*stripe));

	stripe->nr_sectors = BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits;

	init_waitqueue_head(&stripe->io_wait);
	init_waitqueue_head(&stripe->repair_wait);
	atomic_set(&stripe->pending_io, 0);
	spin_lock_init(&stripe->write_error_lock);

	ret = btrfs_alloc_page_array(SCRUB_STRIPE_PAGES, stripe->pages, false);
	if (ret < 0)
		goto error;

	stripe->sectors = kcalloc(stripe->nr_sectors,
				  sizeof(struct scrub_sector_verification),
				  GFP_KERNEL);
	if (!stripe->sectors)
		goto error;

	stripe->csums = kcalloc(BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits,
				fs_info->csum_size, GFP_KERNEL);
	if (!stripe->csums)
		goto error;

	return 0;
error:
	release_scrub_stripe(stripe);
	return -ENOMEM;
}
static void wait_scrub_stripe_io(struct scrub_stripe *stripe)
{
	wait_event(stripe->io_wait, atomic_read(&stripe->pending_io) == 0);
}

static void scrub_put_ctx(struct scrub_ctx *sctx);
static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
{
	while (atomic_read(&fs_info->scrub_pause_req)) {
		mutex_unlock(&fs_info->scrub_lock);
		wait_event(fs_info->scrub_pause_wait,
			   atomic_read(&fs_info->scrub_pause_req) == 0);
		mutex_lock(&fs_info->scrub_lock);
	}
}
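
/*
 * Pause handshake: scrub_pause_on() announces this scrub as paused and wakes
 * any waiter, while scrub_pause_off() blocks (under scrub_lock) until the
 * pause request is gone before resuming. scrub_blocked_if_needed() simply
 * combines the two to yield whenever a pause has been requested.
 */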
static void scrub_pause_on(struct btrfs_fs_info *fs_info)
{
	atomic_inc(&fs_info->scrubs_paused);
	wake_up(&fs_info->scrub_pause_wait);
}

static void scrub_pause_off(struct btrfs_fs_info *fs_info)
{
	mutex_lock(&fs_info->scrub_lock);
	__scrub_blocked_if_needed(fs_info);
	atomic_dec(&fs_info->scrubs_paused);
	mutex_unlock(&fs_info->scrub_lock);

	wake_up(&fs_info->scrub_pause_wait);
}

static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
{
	scrub_pause_on(fs_info);
	scrub_pause_off(fs_info);
}
static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
{
	int i;

	if (!sctx)
		return;

	for (i = 0; i < SCRUB_TOTAL_STRIPES; i++)
		release_scrub_stripe(&sctx->stripes[i]);

	kvfree(sctx);
}

static void scrub_put_ctx(struct scrub_ctx *sctx)
{
	if (refcount_dec_and_test(&sctx->refs))
		scrub_free_ctx(sctx);
}
static noinline_for_stack struct scrub_ctx *scrub_setup_ctx(
		struct btrfs_fs_info *fs_info, int is_dev_replace)
{
	struct scrub_ctx *sctx;
	int ret;
	int i;

	/*
	 * Since sctx has inline 128 stripes, it can go beyond 64K easily. Use
	 * kvzalloc().
	 */
	sctx = kvzalloc(sizeof(*sctx), GFP_KERNEL);
	if (!sctx)
		goto fail;

	refcount_set(&sctx->refs, 1);
	sctx->is_dev_replace = is_dev_replace;
	sctx->fs_info = fs_info;
	sctx->extent_path.search_commit_root = 1;
	sctx->extent_path.skip_locking = 1;
	sctx->csum_path.search_commit_root = 1;
	sctx->csum_path.skip_locking = 1;
	for (i = 0; i < SCRUB_TOTAL_STRIPES; i++) {
		ret = init_scrub_stripe(fs_info, &sctx->stripes[i]);
		if (ret < 0)
			goto fail;
		sctx->stripes[i].sctx = sctx;
	}
	sctx->first_free = 0;
	atomic_set(&sctx->cancel_req, 0);

	spin_lock_init(&sctx->stat_lock);
	sctx->throttle_deadline = 0;

	mutex_init(&sctx->wr_lock);
	if (is_dev_replace) {
		WARN_ON(!fs_info->dev_replace.tgtdev);
		sctx->wr_tgtdev = fs_info->dev_replace.tgtdev;
	}

	return sctx;

fail:
	scrub_free_ctx(sctx);
	return ERR_PTR(-ENOMEM);
}
static int scrub_print_warning_inode(u64 inum, u64 offset, u64 num_bytes,
				     u64 root, void *warn_ctx)
{
	u32 nlink;
	int ret;
	int i;
	unsigned nofs_flag;
	struct extent_buffer *eb;
	struct btrfs_inode_item *inode_item;
	struct scrub_warning *swarn = warn_ctx;
	struct btrfs_fs_info *fs_info = swarn->dev->fs_info;
	struct inode_fs_paths *ipath = NULL;
	struct btrfs_root *local_root;
	struct btrfs_key key;

	local_root = btrfs_get_fs_root(fs_info, root, true);
	if (IS_ERR(local_root)) {
		ret = PTR_ERR(local_root);
		goto err;
	}

	/*
	 * This makes the path point to (inum INODE_ITEM ioff).
	 */
	key.objectid = inum;
	key.type = BTRFS_INODE_ITEM_KEY;
	key.offset = 0;

	ret = btrfs_search_slot(NULL, local_root, &key, swarn->path, 0, 0);
	if (ret) {
		btrfs_put_root(local_root);
		btrfs_release_path(swarn->path);
		goto err;
	}

	eb = swarn->path->nodes[0];
	inode_item = btrfs_item_ptr(eb, swarn->path->slots[0],
				    struct btrfs_inode_item);
	nlink = btrfs_inode_nlink(eb, inode_item);
	btrfs_release_path(swarn->path);

	/*
	 * init_ipath might indirectly call vmalloc, or use GFP_KERNEL. Scrub
	 * uses GFP_NOFS in this context, so we keep it consistent but it does
	 * not seem to be strictly necessary.
	 */
	nofs_flag = memalloc_nofs_save();
	ipath = init_ipath(4096, local_root, swarn->path);
	memalloc_nofs_restore(nofs_flag);
	if (IS_ERR(ipath)) {
		btrfs_put_root(local_root);
		ret = PTR_ERR(ipath);
		ipath = NULL;
		goto err;
	}
	ret = paths_from_inode(inum, ipath);
	if (ret < 0)
		goto err;

	/*
	 * We deliberately ignore the bit ipath might have been too small to
	 * hold all of the paths here.
	 */
	for (i = 0; i < ipath->fspath->elem_cnt; ++i)
		btrfs_warn_in_rcu(fs_info,
"%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu, length %u, links %u (path: %s)",
				  swarn->errstr, swarn->logical,
				  btrfs_dev_name(swarn->dev),
				  swarn->physical,
				  root, inum, offset,
				  fs_info->sectorsize, nlink,
				  (char *)(unsigned long)ipath->fspath->val[i]);

	btrfs_put_root(local_root);
	free_ipath(ipath);
	return 0;

err:
	btrfs_warn_in_rcu(fs_info,
			  "%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu: path resolving failed with ret=%d",
			  swarn->errstr, swarn->logical,
			  btrfs_dev_name(swarn->dev),
			  swarn->physical,
			  root, inum, offset, ret);

	free_ipath(ipath);
	return 0;
}
static void scrub_print_common_warning(const char *errstr, struct btrfs_device *dev,
				       bool is_super, u64 logical, u64 physical)
{
	struct btrfs_fs_info *fs_info = dev->fs_info;
	struct btrfs_path *path;
	struct btrfs_key found_key;
	struct extent_buffer *eb;
	struct btrfs_extent_item *ei;
	struct scrub_warning swarn;
	u64 flags = 0;
	u32 item_size;
	int ret;

	/* Super block error, no need to search extent tree. */
	if (is_super) {
		btrfs_warn_in_rcu(fs_info, "%s on device %s, physical %llu",
				  errstr, btrfs_dev_name(dev), physical);
		return;
	}
	path = btrfs_alloc_path();
	if (!path)
		return;

	swarn.physical = physical;
	swarn.logical = logical;
	swarn.errstr = errstr;
	swarn.dev = NULL;

	ret = extent_from_logical(fs_info, swarn.logical, path, &found_key,
				  &flags);
	if (ret < 0)
		goto out;

	swarn.extent_item_size = found_key.offset;

	eb = path->nodes[0];
	ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
	item_size = btrfs_item_size(eb, path->slots[0]);

	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
		unsigned long ptr = 0;
		u8 ref_level;
		u64 ref_root;

		while (true) {
			ret = tree_backref_for_extent(&ptr, eb, &found_key, ei,
						      item_size, &ref_root,
						      &ref_level);
			if (ret < 0) {
				btrfs_warn(fs_info,
				"failed to resolve tree backref for logical %llu: %d",
					   swarn.logical, ret);
				break;
			}
			if (ret > 0)
				break;
			btrfs_warn_in_rcu(fs_info,
"%s at logical %llu on dev %s, physical %llu: metadata %s (level %d) in tree %llu",
				errstr, swarn.logical, btrfs_dev_name(dev),
				swarn.physical, (ref_level ? "node" : "leaf"),
				ref_level, ref_root);
		}
		btrfs_release_path(path);
	} else {
		struct btrfs_backref_walk_ctx ctx = { 0 };

		btrfs_release_path(path);

		ctx.bytenr = found_key.objectid;
		ctx.extent_item_pos = swarn.logical - found_key.objectid;
		ctx.fs_info = fs_info;

		swarn.path = path;
		swarn.dev = dev;

		iterate_extent_inodes(&ctx, true, scrub_print_warning_inode, &swarn);
	}

out:
	btrfs_free_path(path);
}
static int fill_writer_pointer_gap(struct scrub_ctx *sctx, u64 physical)
{
	int ret = 0;
	u64 length;

	if (!btrfs_is_zoned(sctx->fs_info))
		return 0;

	if (!btrfs_dev_is_sequential(sctx->wr_tgtdev, physical))
		return 0;

	if (sctx->write_pointer < physical) {
		length = physical - sctx->write_pointer;

		ret = btrfs_zoned_issue_zeroout(sctx->wr_tgtdev,
						sctx->write_pointer, length);
		if (!ret)
			sctx->write_pointer = physical;
	}

	return ret;
}
static struct page *scrub_stripe_get_page(struct scrub_stripe *stripe, int sector_nr)
{
	struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
	int page_index = (sector_nr << fs_info->sectorsize_bits) >> PAGE_SHIFT;

	return stripe->pages[page_index];
}

static unsigned int scrub_stripe_get_page_offset(struct scrub_stripe *stripe,
						 int sector_nr)
{
	struct btrfs_fs_info *fs_info = stripe->bg->fs_info;

	return offset_in_page(sector_nr << fs_info->sectorsize_bits);
}
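
/*
 * Example: with a 4K sectorsize and 16K pages, sector_nr 5 maps to
 * page_index (5 * 4K) >> PAGE_SHIFT = 1 at an in-page offset of 4K, while
 * with 4K pages it maps to page 5 at offset 0.
 */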
static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr)
{
	struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
	const u32 sectors_per_tree = fs_info->nodesize >> fs_info->sectorsize_bits;
	const u64 logical = stripe->logical + (sector_nr << fs_info->sectorsize_bits);
	const struct page *first_page = scrub_stripe_get_page(stripe, sector_nr);
	const unsigned int first_off = scrub_stripe_get_page_offset(stripe, sector_nr);
	SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
	u8 on_disk_csum[BTRFS_CSUM_SIZE];
	u8 calculated_csum[BTRFS_CSUM_SIZE];
	struct btrfs_header *header;

	/*
	 * Here we don't have a good way to attach the pages (and subpages)
	 * to a dummy extent buffer, thus we have to directly grab the members
	 * from the pages.
	 */
	header = (struct btrfs_header *)(page_address(first_page) + first_off);
	memcpy(on_disk_csum, header->csum, fs_info->csum_size);

	if (logical != btrfs_stack_header_bytenr(header)) {
		bitmap_set(&stripe->csum_error_bitmap, sector_nr, sectors_per_tree);
		bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree);
		btrfs_warn_rl(fs_info,
		"tree block %llu mirror %u has bad bytenr, has %llu want %llu",
			      logical, stripe->mirror_num,
			      btrfs_stack_header_bytenr(header), logical);
		return;
	}
	if (memcmp(header->fsid, fs_info->fs_devices->metadata_uuid,
		   BTRFS_FSID_SIZE) != 0) {
		bitmap_set(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree);
		bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree);
		btrfs_warn_rl(fs_info,
		"tree block %llu mirror %u has bad fsid, has %pU want %pU",
			      logical, stripe->mirror_num,
			      header->fsid, fs_info->fs_devices->fsid);
		return;
	}
	if (memcmp(header->chunk_tree_uuid, fs_info->chunk_tree_uuid,
		   BTRFS_UUID_SIZE) != 0) {
		bitmap_set(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree);
		bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree);
		btrfs_warn_rl(fs_info,
		"tree block %llu mirror %u has bad chunk tree uuid, has %pU want %pU",
			      logical, stripe->mirror_num,
			      header->chunk_tree_uuid, fs_info->chunk_tree_uuid);
		return;
	}

	/* Now check tree block csum. */
	shash->tfm = fs_info->csum_shash;
	crypto_shash_init(shash);
	crypto_shash_update(shash, page_address(first_page) + first_off +
			    BTRFS_CSUM_SIZE, fs_info->sectorsize - BTRFS_CSUM_SIZE);

	for (int i = sector_nr + 1; i < sector_nr + sectors_per_tree; i++) {
		struct page *page = scrub_stripe_get_page(stripe, i);
		unsigned int page_off = scrub_stripe_get_page_offset(stripe, i);

		crypto_shash_update(shash, page_address(page) + page_off,
				    fs_info->sectorsize);
	}

	crypto_shash_final(shash, calculated_csum);
	if (memcmp(calculated_csum, on_disk_csum, fs_info->csum_size) != 0) {
		bitmap_set(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree);
		bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree);
		btrfs_warn_rl(fs_info,
		"tree block %llu mirror %u has bad csum, has " CSUM_FMT " want " CSUM_FMT,
			      logical, stripe->mirror_num,
			      CSUM_FMT_VALUE(fs_info->csum_size, on_disk_csum),
			      CSUM_FMT_VALUE(fs_info->csum_size, calculated_csum));
		return;
	}
	if (stripe->sectors[sector_nr].generation !=
	    btrfs_stack_header_generation(header)) {
		bitmap_set(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree);
		bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree);
		btrfs_warn_rl(fs_info,
		"tree block %llu mirror %u has bad generation, has %llu want %llu",
			      logical, stripe->mirror_num,
			      btrfs_stack_header_generation(header),
			      stripe->sectors[sector_nr].generation);
		return;
	}
	bitmap_clear(&stripe->error_bitmap, sector_nr, sectors_per_tree);
	bitmap_clear(&stripe->csum_error_bitmap, sector_nr, sectors_per_tree);
	bitmap_clear(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree);
}
static void scrub_verify_one_sector(struct scrub_stripe *stripe, int sector_nr)
{
	struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
	struct scrub_sector_verification *sector = &stripe->sectors[sector_nr];
	const u32 sectors_per_tree = fs_info->nodesize >> fs_info->sectorsize_bits;
	struct page *page = scrub_stripe_get_page(stripe, sector_nr);
	unsigned int pgoff = scrub_stripe_get_page_offset(stripe, sector_nr);
	u8 csum_buf[BTRFS_CSUM_SIZE];
	int ret;

	ASSERT(sector_nr >= 0 && sector_nr < stripe->nr_sectors);

	/* Sector not utilized, skip it. */
	if (!test_bit(sector_nr, &stripe->extent_sector_bitmap))
		return;

	/* IO error, no need to check. */
	if (test_bit(sector_nr, &stripe->io_error_bitmap))
		return;

	/* Metadata, verify the full tree block. */
	if (sector->is_metadata) {
		/*
		 * Check if the tree block crosses the stripe boundary. If
		 * crossed the boundary, we cannot verify it but only give a
		 * warning.
		 *
		 * This can only happen on a very old filesystem where chunks
		 * are not ensured to be stripe aligned.
		 */
		if (unlikely(sector_nr + sectors_per_tree > stripe->nr_sectors)) {
			btrfs_warn_rl(fs_info,
			"tree block at %llu crosses stripe boundary %llu",
				      stripe->logical +
				      (sector_nr << fs_info->sectorsize_bits),
				      stripe->logical);
			return;
		}
		scrub_verify_one_metadata(stripe, sector_nr);
		return;
	}

	/*
	 * Data is easier, we just verify the data csum (if we have it). For
	 * cases without csum, we have no other choice but to trust it.
	 */
	if (!sector->csum) {
		clear_bit(sector_nr, &stripe->error_bitmap);
		return;
	}

	ret = btrfs_check_sector_csum(fs_info, page, pgoff, csum_buf, sector->csum);
	if (ret < 0) {
		set_bit(sector_nr, &stripe->csum_error_bitmap);
		set_bit(sector_nr, &stripe->error_bitmap);
	} else {
		clear_bit(sector_nr, &stripe->csum_error_bitmap);
		clear_bit(sector_nr, &stripe->error_bitmap);
	}
}
/* Verify specified sectors of a stripe. */
static void scrub_verify_one_stripe(struct scrub_stripe *stripe, unsigned long bitmap)
{
	struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
	const u32 sectors_per_tree = fs_info->nodesize >> fs_info->sectorsize_bits;
	int sector_nr;

	for_each_set_bit(sector_nr, &bitmap, stripe->nr_sectors) {
		scrub_verify_one_sector(stripe, sector_nr);
		if (stripe->sectors[sector_nr].is_metadata)
			sector_nr += sectors_per_tree - 1;
	}
}
static int calc_sector_number(struct scrub_stripe *stripe, struct bio_vec *first_bvec)
{
	int i;

	for (i = 0; i < stripe->nr_sectors; i++) {
		if (scrub_stripe_get_page(stripe, i) == first_bvec->bv_page &&
		    scrub_stripe_get_page_offset(stripe, i) == first_bvec->bv_offset)
			break;
	}
	ASSERT(i < stripe->nr_sectors);
	return i;
}
/*
 * Repair read is different from the regular read:
 *
 * - Only reads the failed sectors
 * - May have extra blocksize limits
 */
static void scrub_repair_read_endio(struct btrfs_bio *bbio)
{
	struct scrub_stripe *stripe = bbio->private;
	struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
	struct bio_vec *bvec;
	int sector_nr = calc_sector_number(stripe, bio_first_bvec_all(&bbio->bio));
	u32 bio_size = 0;
	int i;

	ASSERT(sector_nr < stripe->nr_sectors);

	bio_for_each_bvec_all(bvec, &bbio->bio, i)
		bio_size += bvec->bv_len;

	if (bbio->bio.bi_status) {
		bitmap_set(&stripe->io_error_bitmap, sector_nr,
			   bio_size >> fs_info->sectorsize_bits);
		bitmap_set(&stripe->error_bitmap, sector_nr,
			   bio_size >> fs_info->sectorsize_bits);
	} else {
		bitmap_clear(&stripe->io_error_bitmap, sector_nr,
			     bio_size >> fs_info->sectorsize_bits);
	}
	bio_put(&bbio->bio);
	if (atomic_dec_and_test(&stripe->pending_io))
		wake_up(&stripe->io_wait);
}
static int calc_next_mirror(int mirror, int num_copies)
{
	ASSERT(mirror <= num_copies);
	return (mirror + 1 > num_copies) ? 1 : mirror + 1;
}
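
/*
 * Example: with num_copies = 3 the repair path walks mirrors 1 -> 2 -> 3 and
 * then wraps back to 1, so starting from any failed mirror every other copy
 * gets tried exactly once.
 */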
817 static void scrub_stripe_submit_repair_read(struct scrub_stripe
*stripe
,
818 int mirror
, int blocksize
, bool wait
)
820 struct btrfs_fs_info
*fs_info
= stripe
->bg
->fs_info
;
821 struct btrfs_bio
*bbio
= NULL
;
822 const unsigned long old_error_bitmap
= stripe
->error_bitmap
;
825 ASSERT(stripe
->mirror_num
>= 1);
826 ASSERT(atomic_read(&stripe
->pending_io
) == 0);
828 for_each_set_bit(i
, &old_error_bitmap
, stripe
->nr_sectors
) {
833 page
= scrub_stripe_get_page(stripe
, i
);
834 pgoff
= scrub_stripe_get_page_offset(stripe
, i
);
836 /* The current sector cannot be merged, submit the bio. */
837 if (bbio
&& ((i
> 0 && !test_bit(i
- 1, &stripe
->error_bitmap
)) ||
838 bbio
->bio
.bi_iter
.bi_size
>= blocksize
)) {
839 ASSERT(bbio
->bio
.bi_iter
.bi_size
);
840 atomic_inc(&stripe
->pending_io
);
841 btrfs_submit_bbio(bbio
, mirror
);
843 wait_scrub_stripe_io(stripe
);
848 bbio
= btrfs_bio_alloc(stripe
->nr_sectors
, REQ_OP_READ
,
849 fs_info
, scrub_repair_read_endio
, stripe
);
850 bbio
->bio
.bi_iter
.bi_sector
= (stripe
->logical
+
851 (i
<< fs_info
->sectorsize_bits
)) >> SECTOR_SHIFT
;
854 ret
= bio_add_page(&bbio
->bio
, page
, fs_info
->sectorsize
, pgoff
);
855 ASSERT(ret
== fs_info
->sectorsize
);
858 ASSERT(bbio
->bio
.bi_iter
.bi_size
);
859 atomic_inc(&stripe
->pending_io
);
860 btrfs_submit_bbio(bbio
, mirror
);
862 wait_scrub_stripe_io(stripe
);
866 static void scrub_stripe_report_errors(struct scrub_ctx
*sctx
,
867 struct scrub_stripe
*stripe
)
869 static DEFINE_RATELIMIT_STATE(rs
, DEFAULT_RATELIMIT_INTERVAL
,
870 DEFAULT_RATELIMIT_BURST
);
871 struct btrfs_fs_info
*fs_info
= sctx
->fs_info
;
872 struct btrfs_device
*dev
= NULL
;
874 int nr_data_sectors
= 0;
875 int nr_meta_sectors
= 0;
876 int nr_nodatacsum_sectors
= 0;
877 int nr_repaired_sectors
= 0;
880 if (test_bit(SCRUB_STRIPE_FLAG_NO_REPORT
, &stripe
->state
))
	/*
	 * Init needed info for error reporting.
	 *
	 * Although our scrub_stripe infrastructure is mostly based on
	 * btrfs_submit_bio() and thus has no need for dev/physical itself,
	 * error reporting still needs dev and physical.
	 */
889 if (!bitmap_empty(&stripe
->init_error_bitmap
, stripe
->nr_sectors
)) {
890 u64 mapped_len
= fs_info
->sectorsize
;
891 struct btrfs_io_context
*bioc
= NULL
;
892 int stripe_index
= stripe
->mirror_num
- 1;
895 /* For scrub, our mirror_num should always start at 1. */
896 ASSERT(stripe
->mirror_num
>= 1);
897 ret
= btrfs_map_block(fs_info
, BTRFS_MAP_GET_READ_MIRRORS
,
898 stripe
->logical
, &mapped_len
, &bioc
,
901 * If we failed, dev will be NULL, and later detailed reports
902 * will just be skipped.
906 physical
= bioc
->stripes
[stripe_index
].physical
;
907 dev
= bioc
->stripes
[stripe_index
].dev
;
908 btrfs_put_bioc(bioc
);
912 for_each_set_bit(sector_nr
, &stripe
->extent_sector_bitmap
, stripe
->nr_sectors
) {
913 bool repaired
= false;
915 if (stripe
->sectors
[sector_nr
].is_metadata
) {
919 if (!stripe
->sectors
[sector_nr
].csum
)
920 nr_nodatacsum_sectors
++;
923 if (test_bit(sector_nr
, &stripe
->init_error_bitmap
) &&
924 !test_bit(sector_nr
, &stripe
->error_bitmap
)) {
925 nr_repaired_sectors
++;
929 /* Good sector from the beginning, nothing need to be done. */
930 if (!test_bit(sector_nr
, &stripe
->init_error_bitmap
))
		/*
		 * Report errors for the corrupted sectors. If a sector was
		 * repaired, just output the "fixed up" message.
		 */
939 btrfs_err_rl_in_rcu(fs_info
,
940 "fixed up error at logical %llu on dev %s physical %llu",
941 stripe
->logical
, btrfs_dev_name(dev
),
944 btrfs_err_rl_in_rcu(fs_info
,
945 "fixed up error at logical %llu on mirror %u",
946 stripe
->logical
, stripe
->mirror_num
);
951 /* The remaining are all for unrepaired. */
953 btrfs_err_rl_in_rcu(fs_info
,
954 "unable to fixup (regular) error at logical %llu on dev %s physical %llu",
955 stripe
->logical
, btrfs_dev_name(dev
),
958 btrfs_err_rl_in_rcu(fs_info
,
959 "unable to fixup (regular) error at logical %llu on mirror %u",
960 stripe
->logical
, stripe
->mirror_num
);
963 if (test_bit(sector_nr
, &stripe
->io_error_bitmap
))
964 if (__ratelimit(&rs
) && dev
)
965 scrub_print_common_warning("i/o error", dev
, false,
966 stripe
->logical
, physical
);
967 if (test_bit(sector_nr
, &stripe
->csum_error_bitmap
))
968 if (__ratelimit(&rs
) && dev
)
969 scrub_print_common_warning("checksum error", dev
, false,
970 stripe
->logical
, physical
);
971 if (test_bit(sector_nr
, &stripe
->meta_error_bitmap
))
972 if (__ratelimit(&rs
) && dev
)
973 scrub_print_common_warning("header error", dev
, false,
974 stripe
->logical
, physical
);
977 spin_lock(&sctx
->stat_lock
);
978 sctx
->stat
.data_extents_scrubbed
+= stripe
->nr_data_extents
;
979 sctx
->stat
.tree_extents_scrubbed
+= stripe
->nr_meta_extents
;
980 sctx
->stat
.data_bytes_scrubbed
+= nr_data_sectors
<< fs_info
->sectorsize_bits
;
981 sctx
->stat
.tree_bytes_scrubbed
+= nr_meta_sectors
<< fs_info
->sectorsize_bits
;
982 sctx
->stat
.no_csum
+= nr_nodatacsum_sectors
;
983 sctx
->stat
.read_errors
+= stripe
->init_nr_io_errors
;
984 sctx
->stat
.csum_errors
+= stripe
->init_nr_csum_errors
;
985 sctx
->stat
.verify_errors
+= stripe
->init_nr_meta_errors
;
986 sctx
->stat
.uncorrectable_errors
+=
987 bitmap_weight(&stripe
->error_bitmap
, stripe
->nr_sectors
);
988 sctx
->stat
.corrected_errors
+= nr_repaired_sectors
;
989 spin_unlock(&sctx
->stat_lock
);
992 static void scrub_write_sectors(struct scrub_ctx
*sctx
, struct scrub_stripe
*stripe
,
993 unsigned long write_bitmap
, bool dev_replace
);
/*
 * The main entrance for all read related scrub work, including:
 *
 * - Wait for the initial read to finish
 * - Verify and locate any bad sectors
 * - Go through the remaining mirrors and try to read as large a blocksize as
 *   possible
 * - Go through all mirrors (including the failed mirror) sector-by-sector
 * - Submit writeback for repaired sectors
 *
 * Writeback for dev-replace does not happen here, it needs extra
 * synchronization for zoned devices.
 */
1008 static void scrub_stripe_read_repair_worker(struct work_struct
*work
)
1010 struct scrub_stripe
*stripe
= container_of(work
, struct scrub_stripe
, work
);
1011 struct scrub_ctx
*sctx
= stripe
->sctx
;
1012 struct btrfs_fs_info
*fs_info
= sctx
->fs_info
;
1013 int num_copies
= btrfs_num_copies(fs_info
, stripe
->bg
->start
,
1014 stripe
->bg
->length
);
1015 unsigned long repaired
;
1019 ASSERT(stripe
->mirror_num
> 0);
1021 wait_scrub_stripe_io(stripe
);
1022 scrub_verify_one_stripe(stripe
, stripe
->extent_sector_bitmap
);
1023 /* Save the initial failed bitmap for later repair and report usage. */
1024 stripe
->init_error_bitmap
= stripe
->error_bitmap
;
1025 stripe
->init_nr_io_errors
= bitmap_weight(&stripe
->io_error_bitmap
,
1026 stripe
->nr_sectors
);
1027 stripe
->init_nr_csum_errors
= bitmap_weight(&stripe
->csum_error_bitmap
,
1028 stripe
->nr_sectors
);
1029 stripe
->init_nr_meta_errors
= bitmap_weight(&stripe
->meta_error_bitmap
,
1030 stripe
->nr_sectors
);
1032 if (bitmap_empty(&stripe
->init_error_bitmap
, stripe
->nr_sectors
))
1036 * Try all remaining mirrors.
1038 * Here we still try to read as large block as possible, as this is
1039 * faster and we have extra safety nets to rely on.
1041 for (mirror
= calc_next_mirror(stripe
->mirror_num
, num_copies
);
1042 mirror
!= stripe
->mirror_num
;
1043 mirror
= calc_next_mirror(mirror
, num_copies
)) {
1044 const unsigned long old_error_bitmap
= stripe
->error_bitmap
;
1046 scrub_stripe_submit_repair_read(stripe
, mirror
,
1047 BTRFS_STRIPE_LEN
, false);
1048 wait_scrub_stripe_io(stripe
);
1049 scrub_verify_one_stripe(stripe
, old_error_bitmap
);
1050 if (bitmap_empty(&stripe
->error_bitmap
, stripe
->nr_sectors
))
	/*
	 * Last safety net: try re-checking all mirrors, including the failed
	 * one, sector-by-sector.
	 *
	 * Because one sector failing the drive's internal csum causes the
	 * whole read containing the offending sector to be marked as an
	 * error, here we read sector-by-sector.
	 *
	 * This can be slow, thus we only try it as the last resort.
	 */
1065 for (i
= 0, mirror
= stripe
->mirror_num
;
1067 i
++, mirror
= calc_next_mirror(mirror
, num_copies
)) {
1068 const unsigned long old_error_bitmap
= stripe
->error_bitmap
;
1070 scrub_stripe_submit_repair_read(stripe
, mirror
,
1071 fs_info
->sectorsize
, true);
1072 wait_scrub_stripe_io(stripe
);
1073 scrub_verify_one_stripe(stripe
, old_error_bitmap
);
1074 if (bitmap_empty(&stripe
->error_bitmap
, stripe
->nr_sectors
))
1079 * Submit the repaired sectors. For zoned case, we cannot do repair
1080 * in-place, but queue the bg to be relocated.
1082 bitmap_andnot(&repaired
, &stripe
->init_error_bitmap
, &stripe
->error_bitmap
,
1083 stripe
->nr_sectors
);
1084 if (!sctx
->readonly
&& !bitmap_empty(&repaired
, stripe
->nr_sectors
)) {
1085 if (btrfs_is_zoned(fs_info
)) {
1086 btrfs_repair_one_zone(fs_info
, sctx
->stripes
[0].bg
->start
);
1088 scrub_write_sectors(sctx
, stripe
, repaired
, false);
1089 wait_scrub_stripe_io(stripe
);
1093 scrub_stripe_report_errors(sctx
, stripe
);
1094 set_bit(SCRUB_STRIPE_FLAG_REPAIR_DONE
, &stripe
->state
);
1095 wake_up(&stripe
->repair_wait
);
1098 static void scrub_read_endio(struct btrfs_bio
*bbio
)
1100 struct scrub_stripe
*stripe
= bbio
->private;
1101 struct bio_vec
*bvec
;
1102 int sector_nr
= calc_sector_number(stripe
, bio_first_bvec_all(&bbio
->bio
));
1107 ASSERT(sector_nr
< stripe
->nr_sectors
);
1108 bio_for_each_bvec_all(bvec
, &bbio
->bio
, i
)
1109 bio_size
+= bvec
->bv_len
;
1110 num_sectors
= bio_size
>> stripe
->bg
->fs_info
->sectorsize_bits
;
1112 if (bbio
->bio
.bi_status
) {
1113 bitmap_set(&stripe
->io_error_bitmap
, sector_nr
, num_sectors
);
1114 bitmap_set(&stripe
->error_bitmap
, sector_nr
, num_sectors
);
1116 bitmap_clear(&stripe
->io_error_bitmap
, sector_nr
, num_sectors
);
1118 bio_put(&bbio
->bio
);
1119 if (atomic_dec_and_test(&stripe
->pending_io
)) {
1120 wake_up(&stripe
->io_wait
);
1121 INIT_WORK(&stripe
->work
, scrub_stripe_read_repair_worker
);
1122 queue_work(stripe
->bg
->fs_info
->scrub_workers
, &stripe
->work
);
1126 static void scrub_write_endio(struct btrfs_bio
*bbio
)
1128 struct scrub_stripe
*stripe
= bbio
->private;
1129 struct btrfs_fs_info
*fs_info
= stripe
->bg
->fs_info
;
1130 struct bio_vec
*bvec
;
1131 int sector_nr
= calc_sector_number(stripe
, bio_first_bvec_all(&bbio
->bio
));
1135 bio_for_each_bvec_all(bvec
, &bbio
->bio
, i
)
1136 bio_size
+= bvec
->bv_len
;
1138 if (bbio
->bio
.bi_status
) {
1139 unsigned long flags
;
1141 spin_lock_irqsave(&stripe
->write_error_lock
, flags
);
1142 bitmap_set(&stripe
->write_error_bitmap
, sector_nr
,
1143 bio_size
>> fs_info
->sectorsize_bits
);
1144 spin_unlock_irqrestore(&stripe
->write_error_lock
, flags
);
1146 bio_put(&bbio
->bio
);
1148 if (atomic_dec_and_test(&stripe
->pending_io
))
1149 wake_up(&stripe
->io_wait
);
1152 static void scrub_submit_write_bio(struct scrub_ctx
*sctx
,
1153 struct scrub_stripe
*stripe
,
1154 struct btrfs_bio
*bbio
, bool dev_replace
)
1156 struct btrfs_fs_info
*fs_info
= sctx
->fs_info
;
1157 u32 bio_len
= bbio
->bio
.bi_iter
.bi_size
;
1158 u32 bio_off
= (bbio
->bio
.bi_iter
.bi_sector
<< SECTOR_SHIFT
) -
1161 fill_writer_pointer_gap(sctx
, stripe
->physical
+ bio_off
);
1162 atomic_inc(&stripe
->pending_io
);
1163 btrfs_submit_repair_write(bbio
, stripe
->mirror_num
, dev_replace
);
1164 if (!btrfs_is_zoned(fs_info
))
1167 * For zoned writeback, queue depth must be 1, thus we must wait for
1168 * the write to finish before the next write.
1170 wait_scrub_stripe_io(stripe
);
1173 * And also need to update the write pointer if write finished
1176 if (!test_bit(bio_off
>> fs_info
->sectorsize_bits
,
1177 &stripe
->write_error_bitmap
))
1178 sctx
->write_pointer
+= bio_len
;
/*
 * Submit the write bio(s) for the sectors specified by @write_bitmap.
 *
 * Here we utilize btrfs_submit_repair_write(), which has some extra benefits:
 *
 * - Only needs logical bytenr and mirror_num
 *   Just like the scrub read path
 *
 * - Would only result in writes to the specified mirror
 *   Unlike the regular writeback path, which would write back to all stripes
 *
 * - Handle dev-replace and read-repair writeback differently
 */
1194 static void scrub_write_sectors(struct scrub_ctx
*sctx
, struct scrub_stripe
*stripe
,
1195 unsigned long write_bitmap
, bool dev_replace
)
1197 struct btrfs_fs_info
*fs_info
= stripe
->bg
->fs_info
;
1198 struct btrfs_bio
*bbio
= NULL
;
1201 for_each_set_bit(sector_nr
, &write_bitmap
, stripe
->nr_sectors
) {
1202 struct page
*page
= scrub_stripe_get_page(stripe
, sector_nr
);
1203 unsigned int pgoff
= scrub_stripe_get_page_offset(stripe
, sector_nr
);
1206 /* We should only writeback sectors covered by an extent. */
1207 ASSERT(test_bit(sector_nr
, &stripe
->extent_sector_bitmap
));
1209 /* Cannot merge with previous sector, submit the current one. */
1210 if (bbio
&& sector_nr
&& !test_bit(sector_nr
- 1, &write_bitmap
)) {
1211 scrub_submit_write_bio(sctx
, stripe
, bbio
, dev_replace
);
1215 bbio
= btrfs_bio_alloc(stripe
->nr_sectors
, REQ_OP_WRITE
,
1216 fs_info
, scrub_write_endio
, stripe
);
1217 bbio
->bio
.bi_iter
.bi_sector
= (stripe
->logical
+
1218 (sector_nr
<< fs_info
->sectorsize_bits
)) >>
1221 ret
= bio_add_page(&bbio
->bio
, page
, fs_info
->sectorsize
, pgoff
);
1222 ASSERT(ret
== fs_info
->sectorsize
);
1225 scrub_submit_write_bio(sctx
, stripe
, bbio
, dev_replace
);
/*
 * Throttling of IO submission, bandwidth-limit based, the timeslice is 1
 * second. Limit can be set via /sys/fs/UUID/devinfo/devid/scrub_speed_max.
 */
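/*
 * For example, with scrub_speed_max set to 100MiB/s the slice below is split
 * into min(64, 100M / 16M) = 6 intervals, so each deadline lies ~166ms ahead
 * and roughly 16MiB may be submitted before the task sleeps for the rest of
 * the interval.
 */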
1232 static void scrub_throttle_dev_io(struct scrub_ctx
*sctx
, struct btrfs_device
*device
,
1233 unsigned int bio_size
)
1235 const int time_slice
= 1000;
1241 bwlimit
= READ_ONCE(device
->scrub_speed_max
);
1246 * Slice is divided into intervals when the IO is submitted, adjust by
1247 * bwlimit and maximum of 64 intervals.
1249 div
= max_t(u32
, 1, (u32
)(bwlimit
/ (16 * 1024 * 1024)));
1250 div
= min_t(u32
, 64, div
);
1252 /* Start new epoch, set deadline */
1254 if (sctx
->throttle_deadline
== 0) {
1255 sctx
->throttle_deadline
= ktime_add_ms(now
, time_slice
/ div
);
1256 sctx
->throttle_sent
= 0;
1259 /* Still in the time to send? */
1260 if (ktime_before(now
, sctx
->throttle_deadline
)) {
1261 /* If current bio is within the limit, send it */
1262 sctx
->throttle_sent
+= bio_size
;
1263 if (sctx
->throttle_sent
<= div_u64(bwlimit
, div
))
1266 /* We're over the limit, sleep until the rest of the slice */
1267 delta
= ktime_ms_delta(sctx
->throttle_deadline
, now
);
1269 /* New request after deadline, start new epoch */
1276 timeout
= div_u64(delta
* HZ
, 1000);
1277 schedule_timeout_interruptible(timeout
);
1280 /* Next call will start the deadline period */
1281 sctx
->throttle_deadline
= 0;
/*
 * Given a physical address, this will calculate its
 * logical offset. If this is a parity stripe, it will return
 * the left-most data stripe's logical offset.
 *
 * Return 0 if it is a data stripe, 1 means parity stripe.
 */
1291 static int get_raid56_logic_offset(u64 physical
, int num
,
1292 struct btrfs_chunk_map
*map
, u64
*offset
,
1298 const int data_stripes
= nr_data_stripes(map
);
1300 last_offset
= (physical
- map
->stripes
[num
].physical
) * data_stripes
;
1302 *stripe_start
= last_offset
;
1304 *offset
= last_offset
;
1305 for (i
= 0; i
< data_stripes
; i
++) {
1310 *offset
= last_offset
+ btrfs_stripe_nr_to_offset(i
);
1312 stripe_nr
= (u32
)(*offset
>> BTRFS_STRIPE_LEN_SHIFT
) / data_stripes
;
1314 /* Work out the disk rotation on this stripe-set */
1315 rot
= stripe_nr
% map
->num_stripes
;
1316 /* calculate which stripe this data locates */
1318 stripe_index
= rot
% map
->num_stripes
;
1319 if (stripe_index
== num
)
1321 if (stripe_index
< num
)
1324 *offset
= last_offset
+ btrfs_stripe_nr_to_offset(j
);
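	/*
	 * In short: the physical offset within one device is first scaled up
	 * by the number of data stripes, then the rotation of the stripe set
	 * decides whether this device holds data (return 0, @offset set to
	 * the logical offset) or parity (return 1, @offset pointing at the
	 * left-most data stripe of the same full stripe).
	 */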
/*
 * Return 0 if the extent item range covers any byte of the range.
 * Return <0 if the extent item is before @search_start.
 * Return >0 if the extent item is after @search_start + @search_len.
 */
1333 static int compare_extent_item_range(struct btrfs_path
*path
,
1334 u64 search_start
, u64 search_len
)
1336 struct btrfs_fs_info
*fs_info
= path
->nodes
[0]->fs_info
;
1338 struct btrfs_key key
;
1340 btrfs_item_key_to_cpu(path
->nodes
[0], &key
, path
->slots
[0]);
1341 ASSERT(key
.type
== BTRFS_EXTENT_ITEM_KEY
||
1342 key
.type
== BTRFS_METADATA_ITEM_KEY
);
1343 if (key
.type
== BTRFS_METADATA_ITEM_KEY
)
1344 len
= fs_info
->nodesize
;
1348 if (key
.objectid
+ len
<= search_start
)
1350 if (key
.objectid
>= search_start
+ search_len
)
1356 * Locate one extent item which covers any byte in range
1357 * [@search_start, @search_start + @search_length)
1359 * If the path is not initialized, we will initialize the search by doing
1360 * a btrfs_search_slot().
1361 * If the path is already initialized, we will use the path as the initial
1362 * slot, to avoid duplicated btrfs_search_slot() calls.
1364 * NOTE: If an extent item starts before @search_start, we will still
1365 * return the extent item. This is for data extent crossing stripe boundary.
1367 * Return 0 if we found such extent item, and @path will point to the extent item.
1368 * Return >0 if no such extent item can be found, and @path will be released.
1369 * Return <0 if hit fatal error, and @path will be released.
1371 static int find_first_extent_item(struct btrfs_root
*extent_root
,
1372 struct btrfs_path
*path
,
1373 u64 search_start
, u64 search_len
)
1375 struct btrfs_fs_info
*fs_info
= extent_root
->fs_info
;
1376 struct btrfs_key key
;
1379 /* Continue using the existing path */
1381 goto search_forward
;
1383 if (btrfs_fs_incompat(fs_info
, SKINNY_METADATA
))
1384 key
.type
= BTRFS_METADATA_ITEM_KEY
;
1386 key
.type
= BTRFS_EXTENT_ITEM_KEY
;
1387 key
.objectid
= search_start
;
1388 key
.offset
= (u64
)-1;
1390 ret
= btrfs_search_slot(NULL
, extent_root
, &key
, path
, 0, 0);
1395 * Key with offset -1 found, there would have to exist an extent
1396 * item with such offset, but this is out of the valid range.
1398 btrfs_release_path(path
);
1403 * Here we intentionally pass 0 as @min_objectid, as there could be
1404 * an extent item starting before @search_start.
1406 ret
= btrfs_previous_extent_item(extent_root
, path
, 0);
1410 * No matter whether we have found an extent item, the next loop will
1411 * properly do every check on the key.
1415 btrfs_item_key_to_cpu(path
->nodes
[0], &key
, path
->slots
[0]);
1416 if (key
.objectid
>= search_start
+ search_len
)
1418 if (key
.type
!= BTRFS_METADATA_ITEM_KEY
&&
1419 key
.type
!= BTRFS_EXTENT_ITEM_KEY
)
1422 ret
= compare_extent_item_range(path
, search_start
, search_len
);
1428 ret
= btrfs_next_item(extent_root
, path
);
1430 /* Either no more items or a fatal error. */
1431 btrfs_release_path(path
);
1435 btrfs_release_path(path
);
1439 static void get_extent_info(struct btrfs_path
*path
, u64
*extent_start_ret
,
1440 u64
*size_ret
, u64
*flags_ret
, u64
*generation_ret
)
1442 struct btrfs_key key
;
1443 struct btrfs_extent_item
*ei
;
1445 btrfs_item_key_to_cpu(path
->nodes
[0], &key
, path
->slots
[0]);
1446 ASSERT(key
.type
== BTRFS_METADATA_ITEM_KEY
||
1447 key
.type
== BTRFS_EXTENT_ITEM_KEY
);
1448 *extent_start_ret
= key
.objectid
;
1449 if (key
.type
== BTRFS_METADATA_ITEM_KEY
)
1450 *size_ret
= path
->nodes
[0]->fs_info
->nodesize
;
1452 *size_ret
= key
.offset
;
1453 ei
= btrfs_item_ptr(path
->nodes
[0], path
->slots
[0], struct btrfs_extent_item
);
1454 *flags_ret
= btrfs_extent_flags(path
->nodes
[0], ei
);
1455 *generation_ret
= btrfs_extent_generation(path
->nodes
[0], ei
);
1458 static int sync_write_pointer_for_zoned(struct scrub_ctx
*sctx
, u64 logical
,
1459 u64 physical
, u64 physical_end
)
1461 struct btrfs_fs_info
*fs_info
= sctx
->fs_info
;
1464 if (!btrfs_is_zoned(fs_info
))
1467 mutex_lock(&sctx
->wr_lock
);
1468 if (sctx
->write_pointer
< physical_end
) {
1469 ret
= btrfs_sync_zone_write_pointer(sctx
->wr_tgtdev
, logical
,
1471 sctx
->write_pointer
);
1474 "zoned: failed to recover write pointer");
1476 mutex_unlock(&sctx
->wr_lock
);
1477 btrfs_dev_clear_zone_empty(sctx
->wr_tgtdev
, physical
);
1482 static void fill_one_extent_info(struct btrfs_fs_info
*fs_info
,
1483 struct scrub_stripe
*stripe
,
1484 u64 extent_start
, u64 extent_len
,
1485 u64 extent_flags
, u64 extent_gen
)
1487 for (u64 cur_logical
= max(stripe
->logical
, extent_start
);
1488 cur_logical
< min(stripe
->logical
+ BTRFS_STRIPE_LEN
,
1489 extent_start
+ extent_len
);
1490 cur_logical
+= fs_info
->sectorsize
) {
1491 const int nr_sector
= (cur_logical
- stripe
->logical
) >>
1492 fs_info
->sectorsize_bits
;
1493 struct scrub_sector_verification
*sector
=
1494 &stripe
->sectors
[nr_sector
];
1496 set_bit(nr_sector
, &stripe
->extent_sector_bitmap
);
1497 if (extent_flags
& BTRFS_EXTENT_FLAG_TREE_BLOCK
) {
1498 sector
->is_metadata
= true;
1499 sector
->generation
= extent_gen
;
static void scrub_stripe_reset_bitmaps(struct scrub_stripe *stripe)
{
	stripe->extent_sector_bitmap = 0;
	stripe->init_error_bitmap = 0;
	stripe->init_nr_io_errors = 0;
	stripe->init_nr_csum_errors = 0;
	stripe->init_nr_meta_errors = 0;
	stripe->error_bitmap = 0;
	stripe->io_error_bitmap = 0;
	stripe->csum_error_bitmap = 0;
	stripe->meta_error_bitmap = 0;
}
1518 * Locate one stripe which has at least one extent in its range.
1520 * Return 0 if found such stripe, and store its info into @stripe.
1521 * Return >0 if there is no such stripe in the specified range.
1522 * Return <0 for error.
1524 static int scrub_find_fill_first_stripe(struct btrfs_block_group
*bg
,
1525 struct btrfs_path
*extent_path
,
1526 struct btrfs_path
*csum_path
,
1527 struct btrfs_device
*dev
, u64 physical
,
1528 int mirror_num
, u64 logical_start
,
1530 struct scrub_stripe
*stripe
)
1532 struct btrfs_fs_info
*fs_info
= bg
->fs_info
;
1533 struct btrfs_root
*extent_root
= btrfs_extent_root(fs_info
, bg
->start
);
1534 struct btrfs_root
*csum_root
= btrfs_csum_root(fs_info
, bg
->start
);
1535 const u64 logical_end
= logical_start
+ logical_len
;
1536 u64 cur_logical
= logical_start
;
1544 memset(stripe
->sectors
, 0, sizeof(struct scrub_sector_verification
) *
1545 stripe
->nr_sectors
);
1546 scrub_stripe_reset_bitmaps(stripe
);
1548 /* The range must be inside the bg. */
1549 ASSERT(logical_start
>= bg
->start
&& logical_end
<= bg
->start
+ bg
->length
);
1551 ret
= find_first_extent_item(extent_root
, extent_path
, logical_start
,
1553 /* Either error or not found. */
1556 get_extent_info(extent_path
, &extent_start
, &extent_len
, &extent_flags
,
1558 if (extent_flags
& BTRFS_EXTENT_FLAG_TREE_BLOCK
)
1559 stripe
->nr_meta_extents
++;
1560 if (extent_flags
& BTRFS_EXTENT_FLAG_DATA
)
1561 stripe
->nr_data_extents
++;
1562 cur_logical
= max(extent_start
, cur_logical
);
1565 * Round down to stripe boundary.
1567 * The extra calculation against bg->start is to handle block groups
1568 * whose logical bytenr is not BTRFS_STRIPE_LEN aligned.
1570 stripe
->logical
= round_down(cur_logical
- bg
->start
, BTRFS_STRIPE_LEN
) +
1572 stripe
->physical
= physical
+ stripe
->logical
- logical_start
;
1575 stripe
->mirror_num
= mirror_num
;
1576 stripe_end
= stripe
->logical
+ BTRFS_STRIPE_LEN
- 1;
1578 /* Fill the first extent info into stripe->sectors[] array. */
1579 fill_one_extent_info(fs_info
, stripe
, extent_start
, extent_len
,
1580 extent_flags
, extent_gen
);
1581 cur_logical
= extent_start
+ extent_len
;
1583 /* Fill the extent info for the remaining sectors. */
1584 while (cur_logical
<= stripe_end
) {
1585 ret
= find_first_extent_item(extent_root
, extent_path
, cur_logical
,
1586 stripe_end
- cur_logical
+ 1);
1593 get_extent_info(extent_path
, &extent_start
, &extent_len
,
1594 &extent_flags
, &extent_gen
);
1595 if (extent_flags
& BTRFS_EXTENT_FLAG_TREE_BLOCK
)
1596 stripe
->nr_meta_extents
++;
1597 if (extent_flags
& BTRFS_EXTENT_FLAG_DATA
)
1598 stripe
->nr_data_extents
++;
1599 fill_one_extent_info(fs_info
, stripe
, extent_start
, extent_len
,
1600 extent_flags
, extent_gen
);
1601 cur_logical
= extent_start
+ extent_len
;
1604 /* Now fill the data csum. */
1605 if (bg
->flags
& BTRFS_BLOCK_GROUP_DATA
) {
1607 unsigned long csum_bitmap
= 0;
1609 /* Csum space should have already been allocated. */
1610 ASSERT(stripe
->csums
);
1613 * Our csum bitmap should be large enough, as BTRFS_STRIPE_LEN
1614 * should contain at most 16 sectors.
1616 ASSERT(BITS_PER_LONG
>= BTRFS_STRIPE_LEN
>> fs_info
->sectorsize_bits
);
1618 ret
= btrfs_lookup_csums_bitmap(csum_root
, csum_path
,
1619 stripe
->logical
, stripe_end
,
1620 stripe
->csums
, &csum_bitmap
);
1626 for_each_set_bit(sector_nr
, &csum_bitmap
, stripe
->nr_sectors
) {
1627 stripe
->sectors
[sector_nr
].csum
= stripe
->csums
+
1628 sector_nr
* fs_info
->csum_size
;
1631 set_bit(SCRUB_STRIPE_FLAG_INITIALIZED
, &stripe
->state
);
static void scrub_reset_stripe(struct scrub_stripe *stripe)
{
	scrub_stripe_reset_bitmaps(stripe);

	stripe->nr_meta_extents = 0;
	stripe->nr_data_extents = 0;

	for (int i = 0; i < stripe->nr_sectors; i++) {
		stripe->sectors[i].is_metadata = false;
		stripe->sectors[i].csum = NULL;
		stripe->sectors[i].generation = 0;
	}
}
static u32 stripe_length(const struct scrub_stripe *stripe)
{
	return min(BTRFS_STRIPE_LEN,
		   stripe->bg->start + stripe->bg->length - stripe->logical);
}
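
/*
 * Example: if the block group ends 32KiB into the last BTRFS_STRIPE_LEN unit,
 * stripe_length() returns 32KiB for that final stripe instead of the full
 * 64KiB, keeping reads inside the chunk boundary.
 */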
1659 static void scrub_submit_extent_sector_read(struct scrub_ctx
*sctx
,
1660 struct scrub_stripe
*stripe
)
1662 struct btrfs_fs_info
*fs_info
= stripe
->bg
->fs_info
;
1663 struct btrfs_bio
*bbio
= NULL
;
1664 unsigned int nr_sectors
= stripe_length(stripe
) >> fs_info
->sectorsize_bits
;
1665 u64 stripe_len
= BTRFS_STRIPE_LEN
;
1666 int mirror
= stripe
->mirror_num
;
1669 atomic_inc(&stripe
->pending_io
);
1671 for_each_set_bit(i
, &stripe
->extent_sector_bitmap
, stripe
->nr_sectors
) {
1672 struct page
*page
= scrub_stripe_get_page(stripe
, i
);
1673 unsigned int pgoff
= scrub_stripe_get_page_offset(stripe
, i
);
1675 /* We're beyond the chunk boundary, no need to read anymore. */
1676 if (i
>= nr_sectors
)
1679 /* The current sector cannot be merged, submit the bio. */
1682 !test_bit(i
- 1, &stripe
->extent_sector_bitmap
)) ||
1683 bbio
->bio
.bi_iter
.bi_size
>= stripe_len
)) {
1684 ASSERT(bbio
->bio
.bi_iter
.bi_size
);
1685 atomic_inc(&stripe
->pending_io
);
1686 btrfs_submit_bbio(bbio
, mirror
);
1691 struct btrfs_io_stripe io_stripe
= {};
1692 struct btrfs_io_context
*bioc
= NULL
;
1693 const u64 logical
= stripe
->logical
+
1694 (i
<< fs_info
->sectorsize_bits
);
1697 io_stripe
.rst_search_commit_root
= true;
1698 stripe_len
= (nr_sectors
- i
) << fs_info
->sectorsize_bits
;
1700 * For RST cases, we need to manually split the bbio to
1701 * follow the RST boundary.
1703 err
= btrfs_map_block(fs_info
, BTRFS_MAP_READ
, logical
,
1704 &stripe_len
, &bioc
, &io_stripe
, &mirror
);
1705 btrfs_put_bioc(bioc
);
1707 set_bit(i
, &stripe
->io_error_bitmap
);
1708 set_bit(i
, &stripe
->error_bitmap
);
1712 bbio
= btrfs_bio_alloc(stripe
->nr_sectors
, REQ_OP_READ
,
1713 fs_info
, scrub_read_endio
, stripe
);
1714 bbio
->bio
.bi_iter
.bi_sector
= logical
>> SECTOR_SHIFT
;
1717 __bio_add_page(&bbio
->bio
, page
, fs_info
->sectorsize
, pgoff
);
1721 ASSERT(bbio
->bio
.bi_iter
.bi_size
);
1722 atomic_inc(&stripe
->pending_io
);
1723 btrfs_submit_bbio(bbio
, mirror
);
1726 if (atomic_dec_and_test(&stripe
->pending_io
)) {
1727 wake_up(&stripe
->io_wait
);
1728 INIT_WORK(&stripe
->work
, scrub_stripe_read_repair_worker
);
1729 queue_work(stripe
->bg
->fs_info
->scrub_workers
, &stripe
->work
);
1733 static void scrub_submit_initial_read(struct scrub_ctx
*sctx
,
1734 struct scrub_stripe
*stripe
)
1736 struct btrfs_fs_info
*fs_info
= sctx
->fs_info
;
1737 struct btrfs_bio
*bbio
;
1738 unsigned int nr_sectors
= stripe_length(stripe
) >> fs_info
->sectorsize_bits
;
1739 int mirror
= stripe
->mirror_num
;
1742 ASSERT(stripe
->mirror_num
> 0);
1743 ASSERT(test_bit(SCRUB_STRIPE_FLAG_INITIALIZED
, &stripe
->state
));
1745 if (btrfs_need_stripe_tree_update(fs_info
, stripe
->bg
->flags
)) {
1746 scrub_submit_extent_sector_read(sctx
, stripe
);
1750 bbio
= btrfs_bio_alloc(SCRUB_STRIPE_PAGES
, REQ_OP_READ
, fs_info
,
1751 scrub_read_endio
, stripe
);
1753 bbio
->bio
.bi_iter
.bi_sector
= stripe
->logical
>> SECTOR_SHIFT
;
1754 /* Read the whole range inside the chunk boundary. */
1755 for (unsigned int cur
= 0; cur
< nr_sectors
; cur
++) {
1756 struct page
*page
= scrub_stripe_get_page(stripe
, cur
);
1757 unsigned int pgoff
= scrub_stripe_get_page_offset(stripe
, cur
);
1760 ret
= bio_add_page(&bbio
->bio
, page
, fs_info
->sectorsize
, pgoff
);
1761 /* We should have allocated enough bio vectors. */
1762 ASSERT(ret
== fs_info
->sectorsize
);
1764 atomic_inc(&stripe
->pending_io
);
1767 * For dev-replace, either user asks to avoid the source dev, or
1768 * the device is missing, we try the next mirror instead.
1770 if (sctx
->is_dev_replace
&&
1771 (fs_info
->dev_replace
.cont_reading_from_srcdev_mode
==
1772 BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID
||
1773 !stripe
->dev
->bdev
)) {
1774 int num_copies
= btrfs_num_copies(fs_info
, stripe
->bg
->start
,
1775 stripe
->bg
->length
);
1777 mirror
= calc_next_mirror(mirror
, num_copies
);
1779 btrfs_submit_bbio(bbio
, mirror
);
1782 static bool stripe_has_metadata_error(struct scrub_stripe
*stripe
)
1786 for_each_set_bit(i
, &stripe
->error_bitmap
, stripe
->nr_sectors
) {
1787 if (stripe
->sectors
[i
].is_metadata
) {
1788 struct btrfs_fs_info
*fs_info
= stripe
->bg
->fs_info
;
1791 "stripe %llu has unrepaired metadata sector at %llu",
1793 stripe
->logical
+ (i
<< fs_info
->sectorsize_bits
));
1800 static void submit_initial_group_read(struct scrub_ctx
*sctx
,
1801 unsigned int first_slot
,
1802 unsigned int nr_stripes
)
1804 struct blk_plug plug
;
1806 ASSERT(first_slot
< SCRUB_TOTAL_STRIPES
);
1807 ASSERT(first_slot
+ nr_stripes
<= SCRUB_TOTAL_STRIPES
);
1809 scrub_throttle_dev_io(sctx
, sctx
->stripes
[0].dev
,
1810 btrfs_stripe_nr_to_offset(nr_stripes
));
1811 blk_start_plug(&plug
);
1812 for (int i
= 0; i
< nr_stripes
; i
++) {
1813 struct scrub_stripe
*stripe
= &sctx
->stripes
[first_slot
+ i
];
1815 /* Those stripes should be initialized. */
1816 ASSERT(test_bit(SCRUB_STRIPE_FLAG_INITIALIZED
, &stripe
->state
));
1817 scrub_submit_initial_read(sctx
, stripe
);
1819 blk_finish_plug(&plug
);
1822 static int flush_scrub_stripes(struct scrub_ctx
*sctx
)
1824 struct btrfs_fs_info
*fs_info
= sctx
->fs_info
;
1825 struct scrub_stripe
*stripe
;
1826 const int nr_stripes
= sctx
->cur_stripe
;
1832 ASSERT(test_bit(SCRUB_STRIPE_FLAG_INITIALIZED
, &sctx
->stripes
[0].state
));
1834 /* Submit the stripes which are populated but not submitted. */
1835 if (nr_stripes
% SCRUB_STRIPES_PER_GROUP
) {
1836 const int first_slot
= round_down(nr_stripes
, SCRUB_STRIPES_PER_GROUP
);
1838 submit_initial_group_read(sctx
, first_slot
, nr_stripes
- first_slot
);
1841 for (int i
= 0; i
< nr_stripes
; i
++) {
1842 stripe
= &sctx
->stripes
[i
];
1844 wait_event(stripe
->repair_wait
,
1845 test_bit(SCRUB_STRIPE_FLAG_REPAIR_DONE
, &stripe
->state
));
1848 /* Submit for dev-replace. */
1849 if (sctx
->is_dev_replace
) {
1851 * For dev-replace, if we know there is something wrong with
1852 * metadata, we should immediately abort.
1854 for (int i
= 0; i
< nr_stripes
; i
++) {
1855 if (stripe_has_metadata_error(&sctx
->stripes
[i
])) {
1860 for (int i
= 0; i
< nr_stripes
; i
++) {
1863 stripe
= &sctx
->stripes
[i
];
1865 ASSERT(stripe
->dev
== fs_info
->dev_replace
.srcdev
);
1867 bitmap_andnot(&good
, &stripe
->extent_sector_bitmap
,
1868 &stripe
->error_bitmap
, stripe
->nr_sectors
);
1869 scrub_write_sectors(sctx
, stripe
, good
, true);
1873 /* Wait for the above writebacks to finish. */
1874 for (int i
= 0; i
< nr_stripes
; i
++) {
1875 stripe
= &sctx
->stripes
[i
];
1877 wait_scrub_stripe_io(stripe
);
1878 spin_lock(&sctx
->stat_lock
);
1879 sctx
->stat
.last_physical
= stripe
->physical
+ stripe_length(stripe
);
1880 spin_unlock(&sctx
->stat_lock
);
1881 scrub_reset_stripe(stripe
);
1884 sctx
->cur_stripe
= 0;
static void raid56_scrub_wait_endio(struct bio *bio)
{
	complete(bio->bi_private);
}
1893 static int queue_scrub_stripe(struct scrub_ctx
*sctx
, struct btrfs_block_group
*bg
,
1894 struct btrfs_device
*dev
, int mirror_num
,
1895 u64 logical
, u32 length
, u64 physical
,
1896 u64
*found_logical_ret
)
1898 struct scrub_stripe
*stripe
;
1902 * There should always be one slot left, as caller filling the last
1903 * slot should flush them all.
1905 ASSERT(sctx
->cur_stripe
< SCRUB_TOTAL_STRIPES
);
1907 /* @found_logical_ret must be specified. */
1908 ASSERT(found_logical_ret
);
1910 stripe
= &sctx
->stripes
[sctx
->cur_stripe
];
1911 scrub_reset_stripe(stripe
);
1912 ret
= scrub_find_fill_first_stripe(bg
, &sctx
->extent_path
,
1913 &sctx
->csum_path
, dev
, physical
,
1914 mirror_num
, logical
, length
, stripe
);
1915 /* Either >0 as no more extents or <0 for error. */
1918 *found_logical_ret
= stripe
->logical
;
1921 /* We filled one group, submit it. */
1922 if (sctx
->cur_stripe
% SCRUB_STRIPES_PER_GROUP
== 0) {
1923 const int first_slot
= sctx
->cur_stripe
- SCRUB_STRIPES_PER_GROUP
;
1925 submit_initial_group_read(sctx
, first_slot
, SCRUB_STRIPES_PER_GROUP
);
1928 /* Last slot used, flush them all. */
1929 if (sctx
->cur_stripe
== SCRUB_TOTAL_STRIPES
)
1930 return flush_scrub_stripes(sctx
);
static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx,
				      struct btrfs_device *scrub_dev,
				      struct btrfs_block_group *bg,
				      struct btrfs_chunk_map *map,
				      u64 full_stripe_start)
{
	DECLARE_COMPLETION_ONSTACK(io_done);
	struct btrfs_fs_info *fs_info = sctx->fs_info;
	struct btrfs_raid_bio *rbio;
	struct btrfs_io_context *bioc = NULL;
	struct btrfs_path extent_path = { 0 };
	struct btrfs_path csum_path = { 0 };
	struct bio *bio;
	struct scrub_stripe *stripe;
	bool all_empty = true;
	const int data_stripes = nr_data_stripes(map);
	unsigned long extent_bitmap = 0;
	u64 length = btrfs_stripe_nr_to_offset(data_stripes);
	int ret;

	ASSERT(sctx->raid56_data_stripes);

	/*
	 * For data stripe search, we cannot reuse the same extent/csum paths,
	 * as the data stripe bytenr may be smaller than the previous extent.
	 * Thus we have to use our own extent/csum paths.
	 */
	extent_path.search_commit_root = 1;
	extent_path.skip_locking = 1;
	csum_path.search_commit_root = 1;
	csum_path.skip_locking = 1;

	for (int i = 0; i < data_stripes; i++) {
		int stripe_index;
		int rot;
		u64 physical;

		stripe = &sctx->raid56_data_stripes[i];
		rot = div_u64(full_stripe_start - bg->start,
			      data_stripes) >> BTRFS_STRIPE_LEN_SHIFT;
		stripe_index = (i + rot) % map->num_stripes;
		physical = map->stripes[stripe_index].physical +
			   btrfs_stripe_nr_to_offset(rot);

		scrub_reset_stripe(stripe);
		set_bit(SCRUB_STRIPE_FLAG_NO_REPORT, &stripe->state);
		ret = scrub_find_fill_first_stripe(bg, &extent_path, &csum_path,
				map->stripes[stripe_index].dev, physical, 1,
				full_stripe_start + btrfs_stripe_nr_to_offset(i),
				BTRFS_STRIPE_LEN, stripe);
		if (ret < 0)
			goto out;
		/*
		 * No extent in this data stripe, need to manually mark it
		 * initialized to make later read submission happy.
		 */
		if (ret > 0) {
			stripe->logical = full_stripe_start +
					  btrfs_stripe_nr_to_offset(i);
			stripe->dev = map->stripes[stripe_index].dev;
			stripe->mirror_num = 1;
			set_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state);
		}
	}

	/* Check if all data stripes are empty. */
	for (int i = 0; i < data_stripes; i++) {
		stripe = &sctx->raid56_data_stripes[i];
		if (!bitmap_empty(&stripe->extent_sector_bitmap, stripe->nr_sectors)) {
			all_empty = false;
			break;
		}
	}
	if (all_empty) {
		ret = 0;
		goto out;
	}

	for (int i = 0; i < data_stripes; i++) {
		stripe = &sctx->raid56_data_stripes[i];
		scrub_submit_initial_read(sctx, stripe);
	}
	for (int i = 0; i < data_stripes; i++) {
		stripe = &sctx->raid56_data_stripes[i];

		wait_event(stripe->repair_wait,
			   test_bit(SCRUB_STRIPE_FLAG_REPAIR_DONE, &stripe->state));
	}
	/* For now, no zoned support for RAID56. */
	ASSERT(!btrfs_is_zoned(sctx->fs_info));

	/*
	 * Now all data stripes are properly verified. Check if we have any
	 * unrepaired ones, and if so abort immediately or we could further
	 * corrupt the P/Q stripes.
	 *
	 * During the loop, also populate extent_bitmap.
	 */
	for (int i = 0; i < data_stripes; i++) {
		unsigned long error;

		stripe = &sctx->raid56_data_stripes[i];

		/*
		 * We should only check the errors where there is an extent,
		 * as we may hit an empty data stripe while it's missing.
		 */
		bitmap_and(&error, &stripe->error_bitmap,
			   &stripe->extent_sector_bitmap, stripe->nr_sectors);
		if (!bitmap_empty(&error, stripe->nr_sectors)) {
			btrfs_err(fs_info,
"unrepaired sectors detected, full stripe %llu data stripe %u errors %*pbl",
				  full_stripe_start, i, stripe->nr_sectors,
				  &error);
			ret = -EIO;
			goto out;
		}
		bitmap_or(&extent_bitmap, &extent_bitmap,
			  &stripe->extent_sector_bitmap, stripe->nr_sectors);
	}

	/* Now we can check and regenerate the P/Q stripe. */
	bio = bio_alloc(NULL, 1, REQ_OP_READ, GFP_NOFS);
	bio->bi_iter.bi_sector = full_stripe_start >> SECTOR_SHIFT;
	bio->bi_private = &io_done;
	bio->bi_end_io = raid56_scrub_wait_endio;

	btrfs_bio_counter_inc_blocked(fs_info);
	ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, full_stripe_start,
			      &length, &bioc, NULL, NULL);
	if (ret < 0) {
		btrfs_put_bioc(bioc);
		btrfs_bio_counter_dec(fs_info);
		goto out;
	}
	rbio = raid56_parity_alloc_scrub_rbio(bio, bioc, scrub_dev, &extent_bitmap,
				BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits);
	btrfs_put_bioc(bioc);
	if (!rbio) {
		ret = -ENOMEM;
		btrfs_bio_counter_dec(fs_info);
		goto out;
	}
	/* Use the recovered stripes as cache to avoid reading them from disk again. */
	for (int i = 0; i < data_stripes; i++) {
		stripe = &sctx->raid56_data_stripes[i];

		raid56_parity_cache_data_pages(rbio, stripe->pages,
				full_stripe_start + (i << BTRFS_STRIPE_LEN_SHIFT));
	}
	raid56_parity_submit_scrub_rbio(rbio);
	wait_for_completion_io(&io_done);
	ret = blk_status_to_errno(bio->bi_status);
	bio_put(bio);
	btrfs_bio_counter_dec(fs_info);

out:
	btrfs_release_path(&extent_path);
	btrfs_release_path(&csum_path);
	return ret;
}
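/*
 * Worked example of the rotation math in scrub_raid56_parity_stripe() above
 * (illustrative only, not part of the original source; assumes
 * BTRFS_STRIPE_LEN is 64K):
 *
 * RAID5 over 3 devices has 2 data stripes per full stripe.  For the full
 * stripe at bg->start + 128K:
 *   rot          = div_u64(128K, 2) >> BTRFS_STRIPE_LEN_SHIFT = 1
 *   stripe_index = (i + rot) % 3        ->  1 for i == 0, 2 for i == 1
 *   physical     = stripes[stripe_index].physical + 64K
 * i.e. the second full stripe starts one device slot further along, matching
 * the per-full-stripe rotation of the RAID56 layout.
 */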
/*
 * Scrub one range which can only have a simple mirror based profile.
 * (Including all ranges in SINGLE/DUP/RAID1/RAID1C*, and each stripe in
 *  RAID0/RAID10).
 *
 * Since we may need to handle a subset of a block group, we need the
 * @logical_start and @logical_length parameters.
 */
static int scrub_simple_mirror(struct scrub_ctx *sctx,
			       struct btrfs_block_group *bg,
			       struct btrfs_chunk_map *map,
			       u64 logical_start, u64 logical_length,
			       struct btrfs_device *device,
			       u64 physical, int mirror_num)
{
	struct btrfs_fs_info *fs_info = sctx->fs_info;
	const u64 logical_end = logical_start + logical_length;
	u64 cur_logical = logical_start;
	int ret = 0;

	/* The range must be inside the bg */
	ASSERT(logical_start >= bg->start && logical_end <= bg->start + bg->length);

	/* Go through each extent item inside the logical range */
	while (cur_logical < logical_end) {
		u64 found_logical = U64_MAX;
		u64 cur_physical = physical + cur_logical - logical_start;

		/* Canceled? */
		if (atomic_read(&fs_info->scrub_cancel_req) ||
		    atomic_read(&sctx->cancel_req)) {
			ret = -ECANCELED;
			break;
		}
		/* Paused? */
		if (atomic_read(&fs_info->scrub_pause_req)) {
			/* Push queued extents */
			scrub_blocked_if_needed(fs_info);
		}
		/* Block group removed? */
		spin_lock(&bg->lock);
		if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &bg->runtime_flags)) {
			spin_unlock(&bg->lock);
			ret = 0;
			break;
		}
		spin_unlock(&bg->lock);

		ret = queue_scrub_stripe(sctx, bg, device, mirror_num,
					 cur_logical, logical_end - cur_logical,
					 cur_physical, &found_logical);
		if (ret > 0) {
			/* No more extent, just update the accounting */
			spin_lock(&sctx->stat_lock);
			sctx->stat.last_physical = physical + logical_length;
			spin_unlock(&sctx->stat_lock);
			ret = 0;
			break;
		}
		if (ret < 0)
			break;

		/* queue_scrub_stripe() returned 0, @found_logical must be updated. */
		ASSERT(found_logical != U64_MAX);
		cur_logical = found_logical + BTRFS_STRIPE_LEN;

		/* Don't hold the CPU for too long */
		cond_resched();
	}
	return ret;
}
/* Calculate the full stripe length for simple stripe based profiles */
static u64 simple_stripe_full_stripe_len(const struct btrfs_chunk_map *map)
{
	ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 |
			    BTRFS_BLOCK_GROUP_RAID10));

	return btrfs_stripe_nr_to_offset(map->num_stripes / map->sub_stripes);
}
/* Get the logical bytenr for the stripe */
static u64 simple_stripe_get_logical(struct btrfs_chunk_map *map,
				     struct btrfs_block_group *bg,
				     int stripe_index)
{
	ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 |
			    BTRFS_BLOCK_GROUP_RAID10));
	ASSERT(stripe_index < map->num_stripes);

	/*
	 * (stripe_index / sub_stripes) gives how many data stripes we need to
	 * skip.
	 */
	return btrfs_stripe_nr_to_offset(stripe_index / map->sub_stripes) +
	       bg->start;
}
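/*
 * Worked example for simple_stripe_get_logical() above (illustrative only,
 * not part of the original source; assumes BTRFS_STRIPE_LEN is 64K):
 * for RAID10 with num_stripes = 4 and sub_stripes = 2, stripe_index 0 and 1
 * both map to logical bg->start, while stripe_index 2 and 3 map to
 * bg->start + 64K, since both devices of a mirrored pair hold the same
 * logical stripe.
 */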
/* Get the mirror number for the stripe */
static int simple_stripe_mirror_num(struct btrfs_chunk_map *map, int stripe_index)
{
	ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 |
			    BTRFS_BLOCK_GROUP_RAID10));
	ASSERT(stripe_index < map->num_stripes);

	/* For RAID0 it's fixed to 1, for RAID10 it alternates 1, 2, 1, 2, ... */
	return stripe_index % map->sub_stripes + 1;
}
static int scrub_simple_stripe(struct scrub_ctx *sctx,
			       struct btrfs_block_group *bg,
			       struct btrfs_chunk_map *map,
			       struct btrfs_device *device,
			       int stripe_index)
{
	const u64 logical_increment = simple_stripe_full_stripe_len(map);
	const u64 orig_logical = simple_stripe_get_logical(map, bg, stripe_index);
	const u64 orig_physical = map->stripes[stripe_index].physical;
	const int mirror_num = simple_stripe_mirror_num(map, stripe_index);
	u64 cur_logical = orig_logical;
	u64 cur_physical = orig_physical;
	int ret = 0;

	while (cur_logical < bg->start + bg->length) {
		/*
		 * Inside each stripe, RAID0 is just SINGLE, and RAID10 is
		 * just RAID1, so we can reuse scrub_simple_mirror() to scrub
		 * this stripe.
		 */
		ret = scrub_simple_mirror(sctx, bg, map, cur_logical,
					  BTRFS_STRIPE_LEN, device, cur_physical,
					  mirror_num);
		if (ret)
			return ret;
		/* Skip to next stripe which belongs to the target device */
		cur_logical += logical_increment;
		/* For physical offset, we just go to next stripe */
		cur_physical += BTRFS_STRIPE_LEN;
	}
	return ret;
}
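/*
 * Illustrative iteration of scrub_simple_stripe() above (a sketch, not part
 * of the original source): for RAID0 with num_stripes = 4 and sub_stripes = 1,
 * logical_increment is four stripe lengths.  So for stripe_index 1, with
 * len == BTRFS_STRIPE_LEN, the scrubbed (logical, physical) pairs advance as
 *   (bg->start + 1 * len, phys), (bg->start + 5 * len, phys + len), ...
 * and each len-sized range is handed to scrub_simple_mirror() with
 * mirror_num 1.
 */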
static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
					   struct btrfs_block_group *bg,
					   struct btrfs_chunk_map *map,
					   struct btrfs_device *scrub_dev,
					   int stripe_index)
{
	struct btrfs_fs_info *fs_info = sctx->fs_info;
	const u64 profile = map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK;
	const u64 chunk_logical = bg->start;
	int ret;
	int ret2;
	u64 physical = map->stripes[stripe_index].physical;
	const u64 dev_stripe_len = btrfs_calc_stripe_length(map);
	const u64 physical_end = physical + dev_stripe_len;
	u64 logical;
	u64 logic_end;
	/* The logical increment after finishing one stripe */
	u64 increment;
	/* Offset inside the chunk */
	u64 offset;
	u64 stripe_logical;
	int stop_loop = 0;

	/* Extent_path should be released by now. */
	ASSERT(sctx->extent_path.nodes[0] == NULL);

	scrub_blocked_if_needed(fs_info);

	if (sctx->is_dev_replace &&
	    btrfs_dev_is_sequential(sctx->wr_tgtdev, physical)) {
		mutex_lock(&sctx->wr_lock);
		sctx->write_pointer = physical;
		mutex_unlock(&sctx->wr_lock);
	}

	/* Prepare the extra data stripes used by RAID56. */
	if (profile & BTRFS_BLOCK_GROUP_RAID56_MASK) {
		ASSERT(sctx->raid56_data_stripes == NULL);

		sctx->raid56_data_stripes = kcalloc(nr_data_stripes(map),
						    sizeof(struct scrub_stripe),
						    GFP_KERNEL);
		if (!sctx->raid56_data_stripes) {
			ret = -ENOMEM;
			goto out;
		}
		for (int i = 0; i < nr_data_stripes(map); i++) {
			ret = init_scrub_stripe(fs_info,
						&sctx->raid56_data_stripes[i]);
			if (ret < 0)
				goto out;
			sctx->raid56_data_stripes[i].bg = bg;
			sctx->raid56_data_stripes[i].sctx = sctx;
		}
	}
	/*
	 * There used to be a big double loop to handle all profiles using the
	 * same routine, which grew larger and more gross over time.
	 *
	 * So here we handle each profile differently, so simpler profiles
	 * have simpler scrubbing functions.
	 */
	if (!(profile & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10 |
			 BTRFS_BLOCK_GROUP_RAID56_MASK))) {
		/*
		 * The above check rules out all complex profiles, the remaining
		 * profiles are SINGLE|DUP|RAID1|RAID1C*, which is simple
		 * mirrored duplication without striping.
		 *
		 * Only @physical and @mirror_num need to be calculated using
		 * @stripe_index.
		 */
		ret = scrub_simple_mirror(sctx, bg, map, bg->start, bg->length,
				scrub_dev, map->stripes[stripe_index].physical,
				stripe_index + 1);
		offset = 0;
		goto out;
	}
	if (profile & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) {
		ret = scrub_simple_stripe(sctx, bg, map, scrub_dev, stripe_index);
		offset = btrfs_stripe_nr_to_offset(stripe_index / map->sub_stripes);
		goto out;
	}

	/* Only RAID56 goes through the old code */
	ASSERT(map->type & BTRFS_BLOCK_GROUP_RAID56_MASK);
	ret = 0;

	/* Calculate the logical end of the stripe */
	get_raid56_logic_offset(physical_end, stripe_index,
				map, &logic_end, NULL);
	logic_end += chunk_logical;

	/* Initialize @offset in case we need to go to out: label */
	get_raid56_logic_offset(physical, stripe_index, map, &offset, NULL);
	increment = btrfs_stripe_nr_to_offset(nr_data_stripes(map));

	/*
	 * Due to the rotation, for RAID56 it's better to iterate each stripe
	 * using its physical offset.
	 */
	while (physical < physical_end) {
		ret = get_raid56_logic_offset(physical, stripe_index, map,
					      &logical, &stripe_logical);
		logical += chunk_logical;
		if (ret) {
			/* it is a parity stripe */
			stripe_logical += chunk_logical;
			ret = scrub_raid56_parity_stripe(sctx, scrub_dev, bg,
							 map, stripe_logical);
			spin_lock(&sctx->stat_lock);
			sctx->stat.last_physical = min(physical + BTRFS_STRIPE_LEN,
						       physical_end);
			spin_unlock(&sctx->stat_lock);
			if (ret)
				goto out;
			goto next;
		}

		/*
		 * Now we're at a data stripe, scrub each extent in the range.
		 *
		 * At this stage, if we ignore the repair part, inside each data
		 * stripe it is no different than the SINGLE profile.
		 * We can reuse scrub_simple_mirror() here, as the repair part
		 * is still based on @mirror_num.
		 */
		ret = scrub_simple_mirror(sctx, bg, map, logical, BTRFS_STRIPE_LEN,
					  scrub_dev, physical, 1);
		if (ret < 0)
			goto out;
next:
		logical += increment;
		physical += BTRFS_STRIPE_LEN;
		spin_lock(&sctx->stat_lock);
		if (stop_loop)
			sctx->stat.last_physical =
				map->stripes[stripe_index].physical + dev_stripe_len;
		else
			sctx->stat.last_physical = physical;
		spin_unlock(&sctx->stat_lock);
		if (stop_loop)
			break;
	}
out:
	ret2 = flush_scrub_stripes(sctx);
	if (!ret)
		ret = ret2;
	btrfs_release_path(&sctx->extent_path);
	btrfs_release_path(&sctx->csum_path);

	if (sctx->raid56_data_stripes) {
		for (int i = 0; i < nr_data_stripes(map); i++)
			release_scrub_stripe(&sctx->raid56_data_stripes[i]);
		kfree(sctx->raid56_data_stripes);
		sctx->raid56_data_stripes = NULL;
	}

	if (sctx->is_dev_replace && ret >= 0) {
		int ret2;

		ret2 = sync_write_pointer_for_zoned(sctx,
				chunk_logical + offset,
				map->stripes[stripe_index].physical,
				physical_end);
		if (ret2)
			ret = ret2;
	}

	return ret < 0 ? ret : 0;
}
static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
					  struct btrfs_block_group *bg,
					  struct btrfs_device *scrub_dev,
					  u64 dev_offset,
					  u64 dev_extent_len)
{
	struct btrfs_fs_info *fs_info = sctx->fs_info;
	struct btrfs_chunk_map *map;
	int i;
	int ret = 0;

	map = btrfs_find_chunk_map(fs_info, bg->start, bg->length);
	if (!map) {
		/*
		 * Might have been an unused block group deleted by the cleaner
		 * kthread or relocation.
		 */
		spin_lock(&bg->lock);
		if (!test_bit(BLOCK_GROUP_FLAG_REMOVED, &bg->runtime_flags))
			ret = -EINVAL;
		spin_unlock(&bg->lock);

		return ret;
	}
	if (map->start != bg->start)
		goto out;
	if (map->chunk_len < dev_extent_len)
		goto out;

	for (i = 0; i < map->num_stripes; ++i) {
		if (map->stripes[i].dev->bdev == scrub_dev->bdev &&
		    map->stripes[i].physical == dev_offset) {
			ret = scrub_stripe(sctx, bg, map, scrub_dev, i);
			if (ret)
				goto out;
		}
	}
out:
	btrfs_free_chunk_map(map);

	return ret;
}
static int finish_extent_writes_for_zoned(struct btrfs_root *root,
					  struct btrfs_block_group *cache)
{
	struct btrfs_fs_info *fs_info = cache->fs_info;

	if (!btrfs_is_zoned(fs_info))
		return 0;

	btrfs_wait_block_group_reservations(cache);
	btrfs_wait_nocow_writers(cache);
	btrfs_wait_ordered_roots(fs_info, U64_MAX, cache);

	return btrfs_commit_current_transaction(root);
}
static noinline_for_stack
int scrub_enumerate_chunks(struct scrub_ctx *sctx,
			   struct btrfs_device *scrub_dev, u64 start, u64 end)
{
	struct btrfs_dev_extent *dev_extent = NULL;
	struct btrfs_path *path;
	struct btrfs_fs_info *fs_info = sctx->fs_info;
	struct btrfs_root *root = fs_info->dev_root;
	u64 chunk_offset;
	int ret = 0;
	int ro_set;
	int slot;
	struct extent_buffer *l;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct btrfs_block_group *cache;
	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	path->reada = READA_FORWARD;
	path->search_commit_root = 1;
	path->skip_locking = 1;

	key.objectid = scrub_dev->devid;
	key.offset = 0ull;
	key.type = BTRFS_DEV_EXTENT_KEY;

	while (1) {
		u64 dev_extent_len;

		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
		if (ret < 0)
			break;
		if (ret > 0) {
			if (path->slots[0] >=
			    btrfs_header_nritems(path->nodes[0])) {
				ret = btrfs_next_leaf(root, path);
				if (ret < 0)
					break;
				if (ret > 0) {
					ret = 0;
					break;
				}
			} else {
				ret = 0;
			}
		}

		l = path->nodes[0];
		slot = path->slots[0];

		btrfs_item_key_to_cpu(l, &found_key, slot);

		if (found_key.objectid != scrub_dev->devid)
			break;

		if (found_key.type != BTRFS_DEV_EXTENT_KEY)
			break;

		if (found_key.offset >= end)
			break;

		if (found_key.offset < key.offset)
			break;

		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
		dev_extent_len = btrfs_dev_extent_length(l, dev_extent);

		if (found_key.offset + dev_extent_len <= start)
			goto skip;

		chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);

		/*
		 * Get a reference on the corresponding block group to prevent
		 * the chunk from going away while we scrub it.
		 */
		cache = btrfs_lookup_block_group(fs_info, chunk_offset);

		/*
		 * Some chunks are removed but not committed to disk yet,
		 * continue scrubbing.
		 */
		if (!cache)
			goto skip;

		ASSERT(cache->start <= chunk_offset);
		/*
		 * We are using the commit root to search for device extents, so
		 * that means we could have found a device extent item from a
		 * block group that was deleted in the current transaction. The
		 * logical start offset of the deleted block group, stored at
		 * @chunk_offset, might be part of the logical address range of
		 * a new block group (which uses different physical extents).
		 * In this case btrfs_lookup_block_group() has returned the new
		 * block group, and its start address is less than @chunk_offset.
		 *
		 * We skip such new block groups, because it's pointless to
		 * process them, as we won't find their extents because we search
		 * for them using the commit root of the extent tree. For a device
		 * replace it's also fine to skip it, we won't miss copying them
		 * to the target device because we have the write duplication
		 * setup through the regular write path (by btrfs_map_block()),
		 * and we have committed a transaction when we started the device
		 * replace, right after setting up the device replace state.
		 */
		if (cache->start < chunk_offset) {
			btrfs_put_block_group(cache);
			goto skip;
		}

		if (sctx->is_dev_replace && btrfs_is_zoned(fs_info)) {
			if (!test_bit(BLOCK_GROUP_FLAG_TO_COPY, &cache->runtime_flags)) {
				btrfs_put_block_group(cache);
				goto skip;
			}
		}

		/*
		 * Make sure that while we are scrubbing the corresponding block
		 * group doesn't get its logical address and its device extents
		 * reused for another block group, which can possibly be of a
		 * different type and different profile. We do this to prevent
		 * false error detections and crashes due to bogus attempts to
		 * repair extents.
		 */
		spin_lock(&cache->lock);
		if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &cache->runtime_flags)) {
			spin_unlock(&cache->lock);
			btrfs_put_block_group(cache);
			goto skip;
		}
		btrfs_freeze_block_group(cache);
		spin_unlock(&cache->lock);

		/*
		 * We need to call btrfs_inc_block_group_ro() with scrubs_paused,
		 * to avoid a deadlock caused by:
		 * btrfs_inc_block_group_ro()
		 * -> btrfs_wait_for_commit()
		 * -> btrfs_commit_transaction()
		 * -> btrfs_scrub_pause()
		 */
		scrub_pause_on(fs_info);

		/*
		 * Don't do chunk preallocation for scrub.
		 *
		 * This is especially important for SYSTEM bgs, or we can hit
		 * -EFBIG from btrfs_finish_chunk_alloc() like:
		 * 1. The only SYSTEM bg is marked RO.
		 *    Since SYSTEM bg is small, that's pretty common.
		 * 2. New SYSTEM bg will be allocated
		 *    Because the regular path will allocate a new chunk.
		 * 3. New SYSTEM bg is empty and will get cleaned up
		 *    Before cleanup really happens, it's marked RO again.
		 * 4. Empty SYSTEM bg gets scrubbed
		 *    We go back to 2.
		 *
		 * This can easily boost the amount of SYSTEM chunks if the
		 * cleaner thread can't be triggered fast enough, and use up all
		 * space of btrfs_super_block::sys_chunk_array.
		 *
		 * While for dev replace, we need to try our best to mark the
		 * block group RO, to prevent a race between:
		 * - Write duplication
		 *   Contains latest data
		 * - Scrub copy
		 *   Contains data from commit tree
		 *
		 * If the target block group is not marked RO, nocow writes can
		 * be overwritten by scrub copy, causing data corruption.
		 * So for dev-replace, it's not allowed to continue if a block
		 * group is not RO.
		 */
		ret = btrfs_inc_block_group_ro(cache, sctx->is_dev_replace);
		if (!ret && sctx->is_dev_replace) {
			ret = finish_extent_writes_for_zoned(root, cache);
			if (ret) {
				btrfs_dec_block_group_ro(cache);
				scrub_pause_off(fs_info);
				btrfs_put_block_group(cache);
				break;
			}
		}

		if (ret == 0) {
			ro_set = 1;
		} else if (ret == -ENOSPC && !sctx->is_dev_replace &&
			   !(cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK)) {
			/*
			 * btrfs_inc_block_group_ro() returns -ENOSPC when it
			 * failed in creating a new chunk for metadata.
			 * It is not a problem for scrub, because
			 * metadata is always cowed, and our scrub paused
			 * commit_transactions.
			 *
			 * For RAID56 chunks, we have to mark them read-only
			 * for scrub, as later we would use our own cache
			 * out of the RAID56 realm.
			 * Thus we want the RAID56 bg to be marked RO to
			 * prevent RMW from screwing up our cache.
			 */
			ro_set = 0;
		} else if (ret == -ETXTBSY) {
			btrfs_warn(fs_info,
		   "skipping scrub of block group %llu due to active swapfile",
				   cache->start);
			scrub_pause_off(fs_info);
			ret = 0;
			goto skip_unfreeze;
		} else {
			btrfs_warn(fs_info,
				   "failed setting block group ro: %d", ret);
			btrfs_unfreeze_block_group(cache);
			btrfs_put_block_group(cache);
			scrub_pause_off(fs_info);
			break;
		}

		/*
		 * Now the target block is marked RO, wait for nocow writes to
		 * finish before dev-replace.
		 * COW is fine, as COW never overwrites extents in the commit tree.
		 */
		if (sctx->is_dev_replace) {
			btrfs_wait_nocow_writers(cache);
			btrfs_wait_ordered_roots(fs_info, U64_MAX, cache);
		}

		scrub_pause_off(fs_info);
		down_write(&dev_replace->rwsem);
		dev_replace->cursor_right = found_key.offset + dev_extent_len;
		dev_replace->cursor_left = found_key.offset;
		dev_replace->item_needs_writeback = 1;
		up_write(&dev_replace->rwsem);

		ret = scrub_chunk(sctx, cache, scrub_dev, found_key.offset,
				  dev_extent_len);
		if (sctx->is_dev_replace &&
		    !btrfs_finish_block_group_to_copy(dev_replace->srcdev,
						      cache, found_key.offset))
			ro_set = 0;

		down_write(&dev_replace->rwsem);
		dev_replace->cursor_left = dev_replace->cursor_right;
		dev_replace->item_needs_writeback = 1;
		up_write(&dev_replace->rwsem);

		if (ro_set)
			btrfs_dec_block_group_ro(cache);

		/*
		 * We might have prevented the cleaner kthread from deleting
		 * this block group if it was already unused because we raced
		 * and set it to RO mode first. So add it back to the unused
		 * list, otherwise it might not ever be deleted unless a manual
		 * balance is triggered or it becomes used and unused again.
		 */
		spin_lock(&cache->lock);
		if (!test_bit(BLOCK_GROUP_FLAG_REMOVED, &cache->runtime_flags) &&
		    !cache->ro && cache->reserved == 0 && cache->used == 0) {
			spin_unlock(&cache->lock);
			if (btrfs_test_opt(fs_info, DISCARD_ASYNC))
				btrfs_discard_queue_work(&fs_info->discard_ctl,
							 cache);
			else
				btrfs_mark_bg_unused(cache);
		} else {
			spin_unlock(&cache->lock);
		}
skip_unfreeze:
		btrfs_unfreeze_block_group(cache);
		btrfs_put_block_group(cache);
		if (ret)
			break;
		if (sctx->is_dev_replace &&
		    atomic64_read(&dev_replace->num_write_errors) > 0) {
			ret = -EIO;
			break;
		}
		if (sctx->stat.malloc_errors > 0) {
			ret = -ENOMEM;
			break;
		}
skip:
		key.offset = found_key.offset + dev_extent_len;
		btrfs_release_path(path);
	}

	btrfs_free_path(path);
static int scrub_one_super(struct scrub_ctx *sctx, struct btrfs_device *dev,
			   struct page *page, u64 physical, u64 generation)
{
	struct btrfs_fs_info *fs_info = sctx->fs_info;
	struct bio_vec bvec;
	struct bio bio;
	struct btrfs_super_block *sb = page_address(page);
	int ret;

	bio_init(&bio, dev->bdev, &bvec, 1, REQ_OP_READ);
	bio.bi_iter.bi_sector = physical >> SECTOR_SHIFT;
	__bio_add_page(&bio, page, BTRFS_SUPER_INFO_SIZE, 0);
	ret = submit_bio_wait(&bio);
	bio_uninit(&bio);

	if (ret < 0)
		return ret;
	ret = btrfs_check_super_csum(fs_info, sb);
	if (ret != 0) {
		btrfs_err_rl(fs_info,
			"super block at physical %llu devid %llu has bad csum",
			physical, dev->devid);
		return -EIO;
	}
	if (btrfs_super_generation(sb) != generation) {
		btrfs_err_rl(fs_info,
"super block at physical %llu devid %llu has bad generation %llu expect %llu",
			     physical, dev->devid,
			     btrfs_super_generation(sb), generation);
		return -EUCLEAN;
	}

	return btrfs_validate_super(fs_info, sb, -1);
}
static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
					   struct btrfs_device *scrub_dev)
{
	int i;
	u64 bytenr;
	u64 gen;
	int ret = 0;
	struct page *page;
	struct btrfs_fs_info *fs_info = sctx->fs_info;

	if (BTRFS_FS_ERROR(fs_info))
		return -EROFS;

	page = alloc_page(GFP_KERNEL);
	if (!page) {
		spin_lock(&sctx->stat_lock);
		sctx->stat.malloc_errors++;
		spin_unlock(&sctx->stat_lock);
		return -ENOMEM;
	}

	/* Seed devices of a new filesystem have their own generation. */
	if (scrub_dev->fs_devices != fs_info->fs_devices)
		gen = scrub_dev->generation;
	else
		gen = btrfs_get_last_trans_committed(fs_info);

	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
		ret = btrfs_sb_log_location(scrub_dev, i, 0, &bytenr);
		if (ret == -ENOENT)
			break;

		if (ret) {
			spin_lock(&sctx->stat_lock);
			sctx->stat.super_errors++;
			spin_unlock(&sctx->stat_lock);
			continue;
		}

		if (bytenr + BTRFS_SUPER_INFO_SIZE >
		    scrub_dev->commit_total_bytes)
			break;
		if (!btrfs_check_super_location(scrub_dev, bytenr))
			continue;

		ret = scrub_one_super(sctx, scrub_dev, page, bytenr, gen);
		if (ret) {
			spin_lock(&sctx->stat_lock);
			sctx->stat.super_errors++;
			spin_unlock(&sctx->stat_lock);
		}
	}
static void scrub_workers_put(struct btrfs_fs_info *fs_info)
{
	if (refcount_dec_and_mutex_lock(&fs_info->scrub_workers_refcnt,
					&fs_info->scrub_lock)) {
		struct workqueue_struct *scrub_workers = fs_info->scrub_workers;

		fs_info->scrub_workers = NULL;
		mutex_unlock(&fs_info->scrub_lock);

		if (scrub_workers)
			destroy_workqueue(scrub_workers);
	}
}
/*
 * Get a reference count on fs_info->scrub_workers. Start the workers if
 * necessary.
 */
static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info)
{
	struct workqueue_struct *scrub_workers = NULL;
	unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND;
	int max_active = fs_info->thread_pool_size;
	int ret = -ENOMEM;

	if (refcount_inc_not_zero(&fs_info->scrub_workers_refcnt))
		return 0;

	scrub_workers = alloc_workqueue("btrfs-scrub", flags, max_active);
	if (!scrub_workers)
		return -ENOMEM;

	mutex_lock(&fs_info->scrub_lock);
	if (refcount_read(&fs_info->scrub_workers_refcnt) == 0) {
		ASSERT(fs_info->scrub_workers == NULL);
		fs_info->scrub_workers = scrub_workers;
		refcount_set(&fs_info->scrub_workers_refcnt, 1);
		mutex_unlock(&fs_info->scrub_lock);
		return 0;
	}
	/* Other thread raced in and created the workers for us */
	refcount_inc(&fs_info->scrub_workers_refcnt);
	mutex_unlock(&fs_info->scrub_lock);

	ret = 0;

	destroy_workqueue(scrub_workers);
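/*
 * Sketch of the refcount pattern used by scrub_workers_get()/_put() above
 * (illustrative summary, not part of the original source): the fast path bumps
 * scrub_workers_refcnt without taking the lock; only the thread that still
 * observes a zero refcount under scrub_lock installs its freshly allocated
 * workqueue, any loser of that race destroys its own allocation, and the last
 * scrub_workers_put() tears the shared workqueue down.
 */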
int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
		    u64 end, struct btrfs_scrub_progress *progress,
		    int readonly, int is_dev_replace)
{
	struct btrfs_dev_lookup_args args = { .devid = devid };
	struct scrub_ctx *sctx;
	int ret;
	struct btrfs_device *dev;
	unsigned int nofs_flag;
	bool need_commit = false;

	if (btrfs_fs_closing(fs_info))
		return -EAGAIN;

	/* At mount time we have ensured nodesize is in the range of [4K, 64K]. */
	ASSERT(fs_info->nodesize <= BTRFS_STRIPE_LEN);

	/*
	 * SCRUB_MAX_SECTORS_PER_BLOCK is calculated using the largest possible
	 * value (max nodesize / min sectorsize), thus nodesize should always
	 * be fine.
	 */
	ASSERT(fs_info->nodesize <=
	       SCRUB_MAX_SECTORS_PER_BLOCK << fs_info->sectorsize_bits);

	/* Allocate outside of device_list_mutex */
	sctx = scrub_setup_ctx(fs_info, is_dev_replace);
	if (IS_ERR(sctx))
		return PTR_ERR(sctx);

	ret = scrub_workers_get(fs_info);
	if (ret)
		goto out_free_ctx;

	mutex_lock(&fs_info->fs_devices->device_list_mutex);
	dev = btrfs_find_device(fs_info->fs_devices, &args);
	if (!dev || (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) &&
		     !is_dev_replace)) {
		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
		ret = -ENODEV;
		goto out;
	}

	if (!is_dev_replace && !readonly &&
	    !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) {
		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
		btrfs_err_in_rcu(fs_info,
			"scrub on devid %llu: filesystem on %s is not writable",
				 devid, btrfs_dev_name(dev));
		ret = -EROFS;
		goto out;
	}

	mutex_lock(&fs_info->scrub_lock);
	if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
	    test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &dev->dev_state)) {
		mutex_unlock(&fs_info->scrub_lock);
		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
		ret = -EIO;
		goto out;
	}

	down_read(&fs_info->dev_replace.rwsem);
	if (dev->scrub_ctx ||
	    (!is_dev_replace &&
	     btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) {
		up_read(&fs_info->dev_replace.rwsem);
		mutex_unlock(&fs_info->scrub_lock);
		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
		ret = -EINPROGRESS;
		goto out;
	}
	up_read(&fs_info->dev_replace.rwsem);

	sctx->readonly = readonly;
	dev->scrub_ctx = sctx;
	mutex_unlock(&fs_info->fs_devices->device_list_mutex);

	/*
	 * By checking @scrub_pause_req here, we can avoid a race between
	 * committing a transaction and scrubbing.
	 */
	__scrub_blocked_if_needed(fs_info);
	atomic_inc(&fs_info->scrubs_running);
	mutex_unlock(&fs_info->scrub_lock);

	/*
	 * In order to avoid deadlock with reclaim when there is a transaction
	 * trying to pause scrub, make sure we use GFP_NOFS for all the
	 * allocations done at btrfs_scrub_sectors() and scrub_sectors_for_parity()
	 * invoked by our callees. The pausing request is done when the
	 * transaction commit starts, and it blocks the transaction until scrub
	 * is paused (done at specific points at scrub_stripe() or right above
	 * before incrementing fs_info->scrubs_running).
	 */
	nofs_flag = memalloc_nofs_save();
	if (!is_dev_replace) {
		u64 old_super_errors;

		spin_lock(&sctx->stat_lock);
		old_super_errors = sctx->stat.super_errors;
		spin_unlock(&sctx->stat_lock);

		btrfs_info(fs_info, "scrub: started on devid %llu", devid);
		/*
		 * By holding the device list mutex, we can
		 * kick off writing supers in log tree sync.
		 */
		mutex_lock(&fs_info->fs_devices->device_list_mutex);
		ret = scrub_supers(sctx, dev);
		mutex_unlock(&fs_info->fs_devices->device_list_mutex);

		spin_lock(&sctx->stat_lock);
		/*
		 * Super block errors found, but we can not commit a transaction
		 * in the current context, since btrfs_commit_transaction() needs
		 * to pause the current running scrub (held by ourselves).
		 */
		if (sctx->stat.super_errors > old_super_errors && !sctx->readonly)
			need_commit = true;
		spin_unlock(&sctx->stat_lock);
	}

	if (!ret)
		ret = scrub_enumerate_chunks(sctx, dev, start, end);
	memalloc_nofs_restore(nofs_flag);

	atomic_dec(&fs_info->scrubs_running);
	wake_up(&fs_info->scrub_pause_wait);

	if (progress)
		memcpy(progress, &sctx->stat, sizeof(*progress));

	if (!is_dev_replace)
		btrfs_info(fs_info, "scrub: %s on devid %llu with status: %d",
			ret ? "not finished" : "finished", devid, ret);

	mutex_lock(&fs_info->scrub_lock);
	dev->scrub_ctx = NULL;
	mutex_unlock(&fs_info->scrub_lock);

	scrub_workers_put(fs_info);
	scrub_put_ctx(sctx);

	/*
	 * We found some super block errors before, now try to force a
	 * transaction commit, as scrub has finished.
	 */
	if (need_commit) {
		struct btrfs_trans_handle *trans;

		trans = btrfs_start_transaction(fs_info->tree_root, 0);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			btrfs_err(fs_info,
	"scrub: failed to start transaction to fix super block errors: %d", ret);
			return ret;
		}
		ret = btrfs_commit_transaction(trans);
		if (ret < 0)
			btrfs_err(fs_info,
	"scrub: failed to commit transaction to fix super block errors: %d", ret);
	}
	return ret;
out:
	scrub_workers_put(fs_info);
out_free_ctx:
	scrub_free_ctx(sctx);
void btrfs_scrub_pause(struct btrfs_fs_info *fs_info)
{
	mutex_lock(&fs_info->scrub_lock);
	atomic_inc(&fs_info->scrub_pause_req);
	while (atomic_read(&fs_info->scrubs_paused) !=
	       atomic_read(&fs_info->scrubs_running)) {
		mutex_unlock(&fs_info->scrub_lock);
		wait_event(fs_info->scrub_pause_wait,
			   atomic_read(&fs_info->scrubs_paused) ==
			   atomic_read(&fs_info->scrubs_running));
		mutex_lock(&fs_info->scrub_lock);
	}
	mutex_unlock(&fs_info->scrub_lock);
}
void btrfs_scrub_continue(struct btrfs_fs_info *fs_info)
{
	atomic_dec(&fs_info->scrub_pause_req);
	wake_up(&fs_info->scrub_pause_wait);
}
int btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)
{
	mutex_lock(&fs_info->scrub_lock);
	if (!atomic_read(&fs_info->scrubs_running)) {
		mutex_unlock(&fs_info->scrub_lock);
		return -ENOTCONN;
	}

	atomic_inc(&fs_info->scrub_cancel_req);
	while (atomic_read(&fs_info->scrubs_running)) {
		mutex_unlock(&fs_info->scrub_lock);
		wait_event(fs_info->scrub_pause_wait,
			   atomic_read(&fs_info->scrubs_running) == 0);
		mutex_lock(&fs_info->scrub_lock);
	}
	atomic_dec(&fs_info->scrub_cancel_req);
	mutex_unlock(&fs_info->scrub_lock);
int btrfs_scrub_cancel_dev(struct btrfs_device *dev)
{
	struct btrfs_fs_info *fs_info = dev->fs_info;
	struct scrub_ctx *sctx;

	mutex_lock(&fs_info->scrub_lock);
	sctx = dev->scrub_ctx;
	if (!sctx) {
		mutex_unlock(&fs_info->scrub_lock);
		return -ENOTCONN;
	}
	atomic_inc(&sctx->cancel_req);
	while (dev->scrub_ctx) {
		mutex_unlock(&fs_info->scrub_lock);
		wait_event(fs_info->scrub_pause_wait,
			   dev->scrub_ctx == NULL);
		mutex_lock(&fs_info->scrub_lock);
	}
	mutex_unlock(&fs_info->scrub_lock);
int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid,
			 struct btrfs_scrub_progress *progress)
{
	struct btrfs_dev_lookup_args args = { .devid = devid };
	struct btrfs_device *dev;
	struct scrub_ctx *sctx = NULL;

	mutex_lock(&fs_info->fs_devices->device_list_mutex);
	dev = btrfs_find_device(fs_info->fs_devices, &args);
	if (dev)
		sctx = dev->scrub_ctx;
	if (sctx)
		memcpy(progress, &sctx->stat, sizeof(*progress));
	mutex_unlock(&fs_info->fs_devices->device_list_mutex);

	return dev ? (sctx ? 0 : -ENOTCONN) : -ENODEV;
}