// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2011, 2012 STRATO.  All rights reserved.
 */

#include <linux/blkdev.h>
#include <linux/ratelimit.h>
#include <linux/sched/mm.h>
#include <crypto/hash.h>
#include "ordered-data.h"
#include "transaction.h"
#include "extent_io.h"
#include "dev-replace.h"
#include "block-group.h"
#include "accessors.h"
#include "file-item.h"
#include "raid-stripe-tree.h"
/*
 * This is only the first step towards a full-featured scrub. It reads all
 * extents and super blocks and verifies the checksums. In case a bad checksum
 * is found or the extent cannot be read, good data will be written back if
 * any can be found.
 *
 * Future enhancements:
 *  - In case an unrepairable extent is encountered, track which files are
 *    affected and report them
 *  - track and record media errors, throw out bad devices
 *  - add a mode to also read unallocated space
 */
/*
 * The following value only influences the performance.
 *
 * This determines how many stripes would be submitted in one go,
 * which is 512KiB (BTRFS_STRIPE_LEN * SCRUB_STRIPES_PER_GROUP).
 */
#define SCRUB_STRIPES_PER_GROUP		8

/*
 * How many groups we have for each sctx.
 *
 * This would be 8M per device, the same value as the old scrub in-flight bios
 * size limit.
 */
#define SCRUB_GROUPS_PER_SCTX		16

#define SCRUB_TOTAL_STRIPES		(SCRUB_GROUPS_PER_SCTX * SCRUB_STRIPES_PER_GROUP)

/*
 * The following value times PAGE_SIZE needs to be large enough to match the
 * largest node/leaf/sector size that shall be supported.
 */
#define SCRUB_MAX_SECTORS_PER_BLOCK	(BTRFS_MAX_METADATA_BLOCKSIZE / SZ_4K)
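/*
 * Illustrative arithmetic (not part of the original source): with
 * BTRFS_STRIPE_LEN = 64KiB, one group is 8 * 64KiB = 512KiB, and one sctx
 * holds 16 * 8 = 128 stripes, i.e. 128 * 64KiB = 8MiB in flight per device.
 */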
/* Represent one sector and its needed info to verify the content. */
struct scrub_sector_verification {
	bool is_metadata;

	union {
		/*
		 * Csum pointer for data csum verification. Should point to a
		 * sector csum inside scrub_stripe::csums.
		 * NULL if this data sector has no csum.
		 */
		u8 *csum;
		/*
		 * Extra info for metadata verification. All sectors inside a
		 * tree block share the same generation.
		 */
		u64 generation;
	};
};
enum scrub_stripe_flags {
	/* Set when @mirror_num, @dev, @physical and @logical are set. */
	SCRUB_STRIPE_FLAG_INITIALIZED,

	/* Set when the read-repair is finished. */
	SCRUB_STRIPE_FLAG_REPAIR_DONE,

	/*
	 * Set for data stripes if it's triggered from P/Q stripe.
	 * During such scrub, we should not report errors in data stripes, nor
	 * update the accounting.
	 */
	SCRUB_STRIPE_FLAG_NO_REPORT,
};
#define SCRUB_STRIPE_PAGES		(BTRFS_STRIPE_LEN / PAGE_SIZE)

/*
 * Represent one contiguous range with a length of BTRFS_STRIPE_LEN.
 */
struct scrub_stripe {
	struct scrub_ctx *sctx;
	struct btrfs_block_group *bg;

	struct page *pages[SCRUB_STRIPE_PAGES];
	struct scrub_sector_verification *sectors;

	struct btrfs_device *dev;
	u64 logical;
	u64 physical;

	u16 mirror_num;

	/* Should be BTRFS_STRIPE_LEN / sectorsize. */
	u16 nr_sectors;

	/*
	 * How many data/meta extents are in this stripe. Only for scrub status
	 * reporting purposes.
	 */
	u16 nr_data_extents;
	u16 nr_meta_extents;

	atomic_t pending_io;
	wait_queue_head_t io_wait;
	wait_queue_head_t repair_wait;

	/*
	 * Indicate the states of the stripe. Bits are defined in
	 * scrub_stripe_flags enum.
	 */
	unsigned long state;

	/* Indicate which sectors are covered by extent items. */
	unsigned long extent_sector_bitmap;

	/*
	 * The errors hit during the initial read of the stripe.
	 *
	 * Would be utilized for error reporting and repair.
	 *
	 * The remaining init_nr_* records the number of errors hit, only used
	 * by error reporting.
	 */
	unsigned long init_error_bitmap;
	unsigned int init_nr_io_errors;
	unsigned int init_nr_csum_errors;
	unsigned int init_nr_meta_errors;

	/*
	 * The following error bitmaps are all for the current status.
	 * Every time we submit a new read, these bitmaps may be updated.
	 *
	 * error_bitmap = io_error_bitmap | csum_error_bitmap | meta_error_bitmap;
	 *
	 * IO and csum errors can happen for both metadata and data.
	 */
	unsigned long error_bitmap;
	unsigned long io_error_bitmap;
	unsigned long csum_error_bitmap;
	unsigned long meta_error_bitmap;

	/* For writeback (repair or replace) error reporting. */
	unsigned long write_error_bitmap;

	/* Writeback can be concurrent, thus we need to protect the bitmap. */
	spinlock_t write_error_lock;

	/*
	 * Checksum for the whole stripe if this stripe is inside a data block
	 * group.
	 */
	u8 *csums;

	struct work_struct work;
};
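/*
 * Illustrative example (an assumption for clarity, not from the original
 * source): with a 4KiB sector size a stripe has 16 sectors, so the bitmaps
 * above use the low 16 bits. If sector 2 failed its data csum and sector 5
 * hit an IO error, then:
 *
 *	csum_error_bitmap = 0b0000000000000100
 *	io_error_bitmap   = 0b0000000000100000
 *	error_bitmap      = 0b0000000000100100 (the OR of the per-type bitmaps)
 */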
struct scrub_ctx {
	struct scrub_stripe	stripes[SCRUB_TOTAL_STRIPES];
	struct scrub_stripe	*raid56_data_stripes;
	struct btrfs_fs_info	*fs_info;
	struct btrfs_path	extent_path;
	struct btrfs_path	csum_path;
	int			first_free;
	int			cur_stripe;
	atomic_t		cancel_req;
	int			readonly;

	/* State of IO submission throttling affecting the associated device */
	ktime_t			throttle_deadline;
	u64			throttle_sent;

	int			is_dev_replace;
	u64			write_pointer;

	struct mutex		wr_lock;
	struct btrfs_device	*wr_tgtdev;

	struct btrfs_scrub_progress stat;
	spinlock_t		stat_lock;

	/*
	 * Use a ref counter to avoid use-after-free issues. Scrub workers
	 * decrement bios_in_flight and workers_pending and then do a wakeup
	 * on the list_wait wait queue. We must ensure the main scrub task
	 * doesn't free the scrub context before or while the workers are
	 * doing the wakeup() call.
	 */
	refcount_t		refs;
};
struct scrub_warning {
	struct btrfs_path	*path;
	u64			extent_item_size;
	const char		*errstr;
	u64			physical;
	u64			logical;
	struct btrfs_device	*dev;
};
static void release_scrub_stripe(struct scrub_stripe *stripe)
{
	if (!stripe)
		return;

	for (int i = 0; i < SCRUB_STRIPE_PAGES; i++) {
		if (stripe->pages[i])
			__free_page(stripe->pages[i]);
		stripe->pages[i] = NULL;
	}
	kfree(stripe->sectors);
	kfree(stripe->csums);
	stripe->sectors = NULL;
	stripe->csums = NULL;
}
static int init_scrub_stripe(struct btrfs_fs_info *fs_info,
			     struct scrub_stripe *stripe)
{
	int ret;

	memset(stripe, 0, sizeof(*stripe));

	stripe->nr_sectors = BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits;

	init_waitqueue_head(&stripe->io_wait);
	init_waitqueue_head(&stripe->repair_wait);
	atomic_set(&stripe->pending_io, 0);
	spin_lock_init(&stripe->write_error_lock);

	ret = btrfs_alloc_page_array(SCRUB_STRIPE_PAGES, stripe->pages, false);
	if (ret < 0)
		goto error;

	stripe->sectors = kcalloc(stripe->nr_sectors,
				  sizeof(struct scrub_sector_verification),
				  GFP_KERNEL);
	if (!stripe->sectors)
		goto error;

	stripe->csums = kcalloc(BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits,
				fs_info->csum_size, GFP_KERNEL);
	if (!stripe->csums)
		goto error;
	return 0;
error:
	release_scrub_stripe(stripe);
	return -ENOMEM;
}
static void wait_scrub_stripe_io(struct scrub_stripe *stripe)
{
	wait_event(stripe->io_wait, atomic_read(&stripe->pending_io) == 0);
}

static void scrub_put_ctx(struct scrub_ctx *sctx);
static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
{
	while (atomic_read(&fs_info->scrub_pause_req)) {
		mutex_unlock(&fs_info->scrub_lock);
		wait_event(fs_info->scrub_pause_wait,
			   atomic_read(&fs_info->scrub_pause_req) == 0);
		mutex_lock(&fs_info->scrub_lock);
	}
}

static void scrub_pause_on(struct btrfs_fs_info *fs_info)
{
	atomic_inc(&fs_info->scrubs_paused);
	wake_up(&fs_info->scrub_pause_wait);
}

static void scrub_pause_off(struct btrfs_fs_info *fs_info)
{
	mutex_lock(&fs_info->scrub_lock);
	__scrub_blocked_if_needed(fs_info);
	atomic_dec(&fs_info->scrubs_paused);
	mutex_unlock(&fs_info->scrub_lock);

	wake_up(&fs_info->scrub_pause_wait);
}

static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
{
	scrub_pause_on(fs_info);
	scrub_pause_off(fs_info);
}
static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
{
	int i;

	if (!sctx)
		return;

	for (i = 0; i < SCRUB_TOTAL_STRIPES; i++)
		release_scrub_stripe(&sctx->stripes[i]);

	kvfree(sctx);
}

static void scrub_put_ctx(struct scrub_ctx *sctx)
{
	if (refcount_dec_and_test(&sctx->refs))
		scrub_free_ctx(sctx);
}
static noinline_for_stack struct scrub_ctx *scrub_setup_ctx(
		struct btrfs_fs_info *fs_info, int is_dev_replace)
{
	struct scrub_ctx *sctx;
	int i;
	int ret;

	/*
	 * Since sctx has inline 128 stripes, it can go beyond 64K easily. Use
	 * kvzalloc().
	 */
	sctx = kvzalloc(sizeof(*sctx), GFP_KERNEL);
	if (!sctx)
		goto fail;
	refcount_set(&sctx->refs, 1);
	sctx->is_dev_replace = is_dev_replace;
	sctx->fs_info = fs_info;
	sctx->extent_path.search_commit_root = 1;
	sctx->extent_path.skip_locking = 1;
	sctx->csum_path.search_commit_root = 1;
	sctx->csum_path.skip_locking = 1;
	for (i = 0; i < SCRUB_TOTAL_STRIPES; i++) {
		ret = init_scrub_stripe(fs_info, &sctx->stripes[i]);
		if (ret < 0)
			goto fail;
		sctx->stripes[i].sctx = sctx;
	}
	sctx->first_free = 0;
	atomic_set(&sctx->cancel_req, 0);

	spin_lock_init(&sctx->stat_lock);
	sctx->throttle_deadline = 0;

	mutex_init(&sctx->wr_lock);
	if (is_dev_replace) {
		WARN_ON(!fs_info->dev_replace.tgtdev);
		sctx->wr_tgtdev = fs_info->dev_replace.tgtdev;
	}

	return sctx;

fail:
	scrub_free_ctx(sctx);
	return ERR_PTR(-ENOMEM);
}
static int scrub_print_warning_inode(u64 inum, u64 offset, u64 num_bytes,
				     u64 root, void *warn_ctx)
{
	u32 nlink;
	int ret;
	int i;
	unsigned nofs_flag;
	struct extent_buffer *eb;
	struct btrfs_inode_item *inode_item;
	struct scrub_warning *swarn = warn_ctx;
	struct btrfs_fs_info *fs_info = swarn->dev->fs_info;
	struct inode_fs_paths *ipath = NULL;
	struct btrfs_root *local_root;
	struct btrfs_key key;

	local_root = btrfs_get_fs_root(fs_info, root, true);
	if (IS_ERR(local_root)) {
		ret = PTR_ERR(local_root);
		goto err;
	}

	/*
	 * This makes the path point to (inum INODE_ITEM ioff).
	 */
	key.objectid = inum;
	key.type = BTRFS_INODE_ITEM_KEY;
	key.offset = 0;

	ret = btrfs_search_slot(NULL, local_root, &key, swarn->path, 0, 0);
	if (ret) {
		btrfs_put_root(local_root);
		btrfs_release_path(swarn->path);
		goto err;
	}

	eb = swarn->path->nodes[0];
	inode_item = btrfs_item_ptr(eb, swarn->path->slots[0],
				    struct btrfs_inode_item);
	nlink = btrfs_inode_nlink(eb, inode_item);
	btrfs_release_path(swarn->path);

	/*
	 * init_ipath() might indirectly call vmalloc, or use GFP_KERNEL. Scrub
	 * uses GFP_NOFS in this context, so we keep it consistent but it does
	 * not seem to be strictly necessary.
	 */
	nofs_flag = memalloc_nofs_save();
	ipath = init_ipath(4096, local_root, swarn->path);
	memalloc_nofs_restore(nofs_flag);
	if (IS_ERR(ipath)) {
		btrfs_put_root(local_root);
		ret = PTR_ERR(ipath);
		ipath = NULL;
		goto err;
	}
	ret = paths_from_inode(inum, ipath);
	if (ret < 0)
		goto err;

	/*
	 * We deliberately ignore the bit ipath might have been too small to
	 * hold all of the paths here.
	 */
	for (i = 0; i < ipath->fspath->elem_cnt; ++i)
		btrfs_warn_in_rcu(fs_info,
"%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu, length %u, links %u (path: %s)",
				  swarn->errstr, swarn->logical,
				  btrfs_dev_name(swarn->dev),
				  swarn->physical,
				  root, inum, offset,
				  fs_info->sectorsize, nlink,
				  (char *)(unsigned long)ipath->fspath->val[i]);

	btrfs_put_root(local_root);
	free_ipath(ipath);
	return 0;

err:
	btrfs_warn_in_rcu(fs_info,
"%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu: path resolving failed with ret=%d",
			  swarn->errstr, swarn->logical,
			  btrfs_dev_name(swarn->dev),
			  swarn->physical,
			  root, inum, offset, ret);

	free_ipath(ipath);
	return 0;
}
static void scrub_print_common_warning(const char *errstr, struct btrfs_device *dev,
				       bool is_super, u64 logical, u64 physical)
{
	struct btrfs_fs_info *fs_info = dev->fs_info;
	struct btrfs_path *path;
	struct btrfs_key found_key;
	struct extent_buffer *eb;
	struct btrfs_extent_item *ei;
	struct scrub_warning swarn;
	u64 flags = 0;
	u32 item_size;
	int ret;

	/* Super block error, no need to search extent tree. */
	if (is_super) {
		btrfs_warn_in_rcu(fs_info, "%s on device %s, physical %llu",
				  errstr, btrfs_dev_name(dev), physical);
		return;
	}
	path = btrfs_alloc_path();
	if (!path)
		return;

	swarn.physical = physical;
	swarn.logical = logical;
	swarn.errstr = errstr;
	swarn.dev = NULL;

	ret = extent_from_logical(fs_info, swarn.logical, path, &found_key,
				  &flags);
	if (ret < 0)
		goto out;

	swarn.extent_item_size = found_key.offset;

	eb = path->nodes[0];
	ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
	item_size = btrfs_item_size(eb, path->slots[0]);

	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
		unsigned long ptr = 0;
		u8 ref_level;
		u64 ref_root;

		while (true) {
			ret = tree_backref_for_extent(&ptr, eb, &found_key, ei,
						      item_size, &ref_root,
						      &ref_level);
			if (ret < 0) {
				btrfs_warn(fs_info,
				"failed to resolve tree backref for logical %llu: %d",
					   swarn.logical, ret);
				break;
			}
			if (ret > 0)
				break;
			btrfs_warn_in_rcu(fs_info,
"%s at logical %llu on dev %s, physical %llu: metadata %s (level %d) in tree %llu",
				errstr, swarn.logical, btrfs_dev_name(dev),
				swarn.physical, (ref_level ? "node" : "leaf"),
				ref_level, ref_root);
		}
		btrfs_release_path(path);
	} else {
		struct btrfs_backref_walk_ctx ctx = { 0 };

		btrfs_release_path(path);

		ctx.bytenr = found_key.objectid;
		ctx.extent_item_pos = swarn.logical - found_key.objectid;
		ctx.fs_info = fs_info;

		swarn.path = path;
		swarn.dev = dev;

		iterate_extent_inodes(&ctx, true, scrub_print_warning_inode, &swarn);
	}

out:
	btrfs_free_path(path);
}
static int fill_writer_pointer_gap(struct scrub_ctx *sctx, u64 physical)
{
	int ret = 0;
	u64 length;

	if (!btrfs_is_zoned(sctx->fs_info))
		return 0;

	if (!btrfs_dev_is_sequential(sctx->wr_tgtdev, physical))
		return 0;

	if (sctx->write_pointer < physical) {
		length = physical - sctx->write_pointer;

		ret = btrfs_zoned_issue_zeroout(sctx->wr_tgtdev,
						sctx->write_pointer, length);
		if (!ret)
			sctx->write_pointer = physical;
	}
	return ret;
}
static struct page *scrub_stripe_get_page(struct scrub_stripe *stripe, int sector_nr)
{
	struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
	int page_index = (sector_nr << fs_info->sectorsize_bits) >> PAGE_SHIFT;

	return stripe->pages[page_index];
}

static unsigned int scrub_stripe_get_page_offset(struct scrub_stripe *stripe,
						 int sector_nr)
{
	struct btrfs_fs_info *fs_info = stripe->bg->fs_info;

	return offset_in_page(sector_nr << fs_info->sectorsize_bits);
}
static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr)
{
	struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
	const u32 sectors_per_tree = fs_info->nodesize >> fs_info->sectorsize_bits;
	const u64 logical = stripe->logical + (sector_nr << fs_info->sectorsize_bits);
	const struct page *first_page = scrub_stripe_get_page(stripe, sector_nr);
	const unsigned int first_off = scrub_stripe_get_page_offset(stripe, sector_nr);
	SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
	u8 on_disk_csum[BTRFS_CSUM_SIZE];
	u8 calculated_csum[BTRFS_CSUM_SIZE];
	struct btrfs_header *header;

	/*
	 * Here we don't have a good way to attach the pages (and subpages)
	 * to a dummy extent buffer, thus we have to directly grab the members
	 * from the pages.
	 */
	header = (struct btrfs_header *)(page_address(first_page) + first_off);
	memcpy(on_disk_csum, header->csum, fs_info->csum_size);

	if (logical != btrfs_stack_header_bytenr(header)) {
		bitmap_set(&stripe->csum_error_bitmap, sector_nr, sectors_per_tree);
		bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree);
		btrfs_warn_rl(fs_info,
		"tree block %llu mirror %u has bad bytenr, has %llu want %llu",
			      logical, stripe->mirror_num,
			      btrfs_stack_header_bytenr(header), logical);
		return;
	}
	if (memcmp(header->fsid, fs_info->fs_devices->metadata_uuid,
		   BTRFS_FSID_SIZE) != 0) {
		bitmap_set(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree);
		bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree);
		btrfs_warn_rl(fs_info,
		"tree block %llu mirror %u has bad fsid, has %pU want %pU",
			      logical, stripe->mirror_num,
			      header->fsid, fs_info->fs_devices->fsid);
		return;
	}
	if (memcmp(header->chunk_tree_uuid, fs_info->chunk_tree_uuid,
		   BTRFS_UUID_SIZE) != 0) {
		bitmap_set(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree);
		bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree);
		btrfs_warn_rl(fs_info,
		"tree block %llu mirror %u has bad chunk tree uuid, has %pU want %pU",
			      logical, stripe->mirror_num,
			      header->chunk_tree_uuid, fs_info->chunk_tree_uuid);
		return;
	}

	/* Now check tree block csum. */
	shash->tfm = fs_info->csum_shash;
	crypto_shash_init(shash);
	crypto_shash_update(shash, page_address(first_page) + first_off +
			    BTRFS_CSUM_SIZE, fs_info->sectorsize - BTRFS_CSUM_SIZE);

	for (int i = sector_nr + 1; i < sector_nr + sectors_per_tree; i++) {
		struct page *page = scrub_stripe_get_page(stripe, i);
		unsigned int page_off = scrub_stripe_get_page_offset(stripe, i);

		crypto_shash_update(shash, page_address(page) + page_off,
				    fs_info->sectorsize);
	}

	crypto_shash_final(shash, calculated_csum);
	if (memcmp(calculated_csum, on_disk_csum, fs_info->csum_size) != 0) {
		bitmap_set(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree);
		bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree);
		btrfs_warn_rl(fs_info,
		"tree block %llu mirror %u has bad csum, has " CSUM_FMT " want " CSUM_FMT,
			      logical, stripe->mirror_num,
			      CSUM_FMT_VALUE(fs_info->csum_size, on_disk_csum),
			      CSUM_FMT_VALUE(fs_info->csum_size, calculated_csum));
		return;
	}
	if (stripe->sectors[sector_nr].generation !=
	    btrfs_stack_header_generation(header)) {
		bitmap_set(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree);
		bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree);
		btrfs_warn_rl(fs_info,
		"tree block %llu mirror %u has bad generation, has %llu want %llu",
			      logical, stripe->mirror_num,
			      btrfs_stack_header_generation(header),
			      stripe->sectors[sector_nr].generation);
		return;
	}
	bitmap_clear(&stripe->error_bitmap, sector_nr, sectors_per_tree);
	bitmap_clear(&stripe->csum_error_bitmap, sector_nr, sectors_per_tree);
	bitmap_clear(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree);
}
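/*
 * Illustrative arithmetic (not from the original source): the tree block csum
 * in btrfs_header::csum covers everything after the csum field. With a 16KiB
 * nodesize, 4KiB sectors and a 32 byte csum, the hash above is fed
 * (4096 - 32) bytes from the first sector plus 3 * 4096 bytes from the
 * remaining sectors, i.e. 16352 bytes in total.
 */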
static void scrub_verify_one_sector(struct scrub_stripe *stripe, int sector_nr)
{
	struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
	struct scrub_sector_verification *sector = &stripe->sectors[sector_nr];
	const u32 sectors_per_tree = fs_info->nodesize >> fs_info->sectorsize_bits;
	struct page *page = scrub_stripe_get_page(stripe, sector_nr);
	unsigned int pgoff = scrub_stripe_get_page_offset(stripe, sector_nr);
	u8 csum_buf[BTRFS_CSUM_SIZE];
	int ret;

	ASSERT(sector_nr >= 0 && sector_nr < stripe->nr_sectors);

	/* Sector not utilized, skip it. */
	if (!test_bit(sector_nr, &stripe->extent_sector_bitmap))
		return;

	/* IO error, no need to check. */
	if (test_bit(sector_nr, &stripe->io_error_bitmap))
		return;

	/* Metadata, verify the full tree block. */
	if (sector->is_metadata) {
		/*
		 * Check if the tree block crosses the stripe boundary. If
		 * crossed the boundary, we cannot verify it but only give a
		 * warning.
		 *
		 * This can only happen on a very old filesystem where chunks
		 * are not ensured to be stripe aligned.
		 */
		if (unlikely(sector_nr + sectors_per_tree > stripe->nr_sectors)) {
			btrfs_warn_rl(fs_info,
			"tree block at %llu crosses stripe boundary %llu",
				      stripe->logical +
				      (sector_nr << fs_info->sectorsize_bits),
				      stripe->logical);
			return;
		}
		scrub_verify_one_metadata(stripe, sector_nr);
		return;
	}

	/*
	 * Data is easier, we just verify the data csum (if we have it). For
	 * cases without csum, we have no other choice but to trust it.
	 */
	if (!sector->csum) {
		clear_bit(sector_nr, &stripe->error_bitmap);
		return;
	}

	ret = btrfs_check_sector_csum(fs_info, page, pgoff, csum_buf, sector->csum);
	if (ret < 0) {
		set_bit(sector_nr, &stripe->csum_error_bitmap);
		set_bit(sector_nr, &stripe->error_bitmap);
	} else {
		clear_bit(sector_nr, &stripe->csum_error_bitmap);
		clear_bit(sector_nr, &stripe->error_bitmap);
	}
}
/* Verify specified sectors of a stripe. */
static void scrub_verify_one_stripe(struct scrub_stripe *stripe, unsigned long bitmap)
{
	struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
	const u32 sectors_per_tree = fs_info->nodesize >> fs_info->sectorsize_bits;
	int sector_nr;

	for_each_set_bit(sector_nr, &bitmap, stripe->nr_sectors) {
		scrub_verify_one_sector(stripe, sector_nr);
		if (stripe->sectors[sector_nr].is_metadata)
			sector_nr += sectors_per_tree - 1;
	}
}
static int calc_sector_number(struct scrub_stripe *stripe, struct bio_vec *first_bvec)
{
	int i;

	for (i = 0; i < stripe->nr_sectors; i++) {
		if (scrub_stripe_get_page(stripe, i) == first_bvec->bv_page &&
		    scrub_stripe_get_page_offset(stripe, i) == first_bvec->bv_offset)
			break;
	}
	ASSERT(i < stripe->nr_sectors);
	return i;
}
/*
 * Repair read is different from the regular read:
 *
 * - Only reads the failed sectors
 * - May have extra blocksize limits
 */
static void scrub_repair_read_endio(struct btrfs_bio *bbio)
{
	struct scrub_stripe *stripe = bbio->private;
	struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
	struct bio_vec *bvec;
	int sector_nr = calc_sector_number(stripe, bio_first_bvec_all(&bbio->bio));
	u32 bio_size = 0;
	int i;

	ASSERT(sector_nr < stripe->nr_sectors);

	bio_for_each_bvec_all(bvec, &bbio->bio, i)
		bio_size += bvec->bv_len;

	if (bbio->bio.bi_status) {
		bitmap_set(&stripe->io_error_bitmap, sector_nr,
			   bio_size >> fs_info->sectorsize_bits);
		bitmap_set(&stripe->error_bitmap, sector_nr,
			   bio_size >> fs_info->sectorsize_bits);
	} else {
		bitmap_clear(&stripe->io_error_bitmap, sector_nr,
			     bio_size >> fs_info->sectorsize_bits);
	}
	bio_put(&bbio->bio);
	if (atomic_dec_and_test(&stripe->pending_io))
		wake_up(&stripe->io_wait);
}
static int calc_next_mirror(int mirror, int num_copies)
{
	ASSERT(mirror <= num_copies);
	return (mirror + 1 > num_copies) ? 1 : mirror + 1;
}
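/*
 * Illustrative example (not from the original source): for a RAID1C3 chunk
 * with num_copies == 3, calc_next_mirror() cycles 1 -> 2 -> 3 -> 1, so
 * starting from any mirror the repair loops below visit every other copy
 * exactly once before wrapping back to the original.
 */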
static void scrub_stripe_submit_repair_read(struct scrub_stripe *stripe,
					    int mirror, int blocksize, bool wait)
{
	struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
	struct btrfs_bio *bbio = NULL;
	const unsigned long old_error_bitmap = stripe->error_bitmap;
	int i;

	ASSERT(stripe->mirror_num >= 1);
	ASSERT(atomic_read(&stripe->pending_io) == 0);

	for_each_set_bit(i, &old_error_bitmap, stripe->nr_sectors) {
		struct page *page;
		int pgoff;
		int ret;

		page = scrub_stripe_get_page(stripe, i);
		pgoff = scrub_stripe_get_page_offset(stripe, i);

		/* The current sector cannot be merged, submit the bio. */
		if (bbio && ((i > 0 && !test_bit(i - 1, &stripe->error_bitmap)) ||
			     bbio->bio.bi_iter.bi_size >= blocksize)) {
			ASSERT(bbio->bio.bi_iter.bi_size);
			atomic_inc(&stripe->pending_io);
			btrfs_submit_bbio(bbio, mirror);
			if (wait)
				wait_scrub_stripe_io(stripe);
			bbio = NULL;
		}

		if (!bbio) {
			bbio = btrfs_bio_alloc(stripe->nr_sectors, REQ_OP_READ,
					       fs_info, scrub_repair_read_endio, stripe);
			bbio->bio.bi_iter.bi_sector = (stripe->logical +
				(i << fs_info->sectorsize_bits)) >> SECTOR_SHIFT;
		}

		ret = bio_add_page(&bbio->bio, page, fs_info->sectorsize, pgoff);
		ASSERT(ret == fs_info->sectorsize);
	}
	if (bbio) {
		ASSERT(bbio->bio.bi_iter.bi_size);
		atomic_inc(&stripe->pending_io);
		btrfs_submit_bbio(bbio, mirror);
		if (wait)
			wait_scrub_stripe_io(stripe);
	}
}
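/*
 * Illustrative example (not from the original source): if sectors 3-6 of a
 * stripe are set in @old_error_bitmap and @blocksize is BTRFS_STRIPE_LEN, the
 * loop above merges them into a single 16KiB read. With @blocksize set to
 * sectorsize (the last-resort pass), the same range is re-read as four
 * separate 4KiB bios, one per sector, each waited for individually.
 */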
static void scrub_stripe_report_errors(struct scrub_ctx *sctx,
				       struct scrub_stripe *stripe)
{
	static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
				      DEFAULT_RATELIMIT_BURST);
	struct btrfs_fs_info *fs_info = sctx->fs_info;
	struct btrfs_device *dev = NULL;
	u64 physical = 0;
	int nr_data_sectors = 0;
	int nr_meta_sectors = 0;
	int nr_nodatacsum_sectors = 0;
	int nr_repaired_sectors = 0;
	int sector_nr;

	if (test_bit(SCRUB_STRIPE_FLAG_NO_REPORT, &stripe->state))
		return;

	/*
	 * Init needed infos for error reporting.
	 *
	 * Although our scrub_stripe infrastructure is mostly based on btrfs_submit_bio()
	 * thus no need for dev/physical, error reporting still needs dev and physical.
	 */
	if (!bitmap_empty(&stripe->init_error_bitmap, stripe->nr_sectors)) {
		u64 mapped_len = fs_info->sectorsize;
		struct btrfs_io_context *bioc = NULL;
		int stripe_index = stripe->mirror_num - 1;
		int ret;

		/* For scrub, our mirror_num should always start at 1. */
		ASSERT(stripe->mirror_num >= 1);
		ret = btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
				      stripe->logical, &mapped_len, &bioc,
				      NULL, NULL);
		/*
		 * If we failed, dev will be NULL, and later detailed reports
		 * will just be skipped.
		 */
		if (ret < 0)
			goto skip;
		physical = bioc->stripes[stripe_index].physical;
		dev = bioc->stripes[stripe_index].dev;
		btrfs_put_bioc(bioc);
	}

skip:
	for_each_set_bit(sector_nr, &stripe->extent_sector_bitmap, stripe->nr_sectors) {
		bool repaired = false;

		if (stripe->sectors[sector_nr].is_metadata) {
			nr_meta_sectors++;
		} else {
			nr_data_sectors++;
			if (!stripe->sectors[sector_nr].csum)
				nr_nodatacsum_sectors++;
		}

		if (test_bit(sector_nr, &stripe->init_error_bitmap) &&
		    !test_bit(sector_nr, &stripe->error_bitmap)) {
			nr_repaired_sectors++;
			repaired = true;
		}

		/* Good sector from the beginning, nothing need to be done. */
		if (!test_bit(sector_nr, &stripe->init_error_bitmap))
			continue;

		/*
		 * Report error for the corrupted sectors. If repaired, just
		 * output the repaired message.
		 */
		if (repaired) {
			if (dev) {
				btrfs_err_rl_in_rcu(fs_info,
			"fixed up error at logical %llu on dev %s physical %llu",
					    stripe->logical, btrfs_dev_name(dev),
					    physical);
			} else {
				btrfs_err_rl_in_rcu(fs_info,
			"fixed up error at logical %llu on mirror %u",
					    stripe->logical, stripe->mirror_num);
			}
			continue;
		}

		/* The remaining are all for unrepaired. */
		if (dev) {
			btrfs_err_rl_in_rcu(fs_info,
	"unable to fixup (regular) error at logical %llu on dev %s physical %llu",
					    stripe->logical, btrfs_dev_name(dev),
					    physical);
		} else {
			btrfs_err_rl_in_rcu(fs_info,
	"unable to fixup (regular) error at logical %llu on mirror %u",
					    stripe->logical, stripe->mirror_num);
		}

		if (test_bit(sector_nr, &stripe->io_error_bitmap))
			if (__ratelimit(&rs) && dev)
				scrub_print_common_warning("i/o error", dev, false,
						stripe->logical, physical);
		if (test_bit(sector_nr, &stripe->csum_error_bitmap))
			if (__ratelimit(&rs) && dev)
				scrub_print_common_warning("checksum error", dev, false,
						stripe->logical, physical);
		if (test_bit(sector_nr, &stripe->meta_error_bitmap))
			if (__ratelimit(&rs) && dev)
				scrub_print_common_warning("header error", dev, false,
						stripe->logical, physical);
	}

	spin_lock(&sctx->stat_lock);
	sctx->stat.data_extents_scrubbed += stripe->nr_data_extents;
	sctx->stat.tree_extents_scrubbed += stripe->nr_meta_extents;
	sctx->stat.data_bytes_scrubbed += nr_data_sectors << fs_info->sectorsize_bits;
	sctx->stat.tree_bytes_scrubbed += nr_meta_sectors << fs_info->sectorsize_bits;
	sctx->stat.no_csum += nr_nodatacsum_sectors;
	sctx->stat.read_errors += stripe->init_nr_io_errors;
	sctx->stat.csum_errors += stripe->init_nr_csum_errors;
	sctx->stat.verify_errors += stripe->init_nr_meta_errors;
	sctx->stat.uncorrectable_errors +=
		bitmap_weight(&stripe->error_bitmap, stripe->nr_sectors);
	sctx->stat.corrected_errors += nr_repaired_sectors;
	spin_unlock(&sctx->stat_lock);
}
static void scrub_write_sectors(struct scrub_ctx *sctx, struct scrub_stripe *stripe,
				unsigned long write_bitmap, bool dev_replace);
/*
 * The main entry point for all read related scrub work, including:
 *
 * - Wait for the initial read to finish
 * - Verify and locate any bad sectors
 * - Go through the remaining mirrors and try to read as large blocksize as
 *   possible
 * - Go through all mirrors (including the failed mirror) sector-by-sector
 * - Submit writeback for repaired sectors
 *
 * Writeback for dev-replace does not happen here, it needs extra
 * synchronization for zoned devices.
 */
static void scrub_stripe_read_repair_worker(struct work_struct *work)
{
	struct scrub_stripe *stripe = container_of(work, struct scrub_stripe, work);
	struct scrub_ctx *sctx = stripe->sctx;
	struct btrfs_fs_info *fs_info = sctx->fs_info;
	int num_copies = btrfs_num_copies(fs_info, stripe->bg->start,
					  stripe->bg->length);
	unsigned long repaired;
	int mirror;
	int i;

	ASSERT(stripe->mirror_num > 0);

	wait_scrub_stripe_io(stripe);
	scrub_verify_one_stripe(stripe, stripe->extent_sector_bitmap);
	/* Save the initial failed bitmap for later repair and report usage. */
	stripe->init_error_bitmap = stripe->error_bitmap;
	stripe->init_nr_io_errors = bitmap_weight(&stripe->io_error_bitmap,
						  stripe->nr_sectors);
	stripe->init_nr_csum_errors = bitmap_weight(&stripe->csum_error_bitmap,
						    stripe->nr_sectors);
	stripe->init_nr_meta_errors = bitmap_weight(&stripe->meta_error_bitmap,
						    stripe->nr_sectors);

	if (bitmap_empty(&stripe->init_error_bitmap, stripe->nr_sectors))
		goto out;

	/*
	 * Try all remaining mirrors.
	 *
	 * Here we still try to read as large block as possible, as this is
	 * faster and we have extra safety nets to rely on.
	 */
	for (mirror = calc_next_mirror(stripe->mirror_num, num_copies);
	     mirror != stripe->mirror_num;
	     mirror = calc_next_mirror(mirror, num_copies)) {
		const unsigned long old_error_bitmap = stripe->error_bitmap;

		scrub_stripe_submit_repair_read(stripe, mirror,
						BTRFS_STRIPE_LEN, false);
		wait_scrub_stripe_io(stripe);
		scrub_verify_one_stripe(stripe, old_error_bitmap);
		if (bitmap_empty(&stripe->error_bitmap, stripe->nr_sectors))
			goto out;
	}

	/*
	 * Last safety net, try re-checking all mirrors, including the failed
	 * one, sector-by-sector.
	 *
	 * As if one sector failed the drive's internal csum, the whole read
	 * containing the offending sector would be marked as error.
	 * Thus here we do sector-by-sector read.
	 *
	 * This can be slow, thus we only try it as the last resort.
	 */
	for (i = 0, mirror = stripe->mirror_num;
	     i < num_copies;
	     i++, mirror = calc_next_mirror(mirror, num_copies)) {
		const unsigned long old_error_bitmap = stripe->error_bitmap;

		scrub_stripe_submit_repair_read(stripe, mirror,
						fs_info->sectorsize, true);
		wait_scrub_stripe_io(stripe);
		scrub_verify_one_stripe(stripe, old_error_bitmap);
		if (bitmap_empty(&stripe->error_bitmap, stripe->nr_sectors))
			goto out;
	}
out:
	/*
	 * Submit the repaired sectors. For zoned case, we cannot do repair
	 * in-place, but queue the bg to be relocated.
	 */
	bitmap_andnot(&repaired, &stripe->init_error_bitmap, &stripe->error_bitmap,
		      stripe->nr_sectors);
	if (!sctx->readonly && !bitmap_empty(&repaired, stripe->nr_sectors)) {
		if (btrfs_is_zoned(fs_info)) {
			btrfs_repair_one_zone(fs_info, sctx->stripes[0].bg->start);
		} else {
			scrub_write_sectors(sctx, stripe, repaired, false);
			wait_scrub_stripe_io(stripe);
		}
	}

	scrub_stripe_report_errors(sctx, stripe);
	set_bit(SCRUB_STRIPE_FLAG_REPAIR_DONE, &stripe->state);
	wake_up(&stripe->repair_wait);
}
static void scrub_read_endio(struct btrfs_bio *bbio)
{
	struct scrub_stripe *stripe = bbio->private;
	struct bio_vec *bvec;
	int sector_nr = calc_sector_number(stripe, bio_first_bvec_all(&bbio->bio));
	int num_sectors;
	u32 bio_size = 0;
	int i;

	ASSERT(sector_nr < stripe->nr_sectors);
	bio_for_each_bvec_all(bvec, &bbio->bio, i)
		bio_size += bvec->bv_len;
	num_sectors = bio_size >> stripe->bg->fs_info->sectorsize_bits;

	if (bbio->bio.bi_status) {
		bitmap_set(&stripe->io_error_bitmap, sector_nr, num_sectors);
		bitmap_set(&stripe->error_bitmap, sector_nr, num_sectors);
	} else {
		bitmap_clear(&stripe->io_error_bitmap, sector_nr, num_sectors);
	}
	bio_put(&bbio->bio);
	if (atomic_dec_and_test(&stripe->pending_io)) {
		wake_up(&stripe->io_wait);
		INIT_WORK(&stripe->work, scrub_stripe_read_repair_worker);
		queue_work(stripe->bg->fs_info->scrub_workers, &stripe->work);
	}
}
static void scrub_write_endio(struct btrfs_bio *bbio)
{
	struct scrub_stripe *stripe = bbio->private;
	struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
	struct bio_vec *bvec;
	int sector_nr = calc_sector_number(stripe, bio_first_bvec_all(&bbio->bio));
	u32 bio_size = 0;
	int i;

	bio_for_each_bvec_all(bvec, &bbio->bio, i)
		bio_size += bvec->bv_len;

	if (bbio->bio.bi_status) {
		unsigned long flags;

		spin_lock_irqsave(&stripe->write_error_lock, flags);
		bitmap_set(&stripe->write_error_bitmap, sector_nr,
			   bio_size >> fs_info->sectorsize_bits);
		spin_unlock_irqrestore(&stripe->write_error_lock, flags);
	}
	bio_put(&bbio->bio);

	if (atomic_dec_and_test(&stripe->pending_io))
		wake_up(&stripe->io_wait);
}
static void scrub_submit_write_bio(struct scrub_ctx *sctx,
				   struct scrub_stripe *stripe,
				   struct btrfs_bio *bbio, bool dev_replace)
{
	struct btrfs_fs_info *fs_info = sctx->fs_info;
	u32 bio_len = bbio->bio.bi_iter.bi_size;
	u32 bio_off = (bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT) -
		      stripe->logical;

	fill_writer_pointer_gap(sctx, stripe->physical + bio_off);
	atomic_inc(&stripe->pending_io);
	btrfs_submit_repair_write(bbio, stripe->mirror_num, dev_replace);
	if (!btrfs_is_zoned(fs_info))
		return;
	/*
	 * For zoned writeback, queue depth must be 1, thus we must wait for
	 * the write to finish before the next write.
	 */
	wait_scrub_stripe_io(stripe);

	/*
	 * And also need to update the write pointer if the write finished
	 * successfully.
	 */
	if (!test_bit(bio_off >> fs_info->sectorsize_bits,
		      &stripe->write_error_bitmap))
		sctx->write_pointer += bio_len;
}
/*
 * Submit the write bio(s) for the sectors specified by @write_bitmap.
 *
 * Here we utilize btrfs_submit_repair_write(), which has some extra benefits:
 *
 * - Only needs logical bytenr and mirror_num
 *   Just like the scrub read path
 *
 * - Would only result in writes to the specified mirror
 *   Unlike the regular writeback path, which would write back to all stripes
 *
 * - Handle dev-replace and read-repair writeback differently
 */
static void scrub_write_sectors(struct scrub_ctx *sctx, struct scrub_stripe *stripe,
				unsigned long write_bitmap, bool dev_replace)
{
	struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
	struct btrfs_bio *bbio = NULL;
	int sector_nr;

	for_each_set_bit(sector_nr, &write_bitmap, stripe->nr_sectors) {
		struct page *page = scrub_stripe_get_page(stripe, sector_nr);
		unsigned int pgoff = scrub_stripe_get_page_offset(stripe, sector_nr);
		int ret;

		/* We should only writeback sectors covered by an extent. */
		ASSERT(test_bit(sector_nr, &stripe->extent_sector_bitmap));

		/* Cannot merge with previous sector, submit the current one. */
		if (bbio && sector_nr && !test_bit(sector_nr - 1, &write_bitmap)) {
			scrub_submit_write_bio(sctx, stripe, bbio, dev_replace);
			bbio = NULL;
		}
		if (!bbio) {
			bbio = btrfs_bio_alloc(stripe->nr_sectors, REQ_OP_WRITE,
					       fs_info, scrub_write_endio, stripe);
			bbio->bio.bi_iter.bi_sector = (stripe->logical +
				(sector_nr << fs_info->sectorsize_bits)) >>
				SECTOR_SHIFT;
		}
		ret = bio_add_page(&bbio->bio, page, fs_info->sectorsize, pgoff);
		ASSERT(ret == fs_info->sectorsize);
	}
	if (bbio)
		scrub_submit_write_bio(sctx, stripe, bbio, dev_replace);
}
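/*
 * Illustrative example (not from the original source): for a @write_bitmap of
 * 0b01100110 (sectors 1-2 and 5-6 need writeback), the loop above builds two
 * bios: one covering sectors 1-2 and one covering sectors 5-6, each submitted
 * via scrub_submit_write_bio() once the run of contiguous set bits ends.
 */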
/*
 * Throttling of IO submission, bandwidth-limit based, the timeslice is 1
 * second. Limit can be set via /sys/fs/UUID/devinfo/devid/scrub_speed_max.
 */
static void scrub_throttle_dev_io(struct scrub_ctx *sctx, struct btrfs_device *device,
				  unsigned int bio_size)
{
	const int time_slice = 1000;
	s64 delta;
	ktime_t now;
	u32 div;
	u64 bwlimit;

	bwlimit = READ_ONCE(device->scrub_speed_max);
	if (bwlimit == 0)
		return;

	/*
	 * Slice is divided into intervals when the IO is submitted, adjust by
	 * bwlimit and maximum of 64 intervals.
	 */
	div = max_t(u32, 1, (u32)(bwlimit / (16 * 1024 * 1024)));
	div = min_t(u32, 64, div);

	/* Start new epoch, set deadline */
	now = ktime_get();
	if (sctx->throttle_deadline == 0) {
		sctx->throttle_deadline = ktime_add_ms(now, time_slice / div);
		sctx->throttle_sent = 0;
	}

	/* Still in the time to send? */
	if (ktime_before(now, sctx->throttle_deadline)) {
		/* If current bio is within the limit, send it */
		sctx->throttle_sent += bio_size;
		if (sctx->throttle_sent <= div_u64(bwlimit, div))
			return;

		/* We're over the limit, sleep until the rest of the slice */
		delta = ktime_ms_delta(sctx->throttle_deadline, now);
	} else {
		/* New request after deadline, start new epoch */
		delta = 0;
	}

	if (delta > 0) {
		long timeout;

		timeout = div_u64(delta * HZ, 1000);
		schedule_timeout_interruptible(timeout);
	}

	/* Next call will start the deadline period */
	sctx->throttle_deadline = 0;
}
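/*
 * Illustrative arithmetic (not from the original source): with
 * scrub_speed_max set to 64MiB/s, div = min(64, 64MiB / 16MiB) = 4, so each
 * epoch lasts 1000 / 4 = 250ms and allows 64MiB / 4 = 16MiB of IO before the
 * caller sleeps for the remainder of the slice.
 */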
/*
 * Given a physical address, this will calculate its
 * logical offset. If this is a parity stripe, it will return
 * the leftmost data stripe's logical offset.
 *
 * Return 0 if it is a data stripe, 1 if it is a parity stripe.
 */
static int get_raid56_logic_offset(u64 physical, int num,
				   struct btrfs_chunk_map *map, u64 *offset,
				   u64 *stripe_start)
{
	int i;
	int j = 0;
	u64 last_offset;
	u32 stripe_index;
	u32 rot;
	const int data_stripes = nr_data_stripes(map);

	last_offset = (physical - map->stripes[num].physical) * data_stripes;
	if (stripe_start)
		*stripe_start = last_offset;

	*offset = last_offset;
	for (i = 0; i < data_stripes; i++) {
		u32 stripe_nr;

		*offset = last_offset + btrfs_stripe_nr_to_offset(i);

		stripe_nr = (u32)(*offset >> BTRFS_STRIPE_LEN_SHIFT) / data_stripes;

		/* Work out the disk rotation on this stripe-set */
		rot = stripe_nr % map->num_stripes;
		/* calculate which stripe this data locates */
		rot += i;
		stripe_index = rot % map->num_stripes;
		if (stripe_index == num)
			return 0;
		if (stripe_index < num)
			j++;
	}
	*offset = last_offset + btrfs_stripe_nr_to_offset(j);
	return 1;
}
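/*
 * Illustrative example (not from the original source): on a 3-device RAID5
 * chunk (data_stripes == 2), a physical offset of one BTRFS_STRIPE_LEN into
 * device stripe @num gives last_offset = 2 * 64KiB = 128KiB; the loop then
 * walks the two candidate data stripe positions, applies the per-stripe-set
 * rotation to see which one actually lives on device @num, and returns 1 if
 * neither does (i.e. the range holds parity).
 */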
/*
 * Return 0 if the extent item range covers any byte of the range.
 * Return <0 if the extent item is before @search_start.
 * Return >0 if the extent item is after @search_start + @search_len.
 */
static int compare_extent_item_range(struct btrfs_path *path,
				     u64 search_start, u64 search_len)
{
	struct btrfs_fs_info *fs_info = path->nodes[0]->fs_info;
	u64 len;
	struct btrfs_key key;

	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
	ASSERT(key.type == BTRFS_EXTENT_ITEM_KEY ||
	       key.type == BTRFS_METADATA_ITEM_KEY);
	if (key.type == BTRFS_METADATA_ITEM_KEY)
		len = fs_info->nodesize;
	else
		len = key.offset;

	if (key.objectid + len <= search_start)
		return -1;
	if (key.objectid >= search_start + search_len)
		return 1;
	return 0;
}
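/*
 * Illustrative example (not from the original source): searching the range
 * [1M, 1M + 64K) against an EXTENT_ITEM keyed (objectid = 1M - 16K,
 * offset = 32K) returns 0, because the extent [1M - 16K, 1M + 16K) still
 * overlaps the first 16K of the search range; an extent ending at or before
 * 1M would return -1, and one starting at or beyond 1M + 64K would return 1.
 */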
/*
 * Locate one extent item which covers any byte in range
 * [@search_start, @search_start + @search_length)
 *
 * If the path is not initialized, we will initialize the search by doing
 * a btrfs_search_slot().
 * If the path is already initialized, we will use the path as the initial
 * slot, to avoid duplicated btrfs_search_slot() calls.
 *
 * NOTE: If an extent item starts before @search_start, we will still
 * return the extent item. This is for data extents crossing stripe boundary.
 *
 * Return 0 if we found such extent item, and @path will point to the extent item.
 * Return >0 if no such extent item can be found, and @path will be released.
 * Return <0 if hit fatal error, and @path will be released.
 */
static int find_first_extent_item(struct btrfs_root *extent_root,
				  struct btrfs_path *path,
				  u64 search_start, u64 search_len)
{
	struct btrfs_fs_info *fs_info = extent_root->fs_info;
	struct btrfs_key key;
	int ret;

	/* Continue using the existing path */
	if (path->nodes[0])
		goto search_forward;

	if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
		key.type = BTRFS_METADATA_ITEM_KEY;
	else
		key.type = BTRFS_EXTENT_ITEM_KEY;
	key.objectid = search_start;
	key.offset = (u64)-1;

	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
	if (ret < 0)
		return ret;
	if (ret == 0) {
		/*
		 * Key with offset -1 found, there would have to exist an extent
		 * item with such offset, but this is out of the valid range.
		 */
		btrfs_release_path(path);
		return -EUCLEAN;
	}

	/*
	 * Here we intentionally pass 0 as @min_objectid, as there could be
	 * an extent item starting before @search_start.
	 */
	ret = btrfs_previous_extent_item(extent_root, path, 0);
	if (ret < 0)
		return ret;
	/*
	 * No matter whether we have found an extent item, the next loop will
	 * properly do every check on the key.
	 */
search_forward:
	while (true) {
		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
		if (key.objectid >= search_start + search_len)
			break;
		if (key.type != BTRFS_METADATA_ITEM_KEY &&
		    key.type != BTRFS_EXTENT_ITEM_KEY)
			goto next;

		ret = compare_extent_item_range(path, search_start, search_len);
		if (ret == 0)
			return ret;
		if (ret > 0)
			break;
next:
		ret = btrfs_next_item(extent_root, path);
		if (ret) {
			/* Either no more items or a fatal error. */
			btrfs_release_path(path);
			return ret;
		}
	}
	btrfs_release_path(path);
	return 1;
}
static void get_extent_info(struct btrfs_path *path, u64 *extent_start_ret,
			    u64 *size_ret, u64 *flags_ret, u64 *generation_ret)
{
	struct btrfs_key key;
	struct btrfs_extent_item *ei;

	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
	ASSERT(key.type == BTRFS_METADATA_ITEM_KEY ||
	       key.type == BTRFS_EXTENT_ITEM_KEY);
	*extent_start_ret = key.objectid;
	if (key.type == BTRFS_METADATA_ITEM_KEY)
		*size_ret = path->nodes[0]->fs_info->nodesize;
	else
		*size_ret = key.offset;
	ei = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_extent_item);
	*flags_ret = btrfs_extent_flags(path->nodes[0], ei);
	*generation_ret = btrfs_extent_generation(path->nodes[0], ei);
}
static int sync_write_pointer_for_zoned(struct scrub_ctx *sctx, u64 logical,
					u64 physical, u64 physical_end)
{
	struct btrfs_fs_info *fs_info = sctx->fs_info;
	int ret = 0;

	if (!btrfs_is_zoned(fs_info))
		return 0;

	mutex_lock(&sctx->wr_lock);
	if (sctx->write_pointer < physical_end) {
		ret = btrfs_sync_zone_write_pointer(sctx->wr_tgtdev, logical,
						    physical,
						    sctx->write_pointer);
		if (ret)
			btrfs_err(fs_info,
				  "zoned: failed to recover write pointer");
	}
	mutex_unlock(&sctx->wr_lock);
	btrfs_dev_clear_zone_empty(sctx->wr_tgtdev, physical);

	return ret;
}
static void fill_one_extent_info(struct btrfs_fs_info *fs_info,
				 struct scrub_stripe *stripe,
				 u64 extent_start, u64 extent_len,
				 u64 extent_flags, u64 extent_gen)
{
	for (u64 cur_logical = max(stripe->logical, extent_start);
	     cur_logical < min(stripe->logical + BTRFS_STRIPE_LEN,
			       extent_start + extent_len);
	     cur_logical += fs_info->sectorsize) {
		const int nr_sector = (cur_logical - stripe->logical) >>
				      fs_info->sectorsize_bits;
		struct scrub_sector_verification *sector =
						&stripe->sectors[nr_sector];

		set_bit(nr_sector, &stripe->extent_sector_bitmap);
		if (extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
			sector->is_metadata = true;
			sector->generation = extent_gen;
		}
	}
}

static void scrub_stripe_reset_bitmaps(struct scrub_stripe *stripe)
{
	stripe->extent_sector_bitmap = 0;
	stripe->init_error_bitmap = 0;
	stripe->init_nr_io_errors = 0;
	stripe->init_nr_csum_errors = 0;
	stripe->init_nr_meta_errors = 0;
	stripe->error_bitmap = 0;
	stripe->io_error_bitmap = 0;
	stripe->csum_error_bitmap = 0;
	stripe->meta_error_bitmap = 0;
}

/*
 * Locate one stripe which has at least one extent in its range.
 *
 * Return 0 if found such stripe, and store its info into @stripe.
 * Return >0 if there is no such stripe in the specified range.
 * Return <0 for error.
 */
static int scrub_find_fill_first_stripe(struct btrfs_block_group *bg,
					struct btrfs_path *extent_path,
					struct btrfs_path *csum_path,
					struct btrfs_device *dev, u64 physical,
					int mirror_num, u64 logical_start,
					u32 logical_len,
					struct scrub_stripe *stripe)
{
	struct btrfs_fs_info *fs_info = bg->fs_info;
	struct btrfs_root *extent_root = btrfs_extent_root(fs_info, bg->start);
	struct btrfs_root *csum_root = btrfs_csum_root(fs_info, bg->start);
	const u64 logical_end = logical_start + logical_len;
	u64 cur_logical = logical_start;
	u64 stripe_end;
	u64 extent_start;
	u64 extent_len;
	u64 extent_flags;
	u64 extent_gen;
	int ret;

	if (unlikely(!extent_root)) {
		btrfs_err(fs_info, "no valid extent root for scrub");
		return -EUCLEAN;
	}
	memset(stripe->sectors, 0, sizeof(struct scrub_sector_verification) *
				   stripe->nr_sectors);
	scrub_stripe_reset_bitmaps(stripe);

	/* The range must be inside the bg. */
	ASSERT(logical_start >= bg->start && logical_end <= bg->start + bg->length);

	ret = find_first_extent_item(extent_root, extent_path, logical_start,
				     logical_len);
	/* Either error or not found. */
	if (ret)
		goto out;
	get_extent_info(extent_path, &extent_start, &extent_len, &extent_flags,
			&extent_gen);
	if (extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
		stripe->nr_meta_extents++;
	if (extent_flags & BTRFS_EXTENT_FLAG_DATA)
		stripe->nr_data_extents++;
	cur_logical = max(extent_start, cur_logical);

	/*
	 * Round down to stripe boundary.
	 *
	 * The extra calculation against bg->start is to handle block groups
	 * whose logical bytenr is not BTRFS_STRIPE_LEN aligned.
	 */
	stripe->logical = round_down(cur_logical - bg->start, BTRFS_STRIPE_LEN) +
			  bg->start;
	stripe->physical = physical + stripe->logical - logical_start;
	stripe->dev = dev;
	stripe->bg = bg;
	stripe->mirror_num = mirror_num;
	stripe_end = stripe->logical + BTRFS_STRIPE_LEN - 1;

	/* Fill the first extent info into stripe->sectors[] array. */
	fill_one_extent_info(fs_info, stripe, extent_start, extent_len,
			     extent_flags, extent_gen);
	cur_logical = extent_start + extent_len;

	/* Fill the extent info for the remaining sectors. */
	while (cur_logical <= stripe_end) {
		ret = find_first_extent_item(extent_root, extent_path, cur_logical,
					     stripe_end - cur_logical + 1);
		if (ret < 0)
			goto out;
		if (ret > 0) {
			ret = 0;
			break;
		}
		get_extent_info(extent_path, &extent_start, &extent_len,
				&extent_flags, &extent_gen);
		if (extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
			stripe->nr_meta_extents++;
		if (extent_flags & BTRFS_EXTENT_FLAG_DATA)
			stripe->nr_data_extents++;
		fill_one_extent_info(fs_info, stripe, extent_start, extent_len,
				     extent_flags, extent_gen);
		cur_logical = extent_start + extent_len;
	}

	/* Now fill the data csum. */
	if (bg->flags & BTRFS_BLOCK_GROUP_DATA) {
		int sector_nr;
		unsigned long csum_bitmap = 0;

		/* Csum space should have already been allocated. */
		ASSERT(stripe->csums);

		/*
		 * Our csum bitmap should be large enough, as BTRFS_STRIPE_LEN
		 * should contain at most 16 sectors.
		 */
		ASSERT(BITS_PER_LONG >= BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits);

		ret = btrfs_lookup_csums_bitmap(csum_root, csum_path,
						stripe->logical, stripe_end,
						stripe->csums, &csum_bitmap);
		if (ret < 0)
			goto out;
		if (ret > 0)
			ret = 0;

		for_each_set_bit(sector_nr, &csum_bitmap, stripe->nr_sectors) {
			stripe->sectors[sector_nr].csum = stripe->csums +
				sector_nr * fs_info->csum_size;
		}
	}
	set_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state);
out:
	return ret;
}
*stripe
)
1642 scrub_stripe_reset_bitmaps(stripe
);
1644 stripe
->nr_meta_extents
= 0;
1645 stripe
->nr_data_extents
= 0;
1648 for (int i
= 0; i
< stripe
->nr_sectors
; i
++) {
1649 stripe
->sectors
[i
].is_metadata
= false;
1650 stripe
->sectors
[i
].csum
= NULL
;
1651 stripe
->sectors
[i
].generation
= 0;
1655 static u32
stripe_length(const struct scrub_stripe
*stripe
)
1659 return min(BTRFS_STRIPE_LEN
,
1660 stripe
->bg
->start
+ stripe
->bg
->length
- stripe
->logical
);
1663 static void scrub_submit_extent_sector_read(struct scrub_stripe
*stripe
)
1665 struct btrfs_fs_info
*fs_info
= stripe
->bg
->fs_info
;
1666 struct btrfs_bio
*bbio
= NULL
;
1667 unsigned int nr_sectors
= stripe_length(stripe
) >> fs_info
->sectorsize_bits
;
1668 u64 stripe_len
= BTRFS_STRIPE_LEN
;
1669 int mirror
= stripe
->mirror_num
;
1672 atomic_inc(&stripe
->pending_io
);
1674 for_each_set_bit(i
, &stripe
->extent_sector_bitmap
, stripe
->nr_sectors
) {
1675 struct page
*page
= scrub_stripe_get_page(stripe
, i
);
1676 unsigned int pgoff
= scrub_stripe_get_page_offset(stripe
, i
);
1678 /* We're beyond the chunk boundary, no need to read anymore. */
1679 if (i
>= nr_sectors
)
1682 /* The current sector cannot be merged, submit the bio. */
1685 !test_bit(i
- 1, &stripe
->extent_sector_bitmap
)) ||
1686 bbio
->bio
.bi_iter
.bi_size
>= stripe_len
)) {
1687 ASSERT(bbio
->bio
.bi_iter
.bi_size
);
1688 atomic_inc(&stripe
->pending_io
);
1689 btrfs_submit_bbio(bbio
, mirror
);
1694 struct btrfs_io_stripe io_stripe
= {};
1695 struct btrfs_io_context
*bioc
= NULL
;
1696 const u64 logical
= stripe
->logical
+
1697 (i
<< fs_info
->sectorsize_bits
);
1700 io_stripe
.rst_search_commit_root
= true;
1701 stripe_len
= (nr_sectors
- i
) << fs_info
->sectorsize_bits
;
1703 * For RST cases, we need to manually split the bbio to
1704 * follow the RST boundary.
1706 err
= btrfs_map_block(fs_info
, BTRFS_MAP_READ
, logical
,
1707 &stripe_len
, &bioc
, &io_stripe
, &mirror
);
1708 btrfs_put_bioc(bioc
);
1710 if (err
!= -ENODATA
) {
1712 * Earlier btrfs_get_raid_extent_offset()
1713 * returned -ENODATA, which means there's
1714 * no entry for the corresponding range
1715 * in the stripe tree. But if it's in
1716 * the extent tree, then it's a preallocated
1717 * extent and not an error.
1719 set_bit(i
, &stripe
->io_error_bitmap
);
1720 set_bit(i
, &stripe
->error_bitmap
);
1725 bbio
= btrfs_bio_alloc(stripe
->nr_sectors
, REQ_OP_READ
,
1726 fs_info
, scrub_read_endio
, stripe
);
1727 bbio
->bio
.bi_iter
.bi_sector
= logical
>> SECTOR_SHIFT
;
1730 __bio_add_page(&bbio
->bio
, page
, fs_info
->sectorsize
, pgoff
);
1734 ASSERT(bbio
->bio
.bi_iter
.bi_size
);
1735 atomic_inc(&stripe
->pending_io
);
1736 btrfs_submit_bbio(bbio
, mirror
);
1739 if (atomic_dec_and_test(&stripe
->pending_io
)) {
1740 wake_up(&stripe
->io_wait
);
1741 INIT_WORK(&stripe
->work
, scrub_stripe_read_repair_worker
);
1742 queue_work(stripe
->bg
->fs_info
->scrub_workers
, &stripe
->work
);
static void scrub_submit_initial_read(struct scrub_ctx *sctx,
				      struct scrub_stripe *stripe)
{
	struct btrfs_fs_info *fs_info = sctx->fs_info;
	struct btrfs_bio *bbio;
	unsigned int nr_sectors = stripe_length(stripe) >> fs_info->sectorsize_bits;
	int mirror = stripe->mirror_num;

	ASSERT(stripe->mirror_num > 0);
	ASSERT(test_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state));

	if (btrfs_need_stripe_tree_update(fs_info, stripe->bg->flags)) {
		scrub_submit_extent_sector_read(stripe);
		return;
	}

	bbio = btrfs_bio_alloc(SCRUB_STRIPE_PAGES, REQ_OP_READ, fs_info,
			       scrub_read_endio, stripe);

	bbio->bio.bi_iter.bi_sector = stripe->logical >> SECTOR_SHIFT;
	/* Read the whole range inside the chunk boundary. */
	for (unsigned int cur = 0; cur < nr_sectors; cur++) {
		struct page *page = scrub_stripe_get_page(stripe, cur);
		unsigned int pgoff = scrub_stripe_get_page_offset(stripe, cur);
		int ret;

		ret = bio_add_page(&bbio->bio, page, fs_info->sectorsize, pgoff);
		/* We should have allocated enough bio vectors. */
		ASSERT(ret == fs_info->sectorsize);
	}
	atomic_inc(&stripe->pending_io);

	/*
	 * For dev-replace, either user asks to avoid the source dev, or
	 * the device is missing, we try the next mirror instead.
	 */
	if (sctx->is_dev_replace &&
	    (fs_info->dev_replace.cont_reading_from_srcdev_mode ==
	     BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID ||
	     !stripe->dev->bdev)) {
		int num_copies = btrfs_num_copies(fs_info, stripe->bg->start,
						  stripe->bg->length);

		mirror = calc_next_mirror(mirror, num_copies);
	}
	btrfs_submit_bbio(bbio, mirror);
}
static bool stripe_has_metadata_error(struct scrub_stripe *stripe)
{
	int i;

	for_each_set_bit(i, &stripe->error_bitmap, stripe->nr_sectors) {
		if (stripe->sectors[i].is_metadata) {
			struct btrfs_fs_info *fs_info = stripe->bg->fs_info;

			btrfs_err(fs_info,
			"stripe %llu has unrepaired metadata sector at %llu",
				  stripe->logical,
				  stripe->logical + (i << fs_info->sectorsize_bits));
			return true;
		}
	}
	return false;
}

static void submit_initial_group_read(struct scrub_ctx *sctx,
				      unsigned int first_slot,
				      unsigned int nr_stripes)
{
	struct blk_plug plug;

	ASSERT(first_slot < SCRUB_TOTAL_STRIPES);
	ASSERT(first_slot + nr_stripes <= SCRUB_TOTAL_STRIPES);

	scrub_throttle_dev_io(sctx, sctx->stripes[0].dev,
			      btrfs_stripe_nr_to_offset(nr_stripes));
	blk_start_plug(&plug);
	for (int i = 0; i < nr_stripes; i++) {
		struct scrub_stripe *stripe = &sctx->stripes[first_slot + i];

		/* Those stripes should be initialized. */
		ASSERT(test_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state));
		scrub_submit_initial_read(sctx, stripe);
	}
	blk_finish_plug(&plug);
}
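/*
 * Illustrative example (not from the original source): callers pass a
 * @first_slot that is a multiple of SCRUB_STRIPES_PER_GROUP, so a fully
 * populated group submits stripes [8, 15] in one plugged batch, while the
 * final partial flush may submit e.g. [120, 124] when only five stripes of
 * the last group were filled.
 */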
static int flush_scrub_stripes(struct scrub_ctx *sctx)
{
	struct btrfs_fs_info *fs_info = sctx->fs_info;
	struct scrub_stripe *stripe;
	const int nr_stripes = sctx->cur_stripe;
	int ret = 0;

	if (!nr_stripes)
		return 0;

	ASSERT(test_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &sctx->stripes[0].state));

	/* Submit the stripes which are populated but not submitted. */
	if (nr_stripes % SCRUB_STRIPES_PER_GROUP) {
		const int first_slot = round_down(nr_stripes, SCRUB_STRIPES_PER_GROUP);

		submit_initial_group_read(sctx, first_slot, nr_stripes - first_slot);
	}

	for (int i = 0; i < nr_stripes; i++) {
		stripe = &sctx->stripes[i];

		wait_event(stripe->repair_wait,
			   test_bit(SCRUB_STRIPE_FLAG_REPAIR_DONE, &stripe->state));
	}

	/* Submit for dev-replace. */
	if (sctx->is_dev_replace) {
		/*
		 * For dev-replace, if we know there is something wrong with
		 * metadata, we should immediately abort.
		 */
		for (int i = 0; i < nr_stripes; i++) {
			if (stripe_has_metadata_error(&sctx->stripes[i])) {
				ret = -EIO;
				goto out;
			}
		}
		for (int i = 0; i < nr_stripes; i++) {
			unsigned long good;

			stripe = &sctx->stripes[i];

			ASSERT(stripe->dev == fs_info->dev_replace.srcdev);

			bitmap_andnot(&good, &stripe->extent_sector_bitmap,
				      &stripe->error_bitmap, stripe->nr_sectors);
			scrub_write_sectors(sctx, stripe, good, true);
		}
	}

	/* Wait for the above writebacks to finish. */
	for (int i = 0; i < nr_stripes; i++) {
		stripe = &sctx->stripes[i];

		wait_scrub_stripe_io(stripe);
		spin_lock(&sctx->stat_lock);
		sctx->stat.last_physical = stripe->physical + stripe_length(stripe);
		spin_unlock(&sctx->stat_lock);
		scrub_reset_stripe(stripe);
	}
out:
	sctx->cur_stripe = 0;
	return ret;
}

static void raid56_scrub_wait_endio(struct bio *bio)
{
	complete(bio->bi_private);
}

static int queue_scrub_stripe(struct scrub_ctx *sctx, struct btrfs_block_group *bg,
			      struct btrfs_device *dev, int mirror_num,
			      u64 logical, u32 length, u64 physical,
			      u64 *found_logical_ret)
{
	struct scrub_stripe *stripe;
	int ret;

	/*
	 * There should always be one slot left, as caller filling the last
	 * slot should flush them all.
	 */
	ASSERT(sctx->cur_stripe < SCRUB_TOTAL_STRIPES);

	/* @found_logical_ret must be specified. */
	ASSERT(found_logical_ret);

	stripe = &sctx->stripes[sctx->cur_stripe];
	scrub_reset_stripe(stripe);
	ret = scrub_find_fill_first_stripe(bg, &sctx->extent_path,
					   &sctx->csum_path, dev, physical,
					   mirror_num, logical, length, stripe);
	/* Either >0 as no more extents or <0 for error. */
	if (ret)
		return ret;
	*found_logical_ret = stripe->logical;
	sctx->cur_stripe++;

	/* We filled one group, submit it. */
	if (sctx->cur_stripe % SCRUB_STRIPES_PER_GROUP == 0) {
		const int first_slot = sctx->cur_stripe - SCRUB_STRIPES_PER_GROUP;

		submit_initial_group_read(sctx, first_slot, SCRUB_STRIPES_PER_GROUP);
	}

	/* Last slot used, flush them all. */
	if (sctx->cur_stripe == SCRUB_TOTAL_STRIPES)
		return flush_scrub_stripes(sctx);
	return 0;
}
static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx,
				      struct btrfs_device *scrub_dev,
				      struct btrfs_block_group *bg,
				      struct btrfs_chunk_map *map,
				      u64 full_stripe_start)
{
	DECLARE_COMPLETION_ONSTACK(io_done);
	struct btrfs_fs_info *fs_info = sctx->fs_info;
	struct btrfs_raid_bio *rbio;
	struct btrfs_io_context *bioc = NULL;
	struct btrfs_path extent_path = { 0 };
	struct btrfs_path csum_path = { 0 };
	struct bio *bio;
	struct scrub_stripe *stripe;
	bool all_empty = true;
	const int data_stripes = nr_data_stripes(map);
	unsigned long extent_bitmap = 0;
	u64 length = btrfs_stripe_nr_to_offset(data_stripes);
	int ret;

	ASSERT(sctx->raid56_data_stripes);

	/*
	 * For the data stripe search, we cannot reuse the same extent/csum paths,
	 * as the data stripe bytenr may be smaller than the previous extent. Thus
	 * we have to use our own extent/csum paths.
	 */
	extent_path.search_commit_root = 1;
	extent_path.skip_locking = 1;
	csum_path.search_commit_root = 1;
	csum_path.skip_locking = 1;

	for (int i = 0; i < data_stripes; i++) {
		int stripe_index;
		int rot;
		u64 physical;

		stripe = &sctx->raid56_data_stripes[i];
		rot = div_u64(full_stripe_start - bg->start,
			      data_stripes) >> BTRFS_STRIPE_LEN_SHIFT;
		stripe_index = (i + rot) % map->num_stripes;
		physical = map->stripes[stripe_index].physical +
			   btrfs_stripe_nr_to_offset(rot);

		scrub_reset_stripe(stripe);
		set_bit(SCRUB_STRIPE_FLAG_NO_REPORT, &stripe->state);
		ret = scrub_find_fill_first_stripe(bg, &extent_path, &csum_path,
				map->stripes[stripe_index].dev, physical, 1,
				full_stripe_start + btrfs_stripe_nr_to_offset(i),
				BTRFS_STRIPE_LEN, stripe);
		if (ret < 0)
			goto out;
		/*
		 * No extent in this data stripe, need to manually mark it
		 * initialized to make later read submission happy.
		 */
		if (ret > 0) {
			stripe->logical = full_stripe_start +
					  btrfs_stripe_nr_to_offset(i);
			stripe->dev = map->stripes[stripe_index].dev;
			stripe->mirror_num = 1;
			set_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state);
		}
	}

	/* Check if all data stripes are empty. */
	for (int i = 0; i < data_stripes; i++) {
		stripe = &sctx->raid56_data_stripes[i];
		if (!bitmap_empty(&stripe->extent_sector_bitmap, stripe->nr_sectors)) {
			all_empty = false;
			break;
		}
	}
	if (all_empty) {
		ret = 0;
		goto out;
	}

	for (int i = 0; i < data_stripes; i++) {
		stripe = &sctx->raid56_data_stripes[i];
		scrub_submit_initial_read(sctx, stripe);
	}
	for (int i = 0; i < data_stripes; i++) {
		stripe = &sctx->raid56_data_stripes[i];

		wait_event(stripe->repair_wait,
			   test_bit(SCRUB_STRIPE_FLAG_REPAIR_DONE, &stripe->state));
	}
	/* For now, no zoned support for RAID56. */
	ASSERT(!btrfs_is_zoned(sctx->fs_info));

	/*
	 * Now all data stripes are properly verified. Check if we have any
	 * unrepaired sectors; if so abort immediately or we could further
	 * corrupt the P/Q stripes.
	 *
	 * During the loop, also populate extent_bitmap.
	 */
	for (int i = 0; i < data_stripes; i++) {
		unsigned long error;

		stripe = &sctx->raid56_data_stripes[i];

		/*
		 * We should only check the errors where there is an extent,
		 * as we may hit an empty data stripe while it's missing.
		 */
		bitmap_and(&error, &stripe->error_bitmap,
			   &stripe->extent_sector_bitmap, stripe->nr_sectors);
		if (!bitmap_empty(&error, stripe->nr_sectors)) {
			btrfs_err(fs_info,
"unrepaired sectors detected, full stripe %llu data stripe %u errors %*pbl",
				  full_stripe_start, i, stripe->nr_sectors,
				  &error);
			ret = -EIO;
			goto out;
		}
		bitmap_or(&extent_bitmap, &extent_bitmap,
			  &stripe->extent_sector_bitmap, stripe->nr_sectors);
	}

	/* Now we can check and regenerate the P/Q stripe. */
	bio = bio_alloc(NULL, 1, REQ_OP_READ, GFP_NOFS);
	bio->bi_iter.bi_sector = full_stripe_start >> SECTOR_SHIFT;
	bio->bi_private = &io_done;
	bio->bi_end_io = raid56_scrub_wait_endio;

	btrfs_bio_counter_inc_blocked(fs_info);
	ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, full_stripe_start,
			      &length, &bioc, NULL, NULL);
	if (ret < 0) {
		btrfs_put_bioc(bioc);
		btrfs_bio_counter_dec(fs_info);
		goto out;
	}
	rbio = raid56_parity_alloc_scrub_rbio(bio, bioc, scrub_dev, &extent_bitmap,
				BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits);
	btrfs_put_bioc(bioc);
	if (!rbio) {
		ret = -ENOMEM;
		btrfs_bio_counter_dec(fs_info);
		goto out;
	}
	/* Use the recovered stripes as cache to avoid reading them from disk again. */
	for (int i = 0; i < data_stripes; i++) {
		stripe = &sctx->raid56_data_stripes[i];

		raid56_parity_cache_data_pages(rbio, stripe->pages,
				full_stripe_start + (i << BTRFS_STRIPE_LEN_SHIFT));
	}
	raid56_parity_submit_scrub_rbio(rbio);
	wait_for_completion_io(&io_done);
	ret = blk_status_to_errno(bio->bi_status);
	bio_put(bio);
	btrfs_bio_counter_dec(fs_info);

out:
	btrfs_release_path(&extent_path);
	btrfs_release_path(&csum_path);
	return ret;
}
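
/*
 * Illustrative example for the rotation math in scrub_raid56_parity_stripe()
 * (made-up values): on a 3-device RAID5 chunk, data_stripes == 2 and
 * num_stripes == 3.  For the second full stripe, (full_stripe_start -
 * bg->start) / data_stripes covers exactly one BTRFS_STRIPE_LEN per device,
 * so rot == 1: data stripe 0 lands on device stripe 1, data stripe 1 on
 * device stripe 2, and the parity rotates onto device stripe 0.
 */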
/*
 * Scrub one range which can only have a simple mirror based profile.
 * (Including all ranges in SINGLE/DUP/RAID1/RAID1C*, and each stripe in
 *  RAID0/RAID10).
 *
 * Since we may need to handle a subset of a block group, we need the
 * @logical_start and @logical_length parameters.
 */
static int scrub_simple_mirror(struct scrub_ctx *sctx,
			       struct btrfs_block_group *bg,
			       u64 logical_start, u64 logical_length,
			       struct btrfs_device *device,
			       u64 physical, int mirror_num)
{
	struct btrfs_fs_info *fs_info = sctx->fs_info;
	const u64 logical_end = logical_start + logical_length;
	u64 cur_logical = logical_start;
	int ret = 0;

	/* The range must be inside the bg. */
	ASSERT(logical_start >= bg->start && logical_end <= bg->start + bg->length);

	/* Go through each extent item inside the logical range. */
	while (cur_logical < logical_end) {
		u64 found_logical = U64_MAX;
		u64 cur_physical = physical + cur_logical - logical_start;

		/* Canceled? */
		if (atomic_read(&fs_info->scrub_cancel_req) ||
		    atomic_read(&sctx->cancel_req)) {
			ret = -ECANCELED;
			break;
		}
		/* Paused? */
		if (atomic_read(&fs_info->scrub_pause_req)) {
			/* Push queued extents */
			scrub_blocked_if_needed(fs_info);
		}
		/* Block group removed? */
		spin_lock(&bg->lock);
		if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &bg->runtime_flags)) {
			spin_unlock(&bg->lock);
			ret = 0;
			break;
		}
		spin_unlock(&bg->lock);

		ret = queue_scrub_stripe(sctx, bg, device, mirror_num,
					 cur_logical, logical_end - cur_logical,
					 cur_physical, &found_logical);
		if (ret > 0) {
			/* No more extents, just update the accounting. */
			spin_lock(&sctx->stat_lock);
			sctx->stat.last_physical = physical + logical_length;
			spin_unlock(&sctx->stat_lock);
			ret = 0;
			break;
		}
		if (ret < 0)
			break;

		/* queue_scrub_stripe() returned 0, @found_logical must be updated. */
		ASSERT(found_logical != U64_MAX);
		cur_logical = found_logical + BTRFS_STRIPE_LEN;

		/* Don't hold the CPU for too long. */
		cond_resched();
	}
	return ret;
}
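
/*
 * Note on the loop in scrub_simple_mirror() above: cur_logical does not
 * advance linearly.  queue_scrub_stripe() reports the logical start of the
 * stripe it actually filled, so unallocated holes between extents are skipped
 * in a single step instead of being walked one BTRFS_STRIPE_LEN at a time.
 */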
/* Calculate the full stripe length for simple stripe based profiles */
static u64 simple_stripe_full_stripe_len(const struct btrfs_chunk_map *map)
{
	ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 |
			    BTRFS_BLOCK_GROUP_RAID10));

	return btrfs_stripe_nr_to_offset(map->num_stripes / map->sub_stripes);
}

/* Get the logical bytenr for the stripe */
static u64 simple_stripe_get_logical(struct btrfs_chunk_map *map,
				     struct btrfs_block_group *bg,
				     int stripe_index)
{
	ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 |
			    BTRFS_BLOCK_GROUP_RAID10));
	ASSERT(stripe_index < map->num_stripes);

	/*
	 * (stripe_index / sub_stripes) gives how many data stripes we need to
	 * skip.
	 */
	return btrfs_stripe_nr_to_offset(stripe_index / map->sub_stripes) +
	       bg->start;
}

/* Get the mirror number for the stripe */
static int simple_stripe_mirror_num(struct btrfs_chunk_map *map, int stripe_index)
{
	ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 |
			    BTRFS_BLOCK_GROUP_RAID10));
	ASSERT(stripe_index < map->num_stripes);

	/* For RAID0 it's fixed to 1; for RAID10 it alternates 1,2,1,2... */
	return stripe_index % map->sub_stripes + 1;
}
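
/*
 * Worked example for the helpers above (illustrative values only): a RAID10
 * chunk with num_stripes == 4 and sub_stripes == 2 has a full stripe length
 * of 2 * BTRFS_STRIPE_LEN.  stripe_index 0 and 1 both map to a logical start
 * of bg->start (mirror_num 1 and 2), while stripe_index 2 and 3 map to
 * bg->start + BTRFS_STRIPE_LEN (again mirror_num 1 and 2).
 */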
static int scrub_simple_stripe(struct scrub_ctx *sctx,
			       struct btrfs_block_group *bg,
			       struct btrfs_chunk_map *map,
			       struct btrfs_device *device,
			       int stripe_index)
{
	const u64 logical_increment = simple_stripe_full_stripe_len(map);
	const u64 orig_logical = simple_stripe_get_logical(map, bg, stripe_index);
	const u64 orig_physical = map->stripes[stripe_index].physical;
	const int mirror_num = simple_stripe_mirror_num(map, stripe_index);
	u64 cur_logical = orig_logical;
	u64 cur_physical = orig_physical;
	int ret = 0;

	while (cur_logical < bg->start + bg->length) {
		/*
		 * Inside each stripe, RAID0 is just SINGLE, and RAID10 is
		 * just RAID1, so we can reuse scrub_simple_mirror() to scrub
		 * this stripe.
		 */
		ret = scrub_simple_mirror(sctx, bg, cur_logical,
					  BTRFS_STRIPE_LEN, device, cur_physical,
					  mirror_num);
		if (ret)
			return ret;
		/* Skip to the next stripe which belongs to the target device. */
		cur_logical += logical_increment;
		/* For the physical offset, we just go to the next stripe. */
		cur_physical += BTRFS_STRIPE_LEN;
	}
	return ret;
}
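
/*
 * Scrub one device stripe (@stripe_index) of the chunk described by @map.
 *
 * The function below is the per-profile dispatcher: plain mirrored profiles
 * are handed to scrub_simple_mirror() for the whole block group, RAID0/RAID10
 * go through scrub_simple_stripe(), and only RAID56 walks the device extent
 * by physical offset so it can tell data stripes from parity stripes.
 */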
static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
					   struct btrfs_block_group *bg,
					   struct btrfs_chunk_map *map,
					   struct btrfs_device *scrub_dev,
					   int stripe_index)
{
	struct btrfs_fs_info *fs_info = sctx->fs_info;
	const u64 profile = map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK;
	const u64 chunk_logical = bg->start;
	int ret = 0;
	int ret2;
	u64 physical = map->stripes[stripe_index].physical;
	const u64 dev_stripe_len = btrfs_calc_stripe_length(map);
	const u64 physical_end = physical + dev_stripe_len;
	u64 logical;
	u64 logic_end;
	/* The logical increment after finishing one stripe */
	u64 increment;
	/* Offset inside the chunk */
	u64 offset;
	u64 stripe_logical;

	/* Extent_path should be released by now. */
	ASSERT(sctx->extent_path.nodes[0] == NULL);

	scrub_blocked_if_needed(fs_info);

	if (sctx->is_dev_replace &&
	    btrfs_dev_is_sequential(sctx->wr_tgtdev, physical)) {
		mutex_lock(&sctx->wr_lock);
		sctx->write_pointer = physical;
		mutex_unlock(&sctx->wr_lock);
	}

	/* Prepare the extra data stripes used by RAID56. */
	if (profile & BTRFS_BLOCK_GROUP_RAID56_MASK) {
		ASSERT(sctx->raid56_data_stripes == NULL);

		sctx->raid56_data_stripes = kcalloc(nr_data_stripes(map),
						    sizeof(struct scrub_stripe),
						    GFP_KERNEL);
		if (!sctx->raid56_data_stripes) {
			ret = -ENOMEM;
			goto out;
		}
		for (int i = 0; i < nr_data_stripes(map); i++) {
			ret = init_scrub_stripe(fs_info,
						&sctx->raid56_data_stripes[i]);
			if (ret < 0)
				goto out;
			sctx->raid56_data_stripes[i].bg = bg;
			sctx->raid56_data_stripes[i].sctx = sctx;
		}
	}

	/*
	 * There used to be a big double loop to handle all profiles using the
	 * same routine, which grew larger and more unwieldy over time.
	 *
	 * So here we handle each profile differently, so simpler profiles
	 * have simpler scrubbing functions.
	 */
	if (!(profile & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10 |
			 BTRFS_BLOCK_GROUP_RAID56_MASK))) {
		/*
		 * The above check rules out all complex profiles, the remaining
		 * profiles are SINGLE|DUP|RAID1|RAID1C*, which is simple
		 * mirrored duplication without stripes.
		 *
		 * Only @physical and @mirror_num need to be calculated using
		 * @stripe_index.
		 */
		ret = scrub_simple_mirror(sctx, bg, bg->start, bg->length,
				scrub_dev, map->stripes[stripe_index].physical,
				stripe_index + 1);
		offset = 0;
		goto out;
	}
	if (profile & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) {
		ret = scrub_simple_stripe(sctx, bg, map, scrub_dev, stripe_index);
		offset = btrfs_stripe_nr_to_offset(stripe_index / map->sub_stripes);
		goto out;
	}

	/* Only RAID56 goes through the old code. */
	ASSERT(map->type & BTRFS_BLOCK_GROUP_RAID56_MASK);

	/* Calculate the logical end of the stripe. */
	get_raid56_logic_offset(physical_end, stripe_index,
				map, &logic_end, NULL);
	logic_end += chunk_logical;

	/* Initialize @offset in case we need to go to the out: label. */
	get_raid56_logic_offset(physical, stripe_index, map, &offset, NULL);
	increment = btrfs_stripe_nr_to_offset(nr_data_stripes(map));

	/*
	 * Due to the rotation, for RAID56 it's better to iterate each stripe
	 * using the physical offset.
	 */
	while (physical < physical_end) {
		ret = get_raid56_logic_offset(physical, stripe_index, map,
					      &logical, &stripe_logical);
		logical += chunk_logical;
		if (ret) {
			/* It is a parity stripe. */
			stripe_logical += chunk_logical;
			ret = scrub_raid56_parity_stripe(sctx, scrub_dev, bg,
							 map, stripe_logical);
			spin_lock(&sctx->stat_lock);
			sctx->stat.last_physical = min(physical + BTRFS_STRIPE_LEN,
						       physical_end);
			spin_unlock(&sctx->stat_lock);
			if (ret)
				goto out;
			goto next;
		}

		/*
		 * Now we're at a data stripe, scrub each extent in the range.
		 *
		 * At this stage, if we ignore the repair part, inside each data
		 * stripe it is no different than the SINGLE profile.
		 * We can reuse scrub_simple_mirror() here, as the repair part
		 * is still based on @mirror_num.
		 */
		ret = scrub_simple_mirror(sctx, bg, logical, BTRFS_STRIPE_LEN,
					  scrub_dev, physical, 1);
		if (ret < 0)
			goto out;
next:
		logical += increment;
		physical += BTRFS_STRIPE_LEN;
		spin_lock(&sctx->stat_lock);
		sctx->stat.last_physical = physical;
		spin_unlock(&sctx->stat_lock);
	}
out:
	ret2 = flush_scrub_stripes(sctx);
	if (!ret)
		ret = ret2;
	btrfs_release_path(&sctx->extent_path);
	btrfs_release_path(&sctx->csum_path);

	if (sctx->raid56_data_stripes) {
		for (int i = 0; i < nr_data_stripes(map); i++)
			release_scrub_stripe(&sctx->raid56_data_stripes[i]);
		kfree(sctx->raid56_data_stripes);
		sctx->raid56_data_stripes = NULL;
	}

	if (sctx->is_dev_replace && ret >= 0) {
		ret2 = sync_write_pointer_for_zoned(sctx,
				chunk_logical + offset,
				map->stripes[stripe_index].physical,
				physical_end);
		if (ret2)
			ret = ret2;
	}

	return ret < 0 ? ret : 0;
}
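
/*
 * Scrub the part of a chunk that lives in the device extent at @dev_offset.
 * A single device can back more than one stripe of the same chunk (DUP for
 * example), which is why the loop below matches on both the bdev and the
 * physical offset before calling scrub_stripe().
 */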
static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
					  struct btrfs_block_group *bg,
					  struct btrfs_device *scrub_dev,
					  u64 dev_offset,
					  u64 dev_extent_len)
{
	struct btrfs_fs_info *fs_info = sctx->fs_info;
	struct btrfs_chunk_map *map;
	int i;
	int ret = 0;

	map = btrfs_find_chunk_map(fs_info, bg->start, bg->length);
	if (!map) {
		/*
		 * Might have been an unused block group deleted by the cleaner
		 * kthread or relocation.
		 */
		spin_lock(&bg->lock);
		if (!test_bit(BLOCK_GROUP_FLAG_REMOVED, &bg->runtime_flags))
			ret = -EINVAL;
		spin_unlock(&bg->lock);

		return ret;
	}
	if (map->start != bg->start)
		goto out;
	if (map->chunk_len < dev_extent_len)
		goto out;

	for (i = 0; i < map->num_stripes; ++i) {
		if (map->stripes[i].dev->bdev == scrub_dev->bdev &&
		    map->stripes[i].physical == dev_offset) {
			ret = scrub_stripe(sctx, bg, map, scrub_dev, i);
			if (ret)
				goto out;
		}
	}
out:
	btrfs_free_chunk_map(map);

	return ret;
}
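
/*
 * Helper for dev-replace on zoned filesystems: wait for block group
 * reservations, nocow writers and ordered extents of @cache to finish, then
 * commit the current transaction, so no further extent writes land in the
 * block group while it is being copied.
 */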
static int finish_extent_writes_for_zoned(struct btrfs_root *root,
					  struct btrfs_block_group *cache)
{
	struct btrfs_fs_info *fs_info = cache->fs_info;

	if (!btrfs_is_zoned(fs_info))
		return 0;

	btrfs_wait_block_group_reservations(cache);
	btrfs_wait_nocow_writers(cache);
	btrfs_wait_ordered_roots(fs_info, U64_MAX, cache);

	return btrfs_commit_current_transaction(root);
}
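
/*
 * Walk all dev extents of @scrub_dev inside [@start, @end) using the commit
 * root, and scrub the block group that owns each of them.  For every block
 * group we take a reference, freeze it and try to mark it read-only before
 * calling scrub_chunk(), then undo all of that once the chunk is done.
 */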
static noinline_for_stack
int scrub_enumerate_chunks(struct scrub_ctx *sctx,
			   struct btrfs_device *scrub_dev, u64 start, u64 end)
{
	struct btrfs_dev_extent *dev_extent = NULL;
	struct btrfs_path *path;
	struct btrfs_fs_info *fs_info = sctx->fs_info;
	struct btrfs_root *root = fs_info->dev_root;
	u64 chunk_offset;
	int ret = 0;
	int ro_set;
	int slot;
	struct extent_buffer *l;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct btrfs_block_group *cache;
	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	path->reada = READA_FORWARD;
	path->search_commit_root = 1;
	path->skip_locking = 1;

	key.objectid = scrub_dev->devid;
	key.offset = 0ull;
	key.type = BTRFS_DEV_EXTENT_KEY;

	while (1) {
		u64 dev_extent_len;

		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
		if (ret < 0)
			break;
		if (ret > 0) {
			if (path->slots[0] >=
			    btrfs_header_nritems(path->nodes[0])) {
				ret = btrfs_next_leaf(root, path);
				if (ret < 0)
					break;
				if (ret > 0) {
					ret = 0;
					break;
				}
			} else {
				ret = 0;
			}
		}

		l = path->nodes[0];
		slot = path->slots[0];

		btrfs_item_key_to_cpu(l, &found_key, slot);

		if (found_key.objectid != scrub_dev->devid)
			break;

		if (found_key.type != BTRFS_DEV_EXTENT_KEY)
			break;

		if (found_key.offset >= end)
			break;

		if (found_key.offset < key.offset)
			break;

		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
		dev_extent_len = btrfs_dev_extent_length(l, dev_extent);

		if (found_key.offset + dev_extent_len <= start)
			goto skip;

		chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);

		/*
		 * Get a reference on the corresponding block group to prevent
		 * the chunk from going away while we scrub it.
		 */
		cache = btrfs_lookup_block_group(fs_info, chunk_offset);

		/*
		 * Some chunks are removed but not committed to disk yet,
		 * continue scrubbing.
		 */
		if (!cache)
			goto skip;

		ASSERT(cache->start <= chunk_offset);
		/*
		 * We are using the commit root to search for device extents, so
		 * that means we could have found a device extent item from a
		 * block group that was deleted in the current transaction. The
		 * logical start offset of the deleted block group, stored at
		 * @chunk_offset, might be part of the logical address range of
		 * a new block group (which uses different physical extents).
		 * In this case btrfs_lookup_block_group() has returned the new
		 * block group, and its start address is less than @chunk_offset.
		 *
		 * We skip such new block groups, because it's pointless to
		 * process them, as we won't find their extents because we search
		 * for them using the commit root of the extent tree. For a device
		 * replace it's also fine to skip it, we won't miss copying them
		 * to the target device because we have the write duplication
		 * setup through the regular write path (by btrfs_map_block()),
		 * and we have committed a transaction when we started the device
		 * replace, right after setting up the device replace state.
		 */
		if (cache->start < chunk_offset) {
			btrfs_put_block_group(cache);
			goto skip;
		}

		if (sctx->is_dev_replace && btrfs_is_zoned(fs_info)) {
			if (!test_bit(BLOCK_GROUP_FLAG_TO_COPY, &cache->runtime_flags)) {
				btrfs_put_block_group(cache);
				goto skip;
			}
		}

		/*
		 * Make sure that while we are scrubbing the corresponding block
		 * group doesn't get its logical address and its device extents
		 * reused for another block group, which can possibly be of a
		 * different type and different profile. We do this to prevent
		 * false error detections and crashes due to bogus attempts to
		 * repair extents.
		 */
		spin_lock(&cache->lock);
		if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &cache->runtime_flags)) {
			spin_unlock(&cache->lock);
			btrfs_put_block_group(cache);
			goto skip;
		}
		btrfs_freeze_block_group(cache);
		spin_unlock(&cache->lock);

		/*
		 * We need to call btrfs_inc_block_group_ro() with scrubs_paused,
		 * to avoid a deadlock caused by:
		 * btrfs_inc_block_group_ro()
		 * -> btrfs_wait_for_commit()
		 * -> btrfs_commit_transaction()
		 * -> btrfs_scrub_pause()
		 */
		scrub_pause_on(fs_info);

		/*
		 * Don't do chunk preallocation for scrub.
		 *
		 * This is especially important for SYSTEM bgs, or we can hit
		 * -EFBIG from btrfs_finish_chunk_alloc() like:
		 * 1. The only SYSTEM bg is marked RO.
		 *    Since the SYSTEM bg is small, that's pretty common.
		 * 2. A new SYSTEM bg will be allocated
		 *    Because the regular chunk allocator would allocate a new chunk.
		 * 3. The new SYSTEM bg is empty and will get cleaned up
		 *    Before cleanup really happens, it's marked RO again.
		 * 4. The empty SYSTEM bg gets scrubbed
		 *    We go back to 2.
		 *
		 * This can easily boost the amount of SYSTEM chunks if the cleaner
		 * thread can't be triggered fast enough, and use up all space
		 * of btrfs_super_block::sys_chunk_array.
		 *
		 * While for dev replace, we need to try our best to mark the block
		 * group RO, to prevent a race between:
		 * - Write duplication
		 *   Contains the latest data
		 * - Scrub copy
		 *   Contains data from the commit tree
		 *
		 * If the target block group is not marked RO, nocow writes can
		 * be overwritten by the scrub copy, causing data corruption.
		 * So for dev-replace, it's not allowed to continue if a block
		 * group is not RO.
		 */
		ret = btrfs_inc_block_group_ro(cache, sctx->is_dev_replace);
		if (!ret && sctx->is_dev_replace) {
			ret = finish_extent_writes_for_zoned(root, cache);
			if (ret) {
				btrfs_dec_block_group_ro(cache);
				scrub_pause_off(fs_info);
				btrfs_put_block_group(cache);
				break;
			}
		}

		if (ret == 0) {
			ro_set = 1;
		} else if (ret == -ENOSPC && !sctx->is_dev_replace &&
			   !(cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK)) {
			/*
			 * btrfs_inc_block_group_ro() returns -ENOSPC when it
			 * failed in creating a new chunk for metadata.
			 * It is not a problem for scrub, because
			 * metadata is always cowed, and our scrub paused
			 * commit_transactions.
			 *
			 * For RAID56 chunks, we have to mark them read-only
			 * for scrub, as later we would use our own cache
			 * out of the RAID56 realm.
			 * Thus we want the RAID56 bg to be marked RO to
			 * prevent RMW from screwing up our cache.
			 */
			ro_set = 0;
		} else if (ret == -ETXTBSY) {
			btrfs_warn(fs_info,
		   "skipping scrub of block group %llu due to active swapfile",
				   cache->start);
			scrub_pause_off(fs_info);
			ret = 0;
			goto skip_unfreeze;
		} else {
			btrfs_warn(fs_info,
				   "failed setting block group ro: %d", ret);
			btrfs_unfreeze_block_group(cache);
			btrfs_put_block_group(cache);
			scrub_pause_off(fs_info);
			break;
		}

		/*
		 * Now the target block group is marked RO, wait for nocow writes
		 * to finish before dev-replace.
		 * COW is fine, as COW never overwrites extents in the commit tree.
		 */
		if (sctx->is_dev_replace) {
			btrfs_wait_nocow_writers(cache);
			btrfs_wait_ordered_roots(fs_info, U64_MAX, cache);
		}

		scrub_pause_off(fs_info);
		down_write(&dev_replace->rwsem);
		dev_replace->cursor_right = found_key.offset + dev_extent_len;
		dev_replace->cursor_left = found_key.offset;
		dev_replace->item_needs_writeback = 1;
		up_write(&dev_replace->rwsem);

		ret = scrub_chunk(sctx, cache, scrub_dev, found_key.offset,
				  dev_extent_len);
		if (sctx->is_dev_replace &&
		    !btrfs_finish_block_group_to_copy(dev_replace->srcdev,
						      cache, found_key.offset))
			ro_set = 0;

		down_write(&dev_replace->rwsem);
		dev_replace->cursor_left = dev_replace->cursor_right;
		dev_replace->item_needs_writeback = 1;
		up_write(&dev_replace->rwsem);

		if (ro_set)
			btrfs_dec_block_group_ro(cache);

		/*
		 * We might have prevented the cleaner kthread from deleting
		 * this block group if it was already unused because we raced
		 * and set it to RO mode first. So add it back to the unused
		 * list, otherwise it might not ever be deleted unless a manual
		 * balance is triggered or it becomes used and unused again.
		 */
		spin_lock(&cache->lock);
		if (!test_bit(BLOCK_GROUP_FLAG_REMOVED, &cache->runtime_flags) &&
		    !cache->ro && cache->reserved == 0 && cache->used == 0) {
			spin_unlock(&cache->lock);
			if (btrfs_test_opt(fs_info, DISCARD_ASYNC))
				btrfs_discard_queue_work(&fs_info->discard_ctl,
							 cache);
			else
				btrfs_mark_bg_unused(cache);
		} else {
			spin_unlock(&cache->lock);
		}
skip_unfreeze:
		btrfs_unfreeze_block_group(cache);
		btrfs_put_block_group(cache);
		if (ret)
			break;
		if (sctx->is_dev_replace &&
		    atomic64_read(&dev_replace->num_write_errors) > 0) {
			ret = -EIO;
			break;
		}
		if (sctx->stat.malloc_errors > 0) {
			ret = -ENOMEM;
			break;
		}
skip:
		key.offset = found_key.offset + dev_extent_len;
		btrfs_release_path(path);
	}

	btrfs_free_path(path);

	return ret;
}
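
/*
 * Read one super block copy from @physical on @dev into @page and verify its
 * checksum, generation and general sanity.  The read is submitted
 * synchronously with an on-stack bio, so none of the scrub_stripe machinery
 * is needed here.
 */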
static int scrub_one_super(struct scrub_ctx *sctx, struct btrfs_device *dev,
			   struct page *page, u64 physical, u64 generation)
{
	struct btrfs_fs_info *fs_info = sctx->fs_info;
	struct bio_vec bvec;
	struct bio bio;
	struct btrfs_super_block *sb = page_address(page);
	int ret;

	bio_init(&bio, dev->bdev, &bvec, 1, REQ_OP_READ);
	bio.bi_iter.bi_sector = physical >> SECTOR_SHIFT;
	__bio_add_page(&bio, page, BTRFS_SUPER_INFO_SIZE, 0);
	ret = submit_bio_wait(&bio);
	bio_uninit(&bio);

	if (ret < 0)
		return ret;
	ret = btrfs_check_super_csum(fs_info, sb);
	if (ret != 0) {
		btrfs_err_rl(fs_info,
			"super block at physical %llu devid %llu has bad csum",
			physical, dev->devid);
		return -EIO;
	}
	if (btrfs_super_generation(sb) != generation) {
		btrfs_err_rl(fs_info,
"super block at physical %llu devid %llu has bad generation %llu expect %llu",
			     physical, dev->devid,
			     btrfs_super_generation(sb), generation);
		return -EUCLEAN;
	}

	return btrfs_validate_super(fs_info, sb, -1);
}
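
/*
 * Scrub all super block copies of @scrub_dev.  Copies beyond the committed
 * device size are skipped, and errors are only accounted in
 * sctx->stat.super_errors; fixing them is deferred to the transaction commit
 * triggered via need_commit in btrfs_scrub_dev().
 */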
static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
					   struct btrfs_device *scrub_dev)
{
	int i;
	u64 bytenr;
	u64 gen;
	int ret = 0;
	struct page *page;
	struct btrfs_fs_info *fs_info = sctx->fs_info;

	if (BTRFS_FS_ERROR(fs_info))
		return -EROFS;

	page = alloc_page(GFP_KERNEL);
	if (!page) {
		spin_lock(&sctx->stat_lock);
		sctx->stat.malloc_errors++;
		spin_unlock(&sctx->stat_lock);
		return -ENOMEM;
	}

	/* Seed devices of a new filesystem have their own generation. */
	if (scrub_dev->fs_devices != fs_info->fs_devices)
		gen = scrub_dev->generation;
	else
		gen = btrfs_get_last_trans_committed(fs_info);

	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
		ret = btrfs_sb_log_location(scrub_dev, i, 0, &bytenr);
		if (ret == -ENOENT)
			break;

		if (ret) {
			spin_lock(&sctx->stat_lock);
			sctx->stat.super_errors++;
			spin_unlock(&sctx->stat_lock);
			continue;
		}

		if (bytenr + BTRFS_SUPER_INFO_SIZE >
		    scrub_dev->commit_total_bytes)
			break;
		if (!btrfs_check_super_location(scrub_dev, bytenr))
			continue;

		ret = scrub_one_super(sctx, scrub_dev, page, bytenr, gen);
		if (ret) {
			spin_lock(&sctx->stat_lock);
			sctx->stat.super_errors++;
			spin_unlock(&sctx->stat_lock);
		}
	}
	__free_page(page);
	return 0;
}
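
/*
 * Drop one reference on fs_info->scrub_workers.  The last reference detaches
 * the workqueue under scrub_lock and destroys it after dropping the lock.
 */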
static void scrub_workers_put(struct btrfs_fs_info *fs_info)
{
	if (refcount_dec_and_mutex_lock(&fs_info->scrub_workers_refcnt,
					&fs_info->scrub_lock)) {
		struct workqueue_struct *scrub_workers = fs_info->scrub_workers;

		fs_info->scrub_workers = NULL;
		mutex_unlock(&fs_info->scrub_lock);

		if (scrub_workers)
			destroy_workqueue(scrub_workers);
	}
}
/*
 * Get a reference count on fs_info->scrub_workers. Start the workers if
 * necessary.
 */
static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info)
{
	struct workqueue_struct *scrub_workers = NULL;
	unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND;
	int max_active = fs_info->thread_pool_size;

	if (refcount_inc_not_zero(&fs_info->scrub_workers_refcnt))
		return 0;

	scrub_workers = alloc_workqueue("btrfs-scrub", flags, max_active);
	if (!scrub_workers)
		return -ENOMEM;

	mutex_lock(&fs_info->scrub_lock);
	if (refcount_read(&fs_info->scrub_workers_refcnt) == 0) {
		ASSERT(fs_info->scrub_workers == NULL);
		fs_info->scrub_workers = scrub_workers;
		refcount_set(&fs_info->scrub_workers_refcnt, 1);
		mutex_unlock(&fs_info->scrub_lock);
		return 0;
	}
	/* Another thread raced in and created the workers for us. */
	refcount_inc(&fs_info->scrub_workers_refcnt);
	mutex_unlock(&fs_info->scrub_lock);

	destroy_workqueue(scrub_workers);
	return 0;
}
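
/*
 * Note the ordering in scrub_workers_get() above: the workqueue is allocated
 * before scrub_lock is taken, so the allocation never sleeps under the lock;
 * if another thread won the race and installed its own workqueue, ours is
 * simply destroyed again.
 */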
int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
		    u64 end, struct btrfs_scrub_progress *progress,
		    int readonly, int is_dev_replace)
{
	struct btrfs_dev_lookup_args args = { .devid = devid };
	struct scrub_ctx *sctx;
	int ret;
	struct btrfs_device *dev;
	unsigned int nofs_flag;
	bool need_commit = false;

	if (btrfs_fs_closing(fs_info))
		return -EAGAIN;

	/* At mount time we have ensured nodesize is in the range of [4K, 64K]. */
	ASSERT(fs_info->nodesize <= BTRFS_STRIPE_LEN);

	/*
	 * SCRUB_MAX_SECTORS_PER_BLOCK is calculated using the largest possible
	 * value (max nodesize / min sectorsize), thus nodesize should always
	 * be fine.
	 */
	ASSERT(fs_info->nodesize <=
	       SCRUB_MAX_SECTORS_PER_BLOCK << fs_info->sectorsize_bits);

	/* Allocate outside of device_list_mutex */
	sctx = scrub_setup_ctx(fs_info, is_dev_replace);
	if (IS_ERR(sctx))
		return PTR_ERR(sctx);

	ret = scrub_workers_get(fs_info);
	if (ret)
		goto out_free_ctx;

	mutex_lock(&fs_info->fs_devices->device_list_mutex);
	dev = btrfs_find_device(fs_info->fs_devices, &args);
	if (!dev || (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) &&
		     !is_dev_replace)) {
		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
		ret = -ENODEV;
		goto out;
	}

	if (!is_dev_replace && !readonly &&
	    !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) {
		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
		btrfs_err_in_rcu(fs_info,
			"scrub on devid %llu: filesystem on %s is not writable",
				 devid, btrfs_dev_name(dev));
		ret = -EROFS;
		goto out;
	}

	mutex_lock(&fs_info->scrub_lock);
	if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
	    test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &dev->dev_state)) {
		mutex_unlock(&fs_info->scrub_lock);
		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
		ret = -EIO;
		goto out;
	}

	down_read(&fs_info->dev_replace.rwsem);
	if (dev->scrub_ctx ||
	    (!is_dev_replace &&
	     btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) {
		up_read(&fs_info->dev_replace.rwsem);
		mutex_unlock(&fs_info->scrub_lock);
		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
		ret = -EINPROGRESS;
		goto out;
	}
	up_read(&fs_info->dev_replace.rwsem);

	sctx->readonly = readonly;
	dev->scrub_ctx = sctx;
	mutex_unlock(&fs_info->fs_devices->device_list_mutex);

	/*
	 * By checking @scrub_pause_req here, we can avoid the race between
	 * committing a transaction and scrubbing.
	 */
	__scrub_blocked_if_needed(fs_info);
	atomic_inc(&fs_info->scrubs_running);
	mutex_unlock(&fs_info->scrub_lock);

	/*
	 * In order to avoid deadlock with reclaim when there is a transaction
	 * trying to pause scrub, make sure we use GFP_NOFS for all the
	 * allocations done at btrfs_scrub_sectors() and scrub_sectors_for_parity()
	 * invoked by our callees. The pausing request is done when the
	 * transaction commit starts, and it blocks the transaction until scrub
	 * is paused (done at specific points at scrub_stripe() or right above
	 * before incrementing fs_info->scrubs_running).
	 */
	nofs_flag = memalloc_nofs_save();
	if (!is_dev_replace) {
		u64 old_super_errors;

		spin_lock(&sctx->stat_lock);
		old_super_errors = sctx->stat.super_errors;
		spin_unlock(&sctx->stat_lock);

		btrfs_info(fs_info, "scrub: started on devid %llu", devid);
		/*
		 * By holding the device list mutex, we can
		 * kick off writing the super in log tree sync.
		 */
		mutex_lock(&fs_info->fs_devices->device_list_mutex);
		ret = scrub_supers(sctx, dev);
		mutex_unlock(&fs_info->fs_devices->device_list_mutex);

		spin_lock(&sctx->stat_lock);
		/*
		 * Super block errors found, but we can not commit a transaction
		 * in the current context, since btrfs_commit_transaction() needs
		 * to pause the currently running scrub (held by ourselves).
		 */
		if (sctx->stat.super_errors > old_super_errors && !sctx->readonly)
			need_commit = true;
		spin_unlock(&sctx->stat_lock);
	}

	if (!ret)
		ret = scrub_enumerate_chunks(sctx, dev, start, end);
	memalloc_nofs_restore(nofs_flag);

	atomic_dec(&fs_info->scrubs_running);
	wake_up(&fs_info->scrub_pause_wait);

	if (progress)
		memcpy(progress, &sctx->stat, sizeof(*progress));

	if (!is_dev_replace)
		btrfs_info(fs_info, "scrub: %s on devid %llu with status: %d",
			ret ? "not finished" : "finished", devid, ret);

	mutex_lock(&fs_info->scrub_lock);
	dev->scrub_ctx = NULL;
	mutex_unlock(&fs_info->scrub_lock);

	scrub_workers_put(fs_info);
	scrub_put_ctx(sctx);

	/*
	 * We found some super block errors before, now try to force a
	 * transaction commit, as scrub has finished.
	 */
	if (need_commit) {
		struct btrfs_trans_handle *trans;

		trans = btrfs_start_transaction(fs_info->tree_root, 0);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			btrfs_err(fs_info,
	"scrub: failed to start transaction to fix super block errors: %d", ret);
			return ret;
		}
		ret = btrfs_commit_transaction(trans);
		if (ret < 0)
			btrfs_err(fs_info,
	"scrub: failed to commit transaction to fix super block errors: %d", ret);
	}
	return ret;
out:
	scrub_workers_put(fs_info);
out_free_ctx:
	scrub_free_ctx(sctx);

	return ret;
}
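
/*
 * btrfs_scrub_pause() is called when a transaction commit starts: it raises
 * scrub_pause_req and waits until every running scrub has parked itself
 * (scrubs_paused == scrubs_running).  btrfs_scrub_continue() lifts the
 * request again and wakes the waiters.
 */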
void btrfs_scrub_pause(struct btrfs_fs_info *fs_info)
{
	mutex_lock(&fs_info->scrub_lock);
	atomic_inc(&fs_info->scrub_pause_req);
	while (atomic_read(&fs_info->scrubs_paused) !=
	       atomic_read(&fs_info->scrubs_running)) {
		mutex_unlock(&fs_info->scrub_lock);
		wait_event(fs_info->scrub_pause_wait,
			   atomic_read(&fs_info->scrubs_paused) ==
			   atomic_read(&fs_info->scrubs_running));
		mutex_lock(&fs_info->scrub_lock);
	}
	mutex_unlock(&fs_info->scrub_lock);
}
void btrfs_scrub_continue(struct btrfs_fs_info *fs_info)
{
	atomic_dec(&fs_info->scrub_pause_req);
	wake_up(&fs_info->scrub_pause_wait);
}
int btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)
{
	mutex_lock(&fs_info->scrub_lock);
	if (!atomic_read(&fs_info->scrubs_running)) {
		mutex_unlock(&fs_info->scrub_lock);
		return -ENOTCONN;
	}

	atomic_inc(&fs_info->scrub_cancel_req);
	while (atomic_read(&fs_info->scrubs_running)) {
		mutex_unlock(&fs_info->scrub_lock);
		wait_event(fs_info->scrub_pause_wait,
			   atomic_read(&fs_info->scrubs_running) == 0);
		mutex_lock(&fs_info->scrub_lock);
	}
	atomic_dec(&fs_info->scrub_cancel_req);
	mutex_unlock(&fs_info->scrub_lock);

	return 0;
}
int btrfs_scrub_cancel_dev(struct btrfs_device *dev)
{
	struct btrfs_fs_info *fs_info = dev->fs_info;
	struct scrub_ctx *sctx;

	mutex_lock(&fs_info->scrub_lock);
	sctx = dev->scrub_ctx;
	if (!sctx) {
		mutex_unlock(&fs_info->scrub_lock);
		return -ENOTCONN;
	}
	atomic_inc(&sctx->cancel_req);
	while (dev->scrub_ctx) {
		mutex_unlock(&fs_info->scrub_lock);
		wait_event(fs_info->scrub_pause_wait,
			   dev->scrub_ctx == NULL);
		mutex_lock(&fs_info->scrub_lock);
	}
	mutex_unlock(&fs_info->scrub_lock);

	return 0;
}
int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid,
			 struct btrfs_scrub_progress *progress)
{
	struct btrfs_dev_lookup_args args = { .devid = devid };
	struct btrfs_device *dev;
	struct scrub_ctx *sctx = NULL;

	mutex_lock(&fs_info->fs_devices->device_list_mutex);
	dev = btrfs_find_device(fs_info->fs_devices, &args);
	if (dev)
		sctx = dev->scrub_ctx;
	if (sctx)
		memcpy(progress, &sctx->stat, sizeof(*progress));
	mutex_unlock(&fs_info->fs_devices->device_list_mutex);

	return dev ? (sctx ? 0 : -ENOTCONN) : -ENODEV;
}