/*
 * Copyright (C) 2011 STRATO.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */
#include <linux/blkdev.h>
#include <linux/ratelimit.h>
#include "ctree.h"
#include "volumes.h"
#include "disk-io.h"
#include "ordered-data.h"
#include "transaction.h"
#include "backref.h"
#include "extent_io.h"
#include "check-integrity.h"
/*
 * This is only the first step towards a full-featured scrub. It reads all
 * extent and super block and verifies the checksums. In case a bad checksum
 * is found or the extent cannot be read, good data will be written back if
 * any can be found.
 *
 * Future enhancements:
 *  - In case an unrepairable extent is encountered, track which files are
 *    affected and report them
 *  - In case of a read error on files with nodatasum, map the file and read
 *    the extent to trigger a writeback of the good copy
 *  - track and record media errors, throw out bad devices
 *  - add a mode to also read unallocated space
 */
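/*
 * Rough call graph of this file, added as an orientation aid (derived from
 * the functions below, not authoritative documentation):
 *
 *   btrfs_scrub_dev()
 *     scrub_supers()              check the super block copies
 *     scrub_enumerate_chunks()    walk the dev extents of the device
 *       scrub_chunk()
 *         scrub_stripe()          walk one stripe of a chunk
 *           scrub_extent()        split an extent into page-sized pieces
 *             scrub_page()        queue a page into the current scrub_bio
 *
 * Completed bios are verified in scrub_checksum(); pages that fail
 * verification are handed to scrub_recheck_error()/scrub_fixup() for repair.
 */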
static void scrub_bio_end_io(struct bio *bio, int err);
static void scrub_checksum(struct btrfs_work *work);
static int scrub_checksum_data(struct scrub_dev *sdev,
			       struct scrub_page *spag, void *buffer);
static int scrub_checksum_tree_block(struct scrub_dev *sdev,
				     struct scrub_page *spag, u64 logical,
				     void *buffer);
static int scrub_checksum_super(struct scrub_bio *sbio, void *buffer);
static int scrub_fixup_check(struct scrub_bio *sbio, int ix);
static void scrub_fixup_end_io(struct bio *bio, int err);
static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector,
			  struct page *page);
static void scrub_fixup(struct scrub_bio *sbio, int ix);
#define SCRUB_PAGES_PER_BIO	16	/* 64k per bio */
#define SCRUB_BIOS_PER_DEV	16	/* 1 MB per device in flight */
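/*
 * Sizing note (added): with 4 KiB pages, 16 pages per bio is
 * 16 * 4 KiB = 64 KiB per bio, and 16 such bios in flight per device is
 * 16 * 64 KiB = 1 MiB, which is where the figures in the comments above
 * come from; on other page sizes the byte counts scale with PAGE_SIZE.
 */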
	u64			flags;  /* extent flags */
	u8			csum[BTRFS_CSUM_SIZE];

	struct scrub_dev	*sdev;
	struct scrub_page	spag[SCRUB_PAGES_PER_BIO];
	struct btrfs_work	work;

	struct scrub_bio	*bios[SCRUB_BIOS_PER_DEV];
	struct btrfs_device	*dev;
	wait_queue_head_t	list_wait;
	struct list_head	csum_list;
	struct btrfs_scrub_progress stat;
	spinlock_t		stat_lock;
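/*
 * Note (added): the fields above belong to three small structures that are
 * only partially preserved here: scrub_page (per-page extent flags and
 * checksum), scrub_bio (one in-flight bio plus its spag[] array of
 * scrub_pages and a btrfs_work item), and scrub_dev (per-device state:
 * the bios[] array managed as a free list via first_free/next_free, the
 * pending csum_list, and the btrfs_scrub_progress statistics protected by
 * stat_lock).  The code in the rest of the file follows this layout.
 */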
struct scrub_fixup_nodatasum {
	struct scrub_dev	*sdev;
	struct btrfs_root	*root;
	struct btrfs_work	work;

struct scrub_warning {
	struct btrfs_path	*path;
	u64			extent_item_size;
	struct btrfs_device	*dev;

static void scrub_free_csums(struct scrub_dev *sdev)
	while (!list_empty(&sdev->csum_list)) {
		struct btrfs_ordered_sum *sum;
		sum = list_first_entry(&sdev->csum_list,
				       struct btrfs_ordered_sum, list);
		list_del(&sum->list);

static void scrub_free_bio(struct bio *bio)
	struct page *last_page = NULL;

	for (i = 0; i < bio->bi_vcnt; ++i) {
		if (bio->bi_io_vec[i].bv_page == last_page)
		last_page = bio->bi_io_vec[i].bv_page;
		__free_page(last_page);
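/*
 * Note (added): the last_page check in the loop above guards against
 * calling __free_page() twice when consecutive bio_vec entries reference
 * the same page; only the first occurrence of each page is freed.
 */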
static noinline_for_stack void scrub_free_dev(struct scrub_dev *sdev)
	for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) {
		struct scrub_bio *sbio = sdev->bios[i];
		scrub_free_bio(sbio->bio);
	scrub_free_csums(sdev);
static noinline_for_stack
struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev)
	struct scrub_dev *sdev;
	struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;

	sdev = kzalloc(sizeof(*sdev), GFP_NOFS);
	for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) {
		struct scrub_bio *sbio;

		sbio = kzalloc(sizeof(*sbio), GFP_NOFS);
		sdev->bios[i] = sbio;

		sbio->work.func = scrub_checksum;

		if (i != SCRUB_BIOS_PER_DEV-1)
			sdev->bios[i]->next_free = i + 1;
		else
			sdev->bios[i]->next_free = -1;

	sdev->first_free = 0;
	atomic_set(&sdev->in_flight, 0);
	atomic_set(&sdev->fixup_cnt, 0);
	atomic_set(&sdev->cancel_req, 0);
	sdev->csum_size = btrfs_super_csum_size(fs_info->super_copy);
	INIT_LIST_HEAD(&sdev->csum_list);

	spin_lock_init(&sdev->list_lock);
	spin_lock_init(&sdev->stat_lock);
	init_waitqueue_head(&sdev->list_wait);

	scrub_free_dev(sdev);
	return ERR_PTR(-ENOMEM);
static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx)
	struct extent_buffer *eb;
	struct btrfs_inode_item *inode_item;
	struct scrub_warning *swarn = ctx;
	struct btrfs_fs_info *fs_info = swarn->dev->dev_root->fs_info;
	struct inode_fs_paths *ipath = NULL;
	struct btrfs_root *local_root;
	struct btrfs_key root_key;

	root_key.objectid = root;
	root_key.type = BTRFS_ROOT_ITEM_KEY;
	root_key.offset = (u64)-1;
	local_root = btrfs_read_fs_root_no_name(fs_info, &root_key);
	if (IS_ERR(local_root)) {
		ret = PTR_ERR(local_root);

	ret = inode_item_info(inum, 0, local_root, swarn->path);
		btrfs_release_path(swarn->path);

	eb = swarn->path->nodes[0];
	inode_item = btrfs_item_ptr(eb, swarn->path->slots[0],
					struct btrfs_inode_item);
	isize = btrfs_inode_size(eb, inode_item);
	nlink = btrfs_inode_nlink(eb, inode_item);
	btrfs_release_path(swarn->path);

	ipath = init_ipath(4096, local_root, swarn->path);
		ret = PTR_ERR(ipath);
	ret = paths_from_inode(inum, ipath);

	/*
	 * we deliberately ignore the fact that ipath might have been too
	 * small to hold all of the paths here
	 */
	for (i = 0; i < ipath->fspath->elem_cnt; ++i)
		printk(KERN_WARNING "btrfs: %s at logical %llu on dev "
			"%s, sector %llu, root %llu, inode %llu, offset %llu, "
			"length %llu, links %u (path: %s)\n", swarn->errstr,
			swarn->logical, swarn->dev->name,
			(unsigned long long)swarn->sector, root, inum, offset,
			min(isize - offset, (u64)PAGE_SIZE), nlink,
			(char *)(unsigned long)ipath->fspath->val[i]);

	printk(KERN_WARNING "btrfs: %s at logical %llu on dev "
		"%s, sector %llu, root %llu, inode %llu, offset %llu: path "
		"resolving failed with ret=%d\n", swarn->errstr,
		swarn->logical, swarn->dev->name,
		(unsigned long long)swarn->sector, root, inum, offset, ret);
static void scrub_print_warning(const char *errstr, struct scrub_bio *sbio,
				int ix)
	struct btrfs_device *dev = sbio->sdev->dev;
	struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;
	struct btrfs_path *path;
	struct btrfs_key found_key;
	struct extent_buffer *eb;
	struct btrfs_extent_item *ei;
	struct scrub_warning swarn;
	unsigned long ptr = 0;
	const int bufsize = 4096;

	path = btrfs_alloc_path();

	swarn.scratch_buf = kmalloc(bufsize, GFP_NOFS);
	swarn.msg_buf = kmalloc(bufsize, GFP_NOFS);
	swarn.sector = (sbio->physical + ix * PAGE_SIZE) >> 9;
	swarn.logical = sbio->logical + ix * PAGE_SIZE;
	swarn.errstr = errstr;
	swarn.msg_bufsize = bufsize;
	swarn.scratch_bufsize = bufsize;

	if (!path || !swarn.scratch_buf || !swarn.msg_buf)

	ret = extent_from_logical(fs_info, swarn.logical, path, &found_key);

	extent_item_pos = swarn.logical - found_key.objectid;
	swarn.extent_item_size = found_key.offset;

	ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
	item_size = btrfs_item_size_nr(eb, path->slots[0]);
	btrfs_release_path(path);

	if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
		ret = tree_backref_for_extent(&ptr, eb, ei, item_size,
						&ref_root, &ref_level);
		printk(KERN_WARNING "%s at logical %llu on dev %s, "
			"sector %llu: metadata %s (level %d) in tree "
			"%llu\n", errstr, swarn.logical, dev->name,
			(unsigned long long)swarn.sector,
			ref_level ? "node" : "leaf",
			ret < 0 ? -1 : ref_level,
			ret < 0 ? -1 : ref_root);
	} else {
		iterate_extent_inodes(fs_info, found_key.objectid,
					extent_item_pos,
					scrub_print_warning_inode, &swarn);

	btrfs_free_path(path);
	kfree(swarn.scratch_buf);
	kfree(swarn.msg_buf);
static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *ctx)
	struct page *page = NULL;
	struct scrub_fixup_nodatasum *fixup = ctx;
	struct btrfs_key key;
	struct inode *inode = NULL;
	u64 end = offset + PAGE_SIZE - 1;
	struct btrfs_root *local_root;

	key.type = BTRFS_ROOT_ITEM_KEY;
	key.offset = (u64)-1;
	local_root = btrfs_read_fs_root_no_name(fixup->root->fs_info, &key);
	if (IS_ERR(local_root))
		return PTR_ERR(local_root);

	key.type = BTRFS_INODE_ITEM_KEY;
	inode = btrfs_iget(fixup->root->fs_info->sb, &key, local_root, NULL);
		return PTR_ERR(inode);

	index = offset >> PAGE_CACHE_SHIFT;

	page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);

	if (PageUptodate(page)) {
		struct btrfs_mapping_tree *map_tree;
		if (PageDirty(page)) {
			/*
			 * we need to write the data to the defect sector. the
			 * data that was in that sector is not in memory,
			 * because the page was modified. we must not write the
			 * modified page to that sector.
			 *
			 * TODO: what could be done here: wait for the delalloc
			 *       runner to write out that page (might involve
			 *       COW) and see whether the sector is still
			 *       referenced afterwards.
			 *
			 * For the meantime, we'll treat this error as
			 * uncorrectable, although there is a chance that a
			 * later scrub will find the bad sector again and that
			 * there's no dirty page in memory, then.
			 */
		map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree;
		ret = repair_io_failure(map_tree, offset, PAGE_SIZE,
					fixup->logical, page,
					fixup->mirror_num);
	} else {
		/*
		 * we need to get good data first. the general readpage path
		 * will call repair_io_failure for us, we just have to make
		 * sure we read the bad mirror.
		 */
		ret = set_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
					EXTENT_DAMAGED, GFP_NOFS);
			/* set_extent_bits should give proper error */
		ret = extent_read_full_page(&BTRFS_I(inode)->io_tree, page,
						btrfs_get_extent,
						fixup->mirror_num);
		wait_on_page_locked(page);

		corrected = !test_range_bit(&BTRFS_I(inode)->io_tree, offset,
						end, EXTENT_DAMAGED, 0, NULL);
		clear_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
					EXTENT_DAMAGED, GFP_NOFS);

	if (ret == 0 && corrected) {
		/*
		 * we only need to call readpage for one of the inodes belonging
		 * to this extent. so make iterate_extent_inodes stop
		 */
static void scrub_fixup_nodatasum(struct btrfs_work *work)
	struct scrub_fixup_nodatasum *fixup;
	struct scrub_dev *sdev;
	struct btrfs_trans_handle *trans = NULL;
	struct btrfs_fs_info *fs_info;
	struct btrfs_path *path;
	int uncorrectable = 0;

	fixup = container_of(work, struct scrub_fixup_nodatasum, work);
	fs_info = fixup->root->fs_info;

	path = btrfs_alloc_path();
		spin_lock(&sdev->stat_lock);
		++sdev->stat.malloc_errors;
		spin_unlock(&sdev->stat_lock);

	trans = btrfs_join_transaction(fixup->root);

	/*
	 * the idea is to trigger a regular read through the standard path. we
	 * read a page from the (failed) logical address by specifying the
	 * corresponding copynum of the failed sector. thus, that readpage is
	 * expected to fail.
	 * that is the point where on-the-fly error correction will kick in
	 * (once it's finished) and rewrite the failed sector if a good copy
	 * can be found.
	 */
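	/*
	 * Note (added): iterate_inodes_from_logical() below invokes
	 * scrub_fixup_readpage() for the inodes that reference the failed
	 * logical address; that callback marks the range EXTENT_DAMAGED,
	 * rereads the page through the normal readpage path and reports
	 * success once the damaged bit has been cleared again.
	 */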
	ret = iterate_inodes_from_logical(fixup->logical, fixup->root->fs_info,
						path, scrub_fixup_readpage,
						fixup);

		spin_lock(&sdev->stat_lock);
		++sdev->stat.corrected_errors;
		spin_unlock(&sdev->stat_lock);

	if (trans && !IS_ERR(trans))
		btrfs_end_transaction(trans, fixup->root);
		spin_lock(&sdev->stat_lock);
		++sdev->stat.uncorrectable_errors;
		spin_unlock(&sdev->stat_lock);
		printk_ratelimited(KERN_ERR "btrfs: unable to fixup "
					"(nodatasum) error at logical %llu\n",
					(unsigned long long)fixup->logical);

	btrfs_free_path(path);

	/* see caller why we're pretending to be paused in the scrub counters */
	mutex_lock(&fs_info->scrub_lock);
	atomic_dec(&fs_info->scrubs_running);
	atomic_dec(&fs_info->scrubs_paused);
	mutex_unlock(&fs_info->scrub_lock);
	atomic_dec(&sdev->fixup_cnt);
	wake_up(&fs_info->scrub_pause_wait);
	wake_up(&sdev->list_wait);
/*
 * scrub_recheck_error gets called when either verification of the page
 * failed or the bio failed to read, e.g. with EIO. In the latter case,
 * recheck_error gets called for every page in the bio, even though only
 * some may be bad
 */
static int scrub_recheck_error(struct scrub_bio *sbio, int ix)
	struct scrub_dev *sdev = sbio->sdev;
	u64 sector = (sbio->physical + ix * PAGE_SIZE) >> 9;
	static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
					DEFAULT_RATELIMIT_BURST);

	if (scrub_fixup_io(READ, sbio->sdev->dev->bdev, sector,
			   sbio->bio->bi_io_vec[ix].bv_page) == 0) {
		if (scrub_fixup_check(sbio, ix) == 0)

	if (__ratelimit(&_rs))
		scrub_print_warning("i/o error", sbio, ix);
	if (__ratelimit(&_rs))
		scrub_print_warning("checksum error", sbio, ix);

	spin_lock(&sdev->stat_lock);
	++sdev->stat.read_errors;
	spin_unlock(&sdev->stat_lock);

	scrub_fixup(sbio, ix);
static int scrub_fixup_check(struct scrub_bio *sbio, int ix)
	u64 flags = sbio->spag[ix].flags;

	page = sbio->bio->bi_io_vec[ix].bv_page;
	buffer = kmap_atomic(page, KM_USER0);
	if (flags & BTRFS_EXTENT_FLAG_DATA) {
		ret = scrub_checksum_data(sbio->sdev,
					  sbio->spag + ix, buffer);
	} else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
		ret = scrub_checksum_tree_block(sbio->sdev,
						sbio->spag + ix,
						sbio->logical + ix * PAGE_SIZE,
						buffer);
	kunmap_atomic(buffer, KM_USER0);
static void scrub_fixup_end_io(struct bio *bio, int err)
	complete((struct completion *)bio->bi_private);

static void scrub_fixup(struct scrub_bio *sbio, int ix)
	struct scrub_dev *sdev = sbio->sdev;
	struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info;
	struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
	struct btrfs_bio *bbio = NULL;
	struct scrub_fixup_nodatasum *fixup;
	u64 logical = sbio->logical + ix * PAGE_SIZE;
	DECLARE_COMPLETION_ONSTACK(complete);

	if ((sbio->spag[ix].flags & BTRFS_EXTENT_FLAG_DATA) &&
	    (sbio->spag[ix].have_csum == 0)) {
		fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
		fixup->logical = logical;
		fixup->root = fs_info->extent_root;
		fixup->mirror_num = sbio->spag[ix].mirror_num;
		/*
		 * increment scrubs_running to prevent cancel requests from
		 * completing as long as a fixup worker is running. we must also
		 * increment scrubs_paused to prevent deadlocking on pause
		 * requests used for transaction commits (as the worker uses a
		 * transaction context). it is safe to regard the fixup worker
		 * as paused for all matters practical. effectively, we only
		 * avoid cancellation requests from completing.
		 */
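		/*
		 * Note (added): the matching decrements are done at the end
		 * of scrub_fixup_nodatasum(), together with the fixup_cnt
		 * decrement and the wake_up() calls, once the worker has
		 * finished.
		 */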
		mutex_lock(&fs_info->scrub_lock);
		atomic_inc(&fs_info->scrubs_running);
		atomic_inc(&fs_info->scrubs_paused);
		mutex_unlock(&fs_info->scrub_lock);
		atomic_inc(&sdev->fixup_cnt);
		fixup->work.func = scrub_fixup_nodatasum;
		btrfs_queue_worker(&fs_info->scrub_workers, &fixup->work);

	ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length,
			      &bbio, 0);
	if (ret || !bbio || length < PAGE_SIZE) {
		printk(KERN_ERR
		       "scrub_fixup: btrfs_map_block failed us for %llu\n",
		       (unsigned long long)logical);

	if (bbio->num_stripes == 1)
		/* there aren't any replicas */

	/*
	 * first find a good copy
	 */
	for (i = 0; i < bbio->num_stripes; ++i) {
		if (i + 1 == sbio->spag[ix].mirror_num)

		if (scrub_fixup_io(READ, bbio->stripes[i].dev->bdev,
				   bbio->stripes[i].physical >> 9,
				   sbio->bio->bi_io_vec[ix].bv_page)) {
			/* I/O-error, this is not a good copy */

		if (scrub_fixup_check(sbio, ix) == 0)

	if (i == bbio->num_stripes)

	if (!sdev->readonly) {
		/*
		 * bi_io_vec[ix].bv_page now contains good data, write it back
		 */
		if (scrub_fixup_io(WRITE, sdev->dev->bdev,
				   (sbio->physical + ix * PAGE_SIZE) >> 9,
				   sbio->bio->bi_io_vec[ix].bv_page)) {
			/* I/O-error, writeback failed, give up */

	spin_lock(&sdev->stat_lock);
	++sdev->stat.corrected_errors;
	spin_unlock(&sdev->stat_lock);

	printk_ratelimited(KERN_ERR "btrfs: fixed up error at logical %llu\n",
			   (unsigned long long)logical);

	spin_lock(&sdev->stat_lock);
	++sdev->stat.uncorrectable_errors;
	spin_unlock(&sdev->stat_lock);

	printk_ratelimited(KERN_ERR "btrfs: unable to fixup (regular) error at "
			   "logical %llu\n", (unsigned long long)logical);
static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector,
			  struct page *page)
	struct bio *bio = NULL;
	DECLARE_COMPLETION_ONSTACK(complete);

	bio = bio_alloc(GFP_NOFS, 1);
	bio->bi_sector = sector;
	bio_add_page(bio, page, PAGE_SIZE, 0);
	bio->bi_end_io = scrub_fixup_end_io;
	bio->bi_private = &complete;
	btrfsic_submit_bio(rw, bio);

	/* this will also unplug the queue */
	wait_for_completion(&complete);

	ret = !test_bit(BIO_UPTODATE, &bio->bi_flags);

static void scrub_bio_end_io(struct bio *bio, int err)
	struct scrub_bio *sbio = bio->bi_private;
	struct scrub_dev *sdev = sbio->sdev;
	struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info;

	btrfs_queue_worker(&fs_info->scrub_workers, &sbio->work);
static void scrub_checksum(struct btrfs_work *work)
	struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
	struct scrub_dev *sdev = sbio->sdev;

		for (i = 0; i < sbio->count; ++i)
			ret |= scrub_recheck_error(sbio, i);
			spin_lock(&sdev->stat_lock);
			++sdev->stat.unverified_errors;
			spin_unlock(&sdev->stat_lock);

		sbio->bio->bi_flags &= ~(BIO_POOL_MASK - 1);
		sbio->bio->bi_flags |= 1 << BIO_UPTODATE;
		sbio->bio->bi_phys_segments = 0;
		sbio->bio->bi_idx = 0;

		for (i = 0; i < sbio->count; i++) {
			bi = &sbio->bio->bi_io_vec[i];
			bi->bv_len = PAGE_SIZE;

	for (i = 0; i < sbio->count; ++i) {
		page = sbio->bio->bi_io_vec[i].bv_page;
		buffer = kmap_atomic(page, KM_USER0);
		flags = sbio->spag[i].flags;
		logical = sbio->logical + i * PAGE_SIZE;
		if (flags & BTRFS_EXTENT_FLAG_DATA) {
			ret = scrub_checksum_data(sdev, sbio->spag + i, buffer);
		} else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
			ret = scrub_checksum_tree_block(sdev, sbio->spag + i,
							logical, buffer);
		} else if (flags & BTRFS_EXTENT_FLAG_SUPER) {
			(void)scrub_checksum_super(sbio, buffer);
		kunmap_atomic(buffer, KM_USER0);
			ret = scrub_recheck_error(sbio, i);
				spin_lock(&sdev->stat_lock);
				++sdev->stat.unverified_errors;
				spin_unlock(&sdev->stat_lock);

	scrub_free_bio(sbio->bio);
	spin_lock(&sdev->list_lock);
	sbio->next_free = sdev->first_free;
	sdev->first_free = sbio->index;
	spin_unlock(&sdev->list_lock);
	atomic_dec(&sdev->in_flight);
	wake_up(&sdev->list_wait);
static int scrub_checksum_data(struct scrub_dev *sdev,
			       struct scrub_page *spag, void *buffer)
	u8 csum[BTRFS_CSUM_SIZE];
	struct btrfs_root *root = sdev->dev->dev_root;

	if (!spag->have_csum)

	crc = btrfs_csum_data(root, buffer, crc, PAGE_SIZE);
	btrfs_csum_final(crc, csum);
	if (memcmp(csum, spag->csum, sdev->csum_size))

	spin_lock(&sdev->stat_lock);
	++sdev->stat.data_extents_scrubbed;
	sdev->stat.data_bytes_scrubbed += PAGE_SIZE;
		++sdev->stat.csum_errors;
	spin_unlock(&sdev->stat_lock);
static int scrub_checksum_tree_block(struct scrub_dev *sdev,
				     struct scrub_page *spag, u64 logical,
				     void *buffer)
	struct btrfs_header *h;
	struct btrfs_root *root = sdev->dev->dev_root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	u8 csum[BTRFS_CSUM_SIZE];

	/*
	 * we don't use the getter functions here, as we
	 * a) don't have an extent buffer and
	 * b) the page is already kmapped
	 */
	h = (struct btrfs_header *)buffer;

	if (logical != le64_to_cpu(h->bytenr))

	if (spag->generation != le64_to_cpu(h->generation))

	if (memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE))

	if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
		   BTRFS_UUID_SIZE))

	crc = btrfs_csum_data(root, buffer + BTRFS_CSUM_SIZE, crc,
			      PAGE_SIZE - BTRFS_CSUM_SIZE);
	btrfs_csum_final(crc, csum);
	if (memcmp(csum, h->csum, sdev->csum_size))

	spin_lock(&sdev->stat_lock);
	++sdev->stat.tree_extents_scrubbed;
	sdev->stat.tree_bytes_scrubbed += PAGE_SIZE;
		++sdev->stat.csum_errors;
		++sdev->stat.verify_errors;
	spin_unlock(&sdev->stat_lock);

	return fail || crc_fail;
static int scrub_checksum_super(struct scrub_bio *sbio, void *buffer)
	struct btrfs_super_block *s;
	struct scrub_dev *sdev = sbio->sdev;
	struct btrfs_root *root = sdev->dev->dev_root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	u8 csum[BTRFS_CSUM_SIZE];

	s = (struct btrfs_super_block *)buffer;
	logical = sbio->logical;

	if (logical != le64_to_cpu(s->bytenr))

	if (sbio->spag[0].generation != le64_to_cpu(s->generation))

	if (memcmp(s->fsid, fs_info->fsid, BTRFS_UUID_SIZE))

	crc = btrfs_csum_data(root, buffer + BTRFS_CSUM_SIZE, crc,
			      PAGE_SIZE - BTRFS_CSUM_SIZE);
	btrfs_csum_final(crc, csum);
	if (memcmp(csum, s->csum, sbio->sdev->csum_size))

		/*
		 * if we find an error in a super block, we just report it.
		 * They will get written with the next transaction commit
		 */
		spin_lock(&sdev->stat_lock);
		++sdev->stat.super_errors;
		spin_unlock(&sdev->stat_lock);

static int scrub_submit(struct scrub_dev *sdev)
	struct scrub_bio *sbio;

	if (sdev->curr == -1)

	sbio = sdev->bios[sdev->curr];
	atomic_inc(&sdev->in_flight);

	btrfsic_submit_bio(READ, sbio->bio);
static int scrub_page(struct scrub_dev *sdev, u64 logical, u64 len,
		      u64 physical, u64 flags, u64 gen, int mirror_num,
		      u8 *csum, int force)
	struct scrub_bio *sbio;

	/*
	 * grab a fresh bio or wait for one to become available
	 */
	while (sdev->curr == -1) {
		spin_lock(&sdev->list_lock);
		sdev->curr = sdev->first_free;
		if (sdev->curr != -1) {
			sdev->first_free = sdev->bios[sdev->curr]->next_free;
			sdev->bios[sdev->curr]->next_free = -1;
			sdev->bios[sdev->curr]->count = 0;
			spin_unlock(&sdev->list_lock);
		} else {
			spin_unlock(&sdev->list_lock);
			wait_event(sdev->list_wait, sdev->first_free != -1);

	sbio = sdev->bios[sdev->curr];
	if (sbio->count == 0) {
		sbio->physical = physical;
		sbio->logical = logical;
		bio = bio_alloc(GFP_NOFS, SCRUB_PAGES_PER_BIO);

		bio->bi_private = sbio;
		bio->bi_end_io = scrub_bio_end_io;
		bio->bi_bdev = sdev->dev->bdev;
		bio->bi_sector = sbio->physical >> 9;
	} else if (sbio->physical + sbio->count * PAGE_SIZE != physical ||
		   sbio->logical + sbio->count * PAGE_SIZE != logical) {
		ret = scrub_submit(sdev);

	sbio->spag[sbio->count].flags = flags;
	sbio->spag[sbio->count].generation = gen;
	sbio->spag[sbio->count].have_csum = 0;
	sbio->spag[sbio->count].mirror_num = mirror_num;

	page = alloc_page(GFP_NOFS);

	ret = bio_add_page(sbio->bio, page, PAGE_SIZE, 0);
		ret = scrub_submit(sdev);

		sbio->spag[sbio->count].have_csum = 1;
		memcpy(sbio->spag[sbio->count].csum, csum, sdev->csum_size);
	if (sbio->count == SCRUB_PAGES_PER_BIO || force) {
		ret = scrub_submit(sdev);
static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len,
			   u8 *csum)
	struct btrfs_ordered_sum *sum = NULL;
	unsigned long num_sectors;
	u32 sectorsize = sdev->dev->dev_root->sectorsize;

	while (!list_empty(&sdev->csum_list)) {
		sum = list_first_entry(&sdev->csum_list,
				       struct btrfs_ordered_sum, list);
		if (sum->bytenr > logical)
		if (sum->bytenr + sum->len > logical)

		++sdev->stat.csum_discards;
		list_del(&sum->list);

	num_sectors = sum->len / sectorsize;
	for (i = 0; i < num_sectors; ++i) {
		if (sum->sums[i].bytenr == logical) {
			memcpy(csum, &sum->sums[i].sum, sdev->csum_size);
	if (ret && i == num_sectors - 1) {
		list_del(&sum->list);

/* scrub extent tries to collect up to 64 kB for each bio */
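/*
 * Note (added): scrub_extent() below walks the extent in PAGE_SIZE steps;
 * for data extents it first tries to look up a checksum for each page via
 * scrub_find_csum() (counting pages without one in stat.no_csum) and then
 * hands every page to scrub_page(), which batches the pages into bios of
 * up to SCRUB_PAGES_PER_BIO pages, i.e. the 64 kB mentioned above.
 */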
static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len,
			u64 physical, u64 flags, u64 gen, int mirror_num)
	u8 csum[BTRFS_CSUM_SIZE];

		u64 l = min_t(u64, len, PAGE_SIZE);

		if (flags & BTRFS_EXTENT_FLAG_DATA) {
			/* push csums to sbio */
			have_csum = scrub_find_csum(sdev, logical, l, csum);
				++sdev->stat.no_csum;
		ret = scrub_page(sdev, logical, l, physical, flags, gen,
				 mirror_num, have_csum ? csum : NULL, 0);
static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
	struct map_lookup *map, int num, u64 base, u64 length)
	struct btrfs_path *path;
	struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info;
	struct btrfs_root *root = fs_info->extent_root;
	struct btrfs_root *csum_root = fs_info->csum_root;
	struct btrfs_extent_item *extent;
	struct blk_plug plug;
	struct extent_buffer *l;
	struct btrfs_key key;
	struct reada_control *reada1;
	struct reada_control *reada2;
	struct btrfs_key key_start;
	struct btrfs_key key_end;
	u64 increment = map->stripe_len;

	do_div(nstripes, map->stripe_len);
	if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
		offset = map->stripe_len * num;
		increment = map->stripe_len * map->num_stripes;
	} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
		int factor = map->num_stripes / map->sub_stripes;
		offset = map->stripe_len * (num / map->sub_stripes);
		increment = map->stripe_len * factor;
		mirror_num = num % map->sub_stripes + 1;
	} else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
		increment = map->stripe_len;
		mirror_num = num % map->num_stripes + 1;
	} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
		increment = map->stripe_len;
		mirror_num = num % map->num_stripes + 1;
	} else {
		increment = map->stripe_len;
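	/*
	 * Worked example (added): for RAID0 with num_stripes = 2 and, say,
	 * stripe_len = 64 KiB, the device holding stripe num = 1 starts at
	 * offset = 64 KiB into the chunk and advances by
	 * increment = 2 * 64 KiB = 128 KiB per iteration, i.e. it visits
	 * every second 64 KiB stripe of the logical chunk.  For RAID1 and
	 * DUP every stripe is visited (increment = stripe_len) and
	 * mirror_num selects which copy this device represents.
	 */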
	path = btrfs_alloc_path();

	path->search_commit_root = 1;
	path->skip_locking = 1;

	/*
	 * trigger the readahead for extent tree and csum tree and wait for
	 * completion. During readahead, the scrub is officially paused
	 * to not hold off transaction commits
	 */
	logical = base + offset;

	wait_event(sdev->list_wait,
		   atomic_read(&sdev->in_flight) == 0);
	atomic_inc(&fs_info->scrubs_paused);
	wake_up(&fs_info->scrub_pause_wait);

	/* FIXME it might be better to start readahead at commit root */
	key_start.objectid = logical;
	key_start.type = BTRFS_EXTENT_ITEM_KEY;
	key_start.offset = (u64)0;
	key_end.objectid = base + offset + nstripes * increment;
	key_end.type = BTRFS_EXTENT_ITEM_KEY;
	key_end.offset = (u64)0;
	reada1 = btrfs_reada_add(root, &key_start, &key_end);

	key_start.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
	key_start.type = BTRFS_EXTENT_CSUM_KEY;
	key_start.offset = logical;
	key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
	key_end.type = BTRFS_EXTENT_CSUM_KEY;
	key_end.offset = base + offset + nstripes * increment;
	reada2 = btrfs_reada_add(csum_root, &key_start, &key_end);

	if (!IS_ERR(reada1))
		btrfs_reada_wait(reada1);
	if (!IS_ERR(reada2))
		btrfs_reada_wait(reada2);

	mutex_lock(&fs_info->scrub_lock);
	while (atomic_read(&fs_info->scrub_pause_req)) {
		mutex_unlock(&fs_info->scrub_lock);
		wait_event(fs_info->scrub_pause_wait,
			   atomic_read(&fs_info->scrub_pause_req) == 0);
		mutex_lock(&fs_info->scrub_lock);
	atomic_dec(&fs_info->scrubs_paused);
	mutex_unlock(&fs_info->scrub_lock);
	wake_up(&fs_info->scrub_pause_wait);

	/*
	 * collect all data csums for the stripe to avoid seeking during
	 * the scrub. This might currently (crc32) end up to be about 1MB
	 */
	blk_start_plug(&plug);
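	/*
	 * Note (added): the block plug opened above stays active for the
	 * whole stripe loop below and is released by blk_finish_plug() at
	 * the end of the function, so the read bios submitted for one
	 * stripe can be merged and dispatched in batches.
	 */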
	/*
	 * now find all extents for each stripe and scrub them
	 */
	logical = base + offset;
	physical = map->stripes[num].physical;
	for (i = 0; i < nstripes; ++i) {
		if (atomic_read(&fs_info->scrub_cancel_req) ||
		    atomic_read(&sdev->cancel_req)) {
		/*
		 * check to see if we have to pause
		 */
		if (atomic_read(&fs_info->scrub_pause_req)) {
			/* push queued extents */
			wait_event(sdev->list_wait,
				   atomic_read(&sdev->in_flight) == 0);
			atomic_inc(&fs_info->scrubs_paused);
			wake_up(&fs_info->scrub_pause_wait);
			mutex_lock(&fs_info->scrub_lock);
			while (atomic_read(&fs_info->scrub_pause_req)) {
				mutex_unlock(&fs_info->scrub_lock);
				wait_event(fs_info->scrub_pause_wait,
					   atomic_read(&fs_info->scrub_pause_req) == 0);
				mutex_lock(&fs_info->scrub_lock);
			atomic_dec(&fs_info->scrubs_paused);
			mutex_unlock(&fs_info->scrub_lock);
			wake_up(&fs_info->scrub_pause_wait);

		ret = btrfs_lookup_csums_range(csum_root, logical,
					       logical + map->stripe_len - 1,
					       &sdev->csum_list, 1);

		key.objectid = logical;
		key.type = BTRFS_EXTENT_ITEM_KEY;
		key.offset = (u64)0;

		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
			ret = btrfs_previous_item(root, path, 0,
						  BTRFS_EXTENT_ITEM_KEY);
				/* there's no smaller item, so stick with the
				 * larger one */
				btrfs_release_path(path);
				ret = btrfs_search_slot(NULL, root, &key,
							path, 0, 0);

			slot = path->slots[0];
			if (slot >= btrfs_header_nritems(l)) {
				ret = btrfs_next_leaf(root, path);

			btrfs_item_key_to_cpu(l, &key, slot);

			if (key.objectid + key.offset <= logical)

			if (key.objectid >= logical + map->stripe_len)

			if (btrfs_key_type(&key) != BTRFS_EXTENT_ITEM_KEY)

			extent = btrfs_item_ptr(l, slot,
						struct btrfs_extent_item);
			flags = btrfs_extent_flags(l, extent);
			generation = btrfs_extent_generation(l, extent);

			if (key.objectid < logical &&
			    (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) {
				printk(KERN_ERR
				       "btrfs scrub: tree block %llu spanning "
				       "stripes, ignored. logical=%llu\n",
				       (unsigned long long)key.objectid,
				       (unsigned long long)logical);

			/*
			 * trim extent to this stripe
			 */
			if (key.objectid < logical) {
				key.offset -= logical - key.objectid;
				key.objectid = logical;
			if (key.objectid + key.offset >
			    logical + map->stripe_len) {
				key.offset = logical + map->stripe_len -
					     key.objectid;

			ret = scrub_extent(sdev, key.objectid, key.offset,
					   key.objectid - logical + physical,
					   flags, generation, mirror_num);

		btrfs_release_path(path);
		logical += increment;
		physical += map->stripe_len;
		spin_lock(&sdev->stat_lock);
		sdev->stat.last_physical = physical;
		spin_unlock(&sdev->stat_lock);

	/* push queued extents */

	blk_finish_plug(&plug);
	btrfs_free_path(path);
	return ret < 0 ? ret : 0;
static noinline_for_stack int scrub_chunk(struct scrub_dev *sdev,
	u64 chunk_tree, u64 chunk_objectid, u64 chunk_offset, u64 length,
	u64 dev_offset)
	struct btrfs_mapping_tree *map_tree =
		&sdev->dev->dev_root->fs_info->mapping_tree;
	struct map_lookup *map;
	struct extent_map *em;

	read_lock(&map_tree->map_tree.lock);
	em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
	read_unlock(&map_tree->map_tree.lock);

	map = (struct map_lookup *)em->bdev;
	if (em->start != chunk_offset)

	if (em->len < length)

	for (i = 0; i < map->num_stripes; ++i) {
		if (map->stripes[i].dev == sdev->dev &&
		    map->stripes[i].physical == dev_offset) {
			ret = scrub_stripe(sdev, map, i, chunk_offset, length);

	free_extent_map(em);
static noinline_for_stack
int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end)
	struct btrfs_dev_extent *dev_extent = NULL;
	struct btrfs_path *path;
	struct btrfs_root *root = sdev->dev->dev_root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *l;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct btrfs_block_group_cache *cache;

	path = btrfs_alloc_path();

	path->search_commit_root = 1;
	path->skip_locking = 1;

	key.objectid = sdev->dev->devid;
	key.type = BTRFS_DEV_EXTENT_KEY;

		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
			if (path->slots[0] >=
			    btrfs_header_nritems(path->nodes[0])) {
				ret = btrfs_next_leaf(root, path);

		slot = path->slots[0];

		btrfs_item_key_to_cpu(l, &found_key, slot);

		if (found_key.objectid != sdev->dev->devid)

		if (btrfs_key_type(&found_key) != BTRFS_DEV_EXTENT_KEY)

		if (found_key.offset >= end)

		if (found_key.offset < key.offset)

		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
		length = btrfs_dev_extent_length(l, dev_extent);

		if (found_key.offset + length <= start) {
			key.offset = found_key.offset + length;
			btrfs_release_path(path);

		chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
		chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
		chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);

		/*
		 * get a reference on the corresponding block group to prevent
		 * the chunk from going away while we scrub it
		 */
		cache = btrfs_lookup_block_group(fs_info, chunk_offset);
		ret = scrub_chunk(sdev, chunk_tree, chunk_objectid,
				  chunk_offset, length, found_key.offset);
		btrfs_put_block_group(cache);

		key.offset = found_key.offset + length;
		btrfs_release_path(path);

	btrfs_free_path(path);

	/*
	 * ret can still be 1 from search_slot or next_leaf,
	 * that's not an error
	 */
	return ret < 0 ? ret : 0;
static noinline_for_stack int scrub_supers(struct scrub_dev *sdev)
	struct btrfs_device *device = sdev->dev;
	struct btrfs_root *root = device->dev_root;

	gen = root->fs_info->last_trans_committed;

	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
		bytenr = btrfs_sb_offset(i);
		if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->total_bytes)

		ret = scrub_page(sdev, bytenr, PAGE_SIZE, bytenr,
				 BTRFS_EXTENT_FLAG_SUPER, gen, i, NULL, 1);
	wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0);

/*
 * get a reference count on fs_info->scrub_workers. start worker if necessary
 */
static noinline_for_stack int scrub_workers_get(struct btrfs_root *root)
	struct btrfs_fs_info *fs_info = root->fs_info;

	mutex_lock(&fs_info->scrub_lock);
	if (fs_info->scrub_workers_refcnt == 0) {
		btrfs_init_workers(&fs_info->scrub_workers, "scrub",
				   fs_info->thread_pool_size,
				   &fs_info->generic_worker);
		fs_info->scrub_workers.idle_thresh = 4;
		ret = btrfs_start_workers(&fs_info->scrub_workers);
	++fs_info->scrub_workers_refcnt;
	mutex_unlock(&fs_info->scrub_lock);

static noinline_for_stack void scrub_workers_put(struct btrfs_root *root)
	struct btrfs_fs_info *fs_info = root->fs_info;

	mutex_lock(&fs_info->scrub_lock);
	if (--fs_info->scrub_workers_refcnt == 0)
		btrfs_stop_workers(&fs_info->scrub_workers);
	WARN_ON(fs_info->scrub_workers_refcnt < 0);
	mutex_unlock(&fs_info->scrub_lock);
int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end,
		    struct btrfs_scrub_progress *progress, int readonly)
	struct scrub_dev *sdev;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_device *dev;

	if (btrfs_fs_closing(root->fs_info))

	/*
	 * check some assumptions
	 */
	if (root->sectorsize != PAGE_SIZE ||
	    root->sectorsize != root->leafsize ||
	    root->sectorsize != root->nodesize) {
		printk(KERN_ERR "btrfs_scrub: size assumptions fail\n");

	ret = scrub_workers_get(root);

	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
	dev = btrfs_find_device(root, devid, NULL, NULL);
	if (!dev || dev->missing) {
		mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
		scrub_workers_put(root);

	mutex_lock(&fs_info->scrub_lock);

	if (!dev->in_fs_metadata) {
		mutex_unlock(&fs_info->scrub_lock);
		mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
		scrub_workers_put(root);

	if (dev->scrub_device) {
		mutex_unlock(&fs_info->scrub_lock);
		mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
		scrub_workers_put(root);
		return -EINPROGRESS;
	sdev = scrub_setup_dev(dev);
		mutex_unlock(&fs_info->scrub_lock);
		mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
		scrub_workers_put(root);
		return PTR_ERR(sdev);
	sdev->readonly = readonly;
	dev->scrub_device = sdev;

	atomic_inc(&fs_info->scrubs_running);
	mutex_unlock(&fs_info->scrub_lock);
	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);

	down_read(&fs_info->scrub_super_lock);
	ret = scrub_supers(sdev);
	up_read(&fs_info->scrub_super_lock);

		ret = scrub_enumerate_chunks(sdev, start, end);

	wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0);
	atomic_dec(&fs_info->scrubs_running);
	wake_up(&fs_info->scrub_pause_wait);

	wait_event(sdev->list_wait, atomic_read(&sdev->fixup_cnt) == 0);

	memcpy(progress, &sdev->stat, sizeof(*progress));

	mutex_lock(&fs_info->scrub_lock);
	dev->scrub_device = NULL;
	mutex_unlock(&fs_info->scrub_lock);

	scrub_free_dev(sdev);
	scrub_workers_put(root);
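/*
 * Usage sketch (added, hedged): btrfs_scrub_dev() is the entry point a
 * caller such as the scrub ioctl path is expected to use.  The caller
 * passes the devid to scrub, a [start, end) byte range on that device,
 * a btrfs_scrub_progress struct that receives the final statistics, and
 * the readonly flag to suppress repairs, roughly:
 *
 *	struct btrfs_scrub_progress prog = { 0 };
 *	ret = btrfs_scrub_dev(root, devid, 0, (u64)-1, &prog, 0);
 *
 * The exact ioctl plumbing lives outside this file.
 */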
int btrfs_scrub_pause(struct btrfs_root *root)
	struct btrfs_fs_info *fs_info = root->fs_info;

	mutex_lock(&fs_info->scrub_lock);
	atomic_inc(&fs_info->scrub_pause_req);
	while (atomic_read(&fs_info->scrubs_paused) !=
	       atomic_read(&fs_info->scrubs_running)) {
		mutex_unlock(&fs_info->scrub_lock);
		wait_event(fs_info->scrub_pause_wait,
			   atomic_read(&fs_info->scrubs_paused) ==
			   atomic_read(&fs_info->scrubs_running));
		mutex_lock(&fs_info->scrub_lock);
	mutex_unlock(&fs_info->scrub_lock);

int btrfs_scrub_continue(struct btrfs_root *root)
	struct btrfs_fs_info *fs_info = root->fs_info;

	atomic_dec(&fs_info->scrub_pause_req);
	wake_up(&fs_info->scrub_pause_wait);

int btrfs_scrub_pause_super(struct btrfs_root *root)
	down_write(&root->fs_info->scrub_super_lock);

int btrfs_scrub_continue_super(struct btrfs_root *root)
	up_write(&root->fs_info->scrub_super_lock);
int btrfs_scrub_cancel(struct btrfs_root *root)
	struct btrfs_fs_info *fs_info = root->fs_info;

	mutex_lock(&fs_info->scrub_lock);
	if (!atomic_read(&fs_info->scrubs_running)) {
		mutex_unlock(&fs_info->scrub_lock);

	atomic_inc(&fs_info->scrub_cancel_req);
	while (atomic_read(&fs_info->scrubs_running)) {
		mutex_unlock(&fs_info->scrub_lock);
		wait_event(fs_info->scrub_pause_wait,
			   atomic_read(&fs_info->scrubs_running) == 0);
		mutex_lock(&fs_info->scrub_lock);
	atomic_dec(&fs_info->scrub_cancel_req);
	mutex_unlock(&fs_info->scrub_lock);

int btrfs_scrub_cancel_dev(struct btrfs_root *root, struct btrfs_device *dev)
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct scrub_dev *sdev;

	mutex_lock(&fs_info->scrub_lock);
	sdev = dev->scrub_device;
		mutex_unlock(&fs_info->scrub_lock);

	atomic_inc(&sdev->cancel_req);
	while (dev->scrub_device) {
		mutex_unlock(&fs_info->scrub_lock);
		wait_event(fs_info->scrub_pause_wait,
			   dev->scrub_device == NULL);
		mutex_lock(&fs_info->scrub_lock);
	mutex_unlock(&fs_info->scrub_lock);
int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid)
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_device *dev;

	/*
	 * we have to hold the device_list_mutex here so the device
	 * does not go away in cancel_dev. FIXME: find a better solution
	 */
	mutex_lock(&fs_info->fs_devices->device_list_mutex);
	dev = btrfs_find_device(root, devid, NULL, NULL);
		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
	ret = btrfs_scrub_cancel_dev(root, dev);
	mutex_unlock(&fs_info->fs_devices->device_list_mutex);

int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
			 struct btrfs_scrub_progress *progress)
	struct btrfs_device *dev;
	struct scrub_dev *sdev = NULL;

	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
	dev = btrfs_find_device(root, devid, NULL, NULL);
		sdev = dev->scrub_device;
		memcpy(progress, &sdev->stat, sizeof(*progress));
	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);

	return dev ? (sdev ? 0 : -ENOTCONN) : -ENODEV;