1 // SPDX-License-Identifier: GPL-2.0
3 * Copyright (C) 2012 Fusion-io All rights reserved.
4 * Copyright (C) 2012 Intel Corp. All rights reserved.
7 #include <linux/sched.h>
9 #include <linux/slab.h>
10 #include <linux/blkdev.h>
11 #include <linux/raid/pq.h>
12 #include <linux/hash.h>
13 #include <linux/list_sort.h>
14 #include <linux/raid/xor.h>
20 #include "async-thread.h"
/*
 * Bit numbers used in the rbio flags word (tested and changed with
 * test_bit/set_bit/clear_bit in this file).
 */

/* set when additional merges to this rbio are not allowed */
#define RBIO_RMW_LOCKED_BIT	1

/*
 * set when this rbio is sitting in the hash, but it is just a cache
 */
#define RBIO_CACHE_BIT		2

/*
 * set when it is safe to trust the stripe_pages for caching
 */
#define RBIO_CACHE_READY_BIT	3

/* once the stripe cache holds more rbios than this, cache_rbio() prunes */
#define RBIO_CACHE_SIZE 1024
40 BTRFS_RBIO_READ_REBUILD
,
41 BTRFS_RBIO_PARITY_SCRUB
,
42 BTRFS_RBIO_REBUILD_MISSING
,
45 struct btrfs_raid_bio
{
46 struct btrfs_fs_info
*fs_info
;
47 struct btrfs_bio
*bbio
;
49 /* while we're doing rmw on a stripe
50 * we put it into a hash table so we can
51 * lock the stripe and merge more rbios
54 struct list_head hash_list
;
57 * LRU list for the stripe cache
59 struct list_head stripe_cache
;
62 * for scheduling work in the helper threads
64 struct btrfs_work work
;
67 * bio list and bio_list_lock are used
68 * to add more bios into the stripe
69 * in hopes of avoiding the full rmw
71 struct bio_list bio_list
;
72 spinlock_t bio_list_lock
;
74 /* also protected by the bio_list_lock, the
75 * plug list is used by the plugging code
76 * to collect partial bios while plugged. The
77 * stripe locking code also uses it to hand off
78 * the stripe lock to the next pending IO
80 struct list_head plug_list
;
83 * flags that tell us if it is safe to
88 /* size of each individual stripe on disk */
91 /* number of data stripes (no p/q) */
98 * set if we're doing a parity rebuild
99 * for a read from higher up, which is handled
100 * differently from a parity rebuild as part of
103 enum btrfs_rbio_ops operation
;
105 /* first bad stripe */
108 /* second bad stripe (for raid6 use) */
113 * number of pages needed to represent the full
119 * size of all the bios in the bio_list. This
120 * helps us decide if the rbio maps to a full
129 atomic_t stripes_pending
;
133 * these are two arrays of pointers. We allocate the
134 * rbio big enough to hold them both and setup their
135 * locations when the rbio is allocated
138 /* pointers to pages that we allocated for
139 * reading/writing stripes directly from the disk (including P/Q)
141 struct page
**stripe_pages
;
144 * pointers to the pages in the bio_list. Stored
145 * here for faster lookup
147 struct page
**bio_pages
;
150 * bitmap to record which horizontal stripe has data
152 unsigned long *dbitmap
;
154 /* allocated with real_stripes-many pointers for finish_*() calls */
155 void **finish_pointers
;
157 /* allocated with stripe_npages-many bits for finish_*() calls */
158 unsigned long *finish_pbitmap
;
161 static int __raid56_parity_recover(struct btrfs_raid_bio
*rbio
);
162 static noinline
void finish_rmw(struct btrfs_raid_bio
*rbio
);
163 static void rmw_work(struct btrfs_work
*work
);
164 static void read_rebuild_work(struct btrfs_work
*work
);
165 static int fail_bio_stripe(struct btrfs_raid_bio
*rbio
, struct bio
*bio
);
166 static int fail_rbio_index(struct btrfs_raid_bio
*rbio
, int failed
);
167 static void __free_raid_bio(struct btrfs_raid_bio
*rbio
);
168 static void index_rbio_pages(struct btrfs_raid_bio
*rbio
);
169 static int alloc_rbio_pages(struct btrfs_raid_bio
*rbio
);
171 static noinline
void finish_parity_scrub(struct btrfs_raid_bio
*rbio
,
173 static void scrub_parity_work(struct btrfs_work
*work
);
175 static void start_async_work(struct btrfs_raid_bio
*rbio
, btrfs_func_t work_func
)
177 btrfs_init_work(&rbio
->work
, btrfs_rmw_helper
, work_func
, NULL
, NULL
);
178 btrfs_queue_work(rbio
->fs_info
->rmw_workers
, &rbio
->work
);
182 * the stripe hash table is used for locking, and to collect
183 * bios in hopes of making a full stripe
185 int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info
*info
)
187 struct btrfs_stripe_hash_table
*table
;
188 struct btrfs_stripe_hash_table
*x
;
189 struct btrfs_stripe_hash
*cur
;
190 struct btrfs_stripe_hash
*h
;
191 int num_entries
= 1 << BTRFS_STRIPE_HASH_TABLE_BITS
;
195 if (info
->stripe_hash_table
)
199 * The table is large, starting with order 4 and can go as high as
200 * order 7 in case lock debugging is turned on.
202 * Try harder to allocate and fallback to vmalloc to lower the chance
203 * of a failing mount.
205 table_size
= sizeof(*table
) + sizeof(*h
) * num_entries
;
206 table
= kvzalloc(table_size
, GFP_KERNEL
);
210 spin_lock_init(&table
->cache_lock
);
211 INIT_LIST_HEAD(&table
->stripe_cache
);
215 for (i
= 0; i
< num_entries
; i
++) {
217 INIT_LIST_HEAD(&cur
->hash_list
);
218 spin_lock_init(&cur
->lock
);
221 x
= cmpxchg(&info
->stripe_hash_table
, NULL
, table
);
228 * caching an rbio means to copy anything from the
229 * bio_pages array into the stripe_pages array. We
230 * use the page uptodate bit in the stripe cache array
231 * to indicate if it has valid data
233 * once the caching is done, we set the cache ready
236 static void cache_rbio_pages(struct btrfs_raid_bio
*rbio
)
243 ret
= alloc_rbio_pages(rbio
);
247 for (i
= 0; i
< rbio
->nr_pages
; i
++) {
248 if (!rbio
->bio_pages
[i
])
251 s
= kmap(rbio
->bio_pages
[i
]);
252 d
= kmap(rbio
->stripe_pages
[i
]);
256 kunmap(rbio
->bio_pages
[i
]);
257 kunmap(rbio
->stripe_pages
[i
]);
258 SetPageUptodate(rbio
->stripe_pages
[i
]);
260 set_bit(RBIO_CACHE_READY_BIT
, &rbio
->flags
);
264 * we hash on the first logical address of the stripe
266 static int rbio_bucket(struct btrfs_raid_bio
*rbio
)
268 u64 num
= rbio
->bbio
->raid_map
[0];
271 * we shift down quite a bit. We're using byte
272 * addressing, and most of the lower bits are zeros.
273 * This tends to upset hash_64, and it consistently
274 * returns just one or two different values.
276 * shifting off the lower bits fixes things.
278 return hash_64(num
>> 16, BTRFS_STRIPE_HASH_TABLE_BITS
);
282 * stealing an rbio means taking all the uptodate pages from the stripe
283 * array in the source rbio and putting them into the destination rbio
285 static void steal_rbio(struct btrfs_raid_bio
*src
, struct btrfs_raid_bio
*dest
)
291 if (!test_bit(RBIO_CACHE_READY_BIT
, &src
->flags
))
294 for (i
= 0; i
< dest
->nr_pages
; i
++) {
295 s
= src
->stripe_pages
[i
];
296 if (!s
|| !PageUptodate(s
)) {
300 d
= dest
->stripe_pages
[i
];
304 dest
->stripe_pages
[i
] = s
;
305 src
->stripe_pages
[i
] = NULL
;
310 * merging means we take the bio_list from the victim and
311 * splice it into the destination. The victim should
312 * be discarded afterwards.
314 * must be called with dest->rbio_list_lock held
316 static void merge_rbio(struct btrfs_raid_bio
*dest
,
317 struct btrfs_raid_bio
*victim
)
319 bio_list_merge(&dest
->bio_list
, &victim
->bio_list
);
320 dest
->bio_list_bytes
+= victim
->bio_list_bytes
;
321 dest
->generic_bio_cnt
+= victim
->generic_bio_cnt
;
322 bio_list_init(&victim
->bio_list
);
326 * used to prune items that are in the cache. The caller
327 * must hold the hash table lock.
329 static void __remove_rbio_from_cache(struct btrfs_raid_bio
*rbio
)
331 int bucket
= rbio_bucket(rbio
);
332 struct btrfs_stripe_hash_table
*table
;
333 struct btrfs_stripe_hash
*h
;
337 * check the bit again under the hash table lock.
339 if (!test_bit(RBIO_CACHE_BIT
, &rbio
->flags
))
342 table
= rbio
->fs_info
->stripe_hash_table
;
343 h
= table
->table
+ bucket
;
345 /* hold the lock for the bucket because we may be
346 * removing it from the hash table
351 * hold the lock for the bio list because we need
352 * to make sure the bio list is empty
354 spin_lock(&rbio
->bio_list_lock
);
356 if (test_and_clear_bit(RBIO_CACHE_BIT
, &rbio
->flags
)) {
357 list_del_init(&rbio
->stripe_cache
);
358 table
->cache_size
-= 1;
361 /* if the bio list isn't empty, this rbio is
362 * still involved in an IO. We take it out
363 * of the cache list, and drop the ref that
364 * was held for the list.
366 * If the bio_list was empty, we also remove
367 * the rbio from the hash_table, and drop
368 * the corresponding ref
370 if (bio_list_empty(&rbio
->bio_list
)) {
371 if (!list_empty(&rbio
->hash_list
)) {
372 list_del_init(&rbio
->hash_list
);
373 refcount_dec(&rbio
->refs
);
374 BUG_ON(!list_empty(&rbio
->plug_list
));
379 spin_unlock(&rbio
->bio_list_lock
);
380 spin_unlock(&h
->lock
);
383 __free_raid_bio(rbio
);
387 * prune a given rbio from the cache
389 static void remove_rbio_from_cache(struct btrfs_raid_bio
*rbio
)
391 struct btrfs_stripe_hash_table
*table
;
394 if (!test_bit(RBIO_CACHE_BIT
, &rbio
->flags
))
397 table
= rbio
->fs_info
->stripe_hash_table
;
399 spin_lock_irqsave(&table
->cache_lock
, flags
);
400 __remove_rbio_from_cache(rbio
);
401 spin_unlock_irqrestore(&table
->cache_lock
, flags
);
405 * remove everything in the cache
407 static void btrfs_clear_rbio_cache(struct btrfs_fs_info
*info
)
409 struct btrfs_stripe_hash_table
*table
;
411 struct btrfs_raid_bio
*rbio
;
413 table
= info
->stripe_hash_table
;
415 spin_lock_irqsave(&table
->cache_lock
, flags
);
416 while (!list_empty(&table
->stripe_cache
)) {
417 rbio
= list_entry(table
->stripe_cache
.next
,
418 struct btrfs_raid_bio
,
420 __remove_rbio_from_cache(rbio
);
422 spin_unlock_irqrestore(&table
->cache_lock
, flags
);
426 * remove all cached entries and free the hash table
429 void btrfs_free_stripe_hash_table(struct btrfs_fs_info
*info
)
431 if (!info
->stripe_hash_table
)
433 btrfs_clear_rbio_cache(info
);
434 kvfree(info
->stripe_hash_table
);
435 info
->stripe_hash_table
= NULL
;
439 * insert an rbio into the stripe cache. It
440 * must have already been prepared by calling
443 * If this rbio was already cached, it gets
444 * moved to the front of the lru.
446 * If the size of the rbio cache is too big, we
449 static void cache_rbio(struct btrfs_raid_bio
*rbio
)
451 struct btrfs_stripe_hash_table
*table
;
454 if (!test_bit(RBIO_CACHE_READY_BIT
, &rbio
->flags
))
457 table
= rbio
->fs_info
->stripe_hash_table
;
459 spin_lock_irqsave(&table
->cache_lock
, flags
);
460 spin_lock(&rbio
->bio_list_lock
);
462 /* bump our ref if we were not in the list before */
463 if (!test_and_set_bit(RBIO_CACHE_BIT
, &rbio
->flags
))
464 refcount_inc(&rbio
->refs
);
466 if (!list_empty(&rbio
->stripe_cache
)){
467 list_move(&rbio
->stripe_cache
, &table
->stripe_cache
);
469 list_add(&rbio
->stripe_cache
, &table
->stripe_cache
);
470 table
->cache_size
+= 1;
473 spin_unlock(&rbio
->bio_list_lock
);
475 if (table
->cache_size
> RBIO_CACHE_SIZE
) {
476 struct btrfs_raid_bio
*found
;
478 found
= list_entry(table
->stripe_cache
.prev
,
479 struct btrfs_raid_bio
,
483 __remove_rbio_from_cache(found
);
486 spin_unlock_irqrestore(&table
->cache_lock
, flags
);
490 * helper function to run the xor_blocks api. It is only
491 * able to do MAX_XOR_BLOCKS at a time, so we need to
494 static void run_xor(void **pages
, int src_cnt
, ssize_t len
)
498 void *dest
= pages
[src_cnt
];
501 xor_src_cnt
= min(src_cnt
, MAX_XOR_BLOCKS
);
502 xor_blocks(xor_src_cnt
, len
, dest
, pages
+ src_off
);
504 src_cnt
-= xor_src_cnt
;
505 src_off
+= xor_src_cnt
;
510 * Returns true if the bio list inside this rbio covers an entire stripe (no
513 static int rbio_is_full(struct btrfs_raid_bio
*rbio
)
516 unsigned long size
= rbio
->bio_list_bytes
;
519 spin_lock_irqsave(&rbio
->bio_list_lock
, flags
);
520 if (size
!= rbio
->nr_data
* rbio
->stripe_len
)
522 BUG_ON(size
> rbio
->nr_data
* rbio
->stripe_len
);
523 spin_unlock_irqrestore(&rbio
->bio_list_lock
, flags
);
529 * returns 1 if it is safe to merge two rbios together.
530 * The merging is safe if the two rbios correspond to
531 * the same stripe and if they are both going in the same
532 * direction (read vs write), and if neither one is
533 * locked for final IO
535 * The caller is responsible for locking such that
536 * rmw_locked is safe to test
538 static int rbio_can_merge(struct btrfs_raid_bio
*last
,
539 struct btrfs_raid_bio
*cur
)
541 if (test_bit(RBIO_RMW_LOCKED_BIT
, &last
->flags
) ||
542 test_bit(RBIO_RMW_LOCKED_BIT
, &cur
->flags
))
546 * we can't merge with cached rbios, since the
547 * idea is that when we merge the destination
548 * rbio is going to run our IO for us. We can
549 * steal from cached rbios though, other functions
552 if (test_bit(RBIO_CACHE_BIT
, &last
->flags
) ||
553 test_bit(RBIO_CACHE_BIT
, &cur
->flags
))
556 if (last
->bbio
->raid_map
[0] !=
557 cur
->bbio
->raid_map
[0])
560 /* we can't merge with different operations */
561 if (last
->operation
!= cur
->operation
)
/*
 * We need to read the full stripe from the drive, then
 * check and repair the parity and write the new results.
 *
 * We're not allowed to add any new bios to the
 * bio list here, anyone else that wants to
 * change this stripe needs to do their own rmw.
 */
571 if (last
->operation
== BTRFS_RBIO_PARITY_SCRUB
)
574 if (last
->operation
== BTRFS_RBIO_REBUILD_MISSING
)
577 if (last
->operation
== BTRFS_RBIO_READ_REBUILD
) {
578 int fa
= last
->faila
;
579 int fb
= last
->failb
;
580 int cur_fa
= cur
->faila
;
581 int cur_fb
= cur
->failb
;
583 if (last
->faila
>= last
->failb
) {
588 if (cur
->faila
>= cur
->failb
) {
593 if (fa
!= cur_fa
|| fb
!= cur_fb
)
599 static int rbio_stripe_page_index(struct btrfs_raid_bio
*rbio
, int stripe
,
602 return stripe
* rbio
->stripe_npages
+ index
;
606 * these are just the pages from the rbio array, not from anything
607 * the FS sent down to us
609 static struct page
*rbio_stripe_page(struct btrfs_raid_bio
*rbio
, int stripe
,
612 return rbio
->stripe_pages
[rbio_stripe_page_index(rbio
, stripe
, index
)];
616 * helper to index into the pstripe
618 static struct page
*rbio_pstripe_page(struct btrfs_raid_bio
*rbio
, int index
)
620 return rbio_stripe_page(rbio
, rbio
->nr_data
, index
);
624 * helper to index into the qstripe, returns null
625 * if there is no qstripe
627 static struct page
*rbio_qstripe_page(struct btrfs_raid_bio
*rbio
, int index
)
629 if (rbio
->nr_data
+ 1 == rbio
->real_stripes
)
631 return rbio_stripe_page(rbio
, rbio
->nr_data
+ 1, index
);
635 * The first stripe in the table for a logical address
636 * has the lock. rbios are added in one of three ways:
638 * 1) Nobody has the stripe locked yet. The rbio is given
639 * the lock and 0 is returned. The caller must start the IO
642 * 2) Someone has the stripe locked, but we're able to merge
643 * with the lock owner. The rbio is freed and the IO will
644 * start automatically along with the existing rbio. 1 is returned.
646 * 3) Someone has the stripe locked, but we're not able to merge.
647 * The rbio is added to the lock owner's plug list, or merged into
648 * an rbio already on the plug list. When the lock owner unlocks,
649 * the next rbio on the list is run and the IO is started automatically.
652 * If we return 0, the caller still owns the rbio and must continue with
653 * IO submission. If we return 1, the caller must assume the rbio has
654 * already been freed.
656 static noinline
int lock_stripe_add(struct btrfs_raid_bio
*rbio
)
658 int bucket
= rbio_bucket(rbio
);
659 struct btrfs_stripe_hash
*h
= rbio
->fs_info
->stripe_hash_table
->table
+ bucket
;
660 struct btrfs_raid_bio
*cur
;
661 struct btrfs_raid_bio
*pending
;
663 struct btrfs_raid_bio
*freeit
= NULL
;
664 struct btrfs_raid_bio
*cache_drop
= NULL
;
667 spin_lock_irqsave(&h
->lock
, flags
);
668 list_for_each_entry(cur
, &h
->hash_list
, hash_list
) {
669 if (cur
->bbio
->raid_map
[0] == rbio
->bbio
->raid_map
[0]) {
670 spin_lock(&cur
->bio_list_lock
);
672 /* can we steal this cached rbio's pages? */
673 if (bio_list_empty(&cur
->bio_list
) &&
674 list_empty(&cur
->plug_list
) &&
675 test_bit(RBIO_CACHE_BIT
, &cur
->flags
) &&
676 !test_bit(RBIO_RMW_LOCKED_BIT
, &cur
->flags
)) {
677 list_del_init(&cur
->hash_list
);
678 refcount_dec(&cur
->refs
);
680 steal_rbio(cur
, rbio
);
682 spin_unlock(&cur
->bio_list_lock
);
687 /* can we merge into the lock owner? */
688 if (rbio_can_merge(cur
, rbio
)) {
689 merge_rbio(cur
, rbio
);
690 spin_unlock(&cur
->bio_list_lock
);
698 * we couldn't merge with the running
699 * rbio, see if we can merge with the
700 * pending ones. We don't have to
701 * check for rmw_locked because there
702 * is no way they are inside finish_rmw
705 list_for_each_entry(pending
, &cur
->plug_list
,
707 if (rbio_can_merge(pending
, rbio
)) {
708 merge_rbio(pending
, rbio
);
709 spin_unlock(&cur
->bio_list_lock
);
/* no merging, put us on the tail of the plug list,
 * our rbio will be started when the currently
 * running rbio unlocks
 */
720 list_add_tail(&rbio
->plug_list
, &cur
->plug_list
);
721 spin_unlock(&cur
->bio_list_lock
);
727 refcount_inc(&rbio
->refs
);
728 list_add(&rbio
->hash_list
, &h
->hash_list
);
730 spin_unlock_irqrestore(&h
->lock
, flags
);
732 remove_rbio_from_cache(cache_drop
);
734 __free_raid_bio(freeit
);
739 * called as rmw or parity rebuild is completed. If the plug list has more
740 * rbios waiting for this stripe, the next one on the list will be started
742 static noinline
void unlock_stripe(struct btrfs_raid_bio
*rbio
)
745 struct btrfs_stripe_hash
*h
;
749 bucket
= rbio_bucket(rbio
);
750 h
= rbio
->fs_info
->stripe_hash_table
->table
+ bucket
;
752 if (list_empty(&rbio
->plug_list
))
755 spin_lock_irqsave(&h
->lock
, flags
);
756 spin_lock(&rbio
->bio_list_lock
);
758 if (!list_empty(&rbio
->hash_list
)) {
760 * if we're still cached and there is no other IO
761 * to perform, just leave this rbio here for others
762 * to steal from later
764 if (list_empty(&rbio
->plug_list
) &&
765 test_bit(RBIO_CACHE_BIT
, &rbio
->flags
)) {
767 clear_bit(RBIO_RMW_LOCKED_BIT
, &rbio
->flags
);
768 BUG_ON(!bio_list_empty(&rbio
->bio_list
));
772 list_del_init(&rbio
->hash_list
);
773 refcount_dec(&rbio
->refs
);
776 * we use the plug list to hold all the rbios
777 * waiting for the chance to lock this stripe.
778 * hand the lock over to one of them.
780 if (!list_empty(&rbio
->plug_list
)) {
781 struct btrfs_raid_bio
*next
;
782 struct list_head
*head
= rbio
->plug_list
.next
;
784 next
= list_entry(head
, struct btrfs_raid_bio
,
787 list_del_init(&rbio
->plug_list
);
789 list_add(&next
->hash_list
, &h
->hash_list
);
790 refcount_inc(&next
->refs
);
791 spin_unlock(&rbio
->bio_list_lock
);
792 spin_unlock_irqrestore(&h
->lock
, flags
);
794 if (next
->operation
== BTRFS_RBIO_READ_REBUILD
)
795 start_async_work(next
, read_rebuild_work
);
796 else if (next
->operation
== BTRFS_RBIO_REBUILD_MISSING
) {
797 steal_rbio(rbio
, next
);
798 start_async_work(next
, read_rebuild_work
);
799 } else if (next
->operation
== BTRFS_RBIO_WRITE
) {
800 steal_rbio(rbio
, next
);
801 start_async_work(next
, rmw_work
);
802 } else if (next
->operation
== BTRFS_RBIO_PARITY_SCRUB
) {
803 steal_rbio(rbio
, next
);
804 start_async_work(next
, scrub_parity_work
);
811 spin_unlock(&rbio
->bio_list_lock
);
812 spin_unlock_irqrestore(&h
->lock
, flags
);
816 remove_rbio_from_cache(rbio
);
819 static void __free_raid_bio(struct btrfs_raid_bio
*rbio
)
823 if (!refcount_dec_and_test(&rbio
->refs
))
826 WARN_ON(!list_empty(&rbio
->stripe_cache
));
827 WARN_ON(!list_empty(&rbio
->hash_list
));
828 WARN_ON(!bio_list_empty(&rbio
->bio_list
));
830 for (i
= 0; i
< rbio
->nr_pages
; i
++) {
831 if (rbio
->stripe_pages
[i
]) {
832 __free_page(rbio
->stripe_pages
[i
]);
833 rbio
->stripe_pages
[i
] = NULL
;
837 btrfs_put_bbio(rbio
->bbio
);
841 static void rbio_endio_bio_list(struct bio
*cur
, blk_status_t err
)
848 cur
->bi_status
= err
;
855 * this frees the rbio and runs through all the bios in the
856 * bio_list and calls end_io on them
858 static void rbio_orig_end_io(struct btrfs_raid_bio
*rbio
, blk_status_t err
)
860 struct bio
*cur
= bio_list_get(&rbio
->bio_list
);
863 if (rbio
->generic_bio_cnt
)
864 btrfs_bio_counter_sub(rbio
->fs_info
, rbio
->generic_bio_cnt
);
867 * At this moment, rbio->bio_list is empty, however since rbio does not
868 * always have RBIO_RMW_LOCKED_BIT set and rbio is still linked on the
869 * hash list, rbio may be merged with others so that rbio->bio_list
871 * Once unlock_stripe() is done, rbio->bio_list will not be updated any
872 * more and we can call bio_endio() on all queued bios.
875 extra
= bio_list_get(&rbio
->bio_list
);
876 __free_raid_bio(rbio
);
878 rbio_endio_bio_list(cur
, err
);
880 rbio_endio_bio_list(extra
, err
);
884 * end io function used by finish_rmw. When we finally
885 * get here, we've written a full stripe
887 static void raid_write_end_io(struct bio
*bio
)
889 struct btrfs_raid_bio
*rbio
= bio
->bi_private
;
890 blk_status_t err
= bio
->bi_status
;
894 fail_bio_stripe(rbio
, bio
);
898 if (!atomic_dec_and_test(&rbio
->stripes_pending
))
903 /* OK, we have read all the stripes we need to. */
904 max_errors
= (rbio
->operation
== BTRFS_RBIO_PARITY_SCRUB
) ?
905 0 : rbio
->bbio
->max_errors
;
906 if (atomic_read(&rbio
->error
) > max_errors
)
909 rbio_orig_end_io(rbio
, err
);
913 * the read/modify/write code wants to use the original bio for
914 * any pages it included, and then use the rbio for everything
915 * else. This function decides if a given index (stripe number)
916 * and page number in that stripe fall inside the original bio
919 * if you set bio_list_only, you'll get a NULL back for any ranges
920 * that are outside the bio_list
922 * This doesn't take any refs on anything, you get a bare page pointer
923 * and the caller must bump refs as required.
925 * You must call index_rbio_pages once before you can trust
926 * the answers from this function.
928 static struct page
*page_in_rbio(struct btrfs_raid_bio
*rbio
,
929 int index
, int pagenr
, int bio_list_only
)
932 struct page
*p
= NULL
;
934 chunk_page
= index
* (rbio
->stripe_len
>> PAGE_SHIFT
) + pagenr
;
936 spin_lock_irq(&rbio
->bio_list_lock
);
937 p
= rbio
->bio_pages
[chunk_page
];
938 spin_unlock_irq(&rbio
->bio_list_lock
);
940 if (p
|| bio_list_only
)
943 return rbio
->stripe_pages
[chunk_page
];
947 * number of pages we need for the entire stripe across all the
950 static unsigned long rbio_nr_pages(unsigned long stripe_len
, int nr_stripes
)
952 return DIV_ROUND_UP(stripe_len
, PAGE_SIZE
) * nr_stripes
;
/*
 * allocation and initial setup for the btrfs_raid_bio.  Note
 * this does not allocate any pages for rbio->stripe_pages.
 */
959 static struct btrfs_raid_bio
*alloc_rbio(struct btrfs_fs_info
*fs_info
,
960 struct btrfs_bio
*bbio
,
963 struct btrfs_raid_bio
*rbio
;
965 int real_stripes
= bbio
->num_stripes
- bbio
->num_tgtdevs
;
966 int num_pages
= rbio_nr_pages(stripe_len
, real_stripes
);
967 int stripe_npages
= DIV_ROUND_UP(stripe_len
, PAGE_SIZE
);
970 rbio
= kzalloc(sizeof(*rbio
) +
971 sizeof(*rbio
->stripe_pages
) * num_pages
+
972 sizeof(*rbio
->bio_pages
) * num_pages
+
973 sizeof(*rbio
->finish_pointers
) * real_stripes
+
974 sizeof(*rbio
->dbitmap
) * BITS_TO_LONGS(stripe_npages
) +
975 sizeof(*rbio
->finish_pbitmap
) *
976 BITS_TO_LONGS(stripe_npages
),
979 return ERR_PTR(-ENOMEM
);
981 bio_list_init(&rbio
->bio_list
);
982 INIT_LIST_HEAD(&rbio
->plug_list
);
983 spin_lock_init(&rbio
->bio_list_lock
);
984 INIT_LIST_HEAD(&rbio
->stripe_cache
);
985 INIT_LIST_HEAD(&rbio
->hash_list
);
987 rbio
->fs_info
= fs_info
;
988 rbio
->stripe_len
= stripe_len
;
989 rbio
->nr_pages
= num_pages
;
990 rbio
->real_stripes
= real_stripes
;
991 rbio
->stripe_npages
= stripe_npages
;
994 refcount_set(&rbio
->refs
, 1);
995 atomic_set(&rbio
->error
, 0);
996 atomic_set(&rbio
->stripes_pending
, 0);
999 * the stripe_pages, bio_pages, etc arrays point to the extra
1000 * memory we allocated past the end of the rbio
1003 #define CONSUME_ALLOC(ptr, count) do { \
1005 p = (unsigned char *)p + sizeof(*(ptr)) * (count); \
1007 CONSUME_ALLOC(rbio
->stripe_pages
, num_pages
);
1008 CONSUME_ALLOC(rbio
->bio_pages
, num_pages
);
1009 CONSUME_ALLOC(rbio
->finish_pointers
, real_stripes
);
1010 CONSUME_ALLOC(rbio
->dbitmap
, BITS_TO_LONGS(stripe_npages
));
1011 CONSUME_ALLOC(rbio
->finish_pbitmap
, BITS_TO_LONGS(stripe_npages
));
1012 #undef CONSUME_ALLOC
1014 if (bbio
->map_type
& BTRFS_BLOCK_GROUP_RAID5
)
1015 nr_data
= real_stripes
- 1;
1016 else if (bbio
->map_type
& BTRFS_BLOCK_GROUP_RAID6
)
1017 nr_data
= real_stripes
- 2;
1021 rbio
->nr_data
= nr_data
;
1025 /* allocate pages for all the stripes in the bio, including parity */
1026 static int alloc_rbio_pages(struct btrfs_raid_bio
*rbio
)
1031 for (i
= 0; i
< rbio
->nr_pages
; i
++) {
1032 if (rbio
->stripe_pages
[i
])
1034 page
= alloc_page(GFP_NOFS
| __GFP_HIGHMEM
);
1037 rbio
->stripe_pages
[i
] = page
;
1042 /* only allocate pages for p/q stripes */
1043 static int alloc_rbio_parity_pages(struct btrfs_raid_bio
*rbio
)
1048 i
= rbio_stripe_page_index(rbio
, rbio
->nr_data
, 0);
1050 for (; i
< rbio
->nr_pages
; i
++) {
1051 if (rbio
->stripe_pages
[i
])
1053 page
= alloc_page(GFP_NOFS
| __GFP_HIGHMEM
);
1056 rbio
->stripe_pages
[i
] = page
;
1062 * add a single page from a specific stripe into our list of bios for IO
1063 * this will try to merge into existing bios if possible, and returns
1064 * zero if all went well.
1066 static int rbio_add_io_page(struct btrfs_raid_bio
*rbio
,
1067 struct bio_list
*bio_list
,
1070 unsigned long page_index
,
1071 unsigned long bio_max_len
)
1073 struct bio
*last
= bio_list
->tail
;
1077 struct btrfs_bio_stripe
*stripe
;
1080 stripe
= &rbio
->bbio
->stripes
[stripe_nr
];
1081 disk_start
= stripe
->physical
+ (page_index
<< PAGE_SHIFT
);
1083 /* if the device is missing, just fail this stripe */
1084 if (!stripe
->dev
->bdev
)
1085 return fail_rbio_index(rbio
, stripe_nr
);
1087 /* see if we can add this page onto our existing bio */
1089 last_end
= (u64
)last
->bi_iter
.bi_sector
<< 9;
1090 last_end
+= last
->bi_iter
.bi_size
;
1093 * we can't merge these if they are from different
1094 * devices or if they are not contiguous
1096 if (last_end
== disk_start
&& stripe
->dev
->bdev
&&
1098 last
->bi_disk
== stripe
->dev
->bdev
->bd_disk
&&
1099 last
->bi_partno
== stripe
->dev
->bdev
->bd_partno
) {
1100 ret
= bio_add_page(last
, page
, PAGE_SIZE
, 0);
1101 if (ret
== PAGE_SIZE
)
1106 /* put a new bio on the list */
1107 bio
= btrfs_io_bio_alloc(bio_max_len
>> PAGE_SHIFT
?: 1);
1108 bio
->bi_iter
.bi_size
= 0;
1109 bio_set_dev(bio
, stripe
->dev
->bdev
);
1110 bio
->bi_iter
.bi_sector
= disk_start
>> 9;
1112 bio_add_page(bio
, page
, PAGE_SIZE
, 0);
1113 bio_list_add(bio_list
, bio
);
1118 * while we're doing the read/modify/write cycle, we could
1119 * have errors in reading pages off the disk. This checks
1120 * for errors and if we're not able to read the page it'll
1121 * trigger parity reconstruction. The rmw will be finished
1122 * after we've reconstructed the failed stripes
1124 static void validate_rbio_for_rmw(struct btrfs_raid_bio
*rbio
)
1126 if (rbio
->faila
>= 0 || rbio
->failb
>= 0) {
1127 BUG_ON(rbio
->faila
== rbio
->real_stripes
- 1);
1128 __raid56_parity_recover(rbio
);
1135 * helper function to walk our bio list and populate the bio_pages array with
1136 * the result. This seems expensive, but it is faster than constantly
1137 * searching through the bio list as we setup the IO in finish_rmw or stripe
1140 * This must be called before you trust the answers from page_in_rbio
1142 static void index_rbio_pages(struct btrfs_raid_bio
*rbio
)
1146 unsigned long stripe_offset
;
1147 unsigned long page_index
;
1149 spin_lock_irq(&rbio
->bio_list_lock
);
1150 bio_list_for_each(bio
, &rbio
->bio_list
) {
1151 struct bio_vec bvec
;
1152 struct bvec_iter iter
;
1155 start
= (u64
)bio
->bi_iter
.bi_sector
<< 9;
1156 stripe_offset
= start
- rbio
->bbio
->raid_map
[0];
1157 page_index
= stripe_offset
>> PAGE_SHIFT
;
1159 if (bio_flagged(bio
, BIO_CLONED
))
1160 bio
->bi_iter
= btrfs_io_bio(bio
)->iter
;
1162 bio_for_each_segment(bvec
, bio
, iter
) {
1163 rbio
->bio_pages
[page_index
+ i
] = bvec
.bv_page
;
1167 spin_unlock_irq(&rbio
->bio_list_lock
);
1171 * this is called from one of two situations. We either
1172 * have a full stripe from the higher layers, or we've read all
1173 * the missing bits off disk.
1175 * This will calculate the parity and then send down any
1178 static noinline
void finish_rmw(struct btrfs_raid_bio
*rbio
)
1180 struct btrfs_bio
*bbio
= rbio
->bbio
;
1181 void **pointers
= rbio
->finish_pointers
;
1182 int nr_data
= rbio
->nr_data
;
1187 struct bio_list bio_list
;
1191 bio_list_init(&bio_list
);
1193 if (rbio
->real_stripes
- rbio
->nr_data
== 1) {
1194 p_stripe
= rbio
->real_stripes
- 1;
1195 } else if (rbio
->real_stripes
- rbio
->nr_data
== 2) {
1196 p_stripe
= rbio
->real_stripes
- 2;
1197 q_stripe
= rbio
->real_stripes
- 1;
1202 /* at this point we either have a full stripe,
1203 * or we've read the full stripe from the drive.
1204 * recalculate the parity and write the new results.
1206 * We're not allowed to add any new bios to the
1207 * bio list here, anyone else that wants to
1208 * change this stripe needs to do their own rmw.
1210 spin_lock_irq(&rbio
->bio_list_lock
);
1211 set_bit(RBIO_RMW_LOCKED_BIT
, &rbio
->flags
);
1212 spin_unlock_irq(&rbio
->bio_list_lock
);
1214 atomic_set(&rbio
->error
, 0);
1217 * now that we've set rmw_locked, run through the
1218 * bio list one last time and map the page pointers
1220 * We don't cache full rbios because we're assuming
1221 * the higher layers are unlikely to use this area of
1222 * the disk again soon. If they do use it again,
1223 * hopefully they will send another full bio.
1225 index_rbio_pages(rbio
);
1226 if (!rbio_is_full(rbio
))
1227 cache_rbio_pages(rbio
);
1229 clear_bit(RBIO_CACHE_READY_BIT
, &rbio
->flags
);
1231 for (pagenr
= 0; pagenr
< rbio
->stripe_npages
; pagenr
++) {
1233 /* first collect one page from each data stripe */
1234 for (stripe
= 0; stripe
< nr_data
; stripe
++) {
1235 p
= page_in_rbio(rbio
, stripe
, pagenr
, 0);
1236 pointers
[stripe
] = kmap(p
);
1239 /* then add the parity stripe */
1240 p
= rbio_pstripe_page(rbio
, pagenr
);
1242 pointers
[stripe
++] = kmap(p
);
1244 if (q_stripe
!= -1) {
1247 * raid6, add the qstripe and call the
1248 * library function to fill in our p/q
1250 p
= rbio_qstripe_page(rbio
, pagenr
);
1252 pointers
[stripe
++] = kmap(p
);
1254 raid6_call
.gen_syndrome(rbio
->real_stripes
, PAGE_SIZE
,
1258 copy_page(pointers
[nr_data
], pointers
[0]);
1259 run_xor(pointers
+ 1, nr_data
- 1, PAGE_SIZE
);
1263 for (stripe
= 0; stripe
< rbio
->real_stripes
; stripe
++)
1264 kunmap(page_in_rbio(rbio
, stripe
, pagenr
, 0));
1268 * time to start writing. Make bios for everything from the
1269 * higher layers (the bio_list in our rbio) and our p/q. Ignore
1272 for (stripe
= 0; stripe
< rbio
->real_stripes
; stripe
++) {
1273 for (pagenr
= 0; pagenr
< rbio
->stripe_npages
; pagenr
++) {
1275 if (stripe
< rbio
->nr_data
) {
1276 page
= page_in_rbio(rbio
, stripe
, pagenr
, 1);
1280 page
= rbio_stripe_page(rbio
, stripe
, pagenr
);
1283 ret
= rbio_add_io_page(rbio
, &bio_list
,
1284 page
, stripe
, pagenr
, rbio
->stripe_len
);
1290 if (likely(!bbio
->num_tgtdevs
))
1293 for (stripe
= 0; stripe
< rbio
->real_stripes
; stripe
++) {
1294 if (!bbio
->tgtdev_map
[stripe
])
1297 for (pagenr
= 0; pagenr
< rbio
->stripe_npages
; pagenr
++) {
1299 if (stripe
< rbio
->nr_data
) {
1300 page
= page_in_rbio(rbio
, stripe
, pagenr
, 1);
1304 page
= rbio_stripe_page(rbio
, stripe
, pagenr
);
1307 ret
= rbio_add_io_page(rbio
, &bio_list
, page
,
1308 rbio
->bbio
->tgtdev_map
[stripe
],
1309 pagenr
, rbio
->stripe_len
);
1316 atomic_set(&rbio
->stripes_pending
, bio_list_size(&bio_list
));
1317 BUG_ON(atomic_read(&rbio
->stripes_pending
) == 0);
1320 bio
= bio_list_pop(&bio_list
);
1324 bio
->bi_private
= rbio
;
1325 bio
->bi_end_io
= raid_write_end_io
;
1326 bio
->bi_opf
= REQ_OP_WRITE
;
1333 rbio_orig_end_io(rbio
, BLK_STS_IOERR
);
1335 while ((bio
= bio_list_pop(&bio_list
)))
1340 * helper to find the stripe number for a given bio. Used to figure out which
1341 * stripe has failed. This expects the bio to correspond to a physical disk,
1342 * so it looks up based on physical sector numbers.
1344 static int find_bio_stripe(struct btrfs_raid_bio
*rbio
,
1347 u64 physical
= bio
->bi_iter
.bi_sector
;
1350 struct btrfs_bio_stripe
*stripe
;
1354 for (i
= 0; i
< rbio
->bbio
->num_stripes
; i
++) {
1355 stripe
= &rbio
->bbio
->stripes
[i
];
1356 stripe_start
= stripe
->physical
;
1357 if (physical
>= stripe_start
&&
1358 physical
< stripe_start
+ rbio
->stripe_len
&&
1359 stripe
->dev
->bdev
&&
1360 bio
->bi_disk
== stripe
->dev
->bdev
->bd_disk
&&
1361 bio
->bi_partno
== stripe
->dev
->bdev
->bd_partno
) {
1369 * helper to find the stripe number for a given
1370 * bio (before mapping). Used to figure out which stripe has
1371 * failed. This looks up based on logical block numbers.
1373 static int find_logical_bio_stripe(struct btrfs_raid_bio
*rbio
,
1376 u64 logical
= bio
->bi_iter
.bi_sector
;
1382 for (i
= 0; i
< rbio
->nr_data
; i
++) {
1383 stripe_start
= rbio
->bbio
->raid_map
[i
];
1384 if (logical
>= stripe_start
&&
1385 logical
< stripe_start
+ rbio
->stripe_len
) {
1393 * returns -EIO if we had too many failures
1395 static int fail_rbio_index(struct btrfs_raid_bio
*rbio
, int failed
)
1397 unsigned long flags
;
1400 spin_lock_irqsave(&rbio
->bio_list_lock
, flags
);
1402 /* we already know this stripe is bad, move on */
1403 if (rbio
->faila
== failed
|| rbio
->failb
== failed
)
1406 if (rbio
->faila
== -1) {
1407 /* first failure on this rbio */
1408 rbio
->faila
= failed
;
1409 atomic_inc(&rbio
->error
);
1410 } else if (rbio
->failb
== -1) {
1411 /* second failure on this rbio */
1412 rbio
->failb
= failed
;
1413 atomic_inc(&rbio
->error
);
1418 spin_unlock_irqrestore(&rbio
->bio_list_lock
, flags
);
1424 * helper to fail a stripe based on a physical disk
1427 static int fail_bio_stripe(struct btrfs_raid_bio
*rbio
,
1430 int failed
= find_bio_stripe(rbio
, bio
);
1435 return fail_rbio_index(rbio
, failed
);
1439 * this sets each page in the bio uptodate. It should only be used on private
1440 * rbio pages, nothing that comes in from the higher layers
1442 static void set_bio_pages_uptodate(struct bio
*bio
)
1444 struct bio_vec
*bvec
;
1447 ASSERT(!bio_flagged(bio
, BIO_CLONED
));
1449 bio_for_each_segment_all(bvec
, bio
, i
)
1450 SetPageUptodate(bvec
->bv_page
);
1454 * end io for the read phase of the rmw cycle. All the bios here are physical
1455 * stripe bios we've read from the disk so we can recalculate the parity of the
1458 * This will usually kick off finish_rmw once all the bios are read in, but it
1459 * may trigger parity reconstruction if we had any errors along the way
1461 static void raid_rmw_end_io(struct bio
*bio
)
1463 struct btrfs_raid_bio
*rbio
= bio
->bi_private
;
1466 fail_bio_stripe(rbio
, bio
);
1468 set_bio_pages_uptodate(bio
);
1472 if (!atomic_dec_and_test(&rbio
->stripes_pending
))
1475 if (atomic_read(&rbio
->error
) > rbio
->bbio
->max_errors
)
1479 * this will normally call finish_rmw to start our write
1480 * but if there are any failed stripes we'll reconstruct
1483 validate_rbio_for_rmw(rbio
);
1488 rbio_orig_end_io(rbio
, BLK_STS_IOERR
);
1492 * the stripe must be locked by the caller. It will
1493 * unlock after all the writes are done
1495 static int raid56_rmw_stripe(struct btrfs_raid_bio
*rbio
)
1497 int bios_to_read
= 0;
1498 struct bio_list bio_list
;
1504 bio_list_init(&bio_list
);
1506 ret
= alloc_rbio_pages(rbio
);
1510 index_rbio_pages(rbio
);
1512 atomic_set(&rbio
->error
, 0);
1514 * build a list of bios to read all the missing parts of this
1517 for (stripe
= 0; stripe
< rbio
->nr_data
; stripe
++) {
1518 for (pagenr
= 0; pagenr
< rbio
->stripe_npages
; pagenr
++) {
1521 * we want to find all the pages missing from
1522 * the rbio and read them from the disk. If
1523 * page_in_rbio finds a page in the bio list
1524 * we don't need to read it off the stripe.
1526 page
= page_in_rbio(rbio
, stripe
, pagenr
, 1);
1530 page
= rbio_stripe_page(rbio
, stripe
, pagenr
);
1532 * the bio cache may have handed us an uptodate
1533 * page. If so, be happy and use it
1535 if (PageUptodate(page
))
1538 ret
= rbio_add_io_page(rbio
, &bio_list
, page
,
1539 stripe
, pagenr
, rbio
->stripe_len
);
1545 bios_to_read
= bio_list_size(&bio_list
);
1546 if (!bios_to_read
) {
1548 * this can happen if others have merged with
1549 * us, it means there is nothing left to read.
1550 * But if there are missing devices it may not be
1551 * safe to do the full stripe write yet.
1557 * the bbio may be freed once we submit the last bio. Make sure
1558 * not to touch it after that
1560 atomic_set(&rbio
->stripes_pending
, bios_to_read
);
1562 bio
= bio_list_pop(&bio_list
);
1566 bio
->bi_private
= rbio
;
1567 bio
->bi_end_io
= raid_rmw_end_io
;
1568 bio
->bi_opf
= REQ_OP_READ
;
1570 btrfs_bio_wq_end_io(rbio
->fs_info
, bio
, BTRFS_WQ_ENDIO_RAID56
);
1574 /* the actual write will happen once the reads are done */
1578 rbio_orig_end_io(rbio
, BLK_STS_IOERR
);
1580 while ((bio
= bio_list_pop(&bio_list
)))
1586 validate_rbio_for_rmw(rbio
);
1591 * if the upper layers pass in a full stripe, we thank them by only allocating
1592 * enough pages to hold the parity, and sending it all down quickly.
1594 static int full_stripe_write(struct btrfs_raid_bio
*rbio
)
1598 ret
= alloc_rbio_parity_pages(rbio
);
1600 __free_raid_bio(rbio
);
1604 ret
= lock_stripe_add(rbio
);
1611 * partial stripe writes get handed over to async helpers.
1612 * We're really hoping to merge a few more writes into this
1613 * rbio before calculating new parity
1615 static int partial_stripe_write(struct btrfs_raid_bio
*rbio
)
1619 ret
= lock_stripe_add(rbio
);
1621 start_async_work(rbio
, rmw_work
);
1626 * sometimes while we were reading from the drive to
1627 * recalculate parity, enough new bios come in to create
1628 * a full stripe. So we do a check here to see if we can
1629 * go directly to finish_rmw
1631 static int __raid56_parity_write(struct btrfs_raid_bio
*rbio
)
1633 /* head off into rmw land if we don't have a full stripe */
1634 if (!rbio_is_full(rbio
))
1635 return partial_stripe_write(rbio
);
1636 return full_stripe_write(rbio
);
1640 * We use plugging callbacks to collect full stripes.
1641 * Any time we get a partial stripe write while plugged
1642 * we collect it into a list. When the unplug comes down,
1643 * we sort the list by logical block number and merge
1644 * everything we can into the same rbios
1646 struct btrfs_plug_cb
{
1647 struct blk_plug_cb cb
;
1648 struct btrfs_fs_info
*info
;
1649 struct list_head rbio_list
;
1650 struct btrfs_work work
;
1654 * rbios on the plug list are sorted for easier merging.
1656 static int plug_cmp(void *priv
, struct list_head
*a
, struct list_head
*b
)
1658 struct btrfs_raid_bio
*ra
= container_of(a
, struct btrfs_raid_bio
,
1660 struct btrfs_raid_bio
*rb
= container_of(b
, struct btrfs_raid_bio
,
1662 u64 a_sector
= ra
->bio_list
.head
->bi_iter
.bi_sector
;
1663 u64 b_sector
= rb
->bio_list
.head
->bi_iter
.bi_sector
;
1665 if (a_sector
< b_sector
)
1667 if (a_sector
> b_sector
)
1672 static void run_plug(struct btrfs_plug_cb
*plug
)
1674 struct btrfs_raid_bio
*cur
;
1675 struct btrfs_raid_bio
*last
= NULL
;
1678 * sort our plug list then try to merge
1679 * everything we can in hopes of creating full
1682 list_sort(NULL
, &plug
->rbio_list
, plug_cmp
);
1683 while (!list_empty(&plug
->rbio_list
)) {
1684 cur
= list_entry(plug
->rbio_list
.next
,
1685 struct btrfs_raid_bio
, plug_list
);
1686 list_del_init(&cur
->plug_list
);
1688 if (rbio_is_full(cur
)) {
1691 /* we have a full stripe, send it down */
1692 ret
= full_stripe_write(cur
);
1697 if (rbio_can_merge(last
, cur
)) {
1698 merge_rbio(last
, cur
);
1699 __free_raid_bio(cur
);
1703 __raid56_parity_write(last
);
1708 __raid56_parity_write(last
);
1714 * if the unplug comes from schedule, we have to push the
1715 * work off to a helper thread
1717 static void unplug_work(struct btrfs_work
*work
)
1719 struct btrfs_plug_cb
*plug
;
1720 plug
= container_of(work
, struct btrfs_plug_cb
, work
);
1724 static void btrfs_raid_unplug(struct blk_plug_cb
*cb
, bool from_schedule
)
1726 struct btrfs_plug_cb
*plug
;
1727 plug
= container_of(cb
, struct btrfs_plug_cb
, cb
);
1729 if (from_schedule
) {
1730 btrfs_init_work(&plug
->work
, btrfs_rmw_helper
,
1731 unplug_work
, NULL
, NULL
);
1732 btrfs_queue_work(plug
->info
->rmw_workers
,
1740 * our main entry point for writes from the rest of the FS.
1742 int raid56_parity_write(struct btrfs_fs_info
*fs_info
, struct bio
*bio
,
1743 struct btrfs_bio
*bbio
, u64 stripe_len
)
1745 struct btrfs_raid_bio
*rbio
;
1746 struct btrfs_plug_cb
*plug
= NULL
;
1747 struct blk_plug_cb
*cb
;
1750 rbio
= alloc_rbio(fs_info
, bbio
, stripe_len
);
1752 btrfs_put_bbio(bbio
);
1753 return PTR_ERR(rbio
);
1755 bio_list_add(&rbio
->bio_list
, bio
);
1756 rbio
->bio_list_bytes
= bio
->bi_iter
.bi_size
;
1757 rbio
->operation
= BTRFS_RBIO_WRITE
;
1759 btrfs_bio_counter_inc_noblocked(fs_info
);
1760 rbio
->generic_bio_cnt
= 1;
1763 * don't plug on full rbios, just get them out the door
1764 * as quickly as we can
1766 if (rbio_is_full(rbio
)) {
1767 ret
= full_stripe_write(rbio
);
1769 btrfs_bio_counter_dec(fs_info
);
1773 cb
= blk_check_plugged(btrfs_raid_unplug
, fs_info
, sizeof(*plug
));
1775 plug
= container_of(cb
, struct btrfs_plug_cb
, cb
);
1777 plug
->info
= fs_info
;
1778 INIT_LIST_HEAD(&plug
->rbio_list
);
1780 list_add_tail(&rbio
->plug_list
, &plug
->rbio_list
);
1783 ret
= __raid56_parity_write(rbio
);
1785 btrfs_bio_counter_dec(fs_info
);
1791 * all parity reconstruction happens here. We've read in everything
1792 * we can find from the drives and this does the heavy lifting of
1793 * sorting the good from the bad.
1795 static void __raid_recover_end_io(struct btrfs_raid_bio
*rbio
)
1799 int faila
= -1, failb
= -1;
1804 pointers
= kcalloc(rbio
->real_stripes
, sizeof(void *), GFP_NOFS
);
1806 err
= BLK_STS_RESOURCE
;
1810 faila
= rbio
->faila
;
1811 failb
= rbio
->failb
;
1813 if (rbio
->operation
== BTRFS_RBIO_READ_REBUILD
||
1814 rbio
->operation
== BTRFS_RBIO_REBUILD_MISSING
) {
1815 spin_lock_irq(&rbio
->bio_list_lock
);
1816 set_bit(RBIO_RMW_LOCKED_BIT
, &rbio
->flags
);
1817 spin_unlock_irq(&rbio
->bio_list_lock
);
1820 index_rbio_pages(rbio
);
1822 for (pagenr
= 0; pagenr
< rbio
->stripe_npages
; pagenr
++) {
1824 * Now we just use bitmap to mark the horizontal stripes in
1825 * which we have data when doing parity scrub.
1827 if (rbio
->operation
== BTRFS_RBIO_PARITY_SCRUB
&&
1828 !test_bit(pagenr
, rbio
->dbitmap
))
1831 /* setup our array of pointers with pages
1834 for (stripe
= 0; stripe
< rbio
->real_stripes
; stripe
++) {
1836 * if we're rebuilding a read, we have to use
1837 * pages from the bio list
1839 if ((rbio
->operation
== BTRFS_RBIO_READ_REBUILD
||
1840 rbio
->operation
== BTRFS_RBIO_REBUILD_MISSING
) &&
1841 (stripe
== faila
|| stripe
== failb
)) {
1842 page
= page_in_rbio(rbio
, stripe
, pagenr
, 0);
1844 page
= rbio_stripe_page(rbio
, stripe
, pagenr
);
1846 pointers
[stripe
] = kmap(page
);
1849 /* all raid6 handling here */
1850 if (rbio
->bbio
->map_type
& BTRFS_BLOCK_GROUP_RAID6
) {
1852 * single failure, rebuild from parity raid5
1856 if (faila
== rbio
->nr_data
) {
1858 * Just the P stripe has failed, without
1859 * a bad data or Q stripe.
1860 * TODO, we should redo the xor here.
1862 err
= BLK_STS_IOERR
;
1866 * a single failure in raid6 is rebuilt
1867 * in the pstripe code below
1872 /* make sure our ps and qs are in order */
1873 if (faila
> failb
) {
1879 /* if the q stripe is failed, do a pstripe reconstruction
1881 * If both the q stripe and the P stripe are failed, we're
1882 * here due to a crc mismatch and we can't give them the
1885 if (rbio
->bbio
->raid_map
[failb
] == RAID6_Q_STRIPE
) {
1886 if (rbio
->bbio
->raid_map
[faila
] ==
1888 err
= BLK_STS_IOERR
;
1892 * otherwise we have one bad data stripe and
1893 * a good P stripe. raid5!
1898 if (rbio
->bbio
->raid_map
[failb
] == RAID5_P_STRIPE
) {
1899 raid6_datap_recov(rbio
->real_stripes
,
1900 PAGE_SIZE
, faila
, pointers
);
1902 raid6_2data_recov(rbio
->real_stripes
,
1903 PAGE_SIZE
, faila
, failb
,
1909 /* rebuild from P stripe here (raid5 or raid6) */
1910 BUG_ON(failb
!= -1);
1912 /* Copy parity block into failed block to start with */
1913 copy_page(pointers
[faila
], pointers
[rbio
->nr_data
]);
1915 /* rearrange the pointer array */
1916 p
= pointers
[faila
];
1917 for (stripe
= faila
; stripe
< rbio
->nr_data
- 1; stripe
++)
1918 pointers
[stripe
] = pointers
[stripe
+ 1];
1919 pointers
[rbio
->nr_data
- 1] = p
;
1921 /* xor in the rest */
1922 run_xor(pointers
, rbio
->nr_data
- 1, PAGE_SIZE
);
1924 /* if we're doing this rebuild as part of an rmw, go through
1925 * and set all of our private rbio pages in the
1926 * failed stripes as uptodate. This way finish_rmw will
1927 * know they can be trusted. If this was a read reconstruction,
1928 * other endio functions will fiddle the uptodate bits
1930 if (rbio
->operation
== BTRFS_RBIO_WRITE
) {
1931 for (i
= 0; i
< rbio
->stripe_npages
; i
++) {
1933 page
= rbio_stripe_page(rbio
, faila
, i
);
1934 SetPageUptodate(page
);
1937 page
= rbio_stripe_page(rbio
, failb
, i
);
1938 SetPageUptodate(page
);
1942 for (stripe
= 0; stripe
< rbio
->real_stripes
; stripe
++) {
1944 * if we're rebuilding a read, we have to use
1945 * pages from the bio list
1947 if ((rbio
->operation
== BTRFS_RBIO_READ_REBUILD
||
1948 rbio
->operation
== BTRFS_RBIO_REBUILD_MISSING
) &&
1949 (stripe
== faila
|| stripe
== failb
)) {
1950 page
= page_in_rbio(rbio
, stripe
, pagenr
, 0);
1952 page
= rbio_stripe_page(rbio
, stripe
, pagenr
);
1964 * Similar to READ_REBUILD, REBUILD_MISSING at this point also has a
1965 * valid rbio which is consistent with ondisk content, thus such a
1966 * valid rbio can be cached to avoid further disk reads.
1968 if (rbio
->operation
== BTRFS_RBIO_READ_REBUILD
||
1969 rbio
->operation
== BTRFS_RBIO_REBUILD_MISSING
) {
1971 * - In case of two failures, where rbio->failb != -1:
1973 * Do not cache this rbio since the above read reconstruction
1974 * (raid6_datap_recov() or raid6_2data_recov()) may have
1975 * changed some content of stripes which are not identical to
1976 * on-disk content any more, otherwise, a later write/recover
1977 * may steal stripe_pages from this rbio and end up with
1978 * corruptions or rebuild failures.
1980 * - In case of single failure, where rbio->failb == -1:
1982 * Cache this rbio iff the above read reconstruction is
1983 * executed without problems.
1985 if (err
== BLK_STS_OK
&& rbio
->failb
< 0)
1986 cache_rbio_pages(rbio
);
1988 clear_bit(RBIO_CACHE_READY_BIT
, &rbio
->flags
);
1990 rbio_orig_end_io(rbio
, err
);
1991 } else if (err
== BLK_STS_OK
) {
1995 if (rbio
->operation
== BTRFS_RBIO_WRITE
)
1997 else if (rbio
->operation
== BTRFS_RBIO_PARITY_SCRUB
)
1998 finish_parity_scrub(rbio
, 0);
2002 rbio_orig_end_io(rbio
, err
);
2007 * This is called only for stripes we've read from disk to
2008 * reconstruct the parity.
2010 static void raid_recover_end_io(struct bio
*bio
)
2012 struct btrfs_raid_bio
*rbio
= bio
->bi_private
;
2015 * we only read stripe pages off the disk, set them
2016 * up to date if there were no errors
2019 fail_bio_stripe(rbio
, bio
);
2021 set_bio_pages_uptodate(bio
);
2024 if (!atomic_dec_and_test(&rbio
->stripes_pending
))
2027 if (atomic_read(&rbio
->error
) > rbio
->bbio
->max_errors
)
2028 rbio_orig_end_io(rbio
, BLK_STS_IOERR
);
2030 __raid_recover_end_io(rbio
);
2034 * reads everything we need off the disk to reconstruct
2035 * the parity. endio handlers trigger final reconstruction
2036 * when the IO is done.
2038 * This is used both for reads from the higher layers and for
2039 * parity construction required to finish a rmw cycle.
2041 static int __raid56_parity_recover(struct btrfs_raid_bio
*rbio
)
2043 int bios_to_read
= 0;
2044 struct bio_list bio_list
;
2050 bio_list_init(&bio_list
);
2052 ret
= alloc_rbio_pages(rbio
);
2056 atomic_set(&rbio
->error
, 0);
2059 * read everything that hasn't failed. Thanks to the
2060 * stripe cache, it is possible that some or all of these
2061 * pages are going to be uptodate.
2063 for (stripe
= 0; stripe
< rbio
->real_stripes
; stripe
++) {
2064 if (rbio
->faila
== stripe
|| rbio
->failb
== stripe
) {
2065 atomic_inc(&rbio
->error
);
2069 for (pagenr
= 0; pagenr
< rbio
->stripe_npages
; pagenr
++) {
2073 * the rmw code may have already read this
2076 p
= rbio_stripe_page(rbio
, stripe
, pagenr
);
2077 if (PageUptodate(p
))
2080 ret
= rbio_add_io_page(rbio
, &bio_list
,
2081 rbio_stripe_page(rbio
, stripe
, pagenr
),
2082 stripe
, pagenr
, rbio
->stripe_len
);
2088 bios_to_read
= bio_list_size(&bio_list
);
2089 if (!bios_to_read
) {
2091 * we might have no bios to read just because the pages
2092 * were up to date, or we might have no bios to read because
2093 * the devices were gone.
2095 if (atomic_read(&rbio
->error
) <= rbio
->bbio
->max_errors
) {
2096 __raid_recover_end_io(rbio
);
2104 * the bbio may be freed once we submit the last bio. Make sure
2105 * not to touch it after that
2107 atomic_set(&rbio
->stripes_pending
, bios_to_read
);
2109 bio
= bio_list_pop(&bio_list
);
2113 bio
->bi_private
= rbio
;
2114 bio
->bi_end_io
= raid_recover_end_io
;
2115 bio
->bi_opf
= REQ_OP_READ
;
2117 btrfs_bio_wq_end_io(rbio
->fs_info
, bio
, BTRFS_WQ_ENDIO_RAID56
);
2125 if (rbio
->operation
== BTRFS_RBIO_READ_REBUILD
||
2126 rbio
->operation
== BTRFS_RBIO_REBUILD_MISSING
)
2127 rbio_orig_end_io(rbio
, BLK_STS_IOERR
);
2129 while ((bio
= bio_list_pop(&bio_list
)))
2136 * the main entry point for reads from the higher layers. This
2137 * is really only called when the normal read path had a failure,
2138 * so we assume the bio they send down corresponds to a failed part
2141 int raid56_parity_recover(struct btrfs_fs_info
*fs_info
, struct bio
*bio
,
2142 struct btrfs_bio
*bbio
, u64 stripe_len
,
2143 int mirror_num
, int generic_io
)
2145 struct btrfs_raid_bio
*rbio
;
2149 ASSERT(bbio
->mirror_num
== mirror_num
);
2150 btrfs_io_bio(bio
)->mirror_num
= mirror_num
;
2153 rbio
= alloc_rbio(fs_info
, bbio
, stripe_len
);
2156 btrfs_put_bbio(bbio
);
2157 return PTR_ERR(rbio
);
2160 rbio
->operation
= BTRFS_RBIO_READ_REBUILD
;
2161 bio_list_add(&rbio
->bio_list
, bio
);
2162 rbio
->bio_list_bytes
= bio
->bi_iter
.bi_size
;
2164 rbio
->faila
= find_logical_bio_stripe(rbio
, bio
);
2165 if (rbio
->faila
== -1) {
2167 "%s could not find the bad stripe in raid56 so that we cannot recover any more (bio has logical %llu len %llu, bbio has map_type %llu)",
2168 __func__
, (u64
)bio
->bi_iter
.bi_sector
<< 9,
2169 (u64
)bio
->bi_iter
.bi_size
, bbio
->map_type
);
2171 btrfs_put_bbio(bbio
);
2177 btrfs_bio_counter_inc_noblocked(fs_info
);
2178 rbio
->generic_bio_cnt
= 1;
2180 btrfs_get_bbio(bbio
);
2185 * for 'mirror == 2', reconstruct from all other stripes.
2186 * for 'mirror_num > 2', select a stripe to fail on every retry.
2188 if (mirror_num
> 2) {
2190 * 'mirror == 3' is to fail the p stripe and
2191 * reconstruct from the q stripe. 'mirror > 3' is to
2192 * fail a data stripe and reconstruct from p+q stripe.
2194 rbio
->failb
= rbio
->real_stripes
- (mirror_num
- 1);
2195 ASSERT(rbio
->failb
> 0);
2196 if (rbio
->failb
<= rbio
->faila
)
2200 ret
= lock_stripe_add(rbio
);
2203 * __raid56_parity_recover will end the bio with
2204 * any errors it hits. We don't want to return
2205 * its error value up the stack because our caller
2206 * will end up calling bio_endio with any nonzero
2210 __raid56_parity_recover(rbio
);
2212 * our rbio has been added to the list of
2213 * rbios that will be handled after the
2214 * current lock owner is done
2220 static void rmw_work(struct btrfs_work
*work
)
2222 struct btrfs_raid_bio
*rbio
;
2224 rbio
= container_of(work
, struct btrfs_raid_bio
, work
);
2225 raid56_rmw_stripe(rbio
);
2228 static void read_rebuild_work(struct btrfs_work
*work
)
2230 struct btrfs_raid_bio
*rbio
;
2232 rbio
= container_of(work
, struct btrfs_raid_bio
, work
);
2233 __raid56_parity_recover(rbio
);
2237 * The following code is used to scrub/replace the parity stripe
2239 * Caller must have already increased bio_counter for getting @bbio.
2241 * Note: We need to make sure all the pages that are added into the scrub/replace
2242 * raid bio are correct and are not changed during the scrub/replace. That
2243 * is, those pages just hold metadata or file data with checksum.
2246 struct btrfs_raid_bio
*
2247 raid56_parity_alloc_scrub_rbio(struct btrfs_fs_info
*fs_info
, struct bio
*bio
,
2248 struct btrfs_bio
*bbio
, u64 stripe_len
,
2249 struct btrfs_device
*scrub_dev
,
2250 unsigned long *dbitmap
, int stripe_nsectors
)
2252 struct btrfs_raid_bio
*rbio
;
2255 rbio
= alloc_rbio(fs_info
, bbio
, stripe_len
);
2258 bio_list_add(&rbio
->bio_list
, bio
);
2260 * This is a special bio which is used to hold the completion handler
2261 * and makes the scrub rbio similar to the other types
2263 ASSERT(!bio
->bi_iter
.bi_size
);
2264 rbio
->operation
= BTRFS_RBIO_PARITY_SCRUB
;
2267 * After mapping bbio with BTRFS_MAP_WRITE, parities have been sorted
2268 * to the end position, so this search can start from the first parity
2271 for (i
= rbio
->nr_data
; i
< rbio
->real_stripes
; i
++) {
2272 if (bbio
->stripes
[i
].dev
== scrub_dev
) {
2277 ASSERT(i
< rbio
->real_stripes
);
2279 /* For now we only support sectorsize equal to page size */
2280 ASSERT(fs_info
->sectorsize
== PAGE_SIZE
);
2281 ASSERT(rbio
->stripe_npages
== stripe_nsectors
);
2282 bitmap_copy(rbio
->dbitmap
, dbitmap
, stripe_nsectors
);
2285 * We have already increased bio_counter when getting bbio, record it
2286 * so we can free it at rbio_orig_end_io().
2288 rbio
->generic_bio_cnt
= 1;
2293 /* Used for both parity scrub and missing. */
2294 void raid56_add_scrub_pages(struct btrfs_raid_bio
*rbio
, struct page
*page
,
2300 ASSERT(logical
>= rbio
->bbio
->raid_map
[0]);
2301 ASSERT(logical
+ PAGE_SIZE
<= rbio
->bbio
->raid_map
[0] +
2302 rbio
->stripe_len
* rbio
->nr_data
);
2303 stripe_offset
= (int)(logical
- rbio
->bbio
->raid_map
[0]);
2304 index
= stripe_offset
>> PAGE_SHIFT
;
2305 rbio
->bio_pages
[index
] = page
;
2309 * We only scrub the parity for which we have correct data on the same horizontal stripe,
2310 * so we needn't allocate all pages for all the stripes.
2312 static int alloc_rbio_essential_pages(struct btrfs_raid_bio
*rbio
)
2319 for_each_set_bit(bit
, rbio
->dbitmap
, rbio
->stripe_npages
) {
2320 for (i
= 0; i
< rbio
->real_stripes
; i
++) {
2321 index
= i
* rbio
->stripe_npages
+ bit
;
2322 if (rbio
->stripe_pages
[index
])
2325 page
= alloc_page(GFP_NOFS
| __GFP_HIGHMEM
);
2328 rbio
->stripe_pages
[index
] = page
;
2334 static noinline
void finish_parity_scrub(struct btrfs_raid_bio
*rbio
,
2337 struct btrfs_bio
*bbio
= rbio
->bbio
;
2338 void **pointers
= rbio
->finish_pointers
;
2339 unsigned long *pbitmap
= rbio
->finish_pbitmap
;
2340 int nr_data
= rbio
->nr_data
;
2345 struct page
*p_page
= NULL
;
2346 struct page
*q_page
= NULL
;
2347 struct bio_list bio_list
;
2352 bio_list_init(&bio_list
);
2354 if (rbio
->real_stripes
- rbio
->nr_data
== 1) {
2355 p_stripe
= rbio
->real_stripes
- 1;
2356 } else if (rbio
->real_stripes
- rbio
->nr_data
== 2) {
2357 p_stripe
= rbio
->real_stripes
- 2;
2358 q_stripe
= rbio
->real_stripes
- 1;
2363 if (bbio
->num_tgtdevs
&& bbio
->tgtdev_map
[rbio
->scrubp
]) {
2365 bitmap_copy(pbitmap
, rbio
->dbitmap
, rbio
->stripe_npages
);
2369 * The higher layers (scrubber) are unlikely to
2370 * use this area of the disk again soon, so don't cache
2373 clear_bit(RBIO_CACHE_READY_BIT
, &rbio
->flags
);
2378 p_page
= alloc_page(GFP_NOFS
| __GFP_HIGHMEM
);
2381 SetPageUptodate(p_page
);
2383 if (q_stripe
!= -1) {
2384 q_page
= alloc_page(GFP_NOFS
| __GFP_HIGHMEM
);
2386 __free_page(p_page
);
2389 SetPageUptodate(q_page
);
2392 atomic_set(&rbio
->error
, 0);
2394 for_each_set_bit(pagenr
, rbio
->dbitmap
, rbio
->stripe_npages
) {
2397 /* first collect one page from each data stripe */
2398 for (stripe
= 0; stripe
< nr_data
; stripe
++) {
2399 p
= page_in_rbio(rbio
, stripe
, pagenr
, 0);
2400 pointers
[stripe
] = kmap(p
);
2403 /* then add the parity stripe */
2404 pointers
[stripe
++] = kmap(p_page
);
2406 if (q_stripe
!= -1) {
2409 * raid6, add the qstripe and call the
2410 * library function to fill in our p/q
2412 pointers
[stripe
++] = kmap(q_page
);
2414 raid6_call
.gen_syndrome(rbio
->real_stripes
, PAGE_SIZE
,
2418 copy_page(pointers
[nr_data
], pointers
[0]);
2419 run_xor(pointers
+ 1, nr_data
- 1, PAGE_SIZE
);
2422 /* Check scrubbing parity and repair it */
2423 p
= rbio_stripe_page(rbio
, rbio
->scrubp
, pagenr
);
2425 if (memcmp(parity
, pointers
[rbio
->scrubp
], PAGE_SIZE
))
2426 copy_page(parity
, pointers
[rbio
->scrubp
]);
2428 /* Parity is right, needn't writeback */
2429 bitmap_clear(rbio
->dbitmap
, pagenr
, 1);
2432 for (stripe
= 0; stripe
< nr_data
; stripe
++)
2433 kunmap(page_in_rbio(rbio
, stripe
, pagenr
, 0));
2437 __free_page(p_page
);
2439 __free_page(q_page
);
2443 * time to start writing. Make bios for everything from the
2444 * higher layers (the bio_list in our rbio) and our p/q. Ignore
2447 for_each_set_bit(pagenr
, rbio
->dbitmap
, rbio
->stripe_npages
) {
2450 page
= rbio_stripe_page(rbio
, rbio
->scrubp
, pagenr
);
2451 ret
= rbio_add_io_page(rbio
, &bio_list
,
2452 page
, rbio
->scrubp
, pagenr
, rbio
->stripe_len
);
2460 for_each_set_bit(pagenr
, pbitmap
, rbio
->stripe_npages
) {
2463 page
= rbio_stripe_page(rbio
, rbio
->scrubp
, pagenr
);
2464 ret
= rbio_add_io_page(rbio
, &bio_list
, page
,
2465 bbio
->tgtdev_map
[rbio
->scrubp
],
2466 pagenr
, rbio
->stripe_len
);
2472 nr_data
= bio_list_size(&bio_list
);
2474 /* Every parity is right */
2475 rbio_orig_end_io(rbio
, BLK_STS_OK
);
2479 atomic_set(&rbio
->stripes_pending
, nr_data
);
2482 bio
= bio_list_pop(&bio_list
);
2486 bio
->bi_private
= rbio
;
2487 bio
->bi_end_io
= raid_write_end_io
;
2488 bio
->bi_opf
= REQ_OP_WRITE
;
2495 rbio_orig_end_io(rbio
, BLK_STS_IOERR
);
2497 while ((bio
= bio_list_pop(&bio_list
)))
2501 static inline int is_data_stripe(struct btrfs_raid_bio
*rbio
, int stripe
)
2503 if (stripe
>= 0 && stripe
< rbio
->nr_data
)
2509 * While we're doing the parity check and repair, we could have errors
2510 * in reading pages off the disk. This checks for errors and if we're
2511 * not able to read the page it'll trigger parity reconstruction. The
2512 * parity scrub will be finished after we've reconstructed the failed
2515 static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio
*rbio
)
2517 if (atomic_read(&rbio
->error
) > rbio
->bbio
->max_errors
)
2520 if (rbio
->faila
>= 0 || rbio
->failb
>= 0) {
2521 int dfail
= 0, failp
= -1;
2523 if (is_data_stripe(rbio
, rbio
->faila
))
2525 else if (is_parity_stripe(rbio
->faila
))
2526 failp
= rbio
->faila
;
2528 if (is_data_stripe(rbio
, rbio
->failb
))
2530 else if (is_parity_stripe(rbio
->failb
))
2531 failp
= rbio
->failb
;
2534 * We cannot use the parity being scrubbed to repair
2535 * the data, so the repair capability is reduced by one.
2536 * (In the case of RAID5, we cannot repair anything.)
2538 if (dfail
> rbio
->bbio
->max_errors
- 1)
2542 * If all data is good and only the parity is incorrect, just
2543 * repair the parity.
2546 finish_parity_scrub(rbio
, 0);
2551 * Getting here means we have one corrupted data stripe and one
2552 * corrupted parity on RAID6. If the corrupted parity is the one
2553 * being scrubbed, luckily we can use the other parity to repair
2554 * the data; otherwise we cannot repair the data stripe.
2556 if (failp
!= rbio
->scrubp
)
2559 __raid_recover_end_io(rbio
);
2561 finish_parity_scrub(rbio
, 1);
2566 rbio_orig_end_io(rbio
, BLK_STS_IOERR
);
2570 * end io for the read phase of the parity scrub. All the bios here are physical
2571 * stripe bios we've read from the disk so we can recalculate the parity of the
2574 * This will usually kick off finish_parity_scrub once all the bios are read in, but it
2575 * may trigger parity reconstruction if we had any errors along the way
2577 static void raid56_parity_scrub_end_io(struct bio
*bio
)
2579 struct btrfs_raid_bio
*rbio
= bio
->bi_private
;
2582 fail_bio_stripe(rbio
, bio
);
2584 set_bio_pages_uptodate(bio
);
2588 if (!atomic_dec_and_test(&rbio
->stripes_pending
))
2592 * this will normally call finish_parity_scrub to start our write
2593 * but if there are any failed stripes we'll reconstruct
2596 validate_rbio_for_parity_scrub(rbio
);
2599 static void raid56_parity_scrub_stripe(struct btrfs_raid_bio
*rbio
)
2601 int bios_to_read
= 0;
2602 struct bio_list bio_list
;
2608 bio_list_init(&bio_list
);
2610 ret
= alloc_rbio_essential_pages(rbio
);
2614 atomic_set(&rbio
->error
, 0);
2616 * build a list of bios to read all the missing parts of this
2619 for (stripe
= 0; stripe
< rbio
->real_stripes
; stripe
++) {
2620 for_each_set_bit(pagenr
, rbio
->dbitmap
, rbio
->stripe_npages
) {
2623 * we want to find all the pages missing from
2624 * the rbio and read them from the disk. If
2625 * page_in_rbio finds a page in the bio list
2626 * we don't need to read it off the stripe.
2628 page
= page_in_rbio(rbio
, stripe
, pagenr
, 1);
2632 page
= rbio_stripe_page(rbio
, stripe
, pagenr
);
2634 * the bio cache may have handed us an uptodate
2635 * page. If so, be happy and use it
2637 if (PageUptodate(page
))
2640 ret
= rbio_add_io_page(rbio
, &bio_list
, page
,
2641 stripe
, pagenr
, rbio
->stripe_len
);
2647 bios_to_read
= bio_list_size(&bio_list
);
2648 if (!bios_to_read
) {
2650 * this can happen if others have merged with
2651 * us, it means there is nothing left to read.
2652 * But if there are missing devices it may not be
2653 * safe to do the full stripe write yet.
2659 * the bbio may be freed once we submit the last bio. Make sure
2660 * not to touch it after that
2662 atomic_set(&rbio
->stripes_pending
, bios_to_read
);
2664 bio
= bio_list_pop(&bio_list
);
2668 bio
->bi_private
= rbio
;
2669 bio
->bi_end_io
= raid56_parity_scrub_end_io
;
2670 bio
->bi_opf
= REQ_OP_READ
;
2672 btrfs_bio_wq_end_io(rbio
->fs_info
, bio
, BTRFS_WQ_ENDIO_RAID56
);
2676 /* the actual write will happen once the reads are done */
2680 rbio_orig_end_io(rbio
, BLK_STS_IOERR
);
2682 while ((bio
= bio_list_pop(&bio_list
)))
2688 validate_rbio_for_parity_scrub(rbio
);
2691 static void scrub_parity_work(struct btrfs_work
*work
)
2693 struct btrfs_raid_bio
*rbio
;
2695 rbio
= container_of(work
, struct btrfs_raid_bio
, work
);
2696 raid56_parity_scrub_stripe(rbio
);
2699 void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio
*rbio
)
2701 if (!lock_stripe_add(rbio
))
2702 start_async_work(rbio
, scrub_parity_work
);
2705 /* The following code is used for dev replace of a missing RAID 5/6 device. */
2707 struct btrfs_raid_bio
*
2708 raid56_alloc_missing_rbio(struct btrfs_fs_info
*fs_info
, struct bio
*bio
,
2709 struct btrfs_bio
*bbio
, u64 length
)
2711 struct btrfs_raid_bio
*rbio
;
2713 rbio
= alloc_rbio(fs_info
, bbio
, length
);
2717 rbio
->operation
= BTRFS_RBIO_REBUILD_MISSING
;
2718 bio_list_add(&rbio
->bio_list
, bio
);
2720 * This is a special bio which is used to hold the completion handler
2721 * and to make this rbio similar to the other types
2723 ASSERT(!bio
->bi_iter
.bi_size
);
2725 rbio
->faila
= find_logical_bio_stripe(rbio
, bio
);
2726 if (rbio
->faila
== -1) {
2733 * When we get bbio, we have already increased bio_counter, record it
2734 * so we can free it at rbio_orig_end_io()
2736 rbio
->generic_bio_cnt
= 1;
2741 void raid56_submit_missing_rbio(struct btrfs_raid_bio
*rbio
)
2743 if (!lock_stripe_add(rbio
))
2744 start_async_work(rbio
, read_rebuild_work
);