// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2012 Fusion-io  All rights reserved.
 * Copyright (C) 2012 Intel Corp. All rights reserved.
 */

#include <linux/sched.h>
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/raid/pq.h>
#include <linux/hash.h>
#include <linux/list_sort.h>
#include <linux/raid/xor.h>
#include <linux/mm.h>
#include "messages.h"
#include "ctree.h"
#include "disk-io.h"
#include "volumes.h"
#include "raid56.h"
#include "async-thread.h"
#include "file-item.h"
#include "btrfs_inode.h"
/* set when additional merges to this rbio are not allowed */
#define RBIO_RMW_LOCKED_BIT	1

/*
 * set when this rbio is sitting in the hash, but it is just a cache
 * of past RMW
 */
#define RBIO_CACHE_BIT		2

/*
 * set when it is safe to trust the stripe_pages for caching
 */
#define RBIO_CACHE_READY_BIT	3

#define RBIO_CACHE_SIZE 1024

#define BTRFS_STRIPE_HASH_TABLE_BITS	11
static void dump_bioc(const struct btrfs_fs_info *fs_info, const struct btrfs_io_context *bioc)
{
	if (unlikely(!bioc)) {
		btrfs_crit(fs_info, "bioc=NULL");
		return;
	}
	btrfs_crit(fs_info,
"bioc logical=%llu full_stripe=%llu size=%llu map_type=0x%llx mirror=%u replace_nr_stripes=%u replace_stripe_src=%d num_stripes=%u",
		bioc->logical, bioc->full_stripe_logical, bioc->size,
		bioc->map_type, bioc->mirror_num, bioc->replace_nr_stripes,
		bioc->replace_stripe_src, bioc->num_stripes);
	for (int i = 0; i < bioc->num_stripes; i++) {
		btrfs_crit(fs_info, "    nr=%d devid=%llu physical=%llu",
			   i, bioc->stripes[i].dev->devid,
			   bioc->stripes[i].physical);
	}
}
static void btrfs_dump_rbio(const struct btrfs_fs_info *fs_info,
			    const struct btrfs_raid_bio *rbio)
{
	if (!IS_ENABLED(CONFIG_BTRFS_ASSERT))
		return;

	dump_bioc(fs_info, rbio->bioc);
	btrfs_crit(fs_info,
"rbio flags=0x%lx nr_sectors=%u nr_data=%u real_stripes=%u stripe_nsectors=%u scrubp=%u dbitmap=0x%lx",
		rbio->flags, rbio->nr_sectors, rbio->nr_data,
		rbio->real_stripes, rbio->stripe_nsectors,
		rbio->scrubp, rbio->dbitmap);
}
#define ASSERT_RBIO(expr, rbio)						\
({									\
	if (IS_ENABLED(CONFIG_BTRFS_ASSERT) && unlikely(!(expr))) {	\
		const struct btrfs_fs_info *__fs_info = (rbio)->bioc ?	\
					(rbio)->bioc->fs_info : NULL;	\
									\
		btrfs_dump_rbio(__fs_info, (rbio));			\
	}								\
	ASSERT(expr);							\
})

#define ASSERT_RBIO_STRIPE(expr, rbio, stripe_nr)			\
({									\
	if (IS_ENABLED(CONFIG_BTRFS_ASSERT) && unlikely(!(expr))) {	\
		const struct btrfs_fs_info *__fs_info = (rbio)->bioc ?	\
					(rbio)->bioc->fs_info : NULL;	\
									\
		btrfs_dump_rbio(__fs_info, (rbio));			\
		btrfs_crit(__fs_info, "stripe_nr=%d", (stripe_nr));	\
	}								\
	ASSERT(expr);							\
})

#define ASSERT_RBIO_SECTOR(expr, rbio, sector_nr)			\
({									\
	if (IS_ENABLED(CONFIG_BTRFS_ASSERT) && unlikely(!(expr))) {	\
		const struct btrfs_fs_info *__fs_info = (rbio)->bioc ?	\
					(rbio)->bioc->fs_info : NULL;	\
									\
		btrfs_dump_rbio(__fs_info, (rbio));			\
		btrfs_crit(__fs_info, "sector_nr=%d", (sector_nr));	\
	}								\
	ASSERT(expr);							\
})

#define ASSERT_RBIO_LOGICAL(expr, rbio, logical)			\
({									\
	if (IS_ENABLED(CONFIG_BTRFS_ASSERT) && unlikely(!(expr))) {	\
		const struct btrfs_fs_info *__fs_info = (rbio)->bioc ?	\
					(rbio)->bioc->fs_info : NULL;	\
									\
		btrfs_dump_rbio(__fs_info, (rbio));			\
		btrfs_crit(__fs_info, "logical=%llu", (logical));	\
	}								\
	ASSERT(expr);							\
})
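/*
 * Editor's note (illustrative, not from the original source): the
 * ASSERT_RBIO* macros are meant as drop-in replacements for ASSERT()
 * wherever an rbio is at hand, e.g.:
 *
 *	ASSERT_RBIO_SECTOR(sector_nr < rbio->stripe_nsectors, rbio, sector_nr);
 *
 * On CONFIG_BTRFS_ASSERT builds a failure dumps the whole btrfs_io_context
 * and rbio state before tripping the assertion, which is far more useful
 * than a bare file/line when chasing RAID56 problems.
 */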
/* Used by the raid56 code to lock stripes for read/modify/write */
struct btrfs_stripe_hash {
	struct list_head hash_list;
	spinlock_t lock;
};

/* Used by the raid56 code to lock stripes for read/modify/write */
struct btrfs_stripe_hash_table {
	struct list_head stripe_cache;
	spinlock_t cache_lock;
	int cache_size;
	struct btrfs_stripe_hash table[];
};
/*
 * A bvec like structure to present a sector inside a page.
 *
 * Unlike bvec we don't need bvlen, as it's fixed to sectorsize.
 */
struct sector_ptr {
	struct page *page;
	unsigned int pgoff:24;
	unsigned int uptodate:8;
};
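/*
 * Editor's note (assumption, not in the original): pgoff only has to
 * address an offset inside a single page, so even 64K pages would fit in
 * 16 bits; packing pgoff:24 with uptodate:8 keeps each sector_ptr down to
 * a pointer plus one 32-bit word, which matters because an rbio carries
 * one sector_ptr per sector of the full stripe.
 */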
static void rmw_rbio_work(struct work_struct *work);
static void rmw_rbio_work_locked(struct work_struct *work);
static void index_rbio_pages(struct btrfs_raid_bio *rbio);
static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);

static int finish_parity_scrub(struct btrfs_raid_bio *rbio);
static void scrub_rbio_work_locked(struct work_struct *work);
static void free_raid_bio_pointers(struct btrfs_raid_bio *rbio)
{
	bitmap_free(rbio->error_bitmap);
	kfree(rbio->stripe_pages);
	kfree(rbio->bio_sectors);
	kfree(rbio->stripe_sectors);
	kfree(rbio->finish_pointers);
}
static void free_raid_bio(struct btrfs_raid_bio *rbio)
{
	int i;

	if (!refcount_dec_and_test(&rbio->refs))
		return;

	WARN_ON(!list_empty(&rbio->stripe_cache));
	WARN_ON(!list_empty(&rbio->hash_list));
	WARN_ON(!bio_list_empty(&rbio->bio_list));

	for (i = 0; i < rbio->nr_pages; i++) {
		if (rbio->stripe_pages[i]) {
			__free_page(rbio->stripe_pages[i]);
			rbio->stripe_pages[i] = NULL;
		}
	}

	btrfs_put_bioc(rbio->bioc);
	free_raid_bio_pointers(rbio);
	kfree(rbio);
}
static void start_async_work(struct btrfs_raid_bio *rbio, work_func_t work_func)
{
	INIT_WORK(&rbio->work, work_func);
	queue_work(rbio->bioc->fs_info->rmw_workers, &rbio->work);
}
/*
 * the stripe hash table is used for locking, and to collect
 * bios in hopes of making a full stripe
 */
int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
{
	struct btrfs_stripe_hash_table *table;
	struct btrfs_stripe_hash_table *x;
	struct btrfs_stripe_hash *cur;
	struct btrfs_stripe_hash *h;
	int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS;
	int i;

	if (info->stripe_hash_table)
		return 0;

	/*
	 * The table is large, starting with order 4 and can go as high as
	 * order 7 in case lock debugging is turned on.
	 *
	 * Try harder to allocate and fallback to vmalloc to lower the chance
	 * of a failing mount.
	 */
	table = kvzalloc(struct_size(table, table, num_entries), GFP_KERNEL);
	if (!table)
		return -ENOMEM;

	spin_lock_init(&table->cache_lock);
	INIT_LIST_HEAD(&table->stripe_cache);

	h = table->table;

	for (i = 0; i < num_entries; i++) {
		cur = h + i;
		INIT_LIST_HEAD(&cur->hash_list);
		spin_lock_init(&cur->lock);
	}

	x = cmpxchg(&info->stripe_hash_table, NULL, table);
	kvfree(x);
	return 0;
}
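/*
 * Editor's note: this allocation happens once, when the first RAID56 chunk
 * is seen, so GFP_KERNEL is fine here; the per-IO allocations later in this
 * file use GFP_NOFS to avoid re-entering the filesystem under memory
 * pressure.  kvzalloc() falls back to vmalloc() for the ~2048-bucket table,
 * which is what the "order 4 .. order 7" comment above is about.
 */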
/*
 * caching an rbio means to copy anything from the
 * bio_sectors array into the stripe_pages array.  We
 * use the page uptodate bit in the stripe cache array
 * to indicate if it has valid data
 *
 * once the caching is done, we set the cache ready
 * bit.
 */
static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
{
	int i;
	int ret;

	ret = alloc_rbio_pages(rbio);
	if (ret)
		return;

	for (i = 0; i < rbio->nr_sectors; i++) {
		/* Some range not covered by bio (partial write), skip it */
		if (!rbio->bio_sectors[i].page) {
			/*
			 * Even if the sector is not covered by bio, if it is
			 * a data sector it should still be uptodate as it is
			 * read from disk.
			 */
			if (i < rbio->nr_data * rbio->stripe_nsectors)
				ASSERT(rbio->stripe_sectors[i].uptodate);
			continue;
		}

		ASSERT(rbio->stripe_sectors[i].page);
		memcpy_page(rbio->stripe_sectors[i].page,
			    rbio->stripe_sectors[i].pgoff,
			    rbio->bio_sectors[i].page,
			    rbio->bio_sectors[i].pgoff,
			    rbio->bioc->fs_info->sectorsize);
		rbio->stripe_sectors[i].uptodate = 1;
	}
	set_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
}
/*
 * we hash on the first logical address of the stripe
 */
static int rbio_bucket(struct btrfs_raid_bio *rbio)
{
	u64 num = rbio->bioc->full_stripe_logical;

	/*
	 * we shift down quite a bit.  We're using byte
	 * addressing, and most of the lower bits are zeros.
	 * This tends to upset hash_64, and it consistently
	 * returns just one or two different values.
	 *
	 * shifting off the lower bits fixes things.
	 */
	return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS);
}
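/*
 * Editor's note (illustrative): full stripes start at multiples of
 * BTRFS_STRIPE_LEN (64K) within a chunk, so the low 16 bits of
 * full_stripe_logical are (almost) always zero.  Feeding the raw value to
 * hash_64() would waste those bits; shifting by 16 first gives the hash
 * something that actually varies between neighbouring stripes.
 */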
static bool full_page_sectors_uptodate(struct btrfs_raid_bio *rbio,
				       unsigned int page_nr)
{
	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
	const u32 sectors_per_page = PAGE_SIZE / sectorsize;
	int i;

	ASSERT(page_nr < rbio->nr_pages);

	for (i = sectors_per_page * page_nr;
	     i < sectors_per_page * page_nr + sectors_per_page;
	     i++) {
		if (!rbio->stripe_sectors[i].uptodate)
			return false;
	}
	return true;
}
/*
 * Update the stripe_sectors[] array to use the correct page and pgoff.
 *
 * Should be called every time any page pointer in stripe_pages[] is modified.
 */
static void index_stripe_sectors(struct btrfs_raid_bio *rbio)
{
	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
	u32 offset;
	int i;

	for (i = 0, offset = 0; i < rbio->nr_sectors; i++, offset += sectorsize) {
		int page_index = offset >> PAGE_SHIFT;

		ASSERT(page_index < rbio->nr_pages);
		rbio->stripe_sectors[i].page = rbio->stripe_pages[page_index];
		rbio->stripe_sectors[i].pgoff = offset_in_page(offset);
	}
}
static void steal_rbio_page(struct btrfs_raid_bio *src,
			    struct btrfs_raid_bio *dest, int page_nr)
{
	const u32 sectorsize = src->bioc->fs_info->sectorsize;
	const u32 sectors_per_page = PAGE_SIZE / sectorsize;
	int i;

	if (dest->stripe_pages[page_nr])
		__free_page(dest->stripe_pages[page_nr]);
	dest->stripe_pages[page_nr] = src->stripe_pages[page_nr];
	src->stripe_pages[page_nr] = NULL;

	/* Also update the sector->uptodate bits. */
	for (i = sectors_per_page * page_nr;
	     i < sectors_per_page * page_nr + sectors_per_page; i++)
		dest->stripe_sectors[i].uptodate = true;
}
static bool is_data_stripe_page(struct btrfs_raid_bio *rbio, int page_nr)
{
	const int sector_nr = (page_nr << PAGE_SHIFT) >>
			      rbio->bioc->fs_info->sectorsize_bits;

	/*
	 * We have ensured PAGE_SIZE is aligned with sectorsize, thus
	 * we won't have a page which is half data half parity.
	 *
	 * Thus if the first sector of the page belongs to data stripes, then
	 * the full page belongs to data stripes.
	 */
	return (sector_nr < rbio->nr_data * rbio->stripe_nsectors);
}
/*
 * Stealing an rbio means taking all the uptodate pages from the stripe array
 * in the source rbio and putting them into the destination rbio.
 *
 * This will also update the involved stripe_sectors[] which are referring to
 * the old pages.
 */
static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest)
{
	int i;

	if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags))
		return;

	for (i = 0; i < dest->nr_pages; i++) {
		struct page *p = src->stripe_pages[i];

		/*
		 * We don't need to steal P/Q pages as they will always be
		 * regenerated for RMW or full write anyway.
		 */
		if (!is_data_stripe_page(src, i))
			continue;

		/*
		 * If @src already has RBIO_CACHE_READY_BIT, it should have
		 * all data stripe pages present and uptodate.
		 */
		ASSERT(p);
		ASSERT(full_page_sectors_uptodate(src, i));
		steal_rbio_page(src, dest, i);
	}
	index_stripe_sectors(dest);
	index_stripe_sectors(src);
}
/*
 * merging means we take the bio_list from the victim and
 * splice it into the destination.  The victim should
 * be discarded afterwards.
 *
 * must be called with dest->rbio_list_lock held
 */
static void merge_rbio(struct btrfs_raid_bio *dest,
		       struct btrfs_raid_bio *victim)
{
	bio_list_merge_init(&dest->bio_list, &victim->bio_list);
	dest->bio_list_bytes += victim->bio_list_bytes;
	/* Also inherit the bitmaps from @victim. */
	bitmap_or(&dest->dbitmap, &victim->dbitmap, &dest->dbitmap,
		  dest->stripe_nsectors);
}
/*
 * used to prune items that are in the cache.  The caller
 * must hold the hash table lock.
 */
static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
{
	int bucket = rbio_bucket(rbio);
	struct btrfs_stripe_hash_table *table;
	struct btrfs_stripe_hash *h;
	int freeit = 0;

	/*
	 * check the bit again under the hash table lock.
	 */
	if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
		return;

	table = rbio->bioc->fs_info->stripe_hash_table;
	h = table->table + bucket;

	/* hold the lock for the bucket because we may be
	 * removing it from the hash table
	 */
	spin_lock(&h->lock);

	/*
	 * hold the lock for the bio list because we need
	 * to make sure the bio list is empty
	 */
	spin_lock(&rbio->bio_list_lock);

	if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) {
		list_del_init(&rbio->stripe_cache);
		table->cache_size -= 1;
		freeit = 1;

		/* if the bio list isn't empty, this rbio is
		 * still involved in an IO.  We take it out
		 * of the cache list, and drop the ref that
		 * was held for the list.
		 *
		 * If the bio_list was empty, we also remove
		 * the rbio from the hash_table, and drop
		 * the corresponding ref
		 */
		if (bio_list_empty(&rbio->bio_list)) {
			if (!list_empty(&rbio->hash_list)) {
				list_del_init(&rbio->hash_list);
				refcount_dec(&rbio->refs);
				BUG_ON(!list_empty(&rbio->plug_list));
			}
		}
	}

	spin_unlock(&rbio->bio_list_lock);
	spin_unlock(&h->lock);

	if (freeit)
		free_raid_bio(rbio);
}
/*
 * prune a given rbio from the cache
 */
static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
{
	struct btrfs_stripe_hash_table *table;

	if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
		return;

	table = rbio->bioc->fs_info->stripe_hash_table;

	spin_lock(&table->cache_lock);
	__remove_rbio_from_cache(rbio);
	spin_unlock(&table->cache_lock);
}
/*
 * remove everything in the cache
 */
static void btrfs_clear_rbio_cache(struct btrfs_fs_info *info)
{
	struct btrfs_stripe_hash_table *table;
	struct btrfs_raid_bio *rbio;

	table = info->stripe_hash_table;

	spin_lock(&table->cache_lock);
	while (!list_empty(&table->stripe_cache)) {
		rbio = list_entry(table->stripe_cache.next,
				  struct btrfs_raid_bio,
				  stripe_cache);
		__remove_rbio_from_cache(rbio);
	}
	spin_unlock(&table->cache_lock);
}
/*
 * remove all cached entries and free the hash table
 * used by unmount
 */
void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info)
{
	if (!info->stripe_hash_table)
		return;
	btrfs_clear_rbio_cache(info);
	kvfree(info->stripe_hash_table);
	info->stripe_hash_table = NULL;
}
/*
 * insert an rbio into the stripe cache.  It
 * must have already been prepared by calling
 * cache_rbio_pages
 *
 * If this rbio was already cached, it gets
 * moved to the front of the lru.
 *
 * If the size of the rbio cache is too big, we
 * prune an item.
 */
static void cache_rbio(struct btrfs_raid_bio *rbio)
{
	struct btrfs_stripe_hash_table *table;

	if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags))
		return;

	table = rbio->bioc->fs_info->stripe_hash_table;

	spin_lock(&table->cache_lock);
	spin_lock(&rbio->bio_list_lock);

	/* bump our ref if we were not in the list before */
	if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags))
		refcount_inc(&rbio->refs);

	if (!list_empty(&rbio->stripe_cache)) {
		list_move(&rbio->stripe_cache, &table->stripe_cache);
	} else {
		list_add(&rbio->stripe_cache, &table->stripe_cache);
		table->cache_size += 1;
	}

	spin_unlock(&rbio->bio_list_lock);

	if (table->cache_size > RBIO_CACHE_SIZE) {
		struct btrfs_raid_bio *found;

		found = list_entry(table->stripe_cache.prev,
				   struct btrfs_raid_bio,
				   stripe_cache);

		if (found != rbio)
			__remove_rbio_from_cache(found);
	}

	spin_unlock(&table->cache_lock);
}
/*
 * helper function to run the xor_blocks api.  It is only
 * able to do MAX_XOR_BLOCKS at a time, so we need to
 * loop through.
 */
static void run_xor(void **pages, int src_cnt, ssize_t len)
{
	int src_off = 0;
	int xor_src_cnt = 0;
	void *dest = pages[src_cnt];

	while (src_cnt > 0) {
		xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS);
		xor_blocks(xor_src_cnt, len, dest, pages + src_off);

		src_cnt -= xor_src_cnt;
		src_off += xor_src_cnt;
	}
}
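/*
 * Editor's note (illustrative): xor_blocks() accepts at most MAX_XOR_BLOCKS
 * source buffers per call, so xoring e.g. 10 data pointers into dest takes
 * several passes, each one accumulating into the same dest buffer
 * (pages[src_cnt]).
 */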
/*
 * Returns true if the bio list inside this rbio covers an entire stripe (no
 * rmw required).
 */
static int rbio_is_full(struct btrfs_raid_bio *rbio)
{
	unsigned long size = rbio->bio_list_bytes;
	int ret = 1;

	spin_lock(&rbio->bio_list_lock);
	if (size != rbio->nr_data * BTRFS_STRIPE_LEN)
		ret = 0;
	BUG_ON(size > rbio->nr_data * BTRFS_STRIPE_LEN);
	spin_unlock(&rbio->bio_list_lock);

	return ret;
}
/*
 * returns 1 if it is safe to merge two rbios together.
 * The merging is safe if the two rbios correspond to
 * the same stripe and if they are both going in the same
 * direction (read vs write), and if neither one is
 * locked for final IO
 *
 * The caller is responsible for locking such that
 * rmw_locked is safe to test
 */
static int rbio_can_merge(struct btrfs_raid_bio *last,
			  struct btrfs_raid_bio *cur)
{
	if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) ||
	    test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags))
		return 0;

	/*
	 * we can't merge with cached rbios, since the
	 * idea is that when we merge the destination
	 * rbio is going to run our IO for us.  We can
	 * steal from cached rbios though, other functions
	 * handle that.
	 */
	if (test_bit(RBIO_CACHE_BIT, &last->flags) ||
	    test_bit(RBIO_CACHE_BIT, &cur->flags))
		return 0;

	if (last->bioc->full_stripe_logical != cur->bioc->full_stripe_logical)
		return 0;

	/* we can't merge with different operations */
	if (last->operation != cur->operation)
		return 0;
	/*
	 * We need to read the full stripe from the drive,
	 * check and repair the parity and write the new results.
	 *
	 * We're not allowed to add any new bios to the
	 * bio list here, anyone else that wants to
	 * change this stripe needs to do their own rmw.
	 */
	if (last->operation == BTRFS_RBIO_PARITY_SCRUB)
		return 0;

	if (last->operation == BTRFS_RBIO_READ_REBUILD)
		return 0;

	return 1;
}
static unsigned int rbio_stripe_sector_index(const struct btrfs_raid_bio *rbio,
					     unsigned int stripe_nr,
					     unsigned int sector_nr)
{
	ASSERT_RBIO_STRIPE(stripe_nr < rbio->real_stripes, rbio, stripe_nr);
	ASSERT_RBIO_SECTOR(sector_nr < rbio->stripe_nsectors, rbio, sector_nr);

	return stripe_nr * rbio->stripe_nsectors + sector_nr;
}

/* Return a sector from rbio->stripe_sectors, not from the bio list */
static struct sector_ptr *rbio_stripe_sector(const struct btrfs_raid_bio *rbio,
					     unsigned int stripe_nr,
					     unsigned int sector_nr)
{
	return &rbio->stripe_sectors[rbio_stripe_sector_index(rbio, stripe_nr,
							      sector_nr)];
}

/* Grab a sector inside P stripe */
static struct sector_ptr *rbio_pstripe_sector(const struct btrfs_raid_bio *rbio,
					      unsigned int sector_nr)
{
	return rbio_stripe_sector(rbio, rbio->nr_data, sector_nr);
}

/* Grab a sector inside Q stripe, return NULL if not RAID6 */
static struct sector_ptr *rbio_qstripe_sector(const struct btrfs_raid_bio *rbio,
					      unsigned int sector_nr)
{
	if (rbio->nr_data + 1 == rbio->real_stripes)
		return NULL;
	return rbio_stripe_sector(rbio, rbio->nr_data + 1, sector_nr);
}
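/*
 * Editor's note (worked example, values are illustrative): with a 64K
 * stripe and 4K sectors, stripe_nsectors is 16, so the sector at data
 * stripe 2, offset 3 lands at index 2 * 16 + 3 = 35 in stripe_sectors[].
 * The P stripe starts at index nr_data * 16 and the Q stripe (RAID6 only)
 * right after it.
 */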
/*
 * The first stripe in the table for a logical address
 * has the lock.  rbios are added in one of three ways:
 *
 * 1) Nobody has the stripe locked yet.  The rbio is given
 * the lock and 0 is returned.  The caller must start the IO
 * themselves.
 *
 * 2) Someone has the stripe locked, but we're able to merge
 * with the lock owner.  The rbio is freed and the IO will
 * start automatically along with the existing rbio.  1 is returned.
 *
 * 3) Someone has the stripe locked, but we're not able to merge.
 * The rbio is added to the lock owner's plug list, or merged into
 * an rbio already on the plug list.  When the lock owner unlocks,
 * the next rbio on the list is run and the IO is started automatically.
 * 1 is returned
 *
 * If we return 0, the caller still owns the rbio and must continue with
 * IO submission.  If we return 1, the caller must assume the rbio has
 * already been freed.
 */
static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
{
	struct btrfs_stripe_hash *h;
	struct btrfs_raid_bio *cur;
	struct btrfs_raid_bio *pending;
	struct btrfs_raid_bio *freeit = NULL;
	struct btrfs_raid_bio *cache_drop = NULL;
	int ret = 0;

	h = rbio->bioc->fs_info->stripe_hash_table->table + rbio_bucket(rbio);

	spin_lock(&h->lock);
	list_for_each_entry(cur, &h->hash_list, hash_list) {
		if (cur->bioc->full_stripe_logical != rbio->bioc->full_stripe_logical)
			continue;

		spin_lock(&cur->bio_list_lock);

		/* Can we steal this cached rbio's pages? */
		if (bio_list_empty(&cur->bio_list) &&
		    list_empty(&cur->plug_list) &&
		    test_bit(RBIO_CACHE_BIT, &cur->flags) &&
		    !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) {
			list_del_init(&cur->hash_list);
			refcount_dec(&cur->refs);

			steal_rbio(cur, rbio);
			cache_drop = cur;
			spin_unlock(&cur->bio_list_lock);

			goto lockit;
		}

		/* Can we merge into the lock owner? */
		if (rbio_can_merge(cur, rbio)) {
			merge_rbio(cur, rbio);
			spin_unlock(&cur->bio_list_lock);
			freeit = rbio;
			ret = 1;
			goto out;
		}

		/*
		 * We couldn't merge with the running rbio, see if we can merge
		 * with the pending ones.  We don't have to check for rmw_locked
		 * because there is no way they are inside finish_rmw right now
		 */
		list_for_each_entry(pending, &cur->plug_list, plug_list) {
			if (rbio_can_merge(pending, rbio)) {
				merge_rbio(pending, rbio);
				spin_unlock(&cur->bio_list_lock);
				freeit = rbio;
				ret = 1;
				goto out;
			}
		}

		/*
		 * No merging, put us on the tail of the plug list, our rbio
		 * will be started when the currently running rbio unlocks
		 */
		list_add_tail(&rbio->plug_list, &cur->plug_list);
		spin_unlock(&cur->bio_list_lock);
		ret = 1;
		goto out;
	}
lockit:
	refcount_inc(&rbio->refs);
	list_add(&rbio->hash_list, &h->hash_list);
out:
	spin_unlock(&h->lock);
	if (cache_drop)
		remove_rbio_from_cache(cache_drop);
	if (freeit)
		free_raid_bio(freeit);
	return ret;
}
static void recover_rbio_work_locked(struct work_struct *work);

/*
 * called as rmw or parity rebuild is completed.  If the plug list has more
 * rbios waiting for this stripe, the next one on the list will be started
 */
static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
{
	int bucket;
	struct btrfs_stripe_hash *h;
	int keep_cache = 0;

	bucket = rbio_bucket(rbio);
	h = rbio->bioc->fs_info->stripe_hash_table->table + bucket;

	if (list_empty(&rbio->plug_list))
		cache_rbio(rbio);

	spin_lock(&h->lock);
	spin_lock(&rbio->bio_list_lock);

	if (!list_empty(&rbio->hash_list)) {
		/*
		 * if we're still cached and there is no other IO
		 * to perform, just leave this rbio here for others
		 * to steal from later
		 */
		if (list_empty(&rbio->plug_list) &&
		    test_bit(RBIO_CACHE_BIT, &rbio->flags)) {
			keep_cache = 1;
			clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
			BUG_ON(!bio_list_empty(&rbio->bio_list));
			goto done;
		}

		list_del_init(&rbio->hash_list);
		refcount_dec(&rbio->refs);

		/*
		 * we use the plug list to hold all the rbios
		 * waiting for the chance to lock this stripe.
		 * hand the lock over to one of them.
		 */
		if (!list_empty(&rbio->plug_list)) {
			struct btrfs_raid_bio *next;
			struct list_head *head = rbio->plug_list.next;

			next = list_entry(head, struct btrfs_raid_bio,
					  plug_list);

			list_del_init(&rbio->plug_list);

			list_add(&next->hash_list, &h->hash_list);
			refcount_inc(&next->refs);
			spin_unlock(&rbio->bio_list_lock);
			spin_unlock(&h->lock);

			if (next->operation == BTRFS_RBIO_READ_REBUILD) {
				start_async_work(next, recover_rbio_work_locked);
			} else if (next->operation == BTRFS_RBIO_WRITE) {
				steal_rbio(rbio, next);
				start_async_work(next, rmw_rbio_work_locked);
			} else if (next->operation == BTRFS_RBIO_PARITY_SCRUB) {
				steal_rbio(rbio, next);
				start_async_work(next, scrub_rbio_work_locked);
			}

			goto done_nolock;
		}
	}
done:
	spin_unlock(&rbio->bio_list_lock);
	spin_unlock(&h->lock);

done_nolock:
	if (!keep_cache)
		remove_rbio_from_cache(rbio);
}
static void rbio_endio_bio_list(struct bio *cur, blk_status_t err)
{
	struct bio *next;

	while (cur) {
		next = cur->bi_next;
		cur->bi_next = NULL;
		cur->bi_status = err;
		bio_endio(cur);
		cur = next;
	}
}
/*
 * this frees the rbio and runs through all the bios in the
 * bio_list and calls end_io on them
 */
static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err)
{
	struct bio *cur = bio_list_get(&rbio->bio_list);
	struct bio *extra;

	kfree(rbio->csum_buf);
	bitmap_free(rbio->csum_bitmap);
	rbio->csum_buf = NULL;
	rbio->csum_bitmap = NULL;

	/*
	 * Clear the data bitmap, as the rbio may be cached for later usage.
	 * do this before unlock_stripe() so there will be no new bio
	 * for this bio.
	 */
	bitmap_clear(&rbio->dbitmap, 0, rbio->stripe_nsectors);

	/*
	 * At this moment, rbio->bio_list is empty, however since rbio does not
	 * always have RBIO_RMW_LOCKED_BIT set and rbio is still linked on the
	 * hash list, rbio may be merged with others so that rbio->bio_list
	 * becomes non-empty.
	 * Once unlock_stripe() is done, rbio->bio_list will not be updated any
	 * more and we can call bio_endio() on all queued bios.
	 */
	unlock_stripe(rbio);
	extra = bio_list_get(&rbio->bio_list);
	free_raid_bio(rbio);

	rbio_endio_bio_list(cur, err);
	if (extra)
		rbio_endio_bio_list(extra, err);
}
/*
 * Get a sector pointer specified by its @stripe_nr and @sector_nr.
 *
 * @rbio:               The raid bio
 * @stripe_nr:          Stripe number, valid range [0, real_stripe)
 * @sector_nr:          Sector number inside the stripe,
 *                      valid range [0, stripe_nsectors)
 * @bio_list_only:      Whether to use sectors inside the bio list only.
 *
 * The read/modify/write code wants to reuse the original bio page as much
 * as possible, and only use stripe_sectors as fallback.
 */
static struct sector_ptr *sector_in_rbio(struct btrfs_raid_bio *rbio,
					 int stripe_nr, int sector_nr,
					 bool bio_list_only)
{
	struct sector_ptr *sector;
	int index;

	ASSERT_RBIO_STRIPE(stripe_nr >= 0 && stripe_nr < rbio->real_stripes,
			   rbio, stripe_nr);
	ASSERT_RBIO_SECTOR(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors,
			   rbio, sector_nr);

	index = stripe_nr * rbio->stripe_nsectors + sector_nr;
	ASSERT(index >= 0 && index < rbio->nr_sectors);

	spin_lock(&rbio->bio_list_lock);
	sector = &rbio->bio_sectors[index];
	if (sector->page || bio_list_only) {
		/* Don't return sector without a valid page pointer */
		if (!sector->page)
			sector = NULL;
		spin_unlock(&rbio->bio_list_lock);
		return sector;
	}
	spin_unlock(&rbio->bio_list_lock);

	return &rbio->stripe_sectors[index];
}
/*
 * allocation and initial setup for the btrfs_raid_bio.  Note that
 * this does not allocate any pages for rbio->pages.
 */
static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
					 struct btrfs_io_context *bioc)
{
	const unsigned int real_stripes = bioc->num_stripes - bioc->replace_nr_stripes;
	const unsigned int stripe_npages = BTRFS_STRIPE_LEN >> PAGE_SHIFT;
	const unsigned int num_pages = stripe_npages * real_stripes;
	const unsigned int stripe_nsectors =
		BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits;
	const unsigned int num_sectors = stripe_nsectors * real_stripes;
	struct btrfs_raid_bio *rbio;

	/* PAGE_SIZE must also be aligned to sectorsize for subpage support */
	ASSERT(IS_ALIGNED(PAGE_SIZE, fs_info->sectorsize));
	/*
	 * Our current stripe len should be fixed to 64k thus stripe_nsectors
	 * (at most 16) should be no larger than BITS_PER_LONG.
	 */
	ASSERT(stripe_nsectors <= BITS_PER_LONG);

	/*
	 * Real stripes must be between 2 (2 disks RAID5, aka RAID1) and 256
	 * (limited by u8).
	 */
	ASSERT(real_stripes >= 2);
	ASSERT(real_stripes <= U8_MAX);

	rbio = kzalloc(sizeof(*rbio), GFP_NOFS);
	if (!rbio)
		return ERR_PTR(-ENOMEM);
	rbio->stripe_pages = kcalloc(num_pages, sizeof(struct page *),
				     GFP_NOFS);
	rbio->bio_sectors = kcalloc(num_sectors, sizeof(struct sector_ptr),
				    GFP_NOFS);
	rbio->stripe_sectors = kcalloc(num_sectors, sizeof(struct sector_ptr),
				       GFP_NOFS);
	rbio->finish_pointers = kcalloc(real_stripes, sizeof(void *), GFP_NOFS);
	rbio->error_bitmap = bitmap_zalloc(num_sectors, GFP_NOFS);

	if (!rbio->stripe_pages || !rbio->bio_sectors || !rbio->stripe_sectors ||
	    !rbio->finish_pointers || !rbio->error_bitmap) {
		free_raid_bio_pointers(rbio);
		kfree(rbio);
		return ERR_PTR(-ENOMEM);
	}

	bio_list_init(&rbio->bio_list);
	init_waitqueue_head(&rbio->io_wait);
	INIT_LIST_HEAD(&rbio->plug_list);
	spin_lock_init(&rbio->bio_list_lock);
	INIT_LIST_HEAD(&rbio->stripe_cache);
	INIT_LIST_HEAD(&rbio->hash_list);
	btrfs_get_bioc(bioc);
	rbio->bioc = bioc;
	rbio->nr_pages = num_pages;
	rbio->nr_sectors = num_sectors;
	rbio->real_stripes = real_stripes;
	rbio->stripe_npages = stripe_npages;
	rbio->stripe_nsectors = stripe_nsectors;
	refcount_set(&rbio->refs, 1);
	atomic_set(&rbio->stripes_pending, 0);

	ASSERT(btrfs_nr_parity_stripes(bioc->map_type));
	rbio->nr_data = real_stripes - btrfs_nr_parity_stripes(bioc->map_type);
	ASSERT(rbio->nr_data > 0);

	return rbio;
}
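/*
 * Editor's note (worked example, illustrative): a 3-disk RAID5 full stripe
 * with 4K pages and 4K sectors gives stripe_npages = stripe_nsectors = 16
 * and real_stripes = 3, so num_pages = num_sectors = 48 and nr_data = 2.
 * The arrays above are sized from those numbers; the pages themselves are
 * only allocated later by the alloc_rbio_*_pages() helpers.
 */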
/* allocate pages for all the stripes in the bio, including parity */
static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
{
	int ret;

	ret = btrfs_alloc_page_array(rbio->nr_pages, rbio->stripe_pages, false);
	if (ret < 0)
		return ret;
	/* Mapping all sectors */
	index_stripe_sectors(rbio);
	return 0;
}

/* only allocate pages for p/q stripes */
static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
{
	const int data_pages = rbio->nr_data * rbio->stripe_npages;
	int ret;

	ret = btrfs_alloc_page_array(rbio->nr_pages - data_pages,
				     rbio->stripe_pages + data_pages, false);
	if (ret < 0)
		return ret;

	index_stripe_sectors(rbio);
	return 0;
}
/*
 * Return the total number of errors found in the vertical stripe of @sector_nr.
 *
 * @faila and @failb will also be updated to the first and second stripe
 * number of the errors.
 */
static int get_rbio_veritical_errors(struct btrfs_raid_bio *rbio, int sector_nr,
				     int *faila, int *failb)
{
	int stripe_nr;
	int found_errors = 0;

	if (faila || failb) {
		/*
		 * Both @faila and @failb should be valid pointers if any of
		 * them is specified.
		 */
		ASSERT(faila && failb);
		*faila = -1;
		*failb = -1;
	}

	for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) {
		int total_sector_nr = stripe_nr * rbio->stripe_nsectors + sector_nr;

		if (test_bit(total_sector_nr, rbio->error_bitmap)) {
			found_errors++;
			if (faila) {
				/* Update faila and failb. */
				if (*faila < 0)
					*faila = stripe_nr;
				else if (*failb < 0)
					*failb = stripe_nr;
			}
		}
	}
	return found_errors;
}
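/*
 * Editor's note (illustrative): for a RAID6 vertical stripe where the
 * sectors of stripe 1 and stripe 3 are marked in error_bitmap, this returns
 * 2 with *faila = 1 and *failb = 3.  Callers compare the count against
 * bioc->max_errors (1 for RAID5, 2 for RAID6) to decide whether the
 * vertical stripe is still recoverable.
 */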
/*
 * Add a single sector @sector into our list of bios for IO.
 *
 * Return 0 if everything went well.
 * Return <0 for error.
 */
static int rbio_add_io_sector(struct btrfs_raid_bio *rbio,
			      struct bio_list *bio_list,
			      struct sector_ptr *sector,
			      unsigned int stripe_nr,
			      unsigned int sector_nr,
			      enum req_op op)
{
	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
	struct bio *last = bio_list->tail;
	int ret;
	struct bio *bio;
	struct btrfs_io_stripe *stripe;
	u64 disk_start;

	/*
	 * Note: here stripe_nr has taken device replace into consideration,
	 * thus it can be larger than rbio->real_stripe.
	 * So here we check against bioc->num_stripes, not rbio->real_stripes.
	 */
	ASSERT_RBIO_STRIPE(stripe_nr >= 0 && stripe_nr < rbio->bioc->num_stripes,
			   rbio, stripe_nr);
	ASSERT_RBIO_SECTOR(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors,
			   rbio, sector_nr);
	ASSERT(sector->page);

	stripe = &rbio->bioc->stripes[stripe_nr];
	disk_start = stripe->physical + sector_nr * sectorsize;

	/* if the device is missing, just fail this stripe */
	if (!stripe->dev->bdev) {
		int found_errors;

		set_bit(stripe_nr * rbio->stripe_nsectors + sector_nr,
			rbio->error_bitmap);

		/* Check if we have reached tolerance early. */
		found_errors = get_rbio_veritical_errors(rbio, sector_nr,
							 NULL, NULL);
		if (found_errors > rbio->bioc->max_errors)
			return -EIO;
		return 0;
	}

	/* see if we can add this page onto our existing bio */
	if (last) {
		u64 last_end = last->bi_iter.bi_sector << SECTOR_SHIFT;
		last_end += last->bi_iter.bi_size;

		/*
		 * we can't merge these if they are from different
		 * devices or if they are not contiguous
		 */
		if (last_end == disk_start && !last->bi_status &&
		    last->bi_bdev == stripe->dev->bdev) {
			ret = bio_add_page(last, sector->page, sectorsize,
					   sector->pgoff);
			if (ret == sectorsize)
				return 0;
		}
	}

	/* put a new bio on the list */
	bio = bio_alloc(stripe->dev->bdev,
			max(BTRFS_STRIPE_LEN >> PAGE_SHIFT, 1),
			op, GFP_NOFS);
	bio->bi_iter.bi_sector = disk_start >> SECTOR_SHIFT;
	bio->bi_private = rbio;

	__bio_add_page(bio, sector->page, sectorsize, sector->pgoff);
	bio_list_add(bio_list, bio);
	return 0;
}
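/*
 * Editor's note: the merge test above is what keeps a full-stripe write to
 * a single device down to one 64K bio instead of, say, sixteen 4K ones; a
 * new bio is only allocated when the sector is not physically contiguous
 * with the tail bio or targets a different block device.
 */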
static void index_one_bio(struct btrfs_raid_bio *rbio, struct bio *bio)
{
	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
	struct bio_vec bvec;
	struct bvec_iter iter;
	u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) -
		     rbio->bioc->full_stripe_logical;

	bio_for_each_segment(bvec, bio, iter) {
		u32 bvec_offset;

		for (bvec_offset = 0; bvec_offset < bvec.bv_len;
		     bvec_offset += sectorsize, offset += sectorsize) {
			int index = offset / sectorsize;
			struct sector_ptr *sector = &rbio->bio_sectors[index];

			sector->page = bvec.bv_page;
			sector->pgoff = bvec.bv_offset + bvec_offset;
			ASSERT(sector->pgoff < PAGE_SIZE);
		}
	}
}
/*
 * helper function to walk our bio list and populate the bio_pages array with
 * the result.  This seems expensive, but it is faster than constantly
 * searching through the bio list as we setup the IO in finish_rmw or stripe
 * reconstruction.
 *
 * This must be called before you trust the answers from page_in_rbio
 */
static void index_rbio_pages(struct btrfs_raid_bio *rbio)
{
	struct bio *bio;

	spin_lock(&rbio->bio_list_lock);
	bio_list_for_each(bio, &rbio->bio_list)
		index_one_bio(rbio, bio);

	spin_unlock(&rbio->bio_list_lock);
}
static void bio_get_trace_info(struct btrfs_raid_bio *rbio, struct bio *bio,
			       struct raid56_bio_trace_info *trace_info)
{
	const struct btrfs_io_context *bioc = rbio->bioc;
	int i;

	ASSERT(bioc);

	/* We rely on bio->bi_bdev to find the stripe number. */
	if (!bio->bi_bdev)
		goto not_found;

	for (i = 0; i < bioc->num_stripes; i++) {
		if (bio->bi_bdev != bioc->stripes[i].dev->bdev)
			continue;
		trace_info->stripe_nr = i;
		trace_info->devid = bioc->stripes[i].dev->devid;
		trace_info->offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) -
				     bioc->stripes[i].physical;
		return;
	}

not_found:
	trace_info->devid = -1;
	trace_info->offset = -1;
	trace_info->stripe_nr = -1;
}
static inline void bio_list_put(struct bio_list *bio_list)
{
	struct bio *bio;

	while ((bio = bio_list_pop(bio_list)))
		bio_put(bio);
}

static void assert_rbio(struct btrfs_raid_bio *rbio)
{
	if (!IS_ENABLED(CONFIG_BTRFS_ASSERT))
		return;

	/*
	 * At least two stripes (2 disks RAID5), and since real_stripes is U8,
	 * we won't go beyond 256 disks anyway.
	 */
	ASSERT_RBIO(rbio->real_stripes >= 2, rbio);
	ASSERT_RBIO(rbio->nr_data > 0, rbio);

	/*
	 * This is another check to make sure nr data stripes is smaller
	 * than total stripes.
	 */
	ASSERT_RBIO(rbio->nr_data < rbio->real_stripes, rbio);
}
/* Generate PQ for one vertical stripe. */
static void generate_pq_vertical(struct btrfs_raid_bio *rbio, int sectornr)
{
	void **pointers = rbio->finish_pointers;
	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
	struct sector_ptr *sector;
	int stripe;
	const bool has_qstripe = rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6;

	/* First collect one sector from each data stripe */
	for (stripe = 0; stripe < rbio->nr_data; stripe++) {
		sector = sector_in_rbio(rbio, stripe, sectornr, 0);
		pointers[stripe] = kmap_local_page(sector->page) +
				   sector->pgoff;
	}

	/* Then add the parity stripe */
	sector = rbio_pstripe_sector(rbio, sectornr);
	sector->uptodate = 1;
	pointers[stripe++] = kmap_local_page(sector->page) + sector->pgoff;

	if (has_qstripe) {
		/*
		 * RAID6, add the qstripe and call the library function
		 * to fill in our p/q
		 */
		sector = rbio_qstripe_sector(rbio, sectornr);
		sector->uptodate = 1;
		pointers[stripe++] = kmap_local_page(sector->page) +
				     sector->pgoff;

		raid6_call.gen_syndrome(rbio->real_stripes, sectorsize,
					pointers);
	} else {
		/* raid5 */
		memcpy(pointers[rbio->nr_data], pointers[0], sectorsize);
		run_xor(pointers + 1, rbio->nr_data - 1, sectorsize);
	}
	for (stripe = stripe - 1; stripe >= 0; stripe--)
		kunmap_local(pointers[stripe]);
}
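/*
 * Editor's note: for RAID5 the parity sector is simply the XOR of all data
 * sectors in the vertical stripe (memcpy of the first one, then run_xor()
 * over the rest).  For RAID6, raid6_call.gen_syndrome() expects nr_data + 2
 * pointers ordered data[0..n-1], P, Q and fills in both P and the
 * Reed-Solomon Q syndrome in one pass.
 */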
static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio,
				   struct bio_list *bio_list)
{
	/* The total sector number inside the full stripe. */
	int total_sector_nr;
	int stripe;
	int sectornr;
	int ret;

	ASSERT(bio_list_size(bio_list) == 0);

	/* We should have at least one data sector. */
	ASSERT(bitmap_weight(&rbio->dbitmap, rbio->stripe_nsectors));

	/*
	 * Reset errors, as we may have errors inherited from a degraded
	 * write.
	 */
	bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors);

	/*
	 * Start assembly.  Make bios for everything from the higher layers (the
	 * bio_list in our rbio) and our P/Q.  Ignore everything else.
	 */
	for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
	     total_sector_nr++) {
		struct sector_ptr *sector;

		stripe = total_sector_nr / rbio->stripe_nsectors;
		sectornr = total_sector_nr % rbio->stripe_nsectors;

		/* This vertical stripe has no data, skip it. */
		if (!test_bit(sectornr, &rbio->dbitmap))
			continue;

		if (stripe < rbio->nr_data) {
			sector = sector_in_rbio(rbio, stripe, sectornr, 1);
			if (!sector)
				continue;
		} else {
			sector = rbio_stripe_sector(rbio, stripe, sectornr);
		}

		ret = rbio_add_io_sector(rbio, bio_list, sector, stripe,
					 sectornr, REQ_OP_WRITE);
		if (ret)
			goto error;
	}

	if (likely(!rbio->bioc->replace_nr_stripes))
		return 0;

	/*
	 * Make a copy for the replace target device.
	 *
	 * Thus the source stripe number (in replace_stripe_src) should be valid.
	 */
	ASSERT(rbio->bioc->replace_stripe_src >= 0);

	for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
	     total_sector_nr++) {
		struct sector_ptr *sector;

		stripe = total_sector_nr / rbio->stripe_nsectors;
		sectornr = total_sector_nr % rbio->stripe_nsectors;

		/*
		 * For RAID56, there is only one device that can be replaced,
		 * and replace_stripe_src[0] indicates the stripe number we
		 * need to copy from.
		 */
		if (stripe != rbio->bioc->replace_stripe_src) {
			/*
			 * We can skip the whole stripe completely, note
			 * total_sector_nr will be increased by one anyway.
			 */
			ASSERT(sectornr == 0);
			total_sector_nr += rbio->stripe_nsectors - 1;
			continue;
		}

		/* This vertical stripe has no data, skip it. */
		if (!test_bit(sectornr, &rbio->dbitmap))
			continue;

		if (stripe < rbio->nr_data) {
			sector = sector_in_rbio(rbio, stripe, sectornr, 1);
			if (!sector)
				continue;
		} else {
			sector = rbio_stripe_sector(rbio, stripe, sectornr);
		}

		ret = rbio_add_io_sector(rbio, bio_list, sector,
					 rbio->real_stripes,
					 sectornr, REQ_OP_WRITE);
		if (ret)
			goto error;
	}

	return 0;
error:
	bio_list_put(bio_list);
	return -EIO;
}
*rbio
, struct bio
*bio
)
1443 struct btrfs_fs_info
*fs_info
= rbio
->bioc
->fs_info
;
1444 u32 offset
= (bio
->bi_iter
.bi_sector
<< SECTOR_SHIFT
) -
1445 rbio
->bioc
->full_stripe_logical
;
1446 int total_nr_sector
= offset
>> fs_info
->sectorsize_bits
;
1448 ASSERT(total_nr_sector
< rbio
->nr_data
* rbio
->stripe_nsectors
);
1450 bitmap_set(rbio
->error_bitmap
, total_nr_sector
,
1451 bio
->bi_iter
.bi_size
>> fs_info
->sectorsize_bits
);
1454 * Special handling for raid56_alloc_missing_rbio() used by
1455 * scrub/replace. Unlike call path in raid56_parity_recover(), they
1456 * pass an empty bio here. Thus we have to find out the missing device
1457 * and mark the stripe error instead.
1459 if (bio
->bi_iter
.bi_size
== 0) {
1460 bool found_missing
= false;
1463 for (stripe_nr
= 0; stripe_nr
< rbio
->real_stripes
; stripe_nr
++) {
1464 if (!rbio
->bioc
->stripes
[stripe_nr
].dev
->bdev
) {
1465 found_missing
= true;
1466 bitmap_set(rbio
->error_bitmap
,
1467 stripe_nr
* rbio
->stripe_nsectors
,
1468 rbio
->stripe_nsectors
);
1471 ASSERT(found_missing
);
/*
 * For subpage case, we can no longer set page Up-to-date directly for
 * stripe_pages[], thus we need to locate the sector.
 */
static struct sector_ptr *find_stripe_sector(struct btrfs_raid_bio *rbio,
					     struct page *page,
					     unsigned int pgoff)
{
	int i;

	for (i = 0; i < rbio->nr_sectors; i++) {
		struct sector_ptr *sector = &rbio->stripe_sectors[i];

		if (sector->page == page && sector->pgoff == pgoff)
			return sector;
	}
	return NULL;
}
/*
 * this sets each page in the bio uptodate.  It should only be used on private
 * rbio pages, nothing that comes in from the higher layers
 */
static void set_bio_pages_uptodate(struct btrfs_raid_bio *rbio, struct bio *bio)
{
	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
	struct bio_vec *bvec;
	struct bvec_iter_all iter_all;

	ASSERT(!bio_flagged(bio, BIO_CLONED));

	bio_for_each_segment_all(bvec, bio, iter_all) {
		struct sector_ptr *sector;
		int pgoff;

		for (pgoff = bvec->bv_offset; pgoff - bvec->bv_offset < bvec->bv_len;
		     pgoff += sectorsize) {
			sector = find_stripe_sector(rbio, bvec->bv_page, pgoff);
			ASSERT(sector);
			if (sector)
				sector->uptodate = 1;
		}
	}
}
static int get_bio_sector_nr(struct btrfs_raid_bio *rbio, struct bio *bio)
{
	struct bio_vec *bv = bio_first_bvec_all(bio);
	int i;

	for (i = 0; i < rbio->nr_sectors; i++) {
		struct sector_ptr *sector;

		sector = &rbio->stripe_sectors[i];
		if (sector->page == bv->bv_page && sector->pgoff == bv->bv_offset)
			break;
		sector = &rbio->bio_sectors[i];
		if (sector->page == bv->bv_page && sector->pgoff == bv->bv_offset)
			break;
	}
	ASSERT(i < rbio->nr_sectors);
	return i;
}
static void rbio_update_error_bitmap(struct btrfs_raid_bio *rbio, struct bio *bio)
{
	int total_sector_nr = get_bio_sector_nr(rbio, bio);
	u32 bio_size = 0;
	struct bio_vec *bvec;
	int i;

	bio_for_each_bvec_all(bvec, bio, i)
		bio_size += bvec->bv_len;

	/*
	 * Since we can have multiple bios touching the error_bitmap, we cannot
	 * call bitmap_set() without protection.
	 *
	 * Instead use set_bit() for each bit, as set_bit() itself is atomic.
	 */
	for (i = total_sector_nr; i < total_sector_nr +
	     (bio_size >> rbio->bioc->fs_info->sectorsize_bits); i++)
		set_bit(i, rbio->error_bitmap);
}
/* Verify the data sectors at read time. */
static void verify_bio_data_sectors(struct btrfs_raid_bio *rbio,
				    struct bio *bio)
{
	struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
	int total_sector_nr = get_bio_sector_nr(rbio, bio);
	struct bio_vec *bvec;
	struct bvec_iter_all iter_all;

	/* No data csum for the whole stripe, no need to verify. */
	if (!rbio->csum_bitmap || !rbio->csum_buf)
		return;

	/* P/Q stripes, they have no data csum to verify against. */
	if (total_sector_nr >= rbio->nr_data * rbio->stripe_nsectors)
		return;

	bio_for_each_segment_all(bvec, bio, iter_all) {
		int bv_offset;

		for (bv_offset = bvec->bv_offset;
		     bv_offset < bvec->bv_offset + bvec->bv_len;
		     bv_offset += fs_info->sectorsize, total_sector_nr++) {
			u8 csum_buf[BTRFS_CSUM_SIZE];
			u8 *expected_csum = rbio->csum_buf +
					    total_sector_nr * fs_info->csum_size;
			int ret;

			/* No csum for this sector, skip to the next sector. */
			if (!test_bit(total_sector_nr, rbio->csum_bitmap))
				continue;

			ret = btrfs_check_sector_csum(fs_info, bvec->bv_page,
				bv_offset, csum_buf, expected_csum);
			if (ret < 0)
				set_bit(total_sector_nr, rbio->error_bitmap);
		}
	}
}
*bio
)
1602 struct btrfs_raid_bio
*rbio
= bio
->bi_private
;
1604 if (bio
->bi_status
) {
1605 rbio_update_error_bitmap(rbio
, bio
);
1607 set_bio_pages_uptodate(rbio
, bio
);
1608 verify_bio_data_sectors(rbio
, bio
);
1612 if (atomic_dec_and_test(&rbio
->stripes_pending
))
1613 wake_up(&rbio
->io_wait
);
static void submit_read_wait_bio_list(struct btrfs_raid_bio *rbio,
				      struct bio_list *bio_list)
{
	struct bio *bio;

	atomic_set(&rbio->stripes_pending, bio_list_size(bio_list));
	while ((bio = bio_list_pop(bio_list))) {
		bio->bi_end_io = raid_wait_read_end_io;

		if (trace_raid56_read_enabled()) {
			struct raid56_bio_trace_info trace_info = { 0 };

			bio_get_trace_info(rbio, bio, &trace_info);
			trace_raid56_read(rbio, bio, &trace_info);
		}
		submit_bio(bio);
	}

	wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0);
}
static int alloc_rbio_data_pages(struct btrfs_raid_bio *rbio)
{
	const int data_pages = rbio->nr_data * rbio->stripe_npages;
	int ret;

	ret = btrfs_alloc_page_array(data_pages, rbio->stripe_pages, false);
	if (ret < 0)
		return ret;

	index_stripe_sectors(rbio);
	return 0;
}
/*
 * We use plugging call backs to collect full stripes.
 * Any time we get a partial stripe write while plugged
 * we collect it into a list.  When the unplug comes down,
 * we sort the list by logical block number and merge
 * everything we can into the same rbios
 */
struct btrfs_plug_cb {
	struct blk_plug_cb cb;
	struct btrfs_fs_info *info;
	struct list_head rbio_list;
};

/*
 * rbios on the plug list are sorted for easier merging.
 */
static int plug_cmp(void *priv, const struct list_head *a,
		    const struct list_head *b)
{
	const struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio,
						       plug_list);
	const struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio,
						       plug_list);
	u64 a_sector = ra->bio_list.head->bi_iter.bi_sector;
	u64 b_sector = rb->bio_list.head->bi_iter.bi_sector;

	if (a_sector < b_sector)
		return -1;
	if (a_sector > b_sector)
		return 1;
	return 0;
}
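/*
 * Editor's note: list_sort() is a stable merge sort, so ordering the
 * plugged rbios by the starting sector of their first bio puts partial
 * writes to the same full stripe next to each other, which is what lets
 * raid_unplug() below merge them before kicking off RMW work.
 */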
static void raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
{
	struct btrfs_plug_cb *plug = container_of(cb, struct btrfs_plug_cb, cb);
	struct btrfs_raid_bio *cur;
	struct btrfs_raid_bio *last = NULL;

	list_sort(NULL, &plug->rbio_list, plug_cmp);

	while (!list_empty(&plug->rbio_list)) {
		cur = list_entry(plug->rbio_list.next,
				 struct btrfs_raid_bio, plug_list);
		list_del_init(&cur->plug_list);

		if (rbio_is_full(cur)) {
			/* We have a full stripe, queue it down. */
			start_async_work(cur, rmw_rbio_work);
			continue;
		}
		if (last) {
			if (rbio_can_merge(last, cur)) {
				merge_rbio(last, cur);
				free_raid_bio(cur);
				continue;
			}
			start_async_work(last, rmw_rbio_work);
		}
		last = cur;
	}
	if (last)
		start_async_work(last, rmw_rbio_work);
	kfree(plug);
}
/* Add the original bio into rbio->bio_list, and update rbio::dbitmap. */
static void rbio_add_bio(struct btrfs_raid_bio *rbio, struct bio *orig_bio)
{
	const struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
	const u64 orig_logical = orig_bio->bi_iter.bi_sector << SECTOR_SHIFT;
	const u64 full_stripe_start = rbio->bioc->full_stripe_logical;
	const u32 orig_len = orig_bio->bi_iter.bi_size;
	const u32 sectorsize = fs_info->sectorsize;
	u64 cur_logical;

	ASSERT_RBIO_LOGICAL(orig_logical >= full_stripe_start &&
			    orig_logical + orig_len <= full_stripe_start +
			    rbio->nr_data * BTRFS_STRIPE_LEN,
			    rbio, orig_logical);

	bio_list_add(&rbio->bio_list, orig_bio);
	rbio->bio_list_bytes += orig_bio->bi_iter.bi_size;

	/* Update the dbitmap. */
	for (cur_logical = orig_logical; cur_logical < orig_logical + orig_len;
	     cur_logical += sectorsize) {
		int bit = ((u32)(cur_logical - full_stripe_start) >>
			   fs_info->sectorsize_bits) % rbio->stripe_nsectors;

		set_bit(bit, &rbio->dbitmap);
	}
}
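/*
 * Editor's note (worked example, illustrative): with 4K sectors and a 64K
 * stripe, a 4K write at full_stripe_start + 68K maps to bit
 * ((68K >> 12) % 16) = 1, i.e. the second vertical stripe, regardless of
 * which data stripe the write physically lands on.
 */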
/*
 * our main entry point for writes from the rest of the FS.
 */
void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc)
{
	struct btrfs_fs_info *fs_info = bioc->fs_info;
	struct btrfs_raid_bio *rbio;
	struct btrfs_plug_cb *plug = NULL;
	struct blk_plug_cb *cb;

	rbio = alloc_rbio(fs_info, bioc);
	if (IS_ERR(rbio)) {
		bio->bi_status = errno_to_blk_status(PTR_ERR(rbio));
		bio_endio(bio);
		return;
	}
	rbio->operation = BTRFS_RBIO_WRITE;
	rbio_add_bio(rbio, bio);

	/*
	 * Don't plug on full rbios, just get them out the door
	 * as quickly as we can
	 */
	if (!rbio_is_full(rbio)) {
		cb = blk_check_plugged(raid_unplug, fs_info, sizeof(*plug));
		if (cb) {
			plug = container_of(cb, struct btrfs_plug_cb, cb);
			if (!plug->info) {
				plug->info = fs_info;
				INIT_LIST_HEAD(&plug->rbio_list);
			}
			list_add_tail(&rbio->plug_list, &plug->rbio_list);
			return;
		}
	}

	/*
	 * Either we don't have any existing plug, or we're doing a full stripe,
	 * queue the rmw work now.
	 */
	start_async_work(rbio, rmw_rbio_work);
}
static int verify_one_sector(struct btrfs_raid_bio *rbio,
			     int stripe_nr, int sector_nr)
{
	struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
	struct sector_ptr *sector;
	u8 csum_buf[BTRFS_CSUM_SIZE];
	u8 *csum_expected;
	int ret;

	if (!rbio->csum_bitmap || !rbio->csum_buf)
		return 0;

	/* No way to verify P/Q as they are not covered by data csum. */
	if (stripe_nr >= rbio->nr_data)
		return 0;
	/*
	 * If we're rebuilding a read, we have to use pages from the
	 * bio list if possible.
	 */
	if (rbio->operation == BTRFS_RBIO_READ_REBUILD) {
		sector = sector_in_rbio(rbio, stripe_nr, sector_nr, 0);
	} else {
		sector = rbio_stripe_sector(rbio, stripe_nr, sector_nr);
	}

	ASSERT(sector->page);

	csum_expected = rbio->csum_buf +
			(stripe_nr * rbio->stripe_nsectors + sector_nr) *
			fs_info->csum_size;
	ret = btrfs_check_sector_csum(fs_info, sector->page, sector->pgoff,
				      csum_buf, csum_expected);
	return ret;
}
/*
 * Recover a vertical stripe specified by @sector_nr.
 * @*pointers are the pre-allocated pointers by the caller, so we don't
 * need to allocate/free the pointers again and again.
 */
static int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr,
			    void **pointers, void **unmap_array)
{
	struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
	struct sector_ptr *sector;
	const u32 sectorsize = fs_info->sectorsize;
	int found_errors;
	int faila;
	int failb;
	int stripe_nr;
	int ret = 0;

	/*
	 * Now we just use bitmap to mark the horizontal stripes in
	 * which we have data when doing parity scrub.
	 */
	if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB &&
	    !test_bit(sector_nr, &rbio->dbitmap))
		return 0;

	found_errors = get_rbio_veritical_errors(rbio, sector_nr, &faila,
						 &failb);
	/*
	 * No errors in the vertical stripe, skip it.  Can happen for recovery
	 * which only part of a stripe failed csum check.
	 */
	if (!found_errors)
		return 0;

	if (found_errors > rbio->bioc->max_errors)
		return -EIO;

	/*
	 * Setup our array of pointers with sectors from each stripe
	 *
	 * NOTE: store a duplicate array of pointers to preserve the
	 * pointer order.
	 */
	for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) {
		/*
		 * If we're rebuilding a read, we have to use pages from the
		 * bio list if possible.
		 */
		if (rbio->operation == BTRFS_RBIO_READ_REBUILD) {
			sector = sector_in_rbio(rbio, stripe_nr, sector_nr, 0);
		} else {
			sector = rbio_stripe_sector(rbio, stripe_nr, sector_nr);
		}
		ASSERT(sector->page);
		pointers[stripe_nr] = kmap_local_page(sector->page) +
				      sector->pgoff;
		unmap_array[stripe_nr] = pointers[stripe_nr];
	}

	/* All raid6 handling here */
	if (rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6) {
		/* Single failure, rebuild from parity raid5 style */
		if (failb < 0) {
			if (faila == rbio->nr_data)
				/*
				 * Just the P stripe has failed, without
				 * a bad data or Q stripe.
				 * We have nothing to do, just skip the
				 * recovery for this stripe.
				 */
				goto cleanup;
			/*
			 * a single failure in raid6 is rebuilt
			 * in the pstripe code below
			 */
			goto pstripe;
		}

		/*
		 * If the q stripe is failed, do a pstripe reconstruction from
		 * the xors.
		 * If both the q stripe and the P stripe are failed, we're
		 * here due to a crc mismatch and we can't give them the
		 * data they want.
		 */
		if (failb == rbio->real_stripes - 1) {
			if (faila == rbio->real_stripes - 2)
				/*
				 * Only P and Q are corrupted.
				 * We only care about data stripes recovery,
				 * can skip this vertical stripe.
				 */
				goto cleanup;
			/*
			 * Otherwise we have one bad data stripe and
			 * a good P stripe.  raid5!
			 */
			goto pstripe;
		}

		if (failb == rbio->real_stripes - 2) {
			raid6_datap_recov(rbio->real_stripes, sectorsize,
					  faila, pointers);
		} else {
			raid6_2data_recov(rbio->real_stripes, sectorsize,
					  faila, failb, pointers);
		}
	} else {
		void *p;

		/* Rebuild from P stripe here (raid5 or raid6). */
		ASSERT(failb == -1);
pstripe:
		/* Copy parity block into failed block to start with */
		memcpy(pointers[faila], pointers[rbio->nr_data], sectorsize);

		/* Rearrange the pointer array */
		p = pointers[faila];
		for (stripe_nr = faila; stripe_nr < rbio->nr_data - 1;
		     stripe_nr++)
			pointers[stripe_nr] = pointers[stripe_nr + 1];
		pointers[rbio->nr_data - 1] = p;

		/* Xor in the rest */
		run_xor(pointers, rbio->nr_data - 1, sectorsize);
	}

	/*
	 * No matter if this is a RMW or recovery, we should have all
	 * failed sectors repaired in the vertical stripe, thus they are now
	 * uptodate.
	 * Especially if we determine to cache the rbio, we need to
	 * have at least all data sectors uptodate.
	 *
	 * If possible, also check if the repaired sector matches its data
	 * checksum.
	 */
	if (faila >= 0) {
		ret = verify_one_sector(rbio, faila, sector_nr);
		if (ret < 0)
			goto cleanup;

		sector = rbio_stripe_sector(rbio, faila, sector_nr);
		sector->uptodate = 1;
	}
	if (failb >= 0) {
		ret = verify_one_sector(rbio, failb, sector_nr);
		if (ret < 0)
			goto cleanup;

		sector = rbio_stripe_sector(rbio, failb, sector_nr);
		sector->uptodate = 1;
	}

cleanup:
	for (stripe_nr = rbio->real_stripes - 1; stripe_nr >= 0; stripe_nr--)
		kunmap_local(unmap_array[stripe_nr]);
	return ret;
}
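/*
 * Editor's note, summarizing the RAID6 branches above (illustrative):
 *   - only P bad (faila == nr_data, failb < 0): nothing to rebuild, skip;
 *   - one data stripe bad: rebuild it from P via XOR (the pstripe path);
 *   - one data stripe + Q bad: same XOR path, Q is not needed for the data;
 *   - one data stripe + P bad: raid6_datap_recov();
 *   - two data stripes bad: raid6_2data_recov();
 *   - P and Q both bad: the data itself is intact, skip.
 */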
static int recover_sectors(struct btrfs_raid_bio *rbio)
{
	void **pointers = NULL;
	void **unmap_array = NULL;
	int sectornr;
	int ret = 0;

	/*
	 * @pointers array stores the pointer for each sector.
	 *
	 * @unmap_array stores copy of pointers that does not get reordered
	 * during reconstruction so that kunmap_local works.
	 */
	pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
	unmap_array = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
	if (!pointers || !unmap_array) {
		ret = -ENOMEM;
		goto out;
	}

	if (rbio->operation == BTRFS_RBIO_READ_REBUILD) {
		spin_lock(&rbio->bio_list_lock);
		set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
		spin_unlock(&rbio->bio_list_lock);
	}

	index_rbio_pages(rbio);

	for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
		ret = recover_vertical(rbio, sectornr, pointers, unmap_array);
		if (ret < 0)
			break;
	}

out:
	kfree(pointers);
	kfree(unmap_array);
	return ret;
}
static void recover_rbio(struct btrfs_raid_bio *rbio)
{
	struct bio_list bio_list = BIO_EMPTY_LIST;
	int total_sector_nr;
	int ret = 0;

	/*
	 * Either we're doing recover for a read failure or degraded write,
	 * caller should have set error bitmap correctly.
	 */
	ASSERT(bitmap_weight(rbio->error_bitmap, rbio->nr_sectors));

	/* For recovery, we need to read all sectors including P/Q. */
	ret = alloc_rbio_pages(rbio);
	if (ret < 0)
		goto out;

	index_rbio_pages(rbio);

	/*
	 * Read everything that hasn't failed.  However this time we will
	 * not trust any cached sector.
	 * As we may read out some stale data but higher layer is not reading
	 * that stale part.
	 *
	 * So here we always re-read everything in recovery path.
	 */
	for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
	     total_sector_nr++) {
		int stripe = total_sector_nr / rbio->stripe_nsectors;
		int sectornr = total_sector_nr % rbio->stripe_nsectors;
		struct sector_ptr *sector;

		/*
		 * Skip the range which has error.  It can be a range which is
		 * marked error (for csum mismatch), or it can be a missing
		 * device.
		 */
		if (!rbio->bioc->stripes[stripe].dev->bdev ||
		    test_bit(total_sector_nr, rbio->error_bitmap)) {
			/*
			 * Also set the error bit for missing device, which
			 * may not yet have its error bit set.
			 */
			set_bit(total_sector_nr, rbio->error_bitmap);
			continue;
		}

		sector = rbio_stripe_sector(rbio, stripe, sectornr);
		ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe,
					 sectornr, REQ_OP_READ);
		if (ret < 0) {
			bio_list_put(&bio_list);
			goto out;
		}
	}

	submit_read_wait_bio_list(rbio, &bio_list);
	ret = recover_sectors(rbio);
out:
	rbio_orig_end_io(rbio, errno_to_blk_status(ret));
}
static void recover_rbio_work(struct work_struct *work)
{
	struct btrfs_raid_bio *rbio;

	rbio = container_of(work, struct btrfs_raid_bio, work);
	if (!lock_stripe_add(rbio))
		recover_rbio(rbio);
}

static void recover_rbio_work_locked(struct work_struct *work)
{
	recover_rbio(container_of(work, struct btrfs_raid_bio, work));
}
static void set_rbio_raid6_extra_error(struct btrfs_raid_bio *rbio, int mirror_num)
{
	bool found = false;
	int sector_nr;

	/*
	 * This is for RAID6 extra recovery tries, thus mirror number should
	 * be larger than 2.
	 * Mirror 1 means read from data stripes. Mirror 2 means rebuild using
	 * RAID5 methods.
	 */
	ASSERT(mirror_num > 2);
	for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) {
		int found_errors;
		int faila;
		int failb;

		found_errors = get_rbio_veritical_errors(rbio, sector_nr,
							 &faila, &failb);
		/* This vertical stripe doesn't have errors. */
		if (!found_errors)
			continue;

		/*
		 * If we found errors, there should be only one error marked
		 * by previous set_rbio_range_error().
		 */
		ASSERT(found_errors == 1);
		found = true;

		/* Now select another stripe to mark as error. */
		failb = rbio->real_stripes - (mirror_num - 1);
		if (failb <= faila)
			failb--;

		/* Set the extra bit in error bitmap. */
		if (failb >= 0)
			set_bit(failb * rbio->stripe_nsectors + sector_nr,
				rbio->error_bitmap);
	}

	/* We should have found at least one vertical stripe with error. */
	ASSERT(found);
}
/*
 * the main entry point for reads from the higher layers.  This
 * is really only called when the normal read path had a failure,
 * so we assume the bio they send down corresponds to a failed part
 * of the drive.
 */
void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc,
			   int mirror_num)
{
	struct btrfs_fs_info *fs_info = bioc->fs_info;
	struct btrfs_raid_bio *rbio;

	rbio = alloc_rbio(fs_info, bioc);
	if (IS_ERR(rbio)) {
		bio->bi_status = errno_to_blk_status(PTR_ERR(rbio));
		bio_endio(bio);
		return;
	}

	rbio->operation = BTRFS_RBIO_READ_REBUILD;
	rbio_add_bio(rbio, bio);

	set_rbio_range_error(rbio, bio);

	/*
	 * Loop retry:
	 * for 'mirror == 2', reconstruct from all other stripes.
	 * for 'mirror_num > 2', select a stripe to fail on every retry.
	 */
	if (mirror_num > 2)
		set_rbio_raid6_extra_error(rbio, mirror_num);

	start_async_work(rbio, recover_rbio_work);
}
static void fill_data_csums(struct btrfs_raid_bio *rbio)
{
	struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
	struct btrfs_root *csum_root = btrfs_csum_root(fs_info,
						       rbio->bioc->full_stripe_logical);
	const u64 start = rbio->bioc->full_stripe_logical;
	const u32 len = (rbio->nr_data * rbio->stripe_nsectors) <<
			fs_info->sectorsize_bits;
	int ret;

	/* The rbio should not have its csum buffer initialized. */
	ASSERT(!rbio->csum_buf && !rbio->csum_bitmap);

	/*
	 * Skip the csum search if:
	 *
	 * - The rbio doesn't belong to data block groups
	 *   Then we are doing IO for tree blocks, no need to search csums.
	 *
	 * - The rbio belongs to mixed block groups
	 *   This is to avoid deadlock, as we're already holding the full
	 *   stripe lock, if we trigger a metadata read, and it needs to do
	 *   raid56 recovery, we will deadlock.
	 */
	if (!(rbio->bioc->map_type & BTRFS_BLOCK_GROUP_DATA) ||
	    rbio->bioc->map_type & BTRFS_BLOCK_GROUP_METADATA)
		return;

	rbio->csum_buf = kzalloc(rbio->nr_data * rbio->stripe_nsectors *
				 fs_info->csum_size, GFP_NOFS);
	rbio->csum_bitmap = bitmap_zalloc(rbio->nr_data * rbio->stripe_nsectors,
					  GFP_NOFS);
	if (!rbio->csum_buf || !rbio->csum_bitmap) {
		ret = -ENOMEM;
		goto error;
	}

	ret = btrfs_lookup_csums_bitmap(csum_root, NULL, start, start + len - 1,
					rbio->csum_buf, rbio->csum_bitmap);
	if (ret < 0)
		goto error;
	if (bitmap_empty(rbio->csum_bitmap, len >> fs_info->sectorsize_bits))
		goto no_csum;
	return;

error:
	/*
	 * We failed to allocate memory or grab the csum, but it's not fatal,
	 * we can still continue.  But better to warn users that RMW is no
	 * longer safe for this particular sub-stripe write.
	 */
	btrfs_warn_rl(fs_info,
"sub-stripe write for full stripe %llu is not safe, failed to get csum: %d",
		      rbio->bioc->full_stripe_logical, ret);
no_csum:
	kfree(rbio->csum_buf);
	bitmap_free(rbio->csum_bitmap);
	rbio->csum_buf = NULL;
	rbio->csum_bitmap = NULL;
}

static int rmw_read_wait_recover(struct btrfs_raid_bio *rbio)
{
	struct bio_list bio_list = BIO_EMPTY_LIST;
	int total_sector_nr;
	int ret = 0;

	/*
	 * Fill the data csums we need for data verification. We need to fill
	 * the csum_bitmap/csum_buf first, as our endio function will try to
	 * verify the data sectors.
	 */
	fill_data_csums(rbio);

	/*
	 * Build a list of bios to read all sectors (including data and P/Q).
	 *
	 * This behavior is to compensate for the later csum verification and
	 * recovery.
	 */
	for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
	     total_sector_nr++) {
		struct sector_ptr *sector;
		int stripe = total_sector_nr / rbio->stripe_nsectors;
		int sectornr = total_sector_nr % rbio->stripe_nsectors;

		sector = rbio_stripe_sector(rbio, stripe, sectornr);
		ret = rbio_add_io_sector(rbio, &bio_list, sector,
					 stripe, sectornr, REQ_OP_READ);
		if (ret) {
			bio_list_put(&bio_list);
			return ret;
		}
	}

	/*
	 * We may or may not have any corrupted sectors (including missing
	 * devices and csum mismatches), just let recover_sectors() handle
	 * them all.
	 */
	submit_read_wait_bio_list(rbio, &bio_list);
	return recover_sectors(rbio);
}

static void raid_wait_write_end_io(struct bio *bio)
{
	struct btrfs_raid_bio *rbio = bio->bi_private;
	blk_status_t err = bio->bi_status;

	if (err)
		rbio_update_error_bitmap(rbio, bio);
	bio_put(bio);
	if (atomic_dec_and_test(&rbio->stripes_pending))
		wake_up(&rbio->io_wait);
}
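
/*
 * stripes_pending is primed with the full bio count before any bio is
 * submitted, so the raid_wait_write_end_io() decrements cannot reach zero
 * (and wake io_wait) until every write queued below has completed.
 */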

static void submit_write_bios(struct btrfs_raid_bio *rbio,
			      struct bio_list *bio_list)
{
	struct bio *bio;

	atomic_set(&rbio->stripes_pending, bio_list_size(bio_list));
	while ((bio = bio_list_pop(bio_list))) {
		bio->bi_end_io = raid_wait_write_end_io;

		if (trace_raid56_write_enabled()) {
			struct raid56_bio_trace_info trace_info = { 0 };

			bio_get_trace_info(rbio, bio, &trace_info);
			trace_raid56_write(rbio, bio, &trace_info);
		}
		submit_bio(bio);
	}
}

/*
 * Determine if we need to read any sector from the disk.
 * Should only be utilized in the RMW path, to skip a cached rbio.
 */
static bool need_read_stripe_sectors(struct btrfs_raid_bio *rbio)
{
	int i;

	for (i = 0; i < rbio->nr_data * rbio->stripe_nsectors; i++) {
		struct sector_ptr *sector = &rbio->stripe_sectors[i];

		/*
		 * We have a sector which has neither a page nor uptodate
		 * data, thus this rbio cannot be a cached one, as a cached
		 * one must have all its data sectors present and uptodate.
		 */
		if (!sector->page || !sector->uptodate)
			return true;
	}
	return false;
}
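
/*
 * Outline of the RMW path below: allocate P/Q pages, read and verify the
 * data stripes only when this is a sub-stripe write without a fully cached
 * rbio, lock out further merges, regenerate P/Q for every vertical stripe,
 * then submit the writes and wait for them.
 */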

static void rmw_rbio(struct btrfs_raid_bio *rbio)
{
	struct bio_list bio_list;
	int sectornr;
	int ret = 0;

	/*
	 * Allocate the pages for parity first, as P/Q pages will always be
	 * needed for both full-stripe and sub-stripe writes.
	 */
	ret = alloc_rbio_parity_pages(rbio);
	if (ret < 0)
		goto out;

	/*
	 * Either this is a full stripe write, or we have every data sector
	 * already cached, so we can go to the write path immediately.
	 */
	if (!rbio_is_full(rbio) && need_read_stripe_sectors(rbio)) {
		/*
		 * Now we're doing a sub-stripe write, which also needs all
		 * data stripes to do the full RMW.
		 */
		ret = alloc_rbio_data_pages(rbio);
		if (ret < 0)
			goto out;

		index_rbio_pages(rbio);

		ret = rmw_read_wait_recover(rbio);
		if (ret < 0)
			goto out;
	}

	/*
	 * At this stage we're not allowed to add any new bios to the
	 * bio list any more, anyone else that wants to change this stripe
	 * needs to do their own rmw.
	 */
	spin_lock(&rbio->bio_list_lock);
	set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
	spin_unlock(&rbio->bio_list_lock);

	bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors);

	index_rbio_pages(rbio);

	/*
	 * We don't cache full rbios because we're assuming
	 * the higher layers are unlikely to use this area of
	 * the disk again soon. If they do use it again,
	 * hopefully they will send another full bio.
	 */
	if (!rbio_is_full(rbio))
		cache_rbio_pages(rbio);
	else
		clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);

	for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++)
		generate_pq_vertical(rbio, sectornr);

	bio_list_init(&bio_list);
	ret = rmw_assemble_write_bios(rbio, &bio_list);
	if (ret < 0)
		goto out;

	/* We should have at least one bio assembled. */
	ASSERT(bio_list_size(&bio_list));
	submit_write_bios(rbio, &bio_list);
	wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0);

	/* We may have more errors than our tolerance during the read. */
	for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
		int found_errors;

		found_errors = get_rbio_veritical_errors(rbio, sectornr, NULL, NULL);
		if (found_errors > rbio->bioc->max_errors) {
			ret = -EIO;
			break;
		}
	}
out:
	rbio_orig_end_io(rbio, errno_to_blk_status(ret));
}
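
/*
 * As in the recovery path, lock_stripe_add() returning 0 means this rbio
 * acquired the stripe hash lock and must run the RMW itself; a non-zero
 * return means it was merged into or queued behind the current lock holder
 * and will be started when that owner unlocks.
 */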

static void rmw_rbio_work(struct work_struct *work)
{
	struct btrfs_raid_bio *rbio;

	rbio = container_of(work, struct btrfs_raid_bio, work);
	if (lock_stripe_add(rbio) == 0)
		rmw_rbio(rbio);
}

static void rmw_rbio_work_locked(struct work_struct *work)
{
	rmw_rbio(container_of(work, struct btrfs_raid_bio, work));
}

/*
 * The following code is used to scrub/replace the parity stripe.
 *
 * Caller must have already increased bio_counter for getting @bioc.
 *
 * Note: we need to make sure all the pages added to the scrub/replace
 * raid bio are correct and will not be changed during the scrub/replace.
 * That is, those pages only hold metadata or file data with checksums.
 */
struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio,
				struct btrfs_io_context *bioc,
				struct btrfs_device *scrub_dev,
				unsigned long *dbitmap, int stripe_nsectors)
{
	struct btrfs_fs_info *fs_info = bioc->fs_info;
	struct btrfs_raid_bio *rbio;
	int i;

	rbio = alloc_rbio(fs_info, bioc);
	if (IS_ERR(rbio))
		return NULL;
	bio_list_add(&rbio->bio_list, bio);
	/*
	 * This is a special bio which is used to hold the completion handler
	 * and make the scrub rbio similar to the other types.
	 */
	ASSERT(!bio->bi_iter.bi_size);
	rbio->operation = BTRFS_RBIO_PARITY_SCRUB;

	/*
	 * After mapping the bioc with BTRFS_MAP_WRITE, parities have been
	 * sorted to the end position, so this search can start from the
	 * first parity stripe.
	 */
	for (i = rbio->nr_data; i < rbio->real_stripes; i++) {
		if (bioc->stripes[i].dev == scrub_dev) {
			rbio->scrubp = i;
			break;
		}
	}
	ASSERT_RBIO_STRIPE(i < rbio->real_stripes, rbio, i);

	bitmap_copy(&rbio->dbitmap, dbitmap, stripe_nsectors);
	return rbio;
}
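
/*
 * The page index below is the flat byte offset of a sector shifted by
 * PAGE_SHIFT; e.g. with 4K sectors and 4K pages every sector gets its own
 * page, while with 64K pages sixteen consecutive sectors share one page.
 */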

/*
 * We only scrub the parity for which we have correct data on the same
 * horizontal stripe, so we don't need to allocate pages for all the stripes.
 */
static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
{
	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
	int total_sector_nr;

	for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
	     total_sector_nr++) {
		struct page *page;
		int sectornr = total_sector_nr % rbio->stripe_nsectors;
		int index = (total_sector_nr * sectorsize) >> PAGE_SHIFT;

		if (!test_bit(sectornr, &rbio->dbitmap))
			continue;
		if (rbio->stripe_pages[index])
			continue;
		page = alloc_page(GFP_NOFS);
		if (!page)
			return -ENOMEM;
		rbio->stripe_pages[index] = page;
	}
	index_stripe_sectors(rbio);
	return 0;
}
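
/*
 * finish_parity_scrub() recomputes P (and Q for RAID6) from the data
 * stripes into temporary pages, compares each recomputed sector against
 * the on-disk parity, and only queues writes for sectors that differ;
 * matching sectors have their dbitmap bit cleared so they are skipped.
 */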

static int finish_parity_scrub(struct btrfs_raid_bio *rbio)
{
	struct btrfs_io_context *bioc = rbio->bioc;
	const u32 sectorsize = bioc->fs_info->sectorsize;
	void **pointers = rbio->finish_pointers;
	unsigned long *pbitmap = &rbio->finish_pbitmap;
	int nr_data = rbio->nr_data;
	int stripe;
	int sectornr;
	bool has_qstripe;
	struct sector_ptr p_sector = { 0 };
	struct sector_ptr q_sector = { 0 };
	struct bio_list bio_list;
	int is_replace = 0;
	int ret;

	bio_list_init(&bio_list);

	if (rbio->real_stripes - rbio->nr_data == 1)
		has_qstripe = false;
	else if (rbio->real_stripes - rbio->nr_data == 2)
		has_qstripe = true;
	else
		BUG();

	/*
	 * Replace is running and our P/Q stripe is being replaced, so we
	 * need to duplicate the final write to the replace target.
	 */
	if (bioc->replace_nr_stripes && bioc->replace_stripe_src == rbio->scrubp) {
		is_replace = 1;
		bitmap_copy(pbitmap, &rbio->dbitmap, rbio->stripe_nsectors);
	}

	/*
	 * Because the higher layers (the scrubber) are unlikely to use this
	 * area of the disk again soon, don't cache it.
	 */
	clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);

	p_sector.page = alloc_page(GFP_NOFS);
	if (!p_sector.page)
		return -ENOMEM;
	p_sector.pgoff = 0;
	p_sector.uptodate = 1;

	if (has_qstripe) {
		/* RAID6, allocate and map temp space for the Q stripe */
		q_sector.page = alloc_page(GFP_NOFS);
		if (!q_sector.page) {
			__free_page(p_sector.page);
			p_sector.page = NULL;
			return -ENOMEM;
		}
		q_sector.pgoff = 0;
		q_sector.uptodate = 1;
		pointers[rbio->real_stripes - 1] = kmap_local_page(q_sector.page);
	}

	bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors);

	/* Map the parity stripe just once */
	pointers[nr_data] = kmap_local_page(p_sector.page);

	for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) {
		struct sector_ptr *sector;
		void *parity;

		/* first collect one page from each data stripe */
		for (stripe = 0; stripe < nr_data; stripe++) {
			sector = sector_in_rbio(rbio, stripe, sectornr, 0);
			pointers[stripe] = kmap_local_page(sector->page) +
					   sector->pgoff;
		}

		if (has_qstripe) {
			/* RAID6, call the library function to fill in our P/Q */
			raid6_call.gen_syndrome(rbio->real_stripes, sectorsize,
						pointers);
		} else {
			/* raid5 */
			memcpy(pointers[nr_data], pointers[0], sectorsize);
			run_xor(pointers + 1, nr_data - 1, sectorsize);
		}

		/* Check the scrubbed parity and repair it */
		sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr);
		parity = kmap_local_page(sector->page) + sector->pgoff;
		if (memcmp(parity, pointers[rbio->scrubp], sectorsize) != 0)
			memcpy(parity, pointers[rbio->scrubp], sectorsize);
		else
			/* Parity is right, no need to write it back */
			bitmap_clear(&rbio->dbitmap, sectornr, 1);
		kunmap_local(parity);

		for (stripe = nr_data - 1; stripe >= 0; stripe--)
			kunmap_local(pointers[stripe]);
	}

	kunmap_local(pointers[nr_data]);
	__free_page(p_sector.page);
	p_sector.page = NULL;
	if (q_sector.page) {
		kunmap_local(pointers[rbio->real_stripes - 1]);
		__free_page(q_sector.page);
		q_sector.page = NULL;
	}

	/*
	 * Time to start writing. Make bios for everything from the
	 * higher layers (the bio_list in our rbio) and our P/Q. Ignore
	 * everything else.
	 */
	for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) {
		struct sector_ptr *sector;

		sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr);
		ret = rbio_add_io_sector(rbio, &bio_list, sector, rbio->scrubp,
					 sectornr, REQ_OP_WRITE);
		if (ret)
			goto cleanup;
	}

	if (!is_replace)
		goto submit_write;

	/*
	 * Replace is running and our parity stripe needs to be duplicated to
	 * the target device. Check we have a valid source stripe number.
	 */
	ASSERT_RBIO(rbio->bioc->replace_stripe_src >= 0, rbio);
	for_each_set_bit(sectornr, pbitmap, rbio->stripe_nsectors) {
		struct sector_ptr *sector;

		sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr);
		ret = rbio_add_io_sector(rbio, &bio_list, sector,
					 rbio->real_stripes,
					 sectornr, REQ_OP_WRITE);
		if (ret)
			goto cleanup;
	}

submit_write:
	submit_write_bios(rbio, &bio_list);
	return 0;

cleanup:
	bio_list_put(&bio_list);
	return ret;
}

static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe)
{
	if (stripe >= 0 && stripe < rbio->nr_data)
		return 1;
	return 0;
}
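
/*
 * Rough failure budget for the checks below: max_errors is 1 for RAID5 and
 * 2 for RAID6, and the parity being scrubbed cannot be trusted for repair,
 * so more than (max_errors - 1) failed data stripes in one vertical stripe
 * makes that vertical stripe unrecoverable here.
 */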

static int recover_scrub_rbio(struct btrfs_raid_bio *rbio)
{
	void **pointers = NULL;
	void **unmap_array = NULL;
	int sector_nr;
	int ret = 0;

	/*
	 * @pointers array stores the pointer for each sector.
	 *
	 * @unmap_array stores a copy of pointers that does not get reordered
	 * during reconstruction so that kunmap_local works.
	 */
	pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
	unmap_array = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
	if (!pointers || !unmap_array) {
		ret = -ENOMEM;
		goto out;
	}

	for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) {
		int dfail = 0, failp = -1;
		int faila;
		int failb;
		int found_errors;

		found_errors = get_rbio_veritical_errors(rbio, sector_nr,
							 &faila, &failb);
		if (found_errors > rbio->bioc->max_errors) {
			ret = -EIO;
			goto out;
		}
		if (found_errors == 0)
			continue;

		/* We should have at least one error here. */
		ASSERT(faila >= 0 || failb >= 0);

		if (is_data_stripe(rbio, faila))
			dfail++;
		else if (is_parity_stripe(faila))
			failp = faila;

		if (is_data_stripe(rbio, failb))
			dfail++;
		else if (is_parity_stripe(failb))
			failp = failb;
		/*
		 * Because we cannot use the parity being scrubbed to repair
		 * data, the repair capability is reduced. (In the case of
		 * RAID5, we cannot repair anything.)
		 */
		if (dfail > rbio->bioc->max_errors - 1) {
			ret = -EIO;
			goto out;
		}
		/*
		 * If all data is good and only the parity is bad, just repair
		 * the parity; there is no need to recover data stripes.
		 */
		if (dfail == 0)
			continue;

		/*
		 * Here we got one corrupted data stripe and one corrupted
		 * parity on RAID6. If the corrupted parity is the one being
		 * scrubbed, we can luckily use the other parity to repair the
		 * data; otherwise the data stripe cannot be repaired.
		 */
		if (failp != rbio->scrubp) {
			ret = -EIO;
			goto out;
		}

		ret = recover_vertical(rbio, sector_nr, pointers, unmap_array);
		if (ret < 0)
			goto out;
	}
out:
	kfree(pointers);
	kfree(unmap_array);
	return ret;
}

static int scrub_assemble_read_bios(struct btrfs_raid_bio *rbio)
{
	struct bio_list bio_list = BIO_EMPTY_LIST;
	int total_sector_nr;
	int ret = 0;

	/* Build a list of bios to read all the missing parts. */
	for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
	     total_sector_nr++) {
		int sectornr = total_sector_nr % rbio->stripe_nsectors;
		int stripe = total_sector_nr / rbio->stripe_nsectors;
		struct sector_ptr *sector;

		/* No data in the vertical stripe, no need to read. */
		if (!test_bit(sectornr, &rbio->dbitmap))
			continue;

		/*
		 * We want to find all the sectors missing from the rbio and
		 * read them from the disk. If sector_in_rbio() finds a sector
		 * in the bio list we don't need to read it off the stripe.
		 */
		sector = sector_in_rbio(rbio, stripe, sectornr, 1);
		if (sector)
			continue;

		sector = rbio_stripe_sector(rbio, stripe, sectornr);
		/*
		 * The bio cache may have handed us an uptodate sector. If so,
		 * use it.
		 */
		if (sector->uptodate)
			continue;

		ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe,
					 sectornr, REQ_OP_READ);
		if (ret) {
			bio_list_put(&bio_list);
			return ret;
		}
	}

	submit_read_wait_bio_list(rbio, &bio_list);
	return 0;
}

static void scrub_rbio(struct btrfs_raid_bio *rbio)
{
	int sector_nr;
	int ret;

	ret = alloc_rbio_essential_pages(rbio);
	if (ret)
		goto out;

	bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors);

	ret = scrub_assemble_read_bios(rbio);
	if (ret < 0)
		goto out;

	/* We may have some failures, recover the failed sectors first. */
	ret = recover_scrub_rbio(rbio);
	if (ret < 0)
		goto out;

	/*
	 * We have every sector properly prepared. Can finish the scrub
	 * and write back the good content.
	 */
	ret = finish_parity_scrub(rbio);
	wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0);
	for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) {
		int found_errors;

		found_errors = get_rbio_veritical_errors(rbio, sector_nr, NULL, NULL);
		if (found_errors > rbio->bioc->max_errors) {
			ret = -EIO;
			break;
		}
	}
out:
	rbio_orig_end_io(rbio, errno_to_blk_status(ret));
}

static void scrub_rbio_work_locked(struct work_struct *work)
{
	scrub_rbio(container_of(work, struct btrfs_raid_bio, work));
}

void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio)
{
	if (!lock_stripe_add(rbio))
		start_async_work(rbio, scrub_rbio_work_locked);
}

/*
 * This is for scrub call sites where we already have the correct data
 * contents. This allows us to avoid reading the data stripes again.
 *
 * Unfortunately here we have to do a page copy rather than reusing the
 * pages, because the rbio has its own page management for its cache.
 */
void raid56_parity_cache_data_pages(struct btrfs_raid_bio *rbio,
				    struct page **data_pages, u64 data_logical)
{
	const u64 offset_in_full_stripe = data_logical -
					  rbio->bioc->full_stripe_logical;
	const int page_index = offset_in_full_stripe >> PAGE_SHIFT;
	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
	const u32 sectors_per_page = PAGE_SIZE / sectorsize;
	int ret;
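
	/*
	 * For example, with 4K pages: caching the data stripe that starts
	 * 128K into the full stripe gives page_index 32, and the loop below
	 * copies BTRFS_STRIPE_LEN >> PAGE_SHIFT == 16 pages for that stripe.
	 */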

	/*
	 * If we hit ENOMEM temporarily, but it later succeeds at
	 * raid56_parity_submit_scrub_rbio() time, we just do the extra read,
	 * which is not a big deal.
	 *
	 * If we hit ENOMEM later at raid56_parity_submit_scrub_rbio() time,
	 * the bio will get a proper error number set.
	 */
	ret = alloc_rbio_data_pages(rbio);
	if (ret < 0)
		return;

	/* data_logical must be at stripe boundary and inside the full stripe. */
	ASSERT(IS_ALIGNED(offset_in_full_stripe, BTRFS_STRIPE_LEN));
	ASSERT(offset_in_full_stripe < (rbio->nr_data << BTRFS_STRIPE_LEN_SHIFT));

	for (int page_nr = 0; page_nr < (BTRFS_STRIPE_LEN >> PAGE_SHIFT); page_nr++) {
		struct page *dst = rbio->stripe_pages[page_nr + page_index];
		struct page *src = data_pages[page_nr];

		memcpy_page(dst, 0, src, 0, PAGE_SIZE);
		for (int sector_nr = sectors_per_page * page_index;
		     sector_nr < sectors_per_page * (page_index + 1);
		     sector_nr++)
			rbio->stripe_sectors[sector_nr].uptodate = true;
	}
}