// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2012 Fusion-io  All rights reserved.
 * Copyright (C) 2012 Intel Corp. All rights reserved.
 */

#include <linux/sched.h>
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/raid/pq.h>
#include <linux/hash.h>
#include <linux/list_sort.h>
#include <linux/raid/xor.h>
#include <linux/mm.h>
#include "messages.h"
#include "ctree.h"
#include "disk-io.h"
#include "volumes.h"
#include "raid56.h"
#include "async-thread.h"
#include "file-item.h"
#include "btrfs_inode.h"

/* set when additional merges to this rbio are not allowed */
#define RBIO_RMW_LOCKED_BIT	1

/*
 * set when this rbio is sitting in the hash, but it is just a cache
 * of past RMW
 */
#define RBIO_CACHE_BIT		2

/*
 * set when it is safe to trust the stripe_pages for caching
 */
#define RBIO_CACHE_READY_BIT	3

#define RBIO_CACHE_SIZE 1024

#define BTRFS_STRIPE_HASH_TABLE_BITS	11
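/*
 * Note: with BTRFS_STRIPE_HASH_TABLE_BITS == 11 the hash table has
 * 1 << 11 = 2048 buckets, while the stripe cache itself is capped at
 * RBIO_CACHE_SIZE (1024) cached rbios.
 */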
static void dump_bioc(const struct btrfs_fs_info *fs_info, const struct btrfs_io_context *bioc)
{
	if (unlikely(!bioc)) {
		btrfs_crit(fs_info, "bioc=NULL");
		return;
	}
	btrfs_crit(fs_info,
"bioc logical=%llu full_stripe=%llu size=%llu map_type=0x%llx mirror=%u replace_nr_stripes=%u replace_stripe_src=%d num_stripes=%u",
		bioc->logical, bioc->full_stripe_logical, bioc->size,
		bioc->map_type, bioc->mirror_num, bioc->replace_nr_stripes,
		bioc->replace_stripe_src, bioc->num_stripes);
	for (int i = 0; i < bioc->num_stripes; i++) {
		btrfs_crit(fs_info, "    nr=%d devid=%llu physical=%llu",
			   i, bioc->stripes[i].dev->devid,
			   bioc->stripes[i].physical);
	}
}

static void btrfs_dump_rbio(const struct btrfs_fs_info *fs_info,
			    const struct btrfs_raid_bio *rbio)
{
	if (!IS_ENABLED(CONFIG_BTRFS_ASSERT))
		return;

	dump_bioc(fs_info, rbio->bioc);
	btrfs_crit(fs_info,
"rbio flags=0x%lx nr_sectors=%u nr_data=%u real_stripes=%u stripe_nsectors=%u scrubp=%u dbitmap=0x%lx",
		rbio->flags, rbio->nr_sectors, rbio->nr_data,
		rbio->real_stripes, rbio->stripe_nsectors,
		rbio->scrubp, rbio->dbitmap);
}
#define ASSERT_RBIO(expr, rbio)						\
({									\
	if (IS_ENABLED(CONFIG_BTRFS_ASSERT) && unlikely(!(expr))) {	\
		const struct btrfs_fs_info *__fs_info = (rbio)->bioc ?	\
					(rbio)->bioc->fs_info : NULL;	\
									\
		btrfs_dump_rbio(__fs_info, (rbio));			\
	}								\
	ASSERT(expr);							\
})

#define ASSERT_RBIO_STRIPE(expr, rbio, stripe_nr)			\
({									\
	if (IS_ENABLED(CONFIG_BTRFS_ASSERT) && unlikely(!(expr))) {	\
		const struct btrfs_fs_info *__fs_info = (rbio)->bioc ?	\
					(rbio)->bioc->fs_info : NULL;	\
									\
		btrfs_dump_rbio(__fs_info, (rbio));			\
		btrfs_crit(__fs_info, "stripe_nr=%d", (stripe_nr));	\
	}								\
	ASSERT(expr);							\
})

#define ASSERT_RBIO_SECTOR(expr, rbio, sector_nr)			\
({									\
	if (IS_ENABLED(CONFIG_BTRFS_ASSERT) && unlikely(!(expr))) {	\
		const struct btrfs_fs_info *__fs_info = (rbio)->bioc ?	\
					(rbio)->bioc->fs_info : NULL;	\
									\
		btrfs_dump_rbio(__fs_info, (rbio));			\
		btrfs_crit(__fs_info, "sector_nr=%d", (sector_nr));	\
	}								\
	ASSERT(expr);							\
})

#define ASSERT_RBIO_LOGICAL(expr, rbio, logical)			\
({									\
	if (IS_ENABLED(CONFIG_BTRFS_ASSERT) && unlikely(!(expr))) {	\
		const struct btrfs_fs_info *__fs_info = (rbio)->bioc ?	\
					(rbio)->bioc->fs_info : NULL;	\
									\
		btrfs_dump_rbio(__fs_info, (rbio));			\
		btrfs_crit(__fs_info, "logical=%llu", (logical));	\
	}								\
	ASSERT(expr);							\
})
/* Used by the raid56 code to lock stripes for read/modify/write */
struct btrfs_stripe_hash {
	struct list_head hash_list;
	spinlock_t lock;
};

/* Used by the raid56 code to lock stripes for read/modify/write */
struct btrfs_stripe_hash_table {
	struct list_head stripe_cache;
	spinlock_t cache_lock;
	int cache_size;
	struct btrfs_stripe_hash table[];
};

/*
 * A bvec like structure to present a sector inside a page.
 *
 * Unlike bvec we don't need bvlen, as it's fixed to sectorsize.
 */
struct sector_ptr {
	struct page *page;
	unsigned int pgoff:24;
	unsigned int uptodate:8;
};
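/*
 * There is one sector_ptr per sectorsize chunk of every stripe page, so an
 * rbio tracks nr_sectors of them in stripe_sectors[] (plus a second copy in
 * bio_sectors[] for pages coming from the bio list).  The 24-bit pgoff field
 * is enough for page sizes up to 16M.
 */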
static void rmw_rbio_work(struct work_struct *work);
static void rmw_rbio_work_locked(struct work_struct *work);
static void index_rbio_pages(struct btrfs_raid_bio *rbio);
static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);

static int finish_parity_scrub(struct btrfs_raid_bio *rbio);
static void scrub_rbio_work_locked(struct work_struct *work);
static void free_raid_bio_pointers(struct btrfs_raid_bio *rbio)
{
	bitmap_free(rbio->error_bitmap);
	kfree(rbio->stripe_pages);
	kfree(rbio->bio_sectors);
	kfree(rbio->stripe_sectors);
	kfree(rbio->finish_pointers);
}

static void free_raid_bio(struct btrfs_raid_bio *rbio)
{
	int i;

	if (!refcount_dec_and_test(&rbio->refs))
		return;

	WARN_ON(!list_empty(&rbio->stripe_cache));
	WARN_ON(!list_empty(&rbio->hash_list));
	WARN_ON(!bio_list_empty(&rbio->bio_list));

	for (i = 0; i < rbio->nr_pages; i++) {
		if (rbio->stripe_pages[i]) {
			__free_page(rbio->stripe_pages[i]);
			rbio->stripe_pages[i] = NULL;
		}
	}

	btrfs_put_bioc(rbio->bioc);
	free_raid_bio_pointers(rbio);
	kfree(rbio);
}

static void start_async_work(struct btrfs_raid_bio *rbio, work_func_t work_func)
{
	INIT_WORK(&rbio->work, work_func);
	queue_work(rbio->bioc->fs_info->rmw_workers, &rbio->work);
}
/*
 * the stripe hash table is used for locking, and to collect
 * bios in hopes of making a full stripe
 */
int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
{
	struct btrfs_stripe_hash_table *table;
	struct btrfs_stripe_hash_table *x;
	struct btrfs_stripe_hash *cur;
	struct btrfs_stripe_hash *h;
	int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS;
	int i;

	if (info->stripe_hash_table)
		return 0;

	/*
	 * The table is large, starting with order 4 and can go as high as
	 * order 7 in case lock debugging is turned on.
	 *
	 * Try harder to allocate and fallback to vmalloc to lower the chance
	 * of a failing mount.
	 */
	table = kvzalloc(struct_size(table, table, num_entries), GFP_KERNEL);
	if (!table)
		return -ENOMEM;

	spin_lock_init(&table->cache_lock);
	INIT_LIST_HEAD(&table->stripe_cache);

	h = table->table;

	for (i = 0; i < num_entries; i++) {
		cur = h + i;
		INIT_LIST_HEAD(&cur->hash_list);
		spin_lock_init(&cur->lock);
	}

	x = cmpxchg(&info->stripe_hash_table, NULL, table);
	kvfree(x);
	return 0;
}
/*
 * caching an rbio means to copy anything from the
 * bio_sectors array into the stripe_pages array.  We
 * use the page uptodate bit in the stripe cache array
 * to indicate if it has valid data
 *
 * once the caching is done, we set the cache ready
 * bit.
 */
static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
{
	int i;
	int ret;

	ret = alloc_rbio_pages(rbio);
	if (ret)
		return;

	for (i = 0; i < rbio->nr_sectors; i++) {
		/* Some range not covered by bio (partial write), skip it */
		if (!rbio->bio_sectors[i].page) {
			/*
			 * Even if the sector is not covered by bio, if it is
			 * a data sector it should still be uptodate as it is
			 * read from disk.
			 */
			if (i < rbio->nr_data * rbio->stripe_nsectors)
				ASSERT(rbio->stripe_sectors[i].uptodate);
			continue;
		}

		ASSERT(rbio->stripe_sectors[i].page);
		memcpy_page(rbio->stripe_sectors[i].page,
			    rbio->stripe_sectors[i].pgoff,
			    rbio->bio_sectors[i].page,
			    rbio->bio_sectors[i].pgoff,
			    rbio->bioc->fs_info->sectorsize);
		rbio->stripe_sectors[i].uptodate = 1;
	}
	set_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
}
/*
 * we hash on the first logical address of the stripe
 */
static int rbio_bucket(struct btrfs_raid_bio *rbio)
{
	u64 num = rbio->bioc->full_stripe_logical;

	/*
	 * we shift down quite a bit.  We're using byte
	 * addressing, and most of the lower bits are zeros.
	 * This tends to upset hash_64, and it consistently
	 * returns just one or two different values.
	 *
	 * shifting off the lower bits fixes things.
	 */
	return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS);
}
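/*
 * Example: full stripes start on BTRFS_STRIPE_LEN (64K) boundaries, so the
 * low 16 bits of full_stripe_logical carry no information; dropping them
 * with ">> 16" leaves hash_64() with the bits that actually vary between
 * stripes.
 */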
static bool full_page_sectors_uptodate(struct btrfs_raid_bio *rbio,
				       unsigned int page_nr)
{
	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
	const u32 sectors_per_page = PAGE_SIZE / sectorsize;
	int i;

	ASSERT(page_nr < rbio->nr_pages);

	for (i = sectors_per_page * page_nr;
	     i < sectors_per_page * page_nr + sectors_per_page;
	     i++) {
		if (!rbio->stripe_sectors[i].uptodate)
			return false;
	}
	return true;
}
/*
 * Update the stripe_sectors[] array to use correct page and pgoff
 *
 * Should be called every time any page pointer in stripes_pages[] got modified.
 */
static void index_stripe_sectors(struct btrfs_raid_bio *rbio)
{
	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
	u32 offset;
	int i;

	for (i = 0, offset = 0; i < rbio->nr_sectors; i++, offset += sectorsize) {
		int page_index = offset >> PAGE_SHIFT;

		ASSERT(page_index < rbio->nr_pages);
		rbio->stripe_sectors[i].page = rbio->stripe_pages[page_index];
		rbio->stripe_sectors[i].pgoff = offset_in_page(offset);
	}
}
static void steal_rbio_page(struct btrfs_raid_bio *src,
			    struct btrfs_raid_bio *dest, int page_nr)
{
	const u32 sectorsize = src->bioc->fs_info->sectorsize;
	const u32 sectors_per_page = PAGE_SIZE / sectorsize;
	int i;

	if (dest->stripe_pages[page_nr])
		__free_page(dest->stripe_pages[page_nr]);
	dest->stripe_pages[page_nr] = src->stripe_pages[page_nr];
	src->stripe_pages[page_nr] = NULL;

	/* Also update the sector->uptodate bits. */
	for (i = sectors_per_page * page_nr;
	     i < sectors_per_page * page_nr + sectors_per_page; i++)
		dest->stripe_sectors[i].uptodate = true;
}

static bool is_data_stripe_page(struct btrfs_raid_bio *rbio, int page_nr)
{
	const int sector_nr = (page_nr << PAGE_SHIFT) >>
			      rbio->bioc->fs_info->sectorsize_bits;

	/*
	 * We have ensured PAGE_SIZE is aligned with sectorsize, thus
	 * we won't have a page which is half data half parity.
	 *
	 * Thus if the first sector of the page belongs to data stripes, then
	 * the full page belongs to data stripes.
	 */
	return (sector_nr < rbio->nr_data * rbio->stripe_nsectors);
}
/*
 * Stealing an rbio means taking all the uptodate pages from the stripe array
 * in the source rbio and putting them into the destination rbio.
 *
 * This will also update the involved stripe_sectors[] which are referring to
 * the old pages.
 */
static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest)
{
	int i;

	if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags))
		return;

	for (i = 0; i < dest->nr_pages; i++) {
		struct page *p = src->stripe_pages[i];

		/*
		 * We don't need to steal P/Q pages as they will always be
		 * regenerated for RMW or full write anyway.
		 */
		if (!is_data_stripe_page(src, i))
			continue;

		/*
		 * If @src already has RBIO_CACHE_READY_BIT, it should have
		 * all data stripe pages present and uptodate.
		 */
		ASSERT(p);
		ASSERT(full_page_sectors_uptodate(src, i));
		steal_rbio_page(src, dest, i);
	}
	index_stripe_sectors(dest);
	index_stripe_sectors(src);
}
/*
 * merging means we take the bio_list from the victim and
 * splice it into the destination.  The victim should
 * be discarded afterwards.
 *
 * must be called with dest->rbio_list_lock held
 */
static void merge_rbio(struct btrfs_raid_bio *dest,
		       struct btrfs_raid_bio *victim)
{
	bio_list_merge_init(&dest->bio_list, &victim->bio_list);
	dest->bio_list_bytes += victim->bio_list_bytes;
	/* Also inherit the bitmaps from @victim. */
	bitmap_or(&dest->dbitmap, &victim->dbitmap, &dest->dbitmap,
		  dest->stripe_nsectors);
}
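/*
 * bio_list_merge_init() splices every bio from the victim list and then
 * re-initializes it, so after merge_rbio() the victim holds no bios and can
 * safely be freed by the caller.
 */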
/*
 * used to prune items that are in the cache.  The caller
 * must hold the hash table lock.
 */
static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
{
	int bucket = rbio_bucket(rbio);
	struct btrfs_stripe_hash_table *table;
	struct btrfs_stripe_hash *h;
	int freeit = 0;

	/*
	 * check the bit again under the hash table lock.
	 */
	if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
		return;

	table = rbio->bioc->fs_info->stripe_hash_table;
	h = table->table + bucket;

	/* hold the lock for the bucket because we may be
	 * removing it from the hash table
	 */
	spin_lock(&h->lock);

	/*
	 * hold the lock for the bio list because we need
	 * to make sure the bio list is empty
	 */
	spin_lock(&rbio->bio_list_lock);

	if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) {
		list_del_init(&rbio->stripe_cache);
		table->cache_size -= 1;
		freeit = 1;

		/* if the bio list isn't empty, this rbio is
		 * still involved in an IO.  We take it out
		 * of the cache list, and drop the ref that
		 * was held for the list.
		 *
		 * If the bio_list was empty, we also remove
		 * the rbio from the hash_table, and drop
		 * the corresponding ref
		 */
		if (bio_list_empty(&rbio->bio_list)) {
			if (!list_empty(&rbio->hash_list)) {
				list_del_init(&rbio->hash_list);
				refcount_dec(&rbio->refs);
				BUG_ON(!list_empty(&rbio->plug_list));
			}
		}
	}

	spin_unlock(&rbio->bio_list_lock);
	spin_unlock(&h->lock);

	if (freeit)
		free_raid_bio(rbio);
}
/*
 * prune a given rbio from the cache
 */
static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
{
	struct btrfs_stripe_hash_table *table;

	if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
		return;

	table = rbio->bioc->fs_info->stripe_hash_table;

	spin_lock(&table->cache_lock);
	__remove_rbio_from_cache(rbio);
	spin_unlock(&table->cache_lock);
}
/*
 * remove everything in the cache
 */
static void btrfs_clear_rbio_cache(struct btrfs_fs_info *info)
{
	struct btrfs_stripe_hash_table *table;
	struct btrfs_raid_bio *rbio;

	table = info->stripe_hash_table;

	spin_lock(&table->cache_lock);
	while (!list_empty(&table->stripe_cache)) {
		rbio = list_entry(table->stripe_cache.next,
				  struct btrfs_raid_bio,
				  stripe_cache);
		__remove_rbio_from_cache(rbio);
	}
	spin_unlock(&table->cache_lock);
}
/*
 * remove all cached entries and free the hash table
 */
void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info)
{
	if (!info->stripe_hash_table)
		return;
	btrfs_clear_rbio_cache(info);
	kvfree(info->stripe_hash_table);
	info->stripe_hash_table = NULL;
}
/*
 * insert an rbio into the stripe cache.  It
 * must have already been prepared by calling
 * cache_rbio_pages
 *
 * If this rbio was already cached, it gets
 * moved to the front of the lru.
 *
 * If the size of the rbio cache is too big, we
 * prune an item.
 */
static void cache_rbio(struct btrfs_raid_bio *rbio)
{
	struct btrfs_stripe_hash_table *table;

	if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags))
		return;

	table = rbio->bioc->fs_info->stripe_hash_table;

	spin_lock(&table->cache_lock);
	spin_lock(&rbio->bio_list_lock);

	/* bump our ref if we were not in the list before */
	if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags))
		refcount_inc(&rbio->refs);

	if (!list_empty(&rbio->stripe_cache)) {
		list_move(&rbio->stripe_cache, &table->stripe_cache);
	} else {
		list_add(&rbio->stripe_cache, &table->stripe_cache);
		table->cache_size += 1;
	}

	spin_unlock(&rbio->bio_list_lock);

	if (table->cache_size > RBIO_CACHE_SIZE) {
		struct btrfs_raid_bio *found;

		found = list_entry(table->stripe_cache.prev,
				   struct btrfs_raid_bio,
				   stripe_cache);

		if (found != rbio)
			__remove_rbio_from_cache(found);
	}

	spin_unlock(&table->cache_lock);
}
/*
 * helper function to run the xor_blocks api.  It is only
 * able to do MAX_XOR_BLOCKS at a time, so we need to
 * loop through.
 */
static void run_xor(void **pages, int src_cnt, ssize_t len)
{
	int src_off = 0;
	int xor_src_cnt = 0;
	void *dest = pages[src_cnt];

	while (src_cnt > 0) {
		xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS);
		xor_blocks(xor_src_cnt, len, dest, pages + src_off);

		src_cnt -= xor_src_cnt;
		src_off += xor_src_cnt;
	}
}
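/*
 * Callers place the destination buffer at pages[src_cnt]; after
 * run_xor(pages, n, len) that buffer has been XORed with pages[0..n-1].
 * E.g. the RAID5 path copies the first data sector into the P buffer and
 * then XORs the remaining nr_data - 1 data sectors into it.
 */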
/*
 * Returns true if the bio list inside this rbio covers an entire stripe (no
 * rmw required).
 */
static int rbio_is_full(struct btrfs_raid_bio *rbio)
{
	unsigned long size = rbio->bio_list_bytes;
	int ret = 1;

	spin_lock(&rbio->bio_list_lock);
	if (size != rbio->nr_data * BTRFS_STRIPE_LEN)
		ret = 0;
	BUG_ON(size > rbio->nr_data * BTRFS_STRIPE_LEN);
	spin_unlock(&rbio->bio_list_lock);

	return ret;
}
/*
 * returns 1 if it is safe to merge two rbios together.
 * The merging is safe if the two rbios correspond to
 * the same stripe and if they are both going in the same
 * direction (read vs write), and if neither one is
 * locked for final IO
 *
 * The caller is responsible for locking such that
 * rmw_locked is safe to test
 */
static int rbio_can_merge(struct btrfs_raid_bio *last,
			  struct btrfs_raid_bio *cur)
{
	if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) ||
	    test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags))
		return 0;

	/*
	 * we can't merge with cached rbios, since the
	 * idea is that when we merge the destination
	 * rbio is going to run our IO for us.  We can
	 * steal from cached rbios though, other functions
	 * handle that.
	 */
	if (test_bit(RBIO_CACHE_BIT, &last->flags) ||
	    test_bit(RBIO_CACHE_BIT, &cur->flags))
		return 0;

	if (last->bioc->full_stripe_logical != cur->bioc->full_stripe_logical)
		return 0;

	/* we can't merge with different operations */
	if (last->operation != cur->operation)
		return 0;
	/*
	 * For parity scrub we need to read the full stripe from the drive,
	 * check and repair the parity and write the new results.
	 *
	 * We're not allowed to add any new bios to the
	 * bio list here, anyone else that wants to
	 * change this stripe needs to do their own rmw.
	 */
	if (last->operation == BTRFS_RBIO_PARITY_SCRUB)
		return 0;

	if (last->operation == BTRFS_RBIO_READ_REBUILD)
		return 0;

	return 1;
}
static unsigned int rbio_stripe_sector_index(const struct btrfs_raid_bio *rbio,
					     unsigned int stripe_nr,
					     unsigned int sector_nr)
{
	ASSERT_RBIO_STRIPE(stripe_nr < rbio->real_stripes, rbio, stripe_nr);
	ASSERT_RBIO_SECTOR(sector_nr < rbio->stripe_nsectors, rbio, sector_nr);

	return stripe_nr * rbio->stripe_nsectors + sector_nr;
}

/* Return a sector from rbio->stripe_sectors, not from the bio list */
static struct sector_ptr *rbio_stripe_sector(const struct btrfs_raid_bio *rbio,
					     unsigned int stripe_nr,
					     unsigned int sector_nr)
{
	return &rbio->stripe_sectors[rbio_stripe_sector_index(rbio, stripe_nr,
							      sector_nr)];
}

/* Grab a sector inside P stripe */
static struct sector_ptr *rbio_pstripe_sector(const struct btrfs_raid_bio *rbio,
					      unsigned int sector_nr)
{
	return rbio_stripe_sector(rbio, rbio->nr_data, sector_nr);
}

/* Grab a sector inside Q stripe, return NULL if not RAID6 */
static struct sector_ptr *rbio_qstripe_sector(const struct btrfs_raid_bio *rbio,
					      unsigned int sector_nr)
{
	if (rbio->nr_data + 1 == rbio->real_stripes)
		return NULL;
	return rbio_stripe_sector(rbio, rbio->nr_data + 1, sector_nr);
}
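/*
 * Sectors are laid out stripe-major: index = stripe_nr * stripe_nsectors +
 * sector_nr.  Data stripes come first, then P, then (for RAID6) Q, which is
 * why the P/Q helpers simply use nr_data and nr_data + 1 as stripe numbers.
 */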
/*
 * The first stripe in the table for a logical address
 * has the lock.  rbios are added in one of three ways:
 *
 * 1) Nobody has the stripe locked yet.  The rbio is given
 * the lock and 0 is returned.  The caller must start the IO
 * themselves.
 *
 * 2) Someone has the stripe locked, but we're able to merge
 * with the lock owner.  The rbio is freed and the IO will
 * start automatically along with the existing rbio.  1 is returned.
 *
 * 3) Someone has the stripe locked, but we're not able to merge.
 * The rbio is added to the lock owner's plug list, or merged into
 * an rbio already on the plug list.  When the lock owner unlocks,
 * the next rbio on the list is run and the IO is started automatically.
 * 1 is returned
 *
 * If we return 0, the caller still owns the rbio and must continue with
 * IO submission.  If we return 1, the caller must assume the rbio has
 * already been freed.
 */
static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
{
	struct btrfs_stripe_hash *h;
	struct btrfs_raid_bio *cur;
	struct btrfs_raid_bio *pending;
	struct btrfs_raid_bio *freeit = NULL;
	struct btrfs_raid_bio *cache_drop = NULL;
	int ret = 0;

	h = rbio->bioc->fs_info->stripe_hash_table->table + rbio_bucket(rbio);

	spin_lock(&h->lock);
	list_for_each_entry(cur, &h->hash_list, hash_list) {
		if (cur->bioc->full_stripe_logical != rbio->bioc->full_stripe_logical)
			continue;

		spin_lock(&cur->bio_list_lock);

		/* Can we steal this cached rbio's pages? */
		if (bio_list_empty(&cur->bio_list) &&
		    list_empty(&cur->plug_list) &&
		    test_bit(RBIO_CACHE_BIT, &cur->flags) &&
		    !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) {
			list_del_init(&cur->hash_list);
			refcount_dec(&cur->refs);

			steal_rbio(cur, rbio);
			cache_drop = cur;
			spin_unlock(&cur->bio_list_lock);

			goto lockit;
		}

		/* Can we merge into the lock owner? */
		if (rbio_can_merge(cur, rbio)) {
			merge_rbio(cur, rbio);
			spin_unlock(&cur->bio_list_lock);
			freeit = rbio;
			ret = 1;
			goto out;
		}

		/*
		 * We couldn't merge with the running rbio, see if we can merge
		 * with the pending ones.  We don't have to check for rmw_locked
		 * because there is no way they are inside finish_rmw right now
		 */
		list_for_each_entry(pending, &cur->plug_list, plug_list) {
			if (rbio_can_merge(pending, rbio)) {
				merge_rbio(pending, rbio);
				spin_unlock(&cur->bio_list_lock);
				freeit = rbio;
				ret = 1;
				goto out;
			}
		}

		/*
		 * No merging, put us on the tail of the plug list, our rbio
		 * will be started with the currently running rbio unlocks
		 */
		list_add_tail(&rbio->plug_list, &cur->plug_list);
		spin_unlock(&cur->bio_list_lock);
		ret = 1;
		goto out;
	}
lockit:
	refcount_inc(&rbio->refs);
	list_add(&rbio->hash_list, &h->hash_list);
out:
	spin_unlock(&h->lock);
	if (cache_drop)
		remove_rbio_from_cache(cache_drop);
	if (freeit)
		free_raid_bio(freeit);
	return ret;
}
static void recover_rbio_work_locked(struct work_struct *work);

/*
 * called as rmw or parity rebuild is completed.  If the plug list has more
 * rbios waiting for this stripe, the next one on the list will be started
 */
static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
{
	int bucket;
	struct btrfs_stripe_hash *h;
	int keep_cache = 0;

	bucket = rbio_bucket(rbio);
	h = rbio->bioc->fs_info->stripe_hash_table->table + bucket;

	if (list_empty(&rbio->plug_list))
		cache_rbio(rbio);

	spin_lock(&h->lock);
	spin_lock(&rbio->bio_list_lock);

	if (!list_empty(&rbio->hash_list)) {
		/*
		 * if we're still cached and there is no other IO
		 * to perform, just leave this rbio here for others
		 * to steal from later
		 */
		if (list_empty(&rbio->plug_list) &&
		    test_bit(RBIO_CACHE_BIT, &rbio->flags)) {
			keep_cache = 1;
			clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
			BUG_ON(!bio_list_empty(&rbio->bio_list));
			goto done;
		}

		list_del_init(&rbio->hash_list);
		refcount_dec(&rbio->refs);

		/*
		 * we use the plug list to hold all the rbios
		 * waiting for the chance to lock this stripe.
		 * hand the lock over to one of them.
		 */
		if (!list_empty(&rbio->plug_list)) {
			struct btrfs_raid_bio *next;
			struct list_head *head = rbio->plug_list.next;

			next = list_entry(head, struct btrfs_raid_bio,
					  plug_list);

			list_del_init(&rbio->plug_list);

			list_add(&next->hash_list, &h->hash_list);
			refcount_inc(&next->refs);
			spin_unlock(&rbio->bio_list_lock);
			spin_unlock(&h->lock);

			if (next->operation == BTRFS_RBIO_READ_REBUILD) {
				start_async_work(next, recover_rbio_work_locked);
			} else if (next->operation == BTRFS_RBIO_WRITE) {
				steal_rbio(rbio, next);
				start_async_work(next, rmw_rbio_work_locked);
			} else if (next->operation == BTRFS_RBIO_PARITY_SCRUB) {
				steal_rbio(rbio, next);
				start_async_work(next, scrub_rbio_work_locked);
			}

			goto done_nolock;
		}
	}
done:
	spin_unlock(&rbio->bio_list_lock);
	spin_unlock(&h->lock);

done_nolock:
	if (!keep_cache)
		remove_rbio_from_cache(rbio);
}
static void rbio_endio_bio_list(struct bio *cur, blk_status_t err)
{
	struct bio *next;

	while (cur) {
		next = cur->bi_next;
		cur->bi_next = NULL;
		cur->bi_status = err;
		bio_endio(cur);
		cur = next;
	}
}

/*
 * this frees the rbio and runs through all the bios in the
 * bio_list and calls end_io on them
 */
static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err)
{
	struct bio *cur = bio_list_get(&rbio->bio_list);
	struct bio *extra;

	kfree(rbio->csum_buf);
	bitmap_free(rbio->csum_bitmap);
	rbio->csum_buf = NULL;
	rbio->csum_bitmap = NULL;

	/*
	 * Clear the data bitmap, as the rbio may be cached for later usage.
	 * do this before unlock_stripe() so there will be no new bio
	 * for this bio.
	 */
	bitmap_clear(&rbio->dbitmap, 0, rbio->stripe_nsectors);

	/*
	 * At this moment, rbio->bio_list is empty, however since rbio does not
	 * always have RBIO_RMW_LOCKED_BIT set and rbio is still linked on the
	 * hash list, rbio may be merged with others so that rbio->bio_list
	 * becomes non-empty.
	 * Once unlock_stripe() is done, rbio->bio_list will not be updated any
	 * more and we can call bio_endio() on all queued bios.
	 */
	unlock_stripe(rbio);
	extra = bio_list_get(&rbio->bio_list);
	free_raid_bio(rbio);

	rbio_endio_bio_list(cur, err);
	if (extra)
		rbio_endio_bio_list(extra, err);
}
/*
 * Get a sector pointer specified by its @stripe_nr and @sector_nr.
 *
 * @rbio:               The raid bio
 * @stripe_nr:          Stripe number, valid range [0, real_stripe)
 * @sector_nr:          Sector number inside the stripe,
 *                      valid range [0, stripe_nsectors)
 * @bio_list_only:      Whether to use sectors inside the bio list only.
 *
 * The read/modify/write code wants to reuse the original bio page as much
 * as possible, and only use stripe_sectors as fallback.
 */
static struct sector_ptr *sector_in_rbio(struct btrfs_raid_bio *rbio,
					 int stripe_nr, int sector_nr,
					 bool bio_list_only)
{
	struct sector_ptr *sector;
	int index;

	ASSERT_RBIO_STRIPE(stripe_nr >= 0 && stripe_nr < rbio->real_stripes,
			   rbio, stripe_nr);
	ASSERT_RBIO_SECTOR(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors,
			   rbio, sector_nr);

	index = stripe_nr * rbio->stripe_nsectors + sector_nr;
	ASSERT(index >= 0 && index < rbio->nr_sectors);

	spin_lock(&rbio->bio_list_lock);
	sector = &rbio->bio_sectors[index];
	if (sector->page || bio_list_only) {
		/* Don't return sector without a valid page pointer */
		if (!sector->page)
			sector = NULL;
		spin_unlock(&rbio->bio_list_lock);
		return sector;
	}
	spin_unlock(&rbio->bio_list_lock);

	return &rbio->stripe_sectors[index];
}
/*
 * allocation and initial setup for the btrfs_raid_bio.  Note that
 * this does not allocate any pages for rbio->pages.
 */
static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
					 struct btrfs_io_context *bioc)
{
	const unsigned int real_stripes = bioc->num_stripes - bioc->replace_nr_stripes;
	const unsigned int stripe_npages = BTRFS_STRIPE_LEN >> PAGE_SHIFT;
	const unsigned int num_pages = stripe_npages * real_stripes;
	const unsigned int stripe_nsectors =
		BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits;
	const unsigned int num_sectors = stripe_nsectors * real_stripes;
	struct btrfs_raid_bio *rbio;

	/* PAGE_SIZE must also be aligned to sectorsize for subpage support */
	ASSERT(IS_ALIGNED(PAGE_SIZE, fs_info->sectorsize));
	/*
	 * Our current stripe len should be fixed to 64k thus stripe_nsectors
	 * (at most 16) should be no larger than BITS_PER_LONG.
	 */
	ASSERT(stripe_nsectors <= BITS_PER_LONG);

	/*
	 * Real stripes must be between 2 (2 disks RAID5, aka RAID1) and 256
	 * (limited by u8).
	 */
	ASSERT(real_stripes >= 2);
	ASSERT(real_stripes <= U8_MAX);

	rbio = kzalloc(sizeof(*rbio), GFP_NOFS);
	if (!rbio)
		return ERR_PTR(-ENOMEM);
	rbio->stripe_pages = kcalloc(num_pages, sizeof(struct page *),
				     GFP_NOFS);
	rbio->bio_sectors = kcalloc(num_sectors, sizeof(struct sector_ptr),
				    GFP_NOFS);
	rbio->stripe_sectors = kcalloc(num_sectors, sizeof(struct sector_ptr),
				       GFP_NOFS);
	rbio->finish_pointers = kcalloc(real_stripes, sizeof(void *), GFP_NOFS);
	rbio->error_bitmap = bitmap_zalloc(num_sectors, GFP_NOFS);

	if (!rbio->stripe_pages || !rbio->bio_sectors || !rbio->stripe_sectors ||
	    !rbio->finish_pointers || !rbio->error_bitmap) {
		free_raid_bio_pointers(rbio);
		kfree(rbio);
		return ERR_PTR(-ENOMEM);
	}

	bio_list_init(&rbio->bio_list);
	init_waitqueue_head(&rbio->io_wait);
	INIT_LIST_HEAD(&rbio->plug_list);
	spin_lock_init(&rbio->bio_list_lock);
	INIT_LIST_HEAD(&rbio->stripe_cache);
	INIT_LIST_HEAD(&rbio->hash_list);
	btrfs_get_bioc(bioc);
	rbio->bioc = bioc;
	rbio->nr_pages = num_pages;
	rbio->nr_sectors = num_sectors;
	rbio->real_stripes = real_stripes;
	rbio->stripe_npages = stripe_npages;
	rbio->stripe_nsectors = stripe_nsectors;
	refcount_set(&rbio->refs, 1);
	atomic_set(&rbio->stripes_pending, 0);

	ASSERT(btrfs_nr_parity_stripes(bioc->map_type));
	rbio->nr_data = real_stripes - btrfs_nr_parity_stripes(bioc->map_type);
	ASSERT(rbio->nr_data > 0);

	return rbio;
}
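/*
 * Sizing example (not from the original source): with 64K stripes, 4K pages
 * and a 4K sectorsize, stripe_npages and stripe_nsectors are both 16, so a
 * three disk RAID5 rbio tracks 48 stripe pages and 48 sector_ptr entries per
 * array.
 */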
/* allocate pages for all the stripes in the bio, including parity */
static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
{
	int ret;

	ret = btrfs_alloc_page_array(rbio->nr_pages, rbio->stripe_pages, false);
	if (ret < 0)
		return ret;
	/* Mapping all sectors */
	index_stripe_sectors(rbio);
	return 0;
}

/* only allocate pages for p/q stripes */
static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
{
	const int data_pages = rbio->nr_data * rbio->stripe_npages;
	int ret;

	ret = btrfs_alloc_page_array(rbio->nr_pages - data_pages,
				     rbio->stripe_pages + data_pages, false);
	if (ret < 0)
		return ret;

	index_stripe_sectors(rbio);
	return 0;
}
/*
 * Return the total number of errors found in the vertical stripe of @sector_nr.
 *
 * @faila and @failb will also be updated to the first and second stripe
 * number of the errors.
 */
static int get_rbio_veritical_errors(struct btrfs_raid_bio *rbio, int sector_nr,
				     int *faila, int *failb)
{
	int stripe_nr;
	int found_errors = 0;

	if (faila || failb) {
		/*
		 * Both @faila and @failb should be valid pointers if any of
		 * them is specified.
		 */
		ASSERT(faila && failb);
		*faila = -1;
		*failb = -1;
	}

	for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) {
		int total_sector_nr = stripe_nr * rbio->stripe_nsectors + sector_nr;

		if (test_bit(total_sector_nr, rbio->error_bitmap)) {
			found_errors++;
			if (faila) {
				/* Update faila and failb. */
				if (*faila < 0)
					*faila = stripe_nr;
				else if (*failb < 0)
					*failb = stripe_nr;
			}
		}
	}
	return found_errors;
}
/*
 * Add a single sector @sector into our list of bios for IO.
 *
 * Return 0 if everything went well.
 * Return <0 for error.
 */
static int rbio_add_io_sector(struct btrfs_raid_bio *rbio,
			      struct bio_list *bio_list,
			      struct sector_ptr *sector,
			      unsigned int stripe_nr,
			      unsigned int sector_nr,
			      enum req_op op)
{
	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
	struct bio *last = bio_list->tail;
	int ret;
	struct bio *bio;
	struct btrfs_io_stripe *stripe;
	u64 disk_start;

	/*
	 * Note: here stripe_nr has taken device replace into consideration,
	 * thus it can be larger than rbio->real_stripe.
	 * So here we check against bioc->num_stripes, not rbio->real_stripes.
	 */
	ASSERT_RBIO_STRIPE(stripe_nr >= 0 && stripe_nr < rbio->bioc->num_stripes,
			   rbio, stripe_nr);
	ASSERT_RBIO_SECTOR(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors,
			   rbio, sector_nr);
	ASSERT(sector->page);

	stripe = &rbio->bioc->stripes[stripe_nr];
	disk_start = stripe->physical + sector_nr * sectorsize;

	/* if the device is missing, just fail this stripe */
	if (!stripe->dev->bdev) {
		int found_errors;

		set_bit(stripe_nr * rbio->stripe_nsectors + sector_nr,
			rbio->error_bitmap);

		/* Check if we have reached tolerance early. */
		found_errors = get_rbio_veritical_errors(rbio, sector_nr,
							 NULL, NULL);
		if (found_errors > rbio->bioc->max_errors)
			return -EIO;
		return 0;
	}

	/* see if we can add this page onto our existing bio */
	if (last) {
		u64 last_end = last->bi_iter.bi_sector << SECTOR_SHIFT;
		last_end += last->bi_iter.bi_size;

		/*
		 * we can't merge these if they are from different
		 * devices or if they are not contiguous
		 */
		if (last_end == disk_start && !last->bi_status &&
		    last->bi_bdev == stripe->dev->bdev) {
			ret = bio_add_page(last, sector->page, sectorsize,
					   sector->pgoff);
			if (ret == sectorsize)
				return 0;
		}
	}

	/* put a new bio on the list */
	bio = bio_alloc(stripe->dev->bdev,
			max(BTRFS_STRIPE_LEN >> PAGE_SHIFT, 1),
			op, GFP_NOFS);
	bio->bi_iter.bi_sector = disk_start >> SECTOR_SHIFT;
	bio->bi_private = rbio;

	__bio_add_page(bio, sector->page, sectorsize, sector->pgoff);
	bio_list_add(bio_list, bio);
	return 0;
}
static void index_one_bio(struct btrfs_raid_bio *rbio, struct bio *bio)
{
	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
	struct bio_vec bvec;
	struct bvec_iter iter;
	u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) -
		     rbio->bioc->full_stripe_logical;

	bio_for_each_segment(bvec, bio, iter) {
		u32 bvec_offset;

		for (bvec_offset = 0; bvec_offset < bvec.bv_len;
		     bvec_offset += sectorsize, offset += sectorsize) {
			int index = offset / sectorsize;
			struct sector_ptr *sector = &rbio->bio_sectors[index];

			sector->page = bvec.bv_page;
			sector->pgoff = bvec.bv_offset + bvec_offset;
			ASSERT(sector->pgoff < PAGE_SIZE);
		}
	}
}

/*
 * helper function to walk our bio list and populate the bio_pages array with
 * the result.  This seems expensive, but it is faster than constantly
 * searching through the bio list as we setup the IO in finish_rmw or stripe
 * reconstruction.
 *
 * This must be called before you trust the answers from page_in_rbio
 */
static void index_rbio_pages(struct btrfs_raid_bio *rbio)
{
	struct bio *bio;

	spin_lock(&rbio->bio_list_lock);
	bio_list_for_each(bio, &rbio->bio_list)
		index_one_bio(rbio, bio);

	spin_unlock(&rbio->bio_list_lock);
}
static void bio_get_trace_info(struct btrfs_raid_bio *rbio, struct bio *bio,
			       struct raid56_bio_trace_info *trace_info)
{
	const struct btrfs_io_context *bioc = rbio->bioc;
	int i;

	ASSERT(bioc);

	/* We rely on bio->bi_bdev to find the stripe number. */
	if (!bio->bi_bdev)
		goto not_found;

	for (i = 0; i < bioc->num_stripes; i++) {
		if (bio->bi_bdev != bioc->stripes[i].dev->bdev)
			continue;
		trace_info->stripe_nr = i;
		trace_info->devid = bioc->stripes[i].dev->devid;
		trace_info->offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) -
				     bioc->stripes[i].physical;
		return;
	}

not_found:
	trace_info->devid = -1;
	trace_info->offset = -1;
	trace_info->stripe_nr = -1;
}

static inline void bio_list_put(struct bio_list *bio_list)
{
	struct bio *bio;

	while ((bio = bio_list_pop(bio_list)))
		bio_put(bio);
}
static void assert_rbio(struct btrfs_raid_bio *rbio)
{
	if (!IS_ENABLED(CONFIG_BTRFS_DEBUG) ||
	    !IS_ENABLED(CONFIG_BTRFS_ASSERT))
		return;

	/*
	 * At least two stripes (2 disks RAID5), and since real_stripes is U8,
	 * we won't go beyond 256 disks anyway.
	 */
	ASSERT_RBIO(rbio->real_stripes >= 2, rbio);
	ASSERT_RBIO(rbio->nr_data > 0, rbio);

	/*
	 * This is another check to make sure nr data stripes is smaller
	 * than total stripes.
	 */
	ASSERT_RBIO(rbio->nr_data < rbio->real_stripes, rbio);
}
/* Generate PQ for one vertical stripe. */
static void generate_pq_vertical(struct btrfs_raid_bio *rbio, int sectornr)
{
	void **pointers = rbio->finish_pointers;
	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
	struct sector_ptr *sector;
	int stripe;
	const bool has_qstripe = rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6;

	/* First collect one sector from each data stripe */
	for (stripe = 0; stripe < rbio->nr_data; stripe++) {
		sector = sector_in_rbio(rbio, stripe, sectornr, 0);
		pointers[stripe] = kmap_local_page(sector->page) +
				   sector->pgoff;
	}

	/* Then add the parity stripe */
	sector = rbio_pstripe_sector(rbio, sectornr);
	sector->uptodate = 1;
	pointers[stripe++] = kmap_local_page(sector->page) + sector->pgoff;

	if (has_qstripe) {
		/*
		 * RAID6, add the qstripe and call the library function
		 * to fill in our p/q
		 */
		sector = rbio_qstripe_sector(rbio, sectornr);
		sector->uptodate = 1;
		pointers[stripe++] = kmap_local_page(sector->page) +
				     sector->pgoff;

		assert_rbio(rbio);
		raid6_call.gen_syndrome(rbio->real_stripes, sectorsize,
					pointers);
	} else {
		/* raid5 */
		memcpy(pointers[rbio->nr_data], pointers[0], sectorsize);
		run_xor(pointers + 1, rbio->nr_data - 1, sectorsize);
	}

	for (stripe = stripe - 1; stripe >= 0; stripe--)
		kunmap_local(pointers[stripe]);
}
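/*
 * For RAID5 the parity is the plain XOR of the data sectors.  For RAID6,
 * raid6_call.gen_syndrome() is handed real_stripes pointers (all data
 * sectors followed by the P and Q buffers) and fills in both P and the
 * Reed-Solomon Q syndrome for this vertical stripe.
 */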
static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio,
				   struct bio_list *bio_list)
{
	/* The total sector number inside the full stripe. */
	int total_sector_nr;
	int stripe;
	int sectornr;
	int ret;

	ASSERT(bio_list_size(bio_list) == 0);

	/* We should have at least one data sector. */
	ASSERT(bitmap_weight(&rbio->dbitmap, rbio->stripe_nsectors));

	/*
	 * Reset errors, as we may have errors inherited from a degraded
	 * write.
	 */
	bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors);

	/*
	 * Start assembly.  Make bios for everything from the higher layers (the
	 * bio_list in our rbio) and our P/Q.  Ignore everything else.
	 */
	for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
	     total_sector_nr++) {
		struct sector_ptr *sector;

		stripe = total_sector_nr / rbio->stripe_nsectors;
		sectornr = total_sector_nr % rbio->stripe_nsectors;

		/* This vertical stripe has no data, skip it. */
		if (!test_bit(sectornr, &rbio->dbitmap))
			continue;

		if (stripe < rbio->nr_data) {
			sector = sector_in_rbio(rbio, stripe, sectornr, 1);
			if (!sector)
				continue;
		} else {
			sector = rbio_stripe_sector(rbio, stripe, sectornr);
		}

		ret = rbio_add_io_sector(rbio, bio_list, sector, stripe,
					 sectornr, REQ_OP_WRITE);
		if (ret)
			goto error;
	}

	if (likely(!rbio->bioc->replace_nr_stripes))
		return 0;

	/*
	 * Make a copy for the replace target device.
	 *
	 * Thus the source stripe number (in replace_stripe_src) should be valid.
	 */
	ASSERT(rbio->bioc->replace_stripe_src >= 0);

	for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
	     total_sector_nr++) {
		struct sector_ptr *sector;

		stripe = total_sector_nr / rbio->stripe_nsectors;
		sectornr = total_sector_nr % rbio->stripe_nsectors;

		/*
		 * For RAID56, there is only one device that can be replaced,
		 * and replace_stripe_src[0] indicates the stripe number we
		 * need to copy from.
		 */
		if (stripe != rbio->bioc->replace_stripe_src) {
			/*
			 * We can skip the whole stripe completely, note
			 * total_sector_nr will be increased by one anyway.
			 */
			ASSERT(sectornr == 0);
			total_sector_nr += rbio->stripe_nsectors - 1;
			continue;
		}

		/* This vertical stripe has no data, skip it. */
		if (!test_bit(sectornr, &rbio->dbitmap))
			continue;

		if (stripe < rbio->nr_data) {
			sector = sector_in_rbio(rbio, stripe, sectornr, 1);
			if (!sector)
				continue;
		} else {
			sector = rbio_stripe_sector(rbio, stripe, sectornr);
		}

		ret = rbio_add_io_sector(rbio, bio_list, sector,
					 rbio->real_stripes,
					 sectornr, REQ_OP_WRITE);
		if (ret)
			goto error;
	}

	return 0;
error:
	bio_list_put(bio_list);
	return -EIO;
}
static void set_rbio_range_error(struct btrfs_raid_bio *rbio, struct bio *bio)
{
	struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
	u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) -
		     rbio->bioc->full_stripe_logical;
	int total_nr_sector = offset >> fs_info->sectorsize_bits;

	ASSERT(total_nr_sector < rbio->nr_data * rbio->stripe_nsectors);

	bitmap_set(rbio->error_bitmap, total_nr_sector,
		   bio->bi_iter.bi_size >> fs_info->sectorsize_bits);

	/*
	 * Special handling for raid56_alloc_missing_rbio() used by
	 * scrub/replace.  Unlike call path in raid56_parity_recover(), they
	 * pass an empty bio here.  Thus we have to find out the missing device
	 * and mark the stripe error instead.
	 */
	if (bio->bi_iter.bi_size == 0) {
		bool found_missing = false;
		int stripe_nr;

		for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) {
			if (!rbio->bioc->stripes[stripe_nr].dev->bdev) {
				found_missing = true;
				bitmap_set(rbio->error_bitmap,
					   stripe_nr * rbio->stripe_nsectors,
					   rbio->stripe_nsectors);
			}
		}
		ASSERT(found_missing);
	}
}
/*
 * For subpage case, we can no longer set page Up-to-date directly for
 * stripe_pages[], thus we need to locate the sector.
 */
static struct sector_ptr *find_stripe_sector(struct btrfs_raid_bio *rbio,
					     struct page *page,
					     unsigned int pgoff)
{
	int i;

	for (i = 0; i < rbio->nr_sectors; i++) {
		struct sector_ptr *sector = &rbio->stripe_sectors[i];

		if (sector->page == page && sector->pgoff == pgoff)
			return sector;
	}
	return NULL;
}
/*
 * this sets each page in the bio uptodate.  It should only be used on private
 * rbio pages, nothing that comes in from the higher layers
 */
static void set_bio_pages_uptodate(struct btrfs_raid_bio *rbio, struct bio *bio)
{
	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
	struct bio_vec *bvec;
	struct bvec_iter_all iter_all;

	ASSERT(!bio_flagged(bio, BIO_CLONED));

	bio_for_each_segment_all(bvec, bio, iter_all) {
		struct sector_ptr *sector;
		int pgoff;

		for (pgoff = bvec->bv_offset; pgoff - bvec->bv_offset < bvec->bv_len;
		     pgoff += sectorsize) {
			sector = find_stripe_sector(rbio, bvec->bv_page, pgoff);
			ASSERT(sector);
			if (sector)
				sector->uptodate = 1;
		}
	}
}
static int get_bio_sector_nr(struct btrfs_raid_bio *rbio, struct bio *bio)
{
	struct bio_vec *bv = bio_first_bvec_all(bio);
	int i;

	for (i = 0; i < rbio->nr_sectors; i++) {
		struct sector_ptr *sector;

		sector = &rbio->stripe_sectors[i];
		if (sector->page == bv->bv_page && sector->pgoff == bv->bv_offset)
			break;
		sector = &rbio->bio_sectors[i];
		if (sector->page == bv->bv_page && sector->pgoff == bv->bv_offset)
			break;
	}
	ASSERT(i < rbio->nr_sectors);
	return i;
}
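/*
 * The first bvec is matched against both stripe_sectors[] and bio_sectors[]
 * because bios assembled by rbio_add_io_sector() may carry pages from either
 * array (private stripe pages for reads/parity, original bio pages for
 * writes).
 */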
static void rbio_update_error_bitmap(struct btrfs_raid_bio *rbio, struct bio *bio)
{
	int total_sector_nr = get_bio_sector_nr(rbio, bio);
	u32 bio_size = 0;
	struct bio_vec *bvec;
	int i;

	bio_for_each_bvec_all(bvec, bio, i)
		bio_size += bvec->bv_len;

	/*
	 * Since we can have multiple bios touching the error_bitmap, we cannot
	 * call bitmap_set() without protection.
	 *
	 * Instead use set_bit() for each bit, as set_bit() itself is atomic.
	 */
	for (i = total_sector_nr; i < total_sector_nr +
	     (bio_size >> rbio->bioc->fs_info->sectorsize_bits); i++)
		set_bit(i, rbio->error_bitmap);
}
/* Verify the data sectors at read time. */
static void verify_bio_data_sectors(struct btrfs_raid_bio *rbio,
				    struct bio *bio)
{
	struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
	int total_sector_nr = get_bio_sector_nr(rbio, bio);
	struct bio_vec *bvec;
	struct bvec_iter_all iter_all;

	/* No data csum for the whole stripe, no need to verify. */
	if (!rbio->csum_bitmap || !rbio->csum_buf)
		return;

	/* P/Q stripes, they have no data csum to verify against. */
	if (total_sector_nr >= rbio->nr_data * rbio->stripe_nsectors)
		return;

	bio_for_each_segment_all(bvec, bio, iter_all) {
		int bv_offset;

		for (bv_offset = bvec->bv_offset;
		     bv_offset < bvec->bv_offset + bvec->bv_len;
		     bv_offset += fs_info->sectorsize, total_sector_nr++) {
			u8 csum_buf[BTRFS_CSUM_SIZE];
			u8 *expected_csum = rbio->csum_buf +
					    total_sector_nr * fs_info->csum_size;
			int ret;

			/* No csum for this sector, skip to the next sector. */
			if (!test_bit(total_sector_nr, rbio->csum_bitmap))
				continue;

			ret = btrfs_check_sector_csum(fs_info, bvec->bv_page,
				bv_offset, csum_buf, expected_csum);
			if (ret < 0)
				set_bit(total_sector_nr, rbio->error_bitmap);
		}
	}
}
static void raid_wait_read_end_io(struct bio *bio)
{
	struct btrfs_raid_bio *rbio = bio->bi_private;

	if (bio->bi_status) {
		rbio_update_error_bitmap(rbio, bio);
	} else {
		set_bio_pages_uptodate(rbio, bio);
		verify_bio_data_sectors(rbio, bio);
	}

	bio_put(bio);
	if (atomic_dec_and_test(&rbio->stripes_pending))
		wake_up(&rbio->io_wait);
}

static void submit_read_wait_bio_list(struct btrfs_raid_bio *rbio,
				      struct bio_list *bio_list)
{
	struct bio *bio;

	atomic_set(&rbio->stripes_pending, bio_list_size(bio_list));
	while ((bio = bio_list_pop(bio_list))) {
		bio->bi_end_io = raid_wait_read_end_io;

		if (trace_raid56_read_enabled()) {
			struct raid56_bio_trace_info trace_info = { 0 };

			bio_get_trace_info(rbio, bio, &trace_info);
			trace_raid56_read(rbio, bio, &trace_info);
		}
		submit_bio(bio);
	}

	wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0);
}
static int alloc_rbio_data_pages(struct btrfs_raid_bio *rbio)
{
	const int data_pages = rbio->nr_data * rbio->stripe_npages;
	int ret;

	ret = btrfs_alloc_page_array(data_pages, rbio->stripe_pages, false);
	if (ret < 0)
		return ret;

	index_stripe_sectors(rbio);
	return 0;
}
/*
 * We use plugging call backs to collect full stripes.
 * Any time we get a partial stripe write while plugged
 * we collect it into a list.  When the unplug comes down,
 * we sort the list by logical block number and merge
 * everything we can into the same rbios
 */
struct btrfs_plug_cb {
	struct blk_plug_cb cb;
	struct btrfs_fs_info *info;
	struct list_head rbio_list;
};
/*
 * rbios on the plug list are sorted for easier merging.
 */
static int plug_cmp(void *priv, const struct list_head *a,
		    const struct list_head *b)
{
	const struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio,
						       plug_list);
	const struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio,
						       plug_list);
	u64 a_sector = ra->bio_list.head->bi_iter.bi_sector;
	u64 b_sector = rb->bio_list.head->bi_iter.bi_sector;

	if (a_sector < b_sector)
		return -1;
	if (a_sector > b_sector)
		return 1;
	return 0;
}
static void raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
{
	struct btrfs_plug_cb *plug = container_of(cb, struct btrfs_plug_cb, cb);
	struct btrfs_raid_bio *cur;
	struct btrfs_raid_bio *last = NULL;

	list_sort(NULL, &plug->rbio_list, plug_cmp);

	while (!list_empty(&plug->rbio_list)) {
		cur = list_entry(plug->rbio_list.next,
				 struct btrfs_raid_bio, plug_list);
		list_del_init(&cur->plug_list);

		if (rbio_is_full(cur)) {
			/* We have a full stripe, queue it down. */
			start_async_work(cur, rmw_rbio_work);
			continue;
		}
		if (last) {
			if (rbio_can_merge(last, cur)) {
				merge_rbio(last, cur);
				free_raid_bio(cur);
				continue;
			}
			start_async_work(last, rmw_rbio_work);
		}
		last = cur;
	}
	if (last)
		start_async_work(last, rmw_rbio_work);
	kfree(plug);
}
/* Add the original bio into rbio->bio_list, and update rbio::dbitmap. */
static void rbio_add_bio(struct btrfs_raid_bio *rbio, struct bio *orig_bio)
{
	const struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
	const u64 orig_logical = orig_bio->bi_iter.bi_sector << SECTOR_SHIFT;
	const u64 full_stripe_start = rbio->bioc->full_stripe_logical;
	const u32 orig_len = orig_bio->bi_iter.bi_size;
	const u32 sectorsize = fs_info->sectorsize;
	u64 cur_logical;

	ASSERT_RBIO_LOGICAL(orig_logical >= full_stripe_start &&
			    orig_logical + orig_len <= full_stripe_start +
			    rbio->nr_data * BTRFS_STRIPE_LEN,
			    rbio, orig_logical);

	bio_list_add(&rbio->bio_list, orig_bio);
	rbio->bio_list_bytes += orig_bio->bi_iter.bi_size;

	/* Update the dbitmap. */
	for (cur_logical = orig_logical; cur_logical < orig_logical + orig_len;
	     cur_logical += sectorsize) {
		int bit = ((u32)(cur_logical - full_stripe_start) >>
			   fs_info->sectorsize_bits) % rbio->stripe_nsectors;

		set_bit(bit, &rbio->dbitmap);
	}
}
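/*
 * Example (assuming a 4K sectorsize): a 64K stripe has stripe_nsectors == 16,
 * so a bio starting 8K into any data stripe of the full stripe sets dbitmap
 * bit (8K >> 12) % 16 == 2 for its vertical stripe.
 */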
/*
 * our main entry point for writes from the rest of the FS.
 */
void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc)
{
	struct btrfs_fs_info *fs_info = bioc->fs_info;
	struct btrfs_raid_bio *rbio;
	struct btrfs_plug_cb *plug = NULL;
	struct blk_plug_cb *cb;

	rbio = alloc_rbio(fs_info, bioc);
	if (IS_ERR(rbio)) {
		bio->bi_status = errno_to_blk_status(PTR_ERR(rbio));
		bio_endio(bio);
		return;
	}
	rbio->operation = BTRFS_RBIO_WRITE;
	rbio_add_bio(rbio, bio);

	/*
	 * Don't plug on full rbios, just get them out the door
	 * as quickly as we can
	 */
	if (!rbio_is_full(rbio)) {
		cb = blk_check_plugged(raid_unplug, fs_info, sizeof(*plug));
		if (cb) {
			plug = container_of(cb, struct btrfs_plug_cb, cb);
			if (!plug->info) {
				plug->info = fs_info;
				INIT_LIST_HEAD(&plug->rbio_list);
			}
			list_add_tail(&rbio->plug_list, &plug->rbio_list);
			return;
		}
	}

	/*
	 * Either we don't have any existing plug, or we're doing a full stripe,
	 * queue the rmw work now.
	 */
	start_async_work(rbio, rmw_rbio_work);
}
static int verify_one_sector(struct btrfs_raid_bio *rbio,
			     int stripe_nr, int sector_nr)
{
	struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
	struct sector_ptr *sector;
	u8 csum_buf[BTRFS_CSUM_SIZE];
	u8 *csum_expected;
	int ret;

	if (!rbio->csum_bitmap || !rbio->csum_buf)
		return 0;

	/* No way to verify P/Q as they are not covered by data csum. */
	if (stripe_nr >= rbio->nr_data)
		return 0;
	/*
	 * If we're rebuilding a read, we have to use pages from the
	 * bio list if possible.
	 */
	if (rbio->operation == BTRFS_RBIO_READ_REBUILD) {
		sector = sector_in_rbio(rbio, stripe_nr, sector_nr, 0);
	} else {
		sector = rbio_stripe_sector(rbio, stripe_nr, sector_nr);
	}

	ASSERT(sector->page);

	csum_expected = rbio->csum_buf +
			(stripe_nr * rbio->stripe_nsectors + sector_nr) *
			fs_info->csum_size;
	ret = btrfs_check_sector_csum(fs_info, sector->page, sector->pgoff,
				      csum_buf, csum_expected);
	return ret;
}
1825 * @*pointers are the pre-allocated pointers by the caller, so we don't
1826 * need to allocate/free the pointers again and again.
1828 static int recover_vertical(struct btrfs_raid_bio
*rbio
, int sector_nr
,
1829 void **pointers
, void **unmap_array
)
1831 struct btrfs_fs_info
*fs_info
= rbio
->bioc
->fs_info
;
1832 struct sector_ptr
*sector
;
1833 const u32 sectorsize
= fs_info
->sectorsize
;
1841 * Now we just use bitmap to mark the horizontal stripes in
1842 * which we have data when doing parity scrub.
1844 if (rbio
->operation
== BTRFS_RBIO_PARITY_SCRUB
&&
1845 !test_bit(sector_nr
, &rbio
->dbitmap
))
1848 found_errors
= get_rbio_veritical_errors(rbio
, sector_nr
, &faila
,
1851 * No errors in the vertical stripe, skip it. Can happen for recovery
1852 * which only part of a stripe failed csum check.
1857 if (found_errors
> rbio
->bioc
->max_errors
)
1861 * Setup our array of pointers with sectors from each stripe
1863 * NOTE: store a duplicate array of pointers to preserve the
1866 for (stripe_nr
= 0; stripe_nr
< rbio
->real_stripes
; stripe_nr
++) {
1868 * If we're rebuilding a read, we have to use pages from the
1869 * bio list if possible.
1871 if (rbio
->operation
== BTRFS_RBIO_READ_REBUILD
) {
1872 sector
= sector_in_rbio(rbio
, stripe_nr
, sector_nr
, 0);
1874 sector
= rbio_stripe_sector(rbio
, stripe_nr
, sector_nr
);
1876 ASSERT(sector
->page
);
1877 pointers
[stripe_nr
] = kmap_local_page(sector
->page
) +
1879 unmap_array
[stripe_nr
] = pointers
[stripe_nr
];
1882 /* All raid6 handling here */
1883 if (rbio
->bioc
->map_type
& BTRFS_BLOCK_GROUP_RAID6
) {
1884 /* Single failure, rebuild from parity raid5 style */
1886 if (faila
== rbio
->nr_data
)
1888 * Just the P stripe has failed, without
1889 * a bad data or Q stripe.
1890 * We have nothing to do, just skip the
1891 * recovery for this stripe.
1895 * a single failure in raid6 is rebuilt
1896 * in the pstripe code below
1902 * If the q stripe is failed, do a pstripe reconstruction from
1904 * If both the q stripe and the P stripe are failed, we're
1905 * here due to a crc mismatch and we can't give them the
1908 if (failb
== rbio
->real_stripes
- 1) {
1909 if (faila
== rbio
->real_stripes
- 2)
1911 * Only P and Q are corrupted.
1912 * We only care about data stripes recovery,
1913 * can skip this vertical stripe.
1917 * Otherwise we have one bad data stripe and
1918 * a good P stripe. raid5!
1923 if (failb
== rbio
->real_stripes
- 2) {
1924 raid6_datap_recov(rbio
->real_stripes
, sectorsize
,
1927 raid6_2data_recov(rbio
->real_stripes
, sectorsize
,
1928 faila
, failb
, pointers
);
1933 /* Rebuild from P stripe here (raid5 or raid6). */
1934 ASSERT(failb
== -1);
1936 /* Copy parity block into failed block to start with */
1937 memcpy(pointers
[faila
], pointers
[rbio
->nr_data
], sectorsize
);
1939 /* Rearrange the pointer array */
1940 p
= pointers
[faila
];
1941 for (stripe_nr
= faila
; stripe_nr
< rbio
->nr_data
- 1;
1943 pointers
[stripe_nr
] = pointers
[stripe_nr
+ 1];
1944 pointers
[rbio
->nr_data
- 1] = p
;
1946 /* Xor in the rest */
1947 run_xor(pointers
, rbio
->nr_data
- 1, sectorsize
);
1952 * No matter if this is a RMW or recovery, we should have all
1953 * failed sectors repaired in the vertical stripe, thus they are now
1955 * Especially if we determine to cache the rbio, we need to
1956 * have at least all data sectors uptodate.
1958 * If possible, also check if the repaired sector matches its data
1962 ret
= verify_one_sector(rbio
, faila
, sector_nr
);
1966 sector
= rbio_stripe_sector(rbio
, faila
, sector_nr
);
1967 sector
->uptodate
= 1;
1970 ret
= verify_one_sector(rbio
, failb
, sector_nr
);
1974 sector
= rbio_stripe_sector(rbio
, failb
, sector_nr
);
1975 sector
->uptodate
= 1;
1979 for (stripe_nr
= rbio
->real_stripes
- 1; stripe_nr
>= 0; stripe_nr
--)
1980 kunmap_local(unmap_array
[stripe_nr
]);
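/*
 * Summary of the recovery cases handled above: a single data failure (or any
 * RAID5 failure) is rebuilt by XOR from P; RAID6 with one data stripe and Q
 * failed uses raid6_datap_recov(); RAID6 with two data stripes failed uses
 * raid6_2data_recov(); if only P and/or Q failed there is no data to repair.
 */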
static int recover_sectors(struct btrfs_raid_bio *rbio)
{
	void **pointers = NULL;
	void **unmap_array = NULL;
	int sectornr;
	int ret = 0;

	/*
	 * @pointers array stores the pointer for each sector.
	 *
	 * @unmap_array stores copy of pointers that does not get reordered
	 * during reconstruction so that kunmap_local works.
	 */
	pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
	unmap_array = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
	if (!pointers || !unmap_array) {
		ret = -ENOMEM;
		goto out;
	}

	if (rbio->operation == BTRFS_RBIO_READ_REBUILD) {
		spin_lock(&rbio->bio_list_lock);
		set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
		spin_unlock(&rbio->bio_list_lock);
	}

	index_rbio_pages(rbio);

	for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
		ret = recover_vertical(rbio, sectornr, pointers, unmap_array);
		if (ret < 0)
			break;
	}

out:
	kfree(pointers);
	kfree(unmap_array);
	return ret;
}
static void recover_rbio(struct btrfs_raid_bio *rbio)
{
	struct bio_list bio_list = BIO_EMPTY_LIST;
	int total_sector_nr;
	int ret = 0;

	/*
	 * Either we're doing recover for a read failure or degraded write,
	 * caller should have set error bitmap correctly.
	 */
	ASSERT(bitmap_weight(rbio->error_bitmap, rbio->nr_sectors));

	/* For recovery, we need to read all sectors including P/Q. */
	ret = alloc_rbio_pages(rbio);
	if (ret < 0)
		goto out;

	index_rbio_pages(rbio);

	/*
	 * Read everything that hasn't failed. However this time we will
	 * not trust any cached sector.
	 * As we may read out some stale data but higher layer is not reading
	 * that stale part.
	 *
	 * So here we always re-read everything in recovery path.
	 */
	for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
	     total_sector_nr++) {
		int stripe = total_sector_nr / rbio->stripe_nsectors;
		int sectornr = total_sector_nr % rbio->stripe_nsectors;
		struct sector_ptr *sector;

		/*
		 * Skip the range which has error.  It can be a range which is
		 * marked error (for csum mismatch), or it can be a missing
		 * device.
		 */
		if (!rbio->bioc->stripes[stripe].dev->bdev ||
		    test_bit(total_sector_nr, rbio->error_bitmap)) {
			/*
			 * Also set the error bit for missing device, which
			 * may not yet have its error bit set.
			 */
			set_bit(total_sector_nr, rbio->error_bitmap);
			continue;
		}

		sector = rbio_stripe_sector(rbio, stripe, sectornr);
		ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe,
					 sectornr, REQ_OP_READ);
		if (ret < 0) {
			bio_list_put(&bio_list);
			goto out;
		}
	}

	submit_read_wait_bio_list(rbio, &bio_list);
	ret = recover_sectors(rbio);
out:
	rbio_orig_end_io(rbio, errno_to_blk_status(ret));
}
static void recover_rbio_work(struct work_struct *work)
{
	struct btrfs_raid_bio *rbio;

	rbio = container_of(work, struct btrfs_raid_bio, work);
	if (!lock_stripe_add(rbio))
		recover_rbio(rbio);
}

static void recover_rbio_work_locked(struct work_struct *work)
{
	recover_rbio(container_of(work, struct btrfs_raid_bio, work));
}
static void set_rbio_raid6_extra_error(struct btrfs_raid_bio *rbio, int mirror_num)
{
	bool found = false;
	int sector_nr;

	/*
	 * This is for RAID6 extra recovery tries, thus mirror number should
	 * be larger than 2.
	 * Mirror 1 means read from data stripes. Mirror 2 means rebuild using
	 * RAID5 methods.
	 */
	ASSERT(mirror_num > 2);
	for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) {
		int found_errors;
		int faila;
		int failb;

		found_errors = get_rbio_veritical_errors(rbio, sector_nr,
							 &faila, &failb);
		/* This vertical stripe doesn't have errors. */
		if (!found_errors)
			continue;

		/*
		 * If we found errors, there should be only one error marked
		 * by previous set_rbio_range_error().
		 */
		ASSERT(found_errors == 1);
		found = true;

		/* Now select another stripe to mark as error. */
		failb = rbio->real_stripes - (mirror_num - 1);
		if (failb <= faila)
			failb--;

		/* Set the extra bit in error bitmap. */
		if (failb >= 0)
			set_bit(failb * rbio->stripe_nsectors + sector_nr,
				rbio->error_bitmap);
	}

	/* We should have found at least one vertical stripe with an error. */
	ASSERT(found);
}
2142 /* We should found at least one vertical stripe with error.*/
2147 * the main entry point for reads from the higher layers. This
2148 * is really only called when the normal read path had a failure,
2149 * so we assume the bio they send down corresponds to a failed part
2152 void raid56_parity_recover(struct bio
*bio
, struct btrfs_io_context
*bioc
,
2155 struct btrfs_fs_info
*fs_info
= bioc
->fs_info
;
2156 struct btrfs_raid_bio
*rbio
;
2158 rbio
= alloc_rbio(fs_info
, bioc
);
2160 bio
->bi_status
= errno_to_blk_status(PTR_ERR(rbio
));
2165 rbio
->operation
= BTRFS_RBIO_READ_REBUILD
;
2166 rbio_add_bio(rbio
, bio
);
2168 set_rbio_range_error(rbio
, bio
);
2172 * for 'mirror == 2', reconstruct from all other stripes.
2173 * for 'mirror_num > 2', select a stripe to fail on every retry.
2176 set_rbio_raid6_extra_error(rbio
, mirror_num
);
2178 start_async_work(rbio
, recover_rbio_work
);
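/*
 * Mirror number semantics for RAID56 reads: mirror 1 is the normal read from
 * the data stripes, mirror 2 rebuilds from the remaining stripes, and
 * mirror_num > 2 (RAID6 only) additionally marks one more stripe as failed on
 * each retry via set_rbio_raid6_extra_error().
 */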

static void fill_data_csums(struct btrfs_raid_bio *rbio)
{
	struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
	struct btrfs_root *csum_root = btrfs_csum_root(fs_info,
						rbio->bioc->full_stripe_logical);
	const u64 start = rbio->bioc->full_stripe_logical;
	const u32 len = (rbio->nr_data * rbio->stripe_nsectors) <<
			fs_info->sectorsize_bits;
	int ret;

	/* The rbio should not have its csum buffer initialized. */
	ASSERT(!rbio->csum_buf && !rbio->csum_bitmap);

	/*
	 * Skip the csum search if:
	 *
	 * - The rbio doesn't belong to data block groups
	 *   Then we are doing IO for tree blocks, no need to search csums.
	 *
	 * - The rbio belongs to mixed block groups
	 *   This is to avoid a deadlock: we're already holding the full
	 *   stripe lock, so if we trigger a metadata read that itself needs
	 *   raid56 recovery, we will deadlock.
	 */
	if (!(rbio->bioc->map_type & BTRFS_BLOCK_GROUP_DATA) ||
	    rbio->bioc->map_type & BTRFS_BLOCK_GROUP_METADATA)
		return;

	rbio->csum_buf = kzalloc(rbio->nr_data * rbio->stripe_nsectors *
				 fs_info->csum_size, GFP_NOFS);
	rbio->csum_bitmap = bitmap_zalloc(rbio->nr_data * rbio->stripe_nsectors,
					  GFP_NOFS);
	if (!rbio->csum_buf || !rbio->csum_bitmap) {
		ret = -ENOMEM;
		goto error;
	}

	ret = btrfs_lookup_csums_bitmap(csum_root, NULL, start, start + len - 1,
					rbio->csum_buf, rbio->csum_bitmap);
	if (ret < 0)
		goto error;
	if (bitmap_empty(rbio->csum_bitmap, len >> fs_info->sectorsize_bits))
		goto no_csum;
	return;

error:
	/*
	 * We failed to allocate memory or grab the csums, but it's not fatal,
	 * we can still continue.  However, it's better to warn users that RMW
	 * is no longer safe for this particular sub-stripe write.
	 */
	btrfs_warn_rl(fs_info,
	"sub-stripe write for full stripe %llu is not safe, failed to get csum: %d",
		      rbio->bioc->full_stripe_logical, ret);
no_csum:
	kfree(rbio->csum_buf);
	bitmap_free(rbio->csum_bitmap);
	rbio->csum_buf = NULL;
	rbio->csum_bitmap = NULL;
}
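
/*
 * The read loops below address sectors with a single index:
 * total_sector_nr = stripe * stripe_nsectors + sectornr, so
 * stripe = total_sector_nr / stripe_nsectors and
 * sectornr = total_sector_nr % stripe_nsectors.
 * For example, assuming 4K sectors in a 64K stripe (stripe_nsectors == 16),
 * total_sector_nr == 35 means stripe 2, sector 3 within that stripe.
 */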

static int rmw_read_wait_recover(struct btrfs_raid_bio *rbio)
{
	struct bio_list bio_list = BIO_EMPTY_LIST;
	int total_sector_nr;
	int ret = 0;

	/*
	 * Fill the data csums we need for data verification.  We need to fill
	 * the csum_bitmap/csum_buf first, as our endio function will try to
	 * verify the data sectors.
	 */
	fill_data_csums(rbio);

	/*
	 * Build a list of bios to read all sectors (including data and P/Q).
	 *
	 * This behavior is to compensate for the later csum verification and
	 * recovery.
	 */
	for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
	     total_sector_nr++) {
		struct sector_ptr *sector;
		int stripe = total_sector_nr / rbio->stripe_nsectors;
		int sectornr = total_sector_nr % rbio->stripe_nsectors;

		sector = rbio_stripe_sector(rbio, stripe, sectornr);
		ret = rbio_add_io_sector(rbio, &bio_list, sector,
					 stripe, sectornr, REQ_OP_READ);
		if (ret < 0) {
			bio_list_put(&bio_list);
			return ret;
		}
	}

	/*
	 * We may or may not have any corrupted sectors (including missing
	 * devices and csum mismatches), just let recover_sectors() handle
	 * them all.
	 */
	submit_read_wait_bio_list(rbio, &bio_list);
	return recover_sectors(rbio);
}
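
/*
 * Write completion protocol: submit_write_bios() sets stripes_pending to the
 * number of bios it is about to submit, each completion decrements it in
 * raid_wait_write_end_io(), and the submitter sleeps on io_wait until the
 * counter reaches zero before checking the per-vertical-stripe error counts.
 */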

static void raid_wait_write_end_io(struct bio *bio)
{
	struct btrfs_raid_bio *rbio = bio->bi_private;
	blk_status_t err = bio->bi_status;

	if (err)
		rbio_update_error_bitmap(rbio, bio);
	bio_put(bio);
	if (atomic_dec_and_test(&rbio->stripes_pending))
		wake_up(&rbio->io_wait);
}

static void submit_write_bios(struct btrfs_raid_bio *rbio,
			      struct bio_list *bio_list)
{
	struct bio *bio;

	atomic_set(&rbio->stripes_pending, bio_list_size(bio_list));
	while ((bio = bio_list_pop(bio_list))) {
		bio->bi_end_io = raid_wait_write_end_io;

		if (trace_raid56_write_enabled()) {
			struct raid56_bio_trace_info trace_info = { 0 };

			bio_get_trace_info(rbio, bio, &trace_info);
			trace_raid56_write(rbio, bio, &trace_info);
		}
		submit_bio(bio);
	}
}

/*
 * Determine if we need to read any sector from the disk.
 * Should only be used in the RMW path, to skip a cached rbio.
 */
static bool need_read_stripe_sectors(struct btrfs_raid_bio *rbio)
{
	int i;

	for (i = 0; i < rbio->nr_data * rbio->stripe_nsectors; i++) {
		struct sector_ptr *sector = &rbio->stripe_sectors[i];

		/*
		 * We have a sector which doesn't have a page nor is uptodate,
		 * thus this rbio cannot be a cached one, as a cached one must
		 * have all of its data sectors present and uptodate.
		 */
		if (!sector->page || !sector->uptodate)
			return true;
	}
	return false;
}
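
/*
 * High level flow of rmw_rbio() below:
 * 1) Allocate P/Q pages (always needed).
 * 2) For a sub-stripe write without a fully cached rbio, read all sectors
 *    back in and run csum verification/recovery (rmw_read_wait_recover()).
 * 3) Lock out further merges and regenerate P/Q for every vertical stripe.
 * 4) Assemble and submit the write bios, then wait and check whether more
 *    vertical stripes failed than the RAID level can tolerate.
 */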

static void rmw_rbio(struct btrfs_raid_bio *rbio)
{
	struct bio_list bio_list;
	int sectornr;
	int ret = 0;

	/*
	 * Allocate the pages for parity first, as P/Q pages will always be
	 * needed for both full-stripe and sub-stripe writes.
	 */
	ret = alloc_rbio_parity_pages(rbio);
	if (ret < 0)
		goto out;

	/*
	 * Either this is a full stripe write, or we have every data sector
	 * already cached, so we can go to the write path immediately.
	 */
	if (!rbio_is_full(rbio) && need_read_stripe_sectors(rbio)) {
		/*
		 * Now we're doing a sub-stripe write, and we also need all
		 * data stripes to do the full RMW.
		 */
		ret = alloc_rbio_data_pages(rbio);
		if (ret < 0)
			goto out;

		index_rbio_pages(rbio);

		ret = rmw_read_wait_recover(rbio);
		if (ret < 0)
			goto out;
	}

	/*
	 * At this stage we're not allowed to add any new bios to the
	 * bio list any more, anyone else that wants to change this stripe
	 * needs to do their own rmw.
	 */
	spin_lock(&rbio->bio_list_lock);
	set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
	spin_unlock(&rbio->bio_list_lock);

	bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors);

	index_rbio_pages(rbio);

	/*
	 * We don't cache full rbios because we're assuming the higher layers
	 * are unlikely to use this area of the disk again soon.  If they do
	 * use it again, hopefully they will send another full bio.
	 */
	if (!rbio_is_full(rbio))
		cache_rbio_pages(rbio);
	else
		clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);

	for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++)
		generate_pq_vertical(rbio, sectornr);

	bio_list_init(&bio_list);
	ret = rmw_assemble_write_bios(rbio, &bio_list);
	if (ret < 0)
		goto out;

	/* We should have at least one bio assembled. */
	ASSERT(bio_list_size(&bio_list));
	submit_write_bios(rbio, &bio_list);
	wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0);

	/* We may have more errors than our tolerance during the read. */
	for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
		int found_errors;

		found_errors = get_rbio_veritical_errors(rbio, sectornr, NULL, NULL);
		if (found_errors > rbio->bioc->max_errors) {
			ret = -EIO;
			break;
		}
	}
out:
	rbio_orig_end_io(rbio, errno_to_blk_status(ret));
}

static void rmw_rbio_work(struct work_struct *work)
{
	struct btrfs_raid_bio *rbio;

	rbio = container_of(work, struct btrfs_raid_bio, work);
	if (lock_stripe_add(rbio) == 0)
		rmw_rbio(rbio);
}

static void rmw_rbio_work_locked(struct work_struct *work)
{
	rmw_rbio(container_of(work, struct btrfs_raid_bio, work));
}

/*
 * The following code is used to scrub/replace the parity stripe.
 *
 * Caller must have already increased bio_counter for getting @bioc.
 *
 * Note: We need to make sure that all the pages added into the scrub/replace
 * raid bio are correct and are not changed during the scrub/replace.  That
 * is, those pages just hold metadata or file data protected by checksums.
 */

struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio,
				struct btrfs_io_context *bioc,
				struct btrfs_device *scrub_dev,
				unsigned long *dbitmap, int stripe_nsectors)
{
	struct btrfs_fs_info *fs_info = bioc->fs_info;
	struct btrfs_raid_bio *rbio;
	int i;

	rbio = alloc_rbio(fs_info, bioc);
	if (IS_ERR(rbio))
		return NULL;
	bio_list_add(&rbio->bio_list, bio);
	/*
	 * This is a special bio which is used to hold the completion handler
	 * and make the scrub rbio similar to the other types.
	 */
	ASSERT(!bio->bi_iter.bi_size);
	rbio->operation = BTRFS_RBIO_PARITY_SCRUB;

	/*
	 * After mapping bioc with BTRFS_MAP_WRITE, parities have been sorted
	 * to the end position, so this search can start from the first parity
	 * stripe.
	 */
	for (i = rbio->nr_data; i < rbio->real_stripes; i++) {
		if (bioc->stripes[i].dev == scrub_dev) {
			rbio->scrubp = i;
			break;
		}
	}
	ASSERT_RBIO_STRIPE(i < rbio->real_stripes, rbio, i);

	bitmap_copy(&rbio->dbitmap, dbitmap, stripe_nsectors);
	return rbio;
}
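
/*
 * Example of the page indexing used below, assuming 4K sectors and 4K pages:
 * each sector occupies exactly one page, so
 * index = (total_sector_nr * sectorsize) >> PAGE_SHIFT equals total_sector_nr,
 * and only the pages whose sector number is set in dbitmap get allocated.
 */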

/*
 * We just scrub the parity that we have correct data on the same horizontal,
 * so we don't need to allocate all pages for all the stripes.
 */
static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
{
	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
	int total_sector_nr;

	for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
	     total_sector_nr++) {
		struct page *page;
		int sectornr = total_sector_nr % rbio->stripe_nsectors;
		int index = (total_sector_nr * sectorsize) >> PAGE_SHIFT;

		if (!test_bit(sectornr, &rbio->dbitmap))
			continue;
		if (rbio->stripe_pages[index])
			continue;
		page = alloc_page(GFP_NOFS);
		if (!page)
			return -ENOMEM;
		rbio->stripe_pages[index] = page;
	}
	index_stripe_sectors(rbio);
	return 0;
}
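
/*
 * finish_parity_scrub() below recomputes P (and Q for RAID6) from the data
 * stripes into temporary pages, compares the result against the on-disk
 * parity of the stripe being scrubbed (rbio->scrubp), and clears the dbitmap
 * bit for every sector that already matches.  Only mismatching parity
 * sectors are written back, optionally duplicated to the dev-replace target.
 */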

static int finish_parity_scrub(struct btrfs_raid_bio *rbio)
{
	struct btrfs_io_context *bioc = rbio->bioc;
	const u32 sectorsize = bioc->fs_info->sectorsize;
	void **pointers = rbio->finish_pointers;
	unsigned long *pbitmap = &rbio->finish_pbitmap;
	int nr_data = rbio->nr_data;
	int stripe;
	int sectornr;
	bool has_qstripe;
	struct sector_ptr p_sector = { 0 };
	struct sector_ptr q_sector = { 0 };
	struct bio_list bio_list;
	int is_replace = 0;
	int ret;

	bio_list_init(&bio_list);

	if (rbio->real_stripes - rbio->nr_data == 1)
		has_qstripe = false;
	else if (rbio->real_stripes - rbio->nr_data == 2)
		has_qstripe = true;
	else
		BUG();

	/*
	 * If replace is running and our P/Q stripe is being replaced, then we
	 * need to duplicate the final write to the replace target.
	 */
	if (bioc->replace_nr_stripes && bioc->replace_stripe_src == rbio->scrubp) {
		is_replace = 1;
		bitmap_copy(pbitmap, &rbio->dbitmap, rbio->stripe_nsectors);
	}

	/*
	 * Because the higher layers (the scrubber) are unlikely to use this
	 * area of the disk again soon, don't cache it.
	 */
	clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);

	p_sector.page = alloc_page(GFP_NOFS);
	if (!p_sector.page)
		return -ENOMEM;
	p_sector.pgoff = 0;
	p_sector.uptodate = 1;

	if (has_qstripe) {
		/* RAID6, allocate and map temp space for the Q stripe */
		q_sector.page = alloc_page(GFP_NOFS);
		if (!q_sector.page) {
			__free_page(p_sector.page);
			p_sector.page = NULL;
			return -ENOMEM;
		}
		q_sector.pgoff = 0;
		q_sector.uptodate = 1;
		pointers[rbio->real_stripes - 1] = kmap_local_page(q_sector.page);
	}

	bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors);

	/* Map the parity stripe just once */
	pointers[nr_data] = kmap_local_page(p_sector.page);

	for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) {
		struct sector_ptr *sector;
		void *parity;

		/* First collect one page from each data stripe */
		for (stripe = 0; stripe < nr_data; stripe++) {
			sector = sector_in_rbio(rbio, stripe, sectornr, 0);
			pointers[stripe] = kmap_local_page(sector->page) +
					   sector->pgoff;
		}

		if (has_qstripe) {
			/* RAID6, call the library function to fill in our P/Q */
			raid6_call.gen_syndrome(rbio->real_stripes, sectorsize,
						pointers);
		} else {
			/* raid5 */
			memcpy(pointers[nr_data], pointers[0], sectorsize);
			run_xor(pointers + 1, nr_data - 1, sectorsize);
		}

		/* Check the parity being scrubbed and repair it */
		sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr);
		parity = kmap_local_page(sector->page) + sector->pgoff;
		if (memcmp(parity, pointers[rbio->scrubp], sectorsize) != 0)
			memcpy(parity, pointers[rbio->scrubp], sectorsize);
		else
			/* Parity is right, no need to write it back */
			bitmap_clear(&rbio->dbitmap, sectornr, 1);
		kunmap_local(parity);

		for (stripe = nr_data - 1; stripe >= 0; stripe--)
			kunmap_local(pointers[stripe]);
	}

	kunmap_local(pointers[nr_data]);
	__free_page(p_sector.page);
	p_sector.page = NULL;
	if (q_sector.page) {
		kunmap_local(pointers[rbio->real_stripes - 1]);
		__free_page(q_sector.page);
		q_sector.page = NULL;
	}

	/*
	 * Time to start writing.  Make bios for everything from the higher
	 * layers (the bio_list in our rbio) and our P/Q.  Ignore everything
	 * else.
	 */
	for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) {
		struct sector_ptr *sector;

		sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr);
		ret = rbio_add_io_sector(rbio, &bio_list, sector, rbio->scrubp,
					 sectornr, REQ_OP_WRITE);
		if (ret)
			goto cleanup;
	}

	if (!is_replace)
		goto submit_write;

	/*
	 * Replace is running and our parity stripe needs to be duplicated to
	 * the target device.  Check we have a valid source stripe number.
	 */
	ASSERT_RBIO(rbio->bioc->replace_stripe_src >= 0, rbio);
	for_each_set_bit(sectornr, pbitmap, rbio->stripe_nsectors) {
		struct sector_ptr *sector;

		sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr);
		ret = rbio_add_io_sector(rbio, &bio_list, sector,
					 rbio->real_stripes,
					 sectornr, REQ_OP_WRITE);
		if (ret)
			goto cleanup;
	}

submit_write:
	submit_write_bios(rbio, &bio_list);
	return 0;

cleanup:
	bio_list_put(&bio_list);
	return ret;
}

static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe)
{
	if (stripe >= 0 && stripe < rbio->nr_data)
		return 1;
	return 0;
}
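
/*
 * recover_scrub_rbio() below classifies the errors of each vertical stripe:
 * dfail counts failed data stripes and failp remembers a failed parity
 * stripe.  Repair is refused when more data stripes failed than the
 * remaining (non-scrubbed) parity can rebuild, and when both a data stripe
 * and a parity stripe failed (RAID6), the data can only be rebuilt if the
 * failed parity is the one being scrubbed, since the other parity is then
 * still trustworthy.
 */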

static int recover_scrub_rbio(struct btrfs_raid_bio *rbio)
{
	void **pointers = NULL;
	void **unmap_array = NULL;
	int sector_nr;
	int ret = 0;

	/*
	 * @pointers array stores the pointer for each sector.
	 *
	 * @unmap_array stores copy of pointers that does not get reordered
	 * during reconstruction so that kunmap_local works.
	 */
	pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
	unmap_array = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
	if (!pointers || !unmap_array) {
		ret = -ENOMEM;
		goto out;
	}

	for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) {
		int dfail = 0, failp = -1;
		int faila;
		int failb;
		int found_errors;

		found_errors = get_rbio_veritical_errors(rbio, sector_nr,
							 &faila, &failb);
		if (found_errors > rbio->bioc->max_errors) {
			ret = -EIO;
			goto out;
		}
		if (found_errors == 0)
			continue;

		/* We should have at least one error here. */
		ASSERT(faila >= 0 || failb >= 0);

		if (is_data_stripe(rbio, faila))
			dfail++;
		else if (is_parity_stripe(faila))
			failp = faila;

		if (is_data_stripe(rbio, failb))
			dfail++;
		else if (is_parity_stripe(failb))
			failp = failb;

		/*
		 * Because we cannot use the parity that is being scrubbed to
		 * repair data, our repair capability is reduced (in the RAID5
		 * case we cannot repair anything).
		 */
		if (dfail > rbio->bioc->max_errors - 1) {
			ret = -EIO;
			goto out;
		}

		/*
		 * If all the data is good and only the parity is bad, just
		 * repair the parity; no need to recover data stripes.
		 */
		if (dfail == 0)
			continue;

		/*
		 * Here we have one corrupted data stripe and one corrupted
		 * parity on RAID6.  If the corrupted parity is the one being
		 * scrubbed, luckily we can use the other one to repair the
		 * data; otherwise we cannot repair the data stripe.
		 */
		if (failp != rbio->scrubp) {
			ret = -EIO;
			goto out;
		}

		ret = recover_vertical(rbio, sector_nr, pointers, unmap_array);
		if (ret < 0)
			goto out;
	}
out:
	kfree(pointers);
	kfree(unmap_array);
	return ret;
}

static int scrub_assemble_read_bios(struct btrfs_raid_bio *rbio)
{
	struct bio_list bio_list = BIO_EMPTY_LIST;
	int total_sector_nr;
	int ret = 0;

	/* Build a list of bios to read all the missing parts. */
	for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
	     total_sector_nr++) {
		int sectornr = total_sector_nr % rbio->stripe_nsectors;
		int stripe = total_sector_nr / rbio->stripe_nsectors;
		struct sector_ptr *sector;

		/* No data in the vertical stripe, no need to read. */
		if (!test_bit(sectornr, &rbio->dbitmap))
			continue;

		/*
		 * We want to find all the sectors missing from the rbio and
		 * read them from the disk.  If sector_in_rbio() finds a sector
		 * in the bio list we don't need to read it off the stripe.
		 */
		sector = sector_in_rbio(rbio, stripe, sectornr, 1);
		if (sector)
			continue;

		sector = rbio_stripe_sector(rbio, stripe, sectornr);
		/*
		 * The bio cache may have handed us an uptodate sector.  If so,
		 * use it.
		 */
		if (sector->uptodate)
			continue;

		ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe,
					 sectornr, REQ_OP_READ);
		if (ret) {
			bio_list_put(&bio_list);
			return ret;
		}
	}

	submit_read_wait_bio_list(rbio, &bio_list);
	return 0;
}
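
/*
 * scrub_rbio() below drives the whole parity scrub: allocate the pages
 * covered by dbitmap, read the missing sectors, repair any failed vertical
 * stripes, then recompute and write back the parity sectors that differ.
 * As with RMW, success is judged by comparing the per-vertical-stripe error
 * count against bioc->max_errors after the writes complete.
 */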

static void scrub_rbio(struct btrfs_raid_bio *rbio)
{
	int sector_nr;
	int ret;

	ret = alloc_rbio_essential_pages(rbio);
	if (ret)
		goto out;

	bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors);

	ret = scrub_assemble_read_bios(rbio);
	if (ret < 0)
		goto out;

	/* We may have some failures, recover the failed sectors first. */
	ret = recover_scrub_rbio(rbio);
	if (ret < 0)
		goto out;

	/*
	 * We have every sector properly prepared.  Can finish the scrub
	 * and write back the good content.
	 */
	ret = finish_parity_scrub(rbio);
	wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0);
	for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) {
		int found_errors;

		found_errors = get_rbio_veritical_errors(rbio, sector_nr, NULL, NULL);
		if (found_errors > rbio->bioc->max_errors) {
			ret = -EIO;
			break;
		}
	}
out:
	rbio_orig_end_io(rbio, errno_to_blk_status(ret));
}

static void scrub_rbio_work_locked(struct work_struct *work)
{
	scrub_rbio(container_of(work, struct btrfs_raid_bio, work));
}

void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio)
{
	if (!lock_stripe_add(rbio))
		start_async_work(rbio, scrub_rbio_work_locked);
}
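
/*
 * Copy granularity example for raid56_parity_cache_data_pages() below,
 * assuming 4K pages and 4K sectors: BTRFS_STRIPE_LEN is 64K, so one call
 * copies 16 pages from @data_pages into the rbio's stripe_pages starting at
 * page_index; uptodate sectors are then skipped by
 * scrub_assemble_read_bios().
 */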

/*
 * This is for scrub call sites where we already have correct data contents.
 * This allows us to avoid reading data stripes again.
 *
 * Unfortunately here we have to copy the pages rather than reusing them,
 * because the rbio has its own page management for its cache.
 */
void raid56_parity_cache_data_pages(struct btrfs_raid_bio *rbio,
				    struct page **data_pages, u64 data_logical)
{
	const u64 offset_in_full_stripe = data_logical -
					  rbio->bioc->full_stripe_logical;
	const int page_index = offset_in_full_stripe >> PAGE_SHIFT;
	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
	const u32 sectors_per_page = PAGE_SIZE / sectorsize;
	int ret;

	/*
	 * If we hit ENOMEM temporarily, but the later allocation at
	 * raid56_parity_submit_scrub_rbio() time succeeds, we just do the
	 * extra read, which is not a big deal.
	 *
	 * If we hit ENOMEM later at raid56_parity_submit_scrub_rbio() time,
	 * the bio will get a proper error number set.
	 */
	ret = alloc_rbio_data_pages(rbio);
	if (ret < 0)
		return;

	/* data_logical must be at stripe boundary and inside the full stripe. */
	ASSERT(IS_ALIGNED(offset_in_full_stripe, BTRFS_STRIPE_LEN));
	ASSERT(offset_in_full_stripe < (rbio->nr_data << BTRFS_STRIPE_LEN_SHIFT));

	for (int page_nr = 0; page_nr < (BTRFS_STRIPE_LEN >> PAGE_SHIFT); page_nr++) {
		struct page *dst = rbio->stripe_pages[page_nr + page_index];
		struct page *src = data_pages[page_nr];

		memcpy_page(dst, 0, src, 0, PAGE_SIZE);
		for (int sector_nr = sectors_per_page * page_index;
		     sector_nr < sectors_per_page * (page_index + 1);
		     sector_nr++)
			rbio->stripe_sectors[sector_nr].uptodate = true;
	}
}