// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle. All rights reserved.
 * Copyright (C) 2022 Christoph Hellwig.
 */

#include <linux/bio.h>
#include "bio.h"
#include "ctree.h"
#include "volumes.h"
#include "raid56.h"
#include "async-thread.h"
#include "dev-replace.h"
#include "zoned.h"
#include "file-item.h"
#include "raid-stripe-tree.h"

static struct bio_set btrfs_bioset;
static struct bio_set btrfs_clone_bioset;
static struct bio_set btrfs_repair_bioset;
static mempool_t btrfs_failed_bio_pool;

struct btrfs_failed_bio {
	struct btrfs_bio *bbio;
	int num_copies;
	atomic_t repair_count;
};

/* Is this a data path I/O that needs storage layer checksum and repair? */
static inline bool is_data_bbio(struct btrfs_bio *bbio)
{
	return bbio->inode && is_data_inode(bbio->inode);
}

static bool bbio_has_ordered_extent(struct btrfs_bio *bbio)
{
	return is_data_bbio(bbio) && btrfs_op(&bbio->bio) == BTRFS_MAP_WRITE;
}

/*
 * Initialize a btrfs_bio structure. This skips the embedded bio itself as it
 * is already initialized by the block layer.
 */
void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_fs_info *fs_info,
		    btrfs_bio_end_io_t end_io, void *private)
{
	memset(bbio, 0, offsetof(struct btrfs_bio, bio));
	bbio->fs_info = fs_info;
	bbio->end_io = end_io;
	bbio->private = private;
	atomic_set(&bbio->pending_ios, 1);
	WRITE_ONCE(bbio->status, BLK_STS_OK);
}

/*
 * Allocate a btrfs_bio structure. The btrfs_bio is the main I/O container for
 * btrfs, and is used for all I/O submitted through btrfs_submit_bbio().
 *
 * Just like the underlying bio_alloc_bioset it will not fail as it is backed by
 * a mempool.
 */
struct btrfs_bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf,
				  struct btrfs_fs_info *fs_info,
				  btrfs_bio_end_io_t end_io, void *private)
{
	struct btrfs_bio *bbio;
	struct bio *bio;

	bio = bio_alloc_bioset(NULL, nr_vecs, opf, GFP_NOFS, &btrfs_bioset);
	bbio = btrfs_bio(bio);
	btrfs_bio_init(bbio, fs_info, end_io, private);
	return bbio;
}

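/*
 * Split off the first map_length bytes of @orig_bbio into a new btrfs_bio.
 *
 * The split bbio is chained to the original through its ->private pointer and
 * the original's pending_ios count, so completion of the split always funnels
 * back into btrfs_bio_end_io() on the original bbio.
 */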
static struct btrfs_bio *btrfs_split_bio(struct btrfs_fs_info *fs_info,
					 struct btrfs_bio *orig_bbio,
					 u64 map_length)
{
	struct btrfs_bio *bbio;
	struct bio *bio;

	bio = bio_split(&orig_bbio->bio, map_length >> SECTOR_SHIFT, GFP_NOFS,
			&btrfs_clone_bioset);
	bbio = btrfs_bio(bio);
	btrfs_bio_init(bbio, fs_info, NULL, orig_bbio);
	bbio->inode = orig_bbio->inode;
	bbio->file_offset = orig_bbio->file_offset;
	orig_bbio->file_offset += map_length;
	if (bbio_has_ordered_extent(bbio)) {
		refcount_inc(&orig_bbio->ordered->refs);
		bbio->ordered = orig_bbio->ordered;
	}
	atomic_inc(&orig_bbio->pending_ios);
	return bbio;
}

/* Free a bio that was never submitted to the underlying device. */
static void btrfs_cleanup_bio(struct btrfs_bio *bbio)
{
	if (bbio_has_ordered_extent(bbio))
		btrfs_put_ordered_extent(bbio->ordered);
	bio_put(&bbio->bio);
}

static void __btrfs_bio_end_io(struct btrfs_bio *bbio)
{
	if (bbio_has_ordered_extent(bbio)) {
		struct btrfs_ordered_extent *ordered = bbio->ordered;

		bbio->end_io(bbio);
		btrfs_put_ordered_extent(ordered);
	} else {
		bbio->end_io(bbio);
	}
}

void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status)
{
	bbio->bio.bi_status = status;
	if (bbio->bio.bi_pool == &btrfs_clone_bioset) {
		struct btrfs_bio *orig_bbio = bbio->private;

		btrfs_cleanup_bio(bbio);
		bbio = orig_bbio;
	}

	/*
	 * At this point, bbio always points to the original btrfs_bio. Save
	 * the first error in it.
	 */
	if (status != BLK_STS_OK)
		cmpxchg(&bbio->status, BLK_STS_OK, status);

	if (atomic_dec_and_test(&bbio->pending_ios)) {
		/* Load split bio's error which might be set above. */
		if (status == BLK_STS_OK)
			bbio->bio.bi_status = READ_ONCE(bbio->status);
		__btrfs_bio_end_io(bbio);
	}
}

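/*
 * Mirror numbers are 1-based. The helpers below cycle forwards and backwards
 * through the available copies, wrapping around at num_copies, so a repair
 * attempt can visit every other mirror exactly once starting from the one
 * that failed.
 */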
static int next_repair_mirror(struct btrfs_failed_bio *fbio, int cur_mirror)
{
	if (cur_mirror == fbio->num_copies)
		return cur_mirror + 1 - fbio->num_copies;
	return cur_mirror + 1;
}

static int prev_repair_mirror(struct btrfs_failed_bio *fbio, int cur_mirror)
{
	if (cur_mirror == 1)
		return fbio->num_copies;
	return cur_mirror - 1;
}

static void btrfs_repair_done(struct btrfs_failed_bio *fbio)
{
	if (atomic_dec_and_test(&fbio->repair_count)) {
		btrfs_bio_end_io(fbio->bbio, fbio->bbio->bio.bi_status);
		mempool_free(fbio, &btrfs_failed_bio_pool);
	}
}

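/*
 * End I/O handler for a repair read. If this mirror also failed, the read is
 * re-submitted to the next mirror; once a good copy has been found, it is
 * written back to the bad mirror(s) that were tried before it.
 */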
static void btrfs_end_repair_bio(struct btrfs_bio *repair_bbio,
				 struct btrfs_device *dev)
{
	struct btrfs_failed_bio *fbio = repair_bbio->private;
	struct btrfs_inode *inode = repair_bbio->inode;
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct bio_vec *bv = bio_first_bvec_all(&repair_bbio->bio);
	int mirror = repair_bbio->mirror_num;

	/*
	 * We can only trigger this for data bio, which doesn't support larger
	 * folios yet.
	 */
	ASSERT(folio_order(page_folio(bv->bv_page)) == 0);

	if (repair_bbio->bio.bi_status ||
	    !btrfs_data_csum_ok(repair_bbio, dev, 0, bv)) {
		bio_reset(&repair_bbio->bio, NULL, REQ_OP_READ);
		repair_bbio->bio.bi_iter = repair_bbio->saved_iter;

		mirror = next_repair_mirror(fbio, mirror);
		if (mirror == fbio->bbio->mirror_num) {
			btrfs_debug(fs_info, "no mirror left");
			fbio->bbio->bio.bi_status = BLK_STS_IOERR;
			goto done;
		}

		btrfs_submit_bbio(repair_bbio, mirror);
		return;
	}

	do {
		mirror = prev_repair_mirror(fbio, mirror);
		btrfs_repair_io_failure(fs_info, btrfs_ino(inode),
				repair_bbio->file_offset, fs_info->sectorsize,
				repair_bbio->saved_iter.bi_sector << SECTOR_SHIFT,
				page_folio(bv->bv_page), bv->bv_offset, mirror);
	} while (mirror != fbio->bbio->mirror_num);

done:
	btrfs_repair_done(fbio);
	bio_put(&repair_bbio->bio);
}

/*
 * Try to kick off a repair read to the next available mirror for a bad sector.
 *
 * This primarily tries to recover good data to serve the actual read request,
 * but also tries to write the good data back to the bad mirror(s) when a
 * read succeeded to restore the redundancy.
 */
static struct btrfs_failed_bio *repair_one_sector(struct btrfs_bio *failed_bbio,
						  u32 bio_offset,
						  struct bio_vec *bv,
						  struct btrfs_failed_bio *fbio)
{
	struct btrfs_inode *inode = failed_bbio->inode;
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	const u32 sectorsize = fs_info->sectorsize;
	const u64 logical = (failed_bbio->saved_iter.bi_sector << SECTOR_SHIFT);
	struct btrfs_bio *repair_bbio;
	struct bio *repair_bio;
	int num_copies;
	int mirror;

	btrfs_debug(fs_info, "repair read error: read error at %llu",
		    failed_bbio->file_offset + bio_offset);

	num_copies = btrfs_num_copies(fs_info, logical, sectorsize);
	if (num_copies == 1) {
		btrfs_debug(fs_info, "no copy to repair from");
		failed_bbio->bio.bi_status = BLK_STS_IOERR;
		return fbio;
	}

	if (!fbio) {
		fbio = mempool_alloc(&btrfs_failed_bio_pool, GFP_NOFS);
		fbio->bbio = failed_bbio;
		fbio->num_copies = num_copies;
		atomic_set(&fbio->repair_count, 1);
	}

	atomic_inc(&fbio->repair_count);

	repair_bio = bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_NOFS,
				      &btrfs_repair_bioset);
	repair_bio->bi_iter.bi_sector = failed_bbio->saved_iter.bi_sector;
	__bio_add_page(repair_bio, bv->bv_page, bv->bv_len, bv->bv_offset);

	repair_bbio = btrfs_bio(repair_bio);
	btrfs_bio_init(repair_bbio, fs_info, NULL, fbio);
	repair_bbio->inode = failed_bbio->inode;
	repair_bbio->file_offset = failed_bbio->file_offset + bio_offset;

	mirror = next_repair_mirror(fbio, failed_bbio->mirror_num);
	btrfs_debug(fs_info, "submitting repair read to mirror %d", mirror);
	btrfs_submit_bbio(repair_bbio, mirror);
	return fbio;
}

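/*
 * Verify the checksum of every sector of a completed data read and kick off a
 * repair read for each sector that fails verification. The bio is only ended
 * here when no repair was needed; otherwise completion is deferred until the
 * last repair finishes in btrfs_repair_done().
 */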
static void btrfs_check_read_bio(struct btrfs_bio *bbio, struct btrfs_device *dev)
{
	struct btrfs_inode *inode = bbio->inode;
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	u32 sectorsize = fs_info->sectorsize;
	struct bvec_iter *iter = &bbio->saved_iter;
	blk_status_t status = bbio->bio.bi_status;
	struct btrfs_failed_bio *fbio = NULL;
	u32 offset = 0;

	/* Read-repair requires the inode field to be set by the submitter. */
	ASSERT(inode);

	/*
	 * Hand off repair bios to the repair code as there is no upper level
	 * submitter for them.
	 */
	if (bbio->bio.bi_pool == &btrfs_repair_bioset) {
		btrfs_end_repair_bio(bbio, dev);
		return;
	}

	/* Clear the I/O error. A failed repair will reset it. */
	bbio->bio.bi_status = BLK_STS_OK;

	while (iter->bi_size) {
		struct bio_vec bv = bio_iter_iovec(&bbio->bio, *iter);

		bv.bv_len = min(bv.bv_len, sectorsize);
		if (status || !btrfs_data_csum_ok(bbio, dev, offset, &bv))
			fbio = repair_one_sector(bbio, offset, &bv, fbio);

		bio_advance_iter_single(&bbio->bio, iter, sectorsize);
		offset += sectorsize;
	}

	if (bbio->csum != bbio->csum_inline)
		kfree(bbio->csum);

	if (fbio)
		btrfs_repair_done(fbio);
	else
		btrfs_bio_end_io(bbio, bbio->bio.bi_status);
}

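/* Record an I/O error in the per-device statistics, ignoring readahead failures. */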
static void btrfs_log_dev_io_error(struct bio *bio, struct btrfs_device *dev)
{
	if (!dev || !dev->bdev)
		return;
	if (bio->bi_status != BLK_STS_IOERR && bio->bi_status != BLK_STS_TARGET)
		return;

	if (btrfs_op(bio) == BTRFS_MAP_WRITE)
		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
	else if (!(bio->bi_opf & REQ_RAHEAD))
		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
	if (bio->bi_opf & REQ_PREFLUSH)
		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_FLUSH_ERRS);
}

static struct workqueue_struct *btrfs_end_io_wq(struct btrfs_fs_info *fs_info,
						struct bio *bio)
{
	if (bio->bi_opf & REQ_META)
		return fs_info->endio_meta_workers;
	return fs_info->endio_workers;
}

static void btrfs_end_bio_work(struct work_struct *work)
{
	struct btrfs_bio *bbio = container_of(work, struct btrfs_bio, end_io_work);

	/* Metadata reads are checked and repaired by the submitter. */
	if (is_data_bbio(bbio))
		btrfs_check_read_bio(bbio, bbio->bio.bi_private);
	else
		btrfs_bio_end_io(bbio, bbio->bio.bi_status);
}

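/*
 * End I/O handler for bios that were mapped to a single device. Reads are
 * punted to a workqueue so that checksum verification and repair can block,
 * while writes are completed directly.
 */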
static void btrfs_simple_end_io(struct bio *bio)
{
	struct btrfs_bio *bbio = btrfs_bio(bio);
	struct btrfs_device *dev = bio->bi_private;
	struct btrfs_fs_info *fs_info = bbio->fs_info;

	btrfs_bio_counter_dec(fs_info);

	if (bio->bi_status)
		btrfs_log_dev_io_error(bio, dev);

	if (bio_op(bio) == REQ_OP_READ) {
		INIT_WORK(&bbio->end_io_work, btrfs_end_bio_work);
		queue_work(btrfs_end_io_wq(fs_info, bio), &bbio->end_io_work);
	} else {
		if (bio_op(bio) == REQ_OP_ZONE_APPEND && !bio->bi_status)
			btrfs_record_physical_zoned(bbio);
		btrfs_bio_end_io(bbio, bbio->bio.bi_status);
	}
}

static void btrfs_raid56_end_io(struct bio *bio)
{
	struct btrfs_io_context *bioc = bio->bi_private;
	struct btrfs_bio *bbio = btrfs_bio(bio);

	btrfs_bio_counter_dec(bioc->fs_info);
	bbio->mirror_num = bioc->mirror_num;
	if (bio_op(bio) == REQ_OP_READ && is_data_bbio(bbio))
		btrfs_check_read_bio(bbio, NULL);
	else
		btrfs_bio_end_io(bbio, bbio->bio.bi_status);

	btrfs_put_bioc(bioc);
}

static void btrfs_orig_write_end_io(struct bio *bio)
{
	struct btrfs_io_stripe *stripe = bio->bi_private;
	struct btrfs_io_context *bioc = stripe->bioc;
	struct btrfs_bio *bbio = btrfs_bio(bio);

	btrfs_bio_counter_dec(bioc->fs_info);

	if (bio->bi_status) {
		atomic_inc(&bioc->error);
		btrfs_log_dev_io_error(bio, stripe->dev);
	}

	/*
	 * Only send an error to the higher layers if it is beyond the tolerance
	 * threshold.
	 */
	if (atomic_read(&bioc->error) > bioc->max_errors)
		bio->bi_status = BLK_STS_IOERR;
	else
		bio->bi_status = BLK_STS_OK;

	if (bio_op(bio) == REQ_OP_ZONE_APPEND && !bio->bi_status)
		stripe->physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;

	btrfs_bio_end_io(bbio, bbio->bio.bi_status);
	btrfs_put_bioc(bioc);
}

static void btrfs_clone_write_end_io(struct bio *bio)
{
	struct btrfs_io_stripe *stripe = bio->bi_private;

	if (bio->bi_status) {
		atomic_inc(&stripe->bioc->error);
		btrfs_log_dev_io_error(bio, stripe->dev);
	} else if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
		stripe->physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
	}

	/* Pass on control to the original bio this one was cloned from */
	bio_endio(stripe->bioc->orig_bio);
	bio_put(bio);
}

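/*
 * Send a bio to its final destination device, or complete it with an error if
 * the device is missing or not writeable.
 */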
static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio)
{
	if (!dev || !dev->bdev ||
	    test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) ||
	    (btrfs_op(bio) == BTRFS_MAP_WRITE &&
	     !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) {
		bio_io_error(bio);
		return;
	}

	bio_set_dev(bio, dev->bdev);

	/*
	 * For zone append writing, bi_sector must point to the beginning of the
	 * zone.
	 */
	if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
		u64 physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
		u64 zone_start = round_down(physical, dev->fs_info->zone_size);

		ASSERT(btrfs_dev_is_sequential(dev, physical));
		bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT;
	}
	btrfs_debug_in_rcu(dev->fs_info,
		"%s: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u",
		__func__, bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector,
		(unsigned long)dev->bdev->bd_dev, btrfs_dev_name(dev),
		dev->devid, bio->bi_iter.bi_size);

	if (bio->bi_opf & REQ_BTRFS_CGROUP_PUNT)
		blkcg_punt_bio_submit(bio);
	else
		submit_bio(bio);
}

static void btrfs_submit_mirrored_bio(struct btrfs_io_context *bioc, int dev_nr)
{
	struct bio *orig_bio = bioc->orig_bio, *bio;

	ASSERT(bio_op(orig_bio) != REQ_OP_READ);

	/* Reuse the bio embedded into the btrfs_bio for the last mirror */
	if (dev_nr == bioc->num_stripes - 1) {
		bio = orig_bio;
		bio->bi_end_io = btrfs_orig_write_end_io;
	} else {
		bio = bio_alloc_clone(NULL, orig_bio, GFP_NOFS, &fs_bio_set);
		bio_inc_remaining(orig_bio);
		bio->bi_end_io = btrfs_clone_write_end_io;
	}

	bio->bi_private = &bioc->stripes[dev_nr];
	bio->bi_iter.bi_sector = bioc->stripes[dev_nr].physical >> SECTOR_SHIFT;
	bioc->stripes[dev_nr].bioc = bioc;
	bioc->size = bio->bi_iter.bi_size;
	btrfs_submit_dev_bio(bioc->stripes[dev_nr].dev, bio);
}

static void btrfs_submit_bio(struct bio *bio, struct btrfs_io_context *bioc,
			     struct btrfs_io_stripe *smap, int mirror_num)
{
	if (!bioc) {
		/* Single mirror read/write fast path. */
		btrfs_bio(bio)->mirror_num = mirror_num;
		bio->bi_iter.bi_sector = smap->physical >> SECTOR_SHIFT;
		if (bio_op(bio) != REQ_OP_READ)
			btrfs_bio(bio)->orig_physical = smap->physical;
		bio->bi_private = smap->dev;
		bio->bi_end_io = btrfs_simple_end_io;
		btrfs_submit_dev_bio(smap->dev, bio);
	} else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
		/* Parity RAID write or read recovery. */
		bio->bi_private = bioc;
		bio->bi_end_io = btrfs_raid56_end_io;
		if (bio_op(bio) == REQ_OP_READ)
			raid56_parity_recover(bio, bioc, mirror_num);
		else
			raid56_parity_write(bio, bioc);
	} else {
		/* Write to multiple mirrors. */
		int total_devs = bioc->num_stripes;

		bioc->orig_bio = bio;
		for (int dev_nr = 0; dev_nr < total_devs; dev_nr++)
			btrfs_submit_mirrored_bio(bioc, dev_nr);
	}
}

static blk_status_t btrfs_bio_csum(struct btrfs_bio *bbio)
{
	if (bbio->bio.bi_opf & REQ_META)
		return btree_csum_one_bio(bbio);
	return btrfs_csum_one_bio(bbio);
}

/*
 * Async submit bios are used to offload expensive checksumming onto the worker
 * threads.
 */
struct async_submit_bio {
	struct btrfs_bio *bbio;
	struct btrfs_io_context *bioc;
	struct btrfs_io_stripe smap;
	int mirror_num;
	struct btrfs_work work;
};

/*
 * In order to insert checksums into the metadata in large chunks, we wait
 * until bio submission time. All the pages in the bio are checksummed and
 * sums are attached onto the ordered extent record.
 *
 * At IO completion time the csums attached on the ordered extent record are
 * inserted into the btree.
 */
static void run_one_async_start(struct btrfs_work *work)
{
	struct async_submit_bio *async =
		container_of(work, struct async_submit_bio, work);
	blk_status_t ret;

	ret = btrfs_bio_csum(async->bbio);
	if (ret)
		async->bbio->bio.bi_status = ret;
}

/*
 * In order to insert checksums into the metadata in large chunks, we wait
 * until bio submission time. All the pages in the bio are checksummed and
 * sums are attached onto the ordered extent record.
 *
 * At IO completion time the csums attached on the ordered extent record are
 * inserted into the tree.
 *
 * If called with @do_free == true, then it will free the work struct.
 */
static void run_one_async_done(struct btrfs_work *work, bool do_free)
{
	struct async_submit_bio *async =
		container_of(work, struct async_submit_bio, work);
	struct bio *bio = &async->bbio->bio;

	if (do_free) {
		kfree(container_of(work, struct async_submit_bio, work));
		return;
	}

	/* If an error occurred we just want to clean up the bio and move on. */
	if (bio->bi_status) {
		btrfs_bio_end_io(async->bbio, async->bbio->bio.bi_status);
		return;
	}

	/*
	 * All of the bios that pass through here are from async helpers.
	 * Use REQ_BTRFS_CGROUP_PUNT to issue them from the owning cgroup's
	 * context. This changes nothing when cgroups aren't in use.
	 */
	bio->bi_opf |= REQ_BTRFS_CGROUP_PUNT;
	btrfs_submit_bio(bio, async->bioc, &async->smap, async->mirror_num);
}

static bool should_async_write(struct btrfs_bio *bbio)
{
	bool auto_csum_mode = true;

#ifdef CONFIG_BTRFS_EXPERIMENTAL
	struct btrfs_fs_devices *fs_devices = bbio->fs_info->fs_devices;
	enum btrfs_offload_csum_mode csum_mode = READ_ONCE(fs_devices->offload_csum_mode);

	if (csum_mode == BTRFS_OFFLOAD_CSUM_FORCE_OFF)
		return false;

	auto_csum_mode = (csum_mode == BTRFS_OFFLOAD_CSUM_AUTO);
#endif

	/* Submit synchronously if the checksum implementation is fast. */
	if (auto_csum_mode && test_bit(BTRFS_FS_CSUM_IMPL_FAST, &bbio->fs_info->flags))
		return false;

	/*
	 * Try to defer the submission to a workqueue to parallelize the
	 * checksum calculation unless the I/O is issued synchronously.
	 */
	if (op_is_sync(bbio->bio.bi_opf))
		return false;

	/* Zoned devices require I/O to be submitted in order. */
	if ((bbio->bio.bi_opf & REQ_META) && btrfs_is_zoned(bbio->fs_info))
		return false;

	return true;
}

/*
 * Submit bio to an async queue.
 *
 * Return true if the work has been successfully submitted, else false.
 */
static bool btrfs_wq_submit_bio(struct btrfs_bio *bbio,
				struct btrfs_io_context *bioc,
				struct btrfs_io_stripe *smap, int mirror_num)
{
	struct btrfs_fs_info *fs_info = bbio->fs_info;
	struct async_submit_bio *async;

	async = kmalloc(sizeof(*async), GFP_NOFS);
	if (!async)
		return false;

	async->bbio = bbio;
	async->bioc = bioc;
	async->smap = *smap;
	async->mirror_num = mirror_num;

	btrfs_init_work(&async->work, run_one_async_start, run_one_async_done);
	btrfs_queue_work(fs_info->workers, &async->work);
	return true;
}

static u64 btrfs_append_map_length(struct btrfs_bio *bbio, u64 map_length)
{
	unsigned int nr_segs;
	int sector_offset;

	map_length = min(map_length, bbio->fs_info->max_zone_append_size);
	sector_offset = bio_split_rw_at(&bbio->bio, &bbio->fs_info->limits,
					&nr_segs, map_length);
	if (sector_offset)
		return sector_offset << SECTOR_SHIFT;
	return map_length;
}

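/*
 * Map and submit the part of a bbio that fits into a single chunk. Bios that
 * span a chunk boundary are split first and only the first part is submitted
 * here. Returns false when a remaining part still needs to be submitted by
 * the caller, true when the whole bio has been consumed or an error ended it.
 */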
static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num)
{
	struct btrfs_inode *inode = bbio->inode;
	struct btrfs_fs_info *fs_info = bbio->fs_info;
	struct bio *bio = &bbio->bio;
	u64 logical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
	u64 length = bio->bi_iter.bi_size;
	u64 map_length = length;
	bool use_append = btrfs_use_zone_append(bbio);
	struct btrfs_io_context *bioc = NULL;
	struct btrfs_io_stripe smap;
	blk_status_t ret;
	int error;

	if (!bbio->inode || btrfs_is_data_reloc_root(inode->root))
		smap.rst_search_commit_root = true;
	else
		smap.rst_search_commit_root = false;

	btrfs_bio_counter_inc_blocked(fs_info);
	error = btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length,
				&bioc, &smap, &mirror_num);
	if (error) {
		ret = errno_to_blk_status(error);
		goto fail;
	}

	map_length = min(map_length, length);
	if (use_append)
		map_length = btrfs_append_map_length(bbio, map_length);

	if (map_length < length) {
		bbio = btrfs_split_bio(fs_info, bbio, map_length);
		bio = &bbio->bio;
	}

	/*
	 * Save the iter for the end_io handler and preload the checksums for
	 * data reads.
	 */
	if (bio_op(bio) == REQ_OP_READ && is_data_bbio(bbio)) {
		bbio->saved_iter = bio->bi_iter;
		ret = btrfs_lookup_bio_sums(bbio);
		if (ret)
			goto fail;
	}

	if (btrfs_op(bio) == BTRFS_MAP_WRITE) {
		if (use_append) {
			bio->bi_opf &= ~REQ_OP_WRITE;
			bio->bi_opf |= REQ_OP_ZONE_APPEND;
		}

		if (is_data_bbio(bbio) && bioc &&
		    btrfs_need_stripe_tree_update(bioc->fs_info, bioc->map_type)) {
			/*
			 * No locking for the list update, as we only add to
			 * the list in the I/O submission path, and list
			 * iteration only happens in the completion path, which
			 * can't happen until after the last submission.
			 */
			btrfs_get_bioc(bioc);
			list_add_tail(&bioc->rst_ordered_entry, &bbio->ordered->bioc_list);
		}

		/*
		 * Csum items for reloc roots have already been cloned at this
		 * point, so they are handled as part of the no-checksum case.
		 */
		if (inode && !(inode->flags & BTRFS_INODE_NODATASUM) &&
		    !test_bit(BTRFS_FS_STATE_NO_DATA_CSUMS, &fs_info->fs_state) &&
		    !btrfs_is_data_reloc_root(inode->root)) {
			if (should_async_write(bbio) &&
			    btrfs_wq_submit_bio(bbio, bioc, &smap, mirror_num))
				goto done;

			ret = btrfs_bio_csum(bbio);
			if (ret)
				goto fail;
		} else if (use_append ||
			   (btrfs_is_zoned(fs_info) && inode &&
			    inode->flags & BTRFS_INODE_NODATASUM)) {
			ret = btrfs_alloc_dummy_sum(bbio);
			if (ret)
				goto fail;
		}
	}

	btrfs_submit_bio(bio, bioc, &smap, mirror_num);
done:
	return map_length == length;

fail:
	btrfs_bio_counter_dec(fs_info);
	/*
	 * We have split the original bbio, now we have to end both the current
	 * @bbio and remaining one, as the remaining one will never be submitted.
	 */
	if (map_length < length) {
		struct btrfs_bio *remaining = bbio->private;

		ASSERT(bbio->bio.bi_pool == &btrfs_clone_bioset);
		ASSERT(remaining);

		btrfs_bio_end_io(remaining, ret);
	}
	btrfs_bio_end_io(bbio, ret);
	/* Do not submit another chunk */
	return true;
}

void btrfs_submit_bbio(struct btrfs_bio *bbio, int mirror_num)
{
	/* If bbio->inode is not populated, its file_offset must be 0. */
	ASSERT(bbio->inode || bbio->file_offset == 0);

	while (!btrfs_submit_chunk(bbio, mirror_num))
		;
}

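/*
 * Illustrative only (not part of this file): a typical submitter allocates a
 * btrfs_bio, attaches its pages and completion handler, and hands it off to
 * btrfs_submit_bbio(), roughly like:
 *
 *	bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info,
 *			       my_end_io, my_private);
 *	bbio->inode = inode;
 *	bbio->file_offset = file_offset;
 *	bio_add_folio(&bbio->bio, folio, len, folio_offset);
 *	bbio->bio.bi_iter.bi_sector = logical >> SECTOR_SHIFT;
 *	btrfs_submit_bbio(bbio, 0);
 *
 * Here my_end_io, my_private and the local variables are caller-defined
 * placeholders. Mirror number 0 lets the chunk mapping code pick a mirror;
 * the end_io callback runs once all split parts have completed.
 */
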
/*
 * Submit a repair write.
 *
 * This bypasses btrfs_submit_bbio() deliberately, as that writes all copies in a
 * RAID setup. Here we only want to write the one bad copy, so we do the
 * mapping ourselves and submit the bio directly.
 *
 * The I/O is issued synchronously to block the repair read completion from
 * freeing the bio.
 */
int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
			    u64 length, u64 logical, struct folio *folio,
			    unsigned int folio_offset, int mirror_num)
{
	struct btrfs_io_stripe smap = { 0 };
	struct bio_vec bvec;
	struct bio bio;
	int ret = 0;

	ASSERT(!(fs_info->sb->s_flags & SB_RDONLY));
	BUG_ON(!mirror_num);

	if (btrfs_repair_one_zone(fs_info, logical))
		return 0;

	/*
	 * Avoid races with device replace and make sure our bioc has devices
	 * associated to its stripes that don't go away while we are doing the
	 * read repair operation.
	 */
	btrfs_bio_counter_inc_blocked(fs_info);
	ret = btrfs_map_repair_block(fs_info, &smap, logical, length, mirror_num);
	if (ret < 0)
		goto out_counter_dec;

	if (!smap.dev->bdev ||
	    !test_bit(BTRFS_DEV_STATE_WRITEABLE, &smap.dev->dev_state)) {
		ret = -EIO;
		goto out_counter_dec;
	}

	bio_init(&bio, smap.dev->bdev, &bvec, 1, REQ_OP_WRITE | REQ_SYNC);
	bio.bi_iter.bi_sector = smap.physical >> SECTOR_SHIFT;
	ret = bio_add_folio(&bio, folio, length, folio_offset);
	ASSERT(ret);
	ret = submit_bio_wait(&bio);
	if (ret) {
		/* try to remap that extent elsewhere? */
		btrfs_dev_stat_inc_and_print(smap.dev, BTRFS_DEV_STAT_WRITE_ERRS);
		goto out_bio_uninit;
	}

	btrfs_info_rl_in_rcu(fs_info,
		"read error corrected: ino %llu off %llu (dev %s sector %llu)",
		ino, start, btrfs_dev_name(smap.dev),
		smap.physical >> SECTOR_SHIFT);
	ret = 0;

out_bio_uninit:
	bio_uninit(&bio);
out_counter_dec:
	btrfs_bio_counter_dec(fs_info);
	return ret;
}

/*
 * Submit a btrfs_bio based repair write.
 *
 * If @dev_replace is true, the write would be submitted to dev-replace target.
 */
void btrfs_submit_repair_write(struct btrfs_bio *bbio, int mirror_num, bool dev_replace)
{
	struct btrfs_fs_info *fs_info = bbio->fs_info;
	u64 logical = bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT;
	u64 length = bbio->bio.bi_iter.bi_size;
	struct btrfs_io_stripe smap = { 0 };
	int ret;

	ASSERT(mirror_num > 0);
	ASSERT(btrfs_op(&bbio->bio) == BTRFS_MAP_WRITE);
	ASSERT(!bbio->inode);

	btrfs_bio_counter_inc_blocked(fs_info);
	ret = btrfs_map_repair_block(fs_info, &smap, logical, length, mirror_num);
	if (ret < 0)
		goto fail;

	if (dev_replace) {
		ASSERT(smap.dev == fs_info->dev_replace.srcdev);
		smap.dev = fs_info->dev_replace.tgtdev;
	}
	btrfs_submit_bio(&bbio->bio, NULL, &smap, mirror_num);
	return;

fail:
	btrfs_bio_counter_dec(fs_info);
	btrfs_bio_end_io(bbio, errno_to_blk_status(ret));
}

int __init btrfs_bioset_init(void)
{
	if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE,
			offsetof(struct btrfs_bio, bio),
			BIOSET_NEED_BVECS))
		return -ENOMEM;
	if (bioset_init(&btrfs_clone_bioset, BIO_POOL_SIZE,
			offsetof(struct btrfs_bio, bio), 0))
		goto out_free_bioset;
	if (bioset_init(&btrfs_repair_bioset, BIO_POOL_SIZE,
			offsetof(struct btrfs_bio, bio),
			BIOSET_NEED_BVECS))
		goto out_free_clone_bioset;
	if (mempool_init_kmalloc_pool(&btrfs_failed_bio_pool, BIO_POOL_SIZE,
				      sizeof(struct btrfs_failed_bio)))
		goto out_free_repair_bioset;
	return 0;

out_free_repair_bioset:
	bioset_exit(&btrfs_repair_bioset);
out_free_clone_bioset:
	bioset_exit(&btrfs_clone_bioset);
out_free_bioset:
	bioset_exit(&btrfs_bioset);
	return -ENOMEM;
}

void __cold btrfs_bioset_exit(void)
{
	mempool_exit(&btrfs_failed_bio_pool);
	bioset_exit(&btrfs_repair_bioset);
	bioset_exit(&btrfs_clone_bioset);
	bioset_exit(&btrfs_bioset);
}