// SPDX-License-Identifier: GPL-2.0
/*
 * Some low level IO code, and hacks for various block layer limitations
 *
 * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
 * Copyright 2012 Google, Inc.
 */

#include "alloc_background.h"
#include "alloc_foreground.h"
#include "btree_update.h"
#include "data_update.h"
#include "disk_groups.h"
#include "subvolume.h"

#include <linux/sched/mm.h>
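
/*
 * Congestion check: sum each device's decayed congestion counter for the
 * devices in @target and compare the total against a random threshold, so
 * reads (and promotes) back off from busy targets probabilistically.
 */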
#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT

static bool bch2_target_congested(struct bch_fs *c, u16 target)
{
	const struct bch_devs_mask *devs;
	unsigned d, nr = 0, total = 0;
	u64 now = local_clock(), last;
	s64 congested;
	struct bch_dev *ca;

	if (!target)
		return false;

	rcu_read_lock();
	devs = bch2_target_to_mask(c, target) ?:
		&c->rw_devs[BCH_DATA_user];

	for_each_set_bit(d, devs->d, BCH_SB_MEMBERS_MAX) {
		ca = rcu_dereference(c->devs[d]);
		if (!ca)
			continue;

		congested = atomic_read(&ca->congested);
		last = READ_ONCE(ca->congested_last);
		if (time_after64(now, last))
			congested -= (now - last) >> 12;

		total += max(congested, 0LL);
		nr++;
	}
	rcu_read_unlock();

	return bch2_rand_range(nr * CONGESTED_MAX) < total;
}

#else

static bool bch2_target_congested(struct bch_fs *c, u16 target)
{
	return false;
}

#endif
/* Cache promotion on read */

struct promote_op {
	struct rcu_head		rcu;
	u64			start_time;

	struct rhash_head	hash;
	struct bpos		pos;

	struct data_update	write;
	struct bio_vec		bi_inline_vecs[]; /* must be last */
};

static const struct rhashtable_params bch_promote_params = {
	.head_offset		= offsetof(struct promote_op, hash),
	.key_offset		= offsetof(struct promote_op, pos),
	.key_len		= sizeof(struct bpos),
	.automatic_shrinking	= true,
};
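
/*
 * Decide whether this read should also promote (cache) the extent on the
 * promote target: returns 0 if a promote should be attempted, or a
 * -BCH_ERR_nopromote_* code explaining why not.
 */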
static inline int should_promote(struct bch_fs *c, struct bkey_s_c k,
				 struct bpos pos,
				 struct bch_io_opts opts,
				 unsigned flags,
				 struct bch_io_failures *failed)
{
	if (!failed) {
		BUG_ON(!opts.promote_target);

		if (!(flags & BCH_READ_MAY_PROMOTE))
			return -BCH_ERR_nopromote_may_not;

		if (bch2_bkey_has_target(c, k, opts.promote_target))
			return -BCH_ERR_nopromote_already_promoted;

		if (bkey_extent_is_unwritten(k))
			return -BCH_ERR_nopromote_unwritten;

		if (bch2_target_congested(c, opts.promote_target))
			return -BCH_ERR_nopromote_congested;
	}

	if (rhashtable_lookup_fast(&c->promote_table, &pos,
				   bch_promote_params))
		return -BCH_ERR_nopromote_in_flight;

	return 0;
}
static void promote_free(struct bch_fs *c, struct promote_op *op)
{
	int ret;

	bch2_data_update_exit(&op->write);

	ret = rhashtable_remove_fast(&c->promote_table, &op->hash,
				     bch_promote_params);
	BUG_ON(ret);
	bch2_write_ref_put(c, BCH_WRITE_REF_promote);
	kfree_rcu(op, rcu);
}

static void promote_done(struct bch_write_op *wop)
{
	struct promote_op *op =
		container_of(wop, struct promote_op, write.op);
	struct bch_fs *c = op->write.op.c;

	bch2_time_stats_update(&c->times[BCH_TIME_data_promote],
			       op->start_time);
	promote_free(c, op);
}
static void promote_start(struct promote_op *op, struct bch_read_bio *rbio)
{
	struct bio *bio = &op->write.op.wbio.bio;

	trace_and_count(op->write.op.c, read_promote, &rbio->bio);

	/* we now own pages: */
	BUG_ON(!rbio->bounce);
	BUG_ON(rbio->bio.bi_vcnt > bio->bi_max_vecs);

	memcpy(bio->bi_io_vec, rbio->bio.bi_io_vec,
	       sizeof(struct bio_vec) * rbio->bio.bi_vcnt);
	swap(bio->bi_vcnt, rbio->bio.bi_vcnt);

	bch2_data_update_read_done(&op->write, rbio->pick.crc);
}
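
/*
 * Allocate and set up a promote operation: a bounce rbio for the read, plus a
 * data_update that will rewrite the extent to the promote target (or, in the
 * failure-recovery case, rewrite the pointers that failed).
 */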
static struct promote_op *__promote_alloc(struct btree_trans *trans,
					  enum btree_id btree_id,
					  struct bkey_s_c k,
					  struct bpos pos,
					  struct extent_ptr_decoded *pick,
					  struct bch_io_opts opts,
					  unsigned sectors,
					  struct bch_read_bio **rbio,
					  struct bch_io_failures *failed)
{
	struct bch_fs *c = trans->c;
	struct promote_op *op = NULL;
	struct bio *bio;
	unsigned pages = DIV_ROUND_UP(sectors, PAGE_SECTORS);
	int ret;

	if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_promote))
		return ERR_PTR(-BCH_ERR_nopromote_no_writes);

	op = kzalloc(struct_size(op, bi_inline_vecs, pages), GFP_KERNEL);
	if (!op) {
		ret = -BCH_ERR_nopromote_enomem;
		goto err;
	}

	op->start_time = local_clock();
	op->pos = pos;

	/*
	 * We don't use the mempool here because extents that aren't
	 * checksummed or compressed can be too big for the mempool:
	 */
	*rbio = kzalloc(sizeof(struct bch_read_bio) +
			sizeof(struct bio_vec) * pages,
			GFP_KERNEL);
	if (!*rbio) {
		ret = -BCH_ERR_nopromote_enomem;
		goto err;
	}

	rbio_init(&(*rbio)->bio, opts);
	bio_init(&(*rbio)->bio, NULL, (*rbio)->bio.bi_inline_vecs, pages, 0);

	if (bch2_bio_alloc_pages(&(*rbio)->bio, sectors << 9, GFP_KERNEL)) {
		ret = -BCH_ERR_nopromote_enomem;
		goto err;
	}

	(*rbio)->bounce		= true;
	(*rbio)->split		= true;
	(*rbio)->kmalloc	= true;

	if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash,
					  bch_promote_params)) {
		ret = -BCH_ERR_nopromote_in_flight;
		goto err;
	}

	bio = &op->write.op.wbio.bio;
	bio_init(bio, NULL, bio->bi_inline_vecs, pages, 0);

	struct data_update_opts update_opts = {};

	if (!failed) {
		update_opts.target	= opts.promote_target;
		update_opts.extra_replicas = 1;
		update_opts.write_flags	= BCH_WRITE_ALLOC_NOWAIT|BCH_WRITE_CACHED;
	} else {
		update_opts.target	= opts.foreground_target;

		struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
		unsigned i = 0;

		bkey_for_each_ptr(ptrs, ptr) {
			if (bch2_dev_io_failures(failed, ptr->dev))
				update_opts.rewrite_ptrs |= BIT(i);
			i++;
		}
	}

	ret = bch2_data_update_init(trans, NULL, NULL, &op->write,
			writepoint_hashed((unsigned long) current),
			opts,
			update_opts,
			btree_id, k);
	/*
	 * possible errors: -BCH_ERR_nocow_lock_blocked,
	 * -BCH_ERR_ENOSPC_disk_reservation:
	 */
	if (ret) {
		BUG_ON(rhashtable_remove_fast(&c->promote_table, &op->hash,
					      bch_promote_params));
		goto err;
	}

	op->write.op.end_io = promote_done;

	return op;
err:
	if (*rbio)
		bio_free_pages(&(*rbio)->bio);
	kfree(*rbio);
	*rbio = NULL;
	/* We may have added to the rhashtable and thus need rcu freeing: */
	kfree_rcu(op, rcu);
	bch2_write_ref_put(c, BCH_WRITE_REF_promote);
	return ERR_PTR(ret);
}
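
/*
 * promote_alloc() decides how much to promote (the whole extent, or just the
 * portion being read), picks the promote position, and hands off to
 * __promote_alloc(); on success it tells the caller that the read must be
 * bounced and whether it must cover the full extent (*read_full).
 */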
static struct promote_op *promote_alloc(struct btree_trans *trans,
					struct bvec_iter iter,
					struct bkey_s_c k,
					struct extent_ptr_decoded *pick,
					struct bch_io_opts opts,
					unsigned flags,
					struct bch_read_bio **rbio,
					bool *bounce,
					bool *read_full,
					struct bch_io_failures *failed)
{
	struct bch_fs *c = trans->c;
	/*
	 * if failed != NULL we're not actually doing a promote, we're
	 * recovering from an io/checksum error
	 */
	bool promote_full = (failed ||
			     READ_ONCE(c->opts.promote_whole_extents));
	/* data might have to be decompressed in the write path: */
	unsigned sectors = promote_full
		? max(pick->crc.compressed_size, pick->crc.live_size)
		: bvec_iter_sectors(iter);
	struct bpos pos = promote_full
		? bkey_start_pos(k.k)
		: POS(k.k->p.inode, iter.bi_sector);
	struct promote_op *promote;
	int ret;

	ret = should_promote(c, k, pos, opts, flags, failed);
	if (ret)
		goto nopromote;

	promote = __promote_alloc(trans,
				  k.k->type == KEY_TYPE_reflink_v
				  ? BTREE_ID_reflink
				  : BTREE_ID_extents,
				  k, pos, pick, opts, sectors, rbio, failed);
	ret = PTR_ERR_OR_ZERO(promote);
	if (ret)
		goto nopromote;

	*bounce		= true;
	*read_full	= promote_full;
	return promote;
nopromote:
	trace_read_nopromote(c, ret);
	return NULL;
}
#define READ_RETRY_AVOID	1
#define READ_RETRY		2
#define READ_ERR		3

enum rbio_context {
	RBIO_CONTEXT_NULL,
	RBIO_CONTEXT_HIGHPRI,
	RBIO_CONTEXT_UNBOUND,
};

static inline struct bch_read_bio *
bch2_rbio_parent(struct bch_read_bio *rbio)
{
	return rbio->split ? rbio->parent : rbio;
}
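
/*
 * Run @fn for this rbio either inline, if the current context is already
 * unrestricted enough, or by punting it to @wq.
 */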
static void bch2_rbio_punt(struct bch_read_bio *rbio, work_func_t fn,
			   enum rbio_context context,
			   struct workqueue_struct *wq)
{
	if (context <= rbio->context) {
		fn(&rbio->work);
	} else {
		rbio->work.func		= fn;
		rbio->context		= context;
		queue_work(wq, &rbio->work);
	}
}

static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio)
{
	BUG_ON(rbio->bounce && !rbio->split);

	if (rbio->promote)
		promote_free(rbio->c, rbio->promote);
	rbio->promote = NULL;

	if (rbio->bounce)
		bch2_bio_free_pages_pool(rbio->c, &rbio->bio);

	if (rbio->split) {
		struct bch_read_bio *parent = rbio->parent;

		if (rbio->kmalloc)
			kfree(rbio);
		else
			bio_put(&rbio->bio);

		rbio = parent;
	}

	return rbio;
}
/*
 * Only called on a top level bch_read_bio to complete an entire read request,
 * not a split:
 */
static void bch2_rbio_done(struct bch_read_bio *rbio)
{
	if (rbio->start_time)
		bch2_time_stats_update(&rbio->c->times[BCH_TIME_data_read],
				       rbio->start_time);
	bio_endio(&rbio->bio);
}
static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio,
				     struct bvec_iter bvec_iter,
				     struct bch_io_failures *failed,
				     unsigned flags)
{
	struct btree_trans *trans = bch2_trans_get(c);
	struct btree_iter iter;
	struct bkey_buf sk;
	struct bkey_s_c k;
	int ret;

	flags &= ~BCH_READ_LAST_FRAGMENT;
	flags |= BCH_READ_MUST_CLONE;

	bch2_bkey_buf_init(&sk);

	bch2_trans_iter_init(trans, &iter, rbio->data_btree,
			     rbio->read_pos, BTREE_ITER_slots);
retry:
	bch2_trans_begin(trans);
	rbio->bio.bi_status = 0;

	ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter)));
	if (ret)
		goto err;

	bch2_bkey_buf_reassemble(&sk, c, k);
	k = bkey_i_to_s_c(sk.k);

	if (!bch2_bkey_matches_ptr(c, k,
				   rbio->pick.ptr,
				   rbio->data_pos.offset -
				   rbio->pick.crc.offset)) {
		/* extent we wanted to read no longer exists: */
		rbio->hole = true;
		goto out;
	}

	ret = __bch2_read_extent(trans, rbio, bvec_iter,
				 rbio->read_pos,
				 rbio->data_btree,
				 k, 0, failed, flags);
	if (ret == READ_RETRY)
		goto retry;
	if (ret)
		goto err;
out:
	bch2_rbio_done(rbio);
	bch2_trans_iter_exit(trans, &iter);
	bch2_trans_put(trans);
	bch2_bkey_buf_exit(&sk, c);
	return;
err:
	rbio->bio.bi_status = BLK_STS_IOERR;
	goto out;
}
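
/*
 * Retry work: re-walks the extent(s) covered by the original request, marking
 * the device that failed so a different replica is picked where possible.
 */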
static void bch2_rbio_retry(struct work_struct *work)
{
	struct bch_read_bio *rbio =
		container_of(work, struct bch_read_bio, work);
	struct bch_fs *c	= rbio->c;
	struct bvec_iter iter	= rbio->bvec_iter;
	unsigned flags		= rbio->flags;
	subvol_inum inum = {
		.subvol = rbio->subvol,
		.inum	= rbio->read_pos.inode,
	};
	struct bch_io_failures failed = { .nr = 0 };

	trace_and_count(c, read_retry, &rbio->bio);

	if (rbio->retry == READ_RETRY_AVOID)
		bch2_mark_io_failure(&failed, &rbio->pick);

	rbio->bio.bi_status = 0;

	rbio = bch2_rbio_free(rbio);

	flags |= BCH_READ_IN_RETRY;
	flags &= ~BCH_READ_MAY_PROMOTE;

	if (flags & BCH_READ_NODECODE) {
		bch2_read_retry_nodecode(c, rbio, iter, &failed, flags);
	} else {
		flags &= ~BCH_READ_LAST_FRAGMENT;
		flags |= BCH_READ_MUST_CLONE;

		__bch2_read(c, rbio, iter, inum, &failed, flags);
	}
}
static void bch2_rbio_error(struct bch_read_bio *rbio, int retry,
			    blk_status_t error)
{
	rbio->retry = retry;

	if (rbio->flags & BCH_READ_IN_RETRY)
		return;

	if (retry == READ_ERR) {
		rbio = bch2_rbio_free(rbio);

		rbio->bio.bi_status = error;
		bch2_rbio_done(rbio);
	} else {
		bch2_rbio_punt(rbio, bch2_rbio_retry,
			       RBIO_CONTEXT_UNBOUND, system_unbound_wq);
	}
}
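
/*
 * CRC narrowing: if we read an extent whose existing checksum covers more
 * data than the part that's actually live, recompute a checksum over just the
 * live portion and update the key, so future reads don't have to read and
 * checksum the extra data.
 */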
static int __bch2_rbio_narrow_crcs(struct btree_trans *trans,
				   struct bch_read_bio *rbio)
{
	struct bch_fs *c = rbio->c;
	u64 data_offset = rbio->data_pos.offset - rbio->pick.crc.offset;
	struct bch_extent_crc_unpacked new_crc;
	struct btree_iter iter;
	struct bkey_i *new;
	struct bkey_s_c k;
	int ret = 0;

	if (crc_is_compressed(rbio->pick.crc))
		return 0;

	k = bch2_bkey_get_iter(trans, &iter, rbio->data_btree, rbio->data_pos,
			       BTREE_ITER_slots|BTREE_ITER_intent);
	if ((ret = bkey_err(k)))
		goto out;

	if (bversion_cmp(k.k->bversion, rbio->version) ||
	    !bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, data_offset))
		goto out;

	/* Extent was merged? */
	if (bkey_start_offset(k.k) < data_offset ||
	    k.k->p.offset > data_offset + rbio->pick.crc.uncompressed_size)
		goto out;

	if (bch2_rechecksum_bio(c, &rbio->bio, rbio->version,
			rbio->pick.crc, NULL, &new_crc,
			bkey_start_offset(k.k) - data_offset, k.k->size,
			rbio->pick.crc.csum_type)) {
		bch_err(c, "error verifying existing checksum while narrowing checksum (memory corruption?)");
		ret = 0;
		goto out;
	}

	/*
	 * going to be temporarily appending another checksum entry:
	 */
	new = bch2_trans_kmalloc(trans, bkey_bytes(k.k) +
				 sizeof(struct bch_extent_crc128));
	if ((ret = PTR_ERR_OR_ZERO(new)))
		goto out;

	bkey_reassemble(new, k);

	if (!bch2_bkey_narrow_crcs(new, new_crc))
		goto out;

	ret = bch2_trans_update(trans, &iter, new,
				BTREE_UPDATE_internal_snapshot_node);
out:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}
static noinline void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio)
{
	bch2_trans_commit_do(rbio->c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
			     __bch2_rbio_narrow_crcs(trans, rbio));
}
/* Inner part that may run in process context */
static void __bch2_read_endio(struct work_struct *work)
{
	struct bch_read_bio *rbio =
		container_of(work, struct bch_read_bio, work);
	struct bch_fs *c	= rbio->c;
	struct bio *src		= &rbio->bio;
	struct bio *dst		= &bch2_rbio_parent(rbio)->bio;
	struct bvec_iter dst_iter = rbio->bvec_iter;
	struct bch_extent_crc_unpacked crc = rbio->pick.crc;
	struct nonce nonce = extent_nonce(rbio->version, crc);
	unsigned nofs_flags;
	struct bch_csum csum;
	int ret;

	nofs_flags = memalloc_nofs_save();

	/* Reset iterator for checksumming and copying bounced data: */
	if (rbio->bounce) {
		src->bi_iter.bi_size		= crc.compressed_size << 9;
		src->bi_iter.bi_idx		= 0;
		src->bi_iter.bi_bvec_done	= 0;
	} else {
		src->bi_iter			= rbio->bvec_iter;
	}

	csum = bch2_checksum_bio(c, crc.csum_type, nonce, src);
	if (bch2_crc_cmp(csum, rbio->pick.crc.csum) && !c->opts.no_data_io)
		goto csum_err;

	/*
	 * We need to rework the narrow_crcs path to deliver the read completion
	 * first, and then punt to a different workqueue, otherwise we're
	 * holding up reads while doing btree updates which is bad for memory
	 * reclaim.
	 */
	if (unlikely(rbio->narrow_crcs))
		bch2_rbio_narrow_crcs(rbio);

	if (rbio->flags & BCH_READ_NODECODE)
		goto nodecode;

	/* Adjust crc to point to subset of data we want: */
	crc.offset     += rbio->offset_into_extent;
	crc.live_size	= bvec_iter_sectors(rbio->bvec_iter);

	if (crc_is_compressed(crc)) {
		ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
		if (ret)
			goto decrypt_err;

		if (bch2_bio_uncompress(c, src, dst, dst_iter, crc) &&
		    !c->opts.no_data_io)
			goto decompression_err;
	} else {
		/* don't need to decrypt the entire bio: */
		nonce = nonce_add(nonce, crc.offset << 9);
		bio_advance(src, crc.offset << 9);

		BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size);
		src->bi_iter.bi_size = dst_iter.bi_size;

		ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
		if (ret)
			goto decrypt_err;

		if (rbio->bounce) {
			struct bvec_iter src_iter = src->bi_iter;

			bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
		}
	}

	if (rbio->promote) {
		/*
		 * Re encrypt data we decrypted, so it's consistent with
		 * rbio->crc:
		 */
		ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
		if (ret)
			goto decrypt_err;

		promote_start(rbio->promote, rbio);
		rbio->promote = NULL;
	}
nodecode:
	if (likely(!(rbio->flags & BCH_READ_IN_RETRY))) {
		rbio = bch2_rbio_free(rbio);
		bch2_rbio_done(rbio);
	}
out:
	memalloc_nofs_restore(nofs_flags);
	return;
csum_err:
	/*
	 * Checksum error: if the bio wasn't bounced, we may have been
	 * reading into buffers owned by userspace (that userspace can
	 * scribble over) - retry the read, bouncing it this time:
	 */
	if (!rbio->bounce && (rbio->flags & BCH_READ_USER_MAPPED)) {
		rbio->flags |= BCH_READ_MUST_BOUNCE;
		bch2_rbio_error(rbio, READ_RETRY, BLK_STS_IOERR);
		goto out;
	}

	struct printbuf buf = PRINTBUF;

	prt_str(&buf, "data ");
	bch2_csum_err_msg(&buf, crc.csum_type, rbio->pick.crc.csum, csum);

	struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
	if (ca) {
		bch_err_inum_offset_ratelimited(ca,
			rbio->read_pos.inode,
			rbio->read_pos.offset << 9,
			"%s", buf.buf);
		bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);
	}
	printbuf_exit(&buf);
	bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
	goto out;
decompression_err:
	bch_err_inum_offset_ratelimited(c, rbio->read_pos.inode,
					rbio->read_pos.offset << 9,
					"decompression error");
	bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR);
	goto out;
decrypt_err:
	bch_err_inum_offset_ratelimited(c, rbio->read_pos.inode,
					rbio->read_pos.offset << 9,
					"decrypt error");
	bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR);
	goto out;
}
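
/*
 * bio completion handler for the data read: accounts latency, handles device
 * errors and stale pointer races, then punts the rest of completion
 * (checksum/decrypt/decompress) to an appropriate context.
 */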
static void bch2_read_endio(struct bio *bio)
{
	struct bch_read_bio *rbio =
		container_of(bio, struct bch_read_bio, bio);
	struct bch_fs *c	= rbio->c;
	struct bch_dev *ca	= rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
	struct workqueue_struct *wq = NULL;
	enum rbio_context context = RBIO_CONTEXT_NULL;

	if (rbio->have_ioref) {
		bch2_latency_acct(ca, rbio->submit_time, READ);
		percpu_ref_put(&ca->io_ref);
	}

	if (!rbio->split)
		rbio->bio.bi_end_io = rbio->end_io;

	if (bio->bi_status) {
		if (ca) {
			bch_err_inum_offset_ratelimited(ca,
				rbio->read_pos.inode,
				rbio->read_pos.offset,
				"data read error: %s",
				bch2_blk_status_to_str(bio->bi_status));
			bch2_io_error(ca, BCH_MEMBER_ERROR_read);
		}
		bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status);
		return;
	}

	if (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) ||
	    (ca && dev_ptr_stale(ca, &rbio->pick.ptr))) {
		trace_and_count(c, read_reuse_race, &rbio->bio);

		if (rbio->flags & BCH_READ_RETRY_IF_STALE)
			bch2_rbio_error(rbio, READ_RETRY, BLK_STS_AGAIN);
		else
			bch2_rbio_error(rbio, READ_ERR, BLK_STS_AGAIN);
		return;
	}

	if (rbio->narrow_crcs ||
	    crc_is_compressed(rbio->pick.crc) ||
	    bch2_csum_type_is_encryption(rbio->pick.crc.csum_type))
		context = RBIO_CONTEXT_UNBOUND, wq = system_unbound_wq;
	else if (rbio->pick.crc.csum_type)
		context = RBIO_CONTEXT_HIGHPRI, wq = system_highpri_wq;

	bch2_rbio_punt(rbio, __bch2_read_endio, context, wq);
}
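
/*
 * Resolve a reflink pointer: look up the indirect extent it points to in the
 * reflink btree and swap it into @orig_k, adjusting *offset_into_extent to be
 * relative to the indirect extent.
 */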
int __bch2_read_indirect_extent(struct btree_trans *trans,
				unsigned *offset_into_extent,
				struct bkey_buf *orig_k)
{
	struct btree_iter iter;
	struct bkey_s_c k;
	u64 reflink_offset;
	int ret;

	reflink_offset = le64_to_cpu(bkey_i_to_reflink_p(orig_k->k)->v.idx) +
		*offset_into_extent;

	k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_reflink,
			       POS(0, reflink_offset), 0);
	ret = bkey_err(k);
	if (ret)
		goto err;

	if (k.k->type != KEY_TYPE_reflink_v &&
	    k.k->type != KEY_TYPE_indirect_inline_data) {
		bch_err_inum_offset_ratelimited(trans->c,
			orig_k->k->k.p.inode,
			orig_k->k->k.p.offset << 9,
			"%llu len %u points to nonexistent indirect extent %llu",
			orig_k->k->k.p.offset,
			orig_k->k->k.size,
			reflink_offset);
		bch2_inconsistent_error(trans->c);
		ret = -BCH_ERR_missing_indirect_extent;
		goto err;
	}

	*offset_into_extent = iter.pos.offset - bkey_start_offset(k.k);
	bch2_bkey_buf_reassemble(orig_k, trans->c, k);
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}
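
/*
 * Error reporting for reads that raced with the bucket being reused: dump the
 * extent, the alloc key and the in-memory bucket gen so the inconsistency can
 * be diagnosed.
 */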
static noinline void read_from_stale_dirty_pointer(struct btree_trans *trans,
						   struct bch_dev *ca,
						   struct bkey_s_c k,
						   struct bch_extent_ptr ptr)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct printbuf buf = PRINTBUF;
	int ret;

	bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
			     PTR_BUCKET_POS(ca, &ptr),
			     BTREE_ITER_cached);

	int gen = bucket_gen_get(ca, iter.pos.offset);
	if (gen >= 0) {
		prt_printf(&buf, "Attempting to read from stale dirty pointer:\n");
		printbuf_indent_add(&buf, 2);

		bch2_bkey_val_to_text(&buf, c, k);
		prt_newline(&buf);

		prt_printf(&buf, "memory gen: %u", gen);

		ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter)));
		if (!ret) {
			prt_newline(&buf);
			bch2_bkey_val_to_text(&buf, c, k);
		}
	} else {
		prt_printf(&buf, "Attempting to read from invalid bucket %llu:%llu:\n",
			   iter.pos.inode, iter.pos.offset);
		printbuf_indent_add(&buf, 2);

		prt_printf(&buf, "first bucket %u nbuckets %llu\n",
			   ca->mi.first_bucket, ca->mi.nbuckets);

		bch2_bkey_val_to_text(&buf, c, k);
		prt_newline(&buf);
	}

	bch2_fs_inconsistent(c, "%s", buf.buf);

	bch2_trans_iter_exit(trans, &iter);
	printbuf_exit(&buf);
}
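
/*
 * Read a single extent (or a fragment of one): pick a replica, decide whether
 * the read needs to be bounced/cloned and whether to kick off a promote, then
 * submit the IO (or zero-fill holes and copy out inline data).
 */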
int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
		       struct bvec_iter iter, struct bpos read_pos,
		       enum btree_id data_btree, struct bkey_s_c k,
		       unsigned offset_into_extent,
		       struct bch_io_failures *failed, unsigned flags)
{
	struct bch_fs *c = trans->c;
	struct extent_ptr_decoded pick;
	struct bch_read_bio *rbio = NULL;
	struct promote_op *promote = NULL;
	bool bounce = false, read_full = false, narrow_crcs = false;
	struct bpos data_pos = bkey_start_pos(k.k);
	int pick_ret;

	if (bkey_extent_is_inline_data(k.k)) {
		unsigned bytes = min_t(unsigned, iter.bi_size,
				       bkey_inline_data_bytes(k.k));

		swap(iter.bi_size, bytes);
		memcpy_to_bio(&orig->bio, iter, bkey_inline_data_p(k));
		swap(iter.bi_size, bytes);
		bio_advance_iter(&orig->bio, &iter, bytes);
		zero_fill_bio_iter(&orig->bio, iter);
		goto out_read_done;
	}
retry_pick:
	pick_ret = bch2_bkey_pick_read_device(c, k, failed, &pick);

	/* hole or reservation - just zero fill: */
	if (!pick_ret)
		goto hole;

	if (unlikely(pick_ret < 0)) {
		struct printbuf buf = PRINTBUF;
		bch2_bkey_val_to_text(&buf, c, k);

		bch_err_inum_offset_ratelimited(c,
				read_pos.inode, read_pos.offset << 9,
				"no device to read from: %s\n %s",
				bch2_err_str(pick_ret),
				buf.buf);
		printbuf_exit(&buf);
		goto err;
	}

	struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ);

	/*
	 * Stale dirty pointers are treated as IO errors, but @failed isn't
	 * allocated unless we're in the retry path - so if we're not in the
	 * retry path, don't check here, it'll be caught in bch2_read_endio()
	 * and we'll end up in the retry path:
	 */
	if ((flags & BCH_READ_IN_RETRY) &&
	    !pick.ptr.cached &&
	    ca &&
	    unlikely(dev_ptr_stale(ca, &pick.ptr))) {
		read_from_stale_dirty_pointer(trans, ca, k, pick.ptr);
		bch2_mark_io_failure(failed, &pick);
		percpu_ref_put(&ca->io_ref);
		goto retry_pick;
	}

	/*
	 * Unlock the iterator while the btree node's lock is still in
	 * cache, before doing the IO:
	 */
	bch2_trans_unlock(trans);

	if (flags & BCH_READ_NODECODE) {
		/*
		 * can happen if we retry, and the extent we were going to read
		 * has been merged in the meantime:
		 */
		if (pick.crc.compressed_size > orig->bio.bi_vcnt * PAGE_SECTORS) {
			if (ca)
				percpu_ref_put(&ca->io_ref);
			goto hole;
		}

		iter.bi_size	= pick.crc.compressed_size << 9;
		goto get_bio;
	}

	if (!(flags & BCH_READ_LAST_FRAGMENT) ||
	    bio_flagged(&orig->bio, BIO_CHAIN))
		flags |= BCH_READ_MUST_CLONE;

	narrow_crcs = !(flags & BCH_READ_IN_RETRY) &&
		bch2_can_narrow_extent_crcs(k, pick.crc);

	if (narrow_crcs && (flags & BCH_READ_USER_MAPPED))
		flags |= BCH_READ_MUST_BOUNCE;

	EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size);

	if (crc_is_compressed(pick.crc) ||
	    (pick.crc.csum_type != BCH_CSUM_none &&
	     (bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
	      (bch2_csum_type_is_encryption(pick.crc.csum_type) &&
	       (flags & BCH_READ_USER_MAPPED)) ||
	      (flags & BCH_READ_MUST_BOUNCE)))) {
		read_full = true;
		bounce = true;
	}

	if (orig->opts.promote_target)// || failed)
		promote = promote_alloc(trans, iter, k, &pick, orig->opts, flags,
					&rbio, &bounce, &read_full, failed);

	if (!read_full) {
		EBUG_ON(crc_is_compressed(pick.crc));
		EBUG_ON(pick.crc.csum_type &&
			(bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
			 bvec_iter_sectors(iter) != pick.crc.live_size ||
			 offset_into_extent));

		data_pos.offset += offset_into_extent;
		pick.ptr.offset += pick.crc.offset +
			offset_into_extent;
		offset_into_extent		= 0;
		pick.crc.compressed_size	= bvec_iter_sectors(iter);
		pick.crc.uncompressed_size	= bvec_iter_sectors(iter);
		pick.crc.offset			= 0;
		pick.crc.live_size		= bvec_iter_sectors(iter);
	}
get_bio:
	if (rbio) {
		/*
		 * promote already allocated bounce rbio:
		 * promote needs to allocate a bio big enough for uncompressing
		 * data in the write path, but we're not going to use it all
		 * here:
		 */
		EBUG_ON(rbio->bio.bi_iter.bi_size <
			pick.crc.compressed_size << 9);
		rbio->bio.bi_iter.bi_size =
			pick.crc.compressed_size << 9;
	} else if (bounce) {
		unsigned sectors = pick.crc.compressed_size;

		rbio = rbio_init(bio_alloc_bioset(NULL,
						  DIV_ROUND_UP(sectors, PAGE_SECTORS),
						  0,
						  GFP_NOFS,
						  &c->bio_read_split),
				 orig->opts);

		bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9);
		rbio->bounce	= true;
		rbio->split	= true;
	} else if (flags & BCH_READ_MUST_CLONE) {
		/*
		 * Have to clone if there were any splits, due to error
		 * reporting issues (if a split errored, and retrying didn't
		 * work, when it reports the error to its parent (us) we don't
		 * know if the error was from our bio, and we should retry, or
		 * from the whole bio, in which case we don't want to retry and
		 * lose the error)
		 */
		rbio = rbio_init(bio_alloc_clone(NULL, &orig->bio, GFP_NOFS,
						 &c->bio_read_split),
				 orig->opts);
		rbio->bio.bi_iter = iter;
		rbio->split	= true;
	} else {
		rbio = orig;
		rbio->bio.bi_iter = iter;
		EBUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN));
	}

	EBUG_ON(bio_sectors(&rbio->bio) != pick.crc.compressed_size);

	rbio->c			= c;
	rbio->submit_time	= local_clock();
	if (rbio->split)
		rbio->parent	= orig;
	else
		rbio->end_io	= orig->bio.bi_end_io;
	rbio->bvec_iter		= iter;
	rbio->offset_into_extent= offset_into_extent;
	rbio->flags		= flags;
	rbio->have_ioref	= ca != NULL;
	rbio->narrow_crcs	= narrow_crcs;
	rbio->hole		= 0;
	rbio->retry		= 0;
	rbio->context		= 0;
	/* XXX: only initialize this if needed */
	rbio->devs_have		= bch2_bkey_devs(k);
	rbio->pick		= pick;
	rbio->subvol		= orig->subvol;
	rbio->read_pos		= read_pos;
	rbio->data_btree	= data_btree;
	rbio->data_pos		= data_pos;
	rbio->version		= k.k->bversion;
	rbio->promote		= promote;
	INIT_WORK(&rbio->work, NULL);

	if (flags & BCH_READ_NODECODE)
		orig->pick = pick;

	rbio->bio.bi_opf	= orig->bio.bi_opf;
	rbio->bio.bi_iter.bi_sector = pick.ptr.offset;
	rbio->bio.bi_end_io	= bch2_read_endio;

	if (rbio->bounce)
		trace_and_count(c, read_bounce, &rbio->bio);

	this_cpu_add(c->counters[BCH_COUNTER_io_read], bio_sectors(&rbio->bio));
	bch2_increment_clock(c, bio_sectors(&rbio->bio), READ);

	/*
	 * If it's being moved internally, we don't want to flag it as a cache
	 * hit:
	 */
	if (ca && pick.ptr.cached && !(flags & BCH_READ_NODECODE))
		bch2_bucket_io_time_reset(trans, pick.ptr.dev,
			PTR_BUCKET_NR(ca, &pick.ptr), READ);

	if (!(flags & (BCH_READ_IN_RETRY|BCH_READ_LAST_FRAGMENT))) {
		bio_inc_remaining(&orig->bio);
		trace_and_count(c, read_split, &orig->bio);
	}

	if (!rbio->pick.idx) {
		if (!rbio->have_ioref) {
			bch_err_inum_offset_ratelimited(c,
					read_pos.inode,
					read_pos.offset << 9,
					"no device to read from");
			bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
			goto out;
		}

		this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_user],
			     bio_sectors(&rbio->bio));
		bio_set_dev(&rbio->bio, ca->disk_sb.bdev);

		if (unlikely(c->opts.no_data_io)) {
			if (likely(!(flags & BCH_READ_IN_RETRY)))
				bio_endio(&rbio->bio);
		} else {
			if (likely(!(flags & BCH_READ_IN_RETRY)))
				submit_bio(&rbio->bio);
			else
				submit_bio_wait(&rbio->bio);
		}

		/*
		 * We just submitted IO which may block, we expect relock fail
		 * events and shouldn't count them:
		 */
		trans->notrace_relock_fail = true;
	} else {
		/* Attempting reconstruct read: */
		if (bch2_ec_read_extent(trans, rbio, k)) {
			bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
			goto out;
		}

		if (likely(!(flags & BCH_READ_IN_RETRY)))
			bio_endio(&rbio->bio);
	}
out:
	if (likely(!(flags & BCH_READ_IN_RETRY))) {
		return 0;
	} else {
		int ret;

		rbio->context = RBIO_CONTEXT_UNBOUND;
		bch2_read_endio(&rbio->bio);

		ret = rbio->retry;
		rbio = bch2_rbio_free(rbio);

		if (ret == READ_RETRY_AVOID) {
			bch2_mark_io_failure(failed, &pick);
			ret = READ_RETRY;
		}

		if (!ret)
			goto out_read_done;

		return ret;
	}

err:
	if (flags & BCH_READ_IN_RETRY)
		return READ_ERR;

	orig->bio.bi_status = BLK_STS_IOERR;
	goto out_read_done;

hole:
	/*
	 * won't normally happen in the BCH_READ_NODECODE
	 * (bch2_move_extent()) path, but if we retry and the extent we wanted
	 * to read no longer exists we have to signal that:
	 */
	if (flags & BCH_READ_NODECODE)
		orig->hole = true;

	zero_fill_bio_iter(&orig->bio, iter);
out_read_done:
	if (flags & BCH_READ_LAST_FRAGMENT)
		bch2_rbio_done(orig);

	return 0;
}
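
/*
 * Top level read path: walk the extents btree (resolving indirect extents as
 * needed) and issue a read for each extent overlapping the request.
 */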
void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
		 struct bvec_iter bvec_iter, subvol_inum inum,
		 struct bch_io_failures *failed, unsigned flags)
{
	struct btree_trans *trans = bch2_trans_get(c);
	struct btree_iter iter;
	struct bkey_buf sk;
	struct bkey_s_c k;
	u32 snapshot;
	int ret;

	BUG_ON(flags & BCH_READ_NODECODE);

	bch2_bkey_buf_init(&sk);
	bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
			     POS(inum.inum, bvec_iter.bi_sector),
			     BTREE_ITER_slots);

	while (1) {
		unsigned bytes, sectors, offset_into_extent;
		enum btree_id data_btree = BTREE_ID_extents;

		bch2_trans_begin(trans);

		ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
		if (ret)
			goto err;

		bch2_btree_iter_set_snapshot(&iter, snapshot);

		bch2_btree_iter_set_pos(&iter,
				POS(inum.inum, bvec_iter.bi_sector));

		k = bch2_btree_iter_peek_slot(&iter);
		ret = bkey_err(k);
		if (ret)
			goto err;

		offset_into_extent = iter.pos.offset -
			bkey_start_offset(k.k);
		sectors = k.k->size - offset_into_extent;

		bch2_bkey_buf_reassemble(&sk, c, k);

		ret = bch2_read_indirect_extent(trans, &data_btree,
					&offset_into_extent, &sk);
		if (ret)
			goto err;

		k = bkey_i_to_s_c(sk.k);

		/*
		 * With indirect extents, the amount of data to read is the min
		 * of the original extent and the indirect extent:
		 */
		sectors = min(sectors, k.k->size - offset_into_extent);

		bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9;
		swap(bvec_iter.bi_size, bytes);

		if (bvec_iter.bi_size == bytes)
			flags |= BCH_READ_LAST_FRAGMENT;

		ret = __bch2_read_extent(trans, rbio, bvec_iter, iter.pos,
					 data_btree, k,
					 offset_into_extent, failed, flags);
		if (ret)
			goto err;

		if (flags & BCH_READ_LAST_FRAGMENT)
			break;

		swap(bvec_iter.bi_size, bytes);
		bio_advance_iter(&rbio->bio, &bvec_iter, bytes);
err:
		if (ret &&
		    !bch2_err_matches(ret, BCH_ERR_transaction_restart) &&
		    ret != READ_RETRY &&
		    ret != READ_RETRY_AVOID)
			break;
	}

	bch2_trans_iter_exit(trans, &iter);
	bch2_trans_put(trans);
	bch2_bkey_buf_exit(&sk, c);

	if (ret) {
		bch_err_inum_offset_ratelimited(c, inum.inum,
						bvec_iter.bi_sector << 9,
						"read error %i from btree lookup", ret);
		rbio->bio.bi_status = BLK_STS_IOERR;
		bch2_rbio_done(rbio);
	}
}
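
/* Filesystem init/teardown for the read path: biosets and the promote table. */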
void bch2_fs_io_read_exit(struct bch_fs *c)
{
	if (c->promote_table.tbl)
		rhashtable_destroy(&c->promote_table);
	bioset_exit(&c->bio_read_split);
	bioset_exit(&c->bio_read);
}

int bch2_fs_io_read_init(struct bch_fs *c)
{
	if (bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio),
			BIOSET_NEED_BVECS))
		return -BCH_ERR_ENOMEM_bio_read_init;

	if (bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio),
			BIOSET_NEED_BVECS))
		return -BCH_ERR_ENOMEM_bio_read_split_init;

	if (rhashtable_init(&c->promote_table, &bch_promote_params))
		return -BCH_ERR_ENOMEM_promote_table_init;

	return 0;
}