// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
 * Copyright 2012 Google, Inc.
 */

#include "alloc_foreground.h"
#include "btree_update.h"
#include "extent_update.h"
#include "nocow_locking.h"
#include "rebalance.h"
#include "subvolume.h"

#include <linux/blkdev.h>
#include <linux/prefetch.h>
#include <linux/random.h>
#include <linux/sched/mm.h>
#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT

static inline void bch2_congested_acct(struct bch_dev *ca, u64 io_latency,
                                       u64 now, int rw)
{
        u64 latency_capable =
                ca->io_latency[rw].quantiles.entries[QUANTILE_IDX(1)].m;
        /* ideally we'd be taking into account the device's variance here: */
        u64 latency_threshold = latency_capable << (rw == READ ? 2 : 3);
        s64 latency_over = io_latency - latency_threshold;

        if (latency_threshold && latency_over > 0) {
                /*
                 * bump up congested by approximately latency_over * 4 /
                 * latency_threshold - we don't need much accuracy here so don't
                 * bother with the divide:
                 */
                if (atomic_read(&ca->congested) < CONGESTED_MAX)
                        atomic_add(latency_over >>
                                   max_t(int, ilog2(latency_threshold) - 2, 0),
                                   &ca->congested);

                ca->congested_last = now;
        } else if (atomic_read(&ca->congested) > 0) {
                atomic_dec(&ca->congested);
        }
}
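/*
 * Illustrative numbers for the shift above (not from the source): with a
 * latency_threshold of 4194304 ns (~4.2 ms, a power of two) and latency_over
 * also 4194304 ns, the shift is ilog2(4194304) - 2 = 20, so the bump is
 * 4194304 >> 20 = 4 - i.e. roughly latency_over * 4 / latency_threshold, as
 * the comment says, without doing a divide.
 */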
void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw)
{
        atomic64_t *latency = &ca->cur_latency[rw];
        u64 now = local_clock();
        u64 io_latency = time_after64(now, submit_time)
                ? now - submit_time
                : 0;
        u64 old, new;

        old = atomic64_read(latency);
        do {
                /*
                 * If the io latency was reasonably close to the current
                 * latency, skip doing the update and atomic operation - most of
                 * the time:
                 */
                if (abs((int) (old - io_latency)) < (old >> 1) &&
                    now & ~(~0U << 5))
                        break;

                new = ewma_add(old, io_latency, 5);
        } while (!atomic64_try_cmpxchg(latency, &old, new));

        bch2_congested_acct(ca, io_latency, now, rw);

        __bch2_time_stats_update(&ca->io_latency[rw].stats, submit_time, now);
}

#endif
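/*
 * ewma_add(old, io_latency, 5) above is an exponentially weighted moving
 * average with weight 2^-5: roughly new = old + (io_latency - old) / 32,
 * so one slow IO nudges the tracked latency rather than replacing it.
 */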
/* Allocate, free from mempool: */

void bch2_bio_free_pages_pool(struct bch_fs *c, struct bio *bio)
{
        struct bvec_iter_all iter;
        struct bio_vec *bv;

        bio_for_each_segment_all(bv, bio, iter)
                if (bv->bv_page != ZERO_PAGE(0))
                        mempool_free(bv->bv_page, &c->bio_bounce_pages);
        bio->bi_vcnt = 0;
}
static struct page *__bio_alloc_page_pool(struct bch_fs *c, bool *using_mempool)
{
        struct page *page;

        if (likely(!*using_mempool)) {
                page = alloc_page(GFP_NOFS);
                if (unlikely(!page)) {
                        mutex_lock(&c->bio_bounce_pages_lock);
                        *using_mempool = true;
                        goto pool_alloc;

                }
        } else {
pool_alloc:
                page = mempool_alloc(&c->bio_bounce_pages, GFP_NOFS);
        }

        return page;
}
void bch2_bio_alloc_pages_pool(struct bch_fs *c, struct bio *bio,
                               size_t size)
{
        bool using_mempool = false;

        while (size) {
                struct page *page = __bio_alloc_page_pool(c, &using_mempool);
                unsigned len = min_t(size_t, PAGE_SIZE, size);

                BUG_ON(!bio_add_page(bio, page, len, 0));
                size -= len;
        }

        if (using_mempool)
                mutex_unlock(&c->bio_bounce_pages_lock);
}
/* Extent update path: */

int bch2_sum_sector_overwrites(struct btree_trans *trans,
                               struct btree_iter *extent_iter,
                               struct bkey_i *new,
                               bool *usage_increasing,
                               s64 *i_sectors_delta,
                               s64 *disk_sectors_delta)
{
        struct bch_fs *c = trans->c;
        struct btree_iter iter;
        struct bkey_s_c old;
        unsigned new_replicas = bch2_bkey_replicas(c, bkey_i_to_s_c(new));
        bool new_compressed = bch2_bkey_sectors_compressed(bkey_i_to_s_c(new));
        int ret = 0;

        *usage_increasing       = false;
        *i_sectors_delta        = 0;
        *disk_sectors_delta     = 0;

        bch2_trans_copy_iter(&iter, extent_iter);

        for_each_btree_key_upto_continue_norestart(iter,
                                new->k.p, BTREE_ITER_slots, old, ret) {
                s64 sectors = min(new->k.p.offset, old.k->p.offset) -
                        max(bkey_start_offset(&new->k),
                            bkey_start_offset(old.k));

                *i_sectors_delta += sectors *
                        (bkey_extent_is_allocation(&new->k) -
                         bkey_extent_is_allocation(old.k));

                *disk_sectors_delta += sectors * bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(new));
                *disk_sectors_delta -= new->k.p.snapshot == old.k->p.snapshot
                        ? sectors * bch2_bkey_nr_ptrs_fully_allocated(old)
                        : 0;

                if (!*usage_increasing &&
                    (new->k.p.snapshot != old.k->p.snapshot ||
                     new_replicas > bch2_bkey_replicas(c, old) ||
                     (!new_compressed && bch2_bkey_sectors_compressed(old))))
                        *usage_increasing = true;

                if (bkey_ge(old.k->p, new->k.p))
                        break;
        }

        bch2_trans_iter_exit(trans, &iter);
        return ret;
}
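/*
 * Overlap arithmetic above, e.g.: if the new extent covers sectors [8, 24)
 * and an existing extent covers [0, 16), the overlap is
 * min(24, 16) - max(8, 0) = 8 sectors; the i_sectors and disk_sectors deltas
 * then scale that overlap by how each key is allocated and replicated.
 */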
static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans,
                                                    struct btree_iter *extent_iter,
                                                    u64 new_i_size,
                                                    s64 i_sectors_delta)
{
        /*
         * Crazy performance optimization:
         * Every extent update needs to also update the inode: the inode trigger
         * will set bi->journal_seq to the journal sequence number of this
         * transaction - for fsync.
         *
         * But if that's the only reason we're updating the inode (we're not
         * updating bi_size or bi_sectors), then we don't need the inode update
         * to be journalled - if we crash, the bi_journal_seq update will be
         * lost, but that's fine.
         */
        unsigned inode_update_flags = BTREE_UPDATE_nojournal;

        struct btree_iter iter;
        struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
                              SPOS(0,
                                   extent_iter->pos.inode,
                                   extent_iter->snapshot),
                              BTREE_ITER_intent|BTREE_ITER_cached);
        int ret = bkey_err(k);
        if (unlikely(ret))
                return ret;

        /*
         * varint_decode_fast(), in the inode .invalid method, reads up to 7
         * bytes past the end of the buffer:
         */
        struct bkey_i *k_mut = bch2_trans_kmalloc_nomemzero(trans, bkey_bytes(k.k) + 8);
        ret = PTR_ERR_OR_ZERO(k_mut);
        if (unlikely(ret))
                goto err;

        bkey_reassemble(k_mut, k);

        if (unlikely(k_mut->k.type != KEY_TYPE_inode_v3)) {
                k_mut = bch2_inode_to_v3(trans, k_mut);
                ret = PTR_ERR_OR_ZERO(k_mut);
                if (unlikely(ret))
                        goto err;
        }

        struct bkey_i_inode_v3 *inode = bkey_i_to_inode_v3(k_mut);

        if (!(le64_to_cpu(inode->v.bi_flags) & BCH_INODE_i_size_dirty) &&
            new_i_size > le64_to_cpu(inode->v.bi_size)) {
                inode->v.bi_size = cpu_to_le64(new_i_size);
                inode_update_flags = 0;
        }

        if (i_sectors_delta) {
                le64_add_cpu(&inode->v.bi_sectors, i_sectors_delta);
                inode_update_flags = 0;
        }

        if (inode->k.p.snapshot != iter.snapshot) {
                inode->k.p.snapshot = iter.snapshot;
                inode_update_flags = 0;
        }

        ret = bch2_trans_update(trans, &iter, &inode->k_i,
                                BTREE_UPDATE_internal_snapshot_node|
                                inode_update_flags);
err:
        bch2_trans_iter_exit(trans, &iter);
        return ret;
}
int bch2_extent_update(struct btree_trans *trans,
                       subvol_inum inum,
                       struct btree_iter *iter,
                       struct bkey_i *k,
                       struct disk_reservation *disk_res,
                       u64 new_i_size,
                       s64 *i_sectors_delta_total,
                       bool check_enospc)
{
        struct bpos next_pos;
        bool usage_increasing;
        s64 i_sectors_delta = 0, disk_sectors_delta = 0;
        int ret;

        /*
         * This traverses us the iterator without changing iter->path->pos to
         * search_key() (which is pos + 1 for extents): we want there to be a
         * path already traversed at iter->pos because
         * bch2_trans_extent_update() will use it to attempt extent merging
         */
        ret = __bch2_btree_iter_traverse(iter);
        if (ret)
                return ret;

        ret = bch2_extent_trim_atomic(trans, iter, k);
        if (ret)
                return ret;

        next_pos = k->k.p;

        ret = bch2_sum_sector_overwrites(trans, iter, k,
                        &usage_increasing,
                        &i_sectors_delta,
                        &disk_sectors_delta);
        if (ret)
                return ret;

        if (disk_res &&
            disk_sectors_delta > (s64) disk_res->sectors) {
                ret = bch2_disk_reservation_add(trans->c, disk_res,
                                        disk_sectors_delta - disk_res->sectors,
                                        !check_enospc || !usage_increasing
                                        ? BCH_DISK_RESERVATION_NOFAIL : 0);
                if (ret)
                        return ret;
        }

        /*
         * We always have to do an inode update - even when i_size/i_sectors
         * aren't changing - for fsync to work properly; fsync relies on
         * inode->bi_journal_seq which is updated by the trigger code:
         */
        ret =   bch2_extent_update_i_size_sectors(trans, iter,
                                                  min(k->k.p.offset << 9, new_i_size),
                                                  i_sectors_delta) ?:
                bch2_trans_update(trans, iter, k, 0) ?:
                bch2_trans_commit(trans, disk_res, NULL,
                                  BCH_TRANS_COMMIT_no_check_rw|
                                  BCH_TRANS_COMMIT_no_enospc);
        if (unlikely(ret))
                return ret;

        if (i_sectors_delta_total)
                *i_sectors_delta_total += i_sectors_delta;
        bch2_btree_iter_set_pos(iter, next_pos);
        return 0;
}
static int bch2_write_index_default(struct bch_write_op *op)
{
        struct bch_fs *c = op->c;
        struct bkey_buf sk;
        struct keylist *keys = &op->insert_keys;
        struct bkey_i *k = bch2_keylist_front(keys);
        struct btree_trans *trans = bch2_trans_get(c);
        struct btree_iter iter;
        subvol_inum inum = {
                .subvol = op->subvol,
                .inum   = k->k.p.inode,
        };
        int ret;

        BUG_ON(!inum.subvol);

        bch2_bkey_buf_init(&sk);

        do {
                bch2_trans_begin(trans);

                k = bch2_keylist_front(keys);
                bch2_bkey_buf_copy(&sk, c, k);

                ret = bch2_subvolume_get_snapshot(trans, inum.subvol,
                                                  &sk.k->k.p.snapshot);
                if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                        continue;
                if (ret)
                        break;

                bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
                                     bkey_start_pos(&sk.k->k),
                                     BTREE_ITER_slots|BTREE_ITER_intent);

                ret =   bch2_bkey_set_needs_rebalance(c, sk.k, &op->opts) ?:
                        bch2_extent_update(trans, inum, &iter, sk.k,
                                        &op->res,
                                        op->new_i_size, &op->i_sectors_delta,
                                        op->flags & BCH_WRITE_CHECK_ENOSPC);
                bch2_trans_iter_exit(trans, &iter);

                if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                        continue;
                if (ret)
                        break;

                if (bkey_ge(iter.pos, k->k.p))
                        bch2_keylist_pop_front(&op->insert_keys);
                else
                        bch2_cut_front(iter.pos, k);
        } while (!bch2_keylist_empty(keys));

        bch2_trans_put(trans);
        bch2_bkey_buf_exit(&sk, c);

        return ret;
}
void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
                               enum bch_data_type type,
                               const struct bkey_i *k,
                               bool nocow)
{
        struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(k));
        struct bch_write_bio *n;

        BUG_ON(c->opts.nochanges);

        bkey_for_each_ptr(ptrs, ptr) {
                struct bch_dev *ca = nocow
                        ? bch2_dev_have_ref(c, ptr->dev)
                        : bch2_dev_get_ioref(c, ptr->dev, type == BCH_DATA_btree ? READ : WRITE);

                if (to_entry(ptr + 1) < ptrs.end) {
                        n = to_wbio(bio_alloc_clone(NULL, &wbio->bio, GFP_NOFS, &c->replica_set));

                        n->bio.bi_end_io        = wbio->bio.bi_end_io;
                        n->bio.bi_private       = wbio->bio.bi_private;
                        n->parent               = wbio;
                        n->split                = true;
                        n->bounce               = false;
                        n->put_bio              = true;
                        n->bio.bi_opf           = wbio->bio.bi_opf;
                        bio_inc_remaining(&wbio->bio);
                } else {
                        n = wbio;
                        n->split                = false;
                }

                n->c                    = c;
                n->dev                  = ptr->dev;
                n->have_ioref           = ca != NULL;
                n->nocow                = nocow;
                n->submit_time          = local_clock();
                n->inode_offset         = bkey_start_offset(&k->k);
                if (nocow)
                        n->nocow_bucket = PTR_BUCKET_NR(ca, ptr);
                n->bio.bi_iter.bi_sector = ptr->offset;

                if (likely(n->have_ioref)) {
                        this_cpu_add(ca->io_done->sectors[WRITE][type],
                                     bio_sectors(&n->bio));

                        bio_set_dev(&n->bio, ca->disk_sb.bdev);

                        if (type != BCH_DATA_btree && unlikely(c->opts.no_data_io)) {
                                bio_endio(&n->bio);
                                continue;
                        }

                        submit_bio(&n->bio);
                } else {
                        n->bio.bi_status        = BLK_STS_REMOVED;
                        bio_endio(&n->bio);
                }
        }
}
static void __bch2_write(struct bch_write_op *);

static void bch2_write_done(struct closure *cl)
{
        struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
        struct bch_fs *c = op->c;

        EBUG_ON(op->open_buckets.nr);

        bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time);
        bch2_disk_reservation_put(c, &op->res);

        if (!(op->flags & BCH_WRITE_MOVE))
                bch2_write_ref_put(c, BCH_WRITE_REF_write);
        bch2_keylist_free(&op->insert_keys, op->inline_keys);

        EBUG_ON(cl->parent);
        closure_debug_destroy(cl);
        if (op->end_io)
                op->end_io(op);
}
static noinline
int bch2_write_drop_io_error_ptrs(struct bch_write_op *op)
{
        struct keylist *keys = &op->insert_keys;
        struct bkey_i *src, *dst = keys->keys, *n;

        for (src = keys->keys; src != keys->top; src = n) {
                n = bkey_next(src);

                if (bkey_extent_is_direct_data(&src->k)) {
                        bch2_bkey_drop_ptrs(bkey_i_to_s(src), ptr,
                                            test_bit(ptr->dev, op->failed.d));

                        if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(src)))
                                return -EIO;
                }

                if (dst != src)
                        memmove_u64s_down(dst, src, src->k.u64s);
                dst = bkey_next(dst);
        }

        keys->top = dst;
        return 0;
}
/**
 * __bch2_write_index - after a write, update index to point to new data
 * @op:		bch_write_op to process
 */
static void __bch2_write_index(struct bch_write_op *op)
{
        struct bch_fs *c = op->c;
        struct keylist *keys = &op->insert_keys;
        unsigned dev;
        int ret = 0;

        if (unlikely(op->flags & BCH_WRITE_IO_ERROR)) {
                ret = bch2_write_drop_io_error_ptrs(op);
                if (ret)
                        goto err;
        }

        if (!bch2_keylist_empty(keys)) {
                u64 sectors_start = keylist_sectors(keys);

                ret = !(op->flags & BCH_WRITE_MOVE)
                        ? bch2_write_index_default(op)
                        : bch2_data_update_index_update(op);

                BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart));
                BUG_ON(keylist_sectors(keys) && !ret);

                op->written += sectors_start - keylist_sectors(keys);

                if (ret && !bch2_err_matches(ret, EROFS)) {
                        struct bkey_i *insert = bch2_keylist_front(&op->insert_keys);

                        bch_err_inum_offset_ratelimited(c,
                                insert->k.p.inode, insert->k.p.offset << 9,
                                "%s write error while doing btree update: %s",
                                op->flags & BCH_WRITE_MOVE ? "move" : "user",
                                bch2_err_str(ret));
                }

                if (ret)
                        goto err;
        }
out:
        /* If a bucket wasn't written, we can't erasure code it: */
        for_each_set_bit(dev, op->failed.d, BCH_SB_MEMBERS_MAX)
                bch2_open_bucket_write_error(c, &op->open_buckets, dev);

        bch2_open_buckets_put(c, &op->open_buckets);
        return;
err:
        keys->top = keys->keys;
        op->error = ret;
        op->flags |= BCH_WRITE_SUBMITTED;
        goto out;
}
static inline void __wp_update_state(struct write_point *wp, enum write_point_state state)
{
        if (state != wp->state) {
                u64 now = ktime_get_ns();

                if (wp->last_state_change &&
                    time_after64(now, wp->last_state_change))
                        wp->time[wp->state] += now - wp->last_state_change;
                wp->state = state;
                wp->last_state_change = now;
        }
}
static inline void wp_update_state(struct write_point *wp, bool running)
{
        enum write_point_state state;

        state = running                  ? WRITE_POINT_running :
                !list_empty(&wp->writes) ? WRITE_POINT_waiting_io
                                         : WRITE_POINT_stopped;

        __wp_update_state(wp, state);
}
static CLOSURE_CALLBACK(bch2_write_index)
{
        closure_type(op, struct bch_write_op, cl);
        struct write_point *wp = op->wp;
        struct workqueue_struct *wq = index_update_wq(op);
        unsigned long flags;

        if ((op->flags & BCH_WRITE_SUBMITTED) &&
            (op->flags & BCH_WRITE_MOVE))
                bch2_bio_free_pages_pool(op->c, &op->wbio.bio);

        spin_lock_irqsave(&wp->writes_lock, flags);
        if (wp->state == WRITE_POINT_waiting_io)
                __wp_update_state(wp, WRITE_POINT_waiting_work);
        list_add_tail(&op->wp_list, &wp->writes);
        spin_unlock_irqrestore(&wp->writes_lock, flags);

        queue_work(wq, &wp->index_update_work);
}
static inline void bch2_write_queue(struct bch_write_op *op, struct write_point *wp)
{
        op->wp = wp;

        if (wp->state == WRITE_POINT_stopped) {
                spin_lock_irq(&wp->writes_lock);
                __wp_update_state(wp, WRITE_POINT_waiting_io);
                spin_unlock_irq(&wp->writes_lock);
        }
}
void bch2_write_point_do_index_updates(struct work_struct *work)
{
        struct write_point *wp =
                container_of(work, struct write_point, index_update_work);
        struct bch_write_op *op;

        while (1) {
                spin_lock_irq(&wp->writes_lock);
                op = list_first_entry_or_null(&wp->writes, struct bch_write_op, wp_list);
                if (op)
                        list_del(&op->wp_list);
                wp_update_state(wp, op != NULL);
                spin_unlock_irq(&wp->writes_lock);

                if (!op)
                        break;

                op->flags |= BCH_WRITE_IN_WORKER;

                __bch2_write_index(op);

                if (!(op->flags & BCH_WRITE_SUBMITTED))
                        __bch2_write(op);
                else
                        bch2_write_done(&op->cl);
        }
}
static void bch2_write_endio(struct bio *bio)
{
        struct closure *cl              = bio->bi_private;
        struct bch_write_op *op         = container_of(cl, struct bch_write_op, cl);
        struct bch_write_bio *wbio      = to_wbio(bio);
        struct bch_write_bio *parent    = wbio->split ? wbio->parent : NULL;
        struct bch_fs *c                = wbio->c;
        struct bch_dev *ca              = wbio->have_ioref
                ? bch2_dev_have_ref(c, wbio->dev)
                : NULL;

        if (bch2_dev_inum_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write,
                                    op->pos.inode,
                                    wbio->inode_offset << 9,
                                    "data write error: %s",
                                    bch2_blk_status_to_str(bio->bi_status))) {
                set_bit(wbio->dev, op->failed.d);
                op->flags |= BCH_WRITE_IO_ERROR;
        }

        if (wbio->nocow) {
                bch2_bucket_nocow_unlock(&c->nocow_locks,
                                         POS(ca->dev_idx, wbio->nocow_bucket),
                                         BUCKET_NOCOW_LOCK_UPDATE);
                set_bit(wbio->dev, op->devs_need_flush->d);
        }

        if (wbio->have_ioref) {
                bch2_latency_acct(ca, wbio->submit_time, WRITE);
                percpu_ref_put(&ca->io_ref);
        }

        if (wbio->bounce)
                bch2_bio_free_pages_pool(c, bio);

        if (wbio->put_bio)
                bio_put(bio);

        if (parent)
                bio_endio(&parent->bio);
        else
                closure_put(cl);
}
static void init_append_extent(struct bch_write_op *op,
                               struct write_point *wp,
                               struct bversion version,
                               struct bch_extent_crc_unpacked crc)
{
        struct bkey_i_extent *e;

        op->pos.offset += crc.uncompressed_size;

        e = bkey_extent_init(op->insert_keys.top);
        e->k.p          = op->pos;
        e->k.size       = crc.uncompressed_size;
        e->k.bversion   = version;

        if (crc.csum_type ||
            crc.compression_type ||
            crc.nonce)
                bch2_extent_crc_append(&e->k_i, crc);

        bch2_alloc_sectors_append_ptrs_inlined(op->c, wp, &e->k_i, crc.compressed_size,
                                       op->flags & BCH_WRITE_CACHED);

        bch2_keylist_push(&op->insert_keys);
}
static struct bio *bch2_write_bio_alloc(struct bch_fs *c,
                                        struct write_point *wp,
                                        struct bio *src,
                                        bool *page_alloc_failed,
                                        void *buf)
{
        struct bch_write_bio *wbio;
        struct bio *bio;
        unsigned output_available =
                min(wp->sectors_free << 9, src->bi_iter.bi_size);
        unsigned pages = DIV_ROUND_UP(output_available +
                                      (buf
                                       ? ((unsigned long) buf & (PAGE_SIZE - 1))
                                       : 0), PAGE_SIZE);

        pages = min(pages, BIO_MAX_VECS);

        bio = bio_alloc_bioset(NULL, pages, 0,
                               GFP_NOFS, &c->bio_write);
        wbio                    = wbio_init(bio);
        wbio->put_bio           = true;
        /* copy WRITE_SYNC flag */
        wbio->bio.bi_opf        = src->bi_opf;

        if (buf) {
                bch2_bio_map(bio, buf, output_available);
                return bio;
        }

        wbio->bounce            = true;

        /*
         * We can't use mempool for more than c->sb.encoded_extent_max
         * worth of pages, but we'd like to allocate more if we can:
         */
        bch2_bio_alloc_pages_pool(c, bio,
                                  min_t(unsigned, output_available,
                                        c->opts.encoded_extent_max));

        if (bio->bi_iter.bi_size < output_available)
                *page_alloc_failed =
                        bch2_bio_alloc_pages(bio,
                                             output_available -
                                             bio->bi_iter.bi_size,
                                             GFP_NOFS) != 0;

        return bio;
}
static int bch2_write_rechecksum(struct bch_fs *c,
                                 struct bch_write_op *op,
                                 unsigned new_csum_type)
{
        struct bio *bio = &op->wbio.bio;
        struct bch_extent_crc_unpacked new_crc;
        int ret;

        /* bch2_rechecksum_bio() can't encrypt or decrypt data: */

        if (bch2_csum_type_is_encryption(op->crc.csum_type) !=
            bch2_csum_type_is_encryption(new_csum_type))
                new_csum_type = op->crc.csum_type;

        ret = bch2_rechecksum_bio(c, bio, op->version, op->crc,
                                  NULL, &new_crc,
                                  op->crc.offset, op->crc.live_size,
                                  new_csum_type);
        if (ret)
                return ret;

        bio_advance(bio, op->crc.offset << 9);
        bio->bi_iter.bi_size = op->crc.live_size << 9;
        op->crc = new_crc;
        return 0;
}
static int bch2_write_decrypt(struct bch_write_op *op)
{
        struct bch_fs *c = op->c;
        struct nonce nonce = extent_nonce(op->version, op->crc);
        struct bch_csum csum;
        int ret;

        if (!bch2_csum_type_is_encryption(op->crc.csum_type))
                return 0;

        /*
         * If we need to decrypt data in the write path, we'll no longer be able
         * to verify the existing checksum (poly1305 mac, in this case) after
         * it's decrypted - this is the last point we'll be able to reverify the
         * checksum:
         */
        csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, &op->wbio.bio);
        if (bch2_crc_cmp(op->crc.csum, csum) && !c->opts.no_data_io)
                return -EIO;

        ret = bch2_encrypt_bio(c, op->crc.csum_type, nonce, &op->wbio.bio);
        op->crc.csum_type = 0;
        op->crc.csum = (struct bch_csum) { 0, 0 };
        return ret;
}
static enum prep_encoded_ret {
        PREP_ENCODED_OK,
        PREP_ENCODED_ERR,
        PREP_ENCODED_CHECKSUM_ERR,
        PREP_ENCODED_DO_WRITE,
} bch2_write_prep_encoded_data(struct bch_write_op *op, struct write_point *wp)
{
        struct bch_fs *c = op->c;
        struct bio *bio = &op->wbio.bio;

        if (!(op->flags & BCH_WRITE_DATA_ENCODED))
                return PREP_ENCODED_OK;

        BUG_ON(bio_sectors(bio) != op->crc.compressed_size);

        /* Can we just write the entire extent as is? */
        if (op->crc.uncompressed_size == op->crc.live_size &&
            op->crc.uncompressed_size <= c->opts.encoded_extent_max >> 9 &&
            op->crc.compressed_size <= wp->sectors_free &&
            (op->crc.compression_type == bch2_compression_opt_to_type(op->compression_opt) ||
             op->incompressible)) {
                if (!crc_is_compressed(op->crc) &&
                    op->csum_type != op->crc.csum_type &&
                    bch2_write_rechecksum(c, op, op->csum_type) &&
                    !c->opts.no_data_io)
                        return PREP_ENCODED_CHECKSUM_ERR;

                return PREP_ENCODED_DO_WRITE;
        }

        /*
         * If the data is compressed and we couldn't write the entire extent as
         * is, we have to decompress it:
         */
        if (crc_is_compressed(op->crc)) {
                struct bch_csum csum;

                if (bch2_write_decrypt(op))
                        return PREP_ENCODED_CHECKSUM_ERR;

                /* Last point we can still verify checksum: */
                csum = bch2_checksum_bio(c, op->crc.csum_type,
                                         extent_nonce(op->version, op->crc),
                                         bio);
                if (bch2_crc_cmp(op->crc.csum, csum) && !c->opts.no_data_io)
                        return PREP_ENCODED_CHECKSUM_ERR;

                if (bch2_bio_uncompress_inplace(c, bio, &op->crc))
                        return PREP_ENCODED_ERR;
        }

        /*
         * No longer have compressed data after this point - data might be
         * encrypted:
         */

        /*
         * If the data is checksummed and we're only writing a subset,
         * rechecksum and adjust bio to point to currently live data:
         */
        if ((op->crc.live_size != op->crc.uncompressed_size ||
             op->crc.csum_type != op->csum_type) &&
            bch2_write_rechecksum(c, op, op->csum_type) &&
            !c->opts.no_data_io)
                return PREP_ENCODED_CHECKSUM_ERR;

        /*
         * If we want to compress the data, it has to be decrypted:
         */
        if ((op->compression_opt ||
             bch2_csum_type_is_encryption(op->crc.csum_type) !=
             bch2_csum_type_is_encryption(op->csum_type)) &&
            bch2_write_decrypt(op))
                return PREP_ENCODED_CHECKSUM_ERR;

        return PREP_ENCODED_OK;
}
static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
                             struct bio **_dst)
{
        struct bch_fs *c = op->c;
        struct bio *src = &op->wbio.bio, *dst = src;
        struct bvec_iter saved_iter;
        void *ec_buf;
        unsigned total_output = 0, total_input = 0;
        bool bounce = false;
        bool page_alloc_failed = false;
        int ret, more = 0;

        BUG_ON(!bio_sectors(src));

        ec_buf = bch2_writepoint_ec_buf(c, wp);

        switch (bch2_write_prep_encoded_data(op, wp)) {
        case PREP_ENCODED_OK:
                break;
        case PREP_ENCODED_ERR:
                ret = -EIO;
                goto err;
        case PREP_ENCODED_CHECKSUM_ERR:
                goto csum_err;
        case PREP_ENCODED_DO_WRITE:
                /* XXX look for bug here */
                if (ec_buf) {
                        dst = bch2_write_bio_alloc(c, wp, src,
                                                   &page_alloc_failed,
                                                   ec_buf);
                        bio_copy_data(dst, src);
                        bounce = true;
                }
                init_append_extent(op, wp, op->version, op->crc);
                goto do_write;
        }

        if (ec_buf ||
            op->compression_opt ||
            (op->csum_type &&
             !(op->flags & BCH_WRITE_PAGES_STABLE)) ||
            (bch2_csum_type_is_encryption(op->csum_type) &&
             !(op->flags & BCH_WRITE_PAGES_OWNED))) {
                dst = bch2_write_bio_alloc(c, wp, src,
                                           &page_alloc_failed,
                                           ec_buf);
                bounce = true;
        }

        saved_iter = dst->bi_iter;

        do {
                struct bch_extent_crc_unpacked crc = { 0 };
                struct bversion version = op->version;
                size_t dst_len = 0, src_len = 0;

                if (page_alloc_failed &&
                    dst->bi_iter.bi_size < (wp->sectors_free << 9) &&
                    dst->bi_iter.bi_size < c->opts.encoded_extent_max)
                        break;

                BUG_ON(op->compression_opt &&
                       (op->flags & BCH_WRITE_DATA_ENCODED) &&
                       bch2_csum_type_is_encryption(op->crc.csum_type));
                BUG_ON(op->compression_opt && !bounce);

                crc.compression_type = op->incompressible
                        ? BCH_COMPRESSION_TYPE_incompressible
                        : op->compression_opt
                        ? bch2_bio_compress(c, dst, &dst_len, src, &src_len,
                                            op->compression_opt)
                        : 0;
                if (!crc_is_compressed(crc)) {
                        dst_len = min(dst->bi_iter.bi_size, src->bi_iter.bi_size);
                        dst_len = min_t(unsigned, dst_len, wp->sectors_free << 9);

                        if (op->csum_type)
                                dst_len = min_t(unsigned, dst_len,
                                                c->opts.encoded_extent_max);

                        if (bounce) {
                                swap(dst->bi_iter.bi_size, dst_len);
                                bio_copy_data(dst, src);
                                swap(dst->bi_iter.bi_size, dst_len);
                        }

                        src_len = dst_len;
                }

                BUG_ON(!src_len || !dst_len);

                if (bch2_csum_type_is_encryption(op->csum_type)) {
                        if (bversion_zero(version)) {
                                version.lo = atomic64_inc_return(&c->key_version);
                        } else {
                                crc.nonce = op->nonce;
                                op->nonce += src_len >> 9;
                        }
                }

                if ((op->flags & BCH_WRITE_DATA_ENCODED) &&
                    !crc_is_compressed(crc) &&
                    bch2_csum_type_is_encryption(op->crc.csum_type) ==
                    bch2_csum_type_is_encryption(op->csum_type)) {
                        u8 compression_type = crc.compression_type;
                        u16 nonce = crc.nonce;
                        /*
                         * Note: when we're using rechecksum(), we need to be
                         * checksumming @src because it has all the data our
                         * existing checksum covers - if we bounced (because we
                         * were trying to compress), @dst will only have the
                         * part of the data the new checksum will cover.
                         *
                         * But normally we want to be checksumming post bounce,
                         * because part of the reason for bouncing is so the
                         * data can't be modified (by userspace) while it's in
                         * flight:
                         */
                        if (bch2_rechecksum_bio(c, src, version, op->crc,
                                        &crc, &op->crc,
                                        src_len >> 9,
                                        bio_sectors(src) - (src_len >> 9),
                                        op->csum_type))
                                goto csum_err;
                        /*
                         * bch2_rechecksum_bio() sets compression_type on crc
                         * from op->crc, this isn't always correct as sometimes
                         * we're changing an extent from uncompressed to
                         * incompressible.
                         */
                        crc.compression_type = compression_type;
                        crc.nonce = nonce;
                } else {
                        if ((op->flags & BCH_WRITE_DATA_ENCODED) &&
                            bch2_rechecksum_bio(c, src, version, op->crc,
                                        NULL, &op->crc,
                                        src_len >> 9,
                                        bio_sectors(src) - (src_len >> 9),
                                        op->crc.csum_type))
                                goto csum_err;

                        crc.compressed_size     = dst_len >> 9;
                        crc.uncompressed_size   = src_len >> 9;
                        crc.live_size           = src_len >> 9;

                        swap(dst->bi_iter.bi_size, dst_len);
                        ret = bch2_encrypt_bio(c, op->csum_type,
                                               extent_nonce(version, crc), dst);
                        if (ret)
                                goto err;

                        crc.csum = bch2_checksum_bio(c, op->csum_type,
                                                     extent_nonce(version, crc), dst);
                        crc.csum_type = op->csum_type;
                        swap(dst->bi_iter.bi_size, dst_len);
                }

                init_append_extent(op, wp, version, crc);

                if (dst != src)
                        bio_advance(dst, dst_len);
                bio_advance(src, src_len);
                total_output    += dst_len;
                total_input     += src_len;
        } while (dst->bi_iter.bi_size &&
                 src->bi_iter.bi_size &&
                 wp->sectors_free &&
                 !bch2_keylist_realloc(&op->insert_keys,
                                       op->inline_keys,
                                       ARRAY_SIZE(op->inline_keys),
                                       BKEY_EXTENT_U64s_MAX));

        more = src->bi_iter.bi_size != 0;

        dst->bi_iter = saved_iter;

        if (dst == src && more) {
                BUG_ON(total_output != total_input);

                dst = bio_split(src, total_input >> 9,
                                GFP_NOFS, &c->bio_write);
                wbio_init(dst)->put_bio = true;
                /* copy WRITE_SYNC flag */
                dst->bi_opf             = src->bi_opf;
        }

        dst->bi_iter.bi_size = total_output;
do_write:
        *_dst = dst;
        return more;
csum_err:
        bch_err_inum_offset_ratelimited(c,
                op->pos.inode,
                op->pos.offset << 9,
                "%s write error: error verifying existing checksum while rewriting existing data (memory corruption?)",
                op->flags & BCH_WRITE_MOVE ? "move" : "user");
        ret = -EIO;
err:
        if (to_wbio(dst)->bounce)
                bch2_bio_free_pages_pool(c, dst);
        if (to_wbio(dst)->put_bio)
                bio_put(dst);

        return ret;
}
static bool bch2_extent_is_writeable(struct bch_write_op *op,
                                     struct bkey_s_c k)
{
        struct bch_fs *c = op->c;
        struct bkey_s_c_extent e;
        struct extent_ptr_decoded p;
        const union bch_extent_entry *entry;
        unsigned replicas = 0;

        if (k.k->type != KEY_TYPE_extent)
                return false;

        e = bkey_s_c_to_extent(k);

        rcu_read_lock();
        extent_for_each_ptr_decode(e, p, entry) {
                if (crc_is_encoded(p.crc) || p.has_ec) {
                        rcu_read_unlock();
                        return false;
                }

                replicas += bch2_extent_ptr_durability(c, &p);
        }
        rcu_read_unlock();

        return replicas >= op->opts.data_replicas;
}
static int bch2_nocow_write_convert_one_unwritten(struct btree_trans *trans,
                                                  struct btree_iter *iter,
                                                  struct bkey_i *orig,
                                                  struct bkey_s_c k,
                                                  u64 new_i_size)
{
        if (!bch2_extents_match(bkey_i_to_s_c(orig), k)) {
                /* trying to reuse a bucket, but it's gone stale: */
                return 0;
        }

        struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k);
        int ret = PTR_ERR_OR_ZERO(new);
        if (ret)
                return ret;

        bch2_cut_front(bkey_start_pos(&orig->k), new);
        bch2_cut_back(orig->k.p, new);

        struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
        bkey_for_each_ptr(ptrs, ptr)
                ptr->unwritten = 0;

        /*
         * Note that we're not calling bch2_subvol_get_snapshot() in this path -
         * that was done when we kicked off the write, and here it's important
         * that we update the extent that we wrote to - even if a snapshot has
         * since been created. The write is still outstanding, so we're ok
         * w.r.t. snapshot atomicity:
         */
        return  bch2_extent_update_i_size_sectors(trans, iter,
                                min(new->k.p.offset << 9, new_i_size), 0) ?:
                bch2_trans_update(trans, iter, new,
                                  BTREE_UPDATE_internal_snapshot_node);
}
static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op)
{
        struct bch_fs *c = op->c;
        struct btree_trans *trans = bch2_trans_get(c);

        for_each_keylist_key(&op->insert_keys, orig) {
                int ret = for_each_btree_key_upto_commit(trans, iter, BTREE_ID_extents,
                                     bkey_start_pos(&orig->k), orig->k.p,
                                     BTREE_ITER_intent, k,
                                     NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
                        bch2_nocow_write_convert_one_unwritten(trans, &iter, orig, k, op->new_i_size);
                }));

                if (ret && !bch2_err_matches(ret, EROFS)) {
                        struct bkey_i *insert = bch2_keylist_front(&op->insert_keys);

                        bch_err_inum_offset_ratelimited(c,
                                insert->k.p.inode, insert->k.p.offset << 9,
                                "%s write error while doing btree update: %s",
                                op->flags & BCH_WRITE_MOVE ? "move" : "user",
                                bch2_err_str(ret));
                }

                if (ret) {
                        op->error = ret;
                        break;
                }
        }

        bch2_trans_put(trans);
}
static void __bch2_nocow_write_done(struct bch_write_op *op)
{
        if (unlikely(op->flags & BCH_WRITE_IO_ERROR)) {
                op->error = -EIO;
        } else if (unlikely(op->flags & BCH_WRITE_CONVERT_UNWRITTEN))
                bch2_nocow_write_convert_unwritten(op);
}

static CLOSURE_CALLBACK(bch2_nocow_write_done)
{
        closure_type(op, struct bch_write_op, cl);

        __bch2_nocow_write_done(op);
        bch2_write_done(cl);
}
struct bucket_to_lock {
        struct bpos             b;
        unsigned                gen;
        struct nocow_lock_bucket *l;
};
static void bch2_nocow_write(struct bch_write_op *op)
{
        struct bch_fs *c = op->c;
        struct btree_trans *trans;
        struct btree_iter iter;
        struct bkey_s_c k;
        DARRAY_PREALLOCATED(struct bucket_to_lock, 3) buckets;
        u32 snapshot;
        struct bucket_to_lock *stale_at;
        int stale, ret;

        if (op->flags & BCH_WRITE_MOVE)
                return;

        darray_init(&buckets);
        trans = bch2_trans_get(c);
retry:
        bch2_trans_begin(trans);

        ret = bch2_subvolume_get_snapshot(trans, op->subvol, &snapshot);
        if (unlikely(ret))
                goto err;

        bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
                             SPOS(op->pos.inode, op->pos.offset, snapshot),
                             BTREE_ITER_slots);
        while (1) {
                struct bio *bio = &op->wbio.bio;

                buckets.nr = 0;

                ret = bch2_trans_relock(trans);
                if (ret)
                        break;

                k = bch2_btree_iter_peek_slot(&iter);
                ret = bkey_err(k);
                if (ret)
                        break;

                /* fall back to normal cow write path? */
                if (unlikely(k.k->p.snapshot != snapshot ||
                             !bch2_extent_is_writeable(op, k)))
                        break;

                if (bch2_keylist_realloc(&op->insert_keys,
                                         op->inline_keys,
                                         ARRAY_SIZE(op->inline_keys),
                                         k.k->u64s))
                        break;

                /* Get iorefs before dropping btree locks: */
                struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
                bkey_for_each_ptr(ptrs, ptr) {
                        struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, WRITE);
                        if (unlikely(!ca))
                                goto err_get_ioref;

                        struct bpos b = PTR_BUCKET_POS(ca, ptr);
                        struct nocow_lock_bucket *l =
                                bucket_nocow_lock(&c->nocow_locks, bucket_to_u64(b));
                        prefetch(l);

                        /* XXX allocating memory with btree locks held - rare */
                        darray_push_gfp(&buckets, ((struct bucket_to_lock) {
                                                   .b = b, .gen = ptr->gen, .l = l,
                                                   }), GFP_KERNEL|__GFP_NOFAIL);

                        if (ptr->unwritten)
                                op->flags |= BCH_WRITE_CONVERT_UNWRITTEN;
                }

                /* Unlock before taking nocow locks, doing IO: */
                bkey_reassemble(op->insert_keys.top, k);
                bch2_trans_unlock(trans);

                bch2_cut_front(op->pos, op->insert_keys.top);
                if (op->flags & BCH_WRITE_CONVERT_UNWRITTEN)
                        bch2_cut_back(POS(op->pos.inode, op->pos.offset + bio_sectors(bio)), op->insert_keys.top);

                darray_for_each(buckets, i) {
                        struct bch_dev *ca = bch2_dev_have_ref(c, i->b.inode);

                        __bch2_bucket_nocow_lock(&c->nocow_locks, i->l,
                                                 bucket_to_u64(i->b),
                                                 BUCKET_NOCOW_LOCK_UPDATE);

                        int gen = bucket_gen_get(ca, i->b.offset);
                        stale = gen < 0 ? gen : gen_after(gen, i->gen);
                        if (unlikely(stale)) {
                                stale_at = i;
                                goto err_bucket_stale;
                        }
                }

                bio = &op->wbio.bio;
                if (k.k->p.offset < op->pos.offset + bio_sectors(bio)) {
                        bio = bio_split(bio, k.k->p.offset - op->pos.offset,
                                        GFP_KERNEL, &c->bio_write);
                        wbio_init(bio)->put_bio = true;
                        bio->bi_opf = op->wbio.bio.bi_opf;
                } else {
                        op->flags |= BCH_WRITE_SUBMITTED;
                }

                op->pos.offset += bio_sectors(bio);
                op->written += bio_sectors(bio);

                bio->bi_end_io  = bch2_write_endio;
                bio->bi_private = &op->cl;
                bio->bi_opf |= REQ_OP_WRITE;
                closure_get(&op->cl);
                bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_user,
                                          op->insert_keys.top, true);

                bch2_keylist_push(&op->insert_keys);
                if (op->flags & BCH_WRITE_SUBMITTED)
                        break;
                bch2_btree_iter_advance(&iter);
        }
out:
        bch2_trans_iter_exit(trans, &iter);
err:
        if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                goto retry;

        if (ret) {
                bch_err_inum_offset_ratelimited(c,
                        op->pos.inode, op->pos.offset << 9,
                        "%s: btree lookup error %s", __func__, bch2_err_str(ret));
                op->error = ret;
                op->flags |= BCH_WRITE_SUBMITTED;
        }

        bch2_trans_put(trans);
        darray_exit(&buckets);

        /* fallback to cow write path? */
        if (!(op->flags & BCH_WRITE_SUBMITTED)) {
                closure_sync(&op->cl);
                __bch2_nocow_write_done(op);
                op->insert_keys.top = op->insert_keys.keys;
        } else if (op->flags & BCH_WRITE_SYNC) {
                closure_sync(&op->cl);
                bch2_nocow_write_done(&op->cl.work);
        } else {
                /*
                 * XXX
                 * needs to run out of process context because ei_quota_lock is
                 * a mutex
                 */
                continue_at(&op->cl, bch2_nocow_write_done, index_update_wq(op));
        }
        return;
err_get_ioref:
        darray_for_each(buckets, i)
                percpu_ref_put(&bch2_dev_have_ref(c, i->b.inode)->io_ref);

        /* Fall back to COW path: */
        goto out;
err_bucket_stale:
        darray_for_each(buckets, i) {
                bch2_bucket_nocow_unlock(&c->nocow_locks, i->b, BUCKET_NOCOW_LOCK_UPDATE);
                if (i == stale_at)
                        break;
        }

        struct printbuf buf = PRINTBUF;
        if (bch2_fs_inconsistent_on(stale < 0, c,
                                    "pointer to invalid bucket in nocow path on device %llu\n  %s",
                                    stale_at->b.inode,
                                    (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
                ret = -EIO;
        } else {
                /* We can retry this: */
                ret = -BCH_ERR_transaction_restart;
        }
        printbuf_exit(&buf);

        goto err_get_ioref;
}
static void __bch2_write(struct bch_write_op *op)
{
        struct bch_fs *c = op->c;
        struct write_point *wp = NULL;
        struct bio *bio = NULL;
        unsigned nofs_flags;
        int ret;

        nofs_flags = memalloc_nofs_save();

        if (unlikely(op->opts.nocow && c->opts.nocow_enabled)) {
                bch2_nocow_write(op);
                if (op->flags & BCH_WRITE_SUBMITTED)
                        goto out_nofs_restore;
        }
again:
        memset(&op->failed, 0, sizeof(op->failed));

        do {
                struct bkey_i *key_to_write;
                unsigned key_to_write_offset = op->insert_keys.top_p -
                        op->insert_keys.keys_p;

                /* +1 for possible cache device: */
                if (op->open_buckets.nr + op->nr_replicas + 1 >
                    ARRAY_SIZE(op->open_buckets.v))
                        break;

                if (bch2_keylist_realloc(&op->insert_keys,
                                        op->inline_keys,
                                        ARRAY_SIZE(op->inline_keys),
                                        BKEY_EXTENT_U64s_MAX))
                        break;

                /*
                 * The copygc thread is now global, which means it's no longer
                 * freeing up space on specific disks, which means that
                 * allocations for specific disks may hang arbitrarily long:
                 */
                ret = bch2_trans_run(c, lockrestart_do(trans,
                        bch2_alloc_sectors_start_trans(trans,
                                op->target,
                                op->opts.erasure_code && !(op->flags & BCH_WRITE_CACHED),
                                op->write_point,
                                &op->devs_have,
                                op->nr_replicas,
                                op->nr_replicas_required,
                                op->watermark,
                                op->flags,
                                &op->cl, &wp)));
                if (unlikely(ret)) {
                        if (bch2_err_matches(ret, BCH_ERR_operation_blocked))
                                break;

                        goto err;
                }

                EBUG_ON(!wp);

                bch2_open_bucket_get(c, wp, &op->open_buckets);
                ret = bch2_write_extent(op, wp, &bio);

                bch2_alloc_sectors_done_inlined(c, wp);
err:
                if (ret <= 0) {
                        op->flags |= BCH_WRITE_SUBMITTED;

                        if (ret < 0) {
                                if (!(op->flags & BCH_WRITE_ALLOC_NOWAIT))
                                        bch_err_inum_offset_ratelimited(c,
                                                op->pos.inode,
                                                op->pos.offset << 9,
                                                "%s(): %s error: %s", __func__,
                                                op->flags & BCH_WRITE_MOVE ? "move" : "user",
                                                bch2_err_str(ret));
                                op->error = ret;
                                break;
                        }
                }

                bio->bi_end_io  = bch2_write_endio;
                bio->bi_private = &op->cl;
                bio->bi_opf |= REQ_OP_WRITE;

                closure_get(bio->bi_private);

                key_to_write = (void *) (op->insert_keys.keys_p +
                                         key_to_write_offset);

                bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_user,
                                          key_to_write, false);
        } while (ret);

        /*
         * Sync or no?
         *
         * If we're running asynchronously, we may still want to block
         * synchronously here if we weren't able to submit all of the IO at
         * once, as that signals backpressure to the caller.
         */
        if ((op->flags & BCH_WRITE_SYNC) ||
            (!(op->flags & BCH_WRITE_SUBMITTED) &&
             !(op->flags & BCH_WRITE_IN_WORKER))) {
                bch2_wait_on_allocator(c, &op->cl);

                __bch2_write_index(op);

                if (!(op->flags & BCH_WRITE_SUBMITTED))
                        goto again;
                bch2_write_done(&op->cl);
        } else {
                bch2_write_queue(op, wp);
                continue_at(&op->cl, bch2_write_index, NULL);
        }
out_nofs_restore:
        memalloc_nofs_restore(nofs_flags);
}
static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len)
{
        struct bio *bio = &op->wbio.bio;
        struct bvec_iter iter;
        struct bkey_i_inline_data *id;
        unsigned sectors;
        int ret;

        memset(&op->failed, 0, sizeof(op->failed));

        op->flags |= BCH_WRITE_WROTE_DATA_INLINE;
        op->flags |= BCH_WRITE_SUBMITTED;

        bch2_check_set_feature(op->c, BCH_FEATURE_inline_data);

        ret = bch2_keylist_realloc(&op->insert_keys, op->inline_keys,
                                   ARRAY_SIZE(op->inline_keys),
                                   BKEY_U64s + DIV_ROUND_UP(data_len, 8));
        if (ret) {
                op->error = ret;
                goto err;
        }

        sectors = bio_sectors(bio);
        op->pos.offset += sectors;

        id = bkey_inline_data_init(op->insert_keys.top);
        id->k.p         = op->pos;
        id->k.bversion  = op->version;
        id->k.size      = sectors;

        iter = bio->bi_iter;
        iter.bi_size = data_len;
        memcpy_from_bio(id->v.data, bio, iter);

        while (data_len & 7)
                id->v.data[data_len++] = '\0';
        set_bkey_val_bytes(&id->k, data_len);
        bch2_keylist_push(&op->insert_keys);

        __bch2_write_index(op);
err:
        bch2_write_done(&op->cl);
}
/**
 * bch2_write() - handle a write to a cache device or flash only volume
 * @cl:		&bch_write_op->cl
 *
 * This is the starting point for any data to end up in a cache device; it could
 * be from a normal write, or a writeback write, or a write to a flash only
 * volume - it's also used by the moving garbage collector to compact data in
 * mostly empty buckets.
 *
 * It first writes the data to the cache, creating a list of keys to be inserted
 * (if the data won't fit in a single open bucket, there will be multiple keys);
 * after the data is written it calls bch_journal, and after the keys have been
 * added to the next journal write they're inserted into the btree.
 *
 * If op->discard is true, instead of inserting the data it invalidates the
 * region of the cache represented by op->bio and op->inode.
 */
CLOSURE_CALLBACK(bch2_write)
{
        closure_type(op, struct bch_write_op, cl);
        struct bio *bio = &op->wbio.bio;
        struct bch_fs *c = op->c;
        unsigned data_len;

        EBUG_ON(op->cl.parent);
        BUG_ON(!op->nr_replicas);
        BUG_ON(!op->write_point.v);
        BUG_ON(bkey_eq(op->pos, POS_MAX));

        if (op->flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)
                op->flags |= BCH_WRITE_ALLOC_NOWAIT;

        op->nr_replicas_required = min_t(unsigned, op->nr_replicas_required, op->nr_replicas);
        op->start_time = local_clock();
        bch2_keylist_init(&op->insert_keys, op->inline_keys);
        wbio_init(bio)->put_bio = false;

        if (bio->bi_iter.bi_size & (c->opts.block_size - 1)) {
                bch_err_inum_offset_ratelimited(c,
                        op->pos.inode,
                        op->pos.offset << 9,
                        "%s write error: misaligned write",
                        op->flags & BCH_WRITE_MOVE ? "move" : "user");
                op->error = -EIO;
                goto err;
        }

        if (c->opts.nochanges) {
                op->error = -BCH_ERR_erofs_no_writes;
                goto err;
        }

        if (!(op->flags & BCH_WRITE_MOVE) &&
            !bch2_write_ref_tryget(c, BCH_WRITE_REF_write)) {
                op->error = -BCH_ERR_erofs_no_writes;
                goto err;
        }

        this_cpu_add(c->counters[BCH_COUNTER_io_write], bio_sectors(bio));
        bch2_increment_clock(c, bio_sectors(bio), WRITE);

        data_len = min_t(u64, bio->bi_iter.bi_size,
                         op->new_i_size - (op->pos.offset << 9));

        if (c->opts.inline_data &&
            data_len <= min(block_bytes(c) / 2, 1024U)) {
                bch2_write_data_inline(op, data_len);
                return;
        }

        __bch2_write(op);
        return;
err:
        bch2_disk_reservation_put(c, &op->res);

        closure_debug_destroy(&op->cl);
        if (op->end_io)
                op->end_io(op);
}
static const char * const bch2_write_flags[] = {
#define x(f)	#f,
        BCH_WRITE_FLAGS()
#undef x
        NULL
};

void bch2_write_op_to_text(struct printbuf *out, struct bch_write_op *op)
{
        prt_str(out, "pos: ");
        bch2_bpos_to_text(out, op->pos);
        prt_newline(out);
        printbuf_indent_add(out, 2);

        prt_str(out, "started: ");
        bch2_pr_time_units(out, local_clock() - op->start_time);
        prt_newline(out);

        prt_str(out, "flags: ");
        prt_bitflags(out, bch2_write_flags, op->flags);
        prt_newline(out);

        prt_printf(out, "ref: %u\n", closure_nr_remaining(&op->cl));

        printbuf_indent_sub(out, 2);
}
void bch2_fs_io_write_exit(struct bch_fs *c)
{
        mempool_exit(&c->bio_bounce_pages);
        bioset_exit(&c->replica_set);
        bioset_exit(&c->bio_write);
}
int bch2_fs_io_write_init(struct bch_fs *c)
{
        if (bioset_init(&c->bio_write,   1, offsetof(struct bch_write_bio, bio), BIOSET_NEED_BVECS) ||
            bioset_init(&c->replica_set, 4, offsetof(struct bch_write_bio, bio), 0))
                return -BCH_ERR_ENOMEM_bio_write_init;

        if (mempool_init_page_pool(&c->bio_bounce_pages,
                                   max_t(unsigned,
                                         c->opts.btree_node_size,
                                         c->opts.encoded_extent_max) /
                                   PAGE_SIZE, 0))
                return -BCH_ERR_ENOMEM_bio_bounce_pages_init;

        return 0;
}