// SPDX-License-Identifier: GPL-2.0

#include "alloc_foreground.h"
#include "btree_iter.h"
#include "btree_journal_iter.h"
#include "btree_key_cache.h"
#include "btree_update_interior.h"
#include "btree_write_buffer.h"
#include "disk_accounting.h"
#include "journal_io.h"
#include "journal_reclaim.h"

#include <linux/prefetch.h>

static const char * const trans_commit_flags_strs[] = {
	BCH_TRANS_COMMIT_FLAGS()
};

void bch2_trans_commit_flags_to_text(struct printbuf *out, enum bch_trans_commit_flags flags)
{
	enum bch_watermark watermark = flags & BCH_WATERMARK_MASK;

	prt_printf(out, "watermark=%s", bch2_watermarks[watermark]);

	flags >>= BCH_WATERMARK_BITS;
	if (flags) {
		prt_char(out, ' ');
		bch2_prt_bitflags(out, trans_commit_flags_strs, flags);
	}
}

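/*
 * Debug-only check that the old key recorded in a btree_insert_entry still
 * matches what's currently in the btree (or in the journal keys, if journal
 * replay hasn't finished yet).
 */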
static void verify_update_old_key(struct btree_trans *trans, struct btree_insert_entry *i)
{
#ifdef CONFIG_BCACHEFS_DEBUG
	struct bch_fs *c = trans->c;
	struct bkey u;
	struct bkey_s_c k = bch2_btree_path_peek_slot_exact(trans->paths + i->path, &u);

	if (unlikely(trans->journal_replay_not_finished)) {
		struct bkey_i *j_k =
			bch2_journal_keys_peek_slot(c, i->btree_id, i->level, i->k->k.p);

		if (j_k)
			k = bkey_i_to_s_c(j_k);
	}

	u = *k.k;
	u.needs_whiteout = i->old_k.needs_whiteout;

	BUG_ON(memcmp(&i->old_k, &u, sizeof(struct bkey)));
	BUG_ON(i->old_v != k.v);
#endif
}

static inline struct btree_path_level *insert_l(struct btree_trans *trans, struct btree_insert_entry *i)
{
	return (trans->paths + i->path)->l + i->level;
}

static inline bool same_leaf_as_prev(struct btree_trans *trans,
				     struct btree_insert_entry *i)
{
	return i != trans->updates &&
		insert_l(trans, &i[0])->b == insert_l(trans, &i[-1])->b;
}

static inline bool same_leaf_as_next(struct btree_trans *trans,
				     struct btree_insert_entry *i)
{
	return i + 1 < trans->updates + trans->nr_updates &&
		insert_l(trans, &i[0])->b == insert_l(trans, &i[1])->b;
}

inline void bch2_btree_node_prep_for_write(struct btree_trans *trans,
					   struct btree_path *path,
					   struct btree *b)
{
	struct bch_fs *c = trans->c;

	if (unlikely(btree_node_just_written(b)) &&
	    bch2_btree_post_write_cleanup(c, b))
		bch2_trans_node_reinit_iter(trans, b);

	/*
	 * If the last bset has been written, or if it's gotten too big - start
	 * a new bset to insert into:
	 */
	if (want_new_bset(c, b))
		bch2_btree_init_next(trans, b);
}

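/*
 * Taking a write lock failed partway through: unwind the write locks already
 * taken on earlier updates' nodes and restart the transaction.
 */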
static noinline int trans_lock_write_fail(struct btree_trans *trans, struct btree_insert_entry *i)
{
	while (--i >= trans->updates) {
		if (same_leaf_as_prev(trans, i))
			continue;

		bch2_btree_node_unlock_write(trans, trans->paths + i->path, insert_l(trans, i)->b);
	}

	trace_and_count(trans->c, trans_restart_would_deadlock_write, trans);
	return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock_write);
}

static inline int bch2_trans_lock_write(struct btree_trans *trans)
{
	EBUG_ON(trans->write_locked);

	trans_for_each_update(trans, i) {
		if (same_leaf_as_prev(trans, i))
			continue;

		if (bch2_btree_node_lock_write(trans, trans->paths + i->path, &insert_l(trans, i)->b->c))
			return trans_lock_write_fail(trans, i);

		bch2_btree_node_prep_for_write(trans, trans->paths + i->path, insert_l(trans, i)->b);
	}

	trans->write_locked = true;
	return 0;
}

static inline void bch2_trans_unlock_write(struct btree_trans *trans)
{
	if (likely(trans->write_locked)) {
		trans_for_each_update(trans, i)
			if (btree_node_locked_type(trans->paths + i->path, i->level) ==
			    BTREE_NODE_WRITE_LOCKED)
				bch2_btree_node_unlock_write_inlined(trans,
						trans->paths + i->path, insert_l(trans, i)->b);
		trans->write_locked = false;
	}
}

/* Inserting into a given leaf node (last stage of insert): */

/* Handle overwrites and do insert, for non extents: */
bool bch2_btree_bset_insert_key(struct btree_trans *trans,
				struct btree_path *path,
				struct btree *b,
				struct btree_node_iter *node_iter,
				struct bkey_i *insert)
{
	struct bkey_packed *k;
	unsigned clobber_u64s = 0, new_u64s = 0;

	EBUG_ON(btree_node_just_written(b));
	EBUG_ON(bset_written(b, btree_bset_last(b)));
	EBUG_ON(bkey_deleted(&insert->k) && bkey_val_u64s(&insert->k));
	EBUG_ON(bpos_lt(insert->k.p, b->data->min_key));
	EBUG_ON(bpos_gt(insert->k.p, b->data->max_key));
	EBUG_ON(insert->k.u64s > bch2_btree_keys_u64s_remaining(b));
	EBUG_ON(!b->c.level && !bpos_eq(insert->k.p, path->pos));

	k = bch2_btree_node_iter_peek_all(node_iter, b);
	if (k && bkey_cmp_left_packed(b, k, &insert->k.p))
		k = NULL;

	/* @k is the key being overwritten/deleted, if any: */
	EBUG_ON(k && bkey_deleted(k));

	/* Deleting, but not found? nothing to do: */
	if (bkey_deleted(&insert->k) && !k)
		return false;

	if (bkey_deleted(&insert->k)) {
		/* Deleting: */
		btree_account_key_drop(b, k);
		k->type = KEY_TYPE_deleted;

		if (k->needs_whiteout)
			push_whiteout(b, insert->k.p);
		k->needs_whiteout = false;

		if (k >= btree_bset_last(b)->start) {
			clobber_u64s = k->u64s;
			bch2_bset_delete(b, k, clobber_u64s);
			goto fix_iter;
		} else {
			bch2_btree_path_fix_key_modified(trans, b, k);
		}

		return true;
	}

	if (k) {
		/* Overwriting: */
		btree_account_key_drop(b, k);
		k->type = KEY_TYPE_deleted;

		insert->k.needs_whiteout = k->needs_whiteout;
		k->needs_whiteout = false;

		if (k >= btree_bset_last(b)->start) {
			clobber_u64s = k->u64s;
			goto overwrite;
		} else {
			bch2_btree_path_fix_key_modified(trans, b, k);
		}
	}

	k = bch2_btree_node_iter_bset_pos(node_iter, b, bset_tree_last(b));
overwrite:
	bch2_bset_insert(b, k, insert, clobber_u64s);
	new_u64s = k->u64s;
fix_iter:
	if (clobber_u64s != new_u64s)
		bch2_btree_node_iter_fix(trans, path, b, node_iter, k,
					 clobber_u64s, new_u64s);
	return true;
}

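/*
 * Journal pin flush callback for a btree node write: if the node is still
 * dirty for this journal sequence number, mark it as needing a write for
 * journal reclaim and issue the write if required.
 */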
static int __btree_node_flush(struct journal *j, struct journal_entry_pin *pin,
			      unsigned i, u64 seq)
{
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	struct btree_write *w = container_of(pin, struct btree_write, journal);
	struct btree *b = container_of(w, struct btree, writes[i]);
	struct btree_trans *trans = bch2_trans_get(c);
	unsigned long old, new;
	unsigned idx = w - b->writes;

	btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);

	old = READ_ONCE(b->flags);
	do {
		new = old;

		if (!(old & (1 << BTREE_NODE_dirty)) ||
		    !!(old & (1 << BTREE_NODE_write_idx)) != idx ||
		    w->journal.seq != seq)
			break;

		new &= ~BTREE_WRITE_TYPE_MASK;
		new |= BTREE_WRITE_journal_reclaim;
		new |= 1 << BTREE_NODE_need_write;
	} while (!try_cmpxchg(&b->flags, &old, new));

	btree_node_write_if_need(c, b, SIX_LOCK_read);
	six_unlock_read(&b->c.lock);

	bch2_trans_put(trans);
	return 0;
}

int bch2_btree_node_flush0(struct journal *j, struct journal_entry_pin *pin, u64 seq)
{
	return __btree_node_flush(j, pin, 0, seq);
}

int bch2_btree_node_flush1(struct journal *j, struct journal_entry_pin *pin, u64 seq)
{
	return __btree_node_flush(j, pin, 1, seq);
}

inline void bch2_btree_add_journal_pin(struct bch_fs *c,
				       struct btree *b, u64 seq)
{
	struct btree_write *w = btree_current_write(b);

	bch2_journal_pin_add(&c->journal, seq, &w->journal,
			     btree_node_write_idx(b) == 0
			     ? bch2_btree_node_flush0
			     : bch2_btree_node_flush1);
}

/**
 * bch2_btree_insert_key_leaf() - insert one key into a leaf node
 * @trans:		btree transaction object
 * @path:		path pointing to @insert's pos
 * @insert:		key to insert
 * @journal_seq:	sequence number of journal reservation
 */
inline void bch2_btree_insert_key_leaf(struct btree_trans *trans,
				       struct btree_path *path,
				       struct bkey_i *insert,
				       u64 journal_seq)
{
	struct bch_fs *c = trans->c;
	struct btree *b = path_l(path)->b;
	struct bset_tree *t = bset_tree_last(b);
	struct bset *i = bset(b, t);
	int old_u64s = bset_u64s(t);
	int old_live_u64s = b->nr.live_u64s;
	int live_u64s_added, u64s_added;

	if (unlikely(!bch2_btree_bset_insert_key(trans, path, b,
					&path_l(path)->iter, insert)))
		return;

	i->journal_seq = cpu_to_le64(max(journal_seq, le64_to_cpu(i->journal_seq)));

	bch2_btree_add_journal_pin(c, b, journal_seq);

	if (unlikely(!btree_node_dirty(b))) {
		EBUG_ON(test_bit(BCH_FS_clean_shutdown, &c->flags));
		set_btree_node_dirty_acct(c, b);
	}

	live_u64s_added = (int) b->nr.live_u64s - old_live_u64s;
	u64s_added = (int) bset_u64s(t) - old_u64s;

	if (b->sib_u64s[0] != U16_MAX && live_u64s_added < 0)
		b->sib_u64s[0] = max(0, (int) b->sib_u64s[0] + live_u64s_added);
	if (b->sib_u64s[1] != U16_MAX && live_u64s_added < 0)
		b->sib_u64s[1] = max(0, (int) b->sib_u64s[1] + live_u64s_added);

	if (u64s_added > live_u64s_added &&
	    bch2_maybe_compact_whiteouts(c, b))
		bch2_trans_node_reinit_iter(trans, b);
}

/* Cached btree updates: */

/* Normal update interface: */

static inline void btree_insert_entry_checks(struct btree_trans *trans,
					     struct btree_insert_entry *i)
{
	struct btree_path *path = trans->paths + i->path;

	BUG_ON(!bpos_eq(i->k->k.p, path->pos));
	BUG_ON(i->cached	!= path->cached);
	BUG_ON(i->level		!= path->level);
	BUG_ON(i->btree_id	!= path->btree_id);
	EBUG_ON(!i->level &&
		btree_type_has_snapshots(i->btree_id) &&
		!(i->flags & BTREE_UPDATE_internal_snapshot_node) &&
		test_bit(JOURNAL_replay_done, &trans->c->journal.flags) &&
		i->k->k.p.snapshot &&
		bch2_snapshot_is_internal_node(trans->c, i->k->k.p.snapshot) > 0);
}

static __always_inline int bch2_trans_journal_res_get(struct btree_trans *trans,
						      unsigned flags)
{
	return bch2_journal_res_get(&trans->c->journal, &trans->journal_res,
				    trans->journal_u64s, flags);
}

#define JSET_ENTRY_LOG_U64s		4

static noinline void journal_transaction_name(struct btree_trans *trans)
{
	struct bch_fs *c = trans->c;
	struct journal *j = &c->journal;
	struct jset_entry *entry =
		bch2_journal_add_entry(j, &trans->journal_res,
				       BCH_JSET_ENTRY_log, 0, 0,
				       JSET_ENTRY_LOG_U64s);
	struct jset_entry_log *l =
		container_of(entry, struct jset_entry_log, entry);

	strncpy(l->d, trans->fn, JSET_ENTRY_LOG_U64s * sizeof(u64));
}

static inline int btree_key_can_insert(struct btree_trans *trans,
				       struct btree *b, unsigned u64s)
{
	if (!bch2_btree_node_insert_fits(b, u64s))
		return -BCH_ERR_btree_insert_btree_node_full;

	return 0;
}

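/*
 * Growing a key cache key's buffer with GFP_NOWAIT failed: drop locks and
 * retry the allocation with GFP_KERNEL, then relock and repoint any old_v
 * pointers that referred to the old buffer.
 */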
static noinline int
btree_key_can_insert_cached_slowpath(struct btree_trans *trans, unsigned flags,
				     struct btree_path *path, unsigned new_u64s)
{
	struct bkey_cached *ck = (void *) path->l[0].b;
	struct bkey_i *new_k;
	int ret;

	bch2_trans_unlock_write(trans);
	bch2_trans_unlock(trans);

	new_k = kmalloc(new_u64s * sizeof(u64), GFP_KERNEL);
	if (!new_k) {
		bch_err(trans->c, "error allocating memory for key cache key, btree %s u64s %u",
			bch2_btree_id_str(path->btree_id), new_u64s);
		return -BCH_ERR_ENOMEM_btree_key_cache_insert;
	}

	ret =   bch2_trans_relock(trans) ?:
		bch2_trans_lock_write(trans);
	if (unlikely(ret)) {
		kfree(new_k);
		return ret;
	}

	memcpy(new_k, ck->k, ck->u64s * sizeof(u64));

	trans_for_each_update(trans, i)
		if (i->old_v == &ck->k->v)
			i->old_v = &new_k->v;

	kfree(ck->k);
	ck->u64s	= new_u64s;
	ck->k		= new_k;
	return 0;
}

static int btree_key_can_insert_cached(struct btree_trans *trans, unsigned flags,
				       struct btree_path *path, unsigned u64s)
{
	struct bch_fs *c = trans->c;
	struct bkey_cached *ck = (void *) path->l[0].b;
	unsigned new_u64s;
	struct bkey_i *new_k;
	unsigned watermark = flags & BCH_WATERMARK_MASK;

	EBUG_ON(path->level);

	if (watermark < BCH_WATERMARK_reclaim &&
	    !test_bit(BKEY_CACHED_DIRTY, &ck->flags) &&
	    bch2_btree_key_cache_must_wait(c))
		return -BCH_ERR_btree_insert_need_journal_reclaim;

	/*
	 * bch2_varint_decode can read past the end of the buffer by at most 7
	 * bytes (it won't be used):
	 */
	u64s += 1;

	if (u64s <= ck->u64s)
		return 0;

	new_u64s	= roundup_pow_of_two(u64s);
	new_k		= krealloc(ck->k, new_u64s * sizeof(u64), GFP_NOWAIT|__GFP_NOWARN);
	if (unlikely(!new_k))
		return btree_key_can_insert_cached_slowpath(trans, flags, path, new_u64s);

	trans_for_each_update(trans, i)
		if (i->old_v == &ck->k->v)
			i->old_v = &new_k->v;

	ck->u64s	= new_u64s;
	ck->k		= new_k;
	return 0;
}

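/*
 * Run the atomic (in-memory) trigger for a single update; when the old and new
 * key types share the same trigger, a single combined insert+overwrite call is
 * made instead of two separate ones.
 */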
static int run_one_mem_trigger(struct btree_trans *trans,
			       struct btree_insert_entry *i,
			       unsigned flags)
{
	verify_update_old_key(trans, i);

	if (unlikely(flags & BTREE_TRIGGER_norun))
		return 0;

	struct bkey_s_c old = { &i->old_k, i->old_v };
	struct bkey_i *new = i->k;
	const struct bkey_ops *old_ops = bch2_bkey_type_ops(old.k->type);
	const struct bkey_ops *new_ops = bch2_bkey_type_ops(i->k->k.type);

	if (old_ops->trigger == new_ops->trigger)
		return bch2_key_trigger(trans, i->btree_id, i->level,
				old, bkey_i_to_s(new),
				BTREE_TRIGGER_insert|BTREE_TRIGGER_overwrite|flags);

	return bch2_key_trigger_new(trans, i->btree_id, i->level,
				bkey_i_to_s(new), flags) ?:
	       bch2_key_trigger_old(trans, i->btree_id, i->level,
				old, flags);
}

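/*
 * Run the transactional trigger for a single update: returns 1 if a trigger
 * actually ran (so the caller keeps looping), 0 if there was nothing to do,
 * or a negative error code.
 */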
static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_entry *i,
				 bool overwrite)
{
	verify_update_old_key(trans, i);

	if ((i->flags & BTREE_TRIGGER_norun) ||
	    !btree_node_type_has_trans_triggers(i->bkey_type))
		return 0;

	/*
	 * Transactional triggers create new btree_insert_entries, so we can't
	 * pass them a pointer to a btree_insert_entry, that memory is going to
	 * move:
	 */
	struct bkey old_k = i->old_k;
	struct bkey_s_c old = { &old_k, i->old_v };
	const struct bkey_ops *old_ops = bch2_bkey_type_ops(old.k->type);
	const struct bkey_ops *new_ops = bch2_bkey_type_ops(i->k->k.type);
	unsigned flags = i->flags|BTREE_TRIGGER_transactional;

	if (!i->insert_trigger_run &&
	    !i->overwrite_trigger_run &&
	    old_ops->trigger == new_ops->trigger) {
		i->overwrite_trigger_run = true;
		i->insert_trigger_run = true;
		return bch2_key_trigger(trans, i->btree_id, i->level, old, bkey_i_to_s(i->k),
					BTREE_TRIGGER_insert|
					BTREE_TRIGGER_overwrite|flags) ?: 1;
	} else if (overwrite && !i->overwrite_trigger_run) {
		i->overwrite_trigger_run = true;
		return bch2_key_trigger_old(trans, i->btree_id, i->level, old, flags) ?: 1;
	} else if (!overwrite && !i->insert_trigger_run) {
		i->insert_trigger_run = true;
		return bch2_key_trigger_new(trans, i->btree_id, i->level, bkey_i_to_s(i->k), flags) ?: 1;
	}

	return 0;
}

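/*
 * Run transactional triggers for every update in @trans against a single
 * btree, looping until no new triggers fire (running a trigger may append
 * more updates to the transaction).
 */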
static int run_btree_triggers(struct btree_trans *trans, enum btree_id btree_id,
			      unsigned btree_id_start)
{
	for (int overwrite = 1; overwrite >= 0; --overwrite) {
		bool trans_trigger_run;

		/*
		 * Running triggers will append more updates to the list of updates as
		 * we're walking it:
		 */
		do {
			trans_trigger_run = false;

			for (unsigned i = btree_id_start;
			     i < trans->nr_updates && trans->updates[i].btree_id <= btree_id;
			     i++) {
				if (trans->updates[i].btree_id != btree_id)
					continue;

				int ret = run_one_trans_trigger(trans, trans->updates + i, overwrite);
				if (ret < 0)
					return ret;
				if (ret)
					trans_trigger_run = true;
			}
		} while (trans_trigger_run);
	}

	return 0;
}

static int bch2_trans_commit_run_triggers(struct btree_trans *trans)
{
	unsigned btree_id = 0, btree_id_start = 0;
	int ret = 0;

	/*
	 * For a given btree, this algorithm runs insert triggers before
	 * overwrite triggers: this is so that when extents are being moved
	 * (e.g. by FALLOCATE_FL_INSERT_RANGE), we don't drop references before
	 * they are re-added.
	 */
	for (btree_id = 0; btree_id < BTREE_ID_NR; btree_id++) {
		if (btree_id == BTREE_ID_alloc)
			continue;

		while (btree_id_start < trans->nr_updates &&
		       trans->updates[btree_id_start].btree_id < btree_id)
			btree_id_start++;

		ret = run_btree_triggers(trans, btree_id, btree_id_start);
		if (ret)
			return ret;
	}

	for (unsigned idx = 0; idx < trans->nr_updates; idx++) {
		struct btree_insert_entry *i = trans->updates + idx;

		if (i->btree_id > BTREE_ID_alloc)
			break;
		if (i->btree_id == BTREE_ID_alloc) {
			ret = run_btree_triggers(trans, BTREE_ID_alloc, idx);
			if (ret)
				return ret;
			break;
		}
	}

#ifdef CONFIG_BCACHEFS_DEBUG
	trans_for_each_update(trans, i)
		BUG_ON(!(i->flags & BTREE_TRIGGER_norun) &&
		       btree_node_type_has_trans_triggers(i->bkey_type) &&
		       (!i->insert_trigger_run || !i->overwrite_trigger_run));
#endif
	return 0;
}

static noinline int bch2_trans_commit_run_gc_triggers(struct btree_trans *trans)
{
	trans_for_each_update(trans, i)
		if (btree_node_type_has_triggers(i->bkey_type) &&
		    gc_visited(trans->c, gc_pos_btree(i->btree_id, i->level, i->k->k.p))) {
			int ret = run_one_mem_trigger(trans, i, i->flags|BTREE_TRIGGER_gc);
			if (ret)
				return ret;
		}

	return 0;
}

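/*
 * Construct a unique bversion for an accounting key from its position in the
 * journal reservation: the journal sequence number plus the key's offset
 * within the reservation.
 */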
static struct bversion journal_pos_to_bversion(struct journal_res *res, unsigned offset)
{
	return (struct bversion) {
		.hi = res->seq >> 32,
		.lo = (res->seq << 32) | (res->offset + offset),
	};
}

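/*
 * The part of the commit path that runs with btree node write locks held:
 * check that the inserts fit, get the journal reservation, run triggers and
 * accounting, then do the actual btree and journal updates. Once the journal
 * reservation has been taken we're not allowed to fail.
 */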
static inline int
bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
			       struct btree_insert_entry **stopped_at,
			       unsigned long trace_ip)
{
	struct bch_fs *c = trans->c;
	struct btree_trans_commit_hook *h;
	unsigned u64s = 0;
	int ret = 0;

	bch2_trans_verify_not_unlocked(trans);
	bch2_trans_verify_not_in_restart(trans);

	if (race_fault()) {
		trace_and_count(c, trans_restart_fault_inject, trans, trace_ip);
		return btree_trans_restart_nounlock(trans, BCH_ERR_transaction_restart_fault_inject);
	}

	/*
	 * Check if the insert will fit in the leaf node with the write lock
	 * held, otherwise another thread could write the node changing the
	 * amount of space available:
	 */

	prefetch(&trans->c->journal.flags);

	trans_for_each_update(trans, i) {
		/* Multiple inserts might go to same leaf: */
		if (!same_leaf_as_prev(trans, i))
			u64s = 0;

		u64s += i->k->k.u64s;
		ret = !i->cached
			? btree_key_can_insert(trans, insert_l(trans, i)->b, u64s)
			: btree_key_can_insert_cached(trans, flags, trans->paths + i->path, u64s);
		if (ret) {
			*stopped_at = i;
			return ret;
		}

		i->k->k.needs_whiteout = false;
	}

	/*
	 * Don't get journal reservation until after we know insert will
	 * succeed:
	 */
	if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res))) {
		ret = bch2_trans_journal_res_get(trans,
				(flags & BCH_WATERMARK_MASK)|
				JOURNAL_RES_GET_NONBLOCK);
		if (ret)
			return ret;

		if (unlikely(trans->journal_transaction_names))
			journal_transaction_name(trans);
	}

	/*
	 * Not allowed to fail after we've gotten our journal reservation - we
	 * have to use it:
	 */

	if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) &&
	    !(flags & BCH_TRANS_COMMIT_no_journal_res)) {
		if (bch2_journal_seq_verify)
			trans_for_each_update(trans, i)
				i->k->k.bversion.lo = trans->journal_res.seq;
		else if (bch2_inject_invalid_keys)
			trans_for_each_update(trans, i)
				i->k->k.bversion = MAX_VERSION;
	}

	h = trans->hooks;
	while (h) {
		ret = h->fn(trans, h);
		if (ret)
			return ret;
		h = h->next;
	}

	struct jset_entry *entry = trans->journal_entries;

	percpu_down_read(&c->mark_lock);

	for (entry = trans->journal_entries;
	     entry != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s);
	     entry = vstruct_next(entry))
		if (entry->type == BCH_JSET_ENTRY_write_buffer_keys &&
		    entry->start->k.type == KEY_TYPE_accounting) {
			BUG_ON(!trans->journal_res.ref);

			struct bkey_i_accounting *a = bkey_i_to_accounting(entry->start);

			a->k.bversion = journal_pos_to_bversion(&trans->journal_res,
								(u64 *) entry - (u64 *) trans->journal_entries);
			BUG_ON(bversion_zero(a->k.bversion));

			if (likely(!(flags & BCH_TRANS_COMMIT_skip_accounting_apply))) {
				ret = bch2_accounting_mem_mod_locked(trans, accounting_i_to_s_c(a), BCH_ACCOUNTING_normal);
				if (ret)
					goto revert_fs_usage;
			}
		}
	percpu_up_read(&c->mark_lock);

	/* XXX: we only want to run this if deltas are nonzero */
	bch2_trans_account_disk_usage_change(trans);

	trans_for_each_update(trans, i)
		if (btree_node_type_has_atomic_triggers(i->bkey_type)) {
			ret = run_one_mem_trigger(trans, i, BTREE_TRIGGER_atomic|i->flags);
			if (ret)
				goto fatal_err;
		}

	if (unlikely(c->gc_pos.phase)) {
		ret = bch2_trans_commit_run_gc_triggers(trans);
		if (ret)
			goto fatal_err;
	}

	trans_for_each_update(trans, i) {
		enum bch_validate_flags invalid_flags = 0;

		if (!(flags & BCH_TRANS_COMMIT_no_journal_res))
			invalid_flags |= BCH_VALIDATE_write|BCH_VALIDATE_commit;

		ret = bch2_bkey_validate(c, bkey_i_to_s_c(i->k),
					 i->bkey_type, invalid_flags);
		if (unlikely(ret)) {
			bch2_trans_inconsistent(trans, "invalid bkey on insert from %s -> %ps\n",
						trans->fn, (void *) i->ip_allocated);
			goto fatal_err;
		}
		btree_insert_entry_checks(trans, i);
	}

	for (struct jset_entry *i = trans->journal_entries;
	     i != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s);
	     i = vstruct_next(i)) {
		enum bch_validate_flags invalid_flags = 0;

		if (!(flags & BCH_TRANS_COMMIT_no_journal_res))
			invalid_flags |= BCH_VALIDATE_write|BCH_VALIDATE_commit;

		ret = bch2_journal_entry_validate(c, NULL, i,
						  bcachefs_metadata_version_current,
						  CPU_BIG_ENDIAN, invalid_flags);
		if (unlikely(ret)) {
			bch2_trans_inconsistent(trans, "invalid journal entry on insert from %s\n",
						trans->fn);
			goto fatal_err;
		}
	}

	if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res))) {
		struct journal *j = &c->journal;
		struct jset_entry *entry;

		trans_for_each_update(trans, i) {
			if (i->key_cache_already_flushed)
				continue;

			if (i->flags & BTREE_UPDATE_nojournal)
				continue;

			verify_update_old_key(trans, i);

			if (trans->journal_transaction_names) {
				entry = bch2_journal_add_entry(j, &trans->journal_res,
						       BCH_JSET_ENTRY_overwrite,
						       i->btree_id, i->level,
						       i->old_k.u64s);
				bkey_reassemble((struct bkey_i *) entry->start,
						(struct bkey_s_c) { &i->old_k, i->old_v });
			}

			entry = bch2_journal_add_entry(j, &trans->journal_res,
					       BCH_JSET_ENTRY_btree_keys,
					       i->btree_id, i->level,
					       i->k->k.u64s);
			bkey_copy((struct bkey_i *) entry->start, i->k);
		}

		memcpy_u64s_small(journal_res_entry(&c->journal, &trans->journal_res),
				  trans->journal_entries,
				  trans->journal_entries_u64s);

		trans->journal_res.offset	+= trans->journal_entries_u64s;
		trans->journal_res.u64s		-= trans->journal_entries_u64s;

		if (trans->journal_seq)
			*trans->journal_seq = trans->journal_res.seq;
	}

	trans_for_each_update(trans, i) {
		struct btree_path *path = trans->paths + i->path;

		if (!i->cached)
			bch2_btree_insert_key_leaf(trans, path, i->k, trans->journal_res.seq);
		else if (!i->key_cache_already_flushed)
			bch2_btree_insert_key_cached(trans, flags, i);
		else
			bch2_btree_key_cache_drop(trans, path);
	}

	return 0;
fatal_err:
	bch2_fs_fatal_error(c, "fatal error in transaction commit: %s", bch2_err_str(ret));
	percpu_down_read(&c->mark_lock);
revert_fs_usage:
	for (struct jset_entry *entry2 = trans->journal_entries;
	     entry2 != entry;
	     entry2 = vstruct_next(entry2))
		if (entry2->type == BCH_JSET_ENTRY_write_buffer_keys &&
		    entry2->start->k.type == KEY_TYPE_accounting) {
			struct bkey_s_accounting a = bkey_i_to_s_accounting(entry2->start);

			bch2_accounting_neg(a);
			bch2_accounting_mem_mod_locked(trans, a.c, BCH_ACCOUNTING_normal);
			bch2_accounting_neg(a);
		}
	percpu_up_read(&c->mark_lock);
	return ret;
}

static noinline void bch2_drop_overwrites_from_journal(struct btree_trans *trans)
{
	/*
	 * Accounting keys aren't deduped in the journal: we have to compare
	 * each individual update against what's in the btree to see if it has
	 * been applied yet, and accounting updates also don't overwrite,
	 * they're deltas that accumulate.
	 */
	trans_for_each_update(trans, i)
		if (i->k->k.type != KEY_TYPE_accounting)
			bch2_journal_key_overwritten(trans->c, i->btree_id, i->level, i->k->k.p);
}

static int bch2_trans_commit_journal_pin_flush(struct journal *j,
				struct journal_entry_pin *_pin, u64 seq)
{
	return 0;
}

/*
 * Get journal reservation, take write locks, and attempt to do btree update(s):
 */
static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags,
				       struct btree_insert_entry **stopped_at,
				       unsigned long trace_ip)
{
	struct bch_fs *c = trans->c;
	int ret = 0, u64s_delta = 0;

	for (unsigned idx = 0; idx < trans->nr_updates; idx++) {
		struct btree_insert_entry *i = trans->updates + idx;

		if (i->cached)
			continue;

		u64s_delta += !bkey_deleted(&i->k->k) ? i->k->k.u64s : 0;
		u64s_delta -= i->old_btree_u64s;

		if (!same_leaf_as_next(trans, i)) {
			if (u64s_delta <= 0) {
				ret = bch2_foreground_maybe_merge(trans, i->path,
							i->level, flags);
				if (unlikely(ret))
					return ret;
			}

			u64s_delta = 0;
		}
	}

	ret = bch2_trans_lock_write(trans);
	if (unlikely(ret))
		return ret;

	ret = bch2_trans_commit_write_locked(trans, flags, stopped_at, trace_ip);

	if (!ret && unlikely(trans->journal_replay_not_finished))
		bch2_drop_overwrites_from_journal(trans);

	bch2_trans_unlock_write(trans);

	if (!ret && trans->journal_pin)
		bch2_journal_pin_add(&c->journal, trans->journal_res.seq,
				     trans->journal_pin,
				     bch2_trans_commit_journal_pin_flush);

	/*
	 * Drop journal reservation after dropping write locks, since dropping
	 * the journal reservation may kick off a journal write:
	 */
	if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res)))
		bch2_journal_res_put(&c->journal, &trans->journal_res);

	return ret;
}

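/*
 * Wait condition used when the commit is blocked on journal reclaim: we're
 * done on journal error or once enough of the btree key cache has been
 * flushed; while still waiting, kick journal reclaim again.
 */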
static int journal_reclaim_wait_done(struct bch_fs *c)
{
	int ret = bch2_journal_error(&c->journal) ?:
		bch2_btree_key_cache_wait_done(c);

	if (!ret)
		journal_reclaim_kick(&c->journal);
	return ret;
}

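/*
 * Handle an error from the commit path: depending on the error, split the
 * leaf node, update replicas/accounting, retry the journal reservation, or
 * wait on journal reclaim. Returns 0 if the commit should be retried.
 */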
static noinline
int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags,
			    struct btree_insert_entry *i,
			    int ret, unsigned long trace_ip)
{
	struct bch_fs *c = trans->c;
	enum bch_watermark watermark = flags & BCH_WATERMARK_MASK;

	switch (ret) {
	case -BCH_ERR_btree_insert_btree_node_full:
		ret = bch2_btree_split_leaf(trans, i->path, flags);
		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			trace_and_count(c, trans_restart_btree_node_split, trans,
					trace_ip, trans->paths + i->path);
		break;
	case -BCH_ERR_btree_insert_need_mark_replicas:
		ret = drop_locks_do(trans,
			bch2_accounting_update_sb(trans));
		break;
	case -BCH_ERR_journal_res_get_blocked:
		/*
		 * XXX: this should probably be a separate BTREE_INSERT_NONBLOCK
		 * flag
		 */
		if ((flags & BCH_TRANS_COMMIT_journal_reclaim) &&
		    watermark < BCH_WATERMARK_reclaim) {
			ret = -BCH_ERR_journal_reclaim_would_deadlock;
			break;
		}

		ret = drop_locks_do(trans,
			bch2_trans_journal_res_get(trans,
					(flags & BCH_WATERMARK_MASK)|
					JOURNAL_RES_GET_CHECK));
		break;
	case -BCH_ERR_btree_insert_need_journal_reclaim:
		bch2_trans_unlock(trans);

		trace_and_count(c, trans_blocked_journal_reclaim, trans, trace_ip);
		track_event_change(&c->times[BCH_TIME_blocked_key_cache_flush], true);

		wait_event_freezable(c->journal.reclaim_wait,
				     (ret = journal_reclaim_wait_done(c)));

		track_event_change(&c->times[BCH_TIME_blocked_key_cache_flush], false);

		if (ret < 0)
			break;

		ret = bch2_trans_relock(trans);
		break;
	default:
		BUG_ON(ret >= 0);
		break;
	}

	BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted);

	bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOSPC) &&
				(flags & BCH_TRANS_COMMIT_no_enospc), c,
		"%s: incorrectly got %s\n", __func__, bch2_err_str(ret));

	return ret;
}

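/*
 * Called when a commit is attempted before the filesystem has gone
 * read-write: with BCH_TRANS_COMMIT_lazy_rw (and only before the filesystem
 * has started), go read-write early and take a write ref; otherwise fail
 * with -BCH_ERR_erofs_trans_commit.
 */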
static noinline int
bch2_trans_commit_get_rw_cold(struct btree_trans *trans, unsigned flags)
{
	struct bch_fs *c = trans->c;
	int ret;

	if (likely(!(flags & BCH_TRANS_COMMIT_lazy_rw)) ||
	    test_bit(BCH_FS_started, &c->flags))
		return -BCH_ERR_erofs_trans_commit;

	ret = drop_locks_do(trans, bch2_fs_read_write_early(c));
	if (ret)
		return ret;

	bch2_write_ref_get(c, BCH_WRITE_REF_trans);
	return 0;
}

/*
 * This is for updates done in the early part of fsck - btree_gc - before we've
 * gone RW. we only add the new key to the list of keys for journal replay to
 * do.
 */
static noinline int
do_bch2_trans_commit_to_journal_replay(struct btree_trans *trans)
{
	struct bch_fs *c = trans->c;

	trans_for_each_update(trans, i) {
		int ret = bch2_journal_key_insert(c, i->btree_id, i->level, i->k);
		if (ret)
			return ret;
	}

	for (struct jset_entry *i = trans->journal_entries;
	     i != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s);
	     i = vstruct_next(i))
		if (i->type == BCH_JSET_ENTRY_btree_keys ||
		    i->type == BCH_JSET_ENTRY_write_buffer_keys) {
			int ret = bch2_journal_key_insert(c, i->btree_id, i->level, i->start);
			if (ret)
				return ret;
		}

	return 0;
}

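/*
 * Main transaction commit path: run triggers, upgrade and lock btree paths,
 * size the journal reservation, then do the commit, handling errors and
 * retries via bch2_trans_commit_error().
 */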
int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
{
	struct btree_insert_entry *errored_at = NULL;
	struct bch_fs *c = trans->c;
	int ret = 0;

	bch2_trans_verify_not_unlocked(trans);
	bch2_trans_verify_not_in_restart(trans);

	if (!trans->nr_updates &&
	    !trans->journal_entries_u64s)
		goto out_reset;

	ret = bch2_trans_commit_run_triggers(trans);
	if (ret)
		goto out_reset;

	if (unlikely(!test_bit(BCH_FS_may_go_rw, &c->flags))) {
		ret = do_bch2_trans_commit_to_journal_replay(trans);
		goto out_reset;
	}

	if (!(flags & BCH_TRANS_COMMIT_no_check_rw) &&
	    unlikely(!bch2_write_ref_tryget(c, BCH_WRITE_REF_trans))) {
		ret = bch2_trans_commit_get_rw_cold(trans, flags);
		if (ret)
			goto out_reset;
	}

	EBUG_ON(test_bit(BCH_FS_clean_shutdown, &c->flags));

	trans->journal_u64s		= trans->journal_entries_u64s;
	trans->journal_transaction_names = READ_ONCE(c->opts.journal_transaction_names);
	if (trans->journal_transaction_names)
		trans->journal_u64s += jset_u64s(JSET_ENTRY_LOG_U64s);

	trans_for_each_update(trans, i) {
		struct btree_path *path = trans->paths + i->path;

		EBUG_ON(!path->should_be_locked);

		ret = bch2_btree_path_upgrade(trans, path, i->level + 1);
		if (unlikely(ret))
			goto out;

		EBUG_ON(!btree_node_intent_locked(path, i->level));

		if (i->key_cache_already_flushed)
			continue;

		if (i->flags & BTREE_UPDATE_nojournal)
			continue;

		/* we're going to journal the key being updated: */
		trans->journal_u64s += jset_u64s(i->k->k.u64s);

		/* and we're also going to log the overwrite: */
		if (trans->journal_transaction_names)
			trans->journal_u64s += jset_u64s(i->old_k.u64s);
	}

	if (trans->extra_disk_res) {
		ret = bch2_disk_reservation_add(c, trans->disk_res,
						trans->extra_disk_res,
						(flags & BCH_TRANS_COMMIT_no_enospc)
						? BCH_DISK_RESERVATION_NOFAIL : 0);
		if (ret)
			goto err;
	}
retry:
	bch2_trans_verify_not_unlocked(trans);
	bch2_trans_verify_not_in_restart(trans);
	if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res)))
		memset(&trans->journal_res, 0, sizeof(trans->journal_res));
	memset(&trans->fs_usage_delta, 0, sizeof(trans->fs_usage_delta));

	ret = do_bch2_trans_commit(trans, flags, &errored_at, _RET_IP_);

	/* make sure we didn't drop or screw up locks: */
	bch2_trans_verify_locks(trans);

	if (ret)
		goto err;

	trace_and_count(c, transaction_commit, trans, _RET_IP_);
out:
	if (likely(!(flags & BCH_TRANS_COMMIT_no_check_rw)))
		bch2_write_ref_put(c, BCH_WRITE_REF_trans);
out_reset:
	if (!ret)
		bch2_trans_downgrade(trans);
	bch2_trans_reset_updates(trans);

	return ret;
err:
	ret = bch2_trans_commit_error(trans, flags, errored_at, ret, _RET_IP_);
	if (ret)
		goto out;

	/*
	 * We might have done another transaction commit in the error path -
	 * i.e. btree write buffer flush - which will have made use of
	 * trans->journal_res, but with BCH_TRANS_COMMIT_no_journal_res that is
	 * how the journal sequence number to pin is passed in - so we must
	 * restart:
	 */
	if (flags & BCH_TRANS_COMMIT_no_journal_res) {
		ret = -BCH_ERR_transaction_restart_nested;