// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "btree_locking.h"
#include "btree_types.h"
static struct lock_class_key bch2_btree_node_lock_key;

void bch2_btree_lock_init(struct btree_bkey_cached_common *b,
			  enum six_lock_init_flags flags)
{
	__six_lock_init(&b->lock, "b->c.lock", &bch2_btree_node_lock_key, flags);
	lockdep_set_notrack_class(&b->lock);
}
/* Btree node locking: */
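/*
 * Count how many locks of each six lock type @trans already holds on node @b
 * through its btree_paths, optionally ignoring @skip - used below when we
 * need to account for (or temporarily hide) our own read locks on a node.
 */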
struct six_lock_count bch2_btree_node_lock_counts(struct btree_trans *trans,
						  struct btree_path *skip,
						  struct btree_bkey_cached_common *b,
						  unsigned level)
{
	struct btree_path *path;
	struct six_lock_count ret;
	unsigned i;

	memset(&ret, 0, sizeof(ret));

	if (IS_ERR_OR_NULL(b))
		return ret;

	trans_for_each_path(trans, path, i)
		if (path != skip && &path->l[level].b->c == b) {
			int t = btree_node_locked_type(path, level);

			if (t != BTREE_NODE_UNLOCKED)
				ret.n[t]++;
		}

	return ret;
}
void bch2_btree_node_unlock_write(struct btree_trans *trans,
				  struct btree_path *path, struct btree *b)
{
	bch2_btree_node_unlock_write_inlined(trans, path, b);
}
/*
 * Deadlock avoidance: an entry in the lock graph records that
 * @trans wants to lock @b with type @type:
 */
struct trans_waiting_for_lock {
	struct btree_trans		*trans;
	struct btree_bkey_cached_common	*node_want;
	enum six_lock_type		lock_want;

	/* for iterating over held locks: */
	btree_path_idx_t		path_idx;
	unsigned			level;
	u64				lock_start_time;
};

struct lock_graph {
	struct trans_waiting_for_lock	g[8];
	unsigned			nr;
};
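/*
 * Deadlock cycle detection works on the waits-for graph: "transaction X is
 * blocked on a node lock held by transaction Y".  struct lock_graph is the
 * explicit DFS stack of transactions currently being walked - at most 8 deep,
 * see lock_graph_descend() - and every entry holds a closure ref on its
 * transaction so it can't be freed while we're looking at it.
 */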
static noinline void print_cycle(struct printbuf *out, struct lock_graph *g)
{
	struct trans_waiting_for_lock *i;

	prt_printf(out, "Found lock cycle (%u entries):\n", g->nr);

	for (i = g->g; i < g->g + g->nr; i++) {
		struct task_struct *task = READ_ONCE(i->trans->locking_wait.task);

		bch2_btree_trans_to_text(out, i->trans);
		bch2_prt_task_backtrace(out, task, i == g->g ? 5 : 1, GFP_NOWAIT);
	}
}
static noinline void print_chain(struct printbuf *out, struct lock_graph *g)
{
	struct trans_waiting_for_lock *i;

	for (i = g->g; i != g->g + g->nr; i++) {
		struct task_struct *task = i->trans->locking_wait.task;

		prt_printf(out, "%u ", task ? task->pid : 0);
	}
}
static void lock_graph_up(struct lock_graph *g)
{
	closure_put(&g->g[--g->nr].trans->ref);
}
static noinline void lock_graph_pop_all(struct lock_graph *g)
{
	while (g->nr)
		lock_graph_up(g);
}
static void __lock_graph_down(struct lock_graph *g, struct btree_trans *trans)
{
	g->g[g->nr++] = (struct trans_waiting_for_lock) {
		.trans		= trans,
		.node_want	= trans->locking,
		.lock_want	= trans->locking_wait.lock_want,
	};
}
static void lock_graph_down(struct lock_graph *g, struct btree_trans *trans)
{
	closure_get(&trans->ref);
	__lock_graph_down(g, trans);
}
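/*
 * __lock_graph_down() expects the caller to already hold a closure ref on
 * @trans (bch2_check_for_deadlock() takes one before descending); the
 * non-underscore variant takes the ref itself.
 */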
static bool lock_graph_remove_non_waiters(struct lock_graph *g)
{
	struct trans_waiting_for_lock *i;

	for (i = g->g + 1; i < g->g + g->nr; i++)
		if (i->trans->locking != i->node_want ||
		    i->trans->locking_wait.start_time != i[-1].lock_start_time) {
			while (g->g + g->nr > i)
				lock_graph_up(g);
			return true;
		}

	return false;
}
static void trace_would_deadlock(struct lock_graph *g, struct btree_trans *trans)
{
	struct bch_fs *c = trans->c;

	count_event(c, trans_restart_would_deadlock);

	if (trace_trans_restart_would_deadlock_enabled()) {
		struct printbuf buf = PRINTBUF;

		print_cycle(&buf, g);

		trace_trans_restart_would_deadlock(trans, buf.buf);
		printbuf_exit(&buf);
	}
}
static int abort_lock(struct lock_graph *g, struct trans_waiting_for_lock *i)
{
	if (i == g->g) {
		trace_would_deadlock(g, i->trans);
		return btree_trans_restart(i->trans, BCH_ERR_transaction_restart_would_deadlock);
	} else {
		i->trans->lock_must_abort = true;
		wake_up_process(i->trans->locking_wait.task);
		return 0;
	}
}
static int btree_trans_abort_preference(struct btree_trans *trans)
{
	if (trans->lock_may_not_fail)
		return 0;
	if (trans->locking_wait.lock_want == SIX_LOCK_write)
		return 1;
	if (!trans->in_traverse_all)
		return 2;
	return 3;
}
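/*
 * A higher preference means a better victim for break_cycle(): 0 means this
 * transaction may not fail and must never be picked - a cycle made up
 * entirely of such transactions is the fatal "cycle of nofail locks" case.
 */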
static noinline int break_cycle(struct lock_graph *g, struct printbuf *cycle)
{
	struct trans_waiting_for_lock *i, *abort = NULL;
	unsigned best = 0, pref;
	int ret = 0;

	if (lock_graph_remove_non_waiters(g))
		return 0;

	if (cycle) {
		/* Only checking, for debugfs: */
		print_cycle(cycle, g);
		ret = -1;
		goto out;
	}

	for (i = g->g; i < g->g + g->nr; i++) {
		pref = btree_trans_abort_preference(i->trans);
		if (pref > best) {
			abort = i;
			best = pref;
		}
	}

	if (unlikely(!best)) {
		struct printbuf buf = PRINTBUF;

		prt_printf(&buf, bch2_fmt(g->g->trans->c, "cycle of nofail locks"));

		for (i = g->g; i < g->g + g->nr; i++) {
			struct btree_trans *trans = i->trans;

			bch2_btree_trans_to_text(&buf, trans);

			prt_printf(&buf, "backtrace:\n");
			printbuf_indent_add(&buf, 2);
			bch2_prt_task_backtrace(&buf, trans->locking_wait.task, 2, GFP_NOWAIT);
			printbuf_indent_sub(&buf, 2);
		}

		bch2_print_string_as_lines_nonblocking(KERN_ERR, buf.buf);
		printbuf_exit(&buf);
		BUG();
	}

	ret = abort_lock(g, abort);
out:
	if (ret)
		while (g->nr)
			lock_graph_up(g);
	return ret;
}
static int lock_graph_descend(struct lock_graph *g, struct btree_trans *trans,
			      struct printbuf *cycle)
{
	struct btree_trans *orig_trans = g->g->trans;
	struct trans_waiting_for_lock *i;

	for (i = g->g; i < g->g + g->nr; i++)
		if (i->trans == trans) {
			closure_put(&trans->ref);
			return break_cycle(g, cycle);
		}

	if (g->nr == ARRAY_SIZE(g->g)) {
		closure_put(&trans->ref);

		if (orig_trans->lock_may_not_fail)
			return 0;

		while (g->nr)
			lock_graph_up(g);

		if (cycle)
			return 0;

		trace_and_count(trans->c, trans_restart_would_deadlock_recursion_limit, trans, _RET_IP_);
		return btree_trans_restart(orig_trans, BCH_ERR_transaction_restart_deadlock_recursion_limit);
	}

	__lock_graph_down(g, trans);
	return 0;
}
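/*
 * Six locks come in three types - read, intent, write.  read/read and
 * read/intent don't conflict; any combination involving a write lock, or two
 * intent locks, does:
 */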
static bool lock_type_conflicts(enum six_lock_type t1, enum six_lock_type t2)
{
	return t1 + t2 > 1;
}
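/*
 * Walk the waits-for graph starting from @trans, using lock_graph as an
 * explicit DFS stack, and break any cycle that is found.  Returns a
 * transaction_restart error if @trans itself should abort; when called from
 * debugfs with a @cycle printbuf, cycles are only reported, not broken.
 */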
int bch2_check_for_deadlock(struct btree_trans *trans, struct printbuf *cycle)
{
	struct lock_graph g;
	struct trans_waiting_for_lock *top;
	struct btree_bkey_cached_common *b;
	btree_path_idx_t path_idx;
	int ret = 0;

	g.nr = 0;

	if (trans->lock_must_abort) {
		trace_would_deadlock(&g, trans);
		return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock);
	}

	lock_graph_down(&g, trans);

	/* trans->paths is rcu protected vs. freeing */
	rcu_read_lock();
next:
	if (!g.nr)
		goto out;

	top = &g.g[g.nr - 1];

	struct btree_path *paths = rcu_dereference(top->trans->paths);
	unsigned long *paths_allocated = trans_paths_allocated(paths);

	trans_for_each_path_idx_from(paths_allocated, *trans_paths_nr(paths),
				     path_idx, top->path_idx) {
		struct btree_path *path = paths + path_idx;

		if (!path->nodes_locked)
			continue;

		if (path_idx != top->path_idx) {
			top->path_idx		= path_idx;
			top->level		= 0;
			top->lock_start_time	= 0;
		}

		for (;
		     top->level < BTREE_MAX_DEPTH;
		     top->level++, top->lock_start_time = 0) {
			int lock_held = btree_node_locked_type(path, top->level);

			if (lock_held == BTREE_NODE_UNLOCKED)
				continue;

			b = &READ_ONCE(path->l[top->level].b)->c;

			if (IS_ERR_OR_NULL(b)) {
				/*
				 * If we get here, it means we raced with the
				 * other thread updating its btree_path
				 * structures - which means it can't be blocked
				 * waiting on a lock:
				 */
				if (!lock_graph_remove_non_waiters(&g)) {
					/*
					 * If lock_graph_remove_non_waiters()
					 * didn't do anything, it must be
					 * because we're being called by debugfs
					 * checking for lock cycles, which
					 * invokes us on btree_transactions that
					 * aren't actually waiting on anything.
					 */
					lock_graph_pop_all(&g);
				}

				goto next;
			}

			if (list_empty_careful(&b->lock.wait_list))
				continue;

			raw_spin_lock(&b->lock.wait_lock);
			list_for_each_entry(trans, &b->lock.wait_list, locking_wait.list) {
				BUG_ON(b != trans->locking);

				if (top->lock_start_time &&
				    time_after_eq64(top->lock_start_time, trans->locking_wait.start_time))
					continue;

				top->lock_start_time = trans->locking_wait.start_time;

				/* Don't check for self deadlock: */
				if (trans == top->trans ||
				    !lock_type_conflicts(lock_held, trans->locking_wait.lock_want))
					continue;

				closure_get(&trans->ref);
				raw_spin_unlock(&b->lock.wait_lock);

				ret = lock_graph_descend(&g, trans, cycle);
				if (ret)
					goto out;
				goto next;
			}
			raw_spin_unlock(&b->lock.wait_lock);
		}
	}

	if (g.nr > 1 && cycle)
		print_chain(cycle, &g);
	lock_graph_up(&g);
	goto next;
out:
	rcu_read_unlock();
	return ret;
}
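/*
 * Deadlock check in callback form, for use by the six lock code while a
 * btree transaction blocks on a node lock - hence the void *p argument and
 * the NULL printbuf (this is not a debugfs report).
 */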
int bch2_six_check_for_deadlock(struct six_lock *lock, void *p)
{
	struct btree_trans *trans = p;

	return bch2_check_for_deadlock(trans, NULL);
}
int __bch2_btree_node_lock_write(struct btree_trans *trans, struct btree_path *path,
				 struct btree_bkey_cached_common *b,
				 bool lock_may_not_fail)
{
	int readers = bch2_btree_node_lock_counts(trans, NULL, b, b->level).n[SIX_LOCK_read];
	int ret;

	/*
	 * Must drop our read locks before calling six_lock_write() -
	 * six_unlock() won't do wakeups until the reader count
	 * goes to 0, and it's safe because we have the node intent
	 * locked:
	 */
	six_lock_readers_add(&b->lock, -readers);
	ret = __btree_node_lock_nopath(trans, b, SIX_LOCK_write,
				       lock_may_not_fail, _RET_IP_);
	six_lock_readers_add(&b->lock, readers);

	if (ret)
		mark_btree_node_locked_noreset(path, b->level, BTREE_NODE_INTENT_LOCKED);

	return ret;
}
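/*
 * The six_lock_readers_add() calls above temporarily hide this transaction's
 * own read locks from the lock's reader count, so that waiting for readers
 * to drain while taking the write lock can't block on locks we hold
 * ourselves; the count is restored as soon as the write lock attempt is done.
 */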
void bch2_btree_node_lock_write_nofail(struct btree_trans *trans,
				       struct btree_path *path,
				       struct btree_bkey_cached_common *b)
{
	int ret = __btree_node_lock_write(trans, path, b, true);
	BUG_ON(ret);
}
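/*
 * Walk @path from path->level up towards locks_want, relocking (or, with
 * @upgrade set, upgrading to intent) each level.  On any failure the path is
 * fully unlocked and marked as needing traverse, and the failed levels get
 * error pointers so that child nodes can't be relocked before their
 * ancestors.
 */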
static inline bool btree_path_get_locks(struct btree_trans *trans,
					struct btree_path *path,
					bool upgrade,
					struct get_locks_fail *f)
{
	unsigned l = path->level;
	int fail_idx = -1;

	do {
		if (!btree_path_node(path, l))
			break;

		if (!(upgrade
		      ? bch2_btree_node_upgrade(trans, path, l)
		      : bch2_btree_node_relock(trans, path, l))) {
			fail_idx = l;

			if (f) {
				f->l	= l;
				f->b	= path->l[l].b;
			}
		}

		l++;
	} while (l < path->locks_want);

	/*
	 * When we fail to get a lock, we have to ensure that any child nodes
	 * can't be relocked so bch2_btree_path_traverse has to walk back up to
	 * the node that we failed to relock:
	 */
	if (fail_idx >= 0) {
		__bch2_btree_path_unlock(trans, path);
		btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);

		do {
			path->l[fail_idx].b = upgrade
				? ERR_PTR(-BCH_ERR_no_btree_node_upgrade)
				: ERR_PTR(-BCH_ERR_no_btree_node_relock);
			--fail_idx;
		} while (fail_idx >= 0);
	}

	if (path->uptodate == BTREE_ITER_NEED_RELOCK)
		path->uptodate = BTREE_ITER_UPTODATE;

	return path->uptodate < BTREE_ITER_NEED_RELOCK;
}
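/*
 * Fast relock path for a node this path previously had locked: either the
 * lock sequence number still matches and six_relock_type() succeeds, or
 * another path in the same transaction still holds the node locked and we
 * just bump the lock count.
 */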
bool __bch2_btree_node_relock(struct btree_trans *trans,
			      struct btree_path *path, unsigned level,
			      bool trace)
{
	struct btree *b = btree_path_node(path, level);
	int want = __btree_lock_want(path, level);

	if (six_relock_type(&b->c.lock, want, path->l[level].lock_seq) ||
	    (btree_node_lock_seq_matches(path, b, level) &&
	     btree_node_lock_increment(trans, &b->c, level, want))) {
		mark_btree_node_locked(trans, path, level, want);
		return true;
	}

	if (trace && !trans->notrace_relock_fail)
		trace_and_count(trans->c, btree_path_relock_fail, trans, _RET_IP_, path, level);
	return false;
}
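/*
 * Upgrade a read lock to an intent lock at @level, trying in turn: a direct
 * six_lock_tryupgrade() with our own readers temporarily subtracted, a
 * relock by sequence number if nothing is currently held, or piggybacking on
 * an intent lock already held via another path.
 */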
bool bch2_btree_node_upgrade(struct btree_trans *trans,
			     struct btree_path *path, unsigned level)
{
	struct btree *b = path->l[level].b;
	struct six_lock_count count = bch2_btree_node_lock_counts(trans, path, &b->c, level);

	if (!is_btree_node(path, level))
		return false;

	switch (btree_lock_want(path, level)) {
	case BTREE_NODE_UNLOCKED:
		BUG_ON(btree_node_locked(path, level));
		return true;
	case BTREE_NODE_READ_LOCKED:
		BUG_ON(btree_node_intent_locked(path, level));
		return bch2_btree_node_relock(trans, path, level);
	case BTREE_NODE_INTENT_LOCKED:
		break;
	case BTREE_NODE_WRITE_LOCKED:
		BUG();
	}

	if (btree_node_intent_locked(path, level))
		return true;

	if (btree_node_locked(path, level)) {
		bool ret;

		six_lock_readers_add(&b->c.lock, -count.n[SIX_LOCK_read]);
		ret = six_lock_tryupgrade(&b->c.lock);
		six_lock_readers_add(&b->c.lock, count.n[SIX_LOCK_read]);

		if (ret)
			goto success;
	} else {
		if (six_relock_type(&b->c.lock, SIX_LOCK_intent, path->l[level].lock_seq))
			goto success;
	}

	/*
	 * Do we already have an intent lock via another path? If so, just bump
	 * the lock count:
	 */
	if (btree_node_lock_seq_matches(path, b, level) &&
	    btree_node_lock_increment(trans, &b->c, level, BTREE_NODE_INTENT_LOCKED)) {
		btree_node_unlock(trans, path, level);
		goto success;
	}

	trace_and_count(trans->c, btree_path_upgrade_fail, trans, _RET_IP_, path, level);
	return false;
success:
	mark_btree_node_locked_noreset(path, level, BTREE_NODE_INTENT_LOCKED);
	return true;
}
/* Btree path locking: */

/*
 * Only for btree_cache.c - only relocks intent locks
 */
int bch2_btree_path_relock_intent(struct btree_trans *trans,
				  struct btree_path *path)
{
	unsigned l;

	for (l = path->level;
	     l < path->locks_want && btree_path_node(path, l);
	     l++)
		if (!bch2_btree_node_relock(trans, path, l)) {
			__bch2_btree_path_unlock(trans, path);
			btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
			trace_and_count(trans->c, trans_restart_relock_path_intent, trans, _RET_IP_, path);
			return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_path_intent);
		}

	return 0;
}
bool bch2_btree_path_relock_norestart(struct btree_trans *trans, struct btree_path *path)
{
	struct get_locks_fail f;

	bool ret = btree_path_get_locks(trans, path, false, &f);
	bch2_trans_verify_locks(trans);
	return ret;
}
int __bch2_btree_path_relock(struct btree_trans *trans,
			     struct btree_path *path, unsigned long trace_ip)
{
	if (!bch2_btree_path_relock_norestart(trans, path)) {
		trace_and_count(trans->c, trans_restart_relock_path, trans, trace_ip, path);
		return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_path);
	}

	return 0;
}
bool bch2_btree_path_upgrade_noupgrade_sibs(struct btree_trans *trans,
					    struct btree_path *path,
					    unsigned new_locks_want,
					    struct get_locks_fail *f)
{
	EBUG_ON(path->locks_want >= new_locks_want);

	path->locks_want = new_locks_want;

	bool ret = btree_path_get_locks(trans, path, true, f);
	bch2_trans_verify_locks(trans);
	return ret;
}
bool __bch2_btree_path_upgrade(struct btree_trans *trans,
			       struct btree_path *path,
			       unsigned new_locks_want,
			       struct get_locks_fail *f)
{
	bool ret = bch2_btree_path_upgrade_noupgrade_sibs(trans, path, new_locks_want, f);
	if (ret)
		goto out;

	/*
	 * XXX: this is ugly - we'd prefer to not be mucking with other
	 * iterators in the btree_trans here.
	 *
	 * On failure to upgrade the iterator, setting iter->locks_want and
	 * calling get_locks() is sufficient to make bch2_btree_path_traverse()
	 * get the locks we want on transaction restart.
	 *
	 * But if this iterator was a clone, on transaction restart what we did
	 * to this iterator isn't going to be preserved.
	 *
	 * Possibly we could add an iterator field for the parent iterator when
	 * an iterator is a copy - for now, we'll just upgrade any other
	 * iterators with the same btree id.
	 *
	 * The code below used to be needed to ensure ancestor nodes get locked
	 * before interior nodes - now that's handled by
	 * bch2_btree_path_traverse_all().
	 */
	if (!path->cached && !trans->in_traverse_all) {
		struct btree_path *linked;
		unsigned i;

		trans_for_each_path(trans, linked, i)
			if (linked != path &&
			    linked->cached == path->cached &&
			    linked->btree_id == path->btree_id &&
			    linked->locks_want < new_locks_want) {
				linked->locks_want = new_locks_want;
				btree_path_get_locks(trans, linked, true, NULL);
			}
	}
out:
	bch2_trans_verify_locks(trans);
	return ret;
}
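/*
 * Drop locks above the new locks_want: levels above path->level are unlocked
 * outright, and an intent lock held at path->level itself is downgraded to a
 * read lock.
 */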
void __bch2_btree_path_downgrade(struct btree_trans *trans,
				 struct btree_path *path,
				 unsigned new_locks_want)
{
	unsigned l, old_locks_want = path->locks_want;

	if (trans->restarted)
		return;

	EBUG_ON(path->locks_want < new_locks_want);

	path->locks_want = new_locks_want;

	while (path->nodes_locked &&
	       (l = btree_path_highest_level_locked(path)) >= path->locks_want) {
		if (l > path->level) {
			btree_node_unlock(trans, path, l);
		} else {
			if (btree_node_intent_locked(path, l)) {
				six_lock_downgrade(&path->l[l].b->c.lock);
				mark_btree_node_locked_noreset(path, l, BTREE_NODE_READ_LOCKED);
			}
			break;
		}
	}

	bch2_btree_path_verify_locks(path);

	trace_path_downgrade(trans, _RET_IP_, path, old_locks_want);
}
/* Btree transaction locking: */

void bch2_trans_downgrade(struct btree_trans *trans)
{
	struct btree_path *path;
	unsigned i;

	if (trans->restarted)
		return;

	trans_for_each_path(trans, path, i)
		if (path->ref)
			bch2_btree_path_downgrade(trans, path);
}
static inline void __bch2_trans_unlock(struct btree_trans *trans)
{
	struct btree_path *path;
	unsigned i;

	trans_for_each_path(trans, path, i)
		__bch2_btree_path_unlock(trans, path);
}
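/*
 * Relocking after the transaction was unlocked: every path marked
 * should_be_locked must get all of its node locks back at the same sequence
 * numbers; otherwise we count/trace trans_restart_relock, drop whatever we
 * did manage to lock, and return a transaction restart.
 */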
static noinline __cold int bch2_trans_relock_fail(struct btree_trans *trans, struct btree_path *path,
						  struct get_locks_fail *f, bool trace)
{
	if (!trace)
		goto out;

	if (trace_trans_restart_relock_enabled()) {
		struct printbuf buf = PRINTBUF;

		bch2_bpos_to_text(&buf, path->pos);
		prt_printf(&buf, " l=%u seq=%u node seq=", f->l, path->l[f->l].lock_seq);
		if (IS_ERR_OR_NULL(f->b)) {
			prt_str(&buf, bch2_err_str(PTR_ERR(f->b)));
		} else {
			prt_printf(&buf, "%u", f->b->c.lock.seq);

			struct six_lock_count c =
				bch2_btree_node_lock_counts(trans, NULL, &f->b->c, f->l);
			prt_printf(&buf, " self locked %u.%u.%u", c.n[0], c.n[1], c.n[2]);

			c = six_lock_counts(&f->b->c.lock);
			prt_printf(&buf, " total locked %u.%u.%u", c.n[0], c.n[1], c.n[2]);
		}

		trace_trans_restart_relock(trans, _RET_IP_, buf.buf);
		printbuf_exit(&buf);
	}

	count_event(trans->c, trans_restart_relock);
out:
	__bch2_trans_unlock(trans);
	bch2_trans_verify_locks(trans);
	return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock);
}
static inline int __bch2_trans_relock(struct btree_trans *trans, bool trace)
{
	bch2_trans_verify_locks(trans);

	if (unlikely(trans->restarted))
		return -((int) trans->restarted);
	if (unlikely(trans->locked))
		goto out;

	struct btree_path *path;
	unsigned i;

	trans_for_each_path(trans, path, i) {
		struct get_locks_fail f;

		if (path->should_be_locked &&
		    !btree_path_get_locks(trans, path, false, &f))
			return bch2_trans_relock_fail(trans, path, &f, trace);
	}

	trans_set_locked(trans);
out:
	bch2_trans_verify_locks(trans);
	return 0;
}
int bch2_trans_relock(struct btree_trans *trans)
{
	return __bch2_trans_relock(trans, true);
}

int bch2_trans_relock_notrace(struct btree_trans *trans)
{
	return __bch2_trans_relock(trans, false);
}

void bch2_trans_unlock_noassert(struct btree_trans *trans)
{
	__bch2_trans_unlock(trans);

	trans_set_unlocked(trans);
}

void bch2_trans_unlock(struct btree_trans *trans)
{
	__bch2_trans_unlock(trans);

	trans_set_unlocked(trans);
}

void bch2_trans_unlock_long(struct btree_trans *trans)
{
	bch2_trans_unlock(trans);
	bch2_trans_srcu_unlock(trans);
}
int __bch2_trans_mutex_lock(struct btree_trans *trans,
			    struct mutex *lock)
{
	int ret = drop_locks_do(trans, (mutex_lock(lock), 0));

	if (ret)
		mutex_unlock(lock);
	return ret;
}
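/*
 * drop_locks_do() releases the transaction's btree locks around the blocking
 * mutex_lock() and then relocks; if the relock fails, the mutex is dropped
 * again so the caller only ever sees it held on success.  A (hypothetical)
 * caller looks like:
 *
 *	ret = __bch2_trans_mutex_lock(trans, &c->example_mutex);
 *	if (ret)
 *		goto err;	// transaction restart; mutex not held
 */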
/* Debug */

#ifdef CONFIG_BCACHEFS_DEBUG

void bch2_btree_path_verify_locks(struct btree_path *path)
{
	/*
	 * A path may be uptodate and yet have nothing locked if and only if
	 * there is no node at path->level, which generally means we were
	 * iterating over all nodes and got to the end of the btree
	 */
	BUG_ON(path->uptodate == BTREE_ITER_UPTODATE &&
	       btree_path_node(path, path->level) &&
	       !path->nodes_locked);

	if (!path->nodes_locked)
		return;

	for (unsigned l = 0; l < BTREE_MAX_DEPTH; l++) {
		int want = btree_lock_want(path, l);
		int have = btree_node_locked_type(path, l);

		BUG_ON(!is_btree_node(path, l) && have != BTREE_NODE_UNLOCKED);

		BUG_ON(is_btree_node(path, l) &&
		       (want == BTREE_NODE_UNLOCKED ||
			have != BTREE_NODE_WRITE_LOCKED) &&
		       want != have);
	}
}
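/* Only used to assert that an "unlocked" transaction holds no node locks: */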
static bool bch2_trans_locked(struct btree_trans *trans)
{
	struct btree_path *path;
	unsigned i;

	trans_for_each_path(trans, path, i)
		if (path->nodes_locked)
			return true;
	return false;
}
void bch2_trans_verify_locks(struct btree_trans *trans)
{
	if (!trans->locked) {
		BUG_ON(bch2_trans_locked(trans));
		return;
	}

	struct btree_path *path;
	unsigned i;

	trans_for_each_path(trans, path, i)
		bch2_btree_path_verify_locks(path);
}

#endif