// SPDX-License-Identifier: GPL-2.0-or-later
/* -*- mode: c; c-basic-offset: 8; -*-
 * vim: noexpandtab sw=8 ts=8 sts=0:
 *
 * standalone DLM module
 *
 * Copyright (C) 2004 Oracle.  All rights reserved.
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/highmem.h>
#include <linux/init.h>
#include <linux/sysctl.h>
#include <linux/random.h>
#include <linux/blkdev.h>
#include <linux/socket.h>
#include <linux/inet.h>
#include <linux/spinlock.h>
#include <linux/delay.h>

#include "cluster/heartbeat.h"
#include "cluster/nodemanager.h"
#include "cluster/tcp.h"

#include "dlmcommon.h"
#include "dlmdomain.h"

#define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_MASTER)
#include "cluster/masklog.h"
static void dlm_mle_node_down(struct dlm_ctxt *dlm,
			      struct dlm_master_list_entry *mle,
			      struct o2nm_node *node, int idx);
static void dlm_mle_node_up(struct dlm_ctxt *dlm,
			    struct dlm_master_list_entry *mle,
			    struct o2nm_node *node, int idx);

static void dlm_assert_master_worker(struct dlm_work_item *item, void *data);
static int dlm_do_assert_master(struct dlm_ctxt *dlm,
				struct dlm_lock_resource *res,
				void *nodemap, u32 flags);
static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data);
static inline int dlm_mle_equal(struct dlm_ctxt *dlm,
				struct dlm_master_list_entry *mle,
				const char *name,
				unsigned int namelen)
{
	if (dlm != mle->dlm)
		return 0;

	if (namelen != mle->mnamelen ||
	    memcmp(name, mle->mname, namelen) != 0)
		return 0;

	return 1;
}
static struct kmem_cache *dlm_lockres_cache;
static struct kmem_cache *dlm_lockname_cache;
static struct kmem_cache *dlm_mle_cache;

static void dlm_mle_release(struct kref *kref);
static void dlm_init_mle(struct dlm_master_list_entry *mle,
			enum dlm_mle_type type,
			struct dlm_ctxt *dlm,
			struct dlm_lock_resource *res,
			const char *name,
			unsigned int namelen);
static void dlm_put_mle(struct dlm_master_list_entry *mle);
static void __dlm_put_mle(struct dlm_master_list_entry *mle);
static int dlm_find_mle(struct dlm_ctxt *dlm,
			struct dlm_master_list_entry **mle,
			char *name, unsigned int namelen);

static int dlm_do_master_request(struct dlm_lock_resource *res,
				 struct dlm_master_list_entry *mle, int to);

static int dlm_wait_for_lock_mastery(struct dlm_ctxt *dlm,
				     struct dlm_lock_resource *res,
				     struct dlm_master_list_entry *mle,
				     int *blocked);
static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm,
				    struct dlm_lock_resource *res,
				    struct dlm_master_list_entry *mle,
				    int blocked);
static int dlm_add_migration_mle(struct dlm_ctxt *dlm,
				 struct dlm_lock_resource *res,
				 struct dlm_master_list_entry *mle,
				 struct dlm_master_list_entry **oldmle,
				 const char *name, unsigned int namelen,
				 u8 new_master, u8 master);

static u8 dlm_pick_migration_target(struct dlm_ctxt *dlm,
				    struct dlm_lock_resource *res);
static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm,
				      struct dlm_lock_resource *res);
static int dlm_mark_lockres_migrating(struct dlm_ctxt *dlm,
				      struct dlm_lock_resource *res,
				      u8 target);
static int dlm_pre_master_reco_lockres(struct dlm_ctxt *dlm,
				       struct dlm_lock_resource *res);
int dlm_is_host_down(int errno)
{
	switch (errno) {
		case -EBADF:
		case -ECONNREFUSED:
		case -ENOTCONN:
		case -ECONNRESET:
		case -EPIPE:
		case -EHOSTDOWN:
		case -EHOSTUNREACH:
		case -ETIMEDOUT:
		case -ECONNABORTED:
		case -ENETDOWN:
		case -ENETUNREACH:
		case -ENETRESET:
		case -ESHUTDOWN:
		case -ENOPROTOOPT:
		case -EINVAL:	/* if returned from our tcp code,
				   this means there is no socket */
			return 1;
	}
	return 0;
}
/*
 * MASTER LIST FUNCTIONS
 */


/*
 * regarding master list entries and heartbeat callbacks:
 *
 * in order to avoid sleeping and allocation that occurs in
 * heartbeat, master list entries are simply attached to the
 * dlm's established heartbeat callbacks.  the mle is attached
 * when it is created, and since the dlm->spinlock is held at
 * that time, any heartbeat event will be properly discovered
 * by the mle.  the mle needs to be detached from the
 * dlm->mle_hb_events list as soon as heartbeat events are no
 * longer useful to the mle, and before the mle is freed.
 *
 * as a general rule, heartbeat events are no longer needed by
 * the mle once an "answer" regarding the lock master has been
 * received.  (an illustrative sketch of this attach/detach
 * lifecycle follows the detach helpers below.)
 */
static inline void __dlm_mle_attach_hb_events(struct dlm_ctxt *dlm,
					      struct dlm_master_list_entry *mle)
{
	assert_spin_locked(&dlm->spinlock);

	list_add_tail(&mle->hb_events, &dlm->mle_hb_events);
}


static inline void __dlm_mle_detach_hb_events(struct dlm_ctxt *dlm,
					      struct dlm_master_list_entry *mle)
{
	if (!list_empty(&mle->hb_events))
		list_del_init(&mle->hb_events);
}


static inline void dlm_mle_detach_hb_events(struct dlm_ctxt *dlm,
					    struct dlm_master_list_entry *mle)
{
	spin_lock(&dlm->spinlock);
	__dlm_mle_detach_hb_events(dlm, mle);
	spin_unlock(&dlm->spinlock);
}
static void dlm_get_mle_inuse(struct dlm_master_list_entry *mle)
{
	struct dlm_ctxt *dlm;
	dlm = mle->dlm;

	assert_spin_locked(&dlm->spinlock);
	assert_spin_locked(&dlm->master_lock);
	mle->inuse++;
	kref_get(&mle->mle_refs);
}

static void dlm_put_mle_inuse(struct dlm_master_list_entry *mle)
{
	struct dlm_ctxt *dlm;
	dlm = mle->dlm;

	spin_lock(&dlm->spinlock);
	spin_lock(&dlm->master_lock);
	mle->inuse--;
	__dlm_put_mle(mle);
	spin_unlock(&dlm->master_lock);
	spin_unlock(&dlm->spinlock);
}

/* remove from list and free */
static void __dlm_put_mle(struct dlm_master_list_entry *mle)
{
	struct dlm_ctxt *dlm;
	dlm = mle->dlm;

	assert_spin_locked(&dlm->spinlock);
	assert_spin_locked(&dlm->master_lock);
	if (!kref_read(&mle->mle_refs)) {
		/* this may or may not crash, but who cares.
		 * it's a BUG. */
		mlog(ML_ERROR, "bad mle: %p\n", mle);
		dlm_print_one_mle(mle);
		BUG();
	} else
		kref_put(&mle->mle_refs, dlm_mle_release);
}


/* must not have any spinlocks coming in */
static void dlm_put_mle(struct dlm_master_list_entry *mle)
{
	struct dlm_ctxt *dlm;
	dlm = mle->dlm;

	spin_lock(&dlm->spinlock);
	spin_lock(&dlm->master_lock);
	__dlm_put_mle(mle);
	spin_unlock(&dlm->master_lock);
	spin_unlock(&dlm->spinlock);
}

static inline void dlm_get_mle(struct dlm_master_list_entry *mle)
{
	kref_get(&mle->mle_refs);
}
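
/*
 * Minimal sketch of the kref pattern the mle helpers above rely on: every
 * lookup takes a reference with kref_get(), every put drops one with
 * kref_put(), and the release callback runs exactly once when the last
 * reference goes away.  The struct and function names below are invented
 * for the example; only kref_init/kref_get/kref_put/container_of are the
 * real kernel API.
 */
#if 0	/* example only -- not compiled into the module */
struct example_obj {
	struct kref refs;
};

static void example_release(struct kref *kref)
{
	struct example_obj *obj = container_of(kref, struct example_obj, refs);

	kfree(obj);				/* runs once, on the final put */
}

static void example_usage(void)
{
	struct example_obj *obj = kzalloc(sizeof(*obj), GFP_NOFS);

	if (!obj)
		return;
	kref_init(&obj->refs);			/* refcount = 1 */
	kref_get(&obj->refs);			/* a second user appears */
	kref_put(&obj->refs, example_release);	/* back down to 1 */
	kref_put(&obj->refs, example_release);	/* 0 -> example_release() */
}
#endif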
static void dlm_init_mle(struct dlm_master_list_entry *mle,
			enum dlm_mle_type type,
			struct dlm_ctxt *dlm,
			struct dlm_lock_resource *res,
			const char *name,
			unsigned int namelen)
{
	assert_spin_locked(&dlm->spinlock);

	mle->dlm = dlm;
	mle->type = type;
	INIT_HLIST_NODE(&mle->master_hash_node);
	INIT_LIST_HEAD(&mle->hb_events);
	memset(mle->maybe_map, 0, sizeof(mle->maybe_map));
	spin_lock_init(&mle->spinlock);
	init_waitqueue_head(&mle->wq);
	atomic_set(&mle->woken, 0);
	kref_init(&mle->mle_refs);
	memset(mle->response_map, 0, sizeof(mle->response_map));
	mle->master = O2NM_MAX_NODES;
	mle->new_master = O2NM_MAX_NODES;
	mle->inuse = 0;

	BUG_ON(mle->type != DLM_MLE_BLOCK &&
	       mle->type != DLM_MLE_MASTER &&
	       mle->type != DLM_MLE_MIGRATION);

	if (mle->type == DLM_MLE_MASTER) {
		BUG_ON(!res);
		memcpy(mle->mname, res->lockname.name, res->lockname.len);
		mle->mnamelen = res->lockname.len;
		mle->mnamehash = res->lockname.hash;
	} else {
		BUG_ON(!name);
		memcpy(mle->mname, name, namelen);
		mle->mnamelen = namelen;
		mle->mnamehash = dlm_lockid_hash(name, namelen);
	}

	atomic_inc(&dlm->mle_tot_count[mle->type]);
	atomic_inc(&dlm->mle_cur_count[mle->type]);

	/* copy off the node_map and register hb callbacks on our copy */
	memcpy(mle->node_map, dlm->domain_map, sizeof(mle->node_map));
	memcpy(mle->vote_map, dlm->domain_map, sizeof(mle->vote_map));
	clear_bit(dlm->node_num, mle->vote_map);
	clear_bit(dlm->node_num, mle->node_map);

	/* attach the mle to the domain node up/down events */
	__dlm_mle_attach_hb_events(dlm, mle);
}
void __dlm_unlink_mle(struct dlm_ctxt *dlm, struct dlm_master_list_entry *mle)
{
	assert_spin_locked(&dlm->spinlock);
	assert_spin_locked(&dlm->master_lock);

	if (!hlist_unhashed(&mle->master_hash_node))
		hlist_del_init(&mle->master_hash_node);
}

void __dlm_insert_mle(struct dlm_ctxt *dlm, struct dlm_master_list_entry *mle)
{
	struct hlist_head *bucket;

	assert_spin_locked(&dlm->master_lock);

	bucket = dlm_master_hash(dlm, mle->mnamehash);
	hlist_add_head(&mle->master_hash_node, bucket);
}
/* returns 1 if found, 0 if not */
static int dlm_find_mle(struct dlm_ctxt *dlm,
			struct dlm_master_list_entry **mle,
			char *name, unsigned int namelen)
{
	struct dlm_master_list_entry *tmpmle;
	struct hlist_head *bucket;
	unsigned int hash;

	assert_spin_locked(&dlm->master_lock);

	hash = dlm_lockid_hash(name, namelen);
	bucket = dlm_master_hash(dlm, hash);
	hlist_for_each_entry(tmpmle, bucket, master_hash_node) {
		if (!dlm_mle_equal(dlm, tmpmle, name, namelen))
			continue;
		dlm_get_mle(tmpmle);
		*mle = tmpmle;
		return 1;
	}
	return 0;
}
void dlm_hb_event_notify_attached(struct dlm_ctxt *dlm, int idx, int node_up)
{
	struct dlm_master_list_entry *mle;

	assert_spin_locked(&dlm->spinlock);

	list_for_each_entry(mle, &dlm->mle_hb_events, hb_events) {
		if (node_up)
			dlm_mle_node_up(dlm, mle, NULL, idx);
		else
			dlm_mle_node_down(dlm, mle, NULL, idx);
	}
}
static void dlm_mle_node_down(struct dlm_ctxt *dlm,
			      struct dlm_master_list_entry *mle,
			      struct o2nm_node *node, int idx)
{
	spin_lock(&mle->spinlock);

	if (!test_bit(idx, mle->node_map))
		mlog(0, "node %u already removed from nodemap!\n", idx);
	else
		clear_bit(idx, mle->node_map);

	spin_unlock(&mle->spinlock);
}

static void dlm_mle_node_up(struct dlm_ctxt *dlm,
			    struct dlm_master_list_entry *mle,
			    struct o2nm_node *node, int idx)
{
	spin_lock(&mle->spinlock);

	if (test_bit(idx, mle->node_map))
		mlog(0, "node %u already in node map!\n", idx);
	else
		set_bit(idx, mle->node_map);

	spin_unlock(&mle->spinlock);
}
int dlm_init_mle_cache(void)
{
	dlm_mle_cache = kmem_cache_create("o2dlm_mle",
					  sizeof(struct dlm_master_list_entry),
					  0, SLAB_HWCACHE_ALIGN,
					  NULL);
	if (dlm_mle_cache == NULL)
		return -ENOMEM;
	return 0;
}

void dlm_destroy_mle_cache(void)
{
	kmem_cache_destroy(dlm_mle_cache);
}
static void dlm_mle_release(struct kref *kref)
{
	struct dlm_master_list_entry *mle;
	struct dlm_ctxt *dlm;

	mle = container_of(kref, struct dlm_master_list_entry, mle_refs);
	dlm = mle->dlm;

	assert_spin_locked(&dlm->spinlock);
	assert_spin_locked(&dlm->master_lock);

	mlog(0, "Releasing mle for %.*s, type %d\n", mle->mnamelen, mle->mname,
	     mle->type);

	/* remove from list if not already */
	__dlm_unlink_mle(dlm, mle);

	/* detach the mle from the domain node up/down events */
	__dlm_mle_detach_hb_events(dlm, mle);

	atomic_dec(&dlm->mle_cur_count[mle->type]);

	/* NOTE: kfree under spinlock here.
	 * if this is bad, we can move this to a freelist. */
	kmem_cache_free(dlm_mle_cache, mle);
}
/*
 * LOCK RESOURCE FUNCTIONS
 */

int dlm_init_master_caches(void)
{
	dlm_lockres_cache = kmem_cache_create("o2dlm_lockres",
					      sizeof(struct dlm_lock_resource),
					      0, SLAB_HWCACHE_ALIGN, NULL);
	if (!dlm_lockres_cache)
		goto bail;

	dlm_lockname_cache = kmem_cache_create("o2dlm_lockname",
					       DLM_LOCKID_NAME_MAX, 0,
					       SLAB_HWCACHE_ALIGN, NULL);
	if (!dlm_lockname_cache)
		goto bail;

	return 0;
bail:
	dlm_destroy_master_caches();
	return -ENOMEM;
}

void dlm_destroy_master_caches(void)
{
	kmem_cache_destroy(dlm_lockname_cache);
	dlm_lockname_cache = NULL;

	kmem_cache_destroy(dlm_lockres_cache);
	dlm_lockres_cache = NULL;
}
static void dlm_lockres_release(struct kref *kref)
{
	struct dlm_lock_resource *res;
	struct dlm_ctxt *dlm;

	res = container_of(kref, struct dlm_lock_resource, refs);
	dlm = res->dlm;

	/* This should not happen -- all lockres' have a name
	 * associated with them at init time. */
	BUG_ON(!res->lockname.name);

	mlog(0, "destroying lockres %.*s\n", res->lockname.len,
	     res->lockname.name);

	atomic_dec(&dlm->res_cur_count);

	if (!hlist_unhashed(&res->hash_node) ||
	    !list_empty(&res->granted) ||
	    !list_empty(&res->converting) ||
	    !list_empty(&res->blocked) ||
	    !list_empty(&res->dirty) ||
	    !list_empty(&res->recovering) ||
	    !list_empty(&res->purge)) {
		mlog(ML_ERROR,
		     "Going to BUG for resource %.*s."
		     " We're on a list! [%c%c%c%c%c%c%c]\n",
		     res->lockname.len, res->lockname.name,
		     !hlist_unhashed(&res->hash_node) ? 'H' : ' ',
		     !list_empty(&res->granted) ? 'G' : ' ',
		     !list_empty(&res->converting) ? 'C' : ' ',
		     !list_empty(&res->blocked) ? 'B' : ' ',
		     !list_empty(&res->dirty) ? 'D' : ' ',
		     !list_empty(&res->recovering) ? 'R' : ' ',
		     !list_empty(&res->purge) ? 'P' : ' ');

		dlm_print_one_lock_resource(res);
	}

	/* By the time we're ready to blow this guy away, we shouldn't
	 * be on any lists. */
	BUG_ON(!hlist_unhashed(&res->hash_node));
	BUG_ON(!list_empty(&res->granted));
	BUG_ON(!list_empty(&res->converting));
	BUG_ON(!list_empty(&res->blocked));
	BUG_ON(!list_empty(&res->dirty));
	BUG_ON(!list_empty(&res->recovering));
	BUG_ON(!list_empty(&res->purge));

	kmem_cache_free(dlm_lockname_cache, (void *)res->lockname.name);

	kmem_cache_free(dlm_lockres_cache, res);
}

void dlm_lockres_put(struct dlm_lock_resource *res)
{
	kref_put(&res->refs, dlm_lockres_release);
}
static void dlm_init_lockres(struct dlm_ctxt *dlm,
			     struct dlm_lock_resource *res,
			     const char *name, unsigned int namelen)
{
	char *qname;

	/* If we memset here, we lose our reference to the kmalloc'd
	 * res->lockname.name, so be sure to init every field
	 * correctly! */

	qname = (char *) res->lockname.name;
	memcpy(qname, name, namelen);

	res->lockname.len = namelen;
	res->lockname.hash = dlm_lockid_hash(name, namelen);

	init_waitqueue_head(&res->wq);
	spin_lock_init(&res->spinlock);
	INIT_HLIST_NODE(&res->hash_node);
	INIT_LIST_HEAD(&res->granted);
	INIT_LIST_HEAD(&res->converting);
	INIT_LIST_HEAD(&res->blocked);
	INIT_LIST_HEAD(&res->dirty);
	INIT_LIST_HEAD(&res->recovering);
	INIT_LIST_HEAD(&res->purge);
	INIT_LIST_HEAD(&res->tracking);
	atomic_set(&res->asts_reserved, 0);
	res->migration_pending = 0;
	res->inflight_locks = 0;
	res->inflight_assert_workers = 0;

	res->dlm = dlm;

	kref_init(&res->refs);

	atomic_inc(&dlm->res_tot_count);
	atomic_inc(&dlm->res_cur_count);

	/* just for consistency */
	spin_lock(&res->spinlock);
	dlm_set_lockres_owner(dlm, res, DLM_LOCK_RES_OWNER_UNKNOWN);
	spin_unlock(&res->spinlock);

	res->state = DLM_LOCK_RES_IN_PROGRESS;

	res->last_used = 0;

	spin_lock(&dlm->track_lock);
	list_add_tail(&res->tracking, &dlm->tracking_list);
	spin_unlock(&dlm->track_lock);

	memset(res->lvb, 0, DLM_LVB_LEN);
	memset(res->refmap, 0, sizeof(res->refmap));
}
struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm,
				   const char *name,
				   unsigned int namelen)
{
	struct dlm_lock_resource *res = NULL;

	res = kmem_cache_zalloc(dlm_lockres_cache, GFP_NOFS);
	if (!res)
		goto error;

	res->lockname.name = kmem_cache_zalloc(dlm_lockname_cache, GFP_NOFS);
	if (!res->lockname.name)
		goto error;

	dlm_init_lockres(dlm, res, name, namelen);
	return res;

error:
	if (res)
		kmem_cache_free(dlm_lockres_cache, res);
	return NULL;
}
void dlm_lockres_set_refmap_bit(struct dlm_ctxt *dlm,
				struct dlm_lock_resource *res, int bit)
{
	assert_spin_locked(&res->spinlock);

	mlog(0, "res %.*s, set node %u, %ps()\n", res->lockname.len,
	     res->lockname.name, bit, __builtin_return_address(0));

	set_bit(bit, res->refmap);
}

void dlm_lockres_clear_refmap_bit(struct dlm_ctxt *dlm,
				  struct dlm_lock_resource *res, int bit)
{
	assert_spin_locked(&res->spinlock);

	mlog(0, "res %.*s, clr node %u, %ps()\n", res->lockname.len,
	     res->lockname.name, bit, __builtin_return_address(0));

	clear_bit(bit, res->refmap);
}
static void __dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,
				struct dlm_lock_resource *res)
{
	res->inflight_locks++;

	mlog(0, "%s: res %.*s, inflight++: now %u, %ps()\n", dlm->name,
	     res->lockname.len, res->lockname.name, res->inflight_locks,
	     __builtin_return_address(0));
}

void dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,
				   struct dlm_lock_resource *res)
{
	assert_spin_locked(&res->spinlock);
	__dlm_lockres_grab_inflight_ref(dlm, res);
}

void dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm,
				   struct dlm_lock_resource *res)
{
	assert_spin_locked(&res->spinlock);

	BUG_ON(res->inflight_locks == 0);

	res->inflight_locks--;

	mlog(0, "%s: res %.*s, inflight--: now %u, %ps()\n", dlm->name,
	     res->lockname.len, res->lockname.name, res->inflight_locks,
	     __builtin_return_address(0));

	wake_up(&res->wq);
}
void __dlm_lockres_grab_inflight_worker(struct dlm_ctxt *dlm,
		struct dlm_lock_resource *res)
{
	assert_spin_locked(&res->spinlock);
	res->inflight_assert_workers++;
	mlog(0, "%s:%.*s: inflight assert worker++: now %u\n",
			dlm->name, res->lockname.len, res->lockname.name,
			res->inflight_assert_workers);
}

static void __dlm_lockres_drop_inflight_worker(struct dlm_ctxt *dlm,
		struct dlm_lock_resource *res)
{
	assert_spin_locked(&res->spinlock);
	BUG_ON(res->inflight_assert_workers == 0);
	res->inflight_assert_workers--;
	mlog(0, "%s:%.*s: inflight assert worker--: now %u\n",
			dlm->name, res->lockname.len, res->lockname.name,
			res->inflight_assert_workers);
}

static void dlm_lockres_drop_inflight_worker(struct dlm_ctxt *dlm,
		struct dlm_lock_resource *res)
{
	spin_lock(&res->spinlock);
	__dlm_lockres_drop_inflight_worker(dlm, res);
	spin_unlock(&res->spinlock);
}
/*
 * lookup a lock resource by name.
 * may already exist in the hashtable.
 * lockid is null terminated
 *
 * if not, allocate enough for the lockres and for
 * the temporary structure used in doing the mastering.
 *
 * also, do a lookup in the dlm->master_list to see
 * if another node has begun mastering the same lock.
 * if so, there should be a block entry in there
 * for this name, and we should *not* attempt to master
 * the lock here.   need to wait around for that node
 * to assert_master (or die).
 *
 * (a simplified sketch of this lookup-or-allocate flow follows the
 * function below.)
 */
struct dlm_lock_resource * dlm_get_lock_resource(struct dlm_ctxt *dlm,
					  const char *lockid,
					  int namelen,
					  int flags)
{
	struct dlm_lock_resource *tmpres=NULL, *res=NULL;
	struct dlm_master_list_entry *mle = NULL;
	struct dlm_master_list_entry *alloc_mle = NULL;
	int blocked = 0;
	int ret, nodenum;
	struct dlm_node_iter iter;
	unsigned int hash;
	int tries = 0;
	int bit, wait_on_recovery = 0;

	BUG_ON(!lockid);

	hash = dlm_lockid_hash(lockid, namelen);

	mlog(0, "get lockres %s (len %d)\n", lockid, namelen);

lookup:
	spin_lock(&dlm->spinlock);
	tmpres = __dlm_lookup_lockres_full(dlm, lockid, namelen, hash);
	if (tmpres) {
		spin_unlock(&dlm->spinlock);
		spin_lock(&tmpres->spinlock);

		/*
		 * Right after dlm spinlock was released, dlm_thread could have
		 * purged the lockres. Check if lockres got unhashed. If so
		 * start over.
		 */
		if (hlist_unhashed(&tmpres->hash_node)) {
			spin_unlock(&tmpres->spinlock);
			dlm_lockres_put(tmpres);
			tmpres = NULL;
			goto lookup;
		}

		/* Wait on the thread that is mastering the resource */
		if (tmpres->owner == DLM_LOCK_RES_OWNER_UNKNOWN) {
			__dlm_wait_on_lockres(tmpres);
			BUG_ON(tmpres->owner == DLM_LOCK_RES_OWNER_UNKNOWN);
			spin_unlock(&tmpres->spinlock);
			dlm_lockres_put(tmpres);
			tmpres = NULL;
			goto lookup;
		}

		/* Wait on the resource purge to complete before continuing */
		if (tmpres->state & DLM_LOCK_RES_DROPPING_REF) {
			BUG_ON(tmpres->owner == dlm->node_num);
			__dlm_wait_on_lockres_flags(tmpres,
						    DLM_LOCK_RES_DROPPING_REF);
			spin_unlock(&tmpres->spinlock);
			dlm_lockres_put(tmpres);
			tmpres = NULL;
			goto lookup;
		}

		/* Grab inflight ref to pin the resource */
		dlm_lockres_grab_inflight_ref(dlm, tmpres);

		spin_unlock(&tmpres->spinlock);
		if (res) {
			spin_lock(&dlm->track_lock);
			if (!list_empty(&res->tracking))
				list_del_init(&res->tracking);
			else
				mlog(ML_ERROR, "Resource %.*s not "
						"on the Tracking list\n",
						res->lockname.len,
						res->lockname.name);
			spin_unlock(&dlm->track_lock);
			dlm_lockres_put(res);
		}
		res = tmpres;
		goto leave;
	}

	if (!res) {
		spin_unlock(&dlm->spinlock);
		mlog(0, "allocating a new resource\n");
		/* nothing found and we need to allocate one. */
		alloc_mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
		if (!alloc_mle)
			goto leave;
		res = dlm_new_lockres(dlm, lockid, namelen);
		if (!res)
			goto leave;
		goto lookup;
	}

	mlog(0, "no lockres found, allocated our own: %p\n", res);

	if (flags & LKM_LOCAL) {
		/* caller knows it's safe to assume it's not mastered elsewhere
		 * DONE!  return right away */
		spin_lock(&res->spinlock);
		dlm_change_lockres_owner(dlm, res, dlm->node_num);
		__dlm_insert_lockres(dlm, res);
		dlm_lockres_grab_inflight_ref(dlm, res);
		spin_unlock(&res->spinlock);
		spin_unlock(&dlm->spinlock);
		/* lockres still marked IN_PROGRESS */
		goto wake_waiters;
	}

	/* check master list to see if another node has started mastering it */
	spin_lock(&dlm->master_lock);

	/* if we found a block, wait for lock to be mastered by another node */
	blocked = dlm_find_mle(dlm, &mle, (char *)lockid, namelen);
	if (blocked) {
		int mig;
		if (mle->type == DLM_MLE_MASTER) {
			mlog(ML_ERROR, "master entry for nonexistent lock!\n");
			BUG();
		}
		mig = (mle->type == DLM_MLE_MIGRATION);
		/* if there is a migration in progress, let the migration
		 * finish before continuing.  we can wait for the absence
		 * of the MIGRATION mle: either the migrate finished or
		 * one of the nodes died and the mle was cleaned up.
		 * if there is a BLOCK here, but it already has a master
		 * set, we are too late.  the master does not have a ref
		 * for us in the refmap.  detach the mle and drop it.
		 * either way, go back to the top and start over. */
		if (mig || mle->master != O2NM_MAX_NODES) {
			BUG_ON(mig && mle->master == dlm->node_num);
			/* we arrived too late.  the master does not
			 * have a ref for us. retry. */
			mlog(0, "%s:%.*s: late on %s\n",
			     dlm->name, namelen, lockid,
			     mig ?  "MIGRATION" : "BLOCK");
			spin_unlock(&dlm->master_lock);
			spin_unlock(&dlm->spinlock);

			/* master is known, detach */
			if (!mig)
				dlm_mle_detach_hb_events(dlm, mle);
			dlm_put_mle(mle);
			mle = NULL;
			/* this is lame, but we can't wait on either
			 * the mle or lockres waitqueue here */
			if (mig)
				msleep(100);
			goto lookup;
		}
	} else {
		/* go ahead and try to master lock on this node */
		mle = alloc_mle;
		/* make sure this does not get freed below */
		alloc_mle = NULL;
		dlm_init_mle(mle, DLM_MLE_MASTER, dlm, res, NULL, 0);
		set_bit(dlm->node_num, mle->maybe_map);
		__dlm_insert_mle(dlm, mle);

		/* still holding the dlm spinlock, check the recovery map
		 * to see if there are any nodes that still need to be
		 * considered.  these will not appear in the mle nodemap
		 * but they might own this lockres.  wait on them. */
		bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0);
		if (bit < O2NM_MAX_NODES) {
			mlog(0, "%s: res %.*s, At least one node (%d) "
			     "to recover before lock mastery can begin\n",
			     dlm->name, namelen, (char *)lockid, bit);
			wait_on_recovery = 1;
		}
	}

	/* at this point there is either a DLM_MLE_BLOCK or a
	 * DLM_MLE_MASTER on the master list, so it's safe to add the
	 * lockres to the hashtable.  anyone who finds the lock will
	 * still have to wait on the IN_PROGRESS. */

	/* finally add the lockres to its hash bucket */
	__dlm_insert_lockres(dlm, res);

	/* since this lockres is new it doesn't not require the spinlock */
	__dlm_lockres_grab_inflight_ref(dlm, res);

	/* get an extra ref on the mle in case this is a BLOCK
	 * if so, the creator of the BLOCK may try to put the last
	 * ref at this time in the assert master handler, so we
	 * need an extra one to keep from a bad ptr deref. */
	dlm_get_mle_inuse(mle);
	spin_unlock(&dlm->master_lock);
	spin_unlock(&dlm->spinlock);

redo_request:
	while (wait_on_recovery) {
		/* any cluster changes that occurred after dropping the
		 * dlm spinlock would be detectable be a change on the mle,
		 * so we only need to clear out the recovery map once. */
		if (dlm_is_recovery_lock(lockid, namelen)) {
			mlog(0, "%s: Recovery map is not empty, but must "
			     "master $RECOVERY lock now\n", dlm->name);
			if (!dlm_pre_master_reco_lockres(dlm, res))
				wait_on_recovery = 0;
			else {
				mlog(0, "%s: waiting 500ms for heartbeat state "
				    "change\n", dlm->name);
				msleep(500);
			}
			continue;
		}

		dlm_kick_recovery_thread(dlm);
		msleep(1000);
		dlm_wait_for_recovery(dlm);

		spin_lock(&dlm->spinlock);
		bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0);
		if (bit < O2NM_MAX_NODES) {
			mlog(0, "%s: res %.*s, At least one node (%d) "
			     "to recover before lock mastery can begin\n",
			     dlm->name, namelen, (char *)lockid, bit);
			wait_on_recovery = 1;
		} else
			wait_on_recovery = 0;
		spin_unlock(&dlm->spinlock);

		if (wait_on_recovery)
			dlm_wait_for_node_recovery(dlm, bit, 10000);
	}

	/* must wait for lock to be mastered elsewhere */
	if (blocked)
		goto wait;

	ret = -EINVAL;
	dlm_node_iter_init(mle->vote_map, &iter);
	while ((nodenum = dlm_node_iter_next(&iter)) >= 0) {
		ret = dlm_do_master_request(res, mle, nodenum);
		if (ret < 0)
			mlog_errno(ret);
		if (mle->master != O2NM_MAX_NODES) {
			/* found a master ! */
			if (mle->master <= nodenum)
				break;
			/* if our master request has not reached the master
			 * yet, keep going until it does.  this is how the
			 * master will know that asserts are needed back to
			 * the lower nodes. */
			mlog(0, "%s: res %.*s, Requests only up to %u but "
			     "master is %u, keep going\n", dlm->name, namelen,
			     lockid, nodenum, mle->master);
		}
	}

wait:
	/* keep going until the response map includes all nodes */
	ret = dlm_wait_for_lock_mastery(dlm, res, mle, &blocked);
	if (ret < 0) {
		wait_on_recovery = 1;
		mlog(0, "%s: res %.*s, Node map changed, redo the master "
		     "request now, blocked=%d\n", dlm->name, res->lockname.len,
		     res->lockname.name, blocked);
		if (++tries > 20) {
			mlog(ML_ERROR, "%s: res %.*s, Spinning on "
			     "dlm_wait_for_lock_mastery, blocked = %d\n",
			     dlm->name, res->lockname.len,
			     res->lockname.name, blocked);
			dlm_print_one_lock_resource(res);
			dlm_print_one_mle(mle);
			tries = 0;
		}
		goto redo_request;
	}

	mlog(0, "%s: res %.*s, Mastered by %u\n", dlm->name, res->lockname.len,
	     res->lockname.name, res->owner);
	/* make sure we never continue without this */
	BUG_ON(res->owner == O2NM_MAX_NODES);

	/* master is known, detach if not already detached */
	dlm_mle_detach_hb_events(dlm, mle);
	dlm_put_mle(mle);
	/* put the extra ref */
	dlm_put_mle_inuse(mle);

wake_waiters:
	spin_lock(&res->spinlock);
	res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
	spin_unlock(&res->spinlock);
	wake_up(&res->wq);

leave:
	/* need to free the unused mle */
	if (alloc_mle)
		kmem_cache_free(dlm_mle_cache, alloc_mle);

	return res;
}
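
/*
 * Simplified sketch of the lookup-or-allocate flow implemented above:
 * search the hash table under the lock; if nothing is found, drop the
 * lock, allocate, and retry the lookup so that a concurrent inserter can
 * win the race.  "table_lock", "table_lookup", "table_insert" and
 * "example_obj" are invented placeholders for the example; this is not
 * the dlm's real API, only the shape of the retry loop.
 */
#if 0	/* example only -- not compiled into the module */
static struct example_obj *get_or_create(const char *name)
{
	struct example_obj *obj, *alloc = NULL;

again:
	spin_lock(&table_lock);
	obj = table_lookup(name);		/* hash lookup under the lock */
	if (obj) {
		spin_unlock(&table_lock);
		kfree(alloc);			/* lost the race, discard ours */
		return obj;
	}
	if (!alloc) {
		spin_unlock(&table_lock);
		alloc = kzalloc(sizeof(*alloc), GFP_NOFS); /* never allocate under a spinlock */
		if (!alloc)
			return NULL;
		goto again;			/* re-check after the sleeping alloc */
	}
	table_insert(alloc);			/* still nothing there: insert ours */
	spin_unlock(&table_lock);
	return alloc;
}
#endif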
#define DLM_MASTERY_TIMEOUT_MS   5000

static int dlm_wait_for_lock_mastery(struct dlm_ctxt *dlm,
				     struct dlm_lock_resource *res,
				     struct dlm_master_list_entry *mle,
				     int *blocked)
{
	u8 m;
	int ret, bit;
	int map_changed, voting_done;
	int assert, sleep;

recheck:
	ret = 0;
	assert = 0;

	/* check if another node has already become the owner */
	spin_lock(&res->spinlock);
	if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) {
		mlog(0, "%s:%.*s: owner is suddenly %u\n", dlm->name,
		     res->lockname.len, res->lockname.name, res->owner);
		spin_unlock(&res->spinlock);
		/* this will cause the master to re-assert across
		 * the whole cluster, freeing up mles */
		if (res->owner != dlm->node_num) {
			ret = dlm_do_master_request(res, mle, res->owner);
			if (ret < 0) {
				/* give recovery a chance to run */
				mlog(ML_ERROR, "link to %u went down?: %d\n", res->owner, ret);
				msleep(500);
				goto recheck;
			}
		}
		ret = 0;
		goto leave;
	}
	spin_unlock(&res->spinlock);

	spin_lock(&mle->spinlock);
	m = mle->master;
	map_changed = (memcmp(mle->vote_map, mle->node_map,
			      sizeof(mle->vote_map)) != 0);
	voting_done = (memcmp(mle->vote_map, mle->response_map,
			     sizeof(mle->vote_map)) == 0);

	/* restart if we hit any errors */
	if (map_changed) {
		int b;
		mlog(0, "%s: %.*s: node map changed, restarting\n",
		     dlm->name, res->lockname.len, res->lockname.name);
		ret = dlm_restart_lock_mastery(dlm, res, mle, *blocked);
		b = (mle->type == DLM_MLE_BLOCK);
		if ((*blocked && !b) || (!*blocked && b)) {
			mlog(0, "%s:%.*s: status change: old=%d new=%d\n",
			     dlm->name, res->lockname.len, res->lockname.name,
			     *blocked, b);
			*blocked = b;
		}
		spin_unlock(&mle->spinlock);
		if (ret < 0) {
			mlog_errno(ret);
			goto leave;
		}
		mlog(0, "%s:%.*s: restart lock mastery succeeded, "
		     "rechecking now\n", dlm->name, res->lockname.len,
		     res->lockname.name);
		goto recheck;
	} else {
		if (!voting_done) {
			mlog(0, "map not changed and voting not done "
			     "for %s:%.*s\n", dlm->name, res->lockname.len,
			     res->lockname.name);
		}
	}

	if (m != O2NM_MAX_NODES) {
		/* another node has done an assert!
		 * all done! */
		sleep = 0;
	} else {
		sleep = 1;
		/* have all nodes responded? */
		if (voting_done && !*blocked) {
			bit = find_next_bit(mle->maybe_map, O2NM_MAX_NODES, 0);
			if (dlm->node_num <= bit) {
				/* my node number is lowest.
				 * now tell other nodes that I am
				 * mastering this. */
				mle->master = dlm->node_num;
				/* ref was grabbed in get_lock_resource
				 * will be dropped in dlmlock_master */
				assert = 1;
				sleep = 0;
			}
			/* if voting is done, but we have not received
			 * an assert master yet, we must sleep */
		}
	}

	spin_unlock(&mle->spinlock);

	/* sleep if we haven't finished voting yet */
	if (sleep) {
		unsigned long timeo = msecs_to_jiffies(DLM_MASTERY_TIMEOUT_MS);
		atomic_set(&mle->woken, 0);
		(void)wait_event_timeout(mle->wq,
					 (atomic_read(&mle->woken) == 1),
					 timeo);
		if (res->owner == O2NM_MAX_NODES) {
			mlog(0, "%s:%.*s: waiting again\n", dlm->name,
			     res->lockname.len, res->lockname.name);
			goto recheck;
		}
		mlog(0, "done waiting, master is %u\n", res->owner);
		ret = 0;
		goto leave;
	}

	ret = 0;   /* done */
	if (assert) {
		m = dlm->node_num;
		mlog(0, "about to master %.*s here, this=%u\n",
		     res->lockname.len, res->lockname.name, m);
		ret = dlm_do_assert_master(dlm, res, mle->vote_map, 0);
		if (ret) {
			/* This is a failure in the network path,
			 * not in the response to the assert_master
			 * (any nonzero response is a BUG on this node).
			 * Most likely a socket just got disconnected
			 * due to node death. */
			mlog_errno(ret);
		}
		/* no longer need to restart lock mastery.
		 * all living nodes have been contacted. */
		ret = 0;
	}

	/* set the lockres owner */
	spin_lock(&res->spinlock);
	/* mastery reference obtained either during
	 * assert_master_handler or in get_lock_resource */
	dlm_change_lockres_owner(dlm, res, m);
	spin_unlock(&res->spinlock);

leave:
	return ret;
}
struct dlm_bitmap_diff_iter
{
	int curnode;
	unsigned long *orig_bm;
	unsigned long *cur_bm;
	unsigned long diff_bm[BITS_TO_LONGS(O2NM_MAX_NODES)];
};

enum dlm_node_state_change
{
	NODE_DOWN = -1,
	NODE_NO_CHANGE = 0,
	NODE_UP
};

static void dlm_bitmap_diff_iter_init(struct dlm_bitmap_diff_iter *iter,
				      unsigned long *orig_bm,
				      unsigned long *cur_bm)
{
	unsigned long p1, p2;
	int i;

	iter->curnode = -1;
	iter->orig_bm = orig_bm;
	iter->cur_bm = cur_bm;

	for (i = 0; i < BITS_TO_LONGS(O2NM_MAX_NODES); i++) {
		p1 = *(iter->orig_bm + i);
		p2 = *(iter->cur_bm + i);
		iter->diff_bm[i] = (p1 & ~p2) | (p2 & ~p1);
	}
}

static int dlm_bitmap_diff_iter_next(struct dlm_bitmap_diff_iter *iter,
				     enum dlm_node_state_change *state)
{
	int bit;

	if (iter->curnode >= O2NM_MAX_NODES)
		return -ENOENT;

	bit = find_next_bit(iter->diff_bm, O2NM_MAX_NODES,
			    iter->curnode+1);
	if (bit >= O2NM_MAX_NODES) {
		iter->curnode = O2NM_MAX_NODES;
		return -ENOENT;
	}

	/* if it was there in the original then this node died */
	if (test_bit(bit, iter->orig_bm))
		*state = NODE_DOWN;
	else
		*state = NODE_UP;

	iter->curnode = bit;
	return bit;
}
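
/*
 * Standalone sketch of what the diff iterator above computes: XOR of the
 * old and new bitmaps yields the changed bits, and the old bitmap tells
 * whether each change was a node going down (bit was set) or coming up
 * (bit was clear).  Plain userspace C for illustration only; it is not
 * part of the dlm.
 */
#if 0	/* example only -- not compiled into the module */
#include <stdio.h>

static void print_changes(unsigned long orig, unsigned long cur, int nbits)
{
	unsigned long diff = orig ^ cur;	/* symmetric difference */
	int bit;

	for (bit = 0; bit < nbits; bit++) {
		if (!(diff & (1UL << bit)))
			continue;		/* no change for this node */
		if (orig & (1UL << bit))
			printf("node %d went down\n", bit);
		else
			printf("node %d came up\n", bit);
	}
}

int main(void)
{
	/* nodes 0 and 2 were up; now node 2 is gone and node 3 has joined */
	print_changes(0x5UL, 0x9UL, 8);
	return 0;
}
#endif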
static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm,
				    struct dlm_lock_resource *res,
				    struct dlm_master_list_entry *mle,
				    int blocked)
{
	struct dlm_bitmap_diff_iter bdi;
	enum dlm_node_state_change sc;
	int node;
	int ret = 0;

	mlog(0, "something happened such that the "
	     "master process may need to be restarted!\n");

	assert_spin_locked(&mle->spinlock);

	dlm_bitmap_diff_iter_init(&bdi, mle->vote_map, mle->node_map);
	node = dlm_bitmap_diff_iter_next(&bdi, &sc);
	while (node >= 0) {
		if (sc == NODE_UP) {
			/* a node came up.  clear any old vote from
			 * the response map and set it in the vote map
			 * then restart the mastery. */
			mlog(ML_NOTICE, "node %d up while restarting\n", node);

			/* redo the master request, but only for the new node */
			mlog(0, "sending request to new node\n");
			clear_bit(node, mle->response_map);
			set_bit(node, mle->vote_map);
		} else {
			mlog(ML_ERROR, "node down! %d\n", node);
			if (blocked) {
				int lowest = find_next_bit(mle->maybe_map,
						       O2NM_MAX_NODES, 0);

				/* act like it was never there */
				clear_bit(node, mle->maybe_map);

				if (node == lowest) {
					mlog(0, "expected master %u died"
					    " while this node was blocked "
					    "waiting on it!\n", node);
					lowest = find_next_bit(mle->maybe_map,
							O2NM_MAX_NODES,
							lowest+1);
					if (lowest < O2NM_MAX_NODES) {
						mlog(0, "%s:%.*s:still "
						     "blocked. waiting on %u "
						     "now\n", dlm->name,
						     res->lockname.len,
						     res->lockname.name,
						     lowest);
					} else {
						/* mle is an MLE_BLOCK, but
						 * there is now nothing left to
						 * block on.  we need to return
						 * all the way back out and try
						 * again with an MLE_MASTER.
						 * dlm_do_local_recovery_cleanup
						 * has already run, so the mle
						 * refcount is ok */
						mlog(0, "%s:%.*s: no "
						     "longer blocking. try to "
						     "master this here\n",
						     dlm->name,
						     res->lockname.len,
						     res->lockname.name);
						mle->type = DLM_MLE_MASTER;
					}
				}
			}

			/* now blank out everything, as if we had never
			 * contacted anyone */
			memset(mle->maybe_map, 0, sizeof(mle->maybe_map));
			memset(mle->response_map, 0, sizeof(mle->response_map));
			/* reset the vote_map to the current node_map */
			memcpy(mle->vote_map, mle->node_map,
			       sizeof(mle->node_map));
			/* put myself into the maybe map */
			if (mle->type != DLM_MLE_BLOCK)
				set_bit(dlm->node_num, mle->maybe_map);

			ret = -EAGAIN;
		}
		node = dlm_bitmap_diff_iter_next(&bdi, &sc);
	}
	return ret;
}
/*
 * DLM_MASTER_REQUEST_MSG
 *
 * returns: 0 on success,
 *          -errno on a network error
 *
 * on error, the caller should assume the target node is "dead"
 *
 */
static int dlm_do_master_request(struct dlm_lock_resource *res,
				 struct dlm_master_list_entry *mle, int to)
{
	struct dlm_ctxt *dlm = mle->dlm;
	struct dlm_master_request request;
	int ret, response=0, resend;

	memset(&request, 0, sizeof(request));
	request.node_idx = dlm->node_num;

	BUG_ON(mle->type == DLM_MLE_MIGRATION);

	request.namelen = (u8)mle->mnamelen;
	memcpy(request.name, mle->mname, request.namelen);

again:
	ret = o2net_send_message(DLM_MASTER_REQUEST_MSG, dlm->key, &request,
				 sizeof(request), to, &response);
	if (ret < 0)  {
		if (ret == -ESRCH) {
			/* should never happen */
			mlog(ML_ERROR, "TCP stack not ready!\n");
			BUG();
		} else if (ret == -EINVAL) {
			mlog(ML_ERROR, "bad args passed to o2net!\n");
			BUG();
		} else if (ret == -ENOMEM) {
			mlog(ML_ERROR, "out of memory while trying to send "
			     "network message!  retrying\n");
			/* this is totally crude */
			msleep(50);
			goto again;
		} else if (!dlm_is_host_down(ret)) {
			/* not a network error. bad. */
			mlog_errno(ret);
			mlog(ML_ERROR, "unhandled error!");
			BUG();
		}
		/* all other errors should be network errors,
		 * and likely indicate node death */
		mlog(ML_ERROR, "link to %d went down!\n", to);
		goto out;
	}

	ret = 0;
	resend = 0;
	spin_lock(&mle->spinlock);
	switch (response) {
		case DLM_MASTER_RESP_YES:
			set_bit(to, mle->response_map);
			mlog(0, "node %u is the master, response=YES\n", to);
			mlog(0, "%s:%.*s: master node %u now knows I have a "
			     "reference\n", dlm->name, res->lockname.len,
			     res->lockname.name, to);
			mle->master = to;
			break;
		case DLM_MASTER_RESP_NO:
			mlog(0, "node %u not master, response=NO\n", to);
			set_bit(to, mle->response_map);
			break;
		case DLM_MASTER_RESP_MAYBE:
			mlog(0, "node %u not master, response=MAYBE\n", to);
			set_bit(to, mle->response_map);
			set_bit(to, mle->maybe_map);
			break;
		case DLM_MASTER_RESP_ERROR:
			mlog(0, "node %u hit an error, resending\n", to);
			resend = 1;
			response = 0;
			break;
		default:
			mlog(ML_ERROR, "bad response! %u\n", response);
			BUG();
	}
	spin_unlock(&mle->spinlock);
	if (resend) {
		/* this is also totally crude */
		msleep(50);
		goto again;
	}

out:
	return ret;
}
/*
 * locks that can be taken here:
 * dlm->spinlock
 * res->spinlock
 * mle->spinlock
 * dlm->master_list
 *
 * if possible, TRIM THIS DOWN!!!
 */
int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data,
			       void **ret_data)
{
	u8 response = DLM_MASTER_RESP_MAYBE;
	struct dlm_ctxt *dlm = data;
	struct dlm_lock_resource *res = NULL;
	struct dlm_master_request *request = (struct dlm_master_request *) msg->buf;
	struct dlm_master_list_entry *mle = NULL, *tmpmle = NULL;
	char *name;
	unsigned int namelen, hash;
	int found, ret;
	int set_maybe;
	int dispatch_assert = 0;

	if (!dlm_grab(dlm))
		return DLM_MASTER_RESP_NO;

	if (!dlm_domain_fully_joined(dlm)) {
		response = DLM_MASTER_RESP_NO;
		goto send_response;
	}

	name = request->name;
	namelen = request->namelen;
	hash = dlm_lockid_hash(name, namelen);

	if (namelen > DLM_LOCKID_NAME_MAX) {
		response = DLM_IVBUFLEN;
		goto send_response;
	}

way_up_top:
	spin_lock(&dlm->spinlock);
	res = __dlm_lookup_lockres(dlm, name, namelen, hash);
	if (res) {
		spin_unlock(&dlm->spinlock);

		/* take care of the easy cases up front */
		spin_lock(&res->spinlock);

		/*
		 * Right after dlm spinlock was released, dlm_thread could have
		 * purged the lockres. Check if lockres got unhashed. If so
		 * start over.
		 */
		if (hlist_unhashed(&res->hash_node)) {
			spin_unlock(&res->spinlock);
			dlm_lockres_put(res);
			goto way_up_top;
		}

		if (res->state & (DLM_LOCK_RES_RECOVERING|
				  DLM_LOCK_RES_MIGRATING)) {
			spin_unlock(&res->spinlock);
			mlog(0, "returning DLM_MASTER_RESP_ERROR since res is "
			     "being recovered/migrated\n");
			response = DLM_MASTER_RESP_ERROR;
			if (mle)
				kmem_cache_free(dlm_mle_cache, mle);
			goto send_response;
		}

		if (res->owner == dlm->node_num) {
			dlm_lockres_set_refmap_bit(dlm, res, request->node_idx);
			spin_unlock(&res->spinlock);
			response = DLM_MASTER_RESP_YES;
			if (mle)
				kmem_cache_free(dlm_mle_cache, mle);

			/* this node is the owner.
			 * there is some extra work that needs to
			 * happen now.  the requesting node has
			 * caused all nodes up to this one to
			 * create mles.  this node now needs to
			 * go back and clean those up. */
			dispatch_assert = 1;
			goto send_response;
		} else if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) {
			spin_unlock(&res->spinlock);
			// mlog(0, "node %u is the master\n", res->owner);
			response = DLM_MASTER_RESP_NO;
			if (mle)
				kmem_cache_free(dlm_mle_cache, mle);
			goto send_response;
		}

		/* ok, there is no owner.  either this node is
		 * being blocked, or it is actively trying to
		 * master this lock. */
		if (!(res->state & DLM_LOCK_RES_IN_PROGRESS)) {
			mlog(ML_ERROR, "lock with no owner should be "
			     "in-progress!\n");
			BUG();
		}

		// mlog(0, "lockres is in progress...\n");
		spin_lock(&dlm->master_lock);
		found = dlm_find_mle(dlm, &tmpmle, name, namelen);
		if (!found) {
			mlog(ML_ERROR, "no mle found for this lock!\n");
			BUG();
		}
		set_maybe = 1;
		spin_lock(&tmpmle->spinlock);
		if (tmpmle->type == DLM_MLE_BLOCK) {
			// mlog(0, "this node is waiting for "
			// "lockres to be mastered\n");
			response = DLM_MASTER_RESP_NO;
		} else if (tmpmle->type == DLM_MLE_MIGRATION) {
			mlog(0, "node %u is master, but trying to migrate to "
			     "node %u.\n", tmpmle->master, tmpmle->new_master);
			if (tmpmle->master == dlm->node_num) {
				mlog(ML_ERROR, "no owner on lockres, but this "
				     "node is trying to migrate it to %u?!\n",
				     tmpmle->new_master);
				BUG();
			} else {
				/* the real master can respond on its own */
				response = DLM_MASTER_RESP_NO;
			}
		} else if (tmpmle->master != DLM_LOCK_RES_OWNER_UNKNOWN) {
			set_maybe = 0;
			if (tmpmle->master == dlm->node_num) {
				response = DLM_MASTER_RESP_YES;
				/* this node will be the owner.
				 * go back and clean the mles on any
				 * other nodes */
				dispatch_assert = 1;
				dlm_lockres_set_refmap_bit(dlm, res,
							   request->node_idx);
			} else
				response = DLM_MASTER_RESP_NO;
		} else {
			// mlog(0, "this node is attempting to "
			// "master lockres\n");
			response = DLM_MASTER_RESP_MAYBE;
		}
		if (set_maybe)
			set_bit(request->node_idx, tmpmle->maybe_map);
		spin_unlock(&tmpmle->spinlock);

		spin_unlock(&dlm->master_lock);
		spin_unlock(&res->spinlock);

		/* keep the mle attached to heartbeat events */
		dlm_put_mle(tmpmle);
		if (mle)
			kmem_cache_free(dlm_mle_cache, mle);
		goto send_response;
	}

	/*
	 * lockres doesn't exist on this node
	 * if there is an MLE_BLOCK, return NO
	 * if there is an MLE_MASTER, return MAYBE
	 * otherwise, add an MLE_BLOCK, return NO
	 */
	spin_lock(&dlm->master_lock);
	found = dlm_find_mle(dlm, &tmpmle, name, namelen);
	if (!found) {
		/* this lockid has never been seen on this node yet */
		// mlog(0, "no mle found\n");
		if (!mle) {
			spin_unlock(&dlm->master_lock);
			spin_unlock(&dlm->spinlock);

			mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
			if (!mle) {
				response = DLM_MASTER_RESP_ERROR;
				mlog_errno(-ENOMEM);
				goto send_response;
			}
			goto way_up_top;
		}

		// mlog(0, "this is second time thru, already allocated, "
		// "add the block.\n");
		dlm_init_mle(mle, DLM_MLE_BLOCK, dlm, NULL, name, namelen);
		set_bit(request->node_idx, mle->maybe_map);
		__dlm_insert_mle(dlm, mle);
		response = DLM_MASTER_RESP_NO;
	} else {
		spin_lock(&tmpmle->spinlock);
		if (tmpmle->master == dlm->node_num) {
			mlog(ML_ERROR, "no lockres, but an mle with this node as master!\n");
			BUG();
		}
		if (tmpmle->type == DLM_MLE_BLOCK)
			response = DLM_MASTER_RESP_NO;
		else if (tmpmle->type == DLM_MLE_MIGRATION) {
			mlog(0, "migration mle was found (%u->%u)\n",
			     tmpmle->master, tmpmle->new_master);
			/* real master can respond on its own */
			response = DLM_MASTER_RESP_NO;
		} else
			response = DLM_MASTER_RESP_MAYBE;
		set_bit(request->node_idx, tmpmle->maybe_map);
		spin_unlock(&tmpmle->spinlock);
	}
	spin_unlock(&dlm->master_lock);
	spin_unlock(&dlm->spinlock);

	if (found) {
		/* keep the mle attached to heartbeat events */
		dlm_put_mle(tmpmle);
	}
send_response:
	/*
	 * __dlm_lookup_lockres() grabbed a reference to this lockres.
	 * The reference is released by dlm_assert_master_worker() under
	 * the call to dlm_dispatch_assert_master().  If
	 * dlm_assert_master_worker() isn't called, we drop it here.
	 */
	if (dispatch_assert) {
		mlog(0, "%u is the owner of %.*s, cleaning everyone else\n",
			     dlm->node_num, res->lockname.len, res->lockname.name);
		spin_lock(&res->spinlock);
		ret = dlm_dispatch_assert_master(dlm, res, 0, request->node_idx,
						 DLM_ASSERT_MASTER_MLE_CLEANUP);
		if (ret < 0) {
			mlog(ML_ERROR, "failed to dispatch assert master work\n");
			response = DLM_MASTER_RESP_ERROR;
			spin_unlock(&res->spinlock);
			dlm_lockres_put(res);
		} else {
			__dlm_lockres_grab_inflight_worker(dlm, res);
			spin_unlock(&res->spinlock);
		}
	} else {
		if (res)
			dlm_lockres_put(res);
	}

	dlm_put(dlm);
	return response;
}
/*
 * DLM_ASSERT_MASTER_MSG
 */


/*
 * NOTE: this can be used for debugging
 * can periodically run all locks owned by this node
 * and re-assert across the cluster...
 */
static int dlm_do_assert_master(struct dlm_ctxt *dlm,
				struct dlm_lock_resource *res,
				void *nodemap, u32 flags)
{
	struct dlm_assert_master assert;
	int to, tmpret;
	struct dlm_node_iter iter;
	int ret = 0;
	int reassert;
	const char *lockname = res->lockname.name;
	unsigned int namelen = res->lockname.len;

	BUG_ON(namelen > O2NM_MAX_NAME_LEN);

	spin_lock(&res->spinlock);
	res->state |= DLM_LOCK_RES_SETREF_INPROG;
	spin_unlock(&res->spinlock);

again:
	reassert = 0;

	/* note that if this nodemap is empty, it returns 0 */
	dlm_node_iter_init(nodemap, &iter);
	while ((to = dlm_node_iter_next(&iter)) >= 0) {
		int r = 0;
		struct dlm_master_list_entry *mle = NULL;

		mlog(0, "sending assert master to %d (%.*s)\n", to,
		     namelen, lockname);
		memset(&assert, 0, sizeof(assert));
		assert.node_idx = dlm->node_num;
		assert.namelen = namelen;
		memcpy(assert.name, lockname, namelen);
		assert.flags = cpu_to_be32(flags);

		tmpret = o2net_send_message(DLM_ASSERT_MASTER_MSG, dlm->key,
					    &assert, sizeof(assert), to, &r);
		if (tmpret < 0) {
			mlog(ML_ERROR, "Error %d when sending message %u (key "
			     "0x%x) to node %u\n", tmpret,
			     DLM_ASSERT_MASTER_MSG, dlm->key, to);
			if (!dlm_is_host_down(tmpret)) {
				mlog(ML_ERROR, "unhandled error=%d!\n", tmpret);
				BUG();
			}
			/* a node died.  finish out the rest of the nodes. */
			mlog(0, "link to %d went down!\n", to);
			/* any nonzero status return will do */
			ret = tmpret;
			r = 0;
		} else if (r < 0) {
			/* ok, something horribly messed.  kill thyself. */
			mlog(ML_ERROR,"during assert master of %.*s to %u, "
			     "got %d.\n", namelen, lockname, to, r);
			spin_lock(&dlm->spinlock);
			spin_lock(&dlm->master_lock);
			if (dlm_find_mle(dlm, &mle, (char *)lockname,
					 namelen)) {
				dlm_print_one_mle(mle);
				__dlm_put_mle(mle);
			}
			spin_unlock(&dlm->master_lock);
			spin_unlock(&dlm->spinlock);
			BUG();
		}

		if (r & DLM_ASSERT_RESPONSE_REASSERT &&
		    !(r & DLM_ASSERT_RESPONSE_MASTERY_REF)) {
				mlog(ML_ERROR, "%.*s: very strange, "
				     "master MLE but no lockres on %u\n",
				     namelen, lockname, to);
		}

		if (r & DLM_ASSERT_RESPONSE_REASSERT) {
			mlog(0, "%.*s: node %u create mles on other "
			     "nodes and requests a re-assert\n",
			     namelen, lockname, to);
			reassert = 1;
		}
		if (r & DLM_ASSERT_RESPONSE_MASTERY_REF) {
			mlog(0, "%.*s: node %u has a reference to this "
			     "lockres, set the bit in the refmap\n",
			     namelen, lockname, to);
			spin_lock(&res->spinlock);
			dlm_lockres_set_refmap_bit(dlm, res, to);
			spin_unlock(&res->spinlock);
		}
	}

	if (reassert)
		goto again;

	spin_lock(&res->spinlock);
	res->state &= ~DLM_LOCK_RES_SETREF_INPROG;
	spin_unlock(&res->spinlock);
	wake_up(&res->wq);

	return ret;
}

/*
 * locks that can be taken here:
 * dlm->spinlock
 * res->spinlock
 * mle->spinlock
 * dlm->master_list
 *
 * if possible, TRIM THIS DOWN!!!
 */
int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data,
			      void **ret_data)
{
	struct dlm_ctxt *dlm = data;
	struct dlm_master_list_entry *mle = NULL;
	struct dlm_assert_master *assert = (struct dlm_assert_master *)msg->buf;
	struct dlm_lock_resource *res = NULL;
	char *name;
	unsigned int namelen, hash;
	u32 flags;
	int master_request = 0, have_lockres_ref = 0;
	int ret = 0;

	if (!dlm_grab(dlm))
		return 0;

	name = assert->name;
	namelen = assert->namelen;
	hash = dlm_lockid_hash(name, namelen);
	flags = be32_to_cpu(assert->flags);

	if (namelen > DLM_LOCKID_NAME_MAX) {
		mlog(ML_ERROR, "Invalid name length!");
		goto done;
	}

	spin_lock(&dlm->spinlock);

	if (flags)
		mlog(0, "assert_master with flags: %u\n", flags);

	/* find the MLE */
	spin_lock(&dlm->master_lock);
	if (!dlm_find_mle(dlm, &mle, name, namelen)) {
		/* not an error, could be master just re-asserting */
		mlog(0, "just got an assert_master from %u, but no "
		     "MLE for it! (%.*s)\n", assert->node_idx,
		     namelen, name);
	} else {
		int bit = find_next_bit (mle->maybe_map, O2NM_MAX_NODES, 0);
		if (bit >= O2NM_MAX_NODES) {
			/* not necessarily an error, though less likely.
			 * could be master just re-asserting. */
			mlog(0, "no bits set in the maybe_map, but %u "
			     "is asserting! (%.*s)\n", assert->node_idx,
			     namelen, name);
		} else if (bit != assert->node_idx) {
			if (flags & DLM_ASSERT_MASTER_MLE_CLEANUP) {
				mlog(0, "master %u was found, %u should "
				     "back off\n", assert->node_idx, bit);
			} else {
				/* with the fix for bug 569, a higher node
				 * number winning the mastery will respond
				 * YES to mastery requests, but this node
				 * had no way of knowing.  let it pass. */
				mlog(0, "%u is the lowest node, "
				     "%u is asserting. (%.*s)  %u must "
				     "have begun after %u won.\n", bit,
				     assert->node_idx, namelen, name, bit,
				     assert->node_idx);
			}
		}
		if (mle->type == DLM_MLE_MIGRATION) {
			if (flags & DLM_ASSERT_MASTER_MLE_CLEANUP) {
				mlog(0, "%s:%.*s: got cleanup assert"
				     " from %u for migration\n",
				     dlm->name, namelen, name,
				     assert->node_idx);
			} else if (!(flags & DLM_ASSERT_MASTER_FINISH_MIGRATION)) {
				mlog(0, "%s:%.*s: got unrelated assert"
				     " from %u for migration, ignoring\n",
				     dlm->name, namelen, name,
				     assert->node_idx);
				__dlm_put_mle(mle);
				spin_unlock(&dlm->master_lock);
				spin_unlock(&dlm->spinlock);
				goto done;
			}
		}
	}
	spin_unlock(&dlm->master_lock);

	/* ok everything checks out with the MLE
	 * now check to see if there is a lockres */
	res = __dlm_lookup_lockres(dlm, name, namelen, hash);
	if (res) {
		spin_lock(&res->spinlock);
		if (res->state & DLM_LOCK_RES_RECOVERING)  {
			mlog(ML_ERROR, "%u asserting but %.*s is "
			     "RECOVERING!\n", assert->node_idx, namelen, name);
			goto kill;
		}
		if (!mle) {
			if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN &&
			    res->owner != assert->node_idx) {
				mlog(ML_ERROR, "DIE! Mastery assert from %u, "
				     "but current owner is %u! (%.*s)\n",
				     assert->node_idx, res->owner, namelen,
				     name);
				__dlm_print_one_lock_resource(res);
				BUG();
			}
		} else if (mle->type != DLM_MLE_MIGRATION) {
			if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) {
				/* owner is just re-asserting */
				if (res->owner == assert->node_idx) {
					mlog(0, "owner %u re-asserting on "
					     "lock %.*s\n", assert->node_idx,
					     namelen, name);
					goto ok;
				}
				mlog(ML_ERROR, "got assert_master from "
				     "node %u, but %u is the owner! "
				     "(%.*s)\n", assert->node_idx,
				     res->owner, namelen, name);
				goto kill;
			}
			if (!(res->state & DLM_LOCK_RES_IN_PROGRESS)) {
				mlog(ML_ERROR, "got assert from %u, but lock "
				     "with no owner should be "
				     "in-progress! (%.*s)\n",
				     assert->node_idx, namelen, name);
				goto kill;
			}
		} else /* mle->type == DLM_MLE_MIGRATION */ {
			/* should only be getting an assert from new master */
			if (assert->node_idx != mle->new_master) {
				mlog(ML_ERROR, "got assert from %u, but "
				     "new master is %u, and old master "
				     "was %u (%.*s)\n",
				     assert->node_idx, mle->new_master,
				     mle->master, namelen, name);
				goto kill;
			}
		}
ok:
		spin_unlock(&res->spinlock);
	}

	// mlog(0, "woo!  got an assert_master from node %u!\n",
	//	     assert->node_idx);
	if (mle) {
		int extra_ref = 0;
		int nn = -1;
		int rr, err = 0;

		spin_lock(&mle->spinlock);
		if (mle->type == DLM_MLE_BLOCK || mle->type == DLM_MLE_MIGRATION)
			extra_ref = 1;
		else {
			/* MASTER mle: if any bits set in the response map
			 * then the calling node needs to re-assert to clear
			 * up nodes that this node contacted */
			while ((nn = find_next_bit (mle->response_map, O2NM_MAX_NODES,
						    nn+1)) < O2NM_MAX_NODES) {
				if (nn != dlm->node_num && nn != assert->node_idx) {
					master_request = 1;
					break;
				}
			}
		}
		mle->master = assert->node_idx;
		atomic_set(&mle->woken, 1);
		wake_up(&mle->wq);
		spin_unlock(&mle->spinlock);

		if (res) {
			spin_lock(&res->spinlock);
			if (mle->type == DLM_MLE_MIGRATION) {
				mlog(0, "finishing off migration of lockres %.*s, "
				     "from %u to %u\n",
				     res->lockname.len, res->lockname.name,
				     dlm->node_num, mle->new_master);
				res->state &= ~DLM_LOCK_RES_MIGRATING;
				dlm_change_lockres_owner(dlm, res, mle->new_master);
				BUG_ON(res->state & DLM_LOCK_RES_DIRTY);
			} else
				dlm_change_lockres_owner(dlm, res, mle->master);
			spin_unlock(&res->spinlock);
			have_lockres_ref = 1;
		}

		/* master is known, detach if not already detached.
		 * ensures that only one assert_master call will happen
		 * on this mle. */
		spin_lock(&dlm->master_lock);

		rr = kref_read(&mle->mle_refs);
		if (mle->inuse > 0) {
			if (extra_ref && rr < 3)
				err = 1;
			else if (!extra_ref && rr < 2)
				err = 1;
		} else {
			if (extra_ref && rr < 2)
				err = 1;
			else if (!extra_ref && rr < 1)
				err = 1;
		}
		if (err) {
			mlog(ML_ERROR, "%s:%.*s: got assert master from %u "
			     "that will mess up this node, refs=%d, extra=%d, "
			     "inuse=%d\n", dlm->name, namelen, name,
			     assert->node_idx, rr, extra_ref, mle->inuse);
			dlm_print_one_mle(mle);
		}
		__dlm_unlink_mle(dlm, mle);
		__dlm_mle_detach_hb_events(dlm, mle);
		__dlm_put_mle(mle);
		if (extra_ref) {
			/* the assert master message now balances the extra
			 * ref given by the master / migration request message.
			 * if this is the last put, it will be removed
			 * from the list. */
			__dlm_put_mle(mle);
		}
		spin_unlock(&dlm->master_lock);
	} else if (res) {
		if (res->owner != assert->node_idx) {
			mlog(0, "assert_master from %u, but current "
			     "owner is %u (%.*s), no mle\n", assert->node_idx,
			     res->owner, namelen, name);
		}
	}
	spin_unlock(&dlm->spinlock);

done:
	ret = 0;
	if (res) {
		spin_lock(&res->spinlock);
		res->state |= DLM_LOCK_RES_SETREF_INPROG;
		spin_unlock(&res->spinlock);
		*ret_data = (void *)res;
	}
	dlm_put(dlm);
	if (master_request) {
		mlog(0, "need to tell master to reassert\n");
		/* positive. negative would shoot down the node. */
		ret |= DLM_ASSERT_RESPONSE_REASSERT;
		if (!have_lockres_ref) {
			mlog(ML_ERROR, "strange, got assert from %u, MASTER "
			     "mle present here for %s:%.*s, but no lockres!\n",
			     assert->node_idx, dlm->name, namelen, name);
		}
	}
	if (have_lockres_ref) {
		/* let the master know we have a reference to the lockres */
		ret |= DLM_ASSERT_RESPONSE_MASTERY_REF;
		mlog(0, "%s:%.*s: got assert from %u, need a ref\n",
		     dlm->name, namelen, name, assert->node_idx);
	}
	return ret;

kill:
	/* kill the caller! */
	mlog(ML_ERROR, "Bad message received from another node.  Dumping state "
	     "and killing the other node now!  This node is OK and can continue.\n");
	__dlm_print_one_lock_resource(res);
	spin_unlock(&res->spinlock);
	spin_lock(&dlm->master_lock);
	if (mle)
		__dlm_put_mle(mle);
	spin_unlock(&dlm->master_lock);
	spin_unlock(&dlm->spinlock);
	*ret_data = (void *)res;
	dlm_put(dlm);
	return -EINVAL;
}
void dlm_assert_master_post_handler(int status, void *data, void *ret_data)
{
	struct dlm_lock_resource *res = (struct dlm_lock_resource *)ret_data;

	if (ret_data) {
		spin_lock(&res->spinlock);
		res->state &= ~DLM_LOCK_RES_SETREF_INPROG;
		spin_unlock(&res->spinlock);
		wake_up(&res->wq);
		dlm_lockres_put(res);
	}
	return;
}
int dlm_dispatch_assert_master(struct dlm_ctxt *dlm,
			       struct dlm_lock_resource *res,
			       int ignore_higher, u8 request_from, u32 flags)
{
	struct dlm_work_item *item;
	item = kzalloc(sizeof(*item), GFP_ATOMIC);
	if (!item)
		return -ENOMEM;


	/* queue up work for dlm_assert_master_worker */
	dlm_init_work_item(dlm, item, dlm_assert_master_worker, NULL);
	item->u.am.lockres = res; /* already have a ref */
	/* can optionally ignore node numbers higher than this node */
	item->u.am.ignore_higher = ignore_higher;
	item->u.am.request_from = request_from;
	item->u.am.flags = flags;

	if (ignore_higher)
		mlog(0, "IGNORE HIGHER: %.*s\n", res->lockname.len,
		     res->lockname.name);

	spin_lock(&dlm->work_lock);
	list_add_tail(&item->list, &dlm->work_list);
	spin_unlock(&dlm->work_lock);

	queue_work(dlm->dlm_worker, &dlm->dispatched_work);
	return 0;
}
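
/*
 * The dispatch helper above does not run the assert itself; it queues a
 * private work item and pokes the dlm's workqueue, so the network sends
 * happen later in process context.  A minimal sketch of that deferral
 * pattern with the generic workqueue API is shown below; "my_work",
 * "my_handler" and "my_dispatch" are invented names, not part of the dlm.
 */
#if 0	/* example only -- not compiled into the module */
static struct work_struct my_work;

static void my_handler(struct work_struct *work)
{
	/* runs later in process context; safe to sleep or send messages */
}

static void my_dispatch(struct workqueue_struct *wq)
{
	INIT_WORK(&my_work, my_handler);
	queue_work(wq, &my_work);	/* returns immediately */
}
#endif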
static void dlm_assert_master_worker(struct dlm_work_item *item, void *data)
{
	struct dlm_ctxt *dlm = data;
	int ret = 0;
	struct dlm_lock_resource *res;
	unsigned long nodemap[BITS_TO_LONGS(O2NM_MAX_NODES)];
	int ignore_higher;
	int bit;
	u8 request_from;
	u32 flags;

	dlm = item->dlm;
	res = item->u.am.lockres;
	ignore_higher = item->u.am.ignore_higher;
	request_from = item->u.am.request_from;
	flags = item->u.am.flags;

	spin_lock(&dlm->spinlock);
	memcpy(nodemap, dlm->domain_map, sizeof(nodemap));
	spin_unlock(&dlm->spinlock);

	clear_bit(dlm->node_num, nodemap);
	if (ignore_higher) {
		/* if is this just to clear up mles for nodes below
		 * this node, do not send the message to the original
		 * caller or any node number higher than this */
		clear_bit(request_from, nodemap);
		bit = dlm->node_num;
		while (1) {
			bit = find_next_bit(nodemap, O2NM_MAX_NODES,
					    bit+1);
			if (bit >= O2NM_MAX_NODES)
				break;
			clear_bit(bit, nodemap);
		}
	}

	/*
	 * If we're migrating this lock to someone else, we are no
	 * longer allowed to assert out own mastery.  OTOH, we need to
	 * prevent migration from starting while we're still asserting
	 * our dominance.  The reserved ast delays migration.
	 */
	spin_lock(&res->spinlock);
	if (res->state & DLM_LOCK_RES_MIGRATING) {
		mlog(0, "Someone asked us to assert mastery, but we're "
		     "in the middle of migration.  Skipping assert, "
		     "the new master will handle that.\n");
		spin_unlock(&res->spinlock);
		goto put;
	} else
		__dlm_lockres_reserve_ast(res);
	spin_unlock(&res->spinlock);

	/* this call now finishes out the nodemap
	 * even if one or more nodes die */
	mlog(0, "worker about to master %.*s here, this=%u\n",
		     res->lockname.len, res->lockname.name, dlm->node_num);
	ret = dlm_do_assert_master(dlm, res, nodemap, flags);
	if (ret < 0) {
		/* no need to restart, we are done */
		if (!dlm_is_host_down(ret))
			mlog_errno(ret);
	}

	/* Ok, we've asserted ourselves.  Let's let migration start. */
	dlm_lockres_release_ast(dlm, res);

put:
	dlm_lockres_drop_inflight_worker(dlm, res);

	dlm_lockres_put(res);

	mlog(0, "finished with dlm_assert_master_worker\n");
}
/* SPECIAL CASE for the $RECOVERY lock used by the recovery thread.
 * We cannot wait for node recovery to complete to begin mastering this
 * lockres because this lockres is used to kick off recovery! ;-)
 * So, do a pre-check on all living nodes to see if any of those nodes
 * think that $RECOVERY is currently mastered by a dead node.  If so,
 * we wait a short time to allow that node to get notified by its own
 * heartbeat stack, then check again.  All $RECOVERY lock resources
 * mastered by dead nodes are purged when the heartbeat callback is
 * fired, so we can know for sure that it is safe to continue once
 * the node returns a live node or no node.  */
static int dlm_pre_master_reco_lockres(struct dlm_ctxt *dlm,
				       struct dlm_lock_resource *res)
{
	struct dlm_node_iter iter;
	int nodenum;
	int ret = 0;
	u8 master = DLM_LOCK_RES_OWNER_UNKNOWN;

	spin_lock(&dlm->spinlock);
	dlm_node_iter_init(dlm->domain_map, &iter);
	spin_unlock(&dlm->spinlock);

	while ((nodenum = dlm_node_iter_next(&iter)) >= 0) {
		/* do not send to self */
		if (nodenum == dlm->node_num)
			continue;
		ret = dlm_do_master_requery(dlm, res, nodenum, &master);
		if (ret < 0) {
			mlog_errno(ret);
			if (!dlm_is_host_down(ret))
				BUG();
			/* host is down, so answer for that node would be
			 * DLM_LOCK_RES_OWNER_UNKNOWN.  continue. */
			ret = 0;
		}

		if (master != DLM_LOCK_RES_OWNER_UNKNOWN) {
			/* check to see if this master is in the recovery map */
			spin_lock(&dlm->spinlock);
			if (test_bit(master, dlm->recovery_map)) {
				mlog(ML_NOTICE, "%s: node %u has not seen "
				     "node %u go down yet, and thinks the "
				     "dead node is mastering the recovery "
				     "lock.  must wait.\n", dlm->name,
				     nodenum, master);
				ret = -EAGAIN;
			}
			spin_unlock(&dlm->spinlock);
			mlog(0, "%s: reco lock master is %u\n", dlm->name,
			     master);
			break;
		}
	}
	return ret;
}
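/*
 * Illustrative sketch only (compiled out): the pre-check above is meant to
 * be retried by its caller until no live node still reports a dead master
 * for $RECOVERY.  The retry-on-EAGAIN loop and the 100ms delay below are
 * assumptions for illustration, not constants defined by this file.
 */
#if 0
static void example_wait_for_reco_precheck(struct dlm_ctxt *dlm,
					   struct dlm_lock_resource *res)
{
	/* keep asking until every live node agrees the dead master is gone */
	while (dlm_pre_master_reco_lockres(dlm, res) == -EAGAIN)
		msleep(100);
}
#endif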
/*
 * DLM_DEREF_LOCKRES_MSG
 */

int dlm_drop_lockres_ref(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
{
	struct dlm_deref_lockres deref;
	int ret = 0, r;
	const char *lockname;
	unsigned int namelen;

	lockname = res->lockname.name;
	namelen = res->lockname.len;
	BUG_ON(namelen > O2NM_MAX_NAME_LEN);

	memset(&deref, 0, sizeof(deref));
	deref.node_idx = dlm->node_num;
	deref.namelen = namelen;
	memcpy(deref.name, lockname, namelen);

	ret = o2net_send_message(DLM_DEREF_LOCKRES_MSG, dlm->key,
				 &deref, sizeof(deref), res->owner, &r);
	if (ret < 0)
		mlog(ML_ERROR, "%s: res %.*s, error %d send DEREF to node %u\n",
		     dlm->name, namelen, lockname, ret, res->owner);
	else if (r < 0) {
		/* BAD.  other node says I did not have a ref. */
		mlog(ML_ERROR, "%s: res %.*s, DEREF to node %u got %d\n",
		     dlm->name, namelen, lockname, res->owner, r);
		dlm_print_one_lock_resource(res);
		if (r == -ENOMEM)
			BUG();
	} else
		ret = r;

	return ret;
}
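/*
 * Illustrative sketch only (compiled out): a purge-style caller would
 * typically mark the lockres as dropping its reference and then send the
 * DEREF message to the owner.  Setting DLM_LOCK_RES_DROPPING_REF here is
 * an assumption about the caller's duties; the real purge path lives
 * outside this function.
 */
#if 0
static void example_drop_remote_ref(struct dlm_ctxt *dlm,
				    struct dlm_lock_resource *res)
{
	int ret;

	spin_lock(&res->spinlock);
	res->state |= DLM_LOCK_RES_DROPPING_REF;	/* assumed caller duty */
	spin_unlock(&res->spinlock);

	/* tell the owner this node no longer holds a reference */
	ret = dlm_drop_lockres_ref(dlm, res);
	if (ret < 0)
		mlog_errno(ret);
}
#endif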
int dlm_deref_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
			      void **ret_data)
{
	struct dlm_ctxt *dlm = data;
	struct dlm_deref_lockres *deref = (struct dlm_deref_lockres *)msg->buf;
	struct dlm_lock_resource *res = NULL;
	char *name;
	unsigned int namelen;
	int ret = -EINVAL;
	u8 node;
	unsigned int hash;
	struct dlm_work_item *item;
	int cleared = 0;
	int dispatch = 0;

	if (!dlm_grab(dlm))
		return 0;

	name = deref->name;
	namelen = deref->namelen;
	node = deref->node_idx;

	if (namelen > DLM_LOCKID_NAME_MAX) {
		mlog(ML_ERROR, "Invalid name length!");
		goto done;
	}
	if (deref->node_idx >= O2NM_MAX_NODES) {
		mlog(ML_ERROR, "Invalid node number: %u\n", node);
		goto done;
	}

	hash = dlm_lockid_hash(name, namelen);

	spin_lock(&dlm->spinlock);
	res = __dlm_lookup_lockres_full(dlm, name, namelen, hash);
	if (!res) {
		spin_unlock(&dlm->spinlock);
		mlog(ML_ERROR, "%s:%.*s: bad lockres name\n",
		     dlm->name, namelen, name);
		goto done;
	}
	spin_unlock(&dlm->spinlock);

	spin_lock(&res->spinlock);
	if (res->state & DLM_LOCK_RES_SETREF_INPROG)
		dispatch = 1;
	else {
		BUG_ON(res->state & DLM_LOCK_RES_DROPPING_REF);
		if (test_bit(node, res->refmap)) {
			dlm_lockres_clear_refmap_bit(dlm, res, node);
			cleared = 1;
		}
	}
	spin_unlock(&res->spinlock);

	if (!dispatch) {
		if (cleared)
			dlm_lockres_calc_usage(dlm, res);
		else {
			mlog(ML_ERROR, "%s:%.*s: node %u trying to drop ref "
			     "but it is already dropped!\n", dlm->name,
			     res->lockname.len, res->lockname.name, node);
			dlm_print_one_lock_resource(res);
		}
		ret = DLM_DEREF_RESPONSE_DONE;
		goto done;
	}

	item = kzalloc(sizeof(*item), GFP_NOFS);
	if (!item) {
		ret = -ENOMEM;
		mlog_errno(ret);
		goto done;
	}

	dlm_init_work_item(dlm, item, dlm_deref_lockres_worker, NULL);
	item->u.dl.deref_res = res;
	item->u.dl.deref_node = node;

	spin_lock(&dlm->work_lock);
	list_add_tail(&item->list, &dlm->work_list);
	spin_unlock(&dlm->work_lock);

	queue_work(dlm->dlm_worker, &dlm->dispatched_work);
	return DLM_DEREF_RESPONSE_INPROG;

done:
	if (res)
		dlm_lockres_put(res);
	dlm_put(dlm);

	return ret;
}
int dlm_deref_lockres_done_handler(struct o2net_msg *msg, u32 len, void *data,
				   void **ret_data)
{
	struct dlm_ctxt *dlm = data;
	struct dlm_deref_lockres_done *deref
			= (struct dlm_deref_lockres_done *)msg->buf;
	struct dlm_lock_resource *res = NULL;
	char *name;
	unsigned int namelen;
	int ret = -EINVAL;
	u8 node;
	unsigned int hash;

	if (!dlm_grab(dlm))
		return 0;

	name = deref->name;
	namelen = deref->namelen;
	node = deref->node_idx;

	if (namelen > DLM_LOCKID_NAME_MAX) {
		mlog(ML_ERROR, "Invalid name length!");
		goto done;
	}
	if (deref->node_idx >= O2NM_MAX_NODES) {
		mlog(ML_ERROR, "Invalid node number: %u\n", node);
		goto done;
	}

	hash = dlm_lockid_hash(name, namelen);

	spin_lock(&dlm->spinlock);
	res = __dlm_lookup_lockres_full(dlm, name, namelen, hash);
	if (!res) {
		spin_unlock(&dlm->spinlock);
		mlog(ML_ERROR, "%s:%.*s: bad lockres name\n",
		     dlm->name, namelen, name);
		goto done;
	}

	spin_lock(&res->spinlock);
	if (!(res->state & DLM_LOCK_RES_DROPPING_REF)) {
		spin_unlock(&res->spinlock);
		spin_unlock(&dlm->spinlock);
		mlog(ML_NOTICE, "%s:%.*s: node %u sends deref done "
		     "but it is already derefed!\n", dlm->name,
		     res->lockname.len, res->lockname.name, node);
		ret = 0;
		goto done;
	}

	__dlm_do_purge_lockres(dlm, res);
	spin_unlock(&res->spinlock);
	wake_up(&res->wq);

	spin_unlock(&dlm->spinlock);

	ret = 0;
done:
	if (res)
		dlm_lockres_put(res);
	dlm_put(dlm);
	return ret;
}
static void dlm_drop_lockres_ref_done(struct dlm_ctxt *dlm,
				      struct dlm_lock_resource *res, u8 node)
{
	struct dlm_deref_lockres_done deref;
	int ret = 0, r;
	const char *lockname;
	unsigned int namelen;

	lockname = res->lockname.name;
	namelen = res->lockname.len;
	BUG_ON(namelen > O2NM_MAX_NAME_LEN);

	memset(&deref, 0, sizeof(deref));
	deref.node_idx = dlm->node_num;
	deref.namelen = namelen;
	memcpy(deref.name, lockname, namelen);

	ret = o2net_send_message(DLM_DEREF_LOCKRES_DONE, dlm->key,
				 &deref, sizeof(deref), node, &r);
	if (ret < 0) {
		mlog(ML_ERROR, "%s: res %.*s, error %d send DEREF DONE "
		     " to node %u\n", dlm->name, namelen,
		     lockname, ret, node);
	} else if (r < 0) {
		/* ignore the error */
		mlog(ML_ERROR, "%s: res %.*s, DEREF to node %u got %d\n",
		     dlm->name, namelen, lockname, node, r);
		dlm_print_one_lock_resource(res);
	}
}
static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data)
{
	struct dlm_ctxt *dlm;
	struct dlm_lock_resource *res;
	u8 node;
	u8 cleared = 0;

	dlm = item->dlm;
	res = item->u.dl.deref_res;
	node = item->u.dl.deref_node;

	spin_lock(&res->spinlock);
	BUG_ON(res->state & DLM_LOCK_RES_DROPPING_REF);
	__dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_SETREF_INPROG);
	if (test_bit(node, res->refmap)) {
		dlm_lockres_clear_refmap_bit(dlm, res, node);
		cleared = 1;
	}
	spin_unlock(&res->spinlock);

	dlm_drop_lockres_ref_done(dlm, res, node);

	if (cleared) {
		mlog(0, "%s:%.*s node %u ref dropped in dispatch\n",
		     dlm->name, res->lockname.len, res->lockname.name, node);
		dlm_lockres_calc_usage(dlm, res);
	} else {
		mlog(ML_ERROR, "%s:%.*s: node %u trying to drop ref "
		     "but it is already dropped!\n", dlm->name,
		     res->lockname.len, res->lockname.name, node);
		dlm_print_one_lock_resource(res);
	}

	dlm_lockres_put(res);
}
/*
 * A migratable resource is one that is :
 * 1. locally mastered, and,
 * 2. zero local locks, and,
 * 3. one or more non-local locks, or, one or more references
 * Returns 1 if yes, 0 if not.
 */
static int dlm_is_lockres_migratable(struct dlm_ctxt *dlm,
				     struct dlm_lock_resource *res)
{
	enum dlm_lockres_list idx;
	int nonlocal = 0, node_ref;
	struct list_head *queue;
	struct dlm_lock *lock;
	u64 cookie;

	assert_spin_locked(&res->spinlock);

	/* delay migration when the lockres is in MIGRATING state */
	if (res->state & DLM_LOCK_RES_MIGRATING)
		return 0;

	/* delay migration when the lockres is in RECOVERING state */
	if (res->state & (DLM_LOCK_RES_RECOVERING|
			DLM_LOCK_RES_RECOVERY_WAITING))
		return 0;

	if (res->owner != dlm->node_num)
		return 0;

	for (idx = DLM_GRANTED_LIST; idx <= DLM_BLOCKED_LIST; idx++) {
		queue = dlm_list_idx_to_ptr(res, idx);
		list_for_each_entry(lock, queue, list) {
			if (lock->ml.node != dlm->node_num) {
				nonlocal++;
				continue;
			}
			cookie = be64_to_cpu(lock->ml.cookie);
			mlog(0, "%s: Not migratable res %.*s, lock %u:%llu on "
			     "%s list\n", dlm->name, res->lockname.len,
			     res->lockname.name,
			     dlm_get_lock_cookie_node(cookie),
			     dlm_get_lock_cookie_seq(cookie),
			     dlm_list_in_text(idx));
			return 0;
		}
	}

	if (!nonlocal) {
		node_ref = find_next_bit(res->refmap, O2NM_MAX_NODES, 0);
		if (node_ref >= O2NM_MAX_NODES)
			return 0;
	}

	mlog(0, "%s: res %.*s, Migratable\n", dlm->name, res->lockname.len,
	     res->lockname.name);

	return 1;
}
/*
 * DLM_MIGRATE_LOCKRES
 */

static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
			       struct dlm_lock_resource *res, u8 target)
{
	struct dlm_master_list_entry *mle = NULL;
	struct dlm_master_list_entry *oldmle = NULL;
	struct dlm_migratable_lockres *mres = NULL;
	int ret = 0;
	const char *name;
	unsigned int namelen;
	int mle_added = 0;
	int wake = 0;

	if (!dlm_grab(dlm))
		return -EINVAL;

	BUG_ON(target == O2NM_MAX_NODES);

	name = res->lockname.name;
	namelen = res->lockname.len;

	mlog(0, "%s: Migrating %.*s to node %u\n", dlm->name, namelen, name,
	     target);

	/* preallocate up front. if this fails, abort */
	ret = -ENOMEM;
	mres = (struct dlm_migratable_lockres *) __get_free_page(GFP_NOFS);
	if (!mres) {
		mlog_errno(ret);
		goto leave;
	}

	mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
	if (!mle) {
		mlog_errno(ret);
		goto leave;
	}
	ret = 0;

	/*
	 * clear any existing master requests and
	 * add the migration mle to the list
	 */
	spin_lock(&dlm->spinlock);
	spin_lock(&dlm->master_lock);
	ret = dlm_add_migration_mle(dlm, res, mle, &oldmle, name,
				    namelen, target, dlm->node_num);
	/* get an extra reference on the mle.
	 * otherwise the assert_master from the new
	 * master will destroy this.
	 */
	if (ret != -EEXIST)
		dlm_get_mle_inuse(mle);

	spin_unlock(&dlm->master_lock);
	spin_unlock(&dlm->spinlock);

	if (ret == -EEXIST) {
		mlog(0, "another process is already migrating it\n");
		goto fail;
	}
	mle_added = 1;

	/*
	 * set the MIGRATING flag and flush asts
	 * if we fail after this we need to re-dirty the lockres
	 */
	if (dlm_mark_lockres_migrating(dlm, res, target) < 0) {
		mlog(ML_ERROR, "tried to migrate %.*s to %u, but "
		     "the target went down.\n", res->lockname.len,
		     res->lockname.name, target);
		spin_lock(&res->spinlock);
		res->state &= ~DLM_LOCK_RES_MIGRATING;
		wake = 1;
		spin_unlock(&res->spinlock);
		ret = -EINVAL;
	}

fail:
	if (ret != -EEXIST && oldmle) {
		/* master is known, detach if not already detached */
		dlm_mle_detach_hb_events(dlm, oldmle);
		dlm_put_mle(oldmle);
	}

	if (ret < 0) {
		if (mle_added) {
			dlm_mle_detach_hb_events(dlm, mle);
			dlm_put_mle(mle);
			dlm_put_mle_inuse(mle);
		} else if (mle) {
			kmem_cache_free(dlm_mle_cache, mle);
			mle = NULL;
		}
		goto leave;
	}

	/*
	 * at this point, we have a migration target, an mle
	 * in the master list, and the MIGRATING flag set on
	 * the lockres
	 */

	/* now that remote nodes are spinning on the MIGRATING flag,
	 * ensure that all assert_master work is flushed. */
	flush_workqueue(dlm->dlm_worker);

	/* notify new node and send all lock state */
	/* call send_one_lockres with migration flag.
	 * this serves as notice to the target node that a
	 * migration is starting. */
	ret = dlm_send_one_lockres(dlm, res, mres, target,
				   DLM_MRES_MIGRATION);

	if (ret < 0) {
		mlog(0, "migration to node %u failed with %d\n",
		     target, ret);
		/* migration failed, detach and clean up mle */
		dlm_mle_detach_hb_events(dlm, mle);
		dlm_put_mle(mle);
		dlm_put_mle_inuse(mle);
		spin_lock(&res->spinlock);
		res->state &= ~DLM_LOCK_RES_MIGRATING;
		wake = 1;
		spin_unlock(&res->spinlock);
		if (dlm_is_host_down(ret))
			dlm_wait_for_node_death(dlm, target,
						DLM_NODE_DEATH_WAIT_MAX);
		goto leave;
	}

	/* at this point, the target sends a message to all nodes,
	 * (using dlm_do_migrate_request).  this node is skipped since
	 * we had to put an mle in the list to begin the process.  this
	 * node now waits for target to do an assert master.  this node
	 * will be the last one notified, ensuring that the migration
	 * is complete everywhere.  if the target dies while this is
	 * going on, some nodes could potentially see the target as the
	 * master, so it is important that my recovery finds the migration
	 * mle and sets the master to UNKNOWN. */

	/* wait for new node to assert master */
	while (1) {
		ret = wait_event_interruptible_timeout(mle->wq,
					(atomic_read(&mle->woken) == 1),
					msecs_to_jiffies(5000));

		if (ret >= 0) {
			if (atomic_read(&mle->woken) == 1 ||
			    res->owner == target)
				break;

			mlog(0, "%s:%.*s: timed out during migration\n",
			     dlm->name, res->lockname.len, res->lockname.name);
			/* avoid hang during shutdown when migrating lockres
			 * to a node which also goes down */
			if (dlm_is_node_dead(dlm, target)) {
				mlog(0, "%s:%.*s: expected migration "
				     "target %u is no longer up, restarting\n",
				     dlm->name, res->lockname.len,
				     res->lockname.name, target);
				ret = -EINVAL;
				/* migration failed, detach and clean up mle */
				dlm_mle_detach_hb_events(dlm, mle);
				dlm_put_mle(mle);
				dlm_put_mle_inuse(mle);
				spin_lock(&res->spinlock);
				res->state &= ~DLM_LOCK_RES_MIGRATING;
				wake = 1;
				spin_unlock(&res->spinlock);
				goto leave;
			}
		} else
			mlog(0, "%s:%.*s: caught signal during migration\n",
			     dlm->name, res->lockname.len, res->lockname.name);
	}

	/* all done, set the owner, clear the flag */
	spin_lock(&res->spinlock);
	dlm_set_lockres_owner(dlm, res, target);
	res->state &= ~DLM_LOCK_RES_MIGRATING;
	dlm_remove_nonlocal_locks(dlm, res);
	spin_unlock(&res->spinlock);
	wake_up(&res->wq);

	/* master is known, detach if not already detached */
	dlm_mle_detach_hb_events(dlm, mle);
	dlm_put_mle_inuse(mle);
	ret = 0;

	dlm_lockres_calc_usage(dlm, res);

leave:
	/* re-dirty the lockres if we failed */
	if (ret < 0)
		dlm_kick_thread(dlm, res);

	/* wake up waiters if the MIGRATING flag got set
	 * but migration failed */
	if (wake)
		wake_up(&res->wq);

	if (mres)
		free_page((unsigned long)mres);

	dlm_put(dlm);

	mlog(0, "%s: Migrating %.*s to %u, returns %d\n", dlm->name, namelen,
	     name, target, ret);

	return ret;
}
#define DLM_MIGRATION_RETRY_MS	100
/*
 * Should be called only after beginning the domain leave process.
 * There should not be any remaining locks on nonlocal lock resources,
 * and there should be no local locks left on locally mastered resources.
 *
 * Called with the dlm spinlock held, may drop it to do migration, but
 * will re-acquire before exit.
 *
 * Returns: 1 if dlm->spinlock was dropped/retaken, 0 if never dropped
 */
int dlm_empty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
{
	int ret;
	int lock_dropped = 0;
	u8 target = O2NM_MAX_NODES;

	assert_spin_locked(&dlm->spinlock);

	spin_lock(&res->spinlock);
	if (dlm_is_lockres_migratable(dlm, res))
		target = dlm_pick_migration_target(dlm, res);
	spin_unlock(&res->spinlock);

	if (target == O2NM_MAX_NODES)
		goto leave;

	/* Wheee! Migrate lockres here! Will sleep so drop spinlock. */
	spin_unlock(&dlm->spinlock);
	lock_dropped = 1;
	ret = dlm_migrate_lockres(dlm, res, target);
	if (ret)
		mlog(0, "%s: res %.*s, Migrate to node %u failed with %d\n",
		     dlm->name, res->lockname.len, res->lockname.name,
		     target, ret);
	spin_lock(&dlm->spinlock);
leave:
	return lock_dropped;
}
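/*
 * Illustrative sketch only (compiled out): dlm_empty_lockres() is designed
 * for a caller that walks the resource hash under dlm->spinlock while
 * leaving the domain.  Because the call may drop and retake the spinlock,
 * the walk must restart whenever it reports that the lock was dropped.
 * The helper name below is hypothetical.
 */
#if 0
static void example_migrate_one_bucket(struct dlm_ctxt *dlm,
				       struct hlist_head *bucket)
{
	struct dlm_lock_resource *res;

restart:
	spin_lock(&dlm->spinlock);
	hlist_for_each_entry(res, bucket, hash_node) {
		if (dlm_empty_lockres(dlm, res)) {
			/* spinlock was dropped; the list may have changed */
			spin_unlock(&dlm->spinlock);
			goto restart;
		}
	}
	spin_unlock(&dlm->spinlock);
}
#endif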
int dlm_lock_basts_flushed(struct dlm_ctxt *dlm, struct dlm_lock *lock)
{
	int ret;

	spin_lock(&dlm->ast_lock);
	spin_lock(&lock->spinlock);
	ret = (list_empty(&lock->bast_list) && !lock->bast_pending);
	spin_unlock(&lock->spinlock);
	spin_unlock(&dlm->ast_lock);
	return ret;
}
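/*
 * Illustrative sketch only (compiled out): this predicate is written to be
 * used as a wait_event() condition, letting a caller sleep until any queued
 * bast for the lock has been delivered.
 */
#if 0
static void example_wait_for_basts(struct dlm_ctxt *dlm,
				   struct dlm_lock *lock)
{
	/* sleep on the ast waitqueue until no bast is queued or pending */
	wait_event(dlm->ast_wq, dlm_lock_basts_flushed(dlm, lock));
}
#endif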
static int dlm_migration_can_proceed(struct dlm_ctxt *dlm,
				     struct dlm_lock_resource *res,
				     u8 mig_target)
{
	int can_proceed;

	spin_lock(&res->spinlock);
	can_proceed = !!(res->state & DLM_LOCK_RES_MIGRATING);
	spin_unlock(&res->spinlock);

	/* target has died, so make the caller break out of the
	 * wait_event, but caller must recheck the domain_map */
	spin_lock(&dlm->spinlock);
	if (!test_bit(mig_target, dlm->domain_map))
		can_proceed = 1;
	spin_unlock(&dlm->spinlock);
	return can_proceed;
}
static int dlm_lockres_is_dirty(struct dlm_ctxt *dlm,
				struct dlm_lock_resource *res)
{
	int ret;

	spin_lock(&res->spinlock);
	ret = !!(res->state & DLM_LOCK_RES_DIRTY);
	spin_unlock(&res->spinlock);
	return ret;
}
static int dlm_mark_lockres_migrating(struct dlm_ctxt *dlm,
				      struct dlm_lock_resource *res,
				      u8 target)
{
	int ret = 0;

	mlog(0, "dlm_mark_lockres_migrating: %.*s, from %u to %u\n",
	     res->lockname.len, res->lockname.name, dlm->node_num, target);

	/* need to set MIGRATING flag on lockres.  this is done by
	 * ensuring that all asts have been flushed for this lockres. */
	spin_lock(&res->spinlock);
	BUG_ON(res->migration_pending);
	res->migration_pending = 1;
	/* strategy is to reserve an extra ast then release
	 * it below, letting the release do all of the work */
	__dlm_lockres_reserve_ast(res);
	spin_unlock(&res->spinlock);

	/* now flush all the pending asts */
	dlm_kick_thread(dlm, res);
	/* before waiting on DIRTY, block processes which may
	 * try to dirty the lockres before MIGRATING is set */
	spin_lock(&res->spinlock);
	BUG_ON(res->state & DLM_LOCK_RES_BLOCK_DIRTY);
	res->state |= DLM_LOCK_RES_BLOCK_DIRTY;
	spin_unlock(&res->spinlock);
	/* now wait on any pending asts and the DIRTY state */
	wait_event(dlm->ast_wq, !dlm_lockres_is_dirty(dlm, res));
	dlm_lockres_release_ast(dlm, res);

	mlog(0, "about to wait on migration_wq, dirty=%s\n",
	     res->state & DLM_LOCK_RES_DIRTY ? "yes" : "no");
	/* if the extra ref we just put was the final one, this
	 * will pass thru immediately.  otherwise, we need to wait
	 * for the last ast to finish. */
again:
	ret = wait_event_interruptible_timeout(dlm->migration_wq,
		   dlm_migration_can_proceed(dlm, res, target),
		   msecs_to_jiffies(1000));
	if (ret < 0) {
		mlog(0, "woken again: migrating? %s, dead? %s\n",
		     res->state & DLM_LOCK_RES_MIGRATING ? "yes":"no",
		     test_bit(target, dlm->domain_map) ? "no":"yes");
	} else {
		mlog(0, "all is well: migrating? %s, dead? %s\n",
		     res->state & DLM_LOCK_RES_MIGRATING ? "yes":"no",
		     test_bit(target, dlm->domain_map) ? "no":"yes");
	}
	if (!dlm_migration_can_proceed(dlm, res, target)) {
		mlog(0, "trying again...\n");
		goto again;
	}

	ret = 0;
	/* did the target go down or die? */
	spin_lock(&dlm->spinlock);
	if (!test_bit(target, dlm->domain_map)) {
		mlog(ML_ERROR, "aha. migration target %u just went down\n",
		     target);
		ret = -EHOSTDOWN;
	}
	spin_unlock(&dlm->spinlock);

	/*
	 * if target is down, we need to clear DLM_LOCK_RES_BLOCK_DIRTY for
	 * another try; otherwise, we are sure the MIGRATING state is there,
	 * drop the unneeded state which blocked threads trying to DIRTY
	 */
	spin_lock(&res->spinlock);
	BUG_ON(!(res->state & DLM_LOCK_RES_BLOCK_DIRTY));
	res->state &= ~DLM_LOCK_RES_BLOCK_DIRTY;
	if (!ret)
		BUG_ON(!(res->state & DLM_LOCK_RES_MIGRATING));
	else
		res->migration_pending = 0;
	spin_unlock(&res->spinlock);

	/*
	 * at this point:
	 *
	 *   o the DLM_LOCK_RES_MIGRATING flag is set if target not down
	 *   o there are no pending asts on this lockres
	 *   o all processes trying to reserve an ast on this
	 *     lockres must wait for the MIGRATING flag to clear
	 */
	return ret;
}
/* last step in the migration process.
 * original master calls this to free all of the dlm_lock
 * structures that used to be for other nodes. */
static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm,
				      struct dlm_lock_resource *res)
{
	struct list_head *queue = &res->granted;
	int i, bit;
	struct dlm_lock *lock, *next;

	assert_spin_locked(&res->spinlock);

	BUG_ON(res->owner == dlm->node_num);

	for (i = 0; i < 3; i++) {
		list_for_each_entry_safe(lock, next, queue, list) {
			if (lock->ml.node != dlm->node_num) {
				mlog(0, "putting lock for node %u\n",
				     lock->ml.node);
				/* be extra careful */
				BUG_ON(!list_empty(&lock->ast_list));
				BUG_ON(!list_empty(&lock->bast_list));
				BUG_ON(lock->ast_pending);
				BUG_ON(lock->bast_pending);
				dlm_lockres_clear_refmap_bit(dlm, res,
							     lock->ml.node);
				list_del_init(&lock->list);
				dlm_lock_put(lock);
				/* In a normal unlock, we would have added a
				 * DLM_UNLOCK_FREE_LOCK action. Force it. */
				dlm_lock_put(lock);
			}
		}
		queue++;
	}
	bit = 0;
	while (1) {
		bit = find_next_bit(res->refmap, O2NM_MAX_NODES, bit);
		if (bit >= O2NM_MAX_NODES)
			break;
		/* do not clear the local node reference, if there is a
		 * process holding this, let it drop the ref itself */
		if (bit != dlm->node_num) {
			mlog(0, "%s:%.*s: node %u had a ref to this "
			     "migrating lockres, clearing\n", dlm->name,
			     res->lockname.len, res->lockname.name, bit);
			dlm_lockres_clear_refmap_bit(dlm, res, bit);
		}
		bit++;
	}
}
/*
 * Pick a node to migrate the lock resource to. This function selects a
 * potential target based first on the locks and then on refmap. It skips
 * nodes that are in the process of exiting the domain.
 */
static u8 dlm_pick_migration_target(struct dlm_ctxt *dlm,
				    struct dlm_lock_resource *res)
{
	enum dlm_lockres_list idx;
	struct list_head *queue = &res->granted;
	struct dlm_lock *lock;
	int noderef;
	u8 nodenum = O2NM_MAX_NODES;

	assert_spin_locked(&dlm->spinlock);
	assert_spin_locked(&res->spinlock);

	/* Go through all the locks */
	for (idx = DLM_GRANTED_LIST; idx <= DLM_BLOCKED_LIST; idx++) {
		queue = dlm_list_idx_to_ptr(res, idx);
		list_for_each_entry(lock, queue, list) {
			if (lock->ml.node == dlm->node_num)
				continue;
			if (test_bit(lock->ml.node, dlm->exit_domain_map))
				continue;
			nodenum = lock->ml.node;
			goto bail;
		}
	}

	/* Go thru the refmap */
	noderef = -1;
	while (1) {
		noderef = find_next_bit(res->refmap, O2NM_MAX_NODES,
					noderef + 1);
		if (noderef >= O2NM_MAX_NODES)
			break;
		if (noderef == dlm->node_num)
			continue;
		if (test_bit(noderef, dlm->exit_domain_map))
			continue;
		nodenum = noderef;
		goto bail;
	}

bail:
	return nodenum;
}
/* this is called by the new master once all lockres
 * data has been received */
static int dlm_do_migrate_request(struct dlm_ctxt *dlm,
				  struct dlm_lock_resource *res,
				  u8 master, u8 new_master,
				  struct dlm_node_iter *iter)
{
	struct dlm_migrate_request migrate;
	int ret, skip, status = 0;
	int nodenum;

	memset(&migrate, 0, sizeof(migrate));
	migrate.namelen = res->lockname.len;
	memcpy(migrate.name, res->lockname.name, migrate.namelen);
	migrate.new_master = new_master;
	migrate.master = master;

	ret = 0;

	/* send message to all nodes, except the master and myself */
	while ((nodenum = dlm_node_iter_next(iter)) >= 0) {
		if (nodenum == master ||
		    nodenum == new_master)
			continue;

		/* We could race exit domain. If exited, skip. */
		spin_lock(&dlm->spinlock);
		skip = (!test_bit(nodenum, dlm->domain_map));
		spin_unlock(&dlm->spinlock);
		if (skip) {
			clear_bit(nodenum, iter->node_map);
			continue;
		}

		ret = o2net_send_message(DLM_MIGRATE_REQUEST_MSG, dlm->key,
					 &migrate, sizeof(migrate), nodenum,
					 &status);
		if (ret < 0) {
			mlog(ML_ERROR, "%s: res %.*s, Error %d send "
			     "MIGRATE_REQUEST to node %u\n", dlm->name,
			     migrate.namelen, migrate.name, ret, nodenum);
			if (!dlm_is_host_down(ret)) {
				mlog(ML_ERROR, "unhandled error=%d!\n", ret);
				BUG();
			}
			clear_bit(nodenum, iter->node_map);
			ret = 0;
		} else if (status < 0) {
			mlog(0, "migrate request (node %u) returned %d!\n",
			     nodenum, status);
			ret = status;
		} else if (status == DLM_MIGRATE_RESPONSE_MASTERY_REF) {
			/* during the migration request we short-circuited
			 * the mastery of the lockres.  make sure we have
			 * a mastery ref for nodenum */
			mlog(0, "%s:%.*s: need ref for node %u\n",
			     dlm->name, res->lockname.len, res->lockname.name,
			     nodenum);
			spin_lock(&res->spinlock);
			dlm_lockres_set_refmap_bit(dlm, res, nodenum);
			spin_unlock(&res->spinlock);
		}
	}

	if (ret < 0)
		mlog_errno(ret);

	mlog(0, "returning ret=%d\n", ret);
	return ret;
}
/* if there is an existing mle for this lockres, we now know who the master is.
 * (the one who sent us *this* message) we can clear it up right away.
 * since the process that put the mle on the list still has a reference to it,
 * we can unhash it now, set the master and wake the process.  as a result,
 * we will have no mle in the list to start with.  now we can add an mle for
 * the migration and this should be the only one found for those scanning the
 * master list. */
int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data,
				void **ret_data)
{
	struct dlm_ctxt *dlm = data;
	struct dlm_lock_resource *res = NULL;
	struct dlm_migrate_request *migrate = (struct dlm_migrate_request *) msg->buf;
	struct dlm_master_list_entry *mle = NULL, *oldmle = NULL;
	const char *name;
	unsigned int namelen, hash;
	int ret = 0;

	if (!dlm_grab(dlm))
		return -EINVAL;

	name = migrate->name;
	namelen = migrate->namelen;
	hash = dlm_lockid_hash(name, namelen);

	/* preallocate.. if this fails, abort */
	mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);

	if (!mle) {
		ret = -ENOMEM;
		goto leave;
	}

	/* check for pre-existing lock */
	spin_lock(&dlm->spinlock);
	res = __dlm_lookup_lockres(dlm, name, namelen, hash);
	if (res) {
		spin_lock(&res->spinlock);
		if (res->state & DLM_LOCK_RES_RECOVERING) {
			/* if all is working ok, this can only mean that we got
			 * a migrate request from a node that we now see as
			 * dead. what can we do here?  drop it to the floor? */
			spin_unlock(&res->spinlock);
			mlog(ML_ERROR, "Got a migrate request, but the "
			     "lockres is marked as recovering!");
			kmem_cache_free(dlm_mle_cache, mle);
			ret = -EINVAL; /* need a better solution */
			goto unlock;
		}
		res->state |= DLM_LOCK_RES_MIGRATING;
		spin_unlock(&res->spinlock);
	}

	spin_lock(&dlm->master_lock);
	/* ignore status.  only nonzero status would BUG. */
	ret = dlm_add_migration_mle(dlm, res, mle, &oldmle,
				    name, namelen,
				    migrate->new_master,
				    migrate->master);
	if (ret < 0)
		kmem_cache_free(dlm_mle_cache, mle);

	spin_unlock(&dlm->master_lock);
unlock:
	spin_unlock(&dlm->spinlock);

	if (oldmle) {
		/* master is known, detach if not already detached */
		dlm_mle_detach_hb_events(dlm, oldmle);
		dlm_put_mle(oldmle);
	}

	if (res)
		dlm_lockres_put(res);
leave:
	dlm_put(dlm);
	return ret;
}
/* must be holding dlm->spinlock and dlm->master_lock
 * when adding a migration mle, we can clear any other mles
 * in the master list because we know with certainty that
 * the master is "master".  so we remove any old mle from
 * the list after setting its master field, and then add
 * the new migration mle.  this way we can hold with the rule
 * of having only one mle for a given lock name at all times. */
static int dlm_add_migration_mle(struct dlm_ctxt *dlm,
				 struct dlm_lock_resource *res,
				 struct dlm_master_list_entry *mle,
				 struct dlm_master_list_entry **oldmle,
				 const char *name, unsigned int namelen,
				 u8 new_master, u8 master)
{
	int found;
	int ret = 0;

	*oldmle = NULL;

	assert_spin_locked(&dlm->spinlock);
	assert_spin_locked(&dlm->master_lock);

	/* caller is responsible for any ref taken here on oldmle */
	found = dlm_find_mle(dlm, oldmle, (char *)name, namelen);
	if (found) {
		struct dlm_master_list_entry *tmp = *oldmle;
		spin_lock(&tmp->spinlock);
		if (tmp->type == DLM_MLE_MIGRATION) {
			if (master == dlm->node_num) {
				/* ah another process raced me to it */
				mlog(0, "tried to migrate %.*s, but some "
				     "process beat me to it\n",
				     namelen, name);
				spin_unlock(&tmp->spinlock);
				return -EEXIST;
			} else {
				/* bad.  2 NODES are trying to migrate! */
				mlog(ML_ERROR, "migration error  mle: "
				     "master=%u new_master=%u // request: "
				     "master=%u new_master=%u // "
				     "lockres=%.*s\n",
				     tmp->master, tmp->new_master,
				     master, new_master,
				     namelen, name);
				BUG();
			}
		} else {
			/* this is essentially what assert_master does */
			tmp->master = master;
			atomic_set(&tmp->woken, 1);
			wake_up(&tmp->wq);
			/* remove it so that only one mle will be found */
			__dlm_unlink_mle(dlm, tmp);
			__dlm_mle_detach_hb_events(dlm, tmp);
			if (tmp->type == DLM_MLE_MASTER) {
				ret = DLM_MIGRATE_RESPONSE_MASTERY_REF;
				mlog(0, "%s:%.*s: master=%u, newmaster=%u, "
				     "telling master to get ref "
				     "for cleared out mle during "
				     "migration\n", dlm->name,
				     namelen, name, master,
				     new_master);
			}
		}
		spin_unlock(&tmp->spinlock);
	}

	/* now add a migration mle to the tail of the list */
	dlm_init_mle(mle, DLM_MLE_MIGRATION, dlm, res, name, namelen);
	mle->new_master = new_master;
	/* the new master will be sending an assert master for this.
	 * at that point we will get the refmap reference */
	mle->master = master;
	/* do this for consistency with other mle types */
	set_bit(new_master, mle->maybe_map);
	__dlm_insert_mle(dlm, mle);

	return ret;
}
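/*
 * Illustrative sketch only (compiled out): per the comment above, callers
 * must hold both dlm->spinlock and dlm->master_lock (in that order) around
 * the call, and they own any reference returned in *oldmle.  The wrapper
 * and its variable names are hypothetical.
 */
#if 0
static int example_add_migration_mle(struct dlm_ctxt *dlm,
				     struct dlm_lock_resource *res,
				     struct dlm_master_list_entry *mle,
				     u8 target)
{
	struct dlm_master_list_entry *oldmle = NULL;
	int ret;

	spin_lock(&dlm->spinlock);
	spin_lock(&dlm->master_lock);
	ret = dlm_add_migration_mle(dlm, res, mle, &oldmle,
				    res->lockname.name, res->lockname.len,
				    target, dlm->node_num);
	spin_unlock(&dlm->master_lock);
	spin_unlock(&dlm->spinlock);

	if (oldmle) {
		/* old master is known; detach and drop the caller's ref */
		dlm_mle_detach_hb_events(dlm, oldmle);
		dlm_put_mle(oldmle);
	}
	return ret;
}
#endif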
/*
 * Sets the owner of the lockres, associated to the mle, to UNKNOWN
 */
static struct dlm_lock_resource *dlm_reset_mleres_owner(struct dlm_ctxt *dlm,
					struct dlm_master_list_entry *mle)
{
	struct dlm_lock_resource *res;

	/* Find the lockres associated to the mle and set its owner to UNK */
	res = __dlm_lookup_lockres(dlm, mle->mname, mle->mnamelen,
				   mle->mnamehash);
	if (res) {
		spin_unlock(&dlm->master_lock);

		/* move lockres onto recovery list */
		spin_lock(&res->spinlock);
		dlm_set_lockres_owner(dlm, res, DLM_LOCK_RES_OWNER_UNKNOWN);
		dlm_move_lockres_to_recovery_list(dlm, res);
		spin_unlock(&res->spinlock);
		dlm_lockres_put(res);

		/* about to get rid of mle, detach from heartbeat */
		__dlm_mle_detach_hb_events(dlm, mle);

		/* dump the mle */
		spin_lock(&dlm->master_lock);
		__dlm_put_mle(mle);
		spin_unlock(&dlm->master_lock);
	}

	return res;
}
static void dlm_clean_migration_mle(struct dlm_ctxt *dlm,
				    struct dlm_master_list_entry *mle)
{
	__dlm_mle_detach_hb_events(dlm, mle);

	spin_lock(&mle->spinlock);
	__dlm_unlink_mle(dlm, mle);
	atomic_set(&mle->woken, 1);
	spin_unlock(&mle->spinlock);

	wake_up(&mle->wq);
}
static void dlm_clean_block_mle(struct dlm_ctxt *dlm,
				struct dlm_master_list_entry *mle, u8 dead_node)
{
	int bit;

	BUG_ON(mle->type != DLM_MLE_BLOCK);

	spin_lock(&mle->spinlock);
	bit = find_next_bit(mle->maybe_map, O2NM_MAX_NODES, 0);
	if (bit != dead_node) {
		mlog(0, "mle found, but dead node %u would not have been "
		     "master\n", dead_node);
		spin_unlock(&mle->spinlock);
	} else {
		/* Must drop the refcount by one since the assert_master will
		 * never arrive. This may result in the mle being unlinked and
		 * freed, but there may still be a process waiting in the
		 * dlmlock path which is fine. */
		mlog(0, "node %u was expected master\n", dead_node);
		atomic_set(&mle->woken, 1);
		spin_unlock(&mle->spinlock);
		wake_up(&mle->wq);

		/* Do not need events any longer, so detach from heartbeat */
		__dlm_mle_detach_hb_events(dlm, mle);
		__dlm_put_mle(mle);
	}
}
void dlm_clean_master_list(struct dlm_ctxt *dlm, u8 dead_node)
{
	struct dlm_master_list_entry *mle;
	struct dlm_lock_resource *res;
	struct hlist_head *bucket;
	struct hlist_node *tmp;
	unsigned int i;

	mlog(0, "dlm=%s, dead node=%u\n", dlm->name, dead_node);
top:
	assert_spin_locked(&dlm->spinlock);

	/* clean the master list */
	spin_lock(&dlm->master_lock);
	for (i = 0; i < DLM_HASH_BUCKETS; i++) {
		bucket = dlm_master_hash(dlm, i);
		hlist_for_each_entry_safe(mle, tmp, bucket, master_hash_node) {
			BUG_ON(mle->type != DLM_MLE_BLOCK &&
			       mle->type != DLM_MLE_MASTER &&
			       mle->type != DLM_MLE_MIGRATION);

			/* MASTER mles are initiated locally.  The waiting
			 * process will notice the node map change shortly.
			 * Let that happen as normal. */
			if (mle->type == DLM_MLE_MASTER)
				continue;

			/* BLOCK mles are initiated by other nodes.  Need to
			 * clean up if the dead node would have been the
			 * master. */
			if (mle->type == DLM_MLE_BLOCK) {
				dlm_clean_block_mle(dlm, mle, dead_node);
				continue;
			}

			/* Everything else is a MIGRATION mle */

			/* The rule for MIGRATION mles is that the master
			 * becomes UNKNOWN if *either* the original or the new
			 * master dies.  All UNKNOWN lockres' are sent to
			 * whichever node becomes the recovery master.  The new
			 * master is responsible for determining if there is
			 * still a master for this lockres, or if he needs to
			 * take over mastery.  Either way, this node should
			 * expect another message to resolve this. */

			if (mle->master != dead_node &&
			    mle->new_master != dead_node)
				continue;

			if (mle->new_master == dead_node && mle->inuse) {
				mlog(ML_NOTICE, "%s: target %u died during "
				     "migration from %u, the MLE is "
				     "still in use, ignore it!\n",
				     dlm->name, dead_node, mle->master);
				continue;
			}

			/* If we have reached this point, this mle needs to be
			 * removed from the list and freed. */
			dlm_clean_migration_mle(dlm, mle);

			mlog(0, "%s: node %u died during migration from "
			     "%u to %u!\n", dlm->name, dead_node, mle->master,
			     mle->new_master);

			/* If we find a lockres associated with the mle, we've
			 * hit this rare case that messes up our lock ordering.
			 * If so, we need to drop the master lock so that we
			 * can take the lockres lock, meaning that we will have
			 * to restart from the head of list. */
			res = dlm_reset_mleres_owner(dlm, mle);
			if (res)
				/* restart */
				goto top;

			/* This may be the last reference */
			__dlm_put_mle(mle);
		}
	}
	spin_unlock(&dlm->master_lock);
}
int dlm_finish_migration(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
			 u8 old_master)
{
	struct dlm_node_iter iter;
	int ret = 0;

	spin_lock(&dlm->spinlock);
	dlm_node_iter_init(dlm->domain_map, &iter);
	clear_bit(old_master, iter.node_map);
	clear_bit(dlm->node_num, iter.node_map);
	spin_unlock(&dlm->spinlock);

	/* ownership of the lockres is changing.  account for the
	 * mastery reference here since old_master will briefly have
	 * a reference after the migration completes */
	spin_lock(&res->spinlock);
	dlm_lockres_set_refmap_bit(dlm, res, old_master);
	spin_unlock(&res->spinlock);

	mlog(0, "now time to do a migrate request to other nodes\n");
	ret = dlm_do_migrate_request(dlm, res, old_master,
				     dlm->node_num, &iter);
	if (ret < 0) {
		mlog_errno(ret);
		goto leave;
	}

	mlog(0, "doing assert master of %.*s to all except the original node\n",
	     res->lockname.len, res->lockname.name);
	/* this call now finishes out the nodemap
	 * even if one or more nodes die */
	ret = dlm_do_assert_master(dlm, res, iter.node_map,
				   DLM_ASSERT_MASTER_FINISH_MIGRATION);
	if (ret < 0) {
		/* no longer need to retry.  all living nodes contacted. */
		mlog_errno(ret);
		ret = 0;
	}

	memset(iter.node_map, 0, sizeof(iter.node_map));
	set_bit(old_master, iter.node_map);
	mlog(0, "doing assert master of %.*s back to %u\n",
	     res->lockname.len, res->lockname.name, old_master);
	ret = dlm_do_assert_master(dlm, res, iter.node_map,
				   DLM_ASSERT_MASTER_FINISH_MIGRATION);
	if (ret < 0) {
		mlog(0, "assert master to original master failed "
		     "with %d.\n", ret);
		/* the only nonzero status here would be because of
		 * a dead original node.  we're done. */
		ret = 0;
	}

	/* all done, set the owner, clear the flag */
	spin_lock(&res->spinlock);
	dlm_set_lockres_owner(dlm, res, dlm->node_num);
	res->state &= ~DLM_LOCK_RES_MIGRATING;
	spin_unlock(&res->spinlock);
	/* re-dirty it on the new master */
	dlm_kick_thread(dlm, res);
	wake_up(&res->wq);
leave:
	return ret;
}
/*
 * LOCKRES AST REFCOUNT
 * this is integral to migration
 */

/* for future intent to call an ast, reserve one ahead of time.
 * this should be called only after waiting on the lockres
 * with dlm_wait_on_lockres, and while still holding the
 * spinlock after the call. */
void __dlm_lockres_reserve_ast(struct dlm_lock_resource *res)
{
	assert_spin_locked(&res->spinlock);
	if (res->state & DLM_LOCK_RES_MIGRATING) {
		__dlm_print_one_lock_resource(res);
	}
	BUG_ON(res->state & DLM_LOCK_RES_MIGRATING);

	atomic_inc(&res->asts_reserved);
}

/*
 * used to drop the reserved ast, either because it went unused,
 * or because the ast/bast was actually called.
 *
 * also, if there is a pending migration on this lockres,
 * and this was the last pending ast on the lockres,
 * atomically set the MIGRATING flag before we drop the lock.
 * this is how we ensure that migration can proceed with no
 * asts in progress.  note that it is ok if the state of the
 * queues is such that a lock should be granted in the future
 * or that a bast should be fired, because the new master will
 * shuffle the lists on this lockres as soon as it is migrated.
 */
void dlm_lockres_release_ast(struct dlm_ctxt *dlm,
			     struct dlm_lock_resource *res)
{
	if (!atomic_dec_and_lock(&res->asts_reserved, &res->spinlock))
		return;

	if (!res->migration_pending) {
		spin_unlock(&res->spinlock);
		return;
	}

	BUG_ON(res->state & DLM_LOCK_RES_MIGRATING);
	res->migration_pending = 0;
	res->state |= DLM_LOCK_RES_MIGRATING;
	spin_unlock(&res->spinlock);
	wake_up(&res->wq);
	wake_up(&dlm->migration_wq);
}
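/*
 * Illustrative sketch only (compiled out): the reserve/release pair above
 * is meant to bracket any code that may later deliver an ast/bast, so that
 * migration cannot set DLM_LOCK_RES_MIGRATING while such work is
 * outstanding.  The helper name is hypothetical.
 */
#if 0
static void example_reserved_ast_section(struct dlm_ctxt *dlm,
					 struct dlm_lock_resource *res)
{
	spin_lock(&res->spinlock);
	__dlm_lockres_reserve_ast(res);	/* must hold res->spinlock */
	spin_unlock(&res->spinlock);

	/* ... queue or deliver the ast/bast here ... */

	/* dropping the last reserved ast may flip the lockres to MIGRATING */
	dlm_lockres_release_ast(dlm, res);
}
#endif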
void dlm_force_free_mles(struct dlm_ctxt *dlm)
{
	int i;
	struct hlist_head *bucket;
	struct dlm_master_list_entry *mle;
	struct hlist_node *tmp;

	/*
	 * We notified all other nodes that we are exiting the domain and
	 * marked the dlm state to DLM_CTXT_LEAVING. If any mles are still
	 * around we force free them and wake any processes that are waiting
	 * on the mles
	 */
	spin_lock(&dlm->spinlock);
	spin_lock(&dlm->master_lock);

	BUG_ON(dlm->dlm_state != DLM_CTXT_LEAVING);
	BUG_ON((find_next_bit(dlm->domain_map, O2NM_MAX_NODES, 0) < O2NM_MAX_NODES));

	for (i = 0; i < DLM_HASH_BUCKETS; i++) {
		bucket = dlm_master_hash(dlm, i);
		hlist_for_each_entry_safe(mle, tmp, bucket, master_hash_node) {
			if (mle->type != DLM_MLE_BLOCK) {
				mlog(ML_ERROR, "bad mle: %p\n", mle);
				dlm_print_one_mle(mle);
			}
			atomic_set(&mle->woken, 1);
			wake_up(&mle->wq);

			__dlm_unlink_mle(dlm, mle);
			__dlm_mle_detach_hb_events(dlm, mle);
			__dlm_put_mle(mle);
		}
	}
	spin_unlock(&dlm->master_lock);
	spin_unlock(&dlm->spinlock);
}