1 /* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
8 * Copyright (C) 2003, 2004 Oracle. All rights reserved.
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
26 #include <linux/types.h>
27 #include <linux/slab.h>
28 #include <linux/highmem.h>
29 #include <linux/smp_lock.h>
30 #include <linux/kthread.h>
32 #include <cluster/heartbeat.h>
33 #include <cluster/nodemanager.h>
34 #include <cluster/tcp.h>
36 #include <dlm/dlmapi.h>
38 #define MLOG_MASK_PREFIX ML_VOTE
39 #include <cluster/masklog.h>
45 #include "extent_map.h"
46 #include "heartbeat.h"
52 #include "buffer_head_io.h"
/*
 * Message type identifiers used by this file when sending messages with
 * o2net_send_message() and when registering handlers with
 * o2net_register_handler(): one for vote requests, one for responses.
 */
54 #define OCFS2_MESSAGE_TYPE_VOTE (0x1)
55 #define OCFS2_MESSAGE_TYPE_RESPONSE (0x2)
/*
 * Visible fields of the header shared by vote and response messages
 * (struct ocfs2_msg_hdr).  Both are big-endian on the wire and are
 * converted with cpu_to_be32()/be32_to_cpu() by the code in this file.
 * NOTE(review): other header fields referenced in this file (h_request,
 * h_blkno, h_generation) are not visible in this listing.
 */
58 __be32 h_response_id
; /* used to lookup message handle on sending
63 __be32 h_node_num
; /* node sending this particular message. */
/*
 * Visible layout of a vote request (struct ocfs2_vote_msg): the common
 * header, a message-type dependent 32-bit slot (md1) and the fields that
 * identify a directory entry for unlink/rename votes.  The name buffer
 * is OCFS2_VOTE_FILENAME_LEN bytes.
 */
66 /* OCFS2_MAX_FILENAME_LEN is 255 characters, but we want to align this
68 #define OCFS2_VOTE_FILENAME_LEN 256
71 struct ocfs2_msg_hdr v_hdr
;
74 __be32 v_orphaned_slot
; /* Used during delete votes */
75 __be32 v_nlink
; /* Used during unlink votes */
76 } md1
; /* Message type dependent 1 */
77 __be32 v_unlink_namelen
;
78 __be64 v_unlink_parent
;
79 u8 v_unlink_dirent
[OCFS2_VOTE_FILENAME_LEN
];
/*
 * Wire values a node may put in a response: OK (0), BUSY (-16) and
 * BAD_MSG (-22).
 */
82 /* Responses are given these values to maintain backwards
83 * compatibility with older ocfs2 versions */
84 #define OCFS2_RESPONSE_OK (0)
85 #define OCFS2_RESPONSE_BUSY (-16)
86 #define OCFS2_RESPONSE_BAD_MSG (-22)
/*
 * Response to a vote: the common header plus the orphaned slot reported
 * by the responding node.
 * NOTE(review): the r_response field used elsewhere in this file is not
 * visible in this listing.
 */
88 struct ocfs2_response_msg
90 struct ocfs2_msg_hdr r_hdr
;
92 __be32 r_orphaned_slot
;
/*
 * One queued vote request: a list node (linked onto osb->vote_list by
 * ocfs2_handle_vote_message()) and a private copy of the received
 * message, later passed to ocfs2_process_vote().
 */
95 struct ocfs2_vote_work
{
96 struct list_head w_list
;
97 struct ocfs2_vote_msg w_msg
;
/*
 * Kinds of vote request carried in the message header.  INVALID is 0 so
 * that a zeroed header never looks like a real request.
 * NOTE(review): ocfs2_is_valid_vote_request() compares against
 * OCFS2_VOTE_REQ_LAST, whose enumerator line is not visible here.
 */
100 enum ocfs2_vote_request
{
101 OCFS2_VOTE_REQ_INVALID
= 0,
102 OCFS2_VOTE_REQ_DELETE
,
103 OCFS2_VOTE_REQ_UNLINK
,
104 OCFS2_VOTE_REQ_RENAME
,
105 OCFS2_VOTE_REQ_MOUNT
,
106 OCFS2_VOTE_REQ_UMOUNT
,
/*
 * Return non-zero when request lies strictly between
 * OCFS2_VOTE_REQ_INVALID and OCFS2_VOTE_REQ_LAST, i.e. names one of the
 * real request types.
 */
110 static inline int ocfs2_is_valid_vote_request(int request
)
112 return OCFS2_VOTE_REQ_INVALID
< request
&&
113 request
< OCFS2_VOTE_REQ_LAST
;
/*
 * Optional per-response hook.  ocfs2_handle_response_message() invokes
 * rc_cb with the callback's private pointer and the received response.
 * NOTE(review): the rc_priv member used elsewhere in this file is not
 * visible in this listing.
 */
116 typedef void (*ocfs2_net_response_callback
)(void *priv
,
117 struct ocfs2_response_msg
*resp
);
118 struct ocfs2_net_response_cb
{
119 ocfs2_net_response_callback rc_cb
;
/*
 * State for one outstanding broadcast vote.  The context sits on
 * osb->net_response_list while the sender sleeps on n_event; n_node_map
 * holds the nodes that have not answered yet and n_response collects
 * the aggregate result.
 * NOTE(review): the n_response_id member used for lookup elsewhere in
 * this file is not visible in this listing.
 */
123 struct ocfs2_net_wait_ctxt
{
124 struct list_head n_list
;
126 wait_queue_head_t n_event
;
127 struct ocfs2_node_map n_node_map
;
128 int n_response
; /* an aggregate response. 0 if
129 * all nodes are go, < 0 on any
130 * negative response from any
131 * node or network error. */
132 struct ocfs2_net_response_cb
*n_callback
;
/*
 * Handle a MOUNT vote from node_num: clear the node from the recovery
 * map, set it in osb->mounted_map and clear any stale entry for it in
 * osb->umount_map.
 */
135 static void ocfs2_process_mount_request(struct ocfs2_super
*osb
,
136 unsigned int node_num
)
138 mlog(0, "MOUNT vote from node %u\n", node_num
);
139 /* The other node only sends us this message when he has an EX
140 * on the superblock, so our recovery threads (if having been
141 * launched) are waiting on it.*/
142 ocfs2_recovery_map_clear(osb
, node_num
);
143 ocfs2_node_map_set_bit(osb
, &osb
->mounted_map
, node_num
);
145 /* We clear the umount map here because a node may have been
146 * previously mounted, safely unmounted but never stopped
147 * heartbeating - in which case we'd have a stale entry. */
148 ocfs2_node_map_clear_bit(osb
, &osb
->umount_map
, node_num
);
/*
 * Handle an UMOUNT vote from node_num: move the node from
 * osb->mounted_map to osb->umount_map.
 */
151 static void ocfs2_process_umount_request(struct ocfs2_super
*osb
,
152 unsigned int node_num
)
154 mlog(0, "UMOUNT vote from node %u\n", node_num
);
155 ocfs2_node_map_clear_bit(osb
, &osb
->mounted_map
, node_num
);
156 ocfs2_node_map_set_bit(osb
, &osb
->umount_map
, node_num
);
/*
 * Record that another node is deleting this inode by setting
 * OCFS2_INODE_DELETED and OCFS2_INODE_SKIP_DELETE in ip_flags.
 * The caller must hold the inode's ip_lock (asserted).
 */
159 void ocfs2_mark_inode_remotely_deleted(struct inode
*inode
)
161 struct ocfs2_inode_info
*oi
= OCFS2_I(inode
);
163 assert_spin_locked(&oi
->ip_lock
);
164 /* We set the SKIP_DELETE flag on the inode so we don't try to
165 * delete it in delete_inode ourselves, thus avoiding
166 * unnecessary lock pinging. If the other node failed to wipe
167 * the inode as a result of a crash, then recovery will pick
169 oi
->ip_flags
|= OCFS2_INODE_DELETED
|OCFS2_INODE_SKIP_DELETE
;
/*
 * Work out this node's answer to a DELETE vote on inode.
 *
 * response starts as OCFS2_RESPONSE_BUSY and is only changed to
 * OCFS2_RESPONSE_OK at the very end, after the inode has passed the
 * open-count checks, had its data written out and page cache truncated,
 * and been marked as remotely deleted.
 *
 * *orphaned_slot is in/out: a valid slot sent by the requester is
 * stored in ip_orphaned_slot, and the inode's ip_orphaned_slot is
 * written back through the pointer for the response.
 *
 * NOTE(review): the statements that leave the function early (after the
 * open-count, directory i_count and fdatawrite checks) and the final
 * return are not visible in this listing.
 */
172 static int ocfs2_process_delete_request(struct inode
*inode
,
175 int response
= OCFS2_RESPONSE_BUSY
;
177 mlog(0, "DELETE vote on inode %lu, read lnk_cnt = %u, slot = %d\n",
178 inode
->i_ino
, inode
->i_nlink
, *orphaned_slot
);
180 spin_lock(&OCFS2_I(inode
)->ip_lock
);
182 /* Whatever our vote response is, we want to make sure that
183 * the orphaned slot is recorded properly on this node *and*
184 * on the requesting node. Technically, if the requesting node
185 * did not know which slot the inode is orphaned in but we
186 * respond with BUSY he doesn't actually need the orphaned
187 * slot, but it doesn't hurt to do it here anyway. */
188 if ((*orphaned_slot
) != OCFS2_INVALID_SLOT
) {
189 mlog_bug_on_msg(OCFS2_I(inode
)->ip_orphaned_slot
!=
190 OCFS2_INVALID_SLOT
&&
191 OCFS2_I(inode
)->ip_orphaned_slot
!=
193 "Inode %llu: This node thinks it's "
194 "orphaned in slot %d, messaged it's in %d\n",
195 (unsigned long long)OCFS2_I(inode
)->ip_blkno
,
196 OCFS2_I(inode
)->ip_orphaned_slot
,
199 mlog(0, "Setting orphaned slot for inode %llu to %d\n",
200 (unsigned long long)OCFS2_I(inode
)->ip_blkno
,
203 OCFS2_I(inode
)->ip_orphaned_slot
= *orphaned_slot
;
205 mlog(0, "Sending back orphaned slot %d for inode %llu\n",
206 OCFS2_I(inode
)->ip_orphaned_slot
,
207 (unsigned long long)OCFS2_I(inode
)->ip_blkno
);
209 *orphaned_slot
= OCFS2_I(inode
)->ip_orphaned_slot
;
212 /* vote no if the file is still open. */
213 if (OCFS2_I(inode
)->ip_open_count
) {
214 mlog(0, "open count = %u\n",
215 OCFS2_I(inode
)->ip_open_count
);
216 spin_unlock(&OCFS2_I(inode
)->ip_lock
);
219 spin_unlock(&OCFS2_I(inode
)->ip_lock
);
221 /* directories are a bit ugly... What if someone is sitting in
222 * it? We want to make sure the inode is removed completely as
223 * a result of the iput in process_vote. */
224 if (S_ISDIR(inode
->i_mode
) && (atomic_read(&inode
->i_count
) != 1)) {
225 mlog(0, "i_count = %u\n", atomic_read(&inode
->i_count
));
229 if (filemap_fdatawrite(inode
->i_mapping
)) {
230 mlog(ML_ERROR
, "Could not sync inode %llu for delete!\n",
231 (unsigned long long)OCFS2_I(inode
)->ip_blkno
);
234 sync_mapping_buffers(inode
->i_mapping
);
235 truncate_inode_pages(inode
->i_mapping
, 0);
236 ocfs2_extent_map_trunc(inode
, 0);
238 spin_lock(&OCFS2_I(inode
)->ip_lock
);
239 /* double check open count - someone might have raced this
240 * thread into ocfs2_file_open while we were writing out
241 * data. If we're to allow a wipe of this inode now, we *must*
242 * hold the spinlock until we've marked it. */
243 if (OCFS2_I(inode
)->ip_open_count
) {
244 mlog(0, "Raced to wipe! open count = %u\n",
245 OCFS2_I(inode
)->ip_open_count
);
246 spin_unlock(&OCFS2_I(inode
)->ip_lock
);
250 /* Mark the inode as being wiped from disk. */
251 ocfs2_mark_inode_remotely_deleted(inode
);
252 spin_unlock(&OCFS2_I(inode
)->ip_lock
);
254 /* Not sure this is necessary anymore. */
255 d_prune_aliases(inode
);
257 /* If we get here, then we're voting 'yes', so commit the
258 * delete on our side. */
259 response
= OCFS2_RESPONSE_OK
;
/*
 * Test whether dentry is the name identified by (parent_blkno, name,
 * namelen).  The visible checks are, in order: the dentry still has a
 * parent, the parent inode's ip_blkno equals parent_blkno, the name
 * lengths are equal, and the name bytes compare equal with memcmp().
 * NOTE(review): the return statements and the negative-parent check
 * body are not visible in this listing.
 */
264 static int ocfs2_match_dentry(struct dentry
*dentry
,
266 unsigned int namelen
,
269 struct inode
*parent
;
271 if (!dentry
->d_parent
) {
272 mlog(0, "Detached from parent.\n");
276 parent
= dentry
->d_parent
->d_inode
;
277 /* Negative parent dentry? */
281 /* Name is in a different directory. */
282 if (OCFS2_I(parent
)->ip_blkno
!= parent_blkno
)
285 if (dentry
->d_name
.len
!= namelen
)
288 /* comparison above guarantees this is safe. */
289 if (memcmp(dentry
->d_name
.name
, name
, namelen
))
/*
 * Handle an UNLINK or RENAME vote against inode.  Under dcache_lock the
 * inode's alias list (i_dentry) is walked and each alias is tested with
 * ocfs2_match_dentry() against the parent block and name sent by the
 * other node.  When the new link count is 0 the inode is flagged
 * OCFS2_INODE_MAYBE_ORPHANED under ip_lock.
 * NOTE(review): what is done with a matching dentry, and the test that
 * skips the link-count handling for rename votes, are not visible in
 * this listing.
 */
295 static void ocfs2_process_dentry_request(struct inode
*inode
,
297 unsigned int new_nlink
,
299 unsigned int namelen
,
302 struct dentry
*dentry
= NULL
;
304 struct ocfs2_inode_info
*oi
= OCFS2_I(inode
);
306 mlog(0, "parent %llu, namelen = %u, name = %.*s\n",
307 (unsigned long long)parent_blkno
, namelen
, namelen
, name
);
309 spin_lock(&dcache_lock
);
311 /* Another node is removing this name from the system. It is
312 * up to us to find the corresponding dentry and if it exists,
313 * unhash it from the dcache. */
314 list_for_each(p
, &inode
->i_dentry
) {
315 dentry
= list_entry(p
, struct dentry
, d_alias
);
317 if (ocfs2_match_dentry(dentry
, parent_blkno
, namelen
, name
)) {
318 mlog(0, "dentry found: %.*s\n",
319 dentry
->d_name
.len
, dentry
->d_name
.name
);
328 spin_unlock(&dcache_lock
);
335 /* rename votes don't send link counts */
337 mlog(0, "new_nlink = %u\n", new_nlink
);
339 /* We don't have the proper locks here to directly
340 * change i_nlink and besides, the vote is sent
341 * *before* the operation so it may have failed on the
342 * other node. This passes a hint to ocfs2_drop_inode
343 * to force ocfs2_delete_inode, who will take the
344 * proper cluster locks to sort things out. */
345 if (new_nlink
== 0) {
346 spin_lock(&oi
->ip_lock
);
347 oi
->ip_flags
|= OCFS2_INODE_MAYBE_ORPHANED
;
348 spin_unlock(&OCFS2_I(inode
)->ip_lock
);
/*
 * Process one vote request received from another node and send back the
 * response.
 *
 * The header is decoded from big-endian into locals (request, blkno,
 * generation, node_num, plus the orphaned slot for DELETE).  UMOUNT and
 * MOUNT are handled directly on osb.  Other requests look the inode up
 * with ocfs2_ilookup_for_vote() and dispatch on the request type:
 * DELETE goes to ocfs2_process_delete_request(), RENAME and UNLINK go
 * to ocfs2_process_dentry_request().  An invalid request type yields
 * OCFS2_RESPONSE_BAD_MSG.
 *
 * The response is built on the stack, echoing the request's response
 * id, block number and generation, and sent with o2net_send_message().
 * A send error is logged at ML_ERROR unless it is -ETIMEDOUT or
 * -ENOTCONN.
 *
 * NOTE(review): the switch headers, labels/gotos, the first term of the
 * send-error condition and the final inode release are not visible in
 * this listing.
 */
353 static void ocfs2_process_vote(struct ocfs2_super
*osb
,
354 struct ocfs2_vote_msg
*msg
)
356 int net_status
, vote_response
;
357 int orphaned_slot
= 0;
359 unsigned int node_num
, generation
, new_nlink
, namelen
;
360 u64 blkno
, parent_blkno
;
361 enum ocfs2_vote_request request
;
362 struct inode
*inode
= NULL
;
363 struct ocfs2_msg_hdr
*hdr
= &msg
->v_hdr
;
364 struct ocfs2_response_msg response
;
366 /* decode the network mumbo jumbo into local variables. */
367 request
= be32_to_cpu(hdr
->h_request
);
368 blkno
= be64_to_cpu(hdr
->h_blkno
);
369 generation
= be32_to_cpu(hdr
->h_generation
);
370 node_num
= be32_to_cpu(hdr
->h_node_num
);
371 if (request
== OCFS2_VOTE_REQ_DELETE
)
372 orphaned_slot
= be32_to_cpu(msg
->md1
.v_orphaned_slot
);
374 mlog(0, "processing vote: request = %u, blkno = %llu, "
375 "generation = %u, node_num = %u, priv1 = %u\n", request
,
376 (unsigned long long)blkno
, generation
, node_num
,
377 be32_to_cpu(msg
->md1
.v_generic1
));
379 if (!ocfs2_is_valid_vote_request(request
)) {
380 mlog(ML_ERROR
, "Invalid vote request %d from node %u\n",
382 vote_response
= OCFS2_RESPONSE_BAD_MSG
;
386 vote_response
= OCFS2_RESPONSE_OK
;
389 case OCFS2_VOTE_REQ_UMOUNT
:
390 ocfs2_process_umount_request(osb
, node_num
);
392 case OCFS2_VOTE_REQ_MOUNT
:
393 ocfs2_process_mount_request(osb
, node_num
);
396 /* avoids a gcc warning */
400 /* We cannot process the remaining message types before we're
401 * fully mounted. It's perfectly safe however to send a 'yes'
402 * response as we can't possibly have any of the state they're
403 * asking us to modify yet. */
404 if (atomic_read(&osb
->vol_state
) == VOLUME_INIT
)
407 /* If we get here, then the request is against an inode. */
408 inode
= ocfs2_ilookup_for_vote(osb
, blkno
,
409 request
== OCFS2_VOTE_REQ_DELETE
);
411 /* Not finding the inode is perfectly valid - it means we're
412 * not interested in what the other node is about to do to it
413 * so in those cases we automatically respond with an
414 * affirmative. Cluster locking ensures that we won't race
415 * interest in the inode with this vote request. */
419 /* Check generation values. It's possible for us to get a
420 * request against a stale inode. If so then we proceed as if
421 * we had not found an inode in the first place. */
422 if (inode
->i_generation
!= generation
) {
423 mlog(0, "generation passed %u != inode generation = %u, "
424 "ip_flags = %x, ip_blkno = %llu, msg %llu, i_count = %u, "
425 "message type = %u\n", generation
, inode
->i_generation
,
426 OCFS2_I(inode
)->ip_flags
,
427 (unsigned long long)OCFS2_I(inode
)->ip_blkno
,
428 (unsigned long long)blkno
, atomic_read(&inode
->i_count
),
436 case OCFS2_VOTE_REQ_DELETE
:
437 vote_response
= ocfs2_process_delete_request(inode
,
440 case OCFS2_VOTE_REQ_RENAME
:
443 case OCFS2_VOTE_REQ_UNLINK
:
444 parent_blkno
= be64_to_cpu(msg
->v_unlink_parent
);
445 namelen
= be32_to_cpu(msg
->v_unlink_namelen
);
446 /* new_nlink will be ignored in case of a rename vote */
447 new_nlink
= be32_to_cpu(msg
->md1
.v_nlink
);
448 ocfs2_process_dentry_request(inode
, rename
, new_nlink
,
449 parent_blkno
, namelen
,
450 msg
->v_unlink_dirent
);
453 mlog(ML_ERROR
, "node %u, invalid request: %u\n",
455 vote_response
= OCFS2_RESPONSE_BAD_MSG
;
459 /* Response structure is small so we just put it on the stack
460 * and stuff it inline. */
461 memset(&response
, 0, sizeof(struct ocfs2_response_msg
));
462 response
.r_hdr
.h_response_id
= hdr
->h_response_id
;
463 response
.r_hdr
.h_blkno
= hdr
->h_blkno
;
464 response
.r_hdr
.h_generation
= hdr
->h_generation
;
465 response
.r_hdr
.h_node_num
= cpu_to_be32(osb
->node_num
);
466 response
.r_response
= cpu_to_be32(vote_response
);
467 response
.r_orphaned_slot
= cpu_to_be32(orphaned_slot
);
469 net_status
= o2net_send_message(OCFS2_MESSAGE_TYPE_RESPONSE
,
472 sizeof(struct ocfs2_response_msg
),
475 /* We still want to error print for ENOPROTOOPT here. The
476 * sending node shouldn't have unregistered his net handler
477 * without sending an unmount vote 1st */
479 && net_status
!= -ETIMEDOUT
480 && net_status
!= -ENOTCONN
)
481 mlog(ML_ERROR
, "message to node %u fails with error %d!\n",
482 node_num
, net_status
);
/*
 * One pass of work for the vote thread.
 *
 * Under vote_task_lock, vote_wake_sequence is copied into
 * vote_work_sequence and the current blocked_lock_count is sampled.
 * Lock resources are taken off the head of blocked_lock_list and handed
 * to ocfs2_process_blocked_lock() with the spinlock dropped.  Then
 * vote_list is drained while vote_count is non-zero, calling
 * ocfs2_process_vote() on each entry, again with the spinlock dropped
 * around the call.
 *
 * NOTE(review): the loop header for the blocked-lock phase, the
 * vote_count bookkeeping and the release of each work item are not
 * visible in this listing.
 */
488 static void ocfs2_vote_thread_do_work(struct ocfs2_super
*osb
)
490 unsigned long processed
;
491 struct ocfs2_lock_res
*lockres
;
492 struct ocfs2_vote_work
*work
;
496 spin_lock(&osb
->vote_task_lock
);
497 /* grab this early so we know to try again if a state change and
498 * wake happens part-way through our work */
499 osb
->vote_work_sequence
= osb
->vote_wake_sequence
;
501 processed
= osb
->blocked_lock_count
;
503 BUG_ON(list_empty(&osb
->blocked_lock_list
));
505 lockres
= list_entry(osb
->blocked_lock_list
.next
,
506 struct ocfs2_lock_res
, l_blocked_list
);
507 list_del_init(&lockres
->l_blocked_list
);
508 osb
->blocked_lock_count
--;
509 spin_unlock(&osb
->vote_task_lock
);
514 ocfs2_process_blocked_lock(osb
, lockres
);
516 spin_lock(&osb
->vote_task_lock
);
519 while (osb
->vote_count
) {
520 BUG_ON(list_empty(&osb
->vote_list
));
521 work
= list_entry(osb
->vote_list
.next
,
522 struct ocfs2_vote_work
, w_list
);
523 list_del(&work
->w_list
);
525 spin_unlock(&osb
->vote_task_lock
);
527 ocfs2_process_vote(osb
, &work
->w_msg
);
530 spin_lock(&osb
->vote_task_lock
);
532 spin_unlock(&osb
->vote_task_lock
);
/*
 * Check, under vote_task_lock, whether both blocked_lock_list and
 * vote_list are empty.
 * NOTE(review): the result variable and return are not visible in this
 * listing.
 */
537 static int ocfs2_vote_thread_lists_empty(struct ocfs2_super
*osb
)
541 spin_lock(&osb
->vote_task_lock
);
542 if (list_empty(&osb
->blocked_lock_list
) &&
543 list_empty(&osb
->vote_list
))
546 spin_unlock(&osb
->vote_task_lock
);
/*
 * Wake condition for the vote thread: compares vote_work_sequence with
 * vote_wake_sequence under vote_task_lock, so a wake request issued
 * since the last ocfs2_vote_thread_do_work() pass is noticed.
 * NOTE(review): the result variable and return are not visible in this
 * listing.
 */
550 static int ocfs2_vote_thread_should_wake(struct ocfs2_super
*osb
)
554 spin_lock(&osb
->vote_task_lock
);
555 if (osb
->vote_work_sequence
!= osb
->vote_wake_sequence
)
557 spin_unlock(&osb
->vote_task_lock
);
/*
 * Main function of the vote kernel thread; arg is the ocfs2_super.
 *
 * It keeps looping until kthread_should_stop() is true AND both work
 * lists are empty.  Each iteration sleeps on osb->vote_event until
 * either ocfs2_vote_thread_should_wake() or kthread_should_stop() is
 * true, then runs ocfs2_vote_thread_do_work().  osb->vote_task is
 * cleared on the way out.
 */
562 int ocfs2_vote_thread(void *arg
)
565 struct ocfs2_super
*osb
= arg
;
567 /* only quit once we've been asked to stop and there is no more
569 while (!(kthread_should_stop() &&
570 ocfs2_vote_thread_lists_empty(osb
))) {
572 wait_event_interruptible(osb
->vote_event
,
573 ocfs2_vote_thread_should_wake(osb
) ||
574 kthread_should_stop());
576 mlog(0, "vote_thread: awoken\n");
578 ocfs2_vote_thread_do_work(osb
);
581 osb
->vote_task
= NULL
;
/*
 * Allocate a zeroed wait context with GFP_NOFS and initialise its list
 * node, wait queue and node map.  The context is tagged with
 * response_id and starts with no callback.
 * NOTE(review): the allocation-failure check and the return are not
 * visible in this listing.
 */
585 static struct ocfs2_net_wait_ctxt
*ocfs2_new_net_wait_ctxt(unsigned int response_id
)
587 struct ocfs2_net_wait_ctxt
*w
;
589 w
= kcalloc(1, sizeof(*w
), GFP_NOFS
);
595 INIT_LIST_HEAD(&w
->n_list
);
596 init_waitqueue_head(&w
->n_event
);
597 ocfs2_node_map_init(&w
->n_node_map
);
598 w
->n_response_id
= response_id
;
599 w
->n_callback
= NULL
;
/*
 * Produce the next response id by pre-incrementing
 * osb->net_response_ids under net_response_lock.
 */
604 static unsigned int ocfs2_new_response_id(struct ocfs2_super
*osb
)
608 spin_lock(&osb
->net_response_lock
);
609 ret
= ++osb
->net_response_ids
;
610 spin_unlock(&osb
->net_response_lock
);
/*
 * Unlink wait context w from its list while holding
 * osb->net_response_lock.
 */
615 static void ocfs2_dequeue_net_wait_ctxt(struct ocfs2_super
*osb
,
616 struct ocfs2_net_wait_ctxt
*w
)
618 spin_lock(&osb
->net_response_lock
);
619 list_del(&w
->n_list
);
620 spin_unlock(&osb
->net_response_lock
);
/*
 * Append wait context w to osb->net_response_list while holding
 * osb->net_response_lock, making it findable by the response handler.
 */
623 static void ocfs2_queue_net_wait_ctxt(struct ocfs2_super
*osb
,
624 struct ocfs2_net_wait_ctxt
*w
)
626 spin_lock(&osb
->net_response_lock
);
627 list_add_tail(&w
->n_list
,
628 &osb
->net_response_list
);
629 spin_unlock(&osb
->net_response_lock
);
/*
 * Clear node_num from the set of nodes w is still waiting on and wake
 * the sleeper on w->n_event once that set is empty.
 * The caller must hold osb->net_response_lock (asserted).
 */
632 static void __ocfs2_mark_node_responded(struct ocfs2_super
*osb
,
633 struct ocfs2_net_wait_ctxt
*w
,
636 assert_spin_locked(&osb
->net_response_lock
);
638 ocfs2_node_map_clear_bit(osb
, &w
->n_node_map
, node_num
);
639 if (ocfs2_node_map_is_empty(osb
, &w
->n_node_map
))
640 wake_up(&w
->n_event
);
/*
 * Walks every wait context on osb->net_response_list under
 * net_response_lock and marks node_num as having responded in each, so
 * no broadcast keeps waiting on that node.
 */
643 /* Intended to be called from the node down callback, we fake remove
644 * the node from all our response contexts */
645 void ocfs2_remove_node_from_vote_queues(struct ocfs2_super
*osb
,
649 struct ocfs2_net_wait_ctxt
*w
= NULL
;
651 spin_lock(&osb
->net_response_lock
);
653 list_for_each(p
, &osb
->net_response_list
) {
654 w
= list_entry(p
, struct ocfs2_net_wait_ctxt
, n_list
);
656 __ocfs2_mark_node_responded(osb
, w
, node_num
);
659 spin_unlock(&osb
->net_response_lock
);
/*
 * Send request to every node in osb->mounted_map other than this node
 * and wait for all of them to answer.
 *
 * A wait context tagged with response_id is allocated, given the
 * caller's callback and queued before any message is sent.  Each target
 * node is set in w->n_node_map before its message goes out with
 * o2net_send_message().  The function then sleeps on w->n_event until
 * the node map is empty, dequeues the context and stores the aggregate
 * w->n_response in *response.
 *
 * NOTE(review): allocation-failure handling, the reaction to send
 * errors other than the logging shown, the freeing of w and the return
 * value are not visible in this listing.
 */
662 static int ocfs2_broadcast_vote(struct ocfs2_super
*osb
,
663 struct ocfs2_vote_msg
*request
,
664 unsigned int response_id
,
666 struct ocfs2_net_response_cb
*callback
)
668 int status
, i
, remote_err
;
669 struct ocfs2_net_wait_ctxt
*w
= NULL
;
674 w
= ocfs2_new_net_wait_ctxt(response_id
);
680 w
->n_callback
= callback
;
682 /* we're pretty much ready to go at this point, and this fills
683 * in n_response which we need anyway... */
684 ocfs2_queue_net_wait_ctxt(osb
, w
);
686 i
= ocfs2_node_map_iterate(osb
, &osb
->mounted_map
, 0);
688 while (i
!= O2NM_INVALID_NODE_NUM
) {
689 if (i
!= osb
->node_num
) {
690 mlog(0, "trying to send request to node %i\n", i
);
691 ocfs2_node_map_set_bit(osb
, &w
->n_node_map
, i
);
694 status
= o2net_send_message(OCFS2_MESSAGE_TYPE_VOTE
,
700 if (status
== -ETIMEDOUT
) {
701 mlog(0, "remote node %d timed out!\n", i
);
705 if (remote_err
< 0) {
707 mlog(0, "remote error %d on node %d!\n",
718 i
= ocfs2_node_map_iterate(osb
, &osb
->mounted_map
, i
);
719 mlog(0, "next is %d, i am %d\n", i
, osb
->node_num
);
721 mlog(0, "done sending, now waiting on responses...\n");
723 wait_event(w
->n_event
, ocfs2_node_map_is_empty(osb
, &w
->n_node_map
));
725 ocfs2_dequeue_net_wait_ctxt(osb
, w
);
728 *response
= w
->n_response
;
733 ocfs2_dequeue_net_wait_ctxt(osb
, w
);
/*
 * Allocate a zeroed vote message with GFP_NOFS and fill in its header
 * in big-endian form: this node's number, the request type, the block
 * number and the generation.  priv is stored in the message-type
 * dependent slot md1.v_generic1.  An invalid type trips BUG_ON().
 * NOTE(review): the allocation-failure check and the return are not
 * visible in this listing.
 */
741 static struct ocfs2_vote_msg
* ocfs2_new_vote_request(struct ocfs2_super
*osb
,
743 unsigned int generation
,
744 enum ocfs2_vote_request type
,
747 struct ocfs2_vote_msg
*request
;
748 struct ocfs2_msg_hdr
*hdr
;
750 BUG_ON(!ocfs2_is_valid_vote_request(type
));
752 request
= kcalloc(1, sizeof(*request
), GFP_NOFS
);
756 hdr
= &request
->v_hdr
;
757 hdr
->h_node_num
= cpu_to_be32(osb
->node_num
);
758 hdr
->h_request
= cpu_to_be32(type
);
759 hdr
->h_blkno
= cpu_to_be64(blkno
);
760 hdr
->h_generation
= cpu_to_be32(generation
);
762 request
->md1
.v_generic1
= cpu_to_be32(priv
);
/*
 * Obtains a fresh response id, stores it big-endian in the request
 * header and broadcasts the request with ocfs2_broadcast_vote(), which
 * fills in the local 'response'.
 * NOTE(review): how status and response are combined into the return
 * value is not visible in this listing.
 */
768 /* Complete the buildup of a new vote request and process the
769 * broadcast return value. */
770 static int ocfs2_do_request_vote(struct ocfs2_super
*osb
,
771 struct ocfs2_vote_msg
*request
,
772 struct ocfs2_net_response_cb
*callback
)
774 int status
, response
;
775 unsigned int response_id
;
776 struct ocfs2_msg_hdr
*hdr
;
778 response_id
= ocfs2_new_response_id(osb
);
780 hdr
= &request
->v_hdr
;
781 hdr
->h_response_id
= cpu_to_be32(response_id
);
783 status
= ocfs2_broadcast_vote(osb
, request
, response_id
, &response
,
/*
 * Send an inode-related vote request to the other mounted nodes.
 *
 * The vote loop repeats while status is -EAGAIN.  Each round checks for
 * a pending signal (unless the volume is mounted with
 * OCFS2_MOUNT_NOINTR), takes the super lock, and calls
 * ocfs2_do_request_vote() only when this node is not the sole member of
 * osb->mounted_map, then drops the super lock.
 *
 * NOTE(review): the action taken for a new inode
 * (ocfs2_inode_is_new()), the initial status value, the bail-out paths
 * and the return are not visible in this listing.
 */
796 static int ocfs2_request_vote(struct inode
*inode
,
797 struct ocfs2_vote_msg
*request
,
798 struct ocfs2_net_response_cb
*callback
)
801 struct ocfs2_super
*osb
= OCFS2_SB(inode
->i_sb
);
803 if (ocfs2_inode_is_new(inode
))
807 while (status
== -EAGAIN
) {
808 if (!(osb
->s_mount_opt
& OCFS2_MOUNT_NOINTR
) &&
809 signal_pending(current
))
812 status
= ocfs2_super_lock(osb
, 0);
819 if (!ocfs2_node_map_is_only(osb
, &osb
->mounted_map
,
821 status
= ocfs2_do_request_vote(osb
, request
, callback
);
823 ocfs2_super_unlock(osb
, 0);
/*
 * Response callback for delete votes; priv is the inode being deleted.
 *
 * Decodes the orphaned slot and responding node from resp.  When the
 * responder supplied a valid slot, it is recorded in the inode's
 * ip_orphaned_slot under ip_lock.  mlog_bug_on_msg() fires if the inode
 * already had a different valid slot recorded.
 * NOTE(review): the statement executed when the slot is
 * OCFS2_INVALID_SLOT is not visible in this listing.
 */
828 static void ocfs2_delete_response_cb(void *priv
,
829 struct ocfs2_response_msg
*resp
)
831 int orphaned_slot
, node
;
832 struct inode
*inode
= priv
;
834 orphaned_slot
= be32_to_cpu(resp
->r_orphaned_slot
);
835 node
= be32_to_cpu(resp
->r_hdr
.h_node_num
);
836 mlog(0, "node %d tells us that inode %llu is orphaned in slot %d\n",
837 node
, (unsigned long long)OCFS2_I(inode
)->ip_blkno
,
840 /* The other node may not actually know which slot the inode
842 if (orphaned_slot
== OCFS2_INVALID_SLOT
)
845 /* Ok, the responding node knows which slot this inode is
846 * orphaned in. We verify that the information is correct and
847 * then record this in the inode. ocfs2_delete_inode will use
848 * this information to determine which lock to take. */
849 spin_lock(&OCFS2_I(inode
)->ip_lock
);
850 mlog_bug_on_msg(OCFS2_I(inode
)->ip_orphaned_slot
!= orphaned_slot
&&
851 OCFS2_I(inode
)->ip_orphaned_slot
852 != OCFS2_INVALID_SLOT
, "Inode %llu: Node %d says it's "
853 "orphaned in slot %d, we think it's in %d\n",
854 (unsigned long long)OCFS2_I(inode
)->ip_blkno
,
855 be32_to_cpu(resp
->r_hdr
.h_node_num
),
856 orphaned_slot
, OCFS2_I(inode
)->ip_orphaned_slot
);
858 OCFS2_I(inode
)->ip_orphaned_slot
= orphaned_slot
;
859 spin_unlock(&OCFS2_I(inode
)->ip_lock
);
/*
 * Ask the other nodes to vote on deleting inode.
 *
 * The inode's currently recorded orphaned slot is read under ip_lock
 * and sent along as the request's private value.  Responses are routed
 * to ocfs2_delete_response_cb() with the inode as its private pointer.
 * NOTE(review): the allocation-failure check, the release of the
 * request and the return are not visible in this listing.
 */
862 int ocfs2_request_delete_vote(struct inode
*inode
)
864 int orphaned_slot
, status
;
865 struct ocfs2_net_response_cb delete_cb
;
866 struct ocfs2_super
*osb
= OCFS2_SB(inode
->i_sb
);
867 struct ocfs2_vote_msg
*request
;
869 spin_lock(&OCFS2_I(inode
)->ip_lock
);
870 orphaned_slot
= OCFS2_I(inode
)->ip_orphaned_slot
;
871 spin_unlock(&OCFS2_I(inode
)->ip_lock
);
873 delete_cb
.rc_cb
= ocfs2_delete_response_cb
;
874 delete_cb
.rc_priv
= inode
;
876 mlog(0, "Inode %llu, we start thinking orphaned slot is %d\n",
877 (unsigned long long)OCFS2_I(inode
)->ip_blkno
, orphaned_slot
);
880 request
= ocfs2_new_vote_request(osb
, OCFS2_I(inode
)->ip_blkno
,
882 OCFS2_VOTE_REQ_DELETE
, orphaned_slot
);
884 status
= ocfs2_request_vote(inode
, request
, &delete_cb
);
/*
 * Fill in the dentry-identifying part of an unlink/rename request: the
 * parent directory's block number, the name length (both big-endian)
 * and the name bytes copied into v_unlink_dirent.
 * NOTE(review): the length argument of the memcpy() is not visible in
 * this listing.
 */
892 static void ocfs2_setup_unlink_vote(struct ocfs2_vote_msg
*request
,
893 struct dentry
*dentry
)
895 struct inode
*parent
= dentry
->d_parent
->d_inode
;
897 /* We need some values which will uniquely identify a dentry
898 * on the other nodes so that they can find it and run
899 * d_delete against it. Parent directory block and full name
902 mlog(0, "unlink/rename request: parent: %llu name: %.*s\n",
903 (unsigned long long)OCFS2_I(parent
)->ip_blkno
, dentry
->d_name
.len
,
904 dentry
->d_name
.name
);
906 request
->v_unlink_parent
= cpu_to_be64(OCFS2_I(parent
)->ip_blkno
);
907 request
->v_unlink_namelen
= cpu_to_be32(dentry
->d_name
.len
);
908 memcpy(request
->v_unlink_dirent
, dentry
->d_name
.name
,
/*
 * Ask the other nodes to vote on unlinking dentry from inode; nlink is
 * sent as the request's private value.
 *
 * Returns -ENAMETOOLONG without sending anything when the dentry name
 * is longer than OCFS2_VOTE_FILENAME_LEN.  No response callback is
 * used.
 * NOTE(review): the allocation-failure check, the release of the
 * request and the final return are not visible in this listing.
 */
912 int ocfs2_request_unlink_vote(struct inode
*inode
,
913 struct dentry
*dentry
,
917 struct ocfs2_super
*osb
= OCFS2_SB(inode
->i_sb
);
918 struct ocfs2_vote_msg
*request
;
920 if (dentry
->d_name
.len
> OCFS2_VOTE_FILENAME_LEN
)
921 return -ENAMETOOLONG
;
924 request
= ocfs2_new_vote_request(osb
, OCFS2_I(inode
)->ip_blkno
,
926 OCFS2_VOTE_REQ_UNLINK
, nlink
);
928 ocfs2_setup_unlink_vote(request
, dentry
);
930 status
= ocfs2_request_vote(inode
, request
, NULL
);
/*
 * Ask the other nodes to vote on renaming dentry of inode.  The request
 * is built like an unlink vote but with type OCFS2_VOTE_REQ_RENAME and
 * a private value of 0.
 *
 * Returns -ENAMETOOLONG without sending anything when the dentry name
 * is longer than OCFS2_VOTE_FILENAME_LEN.  No response callback is
 * used.
 * NOTE(review): the allocation-failure check, the release of the
 * request and the final return are not visible in this listing.
 */
937 int ocfs2_request_rename_vote(struct inode
*inode
,
938 struct dentry
*dentry
)
941 struct ocfs2_super
*osb
= OCFS2_SB(inode
->i_sb
);
942 struct ocfs2_vote_msg
*request
;
944 if (dentry
->d_name
.len
> OCFS2_VOTE_FILENAME_LEN
)
945 return -ENAMETOOLONG
;
948 request
= ocfs2_new_vote_request(osb
, OCFS2_I(inode
)->ip_blkno
,
950 OCFS2_VOTE_REQ_RENAME
, 0);
952 ocfs2_setup_unlink_vote(request
, dentry
);
954 status
= ocfs2_request_vote(inode
, request
, NULL
);
/*
 * Announce this node's mount to the other nodes with a MOUNT vote
 * (block number 0, generation 0, private value 0).
 *
 * The vote loop repeats while status is -EAGAIN.  A pending signal
 * makes status -ERESTARTSYS unless the volume is mounted with
 * OCFS2_MOUNT_NOINTR.  The mounted map is tested with
 * ocfs2_node_map_is_only() before ocfs2_do_request_vote() is called.
 * NOTE(review): the allocation-failure check, the loop exits, the
 * release of the request and the return are not visible in this
 * listing.
 */
961 int ocfs2_request_mount_vote(struct ocfs2_super
*osb
)
964 struct ocfs2_vote_msg
*request
= NULL
;
966 request
= ocfs2_new_vote_request(osb
, 0ULL, 0,
967 OCFS2_VOTE_REQ_MOUNT
, 0);
974 while (status
== -EAGAIN
) {
975 if (!(osb
->s_mount_opt
& OCFS2_MOUNT_NOINTR
) &&
976 signal_pending(current
)) {
977 status
= -ERESTARTSYS
;
981 if (ocfs2_node_map_is_only(osb
, &osb
->mounted_map
,
987 status
= ocfs2_do_request_vote(osb
, request
, NULL
);
/*
 * Announce this node's unmount to the other nodes with an UMOUNT vote
 * (block number 0, generation 0, private value 0).
 *
 * Same shape as the mount vote, except that pending signals are
 * deliberately not checked inside the -EAGAIN retry loop.
 * NOTE(review): the allocation-failure check, the loop exits, the
 * release of the request and the return are not visible in this
 * listing.
 */
995 int ocfs2_request_umount_vote(struct ocfs2_super
*osb
)
998 struct ocfs2_vote_msg
*request
= NULL
;
1000 request
= ocfs2_new_vote_request(osb
, 0ULL, 0,
1001 OCFS2_VOTE_REQ_UMOUNT
, 0);
1008 while (status
== -EAGAIN
) {
1009 /* Do not check signals on this vote... We really want
1010 * this one to go all the way through. */
1012 if (ocfs2_node_map_is_only(osb
, &osb
->mounted_map
,
1018 status
= ocfs2_do_request_vote(osb
, request
, NULL
);
/*
 * Linear search of osb->net_response_list for the wait context whose
 * n_response_id equals response_id.  The only visible caller holds
 * net_response_lock around the call.
 * NOTE(review): the statements for the match and no-match cases and
 * the return are not visible in this listing.
 */
1026 /* TODO: This should eventually be a hash table! */
1027 static struct ocfs2_net_wait_ctxt
* __ocfs2_find_net_wait_ctxt(struct ocfs2_super
*osb
,
1030 struct list_head
*p
;
1031 struct ocfs2_net_wait_ctxt
*w
= NULL
;
1033 list_for_each(p
, &osb
->net_response_list
) {
1034 w
= list_entry(p
, struct ocfs2_net_wait_ctxt
, n_list
);
1035 if (response_id
== w
->n_response_id
)
/*
 * Maps a wire response code to a local errno-style value by switching
 * on it.  OCFS2_RESPONSE_OK and OCFS2_RESPONSE_BUSY are the visible
 * cases.
 * NOTE(review): the values each case produces and the default case are
 * not visible in this listing.
 */
1043 /* Translate response codes into local node errno values */
1044 static inline int ocfs2_translate_response(int response
)
1049 case OCFS2_RESPONSE_OK
:
1053 case OCFS2_RESPONSE_BUSY
:
/*
 * o2net handler for OCFS2_MESSAGE_TYPE_RESPONSE; data is the
 * ocfs2_super the handler was registered with.
 *
 * The response id and sending node are decoded from the message and
 * the matching wait context is looked up under net_response_lock.  The
 * first non-zero response status is kept in w->n_response.  The
 * context's callback is invoked with net_response_lock dropped, and the
 * lock is retaken before the sending node is marked as responded,
 * which may wake the waiting broadcaster.
 *
 * NOTE(review): the assignment target of the
 * ocfs2_translate_response() call, the not-found exit, the test that
 * guards the callback invocation and the return are not visible in
 * this listing.
 */
1064 static int ocfs2_handle_response_message(struct o2net_msg
*msg
,
1068 unsigned int response_id
, node_num
;
1069 int response_status
;
1070 struct ocfs2_super
*osb
= data
;
1071 struct ocfs2_response_msg
*resp
;
1072 struct ocfs2_net_wait_ctxt
* w
;
1073 struct ocfs2_net_response_cb
*resp_cb
;
1075 resp
= (struct ocfs2_response_msg
*) msg
->buf
;
1077 response_id
= be32_to_cpu(resp
->r_hdr
.h_response_id
);
1078 node_num
= be32_to_cpu(resp
->r_hdr
.h_node_num
);
1080 ocfs2_translate_response(be32_to_cpu(resp
->r_response
));
1082 mlog(0, "received response message:\n");
1083 mlog(0, "h_response_id = %u\n", response_id
);
1084 mlog(0, "h_request = %u\n", be32_to_cpu(resp
->r_hdr
.h_request
));
1085 mlog(0, "h_blkno = %llu\n",
1086 (unsigned long long)be64_to_cpu(resp
->r_hdr
.h_blkno
));
1087 mlog(0, "h_generation = %u\n", be32_to_cpu(resp
->r_hdr
.h_generation
));
1088 mlog(0, "h_node_num = %u\n", node_num
);
1089 mlog(0, "r_response = %d\n", response_status
);
1091 spin_lock(&osb
->net_response_lock
);
1092 w
= __ocfs2_find_net_wait_ctxt(osb
, response_id
);
1094 mlog(0, "request not found!\n");
1097 resp_cb
= w
->n_callback
;
1099 if (response_status
&& (!w
->n_response
)) {
1100 /* we only really need one negative response so don't
1102 w
->n_response
= response_status
;
1106 spin_unlock(&osb
->net_response_lock
);
1108 resp_cb
->rc_cb(resp_cb
->rc_priv
, resp
);
1110 spin_lock(&osb
->net_response_lock
);
1113 __ocfs2_mark_node_responded(osb
, w
, node_num
);
1115 spin_unlock(&osb
->net_response_lock
);
/*
 * o2net handler for OCFS2_MESSAGE_TYPE_VOTE; data is the ocfs2_super
 * the handler was registered with.
 *
 * The vote is not processed here.  A work item is allocated with
 * GFP_NOFS, the message is copied into it, it is appended to
 * osb->vote_list under vote_task_lock, and the vote thread is kicked.
 *
 * NOTE(review): the allocation-failure check, the vote_count
 * bookkeeping and the return are not visible in this listing.
 */
1120 static int ocfs2_handle_vote_message(struct o2net_msg
*msg
,
1125 struct ocfs2_super
*osb
= data
;
1126 struct ocfs2_vote_work
*work
;
1128 work
= kmalloc(sizeof(struct ocfs2_vote_work
), GFP_NOFS
);
1135 INIT_LIST_HEAD(&work
->w_list
);
1136 memcpy(&work
->w_msg
, msg
->buf
, sizeof(struct ocfs2_vote_msg
));
1138 mlog(0, "scheduling vote request:\n");
1139 mlog(0, "h_response_id = %u\n",
1140 be32_to_cpu(work
->w_msg
.v_hdr
.h_response_id
));
1141 mlog(0, "h_request = %u\n", be32_to_cpu(work
->w_msg
.v_hdr
.h_request
));
1142 mlog(0, "h_blkno = %llu\n",
1143 (unsigned long long)be64_to_cpu(work
->w_msg
.v_hdr
.h_blkno
));
1144 mlog(0, "h_generation = %u\n",
1145 be32_to_cpu(work
->w_msg
.v_hdr
.h_generation
));
1146 mlog(0, "h_node_num = %u\n",
1147 be32_to_cpu(work
->w_msg
.v_hdr
.h_node_num
));
1148 mlog(0, "v_generic1 = %u\n", be32_to_cpu(work
->w_msg
.md1
.v_generic1
));
1150 spin_lock(&osb
->vote_task_lock
);
1151 list_add_tail(&work
->w_list
, &osb
->vote_list
);
1153 spin_unlock(&osb
->vote_task_lock
);
1155 ocfs2_kick_vote_thread(osb
);
/*
 * Unregister every o2net handler recorded on osb->osb_net_handlers,
 * then log an error if any wait context is still queued on
 * osb->net_response_list.
 */
1162 void ocfs2_unregister_net_handlers(struct ocfs2_super
*osb
)
1167 o2net_unregister_handler_list(&osb
->osb_net_handlers
);
1169 if (!list_empty(&osb
->net_response_list
))
1170 mlog(ML_ERROR
, "net response list not empty!\n");
/*
 * Register the two o2net handlers used by this file, both with osb as
 * their private data and both recorded on osb->osb_net_handlers:
 * ocfs2_handle_response_message() for OCFS2_MESSAGE_TYPE_RESPONSE and
 * ocfs2_handle_vote_message() for OCFS2_MESSAGE_TYPE_VOTE.
 * ocfs2_unregister_net_handlers() is called on the visible tail of the
 * function.
 * NOTE(review): the error checks after each registration, the
 * condition guarding the unregister call and the return are not
 * visible in this listing.
 */
1175 int ocfs2_register_net_handlers(struct ocfs2_super
*osb
)
1179 status
= o2net_register_handler(OCFS2_MESSAGE_TYPE_RESPONSE
,
1181 sizeof(struct ocfs2_response_msg
),
1182 ocfs2_handle_response_message
,
1183 osb
, &osb
->osb_net_handlers
);
1189 status
= o2net_register_handler(OCFS2_MESSAGE_TYPE_VOTE
,
1191 sizeof(struct ocfs2_vote_msg
),
1192 ocfs2_handle_vote_message
,
1193 osb
, &osb
->osb_net_handlers
);
1200 ocfs2_unregister_net_handlers(osb
);