1 /* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
8 * Copyright (C) 2003, 2004 Oracle. All rights reserved.
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 02111-1307, USA.
26 #include <linux/types.h>
27 #include <linux/slab.h>
28 #include <linux/highmem.h>
29 #include <linux/smp_lock.h>
30 #include <linux/kthread.h>
32 #include <cluster/heartbeat.h>
33 #include <cluster/nodemanager.h>
34 #include <cluster/tcp.h>
36 #include <dlm/dlmapi.h>
38 #define MLOG_MASK_PREFIX ML_VOTE
39 #include <cluster/masklog.h>
45 #include "extent_map.h"
46 #include "heartbeat.h"
52 #include "buffer_head_io.h"
54 #define OCFS2_MESSAGE_TYPE_VOTE (0x1)
55 #define OCFS2_MESSAGE_TYPE_RESPONSE (0x2)
58 __be32 h_response_id
; /* used to lookup message handle on sending
63 __be32 h_node_num
; /* node sending this particular message. */
66 /* OCFS2_MAX_FILENAME_LEN is 255 characters, but we want to align this
68 #define OCFS2_VOTE_FILENAME_LEN 256
71 struct ocfs2_msg_hdr v_hdr
;
74 __be32 v_orphaned_slot
; /* Used during delete votes */
75 __be32 v_nlink
; /* Used during unlink votes */
76 } md1
; /* Message type dependent 1 */
77 __be32 v_unlink_namelen
;
78 __be64 v_unlink_parent
;
79 u8 v_unlink_dirent
[OCFS2_VOTE_FILENAME_LEN
];
82 /* Responses are given these values to maintain backwards
83 * compatibility with older ocfs2 versions */
84 #define OCFS2_RESPONSE_OK (0)
85 #define OCFS2_RESPONSE_BUSY (-16)
86 #define OCFS2_RESPONSE_BAD_MSG (-22)
88 struct ocfs2_response_msg
90 struct ocfs2_msg_hdr r_hdr
;
92 __be32 r_orphaned_slot
;
95 struct ocfs2_vote_work
{
96 struct list_head w_list
;
97 struct ocfs2_vote_msg w_msg
;
/*
 * Request types stored (big-endian) in the h_request field of a vote
 * message header.  The numeric values travel between nodes, so existing
 * entries must keep their positions; add new request types immediately
 * before OCFS2_VOTE_REQ_LAST.
 */
enum ocfs2_vote_request {
	OCFS2_VOTE_REQ_INVALID = 0,	/* lower sentinel, never sent */
	OCFS2_VOTE_REQ_DELETE,
	OCFS2_VOTE_REQ_UNLINK,
	OCFS2_VOTE_REQ_RENAME,
	OCFS2_VOTE_REQ_MOUNT,
	OCFS2_VOTE_REQ_UMOUNT,
	OCFS2_VOTE_REQ_LAST		/* upper sentinel, not a request */
};

/*
 * Range-check a decoded request value.
 *
 * Returns 1 when request lies strictly between the two sentinels
 * (OCFS2_VOTE_REQ_INVALID and OCFS2_VOTE_REQ_LAST), 0 otherwise.
 */
static inline int ocfs2_is_valid_vote_request(int request)
{
	return request > OCFS2_VOTE_REQ_INVALID &&
	       request < OCFS2_VOTE_REQ_LAST;
}
/*
 * Hook invoked for a received response message.  It is handed the
 * private pointer registered alongside it and the response itself.
 */
typedef void (*ocfs2_net_response_callback)(void *priv,
					    struct ocfs2_response_msg *resp);
118 struct ocfs2_net_response_cb
{
119 ocfs2_net_response_callback rc_cb
;
123 struct ocfs2_net_wait_ctxt
{
124 struct list_head n_list
;
126 wait_queue_head_t n_event
;
127 struct ocfs2_node_map n_node_map
;
128 int n_response
; /* an aggregate response. 0 if
129 * all nodes are go, < 0 on any
130 * negative response from any
131 * node or network error. */
132 struct ocfs2_net_response_cb
*n_callback
;
135 static void ocfs2_process_mount_request(struct ocfs2_super
*osb
,
136 unsigned int node_num
)
138 mlog(0, "MOUNT vote from node %u\n", node_num
);
139 /* The other node only sends us this message when he has an EX
140 * on the superblock, so our recovery threads (if having been
141 * launched) are waiting on it.*/
142 ocfs2_recovery_map_clear(osb
, node_num
);
143 ocfs2_node_map_set_bit(osb
, &osb
->mounted_map
, node_num
);
145 /* We clear the umount map here because a node may have been
146 * previously mounted, safely unmounted but never stopped
147 * heartbeating - in which case we'd have a stale entry. */
148 ocfs2_node_map_clear_bit(osb
, &osb
->umount_map
, node_num
);
151 static void ocfs2_process_umount_request(struct ocfs2_super
*osb
,
152 unsigned int node_num
)
154 mlog(0, "UMOUNT vote from node %u\n", node_num
);
155 ocfs2_node_map_clear_bit(osb
, &osb
->mounted_map
, node_num
);
156 ocfs2_node_map_set_bit(osb
, &osb
->umount_map
, node_num
);
159 void ocfs2_mark_inode_remotely_deleted(struct inode
*inode
)
161 struct ocfs2_inode_info
*oi
= OCFS2_I(inode
);
163 assert_spin_locked(&oi
->ip_lock
);
164 /* We set the SKIP_DELETE flag on the inode so we don't try to
165 * delete it in delete_inode ourselves, thus avoiding
166 * unecessary lock pinging. If the other node failed to wipe
167 * the inode as a result of a crash, then recovery will pick
169 oi
->ip_flags
|= OCFS2_INODE_DELETED
|OCFS2_INODE_SKIP_DELETE
;
172 static int ocfs2_process_delete_request(struct inode
*inode
,
175 int response
= OCFS2_RESPONSE_BUSY
;
177 mlog(0, "DELETE vote on inode %lu, read lnk_cnt = %u, slot = %d\n",
178 inode
->i_ino
, inode
->i_nlink
, *orphaned_slot
);
180 spin_lock(&OCFS2_I(inode
)->ip_lock
);
182 /* Whatever our vote response is, we want to make sure that
183 * the orphaned slot is recorded properly on this node *and*
184 * on the requesting node. Technically, if the requesting node
185 * did not know which slot the inode is orphaned in but we
186 * respond with BUSY he doesn't actually need the orphaned
187 * slot, but it doesn't hurt to do it here anyway. */
188 if ((*orphaned_slot
) != OCFS2_INVALID_SLOT
) {
189 mlog_bug_on_msg(OCFS2_I(inode
)->ip_orphaned_slot
!=
190 OCFS2_INVALID_SLOT
&&
191 OCFS2_I(inode
)->ip_orphaned_slot
!=
193 "Inode %"MLFu64
": This node thinks it's "
194 "orphaned in slot %d, messaged it's in %d\n",
195 OCFS2_I(inode
)->ip_blkno
,
196 OCFS2_I(inode
)->ip_orphaned_slot
,
199 mlog(0, "Setting orphaned slot for inode %"MLFu64
" to %d\n",
200 OCFS2_I(inode
)->ip_blkno
, *orphaned_slot
);
202 OCFS2_I(inode
)->ip_orphaned_slot
= *orphaned_slot
;
204 mlog(0, "Sending back orphaned slot %d for inode %"MLFu64
"\n",
205 OCFS2_I(inode
)->ip_orphaned_slot
,
206 OCFS2_I(inode
)->ip_blkno
);
208 *orphaned_slot
= OCFS2_I(inode
)->ip_orphaned_slot
;
211 /* vote no if the file is still open. */
212 if (OCFS2_I(inode
)->ip_open_count
) {
213 mlog(0, "open count = %u\n",
214 OCFS2_I(inode
)->ip_open_count
);
215 spin_unlock(&OCFS2_I(inode
)->ip_lock
);
218 spin_unlock(&OCFS2_I(inode
)->ip_lock
);
220 /* directories are a bit ugly... What if someone is sitting in
221 * it? We want to make sure the inode is removed completely as
222 * a result of the iput in process_vote. */
223 if (S_ISDIR(inode
->i_mode
) && (atomic_read(&inode
->i_count
) != 1)) {
224 mlog(0, "i_count = %u\n", atomic_read(&inode
->i_count
));
228 if (filemap_fdatawrite(inode
->i_mapping
)) {
229 mlog(ML_ERROR
, "Could not sync inode %"MLFu64
" for delete!\n",
230 OCFS2_I(inode
)->ip_blkno
);
233 sync_mapping_buffers(inode
->i_mapping
);
234 truncate_inode_pages(inode
->i_mapping
, 0);
235 ocfs2_extent_map_trunc(inode
, 0);
237 spin_lock(&OCFS2_I(inode
)->ip_lock
);
238 /* double check open count - someone might have raced this
239 * thread into ocfs2_file_open while we were writing out
240 * data. If we're to allow a wipe of this inode now, we *must*
241 * hold the spinlock until we've marked it. */
242 if (OCFS2_I(inode
)->ip_open_count
) {
243 mlog(0, "Raced to wipe! open count = %u\n",
244 OCFS2_I(inode
)->ip_open_count
);
245 spin_unlock(&OCFS2_I(inode
)->ip_lock
);
249 /* Mark the inode as being wiped from disk. */
250 ocfs2_mark_inode_remotely_deleted(inode
);
251 spin_unlock(&OCFS2_I(inode
)->ip_lock
);
253 /* Not sure this is necessary anymore. */
254 d_prune_aliases(inode
);
256 /* If we get here, then we're voting 'yes', so commit the
257 * delete on our side. */
258 response
= OCFS2_RESPONSE_OK
;
263 static int ocfs2_match_dentry(struct dentry
*dentry
,
265 unsigned int namelen
,
268 struct inode
*parent
;
270 if (!dentry
->d_parent
) {
271 mlog(0, "Detached from parent.\n");
275 parent
= dentry
->d_parent
->d_inode
;
276 /* Negative parent dentry? */
280 /* Name is in a different directory. */
281 if (OCFS2_I(parent
)->ip_blkno
!= parent_blkno
)
284 if (dentry
->d_name
.len
!= namelen
)
287 /* comparison above guarantees this is safe. */
288 if (memcmp(dentry
->d_name
.name
, name
, namelen
))
294 static void ocfs2_process_dentry_request(struct inode
*inode
,
296 unsigned int new_nlink
,
298 unsigned int namelen
,
301 struct dentry
*dentry
= NULL
;
303 struct ocfs2_inode_info
*oi
= OCFS2_I(inode
);
305 mlog(0, "parent %"MLFu64
", namelen = %u, name = %.*s\n", parent_blkno
,
306 namelen
, namelen
, name
);
308 spin_lock(&dcache_lock
);
310 /* Another node is removing this name from the system. It is
311 * up to us to find the corresponding dentry and if it exists,
312 * unhash it from the dcache. */
313 list_for_each(p
, &inode
->i_dentry
) {
314 dentry
= list_entry(p
, struct dentry
, d_alias
);
316 if (ocfs2_match_dentry(dentry
, parent_blkno
, namelen
, name
)) {
317 mlog(0, "dentry found: %.*s\n",
318 dentry
->d_name
.len
, dentry
->d_name
.name
);
327 spin_unlock(&dcache_lock
);
334 /* rename votes don't send link counts */
336 mlog(0, "new_nlink = %u\n", new_nlink
);
338 /* We don't have the proper locks here to directly
339 * change i_nlink and besides, the vote is sent
340 * *before* the operation so it may have failed on the
341 * other node. This passes a hint to ocfs2_drop_inode
342 * to force ocfs2_delete_inode, who will take the
343 * proper cluster locks to sort things out. */
344 if (new_nlink
== 0) {
345 spin_lock(&oi
->ip_lock
);
346 oi
->ip_flags
|= OCFS2_INODE_MAYBE_ORPHANED
;
347 spin_unlock(&OCFS2_I(inode
)->ip_lock
);
352 static void ocfs2_process_vote(struct ocfs2_super
*osb
,
353 struct ocfs2_vote_msg
*msg
)
355 int net_status
, vote_response
;
356 int orphaned_slot
= 0;
358 unsigned int node_num
, generation
, new_nlink
, namelen
;
359 u64 blkno
, parent_blkno
;
360 enum ocfs2_vote_request request
;
361 struct inode
*inode
= NULL
;
362 struct ocfs2_msg_hdr
*hdr
= &msg
->v_hdr
;
363 struct ocfs2_response_msg response
;
365 /* decode the network mumbo jumbo into local variables. */
366 request
= be32_to_cpu(hdr
->h_request
);
367 blkno
= be64_to_cpu(hdr
->h_blkno
);
368 generation
= be32_to_cpu(hdr
->h_generation
);
369 node_num
= be32_to_cpu(hdr
->h_node_num
);
370 if (request
== OCFS2_VOTE_REQ_DELETE
)
371 orphaned_slot
= be32_to_cpu(msg
->md1
.v_orphaned_slot
);
373 mlog(0, "processing vote: request = %u, blkno = %"MLFu64
", "
374 "generation = %u, node_num = %u, priv1 = %u\n", request
,
375 blkno
, generation
, node_num
, be32_to_cpu(msg
->md1
.v_generic1
));
377 if (!ocfs2_is_valid_vote_request(request
)) {
378 mlog(ML_ERROR
, "Invalid vote request %d from node %u\n",
380 vote_response
= OCFS2_RESPONSE_BAD_MSG
;
384 vote_response
= OCFS2_RESPONSE_OK
;
387 case OCFS2_VOTE_REQ_UMOUNT
:
388 ocfs2_process_umount_request(osb
, node_num
);
390 case OCFS2_VOTE_REQ_MOUNT
:
391 ocfs2_process_mount_request(osb
, node_num
);
394 /* avoids a gcc warning */
398 /* We cannot process the remaining message types before we're
399 * fully mounted. It's perfectly safe however to send a 'yes'
400 * response as we can't possibly have any of the state they're
401 * asking us to modify yet. */
402 if (atomic_read(&osb
->vol_state
) == VOLUME_INIT
)
405 /* If we get here, then the request is against an inode. */
406 inode
= ocfs2_ilookup_for_vote(osb
, blkno
,
407 request
== OCFS2_VOTE_REQ_DELETE
);
409 /* Not finding the inode is perfectly valid - it means we're
410 * not interested in what the other node is about to do to it
411 * so in those cases we automatically respond with an
412 * affirmative. Cluster locking ensures that we won't race
413 * interest in the inode with this vote request. */
417 /* Check generation values. It's possible for us to get a
418 * request against a stale inode. If so then we proceed as if
419 * we had not found an inode in the first place. */
420 if (inode
->i_generation
!= generation
) {
421 mlog(0, "generation passed %u != inode generation = %u, "
422 "ip_flags = %x, ip_blkno = %"MLFu64
", msg %"MLFu64
", "
423 "i_count = %u, message type = %u\n",
424 generation
, inode
->i_generation
, OCFS2_I(inode
)->ip_flags
,
425 OCFS2_I(inode
)->ip_blkno
, blkno
,
426 atomic_read(&inode
->i_count
), request
);
433 case OCFS2_VOTE_REQ_DELETE
:
434 vote_response
= ocfs2_process_delete_request(inode
,
437 case OCFS2_VOTE_REQ_RENAME
:
440 case OCFS2_VOTE_REQ_UNLINK
:
441 parent_blkno
= be64_to_cpu(msg
->v_unlink_parent
);
442 namelen
= be32_to_cpu(msg
->v_unlink_namelen
);
443 /* new_nlink will be ignored in case of a rename vote */
444 new_nlink
= be32_to_cpu(msg
->md1
.v_nlink
);
445 ocfs2_process_dentry_request(inode
, rename
, new_nlink
,
446 parent_blkno
, namelen
,
447 msg
->v_unlink_dirent
);
450 mlog(ML_ERROR
, "node %u, invalid request: %u\n",
452 vote_response
= OCFS2_RESPONSE_BAD_MSG
;
456 /* Response structure is small so we just put it on the stack
457 * and stuff it inline. */
458 memset(&response
, 0, sizeof(struct ocfs2_response_msg
));
459 response
.r_hdr
.h_response_id
= hdr
->h_response_id
;
460 response
.r_hdr
.h_blkno
= hdr
->h_blkno
;
461 response
.r_hdr
.h_generation
= hdr
->h_generation
;
462 response
.r_hdr
.h_node_num
= cpu_to_be32(osb
->node_num
);
463 response
.r_response
= cpu_to_be32(vote_response
);
464 response
.r_orphaned_slot
= cpu_to_be32(orphaned_slot
);
466 net_status
= o2net_send_message(OCFS2_MESSAGE_TYPE_RESPONSE
,
469 sizeof(struct ocfs2_response_msg
),
472 /* We still want to error print for ENOPROTOOPT here. The
473 * sending node shouldn't have unregistered his net handler
474 * without sending an unmount vote 1st */
476 && net_status
!= -ETIMEDOUT
477 && net_status
!= -ENOTCONN
)
478 mlog(ML_ERROR
, "message to node %u fails with error %d!\n",
479 node_num
, net_status
);
485 static void ocfs2_vote_thread_do_work(struct ocfs2_super
*osb
)
487 unsigned long processed
;
488 struct ocfs2_lock_res
*lockres
;
489 struct ocfs2_vote_work
*work
;
493 spin_lock(&osb
->vote_task_lock
);
494 /* grab this early so we know to try again if a state change and
495 * wake happens part-way through our work */
496 osb
->vote_work_sequence
= osb
->vote_wake_sequence
;
498 processed
= osb
->blocked_lock_count
;
500 BUG_ON(list_empty(&osb
->blocked_lock_list
));
502 lockres
= list_entry(osb
->blocked_lock_list
.next
,
503 struct ocfs2_lock_res
, l_blocked_list
);
504 list_del_init(&lockres
->l_blocked_list
);
505 osb
->blocked_lock_count
--;
506 spin_unlock(&osb
->vote_task_lock
);
511 ocfs2_process_blocked_lock(osb
, lockres
);
513 spin_lock(&osb
->vote_task_lock
);
516 while (osb
->vote_count
) {
517 BUG_ON(list_empty(&osb
->vote_list
));
518 work
= list_entry(osb
->vote_list
.next
,
519 struct ocfs2_vote_work
, w_list
);
520 list_del(&work
->w_list
);
522 spin_unlock(&osb
->vote_task_lock
);
524 ocfs2_process_vote(osb
, &work
->w_msg
);
527 spin_lock(&osb
->vote_task_lock
);
529 spin_unlock(&osb
->vote_task_lock
);
534 static int ocfs2_vote_thread_lists_empty(struct ocfs2_super
*osb
)
538 spin_lock(&osb
->vote_task_lock
);
539 if (list_empty(&osb
->blocked_lock_list
) &&
540 list_empty(&osb
->vote_list
))
543 spin_unlock(&osb
->vote_task_lock
);
547 static int ocfs2_vote_thread_should_wake(struct ocfs2_super
*osb
)
551 spin_lock(&osb
->vote_task_lock
);
552 if (osb
->vote_work_sequence
!= osb
->vote_wake_sequence
)
554 spin_unlock(&osb
->vote_task_lock
);
559 int ocfs2_vote_thread(void *arg
)
562 struct ocfs2_super
*osb
= arg
;
564 /* only quit once we've been asked to stop and there is no more
566 while (!(kthread_should_stop() &&
567 ocfs2_vote_thread_lists_empty(osb
))) {
569 wait_event_interruptible(osb
->vote_event
,
570 ocfs2_vote_thread_should_wake(osb
) ||
571 kthread_should_stop());
573 mlog(0, "vote_thread: awoken\n");
575 ocfs2_vote_thread_do_work(osb
);
578 osb
->vote_task
= NULL
;
582 static struct ocfs2_net_wait_ctxt
*ocfs2_new_net_wait_ctxt(unsigned int response_id
)
584 struct ocfs2_net_wait_ctxt
*w
;
586 w
= kcalloc(1, sizeof(*w
), GFP_KERNEL
);
592 INIT_LIST_HEAD(&w
->n_list
);
593 init_waitqueue_head(&w
->n_event
);
594 ocfs2_node_map_init(&w
->n_node_map
);
595 w
->n_response_id
= response_id
;
596 w
->n_callback
= NULL
;
601 static unsigned int ocfs2_new_response_id(struct ocfs2_super
*osb
)
605 spin_lock(&osb
->net_response_lock
);
606 ret
= ++osb
->net_response_ids
;
607 spin_unlock(&osb
->net_response_lock
);
612 static void ocfs2_dequeue_net_wait_ctxt(struct ocfs2_super
*osb
,
613 struct ocfs2_net_wait_ctxt
*w
)
615 spin_lock(&osb
->net_response_lock
);
616 list_del(&w
->n_list
);
617 spin_unlock(&osb
->net_response_lock
);
620 static void ocfs2_queue_net_wait_ctxt(struct ocfs2_super
*osb
,
621 struct ocfs2_net_wait_ctxt
*w
)
623 spin_lock(&osb
->net_response_lock
);
624 list_add_tail(&w
->n_list
,
625 &osb
->net_response_list
);
626 spin_unlock(&osb
->net_response_lock
);
629 static void __ocfs2_mark_node_responded(struct ocfs2_super
*osb
,
630 struct ocfs2_net_wait_ctxt
*w
,
633 assert_spin_locked(&osb
->net_response_lock
);
635 ocfs2_node_map_clear_bit(osb
, &w
->n_node_map
, node_num
);
636 if (ocfs2_node_map_is_empty(osb
, &w
->n_node_map
))
637 wake_up(&w
->n_event
);
640 /* Intended to be called from the node down callback, we fake remove
641 * the node from all our response contexts */
642 void ocfs2_remove_node_from_vote_queues(struct ocfs2_super
*osb
,
646 struct ocfs2_net_wait_ctxt
*w
= NULL
;
648 spin_lock(&osb
->net_response_lock
);
650 list_for_each(p
, &osb
->net_response_list
) {
651 w
= list_entry(p
, struct ocfs2_net_wait_ctxt
, n_list
);
653 __ocfs2_mark_node_responded(osb
, w
, node_num
);
656 spin_unlock(&osb
->net_response_lock
);
659 static int ocfs2_broadcast_vote(struct ocfs2_super
*osb
,
660 struct ocfs2_vote_msg
*request
,
661 unsigned int response_id
,
663 struct ocfs2_net_response_cb
*callback
)
665 int status
, i
, remote_err
;
666 struct ocfs2_net_wait_ctxt
*w
= NULL
;
671 w
= ocfs2_new_net_wait_ctxt(response_id
);
677 w
->n_callback
= callback
;
679 /* we're pretty much ready to go at this point, and this fills
680 * in n_response which we need anyway... */
681 ocfs2_queue_net_wait_ctxt(osb
, w
);
683 i
= ocfs2_node_map_iterate(osb
, &osb
->mounted_map
, 0);
685 while (i
!= O2NM_INVALID_NODE_NUM
) {
686 if (i
!= osb
->node_num
) {
687 mlog(0, "trying to send request to node %i\n", i
);
688 ocfs2_node_map_set_bit(osb
, &w
->n_node_map
, i
);
691 status
= o2net_send_message(OCFS2_MESSAGE_TYPE_VOTE
,
697 if (status
== -ETIMEDOUT
) {
698 mlog(0, "remote node %d timed out!\n", i
);
702 if (remote_err
< 0) {
704 mlog(0, "remote error %d on node %d!\n",
715 i
= ocfs2_node_map_iterate(osb
, &osb
->mounted_map
, i
);
716 mlog(0, "next is %d, i am %d\n", i
, osb
->node_num
);
718 mlog(0, "done sending, now waiting on responses...\n");
720 wait_event(w
->n_event
, ocfs2_node_map_is_empty(osb
, &w
->n_node_map
));
722 ocfs2_dequeue_net_wait_ctxt(osb
, w
);
725 *response
= w
->n_response
;
730 ocfs2_dequeue_net_wait_ctxt(osb
, w
);
738 static struct ocfs2_vote_msg
* ocfs2_new_vote_request(struct ocfs2_super
*osb
,
740 unsigned int generation
,
741 enum ocfs2_vote_request type
,
744 struct ocfs2_vote_msg
*request
;
745 struct ocfs2_msg_hdr
*hdr
;
747 BUG_ON(!ocfs2_is_valid_vote_request(type
));
749 request
= kcalloc(1, sizeof(*request
), GFP_KERNEL
);
753 hdr
= &request
->v_hdr
;
754 hdr
->h_node_num
= cpu_to_be32(osb
->node_num
);
755 hdr
->h_request
= cpu_to_be32(type
);
756 hdr
->h_blkno
= cpu_to_be64(blkno
);
757 hdr
->h_generation
= cpu_to_be32(generation
);
759 request
->md1
.v_generic1
= cpu_to_be32(priv
);
765 /* Complete the buildup of a new vote request and process the
766 * broadcast return value. */
767 static int ocfs2_do_request_vote(struct ocfs2_super
*osb
,
768 struct ocfs2_vote_msg
*request
,
769 struct ocfs2_net_response_cb
*callback
)
771 int status
, response
;
772 unsigned int response_id
;
773 struct ocfs2_msg_hdr
*hdr
;
775 response_id
= ocfs2_new_response_id(osb
);
777 hdr
= &request
->v_hdr
;
778 hdr
->h_response_id
= cpu_to_be32(response_id
);
780 status
= ocfs2_broadcast_vote(osb
, request
, response_id
, &response
,
793 static int ocfs2_request_vote(struct inode
*inode
,
794 struct ocfs2_vote_msg
*request
,
795 struct ocfs2_net_response_cb
*callback
)
798 struct ocfs2_super
*osb
= OCFS2_SB(inode
->i_sb
);
800 if (ocfs2_inode_is_new(inode
))
804 while (status
== -EAGAIN
) {
805 if (!(osb
->s_mount_opt
& OCFS2_MOUNT_NOINTR
) &&
806 signal_pending(current
))
809 status
= ocfs2_super_lock(osb
, 0);
816 if (!ocfs2_node_map_is_only(osb
, &osb
->mounted_map
,
818 status
= ocfs2_do_request_vote(osb
, request
, callback
);
820 ocfs2_super_unlock(osb
, 0);
825 static void ocfs2_delete_response_cb(void *priv
,
826 struct ocfs2_response_msg
*resp
)
828 int orphaned_slot
, node
;
829 struct inode
*inode
= priv
;
831 orphaned_slot
= be32_to_cpu(resp
->r_orphaned_slot
);
832 node
= be32_to_cpu(resp
->r_hdr
.h_node_num
);
833 mlog(0, "node %d tells us that inode %"MLFu64
" is orphaned in slot "
834 "%d\n", node
, OCFS2_I(inode
)->ip_blkno
, orphaned_slot
);
836 /* The other node may not actually know which slot the inode
838 if (orphaned_slot
== OCFS2_INVALID_SLOT
)
841 /* Ok, the responding node knows which slot this inode is
842 * orphaned in. We verify that the information is correct and
843 * then record this in the inode. ocfs2_delete_inode will use
844 * this information to determine which lock to take. */
845 spin_lock(&OCFS2_I(inode
)->ip_lock
);
846 mlog_bug_on_msg(OCFS2_I(inode
)->ip_orphaned_slot
!= orphaned_slot
&&
847 OCFS2_I(inode
)->ip_orphaned_slot
848 != OCFS2_INVALID_SLOT
, "Inode %"MLFu64
": Node %d "
849 "says it's orphaned in slot %d, we think it's in %d\n",
850 OCFS2_I(inode
)->ip_blkno
,
851 be32_to_cpu(resp
->r_hdr
.h_node_num
),
852 orphaned_slot
, OCFS2_I(inode
)->ip_orphaned_slot
);
854 OCFS2_I(inode
)->ip_orphaned_slot
= orphaned_slot
;
855 spin_unlock(&OCFS2_I(inode
)->ip_lock
);
858 int ocfs2_request_delete_vote(struct inode
*inode
)
860 int orphaned_slot
, status
;
861 struct ocfs2_net_response_cb delete_cb
;
862 struct ocfs2_super
*osb
= OCFS2_SB(inode
->i_sb
);
863 struct ocfs2_vote_msg
*request
;
865 spin_lock(&OCFS2_I(inode
)->ip_lock
);
866 orphaned_slot
= OCFS2_I(inode
)->ip_orphaned_slot
;
867 spin_unlock(&OCFS2_I(inode
)->ip_lock
);
869 delete_cb
.rc_cb
= ocfs2_delete_response_cb
;
870 delete_cb
.rc_priv
= inode
;
872 mlog(0, "Inode %"MLFu64
", we start thinking orphaned slot is %d\n",
873 OCFS2_I(inode
)->ip_blkno
, orphaned_slot
);
876 request
= ocfs2_new_vote_request(osb
, OCFS2_I(inode
)->ip_blkno
,
878 OCFS2_VOTE_REQ_DELETE
, orphaned_slot
);
880 status
= ocfs2_request_vote(inode
, request
, &delete_cb
);
888 static void ocfs2_setup_unlink_vote(struct ocfs2_vote_msg
*request
,
889 struct dentry
*dentry
)
891 struct inode
*parent
= dentry
->d_parent
->d_inode
;
893 /* We need some values which will uniquely identify a dentry
894 * on the other nodes so that they can find it and run
895 * d_delete against it. Parent directory block and full name
898 mlog(0, "unlink/rename request: parent: %"MLFu64
" name: %.*s\n",
899 OCFS2_I(parent
)->ip_blkno
, dentry
->d_name
.len
,
900 dentry
->d_name
.name
);
902 request
->v_unlink_parent
= cpu_to_be64(OCFS2_I(parent
)->ip_blkno
);
903 request
->v_unlink_namelen
= cpu_to_be32(dentry
->d_name
.len
);
904 memcpy(request
->v_unlink_dirent
, dentry
->d_name
.name
,
908 int ocfs2_request_unlink_vote(struct inode
*inode
,
909 struct dentry
*dentry
,
913 struct ocfs2_super
*osb
= OCFS2_SB(inode
->i_sb
);
914 struct ocfs2_vote_msg
*request
;
916 if (dentry
->d_name
.len
> OCFS2_VOTE_FILENAME_LEN
)
917 return -ENAMETOOLONG
;
920 request
= ocfs2_new_vote_request(osb
, OCFS2_I(inode
)->ip_blkno
,
922 OCFS2_VOTE_REQ_UNLINK
, nlink
);
924 ocfs2_setup_unlink_vote(request
, dentry
);
926 status
= ocfs2_request_vote(inode
, request
, NULL
);
933 int ocfs2_request_rename_vote(struct inode
*inode
,
934 struct dentry
*dentry
)
937 struct ocfs2_super
*osb
= OCFS2_SB(inode
->i_sb
);
938 struct ocfs2_vote_msg
*request
;
940 if (dentry
->d_name
.len
> OCFS2_VOTE_FILENAME_LEN
)
941 return -ENAMETOOLONG
;
944 request
= ocfs2_new_vote_request(osb
, OCFS2_I(inode
)->ip_blkno
,
946 OCFS2_VOTE_REQ_RENAME
, 0);
948 ocfs2_setup_unlink_vote(request
, dentry
);
950 status
= ocfs2_request_vote(inode
, request
, NULL
);
957 int ocfs2_request_mount_vote(struct ocfs2_super
*osb
)
960 struct ocfs2_vote_msg
*request
= NULL
;
962 request
= ocfs2_new_vote_request(osb
, 0ULL, 0,
963 OCFS2_VOTE_REQ_MOUNT
, 0);
970 while (status
== -EAGAIN
) {
971 if (!(osb
->s_mount_opt
& OCFS2_MOUNT_NOINTR
) &&
972 signal_pending(current
)) {
973 status
= -ERESTARTSYS
;
977 if (ocfs2_node_map_is_only(osb
, &osb
->mounted_map
,
983 status
= ocfs2_do_request_vote(osb
, request
, NULL
);
993 int ocfs2_request_umount_vote(struct ocfs2_super
*osb
)
996 struct ocfs2_vote_msg
*request
= NULL
;
998 request
= ocfs2_new_vote_request(osb
, 0ULL, 0,
999 OCFS2_VOTE_REQ_UMOUNT
, 0);
1006 while (status
== -EAGAIN
) {
1007 /* Do not check signals on this vote... We really want
1008 * this one to go all the way through. */
1010 if (ocfs2_node_map_is_only(osb
, &osb
->mounted_map
,
1016 status
= ocfs2_do_request_vote(osb
, request
, NULL
);
1026 /* TODO: This should eventually be a hash table! */
1027 static struct ocfs2_net_wait_ctxt
* __ocfs2_find_net_wait_ctxt(struct ocfs2_super
*osb
,
1030 struct list_head
*p
;
1031 struct ocfs2_net_wait_ctxt
*w
= NULL
;
1033 list_for_each(p
, &osb
->net_response_list
) {
1034 w
= list_entry(p
, struct ocfs2_net_wait_ctxt
, n_list
);
1035 if (response_id
== w
->n_response_id
)
1043 /* Translate response codes into local node errno values */
1044 static inline int ocfs2_translate_response(int response
)
1049 case OCFS2_RESPONSE_OK
:
1053 case OCFS2_RESPONSE_BUSY
:
1064 static int ocfs2_handle_response_message(struct o2net_msg
*msg
,
1068 unsigned int response_id
, node_num
;
1069 int response_status
;
1070 struct ocfs2_super
*osb
= data
;
1071 struct ocfs2_response_msg
*resp
;
1072 struct ocfs2_net_wait_ctxt
* w
;
1073 struct ocfs2_net_response_cb
*resp_cb
;
1075 resp
= (struct ocfs2_response_msg
*) msg
->buf
;
1077 response_id
= be32_to_cpu(resp
->r_hdr
.h_response_id
);
1078 node_num
= be32_to_cpu(resp
->r_hdr
.h_node_num
);
1080 ocfs2_translate_response(be32_to_cpu(resp
->r_response
));
1082 mlog(0, "received response message:\n");
1083 mlog(0, "h_response_id = %u\n", response_id
);
1084 mlog(0, "h_request = %u\n", be32_to_cpu(resp
->r_hdr
.h_request
));
1085 mlog(0, "h_blkno = %"MLFu64
"\n", be64_to_cpu(resp
->r_hdr
.h_blkno
));
1086 mlog(0, "h_generation = %u\n", be32_to_cpu(resp
->r_hdr
.h_generation
));
1087 mlog(0, "h_node_num = %u\n", node_num
);
1088 mlog(0, "r_response = %d\n", response_status
);
1090 spin_lock(&osb
->net_response_lock
);
1091 w
= __ocfs2_find_net_wait_ctxt(osb
, response_id
);
1093 mlog(0, "request not found!\n");
1096 resp_cb
= w
->n_callback
;
1098 if (response_status
&& (!w
->n_response
)) {
1099 /* we only really need one negative response so don't
1101 w
->n_response
= response_status
;
1105 spin_unlock(&osb
->net_response_lock
);
1107 resp_cb
->rc_cb(resp_cb
->rc_priv
, resp
);
1109 spin_lock(&osb
->net_response_lock
);
1112 __ocfs2_mark_node_responded(osb
, w
, node_num
);
1114 spin_unlock(&osb
->net_response_lock
);
1119 static int ocfs2_handle_vote_message(struct o2net_msg
*msg
,
1124 struct ocfs2_super
*osb
= data
;
1125 struct ocfs2_vote_work
*work
;
1127 work
= kmalloc(sizeof(struct ocfs2_vote_work
), GFP_KERNEL
);
1134 INIT_LIST_HEAD(&work
->w_list
);
1135 memcpy(&work
->w_msg
, msg
->buf
, sizeof(struct ocfs2_vote_msg
));
1137 mlog(0, "scheduling vote request:\n");
1138 mlog(0, "h_response_id = %u\n",
1139 be32_to_cpu(work
->w_msg
.v_hdr
.h_response_id
));
1140 mlog(0, "h_request = %u\n", be32_to_cpu(work
->w_msg
.v_hdr
.h_request
));
1141 mlog(0, "h_blkno = %"MLFu64
"\n",
1142 be64_to_cpu(work
->w_msg
.v_hdr
.h_blkno
));
1143 mlog(0, "h_generation = %u\n",
1144 be32_to_cpu(work
->w_msg
.v_hdr
.h_generation
));
1145 mlog(0, "h_node_num = %u\n",
1146 be32_to_cpu(work
->w_msg
.v_hdr
.h_node_num
));
1147 mlog(0, "v_generic1 = %u\n", be32_to_cpu(work
->w_msg
.md1
.v_generic1
));
1149 spin_lock(&osb
->vote_task_lock
);
1150 list_add_tail(&work
->w_list
, &osb
->vote_list
);
1152 spin_unlock(&osb
->vote_task_lock
);
1154 ocfs2_kick_vote_thread(osb
);
1161 void ocfs2_unregister_net_handlers(struct ocfs2_super
*osb
)
1166 o2net_unregister_handler_list(&osb
->osb_net_handlers
);
1168 if (!list_empty(&osb
->net_response_list
))
1169 mlog(ML_ERROR
, "net response list not empty!\n");
1174 int ocfs2_register_net_handlers(struct ocfs2_super
*osb
)
1178 status
= o2net_register_handler(OCFS2_MESSAGE_TYPE_RESPONSE
,
1180 sizeof(struct ocfs2_response_msg
),
1181 ocfs2_handle_response_message
,
1182 osb
, &osb
->osb_net_handlers
);
1188 status
= o2net_register_handler(OCFS2_MESSAGE_TYPE_VOTE
,
1190 sizeof(struct ocfs2_vote_msg
),
1191 ocfs2_handle_vote_message
,
1192 osb
, &osb
->osb_net_handlers
);
1199 ocfs2_unregister_net_handlers(osb
);