 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 */

/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

/*
 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
 * Copyright 2015 Joyent, Inc.
 */
#include <sys/flock_impl.h>
#include <sys/t_lock.h>		/* for <sys/callb.h> */
#include <sys/callb.h>
#include <sys/clconf.h>
#include <sys/cladm.h>
#include <sys/nbmlock.h>
#include <sys/policy.h>
/*
 * The following four variables are for statistics purposes and they are
 * not protected by locks. They may not be accurate but will at least be
 * close to the actual value.
 */

int flk_proc_vertex_allocs;
int flk_proc_edge_allocs;
int flk_proc_vertex_frees;
int flk_proc_edge_frees;

static kmutex_t flock_lock;
#ifdef DEBUG

#define	CHECK_ACTIVE_LOCKS(gp)	if (check_debug)	\
	check_active_locks(gp);
#define	CHECK_SLEEPING_LOCKS(gp)	if (check_debug)	\
	check_sleeping_locks(gp);
#define	CHECK_OWNER_LOCKS(gp, pid, sysid, vp)	\
		if (check_debug)	\
			check_owner_locks(gp, pid, sysid, vp);
#define	CHECK_LOCK_TRANSITION(old_state, new_state) \
	{ \
		if (check_lock_transition(old_state, new_state)) { \
			cmn_err(CE_PANIC, "Illegal lock transition \
			    from %d to %d", old_state, new_state); \
		} \
	}

#else

#define	CHECK_ACTIVE_LOCKS(gp)
#define	CHECK_SLEEPING_LOCKS(gp)
#define	CHECK_OWNER_LOCKS(gp, pid, sysid, vp)
#define	CHECK_LOCK_TRANSITION(old_state, new_state)

#endif /* DEBUG */
struct kmem_cache *flk_edge_cache;

graph_t		*lock_graph[HASH_SIZE];
/*
 * NLM REGISTRY TYPE IMPLEMENTATION
 *
 * 1.	Nodes in a cluster are numbered starting at 1; always non-negative
 *	integers; maximum node id is returned by clconf_maximum_nodeid().
 * 2.	We use this node id to identify the node an NLM server runs on.
 *
 * NLM registry object keeps track of NLM servers via their
 * nlmids (which are the node ids of the node in the cluster they run on)
 * that have requested locks at this LLM with which this registry is
 * associated.
 *
 * Representation of abstraction:
 *	rep = record[ states: array[nlm_state],
 *
 * Representation invariants:
 * 1. index i of rep.states is between 0 and n - 1 where n is number
 *    of elements in the array, which happen to be the maximum number
 *    of nodes in the cluster configuration + 1.
 * 2. map nlmid to index i of rep.states
 *		n-1 -> clconf_maximum_nodeid()+1
 * 3. This 1-1 mapping is quite convenient and it avoids errors resulting
 *    from forgetting to subtract 1 from the index.
 * 4. The reason we keep the 0th index is the following. A legitimate
 *    cluster configuration includes making a UFS file system NFS
 *    exportable. The code is structured so that if you're in a cluster
 *    you do one thing; otherwise, you do something else. The problem
 *    is what to do if you think you're in a cluster with PXFS loaded,
 *    but you're using UFS not PXFS? The upper two bytes of the sysid
 *    encode the node id of the node where NLM server runs; these bytes
 *    are zero for UFS. Since the nodeid is used to index into the
 *    registry, we can record the NLM server state information at index
 *    0 using the same mechanism used for PXFS file locks!
 */
static flk_nlm_status_t *nlm_reg_status = NULL;	/* state array 0..N-1 */
static kmutex_t nlm_reg_lock;			/* lock to protect array */
static uint_t nlm_status_size;			/* size of state array */
/*
 * Although we need a global lock dependency graph (and associated data
 * structures), we also need a per-zone notion of whether the lock manager is
 * running, and so whether to allow lock manager requests or not.
 *
 * Thus, on a per-zone basis we maintain a ``global'' variable
 * (flk_lockmgr_status), protected by flock_lock, and set when the lock
 * manager is determined to be changing state (starting or stopping).
 *
 * Each graph/zone pair also has a copy of this variable, which is protected by
 * the graph's mutex.
 *
 * The per-graph copies are used to synchronize lock requests with shutdown
 * requests. The global copy is used to initialize the per-graph field when a
 * new graph is created.
 */
struct flock_globals {
	flk_lockmgr_status_t flk_lockmgr_status;
	flk_lockmgr_status_t lockmgr_status[HASH_SIZE];
};

zone_key_t flock_zone_key;
static void create_flock(lock_descriptor_t *, flock64_t *);
static lock_descriptor_t *flk_get_lock(void);
static void flk_free_lock(lock_descriptor_t *lock);
static void flk_get_first_blocking_lock(lock_descriptor_t *request);
static int flk_process_request(lock_descriptor_t *);
static int flk_add_edge(lock_descriptor_t *, lock_descriptor_t *, int, int);
static edge_t *flk_get_edge(void);
static int flk_wait_execute_request(lock_descriptor_t *);
static int flk_relation(lock_descriptor_t *, lock_descriptor_t *);
static void flk_insert_active_lock(lock_descriptor_t *);
static void flk_delete_active_lock(lock_descriptor_t *, int);
static void flk_insert_sleeping_lock(lock_descriptor_t *);
static void flk_graph_uncolor(graph_t *);
static void flk_wakeup(lock_descriptor_t *, int);
static void flk_free_edge(edge_t *);
static void flk_recompute_dependencies(lock_descriptor_t *,
    lock_descriptor_t **, int, int);
static int flk_find_barriers(lock_descriptor_t *);
static void flk_update_barriers(lock_descriptor_t *);
static int flk_color_reachables(lock_descriptor_t *);
static int flk_canceled(lock_descriptor_t *);
static void flk_delete_locks_by_sysid(lock_descriptor_t *);
static void report_blocker(lock_descriptor_t *, lock_descriptor_t *);
static void wait_for_lock(lock_descriptor_t *);
static void unlock_lockmgr_granted(struct flock_globals *);
static void wakeup_sleeping_lockmgr_locks(struct flock_globals *);

/* Clustering hooks */
static void cl_flk_change_nlm_state_all_locks(int, flk_nlm_status_t);
static void cl_flk_wakeup_sleeping_nlm_locks(int);
static void cl_flk_unlock_nlm_granted(int);

static int check_lock_transition(int, int);
static void check_sleeping_locks(graph_t *);
static void check_active_locks(graph_t *);
static int no_path(lock_descriptor_t *, lock_descriptor_t *);
static void path(lock_descriptor_t *, lock_descriptor_t *);
static void check_owner_locks(graph_t *, pid_t, int, vnode_t *);
static int level_one_path(lock_descriptor_t *, lock_descriptor_t *);
static int level_two_path(lock_descriptor_t *, lock_descriptor_t *, int);

/* proc_graph function definitions */
static int flk_check_deadlock(lock_descriptor_t *);
static void flk_proc_graph_uncolor(void);
static proc_vertex_t *flk_get_proc_vertex(lock_descriptor_t *);
static proc_edge_t *flk_get_proc_edge(void);
static void flk_proc_release(proc_vertex_t *);
static void flk_free_proc_edge(proc_edge_t *);
static void flk_update_proc_graph(edge_t *, int);

/* Non-blocking mandatory locking */
static int lock_blocks_io(nbl_op_t, u_offset_t, ssize_t, int, u_offset_t,
static struct flock_globals *
flk_get_globals(void)
{
	/*
	 * The KLM module had better be loaded if we're attempting to handle
	 * lockmgr requests.
	 */
	ASSERT(flock_zone_key != ZONE_KEY_UNINITIALIZED);
	return (zone_getspecific(flock_zone_key, curproc->p_zone));
}
static flk_lockmgr_status_t
flk_get_lockmgr_status(void)
{
	struct flock_globals *fg;

	ASSERT(MUTEX_HELD(&flock_lock));

	if (flock_zone_key == ZONE_KEY_UNINITIALIZED) {
		/*
		 * KLM module not loaded; lock manager definitely not running.
		 */
		return (FLK_LOCKMGR_DOWN);
	}
	fg = flk_get_globals();
	return (fg->flk_lockmgr_status);
}
/*
 * This implements Open File Description (not descriptor) style record locking.
 * These locks can also be thought of as pid-less since they are not tied to a
 * specific process, thus they're preserved across fork.
 *
 * Called directly from fcntl.
 *
 * See reclock() for the implementation of the traditional POSIX style record
 * locking scheme (pid-ful). This function is derived from reclock() but
 * simplified and modified to work for OFD style locking.
 *
 * The two primary advantages of OFD style of locking are:
 * 1) It is per-file description, so closing a file descriptor that refers to a
 *    different file description for the same file will not drop the lock (i.e.
 *    two open's of the same file get different descriptions but a dup or fork
 *    will refer to the same description).
 * 2) Locks are preserved across fork(2).
 *
 * Because these locks are per-description a lock ptr lives at the f_filocks
 * member of the file_t and the lock_descriptor includes a file_t pointer
 * to enable unique lock identification and management.
 *
 * Since these locks are pid-less we cannot do deadlock detection with the
 * current process-oriented implementation. This is consistent with OFD locking
 * behavior on other operating systems such as Linux. Since we don't do
 * deadlock detection we never interact with the process graph that is
 * maintained for deadlock detection on the traditional POSIX-style locks.
 *
 * The current implementation does not support record locks. That is,
 * currently the single lock must cover the entire file. This is validated in
 * fcntl. To support record locks the f_filock pointer in the file_t needs to
 * be changed to a list of pointers to the locks. That list needs to be
 * managed independently of the lock list on the vnode itself and it needs to
 * be maintained as record locks are created, split, coalesced and deleted.
 *
 * The current implementation does not support remote file systems (e.g.
 * NFS or CIFS). This is handled in fs_frlock(). The design of how OFD locks
 * interact with the NLM is not clear since the NLM protocol/implementation
 * appears to be oriented around locks associated with a process. A further
 * problem is that a design is needed for what nlm_send_siglost() should do and
 * where it will send SIGLOST. More recent versions of Linux apparently try to
 * emulate OFD locks on NFS by converting them to traditional POSIX style locks
 * that work with the NLM. It is not clear that this provides the correct
 * semantics in all cases.
 */
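/*
 * Illustrative user-level sketch of the OFD semantics described above:
 * the lock belongs to the open file description, so a dup()ed descriptor
 * shares it and it survives fork(2).  Error handling is omitted and the
 * path is an arbitrary example.
 */
#if 0
#include <fcntl.h>
#include <unistd.h>

static int
ofd_lock_whole_file(const char *path)
{
	struct flock fl = { 0 };
	int fd = open(path, O_RDWR | O_CREAT, 0644);

	if (fd < 0)
		return (-1);

	/* OFD locks currently must cover the whole file (see above) */
	fl.l_type = F_WRLCK;
	fl.l_whence = SEEK_SET;
	fl.l_start = 0;
	fl.l_len = 0;

	if (fcntl(fd, F_OFD_SETLK, &fl) == -1) {
		(void) close(fd);
		return (-1);
	}
	return (fd);	/* the description behind fd now owns the lock */
}
#endif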
int
ofdlock(file_t *fp, int fcmd, flock64_t *lckdat, int flag, u_offset_t offset)
{
	lock_descriptor_t stack_lock_request;
	lock_descriptor_t *lock_request;

	if (fcmd != F_OFD_GETLK)

	if (fcmd == F_OFD_SETLKW || fcmd == F_FLOCKW)

	/* see block comment */
	VERIFY(lckdat->l_whence == 0);
	VERIFY(lckdat->l_start == 0);
	VERIFY(lckdat->l_len == 0);

	/*
	 * For reclock fs_frlock() would normally have set these in a few
	 * places but for us it's cleaner to centralize it here. Note that
	 * IGN_PID is -1. We use 0 for our pid-less locks.
	 */

	/*
	 * Check access permissions
	 */
	if ((fcmd == F_OFD_SETLK || fcmd == F_OFD_SETLKW) &&
	    ((lckdat->l_type == F_RDLCK && (flag & FREAD) == 0) ||
	    (lckdat->l_type == F_WRLCK && (flag & FWRITE) == 0)))

	/*
	 * for query and unlock we use the stack_lock_request
	 */
	if (lckdat->l_type == F_UNLCK || !(cmd & SETFLCK)) {
		lock_request = &stack_lock_request;
		(void) bzero((caddr_t)lock_request,
		    sizeof (lock_descriptor_t));

		/*
		 * following is added to make the assertions in
		 * flk_execute_request() pass
		 */
		lock_request->l_edge.edge_in_next = &lock_request->l_edge;
		lock_request->l_edge.edge_in_prev = &lock_request->l_edge;
		lock_request->l_edge.edge_adj_next = &lock_request->l_edge;
		lock_request->l_edge.edge_adj_prev = &lock_request->l_edge;
		lock_request->l_status = FLK_INITIAL_STATE;

		lock_request = flk_get_lock();
		fp->f_filock = (struct filock *)lock_request;

	lock_request->l_state = 0;
	lock_request->l_vnode = vp;
	lock_request->l_zoneid = getzoneid();
	lock_request->l_ofd = fp;

	/*
	 * Convert the request range into the canonical start and end
	 * values then check the validity of the lock range.
	 */
	error = flk_convert_lock_data(vp, lckdat, &lock_request->l_start,
	    &lock_request->l_end, offset);

	error = flk_check_lock_data(lock_request->l_start, lock_request->l_end,
	    MAXEND);

	ASSERT(lock_request->l_end >= lock_request->l_start);

	lock_request->l_type = lckdat->l_type;

		lock_request->l_state |= WILLING_TO_SLEEP_LOCK;

	if (!(cmd & SETFLCK)) {
		if (lock_request->l_type == F_RDLCK ||
		    lock_request->l_type == F_WRLCK)
			lock_request->l_state |= QUERY_LOCK;

	lock_request->l_flock = (*lckdat);

	/*
	 * We are ready for processing the request
	 */

	if (fcmd != F_OFD_GETLK && lock_request->l_type != F_UNLCK &&
	    nbl_need_check(vp)) {
		nbl_start_crit(vp, RW_WRITER);

	/* Get the lock graph for a particular vnode */
	gp = flk_get_lock_graph(vp, FLK_INIT_GRAPH);

	mutex_enter(&gp->gp_mutex);

	lock_request->l_state |= REFERENCED_LOCK;
	lock_request->l_graph = gp;

	switch (lock_request->l_type) {
		if (IS_QUERY_LOCK(lock_request)) {
			flk_get_first_blocking_lock(lock_request);
			if (lock_request->l_ofd != NULL)
				lock_request->l_flock.l_pid = -1;
			(*lckdat) = lock_request->l_flock;

			/* process the request now */
			error = flk_process_request(lock_request);

		/* unlock request will not block so execute it immediately */
		error = flk_execute_request(lock_request);

	if (lock_request == &stack_lock_request) {
		flk_set_state(lock_request, FLK_DEAD_STATE);

		lock_request->l_state &= ~REFERENCED_LOCK;
		if ((error != 0) || IS_DELETED(lock_request)) {
			flk_set_state(lock_request, FLK_DEAD_STATE);
			flk_free_lock(lock_request);

	mutex_exit(&gp->gp_mutex);

	flk_set_state(lock_request, FLK_DEAD_STATE);
	if (lock_request != &stack_lock_request)
		flk_free_lock(lock_request);
/*
 * Remove any lock on the vnode belonging to the given file_t.
 * Called from closef on last close, file_t is locked.
 *
 * This is modeled on the cleanlocks() function but only removes the single
 * lock associated with fp.
 */
void
ofdcleanlock(file_t *fp)
{
	lock_descriptor_t *fplock, *lock, *nlock;

	ASSERT(MUTEX_HELD(&fp->f_tlock));

	if ((fplock = (lock_descriptor_t *)fp->f_filock) == NULL)

	gp = flk_get_lock_graph(vp, FLK_USE_GRAPH);

	mutex_enter(&gp->gp_mutex);

	CHECK_SLEEPING_LOCKS(gp);
	CHECK_ACTIVE_LOCKS(gp);

	SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);

			nlock = lock->l_next;
			if (fplock == lock) {

		} while (lock->l_vnode == vp);

	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);

			nlock = lock->l_next;
			if (fplock == lock) {
				flk_delete_active_lock(lock, 0);

		} while (lock->l_vnode == vp);

	CHECK_SLEEPING_LOCKS(gp);
	CHECK_ACTIVE_LOCKS(gp);
	mutex_exit(&gp->gp_mutex);
/*
 * Routine called from fs_frlock in fs/fs_subr.c
 *
 * This implements traditional POSIX style record locking. The two primary
 * drawbacks to this style of locking are:
 * 1) It is per-process, so any close of a file descriptor that refers to the
 *    file will drop the lock (e.g. lock /etc/passwd, call a library function
 *    which opens /etc/passwd to read the file, when the library closes its
 *    file descriptor the application loses its lock and does not know).
 * 2) Locks are not preserved across fork(2).
 *
 * Because these locks are only associated with a PID, they are per-process.
 * This is why any close will drop the lock and is also why, once the process
 * forks, the lock is no longer related to the new process. These locks can
 * be considered as PID-ful.
 *
 * See ofdlock() for the implementation of a similar but improved locking
 * scheme.
 */
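/*
 * Illustrative user-level sketch of drawback 1) above: with per-process
 * POSIX locks, closing any descriptor for the file releases the process's
 * lock, even a descriptor the caller did not lock through.  Error handling
 * is omitted and the path is an arbitrary example.
 */
#if 0
#include <fcntl.h>
#include <unistd.h>

static void
posix_lock_gotcha(void)
{
	struct flock fl = { 0 };
	int fd1 = open("/tmp/example", O_RDWR | O_CREAT, 0644);
	int fd2 = open("/tmp/example", O_RDONLY);

	fl.l_type = F_WRLCK;		/* whole-file write lock via fd1 */
	fl.l_whence = SEEK_SET;
	(void) fcntl(fd1, F_SETLKW, &fl);

	/*
	 * Closing the other descriptor drops the lock taken through fd1,
	 * because the lock is owned by the pid, not by the descriptor.
	 */
	(void) close(fd2);
	(void) close(fd1);
}
#endif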
int
reclock(vnode_t *vp, flock64_t *lckdat, int cmd, int flag, u_offset_t offset,
	flk_callback_t *flk_cbp)
{
	lock_descriptor_t stack_lock_request;
	lock_descriptor_t *lock_request;

	/*
	 * Check access permissions
	 */
	if ((cmd & SETFLCK) &&
	    ((lckdat->l_type == F_RDLCK && (flag & FREAD) == 0) ||
	    (lckdat->l_type == F_WRLCK && (flag & FWRITE) == 0)))

	/*
	 * for query and unlock we use the stack_lock_request
	 */
	if ((lckdat->l_type == F_UNLCK) ||
	    !((cmd & INOFLCK) || (cmd & SETFLCK))) {
		lock_request = &stack_lock_request;
		(void) bzero((caddr_t)lock_request,
		    sizeof (lock_descriptor_t));

		/*
		 * following is added to make the assertions in
		 * flk_execute_request() to pass through
		 */
		lock_request->l_edge.edge_in_next = &lock_request->l_edge;
		lock_request->l_edge.edge_in_prev = &lock_request->l_edge;
		lock_request->l_edge.edge_adj_next = &lock_request->l_edge;
		lock_request->l_edge.edge_adj_prev = &lock_request->l_edge;
		lock_request->l_status = FLK_INITIAL_STATE;

		lock_request = flk_get_lock();

	lock_request->l_state = 0;
	lock_request->l_vnode = vp;
	lock_request->l_zoneid = getzoneid();

	/*
	 * Convert the request range into the canonical start and end
	 * values. The NLM protocol supports locking over the entire
	 * 32-bit range, so there's no range checking for remote requests,
	 * but we still need to verify that local requests obey the rules.
	 */
	if ((cmd & (RCMDLCK | PCMDLCK)) != 0) {
		ASSERT(lckdat->l_whence == 0);
		lock_request->l_start = lckdat->l_start;
		lock_request->l_end = (lckdat->l_len == 0) ? MAX_U_OFFSET_T :
		    lckdat->l_start + (lckdat->l_len - 1);

		/* check the validity of the lock range */
		error = flk_convert_lock_data(vp, lckdat,
		    &lock_request->l_start, &lock_request->l_end,
		    offset);

		error = flk_check_lock_data(lock_request->l_start,
		    lock_request->l_end, MAXEND);

	ASSERT(lock_request->l_end >= lock_request->l_start);

	lock_request->l_type = lckdat->l_type;

		lock_request->l_state |= IO_LOCK;

		lock_request->l_state |= WILLING_TO_SLEEP_LOCK;

		lock_request->l_state |= LOCKMGR_LOCK;

		lock_request->l_state |= NBMAND_LOCK;

	/*
	 * Clustering: set flag for PXFS locks
	 * We do not _only_ check for the PCMDLCK flag because PXFS locks could
	 * also be of type 'RCMDLCK'.
	 * We do not _only_ check the GETPXFSID() macro because local PXFS
	 * clients use a pxfsid of zero to permit deadlock detection in the LLM.
	 */
	if ((cmd & PCMDLCK) || (GETPXFSID(lckdat->l_sysid) != 0)) {
		lock_request->l_state |= PXFS_LOCK;

	if (!((cmd & SETFLCK) || (cmd & INOFLCK))) {
		if (lock_request->l_type == F_RDLCK ||
		    lock_request->l_type == F_WRLCK)
			lock_request->l_state |= QUERY_LOCK;

	lock_request->l_flock = (*lckdat);
	lock_request->l_callbacks = flk_cbp;

	/*
	 * We are ready for processing the request
	 */
	if (IS_LOCKMGR(lock_request)) {
		/*
		 * If the lock request is an NLM server request ....
		 */
		if (nlm_status_size == 0) { /* not booted as cluster */
			mutex_enter(&flock_lock);

			/*
			 * Bail out if this is a lock manager request and the
			 * lock manager is not supposed to be running.
			 */
			if (flk_get_lockmgr_status() != FLK_LOCKMGR_UP) {
				mutex_exit(&flock_lock);

			mutex_exit(&flock_lock);
		} else {	/* booted as a cluster */
			nlmid = GETNLMID(lock_request->l_flock.l_sysid);
			ASSERT(nlmid <= nlm_status_size && nlmid >= 0);

			mutex_enter(&nlm_reg_lock);

			/*
			 * If the NLM registry does not know about this
			 * NLM server making the request, add its nlmid
			 * to the registry.
			 */
			if (FLK_REGISTRY_IS_NLM_UNKNOWN(nlm_reg_status,
			    nlmid)) {
				FLK_REGISTRY_ADD_NLMID(nlm_reg_status, nlmid);
			} else if (!FLK_REGISTRY_IS_NLM_UP(nlm_reg_status,
			    nlmid)) {
				/*
				 * If the NLM server is already known (has made
				 * previous lock requests) and its state is
				 * not NLM_UP (means that NLM server is
				 * shutting down), then bail out with an
				 * error to deny the lock request.
				 */
				mutex_exit(&nlm_reg_lock);

			mutex_exit(&nlm_reg_lock);

	/* Now get the lock graph for a particular vnode */
	gp = flk_get_lock_graph(vp, FLK_INIT_GRAPH);

	/*
	 * We drop rwlock here otherwise this might end up causing a
	 * deadlock if this IOLOCK sleeps. (bugid # 1183392).
	 */
	if (IS_IO_LOCK(lock_request)) {
		VOP_RWUNLOCK(vp,
		    (lock_request->l_type == F_RDLCK) ?
		    V_WRITELOCK_FALSE : V_WRITELOCK_TRUE, NULL);

	mutex_enter(&gp->gp_mutex);

	lock_request->l_state |= REFERENCED_LOCK;
	lock_request->l_graph = gp;

	switch (lock_request->l_type) {
		if (IS_QUERY_LOCK(lock_request)) {
			flk_get_first_blocking_lock(lock_request);
			if (lock_request->l_ofd != NULL)
				lock_request->l_flock.l_pid = -1;
			(*lckdat) = lock_request->l_flock;

			/* process the request now */
			error = flk_process_request(lock_request);

		/* unlock request will not block so execute it immediately */
		if (IS_LOCKMGR(lock_request) &&
		    flk_canceled(lock_request)) {

			error = flk_execute_request(lock_request);

		/*
		 * Recovery mechanism to release lock manager locks when
		 * NFS client crashes and restarts. NFS server will clear
		 * old locks and grant new locks.
		 */
		if (lock_request->l_flock.l_sysid == 0) {
			mutex_exit(&gp->gp_mutex);

		if (secpolicy_nfs(CRED()) != 0) {
			mutex_exit(&gp->gp_mutex);

		flk_delete_locks_by_sysid(lock_request);
		lock_request->l_state &= ~REFERENCED_LOCK;
		flk_set_state(lock_request, FLK_DEAD_STATE);
		flk_free_lock(lock_request);
		mutex_exit(&gp->gp_mutex);

	/* Clustering: For blocked PXFS locks, return */
	if (error == PXFS_LOCK_BLOCKED) {
		lock_request->l_state &= ~REFERENCED_LOCK;
		mutex_exit(&gp->gp_mutex);

	/*
	 * Now that we have seen the status of locks in the system for
	 * this vnode we acquire the rwlock if it is an IO_LOCK.
	 */
	if (IS_IO_LOCK(lock_request)) {
		(void) VOP_RWLOCK(vp,
		    (lock_request->l_type == F_RDLCK) ?
		    V_WRITELOCK_FALSE : V_WRITELOCK_TRUE, NULL);

			lckdat->l_type = F_UNLCK;

			/*
			 * This wake up is needed otherwise
			 * if IO_LOCK has slept the dependents on this
			 * will not be woken up at all. (bugid # 1185482).
			 */
			flk_wakeup(lock_request, 1);
			flk_set_state(lock_request, FLK_DEAD_STATE);
			flk_free_lock(lock_request);

		/*
		 * else if error had occurred either flk_process_request()
		 * has returned EDEADLK in which case there will be no
		 * dependents for this lock or EINTR from flk_wait_execute_
		 * request() in which case flk_cancel_sleeping_lock()
		 * would have been done. The same is true with EBADF.
		 */

	if (lock_request == &stack_lock_request) {
		flk_set_state(lock_request, FLK_DEAD_STATE);

		lock_request->l_state &= ~REFERENCED_LOCK;
		if ((error != 0) || IS_DELETED(lock_request)) {
			flk_set_state(lock_request, FLK_DEAD_STATE);
			flk_free_lock(lock_request);

	mutex_exit(&gp->gp_mutex);

	flk_set_state(lock_request, FLK_DEAD_STATE);
	if (lock_request != &stack_lock_request)
		flk_free_lock(lock_request);
/*
 * Invoke the callbacks in the given list. If before sleeping, invoke in
 * list order. If after sleeping, invoke in reverse order.
 *
 * CPR (suspend/resume) support: if one of the callbacks returns a
 * callb_cpr_t, return it. This will be used to make the thread CPR-safe
 * while it is sleeping. There should be at most one callb_cpr_t for the
 * thread.
 * XXX This is unnecessarily complicated. The CPR information should just
 * get passed in directly through VOP_FRLOCK and reclock, rather than
 * sneaking it in via a callback.
 */
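/*
 * Illustrative sketch of how a caller could hand reclock() a CPR-aware
 * callback through this list.  The callback and its argument are
 * hypothetical; it returns its callb_cpr_t only for FLK_BEFORE_SLEEP so
 * that flk_invoke_callbacks() can make the sleeping thread CPR-safe.
 */
#if 0
static callb_cpr_t *
example_frlock_callback(flk_cb_when_t when, void *arg)
{
	callb_cpr_t *cprp = arg;

	if (when == FLK_BEFORE_SLEEP)
		return (cprp);	/* about to block: hand back the CPR info */
	return (NULL);		/* FLK_AFTER_SLEEP: nothing more to do */
}

static int
example_call_reclock(vnode_t *vp, flock64_t *bfp, int cmd, int flag,
    u_offset_t offset, callb_cpr_t *cprp)
{
	flk_callback_t flk_cb;

	flk_init_callback(&flk_cb, example_frlock_callback, cprp);
	return (reclock(vp, bfp, cmd, flag, offset, &flk_cb));
}
#endif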
callb_cpr_t *
flk_invoke_callbacks(flk_callback_t *cblist, flk_cb_when_t when)
{
	callb_cpr_t *cpr_callbackp = NULL;
	callb_cpr_t *one_result;

	if (when == FLK_BEFORE_SLEEP) {
			one_result = (*cb->cb_callback)(when, cb->cb_data);
			if (one_result != NULL) {
				ASSERT(cpr_callbackp == NULL);
				cpr_callbackp = one_result;
		} while (cb != cblist);

		cb = cblist->cb_prev;
			one_result = (*cb->cb_callback)(when, cb->cb_data);
			if (one_result != NULL) {
				cpr_callbackp = one_result;
		} while (cb != cblist->cb_prev);

	return (cpr_callbackp);
/*
 * Initialize a flk_callback_t to hold the given callback.
 */
void
flk_init_callback(flk_callback_t *flk_cb,
	callb_cpr_t *(*cb_fcn)(flk_cb_when_t, void *), void *cbdata)
{
	flk_cb->cb_next = flk_cb;
	flk_cb->cb_prev = flk_cb;
	flk_cb->cb_callback = cb_fcn;
	flk_cb->cb_data = cbdata;
}
/*
 * Initialize an flk_callback_t and then link it into the head of an
 * existing list (which may be NULL).
 */
void
flk_add_callback(flk_callback_t *newcb,
	callb_cpr_t *(*cb_fcn)(flk_cb_when_t, void *),
	void *cbdata, flk_callback_t *cblist)
{
	flk_init_callback(newcb, cb_fcn, cbdata);

	newcb->cb_prev = cblist->cb_prev;
	newcb->cb_next = cblist;
	cblist->cb_prev->cb_next = newcb;
	cblist->cb_prev = newcb;
}
/*
 * Remove the callback from a list.
 */
void
flk_del_callback(flk_callback_t *flk_cb)
{
	flk_cb->cb_next->cb_prev = flk_cb->cb_prev;
	flk_cb->cb_prev->cb_next = flk_cb->cb_next;

	flk_cb->cb_prev = flk_cb;
	flk_cb->cb_next = flk_cb;
}
/*
 * Initialize the flk_edge_cache data structure and create the
 * nlm_reg_status array.
 */

	flk_edge_cache = kmem_cache_create("flk_edges",
	    sizeof (struct edge), 0, NULL, NULL, NULL, NULL, NULL, 0);
	if (flk_edge_cache == NULL) {
		cmn_err(CE_PANIC, "Couldn't create flk_edge_cache\n");
	}

	/*
	 * Create the NLM registry object.
	 */
	if (cluster_bootflags & CLUSTER_BOOTED) {
		/*
		 * This routine tells you the maximum node id that will be used
		 * in the cluster. This number will be the size of the nlm
		 * registry status array. We add 1 because we will be using
		 * all entries indexed from 0 to maxnodeid; e.g., from 0
		 * to 64, for a total of 65 entries.
		 */
		nlm_status_size = clconf_maximum_nodeid() + 1;

	if (nlm_status_size != 0) {	/* booted as a cluster */
		nlm_reg_status = (flk_nlm_status_t *)
		    kmem_alloc(sizeof (flk_nlm_status_t) * nlm_status_size,
		    KM_SLEEP);

		/* initialize all NLM states in array to NLM_UNKNOWN */
		for (i = 0; i < nlm_status_size; i++) {
			nlm_reg_status[i] = FLK_NLM_UNKNOWN;
/*
 * Zone constructor/destructor callbacks to be executed when a zone is
 * created/destroyed.
 */

flk_zone_init(zoneid_t zoneid)
{
	struct flock_globals *fg;

	fg = kmem_alloc(sizeof (*fg), KM_SLEEP);
	fg->flk_lockmgr_status = FLK_LOCKMGR_UP;
	for (i = 0; i < HASH_SIZE; i++)
		fg->lockmgr_status[i] = FLK_LOCKMGR_UP;

flk_zone_fini(zoneid_t zoneid, void *data)
{
	struct flock_globals *fg = data;

	kmem_free(fg, sizeof (*fg));
/*
 * Get a lock_descriptor structure with initialization of edge lists.
 */
static lock_descriptor_t *
flk_get_lock(void)
{
	lock_descriptor_t *l;

	l = kmem_zalloc(sizeof (lock_descriptor_t), KM_SLEEP);

	cv_init(&l->l_cv, NULL, CV_DRIVER, NULL);
	l->l_edge.edge_in_next = &l->l_edge;
	l->l_edge.edge_in_prev = &l->l_edge;
	l->l_edge.edge_adj_next = &l->l_edge;
	l->l_edge.edge_adj_prev = &l->l_edge;

	l->l_status = FLK_INITIAL_STATE;
/*
 * Free a lock_descriptor structure. Just sets the DELETED_LOCK flag
 * when some thread has a reference to it as in reclock().
 */
void
flk_free_lock(lock_descriptor_t *lock)
{
	ASSERT(IS_DEAD(lock));

	if ((fp = lock->l_ofd) != NULL && fp->f_filock == (struct filock *)lock)
		fp->f_filock = NULL;

	if (IS_REFERENCED(lock)) {
		lock->l_state |= DELETED_LOCK;

	kmem_free((void *)lock, sizeof (lock_descriptor_t));
void
flk_set_state(lock_descriptor_t *lock, int new_state)
{
	/*
	 * Locks in the sleeping list may be woken up in a number of ways,
	 * and more than once. If a sleeping lock is signaled awake more
	 * than once, then it may or may not change state depending on its
	 * current state.
	 * Also note that NLM locks that are sleeping could be moved to an
	 * interrupted state more than once if the unlock request is
	 * retransmitted by the NLM client - the second time around, this is
	 * a no-op.
	 * The ordering of being signaled awake is:
	 * INTERRUPTED_STATE > CANCELLED_STATE > GRANTED_STATE.
	 * The checks below implement this ordering.
	 */
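	/*
	 * Illustrative sketch of the precedence above (hypothetical macro;
	 * the real code uses the explicit checks that follow):
	 *
	 *	#define	WAKEUP_RANK(s)	((s) == FLK_INTERRUPTED_STATE ? 3 : \
	 *				(s) == FLK_CANCELLED_STATE ? 2 :    \
	 *				(s) == FLK_GRANTED_STATE ? 1 : 0)
	 *
	 * Among these three wakeup states, a transition is honored only when
	 * WAKEUP_RANK(new_state) > WAKEUP_RANK(lock->l_status); transitions
	 * to other states (e.g. the dead state) are not affected.
	 */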
	if (IS_INTERRUPTED(lock)) {
		if ((new_state == FLK_CANCELLED_STATE) ||
		    (new_state == FLK_GRANTED_STATE) ||
		    (new_state == FLK_INTERRUPTED_STATE)) {

	if (IS_CANCELLED(lock)) {
		if ((new_state == FLK_GRANTED_STATE) ||
		    (new_state == FLK_CANCELLED_STATE)) {

	CHECK_LOCK_TRANSITION(lock->l_status, new_state);
	if (IS_PXFS(lock)) {
		cl_flk_state_transition_notify(lock, lock->l_status, new_state);

	lock->l_status = new_state;
/*
 * Routine that checks whether there are any blocking locks in the system.
 *
 * The policy followed is if a write lock is sleeping we don't allow read
 * locks before this write lock even though there may not be any active
 * locks corresponding to the read locks' region.
 *
 * flk_add_edge() function adds an edge between l1 and l2 iff there
 * is no path between l1 and l2. This is done to have a "minimum
 * storage representation" of the dependency graph.
 *
 * Another property of the graph is since only the new request throws
 * edges to the existing locks in the graph, the graph is always topologically
 * ordered.
 */
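/*
 * Illustrative sketch of the "minimum storage representation" idea above,
 * on a generic adjacency list: an edge u -> v is added only when v is not
 * already reachable from u, so redundant edges never enter the graph.
 * Types and names are hypothetical stand-ins for the lock/edge structures
 * used by this file.
 */
#if 0
struct ex_node {
	struct ex_node	*adj[8];	/* outgoing edges (bounded for brevity) */
	int		nadj;
	int		mark;		/* visited flag for the DFS */
};

static int
ex_reachable(struct ex_node *from, struct ex_node *to)
{
	int i;

	if (from == to)
		return (1);
	from->mark = 1;
	for (i = 0; i < from->nadj; i++) {
		if (!from->adj[i]->mark && ex_reachable(from->adj[i], to))
			return (1);
	}
	return (0);
}

static void
ex_add_edge_if_needed(struct ex_node *from, struct ex_node *to)
{
	/* caller is expected to clear all mark fields first ("uncolor") */
	if (!ex_reachable(from, to))
		from->adj[from->nadj++] = to;
}
#endif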
static int
flk_process_request(lock_descriptor_t *request)
{
	graph_t *gp = request->l_graph;
	lock_descriptor_t *lock;
	int request_blocked_by_active = 0;
	int request_blocked_by_granted = 0;
	int request_blocked_by_sleeping = 0;
	vnode_t *vp = request->l_vnode;
	int request_will_wait = 0;
	int found_covering_lock = 0;
	lock_descriptor_t *covered_by = NULL;

	ASSERT(MUTEX_HELD(&gp->gp_mutex));
	request_will_wait = IS_WILLING_TO_SLEEP(request);

	/*
	 * check active locks
	 */
	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);

			if (BLOCKS(lock, request)) {
				if (!request_will_wait)
				request_blocked_by_active = 1;

			/*
			 * Grant lock if it is for the same owner holding active
			 * lock that covers the request.
			 */
			if (SAME_OWNER(lock, request) &&
			    COVERS(lock, request) &&
			    (request->l_type == F_RDLCK))
				return (flk_execute_request(request));
			lock = lock->l_next;
		} while (lock->l_vnode == vp);

	if (!request_blocked_by_active) {
		lock_descriptor_t *lk[1];
		lock_descriptor_t *first_glock = NULL;

		/*
		 * Shall we grant this?! NO!!
		 * What about those locks that were just granted and still
		 * in sleep queue. Those threads are woken up and so locks
		 * are almost active.
		 */
		SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);

				if (BLOCKS(lock, request)) {
					if (IS_GRANTED(lock)) {
						request_blocked_by_granted = 1;
						request_blocked_by_sleeping = 1;

				lock = lock->l_next;
			} while ((lock->l_vnode == vp));
			first_glock = lock->l_prev;
			ASSERT(first_glock->l_vnode == vp);

		if (request_blocked_by_granted)

		if (!request_blocked_by_sleeping) {
			/*
			 * If the request isn't going to be blocked by a
			 * sleeping request, we know that it isn't going to
			 * be blocked; we can just execute the request --
			 * without performing costly deadlock detection.
			 */
			ASSERT(!request_blocked_by_active);
			return (flk_execute_request(request));
		} else if (request->l_type == F_RDLCK) {
			/*
			 * If we have a sleeping writer in the requested
			 * lock's range, block.
			 */

		request->l_state |= RECOMPUTE_LOCK;
		SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);

				flk_recompute_dependencies(lock, lk, 1, 0);
				lock = lock->l_next;
			} while (lock->l_vnode == vp);

				if (IS_GRANTED(lock)) {
					flk_recompute_dependencies(lock, lk, 1, 0);
				lock = lock->l_prev;
			} while ((lock->l_vnode == vp));

		request->l_state &= ~RECOMPUTE_LOCK;
		if (!NO_DEPENDENTS(request) && flk_check_deadlock(request))
		return (flk_execute_request(request));

	if (request_will_wait)
		flk_graph_uncolor(gp);

	/* check sleeping locks */

	SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);

	/*
	 * If we find a sleeping write lock that is a superset of the
	 * region wanted by request we can be assured that by adding an
	 * edge to this write lock we have paths to all locks in the
	 * graph that blocks the request except in one case and that is why
	 * another check for SAME_OWNER in the loop below. The exception
	 * case is when this process that owns the sleeping write lock 'l1'
	 * has other locks l2, l3, l4 that are in the system and arrived
	 * before l1. l1 does not have path to these locks as they are from
	 * same process. We break when we find a second covering sleeping
	 * lock l5 owned by a process different from that owning l1, because
	 * there cannot be any of l2, l3, l4, etc., arrived before l5, and if
	 * it has l1 would have produced a deadlock already.
	 */

			if (BLOCKS(lock, request)) {
				if (!request_will_wait)
				if (COVERS(lock, request) &&
				    lock->l_type == F_WRLCK) {
					if (found_covering_lock &&
					    !SAME_OWNER(lock, covered_by)) {
						found_covering_lock++;
					found_covering_lock = 1;

				if (found_covering_lock &&
				    !SAME_OWNER(lock, covered_by)) {
					lock = lock->l_next;

				if ((error = flk_add_edge(request, lock,
				    !found_covering_lock, 0)))

			lock = lock->l_next;
		} while (lock->l_vnode == vp);

	/*
	 * found_covering_lock == 2 iff at this point 'request' has paths
	 * to all locks that blocks 'request'. found_covering_lock == 1 iff at this
	 * point 'request' has paths to all locks that blocks 'request' whose owners
	 * are not same as the one that covers 'request' (covered_by above) and
	 * we can have locks whose owner is same as covered_by in the active list.
	 */

	if (request_blocked_by_active && found_covering_lock != 2) {
		SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
		ASSERT(lock != NULL);
			if (BLOCKS(lock, request)) {
				if (found_covering_lock &&
				    !SAME_OWNER(lock, covered_by)) {
					lock = lock->l_next;
				if ((error = flk_add_edge(request, lock,

			lock = lock->l_next;
		} while (lock->l_vnode == vp);

	if (NOT_BLOCKED(request)) {
		/*
		 * request not dependent on any other locks
		 * so execute this request
		 */
		return (flk_execute_request(request));
		/*
		 * check for deadlock
		 */
		if (flk_check_deadlock(request))
		/*
		 * this thread has to sleep
		 */
		return (flk_wait_execute_request(request));
/*
 * The actual execution of the request in the simple case is only to
 * insert the 'request' in the list of active locks if it is not an
 * UNLOCK.
 * We have to consider the existing active locks' relation to
 * this 'request' if they are owned by same process. flk_relation() does
 * this job and sees to that the dependency graph information is maintained
 * properly.
 */
int
flk_execute_request(lock_descriptor_t *request)
{
	graph_t *gp = request->l_graph;
	vnode_t *vp = request->l_vnode;
	lock_descriptor_t *lock, *lock1;
	int done_searching = 0;

	CHECK_SLEEPING_LOCKS(gp);
	CHECK_ACTIVE_LOCKS(gp);

	ASSERT(MUTEX_HELD(&gp->gp_mutex));

	flk_set_state(request, FLK_START_STATE);

	ASSERT(NOT_BLOCKED(request));

	/* IO_LOCK requests are only to check status */

	if (IS_IO_LOCK(request))

	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);

	if (lock == NULL && request->l_type == F_UNLCK)

		flk_insert_active_lock(request);

		lock1 = lock->l_next;
		if (SAME_OWNER(request, lock)) {
			done_searching = flk_relation(lock, request);
	} while (lock->l_vnode == vp && !done_searching);

	/*
	 * insert in active queue
	 */
	if (request->l_type != F_UNLCK)
		flk_insert_active_lock(request);
/*
 * 'request' is blocked by some one therefore we put it into sleep queue.
 */
static int
flk_wait_execute_request(lock_descriptor_t *request)
{
	graph_t *gp = request->l_graph;
	callb_cpr_t 	*cprp;		/* CPR info from callback */
	struct flock_globals *fg;

	ASSERT(MUTEX_HELD(&gp->gp_mutex));
	ASSERT(IS_WILLING_TO_SLEEP(request));

	flk_insert_sleeping_lock(request);

	if (IS_LOCKMGR(request)) {
		index = HASH_INDEX(request->l_vnode);
		fg = flk_get_globals();

		if (nlm_status_size == 0) {	/* not booted as a cluster */
			if (fg->lockmgr_status[index] != FLK_LOCKMGR_UP) {
				flk_cancel_sleeping_lock(request, 1);

		} else {			/* booted as a cluster */
			/*
			 * If the request is an NLM server lock request,
			 * and the NLM state of the lock request is not
			 * NLM_UP (because the NLM server is shutting
			 * down), then cancel the sleeping lock and
			 * return error ENOLCK that will encourage the
			 * client to retransmit.
			 */
			if (!IS_NLM_UP(request)) {
				flk_cancel_sleeping_lock(request, 1);

	/* Clustering: For blocking PXFS locks, return */
	if (IS_PXFS(request)) {
		/*
		 * PXFS locks sleep on the client side.
		 * The callback argument is used to wake up the sleeper
		 * when the lock is granted.
		 * We return -1 (rather than an errno value) to indicate
		 * the client side should sleep
		 */
		return (PXFS_LOCK_BLOCKED);

	if (request->l_callbacks != NULL) {
		/*
		 * To make sure the shutdown code works correctly, either
		 * the callback must happen after putting the lock on the
		 * sleep list, or we must check the shutdown status after
		 * returning from the callback (and before sleeping). At
		 * least for now, we'll use the first option. If a
		 * shutdown or signal or whatever happened while the graph
		 * mutex was dropped, that will be detected by
		 * wait_for_lock().
		 */
		mutex_exit(&gp->gp_mutex);

		cprp = flk_invoke_callbacks(request->l_callbacks,
		    FLK_BEFORE_SLEEP);

		mutex_enter(&gp->gp_mutex);

			wait_for_lock(request);
			mutex_enter(cprp->cc_lockp);
			CALLB_CPR_SAFE_BEGIN(cprp);
			mutex_exit(cprp->cc_lockp);
			wait_for_lock(request);
			mutex_enter(cprp->cc_lockp);
			CALLB_CPR_SAFE_END(cprp, cprp->cc_lockp);
			mutex_exit(cprp->cc_lockp);

		mutex_exit(&gp->gp_mutex);
		(void) flk_invoke_callbacks(request->l_callbacks,
		    FLK_AFTER_SLEEP);
		mutex_enter(&gp->gp_mutex);

		wait_for_lock(request);

	if (IS_LOCKMGR(request)) {
		/*
		 * If the lock manager is shutting down, return an
		 * error that will encourage the client to retransmit.
		 */
		if (fg->lockmgr_status[index] != FLK_LOCKMGR_UP &&
		    !IS_GRANTED(request)) {
			flk_cancel_sleeping_lock(request, 1);

	if (IS_INTERRUPTED(request)) {
		/* we got a signal, or act like we did */
		flk_cancel_sleeping_lock(request, 1);

	/* Cancelled if some other thread has closed the file */

	if (IS_CANCELLED(request)) {
		flk_cancel_sleeping_lock(request, 1);

	request->l_state &= ~GRANTED_LOCK;
	REMOVE_SLEEP_QUEUE(request);
	return (flk_execute_request(request));
/*
 * This routine adds an edge between from and to because from depends
 * to. If asked to check for deadlock it checks whether there are any
 * reachable locks from "from_lock" that is owned by the same process
 * as "from_lock".
 * NOTE: It is the caller's responsibility to make sure that the color
 * of the graph is consistent between the calls to flk_add_edge as done
 * in flk_process_request. This routine does not color and check for
 * deadlock explicitly.
 */
static int
flk_add_edge(lock_descriptor_t *from_lock, lock_descriptor_t *to_lock,
	int check_cycle, int update_graph)
{
	lock_descriptor_t *vertex;
	lock_descriptor_t *vertex_stack;

	STACK_INIT(vertex_stack);

	/*
	 * if to vertex already has mark_color just return
	 * don't add an edge as it is reachable from from vertex
	 */
	if (COLORED(to_lock))

	edge = flk_get_edge();

	/*
	 * set the from and to vertex
	 */
	edge->from_vertex = from_lock;
	edge->to_vertex = to_lock;

	/*
	 * put in adjacency list of from vertex
	 */
	from_lock->l_edge.edge_adj_next->edge_adj_prev = edge;
	edge->edge_adj_next = from_lock->l_edge.edge_adj_next;
	edge->edge_adj_prev = &from_lock->l_edge;
	from_lock->l_edge.edge_adj_next = edge;

	/*
	 * put in list of to vertex
	 */
	to_lock->l_edge.edge_in_next->edge_in_prev = edge;
	edge->edge_in_next = to_lock->l_edge.edge_in_next;
	to_lock->l_edge.edge_in_next = edge;
	edge->edge_in_prev = &to_lock->l_edge;

		flk_update_proc_graph(edge, 0);

	STACK_PUSH(vertex_stack, from_lock, l_stack);

	while ((vertex = STACK_TOP(vertex_stack)) != NULL) {

		STACK_POP(vertex_stack, l_stack);

		for (ep = FIRST_ADJ(vertex);
		    ep = NEXT_ADJ(ep)) {
			if (COLORED(ep->to_vertex))
			COLOR(ep->to_vertex);
			if (SAME_OWNER(ep->to_vertex, from_lock))
			STACK_PUSH(vertex_stack, ep->to_vertex, l_stack);

	ep = FIRST_ADJ(from_lock);
	while (ep != HEAD(from_lock)) {
		from_lock->l_sedge = NEXT_ADJ(ep);
		ADJ_LIST_REMOVE(ep);
		ep = from_lock->l_sedge;
/*
 * Get an edge structure for representing the dependency between two locks.
 */

	ASSERT(flk_edge_cache != NULL);

	ep = kmem_cache_alloc(flk_edge_cache, KM_SLEEP);
/*
 * Free the edge structure.
 */
static void
flk_free_edge(edge_t *ep)
{
	kmem_cache_free(flk_edge_cache, (void *)ep);
/*
 * Check the relationship of request with lock and perform the
 * recomputation of dependencies, break lock if required, and return
 * 1 if request cannot have any more relationship with the next
 * active locks.
 * The 'lock' and 'request' are compared and in case of overlap we
 * delete the 'lock' and form new locks to represent the non-overlapped
 * portion of original 'lock'. This function has side effects such as
 * 'lock' will be freed, new locks will be added to the active list.
 */
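/*
 * Illustrative sketch of the overlap handling described above, reduced to
 * plain ranges: when a new request overlaps an existing lock, the
 * non-overlapped remainder of the old lock survives as zero, one or two
 * pieces.  Names and types are hypothetical; the real code builds the
 * pieces in the topology[] array, and the disjoint cases are handled
 * separately before this point is reached.
 */
#if 0
struct ex_range {
	unsigned long long	start;
	unsigned long long	end;	/* inclusive, like l_start/l_end */
};

/*
 * Assumes old and req overlap; returns the number of surviving pieces
 * (0, 1 or 2) written to out[].
 */
static int
ex_split_remainder(struct ex_range old, struct ex_range req,
    struct ex_range out[2])
{
	int n = 0;

	if (old.start < req.start) {
		out[n].start = old.start;
		out[n].end = req.start - 1;	/* piece left of the request */
		n++;
	}
	if (old.end > req.end) {
		out[n].start = req.end + 1;	/* piece right of the request */
		out[n].end = old.end;
		n++;
	}
	return (n);
}
#endif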
static int
flk_relation(lock_descriptor_t *lock, lock_descriptor_t *request)
{
	lock_descriptor_t *lock1, *lock2;
	lock_descriptor_t *topology[3];
	graph_t *gp = (lock->l_graph);

	CHECK_SLEEPING_LOCKS(gp);
	CHECK_ACTIVE_LOCKS(gp);

	ASSERT(MUTEX_HELD(&gp->gp_mutex));

	topology[0] = topology[1] = topology[2] = NULL;

	if (request->l_type == F_UNLCK)
		lock_effect = FLK_UNLOCK;
	else if (request->l_type == F_RDLCK &&
	    lock->l_type == F_WRLCK)
		lock_effect = FLK_DOWNGRADE;
	else if (request->l_type == F_WRLCK &&
	    lock->l_type == F_RDLCK)
		lock_effect = FLK_UPGRADE;
		lock_effect = FLK_STAY_SAME;

	if (lock->l_end < request->l_start) {
		if (lock->l_end == request->l_start - 1 &&
		    lock_effect == FLK_STAY_SAME) {
			topology[0] = request;
			request->l_start = lock->l_start;

	if (lock->l_start > request->l_end) {
		if (request->l_end == lock->l_start - 1 &&
		    lock_effect == FLK_STAY_SAME) {
			topology[0] = request;
			request->l_end = lock->l_end;

	if (request->l_end < lock->l_end) {
		if (request->l_start > lock->l_start) {
			if (lock_effect == FLK_STAY_SAME) {
				request->l_start = lock->l_start;
				request->l_end = lock->l_end;
				topology[0] = request;
				lock1 = flk_get_lock();
				lock2 = flk_get_lock();
				lock1->l_start = lock->l_start;
				lock1->l_end = request->l_start - 1;
				lock2->l_start = request->l_end + 1;
				lock2->l_end = lock->l_end;
				topology[0] = lock1;
				topology[1] = lock2;
				topology[2] = request;
		} else if (request->l_start < lock->l_start) {
			if (lock_effect == FLK_STAY_SAME) {
				request->l_end = lock->l_end;
				topology[0] = request;
				lock1 = flk_get_lock();
				lock1->l_start = request->l_end + 1;
				topology[0] = lock1;
				topology[1] = request;
			if (lock_effect == FLK_STAY_SAME) {
				request->l_start = lock->l_start;
				request->l_end = lock->l_end;
				topology[0] = request;
				lock1 = flk_get_lock();
				lock1->l_start = request->l_end + 1;
				topology[0] = lock1;
				topology[1] = request;
	} else if (request->l_end > lock->l_end) {
		if (request->l_start > lock->l_start) {
			if (lock_effect == FLK_STAY_SAME) {
				request->l_start = lock->l_start;
				topology[0] = request;
				lock1 = flk_get_lock();
				lock1->l_end = request->l_start - 1;
				topology[0] = lock1;
				topology[1] = request;
		} else if (request->l_start < lock->l_start) {
			topology[0] = request;
			topology[0] = request;
		if (request->l_start > lock->l_start) {
			if (lock_effect == FLK_STAY_SAME) {
				request->l_start = lock->l_start;
				topology[0] = request;
				lock1 = flk_get_lock();
				lock1->l_end = request->l_start - 1;
				topology[0] = lock1;
				topology[1] = request;
		} else if (request->l_start < lock->l_start) {
			topology[0] = request;
			if (lock_effect != FLK_UNLOCK) {
				topology[0] = request;
				flk_delete_active_lock(lock, 0);
				flk_wakeup(lock, 1);
				flk_free_lock(lock);
				CHECK_SLEEPING_LOCKS(gp);
				CHECK_ACTIVE_LOCKS(gp);

	/*
	 * For unlock we don't send the 'request' to for recomputing
	 * dependencies because no lock will add an edge to this.
	 */
	if (lock_effect == FLK_UNLOCK) {
		topology[nvertex-1] = NULL;

	for (i = 0; i < nvertex; i++) {
		topology[i]->l_state |= RECOMPUTE_LOCK;
		topology[i]->l_color = NO_COLOR;

	ASSERT(FIRST_ADJ(lock) == HEAD(lock));

	/*
	 * we remove the adjacent edges for all vertices' to this vertex
	 */
	ep = FIRST_IN(lock);
	while (ep != HEAD(lock)) {
		ADJ_LIST_REMOVE(ep);

	flk_delete_active_lock(lock, 0);

	/* We are ready for recomputing the dependencies now */

	flk_recompute_dependencies(lock, topology, nvertex, 1);

	for (i = 0; i < nvertex; i++) {
		topology[i]->l_state &= ~RECOMPUTE_LOCK;
		topology[i]->l_color = NO_COLOR;

	if (lock_effect == FLK_UNLOCK) {

	for (i = 0; i < nvertex - 1; i++) {
		flk_insert_active_lock(topology[i]);

	if (lock_effect == FLK_DOWNGRADE || lock_effect == FLK_UNLOCK) {
		flk_wakeup(lock, 0);
		ep = FIRST_IN(lock);
		while (ep != HEAD(lock)) {
			lock->l_sedge = NEXT_IN(ep);
			flk_update_proc_graph(ep, 1);

	flk_free_lock(lock);

	CHECK_SLEEPING_LOCKS(gp);
	CHECK_ACTIVE_LOCKS(gp);
/*
 * Insert a lock into the active queue.
 */
static void
flk_insert_active_lock(lock_descriptor_t *new_lock)
{
	graph_t *gp = new_lock->l_graph;
	vnode_t *vp = new_lock->l_vnode;
	lock_descriptor_t *first_lock, *lock;

	ASSERT(MUTEX_HELD(&gp->gp_mutex));

	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);

	if (first_lock != NULL) {
		for (; (lock->l_vnode == vp &&
		    lock->l_start < new_lock->l_start); lock = lock->l_next)
		lock = ACTIVE_HEAD(gp);

	lock->l_prev->l_next = new_lock;
	new_lock->l_next = lock;
	new_lock->l_prev = lock->l_prev;
	lock->l_prev = new_lock;

	if (first_lock == NULL || (new_lock->l_start <= first_lock->l_start)) {
		vp->v_filocks = (struct filock *)new_lock;

	flk_set_state(new_lock, FLK_ACTIVE_STATE);
	new_lock->l_state |= ACTIVE_LOCK;

	CHECK_ACTIVE_LOCKS(gp);
	CHECK_SLEEPING_LOCKS(gp);
/*
 * Delete the active lock : Performs two functions depending on the
 * value of second parameter. One is to remove from the active lists
 * only and other is to both remove and free the lock.
 */
static void
flk_delete_active_lock(lock_descriptor_t *lock, int free_lock)
{
	vnode_t *vp = lock->l_vnode;
	graph_t *gp = lock->l_graph;

	ASSERT(MUTEX_HELD(&gp->gp_mutex));
		ASSERT(NO_DEPENDENTS(lock));
	ASSERT(NOT_BLOCKED(lock));
	ASSERT(IS_ACTIVE(lock));

	ASSERT((vp->v_filocks != NULL));

	if (vp->v_filocks == (struct filock *)lock) {
		vp->v_filocks = (struct filock *)
		    ((lock->l_next->l_vnode == vp) ? lock->l_next :

	lock->l_next->l_prev = lock->l_prev;
	lock->l_prev->l_next = lock->l_next;
	lock->l_next = lock->l_prev = NULL;
	flk_set_state(lock, FLK_DEAD_STATE);
	lock->l_state &= ~ACTIVE_LOCK;

		flk_free_lock(lock);
	CHECK_ACTIVE_LOCKS(gp);
	CHECK_SLEEPING_LOCKS(gp);
/*
 * Insert into the sleep queue.
 */
static void
flk_insert_sleeping_lock(lock_descriptor_t *request)
{
	graph_t *gp = request->l_graph;
	vnode_t *vp = request->l_vnode;
	lock_descriptor_t *lock;

	ASSERT(MUTEX_HELD(&gp->gp_mutex));
	ASSERT(IS_INITIAL(request));

	for (lock = gp->sleeping_locks.l_next; (lock != &gp->sleeping_locks &&
	    lock->l_vnode < vp); lock = lock->l_next)

	lock->l_prev->l_next = request;
	request->l_prev = lock->l_prev;
	lock->l_prev = request;
	request->l_next = lock;
	flk_set_state(request, FLK_SLEEPING_STATE);
	request->l_state |= SLEEPING_LOCK;
/*
 * Cancelling a sleeping lock implies removing a vertex from the
 * dependency graph and therefore we should recompute the dependencies
 * of all vertices that have a path to this vertex, w.r.t. all
 * vertices reachable from this vertex.
 */
void
flk_cancel_sleeping_lock(lock_descriptor_t *request, int remove_from_queue)
{
	graph_t *gp = request->l_graph;
	vnode_t *vp = request->l_vnode;
	lock_descriptor_t **topology = NULL;
	lock_descriptor_t *vertex, *lock;
	lock_descriptor_t *vertex_stack;

	STACK_INIT(vertex_stack);

	ASSERT(MUTEX_HELD(&gp->gp_mutex));
	/*
	 * count number of vertex pointers that has to be allocated
	 * All vertices that are reachable from request.
	 */
	STACK_PUSH(vertex_stack, request, l_stack);

	while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
		STACK_POP(vertex_stack, l_stack);
		for (ep = FIRST_ADJ(vertex); ep != HEAD(vertex);
		    ep = NEXT_ADJ(ep)) {
			if (IS_RECOMPUTE(ep->to_vertex))
			ep->to_vertex->l_state |= RECOMPUTE_LOCK;
			STACK_PUSH(vertex_stack, ep->to_vertex, l_stack);

	/*
	 * allocate memory for holding the vertex pointers
	 */
		topology = kmem_zalloc(nvertex * sizeof (lock_descriptor_t *),
		    KM_SLEEP);

	/*
	 * one more pass to actually store the vertices in the
	 * topology array.
	 * We first check sleeping locks and then active locks
	 * so that topology array will be in a topological
	 * order.
	 */
	SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);

			if (IS_RECOMPUTE(lock)) {
				lock->l_index = nvertex;
				topology[nvertex++] = lock;
			lock->l_color = NO_COLOR;
			lock = lock->l_next;
		} while (lock->l_vnode == vp);

	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);

			if (IS_RECOMPUTE(lock)) {
				lock->l_index = nvertex;
				topology[nvertex++] = lock;
			lock->l_color = NO_COLOR;
			lock = lock->l_next;
		} while (lock->l_vnode == vp);

	/*
	 * remove in and out edges of request
	 * They are freed after updating proc_graph below.
	 */
	for (ep = FIRST_IN(request); ep != HEAD(request); ep = NEXT_IN(ep)) {
		ADJ_LIST_REMOVE(ep);

	if (remove_from_queue)
		REMOVE_SLEEP_QUEUE(request);

	/* we are ready to recompute */

	flk_recompute_dependencies(request, topology, nvertex, 1);

	ep = FIRST_ADJ(request);
	while (ep != HEAD(request)) {
		request->l_sedge = NEXT_ADJ(ep);
		ADJ_LIST_REMOVE(ep);
		flk_update_proc_graph(ep, 1);
		ep = request->l_sedge;

	/*
	 * unset the RECOMPUTE flag in those vertices
	 */
	for (i = 0; i < nvertex; i++) {
		topology[i]->l_state &= ~RECOMPUTE_LOCK;

		kmem_free((void *)topology,
		    (nvertex * sizeof (lock_descriptor_t *)));

	/*
	 * Possibility of some locks unblocked now
	 */
	flk_wakeup(request, 0);

	/*
	 * we expect to have a correctly recomputed graph now.
	 */
	flk_set_state(request, FLK_DEAD_STATE);
	flk_free_lock(request);
	CHECK_SLEEPING_LOCKS(gp);
	CHECK_ACTIVE_LOCKS(gp);
/*
 * Uncoloring the graph is simply to increment the mark value of the graph
 * And only when wrap round takes place will we color all vertices in
 * the graph explicitly.
 */
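/*
 * Illustrative sketch of the O(1) "uncolor by bumping the mark" trick
 * described above: a vertex is colored iff its color equals the graph's
 * current mark, so incrementing the mark uncolors everything at once and
 * only a wraparound forces an explicit reset.  Names are hypothetical.
 */
#if 0
#include <limits.h>

struct ex_vertex {
	struct ex_vertex	*next;
	unsigned int		color;
};

struct ex_graph {
	struct ex_vertex	*all;
	unsigned int		mark;
};

#define	EX_COLORED(g, v)	((v)->color == (g)->mark)
#define	EX_COLOR(g, v)		((v)->color = (g)->mark)

static void
ex_graph_uncolor(struct ex_graph *g)
{
	struct ex_vertex *v;

	if (g->mark == UINT_MAX) {
		g->mark = 1;
		for (v = g->all; v != NULL; v = v->next)
			v->color = 0;
	} else {
		g->mark++;
	}
}
#endif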
static void
flk_graph_uncolor(graph_t *gp)
{
	lock_descriptor_t *lock;

	if (gp->mark == UINT_MAX) {
		for (lock = ACTIVE_HEAD(gp)->l_next; lock != ACTIVE_HEAD(gp);
		    lock = lock->l_next)

		for (lock = SLEEPING_HEAD(gp)->l_next; lock != SLEEPING_HEAD(gp);
		    lock = lock->l_next)
/*
 * Wake up locks that are blocked on the given lock.
 */
static void
flk_wakeup(lock_descriptor_t *lock, int adj_list_remove)
{
	graph_t *gp = lock->l_graph;
	lock_descriptor_t *lck;

	ASSERT(MUTEX_HELD(&gp->gp_mutex));
	if (NO_DEPENDENTS(lock))
	ep = FIRST_IN(lock);
		/*
		 * delete the edge from the adjacency list
		 * of from vertex. if no more adjacent edges
		 * for this vertex wake this process.
		 */
		lck = ep->from_vertex;
		if (adj_list_remove)
			ADJ_LIST_REMOVE(ep);
		flk_update_proc_graph(ep, 1);
		if (NOT_BLOCKED(lck)) {

		lock->l_sedge = NEXT_IN(ep);

	} while (ep != HEAD(lock));
	ASSERT(NO_DEPENDENTS(lock));
/*
 * The dependents of request, is checked for its dependency against the
 * locks in topology (called topology because the array is and should be in
 * topological order for this algorithm, if not in topological order the
 * inner loop below might add more edges than necessary. Topological ordering
 * of vertices satisfies the property that all edges will be from left to
 * right i.e., topology[i] can have an edge to topology[j], iff i<j)
 * If lock l1 in the dependent set of request is dependent (blocked by)
 * on lock l2 in topology but does not have a path to it, we add an edge
 * in the inner loop below.
 *
 * We don't want to add an edge between l1 and l2 if there exists
 * already a path from l1 to l2, so care has to be taken for those vertices
 * that have two paths to 'request'. These vertices are referred to here
 * as barrier locks.
 *
 * The barriers has to be found (those vertex that originally had two paths
 * to request) because otherwise we may end up adding edges unnecessarily
 * to vertices in topology, and thus barrier vertices can have an edge
 * to a vertex in topology as well a path to it.
 */
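/*
 * Illustrative sketch of the "barrier" notion above: a barrier is a vertex
 * with more than one path from the recomputed vertex, and its barrier
 * count is (paths - 1).  On a DAG the path counts can be accumulated by
 * walking the vertices in topological order and pushing each vertex's own
 * count to its successors.  Names are hypothetical and the paths fields
 * are assumed to start at zero.
 */
#if 0
struct ex_vtx {
	struct ex_vtx	*adj[8];	/* successors (bounded for brevity) */
	int		nadj;
	int		paths;		/* number of paths from the source */
};

/* 'order' must list the vertices in topological order, source first. */
static void
ex_count_paths(struct ex_vtx **order, int n)
{
	int i, j;

	order[0]->paths = 1;
	for (i = 0; i < n; i++) {
		for (j = 0; j < order[i]->nadj; j++)
			order[i]->adj[j]->paths += order[i]->paths;
	}
	/* a vertex v with v->paths > 1 is a barrier; count = v->paths - 1 */
}
#endif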
static void
flk_recompute_dependencies(lock_descriptor_t *request,
	lock_descriptor_t **topology, int nvertex, int update_graph)
{
	lock_descriptor_t *vertex, *lock;
	graph_t *gp = request->l_graph;
	int barrier_found = 0;
	lock_descriptor_t *vertex_stack;

	STACK_INIT(vertex_stack);

	ASSERT(MUTEX_HELD(&gp->gp_mutex));

	flk_graph_uncolor(request->l_graph);
	barrier_found = flk_find_barriers(request);
	request->l_state |= RECOMPUTE_DONE;

	STACK_PUSH(vertex_stack, request, l_stack);
	request->l_sedge = FIRST_IN(request);

	while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
		if (vertex->l_state & RECOMPUTE_DONE) {

		if (IS_BARRIER(vertex)) {
			/* decrement the barrier count */
			if (vertex->l_index) {
				/* this guy will be pushed again anyway ? */
				STACK_POP(vertex_stack, l_stack);
				if (vertex->l_index == 0) {
					/*
					 * barrier is over we can recompute
					 * dependencies for this lock in the
					 * next stack pop
					 */
					vertex->l_state &= ~BARRIER_LOCK;

		vertex->l_state |= RECOMPUTE_DONE;
		flk_graph_uncolor(gp);
		count = flk_color_reachables(vertex);
		for (i = 0; i < nvertex; i++) {

			if (BLOCKS(lock, vertex)) {
				(void) flk_add_edge(vertex, lock,
				    NO_CHECK_CYCLE, update_graph);
				count += flk_color_reachables(lock);

		if (count == nvertex ||
		    vertex->l_sedge == HEAD(vertex)) {
			/* prune the tree below this */
			STACK_POP(vertex_stack, l_stack);
			vertex->l_state &= ~RECOMPUTE_DONE;
			/* update the barrier locks below this! */
			if (vertex->l_sedge != HEAD(vertex) && barrier_found) {
				flk_graph_uncolor(gp);
				flk_update_barriers(vertex);

		ep = vertex->l_sedge;
		lock = ep->from_vertex;
		STACK_PUSH(vertex_stack, lock, l_stack);
		lock->l_sedge = FIRST_IN(lock);
		vertex->l_sedge = NEXT_IN(ep);
/*
 * Color all vertices reachable from 'vertex' that belong to the topology
 * (here, those that have RECOMPUTE_LOCK set in their state) and are not yet
 * colored.
 *
 * Note: we need to use a different stack_link, l_stack1, because this is
 * called from flk_recompute_dependencies(), which already uses a stack with
 * l_stack as the stack_link.
 */
static int
flk_color_reachables(lock_descriptor_t *vertex)
{
	lock_descriptor_t *ver, *lock;
	int count;
	edge_t	*ep;
	lock_descriptor_t *vertex_stack;

	STACK_INIT(vertex_stack);

	STACK_PUSH(vertex_stack, vertex, l_stack1);
	count = 0;
	while ((ver = STACK_TOP(vertex_stack)) != NULL) {

		STACK_POP(vertex_stack, l_stack1);
		for (ep = FIRST_ADJ(ver); ep != HEAD(ver);
		    ep = NEXT_ADJ(ep)) {
			lock = ep->to_vertex;
			if (COLORED(lock))
				continue;
			COLOR(lock);
			if (IS_RECOMPUTE(lock))
				count++;
			STACK_PUSH(vertex_stack, lock, l_stack1);
		}
	}
	return (count);
}
/*
 * Called from flk_recompute_dependencies(), this routine decrements
 * the barrier count of barrier vertices that are reachable from lock.
 */
static void
flk_update_barriers(lock_descriptor_t *lock)
{
	lock_descriptor_t *vertex, *lck;
	edge_t	*ep;
	lock_descriptor_t *vertex_stack;

	STACK_INIT(vertex_stack);

	STACK_PUSH(vertex_stack, lock, l_stack1);

	while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
		STACK_POP(vertex_stack, l_stack1);
		for (ep = FIRST_IN(vertex); ep != HEAD(vertex);
		    ep = NEXT_IN(ep)) {
			lck = ep->from_vertex;
			if (COLORED(lck)) {
				if (IS_BARRIER(lck)) {
					ASSERT(lck->l_index > 0);
					lck->l_index--;
					if (lck->l_index == 0)
						lck->l_state &= ~BARRIER_LOCK;
				}
				continue;
			}
			COLOR(lck);
			if (IS_BARRIER(lck)) {
				ASSERT(lck->l_index > 0);
				lck->l_index--;
				if (lck->l_index == 0)
					lck->l_state &= ~BARRIER_LOCK;
			}
			STACK_PUSH(vertex_stack, lck, l_stack1);
		}
	}
}
/*
 * Finds all vertices that are reachable from 'lock' more than once and
 * marks them as barrier vertices, incrementing their barrier count.
 * The barrier count is one less than the total number of paths from lock
 * to that vertex.
 */
static int
flk_find_barriers(lock_descriptor_t *lock)
{
	lock_descriptor_t *vertex, *lck;
	int found = 0;
	edge_t	*ep;
	lock_descriptor_t *vertex_stack;

	STACK_INIT(vertex_stack);

	STACK_PUSH(vertex_stack, lock, l_stack1);

	while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
		STACK_POP(vertex_stack, l_stack1);
		for (ep = FIRST_IN(vertex); ep != HEAD(vertex);
		    ep = NEXT_IN(ep)) {
			lck = ep->from_vertex;
			if (COLORED(lck)) {
				/* this is a barrier */
				lck->l_state |= BARRIER_LOCK;
				/* index will have barrier count */
				lck->l_index++;
				if (!found)
					found = 1;
				continue;
			}
			COLOR(lck);
			lck->l_index = 0;
			STACK_PUSH(vertex_stack, lck, l_stack1);
		}
	}
	return (found);
}
/*
 * Finds the first lock that is mainly responsible for blocking this
 * request. If there is no such lock, request->l_flock.l_type is set to
 * F_UNLCK. Otherwise, request->l_flock is filled in with the particulars
 * of the blocking lock.
 *
 * Note: It is possible a request is blocked by a sleeping lock because
 * of the fairness policy used in flk_process_request() to construct the
 * dependencies. (see comments before flk_process_request()).
 */
static void
flk_get_first_blocking_lock(lock_descriptor_t *request)
{
	graph_t	*gp = request->l_graph;
	vnode_t *vp = request->l_vnode;
	lock_descriptor_t *lock, *blocker;

	ASSERT(MUTEX_HELD(&gp->gp_mutex));
	blocker = NULL;
	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);

	if (lock) {
		do {
			if (BLOCKS(lock, request)) {
				blocker = lock;
				break;
			}
			lock = lock->l_next;
		} while (lock->l_vnode == vp);
	}

	if (blocker == NULL && request->l_flock.l_type == F_RDLCK) {
		/*
		 * No active lock is blocking this request, but if a read
		 * lock is requested, it may also get blocked by a waiting
		 * writer. So search all sleeping locks and see if there is
		 * a writer waiting.
		 */
		SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
		if (lock) {
			do {
				if (BLOCKS(lock, request)) {
					blocker = lock;
					break;
				}
				lock = lock->l_next;
			} while (lock->l_vnode == vp);
		}
	}

	if (blocker) {
		report_blocker(blocker, request);
	} else {
		request->l_flock.l_type = F_UNLCK;
	}
}
/*
 * Get the graph_t structure associated with a vnode.
 * If 'initialize' is non-zero, and the graph_t structure for this vnode has
 * not yet been initialized, then a new element is allocated and returned.
 */
graph_t *
flk_get_lock_graph(vnode_t *vp, int initialize)
{
	graph_t *gp;
	graph_t *gp_alloc = NULL;
	int index = HASH_INDEX(vp);

	if (initialize == FLK_USE_GRAPH) {
		mutex_enter(&flock_lock);
		gp = lock_graph[index];
		mutex_exit(&flock_lock);
		return (gp);
	}

	ASSERT(initialize == FLK_INIT_GRAPH);

	if (lock_graph[index] == NULL) {

		gp_alloc = kmem_zalloc(sizeof (graph_t), KM_SLEEP);

		/* Initialize the graph */

		gp_alloc->active_locks.l_next =
		    gp_alloc->active_locks.l_prev =
		    (lock_descriptor_t *)ACTIVE_HEAD(gp_alloc);
		gp_alloc->sleeping_locks.l_next =
		    gp_alloc->sleeping_locks.l_prev =
		    (lock_descriptor_t *)SLEEPING_HEAD(gp_alloc);
		gp_alloc->index = index;
		mutex_init(&gp_alloc->gp_mutex, NULL, MUTEX_DEFAULT, NULL);
	}

	mutex_enter(&flock_lock);

	gp = lock_graph[index];

	/* Recheck the value within flock_lock */
	if (gp == NULL) {
		struct flock_globals *fg;

		/* We must have previously allocated the graph_t structure */
		ASSERT(gp_alloc != NULL);
		lock_graph[index] = gp = gp_alloc;
		/*
		 * The lockmgr status is only needed if KLM is loaded.
		 */
		if (flock_zone_key != ZONE_KEY_UNINITIALIZED) {
			fg = flk_get_globals();
			fg->lockmgr_status[index] = fg->flk_lockmgr_status;
		}
	}

	mutex_exit(&flock_lock);

	if ((gp_alloc != NULL) && (gp != gp_alloc)) {
		/* There was a race to allocate the graph_t and we lost */
		mutex_destroy(&gp_alloc->gp_mutex);
		kmem_free(gp_alloc, sizeof (graph_t));
	}

	return (gp);
}
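/*
 * Usage sketch (illustrative only): a caller that merely wants to inspect
 * existing locks passes FLK_USE_GRAPH and must tolerate a NULL return, e.g.
 *
 *	gp = flk_get_lock_graph(vp, FLK_USE_GRAPH);
 *	if (gp == NULL)
 *		return (0);	(no locks can exist for this vnode yet)
 *
 * whereas a path that is about to insert a lock passes FLK_INIT_GRAPH so the
 * graph_t is allocated on demand; the mutex_destroy()/kmem_free() above
 * handles the case where two threads raced to initialize the same bucket.
 */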
/*
 * PSARC case 1997/292
 */
int
cl_flk_has_remote_locks_for_nlmid(vnode_t *vp, int nlmid)
{
	lock_descriptor_t *lock;
	int result = 0;
	graph_t *gp;
	int	lock_nlmid;

	/*
	 * Check to see if node is booted as a cluster. If not, return.
	 */
	if ((cluster_bootflags & CLUSTER_BOOTED) == 0) {
		return (0);
	}

	gp = flk_get_lock_graph(vp, FLK_USE_GRAPH);
	if (gp == NULL) {
		return (0);
	}

	mutex_enter(&gp->gp_mutex);

	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);

	if (lock) {
		while (lock->l_vnode == vp) {
			/* get NLM id from sysid */
			lock_nlmid = GETNLMID(lock->l_flock.l_sysid);

			/*
			 * If NLM server request _and_ nlmid of lock matches
			 * nlmid of argument, then we've found a remote lock.
			 */
			if (IS_LOCKMGR(lock) && nlmid == lock_nlmid) {
				result = 1;
				goto done;
			}
			lock = lock->l_next;
		}
	}

	SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);

	if (lock) {
		while (lock->l_vnode == vp) {
			/* get NLM id from sysid */
			lock_nlmid = GETNLMID(lock->l_flock.l_sysid);

			/*
			 * If NLM server request _and_ nlmid of lock matches
			 * nlmid of argument, then we've found a remote lock.
			 */
			if (IS_LOCKMGR(lock) && nlmid == lock_nlmid) {
				result = 1;
				goto done;
			}
			lock = lock->l_next;
		}
	}

done:
	mutex_exit(&gp->gp_mutex);
	return (result);
}
/*
 * Determine whether there are any locks for the given vnode with a remote
 * sysid. Returns zero if not, non-zero if there are.
 *
 * Note that the return value from this function is potentially invalid
 * once it has been returned. The caller is responsible for providing its
 * own synchronization mechanism to ensure that the return value is useful
 * (e.g., see nfs_lockcompletion()).
 */
int
flk_has_remote_locks(vnode_t *vp)
{
	lock_descriptor_t *lock;
	int result = 0;
	graph_t *gp;

	gp = flk_get_lock_graph(vp, FLK_USE_GRAPH);
	if (gp == NULL) {
		return (0);
	}

	mutex_enter(&gp->gp_mutex);

	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);

	if (lock) {
		while (lock->l_vnode == vp) {
			if (IS_REMOTE(lock)) {
				result = 1;
				goto done;
			}
			lock = lock->l_next;
		}
	}

	SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);

	if (lock) {
		while (lock->l_vnode == vp) {
			if (IS_REMOTE(lock)) {
				result = 1;
				goto done;
			}
			lock = lock->l_next;
		}
	}

done:
	mutex_exit(&gp->gp_mutex);
	return (result);
}
/*
 * Determine whether there are any locks for the given vnode with a remote
 * sysid matching given sysid.
 * Used by the new (open source) NFS Lock Manager (NLM)
 */
int
flk_has_remote_locks_for_sysid(vnode_t *vp, int sysid)
{
	lock_descriptor_t *lock;
	int result = 0;
	graph_t *gp;

	if (sysid == 0)
		return (0);

	gp = flk_get_lock_graph(vp, FLK_USE_GRAPH);
	if (gp == NULL) {
		return (0);
	}

	mutex_enter(&gp->gp_mutex);

	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);

	if (lock) {
		while (lock->l_vnode == vp) {
			if (lock->l_flock.l_sysid == sysid) {
				result = 1;
				goto done;
			}
			lock = lock->l_next;
		}
	}

	SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);

	if (lock) {
		while (lock->l_vnode == vp) {
			if (lock->l_flock.l_sysid == sysid) {
				result = 1;
				goto done;
			}
			lock = lock->l_next;
		}
	}

done:
	mutex_exit(&gp->gp_mutex);
	return (result);
}
/*
 * Determine if there are any locks owned by the given sysid.
 * Returns zero if not, non-zero if there are. Note that this return code
 * could be derived from flk_get_{sleeping,active}_locks, but this routine
 * avoids all the memory allocations of those routines.
 *
 * This routine has the same synchronization issues as
 * flk_has_remote_locks.
 */
int
flk_sysid_has_locks(int sysid, int lck_type)
{
	int		has_locks = 0;
	lock_descriptor_t	*lock;
	graph_t		*gp;
	int		i;

	for (i = 0; i < HASH_SIZE && !has_locks; i++) {
		mutex_enter(&flock_lock);
		gp = lock_graph[i];
		mutex_exit(&flock_lock);
		if (gp == NULL) {
			continue;
		}

		mutex_enter(&gp->gp_mutex);

		if (lck_type & FLK_QUERY_ACTIVE) {
			for (lock = ACTIVE_HEAD(gp)->l_next;
			    lock != ACTIVE_HEAD(gp) && !has_locks;
			    lock = lock->l_next) {
				if (lock->l_flock.l_sysid == sysid)
					has_locks = 1;
			}
		}

		if (lck_type & FLK_QUERY_SLEEPING) {
			for (lock = SLEEPING_HEAD(gp)->l_next;
			    lock != SLEEPING_HEAD(gp) && !has_locks;
			    lock = lock->l_next) {
				if (lock->l_flock.l_sysid == sysid)
					has_locks = 1;
			}
		}
		mutex_exit(&gp->gp_mutex);
	}

	return (has_locks);
}
/*
 * PSARC case 1997/292
 *
 * Requires: "sysid" is a pair [nlmid, sysid].  The lower half is a 16-bit
 *  quantity, the real sysid generated by the NLM server; the upper half
 *  identifies the node of the cluster where the NLM server ran.
 *  This routine is only called by an NLM server running in a cluster.
 * Effects: Remove all locks held on behalf of the client identified
 *  by "sysid."
 */
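/*
 * Worked example (illustrative, assuming the 16-bit split described above):
 * a packed value of 0x0002001f would decode as nlmid 2 (the upper half,
 * which is what GETNLMID() extracts elsewhere in this file) and an
 * NLM-generated sysid of 0x001f in the lower half.
 */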
void
cl_flk_remove_locks_by_sysid(int sysid)
{
	graph_t	*gp;
	int i;
	lock_descriptor_t *lock, *nlock;

	/*
	 * Check to see if node is booted as a cluster. If not, return.
	 */
	if ((cluster_bootflags & CLUSTER_BOOTED) == 0) {
		return;
	}

	for (i = 0; i < HASH_SIZE; i++) {
		mutex_enter(&flock_lock);
		gp = lock_graph[i];
		mutex_exit(&flock_lock);

		if (gp == NULL)
			continue;

		mutex_enter(&gp->gp_mutex);	/* get mutex on lock graph */

		/* signal sleeping requests so that they bail out */
		lock = SLEEPING_HEAD(gp)->l_next;
		while (lock != SLEEPING_HEAD(gp)) {
			nlock = lock->l_next;
			if (lock->l_flock.l_sysid == sysid) {
				INTERRUPT_WAKEUP(lock);
			}
			lock = nlock;
		}

		/* delete active locks */
		lock = ACTIVE_HEAD(gp)->l_next;
		while (lock != ACTIVE_HEAD(gp)) {
			nlock = lock->l_next;
			if (lock->l_flock.l_sysid == sysid) {
				flk_delete_active_lock(lock, 0);
				flk_wakeup(lock, 1);
				flk_free_lock(lock);
			}
			lock = nlock;
		}
		mutex_exit(&gp->gp_mutex);	/* release mutex on lock graph */
	}
}
/*
 * Delete all locks in the system that belong to the sysid of the request.
 */
static void
flk_delete_locks_by_sysid(lock_descriptor_t *request)
{
	int	sysid = request->l_flock.l_sysid;
	lock_descriptor_t *lock, *nlock;
	graph_t	*gp;
	int i;

	ASSERT(MUTEX_HELD(&request->l_graph->gp_mutex));
	ASSERT(sysid != 0);

	mutex_exit(&request->l_graph->gp_mutex);

	for (i = 0; i < HASH_SIZE; i++) {
		mutex_enter(&flock_lock);
		gp = lock_graph[i];
		mutex_exit(&flock_lock);

		if (gp == NULL)
			continue;

		mutex_enter(&gp->gp_mutex);

		/* signal sleeping requests so that they bail out */
		lock = SLEEPING_HEAD(gp)->l_next;
		while (lock != SLEEPING_HEAD(gp)) {
			nlock = lock->l_next;
			if (lock->l_flock.l_sysid == sysid) {
				INTERRUPT_WAKEUP(lock);
			}
			lock = nlock;
		}

		/* delete active locks */
		lock = ACTIVE_HEAD(gp)->l_next;
		while (lock != ACTIVE_HEAD(gp)) {
			nlock = lock->l_next;
			if (lock->l_flock.l_sysid == sysid) {
				flk_delete_active_lock(lock, 0);
				flk_wakeup(lock, 1);
				flk_free_lock(lock);
			}
			lock = nlock;
		}
		mutex_exit(&gp->gp_mutex);
	}

	mutex_enter(&request->l_graph->gp_mutex);
}
/*
 * Clustering: Deletes PXFS locks
 * Effects: Delete all locks on files in the given file system and with the
 *  given PXFS id.
 */
void
cl_flk_delete_pxfs_locks(struct vfs *vfsp, int pxfsid)
{
	lock_descriptor_t *lock, *nlock;
	graph_t	*gp;
	int i;

	for (i = 0; i < HASH_SIZE; i++) {
		mutex_enter(&flock_lock);
		gp = lock_graph[i];
		mutex_exit(&flock_lock);

		if (gp == NULL)
			continue;

		mutex_enter(&gp->gp_mutex);

		/* signal sleeping requests so that they bail out */
		lock = SLEEPING_HEAD(gp)->l_next;
		while (lock != SLEEPING_HEAD(gp)) {
			nlock = lock->l_next;
			if (lock->l_vnode->v_vfsp == vfsp) {
				ASSERT(IS_PXFS(lock));
				if (GETPXFSID(lock->l_flock.l_sysid) ==
				    pxfsid) {
					flk_set_state(lock,
					    FLK_CANCELLED_STATE);
					flk_cancel_sleeping_lock(lock, 1);
				}
			}
			lock = nlock;
		}

		/* delete active locks */
		lock = ACTIVE_HEAD(gp)->l_next;
		while (lock != ACTIVE_HEAD(gp)) {
			nlock = lock->l_next;
			if (lock->l_vnode->v_vfsp == vfsp) {
				ASSERT(IS_PXFS(lock));
				if (GETPXFSID(lock->l_flock.l_sysid) ==
				    pxfsid) {
					flk_delete_active_lock(lock, 0);
					flk_wakeup(lock, 1);
					flk_free_lock(lock);
				}
			}
			lock = nlock;
		}
		mutex_exit(&gp->gp_mutex);
	}
}
/*
 * Search for a sleeping lock manager lock which matches exactly this lock
 * request; if one is found, fake a signal to cancel it.
 *
 * Return 1 if a matching lock was found, 0 otherwise.
 */
static int
flk_canceled(lock_descriptor_t *request)
{
	lock_descriptor_t *lock, *nlock;
	graph_t *gp = request->l_graph;
	vnode_t *vp = request->l_vnode;

	ASSERT(MUTEX_HELD(&gp->gp_mutex));
	ASSERT(IS_LOCKMGR(request));
	SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);

	if (lock) {
		while (lock->l_vnode == vp) {
			nlock = lock->l_next;
			if (SAME_OWNER(lock, request) &&
			    lock->l_start == request->l_start &&
			    lock->l_end == request->l_end) {
				INTERRUPT_WAKEUP(lock);
				return (1);
			}
			lock = nlock;
		}
	}
	return (0);
}
/*
 * Remove all non-OFD locks for the vnode belonging to the given pid and sysid.
 * That is, since OFD locks are pid-less we'll never match on the incoming
 * pid. OFD locks are removed earlier in the close() path via closef() and
 * ofdcleanlock().
 */
void
cleanlocks(vnode_t *vp, pid_t pid, int sysid)
{
	graph_t	*gp;
	lock_descriptor_t *lock, *nlock;
	lock_descriptor_t *link_stack;

	STACK_INIT(link_stack);

	gp = flk_get_lock_graph(vp, FLK_USE_GRAPH);

	if (gp == NULL)
		return;
	mutex_enter(&gp->gp_mutex);

	CHECK_SLEEPING_LOCKS(gp);
	CHECK_ACTIVE_LOCKS(gp);

	SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);

	if (lock) {
		do {
			nlock = lock->l_next;
			if ((lock->l_flock.l_pid == pid ||
			    pid == IGN_PID) &&
			    lock->l_flock.l_sysid == sysid) {
				CANCEL_WAKEUP(lock);
			}
			lock = nlock;
		} while (lock->l_vnode == vp);
	}

	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);

	if (lock) {
		do {
			nlock = lock->l_next;
			if ((lock->l_flock.l_pid == pid ||
			    pid == IGN_PID) &&
			    lock->l_flock.l_sysid == sysid) {
				flk_delete_active_lock(lock, 0);
				STACK_PUSH(link_stack, lock, l_stack);
			}
			lock = nlock;
		} while (lock->l_vnode == vp);
	}

	while ((lock = STACK_TOP(link_stack)) != NULL) {
		STACK_POP(link_stack, l_stack);
		flk_wakeup(lock, 1);
		flk_free_lock(lock);
	}

	CHECK_SLEEPING_LOCKS(gp);
	CHECK_ACTIVE_LOCKS(gp);
	CHECK_OWNER_LOCKS(gp, pid, sysid, vp);
	mutex_exit(&gp->gp_mutex);
}
/*
 * Called from 'fs' read and write routines for files that have mandatory
 * locking enabled.
 */
int
chklock(struct vnode *vp, int iomode, u_offset_t offset, ssize_t len, int fmode,
    caller_context_t *ct)
{
	int		i;
	struct flock64	bf;
	int		error = 0;

	bf.l_type = (iomode & FWRITE) ? F_WRLCK : F_RDLCK;
	bf.l_whence = 0;
	bf.l_start = offset;
	bf.l_len = len;
	if (ct == NULL) {
		bf.l_pid = curproc->p_pid;
		bf.l_sysid = 0;
	} else {
		bf.l_pid = ct->cc_pid;
		bf.l_sysid = ct->cc_sysid;
	}
	i = (fmode & (FNDELAY|FNONBLOCK)) ? INOFLCK : INOFLCK|SLPFLCK;
	if ((i = reclock(vp, &bf, i, 0, offset, NULL)) != 0 ||
	    bf.l_type != F_UNLCK)
		error = i ? i : EAGAIN;
	return (error);
}
/*
 * convoff - converts the given data (start, whence) to the
 * given whence.
 */
int
convoff(struct vnode *vp, struct flock64 *lckdat, int whence, offset_t offset)
{
	int		error;
	struct vattr	vattr;

	if ((lckdat->l_whence == 2) || (whence == 2)) {
		vattr.va_mask = AT_SIZE;
		if (error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL))
			return (error);
	}

	switch (lckdat->l_whence) {
	case 1:
		lckdat->l_start += offset;
		break;
	case 2:
		lckdat->l_start += vattr.va_size;
		break;
	case 0:
		break;
	default:
		return (EINVAL);
	}

	if (lckdat->l_start < 0)
		return (EINVAL);

	switch (whence) {
	case 1:
		lckdat->l_start -= offset;
		break;
	case 2:
		lckdat->l_start -= vattr.va_size;
		break;
	case 0:
		break;
	default:
		return (EINVAL);
	}

	lckdat->l_whence = (short)whence;
	return (0);
}
/* proc_graph function definitions */

/*
 * This function checks for a deadlock caused by the new 'lock'. If a
 * deadlock is found, the edges of this lock are freed and non-zero is
 * returned.
 */
static int
flk_check_deadlock(lock_descriptor_t *lock)
{
	proc_vertex_t	*start_vertex, *pvertex;
	proc_vertex_t *dvertex;
	proc_edge_t *pep, *ppep;
	edge_t	*ep, *nep;
	proc_vertex_t *process_stack;

	/*
	 * OFD style locks are not associated with any process so there is
	 * no proc graph for these. Thus we cannot, and do not, do deadlock
	 * detection.
	 */
	if (lock->l_ofd != NULL)
		return (0);

	STACK_INIT(process_stack);

	mutex_enter(&flock_lock);
	start_vertex = flk_get_proc_vertex(lock);
	ASSERT(start_vertex != NULL);

	/* construct the edges from this process to other processes */

	ep = FIRST_ADJ(lock);
	while (ep != HEAD(lock)) {
		proc_vertex_t *adj_proc;

		adj_proc = flk_get_proc_vertex(ep->to_vertex);
		for (pep = start_vertex->edge; pep != NULL; pep = pep->next) {
			if (pep->to_proc == adj_proc) {
				ASSERT(pep->refcount);
				pep->refcount++;
				break;
			}
		}
		if (pep == NULL) {
			pep = flk_get_proc_edge();
			pep->to_proc = adj_proc;
			pep->refcount = 1;
			adj_proc->incount++;
			pep->next = start_vertex->edge;
			start_vertex->edge = pep;
		}
		ep = NEXT_ADJ(ep);
	}

	ep = FIRST_IN(lock);

	while (ep != HEAD(lock)) {
		proc_vertex_t *in_proc;

		in_proc = flk_get_proc_vertex(ep->from_vertex);

		for (pep = in_proc->edge; pep != NULL; pep = pep->next) {
			if (pep->to_proc == start_vertex) {
				ASSERT(pep->refcount);
				pep->refcount++;
				break;
			}
		}
		if (pep == NULL) {
			pep = flk_get_proc_edge();
			pep->to_proc = start_vertex;
			pep->refcount = 1;
			start_vertex->incount++;
			pep->next = in_proc->edge;
			in_proc->edge = pep;
		}
		ep = NEXT_IN(ep);
	}

	if (start_vertex->incount == 0) {
		mutex_exit(&flock_lock);
		return (0);
	}

	flk_proc_graph_uncolor();

	start_vertex->p_sedge = start_vertex->edge;

	STACK_PUSH(process_stack, start_vertex, p_stack);

	while ((pvertex = STACK_TOP(process_stack)) != NULL) {
		for (pep = pvertex->p_sedge; pep != NULL; pep = pep->next) {
			dvertex = pep->to_proc;
			if (!PROC_ARRIVED(dvertex)) {
				STACK_PUSH(process_stack, dvertex, p_stack);
				dvertex->p_sedge = dvertex->edge;
				PROC_ARRIVE(pvertex);
				pvertex->p_sedge = pep->next;
				break;
			}
			if (!PROC_DEPARTED(dvertex))
				goto deadlock;
		}
		if (pep == NULL) {
			PROC_DEPART(pvertex);
			STACK_POP(process_stack, p_stack);
		}
	}
	mutex_exit(&flock_lock);
	return (0);

deadlock:

	/* we remove all lock edges and proc edges */

	ep = FIRST_ADJ(lock);
	while (ep != HEAD(lock)) {
		proc_vertex_t *adj_proc;

		adj_proc = flk_get_proc_vertex(ep->to_vertex);
		nep = NEXT_ADJ(ep);
		IN_LIST_REMOVE(ep);
		ADJ_LIST_REMOVE(ep);
		flk_free_edge(ep);
		ppep = start_vertex->edge;
		for (pep = start_vertex->edge; pep != NULL; ppep = pep,
		    pep = ppep->next) {
			if (pep->to_proc == adj_proc) {
				pep->refcount--;
				if (pep->refcount == 0) {
					if (pep == ppep) {
						start_vertex->edge = pep->next;
					} else {
						ppep->next = pep->next;
					}
					adj_proc->incount--;
					flk_proc_release(adj_proc);
					flk_free_proc_edge(pep);
				}
				break;
			}
		}
		ep = nep;
	}

	ep = FIRST_IN(lock);
	while (ep != HEAD(lock)) {
		proc_vertex_t *in_proc;

		in_proc = flk_get_proc_vertex(ep->from_vertex);
		nep = NEXT_IN(ep);
		IN_LIST_REMOVE(ep);
		ADJ_LIST_REMOVE(ep);
		flk_free_edge(ep);
		ppep = in_proc->edge;
		for (pep = in_proc->edge; pep != NULL; ppep = pep,
		    pep = ppep->next) {
			if (pep->to_proc == start_vertex) {
				pep->refcount--;
				if (pep->refcount == 0) {
					if (pep == ppep) {
						in_proc->edge = pep->next;
					} else {
						ppep->next = pep->next;
					}
					start_vertex->incount--;
					flk_proc_release(in_proc);
					flk_free_proc_edge(pep);
				}
				break;
			}
		}
		ep = nep;
	}
	flk_proc_release(start_vertex);
	mutex_exit(&flock_lock);
	return (1);
}
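/*
 * Illustrative note (a hypothetical example, not from the original source):
 * the DFS above reports a cycle when it finds an edge to a process vertex
 * that is "arrived" but not yet "departed" (PROC_ARRIVED() without
 * PROC_DEPARTED()).  In the simplest two-process case, A's new request is
 * blocked by a lock held by B (proc edge A->B) while B's sleeping request
 * is blocked by a lock held by A (proc edge B->A); walking A->B->A revisits
 * A while it is still arrived-but-not-departed, so control jumps to the
 * deadlock label and the new request's edges are torn down instead of
 * letting A sleep forever.
 */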
/*
 * Get a proc vertex. If the lock's pvertex value points to a correct proc
 * vertex in the list we return that; otherwise we allocate one. If
 * necessary, we also grow the list of vertices.
 */
static proc_vertex_t *
flk_get_proc_vertex(lock_descriptor_t *lock)
{
	int i;
	proc_vertex_t	*pv;
	proc_vertex_t	**palloc;

	ASSERT(MUTEX_HELD(&flock_lock));
	if (lock->pvertex != -1) {
		ASSERT(lock->pvertex >= 0);
		pv = pgraph.proc[lock->pvertex];
		if (pv != NULL && PROC_SAME_OWNER(lock, pv)) {
			return (pv);
		}
	}
	for (i = 0; i < pgraph.gcount; i++) {
		pv = pgraph.proc[i];
		if (pv != NULL && PROC_SAME_OWNER(lock, pv)) {
			lock->pvertex = pv->index = i;
			return (pv);
		}
	}
	pv = kmem_zalloc(sizeof (struct proc_vertex), KM_SLEEP);
	pv->pid = lock->l_flock.l_pid;
	pv->sysid = lock->l_flock.l_sysid;
	flk_proc_vertex_allocs++;
	if (pgraph.free != 0) {
		for (i = 0; i < pgraph.gcount; i++) {
			if (pgraph.proc[i] == NULL) {
				pgraph.proc[i] = pv;
				lock->pvertex = pv->index = i;
				pgraph.free--;
				return (pv);
			}
		}
	}
	palloc = kmem_zalloc((pgraph.gcount + PROC_CHUNK) *
	    sizeof (proc_vertex_t *), KM_SLEEP);

	if (pgraph.proc) {
		bcopy(pgraph.proc, palloc,
		    pgraph.gcount * sizeof (proc_vertex_t *));

		kmem_free(pgraph.proc,
		    pgraph.gcount * sizeof (proc_vertex_t *));
	}
	pgraph.proc = palloc;
	pgraph.free += (PROC_CHUNK - 1);
	pv->index = lock->pvertex = pgraph.gcount;
	pgraph.gcount += PROC_CHUNK;
	pgraph.proc[pv->index] = pv;
	return (pv);
}
/*
 * Allocate a proc edge.
 */
static proc_edge_t *
flk_get_proc_edge()
{
	proc_edge_t *pep;

	pep = kmem_zalloc(sizeof (proc_edge_t), KM_SLEEP);
	flk_proc_edge_allocs++;
	return (pep);
}
/*
 * Free the proc edge. Called whenever its reference count goes to zero.
 */
static void
flk_free_proc_edge(proc_edge_t *pep)
{
	ASSERT(pep->refcount == 0);
	kmem_free((void *)pep, sizeof (proc_edge_t));
	flk_proc_edge_frees++;
}
/*
 * Color the graph explicitly; this is done only when the mark value hits
 * its maximum value.
 */
static void
flk_proc_graph_uncolor()
{
	int i;

	if (pgraph.mark == UINT_MAX) {
		for (i = 0; i < pgraph.gcount; i++)
			if (pgraph.proc[i] != NULL) {
				pgraph.proc[i]->atime = 0;
				pgraph.proc[i]->dtime = 0;
			}
		pgraph.mark = 1;
	} else {
		pgraph.mark++;
	}
}
/*
 * Release the proc vertex iff it has no in edges and no out edges.
 */
static void
flk_proc_release(proc_vertex_t *proc)
{
	ASSERT(MUTEX_HELD(&flock_lock));
	if (proc->edge == NULL && proc->incount == 0) {
		pgraph.proc[proc->index] = NULL;
		pgraph.free++;
		kmem_free(proc, sizeof (proc_vertex_t));
		flk_proc_vertex_frees++;
	}
}
/*
 * Updates the process graph to reflect a change in a lock graph.
 * Note: We should call this function only after we have a correctly
 * recomputed lock graph. Otherwise we might miss a deadlock detection.
 * e.g.: in function flk_relation() we call this function after
 * flk_recompute_dependencies(); otherwise, if a process tries to lock a
 * vnode hashed into another graph it might sleep forever.
 */
static void
flk_update_proc_graph(edge_t *ep, int delete)
{
	proc_vertex_t *toproc, *fromproc;
	proc_edge_t *pep, *prevpep;

	mutex_enter(&flock_lock);

	/*
	 * OFD style locks are not associated with any process so there is
	 * no proc graph for these.
	 */
	if (ep->from_vertex->l_ofd != NULL) {
		mutex_exit(&flock_lock);
		return;
	}

	toproc = flk_get_proc_vertex(ep->to_vertex);
	fromproc = flk_get_proc_vertex(ep->from_vertex);

	if (!delete)
		goto add;
	pep = prevpep = fromproc->edge;

	ASSERT(pep != NULL);
	while (pep != NULL) {
		if (pep->to_proc == toproc) {
			ASSERT(pep->refcount > 0);
			pep->refcount--;
			if (pep->refcount == 0) {
				if (pep == prevpep) {
					fromproc->edge = pep->next;
				} else {
					prevpep->next = pep->next;
				}
				toproc->incount--;
				flk_proc_release(toproc);
				flk_free_proc_edge(pep);
			}
			break;
		}
		prevpep = pep;
		pep = pep->next;
	}
	flk_proc_release(fromproc);
	mutex_exit(&flock_lock);
	return;
add:

	pep = fromproc->edge;

	while (pep != NULL) {
		if (pep->to_proc == toproc) {
			ASSERT(pep->refcount > 0);
			pep->refcount++;
			break;
		}
		pep = pep->next;
	}
	if (pep == NULL) {
		pep = flk_get_proc_edge();
		pep->to_proc = toproc;
		pep->refcount = 1;
		toproc->incount++;
		pep->next = fromproc->edge;
		fromproc->edge = pep;
	}
	mutex_exit(&flock_lock);
}
/*
 * Set the control status for lock manager requests.
 */

/*
 * PSARC case 1997/292
 *
 * Requires: "nlmid" must be >= 1 and <= clconf_maximum_nodeid().
 * Effects: Set the state of the NLM server identified by "nlmid"
 *   in the NLM registry to state "nlm_state."
 *   Raises exception no_such_nlm if "nlmid" doesn't identify a known
 *   NLM server to this LLM.
 *   Note that when this routine is called with NLM_SHUTTING_DOWN there
 *   may be lock requests that have gotten started but not finished.  In
 *   particular, there may be blocking requests that are in the callback code
 *   before sleeping (so they're not holding the lock for the graph).  If
 *   such a thread reacquires the graph's lock (to go to sleep) after
 *   the NLM state in the NLM registry is set to a non-up value,
 *   it will notice the status and bail out.  If the request gets
 *   granted before the thread can check the NLM registry, let it
 *   continue normally.  It will get flushed when we are called with NLM_DOWN.
 *
 * Modifies: nlm_reg_obj (global)
 *   nlmid	(IN):    id uniquely identifying an NLM server
 *   nlm_state	(IN):    NLM server state to change "nlmid" to
 */
void
cl_flk_set_nlm_status(int nlmid, flk_nlm_status_t nlm_state)
{
	/*
	 * Check to see if node is booted as a cluster. If not, return.
	 */
	if ((cluster_bootflags & CLUSTER_BOOTED) == 0) {
		return;
	}

	/*
	 * Check for development/debugging.  It is possible to boot a node
	 * in non-cluster mode, and then run a special script, currently
	 * available only to developers, to bring up the node as part of a
	 * cluster.  The problem is that running such a script does not
	 * result in the routine flk_init() being called and hence global array
	 * nlm_reg_status is NULL.  The NLM thinks it's in cluster mode,
	 * but the LLM needs to do an additional check to see if the global
	 * array has been created or not. If nlm_reg_status is NULL, then
	 * return, else continue.
	 */
	if (nlm_reg_status == NULL) {
		return;
	}

	ASSERT(nlmid <= nlm_status_size && nlmid >= 0);
	mutex_enter(&nlm_reg_lock);

	if (FLK_REGISTRY_IS_NLM_UNKNOWN(nlm_reg_status, nlmid)) {
		/*
		 * If the NLM server "nlmid" is unknown in the NLM registry,
		 * add it to the registry in the nlm shutting down state.
		 */
		FLK_REGISTRY_CHANGE_NLM_STATE(nlm_reg_status, nlmid,
		    FLK_NLM_SHUTTING_DOWN);
	} else {
		/*
		 * Change the state of the NLM server identified by "nlmid"
		 * in the NLM registry to the argument "nlm_state."
		 */
		FLK_REGISTRY_CHANGE_NLM_STATE(nlm_reg_status, nlmid,
		    nlm_state);
	}

	/*
	 *  The reason we must register the NLM server that is shutting down
	 *  with an LLM that doesn't already know about it (never sent a lock
	 *  request) is to handle correctly a race between shutdown and a new
	 *  lock request.  Suppose that a shutdown request from the NLM server
	 *  invokes this routine at the LLM, and a thread is spawned to
	 *  service the request. Now suppose a new lock request is in
	 *  progress and has already passed the first line of defense in
	 *  reclock(), which denies new lock requests from NLM servers
	 *  that are not in the NLM_UP state.  After the current routine
	 *  is invoked for both phases of shutdown, the routine will return,
	 *  having done nothing, and the lock request will proceed and
	 *  probably be granted.  The problem is that the shutdown was ignored
	 *  by the lock request because there was no record of that NLM server
	 *  shutting down.  We will be in the peculiar position of thinking
	 *  that we've shut down the NLM server and all locks at all LLMs have
	 *  been discarded, but in fact there's still one lock held.
	 *  The solution is to record the existence of the NLM server and
	 *  change its state immediately to NLM_SHUTTING_DOWN.  The lock
	 *  request in progress may proceed because the next phase NLM_DOWN
	 *  will catch this lock and discard it.
	 */
	mutex_exit(&nlm_reg_lock);

	switch (nlm_state) {
	case FLK_NLM_UP:
		/*
		 * Change the NLM state of all locks still held on behalf of
		 * the NLM server identified by "nlmid" to NLM_UP.
		 */
		cl_flk_change_nlm_state_all_locks(nlmid, FLK_NLM_UP);
		break;

	case FLK_NLM_SHUTTING_DOWN:
		/*
		 * Wake up all sleeping locks for the NLM server identified
		 * by "nlmid." Note that eventually all woken threads will
		 * have their lock requests cancelled and descriptors
		 * removed from the sleeping lock list.  Note that the NLM
		 * server state associated with each lock descriptor is
		 * changed to FLK_NLM_SHUTTING_DOWN.
		 */
		cl_flk_wakeup_sleeping_nlm_locks(nlmid);
		break;

	case FLK_NLM_DOWN:
		/*
		 * Discard all active, granted locks for this NLM server
		 * identified by "nlmid."
		 */
		cl_flk_unlock_nlm_granted(nlmid);
		break;

	default:
		panic("cl_set_nlm_status: bad status (%d)", nlm_state);
	}
}
/*
 * Set the control status for lock manager requests.
 *
 * Note that when this routine is called with FLK_WAKEUP_SLEEPERS, there
 * may be lock requests that have gotten started but not finished.  In
 * particular, there may be blocking requests that are in the callback code
 * before sleeping (so they're not holding the lock for the graph).  If
 * such a thread reacquires the graph's lock (to go to sleep) after
 * flk_lockmgr_status is set to a non-up value, it will notice the status
 * and bail out.  If the request gets granted before the thread can check
 * flk_lockmgr_status, let it continue normally.  It will get flushed when
 * we are called with FLK_LOCKMGR_DOWN.
 */
void
flk_set_lockmgr_status(flk_lockmgr_status_t status)
{
	int i;
	graph_t *gp;
	struct flock_globals *fg;

	fg = flk_get_globals();

	mutex_enter(&flock_lock);
	fg->flk_lockmgr_status = status;
	mutex_exit(&flock_lock);

	/*
	 * If the lock manager is coming back up, all that's needed is to
	 * propagate this information to the graphs.  If the lock manager
	 * is going down, additional action is required, and each graph's
	 * copy of the state is updated atomically with this other action.
	 */
	switch (status) {
	case FLK_LOCKMGR_UP:
		for (i = 0; i < HASH_SIZE; i++) {
			mutex_enter(&flock_lock);
			gp = lock_graph[i];
			mutex_exit(&flock_lock);
			if (gp == NULL)
				continue;
			mutex_enter(&gp->gp_mutex);
			fg->lockmgr_status[i] = status;
			mutex_exit(&gp->gp_mutex);
		}
		break;
	case FLK_WAKEUP_SLEEPERS:
		wakeup_sleeping_lockmgr_locks(fg);
		break;
	case FLK_LOCKMGR_DOWN:
		unlock_lockmgr_granted(fg);
		break;
	default:
		panic("flk_set_lockmgr_status: bad status (%d)", status);
		break;
	}
}
/*
 * This routine returns all the locks that are active or sleeping and are
 * associated with a particular set of identifiers.  If lock_state != 0, then
 * only locks that match the lock_state are returned. If lock_state == 0, then
 * all locks are returned. If pid == NOPID, the pid is ignored.  If
 * use_sysid is FALSE, then the sysid is ignored.  If vp is NULL, then the
 * vnode pointer is ignored.
 *
 * A list containing the vnode pointer and an flock structure
 * describing the lock is returned.  Each element in the list is
 * dynamically allocated and must be freed by the caller.  The
 * last item in the list is denoted by a NULL value in the ll_next
 * field.
 *
 * The vnode pointers returned are held.  The caller is responsible
 * for releasing these.  Note that the returned list is only a snapshot of
 * the current lock information, and that it is a snapshot of a moving
 * target (only one graph is locked at a time).
 */
static locklist_t *
get_lock_list(int list_type, int lock_state, int sysid, boolean_t use_sysid,
    pid_t pid, const vnode_t *vp, zoneid_t zoneid)
{
	lock_descriptor_t	*lock;
	lock_descriptor_t	*graph_head;
	locklist_t		listhead;
	locklist_t		*llheadp;
	locklist_t		*llp;
	locklist_t		*lltp;
	graph_t			*gp;
	int			i;
	int			first_index; /* graph index */
	int			num_indexes; /* graph index */

	ASSERT((list_type == FLK_ACTIVE_STATE) ||
	    (list_type == FLK_SLEEPING_STATE));

	/*
	 * Get a pointer to something to use as a list head while building
	 * the rest of the list.
	 */
	llheadp = &listhead;
	lltp = llheadp;
	llheadp->ll_next = (locklist_t *)NULL;

	/* Figure out which graphs we want to look at. */
	if (vp == NULL) {
		first_index = 0;
		num_indexes = HASH_SIZE;
	} else {
		first_index = HASH_INDEX(vp);
		num_indexes = 1;
	}

	for (i = first_index; i < first_index + num_indexes; i++) {
		mutex_enter(&flock_lock);
		gp = lock_graph[i];
		mutex_exit(&flock_lock);
		if (gp == NULL) {
			continue;
		}

		mutex_enter(&gp->gp_mutex);
		graph_head = (list_type == FLK_ACTIVE_STATE) ?
		    ACTIVE_HEAD(gp) : SLEEPING_HEAD(gp);
		for (lock = graph_head->l_next;
		    lock != graph_head;
		    lock = lock->l_next) {
			if (use_sysid && lock->l_flock.l_sysid != sysid)
				continue;
			if (pid != NOPID && lock->l_flock.l_pid != pid)
				continue;
			if (vp != NULL && lock->l_vnode != vp)
				continue;
			if (lock_state && !(lock_state & lock->l_state))
				continue;
			if (zoneid != lock->l_zoneid && zoneid != ALL_ZONES)
				continue;
			/*
			 * A matching lock was found.  Allocate
			 * space for a new locklist entry and fill
			 * it in.
			 */
			llp = kmem_alloc(sizeof (locklist_t), KM_SLEEP);
			lltp->ll_next = llp;
			VN_HOLD(lock->l_vnode);
			llp->ll_vp = lock->l_vnode;
			create_flock(lock, &(llp->ll_flock));
			llp->ll_next = (locklist_t *)NULL;
			lltp = llp;
		}
		mutex_exit(&gp->gp_mutex);
	}

	llp = llheadp->ll_next;
	return (llp);
}
/*
 * These two functions are simply interfaces to get_lock_list.  They return
 * a list of sleeping or active locks for the given sysid and pid.  See
 * get_lock_list for details.
 *
 * In either case we don't particularly care to specify the zone of interest;
 * the sysid-space is global across zones, so the sysid will map to exactly one
 * zone, and we'll return information for that zone.
 */
locklist_t *
flk_get_sleeping_locks(int sysid, pid_t pid)
{
	return (get_lock_list(FLK_SLEEPING_STATE, 0, sysid, B_TRUE, pid, NULL,
	    ALL_ZONES));
}

locklist_t *
flk_get_active_locks(int sysid, pid_t pid)
{
	return (get_lock_list(FLK_ACTIVE_STATE, 0, sysid, B_TRUE, pid, NULL,
	    ALL_ZONES));
}
/*
 * Another interface to get_lock_list.  This one returns all the active
 * locks for a given vnode.  Again, see get_lock_list for details.
 *
 * We don't need to specify which zone's locks we're interested in.  The matter
 * would only be interesting if the vnode belonged to NFS, and NFS vnodes can't
 * be used by multiple zones, so the list of locks will all be from the right
 * zone.
 */
locklist_t *
flk_active_locks_for_vp(const vnode_t *vp)
{
	return (get_lock_list(FLK_ACTIVE_STATE, 0, 0, B_FALSE, NOPID, vp,
	    ALL_ZONES));
}
/*
 * Another interface to get_lock_list.  This one returns all the active
 * nbmand locks for a given vnode.  Again, see get_lock_list for details.
 *
 * See the comment for flk_active_locks_for_vp() for why we don't care to
 * specify the particular zone of interest.
 */
locklist_t *
flk_active_nbmand_locks_for_vp(const vnode_t *vp)
{
	return (get_lock_list(FLK_ACTIVE_STATE, NBMAND_LOCK, 0, B_FALSE,
	    NOPID, vp, ALL_ZONES));
}
/*
 * Another interface to get_lock_list.  This one returns all the active
 * nbmand locks for a given pid.  Again, see get_lock_list for details.
 *
 * The zone doesn't need to be specified here; the locks held by a
 * particular process will either be local (ie, non-NFS) or from the zone
 * the process is executing in.  This is because other parts of the system
 * ensure that an NFS vnode can't be used in a zone other than that in
 * which it was opened.
 */
locklist_t *
flk_active_nbmand_locks(pid_t pid)
{
	return (get_lock_list(FLK_ACTIVE_STATE, NBMAND_LOCK, 0, B_FALSE,
	    pid, NULL, ALL_ZONES));
}
/*
 * Free up all entries in the locklist.
 */
void
flk_free_locklist(locklist_t *llp)
{
	locklist_t *next_llp;

	while (llp) {
		next_llp = llp->ll_next;
		VN_RELE(llp->ll_vp);
		kmem_free(llp, sizeof (*llp));
		llp = next_llp;
	}
}
static void
cl_flk_change_nlm_state_all_locks(int nlmid, flk_nlm_status_t nlm_state)
{
	/*
	 * For each graph "lg" in the hash table lock_graph do
	 * a. Get the list of sleeping locks
	 * b. For each lock descriptor in the list do
	 *	i.   If the requested lock is an NLM server request AND
	 *		the nlmid is the same as the routine argument then
	 *		change the lock descriptor's state field to
	 *		"nlm_state."
	 * c. Get the list of active locks
	 * d. For each lock descriptor in the list do
	 *	i.   If the requested lock is an NLM server request AND
	 *		the nlmid is the same as the routine argument then
	 *		change the lock descriptor's state field to
	 *		"nlm_state."
	 */

	int			i;
	graph_t			*gp;			/* lock graph */
	lock_descriptor_t	*lock;			/* lock */
	lock_descriptor_t	*nlock = NULL;		/* next lock */
	int			lock_nlmid;

	for (i = 0; i < HASH_SIZE; i++) {
		mutex_enter(&flock_lock);
		gp = lock_graph[i];
		mutex_exit(&flock_lock);
		if (gp == NULL) {
			continue;
		}

		/* Get list of sleeping locks in current lock graph. */
		mutex_enter(&gp->gp_mutex);
		for (lock = SLEEPING_HEAD(gp)->l_next;
		    lock != SLEEPING_HEAD(gp);
		    lock = nlock) {
			nlock = lock->l_next;
			/* get NLM id */
			lock_nlmid = GETNLMID(lock->l_flock.l_sysid);

			/*
			 * If NLM server request AND nlmid of lock matches
			 * nlmid of argument, then set the NLM state of the
			 * lock to "nlm_state."
			 */
			if (IS_LOCKMGR(lock) && nlmid == lock_nlmid) {
				SET_NLM_STATE(lock, nlm_state);
			}
		}

		/* Get list of active locks in current lock graph. */
		for (lock = ACTIVE_HEAD(gp)->l_next;
		    lock != ACTIVE_HEAD(gp);
		    lock = nlock) {
			nlock = lock->l_next;
			/* get NLM id */
			lock_nlmid = GETNLMID(lock->l_flock.l_sysid);

			/*
			 * If NLM server request AND nlmid of lock matches
			 * nlmid of argument, then set the NLM state of the
			 * lock to "nlm_state."
			 */
			if (IS_LOCKMGR(lock) && nlmid == lock_nlmid) {
				ASSERT(IS_ACTIVE(lock));
				SET_NLM_STATE(lock, nlm_state);
			}
		}
		mutex_exit(&gp->gp_mutex);
	}
}
/*
 * Requires: "nlmid" >= 1 and <= clconf_maximum_nodeid().
 * Effects: Find all sleeping lock manager requests _only_ for the NLM server
 *  identified by "nlmid." Poke those lock requests.
 */
static void
cl_flk_wakeup_sleeping_nlm_locks(int nlmid)
{
	lock_descriptor_t *lock;
	lock_descriptor_t *nlock = NULL; /* next lock */
	int i;
	graph_t *gp;
	int	lock_nlmid;

	for (i = 0; i < HASH_SIZE; i++) {
		mutex_enter(&flock_lock);
		gp = lock_graph[i];
		mutex_exit(&flock_lock);
		if (gp == NULL) {
			continue;
		}

		mutex_enter(&gp->gp_mutex);
		for (lock = SLEEPING_HEAD(gp)->l_next;
		    lock != SLEEPING_HEAD(gp);
		    lock = nlock) {
			nlock = lock->l_next;
			/*
			 * If NLM server request _and_ nlmid of lock matches
			 * nlmid of argument, then set the NLM state of the
			 * lock to NLM_SHUTTING_DOWN, and wake up sleeping
			 * request.
			 */
			if (IS_LOCKMGR(lock)) {
				/* get NLM id */
				lock_nlmid =
				    GETNLMID(lock->l_flock.l_sysid);
				if (nlmid == lock_nlmid) {
					SET_NLM_STATE(lock,
					    FLK_NLM_SHUTTING_DOWN);
					INTERRUPT_WAKEUP(lock);
				}
			}
		}
		mutex_exit(&gp->gp_mutex);
	}
}
/*
 * Requires: "nlmid" >= 1 and <= clconf_maximum_nodeid()
 * Effects: Find all active (granted) lock manager locks _only_ for the
 *  NLM server identified by "nlmid" and release them.
 */
static void
cl_flk_unlock_nlm_granted(int nlmid)
{
	lock_descriptor_t *lock;
	lock_descriptor_t *nlock = NULL; /* next lock */
	int i;
	graph_t *gp;
	int	lock_nlmid;

	for (i = 0; i < HASH_SIZE; i++) {
		mutex_enter(&flock_lock);
		gp = lock_graph[i];
		mutex_exit(&flock_lock);
		if (gp == NULL) {
			continue;
		}

		mutex_enter(&gp->gp_mutex);
		for (lock = ACTIVE_HEAD(gp)->l_next;
		    lock != ACTIVE_HEAD(gp);
		    lock = nlock) {
			nlock = lock->l_next;
			ASSERT(IS_ACTIVE(lock));

			/*
			 * If it's an NLM server request _and_ nlmid of
			 * the lock matches nlmid of argument, then
			 * remove the active lock from the list, wake up
			 * blocked threads, and free the storage for the lock.
			 * Note that there's no need to mark the NLM state
			 * of this lock to NLM_DOWN because the lock will
			 * be deleted anyway and its storage freed.
			 */
			if (IS_LOCKMGR(lock)) {
				/* get NLM id */
				lock_nlmid = GETNLMID(lock->l_flock.l_sysid);
				if (nlmid == lock_nlmid) {
					flk_delete_active_lock(lock, 0);
					flk_wakeup(lock, 1);
					flk_free_lock(lock);
				}
			}
		}
		mutex_exit(&gp->gp_mutex);
	}
}
/*
 * Find all sleeping lock manager requests and poke them.
 */
static void
wakeup_sleeping_lockmgr_locks(struct flock_globals *fg)
{
	lock_descriptor_t *lock;
	lock_descriptor_t *nlock = NULL; /* next lock */
	int i;
	graph_t *gp;
	zoneid_t zoneid = getzoneid();

	for (i = 0; i < HASH_SIZE; i++) {
		mutex_enter(&flock_lock);
		gp = lock_graph[i];
		mutex_exit(&flock_lock);
		if (gp == NULL) {
			continue;
		}

		mutex_enter(&gp->gp_mutex);
		fg->lockmgr_status[i] = FLK_WAKEUP_SLEEPERS;
		for (lock = SLEEPING_HEAD(gp)->l_next;
		    lock != SLEEPING_HEAD(gp);
		    lock = nlock) {
			nlock = lock->l_next;
			if (IS_LOCKMGR(lock) && lock->l_zoneid == zoneid) {
				INTERRUPT_WAKEUP(lock);
			}
		}
		mutex_exit(&gp->gp_mutex);
	}
}
/*
 * Find all active (granted) lock manager locks and release them.
 */
static void
unlock_lockmgr_granted(struct flock_globals *fg)
{
	lock_descriptor_t *lock;
	lock_descriptor_t *nlock = NULL; /* next lock */
	int i;
	graph_t *gp;
	zoneid_t zoneid = getzoneid();

	for (i = 0; i < HASH_SIZE; i++) {
		mutex_enter(&flock_lock);
		gp = lock_graph[i];
		mutex_exit(&flock_lock);
		if (gp == NULL) {
			continue;
		}

		mutex_enter(&gp->gp_mutex);
		fg->lockmgr_status[i] = FLK_LOCKMGR_DOWN;
		for (lock = ACTIVE_HEAD(gp)->l_next;
		    lock != ACTIVE_HEAD(gp);
		    lock = nlock) {
			nlock = lock->l_next;
			if (IS_LOCKMGR(lock) && lock->l_zoneid == zoneid) {
				ASSERT(IS_ACTIVE(lock));
				flk_delete_active_lock(lock, 0);
				flk_wakeup(lock, 1);
				flk_free_lock(lock);
			}
		}
		mutex_exit(&gp->gp_mutex);
	}
}
/*
 * Wait until a lock is granted, cancelled, or interrupted.
 */
static void
wait_for_lock(lock_descriptor_t *request)
{
	graph_t *gp = request->l_graph;

	ASSERT(MUTEX_HELD(&gp->gp_mutex));

	while (!(IS_GRANTED(request)) && !(IS_CANCELLED(request)) &&
	    !(IS_INTERRUPTED(request))) {
		if (!cv_wait_sig(&request->l_cv, &gp->gp_mutex)) {
			flk_set_state(request, FLK_INTERRUPTED_STATE);
			request->l_state |= INTERRUPTED_LOCK;
		}
	}
}
/*
 * Create an flock structure from the existing lock information.
 *
 * This routine is used to create flock structures for the lock manager
 * to use in a reclaim request.  Since the lock was originated on this
 * host, it must be conforming to UNIX semantics, so no checking is
 * done to make sure it falls within the lower half of the 32-bit range.
 */
static void
create_flock(lock_descriptor_t *lp, flock64_t *flp)
{
	ASSERT(lp->l_end == MAX_U_OFFSET_T || lp->l_end <= MAXEND);
	ASSERT(lp->l_end >= lp->l_start);

	flp->l_type = lp->l_type;
	flp->l_whence = 0;
	flp->l_start = lp->l_start;
	flp->l_len = (lp->l_end == MAX_U_OFFSET_T) ? 0 :
	    (lp->l_end - lp->l_start + 1);
	flp->l_sysid = lp->l_flock.l_sysid;
	flp->l_pid = lp->l_flock.l_pid;
}
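/*
 * Worked example (illustrative): an internal lock covering bytes 10..19 has
 * l_start == 10 and l_end == 19, so the flock64 produced above gets
 * l_len = 19 - 10 + 1 = 10.  A lock to end-of-file is stored internally with
 * l_end == MAX_U_OFFSET_T and is reported with the conventional l_len = 0.
 */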
/*
 * Convert flock_t data describing a lock range into unsigned long starting
 * and ending points, which are put into lock_request.  Returns 0 or an
 * errno value.
 */
int
flk_convert_lock_data(vnode_t *vp, flock64_t *flp,
    u_offset_t *start, u_offset_t *end, offset_t offset)
{
	struct vattr	vattr;
	int	error;

	/*
	 * Determine the starting point of the request
	 */
	switch (flp->l_whence) {
	case 0:		/* SEEK_SET */
		*start = (u_offset_t)flp->l_start;
		break;
	case 1:		/* SEEK_CUR */
		*start = (u_offset_t)(flp->l_start + offset);
		break;
	case 2:		/* SEEK_END */
		vattr.va_mask = AT_SIZE;
		if (error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL))
			return (error);
		*start = (u_offset_t)(flp->l_start + vattr.va_size);
		break;
	default:
		return (EINVAL);
	}

	/*
	 * Determine the range covered by the request.
	 */
	if (flp->l_len == 0)
		*end = MAX_U_OFFSET_T;
	else if ((offset_t)flp->l_len > 0) {
		*end = (u_offset_t)(*start + (flp->l_len - 1));
	} else {
		/*
		 * Negative length; why do we even allow this ?
		 * Because this allows easy specification of
		 * the last n bytes of the file.
		 */
		*end = *start;
		*start += (u_offset_t)flp->l_len;
		(*start)++;
	}
	return (0);
}
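/*
 * Worked example (illustrative): with l_whence == 0, l_start == 100 and
 * l_len == -10, the negative-length branch above computes *end = 100 and
 * then *start = 100 + (-10) + 1 = 91, so the request covers the ten bytes
 * 91..100 ending at the original start offset -- the "last n bytes"
 * convenience mentioned in the comment.
 */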
/*
 * Check the validity of lock data.  This can be used by the NFS
 * frlock routines to check data before contacting the server.  The
 * server must support semantics that aren't as restrictive as
 * the UNIX API, so the NFS client is required to check.
 * The maximum is now passed in by the caller.
 */
int
flk_check_lock_data(u_offset_t start, u_offset_t end, offset_t max)
{
	/*
	 * The end (length) for local locking should never be greater
	 * than MAXEND. However, the representation for
	 * the entire file is MAX_U_OFFSET_T.
	 */
	if ((start > max) ||
	    ((end > max) && (end != MAX_U_OFFSET_T))) {
		return (EINVAL);
	}
	if (start > end) {
		return (EINVAL);
	}
	return (0);
}
/*
 * Fill in request->l_flock with information about the lock blocking the
 * request.  The complexity here is that lock manager requests are allowed
 * to see into the upper part of the 32-bit address range, whereas local
 * requests are only allowed to see signed values.
 *
 * What should be done when "blocker" is a lock manager lock that uses the
 * upper portion of the 32-bit range, but "request" is local?  Since the
 * request has already been determined to have been blocked by the blocker,
 * at least some portion of "blocker" must be in the range of the request,
 * or the request extends to the end of file.  For the first case, the
 * portion in the lower range is returned with the indication that it goes
 * "to EOF."  For the second case, the last byte of the lower range is
 * returned with the indication that it goes "to EOF."
 */
static void
report_blocker(lock_descriptor_t *blocker, lock_descriptor_t *request)
{
	flock64_t *flrp;			/* l_flock portion of request */

	ASSERT(blocker != NULL);

	flrp = &request->l_flock;
	flrp->l_whence = 0;
	flrp->l_type = blocker->l_type;
	flrp->l_pid = blocker->l_flock.l_pid;
	flrp->l_sysid = blocker->l_flock.l_sysid;
	request->l_ofd = blocker->l_ofd;

	if (IS_LOCKMGR(request)) {
		flrp->l_start = blocker->l_start;
		if (blocker->l_end == MAX_U_OFFSET_T)
			flrp->l_len = 0;
		else
			flrp->l_len = blocker->l_end - blocker->l_start + 1;
	} else {
		if (blocker->l_start > MAXEND) {
			flrp->l_start = MAXEND;
			flrp->l_len = 0;
		} else {
			flrp->l_start = blocker->l_start;
			if (blocker->l_end == MAX_U_OFFSET_T)
				flrp->l_len = 0;
			else
				flrp->l_len = blocker->l_end -
				    blocker->l_start + 1;
		}
	}
}
/*
 * PSARC case 1997/292
 *
 * This is the public routine exported by flock.h.
 */
void
cl_flk_change_nlm_state_to_unknown(int nlmid)
{
	/*
	 * Check to see if node is booted as a cluster. If not, return.
	 */
	if ((cluster_bootflags & CLUSTER_BOOTED) == 0) {
		return;
	}

	/*
	 * See comment in cl_flk_set_nlm_status().
	 */
	if (nlm_reg_status == NULL) {
		return;
	}

	/*
	 * protect NLM registry state with a mutex.
	 */
	ASSERT(nlmid <= nlm_status_size && nlmid >= 0);
	mutex_enter(&nlm_reg_lock);
	FLK_REGISTRY_CHANGE_NLM_STATE(nlm_reg_status, nlmid, FLK_NLM_UNKNOWN);
	mutex_exit(&nlm_reg_lock);
}
/*
 * Return non-zero if the given I/O request conflicts with an active NBMAND
 * lock.
 * If svmand is non-zero, it means look at all active locks, not just NBMAND
 * locks.
 */
int
nbl_lock_conflict(vnode_t *vp, nbl_op_t op, u_offset_t offset,
    ssize_t length, int svmand, caller_context_t *ct)
{
	int conflict = 0;
	graph_t			*gp;
	lock_descriptor_t	*lock;
	pid_t pid;
	int sysid;

	if (ct == NULL) {
		pid = curproc->p_pid;
		sysid = 0;
	} else {
		pid = ct->cc_pid;
		sysid = ct->cc_sysid;
	}

	mutex_enter(&flock_lock);
	gp = lock_graph[HASH_INDEX(vp)];
	mutex_exit(&flock_lock);
	if (gp == NULL)
		return (0);

	mutex_enter(&gp->gp_mutex);
	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);

	for (; lock && lock->l_vnode == vp; lock = lock->l_next) {
		if ((svmand || (lock->l_state & NBMAND_LOCK)) &&
		    (lock->l_flock.l_sysid != sysid ||
		    lock->l_flock.l_pid != pid) &&
		    lock_blocks_io(op, offset, length,
		    lock->l_type, lock->l_start, lock->l_end)) {
			conflict = 1;
			break;
		}
	}
	mutex_exit(&gp->gp_mutex);

	return (conflict);
}
/*
 * Return non-zero if the given I/O request conflicts with the given lock.
 */
static int
lock_blocks_io(nbl_op_t op, u_offset_t offset, ssize_t length,
    int lock_type, u_offset_t lock_start, u_offset_t lock_end)
{
	ASSERT(op == NBL_READ || op == NBL_WRITE || op == NBL_READWRITE);
	ASSERT(lock_type == F_RDLCK || lock_type == F_WRLCK);

	if (op == NBL_READ && lock_type == F_RDLCK)
		return (0);

	if (offset <= lock_start && lock_start < offset + length)
		return (1);
	if (lock_start <= offset && offset <= lock_end)
		return (1);

	return (0);
}
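/*
 * Worked example (illustrative): for an NBL_WRITE of length 10 at offset 100
 * against an F_RDLCK with lock_start == 105 and lock_end == 200, the first
 * range test (100 <= 105 && 105 < 110) holds, so the request conflicts and
 * 1 is returned.  An NBL_READ against the same F_RDLCK returns 0 right away,
 * since two read locks never block each other here.
 */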
static void
check_active_locks(graph_t *gp)
{
	lock_descriptor_t *lock, *lock1;
	edge_t	*ep;

	for (lock = ACTIVE_HEAD(gp)->l_next; lock != ACTIVE_HEAD(gp);
	    lock = lock->l_next) {
		ASSERT(IS_ACTIVE(lock));
		ASSERT(NOT_BLOCKED(lock));
		ASSERT(!IS_BARRIER(lock));

		ep = FIRST_IN(lock);

		while (ep != HEAD(lock)) {
			ASSERT(IS_SLEEPING(ep->from_vertex));
			ASSERT(!NOT_BLOCKED(ep->from_vertex));
			ep = NEXT_IN(ep);
		}

		for (lock1 = lock->l_next; lock1 != ACTIVE_HEAD(gp);
		    lock1 = lock1->l_next) {
			if (lock1->l_vnode == lock->l_vnode) {
				if (BLOCKS(lock1, lock)) {
					cmn_err(CE_PANIC,
					    "active lock %p blocks %p",
					    (void *)lock1, (void *)lock);
				} else if (BLOCKS(lock, lock1)) {
					cmn_err(CE_PANIC,
					    "active lock %p blocks %p",
					    (void *)lock, (void *)lock1);
				}
			}
		}
	}
}
/*
 * Effect: This function checks to see if the transition from 'old_state' to
 *	'new_state' is a valid one.  It returns 0 if the transition is valid
 *	and 1 if it is not.
 * For a map of valid transitions, see sys/flock_impl.h
 */
static int
check_lock_transition(int old_state, int new_state)
{
	switch (old_state) {
	case FLK_INITIAL_STATE:
		if ((new_state == FLK_START_STATE) ||
		    (new_state == FLK_SLEEPING_STATE) ||
		    (new_state == FLK_ACTIVE_STATE) ||
		    (new_state == FLK_DEAD_STATE)) {
			return (0);
		} else {
			return (1);
		}
	case FLK_START_STATE:
		if ((new_state == FLK_ACTIVE_STATE) ||
		    (new_state == FLK_DEAD_STATE)) {
			return (0);
		} else {
			return (1);
		}
	case FLK_ACTIVE_STATE:
		if (new_state == FLK_DEAD_STATE) {
			return (0);
		} else {
			return (1);
		}
	case FLK_SLEEPING_STATE:
		if ((new_state == FLK_GRANTED_STATE) ||
		    (new_state == FLK_INTERRUPTED_STATE) ||
		    (new_state == FLK_CANCELLED_STATE)) {
			return (0);
		} else {
			return (1);
		}
	case FLK_GRANTED_STATE:
		if ((new_state == FLK_START_STATE) ||
		    (new_state == FLK_INTERRUPTED_STATE) ||
		    (new_state == FLK_CANCELLED_STATE)) {
			return (0);
		} else {
			return (1);
		}
	case FLK_CANCELLED_STATE:
		if ((new_state == FLK_INTERRUPTED_STATE) ||
		    (new_state == FLK_DEAD_STATE)) {
			return (0);
		} else {
			return (1);
		}
	case FLK_INTERRUPTED_STATE:
		if (new_state == FLK_DEAD_STATE) {
			return (0);
		} else {
			return (1);
		}
	case FLK_DEAD_STATE:
		/* May be set more than once */
		if (new_state == FLK_DEAD_STATE) {
			return (0);
		} else {
			return (1);
		}
	default:
		return (1);
	}
}
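/*
 * Illustrative note: per the switch above, FLK_SLEEPING_STATE ->
 * FLK_GRANTED_STATE is a valid transition (returns 0), while
 * FLK_SLEEPING_STATE -> FLK_ACTIVE_STATE is not (returns 1); a sleeping
 * request must first be granted before it can become active, which is what
 * the CHECK_LOCK_TRANSITION() debug check enforces.
 */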
static void
check_sleeping_locks(graph_t *gp)
{
	lock_descriptor_t *lock1, *lock2;
	edge_t *ep;

	for (lock1 = SLEEPING_HEAD(gp)->l_next; lock1 != SLEEPING_HEAD(gp);
	    lock1 = lock1->l_next) {
		ASSERT(!IS_BARRIER(lock1));
		for (lock2 = lock1->l_next; lock2 != SLEEPING_HEAD(gp);
		    lock2 = lock2->l_next) {
			if (lock1->l_vnode == lock2->l_vnode) {
				if (BLOCKS(lock2, lock1)) {
					ASSERT(!IS_GRANTED(lock1));
					ASSERT(!NOT_BLOCKED(lock1));
					path(lock1, lock2);
				}
			}
		}

		for (lock2 = ACTIVE_HEAD(gp)->l_next; lock2 != ACTIVE_HEAD(gp);
		    lock2 = lock2->l_next) {
			ASSERT(!IS_BARRIER(lock1));
			if (lock1->l_vnode == lock2->l_vnode) {
				if (BLOCKS(lock2, lock1)) {
					ASSERT(!IS_GRANTED(lock1));
					ASSERT(!NOT_BLOCKED(lock1));
					path(lock1, lock2);
				}
			}
		}
		ep = FIRST_ADJ(lock1);
		while (ep != HEAD(lock1)) {
			ASSERT(BLOCKS(ep->to_vertex, lock1));
			ep = NEXT_ADJ(ep);
		}
	}
}
static int
level_two_path(lock_descriptor_t *lock1, lock_descriptor_t *lock2, int no_path)
{
	edge_t	*ep;
	lock_descriptor_t	*vertex;
	lock_descriptor_t *vertex_stack;

	STACK_INIT(vertex_stack);

	flk_graph_uncolor(lock1->l_graph);
	ep = FIRST_ADJ(lock1);
	ASSERT(ep != HEAD(lock1));
	while (ep != HEAD(lock1)) {
		if (no_path)
			ASSERT(ep->to_vertex != lock2);
		STACK_PUSH(vertex_stack, ep->to_vertex, l_dstack);
		COLOR(ep->to_vertex);
		ep = NEXT_ADJ(ep);
	}

	while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
		STACK_POP(vertex_stack, l_dstack);
		for (ep = FIRST_ADJ(vertex); ep != HEAD(vertex);
		    ep = NEXT_ADJ(ep)) {
			if (COLORED(ep->to_vertex))
				continue;
			COLOR(ep->to_vertex);
			if (ep->to_vertex == lock2)
				return (1);

			STACK_PUSH(vertex_stack, ep->to_vertex, l_dstack);
		}
	}
	return (0);
}
static void
check_owner_locks(graph_t *gp, pid_t pid, int sysid, vnode_t *vp)
{
	lock_descriptor_t *lock;

	/* Ignore OFD style locks since they're not process-wide. */
	if (pid == 0)
		return;

	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);

	if (lock) {
		while (lock != ACTIVE_HEAD(gp) && (lock->l_vnode == vp)) {
			if (lock->l_flock.l_pid == pid &&
			    lock->l_flock.l_sysid == sysid)
				cmn_err(CE_PANIC,
				    "owner pid %d's lock %p in active queue",
				    pid, (void *)lock);
			lock = lock->l_next;
		}
	}
	SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);

	if (lock) {
		while (lock != SLEEPING_HEAD(gp) && (lock->l_vnode == vp)) {
			if (lock->l_flock.l_pid == pid &&
			    lock->l_flock.l_sysid == sysid)
				cmn_err(CE_PANIC,
				    "owner pid %d's lock %p in sleep queue",
				    pid, (void *)lock);
			lock = lock->l_next;
		}
	}
}
static int
level_one_path(lock_descriptor_t *lock1, lock_descriptor_t *lock2)
{
	edge_t	*ep = FIRST_ADJ(lock1);

	while (ep != HEAD(lock1)) {
		if (ep->to_vertex == lock2)
			return (1);
		else
			ep = NEXT_ADJ(ep);
	}
	return (0);
}
static int
no_path(lock_descriptor_t *lock1, lock_descriptor_t *lock2)
{
	return (!level_two_path(lock1, lock2, 1));
}
static void
path(lock_descriptor_t *lock1, lock_descriptor_t *lock2)
{
	if (level_one_path(lock1, lock2)) {
		if (level_two_path(lock1, lock2, 0) != 0) {
			cmn_err(CE_WARN,
			    "one edge one path from lock1 %p lock2 %p",
			    (void *)lock1, (void *)lock2);
		}
	} else if (no_path(lock1, lock2)) {
		cmn_err(CE_PANIC,
		    "No path from lock1 %p to lock2 %p",
		    (void *)lock1, (void *)lock2);
	}
}