 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 */

/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

/*
 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
 * Copyright 2015 Joyent, Inc.
 */
#include <sys/flock_impl.h>
#include <sys/t_lock.h>		/* for <sys/callb.h> */
#include <sys/callb.h>
#include <sys/clconf.h>
#include <sys/cladm.h>
#include <sys/nbmlock.h>
#include <sys/policy.h>
/*
 * The following four variables are for statistics purposes and they are
 * not protected by locks. They may not be accurate but will at least be
 * close to the actual value.
 */

int flk_proc_vertex_allocs;
int flk_proc_edge_allocs;
int flk_proc_vertex_frees;
int flk_proc_edge_frees;

static kmutex_t flock_lock;
#ifdef DEBUG

#define	CHECK_ACTIVE_LOCKS(gp)	if (check_debug)	\
	check_active_locks(gp);
#define	CHECK_SLEEPING_LOCKS(gp)	if (check_debug)	\
	check_sleeping_locks(gp);
#define	CHECK_OWNER_LOCKS(gp, pid, sysid, vp)	\
		if (check_debug)	\
			check_owner_locks(gp, pid, sysid, vp);
#define	CHECK_LOCK_TRANSITION(old_state, new_state) \
	{ \
		if (check_lock_transition(old_state, new_state)) { \
			cmn_err(CE_PANIC, "Illegal lock transition \
			    from %d to %d", old_state, new_state); \
		} \
	}

#else

#define	CHECK_ACTIVE_LOCKS(gp)
#define	CHECK_SLEEPING_LOCKS(gp)
#define	CHECK_OWNER_LOCKS(gp, pid, sysid, vp)
#define	CHECK_LOCK_TRANSITION(old_state, new_state)

#endif /* DEBUG */
struct kmem_cache *flk_edge_cache;

graph_t		*lock_graph[HASH_SIZE];
/*
 * NLM REGISTRY TYPE IMPLEMENTATION
 *
 * 1.	Nodes in a cluster are numbered starting at 1; always non-negative
 *	integers; maximum node id is returned by clconf_maximum_nodeid().
 * 2.	We use this node id to identify the node an NLM server runs on.
 *
 * NLM registry object keeps track of NLM servers via their
 * nlmids (which are the node ids of the node in the cluster they run on)
 * that have requested locks at this LLM with which this registry is
 * associated.
 *
 * Representation of abstraction:
 *	rep = record[ states: array[nlm_state],
 *
 * Representation invariants:
 * 1. index i of rep.states is between 0 and n - 1 where n is number
 *    of elements in the array, which happen to be the maximum number
 *    of nodes in the cluster configuration + 1.
 * 2. map nlmid to index i of rep.states
 *		n-1 -> clconf_maximum_nodeid()+1
 * 3. This 1-1 mapping is quite convenient and it avoids errors resulting
 *    from forgetting to subtract 1 from the index.
 * 4. The reason we keep the 0th index is the following. A legitimate
 *    cluster configuration includes making a UFS file system NFS
 *    exportable. The code is structured so that if you're in a cluster
 *    you do one thing; otherwise, you do something else. The problem
 *    is what to do if you think you're in a cluster with PXFS loaded,
 *    but you're using UFS not PXFS? The upper two bytes of the sysid
 *    encode the node id of the node where NLM server runs; these bytes
 *    are zero for UFS. Since the nodeid is used to index into the
 *    registry, we can record the NLM server state information at index
 *    0 using the same mechanism used for PXFS file locks!
 */
static flk_nlm_status_t *nlm_reg_status = NULL;	/* state array 0..N-1 */
static kmutex_t nlm_reg_lock;			/* lock to protect array */
static uint_t nlm_status_size;			/* size of state array */
/*
 * Although we need a global lock dependency graph (and associated data
 * structures), we also need a per-zone notion of whether the lock manager is
 * running, and so whether to allow lock manager requests or not.
 *
 * Thus, on a per-zone basis we maintain a ``global'' variable
 * (flk_lockmgr_status), protected by flock_lock, and set when the lock
 * manager is determined to be changing state (starting or stopping).
 *
 * Each graph/zone pair also has a copy of this variable, which is protected by
 * the graph's mutex.
 *
 * The per-graph copies are used to synchronize lock requests with shutdown
 * requests. The global copy is used to initialize the per-graph field when a
 * new graph is created.
 */
struct flock_globals {
	flk_lockmgr_status_t flk_lockmgr_status;
	flk_lockmgr_status_t lockmgr_status[HASH_SIZE];
};

zone_key_t flock_zone_key;
static void create_flock(lock_descriptor_t *, flock64_t *);
static lock_descriptor_t *flk_get_lock(void);
static void flk_free_lock(lock_descriptor_t *lock);
static void flk_get_first_blocking_lock(lock_descriptor_t *request);
static int flk_process_request(lock_descriptor_t *);
static int flk_add_edge(lock_descriptor_t *, lock_descriptor_t *, int, int);
static edge_t *flk_get_edge(void);
static int flk_wait_execute_request(lock_descriptor_t *);
static int flk_relation(lock_descriptor_t *, lock_descriptor_t *);
static void flk_insert_active_lock(lock_descriptor_t *);
static void flk_delete_active_lock(lock_descriptor_t *, int);
static void flk_insert_sleeping_lock(lock_descriptor_t *);
static void flk_graph_uncolor(graph_t *);
static void flk_wakeup(lock_descriptor_t *, int);
static void flk_free_edge(edge_t *);
static void flk_recompute_dependencies(lock_descriptor_t *,
    lock_descriptor_t **, int, int);
static int flk_find_barriers(lock_descriptor_t *);
static void flk_update_barriers(lock_descriptor_t *);
static int flk_color_reachables(lock_descriptor_t *);
static int flk_canceled(lock_descriptor_t *);
static void flk_delete_locks_by_sysid(lock_descriptor_t *);
static void report_blocker(lock_descriptor_t *, lock_descriptor_t *);
static void wait_for_lock(lock_descriptor_t *);
static void unlock_lockmgr_granted(struct flock_globals *);
static void wakeup_sleeping_lockmgr_locks(struct flock_globals *);

/* Clustering hooks */
static void cl_flk_change_nlm_state_all_locks(int, flk_nlm_status_t);
static void cl_flk_wakeup_sleeping_nlm_locks(int);
static void cl_flk_unlock_nlm_granted(int);

static int check_lock_transition(int, int);
static void check_sleeping_locks(graph_t *);
static void check_active_locks(graph_t *);
static int no_path(lock_descriptor_t *, lock_descriptor_t *);
static void path(lock_descriptor_t *, lock_descriptor_t *);
static void check_owner_locks(graph_t *, pid_t, int, vnode_t *);
static int level_one_path(lock_descriptor_t *, lock_descriptor_t *);
static int level_two_path(lock_descriptor_t *, lock_descriptor_t *, int);

/* proc_graph function definitions */
static int flk_check_deadlock(lock_descriptor_t *);
static void flk_proc_graph_uncolor(void);
static proc_vertex_t *flk_get_proc_vertex(lock_descriptor_t *);
static proc_edge_t *flk_get_proc_edge(void);
static void flk_proc_release(proc_vertex_t *);
static void flk_free_proc_edge(proc_edge_t *);
static void flk_update_proc_graph(edge_t *, int);

/* Non-blocking mandatory locking */
static int lock_blocks_io(nbl_op_t, u_offset_t, ssize_t, int, u_offset_t,
static struct flock_globals *
flk_get_globals(void)
{
	/*
	 * The KLM module had better be loaded if we're attempting to handle
	 * lockmgr requests.
	 */
	ASSERT(flock_zone_key != ZONE_KEY_UNINITIALIZED);
	return (zone_getspecific(flock_zone_key, curproc->p_zone));
}
static flk_lockmgr_status_t
flk_get_lockmgr_status(void)
{
	struct flock_globals *fg;

	ASSERT(MUTEX_HELD(&flock_lock));

	if (flock_zone_key == ZONE_KEY_UNINITIALIZED) {
		/*
		 * KLM module not loaded; lock manager definitely not running.
		 */
		return (FLK_LOCKMGR_DOWN);
	}
	fg = flk_get_globals();
	return (fg->flk_lockmgr_status);
}
/*
 * This implements Open File Description (not descriptor) style record locking.
 * These locks can also be thought of as pid-less since they are not tied to a
 * specific process, thus they're preserved across fork.
 *
 * Called directly from fcntl.
 *
 * See reclock() for the implementation of the traditional POSIX style record
 * locking scheme (pid-ful). This function is derived from reclock() but
 * simplified and modified to work for OFD style locking.
 *
 * The two primary advantages of OFD style of locking are:
 * 1) It is per-file description, so closing a file descriptor that refers to a
 *    different file description for the same file will not drop the lock (i.e.
 *    two open's of the same file get different descriptions but a dup or fork
 *    will refer to the same description).
 * 2) Locks are preserved across fork(2).
 *
 * Because these locks are per-description a lock ptr lives at the f_filocks
 * member of the file_t and the lock_descriptor includes a file_t pointer
 * to enable unique lock identification and management.
 *
 * Since these locks are pid-less we cannot do deadlock detection with the
 * current process-oriented implementation. This is consistent with OFD locking
 * behavior on other operating systems such as Linux. Since we don't do
 * deadlock detection we never interact with the process graph that is
 * maintained for deadlock detection on the traditional POSIX-style locks.
 *
 * The current implementation does not support record locks. That is,
 * currently the single lock must cover the entire file. This is validated in
 * fcntl. To support record locks the f_filock pointer in the file_t needs to
 * be changed to a list of pointers to the locks. That list needs to be
 * managed independently of the lock list on the vnode itself and it needs to
 * be maintained as record locks are created, split, coalesced and deleted.
 *
 * The current implementation does not support remote file systems (e.g.
 * NFS or CIFS). This is handled in fs_frlock(). The design of how OFD locks
 * interact with the NLM is not clear since the NLM protocol/implementation
 * appears to be oriented around locks associated with a process. A further
 * problem is that a design is needed for what nlm_send_siglost() should do and
 * where it will send SIGLOST. More recent versions of Linux apparently try to
 * emulate OFD locks on NFS by converting them to traditional POSIX style locks
 * that work with the NLM. It is not clear that this provides the correct
 * semantics in all cases.
 */
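/*
 * Illustrative user-level sketch of the OFD semantics described above:
 * the lock belongs to the open file description, so a dup()ed descriptor
 * shares it and it survives fork(2).  Error handling is omitted and the
 * path is an arbitrary example.
 */
#if 0
#include <fcntl.h>
#include <unistd.h>

static int
ofd_lock_whole_file(const char *path)
{
	struct flock fl = { 0 };
	int fd = open(path, O_RDWR | O_CREAT, 0644);

	if (fd < 0)
		return (-1);

	/* OFD locks currently must cover the whole file (see above) */
	fl.l_type = F_WRLCK;
	fl.l_whence = SEEK_SET;
	fl.l_start = 0;
	fl.l_len = 0;

	if (fcntl(fd, F_OFD_SETLK, &fl) == -1) {
		(void) close(fd);
		return (-1);
	}
	return (fd);	/* the description behind fd now owns the lock */
}
#endif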
int
ofdlock(file_t *fp, int fcmd, flock64_t *lckdat, int flag, u_offset_t offset)
{
	lock_descriptor_t stack_lock_request;
	lock_descriptor_t *lock_request;

	if (fcmd != F_OFD_GETLK)

	if (fcmd == F_OFD_SETLKW || fcmd == F_FLOCKW)

	/* see block comment */
	VERIFY(lckdat->l_whence == 0);
	VERIFY(lckdat->l_start == 0);
	VERIFY(lckdat->l_len == 0);

	/*
	 * For reclock fs_frlock() would normally have set these in a few
	 * places but for us it's cleaner to centralize it here. Note that
	 * IGN_PID is -1. We use 0 for our pid-less locks.
	 */

	/*
	 * Check access permissions
	 */
	if ((fcmd == F_OFD_SETLK || fcmd == F_OFD_SETLKW) &&
	    ((lckdat->l_type == F_RDLCK && (flag & FREAD) == 0) ||
	    (lckdat->l_type == F_WRLCK && (flag & FWRITE) == 0)))

	/*
	 * for query and unlock we use the stack_lock_request
	 */
	if (lckdat->l_type == F_UNLCK || !(cmd & SETFLCK)) {
		lock_request = &stack_lock_request;
		(void) bzero((caddr_t)lock_request,
		    sizeof (lock_descriptor_t));

		/*
		 * following is added to make the assertions in
		 * flk_execute_request() pass
		 */
		lock_request->l_edge.edge_in_next = &lock_request->l_edge;
		lock_request->l_edge.edge_in_prev = &lock_request->l_edge;
		lock_request->l_edge.edge_adj_next = &lock_request->l_edge;
		lock_request->l_edge.edge_adj_prev = &lock_request->l_edge;
		lock_request->l_status = FLK_INITIAL_STATE;

		lock_request = flk_get_lock();
		fp->f_filock = (struct filock *)lock_request;

	lock_request->l_state = 0;
	lock_request->l_vnode = vp;
	lock_request->l_zoneid = getzoneid();
	lock_request->l_ofd = fp;

	/*
	 * Convert the request range into the canonical start and end
	 * values then check the validity of the lock range.
	 */
	error = flk_convert_lock_data(vp, lckdat, &lock_request->l_start,
	    &lock_request->l_end, offset);

	error = flk_check_lock_data(lock_request->l_start, lock_request->l_end,
	    MAXEND);

	ASSERT(lock_request->l_end >= lock_request->l_start);

	lock_request->l_type = lckdat->l_type;

		lock_request->l_state |= WILLING_TO_SLEEP_LOCK;

	if (!(cmd & SETFLCK)) {
		if (lock_request->l_type == F_RDLCK ||
		    lock_request->l_type == F_WRLCK)
			lock_request->l_state |= QUERY_LOCK;

	lock_request->l_flock = (*lckdat);

	/*
	 * We are ready for processing the request
	 */

	if (fcmd != F_OFD_GETLK && lock_request->l_type != F_UNLCK &&
	    nbl_need_check(vp)) {
		nbl_start_crit(vp, RW_WRITER);

	/* Get the lock graph for a particular vnode */
	gp = flk_get_lock_graph(vp, FLK_INIT_GRAPH);

	mutex_enter(&gp->gp_mutex);

	lock_request->l_state |= REFERENCED_LOCK;
	lock_request->l_graph = gp;

	switch (lock_request->l_type) {
		if (IS_QUERY_LOCK(lock_request)) {
			flk_get_first_blocking_lock(lock_request);
			if (lock_request->l_ofd != NULL)
				lock_request->l_flock.l_pid = -1;
			(*lckdat) = lock_request->l_flock;

			/* process the request now */
			error = flk_process_request(lock_request);

		/* unlock request will not block so execute it immediately */
		error = flk_execute_request(lock_request);

	if (lock_request == &stack_lock_request) {
		flk_set_state(lock_request, FLK_DEAD_STATE);

		lock_request->l_state &= ~REFERENCED_LOCK;
		if ((error != 0) || IS_DELETED(lock_request)) {
			flk_set_state(lock_request, FLK_DEAD_STATE);
			flk_free_lock(lock_request);

	mutex_exit(&gp->gp_mutex);

	flk_set_state(lock_request, FLK_DEAD_STATE);
	if (lock_request != &stack_lock_request)
		flk_free_lock(lock_request);
/*
 * Remove any lock on the vnode belonging to the given file_t.
 * Called from closef on last close, file_t is locked.
 *
 * This is modeled on the cleanlocks() function but only removes the single
 * lock associated with fp.
 */
void
ofdcleanlock(file_t *fp)
{
	lock_descriptor_t *fplock, *lock, *nlock;

	ASSERT(MUTEX_HELD(&fp->f_tlock));

	if ((fplock = (lock_descriptor_t *)fp->f_filock) == NULL)

	gp = flk_get_lock_graph(vp, FLK_USE_GRAPH);

	mutex_enter(&gp->gp_mutex);

	CHECK_SLEEPING_LOCKS(gp);
	CHECK_ACTIVE_LOCKS(gp);

	SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);

			nlock = lock->l_next;
			if (fplock == lock) {

		} while (lock->l_vnode == vp);

	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);

			nlock = lock->l_next;
			if (fplock == lock) {
				flk_delete_active_lock(lock, 0);

		} while (lock->l_vnode == vp);

	CHECK_SLEEPING_LOCKS(gp);
	CHECK_ACTIVE_LOCKS(gp);
	mutex_exit(&gp->gp_mutex);
/*
 * Routine called from fs_frlock in fs/fs_subr.c
 *
 * This implements traditional POSIX style record locking. The two primary
 * drawbacks to this style of locking are:
 * 1) It is per-process, so any close of a file descriptor that refers to the
 *    file will drop the lock (e.g. lock /etc/passwd, call a library function
 *    which opens /etc/passwd to read the file, when the library closes its
 *    file descriptor the application loses its lock and does not know).
 * 2) Locks are not preserved across fork(2).
 *
 * Because these locks are only associated with a PID, they are per-process.
 * This is why any close will drop the lock and is also why, once the process
 * forks, the lock is no longer related to the new process. These locks can
 * be considered as PID-ful.
 *
 * See ofdlock() for the implementation of a similar but improved locking
 * scheme.
 */
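/*
 * Illustrative user-level sketch of drawback 1) above: with per-process
 * POSIX locks, closing any descriptor for the file releases the process's
 * lock, even a descriptor the caller did not lock through.  Error handling
 * is omitted and the path is an arbitrary example.
 */
#if 0
#include <fcntl.h>
#include <unistd.h>

static void
posix_lock_gotcha(void)
{
	struct flock fl = { 0 };
	int fd1 = open("/tmp/example", O_RDWR | O_CREAT, 0644);
	int fd2 = open("/tmp/example", O_RDONLY);

	fl.l_type = F_WRLCK;		/* whole-file write lock via fd1 */
	fl.l_whence = SEEK_SET;
	(void) fcntl(fd1, F_SETLKW, &fl);

	/*
	 * Closing the other descriptor drops the lock taken through fd1,
	 * because the lock is owned by the pid, not by the descriptor.
	 */
	(void) close(fd2);
	(void) close(fd1);
}
#endif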
int
reclock(vnode_t *vp, flock64_t *lckdat, int cmd, int flag, u_offset_t offset,
	flk_callback_t *flk_cbp)
{
	lock_descriptor_t stack_lock_request;
	lock_descriptor_t *lock_request;

	/*
	 * Check access permissions
	 */
	if ((cmd & SETFLCK) &&
	    ((lckdat->l_type == F_RDLCK && (flag & FREAD) == 0) ||
	    (lckdat->l_type == F_WRLCK && (flag & FWRITE) == 0)))

	/*
	 * for query and unlock we use the stack_lock_request
	 */
	if ((lckdat->l_type == F_UNLCK) ||
	    !((cmd & INOFLCK) || (cmd & SETFLCK))) {
		lock_request = &stack_lock_request;
		(void) bzero((caddr_t)lock_request,
		    sizeof (lock_descriptor_t));

		/*
		 * following is added to make the assertions in
		 * flk_execute_request() to pass through
		 */
		lock_request->l_edge.edge_in_next = &lock_request->l_edge;
		lock_request->l_edge.edge_in_prev = &lock_request->l_edge;
		lock_request->l_edge.edge_adj_next = &lock_request->l_edge;
		lock_request->l_edge.edge_adj_prev = &lock_request->l_edge;
		lock_request->l_status = FLK_INITIAL_STATE;

		lock_request = flk_get_lock();

	lock_request->l_state = 0;
	lock_request->l_vnode = vp;
	lock_request->l_zoneid = getzoneid();

	/*
	 * Convert the request range into the canonical start and end
	 * values. The NLM protocol supports locking over the entire
	 * 32-bit range, so there's no range checking for remote requests,
	 * but we still need to verify that local requests obey the rules.
	 */
	if ((cmd & (RCMDLCK | PCMDLCK)) != 0) {
		ASSERT(lckdat->l_whence == 0);
		lock_request->l_start = lckdat->l_start;
		lock_request->l_end = (lckdat->l_len == 0) ? MAX_U_OFFSET_T :
		    lckdat->l_start + (lckdat->l_len - 1);

		/* check the validity of the lock range */
		error = flk_convert_lock_data(vp, lckdat,
		    &lock_request->l_start, &lock_request->l_end,
		    offset);

		error = flk_check_lock_data(lock_request->l_start,
		    lock_request->l_end, MAXEND);

	ASSERT(lock_request->l_end >= lock_request->l_start);

	lock_request->l_type = lckdat->l_type;

		lock_request->l_state |= IO_LOCK;

		lock_request->l_state |= WILLING_TO_SLEEP_LOCK;

		lock_request->l_state |= LOCKMGR_LOCK;

		lock_request->l_state |= NBMAND_LOCK;

	/*
	 * Clustering: set flag for PXFS locks
	 * We do not _only_ check for the PCMDLCK flag because PXFS locks could
	 * also be of type 'RCMDLCK'.
	 * We do not _only_ check the GETPXFSID() macro because local PXFS
	 * clients use a pxfsid of zero to permit deadlock detection in the LLM.
	 */
	if ((cmd & PCMDLCK) || (GETPXFSID(lckdat->l_sysid) != 0)) {
		lock_request->l_state |= PXFS_LOCK;

	if (!((cmd & SETFLCK) || (cmd & INOFLCK))) {
		if (lock_request->l_type == F_RDLCK ||
		    lock_request->l_type == F_WRLCK)
			lock_request->l_state |= QUERY_LOCK;

	lock_request->l_flock = (*lckdat);
	lock_request->l_callbacks = flk_cbp;

	/*
	 * We are ready for processing the request
	 */
	if (IS_LOCKMGR(lock_request)) {
		/*
		 * If the lock request is an NLM server request ....
		 */
		if (nlm_status_size == 0) { /* not booted as cluster */
			mutex_enter(&flock_lock);

			/*
			 * Bail out if this is a lock manager request and the
			 * lock manager is not supposed to be running.
			 */
			if (flk_get_lockmgr_status() != FLK_LOCKMGR_UP) {
				mutex_exit(&flock_lock);

			mutex_exit(&flock_lock);
		} else {	/* booted as a cluster */
			nlmid = GETNLMID(lock_request->l_flock.l_sysid);
			ASSERT(nlmid <= nlm_status_size && nlmid >= 0);

			mutex_enter(&nlm_reg_lock);

			/*
			 * If the NLM registry does not know about this
			 * NLM server making the request, add its nlmid
			 * to the registry.
			 */
			if (FLK_REGISTRY_IS_NLM_UNKNOWN(nlm_reg_status,
			    nlmid)) {
				FLK_REGISTRY_ADD_NLMID(nlm_reg_status, nlmid);
			} else if (!FLK_REGISTRY_IS_NLM_UP(nlm_reg_status,
			    nlmid)) {
				/*
				 * If the NLM server is already known (has made
				 * previous lock requests) and its state is
				 * not NLM_UP (means that NLM server is
				 * shutting down), then bail out with an
				 * error to deny the lock request.
				 */
				mutex_exit(&nlm_reg_lock);

			mutex_exit(&nlm_reg_lock);

	/* Now get the lock graph for a particular vnode */
	gp = flk_get_lock_graph(vp, FLK_INIT_GRAPH);

	/*
	 * We drop rwlock here otherwise this might end up causing a
	 * deadlock if this IOLOCK sleeps. (bugid # 1183392).
	 */
	if (IS_IO_LOCK(lock_request)) {
		VOP_RWUNLOCK(vp,
		    (lock_request->l_type == F_RDLCK) ?
		    V_WRITELOCK_FALSE : V_WRITELOCK_TRUE, NULL);

	mutex_enter(&gp->gp_mutex);

	lock_request->l_state |= REFERENCED_LOCK;
	lock_request->l_graph = gp;

	switch (lock_request->l_type) {
		if (IS_QUERY_LOCK(lock_request)) {
			flk_get_first_blocking_lock(lock_request);
			if (lock_request->l_ofd != NULL)
				lock_request->l_flock.l_pid = -1;
			(*lckdat) = lock_request->l_flock;

			/* process the request now */
			error = flk_process_request(lock_request);

		/* unlock request will not block so execute it immediately */
		if (IS_LOCKMGR(lock_request) &&
		    flk_canceled(lock_request)) {

			error = flk_execute_request(lock_request);

		/*
		 * Recovery mechanism to release lock manager locks when
		 * NFS client crashes and restarts. NFS server will clear
		 * old locks and grant new locks.
		 */
		if (lock_request->l_flock.l_sysid == 0) {
			mutex_exit(&gp->gp_mutex);

		if (secpolicy_nfs(CRED()) != 0) {
			mutex_exit(&gp->gp_mutex);

		flk_delete_locks_by_sysid(lock_request);
		lock_request->l_state &= ~REFERENCED_LOCK;
		flk_set_state(lock_request, FLK_DEAD_STATE);
		flk_free_lock(lock_request);
		mutex_exit(&gp->gp_mutex);

	/* Clustering: For blocked PXFS locks, return */
	if (error == PXFS_LOCK_BLOCKED) {
		lock_request->l_state &= ~REFERENCED_LOCK;
		mutex_exit(&gp->gp_mutex);

	/*
	 * Now that we have seen the status of locks in the system for
	 * this vnode we acquire the rwlock if it is an IO_LOCK.
	 */
	if (IS_IO_LOCK(lock_request)) {
		(void) VOP_RWLOCK(vp,
		    (lock_request->l_type == F_RDLCK) ?
		    V_WRITELOCK_FALSE : V_WRITELOCK_TRUE, NULL);

			lckdat->l_type = F_UNLCK;

			/*
			 * This wake up is needed otherwise
			 * if IO_LOCK has slept the dependents on this
			 * will not be woken up at all. (bugid # 1185482).
			 */
			flk_wakeup(lock_request, 1);
			flk_set_state(lock_request, FLK_DEAD_STATE);
			flk_free_lock(lock_request);

		/*
		 * else if error had occurred either flk_process_request()
		 * has returned EDEADLK in which case there will be no
		 * dependents for this lock or EINTR from flk_wait_execute_
		 * request() in which case flk_cancel_sleeping_lock()
		 * would have been done. The same is true with EBADF.
		 */

	if (lock_request == &stack_lock_request) {
		flk_set_state(lock_request, FLK_DEAD_STATE);

		lock_request->l_state &= ~REFERENCED_LOCK;
		if ((error != 0) || IS_DELETED(lock_request)) {
			flk_set_state(lock_request, FLK_DEAD_STATE);
			flk_free_lock(lock_request);

	mutex_exit(&gp->gp_mutex);

	flk_set_state(lock_request, FLK_DEAD_STATE);
	if (lock_request != &stack_lock_request)
		flk_free_lock(lock_request);
/*
 * Invoke the callbacks in the given list. If before sleeping, invoke in
 * list order. If after sleeping, invoke in reverse order.
 *
 * CPR (suspend/resume) support: if one of the callbacks returns a
 * callb_cpr_t, return it. This will be used to make the thread CPR-safe
 * while it is sleeping. There should be at most one callb_cpr_t for the
 * thread.
 * XXX This is unnecessarily complicated. The CPR information should just
 * get passed in directly through VOP_FRLOCK and reclock, rather than
 * sneaking it in via a callback.
 */
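/*
 * Illustrative sketch of how a caller could hand reclock() a CPR-aware
 * callback through this list.  The callback and its argument are
 * hypothetical; it returns its callb_cpr_t only for FLK_BEFORE_SLEEP so
 * that flk_invoke_callbacks() can make the sleeping thread CPR-safe.
 */
#if 0
static callb_cpr_t *
example_frlock_callback(flk_cb_when_t when, void *arg)
{
	callb_cpr_t *cprp = arg;

	if (when == FLK_BEFORE_SLEEP)
		return (cprp);	/* about to block: hand back the CPR info */
	return (NULL);		/* FLK_AFTER_SLEEP: nothing more to do */
}

static int
example_call_reclock(vnode_t *vp, flock64_t *bfp, int cmd, int flag,
    u_offset_t offset, callb_cpr_t *cprp)
{
	flk_callback_t flk_cb;

	flk_init_callback(&flk_cb, example_frlock_callback, cprp);
	return (reclock(vp, bfp, cmd, flag, offset, &flk_cb));
}
#endif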
callb_cpr_t *
flk_invoke_callbacks(flk_callback_t *cblist, flk_cb_when_t when)
{
	callb_cpr_t *cpr_callbackp = NULL;
	callb_cpr_t *one_result;

	if (when == FLK_BEFORE_SLEEP) {
			one_result = (*cb->cb_callback)(when, cb->cb_data);
			if (one_result != NULL) {
				ASSERT(cpr_callbackp == NULL);
				cpr_callbackp = one_result;
		} while (cb != cblist);

		cb = cblist->cb_prev;
			one_result = (*cb->cb_callback)(when, cb->cb_data);
			if (one_result != NULL) {
				cpr_callbackp = one_result;
		} while (cb != cblist->cb_prev);

	return (cpr_callbackp);
/*
 * Initialize a flk_callback_t to hold the given callback.
 */
void
flk_init_callback(flk_callback_t *flk_cb,
	callb_cpr_t *(*cb_fcn)(flk_cb_when_t, void *), void *cbdata)
{
	flk_cb->cb_next = flk_cb;
	flk_cb->cb_prev = flk_cb;
	flk_cb->cb_callback = cb_fcn;
	flk_cb->cb_data = cbdata;
}
/*
 * Initialize an flk_callback_t and then link it into the head of an
 * existing list (which may be NULL).
 */
void
flk_add_callback(flk_callback_t *newcb,
	callb_cpr_t *(*cb_fcn)(flk_cb_when_t, void *),
	void *cbdata, flk_callback_t *cblist)
{
	flk_init_callback(newcb, cb_fcn, cbdata);

	newcb->cb_prev = cblist->cb_prev;
	newcb->cb_next = cblist;
	cblist->cb_prev->cb_next = newcb;
	cblist->cb_prev = newcb;
}
/*
 * Remove the callback from a list.
 */
void
flk_del_callback(flk_callback_t *flk_cb)
{
	flk_cb->cb_next->cb_prev = flk_cb->cb_prev;
	flk_cb->cb_prev->cb_next = flk_cb->cb_next;

	flk_cb->cb_prev = flk_cb;
	flk_cb->cb_next = flk_cb;
}
/*
 * Initialize the flk_edge_cache data structure and create the
 * nlm_reg_status array.
 */

	flk_edge_cache = kmem_cache_create("flk_edges",
	    sizeof (struct edge), 0, NULL, NULL, NULL, NULL, NULL, 0);
	if (flk_edge_cache == NULL) {
		cmn_err(CE_PANIC, "Couldn't create flk_edge_cache\n");
	}

	/*
	 * Create the NLM registry object.
	 */
	if (cluster_bootflags & CLUSTER_BOOTED) {
		/*
		 * This routine tells you the maximum node id that will be used
		 * in the cluster. This number will be the size of the nlm
		 * registry status array. We add 1 because we will be using
		 * all entries indexed from 0 to maxnodeid; e.g., from 0
		 * to 64, for a total of 65 entries.
		 */
		nlm_status_size = clconf_maximum_nodeid() + 1;

	if (nlm_status_size != 0) {	/* booted as a cluster */
		nlm_reg_status = (flk_nlm_status_t *)
		    kmem_alloc(sizeof (flk_nlm_status_t) * nlm_status_size,
		    KM_SLEEP);

		/* initialize all NLM states in array to NLM_UNKNOWN */
		for (i = 0; i < nlm_status_size; i++) {
			nlm_reg_status[i] = FLK_NLM_UNKNOWN;
/*
 * Zone constructor/destructor callbacks to be executed when a zone is
 * created/destroyed.
 */

flk_zone_init(zoneid_t zoneid)
{
	struct flock_globals *fg;

	fg = kmem_alloc(sizeof (*fg), KM_SLEEP);
	fg->flk_lockmgr_status = FLK_LOCKMGR_UP;
	for (i = 0; i < HASH_SIZE; i++)
		fg->lockmgr_status[i] = FLK_LOCKMGR_UP;

flk_zone_fini(zoneid_t zoneid, void *data)
{
	struct flock_globals *fg = data;

	kmem_free(fg, sizeof (*fg));
/*
 * Get a lock_descriptor structure with initialization of edge lists.
 */
static lock_descriptor_t *
flk_get_lock(void)
{
	lock_descriptor_t *l;

	l = kmem_zalloc(sizeof (lock_descriptor_t), KM_SLEEP);

	cv_init(&l->l_cv, NULL, CV_DRIVER, NULL);
	l->l_edge.edge_in_next = &l->l_edge;
	l->l_edge.edge_in_prev = &l->l_edge;
	l->l_edge.edge_adj_next = &l->l_edge;
	l->l_edge.edge_adj_prev = &l->l_edge;

	l->l_status = FLK_INITIAL_STATE;
/*
 * Free a lock_descriptor structure. Just sets the DELETED_LOCK flag
 * when some thread has a reference to it as in reclock().
 */
void
flk_free_lock(lock_descriptor_t *lock)
{
	ASSERT(IS_DEAD(lock));

	if ((fp = lock->l_ofd) != NULL && fp->f_filock == (struct filock *)lock)
		fp->f_filock = NULL;

	if (IS_REFERENCED(lock)) {
		lock->l_state |= DELETED_LOCK;

	kmem_free((void *)lock, sizeof (lock_descriptor_t));
void
flk_set_state(lock_descriptor_t *lock, int new_state)
{
	/*
	 * Locks in the sleeping list may be woken up in a number of ways,
	 * and more than once. If a sleeping lock is signaled awake more
	 * than once, then it may or may not change state depending on its
	 * current state.
	 * Also note that NLM locks that are sleeping could be moved to an
	 * interrupted state more than once if the unlock request is
	 * retransmitted by the NLM client - the second time around, this is
	 * a no-op.
	 * The ordering of being signaled awake is:
	 * INTERRUPTED_STATE > CANCELLED_STATE > GRANTED_STATE.
	 * The checks below implement this ordering.
	 */
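	/*
	 * Illustrative sketch of the precedence above (hypothetical macro;
	 * the real code uses the explicit checks that follow):
	 *
	 *	#define	WAKEUP_RANK(s)	((s) == FLK_INTERRUPTED_STATE ? 3 : \
	 *				(s) == FLK_CANCELLED_STATE ? 2 :    \
	 *				(s) == FLK_GRANTED_STATE ? 1 : 0)
	 *
	 * Among these three wakeup states, a transition is honored only when
	 * WAKEUP_RANK(new_state) > WAKEUP_RANK(lock->l_status); transitions
	 * to other states (e.g. the dead state) are not affected.
	 */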
	if (IS_INTERRUPTED(lock)) {
		if ((new_state == FLK_CANCELLED_STATE) ||
		    (new_state == FLK_GRANTED_STATE) ||
		    (new_state == FLK_INTERRUPTED_STATE)) {

	if (IS_CANCELLED(lock)) {
		if ((new_state == FLK_GRANTED_STATE) ||
		    (new_state == FLK_CANCELLED_STATE)) {

	CHECK_LOCK_TRANSITION(lock->l_status, new_state);
	if (IS_PXFS(lock)) {
		cl_flk_state_transition_notify(lock, lock->l_status, new_state);

	lock->l_status = new_state;
/*
 * Routine that checks whether there are any blocking locks in the system.
 *
 * The policy followed is if a write lock is sleeping we don't allow read
 * locks before this write lock even though there may not be any active
 * locks corresponding to the read locks' region.
 *
 * flk_add_edge() function adds an edge between l1 and l2 iff there
 * is no path between l1 and l2. This is done to have a "minimum
 * storage representation" of the dependency graph.
 *
 * Another property of the graph is since only the new request throws
 * edges to the existing locks in the graph, the graph is always topologically
 * ordered.
 */
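/*
 * Illustrative sketch of the "minimum storage representation" idea above,
 * on a generic adjacency list: an edge u -> v is added only when v is not
 * already reachable from u, so redundant edges never enter the graph.
 * Types and names are hypothetical stand-ins for the lock/edge structures
 * used by this file.
 */
#if 0
struct ex_node {
	struct ex_node	*adj[8];	/* outgoing edges (bounded for brevity) */
	int		nadj;
	int		mark;		/* visited flag for the DFS */
};

static int
ex_reachable(struct ex_node *from, struct ex_node *to)
{
	int i;

	if (from == to)
		return (1);
	from->mark = 1;
	for (i = 0; i < from->nadj; i++) {
		if (!from->adj[i]->mark && ex_reachable(from->adj[i], to))
			return (1);
	}
	return (0);
}

static void
ex_add_edge_if_needed(struct ex_node *from, struct ex_node *to)
{
	/* caller is expected to clear all mark fields first ("uncolor") */
	if (!ex_reachable(from, to))
		from->adj[from->nadj++] = to;
}
#endif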
static int
flk_process_request(lock_descriptor_t *request)
{
	graph_t *gp = request->l_graph;
	lock_descriptor_t *lock;
	int request_blocked_by_active = 0;
	int request_blocked_by_granted = 0;
	int request_blocked_by_sleeping = 0;
	vnode_t *vp = request->l_vnode;
	int request_will_wait = 0;
	int found_covering_lock = 0;
	lock_descriptor_t *covered_by = NULL;

	ASSERT(MUTEX_HELD(&gp->gp_mutex));
	request_will_wait = IS_WILLING_TO_SLEEP(request);

	/*
	 * check active locks
	 */
	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);

			if (BLOCKS(lock, request)) {
				if (!request_will_wait)
				request_blocked_by_active = 1;

			/*
			 * Grant lock if it is for the same owner holding active
			 * lock that covers the request.
			 */
			if (SAME_OWNER(lock, request) &&
			    COVERS(lock, request) &&
			    (request->l_type == F_RDLCK))
				return (flk_execute_request(request));
			lock = lock->l_next;
		} while (lock->l_vnode == vp);

	if (!request_blocked_by_active) {
		lock_descriptor_t *lk[1];
		lock_descriptor_t *first_glock = NULL;

		/*
		 * Shall we grant this?! NO!!
		 * What about those locks that were just granted and still
		 * in sleep queue. Those threads are woken up and so locks
		 * are almost active.
		 */
		SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);

				if (BLOCKS(lock, request)) {
					if (IS_GRANTED(lock)) {
						request_blocked_by_granted = 1;
						request_blocked_by_sleeping = 1;

				lock = lock->l_next;
			} while ((lock->l_vnode == vp));
			first_glock = lock->l_prev;
			ASSERT(first_glock->l_vnode == vp);

		if (request_blocked_by_granted)

		if (!request_blocked_by_sleeping) {
			/*
			 * If the request isn't going to be blocked by a
			 * sleeping request, we know that it isn't going to
			 * be blocked; we can just execute the request --
			 * without performing costly deadlock detection.
			 */
			ASSERT(!request_blocked_by_active);
			return (flk_execute_request(request));
		} else if (request->l_type == F_RDLCK) {
			/*
			 * If we have a sleeping writer in the requested
			 * lock's range, block.
			 */

		request->l_state |= RECOMPUTE_LOCK;
		SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);

				flk_recompute_dependencies(lock, lk, 1, 0);
				lock = lock->l_next;
			} while (lock->l_vnode == vp);

				if (IS_GRANTED(lock)) {
					flk_recompute_dependencies(lock, lk, 1, 0);
				lock = lock->l_prev;
			} while ((lock->l_vnode == vp));

		request->l_state &= ~RECOMPUTE_LOCK;
		if (!NO_DEPENDENTS(request) && flk_check_deadlock(request))
		return (flk_execute_request(request));

	if (request_will_wait)
		flk_graph_uncolor(gp);

	/* check sleeping locks */

	SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);

	/*
	 * If we find a sleeping write lock that is a superset of the
	 * region wanted by request we can be assured that by adding an
	 * edge to this write lock we have paths to all locks in the
	 * graph that blocks the request except in one case and that is why
	 * another check for SAME_OWNER in the loop below. The exception
	 * case is when this process that owns the sleeping write lock 'l1'
	 * has other locks l2, l3, l4 that are in the system and arrived
	 * before l1. l1 does not have path to these locks as they are from
	 * same process. We break when we find a second covering sleeping
	 * lock l5 owned by a process different from that owning l1, because
	 * there cannot be any of l2, l3, l4, etc., arrived before l5, and if
	 * it has l1 would have produced a deadlock already.
	 */

			if (BLOCKS(lock, request)) {
				if (!request_will_wait)
				if (COVERS(lock, request) &&
				    lock->l_type == F_WRLCK) {
					if (found_covering_lock &&
					    !SAME_OWNER(lock, covered_by)) {
						found_covering_lock++;
					found_covering_lock = 1;

				if (found_covering_lock &&
				    !SAME_OWNER(lock, covered_by)) {
					lock = lock->l_next;

				if ((error = flk_add_edge(request, lock,
				    !found_covering_lock, 0)))

			lock = lock->l_next;
		} while (lock->l_vnode == vp);

	/*
	 * found_covering_lock == 2 iff at this point 'request' has paths
	 * to all locks that blocks 'request'. found_covering_lock == 1 iff at this
	 * point 'request' has paths to all locks that blocks 'request' whose owners
	 * are not same as the one that covers 'request' (covered_by above) and
	 * we can have locks whose owner is same as covered_by in the active list.
	 */

	if (request_blocked_by_active && found_covering_lock != 2) {
		SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
		ASSERT(lock != NULL);
			if (BLOCKS(lock, request)) {
				if (found_covering_lock &&
				    !SAME_OWNER(lock, covered_by)) {
					lock = lock->l_next;
				if ((error = flk_add_edge(request, lock,

			lock = lock->l_next;
		} while (lock->l_vnode == vp);

	if (NOT_BLOCKED(request)) {
		/*
		 * request not dependent on any other locks
		 * so execute this request
		 */
		return (flk_execute_request(request));
		/*
		 * check for deadlock
		 */
		if (flk_check_deadlock(request))
		/*
		 * this thread has to sleep
		 */
		return (flk_wait_execute_request(request));
/*
 * The actual execution of the request in the simple case is only to
 * insert the 'request' in the list of active locks if it is not an
 * UNLOCK.
 * We have to consider the existing active locks' relation to
 * this 'request' if they are owned by same process. flk_relation() does
 * this job and sees to that the dependency graph information is maintained
 * properly.
 */
int
flk_execute_request(lock_descriptor_t *request)
{
	graph_t *gp = request->l_graph;
	vnode_t *vp = request->l_vnode;
	lock_descriptor_t *lock, *lock1;
	int done_searching = 0;

	CHECK_SLEEPING_LOCKS(gp);
	CHECK_ACTIVE_LOCKS(gp);

	ASSERT(MUTEX_HELD(&gp->gp_mutex));

	flk_set_state(request, FLK_START_STATE);

	ASSERT(NOT_BLOCKED(request));

	/* IO_LOCK requests are only to check status */

	if (IS_IO_LOCK(request))

	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);

	if (lock == NULL && request->l_type == F_UNLCK)

		flk_insert_active_lock(request);

		lock1 = lock->l_next;
		if (SAME_OWNER(request, lock)) {
			done_searching = flk_relation(lock, request);
	} while (lock->l_vnode == vp && !done_searching);

	/*
	 * insert in active queue
	 */
	if (request->l_type != F_UNLCK)
		flk_insert_active_lock(request);
/*
 * 'request' is blocked by some one therefore we put it into sleep queue.
 */
static int
flk_wait_execute_request(lock_descriptor_t *request)
{
	graph_t *gp = request->l_graph;
	callb_cpr_t 	*cprp;		/* CPR info from callback */
	struct flock_globals *fg;

	ASSERT(MUTEX_HELD(&gp->gp_mutex));
	ASSERT(IS_WILLING_TO_SLEEP(request));

	flk_insert_sleeping_lock(request);

	if (IS_LOCKMGR(request)) {
		index = HASH_INDEX(request->l_vnode);
		fg = flk_get_globals();

		if (nlm_status_size == 0) {	/* not booted as a cluster */
			if (fg->lockmgr_status[index] != FLK_LOCKMGR_UP) {
				flk_cancel_sleeping_lock(request, 1);

		} else {			/* booted as a cluster */
			/*
			 * If the request is an NLM server lock request,
			 * and the NLM state of the lock request is not
			 * NLM_UP (because the NLM server is shutting
			 * down), then cancel the sleeping lock and
			 * return error ENOLCK that will encourage the
			 * client to retransmit.
			 */
			if (!IS_NLM_UP(request)) {
				flk_cancel_sleeping_lock(request, 1);

	/* Clustering: For blocking PXFS locks, return */
	if (IS_PXFS(request)) {
		/*
		 * PXFS locks sleep on the client side.
		 * The callback argument is used to wake up the sleeper
		 * when the lock is granted.
		 * We return -1 (rather than an errno value) to indicate
		 * the client side should sleep
		 */
		return (PXFS_LOCK_BLOCKED);

	if (request->l_callbacks != NULL) {
		/*
		 * To make sure the shutdown code works correctly, either
		 * the callback must happen after putting the lock on the
		 * sleep list, or we must check the shutdown status after
		 * returning from the callback (and before sleeping). At
		 * least for now, we'll use the first option. If a
		 * shutdown or signal or whatever happened while the graph
		 * mutex was dropped, that will be detected by
		 * wait_for_lock().
		 */
		mutex_exit(&gp->gp_mutex);

		cprp = flk_invoke_callbacks(request->l_callbacks,
		    FLK_BEFORE_SLEEP);

		mutex_enter(&gp->gp_mutex);

			wait_for_lock(request);
			mutex_enter(cprp->cc_lockp);
			CALLB_CPR_SAFE_BEGIN(cprp);
			mutex_exit(cprp->cc_lockp);
			wait_for_lock(request);
			mutex_enter(cprp->cc_lockp);
			CALLB_CPR_SAFE_END(cprp, cprp->cc_lockp);
			mutex_exit(cprp->cc_lockp);

		mutex_exit(&gp->gp_mutex);
		(void) flk_invoke_callbacks(request->l_callbacks,
		    FLK_AFTER_SLEEP);
		mutex_enter(&gp->gp_mutex);

		wait_for_lock(request);

	if (IS_LOCKMGR(request)) {
		/*
		 * If the lock manager is shutting down, return an
		 * error that will encourage the client to retransmit.
		 */
		if (fg->lockmgr_status[index] != FLK_LOCKMGR_UP &&
		    !IS_GRANTED(request)) {
			flk_cancel_sleeping_lock(request, 1);

	if (IS_INTERRUPTED(request)) {
		/* we got a signal, or act like we did */
		flk_cancel_sleeping_lock(request, 1);

	/* Cancelled if some other thread has closed the file */

	if (IS_CANCELLED(request)) {
		flk_cancel_sleeping_lock(request, 1);

	request->l_state &= ~GRANTED_LOCK;
	REMOVE_SLEEP_QUEUE(request);
	return (flk_execute_request(request));
/*
 * This routine adds an edge between from and to because from depends
 * to. If asked to check for deadlock it checks whether there are any
 * reachable locks from "from_lock" that is owned by the same process
 * as "from_lock".
 * NOTE: It is the caller's responsibility to make sure that the color
 * of the graph is consistent between the calls to flk_add_edge as done
 * in flk_process_request. This routine does not color and check for
 * deadlock explicitly.
 */
static int
flk_add_edge(lock_descriptor_t *from_lock, lock_descriptor_t *to_lock,
	int check_cycle, int update_graph)
{
	lock_descriptor_t *vertex;
	lock_descriptor_t *vertex_stack;

	STACK_INIT(vertex_stack);

	/*
	 * if to vertex already has mark_color just return
	 * don't add an edge as it is reachable from from vertex
	 */
	if (COLORED(to_lock))

	edge = flk_get_edge();

	/*
	 * set the from and to vertex
	 */
	edge->from_vertex = from_lock;
	edge->to_vertex = to_lock;

	/*
	 * put in adjacency list of from vertex
	 */
	from_lock->l_edge.edge_adj_next->edge_adj_prev = edge;
	edge->edge_adj_next = from_lock->l_edge.edge_adj_next;
	edge->edge_adj_prev = &from_lock->l_edge;
	from_lock->l_edge.edge_adj_next = edge;

	/*
	 * put in list of to vertex
	 */
	to_lock->l_edge.edge_in_next->edge_in_prev = edge;
	edge->edge_in_next = to_lock->l_edge.edge_in_next;
	to_lock->l_edge.edge_in_next = edge;
	edge->edge_in_prev = &to_lock->l_edge;

		flk_update_proc_graph(edge, 0);

	STACK_PUSH(vertex_stack, from_lock, l_stack);

	while ((vertex = STACK_TOP(vertex_stack)) != NULL) {

		STACK_POP(vertex_stack, l_stack);

		for (ep = FIRST_ADJ(vertex);
		    ep = NEXT_ADJ(ep)) {
			if (COLORED(ep->to_vertex))
			COLOR(ep->to_vertex);
			if (SAME_OWNER(ep->to_vertex, from_lock))
			STACK_PUSH(vertex_stack, ep->to_vertex, l_stack);

	ep = FIRST_ADJ(from_lock);
	while (ep != HEAD(from_lock)) {
		from_lock->l_sedge = NEXT_ADJ(ep);
		ADJ_LIST_REMOVE(ep);
		ep = from_lock->l_sedge;
/*
 * Get an edge structure for representing the dependency between two locks.
 */

	ASSERT(flk_edge_cache != NULL);

	ep = kmem_cache_alloc(flk_edge_cache, KM_SLEEP);
/*
 * Free the edge structure.
 */
static void
flk_free_edge(edge_t *ep)
{
	kmem_cache_free(flk_edge_cache, (void *)ep);
/*
 * Check the relationship of request with lock and perform the
 * recomputation of dependencies, break lock if required, and return
 * 1 if request cannot have any more relationship with the next
 * active locks.
 * The 'lock' and 'request' are compared and in case of overlap we
 * delete the 'lock' and form new locks to represent the non-overlapped
 * portion of original 'lock'. This function has side effects such as
 * 'lock' will be freed, new locks will be added to the active list.
 */
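/*
 * Illustrative sketch of the overlap handling described above, reduced to
 * plain ranges: when a new request overlaps an existing lock, the
 * non-overlapped remainder of the old lock survives as zero, one or two
 * pieces.  Names and types are hypothetical; the real code builds the
 * pieces in the topology[] array, and the disjoint cases are handled
 * separately before this point is reached.
 */
#if 0
struct ex_range {
	unsigned long long	start;
	unsigned long long	end;	/* inclusive, like l_start/l_end */
};

/*
 * Assumes old and req overlap; returns the number of surviving pieces
 * (0, 1 or 2) written to out[].
 */
static int
ex_split_remainder(struct ex_range old, struct ex_range req,
    struct ex_range out[2])
{
	int n = 0;

	if (old.start < req.start) {
		out[n].start = old.start;
		out[n].end = req.start - 1;	/* piece left of the request */
		n++;
	}
	if (old.end > req.end) {
		out[n].start = req.end + 1;	/* piece right of the request */
		out[n].end = old.end;
		n++;
	}
	return (n);
}
#endif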
static int
flk_relation(lock_descriptor_t *lock, lock_descriptor_t *request)
{
	lock_descriptor_t *lock1, *lock2;
	lock_descriptor_t *topology[3];
	graph_t *gp = (lock->l_graph);

	CHECK_SLEEPING_LOCKS(gp);
	CHECK_ACTIVE_LOCKS(gp);

	ASSERT(MUTEX_HELD(&gp->gp_mutex));

	topology[0] = topology[1] = topology[2] = NULL;

	if (request->l_type == F_UNLCK)
		lock_effect = FLK_UNLOCK;
	else if (request->l_type == F_RDLCK &&
	    lock->l_type == F_WRLCK)
		lock_effect = FLK_DOWNGRADE;
	else if (request->l_type == F_WRLCK &&
	    lock->l_type == F_RDLCK)
		lock_effect = FLK_UPGRADE;
		lock_effect = FLK_STAY_SAME;

	if (lock->l_end < request->l_start) {
		if (lock->l_end == request->l_start - 1 &&
		    lock_effect == FLK_STAY_SAME) {
			topology[0] = request;
			request->l_start = lock->l_start;

	if (lock->l_start > request->l_end) {
		if (request->l_end == lock->l_start - 1 &&
		    lock_effect == FLK_STAY_SAME) {
			topology[0] = request;
			request->l_end = lock->l_end;

	if (request->l_end < lock->l_end) {
		if (request->l_start > lock->l_start) {
			if (lock_effect == FLK_STAY_SAME) {
				request->l_start = lock->l_start;
				request->l_end = lock->l_end;
				topology[0] = request;
				lock1 = flk_get_lock();
				lock2 = flk_get_lock();
				lock1->l_start = lock->l_start;
				lock1->l_end = request->l_start - 1;
				lock2->l_start = request->l_end + 1;
				lock2->l_end = lock->l_end;
				topology[0] = lock1;
				topology[1] = lock2;
				topology[2] = request;
		} else if (request->l_start < lock->l_start) {
			if (lock_effect == FLK_STAY_SAME) {
				request->l_end = lock->l_end;
				topology[0] = request;
				lock1 = flk_get_lock();
				lock1->l_start = request->l_end + 1;
				topology[0] = lock1;
				topology[1] = request;
			if (lock_effect == FLK_STAY_SAME) {
				request->l_start = lock->l_start;
				request->l_end = lock->l_end;
				topology[0] = request;
				lock1 = flk_get_lock();
				lock1->l_start = request->l_end + 1;
				topology[0] = lock1;
				topology[1] = request;
	} else if (request->l_end > lock->l_end) {
		if (request->l_start > lock->l_start) {
			if (lock_effect == FLK_STAY_SAME) {
				request->l_start = lock->l_start;
				topology[0] = request;
				lock1 = flk_get_lock();
				lock1->l_end = request->l_start - 1;
				topology[0] = lock1;
				topology[1] = request;
		} else if (request->l_start < lock->l_start) {
			topology[0] = request;
			topology[0] = request;
		if (request->l_start > lock->l_start) {
			if (lock_effect == FLK_STAY_SAME) {
				request->l_start = lock->l_start;
				topology[0] = request;
				lock1 = flk_get_lock();
				lock1->l_end = request->l_start - 1;
				topology[0] = lock1;
				topology[1] = request;
		} else if (request->l_start < lock->l_start) {
			topology[0] = request;
			if (lock_effect != FLK_UNLOCK) {
				topology[0] = request;
				flk_delete_active_lock(lock, 0);
				flk_wakeup(lock, 1);
				flk_free_lock(lock);
				CHECK_SLEEPING_LOCKS(gp);
				CHECK_ACTIVE_LOCKS(gp);

	/*
	 * For unlock we don't send the 'request' to for recomputing
	 * dependencies because no lock will add an edge to this.
	 */
	if (lock_effect == FLK_UNLOCK) {
		topology[nvertex-1] = NULL;

	for (i = 0; i < nvertex; i++) {
		topology[i]->l_state |= RECOMPUTE_LOCK;
		topology[i]->l_color = NO_COLOR;

	ASSERT(FIRST_ADJ(lock) == HEAD(lock));

	/*
	 * we remove the adjacent edges for all vertices' to this vertex
	 */
	ep = FIRST_IN(lock);
	while (ep != HEAD(lock)) {
		ADJ_LIST_REMOVE(ep);

	flk_delete_active_lock(lock, 0);

	/* We are ready for recomputing the dependencies now */

	flk_recompute_dependencies(lock, topology, nvertex, 1);

	for (i = 0; i < nvertex; i++) {
		topology[i]->l_state &= ~RECOMPUTE_LOCK;
		topology[i]->l_color = NO_COLOR;

	if (lock_effect == FLK_UNLOCK) {

	for (i = 0; i < nvertex - 1; i++) {
		flk_insert_active_lock(topology[i]);

	if (lock_effect == FLK_DOWNGRADE || lock_effect == FLK_UNLOCK) {
		flk_wakeup(lock, 0);
		ep = FIRST_IN(lock);
		while (ep != HEAD(lock)) {
			lock->l_sedge = NEXT_IN(ep);
			flk_update_proc_graph(ep, 1);

	flk_free_lock(lock);

	CHECK_SLEEPING_LOCKS(gp);
	CHECK_ACTIVE_LOCKS(gp);
/*
 * Insert a lock into the active queue.
 */
static void
flk_insert_active_lock(lock_descriptor_t *new_lock)
{
	graph_t *gp = new_lock->l_graph;
	vnode_t *vp = new_lock->l_vnode;
	lock_descriptor_t *first_lock, *lock;

	ASSERT(MUTEX_HELD(&gp->gp_mutex));

	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);

	if (first_lock != NULL) {
		for (; (lock->l_vnode == vp &&
		    lock->l_start < new_lock->l_start); lock = lock->l_next)
		lock = ACTIVE_HEAD(gp);

	lock->l_prev->l_next = new_lock;
	new_lock->l_next = lock;
	new_lock->l_prev = lock->l_prev;
	lock->l_prev = new_lock;

	if (first_lock == NULL || (new_lock->l_start <= first_lock->l_start)) {
		vp->v_filocks = (struct filock *)new_lock;

	flk_set_state(new_lock, FLK_ACTIVE_STATE);
	new_lock->l_state |= ACTIVE_LOCK;

	CHECK_ACTIVE_LOCKS(gp);
	CHECK_SLEEPING_LOCKS(gp);
/*
 * Delete the active lock : Performs two functions depending on the
 * value of second parameter. One is to remove from the active lists
 * only and other is to both remove and free the lock.
 */
static void
flk_delete_active_lock(lock_descriptor_t *lock, int free_lock)
{
	vnode_t *vp = lock->l_vnode;
	graph_t *gp = lock->l_graph;

	ASSERT(MUTEX_HELD(&gp->gp_mutex));
		ASSERT(NO_DEPENDENTS(lock));
	ASSERT(NOT_BLOCKED(lock));
	ASSERT(IS_ACTIVE(lock));

	ASSERT((vp->v_filocks != NULL));

	if (vp->v_filocks == (struct filock *)lock) {
		vp->v_filocks = (struct filock *)
		    ((lock->l_next->l_vnode == vp) ? lock->l_next :

	lock->l_next->l_prev = lock->l_prev;
	lock->l_prev->l_next = lock->l_next;
	lock->l_next = lock->l_prev = NULL;
	flk_set_state(lock, FLK_DEAD_STATE);
	lock->l_state &= ~ACTIVE_LOCK;

		flk_free_lock(lock);
	CHECK_ACTIVE_LOCKS(gp);
	CHECK_SLEEPING_LOCKS(gp);
/*
 * Insert into the sleep queue.
 */
static void
flk_insert_sleeping_lock(lock_descriptor_t *request)
{
	graph_t *gp = request->l_graph;
	vnode_t *vp = request->l_vnode;
	lock_descriptor_t *lock;

	ASSERT(MUTEX_HELD(&gp->gp_mutex));
	ASSERT(IS_INITIAL(request));

	for (lock = gp->sleeping_locks.l_next; (lock != &gp->sleeping_locks &&
	    lock->l_vnode < vp); lock = lock->l_next)

	lock->l_prev->l_next = request;
	request->l_prev = lock->l_prev;
	lock->l_prev = request;
	request->l_next = lock;
	flk_set_state(request, FLK_SLEEPING_STATE);
	request->l_state |= SLEEPING_LOCK;
/*
 * Cancelling a sleeping lock implies removing a vertex from the
 * dependency graph and therefore we should recompute the dependencies
 * of all vertices that have a path to this vertex, w.r.t. all
 * vertices reachable from this vertex.
 */
void
flk_cancel_sleeping_lock(lock_descriptor_t *request, int remove_from_queue)
{
	graph_t *gp = request->l_graph;
	vnode_t *vp = request->l_vnode;
	lock_descriptor_t **topology = NULL;
	lock_descriptor_t *vertex, *lock;
	lock_descriptor_t *vertex_stack;

	STACK_INIT(vertex_stack);

	ASSERT(MUTEX_HELD(&gp->gp_mutex));
	/*
	 * count number of vertex pointers that has to be allocated
	 * All vertices that are reachable from request.
	 */
	STACK_PUSH(vertex_stack, request, l_stack);

	while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
		STACK_POP(vertex_stack, l_stack);
		for (ep = FIRST_ADJ(vertex); ep != HEAD(vertex);
		    ep = NEXT_ADJ(ep)) {
			if (IS_RECOMPUTE(ep->to_vertex))
			ep->to_vertex->l_state |= RECOMPUTE_LOCK;
			STACK_PUSH(vertex_stack, ep->to_vertex, l_stack);

	/*
	 * allocate memory for holding the vertex pointers
	 */
		topology = kmem_zalloc(nvertex * sizeof (lock_descriptor_t *),
		    KM_SLEEP);

	/*
	 * one more pass to actually store the vertices in the
	 * topology array.
	 * We first check sleeping locks and then active locks
	 * so that topology array will be in a topological
	 * order.
	 */
	SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);

			if (IS_RECOMPUTE(lock)) {
				lock->l_index = nvertex;
				topology[nvertex++] = lock;
			lock->l_color = NO_COLOR;
			lock = lock->l_next;
		} while (lock->l_vnode == vp);

	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);

			if (IS_RECOMPUTE(lock)) {
				lock->l_index = nvertex;
				topology[nvertex++] = lock;
			lock->l_color = NO_COLOR;
			lock = lock->l_next;
		} while (lock->l_vnode == vp);

	/*
	 * remove in and out edges of request
	 * They are freed after updating proc_graph below.
	 */
	for (ep = FIRST_IN(request); ep != HEAD(request); ep = NEXT_IN(ep)) {
		ADJ_LIST_REMOVE(ep);

	if (remove_from_queue)
		REMOVE_SLEEP_QUEUE(request);

	/* we are ready to recompute */

	flk_recompute_dependencies(request, topology, nvertex, 1);

	ep = FIRST_ADJ(request);
	while (ep != HEAD(request)) {
		request->l_sedge = NEXT_ADJ(ep);
		ADJ_LIST_REMOVE(ep);
		flk_update_proc_graph(ep, 1);
		ep = request->l_sedge;

	/*
	 * unset the RECOMPUTE flag in those vertices
	 */
	for (i = 0; i < nvertex; i++) {
		topology[i]->l_state &= ~RECOMPUTE_LOCK;

		kmem_free((void *)topology,
		    (nvertex * sizeof (lock_descriptor_t *)));

	/*
	 * Possibility of some locks unblocked now
	 */
	flk_wakeup(request, 0);

	/*
	 * we expect to have a correctly recomputed graph now.
	 */
	flk_set_state(request, FLK_DEAD_STATE);
	flk_free_lock(request);
	CHECK_SLEEPING_LOCKS(gp);
	CHECK_ACTIVE_LOCKS(gp);
/*
 * Uncoloring the graph is simply to increment the mark value of the graph
 * And only when wrap round takes place will we color all vertices in
 * the graph explicitly.
 */
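/*
 * Illustrative sketch of the O(1) "uncolor by bumping the mark" trick
 * described above: a vertex is colored iff its color equals the graph's
 * current mark, so incrementing the mark uncolors everything at once and
 * only a wraparound forces an explicit reset.  Names are hypothetical.
 */
#if 0
#include <limits.h>

struct ex_vertex {
	struct ex_vertex	*next;
	unsigned int		color;
};

struct ex_graph {
	struct ex_vertex	*all;
	unsigned int		mark;
};

#define	EX_COLORED(g, v)	((v)->color == (g)->mark)
#define	EX_COLOR(g, v)		((v)->color = (g)->mark)

static void
ex_graph_uncolor(struct ex_graph *g)
{
	struct ex_vertex *v;

	if (g->mark == UINT_MAX) {
		g->mark = 1;
		for (v = g->all; v != NULL; v = v->next)
			v->color = 0;
	} else {
		g->mark++;
	}
}
#endif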
static void
flk_graph_uncolor(graph_t *gp)
{
	lock_descriptor_t *lock;

	if (gp->mark == UINT_MAX) {
		for (lock = ACTIVE_HEAD(gp)->l_next; lock != ACTIVE_HEAD(gp);
		    lock = lock->l_next)

		for (lock = SLEEPING_HEAD(gp)->l_next; lock != SLEEPING_HEAD(gp);
		    lock = lock->l_next)
/*
 * Wake up locks that are blocked on the given lock.
 */
static void
flk_wakeup(lock_descriptor_t *lock, int adj_list_remove)
{
	graph_t *gp = lock->l_graph;
	lock_descriptor_t *lck;

	ASSERT(MUTEX_HELD(&gp->gp_mutex));
	if (NO_DEPENDENTS(lock))
	ep = FIRST_IN(lock);
		/*
		 * delete the edge from the adjacency list
		 * of from vertex. if no more adjacent edges
		 * for this vertex wake this process.
		 */
		lck = ep->from_vertex;
		if (adj_list_remove)
			ADJ_LIST_REMOVE(ep);
		flk_update_proc_graph(ep, 1);
		if (NOT_BLOCKED(lck)) {

		lock->l_sedge = NEXT_IN(ep);

	} while (ep != HEAD(lock));
	ASSERT(NO_DEPENDENTS(lock));
/*
 * The dependents of request, is checked for its dependency against the
 * locks in topology (called topology because the array is and should be in
 * topological order for this algorithm, if not in topological order the
 * inner loop below might add more edges than necessary. Topological ordering
 * of vertices satisfies the property that all edges will be from left to
 * right i.e., topology[i] can have an edge to topology[j], iff i<j)
 * If lock l1 in the dependent set of request is dependent (blocked by)
 * on lock l2 in topology but does not have a path to it, we add an edge
 * in the inner loop below.
 *
 * We don't want to add an edge between l1 and l2 if there exists
 * already a path from l1 to l2, so care has to be taken for those vertices
 * that have two paths to 'request'. These vertices are referred to here
 * as barrier locks.
 *
 * The barriers has to be found (those vertex that originally had two paths
 * to request) because otherwise we may end up adding edges unnecessarily
 * to vertices in topology, and thus barrier vertices can have an edge
 * to a vertex in topology as well a path to it.
 */
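/*
 * Illustrative sketch of the "barrier" notion above: a barrier is a vertex
 * with more than one path from the recomputed vertex, and its barrier
 * count is (paths - 1).  On a DAG the path counts can be accumulated by
 * walking the vertices in topological order and pushing each vertex's own
 * count to its successors.  Names are hypothetical and the paths fields
 * are assumed to start at zero.
 */
#if 0
struct ex_vtx {
	struct ex_vtx	*adj[8];	/* successors (bounded for brevity) */
	int		nadj;
	int		paths;		/* number of paths from the source */
};

/* 'order' must list the vertices in topological order, source first. */
static void
ex_count_paths(struct ex_vtx **order, int n)
{
	int i, j;

	order[0]->paths = 1;
	for (i = 0; i < n; i++) {
		for (j = 0; j < order[i]->nadj; j++)
			order[i]->adj[j]->paths += order[i]->paths;
	}
	/* a vertex v with v->paths > 1 is a barrier; count = v->paths - 1 */
}
#endif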
static void
flk_recompute_dependencies(lock_descriptor_t *request,
	lock_descriptor_t **topology, int nvertex, int update_graph)
{
	lock_descriptor_t *vertex, *lock;
	graph_t *gp = request->l_graph;
	int barrier_found = 0;
	lock_descriptor_t *vertex_stack;

	STACK_INIT(vertex_stack);

	ASSERT(MUTEX_HELD(&gp->gp_mutex));

	flk_graph_uncolor(request->l_graph);
	barrier_found = flk_find_barriers(request);
	request->l_state |= RECOMPUTE_DONE;

	STACK_PUSH(vertex_stack, request, l_stack);
	request->l_sedge = FIRST_IN(request);

	while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
		if (vertex->l_state & RECOMPUTE_DONE) {

		if (IS_BARRIER(vertex)) {
			/* decrement the barrier count */
			if (vertex->l_index) {
				/* this guy will be pushed again anyway ? */
				STACK_POP(vertex_stack, l_stack);
				if (vertex->l_index == 0) {
					/*
					 * barrier is over we can recompute
					 * dependencies for this lock in the
					 * next stack pop
					 */
					vertex->l_state &= ~BARRIER_LOCK;

		vertex->l_state |= RECOMPUTE_DONE;
		flk_graph_uncolor(gp);
		count = flk_color_reachables(vertex);
		for (i = 0; i < nvertex; i++) {

			if (BLOCKS(lock, vertex)) {
				(void) flk_add_edge(vertex, lock,
				    NO_CHECK_CYCLE, update_graph);
				count += flk_color_reachables(lock);

		if (count == nvertex ||
		    vertex->l_sedge == HEAD(vertex)) {
			/* prune the tree below this */
			STACK_POP(vertex_stack, l_stack);
			vertex->l_state &= ~RECOMPUTE_DONE;
			/* update the barrier locks below this! */
			if (vertex->l_sedge != HEAD(vertex) && barrier_found) {
				flk_graph_uncolor(gp);
				flk_update_barriers(vertex);

		ep = vertex->l_sedge;
		lock = ep->from_vertex;
		STACK_PUSH(vertex_stack, lock, l_stack);
		lock->l_sedge = FIRST_IN(lock);
		vertex->l_sedge = NEXT_IN(ep);
/*
 * Color all vertices reachable from 'vertex' that belong to the topology
 * (here, those that have RECOMPUTE_LOCK set in their state) and are not yet
 * colored.
 *
 * Note: we need to use a different stack_link, l_stack1, because this is
 * called from flk_recompute_dependencies(), which already uses a stack with
 * l_stack as the stack_link.
 */
static int
flk_color_reachables(lock_descriptor_t *vertex)
{
	lock_descriptor_t *ver, *lock;
	int count;
	edge_t	*ep;
	lock_descriptor_t *vertex_stack;

	STACK_INIT(vertex_stack);

	STACK_PUSH(vertex_stack, vertex, l_stack1);
	count = 0;
	while ((ver = STACK_TOP(vertex_stack)) != NULL) {

		STACK_POP(vertex_stack, l_stack1);
		for (ep = FIRST_ADJ(ver); ep != HEAD(ver);
		    ep = NEXT_ADJ(ep)) {
			lock = ep->to_vertex;
			if (COLORED(lock))
				continue;
			COLOR(lock);
			if (IS_RECOMPUTE(lock))
				count++;
			STACK_PUSH(vertex_stack, lock, l_stack1);
		}
	}
	return (count);
}
/*
 * Called from flk_recompute_dependencies(), this routine decrements
 * the barrier count of barrier vertices that are reachable from lock.
 */
static void
flk_update_barriers(lock_descriptor_t *lock)
{
	lock_descriptor_t *vertex, *lck;
	edge_t	*ep;
	lock_descriptor_t *vertex_stack;

	STACK_INIT(vertex_stack);

	STACK_PUSH(vertex_stack, lock, l_stack1);

	while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
		STACK_POP(vertex_stack, l_stack1);
		for (ep = FIRST_IN(vertex); ep != HEAD(vertex);
		    ep = NEXT_IN(ep)) {
			lck = ep->from_vertex;
			if (COLORED(lck)) {
				if (IS_BARRIER(lck)) {
					ASSERT(lck->l_index > 0);
					lck->l_index--;
					if (lck->l_index == 0)
						lck->l_state &= ~BARRIER_LOCK;
				}
				continue;
			}
			COLOR(lck);
			if (IS_BARRIER(lck)) {
				ASSERT(lck->l_index > 0);
				lck->l_index--;
				if (lck->l_index == 0)
					lck->l_state &= ~BARRIER_LOCK;
			}
			STACK_PUSH(vertex_stack, lck, l_stack1);
		}
	}
}
/*
 * Finds all vertices that are reachable from 'lock' more than once and
 * marks them as barrier vertices, incrementing their barrier count.
 * The barrier count is one less than the total number of paths from lock
 * to that vertex.
 */
static int
flk_find_barriers(lock_descriptor_t *lock)
{
	lock_descriptor_t *vertex, *lck;
	int found = 0;
	edge_t	*ep;
	lock_descriptor_t *vertex_stack;

	STACK_INIT(vertex_stack);

	STACK_PUSH(vertex_stack, lock, l_stack1);

	while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
		STACK_POP(vertex_stack, l_stack1);
		for (ep = FIRST_IN(vertex); ep != HEAD(vertex);
		    ep = NEXT_IN(ep)) {
			lck = ep->from_vertex;
			if (COLORED(lck)) {
				/* this is a barrier */
				lck->l_state |= BARRIER_LOCK;
				/* index will have barrier count */
				lck->l_index++;
				if (!found)
					found = 1;
				continue;
			}
			COLOR(lck);
			lck->l_index = 0;
			STACK_PUSH(vertex_stack, lck, l_stack1);
		}
	}
	return (found);
}
/*
 * Finds the first lock that is mainly responsible for blocking this
 * request. If there is no such lock, request->l_flock.l_type is set to
 * F_UNLCK. Otherwise, request->l_flock is filled in with the particulars
 * of the blocking lock.
 *
 * Note: It is possible a request is blocked by a sleeping lock because
 * of the fairness policy used in flk_process_request() to construct the
 * dependencies. (see comments before flk_process_request()).
 */
static void
flk_get_first_blocking_lock(lock_descriptor_t *request)
{
	graph_t	*gp = request->l_graph;
	vnode_t *vp = request->l_vnode;
	lock_descriptor_t *lock, *blocker;

	ASSERT(MUTEX_HELD(&gp->gp_mutex));
	blocker = NULL;
	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);

	if (lock) {
		do {
			if (BLOCKS(lock, request)) {
				blocker = lock;
				break;
			}
			lock = lock->l_next;
		} while (lock->l_vnode == vp);
	}

	if (blocker == NULL && request->l_flock.l_type == F_RDLCK) {
		/*
		 * No active lock is blocking this request, but if a read
		 * lock is requested, it may also get blocked by a waiting
		 * writer. So search all sleeping locks and see if there is
		 * a writer waiting.
		 */
		SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
		if (lock) {
			do {
				if (BLOCKS(lock, request)) {
					blocker = lock;
					break;
				}
				lock = lock->l_next;
			} while (lock->l_vnode == vp);
		}
	}

	if (blocker) {
		report_blocker(blocker, request);
	} else {
		request->l_flock.l_type = F_UNLCK;
	}
}
/*
 * Get the graph_t structure associated with a vnode.
 * If 'initialize' is non-zero, and the graph_t structure for this vnode has
 * not yet been initialized, then a new element is allocated and returned.
 */
graph_t *
flk_get_lock_graph(vnode_t *vp, int initialize)
{
	graph_t *gp;
	graph_t *gp_alloc = NULL;
	int index = HASH_INDEX(vp);

	if (initialize == FLK_USE_GRAPH) {
		mutex_enter(&flock_lock);
		gp = lock_graph[index];
		mutex_exit(&flock_lock);
		return (gp);
	}

	ASSERT(initialize == FLK_INIT_GRAPH);

	if (lock_graph[index] == NULL) {

		gp_alloc = kmem_zalloc(sizeof (graph_t), KM_SLEEP);

		/* Initialize the graph */

		gp_alloc->active_locks.l_next =
		    gp_alloc->active_locks.l_prev =
		    (lock_descriptor_t *)ACTIVE_HEAD(gp_alloc);
		gp_alloc->sleeping_locks.l_next =
		    gp_alloc->sleeping_locks.l_prev =
		    (lock_descriptor_t *)SLEEPING_HEAD(gp_alloc);
		gp_alloc->index = index;
		mutex_init(&gp_alloc->gp_mutex, NULL, MUTEX_DEFAULT, NULL);
	}

	mutex_enter(&flock_lock);

	gp = lock_graph[index];

	/* Recheck the value within flock_lock */
	if (gp == NULL) {
		struct flock_globals *fg;

		/* We must have previously allocated the graph_t structure */
		ASSERT(gp_alloc != NULL);
		lock_graph[index] = gp = gp_alloc;
		/*
		 * The lockmgr status is only needed if KLM is loaded.
		 */
		if (flock_zone_key != ZONE_KEY_UNINITIALIZED) {
			fg = flk_get_globals();
			fg->lockmgr_status[index] = fg->flk_lockmgr_status;
		}
	}

	mutex_exit(&flock_lock);

	if ((gp_alloc != NULL) && (gp != gp_alloc)) {
		/* There was a race to allocate the graph_t and we lost */
		mutex_destroy(&gp_alloc->gp_mutex);
		kmem_free(gp_alloc, sizeof (graph_t));
	}

	return (gp);
}
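/*
 * Usage sketch (illustrative only): a caller that merely wants to inspect
 * existing locks passes FLK_USE_GRAPH and must tolerate a NULL return, e.g.
 *
 *	gp = flk_get_lock_graph(vp, FLK_USE_GRAPH);
 *	if (gp == NULL)
 *		return (0);	(no locks can exist for this vnode yet)
 *
 * whereas a path that is about to insert a lock passes FLK_INIT_GRAPH so the
 * graph_t is allocated on demand; the mutex_destroy()/kmem_free() above
 * handles the case where two threads raced to initialize the same bucket.
 */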
/*
 * PSARC case 1997/292
 */
int
cl_flk_has_remote_locks_for_nlmid(vnode_t *vp, int nlmid)
{
	lock_descriptor_t *lock;
	int result = 0;
	graph_t *gp;
	int	lock_nlmid;

	/*
	 * Check to see if node is booted as a cluster. If not, return.
	 */
	if ((cluster_bootflags & CLUSTER_BOOTED) == 0) {
		return (0);
	}

	gp = flk_get_lock_graph(vp, FLK_USE_GRAPH);
	if (gp == NULL) {
		return (0);
	}

	mutex_enter(&gp->gp_mutex);

	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);

	if (lock) {
		while (lock->l_vnode == vp) {
			/* get NLM id from sysid */
			lock_nlmid = GETNLMID(lock->l_flock.l_sysid);

			/*
			 * If NLM server request _and_ nlmid of lock matches
			 * nlmid of argument, then we've found a remote lock.
			 */
			if (IS_LOCKMGR(lock) && nlmid == lock_nlmid) {
				result = 1;
				goto done;
			}
			lock = lock->l_next;
		}
	}

	SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);

	if (lock) {
		while (lock->l_vnode == vp) {
			/* get NLM id from sysid */
			lock_nlmid = GETNLMID(lock->l_flock.l_sysid);

			/*
			 * If NLM server request _and_ nlmid of lock matches
			 * nlmid of argument, then we've found a remote lock.
			 */
			if (IS_LOCKMGR(lock) && nlmid == lock_nlmid) {
				result = 1;
				goto done;
			}
			lock = lock->l_next;
		}
	}

done:
	mutex_exit(&gp->gp_mutex);
	return (result);
}
/*
 * Determine whether there are any locks for the given vnode with a remote
 * sysid. Returns zero if not, non-zero if there are.
 *
 * Note that the return value from this function is potentially invalid
 * once it has been returned. The caller is responsible for providing its
 * own synchronization mechanism to ensure that the return value is useful
 * (e.g., see nfs_lockcompletion()).
 */
int
flk_has_remote_locks(vnode_t *vp)
{
	lock_descriptor_t *lock;
	int result = 0;
	graph_t *gp;

	gp = flk_get_lock_graph(vp, FLK_USE_GRAPH);
	if (gp == NULL) {
		return (0);
	}

	mutex_enter(&gp->gp_mutex);

	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);

	if (lock) {
		while (lock->l_vnode == vp) {
			if (IS_REMOTE(lock)) {
				result = 1;
				goto done;
			}
			lock = lock->l_next;
		}
	}

	SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);

	if (lock) {
		while (lock->l_vnode == vp) {
			if (IS_REMOTE(lock)) {
				result = 1;
				goto done;
			}
			lock = lock->l_next;
		}
	}

done:
	mutex_exit(&gp->gp_mutex);
	return (result);
}
/*
 * Determine whether there are any locks for the given vnode with a remote
 * sysid matching given sysid.
 * Used by the new (open source) NFS Lock Manager (NLM)
 */
int
flk_has_remote_locks_for_sysid(vnode_t *vp, int sysid)
{
	lock_descriptor_t *lock;
	int result = 0;
	graph_t *gp;

	if (sysid == 0)
		return (0);

	gp = flk_get_lock_graph(vp, FLK_USE_GRAPH);
	if (gp == NULL) {
		return (0);
	}

	mutex_enter(&gp->gp_mutex);

	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);

	if (lock) {
		while (lock->l_vnode == vp) {
			if (lock->l_flock.l_sysid == sysid) {
				result = 1;
				goto done;
			}
			lock = lock->l_next;
		}
	}

	SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);

	if (lock) {
		while (lock->l_vnode == vp) {
			if (lock->l_flock.l_sysid == sysid) {
				result = 1;
				goto done;
			}
			lock = lock->l_next;
		}
	}

done:
	mutex_exit(&gp->gp_mutex);
	return (result);
}
/*
 * Determine if there are any locks owned by the given sysid.
 * Returns zero if not, non-zero if there are. Note that this return code
 * could be derived from flk_get_{sleeping,active}_locks, but this routine
 * avoids all the memory allocations of those routines.
 *
 * This routine has the same synchronization issues as
 * flk_has_remote_locks.
 */
int
flk_sysid_has_locks(int sysid, int lck_type)
{
	int		has_locks = 0;
	lock_descriptor_t	*lock;
	graph_t		*gp;
	int		i;

	for (i = 0; i < HASH_SIZE && !has_locks; i++) {
		mutex_enter(&flock_lock);
		gp = lock_graph[i];
		mutex_exit(&flock_lock);
		if (gp == NULL) {
			continue;
		}

		mutex_enter(&gp->gp_mutex);

		if (lck_type & FLK_QUERY_ACTIVE) {
			for (lock = ACTIVE_HEAD(gp)->l_next;
			    lock != ACTIVE_HEAD(gp) && !has_locks;
			    lock = lock->l_next) {
				if (lock->l_flock.l_sysid == sysid)
					has_locks = 1;
			}
		}

		if (lck_type & FLK_QUERY_SLEEPING) {
			for (lock = SLEEPING_HEAD(gp)->l_next;
			    lock != SLEEPING_HEAD(gp) && !has_locks;
			    lock = lock->l_next) {
				if (lock->l_flock.l_sysid == sysid)
					has_locks = 1;
			}
		}
		mutex_exit(&gp->gp_mutex);
	}

	return (has_locks);
}
/*
 * PSARC case 1997/292
 *
 * Requires: "sysid" is a pair [nlmid, sysid].  The lower half is a 16-bit
 *  quantity, the real sysid generated by the NLM server; the upper half
 *  identifies the node of the cluster where the NLM server ran.
 *  This routine is only called by an NLM server running in a cluster.
 * Effects: Remove all locks held on behalf of the client identified
 *  by "sysid."
 */
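/*
 * Worked example (illustrative, assuming the 16-bit split described above):
 * a packed value of 0x0002001f would decode as nlmid 2 (the upper half,
 * which is what GETNLMID() extracts elsewhere in this file) and an
 * NLM-generated sysid of 0x001f in the lower half.
 */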
void
cl_flk_remove_locks_by_sysid(int sysid)
{
	graph_t	*gp;
	int i;
	lock_descriptor_t *lock, *nlock;

	/*
	 * Check to see if node is booted as a cluster. If not, return.
	 */
	if ((cluster_bootflags & CLUSTER_BOOTED) == 0) {
		return;
	}

	for (i = 0; i < HASH_SIZE; i++) {
		mutex_enter(&flock_lock);
		gp = lock_graph[i];
		mutex_exit(&flock_lock);

		if (gp == NULL)
			continue;

		mutex_enter(&gp->gp_mutex);	/* get mutex on lock graph */

		/* signal sleeping requests so that they bail out */
		lock = SLEEPING_HEAD(gp)->l_next;
		while (lock != SLEEPING_HEAD(gp)) {
			nlock = lock->l_next;
			if (lock->l_flock.l_sysid == sysid) {
				INTERRUPT_WAKEUP(lock);
			}
			lock = nlock;
		}

		/* delete active locks */
		lock = ACTIVE_HEAD(gp)->l_next;
		while (lock != ACTIVE_HEAD(gp)) {
			nlock = lock->l_next;
			if (lock->l_flock.l_sysid == sysid) {
				flk_delete_active_lock(lock, 0);
				flk_wakeup(lock, 1);
				flk_free_lock(lock);
			}
			lock = nlock;
		}
		mutex_exit(&gp->gp_mutex);	/* release mutex on lock graph */
	}
}
/*
 * Delete all locks in the system that belong to the sysid of the request.
 */
static void
flk_delete_locks_by_sysid(lock_descriptor_t *request)
{
	int	sysid = request->l_flock.l_sysid;
	lock_descriptor_t *lock, *nlock;
	graph_t	*gp;
	int i;

	ASSERT(MUTEX_HELD(&request->l_graph->gp_mutex));
	ASSERT(sysid != 0);

	mutex_exit(&request->l_graph->gp_mutex);

	for (i = 0; i < HASH_SIZE; i++) {
		mutex_enter(&flock_lock);
		gp = lock_graph[i];
		mutex_exit(&flock_lock);

		if (gp == NULL)
			continue;

		mutex_enter(&gp->gp_mutex);

		/* signal sleeping requests so that they bail out */
		lock = SLEEPING_HEAD(gp)->l_next;
		while (lock != SLEEPING_HEAD(gp)) {
			nlock = lock->l_next;
			if (lock->l_flock.l_sysid == sysid) {
				INTERRUPT_WAKEUP(lock);
			}
			lock = nlock;
		}

		/* delete active locks */
		lock = ACTIVE_HEAD(gp)->l_next;
		while (lock != ACTIVE_HEAD(gp)) {
			nlock = lock->l_next;
			if (lock->l_flock.l_sysid == sysid) {
				flk_delete_active_lock(lock, 0);
				flk_wakeup(lock, 1);
				flk_free_lock(lock);
			}
			lock = nlock;
		}
		mutex_exit(&gp->gp_mutex);
	}

	mutex_enter(&request->l_graph->gp_mutex);
}
/*
 * Clustering: Deletes PXFS locks
 * Effects: Delete all locks on files in the given file system and with the
 *  given PXFS id.
 */
void
cl_flk_delete_pxfs_locks(struct vfs *vfsp, int pxfsid)
{
	lock_descriptor_t *lock, *nlock;
	graph_t	*gp;
	int i;

	for (i = 0; i < HASH_SIZE; i++) {
		mutex_enter(&flock_lock);
		gp = lock_graph[i];
		mutex_exit(&flock_lock);

		if (gp == NULL)
			continue;

		mutex_enter(&gp->gp_mutex);

		/* signal sleeping requests so that they bail out */
		lock = SLEEPING_HEAD(gp)->l_next;
		while (lock != SLEEPING_HEAD(gp)) {
			nlock = lock->l_next;
			if (lock->l_vnode->v_vfsp == vfsp) {
				ASSERT(IS_PXFS(lock));
				if (GETPXFSID(lock->l_flock.l_sysid) ==
				    pxfsid) {
					flk_set_state(lock,
					    FLK_CANCELLED_STATE);
					flk_cancel_sleeping_lock(lock, 1);
				}
			}
			lock = nlock;
		}

		/* delete active locks */
		lock = ACTIVE_HEAD(gp)->l_next;
		while (lock != ACTIVE_HEAD(gp)) {
			nlock = lock->l_next;
			if (lock->l_vnode->v_vfsp == vfsp) {
				ASSERT(IS_PXFS(lock));
				if (GETPXFSID(lock->l_flock.l_sysid) ==
				    pxfsid) {
					flk_delete_active_lock(lock, 0);
					flk_wakeup(lock, 1);
					flk_free_lock(lock);
				}
			}
			lock = nlock;
		}
		mutex_exit(&gp->gp_mutex);
	}
}
/*
 * Search for a sleeping lock manager lock which matches exactly this lock
 * request; if one is found, fake a signal to cancel it.
 *
 * Return 1 if a matching lock was found, 0 otherwise.
 */
static int
flk_canceled(lock_descriptor_t *request)
{
	lock_descriptor_t *lock, *nlock;
	graph_t *gp = request->l_graph;
	vnode_t *vp = request->l_vnode;

	ASSERT(MUTEX_HELD(&gp->gp_mutex));
	ASSERT(IS_LOCKMGR(request));
	SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);

	if (lock) {
		while (lock->l_vnode == vp) {
			nlock = lock->l_next;
			if (SAME_OWNER(lock, request) &&
			    lock->l_start == request->l_start &&
			    lock->l_end == request->l_end) {
				INTERRUPT_WAKEUP(lock);
				return (1);
			}
			lock = nlock;
		}
	}
	return (0);
}
/*
 * Remove all non-OFD locks for the vnode belonging to the given pid and sysid.
 * That is, since OFD locks are pid-less we'll never match on the incoming
 * pid. OFD locks are removed earlier in the close() path via closef() and
 * ofdcleanlock().
 */
void
cleanlocks(vnode_t *vp, pid_t pid, int sysid)
{
	graph_t	*gp;
	lock_descriptor_t *lock, *nlock;
	lock_descriptor_t *link_stack;

	STACK_INIT(link_stack);

	gp = flk_get_lock_graph(vp, FLK_USE_GRAPH);

	if (gp == NULL)
		return;
	mutex_enter(&gp->gp_mutex);

	CHECK_SLEEPING_LOCKS(gp);
	CHECK_ACTIVE_LOCKS(gp);

	SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);

	if (lock) {
		do {
			nlock = lock->l_next;
			if ((lock->l_flock.l_pid == pid ||
			    pid == IGN_PID) &&
			    lock->l_flock.l_sysid == sysid) {
				CANCEL_WAKEUP(lock);
			}
			lock = nlock;
		} while (lock->l_vnode == vp);
	}

	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);

	if (lock) {
		do {
			nlock = lock->l_next;
			if ((lock->l_flock.l_pid == pid ||
			    pid == IGN_PID) &&
			    lock->l_flock.l_sysid == sysid) {
				flk_delete_active_lock(lock, 0);
				STACK_PUSH(link_stack, lock, l_stack);
			}
			lock = nlock;
		} while (lock->l_vnode == vp);
	}

	while ((lock = STACK_TOP(link_stack)) != NULL) {
		STACK_POP(link_stack, l_stack);
		flk_wakeup(lock, 1);
		flk_free_lock(lock);
	}

	CHECK_SLEEPING_LOCKS(gp);
	CHECK_ACTIVE_LOCKS(gp);
	CHECK_OWNER_LOCKS(gp, pid, sysid, vp);
	mutex_exit(&gp->gp_mutex);
}
/*
 * Called from 'fs' read and write routines for files that have mandatory
 * locking enabled.
 */
int
chklock(struct vnode *vp, int iomode, u_offset_t offset, ssize_t len, int fmode,
    caller_context_t *ct)
{
	int		i;
	struct flock64	bf;
	int		error = 0;

	bf.l_type = (iomode & FWRITE) ? F_WRLCK : F_RDLCK;
	bf.l_whence = 0;
	bf.l_start = offset;
	bf.l_len = len;
	if (ct == NULL) {
		bf.l_pid = curproc->p_pid;
		bf.l_sysid = 0;
	} else {
		bf.l_pid = ct->cc_pid;
		bf.l_sysid = ct->cc_sysid;
	}
	i = (fmode & (FNDELAY|FNONBLOCK)) ? INOFLCK : INOFLCK|SLPFLCK;
	if ((i = reclock(vp, &bf, i, 0, offset, NULL)) != 0 ||
	    bf.l_type != F_UNLCK)
		error = i ? i : EAGAIN;
	return (error);
}
/*
 * convoff - converts the given data (start, whence) to the
 * given whence.
 */
int
convoff(struct vnode *vp, struct flock64 *lckdat, int whence, offset_t offset)
{
	int		error;
	struct vattr	vattr;

	if ((lckdat->l_whence == 2) || (whence == 2)) {
		vattr.va_mask = AT_SIZE;
		if (error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL))
			return (error);
	}

	switch (lckdat->l_whence) {
	case 1:
		lckdat->l_start += offset;
		break;
	case 2:
		lckdat->l_start += vattr.va_size;
		break;
	case 0:
		break;
	default:
		return (EINVAL);
	}

	if (lckdat->l_start < 0)
		return (EINVAL);

	switch (whence) {
	case 1:
		lckdat->l_start -= offset;
		break;
	case 2:
		lckdat->l_start -= vattr.va_size;
		break;
	case 0:
		break;
	default:
		return (EINVAL);
	}

	lckdat->l_whence = (short)whence;
	return (0);
}
/* proc_graph function definitions */

/*
 * This function checks for a deadlock caused by the new 'lock'. If a
 * deadlock is found, the edges of this lock are freed and non-zero is
 * returned.
 */
static int
flk_check_deadlock(lock_descriptor_t *lock)
{
	proc_vertex_t	*start_vertex, *pvertex;
	proc_vertex_t *dvertex;
	proc_edge_t *pep, *ppep;
	edge_t	*ep, *nep;
	proc_vertex_t *process_stack;

	/*
	 * OFD style locks are not associated with any process so there is
	 * no proc graph for these. Thus we cannot, and do not, do deadlock
	 * detection.
	 */
	if (lock->l_ofd != NULL)
		return (0);

	STACK_INIT(process_stack);

	mutex_enter(&flock_lock);
	start_vertex = flk_get_proc_vertex(lock);
	ASSERT(start_vertex != NULL);

	/* construct the edges from this process to other processes */

	ep = FIRST_ADJ(lock);
	while (ep != HEAD(lock)) {
		proc_vertex_t *adj_proc;

		adj_proc = flk_get_proc_vertex(ep->to_vertex);
		for (pep = start_vertex->edge; pep != NULL; pep = pep->next) {
			if (pep->to_proc == adj_proc) {
				ASSERT(pep->refcount);
				pep->refcount++;
				break;
			}
		}
		if (pep == NULL) {
			pep = flk_get_proc_edge();
			pep->to_proc = adj_proc;
			pep->refcount = 1;
			adj_proc->incount++;
			pep->next = start_vertex->edge;
			start_vertex->edge = pep;
		}
		ep = NEXT_ADJ(ep);
	}

	ep = FIRST_IN(lock);

	while (ep != HEAD(lock)) {
		proc_vertex_t *in_proc;

		in_proc = flk_get_proc_vertex(ep->from_vertex);

		for (pep = in_proc->edge; pep != NULL; pep = pep->next) {
			if (pep->to_proc == start_vertex) {
				ASSERT(pep->refcount);
				pep->refcount++;
				break;
			}
		}
		if (pep == NULL) {
			pep = flk_get_proc_edge();
			pep->to_proc = start_vertex;
			pep->refcount = 1;
			start_vertex->incount++;
			pep->next = in_proc->edge;
			in_proc->edge = pep;
		}
		ep = NEXT_IN(ep);
	}

	if (start_vertex->incount == 0) {
		mutex_exit(&flock_lock);
		return (0);
	}

	flk_proc_graph_uncolor();

	start_vertex->p_sedge = start_vertex->edge;

	STACK_PUSH(process_stack, start_vertex, p_stack);

	while ((pvertex = STACK_TOP(process_stack)) != NULL) {
		for (pep = pvertex->p_sedge; pep != NULL; pep = pep->next) {
			dvertex = pep->to_proc;
			if (!PROC_ARRIVED(dvertex)) {
				STACK_PUSH(process_stack, dvertex, p_stack);
				dvertex->p_sedge = dvertex->edge;
				PROC_ARRIVE(pvertex);
				pvertex->p_sedge = pep->next;
				break;
			}
			if (!PROC_DEPARTED(dvertex))
				goto deadlock;
		}
		if (pep == NULL) {
			PROC_DEPART(pvertex);
			STACK_POP(process_stack, p_stack);
		}
	}
	mutex_exit(&flock_lock);
	return (0);

deadlock:

	/* we remove all lock edges and proc edges */

	ep = FIRST_ADJ(lock);
	while (ep != HEAD(lock)) {
		proc_vertex_t *adj_proc;

		adj_proc = flk_get_proc_vertex(ep->to_vertex);
		nep = NEXT_ADJ(ep);
		IN_LIST_REMOVE(ep);
		ADJ_LIST_REMOVE(ep);
		flk_free_edge(ep);
		ppep = start_vertex->edge;
		for (pep = start_vertex->edge; pep != NULL; ppep = pep,
		    pep = ppep->next) {
			if (pep->to_proc == adj_proc) {
				pep->refcount--;
				if (pep->refcount == 0) {
					if (pep == ppep) {
						start_vertex->edge = pep->next;
					} else {
						ppep->next = pep->next;
					}
					adj_proc->incount--;
					flk_proc_release(adj_proc);
					flk_free_proc_edge(pep);
				}
				break;
			}
		}
		ep = nep;
	}

	ep = FIRST_IN(lock);
	while (ep != HEAD(lock)) {
		proc_vertex_t *in_proc;

		in_proc = flk_get_proc_vertex(ep->from_vertex);
		nep = NEXT_IN(ep);
		IN_LIST_REMOVE(ep);
		ADJ_LIST_REMOVE(ep);
		flk_free_edge(ep);
		ppep = in_proc->edge;
		for (pep = in_proc->edge; pep != NULL; ppep = pep,
		    pep = ppep->next) {
			if (pep->to_proc == start_vertex) {
				pep->refcount--;
				if (pep->refcount == 0) {
					if (pep == ppep) {
						in_proc->edge = pep->next;
					} else {
						ppep->next = pep->next;
					}
					start_vertex->incount--;
					flk_proc_release(in_proc);
					flk_free_proc_edge(pep);
				}
				break;
			}
		}
		ep = nep;
	}
	flk_proc_release(start_vertex);
	mutex_exit(&flock_lock);
	return (1);
}
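/*
 * Illustrative note (a hypothetical example, not from the original source):
 * the DFS above reports a cycle when it finds an edge to a process vertex
 * that is "arrived" but not yet "departed" (PROC_ARRIVED() without
 * PROC_DEPARTED()).  In the simplest two-process case, A's new request is
 * blocked by a lock held by B (proc edge A->B) while B's sleeping request
 * is blocked by a lock held by A (proc edge B->A); walking A->B->A revisits
 * A while it is still arrived-but-not-departed, so control jumps to the
 * deadlock label and the new request's edges are torn down instead of
 * letting A sleep forever.
 */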
/*
 * Get a proc vertex. If the lock's pvertex value points to a correct proc
 * vertex in the list we return that; otherwise we allocate one. If
 * necessary, we also grow the list of vertices.
 */
static proc_vertex_t *
flk_get_proc_vertex(lock_descriptor_t *lock)
{
	int i;
	proc_vertex_t	*pv;
	proc_vertex_t	**palloc;

	ASSERT(MUTEX_HELD(&flock_lock));
	if (lock->pvertex != -1) {
		ASSERT(lock->pvertex >= 0);
		pv = pgraph.proc[lock->pvertex];
		if (pv != NULL && PROC_SAME_OWNER(lock, pv)) {
			return (pv);
		}
	}
	for (i = 0; i < pgraph.gcount; i++) {
		pv = pgraph.proc[i];
		if (pv != NULL && PROC_SAME_OWNER(lock, pv)) {
			lock->pvertex = pv->index = i;
			return (pv);
		}
	}
	pv = kmem_zalloc(sizeof (struct proc_vertex), KM_SLEEP);
	pv->pid = lock->l_flock.l_pid;
	pv->sysid = lock->l_flock.l_sysid;
	flk_proc_vertex_allocs++;
	if (pgraph.free != 0) {
		for (i = 0; i < pgraph.gcount; i++) {
			if (pgraph.proc[i] == NULL) {
				pgraph.proc[i] = pv;
				lock->pvertex = pv->index = i;
				pgraph.free--;
				return (pv);
			}
		}
	}
	palloc = kmem_zalloc((pgraph.gcount + PROC_CHUNK) *
	    sizeof (proc_vertex_t *), KM_SLEEP);

	if (pgraph.proc) {
		bcopy(pgraph.proc, palloc,
		    pgraph.gcount * sizeof (proc_vertex_t *));

		kmem_free(pgraph.proc,
		    pgraph.gcount * sizeof (proc_vertex_t *));
	}
	pgraph.proc = palloc;
	pgraph.free += (PROC_CHUNK - 1);
	pv->index = lock->pvertex = pgraph.gcount;
	pgraph.gcount += PROC_CHUNK;
	pgraph.proc[pv->index] = pv;
	return (pv);
}
/*
 * Allocate a proc edge.
 */
static proc_edge_t *
flk_get_proc_edge()
{
	proc_edge_t *pep;

	pep = kmem_zalloc(sizeof (proc_edge_t), KM_SLEEP);
	flk_proc_edge_allocs++;
	return (pep);
}
/*
 * Free the proc edge. Called whenever its reference count goes to zero.
 */
static void
flk_free_proc_edge(proc_edge_t *pep)
{
	ASSERT(pep->refcount == 0);
	kmem_free((void *)pep, sizeof (proc_edge_t));
	flk_proc_edge_frees++;
}
/*
 * Color the graph explicitly; this is done only when the mark value hits
 * its maximum value.
 */
static void
flk_proc_graph_uncolor()
{
	int i;

	if (pgraph.mark == UINT_MAX) {
		for (i = 0; i < pgraph.gcount; i++)
			if (pgraph.proc[i] != NULL) {
				pgraph.proc[i]->atime = 0;
				pgraph.proc[i]->dtime = 0;
			}
		pgraph.mark = 1;
	} else {
		pgraph.mark++;
	}
}
/*
 * Release the proc vertex iff it has no in edges and no out edges.
 */
static void
flk_proc_release(proc_vertex_t *proc)
{
	ASSERT(MUTEX_HELD(&flock_lock));
	if (proc->edge == NULL && proc->incount == 0) {
		pgraph.proc[proc->index] = NULL;
		pgraph.free++;
		kmem_free(proc, sizeof (proc_vertex_t));
		flk_proc_vertex_frees++;
	}
}
/*
 * Updates the process graph to reflect a change in a lock graph.
 * Note: We should call this function only after we have a correctly
 * recomputed lock graph. Otherwise we might miss a deadlock detection.
 * e.g.: in function flk_relation() we call this function after
 * flk_recompute_dependencies(); otherwise, if a process tries to lock a
 * vnode hashed into another graph it might sleep forever.
 */
static void
flk_update_proc_graph(edge_t *ep, int delete)
{
	proc_vertex_t *toproc, *fromproc;
	proc_edge_t *pep, *prevpep;

	mutex_enter(&flock_lock);

	/*
	 * OFD style locks are not associated with any process so there is
	 * no proc graph for these.
	 */
	if (ep->from_vertex->l_ofd != NULL) {
		mutex_exit(&flock_lock);
		return;
	}

	toproc = flk_get_proc_vertex(ep->to_vertex);
	fromproc = flk_get_proc_vertex(ep->from_vertex);

	if (!delete)
		goto add;
	pep = prevpep = fromproc->edge;

	ASSERT(pep != NULL);
	while (pep != NULL) {
		if (pep->to_proc == toproc) {
			ASSERT(pep->refcount > 0);
			pep->refcount--;
			if (pep->refcount == 0) {
				if (pep == prevpep) {
					fromproc->edge = pep->next;
				} else {
					prevpep->next = pep->next;
				}
				toproc->incount--;
				flk_proc_release(toproc);
				flk_free_proc_edge(pep);
			}
			break;
		}
		prevpep = pep;
		pep = pep->next;
	}
	flk_proc_release(fromproc);
	mutex_exit(&flock_lock);
	return;
add:

	pep = fromproc->edge;

	while (pep != NULL) {
		if (pep->to_proc == toproc) {
			ASSERT(pep->refcount > 0);
			pep->refcount++;
			break;
		}
		pep = pep->next;
	}
	if (pep == NULL) {
		pep = flk_get_proc_edge();
		pep->to_proc = toproc;
		pep->refcount = 1;
		toproc->incount++;
		pep->next = fromproc->edge;
		fromproc->edge = pep;
	}
	mutex_exit(&flock_lock);
}
/*
 * Set the control status for lock manager requests.
 */

/*
 * PSARC case 1997/292
 *
 * Requires: "nlmid" must be >= 1 and <= clconf_maximum_nodeid().
 * Effects: Set the state of the NLM server identified by "nlmid"
 *   in the NLM registry to state "nlm_state."
 *   Raises exception no_such_nlm if "nlmid" doesn't identify a known
 *   NLM server to this LLM.
 *   Note that when this routine is called with NLM_SHUTTING_DOWN there
 *   may be lock requests that have gotten started but not finished.  In
 *   particular, there may be blocking requests that are in the callback code
 *   before sleeping (so they're not holding the lock for the graph).  If
 *   such a thread reacquires the graph's lock (to go to sleep) after
 *   the NLM state in the NLM registry is set to a non-up value,
 *   it will notice the status and bail out.  If the request gets
 *   granted before the thread can check the NLM registry, let it
 *   continue normally.  It will get flushed when we are called with NLM_DOWN.
 *
 * Modifies: nlm_reg_obj (global)
 *   nlmid	(IN):    id uniquely identifying an NLM server
 *   nlm_state	(IN):    NLM server state to change "nlmid" to
 */
void
cl_flk_set_nlm_status(int nlmid, flk_nlm_status_t nlm_state)
{
	/*
	 * Check to see if node is booted as a cluster. If not, return.
	 */
	if ((cluster_bootflags & CLUSTER_BOOTED) == 0) {
		return;
	}

	/*
	 * Check for development/debugging.  It is possible to boot a node
	 * in non-cluster mode, and then run a special script, currently
	 * available only to developers, to bring up the node as part of a
	 * cluster.  The problem is that running such a script does not
	 * result in the routine flk_init() being called and hence global array
	 * nlm_reg_status is NULL.  The NLM thinks it's in cluster mode,
	 * but the LLM needs to do an additional check to see if the global
	 * array has been created or not. If nlm_reg_status is NULL, then
	 * return, else continue.
	 */
	if (nlm_reg_status == NULL) {
		return;
	}

	ASSERT(nlmid <= nlm_status_size && nlmid >= 0);
	mutex_enter(&nlm_reg_lock);

	if (FLK_REGISTRY_IS_NLM_UNKNOWN(nlm_reg_status, nlmid)) {
		/*
		 * If the NLM server "nlmid" is unknown in the NLM registry,
		 * add it to the registry in the nlm shutting down state.
		 */
		FLK_REGISTRY_CHANGE_NLM_STATE(nlm_reg_status, nlmid,
		    FLK_NLM_SHUTTING_DOWN);
	} else {
		/*
		 * Change the state of the NLM server identified by "nlmid"
		 * in the NLM registry to the argument "nlm_state."
		 */
		FLK_REGISTRY_CHANGE_NLM_STATE(nlm_reg_status, nlmid,
		    nlm_state);
	}

	/*
	 *  The reason we must register the NLM server that is shutting down
	 *  with an LLM that doesn't already know about it (never sent a lock
	 *  request) is to handle correctly a race between shutdown and a new
	 *  lock request.  Suppose that a shutdown request from the NLM server
	 *  invokes this routine at the LLM, and a thread is spawned to
	 *  service the request. Now suppose a new lock request is in
	 *  progress and has already passed the first line of defense in
	 *  reclock(), which denies new lock requests from NLM servers
	 *  that are not in the NLM_UP state.  After the current routine
	 *  is invoked for both phases of shutdown, the routine will return,
	 *  having done nothing, and the lock request will proceed and
	 *  probably be granted.  The problem is that the shutdown was ignored
	 *  by the lock request because there was no record of that NLM server
	 *  shutting down.  We will be in the peculiar position of thinking
	 *  that we've shut down the NLM server and all locks at all LLMs have
	 *  been discarded, but in fact there's still one lock held.
	 *  The solution is to record the existence of the NLM server and
	 *  change its state immediately to NLM_SHUTTING_DOWN.  The lock
	 *  request in progress may proceed because the next phase NLM_DOWN
	 *  will catch this lock and discard it.
	 */
	mutex_exit(&nlm_reg_lock);

	switch (nlm_state) {
	case FLK_NLM_UP:
		/*
		 * Change the NLM state of all locks still held on behalf of
		 * the NLM server identified by "nlmid" to NLM_UP.
		 */
		cl_flk_change_nlm_state_all_locks(nlmid, FLK_NLM_UP);
		break;

	case FLK_NLM_SHUTTING_DOWN:
		/*
		 * Wake up all sleeping locks for the NLM server identified
		 * by "nlmid." Note that eventually all woken threads will
		 * have their lock requests cancelled and descriptors
		 * removed from the sleeping lock list.  Note that the NLM
		 * server state associated with each lock descriptor is
		 * changed to FLK_NLM_SHUTTING_DOWN.
		 */
		cl_flk_wakeup_sleeping_nlm_locks(nlmid);
		break;

	case FLK_NLM_DOWN:
		/*
		 * Discard all active, granted locks for this NLM server
		 * identified by "nlmid."
		 */
		cl_flk_unlock_nlm_granted(nlmid);
		break;

	default:
		panic("cl_set_nlm_status: bad status (%d)", nlm_state);
	}
}
/*
 * Set the control status for lock manager requests.
 *
 * Note that when this routine is called with FLK_WAKEUP_SLEEPERS, there
 * may be lock requests that have gotten started but not finished.  In
 * particular, there may be blocking requests that are in the callback code
 * before sleeping (so they're not holding the lock for the graph).  If
 * such a thread reacquires the graph's lock (to go to sleep) after
 * flk_lockmgr_status is set to a non-up value, it will notice the status
 * and bail out.  If the request gets granted before the thread can check
 * flk_lockmgr_status, let it continue normally.  It will get flushed when
 * we are called with FLK_LOCKMGR_DOWN.
 */
void
flk_set_lockmgr_status(flk_lockmgr_status_t status)
{
	int i;
	graph_t *gp;
	struct flock_globals *fg;

	fg = flk_get_globals();

	mutex_enter(&flock_lock);
	fg->flk_lockmgr_status = status;
	mutex_exit(&flock_lock);

	/*
	 * If the lock manager is coming back up, all that's needed is to
	 * propagate this information to the graphs.  If the lock manager
	 * is going down, additional action is required, and each graph's
	 * copy of the state is updated atomically with this other action.
	 */
	switch (status) {
	case FLK_LOCKMGR_UP:
		for (i = 0; i < HASH_SIZE; i++) {
			mutex_enter(&flock_lock);
			gp = lock_graph[i];
			mutex_exit(&flock_lock);
			if (gp == NULL)
				continue;
			mutex_enter(&gp->gp_mutex);
			fg->lockmgr_status[i] = status;
			mutex_exit(&gp->gp_mutex);
		}
		break;
	case FLK_WAKEUP_SLEEPERS:
		wakeup_sleeping_lockmgr_locks(fg);
		break;
	case FLK_LOCKMGR_DOWN:
		unlock_lockmgr_granted(fg);
		break;
	default:
		panic("flk_set_lockmgr_status: bad status (%d)", status);
		break;
	}
}
/*
 * This routine returns all the locks that are active or sleeping and are
 * associated with a particular set of identifiers.  If lock_state != 0, then
 * only locks that match the lock_state are returned. If lock_state == 0, then
 * all locks are returned. If pid == NOPID, the pid is ignored.  If
 * use_sysid is FALSE, then the sysid is ignored.  If vp is NULL, then the
 * vnode pointer is ignored.
 *
 * A list containing the vnode pointer and an flock structure
 * describing the lock is returned.  Each element in the list is
 * dynamically allocated and must be freed by the caller.  The
 * last item in the list is denoted by a NULL value in the ll_next
 * field.
 *
 * The vnode pointers returned are held.  The caller is responsible
 * for releasing these.  Note that the returned list is only a snapshot of
 * the current lock information, and that it is a snapshot of a moving
 * target (only one graph is locked at a time).
 */
static locklist_t *
get_lock_list(int list_type, int lock_state, int sysid, boolean_t use_sysid,
    pid_t pid, const vnode_t *vp, zoneid_t zoneid)
{
	lock_descriptor_t	*lock;
	lock_descriptor_t	*graph_head;
	locklist_t		listhead;
	locklist_t		*llheadp;
	locklist_t		*llp;
	locklist_t		*lltp;
	graph_t			*gp;
	int			i;
	int			first_index; /* graph index */
	int			num_indexes; /* graph index */

	ASSERT((list_type == FLK_ACTIVE_STATE) ||
	    (list_type == FLK_SLEEPING_STATE));

	/*
	 * Get a pointer to something to use as a list head while building
	 * the rest of the list.
	 */
	llheadp = &listhead;
	lltp = llheadp;
	llheadp->ll_next = (locklist_t *)NULL;

	/* Figure out which graphs we want to look at. */
	if (vp == NULL) {
		first_index = 0;
		num_indexes = HASH_SIZE;
	} else {
		first_index = HASH_INDEX(vp);
		num_indexes = 1;
	}

	for (i = first_index; i < first_index + num_indexes; i++) {
		mutex_enter(&flock_lock);
		gp = lock_graph[i];
		mutex_exit(&flock_lock);
		if (gp == NULL) {
			continue;
		}

		mutex_enter(&gp->gp_mutex);
		graph_head = (list_type == FLK_ACTIVE_STATE) ?
		    ACTIVE_HEAD(gp) : SLEEPING_HEAD(gp);
		for (lock = graph_head->l_next;
		    lock != graph_head;
		    lock = lock->l_next) {
			if (use_sysid && lock->l_flock.l_sysid != sysid)
				continue;
			if (pid != NOPID && lock->l_flock.l_pid != pid)
				continue;
			if (vp != NULL && lock->l_vnode != vp)
				continue;
			if (lock_state && !(lock_state & lock->l_state))
				continue;
			if (zoneid != lock->l_zoneid && zoneid != ALL_ZONES)
				continue;
			/*
			 * A matching lock was found.  Allocate
			 * space for a new locklist entry and fill
			 * it in.
			 */
			llp = kmem_alloc(sizeof (locklist_t), KM_SLEEP);
			lltp->ll_next = llp;
			VN_HOLD(lock->l_vnode);
			llp->ll_vp = lock->l_vnode;
			create_flock(lock, &(llp->ll_flock));
			llp->ll_next = (locklist_t *)NULL;
			lltp = llp;
		}
		mutex_exit(&gp->gp_mutex);
	}

	llp = llheadp->ll_next;
	return (llp);
}
/*
 * These two functions are simply interfaces to get_lock_list.  They return
 * a list of sleeping or active locks for the given sysid and pid.  See
 * get_lock_list for details.
 *
 * In either case we don't particularly care to specify the zone of interest;
 * the sysid-space is global across zones, so the sysid will map to exactly one
 * zone, and we'll return information for that zone.
 */
locklist_t *
flk_get_sleeping_locks(int sysid, pid_t pid)
{
	return (get_lock_list(FLK_SLEEPING_STATE, 0, sysid, B_TRUE, pid, NULL,
	    ALL_ZONES));
}

locklist_t *
flk_get_active_locks(int sysid, pid_t pid)
{
	return (get_lock_list(FLK_ACTIVE_STATE, 0, sysid, B_TRUE, pid, NULL,
	    ALL_ZONES));
}
/*
 * Another interface to get_lock_list.  This one returns all the active
 * locks for a given vnode.  Again, see get_lock_list for details.
 *
 * We don't need to specify which zone's locks we're interested in.  The matter
 * would only be interesting if the vnode belonged to NFS, and NFS vnodes can't
 * be used by multiple zones, so the list of locks will all be from the right
 * zone.
 */
locklist_t *
flk_active_locks_for_vp(const vnode_t *vp)
{
	return (get_lock_list(FLK_ACTIVE_STATE, 0, 0, B_FALSE, NOPID, vp,
	    ALL_ZONES));
}
/*
 * Another interface to get_lock_list.  This one returns all the active
 * nbmand locks for a given vnode.  Again, see get_lock_list for details.
 *
 * See the comment for flk_active_locks_for_vp() for why we don't care to
 * specify the particular zone of interest.
 */
locklist_t *
flk_active_nbmand_locks_for_vp(const vnode_t *vp)
{
	return (get_lock_list(FLK_ACTIVE_STATE, NBMAND_LOCK, 0, B_FALSE,
	    NOPID, vp, ALL_ZONES));
}
/*
 * Another interface to get_lock_list.  This one returns all the active
 * nbmand locks for a given pid.  Again, see get_lock_list for details.
 *
 * The zone doesn't need to be specified here; the locks held by a
 * particular process will either be local (ie, non-NFS) or from the zone
 * the process is executing in.  This is because other parts of the system
 * ensure that an NFS vnode can't be used in a zone other than that in
 * which it was opened.
 */
locklist_t *
flk_active_nbmand_locks(pid_t pid)
{
	return (get_lock_list(FLK_ACTIVE_STATE, NBMAND_LOCK, 0, B_FALSE,
	    pid, NULL, ALL_ZONES));
}
/*
 * Free up all entries in the locklist.
 */
void
flk_free_locklist(locklist_t *llp)
{
	locklist_t *next_llp;

	while (llp) {
		next_llp = llp->ll_next;
		VN_RELE(llp->ll_vp);
		kmem_free(llp, sizeof (*llp));
		llp = next_llp;
	}
}
static void
cl_flk_change_nlm_state_all_locks(int nlmid, flk_nlm_status_t nlm_state)
{
	/*
	 * For each graph "lg" in the hash table lock_graph do
	 * a. Get the list of sleeping locks
	 * b. For each lock descriptor in the list do
	 *	i.   If the requested lock is an NLM server request AND
	 *		the nlmid is the same as the routine argument then
	 *		change the lock descriptor's state field to
	 *		"nlm_state."
	 * c. Get the list of active locks
	 * d. For each lock descriptor in the list do
	 *	i.   If the requested lock is an NLM server request AND
	 *		the nlmid is the same as the routine argument then
	 *		change the lock descriptor's state field to
	 *		"nlm_state."
	 */

	int			i;
	graph_t			*gp;			/* lock graph */
	lock_descriptor_t	*lock;			/* lock */
	lock_descriptor_t	*nlock = NULL;		/* next lock */
	int			lock_nlmid;

	for (i = 0; i < HASH_SIZE; i++) {
		mutex_enter(&flock_lock);
		gp = lock_graph[i];
		mutex_exit(&flock_lock);
		if (gp == NULL) {
			continue;
		}

		/* Get list of sleeping locks in current lock graph. */
		mutex_enter(&gp->gp_mutex);
		for (lock = SLEEPING_HEAD(gp)->l_next;
		    lock != SLEEPING_HEAD(gp);
		    lock = nlock) {
			nlock = lock->l_next;
			/* get NLM id */
			lock_nlmid = GETNLMID(lock->l_flock.l_sysid);

			/*
			 * If NLM server request AND nlmid of lock matches
			 * nlmid of argument, then set the NLM state of the
			 * lock to "nlm_state."
			 */
			if (IS_LOCKMGR(lock) && nlmid == lock_nlmid) {
				SET_NLM_STATE(lock, nlm_state);
			}
		}

		/* Get list of active locks in current lock graph. */
		for (lock = ACTIVE_HEAD(gp)->l_next;
		    lock != ACTIVE_HEAD(gp);
		    lock = nlock) {
			nlock = lock->l_next;
			/* get NLM id */
			lock_nlmid = GETNLMID(lock->l_flock.l_sysid);

			/*
			 * If NLM server request AND nlmid of lock matches
			 * nlmid of argument, then set the NLM state of the
			 * lock to "nlm_state."
			 */
			if (IS_LOCKMGR(lock) && nlmid == lock_nlmid) {
				ASSERT(IS_ACTIVE(lock));
				SET_NLM_STATE(lock, nlm_state);
			}
		}
		mutex_exit(&gp->gp_mutex);
	}
}
/*
 * Requires: "nlmid" >= 1 and <= clconf_maximum_nodeid().
 * Effects: Find all sleeping lock manager requests _only_ for the NLM server
 *  identified by "nlmid." Poke those lock requests.
 */
static void
cl_flk_wakeup_sleeping_nlm_locks(int nlmid)
{
	lock_descriptor_t *lock;
	lock_descriptor_t *nlock = NULL; /* next lock */
	int i;
	graph_t *gp;
	int	lock_nlmid;

	for (i = 0; i < HASH_SIZE; i++) {
		mutex_enter(&flock_lock);
		gp = lock_graph[i];
		mutex_exit(&flock_lock);
		if (gp == NULL) {
			continue;
		}

		mutex_enter(&gp->gp_mutex);
		for (lock = SLEEPING_HEAD(gp)->l_next;
		    lock != SLEEPING_HEAD(gp);
		    lock = nlock) {
			nlock = lock->l_next;
			/*
			 * If NLM server request _and_ nlmid of lock matches
			 * nlmid of argument, then set the NLM state of the
			 * lock to NLM_SHUTTING_DOWN, and wake up sleeping
			 * request.
			 */
			if (IS_LOCKMGR(lock)) {
				/* get NLM id */
				lock_nlmid =
				    GETNLMID(lock->l_flock.l_sysid);
				if (nlmid == lock_nlmid) {
					SET_NLM_STATE(lock,
					    FLK_NLM_SHUTTING_DOWN);
					INTERRUPT_WAKEUP(lock);
				}
			}
		}
		mutex_exit(&gp->gp_mutex);
	}
}
/*
 * Requires: "nlmid" >= 1 and <= clconf_maximum_nodeid()
 * Effects: Find all active (granted) lock manager locks _only_ for the
 *  NLM server identified by "nlmid" and release them.
 */
static void
cl_flk_unlock_nlm_granted(int nlmid)
{
	lock_descriptor_t *lock;
	lock_descriptor_t *nlock = NULL; /* next lock */
	int i;
	graph_t *gp;
	int	lock_nlmid;

	for (i = 0; i < HASH_SIZE; i++) {
		mutex_enter(&flock_lock);
		gp = lock_graph[i];
		mutex_exit(&flock_lock);
		if (gp == NULL) {
			continue;
		}

		mutex_enter(&gp->gp_mutex);
		for (lock = ACTIVE_HEAD(gp)->l_next;
		    lock != ACTIVE_HEAD(gp);
		    lock = nlock) {
			nlock = lock->l_next;
			ASSERT(IS_ACTIVE(lock));

			/*
			 * If it's an NLM server request _and_ nlmid of
			 * the lock matches nlmid of argument, then
			 * remove the active lock from the list, wake up
			 * blocked threads, and free the storage for the lock.
			 * Note that there's no need to mark the NLM state
			 * of this lock to NLM_DOWN because the lock will
			 * be deleted anyway and its storage freed.
			 */
			if (IS_LOCKMGR(lock)) {
				/* get NLM id */
				lock_nlmid = GETNLMID(lock->l_flock.l_sysid);
				if (nlmid == lock_nlmid) {
					flk_delete_active_lock(lock, 0);
					flk_wakeup(lock, 1);
					flk_free_lock(lock);
				}
			}
		}
		mutex_exit(&gp->gp_mutex);
	}
}
/*
 * Find all sleeping lock manager requests and poke them.
 */
static void
wakeup_sleeping_lockmgr_locks(struct flock_globals *fg)
{
	lock_descriptor_t *lock;
	lock_descriptor_t *nlock = NULL; /* next lock */
	int i;
	graph_t *gp;
	zoneid_t zoneid = getzoneid();

	for (i = 0; i < HASH_SIZE; i++) {
		mutex_enter(&flock_lock);
		gp = lock_graph[i];
		mutex_exit(&flock_lock);
		if (gp == NULL) {
			continue;
		}

		mutex_enter(&gp->gp_mutex);
		fg->lockmgr_status[i] = FLK_WAKEUP_SLEEPERS;
		for (lock = SLEEPING_HEAD(gp)->l_next;
		    lock != SLEEPING_HEAD(gp);
		    lock = nlock) {
			nlock = lock->l_next;
			if (IS_LOCKMGR(lock) && lock->l_zoneid == zoneid) {
				INTERRUPT_WAKEUP(lock);
			}
		}
		mutex_exit(&gp->gp_mutex);
	}
}
/*
 * Find all active (granted) lock manager locks and release them.
 */
static void
unlock_lockmgr_granted(struct flock_globals *fg)
{
	lock_descriptor_t *lock;
	lock_descriptor_t *nlock = NULL; /* next lock */
	int i;
	graph_t *gp;
	zoneid_t zoneid = getzoneid();

	for (i = 0; i < HASH_SIZE; i++) {
		mutex_enter(&flock_lock);
		gp = lock_graph[i];
		mutex_exit(&flock_lock);
		if (gp == NULL) {
			continue;
		}

		mutex_enter(&gp->gp_mutex);
		fg->lockmgr_status[i] = FLK_LOCKMGR_DOWN;
		for (lock = ACTIVE_HEAD(gp)->l_next;
		    lock != ACTIVE_HEAD(gp);
		    lock = nlock) {
			nlock = lock->l_next;
			if (IS_LOCKMGR(lock) && lock->l_zoneid == zoneid) {
				ASSERT(IS_ACTIVE(lock));
				flk_delete_active_lock(lock, 0);
				flk_wakeup(lock, 1);
				flk_free_lock(lock);
			}
		}
		mutex_exit(&gp->gp_mutex);
	}
}
/*
 * Wait until a lock is granted, cancelled, or interrupted.
 */
static void
wait_for_lock(lock_descriptor_t *request)
{
	graph_t *gp = request->l_graph;

	ASSERT(MUTEX_HELD(&gp->gp_mutex));

	while (!(IS_GRANTED(request)) && !(IS_CANCELLED(request)) &&
	    !(IS_INTERRUPTED(request))) {
		if (!cv_wait_sig(&request->l_cv, &gp->gp_mutex)) {
			flk_set_state(request, FLK_INTERRUPTED_STATE);
			request->l_state |= INTERRUPTED_LOCK;
		}
	}
}
/*
 * Create an flock structure from the existing lock information.
 *
 * This routine is used to create flock structures for the lock manager
 * to use in a reclaim request.  Since the lock was originated on this
 * host, it must be conforming to UNIX semantics, so no checking is
 * done to make sure it falls within the lower half of the 32-bit range.
 */
static void
create_flock(lock_descriptor_t *lp, flock64_t *flp)
{
	ASSERT(lp->l_end == MAX_U_OFFSET_T || lp->l_end <= MAXEND);
	ASSERT(lp->l_end >= lp->l_start);

	flp->l_type = lp->l_type;
	flp->l_whence = 0;
	flp->l_start = lp->l_start;
	flp->l_len = (lp->l_end == MAX_U_OFFSET_T) ? 0 :
	    (lp->l_end - lp->l_start + 1);
	flp->l_sysid = lp->l_flock.l_sysid;
	flp->l_pid = lp->l_flock.l_pid;
}
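/*
 * Worked example (illustrative): an internal lock covering bytes 10..19 has
 * l_start == 10 and l_end == 19, so the flock64 produced above gets
 * l_len = 19 - 10 + 1 = 10.  A lock to end-of-file is stored internally with
 * l_end == MAX_U_OFFSET_T and is reported with the conventional l_len = 0.
 */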
/*
 * Convert flock_t data describing a lock range into unsigned long starting
 * and ending points, which are put into lock_request.  Returns 0 or an
 * errno value.
 */
int
flk_convert_lock_data(vnode_t *vp, flock64_t *flp,
    u_offset_t *start, u_offset_t *end, offset_t offset)
{
	struct vattr	vattr;
	int	error;

	/*
	 * Determine the starting point of the request
	 */
	switch (flp->l_whence) {
	case 0:		/* SEEK_SET */
		*start = (u_offset_t)flp->l_start;
		break;
	case 1:		/* SEEK_CUR */
		*start = (u_offset_t)(flp->l_start + offset);
		break;
	case 2:		/* SEEK_END */
		vattr.va_mask = AT_SIZE;
		if (error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL))
			return (error);
		*start = (u_offset_t)(flp->l_start + vattr.va_size);
		break;
	default:
		return (EINVAL);
	}

	/*
	 * Determine the range covered by the request.
	 */
	if (flp->l_len == 0)
		*end = MAX_U_OFFSET_T;
	else if ((offset_t)flp->l_len > 0) {
		*end = (u_offset_t)(*start + (flp->l_len - 1));
	} else {
		/*
		 * Negative length; why do we even allow this ?
		 * Because this allows easy specification of
		 * the last n bytes of the file.
		 */
		*end = *start;
		*start += (u_offset_t)flp->l_len;
		(*start)++;
	}
	return (0);
}
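/*
 * Worked example (illustrative): with l_whence == 0, l_start == 100 and
 * l_len == -10, the negative-length branch above computes *end = 100 and
 * then *start = 100 + (-10) + 1 = 91, so the request covers the ten bytes
 * 91..100 ending at the original start offset -- the "last n bytes"
 * convenience mentioned in the comment.
 */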
/*
 * Check the validity of lock data.  This can be used by the NFS
 * frlock routines to check data before contacting the server.  The
 * server must support semantics that aren't as restrictive as
 * the UNIX API, so the NFS client is required to check.
 * The maximum is now passed in by the caller.
 */
int
flk_check_lock_data(u_offset_t start, u_offset_t end, offset_t max)
{
	/*
	 * The end (length) for local locking should never be greater
	 * than MAXEND. However, the representation for
	 * the entire file is MAX_U_OFFSET_T.
	 */
	if ((start > max) ||
	    ((end > max) && (end != MAX_U_OFFSET_T))) {
		return (EINVAL);
	}
	if (start > end) {
		return (EINVAL);
	}
	return (0);
}
/*
 * Fill in request->l_flock with information about the lock blocking the
 * request.  The complexity here is that lock manager requests are allowed
 * to see into the upper part of the 32-bit address range, whereas local
 * requests are only allowed to see signed values.
 *
 * What should be done when "blocker" is a lock manager lock that uses the
 * upper portion of the 32-bit range, but "request" is local?  Since the
 * request has already been determined to have been blocked by the blocker,
 * at least some portion of "blocker" must be in the range of the request,
 * or the request extends to the end of file.  For the first case, the
 * portion in the lower range is returned with the indication that it goes
 * "to EOF."  For the second case, the last byte of the lower range is
 * returned with the indication that it goes "to EOF."
 */
static void
report_blocker(lock_descriptor_t *blocker, lock_descriptor_t *request)
{
	flock64_t *flrp;			/* l_flock portion of request */

	ASSERT(blocker != NULL);

	flrp = &request->l_flock;
	flrp->l_whence = 0;
	flrp->l_type = blocker->l_type;
	flrp->l_pid = blocker->l_flock.l_pid;
	flrp->l_sysid = blocker->l_flock.l_sysid;
	request->l_ofd = blocker->l_ofd;

	if (IS_LOCKMGR(request)) {
		flrp->l_start = blocker->l_start;
		if (blocker->l_end == MAX_U_OFFSET_T)
			flrp->l_len = 0;
		else
			flrp->l_len = blocker->l_end - blocker->l_start + 1;
	} else {
		if (blocker->l_start > MAXEND) {
			flrp->l_start = MAXEND;
			flrp->l_len = 0;
		} else {
			flrp->l_start = blocker->l_start;
			if (blocker->l_end == MAX_U_OFFSET_T)
				flrp->l_len = 0;
			else
				flrp->l_len = blocker->l_end -
				    blocker->l_start + 1;
		}
	}
}
/*
 * PSARC case 1997/292
 *
 * This is the public routine exported by flock.h.
 */
void
cl_flk_change_nlm_state_to_unknown(int nlmid)
{
	/*
	 * Check to see if node is booted as a cluster. If not, return.
	 */
	if ((cluster_bootflags & CLUSTER_BOOTED) == 0) {
		return;
	}

	/*
	 * See comment in cl_flk_set_nlm_status().
	 */
	if (nlm_reg_status == NULL) {
		return;
	}

	/*
	 * protect NLM registry state with a mutex.
	 */
	ASSERT(nlmid <= nlm_status_size && nlmid >= 0);
	mutex_enter(&nlm_reg_lock);
	FLK_REGISTRY_CHANGE_NLM_STATE(nlm_reg_status, nlmid, FLK_NLM_UNKNOWN);
	mutex_exit(&nlm_reg_lock);
}
/*
 * Return non-zero if the given I/O request conflicts with an active NBMAND
 * lock.
 * If svmand is non-zero, it means look at all active locks, not just NBMAND
 * locks.
 */
int
nbl_lock_conflict(vnode_t *vp, nbl_op_t op, u_offset_t offset,
    ssize_t length, int svmand, caller_context_t *ct)
{
	int conflict = 0;
	graph_t			*gp;
	lock_descriptor_t	*lock;
	pid_t pid;
	int sysid;

	if (ct == NULL) {
		pid = curproc->p_pid;
		sysid = 0;
	} else {
		pid = ct->cc_pid;
		sysid = ct->cc_sysid;
	}

	mutex_enter(&flock_lock);
	gp = lock_graph[HASH_INDEX(vp)];
	mutex_exit(&flock_lock);
	if (gp == NULL)
		return (0);

	mutex_enter(&gp->gp_mutex);
	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);

	for (; lock && lock->l_vnode == vp; lock = lock->l_next) {
		if ((svmand || (lock->l_state & NBMAND_LOCK)) &&
		    (lock->l_flock.l_sysid != sysid ||
		    lock->l_flock.l_pid != pid) &&
		    lock_blocks_io(op, offset, length,
		    lock->l_type, lock->l_start, lock->l_end)) {
			conflict = 1;
			break;
		}
	}
	mutex_exit(&gp->gp_mutex);

	return (conflict);
}
/*
 * Return non-zero if the given I/O request conflicts with the given lock.
 */
static int
lock_blocks_io(nbl_op_t op, u_offset_t offset, ssize_t length,
    int lock_type, u_offset_t lock_start, u_offset_t lock_end)
{
	ASSERT(op == NBL_READ || op == NBL_WRITE || op == NBL_READWRITE);
	ASSERT(lock_type == F_RDLCK || lock_type == F_WRLCK);

	if (op == NBL_READ && lock_type == F_RDLCK)
		return (0);

	if (offset <= lock_start && lock_start < offset + length)
		return (1);
	if (lock_start <= offset && offset <= lock_end)
		return (1);

	return (0);
}
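/*
 * Worked example (illustrative): for an NBL_WRITE of length 10 at offset 100
 * against an F_RDLCK with lock_start == 105 and lock_end == 200, the first
 * range test (100 <= 105 && 105 < 110) holds, so the request conflicts and
 * 1 is returned.  An NBL_READ against the same F_RDLCK returns 0 right away,
 * since two read locks never block each other here.
 */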
static void
check_active_locks(graph_t *gp)
{
	lock_descriptor_t *lock, *lock1;
	edge_t	*ep;

	for (lock = ACTIVE_HEAD(gp)->l_next; lock != ACTIVE_HEAD(gp);
	    lock = lock->l_next) {
		ASSERT(IS_ACTIVE(lock));
		ASSERT(NOT_BLOCKED(lock));
		ASSERT(!IS_BARRIER(lock));

		ep = FIRST_IN(lock);

		while (ep != HEAD(lock)) {
			ASSERT(IS_SLEEPING(ep->from_vertex));
			ASSERT(!NOT_BLOCKED(ep->from_vertex));
			ep = NEXT_IN(ep);
		}

		for (lock1 = lock->l_next; lock1 != ACTIVE_HEAD(gp);
		    lock1 = lock1->l_next) {
			if (lock1->l_vnode == lock->l_vnode) {
				if (BLOCKS(lock1, lock)) {
					cmn_err(CE_PANIC,
					    "active lock %p blocks %p",
					    (void *)lock1, (void *)lock);
				} else if (BLOCKS(lock, lock1)) {
					cmn_err(CE_PANIC,
					    "active lock %p blocks %p",
					    (void *)lock, (void *)lock1);
				}
			}
		}
	}
}
/*
 * Effect: This function checks to see if the transition from 'old_state' to
 *	'new_state' is a valid one.  It returns 0 if the transition is valid
 *	and 1 if it is not.
 * For a map of valid transitions, see sys/flock_impl.h
 */
static int
check_lock_transition(int old_state, int new_state)
{
	switch (old_state) {
	case FLK_INITIAL_STATE:
		if ((new_state == FLK_START_STATE) ||
		    (new_state == FLK_SLEEPING_STATE) ||
		    (new_state == FLK_ACTIVE_STATE) ||
		    (new_state == FLK_DEAD_STATE)) {
			return (0);
		} else {
			return (1);
		}
	case FLK_START_STATE:
		if ((new_state == FLK_ACTIVE_STATE) ||
		    (new_state == FLK_DEAD_STATE)) {
			return (0);
		} else {
			return (1);
		}
	case FLK_ACTIVE_STATE:
		if (new_state == FLK_DEAD_STATE) {
			return (0);
		} else {
			return (1);
		}
	case FLK_SLEEPING_STATE:
		if ((new_state == FLK_GRANTED_STATE) ||
		    (new_state == FLK_INTERRUPTED_STATE) ||
		    (new_state == FLK_CANCELLED_STATE)) {
			return (0);
		} else {
			return (1);
		}
	case FLK_GRANTED_STATE:
		if ((new_state == FLK_START_STATE) ||
		    (new_state == FLK_INTERRUPTED_STATE) ||
		    (new_state == FLK_CANCELLED_STATE)) {
			return (0);
		} else {
			return (1);
		}
	case FLK_CANCELLED_STATE:
		if ((new_state == FLK_INTERRUPTED_STATE) ||
		    (new_state == FLK_DEAD_STATE)) {
			return (0);
		} else {
			return (1);
		}
	case FLK_INTERRUPTED_STATE:
		if (new_state == FLK_DEAD_STATE) {
			return (0);
		} else {
			return (1);
		}
	case FLK_DEAD_STATE:
		/* May be set more than once */
		if (new_state == FLK_DEAD_STATE) {
			return (0);
		} else {
			return (1);
		}
	default:
		return (1);
	}
}
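/*
 * Illustrative note: per the switch above, FLK_SLEEPING_STATE ->
 * FLK_GRANTED_STATE is a valid transition (returns 0), while
 * FLK_SLEEPING_STATE -> FLK_ACTIVE_STATE is not (returns 1); a sleeping
 * request must first be granted before it can become active, which is what
 * the CHECK_LOCK_TRANSITION() debug check enforces.
 */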
static void
check_sleeping_locks(graph_t *gp)
{
	lock_descriptor_t *lock1, *lock2;
	edge_t *ep;

	for (lock1 = SLEEPING_HEAD(gp)->l_next; lock1 != SLEEPING_HEAD(gp);
	    lock1 = lock1->l_next) {
		ASSERT(!IS_BARRIER(lock1));
		for (lock2 = lock1->l_next; lock2 != SLEEPING_HEAD(gp);
		    lock2 = lock2->l_next) {
			if (lock1->l_vnode == lock2->l_vnode) {
				if (BLOCKS(lock2, lock1)) {
					ASSERT(!IS_GRANTED(lock1));
					ASSERT(!NOT_BLOCKED(lock1));
					path(lock1, lock2);
				}
			}
		}

		for (lock2 = ACTIVE_HEAD(gp)->l_next; lock2 != ACTIVE_HEAD(gp);
		    lock2 = lock2->l_next) {
			ASSERT(!IS_BARRIER(lock1));
			if (lock1->l_vnode == lock2->l_vnode) {
				if (BLOCKS(lock2, lock1)) {
					ASSERT(!IS_GRANTED(lock1));
					ASSERT(!NOT_BLOCKED(lock1));
					path(lock1, lock2);
				}
			}
		}
		ep = FIRST_ADJ(lock1);
		while (ep != HEAD(lock1)) {
			ASSERT(BLOCKS(ep->to_vertex, lock1));
			ep = NEXT_ADJ(ep);
		}
	}
}
static int
level_two_path(lock_descriptor_t *lock1, lock_descriptor_t *lock2, int no_path)
{
	edge_t	*ep;
	lock_descriptor_t	*vertex;
	lock_descriptor_t *vertex_stack;

	STACK_INIT(vertex_stack);

	flk_graph_uncolor(lock1->l_graph);
	ep = FIRST_ADJ(lock1);
	ASSERT(ep != HEAD(lock1));
	while (ep != HEAD(lock1)) {
		if (no_path)
			ASSERT(ep->to_vertex != lock2);
		STACK_PUSH(vertex_stack, ep->to_vertex, l_dstack);
		COLOR(ep->to_vertex);
		ep = NEXT_ADJ(ep);
	}

	while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
		STACK_POP(vertex_stack, l_dstack);
		for (ep = FIRST_ADJ(vertex); ep != HEAD(vertex);
		    ep = NEXT_ADJ(ep)) {
			if (COLORED(ep->to_vertex))
				continue;
			COLOR(ep->to_vertex);
			if (ep->to_vertex == lock2)
				return (1);

			STACK_PUSH(vertex_stack, ep->to_vertex, l_dstack);
		}
	}
	return (0);
}
static void
check_owner_locks(graph_t *gp, pid_t pid, int sysid, vnode_t *vp)
{
	lock_descriptor_t *lock;

	/* Ignore OFD style locks since they're not process-wide. */
	if (pid == 0)
		return;

	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);

	if (lock) {
		while (lock != ACTIVE_HEAD(gp) && (lock->l_vnode == vp)) {
			if (lock->l_flock.l_pid == pid &&
			    lock->l_flock.l_sysid == sysid)
				cmn_err(CE_PANIC,
				    "owner pid %d's lock %p in active queue",
				    pid, (void *)lock);
			lock = lock->l_next;
		}
	}
	SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);

	if (lock) {
		while (lock != SLEEPING_HEAD(gp) && (lock->l_vnode == vp)) {
			if (lock->l_flock.l_pid == pid &&
			    lock->l_flock.l_sysid == sysid)
				cmn_err(CE_PANIC,
				    "owner pid %d's lock %p in sleep queue",
				    pid, (void *)lock);
			lock = lock->l_next;
		}
	}
}
static int
level_one_path(lock_descriptor_t *lock1, lock_descriptor_t *lock2)
{
	edge_t	*ep = FIRST_ADJ(lock1);

	while (ep != HEAD(lock1)) {
		if (ep->to_vertex == lock2)
			return (1);
		else
			ep = NEXT_ADJ(ep);
	}
	return (0);
}
static int
no_path(lock_descriptor_t *lock1, lock_descriptor_t *lock2)
{
	return (!level_two_path(lock1, lock2, 1));
}
static void
path(lock_descriptor_t *lock1, lock_descriptor_t *lock2)
{
	if (level_one_path(lock1, lock2)) {
		if (level_two_path(lock1, lock2, 0) != 0) {
			cmn_err(CE_WARN,
			    "one edge one path from lock1 %p lock2 %p",
			    (void *)lock1, (void *)lock2);
		}
	} else if (no_path(lock1, lock2)) {
		cmn_err(CE_PANIC,
		    "No path from lock1 %p to lock2 %p",
		    (void *)lock1, (void *)lock2);
	}
}