usr/src/lib/libc/port/threads/synch.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21
  22 /*
  23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  24  * Use is subject to license terms.
  25  * Copyright 2015, Joyent, Inc.
  26  * Copyright (c) 2016 by Delphix. All rights reserved.
  27  */
  28
  29 #include "lint.h"
  30 #include "thr_uberdata.h"
  31 #include <sys/rtpriocntl.h>
  32 #include <sys/sdt.h>
  33 #include <atomic.h>
  34
  35 #if defined(THREAD_DEBUG)
  36 #define INCR32(x)       (((x) != UINT32_MAX)? (x)++ : 0)
  37 #define INCR(x)         ((x)++)
  38 #define DECR(x)         ((x)--)
  39 #define MAXINCR(m, x)   ((m < ++x)? (m = x) : 0)
  40 #else
  41 #define INCR32(x)
  42 #define INCR(x)
  43 #define DECR(x)
  44 #define MAXINCR(m, x)
  45 #endif
  46
  47 /*
  48  * This mutex is initialized to be held by lwp#1.
  49  * It is used to block a thread that has returned from a mutex_lock()
  50  * of a LOCK_PRIO_INHERIT mutex with an unrecoverable error.
  51  */
  52 mutex_t stall_mutex = DEFAULTMUTEX;
  53
  54 static int shared_mutex_held(mutex_t *);
  55 static int mutex_queuelock_adaptive(mutex_t *);
  56 static void mutex_wakeup_all(mutex_t *);
  57
  58 /*
  59  * Lock statistics support functions.
  60  */
  61 void
  62 record_begin_hold(tdb_mutex_stats_t *msp)
  63 {
  64         tdb_incr(msp->mutex_lock);
  65         msp->mutex_begin_hold = gethrtime();
  66 }
  67
  68 hrtime_t
  69 record_hold_time(tdb_mutex_stats_t *msp)
  70 {
  71         hrtime_t now = gethrtime();
  72
  73         if (msp->mutex_begin_hold)
  74                 msp->mutex_hold_time += now - msp->mutex_begin_hold;
  75         msp->mutex_begin_hold = 0;
  76         return (now);
  77 }
  78
  79 /*
  80  * Called once at library initialization.
  81  */
  82 void
  83 mutex_setup(void)
  84 {
  85         if (set_lock_byte(&stall_mutex.mutex_lockw))
  86                 thr_panic("mutex_setup() cannot acquire stall_mutex");
  87         stall_mutex.mutex_owner = (uintptr_t)curthread;
  88 }
  89
  90 /*
  91  * The default spin count of 1000 is experimentally determined.
  92  * On sun4u machines with any number of processors it could be raised
  93  * to 10,000 but that (experimentally) makes almost no difference.
  94  * The environment variable:
  95  *      _THREAD_ADAPTIVE_SPIN=count
  96  * can be used to override and set the count in the range [0 .. 1,000,000].
  97  */
  98 int     thread_adaptive_spin = 1000;
  99 uint_t  thread_max_spinners = 100;
 100 int     thread_queue_verify = 0;
 101 static  int     ncpus;
 102
 103 /*
 104  * Distinguish spinning for queue locks from spinning for regular locks.
 105  * We try harder to acquire queue locks by spinning.
 106  * The environment variable:
 107  *      _THREAD_QUEUE_SPIN=count
 108  * can be used to override and set the count in the range [0 .. 1,000,000].
 109  */
 110 int     thread_queue_spin = 10000;
 111
 112 #define ALL_ATTRIBUTES                          \
 113         (LOCK_RECURSIVE | LOCK_ERRORCHECK |     \
 114         LOCK_PRIO_INHERIT | LOCK_PRIO_PROTECT | \
 115         LOCK_ROBUST)
 116
 117 /*
 118  * 'type' can be one of USYNC_THREAD, USYNC_PROCESS, or USYNC_PROCESS_ROBUST,
 119  * augmented by zero or more the flags:
 120  *      LOCK_RECURSIVE
 121  *      LOCK_ERRORCHECK
 122  *      LOCK_PRIO_INHERIT
 123  *      LOCK_PRIO_PROTECT
 124  *      LOCK_ROBUST
 125  */
 126 #pragma weak _mutex_init = mutex_init
 127 /* ARGSUSED2 */
 128 int
 129 mutex_init(mutex_t *mp, int type, void *arg)
 130 {
 131         int basetype = (type & ~ALL_ATTRIBUTES);
 132         const pcclass_t *pccp;
 133         int error = 0;
 134         int ceil;
 135
 136         if (basetype == USYNC_PROCESS_ROBUST) {
 137                 /*
 138                  * USYNC_PROCESS_ROBUST is a deprecated historical type.
 139                  * We change it into (USYNC_PROCESS | LOCK_ROBUST) but
 140                  * retain the USYNC_PROCESS_ROBUST flag so we can return
 141                  * ELOCKUNMAPPED when necessary (only USYNC_PROCESS_ROBUST
 142                  * mutexes will ever draw ELOCKUNMAPPED).
 143                  */
 144                 type |= (USYNC_PROCESS | LOCK_ROBUST);
 145                 basetype = USYNC_PROCESS;
 146         }
 147
 148         if (type & LOCK_PRIO_PROTECT)
 149                 pccp = get_info_by_policy(SCHED_FIFO);
 150         if ((basetype != USYNC_THREAD && basetype != USYNC_PROCESS) ||
 151             (type & (LOCK_PRIO_INHERIT | LOCK_PRIO_PROTECT))
 152             == (LOCK_PRIO_INHERIT | LOCK_PRIO_PROTECT) ||
 153             ((type & LOCK_PRIO_PROTECT) &&
 154             ((ceil = *(int *)arg) < pccp->pcc_primin ||
 155             ceil > pccp->pcc_primax))) {
 156                 error = EINVAL;
 157         } else if (type & LOCK_ROBUST) {
 158                 /*
 159                  * Callers of mutex_init() with the LOCK_ROBUST attribute
 160                  * are required to pass an initially all-zero mutex.
 161                  * Multiple calls to mutex_init() are allowed; all but
 162                  * the first return EBUSY.  A call to mutex_init() is
 163                  * allowed to make an inconsistent robust lock consistent
 164                  * (for historical usage, even though the proper interface
 165                  * for this is mutex_consistent()).  Note that we use
 166                  * atomic_or_16() to set the LOCK_INITED flag so as
 167                  * not to disturb surrounding bits (LOCK_OWNERDEAD, etc).
 168                  */
 169                 if (!(mp->mutex_flag & LOCK_INITED)) {
 170                         mp->mutex_type = (uint8_t)type;
 171                         atomic_or_16(&mp->mutex_flag, LOCK_INITED);
 172                         mp->mutex_magic = MUTEX_MAGIC;
 173                 } else if (type != mp->mutex_type ||
 174                     ((type & LOCK_PRIO_PROTECT) && mp->mutex_ceiling != ceil)) {
 175                         error = EINVAL;
 176                 } else if (mutex_consistent(mp) != 0) {
 177                         error = EBUSY;
 178                 }
 179                 /* register a process robust mutex with the kernel */
 180                 if (basetype == USYNC_PROCESS)
 181                         register_lock(mp);
 182         } else {
 183                 (void) memset(mp, 0, sizeof (*mp));
 184                 mp->mutex_type = (uint8_t)type;
 185                 mp->mutex_flag = LOCK_INITED;
 186                 mp->mutex_magic = MUTEX_MAGIC;
 187         }
 188
 189         if (error == 0 && (type & LOCK_PRIO_PROTECT)) {
 190                 mp->mutex_ceiling = ceil;
 191         }
 192
 193         /*
 194          * This should be at the beginning of the function,
 195          * but for the sake of old broken applications that
 196          * do not have proper alignment for their mutexes
 197          * (and don't check the return code from mutex_init),
 198          * we put it here, after initializing the mutex regardless.
 199          */
 200         if (error == 0 &&
 201             ((uintptr_t)mp & (_LONG_LONG_ALIGNMENT - 1)) &&
 202             curthread->ul_misaligned == 0)
 203                 error = EINVAL;
 204
 205         return (error);
 206 }
 207
 208 /*
 209  * Delete mp from list of ceiling mutexes owned by curthread.
 210  * Return 1 if the head of the chain was updated.
 211  */
 212 int
 213 _ceil_mylist_del(mutex_t *mp)
 214 {
 215         ulwp_t *self = curthread;
 216         mxchain_t **mcpp;
 217         mxchain_t *mcp;
 218
 219         for (mcpp = &self->ul_mxchain;
 220             (mcp = *mcpp) != NULL;
 221             mcpp = &mcp->mxchain_next) {
 222                 if (mcp->mxchain_mx == mp) {
 223                         *mcpp = mcp->mxchain_next;
 224                         lfree(mcp, sizeof (*mcp));
 225                         return (mcpp == &self->ul_mxchain);
 226                 }
 227         }
 228         return (0);
 229 }
 230
 231 /*
 232  * Add mp to the list of ceiling mutexes owned by curthread.
 233  * Return ENOMEM if no memory could be allocated.
 234  */
 235 int
 236 _ceil_mylist_add(mutex_t *mp)
 237 {
 238         ulwp_t *self = curthread;
 239         mxchain_t *mcp;
 240
 241         if ((mcp = lmalloc(sizeof (*mcp))) == NULL)
 242                 return (ENOMEM);
 243         mcp->mxchain_mx = mp;
 244         mcp->mxchain_next = self->ul_mxchain;
 245         self->ul_mxchain = mcp;
 246         return (0);
 247 }
 248
 249 /*
 250  * Helper function for _ceil_prio_inherit() and _ceil_prio_waive(), below.
 251  */
 252 static void
 253 set_rt_priority(ulwp_t *self, int prio)
 254 {
 255         pcparms_t pcparm;
 256
 257         pcparm.pc_cid = self->ul_rtclassid;
 258         ((rtparms_t *)pcparm.pc_clparms)->rt_tqnsecs = RT_NOCHANGE;
 259         ((rtparms_t *)pcparm.pc_clparms)->rt_pri = prio;
 260         (void) priocntl(P_LWPID, self->ul_lwpid, PC_SETPARMS, &pcparm);
 261 }
 262
 263 /*
 264  * Inherit priority from ceiling.
 265  * This changes the effective priority, not the assigned priority.
 266  */
 267 void
 268 _ceil_prio_inherit(int prio)
 269 {
 270         ulwp_t *self = curthread;
 271
 272         self->ul_epri = prio;
 273         set_rt_priority(self, prio);
 274 }
 275
 276 /*
 277  * Waive inherited ceiling priority.  Inherit from head of owned ceiling locks
 278  * if holding at least one ceiling lock.  If no ceiling locks are held at this
 279  * point, disinherit completely, reverting back to assigned priority.
 280  */
 281 void
 282 _ceil_prio_waive(void)
 283 {
 284         ulwp_t *self = curthread;
 285         mxchain_t *mcp = self->ul_mxchain;
 286         int prio;
 287
 288         if (mcp == NULL) {
 289                 prio = self->ul_pri;
 290                 self->ul_epri = 0;
 291         } else {
 292                 prio = mcp->mxchain_mx->mutex_ceiling;
 293                 self->ul_epri = prio;
 294         }
 295         set_rt_priority(self, prio);
 296 }
 297
 298 /*
 299  * Clear the lock byte.  Retain the waiters byte and the spinners byte.
 300  * Return the old value of the lock word.
 301  */
 302 static uint32_t
 303 clear_lockbyte(volatile uint32_t *lockword)
 304 {
 305         uint32_t old;
 306         uint32_t new;
 307
 308         do {
 309                 old = *lockword;
 310                 new = old & ~LOCKMASK;
 311         } while (atomic_cas_32(lockword, old, new) != old);
 312
 313         return (old);
 314 }
 315
 316 /*
 317  * Same as clear_lockbyte(), but operates on mutex_lockword64.
 318  * The mutex_ownerpid field is cleared along with the lock byte.
 319  */
 320 static uint64_t
 321 clear_lockbyte64(volatile uint64_t *lockword64)
 322 {
 323         uint64_t old;
 324         uint64_t new;
 325
 326         do {
 327                 old = *lockword64;
 328                 new = old & ~LOCKMASK64;
 329         } while (atomic_cas_64(lockword64, old, new) != old);
 330
 331         return (old);
 332 }
 333
 334 /*
 335  * Similar to set_lock_byte(), which only tries to set the lock byte.
 336  * Here, we attempt to set the lock byte AND the mutex_ownerpid, keeping
 337  * the remaining bytes constant.  This atomic operation is required for the
 338  * correctness of process-shared robust locks, otherwise there would be
 339  * a window or vulnerability in which the lock byte had been set but the
 340  * mutex_ownerpid had not yet been set.  If the process were to die in
 341  * this window of vulnerability (due to some other thread calling exit()
 342  * or the process receiving a fatal signal), the mutex would be left locked
 343  * but without a process-ID to determine which process was holding the lock.
 344  * The kernel would then be unable to mark the robust mutex as LOCK_OWNERDEAD
 345  * when the process died.  For all other cases of process-shared locks, this
 346  * operation is just a convenience, for the sake of common code.
 347  *
 348  * This operation requires process-shared robust locks to be properly
 349  * aligned on an 8-byte boundary, at least on sparc machines, lest the
 350  * operation incur an alignment fault.  This is automatic when locks
 351  * are declared properly using the mutex_t or pthread_mutex_t data types
 352  * and the application does not allocate dynamic memory on less than an
 353  * 8-byte boundary.  See the 'horrible hack' comments below for cases
 354  * dealing with such broken applications.
 355  */
 356 static int
 357 set_lock_byte64(volatile uint64_t *lockword64, pid_t ownerpid)
 358 {
 359         uint64_t old;
 360         uint64_t new;
 361
 362         old = *lockword64 & ~LOCKMASK64;
 363         new = old | ((uint64_t)(uint_t)ownerpid << PIDSHIFT) | LOCKBYTE64;
 364         if (atomic_cas_64(lockword64, old, new) == old)
 365                 return (LOCKCLEAR);
 366
 367         return (LOCKSET);
 368 }
 369
 370 /*
 371  * Increment the spinners count in the mutex lock word.
 372  * Return 0 on success.  Return -1 if the count would overflow.
 373  */
 374 static int
 375 spinners_incr(volatile uint32_t *lockword, uint8_t max_spinners)
 376 {
 377         uint32_t old;
 378         uint32_t new;
 379
 380         do {
 381                 old = *lockword;
 382                 if (((old & SPINNERMASK) >> SPINNERSHIFT) >= max_spinners)
 383                         return (-1);
 384                 new = old + (1 << SPINNERSHIFT);
 385         } while (atomic_cas_32(lockword, old, new) != old);
 386
 387         return (0);
 388 }
 389
 390 /*
 391  * Decrement the spinners count in the mutex lock word.
 392  * Return the new value of the lock word.
 393  */
 394 static uint32_t
 395 spinners_decr(volatile uint32_t *lockword)
 396 {
 397         uint32_t old;
 398         uint32_t new;
 399
 400         do {
 401                 new = old = *lockword;
 402                 if (new & SPINNERMASK)
 403                         new -= (1 << SPINNERSHIFT);
 404         } while (atomic_cas_32(lockword, old, new) != old);
 405
 406         return (new);
 407 }
 408
 409 /*
 410  * Non-preemptive spin locks.  Used by queue_lock().
 411  * No lock statistics are gathered for these locks.
 412  * No DTrace probes are provided for these locks.
 413  */
 414 void
 415 spin_lock_set(mutex_t *mp)
 416 {
 417         ulwp_t *self = curthread;
 418
 419         no_preempt(self);
 420         if (set_lock_byte(&mp->mutex_lockw) == 0) {
 421                 mp->mutex_owner = (uintptr_t)self;
 422                 return;
 423         }
 424         /*
 425          * Spin for a while, attempting to acquire the lock.
 426          */
 427         INCR32(self->ul_spin_lock_spin);
 428         if (mutex_queuelock_adaptive(mp) == 0 ||
 429             set_lock_byte(&mp->mutex_lockw) == 0) {
 430                 mp->mutex_owner = (uintptr_t)self;
 431                 return;
 432         }
 433         /*
 434          * Try harder if we were previously at a no premption level.
 435          */
 436         if (self->ul_preempt > 1) {
 437                 INCR32(self->ul_spin_lock_spin2);
 438                 if (mutex_queuelock_adaptive(mp) == 0 ||
 439                     set_lock_byte(&mp->mutex_lockw) == 0) {
 440                         mp->mutex_owner = (uintptr_t)self;
 441                         return;
 442                 }
 443         }
 444         /*
 445          * Give up and block in the kernel for the mutex.
 446          */
 447         INCR32(self->ul_spin_lock_sleep);
 448         (void) ___lwp_mutex_timedlock(mp, NULL, self);
 449 }
 450
 451 void
 452 spin_lock_clear(mutex_t *mp)
 453 {
 454         ulwp_t *self = curthread;
 455
 456         mp->mutex_owner = 0;
 457         if (atomic_swap_32(&mp->mutex_lockword, 0) & WAITERMASK) {
 458                 (void) ___lwp_mutex_wakeup(mp, 0);
 459                 INCR32(self->ul_spin_lock_wakeup);
 460         }
 461         preempt(self);
 462 }
 463
 464 /*
 465  * Allocate the sleep queue hash table.
 466  */
 467 void
 468 queue_alloc(void)
 469 {
 470         ulwp_t *self = curthread;
 471         uberdata_t *udp = self->ul_uberdata;
 472         queue_head_t *qp;
 473         void *data;
 474         int i;
 475
 476         /*
 477          * No locks are needed; we call here only when single-threaded.
 478          */
 479         ASSERT(self == udp->ulwp_one);
 480         ASSERT(!udp->uberflags.uf_mt);
 481         if ((data = mmap(NULL, 2 * QHASHSIZE * sizeof (queue_head_t),
 482             PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANON, -1, (off_t)0))
 483             == MAP_FAILED)
 484                 thr_panic("cannot allocate thread queue_head table");
 485         udp->queue_head = qp = (queue_head_t *)data;
 486         for (i = 0; i < 2 * QHASHSIZE; qp++, i++) {
 487                 qp->qh_type = (i < QHASHSIZE)? MX : CV;
 488                 qp->qh_lock.mutex_flag = LOCK_INITED;
 489                 qp->qh_lock.mutex_magic = MUTEX_MAGIC;
 490                 qp->qh_hlist = &qp->qh_def_root;
 491 #if defined(THREAD_DEBUG)
 492                 qp->qh_hlen = 1;
 493                 qp->qh_hmax = 1;
 494 #endif
 495         }
 496 }
 497
 498 #if defined(THREAD_DEBUG)
 499
 500 /*
 501  * Debugging: verify correctness of a sleep queue.
 502  */
 503 void
 504 QVERIFY(queue_head_t *qp)
 505 {
 506         ulwp_t *self = curthread;
 507         uberdata_t *udp = self->ul_uberdata;
 508         queue_root_t *qrp;
 509         ulwp_t *ulwp;
 510         ulwp_t *prev;
 511         uint_t index;
 512         uint32_t cnt;
 513         char qtype;
 514         void *wchan;
 515
 516         ASSERT(qp >= udp->queue_head && (qp - udp->queue_head) < 2 * QHASHSIZE);
 517         ASSERT(MUTEX_OWNED(&qp->qh_lock, self));
 518         for (cnt = 0, qrp = qp->qh_hlist; qrp != NULL; qrp = qrp->qr_next) {
 519                 cnt++;
 520                 ASSERT((qrp->qr_head != NULL && qrp->qr_tail != NULL) ||
 521                     (qrp->qr_head == NULL && qrp->qr_tail == NULL));
 522         }
 523         ASSERT(qp->qh_hlen == cnt && qp->qh_hmax >= cnt);
 524         qtype = ((qp - udp->queue_head) < QHASHSIZE)? MX : CV;
 525         ASSERT(qp->qh_type == qtype);
 526         if (!thread_queue_verify)
 527                 return;
 528         /* real expensive stuff, only for _THREAD_QUEUE_VERIFY */
 529         for (cnt = 0, qrp = qp->qh_hlist; qrp != NULL; qrp = qrp->qr_next) {
 530                 for (prev = NULL, ulwp = qrp->qr_head; ulwp != NULL;
 531                     prev = ulwp, ulwp = ulwp->ul_link) {
 532                         cnt++;
 533                         if (ulwp->ul_writer)
 534                                 ASSERT(prev == NULL || prev->ul_writer);
 535                         ASSERT(ulwp->ul_qtype == qtype);
 536                         ASSERT(ulwp->ul_wchan != NULL);
 537                         ASSERT(ulwp->ul_sleepq == qp);
 538                         wchan = ulwp->ul_wchan;
 539                         ASSERT(qrp->qr_wchan == wchan);
 540                         index = QUEUE_HASH(wchan, qtype);
 541                         ASSERT(&udp->queue_head[index] == qp);
 542                 }
 543                 ASSERT(qrp->qr_tail == prev);
 544         }
 545         ASSERT(qp->qh_qlen == cnt);
 546 }
 547
 548 #else   /* THREAD_DEBUG */
 549
 550 #define QVERIFY(qp)
 551
 552 #endif  /* THREAD_DEBUG */
 553
 554 /*
 555  * Acquire a queue head.
 556  */
 557 queue_head_t *
 558 queue_lock(void *wchan, int qtype)
 559 {
 560         uberdata_t *udp = curthread->ul_uberdata;
 561         queue_head_t *qp;
 562         queue_root_t *qrp;
 563
 564         ASSERT(qtype == MX || qtype == CV);
 565
 566         /*
 567          * It is possible that we could be called while still single-threaded.
 568          * If so, we call queue_alloc() to allocate the queue_head[] array.
 569          */
 570         if ((qp = udp->queue_head) == NULL) {
 571                 queue_alloc();
 572                 qp = udp->queue_head;
 573         }
 574         qp += QUEUE_HASH(wchan, qtype);
 575         spin_lock_set(&qp->qh_lock);
 576         for (qrp = qp->qh_hlist; qrp != NULL; qrp = qrp->qr_next)
 577                 if (qrp->qr_wchan == wchan)
 578                         break;
 579         if (qrp == NULL && qp->qh_def_root.qr_head == NULL) {
 580                 /* the default queue root is available; use it */
 581                 qrp = &qp->qh_def_root;
 582                 qrp->qr_wchan = wchan;
 583                 ASSERT(qrp->qr_next == NULL);
 584                 ASSERT(qrp->qr_tail == NULL &&
 585                     qrp->qr_rtcount == 0 && qrp->qr_qlen == 0);
 586         }
 587         qp->qh_wchan = wchan;   /* valid until queue_unlock() is called */
 588         qp->qh_root = qrp;      /* valid until queue_unlock() is called */
 589         INCR32(qp->qh_lockcount);
 590         QVERIFY(qp);
 591         return (qp);
 592 }
 593
 594 /*
 595  * Release a queue head.
 596  */
 597 void
 598 queue_unlock(queue_head_t *qp)
 599 {
 600         QVERIFY(qp);
 601         spin_lock_clear(&qp->qh_lock);
 602 }
 603
 604 /*
 605  * For rwlock queueing, we must queue writers ahead of readers of the
 606  * same priority.  We do this by making writers appear to have a half
 607  * point higher priority for purposes of priority comparisons below.
 608  */
 609 #define CMP_PRIO(ulwp)  ((real_priority(ulwp) << 1) + (ulwp)->ul_writer)
 610
 611 void
 612 enqueue(queue_head_t *qp, ulwp_t *ulwp, int force_fifo)
 613 {
 614         queue_root_t *qrp;
 615         ulwp_t **ulwpp;
 616         ulwp_t *next;
 617         int pri = CMP_PRIO(ulwp);
 618
 619         ASSERT(MUTEX_OWNED(&qp->qh_lock, curthread));
 620         ASSERT(ulwp->ul_sleepq != qp);
 621
 622         if ((qrp = qp->qh_root) == NULL) {
 623                 /* use the thread's queue root for the linkage */
 624                 qrp = &ulwp->ul_queue_root;
 625                 qrp->qr_next = qp->qh_hlist;
 626                 qrp->qr_prev = NULL;
 627                 qrp->qr_head = NULL;
 628                 qrp->qr_tail = NULL;
 629                 qrp->qr_wchan = qp->qh_wchan;
 630                 qrp->qr_rtcount = 0;
 631                 qrp->qr_qlen = 0;
 632                 qrp->qr_qmax = 0;
 633                 qp->qh_hlist->qr_prev = qrp;
 634                 qp->qh_hlist = qrp;
 635                 qp->qh_root = qrp;
 636                 MAXINCR(qp->qh_hmax, qp->qh_hlen);
 637         }
 638
 639         /*
 640          * LIFO queue ordering is unfair and can lead to starvation,
 641          * but it gives better performance for heavily contended locks.
 642          * We use thread_queue_fifo (range is 0..8) to determine
 643          * the frequency of FIFO vs LIFO queuing:
 644          *      0 : every 256th time    (almost always LIFO)
 645          *      1 : every 128th time
 646          *      2 : every 64th  time
 647          *      3 : every 32nd  time
 648          *      4 : every 16th  time    (the default value, mostly LIFO)
 649          *      5 : every 8th   time
 650          *      6 : every 4th   time
 651          *      7 : every 2nd   time
 652          *      8 : every time          (never LIFO, always FIFO)
 653          * Note that there is always some degree of FIFO ordering.
 654          * This breaks live lock conditions that occur in applications
 655          * that are written assuming (incorrectly) that threads acquire
 656          * locks fairly, that is, in roughly round-robin order.
 657          * In any event, the queue is maintained in kernel priority order.
 658          *
 659          * If force_fifo is non-zero, fifo queueing is forced.
 660          * SUSV3 requires this for semaphores.
 661          */
 662         if (qrp->qr_head == NULL) {
 663                 /*
 664                  * The queue is empty.  LIFO/FIFO doesn't matter.
 665                  */
 666                 ASSERT(qrp->qr_tail == NULL);
 667                 ulwpp = &qrp->qr_head;
 668         } else if (force_fifo |
 669             (((++qp->qh_qcnt << curthread->ul_queue_fifo) & 0xff) == 0)) {
 670                 /*
 671                  * Enqueue after the last thread whose priority is greater
 672                  * than or equal to the priority of the thread being queued.
 673                  * Attempt first to go directly onto the tail of the queue.
 674                  */
 675                 if (pri <= CMP_PRIO(qrp->qr_tail))
 676                         ulwpp = &qrp->qr_tail->ul_link;
 677                 else {
 678                         for (ulwpp = &qrp->qr_head; (next = *ulwpp) != NULL;
 679                             ulwpp = &next->ul_link)
 680                                 if (pri > CMP_PRIO(next))
 681                                         break;
 682                 }
 683         } else {
 684                 /*
 685                  * Enqueue before the first thread whose priority is less
 686                  * than or equal to the priority of the thread being queued.
 687                  * Hopefully we can go directly onto the head of the queue.
 688                  */
 689                 for (ulwpp = &qrp->qr_head; (next = *ulwpp) != NULL;
 690                     ulwpp = &next->ul_link)
 691                         if (pri >= CMP_PRIO(next))
 692                                 break;
 693         }
 694         if ((ulwp->ul_link = *ulwpp) == NULL)
 695                 qrp->qr_tail = ulwp;
 696         *ulwpp = ulwp;
 697
 698         ulwp->ul_sleepq = qp;
 699         ulwp->ul_wchan = qp->qh_wchan;
 700         ulwp->ul_qtype = qp->qh_type;
 701         if ((ulwp->ul_schedctl != NULL &&
 702             ulwp->ul_schedctl->sc_cid == ulwp->ul_rtclassid) |
 703             ulwp->ul_pilocks) {
 704                 ulwp->ul_rtqueued = 1;
 705                 qrp->qr_rtcount++;
 706         }
 707         MAXINCR(qrp->qr_qmax, qrp->qr_qlen);
 708         MAXINCR(qp->qh_qmax, qp->qh_qlen);
 709 }
 710
 711 /*
 712  * Helper function for queue_slot() and queue_slot_rt().
 713  * Try to find a non-suspended thread on the queue.
 714  */
 715 static ulwp_t **
 716 queue_slot_runnable(ulwp_t **ulwpp, ulwp_t **prevp, int rt)
 717 {
 718         ulwp_t *ulwp;
 719         ulwp_t **foundpp = NULL;
 720         int priority = -1;
 721         ulwp_t *prev;
 722         int tpri;
 723
 724         for (prev = NULL;
 725             (ulwp = *ulwpp) != NULL;
 726             prev = ulwp, ulwpp = &ulwp->ul_link) {
 727                 if (ulwp->ul_stop)      /* skip suspended threads */
 728                         continue;
 729                 tpri = rt? CMP_PRIO(ulwp) : 0;
 730                 if (tpri > priority) {
 731                         foundpp = ulwpp;
 732                         *prevp = prev;
 733                         priority = tpri;
 734                         if (!rt)
 735                                 break;
 736                 }
 737         }
 738         return (foundpp);
 739 }
 740
 741 /*
 742  * For real-time, we search the entire queue because the dispatch
 743  * (kernel) priorities may have changed since enqueueing.
 744  */
 745 static ulwp_t **
 746 queue_slot_rt(ulwp_t **ulwpp_org, ulwp_t **prevp)
 747 {
 748         ulwp_t **ulwpp = ulwpp_org;
 749         ulwp_t *ulwp = *ulwpp;
 750         ulwp_t **foundpp = ulwpp;
 751         int priority = CMP_PRIO(ulwp);
 752         ulwp_t *prev;
 753         int tpri;
 754
 755         for (prev = ulwp, ulwpp = &ulwp->ul_link;
 756             (ulwp = *ulwpp) != NULL;
 757             prev = ulwp, ulwpp = &ulwp->ul_link) {
 758                 tpri = CMP_PRIO(ulwp);
 759                 if (tpri > priority) {
 760                         foundpp = ulwpp;
 761                         *prevp = prev;
 762                         priority = tpri;
 763                 }
 764         }
 765         ulwp = *foundpp;
 766
 767         /*
 768          * Try not to return a suspended thread.
 769          * This mimics the old libthread's behavior.
 770          */
 771         if (ulwp->ul_stop &&
 772             (ulwpp = queue_slot_runnable(ulwpp_org, prevp, 1)) != NULL) {
 773                 foundpp = ulwpp;
 774                 ulwp = *foundpp;
 775         }
 776         ulwp->ul_rt = 1;
 777         return (foundpp);
 778 }
 779
 780 ulwp_t **
 781 queue_slot(queue_head_t *qp, ulwp_t **prevp, int *more)
 782 {
 783         queue_root_t *qrp;
 784         ulwp_t **ulwpp;
 785         ulwp_t *ulwp;
 786         int rt;
 787
 788         ASSERT(MUTEX_OWNED(&qp->qh_lock, curthread));
 789
 790         if ((qrp = qp->qh_root) == NULL || (ulwp = qrp->qr_head) == NULL) {
 791                 *more = 0;
 792                 return (NULL);          /* no lwps on the queue */
 793         }
 794         rt = (qrp->qr_rtcount != 0);
 795         *prevp = NULL;
 796         if (ulwp->ul_link == NULL) {    /* only one lwp on the queue */
 797                 *more = 0;
 798                 ulwp->ul_rt = rt;
 799                 return (&qrp->qr_head);
 800         }
 801         *more = 1;
 802
 803         if (rt)         /* real-time queue */
 804                 return (queue_slot_rt(&qrp->qr_head, prevp));
 805         /*
 806          * Try not to return a suspended thread.
 807          * This mimics the old libthread's behavior.
 808          */
 809         if (ulwp->ul_stop &&
 810             (ulwpp = queue_slot_runnable(&qrp->qr_head, prevp, 0)) != NULL) {
 811                 ulwp = *ulwpp;
 812                 ulwp->ul_rt = 0;
 813                 return (ulwpp);
 814         }
 815         /*
 816          * The common case; just pick the first thread on the queue.
 817          */
 818         ulwp->ul_rt = 0;
 819         return (&qrp->qr_head);
 820 }
 821
 822 /*
 823  * Common code for unlinking an lwp from a user-level sleep queue.
 824  */
 825 void
 826 queue_unlink(queue_head_t *qp, ulwp_t **ulwpp, ulwp_t *prev)
 827 {
 828         queue_root_t *qrp = qp->qh_root;
 829         queue_root_t *nqrp;
 830         ulwp_t *ulwp = *ulwpp;
 831         ulwp_t *next;
 832
 833         ASSERT(MUTEX_OWNED(&qp->qh_lock, curthread));
 834         ASSERT(qp->qh_wchan != NULL && ulwp->ul_wchan == qp->qh_wchan);
 835
 836         DECR(qp->qh_qlen);
 837         DECR(qrp->qr_qlen);
 838         if (ulwp->ul_rtqueued) {
 839                 ulwp->ul_rtqueued = 0;
 840                 qrp->qr_rtcount--;
 841         }
 842         next = ulwp->ul_link;
 843         *ulwpp = next;
 844         ulwp->ul_link = NULL;
 845         if (qrp->qr_tail == ulwp)
 846                 qrp->qr_tail = prev;
 847         if (qrp == &ulwp->ul_queue_root) {
 848                 /*
 849                  * We can't continue to use the unlinked thread's
 850                  * queue root for the linkage.
 851                  */
 852                 queue_root_t *qr_next = qrp->qr_next;
 853                 queue_root_t *qr_prev = qrp->qr_prev;
 854
 855                 if (qrp->qr_tail) {
 856                         /* switch to using the last thread's queue root */
 857                         ASSERT(qrp->qr_qlen != 0);
 858                         nqrp = &qrp->qr_tail->ul_queue_root;
 859                         *nqrp = *qrp;
 860                         if (qr_next)
 861                                 qr_next->qr_prev = nqrp;
 862                         if (qr_prev)
 863                                 qr_prev->qr_next = nqrp;
 864                         else
 865                                 qp->qh_hlist = nqrp;
 866                         qp->qh_root = nqrp;
 867                 } else {
 868                         /* empty queue root; just delete from the hash list */
 869                         ASSERT(qrp->qr_qlen == 0);
 870                         if (qr_next)
 871                                 qr_next->qr_prev = qr_prev;
 872                         if (qr_prev)
 873                                 qr_prev->qr_next = qr_next;
 874                         else
 875                                 qp->qh_hlist = qr_next;
 876                         qp->qh_root = NULL;
 877                         DECR(qp->qh_hlen);
 878                 }
 879         }
 880 }
 881
 882 ulwp_t *
 883 dequeue(queue_head_t *qp, int *more)
 884 {
 885         ulwp_t **ulwpp;
 886         ulwp_t *ulwp;
 887         ulwp_t *prev;
 888
 889         if ((ulwpp = queue_slot(qp, &prev, more)) == NULL)
 890                 return (NULL);
 891         ulwp = *ulwpp;
 892         queue_unlink(qp, ulwpp, prev);
 893         ulwp->ul_sleepq = NULL;
 894         ulwp->ul_wchan = NULL;
 895         return (ulwp);
 896 }
 897
 898 /*
 899  * Return a pointer to the highest priority thread sleeping on wchan.
 900  */
 901 ulwp_t *
 902 queue_waiter(queue_head_t *qp)
 903 {
 904         ulwp_t **ulwpp;
 905         ulwp_t *prev;
 906         int more;
 907
 908         if ((ulwpp = queue_slot(qp, &prev, &more)) == NULL)
 909                 return (NULL);
 910         return (*ulwpp);
 911 }
 912
 913 int
 914 dequeue_self(queue_head_t *qp)
 915 {
 916         ulwp_t *self = curthread;
 917         queue_root_t *qrp;
 918         ulwp_t **ulwpp;
 919         ulwp_t *ulwp;
 920         ulwp_t *prev;
 921         int found = 0;
 922
 923         ASSERT(MUTEX_OWNED(&qp->qh_lock, self));
 924
 925         /* find self on the sleep queue */
 926         if ((qrp = qp->qh_root) != NULL) {
 927                 for (prev = NULL, ulwpp = &qrp->qr_head;
 928                     (ulwp = *ulwpp) != NULL;
 929                     prev = ulwp, ulwpp = &ulwp->ul_link) {
 930                         if (ulwp == self) {
 931                                 queue_unlink(qp, ulwpp, prev);
 932                                 self->ul_cvmutex = NULL;
 933                                 self->ul_sleepq = NULL;
 934                                 self->ul_wchan = NULL;
 935                                 found = 1;
 936                                 break;
 937                         }
 938                 }
 939         }
 940
 941         if (!found)
 942                 thr_panic("dequeue_self(): curthread not found on queue");
 943
 944         return ((qrp = qp->qh_root) != NULL && qrp->qr_head != NULL);
 945 }
 946
 947 /*
 948  * Called from call_user_handler() and _thrp_suspend() to take
 949  * ourself off of our sleep queue so we can grab locks.
 950  */
 951 void
 952 unsleep_self(void)
 953 {
 954         ulwp_t *self = curthread;
 955         queue_head_t *qp;
 956
 957         /*
 958          * Calling enter_critical()/exit_critical() here would lead
 959          * to recursion.  Just manipulate self->ul_critical directly.
 960          */
 961         self->ul_critical++;
 962         while (self->ul_sleepq != NULL) {
 963                 qp = queue_lock(self->ul_wchan, self->ul_qtype);
 964                 /*
 965                  * We may have been moved from a CV queue to a
 966                  * mutex queue while we were attempting queue_lock().
 967                  * If so, just loop around and try again.
 968                  * dequeue_self() clears self->ul_sleepq.
 969                  */
 970                 if (qp == self->ul_sleepq)
 971                         (void) dequeue_self(qp);
 972                 queue_unlock(qp);
 973         }
 974         self->ul_writer = 0;
 975         self->ul_critical--;
 976 }
 977
 978 /*
 979  * Common code for calling the the ___lwp_mutex_timedlock() system call.
 980  * Returns with mutex_owner and mutex_ownerpid set correctly.
 981  */
 982 static int
 983 mutex_lock_kernel(mutex_t *mp, timespec_t *tsp, tdb_mutex_stats_t *msp)
 984 {
 985         ulwp_t *self = curthread;
 986         uberdata_t *udp = self->ul_uberdata;
 987         int mtype = mp->mutex_type;
 988         hrtime_t begin_sleep;
 989         int acquired;
 990         int error;
 991
 992         self->ul_sp = stkptr();
 993         self->ul_wchan = mp;
 994         if (__td_event_report(self, TD_SLEEP, udp)) {
 995                 self->ul_td_evbuf.eventnum = TD_SLEEP;
 996                 self->ul_td_evbuf.eventdata = mp;
 997                 tdb_event(TD_SLEEP, udp);
 998         }
 999         if (msp) {
1000                 tdb_incr(msp->mutex_sleep);
1001                 begin_sleep = gethrtime();
1002         }
1003
1004         DTRACE_PROBE1(plockstat, mutex__block, mp);
1005
1006         for (;;) {
1007                 /*
1008                  * A return value of EOWNERDEAD or ELOCKUNMAPPED
1009                  * means we successfully acquired the lock.
1010                  */
1011                 if ((error = ___lwp_mutex_timedlock(mp, tsp, self)) != 0 &&
1012                     error != EOWNERDEAD && error != ELOCKUNMAPPED) {
1013                         acquired = 0;
1014                         break;
1015                 }
1016
1017                 if (mtype & USYNC_PROCESS) {
1018                         /*
1019                          * Defend against forkall().  We may be the child,
1020                          * in which case we don't actually own the mutex.
1021                          */
1022                         enter_critical(self);
1023                         if (mp->mutex_ownerpid == udp->pid) {
1024                                 exit_critical(self);
1025                                 acquired = 1;
1026                                 break;
1027                         }
1028                         exit_critical(self);
1029                 } else {
1030                         acquired = 1;
1031                         break;
1032                 }
1033         }
1034
1035         if (msp)
1036                 msp->mutex_sleep_time += gethrtime() - begin_sleep;
1037         self->ul_wchan = NULL;
1038         self->ul_sp = 0;
1039
1040         if (acquired) {
1041                 ASSERT(mp->mutex_owner == (uintptr_t)self);
1042                 DTRACE_PROBE2(plockstat, mutex__blocked, mp, 1);
1043                 DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
1044         } else {
1045                 DTRACE_PROBE2(plockstat, mutex__blocked, mp, 0);
1046                 DTRACE_PROBE2(plockstat, mutex__error, mp, error);
1047         }
1048
1049         return (error);
1050 }
1051
1052 /*
1053  * Common code for calling the ___lwp_mutex_trylock() system call.
1054  * Returns with mutex_owner and mutex_ownerpid set correctly.
1055  */
1056 int
1057 mutex_trylock_kernel(mutex_t *mp)
1058 {
1059         ulwp_t *self = curthread;
1060         uberdata_t *udp = self->ul_uberdata;
1061         int mtype = mp->mutex_type;
1062         int error;
1063         int acquired;
1064
1065         for (;;) {
1066                 /*
1067                  * A return value of EOWNERDEAD or ELOCKUNMAPPED
1068                  * means we successfully acquired the lock.
1069                  */
1070                 if ((error = ___lwp_mutex_trylock(mp, self)) != 0 &&
1071                     error != EOWNERDEAD && error != ELOCKUNMAPPED) {
1072                         acquired = 0;
1073                         break;
1074                 }
1075
1076                 if (mtype & USYNC_PROCESS) {
1077                         /*
1078                          * Defend against forkall().  We may be the child,
1079                          * in which case we don't actually own the mutex.
1080                          */
1081                         enter_critical(self);
1082                         if (mp->mutex_ownerpid == udp->pid) {
1083                                 exit_critical(self);
1084                                 acquired = 1;
1085                                 break;
1086                         }
1087                         exit_critical(self);
1088                 } else {
1089                         acquired = 1;
1090                         break;
1091                 }
1092         }
1093
1094         if (acquired) {
1095                 ASSERT(mp->mutex_owner == (uintptr_t)self);
1096                 DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
1097         } else if (error != EBUSY) {
1098                 DTRACE_PROBE2(plockstat, mutex__error, mp, error);
1099         }
1100
1101         return (error);
1102 }
1103
1104 volatile sc_shared_t *
1105 setup_schedctl(void)
1106 {
1107         ulwp_t *self = curthread;
1108         volatile sc_shared_t *scp;
1109         sc_shared_t *tmp;
1110
1111         if ((scp = self->ul_schedctl) == NULL && /* no shared state yet */
1112             !self->ul_vfork &&                  /* not a child of vfork() */
1113             !self->ul_schedctl_called) {        /* haven't been called before */
1114                 enter_critical(self);
1115                 self->ul_schedctl_called = &self->ul_uberdata->uberflags;
1116                 if ((tmp = __schedctl()) != (sc_shared_t *)(-1))
1117                         self->ul_schedctl = scp = tmp;
1118                 exit_critical(self);
1119         }
1120         /*
1121          * Unless the call to setup_schedctl() is surrounded
1122          * by enter_critical()/exit_critical(), the address
1123          * we are returning could be invalid due to a forkall()
1124          * having occurred in another thread.
1125          */
1126         return (scp);
1127 }
1128
1129 /*
1130  * Interfaces from libsched, incorporated into libc.
1131  * libsched.so.1 is now a filter library onto libc.
1132  */
1133 #pragma weak schedctl_lookup = schedctl_init
1134 schedctl_t *
1135 schedctl_init(void)
1136 {
1137         volatile sc_shared_t *scp = setup_schedctl();
1138         return ((scp == NULL)? NULL : (schedctl_t *)&scp->sc_preemptctl);
1139 }
1140
1141 void
1142 schedctl_exit(void)
1143 {
1144 }
1145
1146 /*
1147  * Contract private interface for java.
1148  * Set up the schedctl data if it doesn't exist yet.
1149  * Return a pointer to the pointer to the schedctl data.
1150  */
1151 volatile sc_shared_t *volatile *
1152 _thr_schedctl(void)
1153 {
1154         ulwp_t *self = curthread;
1155         volatile sc_shared_t *volatile *ptr;
1156
1157         if (self->ul_vfork)
1158                 return (NULL);
1159         if (*(ptr = &self->ul_schedctl) == NULL)
1160                 (void) setup_schedctl();
1161         return (ptr);
1162 }
1163
1164 /*
1165  * Block signals and attempt to block preemption.
1166  * no_preempt()/preempt() must be used in pairs but can be nested.
1167  */
1168 void
1169 no_preempt(ulwp_t *self)
1170 {
1171         volatile sc_shared_t *scp;
1172
1173         if (self->ul_preempt++ == 0) {
1174                 enter_critical(self);
1175                 if ((scp = self->ul_schedctl) != NULL ||
1176                     (scp = setup_schedctl()) != NULL) {
1177                         /*
1178                          * Save the pre-existing preempt value.
1179                          */
1180                         self->ul_savpreempt = scp->sc_preemptctl.sc_nopreempt;
1181                         scp->sc_preemptctl.sc_nopreempt = 1;
1182                 }
1183         }
1184 }
1185
1186 /*
1187  * Undo the effects of no_preempt().
1188  */
1189 void
1190 preempt(ulwp_t *self)
1191 {
1192         volatile sc_shared_t *scp;
1193
1194         ASSERT(self->ul_preempt > 0);
1195         if (--self->ul_preempt == 0) {
1196                 if ((scp = self->ul_schedctl) != NULL) {
1197                         /*
1198                          * Restore the pre-existing preempt value.
1199                          */
1200                         scp->sc_preemptctl.sc_nopreempt = self->ul_savpreempt;
1201                         if (scp->sc_preemptctl.sc_yield &&
1202                             scp->sc_preemptctl.sc_nopreempt == 0) {
1203                                 yield();
1204                                 if (scp->sc_preemptctl.sc_yield) {
1205                                         /*
1206                                          * Shouldn't happen.  This is either
1207                                          * a race condition or the thread
1208                                          * just entered the real-time class.
1209                                          */
1210                                         yield();
1211                                         scp->sc_preemptctl.sc_yield = 0;
1212                                 }
1213                         }
1214                 }
1215                 exit_critical(self);
1216         }
1217 }
1218
1219 /*
1220  * If a call to preempt() would cause the current thread to yield or to
1221  * take deferred actions in exit_critical(), then unpark the specified
1222  * lwp so it can run while we delay.  Return the original lwpid if the
1223  * unpark was not performed, else return zero.  The tests are a repeat
1224  * of some of the tests in preempt(), above.  This is a statistical
1225  * optimization solely for cond_sleep_queue(), below.
1226  */
1227 static lwpid_t
1228 preempt_unpark(ulwp_t *self, lwpid_t lwpid)
1229 {
1230         volatile sc_shared_t *scp = self->ul_schedctl;
1231
1232         ASSERT(self->ul_preempt == 1 && self->ul_critical > 0);
1233         if ((scp != NULL && scp->sc_preemptctl.sc_yield) ||
1234             (self->ul_curplease && self->ul_critical == 1)) {
1235                 (void) __lwp_unpark(lwpid);
1236                 lwpid = 0;
1237         }
1238         return (lwpid);
1239 }
1240
1241 /*
1242  * Spin for a while (if 'tryhard' is true), trying to grab the lock.
1243  * If this fails, return EBUSY and let the caller deal with it.
1244  * If this succeeds, return 0 with mutex_owner set to curthread.
1245  */
1246 static int
1247 mutex_trylock_adaptive(mutex_t *mp, int tryhard)
1248 {
1249         ulwp_t *self = curthread;
1250         int error = EBUSY;
1251         ulwp_t *ulwp;
1252         volatile sc_shared_t *scp;
1253         volatile uint8_t *lockp = (volatile uint8_t *)&mp->mutex_lockw;
1254         volatile uint64_t *ownerp = (volatile uint64_t *)&mp->mutex_owner;
1255         uint32_t new_lockword;
1256         int count = 0;
1257         int max_count;
1258         uint8_t max_spinners;
1259
1260         ASSERT(!(mp->mutex_type & USYNC_PROCESS));
1261
1262         if (MUTEX_OWNED(mp, self))
1263                 return (EBUSY);
1264
1265         enter_critical(self);
1266
1267         /* short-cut, not definitive (see below) */
1268         if (mp->mutex_flag & LOCK_NOTRECOVERABLE) {
1269                 ASSERT(mp->mutex_type & LOCK_ROBUST);
1270                 error = ENOTRECOVERABLE;
1271                 goto done;
1272         }
1273
1274         /*
1275          * Make one attempt to acquire the lock before
1276          * incurring the overhead of the spin loop.
1277          */
1278         if (set_lock_byte(lockp) == 0) {
1279                 *ownerp = (uintptr_t)self;
1280                 error = 0;
1281                 goto done;
1282         }
1283         if (!tryhard)
1284                 goto done;
1285         if (ncpus == 0)
1286                 ncpus = (int)_sysconf(_SC_NPROCESSORS_ONLN);
1287         if ((max_spinners = self->ul_max_spinners) >= ncpus)
1288                 max_spinners = ncpus - 1;
1289         max_count = (max_spinners != 0)? self->ul_adaptive_spin : 0;
1290         if (max_count == 0)
1291                 goto done;
1292
1293         /*
1294          * This spin loop is unfair to lwps that have already dropped into
1295          * the kernel to sleep.  They will starve on a highly-contended mutex.
1296          * This is just too bad.  The adaptive spin algorithm is intended
1297          * to allow programs with highly-contended locks (that is, broken
1298          * programs) to execute with reasonable speed despite their contention.
1299          * Being fair would reduce the speed of such programs and well-written
1300          * programs will not suffer in any case.
1301          */
1302         if (spinners_incr(&mp->mutex_lockword, max_spinners) == -1)
1303                 goto done;
1304         DTRACE_PROBE1(plockstat, mutex__spin, mp);
1305         for (count = 1; ; count++) {
1306                 if (*lockp == 0 && set_lock_byte(lockp) == 0) {
1307                         *ownerp = (uintptr_t)self;
1308                         error = 0;
1309                         break;
1310                 }
1311                 if (count == max_count)
1312                         break;
1313                 SMT_PAUSE();
1314                 /*
1315                  * Stop spinning if the mutex owner is not running on
1316                  * a processor; it will not drop the lock any time soon
1317                  * and we would just be wasting time to keep spinning.
1318                  *
1319                  * Note that we are looking at another thread (ulwp_t)
1320                  * without ensuring that the other thread does not exit.
1321                  * The scheme relies on ulwp_t structures never being
1322                  * deallocated by the library (the library employs a free
1323                  * list of ulwp_t structs that are reused when new threads
1324                  * are created) and on schedctl shared memory never being
1325                  * deallocated once created via __schedctl().
1326                  *
1327                  * Thus, the worst that can happen when the spinning thread
1328                  * looks at the owner's schedctl data is that it is looking
1329                  * at some other thread's schedctl data.  This almost never
1330                  * happens and is benign when it does.
1331                  */
1332                 if ((ulwp = (ulwp_t *)(uintptr_t)*ownerp) != NULL &&
1333                     ((scp = ulwp->ul_schedctl) == NULL ||
1334                     scp->sc_state != SC_ONPROC))
1335                         break;
1336         }
1337         new_lockword = spinners_decr(&mp->mutex_lockword);
1338         if (error && (new_lockword & (LOCKMASK | SPINNERMASK)) == 0) {
1339                 /*
1340                  * We haven't yet acquired the lock, the lock
1341                  * is free, and there are no other spinners.
1342                  * Make one final attempt to acquire the lock.
1343                  *
1344                  * This isn't strictly necessary since mutex_lock_queue()
1345                  * (the next action this thread will take if it doesn't
1346                  * acquire the lock here) makes one attempt to acquire
1347                  * the lock before putting the thread to sleep.
1348                  *
1349                  * If the next action for this thread (on failure here)
1350                  * were not to call mutex_lock_queue(), this would be
1351                  * necessary for correctness, to avoid ending up with an
1352                  * unheld mutex with waiters but no one to wake them up.
1353                  */
1354                 if (set_lock_byte(lockp) == 0) {
1355                         *ownerp = (uintptr_t)self;
1356                         error = 0;
1357                 }
1358                 count++;
1359         }
1360
1361 done:
1362         if (error == 0 && (mp->mutex_flag & LOCK_NOTRECOVERABLE)) {
1363                 ASSERT(mp->mutex_type & LOCK_ROBUST);
1364                 /*
1365                  * We shouldn't own the mutex.
1366                  * Just clear the lock; everyone has already been waked up.
1367                  */
1368                 *ownerp = 0;
1369                 (void) clear_lockbyte(&mp->mutex_lockword);
1370                 error = ENOTRECOVERABLE;
1371         }
1372
1373         exit_critical(self);
1374
1375         if (error) {
1376                 if (count) {
1377                         DTRACE_PROBE3(plockstat, mutex__spun, mp, 0, count);
1378                 }
1379                 if (error != EBUSY) {
1380                         DTRACE_PROBE2(plockstat, mutex__error, mp, error);
1381                 }
1382         } else {
1383                 if (count) {
1384                         DTRACE_PROBE3(plockstat, mutex__spun, mp, 1, count);
1385                 }
1386                 DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, count);
1387                 if (mp->mutex_flag & LOCK_OWNERDEAD) {
1388                         ASSERT(mp->mutex_type & LOCK_ROBUST);
1389                         error = EOWNERDEAD;
1390                 }
1391         }
1392
1393         return (error);
1394 }
1395
1396 /*
1397  * Same as mutex_trylock_adaptive(), except specifically for queue locks.
1398  * The owner field is not set here; the caller (spin_lock_set()) sets it.
1399  */
1400 static int
1401 mutex_queuelock_adaptive(mutex_t *mp)
1402 {
1403         ulwp_t *ulwp;
1404         volatile sc_shared_t *scp;
1405         volatile uint8_t *lockp;
1406         volatile uint64_t *ownerp;
1407         int count = curthread->ul_queue_spin;
1408
1409         ASSERT(mp->mutex_type == USYNC_THREAD);
1410
1411         if (count == 0)
1412                 return (EBUSY);
1413
1414         lockp = (volatile uint8_t *)&mp->mutex_lockw;
1415         ownerp = (volatile uint64_t *)&mp->mutex_owner;
1416         while (--count >= 0) {
1417                 if (*lockp == 0 && set_lock_byte(lockp) == 0)
1418                         return (0);
1419                 SMT_PAUSE();
1420                 if ((ulwp = (ulwp_t *)(uintptr_t)*ownerp) != NULL &&
1421                     ((scp = ulwp->ul_schedctl) == NULL ||
1422                     scp->sc_state != SC_ONPROC))
1423                         break;
1424         }
1425
1426         return (EBUSY);
1427 }
1428
1429 /*
1430  * Like mutex_trylock_adaptive(), but for process-shared mutexes.
1431  * Spin for a while (if 'tryhard' is true), trying to grab the lock.
1432  * If this fails, return EBUSY and let the caller deal with it.
1433  * If this succeeds, return 0 with mutex_owner set to curthread
1434  * and mutex_ownerpid set to the current pid.
1435  */
1436 static int
1437 mutex_trylock_process(mutex_t *mp, int tryhard)
1438 {
1439         ulwp_t *self = curthread;
1440         uberdata_t *udp = self->ul_uberdata;
1441         int error = EBUSY;
1442         volatile uint64_t *lockp = (volatile uint64_t *)&mp->mutex_lockword64;
1443         uint32_t new_lockword;
1444         int count = 0;
1445         int max_count;
1446         uint8_t max_spinners;
1447
1448 #if defined(__sparc) && !defined(_LP64)
1449         /* horrible hack, necessary only on 32-bit sparc */
1450         int fix_alignment_problem =
1451             (((uintptr_t)mp & (_LONG_LONG_ALIGNMENT - 1)) &&
1452             self->ul_misaligned && !(mp->mutex_type & LOCK_ROBUST));
1453 #endif
1454
1455         ASSERT(mp->mutex_type & USYNC_PROCESS);
1456
1457         if (shared_mutex_held(mp))
1458                 return (EBUSY);
1459
1460         enter_critical(self);
1461
1462         /* short-cut, not definitive (see below) */
1463         if (mp->mutex_flag & LOCK_NOTRECOVERABLE) {
1464                 ASSERT(mp->mutex_type & LOCK_ROBUST);
1465                 error = ENOTRECOVERABLE;
1466                 goto done;
1467         }
1468
1469         /*
1470          * Make one attempt to acquire the lock before
1471          * incurring the overhead of the spin loop.
1472          */
1473 #if defined(__sparc) && !defined(_LP64)
1474         /* horrible hack, necessary only on 32-bit sparc */
1475         if (fix_alignment_problem) {
1476                 if (set_lock_byte(&mp->mutex_lockw) == 0) {
1477                         mp->mutex_ownerpid = udp->pid;
1478                         mp->mutex_owner = (uintptr_t)self;
1479                         error = 0;
1480                         goto done;
1481                 }
1482         } else
1483 #endif
1484         if (set_lock_byte64(lockp, udp->pid) == 0) {
1485                 mp->mutex_owner = (uintptr_t)self;
1486                 /* mp->mutex_ownerpid was set by set_lock_byte64() */
1487                 error = 0;
1488                 goto done;
1489         }
1490         if (!tryhard)
1491                 goto done;
1492         if (ncpus == 0)
1493                 ncpus = (int)_sysconf(_SC_NPROCESSORS_ONLN);
1494         if ((max_spinners = self->ul_max_spinners) >= ncpus)
1495                 max_spinners = ncpus - 1;
1496         max_count = (max_spinners != 0)? self->ul_adaptive_spin : 0;
1497         if (max_count == 0)
1498                 goto done;
1499
1500         /*
1501          * This is a process-shared mutex.
1502          * We cannot know if the owner is running on a processor.
1503          * We just spin and hope that it is on a processor.
1504          */
1505         if (spinners_incr(&mp->mutex_lockword, max_spinners) == -1)
1506                 goto done;
1507         DTRACE_PROBE1(plockstat, mutex__spin, mp);
1508         for (count = 1; ; count++) {
1509 #if defined(__sparc) && !defined(_LP64)
1510                 /* horrible hack, necessary only on 32-bit sparc */
1511                 if (fix_alignment_problem) {
1512                         if ((*lockp & LOCKMASK64) == 0 &&
1513                             set_lock_byte(&mp->mutex_lockw) == 0) {
1514                                 mp->mutex_ownerpid = udp->pid;
1515                                 mp->mutex_owner = (uintptr_t)self;
1516                                 error = 0;
1517                                 break;
1518                         }
1519                 } else
1520 #endif
1521                 if ((*lockp & LOCKMASK64) == 0 &&
1522                     set_lock_byte64(lockp, udp->pid) == 0) {
1523                         mp->mutex_owner = (uintptr_t)self;
1524                         /* mp->mutex_ownerpid was set by set_lock_byte64() */
1525                         error = 0;
1526                         break;
1527                 }
1528                 if (count == max_count)
1529                         break;
1530                 SMT_PAUSE();
1531         }
1532         new_lockword = spinners_decr(&mp->mutex_lockword);
1533         if (error && (new_lockword & (LOCKMASK | SPINNERMASK)) == 0) {
1534                 /*
1535                  * We haven't yet acquired the lock, the lock
1536                  * is free, and there are no other spinners.
1537                  * Make one final attempt to acquire the lock.
1538                  *
1539                  * This isn't strictly necessary since mutex_lock_kernel()
1540                  * (the next action this thread will take if it doesn't
1541                  * acquire the lock here) makes one attempt to acquire
1542                  * the lock before putting the thread to sleep.
1543                  *
1544                  * If the next action for this thread (on failure here)
1545                  * were not to call mutex_lock_kernel(), this would be
1546                  * necessary for correctness, to avoid ending up with an
1547                  * unheld mutex with waiters but no one to wake them up.
1548                  */
1549 #if defined(__sparc) && !defined(_LP64)
1550                 /* horrible hack, necessary only on 32-bit sparc */
1551                 if (fix_alignment_problem) {
1552                         if (set_lock_byte(&mp->mutex_lockw) == 0) {
1553                                 mp->mutex_ownerpid = udp->pid;
1554                                 mp->mutex_owner = (uintptr_t)self;
1555                                 error = 0;
1556                         }
1557                 } else
1558 #endif
1559                 if (set_lock_byte64(lockp, udp->pid) == 0) {
1560                         mp->mutex_owner = (uintptr_t)self;
1561                         /* mp->mutex_ownerpid was set by set_lock_byte64() */
1562                         error = 0;
1563                 }
1564                 count++;
1565         }
1566
1567 done:
1568         if (error == 0 && (mp->mutex_flag & LOCK_NOTRECOVERABLE)) {
1569                 ASSERT(mp->mutex_type & LOCK_ROBUST);
1570                 /*
1571                  * We shouldn't own the mutex.
1572                  * Just clear the lock; everyone has already been waked up.
1573                  */
1574                 mp->mutex_owner = 0;
1575                 /* mp->mutex_ownerpid is cleared by clear_lockbyte64() */
1576                 (void) clear_lockbyte64(&mp->mutex_lockword64);
1577                 error = ENOTRECOVERABLE;
1578         }
1579
1580         exit_critical(self);
1581
1582         if (error) {
1583                 if (count) {
1584                         DTRACE_PROBE3(plockstat, mutex__spun, mp, 0, count);
1585                 }
1586                 if (error != EBUSY) {
1587                         DTRACE_PROBE2(plockstat, mutex__error, mp, error);
1588                 }
1589         } else {
1590                 if (count) {
1591                         DTRACE_PROBE3(plockstat, mutex__spun, mp, 1, count);
1592                 }
1593                 DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, count);
1594                 if (mp->mutex_flag & (LOCK_OWNERDEAD | LOCK_UNMAPPED)) {
1595                         ASSERT(mp->mutex_type & LOCK_ROBUST);
1596                         if (mp->mutex_flag & LOCK_OWNERDEAD)
1597                                 error = EOWNERDEAD;
1598                         else if (mp->mutex_type & USYNC_PROCESS_ROBUST)
1599                                 error = ELOCKUNMAPPED;
1600                         else
1601                                 error = EOWNERDEAD;
1602                 }
1603         }
1604
1605         return (error);
1606 }
1607
1608 /*
1609  * Mutex wakeup code for releasing a USYNC_THREAD mutex.
1610  * Returns the lwpid of the thread that was dequeued, if any.
1611  * The caller of mutex_wakeup() must call __lwp_unpark(lwpid)
1612  * to wake up the specified lwp.
1613  */
1614 static lwpid_t
1615 mutex_wakeup(mutex_t *mp)
1616 {
1617         lwpid_t lwpid = 0;
1618         int more;
1619         queue_head_t *qp;
1620         ulwp_t *ulwp;
1621
1622         /*
1623          * Dequeue a waiter from the sleep queue.  Don't touch the mutex
1624          * waiters bit if no one was found on the queue because the mutex
1625          * might have been deallocated or reallocated for another purpose.
1626          */
1627         qp = queue_lock(mp, MX);
1628         if ((ulwp = dequeue(qp, &more)) != NULL) {
1629                 lwpid = ulwp->ul_lwpid;
1630                 mp->mutex_waiters = more;
1631         }
1632         queue_unlock(qp);
1633         return (lwpid);
1634 }
1635
1636 /*
1637  * Mutex wakeup code for releasing all waiters on a USYNC_THREAD mutex.
1638  */
1639 static void
1640 mutex_wakeup_all(mutex_t *mp)
1641 {
1642         queue_head_t *qp;
1643         queue_root_t *qrp;
1644         int nlwpid = 0;
1645         int maxlwps = MAXLWPS;
1646         ulwp_t *ulwp;
1647         lwpid_t buffer[MAXLWPS];
1648         lwpid_t *lwpid = buffer;
1649
1650         /*
1651          * Walk the list of waiters and prepare to wake up all of them.
1652          * The waiters flag has already been cleared from the mutex.
1653          *
1654          * We keep track of lwpids that are to be unparked in lwpid[].
1655          * __lwp_unpark_all() is called to unpark all of them after
1656          * they have been removed from the sleep queue and the sleep
1657          * queue lock has been dropped.  If we run out of space in our
1658          * on-stack buffer, we need to allocate more but we can't call
1659          * lmalloc() because we are holding a queue lock when the overflow
1660          * occurs and lmalloc() acquires a lock.  We can't use alloca()
1661          * either because the application may have allocated a small
1662          * stack and we don't want to overrun the stack.  So we call
1663          * alloc_lwpids() to allocate a bigger buffer using the mmap()
1664          * system call directly since that path acquires no locks.
1665          */
1666         qp = queue_lock(mp, MX);
1667         for (;;) {
1668                 if ((qrp = qp->qh_root) == NULL ||
1669                     (ulwp = qrp->qr_head) == NULL)
1670                         break;
1671                 ASSERT(ulwp->ul_wchan == mp);
1672                 queue_unlink(qp, &qrp->qr_head, NULL);
1673                 ulwp->ul_sleepq = NULL;
1674                 ulwp->ul_wchan = NULL;
1675                 if (nlwpid == maxlwps)
1676                         lwpid = alloc_lwpids(lwpid, &nlwpid, &maxlwps);
1677                 lwpid[nlwpid++] = ulwp->ul_lwpid;
1678         }
1679
1680         if (nlwpid == 0) {
1681                 queue_unlock(qp);
1682         } else {
1683                 mp->mutex_waiters = 0;
1684                 no_preempt(curthread);
1685                 queue_unlock(qp);
1686                 if (nlwpid == 1)
1687                         (void) __lwp_unpark(lwpid[0]);
1688                 else
1689                         (void) __lwp_unpark_all(lwpid, nlwpid);
1690                 preempt(curthread);
1691         }
1692
1693         if (lwpid != buffer)
1694                 (void) munmap((caddr_t)lwpid, maxlwps * sizeof (lwpid_t));
1695 }
1696
1697 /*
1698  * Release a process-private mutex.
1699  * As an optimization, if there are waiters but there are also spinners
1700  * attempting to acquire the mutex, then don't bother waking up a waiter;
1701  * one of the spinners will acquire the mutex soon and it would be a waste
1702  * of resources to wake up some thread just to have it spin for a while
1703  * and then possibly go back to sleep.  See mutex_trylock_adaptive().
1704  */
1705 static lwpid_t
1706 mutex_unlock_queue(mutex_t *mp, int release_all)
1707 {
1708         ulwp_t *self = curthread;
1709         lwpid_t lwpid = 0;
1710         uint32_t old_lockword;
1711
1712         DTRACE_PROBE2(plockstat, mutex__release, mp, 0);
1713         sigoff(self);
1714         mp->mutex_owner = 0;
1715         old_lockword = clear_lockbyte(&mp->mutex_lockword);
1716         if ((old_lockword & WAITERMASK) &&
1717             (release_all || (old_lockword & SPINNERMASK) == 0)) {
1718                 no_preempt(self);       /* ensure a prompt wakeup */
1719                 if (release_all)
1720                         mutex_wakeup_all(mp);
1721                 else
1722                         lwpid = mutex_wakeup(mp);
1723                 if (lwpid == 0)
1724                         preempt(self);
1725         }
1726         sigon(self);
1727         return (lwpid);
1728 }
1729
1730 /*
1731  * Like mutex_unlock_queue(), but for process-shared mutexes.
1732  */
1733 static void
1734 mutex_unlock_process(mutex_t *mp, int release_all)
1735 {
1736         ulwp_t *self = curthread;
1737         uint64_t old_lockword64;
1738
1739         DTRACE_PROBE2(plockstat, mutex__release, mp, 0);
1740         sigoff(self);
1741         mp->mutex_owner = 0;
1742 #if defined(__sparc) && !defined(_LP64)
1743         /* horrible hack, necessary only on 32-bit sparc */
1744         if (((uintptr_t)mp & (_LONG_LONG_ALIGNMENT - 1)) &&
1745             self->ul_misaligned && !(mp->mutex_type & LOCK_ROBUST)) {
1746                 uint32_t old_lockword;
1747                 mp->mutex_ownerpid = 0;
1748                 old_lockword = clear_lockbyte(&mp->mutex_lockword);
1749                 if ((old_lockword & WAITERMASK) &&
1750                     (release_all || (old_lockword & SPINNERMASK) == 0)) {
1751                         no_preempt(self);       /* ensure a prompt wakeup */
1752                         (void) ___lwp_mutex_wakeup(mp, release_all);
1753                         preempt(self);
1754                 }
1755                 sigon(self);
1756                 return;
1757         }
1758 #endif
1759         /* mp->mutex_ownerpid is cleared by clear_lockbyte64() */
1760         old_lockword64 = clear_lockbyte64(&mp->mutex_lockword64);
1761         if ((old_lockword64 & WAITERMASK64) &&
1762             (release_all || (old_lockword64 & SPINNERMASK64) == 0)) {
1763                 no_preempt(self);       /* ensure a prompt wakeup */
1764                 (void) ___lwp_mutex_wakeup(mp, release_all);
1765                 preempt(self);
1766         }
1767         sigon(self);
1768 }
1769
1770 void
1771 stall(void)
1772 {
1773         for (;;)
1774                 (void) mutex_lock_kernel(&stall_mutex, NULL, NULL);
1775 }
1776
1777 /*
1778  * Acquire a USYNC_THREAD mutex via user-level sleep queues.
1779  * We failed set_lock_byte(&mp->mutex_lockw) before coming here.
1780  * If successful, returns with mutex_owner set correctly.
1781  */
1782 int
1783 mutex_lock_queue(ulwp_t *self, tdb_mutex_stats_t *msp, mutex_t *mp,
1784         timespec_t *tsp)
1785 {
1786         uberdata_t *udp = curthread->ul_uberdata;
1787         queue_head_t *qp;
1788         hrtime_t begin_sleep;
1789         int error = 0;
1790
1791         self->ul_sp = stkptr();
1792         if (__td_event_report(self, TD_SLEEP, udp)) {
1793                 self->ul_wchan = mp;
1794                 self->ul_td_evbuf.eventnum = TD_SLEEP;
1795                 self->ul_td_evbuf.eventdata = mp;
1796                 tdb_event(TD_SLEEP, udp);
1797         }
1798         if (msp) {
1799                 tdb_incr(msp->mutex_sleep);
1800                 begin_sleep = gethrtime();
1801         }
1802
1803         DTRACE_PROBE1(plockstat, mutex__block, mp);
1804
1805         /*
1806          * Put ourself on the sleep queue, and while we are
1807          * unable to grab the lock, go park in the kernel.
1808          * Take ourself off the sleep queue after we acquire the lock.
1809          * The waiter bit can be set/cleared only while holding the queue lock.
1810          */
1811         qp = queue_lock(mp, MX);
1812         enqueue(qp, self, 0);
1813         mp->mutex_waiters = 1;
1814         for (;;) {
1815                 if (set_lock_byte(&mp->mutex_lockw) == 0) {
1816                         mp->mutex_owner = (uintptr_t)self;
1817                         mp->mutex_waiters = dequeue_self(qp);
1818                         break;
1819                 }
1820                 set_parking_flag(self, 1);
1821                 queue_unlock(qp);
1822                 /*
1823                  * __lwp_park() will return the residual time in tsp
1824                  * if we are unparked before the timeout expires.
1825                  */
1826                 error = __lwp_park(tsp, 0);
1827                 set_parking_flag(self, 0);
1828                 /*
1829                  * We could have taken a signal or suspended ourself.
1830                  * If we did, then we removed ourself from the queue.
1831                  * Someone else may have removed us from the queue
1832                  * as a consequence of mutex_unlock().  We may have
1833                  * gotten a timeout from __lwp_park().  Or we may still
1834                  * be on the queue and this is just a spurious wakeup.
1835                  */
1836                 qp = queue_lock(mp, MX);
1837                 if (self->ul_sleepq == NULL) {
1838                         if (error) {
1839                                 mp->mutex_waiters = queue_waiter(qp)? 1 : 0;
1840                                 if (error != EINTR)
1841                                         break;
1842                                 error = 0;
1843                         }
1844                         if (set_lock_byte(&mp->mutex_lockw) == 0) {
1845                                 mp->mutex_owner = (uintptr_t)self;
1846                                 break;
1847                         }
1848                         enqueue(qp, self, 0);
1849                         mp->mutex_waiters = 1;
1850                 }
1851                 ASSERT(self->ul_sleepq == qp &&
1852                     self->ul_qtype == MX &&
1853                     self->ul_wchan == mp);
1854                 if (error) {
1855                         if (error != EINTR) {
1856                                 mp->mutex_waiters = dequeue_self(qp);
1857                                 break;
1858                         }
1859                         error = 0;
1860                 }
1861         }
1862         ASSERT(self->ul_sleepq == NULL && self->ul_link == NULL &&
1863             self->ul_wchan == NULL);
1864         self->ul_sp = 0;
1865
1866         ASSERT(error == 0 || error == EINVAL || error == ETIME);
1867
1868         if (error == 0 && (mp->mutex_flag & LOCK_NOTRECOVERABLE)) {
1869                 ASSERT(mp->mutex_type & LOCK_ROBUST);
1870                 /*
1871                  * We shouldn't own the mutex.
1872                  * Just clear the lock; everyone has already been waked up.
1873                  */
1874                 mp->mutex_owner = 0;
1875                 (void) clear_lockbyte(&mp->mutex_lockword);
1876                 error = ENOTRECOVERABLE;
1877         }
1878
1879         queue_unlock(qp);
1880
1881         if (msp)
1882                 msp->mutex_sleep_time += gethrtime() - begin_sleep;
1883
1884         if (error) {
1885                 DTRACE_PROBE2(plockstat, mutex__blocked, mp, 0);
1886                 DTRACE_PROBE2(plockstat, mutex__error, mp, error);
1887         } else {
1888                 DTRACE_PROBE2(plockstat, mutex__blocked, mp, 1);
1889                 DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
1890                 if (mp->mutex_flag & LOCK_OWNERDEAD) {
1891                         ASSERT(mp->mutex_type & LOCK_ROBUST);
1892                         error = EOWNERDEAD;
1893                 }
1894         }
1895
1896         return (error);
1897 }
1898
1899 static int
1900 mutex_recursion(mutex_t *mp, int mtype, int try)
1901 {
1902         ASSERT(mutex_held(mp));
1903         ASSERT(mtype & (LOCK_RECURSIVE|LOCK_ERRORCHECK));
1904         ASSERT(try == MUTEX_TRY || try == MUTEX_LOCK);
1905
1906         if (mtype & LOCK_RECURSIVE) {
1907                 if (mp->mutex_rcount == RECURSION_MAX) {
1908                         DTRACE_PROBE2(plockstat, mutex__error, mp, EAGAIN);
1909                         return (EAGAIN);
1910                 }
1911                 mp->mutex_rcount++;
1912                 DTRACE_PROBE3(plockstat, mutex__acquire, mp, 1, 0);
1913                 return (0);
1914         }
1915         if (try == MUTEX_LOCK) {
1916                 DTRACE_PROBE2(plockstat, mutex__error, mp, EDEADLK);
1917                 return (EDEADLK);
1918         }
1919         return (EBUSY);
1920 }
1921
1922 /*
1923  * Register this USYNC_PROCESS|LOCK_ROBUST mutex with the kernel so
1924  * it can apply LOCK_OWNERDEAD|LOCK_UNMAPPED if it becomes necessary.
1925  * We use tdb_hash_lock here and in the synch object tracking code in
1926  * the tdb_agent.c file.  There is no conflict between these two usages.
1927  */
1928 void
1929 register_lock(mutex_t *mp)
1930 {
1931         uberdata_t *udp = curthread->ul_uberdata;
1932         uint_t hash = LOCK_HASH(mp);
1933         robust_t *rlp;
1934         robust_t *invalid;
1935         robust_t **rlpp;
1936         robust_t **table;
1937
1938         if ((table = udp->robustlocks) == NULL) {
1939                 lmutex_lock(&udp->tdb_hash_lock);
1940                 if ((table = udp->robustlocks) == NULL) {
1941                         table = lmalloc(LOCKHASHSZ * sizeof (robust_t *));
1942                         membar_producer();
1943                         udp->robustlocks = table;
1944                 }
1945                 lmutex_unlock(&udp->tdb_hash_lock);
1946         }
1947         membar_consumer();
1948
1949         /*
1950          * First search the registered table with no locks held.
1951          * This is safe because the table never shrinks
1952          * and we can only get a false negative.
1953          */
1954         for (rlp = table[hash]; rlp != NULL; rlp = rlp->robust_next) {
1955                 if (rlp->robust_lock == mp)     /* already registered */
1956                         return;
1957         }
1958
1959         /*
1960          * The lock was not found.
1961          * Repeat the operation with tdb_hash_lock held.
1962          */
1963         lmutex_lock(&udp->tdb_hash_lock);
1964
1965         invalid = NULL;
1966         for (rlpp = &table[hash];
1967             (rlp = *rlpp) != NULL;
1968             rlpp = &rlp->robust_next) {
1969                 if (rlp->robust_lock == mp) {   /* already registered */
1970                         lmutex_unlock(&udp->tdb_hash_lock);
1971                         return;
1972                 }
1973                 /* remember the first invalid entry, if any */
1974                 if (rlp->robust_lock == INVALID_ADDR && invalid == NULL)
1975                         invalid = rlp;
1976         }
1977
1978         /*
1979          * The lock has never been registered.
1980          * Add it to the table and register it now.
1981          */
1982         if ((rlp = invalid) != NULL) {
1983                 /*
1984                  * Reuse the invalid entry we found above.
1985                  * The linkages are still correct.
1986                  */
1987                 rlp->robust_lock = mp;
1988                 membar_producer();
1989         } else {
1990                 /*
1991                  * Allocate a new entry and add it to
1992                  * the hash table and to the global list.
1993                  */
1994                 rlp = lmalloc(sizeof (*rlp));
1995                 rlp->robust_lock = mp;
1996                 rlp->robust_next = NULL;
1997                 rlp->robust_list = udp->robustlist;
1998                 udp->robustlist = rlp;
1999                 membar_producer();
2000                 *rlpp = rlp;
2001         }
2002
2003         lmutex_unlock(&udp->tdb_hash_lock);
2004
2005         (void) ___lwp_mutex_register(mp, &rlp->robust_lock);
2006 }
2007
2008 /*
2009  * This is called in the child of fork()/forkall() to start over
2010  * with a clean slate.  (Each process must register its own locks.)
2011  * No locks are needed because all other threads are suspended or gone.
2012  */
2013 void
2014 unregister_locks(void)
2015 {
2016         uberdata_t *udp = curthread->ul_uberdata;
2017         robust_t **table;
2018         robust_t *rlp;
2019         robust_t *next;
2020
2021         /*
2022          * Do this first, before calling lfree().
2023          */
2024         table = udp->robustlocks;
2025         udp->robustlocks = NULL;
2026         rlp = udp->robustlist;
2027         udp->robustlist = NULL;
2028
2029         /*
2030          * Do this by traversing the global list, not the hash table.
2031          */
2032         while (rlp != NULL) {
2033                 next = rlp->robust_list;
2034                 lfree(rlp, sizeof (*rlp));
2035                 rlp = next;
2036         }
2037         if (table != NULL)
2038                 lfree(table, LOCKHASHSZ * sizeof (robust_t *));
2039 }
2040
2041 /*
2042  * Returns with mutex_owner set correctly.
2043  */
2044 int
2045 mutex_lock_internal(mutex_t *mp, timespec_t *tsp, int try)
2046 {
2047         ulwp_t *self = curthread;
2048         uberdata_t *udp = self->ul_uberdata;
2049         int mtype = mp->mutex_type;
2050         tdb_mutex_stats_t *msp = MUTEX_STATS(mp, udp);
2051         int error = 0;
2052         int noceil = try & MUTEX_NOCEIL;
2053         uint8_t ceil;
2054         int myprio;
2055
2056         try &= ~MUTEX_NOCEIL;
2057         ASSERT(try == MUTEX_TRY || try == MUTEX_LOCK);
2058
2059         if (!self->ul_schedctl_called)
2060                 (void) setup_schedctl();
2061
2062         if (msp && try == MUTEX_TRY)
2063                 tdb_incr(msp->mutex_try);
2064
2065         if ((mtype & (LOCK_RECURSIVE|LOCK_ERRORCHECK)) && mutex_held(mp))
2066                 return (mutex_recursion(mp, mtype, try));
2067
2068         if (self->ul_error_detection && try == MUTEX_LOCK &&
2069             tsp == NULL && mutex_held(mp))
2070                 lock_error(mp, "mutex_lock", NULL, NULL);
2071
2072         if ((mtype & LOCK_PRIO_PROTECT) && noceil == 0) {
2073                 update_sched(self);
2074                 if (self->ul_cid != self->ul_rtclassid) {
2075                         DTRACE_PROBE2(plockstat, mutex__error, mp, EPERM);
2076                         return (EPERM);
2077                 }
2078                 ceil = mp->mutex_ceiling;
2079                 myprio = self->ul_epri? self->ul_epri : self->ul_pri;
2080                 if (myprio > ceil) {
2081                         DTRACE_PROBE2(plockstat, mutex__error, mp, EINVAL);
2082                         return (EINVAL);
2083                 }
2084                 if ((error = _ceil_mylist_add(mp)) != 0) {
2085                         DTRACE_PROBE2(plockstat, mutex__error, mp, error);
2086                         return (error);
2087                 }
2088                 if (myprio < ceil)
2089                         _ceil_prio_inherit(ceil);
2090         }
2091
2092         if ((mtype & (USYNC_PROCESS | LOCK_ROBUST))
2093             == (USYNC_PROCESS | LOCK_ROBUST))
2094                 register_lock(mp);
2095
2096         if (mtype & LOCK_PRIO_INHERIT) {
2097                 /* go straight to the kernel */
2098                 if (try == MUTEX_TRY)
2099                         error = mutex_trylock_kernel(mp);
2100                 else    /* MUTEX_LOCK */
2101                         error = mutex_lock_kernel(mp, tsp, msp);
2102                 /*
2103                  * The kernel never sets or clears the lock byte
2104                  * for LOCK_PRIO_INHERIT mutexes.
2105                  * Set it here for consistency.
2106                  */
2107                 switch (error) {
2108                 case 0:
2109                         self->ul_pilocks++;
2110                         mp->mutex_lockw = LOCKSET;
2111                         break;
2112                 case EOWNERDEAD:
2113                 case ELOCKUNMAPPED:
2114                         self->ul_pilocks++;
2115                         mp->mutex_lockw = LOCKSET;
2116                         /* FALLTHROUGH */
2117                 case ENOTRECOVERABLE:
2118                         ASSERT(mtype & LOCK_ROBUST);
2119                         break;
2120                 case EDEADLK:
2121                         if (try == MUTEX_TRY) {
2122                                 error = EBUSY;
2123                         } else if (tsp != NULL) {       /* simulate a timeout */
2124                                 /*
2125                                  * Note: mutex_timedlock() never returns EINTR.
2126                                  */
2127                                 timespec_t ts = *tsp;
2128                                 timespec_t rts;
2129
2130                                 while (__nanosleep(&ts, &rts) == EINTR)
2131                                         ts = rts;
2132                                 error = ETIME;
2133                         } else {                /* simulate a deadlock */
2134                                 stall();
2135                         }
2136                         break;
2137                 }
2138         } else if (mtype & USYNC_PROCESS) {
2139                 error = mutex_trylock_process(mp, try == MUTEX_LOCK);
2140                 if (error == EBUSY && try == MUTEX_LOCK)
2141                         error = mutex_lock_kernel(mp, tsp, msp);
2142         } else {        /* USYNC_THREAD */
2143                 error = mutex_trylock_adaptive(mp, try == MUTEX_LOCK);
2144                 if (error == EBUSY && try == MUTEX_LOCK)
2145                         error = mutex_lock_queue(self, msp, mp, tsp);
2146         }
2147
2148         switch (error) {
2149         case 0:
2150         case EOWNERDEAD:
2151         case ELOCKUNMAPPED:
2152                 if (mtype & LOCK_ROBUST)
2153                         remember_lock(mp);
2154                 if (msp)
2155                         record_begin_hold(msp);
2156                 break;
2157         default:
2158                 if ((mtype & LOCK_PRIO_PROTECT) && noceil == 0) {
2159                         (void) _ceil_mylist_del(mp);
2160                         if (myprio < ceil)
2161                                 _ceil_prio_waive();
2162                 }
2163                 if (try == MUTEX_TRY) {
2164                         if (msp)
2165                                 tdb_incr(msp->mutex_try_fail);
2166                         if (__td_event_report(self, TD_LOCK_TRY, udp)) {
2167                                 self->ul_td_evbuf.eventnum = TD_LOCK_TRY;
2168                                 tdb_event(TD_LOCK_TRY, udp);
2169                         }
2170                 }
2171                 break;
2172         }
2173
2174         return (error);
2175 }
2176
2177 int
2178 fast_process_lock(mutex_t *mp, timespec_t *tsp, int mtype, int try)
2179 {
2180         ulwp_t *self = curthread;
2181         uberdata_t *udp = self->ul_uberdata;
2182
2183         /*
2184          * We know that USYNC_PROCESS is set in mtype and that
2185          * zero, one, or both of the flags LOCK_RECURSIVE and
2186          * LOCK_ERRORCHECK are set, and that no other flags are set.
2187          */
2188         ASSERT((mtype & ~(USYNC_PROCESS|LOCK_RECURSIVE|LOCK_ERRORCHECK)) == 0);
2189         enter_critical(self);
2190 #if defined(__sparc) && !defined(_LP64)
2191         /* horrible hack, necessary only on 32-bit sparc */
2192         if (((uintptr_t)mp & (_LONG_LONG_ALIGNMENT - 1)) &&
2193             self->ul_misaligned) {
2194                 if (set_lock_byte(&mp->mutex_lockw) == 0) {
2195                         mp->mutex_ownerpid = udp->pid;
2196                         mp->mutex_owner = (uintptr_t)self;
2197                         exit_critical(self);
2198                         DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
2199                         return (0);
2200                 }
2201         } else
2202 #endif
2203         if (set_lock_byte64(&mp->mutex_lockword64, udp->pid) == 0) {
2204                 mp->mutex_owner = (uintptr_t)self;
2205                 /* mp->mutex_ownerpid was set by set_lock_byte64() */
2206                 exit_critical(self);
2207                 DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
2208                 return (0);
2209         }
2210         exit_critical(self);
2211
2212         if ((mtype & (LOCK_RECURSIVE|LOCK_ERRORCHECK)) && shared_mutex_held(mp))
2213                 return (mutex_recursion(mp, mtype, try));
2214
2215         if (try == MUTEX_LOCK) {
2216                 if (mutex_trylock_process(mp, 1) == 0)
2217                         return (0);
2218                 return (mutex_lock_kernel(mp, tsp, NULL));
2219         }
2220
2221         if (__td_event_report(self, TD_LOCK_TRY, udp)) {
2222                 self->ul_td_evbuf.eventnum = TD_LOCK_TRY;
2223                 tdb_event(TD_LOCK_TRY, udp);
2224         }
2225         return (EBUSY);
2226 }
2227
2228 static int
2229 mutex_lock_impl(mutex_t *mp, timespec_t *tsp)
2230 {
2231         ulwp_t *self = curthread;
2232         int mtype = mp->mutex_type;
2233         uberflags_t *gflags;
2234
2235         if (((uintptr_t)mp & (_LONG_LONG_ALIGNMENT - 1)) &&
2236             self->ul_error_detection && self->ul_misaligned == 0)
2237                 lock_error(mp, "mutex_lock", NULL, "mutex is misaligned");
2238
2239         /*
2240          * Optimize the case of USYNC_THREAD, including
2241          * the LOCK_RECURSIVE and LOCK_ERRORCHECK cases,
2242          * no error detection, no lock statistics,
2243          * and the process has only a single thread.
2244          * (Most likely a traditional single-threaded application.)
2245          */
2246         if (((mtype & ~(LOCK_RECURSIVE|LOCK_ERRORCHECK)) |
2247             self->ul_uberdata->uberflags.uf_all) == 0) {
2248                 /*
2249                  * Only one thread exists so we don't need an atomic operation.
2250                  * We do, however, need to protect against signals.
2251                  */
2252                 if (mp->mutex_lockw == 0) {
2253                         sigoff(self);
2254                         mp->mutex_lockw = LOCKSET;
2255                         mp->mutex_owner = (uintptr_t)self;
2256                         sigon(self);
2257                         DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
2258                         return (0);
2259                 }
2260                 if (mtype && MUTEX_OWNER(mp) == self)
2261                         return (mutex_recursion(mp, mtype, MUTEX_LOCK));
2262                 /*
2263                  * We have reached a deadlock, probably because the
2264                  * process is executing non-async-signal-safe code in
2265                  * a signal handler and is attempting to acquire a lock
2266                  * that it already owns.  This is not surprising, given
2267                  * bad programming practices over the years that has
2268                  * resulted in applications calling printf() and such
2269                  * in their signal handlers.  Unless the user has told
2270                  * us that the signal handlers are safe by setting:
2271                  *      export _THREAD_ASYNC_SAFE=1
2272                  * we return EDEADLK rather than actually deadlocking.
2273                  */
2274                 if (tsp == NULL &&
2275                     MUTEX_OWNER(mp) == self && !self->ul_async_safe) {
2276                         DTRACE_PROBE2(plockstat, mutex__error, mp, EDEADLK);
2277                         return (EDEADLK);
2278                 }
2279         }
2280
2281         /*
2282          * Optimize the common cases of USYNC_THREAD or USYNC_PROCESS,
2283          * no error detection, and no lock statistics.
2284          * Include LOCK_RECURSIVE and LOCK_ERRORCHECK cases.
2285          */
2286         if ((gflags = self->ul_schedctl_called) != NULL &&
2287             (gflags->uf_trs_ted |
2288             (mtype & ~(USYNC_PROCESS|LOCK_RECURSIVE|LOCK_ERRORCHECK))) == 0) {
2289                 if (mtype & USYNC_PROCESS)
2290                         return (fast_process_lock(mp, tsp, mtype, MUTEX_LOCK));
2291                 sigoff(self);
2292                 if (set_lock_byte(&mp->mutex_lockw) == 0) {
2293                         mp->mutex_owner = (uintptr_t)self;
2294                         sigon(self);
2295                         DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
2296                         return (0);
2297                 }
2298                 sigon(self);
2299                 if (mtype && MUTEX_OWNER(mp) == self)
2300                         return (mutex_recursion(mp, mtype, MUTEX_LOCK));
2301                 if (mutex_trylock_adaptive(mp, 1) != 0)
2302                         return (mutex_lock_queue(self, NULL, mp, tsp));
2303                 return (0);
2304         }
2305
2306         /* else do it the long way */
2307         return (mutex_lock_internal(mp, tsp, MUTEX_LOCK));
2308 }
2309
2310 #pragma weak pthread_mutex_lock = mutex_lock
2311 #pragma weak _mutex_lock = mutex_lock
2312 int
2313 mutex_lock(mutex_t *mp)
2314 {
2315         ASSERT(!curthread->ul_critical || curthread->ul_bindflags);
2316         return (mutex_lock_impl(mp, NULL));
2317 }
2318
2319 void
2320 mutex_enter(mutex_t *mp)
2321 {
2322         int ret;
2323         int attr = mp->mutex_type & ALL_ATTRIBUTES;
2324
2325         /*
2326          * Require LOCK_ERRORCHECK, accept LOCK_RECURSIVE.
2327          */
2328         if (attr != LOCK_ERRORCHECK &&
2329             attr != (LOCK_ERRORCHECK | LOCK_RECURSIVE)) {
2330                 mutex_panic(mp, "mutex_enter: bad mutex type");
2331         }
2332         ret = mutex_lock(mp);
2333         if (ret == EDEADLK) {
2334                 mutex_panic(mp, "recursive mutex_enter");
2335         } else if (ret == EAGAIN) {
2336                 mutex_panic(mp, "excessive recursive mutex_enter");
2337         } else if (ret != 0) {
2338                 mutex_panic(mp, "unknown mutex_enter failure");
2339         }
2340 }
2341
2342 int
2343 pthread_mutex_timedlock(pthread_mutex_t *_RESTRICT_KYWD mp,
2344         const struct timespec *_RESTRICT_KYWD abstime)
2345 {
2346         timespec_t tslocal;
2347         int error;
2348
2349         ASSERT(!curthread->ul_critical || curthread->ul_bindflags);
2350         abstime_to_reltime(CLOCK_REALTIME, abstime, &tslocal);
2351         error = mutex_lock_impl((mutex_t *)mp, &tslocal);
2352         if (error == ETIME)
2353                 error = ETIMEDOUT;
2354         return (error);
2355 }
2356
2357 int
2358 pthread_mutex_reltimedlock_np(pthread_mutex_t *_RESTRICT_KYWD mp,
2359         const struct timespec *_RESTRICT_KYWD reltime)
2360 {
2361         timespec_t tslocal;
2362         int error;
2363
2364         ASSERT(!curthread->ul_critical || curthread->ul_bindflags);
2365         tslocal = *reltime;
2366         error = mutex_lock_impl((mutex_t *)mp, &tslocal);
2367         if (error == ETIME)
2368                 error = ETIMEDOUT;
2369         return (error);
2370 }
2371
2372 #pragma weak pthread_mutex_trylock = mutex_trylock
2373 int
2374 mutex_trylock(mutex_t *mp)
2375 {
2376         ulwp_t *self = curthread;
2377         uberdata_t *udp = self->ul_uberdata;
2378         int mtype = mp->mutex_type;
2379         uberflags_t *gflags;
2380
2381         ASSERT(!curthread->ul_critical || curthread->ul_bindflags);
2382
2383         /*
2384          * Optimize the case of USYNC_THREAD, including
2385          * the LOCK_RECURSIVE and LOCK_ERRORCHECK cases,
2386          * no error detection, no lock statistics,
2387          * and the process has only a single thread.
2388          * (Most likely a traditional single-threaded application.)
2389          */
2390         if (((mtype & ~(LOCK_RECURSIVE|LOCK_ERRORCHECK)) |
2391             udp->uberflags.uf_all) == 0) {
2392                 /*
2393                  * Only one thread exists so we don't need an atomic operation.
2394                  * We do, however, need to protect against signals.
2395                  */
2396                 if (mp->mutex_lockw == 0) {
2397                         sigoff(self);
2398                         mp->mutex_lockw = LOCKSET;
2399                         mp->mutex_owner = (uintptr_t)self;
2400                         sigon(self);
2401                         DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
2402                         return (0);
2403                 }
2404                 if (mtype && MUTEX_OWNER(mp) == self)
2405                         return (mutex_recursion(mp, mtype, MUTEX_TRY));
2406                 return (EBUSY);
2407         }
2408
2409         /*
2410          * Optimize the common cases of USYNC_THREAD or USYNC_PROCESS,
2411          * no error detection, and no lock statistics.
2412          * Include LOCK_RECURSIVE and LOCK_ERRORCHECK cases.
2413          */
2414         if ((gflags = self->ul_schedctl_called) != NULL &&
2415             (gflags->uf_trs_ted |
2416             (mtype & ~(USYNC_PROCESS|LOCK_RECURSIVE|LOCK_ERRORCHECK))) == 0) {
2417                 if (mtype & USYNC_PROCESS)
2418                         return (fast_process_lock(mp, NULL, mtype, MUTEX_TRY));
2419                 sigoff(self);
2420                 if (set_lock_byte(&mp->mutex_lockw) == 0) {
2421                         mp->mutex_owner = (uintptr_t)self;
2422                         sigon(self);
2423                         DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
2424                         return (0);
2425                 }
2426                 sigon(self);
2427                 if (mtype && MUTEX_OWNER(mp) == self)
2428                         return (mutex_recursion(mp, mtype, MUTEX_TRY));
2429                 if (__td_event_report(self, TD_LOCK_TRY, udp)) {
2430                         self->ul_td_evbuf.eventnum = TD_LOCK_TRY;
2431                         tdb_event(TD_LOCK_TRY, udp);
2432                 }
2433                 return (EBUSY);
2434         }
2435
2436         /* else do it the long way */
2437         return (mutex_lock_internal(mp, NULL, MUTEX_TRY));
2438 }
2439
2440 int
2441 mutex_unlock_internal(mutex_t *mp, int retain_robust_flags)
2442 {
2443         ulwp_t *self = curthread;
2444         uberdata_t *udp = self->ul_uberdata;
2445         int mtype = mp->mutex_type;
2446         tdb_mutex_stats_t *msp;
2447         int error = 0;
2448         int release_all;
2449         lwpid_t lwpid;
2450
2451         if ((mtype & (LOCK_ERRORCHECK | LOCK_ROBUST)) &&
2452             !mutex_held(mp))
2453                 return (EPERM);
2454
2455         if (self->ul_error_detection && !mutex_held(mp))
2456                 lock_error(mp, "mutex_unlock", NULL, NULL);
2457
2458         if ((mtype & LOCK_RECURSIVE) && mp->mutex_rcount != 0) {
2459                 mp->mutex_rcount--;
2460                 DTRACE_PROBE2(plockstat, mutex__release, mp, 1);
2461                 return (0);
2462         }
2463
2464         if ((msp = MUTEX_STATS(mp, udp)) != NULL)
2465                 (void) record_hold_time(msp);
2466
2467         if (!retain_robust_flags && !(mtype & LOCK_PRIO_INHERIT) &&
2468             (mp->mutex_flag & (LOCK_OWNERDEAD | LOCK_UNMAPPED))) {
2469                 ASSERT(mtype & LOCK_ROBUST);
2470                 mp->mutex_flag &= ~(LOCK_OWNERDEAD | LOCK_UNMAPPED);
2471                 mp->mutex_flag |= LOCK_NOTRECOVERABLE;
2472         }
2473         release_all = ((mp->mutex_flag & LOCK_NOTRECOVERABLE) != 0);
2474
2475         if (mtype & LOCK_PRIO_INHERIT) {
2476                 no_preempt(self);
2477                 mp->mutex_owner = 0;
2478                 /* mp->mutex_ownerpid is cleared by ___lwp_mutex_unlock() */
2479                 DTRACE_PROBE2(plockstat, mutex__release, mp, 0);
2480                 mp->mutex_lockw = LOCKCLEAR;
2481                 self->ul_pilocks--;
2482                 error = ___lwp_mutex_unlock(mp);
2483                 preempt(self);
2484         } else if (mtype & USYNC_PROCESS) {
2485                 mutex_unlock_process(mp, release_all);
2486         } else {        /* USYNC_THREAD */
2487                 if ((lwpid = mutex_unlock_queue(mp, release_all)) != 0) {
2488                         (void) __lwp_unpark(lwpid);
2489                         preempt(self);
2490                 }
2491         }
2492
2493         if (mtype & LOCK_ROBUST)
2494                 forget_lock(mp);
2495
2496         if ((mtype & LOCK_PRIO_PROTECT) && _ceil_mylist_del(mp))
2497                 _ceil_prio_waive();
2498
2499         return (error);
2500 }
2501
2502 #pragma weak pthread_mutex_unlock = mutex_unlock
2503 #pragma weak _mutex_unlock = mutex_unlock
2504 int
2505 mutex_unlock(mutex_t *mp)
2506 {
2507         ulwp_t *self = curthread;
2508         int mtype = mp->mutex_type;
2509         uberflags_t *gflags;
2510         lwpid_t lwpid;
2511         short el;
2512
2513         /*
2514          * Optimize the case of USYNC_THREAD, including
2515          * the LOCK_RECURSIVE and LOCK_ERRORCHECK cases,
2516          * no error detection, no lock statistics,
2517          * and the process has only a single thread.
2518          * (Most likely a traditional single-threaded application.)
2519          */
2520         if (((mtype & ~(LOCK_RECURSIVE|LOCK_ERRORCHECK)) |
2521             self->ul_uberdata->uberflags.uf_all) == 0) {
2522                 if (mtype) {
2523                         /*
2524                          * At this point we know that one or both of the
2525                          * flags LOCK_RECURSIVE or LOCK_ERRORCHECK is set.
2526                          */
2527                         if ((mtype & LOCK_ERRORCHECK) && !MUTEX_OWNED(mp, self))
2528                                 return (EPERM);
2529                         if ((mtype & LOCK_RECURSIVE) && mp->mutex_rcount != 0) {
2530                                 mp->mutex_rcount--;
2531                                 DTRACE_PROBE2(plockstat, mutex__release, mp, 1);
2532                                 return (0);
2533                         }
2534                 }
2535                 /*
2536                  * Only one thread exists so we don't need an atomic operation.
2537                  * Also, there can be no waiters.
2538                  */
2539                 sigoff(self);
2540                 mp->mutex_owner = 0;
2541                 mp->mutex_lockword = 0;
2542                 sigon(self);
2543                 DTRACE_PROBE2(plockstat, mutex__release, mp, 0);
2544                 return (0);
2545         }
2546
2547         /*
2548          * Optimize the common cases of USYNC_THREAD or USYNC_PROCESS,
2549          * no error detection, and no lock statistics.
2550          * Include LOCK_RECURSIVE and LOCK_ERRORCHECK cases.
2551          */
2552         if ((gflags = self->ul_schedctl_called) != NULL) {
2553                 if (((el = gflags->uf_trs_ted) | mtype) == 0) {
2554 fast_unlock:
2555                         if ((lwpid = mutex_unlock_queue(mp, 0)) != 0) {
2556                                 (void) __lwp_unpark(lwpid);
2557                                 preempt(self);
2558                         }
2559                         return (0);
2560                 }
2561                 if (el)         /* error detection or lock statistics */
2562                         goto slow_unlock;
2563                 if ((mtype & ~(LOCK_RECURSIVE|LOCK_ERRORCHECK)) == 0) {
2564                         /*
2565                          * At this point we know that one or both of the
2566                          * flags LOCK_RECURSIVE or LOCK_ERRORCHECK is set.
2567                          */
2568                         if ((mtype & LOCK_ERRORCHECK) && !MUTEX_OWNED(mp, self))
2569                                 return (EPERM);
2570                         if ((mtype & LOCK_RECURSIVE) && mp->mutex_rcount != 0) {
2571                                 mp->mutex_rcount--;
2572                                 DTRACE_PROBE2(plockstat, mutex__release, mp, 1);
2573                                 return (0);
2574                         }
2575                         goto fast_unlock;
2576                 }
2577                 if ((mtype &
2578                     ~(USYNC_PROCESS|LOCK_RECURSIVE|LOCK_ERRORCHECK)) == 0) {
2579                         /*
2580                          * At this point we know that zero, one, or both of the
2581                          * flags LOCK_RECURSIVE or LOCK_ERRORCHECK is set and
2582                          * that the USYNC_PROCESS flag is set.
2583                          */
2584                         if ((mtype & LOCK_ERRORCHECK) && !shared_mutex_held(mp))
2585                                 return (EPERM);
2586                         if ((mtype & LOCK_RECURSIVE) && mp->mutex_rcount != 0) {
2587                                 mp->mutex_rcount--;
2588                                 DTRACE_PROBE2(plockstat, mutex__release, mp, 1);
2589                                 return (0);
2590                         }
2591                         mutex_unlock_process(mp, 0);
2592                         return (0);
2593                 }
2594         }
2595
2596         /* else do it the long way */
2597 slow_unlock:
2598         return (mutex_unlock_internal(mp, 0));
2599 }
2600
2601 void
2602 mutex_exit(mutex_t *mp)
2603 {
2604         int ret;
2605         int attr = mp->mutex_type & ALL_ATTRIBUTES;
2606
2607         if (attr != LOCK_ERRORCHECK &&
2608             attr != (LOCK_ERRORCHECK | LOCK_RECURSIVE)) {
2609                 mutex_panic(mp, "mutex_exit: bad mutex type");
2610         }
2611         ret = mutex_unlock(mp);
2612         if (ret == EPERM) {
2613                 mutex_panic(mp, "mutex_exit: not owner");
2614         } else if (ret != 0) {
2615                 mutex_panic(mp, "unknown mutex_exit failure");
2616         }
2617
2618 }
2619
2620 /*
2621  * Internally to the library, almost all mutex lock/unlock actions
2622  * go through these lmutex_ functions, to protect critical regions.
2623  * We replicate a bit of code from mutex_lock() and mutex_unlock()
2624  * to make these functions faster since we know that the mutex type
2625  * of all internal locks is USYNC_THREAD.  We also know that internal
2626  * locking can never fail, so we panic if it does.
2627  */
2628 void
2629 lmutex_lock(mutex_t *mp)
2630 {
2631         ulwp_t *self = curthread;
2632         uberdata_t *udp = self->ul_uberdata;
2633
2634         ASSERT(mp->mutex_type == USYNC_THREAD);
2635
2636         enter_critical(self);
2637         /*
2638          * Optimize the case of no lock statistics and only a single thread.
2639          * (Most likely a traditional single-threaded application.)
2640          */
2641         if (udp->uberflags.uf_all == 0) {
2642                 /*
2643                  * Only one thread exists; the mutex must be free.
2644                  */
2645                 ASSERT(mp->mutex_lockw == 0);
2646                 mp->mutex_lockw = LOCKSET;
2647                 mp->mutex_owner = (uintptr_t)self;
2648                 DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
2649         } else {
2650                 tdb_mutex_stats_t *msp = MUTEX_STATS(mp, udp);
2651
2652                 if (!self->ul_schedctl_called)
2653                         (void) setup_schedctl();
2654
2655                 if (set_lock_byte(&mp->mutex_lockw) == 0) {
2656                         mp->mutex_owner = (uintptr_t)self;
2657                         DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
2658                 } else if (mutex_trylock_adaptive(mp, 1) != 0) {
2659                         (void) mutex_lock_queue(self, msp, mp, NULL);
2660                 }
2661
2662                 if (msp)
2663                         record_begin_hold(msp);
2664         }
2665 }
2666
2667 void
2668 lmutex_unlock(mutex_t *mp)
2669 {
2670         ulwp_t *self = curthread;
2671         uberdata_t *udp = self->ul_uberdata;
2672
2673         ASSERT(mp->mutex_type == USYNC_THREAD);
2674
2675         /*
2676          * Optimize the case of no lock statistics and only a single thread.
2677          * (Most likely a traditional single-threaded application.)
2678          */
2679         if (udp->uberflags.uf_all == 0) {
2680                 /*
2681                  * Only one thread exists so there can be no waiters.
2682                  */
2683                 mp->mutex_owner = 0;
2684                 mp->mutex_lockword = 0;
2685                 DTRACE_PROBE2(plockstat, mutex__release, mp, 0);
2686         } else {
2687                 tdb_mutex_stats_t *msp = MUTEX_STATS(mp, udp);
2688                 lwpid_t lwpid;
2689
2690                 if (msp)
2691                         (void) record_hold_time(msp);
2692                 if ((lwpid = mutex_unlock_queue(mp, 0)) != 0) {
2693                         (void) __lwp_unpark(lwpid);
2694                         preempt(self);
2695                 }
2696         }
2697         exit_critical(self);
2698 }
2699
2700 /*
2701  * For specialized code in libc, like the asynchronous i/o code,
2702  * the following sig_*() locking primitives are used in order
2703  * to make the code asynchronous signal safe.  Signals are
2704  * deferred while locks acquired by these functions are held.
2705  */
2706 void
2707 sig_mutex_lock(mutex_t *mp)
2708 {
2709         ulwp_t *self = curthread;
2710
2711         sigoff(self);
2712         (void) mutex_lock(mp);
2713 }
2714
2715 void
2716 sig_mutex_unlock(mutex_t *mp)
2717 {
2718         ulwp_t *self = curthread;
2719
2720         (void) mutex_unlock(mp);
2721         sigon(self);
2722 }
2723
2724 int
2725 sig_mutex_trylock(mutex_t *mp)
2726 {
2727         ulwp_t *self = curthread;
2728         int error;
2729
2730         sigoff(self);
2731         if ((error = mutex_trylock(mp)) != 0)
2732                 sigon(self);
2733         return (error);
2734 }
2735
2736 /*
2737  * sig_cond_wait() is a cancellation point.
2738  */
2739 int
2740 sig_cond_wait(cond_t *cv, mutex_t *mp)
2741 {
2742         int error;
2743
2744         ASSERT(curthread->ul_sigdefer != 0);
2745         pthread_testcancel();
2746         error = __cond_wait(cv, mp);
2747         if (error == EINTR && curthread->ul_cursig) {
2748                 sig_mutex_unlock(mp);
2749                 /* take the deferred signal here */
2750                 sig_mutex_lock(mp);
2751         }
2752         pthread_testcancel();
2753         return (error);
2754 }
2755
2756 /*
2757  * sig_cond_reltimedwait() is a cancellation point.
2758  */
2759 int
2760 sig_cond_reltimedwait(cond_t *cv, mutex_t *mp, const timespec_t *ts)
2761 {
2762         int error;
2763
2764         ASSERT(curthread->ul_sigdefer != 0);
2765         pthread_testcancel();
2766         error = __cond_reltimedwait(cv, mp, ts);
2767         if (error == EINTR && curthread->ul_cursig) {
2768                 sig_mutex_unlock(mp);
2769                 /* take the deferred signal here */
2770                 sig_mutex_lock(mp);
2771         }
2772         pthread_testcancel();
2773         return (error);
2774 }
2775
2776 /*
2777  * For specialized code in libc, like the stdio code.
2778  * the following cancel_safe_*() locking primitives are used in
2779  * order to make the code cancellation-safe.  Cancellation is
2780  * deferred while locks acquired by these functions are held.
2781  */
2782 void
2783 cancel_safe_mutex_lock(mutex_t *mp)
2784 {
2785         (void) mutex_lock(mp);
2786         curthread->ul_libc_locks++;
2787 }
2788
2789 int
2790 cancel_safe_mutex_trylock(mutex_t *mp)
2791 {
2792         int error;
2793
2794         if ((error = mutex_trylock(mp)) == 0)
2795                 curthread->ul_libc_locks++;
2796         return (error);
2797 }
2798
2799 void
2800 cancel_safe_mutex_unlock(mutex_t *mp)
2801 {
2802         ulwp_t *self = curthread;
2803
2804         ASSERT(self->ul_libc_locks != 0);
2805
2806         (void) mutex_unlock(mp);
2807
2808         /*
2809          * Decrement the count of locks held by cancel_safe_mutex_lock().
2810          * If we are then in a position to terminate cleanly and
2811          * if there is a pending cancellation and cancellation
2812          * is not disabled and we received EINTR from a recent
2813          * system call then perform the cancellation action now.
2814          */
2815         if (--self->ul_libc_locks == 0 &&
2816             !(self->ul_vfork | self->ul_nocancel |
2817             self->ul_critical | self->ul_sigdefer) &&
2818             cancel_active())
2819                 pthread_exit(PTHREAD_CANCELED);
2820 }
2821
2822 static int
2823 shared_mutex_held(mutex_t *mparg)
2824 {
2825         /*
2826          * The 'volatile' is necessary to make sure the compiler doesn't
2827          * reorder the tests of the various components of the mutex.
2828          * They must be tested in this order:
2829          *      mutex_lockw
2830          *      mutex_owner
2831          *      mutex_ownerpid
2832          * This relies on the fact that everywhere mutex_lockw is cleared,
2833          * mutex_owner and mutex_ownerpid are cleared before mutex_lockw
2834          * is cleared, and that everywhere mutex_lockw is set, mutex_owner
2835          * and mutex_ownerpid are set after mutex_lockw is set, and that
2836          * mutex_lockw is set or cleared with a memory barrier.
2837          */
2838         volatile mutex_t *mp = (volatile mutex_t *)mparg;
2839         ulwp_t *self = curthread;
2840         uberdata_t *udp = self->ul_uberdata;
2841
2842         return (MUTEX_OWNED(mp, self) && mp->mutex_ownerpid == udp->pid);
2843 }
2844
2845 #pragma weak _mutex_held = mutex_held
2846 int
2847 mutex_held(mutex_t *mparg)
2848 {
2849         volatile mutex_t *mp = (volatile mutex_t *)mparg;
2850
2851         if (mparg->mutex_type & USYNC_PROCESS)
2852                 return (shared_mutex_held(mparg));
2853         return (MUTEX_OWNED(mp, curthread));
2854 }
2855
2856 #pragma weak pthread_mutex_destroy = mutex_destroy
2857 #pragma weak _mutex_destroy = mutex_destroy
2858 int
2859 mutex_destroy(mutex_t *mp)
2860 {
2861         if (mp->mutex_type & USYNC_PROCESS)
2862                 forget_lock(mp);
2863         (void) memset(mp, 0, sizeof (*mp));
2864         tdb_sync_obj_deregister(mp);
2865         return (0);
2866 }
2867
2868 #pragma weak pthread_mutex_consistent_np = mutex_consistent
2869 #pragma weak pthread_mutex_consistent = mutex_consistent
2870 int
2871 mutex_consistent(mutex_t *mp)
2872 {
2873         /*
2874          * Do this only for an inconsistent, initialized robust lock
2875          * that we hold.  For all other cases, return EINVAL.
2876          */
2877         if (mutex_held(mp) &&
2878             (mp->mutex_type & LOCK_ROBUST) &&
2879             (mp->mutex_flag & LOCK_INITED) &&
2880             (mp->mutex_flag & (LOCK_OWNERDEAD | LOCK_UNMAPPED))) {
2881                 mp->mutex_flag &= ~(LOCK_OWNERDEAD | LOCK_UNMAPPED);
2882                 mp->mutex_rcount = 0;
2883                 return (0);
2884         }
2885         return (EINVAL);
2886 }
2887
2888 /*
2889  * Spin locks are separate from ordinary mutexes,
2890  * but we use the same data structure for them.
2891  */
2892
2893 int
2894 pthread_spin_init(pthread_spinlock_t *lock, int pshared)
2895 {
2896         mutex_t *mp = (mutex_t *)lock;
2897
2898         (void) memset(mp, 0, sizeof (*mp));
2899         if (pshared == PTHREAD_PROCESS_SHARED)
2900                 mp->mutex_type = USYNC_PROCESS;
2901         else
2902                 mp->mutex_type = USYNC_THREAD;
2903         mp->mutex_flag = LOCK_INITED;
2904         mp->mutex_magic = MUTEX_MAGIC;
2905
2906         /*
2907          * This should be at the beginning of the function,
2908          * but for the sake of old broken applications that
2909          * do not have proper alignment for their mutexes
2910          * (and don't check the return code from pthread_spin_init),
2911          * we put it here, after initializing the mutex regardless.
2912          */
2913         if (((uintptr_t)mp & (_LONG_LONG_ALIGNMENT - 1)) &&
2914             curthread->ul_misaligned == 0)
2915                 return (EINVAL);
2916
2917         return (0);
2918 }
2919
2920 int
2921 pthread_spin_destroy(pthread_spinlock_t *lock)
2922 {
2923         (void) memset(lock, 0, sizeof (*lock));
2924         return (0);
2925 }
2926
2927 int
2928 pthread_spin_trylock(pthread_spinlock_t *lock)
2929 {
2930         mutex_t *mp = (mutex_t *)lock;
2931         ulwp_t *self = curthread;
2932         int error = 0;
2933
2934         no_preempt(self);
2935         if (set_lock_byte(&mp->mutex_lockw) != 0)
2936                 error = EBUSY;
2937         else {
2938                 mp->mutex_owner = (uintptr_t)self;
2939                 if (mp->mutex_type == USYNC_PROCESS)
2940                         mp->mutex_ownerpid = self->ul_uberdata->pid;
2941                 DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
2942         }
2943         preempt(self);
2944         return (error);
2945 }
2946
2947 int
2948 pthread_spin_lock(pthread_spinlock_t *lock)
2949 {
2950         mutex_t *mp = (mutex_t *)lock;
2951         ulwp_t *self = curthread;
2952         volatile uint8_t *lockp = (volatile uint8_t *)&mp->mutex_lockw;
2953         int count = 0;
2954
2955         ASSERT(!self->ul_critical || self->ul_bindflags);
2956
2957         DTRACE_PROBE1(plockstat, mutex__spin, mp);
2958
2959         /*
2960          * We don't care whether the owner is running on a processor.
2961          * We just spin because that's what this interface requires.
2962          */
2963         for (;;) {
2964                 if (*lockp == 0) {      /* lock byte appears to be clear */
2965                         no_preempt(self);
2966                         if (set_lock_byte(lockp) == 0)
2967                                 break;
2968                         preempt(self);
2969                 }
2970                 if (count < INT_MAX)
2971                         count++;
2972                 SMT_PAUSE();
2973         }
2974         mp->mutex_owner = (uintptr_t)self;
2975         if (mp->mutex_type == USYNC_PROCESS)
2976                 mp->mutex_ownerpid = self->ul_uberdata->pid;
2977         preempt(self);
2978         if (count) {
2979                 DTRACE_PROBE3(plockstat, mutex__spun, mp, 1, count);
2980         }
2981         DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, count);
2982         return (0);
2983 }
2984
2985 int
2986 pthread_spin_unlock(pthread_spinlock_t *lock)
2987 {
2988         mutex_t *mp = (mutex_t *)lock;
2989         ulwp_t *self = curthread;
2990
2991         no_preempt(self);
2992         mp->mutex_owner = 0;
2993         mp->mutex_ownerpid = 0;
2994         DTRACE_PROBE2(plockstat, mutex__release, mp, 0);
2995         (void) atomic_swap_32(&mp->mutex_lockword, 0);
2996         preempt(self);
2997         return (0);
2998 }
2999
3000 #define INITIAL_LOCKS   8       /* initial size of ul_heldlocks.array */
3001
3002 /*
3003  * Find/allocate an entry for 'lock' in our array of held locks.
3004  */
3005 static mutex_t **
3006 find_lock_entry(mutex_t *lock)
3007 {
3008         ulwp_t *self = curthread;
3009         mutex_t **remembered = NULL;
3010         mutex_t **lockptr;
3011         uint_t nlocks;
3012
3013         if ((nlocks = self->ul_heldlockcnt) != 0)
3014                 lockptr = self->ul_heldlocks.array;
3015         else {
3016                 nlocks = 1;
3017                 lockptr = &self->ul_heldlocks.single;
3018         }
3019
3020         for (; nlocks; nlocks--, lockptr++) {
3021                 if (*lockptr == lock)
3022                         return (lockptr);
3023                 if (*lockptr == NULL && remembered == NULL)
3024                         remembered = lockptr;
3025         }
3026         if (remembered != NULL) {
3027                 *remembered = lock;
3028                 return (remembered);
3029         }
3030
3031         /*
3032          * No entry available.  Allocate more space, converting
3033          * the single entry into an array of entries if necessary.
3034          */
3035         if ((nlocks = self->ul_heldlockcnt) == 0) {
3036                 /*
3037                  * Initial allocation of the array.
3038                  * Convert the single entry into an array.
3039                  */
3040                 self->ul_heldlockcnt = nlocks = INITIAL_LOCKS;
3041                 lockptr = lmalloc(nlocks * sizeof (mutex_t *));
3042                 /*
3043                  * The single entry becomes the first entry in the array.
3044                  */
3045                 *lockptr = self->ul_heldlocks.single;
3046                 self->ul_heldlocks.array = lockptr;
3047                 /*
3048                  * Return the next available entry in the array.
3049                  */
3050                 *++lockptr = lock;
3051                 return (lockptr);
3052         }
3053         /*
3054          * Reallocate the array, double the size each time.
3055          */
3056         lockptr = lmalloc(nlocks * 2 * sizeof (mutex_t *));
3057         (void) memcpy(lockptr, self->ul_heldlocks.array,
3058             nlocks * sizeof (mutex_t *));
3059         lfree(self->ul_heldlocks.array, nlocks * sizeof (mutex_t *));
3060         self->ul_heldlocks.array = lockptr;
3061         self->ul_heldlockcnt *= 2;
3062         /*
3063          * Return the next available entry in the newly allocated array.
3064          */
3065         *(lockptr += nlocks) = lock;
3066         return (lockptr);
3067 }
3068
3069 /*
3070  * Insert 'lock' into our list of held locks.
3071  * Currently only used for LOCK_ROBUST mutexes.
3072  */
3073 void
3074 remember_lock(mutex_t *lock)
3075 {
3076         (void) find_lock_entry(lock);
3077 }
3078
3079 /*
3080  * Remove 'lock' from our list of held locks.
3081  * Currently only used for LOCK_ROBUST mutexes.
3082  */
3083 void
3084 forget_lock(mutex_t *lock)
3085 {
3086         *find_lock_entry(lock) = NULL;
3087 }
3088
3089 /*
3090  * Free the array of held locks.
3091  */
3092 void
3093 heldlock_free(ulwp_t *ulwp)
3094 {
3095         uint_t nlocks;
3096
3097         if ((nlocks = ulwp->ul_heldlockcnt) != 0)
3098                 lfree(ulwp->ul_heldlocks.array, nlocks * sizeof (mutex_t *));
3099         ulwp->ul_heldlockcnt = 0;
3100         ulwp->ul_heldlocks.array = NULL;
3101 }
3102
3103 /*
3104  * Mark all held LOCK_ROBUST mutexes LOCK_OWNERDEAD.
3105  * Called from _thrp_exit() to deal with abandoned locks.
3106  */
3107 void
3108 heldlock_exit(void)
3109 {
3110         ulwp_t *self = curthread;
3111         mutex_t **lockptr;
3112         uint_t nlocks;
3113         mutex_t *mp;
3114
3115         if ((nlocks = self->ul_heldlockcnt) != 0)
3116                 lockptr = self->ul_heldlocks.array;
3117         else {
3118                 nlocks = 1;
3119                 lockptr = &self->ul_heldlocks.single;
3120         }
3121
3122         for (; nlocks; nlocks--, lockptr++) {
3123                 /*
3124                  * The kernel takes care of transitioning held
3125                  * LOCK_PRIO_INHERIT mutexes to LOCK_OWNERDEAD.
3126                  * We avoid that case here.
3127                  */
3128                 if ((mp = *lockptr) != NULL &&
3129                     mutex_held(mp) &&
3130                     (mp->mutex_type & (LOCK_ROBUST | LOCK_PRIO_INHERIT)) ==
3131                     LOCK_ROBUST) {
3132                         mp->mutex_rcount = 0;
3133                         if (!(mp->mutex_flag & LOCK_UNMAPPED))
3134                                 mp->mutex_flag |= LOCK_OWNERDEAD;
3135                         (void) mutex_unlock_internal(mp, 1);
3136                 }
3137         }
3138
3139         heldlock_free(self);
3140 }
3141
3142 #pragma weak _cond_init = cond_init
3143 /* ARGSUSED2 */
3144 int
3145 cond_init(cond_t *cvp, int type, void *arg)
3146 {
3147         if (type != USYNC_THREAD && type != USYNC_PROCESS)
3148                 return (EINVAL);
3149         (void) memset(cvp, 0, sizeof (*cvp));
3150         cvp->cond_type = (uint16_t)type;
3151         cvp->cond_magic = COND_MAGIC;
3152
3153         /*
3154          * This should be at the beginning of the function,
3155          * but for the sake of old broken applications that
3156          * do not have proper alignment for their condvars
3157          * (and don't check the return code from cond_init),
3158          * we put it here, after initializing the condvar regardless.
3159          */
3160         if (((uintptr_t)cvp & (_LONG_LONG_ALIGNMENT - 1)) &&
3161             curthread->ul_misaligned == 0)
3162                 return (EINVAL);
3163
3164         return (0);
3165 }
3166
3167 /*
3168  * cond_sleep_queue(): utility function for cond_wait_queue().
3169  *
3170  * Go to sleep on a condvar sleep queue, expect to be waked up
3171  * by someone calling cond_signal() or cond_broadcast() or due
3172  * to receiving a UNIX signal or being cancelled, or just simply
3173  * due to a spurious wakeup (like someome calling forkall()).
3174  *
3175  * The associated mutex is *not* reacquired before returning.
3176  * That must be done by the caller of cond_sleep_queue().
3177  */
3178 static int
3179 cond_sleep_queue(cond_t *cvp, mutex_t *mp, timespec_t *tsp)
3180 {
3181         ulwp_t *self = curthread;
3182         queue_head_t *qp;
3183         queue_head_t *mqp;
3184         lwpid_t lwpid;
3185         int signalled;
3186         int error;
3187         int cv_wake;
3188         int release_all;
3189
3190         /*
3191          * Put ourself on the CV sleep queue, unlock the mutex, then
3192          * park ourself and unpark a candidate lwp to grab the mutex.
3193          * We must go onto the CV sleep queue before dropping the
3194          * mutex in order to guarantee atomicity of the operation.
3195          */
3196         self->ul_sp = stkptr();
3197         qp = queue_lock(cvp, CV);
3198         enqueue(qp, self, 0);
3199         cvp->cond_waiters_user = 1;
3200         self->ul_cvmutex = mp;
3201         self->ul_cv_wake = cv_wake = (tsp != NULL);
3202         self->ul_signalled = 0;
3203         if (mp->mutex_flag & LOCK_OWNERDEAD) {
3204                 mp->mutex_flag &= ~LOCK_OWNERDEAD;
3205                 mp->mutex_flag |= LOCK_NOTRECOVERABLE;
3206         }
3207         release_all = ((mp->mutex_flag & LOCK_NOTRECOVERABLE) != 0);
3208         lwpid = mutex_unlock_queue(mp, release_all);
3209         for (;;) {
3210                 set_parking_flag(self, 1);
3211                 queue_unlock(qp);
3212                 if (lwpid != 0) {
3213                         lwpid = preempt_unpark(self, lwpid);
3214                         preempt(self);
3215                 }
3216                 /*
3217                  * We may have a deferred signal present,
3218                  * in which case we should return EINTR.
3219                  * Also, we may have received a SIGCANCEL; if so
3220                  * and we are cancelable we should return EINTR.
3221                  * We force an immediate EINTR return from
3222                  * __lwp_park() by turning our parking flag off.
3223                  */
3224                 if (self->ul_cursig != 0 ||
3225                     (self->ul_cancelable && self->ul_cancel_pending))
3226                         set_parking_flag(self, 0);
3227                 /*
3228                  * __lwp_park() will return the residual time in tsp
3229                  * if we are unparked before the timeout expires.
3230                  */
3231                 error = __lwp_park(tsp, lwpid);
3232                 set_parking_flag(self, 0);
3233                 lwpid = 0;      /* unpark the other lwp only once */
3234                 /*
3235                  * We were waked up by cond_signal(), cond_broadcast(),
3236                  * by an interrupt or timeout (EINTR or ETIME),
3237                  * or we may just have gotten a spurious wakeup.
3238                  */
3239                 qp = queue_lock(cvp, CV);
3240                 if (!cv_wake)
3241                         mqp = queue_lock(mp, MX);
3242                 if (self->ul_sleepq == NULL)
3243                         break;
3244                 /*
3245                  * We are on either the condvar sleep queue or the
3246                  * mutex sleep queue.  Break out of the sleep if we
3247                  * were interrupted or we timed out (EINTR or ETIME).
3248                  * Else this is a spurious wakeup; continue the loop.
3249                  */
3250                 if (!cv_wake && self->ul_sleepq == mqp) { /* mutex queue */
3251                         if (error) {
3252                                 mp->mutex_waiters = dequeue_self(mqp);
3253                                 break;
3254                         }
3255                         tsp = NULL;     /* no more timeout */
3256                 } else if (self->ul_sleepq == qp) {     /* condvar queue */
3257                         if (error) {
3258                                 cvp->cond_waiters_user = dequeue_self(qp);
3259                                 break;
3260                         }
3261                         /*
3262                          * Else a spurious wakeup on the condvar queue.
3263                          * __lwp_park() has already adjusted the timeout.
3264                          */
3265                 } else {
3266                         thr_panic("cond_sleep_queue(): thread not on queue");
3267                 }
3268                 if (!cv_wake)
3269                         queue_unlock(mqp);
3270         }
3271
3272         self->ul_sp = 0;
3273         self->ul_cv_wake = 0;
3274         ASSERT(self->ul_cvmutex == NULL);
3275         ASSERT(self->ul_sleepq == NULL && self->ul_link == NULL &&
3276             self->ul_wchan == NULL);
3277
3278         signalled = self->ul_signalled;
3279         self->ul_signalled = 0;
3280         queue_unlock(qp);
3281         if (!cv_wake)
3282                 queue_unlock(mqp);
3283
3284         /*
3285          * If we were concurrently cond_signal()d and any of:
3286          * received a UNIX signal, were cancelled, or got a timeout,
3287          * then perform another cond_signal() to avoid consuming it.
3288          */
3289         if (error && signalled)
3290                 (void) cond_signal(cvp);
3291
3292         return (error);
3293 }
3294
3295 static void
3296 cond_wait_check_alignment(cond_t *cvp, mutex_t *mp)
3297 {
3298         if ((uintptr_t)mp & (_LONG_LONG_ALIGNMENT - 1))
3299                 lock_error(mp, "cond_wait", cvp, "mutex is misaligned");
3300         if ((uintptr_t)cvp & (_LONG_LONG_ALIGNMENT - 1))
3301                 lock_error(mp, "cond_wait", cvp, "condvar is misaligned");
3302 }
3303
3304 int
3305 cond_wait_queue(cond_t *cvp, mutex_t *mp, timespec_t *tsp)
3306 {
3307         ulwp_t *self = curthread;
3308         int error;
3309         int merror;
3310
3311         if (self->ul_error_detection && self->ul_misaligned == 0)
3312                 cond_wait_check_alignment(cvp, mp);
3313
3314         /*
3315          * The old thread library was programmed to defer signals
3316          * while in cond_wait() so that the associated mutex would
3317          * be guaranteed to be held when the application signal
3318          * handler was invoked.
3319          *
3320          * We do not behave this way by default; the state of the
3321          * associated mutex in the signal handler is undefined.
3322          *
3323          * To accommodate applications that depend on the old
3324          * behavior, the _THREAD_COND_WAIT_DEFER environment
3325          * variable can be set to 1 and we will behave in the
3326          * old way with respect to cond_wait().
3327          */
3328         if (self->ul_cond_wait_defer)
3329                 sigoff(self);
3330
3331         error = cond_sleep_queue(cvp, mp, tsp);
3332
3333         /*
3334          * Reacquire the mutex.
3335          */
3336         if ((merror = mutex_lock_impl(mp, NULL)) != 0)
3337                 error = merror;
3338
3339         /*
3340          * Take any deferred signal now, after we have reacquired the mutex.
3341          */
3342         if (self->ul_cond_wait_defer)
3343                 sigon(self);
3344
3345         return (error);
3346 }
3347
3348 /*
3349  * cond_sleep_kernel(): utility function for cond_wait_kernel().
3350  * See the comment ahead of cond_sleep_queue(), above.
3351  */
3352 static int
3353 cond_sleep_kernel(cond_t *cvp, mutex_t *mp, timespec_t *tsp)
3354 {
3355         int mtype = mp->mutex_type;
3356         ulwp_t *self = curthread;
3357         int error;
3358
3359         if ((mtype & LOCK_PRIO_PROTECT) && _ceil_mylist_del(mp))
3360                 _ceil_prio_waive();
3361
3362         self->ul_sp = stkptr();
3363         self->ul_wchan = cvp;
3364         sigoff(self);
3365         mp->mutex_owner = 0;
3366         /* mp->mutex_ownerpid is cleared by ___lwp_cond_wait() */
3367         if (mtype & LOCK_PRIO_INHERIT) {
3368                 mp->mutex_lockw = LOCKCLEAR;
3369                 self->ul_pilocks--;
3370         }
3371         /*
3372          * ___lwp_cond_wait() returns immediately with EINTR if
3373          * set_parking_flag(self,0) is called on this lwp before it
3374          * goes to sleep in the kernel.  sigacthandler() calls this
3375          * when a deferred signal is noted.  This assures that we don't
3376          * get stuck in ___lwp_cond_wait() with all signals blocked
3377          * due to taking a deferred signal before going to sleep.
3378          */
3379         set_parking_flag(self, 1);
3380         if (self->ul_cursig != 0 ||
3381             (self->ul_cancelable && self->ul_cancel_pending))
3382                 set_parking_flag(self, 0);
3383         error = ___lwp_cond_wait(cvp, mp, tsp, 1);
3384         set_parking_flag(self, 0);
3385         sigon(self);
3386         self->ul_sp = 0;
3387         self->ul_wchan = NULL;
3388         return (error);
3389 }
3390
3391 int
3392 cond_wait_kernel(cond_t *cvp, mutex_t *mp, timespec_t *tsp)
3393 {
3394         ulwp_t *self = curthread;
3395         int error;
3396         int merror;
3397
3398         if (self->ul_error_detection && self->ul_misaligned == 0)
3399                 cond_wait_check_alignment(cvp, mp);
3400
3401         /*
3402          * See the large comment in cond_wait_queue(), above.
3403          */
3404         if (self->ul_cond_wait_defer)
3405                 sigoff(self);
3406
3407         error = cond_sleep_kernel(cvp, mp, tsp);
3408
3409         /*
3410          * Override the return code from ___lwp_cond_wait()
3411          * with any non-zero return code from mutex_lock().
3412          * This addresses robust lock failures in particular;
3413          * the caller must see the EOWNERDEAD or ENOTRECOVERABLE
3414          * errors in order to take corrective action.
3415          */
3416         if ((merror = mutex_lock_impl(mp, NULL)) != 0)
3417                 error = merror;
3418
3419         /*
3420          * Take any deferred signal now, after we have reacquired the mutex.
3421          */
3422         if (self->ul_cond_wait_defer)
3423                 sigon(self);
3424
3425         return (error);
3426 }
3427
3428 /*
3429  * Common code for cond_wait() and cond_timedwait()
3430  */
3431 int
3432 cond_wait_common(cond_t *cvp, mutex_t *mp, timespec_t *tsp)
3433 {
3434         int mtype = mp->mutex_type;
3435         hrtime_t begin_sleep = 0;
3436         ulwp_t *self = curthread;
3437         uberdata_t *udp = self->ul_uberdata;
3438         tdb_cond_stats_t *csp = COND_STATS(cvp, udp);
3439         tdb_mutex_stats_t *msp = MUTEX_STATS(mp, udp);
3440         uint8_t rcount;
3441         int error = 0;
3442
3443         /*
3444          * The SUSV3 Posix spec for pthread_cond_timedwait() states:
3445          *      Except in the case of [ETIMEDOUT], all these error checks
3446          *      shall act as if they were performed immediately at the
3447          *      beginning of processing for the function and shall cause
3448          *      an error return, in effect, prior to modifying the state
3449          *      of the mutex specified by mutex or the condition variable
3450          *      specified by cond.
3451          * Therefore, we must return EINVAL now if the timout is invalid.
3452          */
3453         if (tsp != NULL &&
3454             (tsp->tv_sec < 0 || (ulong_t)tsp->tv_nsec >= NANOSEC))
3455                 return (EINVAL);
3456
3457         if (__td_event_report(self, TD_SLEEP, udp)) {
3458                 self->ul_sp = stkptr();
3459                 self->ul_wchan = cvp;
3460                 self->ul_td_evbuf.eventnum = TD_SLEEP;
3461                 self->ul_td_evbuf.eventdata = cvp;
3462                 tdb_event(TD_SLEEP, udp);
3463                 self->ul_sp = 0;
3464         }
3465         if (csp) {
3466                 if (tsp)
3467                         tdb_incr(csp->cond_timedwait);
3468                 else
3469                         tdb_incr(csp->cond_wait);
3470         }
3471         if (msp)
3472                 begin_sleep = record_hold_time(msp);
3473         else if (csp)
3474                 begin_sleep = gethrtime();
3475
3476         if (self->ul_error_detection) {
3477                 if (!mutex_held(mp))
3478                         lock_error(mp, "cond_wait", cvp, NULL);
3479                 if ((mtype & LOCK_RECURSIVE) && mp->mutex_rcount != 0)
3480                         lock_error(mp, "recursive mutex in cond_wait",
3481                             cvp, NULL);
3482                 if (cvp->cond_type & USYNC_PROCESS) {
3483                         if (!(mtype & USYNC_PROCESS))
3484                                 lock_error(mp, "cond_wait", cvp,
3485                                     "condvar process-shared, "
3486                                     "mutex process-private");
3487                 } else {
3488                         if (mtype & USYNC_PROCESS)
3489                                 lock_error(mp, "cond_wait", cvp,
3490                                     "condvar process-private, "
3491                                     "mutex process-shared");
3492                 }
3493         }
3494
3495         /*
3496          * We deal with recursive mutexes by completely
3497          * dropping the lock and restoring the recursion
3498          * count after waking up.  This is arguably wrong,
3499          * but it obeys the principle of least astonishment.
3500          */
3501         rcount = mp->mutex_rcount;
3502         mp->mutex_rcount = 0;
3503         if ((mtype &
3504             (USYNC_PROCESS | LOCK_PRIO_INHERIT | LOCK_PRIO_PROTECT)) |
3505             (cvp->cond_type & USYNC_PROCESS))
3506                 error = cond_wait_kernel(cvp, mp, tsp);
3507         else
3508                 error = cond_wait_queue(cvp, mp, tsp);
3509         mp->mutex_rcount = rcount;
3510
3511         if (csp) {
3512                 hrtime_t lapse = gethrtime() - begin_sleep;
3513                 if (tsp == NULL)
3514                         csp->cond_wait_sleep_time += lapse;
3515                 else {
3516                         csp->cond_timedwait_sleep_time += lapse;
3517                         if (error == ETIME)
3518                                 tdb_incr(csp->cond_timedwait_timeout);
3519                 }
3520         }
3521         return (error);
3522 }
3523
3524 /*
3525  * cond_wait() is a cancellation point but __cond_wait() is not.
3526  * Internally, libc calls the non-cancellation version.
3527  * Other libraries need to use pthread_setcancelstate(), as appropriate,
3528  * since __cond_wait() is not exported from libc.
3529  */
3530 int
3531 __cond_wait(cond_t *cvp, mutex_t *mp)
3532 {
3533         ulwp_t *self = curthread;
3534         uberdata_t *udp = self->ul_uberdata;
3535         uberflags_t *gflags;
3536
3537         if ((mp->mutex_type & (LOCK_ERRORCHECK | LOCK_ROBUST)) &&
3538             !mutex_held(mp))
3539                 return (EPERM);
3540
3541         /*
3542          * Optimize the common case of USYNC_THREAD plus
3543          * no error detection, no lock statistics, and no event tracing.
3544          */
3545         if ((gflags = self->ul_schedctl_called) != NULL &&
3546             (cvp->cond_type | mp->mutex_type | gflags->uf_trs_ted |
3547             self->ul_td_events_enable |
3548             udp->tdb.tdb_ev_global_mask.event_bits[0]) == 0)
3549                 return (cond_wait_queue(cvp, mp, NULL));
3550
3551         /*
3552          * Else do it the long way.
3553          */
3554         return (cond_wait_common(cvp, mp, NULL));
3555 }
3556
3557 #pragma weak _cond_wait = cond_wait
3558 int
3559 cond_wait(cond_t *cvp, mutex_t *mp)
3560 {
3561         int error;
3562
3563         _cancelon();
3564         error = __cond_wait(cvp, mp);
3565         if (error == EINTR)
3566                 _canceloff();
3567         else
3568                 _canceloff_nocancel();
3569         return (error);
3570 }
3571
3572 /*
3573  * pthread_cond_wait() is a cancellation point.
3574  */
3575 int
3576 pthread_cond_wait(pthread_cond_t *_RESTRICT_KYWD cvp,
3577         pthread_mutex_t *_RESTRICT_KYWD mp)
3578 {
3579         int error;
3580
3581         error = cond_wait((cond_t *)cvp, (mutex_t *)mp);
3582         return ((error == EINTR)? 0 : error);
3583 }
3584
3585 /*
3586  * cond_timedwait() is a cancellation point but __cond_timedwait() is not.
3587  */
3588 int
3589 __cond_timedwait(cond_t *cvp, mutex_t *mp, const timespec_t *abstime)
3590 {
3591         clockid_t clock_id = cvp->cond_clockid;
3592         timespec_t reltime;
3593         int error;
3594
3595         if ((mp->mutex_type & (LOCK_ERRORCHECK | LOCK_ROBUST)) &&
3596             !mutex_held(mp))
3597                 return (EPERM);
3598
3599         if (clock_id != CLOCK_REALTIME && clock_id != CLOCK_HIGHRES)
3600                 clock_id = CLOCK_REALTIME;
3601         abstime_to_reltime(clock_id, abstime, &reltime);
3602         error = cond_wait_common(cvp, mp, &reltime);
3603         if (error == ETIME && clock_id == CLOCK_HIGHRES) {
3604                 /*
3605                  * Don't return ETIME if we didn't really get a timeout.
3606                  * This can happen if we return because someone resets
3607                  * the system clock.  Just return zero in this case,
3608                  * giving a spurious wakeup but not a timeout.
3609                  */
3610                 if ((hrtime_t)(uint32_t)abstime->tv_sec * NANOSEC +
3611                     abstime->tv_nsec > gethrtime())
3612                         error = 0;
3613         }
3614         return (error);
3615 }
3616
3617 int
3618 cond_timedwait(cond_t *cvp, mutex_t *mp, const timespec_t *abstime)
3619 {
3620         int error;
3621
3622         _cancelon();
3623         error = __cond_timedwait(cvp, mp, abstime);
3624         if (error == EINTR)
3625                 _canceloff();
3626         else
3627                 _canceloff_nocancel();
3628         return (error);
3629 }
3630
3631 /*
3632  * pthread_cond_timedwait() is a cancellation point.
3633  */
3634 int
3635 pthread_cond_timedwait(pthread_cond_t *_RESTRICT_KYWD cvp,
3636         pthread_mutex_t *_RESTRICT_KYWD mp,
3637         const struct timespec *_RESTRICT_KYWD abstime)
3638 {
3639         int error;
3640
3641         error = cond_timedwait((cond_t *)cvp, (mutex_t *)mp, abstime);
3642         if (error == ETIME)
3643                 error = ETIMEDOUT;
3644         else if (error == EINTR)
3645                 error = 0;
3646         return (error);
3647 }
3648
3649 /*
3650  * cond_reltimedwait() is a cancellation point but __cond_reltimedwait() is not.
3651  */
3652 int
3653 __cond_reltimedwait(cond_t *cvp, mutex_t *mp, const timespec_t *reltime)
3654 {
3655         timespec_t tslocal = *reltime;
3656
3657         if ((mp->mutex_type & (LOCK_ERRORCHECK | LOCK_ROBUST)) &&
3658             !mutex_held(mp))
3659                 return (EPERM);
3660
3661         return (cond_wait_common(cvp, mp, &tslocal));
3662 }
3663
3664 int
3665 cond_reltimedwait(cond_t *cvp, mutex_t *mp, const timespec_t *reltime)
3666 {
3667         int error;
3668
3669         _cancelon();
3670         error = __cond_reltimedwait(cvp, mp, reltime);
3671         if (error == EINTR)
3672                 _canceloff();
3673         else
3674                 _canceloff_nocancel();
3675         return (error);
3676 }
3677
3678 int
3679 pthread_cond_reltimedwait_np(pthread_cond_t *_RESTRICT_KYWD cvp,
3680         pthread_mutex_t *_RESTRICT_KYWD mp,
3681         const struct timespec *_RESTRICT_KYWD reltime)
3682 {
3683         int error;
3684
3685         error = cond_reltimedwait((cond_t *)cvp, (mutex_t *)mp, reltime);
3686         if (error == ETIME)
3687                 error = ETIMEDOUT;
3688         else if (error == EINTR)
3689                 error = 0;
3690         return (error);
3691 }
3692
3693 #pragma weak pthread_cond_signal = cond_signal
3694 #pragma weak _cond_signal = cond_signal
3695 int
3696 cond_signal(cond_t *cvp)
3697 {
3698         ulwp_t *self = curthread;
3699         uberdata_t *udp = self->ul_uberdata;
3700         tdb_cond_stats_t *csp = COND_STATS(cvp, udp);
3701         int error = 0;
3702         int more;
3703         lwpid_t lwpid;
3704         queue_head_t *qp;
3705         mutex_t *mp;
3706         queue_head_t *mqp;
3707         ulwp_t **ulwpp;
3708         ulwp_t *ulwp;
3709         ulwp_t *prev;
3710
3711         if (csp)
3712                 tdb_incr(csp->cond_signal);
3713
3714         if (cvp->cond_waiters_kernel)   /* someone sleeping in the kernel? */
3715                 error = _lwp_cond_signal(cvp);
3716
3717         if (!cvp->cond_waiters_user)    /* no one sleeping at user-level */
3718                 return (error);
3719
3720         /*
3721          * Move some thread from the condvar sleep queue to the mutex sleep
3722          * queue for the mutex that it will acquire on being waked up.
3723          * We can do this only if we own the mutex it will acquire.
3724          * If we do not own the mutex, or if its ul_cv_wake flag
3725          * is set, just dequeue and unpark it.
3726          */
3727         qp = queue_lock(cvp, CV);
3728         ulwpp = queue_slot(qp, &prev, &more);
3729         cvp->cond_waiters_user = more;
3730         if (ulwpp == NULL) {    /* no one on the sleep queue */
3731                 queue_unlock(qp);
3732                 return (error);
3733         }
3734         ulwp = *ulwpp;
3735
3736         /*
3737          * Inform the thread that it was the recipient of a cond_signal().
3738          * This lets it deal with cond_signal() and, concurrently,
3739          * one or more of a cancellation, a UNIX signal, or a timeout.
3740          * These latter conditions must not consume a cond_signal().
3741          */
3742         ulwp->ul_signalled = 1;
3743
3744         /*
3745          * Dequeue the waiter but leave its ul_sleepq non-NULL
3746          * while we move it to the mutex queue so that it can
3747          * deal properly with spurious wakeups.
3748          */
3749         queue_unlink(qp, ulwpp, prev);
3750
3751         mp = ulwp->ul_cvmutex;          /* the mutex it will acquire */
3752         ulwp->ul_cvmutex = NULL;
3753         ASSERT(mp != NULL);
3754
3755         if (ulwp->ul_cv_wake || !MUTEX_OWNED(mp, self)) {
3756                 /* just wake it up */
3757                 lwpid = ulwp->ul_lwpid;
3758                 no_preempt(self);
3759                 ulwp->ul_sleepq = NULL;
3760                 ulwp->ul_wchan = NULL;
3761                 queue_unlock(qp);
3762                 (void) __lwp_unpark(lwpid);
3763                 preempt(self);
3764         } else {
3765                 /* move it to the mutex queue */
3766                 mqp = queue_lock(mp, MX);
3767                 enqueue(mqp, ulwp, 0);
3768                 mp->mutex_waiters = 1;
3769                 queue_unlock(mqp);
3770                 queue_unlock(qp);
3771         }
3772
3773         return (error);
3774 }
3775
3776 /*
3777  * Utility function called by mutex_wakeup_all(), cond_broadcast(),
3778  * and rw_queue_release() to (re)allocate a big buffer to hold the
3779  * lwpids of all the threads to be set running after they are removed
3780  * from their sleep queues.  Since we are holding a queue lock, we
3781  * cannot call any function that might acquire a lock.  mmap(), munmap(),
3782  * lwp_unpark_all() are simple system calls and are safe in this regard.
3783  */
3784 lwpid_t *
3785 alloc_lwpids(lwpid_t *lwpid, int *nlwpid_ptr, int *maxlwps_ptr)
3786 {
3787         /*
3788          * Allocate NEWLWPS ids on the first overflow.
3789          * Double the allocation each time after that.
3790          */
3791         int nlwpid = *nlwpid_ptr;
3792         int maxlwps = *maxlwps_ptr;
3793         int first_allocation;
3794         int newlwps;
3795         void *vaddr;
3796
3797         ASSERT(nlwpid == maxlwps);
3798
3799         first_allocation = (maxlwps == MAXLWPS);
3800         newlwps = first_allocation? NEWLWPS : 2 * maxlwps;
3801         vaddr = mmap(NULL, newlwps * sizeof (lwpid_t),
3802             PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, (off_t)0);
3803
3804         if (vaddr == MAP_FAILED) {
3805                 /*
3806                  * Let's hope this never happens.
3807                  * If it does, then we have a terrible
3808                  * thundering herd on our hands.
3809                  */
3810                 (void) __lwp_unpark_all(lwpid, nlwpid);
3811                 *nlwpid_ptr = 0;
3812         } else {
3813                 (void) memcpy(vaddr, lwpid, maxlwps * sizeof (lwpid_t));
3814                 if (!first_allocation)
3815                         (void) munmap((caddr_t)lwpid,
3816                             maxlwps * sizeof (lwpid_t));
3817                 lwpid = vaddr;
3818                 *maxlwps_ptr = newlwps;
3819         }
3820
3821         return (lwpid);
3822 }
3823
3824 #pragma weak pthread_cond_broadcast = cond_broadcast
3825 #pragma weak _cond_broadcast = cond_broadcast
3826 int
3827 cond_broadcast(cond_t *cvp)
3828 {
3829         ulwp_t *self = curthread;
3830         uberdata_t *udp = self->ul_uberdata;
3831         tdb_cond_stats_t *csp = COND_STATS(cvp, udp);
3832         int error = 0;
3833         queue_head_t *qp;
3834         queue_root_t *qrp;
3835         mutex_t *mp;
3836         mutex_t *mp_cache = NULL;
3837         queue_head_t *mqp = NULL;
3838         ulwp_t *ulwp;
3839         int nlwpid = 0;
3840         int maxlwps = MAXLWPS;
3841         lwpid_t buffer[MAXLWPS];
3842         lwpid_t *lwpid = buffer;
3843
3844         if (csp)
3845                 tdb_incr(csp->cond_broadcast);
3846
3847         if (cvp->cond_waiters_kernel)   /* someone sleeping in the kernel? */
3848                 error = _lwp_cond_broadcast(cvp);
3849
3850         if (!cvp->cond_waiters_user)    /* no one sleeping at user-level */
3851                 return (error);
3852
3853         /*
3854          * Move everyone from the condvar sleep queue to the mutex sleep
3855          * queue for the mutex that they will acquire on being waked up.
3856          * We can do this only if we own the mutex they will acquire.
3857          * If we do not own the mutex, or if their ul_cv_wake flag
3858          * is set, just dequeue and unpark them.
3859          *
3860          * We keep track of lwpids that are to be unparked in lwpid[].
3861          * __lwp_unpark_all() is called to unpark all of them after
3862          * they have been removed from the sleep queue and the sleep
3863          * queue lock has been dropped.  If we run out of space in our
3864          * on-stack buffer, we need to allocate more but we can't call
3865          * lmalloc() because we are holding a queue lock when the overflow
3866          * occurs and lmalloc() acquires a lock.  We can't use alloca()
3867          * either because the application may have allocated a small
3868          * stack and we don't want to overrun the stack.  So we call
3869          * alloc_lwpids() to allocate a bigger buffer using the mmap()
3870          * system call directly since that path acquires no locks.
3871          */
3872         qp = queue_lock(cvp, CV);
3873         cvp->cond_waiters_user = 0;
3874         for (;;) {
3875                 if ((qrp = qp->qh_root) == NULL ||
3876                     (ulwp = qrp->qr_head) == NULL)
3877                         break;
3878                 ASSERT(ulwp->ul_wchan == cvp);
3879                 queue_unlink(qp, &qrp->qr_head, NULL);
3880                 mp = ulwp->ul_cvmutex;          /* its mutex */
3881                 ulwp->ul_cvmutex = NULL;
3882                 ASSERT(mp != NULL);
3883                 if (ulwp->ul_cv_wake || !MUTEX_OWNED(mp, self)) {
3884                         /* just wake it up */
3885                         ulwp->ul_sleepq = NULL;
3886                         ulwp->ul_wchan = NULL;
3887                         if (nlwpid == maxlwps)
3888                                 lwpid = alloc_lwpids(lwpid, &nlwpid, &maxlwps);
3889                         lwpid[nlwpid++] = ulwp->ul_lwpid;
3890                 } else {
3891                         /* move it to the mutex queue */
3892                         if (mp != mp_cache) {
3893                                 mp_cache = mp;
3894                                 if (mqp != NULL)
3895                                         queue_unlock(mqp);
3896                                 mqp = queue_lock(mp, MX);
3897                         }
3898                         enqueue(mqp, ulwp, 0);
3899                         mp->mutex_waiters = 1;
3900                 }
3901         }
3902         if (mqp != NULL)
3903                 queue_unlock(mqp);
3904         if (nlwpid == 0) {
3905                 queue_unlock(qp);
3906         } else {
3907                 no_preempt(self);
3908                 queue_unlock(qp);
3909                 if (nlwpid == 1)
3910                         (void) __lwp_unpark(lwpid[0]);
3911                 else
3912                         (void) __lwp_unpark_all(lwpid, nlwpid);
3913                 preempt(self);
3914         }
3915         if (lwpid != buffer)
3916                 (void) munmap((caddr_t)lwpid, maxlwps * sizeof (lwpid_t));
3917         return (error);
3918 }
3919
3920 #pragma weak pthread_cond_destroy = cond_destroy
3921 int
3922 cond_destroy(cond_t *cvp)
3923 {
3924         cvp->cond_magic = 0;
3925         tdb_sync_obj_deregister(cvp);
3926         return (0);
3927 }
3928
3929 #if defined(THREAD_DEBUG)
3930 void
3931 assert_no_libc_locks_held(void)
3932 {
3933         ASSERT(!curthread->ul_critical || curthread->ul_bindflags);
3934 }
3935
3936 /* protected by link_lock */
3937 uint64_t spin_lock_spin;
3938 uint64_t spin_lock_spin2;
3939 uint64_t spin_lock_sleep;
3940 uint64_t spin_lock_wakeup;
3941
3942 /*
3943  * Record spin lock statistics.
3944  * Called by a thread exiting itself in thrp_exit().
3945  * Also called via atexit() from the thread calling
3946  * exit() to do all the other threads as well.
3947  */
3948 void
3949 record_spin_locks(ulwp_t *ulwp)
3950 {
3951         spin_lock_spin += ulwp->ul_spin_lock_spin;
3952         spin_lock_spin2 += ulwp->ul_spin_lock_spin2;
3953         spin_lock_sleep += ulwp->ul_spin_lock_sleep;
3954         spin_lock_wakeup += ulwp->ul_spin_lock_wakeup;
3955         ulwp->ul_spin_lock_spin = 0;
3956         ulwp->ul_spin_lock_spin2 = 0;
3957         ulwp->ul_spin_lock_sleep = 0;
3958         ulwp->ul_spin_lock_wakeup = 0;
3959 }
3960
3961 /*
3962  * atexit function:  dump the queue statistics to stderr.
3963  */
3964 #include <stdio.h>
3965 void
3966 dump_queue_statistics(void)
3967 {
3968         uberdata_t *udp = curthread->ul_uberdata;
3969         queue_head_t *qp;
3970         int qn;
3971         uint64_t spin_lock_total = 0;
3972
3973         if (udp->queue_head == NULL || thread_queue_dump == 0)
3974                 return;
3975
3976         if (fprintf(stderr, "\n%5d mutex queues:\n", QHASHSIZE) < 0 ||
3977             fprintf(stderr, "queue#   lockcount    max qlen    max hlen\n") < 0)
3978                 return;
3979         for (qn = 0, qp = udp->queue_head; qn < QHASHSIZE; qn++, qp++) {
3980                 if (qp->qh_lockcount == 0)
3981                         continue;
3982                 spin_lock_total += qp->qh_lockcount;
3983                 if (fprintf(stderr, "%5d %12llu%12u%12u\n", qn,
3984                     (u_longlong_t)qp->qh_lockcount,
3985                     qp->qh_qmax, qp->qh_hmax) < 0)
3986                         return;
3987         }
3988
3989         if (fprintf(stderr, "\n%5d condvar queues:\n", QHASHSIZE) < 0 ||
3990             fprintf(stderr, "queue#   lockcount    max qlen    max hlen\n") < 0)
3991                 return;
3992         for (qn = 0; qn < QHASHSIZE; qn++, qp++) {
3993                 if (qp->qh_lockcount == 0)
3994                         continue;
3995                 spin_lock_total += qp->qh_lockcount;
3996                 if (fprintf(stderr, "%5d %12llu%12u%12u\n", qn,
3997                     (u_longlong_t)qp->qh_lockcount,
3998                     qp->qh_qmax, qp->qh_hmax) < 0)
3999                         return;
4000         }
4001
4002         (void) fprintf(stderr, "\n  spin_lock_total  = %10llu\n",
4003             (u_longlong_t)spin_lock_total);
4004         (void) fprintf(stderr, "  spin_lock_spin   = %10llu\n",
4005             (u_longlong_t)spin_lock_spin);
4006         (void) fprintf(stderr, "  spin_lock_spin2  = %10llu\n",
4007             (u_longlong_t)spin_lock_spin2);
4008         (void) fprintf(stderr, "  spin_lock_sleep  = %10llu\n",
4009             (u_longlong_t)spin_lock_sleep);
4010         (void) fprintf(stderr, "  spin_lock_wakeup = %10llu\n",
4011             (u_longlong_t)spin_lock_wakeup);
4012 }
4013 #endif