sys/kern/kern_rwlock.c

   1 /*      $NetBSD: kern_rwlock.c,v 1.32 2009/05/16 08:36:32 yamt Exp $    */
   2
   3 /*-
   4  * Copyright (c) 2002, 2006, 2007, 2008, 2009 The NetBSD Foundation, Inc.
   5  * All rights reserved.
   6  *
   7  * This code is derived from software contributed to The NetBSD Foundation
   8  * by Jason R. Thorpe and Andrew Doran.
   9  *
  10  * Redistribution and use in source and binary forms, with or without
  11  * modification, are permitted provided that the following conditions
  12  * are met:
  13  * 1. Redistributions of source code must retain the above copyright
  14  *    notice, this list of conditions and the following disclaimer.
  15  * 2. Redistributions in binary form must reproduce the above copyright
  16  *    notice, this list of conditions and the following disclaimer in the
  17  *    documentation and/or other materials provided with the distribution.
  18  *
  19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
  20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
  21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
  23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  29  * POSSIBILITY OF SUCH DAMAGE.
  30  */
  31
  32 /*
  33  * Kernel reader/writer lock implementation, modeled after those
  34  * found in Solaris, a description of which can be found in:
  35  *
  36  *      Solaris Internals: Core Kernel Architecture, Jim Mauro and
  37  *          Richard McDougall.
  38  */
  39
  40 #include <sys/cdefs.h>
  41 __KERNEL_RCSID(0, "$NetBSD: kern_rwlock.c,v 1.32 2009/05/16 08:36:32 yamt Exp $");
  42
  43 #define __RWLOCK_PRIVATE
  44
  45 #include <sys/param.h>
  46 #include <sys/proc.h>
  47 #include <sys/rwlock.h>
  48 #include <sys/sched.h>
  49 #include <sys/sleepq.h>
  50 #include <sys/systm.h>
  51 #include <sys/lockdebug.h>
  52 #include <sys/cpu.h>
  53 #include <sys/atomic.h>
  54 #include <sys/lock.h>
  55
  56 #include <dev/lockstat.h>
  57
  58 /*
  59  * LOCKDEBUG
  60  */
  61
  62 #if defined(LOCKDEBUG)
  63
  64 #define RW_WANTLOCK(rw, op, t)                                          \
  65         LOCKDEBUG_WANTLOCK(RW_DEBUG_P(rw), (rw),                        \
  66             (uintptr_t)__builtin_return_address(0), op == RW_READER, t);
  67 #define RW_LOCKED(rw, op)                                               \
  68         LOCKDEBUG_LOCKED(RW_DEBUG_P(rw), (rw), NULL,                    \
  69             (uintptr_t)__builtin_return_address(0), op == RW_READER);
  70 #define RW_UNLOCKED(rw, op)                                             \
  71         LOCKDEBUG_UNLOCKED(RW_DEBUG_P(rw), (rw),                        \
  72             (uintptr_t)__builtin_return_address(0), op == RW_READER);
  73 #define RW_DASSERT(rw, cond)                                            \
  74 do {                                                                    \
  75         if (!(cond))                                                    \
  76                 rw_abort(rw, __func__, "assertion failed: " #cond);     \
  77 } while (/* CONSTCOND */ 0);
  78
  79 #else   /* LOCKDEBUG */
  80
  81 #define RW_WANTLOCK(rw, op, t)  /* nothing */
  82 #define RW_LOCKED(rw, op)       /* nothing */
  83 #define RW_UNLOCKED(rw, op)     /* nothing */
  84 #define RW_DASSERT(rw, cond)    /* nothing */
  85
  86 #endif  /* LOCKDEBUG */
  87
/*
 * DIAGNOSTIC
 *
 * RW_ASSERT() checks a condition only when DIAGNOSTIC is defined.  On
 * failure it calls rw_abort() with the enclosing function's name and the
 * text of the failed condition.  Without DIAGNOSTIC it expands to
 * nothing, so the condition is not evaluated and must not be relied on
 * for side effects.
 */

#if defined(DIAGNOSTIC)

#define RW_ASSERT(rw, cond)                                             \
do {                                                                    \
        if (!(cond))                                                    \
                rw_abort(rw, __func__, "assertion failed: " #cond);     \
} while (/* CONSTCOND */ 0)

#else

#define RW_ASSERT(rw, cond)     /* nothing */

#endif  /* DIAGNOSTIC */
 105
 106 #define RW_SETDEBUG(rw, on)             ((rw)->rw_owner |= (on) ? RW_DEBUG : 0)
 107 #define RW_DEBUG_P(rw)                  (((rw)->rw_owner & RW_DEBUG) != 0)
 108 #if defined(LOCKDEBUG)
 109 #define RW_INHERITDEBUG(new, old)       (new) |= (old) & RW_DEBUG
 110 #else /* defined(LOCKDEBUG) */
 111 #define RW_INHERITDEBUG(new, old)       /* nothing */
 112 #endif /* defined(LOCKDEBUG) */
 113
/*
 * Forward declarations of file-local helpers.  They are referenced by
 * the assertion macros and by the rwlock_lockops / rw_syncobj tables
 * before their definitions appear.
 */
static void     rw_abort(krwlock_t *, const char *, const char *);
static void     rw_dump(volatile void *);
static lwp_t    *rw_owner(wchan_t);
 117
 118 static inline uintptr_t
 119 rw_cas(krwlock_t *rw, uintptr_t o, uintptr_t n)
 120 {
 121
 122         RW_INHERITDEBUG(n, o);
 123         return (uintptr_t)atomic_cas_ptr((volatile void *)&rw->rw_owner,
 124             (void *)o, (void *)n);
 125 }
 126
 127 static inline void
 128 rw_swap(krwlock_t *rw, uintptr_t o, uintptr_t n)
 129 {
 130
 131         RW_INHERITDEBUG(n, o);
 132         n = (uintptr_t)atomic_swap_ptr((volatile void *)&rw->rw_owner,
 133             (void *)n);
 134         RW_DASSERT(rw, n == o);
 135 }
 136
/*
 * For platforms that do not provide stubs, or for the LOCKDEBUG case.
 *
 * LOCKDEBUG forces __HAVE_RW_STUBS off.  Whenever __HAVE_RW_STUBS is not
 * defined, rw_enter(), rw_exit() and rw_tryenter() are made strong
 * aliases of the rw_vector_*() functions defined in this file.
 */
#ifdef LOCKDEBUG
#undef  __HAVE_RW_STUBS
#endif

#ifndef __HAVE_RW_STUBS
__strong_alias(rw_enter,rw_vector_enter);
__strong_alias(rw_exit,rw_vector_exit);
__strong_alias(rw_tryenter,rw_vector_tryenter);
#endif
 149
/*
 * Lock class descriptor for rwlocks.  This file passes it to
 * LOCKDEBUG_ALLOC() in rw_init() and to LOCKDEBUG_ABORT() in rw_abort().
 * NOTE(review): member meanings below are inferred from the values;
 * confirm against the lockops_t definition.
 */
lockops_t rwlock_lockops = {
        "Reader / writer lock",         /* presumably the class name */
        LOCKOPS_SLEEP,                  /* presumably the lock type */
        rw_dump                         /* dump routine for this class */
};
 155
/*
 * Synchronization object descriptor for rwlocks.  rw_vector_enter()
 * passes it to turnstile_block() when a thread goes to sleep on a lock.
 * The final member is rw_owner(), which returns the write holder of a
 * lock (or NULL).  NOTE(review): the roles of the other members are
 * defined by syncobj_t; confirm there.
 */
syncobj_t rw_syncobj = {
        SOBJ_SLEEPQ_SORTED,
        turnstile_unsleep,
        turnstile_changepri,
        sleepq_lendpri,
        rw_owner,
};
 163
 164 /*
 165  * rw_dump:
 166  *
 167  *      Dump the contents of a rwlock structure.
 168  */
 169 static void
 170 rw_dump(volatile void *cookie)
 171 {
 172         volatile krwlock_t *rw = cookie;
 173
 174         printf_nolog("owner/count  : %#018lx flags    : %#018x\n",
 175             (long)RW_OWNER(rw), (int)RW_FLAGS(rw));
 176 }
 177
 178 /*
 179  * rw_abort:
 180  *
 181  *      Dump information about an error and panic the system.  This
 182  *      generates a lot of machine code in the DIAGNOSTIC case, so
 183  *      we ask the compiler to not inline it.
 184  */
 185 static void __noinline
 186 rw_abort(krwlock_t *rw, const char *func, const char *msg)
 187 {
 188
 189         if (panicstr != NULL)
 190                 return;
 191
 192         LOCKDEBUG_ABORT(rw, &rwlock_lockops, func, msg);
 193 }
 194
 195 /*
 196  * rw_init:
 197  *
 198  *      Initialize a rwlock for use.
 199  */
 200 void
 201 rw_init(krwlock_t *rw)
 202 {
 203         bool dodebug;
 204
 205         memset(rw, 0, sizeof(*rw));
 206
 207         dodebug = LOCKDEBUG_ALLOC(rw, &rwlock_lockops,
 208             (uintptr_t)__builtin_return_address(0));
 209         RW_SETDEBUG(rw, dodebug);
 210 }
 211
/*
 * rw_destroy:
 *
 *      Tear down a rwlock.  DIAGNOSTIC kernels assert that nothing but
 *      the RW_DEBUG flag is set in the lock word, i.e. that the lock is
 *      unowned and has no waiter bits.  The lock is then passed to
 *      LOCKDEBUG_FREE() along with its debug flag.
 */
void
rw_destroy(krwlock_t *rw)
{

        RW_ASSERT(rw, (rw->rw_owner & ~RW_DEBUG) == 0);
        LOCKDEBUG_FREE(RW_DEBUG_P(rw), rw);
}
 224
 225 /*
 226  * rw_onproc:
 227  *
 228  *      Return true if an rwlock owner is running on a CPU in the system.
 229  *      If the target is waiting on the kernel big lock, then we must
 230  *      release it.  This is necessary to avoid deadlock.
 231  *
 232  *      Note that we can't use the rwlock owner field as an LWP pointer.  We
 233  *      don't have full control over the timing of our execution, and so the
 234  *      pointer could be completely invalid by the time we dereference it.
 235  */
 236 static int
 237 rw_onproc(uintptr_t owner, struct cpu_info **cip)
 238 {
 239 #ifdef MULTIPROCESSOR
 240         CPU_INFO_ITERATOR cii;
 241         struct cpu_info *ci;
 242         lwp_t *l;
 243
 244         if ((owner & (RW_WRITE_LOCKED|RW_HAS_WAITERS)) != RW_WRITE_LOCKED)
 245                 return 0;
 246         l = (lwp_t *)(owner & RW_THREAD);
 247
 248         /* See if the target is running on a CPU somewhere. */
 249         if ((ci = *cip) != NULL && ci->ci_curlwp == l)
 250                 goto run;
 251         for (CPU_INFO_FOREACH(cii, ci))
 252                 if (ci->ci_curlwp == l)
 253                         goto run;
 254
 255         /* No: it may be safe to block now. */
 256         *cip = NULL;
 257         return 0;
 258
 259  run:
 260         /* Target is running; do we need to block? */
 261         *cip = ci;
 262         return ci->ci_biglock_wanted != l;
 263 #else
 264         return 0;
 265 #endif  /* MULTIPROCESSOR */
 266 }
 267
 268 /*
 269  * rw_vector_enter:
 270  *
 271  *      Acquire a rwlock.
 272  */
 273 void
 274 rw_vector_enter(krwlock_t *rw, const krw_t op)
 275 {
 276         uintptr_t owner, incr, need_wait, set_wait, curthread, next;
 277         struct cpu_info *ci;
 278         turnstile_t *ts;
 279         int queue;
 280         lwp_t *l;
 281         LOCKSTAT_TIMER(slptime);
 282         LOCKSTAT_TIMER(slpcnt);
 283         LOCKSTAT_TIMER(spintime);
 284         LOCKSTAT_COUNTER(spincnt);
 285         LOCKSTAT_FLAG(lsflag);
 286
 287         l = curlwp;
 288         curthread = (uintptr_t)l;
 289
 290         RW_ASSERT(rw, !cpu_intr_p());
 291         RW_ASSERT(rw, curthread != 0);
 292         RW_WANTLOCK(rw, op, false);
 293
 294         if (panicstr == NULL) {
 295                 LOCKDEBUG_BARRIER(&kernel_lock, 1);
 296         }
 297
 298         /*
 299          * We play a slight trick here.  If we're a reader, we want
 300          * increment the read count.  If we're a writer, we want to
 301          * set the owner field and whe WRITE_LOCKED bit.
 302          *
 303          * In the latter case, we expect those bits to be zero,
 304          * therefore we can use an add operation to set them, which
 305          * means an add operation for both cases.
 306          */
 307         if (__predict_true(op == RW_READER)) {
 308                 incr = RW_READ_INCR;
 309                 set_wait = RW_HAS_WAITERS;
 310                 need_wait = RW_WRITE_LOCKED | RW_WRITE_WANTED;
 311                 queue = TS_READER_Q;
 312         } else {
 313                 RW_DASSERT(rw, op == RW_WRITER);
 314                 incr = curthread | RW_WRITE_LOCKED;
 315                 set_wait = RW_HAS_WAITERS | RW_WRITE_WANTED;
 316                 need_wait = RW_WRITE_LOCKED | RW_THREAD;
 317                 queue = TS_WRITER_Q;
 318         }
 319
 320         LOCKSTAT_ENTER(lsflag);
 321
 322         for (ci = NULL, owner = rw->rw_owner;;) {
 323                 /*
 324                  * Read the lock owner field.  If the need-to-wait
 325                  * indicator is clear, then try to acquire the lock.
 326                  */
 327                 if ((owner & need_wait) == 0) {
 328                         next = rw_cas(rw, owner, (owner + incr) &
 329                             ~RW_WRITE_WANTED);
 330                         if (__predict_true(next == owner)) {
 331                                 /* Got it! */
 332                                 membar_enter();
 333                                 break;
 334                         }
 335
 336                         /*
 337                          * Didn't get it -- spin around again (we'll
 338                          * probably sleep on the next iteration).
 339                          */
 340                         owner = next;
 341                         continue;
 342                 }
 343
 344                 if (__predict_false(panicstr != NULL))
 345                         return;
 346                 if (__predict_false(RW_OWNER(rw) == curthread))
 347                         rw_abort(rw, __func__, "locking against myself");
 348
 349                 /*
 350                  * If the lock owner is running on another CPU, and
 351                  * there are no existing waiters, then spin.
 352                  */
 353                 if (rw_onproc(owner, &ci)) {
 354                         LOCKSTAT_START_TIMER(lsflag, spintime);
 355                         u_int count = SPINLOCK_BACKOFF_MIN;
 356                         do {
 357                                 SPINLOCK_BACKOFF(count);
 358                                 owner = rw->rw_owner;
 359                         } while (rw_onproc(owner, &ci));
 360                         LOCKSTAT_STOP_TIMER(lsflag, spintime);
 361                         LOCKSTAT_COUNT(spincnt, 1);
 362                         if ((owner & need_wait) == 0)
 363                                 continue;
 364                 }
 365
 366                 /*
 367                  * Grab the turnstile chain lock.  Once we have that, we
 368                  * can adjust the waiter bits and sleep queue.
 369                  */
 370                 ts = turnstile_lookup(rw);
 371
 372                 /*
 373                  * Mark the rwlock as having waiters.  If the set fails,
 374                  * then we may not need to sleep and should spin again.
 375                  * Reload rw_owner because turnstile_lookup() may have
 376                  * spun on the turnstile chain lock.
 377                  */
 378                 owner = rw->rw_owner;
 379                 if ((owner & need_wait) == 0 || rw_onproc(owner, &ci)) {
 380                         turnstile_exit(rw);
 381                         continue;
 382                 }
 383                 next = rw_cas(rw, owner, owner | set_wait);
 384                 if (__predict_false(next != owner)) {
 385                         turnstile_exit(rw);
 386                         owner = next;
 387                         continue;
 388                 }
 389
 390                 LOCKSTAT_START_TIMER(lsflag, slptime);
 391                 turnstile_block(ts, queue, rw, &rw_syncobj);
 392                 LOCKSTAT_STOP_TIMER(lsflag, slptime);
 393                 LOCKSTAT_COUNT(slpcnt, 1);
 394
 395                 /*
 396                  * No need for a memory barrier because of context switch.
 397                  * If not handed the lock, then spin again.
 398                  */
 399                 if (op == RW_READER || (rw->rw_owner & RW_THREAD) == curthread)
 400                         break;
 401         }
 402
 403         LOCKSTAT_EVENT(lsflag, rw, LB_RWLOCK |
 404             (op == RW_WRITER ? LB_SLEEP1 : LB_SLEEP2), slpcnt, slptime);
 405         LOCKSTAT_EVENT(lsflag, rw, LB_RWLOCK | LB_SPIN, spincnt, spintime);
 406         LOCKSTAT_EXIT(lsflag);
 407
 408         RW_DASSERT(rw, (op != RW_READER && RW_OWNER(rw) == curthread) ||
 409             (op == RW_READER && RW_COUNT(rw) != 0));
 410         RW_LOCKED(rw, op);
 411 }
 412
 413 /*
 414  * rw_vector_exit:
 415  *
 416  *      Release a rwlock.
 417  */
 418 void
 419 rw_vector_exit(krwlock_t *rw)
 420 {
 421         uintptr_t curthread, owner, decr, new, next;
 422         turnstile_t *ts;
 423         int rcnt, wcnt;
 424         lwp_t *l;
 425
 426         curthread = (uintptr_t)curlwp;
 427         RW_ASSERT(rw, curthread != 0);
 428
 429         if (__predict_false(panicstr != NULL))
 430                 return;
 431
 432         /*
 433          * Again, we use a trick.  Since we used an add operation to
 434          * set the required lock bits, we can use a subtract to clear
 435          * them, which makes the read-release and write-release path
 436          * the same.
 437          */
 438         owner = rw->rw_owner;
 439         if (__predict_false((owner & RW_WRITE_LOCKED) != 0)) {
 440                 RW_UNLOCKED(rw, RW_WRITER);
 441                 RW_ASSERT(rw, RW_OWNER(rw) == curthread);
 442                 decr = curthread | RW_WRITE_LOCKED;
 443         } else {
 444                 RW_UNLOCKED(rw, RW_READER);
 445                 RW_ASSERT(rw, RW_COUNT(rw) != 0);
 446                 decr = RW_READ_INCR;
 447         }
 448
 449         /*
 450          * Compute what we expect the new value of the lock to be. Only
 451          * proceed to do direct handoff if there are waiters, and if the
 452          * lock would become unowned.
 453          */
 454         membar_exit();
 455         for (;;) {
 456                 new = (owner - decr);
 457                 if ((new & (RW_THREAD | RW_HAS_WAITERS)) == RW_HAS_WAITERS)
 458                         break;
 459                 next = rw_cas(rw, owner, new);
 460                 if (__predict_true(next == owner))
 461                         return;
 462                 owner = next;
 463         }
 464
 465         /*
 466          * Grab the turnstile chain lock.  This gets the interlock
 467          * on the sleep queue.  Once we have that, we can adjust the
 468          * waiter bits.
 469          */
 470         ts = turnstile_lookup(rw);
 471         owner = rw->rw_owner;
 472         RW_DASSERT(rw, ts != NULL);
 473         RW_DASSERT(rw, (owner & RW_HAS_WAITERS) != 0);
 474
 475         wcnt = TS_WAITERS(ts, TS_WRITER_Q);
 476         rcnt = TS_WAITERS(ts, TS_READER_Q);
 477
 478         /*
 479          * Give the lock away.
 480          *
 481          * If we are releasing a write lock, then prefer to wake all
 482          * outstanding readers.  Otherwise, wake one writer if there
 483          * are outstanding readers, or all writers if there are no
 484          * pending readers.  If waking one specific writer, the writer
 485          * is handed the lock here.  If waking multiple writers, we
 486          * set WRITE_WANTED to block out new readers, and let them
 487          * do the work of acquring the lock in rw_vector_enter().
 488          */
 489         if (rcnt == 0 || decr == RW_READ_INCR) {
 490                 RW_DASSERT(rw, wcnt != 0);
 491                 RW_DASSERT(rw, (owner & RW_WRITE_WANTED) != 0);
 492
 493                 if (rcnt != 0) {
 494                         /* Give the lock to the longest waiting writer. */
 495                         l = TS_FIRST(ts, TS_WRITER_Q);
 496                         new = (uintptr_t)l | RW_WRITE_LOCKED | RW_HAS_WAITERS;
 497                         if (wcnt > 1)
 498                                 new |= RW_WRITE_WANTED;
 499                         rw_swap(rw, owner, new);
 500                         turnstile_wakeup(ts, TS_WRITER_Q, 1, l);
 501                 } else {
 502                         /* Wake all writers and let them fight it out. */
 503                         rw_swap(rw, owner, RW_WRITE_WANTED);
 504                         turnstile_wakeup(ts, TS_WRITER_Q, wcnt, NULL);
 505                 }
 506         } else {
 507                 RW_DASSERT(rw, rcnt != 0);
 508
 509                 /*
 510                  * Give the lock to all blocked readers.  If there
 511                  * is a writer waiting, new readers that arrive
 512                  * after the release will be blocked out.
 513                  */
 514                 new = rcnt << RW_READ_COUNT_SHIFT;
 515                 if (wcnt != 0)
 516                         new |= RW_HAS_WAITERS | RW_WRITE_WANTED;
 517
 518                 /* Wake up all sleeping readers. */
 519                 rw_swap(rw, owner, new);
 520                 turnstile_wakeup(ts, TS_READER_Q, rcnt, NULL);
 521         }
 522 }
 523
 524 /*
 525  * rw_vector_tryenter:
 526  *
 527  *      Try to acquire a rwlock.
 528  */
 529 int
 530 rw_vector_tryenter(krwlock_t *rw, const krw_t op)
 531 {
 532         uintptr_t curthread, owner, incr, need_wait, next;
 533
 534         curthread = (uintptr_t)curlwp;
 535
 536         RW_ASSERT(rw, curthread != 0);
 537
 538         if (op == RW_READER) {
 539                 incr = RW_READ_INCR;
 540                 need_wait = RW_WRITE_LOCKED | RW_WRITE_WANTED;
 541         } else {
 542                 RW_DASSERT(rw, op == RW_WRITER);
 543                 incr = curthread | RW_WRITE_LOCKED;
 544                 need_wait = RW_WRITE_LOCKED | RW_THREAD;
 545         }
 546
 547         for (owner = rw->rw_owner;; owner = next) {
 548                 owner = rw->rw_owner;
 549                 if (__predict_false((owner & need_wait) != 0))
 550                         return 0;
 551                 next = rw_cas(rw, owner, owner + incr);
 552                 if (__predict_true(next == owner)) {
 553                         /* Got it! */
 554                         membar_enter();
 555                         break;
 556                 }
 557         }
 558
 559         RW_WANTLOCK(rw, op, true);
 560         RW_LOCKED(rw, op);
 561         RW_DASSERT(rw, (op != RW_READER && RW_OWNER(rw) == curthread) ||
 562             (op == RW_READER && RW_COUNT(rw) != 0));
 563
 564         return 1;
 565 }
 566
 567 /*
 568  * rw_downgrade:
 569  *
 570  *      Downgrade a write lock to a read lock.
 571  */
 572 void
 573 rw_downgrade(krwlock_t *rw)
 574 {
 575         uintptr_t owner, curthread, new, next;
 576         turnstile_t *ts;
 577         int rcnt, wcnt;
 578
 579         curthread = (uintptr_t)curlwp;
 580         RW_ASSERT(rw, curthread != 0);
 581         RW_DASSERT(rw, (rw->rw_owner & RW_WRITE_LOCKED) != 0);
 582         RW_ASSERT(rw, RW_OWNER(rw) == curthread);
 583         RW_UNLOCKED(rw, RW_WRITER);
 584
 585         membar_producer();
 586         owner = rw->rw_owner;
 587         if ((owner & RW_HAS_WAITERS) == 0) {
 588                 /*
 589                  * There are no waiters, so we can do this the easy way.
 590                  * Try swapping us down to one read hold.  If it fails, the
 591                  * lock condition has changed and we most likely now have
 592                  * waiters.
 593                  */
 594                 next = rw_cas(rw, owner, RW_READ_INCR);
 595                 if (__predict_true(next == owner)) {
 596                         RW_LOCKED(rw, RW_READER);
 597                         RW_DASSERT(rw, (rw->rw_owner & RW_WRITE_LOCKED) == 0);
 598                         RW_DASSERT(rw, RW_COUNT(rw) != 0);
 599                         return;
 600                 }
 601                 owner = next;
 602         }
 603
 604         /*
 605          * Grab the turnstile chain lock.  This gets the interlock
 606          * on the sleep queue.  Once we have that, we can adjust the
 607          * waiter bits.
 608          */
 609         for (;; owner = next) {
 610                 ts = turnstile_lookup(rw);
 611                 RW_DASSERT(rw, ts != NULL);
 612
 613                 rcnt = TS_WAITERS(ts, TS_READER_Q);
 614                 wcnt = TS_WAITERS(ts, TS_WRITER_Q);
 615
 616                 /*
 617                  * If there are no readers, just preserve the waiters
 618                  * bits, swap us down to one read hold and return.
 619                  */
 620                 if (rcnt == 0) {
 621                         RW_DASSERT(rw, wcnt != 0);
 622                         RW_DASSERT(rw, (rw->rw_owner & RW_WRITE_WANTED) != 0);
 623                         RW_DASSERT(rw, (rw->rw_owner & RW_HAS_WAITERS) != 0);
 624
 625                         new = RW_READ_INCR | RW_HAS_WAITERS | RW_WRITE_WANTED;
 626                         next = rw_cas(rw, owner, new);
 627                         turnstile_exit(rw);
 628                         if (__predict_true(next == owner))
 629                                 break;
 630                 } else {
 631                         /*
 632                          * Give the lock to all blocked readers.  We may
 633                          * retain one read hold if downgrading.  If there
 634                          * is a writer waiting, new readers will be blocked
 635                          * out.
 636                          */
 637                         new = (rcnt << RW_READ_COUNT_SHIFT) + RW_READ_INCR;
 638                         if (wcnt != 0)
 639                                 new |= RW_HAS_WAITERS | RW_WRITE_WANTED;
 640
 641                         next = rw_cas(rw, owner, new);
 642                         if (__predict_true(next == owner)) {
 643                                 /* Wake up all sleeping readers. */
 644                                 turnstile_wakeup(ts, TS_READER_Q, rcnt, NULL);
 645                                 break;
 646                         }
 647                         turnstile_exit(rw);
 648                 }
 649         }
 650
 651         RW_WANTLOCK(rw, RW_READER, false);
 652         RW_LOCKED(rw, RW_READER);
 653         RW_DASSERT(rw, (rw->rw_owner & RW_WRITE_LOCKED) == 0);
 654         RW_DASSERT(rw, RW_COUNT(rw) != 0);
 655 }
 656
 657 /*
 658  * rw_tryupgrade:
 659  *
 660  *      Try to upgrade a read lock to a write lock.  We must be the
 661  *      only reader.
 662  */
 663 int
 664 rw_tryupgrade(krwlock_t *rw)
 665 {
 666         uintptr_t owner, curthread, new, next;
 667
 668         curthread = (uintptr_t)curlwp;
 669         RW_ASSERT(rw, curthread != 0);
 670         RW_ASSERT(rw, rw_read_held(rw));
 671
 672         for (owner = rw->rw_owner;; owner = next) {
 673                 RW_ASSERT(rw, (owner & RW_WRITE_LOCKED) == 0);
 674                 if (__predict_false((owner & RW_THREAD) != RW_READ_INCR)) {
 675                         RW_ASSERT(rw, (owner & RW_THREAD) != 0);
 676                         return 0;
 677                 }
 678                 new = curthread | RW_WRITE_LOCKED | (owner & ~RW_THREAD);
 679                 next = rw_cas(rw, owner, new);
 680                 if (__predict_true(next == owner)) {
 681                         membar_producer();
 682                         break;
 683                 }
 684         }
 685
 686         RW_UNLOCKED(rw, RW_READER);
 687         RW_WANTLOCK(rw, RW_WRITER, true);
 688         RW_LOCKED(rw, RW_WRITER);
 689         RW_DASSERT(rw, rw->rw_owner & RW_WRITE_LOCKED);
 690         RW_DASSERT(rw, RW_OWNER(rw) == curthread);
 691
 692         return 1;
 693 }
 694
 695 /*
 696  * rw_read_held:
 697  *
 698  *      Returns true if the rwlock is held for reading.  Must only be
 699  *      used for diagnostic assertions, and never be used to make
 700  *      decisions about how to use a rwlock.
 701  */
 702 int
 703 rw_read_held(krwlock_t *rw)
 704 {
 705         uintptr_t owner;
 706
 707         if (panicstr != NULL)
 708                 return 1;
 709         if (rw == NULL)
 710                 return 0;
 711         owner = rw->rw_owner;
 712         return (owner & RW_WRITE_LOCKED) == 0 && (owner & RW_THREAD) != 0;
 713 }
 714
 715 /*
 716  * rw_write_held:
 717  *
 718  *      Returns true if the rwlock is held for writing.  Must only be
 719  *      used for diagnostic assertions, and never be used to make
 720  *      decisions about how to use a rwlock.
 721  */
 722 int
 723 rw_write_held(krwlock_t *rw)
 724 {
 725
 726         if (panicstr != NULL)
 727                 return 1;
 728         if (rw == NULL)
 729                 return 0;
 730         return (rw->rw_owner & (RW_WRITE_LOCKED | RW_THREAD)) ==
 731             (RW_WRITE_LOCKED | (uintptr_t)curlwp);
 732 }
 733
 734 /*
 735  * rw_lock_held:
 736  *
 737  *      Returns true if the rwlock is held for reading or writing.  Must
 738  *      only be used for diagnostic assertions, and never be used to make
 739  *      decisions about how to use a rwlock.
 740  */
 741 int
 742 rw_lock_held(krwlock_t *rw)
 743 {
 744
 745         if (panicstr != NULL)
 746                 return 1;
 747         if (rw == NULL)
 748                 return 0;
 749         return (rw->rw_owner & RW_THREAD) != 0;
 750 }
 751
 752 /*
 753  * rw_owner:
 754  *
 755  *      Return the current owner of an RW lock, but only if it is write
 756  *      held.  Used for priority inheritance.
 757  */
 758 static lwp_t *
 759 rw_owner(wchan_t obj)
 760 {
 761         krwlock_t *rw = (void *)(uintptr_t)obj; /* discard qualifiers */
 762         uintptr_t owner = rw->rw_owner;
 763
 764         if ((owner & RW_WRITE_LOCKED) == 0)
 765                 return NULL;
 766
 767         return (void *)(owner & RW_THREAD);
 768 }