1 /* $NetBSD: kern_rwlock.c,v 1.32 2009/05/16 08:36:32 yamt Exp $ */
4 * Copyright (c) 2002, 2006, 2007, 2008, 2009 The NetBSD Foundation, Inc.
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Jason R. Thorpe and Andrew Doran.
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
33 * Kernel reader/writer lock implementation, modeled after those
34 * found in Solaris, a description of which can be found in:
36 * Solaris Internals: Core Kernel Architecture, Jim Mauro and
40 #include <sys/cdefs.h>
41 __KERNEL_RCSID(0, "$NetBSD: kern_rwlock.c,v 1.32 2009/05/16 08:36:32 yamt Exp $");
43 #define __RWLOCK_PRIVATE
45 #include <sys/param.h>
47 #include <sys/rwlock.h>
48 #include <sys/sched.h>
49 #include <sys/sleepq.h>
50 #include <sys/systm.h>
51 #include <sys/lockdebug.h>
53 #include <sys/atomic.h>
56 #include <dev/lockstat.h>
/*
 * LOCKDEBUG hooks.
 *
 * In the first set of definitions RW_WANTLOCK, RW_LOCKED and RW_UNLOCKED
 * forward to the matching LOCKDEBUG_* macro, passing RW_DEBUG_P(rw), the
 * lock, the value of __builtin_return_address(0) and the result of
 * op == RW_READER.  RW_DASSERT reports through rw_abort() with __func__
 * and the text of the condition (the test that guards the call is not
 * visible here).  The second set expands all four to nothing.
 *
 * NOTE(review): the first-set expansions already end in a semicolon, so
 * an invocation that is itself followed by a semicolon produces an extra
 * empty statement.
 */
62 #if defined(LOCKDEBUG)
64 #define RW_WANTLOCK(rw, op, t) \
65 LOCKDEBUG_WANTLOCK(RW_DEBUG_P(rw), (rw), \
66 (uintptr_t)__builtin_return_address(0), op == RW_READER, t);
67 #define RW_LOCKED(rw, op) \
68 LOCKDEBUG_LOCKED(RW_DEBUG_P(rw), (rw), NULL, \
69 (uintptr_t)__builtin_return_address(0), op == RW_READER);
70 #define RW_UNLOCKED(rw, op) \
71 LOCKDEBUG_UNLOCKED(RW_DEBUG_P(rw), (rw), \
72 (uintptr_t)__builtin_return_address(0), op == RW_READER);
73 #define RW_DASSERT(rw, cond) \
76 rw_abort(rw, __func__, "assertion failed: " #cond); \
77 } while (/* CONSTCOND */ 0);
81 #define RW_WANTLOCK(rw, op, t) /* nothing */
82 #define RW_LOCKED(rw, op) /* nothing */
83 #define RW_UNLOCKED(rw, op) /* nothing */
84 #define RW_DASSERT(rw, cond) /* nothing */
86 #endif /* LOCKDEBUG */
/*
 * RW_ASSERT: in the DIAGNOSTIC definition it reports through rw_abort(),
 * passing __func__ and the text of the condition (the test that guards
 * the call is not visible here).  The alternative definition expands to
 * nothing.
 */
92 #if defined(DIAGNOSTIC)
94 #define RW_ASSERT(rw, cond) \
97 rw_abort(rw, __func__, "assertion failed: " #cond); \
98 } while (/* CONSTCOND */ 0)
102 #define RW_ASSERT(rw, cond) /* nothing */
104 #endif /* DIAGNOSTIC */
/*
 * Helpers for the RW_DEBUG bit kept in rw_owner.
 *
 * RW_SETDEBUG     - OR RW_DEBUG into rw_owner when on is non-zero
 *                   (ORs in 0, i.e. changes nothing, otherwise).
 * RW_DEBUG_P      - non-zero when RW_DEBUG is set in rw_owner.
 * RW_INHERITDEBUG - under LOCKDEBUG, OR the RW_DEBUG bit of old into
 *                   new; expands to nothing without LOCKDEBUG.
 */
106 #define RW_SETDEBUG(rw, on) ((rw)->rw_owner |= (on) ? RW_DEBUG : 0)
107 #define RW_DEBUG_P(rw) (((rw)->rw_owner & RW_DEBUG) != 0)
108 #if defined(LOCKDEBUG)
109 #define RW_INHERITDEBUG(new, old) (new) |= (old) & RW_DEBUG
110 #else /* defined(LOCKDEBUG) */
111 #define RW_INHERITDEBUG(new, old) /* nothing */
112 #endif /* defined(LOCKDEBUG) */
/*
 * Forward declarations of file-local helpers: rw_abort() (error report),
 * rw_dump() (print the lock word) and rw_owner() (owner lookup taking a
 * wchan_t).
 */
114 static void rw_abort(krwlock_t
*, const char *, const char *);
115 static void rw_dump(volatile void *);
116 static lwp_t
*rw_owner(wchan_t
);
118 static inline uintptr_t
119 rw_cas(krwlock_t
*rw
, uintptr_t o
, uintptr_t n
)
122 RW_INHERITDEBUG(n
, o
);
123 return (uintptr_t)atomic_cas_ptr((volatile void *)&rw
->rw_owner
,
124 (void *)o
, (void *)n
);
/*
 * rw_swap: carry the RW_DEBUG bit of o into n (LOCKDEBUG only), then
 * exchange the lock word through atomic_swap_ptr() and store the value
 * it returns back into n.  RW_DASSERT then checks that this value equals
 * o, i.e. that the caller passed the contents the lock word actually had.
 * NOTE(review): the remaining argument of atomic_swap_ptr() is on a line
 * not visible here.
 */
128 rw_swap(krwlock_t
*rw
, uintptr_t o
, uintptr_t n
)
131 RW_INHERITDEBUG(n
, o
);
132 n
= (uintptr_t)atomic_swap_ptr((volatile void *)&rw
->rw_owner
,
134 RW_DASSERT(rw
, n
== o
);
138 * For platforms that do not provide stubs, or for the LOCKDEBUG case.
/*
 * When __HAVE_RW_STUBS is not defined, rw_enter, rw_exit and rw_tryenter
 * are provided as strong aliases of rw_vector_enter, rw_vector_exit and
 * rw_vector_tryenter.  The #undef forces that case.
 * NOTE(review): the lines around the #undef are not visible here, so it
 * may be conditional - confirm.
 */
141 #undef __HAVE_RW_STUBS
144 #ifndef __HAVE_RW_STUBS
145 __strong_alias(rw_enter
,rw_vector_enter
);
146 __strong_alias(rw_exit
,rw_vector_exit
);
147 __strong_alias(rw_tryenter
,rw_vector_tryenter
);
/*
 * Lock-operations descriptor for rwlocks; its address is handed to
 * LOCKDEBUG_ALLOC() and LOCKDEBUG_ABORT() in this file.  The first
 * initializer is the printable lock type name; the remaining
 * initializers are not visible here.
 */
150 lockops_t rwlock_lockops
= {
151 "Reader / writer lock",
/*
 * Synchronisation object whose address rw_vector_enter() passes to
 * turnstile_block().  Its initializers are not visible here.
 */
156 syncobj_t rw_syncobj
= {
167 * Dump the contents of a rwlock structure.
170 rw_dump(volatile void *cookie
)
172 volatile krwlock_t
*rw
= cookie
;
174 printf_nolog("owner/count : %#018lx flags : %#018x\n",
175 (long)RW_OWNER(rw
), (int)RW_FLAGS(rw
));
181 * Dump information about an error and panic the system. This
182 * generates a lot of machine code in the DIAGNOSTIC case, so
183 * we ask the compiler to not inline it.
185 static void __noinline
186 rw_abort(krwlock_t
*rw
, const char *func
, const char *msg
)
189 if (panicstr
!= NULL
)
192 LOCKDEBUG_ABORT(rw
, &rwlock_lockops
, func
, msg
);
198 * Initialize a rwlock for use.
/*
 * rw_init: prepare rw for use.  The whole structure is zeroed, the lock
 * is registered through LOCKDEBUG_ALLOC() (passing rwlock_lockops and
 * the value of __builtin_return_address(0)), and RW_DEBUG is set in
 * rw_owner when that call yields a non-zero result.
 */
201 rw_init(krwlock_t
*rw
)
205 memset(rw
, 0, sizeof(*rw
));
/* Remember whether LOCKDEBUG wants to track this lock. */
207 dodebug
= LOCKDEBUG_ALLOC(rw
, &rwlock_lockops
,
208 (uintptr_t)__builtin_return_address(0));
209 RW_SETDEBUG(rw
, dodebug
);
215 * Tear down a rwlock.
/*
 * rw_destroy: tear down rw.  Asserts (RW_ASSERT, so DIAGNOSTIC only) that
 * no bit other than RW_DEBUG is set in rw_owner, then passes the lock to
 * LOCKDEBUG_FREE() along with its RW_DEBUG state.
 */
218 rw_destroy(krwlock_t
*rw
)
221 RW_ASSERT(rw
, (rw
->rw_owner
& ~RW_DEBUG
) == 0);
222 LOCKDEBUG_FREE(RW_DEBUG_P(rw
), rw
);
228 * Return true if an rwlock owner is running on a CPU in the system.
229 * If the target is waiting on the kernel big lock, then we must
230 * release it. This is necessary to avoid deadlock.
232 * Note that we can't use the rwlock owner field as an LWP pointer. We
233 * don't have full control over the timing of our execution, and so the
234 * pointer could be completely invalid by the time we dereference it.
/*
 * rw_onproc: tell the caller whether the write owner recorded in owner
 * is currently running on some CPU.
 *   owner - snapshot of the lock word.
 *   cip   - points at a CPU hint; *cip is checked first before all CPUs
 *           are scanned.
 * The scanning code is compiled only under MULTIPROCESSOR.
 * NOTE(review): several statements (the early exits and any update of
 * *cip) are on lines not visible here.
 */
237 rw_onproc(uintptr_t owner
, struct cpu_info
**cip
)
239 #ifdef MULTIPROCESSOR
240 CPU_INFO_ITERATOR cii
;
/*
 * Anything other than write-locked with no waiters recorded takes the
 * branch below (its body is not visible here).
 */
244 if ((owner
& (RW_WRITE_LOCKED
|RW_HAS_WAITERS
)) != RW_WRITE_LOCKED
)
/* Extract the owning LWP pointer from the lock word. */
246 l
= (lwp_t
*)(owner
& RW_THREAD
);
248 /* See if the target is running on a CPU somewhere. */
249 if ((ci
= *cip
) != NULL
&& ci
->ci_curlwp
== l
)
/* Hint did not match: look at the current LWP of every CPU. */
251 for (CPU_INFO_FOREACH(cii
, ci
))
252 if (ci
->ci_curlwp
== l
)
255 /* No: it may be safe to block now. */
260 /* Target is running; do we need to block? */
/* True unless that CPU records l in ci_biglock_wanted. */
262 return ci
->ci_biglock_wanted
!= l
;
265 #endif /* MULTIPROCESSOR */
/*
 * rw_vector_enter: acquire rw as a reader or a writer.
 *   rw - the lock.
 *   op - RW_READER or RW_WRITER.
 * The loop first tries a compare-and-swap on the lock word whenever the
 * need_wait bits are clear.  Otherwise it spins while rw_onproc() reports
 * true, and failing that sets the waiter bits and sleeps on the turnstile
 * obtained from turnstile_lookup().  Spin and sleep times are recorded
 * through the LOCKSTAT macros.
 * NOTE(review): a number of control-flow statements (else, break,
 * continue, turnstile_exit) are on lines not visible here.
 */
274 rw_vector_enter(krwlock_t
*rw
, const krw_t op
)
276 uintptr_t owner
, incr
, need_wait
, set_wait
, curthread
, next
;
281 LOCKSTAT_TIMER(slptime
);
282 LOCKSTAT_TIMER(slpcnt
);
283 LOCKSTAT_TIMER(spintime
);
284 LOCKSTAT_COUNTER(spincnt
);
285 LOCKSTAT_FLAG(lsflag
);
288 curthread
= (uintptr_t)l
;
290 RW_ASSERT(rw
, !cpu_intr_p());
291 RW_ASSERT(rw
, curthread
!= 0);
292 RW_WANTLOCK(rw
, op
, false);
294 if (panicstr
== NULL
) {
295 LOCKDEBUG_BARRIER(&kernel_lock
, 1);
299 * We play a slight trick here. If we're a reader, we want
300 * increment the read count. If we're a writer, we want to
301 * set the owner field and whe WRITE_LOCKED bit.
303 * In the latter case, we expect those bits to be zero,
304 * therefore we can use an add operation to set them, which
305 * means an add operation for both cases.
/*
 * Pick, according to op, the bits that force a wait (need_wait), the
 * bits to set when queueing (set_wait) and the amount to add (incr).
 */
307 if (__predict_true(op
== RW_READER
)) {
309 set_wait
= RW_HAS_WAITERS
;
310 need_wait
= RW_WRITE_LOCKED
| RW_WRITE_WANTED
;
313 RW_DASSERT(rw
, op
== RW_WRITER
);
314 incr
= curthread
| RW_WRITE_LOCKED
;
315 set_wait
= RW_HAS_WAITERS
| RW_WRITE_WANTED
;
316 need_wait
= RW_WRITE_LOCKED
| RW_THREAD
;
320 LOCKSTAT_ENTER(lsflag
);
/* Start with no CPU hint and a fresh snapshot of the lock word. */
322 for (ci
= NULL
, owner
= rw
->rw_owner
;;) {
324 * Read the lock owner field. If the need-to-wait
325 * indicator is clear, then try to acquire the lock.
327 if ((owner
& need_wait
) == 0) {
328 next
= rw_cas(rw
, owner
, (owner
+ incr
) &
330 if (__predict_true(next
== owner
)) {
337 * Didn't get it -- spin around again (we'll
338 * probably sleep on the next iteration).
344 if (__predict_false(panicstr
!= NULL
))
346 if (__predict_false(RW_OWNER(rw
) == curthread
))
347 rw_abort(rw
, __func__
, "locking against myself");
350 * If the lock owner is running on another CPU, and
351 * there are no existing waiters, then spin.
353 if (rw_onproc(owner
, &ci
)) {
354 LOCKSTAT_START_TIMER(lsflag
, spintime
);
355 u_int count
= SPINLOCK_BACKOFF_MIN
;
357 SPINLOCK_BACKOFF(count
);
358 owner
= rw
->rw_owner
;
359 } while (rw_onproc(owner
, &ci
));
360 LOCKSTAT_STOP_TIMER(lsflag
, spintime
);
361 LOCKSTAT_COUNT(spincnt
, 1);
362 if ((owner
& need_wait
) == 0)
367 * Grab the turnstile chain lock. Once we have that, we
368 * can adjust the waiter bits and sleep queue.
370 ts
= turnstile_lookup(rw
);
373 * Mark the rwlock as having waiters. If the set fails,
374 * then we may not need to sleep and should spin again.
375 * Reload rw_owner because turnstile_lookup() may have
376 * spun on the turnstile chain lock.
378 owner
= rw
->rw_owner
;
379 if ((owner
& need_wait
) == 0 || rw_onproc(owner
, &ci
)) {
/* Publish the waiter bits; a mismatch means the lock word changed. */
383 next
= rw_cas(rw
, owner
, owner
| set_wait
);
384 if (__predict_false(next
!= owner
)) {
390 LOCKSTAT_START_TIMER(lsflag
, slptime
);
391 turnstile_block(ts
, queue
, rw
, &rw_syncobj
);
392 LOCKSTAT_STOP_TIMER(lsflag
, slptime
);
393 LOCKSTAT_COUNT(slpcnt
, 1);
396 * No need for a memory barrier because of context switch.
397 * If not handed the lock, then spin again.
/*
 * Readers, and writers that find themselves recorded as owner, satisfy
 * this test (the statement it controls is not visible here).
 */
399 if (op
== RW_READER
|| (rw
->rw_owner
& RW_THREAD
) == curthread
)
403 LOCKSTAT_EVENT(lsflag
, rw
, LB_RWLOCK
|
404 (op
== RW_WRITER
? LB_SLEEP1
: LB_SLEEP2
), slpcnt
, slptime
);
405 LOCKSTAT_EVENT(lsflag
, rw
, LB_RWLOCK
| LB_SPIN
, spincnt
, spintime
);
406 LOCKSTAT_EXIT(lsflag
);
408 RW_DASSERT(rw
, (op
!= RW_READER
&& RW_OWNER(rw
) == curthread
) ||
409 (op
== RW_READER
&& RW_COUNT(rw
) != 0));
/*
 * rw_vector_exit: release a hold on rw.
 * If the lock word has RW_WRITE_LOCKED set, the caller is asserted to be
 * the owner and decr becomes curthread | RW_WRITE_LOCKED; otherwise a
 * read hold is being released.  The new lock word is owner - decr.  The
 * test on RW_THREAD | RW_HAS_WAITERS detects the case where no holder
 * would remain but waiters exist; the code further down then looks up
 * the turnstile and gives the lock away with rw_swap() followed by
 * turnstile_wakeup().  Otherwise the release is a single rw_cas().
 * NOTE(review): the loop header, the else branches and the early exits
 * are on lines not visible here.
 */
419 rw_vector_exit(krwlock_t
*rw
)
421 uintptr_t curthread
, owner
, decr
, new, next
;
426 curthread
= (uintptr_t)curlwp
;
427 RW_ASSERT(rw
, curthread
!= 0);
429 if (__predict_false(panicstr
!= NULL
))
433 * Again, we use a trick. Since we used an add operation to
434 * set the required lock bits, we can use a subtract to clear
435 * them, which makes the read-release and write-release path
438 owner
= rw
->rw_owner
;
439 if (__predict_false((owner
& RW_WRITE_LOCKED
) != 0)) {
440 RW_UNLOCKED(rw
, RW_WRITER
);
441 RW_ASSERT(rw
, RW_OWNER(rw
) == curthread
);
442 decr
= curthread
| RW_WRITE_LOCKED
;
444 RW_UNLOCKED(rw
, RW_READER
);
445 RW_ASSERT(rw
, RW_COUNT(rw
) != 0);
450 * Compute what we expect the new value of the lock to be. Only
451 * proceed to do direct handoff if there are waiters, and if the
452 * lock would become unowned.
/* Value the lock word should hold once this hold is removed. */
456 new = (owner
- decr
);
457 if ((new & (RW_THREAD
| RW_HAS_WAITERS
)) == RW_HAS_WAITERS
)
459 next
= rw_cas(rw
, owner
, new);
460 if (__predict_true(next
== owner
))
466 * Grab the turnstile chain lock. This gets the interlock
467 * on the sleep queue. Once we have that, we can adjust the
470 ts
= turnstile_lookup(rw
);
471 owner
= rw
->rw_owner
;
472 RW_DASSERT(rw
, ts
!= NULL
);
473 RW_DASSERT(rw
, (owner
& RW_HAS_WAITERS
) != 0);
/* Count the sleepers on the writer and the reader queue. */
475 wcnt
= TS_WAITERS(ts
, TS_WRITER_Q
);
476 rcnt
= TS_WAITERS(ts
, TS_READER_Q
);
479 * Give the lock away.
481 * If we are releasing a write lock, then prefer to wake all
482 * outstanding readers. Otherwise, wake one writer if there
483 * are outstanding readers, or all writers if there are no
484 * pending readers. If waking one specific writer, the writer
485 * is handed the lock here. If waking multiple writers, we
486 * set WRITE_WANTED to block out new readers, and let them
487 * do the work of acquring the lock in rw_vector_enter().
/* Writer path: no readers are waiting, or a read hold is being dropped. */
489 if (rcnt
== 0 || decr
== RW_READ_INCR
) {
490 RW_DASSERT(rw
, wcnt
!= 0);
491 RW_DASSERT(rw
, (owner
& RW_WRITE_WANTED
) != 0);
494 /* Give the lock to the longest waiting writer. */
495 l
= TS_FIRST(ts
, TS_WRITER_Q
);
496 new = (uintptr_t)l
| RW_WRITE_LOCKED
| RW_HAS_WAITERS
;
498 new |= RW_WRITE_WANTED
;
499 rw_swap(rw
, owner
, new);
500 turnstile_wakeup(ts
, TS_WRITER_Q
, 1, l
);
502 /* Wake all writers and let them fight it out. */
503 rw_swap(rw
, owner
, RW_WRITE_WANTED
);
504 turnstile_wakeup(ts
, TS_WRITER_Q
, wcnt
, NULL
);
507 RW_DASSERT(rw
, rcnt
!= 0);
510 * Give the lock to all blocked readers. If there
511 * is a writer waiting, new readers that arrive
512 * after the release will be blocked out.
/* One read count per sleeping reader. */
514 new = rcnt
<< RW_READ_COUNT_SHIFT
;
516 new |= RW_HAS_WAITERS
| RW_WRITE_WANTED
;
518 /* Wake up all sleeping readers. */
519 rw_swap(rw
, owner
, new);
520 turnstile_wakeup(ts
, TS_READER_Q
, rcnt
, NULL
);
525 * rw_vector_tryenter:
527 * Try to acquire a rwlock.
/*
 * rw_vector_tryenter: attempt to acquire rw without sleeping.
 *   rw - the lock.
 *   op - RW_READER or RW_WRITER.
 * need_wait is RW_WRITE_LOCKED | RW_WRITE_WANTED for a reader and
 * RW_WRITE_LOCKED | RW_THREAD for a writer.  Each iteration tests the
 * lock word against need_wait and, if clear, tries rw_cas() with
 * owner + incr; the loop repeats while the CAS loses a race.
 * NOTE(review): the statements taken when need_wait bits are set and
 * when the CAS succeeds, and the return values, are on lines not visible
 * here.
 * NOTE(review): owner is reloaded from rw->rw_owner as the first
 * statement of every iteration, so the value assigned by the for clauses
 * (initial load and owner = next) is overwritten before it is used.
 */
530 rw_vector_tryenter(krwlock_t
*rw
, const krw_t op
)
532 uintptr_t curthread
, owner
, incr
, need_wait
, next
;
534 curthread
= (uintptr_t)curlwp
;
536 RW_ASSERT(rw
, curthread
!= 0);
538 if (op
== RW_READER
) {
540 need_wait
= RW_WRITE_LOCKED
| RW_WRITE_WANTED
;
542 RW_DASSERT(rw
, op
== RW_WRITER
);
543 incr
= curthread
| RW_WRITE_LOCKED
;
544 need_wait
= RW_WRITE_LOCKED
| RW_THREAD
;
547 for (owner
= rw
->rw_owner
;; owner
= next
) {
548 owner
= rw
->rw_owner
;
549 if (__predict_false((owner
& need_wait
) != 0))
551 next
= rw_cas(rw
, owner
, owner
+ incr
);
552 if (__predict_true(next
== owner
)) {
559 RW_WANTLOCK(rw
, op
, true);
561 RW_DASSERT(rw
, (op
!= RW_READER
&& RW_OWNER(rw
) == curthread
) ||
562 (op
== RW_READER
&& RW_COUNT(rw
) != 0));
570 * Downgrade a write lock to a read lock.
/*
 * rw_downgrade: turn the write hold of the calling LWP into one read
 * hold.  The caller is asserted to hold rw for writing.
 * Without RW_HAS_WAITERS in the lock word a single rw_cas() to
 * RW_READ_INCR is tried.  Otherwise the turnstile is looked up and the
 * waiters are counted.  One path keeps both waiter bits and leaves a
 * single read hold; the other grants a read hold to every sleeping
 * reader in addition to our own and wakes them all.
 * NOTE(review): the tests selecting between those paths, the
 * turnstile_exit calls and the loop exits are on lines not visible here.
 */
573 rw_downgrade(krwlock_t
*rw
)
575 uintptr_t owner
, curthread
, new, next
;
579 curthread
= (uintptr_t)curlwp
;
580 RW_ASSERT(rw
, curthread
!= 0);
581 RW_DASSERT(rw
, (rw
->rw_owner
& RW_WRITE_LOCKED
) != 0);
582 RW_ASSERT(rw
, RW_OWNER(rw
) == curthread
);
583 RW_UNLOCKED(rw
, RW_WRITER
);
586 owner
= rw
->rw_owner
;
587 if ((owner
& RW_HAS_WAITERS
) == 0) {
589 * There are no waiters, so we can do this the easy way.
590 * Try swapping us down to one read hold. If it fails, the
591 * lock condition has changed and we most likely now have
594 next
= rw_cas(rw
, owner
, RW_READ_INCR
);
595 if (__predict_true(next
== owner
)) {
596 RW_LOCKED(rw
, RW_READER
);
597 RW_DASSERT(rw
, (rw
->rw_owner
& RW_WRITE_LOCKED
) == 0);
598 RW_DASSERT(rw
, RW_COUNT(rw
) != 0);
605 * Grab the turnstile chain lock. This gets the interlock
606 * on the sleep queue. Once we have that, we can adjust the
/* Retry with the value the failed CAS observed. */
609 for (;; owner
= next
) {
610 ts
= turnstile_lookup(rw
);
611 RW_DASSERT(rw
, ts
!= NULL
);
613 rcnt
= TS_WAITERS(ts
, TS_READER_Q
);
614 wcnt
= TS_WAITERS(ts
, TS_WRITER_Q
);
617 * If there are no readers, just preserve the waiters
618 * bits, swap us down to one read hold and return.
621 RW_DASSERT(rw
, wcnt
!= 0);
622 RW_DASSERT(rw
, (rw
->rw_owner
& RW_WRITE_WANTED
) != 0);
623 RW_DASSERT(rw
, (rw
->rw_owner
& RW_HAS_WAITERS
) != 0);
625 new = RW_READ_INCR
| RW_HAS_WAITERS
| RW_WRITE_WANTED
;
626 next
= rw_cas(rw
, owner
, new);
628 if (__predict_true(next
== owner
))
632 * Give the lock to all blocked readers. We may
633 * retain one read hold if downgrading. If there
634 * is a writer waiting, new readers will be blocked
/* One read count per sleeping reader plus one for the caller. */
637 new = (rcnt
<< RW_READ_COUNT_SHIFT
) + RW_READ_INCR
;
639 new |= RW_HAS_WAITERS
| RW_WRITE_WANTED
;
641 next
= rw_cas(rw
, owner
, new);
642 if (__predict_true(next
== owner
)) {
643 /* Wake up all sleeping readers. */
644 turnstile_wakeup(ts
, TS_READER_Q
, rcnt
, NULL
);
651 RW_WANTLOCK(rw
, RW_READER
, false);
652 RW_LOCKED(rw
, RW_READER
);
653 RW_DASSERT(rw
, (rw
->rw_owner
& RW_WRITE_LOCKED
) == 0);
654 RW_DASSERT(rw
, RW_COUNT(rw
) != 0);
660 * Try to upgrade a read lock to a write lock. We must be the
/*
 * rw_tryupgrade: try to turn a read hold into a write hold.
 * The caller is asserted to hold rw for reading.  Each iteration asserts
 * that the lock is not write-locked and tests whether the RW_THREAD
 * field equals exactly RW_READ_INCR, i.e. a single read hold.  The new
 * lock word is curthread | RW_WRITE_LOCKED plus whatever bits the old
 * word had outside RW_THREAD, installed with rw_cas(); the loop retries
 * with the observed value when the CAS loses a race.
 * NOTE(review): the statements taken when the count test fails and when
 * the CAS succeeds, and the return values, are on lines not visible here.
 */
664 rw_tryupgrade(krwlock_t
*rw
)
666 uintptr_t owner
, curthread
, new, next
;
668 curthread
= (uintptr_t)curlwp
;
669 RW_ASSERT(rw
, curthread
!= 0);
670 RW_ASSERT(rw
, rw_read_held(rw
));
672 for (owner
= rw
->rw_owner
;; owner
= next
) {
673 RW_ASSERT(rw
, (owner
& RW_WRITE_LOCKED
) == 0);
674 if (__predict_false((owner
& RW_THREAD
) != RW_READ_INCR
)) {
675 RW_ASSERT(rw
, (owner
& RW_THREAD
) != 0);
/* Replace the count field with our LWP pointer; keep the other bits. */
678 new = curthread
| RW_WRITE_LOCKED
| (owner
& ~RW_THREAD
);
679 next
= rw_cas(rw
, owner
, new);
680 if (__predict_true(next
== owner
)) {
686 RW_UNLOCKED(rw
, RW_READER
);
687 RW_WANTLOCK(rw
, RW_WRITER
, true);
688 RW_LOCKED(rw
, RW_WRITER
);
689 RW_DASSERT(rw
, rw
->rw_owner
& RW_WRITE_LOCKED
);
690 RW_DASSERT(rw
, RW_OWNER(rw
) == curthread
);
698 * Returns true if the rwlock is held for reading. Must only be
699 * used for diagnostic assertions, and never be used to make
700 * decisions about how to use a rwlock.
/*
 * rw_read_held: diagnostic predicate.  The final expression is true when
 * the lock word has RW_WRITE_LOCKED clear and a non-zero RW_THREAD field,
 * i.e. the lock is read-held by someone (not necessarily the caller).
 * NOTE(review): the result used when panicstr is set, and any further
 * checks before the load of rw_owner, are on lines not visible here.
 */
703 rw_read_held(krwlock_t
*rw
)
707 if (panicstr
!= NULL
)
711 owner
= rw
->rw_owner
;
712 return (owner
& RW_WRITE_LOCKED
) == 0 && (owner
& RW_THREAD
) != 0;
718 * Returns true if the rwlock is held for writing. Must only be
719 * used for diagnostic assertions, and never be used to make
720 * decisions about how to use a rwlock.
/*
 * rw_write_held: diagnostic predicate.  The final expression is true when
 * the lock word, masked with RW_WRITE_LOCKED | RW_THREAD, equals
 * RW_WRITE_LOCKED | curlwp, i.e. the lock is write-held by the calling
 * LWP.
 * NOTE(review): the result used when panicstr is set is on a line not
 * visible here.
 */
723 rw_write_held(krwlock_t
*rw
)
726 if (panicstr
!= NULL
)
730 return (rw
->rw_owner
& (RW_WRITE_LOCKED
| RW_THREAD
)) ==
731 (RW_WRITE_LOCKED
| (uintptr_t)curlwp
);
737 * Returns true if the rwlock is held for reading or writing. Must
738 * only be used for diagnostic assertions, and never be used to make
739 * decisions about how to use a rwlock.
/*
 * rw_lock_held: diagnostic predicate.  The final expression is true when
 * the RW_THREAD field of the lock word is non-zero, which covers both a
 * write owner and a non-zero read count.
 * NOTE(review): the result used when panicstr is set is on a line not
 * visible here.
 */
742 rw_lock_held(krwlock_t
*rw
)
745 if (panicstr
!= NULL
)
749 return (rw
->rw_owner
& RW_THREAD
) != 0;
755 * Return the current owner of an RW lock, but only if it is write
756 * held. Used for priority inheritance.
/*
 * rw_owner: map a wait channel back to the LWP that owns the rwlock.
 *   obj - wait channel; converted to a krwlock_t pointer.
 * Returns the RW_THREAD field of the lock word as a pointer.
 * NOTE(review): the statement controlled by the RW_WRITE_LOCKED test is
 * not visible here; presumably NULL is returned when the lock is not
 * write-held.
 */
759 rw_owner(wchan_t obj
)
761 krwlock_t
*rw
= (void *)(uintptr_t)obj
; /* discard qualifiers */
762 uintptr_t owner
= rw
->rw_owner
;
764 if ((owner
& RW_WRITE_LOCKED
) == 0)
767 return (void *)(owner
& RW_THREAD
);