1 /* $NetBSD: linux_futex.c,v 1.23 2009/02/23 20:28:58 rmind Exp $ */
4 * Copyright (c) 2005 Emmanuel Dreyfus, all rights reserved.
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
14 * 3. All advertising materials mentioning features or use of this software
15 * must display the following acknowledgement:
16 * This product includes software developed by Emmanuel Dreyfus
17 * 4. The name of the author may not be used to endorse or promote
18 * products derived from this software without specific prior written
21 * THIS SOFTWARE IS PROVIDED BY THE THE AUTHOR AND CONTRIBUTORS ``AS IS''
22 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
23 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
24 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS
25 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31 * POSSIBILITY OF SUCH DAMAGE.
34 #include <sys/cdefs.h>
35 __KERNEL_RCSID(1, "$NetBSD: linux_futex.c,v 1.23 2009/02/23 20:28:58 rmind Exp $");
37 #include <sys/param.h>
39 #include <sys/systm.h>
42 #include <sys/queue.h>
43 #include <sys/condvar.h>
44 #include <sys/mutex.h>
47 #include <sys/kernel.h>
48 #include <sys/atomic.h>
50 #include <compat/linux/common/linux_types.h>
51 #include <compat/linux/common/linux_emuldata.h>
52 #include <compat/linux/common/linux_exec.h>
53 #include <compat/linux/common/linux_signal.h>
54 #include <compat/linux/common/linux_futex.h>
55 #include <compat/linux/common/linux_ipc.h>
56 #include <compat/linux/common/linux_sched.h>
57 #include <compat/linux/common/linux_sem.h>
58 #include <compat/linux/linux_syscallargs.h>
/* Convert a Linux timespec to the native struct timespec (defined elsewhere). */
void linux_to_native_timespec(struct timespec *, struct linux_timespec *);
66 struct futex
*wp_new_futex
;
67 kcondvar_t wp_futex_cv
;
68 TAILQ_ENTRY(waiting_proc
) wp_list
;
73 LIST_ENTRY(futex
) f_list
;
74 TAILQ_HEAD(lf_waiting_proc
, waiting_proc
) f_waiting_proc
;
77 static LIST_HEAD(futex_list
, futex
) futex_list
;
78 static kmutex_t futex_lock
;
/*
 * NOTE(review): these macros expand to a statement INCLUDING the trailing
 * ';', so call sites use them bare; do not use them in an unbraced if/else.
 */
#define FUTEX_LOCK mutex_enter(&futex_lock);
#define FUTEX_UNLOCK mutex_exit(&futex_lock);

/* Values for futex_get()'s 'locked' argument: does the caller hold futex_lock? */
#define FUTEX_LOCKED	1
#define FUTEX_UNLOCKED	0

/* Big-lock serialization of the whole futex syscall path. */
#define FUTEX_SYSTEM_LOCK	KERNEL_LOCK(1, NULL);
#define FUTEX_SYSTEM_UNLOCK	KERNEL_UNLOCK_ONE(0);
/* Debug tracing: build with DEBUG_LINUX_FUTEX to enable printf output. */
#ifdef DEBUG_LINUX_FUTEX
#define FUTEXPRINTF(a) printf a
#else
#define FUTEXPRINTF(a)
#endif
95 static ONCE_DECL(futex_once
);
100 FUTEXPRINTF(("futex_init: initializing futex\n"));
101 mutex_init(&futex_lock
, MUTEX_DEFAULT
, IPL_NONE
);
105 static struct futex
*futex_get(void *, int);
106 static void futex_put(struct futex
*);
107 static int futex_sleep(struct futex
*, lwp_t
*, unsigned long);
108 static int futex_wake(struct futex
*, int, struct futex
*, int);
109 static int futex_atomic_op(lwp_t
*, int, void *);
/*
 * linux_sys_futex: implementation of the Linux futex(2) syscall.
 * NOTE(review): this region is a garbled extraction fragment (original
 * line numbers fused into the text, statements missing); comments below
 * annotate only what is visible -- do not assume the fragment is complete.
 */
112 linux_sys_futex(struct lwp
*l
, const struct linux_sys_futex_args
*uap
, register_t
*retval
)
115 syscallarg(int *) uaddr;
118 syscallarg(const struct linux_timespec *) timeout;
119 syscallarg(int *) uaddr2;
120 syscallarg(int) val3;
124 struct linux_timespec timeout
= { 0, 0 };
133 RUN_ONCE(&futex_once
, futex_init
);
136 * Our implementation provides only private futexes. Most of the apps
137 * should use private futexes but don't claim so. Therefore we treat
138 * all futexes as private by clearing the FUTEX_PRIVATE_FLAG. It works
139 * in most cases (ie. when futexes are not shared on file descriptor
140 * or between different processes).
142 switch (SCARG(uap
, op
) & ~LINUX_FUTEX_PRIVATE_FLAG
) {
/* FUTEX_WAIT: sleep until woken or timed out, iff *uaddr still equals val. */
143 case LINUX_FUTEX_WAIT
:
146 if ((error
= copyin(SCARG(uap
, uaddr
),
147 &val
, sizeof(val
))) != 0) {
152 if (val
!= SCARG(uap
, val
)) {
157 if (SCARG(uap
, timeout
) != NULL
) {
158 if ((error
= copyin(SCARG(uap
, timeout
),
159 &timeout
, sizeof(timeout
))) != 0) {
165 FUTEXPRINTF(("FUTEX_WAIT %d.%d: val = %d, uaddr = %p, "
166 "*uaddr = %d, timeout = %lld.%09ld\n",
167 l
->l_proc
->p_pid
, l
->l_lid
, SCARG(uap
, val
),
168 SCARG(uap
, uaddr
), val
, (long long)timeout
.tv_sec
,
171 linux_to_native_timespec(&ts
, &timeout
);
172 if ((error
= itimespecfix(&ts
)) != 0) {
176 timeout_hz
= tstohz(&ts
);
179 * If the user process requests a non null timeout,
180 * make sure we do not turn it into an infinite
181 * timeout because timeout_hz is 0.
183 * We use a minimal timeout of 1/hz. Maybe it would make
184 * sense to just return ETIMEDOUT without sleeping.
186 if (SCARG(uap
, timeout
) != NULL
&& timeout_hz
== 0)
189 f
= futex_get(SCARG(uap
, uaddr
), FUTEX_UNLOCKED
);
190 ret
= futex_sleep(f
, l
, timeout_hz
);
193 FUTEXPRINTF(("FUTEX_WAIT %d.%d: uaddr = %p, "
194 "ret = %d\n", l
->l_proc
->p_pid
, l
->l_lid
,
195 SCARG(uap
, uaddr
), ret
));
/* Map the sleep result back onto Linux error semantics. */
199 case EWOULDBLOCK
: /* timeout */
202 case EINTR
: /* signal */
205 case 0: /* FUTEX_WAKE received */
206 FUTEXPRINTF(("FUTEX_WAIT %d.%d: uaddr = %p, got it\n",
207 l
->l_proc
->p_pid
, l
->l_lid
, SCARG(uap
, uaddr
)));
211 FUTEXPRINTF(("FUTEX_WAIT: unexpected ret = %d\n", ret
));
/* FUTEX_WAKE: wake up to 'val' sleepers on uaddr. */
218 case LINUX_FUTEX_WAKE
:
221 * XXX: Linux is able cope with different addresses
222 * corresponding to the same mapped memory in the sleeping
223 * and the waker process(es).
225 FUTEXPRINTF(("FUTEX_WAKE %d.%d: uaddr = %p, val = %d\n",
226 l
->l_proc
->p_pid
, l
->l_lid
,
227 SCARG(uap
, uaddr
), SCARG(uap
, val
)));
229 f
= futex_get(SCARG(uap
, uaddr
), FUTEX_UNLOCKED
);
230 *retval
= futex_wake(f
, SCARG(uap
, val
), NULL
, 0);
/* FUTEX_CMP_REQUEUE: requeue sleepers to uaddr2 only if *uaddr == val3. */
237 case LINUX_FUTEX_CMP_REQUEUE
:
240 if ((error
= copyin(SCARG(uap
, uaddr
),
241 &val
, sizeof(val
))) != 0) {
246 if (val
!= SCARG(uap
, val3
)) {
251 f
= futex_get(SCARG(uap
, uaddr
), FUTEX_UNLOCKED
);
252 newf
= futex_get(SCARG(uap
, uaddr2
), FUTEX_UNLOCKED
);
253 *retval
= futex_wake(f
, SCARG(uap
, val
), newf
,
254 (int)(unsigned long)SCARG(uap
, timeout
));
/* FUTEX_REQUEUE: unconditional requeue of sleepers from uaddr to uaddr2. */
261 case LINUX_FUTEX_REQUEUE
:
264 f
= futex_get(SCARG(uap
, uaddr
), FUTEX_UNLOCKED
);
265 newf
= futex_get(SCARG(uap
, uaddr2
), FUTEX_UNLOCKED
);
266 *retval
= futex_wake(f
, SCARG(uap
, val
), newf
,
267 (int)(unsigned long)SCARG(uap
, timeout
));
275 FUTEXPRINTF(("linux_sys_futex: unimplemented op %d\n",
/* FUTEX_WAKE_OP: atomic op on *uaddr2, then conditional wakes on both futexes. */
278 case LINUX_FUTEX_WAKE_OP
:
280 f
= futex_get(SCARG(uap
, uaddr
), FUTEX_UNLOCKED
);
281 f2
= futex_get(SCARG(uap
, uaddr2
), FUTEX_UNLOCKED
);
283 * This function returns positive number as results and
286 op_ret
= futex_atomic_op(l
, SCARG(uap
, val3
), SCARG(uap
, uaddr2
));
288 /* XXX: We don't handle EFAULT yet */
289 if (op_ret
!= -EFAULT
) {
301 ret
= futex_wake(f
, SCARG(uap
, val
), NULL
, 0);
306 * Linux abuses the address of the timespec parameter
307 * as the number of retries
309 op_ret
+= futex_wake(f2
,
310 (int)(unsigned long)SCARG(uap
, timeout
), NULL
, 0);
318 FUTEXPRINTF(("linux_sys_futex: unknown op %d\n",
/*
 * futex_get: look up the futex for user address 'uaddr', creating it if it
 * does not exist, and take a reference.  'locked' says whether the caller
 * already holds futex_lock (FUTEX_LOCKED) or it must be taken here
 * (FUTEX_UNLOCKED).
 * NOTE(review): garbled extraction fragment; refcount/return statements
 * are missing from this view.
 */
325 static struct futex
*
326 futex_get(void *uaddr
, int locked
)
330 if (locked
== FUTEX_UNLOCKED
)
333 LIST_FOREACH(f
, &futex_list
, f_list
) {
334 if (f
->f_uaddr
== uaddr
) {
336 if (locked
== FUTEX_UNLOCKED
)
342 /* Not found, create it */
343 f
= kmem_zalloc(sizeof(*f
), KM_SLEEP
);
346 TAILQ_INIT(&f
->f_waiting_proc
);
347 LIST_INSERT_HEAD(&futex_list
, f
, f_list
);
348 if (locked
== FUTEX_UNLOCKED
)
/*
 * futex_put: drop a reference on 'f'; when the count reaches zero the
 * futex must have no waiters left, and is unlinked and freed.
 * NOTE(review): garbled fragment; the decrement itself is not visible here.
 */
355 futex_put(struct futex
*f
)
360 if (f
->f_refcount
== 0) {
361 KASSERT(TAILQ_EMPTY(&f
->f_waiting_proc
));
362 LIST_REMOVE(f
, f_list
);
363 kmem_free(f
, sizeof(*f
));
/*
 * futex_sleep: block the calling LWP on futex 'f' for at most 'timeout'
 * ticks (0 = forever), interruptibly.  A waker may requeue us by setting
 * wp_new_futex, in which case we recurse onto the new futex and release
 * the reference the waker took on our behalf.
 * NOTE(review): garbled extraction fragment; some statements missing.
 */
371 futex_sleep(struct futex
*f
, lwp_t
*l
, unsigned long timeout
)
373 struct waiting_proc
*wp
;
376 wp
= kmem_zalloc(sizeof(*wp
), KM_SLEEP
);
378 wp
->wp_new_futex
= NULL
;
379 cv_init(&wp
->wp_futex_cv
, "futex");
/* Queue ourselves, wait (dropping futex_lock via the CV), then dequeue. */
382 TAILQ_INSERT_TAIL(&f
->f_waiting_proc
, wp
, wp_list
);
383 ret
= cv_timedwait_sig(&wp
->wp_futex_cv
, &futex_lock
, timeout
);
384 TAILQ_REMOVE(&f
->f_waiting_proc
, wp
, wp_list
);
387 /* if we got woken up in futex_wake */
388 if ((ret
== 0) && (wp
->wp_new_futex
!= NULL
)) {
389 /* suspend us on the new futex */
390 ret
= futex_sleep(wp
->wp_new_futex
, l
, timeout
);
391 /* and release the old one */
392 futex_put(wp
->wp_new_futex
);
395 cv_destroy(&wp
->wp_futex_cv
);
396 kmem_free(wp
, sizeof(*wp
));
/*
 * futex_wake: wake up to 'n' LWPs sleeping on 'f'; if 'newf' is non-NULL,
 * requeue up to 'n2' further sleepers onto it instead of waking them
 * (each requeued sleeper gets a reference on the new futex via
 * wp_new_futex, released later in futex_sleep).
 * NOTE(review): garbled extraction fragment; the count/limit logic is
 * only partially visible here.
 */
401 futex_wake(struct futex
*f
, int n
, struct futex
*newf
, int n2
)
403 struct waiting_proc
*wp
;
406 count
= newf
? 0 : 1;
409 TAILQ_FOREACH(wp
, &f
->f_waiting_proc
, wp_list
) {
411 cv_signal(&wp
->wp_futex_cv
);
416 /* futex_put called after tsleep */
417 wp
->wp_new_futex
= futex_get(newf
->f_uaddr
,
419 cv_signal(&wp
->wp_futex_cv
);
/*
 * futex_atomic_op: decode a FUTEX_WAKE_OP 'encoded_op' (op in bits 31-28,
 * cmp in 27-24, oparg in 23-12, cmparg in 11-0), apply the operation
 * atomically to the user word at 'uaddr' via ucas_int(), and return the
 * result of comparing the old value against cmparg.
 * NOTE(review): garbled extraction fragment; most op cases and the retry
 * loop are not visible here.  The "<< N) >> M" decode sign-extends the
 * 12-bit fields -- this relies on implementation-defined arithmetic right
 * shift of signed values (UB on left-shifting a negative) -- TODO confirm
 * against the original.
 */
430 futex_atomic_op(lwp_t
*l
, int encoded_op
, void *uaddr
)
432 const int op
= (encoded_op
>> 28) & 7;
433 const int cmp
= (encoded_op
>> 24) & 15;
434 const int cmparg
= (encoded_op
<< 20) >> 20;
435 int oparg
= (encoded_op
<< 8) >> 20;
436 int error
, oldval
, cval
;
438 if (encoded_op
& (FUTEX_OP_OPARG_SHIFT
<< 28))
441 /* XXX: linux verifies access here and returns EFAULT */
443 if (copyin(uaddr
, &cval
, sizeof(int)) != 0)
/* FUTEX_OP_ANDN: new value clears the oparg bits. */
460 nval
= cval
& ~oparg
;
/* Compare-and-swap the user word; retry presumably happens if it raced. */
469 error
= ucas_int(uaddr
, cval
, nval
, &oldval
);
470 if (oldval
== cval
|| error
) {
/* Evaluate the encoded comparison against the pre-op value. */
480 case FUTEX_OP_CMP_EQ
:
481 return (oldval
== cmparg
);
482 case FUTEX_OP_CMP_NE
:
483 return (oldval
!= cmparg
);
484 case FUTEX_OP_CMP_LT
:
485 return (oldval
< cmparg
);
486 case FUTEX_OP_CMP_GE
:
487 return (oldval
>= cmparg
);
488 case FUTEX_OP_CMP_LE
:
489 return (oldval
<= cmparg
);
490 case FUTEX_OP_CMP_GT
:
491 return (oldval
> cmparg
);
/*
 * linux_sys_set_robust_list: record the user-space robust-futex list head
 * for the current process in its Linux emuldata; rejects a 'len' that does
 * not match the expected structure size.
 * NOTE(review): garbled extraction fragment; error returns not visible.
 */
498 linux_sys_set_robust_list(struct lwp
*l
,
499 const struct linux_sys_set_robust_list_args
*uap
, register_t
*retval
)
501 struct proc
*p
= l
->l_proc
;
502 struct linux_emuldata
*led
= p
->p_emuldata
;
504 if (SCARG(uap
, len
) != sizeof(*(led
->robust_futexes
)))
506 led
->robust_futexes
= SCARG(uap
, head
);
/*
 * linux_sys_get_robust_list: copy out the robust-futex list head pointer
 * (and its size) for the current process (pid == 0) or for another Linux
 * process looked up under proc_lock.
 * NOTE(review): garbled extraction fragment; some error paths missing.
 */
512 linux_sys_get_robust_list(struct lwp
*l
,
513 const struct linux_sys_get_robust_list_args
*uap
, register_t
*retval
)
515 struct linux_emuldata
*led
;
516 struct linux_robust_list_head
**head
;
517 size_t len
= sizeof(*led
->robust_futexes
);
/* pid 0 means "this process": read our own emuldata directly. */
520 if (!SCARG(uap
, pid
)) {
521 led
= l
->l_proc
->p_emuldata
;
522 head
= &led
->robust_futexes
;
/* Otherwise look the target up and verify it is a Linux-emulated process. */
526 mutex_enter(proc_lock
);
527 if ((p
= p_find(SCARG(uap
, pid
), PFIND_LOCKED
)) == NULL
||
528 p
->p_emul
!= &emul_linux
) {
529 mutex_exit(proc_lock
);
533 head
= &led
->robust_futexes
;
534 mutex_exit(proc_lock
);
/* Copy the size, then the head pointer, out to the caller. */
537 error
= copyout(&len
, SCARG(uap
, len
), sizeof(len
));
540 return copyout(head
, SCARG(uap
, head
), sizeof(*head
));
/*
 * handle_futex_death: clean up one robust futex owned by the dead
 * thread 'pid': mark the user word FUTEX_OWNER_DIED (preserving
 * FUTEX_WAITERS) via compare-and-swap, and wake one waiter if any are
 * present on a non-PI futex.
 * NOTE(review): garbled extraction fragment; CAS retry loop and error
 * returns are not visible here.
 */
544 handle_futex_death(void *uaddr
, pid_t pid
, int pi
)
546 int uval
, nval
, mval
;
550 if (copyin(uaddr
, &uval
, 4))
/* Only act if the dead thread actually owns this futex. */
553 if ((uval
& FUTEX_TID_MASK
) == pid
) {
554 mval
= (uval
& FUTEX_WAITERS
) | FUTEX_OWNER_DIED
;
555 nval
= atomic_cas_32(uaddr
, uval
, mval
);
563 if (!pi
&& (uval
& FUTEX_WAITERS
)) {
564 f
= futex_get(uaddr
, FUTEX_UNLOCKED
);
565 futex_wake(f
, 1, NULL
, 0);
/*
 * fetch_robust_entry: copy one robust-list pointer in from user space.
 * The low bit of the user value is the PI flag; it is masked off to form
 * the entry pointer (and presumably returned through *pi -- that store is
 * not visible in this fragment).
 */
573 fetch_robust_entry(struct linux_robust_list
**entry
,
574 struct linux_robust_list
**head
, int *pi
)
576 unsigned long uentry
;
578 if (copyin((const void *)head
, &uentry
, sizeof(unsigned long)))
581 *entry
= (void *)(uentry
& ~1UL);
587 /* This walks the list of robust futexes, releasing them. */
/*
 * release_futexes: on process exit, walk the user-space robust-futex list
 * registered via set_robust_list and hand each entry to
 * handle_futex_death().  'limit' (2048) bounds the walk so a corrupt or
 * circular user list cannot hang the kernel.
 * NOTE(review): garbled extraction fragment; loop advance, limit check
 * and pending-entry handling are only partially visible.
 */
589 release_futexes(struct proc
*p
)
591 struct linux_robust_list_head head
;
592 struct linux_robust_list
*entry
, *next_entry
= NULL
, *pending
;
593 unsigned int limit
= 2048, pi
, next_pi
, pip
;
594 struct linux_emuldata
*led
;
595 unsigned long futex_offset
;
599 if (led
->robust_futexes
== NULL
)
602 if (copyin(led
->robust_futexes
, &head
, sizeof(head
)))
605 if (fetch_robust_entry(&entry
, &head
.list
.next
, &pi
))
608 if (copyin(&head
.futex_offset
, &futex_offset
, sizeof(unsigned long)))
611 if (fetch_robust_entry(&pending
, &head
.pending_list
, &pip
))
/* Walk until the list wraps back to its head. */
614 while (entry
!= &head
.list
) {
615 rc
= fetch_robust_entry(&next_entry
, &entry
->next
, &next_pi
);
617 if (entry
!= pending
)
618 if (handle_futex_death((char *)entry
+ futex_offset
,
631 yield(); /* XXX why? */
635 handle_futex_death((char *)pending
+ futex_offset
,