// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *  Fast Userspace Mutexes (which I call "Futexes!").
 *  (C) Rusty Russell, IBM 2002
 *
 *  Generalized futexes, futex requeueing, misc fixes by Ingo Molnar
 *  (C) Copyright 2003 Red Hat Inc, All Rights Reserved
 *
 *  Removed page pinning, fix privately mapped COW pages and other cleanups
 *  (C) Copyright 2003, 2004 Jamie Lokier
 *
 *  Robust futex support started by Ingo Molnar
 *  (C) Copyright 2006 Red Hat Inc, All Rights Reserved
 *  Thanks to Thomas Gleixner for suggestions, analysis and fixes.
 *
 *  PI-futex support started by Ingo Molnar and Thomas Gleixner
 *  Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
 *  Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
 *
 *  PRIVATE futexes by Eric Dumazet
 *  Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com>
 *
 *  Requeue-PI support by Darren Hart <dvhltc@us.ibm.com>
 *  Copyright (C) IBM Corporation, 2009
 *  Thanks to Thomas Gleixner for conceptual design and careful reviews.
 *
 *  Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
 *  enough at me, Linus for the original (flawed) idea, Matthew
 *  Kirkwood for proof-of-concept implementation.
 *
 *  "The futexes are also cursed."
 *  "But they come in a choice of three flavours!"
 */
#include <linux/compat.h>
#include <linux/jhash.h>
#include <linux/pagemap.h>
#include <linux/debugfs.h>
#include <linux/plist.h>
#include <linux/memblock.h>
#include <linux/fault-inject.h>
#include <linux/slab.h>

#include "futex.h"
#include "../locking/rtmutex_common.h"
/*
 * The base of the bucket array and its size are always used together
 * (after initialization only in futex_hash()), so ensure that they
 * reside in the same cacheline.
 */
static struct {
	struct futex_hash_bucket *queues;
	unsigned long		 hashsize;
} __futex_data __read_mostly __aligned(2*sizeof(long));

#define futex_queues	(__futex_data.queues)
#define futex_hashsize	(__futex_data.hashsize)
/*
 * Fault injections for futexes.
 */
#ifdef CONFIG_FAIL_FUTEX

static struct {
	struct fault_attr attr;

	bool ignore_private;
} fail_futex = {
	.attr = FAULT_ATTR_INITIALIZER,
	.ignore_private = false,
};

static int __init setup_fail_futex(char *str)
{
	return setup_fault_attr(&fail_futex.attr, str);
}
__setup("fail_futex=", setup_fail_futex);

bool should_fail_futex(bool fshared)
{
	if (fail_futex.ignore_private && !fshared)
		return false;

	return should_fail(&fail_futex.attr, 1);
}

#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS

static int __init fail_futex_debugfs(void)
{
	umode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
	struct dentry *dir;

	dir = fault_create_debugfs_attr("fail_futex", NULL,
					&fail_futex.attr);
	if (IS_ERR(dir))
		return PTR_ERR(dir);

	debugfs_create_bool("ignore-private", mode, dir,
			    &fail_futex.ignore_private);
	return 0;
}

late_initcall(fail_futex_debugfs);

#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */

#endif /* CONFIG_FAIL_FUTEX */
/**
 * futex_hash - Return the hash bucket in the global hash
 * @key:	Pointer to the futex key for which the hash is calculated
 *
 * We hash on the keys returned from get_futex_key (see below) and return the
 * corresponding hash bucket in the global hash.
 */
struct futex_hash_bucket *futex_hash(union futex_key *key)
{
	u32 hash = jhash2((u32 *)key, offsetof(typeof(*key), both.offset) / 4,
			  key->both.offset);

	return &futex_queues[hash & (futex_hashsize - 1)];
}
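
/*
 * Illustrative property of the hashing above: futex_hashsize is always a
 * power of two (see futex_init() below), so the mask is a cheap modulo and
 * any two tasks that derive an identical futex key resolve to the same
 * bucket, e.g.:
 *
 *	if (futex_match(&q1->key, &q2->key))
 *		WARN_ON(futex_hash(&q1->key) != futex_hash(&q2->key));
 *
 * which is what lets a waker find the waiters queued on the same futex.
 */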
/**
 * futex_setup_timer - set up the sleeping hrtimer.
 * @time:	ptr to the given timeout value
 * @timeout:	the hrtimer_sleeper structure to be set up
 * @flags:	futex flags
 * @range_ns:	optional range in ns
 *
 * Return: Initialized hrtimer_sleeper structure or NULL if no timeout
 *	   value given
 */
struct hrtimer_sleeper *
futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout,
		  int flags, u64 range_ns)
{
	if (!time)
		return NULL;

	hrtimer_setup_sleeper_on_stack(timeout,
				       (flags & FLAGS_CLOCKRT) ? CLOCK_REALTIME : CLOCK_MONOTONIC,
				       HRTIMER_MODE_ABS);
	/*
	 * If range_ns is 0, calling hrtimer_set_expires_range_ns() is
	 * effectively the same as calling hrtimer_set_expires().
	 */
	hrtimer_set_expires_range_ns(&timeout->timer, *time, range_ns);

	return timeout;
}
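
/*
 * Sketch of the typical caller pattern, assuming a wait-style function that
 * received an absolute @time from userspace (the surrounding wait logic is
 * elided):
 *
 *	struct hrtimer_sleeper timeout, *to;
 *
 *	to = futex_setup_timer(time, &timeout, flags, current->timer_slack_ns);
 *	if (to)
 *		hrtimer_sleeper_start_expires(to, HRTIMER_MODE_ABS);
 *
 *	... block until woken, signalled or !to->task ...
 *
 *	if (to) {
 *		hrtimer_cancel(&to->timer);
 *		destroy_hrtimer_on_stack(&to->timer);
 *	}
 */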
/*
 * Generate a machine wide unique identifier for this inode.
 *
 * This relies on u64 not wrapping in the life-time of the machine; which with
 * 1ns resolution means almost 585 years.
 *
 * This further relies on the fact that a well formed program will not unmap
 * the file while it has a (shared) futex waiting on it. This mapping will have
 * a file reference which pins the mount and inode.
 *
 * If for some reason an inode gets evicted and read back in again, it will get
 * a new sequence number and will _NOT_ match, even though it is the exact same
 * file.
 *
 * It is important that futex_match() will never have a false-positive, esp.
 * for PI futexes that can mess up the state. The above argues that false-negatives
 * are only possible for malformed programs.
 */
static u64 get_inode_sequence_number(struct inode *inode)
{
	static atomic64_t i_seq;
	u64 old;

	/* Does the inode already have a sequence number? */
	old = atomic64_read(&inode->i_sequence);
	if (likely(old))
		return old;

	for (;;) {
		u64 new = atomic64_inc_return(&i_seq);
		if (WARN_ON_ONCE(!new))
			continue;

		if (!atomic64_try_cmpxchg_relaxed(&inode->i_sequence, &old, new))
			return old;
		return new;
	}
}
/**
 * get_futex_key() - Get parameters which are the keys for a futex
 * @uaddr:	virtual address of the futex
 * @flags:	FLAGS_*
 * @key:	address where result is stored.
 * @rw:		mapping needs to be read/write (values: FUTEX_READ,
 *		FUTEX_WRITE)
 *
 * Return: a negative error code or 0
 *
 * The key words are stored in @key on success.
 *
 * For shared mappings (when @fshared), the key is:
 *
 *   ( inode->i_sequence, page->index, offset_within_page )
 *
 * [ also see get_inode_sequence_number() ]
 *
 * For private mappings (or when !@fshared), the key is:
 *
 *   ( current->mm, address, 0 )
 *
 * This allows (cross process, where applicable) identification of the futex
 * without keeping the page pinned for the duration of the FUTEX_WAIT.
 *
 * lock_page() might sleep, the caller should not hold a spinlock.
 */
int get_futex_key(u32 __user *uaddr, unsigned int flags, union futex_key *key,
		  enum futex_access rw)
{
	unsigned long address = (unsigned long)uaddr;
	struct mm_struct *mm = current->mm;
	struct page *page;
	struct folio *folio;
	struct address_space *mapping;
	int err, ro = 0;
	bool fshared;

	fshared = flags & FLAGS_SHARED;

	/*
	 * The futex address must be "naturally" aligned.
	 */
	key->both.offset = address % PAGE_SIZE;
	if (unlikely((address % sizeof(u32)) != 0))
		return -EINVAL;
	address -= key->both.offset;

	if (unlikely(!access_ok(uaddr, sizeof(u32))))
		return -EFAULT;

	if (unlikely(should_fail_futex(fshared)))
		return -EFAULT;

	/*
	 * PROCESS_PRIVATE futexes are fast.
	 * As the mm cannot disappear under us and the 'key' only needs
	 * virtual address, we dont even have to find the underlying vma.
	 * Note : We do have to check 'uaddr' is a valid user address,
	 *        but access_ok() should be faster than find_vma()
	 */
	if (!fshared) {
		/*
		 * On no-MMU, shared futexes are treated as private, therefore
		 * we must not include the current process in the key. Since
		 * there is only one address space, the address is a unique key
		 * on its own.
		 */
		if (IS_ENABLED(CONFIG_MMU))
			key->private.mm = mm;
		else
			key->private.mm = NULL;

		key->private.address = address;
		return 0;
	}

again:
	/* Ignore any VERIFY_READ mapping (futex common case) */
	if (unlikely(should_fail_futex(true)))
		return -EFAULT;

	err = get_user_pages_fast(address, 1, FOLL_WRITE, &page);
	/*
	 * If write access is not required (eg. FUTEX_WAIT), try
	 * and get read-only access.
	 */
	if (err == -EFAULT && rw == FUTEX_READ) {
		err = get_user_pages_fast(address, 1, 0, &page);
		ro = 1;
	}
	if (err < 0)
		return err;
	else
		err = 0;

	/*
	 * The treatment of mapping from this point on is critical. The folio
	 * lock protects many things but in this context the folio lock
	 * stabilizes mapping, prevents inode freeing in the shared
	 * file-backed region case and guards against movement to swap cache.
	 *
	 * Strictly speaking the folio lock is not needed in all cases being
	 * considered here and folio lock forces unnecessary serialization.
	 * From this point on, mapping will be re-verified if necessary and
	 * folio lock will be acquired only if it is unavoidable.
	 *
	 * Mapping checks require the folio so it is looked up now. For
	 * anonymous pages, it does not matter if the folio is split
	 * in the future as the key is based on the address. For
	 * filesystem-backed pages, the precise page is required as the
	 * index of the page determines the key.
	 */
	folio = page_folio(page);
	mapping = READ_ONCE(folio->mapping);

	/*
	 * If folio->mapping is NULL, then it cannot be an anonymous
	 * page; but it might be the ZERO_PAGE or in the gate area or
	 * in a special mapping (all cases which we are happy to fail);
	 * or it may have been a good file page when get_user_pages_fast
	 * found it, but truncated or holepunched or subjected to
	 * invalidate_complete_page2 before we got the folio lock (also
	 * cases which we are happy to fail). And we hold a reference,
	 * so refcount care in invalidate_inode_page's remove_mapping
	 * prevents drop_caches from setting mapping to NULL beneath us.
	 *
	 * The case we do have to guard against is when memory pressure made
	 * shmem_writepage move it from filecache to swapcache beneath us:
	 * an unlikely race, but we do need to retry for folio->mapping.
	 */
	if (unlikely(!mapping)) {
		int shmem_swizzled;

		/*
		 * Folio lock is required to identify which special case above
		 * applies. If this is really a shmem page then the folio lock
		 * will prevent unexpected transitions.
		 */
		folio_lock(folio);
		shmem_swizzled = folio_test_swapcache(folio) || folio->mapping;
		folio_unlock(folio);
		folio_put(folio);

		if (shmem_swizzled)
			goto again;

		return -EFAULT;
	}

	/*
	 * Private mappings are handled in a simple way.
	 *
	 * If the futex key is stored in anonymous memory, then the associated
	 * object is the mm which is implicitly pinned by the calling process.
	 *
	 * NOTE: When userspace waits on a MAP_SHARED mapping, even if
	 * it's a read-only handle, it's expected that futexes attach to
	 * the object not the particular process.
	 */
	if (folio_test_anon(folio)) {
		/*
		 * A RO anonymous page will never change and thus doesn't make
		 * sense for futex operations.
		 */
		if (unlikely(should_fail_futex(true)) || ro) {
			err = -EFAULT;
			goto out;
		}

		key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */
		key->private.mm = mm;
		key->private.address = address;

	} else {
		struct inode *inode;

		/*
		 * The associated futex object in this case is the inode and
		 * the folio->mapping must be traversed. Ordinarily this should
		 * be stabilised under folio lock but it's not strictly
		 * necessary in this case as we just want to pin the inode, not
		 * update i_pages or anything like that.
		 *
		 * The RCU read lock is taken as the inode is finally freed
		 * under RCU. If the mapping still matches expectations then the
		 * mapping->host can be safely accessed as being a valid inode.
		 */
		rcu_read_lock();

		if (READ_ONCE(folio->mapping) != mapping) {
			rcu_read_unlock();
			folio_put(folio);

			goto again;
		}

		inode = READ_ONCE(mapping->host);
		if (!inode) {
			rcu_read_unlock();
			folio_put(folio);

			goto again;
		}

		key->both.offset |= FUT_OFF_INODE; /* inode-based key */
		key->shared.i_seq = get_inode_sequence_number(inode);
		key->shared.pgoff = page_pgoff(folio, page);
		rcu_read_unlock();
	}

out:
	folio_put(folio);
	return err;
}
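
/*
 * Illustrative userspace-side sketch of why the inode-based key matters: two
 * unrelated processes that map the same file MAP_SHARED and operate on the
 * same offset end up with an identical (i_seq, pgoff, offset) key, so a
 * FUTEX_WAKE in one process finds waiters queued by the other. The file name
 * is hypothetical and error handling is elided:
 *
 *	int fd = open("/dev/shm/example-lock", O_RDWR);
 *	uint32_t *futex = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
 *			       MAP_SHARED, fd, 0);
 *
 *	process A:	syscall(SYS_futex, futex, FUTEX_WAIT, 0, NULL, NULL, 0);
 *	process B:	*futex = 1;
 *			syscall(SYS_futex, futex, FUTEX_WAKE, 1, NULL, NULL, 0);
 *
 * Had either side passed FUTEX_PRIVATE_FLAG, the key would have been the
 * (mm, address) pair instead and the two processes would not match.
 */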
/**
 * fault_in_user_writeable() - Fault in user address and verify RW access
 * @uaddr:	pointer to faulting user space address
 *
 * Slow path to fixup the fault we just took in the atomic write
 * access to @uaddr.
 *
 * We have no generic implementation of a non-destructive write to the
 * user address. We know that we faulted in the atomic pagefault
 * disabled section so we can as well avoid the #PF overhead by
 * calling get_user_pages() right away.
 */
int fault_in_user_writeable(u32 __user *uaddr)
{
	struct mm_struct *mm = current->mm;
	int ret;

	mmap_read_lock(mm);
	ret = fixup_user_fault(mm, (unsigned long)uaddr,
			       FAULT_FLAG_WRITE, NULL);
	mmap_read_unlock(mm);

	return ret < 0 ? ret : 0;
}
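
/*
 * Sketch of the pattern callers follow (compare handle_futex_death() below):
 * the atomic user space update runs with pagefaults disabled, and only when
 * it reports -EFAULT is the fault handled here, outside of any hash bucket
 * lock, before the operation is retried:
 *
 *	err = futex_cmpxchg_value_locked(&curval, uaddr, uval, newval);
 *	if (err == -EFAULT) {
 *		... drop hb->lock if held ...
 *		if (fault_in_user_writeable(uaddr))
 *			return -EFAULT;
 *		goto retry;
 *	}
 */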
/**
 * futex_top_waiter() - Return the highest priority waiter on a futex
 * @hb:		the hash bucket the futex_q's reside in
 * @key:	the futex key (to distinguish it from other futex futex_q's)
 *
 * Must be called with the hb lock held.
 */
struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb, union futex_key *key)
{
	struct futex_q *this;

	plist_for_each_entry(this, &hb->chain, list) {
		if (futex_match(&this->key, key))
			return this;
	}
	return NULL;
}
/**
 * wait_for_owner_exiting - Block until the owner has exited
 * @ret: owner's current futex lock status
 * @exiting:	Pointer to the exiting task
 *
 * Caller must hold a refcount on @exiting.
 */
void wait_for_owner_exiting(int ret, struct task_struct *exiting)
{
	if (ret != -EBUSY) {
		WARN_ON_ONCE(exiting);
		return;
	}

	if (WARN_ON_ONCE(ret == -EBUSY && !exiting))
		return;

	mutex_lock(&exiting->futex_exit_mutex);
	/*
	 * No point in doing state checking here. If the waiter got here
	 * while the task was in exec()->exec_futex_release() then it can
	 * have any FUTEX_STATE_* value when the waiter has acquired the
	 * mutex. OK, if running, EXITING or DEAD if it reached exit()
	 * already. Highly unlikely and not a problem. Just one more round
	 * through the futex maze.
	 */
	mutex_unlock(&exiting->futex_exit_mutex);

	put_task_struct(exiting);
}
/**
 * __futex_unqueue() - Remove the futex_q from its futex_hash_bucket
 * @q:	The futex_q to unqueue
 *
 * The q->lock_ptr must not be NULL and must be held by the caller.
 */
void __futex_unqueue(struct futex_q *q)
{
	struct futex_hash_bucket *hb;

	if (WARN_ON_SMP(!q->lock_ptr) || WARN_ON(plist_node_empty(&q->list)))
		return;
	lockdep_assert_held(q->lock_ptr);

	hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock);
	plist_del(&q->list, &hb->chain);
	futex_hb_waiters_dec(hb);
}
/* The key must be already stored in q->key. */
struct futex_hash_bucket *futex_q_lock(struct futex_q *q)
	__acquires(&hb->lock)
{
	struct futex_hash_bucket *hb;

	hb = futex_hash(&q->key);

	/*
	 * Increment the counter before taking the lock so that
	 * a potential waker won't miss a to-be-slept task that is
	 * waiting for the spinlock. This is safe as all futex_q_lock()
	 * users end up calling futex_queue(). Similarly, for housekeeping,
	 * decrement the counter at futex_q_unlock() when some error has
	 * occurred and we don't end up adding the task to the list.
	 */
	futex_hb_waiters_inc(hb); /* implies smp_mb(); (A) */

	q->lock_ptr = &hb->lock;

	spin_lock(&hb->lock);
	return hb;
}

void futex_q_unlock(struct futex_hash_bucket *hb)
	__releases(&hb->lock)
{
	spin_unlock(&hb->lock);
	futex_hb_waiters_dec(hb);
}
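
/*
 * Sketch of the wake-side counterpart to the ordering established above:
 * because the waiter count is incremented before hb->lock is taken (barrier
 * (A)), a waker can cheaply skip buckets that cannot contain waiters before
 * paying for the spinlock, roughly:
 *
 *	hb = futex_hash(&key);
 *	if (!futex_hb_waiters_pending(hb))
 *		return ret;
 *	spin_lock(&hb->lock);
 *	... walk hb->chain and wake matching waiters ...
 */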
void __futex_queue(struct futex_q *q, struct futex_hash_bucket *hb)
{
	int prio;

	/*
	 * The priority used to register this element is
	 * - either the real thread-priority for the real-time threads
	 * (i.e. threads with a priority lower than MAX_RT_PRIO)
	 * - or MAX_RT_PRIO for non-RT threads.
	 * Thus, all RT-threads are woken first in priority order, and
	 * the others are woken last, in FIFO order.
	 */
	prio = min(current->normal_prio, MAX_RT_PRIO);

	plist_node_init(&q->list, prio);
	plist_add(&q->list, &hb->chain);
	q->task = current;
}
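
/*
 * Worked example of the priority mapping above, assuming MAX_RT_PRIO == 100:
 *
 *	SCHED_FIFO, rt_priority 50  ->  normal_prio 49   ->  prio = min(49, 100)  = 49
 *	SCHED_OTHER, nice 0         ->  normal_prio 120  ->  prio = min(120, 100) = 100
 *
 * Lower plist priority values sort first and equal values keep insertion
 * order, which yields "RT waiters first by priority, then everybody else in
 * FIFO order".
 */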
/**
 * futex_unqueue() - Remove the futex_q from its futex_hash_bucket
 * @q:	The futex_q to unqueue
 *
 * The q->lock_ptr must not be held by the caller. A call to futex_unqueue() must
 * be paired with exactly one earlier call to futex_queue().
 *
 * Return:
 *  - 1 - if the futex_q was still queued (and we removed it);
 *  - 0 - if the futex_q was already removed by the waking thread
 */
int futex_unqueue(struct futex_q *q)
{
	spinlock_t *lock_ptr;
	int ret = 0;

	/* In the common case we don't take the spinlock, which is nice. */
retry:
	/*
	 * q->lock_ptr can change between this read and the following spin_lock.
	 * Use READ_ONCE to forbid the compiler from reloading q->lock_ptr and
	 * optimizing lock_ptr out of the logic below.
	 */
	lock_ptr = READ_ONCE(q->lock_ptr);
	if (lock_ptr != NULL) {
		spin_lock(lock_ptr);
		/*
		 * q->lock_ptr can change between reading it and
		 * spin_lock(), causing us to take the wrong lock. This
		 * corrects the race condition.
		 *
		 * Reasoning goes like this: if we have the wrong lock,
		 * q->lock_ptr must have changed (maybe several times)
		 * between reading it and the spin_lock(). It can
		 * change again after the spin_lock() but only if it was
		 * already changed before the spin_lock(). It cannot,
		 * however, change back to the original value. Therefore
		 * we can detect whether we acquired the correct lock.
		 */
		if (unlikely(lock_ptr != q->lock_ptr)) {
			spin_unlock(lock_ptr);
			goto retry;
		}
		__futex_unqueue(q);

		BUG_ON(q->pi_state);

		spin_unlock(lock_ptr);
		ret = 1;
	}

	return ret;
}
/*
 * PI futexes can not be requeued and must remove themselves from the hash
 * bucket. The hash bucket lock (i.e. lock_ptr) is held.
 */
void futex_unqueue_pi(struct futex_q *q)
{
	/*
	 * If the lock was not acquired (due to timeout or signal) then the
	 * rt_waiter is removed before futex_q is. If this is observed by
	 * an unlocker after dropping the rtmutex wait lock and before
	 * acquiring the hash bucket lock, then the unlocker dequeues the
	 * futex_q from the hash bucket list to guarantee consistent state
	 * vs. userspace. Therefore the dequeue here must be conditional.
	 */
	if (!plist_node_empty(&q->list))
		__futex_unqueue(q);

	BUG_ON(!q->pi_state);
	put_pi_state(q->pi_state);
	q->pi_state = NULL;
}
/* Constants for the pending_op argument of handle_futex_death */
#define HANDLE_DEATH_PENDING	true
#define HANDLE_DEATH_LIST	false

/*
 * Process a futex-list entry, check whether it's owned by the
 * dying task, and do notification if so:
 */
static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr,
			      bool pi, bool pending_op)
{
	u32 uval, nval, mval;
	pid_t owner;
	int err;

	/* Futex address must be 32bit aligned */
	if ((((unsigned long)uaddr) % sizeof(*uaddr)) != 0)
		return -1;

retry:
	if (get_user(uval, uaddr))
		return -1;

	/*
	 * Special case for regular (non PI) futexes. The unlock path in
	 * user space has two race scenarios:
	 *
	 * 1. The unlock path releases the user space futex value and
	 *    before it can execute the futex() syscall to wake up
	 *    waiters it is killed.
	 *
	 * 2. A woken up waiter is killed before it can acquire the
	 *    futex in user space.
	 *
	 * In the second case, the wake up notification could be generated
	 * by the unlock path in user space after setting the futex value
	 * to zero or by the kernel after setting the OWNER_DIED bit below.
	 *
	 * In both cases the TID validation below prevents a wakeup of
	 * potential waiters which can cause these waiters to block
	 * forever.
	 *
	 * In both cases the following conditions are met:
	 *
	 *	1) task->robust_list->list_op_pending != NULL
	 *	   @pending_op == true
	 *	2) The owner part of user space futex value == 0
	 *	3) Regular futex: @pi == false
	 *
	 * If these conditions are met, it is safe to attempt waking up a
	 * potential waiter without touching the user space futex value and
	 * trying to set the OWNER_DIED bit. If the futex value is zero,
	 * the rest of the user space mutex state is consistent, so a woken
	 * waiter will just take over the uncontended futex. Setting the
	 * OWNER_DIED bit would create inconsistent state and malfunction
	 * of the user space owner died handling. Otherwise, the OWNER_DIED
	 * bit is already set, and the woken waiter is expected to deal with
	 * this.
	 */
	owner = uval & FUTEX_TID_MASK;

	if (pending_op && !pi && !owner) {
		futex_wake(uaddr, FLAGS_SIZE_32 | FLAGS_SHARED, 1,
			   FUTEX_BITSET_MATCH_ANY);
		return 0;
	}

	if (owner != task_pid_vnr(curr))
		return 0;

	/*
	 * Ok, this dying thread is truly holding a futex
	 * of interest. Set the OWNER_DIED bit atomically
	 * via cmpxchg, and if the value had FUTEX_WAITERS
	 * set, wake up a waiter (if any). (We have to do a
	 * futex_wake() even if OWNER_DIED is already set -
	 * to handle the rare but possible case of recursive
	 * thread-death.) The rest of the cleanup is done in
	 * userspace.
	 */
	mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;

	/*
	 * We are not holding a lock here, but we want to have
	 * the pagefault_disable/enable() protection because
	 * we want to handle the fault gracefully. If the
	 * access fails we try to fault in the futex with R/W
	 * verification via get_user_pages. get_user() above
	 * does not guarantee R/W access. If that fails we
	 * give up and leave the futex locked.
	 */
	if ((err = futex_cmpxchg_value_locked(&nval, uaddr, uval, mval))) {
		switch (err) {
		case -EFAULT:
			if (fault_in_user_writeable(uaddr))
				return -1;
			goto retry;

		case -EAGAIN:
			cond_resched();
			goto retry;

		default:
			WARN_ON_ONCE(1);
			return err;
		}
	}

	if (nval != uval)
		goto retry;

	/*
	 * Wake robust non-PI futexes here. The wakeup of
	 * PI futexes happens in exit_pi_state():
	 */
	if (!pi && (uval & FUTEX_WAITERS)) {
		futex_wake(uaddr, FLAGS_SIZE_32 | FLAGS_SHARED, 1,
			   FUTEX_BITSET_MATCH_ANY);
	}

	return 0;
}
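
/*
 * Sketch of the user space value layout this relies on (see the robust futex
 * ABI documentation): the owner stores its TID in the low bits, contention is
 * flagged with FUTEX_WAITERS, and the kernel announces an abnormal exit with
 * FUTEX_OWNER_DIED. A glibc-style robust mutex acquire after the owner died
 * then looks roughly like:
 *
 *	uint32_t old = 0;
 *	if (atomic_compare_exchange_strong(&lock->futex, &old, my_tid))
 *		... got the lock, previous state was clean ...
 *	else if (old & FUTEX_OWNER_DIED)
 *		... take over the lock, report EOWNERDEAD to the caller ...
 *
 * The lock structure and the exact recovery policy are user space matters;
 * the kernel only guarantees the OWNER_DIED bit and the wakeup above.
 */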
/*
 * Fetch a robust-list pointer. Bit 0 signals PI futexes:
 */
static inline int fetch_robust_entry(struct robust_list __user **entry,
				     struct robust_list __user * __user *head,
				     unsigned int *pi)
{
	unsigned long uentry;

	if (get_user(uentry, (unsigned long __user *)head))
		return -EFAULT;

	*entry = (void __user *)(uentry & ~1UL);
	*pi = uentry & 1;

	return 0;
}
/*
 * Walk curr->robust_list (very carefully, it's a userspace list!)
 * and mark any locks found there dead, and notify any waiters.
 *
 * We silently return on any sign of list-walking problem.
 */
static void exit_robust_list(struct task_struct *curr)
{
	struct robust_list_head __user *head = curr->robust_list;
	struct robust_list __user *entry, *next_entry, *pending;
	unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
	unsigned int next_pi;
	unsigned long futex_offset;
	int rc;

	/*
	 * Fetch the list head (which was registered earlier, via
	 * sys_set_robust_list()):
	 */
	if (fetch_robust_entry(&entry, &head->list.next, &pi))
		return;
	/*
	 * Fetch the relative futex offset:
	 */
	if (get_user(futex_offset, &head->futex_offset))
		return;
	/*
	 * Fetch any possibly pending lock-add first, and handle it
	 * if it exists:
	 */
	if (fetch_robust_entry(&pending, &head->list_op_pending, &pip))
		return;

	next_entry = NULL;	/* avoid warning with gcc */
	while (entry != &head->list) {
		/*
		 * Fetch the next entry in the list before calling
		 * handle_futex_death:
		 */
		rc = fetch_robust_entry(&next_entry, &entry->next, &next_pi);
		/*
		 * A pending lock might already be on the list, so
		 * don't process it twice:
		 */
		if (entry != pending) {
			if (handle_futex_death((void __user *)entry + futex_offset,
					       curr, pi, HANDLE_DEATH_LIST))
				return;
		}
		if (rc)
			return;
		entry = next_entry;
		pi = next_pi;
		/*
		 * Avoid excessively long or circular lists:
		 */
		if (!--limit)
			break;

		cond_resched();
	}

	if (pending) {
		handle_futex_death((void __user *)pending + futex_offset,
				   curr, pip, HANDLE_DEATH_PENDING);
	}
}
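
/*
 * Sketch of the user space side this walks, assuming a C library that keeps
 * one robust_list_head per thread (field names follow the robust-futex ABI;
 * the lock type and list manipulation are hypothetical and simplified):
 *
 *	struct my_lock {
 *		struct robust_list list;
 *		uint32_t futex;
 *	};
 *
 *	struct robust_list_head head = {
 *		.list		 = { &head.list },	(empty: points to itself)
 *		.futex_offset	 = offsetof(struct my_lock, futex),
 *		.list_op_pending = NULL,
 *	};
 *	syscall(SYS_set_robust_list, &head, sizeof(head));
 *
 *	lock:	head.list_op_pending = &lock->list;	(announce the intent)
 *		... acquire lock->futex, link lock->list into head.list ...
 *		head.list_op_pending = NULL;
 *
 * When the thread dies, the walk above visits each linked entry, adds
 * futex_offset to reach the futex word and hands it to handle_futex_death().
 */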
#ifdef CONFIG_COMPAT
static void __user *futex_uaddr(struct robust_list __user *entry,
				compat_long_t futex_offset)
{
	compat_uptr_t base = ptr_to_compat(entry);
	void __user *uaddr = compat_ptr(base + futex_offset);

	return uaddr;
}

/*
 * Fetch a robust-list pointer. Bit 0 signals PI futexes:
 */
static inline int
compat_fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry,
		compat_uptr_t __user *head, unsigned int *pi)
{
	if (get_user(*uentry, head))
		return -EFAULT;

	*entry = compat_ptr((*uentry) & ~1);
	*pi = (unsigned int)(*uentry) & 1;

	return 0;
}
/*
 * Walk curr->compat_robust_list (very carefully, it's a userspace list!)
 * and mark any locks found there dead, and notify any waiters.
 *
 * We silently return on any sign of list-walking problem.
 */
static void compat_exit_robust_list(struct task_struct *curr)
{
	struct compat_robust_list_head __user *head = curr->compat_robust_list;
	struct robust_list __user *entry, *next_entry, *pending;
	unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
	unsigned int next_pi;
	compat_uptr_t uentry, next_uentry, upending;
	compat_long_t futex_offset;
	int rc;

	/*
	 * Fetch the list head (which was registered earlier, via
	 * sys_set_robust_list()):
	 */
	if (compat_fetch_robust_entry(&uentry, &entry, &head->list.next, &pi))
		return;
	/*
	 * Fetch the relative futex offset:
	 */
	if (get_user(futex_offset, &head->futex_offset))
		return;
	/*
	 * Fetch any possibly pending lock-add first, and handle it
	 * if it exists:
	 */
	if (compat_fetch_robust_entry(&upending, &pending,
				      &head->list_op_pending, &pip))
		return;

	next_entry = NULL;	/* avoid warning with gcc */
	while (entry != (struct robust_list __user *) &head->list) {
		/*
		 * Fetch the next entry in the list before calling
		 * handle_futex_death:
		 */
		rc = compat_fetch_robust_entry(&next_uentry, &next_entry,
					       (compat_uptr_t __user *)&entry->next, &next_pi);
		/*
		 * A pending lock might already be on the list, so
		 * don't process it twice:
		 */
		if (entry != pending) {
			void __user *uaddr = futex_uaddr(entry, futex_offset);

			if (handle_futex_death(uaddr, curr, pi,
					       HANDLE_DEATH_LIST))
				return;
		}
		if (rc)
			return;
		uentry = next_uentry;
		entry = next_entry;
		pi = next_pi;
		/*
		 * Avoid excessively long or circular lists:
		 */
		if (!--limit)
			break;

		cond_resched();
	}
	if (pending) {
		void __user *uaddr = futex_uaddr(pending, futex_offset);

		handle_futex_death(uaddr, curr, pip, HANDLE_DEATH_PENDING);
	}
}
#endif /* CONFIG_COMPAT */
#ifdef CONFIG_FUTEX_PI

/*
 * This task is holding PI mutexes at exit time => bad.
 * Kernel cleans up PI-state, but userspace is likely hosed.
 * (Robust-futex cleanup is separate and might save the day for userspace.)
 */
static void exit_pi_state_list(struct task_struct *curr)
{
	struct list_head *next, *head = &curr->pi_state_list;
	struct futex_pi_state *pi_state;
	struct futex_hash_bucket *hb;
	union futex_key key = FUTEX_KEY_INIT;

	/*
	 * We are a ZOMBIE and nobody can enqueue itself on
	 * pi_state_list anymore, but we have to be careful
	 * versus waiters unqueueing themselves:
	 */
	raw_spin_lock_irq(&curr->pi_lock);
	while (!list_empty(head)) {
		next = head->next;
		pi_state = list_entry(next, struct futex_pi_state, list);
		key = pi_state->key;
		hb = futex_hash(&key);

		/*
		 * We can race against put_pi_state() removing itself from the
		 * list (a waiter going away). put_pi_state() will first
		 * decrement the reference count and then modify the list, so
		 * it's possible to see the list entry but fail this reference
		 * acquire.
		 *
		 * In that case; drop the locks to let put_pi_state() make
		 * progress and retry the loop.
		 */
		if (!refcount_inc_not_zero(&pi_state->refcount)) {
			raw_spin_unlock_irq(&curr->pi_lock);
			cpu_relax();
			raw_spin_lock_irq(&curr->pi_lock);
			continue;
		}
		raw_spin_unlock_irq(&curr->pi_lock);

		spin_lock(&hb->lock);
		raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
		raw_spin_lock(&curr->pi_lock);
		/*
		 * We dropped the pi-lock, so re-check whether this
		 * task still owns the PI-state:
		 */
		if (head->next != next) {
			/* retain curr->pi_lock for the loop invariant */
			raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
			spin_unlock(&hb->lock);
			put_pi_state(pi_state);
			continue;
		}

		WARN_ON(pi_state->owner != curr);
		WARN_ON(list_empty(&pi_state->list));
		list_del_init(&pi_state->list);
		pi_state->owner = NULL;

		raw_spin_unlock(&curr->pi_lock);
		raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
		spin_unlock(&hb->lock);

		rt_mutex_futex_unlock(&pi_state->pi_mutex);
		put_pi_state(pi_state);

		raw_spin_lock_irq(&curr->pi_lock);
	}
	raw_spin_unlock_irq(&curr->pi_lock);
}
#else
static inline void exit_pi_state_list(struct task_struct *curr) { }
#endif
static void futex_cleanup(struct task_struct *tsk)
{
	if (unlikely(tsk->robust_list)) {
		exit_robust_list(tsk);
		tsk->robust_list = NULL;
	}

#ifdef CONFIG_COMPAT
	if (unlikely(tsk->compat_robust_list)) {
		compat_exit_robust_list(tsk);
		tsk->compat_robust_list = NULL;
	}
#endif

	if (unlikely(!list_empty(&tsk->pi_state_list)))
		exit_pi_state_list(tsk);
}
/**
 * futex_exit_recursive - Set the task's futex state to FUTEX_STATE_DEAD
 * @tsk:	task to set the state on
 *
 * Set the futex exit state of the task lockless. The futex waiter code
 * observes that state when a task is exiting and loops until the task has
 * actually finished the futex cleanup. The worst case for this is that the
 * waiter runs through the wait loop until the state becomes visible.
 *
 * This is called from the recursive fault handling path in make_task_dead().
 *
 * This is best effort. Either the futex exit code has run already or
 * not. If the OWNER_DIED bit has been set on the futex then the waiter can
 * take it over. If not, the problem is pushed back to user space. If the
 * futex exit code did not run yet, then an already queued waiter might
 * block forever, but there is nothing which can be done about that.
 */
void futex_exit_recursive(struct task_struct *tsk)
{
	/* If the state is FUTEX_STATE_EXITING then futex_exit_mutex is held */
	if (tsk->futex_state == FUTEX_STATE_EXITING)
		mutex_unlock(&tsk->futex_exit_mutex);
	tsk->futex_state = FUTEX_STATE_DEAD;
}
static void futex_cleanup_begin(struct task_struct *tsk)
{
	/*
	 * Prevent various race issues against a concurrent incoming waiter
	 * including live locks by forcing the waiter to block on
	 * tsk->futex_exit_mutex when it observes FUTEX_STATE_EXITING in
	 * attach_to_pi_owner().
	 */
	mutex_lock(&tsk->futex_exit_mutex);

	/*
	 * Switch the state to FUTEX_STATE_EXITING under tsk->pi_lock.
	 *
	 * This ensures that all subsequent checks of tsk->futex_state in
	 * attach_to_pi_owner() must observe FUTEX_STATE_EXITING with
	 * tsk->pi_lock held.
	 *
	 * It guarantees also that a pi_state which was queued right before
	 * the state change under tsk->pi_lock by a concurrent waiter must
	 * be observed in exit_pi_state_list().
	 */
	raw_spin_lock_irq(&tsk->pi_lock);
	tsk->futex_state = FUTEX_STATE_EXITING;
	raw_spin_unlock_irq(&tsk->pi_lock);
}
static void futex_cleanup_end(struct task_struct *tsk, int state)
{
	/*
	 * Lockless store. The only side effect is that an observer might
	 * take another loop until it becomes visible.
	 */
	tsk->futex_state = state;
	/*
	 * Drop the exit protection. This unblocks waiters which observed
	 * FUTEX_STATE_EXITING to reevaluate the state.
	 */
	mutex_unlock(&tsk->futex_exit_mutex);
}
void futex_exec_release(struct task_struct *tsk)
{
	/*
	 * The state handling is done for consistency, but in the case of
	 * exec() there is no way to prevent further damage as the PID stays
	 * the same. But for the unlikely and arguably buggy case that a
	 * futex is held on exec(), this provides at least as much state
	 * consistency protection as is possible.
	 */
	futex_cleanup_begin(tsk);
	futex_cleanup(tsk);
	/*
	 * Reset the state to FUTEX_STATE_OK. The task is alive and about
	 * to exec a new binary.
	 */
	futex_cleanup_end(tsk, FUTEX_STATE_OK);
}

void futex_exit_release(struct task_struct *tsk)
{
	futex_cleanup_begin(tsk);
	futex_cleanup(tsk);
	futex_cleanup_end(tsk, FUTEX_STATE_DEAD);
}
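
/*
 * Sketch of the exit vs. waiter handshake the helpers above implement, in the
 * interleaving style used elsewhere in the futex code:
 *
 *	exiting task				PI waiter
 *	------------				---------
 *	futex_cleanup_begin()
 *	  mutex_lock(futex_exit_mutex)
 *	  futex_state = EXITING		attach_to_pi_owner()
 *					  sees EXITING -> returns -EBUSY
 *	futex_cleanup()			wait_for_owner_exiting()
 *	  exit_robust_list()		  mutex_lock(futex_exit_mutex)
 *	  exit_pi_state_list()		  (blocks)
 *	futex_cleanup_end(DEAD)
 *	  futex_state = DEAD
 *	  mutex_unlock(futex_exit_mutex)  -> waiter unblocks, retries and
 *					     observes OWNER_DIED / DEAD state
 */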
static int __init futex_init(void)
{
	unsigned int futex_shift;
	unsigned long i;

#ifdef CONFIG_BASE_SMALL
	futex_hashsize = 16;
#else
	futex_hashsize = roundup_pow_of_two(256 * num_possible_cpus());
#endif

	futex_queues = alloc_large_system_hash("futex", sizeof(*futex_queues),
					       futex_hashsize, 0, 0,
					       &futex_shift, NULL,
					       futex_hashsize, futex_hashsize);
	futex_hashsize = 1UL << futex_shift;

	for (i = 0; i < futex_hashsize; i++) {
		atomic_set(&futex_queues[i].waiters, 0);
		plist_head_init(&futex_queues[i].chain);
		spin_lock_init(&futex_queues[i].lock);
	}

	return 0;
}
core_initcall(futex_init);
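
/*
 * Worked example of the sizing above, assuming CONFIG_BASE_SMALL=n and a
 * machine with 8 possible CPUs: 256 * 8 = 2048 is already a power of two, so
 * futex_hashsize starts at 2048; alloc_large_system_hash() then reports the
 * shift it actually used (11 here) and futex_hashsize becomes
 * 1UL << 11 = 2048 buckets, each with its own lock, plist and waiter counter.
 */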