/*	$NetBSD: kern_synch.c,v 1.273 2009/12/05 22:38:19 pooka Exp $	*/

/*-
 * Copyright (c) 1999, 2000, 2004, 2006, 2007, 2008, 2009
 *    The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
 * NASA Ames Research Center, by Charles M. Hannum, Andrew Doran and
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*-
 * Copyright (c) 1982, 1986, 1990, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)kern_synch.c	8.9 (Berkeley) 5/19/95
 */
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_synch.c,v 1.273 2009/12/05 22:38:19 pooka Exp $");

#include "opt_kstack.h"
#include "opt_perfctrs.h"

#define	__MUTEX_PRIVATE

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/savar.h>
#include <sys/syscall_stats.h>
#include <sys/sleepq.h>
#include <sys/lockdebug.h>
#include <sys/evcnt.h>
#include <sys/lwpctl.h>
#include <sys/atomic.h>
#include <sys/simplelock.h>

#include <uvm/uvm_extern.h>

#include <dev/lockstat.h>
static void	sched_unsleep(struct lwp *, bool);
static void	sched_changepri(struct lwp *, pri_t);
static void	sched_lendpri(struct lwp *, pri_t);
static void	resched_cpu(struct lwp *);
syncobj_t sleep_syncobj = {
	SOBJ_SLEEPQ_SORTED, sleepq_unsleep, sleepq_changepri,
	sleepq_lendpri, syncobj_noowner,
};

syncobj_t sched_syncobj = {
	SOBJ_SLEEPQ_SORTED, sched_unsleep, sched_changepri,
	sched_lendpri, syncobj_noowner,
};
callout_t	sched_pstats_ch;
unsigned	sched_pstats_ticks;
kcondvar_t	lbolt;			/* once a second sleep address */

/* Preemption event counters */
static struct evcnt kpreempt_ev_crit;
static struct evcnt kpreempt_ev_klock;
static struct evcnt kpreempt_ev_immed;
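/*
 * Note: the counters above are attached below with evcnt_attach_dynamic()
 * under the "kpreempt" group, so the deferred and immediate preemption
 * counts can be inspected at run time with vmstat(1) -e.
 */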
/*
 * During autoconfiguration or after a panic, a sleep will simply lower the
 * priority briefly to allow interrupts, then return.  The priority to be
 * used (safepri) is machine-dependent, thus this value is initialized and
 * maintained in the machine-dependent layers.  This priority will typically
 * be 0, or the lowest priority that is safe for use on the interrupt stack;
 * it can be made higher to block network software interrupts after panics.
 */
	cv_init(&lbolt, "lbolt");
	callout_init(&sched_pstats_ch, CALLOUT_MPSAFE);
	callout_setfunc(&sched_pstats_ch, sched_pstats, NULL);

	evcnt_attach_dynamic(&kpreempt_ev_crit, EVCNT_TYPE_MISC, NULL,
	    "kpreempt", "defer: critical section");
	evcnt_attach_dynamic(&kpreempt_ev_klock, EVCNT_TYPE_MISC, NULL,
	    "kpreempt", "defer: kernel_lock");
	evcnt_attach_dynamic(&kpreempt_ev_immed, EVCNT_TYPE_MISC, NULL,
	    "kpreempt", "immediate");
/*
 * General sleep call.  Suspends the current LWP until a wakeup is
 * performed on the specified identifier.  The LWP will then be made
 * runnable with the specified priority.  Sleeps at most timo/hz seconds (0
 * means no timeout).  If pri includes the PCATCH flag, signals are checked
 * before and after sleeping, otherwise signals are not checked.  Returns 0
 * if awakened, EWOULDBLOCK if the timeout expires.  If PCATCH is set and a
 * signal needs to be delivered, ERESTART is returned if the current system
 * call should be restarted if possible, and EINTR is returned if the system
 * call should be interrupted by the signal.
 *
 * The interlock is held until we are on a sleep queue.  The interlock will
 * be locked before returning back to the caller unless the PNORELOCK flag
 * is specified, in which case the interlock will always be unlocked upon
 * return.
 */
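/*
 * Illustrative usage sketch (not from the original file; the softc, flag
 * and wmesg names are made up).  The simplelock interlock is re-acquired
 * on return because PNORELOCK is not passed:
 *
 *	simple_lock(&sc->sc_slock);
 *	while ((sc->sc_flags & SC_BUSY) != 0) {
 *		error = ltsleep(&sc->sc_flags, PZERO | PCATCH, "scbusy",
 *		    hz, &sc->sc_slock);
 *		if (error != 0)
 *			break;
 *	}
 *	simple_unlock(&sc->sc_slock);
 */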
int
ltsleep(wchan_t ident, pri_t priority, const char *wmesg, int timo,
	volatile struct simplelock *interlock)
{
	struct lwp *l = curlwp;
	sleepq_t *sq;
	kmutex_t *mp;
	int error;

	KASSERT((l->l_pflag & LP_INTR) == 0);
	KASSERT(ident != &lbolt);

	if (sleepq_dontsleep(l)) {
		(void)sleepq_abort(NULL, 0);
		if ((priority & PNORELOCK) != 0)
			simple_unlock(interlock);
		return 0;
	}

	l->l_kpriority = true;
	sq = sleeptab_lookup(&sleeptab, ident, &mp);
	sleepq_enter(sq, l, mp);
	sleepq_enqueue(sq, ident, wmesg, &sleep_syncobj);

	if (interlock != NULL) {
		KASSERT(simple_lock_held(interlock));
		simple_unlock(interlock);
	}

	error = sleepq_block(timo, priority & PCATCH);

	if (interlock != NULL && (priority & PNORELOCK) == 0)
		simple_lock(interlock);

	return error;
}
int
mtsleep(wchan_t ident, pri_t priority, const char *wmesg, int timo,
	kmutex_t *mtx)
{
	struct lwp *l = curlwp;
	sleepq_t *sq;
	kmutex_t *mp;
	int error;

	KASSERT((l->l_pflag & LP_INTR) == 0);
	KASSERT(ident != &lbolt);

	if (sleepq_dontsleep(l)) {
		(void)sleepq_abort(mtx, (priority & PNORELOCK) != 0);
		return 0;
	}

	l->l_kpriority = true;
	sq = sleeptab_lookup(&sleeptab, ident, &mp);
	sleepq_enter(sq, l, mp);
	sleepq_enqueue(sq, ident, wmesg, &sleep_syncobj);
	mutex_exit(mtx);
	error = sleepq_block(timo, priority & PCATCH);

	if ((priority & PNORELOCK) == 0)
		mutex_enter(mtx);

	return error;
}
/*
 * General sleep call for situations where a wake-up is not expected.
 */
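/*
 * Illustrative usage sketch (not from the original file): pause for
 * roughly 100ms, not interruptible by signals, with no interlock;
 * "pause" is an arbitrary wmesg:
 *
 *	(void)kpause("pause", false, hz / 10, NULL);
 */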
int
kpause(const char *wmesg, bool intr, int timo, kmutex_t *mtx)
{
	struct lwp *l = curlwp;
	sleepq_t *sq;
	kmutex_t *mp;
	int error;

	if (sleepq_dontsleep(l))
		return sleepq_abort(NULL, 0);

	if (mtx != NULL)
		mutex_exit(mtx);
	l->l_kpriority = true;
	sq = sleeptab_lookup(&sleeptab, l, &mp);
	sleepq_enter(sq, l, mp);
	sleepq_enqueue(sq, l, wmesg, &sleep_syncobj);
	error = sleepq_block(timo, intr);
	if (mtx != NULL)
		mutex_enter(mtx);

	return error;
}
/*
 * We believe this lwp is an SA lwp.  If it's yielding,
 * let it know it needs to wake up.
 *
 * We are called and exit with the lwp locked.  We are
 * called in the middle of wakeup operations, so we need
 * to not touch the locks at all.
 */
void
sa_awaken(struct lwp *l)
{
	/* LOCK_ASSERT(lwp_locked(l, NULL)); */

	if (l == l->l_savp->savp_lwp && l->l_flag & LW_SA_YIELD)
		l->l_flag &= ~LW_SA_IDLE;
}
/*
 * Make all LWPs sleeping on the specified identifier runnable.
 */
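/*
 * Illustrative pairing with the ltsleep() sketch above (not from the
 * original file; names are made up): the waking side clears the
 * condition and then makes every LWP sleeping on the same identifier
 * runnable:
 *
 *	simple_lock(&sc->sc_slock);
 *	sc->sc_flags &= ~SC_BUSY;
 *	wakeup(&sc->sc_flags);
 *	simple_unlock(&sc->sc_slock);
 */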
void
wakeup(wchan_t ident)
{
	sleepq_t *sq;
	kmutex_t *mp;

	if (__predict_false(cold))
		return;

	sq = sleeptab_lookup(&sleeptab, ident, &mp);
	sleepq_wake(sq, ident, (u_int)-1, mp);
}
/*
 * Make the highest priority LWP first in line on the specified
 * identifier runnable.
 */
void
wakeup_one(wchan_t ident)
{
	sleepq_t *sq;
	kmutex_t *mp;

	if (__predict_false(cold))
		return;

	sq = sleeptab_lookup(&sleeptab, ident, &mp);
	sleepq_wake(sq, ident, 1, mp);
}
/*
 * General yield call.  Puts the current LWP back on its run queue and
 * performs a voluntary context switch.  Should only be called when the
 * current LWP explicitly requests it (e.g. sched_yield(2)).
 */
void
yield(void)
{
	struct lwp *l = curlwp;

	KERNEL_UNLOCK_ALL(l, &l->l_biglocks);
	lwp_lock(l);
	KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_lwplock));
	KASSERT(l->l_stat == LSONPROC);
	l->l_kpriority = false;
	(void)mi_switch(l);
	KERNEL_LOCK(l->l_biglocks, l);
}
/*
 * General preemption call.  Puts the current LWP back on its run queue
 * and performs an involuntary context switch.
 */
void
preempt(void)
{
	struct lwp *l = curlwp;

	KERNEL_UNLOCK_ALL(l, &l->l_biglocks);
	lwp_lock(l);
	KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_lwplock));
	KASSERT(l->l_stat == LSONPROC);
	l->l_kpriority = false;
	l->l_nivcsw++;
	(void)mi_switch(l);
	KERNEL_LOCK(l->l_biglocks, l);
}
/*
 * Handle a request made by another agent to preempt the current LWP
 * in-kernel.  Usually called when l_dopreempt may be non-zero.
 *
 * Character addresses for lockstat only.  These dummy variables exist
 * only so that their unique, stable addresses can be stored in
 * l_pfaillock and reported by lockstat as the reason a preemption
 * was deferred.
 */
static char	in_critical_section;
static char	kernel_lock_held;
static char	is_softint;
static char	cpu_kpreempt_enter_fail;
bool
kpreempt(uintptr_t where)
{

	while ((dop = l->l_dopreempt) != 0) {
		if (l->l_stat != LSONPROC) {
			/*
			 * About to block (or die), let it happen.
			 * Doesn't really count as "preemption has
			 * been blocked", since we're going to
			 * context switch.
			 */

		if (__predict_false((l->l_flag & LW_IDLE) != 0)) {
			/* Can't preempt idle loop, don't count as failure. */

		if (__predict_false(l->l_nopreempt != 0)) {
			/* LWP holds preemption disabled, explicitly. */
			if ((dop & DOPREEMPT_COUNTED) == 0) {
				kpreempt_ev_crit.ev_count++;

			failed = (uintptr_t)&in_critical_section;

		if (__predict_false((l->l_pflag & LP_INTR) != 0)) {
			/* Can't preempt soft interrupts yet. */

			failed = (uintptr_t)&is_softint;

		if (__predict_false(l->l_blcnt != 0 ||
		    curcpu()->ci_biglock_wanted != NULL)) {
			/* Hold or want kernel_lock, code is not MT safe. */
			if ((dop & DOPREEMPT_COUNTED) == 0) {
				kpreempt_ev_klock.ev_count++;

			failed = (uintptr_t)&kernel_lock_held;

		if (__predict_false(!cpu_kpreempt_enter(where, s))) {
			/*
			 * It may be that the IPL is too high.
			 * kpreempt_enter() can schedule an
			 * interrupt to retry later.
			 */

			failed = (uintptr_t)&cpu_kpreempt_enter_fail;

		if (__predict_true((dop & DOPREEMPT_COUNTED) == 0)) {
			kpreempt_ev_immed.ev_count++;

		/* Take care of any MD cleanup. */
		cpu_kpreempt_exit(where);

	if (__predict_true(!failed)) {

	/* Record preemption failure for reporting via lockstat. */
	atomic_or_uint(&l->l_dopreempt, DOPREEMPT_COUNTED);

	LOCKSTAT_ENTER(lsflag);
	if (__predict_false(lsflag)) {

		where = (uintptr_t)__builtin_return_address(0);

		/* Preemption is on, might recurse, so make it atomic. */
		if (atomic_cas_ptr_ni((void *)&l->l_pfailaddr, NULL,
		    (void *)where) == NULL) {
			LOCKSTAT_START_TIMER(lsflag, l->l_pfailtime);
			l->l_pfaillock = failed;
		}
	}
	LOCKSTAT_EXIT(lsflag);
/*
 * Return true if preemption is explicitly disabled.
 */
bool
kpreempt_disabled(void)
{
	const lwp_t *l = curlwp;

	return l->l_nopreempt != 0 || l->l_stat == LSZOMB ||
	    (l->l_flag & LW_IDLE) != 0 || cpu_kpreempt_disabled();
}
/*
 * Disable kernel preemption.
 */
void
kpreempt_disable(void)
{

	KPREEMPT_DISABLE(curlwp);
}

/*
 * Reenable kernel preemption.
 */
void
kpreempt_enable(void)
{

	KPREEMPT_ENABLE(curlwp);
}
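/*
 * Illustrative usage sketch (not from the original file): bracket a
 * short critical section so the LWP cannot be preempted and migrated
 * away from its current CPU while touching per-CPU state:
 *
 *	kpreempt_disable();
 *	... access curcpu()-private data ...
 *	kpreempt_enable();
 */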
/*
 * Compute the amount of time during which the current lwp was running.
 *
 * - update l_rtime unless it's an idle lwp.
 */
void
updatertime(lwp_t *l, const struct bintime *now)
{

	if (__predict_false(l->l_flag & LW_IDLE))
		return;

	/* rtime += now - stime */
	bintime_add(&l->l_rtime, now);
	bintime_sub(&l->l_rtime, &l->l_stime);
}
/*
 * Select next LWP from the current CPU to run.
 */
static inline lwp_t *
nextlwp(struct cpu_info *ci, struct schedstate_percpu *spc)
{
	lwp_t *newl;

	/*
	 * Let sched_nextlwp() select the LWP to run the CPU next.
	 * If no LWP is runnable, select the idle LWP.
	 *
	 * Note that spc_lwplock might not necessarily be held, and
	 * the new thread would be unlocked after setting the LWP-lock.
	 */
	newl = sched_nextlwp();
	if (newl != NULL) {
		KASSERT(lwp_locked(newl, spc->spc_mutex));
		KASSERT(newl->l_cpu == ci);
		newl->l_stat = LSONPROC;
		newl->l_pflag |= LP_RUNNING;
		lwp_setlock(newl, spc->spc_lwplock);
	} else {
		newl = ci->ci_data.cpu_idlelwp;
		newl->l_stat = LSONPROC;
		newl->l_pflag |= LP_RUNNING;
	}

	/*
	 * Only clear want_resched if there are no pending (slow)
	 * software interrupts.
	 */
	ci->ci_want_resched = ci->ci_data.cpu_softints;
	spc->spc_flags &= ~SPCF_SWITCHCLEAR;
	spc->spc_curpriority = lwp_eprio(newl);

	return newl;
}
/*
 * The machine independent parts of context switch.
 *
 * Returns 1 if another LWP was actually run.
 */
int
mi_switch(lwp_t *l)
{
	struct schedstate_percpu *spc;

	KASSERT(lwp_locked(l, NULL));
	KASSERT(kpreempt_disabled());
	LOCKDEBUG_BARRIER(l->l_mutex, 1);

	kstack_check_magic(l);

	KASSERT((l->l_pflag & LP_RUNNING) != 0);
	KASSERT(l->l_cpu == curcpu());
	spc = &ci->ci_schedstate;

	/*
	 * If we have been asked to switch to a specific LWP, then there
	 * is no need to inspect the run queues.  If a soft interrupt is
	 * blocking, then return to the interrupted thread without adjusting
	 * VM context or its start time: neither have been changed in order
	 * to take the interrupt.
	 */
	if (l->l_switchto != NULL) {
		if ((l->l_pflag & LP_INTR) != 0) {

			if ((l->l_pflag & LP_TIMEINTR) != 0)

		}
		newl = l->l_switchto;
		l->l_switchto = NULL;
	}
#ifndef __HAVE_FAST_SOFTINTS
	else if (ci->ci_data.cpu_softints != 0) {
		/* There are pending soft interrupts, so pick one. */
		newl = softint_picklwp();
		newl->l_stat = LSONPROC;
		newl->l_pflag |= LP_RUNNING;
	}
#endif	/* !__HAVE_FAST_SOFTINTS */

	/* Count time spent in current system call */
	SYSCALL_TIME_SLEEP(l);

	/*
	 * XXXSMP If we are using h/w performance counters,
	 * save context.
	 */
	if (PMC_ENABLED(l->l_proc)) {
		pmc_save_context(l->l_proc);
	}

	/* Lock the runqueue */
	KASSERT(l->l_stat != LSRUN);
	mutex_spin_enter(spc->spc_mutex);

	/*
	 * If on the CPU and we have gotten this far, then we must yield.
	 */
	if (l->l_stat == LSONPROC && l != newl) {
		KASSERT(lwp_locked(l, spc->spc_lwplock));
		if ((l->l_flag & LW_IDLE) == 0) {

			lwp_setlock(l, spc->spc_mutex);
			sched_enqueue(l, true);
			/* Handle migration case */
			KASSERT(spc->spc_migrating == NULL);
			if (l->l_target_cpu != NULL) {
				spc->spc_migrating = l;

	/* Pick new LWP to run. */
	newl = nextlwp(ci, spc);

	/* Items that must be updated with the CPU locked. */

	/* Update the new LWP's start time. */

	/*
	 * ci_curlwp changes when a fast soft interrupt occurs.
	 * We use cpu_onproc to keep track of which kernel or
	 * user thread is running 'underneath' the software
	 * interrupt.  This is important for time accounting,
	 * itimers and forcing user threads to preempt (aston).
	 */
	ci->ci_data.cpu_onproc = newl;

	/*
	 * Preemption related tasks.  Must be done with the current
	 * CPU locked.
	 */
	if (__predict_false(l->l_pfailaddr != 0)) {
		LOCKSTAT_FLAG(lsflag);
		LOCKSTAT_ENTER(lsflag);
		LOCKSTAT_STOP_TIMER(lsflag, l->l_pfailtime);
		LOCKSTAT_EVENT_RA(lsflag, l->l_pfaillock, LB_NOPREEMPT|LB_SPIN,
		    1, l->l_pfailtime, l->l_pfailaddr);
		LOCKSTAT_EXIT(lsflag);
	}

	/* Release all locks, but leave the current LWP locked */
	if (l->l_mutex == spc->spc_mutex) {
		/*
		 * Drop spc_lwplock, if the current LWP has been moved
		 * to the run queue (it is now locked by spc_mutex).
		 */
		mutex_spin_exit(spc->spc_lwplock);
	} else {
		/*
		 * Otherwise, drop the spc_mutex, we are done with the
		 * run queues.
		 */
		mutex_spin_exit(spc->spc_mutex);
	}

	/*
	 * Mark that context switch is going to be performed
	 * for this LWP, to protect it from being switched
	 * to another CPU in the meantime.
	 */
	KASSERT(l->l_ctxswtch == 0);
	l->l_ctxswtch = 1;

	KASSERT((l->l_pflag & LP_RUNNING) != 0);
	l->l_pflag &= ~LP_RUNNING;

	/*
	 * Increase the count of spin-mutexes before the release
	 * of the last lock - we must remain at IPL_SCHED during
	 * the context switch.
	 */
	oldspl = MUTEX_SPIN_OLDSPL(ci);

	/* Count the context switch on this CPU. */
	ci->ci_data.cpu_nswtch++;

	/* Update status for lwpctl, if present. */
	if (l->l_lwpctl != NULL)
		l->l_lwpctl->lc_curcpu = LWPCTL_CPU_NONE;

	/*
	 * Save old VM context, unless a soft interrupt
	 * handler is blocking.
	 */

	/*
	 * We may need to spin-wait if 'newl' is still
	 * context switching on another CPU.
	 */
	if (__predict_false(newl->l_ctxswtch != 0)) {

		count = SPINLOCK_BACKOFF_MIN;
		while (newl->l_ctxswtch)
			SPINLOCK_BACKOFF(count);
	}

	/* Switch to the new LWP. */
	prevlwp = cpu_switchto(l, newl, returning);

	/*
	 * Switched away - we have new curlwp.
	 * Restore VM context and IPL.
	 */
	if (prevlwp != NULL) {
		/* Normalize the count of the spin-mutexes */

		/* Unmark the state of context switch */
		prevlwp->l_ctxswtch = 0;
	}

	/* Update status for lwpctl, if present. */
	if (l->l_lwpctl != NULL) {
		l->l_lwpctl->lc_curcpu = (int)cpu_index(ci);
		l->l_lwpctl->lc_pctr++;
	}

	KASSERT(l->l_cpu == ci);

	/* Nothing to do - just unlock and return. */
	mutex_spin_exit(spc->spc_mutex);

	KASSERT(l == curlwp);
	KASSERT(l->l_stat == LSONPROC);

	/*
	 * XXXSMP If we are using h/w performance counters, restore context.
	 * XXXSMP preemption problem.
	 */
	if (PMC_ENABLED(l->l_proc)) {
		pmc_restore_context(l->l_proc);
	}

	SYSCALL_TIME_WAKEUP(l);
	LOCKDEBUG_BARRIER(NULL, 1);
/*
 * The machine independent parts of context switch to oblivion.
 * Does not return.  Call with the LWP unlocked.
 */
void
lwp_exit_switchaway(lwp_t *l)
{

	KASSERT(kpreempt_disabled());
	KASSERT(l->l_stat == LSZOMB || l->l_stat == LSIDL);
	KASSERT(ci == curcpu());
	LOCKDEBUG_BARRIER(NULL, 0);

	kstack_check_magic(l);

	/* Count time spent in current system call */
	SYSCALL_TIME_SLEEP(l);

	/* Must stay at IPL_SCHED even after releasing run queue lock. */

	/*
	 * Let sched_nextlwp() select the LWP to run the CPU next.
	 * If no LWP is runnable, select the idle LWP.
	 *
	 * Note that spc_lwplock might not necessarily be held, and
	 * the new thread would be unlocked after setting the LWP-lock.
	 */
#ifndef __HAVE_FAST_SOFTINTS
	if (ci->ci_data.cpu_softints != 0) {
		/* There are pending soft interrupts, so pick one. */
		newl = softint_picklwp();
		newl->l_stat = LSONPROC;
		newl->l_pflag |= LP_RUNNING;
	} else
#endif	/* !__HAVE_FAST_SOFTINTS */
	{
		newl = nextlwp(ci, &ci->ci_schedstate);
	}

	/* Update the new LWP's start time. */

	l->l_pflag &= ~LP_RUNNING;

	/*
	 * ci_curlwp changes when a fast soft interrupt occurs.
	 * We use cpu_onproc to keep track of which kernel or
	 * user thread is running 'underneath' the software
	 * interrupt.  This is important for time accounting,
	 * itimers and forcing user threads to preempt (aston).
	 */
	ci->ci_data.cpu_onproc = newl;

	/*
	 * Preemption related tasks.  Must be done with the current
	 * CPU locked.
	 */

	/* Unlock the run queue. */

	/* Count the context switch on this CPU. */
	ci->ci_data.cpu_nswtch++;

	/* Update status for lwpctl, if present. */
	if (l->l_lwpctl != NULL)
		l->l_lwpctl->lc_curcpu = LWPCTL_CPU_EXITED;

	/*
	 * We may need to spin-wait if 'newl' is still
	 * context switching on another CPU.
	 */
	if (__predict_false(newl->l_ctxswtch != 0)) {

		count = SPINLOCK_BACKOFF_MIN;
		while (newl->l_ctxswtch)
			SPINLOCK_BACKOFF(count);
	}

	/* Switch to the new LWP. */
	(void)cpu_switchto(NULL, newl, false);

	for (;;) continue;	/* XXX: convince gcc about "noreturn" */
}
/*
 * setrunnable: change LWP state to be runnable, placing it on the run queue.
 *
 * Call with the process and LWP locked.  Will return with the LWP unlocked.
 */
void
setrunnable(struct lwp *l)
{
	struct proc *p = l->l_proc;

	KASSERT((l->l_flag & LW_IDLE) == 0);
	KASSERT(mutex_owned(p->p_lock));
	KASSERT(lwp_locked(l, NULL));
	KASSERT(l->l_mutex != l->l_cpu->ci_schedstate.spc_mutex);

		/*
		 * If we're being traced (possibly because someone attached us
		 * while we were stopped), check for a signal from the debugger.
		 */
		if ((p->p_slflag & PSL_TRACED) != 0 && p->p_xstat != 0)

		l->l_flag &= ~LW_WSUSPEND;

		cv_broadcast(&p->p_lwpcv);

		KASSERT(l->l_wchan != NULL);

		panic("setrunnable: lwp %p state was %d", l, l->l_stat);

	/*
	 * If the LWP was sleeping interruptibly, then it's OK to start it
	 * again.  If not, mark it as still sleeping.
	 */
	if (l->l_wchan != NULL) {

		/* lwp_unsleep() will release the lock. */
		lwp_unsleep(l, true);

	/*
	 * If the LWP is still on the CPU, mark it as LSONPROC.  It may be
	 * about to call mi_switch(), in which case it will yield.
	 */
	if ((l->l_pflag & LP_RUNNING) != 0) {
		l->l_stat = LSONPROC;

	/*
	 * Look for a CPU to run.
	 * Set the LWP runnable.
	 */
	ci = sched_takecpu(l);

	lwp_unlock_to(l, ci->ci_schedstate.spc_mutex);
	sched_setrunnable(l);

	sched_enqueue(l, false);
/*
 * Convert all non-LW_SYSTEM LSSLEEP or LSRUN LWPs to LSSUSPENDED.
 */
	CPU_INFO_ITERATOR cii;
	struct cpu_info *ci;

	/*
	 * We do this by process in order not to violate the locking rules.
	 */
	mutex_enter(proc_lock);
	PROCLIST_FOREACH(p, &allproc) {
		if ((p->p_flag & PK_MARKER) != 0)
			continue;

		mutex_enter(p->p_lock);
		if ((p->p_flag & PK_SYSTEM) != 0) {
			mutex_exit(p->p_lock);
			continue;
		}

		LIST_FOREACH(l, &p->p_lwps, l_sibling) {

			/*
			 * Set LW_WREBOOT so that the LWP will suspend itself
			 * when it tries to return to user mode.  We want to
			 * try to get as many LWPs as possible to
			 * the user / kernel boundary, so that they will
			 * release any locks that they hold.
			 */
			l->l_flag |= (LW_WREBOOT | LW_WSUSPEND);

			if (l->l_stat == LSSLEEP &&
			    (l->l_flag & LW_SINTR) != 0) {
				/* setrunnable() will release the lock. */

		mutex_exit(p->p_lock);

	mutex_exit(proc_lock);

	/*
	 * Kick all CPUs to make them preempt any LWPs running in user mode.
	 * They'll trap into the kernel and suspend themselves in userret().
	 */
	for (CPU_INFO_FOREACH(cii, ci)) {

		cpu_need_resched(ci, RESCHED_IMMED);
/*
 * This is called when the LWP has not been awoken normally but instead
 * interrupted: for example, if the sleep timed out.  Because of this,
 * it's not a valid action for running or idle LWPs.
 */
static void
sched_unsleep(struct lwp *l, bool cleanup)
{

	panic("sched_unsleep");
}
static void
resched_cpu(struct lwp *l)
{
	struct cpu_info *ci = l->l_cpu;

	KASSERT(lwp_locked(l, NULL));
	if (lwp_eprio(l) > ci->ci_schedstate.spc_curpriority)
		cpu_need_resched(ci, 0);
}
static void
sched_changepri(struct lwp *l, pri_t pri)
{

	KASSERT(lwp_locked(l, NULL));

	if (l->l_stat == LSRUN) {
		KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_mutex));
		sched_dequeue(l);
		l->l_priority = pri;
		sched_enqueue(l, false);
	} else {
		l->l_priority = pri;
	}
}
static void
sched_lendpri(struct lwp *l, pri_t pri)
{

	KASSERT(lwp_locked(l, NULL));

	if (l->l_stat == LSRUN) {
		KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_mutex));
		sched_dequeue(l);
		l->l_inheritedprio = pri;
		sched_enqueue(l, false);
	} else {
		l->l_inheritedprio = pri;
	}
}
struct lwp *
syncobj_noowner(wchan_t wchan)
{

	return NULL;
}
/* Decay 95% of proc::p_pctcpu in 60 seconds, ccpu = exp(-1/20) */
const fixpt_t ccpu = 0.95122942450071400909 * FSCALE;
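/*
 * sched_pstats() below runs roughly once per second and scales each
 * %CPU estimate by ccpu, so after 60 seconds the remaining weight is
 * ccpu^60 = exp(-60/20) = exp(-3) ~= 0.05 - i.e. about 95% of the old
 * estimate has decayed, matching the comment above.
 */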
/*
 * Update process statistics and check CPU resource allocation.
 * Call scheduler-specific hook to eventually adjust process/LWP
 * priorities.
 */
void
sched_pstats(void *arg)
{
	const int clkhz = (stathz != 0 ? stathz : hz);
	static bool backwards;
	struct rlimit *rlim;

	sched_pstats_ticks++;

	mutex_enter(proc_lock);
	PROCLIST_FOREACH(p, &allproc) {
		if (__predict_false((p->p_flag & PK_MARKER) != 0))
			continue;

		/* Increment sleep time (if sleeping), ignore overflow. */
		mutex_enter(p->p_lock);
		runtm = p->p_rtime.sec;
		LIST_FOREACH(l, &p->p_lwps, l_sibling) {
			if (__predict_false((l->l_flag & LW_IDLE) != 0))
				continue;

			runtm += l->l_rtime.sec;

			l->l_pctcpu = (l->l_pctcpu * ccpu) >> FSHIFT;
			if (l->l_slptime != 0)
				continue;

			lpctcpu = l->l_pctcpu;
			lcpticks = atomic_swap_uint(&l->l_cpticks, 0);
			lpctcpu += ((FSCALE - ccpu) *
			    (lcpticks * FSCALE / clkhz)) >> FSHIFT;
			l->l_pctcpu = lpctcpu;
		}

		/* Calculating p_pctcpu only for ps(1) */
		p->p_pctcpu = (p->p_pctcpu * ccpu) >> FSHIFT;

		/*
		 * Check if the process exceeds its CPU resource allocation.
		 * If over max, kill it.
		 */
		rlim = &p->p_rlimit[RLIMIT_CPU];

		if (__predict_false(runtm >= rlim->rlim_cur)) {
			if (runtm >= rlim->rlim_max)
				sig = SIGKILL;
			else {
				sig = SIGXCPU;
				if (rlim->rlim_cur < rlim->rlim_max)
					rlim->rlim_cur += 5;
			}
		}

		mutex_exit(p->p_lock);
		if (__predict_false(runtm < 0)) {
			if (!backwards) {
				backwards = true;
				printf("WARNING: negative runtime; "
				    "monotonic clock has gone backwards\n");
			}
		} else if (__predict_false(sig)) {
			KASSERT((p->p_flag & PK_SYSTEM) == 0);

		}
	}
	mutex_exit(proc_lock);

	cv_broadcast(&lbolt);
	callout_schedule(&sched_pstats_ch, hz);
}