/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

#include <sys/types.h>
#include <sys/param.h>
#include <sys/sysmacros.h>
#include <sys/signal.h>
#include <sys/systm.h>
#include <sys/sysinfo.h>
#include <sys/errno.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/inline.h>
#include <sys/class.h>
#include <sys/bitmap.h>
#include <sys/cpuvar.h>
#include <sys/vtrace.h>
#include <sys/cpupart.h>
#include <sys/bitset.h>
#include <sys/schedctl.h>
#include <sys/atomic.h>
#include <sys/dtrace.h>
#include <sys/archsystm.h>
#include <sys/stdbool.h>
#define	BOUND_CPU	0x1
#define	BOUND_PARTITION	0x2
#define	BOUND_INTR	0x4

/* Dispatch queue allocation structure and functions */
struct disp_queue_info {
	disp_t	*dp;
	dispq_t	*olddispq;
	dispq_t	*newdispq;
	ulong_t	*olddqactmap;
	ulong_t	*newdqactmap;
	int	oldnglobpris;
};
static void	disp_dq_alloc(struct disp_queue_info *dptr, int numpris,
    disp_t *dp);
static void	disp_dq_assign(struct disp_queue_info *dptr, int numpris);
static void	disp_dq_free(struct disp_queue_info *dptr);
/* platform-specific routine to call when processor is idle */
static void	generic_idle_cpu();
void		(*idle_cpu)() = generic_idle_cpu;

/* routines invoked when a CPU enters/exits the idle loop */
static void	idle_enter();
static void	idle_exit();

/* platform-specific routine to call when thread is enqueued */
static void	generic_enq_thread(cpu_t *, int);
void		(*disp_enq_thread)(cpu_t *, int) = generic_enq_thread;

pri_t	kpreemptpri;		/* priority where kernel preemption applies */
pri_t	upreemptpri = 0;	/* priority where normal preemption applies */
pri_t	intr_pri;		/* interrupt thread priority base level */

#define	KPQPRI	-1		/* pri where cpu affinity is dropped for kpq */
pri_t	kpqpri = KPQPRI;	/* can be set in /etc/system */
disp_t	cpu0_disp;		/* boot CPU's dispatch queue */
int	nswapped;		/* total number of swapped threads */
static void	disp_swapped_setrun(kthread_t *tp);
static void	cpu_resched(cpu_t *cp, pri_t tpri);

/*
 * If this is set, only interrupt threads will cause kernel preemptions.
 * This is done by changing the value of kpreemptpri.  kpreemptpri
 * will either be the max sysclass pri + 1 or the min interrupt pri.
 */
int	only_intr_kpreempt;

extern void set_idle_cpu(int cpun);
extern void unset_idle_cpu(int cpun);
static void setkpdq(kthread_t *tp, int borf);
#define	SETKP_BACK	0
#define	SETKP_FRONT	1

/*
 * Parameter that determines how recently a thread must have run
 * on the CPU to be considered loosely-bound to that CPU to reduce
 * cold cache effects.  The interval is measured in clock ticks (lbolt).
 */
#define	RECHOOSE_INTERVAL	3
int	rechoose_interval = RECHOOSE_INTERVAL;

/*
 * Parameter that determines how long (in nanoseconds) a thread must
 * be sitting on a run queue before it can be stolen by another CPU
 * to reduce migrations.
 *
 * nosteal_nsec should be set by the platform code in
 * cmp_set_nosteal_interval() to an appropriate value.  It is set to
 * NOSTEAL_UNINITIALIZED here to indicate that it is uninitialized.
 * Setting nosteal_nsec to 0 effectively disables the nosteal 'protection'.
 */
#define	NOSTEAL_UNINITIALIZED	(-1)
hrtime_t nosteal_nsec = NOSTEAL_UNINITIALIZED;
extern void cmp_set_nosteal_interval(void);
id_t	defaultcid;	/* system "default" class; see dispadmin(1M) */

disp_lock_t	transition_lock;	/* lock on transitioning threads */
disp_lock_t	stop_lock;		/* lock on stopped threads */

static void	cpu_dispqalloc(int numpris);

/*
 * This gets returned by disp_getwork/disp_getbest if we couldn't steal
 * a thread because it was sitting on its run queue for a very short
 * period of time.
 */
#define	T_DONTSTEAL	(kthread_t *)(-1) /* returned by disp_getwork/getbest */
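/*
 * Note for callers: disp_getwork()/disp_getbest() can therefore return
 * three distinct values -- a thread pointer, NULL (nothing to steal), or
 * T_DONTSTEAL (there was stealable work, but it is too fresh to take right
 * now).  Callers such as the idle loop check for all three, for example:
 *
 *	if ((t = disp_getwork(cp)) == NULL || t == T_DONTSTEAL)
 *		continue;
 */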
static kthread_t	*disp_getwork(cpu_t *to);
static kthread_t	*disp_getbest(disp_t *from);
static kthread_t	*disp_ratify(kthread_t *tp, disp_t *kpq);

void	swtch_to(kthread_t *);

/*
 * dispatcher and scheduler initialization
 */

/*
 * disp_setup - Common code to calculate and allocate dispatcher
 *		variables and structures based on the maximum priority.
 */
static void
disp_setup(pri_t maxglobpri, pri_t oldnglobpris)
{
	pri_t	newnglobpris;

	ASSERT(MUTEX_HELD(&cpu_lock));

	newnglobpris = maxglobpri + 1 + LOCK_LEVEL;

	if (newnglobpris > oldnglobpris) {
		/*
		 * Allocate new kp queues for each CPU partition.
		 */
		cpupart_kpqalloc(newnglobpris);

		/*
		 * Allocate new dispatch queues for each CPU.
		 */
		cpu_dispqalloc(newnglobpris);

		/*
		 * compute new interrupt thread base priority
		 */
		intr_pri = maxglobpri;
		if (only_intr_kpreempt) {
			kpreemptpri = intr_pri + 1;
			if (kpqpri == KPQPRI)
				kpqpri = kpreemptpri;
		}
		v.v_nglobpris = newnglobpris;
	}
}
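/*
 * Layout of the global priority space implied by the calculation above:
 * priorities 0 .. maxglobpri come from the loaded scheduling classes, and
 * the LOCK_LEVEL slots above them are reserved for interrupt threads, so
 * v.v_nglobpris ends up as maxglobpri + 1 + LOCK_LEVEL.
 */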
/*
 * dispinit - Called to initialize all loaded classes and the
 *	      dispatcher framework.
 */
void
dispinit(void)
{
	id_t	cid;
	pri_t	maxglobpri;
	pri_t	cl_maxglobpri;

	maxglobpri = -1;

	/*
	 * Initialize transition lock, which will always be set.
	 */
	DISP_LOCK_INIT(&transition_lock);
	disp_lock_enter_high(&transition_lock);
	DISP_LOCK_INIT(&stop_lock);

	mutex_enter(&cpu_lock);
	CPU->cpu_disp->disp_maxrunpri = -1;
	CPU->cpu_disp->disp_max_unbound_pri = -1;

	/*
	 * Initialize the default CPU partition.
	 */
	cpupart_initialize_default();

	/*
	 * Call the class specific initialization functions for
	 * all pre-installed schedulers.
	 *
	 * We pass the size of a class specific parameter
	 * buffer to each of the initialization functions
	 * to try to catch problems with backward compatibility
	 * of class modules.
	 *
	 * For example a new class module running on an old system
	 * which didn't provide sufficiently large parameter buffers
	 * would be bad news. Class initialization modules can check for
	 * this and take action if they detect a problem.
	 */
	for (cid = 0; cid < nclass; cid++) {
		sclass_t	*sc;

		sc = &sclass[cid];
		if (SCHED_INSTALLED(sc)) {
			cl_maxglobpri = sc->cl_init(cid, PC_CLPARMSZ,
			    &sc->cl_funcs);
			if (cl_maxglobpri > maxglobpri)
				maxglobpri = cl_maxglobpri;
		}
	}
	kpreemptpri = (pri_t)v.v_maxsyspri + 1;
	if (kpqpri == KPQPRI)
		kpqpri = kpreemptpri;

	ASSERT(maxglobpri >= 0);
	disp_setup(maxglobpri, 0);

	mutex_exit(&cpu_lock);

	/*
	 * Platform specific sticky scheduler setup.
	 */
	if (nosteal_nsec == NOSTEAL_UNINITIALIZED)
		cmp_set_nosteal_interval();

	/*
	 * Get the default class ID; this may be later modified via
	 * dispadmin(1M).  This will load the class (normally TS) and that will
	 * call disp_add(), which is why we had to drop cpu_lock first.
	 */
	if (getcid(defaultclass, &defaultcid) != 0) {
		cmn_err(CE_PANIC, "Couldn't load default scheduling class '%s'",
		    defaultclass);
	}
}
/*
 * disp_add - Called with class pointer to initialize the dispatcher
 *	      for a newly loaded class.
 */
void
disp_add(sclass_t *clp)
{
	pri_t	maxglobpri;
	pri_t	cl_maxglobpri;

	mutex_enter(&cpu_lock);
	/*
	 * Initialize the scheduler class.
	 */
	maxglobpri = (pri_t)(v.v_nglobpris - LOCK_LEVEL - 1);
	cl_maxglobpri = clp->cl_init(clp - sclass, PC_CLPARMSZ, &clp->cl_funcs);
	if (cl_maxglobpri > maxglobpri)
		maxglobpri = cl_maxglobpri;

	/*
	 * Save old queue information.  Since we're initializing a
	 * new scheduling class which has just been loaded, then
	 * the size of the dispq may have changed.  We need to handle
	 * that here.
	 */
	disp_setup(maxglobpri, v.v_nglobpris);

	mutex_exit(&cpu_lock);
}
/*
 * For each CPU, allocate new dispatch queues
 * with the stated number of priorities.
 */
static void
cpu_dispqalloc(int numpris)
{
	cpu_t	*cpup;
	struct disp_queue_info	*disp_mem;
	int i, num;

	ASSERT(MUTEX_HELD(&cpu_lock));

	disp_mem = kmem_zalloc(NCPU *
	    sizeof (struct disp_queue_info), KM_SLEEP);

	/*
	 * This routine must allocate all of the memory before stopping
	 * the cpus because it must not sleep in kmem_alloc while the
	 * CPUs are stopped.  Locks they hold will not be freed until they
	 * are restarted.
	 */
	i = 0;
	cpup = cpu_list;
	do {
		disp_dq_alloc(&disp_mem[i], numpris, cpup->cpu_disp);
		i++;
		cpup = cpup->cpu_next;
	} while (cpup != cpu_list);
	num = i;

	pause_cpus(NULL, NULL);
	for (i = 0; i < num; i++)
		disp_dq_assign(&disp_mem[i], numpris);
	start_cpus();

	/*
	 * I must free all of the memory after starting the cpus because
	 * I can not risk sleeping in kmem_free while the cpus are stopped.
	 */
	for (i = 0; i < num; i++)
		disp_dq_free(&disp_mem[i]);

	kmem_free(disp_mem, NCPU * sizeof (struct disp_queue_info));
}
static void
disp_dq_alloc(struct disp_queue_info *dptr, int numpris, disp_t *dp)
{
	dptr->newdispq = kmem_zalloc(numpris * sizeof (dispq_t), KM_SLEEP);
	dptr->newdqactmap = kmem_zalloc(((numpris / BT_NBIPUL) + 1) *
	    sizeof (long), KM_SLEEP);
	dptr->dp = dp;
}
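/*
 * Sizing note: the active-queue bitmap needs one bit per priority, so
 * (numpris / BT_NBIPUL) + 1 words of BT_NBIPUL bits each are allocated,
 * which always rounds up to at least one full word.
 */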
static void
disp_dq_assign(struct disp_queue_info *dptr, int numpris)
{
	disp_t	*dp;

	dp = dptr->dp;
	dptr->olddispq = dp->disp_q;
	dptr->olddqactmap = dp->disp_qactmap;
	dptr->oldnglobpris = dp->disp_npri;

	ASSERT(dptr->oldnglobpris < numpris);

	if (dptr->olddispq != NULL) {
		/*
		 * Use kcopy because bcopy is platform-specific
		 * and could block while we might have paused the cpus.
		 */
		(void) kcopy(dptr->olddispq, dptr->newdispq,
		    dptr->oldnglobpris * sizeof (dispq_t));
		(void) kcopy(dptr->olddqactmap, dptr->newdqactmap,
		    ((dptr->oldnglobpris / BT_NBIPUL) + 1) *
		    sizeof (long));
	}
	dp->disp_q = dptr->newdispq;
	dp->disp_qactmap = dptr->newdqactmap;
	dp->disp_q_limit = &dptr->newdispq[numpris];
	dp->disp_npri = numpris;
}
static void
disp_dq_free(struct disp_queue_info *dptr)
{
	if (dptr->olddispq != NULL)
		kmem_free(dptr->olddispq,
		    dptr->oldnglobpris * sizeof (dispq_t));
	if (dptr->olddqactmap != NULL)
		kmem_free(dptr->olddqactmap,
		    ((dptr->oldnglobpris / BT_NBIPUL) + 1) * sizeof (long));
}
/*
 * For a newly created CPU, initialize the dispatch queue.
 * This is called before the CPU is known through cpu[] or on any lists.
 */
void
disp_cpu_init(cpu_t *cp)
{
	disp_t	*dp;
	dispq_t	*newdispq;
	ulong_t	*newdqactmap;

	ASSERT(MUTEX_HELD(&cpu_lock));	/* protect dispatcher queue sizes */

	if (cp == cpu0_disp.disp_cpu)
		dp = &cpu0_disp;
	else
		dp = kmem_alloc(sizeof (disp_t), KM_SLEEP);
	bzero(dp, sizeof (disp_t));
	cp->cpu_disp = dp;
	dp->disp_cpu = cp;
	dp->disp_maxrunpri = -1;
	dp->disp_max_unbound_pri = -1;
	DISP_LOCK_INIT(&cp->cpu_thread_lock);
	/*
	 * Allocate memory for the dispatcher queue headers
	 * and the active queue bitmap.
	 */
	newdispq = kmem_zalloc(v.v_nglobpris * sizeof (dispq_t), KM_SLEEP);
	newdqactmap = kmem_zalloc(((v.v_nglobpris / BT_NBIPUL) + 1) *
	    sizeof (long), KM_SLEEP);
	dp->disp_q = newdispq;
	dp->disp_qactmap = newdqactmap;
	dp->disp_q_limit = &newdispq[v.v_nglobpris];
	dp->disp_npri = v.v_nglobpris;
}
void
disp_cpu_fini(cpu_t *cp)
{
	ASSERT(MUTEX_HELD(&cpu_lock));

	disp_kp_free(cp->cpu_disp);
	if (cp->cpu_disp != &cpu0_disp)
		kmem_free(cp->cpu_disp, sizeof (disp_t));
}

/*
 * Allocate new, larger kpreempt dispatch queue to replace the old one.
 */
void
disp_kp_alloc(disp_t *dq, pri_t npri)
{
	struct disp_queue_info	mem_info;

	if (npri > dq->disp_npri) {
		/*
		 * Allocate memory for the new array.
		 */
		disp_dq_alloc(&mem_info, npri, dq);

		/*
		 * We need to copy the old structures to the new
		 * and free the old ones.
		 */
		disp_dq_assign(&mem_info, npri);
		disp_dq_free(&mem_info);
	}
}
/*
 * Free dispatch queue.
 * Used for the kpreempt queues for a removed CPU partition and
 * for the per-CPU queues of deleted CPUs.
 */
void
disp_kp_free(disp_t *dq)
{
	struct disp_queue_info	mem_info;

	mem_info.olddispq = dq->disp_q;
	mem_info.olddqactmap = dq->disp_qactmap;
	mem_info.oldnglobpris = dq->disp_npri;
	disp_dq_free(&mem_info);
}
/*
 * End dispatcher and scheduler initialization.
 */

/*
 * See if there's anything to do other than remain idle.
 * Return non-zero if there is.
 *
 * This function must be called with high spl, or with
 * kernel preemption disabled to prevent the partition's
 * active cpu list from changing while being traversed.
 *
 * This is essentially a simpler version of disp_getwork()
 * to be called by CPUs preparing to "halt".
 */
int
disp_anywork(void)
{
	cpu_t		*cp = CPU;
	cpu_t		*ocp;
	volatile int	*local_nrunnable = &cp->cpu_disp->disp_nrunnable;

	if (!(cp->cpu_flags & CPU_OFFLINE)) {
		if (CP_MAXRUNPRI(cp->cpu_part) >= 0)
			return (1);

		for (ocp = cp->cpu_next_part; ocp != cp;
		    ocp = ocp->cpu_next_part) {
			ASSERT(CPU_ACTIVE(ocp));

			/*
			 * Something has appeared on the local run queue.
			 */
			if (*local_nrunnable > 0)
				return (1);
			/*
			 * If we encounter another idle CPU that will
			 * soon be trolling around through disp_anywork()
			 * terminate our walk here and let this other CPU
			 * patrol the next part of the list.
			 */
			if (ocp->cpu_dispatch_pri == -1 &&
			    (ocp->cpu_disp_flags & CPU_DISP_HALTED) == 0)
				return (0);
			/*
			 * Work can be taken from another CPU if:
			 *	- There is unbound work on the run queue
			 *	- That work isn't a thread undergoing a
			 *	  context switch on an otherwise empty queue.
			 *	- The CPU isn't running the idle loop.
			 */
			if (ocp->cpu_disp->disp_max_unbound_pri != -1 &&
			    !((ocp->cpu_disp_flags & CPU_DISP_DONTSTEAL) &&
			    ocp->cpu_disp->disp_nrunnable == 1) &&
			    ocp->cpu_dispatch_pri != -1)
				return (1);
		}
	}
	return (0);
}
/*
 * Called when CPU enters the idle loop
 */
static void
idle_enter(void)
{
	cpu_t		*cp = CPU;

	new_cpu_mstate(CMS_IDLE, gethrtime_unscaled());
	CPU_STATS_ADDQ(cp, sys, idlethread, 1);
	set_idle_cpu(cp->cpu_id);	/* arch-dependent hook */
}

/*
 * Called when CPU exits the idle loop
 */
static void
idle_exit(void)
{
	cpu_t		*cp = CPU;

	new_cpu_mstate(CMS_SYSTEM, gethrtime_unscaled());
	unset_idle_cpu(cp->cpu_id);	/* arch-dependent hook */
}
/*
 * Idle loop.
 */
void
idle()
{
	struct cpu	*cp = CPU;		/* pointer to this CPU */
	kthread_t	*t;			/* taken thread */

	idle_enter();

	/*
	 * Uniprocessor version of idle loop.
	 * Do this until notified that we're on an actual multiprocessor.
	 */
	while (ncpus == 1) {
		if (cp->cpu_disp->disp_nrunnable == 0) {
			(*idle_cpu)();
			continue;
		}
		idle_exit();
		swtch();

		idle_enter(); /* returned from swtch */
	}

	/*
	 * Multiprocessor idle loop.
	 */
	for (;;) {
		/*
		 * If CPU is completely quiesced by p_online(2), just wait
		 * here with minimal bus traffic until put online.
		 */
		while (cp->cpu_flags & CPU_QUIESCED)
			(*idle_cpu)();

		if (cp->cpu_disp->disp_nrunnable != 0) {
			idle_exit();
			swtch();
		} else {
			if (cp->cpu_flags & CPU_OFFLINE)
				continue;
			if ((t = disp_getwork(cp)) == NULL) {
				if (cp->cpu_chosen_level != -1) {
					disp_t *dp = cp->cpu_disp;
					disp_t *kpq;

					disp_lock_enter(&dp->disp_lock);
					/*
					 * Set kpq under lock to prevent
					 * migration between partitions.
					 */
					kpq = &cp->cpu_part->cp_kp_queue;
					if (kpq->disp_maxrunpri == -1)
						cp->cpu_chosen_level = -1;
					disp_lock_exit(&dp->disp_lock);
				}
				(*idle_cpu)();
				continue;
			}
			/*
			 * If there was a thread but we couldn't steal
			 * it, then keep trying.
			 */
			if (t == T_DONTSTEAL)
				continue;
			idle_exit();
			swtch_to(t);
		}
		idle_enter(); /* returned from swtch/swtch_to */
	}
}
/*
 * Preempt the currently running thread in favor of the highest
 * priority thread.  The class of the current thread controls
 * where it goes on the dispatcher queues.  If panicking, turn
 * preemption off.
 */
void
preempt()
{
	kthread_t	*t = curthread;
	klwp_t		*lwp = ttolwp(curthread);

	if (panicstr)
		return;

	TRACE_0(TR_FAC_DISP, TR_PREEMPT_START, "preempt_start");

	thread_lock(t);

	if (t->t_state != TS_ONPROC || t->t_disp_queue != CPU->cpu_disp) {
		/*
		 * this thread has already been chosen to be run on
		 * another CPU. Clear kprunrun on this CPU since we're
		 * already headed for swtch().
		 */
		CPU->cpu_kprunrun = 0;
		thread_unlock_nopreempt(t);
		TRACE_0(TR_FAC_DISP, TR_PREEMPT_END, "preempt_end");
	} else {
		if (lwp != NULL)
			lwp->lwp_ru.nivcsw++;
		CPU_STATS_ADDQ(CPU, sys, inv_swtch, 1);
		THREAD_TRANSITION(t);
		CL_PREEMPT(t);
		DTRACE_SCHED(preempt);
		thread_unlock_nopreempt(t);

		TRACE_0(TR_FAC_DISP, TR_PREEMPT_END, "preempt_end");

		swtch();	/* clears CPU->cpu_runrun via disp() */
	}
}
extern kthread_t *thread_unpin();

/*
 * disp() - find the highest priority thread for this processor to run, and
 * set it in TS_ONPROC state so that resume() can be called to run it.
 */
static kthread_t *
disp()
{
	cpu_t		*cpup;
	disp_t		*dp;
	kthread_t	*tp;
	dispq_t		*dq;
	int		maxrunword;
	pri_t		pri;
	disp_t		*kpq;

	TRACE_0(TR_FAC_DISP, TR_DISP_START, "disp_start");

	cpup = CPU;
	/*
	 * Find the highest priority loaded, runnable thread.
	 */
	dp = cpup->cpu_disp;

reschedule:
	/*
	 * If there is more important work on the global queue with a better
	 * priority than the maximum on this CPU, take it now.
	 */
	kpq = &cpup->cpu_part->cp_kp_queue;
	while ((pri = kpq->disp_maxrunpri) >= 0 &&
	    pri >= dp->disp_maxrunpri &&
	    (cpup->cpu_flags & CPU_OFFLINE) == 0 &&
	    (tp = disp_getbest(kpq)) != NULL) {
		if (disp_ratify(tp, kpq) != NULL) {
			TRACE_1(TR_FAC_DISP, TR_DISP_END,
			    "disp_end:tid %p", tp);
			return (tp);
		}
	}

	disp_lock_enter(&dp->disp_lock);
	pri = dp->disp_maxrunpri;

	/*
	 * If there is nothing to run, look at what's runnable on other queues.
	 * Choose the idle thread if the CPU is quiesced.
	 * Note that CPUs that have the CPU_OFFLINE flag set can still run
	 * interrupt threads, which will be the only threads on the CPU's own
	 * queue, but cannot run threads from other queues.
	 */
	if (pri == -1) {
		if (!(cpup->cpu_flags & CPU_OFFLINE)) {
			disp_lock_exit(&dp->disp_lock);
			if ((tp = disp_getwork(cpup)) == NULL ||
			    tp == T_DONTSTEAL) {
				tp = cpup->cpu_idle_thread;
				(void) splhigh();
				THREAD_ONPROC(tp, cpup);
				cpup->cpu_dispthread = tp;
				cpup->cpu_dispatch_pri = -1;
				cpup->cpu_runrun = cpup->cpu_kprunrun = 0;
				cpup->cpu_chosen_level = -1;
			}
		} else {
			disp_lock_exit_high(&dp->disp_lock);
			tp = cpup->cpu_idle_thread;
			THREAD_ONPROC(tp, cpup);
			cpup->cpu_dispthread = tp;
			cpup->cpu_dispatch_pri = -1;
			cpup->cpu_runrun = cpup->cpu_kprunrun = 0;
			cpup->cpu_chosen_level = -1;
		}
		TRACE_1(TR_FAC_DISP, TR_DISP_END,
		    "disp_end:tid %p", tp);
		return (tp);
	}

	dq = &dp->disp_q[pri];
	tp = dq->dq_first;

	ASSERT(tp != NULL);

	DTRACE_SCHED2(dequeue, kthread_t *, tp, disp_t *, dp);

	/*
	 * Found it so remove it from queue.
	 */
	dp->disp_nrunnable--;
	dq->dq_sruncnt--;
	if ((dq->dq_first = tp->t_link) == NULL) {
		ulong_t	*dqactmap = dp->disp_qactmap;

		ASSERT(dq->dq_sruncnt == 0);
		dq->dq_last = NULL;

		/*
		 * The queue is empty, so the corresponding bit needs to be
		 * turned off in dqactmap.   If nrunnable != 0 just took the
		 * last runnable thread off the
		 * highest queue, so recompute disp_maxrunpri.
		 */
		maxrunword = pri >> BT_ULSHIFT;
		dqactmap[maxrunword] &= ~BT_BIW(pri);

		if (dp->disp_nrunnable == 0) {
			dp->disp_max_unbound_pri = -1;
			dp->disp_maxrunpri = -1;
		} else {
			int ipri;

			ipri = bt_gethighbit(dqactmap, maxrunword);
			dp->disp_maxrunpri = ipri;
			if (ipri < dp->disp_max_unbound_pri)
				dp->disp_max_unbound_pri = ipri;
		}
	} else {
		tp->t_link = NULL;
	}

	cpup->cpu_dispthread = tp;		/* protected by spl only */
	cpup->cpu_dispatch_pri = pri;
	ASSERT(pri == DISP_PRIO(tp));
	thread_onproc(tp, cpup);		/* set t_state to TS_ONPROC */
	disp_lock_exit_high(&dp->disp_lock);	/* drop run queue lock */

	TRACE_1(TR_FAC_DISP, TR_DISP_END,
	    "disp_end:tid %p", tp);

	if (disp_ratify(tp, kpq) == NULL)
		goto reschedule;

	return (tp);
}
/*
 * swtch()
 *	Find best runnable thread and run it.
 *	Called with the current thread already switched to a new state,
 *	on a sleep queue, run queue, stopped, and not zombied.
 *	May be called at any spl level less than or equal to LOCK_LEVEL.
 *	Always drops spl to the base level (spl0()).
 */
void
swtch()
{
	kthread_t	*t = curthread;
	kthread_t	*next;
	cpu_t		*cp;

	TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");

	if (t->t_flag & T_INTR_THREAD)
		cpu_intr_swtch_enter(t);

	if (t->t_intr != NULL) {
		/*
		 * We are an interrupt thread.  Setup and return
		 * the interrupted thread to be resumed.
		 */
		(void) splhigh();	/* block other scheduler action */
		cp = CPU;		/* now protected against migration */
		ASSERT(CPU_ON_INTR(cp) == 0);	/* not called with PIL > 10 */
		CPU_STATS_ADDQ(cp, sys, pswitch, 1);
		CPU_STATS_ADDQ(cp, sys, intrblk, 1);
		next = thread_unpin();
		TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
		resume_from_intr(next);
	} else {
#ifdef	DEBUG
		if (t->t_state == TS_ONPROC &&
		    t->t_disp_queue->disp_cpu == CPU &&
		    t->t_preempt == 0) {
			thread_lock(t);
			ASSERT(t->t_state != TS_ONPROC ||
			    t->t_disp_queue->disp_cpu != CPU ||
			    t->t_preempt != 0);	/* cannot migrate */
			thread_unlock_nopreempt(t);
		}
#endif	/* DEBUG */
		cp = CPU;
		next = disp();		/* returns with spl high */
		ASSERT(CPU_ON_INTR(cp) == 0);	/* not called with PIL > 10 */

		/* OK to steal anything left on run queue */
		cp->cpu_disp_flags &= ~CPU_DISP_DONTSTEAL;

		if (next != t) {
			hrtime_t now;

			now = gethrtime_unscaled();
			pg_ev_thread_swtch(cp, now, t, next);

			/*
			 * If t was previously in the TS_ONPROC state,
			 * setfrontdq and setbackdq won't have set its t_waitrq.
			 * Since we now finally know that we're switching away
			 * from this thread, set its t_waitrq if it is on a run
			 * queue.
			 */
			if ((t->t_state == TS_RUN) && (t->t_waitrq == 0)) {
				t->t_waitrq = now;
			}

			/*
			 * restore mstate of thread that we are switching to
			 */
			restore_mstate(next);

			CPU_STATS_ADDQ(cp, sys, pswitch, 1);
			cp->cpu_last_swtch = t->t_disp_time = ddi_get_lbolt();
			TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");

			if (dtrace_vtime_active)
				dtrace_vtime_switch(next);

			resume(next);
			/*
			 * The TR_RESUME_END and TR_SWTCH_END trace points
			 * appear at the end of resume(), because we may not
			 * return here
			 */
		} else {
			if (t->t_flag & T_INTR_THREAD)
				cpu_intr_swtch_exit(t);
			/*
			 * Threads that enqueue themselves on a run queue defer
			 * setting t_waitrq. It is then either set in swtch()
			 * when the CPU is actually yielded, or not at all if it
			 * is remaining on the CPU.
			 * There is however a window between where the thread
			 * placed itself on a run queue, and where it selects
			 * itself in disp(), where a third party (eg. clock()
			 * doing tick processing) may have re-enqueued this
			 * thread, setting t_waitrq in the process. We detect
			 * this race by noticing that despite switching to
			 * ourself, our t_waitrq has been set, and should be
			 * cleared.
			 */
			if (t->t_waitrq != 0)
				t->t_waitrq = 0;

			pg_ev_thread_remain(cp, t);

			DTRACE_SCHED(remain__cpu);
			TRACE_0(TR_FAC_DISP, TR_SWTCH_END, "swtch_end");
			(void) spl0();
		}
	}
}
/*
 * swtch_from_zombie()
 *	Special case of swtch(), which allows checks for TS_ZOMB to be
 *	eliminated from normal resume.
 *	Find best runnable thread and run it.
 *	Called with the current thread zombied.
 *	Zombies cannot migrate, so CPU references are safe.
 */
void
swtch_from_zombie()
{
	kthread_t	*next;
	cpu_t		*cpu = CPU;

	TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");

	ASSERT(curthread->t_state == TS_ZOMB);

	next = disp();			/* returns with spl high */
	ASSERT(CPU_ON_INTR(CPU) == 0);	/* not called with PIL > 10 */
	CPU_STATS_ADDQ(CPU, sys, pswitch, 1);
	ASSERT(next != curthread);
	TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");

	pg_ev_thread_swtch(cpu, gethrtime_unscaled(), curthread, next);

	restore_mstate(next);

	if (dtrace_vtime_active)
		dtrace_vtime_switch(next);

	resume_from_zombie(next);
	/*
	 * The TR_RESUME_END and TR_SWTCH_END trace points
	 * appear at the end of resume(), because we certainly will not
	 * return here
	 */
}
#if defined(DEBUG) && (defined(DISP_DEBUG) || defined(lint))

/*
 * search_disp_queues()
 *	Search the given dispatch queues for thread tp.
 *	Return 1 if tp is found, otherwise return 0.
 */
static int
search_disp_queues(disp_t *dp, kthread_t *tp)
{
	dispq_t		*dq;
	dispq_t		*eq;

	disp_lock_enter_high(&dp->disp_lock);

	for (dq = dp->disp_q, eq = dp->disp_q_limit; dq < eq; ++dq) {
		kthread_t	*rp;

		ASSERT(dq->dq_last == NULL || dq->dq_last->t_link == NULL);

		for (rp = dq->dq_first; rp; rp = rp->t_link)
			if (tp == rp) {
				disp_lock_exit_high(&dp->disp_lock);
				return (1);
			}
	}
	disp_lock_exit_high(&dp->disp_lock);

	return (0);
}

/*
 * thread_on_queue()
 *	Search all per-CPU dispatch queues and all partition-wide kpreempt
 *	queues for thread tp. Return 1 if tp is found, otherwise return 0.
 */
static int
thread_on_queue(kthread_t *tp)
{
	cpu_t		*cp;
	struct cpupart	*part;

	ASSERT(getpil() >= DISP_LEVEL);

	/*
	 * Search the per-CPU dispatch queues for tp.
	 */
	cp = CPU;
	do {
		if (search_disp_queues(cp->cpu_disp, tp))
			return (1);
	} while ((cp = cp->cpu_next_onln) != CPU);

	/*
	 * Search the partition-wide kpreempt queues for tp.
	 */
	part = CPU->cpu_part;
	do {
		if (search_disp_queues(&part->cp_kp_queue, tp))
			return (1);
	} while ((part = part->cp_next) != CPU->cpu_part);

	return (0);
}

#else

#define	thread_on_queue(tp)	0	/* ASSERT must be !thread_on_queue */

#endif  /* DEBUG */
/*
 * like swtch(), but switch to a specified thread taken from another CPU.
 *	called with spl high..
 */
void
swtch_to(kthread_t *next)
{
	cpu_t		*cp = CPU;
	hrtime_t	now;

	TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");

	/*
	 * Update context switch statistics.
	 */
	CPU_STATS_ADDQ(cp, sys, pswitch, 1);

	TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");

	now = gethrtime_unscaled();
	pg_ev_thread_swtch(cp, now, curthread, next);

	/* OK to steal anything left on run queue */
	cp->cpu_disp_flags &= ~CPU_DISP_DONTSTEAL;

	/* record last execution time */
	cp->cpu_last_swtch = curthread->t_disp_time = ddi_get_lbolt();

	/*
	 * If t was previously in the TS_ONPROC state, setfrontdq and setbackdq
	 * won't have set its t_waitrq. Since we now finally know that we're
	 * switching away from this thread, set its t_waitrq if it is on a run
	 * queue.
	 */
	if ((curthread->t_state == TS_RUN) && (curthread->t_waitrq == 0)) {
		curthread->t_waitrq = now;
	}

	/* restore next thread to previously running microstate */
	restore_mstate(next);

	if (dtrace_vtime_active)
		dtrace_vtime_switch(next);

	resume(next);
	/*
	 * The TR_RESUME_END and TR_SWTCH_END trace points
	 * appear at the end of resume(), because we may not
	 * return here
	 */
}

#define	CPU_IDLING(pri)	((pri) == -1)
static void
cpu_resched(cpu_t *cp, pri_t tpri)
{
	int	call_poke_cpu = 0;
	pri_t   cpupri = cp->cpu_dispatch_pri;

	if (!CPU_IDLING(cpupri) && (cpupri < tpri)) {
		TRACE_2(TR_FAC_DISP, TR_CPU_RESCHED,
		    "CPU_RESCHED:Tpri %d Cpupri %d", tpri, cpupri);
		if (tpri >= upreemptpri && cp->cpu_runrun == 0) {
			cp->cpu_runrun = 1;
			aston(cp->cpu_dispthread);
			if (tpri < kpreemptpri && cp != CPU)
				call_poke_cpu = 1;
		}
		if (tpri >= kpreemptpri && cp->cpu_kprunrun == 0) {
			cp->cpu_kprunrun = 1;
			if (cp != CPU)
				call_poke_cpu = 1;
		}
	}

	/*
	 * Propagate cpu_runrun, and cpu_kprunrun to global visibility.
	 */
	membar_enter();

	if (call_poke_cpu)
		poke_cpu(cp->cpu_id);
}
/*
 * setbackdq() keeps runqs balanced such that the difference in length
 * between the chosen runq and the next one is no more than RUNQ_MAX_DIFF.
 * For threads with priorities below RUNQ_MATCH_PRI levels, the runq's lengths
 * must match.  When per-thread TS_RUNQMATCH flag is set, setbackdq() will
 * try to keep runqs perfectly balanced regardless of the thread priority.
 */
#define	RUNQ_MATCH_PRI	16	/* pri below which queue lengths must match */
#define	RUNQ_MAX_DIFF	2	/* maximum runq length difference */
#define	RUNQ_LEN(cp, pri)	((cp)->cpu_disp->disp_q[pri].dq_sruncnt)
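/*
 * Worked example of the balancing rule: for a priority-60 thread (at or
 * above RUNQ_MATCH_PRI and without TS_RUNQMATCH), the neighbouring CPU is
 * preferred only when its queue at this priority is shorter by more than
 * RUNQ_MAX_DIFF entries; for a priority-10 thread the neighbour is chosen
 * whenever its queue is strictly shorter.
 */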
/*
 * Macro that evaluates to true if it is likely that the thread has cache
 * warmth. This is based on the amount of time that has elapsed since the
 * thread last ran. If that amount of time is less than "rechoose_interval"
 * ticks, then we decide that the thread has enough cache warmth to warrant
 * some affinity for t->t_cpu.
 */
#define	THREAD_HAS_CACHE_WARMTH(thread)	\
	((thread == curthread) ||	\
	((ddi_get_lbolt() - thread->t_disp_time) <= rechoose_interval))
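/*
 * With the default rechoose_interval of 3, a thread that last ran within
 * the past 3 clock ticks (or the current thread itself) is treated as
 * cache-warm and keeps some affinity for the CPU it last ran on.
 */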
/*
 * Put the specified thread on the front/back of the dispatcher queue
 * corresponding to its current priority.
 *
 * Called with the thread in transition, onproc or stopped state and locked
 * (transition implies locked) and at high spl.  Returns with the thread in
 * TS_RUN state and still locked.
 */
static void
setfrontbackdq(kthread_t *tp, bool front)
{
	dispq_t		*dq;
	disp_t		*dp;
	cpu_t		*cp;
	pri_t		tpri;
	bool		bound;
	bool		self;

	ASSERT(THREAD_LOCK_HELD(tp));
	ASSERT((tp->t_schedflag & TS_ALLSTART) == 0);
	ASSERT(!thread_on_queue(tp));	/* make sure tp isn't on a runq */

	self = (tp == curthread);
	bound = (tp->t_bound_cpu || tp->t_weakbound_cpu);

	tpri = DISP_PRIO(tp);
	if (ncpus == 1)
		cp = tp->t_cpu;
	else if (!bound) {
		if (tpri >= kpqpri) {
			setkpdq(tp, front ? SETKP_FRONT : SETKP_BACK);
			return;
		}

		cp = tp->t_cpu;

		if (front) {
			/*
			 * We'll generally let this thread continue to run where
			 * it last ran...but will consider migration if:
			 * - The thread probably doesn't have much cache warmth.
			 * - The CPU where it last ran is the target of an
			 *   offline request.
			 * - The thread last ran outside its home lgroup.
			 */
			if ((!THREAD_HAS_CACHE_WARMTH(tp)) ||
			    (cp == cpu_inmotion)) {
				cp = disp_lowpri_cpu(cp, tp->t_lpl, tpri, NULL);
			} else if (!LGRP_CONTAINS_CPU(tp->t_lpl->lpl_lgrp, cp)) {
				cp = disp_lowpri_cpu(cp, tp->t_lpl, tpri,
				    self ? cp : NULL);
			}
		} else {
			if (tp->t_cpupart == cp->cpu_part) {
				int	qlen;

				/*
				 * We'll generally let this thread continue to run
				 * where it last ran, but will consider migration if:
				 * - The thread last ran outside its home lgroup.
				 * - The CPU where it last ran is the target of an
				 *   offline request (a thread_nomigrate() on the in
				 *   motion CPU relies on this when forcing a preempt).
				 * - The thread isn't the highest priority thread where
				 *   it last ran, and it is considered not likely to
				 *   have significant cache warmth.
				 */
				if ((!LGRP_CONTAINS_CPU(tp->t_lpl->lpl_lgrp, cp)) ||
				    (cp == cpu_inmotion)) {
					cp = disp_lowpri_cpu(cp, tp->t_lpl, tpri,
					    self ? cp : NULL);
				} else if ((tpri < cp->cpu_disp->disp_maxrunpri) &&
				    (!THREAD_HAS_CACHE_WARMTH(tp))) {
					cp = disp_lowpri_cpu(cp, tp->t_lpl, tpri,
					    NULL);
				}

				/*
				 * Perform any CMT load balancing
				 */
				cp = cmt_balance(tp, cp);

				/*
				 * Balance across the run queues
				 */
				qlen = RUNQ_LEN(cp, tpri);
				if (tpri >= RUNQ_MATCH_PRI &&
				    !(tp->t_schedflag & TS_RUNQMATCH))
					qlen -= RUNQ_MAX_DIFF;
				if (qlen > 0) {
					cpu_t	*newcp;

					if (tp->t_lpl->lpl_lgrpid == LGRP_ROOTID) {
						newcp = cp->cpu_next_part;
					} else if ((newcp = cp->cpu_next_lpl) == cp) {
						newcp = cp->cpu_next_part;
					}

					if (RUNQ_LEN(newcp, tpri) < qlen) {
						DTRACE_PROBE3(runq__balance,
						    kthread_t *, tp,
						    cpu_t *, cp, cpu_t *, newcp);
						cp = newcp;
					}
				}
			} else {
				/*
				 * Migrate to a cpu in the new partition.
				 */
				cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist,
				    tp->t_lpl, tp->t_pri, NULL);
			}
		}

		ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
	} else {
		/*
		 * It is possible that t_weakbound_cpu != t_bound_cpu (for
		 * a short time until weak binding that existed when the
		 * strong binding was established has dropped) so we must
		 * favour weak binding over strong.
		 */
		cp = tp->t_weakbound_cpu ?
		    tp->t_weakbound_cpu : tp->t_bound_cpu;
	}

	/*
	 * A thread that is ONPROC may be temporarily placed on the run queue
	 * but then chosen to run again by disp.  If the thread we're placing on
	 * the queue is in TS_ONPROC state, don't set its t_waitrq until a
	 * replacement process is actually scheduled in swtch().  In this
	 * situation, curthread is the only thread that could be in the ONPROC
	 * state.
	 */
	if ((!self) && (tp->t_waitrq == 0)) {
		hrtime_t curtime;

		curtime = gethrtime_unscaled();
		(void) cpu_update_pct(tp, curtime);
		tp->t_waitrq = curtime;
	} else {
		(void) cpu_update_pct(tp, gethrtime_unscaled());
	}

	dp = cp->cpu_disp;
	disp_lock_enter_high(&dp->disp_lock);

	DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, front);

#ifndef NPROBE
	/* Kernel probe */
	if (tnf_tracing_active)
		tnf_thread_queue(tp, cp, tpri);
#endif /* NPROBE */

	ASSERT(tpri >= 0 && tpri < dp->disp_npri);

	THREAD_RUN(tp, &dp->disp_lock);		/* set t_state to TS_RUN */
	tp->t_disp_queue = dp;
	tp->t_link = NULL;

	dq = &dp->disp_q[tpri];
	dp->disp_nrunnable++;
	if (!bound)
		dp->disp_steal = 0;
	membar_enter();

	if (dq->dq_sruncnt++ != 0) {
		if (front) {
			ASSERT(dq->dq_last != NULL);
			tp->t_link = dq->dq_first;
			dq->dq_first = tp;
		} else {
			ASSERT(dq->dq_first != NULL);
			dq->dq_last->t_link = tp;
			dq->dq_last = tp;
		}
	} else {
		ASSERT(dq->dq_first == NULL);
		ASSERT(dq->dq_last == NULL);
		dq->dq_first = dq->dq_last = tp;
		BT_SET(dp->disp_qactmap, tpri);
		if (tpri > dp->disp_maxrunpri) {
			dp->disp_maxrunpri = tpri;
			membar_enter();
			cpu_resched(cp, tpri);
		}
	}

	if (!bound && tpri > dp->disp_max_unbound_pri) {
		if (self && dp->disp_max_unbound_pri == -1 && cp == CPU) {
			/*
			 * If there are no other unbound threads on the
			 * run queue, don't allow other CPUs to steal
			 * this thread while we are in the middle of a
			 * context switch. We may just switch to it
			 * again right away. CPU_DISP_DONTSTEAL is cleared
			 * in swtch and swtch_to.
			 */
			cp->cpu_disp_flags |= CPU_DISP_DONTSTEAL;
		}
		dp->disp_max_unbound_pri = tpri;
	}
	(*disp_enq_thread)(cp, bound);
}
/*
 * Put the specified thread on the back of the dispatcher
 * queue corresponding to its current priority.
 *
 * Called with the thread in transition, onproc or stopped state
 * and locked (transition implies locked) and at high spl.
 * Returns with the thread in TS_RUN state and still locked.
 */
void
setbackdq(kthread_t *tp)
{
	setfrontbackdq(tp, false);
}

/*
 * Put the specified thread on the front of the dispatcher
 * queue corresponding to its current priority.
 *
 * Called with the thread in transition, onproc or stopped state
 * and locked (transition implies locked) and at high spl.
 * Returns with the thread in TS_RUN state and still locked.
 */
void
setfrontdq(kthread_t *tp)
{
	setfrontbackdq(tp, true);
}
/*
 * Put a high-priority unbound thread on the kp queue
 */
static void
setkpdq(kthread_t *tp, int borf)
{
	dispq_t	*dq;
	disp_t	*dp;
	cpu_t	*cp;
	pri_t	tpri;

	tpri = DISP_PRIO(tp);

	dp = &tp->t_cpupart->cp_kp_queue;
	disp_lock_enter_high(&dp->disp_lock);

	TRACE_2(TR_FAC_DISP, TR_FRONTQ, "frontq:pri %d tid %p", tpri, tp);

	ASSERT(tpri >= 0 && tpri < dp->disp_npri);
	DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, borf);
	THREAD_RUN(tp, &dp->disp_lock);		/* set t_state to TS_RUN */
	tp->t_disp_queue = dp;
	dp->disp_nrunnable++;
	dq = &dp->disp_q[tpri];

	if (dq->dq_sruncnt++ != 0) {
		if (borf == SETKP_BACK) {
			ASSERT(dq->dq_first != NULL);
			tp->t_link = NULL;
			dq->dq_last->t_link = tp;
			dq->dq_last = tp;
		} else {
			ASSERT(dq->dq_last != NULL);
			tp->t_link = dq->dq_first;
			dq->dq_first = tp;
		}
	} else {
		if (borf == SETKP_BACK) {
			ASSERT(dq->dq_first == NULL);
			ASSERT(dq->dq_last == NULL);
			dq->dq_first = dq->dq_last = tp;
		} else {
			ASSERT(dq->dq_last == NULL);
			ASSERT(dq->dq_first == NULL);
			tp->t_link = NULL;
			dq->dq_first = dq->dq_last = tp;
		}
		BT_SET(dp->disp_qactmap, tpri);
		if (tpri > dp->disp_max_unbound_pri)
			dp->disp_max_unbound_pri = tpri;
		if (tpri > dp->disp_maxrunpri) {
			dp->disp_maxrunpri = tpri;
			membar_enter();
		}
	}

	cp = tp->t_cpu;
	if (tp->t_cpupart != cp->cpu_part) {
		/* migrate to a cpu in the new partition */
		cp = tp->t_cpupart->cp_cpulist;
	}
	cp = disp_lowpri_cpu(cp, tp->t_lpl, tp->t_pri, NULL);
	disp_lock_enter_high(&cp->cpu_disp->disp_lock);
	ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);

#ifndef NPROBE
	/* Kernel probe */
	if (tnf_tracing_active)
		tnf_thread_queue(tp, cp, tpri);
#endif /* NPROBE */

	if (cp->cpu_chosen_level < tpri)
		cp->cpu_chosen_level = tpri;
	cpu_resched(cp, tpri);
	disp_lock_exit_high(&cp->cpu_disp->disp_lock);
	(*disp_enq_thread)(cp, 0);
}
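/*
 * Note on cpu_chosen_level: setkpdq() raises it on the CPU it just directed
 * this kp-queue work at, so that disp_lowpri_cpu() treats that CPU as busy
 * at this priority and does not aim more work at it.  It is cleared again
 * in disp_ratify() and in the idle loop once the kp queue has drained.
 */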
/*
 * Remove a thread from the dispatcher queue if it is on it.
 * It is not an error if it is not found but we return whether
 * or not it was found in case the caller wants to check.
 */
int
dispdeq(kthread_t *tp)
{
	disp_t		*dp;
	dispq_t		*dq;
	kthread_t	*rp;
	kthread_t	*trp;
	kthread_t	**ptp;
	int		tpri;

	ASSERT(THREAD_LOCK_HELD(tp));

	if (tp->t_state != TS_RUN)
		return (0);

	tpri = DISP_PRIO(tp);
	dp = tp->t_disp_queue;
	ASSERT(tpri < dp->disp_npri);
	dq = &dp->disp_q[tpri];
	ptp = &dq->dq_first;
	rp = *ptp;
	trp = NULL;

	ASSERT(dq->dq_last == NULL || dq->dq_last->t_link == NULL);

	/*
	 * Search for thread in queue.
	 * Double links would simplify this at the expense of disp/setrun.
	 */
	while (rp != tp && rp != NULL) {
		trp = rp;
		ptp = &trp->t_link;
		rp = trp->t_link;
	}

	if (rp == NULL) {
		panic("dispdeq: thread not on queue");
	}

	DTRACE_SCHED2(dequeue, kthread_t *, tp, disp_t *, dp);

	/*
	 * Found it so remove it from queue.
	 */
	if ((*ptp = rp->t_link) == NULL)
		dq->dq_last = trp;

	dp->disp_nrunnable--;
	if (--dq->dq_sruncnt == 0) {
		dp->disp_qactmap[tpri >> BT_ULSHIFT] &= ~BT_BIW(tpri);
		if (dp->disp_nrunnable == 0) {
			dp->disp_max_unbound_pri = -1;
			dp->disp_maxrunpri = -1;
		} else if (tpri == dp->disp_maxrunpri) {
			int ipri;

			ipri = bt_gethighbit(dp->disp_qactmap,
			    dp->disp_maxrunpri >> BT_ULSHIFT);
			if (ipri < dp->disp_max_unbound_pri)
				dp->disp_max_unbound_pri = ipri;
			dp->disp_maxrunpri = ipri;
		}
	}
	tp->t_link = NULL;
	THREAD_TRANSITION(tp);		/* put in intermediate state */
	return (1);
}
/*
 * Make a thread give up its processor.  Find the processor on
 * which this thread is executing, and have that processor
 * preempt.
 *
 * We allow System Duty Cycle (SDC) threads to be preempted even if
 * they are running at kernel priorities.  To implement this, we always
 * set cpu_kprunrun; this ensures preempt() will be called.  Since SDC
 * calls cpu_surrender() very often, we only preempt if there is anyone
 * competing with us.
 */
void
cpu_surrender(kthread_t *tp)
{
	cpu_t	*cpup;
	int	max_pri;
	int	max_run_pri;
	klwp_t	*lwp;

	ASSERT(THREAD_LOCK_HELD(tp));

	if (tp->t_state != TS_ONPROC)
		return;
	cpup = tp->t_disp_queue->disp_cpu;	/* CPU thread dispatched to */
	max_pri = cpup->cpu_disp->disp_maxrunpri; /* best pri of that CPU */
	max_run_pri = CP_MAXRUNPRI(cpup->cpu_part);
	if (max_pri < max_run_pri)
		max_pri = max_run_pri;

	if (tp->t_cid == sysdccid) {
		uint_t t_pri = DISP_PRIO(tp);
		if (t_pri > max_pri)
			return;		/* we are not competing w/ anyone */
		cpup->cpu_runrun = cpup->cpu_kprunrun = 1;
	} else {
		cpup->cpu_runrun = 1;
		if (max_pri >= kpreemptpri && cpup->cpu_kprunrun == 0) {
			cpup->cpu_kprunrun = 1;
		}
	}

	/*
	 * Propagate cpu_runrun, and cpu_kprunrun to global visibility.
	 */
	membar_enter();

	DTRACE_SCHED1(surrender, kthread_t *, tp);

	/*
	 * Make the target thread take an excursion through trap()
	 * to do preempt() (unless we're already in trap or post_syscall,
	 * calling cpu_surrender via CL_TRAPRET).
	 */
	if (tp != curthread || (lwp = tp->t_lwp) == NULL ||
	    lwp->lwp_state != LWP_USER) {
		aston(tp);
		if (cpup != CPU)
			poke_cpu(cpup->cpu_id);
	}
	TRACE_2(TR_FAC_DISP, TR_CPU_SURRENDER,
	    "cpu_surrender:tid %p cpu %p", tp, cpup);
}
/*
 * Commit to and ratify a scheduling decision
 */
/*ARGSUSED*/
static kthread_t *
disp_ratify(kthread_t *tp, disp_t *kpq)
{
	pri_t	tpri;
	pri_t	maxpri;
	pri_t	maxkpri;
	cpu_t	*cpup;

	ASSERT(tp != NULL);
	/*
	 * Commit to, then ratify scheduling decision
	 */
	cpup = CPU;
	if (cpup->cpu_runrun != 0)
		cpup->cpu_runrun = 0;
	if (cpup->cpu_kprunrun != 0)
		cpup->cpu_kprunrun = 0;
	if (cpup->cpu_chosen_level != -1)
		cpup->cpu_chosen_level = -1;
	membar_enter();
	tpri = DISP_PRIO(tp);
	maxpri = cpup->cpu_disp->disp_maxrunpri;
	maxkpri = kpq->disp_maxrunpri;
	if (maxpri < maxkpri)
		maxpri = maxkpri;
	if (tpri < maxpri) {
		/*
		 * should have done better
		 * put this one back and indicate to try again
		 */
		cpup->cpu_dispthread = curthread;	/* fixup dispthread */
		cpup->cpu_dispatch_pri = DISP_PRIO(curthread);
		thread_lock_high(tp);
		THREAD_TRANSITION(tp);
		setfrontdq(tp);
		thread_unlock_nopreempt(tp);

		tp = NULL;
	}
	return (tp);
}
/*
 * See if there is any work on the dispatcher queue for other CPUs.
 * If there is, dequeue the best thread and return.
 */
static kthread_t *
disp_getwork(cpu_t *cp)
{
	cpu_t		*ocp;		/* other CPU */
	cpu_t		*ocp_start;
	cpu_t		*tcp;		/* target local CPU */
	kthread_t	*tp;
	kthread_t	*retval = NULL;
	pri_t		maxpri;
	disp_t		*kpq;		/* kp queue for this partition */
	lpl_t		*lpl, *lpl_leaf;
	int		leafidx, startidx;
	hrtime_t	stealtime;
	lgrp_id_t	local_id;

	maxpri = -1;
	tcp = NULL;

	kpq = &cp->cpu_part->cp_kp_queue;
	while (kpq->disp_maxrunpri >= 0) {
		/*
		 * Try to take a thread from the kp_queue.
		 */
		tp = (disp_getbest(kpq));
		if (tp)
			return (disp_ratify(tp, kpq));
	}

	kpreempt_disable();		/* protect the cpu_active list */

	/*
	 * Try to find something to do on another CPU's run queue.
	 * Loop through all other CPUs looking for the one with the highest
	 * priority unbound thread.
	 *
	 * On NUMA machines, the partition's CPUs are consulted in order of
	 * distance from the current CPU. This way, the first available
	 * work found is also the closest, and will suffer the least
	 * from being migrated.
	 */
	lpl = lpl_leaf = cp->cpu_lpl;
	local_id = lpl_leaf->lpl_lgrpid;
	leafidx = startidx = 0;

	/*
	 * This loop traverses the lpl hierarchy. Higher level lpls represent
	 * broader levels of locality
	 */
	do {
		/* This loop iterates over the lpl's leaves */
		do {
			if (lpl_leaf != cp->cpu_lpl)
				ocp = lpl_leaf->lpl_cpus;
			else
				ocp = cp->cpu_next_lpl;

			ocp_start = ocp;

			/* This loop iterates over the CPUs in the leaf */
			do {
				pri_t pri;

				ASSERT(CPU_ACTIVE(ocp));

				/*
				 * End our stroll around this lpl if:
				 *
				 * - Something became runnable on the local
				 *   queue...which also ends our stroll around
				 *   the partition.
				 *
				 * - We happen across another idle CPU.
				 *   Since it is patrolling the next portion
				 *   of the lpl's list (assuming it's not
				 *   halted, or busy servicing an interrupt),
				 *   move to the next higher level of locality.
				 */
				if (cp->cpu_disp->disp_nrunnable != 0) {
					kpreempt_enable();
					return (NULL);
				}
				if (ocp->cpu_dispatch_pri == -1) {
					if (ocp->cpu_disp_flags &
					    CPU_DISP_HALTED ||
					    ocp->cpu_intr_actv != 0)
						continue;
					else
						goto next_level;
				}

				/*
				 * If there's only one thread and the CPU
				 * is in the middle of a context switch,
				 * or it's currently running the idle thread,
				 * don't steal it.
				 */
				if ((ocp->cpu_disp_flags &
				    CPU_DISP_DONTSTEAL) &&
				    ocp->cpu_disp->disp_nrunnable == 1)
					continue;

				pri = ocp->cpu_disp->disp_max_unbound_pri;
				if (pri > maxpri) {
					/*
					 * Don't steal threads that we attempted
					 * to steal recently until they're ready
					 * to be stolen again.
					 */
					stealtime = ocp->cpu_disp->disp_steal;
					if (stealtime == 0 ||
					    stealtime - gethrtime() <= 0) {
						maxpri = pri;
						tcp = ocp;
					} else {
						/*
						 * Don't update tcp, just set
						 * the retval to T_DONTSTEAL, so
						 * that if no acceptable CPUs
						 * are found the return value
						 * will be T_DONTSTEAL rather
						 * than NULL.
						 */
						retval = T_DONTSTEAL;
					}
				}
			} while ((ocp = ocp->cpu_next_lpl) != ocp_start);

			/*
			 * Iterate to the next leaf lpl in the resource set
			 * at this level of locality. If we hit the end of
			 * the set, wrap back around to the beginning.
			 *
			 * Note: This iteration is NULL terminated for a reason
			 * see lpl_topo_bootstrap() in lgrp.c for details.
			 */
			if ((lpl_leaf = lpl->lpl_rset[++leafidx]) == NULL) {
				leafidx = 0;
				lpl_leaf = lpl->lpl_rset[leafidx];
			}
		} while (leafidx != startidx);

next_level:
		/*
		 * Expand the search to include farther away CPUs (next
		 * locality level). The closer CPUs that have already been
		 * checked will be checked again. In doing so, idle CPUs
		 * will tend to be more aggressive about stealing from CPUs
		 * that are closer (since the closer CPUs will be considered
		 * more often).
		 * Begin at this level with the CPUs local leaf lpl.
		 */
		if ((lpl = lpl->lpl_parent) != NULL) {
			leafidx = startidx = lpl->lpl_id2rset[local_id];
			lpl_leaf = lpl->lpl_rset[leafidx];
		}
	} while (!tcp && lpl);

	kpreempt_enable();

	/*
	 * If another queue looks good, and there is still nothing on
	 * the local queue, try to transfer one or more threads
	 * from it to our queue.
	 */
	if (tcp && cp->cpu_disp->disp_nrunnable == 0) {
		tp = disp_getbest(tcp->cpu_disp);
		if (tp == NULL || tp == T_DONTSTEAL)
			return (tp);
		return (disp_ratify(tp, kpq));
	}
	return (retval);
}
/*
 * disp_fix_unbound_pri()
 *	Determines the maximum priority of unbound threads on the queue.
 *	The priority is kept for the queue, but is only increased, never
 *	reduced unless some CPU is looking for something on that queue.
 *
 *	The priority argument is the known upper limit.
 *
 *	Perhaps this should be kept accurately, but that probably means
 *	separate bitmaps for bound and unbound threads.  Since only idled
 *	CPUs will have to do this recalculation, it seems better this way.
 */
static void
disp_fix_unbound_pri(disp_t *dp, pri_t pri)
{
	kthread_t	*tp;
	dispq_t		*dq;
	ulong_t		*dqactmap = dp->disp_qactmap;
	ulong_t		mapword;
	int		wx;

	ASSERT(DISP_LOCK_HELD(&dp->disp_lock));

	ASSERT(pri >= 0);			/* checked by caller */

	/*
	 * Start the search at the next lowest priority below the supplied
	 * priority.  This depends on the bitmap implementation.
	 */
	do {
		wx = pri >> BT_ULSHIFT;		/* index of word in map */

		/*
		 * Form mask for all lower priorities in the word.
		 */
		mapword = dqactmap[wx] & (BT_BIW(pri) - 1);

		/*
		 * Get next lower active priority.
		 */
		if (mapword != 0) {
			pri = (wx << BT_ULSHIFT) + highbit(mapword) - 1;
		} else if (wx > 0) {
			pri = bt_gethighbit(dqactmap, wx - 1); /* sign extend */
			if (pri < 0)
				break;
		} else {
			pri = -1;
			break;
		}

		/*
		 * Search the queue for unbound, runnable threads.
		 */
		dq = &dp->disp_q[pri];
		tp = dq->dq_first;

		while (tp && (tp->t_bound_cpu || tp->t_weakbound_cpu)) {
			tp = tp->t_link;
		}

		/*
		 * If a thread was found, set the priority and return.
		 */
	} while (tp == NULL);

	/*
	 * pri holds the maximum unbound thread priority or -1.
	 */
	if (dp->disp_max_unbound_pri != pri)
		dp->disp_max_unbound_pri = pri;
}
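/*
 * Bitmap walk illustration: priority p lives in word p >> BT_ULSHIFT of
 * disp_qactmap under bit mask BT_BIW(p), so masking with (BT_BIW(pri) - 1)
 * keeps only the lower-priority bits of the current word; highbit() and
 * bt_gethighbit() then yield the next lower active priority, if any.
 */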
/*
 * disp_adjust_unbound_pri() - thread is becoming unbound, so we should
 *	check if the CPU to which it was previously bound should have
 *	its disp_max_unbound_pri increased.
 */
void
disp_adjust_unbound_pri(kthread_t *tp)
{
	disp_t	*dp;
	pri_t	tpri;

	ASSERT(THREAD_LOCK_HELD(tp));

	/*
	 * Don't do anything if the thread is not bound, or
	 * currently not runnable.
	 */
	if (tp->t_bound_cpu == NULL ||
	    tp->t_state != TS_RUN)
		return;

	tpri = DISP_PRIO(tp);
	dp = tp->t_bound_cpu->cpu_disp;
	ASSERT(tpri >= 0 && tpri < dp->disp_npri);
	if (tpri > dp->disp_max_unbound_pri)
		dp->disp_max_unbound_pri = tpri;
}
/*
 * disp_getbest()
 *   De-queue the highest priority unbound runnable thread.
 *   Returns with the thread unlocked and onproc but at splhigh (like disp()).
 *   Returns NULL if nothing found.
 *   Returns T_DONTSTEAL if the thread was not stealable,
 *   so that the caller will try again later.
 *
 *   Passed a pointer to a dispatch queue not associated with this CPU.
 */
static kthread_t *
disp_getbest(disp_t *dp)
{
	kthread_t	*tp;
	dispq_t		*dq;
	pri_t		pri;
	cpu_t		*cp, *tcp;
	boolean_t	allbound;

	disp_lock_enter(&dp->disp_lock);

	/*
	 * If there is nothing to run, or the CPU is in the middle of a
	 * context switch of the only thread, return NULL.
	 */
	tcp = dp->disp_cpu;
	cp = CPU;
	pri = dp->disp_max_unbound_pri;
	if (pri == -1 ||
	    (tcp != NULL && (tcp->cpu_disp_flags & CPU_DISP_DONTSTEAL) &&
	    tcp->cpu_disp->disp_nrunnable == 1)) {
		disp_lock_exit_nopreempt(&dp->disp_lock);
		return (NULL);
	}

	dq = &dp->disp_q[pri];

	/*
	 * Assume that all threads are bound on this queue, and change it
	 * later when we find out that it is not the case.
	 */
	allbound = B_TRUE;
	for (tp = dq->dq_first; tp != NULL; tp = tp->t_link) {
		hrtime_t now, nosteal, rqtime;

		/*
		 * Skip over bound threads which could be here even
		 * though disp_max_unbound_pri indicated this level.
		 */
		if (tp->t_bound_cpu || tp->t_weakbound_cpu)
			continue;

		/*
		 * We've got some unbound threads on this queue, so turn
		 * the allbound flag off now.
		 */
		allbound = B_FALSE;

		/*
		 * The thread is a candidate for stealing from its run queue. We
		 * don't want to steal threads that became runnable just a
		 * moment ago. This improves CPU affinity for threads that get
		 * preempted for short periods of time and go back on the run
		 * queue.
		 *
		 * We want to let it stay on its run queue if it was only placed
		 * there recently and it was running on the same CPU before that
		 * to preserve its cache investment. For the thread to remain on
		 * its run queue, ALL of the following conditions must be
		 * satisfied:
		 *
		 * - the disp queue should not be the kernel preemption queue
		 * - delayed idle stealing should not be disabled
		 * - nosteal_nsec should be non-zero
		 * - it should run with user priority
		 * - it should be on the run queue of the CPU where it was
		 *   running before being placed on the run queue
		 * - it should be the only thread on the run queue (to prevent
		 *   extra scheduling latency for other threads)
		 * - it should sit on the run queue for less than per-chip
		 *   nosteal interval or global nosteal interval
		 * - in case of CPUs with shared cache it should sit in a run
		 *   queue of a CPU from a different chip
		 *
		 * The checks are arranged so that the ones that are faster are
		 * done earlier.
		 */
		if (tcp == NULL ||
		    pri >= minclsyspri ||
		    tp->t_cpu != tcp)
			break;

		/*
		 * Steal immediately if, due to CMT processor architecture
		 * migration between cp and tcp would incur no performance
		 * penalty.
		 */
		if (pg_cmt_can_migrate(cp, tcp))
			break;

		nosteal = nosteal_nsec;
		if (nosteal == 0)
			break;

		/*
		 * Calculate time spent sitting on run queue
		 */
		now = gethrtime_unscaled();
		rqtime = now - tp->t_waitrq;
		scalehrtime(&rqtime);

		/*
		 * Steal immediately if the time spent on this run queue is more
		 * than allowed nosteal delay.
		 *
		 * Negative rqtime check is needed here to avoid infinite
		 * stealing delays caused by unlikely but not impossible
		 * drifts between CPU times on different CPUs.
		 */
		if (rqtime > nosteal || rqtime < 0)
			break;

		DTRACE_PROBE4(nosteal, kthread_t *, tp,
		    cpu_t *, tcp, cpu_t *, cp, hrtime_t, rqtime);
		scalehrtime(&now);

		/*
		 * Calculate when this thread becomes stealable
		 */
		now += (nosteal - rqtime);

		/*
		 * Calculate time when some thread becomes stealable
		 */
		if (now < dp->disp_steal)
			dp->disp_steal = now;
	}

	/*
	 * If there were no unbound threads on this queue, find the queue
	 * where they are and then return later. The value of
	 * disp_max_unbound_pri is not always accurate because it isn't
	 * reduced until another idle CPU looks for work.
	 */
	if (allbound)
		disp_fix_unbound_pri(dp, pri);

	/*
	 * If we reached the end of the queue and found no unbound threads
	 * then return NULL so that other CPUs will be considered.  If there
	 * are unbound threads but they cannot yet be stolen, then
	 * return T_DONTSTEAL and try again later.
	 */
	if (tp == NULL) {
		disp_lock_exit_nopreempt(&dp->disp_lock);
		return (allbound ? NULL : T_DONTSTEAL);
	}

	/*
	 * Found a runnable, unbound thread, so remove it from queue.
	 * dispdeq() requires that we have the thread locked, and we do,
	 * by virtue of holding the dispatch queue lock.  dispdeq() will
	 * put the thread in transition state, thereby dropping the dispq
	 * lock.
	 */
#ifdef DEBUG
	{
		int	thread_was_on_queue;

		thread_was_on_queue = dispdeq(tp);	/* drops disp_lock */
		ASSERT(thread_was_on_queue);
	}
#else /* DEBUG */
	(void) dispdeq(tp);			/* drops disp_lock */
#endif /* DEBUG */

	/*
	 * Reset the disp_queue steal time - we do not know what is the smallest
	 * value across the queue is.
	 */
	dp->disp_steal = 0;

	/*
	 * Setup thread to run on the current CPU.
	 */
	tp->t_disp_queue = cp->cpu_disp;

	cp->cpu_dispthread = tp;		/* protected by spl only */
	cp->cpu_dispatch_pri = pri;

	/*
	 * There can be a memory synchronization race between disp_getbest()
	 * and disp_ratify() vs cpu_resched() where cpu_resched() is trying
	 * to preempt the current thread to run the enqueued thread while
	 * disp_getbest() and disp_ratify() are changing the current thread
	 * to the stolen thread. This may lead to a situation where
	 * cpu_resched() tries to preempt the wrong thread and the
	 * stolen thread continues to run on the CPU which has been tagged
	 * for preemption.
	 * Later the clock thread gets enqueued but doesn't get to run on the
	 * CPU causing the system to hang.
	 *
	 * To avoid this, grabbing and dropping the disp_lock (which does
	 * a memory barrier) is needed to synchronize the execution of
	 * cpu_resched() with disp_getbest() and disp_ratify() and
	 * synchronize the memory read and written by cpu_resched(),
	 * disp_getbest(), and disp_ratify() with each other.
	 * (see CR#6482861 for more details).
	 */
	disp_lock_enter_high(&cp->cpu_disp->disp_lock);
	disp_lock_exit_high(&cp->cpu_disp->disp_lock);

	ASSERT(pri == DISP_PRIO(tp));

	DTRACE_PROBE3(steal, kthread_t *, tp, cpu_t *, tcp, cpu_t *, cp);

	thread_onproc(tp, cp);			/* set t_state to TS_ONPROC */

	/*
	 * Return with spl high so that swtch() won't need to raise it.
	 * The disp_lock was dropped by dispdeq().
	 */
	return (tp);
}

/*
 * disp_bound_common() - common routine for higher level functions
 *	that check for bound threads under certain conditions.
 *	If 'threadlistsafe' is set then there is no need to acquire
 *	pidlock to stop the thread list from changing (eg, if
 *	disp_bound_* is called with cpus paused).
 */
static int
disp_bound_common(cpu_t *cp, int threadlistsafe, int flag)
{
	int		found = 0;
	kthread_t	*tp;

	ASSERT(flag);

	if (!threadlistsafe)
		mutex_enter(&pidlock);
	tp = curthread;		/* faster than allthreads */
	do {
		if (tp->t_state != TS_FREE) {
			/*
			 * If an interrupt thread is busy, but the
			 * caller doesn't care (i.e. BOUND_INTR is off),
			 * then just ignore it and continue through.
			 */
			if ((tp->t_flag & T_INTR_THREAD) &&
			    !(flag & BOUND_INTR))
				continue;

			/*
			 * Skip the idle thread for the CPU
			 * we're about to set offline.
			 */
			if (tp == cp->cpu_idle_thread)
				continue;

			/*
			 * Skip the pause thread for the CPU
			 * we're about to set offline.
			 */
			if (tp == cp->cpu_pause_thread)
				continue;

			if ((flag & BOUND_CPU) &&
			    (tp->t_bound_cpu == cp ||
			    tp->t_bind_cpu == cp->cpu_id ||
			    tp->t_weakbound_cpu == cp)) {
				found = 1;
				break;
			}

			if ((flag & BOUND_PARTITION) &&
			    (tp->t_cpupart == cp->cpu_part)) {
				found = 1;
				break;
			}
		}
	} while ((tp = tp->t_next) != curthread && found == 0);
	if (!threadlistsafe)
		mutex_exit(&pidlock);
	return (found);
}
/*
 * disp_bound_threads - return nonzero if threads are bound to the processor.
 *	Called infrequently.  Keep this simple.
 *	Includes threads that are asleep or stopped but not onproc.
 */
int
disp_bound_threads(cpu_t *cp, int threadlistsafe)
{
	return (disp_bound_common(cp, threadlistsafe, BOUND_CPU));
}

/*
 * disp_bound_anythreads - return nonzero if _any_ threads are bound
 * to the given processor, including interrupt threads.
 */
int
disp_bound_anythreads(cpu_t *cp, int threadlistsafe)
{
	return (disp_bound_common(cp, threadlistsafe, BOUND_CPU | BOUND_INTR));
}

/*
 * disp_bound_partition - return nonzero if threads are bound to the same
 * partition as the processor.
 *	Called infrequently.  Keep this simple.
 *	Includes threads that are asleep or stopped but not onproc.
 */
int
disp_bound_partition(cpu_t *cp, int threadlistsafe)
{
	return (disp_bound_common(cp, threadlistsafe, BOUND_PARTITION));
}
/*
 * disp_cpu_inactive - make a CPU inactive by moving all of its unbound
 * threads to other CPUs.
 */
void
disp_cpu_inactive(cpu_t *cp)
{
	kthread_t	*tp;
	disp_t		*dp = cp->cpu_disp;
	dispq_t		*dq;
	pri_t		pri;
	int		wasonq;

	disp_lock_enter(&dp->disp_lock);
	while ((pri = dp->disp_max_unbound_pri) != -1) {
		dq = &dp->disp_q[pri];
		tp = dq->dq_first;

		/*
		 * Skip over bound threads.
		 */
		while (tp != NULL && tp->t_bound_cpu != NULL) {
			tp = tp->t_link;
		}

		if (tp == NULL) {
			/* disp_max_unbound_pri must be inaccurate, so fix it */
			disp_fix_unbound_pri(dp, pri);
			continue;
		}

		wasonq = dispdeq(tp);		/* drops disp_lock */
		ASSERT(wasonq);
		ASSERT(tp->t_weakbound_cpu == NULL);

		setbackdq(tp);
		/*
		 * Called from cpu_offline:
		 *
		 * cp has already been removed from the list of active cpus
		 * and tp->t_cpu has been changed so there is no risk of
		 * tp ending up back on cp.
		 *
		 * Called from cpupart_move_cpu:
		 *
		 * The cpu has moved to a new cpupart.  Any threads that
		 * were on its dispatch queues before the move remain
		 * in the old partition and can't run in the new partition.
		 */
		ASSERT(tp->t_cpu != cp);
		thread_unlock(tp);

		disp_lock_enter(&dp->disp_lock);
	}
	disp_lock_exit(&dp->disp_lock);
}
/*
 * disp_lowpri_cpu - find CPU running the lowest priority thread.
 *	The hint passed in is used as a starting point so we don't favor
 *	CPU 0 or any other CPU.  The caller should pass in the most recently
 *	used CPU for the thread.
 *
 *	The lgroup and priority are used to determine the best CPU to run on
 *	in a NUMA machine.  The lgroup specifies which CPUs are closest while
 *	the thread priority will indicate whether the thread will actually run
 *	there.  To pick the best CPU, the CPUs inside and outside of the given
 *	lgroup which are running the lowest priority threads are found.  The
 *	remote CPU is chosen only if the thread will not run locally on a CPU
 *	within the lgroup, but will run on the remote CPU.  If the thread
 *	cannot immediately run on any CPU, the best local CPU will be chosen.
 *
 *	The lpl specified also identifies the cpu partition from which
 *	disp_lowpri_cpu should select a CPU.
 *
 *	curcpu is used to indicate that disp_lowpri_cpu is being called on
 *	behalf of the current thread. (curthread is looking for a new cpu)
 *	In this case, cpu_dispatch_pri for this thread's cpu should be
 *	ignored.
 *
 *	If a cpu is the target of an offline request then try to avoid it.
 *
 *	This function must be called at either high SPL, or with preemption
 *	disabled, so that the "hint" CPU cannot be removed from the online
 *	CPU list while we are traversing it.
 */
cpu_t *
disp_lowpri_cpu(cpu_t *hint, lpl_t *lpl, pri_t tpri, cpu_t *curcpu)
{
	cpu_t	*bestcpu;
	cpu_t	*besthomecpu;
	cpu_t   *cp, *cpstart;

	pri_t   bestpri;
	pri_t   cpupri;

	klgrpset_t	done;
	klgrpset_t	cur_set;

	lpl_t		*lpl_iter, *lpl_leaf;
	int		i;

	/*
	 * Scan for a CPU currently running the lowest priority thread.
	 * Cannot get cpu_lock here because it is adaptive.
	 * We do not require lock on CPU list.
	 */
	ASSERT(hint != NULL);
	ASSERT(lpl != NULL);
	ASSERT(lpl->lpl_ncpu > 0);

	/*
	 * First examine local CPUs. Note that it's possible the hint CPU
	 * passed in is remote to the specified home lgroup. If our priority
	 * isn't sufficient enough such that we can run immediately at home,
	 * then examine CPUs remote to our home lgroup.
	 * We would like to give preference to CPUs closest to "home".
	 * If we can't find a CPU where we'll run at a given level
	 * of locality, we expand our search to include the next level.
	 */
	bestcpu = besthomecpu = NULL;
	klgrpset_clear(done);
	/* start with lpl we were passed */

	lpl_iter = lpl;

	do {
		bestpri = SHRT_MAX;
		klgrpset_clear(cur_set);

		for (i = 0; i < lpl_iter->lpl_nrset; i++) {
			lpl_leaf = lpl_iter->lpl_rset[i];
			if (klgrpset_ismember(done, lpl_leaf->lpl_lgrpid))
				continue;

			klgrpset_add(cur_set, lpl_leaf->lpl_lgrpid);

			if (hint->cpu_lpl == lpl_leaf)
				cp = cpstart = hint;
			else
				cp = cpstart = lpl_leaf->lpl_cpus;

			do {
				if (cp == curcpu)
					cpupri = -1;
				else if (cp == cpu_inmotion)
					cpupri = SHRT_MAX;
				else
					cpupri = cp->cpu_dispatch_pri;
				if (cp->cpu_disp->disp_maxrunpri > cpupri)
					cpupri = cp->cpu_disp->disp_maxrunpri;
				if (cp->cpu_chosen_level > cpupri)
					cpupri = cp->cpu_chosen_level;
				if (cpupri < bestpri) {
					if (CPU_IDLING(cpupri)) {
						ASSERT((cp->cpu_flags &
						    CPU_QUIESCED) == 0);
						return (cp);
					}
					bestcpu = cp;
					bestpri = cpupri;
				}
			} while ((cp = cp->cpu_next_lpl) != cpstart);
		}

		if (bestcpu && (tpri > bestpri)) {
			ASSERT((bestcpu->cpu_flags & CPU_QUIESCED) == 0);
			return (bestcpu);
		}
		if (besthomecpu == NULL)
			besthomecpu = bestcpu;

		/*
		 * Add the lgrps we just considered to the "done" set
		 */
		klgrpset_or(done, cur_set);

	} while ((lpl_iter = lpl_iter->lpl_parent) != NULL);

	/*
	 * The specified priority isn't high enough to run immediately
	 * anywhere, so just return the best CPU from the home lgroup.
	 */
	ASSERT((besthomecpu->cpu_flags & CPU_QUIESCED) == 0);
	return (besthomecpu);
}
/*
 * This routine provides the generic idle cpu function for all processors.
 * If a processor has some specific code to execute when idle (say, to stop
 * the pipeline and save power) then that routine should be defined in the
 * processors specific code (module_xx.c) and the global variable idle_cpu
 * set to that function.
 */
static void
generic_idle_cpu(void)
{
}

/*ARGSUSED*/
static void
generic_enq_thread(cpu_t *cpu, int bound)
{
}