kernel/disp/disp.c (unleashed/tickless.git)
1 /*
2 * CDDL HEADER START
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
19 * CDDL HEADER END
22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
26 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
27 /* All Rights Reserved */
30 #include <sys/types.h>
31 #include <sys/param.h>
32 #include <sys/sysmacros.h>
33 #include <sys/signal.h>
34 #include <sys/user.h>
35 #include <sys/systm.h>
36 #include <sys/sysinfo.h>
37 #include <sys/var.h>
38 #include <sys/errno.h>
39 #include <sys/cmn_err.h>
40 #include <sys/debug.h>
41 #include <sys/inline.h>
42 #include <sys/disp.h>
43 #include <sys/class.h>
44 #include <sys/bitmap.h>
45 #include <sys/kmem.h>
46 #include <sys/cpuvar.h>
47 #include <sys/vtrace.h>
48 #include <sys/tnf.h>
49 #include <sys/cpupart.h>
50 #include <sys/lgrp.h>
51 #include <sys/pg.h>
52 #include <sys/cmt.h>
53 #include <sys/bitset.h>
54 #include <sys/schedctl.h>
55 #include <sys/atomic.h>
56 #include <sys/dtrace.h>
57 #include <sys/sdt.h>
58 #include <sys/archsystm.h>
59 #include <sys/stdbool.h>
61 #include <vm/as.h>
63 #define BOUND_CPU 0x1
64 #define BOUND_PARTITION 0x2
65 #define BOUND_INTR 0x4
67 /* Dispatch queue allocation structure and functions */
68 struct disp_queue_info {
69 disp_t *dp;
70 dispq_t *olddispq;
71 dispq_t *newdispq;
72 ulong_t *olddqactmap;
73 ulong_t *newdqactmap;
74 int oldnglobpris;
76 static void disp_dq_alloc(struct disp_queue_info *dptr, int numpris,
77 disp_t *dp);
78 static void disp_dq_assign(struct disp_queue_info *dptr, int numpris);
79 static void disp_dq_free(struct disp_queue_info *dptr);
81 /* platform-specific routine to call when processor is idle */
82 static void generic_idle_cpu();
83 void (*idle_cpu)() = generic_idle_cpu;
85 /* routines invoked when a CPU enters/exits the idle loop */
86 static void idle_enter();
87 static void idle_exit();
89 /* platform-specific routine to call when thread is enqueued */
90 static void generic_enq_thread(cpu_t *, int);
91 void (*disp_enq_thread)(cpu_t *, int) = generic_enq_thread;
93 pri_t kpreemptpri; /* priority where kernel preemption applies */
94 pri_t upreemptpri = 0; /* priority where normal preemption applies */
95 pri_t intr_pri; /* interrupt thread priority base level */
97 #define KPQPRI -1 /* pri where cpu affinity is dropped for kpq */
98 pri_t kpqpri = KPQPRI; /* can be set in /etc/system */
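/*
 * Illustrative note (not part of the original source): because kpqpri is an
 * ordinary kernel global, it can be overridden at boot via /etc/system, e.g.
 *
 *	set kpqpri = 100
 *
 * (value purely hypothetical). Unbound threads at or above kpqpri are placed
 * on the partition-wide kp queue by setfrontbackdq()/setkpdq() below instead
 * of a per-CPU queue, i.e. CPU affinity is dropped for them.
 */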
99 disp_t cpu0_disp; /* boot CPU's dispatch queue */
100 int nswapped; /* total number of swapped threads */
101 static void disp_swapped_setrun(kthread_t *tp);
102 static void cpu_resched(cpu_t *cp, pri_t tpri);
105 * If this is set, only interrupt threads will cause kernel preemptions.
106 * This is done by changing the value of kpreemptpri. kpreemptpri
107 * will either be the max sysclass pri + 1 or the min interrupt pri.
109 int only_intr_kpreempt;
111 extern void set_idle_cpu(int cpun);
112 extern void unset_idle_cpu(int cpun);
113 static void setkpdq(kthread_t *tp, int borf);
114 #define SETKP_BACK 0
115 #define SETKP_FRONT 1
117 * Parameter that determines how recently a thread must have run
118 * on the CPU to be considered loosely-bound to that CPU to reduce
119 * cold cache effects. The interval is in clock ticks (units of 1/hz seconds).
121 #define RECHOOSE_INTERVAL 3
122 int rechoose_interval = RECHOOSE_INTERVAL;
125 * Parameter that determines how long (in nanoseconds) a thread must
126 * be sitting on a run queue before it can be stolen by another CPU
127 * to reduce migrations.
129 * nosteal_nsec should be set by the platform routine cmp_set_nosteal_interval()
130 * to an appropriate value. nosteal_nsec is set to NOSTEAL_UNINITIALIZED
131 * here, indicating it is uninitialized.
132 * Setting nosteal_nsec to 0 effectively disables the nosteal 'protection'.
135 #define NOSTEAL_UNINITIALIZED (-1)
136 hrtime_t nosteal_nsec = NOSTEAL_UNINITIALIZED;
137 extern void cmp_set_nosteal_interval(void);
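/*
 * Illustrative sketch (assumption, not code from this tree): a platform's
 * cmp_set_nosteal_interval() is only expected to store a platform-appropriate
 * value in nosteal_nsec. A trivial generic version could look like:
 */
#if 0	/* example only, not compiled */
void
cmp_set_nosteal_interval(void)
{
	/* hypothetical default: 100 usec, expressed in nanoseconds */
	nosteal_nsec = 100000;
}
#endif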
139 id_t defaultcid; /* system "default" class; see dispadmin(1M) */
141 disp_lock_t transition_lock; /* lock on transitioning threads */
142 disp_lock_t stop_lock; /* lock on stopped threads */
144 static void cpu_dispqalloc(int numpris);
147 * This gets returned by disp_getwork/disp_getbest if we couldn't steal
148 * a thread because it was sitting on its run queue for a very short
149 * period of time.
151 #define T_DONTSTEAL (kthread_t *)(-1) /* returned by disp_getwork/getbest */
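/*
 * Because of T_DONTSTEAL, callers of disp_getwork()/disp_getbest() must
 * distinguish three outcomes rather than two. Sketch of the typical caller
 * pattern (see idle() and disp() below):
 *
 *	t = disp_getwork(cp);
 *	if (t == NULL)			nothing runnable anywhere
 *	else if (t == T_DONTSTEAL)	work exists, but too fresh to steal
 *	else				run the stolen thread, e.g. swtch_to(t)
 */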
153 static kthread_t *disp_getwork(cpu_t *to);
154 static kthread_t *disp_getbest(disp_t *from);
155 static kthread_t *disp_ratify(kthread_t *tp, disp_t *kpq);
157 void swtch_to(kthread_t *);
160 * dispatcher and scheduler initialization
164 * disp_setup - Common code to calculate and allocate dispatcher
165 * variables and structures based on the maximum priority.
167 static void
168 disp_setup(pri_t maxglobpri, pri_t oldnglobpris)
170 pri_t newnglobpris;
172 ASSERT(MUTEX_HELD(&cpu_lock));
174 newnglobpris = maxglobpri + 1 + LOCK_LEVEL;
176 if (newnglobpris > oldnglobpris) {
178 * Allocate new kp queues for each CPU partition.
180 cpupart_kpqalloc(newnglobpris);
183 * Allocate new dispatch queues for each CPU.
185 cpu_dispqalloc(newnglobpris);
188 * compute new interrupt thread base priority
190 intr_pri = maxglobpri;
191 if (only_intr_kpreempt) {
192 kpreemptpri = intr_pri + 1;
193 if (kpqpri == KPQPRI)
194 kpqpri = kpreemptpri;
196 v.v_nglobpris = newnglobpris;
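/*
 * Worked example (illustrative figures only): with the usual global priority
 * layout (TS/IA 0-59, SYS 60-99, RT 100-159) the largest class-reported
 * priority is 159, so disp_setup() computes
 *
 *	newnglobpris = 159 + 1 + LOCK_LEVEL (10) = 170
 *
 * leaving the top LOCK_LEVEL slots (160-169) for interrupt threads, whose
 * base priority intr_pri is set to maxglobpri above.
 */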
201 * dispinit - Called to initialize all loaded classes and the
202 * dispatcher framework.
204 void
205 dispinit(void)
207 id_t cid;
208 pri_t maxglobpri;
209 pri_t cl_maxglobpri;
211 maxglobpri = -1;
214 * Initialize transition lock, which will always be set.
216 DISP_LOCK_INIT(&transition_lock);
217 disp_lock_enter_high(&transition_lock);
218 DISP_LOCK_INIT(&stop_lock);
220 mutex_enter(&cpu_lock);
221 CPU->cpu_disp->disp_maxrunpri = -1;
222 CPU->cpu_disp->disp_max_unbound_pri = -1;
225 * Initialize the default CPU partition.
227 cpupart_initialize_default();
229 * Call the class specific initialization functions for
230 * all pre-installed schedulers.
232 * We pass the size of a class specific parameter
233 * buffer to each of the initialization functions
234 * to try to catch problems with backward compatibility
235 * of class modules.
237 * For example a new class module running on an old system
238 * which didn't provide sufficiently large parameter buffers
239 * would be bad news. Class initialization modules can check for
240 * this and take action if they detect a problem.
243 for (cid = 0; cid < nclass; cid++) {
244 sclass_t *sc;
246 sc = &sclass[cid];
247 if (SCHED_INSTALLED(sc)) {
248 cl_maxglobpri = sc->cl_init(cid, PC_CLPARMSZ,
249 &sc->cl_funcs);
250 if (cl_maxglobpri > maxglobpri)
251 maxglobpri = cl_maxglobpri;
254 kpreemptpri = (pri_t)v.v_maxsyspri + 1;
255 if (kpqpri == KPQPRI)
256 kpqpri = kpreemptpri;
258 ASSERT(maxglobpri >= 0);
259 disp_setup(maxglobpri, 0);
261 mutex_exit(&cpu_lock);
264 * Platform specific sticky scheduler setup.
266 if (nosteal_nsec == NOSTEAL_UNINITIALIZED)
267 cmp_set_nosteal_interval();
270 * Get the default class ID; this may be later modified via
271 * dispadmin(1M). This will load the class (normally TS) and that will
272 * call disp_add(), which is why we had to drop cpu_lock first.
274 if (getcid(defaultclass, &defaultcid) != 0) {
275 cmn_err(CE_PANIC, "Couldn't load default scheduling class '%s'",
276 defaultclass);
281 * disp_add - Called with class pointer to initialize the dispatcher
282 * for a newly loaded class.
284 void
285 disp_add(sclass_t *clp)
287 pri_t maxglobpri;
288 pri_t cl_maxglobpri;
290 mutex_enter(&cpu_lock);
292 * Initialize the scheduler class.
294 maxglobpri = (pri_t)(v.v_nglobpris - LOCK_LEVEL - 1);
295 cl_maxglobpri = clp->cl_init(clp - sclass, PC_CLPARMSZ, &clp->cl_funcs);
296 if (cl_maxglobpri > maxglobpri)
297 maxglobpri = cl_maxglobpri;
300 * Save old queue information. Since we're initializing a
301 * new scheduling class which has just been loaded,
302 * the size of the dispq may have changed. We need to handle
303 * that here.
305 disp_setup(maxglobpri, v.v_nglobpris);
307 mutex_exit(&cpu_lock);
312 * For each CPU, allocate new dispatch queues
313 * with the stated number of priorities.
315 static void
316 cpu_dispqalloc(int numpris)
318 cpu_t *cpup;
319 struct disp_queue_info *disp_mem;
320 int i, num;
322 ASSERT(MUTEX_HELD(&cpu_lock));
324 disp_mem = kmem_zalloc(NCPU *
325 sizeof (struct disp_queue_info), KM_SLEEP);
328 * This routine must allocate all of the memory before stopping
329 * the cpus because it must not sleep in kmem_alloc while the
330 * CPUs are stopped. Locks they hold will not be freed until they
331 * are restarted.
333 i = 0;
334 cpup = cpu_list;
335 do {
336 disp_dq_alloc(&disp_mem[i], numpris, cpup->cpu_disp);
337 i++;
338 cpup = cpup->cpu_next;
339 } while (cpup != cpu_list);
340 num = i;
342 pause_cpus(NULL, NULL);
343 for (i = 0; i < num; i++)
344 disp_dq_assign(&disp_mem[i], numpris);
345 start_cpus();
348 * I must free all of the memory after starting the cpus because
349 * I can not risk sleeping in kmem_free while the cpus are stopped.
351 for (i = 0; i < num; i++)
352 disp_dq_free(&disp_mem[i]);
354 kmem_free(disp_mem, NCPU * sizeof (struct disp_queue_info));
357 static void
358 disp_dq_alloc(struct disp_queue_info *dptr, int numpris, disp_t *dp)
360 dptr->newdispq = kmem_zalloc(numpris * sizeof (dispq_t), KM_SLEEP);
361 dptr->newdqactmap = kmem_zalloc(((numpris / BT_NBIPUL) + 1) *
362 sizeof (long), KM_SLEEP);
363 dptr->dp = dp;
366 static void
367 disp_dq_assign(struct disp_queue_info *dptr, int numpris)
369 disp_t *dp;
371 dp = dptr->dp;
372 dptr->olddispq = dp->disp_q;
373 dptr->olddqactmap = dp->disp_qactmap;
374 dptr->oldnglobpris = dp->disp_npri;
376 ASSERT(dptr->oldnglobpris < numpris);
378 if (dptr->olddispq != NULL) {
380 * Use kcopy because bcopy is platform-specific
381 * and could block while we might have paused the cpus.
383 (void) kcopy(dptr->olddispq, dptr->newdispq,
384 dptr->oldnglobpris * sizeof (dispq_t));
385 (void) kcopy(dptr->olddqactmap, dptr->newdqactmap,
386 ((dptr->oldnglobpris / BT_NBIPUL) + 1) *
387 sizeof (long));
389 dp->disp_q = dptr->newdispq;
390 dp->disp_qactmap = dptr->newdqactmap;
391 dp->disp_q_limit = &dptr->newdispq[numpris];
392 dp->disp_npri = numpris;
395 static void
396 disp_dq_free(struct disp_queue_info *dptr)
398 if (dptr->olddispq != NULL)
399 kmem_free(dptr->olddispq,
400 dptr->oldnglobpris * sizeof (dispq_t));
401 if (dptr->olddqactmap != NULL)
402 kmem_free(dptr->olddqactmap,
403 ((dptr->oldnglobpris / BT_NBIPUL) + 1) * sizeof (long));
407 * For a newly created CPU, initialize the dispatch queue.
408 * This is called before the CPU is known through cpu[] or on any lists.
410 void
411 disp_cpu_init(cpu_t *cp)
413 disp_t *dp;
414 dispq_t *newdispq;
415 ulong_t *newdqactmap;
417 ASSERT(MUTEX_HELD(&cpu_lock)); /* protect dispatcher queue sizes */
419 if (cp == cpu0_disp.disp_cpu)
420 dp = &cpu0_disp;
421 else
422 dp = kmem_alloc(sizeof (disp_t), KM_SLEEP);
423 bzero(dp, sizeof (disp_t));
424 cp->cpu_disp = dp;
425 dp->disp_cpu = cp;
426 dp->disp_maxrunpri = -1;
427 dp->disp_max_unbound_pri = -1;
428 DISP_LOCK_INIT(&cp->cpu_thread_lock);
430 * Allocate memory for the dispatcher queue headers
431 * and the active queue bitmap.
433 newdispq = kmem_zalloc(v.v_nglobpris * sizeof (dispq_t), KM_SLEEP);
434 newdqactmap = kmem_zalloc(((v.v_nglobpris / BT_NBIPUL) + 1) *
435 sizeof (long), KM_SLEEP);
436 dp->disp_q = newdispq;
437 dp->disp_qactmap = newdqactmap;
438 dp->disp_q_limit = &newdispq[v.v_nglobpris];
439 dp->disp_npri = v.v_nglobpris;
442 void
443 disp_cpu_fini(cpu_t *cp)
445 ASSERT(MUTEX_HELD(&cpu_lock));
447 disp_kp_free(cp->cpu_disp);
448 if (cp->cpu_disp != &cpu0_disp)
449 kmem_free(cp->cpu_disp, sizeof (disp_t));
453 * Allocate new, larger kpreempt dispatch queue to replace the old one.
455 void
456 disp_kp_alloc(disp_t *dq, pri_t npri)
458 struct disp_queue_info mem_info;
460 if (npri > dq->disp_npri) {
462 * Allocate memory for the new array.
464 disp_dq_alloc(&mem_info, npri, dq);
467 * We need to copy the old structures to the new
468 * and free the old.
470 disp_dq_assign(&mem_info, npri);
471 disp_dq_free(&mem_info);
476 * Free dispatch queue.
477 * Used for the kpreempt queues for a removed CPU partition and
478 * for the per-CPU queues of deleted CPUs.
480 void
481 disp_kp_free(disp_t *dq)
483 struct disp_queue_info mem_info;
485 mem_info.olddispq = dq->disp_q;
486 mem_info.olddqactmap = dq->disp_qactmap;
487 mem_info.oldnglobpris = dq->disp_npri;
488 disp_dq_free(&mem_info);
492 * End dispatcher and scheduler initialization.
496 * See if there's anything to do other than remain idle.
497 * Return non-zero if there is.
499 * This function must be called with high spl, or with
500 * kernel preemption disabled to prevent the partition's
501 * active cpu list from changing while being traversed.
503 * This is essentially a simpler version of disp_getwork()
504 * to be called by CPUs preparing to "halt".
507 disp_anywork(void)
509 cpu_t *cp = CPU;
510 cpu_t *ocp;
511 volatile int *local_nrunnable = &cp->cpu_disp->disp_nrunnable;
513 if (!(cp->cpu_flags & CPU_OFFLINE)) {
514 if (CP_MAXRUNPRI(cp->cpu_part) >= 0)
515 return (1);
517 for (ocp = cp->cpu_next_part; ocp != cp;
518 ocp = ocp->cpu_next_part) {
519 ASSERT(CPU_ACTIVE(ocp));
522 * Something has appeared on the local run queue.
524 if (*local_nrunnable > 0)
525 return (1);
527 * If we encounter another idle CPU that will
528 * soon be trolling around through disp_anywork(),
529 * terminate our walk here and let this other CPU
530 * patrol the next part of the list.
532 if (ocp->cpu_dispatch_pri == -1 &&
533 (ocp->cpu_disp_flags & CPU_DISP_HALTED) == 0)
534 return (0);
536 * Work can be taken from another CPU if:
537 * - There is unbound work on the run queue
538 * - That work isn't a thread undergoing a
539 *   context switch on an otherwise empty queue.
540 * - The CPU isn't running the idle loop.
542 if (ocp->cpu_disp->disp_max_unbound_pri != -1 &&
543 !((ocp->cpu_disp_flags & CPU_DISP_DONTSTEAL) &&
544 ocp->cpu_disp->disp_nrunnable == 1) &&
545 ocp->cpu_dispatch_pri != -1)
546 return (1);
549 return (0);
553 * Called when CPU enters the idle loop
555 static void
556 idle_enter()
558 cpu_t *cp = CPU;
560 new_cpu_mstate(CMS_IDLE, gethrtime_unscaled());
561 CPU_STATS_ADDQ(cp, sys, idlethread, 1);
562 set_idle_cpu(cp->cpu_id); /* arch-dependent hook */
566 * Called when CPU exits the idle loop
568 static void
569 idle_exit()
571 cpu_t *cp = CPU;
573 new_cpu_mstate(CMS_SYSTEM, gethrtime_unscaled());
574 unset_idle_cpu(cp->cpu_id); /* arch-dependent hook */
578 * Idle loop.
580 void
581 idle()
583 struct cpu *cp = CPU; /* pointer to this CPU */
584 kthread_t *t; /* taken thread */
586 idle_enter();
589 * Uniprocessor version of idle loop.
590 * Do this until notified that we're on an actual multiprocessor.
592 while (ncpus == 1) {
593 if (cp->cpu_disp->disp_nrunnable == 0) {
594 (*idle_cpu)();
595 continue;
597 idle_exit();
598 swtch();
600 idle_enter(); /* returned from swtch */
604 * Multiprocessor idle loop.
606 for (;;) {
608 * If CPU is completely quiesced by p_online(2), just wait
609 * here with minimal bus traffic until put online.
611 while (cp->cpu_flags & CPU_QUIESCED)
612 (*idle_cpu)();
614 if (cp->cpu_disp->disp_nrunnable != 0) {
615 idle_exit();
616 swtch();
617 } else {
618 if (cp->cpu_flags & CPU_OFFLINE)
619 continue;
620 if ((t = disp_getwork(cp)) == NULL) {
621 if (cp->cpu_chosen_level != -1) {
622 disp_t *dp = cp->cpu_disp;
623 disp_t *kpq;
625 disp_lock_enter(&dp->disp_lock);
627 * Set kpq under lock to prevent
628 * migration between partitions.
630 kpq = &cp->cpu_part->cp_kp_queue;
631 if (kpq->disp_maxrunpri == -1)
632 cp->cpu_chosen_level = -1;
633 disp_lock_exit(&dp->disp_lock);
635 (*idle_cpu)();
636 continue;
639 * If there was a thread but we couldn't steal
640 * it, then keep trying.
642 if (t == T_DONTSTEAL)
643 continue;
644 idle_exit();
645 swtch_to(t);
647 idle_enter(); /* returned from swtch/swtch_to */
653 * Preempt the currently running thread in favor of the highest
654 * priority thread. The class of the current thread controls
655 * where it goes on the dispatcher queues. If panicking, turn
656 * preemption off.
658 void
659 preempt()
661 kthread_t *t = curthread;
662 klwp_t *lwp = ttolwp(curthread);
664 if (panicstr)
665 return;
667 TRACE_0(TR_FAC_DISP, TR_PREEMPT_START, "preempt_start");
669 thread_lock(t);
671 if (t->t_state != TS_ONPROC || t->t_disp_queue != CPU->cpu_disp) {
673 * this thread has already been chosen to be run on
674 * another CPU. Clear kprunrun on this CPU since we're
675 * already headed for swtch().
677 CPU->cpu_kprunrun = 0;
678 thread_unlock_nopreempt(t);
679 TRACE_0(TR_FAC_DISP, TR_PREEMPT_END, "preempt_end");
680 } else {
681 if (lwp != NULL)
682 lwp->lwp_ru.nivcsw++;
683 CPU_STATS_ADDQ(CPU, sys, inv_swtch, 1);
684 THREAD_TRANSITION(t);
685 CL_PREEMPT(t);
686 DTRACE_SCHED(preempt);
687 thread_unlock_nopreempt(t);
689 TRACE_0(TR_FAC_DISP, TR_PREEMPT_END, "preempt_end");
691 swtch(); /* clears CPU->cpu_runrun via disp() */
695 extern kthread_t *thread_unpin();
698 * disp() - find the highest priority thread for this processor to run, and
699 * set it in TS_ONPROC state so that resume() can be called to run it.
701 static kthread_t *
702 disp()
704 cpu_t *cpup;
705 disp_t *dp;
706 kthread_t *tp;
707 dispq_t *dq;
708 int maxrunword;
709 pri_t pri;
710 disp_t *kpq;
712 TRACE_0(TR_FAC_DISP, TR_DISP_START, "disp_start");
714 cpup = CPU;
716 * Find the highest priority loaded, runnable thread.
718 dp = cpup->cpu_disp;
720 reschedule:
722 * If there is more important work on the global queue with a better
723 * priority than the maximum on this CPU, take it now.
725 kpq = &cpup->cpu_part->cp_kp_queue;
726 while ((pri = kpq->disp_maxrunpri) >= 0 &&
727 pri >= dp->disp_maxrunpri &&
728 (cpup->cpu_flags & CPU_OFFLINE) == 0 &&
729 (tp = disp_getbest(kpq)) != NULL) {
730 if (disp_ratify(tp, kpq) != NULL) {
731 TRACE_1(TR_FAC_DISP, TR_DISP_END,
732 "disp_end:tid %p", tp);
733 return (tp);
737 disp_lock_enter(&dp->disp_lock);
738 pri = dp->disp_maxrunpri;
741 * If there is nothing to run, look at what's runnable on other queues.
742 * Choose the idle thread if the CPU is quiesced.
743 * Note that CPUs that have the CPU_OFFLINE flag set can still run
744 * interrupt threads, which will be the only threads on the CPU's own
745 * queue, but cannot run threads from other queues.
747 if (pri == -1) {
748 if (!(cpup->cpu_flags & CPU_OFFLINE)) {
749 disp_lock_exit(&dp->disp_lock);
750 if ((tp = disp_getwork(cpup)) == NULL ||
751 tp == T_DONTSTEAL) {
752 tp = cpup->cpu_idle_thread;
753 (void) splhigh();
754 THREAD_ONPROC(tp, cpup);
755 cpup->cpu_dispthread = tp;
756 cpup->cpu_dispatch_pri = -1;
757 cpup->cpu_runrun = cpup->cpu_kprunrun = 0;
758 cpup->cpu_chosen_level = -1;
760 } else {
761 disp_lock_exit_high(&dp->disp_lock);
762 tp = cpup->cpu_idle_thread;
763 THREAD_ONPROC(tp, cpup);
764 cpup->cpu_dispthread = tp;
765 cpup->cpu_dispatch_pri = -1;
766 cpup->cpu_runrun = cpup->cpu_kprunrun = 0;
767 cpup->cpu_chosen_level = -1;
769 TRACE_1(TR_FAC_DISP, TR_DISP_END,
770 "disp_end:tid %p", tp);
771 return (tp);
774 dq = &dp->disp_q[pri];
775 tp = dq->dq_first;
777 ASSERT(tp != NULL);
779 DTRACE_SCHED2(dequeue, kthread_t *, tp, disp_t *, dp);
782 * Found it so remove it from queue.
784 dp->disp_nrunnable--;
785 dq->dq_sruncnt--;
786 if ((dq->dq_first = tp->t_link) == NULL) {
787 ulong_t *dqactmap = dp->disp_qactmap;
789 ASSERT(dq->dq_sruncnt == 0);
790 dq->dq_last = NULL;
793 * The queue is empty, so the corresponding bit needs to be
794 * turned off in dqactmap. If nrunnable != 0, we just took the
795 * last runnable thread off the highest queue,
796 * so recompute disp_maxrunpri.
798 maxrunword = pri >> BT_ULSHIFT;
799 dqactmap[maxrunword] &= ~BT_BIW(pri);
801 if (dp->disp_nrunnable == 0) {
802 dp->disp_max_unbound_pri = -1;
803 dp->disp_maxrunpri = -1;
804 } else {
805 int ipri;
807 ipri = bt_gethighbit(dqactmap, maxrunword);
808 dp->disp_maxrunpri = ipri;
809 if (ipri < dp->disp_max_unbound_pri)
810 dp->disp_max_unbound_pri = ipri;
812 } else {
813 tp->t_link = NULL;
816 cpup->cpu_dispthread = tp; /* protected by spl only */
817 cpup->cpu_dispatch_pri = pri;
818 ASSERT(pri == DISP_PRIO(tp));
819 thread_onproc(tp, cpup); /* set t_state to TS_ONPROC */
820 disp_lock_exit_high(&dp->disp_lock); /* drop run queue lock */
822 ASSERT(tp != NULL);
823 TRACE_1(TR_FAC_DISP, TR_DISP_END,
824 "disp_end:tid %p", tp);
826 if (disp_ratify(tp, kpq) == NULL)
827 goto reschedule;
829 return (tp);
833 * swtch()
834 * Find best runnable thread and run it.
835 * Called with the current thread already switched to a new state,
836 * on a sleep queue, a run queue, or stopped, and not zombied.
837 * May be called at any spl level less than or equal to LOCK_LEVEL.
838 * Always drops spl to the base level (spl0()).
840 void
841 swtch()
843 kthread_t *t = curthread;
844 kthread_t *next;
845 cpu_t *cp;
847 TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");
849 if (t->t_flag & T_INTR_THREAD)
850 cpu_intr_swtch_enter(t);
852 if (t->t_intr != NULL) {
854 * We are an interrupt thread. Set up and return
855 * the interrupted thread to be resumed.
857 (void) splhigh(); /* block other scheduler action */
858 cp = CPU; /* now protected against migration */
859 ASSERT(CPU_ON_INTR(cp) == 0); /* not called with PIL > 10 */
860 CPU_STATS_ADDQ(cp, sys, pswitch, 1);
861 CPU_STATS_ADDQ(cp, sys, intrblk, 1);
862 next = thread_unpin();
863 TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
864 resume_from_intr(next);
865 } else {
866 #ifdef DEBUG
867 if (t->t_state == TS_ONPROC &&
868 t->t_disp_queue->disp_cpu == CPU &&
869 t->t_preempt == 0) {
870 thread_lock(t);
871 ASSERT(t->t_state != TS_ONPROC ||
872 t->t_disp_queue->disp_cpu != CPU ||
873 t->t_preempt != 0); /* cannot migrate */
874 thread_unlock_nopreempt(t);
876 #endif /* DEBUG */
877 cp = CPU;
878 next = disp(); /* returns with spl high */
879 ASSERT(CPU_ON_INTR(cp) == 0); /* not called with PIL > 10 */
881 /* OK to steal anything left on run queue */
882 cp->cpu_disp_flags &= ~CPU_DISP_DONTSTEAL;
884 if (next != t) {
885 hrtime_t now;
887 now = gethrtime_unscaled();
888 pg_ev_thread_swtch(cp, now, t, next);
891 * If t was previously in the TS_ONPROC state,
892 * setfrontdq and setbackdq won't have set its t_waitrq.
893 * Since we now finally know that we're switching away
894 * from this thread, set its t_waitrq if it is on a run
895 * queue.
897 if ((t->t_state == TS_RUN) && (t->t_waitrq == 0)) {
898 t->t_waitrq = now;
902 * restore mstate of thread that we are switching to
904 restore_mstate(next);
906 CPU_STATS_ADDQ(cp, sys, pswitch, 1);
907 cp->cpu_last_swtch = t->t_disp_time = ddi_get_lbolt();
908 TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
910 if (dtrace_vtime_active)
911 dtrace_vtime_switch(next);
913 resume(next);
915 * The TR_RESUME_END and TR_SWTCH_END trace points
916 * appear at the end of resume(), because we may not
917 * return here
919 } else {
920 if (t->t_flag & T_INTR_THREAD)
921 cpu_intr_swtch_exit(t);
923 * Threads that enqueue themselves on a run queue defer
924 * setting t_waitrq. It is then either set in swtch()
925 * when the CPU is actually yielded, or not at all if it
926 * is remaining on the CPU.
927 * There is however a window between where the thread
928 * placed itself on a run queue, and where it selects
929 * itself in disp(), where a third party (eg. clock()
930 * doing tick processing) may have re-enqueued this
931 * thread, setting t_waitrq in the process. We detect
932 * this race by noticing that despite switching to
933 * ourself, our t_waitrq has been set, and should be
934 * cleared.
936 if (t->t_waitrq != 0)
937 t->t_waitrq = 0;
939 pg_ev_thread_remain(cp, t);
941 DTRACE_SCHED(remain__cpu);
942 TRACE_0(TR_FAC_DISP, TR_SWTCH_END, "swtch_end");
943 (void) spl0();
949 * swtch_from_zombie()
950 * Special case of swtch(), which allows checks for TS_ZOMB to be
951 * eliminated from normal resume.
952 * Find best runnable thread and run it.
953 * Called with the current thread zombied.
954 * Zombies cannot migrate, so CPU references are safe.
956 void
957 swtch_from_zombie()
959 kthread_t *next;
960 cpu_t *cpu = CPU;
962 TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");
964 ASSERT(curthread->t_state == TS_ZOMB);
966 next = disp(); /* returns with spl high */
967 ASSERT(CPU_ON_INTR(CPU) == 0); /* not called with PIL > 10 */
968 CPU_STATS_ADDQ(CPU, sys, pswitch, 1);
969 ASSERT(next != curthread);
970 TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
972 pg_ev_thread_swtch(cpu, gethrtime_unscaled(), curthread, next);
974 restore_mstate(next);
976 if (dtrace_vtime_active)
977 dtrace_vtime_switch(next);
979 resume_from_zombie(next);
981 * The TR_RESUME_END and TR_SWTCH_END trace points
982 * appear at the end of resume(), because we certainly will not
983 * return here
987 #if defined(DEBUG) && (defined(DISP_DEBUG) || defined(lint))
990 * search_disp_queues()
991 * Search the given dispatch queues for thread tp.
992 * Return 1 if tp is found, otherwise return 0.
994 static int
995 search_disp_queues(disp_t *dp, kthread_t *tp)
997 dispq_t *dq;
998 dispq_t *eq;
1000 disp_lock_enter_high(&dp->disp_lock);
1002 for (dq = dp->disp_q, eq = dp->disp_q_limit; dq < eq; ++dq) {
1003 kthread_t *rp;
1005 ASSERT(dq->dq_last == NULL || dq->dq_last->t_link == NULL);
1007 for (rp = dq->dq_first; rp; rp = rp->t_link)
1008 if (tp == rp) {
1009 disp_lock_exit_high(&dp->disp_lock);
1010 return (1);
1013 disp_lock_exit_high(&dp->disp_lock);
1015 return (0);
1019 * thread_on_queue()
1020 * Search all per-CPU dispatch queues and all partition-wide kpreempt
1021 * queues for thread tp. Return 1 if tp is found, otherwise return 0.
1023 static int
1024 thread_on_queue(kthread_t *tp)
1026 cpu_t *cp;
1027 struct cpupart *part;
1029 ASSERT(getpil() >= DISP_LEVEL);
1032 * Search the per-CPU dispatch queues for tp.
1034 cp = CPU;
1035 do {
1036 if (search_disp_queues(cp->cpu_disp, tp))
1037 return (1);
1038 } while ((cp = cp->cpu_next_onln) != CPU);
1041 * Search the partition-wide kpreempt queues for tp.
1043 part = CPU->cpu_part;
1044 do {
1045 if (search_disp_queues(&part->cp_kp_queue, tp))
1046 return (1);
1047 } while ((part = part->cp_next) != CPU->cpu_part);
1049 return (0);
1052 #else
1054 #define thread_on_queue(tp) 0 /* ASSERT must be !thread_on_queue */
1056 #endif /* DEBUG */
1059 * Like swtch(), but switch to a specified thread taken from another CPU.
1060 * Called with spl high.
1062 void
1063 swtch_to(kthread_t *next)
1065 cpu_t *cp = CPU;
1066 hrtime_t now;
1068 TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");
1071 * Update context switch statistics.
1073 CPU_STATS_ADDQ(cp, sys, pswitch, 1);
1075 TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
1077 now = gethrtime_unscaled();
1078 pg_ev_thread_swtch(cp, now, curthread, next);
1080 /* OK to steal anything left on run queue */
1081 cp->cpu_disp_flags &= ~CPU_DISP_DONTSTEAL;
1083 /* record last execution time */
1084 cp->cpu_last_swtch = curthread->t_disp_time = ddi_get_lbolt();
1087 * If t was previously in the TS_ONPROC state, setfrontdq and setbackdq
1088 * won't have set its t_waitrq. Since we now finally know that we're
1089 * switching away from this thread, set its t_waitrq if it is on a run
1090 * queue.
1092 if ((curthread->t_state == TS_RUN) && (curthread->t_waitrq == 0)) {
1093 curthread->t_waitrq = now;
1096 /* restore next thread to previously running microstate */
1097 restore_mstate(next);
1099 if (dtrace_vtime_active)
1100 dtrace_vtime_switch(next);
1102 resume(next);
1104 * The TR_RESUME_END and TR_SWTCH_END trace points
1105 * appear at the end of resume(), because we may not
1106 * return here
1110 #define CPU_IDLING(pri) ((pri) == -1)
1112 static void
1113 cpu_resched(cpu_t *cp, pri_t tpri)
1115 int call_poke_cpu = 0;
1116 pri_t cpupri = cp->cpu_dispatch_pri;
1118 if (!CPU_IDLING(cpupri) && (cpupri < tpri)) {
1119 TRACE_2(TR_FAC_DISP, TR_CPU_RESCHED,
1120 "CPU_RESCHED:Tpri %d Cpupri %d", tpri, cpupri);
1121 if (tpri >= upreemptpri && cp->cpu_runrun == 0) {
1122 cp->cpu_runrun = 1;
1123 aston(cp->cpu_dispthread);
1124 if (tpri < kpreemptpri && cp != CPU)
1125 call_poke_cpu = 1;
1127 if (tpri >= kpreemptpri && cp->cpu_kprunrun == 0) {
1128 cp->cpu_kprunrun = 1;
1129 if (cp != CPU)
1130 call_poke_cpu = 1;
1135 * Propagate cpu_runrun, and cpu_kprunrun to global visibility.
1137 membar_enter();
1139 if (call_poke_cpu)
1140 poke_cpu(cp->cpu_id);
1144 * setbackdq() keeps runqs balanced such that the difference in length
1145 * between the chosen runq and the next one is no more than RUNQ_MAX_DIFF.
1146 * For threads with priorities below RUNQ_MATCH_PRI levels, the runq lengths
1147 * must match. When the per-thread TS_RUNQMATCH flag is set, setbackdq() will
1148 * try to keep runqs perfectly balanced regardless of the thread priority.
1150 #define RUNQ_MATCH_PRI 16 /* pri below which queue lengths must match */
1151 #define RUNQ_MAX_DIFF 2 /* maximum runq length difference */
1152 #define RUNQ_LEN(cp, pri) ((cp)->cpu_disp->disp_q[pri].dq_sruncnt)
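/*
 * Worked example of the balancing rule applied in setfrontbackdq() below
 * (numbers illustrative): suppose the chosen CPU has 3 runnable threads
 * queued at the thread's priority and the candidate neighbour has 1.
 *
 *	- pri < RUNQ_MATCH_PRI (16): qlen stays 3; since 1 < 3, the thread
 *	  is moved to the neighbour (queue lengths must match closely).
 *	- pri >= RUNQ_MATCH_PRI, TS_RUNQMATCH clear: qlen becomes
 *	  3 - RUNQ_MAX_DIFF = 1; since 1 < 1 is false, the thread stays,
 *	  i.e. a length difference of up to RUNQ_MAX_DIFF is tolerated.
 */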
1155 * Macro that evaluates to true if it is likely that the thread has cache
1156 * warmth. This is based on the amount of time that has elapsed since the
1157 * thread last ran. If that amount of time is less than "rechoose_interval"
1158 * ticks, then we decide that the thread has enough cache warmth to warrant
1159 * some affinity for t->t_cpu.
1161 #define THREAD_HAS_CACHE_WARMTH(thread) \
1162 ((thread == curthread) || \
1163 ((ddi_get_lbolt() - thread->t_disp_time) <= rechoose_interval))
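/*
 * For example, with the default rechoose_interval of 3 ticks and a 100Hz
 * clock (assumed here; hz is configuration dependent), a thread is treated
 * as cache-warm for roughly 30ms after it last ran, or unconditionally if it
 * is curthread. Like kpqpri above, rechoose_interval can be tuned, e.g. via
 * /etc/system.
 */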
1166 * Put the specified thread on the front/back of the dispatcher queue
1167 * corresponding to its current priority.
1169 * Called with the thread in transition, onproc or stopped state and locked
1170 * (transition implies locked) and at high spl. Returns with the thread in
1171 * TS_RUN state and still locked.
1173 static void
1174 setfrontbackdq(kthread_t *tp, bool front)
1176 dispq_t *dq;
1177 disp_t *dp;
1178 cpu_t *cp;
1179 pri_t tpri;
1180 bool bound;
1181 boolean_t self;
1183 ASSERT(THREAD_LOCK_HELD(tp));
1184 ASSERT((tp->t_schedflag & TS_ALLSTART) == 0);
1185 ASSERT(!thread_on_queue(tp)); /* make sure tp isn't on a runq */
1187 self = (tp == curthread);
1188 bound = (tp->t_bound_cpu || tp->t_weakbound_cpu);
1190 tpri = DISP_PRIO(tp);
1191 if (ncpus == 1)
1192 cp = tp->t_cpu;
1193 else if (!bound) {
1194 if (tpri >= kpqpri) {
1195 setkpdq(tp, front ? SETKP_FRONT : SETKP_BACK);
1196 return;
1199 cp = tp->t_cpu;
1201 if (!front) {
1203 * We'll generally let this thread continue to run where
1204 * it last ran...but will consider migration if:
1205 * - The thread probably doesn't have much cache warmth.
1206 * - The CPU where it last ran is the target of an offline
1207 * request.
1208 * - The thread last ran outside its home lgroup.
1210 if ((!THREAD_HAS_CACHE_WARMTH(tp)) || (cp == cpu_inmotion)) {
1211 cp = disp_lowpri_cpu(cp, tp->t_lpl, tpri, NULL);
1212 } else if (!LGRP_CONTAINS_CPU(tp->t_lpl->lpl_lgrp, cp)) {
1213 cp = disp_lowpri_cpu(cp, tp->t_lpl, tpri,
1214 self ? cp : NULL);
1219 if (tp->t_cpupart == cp->cpu_part) {
1220 if (front) {
1222 * We'll generally let this thread continue to run
1223 * where it last ran, but will consider migration if:
1224 * - The thread last ran outside its home lgroup.
1225 * - The CPU where it last ran is the target of an
1226 * offline request (a thread_nomigrate() on the in
1227 * motion CPU relies on this when forcing a preempt).
1228 * - The thread isn't the highest priority thread where
1229 * it last ran, and it is considered not likely to
1230 * have significant cache warmth.
1232 if ((!LGRP_CONTAINS_CPU(tp->t_lpl->lpl_lgrp, cp)) ||
1233 (cp == cpu_inmotion)) {
1234 cp = disp_lowpri_cpu(cp, tp->t_lpl, tpri,
1235 self ? cp : NULL);
1236 } else if ((tpri < cp->cpu_disp->disp_maxrunpri) &&
1237 (!THREAD_HAS_CACHE_WARMTH(tp))) {
1238 cp = disp_lowpri_cpu(cp, tp->t_lpl, tpri,
1239 NULL);
1241 } else {
1242 int qlen;
1245 * Perform any CMT load balancing
1247 cp = cmt_balance(tp, cp);
1250 * Balance across the run queues
1252 qlen = RUNQ_LEN(cp, tpri);
1253 if (tpri >= RUNQ_MATCH_PRI &&
1254 !(tp->t_schedflag & TS_RUNQMATCH))
1255 qlen -= RUNQ_MAX_DIFF;
1256 if (qlen > 0) {
1257 cpu_t *newcp;
1259 if (tp->t_lpl->lpl_lgrpid == LGRP_ROOTID) {
1260 newcp = cp->cpu_next_part;
1261 } else if ((newcp = cp->cpu_next_lpl) == cp) {
1262 newcp = cp->cpu_next_part;
1265 if (RUNQ_LEN(newcp, tpri) < qlen) {
1266 DTRACE_PROBE3(runq__balance,
1267 kthread_t *, tp,
1268 cpu_t *, cp, cpu_t *, newcp);
1269 cp = newcp;
1273 } else {
1275 * Migrate to a cpu in the new partition.
1277 cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist,
1278 tp->t_lpl, tp->t_pri, NULL);
1281 ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
1282 } else {
1284 * It is possible that t_weakbound_cpu != t_bound_cpu (for
1285 * a short time until weak binding that existed when the
1286 * strong binding was established has dropped) so we must
1287 * favour weak binding over strong.
1289 cp = tp->t_weakbound_cpu ?
1290 tp->t_weakbound_cpu : tp->t_bound_cpu;
1294 * A thread that is ONPROC may be temporarily placed on the run queue
1295 * but then chosen to run again by disp. If the thread we're placing on
1296 * the queue is in TS_ONPROC state, don't set its t_waitrq until a
1297 * replacement process is actually scheduled in swtch(). In this
1298 * situation, curthread is the only thread that could be in the ONPROC
1299 * state.
1301 if ((!self) && (tp->t_waitrq == 0)) {
1302 hrtime_t curtime;
1304 curtime = gethrtime_unscaled();
1305 (void) cpu_update_pct(tp, curtime);
1306 tp->t_waitrq = curtime;
1307 } else {
1308 (void) cpu_update_pct(tp, gethrtime_unscaled());
1311 dp = cp->cpu_disp;
1312 disp_lock_enter_high(&dp->disp_lock);
1314 DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, front);
1316 #ifndef NPROBE
1317 /* Kernel probe */
1318 if (tnf_tracing_active)
1319 tnf_thread_queue(tp, cp, tpri);
1320 #endif /* NPROBE */
1322 ASSERT(tpri >= 0 && tpri < dp->disp_npri);
1324 THREAD_RUN(tp, &dp->disp_lock); /* set t_state to TS_RUN */
1325 tp->t_disp_queue = dp;
1326 tp->t_link = NULL;
1328 dq = &dp->disp_q[tpri];
1329 dp->disp_nrunnable++;
1330 if (!bound)
1331 dp->disp_steal = 0;
1332 membar_enter();
1334 if (dq->dq_sruncnt++ != 0) {
1335 if (front) {
1336 ASSERT(dq->dq_last != NULL);
1337 tp->t_link = dq->dq_first;
1338 dq->dq_first = tp;
1339 } else {
1340 ASSERT(dq->dq_first != NULL);
1341 dq->dq_last->t_link = tp;
1342 dq->dq_last = tp;
1344 } else {
1345 ASSERT(dq->dq_first == NULL);
1346 ASSERT(dq->dq_last == NULL);
1347 dq->dq_first = dq->dq_last = tp;
1348 BT_SET(dp->disp_qactmap, tpri);
1349 if (tpri > dp->disp_maxrunpri) {
1350 dp->disp_maxrunpri = tpri;
1351 membar_enter();
1352 cpu_resched(cp, tpri);
1356 if (!bound && tpri > dp->disp_max_unbound_pri) {
1357 if (self && dp->disp_max_unbound_pri == -1 && cp == CPU) {
1359 * If there are no other unbound threads on the
1360 * run queue, don't allow other CPUs to steal
1361 * this thread while we are in the middle of a
1362 * context switch. We may just switch to it
1363 * again right away. CPU_DISP_DONTSTEAL is cleared
1364 * in swtch and swtch_to.
1366 cp->cpu_disp_flags |= CPU_DISP_DONTSTEAL;
1368 dp->disp_max_unbound_pri = tpri;
1371 (*disp_enq_thread)(cp, bound);
1375 * Put the specified thread on the back of the dispatcher
1376 * queue corresponding to its current priority.
1378 * Called with the thread in transition, onproc or stopped state
1379 * and locked (transition implies locked) and at high spl.
1380 * Returns with the thread in TS_RUN state and still locked.
1382 void
1383 setbackdq(kthread_t *tp)
1385 setfrontbackdq(tp, false);
1389 * Put the specified thread on the front of the dispatcher
1390 * queue corresponding to its current priority.
1392 * Called with the thread in transition, onproc or stopped state
1393 * and locked (transition implies locked) and at high spl.
1394 * Returns with the thread in TS_RUN state and still locked.
1396 void
1397 setfrontdq(kthread_t *tp)
1399 setfrontbackdq(tp, true);
1403 * Put a high-priority unbound thread on the kp queue
1405 static void
1406 setkpdq(kthread_t *tp, int borf)
1408 dispq_t *dq;
1409 disp_t *dp;
1410 cpu_t *cp;
1411 pri_t tpri;
1413 tpri = DISP_PRIO(tp);
1415 dp = &tp->t_cpupart->cp_kp_queue;
1416 disp_lock_enter_high(&dp->disp_lock);
1418 TRACE_2(TR_FAC_DISP, TR_FRONTQ, "frontq:pri %d tid %p", tpri, tp);
1420 ASSERT(tpri >= 0 && tpri < dp->disp_npri);
1421 DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, borf);
1422 THREAD_RUN(tp, &dp->disp_lock); /* set t_state to TS_RUN */
1423 tp->t_disp_queue = dp;
1424 dp->disp_nrunnable++;
1425 dq = &dp->disp_q[tpri];
1427 if (dq->dq_sruncnt++ != 0) {
1428 if (borf == SETKP_BACK) {
1429 ASSERT(dq->dq_first != NULL);
1430 tp->t_link = NULL;
1431 dq->dq_last->t_link = tp;
1432 dq->dq_last = tp;
1433 } else {
1434 ASSERT(dq->dq_last != NULL);
1435 tp->t_link = dq->dq_first;
1436 dq->dq_first = tp;
1438 } else {
1439 if (borf == SETKP_BACK) {
1440 ASSERT(dq->dq_first == NULL);
1441 ASSERT(dq->dq_last == NULL);
1442 dq->dq_first = dq->dq_last = tp;
1443 } else {
1444 ASSERT(dq->dq_last == NULL);
1445 ASSERT(dq->dq_first == NULL);
1446 tp->t_link = NULL;
1447 dq->dq_first = dq->dq_last = tp;
1449 BT_SET(dp->disp_qactmap, tpri);
1450 if (tpri > dp->disp_max_unbound_pri)
1451 dp->disp_max_unbound_pri = tpri;
1452 if (tpri > dp->disp_maxrunpri) {
1453 dp->disp_maxrunpri = tpri;
1454 membar_enter();
1458 cp = tp->t_cpu;
1459 if (tp->t_cpupart != cp->cpu_part) {
1460 /* migrate to a cpu in the new partition */
1461 cp = tp->t_cpupart->cp_cpulist;
1463 cp = disp_lowpri_cpu(cp, tp->t_lpl, tp->t_pri, NULL);
1464 disp_lock_enter_high(&cp->cpu_disp->disp_lock);
1465 ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
1467 #ifndef NPROBE
1468 /* Kernel probe */
1469 if (tnf_tracing_active)
1470 tnf_thread_queue(tp, cp, tpri);
1471 #endif /* NPROBE */
1473 if (cp->cpu_chosen_level < tpri)
1474 cp->cpu_chosen_level = tpri;
1475 cpu_resched(cp, tpri);
1476 disp_lock_exit_high(&cp->cpu_disp->disp_lock);
1477 (*disp_enq_thread)(cp, 0);
1481 * Remove a thread from the dispatcher queue if it is on it.
1482 * It is not an error if it is not found but we return whether
1483 * or not it was found in case the caller wants to check.
1486 dispdeq(kthread_t *tp)
1488 disp_t *dp;
1489 dispq_t *dq;
1490 kthread_t *rp;
1491 kthread_t *trp;
1492 kthread_t **ptp;
1493 int tpri;
1495 ASSERT(THREAD_LOCK_HELD(tp));
1497 if (tp->t_state != TS_RUN)
1498 return (0);
1500 tpri = DISP_PRIO(tp);
1501 dp = tp->t_disp_queue;
1502 ASSERT(tpri < dp->disp_npri);
1503 dq = &dp->disp_q[tpri];
1504 ptp = &dq->dq_first;
1505 rp = *ptp;
1506 trp = NULL;
1508 ASSERT(dq->dq_last == NULL || dq->dq_last->t_link == NULL);
1511 * Search for thread in queue.
1512 * Double links would simplify this at the expense of disp/setrun.
1514 while (rp != tp && rp != NULL) {
1515 trp = rp;
1516 ptp = &trp->t_link;
1517 rp = trp->t_link;
1520 if (rp == NULL) {
1521 panic("dispdeq: thread not on queue");
1524 DTRACE_SCHED2(dequeue, kthread_t *, tp, disp_t *, dp);
1527 * Found it so remove it from queue.
1529 if ((*ptp = rp->t_link) == NULL)
1530 dq->dq_last = trp;
1532 dp->disp_nrunnable--;
1533 if (--dq->dq_sruncnt == 0) {
1534 dp->disp_qactmap[tpri >> BT_ULSHIFT] &= ~BT_BIW(tpri);
1535 if (dp->disp_nrunnable == 0) {
1536 dp->disp_max_unbound_pri = -1;
1537 dp->disp_maxrunpri = -1;
1538 } else if (tpri == dp->disp_maxrunpri) {
1539 int ipri;
1541 ipri = bt_gethighbit(dp->disp_qactmap,
1542 dp->disp_maxrunpri >> BT_ULSHIFT);
1543 if (ipri < dp->disp_max_unbound_pri)
1544 dp->disp_max_unbound_pri = ipri;
1545 dp->disp_maxrunpri = ipri;
1548 tp->t_link = NULL;
1549 THREAD_TRANSITION(tp); /* put in intermediate state */
1550 return (1);
1554 * Make a thread give up its processor. Find the processor on
1555 * which this thread is executing, and have that processor
1556 * preempt.
1558 * We allow System Duty Cycle (SDC) threads to be preempted even if
1559 * they are running at kernel priorities. To implement this, we always
1560 * set cpu_kprunrun; this ensures preempt() will be called. Since SDC
1561 * calls cpu_surrender() very often, we only preempt if there is anyone
1562 * competing with us.
1564 void
1565 cpu_surrender(kthread_t *tp)
1567 cpu_t *cpup;
1568 int max_pri;
1569 int max_run_pri;
1570 klwp_t *lwp;
1572 ASSERT(THREAD_LOCK_HELD(tp));
1574 if (tp->t_state != TS_ONPROC)
1575 return;
1576 cpup = tp->t_disp_queue->disp_cpu; /* CPU thread dispatched to */
1577 max_pri = cpup->cpu_disp->disp_maxrunpri; /* best pri of that CPU */
1578 max_run_pri = CP_MAXRUNPRI(cpup->cpu_part);
1579 if (max_pri < max_run_pri)
1580 max_pri = max_run_pri;
1582 if (tp->t_cid == sysdccid) {
1583 uint_t t_pri = DISP_PRIO(tp);
1584 if (t_pri > max_pri)
1585 return; /* we are not competing w/ anyone */
1586 cpup->cpu_runrun = cpup->cpu_kprunrun = 1;
1587 } else {
1588 cpup->cpu_runrun = 1;
1589 if (max_pri >= kpreemptpri && cpup->cpu_kprunrun == 0) {
1590 cpup->cpu_kprunrun = 1;
1595 * Propagate cpu_runrun, and cpu_kprunrun to global visibility.
1597 membar_enter();
1599 DTRACE_SCHED1(surrender, kthread_t *, tp);
1602 * Make the target thread take an excursion through trap()
1603 * to do preempt() (unless we're already in trap or post_syscall,
1604 * calling cpu_surrender via CL_TRAPRET).
1606 if (tp != curthread || (lwp = tp->t_lwp) == NULL ||
1607 lwp->lwp_state != LWP_USER) {
1608 aston(tp);
1609 if (cpup != CPU)
1610 poke_cpu(cpup->cpu_id);
1612 TRACE_2(TR_FAC_DISP, TR_CPU_SURRENDER,
1613 "cpu_surrender:tid %p cpu %p", tp, cpup);
1617 * Commit to and ratify a scheduling decision
1619 /*ARGSUSED*/
1620 static kthread_t *
1621 disp_ratify(kthread_t *tp, disp_t *kpq)
1623 pri_t tpri, maxpri;
1624 pri_t maxkpri;
1625 cpu_t *cpup;
1627 ASSERT(tp != NULL);
1629 * Commit to, then ratify scheduling decision
1631 cpup = CPU;
1632 if (cpup->cpu_runrun != 0)
1633 cpup->cpu_runrun = 0;
1634 if (cpup->cpu_kprunrun != 0)
1635 cpup->cpu_kprunrun = 0;
1636 if (cpup->cpu_chosen_level != -1)
1637 cpup->cpu_chosen_level = -1;
1638 membar_enter();
1639 tpri = DISP_PRIO(tp);
1640 maxpri = cpup->cpu_disp->disp_maxrunpri;
1641 maxkpri = kpq->disp_maxrunpri;
1642 if (maxpri < maxkpri)
1643 maxpri = maxkpri;
1644 if (tpri < maxpri) {
1646 * should have done better
1647 * put this one back and indicate to try again
1649 cpup->cpu_dispthread = curthread; /* fixup dispthread */
1650 cpup->cpu_dispatch_pri = DISP_PRIO(curthread);
1651 thread_lock_high(tp);
1652 THREAD_TRANSITION(tp);
1653 setfrontdq(tp);
1654 thread_unlock_nopreempt(tp);
1656 tp = NULL;
1658 return (tp);
1662 * See if there is any work on the dispatcher queue for other CPUs.
1663 * If there is, dequeue the best thread and return.
1665 static kthread_t *
1666 disp_getwork(cpu_t *cp)
1668 cpu_t *ocp; /* other CPU */
1669 cpu_t *ocp_start;
1670 cpu_t *tcp; /* target local CPU */
1671 kthread_t *tp;
1672 kthread_t *retval = NULL;
1673 pri_t maxpri;
1674 disp_t *kpq; /* kp queue for this partition */
1675 lpl_t *lpl, *lpl_leaf;
1676 int leafidx, startidx;
1677 hrtime_t stealtime;
1678 lgrp_id_t local_id;
1680 maxpri = -1;
1681 tcp = NULL;
1683 kpq = &cp->cpu_part->cp_kp_queue;
1684 while (kpq->disp_maxrunpri >= 0) {
1686 * Try to take a thread from the kp_queue.
1688 tp = (disp_getbest(kpq));
1689 if (tp)
1690 return (disp_ratify(tp, kpq));
1693 kpreempt_disable(); /* protect the cpu_active list */
1696 * Try to find something to do on another CPU's run queue.
1697 * Loop through all other CPUs looking for the one with the highest
1698 * priority unbound thread.
1700 * On NUMA machines, the partition's CPUs are consulted in order of
1701 * distance from the current CPU. This way, the first available
1702 * work found is also the closest, and will suffer the least
1703 * from being migrated.
1705 lpl = lpl_leaf = cp->cpu_lpl;
1706 local_id = lpl_leaf->lpl_lgrpid;
1707 leafidx = startidx = 0;
1710 * This loop traverses the lpl hierarchy. Higher level lpls represent
1711 * broader levels of locality
1713 do {
1714 /* This loop iterates over the lpl's leaves */
1715 do {
1716 if (lpl_leaf != cp->cpu_lpl)
1717 ocp = lpl_leaf->lpl_cpus;
1718 else
1719 ocp = cp->cpu_next_lpl;
1721 /* This loop iterates over the CPUs in the leaf */
1722 ocp_start = ocp;
1723 do {
1724 pri_t pri;
1726 ASSERT(CPU_ACTIVE(ocp));
1729 * End our stroll around this lpl if:
1731 * - Something became runnable on the local
1732 * queue...which also ends our stroll around
1733 * the partition.
1735 * - We happen across another idle CPU.
1736 * Since it is patrolling the next portion
1737 * of the lpl's list (assuming it's not
1738 * halted, or busy servicing an interrupt),
1739 * move to the next higher level of locality.
1741 if (cp->cpu_disp->disp_nrunnable != 0) {
1742 kpreempt_enable();
1743 return (NULL);
1745 if (ocp->cpu_dispatch_pri == -1) {
1746 if (ocp->cpu_disp_flags &
1747 CPU_DISP_HALTED ||
1748 ocp->cpu_intr_actv != 0)
1749 continue;
1750 else
1751 goto next_level;
1755 * If there's only one thread and the CPU
1756 * is in the middle of a context switch,
1757 * or it's currently running the idle thread,
1758 * don't steal it.
1760 if ((ocp->cpu_disp_flags &
1761 CPU_DISP_DONTSTEAL) &&
1762 ocp->cpu_disp->disp_nrunnable == 1)
1763 continue;
1765 pri = ocp->cpu_disp->disp_max_unbound_pri;
1766 if (pri > maxpri) {
1768 * Don't steal threads that we attempted
1769 * to steal recently until they're ready
1770 * to be stolen again.
1772 stealtime = ocp->cpu_disp->disp_steal;
1773 if (stealtime == 0 ||
1774 stealtime - gethrtime() <= 0) {
1775 maxpri = pri;
1776 tcp = ocp;
1777 } else {
1779 * Don't update tcp, just set
1780 * the retval to T_DONTSTEAL, so
1781 * that if no acceptable CPUs
1782 * are found the return value
1783 * will be T_DONTSTEAL rather
1784 * than NULL.
1786 retval = T_DONTSTEAL;
1789 } while ((ocp = ocp->cpu_next_lpl) != ocp_start);
1792 * Iterate to the next leaf lpl in the resource set
1793 * at this level of locality. If we hit the end of
1794 * the set, wrap back around to the beginning.
1796 * Note: This iteration is NULL terminated for a reason;
1797 * see lpl_topo_bootstrap() in lgrp.c for details.
1799 if ((lpl_leaf = lpl->lpl_rset[++leafidx]) == NULL) {
1800 leafidx = 0;
1801 lpl_leaf = lpl->lpl_rset[leafidx];
1803 } while (leafidx != startidx);
1805 next_level:
1807 * Expand the search to include farther away CPUs (next
1808 * locality level). The closer CPUs that have already been
1809 * checked will be checked again. In doing so, idle CPUs
1810 * will tend to be more aggressive about stealing from CPUs
1811 * that are closer (since the closer CPUs will be considered
1812 * more often).
1813 * Begin at this level with the CPU's local leaf lpl.
1815 if ((lpl = lpl->lpl_parent) != NULL) {
1816 leafidx = startidx = lpl->lpl_id2rset[local_id];
1817 lpl_leaf = lpl->lpl_rset[leafidx];
1819 } while (!tcp && lpl);
1821 kpreempt_enable();
1824 * If another queue looks good, and there is still nothing on
1825 * the local queue, try to transfer one or more threads
1826 * from it to our queue.
1828 if (tcp && cp->cpu_disp->disp_nrunnable == 0) {
1829 tp = disp_getbest(tcp->cpu_disp);
1830 if (tp == NULL || tp == T_DONTSTEAL)
1831 return (tp);
1832 return (disp_ratify(tp, kpq));
1834 return (retval);
1839 * disp_fix_unbound_pri()
1840 * Determines the maximum priority of unbound threads on the queue.
1841 * The priority is kept for the queue, but is only increased, never
1842 * reduced unless some CPU is looking for something on that queue.
1844 * The priority argument is the known upper limit.
1846 * Perhaps this should be kept accurately, but that probably means
1847 * separate bitmaps for bound and unbound threads. Since only idled
1848 * CPUs will have to do this recalculation, it seems better this way.
1850 static void
1851 disp_fix_unbound_pri(disp_t *dp, pri_t pri)
1853 kthread_t *tp;
1854 dispq_t *dq;
1855 ulong_t *dqactmap = dp->disp_qactmap;
1856 ulong_t mapword;
1857 int wx;
1859 ASSERT(DISP_LOCK_HELD(&dp->disp_lock));
1861 ASSERT(pri >= 0); /* checked by caller */
1864 * Start the search at the next lowest priority below the supplied
1865 * priority. This depends on the bitmap implementation.
1867 do {
1868 wx = pri >> BT_ULSHIFT; /* index of word in map */
1871 * Form mask for all lower priorities in the word.
1873 mapword = dqactmap[wx] & (BT_BIW(pri) - 1);
1876 * Get next lower active priority.
1878 if (mapword != 0) {
1879 pri = (wx << BT_ULSHIFT) + highbit(mapword) - 1;
1880 } else if (wx > 0) {
1881 pri = bt_gethighbit(dqactmap, wx - 1); /* sign extend */
1882 if (pri < 0)
1883 break;
1884 } else {
1885 pri = -1;
1886 break;
1890 * Search the queue for unbound, runnable threads.
1892 dq = &dp->disp_q[pri];
1893 tp = dq->dq_first;
1895 while (tp && (tp->t_bound_cpu || tp->t_weakbound_cpu)) {
1896 tp = tp->t_link;
1900 * If a thread was found, set the priority and return.
1902 } while (tp == NULL);
1905 * pri holds the maximum unbound thread priority or -1.
1907 if (dp->disp_max_unbound_pri != pri)
1908 dp->disp_max_unbound_pri = pri;
1912 * disp_adjust_unbound_pri() - thread is becoming unbound, so we should
1913 * check if the CPU to which it was previously bound should have
1914 * its disp_max_unbound_pri increased.
1916 void
1917 disp_adjust_unbound_pri(kthread_t *tp)
1919 disp_t *dp;
1920 pri_t tpri;
1922 ASSERT(THREAD_LOCK_HELD(tp));
1925 * Don't do anything if the thread is not bound, or
1926 * currently not runnable.
1928 if (tp->t_bound_cpu == NULL ||
1929 tp->t_state != TS_RUN)
1930 return;
1932 tpri = DISP_PRIO(tp);
1933 dp = tp->t_bound_cpu->cpu_disp;
1934 ASSERT(tpri >= 0 && tpri < dp->disp_npri);
1935 if (tpri > dp->disp_max_unbound_pri)
1936 dp->disp_max_unbound_pri = tpri;
1940 * disp_getbest()
1941 * De-queue the highest priority unbound runnable thread.
1942 * Returns with the thread unlocked and onproc but at splhigh (like disp()).
1943 * Returns NULL if nothing found.
1944 * Returns T_DONTSTEAL if the thread was not stealable,
1945 * so that the caller will try again later.
1947 * Passed a pointer to a dispatch queue not associated with this CPU.
1950 static kthread_t *
1951 disp_getbest(disp_t *dp)
1953 kthread_t *tp;
1954 dispq_t *dq;
1955 pri_t pri;
1956 cpu_t *cp, *tcp;
1957 boolean_t allbound;
1959 disp_lock_enter(&dp->disp_lock);
1962 * If there is nothing to run, or the CPU is in the middle of a
1963 * context switch of the only thread, return NULL.
1965 tcp = dp->disp_cpu;
1966 cp = CPU;
1967 pri = dp->disp_max_unbound_pri;
1968 if (pri == -1 ||
1969 (tcp != NULL && (tcp->cpu_disp_flags & CPU_DISP_DONTSTEAL) &&
1970 tcp->cpu_disp->disp_nrunnable == 1)) {
1971 disp_lock_exit_nopreempt(&dp->disp_lock);
1972 return (NULL);
1975 dq = &dp->disp_q[pri];
1979 * Assume that all threads are bound on this queue, and change it
1980 * later when we find out that it is not the case.
1982 allbound = B_TRUE;
1983 for (tp = dq->dq_first; tp != NULL; tp = tp->t_link) {
1984 hrtime_t now, nosteal, rqtime;
1987 * Skip over bound threads which could be here even
1988 * though disp_max_unbound_pri indicated this level.
1990 if (tp->t_bound_cpu || tp->t_weakbound_cpu)
1991 continue;
1994 * We've got some unbound threads on this queue, so turn
1995 * the allbound flag off now.
1997 allbound = B_FALSE;
2000 * The thread is a candidate for stealing from its run queue. We
2001 * don't want to steal threads that became runnable just a
2002 * moment ago. This improves CPU affinity for threads that get
2003 * preempted for short periods of time and go back on the run
2004 * queue.
2006 * We want to let it stay on its run queue if it was only placed
2007 * there recently and it was running on the same CPU before that
2008 * to preserve its cache investment. For the thread to remain on
2009 * its run queue, ALL of the following conditions must be
2010 * satisfied:
2012 * - the disp queue should not be the kernel preemption queue
2013 * - delayed idle stealing should not be disabled
2014 * - nosteal_nsec should be non-zero
2015 * - it should run with user priority
2016 * - it should be on the run queue of the CPU where it was
2017 * running before being placed on the run queue
2018 * - it should be the only thread on the run queue (to prevent
2019 * extra scheduling latency for other threads)
2020 * - it should sit on the run queue for less than per-chip
2021 * nosteal interval or global nosteal interval
2022 * - in case of CPUs with shared cache it should sit in a run
2023 * queue of a CPU from a different chip
2025 * The checks are arranged so that the ones that are faster are
2026 * placed earlier.
2028 if (tcp == NULL ||
2029 pri >= minclsyspri ||
2030 tp->t_cpu != tcp)
2031 break;
2034 * Steal immediately if, due to the CMT processor architecture,
2035 * migration between cp and tcp would incur no performance
2036 * penalty.
2038 if (pg_cmt_can_migrate(cp, tcp))
2039 break;
2041 nosteal = nosteal_nsec;
2042 if (nosteal == 0)
2043 break;
2046 * Calculate time spent sitting on run queue
2048 now = gethrtime_unscaled();
2049 rqtime = now - tp->t_waitrq;
2050 scalehrtime(&rqtime);
2053 * Steal immediately if the time spent on this run queue is more
2054 * than allowed nosteal delay.
2056 * Negative rqtime check is needed here to avoid infinite
2057 * stealing delays caused by unlikely but not impossible
2058 * drifts between CPU times on different CPUs.
2060 if (rqtime > nosteal || rqtime < 0)
2061 break;
2063 DTRACE_PROBE4(nosteal, kthread_t *, tp,
2064 cpu_t *, tcp, cpu_t *, cp, hrtime_t, rqtime);
2065 scalehrtime(&now);
2067 * Calculate when this thread becomes stealable
2069 now += (nosteal - rqtime);
2072 * Calculate time when some thread becomes stealable
2074 if (now < dp->disp_steal)
2075 dp->disp_steal = now;
2079 * If there were no unbound threads on this queue, find the queue
2080 * where they are and then return later. The value of
2081 * disp_max_unbound_pri is not always accurate because it isn't
2082 * reduced until another idle CPU looks for work.
2084 if (allbound)
2085 disp_fix_unbound_pri(dp, pri);
2088 * If we reached the end of the queue and found no unbound threads
2089 * then return NULL so that other CPUs will be considered. If there
2090 * are unbound threads but they cannot yet be stolen, then
2091 * return T_DONTSTEAL and try again later.
2093 if (tp == NULL) {
2094 disp_lock_exit_nopreempt(&dp->disp_lock);
2095 return (allbound ? NULL : T_DONTSTEAL);
2099 * Found a runnable, unbound thread, so remove it from queue.
2100 * dispdeq() requires that we have the thread locked, and we do,
2101 * by virtue of holding the dispatch queue lock. dispdeq() will
2102 * put the thread in transition state, thereby dropping the dispq
2103 * lock.
2106 #ifdef DEBUG
2108 int thread_was_on_queue;
2110 thread_was_on_queue = dispdeq(tp); /* drops disp_lock */
2111 ASSERT(thread_was_on_queue);
2114 #else /* DEBUG */
2115 (void) dispdeq(tp); /* drops disp_lock */
2116 #endif /* DEBUG */
2119 * Reset the disp_queue steal time - we do not know what the smallest
2120 * value across the queue is.
2122 dp->disp_steal = 0;
2125 * Set up the thread to run on the current CPU.
2127 tp->t_disp_queue = cp->cpu_disp;
2129 cp->cpu_dispthread = tp; /* protected by spl only */
2130 cp->cpu_dispatch_pri = pri;
2133 * There can be a memory synchronization race between disp_getbest()
2134 * and disp_ratify() vs cpu_resched() where cpu_resched() is trying
2135 * to preempt the current thread to run the enqueued thread while
2136 * disp_getbest() and disp_ratify() are changing the current thread
2137 * to the stolen thread. This may lead to a situation where
2138 * cpu_resched() tries to preempt the wrong thread and the
2139 * stolen thread continues to run on the CPU which has been tagged
2140 * for preemption.
2141 * Later the clock thread gets enqueued but doesn't get to run on the
2142 * CPU causing the system to hang.
2144 * To avoid this, grabbing and dropping the disp_lock (which does
2145 * a memory barrier) is needed to synchronize the execution of
2146 * cpu_resched() with disp_getbest() and disp_ratify() and
2147 * synchronize the memory read and written by cpu_resched(),
2148 * disp_getbest(), and disp_ratify() with each other.
2149 * (see CR#6482861 for more details).
2151 disp_lock_enter_high(&cp->cpu_disp->disp_lock);
2152 disp_lock_exit_high(&cp->cpu_disp->disp_lock);
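/*
 * The window being closed here looks roughly like this (a simplified
 * sketch of the race described above):
 *
 *	this CPU (disp_getbest/disp_ratify)	enqueueing CPU (cpu_resched())
 *	-----------------------------------	------------------------------
 *						enqueues clock thread for cp
 *	sets cpu_dispthread/cpu_dispatch_pri	reads the stale values
 *						flags the old thread, not tp
 *	switches to stolen thread tp		(preemption request is lost)
 *
 * The paired lock enter/exit above provides the memory barrier that
 * orders our updates against cpu_resched()'s reads.
 */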
2154 ASSERT(pri == DISP_PRIO(tp));
2156 DTRACE_PROBE3(steal, kthread_t *, tp, cpu_t *, tcp, cpu_t *, cp);
2158 thread_onproc(tp, cp); /* set t_state to TS_ONPROC */
2161 * Return with spl high so that swtch() won't need to raise it.
2162 * The disp_lock was dropped by dispdeq().
2165 return (tp);
2169 * disp_bound_common() - common routine for higher level functions
2170 * that check for bound threads under certain conditions.
2171 * If 'threadlistsafe' is set then there is no need to acquire
2172 * pidlock to stop the thread list from changing (e.g., if
2173 * disp_bound_* is called with cpus paused).
2175 static int
2176 disp_bound_common(cpu_t *cp, int threadlistsafe, int flag)
2178 int found = 0;
2179 kthread_t *tp;
2181 ASSERT(flag);
2183 if (!threadlistsafe)
2184 mutex_enter(&pidlock);
2185 tp = curthread; /* faster than allthreads */
2186 do {
2187 if (tp->t_state != TS_FREE) {
2189 * If an interrupt thread is busy, but the
2190 * caller doesn't care (i.e. BOUND_INTR is off),
2191 * then just ignore it and continue through.
2193 if ((tp->t_flag & T_INTR_THREAD) &&
2194 !(flag & BOUND_INTR))
2195 continue;
2198 * Skip the idle thread for the CPU
2199 * we're about to set offline.
2201 if (tp == cp->cpu_idle_thread)
2202 continue;
2205 * Skip the pause thread for the CPU
2206 * we're about to set offline.
2208 if (tp == cp->cpu_pause_thread)
2209 continue;
2211 if ((flag & BOUND_CPU) &&
2212 (tp->t_bound_cpu == cp ||
2213 tp->t_bind_cpu == cp->cpu_id ||
2214 tp->t_weakbound_cpu == cp)) {
2215 found = 1;
2216 break;
2219 if ((flag & BOUND_PARTITION) &&
2220 (tp->t_cpupart == cp->cpu_part)) {
2221 found = 1;
2222 break;
2225 } while ((tp = tp->t_next) != curthread && found == 0);
2226 if (!threadlistsafe)
2227 mutex_exit(&pidlock);
2228 return (found);
2232 * disp_bound_threads - return nonzero if threads are bound to the processor.
2233 * Called infrequently. Keep this simple.
2234 * Includes threads that are asleep or stopped but not onproc.
2237 disp_bound_threads(cpu_t *cp, int threadlistsafe)
2239 return (disp_bound_common(cp, threadlistsafe, BOUND_CPU));
2243 * disp_bound_anythreads - return nonzero if _any_ threads are bound
2244 * to the given processor, including interrupt threads.
2247 disp_bound_anythreads(cpu_t *cp, int threadlistsafe)
2249 return (disp_bound_common(cp, threadlistsafe, BOUND_CPU | BOUND_INTR));
2253 * disp_bound_partition - return nonzero if threads are bound to the same
2254 * partition as the processor.
2255 * Called infrequently. Keep this simple.
2256 * Includes threads that are asleep or stopped but not onproc.
2259 disp_bound_partition(cpu_t *cp, int threadlistsafe)
2261 return (disp_bound_common(cp, threadlistsafe, BOUND_PARTITION));
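/*
 * These predicates exist for callers such as the CPU offline and
 * partition reconfiguration paths, which must not take a CPU away while
 * threads are bound to it. A typical check looks roughly like the
 * following sketch (the real caller logic lives in the CPU management
 * code, not here):
 *
 *	if (disp_bound_anythreads(cp, 0))
 *		return (EBUSY);		(bound threads, refuse the request)
 */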
2265 * disp_cpu_inactive - make a CPU inactive by moving all of its unbound
2266 * threads to other CPUs.
2268 void
2269 disp_cpu_inactive(cpu_t *cp)
2271 kthread_t *tp;
2272 disp_t *dp = cp->cpu_disp;
2273 dispq_t *dq;
2274 pri_t pri;
2275 int wasonq;
2277 disp_lock_enter(&dp->disp_lock);
2278 while ((pri = dp->disp_max_unbound_pri) != -1) {
2279 dq = &dp->disp_q[pri];
2280 tp = dq->dq_first;
2283 * Skip over bound threads.
2285 while (tp != NULL && tp->t_bound_cpu != NULL) {
2286 tp = tp->t_link;
2289 if (tp == NULL) {
2290 /* disp_max_unbound_pri must be inaccurate, so fix it */
2291 disp_fix_unbound_pri(dp, pri);
2292 continue;
2295 wasonq = dispdeq(tp); /* drops disp_lock */
2296 ASSERT(wasonq);
2297 ASSERT(tp->t_weakbound_cpu == NULL);
2299 setbackdq(tp);
2301 * Called from cpu_offline:
2303 * cp has already been removed from the list of active cpus
2304 * and tp->t_cpu has been changed so there is no risk of
2305 * tp ending up back on cp.
2307 * Called from cpupart_move_cpu:
2309 * The cpu has moved to a new cpupart. Any threads that
2310 * were on its dispatch queues before the move remain
2311 * in the old partition and can't run in the new partition.
2313 ASSERT(tp->t_cpu != cp);
2314 thread_unlock(tp);
2316 disp_lock_enter(&dp->disp_lock);
2318 disp_lock_exit(&dp->disp_lock);
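/*
 * Each pass through the loop above either requeues one unbound thread via
 * setbackdq() (which, for the reasons given above, cannot place it back
 * on cp) or corrects a stale disp_max_unbound_pri, so the loop terminates
 * once only bound threads, if any, remain on cp's dispatch queues.
 */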
2322 * disp_lowpri_cpu - find CPU running the lowest priority thread.
2323 * The hint passed in is used as a starting point so we don't favor
2324 * CPU 0 or any other CPU. The caller should pass in the most recently
2325 * used CPU for the thread.
2327 * The lgroup and priority are used to determine the best CPU to run on
2328 * in a NUMA machine. The lgroup specifies which CPUs are closest while
2329 * the thread priority will indicate whether the thread will actually run
2330 * there. To pick the best CPU, the CPUs inside and outside of the given
2331 * lgroup which are running the lowest priority threads are found. The
2332 * remote CPU is chosen only if the thread will not run locally on a CPU
2333 * within the lgroup, but will run on the remote CPU. If the thread
2334 * cannot immediately run on any CPU, the best local CPU will be chosen.
2336 * The lpl specified also identifies the cpu partition from which
2337 * disp_lowpri_cpu should select a CPU.
2339 * curcpu is used to indicate that disp_lowpri_cpu is being called on
2340 * behalf of the current thread. (curthread is looking for a new cpu)
2341 * In this case, cpu_dispatch_pri for this thread's cpu should be
2342 * ignored.
2344 * If a cpu is the target of an offline request then try to avoid it.
2346 * This function must be called at either high SPL, or with preemption
2347 * disabled, so that the "hint" CPU cannot be removed from the online
2348 * CPU list while we are traversing it.
2350 cpu_t *
2351 disp_lowpri_cpu(cpu_t *hint, lpl_t *lpl, pri_t tpri, cpu_t *curcpu)
2353 cpu_t *bestcpu;
2354 cpu_t *besthomecpu;
2355 cpu_t *cp, *cpstart;
2357 pri_t bestpri;
2358 pri_t cpupri;
2360 klgrpset_t done;
2361 klgrpset_t cur_set;
2363 lpl_t *lpl_iter, *lpl_leaf;
2364 int i;
2367 * Scan for a CPU currently running the lowest priority thread.
2368 * We cannot take cpu_lock here because it is an adaptive mutex.
2369 * We do not require a lock on the CPU list.
2371 ASSERT(hint != NULL);
2372 ASSERT(lpl != NULL);
2373 ASSERT(lpl->lpl_ncpu > 0);
2376 * First examine local CPUs. Note that it's possible the hint CPU
2377 * passed in is remote to the specified home lgroup. If our priority
2378 * isn't high enough for us to run immediately at home, then
2379 * examine CPUs remote to our home lgroup.
2380 * We would like to give preference to CPUs closest to "home".
2381 * If we can't find a CPU where we'll run at a given level
2382 * of locality, we expand our search to include the next level.
2384 bestcpu = besthomecpu = NULL;
2385 klgrpset_clear(done);
2386 /* start with lpl we were passed */
2388 lpl_iter = lpl;
2390 do {
2392 bestpri = SHRT_MAX;
2393 klgrpset_clear(cur_set);
2395 for (i = 0; i < lpl_iter->lpl_nrset; i++) {
2396 lpl_leaf = lpl_iter->lpl_rset[i];
2397 if (klgrpset_ismember(done, lpl_leaf->lpl_lgrpid))
2398 continue;
2400 klgrpset_add(cur_set, lpl_leaf->lpl_lgrpid);
2402 if (hint->cpu_lpl == lpl_leaf)
2403 cp = cpstart = hint;
2404 else
2405 cp = cpstart = lpl_leaf->lpl_cpus;
2407 do {
2408 if (cp == curcpu)
2409 cpupri = -1;
2410 else if (cp == cpu_inmotion)
2411 cpupri = SHRT_MAX;
2412 else
2413 cpupri = cp->cpu_dispatch_pri;
2414 if (cp->cpu_disp->disp_maxrunpri > cpupri)
2415 cpupri = cp->cpu_disp->disp_maxrunpri;
2416 if (cp->cpu_chosen_level > cpupri)
2417 cpupri = cp->cpu_chosen_level;
2418 if (cpupri < bestpri) {
2419 if (CPU_IDLING(cpupri)) {
2420 ASSERT((cp->cpu_flags &
2421 CPU_QUIESCED) == 0);
2422 return (cp);
2424 bestcpu = cp;
2425 bestpri = cpupri;
2427 } while ((cp = cp->cpu_next_lpl) != cpstart);
2430 if (bestcpu && (tpri > bestpri)) {
2431 ASSERT((bestcpu->cpu_flags & CPU_QUIESCED) == 0);
2432 return (bestcpu);
2434 if (besthomecpu == NULL)
2435 besthomecpu = bestcpu;
2437 * Add the lgrps we just considered to the "done" set
2439 klgrpset_or(done, cur_set);
2441 } while ((lpl_iter = lpl_iter->lpl_parent) != NULL);
2444 * The specified priority isn't high enough to run immediately
2445 * anywhere, so just return the best CPU from the home lgroup.
2447 ASSERT((besthomecpu->cpu_flags & CPU_QUIESCED) == 0);
2448 return (besthomecpu);
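/*
 * A typical call, e.g. when choosing a CPU for a thread that has just
 * become runnable, looks roughly like the following sketch (the actual
 * callers are the thread-placement paths such as setbackdq()):
 *
 *	cp = disp_lowpri_cpu(tp->t_cpu, tp->t_lpl, DISP_PRIO(tp),
 *	    (tp == curthread) ? CPU : NULL);
 *
 * i.e. start the scan at the thread's last CPU, restrict it to the
 * thread's home lpl (and hence its partition), and pass a curcpu only
 * when the searching thread is the one currently running.
 */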
2452 * This routine provides the generic idle cpu function for all processors.
2453 * If a processor has some specific code to execute when idle (say, to stop
2454 * the pipeline and save power) then that routine should be defined in the
2455 * processor-specific code (module_xx.c) and the global variable idle_cpu
2456 * set to that function.
2458 static void
2459 generic_idle_cpu(void)
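/*
 * A platform that has a cheaper idle state can install its own routine by
 * assigning to the idle_cpu hook from its processor-specific module,
 * roughly as sketched below (myplat_idle_cpu and myplat_init are
 * hypothetical names used only for illustration):
 *
 *	static void
 *	myplat_idle_cpu(void)
 *	{
 *		(halt the pipeline / enter a low-power state here)
 *	}
 *
 *	void
 *	myplat_init(void)
 *	{
 *		idle_cpu = myplat_idle_cpu;
 *	}
 */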
2463 /*ARGSUSED*/
2464 static void
2465 generic_enq_thread(cpu_t *cpu, int bound)