/*	$NetBSD: kern_runq.c,v 1.27 2009/10/21 21:12:06 rmind Exp $	*/

/*
 * Copyright (c) 2007, 2008 Mindaugas Rasiukevicius <rmind at NetBSD org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_runq.c,v 1.27 2009/10/21 21:12:06 rmind Exp $");

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/bitops.h>
#include <sys/cpu.h>
#include <sys/idle.h>
#include <sys/intr.h>
#include <sys/kmem.h>
#include <sys/lwp.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/sched.h>
#include <sys/syscallargs.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/types.h>
#include <sys/evcnt.h>
/*
 * Priority related definitions.
 */
#define	PRI_TS_COUNT	(NPRI_USER)
#define	PRI_RT_COUNT	(PRI_COUNT - PRI_TS_COUNT)
#define	PRI_HTS_RANGE	(PRI_TS_COUNT / 10)

#define	PRI_HIGHEST_TS	(MAXPRI_USER)
/*
 * Bits per map.
 */
#define	BITMAP_BITS	(32)
#define	BITMAP_SHIFT	(5)
#define	BITMAP_MSB	(0x80000000U)
#define	BITMAP_MASK	(BITMAP_BITS - 1)
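
/*
 * Editor's note (illustrative example, not part of the original source):
 * each priority owns one bit in r_bitmap[], most-significant bit first
 * within a 32-bit word.  For instance, for a priority value of 42:
 *
 *	word = 42 >> BITMAP_SHIFT;			-> 1
 *	bit  = BITMAP_MSB >> (42 & BITMAP_MASK);	-> 0x00200000
 *
 * and the reverse mapping used when scanning for the highest priority
 * (assuming that word contains only this single bit):
 *
 *	q    = ffs(r_bitmap[word]);			-> 22
 *	prio = (word << BITMAP_SHIFT) + (BITMAP_BITS - q);  -> 42
 */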
/*
 * Structures, runqueue.
 */

const int	schedppq = 1;
typedef struct {
	TAILQ_HEAD(, lwp) q_head;
} queue_t;

typedef struct {
	/* Bitmap */
	uint32_t	r_bitmap[PRI_COUNT >> BITMAP_SHIFT];
	/* Counters */
	u_int		r_count;	/* Count of the threads */
	u_int		r_avgcount;	/* Average count of threads */
	u_int		r_mcount;	/* Count of migratable threads */
	/* Runqueues */
	queue_t		r_rt_queue[PRI_RT_COUNT];
	queue_t		r_ts_queue[PRI_TS_COUNT];
	/* Event counters */
	struct evcnt	r_ev_pull;
	struct evcnt	r_ev_push;
	struct evcnt	r_ev_stay;
	struct evcnt	r_ev_localize;
} runqueue_t;
static void *	sched_getrq(runqueue_t *, const pri_t);
#ifdef MULTIPROCESSOR
static lwp_t *	sched_catchlwp(struct cpu_info *);
static void	sched_balance(void *);
#endif
/*
 * Preemption control.
 */
int		sched_upreempt_pri = PRI_KERNEL;
#ifdef __HAVE_PREEMPTION
# ifdef DEBUG
int		sched_kpreempt_pri = 0;
# else
int		sched_kpreempt_pri = PRI_USER_RT;
# endif
#else
int		sched_kpreempt_pri = 1000;
#endif
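
/*
 * Editor's note (not in the original source): an LWP enqueued with an
 * effective priority at or above sched_kpreempt_pri requests a kernel
 * preemption, at or above sched_upreempt_pri an immediate user preemption,
 * and otherwise only a lazy reschedule (see sched_enqueue() below).  When
 * __HAVE_PREEMPTION is not defined, the value 1000 appears to sit above
 * any valid priority (priorities are below PRI_COUNT), so kernel
 * preemption is effectively never requested.
 */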
/*
 * Migration and balancing.
 */
static u_int	cacheht_time;		/* Cache hotness time */
static u_int	min_catch;		/* Minimal LWP count for catching */
static u_int	balance_period;		/* Balance period */
static struct cpu_info *worker_ci;	/* Victim CPU */
#ifdef MULTIPROCESSOR
static struct callout balance_ch;	/* Callout of balancer */
#endif
void
runq_init(void)
{

	/* Balancing */
	worker_ci = curcpu();
	cacheht_time = mstohz(3);		/* ~3 ms */
	balance_period = mstohz(300);		/* ~300 ms */

	/* Minimal count of LWPs for catching */
	min_catch = 1;

	/* Initialize balancing callout and run it */
#ifdef MULTIPROCESSOR
	callout_init(&balance_ch, CALLOUT_MPSAFE);
	callout_setfunc(&balance_ch, sched_balance, NULL);
	callout_schedule(&balance_ch, balance_period);
#endif
}
void
sched_cpuattach(struct cpu_info *ci)
{
	runqueue_t *ci_rq;
	void *rq_ptr;
	u_int i, size;
	char *cpuname;

	if (ci->ci_schedstate.spc_lwplock == NULL) {
		ci->ci_schedstate.spc_lwplock =
		    mutex_obj_alloc(MUTEX_DEFAULT, IPL_SCHED);
	}
	if (ci == lwp0.l_cpu) {
		/* Initialize the scheduler structure of the primary LWP */
		lwp0.l_mutex = ci->ci_schedstate.spc_lwplock;
	}
	if (ci->ci_schedstate.spc_mutex != NULL) {
		/* Already initialized. */
		return;
	}

	/* Allocate the run queue */
	size = roundup2(sizeof(runqueue_t), coherency_unit) + coherency_unit;
	rq_ptr = kmem_zalloc(size, KM_SLEEP);
	if (rq_ptr == NULL) {
		panic("sched_cpuattach: could not allocate the runqueue");
	}
	ci_rq = (void *)(roundup2((uintptr_t)(rq_ptr), coherency_unit));

	/* Initialize run queues */
	ci->ci_schedstate.spc_mutex =
	    mutex_obj_alloc(MUTEX_DEFAULT, IPL_SCHED);
	for (i = 0; i < PRI_RT_COUNT; i++)
		TAILQ_INIT(&ci_rq->r_rt_queue[i].q_head);
	for (i = 0; i < PRI_TS_COUNT; i++)
		TAILQ_INIT(&ci_rq->r_ts_queue[i].q_head);

	ci->ci_schedstate.spc_sched_info = ci_rq;

	cpuname = kmem_alloc(8, KM_SLEEP);
	snprintf(cpuname, 8, "cpu%d", cpu_index(ci));

	evcnt_attach_dynamic(&ci_rq->r_ev_pull, EVCNT_TYPE_MISC, NULL,
	    cpuname, "runqueue pull");
	evcnt_attach_dynamic(&ci_rq->r_ev_push, EVCNT_TYPE_MISC, NULL,
	    cpuname, "runqueue push");
	evcnt_attach_dynamic(&ci_rq->r_ev_stay, EVCNT_TYPE_MISC, NULL,
	    cpuname, "runqueue stay");
	evcnt_attach_dynamic(&ci_rq->r_ev_localize, EVCNT_TYPE_MISC, NULL,
	    cpuname, "runqueue localize");
}
/*
 * Control of the runqueue.
 */
static inline void *
sched_getrq(runqueue_t *ci_rq, const pri_t prio)
{

	KASSERT(prio < PRI_COUNT);
	return (prio <= PRI_HIGHEST_TS) ?
	    &ci_rq->r_ts_queue[prio].q_head :
	    &ci_rq->r_rt_queue[prio - PRI_HIGHEST_TS - 1].q_head;
}
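
/*
 * Editor's note (illustrative, not in the original source): time-sharing
 * priorities 0..PRI_HIGHEST_TS index r_ts_queue[] directly, while higher
 * (real-time) priorities are offset into r_rt_queue[]; e.g. priority
 * PRI_HIGHEST_TS + 1 maps to r_rt_queue[0] and PRI_COUNT - 1 maps to
 * r_rt_queue[PRI_RT_COUNT - 1].
 */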
void
sched_enqueue(struct lwp *l, bool swtch)
{
	runqueue_t *ci_rq;
	struct schedstate_percpu *spc;
	TAILQ_HEAD(, lwp) *q_head;
	const pri_t eprio = lwp_eprio(l);
	struct cpu_info *ci;
	int type;

	ci = l->l_cpu;
	spc = &ci->ci_schedstate;
	ci_rq = spc->spc_sched_info;
	KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_mutex));

	/* Update the last run time on switch */
	if (__predict_true(swtch == true))
		l->l_rticksum += (hardclock_ticks - l->l_rticks);
	else if (l->l_rticks == 0)
		l->l_rticks = hardclock_ticks;

	/* Enqueue the thread */
	q_head = sched_getrq(ci_rq, eprio);
	if (TAILQ_EMPTY(q_head)) {
		u_int i;
		uint32_t q;

		/* Mark bit */
		i = eprio >> BITMAP_SHIFT;
		q = BITMAP_MSB >> (eprio & BITMAP_MASK);
		KASSERT((ci_rq->r_bitmap[i] & q) == 0);
		ci_rq->r_bitmap[i] |= q;
	}
	TAILQ_INSERT_TAIL(q_head, l, l_runq);
	ci_rq->r_count++;
	if ((l->l_pflag & LP_BOUND) == 0)
		ci_rq->r_mcount++;

	/*
	 * Update the value of highest priority in the runqueue,
	 * if the priority of this thread is higher.
	 */
	if (eprio > spc->spc_maxpriority)
		spc->spc_maxpriority = eprio;

	sched_newts(l);

	/*
	 * Wake the chosen CPU or cause a preemption if the newly
	 * enqueued thread has higher priority.  Don't cause a
	 * preemption if the thread is yielding (swtch).
	 */
	if (!swtch && eprio > spc->spc_curpriority) {
		if (eprio >= sched_kpreempt_pri)
			type = RESCHED_KPREEMPT;
		else if (eprio >= sched_upreempt_pri)
			type = RESCHED_IMMED;
		else
			type = RESCHED_LAZY;
		cpu_need_resched(ci, type);
	}
}
void
sched_dequeue(struct lwp *l)
{
	runqueue_t *ci_rq;
	TAILQ_HEAD(, lwp) *q_head;
	struct schedstate_percpu *spc;
	const pri_t eprio = lwp_eprio(l);

	spc = &l->l_cpu->ci_schedstate;
	ci_rq = spc->spc_sched_info;
	KASSERT(lwp_locked(l, spc->spc_mutex));

	KASSERT(eprio <= spc->spc_maxpriority);
	KASSERT(ci_rq->r_bitmap[eprio >> BITMAP_SHIFT] != 0);
	KASSERT(ci_rq->r_count > 0);

	if (spc->spc_migrating == l)
		spc->spc_migrating = NULL;

	ci_rq->r_count--;
	if ((l->l_pflag & LP_BOUND) == 0)
		ci_rq->r_mcount--;

	q_head = sched_getrq(ci_rq, eprio);
	TAILQ_REMOVE(q_head, l, l_runq);
	if (TAILQ_EMPTY(q_head)) {
		u_int i;
		uint32_t q;

		/* Unmark bit */
		i = eprio >> BITMAP_SHIFT;
		q = BITMAP_MSB >> (eprio & BITMAP_MASK);
		KASSERT((ci_rq->r_bitmap[i] & q) != 0);
		ci_rq->r_bitmap[i] &= ~q;

		/*
		 * Update the value of highest priority in the runqueue, in
		 * case this was the last thread in the highest priority queue.
		 */
		if (eprio != spc->spc_maxpriority)
			return;

		do {
			if (ci_rq->r_bitmap[i] != 0) {
				q = ffs(ci_rq->r_bitmap[i]);
				spc->spc_maxpriority =
				    (i << BITMAP_SHIFT) + (BITMAP_BITS - q);
				return;
			}
		} while (i--);

		/* If not found - set the lowest value */
		spc->spc_maxpriority = 0;
	}
}
/*
 * Migration and balancing.
 */

#ifdef MULTIPROCESSOR
/* Estimate if LWP is cache-hot */
static inline bool
lwp_cache_hot(const struct lwp *l)
{

	if (__predict_false(l->l_slptime || l->l_rticks == 0))
		return false;

	return (hardclock_ticks - l->l_rticks <= cacheht_time);
}
/* Check if LWP can migrate to the chosen CPU */
static inline bool
sched_migratable(const struct lwp *l, struct cpu_info *ci)
{
	const struct schedstate_percpu *spc = &ci->ci_schedstate;
	KASSERT(lwp_locked(__UNCONST(l), NULL));

	/* CPU is offline */
	if (__predict_false(spc->spc_flags & SPCF_OFFLINE))
		return false;

	/* Affinity bind */
	if (__predict_false(l->l_flag & LW_AFFINITY))
		return kcpuset_isset(cpu_index(ci), l->l_affinity);

	/* Processor-set */
	return (spc->spc_psid == l->l_psid);
}
/*
 * Estimate whether the LWP should migrate to another CPU.
 * Return the chosen CPU.
 */
struct cpu_info *
sched_takecpu(struct lwp *l)
{
	struct cpu_info *ci, *tci, *first, *next;
	struct schedstate_percpu *spc;
	runqueue_t *ci_rq, *ici_rq;
	pri_t eprio, lpri, pri;

	KASSERT(lwp_locked(l, NULL));

	/* If thread is strictly bound, do not estimate other CPUs */
	ci = l->l_cpu;
	if (l->l_pflag & LP_BOUND)
		return ci;

	spc = &ci->ci_schedstate;
	ci_rq = spc->spc_sched_info;

	/* Make sure that thread is in appropriate processor-set */
	if (__predict_true(spc->spc_psid == l->l_psid)) {
		/* If CPU of this thread is idling - run there */
		if (ci_rq->r_count == 0) {
			ci_rq->r_ev_stay.ev_count++;
			return ci;
		}
		/* Stay if thread is cache-hot */
		eprio = lwp_eprio(l);
		if (__predict_true(l->l_stat != LSIDL) &&
		    lwp_cache_hot(l) && eprio >= spc->spc_curpriority) {
			ci_rq->r_ev_stay.ev_count++;
			return ci;
		}
	} else {
		eprio = lwp_eprio(l);
	}

	/* Run on current CPU if priority of thread is higher */
	ci = curcpu();
	spc = &ci->ci_schedstate;
	if (eprio > spc->spc_curpriority && sched_migratable(l, ci)) {
		ci_rq = spc->spc_sched_info;
		ci_rq->r_ev_localize.ev_count++;
		return ci;
	}

	/*
	 * Look for the CPU with the lowest priority thread.  In case of
	 * equal priority, choose the CPU with fewer threads.
	 */
	first = l->l_cpu;
	ci = first;
	tci = first;
	lpri = PRI_COUNT;
	do {
		next = CIRCLEQ_LOOP_NEXT(&cpu_queue, ci, ci_data.cpu_qchain);
		spc = &ci->ci_schedstate;
		ici_rq = spc->spc_sched_info;
		pri = max(spc->spc_curpriority, spc->spc_maxpriority);
		if (pri > lpri)
			continue;

		if (pri == lpri && ci_rq->r_count < ici_rq->r_count)
			continue;

		if (!sched_migratable(l, ci))
			continue;

		lpri = pri;
		tci = ci;
		ci_rq = ici_rq;
	} while (ci = next, ci != first);

	ci_rq = tci->ci_schedstate.spc_sched_info;
	ci_rq->r_ev_push.ev_count++;

	return tci;
}
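
/*
 * Editor's note (summary, not in the original source): sched_takecpu()
 * prefers, in order, the LWP's current CPU when it is idle or the LWP is
 * still cache-hot there, then the CPU running this code when the LWP
 * would preempt it, and finally the migratable CPU with the lowest
 * priority, ties broken by the shorter runqueue (all subject to the
 * processor-set and affinity checks in sched_migratable()).
 */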
/*
 * Try to catch (steal) an LWP from the runqueue of another CPU.
 */
static struct lwp *
sched_catchlwp(struct cpu_info *ci)
{
	struct cpu_info *curci = curcpu();
	struct schedstate_percpu *spc, *curspc;
	TAILQ_HEAD(, lwp) *q_head;
	runqueue_t *ci_rq;
	struct lwp *l;

	curspc = &curci->ci_schedstate;
	spc = &ci->ci_schedstate;
	KASSERT(curspc->spc_psid == spc->spc_psid);

	ci_rq = spc->spc_sched_info;
	if (ci_rq->r_mcount < min_catch) {
		spc_unlock(ci);
		return NULL;
	}

	/* Take the highest priority thread */
	q_head = sched_getrq(ci_rq, spc->spc_maxpriority);
	l = TAILQ_FIRST(q_head);

	for (;;) {
		/* Check the first and next result from the queue */
		if (l == NULL) {
			break;
		}
		KASSERT(l->l_stat == LSRUN);

		/* Look for threads that are allowed to migrate */
		if ((l->l_pflag & LP_BOUND) || lwp_cache_hot(l) ||
		    !sched_migratable(l, curci)) {
			l = TAILQ_NEXT(l, l_runq);
			continue;
		}

		/* Grab the thread, and move to the local run queue */
		sched_dequeue(l);

		/*
		 * If LWP is still context switching, we may need to
		 * spin-wait before changing its CPU.
		 */
		if (__predict_false(l->l_ctxswtch != 0)) {
			u_int count;
			count = SPINLOCK_BACKOFF_MIN;
			while (l->l_ctxswtch)
				SPINLOCK_BACKOFF(count);
		}
		l->l_cpu = curci;
		ci_rq->r_ev_pull.ev_count++;
		lwp_unlock_to(l, curspc->spc_mutex);
		sched_enqueue(l, false);
		return l;
	}
	spc_unlock(ci);

	return l;
}
/*
 * Periodic calculations for balancing.
 */
static void
sched_balance(void *nocallout)
{
	struct cpu_info *ci, *hci;
	runqueue_t *ci_rq;
	CPU_INFO_ITERATOR cii;
	u_int highest;

	hci = curcpu();
	highest = 0;

	/* Do the counting locklessly */
	for (CPU_INFO_FOREACH(cii, ci)) {
		ci_rq = ci->ci_schedstate.spc_sched_info;

		/* Average count of the threads */
		ci_rq->r_avgcount = (ci_rq->r_avgcount + ci_rq->r_mcount) >> 1;

		/* Look for CPU with the highest average */
		if (ci_rq->r_avgcount > highest) {
			hci = ci;
			highest = ci_rq->r_avgcount;
		}
	}

	/* Update the worker */
	worker_ci = hci;

	if (nocallout == NULL)
		callout_schedule(&balance_ch, balance_period);
}
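
/*
 * Editor's note (illustrative, not in the original source): r_avgcount is
 * a decaying average computed with integer halving.  For example, if
 * r_mcount stays at 4 and r_avgcount starts at 0, successive balancer
 * passes yield 2, 3, 3, ..., so the average tracks the migratable-thread
 * count while retaining a short memory of recent history.
 */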
/*
 * Called from each CPU's idle loop.
 */
void
sched_idle(void)
{
	struct cpu_info *ci = curcpu(), *tci = NULL;
	struct schedstate_percpu *spc, *tspc;
	runqueue_t *ci_rq;
	bool dlock = false;

	/* Check if there is a migrating LWP */
	spc = &ci->ci_schedstate;
	if (spc->spc_migrating == NULL)
		goto no_migration;

	spc_lock(ci);
	for (;;) {
		struct lwp *l;

		l = spc->spc_migrating;
		if (l == NULL)
			break;

		/*
		 * If second attempt, and target CPU has changed,
		 * drop the old lock.
		 */
		if (dlock == true && tci != l->l_target_cpu) {
			KASSERT(tci != NULL);
			spc_unlock(tci);
			dlock = false;
		}

		/*
		 * Nothing to do if destination has changed to the
		 * local CPU, or migration was done by other CPU.
		 */
		tci = l->l_target_cpu;
		if (tci == NULL || tci == ci) {
			spc->spc_migrating = NULL;
			l->l_target_cpu = NULL;
			break;
		}
		tspc = &tci->ci_schedstate;

		/*
		 * Double-lock the runqueues.
		 * We do that only once.
		 */
		if (dlock == false) {
			dlock = true;
			if (ci < tci) {
				spc_lock(tci);
			} else if (!mutex_tryenter(tspc->spc_mutex)) {
				spc_unlock(ci);
				spc_lock(tci);
				spc_lock(ci);
				/* Check the situation again.. */
				continue;
			}
		}

		/* Migrate the thread */
		KASSERT(l->l_stat == LSRUN);
		spc->spc_migrating = NULL;
		l->l_target_cpu = NULL;
		sched_dequeue(l);
		l->l_cpu = tci;
		lwp_setlock(l, tspc->spc_mutex);
		sched_enqueue(l, false);
		break;
	}
	if (dlock == true) {
		KASSERT(tci != NULL);
		spc_unlock(tci);
	}
	spc_unlock(ci);

no_migration:
	ci_rq = spc->spc_sched_info;
	if ((spc->spc_flags & SPCF_OFFLINE) != 0 || ci_rq->r_count != 0) {
		return;
	}

	/* Reset the counter, and call the balancer */
	ci_rq->r_avgcount = 0;
	sched_balance(ci);
	tci = worker_ci;
	tspc = &tci->ci_schedstate;
	if (ci == tci || spc->spc_psid != tspc->spc_psid)
		return;
	spc_dlock(ci, tci);
	(void)sched_catchlwp(tci);
	spc_unlock(ci);
}
#else

struct cpu_info *
sched_takecpu(struct lwp *l)
{

	return l->l_cpu;
}

void
sched_idle(void)
{

}
#endif	/* MULTIPROCESSOR */
/*
 * Scheduling statistics and balancing.
 */
void
sched_lwp_stats(struct lwp *l)
{
	int batch;

	KASSERT(lwp_locked(l, NULL));

	/* Update sleep time */
	if (l->l_stat == LSSLEEP || l->l_stat == LSSTOP ||
	    l->l_stat == LSSUSPENDED)
		l->l_slptime++;

	/*
	 * Mark the thread as CPU-bound if its accumulated run time exceeds
	 * its accumulated sleep time.  Only act on it if this is not the
	 * first interval in which the thread looks CPU-bound.
	 */
	batch = (l->l_rticksum > l->l_slpticksum);
	if (batch != 0) {
		if ((l->l_flag & LW_BATCH) == 0)
			batch = 0;
		l->l_flag |= LW_BATCH;
	} else
		l->l_flag &= ~LW_BATCH;

	/*
	 * A CPU-bound thread that never sleeps would occupy the CPU
	 * indefinitely.  In that case, reset the last-sleep counter and
	 * check it again later; if it is still zero, pick a migration
	 * target and clear the batch flag.
	 */
	if (batch && (l->l_slptime + l->l_slpticksum) == 0) {
		if (l->l_slpticks == 0) {
			if (l->l_target_cpu == NULL &&
			    (l->l_stat == LSRUN || l->l_stat == LSONPROC)) {
				struct cpu_info *ci = sched_takecpu(l);
				l->l_target_cpu = (ci != l->l_cpu) ? ci : NULL;
			}
			l->l_flag &= ~LW_BATCH;
		} else {
			l->l_slpticks = 0;
		}
	}

	/* Reset the time sums */
	l->l_slpticksum = 0;
	l->l_rticksum = 0;

	/* Scheduler-specific hook */
	sched_pstats_hook(l, batch);
}
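
/*
 * Editor's note (illustrative, not in the original source): the batch
 * detection above needs two consecutive sampling periods in which run
 * time exceeds sleep time before "batch" is acted upon; the first such
 * period only sets LW_BATCH, and the migration/reset logic runs once the
 * flag is already set from the previous period.
 */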
/*
 * Scheduler mill.
 */
struct lwp *
sched_nextlwp(void)
{
	struct cpu_info *ci = curcpu();
	struct schedstate_percpu *spc;
	TAILQ_HEAD(, lwp) *q_head;
	runqueue_t *ci_rq;
	struct lwp *l;

	/* Return to idle LWP if there is a migrating thread */
	spc = &ci->ci_schedstate;
	if (__predict_false(spc->spc_migrating != NULL))
		return NULL;
	ci_rq = spc->spc_sched_info;

#ifdef MULTIPROCESSOR
	/* If runqueue is empty, try to catch some thread from other CPU */
	if (__predict_false(ci_rq->r_count == 0)) {
		struct schedstate_percpu *cspc;
		struct cpu_info *cci;

		/* Offline CPUs should not perform this, however */
		if (__predict_false(spc->spc_flags & SPCF_OFFLINE))
			return NULL;

		/* Reset the counter, and call the balancer */
		ci_rq->r_avgcount = 0;
		sched_balance(ci);
		cci = worker_ci;
		cspc = &cci->ci_schedstate;
		if (ci == cci || spc->spc_psid != cspc->spc_psid ||
		    !mutex_tryenter(cci->ci_schedstate.spc_mutex))
			return NULL;
		return sched_catchlwp(cci);
	}
#else
	if (__predict_false(ci_rq->r_count == 0))
		return NULL;
#endif

	/* Take the highest priority thread */
	KASSERT(ci_rq->r_bitmap[spc->spc_maxpriority >> BITMAP_SHIFT]);
	q_head = sched_getrq(ci_rq, spc->spc_maxpriority);
	l = TAILQ_FIRST(q_head);
	KASSERT(l != NULL);

	sched_oncpu(l);
	l->l_rticks = hardclock_ticks;

	return l;
}
bool
sched_curcpu_runnable_p(void)
{
	const struct cpu_info *ci;
	const struct schedstate_percpu *spc;
	const runqueue_t *ci_rq;
	bool rv;

	kpreempt_disable();
	ci = curcpu();
	spc = &ci->ci_schedstate;
	ci_rq = spc->spc_sched_info;

#ifndef __HAVE_FAST_SOFTINTS
	if (ci->ci_data.cpu_softints) {
		kpreempt_enable();
		return true;
	}
#endif

	rv = (ci_rq->r_count != 0) ? true : false;
	kpreempt_enable();

	return rv;
}
/*
 * Sysctl nodes and initialization.
 */
SYSCTL_SETUP(sysctl_sched_setup, "sysctl sched setup")
{
	const struct sysctlnode *node = NULL;

	sysctl_createv(clog, 0, NULL, NULL,
		CTLFLAG_PERMANENT,
		CTLTYPE_NODE, "kern", NULL,
		NULL, 0, NULL, 0,
		CTL_KERN, CTL_EOL);
	sysctl_createv(clog, 0, NULL, &node,
		CTLFLAG_PERMANENT,
		CTLTYPE_NODE, "sched",
		SYSCTL_DESCR("Scheduler options"),
		NULL, 0, NULL, 0,
		CTL_KERN, CTL_CREATE, CTL_EOL);

	if (node == NULL)
		return;

	sysctl_createv(clog, 0, &node, NULL,
		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
		CTLTYPE_INT, "cacheht_time",
		SYSCTL_DESCR("Cache hotness time (in ticks)"),
		NULL, 0, &cacheht_time, 0,
		CTL_CREATE, CTL_EOL);
	sysctl_createv(clog, 0, &node, NULL,
		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
		CTLTYPE_INT, "balance_period",
		SYSCTL_DESCR("Balance period (in ticks)"),
		NULL, 0, &balance_period, 0,
		CTL_CREATE, CTL_EOL);
	sysctl_createv(clog, 0, &node, NULL,
		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
		CTLTYPE_INT, "min_catch",
		SYSCTL_DESCR("Minimal count of threads for catching"),
		NULL, 0, &min_catch, 0,
		CTL_CREATE, CTL_EOL);
	sysctl_createv(clog, 0, &node, NULL,
		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
		CTLTYPE_INT, "timesoftints",
		SYSCTL_DESCR("Track CPU time for soft interrupts"),
		NULL, 0, &softint_timing, 0,
		CTL_CREATE, CTL_EOL);
	sysctl_createv(clog, 0, &node, NULL,
		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
		CTLTYPE_INT, "kpreempt_pri",
		SYSCTL_DESCR("Minimum priority to trigger kernel preemption"),
		NULL, 0, &sched_kpreempt_pri, 0,
		CTL_CREATE, CTL_EOL);
	sysctl_createv(clog, 0, &node, NULL,
		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
		CTLTYPE_INT, "upreempt_pri",
		SYSCTL_DESCR("Minimum priority to trigger user preemption"),
		NULL, 0, &sched_upreempt_pri, 0,
		CTL_CREATE, CTL_EOL);
}
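
/*
 * Editor's note (usage sketch, not in the original source): the tunables
 * above are created under the kern.sched sysctl node, so they can be
 * inspected and adjusted at run time with sysctl(8), for example:
 *
 *	sysctl kern.sched.balance_period
 *	sysctl -w kern.sched.min_catch=2
 *
 * Values are in ticks or thread counts, as described by each node.
 */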
/*
 * Debugging.
 */

#ifdef DDB

void
sched_print_runqueue(void (*pr)(const char *, ...)
    __attribute__((__format__(__printf__, 1, 2))))
{
	runqueue_t *ci_rq;
	struct cpu_info *ci, *tci;
	struct schedstate_percpu *spc;
	struct lwp *l;
	struct proc *p;
	CPU_INFO_ITERATOR cii;

	for (CPU_INFO_FOREACH(cii, ci)) {
		int i;

		spc = &ci->ci_schedstate;
		ci_rq = spc->spc_sched_info;

		(*pr)("Run-queue (CPU = %u):\n", ci->ci_index);
		(*pr)(" pid.lid = %d.%d, r_count = %u, r_avgcount = %u, "
		    "maxpri = %d, mlwp = %p\n",
#ifdef MULTIPROCESSOR
		    ci->ci_curlwp->l_proc->p_pid, ci->ci_curlwp->l_lid,
#else
		    curlwp->l_proc->p_pid, curlwp->l_lid,
#endif
		    ci_rq->r_count, ci_rq->r_avgcount, spc->spc_maxpriority,
		    spc->spc_migrating);
		i = (PRI_COUNT >> BITMAP_SHIFT) - 1;
		do {
			uint32_t q;
			q = ci_rq->r_bitmap[i];
			(*pr)(" bitmap[%d] => [ %d (0x%x) ]\n", i, ffs(q), q);
		} while (i--);
	}

	(*pr)(" %5s %4s %4s %10s %3s %18s %4s %4s %s\n",
	    "LID", "PRI", "EPRI", "FL", "ST", "LWP", "CPU", "TCI", "LRTICKS");

	PROCLIST_FOREACH(p, &allproc) {
		if ((p->p_flag & PK_MARKER) != 0)
			continue;
		(*pr)(" /- %d (%s)\n", (int)p->p_pid, p->p_comm);
		LIST_FOREACH(l, &p->p_lwps, l_sibling) {
			ci = l->l_cpu;
			tci = l->l_target_cpu;
			(*pr)(" | %5d %4u %4u 0x%8.8x %3s %18p %4u %4d %u\n",
			    (int)l->l_lid, l->l_priority, lwp_eprio(l),
			    l->l_flag, l->l_stat == LSRUN ? "RQ" :
			    (l->l_stat == LSSLEEP ? "SQ" : "-"),
			    l, ci->ci_index, (tci ? tci->ci_index : -1),
			    (u_int)(hardclock_ticks - l->l_rticks));
		}
	}
}

#endif