/*	$NetBSD: kern_synch.c,v 1.273 2009/12/05 22:38:19 pooka Exp $	*/

/*-
 * Copyright (c) 1999, 2000, 2004, 2006, 2007, 2008, 2009
 *    The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
 * NASA Ames Research Center, by Charles M. Hannum, Andrew Doran and
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*-
 * Copyright (c) 1982, 1986, 1990, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)kern_synch.c	8.9 (Berkeley) 5/19/95
 */
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_synch.c,v 1.273 2009/12/05 22:38:19 pooka Exp $");

#include "opt_kstack.h"
#include "opt_perfctrs.h"

#define	__MUTEX_PRIVATE

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/savar.h>
#include <sys/syscall_stats.h>
#include <sys/sleepq.h>
#include <sys/lockdebug.h>
#include <sys/evcnt.h>
#include <sys/lwpctl.h>
#include <sys/atomic.h>
#include <sys/simplelock.h>

#include <uvm/uvm_extern.h>

#include <dev/lockstat.h>
static void	sched_unsleep(struct lwp *, bool);
static void	sched_changepri(struct lwp *, pri_t);
static void	sched_lendpri(struct lwp *, pri_t);
static void	resched_cpu(struct lwp *);
syncobj_t sleep_syncobj = {
	SOBJ_SLEEPQ_SORTED, sleepq_unsleep, sleepq_changepri,
	sleepq_lendpri, syncobj_noowner,
};

syncobj_t sched_syncobj = {
	SOBJ_SLEEPQ_SORTED, sched_unsleep, sched_changepri,
	sched_lendpri, syncobj_noowner,
};
callout_t	sched_pstats_ch;
unsigned	sched_pstats_ticks;
kcondvar_t	lbolt;			/* once a second sleep address */

/* Preemption event counters */
static struct evcnt kpreempt_ev_crit;
static struct evcnt kpreempt_ev_klock;
static struct evcnt kpreempt_ev_immed;
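/*
 * Note: the counters above are attached below with evcnt_attach_dynamic()
 * under the "kpreempt" group, so the deferred and immediate preemption
 * counts can be inspected at run time with vmstat(1) -e.
 */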
/*
 * During autoconfiguration or after a panic, a sleep will simply lower the
 * priority briefly to allow interrupts, then return.  The priority to be
 * used (safepri) is machine-dependent, thus this value is initialized and
 * maintained in the machine-dependent layers.  This priority will typically
 * be 0, or the lowest priority that is safe for use on the interrupt stack;
 * it can be made higher to block network software interrupts after panics.
 */
	cv_init(&lbolt, "lbolt");
	callout_init(&sched_pstats_ch, CALLOUT_MPSAFE);
	callout_setfunc(&sched_pstats_ch, sched_pstats, NULL);

	evcnt_attach_dynamic(&kpreempt_ev_crit, EVCNT_TYPE_MISC, NULL,
	    "kpreempt", "defer: critical section");
	evcnt_attach_dynamic(&kpreempt_ev_klock, EVCNT_TYPE_MISC, NULL,
	    "kpreempt", "defer: kernel_lock");
	evcnt_attach_dynamic(&kpreempt_ev_immed, EVCNT_TYPE_MISC, NULL,
	    "kpreempt", "immediate");
/*
 * General sleep call.  Suspends the current LWP until a wakeup is
 * performed on the specified identifier.  The LWP will then be made
 * runnable with the specified priority.  Sleeps at most timo/hz seconds (0
 * means no timeout).  If pri includes the PCATCH flag, signals are checked
 * before and after sleeping, otherwise signals are not checked.  Returns 0
 * if awakened, EWOULDBLOCK if the timeout expires.  If PCATCH is set and a
 * signal needs to be delivered, ERESTART is returned if the current system
 * call should be restarted if possible, and EINTR is returned if the system
 * call should be interrupted by the signal.
 *
 * The interlock is held until we are on a sleep queue.  The interlock will
 * be locked before returning back to the caller unless the PNORELOCK flag
 * is specified, in which case the interlock will always be unlocked upon
 * return.
 */
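/*
 * Illustrative usage sketch (not from the original file; the softc, flag
 * and wmesg names are made up).  The simplelock interlock is re-acquired
 * on return because PNORELOCK is not passed:
 *
 *	simple_lock(&sc->sc_slock);
 *	while ((sc->sc_flags & SC_BUSY) != 0) {
 *		error = ltsleep(&sc->sc_flags, PZERO | PCATCH, "scbusy",
 *		    hz, &sc->sc_slock);
 *		if (error != 0)
 *			break;
 *	}
 *	simple_unlock(&sc->sc_slock);
 */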
int
ltsleep(wchan_t ident, pri_t priority, const char *wmesg, int timo,
	volatile struct simplelock *interlock)
{
	struct lwp *l = curlwp;
	sleepq_t *sq;
	kmutex_t *mp;
	int error;

	KASSERT((l->l_pflag & LP_INTR) == 0);
	KASSERT(ident != &lbolt);

	if (sleepq_dontsleep(l)) {
		(void)sleepq_abort(NULL, 0);
		if ((priority & PNORELOCK) != 0)
			simple_unlock(interlock);
		return 0;
	}

	l->l_kpriority = true;
	sq = sleeptab_lookup(&sleeptab, ident, &mp);
	sleepq_enter(sq, l, mp);
	sleepq_enqueue(sq, ident, wmesg, &sleep_syncobj);

	if (interlock != NULL) {
		KASSERT(simple_lock_held(interlock));
		simple_unlock(interlock);
	}

	error = sleepq_block(timo, priority & PCATCH);

	if (interlock != NULL && (priority & PNORELOCK) == 0)
		simple_lock(interlock);

	return error;
}
int
mtsleep(wchan_t ident, pri_t priority, const char *wmesg, int timo,
	kmutex_t *mtx)
{
	struct lwp *l = curlwp;
	sleepq_t *sq;
	kmutex_t *mp;
	int error;

	KASSERT((l->l_pflag & LP_INTR) == 0);
	KASSERT(ident != &lbolt);

	if (sleepq_dontsleep(l)) {
		(void)sleepq_abort(mtx, (priority & PNORELOCK) != 0);
		return 0;
	}

	l->l_kpriority = true;
	sq = sleeptab_lookup(&sleeptab, ident, &mp);
	sleepq_enter(sq, l, mp);
	sleepq_enqueue(sq, ident, wmesg, &sleep_syncobj);
	mutex_exit(mtx);
	error = sleepq_block(timo, priority & PCATCH);

	if ((priority & PNORELOCK) == 0)
		mutex_enter(mtx);

	return error;
}
/*
 * General sleep call for situations where a wake-up is not expected.
 */
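/*
 * Illustrative usage sketch (not from the original file): pause for
 * roughly 100ms, not interruptible by signals, with no interlock;
 * "pause" is an arbitrary wmesg:
 *
 *	(void)kpause("pause", false, hz / 10, NULL);
 */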
int
kpause(const char *wmesg, bool intr, int timo, kmutex_t *mtx)
{
	struct lwp *l = curlwp;
	sleepq_t *sq;
	kmutex_t *mp;
	int error;

	if (sleepq_dontsleep(l))
		return sleepq_abort(NULL, 0);

	if (mtx != NULL)
		mutex_exit(mtx);
	l->l_kpriority = true;
	sq = sleeptab_lookup(&sleeptab, l, &mp);
	sleepq_enter(sq, l, mp);
	sleepq_enqueue(sq, l, wmesg, &sleep_syncobj);
	error = sleepq_block(timo, intr);
	if (mtx != NULL)
		mutex_enter(mtx);

	return error;
}
/*
 * We believe this lwp is an SA lwp.  If it's yielding,
 * let it know it needs to wake up.
 *
 * We are called and exit with the lwp locked.  We are
 * called in the middle of wakeup operations, so we need
 * to not touch the locks at all.
 */
void
sa_awaken(struct lwp *l)
{
	/* LOCK_ASSERT(lwp_locked(l, NULL)); */

	if (l == l->l_savp->savp_lwp && l->l_flag & LW_SA_YIELD)
		l->l_flag &= ~LW_SA_IDLE;
}
/*
 * Make all LWPs sleeping on the specified identifier runnable.
 */
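/*
 * Illustrative pairing with the ltsleep() sketch above (not from the
 * original file; names are made up): the waking side clears the
 * condition and then makes every LWP sleeping on the same identifier
 * runnable:
 *
 *	simple_lock(&sc->sc_slock);
 *	sc->sc_flags &= ~SC_BUSY;
 *	wakeup(&sc->sc_flags);
 *	simple_unlock(&sc->sc_slock);
 */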
void
wakeup(wchan_t ident)
{
	sleepq_t *sq;
	kmutex_t *mp;

	if (__predict_false(cold))
		return;

	sq = sleeptab_lookup(&sleeptab, ident, &mp);
	sleepq_wake(sq, ident, (u_int)-1, mp);
}
/*
 * Make the highest priority LWP first in line on the specified
 * identifier runnable.
 */
void
wakeup_one(wchan_t ident)
{
	sleepq_t *sq;
	kmutex_t *mp;

	if (__predict_false(cold))
		return;

	sq = sleeptab_lookup(&sleeptab, ident, &mp);
	sleepq_wake(sq, ident, 1, mp);
}
/*
 * General yield call.  Puts the current LWP back on its run queue and
 * performs a voluntary context switch.  Should only be called when the
 * current LWP explicitly requests it (e.g. sched_yield(2)).
 */
void
yield(void)
{
	struct lwp *l = curlwp;

	KERNEL_UNLOCK_ALL(l, &l->l_biglocks);
	lwp_lock(l);
	KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_lwplock));
	KASSERT(l->l_stat == LSONPROC);
	l->l_kpriority = false;
	(void)mi_switch(l);
	KERNEL_LOCK(l->l_biglocks, l);
}
/*
 * General preemption call.  Puts the current LWP back on its run queue
 * and performs an involuntary context switch.
 */
void
preempt(void)
{
	struct lwp *l = curlwp;

	KERNEL_UNLOCK_ALL(l, &l->l_biglocks);
	lwp_lock(l);
	KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_lwplock));
	KASSERT(l->l_stat == LSONPROC);
	l->l_kpriority = false;
	l->l_nivcsw++;
	(void)mi_switch(l);
	KERNEL_LOCK(l->l_biglocks, l);
}
/*
 * Handle a request made by another agent to preempt the current LWP
 * in-kernel.  Usually called when l_dopreempt may be non-zero.
 *
 * Character addresses for lockstat only.  These dummy variables exist
 * only so that their unique, stable addresses can be stored in
 * l_pfaillock and reported by lockstat as the reason a preemption
 * was deferred.
 */
static char	in_critical_section;
static char	kernel_lock_held;
static char	is_softint;
static char	cpu_kpreempt_enter_fail;
bool
kpreempt(uintptr_t where)
{

	while ((dop = l->l_dopreempt) != 0) {
		if (l->l_stat != LSONPROC) {
			/*
			 * About to block (or die), let it happen.
			 * Doesn't really count as "preemption has
			 * been blocked", since we're going to
			 * context switch.
			 */

		if (__predict_false((l->l_flag & LW_IDLE) != 0)) {
			/* Can't preempt idle loop, don't count as failure. */

		if (__predict_false(l->l_nopreempt != 0)) {
			/* LWP holds preemption disabled, explicitly. */
			if ((dop & DOPREEMPT_COUNTED) == 0) {
				kpreempt_ev_crit.ev_count++;

			failed = (uintptr_t)&in_critical_section;

		if (__predict_false((l->l_pflag & LP_INTR) != 0)) {
			/* Can't preempt soft interrupts yet. */

			failed = (uintptr_t)&is_softint;

		if (__predict_false(l->l_blcnt != 0 ||
		    curcpu()->ci_biglock_wanted != NULL)) {
			/* Hold or want kernel_lock, code is not MT safe. */
			if ((dop & DOPREEMPT_COUNTED) == 0) {
				kpreempt_ev_klock.ev_count++;

			failed = (uintptr_t)&kernel_lock_held;

		if (__predict_false(!cpu_kpreempt_enter(where, s))) {
			/*
			 * It may be that the IPL is too high.
			 * kpreempt_enter() can schedule an
			 * interrupt to retry later.
			 */

			failed = (uintptr_t)&cpu_kpreempt_enter_fail;

		if (__predict_true((dop & DOPREEMPT_COUNTED) == 0)) {
			kpreempt_ev_immed.ev_count++;

		/* Take care of any MD cleanup. */
		cpu_kpreempt_exit(where);

	if (__predict_true(!failed)) {

	/* Record preemption failure for reporting via lockstat. */
	atomic_or_uint(&l->l_dopreempt, DOPREEMPT_COUNTED);

	LOCKSTAT_ENTER(lsflag);
	if (__predict_false(lsflag)) {

		where = (uintptr_t)__builtin_return_address(0);

		/* Preemption is on, might recurse, so make it atomic. */
		if (atomic_cas_ptr_ni((void *)&l->l_pfailaddr, NULL,
		    (void *)where) == NULL) {
			LOCKSTAT_START_TIMER(lsflag, l->l_pfailtime);
			l->l_pfaillock = failed;
		}
	}
	LOCKSTAT_EXIT(lsflag);
/*
 * Return true if preemption is explicitly disabled.
 */
bool
kpreempt_disabled(void)
{
	const lwp_t *l = curlwp;

	return l->l_nopreempt != 0 || l->l_stat == LSZOMB ||
	    (l->l_flag & LW_IDLE) != 0 || cpu_kpreempt_disabled();
}
/*
 * Disable kernel preemption.
 */
void
kpreempt_disable(void)
{

	KPREEMPT_DISABLE(curlwp);
}

/*
 * Reenable kernel preemption.
 */
void
kpreempt_enable(void)
{

	KPREEMPT_ENABLE(curlwp);
}
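/*
 * Illustrative usage sketch (not from the original file): bracket a
 * short critical section so the LWP cannot be preempted and migrated
 * away from its current CPU while touching per-CPU state:
 *
 *	kpreempt_disable();
 *	... access curcpu()-private data ...
 *	kpreempt_enable();
 */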
/*
 * Compute the amount of time during which the current lwp was running.
 *
 * - update l_rtime unless it's an idle lwp.
 */
void
updatertime(lwp_t *l, const struct bintime *now)
{

	if (__predict_false(l->l_flag & LW_IDLE))
		return;

	/* rtime += now - stime */
	bintime_add(&l->l_rtime, now);
	bintime_sub(&l->l_rtime, &l->l_stime);
}
/*
 * Select next LWP from the current CPU to run.
 */
static inline lwp_t *
nextlwp(struct cpu_info *ci, struct schedstate_percpu *spc)
{
	lwp_t *newl;

	/*
	 * Let sched_nextlwp() select the LWP to run the CPU next.
	 * If no LWP is runnable, select the idle LWP.
	 *
	 * Note that spc_lwplock might not necessarily be held, and
	 * the new thread would be unlocked after setting the LWP-lock.
	 */
	newl = sched_nextlwp();
	if (newl != NULL) {
		KASSERT(lwp_locked(newl, spc->spc_mutex));
		KASSERT(newl->l_cpu == ci);
		newl->l_stat = LSONPROC;
		newl->l_pflag |= LP_RUNNING;
		lwp_setlock(newl, spc->spc_lwplock);
	} else {
		newl = ci->ci_data.cpu_idlelwp;
		newl->l_stat = LSONPROC;
		newl->l_pflag |= LP_RUNNING;
	}

	/*
	 * Only clear want_resched if there are no pending (slow)
	 * software interrupts.
	 */
	ci->ci_want_resched = ci->ci_data.cpu_softints;
	spc->spc_flags &= ~SPCF_SWITCHCLEAR;
	spc->spc_curpriority = lwp_eprio(newl);

	return newl;
}
/*
 * The machine independent parts of context switch.
 *
 * Returns 1 if another LWP was actually run.
 */
int
mi_switch(lwp_t *l)
{
	struct schedstate_percpu *spc;

	KASSERT(lwp_locked(l, NULL));
	KASSERT(kpreempt_disabled());
	LOCKDEBUG_BARRIER(l->l_mutex, 1);

	kstack_check_magic(l);

	KASSERT((l->l_pflag & LP_RUNNING) != 0);
	KASSERT(l->l_cpu == curcpu());
	spc = &ci->ci_schedstate;

	/*
	 * If we have been asked to switch to a specific LWP, then there
	 * is no need to inspect the run queues.  If a soft interrupt is
	 * blocking, then return to the interrupted thread without adjusting
	 * VM context or its start time: neither have been changed in order
	 * to take the interrupt.
	 */
	if (l->l_switchto != NULL) {
		if ((l->l_pflag & LP_INTR) != 0) {

			if ((l->l_pflag & LP_TIMEINTR) != 0)

		}
		newl = l->l_switchto;
		l->l_switchto = NULL;
	}
#ifndef __HAVE_FAST_SOFTINTS
	else if (ci->ci_data.cpu_softints != 0) {
		/* There are pending soft interrupts, so pick one. */
		newl = softint_picklwp();
		newl->l_stat = LSONPROC;
		newl->l_pflag |= LP_RUNNING;
	}
#endif	/* !__HAVE_FAST_SOFTINTS */

	/* Count time spent in current system call */
	SYSCALL_TIME_SLEEP(l);

	/*
	 * XXXSMP If we are using h/w performance counters,
	 * save context.
	 */
	if (PMC_ENABLED(l->l_proc)) {
		pmc_save_context(l->l_proc);
	}

	/* Lock the runqueue */
	KASSERT(l->l_stat != LSRUN);
	mutex_spin_enter(spc->spc_mutex);

	/*
	 * If on the CPU and we have gotten this far, then we must yield.
	 */
	if (l->l_stat == LSONPROC && l != newl) {
		KASSERT(lwp_locked(l, spc->spc_lwplock));
		if ((l->l_flag & LW_IDLE) == 0) {

			lwp_setlock(l, spc->spc_mutex);
			sched_enqueue(l, true);
			/* Handle migration case */
			KASSERT(spc->spc_migrating == NULL);
			if (l->l_target_cpu != NULL) {
				spc->spc_migrating = l;

	/* Pick new LWP to run. */
	newl = nextlwp(ci, spc);

	/* Items that must be updated with the CPU locked. */

	/* Update the new LWP's start time. */

	/*
	 * ci_curlwp changes when a fast soft interrupt occurs.
	 * We use cpu_onproc to keep track of which kernel or
	 * user thread is running 'underneath' the software
	 * interrupt.  This is important for time accounting,
	 * itimers and forcing user threads to preempt (aston).
	 */
	ci->ci_data.cpu_onproc = newl;

	/*
	 * Preemption related tasks.  Must be done with the current
	 * CPU locked.
	 */
	if (__predict_false(l->l_pfailaddr != 0)) {
		LOCKSTAT_FLAG(lsflag);
		LOCKSTAT_ENTER(lsflag);
		LOCKSTAT_STOP_TIMER(lsflag, l->l_pfailtime);
		LOCKSTAT_EVENT_RA(lsflag, l->l_pfaillock, LB_NOPREEMPT|LB_SPIN,
		    1, l->l_pfailtime, l->l_pfailaddr);
		LOCKSTAT_EXIT(lsflag);
	}

	/* Release all locks, but leave the current LWP locked */
	if (l->l_mutex == spc->spc_mutex) {
		/*
		 * Drop spc_lwplock, if the current LWP has been moved
		 * to the run queue (it is now locked by spc_mutex).
		 */
		mutex_spin_exit(spc->spc_lwplock);
	} else {
		/*
		 * Otherwise, drop the spc_mutex, we are done with the
		 * run queues.
		 */
		mutex_spin_exit(spc->spc_mutex);
	}

	/*
	 * Mark that context switch is going to be performed
	 * for this LWP, to protect it from being switched
	 * to another CPU in the meantime.
	 */
	KASSERT(l->l_ctxswtch == 0);
	l->l_ctxswtch = 1;

	KASSERT((l->l_pflag & LP_RUNNING) != 0);
	l->l_pflag &= ~LP_RUNNING;

	/*
	 * Increase the count of spin-mutexes before the release
	 * of the last lock - we must remain at IPL_SCHED during
	 * the context switch.
	 */
	oldspl = MUTEX_SPIN_OLDSPL(ci);

	/* Count the context switch on this CPU. */
	ci->ci_data.cpu_nswtch++;

	/* Update status for lwpctl, if present. */
	if (l->l_lwpctl != NULL)
		l->l_lwpctl->lc_curcpu = LWPCTL_CPU_NONE;

	/*
	 * Save old VM context, unless a soft interrupt
	 * handler is blocking.
	 */

	/*
	 * We may need to spin-wait if 'newl' is still
	 * context switching on another CPU.
	 */
	if (__predict_false(newl->l_ctxswtch != 0)) {

		count = SPINLOCK_BACKOFF_MIN;
		while (newl->l_ctxswtch)
			SPINLOCK_BACKOFF(count);
	}

	/* Switch to the new LWP. */
	prevlwp = cpu_switchto(l, newl, returning);

	/*
	 * Switched away - we have new curlwp.
	 * Restore VM context and IPL.
	 */
	if (prevlwp != NULL) {
		/* Normalize the count of the spin-mutexes */

		/* Unmark the state of context switch */
		prevlwp->l_ctxswtch = 0;
	}

	/* Update status for lwpctl, if present. */
	if (l->l_lwpctl != NULL) {
		l->l_lwpctl->lc_curcpu = (int)cpu_index(ci);
		l->l_lwpctl->lc_pctr++;
	}

	KASSERT(l->l_cpu == ci);

	/* Nothing to do - just unlock and return. */
	mutex_spin_exit(spc->spc_mutex);

	KASSERT(l == curlwp);
	KASSERT(l->l_stat == LSONPROC);

	/*
	 * XXXSMP If we are using h/w performance counters, restore context.
	 * XXXSMP preemption problem.
	 */
	if (PMC_ENABLED(l->l_proc)) {
		pmc_restore_context(l->l_proc);
	}

	SYSCALL_TIME_WAKEUP(l);
	LOCKDEBUG_BARRIER(NULL, 1);
/*
 * The machine independent parts of context switch to oblivion.
 * Does not return.  Call with the LWP unlocked.
 */
void
lwp_exit_switchaway(lwp_t *l)
{

	KASSERT(kpreempt_disabled());
	KASSERT(l->l_stat == LSZOMB || l->l_stat == LSIDL);
	KASSERT(ci == curcpu());
	LOCKDEBUG_BARRIER(NULL, 0);

	kstack_check_magic(l);

	/* Count time spent in current system call */
	SYSCALL_TIME_SLEEP(l);

	/* Must stay at IPL_SCHED even after releasing run queue lock. */

	/*
	 * Let sched_nextlwp() select the LWP to run the CPU next.
	 * If no LWP is runnable, select the idle LWP.
	 *
	 * Note that spc_lwplock might not necessarily be held, and
	 * the new thread would be unlocked after setting the LWP-lock.
	 */
#ifndef __HAVE_FAST_SOFTINTS
	if (ci->ci_data.cpu_softints != 0) {
		/* There are pending soft interrupts, so pick one. */
		newl = softint_picklwp();
		newl->l_stat = LSONPROC;
		newl->l_pflag |= LP_RUNNING;
	} else
#endif	/* !__HAVE_FAST_SOFTINTS */
	{
		newl = nextlwp(ci, &ci->ci_schedstate);
	}

	/* Update the new LWP's start time. */

	l->l_pflag &= ~LP_RUNNING;

	/*
	 * ci_curlwp changes when a fast soft interrupt occurs.
	 * We use cpu_onproc to keep track of which kernel or
	 * user thread is running 'underneath' the software
	 * interrupt.  This is important for time accounting,
	 * itimers and forcing user threads to preempt (aston).
	 */
	ci->ci_data.cpu_onproc = newl;

	/*
	 * Preemption related tasks.  Must be done with the current
	 * CPU locked.
	 */

	/* Unlock the run queue. */

	/* Count the context switch on this CPU. */
	ci->ci_data.cpu_nswtch++;

	/* Update status for lwpctl, if present. */
	if (l->l_lwpctl != NULL)
		l->l_lwpctl->lc_curcpu = LWPCTL_CPU_EXITED;

	/*
	 * We may need to spin-wait if 'newl' is still
	 * context switching on another CPU.
	 */
	if (__predict_false(newl->l_ctxswtch != 0)) {

		count = SPINLOCK_BACKOFF_MIN;
		while (newl->l_ctxswtch)
			SPINLOCK_BACKOFF(count);
	}

	/* Switch to the new LWP. */
	(void)cpu_switchto(NULL, newl, false);

	for (;;) continue;	/* XXX: convince gcc about "noreturn" */
}
/*
 * setrunnable: change LWP state to be runnable, placing it on the run queue.
 *
 * Call with the process and LWP locked.  Will return with the LWP unlocked.
 */
void
setrunnable(struct lwp *l)
{
	struct proc *p = l->l_proc;

	KASSERT((l->l_flag & LW_IDLE) == 0);
	KASSERT(mutex_owned(p->p_lock));
	KASSERT(lwp_locked(l, NULL));
	KASSERT(l->l_mutex != l->l_cpu->ci_schedstate.spc_mutex);

		/*
		 * If we're being traced (possibly because someone attached us
		 * while we were stopped), check for a signal from the debugger.
		 */
		if ((p->p_slflag & PSL_TRACED) != 0 && p->p_xstat != 0)

		l->l_flag &= ~LW_WSUSPEND;

		cv_broadcast(&p->p_lwpcv);

		KASSERT(l->l_wchan != NULL);

		panic("setrunnable: lwp %p state was %d", l, l->l_stat);

	/*
	 * If the LWP was sleeping interruptibly, then it's OK to start it
	 * again.  If not, mark it as still sleeping.
	 */
	if (l->l_wchan != NULL) {

		/* lwp_unsleep() will release the lock. */
		lwp_unsleep(l, true);

	/*
	 * If the LWP is still on the CPU, mark it as LSONPROC.  It may be
	 * about to call mi_switch(), in which case it will yield.
	 */
	if ((l->l_pflag & LP_RUNNING) != 0) {
		l->l_stat = LSONPROC;

	/*
	 * Look for a CPU to run.
	 * Set the LWP runnable.
	 */
	ci = sched_takecpu(l);

	lwp_unlock_to(l, ci->ci_schedstate.spc_mutex);
	sched_setrunnable(l);

	sched_enqueue(l, false);
/*
 * Convert all non-LW_SYSTEM LSSLEEP or LSRUN LWPs to LSSUSPENDED.
 */
	CPU_INFO_ITERATOR cii;
	struct cpu_info *ci;

	/*
	 * We do this by process in order not to violate the locking rules.
	 */
	mutex_enter(proc_lock);
	PROCLIST_FOREACH(p, &allproc) {
		if ((p->p_flag & PK_MARKER) != 0)
			continue;

		mutex_enter(p->p_lock);
		if ((p->p_flag & PK_SYSTEM) != 0) {
			mutex_exit(p->p_lock);
			continue;
		}

		LIST_FOREACH(l, &p->p_lwps, l_sibling) {

			/*
			 * Set LW_WREBOOT so that the LWP will suspend itself
			 * when it tries to return to user mode.  We want to
			 * try to get as many LWPs as possible to
			 * the user / kernel boundary, so that they will
			 * release any locks that they hold.
			 */
			l->l_flag |= (LW_WREBOOT | LW_WSUSPEND);

			if (l->l_stat == LSSLEEP &&
			    (l->l_flag & LW_SINTR) != 0) {
				/* setrunnable() will release the lock. */

		mutex_exit(p->p_lock);

	mutex_exit(proc_lock);

	/*
	 * Kick all CPUs to make them preempt any LWPs running in user mode.
	 * They'll trap into the kernel and suspend themselves in userret().
	 */
	for (CPU_INFO_FOREACH(cii, ci)) {

		cpu_need_resched(ci, RESCHED_IMMED);
/*
 * This is called when the LWP has not been awoken normally but instead
 * interrupted: for example, if the sleep timed out.  Because of this,
 * it's not a valid action for running or idle LWPs.
 */
static void
sched_unsleep(struct lwp *l, bool cleanup)
{

	panic("sched_unsleep");
}
static void
resched_cpu(struct lwp *l)
{
	struct cpu_info *ci = l->l_cpu;

	KASSERT(lwp_locked(l, NULL));
	if (lwp_eprio(l) > ci->ci_schedstate.spc_curpriority)
		cpu_need_resched(ci, 0);
}
static void
sched_changepri(struct lwp *l, pri_t pri)
{

	KASSERT(lwp_locked(l, NULL));

	if (l->l_stat == LSRUN) {
		KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_mutex));
		sched_dequeue(l);
		l->l_priority = pri;
		sched_enqueue(l, false);
	} else {
		l->l_priority = pri;
	}
}
static void
sched_lendpri(struct lwp *l, pri_t pri)
{

	KASSERT(lwp_locked(l, NULL));

	if (l->l_stat == LSRUN) {
		KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_mutex));
		sched_dequeue(l);
		l->l_inheritedprio = pri;
		sched_enqueue(l, false);
	} else {
		l->l_inheritedprio = pri;
	}
}
struct lwp *
syncobj_noowner(wchan_t wchan)
{

	return NULL;
}
/* Decay 95% of proc::p_pctcpu in 60 seconds, ccpu = exp(-1/20) */
const fixpt_t ccpu = 0.95122942450071400909 * FSCALE;
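/*
 * sched_pstats() below runs roughly once per second and scales each
 * %CPU estimate by ccpu, so after 60 seconds the remaining weight is
 * ccpu^60 = exp(-60/20) = exp(-3) ~= 0.05 - i.e. about 95% of the old
 * estimate has decayed, matching the comment above.
 */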
/*
 * Update process statistics and check CPU resource allocation.
 * Call scheduler-specific hook to eventually adjust process/LWP
 * priorities.
 */
void
sched_pstats(void *arg)
{
	const int clkhz = (stathz != 0 ? stathz : hz);
	static bool backwards;
	struct rlimit *rlim;

	sched_pstats_ticks++;

	mutex_enter(proc_lock);
	PROCLIST_FOREACH(p, &allproc) {
		if (__predict_false((p->p_flag & PK_MARKER) != 0))
			continue;

		/* Increment sleep time (if sleeping), ignore overflow. */
		mutex_enter(p->p_lock);
		runtm = p->p_rtime.sec;
		LIST_FOREACH(l, &p->p_lwps, l_sibling) {
			if (__predict_false((l->l_flag & LW_IDLE) != 0))
				continue;

			runtm += l->l_rtime.sec;

			l->l_pctcpu = (l->l_pctcpu * ccpu) >> FSHIFT;
			if (l->l_slptime != 0)
				continue;

			lpctcpu = l->l_pctcpu;
			lcpticks = atomic_swap_uint(&l->l_cpticks, 0);
			lpctcpu += ((FSCALE - ccpu) *
			    (lcpticks * FSCALE / clkhz)) >> FSHIFT;
			l->l_pctcpu = lpctcpu;
		}

		/* Calculating p_pctcpu only for ps(1) */
		p->p_pctcpu = (p->p_pctcpu * ccpu) >> FSHIFT;

		/*
		 * Check if the process exceeds its CPU resource allocation.
		 * If over max, kill it.
		 */
		rlim = &p->p_rlimit[RLIMIT_CPU];

		if (__predict_false(runtm >= rlim->rlim_cur)) {
			if (runtm >= rlim->rlim_max)
				sig = SIGKILL;
			else {
				sig = SIGXCPU;
				if (rlim->rlim_cur < rlim->rlim_max)
					rlim->rlim_cur += 5;
			}
		}

		mutex_exit(p->p_lock);
		if (__predict_false(runtm < 0)) {
			if (!backwards) {
				backwards = true;
				printf("WARNING: negative runtime; "
				    "monotonic clock has gone backwards\n");
			}
		} else if (__predict_false(sig)) {
			KASSERT((p->p_flag & PK_SYSTEM) == 0);

		}
	}
	mutex_exit(proc_lock);

	cv_broadcast(&lbolt);
	callout_schedule(&sched_pstats_ch, hz);
}