sys/kern/kern_clock.c

   1 /*      $NetBSD: kern_clock.c,v 1.125 2008/07/02 19:38:37 rmind Exp $   */
   2
   3 /*-
   4  * Copyright (c) 2000, 2004, 2006, 2007, 2008 The NetBSD Foundation, Inc.
   5  * All rights reserved.
   6  *
   7  * This code is derived from software contributed to The NetBSD Foundation
   8  * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
   9  * NASA Ames Research Center.
  10  * This code is derived from software contributed to The NetBSD Foundation
  11  * by Charles M. Hannum.
  12  *
  13  * Redistribution and use in source and binary forms, with or without
  14  * modification, are permitted provided that the following conditions
  15  * are met:
  16  * 1. Redistributions of source code must retain the above copyright
  17  *    notice, this list of conditions and the following disclaimer.
  18  * 2. Redistributions in binary form must reproduce the above copyright
  19  *    notice, this list of conditions and the following disclaimer in the
  20  *    documentation and/or other materials provided with the distribution.
  21  *
  22  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
  23  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
  24  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  25  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
  26  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  27  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  28  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  29  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  30  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  31  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  32  * POSSIBILITY OF SUCH DAMAGE.
  33  */
  34
  35 /*-
  36  * Copyright (c) 1982, 1986, 1991, 1993
  37  *      The Regents of the University of California.  All rights reserved.
  38  * (c) UNIX System Laboratories, Inc.
  39  * All or some portions of this file are derived from material licensed
  40  * to the University of California by American Telephone and Telegraph
  41  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  42  * the permission of UNIX System Laboratories, Inc.
  43  *
  44  * Redistribution and use in source and binary forms, with or without
  45  * modification, are permitted provided that the following conditions
  46  * are met:
  47  * 1. Redistributions of source code must retain the above copyright
  48  *    notice, this list of conditions and the following disclaimer.
  49  * 2. Redistributions in binary form must reproduce the above copyright
  50  *    notice, this list of conditions and the following disclaimer in the
  51  *    documentation and/or other materials provided with the distribution.
  52  * 3. Neither the name of the University nor the names of its contributors
  53  *    may be used to endorse or promote products derived from this software
  54  *    without specific prior written permission.
  55  *
  56  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  57  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  58  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  59  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  60  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  61  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  62  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  63  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  64  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  65  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  66  * SUCH DAMAGE.
  67  *
  68  *      @(#)kern_clock.c        8.5 (Berkeley) 1/21/94
  69  */
  70
  71 #include <sys/cdefs.h>
  72 __KERNEL_RCSID(0, "$NetBSD: kern_clock.c,v 1.125 2008/07/02 19:38:37 rmind Exp $");
  73
  74 #include "opt_ntp.h"
  75 #include "opt_perfctrs.h"
  76
  77 #include <sys/param.h>
  78 #include <sys/systm.h>
  79 #include <sys/callout.h>
  80 #include <sys/kernel.h>
  81 #include <sys/proc.h>
  82 #include <sys/resourcevar.h>
  83 #include <sys/signalvar.h>
  84 #include <sys/sysctl.h>
  85 #include <sys/timex.h>
  86 #include <sys/sched.h>
  87 #include <sys/time.h>
  88 #include <sys/timetc.h>
  89 #include <sys/cpu.h>
  90 #include <sys/atomic.h>
  91
  92 #include <uvm/uvm_extern.h>
  93
  94 #ifdef GPROF
  95 #include <sys/gmon.h>
  96 #endif
  97
  98 /*
  99  * Clock handling routines.
 100  *
 101  * This code is written to operate with two timers that run independently of
 102  * each other.  The main clock, running hz times per second, is used to keep
 103  * track of real time.  The second timer handles kernel and user profiling,
 104  * and does resource use estimation.  If the second timer is programmable,
 105  * it is randomized to avoid aliasing between the two clocks.  For example,
 106  * the randomization prevents an adversary from always giving up the CPU
 107  * just before its quantum expires.  Otherwise, it would never accumulate
 108  * CPU ticks.  The mean frequency of the second timer is stathz.
 109  *
 110  * If no second timer exists, stathz will be zero; in this case we drive
 111  * profiling and statistics off the main clock.  This WILL NOT be accurate;
 112  * do not do it unless absolutely necessary.
 113  *
 114  * The statistics clock may (or may not) be run at a higher rate while
 115  * profiling.  This profile clock runs at profhz.  We require that profhz
 116  * be an integral multiple of stathz.
 117  *
 118  * If the statistics clock is running fast, it must be divided by the ratio
 119  * profhz/stathz for statistics.  (For profiling, every tick counts.)
 120  */
 121
/*
 * Statistics clock frequency.  Zero means there is no separate statistics
 * clock, in which case hardclock() calls statclock() itself.
 */
int     stathz;
/*
 * Profiling clock frequency.  initclocks() defaults it to stathz (or hz
 * when stathz is zero) if the machine-dependent code left it at zero.
 */
int     profhz;
/* Profiling source; statclock() only samples when it is PROFSRC_CLOCK. */
int     profsrc;
/*
 * Scheduler clock frequency.  Zero means there is no separate scheduler
 * clock, in which case hardclock() calls schedclock() every hardscheddiv
 * ticks.
 */
int     schedhz;
/* Number of processes that currently have PST_PROFIL set. */
int     profprocs;
/* Incremented once per hardclock() call on the primary CPU. */
int     hardclock_ticks;
static int hardscheddiv; /* hard => sched divider (used if schedhz == 0) */
/*
 * Set to 1 normally, and to psratio while at least one process is being
 * profiled and a separate statistics clock exists.
 */
static int psdiv;                       /* prof => stat divider */
int     psratio;                        /* ratio: prof / stat */

static u_int get_intr_timecount(struct timecounter *);

/*
 * Minimal default timecounter, registered by initclocks().  It reads
 * hardclock_ticks, so it only advances at clock-interrupt resolution.
 */
static struct timecounter intr_timecounter = {
        get_intr_timecount,     /* get_timecount */
        0,                      /* no poll_pps */
        ~0u,                    /* counter_mask */
        0,                      /* frequency - set to hz by initclocks() */
        "clockinterrupt",       /* name */
        0,                      /* quality - minimum implementation level for a clock */
        NULL,                   /* prev */
        NULL,                   /* next */
};
 144
 145 static u_int
 146 get_intr_timecount(struct timecounter *tc)
 147 {
 148
 149         return (u_int)hardclock_ticks;
 150 }
 151
 152 /*
 153  * Initialize clock frequencies and start both clocks running.
 154  */
 155 void
 156 initclocks(void)
 157 {
 158         int i;
 159
 160         /*
 161          * Set divisors to 1 (normal case) and let the machine-specific
 162          * code do its bit.
 163          */
 164         psdiv = 1;
 165         /*
 166          * provide minimum default time counter
 167          * will only run at interrupt resolution
 168          */
 169         intr_timecounter.tc_frequency = hz;
 170         tc_init(&intr_timecounter);
 171         cpu_initclocks();
 172
 173         /*
 174          * Compute profhz and stathz, fix profhz if needed.
 175          */
 176         i = stathz ? stathz : hz;
 177         if (profhz == 0)
 178                 profhz = i;
 179         psratio = profhz / i;
 180         if (schedhz == 0) {
 181                 /* 16Hz is best */
 182                 hardscheddiv = hz / 16;
 183                 if (hardscheddiv <= 0)
 184                         panic("hardscheddiv");
 185         }
 186
 187 }
 188
 189 /*
 190  * The real-time timer, interrupting hz times per second.
 191  */
 192 void
 193 hardclock(struct clockframe *frame)
 194 {
 195         struct lwp *l;
 196         struct cpu_info *ci;
 197
 198         ci = curcpu();
 199         l = ci->ci_data.cpu_onproc;
 200
 201         timer_tick(l, CLKF_USERMODE(frame));
 202
 203         /*
 204          * If no separate statistics clock is available, run it from here.
 205          */
 206         if (stathz == 0)
 207                 statclock(frame);
 208         /*
 209          * If no separate schedclock is provided, call it here
 210          * at about 16 Hz.
 211          */
 212         if (schedhz == 0) {
 213                 if ((int)(--ci->ci_schedstate.spc_schedticks) <= 0) {
 214                         schedclock(l);
 215                         ci->ci_schedstate.spc_schedticks = hardscheddiv;
 216                 }
 217         }
 218         if ((--ci->ci_schedstate.spc_ticks) <= 0)
 219                 sched_tick(ci);
 220
 221         if (CPU_IS_PRIMARY(ci)) {
 222                 hardclock_ticks++;
 223                 tc_ticktock();
 224         }
 225
 226         /*
 227          * Update real-time timeout queue.
 228          */
 229         callout_hardclock();
 230 }
 231
 232 /*
 233  * Start profiling on a process.
 234  *
 235  * Kernel profiling passes proc0 which never exits and hence
 236  * keeps the profile clock running constantly.
 237  */
 238 void
 239 startprofclock(struct proc *p)
 240 {
 241
 242         KASSERT(mutex_owned(&p->p_stmutex));
 243
 244         if ((p->p_stflag & PST_PROFIL) == 0) {
 245                 p->p_stflag |= PST_PROFIL;
 246                 /*
 247                  * This is only necessary if using the clock as the
 248                  * profiling source.
 249                  */
 250                 if (++profprocs == 1 && stathz != 0)
 251                         psdiv = psratio;
 252         }
 253 }
 254
 255 /*
 256  * Stop profiling on a process.
 257  */
 258 void
 259 stopprofclock(struct proc *p)
 260 {
 261
 262         KASSERT(mutex_owned(&p->p_stmutex));
 263
 264         if (p->p_stflag & PST_PROFIL) {
 265                 p->p_stflag &= ~PST_PROFIL;
 266                 /*
 267                  * This is only necessary if using the clock as the
 268                  * profiling source.
 269                  */
 270                 if (--profprocs == 0 && stathz != 0)
 271                         psdiv = 1;
 272         }
 273 }
 274
 275 #if defined(PERFCTRS)
 276 /*
 277  * Independent profiling "tick" in case we're using a separate
 278  * clock or profiling event source.  Currently, that's just
 279  * performance counters--hence the wrapper.
 280  */
 281 void
 282 proftick(struct clockframe *frame)
 283 {
 284 #ifdef GPROF
 285         struct gmonparam *g;
 286         intptr_t i;
 287 #endif
 288         struct lwp *l;
 289         struct proc *p;
 290
 291         l = curcpu()->ci_data.cpu_onproc;
 292         p = (l ? l->l_proc : NULL);
 293         if (CLKF_USERMODE(frame)) {
 294                 mutex_spin_enter(&p->p_stmutex);
 295                 if (p->p_stflag & PST_PROFIL)
 296                         addupc_intr(l, CLKF_PC(frame));
 297                 mutex_spin_exit(&p->p_stmutex);
 298         } else {
 299 #ifdef GPROF
 300                 g = &_gmonparam;
 301                 if (g->state == GMON_PROF_ON) {
 302                         i = CLKF_PC(frame) - g->lowpc;
 303                         if (i < g->textsize) {
 304                                 i /= HISTFRACTION * sizeof(*g->kcount);
 305                                 g->kcount[i]++;
 306                         }
 307                 }
 308 #endif
 309 #ifdef LWP_PC
 310                 if (p != NULL && (p->p_stflag & PST_PROFIL) != 0)
 311                         addupc_intr(l, LWP_PC(l));
 312 #endif
 313         }
 314 }
 315 #endif
 316
 317 void
 318 schedclock(struct lwp *l)
 319 {
 320         struct cpu_info *ci;
 321
 322         ci = l->l_cpu;
 323
 324         /* Accumulate syscall and context switch counts. */
 325         atomic_add_int((unsigned *)&uvmexp.swtch, ci->ci_data.cpu_nswtch);
 326         ci->ci_data.cpu_nswtch = 0;
 327         atomic_add_int((unsigned *)&uvmexp.syscalls, ci->ci_data.cpu_nsyscall);
 328         ci->ci_data.cpu_nsyscall = 0;
 329         atomic_add_int((unsigned *)&uvmexp.traps, ci->ci_data.cpu_ntrap);
 330         ci->ci_data.cpu_ntrap = 0;
 331
 332         if ((l->l_flag & LW_IDLE) != 0)
 333                 return;
 334
 335         sched_schedclock(l);
 336 }
 337
 338 /*
 339  * Statistics clock.  Grab profile sample, and if divider reaches 0,
 340  * do process and kernel statistics.
 341  */
 342 void
 343 statclock(struct clockframe *frame)
 344 {
 345 #ifdef GPROF
 346         struct gmonparam *g;
 347         intptr_t i;
 348 #endif
 349         struct cpu_info *ci = curcpu();
 350         struct schedstate_percpu *spc = &ci->ci_schedstate;
 351         struct proc *p;
 352         struct lwp *l;
 353
 354         /*
 355          * Notice changes in divisor frequency, and adjust clock
 356          * frequency accordingly.
 357          */
 358         if (spc->spc_psdiv != psdiv) {
 359                 spc->spc_psdiv = psdiv;
 360                 spc->spc_pscnt = psdiv;
 361                 if (psdiv == 1) {
 362                         setstatclockrate(stathz);
 363                 } else {
 364                         setstatclockrate(profhz);
 365                 }
 366         }
 367         l = ci->ci_data.cpu_onproc;
 368         if ((l->l_flag & LW_IDLE) != 0) {
 369                 /*
 370                  * don't account idle lwps as swapper.
 371                  */
 372                 p = NULL;
 373         } else {
 374                 p = l->l_proc;
 375                 mutex_spin_enter(&p->p_stmutex);
 376         }
 377
 378         if (CLKF_USERMODE(frame)) {
 379                 if ((p->p_stflag & PST_PROFIL) && profsrc == PROFSRC_CLOCK)
 380                         addupc_intr(l, CLKF_PC(frame));
 381                 if (--spc->spc_pscnt > 0) {
 382                         mutex_spin_exit(&p->p_stmutex);
 383                         return;
 384                 }
 385
 386                 /*
 387                  * Came from user mode; CPU was in user state.
 388                  * If this process is being profiled record the tick.
 389                  */
 390                 p->p_uticks++;
 391                 if (p->p_nice > NZERO)
 392                         spc->spc_cp_time[CP_NICE]++;
 393                 else
 394                         spc->spc_cp_time[CP_USER]++;
 395         } else {
 396 #ifdef GPROF
 397                 /*
 398                  * Kernel statistics are just like addupc_intr, only easier.
 399                  */
 400                 g = &_gmonparam;
 401                 if (profsrc == PROFSRC_CLOCK && g->state == GMON_PROF_ON) {
 402                         i = CLKF_PC(frame) - g->lowpc;
 403                         if (i < g->textsize) {
 404                                 i /= HISTFRACTION * sizeof(*g->kcount);
 405                                 g->kcount[i]++;
 406                         }
 407                 }
 408 #endif
 409 #ifdef LWP_PC
 410                 if (p != NULL && profsrc == PROFSRC_CLOCK &&
 411                     (p->p_stflag & PST_PROFIL)) {
 412                         addupc_intr(l, LWP_PC(l));
 413                 }
 414 #endif
 415                 if (--spc->spc_pscnt > 0) {
 416                         if (p != NULL)
 417                                 mutex_spin_exit(&p->p_stmutex);
 418                         return;
 419                 }
 420                 /*
 421                  * Came from kernel mode, so we were:
 422                  * - handling an interrupt,
 423                  * - doing syscall or trap work on behalf of the current
 424                  *   user process, or
 425                  * - spinning in the idle loop.
 426                  * Whichever it is, charge the time as appropriate.
 427                  * Note that we charge interrupts to the current process,
 428                  * regardless of whether they are ``for'' that process,
 429                  * so that we know how much of its real time was spent
 430                  * in ``non-process'' (i.e., interrupt) work.
 431                  */
 432                 if (CLKF_INTR(frame) || (curlwp->l_pflag & LP_INTR) != 0) {
 433                         if (p != NULL) {
 434                                 p->p_iticks++;
 435                         }
 436                         spc->spc_cp_time[CP_INTR]++;
 437                 } else if (p != NULL) {
 438                         p->p_sticks++;
 439                         spc->spc_cp_time[CP_SYS]++;
 440                 } else {
 441                         spc->spc_cp_time[CP_IDLE]++;
 442                 }
 443         }
 444         spc->spc_pscnt = psdiv;
 445
 446         if (p != NULL) {
 447                 atomic_inc_uint(&l->l_cpticks);
 448                 mutex_spin_exit(&p->p_stmutex);
 449         }
 450 }