minix/lib/libsys/cpuavg.c

   1 /*
   2  * Routines to maintain a decaying average of per-process CPU utilization, in a
   3  * way that results in numbers that are (hopefully) similar to those produced
   4  * by NetBSD.  Once a second, NetBSD performs the following basic computation
   5  * for each process:
   6  *
   7  *   avg = ccpu * avg + (1 - ccpu) * (run / hz)
   8  *
   9  * In this formula, 'avg' is the running average, 'hz' is the number of clock
  10  * ticks per second, 'run' is the number of ticks during which the process was
  11  * found running in the last second, and 'ccpu' is a decay value chosen such
  12  * that only 5% of the original average remains after 60 seconds: e**(-1/20).
  13  *
  14  * Here, the idea is that we update the average lazily, namely, only when the
  15  * process is running when the kernel processes a clock tick - no matter how
  16  * long it had not been running before that.  The result is that at any given
  17  * time, the average may be out of date.  For that reason, this code is shared
  18  * between the kernel and the MIB service: the latter occasionally obtains the
  19  * raw kernel process table, for example because a user runs ps(1), and it then
  20  * needs to bring the values up to date.  The kernel could do that itself just
  21  * before copying out the process table, but the MIB service is equally capable
  22  * of doing it post-copy - while also being preemptible during the computation.
  23  * There is more to be said about this, but the summary is that it is not clear
  24  * which of the two options is better in practice.  We simply chose this one.
  25  *
  26  * In addition, we deliberately delay updating the actual average by one
  27  * second, keeping the last second's number of process run ticks in a separate
  28  * variable 'last'.  This allows us to produce an estimate of short-term
  29  * activity of the process as well.  We use this to generate a "CPU estimate"
  30  * value.  BSD generates such a value for the purpose of scheduling, but we
  31  * have no actual use for that, and generating the value just for userland is
  32  * a bit too costly in our case.  Our inaccurate value should suffice for most
  33  * practical purposes though (e.g., comparisons between active processes).
  34  *
  35  * Overall, in terms of overhead, our approach should produce the same values
  36  * as NetBSD while having only the same overhead as NetBSD in the very worst
  37  * case, and much less overhead on average.  Even in the worst case, in our
  38  * case, the computation is spread out across each second, rather than all done
  39  * at once.  In terms of implementation, since this code is running in the
  40  * kernel, we make use of small tables of precomputed values, and we try to
  41  * save on computation as much as possible.  We copy much of the NetBSD
  42  * approach of avoiding divisions using FSCALE.
  43  *
  44  * Another difference with NetBSD is that our kernel does not actually call
  45  * this function from its clock interrupt handler, but rather when a process
  46  * has spent a number of CPU cycles that adds up to one clock tick worth of
  47  * execution time.  The result is better accuracy (no process can escape
  48  * accounting by yielding just before each clock interrupt), but due to the
  49  * inaccuracy of converting CPU cycles to clock ticks, a process may end up
  50  * using more than 'hz' clock ticks per second.  We could correct for this;
  51  * however, it has not yet shown to be a problem.
  52  *
  53  * Zooming out a bit again, the current average is fairly accurate but not
  54  * very precise.  There are two reasons for this.  First, the accounting is in
  55  * clock tick fractions, which means that a per-second CPU usage below 1/hz
  56  * cannot be measured.  Second, the NetBSD FSCALE and ccpu values are such that
  57  * (FSCALE - ccpu) equals 100, which means that a per-second CPU usage below
  58  * 1/100 cannot be measured either.  Both issues can be resolved by switching
  59  * to a CPU cycle based accounting approach, which requires 64-bit arithmetic
  60  * and a MINIX3-specific FSCALE value.  For now, this is just not worth doing.
  61  *
  62  * Finally, it should be noted that in terms of overall operating system
  63  * functionality, the CPU averages feature is entirely optional; as of writing,
  64  * the produced values are only used in the output of utilities such as ps(1).
  65  * If computing the CPU average becomes too burdensome in terms of either
  66  * performance or maintenance, it can simply be removed again.
  67  *
  68  * Original author: David van Moolenbroek <david@minix3.org>
  69  */
  70
  71 #include "sysutil.h"
  72 #include <sys/param.h>
  73
  74 #define CCPUTAB_SHIFT   3                               /* 2**3 == 8 */
  75 #define CCPUTAB_MASK    ((1 << CCPUTAB_SHIFT) - 1)
  76
  77 #define F(n) ((uint32_t)((n) * FSCALE))
  78
  79 /* e**(-1/20*n)*FSCALE for n=1..(2**CCPUTAB_SHIFT-1) */
  80 static const uint32_t ccpu_low[CCPUTAB_MASK] = {
  81         F(0.951229424501), F(0.904837418036), F(0.860707976425),
  82         F(0.818730753078), F(0.778800783071), F(0.740818220682),
  83         F(0.704688089719)
  84 };
  85 #define ccpu            (ccpu_low[0])
  86
  87 /* e**(-1/20*8*n)*FSCALE for n=1.. until the value is zero (for FSCALE=2048) */
  88 static const uint32_t ccpu_high[] = {
  89         F(0.670320046036), F(0.449328964117), F(0.301194211912),
  90         F(0.201896517995), F(0.135335283237), F(0.090717953289),
  91         F(0.060810062625), F(0.040762203978), F(0.027323722447),
  92         F(0.018315638889), F(0.012277339903), F(0.008229747049),
  93         F(0.005516564421), F(0.003697863716), F(0.002478752177),
  94         F(0.001661557273), F(0.001113775148), F(0.000746585808),
  95         F(0.000500451433)
  96 };
  97
  98 /*
  99  * Initialize the per-process CPU average structure.  To be called when the
 100  * process is started, that is, as part of a fork call.
 101  */
 102 void
 103 cpuavg_init(struct cpuavg * ca)
 104 {
 105
 106         ca->ca_base = 0;
 107         ca->ca_run = 0;
 108         ca->ca_last = 0;
 109         ca->ca_avg = 0;
 110 }
 111
 112 /*
 113  * Return a new CPU usage average value, resulting from decaying the old value
 114  * by the given number of seconds, using the formula (avg * ccpu**secs).
 115  * We use two-level lookup tables to limit the computational expense to two
 116  * multiplications while keeping the tables themselves relatively small.
 117  */
 118 static uint32_t
 119 cpuavg_decay(uint32_t avg, uint32_t secs)
 120 {
 121         unsigned int slot;
 122
 123         /*
 124          * The ccpu_high table is set up such that with the default FSCALE, the
 125          * values of any array entries beyond the end would be zero.  That is,
 126          * the average would be decayed to a value that, if represented in
 127          * FSCALE units, would be zero.  Thus, if it has been that long ago
 128          * that we updated the average, we can just reset it to zero.
 129          */
 130         if (secs > (__arraycount(ccpu_high) << CCPUTAB_SHIFT))
 131                 return 0;
 132
 133         if (secs > CCPUTAB_MASK) {
 134                 slot = (secs >> CCPUTAB_SHIFT) - 1;
 135
 136                 avg = (ccpu_high[slot] * avg) >> FSHIFT;        /* decay #3 */
 137
 138                 secs &= CCPUTAB_MASK;
 139         }
 140
 141         if (secs > 0)
 142                 avg = (ccpu_low[secs - 1] * avg) >> FSHIFT;     /* decay #4 */
 143
 144         return avg;
 145 }
 146
 147 /*
 148  * Update the CPU average value, either because the kernel is processing a
 149  * clock tick, or because the MIB service updates obtained averages.  We
 150  * perform the decay in at most four computation steps (shown as "decay #n"),
 151  * and thus, this algorithm is O(1).
 152  */
 153 static void
 154 cpuavg_update(struct cpuavg * ca, clock_t now, clock_t hz)
 155 {
 156         clock_t delta;
 157         uint32_t secs;
 158
 159         delta = now - ca->ca_base;
 160
 161         /*
 162          * If at least a second elapsed since we last updated the average, we
 163          * must do so now.  If not, we need not do anything for now.
 164          */
 165         if (delta < hz)
 166                 return;
 167
 168         /*
 169          * Decay the average by one second, and merge in the run fraction of
 170          * the previous second, as though that second only just ended - even
 171          * though the real time is at least one whole second ahead.  By doing
 172          * so, we roll the statistics time forward by one virtual second.
 173          */
 174         ca->ca_avg = (ccpu * ca->ca_avg) >> FSHIFT;             /* decay #1 */
 175         ca->ca_avg += (FSCALE - ccpu) * (ca->ca_last / hz) >> FSHIFT;
 176
 177         ca->ca_last = ca->ca_run;       /* move 'run' into 'last' */
 178         ca->ca_run = 0;
 179
 180         ca->ca_base += hz;              /* move forward by a second */
 181         delta -= hz;
 182
 183         if (delta < hz)
 184                 return;
 185
 186         /*
 187          * At least a whole second more elapsed since the start of the recorded
 188          * second.  That means that our current 'run' counter (now moved into
 189          * 'last') is also outdated, and we need to merge it in as well, before
 190          * performing the next decay steps.
 191          */
 192         ca->ca_avg = (ccpu * ca->ca_avg) >> FSHIFT;             /* decay #2 */
 193         ca->ca_avg += (FSCALE - ccpu) * (ca->ca_last / hz) >> FSHIFT;
 194
 195         ca->ca_last = 0;                /* 'run' is already zero now */
 196
 197         ca->ca_base += hz;              /* move forward by a second */
 198         delta -= hz;
 199
 200         if (delta < hz)
 201                 return;
 202
 203         /*
 204          * If additional whole seconds elapsed since the start of the last
 205          * second slot, roll forward in time by that many whole seconds, thus
 206          * decaying the value properly while maintaining alignment to whole-
 207          * second slots.  The decay takes up to another two computation steps.
 208          */
 209         secs = delta / hz;
 210
 211         ca->ca_avg = cpuavg_decay(ca->ca_avg, secs);
 212
 213         ca->ca_base += secs * hz;       /* move forward by whole seconds */
 214 }
 215
 216 /*
 217  * The clock ticked, and this last clock tick is accounted to the process for
 218  * which the CPU average statistics are stored in 'ca'.  Update the statistics
 219  * accordingly, decaying the average as necessary.  The current system uptime
 220  * must be given as 'now', and the number of clock ticks per second must be
 221  * given as 'hz'.
 222  */
 223 void
 224 cpuavg_increment(struct cpuavg * ca, clock_t now, clock_t hz)
 225 {
 226
 227         if (ca->ca_base == 0)
 228                 ca->ca_base = now;
 229         else
 230                 cpuavg_update(ca, now, hz);
 231
 232         /*
 233          * Register that the process was running at this clock tick.  We could
 234          * avoid one division above by precomputing (FSCALE/hz), but this is
 235          * typically not a clean division and would therefore result in (more)
 236          * loss of accuracy.
 237          */
 238         ca->ca_run += FSCALE;
 239 }
 240
 241 /*
 242  * Retrieve the decaying CPU utilization average (as return value), the number
 243  * of CPU run ticks in the current second so far (stored in 'cpticks'), and an
 244  * opaque CPU utilization estimate (stored in 'estcpu').  The caller must
 245  * provide the CPU average structure ('ca_orig'), which will not be modified,
 246  * as well as the current uptime in clock ticks ('now') and the number of clock
 247  * ticks per second ('hz').
 248  */
 249 uint32_t
 250 cpuavg_getstats(const struct cpuavg * ca_orig, uint32_t * cpticks,
 251         uint32_t * estcpu, clock_t now, clock_t hz)
 252 {
 253         struct cpuavg ca;
 254
 255         ca = *ca_orig;
 256
 257         /* Update the average as necessary. */
 258         cpuavg_update(&ca, now, hz);
 259
 260         /* Merge the last second into the average. */
 261         ca.ca_avg = (ccpu * ca.ca_avg) >> FSHIFT;
 262         ca.ca_avg += (FSCALE - ccpu) * (ca.ca_last / hz) >> FSHIFT;
 263
 264         *cpticks = ca.ca_run >> FSHIFT;
 265
 266         /*
 267          * NetBSD's estcpu value determines a scheduling queue, and decays to
 268          * 10% in 5*(the current load average) seconds.  Our 'estcpu' simply
 269          * reports the process's percentage of CPU usage in the last second,
 270          * thus yielding a value in the range 0..100 with a decay of 100% after
 271          * one second.  This should be good enough for most practical purposes.
 272          */
 273         *estcpu = (ca.ca_last / hz * 100) >> FSHIFT;
 274
 275         return ca.ca_avg;
 276 }
 277
 278 /*
 279  * Return the ccpu decay value, in FSCALE units.
 280  */
 281 uint32_t
 282 cpuavg_getccpu(void)
 283 {
 284
 285         return ccpu;
 286 }