1 /* $NetBSD: sched_m2.c,v 1.28 2009/07/06 12:37:17 joerg Exp $ */
4 * Copyright (c) 2007, 2008 Mindaugas Rasiukevicius <rmind at NetBSD org>
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31 * - Implementation of fair share queue;
35 #include <sys/cdefs.h>
36 __KERNEL_RCSID(0, "$NetBSD: sched_m2.c,v 1.28 2009/07/06 12:37:17 joerg Exp $");
38 #include <sys/param.h>
41 #include <sys/callout.h>
42 #include <sys/errno.h>
43 #include <sys/kernel.h>
46 #include <sys/mutex.h>
50 #include <sys/resource.h>
51 #include <sys/resourcevar.h>
52 #include <sys/sched.h>
53 #include <sys/syscallargs.h>
54 #include <sys/sysctl.h>
55 #include <sys/types.h>
58 * Priority related defintions.
60 #define PRI_TS_COUNT (NPRI_USER)
61 #define PRI_RT_COUNT (PRI_COUNT - PRI_TS_COUNT)
62 #define PRI_HTS_RANGE (PRI_TS_COUNT / 10)
64 #define PRI_HIGHEST_TS (MAXPRI_USER)
67 * Time-slices and priorities.
69 static u_int min_ts
; /* Minimal time-slice */
70 static u_int max_ts
; /* Maximal time-slice */
71 static u_int rt_ts
; /* Real-time time-slice */
72 static u_int ts_map
[PRI_COUNT
]; /* Map of time-slices */
73 static pri_t high_pri
[PRI_COUNT
]; /* Map for priority increase */
75 static void sched_precalcts(void);
78 * Initialization and setup.
84 struct cpu_info
*ci
= curcpu();
87 panic("sched_rqinit: value of HZ is too low\n");
90 /* Default timing ranges */
91 min_ts
= mstohz(20); /* ~20 ms */
92 max_ts
= mstohz(150); /* ~150 ms */
93 rt_ts
= mstohz(100); /* ~100 ms */
96 /* Attach the primary CPU here */
99 sched_lwp_fork(NULL
, &lwp0
);
103 /* Pre-calculate the time-slices for the priorities */
105 sched_precalcts(void)
109 /* Time-sharing range */
110 for (p
= 0; p
<= PRI_HIGHEST_TS
; p
++) {
112 (p
* 100 / (PRI_TS_COUNT
- 1) * (max_ts
- min_ts
) / 100);
113 high_pri
[p
] = (PRI_HIGHEST_TS
- PRI_HTS_RANGE
) +
114 ((p
* PRI_HTS_RANGE
) / (PRI_TS_COUNT
- 1));
117 /* Real-time range */
118 for (p
= (PRI_HIGHEST_TS
+ 1); p
< PRI_COUNT
; p
++) {
129 sched_proc_fork(struct proc
*parent
, struct proc
*child
)
133 LIST_FOREACH(l
, &child
->p_lwps
, l_sibling
) {
/* sched_proc_exit: no per-process exit work for the M2 scheduler. */
void
sched_proc_exit(struct proc *child, struct proc *parent)
{

}
/* sched_lwp_fork: no per-LWP fork work for the M2 scheduler. */
void
sched_lwp_fork(struct lwp *l1, struct lwp *l2)
{

}
/* sched_lwp_collect: no per-LWP collection work for the M2 scheduler. */
void
sched_lwp_collect(struct lwp *l)
{

}
/* sched_setrunnable: nothing to do when an LWP becomes runnable. */
void
sched_setrunnable(struct lwp *l)
{

}
/* sched_schedclock: no schedclock processing for the M2 scheduler. */
void
sched_schedclock(struct lwp *l)
{

}
171 * Priorities and time-slice.
175 sched_nice(struct proc
*p
, int prio
)
180 KASSERT(mutex_owned(p
->p_lock
));
183 n
= (prio
- NZERO
) >> 2;
187 LIST_FOREACH(l
, &p
->p_lwps
, l_sibling
) {
189 if (l
->l_class
== SCHED_OTHER
) {
190 pri_t pri
= l
->l_priority
- n
;
191 pri
= (n
< 0) ? min(pri
, PRI_HIGHEST_TS
) : imax(pri
, 0);
192 lwp_changepri(l
, pri
);
198 /* Recalculate the time-slice */
200 sched_newts(struct lwp
*l
)
203 l
->l_sched
.timeslice
= ts_map
[lwp_eprio(l
)];
207 sched_slept(struct lwp
*l
)
211 * If thread is in time-sharing queue and batch flag is not marked,
212 * increase the priority, and run with the lower time-quantum.
214 if (l
->l_priority
< PRI_HIGHEST_TS
&& (l
->l_flag
& LW_BATCH
) == 0) {
215 struct proc
*p
= l
->l_proc
;
217 KASSERT(l
->l_class
== SCHED_OTHER
);
218 if (__predict_false(p
->p_nice
< NZERO
)) {
219 const int n
= max((NZERO
- p
->p_nice
) >> 2, 1);
220 l
->l_priority
= min(l
->l_priority
+ n
, PRI_HIGHEST_TS
);
228 sched_wakeup(struct lwp
*l
)
231 /* If thread was sleeping a second or more - set a high priority */
232 if (l
->l_slptime
>= 1)
233 l
->l_priority
= high_pri
[l
->l_priority
];
237 sched_pstats_hook(struct lwp
*l
, int batch
)
242 * Estimate threads on time-sharing queue only, however,
243 * exclude the highest priority for performance purposes.
245 KASSERT(lwp_locked(l
, NULL
));
246 if (l
->l_priority
>= PRI_HIGHEST_TS
)
248 KASSERT(l
->l_class
== SCHED_OTHER
);
250 /* If it is CPU-bound not a first time - decrease the priority */
251 prio
= l
->l_priority
;
252 if (batch
&& prio
!= 0)
255 /* If thread was not ran a second or more - set a high priority */
256 if (l
->l_stat
== LSRUN
) {
257 if (l
->l_rticks
&& (hardclock_ticks
- l
->l_rticks
>= hz
))
258 prio
= high_pri
[prio
];
259 /* Re-enqueue the thread if priority has changed */
260 if (prio
!= l
->l_priority
)
261 lwp_changepri(l
, prio
);
263 /* In other states, change the priority directly */
264 l
->l_priority
= prio
;
269 sched_oncpu(lwp_t
*l
)
271 struct schedstate_percpu
*spc
= &l
->l_cpu
->ci_schedstate
;
273 /* Update the counters */
274 KASSERT(l
->l_sched
.timeslice
>= min_ts
);
275 KASSERT(l
->l_sched
.timeslice
<= max_ts
);
276 spc
->spc_ticks
= l
->l_sched
.timeslice
;
280 * Time-driven events.
284 * Called once per time-quantum. This routine is CPU-local and runs at
285 * IPL_SCHED, thus the locking is not needed.
288 sched_tick(struct cpu_info
*ci
)
290 struct schedstate_percpu
*spc
= &ci
->ci_schedstate
;
291 struct lwp
*l
= curlwp
;
294 if (__predict_false(CURCPU_IDLE_P()))
297 switch (l
->l_class
) {
300 * Update the time-quantum, and continue running,
301 * if thread runs on FIFO real-time policy.
303 KASSERT(l
->l_priority
> PRI_HIGHEST_TS
);
304 spc
->spc_ticks
= l
->l_sched
.timeslice
;
308 * If thread is in time-sharing queue, decrease the priority,
309 * and run with a higher time-quantum.
311 KASSERT(l
->l_priority
<= PRI_HIGHEST_TS
);
312 if (l
->l_priority
== 0)
316 if (__predict_false(p
->p_nice
> NZERO
)) {
317 const int n
= max((p
->p_nice
- NZERO
) >> 2, 1);
318 l
->l_priority
= imax(l
->l_priority
- n
, 0);
325 * If there are higher priority threads or threads in the same queue,
326 * mark that thread should yield, otherwise, continue running.
328 if (lwp_eprio(l
) <= spc
->spc_maxpriority
|| l
->l_target_cpu
) {
329 spc
->spc_flags
|= SPCF_SHOULDYIELD
;
330 cpu_need_resched(ci
, 0);
332 spc
->spc_ticks
= l
->l_sched
.timeslice
;
336 * Sysctl nodes and initialization.
340 sysctl_sched_rtts(SYSCTLFN_ARGS
)
342 struct sysctlnode node
;
343 int rttsms
= hztoms(rt_ts
);
346 node
.sysctl_data
= &rttsms
;
347 return sysctl_lookup(SYSCTLFN_CALL(&node
));
351 sysctl_sched_mints(SYSCTLFN_ARGS
)
353 struct sysctlnode node
;
356 CPU_INFO_ITERATOR cii
;
359 node
.sysctl_data
= &newsize
;
361 newsize
= hztoms(min_ts
);
362 error
= sysctl_lookup(SYSCTLFN_CALL(&node
));
363 if (error
|| newp
== NULL
)
366 newsize
= mstohz(newsize
);
367 if (newsize
< 1 || newsize
> hz
|| newsize
>= max_ts
)
370 /* It is safe to do this in such order */
371 for (CPU_INFO_FOREACH(cii
, ci
))
377 for (CPU_INFO_FOREACH(cii
, ci
))
384 sysctl_sched_maxts(SYSCTLFN_ARGS
)
386 struct sysctlnode node
;
389 CPU_INFO_ITERATOR cii
;
392 node
.sysctl_data
= &newsize
;
394 newsize
= hztoms(max_ts
);
395 error
= sysctl_lookup(SYSCTLFN_CALL(&node
));
396 if (error
|| newp
== NULL
)
399 newsize
= mstohz(newsize
);
400 if (newsize
< 10 || newsize
> hz
|| newsize
<= min_ts
)
403 /* It is safe to do this in such order */
404 for (CPU_INFO_FOREACH(cii
, ci
))
410 for (CPU_INFO_FOREACH(cii
, ci
))
416 SYSCTL_SETUP(sysctl_sched_m2_setup
, "sysctl sched setup")
418 const struct sysctlnode
*node
= NULL
;
420 sysctl_createv(clog
, 0, NULL
, NULL
,
422 CTLTYPE_NODE
, "kern", NULL
,
425 sysctl_createv(clog
, 0, NULL
, &node
,
427 CTLTYPE_NODE
, "sched",
428 SYSCTL_DESCR("Scheduler options"),
430 CTL_KERN
, CTL_CREATE
, CTL_EOL
);
435 sysctl_createv(NULL
, 0, &node
, NULL
,
437 CTLTYPE_STRING
, "name", NULL
,
438 NULL
, 0, __UNCONST("M2"), 0,
439 CTL_CREATE
, CTL_EOL
);
440 sysctl_createv(NULL
, 0, &node
, NULL
,
443 SYSCTL_DESCR("Round-robin time quantum (in miliseconds)"),
444 sysctl_sched_rtts
, 0, NULL
, 0,
445 CTL_CREATE
, CTL_EOL
);
446 sysctl_createv(NULL
, 0, &node
, NULL
,
447 CTLFLAG_PERMANENT
| CTLFLAG_READWRITE
,
448 CTLTYPE_INT
, "maxts",
449 SYSCTL_DESCR("Maximal time quantum (in miliseconds)"),
450 sysctl_sched_maxts
, 0, &max_ts
, 0,
451 CTL_CREATE
, CTL_EOL
);
452 sysctl_createv(NULL
, 0, &node
, NULL
,
453 CTLFLAG_PERMANENT
| CTLFLAG_READWRITE
,
454 CTLTYPE_INT
, "mints",
455 SYSCTL_DESCR("Minimal time quantum (in miliseconds)"),
456 sysctl_sched_mints
, 0, &min_ts
, 0,
457 CTL_CREATE
, CTL_EOL
);