1 /* $NetBSD: sched_m2.c,v 1.28 2009/07/06 12:37:17 joerg Exp $ */
4 * Copyright (c) 2007, 2008 Mindaugas Rasiukevicius <rmind at NetBSD org>
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31 * - Implementation of fair share queue;
35 #include <sys/cdefs.h>
36 __KERNEL_RCSID(0, "$NetBSD: sched_m2.c,v 1.28 2009/07/06 12:37:17 joerg Exp $");
38 #include <sys/param.h>
41 #include <sys/callout.h>
42 #include <sys/errno.h>
43 #include <sys/kernel.h>
46 #include <sys/mutex.h>
50 #include <sys/resource.h>
51 #include <sys/resourcevar.h>
52 #include <sys/sched.h>
53 #include <sys/syscallargs.h>
54 #include <sys/sysctl.h>
55 #include <sys/types.h>
58 * Priority related defintions.
60 #define PRI_TS_COUNT (NPRI_USER)
61 #define PRI_RT_COUNT (PRI_COUNT - PRI_TS_COUNT)
62 #define PRI_HTS_RANGE (PRI_TS_COUNT / 10)
64 #define PRI_HIGHEST_TS (MAXPRI_USER)
67 * Time-slices and priorities.
69 static u_int min_ts
; /* Minimal time-slice */
70 static u_int max_ts
; /* Maximal time-slice */
71 static u_int rt_ts
; /* Real-time time-slice */
72 static u_int ts_map
[PRI_COUNT
]; /* Map of time-slices */
73 static pri_t high_pri
[PRI_COUNT
]; /* Map for priority increase */
75 static void sched_precalcts(void);
78 * Initialization and setup.
84 struct cpu_info
*ci
= curcpu();
87 panic("sched_rqinit: value of HZ is too low\n");
90 /* Default timing ranges */
91 min_ts
= mstohz(20); /* ~20 ms */
92 max_ts
= mstohz(150); /* ~150 ms */
93 rt_ts
= mstohz(100); /* ~100 ms */
96 /* Attach the primary CPU here */
99 sched_lwp_fork(NULL
, &lwp0
);
103 /* Pre-calculate the time-slices for the priorities */
105 sched_precalcts(void)
109 /* Time-sharing range */
110 for (p
= 0; p
<= PRI_HIGHEST_TS
; p
++) {
112 (p
* 100 / (PRI_TS_COUNT
- 1) * (max_ts
- min_ts
) / 100);
113 high_pri
[p
] = (PRI_HIGHEST_TS
- PRI_HTS_RANGE
) +
114 ((p
* PRI_HTS_RANGE
) / (PRI_TS_COUNT
- 1));
117 /* Real-time range */
118 for (p
= (PRI_HIGHEST_TS
+ 1); p
< PRI_COUNT
; p
++) {
129 sched_proc_fork(struct proc
*parent
, struct proc
*child
)
133 LIST_FOREACH(l
, &child
->p_lwps
, l_sibling
) {
/* sched_proc_exit: no per-process exit work for the M2 scheduler. */
void
sched_proc_exit(struct proc *child, struct proc *parent)
{

}
/* sched_lwp_fork: no per-LWP fork work for the M2 scheduler. */
void
sched_lwp_fork(struct lwp *l1, struct lwp *l2)
{

}
/* sched_lwp_collect: no per-LWP collection work for the M2 scheduler. */
void
sched_lwp_collect(struct lwp *l)
{

}
/* sched_setrunnable: nothing to do when an LWP becomes runnable. */
void
sched_setrunnable(struct lwp *l)
{

}
/* sched_schedclock: no schedclock processing for the M2 scheduler. */
void
sched_schedclock(struct lwp *l)
{

}
171 * Priorities and time-slice.
175 sched_nice(struct proc
*p
, int prio
)
180 KASSERT(mutex_owned(p
->p_lock
));
183 n
= (prio
- NZERO
) >> 2;
187 LIST_FOREACH(l
, &p
->p_lwps
, l_sibling
) {
189 if (l
->l_class
== SCHED_OTHER
) {
190 pri_t pri
= l
->l_priority
- n
;
191 pri
= (n
< 0) ? min(pri
, PRI_HIGHEST_TS
) : imax(pri
, 0);
192 lwp_changepri(l
, pri
);
198 /* Recalculate the time-slice */
200 sched_newts(struct lwp
*l
)
203 l
->l_sched
.timeslice
= ts_map
[lwp_eprio(l
)];
207 sched_slept(struct lwp
*l
)
211 * If thread is in time-sharing queue and batch flag is not marked,
212 * increase the priority, and run with the lower time-quantum.
214 if (l
->l_priority
< PRI_HIGHEST_TS
&& (l
->l_flag
& LW_BATCH
) == 0) {
215 struct proc
*p
= l
->l_proc
;
217 KASSERT(l
->l_class
== SCHED_OTHER
);
218 if (__predict_false(p
->p_nice
< NZERO
)) {
219 const int n
= max((NZERO
- p
->p_nice
) >> 2, 1);
220 l
->l_priority
= min(l
->l_priority
+ n
, PRI_HIGHEST_TS
);
228 sched_wakeup(struct lwp
*l
)
231 /* If thread was sleeping a second or more - set a high priority */
232 if (l
->l_slptime
>= 1)
233 l
->l_priority
= high_pri
[l
->l_priority
];
237 sched_pstats_hook(struct lwp
*l
, int batch
)
242 * Estimate threads on time-sharing queue only, however,
243 * exclude the highest priority for performance purposes.
245 KASSERT(lwp_locked(l
, NULL
));
246 if (l
->l_priority
>= PRI_HIGHEST_TS
)
248 KASSERT(l
->l_class
== SCHED_OTHER
);
250 /* If it is CPU-bound not a first time - decrease the priority */
251 prio
= l
->l_priority
;
252 if (batch
&& prio
!= 0)
255 /* If thread was not ran a second or more - set a high priority */
256 if (l
->l_stat
== LSRUN
) {
257 if (l
->l_rticks
&& (hardclock_ticks
- l
->l_rticks
>= hz
))
258 prio
= high_pri
[prio
];
259 /* Re-enqueue the thread if priority has changed */
260 if (prio
!= l
->l_priority
)
261 lwp_changepri(l
, prio
);
263 /* In other states, change the priority directly */
264 l
->l_priority
= prio
;
269 sched_oncpu(lwp_t
*l
)
271 struct schedstate_percpu
*spc
= &l
->l_cpu
->ci_schedstate
;
273 /* Update the counters */
274 KASSERT(l
->l_sched
.timeslice
>= min_ts
);
275 KASSERT(l
->l_sched
.timeslice
<= max_ts
);
276 spc
->spc_ticks
= l
->l_sched
.timeslice
;
280 * Time-driven events.
284 * Called once per time-quantum. This routine is CPU-local and runs at
285 * IPL_SCHED, thus the locking is not needed.
288 sched_tick(struct cpu_info
*ci
)
290 struct schedstate_percpu
*spc
= &ci
->ci_schedstate
;
291 struct lwp
*l
= curlwp
;
294 if (__predict_false(CURCPU_IDLE_P()))
297 switch (l
->l_class
) {
300 * Update the time-quantum, and continue running,
301 * if thread runs on FIFO real-time policy.
303 KASSERT(l
->l_priority
> PRI_HIGHEST_TS
);
304 spc
->spc_ticks
= l
->l_sched
.timeslice
;
308 * If thread is in time-sharing queue, decrease the priority,
309 * and run with a higher time-quantum.
311 KASSERT(l
->l_priority
<= PRI_HIGHEST_TS
);
312 if (l
->l_priority
== 0)
316 if (__predict_false(p
->p_nice
> NZERO
)) {
317 const int n
= max((p
->p_nice
- NZERO
) >> 2, 1);
318 l
->l_priority
= imax(l
->l_priority
- n
, 0);
325 * If there are higher priority threads or threads in the same queue,
326 * mark that thread should yield, otherwise, continue running.
328 if (lwp_eprio(l
) <= spc
->spc_maxpriority
|| l
->l_target_cpu
) {
329 spc
->spc_flags
|= SPCF_SHOULDYIELD
;
330 cpu_need_resched(ci
, 0);
332 spc
->spc_ticks
= l
->l_sched
.timeslice
;
336 * Sysctl nodes and initialization.
340 sysctl_sched_rtts(SYSCTLFN_ARGS
)
342 struct sysctlnode node
;
343 int rttsms
= hztoms(rt_ts
);
346 node
.sysctl_data
= &rttsms
;
347 return sysctl_lookup(SYSCTLFN_CALL(&node
));
351 sysctl_sched_mints(SYSCTLFN_ARGS
)
353 struct sysctlnode node
;
356 CPU_INFO_ITERATOR cii
;
359 node
.sysctl_data
= &newsize
;
361 newsize
= hztoms(min_ts
);
362 error
= sysctl_lookup(SYSCTLFN_CALL(&node
));
363 if (error
|| newp
== NULL
)
366 newsize
= mstohz(newsize
);
367 if (newsize
< 1 || newsize
> hz
|| newsize
>= max_ts
)
370 /* It is safe to do this in such order */
371 for (CPU_INFO_FOREACH(cii
, ci
))
377 for (CPU_INFO_FOREACH(cii
, ci
))
384 sysctl_sched_maxts(SYSCTLFN_ARGS
)
386 struct sysctlnode node
;
389 CPU_INFO_ITERATOR cii
;
392 node
.sysctl_data
= &newsize
;
394 newsize
= hztoms(max_ts
);
395 error
= sysctl_lookup(SYSCTLFN_CALL(&node
));
396 if (error
|| newp
== NULL
)
399 newsize
= mstohz(newsize
);
400 if (newsize
< 10 || newsize
> hz
|| newsize
<= min_ts
)
403 /* It is safe to do this in such order */
404 for (CPU_INFO_FOREACH(cii
, ci
))
410 for (CPU_INFO_FOREACH(cii
, ci
))
416 SYSCTL_SETUP(sysctl_sched_m2_setup
, "sysctl sched setup")
418 const struct sysctlnode
*node
= NULL
;
420 sysctl_createv(clog
, 0, NULL
, NULL
,
422 CTLTYPE_NODE
, "kern", NULL
,
425 sysctl_createv(clog
, 0, NULL
, &node
,
427 CTLTYPE_NODE
, "sched",
428 SYSCTL_DESCR("Scheduler options"),
430 CTL_KERN
, CTL_CREATE
, CTL_EOL
);
435 sysctl_createv(NULL
, 0, &node
, NULL
,
437 CTLTYPE_STRING
, "name", NULL
,
438 NULL
, 0, __UNCONST("M2"), 0,
439 CTL_CREATE
, CTL_EOL
);
440 sysctl_createv(NULL
, 0, &node
, NULL
,
443 SYSCTL_DESCR("Round-robin time quantum (in miliseconds)"),
444 sysctl_sched_rtts
, 0, NULL
, 0,
445 CTL_CREATE
, CTL_EOL
);
446 sysctl_createv(NULL
, 0, &node
, NULL
,
447 CTLFLAG_PERMANENT
| CTLFLAG_READWRITE
,
448 CTLTYPE_INT
, "maxts",
449 SYSCTL_DESCR("Maximal time quantum (in miliseconds)"),
450 sysctl_sched_maxts
, 0, &max_ts
, 0,
451 CTL_CREATE
, CTL_EOL
);
452 sysctl_createv(NULL
, 0, &node
, NULL
,
453 CTLFLAG_PERMANENT
| CTLFLAG_READWRITE
,
454 CTLTYPE_INT
, "mints",
455 SYSCTL_DESCR("Minimal time quantum (in miliseconds)"),
456 sysctl_sched_mints
, 0, &min_ts
, 0,
457 CTL_CREATE
, CTL_EOL
);