1 // SPDX-License-Identifier: GPL-2.0
3 * Timer events oriented CPU idle governor
5 * Copyright (C) 2018 - 2021 Intel Corporation
6 * Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
10 * DOC: teo-description
12 * The idea of this governor is based on the observation that on many systems
13 * timer events are two or more orders of magnitude more frequent than any
14 * other interrupts, so they are likely to be the most significant cause of CPU
15 * wakeups from idle states. Moreover, information about what happened in the
16 * (relatively recent) past can be used to estimate whether or not the deepest
17 * idle state with target residency within the (known) time till the closest
18 * timer event, referred to as the sleep length, is likely to be suitable for
19 * the upcoming CPU idle period and, if not, then which of the shallower idle
20 * states to choose instead of it.
22 * Of course, non-timer wakeup sources are more important in some use cases
23 * which can be covered by taking a few most recent idle time intervals of the
24 * CPU into account. However, even in that context it is not necessary to
25 * consider idle duration values greater than the sleep length, because the
26 * closest timer will ultimately wake up the CPU anyway unless it is woken up
29 * Thus this governor estimates whether or not the prospective idle duration of
30 * a CPU is likely to be significantly shorter than the sleep length and selects
31 * an idle state for it accordingly.
33 * The computations carried out by this governor are based on using bins whose
34 * boundaries are aligned with the target residency parameter values of the CPU
35 * idle states provided by the %CPUIdle driver in the ascending order. That is,
36 * the first bin spans from 0 up to, but not including, the target residency of
37 * the second idle state (idle state 1), the second bin spans from the target
38 * residency of idle state 1 up to, but not including, the target residency of
39 * idle state 2, the third bin spans from the target residency of idle state 2
40 * up to, but not including, the target residency of idle state 3 and so on.
41 * The last bin spans from the target residency of the deepest idle state
42 * supplied by the driver to infinity.
44 * Two metrics called "hits" and "intercepts" are associated with each bin.
45 * They are updated every time before selecting an idle state for the given CPU
46 * in accordance with what happened last time.
48 * The "hits" metric reflects the relative frequency of situations in which the
49 * sleep length and the idle duration measured after CPU wakeup fall into the
50 * same bin (that is, the CPU appears to wake up "on time" relative to the sleep
51 * length). In turn, the "intercepts" metric reflects the relative frequency of
52 * situations in which the measured idle duration is so much shorter than the
53 * sleep length that the bin it falls into corresponds to an idle state
54 * shallower than the one whose bin is fallen into by the sleep length (these
55 * situations are referred to as "intercepts" below).
57 * In order to select an idle state for a CPU, the governor takes the following
58 * steps (modulo the possible latency constraint that must be taken into account
61 * 1. Find the deepest CPU idle state whose target residency does not exceed
62 * the current sleep length (the candidate idle state) and compute 2 sums as
65 * - The sum of the "hits" and "intercepts" metrics for the candidate state
66 * and all of the deeper idle states (it represents the cases in which the
67 * CPU was idle long enough to avoid being intercepted if the sleep length
68 * had been equal to the current one).
70 * - The sum of the "intercepts" metrics for all of the idle states shallower
71 * than the candidate one (it represents the cases in which the CPU was not
72 * idle long enough to avoid being intercepted if the sleep length had been
73 * equal to the current one).
75 * 2. If the second sum is greater than the first one the CPU is likely to wake
76 * up early, so look for an alternative idle state to select.
78 * - Traverse the idle states shallower than the candidate one in the
81 * - For each of them compute the sum of the "intercepts" metrics over all
82 * of the idle states between it and the candidate one (including the
83 * former and excluding the latter).
85 * - If each of these sums that needs to be taken into account (because the
86 * check related to it has indicated that the CPU is likely to wake up
87 * early) is greater than a half of the corresponding sum computed in step
88 * 1 (which means that the target residency of the state in question had
89 * not exceeded the idle duration in over a half of the relevant cases),
90 * select the given idle state instead of the candidate one.
92 * 3. By default, select the candidate state.
95 #include <linux/cpuidle.h>
96 #include <linux/jiffies.h>
97 #include <linux/kernel.h>
98 #include <linux/sched/clock.h>
99 #include <linux/tick.h>
104 * The PULSE value is added to metrics when they grow and the DECAY_SHIFT value
105 * is used for decreasing metrics on a regular basis.
108 #define DECAY_SHIFT 3
111 * struct teo_bin - Metrics used by the TEO cpuidle governor.
112 * @intercepts: The "intercepts" metric.
113 * @hits: The "hits" metric.
116 unsigned int intercepts
;
121 * struct teo_cpu - CPU data used by the TEO cpuidle governor.
122 * @time_span_ns: Time between idle state selection and post-wakeup update.
123 * @sleep_length_ns: Time till the closest timer event (at the selection time).
124 * @state_bins: Idle state data bins for this CPU.
125 * @total: Grand total of the "intercepts" and "hits" metrics for all bins.
126 * @tick_hits: Number of "hits" after TICK_NSEC.
131 struct teo_bin state_bins
[CPUIDLE_STATE_MAX
];
133 unsigned int tick_hits
;
136 static DEFINE_PER_CPU(struct teo_cpu
, teo_cpus
);
139 * teo_update - Update CPU metrics after wakeup.
140 * @drv: cpuidle driver containing state data.
143 static void teo_update(struct cpuidle_driver
*drv
, struct cpuidle_device
*dev
)
145 struct teo_cpu
*cpu_data
= per_cpu_ptr(&teo_cpus
, dev
->cpu
);
146 int i
, idx_timer
= 0, idx_duration
= 0;
147 s64 target_residency_ns
;
150 if (cpu_data
->time_span_ns
>= cpu_data
->sleep_length_ns
) {
152 * One of the safety nets has triggered or the wakeup was close
153 * enough to the closest timer event expected at the idle state
154 * selection time to be discarded.
156 measured_ns
= U64_MAX
;
158 u64 lat_ns
= drv
->states
[dev
->last_state_idx
].exit_latency_ns
;
161 * The computations below are to determine whether or not the
162 * (saved) time till the next timer event and the measured idle
163 * duration fall into the same "bin", so use last_residency_ns
164 * for that instead of time_span_ns which includes the cpuidle
167 measured_ns
= dev
->last_residency_ns
;
169 * The delay between the wakeup and the first instruction
170 * executed by the CPU is not likely to be worst-case every
171 * time, so take 1/2 of the exit latency as a very rough
172 * approximation of the average of it.
174 if (measured_ns
>= lat_ns
)
175 measured_ns
-= lat_ns
/ 2;
183 * Decay the "hits" and "intercepts" metrics for all of the bins and
184 * find the bins that the sleep length and the measured idle duration
187 for (i
= 0; i
< drv
->state_count
; i
++) {
188 struct teo_bin
*bin
= &cpu_data
->state_bins
[i
];
190 bin
->hits
-= bin
->hits
>> DECAY_SHIFT
;
191 bin
->intercepts
-= bin
->intercepts
>> DECAY_SHIFT
;
193 cpu_data
->total
+= bin
->hits
+ bin
->intercepts
;
195 target_residency_ns
= drv
->states
[i
].target_residency_ns
;
197 if (target_residency_ns
<= cpu_data
->sleep_length_ns
) {
199 if (target_residency_ns
<= measured_ns
)
205 * If the deepest state's target residency is below the tick length,
206 * make a record of it to help teo_select() decide whether or not
207 * to stop the tick. This effectively adds an extra hits-only bin
208 * beyond the last state-related one.
210 if (target_residency_ns
< TICK_NSEC
) {
211 cpu_data
->tick_hits
-= cpu_data
->tick_hits
>> DECAY_SHIFT
;
213 cpu_data
->total
+= cpu_data
->tick_hits
;
215 if (TICK_NSEC
<= cpu_data
->sleep_length_ns
) {
216 idx_timer
= drv
->state_count
;
217 if (TICK_NSEC
<= measured_ns
) {
218 cpu_data
->tick_hits
+= PULSE
;
225 * If the measured idle duration falls into the same bin as the sleep
226 * length, this is a "hit", so update the "hits" metric for that bin.
227 * Otherwise, update the "intercepts" metric for the bin fallen into by
228 * the measured idle duration.
230 if (idx_timer
== idx_duration
)
231 cpu_data
->state_bins
[idx_timer
].hits
+= PULSE
;
233 cpu_data
->state_bins
[idx_duration
].intercepts
+= PULSE
;
236 cpu_data
->total
+= PULSE
;
239 static bool teo_state_ok(int i
, struct cpuidle_driver
*drv
)
241 return !tick_nohz_tick_stopped() ||
242 drv
->states
[i
].target_residency_ns
>= TICK_NSEC
;
246 * teo_find_shallower_state - Find shallower idle state matching given duration.
247 * @drv: cpuidle driver containing state data.
249 * @state_idx: Index of the capping idle state.
250 * @duration_ns: Idle duration value to match.
251 * @no_poll: Don't consider polling states.
253 static int teo_find_shallower_state(struct cpuidle_driver
*drv
,
254 struct cpuidle_device
*dev
, int state_idx
,
255 s64 duration_ns
, bool no_poll
)
259 for (i
= state_idx
- 1; i
>= 0; i
--) {
260 if (dev
->states_usage
[i
].disable
||
261 (no_poll
&& drv
->states
[i
].flags
& CPUIDLE_FLAG_POLLING
))
265 if (drv
->states
[i
].target_residency_ns
<= duration_ns
)
272 * teo_select - Selects the next idle state to enter.
273 * @drv: cpuidle driver containing state data.
275 * @stop_tick: Indication on whether or not to stop the scheduler tick.
277 static int teo_select(struct cpuidle_driver
*drv
, struct cpuidle_device
*dev
,
280 struct teo_cpu
*cpu_data
= per_cpu_ptr(&teo_cpus
, dev
->cpu
);
281 s64 latency_req
= cpuidle_governor_latency_req(dev
->cpu
);
282 ktime_t delta_tick
= TICK_NSEC
/ 2;
283 unsigned int tick_intercept_sum
= 0;
284 unsigned int idx_intercept_sum
= 0;
285 unsigned int intercept_sum
= 0;
286 unsigned int idx_hit_sum
= 0;
287 unsigned int hit_sum
= 0;
288 int constraint_idx
= 0;
289 int idx0
= 0, idx
= -1;
290 int prev_intercept_idx
;
294 if (dev
->last_state_idx
>= 0) {
295 teo_update(drv
, dev
);
296 dev
->last_state_idx
= -1;
299 cpu_data
->time_span_ns
= local_clock();
301 * Set the expected sleep length to infinity in case of an early
304 cpu_data
->sleep_length_ns
= KTIME_MAX
;
306 /* Check if there is any choice in the first place. */
307 if (drv
->state_count
< 2) {
312 if (!dev
->states_usage
[0].disable
)
315 /* Compute the sums of metrics for early wakeup pattern detection. */
316 for (i
= 1; i
< drv
->state_count
; i
++) {
317 struct teo_bin
*prev_bin
= &cpu_data
->state_bins
[i
-1];
318 struct cpuidle_state
*s
= &drv
->states
[i
];
321 * Update the sums of idle state mertics for all of the states
322 * shallower than the current one.
324 intercept_sum
+= prev_bin
->intercepts
;
325 hit_sum
+= prev_bin
->hits
;
327 if (dev
->states_usage
[i
].disable
)
331 idx0
= i
; /* first enabled state */
335 if (s
->exit_latency_ns
<= latency_req
)
338 /* Save the sums for the current state. */
339 idx_intercept_sum
= intercept_sum
;
340 idx_hit_sum
= hit_sum
;
343 /* Avoid unnecessary overhead. */
345 idx
= 0; /* No states enabled, must use 0. */
351 * Only one idle state is enabled, so use it, but do not
352 * allow the tick to be stopped it is shallow enough.
354 duration_ns
= drv
->states
[idx
].target_residency_ns
;
358 tick_intercept_sum
= intercept_sum
+
359 cpu_data
->state_bins
[drv
->state_count
-1].intercepts
;
362 * If the sum of the intercepts metric for all of the idle states
363 * shallower than the current candidate one (idx) is greater than the
364 * sum of the intercepts and hits metrics for the candidate state and
365 * all of the deeper states a shallower idle state is likely to be a
368 prev_intercept_idx
= idx
;
369 if (2 * idx_intercept_sum
> cpu_data
->total
- idx_hit_sum
) {
370 int first_suitable_idx
= idx
;
373 * Look for the deepest idle state whose target residency had
374 * not exceeded the idle duration in over a half of the relevant
377 * Take the possible duration limitation present if the tick
378 * has been stopped already into account.
382 for (i
= idx
- 1; i
>= 0; i
--) {
383 struct teo_bin
*bin
= &cpu_data
->state_bins
[i
];
385 intercept_sum
+= bin
->intercepts
;
387 if (2 * intercept_sum
> idx_intercept_sum
) {
389 * Use the current state unless it is too
390 * shallow or disabled, in which case take the
391 * first enabled state that is deep enough.
393 if (teo_state_ok(i
, drv
) &&
394 !dev
->states_usage
[i
].disable
)
397 idx
= first_suitable_idx
;
402 if (dev
->states_usage
[i
].disable
)
405 if (!teo_state_ok(i
, drv
)) {
407 * The current state is too shallow, but if an
408 * alternative candidate state has been found,
409 * it may still turn out to be a better choice.
411 if (first_suitable_idx
!= idx
)
417 first_suitable_idx
= i
;
420 if (!idx
&& prev_intercept_idx
) {
422 * We have to query the sleep length here otherwise we don't
423 * know after wakeup if our guess was correct.
425 duration_ns
= tick_nohz_get_sleep_length(&delta_tick
);
426 cpu_data
->sleep_length_ns
= duration_ns
;
431 * If there is a latency constraint, it may be necessary to select an
432 * idle state shallower than the current candidate one.
434 if (idx
> constraint_idx
)
435 idx
= constraint_idx
;
438 * Skip the timers check if state 0 is the current candidate one,
439 * because an immediate non-timer wakeup is expected in that case.
445 * If state 0 is a polling one, check if the target residency of
446 * the current candidate state is low enough and skip the timers
447 * check in that case too.
449 if ((drv
->states
[0].flags
& CPUIDLE_FLAG_POLLING
) &&
450 drv
->states
[idx
].target_residency_ns
< RESIDENCY_THRESHOLD_NS
)
453 duration_ns
= tick_nohz_get_sleep_length(&delta_tick
);
454 cpu_data
->sleep_length_ns
= duration_ns
;
457 * If the closest expected timer is before the target residency of the
458 * candidate state, a shallower one needs to be found.
460 if (drv
->states
[idx
].target_residency_ns
> duration_ns
) {
461 i
= teo_find_shallower_state(drv
, dev
, idx
, duration_ns
, false);
462 if (teo_state_ok(i
, drv
))
467 * If the selected state's target residency is below the tick length
468 * and intercepts occurring before the tick length are the majority of
469 * total wakeup events, do not stop the tick.
471 if (drv
->states
[idx
].target_residency_ns
< TICK_NSEC
&&
472 tick_intercept_sum
> cpu_data
->total
/ 2 + cpu_data
->total
/ 8)
473 duration_ns
= TICK_NSEC
/ 2;
477 * Allow the tick to be stopped unless the selected state is a polling
478 * one or the expected idle duration is shorter than the tick period
481 if ((!(drv
->states
[idx
].flags
& CPUIDLE_FLAG_POLLING
) &&
482 duration_ns
>= TICK_NSEC
) || tick_nohz_tick_stopped())
486 * The tick is not going to be stopped, so if the target residency of
487 * the state to be returned is not within the time till the closest
488 * timer including the tick, try to correct that.
491 drv
->states
[idx
].target_residency_ns
> delta_tick
)
492 idx
= teo_find_shallower_state(drv
, dev
, idx
, delta_tick
, false);
500 * teo_reflect - Note that governor data for the CPU need to be updated.
502 * @state: Entered state.
504 static void teo_reflect(struct cpuidle_device
*dev
, int state
)
506 struct teo_cpu
*cpu_data
= per_cpu_ptr(&teo_cpus
, dev
->cpu
);
508 dev
->last_state_idx
= state
;
510 * If the wakeup was not "natural", but triggered by one of the safety
511 * nets, assume that the CPU might have been idle for the entire sleep
514 if (dev
->poll_time_limit
||
515 (tick_nohz_idle_got_tick() && cpu_data
->sleep_length_ns
> TICK_NSEC
)) {
516 dev
->poll_time_limit
= false;
517 cpu_data
->time_span_ns
= cpu_data
->sleep_length_ns
;
519 cpu_data
->time_span_ns
= local_clock() - cpu_data
->time_span_ns
;
524 * teo_enable_device - Initialize the governor's data for the target CPU.
525 * @drv: cpuidle driver (not used).
528 static int teo_enable_device(struct cpuidle_driver
*drv
,
529 struct cpuidle_device
*dev
)
531 struct teo_cpu
*cpu_data
= per_cpu_ptr(&teo_cpus
, dev
->cpu
);
533 memset(cpu_data
, 0, sizeof(*cpu_data
));
538 static struct cpuidle_governor teo_governor
= {
541 .enable
= teo_enable_device
,
542 .select
= teo_select
,
543 .reflect
= teo_reflect
,
546 static int __init
teo_governor_init(void)
548 return cpuidle_register_governor(&teo_governor
);
551 postcore_initcall(teo_governor_init
);