/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */
#include <sys/cpu_pm.h>
#include <sys/cmn_err.h>
#include <sys/sdt.h>
/*
 * Solaris Event Based CPU Power Manager
 *
 * This file implements platform independent event based CPU power management.
 * When CPUs are configured into the system, the CMT scheduling subsystem will
 * query the platform to determine if the CPU belongs to any power management
 * domains. That is, sets of CPUs that share power management states.
 *
 * Active Power Management domains represent a group of CPUs across which the
 * Operating System can request speed changes (which may in turn result
 * in voltage changes). This allows the operating system to trade off
 * performance for power savings.
 *
 * Idle Power Management domains can enter power savings states when they are
 * unutilized. These states allow the Operating System to trade off power
 * for performance (in the form of latency to transition from the idle state
 * to a running one).
 *
 * For each active and idle power domain the CMT subsystem instantiates, a
 * cpupm_domain_t structure is created. As the dispatcher schedules threads
 * to run on the system's CPUs, it will also track the utilization of the
 * enumerated power domains. Significant changes in utilization will result
 * in the dispatcher sending the power manager events that relate to the
 * utilization of the power domain. The power manager receives the events,
 * and in the context of the policy objectives in force, may decide to request
 * the domain's power/performance state be changed.
 *
 * Under the "elastic" CPUPM policy, when the utilization rises, the CPU power
 * manager will request the CPUs in the domain run at their fastest (and most
 * power consuming) state. When the domain becomes idle (utilization at zero),
 * the power manager will request that the CPUs run at a speed that saves the
 * most power.
 *
 * The advantage of this scheme is that the CPU power manager, working with
 * the dispatcher, can be extremely responsive to changes in utilization:
 * optimizing for performance in the presence of utilization, and for power
 * savings in the presence of idleness. Such close collaboration with the
 * dispatcher has other benefits that will play out in the form of more
 * sophisticated power / performance policy in the near future.
 *
 * Avoiding state thrashing in the presence of transient periods of utilization
 * and idleness while still being responsive to non-transient periods is key.
 * The power manager implements a "governor" that is used to throttle
 * state transitions when a significant amount of transient idle or transient
 * work is detected.
 *
 * Kernel background activity (e.g. taskq threads) is by far the most common
 * form of transient utilization. Ungoverned in the face of this utilization,
 * hundreds of state transitions per second would result on an idle system.
 *
 * Transient idleness is common when a thread briefly yields the CPU to
 * wait for an event elsewhere in the system. Where the idle period is short
 * enough, the overhead associated with making the state transition doesn't
 * justify the power savings.
 *
 * The following is the state machine for the governor implemented by
 * cpupm_utilization_event():
 *
 *          ----->---tw---->-----
 *         /                     \
 *       (I)-<-ti-<-     -<-ntw-<(W)
 *        \          \   /       /
 *         \          \ /       /
 *          >-nti/rm->(D)--->-tw->-
 *
 * Where the states are:
 *	- (D): Default (ungoverned)
 *	- (W): Transient work governed
 *	- (I): Transient idle governed
 *
 * and the arcs are:
 *	- tw: transient work
 *	- ti: transient idleness
 *	- ntw: non-transient work
 *	- nti: non-transient idleness
 *	- rm: thread remain event
 */
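
/*
 * A worked example of the governor (illustrative only; the thresholds and
 * intervals referenced are the tunables defined below): suppose a domain at
 * its low power state is raised, and the raise arrives within
 * cpupm_ti_predict_interval of the preceding lowering. That lowering is
 * counted as a misprediction, and after cpupm_mispredict_thresh such
 * mispredictions the domain moves (D)->(I), after which lowering requests
 * are ignored and the domain is held at high power. While in (I), raise
 * requests that instead follow long (non-transient) idle periods count
 * against the governor, and after cpupm_mispredict_gov_thresh of those the
 * domain moves (I)->(D). The (D)->(W)->(D) path is symmetric for transient
 * work, with the addition that a thread remaining on CPU to the end of its
 * time slice (rm) also disengages (W).
 */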

static cpupm_domain_t	*cpupm_domains = NULL;

/*
 * Uninitialized state of CPU power management is disabled
 */
cpupm_policy_t cpupm_policy = CPUPM_POLICY_DISABLED;

/*
 * Periods of utilization lasting less than this time interval are
 * characterized as transient. State changes associated with transient work
 * are considered to be mispredicted. That is, it's not worth raising and
 * lowering power states where the utilization lasts for less than this
 * interval.
 */
hrtime_t cpupm_tw_predict_interval;

/*
 * Periods of idleness lasting less than this time interval are characterized
 * as transient. State changes associated with transient idle are considered
 * to be mispredicted. That is, it's not worth lowering and raising power
 * states where the idleness lasts for less than this interval.
 */
hrtime_t cpupm_ti_predict_interval;

/*
 * Number of mispredictions after which future transitions will be governed.
 */
int cpupm_mispredict_thresh = 4;

/*
 * Likewise, the number of mispredicted governed transitions after which the
 * governor will be removed.
 */
int cpupm_mispredict_gov_thresh = 4;

/*
 * The transient work and transient idle prediction intervals are specified
 * here. Tuning them higher will result in the transient work, and transient
 * idle governors being used more aggressively, which limits the frequency of
 * state transitions at the expense of performance and power savings,
 * respectively. The intervals are specified in nanoseconds.
 */
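
/*
 * For example, with the 400 usec defaults below: a busy period that ends
 * (i.e. the domain goes idle again) less than 400 usec after the preceding
 * raise is counted as transient work, and an idle period that ends less
 * than 400 usec after the preceding lowering is counted as transient
 * idleness.
 */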
#define	CPUPM_DEFAULT_TI_INTERVAL	400000	/* 400 usec */
#define	CPUPM_DEFAULT_TW_INTERVAL	400000	/* 400 usec */

hrtime_t cpupm_ti_gov_interval = CPUPM_DEFAULT_TI_INTERVAL;
hrtime_t cpupm_tw_gov_interval = CPUPM_DEFAULT_TW_INTERVAL;

static void	cpupm_governor_initialize(void);
static void	cpupm_state_change_global(cpupm_dtype_t, cpupm_state_name_t);

cpupm_policy_t
cpupm_get_policy(void)
{
	return (cpupm_policy);
}

int
cpupm_set_policy(cpupm_policy_t new_policy)
{
	static int	gov_init = 0;
	int		result = 0;

	mutex_enter(&cpu_lock);
	if (new_policy == cpupm_policy) {
		mutex_exit(&cpu_lock);
		return (result);
	}

	/*
	 * Pausing CPUs causes a high priority thread to be scheduled
	 * on all other CPUs (besides the current one). This locks out
	 * other CPUs from making CPUPM state transitions.
	 */
	switch (new_policy) {
	case CPUPM_POLICY_DISABLED:
		pause_cpus(NULL, NULL);
		cpupm_policy = CPUPM_POLICY_DISABLED;
		start_cpus();

		result = cmt_pad_disable(PGHW_POW_ACTIVE);

		/*
		 * Once PAD has been enabled, it should always be possible
		 * to disable it.
		 */
		ASSERT(result == 0);

		/*
		 * Bring all the active power domains to the maximum
		 * performance state.
		 */
		cpupm_state_change_global(CPUPM_DTYPE_ACTIVE,
		    CPUPM_STATE_MAX_PERF);
		break;
	case CPUPM_POLICY_ELASTIC:

		result = cmt_pad_enable(PGHW_POW_ACTIVE);
		if (result < 0) {
			/*
			 * Failed to enable PAD across the active power
			 * domains, which may well be because none were
			 * enumerated.
			 */
			break;
		}

		/*
		 * Initialize the governor parameters the first time through.
		 */
		if (gov_init == 0) {
			cpupm_governor_initialize();
			gov_init = 1;
		}

		pause_cpus(NULL, NULL);
		cpupm_policy = CPUPM_POLICY_ELASTIC;
		start_cpus();
		break;
	default:
		cmn_err(CE_WARN, "Attempt to set unknown CPUPM policy %d\n",
		    new_policy);
		ASSERT(0);
		break;
	}
	mutex_exit(&cpu_lock);

	return (result);
}
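
/*
 * A minimal usage sketch (hypothetical caller, not an interface contract):
 *
 *	if (cpupm_get_policy() != CPUPM_POLICY_ELASTIC)
 *		(void) cpupm_set_policy(CPUPM_POLICY_ELASTIC);
 *
 * The return value reflects the result of enabling (or disabling) power
 * aware dispatching across the active power domains.
 */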

/*
 * Look for an existing power domain
 */
static cpupm_domain_t *
cpupm_domain_find(id_t id, cpupm_dtype_t type)
{
	cpupm_domain_t	*dom;

	ASSERT(MUTEX_HELD(&cpu_lock));

	dom = cpupm_domains;
	while (dom != NULL) {
		if (id == dom->cpd_id && type == dom->cpd_type)
			return (dom);
		dom = dom->cpd_next;
	}
	return (NULL);
}

/*
 * Create a new domain
 */
static cpupm_domain_t *
cpupm_domain_create(id_t id, cpupm_dtype_t type)
{
	cpupm_domain_t	*dom;

	ASSERT(MUTEX_HELD(&cpu_lock));

	dom = kmem_zalloc(sizeof (cpupm_domain_t), KM_SLEEP);
	dom->cpd_id = id;
	dom->cpd_type = type;

	/* Link into the known domain list */
	dom->cpd_next = cpupm_domains;
	cpupm_domains = dom;

	return (dom);
}

static void
cpupm_domain_state_enum(struct cpu *cp, cpupm_domain_t *dom)
{
	/*
	 * In the event we're enumerating because the domain's state
	 * configuration has changed, toss any existing states.
	 */
	if (dom->cpd_nstates > 0) {
		kmem_free(dom->cpd_states,
		    sizeof (cpupm_state_t) * dom->cpd_nstates);
		dom->cpd_nstates = 0;
	}

	/*
	 * Query to determine the number of states, allocate storage
	 * large enough to hold the state information, and pass it back
	 * to the platform driver to complete the enumeration.
	 */
	dom->cpd_nstates = cpupm_plat_state_enumerate(cp, dom->cpd_type, NULL);

	if (dom->cpd_nstates == 0)
		return;

	dom->cpd_states =
	    kmem_zalloc(dom->cpd_nstates * sizeof (cpupm_state_t), KM_SLEEP);
	(void) cpupm_plat_state_enumerate(cp, dom->cpd_type, dom->cpd_states);
}

/*
 * Initialize the specified type of power domain on behalf of the CPU
 */
cpupm_domain_t *
cpupm_domain_init(struct cpu *cp, cpupm_dtype_t type)
{
	cpupm_domain_t	*dom;
	id_t		did;

	ASSERT(MUTEX_HELD(&cpu_lock));

	/*
	 * Instantiate the domain if it doesn't already exist
	 * and enumerate its power states.
	 */
	did = cpupm_domain_id(cp, type);
	dom = cpupm_domain_find(did, type);
	if (dom == NULL) {
		dom = cpupm_domain_create(did, type);
		cpupm_domain_state_enum(cp, dom);
	}

	/*
	 * Named state initialization
	 */
	if (type == CPUPM_DTYPE_ACTIVE) {
		/*
		 * For active power domains, the highest performance
		 * state is defined as the first state returned from
		 * the domain enumeration.
		 */
		dom->cpd_named_states[CPUPM_STATE_MAX_PERF] =
		    &dom->cpd_states[0];
		dom->cpd_named_states[CPUPM_STATE_LOW_POWER] =
		    &dom->cpd_states[dom->cpd_nstates - 1];

		/*
		 * Begin by assuming CPU is running at the max perf state.
		 */
		dom->cpd_state = dom->cpd_named_states[CPUPM_STATE_MAX_PERF];
	}

	return (dom);
}
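
/*
 * For a hypothetical active domain enumerating three states ordered fastest
 * to slowest, the above leaves cpd_named_states[CPUPM_STATE_MAX_PERF] ==
 * &cpd_states[0] and cpd_named_states[CPUPM_STATE_LOW_POWER] ==
 * &cpd_states[2].
 */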

/*
 * Return the id associated with the given type of domain
 * to which cp belongs
 */
id_t
cpupm_domain_id(struct cpu *cp, cpupm_dtype_t type)
{
	return (cpupm_plat_domain_id(cp, type));
}

/*
 * Initiate a state change for the specified domain on behalf of cp
 */
int
cpupm_change_state(struct cpu *cp, cpupm_domain_t *dom, cpupm_state_t *state)
{
	if (cpupm_plat_change_state(cp, state) < 0)
		return (-1);

	DTRACE_PROBE2(cpupm__change__state,
	    cpupm_domain_t *, dom,
	    cpupm_state_t *, state);

	dom->cpd_state = state;

	return (0);
}
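
/*
 * Note: the probe above should surface through the DTrace sdt provider
 * (double underscores rendered as dashes), e.g. sdt:::cpupm-change-state,
 * firing only on transitions the platform accepted.
 */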

/*
 * Interface into the CPU power manager to indicate a significant change
 * in utilization of the specified active power domain
 */
void
cpupm_utilization_event(struct cpu *cp, hrtime_t now, cpupm_domain_t *dom,
    cpupm_util_event_t event)
{
	cpupm_state_t	*new_state = NULL;
	hrtime_t	last;

	if (cpupm_policy == CPUPM_POLICY_DISABLED) {
		return;
	}

	/*
	 * What follows is a simple elastic power state management policy.
	 *
	 * If the utilization has become non-zero, and the domain was
	 * previously at its lowest power state, then transition it
	 * to the highest state in the spirit of "race to idle".
	 *
	 * If the utilization has dropped to zero, then transition the
	 * domain to its lowest power state.
	 *
	 * Statistics are maintained to implement a governor to reduce state
	 * transitions resulting from either transient work, or periods of
	 * transient idleness on the domain.
	 */
	switch (event) {
	case CPUPM_DOM_REMAIN_BUSY:

		/*
		 * We've received an event that the domain is running a thread
		 * that's made it to the end of its time slice. If we are at
		 * low power, then raise it. If the transient work governor
		 * is engaged, then remove it.
		 */
		if (dom->cpd_state ==
		    dom->cpd_named_states[CPUPM_STATE_LOW_POWER]) {
			new_state =
			    dom->cpd_named_states[CPUPM_STATE_MAX_PERF];
			if (dom->cpd_governor == CPUPM_GOV_TRANS_WORK) {
				dom->cpd_governor = CPUPM_GOV_DISENGAGED;
				dom->cpd_tw = 0;
			}
		}
		break;
	case CPUPM_DOM_BUSY_FROM_IDLE:
		last = dom->cpd_last_lower;
		dom->cpd_last_raise = now;

		DTRACE_PROBE3(cpupm__raise__req,
		    cpupm_domain_t *, dom,
		    hrtime_t, last,
		    hrtime_t, now);

		if (dom->cpd_state ==
		    dom->cpd_named_states[CPUPM_STATE_LOW_POWER]) {

			/*
			 * There's non-zero utilization, and the domain is
			 * running in the lower power state. Before we
			 * consider raising power, check if the preceding
			 * idle period was transient in duration.
			 *
			 * If the domain is already transient work governed,
			 * then we don't bother maintaining transient idle
			 * statistics, as the presence of enough transient work
			 * can also make the domain frequently transiently
			 * idle. In this case, we still want to remain
			 * transient work governed.
			 */
			if (dom->cpd_governor == CPUPM_GOV_DISENGAGED) {
				if ((now - last) < cpupm_ti_predict_interval) {
					/*
					 * We're raising the domain power and
					 * we *just* lowered it. Consider
					 * this a mispredicted power state
					 * transition due to a transient
					 * idle period.
					 */
					if (++dom->cpd_ti >=
					    cpupm_mispredict_thresh) {
						/*
						 * There's enough transient
						 * idle transitions to
						 * justify governing future
						 * lowering requests.
						 */
						dom->cpd_governor =
						    CPUPM_GOV_TRANS_IDLE;
						dom->cpd_ti = 0;
						DTRACE_PROBE1(
						    cpupm__ti__governed,
						    cpupm_domain_t *, dom);
					}
				} else {
					/*
					 * We correctly predicted the last
					 * lowering request.
					 */
					dom->cpd_ti = 0;
				}
			}
			if (dom->cpd_governor == CPUPM_GOV_TRANS_WORK) {
				/*
				 * Raise requests are governed due to
				 * transient work.
				 */
				DTRACE_PROBE1(cpupm__raise__governed,
				    cpupm_domain_t *, dom);
				return;
			}
			/*
			 * Prepare to transition to the higher power state
			 */
			new_state =
			    dom->cpd_named_states[CPUPM_STATE_MAX_PERF];

		} else if (dom->cpd_state ==
		    dom->cpd_named_states[CPUPM_STATE_MAX_PERF]) {

			/*
			 * Utilization is non-zero, and we're already running
			 * in the higher power state. Take this opportunity to
			 * perform some book keeping if the last lowering
			 * request was governed.
			 */
			if (dom->cpd_governor == CPUPM_GOV_TRANS_IDLE) {
				if ((now - last) >=
				    cpupm_ti_predict_interval) {
					/*
					 * The domain is transient idle
					 * governed, and we mispredicted
					 * governing the last lowering request.
					 */
					if (++dom->cpd_ti >=
					    cpupm_mispredict_gov_thresh) {
						/*
						 * There's enough non-transient
						 * idle periods to justify
						 * removing the governor.
						 */
						dom->cpd_governor =
						    CPUPM_GOV_DISENGAGED;
						dom->cpd_ti = 0;
						DTRACE_PROBE1(
						    cpupm__ti__ungoverned,
						    cpupm_domain_t *, dom);
					}
				} else {
					/*
					 * Correctly predicted governing the
					 * last lowering request.
					 */
					dom->cpd_ti = 0;
				}
			}
		}
		break;
	case CPUPM_DOM_IDLE_FROM_BUSY:
		last = dom->cpd_last_raise;
		dom->cpd_last_lower = now;

		DTRACE_PROBE3(cpupm__lower__req,
		    cpupm_domain_t *, dom,
		    hrtime_t, last,
		    hrtime_t, now);

		if (dom->cpd_state ==
		    dom->cpd_named_states[CPUPM_STATE_MAX_PERF]) {

			/*
			 * The domain is idle, and is running in the highest
			 * performance state. Before we consider lowering
			 * power, perform some book keeping for the transient
			 * work governor.
			 */
			if (dom->cpd_governor == CPUPM_GOV_DISENGAGED) {
				if ((now - last) < cpupm_tw_predict_interval) {
					/*
					 * We're lowering the domain power and
					 * we *just* raised it. Consider the
					 * last raise mispredicted due to
					 * transient work.
					 */
					if (++dom->cpd_tw >=
					    cpupm_mispredict_thresh) {
						/*
						 * There's enough transient
						 * work transitions to justify
						 * governing future raise
						 * requests.
						 */
						dom->cpd_governor =
						    CPUPM_GOV_TRANS_WORK;
						dom->cpd_tw = 0;
						DTRACE_PROBE1(
						    cpupm__tw__governed,
						    cpupm_domain_t *, dom);
					}
				} else {
					/*
					 * We correctly predicted during the
					 * last raise.
					 */
					dom->cpd_tw = 0;
				}
			}
			if (dom->cpd_governor == CPUPM_GOV_TRANS_IDLE) {
				/*
				 * Lowering requests are governed due to
				 * transient idleness.
				 */
				DTRACE_PROBE1(cpupm__lowering__governed,
				    cpupm_domain_t *, dom);
				return;
			}
			/*
			 * Prepare to transition to a lower power state.
			 */
			new_state =
			    dom->cpd_named_states[CPUPM_STATE_LOW_POWER];

		} else if (dom->cpd_state ==
		    dom->cpd_named_states[CPUPM_STATE_LOW_POWER]) {

			/*
			 * The domain is idle, and we're already running in
			 * the lower power state. Take this opportunity to
			 * perform some book keeping if the last raising
			 * request was governed.
			 */
			if (dom->cpd_governor == CPUPM_GOV_TRANS_WORK) {
				if ((now - last) >=
				    cpupm_tw_predict_interval) {
					/*
					 * The domain is transient work
					 * governed, and we mispredicted
					 * governing the last raising request.
					 */
					if (++dom->cpd_tw >=
					    cpupm_mispredict_gov_thresh) {
						/*
						 * There's enough non-transient
						 * work to justify removing
						 * the governor.
						 */
						dom->cpd_governor =
						    CPUPM_GOV_DISENGAGED;
						dom->cpd_tw = 0;
						DTRACE_PROBE1(
						    cpupm__tw__ungoverned,
						    cpupm_domain_t *, dom);
					}
				} else {
					/*
					 * We correctly predicted governing
					 * the last raise.
					 */
					dom->cpd_tw = 0;
				}
			}
		}
		break;
	}

	/*
	 * Change the power state
	 * Not much currently done if this doesn't succeed
	 */
	if (new_state)
		(void) cpupm_change_state(cp, dom, new_state);
}

/*
 * Interface called by platforms to dynamically change the
 * MAX performance cpupm state
 */
void
cpupm_redefine_max_activepwr_state(struct cpu *cp, int max_perf_level)
{
	cpupm_domain_t	*dom;
	id_t		did;
	cpupm_dtype_t	type = CPUPM_DTYPE_ACTIVE;
	boolean_t	change_state = B_FALSE;
	cpupm_state_t	*new_state = NULL;

	did = cpupm_domain_id(cp, type);
	if (MUTEX_HELD(&cpu_lock)) {
		dom = cpupm_domain_find(did, type);
	} else {
		mutex_enter(&cpu_lock);
		dom = cpupm_domain_find(did, type);
		mutex_exit(&cpu_lock);
	}

	if (dom == NULL)
		return;

	/*
	 * Can use a lock to avoid changing the power state of the cpu when
	 * CPUPM_STATE_MAX_PERF is getting changed.
	 * Since the occurrence of events to change MAX_PERF is not frequent,
	 * it may not be a good idea to overburden with locks. In the worst
	 * case, for one cycle the power may not get changed to the required
	 * level.
	 */
	if (dom->cpd_state ==
	    dom->cpd_named_states[CPUPM_STATE_MAX_PERF]) {
		change_state = B_TRUE;
	}

	/*
	 * If an out of range level is passed, use the lowest supported
	 * speed.
	 */
	if (max_perf_level >= dom->cpd_nstates &&
	    dom->cpd_nstates > 1) {
		max_perf_level = dom->cpd_nstates - 1;
	}

	dom->cpd_named_states[CPUPM_STATE_MAX_PERF] =
	    &dom->cpd_states[max_perf_level];

	/*
	 * If the current state is MAX_PERF, change the current state
	 * to the new MAX_PERF
	 */
	if (change_state) {
		new_state =
		    dom->cpd_named_states[CPUPM_STATE_MAX_PERF];
		(void) cpupm_change_state(cp, dom, new_state);
	}
}

/*
 * Initialize the parameters for the transience governor state machine
 */
static void
cpupm_governor_initialize(void)
{
	/*
	 * The default prediction intervals are specified in nanoseconds.
	 * Convert these to the equivalent in unscaled hrtime, which is the
	 * format of the timestamps passed to cpupm_utilization_event().
	 */
	cpupm_ti_predict_interval = unscalehrtime(cpupm_ti_gov_interval);
	cpupm_tw_predict_interval = unscalehrtime(cpupm_tw_gov_interval);
}
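
/*
 * The conversion above is done once at governor initialization (rather than
 * scaling each timestamp in cpupm_utilization_event()) so that the
 * governor's hot path can compare raw gethrtime_unscaled() style timestamps
 * directly against the precomputed intervals.
 */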

/*
 * Initiate a state change in all CPUPM domain instances of the specified type
 */
static void
cpupm_state_change_global(cpupm_dtype_t type, cpupm_state_name_t state)
{
	cpu_t		*cp;
	pg_cmt_t	*pwr_pg;
	cpupm_domain_t	*dom;
	group_t		*hwset;
	group_iter_t	giter;
	pg_cpu_itr_t	cpu_iter;
	pghw_type_t	hw;

	ASSERT(MUTEX_HELD(&cpu_lock));

	switch (type) {
	case CPUPM_DTYPE_ACTIVE:
		hw = PGHW_POW_ACTIVE;
		break;
	default:
		/*
		 * Power domain types other than "active" unsupported.
		 */
		ASSERT(type == CPUPM_DTYPE_ACTIVE);
		return;
	}

	if ((hwset = pghw_set_lookup(hw)) == NULL)
		return;

	/*
	 * Iterate over the power domains
	 */
	group_iter_init(&giter);
	while ((pwr_pg = group_iterate(hwset, &giter)) != NULL) {

		dom = (cpupm_domain_t *)pwr_pg->cmt_pg.pghw_handle;

		/*
		 * Iterate over the CPUs in each domain
		 */
		PG_CPU_ITR_INIT(pwr_pg, cpu_iter);
		while ((cp = pg_cpu_next(&cpu_iter)) != NULL) {
			(void) cpupm_change_state(cp, dom,
			    dom->cpd_named_states[state]);
		}
	}
}