2 * Copyright 2009, Intel Corporation
3 * Copyright 2009, Sun Microsystems, Inc
5 * This file is part of PowerTOP
7 * This program file is free software; you can redistribute it and/or modify it
8 * under the terms of the GNU General Public License as published by the
9 * Free Software Foundation; version 2 of the License.
11 * This program is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 * You should have received a copy of the GNU General Public License
17 * along with this program in a file named COPYING; if not, write to the
18 * Free Software Foundation, Inc.,
19 * 51 Franklin Street, Fifth Floor,
20 * Boston, MA 02110-1301 USA
23 * Arjan van de Ven <arjan@linux.intel.com>
24 * Eric C Saxe <eric.saxe@sun.com>
25 * Aubrey Li <aubrey.li@intel.com>
31 * For the avoidance of doubt, except that if any license choice other
32 * than GPL or LGPL is available it will apply instead, Sun elects to
33 * use only the General Public License version 2 (GPLv2) at this time
34 * for any software where a choice of GPL license versions is made
35 * available with the language indicating that GPLv2 or any later
36 * version may be used, or where a choice of which version of the GPL
37 * is applied is otherwise unspecified.
/*
 * File-scope constants and state for the CPU frequency report.
 *
 * HZ2MHZ() divides a speed by MICROSEC. DTP_ARG_COUNT and DTP_ARG_LENGTH
 * size the argument vector (dtp_argv) that pt_cpufreq_setup() allocates and
 * that pt_cpufreq_stat_prepare() hands to dtrace_program_strcompile().
 * max_cpufreq tracks the largest speed seen while the supported frequencies
 * are enumerated, and dtp holds the handle returned by dtrace_open().
 *
 * default_conf is the file opened by pt_cpufreq_check_pm(). The remaining
 * three strings are command lines passed to system() by
 * pt_cpufreq_enable(); two of them append to the power.conf file.
 */
47 #define HZ2MHZ(speed) ((speed) / MICROSEC)
48 #define DTP_ARG_COUNT 2
49 #define DTP_ARG_LENGTH 5
51 static uint64_t max_cpufreq
= 0;
52 static dtrace_hdl_t
*dtp
;
53 static char **dtp_argv
;
56 * Enabling PM through /etc/power.conf
57 * See pt_cpufreq_suggest()
59 static char default_conf
[] = "/etc/power.conf";
60 static char default_pmconf
[] = "/usr/sbin/pmconfig";
61 static char cpupm_enable
[] = "echo cpupm enable >> /etc/power.conf";
62 static char cpupm_treshold
[] = "echo cpu-threshold 1s >> /etc/power.conf";
65 * Buffer containing DTrace program to track CPU frequency transitions
/*
 * D program text selected and compiled by pt_cpufreq_stat_prepare().
 *
 * Both programs populate the times aggregation, keyed by (cpu, old speed),
 * with the sum of the time elapsed since the previous recorded transition,
 * or since begin when no previous transition has been recorded.
 * dtp_cpufreq keeps a per-CPU last[] timestamp; dtp_cpufreq_c restricts its
 * clauses to the processor id given by the first macro argument.
 *
 * NOTE(review): several lines of both programs are not visible here (probe
 * names, braces, the rest of the predicates), so the description above
 * covers only the visible clauses. The macro argument is presumably
 * supplied through dtp_argv - confirm.
 */
67 static const char *dtp_cpufreq
=
76 "/last[(processorid_t)arg0] != 0/"
78 " this->cpu = (processorid_t)arg0;"
79 " this->oldspeed = (uint64_t)arg1;"
80 " @times[this->cpu, this->oldspeed] = sum(timestamp - last[this->cpu]);"
81 " last[this->cpu] = timestamp;"
84 "/last[(processorid_t)arg0] == 0/"
86 " this->cpu = (processorid_t)arg0;"
87 " this->oldspeed = (uint64_t)arg1;"
88 " @times[this->cpu, this->oldspeed] = sum(timestamp - begin);"
89 " last[this->cpu] = timestamp;"
93 * Same as above, but only for a specific CPU
95 static const char *dtp_cpufreq_c
=
100 " begin = timestamp;"
103 ":::cpu-change-speed"
104 "/(processorid_t)arg0 == $1 &&"
107 " this->cpu = (processorid_t)arg0;"
108 " this->oldspeed = (uint64_t)arg1;"
109 " @times[this->cpu, this->oldspeed] = sum(timestamp - last);"
112 ":::cpu-change-speed"
113 "/(processorid_t)arg0 == $1 &&"
116 " this->cpu = (processorid_t)arg0;"
117 " this->oldspeed = (uint64_t)arg1;"
118 " @times[this->cpu, this->oldspeed] = sum(timestamp - begin);"
/*
 * Forward declarations for the file-local (static) helpers defined later
 * in this file.
 */
122 static int pt_cpufreq_setup(void);
123 static int pt_cpufreq_snapshot(void);
124 static int pt_cpufreq_dtrace_walk(const dtrace_aggdata_t
*, void *);
125 static void pt_cpufreq_stat_account(double, uint_t
);
126 static int pt_cpufreq_snapshot_cpu(kstat_ctl_t
*, uint_t
);
127 static int pt_cpufreq_check_pm(void);
128 static void pt_cpufreq_enable(void);
/*
 * Allocate dtp_argv, the argument vector later passed to
 * dtrace_program_strcompile(). The vector has DTP_ARG_COUNT slots; slot 0
 * is formatted from g_ncpus_observed and slot 1 from g_observed_cpu, each
 * into a buffer of DTP_ARG_LENGTH bytes.
 *
 * The caller treats any non-zero return as an allocation failure.
 *
 * NOTE(review): both snprintf() calls pass a literal 5 instead of
 * DTP_ARG_LENGTH, and the NUL escape at the end of the format string is
 * redundant because snprintf() terminates its output. A value that needs
 * more than four digits is silently truncated. The return statements,
 * the cleanup on failure and the condition guarding the second allocation
 * are not visible here - confirm against the full file.
 */
131 pt_cpufreq_setup(void)
133 if ((dtp_argv
= malloc(sizeof (char *) * DTP_ARG_COUNT
)) == NULL
)
136 if ((dtp_argv
[0] = malloc(sizeof (char) * DTP_ARG_LENGTH
)) == NULL
) {
141 (void) snprintf(dtp_argv
[0], 5, "%d\0", g_ncpus_observed
);
144 if ((dtp_argv
[1] = malloc(sizeof (char) * DTP_ARG_LENGTH
))
150 (void) snprintf(dtp_argv
[1], 5, "%d\0", g_observed_cpu
);
157 * Perform setup necessary to enumerate and track CPU speed changes
/*
 * One-time preparation for the frequency report. Visible steps, in order:
 *
 *  1. pt_cpufreq_setup() builds dtp_argv; a non-zero result is reported as
 *     a memory allocation failure.
 *  2. g_cpu_power_states is allocated with calloc(), one cpu_power_info_t
 *     per g_ncpus.
 *  3. The cpu_info kstat of g_cpu_table[g_observed_cpu] is read and its
 *     supported_frequencies_Hz string is split on colons. Each token is
 *     converted with atoll() and HZ2MHZ() and stored through the state
 *     cursor, which starts at g_pstate_info; max_cpufreq records the
 *     largest value. The loop stops once g_npstates reaches NSTATES.
 *  4. A DTrace handle is opened, either dtp_cpufreq_c or dtp_cpufreq is
 *     compiled with dtp_argv as macro arguments, and the program is
 *     enabled.
 *  5. The aggsize, aggrate and aggpercpu options are set (failures are only
 *     logged), tracing is started with dtrace_go(), and the statusrate
 *     option is read back.
 *
 * Failures of compile, exec, go and getopt return dtrace_errno(dtp).
 *
 * NOTE(review): strtok() writes into the kstat string buffer and keeps
 * hidden static state; atoll() cannot report malformed input. The result of
 * kstat_data_lookup() is dereferenced and no NULL check is visible here.
 * The advance of the state cursor and of g_npstates, the mode test that
 * picks the program, and the remaining return paths are not visible -
 * confirm against the full file.
 */
160 pt_cpufreq_stat_prepare(void)
163 dtrace_proginfo_t info
;
164 dtrace_optval_t statustime
;
168 freq_state_info_t
*state
;
169 char *s
, *token
, *prog_ptr
;
172 if ((err
= pt_cpufreq_setup()) != 0) {
173 pt_error("failed to setup %s report (couldn't allocate "
174 "memory)\n", g_msg_freq_state
);
178 state
= g_pstate_info
;
179 if ((g_cpu_power_states
= calloc((size_t)g_ncpus
,
180 sizeof (cpu_power_info_t
))) == NULL
)
184 * Enumerate the CPU frequencies
186 if ((kc
= kstat_open()) == NULL
)
189 ksp
= kstat_lookup(kc
, "cpu_info", g_cpu_table
[g_observed_cpu
], NULL
);
193 (void) kstat_close(kc
);
197 (void) kstat_read(kc
, ksp
, NULL
);
199 knp
= kstat_data_lookup(ksp
, "supported_frequencies_Hz");
200 s
= knp
->value
.str
.addr
.ptr
;
204 for (token
= strtok(s
, ":"), s
= NULL
;
205 token
!= NULL
&& g_npstates
< NSTATES
;
206 token
= strtok(NULL
, ":")) {
208 state
->speed
= HZ2MHZ(atoll(token
));
210 if (state
->speed
> max_cpufreq
)
211 max_cpufreq
= state
->speed
;
213 state
->total_time
= (uint64_t)0;
220 pt_error("CPU exceeds the supported number of %s\n",
223 (void) kstat_close(kc
);
226 * Return if speed transition is not supported
232 * Setup DTrace to look for CPU frequency changes
234 if ((dtp
= dtrace_open(DTRACE_VERSION
, 0, &err
)) == NULL
) {
235 pt_error("cannot open dtrace library for the %s report: %s\n",
236 g_msg_freq_state
, dtrace_errmsg(NULL
, err
));
241 * Execute different scripts (defined above) depending on
242 * user specified options. Default mode uses dtp_cpufreq.
245 prog_ptr
= (char *)dtp_cpufreq_c
;
247 prog_ptr
= (char *)dtp_cpufreq
;
249 if ((prog
= dtrace_program_strcompile(dtp
, prog_ptr
,
250 DTRACE_PROBESPEC_NAME
, 0, (1 + g_argc
), dtp_argv
)) == NULL
) {
251 pt_error("failed to compile %s program\n", g_msg_freq_state
);
252 return (dtrace_errno(dtp
));
255 if (dtrace_program_exec(dtp
, prog
, &info
) == -1) {
256 pt_error("failed to enable %s probes\n", g_msg_freq_state
);
257 return (dtrace_errno(dtp
));
260 if (dtrace_setopt(dtp
, "aggsize", "128k") == -1)
261 pt_error("failed to set %s 'aggsize'\n", g_msg_freq_state
);
263 if (dtrace_setopt(dtp
, "aggrate", "0") == -1)
264 pt_error("failed to set %s 'aggrate'\n", g_msg_freq_state
);
266 if (dtrace_setopt(dtp
, "aggpercpu", 0) == -1)
267 pt_error("failed to set %s 'aggpercpu'\n", g_msg_freq_state
);
269 if (dtrace_go(dtp
) != 0) {
270 pt_error("failed to start %s observation\n", g_msg_freq_state
);
271 return (dtrace_errno(dtp
));
274 if (dtrace_getopt(dtp
, "statusrate", &statustime
) == -1) {
275 pt_error("failed to get %s 'statusrate'\n", g_msg_freq_state
);
276 return (dtrace_errno(dtp
));
283 * The DTrace probes have already been enabled, and are tracking
284 * CPU speed transitions. Take a snapshot of the aggregations, and
285 * look for any CPUs that have made a speed transition over the last
286 * sampling interval. Note that the aggregations may be empty if no
287 * speed transitions took place over the last interval. In that case,
288 * notate that we have already accounted for the time, so that when
289 * we do encounter a speed transition in a future sampling interval
290 * we can subtract that time back out.
/*
 * Per-sample collection for the frequency report. Visible steps, in order:
 *
 *  1. Reset total_time for each of the g_npstates entries of g_pstate_info
 *     and dtrace_time for each of the g_ncpus entries of
 *     g_cpu_power_states.
 *  2. Check dtrace_status(), snapshot the aggregations, walk them in
 *     key/variable sorted order with pt_cpufreq_dtrace_walk(), then clear
 *     them. Snapshot and sort failures are logged through pt_error().
 *  3. Call pt_cpufreq_snapshot() to record the current speed of each CPU.
 *  4. Call pt_cpufreq_stat_account() with the interval, either for
 *     g_observed_cpu alone or, under PT_MODE_DEFAULT, for every index
 *     below g_ncpus_observed.
 *
 * interval is multiplied by NANOSEC in pt_cpufreq_stat_account(), so it is
 * presumably expressed in seconds - confirm with the caller.
 *
 * NOTE(review): the return statements and the switch header are not
 * visible here.
 */
293 pt_cpufreq_stat_collect(double interval
)
298 * Zero out the interval time reported by DTrace for
301 for (i
= 0; i
< g_npstates
; i
++)
302 g_pstate_info
[i
].total_time
= 0;
304 for (i
= 0; i
< g_ncpus
; i
++)
305 g_cpu_power_states
[i
].dtrace_time
= 0;
307 if (dtrace_status(dtp
) == -1)
310 if (dtrace_aggregate_snap(dtp
) != 0)
311 pt_error("failed to collect data for %s\n", g_msg_freq_state
);
313 if (dtrace_aggregate_walk_keyvarsorted(dtp
, pt_cpufreq_dtrace_walk
,
315 pt_error("failed to sort data for %s\n", g_msg_freq_state
);
317 dtrace_aggregate_clear(dtp
);
319 if ((ret
= pt_cpufreq_snapshot()) != 0) {
320 pt_error("failed to snapshot %s state\n", g_msg_freq_state
);
326 pt_cpufreq_stat_account(interval
, g_observed_cpu
);
328 case PT_MODE_DEFAULT
:
330 for (i
= 0; i
< g_ncpus_observed
; i
++)
331 pt_cpufreq_stat_account(interval
, i
);
/*
 * Credit the part of the sampling interval that DTrace did not report to
 * the speed the given CPU is currently running at.
 *
 * The remaining duration is interval * NANOSEC minus the dtrace_time
 * already recorded for this CPU. The g_pstate_info entry whose speed equals
 * the current_pstate of the CPU has that duration added to its total_time.
 * The same duration is added to time_accounted, and speed_accounted is set
 * to that speed, so that pt_cpufreq_dtrace_walk() can later subtract time
 * that was already credited here.
 *
 * cpu indexes g_cpu_power_states directly.
 *
 * NOTE(review): the handling of a negative duration, mentioned in the
 * comment in the body, is not visible here - confirm against the full file.
 */
339 pt_cpufreq_stat_account(double interval
, uint_t cpu
)
341 cpu_power_info_t
*cpu_pow
;
346 cpu_pow
= &g_cpu_power_states
[cpu
];
347 speed
= cpu_pow
->current_pstate
;
349 duration
= (hrtime_t
)(interval
* NANOSEC
) - cpu_pow
->dtrace_time
;
352 * 'duration' may be a negative value when we're using or forcing a
353 * small interval, and the amount of time already accounted ends up
354 * being larger than the the former.
359 for (i
= 0; i
< g_npstates
; i
++) {
360 if (g_pstate_info
[i
].speed
== speed
) {
361 g_pstate_info
[i
].total_time
+= duration
;
362 cpu_pow
->time_accounted
+= duration
;
363 cpu_pow
->speed_accounted
= speed
;
369 * Take a snapshot of each CPU's speed by looking through the cpu_info kstats.
/*
 * Open a kstat handle and record the current speed of the observed CPUs by
 * calling pt_cpufreq_snapshot_cpu(), either for g_observed_cpu alone or,
 * under PT_MODE_DEFAULT, for each index below g_ncpus_observed. The handle
 * is closed afterwards, and a failure to close it is logged through
 * pt_error().
 *
 * The caller treats a non-zero return as failure.
 *
 * NOTE(review): the statement that follows a failed per-CPU snapshot inside
 * the loop, the switch header and the return statements are not visible
 * here - confirm against the full file.
 */
372 pt_cpufreq_snapshot(void)
378 if ((kc
= kstat_open()) == NULL
)
383 ret
= pt_cpufreq_snapshot_cpu(kc
, g_observed_cpu
);
385 case PT_MODE_DEFAULT
:
387 for (i
= 0; i
< g_ncpus_observed
; i
++)
388 if ((ret
= pt_cpufreq_snapshot_cpu(kc
, i
)) != 0)
393 if (kstat_close(kc
) != 0)
394 pt_error("couldn't close %s kstat\n", g_msg_freq_state
);
/*
 * Record the current speed of one CPU.
 *
 * Looks up the cpu_info kstat for g_cpu_table[cpu] on the handle kc, reads
 * it, fetches the current_clock_Hz value and stores HZ2MHZ() of its ui64
 * member in g_cpu_power_states[cpu].current_pstate. Each of the three
 * lookup or read steps has its own pt_error() message.
 *
 * The caller treats a non-zero return as failure.
 *
 * NOTE(review): cpu is a uint_t but the messages format it with a signed
 * int conversion. The failure tests for the two lookups and all return
 * statements are not visible here - confirm against the full file.
 */
400 pt_cpufreq_snapshot_cpu(kstat_ctl_t
*kc
, uint_t cpu
)
405 ksp
= kstat_lookup(kc
, "cpu_info", g_cpu_table
[cpu
], NULL
);
407 pt_error("couldn't find 'cpu_info' kstat for CPU %d\n while "
408 "taking a snapshot of %s\n", cpu
, g_msg_freq_state
);
412 if (kstat_read(kc
, ksp
, NULL
) == -1) {
413 pt_error("couldn't read 'cpu_info' kstat for CPU %d\n while "
414 "taking a snapshot of %s\n", cpu
, g_msg_freq_state
);
418 knp
= kstat_data_lookup(ksp
, "current_clock_Hz");
420 pt_error("couldn't find 'current_clock_Hz' kstat for CPU %d "
421 "while taking a snapshot of %s\n", cpu
, g_msg_freq_state
);
425 g_cpu_power_states
[cpu
].current_pstate
= HZ2MHZ(knp
->value
.ui64
);
431 * DTrace aggregation walker that sorts through a snapshot of the
432 * aggregation data collected during firings of the cpu-change-speed
/*
 * Aggregation walker passed to dtrace_aggregate_walk_keyvarsorted().
 *
 * For a record of the aggregation named times, the CPU id is read from
 * record 1 and the speed from record 2 of the aggregation description, and
 * the aggregated value is read from dtada_percpu[cpu]. The speed is
 * converted with HZ2MHZ() and matched against g_pstate_info.
 *
 * When the CPU has time_accounted pending for the same speed, that amount
 * is subtracted from the value if the value is larger, and the pending
 * bookkeeping (time_accounted, speed_accounted) is cleared. The resulting
 * value is added to the total_time of the matching P-state and to the
 * dtrace_time of the CPU.
 *
 * Returns DTRACE_AGGWALK_NEXT on the visible return paths.
 *
 * NOTE(review): cpu comes from trace data and indexes both dtada_percpu
 * and g_cpu_power_states; no bounds check is visible here. The else branch
 * of the comparison against time_accounted is also not visible - confirm
 * against the full file.
 */
437 pt_cpufreq_dtrace_walk(const dtrace_aggdata_t
*data
, void *arg
)
439 dtrace_aggdesc_t
*aggdesc
= data
->dtada_desc
;
440 dtrace_recdesc_t
*cpu_rec
, *speed_rec
;
441 cpu_power_info_t
*cp
;
447 if (strcmp(aggdesc
->dtagd_name
, "times") == 0) {
448 cpu_rec
= &aggdesc
->dtagd_rec
[1];
449 speed_rec
= &aggdesc
->dtagd_rec
[2];
451 /* LINTED - alignment */
452 cpu
= *(int32_t *)(data
->dtada_data
+ cpu_rec
->dtrd_offset
);
454 /* LINTED - alignment */
455 res
= *((hrtime_t
*)(data
->dtada_percpu
[cpu
]));
457 /* LINTED - alignment */
458 speed
= *(uint64_t *)(data
->dtada_data
+
459 speed_rec
->dtrd_offset
);
464 speed
= HZ2MHZ(speed
);
467 * We have an aggregation record for "cpu" being at "speed"
468 * for an interval of "n" nanoseconds. The reported interval
469 * may exceed the powertop sampling interval, since we only
470 * notice during potentially infrequent firings of the
471 * "speed change" DTrace probe. In this case powertop would
472 * have already accounted for the portions of the interval
473 * that happened during prior powertop samplings, so subtract
474 * out time already accounted.
476 cp
= &g_cpu_power_states
[cpu
];
478 for (i
= 0; i
< g_npstates
; i
++) {
479 if (g_pstate_info
[i
].speed
== speed
) {
481 if (cp
->time_accounted
> 0 &&
482 cp
->speed_accounted
== speed
) {
483 if (res
> cp
->time_accounted
) {
484 res
-= cp
->time_accounted
;
485 cp
->time_accounted
= 0;
486 cp
->speed_accounted
= 0;
488 return (DTRACE_AGGWALK_NEXT
);
492 g_pstate_info
[i
].total_time
+= res
;
493 cp
->dtrace_time
+= res
;
498 return (DTRACE_AGGWALK_NEXT
);
502 * Checks if PM is enabled in /etc/power.conf, enabling if not
/*
 * Query the power management state with pt_cpufreq_check_pm() and register
 * a suggestion through pt_sugg_add(). The visible suggestion asks the user
 * to press the P key to enable CPU power management and is registered with
 * weight 40, key P and the g_msg_freq_enable text.
 *
 * NOTE(review): the dispatch on the value returned by
 * pt_cpufreq_check_pm() and the remaining pt_sugg_add() arguments are not
 * visible here - confirm against the full file.
 */
505 pt_cpufreq_suggest(void)
507 int ret
= pt_cpufreq_check_pm();
511 pt_sugg_add("Suggestion: enable CPU power management by "
512 "pressing the P key", 40, 'P', (char *)g_msg_freq_enable
,
519 * Checks /etc/power.conf and returns:
521 * 0 if CPUPM is not enabled
522 * 1 if there's nothing for us to do because:
523 * (a) the system does not support frequency scaling
524 * (b) there's no power.conf.
525 * 2 if CPUPM is enabled
526 * 3 if the system is running in poll-mode, as opposed to event-mode
528 * Notice the ordering of the return values, they will be picked up and
529 * switched upon ascendingly.
/*
 * Inspect the power management configuration file.
 *
 * Takes an early path when fewer than two P-states were enumerated
 * (g_npstates < 2) or when default_conf cannot be opened for reading.
 * Otherwise the file is read with fgets() into a 1024-byte line buffer.
 * Lines containing the substring cpupm are tested for the substring
 * enable, and a test for the substring poll is also made.
 *
 * NOTE(review): matching is by substring, so a commented-out directive
 * would presumably match as well. The fclose() of the opened file, the
 * return statements and the exact nesting of the poll test are not visible
 * here - confirm that the stream is closed on every path.
 */
532 pt_cpufreq_check_pm(void)
538 if (g_npstates
< 2 || (file
= fopen(default_conf
, "r")) == NULL
)
541 (void) memset(line
, 0, 1024);
543 while (fgets(line
, 1024, file
)) {
544 if (strstr(line
, "cpupm")) {
545 if (strstr(line
, "enable")) {
550 if (strstr(line
, "poll"))
560 * Used as a suggestion, sets PM in /etc/power.conf and
561 * a 1sec threshold, then calls /usr/sbin/pmconfig
564 pt_cpufreq_enable(void)
566 (void) system(cpupm_enable
);
567 (void) system(cpupm_treshold
);
568 (void) system(default_pmconf
);
570 if (pt_sugg_remove(pt_cpufreq_enable
) == 0)
571 pt_error("failed to remove a %s suggestion\n",