Merge remote-tracking branch 'origin/master'
[unleashed/lotheac.git] / usr / src / cmd / powertop / common / cpufreq.c
blob7d1978fba8318cf1d15d2038428042aa7fef3f80
1 /*
2 * Copyright 2009, Intel Corporation
3 * Copyright 2009, Sun Microsystems, Inc
5 * This file is part of PowerTOP
7 * This program file is free software; you can redistribute it and/or modify it
8 * under the terms of the GNU General Public License as published by the
9 * Free Software Foundation; version 2 of the License.
11 * This program is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 * for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with this program in a file named COPYING; if not, write to the
18 * Free Software Foundation, Inc.,
19 * 51 Franklin Street, Fifth Floor,
20 * Boston, MA 02110-1301 USA
22 * Authors:
23 * Arjan van de Ven <arjan@linux.intel.com>
24 * Eric C Saxe <eric.saxe@sun.com>
25 * Aubrey Li <aubrey.li@intel.com>
29 * GPL Disclaimer
31 * For the avoidance of doubt, except that if any license choice other
32 * than GPL or LGPL is available it will apply instead, Sun elects to
33 * use only the General Public License version 2 (GPLv2) at this time
34 * for any software where a choice of GPL license versions is made
35 * available with the language indicating that GPLv2 or any later
36 * version may be used, or where a choice of which version of the GPL
37 * is applied is otherwise unspecified.
40 #include <stdlib.h>
41 #include <string.h>
42 #include <dtrace.h>
43 #include <kstat.h>
44 #include <errno.h>
45 #include "powertop.h"
47 #define HZ2MHZ(speed) ((speed) / MICROSEC)
48 #define DTP_ARG_COUNT 2
49 #define DTP_ARG_LENGTH 5
51 static uint64_t max_cpufreq = 0;
52 static dtrace_hdl_t *dtp;
53 static char **dtp_argv;
56 * Enabling PM through /etc/power.conf
57 * See pt_cpufreq_suggest()
59 static char default_conf[] = "/etc/power.conf";
60 static char default_pmconf[] = "/usr/sbin/pmconfig";
61 static char cpupm_enable[] = "echo cpupm enable >> /etc/power.conf";
62 static char cpupm_treshold[] = "echo cpu-threshold 1s >> /etc/power.conf";
/*
 * D program used to track CPU frequency transitions on every observed CPU.
 *
 * Each firing of cpu-change-speed adds, to @times[cpu, old speed], the time
 * elapsed since that CPU's previous transition, or since BEGIN when the CPU
 * has not been seen changing speed yet.
 */
static const char *dtp_cpufreq =
	/* per-CPU timestamp of the last transition, sized by macro arg $0 */
	"hrtime_t last[$0];"

	/* remember when tracing started */
	"BEGIN" "{" " begin = timestamp;" "}"

	/* this CPU already changed speed before: charge since 'last' */
	":::cpu-change-speed"
	"/last[(processorid_t)arg0] != 0/"
	"{"
	" this->cpu = (processorid_t)arg0;"
	" this->oldspeed = (uint64_t)arg1;"
	" @times[this->cpu, this->oldspeed] = "
	"sum(timestamp - last[this->cpu]);"
	" last[this->cpu] = timestamp;"
	"}"

	/* first transition seen for this CPU: charge since 'begin' */
	":::cpu-change-speed"
	"/last[(processorid_t)arg0] == 0/"
	"{"
	" this->cpu = (processorid_t)arg0;"
	" this->oldspeed = (uint64_t)arg1;"
	" @times[this->cpu, this->oldspeed] = "
	"sum(timestamp - begin);"
	" last[this->cpu] = timestamp;"
	"}";
/*
 * Same as dtp_cpufreq, but only for a specific CPU: both probe clauses are
 * additionally predicated on arg0 matching macro argument $1, and a single
 * scalar 'last' replaces the per-CPU array.
 *
 * Every clause body is explicitly delimited with "{" and "}" so that the
 * assembled program is a well-formed sequence of three D clauses.
 */
static const char *dtp_cpufreq_c =
	"hrtime_t last;"

	/* remember when tracing started */
	"BEGIN"
	"{"
	" begin = timestamp;"
	"}"

	/* the CPU already changed speed before: charge since 'last' */
	":::cpu-change-speed"
	"/(processorid_t)arg0 == $1 &&"
	" last != 0/"
	"{"
	" this->cpu = (processorid_t)arg0;"
	" this->oldspeed = (uint64_t)arg1;"
	" @times[this->cpu, this->oldspeed] = sum(timestamp - last);"
	" last = timestamp;"
	"}"

	/* first transition seen: charge since 'begin' */
	":::cpu-change-speed"
	"/(processorid_t)arg0 == $1 &&"
	" last == 0/"
	"{"
	" this->cpu = (processorid_t)arg0;"
	" this->oldspeed = (uint64_t)arg1;"
	" @times[this->cpu, this->oldspeed] = sum(timestamp - begin);"
	" last = timestamp;"
	"}";
122 static int pt_cpufreq_setup(void);
123 static int pt_cpufreq_snapshot(void);
124 static int pt_cpufreq_dtrace_walk(const dtrace_aggdata_t *, void *);
125 static void pt_cpufreq_stat_account(double, uint_t);
126 static int pt_cpufreq_snapshot_cpu(kstat_ctl_t *, uint_t);
127 static int pt_cpufreq_check_pm(void);
128 static void pt_cpufreq_enable(void);
130 static int
131 pt_cpufreq_setup(void)
133 if ((dtp_argv = malloc(sizeof (char *) * DTP_ARG_COUNT)) == NULL)
134 return (1);
136 if ((dtp_argv[0] = malloc(sizeof (char) * DTP_ARG_LENGTH)) == NULL) {
137 free(dtp_argv);
138 return (1);
141 (void) snprintf(dtp_argv[0], 5, "%d\0", g_ncpus_observed);
143 if (PT_ON_CPU) {
144 if ((dtp_argv[1] = malloc(sizeof (char) * DTP_ARG_LENGTH))
145 == NULL) {
146 free(dtp_argv[0]);
147 free(dtp_argv);
148 return (1);
150 (void) snprintf(dtp_argv[1], 5, "%d\0", g_observed_cpu);
153 return (0);
157 * Perform setup necessary to enumerate and track CPU speed changes
160 pt_cpufreq_stat_prepare(void)
162 dtrace_prog_t *prog;
163 dtrace_proginfo_t info;
164 dtrace_optval_t statustime;
165 kstat_ctl_t *kc;
166 kstat_t *ksp;
167 kstat_named_t *knp;
168 freq_state_info_t *state;
169 char *s, *token, *prog_ptr;
170 int err;
172 if ((err = pt_cpufreq_setup()) != 0) {
173 pt_error("failed to setup %s report (couldn't allocate "
174 "memory)\n", g_msg_freq_state);
175 return (errno);
178 state = g_pstate_info;
179 if ((g_cpu_power_states = calloc((size_t)g_ncpus,
180 sizeof (cpu_power_info_t))) == NULL)
181 return (-1);
184 * Enumerate the CPU frequencies
186 if ((kc = kstat_open()) == NULL)
187 return (errno);
189 ksp = kstat_lookup(kc, "cpu_info", g_cpu_table[g_observed_cpu], NULL);
191 if (ksp == NULL) {
192 err = errno;
193 (void) kstat_close(kc);
194 return (err);
197 (void) kstat_read(kc, ksp, NULL);
199 knp = kstat_data_lookup(ksp, "supported_frequencies_Hz");
200 s = knp->value.str.addr.ptr;
202 g_npstates = 0;
204 for (token = strtok(s, ":"), s = NULL;
205 token != NULL && g_npstates < NSTATES;
206 token = strtok(NULL, ":")) {
208 state->speed = HZ2MHZ(atoll(token));
210 if (state->speed > max_cpufreq)
211 max_cpufreq = state->speed;
213 state->total_time = (uint64_t)0;
215 g_npstates++;
216 state++;
219 if (token != NULL)
220 pt_error("CPU exceeds the supported number of %s\n",
221 g_msg_freq_state);
223 (void) kstat_close(kc);
226 * Return if speed transition is not supported
228 if (g_npstates < 2)
229 return (-1);
232 * Setup DTrace to look for CPU frequency changes
234 if ((dtp = dtrace_open(DTRACE_VERSION, 0, &err)) == NULL) {
235 pt_error("cannot open dtrace library for the %s report: %s\n",
236 g_msg_freq_state, dtrace_errmsg(NULL, err));
237 return (-2);
241 * Execute different scripts (defined above) depending on
242 * user specified options. Default mode uses dtp_cpufreq.
244 if (PT_ON_CPU)
245 prog_ptr = (char *)dtp_cpufreq_c;
246 else
247 prog_ptr = (char *)dtp_cpufreq;
249 if ((prog = dtrace_program_strcompile(dtp, prog_ptr,
250 DTRACE_PROBESPEC_NAME, 0, (1 + g_argc), dtp_argv)) == NULL) {
251 pt_error("failed to compile %s program\n", g_msg_freq_state);
252 return (dtrace_errno(dtp));
255 if (dtrace_program_exec(dtp, prog, &info) == -1) {
256 pt_error("failed to enable %s probes\n", g_msg_freq_state);
257 return (dtrace_errno(dtp));
260 if (dtrace_setopt(dtp, "aggsize", "128k") == -1)
261 pt_error("failed to set %s 'aggsize'\n", g_msg_freq_state);
263 if (dtrace_setopt(dtp, "aggrate", "0") == -1)
264 pt_error("failed to set %s 'aggrate'\n", g_msg_freq_state);
266 if (dtrace_setopt(dtp, "aggpercpu", 0) == -1)
267 pt_error("failed to set %s 'aggpercpu'\n", g_msg_freq_state);
269 if (dtrace_go(dtp) != 0) {
270 pt_error("failed to start %s observation\n", g_msg_freq_state);
271 return (dtrace_errno(dtp));
274 if (dtrace_getopt(dtp, "statusrate", &statustime) == -1) {
275 pt_error("failed to get %s 'statusrate'\n", g_msg_freq_state);
276 return (dtrace_errno(dtp));
279 return (0);
283 * The DTrace probes have already been enabled, and are tracking
284 * CPU speed transitions. Take a snapshot of the aggregations, and
285 * look for any CPUs that have made a speed transition over the last
286 * sampling interval. Note that the aggregations may be empty if no
287 * speed transitions took place over the last interval. In that case,
288 * notate that we have already accounted for the time, so that when
289 * we do encounter a speed transition in a future sampling interval
290 * we can subtract that time back out.
293 pt_cpufreq_stat_collect(double interval)
295 int i, ret;
298 * Zero out the interval time reported by DTrace for
299 * this interval
301 for (i = 0; i < g_npstates; i++)
302 g_pstate_info[i].total_time = 0;
304 for (i = 0; i < g_ncpus; i++)
305 g_cpu_power_states[i].dtrace_time = 0;
307 if (dtrace_status(dtp) == -1)
308 return (-1);
310 if (dtrace_aggregate_snap(dtp) != 0)
311 pt_error("failed to collect data for %s\n", g_msg_freq_state);
313 if (dtrace_aggregate_walk_keyvarsorted(dtp, pt_cpufreq_dtrace_walk,
314 NULL) != 0)
315 pt_error("failed to sort data for %s\n", g_msg_freq_state);
317 dtrace_aggregate_clear(dtp);
319 if ((ret = pt_cpufreq_snapshot()) != 0) {
320 pt_error("failed to snapshot %s state\n", g_msg_freq_state);
321 return (ret);
324 switch (g_op_mode) {
325 case PT_MODE_CPU:
326 pt_cpufreq_stat_account(interval, g_observed_cpu);
327 break;
328 case PT_MODE_DEFAULT:
329 default:
330 for (i = 0; i < g_ncpus_observed; i++)
331 pt_cpufreq_stat_account(interval, i);
332 break;
335 return (0);
338 static void
339 pt_cpufreq_stat_account(double interval, uint_t cpu)
341 cpu_power_info_t *cpu_pow;
342 uint64_t speed;
343 hrtime_t duration;
344 int i;
346 cpu_pow = &g_cpu_power_states[cpu];
347 speed = cpu_pow->current_pstate;
349 duration = (hrtime_t)(interval * NANOSEC) - cpu_pow->dtrace_time;
352 * 'duration' may be a negative value when we're using or forcing a
353 * small interval, and the amount of time already accounted ends up
354 * being larger than the the former.
356 if (duration < 0)
357 return;
359 for (i = 0; i < g_npstates; i++) {
360 if (g_pstate_info[i].speed == speed) {
361 g_pstate_info[i].total_time += duration;
362 cpu_pow->time_accounted += duration;
363 cpu_pow->speed_accounted = speed;
369 * Take a snapshot of each CPU's speed by looking through the cpu_info kstats.
371 static int
372 pt_cpufreq_snapshot(void)
374 kstat_ctl_t *kc;
375 int ret;
376 uint_t i;
378 if ((kc = kstat_open()) == NULL)
379 return (errno);
381 switch (g_op_mode) {
382 case PT_MODE_CPU:
383 ret = pt_cpufreq_snapshot_cpu(kc, g_observed_cpu);
384 break;
385 case PT_MODE_DEFAULT:
386 default:
387 for (i = 0; i < g_ncpus_observed; i++)
388 if ((ret = pt_cpufreq_snapshot_cpu(kc, i)) != 0)
389 break;
390 break;
393 if (kstat_close(kc) != 0)
394 pt_error("couldn't close %s kstat\n", g_msg_freq_state);
396 return (ret);
399 static int
400 pt_cpufreq_snapshot_cpu(kstat_ctl_t *kc, uint_t cpu)
402 kstat_t *ksp;
403 kstat_named_t *knp;
405 ksp = kstat_lookup(kc, "cpu_info", g_cpu_table[cpu], NULL);
406 if (ksp == NULL) {
407 pt_error("couldn't find 'cpu_info' kstat for CPU %d\n while "
408 "taking a snapshot of %s\n", cpu, g_msg_freq_state);
409 return (1);
412 if (kstat_read(kc, ksp, NULL) == -1) {
413 pt_error("couldn't read 'cpu_info' kstat for CPU %d\n while "
414 "taking a snapshot of %s\n", cpu, g_msg_freq_state);
415 return (2);
418 knp = kstat_data_lookup(ksp, "current_clock_Hz");
419 if (knp == NULL) {
420 pt_error("couldn't find 'current_clock_Hz' kstat for CPU %d "
421 "while taking a snapshot of %s\n", cpu, g_msg_freq_state);
422 return (3);
425 g_cpu_power_states[cpu].current_pstate = HZ2MHZ(knp->value.ui64);
427 return (0);
431 * DTrace aggregation walker that sorts through a snapshot of the
432 * aggregation data collected during firings of the cpu-change-speed
433 * probe.
435 /*ARGSUSED*/
436 static int
437 pt_cpufreq_dtrace_walk(const dtrace_aggdata_t *data, void *arg)
439 dtrace_aggdesc_t *aggdesc = data->dtada_desc;
440 dtrace_recdesc_t *cpu_rec, *speed_rec;
441 cpu_power_info_t *cp;
442 int32_t cpu;
443 uint64_t speed;
444 hrtime_t res;
445 int i;
447 if (strcmp(aggdesc->dtagd_name, "times") == 0) {
448 cpu_rec = &aggdesc->dtagd_rec[1];
449 speed_rec = &aggdesc->dtagd_rec[2];
451 /* LINTED - alignment */
452 cpu = *(int32_t *)(data->dtada_data + cpu_rec->dtrd_offset);
454 /* LINTED - alignment */
455 res = *((hrtime_t *)(data->dtada_percpu[cpu]));
457 /* LINTED - alignment */
458 speed = *(uint64_t *)(data->dtada_data +
459 speed_rec->dtrd_offset);
461 if (speed == 0)
462 speed = max_cpufreq;
463 else
464 speed = HZ2MHZ(speed);
467 * We have an aggregation record for "cpu" being at "speed"
468 * for an interval of "n" nanoseconds. The reported interval
469 * may exceed the powertop sampling interval, since we only
470 * notice during potentially infrequent firings of the
471 * "speed change" DTrace probe. In this case powertop would
472 * have already accounted for the portions of the interval
473 * that happened during prior powertop samplings, so subtract
474 * out time already accounted.
476 cp = &g_cpu_power_states[cpu];
478 for (i = 0; i < g_npstates; i++) {
479 if (g_pstate_info[i].speed == speed) {
481 if (cp->time_accounted > 0 &&
482 cp->speed_accounted == speed) {
483 if (res > cp->time_accounted) {
484 res -= cp->time_accounted;
485 cp->time_accounted = 0;
486 cp->speed_accounted = 0;
487 } else {
488 return (DTRACE_AGGWALK_NEXT);
492 g_pstate_info[i].total_time += res;
493 cp->dtrace_time += res;
498 return (DTRACE_AGGWALK_NEXT);
502 * Checks if PM is enabled in /etc/power.conf, enabling if not
504 void
505 pt_cpufreq_suggest(void)
507 int ret = pt_cpufreq_check_pm();
509 switch (ret) {
510 case 0:
511 pt_sugg_add("Suggestion: enable CPU power management by "
512 "pressing the P key", 40, 'P', (char *)g_msg_freq_enable,
513 pt_cpufreq_enable);
514 break;
519 * Checks /etc/power.conf and returns:
521 * 0 if CPUPM is not enabled
522 * 1 if there's nothing for us to do because:
523 * (a) the system does not support frequency scaling
524 * (b) there's no power.conf.
525 * 2 if CPUPM is enabled
526 * 3 if the system is running in poll-mode, as opposed to event-mode
528 * Notice the ordering of the return values, they will be picked up and
529 * switched upon ascendingly.
531 static int
532 pt_cpufreq_check_pm(void)
534 char line[1024];
535 FILE *file;
536 int ret = 0;
538 if (g_npstates < 2 || (file = fopen(default_conf, "r")) == NULL)
539 return (1);
541 (void) memset(line, 0, 1024);
543 while (fgets(line, 1024, file)) {
544 if (strstr(line, "cpupm")) {
545 if (strstr(line, "enable")) {
546 (void) fclose(file);
547 return (2);
550 if (strstr(line, "poll"))
551 ret = 3;
554 (void) fclose(file);
556 return (ret);
560 * Used as a suggestion, sets PM in /etc/power.conf and
561 * a 1sec threshold, then calls /usr/sbin/pmconfig
563 static void
564 pt_cpufreq_enable(void)
566 (void) system(cpupm_enable);
567 (void) system(cpupm_treshold);
568 (void) system(default_pmconf);
570 if (pt_sugg_remove(pt_cpufreq_enable) == 0)
571 pt_error("failed to remove a %s suggestion\n",
572 g_msg_freq_state);