1 // SPDX-License-Identifier: GPL-2.0-only
3 * Common code for Intel Running Average Power Limit (RAPL) support.
4 * Copyright (c) 2019, Intel Corporation.
6 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
8 #include <linux/kernel.h>
9 #include <linux/module.h>
10 #include <linux/list.h>
11 #include <linux/types.h>
12 #include <linux/device.h>
13 #include <linux/slab.h>
14 #include <linux/log2.h>
15 #include <linux/bitmap.h>
16 #include <linux/delay.h>
17 #include <linux/sysfs.h>
18 #include <linux/cpu.h>
19 #include <linux/powercap.h>
20 #include <linux/suspend.h>
21 #include <linux/intel_rapl.h>
22 #include <linux/processor.h>
23 #include <linux/platform_device.h>
25 #include <asm/iosf_mbi.h>
26 #include <asm/cpu_device_id.h>
27 #include <asm/intel-family.h>
30 #define MSR_PLATFORM_POWER_LIMIT 0x0000065C
32 /* bitmasks for RAPL MSRs, used by primitive access functions */
33 #define ENERGY_STATUS_MASK 0xffffffff
35 #define POWER_LIMIT1_MASK 0x7FFF
36 #define POWER_LIMIT1_ENABLE BIT(15)
37 #define POWER_LIMIT1_CLAMP BIT(16)
39 #define POWER_LIMIT2_MASK (0x7FFFULL<<32)
40 #define POWER_LIMIT2_ENABLE BIT_ULL(47)
41 #define POWER_LIMIT2_CLAMP BIT_ULL(48)
42 #define POWER_HIGH_LOCK BIT_ULL(63)
43 #define POWER_LOW_LOCK BIT(31)
45 #define TIME_WINDOW1_MASK (0x7FULL<<17)
46 #define TIME_WINDOW2_MASK (0x7FULL<<49)
48 #define POWER_UNIT_OFFSET 0
49 #define POWER_UNIT_MASK 0x0F
51 #define ENERGY_UNIT_OFFSET 0x08
52 #define ENERGY_UNIT_MASK 0x1F00
54 #define TIME_UNIT_OFFSET 0x10
55 #define TIME_UNIT_MASK 0xF0000
57 #define POWER_INFO_MAX_MASK (0x7fffULL<<32)
58 #define POWER_INFO_MIN_MASK (0x7fffULL<<16)
59 #define POWER_INFO_MAX_TIME_WIN_MASK (0x3fULL<<48)
60 #define POWER_INFO_THERMAL_SPEC_MASK 0x7fff
62 #define PERF_STATUS_THROTTLE_TIME_MASK 0xffffffff
63 #define PP_POLICY_MASK 0x1F
65 /* Non HW constants */
66 #define RAPL_PRIMITIVE_DERIVED BIT(1) /* not from raw data */
67 #define RAPL_PRIMITIVE_DUMMY BIT(2)
69 #define TIME_WINDOW_MAX_MSEC 40000
70 #define TIME_WINDOW_MIN_MSEC 250
71 #define ENERGY_UNIT_SCALE 1000 /* scale from driver unit to powercap unit */
73 ARBITRARY_UNIT
, /* no translation */
79 /* per domain data, some are optional */
80 #define NR_RAW_PRIMITIVES (NR_RAPL_PRIMITIVES - 2)
82 #define DOMAIN_STATE_INACTIVE BIT(0)
83 #define DOMAIN_STATE_POWER_LIMIT_SET BIT(1)
84 #define DOMAIN_STATE_BIOS_LOCKED BIT(2)
86 static const char pl1_name
[] = "long_term";
87 static const char pl2_name
[] = "short_term";
89 #define power_zone_to_rapl_domain(_zone) \
90 container_of(_zone, struct rapl_domain, power_zone)
92 struct rapl_defaults
{
93 u8 floor_freq_reg_addr
;
94 int (*check_unit
)(struct rapl_package
*rp
, int cpu
);
95 void (*set_floor_freq
)(struct rapl_domain
*rd
, bool mode
);
96 u64 (*compute_time_window
)(struct rapl_package
*rp
, u64 val
,
98 unsigned int dram_domain_energy_unit
;
100 static struct rapl_defaults
*rapl_defaults
;
102 /* Sideband MBI registers */
103 #define IOSF_CPU_POWER_BUDGET_CTL_BYT (0x2)
104 #define IOSF_CPU_POWER_BUDGET_CTL_TNG (0xdf)
106 #define PACKAGE_PLN_INT_SAVED BIT(0)
107 #define MAX_PRIM_NAME (32)
109 /* Per domain data, used to describe individual knobs so that access functions
110 * can be consolidated into one instead of many inline functions.
112 struct rapl_primitive_info
{
116 enum rapl_domain_reg_id id
;
121 #define PRIMITIVE_INFO_INIT(p, m, s, i, u, f) { \
130 static void rapl_init_domains(struct rapl_package
*rp
);
131 static int rapl_read_data_raw(struct rapl_domain
*rd
,
132 enum rapl_primitives prim
,
133 bool xlate
, u64
*data
);
134 static int rapl_write_data_raw(struct rapl_domain
*rd
,
135 enum rapl_primitives prim
,
136 unsigned long long value
);
137 static u64
rapl_unit_xlate(struct rapl_domain
*rd
,
138 enum unit_type type
, u64 value
, int to_raw
);
139 static void package_power_limit_irq_save(struct rapl_package
*rp
);
141 static LIST_HEAD(rapl_packages
); /* guarded by CPU hotplug lock */
143 static const char *const rapl_domain_names
[] = {
151 static int get_energy_counter(struct powercap_zone
*power_zone
,
154 struct rapl_domain
*rd
;
157 /* prevent CPU hotplug, make sure the RAPL domain does not go
158 * away while reading the counter.
161 rd
= power_zone_to_rapl_domain(power_zone
);
163 if (!rapl_read_data_raw(rd
, ENERGY_COUNTER
, true, &energy_now
)) {
164 *energy_raw
= energy_now
;
174 static int get_max_energy_counter(struct powercap_zone
*pcd_dev
, u64
*energy
)
176 struct rapl_domain
*rd
= power_zone_to_rapl_domain(pcd_dev
);
178 *energy
= rapl_unit_xlate(rd
, ENERGY_UNIT
, ENERGY_STATUS_MASK
, 0);
182 static int release_zone(struct powercap_zone
*power_zone
)
184 struct rapl_domain
*rd
= power_zone_to_rapl_domain(power_zone
);
185 struct rapl_package
*rp
= rd
->rp
;
187 /* package zone is the last zone of a package, we can free
188 * memory here since all children has been unregistered.
190 if (rd
->id
== RAPL_DOMAIN_PACKAGE
) {
199 static int find_nr_power_limit(struct rapl_domain
*rd
)
203 for (i
= 0; i
< NR_POWER_LIMITS
; i
++) {
211 static int set_domain_enable(struct powercap_zone
*power_zone
, bool mode
)
213 struct rapl_domain
*rd
= power_zone_to_rapl_domain(power_zone
);
215 if (rd
->state
& DOMAIN_STATE_BIOS_LOCKED
)
219 rapl_write_data_raw(rd
, PL1_ENABLE
, mode
);
220 if (rapl_defaults
->set_floor_freq
)
221 rapl_defaults
->set_floor_freq(rd
, mode
);
227 static int get_domain_enable(struct powercap_zone
*power_zone
, bool *mode
)
229 struct rapl_domain
*rd
= power_zone_to_rapl_domain(power_zone
);
232 if (rd
->state
& DOMAIN_STATE_BIOS_LOCKED
) {
237 if (rapl_read_data_raw(rd
, PL1_ENABLE
, true, &val
)) {
247 /* per RAPL domain ops, in the order of rapl_domain_type */
248 static const struct powercap_zone_ops zone_ops
[] = {
249 /* RAPL_DOMAIN_PACKAGE */
251 .get_energy_uj
= get_energy_counter
,
252 .get_max_energy_range_uj
= get_max_energy_counter
,
253 .release
= release_zone
,
254 .set_enable
= set_domain_enable
,
255 .get_enable
= get_domain_enable
,
257 /* RAPL_DOMAIN_PP0 */
259 .get_energy_uj
= get_energy_counter
,
260 .get_max_energy_range_uj
= get_max_energy_counter
,
261 .release
= release_zone
,
262 .set_enable
= set_domain_enable
,
263 .get_enable
= get_domain_enable
,
265 /* RAPL_DOMAIN_PP1 */
267 .get_energy_uj
= get_energy_counter
,
268 .get_max_energy_range_uj
= get_max_energy_counter
,
269 .release
= release_zone
,
270 .set_enable
= set_domain_enable
,
271 .get_enable
= get_domain_enable
,
273 /* RAPL_DOMAIN_DRAM */
275 .get_energy_uj
= get_energy_counter
,
276 .get_max_energy_range_uj
= get_max_energy_counter
,
277 .release
= release_zone
,
278 .set_enable
= set_domain_enable
,
279 .get_enable
= get_domain_enable
,
281 /* RAPL_DOMAIN_PLATFORM */
283 .get_energy_uj
= get_energy_counter
,
284 .get_max_energy_range_uj
= get_max_energy_counter
,
285 .release
= release_zone
,
286 .set_enable
= set_domain_enable
,
287 .get_enable
= get_domain_enable
,
292 * Constraint index used by powercap can be different than power limit (PL)
293 * index in that some PLs may be missing due to non-existent MSRs. So we
294 * need to convert here by finding the valid PLs only (name populated).
296 static int contraint_to_pl(struct rapl_domain
*rd
, int cid
)
300 for (i
= 0, j
= 0; i
< NR_POWER_LIMITS
; i
++) {
301 if ((rd
->rpl
[i
].name
) && j
++ == cid
) {
302 pr_debug("%s: index %d\n", __func__
, i
);
306 pr_err("Cannot find matching power limit for constraint %d\n", cid
);
311 static int set_power_limit(struct powercap_zone
*power_zone
, int cid
,
314 struct rapl_domain
*rd
;
315 struct rapl_package
*rp
;
320 rd
= power_zone_to_rapl_domain(power_zone
);
321 id
= contraint_to_pl(rd
, cid
);
329 if (rd
->state
& DOMAIN_STATE_BIOS_LOCKED
) {
330 dev_warn(&power_zone
->dev
,
331 "%s locked by BIOS, monitoring only\n", rd
->name
);
336 switch (rd
->rpl
[id
].prim_id
) {
338 rapl_write_data_raw(rd
, POWER_LIMIT1
, power_limit
);
341 rapl_write_data_raw(rd
, POWER_LIMIT2
, power_limit
);
347 package_power_limit_irq_save(rp
);
353 static int get_current_power_limit(struct powercap_zone
*power_zone
, int cid
,
356 struct rapl_domain
*rd
;
363 rd
= power_zone_to_rapl_domain(power_zone
);
364 id
= contraint_to_pl(rd
, cid
);
370 switch (rd
->rpl
[id
].prim_id
) {
381 if (rapl_read_data_raw(rd
, prim
, true, &val
))
392 static int set_time_window(struct powercap_zone
*power_zone
, int cid
,
395 struct rapl_domain
*rd
;
400 rd
= power_zone_to_rapl_domain(power_zone
);
401 id
= contraint_to_pl(rd
, cid
);
407 switch (rd
->rpl
[id
].prim_id
) {
409 rapl_write_data_raw(rd
, TIME_WINDOW1
, window
);
412 rapl_write_data_raw(rd
, TIME_WINDOW2
, window
);
423 static int get_time_window(struct powercap_zone
*power_zone
, int cid
,
426 struct rapl_domain
*rd
;
432 rd
= power_zone_to_rapl_domain(power_zone
);
433 id
= contraint_to_pl(rd
, cid
);
439 switch (rd
->rpl
[id
].prim_id
) {
441 ret
= rapl_read_data_raw(rd
, TIME_WINDOW1
, true, &val
);
444 ret
= rapl_read_data_raw(rd
, TIME_WINDOW2
, true, &val
);
459 static const char *get_constraint_name(struct powercap_zone
*power_zone
,
462 struct rapl_domain
*rd
;
465 rd
= power_zone_to_rapl_domain(power_zone
);
466 id
= contraint_to_pl(rd
, cid
);
468 return rd
->rpl
[id
].name
;
473 static int get_max_power(struct powercap_zone
*power_zone
, int id
, u64
*data
)
475 struct rapl_domain
*rd
;
481 rd
= power_zone_to_rapl_domain(power_zone
);
482 switch (rd
->rpl
[id
].prim_id
) {
484 prim
= THERMAL_SPEC_POWER
;
493 if (rapl_read_data_raw(rd
, prim
, true, &val
))
503 static const struct powercap_zone_constraint_ops constraint_ops
= {
504 .set_power_limit_uw
= set_power_limit
,
505 .get_power_limit_uw
= get_current_power_limit
,
506 .set_time_window_us
= set_time_window
,
507 .get_time_window_us
= get_time_window
,
508 .get_max_power_uw
= get_max_power
,
509 .get_name
= get_constraint_name
,
512 /* called after domain detection and package level data are set */
513 static void rapl_init_domains(struct rapl_package
*rp
)
515 enum rapl_domain_type i
;
516 enum rapl_domain_reg_id j
;
517 struct rapl_domain
*rd
= rp
->domains
;
519 for (i
= 0; i
< RAPL_DOMAIN_MAX
; i
++) {
520 unsigned int mask
= rp
->domain_map
& (1 << i
);
526 rd
->name
= rapl_domain_names
[i
];
528 rd
->rpl
[0].prim_id
= PL1_ENABLE
;
529 rd
->rpl
[0].name
= pl1_name
;
530 /* some domain may support two power limits */
531 if (rp
->priv
->limits
[i
] == 2) {
532 rd
->rpl
[1].prim_id
= PL2_ENABLE
;
533 rd
->rpl
[1].name
= pl2_name
;
536 for (j
= 0; j
< RAPL_DOMAIN_REG_MAX
; j
++)
537 rd
->regs
[j
] = rp
->priv
->regs
[i
][j
];
539 if (i
== RAPL_DOMAIN_DRAM
) {
540 rd
->domain_energy_unit
=
541 rapl_defaults
->dram_domain_energy_unit
;
542 if (rd
->domain_energy_unit
)
543 pr_info("DRAM domain energy unit %dpj\n",
544 rd
->domain_energy_unit
);
550 static u64
rapl_unit_xlate(struct rapl_domain
*rd
, enum unit_type type
,
551 u64 value
, int to_raw
)
554 struct rapl_package
*rp
= rd
->rp
;
559 units
= rp
->power_unit
;
562 scale
= ENERGY_UNIT_SCALE
;
563 /* per domain unit takes precedence */
564 if (rd
->domain_energy_unit
)
565 units
= rd
->domain_energy_unit
;
567 units
= rp
->energy_unit
;
570 return rapl_defaults
->compute_time_window(rp
, value
, to_raw
);
577 return div64_u64(value
, units
) * scale
;
581 return div64_u64(value
, scale
);
584 /* in the order of enum rapl_primitives */
585 static struct rapl_primitive_info rpi
[] = {
586 /* name, mask, shift, msr index, unit divisor */
587 PRIMITIVE_INFO_INIT(ENERGY_COUNTER
, ENERGY_STATUS_MASK
, 0,
588 RAPL_DOMAIN_REG_STATUS
, ENERGY_UNIT
, 0),
589 PRIMITIVE_INFO_INIT(POWER_LIMIT1
, POWER_LIMIT1_MASK
, 0,
590 RAPL_DOMAIN_REG_LIMIT
, POWER_UNIT
, 0),
591 PRIMITIVE_INFO_INIT(POWER_LIMIT2
, POWER_LIMIT2_MASK
, 32,
592 RAPL_DOMAIN_REG_LIMIT
, POWER_UNIT
, 0),
593 PRIMITIVE_INFO_INIT(FW_LOCK
, POWER_LOW_LOCK
, 31,
594 RAPL_DOMAIN_REG_LIMIT
, ARBITRARY_UNIT
, 0),
595 PRIMITIVE_INFO_INIT(PL1_ENABLE
, POWER_LIMIT1_ENABLE
, 15,
596 RAPL_DOMAIN_REG_LIMIT
, ARBITRARY_UNIT
, 0),
597 PRIMITIVE_INFO_INIT(PL1_CLAMP
, POWER_LIMIT1_CLAMP
, 16,
598 RAPL_DOMAIN_REG_LIMIT
, ARBITRARY_UNIT
, 0),
599 PRIMITIVE_INFO_INIT(PL2_ENABLE
, POWER_LIMIT2_ENABLE
, 47,
600 RAPL_DOMAIN_REG_LIMIT
, ARBITRARY_UNIT
, 0),
601 PRIMITIVE_INFO_INIT(PL2_CLAMP
, POWER_LIMIT2_CLAMP
, 48,
602 RAPL_DOMAIN_REG_LIMIT
, ARBITRARY_UNIT
, 0),
603 PRIMITIVE_INFO_INIT(TIME_WINDOW1
, TIME_WINDOW1_MASK
, 17,
604 RAPL_DOMAIN_REG_LIMIT
, TIME_UNIT
, 0),
605 PRIMITIVE_INFO_INIT(TIME_WINDOW2
, TIME_WINDOW2_MASK
, 49,
606 RAPL_DOMAIN_REG_LIMIT
, TIME_UNIT
, 0),
607 PRIMITIVE_INFO_INIT(THERMAL_SPEC_POWER
, POWER_INFO_THERMAL_SPEC_MASK
,
608 0, RAPL_DOMAIN_REG_INFO
, POWER_UNIT
, 0),
609 PRIMITIVE_INFO_INIT(MAX_POWER
, POWER_INFO_MAX_MASK
, 32,
610 RAPL_DOMAIN_REG_INFO
, POWER_UNIT
, 0),
611 PRIMITIVE_INFO_INIT(MIN_POWER
, POWER_INFO_MIN_MASK
, 16,
612 RAPL_DOMAIN_REG_INFO
, POWER_UNIT
, 0),
613 PRIMITIVE_INFO_INIT(MAX_TIME_WINDOW
, POWER_INFO_MAX_TIME_WIN_MASK
, 48,
614 RAPL_DOMAIN_REG_INFO
, TIME_UNIT
, 0),
615 PRIMITIVE_INFO_INIT(THROTTLED_TIME
, PERF_STATUS_THROTTLE_TIME_MASK
, 0,
616 RAPL_DOMAIN_REG_PERF
, TIME_UNIT
, 0),
617 PRIMITIVE_INFO_INIT(PRIORITY_LEVEL
, PP_POLICY_MASK
, 0,
618 RAPL_DOMAIN_REG_POLICY
, ARBITRARY_UNIT
, 0),
620 PRIMITIVE_INFO_INIT(AVERAGE_POWER
, 0, 0, 0, POWER_UNIT
,
621 RAPL_PRIMITIVE_DERIVED
),
625 /* Read primitive data based on its related struct rapl_primitive_info.
626 * If the xlate flag is set, return translated data based on data units, i.e.
627 * time, energy, and power.
628 * RAPL MSRs are non-architectural and are not laid out consistently across
629 * domains. Here we use primitive info to allow writing consolidated access
631 * For a given primitive, it is processed by MSR mask and shift. Unit conversion
632 * is pre-assigned based on RAPL unit MSRs read at init time.
633 * 63-------------------------- 31--------------------------- 0
635 * | |<- shift ----------------|
636 * 63-------------------------- 31--------------------------- 0
638 static int rapl_read_data_raw(struct rapl_domain
*rd
,
639 enum rapl_primitives prim
, bool xlate
, u64
*data
)
642 struct rapl_primitive_info
*rp
= &rpi
[prim
];
643 struct reg_action ra
;
646 if (!rp
->name
|| rp
->flag
& RAPL_PRIMITIVE_DUMMY
)
649 ra
.reg
= rd
->regs
[rp
->id
];
653 cpu
= rd
->rp
->lead_cpu
;
655 /* domain with 2 limits has different bit */
656 if (prim
== FW_LOCK
&& rd
->rp
->priv
->limits
[rd
->id
] == 2) {
657 rp
->mask
= POWER_HIGH_LOCK
;
660 /* non-hardware data are collected by the polling thread */
661 if (rp
->flag
& RAPL_PRIMITIVE_DERIVED
) {
662 *data
= rd
->rdd
.primitives
[prim
];
668 if (rd
->rp
->priv
->read_raw(cpu
, &ra
)) {
669 pr_debug("failed to read reg 0x%llx on cpu %d\n", ra
.reg
, cpu
);
673 value
= ra
.value
>> rp
->shift
;
676 *data
= rapl_unit_xlate(rd
, rp
->unit
, value
, 0);
683 /* Similar use of primitive info in the read counterpart */
684 static int rapl_write_data_raw(struct rapl_domain
*rd
,
685 enum rapl_primitives prim
,
686 unsigned long long value
)
688 struct rapl_primitive_info
*rp
= &rpi
[prim
];
691 struct reg_action ra
;
694 cpu
= rd
->rp
->lead_cpu
;
695 bits
= rapl_unit_xlate(rd
, rp
->unit
, value
, 1);
699 memset(&ra
, 0, sizeof(ra
));
701 ra
.reg
= rd
->regs
[rp
->id
];
705 ret
= rd
->rp
->priv
->write_raw(cpu
, &ra
);
711 * Raw RAPL data stored in MSRs are in certain scales. We need to
712 * convert them into standard units based on the units reported in
713 * the RAPL unit MSRs. This is specific to CPUs as the method to
714 * calculate units differ on different CPUs.
715 * We convert the units to below format based on CPUs.
717 * energy unit: picoJoules : Represented in picoJoules by default
718 * power unit : microWatts : Represented in milliWatts by default
719 * time unit : microseconds: Represented in seconds by default
721 static int rapl_check_unit_core(struct rapl_package
*rp
, int cpu
)
723 struct reg_action ra
;
726 ra
.reg
= rp
->priv
->reg_unit
;
728 if (rp
->priv
->read_raw(cpu
, &ra
)) {
729 pr_err("Failed to read power unit REG 0x%llx on CPU %d, exit.\n",
730 rp
->priv
->reg_unit
, cpu
);
734 value
= (ra
.value
& ENERGY_UNIT_MASK
) >> ENERGY_UNIT_OFFSET
;
735 rp
->energy_unit
= ENERGY_UNIT_SCALE
* 1000000 / (1 << value
);
737 value
= (ra
.value
& POWER_UNIT_MASK
) >> POWER_UNIT_OFFSET
;
738 rp
->power_unit
= 1000000 / (1 << value
);
740 value
= (ra
.value
& TIME_UNIT_MASK
) >> TIME_UNIT_OFFSET
;
741 rp
->time_unit
= 1000000 / (1 << value
);
743 pr_debug("Core CPU %s energy=%dpJ, time=%dus, power=%duW\n",
744 rp
->name
, rp
->energy_unit
, rp
->time_unit
, rp
->power_unit
);
749 static int rapl_check_unit_atom(struct rapl_package
*rp
, int cpu
)
751 struct reg_action ra
;
754 ra
.reg
= rp
->priv
->reg_unit
;
756 if (rp
->priv
->read_raw(cpu
, &ra
)) {
757 pr_err("Failed to read power unit REG 0x%llx on CPU %d, exit.\n",
758 rp
->priv
->reg_unit
, cpu
);
762 value
= (ra
.value
& ENERGY_UNIT_MASK
) >> ENERGY_UNIT_OFFSET
;
763 rp
->energy_unit
= ENERGY_UNIT_SCALE
* 1 << value
;
765 value
= (ra
.value
& POWER_UNIT_MASK
) >> POWER_UNIT_OFFSET
;
766 rp
->power_unit
= (1 << value
) * 1000;
768 value
= (ra
.value
& TIME_UNIT_MASK
) >> TIME_UNIT_OFFSET
;
769 rp
->time_unit
= 1000000 / (1 << value
);
771 pr_debug("Atom %s energy=%dpJ, time=%dus, power=%duW\n",
772 rp
->name
, rp
->energy_unit
, rp
->time_unit
, rp
->power_unit
);
777 static void power_limit_irq_save_cpu(void *info
)
780 struct rapl_package
*rp
= (struct rapl_package
*)info
;
782 /* save the state of PLN irq mask bit before disabling it */
783 rdmsr_safe(MSR_IA32_PACKAGE_THERM_INTERRUPT
, &l
, &h
);
784 if (!(rp
->power_limit_irq
& PACKAGE_PLN_INT_SAVED
)) {
785 rp
->power_limit_irq
= l
& PACKAGE_THERM_INT_PLN_ENABLE
;
786 rp
->power_limit_irq
|= PACKAGE_PLN_INT_SAVED
;
788 l
&= ~PACKAGE_THERM_INT_PLN_ENABLE
;
789 wrmsr_safe(MSR_IA32_PACKAGE_THERM_INTERRUPT
, l
, h
);
793 * When package power limit is set artificially low by RAPL, LVT
794 * thermal interrupt for package power limit should be ignored
795 * since we are not really exceeding the real limit. The intention
796 * is to avoid excessive interrupts while we are trying to save power.
797 * A useful feature might be routing the package_power_limit interrupt
798 * to userspace via eventfd. Once we have a use case, this is simple
799 * to do by adding an atomic notifier.
802 static void package_power_limit_irq_save(struct rapl_package
*rp
)
804 if (!boot_cpu_has(X86_FEATURE_PTS
) || !boot_cpu_has(X86_FEATURE_PLN
))
807 smp_call_function_single(rp
->lead_cpu
, power_limit_irq_save_cpu
, rp
, 1);
811 * Restore per package power limit interrupt enable state. Called from cpu
812 * hotplug code on package removal.
814 static void package_power_limit_irq_restore(struct rapl_package
*rp
)
818 if (!boot_cpu_has(X86_FEATURE_PTS
) || !boot_cpu_has(X86_FEATURE_PLN
))
821 /* irq enable state not saved, nothing to restore */
822 if (!(rp
->power_limit_irq
& PACKAGE_PLN_INT_SAVED
))
825 rdmsr_safe(MSR_IA32_PACKAGE_THERM_INTERRUPT
, &l
, &h
);
827 if (rp
->power_limit_irq
& PACKAGE_THERM_INT_PLN_ENABLE
)
828 l
|= PACKAGE_THERM_INT_PLN_ENABLE
;
830 l
&= ~PACKAGE_THERM_INT_PLN_ENABLE
;
832 wrmsr_safe(MSR_IA32_PACKAGE_THERM_INTERRUPT
, l
, h
);
835 static void set_floor_freq_default(struct rapl_domain
*rd
, bool mode
)
837 int nr_powerlimit
= find_nr_power_limit(rd
);
839 /* always enable clamp such that p-state can go below OS requested
840 * range. Power capping takes priority over guaranteed frequency.
842 rapl_write_data_raw(rd
, PL1_CLAMP
, mode
);
844 /* some domains have pl2 */
845 if (nr_powerlimit
> 1) {
846 rapl_write_data_raw(rd
, PL2_ENABLE
, mode
);
847 rapl_write_data_raw(rd
, PL2_CLAMP
, mode
);
851 static void set_floor_freq_atom(struct rapl_domain
*rd
, bool enable
)
853 static u32 power_ctrl_orig_val
;
856 if (!rapl_defaults
->floor_freq_reg_addr
) {
857 pr_err("Invalid floor frequency config register\n");
861 if (!power_ctrl_orig_val
)
862 iosf_mbi_read(BT_MBI_UNIT_PMC
, MBI_CR_READ
,
863 rapl_defaults
->floor_freq_reg_addr
,
864 &power_ctrl_orig_val
);
865 mdata
= power_ctrl_orig_val
;
867 mdata
&= ~(0x7f << 8);
870 iosf_mbi_write(BT_MBI_UNIT_PMC
, MBI_CR_WRITE
,
871 rapl_defaults
->floor_freq_reg_addr
, mdata
);
874 static u64
rapl_compute_time_window_core(struct rapl_package
*rp
, u64 value
,
877 u64 f
, y
; /* fraction and exp. used for time unit */
880 * Special processing based on 2^Y*(1+F/4), refer
881 * to Intel Software Developer's manual Vol.3B: CH 14.9.3.
884 f
= (value
& 0x60) >> 5;
886 value
= (1 << y
) * (4 + f
) * rp
->time_unit
/ 4;
888 do_div(value
, rp
->time_unit
);
890 f
= div64_u64(4 * (value
- (1 << y
)), 1 << y
);
891 value
= (y
& 0x1f) | ((f
& 0x3) << 5);
896 static u64
rapl_compute_time_window_atom(struct rapl_package
*rp
, u64 value
,
900 * Atom time unit encoding is straightforward: val * time_unit,
901 * where time_unit defaults to 1 sec. Never 0.
904 return (value
) ? value
*= rp
->time_unit
: rp
->time_unit
;
906 value
= div64_u64(value
, rp
->time_unit
);
911 static const struct rapl_defaults rapl_defaults_core
= {
912 .floor_freq_reg_addr
= 0,
913 .check_unit
= rapl_check_unit_core
,
914 .set_floor_freq
= set_floor_freq_default
,
915 .compute_time_window
= rapl_compute_time_window_core
,
918 static const struct rapl_defaults rapl_defaults_hsw_server
= {
919 .check_unit
= rapl_check_unit_core
,
920 .set_floor_freq
= set_floor_freq_default
,
921 .compute_time_window
= rapl_compute_time_window_core
,
922 .dram_domain_energy_unit
= 15300,
925 static const struct rapl_defaults rapl_defaults_byt
= {
926 .floor_freq_reg_addr
= IOSF_CPU_POWER_BUDGET_CTL_BYT
,
927 .check_unit
= rapl_check_unit_atom
,
928 .set_floor_freq
= set_floor_freq_atom
,
929 .compute_time_window
= rapl_compute_time_window_atom
,
932 static const struct rapl_defaults rapl_defaults_tng
= {
933 .floor_freq_reg_addr
= IOSF_CPU_POWER_BUDGET_CTL_TNG
,
934 .check_unit
= rapl_check_unit_atom
,
935 .set_floor_freq
= set_floor_freq_atom
,
936 .compute_time_window
= rapl_compute_time_window_atom
,
939 static const struct rapl_defaults rapl_defaults_ann
= {
940 .floor_freq_reg_addr
= 0,
941 .check_unit
= rapl_check_unit_atom
,
942 .set_floor_freq
= NULL
,
943 .compute_time_window
= rapl_compute_time_window_atom
,
946 static const struct rapl_defaults rapl_defaults_cht
= {
947 .floor_freq_reg_addr
= 0,
948 .check_unit
= rapl_check_unit_atom
,
949 .set_floor_freq
= NULL
,
950 .compute_time_window
= rapl_compute_time_window_atom
,
953 static const struct x86_cpu_id rapl_ids
[] __initconst
= {
954 INTEL_CPU_FAM6(SANDYBRIDGE
, rapl_defaults_core
),
955 INTEL_CPU_FAM6(SANDYBRIDGE_X
, rapl_defaults_core
),
957 INTEL_CPU_FAM6(IVYBRIDGE
, rapl_defaults_core
),
958 INTEL_CPU_FAM6(IVYBRIDGE_X
, rapl_defaults_core
),
960 INTEL_CPU_FAM6(HASWELL
, rapl_defaults_core
),
961 INTEL_CPU_FAM6(HASWELL_L
, rapl_defaults_core
),
962 INTEL_CPU_FAM6(HASWELL_G
, rapl_defaults_core
),
963 INTEL_CPU_FAM6(HASWELL_X
, rapl_defaults_hsw_server
),
965 INTEL_CPU_FAM6(BROADWELL
, rapl_defaults_core
),
966 INTEL_CPU_FAM6(BROADWELL_G
, rapl_defaults_core
),
967 INTEL_CPU_FAM6(BROADWELL_D
, rapl_defaults_core
),
968 INTEL_CPU_FAM6(BROADWELL_X
, rapl_defaults_hsw_server
),
970 INTEL_CPU_FAM6(SKYLAKE
, rapl_defaults_core
),
971 INTEL_CPU_FAM6(SKYLAKE_L
, rapl_defaults_core
),
972 INTEL_CPU_FAM6(SKYLAKE_X
, rapl_defaults_hsw_server
),
973 INTEL_CPU_FAM6(KABYLAKE_L
, rapl_defaults_core
),
974 INTEL_CPU_FAM6(KABYLAKE
, rapl_defaults_core
),
975 INTEL_CPU_FAM6(CANNONLAKE_L
, rapl_defaults_core
),
976 INTEL_CPU_FAM6(ICELAKE_L
, rapl_defaults_core
),
977 INTEL_CPU_FAM6(ICELAKE
, rapl_defaults_core
),
978 INTEL_CPU_FAM6(ICELAKE_NNPI
, rapl_defaults_core
),
979 INTEL_CPU_FAM6(ICELAKE_X
, rapl_defaults_hsw_server
),
980 INTEL_CPU_FAM6(ICELAKE_D
, rapl_defaults_hsw_server
),
982 INTEL_CPU_FAM6(ATOM_SILVERMONT
, rapl_defaults_byt
),
983 INTEL_CPU_FAM6(ATOM_AIRMONT
, rapl_defaults_cht
),
984 INTEL_CPU_FAM6(ATOM_SILVERMONT_MID
, rapl_defaults_tng
),
985 INTEL_CPU_FAM6(ATOM_AIRMONT_MID
, rapl_defaults_ann
),
986 INTEL_CPU_FAM6(ATOM_GOLDMONT
, rapl_defaults_core
),
987 INTEL_CPU_FAM6(ATOM_GOLDMONT_PLUS
, rapl_defaults_core
),
988 INTEL_CPU_FAM6(ATOM_GOLDMONT_D
, rapl_defaults_core
),
989 INTEL_CPU_FAM6(ATOM_TREMONT_D
, rapl_defaults_core
),
991 INTEL_CPU_FAM6(XEON_PHI_KNL
, rapl_defaults_hsw_server
),
992 INTEL_CPU_FAM6(XEON_PHI_KNM
, rapl_defaults_hsw_server
),
996 MODULE_DEVICE_TABLE(x86cpu
, rapl_ids
);
998 /* Read once for all raw primitive data for domains */
999 static void rapl_update_domain_data(struct rapl_package
*rp
)
1004 for (dmn
= 0; dmn
< rp
->nr_domains
; dmn
++) {
1005 pr_debug("update %s domain %s data\n", rp
->name
,
1006 rp
->domains
[dmn
].name
);
1007 /* exclude non-raw primitives */
1008 for (prim
= 0; prim
< NR_RAW_PRIMITIVES
; prim
++) {
1009 if (!rapl_read_data_raw(&rp
->domains
[dmn
], prim
,
1010 rpi
[prim
].unit
, &val
))
1011 rp
->domains
[dmn
].rdd
.primitives
[prim
] = val
;
1017 static int rapl_package_register_powercap(struct rapl_package
*rp
)
1019 struct rapl_domain
*rd
;
1020 struct powercap_zone
*power_zone
= NULL
;
1023 /* Update the domain data of the new package */
1024 rapl_update_domain_data(rp
);
1026 /* first we register package domain as the parent zone */
1027 for (rd
= rp
->domains
; rd
< rp
->domains
+ rp
->nr_domains
; rd
++) {
1028 if (rd
->id
== RAPL_DOMAIN_PACKAGE
) {
1029 nr_pl
= find_nr_power_limit(rd
);
1030 pr_debug("register package domain %s\n", rp
->name
);
1031 power_zone
= powercap_register_zone(&rd
->power_zone
,
1032 rp
->priv
->control_type
, rp
->name
,
1033 NULL
, &zone_ops
[rd
->id
], nr_pl
,
1035 if (IS_ERR(power_zone
)) {
1036 pr_debug("failed to register power zone %s\n",
1038 return PTR_ERR(power_zone
);
1040 /* track parent zone in per package/socket data */
1041 rp
->power_zone
= power_zone
;
1042 /* done, only one package domain per socket */
1047 pr_err("no package domain found, unknown topology!\n");
1050 /* now register domains as children of the socket/package */
1051 for (rd
= rp
->domains
; rd
< rp
->domains
+ rp
->nr_domains
; rd
++) {
1052 if (rd
->id
== RAPL_DOMAIN_PACKAGE
)
1054 /* number of power limits per domain varies */
1055 nr_pl
= find_nr_power_limit(rd
);
1056 power_zone
= powercap_register_zone(&rd
->power_zone
,
1057 rp
->priv
->control_type
,
1058 rd
->name
, rp
->power_zone
,
1059 &zone_ops
[rd
->id
], nr_pl
,
1062 if (IS_ERR(power_zone
)) {
1063 pr_debug("failed to register power_zone, %s:%s\n",
1064 rp
->name
, rd
->name
);
1065 ret
= PTR_ERR(power_zone
);
1073 * Clean up previously initialized domains within the package if we
1074 * failed after the first domain setup.
1076 while (--rd
>= rp
->domains
) {
1077 pr_debug("unregister %s domain %s\n", rp
->name
, rd
->name
);
1078 powercap_unregister_zone(rp
->priv
->control_type
,
1085 int rapl_add_platform_domain(struct rapl_if_priv
*priv
)
1087 struct rapl_domain
*rd
;
1088 struct powercap_zone
*power_zone
;
1089 struct reg_action ra
;
1092 ra
.reg
= priv
->regs
[RAPL_DOMAIN_PLATFORM
][RAPL_DOMAIN_REG_STATUS
];
1094 ret
= priv
->read_raw(0, &ra
);
1095 if (ret
|| !ra
.value
)
1098 ra
.reg
= priv
->regs
[RAPL_DOMAIN_PLATFORM
][RAPL_DOMAIN_REG_LIMIT
];
1100 ret
= priv
->read_raw(0, &ra
);
1101 if (ret
|| !ra
.value
)
1104 rd
= kzalloc(sizeof(*rd
), GFP_KERNEL
);
1108 rd
->name
= rapl_domain_names
[RAPL_DOMAIN_PLATFORM
];
1109 rd
->id
= RAPL_DOMAIN_PLATFORM
;
1110 rd
->regs
[RAPL_DOMAIN_REG_LIMIT
] =
1111 priv
->regs
[RAPL_DOMAIN_PLATFORM
][RAPL_DOMAIN_REG_LIMIT
];
1112 rd
->regs
[RAPL_DOMAIN_REG_STATUS
] =
1113 priv
->regs
[RAPL_DOMAIN_PLATFORM
][RAPL_DOMAIN_REG_STATUS
];
1114 rd
->rpl
[0].prim_id
= PL1_ENABLE
;
1115 rd
->rpl
[0].name
= pl1_name
;
1116 rd
->rpl
[1].prim_id
= PL2_ENABLE
;
1117 rd
->rpl
[1].name
= pl2_name
;
1118 rd
->rp
= rapl_find_package_domain(0, priv
);
1120 power_zone
= powercap_register_zone(&rd
->power_zone
, priv
->control_type
,
1122 &zone_ops
[RAPL_DOMAIN_PLATFORM
],
1123 2, &constraint_ops
);
1125 if (IS_ERR(power_zone
)) {
1127 return PTR_ERR(power_zone
);
1130 priv
->platform_rapl_domain
= rd
;
1134 EXPORT_SYMBOL_GPL(rapl_add_platform_domain
);
1136 void rapl_remove_platform_domain(struct rapl_if_priv
*priv
)
1138 if (priv
->platform_rapl_domain
) {
1139 powercap_unregister_zone(priv
->control_type
,
1140 &priv
->platform_rapl_domain
->power_zone
);
1141 kfree(priv
->platform_rapl_domain
);
1144 EXPORT_SYMBOL_GPL(rapl_remove_platform_domain
);
1146 static int rapl_check_domain(int cpu
, int domain
, struct rapl_package
*rp
)
1148 struct reg_action ra
;
1151 case RAPL_DOMAIN_PACKAGE
:
1152 case RAPL_DOMAIN_PP0
:
1153 case RAPL_DOMAIN_PP1
:
1154 case RAPL_DOMAIN_DRAM
:
1155 ra
.reg
= rp
->priv
->regs
[domain
][RAPL_DOMAIN_REG_STATUS
];
1157 case RAPL_DOMAIN_PLATFORM
:
1158 /* PSYS(PLATFORM) is not a CPU domain, so avoid printing an error */
1161 pr_err("invalid domain id %d\n", domain
);
1164 /* make sure domain counters are available and contain non-zero
1165 * values, otherwise skip it.
1169 if (rp
->priv
->read_raw(cpu
, &ra
) || !ra
.value
)
1176 * Check if power limits are available. Two cases when they are not available:
1177 * 1. Locked by BIOS, in this case we still provide read-only access so that
1178 * users can see what limit is set by the BIOS.
1179 * 2. Some CPUs make some domains monitoring only which means PLx MSRs may not
1180 * exist at all. In this case, we do not show the constraints in powercap.
1182 * Called after domains are detected and initialized.
1184 static void rapl_detect_powerlimit(struct rapl_domain
*rd
)
1189 /* check if the domain is locked by BIOS, ignore if MSR doesn't exist */
1190 if (!rapl_read_data_raw(rd
, FW_LOCK
, false, &val64
)) {
1192 pr_info("RAPL %s domain %s locked by BIOS\n",
1193 rd
->rp
->name
, rd
->name
);
1194 rd
->state
|= DOMAIN_STATE_BIOS_LOCKED
;
1197 /* check if power limit MSR exists, otherwise domain is monitoring only */
1198 for (i
= 0; i
< NR_POWER_LIMITS
; i
++) {
1199 int prim
= rd
->rpl
[i
].prim_id
;
1201 if (rapl_read_data_raw(rd
, prim
, false, &val64
))
1202 rd
->rpl
[i
].name
= NULL
;
1206 /* Detect active and valid domains for the given CPU, caller must
1207 * ensure the CPU belongs to the targeted package and CPU hotplug is disabled.
1209 static int rapl_detect_domains(struct rapl_package
*rp
, int cpu
)
1211 struct rapl_domain
*rd
;
1214 for (i
= 0; i
< RAPL_DOMAIN_MAX
; i
++) {
1215 /* use physical package id to read counters */
1216 if (!rapl_check_domain(cpu
, i
, rp
)) {
1217 rp
->domain_map
|= 1 << i
;
1218 pr_info("Found RAPL domain %s\n", rapl_domain_names
[i
]);
1221 rp
->nr_domains
= bitmap_weight(&rp
->domain_map
, RAPL_DOMAIN_MAX
);
1222 if (!rp
->nr_domains
) {
1223 pr_debug("no valid rapl domains found in %s\n", rp
->name
);
1226 pr_debug("found %d domains on %s\n", rp
->nr_domains
, rp
->name
);
1228 rp
->domains
= kcalloc(rp
->nr_domains
+ 1, sizeof(struct rapl_domain
),
1233 rapl_init_domains(rp
);
1235 for (rd
= rp
->domains
; rd
< rp
->domains
+ rp
->nr_domains
; rd
++)
1236 rapl_detect_powerlimit(rd
);
1241 /* called from CPU hotplug notifier, hotplug lock held */
1242 void rapl_remove_package(struct rapl_package
*rp
)
1244 struct rapl_domain
*rd
, *rd_package
= NULL
;
1246 package_power_limit_irq_restore(rp
);
1248 for (rd
= rp
->domains
; rd
< rp
->domains
+ rp
->nr_domains
; rd
++) {
1249 rapl_write_data_raw(rd
, PL1_ENABLE
, 0);
1250 rapl_write_data_raw(rd
, PL1_CLAMP
, 0);
1251 if (find_nr_power_limit(rd
) > 1) {
1252 rapl_write_data_raw(rd
, PL2_ENABLE
, 0);
1253 rapl_write_data_raw(rd
, PL2_CLAMP
, 0);
1255 if (rd
->id
== RAPL_DOMAIN_PACKAGE
) {
1259 pr_debug("remove package, undo power limit on %s: %s\n",
1260 rp
->name
, rd
->name
);
1261 powercap_unregister_zone(rp
->priv
->control_type
,
1264 /* do parent zone last */
1265 powercap_unregister_zone(rp
->priv
->control_type
,
1266 &rd_package
->power_zone
);
1267 list_del(&rp
->plist
);
1270 EXPORT_SYMBOL_GPL(rapl_remove_package
);
1272 /* caller to ensure CPU hotplug lock is held */
1273 struct rapl_package
*rapl_find_package_domain(int cpu
, struct rapl_if_priv
*priv
)
1275 int id
= topology_logical_die_id(cpu
);
1276 struct rapl_package
*rp
;
1278 list_for_each_entry(rp
, &rapl_packages
, plist
) {
1280 && rp
->priv
->control_type
== priv
->control_type
)
1286 EXPORT_SYMBOL_GPL(rapl_find_package_domain
);
1288 /* called from CPU hotplug notifier, hotplug lock held */
1289 struct rapl_package
*rapl_add_package(int cpu
, struct rapl_if_priv
*priv
)
1291 int id
= topology_logical_die_id(cpu
);
1292 struct rapl_package
*rp
;
1293 struct cpuinfo_x86
*c
= &cpu_data(cpu
);
1297 return ERR_PTR(-ENODEV
);
1299 rp
= kzalloc(sizeof(struct rapl_package
), GFP_KERNEL
);
1301 return ERR_PTR(-ENOMEM
);
1303 /* add the new package to the list */
1308 if (topology_max_die_per_package() > 1)
1309 snprintf(rp
->name
, PACKAGE_DOMAIN_NAME_LENGTH
,
1310 "package-%d-die-%d", c
->phys_proc_id
, c
->cpu_die_id
);
1312 snprintf(rp
->name
, PACKAGE_DOMAIN_NAME_LENGTH
, "package-%d",
1315 /* check if the package contains valid domains */
1316 if (rapl_detect_domains(rp
, cpu
) || rapl_defaults
->check_unit(rp
, cpu
)) {
1318 goto err_free_package
;
1320 ret
= rapl_package_register_powercap(rp
);
1322 INIT_LIST_HEAD(&rp
->plist
);
1323 list_add(&rp
->plist
, &rapl_packages
);
1330 return ERR_PTR(ret
);
1332 EXPORT_SYMBOL_GPL(rapl_add_package
);
1334 static void power_limit_state_save(void)
1336 struct rapl_package
*rp
;
1337 struct rapl_domain
*rd
;
1341 list_for_each_entry(rp
, &rapl_packages
, plist
) {
1342 if (!rp
->power_zone
)
1344 rd
= power_zone_to_rapl_domain(rp
->power_zone
);
1345 nr_pl
= find_nr_power_limit(rd
);
1346 for (i
= 0; i
< nr_pl
; i
++) {
1347 switch (rd
->rpl
[i
].prim_id
) {
1349 ret
= rapl_read_data_raw(rd
,
1351 &rd
->rpl
[i
].last_power_limit
);
1353 rd
->rpl
[i
].last_power_limit
= 0;
1356 ret
= rapl_read_data_raw(rd
,
1358 &rd
->rpl
[i
].last_power_limit
);
1360 rd
->rpl
[i
].last_power_limit
= 0;
1368 static void power_limit_state_restore(void)
1370 struct rapl_package
*rp
;
1371 struct rapl_domain
*rd
;
1375 list_for_each_entry(rp
, &rapl_packages
, plist
) {
1376 if (!rp
->power_zone
)
1378 rd
= power_zone_to_rapl_domain(rp
->power_zone
);
1379 nr_pl
= find_nr_power_limit(rd
);
1380 for (i
= 0; i
< nr_pl
; i
++) {
1381 switch (rd
->rpl
[i
].prim_id
) {
1383 if (rd
->rpl
[i
].last_power_limit
)
1384 rapl_write_data_raw(rd
, POWER_LIMIT1
,
1385 rd
->rpl
[i
].last_power_limit
);
1388 if (rd
->rpl
[i
].last_power_limit
)
1389 rapl_write_data_raw(rd
, POWER_LIMIT2
,
1390 rd
->rpl
[i
].last_power_limit
);
1398 static int rapl_pm_callback(struct notifier_block
*nb
,
1399 unsigned long mode
, void *_unused
)
1402 case PM_SUSPEND_PREPARE
:
1403 power_limit_state_save();
1405 case PM_POST_SUSPEND
:
1406 power_limit_state_restore();
1412 static struct notifier_block rapl_pm_notifier
= {
1413 .notifier_call
= rapl_pm_callback
,
1416 static struct platform_device
*rapl_msr_platdev
;
1418 static int __init
rapl_init(void)
1420 const struct x86_cpu_id
*id
;
1423 id
= x86_match_cpu(rapl_ids
);
1425 pr_err("driver does not support CPU family %d model %d\n",
1426 boot_cpu_data
.x86
, boot_cpu_data
.x86_model
);
1431 rapl_defaults
= (struct rapl_defaults
*)id
->driver_data
;
1433 ret
= register_pm_notifier(&rapl_pm_notifier
);
1437 rapl_msr_platdev
= platform_device_alloc("intel_rapl_msr", 0);
1438 if (!rapl_msr_platdev
) {
1443 ret
= platform_device_add(rapl_msr_platdev
);
1445 platform_device_put(rapl_msr_platdev
);
1449 unregister_pm_notifier(&rapl_pm_notifier
);
1454 static void __exit
rapl_exit(void)
1456 platform_device_unregister(rapl_msr_platdev
);
1457 unregister_pm_notifier(&rapl_pm_notifier
);
1460 fs_initcall(rapl_init
);
1461 module_exit(rapl_exit
);
1463 MODULE_DESCRIPTION("Intel Runtime Average Power Limit (RAPL) common code");
1464 MODULE_AUTHOR("Jacob Pan <jacob.jun.pan@intel.com>");
1465 MODULE_LICENSE("GPL v2");