1 // SPDX-License-Identifier: GPL-2.0-only
3 * Common code for Intel Running Average Power Limit (RAPL) support.
4 * Copyright (c) 2019, Intel Corporation.
6 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
8 #include <linux/bitmap.h>
9 #include <linux/cleanup.h>
10 #include <linux/cpu.h>
11 #include <linux/delay.h>
12 #include <linux/device.h>
13 #include <linux/intel_rapl.h>
14 #include <linux/kernel.h>
15 #include <linux/list.h>
16 #include <linux/log2.h>
17 #include <linux/module.h>
18 #include <linux/nospec.h>
19 #include <linux/perf_event.h>
20 #include <linux/platform_device.h>
21 #include <linux/powercap.h>
22 #include <linux/processor.h>
23 #include <linux/slab.h>
24 #include <linux/suspend.h>
25 #include <linux/sysfs.h>
26 #include <linux/types.h>
28 #include <asm/cpu_device_id.h>
29 #include <asm/intel-family.h>
30 #include <asm/iosf_mbi.h>
32 /* bitmasks for RAPL MSRs, used by primitive access functions */
33 #define ENERGY_STATUS_MASK 0xffffffff
35 #define POWER_LIMIT1_MASK 0x7FFF
36 #define POWER_LIMIT1_ENABLE BIT(15)
37 #define POWER_LIMIT1_CLAMP BIT(16)
39 #define POWER_LIMIT2_MASK (0x7FFFULL<<32)
40 #define POWER_LIMIT2_ENABLE BIT_ULL(47)
41 #define POWER_LIMIT2_CLAMP BIT_ULL(48)
42 #define POWER_HIGH_LOCK BIT_ULL(63)
43 #define POWER_LOW_LOCK BIT(31)
45 #define POWER_LIMIT4_MASK 0x1FFF
47 #define TIME_WINDOW1_MASK (0x7FULL<<17)
48 #define TIME_WINDOW2_MASK (0x7FULL<<49)
50 #define POWER_UNIT_OFFSET 0
51 #define POWER_UNIT_MASK 0x0F
53 #define ENERGY_UNIT_OFFSET 0x08
54 #define ENERGY_UNIT_MASK 0x1F00
56 #define TIME_UNIT_OFFSET 0x10
57 #define TIME_UNIT_MASK 0xF0000
59 #define POWER_INFO_MAX_MASK (0x7fffULL<<32)
60 #define POWER_INFO_MIN_MASK (0x7fffULL<<16)
61 #define POWER_INFO_MAX_TIME_WIN_MASK (0x3fULL<<48)
62 #define POWER_INFO_THERMAL_SPEC_MASK 0x7fff
64 #define PERF_STATUS_THROTTLE_TIME_MASK 0xffffffff
65 #define PP_POLICY_MASK 0x1F
68 * SPR has different layout for Psys Domain PowerLimit registers.
69 * There are 17 bits of PL1 and PL2 instead of 15 bits.
70 * The Enable bits and TimeWindow bits are also shifted as a result.
72 #define PSYS_POWER_LIMIT1_MASK 0x1FFFF
73 #define PSYS_POWER_LIMIT1_ENABLE BIT(17)
75 #define PSYS_POWER_LIMIT2_MASK (0x1FFFFULL<<32)
76 #define PSYS_POWER_LIMIT2_ENABLE BIT_ULL(49)
78 #define PSYS_TIME_WINDOW1_MASK (0x7FULL<<19)
79 #define PSYS_TIME_WINDOW2_MASK (0x7FULL<<51)
81 /* bitmasks for RAPL TPMI, used by primitive access functions */
82 #define TPMI_POWER_LIMIT_MASK 0x3FFFF
83 #define TPMI_POWER_LIMIT_ENABLE BIT_ULL(62)
84 #define TPMI_TIME_WINDOW_MASK (0x7FULL<<18)
85 #define TPMI_INFO_SPEC_MASK 0x3FFFF
86 #define TPMI_INFO_MIN_MASK (0x3FFFFULL << 18)
87 #define TPMI_INFO_MAX_MASK (0x3FFFFULL << 36)
88 #define TPMI_INFO_MAX_TIME_WIN_MASK (0x7FULL << 54)
90 /* Non HW constants */
91 #define RAPL_PRIMITIVE_DERIVED BIT(1) /* not from raw data */
92 #define RAPL_PRIMITIVE_DUMMY BIT(2)
94 #define TIME_WINDOW_MAX_MSEC 40000
95 #define TIME_WINDOW_MIN_MSEC 250
96 #define ENERGY_UNIT_SCALE 1000 /* scale from driver unit to powercap unit */
98 ARBITRARY_UNIT
, /* no translation */
104 /* per domain data, some are optional */
105 #define NR_RAW_PRIMITIVES (NR_RAPL_PRIMITIVES - 2)
107 #define DOMAIN_STATE_INACTIVE BIT(0)
108 #define DOMAIN_STATE_POWER_LIMIT_SET BIT(1)
110 static const char *pl_names
[NR_POWER_LIMITS
] = {
111 [POWER_LIMIT1
] = "long_term",
112 [POWER_LIMIT2
] = "short_term",
113 [POWER_LIMIT4
] = "peak_power",
125 static bool is_pl_valid(struct rapl_domain
*rd
, int pl
)
127 if (pl
< POWER_LIMIT1
|| pl
> POWER_LIMIT4
)
129 return rd
->rpl
[pl
].name
? true : false;
132 static int get_pl_lock_prim(struct rapl_domain
*rd
, int pl
)
134 if (rd
->rp
->priv
->type
== RAPL_IF_TPMI
) {
135 if (pl
== POWER_LIMIT1
)
137 if (pl
== POWER_LIMIT2
)
139 if (pl
== POWER_LIMIT4
)
143 /* MSR/MMIO Interface doesn't have Lock bit for PL4 */
144 if (pl
== POWER_LIMIT4
)
148 * Power Limit register that supports two power limits has a different
149 * bit position for the Lock bit.
151 if (rd
->rp
->priv
->limits
[rd
->id
] & BIT(POWER_LIMIT2
))
156 static int get_pl_prim(struct rapl_domain
*rd
, int pl
, enum pl_prims prim
)
160 if (prim
== PL_ENABLE
)
162 if (prim
== PL_CLAMP
&& rd
->rp
->priv
->type
!= RAPL_IF_TPMI
)
164 if (prim
== PL_LIMIT
)
166 if (prim
== PL_TIME_WINDOW
)
168 if (prim
== PL_MAX_POWER
)
169 return THERMAL_SPEC_POWER
;
171 return get_pl_lock_prim(rd
, pl
);
174 if (prim
== PL_ENABLE
)
176 if (prim
== PL_CLAMP
&& rd
->rp
->priv
->type
!= RAPL_IF_TPMI
)
178 if (prim
== PL_LIMIT
)
180 if (prim
== PL_TIME_WINDOW
)
182 if (prim
== PL_MAX_POWER
)
185 return get_pl_lock_prim(rd
, pl
);
188 if (prim
== PL_LIMIT
)
190 if (prim
== PL_ENABLE
)
192 /* PL4 would be around two times PL2, use same prim as PL2. */
193 if (prim
== PL_MAX_POWER
)
196 return get_pl_lock_prim(rd
, pl
);
203 #define power_zone_to_rapl_domain(_zone) \
204 container_of(_zone, struct rapl_domain, power_zone)
206 struct rapl_defaults
{
207 u8 floor_freq_reg_addr
;
208 int (*check_unit
)(struct rapl_domain
*rd
);
209 void (*set_floor_freq
)(struct rapl_domain
*rd
, bool mode
);
210 u64 (*compute_time_window
)(struct rapl_domain
*rd
, u64 val
,
212 unsigned int dram_domain_energy_unit
;
213 unsigned int psys_domain_energy_unit
;
216 static struct rapl_defaults
*defaults_msr
;
217 static const struct rapl_defaults defaults_tpmi
;
219 static struct rapl_defaults
*get_defaults(struct rapl_package
*rp
)
221 return rp
->priv
->defaults
;
224 /* Sideband MBI registers */
225 #define IOSF_CPU_POWER_BUDGET_CTL_BYT (0x2)
226 #define IOSF_CPU_POWER_BUDGET_CTL_TNG (0xdf)
228 #define PACKAGE_PLN_INT_SAVED BIT(0)
229 #define MAX_PRIM_NAME (32)
231 /* per domain data. used to describe individual knobs such that access function
232 * can be consolidated into one instead of many inline functions.
234 struct rapl_primitive_info
{
238 enum rapl_domain_reg_id id
;
243 #define PRIMITIVE_INFO_INIT(p, m, s, i, u, f) { \
252 static void rapl_init_domains(struct rapl_package
*rp
);
253 static int rapl_read_data_raw(struct rapl_domain
*rd
,
254 enum rapl_primitives prim
,
255 bool xlate
, u64
*data
);
256 static int rapl_write_data_raw(struct rapl_domain
*rd
,
257 enum rapl_primitives prim
,
258 unsigned long long value
);
259 static int rapl_read_pl_data(struct rapl_domain
*rd
, int pl
,
260 enum pl_prims pl_prim
,
261 bool xlate
, u64
*data
);
262 static int rapl_write_pl_data(struct rapl_domain
*rd
, int pl
,
263 enum pl_prims pl_prim
,
264 unsigned long long value
);
265 static u64
rapl_unit_xlate(struct rapl_domain
*rd
,
266 enum unit_type type
, u64 value
, int to_raw
);
267 static void package_power_limit_irq_save(struct rapl_package
*rp
);
269 static LIST_HEAD(rapl_packages
); /* guarded by CPU hotplug lock */
271 static const char *const rapl_domain_names
[] = {
279 static int get_energy_counter(struct powercap_zone
*power_zone
,
282 struct rapl_domain
*rd
;
285 /* prevent CPU hotplug, make sure the RAPL domain does not go
286 * away while reading the counter.
289 rd
= power_zone_to_rapl_domain(power_zone
);
291 if (!rapl_read_data_raw(rd
, ENERGY_COUNTER
, true, &energy_now
)) {
292 *energy_raw
= energy_now
;
302 static int get_max_energy_counter(struct powercap_zone
*pcd_dev
, u64
*energy
)
304 struct rapl_domain
*rd
= power_zone_to_rapl_domain(pcd_dev
);
306 *energy
= rapl_unit_xlate(rd
, ENERGY_UNIT
, ENERGY_STATUS_MASK
, 0);
310 static int release_zone(struct powercap_zone
*power_zone
)
312 struct rapl_domain
*rd
= power_zone_to_rapl_domain(power_zone
);
313 struct rapl_package
*rp
= rd
->rp
;
315 /* package zone is the last zone of a package, we can free
316 * memory here since all children has been unregistered.
318 if (rd
->id
== RAPL_DOMAIN_PACKAGE
) {
327 static int find_nr_power_limit(struct rapl_domain
*rd
)
331 for (i
= 0; i
< NR_POWER_LIMITS
; i
++) {
332 if (is_pl_valid(rd
, i
))
339 static int set_domain_enable(struct powercap_zone
*power_zone
, bool mode
)
341 struct rapl_domain
*rd
= power_zone_to_rapl_domain(power_zone
);
342 struct rapl_defaults
*defaults
= get_defaults(rd
->rp
);
346 ret
= rapl_write_pl_data(rd
, POWER_LIMIT1
, PL_ENABLE
, mode
);
347 if (!ret
&& defaults
->set_floor_freq
)
348 defaults
->set_floor_freq(rd
, mode
);
354 static int get_domain_enable(struct powercap_zone
*power_zone
, bool *mode
)
356 struct rapl_domain
*rd
= power_zone_to_rapl_domain(power_zone
);
360 if (rd
->rpl
[POWER_LIMIT1
].locked
) {
365 ret
= rapl_read_pl_data(rd
, POWER_LIMIT1
, PL_ENABLE
, true, &val
);
373 /* per RAPL domain ops, in the order of rapl_domain_type */
374 static const struct powercap_zone_ops zone_ops
[] = {
375 /* RAPL_DOMAIN_PACKAGE */
377 .get_energy_uj
= get_energy_counter
,
378 .get_max_energy_range_uj
= get_max_energy_counter
,
379 .release
= release_zone
,
380 .set_enable
= set_domain_enable
,
381 .get_enable
= get_domain_enable
,
383 /* RAPL_DOMAIN_PP0 */
385 .get_energy_uj
= get_energy_counter
,
386 .get_max_energy_range_uj
= get_max_energy_counter
,
387 .release
= release_zone
,
388 .set_enable
= set_domain_enable
,
389 .get_enable
= get_domain_enable
,
391 /* RAPL_DOMAIN_PP1 */
393 .get_energy_uj
= get_energy_counter
,
394 .get_max_energy_range_uj
= get_max_energy_counter
,
395 .release
= release_zone
,
396 .set_enable
= set_domain_enable
,
397 .get_enable
= get_domain_enable
,
399 /* RAPL_DOMAIN_DRAM */
401 .get_energy_uj
= get_energy_counter
,
402 .get_max_energy_range_uj
= get_max_energy_counter
,
403 .release
= release_zone
,
404 .set_enable
= set_domain_enable
,
405 .get_enable
= get_domain_enable
,
407 /* RAPL_DOMAIN_PLATFORM */
409 .get_energy_uj
= get_energy_counter
,
410 .get_max_energy_range_uj
= get_max_energy_counter
,
411 .release
= release_zone
,
412 .set_enable
= set_domain_enable
,
413 .get_enable
= get_domain_enable
,
418 * Constraint index used by powercap can be different than power limit (PL)
419 * index in that some PLs maybe missing due to non-existent MSRs. So we
420 * need to convert here by finding the valid PLs only (name populated).
422 static int contraint_to_pl(struct rapl_domain
*rd
, int cid
)
426 for (i
= POWER_LIMIT1
, j
= 0; i
< NR_POWER_LIMITS
; i
++) {
427 if (is_pl_valid(rd
, i
) && j
++ == cid
) {
428 pr_debug("%s: index %d\n", __func__
, i
);
432 pr_err("Cannot find matching power limit for constraint %d\n", cid
);
437 static int set_power_limit(struct powercap_zone
*power_zone
, int cid
,
440 struct rapl_domain
*rd
;
441 struct rapl_package
*rp
;
446 rd
= power_zone_to_rapl_domain(power_zone
);
447 id
= contraint_to_pl(rd
, cid
);
450 ret
= rapl_write_pl_data(rd
, id
, PL_LIMIT
, power_limit
);
452 package_power_limit_irq_save(rp
);
457 static int get_current_power_limit(struct powercap_zone
*power_zone
, int cid
,
460 struct rapl_domain
*rd
;
466 rd
= power_zone_to_rapl_domain(power_zone
);
467 id
= contraint_to_pl(rd
, cid
);
469 ret
= rapl_read_pl_data(rd
, id
, PL_LIMIT
, true, &val
);
478 static int set_time_window(struct powercap_zone
*power_zone
, int cid
,
481 struct rapl_domain
*rd
;
486 rd
= power_zone_to_rapl_domain(power_zone
);
487 id
= contraint_to_pl(rd
, cid
);
489 ret
= rapl_write_pl_data(rd
, id
, PL_TIME_WINDOW
, window
);
495 static int get_time_window(struct powercap_zone
*power_zone
, int cid
,
498 struct rapl_domain
*rd
;
504 rd
= power_zone_to_rapl_domain(power_zone
);
505 id
= contraint_to_pl(rd
, cid
);
507 ret
= rapl_read_pl_data(rd
, id
, PL_TIME_WINDOW
, true, &val
);
516 static const char *get_constraint_name(struct powercap_zone
*power_zone
,
519 struct rapl_domain
*rd
;
522 rd
= power_zone_to_rapl_domain(power_zone
);
523 id
= contraint_to_pl(rd
, cid
);
525 return rd
->rpl
[id
].name
;
530 static int get_max_power(struct powercap_zone
*power_zone
, int cid
, u64
*data
)
532 struct rapl_domain
*rd
;
538 rd
= power_zone_to_rapl_domain(power_zone
);
539 id
= contraint_to_pl(rd
, cid
);
541 ret
= rapl_read_pl_data(rd
, id
, PL_MAX_POWER
, true, &val
);
545 /* As a generalization rule, PL4 would be around two times PL2. */
546 if (id
== POWER_LIMIT4
)
554 static const struct powercap_zone_constraint_ops constraint_ops
= {
555 .set_power_limit_uw
= set_power_limit
,
556 .get_power_limit_uw
= get_current_power_limit
,
557 .set_time_window_us
= set_time_window
,
558 .get_time_window_us
= get_time_window
,
559 .get_max_power_uw
= get_max_power
,
560 .get_name
= get_constraint_name
,
563 /* Return the id used for read_raw/write_raw callback */
564 static int get_rid(struct rapl_package
*rp
)
566 return rp
->lead_cpu
>= 0 ? rp
->lead_cpu
: rp
->id
;
569 /* called after domain detection and package level data are set */
570 static void rapl_init_domains(struct rapl_package
*rp
)
572 enum rapl_domain_type i
;
573 enum rapl_domain_reg_id j
;
574 struct rapl_domain
*rd
= rp
->domains
;
576 for (i
= 0; i
< RAPL_DOMAIN_MAX
; i
++) {
577 unsigned int mask
= rp
->domain_map
& (1 << i
);
585 if (i
== RAPL_DOMAIN_PLATFORM
&& rp
->id
> 0) {
586 snprintf(rd
->name
, RAPL_DOMAIN_NAME_LENGTH
, "psys-%d",
587 rp
->lead_cpu
>= 0 ? topology_physical_package_id(rp
->lead_cpu
) :
590 snprintf(rd
->name
, RAPL_DOMAIN_NAME_LENGTH
, "%s",
591 rapl_domain_names
[i
]);
596 /* PL1 is supported by default */
597 rp
->priv
->limits
[i
] |= BIT(POWER_LIMIT1
);
599 for (t
= POWER_LIMIT1
; t
< NR_POWER_LIMITS
; t
++) {
600 if (rp
->priv
->limits
[i
] & BIT(t
))
601 rd
->rpl
[t
].name
= pl_names
[t
];
604 for (j
= 0; j
< RAPL_DOMAIN_REG_MAX
; j
++)
605 rd
->regs
[j
] = rp
->priv
->regs
[i
][j
];
611 static u64
rapl_unit_xlate(struct rapl_domain
*rd
, enum unit_type type
,
612 u64 value
, int to_raw
)
615 struct rapl_defaults
*defaults
= get_defaults(rd
->rp
);
620 units
= rd
->power_unit
;
623 scale
= ENERGY_UNIT_SCALE
;
624 units
= rd
->energy_unit
;
627 return defaults
->compute_time_window(rd
, value
, to_raw
);
634 return div64_u64(value
, units
) * scale
;
638 return div64_u64(value
, scale
);
641 /* RAPL primitives for MSR and MMIO I/F */
642 static struct rapl_primitive_info rpi_msr
[NR_RAPL_PRIMITIVES
] = {
643 /* name, mask, shift, msr index, unit divisor */
644 [POWER_LIMIT1
] = PRIMITIVE_INFO_INIT(POWER_LIMIT1
, POWER_LIMIT1_MASK
, 0,
645 RAPL_DOMAIN_REG_LIMIT
, POWER_UNIT
, 0),
646 [POWER_LIMIT2
] = PRIMITIVE_INFO_INIT(POWER_LIMIT2
, POWER_LIMIT2_MASK
, 32,
647 RAPL_DOMAIN_REG_LIMIT
, POWER_UNIT
, 0),
648 [POWER_LIMIT4
] = PRIMITIVE_INFO_INIT(POWER_LIMIT4
, POWER_LIMIT4_MASK
, 0,
649 RAPL_DOMAIN_REG_PL4
, POWER_UNIT
, 0),
650 [ENERGY_COUNTER
] = PRIMITIVE_INFO_INIT(ENERGY_COUNTER
, ENERGY_STATUS_MASK
, 0,
651 RAPL_DOMAIN_REG_STATUS
, ENERGY_UNIT
, 0),
652 [FW_LOCK
] = PRIMITIVE_INFO_INIT(FW_LOCK
, POWER_LOW_LOCK
, 31,
653 RAPL_DOMAIN_REG_LIMIT
, ARBITRARY_UNIT
, 0),
654 [FW_HIGH_LOCK
] = PRIMITIVE_INFO_INIT(FW_LOCK
, POWER_HIGH_LOCK
, 63,
655 RAPL_DOMAIN_REG_LIMIT
, ARBITRARY_UNIT
, 0),
656 [PL1_ENABLE
] = PRIMITIVE_INFO_INIT(PL1_ENABLE
, POWER_LIMIT1_ENABLE
, 15,
657 RAPL_DOMAIN_REG_LIMIT
, ARBITRARY_UNIT
, 0),
658 [PL1_CLAMP
] = PRIMITIVE_INFO_INIT(PL1_CLAMP
, POWER_LIMIT1_CLAMP
, 16,
659 RAPL_DOMAIN_REG_LIMIT
, ARBITRARY_UNIT
, 0),
660 [PL2_ENABLE
] = PRIMITIVE_INFO_INIT(PL2_ENABLE
, POWER_LIMIT2_ENABLE
, 47,
661 RAPL_DOMAIN_REG_LIMIT
, ARBITRARY_UNIT
, 0),
662 [PL2_CLAMP
] = PRIMITIVE_INFO_INIT(PL2_CLAMP
, POWER_LIMIT2_CLAMP
, 48,
663 RAPL_DOMAIN_REG_LIMIT
, ARBITRARY_UNIT
, 0),
664 [TIME_WINDOW1
] = PRIMITIVE_INFO_INIT(TIME_WINDOW1
, TIME_WINDOW1_MASK
, 17,
665 RAPL_DOMAIN_REG_LIMIT
, TIME_UNIT
, 0),
666 [TIME_WINDOW2
] = PRIMITIVE_INFO_INIT(TIME_WINDOW2
, TIME_WINDOW2_MASK
, 49,
667 RAPL_DOMAIN_REG_LIMIT
, TIME_UNIT
, 0),
668 [THERMAL_SPEC_POWER
] = PRIMITIVE_INFO_INIT(THERMAL_SPEC_POWER
, POWER_INFO_THERMAL_SPEC_MASK
,
669 0, RAPL_DOMAIN_REG_INFO
, POWER_UNIT
, 0),
670 [MAX_POWER
] = PRIMITIVE_INFO_INIT(MAX_POWER
, POWER_INFO_MAX_MASK
, 32,
671 RAPL_DOMAIN_REG_INFO
, POWER_UNIT
, 0),
672 [MIN_POWER
] = PRIMITIVE_INFO_INIT(MIN_POWER
, POWER_INFO_MIN_MASK
, 16,
673 RAPL_DOMAIN_REG_INFO
, POWER_UNIT
, 0),
674 [MAX_TIME_WINDOW
] = PRIMITIVE_INFO_INIT(MAX_TIME_WINDOW
, POWER_INFO_MAX_TIME_WIN_MASK
, 48,
675 RAPL_DOMAIN_REG_INFO
, TIME_UNIT
, 0),
676 [THROTTLED_TIME
] = PRIMITIVE_INFO_INIT(THROTTLED_TIME
, PERF_STATUS_THROTTLE_TIME_MASK
, 0,
677 RAPL_DOMAIN_REG_PERF
, TIME_UNIT
, 0),
678 [PRIORITY_LEVEL
] = PRIMITIVE_INFO_INIT(PRIORITY_LEVEL
, PP_POLICY_MASK
, 0,
679 RAPL_DOMAIN_REG_POLICY
, ARBITRARY_UNIT
, 0),
680 [PSYS_POWER_LIMIT1
] = PRIMITIVE_INFO_INIT(PSYS_POWER_LIMIT1
, PSYS_POWER_LIMIT1_MASK
, 0,
681 RAPL_DOMAIN_REG_LIMIT
, POWER_UNIT
, 0),
682 [PSYS_POWER_LIMIT2
] = PRIMITIVE_INFO_INIT(PSYS_POWER_LIMIT2
, PSYS_POWER_LIMIT2_MASK
, 32,
683 RAPL_DOMAIN_REG_LIMIT
, POWER_UNIT
, 0),
684 [PSYS_PL1_ENABLE
] = PRIMITIVE_INFO_INIT(PSYS_PL1_ENABLE
, PSYS_POWER_LIMIT1_ENABLE
, 17,
685 RAPL_DOMAIN_REG_LIMIT
, ARBITRARY_UNIT
, 0),
686 [PSYS_PL2_ENABLE
] = PRIMITIVE_INFO_INIT(PSYS_PL2_ENABLE
, PSYS_POWER_LIMIT2_ENABLE
, 49,
687 RAPL_DOMAIN_REG_LIMIT
, ARBITRARY_UNIT
, 0),
688 [PSYS_TIME_WINDOW1
] = PRIMITIVE_INFO_INIT(PSYS_TIME_WINDOW1
, PSYS_TIME_WINDOW1_MASK
, 19,
689 RAPL_DOMAIN_REG_LIMIT
, TIME_UNIT
, 0),
690 [PSYS_TIME_WINDOW2
] = PRIMITIVE_INFO_INIT(PSYS_TIME_WINDOW2
, PSYS_TIME_WINDOW2_MASK
, 51,
691 RAPL_DOMAIN_REG_LIMIT
, TIME_UNIT
, 0),
693 [AVERAGE_POWER
] = PRIMITIVE_INFO_INIT(AVERAGE_POWER
, 0, 0, 0, POWER_UNIT
,
694 RAPL_PRIMITIVE_DERIVED
),
697 /* RAPL primitives for TPMI I/F */
698 static struct rapl_primitive_info rpi_tpmi
[NR_RAPL_PRIMITIVES
] = {
699 /* name, mask, shift, msr index, unit divisor */
700 [POWER_LIMIT1
] = PRIMITIVE_INFO_INIT(POWER_LIMIT1
, TPMI_POWER_LIMIT_MASK
, 0,
701 RAPL_DOMAIN_REG_LIMIT
, POWER_UNIT
, 0),
702 [POWER_LIMIT2
] = PRIMITIVE_INFO_INIT(POWER_LIMIT2
, TPMI_POWER_LIMIT_MASK
, 0,
703 RAPL_DOMAIN_REG_PL2
, POWER_UNIT
, 0),
704 [POWER_LIMIT4
] = PRIMITIVE_INFO_INIT(POWER_LIMIT4
, TPMI_POWER_LIMIT_MASK
, 0,
705 RAPL_DOMAIN_REG_PL4
, POWER_UNIT
, 0),
706 [ENERGY_COUNTER
] = PRIMITIVE_INFO_INIT(ENERGY_COUNTER
, ENERGY_STATUS_MASK
, 0,
707 RAPL_DOMAIN_REG_STATUS
, ENERGY_UNIT
, 0),
708 [PL1_LOCK
] = PRIMITIVE_INFO_INIT(PL1_LOCK
, POWER_HIGH_LOCK
, 63,
709 RAPL_DOMAIN_REG_LIMIT
, ARBITRARY_UNIT
, 0),
710 [PL2_LOCK
] = PRIMITIVE_INFO_INIT(PL2_LOCK
, POWER_HIGH_LOCK
, 63,
711 RAPL_DOMAIN_REG_PL2
, ARBITRARY_UNIT
, 0),
712 [PL4_LOCK
] = PRIMITIVE_INFO_INIT(PL4_LOCK
, POWER_HIGH_LOCK
, 63,
713 RAPL_DOMAIN_REG_PL4
, ARBITRARY_UNIT
, 0),
714 [PL1_ENABLE
] = PRIMITIVE_INFO_INIT(PL1_ENABLE
, TPMI_POWER_LIMIT_ENABLE
, 62,
715 RAPL_DOMAIN_REG_LIMIT
, ARBITRARY_UNIT
, 0),
716 [PL2_ENABLE
] = PRIMITIVE_INFO_INIT(PL2_ENABLE
, TPMI_POWER_LIMIT_ENABLE
, 62,
717 RAPL_DOMAIN_REG_PL2
, ARBITRARY_UNIT
, 0),
718 [PL4_ENABLE
] = PRIMITIVE_INFO_INIT(PL4_ENABLE
, TPMI_POWER_LIMIT_ENABLE
, 62,
719 RAPL_DOMAIN_REG_PL4
, ARBITRARY_UNIT
, 0),
720 [TIME_WINDOW1
] = PRIMITIVE_INFO_INIT(TIME_WINDOW1
, TPMI_TIME_WINDOW_MASK
, 18,
721 RAPL_DOMAIN_REG_LIMIT
, TIME_UNIT
, 0),
722 [TIME_WINDOW2
] = PRIMITIVE_INFO_INIT(TIME_WINDOW2
, TPMI_TIME_WINDOW_MASK
, 18,
723 RAPL_DOMAIN_REG_PL2
, TIME_UNIT
, 0),
724 [THERMAL_SPEC_POWER
] = PRIMITIVE_INFO_INIT(THERMAL_SPEC_POWER
, TPMI_INFO_SPEC_MASK
, 0,
725 RAPL_DOMAIN_REG_INFO
, POWER_UNIT
, 0),
726 [MAX_POWER
] = PRIMITIVE_INFO_INIT(MAX_POWER
, TPMI_INFO_MAX_MASK
, 36,
727 RAPL_DOMAIN_REG_INFO
, POWER_UNIT
, 0),
728 [MIN_POWER
] = PRIMITIVE_INFO_INIT(MIN_POWER
, TPMI_INFO_MIN_MASK
, 18,
729 RAPL_DOMAIN_REG_INFO
, POWER_UNIT
, 0),
730 [MAX_TIME_WINDOW
] = PRIMITIVE_INFO_INIT(MAX_TIME_WINDOW
, TPMI_INFO_MAX_TIME_WIN_MASK
, 54,
731 RAPL_DOMAIN_REG_INFO
, TIME_UNIT
, 0),
732 [THROTTLED_TIME
] = PRIMITIVE_INFO_INIT(THROTTLED_TIME
, PERF_STATUS_THROTTLE_TIME_MASK
, 0,
733 RAPL_DOMAIN_REG_PERF
, TIME_UNIT
, 0),
735 [AVERAGE_POWER
] = PRIMITIVE_INFO_INIT(AVERAGE_POWER
, 0, 0, 0,
736 POWER_UNIT
, RAPL_PRIMITIVE_DERIVED
),
739 static struct rapl_primitive_info
*get_rpi(struct rapl_package
*rp
, int prim
)
741 struct rapl_primitive_info
*rpi
= rp
->priv
->rpi
;
743 if (prim
< 0 || prim
>= NR_RAPL_PRIMITIVES
|| !rpi
)
749 static int rapl_config(struct rapl_package
*rp
)
751 switch (rp
->priv
->type
) {
752 /* MMIO I/F shares the same register layout as MSR registers */
755 rp
->priv
->defaults
= (void *)defaults_msr
;
756 rp
->priv
->rpi
= (void *)rpi_msr
;
759 rp
->priv
->defaults
= (void *)&defaults_tpmi
;
760 rp
->priv
->rpi
= (void *)rpi_tpmi
;
766 /* defaults_msr can be NULL on unsupported platforms */
767 if (!rp
->priv
->defaults
|| !rp
->priv
->rpi
)
773 static enum rapl_primitives
774 prim_fixups(struct rapl_domain
*rd
, enum rapl_primitives prim
)
776 struct rapl_defaults
*defaults
= get_defaults(rd
->rp
);
778 if (!defaults
->spr_psys_bits
)
781 if (rd
->id
!= RAPL_DOMAIN_PLATFORM
)
786 return PSYS_POWER_LIMIT1
;
788 return PSYS_POWER_LIMIT2
;
790 return PSYS_PL1_ENABLE
;
792 return PSYS_PL2_ENABLE
;
794 return PSYS_TIME_WINDOW1
;
796 return PSYS_TIME_WINDOW2
;
802 /* Read primitive data based on its related struct rapl_primitive_info.
803 * if xlate flag is set, return translated data based on data units, i.e.
804 * time, energy, and power.
805 * RAPL MSRs are non-architectual and are laid out not consistently across
806 * domains. Here we use primitive info to allow writing consolidated access
808 * For a given primitive, it is processed by MSR mask and shift. Unit conversion
809 * is pre-assigned based on RAPL unit MSRs read at init time.
810 * 63-------------------------- 31--------------------------- 0
812 * | |<- shift ----------------|
813 * 63-------------------------- 31--------------------------- 0
815 static int rapl_read_data_raw(struct rapl_domain
*rd
,
816 enum rapl_primitives prim
, bool xlate
, u64
*data
)
819 enum rapl_primitives prim_fixed
= prim_fixups(rd
, prim
);
820 struct rapl_primitive_info
*rpi
= get_rpi(rd
->rp
, prim_fixed
);
821 struct reg_action ra
;
823 if (!rpi
|| !rpi
->name
|| rpi
->flag
& RAPL_PRIMITIVE_DUMMY
)
826 ra
.reg
= rd
->regs
[rpi
->id
];
830 /* non-hardware data are collected by the polling thread */
831 if (rpi
->flag
& RAPL_PRIMITIVE_DERIVED
) {
832 *data
= rd
->rdd
.primitives
[prim
];
838 if (rd
->rp
->priv
->read_raw(get_rid(rd
->rp
), &ra
)) {
839 pr_debug("failed to read reg 0x%llx for %s:%s\n", ra
.reg
.val
, rd
->rp
->name
, rd
->name
);
843 value
= ra
.value
>> rpi
->shift
;
846 *data
= rapl_unit_xlate(rd
, rpi
->unit
, value
, 0);
853 /* Similar use of primitive info in the read counterpart */
854 static int rapl_write_data_raw(struct rapl_domain
*rd
,
855 enum rapl_primitives prim
,
856 unsigned long long value
)
858 enum rapl_primitives prim_fixed
= prim_fixups(rd
, prim
);
859 struct rapl_primitive_info
*rpi
= get_rpi(rd
->rp
, prim_fixed
);
861 struct reg_action ra
;
864 if (!rpi
|| !rpi
->name
|| rpi
->flag
& RAPL_PRIMITIVE_DUMMY
)
867 bits
= rapl_unit_xlate(rd
, rpi
->unit
, value
, 1);
871 memset(&ra
, 0, sizeof(ra
));
873 ra
.reg
= rd
->regs
[rpi
->id
];
877 ret
= rd
->rp
->priv
->write_raw(get_rid(rd
->rp
), &ra
);
882 static int rapl_read_pl_data(struct rapl_domain
*rd
, int pl
,
883 enum pl_prims pl_prim
, bool xlate
, u64
*data
)
885 enum rapl_primitives prim
= get_pl_prim(rd
, pl
, pl_prim
);
887 if (!is_pl_valid(rd
, pl
))
890 return rapl_read_data_raw(rd
, prim
, xlate
, data
);
893 static int rapl_write_pl_data(struct rapl_domain
*rd
, int pl
,
894 enum pl_prims pl_prim
,
895 unsigned long long value
)
897 enum rapl_primitives prim
= get_pl_prim(rd
, pl
, pl_prim
);
899 if (!is_pl_valid(rd
, pl
))
902 if (rd
->rpl
[pl
].locked
) {
903 pr_debug("%s:%s:%s locked by BIOS\n", rd
->rp
->name
, rd
->name
, pl_names
[pl
]);
907 return rapl_write_data_raw(rd
, prim
, value
);
910 * Raw RAPL data stored in MSRs are in certain scales. We need to
911 * convert them into standard units based on the units reported in
912 * the RAPL unit MSRs. This is specific to CPUs as the method to
913 * calculate units differ on different CPUs.
914 * We convert the units to below format based on CPUs.
916 * energy unit: picoJoules : Represented in picoJoules by default
917 * power unit : microWatts : Represented in milliWatts by default
918 * time unit : microseconds: Represented in seconds by default
920 static int rapl_check_unit_core(struct rapl_domain
*rd
)
922 struct reg_action ra
;
925 ra
.reg
= rd
->regs
[RAPL_DOMAIN_REG_UNIT
];
927 if (rd
->rp
->priv
->read_raw(get_rid(rd
->rp
), &ra
)) {
928 pr_err("Failed to read power unit REG 0x%llx on %s:%s, exit.\n",
929 ra
.reg
.val
, rd
->rp
->name
, rd
->name
);
933 value
= (ra
.value
& ENERGY_UNIT_MASK
) >> ENERGY_UNIT_OFFSET
;
934 rd
->energy_unit
= ENERGY_UNIT_SCALE
* 1000000 / (1 << value
);
936 value
= (ra
.value
& POWER_UNIT_MASK
) >> POWER_UNIT_OFFSET
;
937 rd
->power_unit
= 1000000 / (1 << value
);
939 value
= (ra
.value
& TIME_UNIT_MASK
) >> TIME_UNIT_OFFSET
;
940 rd
->time_unit
= 1000000 / (1 << value
);
942 pr_debug("Core CPU %s:%s energy=%dpJ, time=%dus, power=%duW\n",
943 rd
->rp
->name
, rd
->name
, rd
->energy_unit
, rd
->time_unit
, rd
->power_unit
);
948 static int rapl_check_unit_atom(struct rapl_domain
*rd
)
950 struct reg_action ra
;
953 ra
.reg
= rd
->regs
[RAPL_DOMAIN_REG_UNIT
];
955 if (rd
->rp
->priv
->read_raw(get_rid(rd
->rp
), &ra
)) {
956 pr_err("Failed to read power unit REG 0x%llx on %s:%s, exit.\n",
957 ra
.reg
.val
, rd
->rp
->name
, rd
->name
);
961 value
= (ra
.value
& ENERGY_UNIT_MASK
) >> ENERGY_UNIT_OFFSET
;
962 rd
->energy_unit
= ENERGY_UNIT_SCALE
* 1 << value
;
964 value
= (ra
.value
& POWER_UNIT_MASK
) >> POWER_UNIT_OFFSET
;
965 rd
->power_unit
= (1 << value
) * 1000;
967 value
= (ra
.value
& TIME_UNIT_MASK
) >> TIME_UNIT_OFFSET
;
968 rd
->time_unit
= 1000000 / (1 << value
);
970 pr_debug("Atom %s:%s energy=%dpJ, time=%dus, power=%duW\n",
971 rd
->rp
->name
, rd
->name
, rd
->energy_unit
, rd
->time_unit
, rd
->power_unit
);
976 static void power_limit_irq_save_cpu(void *info
)
979 struct rapl_package
*rp
= (struct rapl_package
*)info
;
981 /* save the state of PLN irq mask bit before disabling it */
982 rdmsr_safe(MSR_IA32_PACKAGE_THERM_INTERRUPT
, &l
, &h
);
983 if (!(rp
->power_limit_irq
& PACKAGE_PLN_INT_SAVED
)) {
984 rp
->power_limit_irq
= l
& PACKAGE_THERM_INT_PLN_ENABLE
;
985 rp
->power_limit_irq
|= PACKAGE_PLN_INT_SAVED
;
987 l
&= ~PACKAGE_THERM_INT_PLN_ENABLE
;
988 wrmsr_safe(MSR_IA32_PACKAGE_THERM_INTERRUPT
, l
, h
);
992 * When package power limit is set artificially low by RAPL, LVT
993 * thermal interrupt for package power limit should be ignored
994 * since we are not really exceeding the real limit. The intention
995 * is to avoid excessive interrupts while we are trying to save power.
996 * A useful feature might be routing the package_power_limit interrupt
997 * to userspace via eventfd. once we have a usecase, this is simple
998 * to do by adding an atomic notifier.
1001 static void package_power_limit_irq_save(struct rapl_package
*rp
)
1003 if (rp
->lead_cpu
< 0)
1006 if (!boot_cpu_has(X86_FEATURE_PTS
) || !boot_cpu_has(X86_FEATURE_PLN
))
1009 smp_call_function_single(rp
->lead_cpu
, power_limit_irq_save_cpu
, rp
, 1);
1013 * Restore per package power limit interrupt enable state. Called from cpu
1014 * hotplug code on package removal.
1016 static void package_power_limit_irq_restore(struct rapl_package
*rp
)
1020 if (rp
->lead_cpu
< 0)
1023 if (!boot_cpu_has(X86_FEATURE_PTS
) || !boot_cpu_has(X86_FEATURE_PLN
))
1026 /* irq enable state not saved, nothing to restore */
1027 if (!(rp
->power_limit_irq
& PACKAGE_PLN_INT_SAVED
))
1030 rdmsr_safe(MSR_IA32_PACKAGE_THERM_INTERRUPT
, &l
, &h
);
1032 if (rp
->power_limit_irq
& PACKAGE_THERM_INT_PLN_ENABLE
)
1033 l
|= PACKAGE_THERM_INT_PLN_ENABLE
;
1035 l
&= ~PACKAGE_THERM_INT_PLN_ENABLE
;
1037 wrmsr_safe(MSR_IA32_PACKAGE_THERM_INTERRUPT
, l
, h
);
1040 static void set_floor_freq_default(struct rapl_domain
*rd
, bool mode
)
1044 /* always enable clamp such that p-state can go below OS requested
1045 * range. power capping priority over guranteed frequency.
1047 rapl_write_pl_data(rd
, POWER_LIMIT1
, PL_CLAMP
, mode
);
1049 for (i
= POWER_LIMIT2
; i
< NR_POWER_LIMITS
; i
++) {
1050 rapl_write_pl_data(rd
, i
, PL_ENABLE
, mode
);
1051 rapl_write_pl_data(rd
, i
, PL_CLAMP
, mode
);
1055 static void set_floor_freq_atom(struct rapl_domain
*rd
, bool enable
)
1057 static u32 power_ctrl_orig_val
;
1058 struct rapl_defaults
*defaults
= get_defaults(rd
->rp
);
1061 if (!defaults
->floor_freq_reg_addr
) {
1062 pr_err("Invalid floor frequency config register\n");
1066 if (!power_ctrl_orig_val
)
1067 iosf_mbi_read(BT_MBI_UNIT_PMC
, MBI_CR_READ
,
1068 defaults
->floor_freq_reg_addr
,
1069 &power_ctrl_orig_val
);
1070 mdata
= power_ctrl_orig_val
;
1072 mdata
&= ~(0x7f << 8);
1075 iosf_mbi_write(BT_MBI_UNIT_PMC
, MBI_CR_WRITE
,
1076 defaults
->floor_freq_reg_addr
, mdata
);
1079 static u64
rapl_compute_time_window_core(struct rapl_domain
*rd
, u64 value
,
1082 u64 f
, y
; /* fraction and exp. used for time unit */
1085 * Special processing based on 2^Y*(1+F/4), refer
1086 * to Intel Software Developer's manual Vol.3B: CH 14.9.3.
1089 f
= (value
& 0x60) >> 5;
1091 value
= (1 << y
) * (4 + f
) * rd
->time_unit
/ 4;
1093 if (value
< rd
->time_unit
)
1096 do_div(value
, rd
->time_unit
);
1100 * The target hardware field is 7 bits wide, so return all ones
1101 * if the exponent is too large.
1106 f
= div64_u64(4 * (value
- (1ULL << y
)), 1ULL << y
);
1107 value
= (y
& 0x1f) | ((f
& 0x3) << 5);
1112 static u64
rapl_compute_time_window_atom(struct rapl_domain
*rd
, u64 value
,
1116 * Atom time unit encoding is straight forward val * time_unit,
1117 * where time_unit is default to 1 sec. Never 0.
1120 return (value
) ? value
* rd
->time_unit
: rd
->time_unit
;
1122 value
= div64_u64(value
, rd
->time_unit
);
1127 /* TPMI Unit register has different layout */
1128 #define TPMI_POWER_UNIT_OFFSET POWER_UNIT_OFFSET
1129 #define TPMI_POWER_UNIT_MASK POWER_UNIT_MASK
1130 #define TPMI_ENERGY_UNIT_OFFSET 0x06
1131 #define TPMI_ENERGY_UNIT_MASK 0x7C0
1132 #define TPMI_TIME_UNIT_OFFSET 0x0C
1133 #define TPMI_TIME_UNIT_MASK 0xF000
1135 static int rapl_check_unit_tpmi(struct rapl_domain
*rd
)
1137 struct reg_action ra
;
1140 ra
.reg
= rd
->regs
[RAPL_DOMAIN_REG_UNIT
];
1142 if (rd
->rp
->priv
->read_raw(get_rid(rd
->rp
), &ra
)) {
1143 pr_err("Failed to read power unit REG 0x%llx on %s:%s, exit.\n",
1144 ra
.reg
.val
, rd
->rp
->name
, rd
->name
);
1148 value
= (ra
.value
& TPMI_ENERGY_UNIT_MASK
) >> TPMI_ENERGY_UNIT_OFFSET
;
1149 rd
->energy_unit
= ENERGY_UNIT_SCALE
* 1000000 / (1 << value
);
1151 value
= (ra
.value
& TPMI_POWER_UNIT_MASK
) >> TPMI_POWER_UNIT_OFFSET
;
1152 rd
->power_unit
= 1000000 / (1 << value
);
1154 value
= (ra
.value
& TPMI_TIME_UNIT_MASK
) >> TPMI_TIME_UNIT_OFFSET
;
1155 rd
->time_unit
= 1000000 / (1 << value
);
1157 pr_debug("Core CPU %s:%s energy=%dpJ, time=%dus, power=%duW\n",
1158 rd
->rp
->name
, rd
->name
, rd
->energy_unit
, rd
->time_unit
, rd
->power_unit
);
1163 static const struct rapl_defaults defaults_tpmi
= {
1164 .check_unit
= rapl_check_unit_tpmi
,
1165 /* Reuse existing logic, ignore the PL_CLAMP failures and enable all Power Limits */
1166 .set_floor_freq
= set_floor_freq_default
,
1167 .compute_time_window
= rapl_compute_time_window_core
,
1170 static const struct rapl_defaults rapl_defaults_core
= {
1171 .floor_freq_reg_addr
= 0,
1172 .check_unit
= rapl_check_unit_core
,
1173 .set_floor_freq
= set_floor_freq_default
,
1174 .compute_time_window
= rapl_compute_time_window_core
,
1177 static const struct rapl_defaults rapl_defaults_hsw_server
= {
1178 .check_unit
= rapl_check_unit_core
,
1179 .set_floor_freq
= set_floor_freq_default
,
1180 .compute_time_window
= rapl_compute_time_window_core
,
1181 .dram_domain_energy_unit
= 15300,
1184 static const struct rapl_defaults rapl_defaults_spr_server
= {
1185 .check_unit
= rapl_check_unit_core
,
1186 .set_floor_freq
= set_floor_freq_default
,
1187 .compute_time_window
= rapl_compute_time_window_core
,
1188 .psys_domain_energy_unit
= 1000000000,
1189 .spr_psys_bits
= true,
1192 static const struct rapl_defaults rapl_defaults_byt
= {
1193 .floor_freq_reg_addr
= IOSF_CPU_POWER_BUDGET_CTL_BYT
,
1194 .check_unit
= rapl_check_unit_atom
,
1195 .set_floor_freq
= set_floor_freq_atom
,
1196 .compute_time_window
= rapl_compute_time_window_atom
,
1199 static const struct rapl_defaults rapl_defaults_tng
= {
1200 .floor_freq_reg_addr
= IOSF_CPU_POWER_BUDGET_CTL_TNG
,
1201 .check_unit
= rapl_check_unit_atom
,
1202 .set_floor_freq
= set_floor_freq_atom
,
1203 .compute_time_window
= rapl_compute_time_window_atom
,
1206 static const struct rapl_defaults rapl_defaults_ann
= {
1207 .floor_freq_reg_addr
= 0,
1208 .check_unit
= rapl_check_unit_atom
,
1209 .set_floor_freq
= NULL
,
1210 .compute_time_window
= rapl_compute_time_window_atom
,
1213 static const struct rapl_defaults rapl_defaults_cht
= {
1214 .floor_freq_reg_addr
= 0,
1215 .check_unit
= rapl_check_unit_atom
,
1216 .set_floor_freq
= NULL
,
1217 .compute_time_window
= rapl_compute_time_window_atom
,
1220 static const struct rapl_defaults rapl_defaults_amd
= {
1221 .check_unit
= rapl_check_unit_core
,
1224 static const struct x86_cpu_id rapl_ids
[] __initconst
= {
1225 X86_MATCH_VFM(INTEL_SANDYBRIDGE
, &rapl_defaults_core
),
1226 X86_MATCH_VFM(INTEL_SANDYBRIDGE_X
, &rapl_defaults_core
),
1228 X86_MATCH_VFM(INTEL_IVYBRIDGE
, &rapl_defaults_core
),
1229 X86_MATCH_VFM(INTEL_IVYBRIDGE_X
, &rapl_defaults_core
),
1231 X86_MATCH_VFM(INTEL_HASWELL
, &rapl_defaults_core
),
1232 X86_MATCH_VFM(INTEL_HASWELL_L
, &rapl_defaults_core
),
1233 X86_MATCH_VFM(INTEL_HASWELL_G
, &rapl_defaults_core
),
1234 X86_MATCH_VFM(INTEL_HASWELL_X
, &rapl_defaults_hsw_server
),
1236 X86_MATCH_VFM(INTEL_BROADWELL
, &rapl_defaults_core
),
1237 X86_MATCH_VFM(INTEL_BROADWELL_G
, &rapl_defaults_core
),
1238 X86_MATCH_VFM(INTEL_BROADWELL_D
, &rapl_defaults_core
),
1239 X86_MATCH_VFM(INTEL_BROADWELL_X
, &rapl_defaults_hsw_server
),
1241 X86_MATCH_VFM(INTEL_SKYLAKE
, &rapl_defaults_core
),
1242 X86_MATCH_VFM(INTEL_SKYLAKE_L
, &rapl_defaults_core
),
1243 X86_MATCH_VFM(INTEL_SKYLAKE_X
, &rapl_defaults_hsw_server
),
1244 X86_MATCH_VFM(INTEL_KABYLAKE_L
, &rapl_defaults_core
),
1245 X86_MATCH_VFM(INTEL_KABYLAKE
, &rapl_defaults_core
),
1246 X86_MATCH_VFM(INTEL_CANNONLAKE_L
, &rapl_defaults_core
),
1247 X86_MATCH_VFM(INTEL_ICELAKE_L
, &rapl_defaults_core
),
1248 X86_MATCH_VFM(INTEL_ICELAKE
, &rapl_defaults_core
),
1249 X86_MATCH_VFM(INTEL_ICELAKE_NNPI
, &rapl_defaults_core
),
1250 X86_MATCH_VFM(INTEL_ICELAKE_X
, &rapl_defaults_hsw_server
),
1251 X86_MATCH_VFM(INTEL_ICELAKE_D
, &rapl_defaults_hsw_server
),
1252 X86_MATCH_VFM(INTEL_COMETLAKE_L
, &rapl_defaults_core
),
1253 X86_MATCH_VFM(INTEL_COMETLAKE
, &rapl_defaults_core
),
1254 X86_MATCH_VFM(INTEL_TIGERLAKE_L
, &rapl_defaults_core
),
1255 X86_MATCH_VFM(INTEL_TIGERLAKE
, &rapl_defaults_core
),
1256 X86_MATCH_VFM(INTEL_ROCKETLAKE
, &rapl_defaults_core
),
1257 X86_MATCH_VFM(INTEL_ALDERLAKE
, &rapl_defaults_core
),
1258 X86_MATCH_VFM(INTEL_ALDERLAKE_L
, &rapl_defaults_core
),
1259 X86_MATCH_VFM(INTEL_ATOM_GRACEMONT
, &rapl_defaults_core
),
1260 X86_MATCH_VFM(INTEL_RAPTORLAKE
, &rapl_defaults_core
),
1261 X86_MATCH_VFM(INTEL_RAPTORLAKE_P
, &rapl_defaults_core
),
1262 X86_MATCH_VFM(INTEL_RAPTORLAKE_S
, &rapl_defaults_core
),
1263 X86_MATCH_VFM(INTEL_METEORLAKE
, &rapl_defaults_core
),
1264 X86_MATCH_VFM(INTEL_METEORLAKE_L
, &rapl_defaults_core
),
1265 X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X
, &rapl_defaults_spr_server
),
1266 X86_MATCH_VFM(INTEL_EMERALDRAPIDS_X
, &rapl_defaults_spr_server
),
1267 X86_MATCH_VFM(INTEL_LUNARLAKE_M
, &rapl_defaults_core
),
1268 X86_MATCH_VFM(INTEL_ARROWLAKE_H
, &rapl_defaults_core
),
1269 X86_MATCH_VFM(INTEL_ARROWLAKE
, &rapl_defaults_core
),
1270 X86_MATCH_VFM(INTEL_ARROWLAKE_U
, &rapl_defaults_core
),
1271 X86_MATCH_VFM(INTEL_LAKEFIELD
, &rapl_defaults_core
),
1273 X86_MATCH_VFM(INTEL_ATOM_SILVERMONT
, &rapl_defaults_byt
),
1274 X86_MATCH_VFM(INTEL_ATOM_AIRMONT
, &rapl_defaults_cht
),
1275 X86_MATCH_VFM(INTEL_ATOM_SILVERMONT_MID
, &rapl_defaults_tng
),
1276 X86_MATCH_VFM(INTEL_ATOM_AIRMONT_MID
, &rapl_defaults_ann
),
1277 X86_MATCH_VFM(INTEL_ATOM_GOLDMONT
, &rapl_defaults_core
),
1278 X86_MATCH_VFM(INTEL_ATOM_GOLDMONT_PLUS
, &rapl_defaults_core
),
1279 X86_MATCH_VFM(INTEL_ATOM_GOLDMONT_D
, &rapl_defaults_core
),
1280 X86_MATCH_VFM(INTEL_ATOM_TREMONT
, &rapl_defaults_core
),
1281 X86_MATCH_VFM(INTEL_ATOM_TREMONT_D
, &rapl_defaults_core
),
1282 X86_MATCH_VFM(INTEL_ATOM_TREMONT_L
, &rapl_defaults_core
),
1284 X86_MATCH_VFM(INTEL_XEON_PHI_KNL
, &rapl_defaults_hsw_server
),
1285 X86_MATCH_VFM(INTEL_XEON_PHI_KNM
, &rapl_defaults_hsw_server
),
1287 X86_MATCH_VENDOR_FAM(AMD
, 0x17, &rapl_defaults_amd
),
1288 X86_MATCH_VENDOR_FAM(AMD
, 0x19, &rapl_defaults_amd
),
1289 X86_MATCH_VENDOR_FAM(AMD
, 0x1A, &rapl_defaults_amd
),
1290 X86_MATCH_VENDOR_FAM(HYGON
, 0x18, &rapl_defaults_amd
),
1293 MODULE_DEVICE_TABLE(x86cpu
, rapl_ids
);
1295 /* Read once for all raw primitive data for domains */
1296 static void rapl_update_domain_data(struct rapl_package
*rp
)
1301 for (dmn
= 0; dmn
< rp
->nr_domains
; dmn
++) {
1302 pr_debug("update %s domain %s data\n", rp
->name
,
1303 rp
->domains
[dmn
].name
);
1304 /* exclude non-raw primitives */
1305 for (prim
= 0; prim
< NR_RAW_PRIMITIVES
; prim
++) {
1306 struct rapl_primitive_info
*rpi
= get_rpi(rp
, prim
);
1308 if (!rapl_read_data_raw(&rp
->domains
[dmn
], prim
,
1310 rp
->domains
[dmn
].rdd
.primitives
[prim
] = val
;
1316 static int rapl_package_register_powercap(struct rapl_package
*rp
)
1318 struct rapl_domain
*rd
;
1319 struct powercap_zone
*power_zone
= NULL
;
1322 /* Update the domain data of the new package */
1323 rapl_update_domain_data(rp
);
1325 /* first we register package domain as the parent zone */
1326 for (rd
= rp
->domains
; rd
< rp
->domains
+ rp
->nr_domains
; rd
++) {
1327 if (rd
->id
== RAPL_DOMAIN_PACKAGE
) {
1328 nr_pl
= find_nr_power_limit(rd
);
1329 pr_debug("register package domain %s\n", rp
->name
);
1330 power_zone
= powercap_register_zone(&rd
->power_zone
,
1331 rp
->priv
->control_type
, rp
->name
,
1332 NULL
, &zone_ops
[rd
->id
], nr_pl
,
1334 if (IS_ERR(power_zone
)) {
1335 pr_debug("failed to register power zone %s\n",
1337 return PTR_ERR(power_zone
);
1339 /* track parent zone in per package/socket data */
1340 rp
->power_zone
= power_zone
;
1341 /* done, only one package domain per socket */
1346 pr_err("no package domain found, unknown topology!\n");
1349 /* now register domains as children of the socket/package */
1350 for (rd
= rp
->domains
; rd
< rp
->domains
+ rp
->nr_domains
; rd
++) {
1351 struct powercap_zone
*parent
= rp
->power_zone
;
1353 if (rd
->id
== RAPL_DOMAIN_PACKAGE
)
1355 if (rd
->id
== RAPL_DOMAIN_PLATFORM
)
1357 /* number of power limits per domain varies */
1358 nr_pl
= find_nr_power_limit(rd
);
1359 power_zone
= powercap_register_zone(&rd
->power_zone
,
1360 rp
->priv
->control_type
,
1362 &zone_ops
[rd
->id
], nr_pl
,
1365 if (IS_ERR(power_zone
)) {
1366 pr_debug("failed to register power_zone, %s:%s\n",
1367 rp
->name
, rd
->name
);
1368 ret
= PTR_ERR(power_zone
);
1376 * Clean up previously initialized domains within the package if we
1377 * failed after the first domain setup.
1379 while (--rd
>= rp
->domains
) {
1380 pr_debug("unregister %s domain %s\n", rp
->name
, rd
->name
);
1381 powercap_unregister_zone(rp
->priv
->control_type
,
1388 static int rapl_check_domain(int domain
, struct rapl_package
*rp
)
1390 struct reg_action ra
;
1393 case RAPL_DOMAIN_PACKAGE
:
1394 case RAPL_DOMAIN_PP0
:
1395 case RAPL_DOMAIN_PP1
:
1396 case RAPL_DOMAIN_DRAM
:
1397 case RAPL_DOMAIN_PLATFORM
:
1398 ra
.reg
= rp
->priv
->regs
[domain
][RAPL_DOMAIN_REG_STATUS
];
1401 pr_err("invalid domain id %d\n", domain
);
1404 /* make sure domain counters are available and contains non-zero
1405 * values, otherwise skip it.
1408 ra
.mask
= ENERGY_STATUS_MASK
;
1409 if (rp
->priv
->read_raw(get_rid(rp
), &ra
) || !ra
.value
)
1416 * Get per domain energy/power/time unit.
1417 * RAPL Interfaces without per domain unit register will use the package
1418 * scope unit register to set per domain units.
1420 static int rapl_get_domain_unit(struct rapl_domain
*rd
)
1422 struct rapl_defaults
*defaults
= get_defaults(rd
->rp
);
1425 if (!rd
->regs
[RAPL_DOMAIN_REG_UNIT
].val
) {
1426 if (!rd
->rp
->priv
->reg_unit
.val
) {
1427 pr_err("No valid Unit register found\n");
1430 rd
->regs
[RAPL_DOMAIN_REG_UNIT
] = rd
->rp
->priv
->reg_unit
;
1433 if (!defaults
->check_unit
) {
1434 pr_err("missing .check_unit() callback\n");
1438 ret
= defaults
->check_unit(rd
);
1442 if (rd
->id
== RAPL_DOMAIN_DRAM
&& defaults
->dram_domain_energy_unit
)
1443 rd
->energy_unit
= defaults
->dram_domain_energy_unit
;
1444 if (rd
->id
== RAPL_DOMAIN_PLATFORM
&& defaults
->psys_domain_energy_unit
)
1445 rd
->energy_unit
= defaults
->psys_domain_energy_unit
;
1450 * Check if power limits are available. Two cases when they are not available:
1451 * 1. Locked by BIOS, in this case we still provide read-only access so that
1452 * users can see what limit is set by the BIOS.
1453 * 2. Some CPUs make some domains monitoring only which means PLx MSRs may not
1454 * exist at all. In this case, we do not show the constraints in powercap.
1456 * Called after domains are detected and initialized.
1458 static void rapl_detect_powerlimit(struct rapl_domain
*rd
)
1463 for (i
= POWER_LIMIT1
; i
< NR_POWER_LIMITS
; i
++) {
1464 if (!rapl_read_pl_data(rd
, i
, PL_LOCK
, false, &val64
)) {
1466 rd
->rpl
[i
].locked
= true;
1467 pr_info("%s:%s:%s locked by BIOS\n",
1468 rd
->rp
->name
, rd
->name
, pl_names
[i
]);
1472 if (rapl_read_pl_data(rd
, i
, PL_LIMIT
, false, &val64
))
1473 rd
->rpl
[i
].name
= NULL
;
1477 /* Detect active and valid domains for the given CPU, caller must
1478 * ensure the CPU belongs to the targeted package and CPU hotlug is disabled.
1480 static int rapl_detect_domains(struct rapl_package
*rp
)
1482 struct rapl_domain
*rd
;
1485 for (i
= 0; i
< RAPL_DOMAIN_MAX
; i
++) {
1486 /* use physical package id to read counters */
1487 if (!rapl_check_domain(i
, rp
)) {
1488 rp
->domain_map
|= 1 << i
;
1489 pr_info("Found RAPL domain %s\n", rapl_domain_names
[i
]);
1492 rp
->nr_domains
= bitmap_weight(&rp
->domain_map
, RAPL_DOMAIN_MAX
);
1493 if (!rp
->nr_domains
) {
1494 pr_debug("no valid rapl domains found in %s\n", rp
->name
);
1497 pr_debug("found %d domains on %s\n", rp
->nr_domains
, rp
->name
);
1499 rp
->domains
= kcalloc(rp
->nr_domains
, sizeof(struct rapl_domain
),
1504 rapl_init_domains(rp
);
1506 for (rd
= rp
->domains
; rd
< rp
->domains
+ rp
->nr_domains
; rd
++) {
1507 rapl_get_domain_unit(rd
);
1508 rapl_detect_powerlimit(rd
);
1514 #ifdef CONFIG_PERF_EVENTS
1517 * Support for RAPL PMU
1519 * Register a PMU if any of the registered RAPL Packages have the requirement
1520 * of exposing its energy counters via Perf PMU.
1526 * Name Event id RAPL Domain
1527 * energy_cores 0x01 RAPL_DOMAIN_PP0
1528 * energy_pkg 0x02 RAPL_DOMAIN_PACKAGE
1529 * energy_ram 0x03 RAPL_DOMAIN_DRAM
1530 * energy_gpu 0x04 RAPL_DOMAIN_PP1
1531 * energy_psys 0x05 RAPL_DOMAIN_PLATFORM
1537 * 2.3283064365386962890625e-10
1538 * The same RAPL domain in different RAPL Packages may have different
1539 * energy units. Use 2.3283064365386962890625e-10 (2^-32) Joules as
1540 * the fixed unit for all energy counters, and covert each hardware
1541 * counter increase to N times of PMU event counter increases.
1543 * This is fully compatible with the current MSR RAPL PMU. This means that
1544 * userspace programs like turbostat can use the same code to handle RAPL Perf
1545 * PMU, no matter what RAPL Interface driver (MSR/TPMI, etc) is running
1546 * underlying on the platform.
1548 * Note that RAPL Packages can be probed/removed dynamically, and the events
1549 * supported by each TPMI RAPL device can be different. Thus the RAPL PMU
1550 * support is done on demand, which means
1551 * 1. PMU is registered only if it is needed by a RAPL Package. PMU events for
1552 * unsupported counters are not exposed.
1553 * 2. PMU is unregistered and registered when a new RAPL Package is probed and
1554 * supports new counters that are not supported by current PMU.
1555 * 3. PMU is unregistered when all registered RAPL Packages don't need PMU.
1559 struct pmu pmu
; /* Perf PMU structure */
1560 u64 timer_ms
; /* Maximum expiration time to avoid counter overflow */
1561 unsigned long domain_map
; /* Events supported by current registered PMU */
1562 bool registered
; /* Whether the PMU has been registered or not */
1565 static struct rapl_pmu rapl_pmu
;
1569 static int get_pmu_cpu(struct rapl_package
*rp
)
1576 /* Only TPMI RAPL is supported for now */
1577 if (rp
->priv
->type
!= RAPL_IF_TPMI
)
1580 /* TPMI RAPL uses any CPU in the package for PMU */
1581 for_each_online_cpu(cpu
)
1582 if (topology_physical_package_id(cpu
) == rp
->id
)
1588 static bool is_rp_pmu_cpu(struct rapl_package
*rp
, int cpu
)
1593 /* Only TPMI RAPL is supported for now */
1594 if (rp
->priv
->type
!= RAPL_IF_TPMI
)
1597 /* TPMI RAPL uses any CPU in the package for PMU */
1598 return topology_physical_package_id(cpu
) == rp
->id
;
1601 static struct rapl_package_pmu_data
*event_to_pmu_data(struct perf_event
*event
)
1603 struct rapl_package
*rp
= event
->pmu_private
;
1605 return &rp
->pmu_data
;
1608 /* PMU event callbacks */
1610 static u64
event_read_counter(struct perf_event
*event
)
1612 struct rapl_package
*rp
= event
->pmu_private
;
1616 /* Return 0 for unsupported events */
1617 if (event
->hw
.idx
< 0)
1620 ret
= rapl_read_data_raw(&rp
->domains
[event
->hw
.idx
], ENERGY_COUNTER
, false, &val
);
1622 /* Return 0 for failed read */
1629 static void __rapl_pmu_event_start(struct perf_event
*event
)
1631 struct rapl_package_pmu_data
*data
= event_to_pmu_data(event
);
1633 if (WARN_ON_ONCE(!(event
->hw
.state
& PERF_HES_STOPPED
)))
1636 event
->hw
.state
= 0;
1638 list_add_tail(&event
->active_entry
, &data
->active_list
);
1640 local64_set(&event
->hw
.prev_count
, event_read_counter(event
));
1641 if (++data
->n_active
== 1)
1642 hrtimer_start(&data
->hrtimer
, data
->timer_interval
,
1643 HRTIMER_MODE_REL_PINNED
);
1646 static void rapl_pmu_event_start(struct perf_event
*event
, int mode
)
1648 struct rapl_package_pmu_data
*data
= event_to_pmu_data(event
);
1649 unsigned long flags
;
1651 raw_spin_lock_irqsave(&data
->lock
, flags
);
1652 __rapl_pmu_event_start(event
);
1653 raw_spin_unlock_irqrestore(&data
->lock
, flags
);
1656 static u64
rapl_event_update(struct perf_event
*event
)
1658 struct hw_perf_event
*hwc
= &event
->hw
;
1659 struct rapl_package_pmu_data
*data
= event_to_pmu_data(event
);
1660 u64 prev_raw_count
, new_raw_count
;
1664 * Follow the generic code to drain hwc->prev_count.
1665 * The loop is not expected to run for multiple times.
1667 prev_raw_count
= local64_read(&hwc
->prev_count
);
1669 new_raw_count
= event_read_counter(event
);
1670 } while (!local64_try_cmpxchg(&hwc
->prev_count
,
1671 &prev_raw_count
, new_raw_count
));
1675 * Now we have the new raw value and have updated the prev
1676 * timestamp already. We can now calculate the elapsed delta
1677 * (event-)time and add that to the generic event.
1679 delta
= new_raw_count
- prev_raw_count
;
1682 * Scale delta to smallest unit (2^-32)
1683 * users must then scale back: count * 1/(1e9*2^32) to get Joules
1684 * or use ldexp(count, -32).
1685 * Watts = Joules/Time delta
1687 sdelta
= delta
* data
->scale
[event
->hw
.flags
];
1689 local64_add(sdelta
, &event
->count
);
1691 return new_raw_count
;
1694 static void rapl_pmu_event_stop(struct perf_event
*event
, int mode
)
1696 struct rapl_package_pmu_data
*data
= event_to_pmu_data(event
);
1697 struct hw_perf_event
*hwc
= &event
->hw
;
1698 unsigned long flags
;
1700 raw_spin_lock_irqsave(&data
->lock
, flags
);
1702 /* Mark event as deactivated and stopped */
1703 if (!(hwc
->state
& PERF_HES_STOPPED
)) {
1704 WARN_ON_ONCE(data
->n_active
<= 0);
1705 if (--data
->n_active
== 0)
1706 hrtimer_cancel(&data
->hrtimer
);
1708 list_del(&event
->active_entry
);
1710 WARN_ON_ONCE(hwc
->state
& PERF_HES_STOPPED
);
1711 hwc
->state
|= PERF_HES_STOPPED
;
1714 /* Check if update of sw counter is necessary */
1715 if ((mode
& PERF_EF_UPDATE
) && !(hwc
->state
& PERF_HES_UPTODATE
)) {
1717 * Drain the remaining delta count out of a event
1718 * that we are disabling:
1720 rapl_event_update(event
);
1721 hwc
->state
|= PERF_HES_UPTODATE
;
1724 raw_spin_unlock_irqrestore(&data
->lock
, flags
);
1727 static int rapl_pmu_event_add(struct perf_event
*event
, int mode
)
1729 struct rapl_package_pmu_data
*data
= event_to_pmu_data(event
);
1730 struct hw_perf_event
*hwc
= &event
->hw
;
1731 unsigned long flags
;
1733 raw_spin_lock_irqsave(&data
->lock
, flags
);
1735 hwc
->state
= PERF_HES_UPTODATE
| PERF_HES_STOPPED
;
1737 if (mode
& PERF_EF_START
)
1738 __rapl_pmu_event_start(event
);
1740 raw_spin_unlock_irqrestore(&data
->lock
, flags
);
1745 static void rapl_pmu_event_del(struct perf_event
*event
, int flags
)
1747 rapl_pmu_event_stop(event
, PERF_EF_UPDATE
);
1750 /* RAPL PMU event ids, same as shown in sysfs */
1751 enum perf_rapl_events
{
1752 PERF_RAPL_PP0
= 1, /* all cores */
1753 PERF_RAPL_PKG
, /* entire package */
1754 PERF_RAPL_RAM
, /* DRAM */
1755 PERF_RAPL_PP1
, /* gpu */
1756 PERF_RAPL_PSYS
, /* psys */
1759 #define RAPL_EVENT_MASK GENMASK(7, 0)
1761 static const int event_to_domain
[PERF_RAPL_MAX
] = {
1762 [PERF_RAPL_PP0
] = RAPL_DOMAIN_PP0
,
1763 [PERF_RAPL_PKG
] = RAPL_DOMAIN_PACKAGE
,
1764 [PERF_RAPL_RAM
] = RAPL_DOMAIN_DRAM
,
1765 [PERF_RAPL_PP1
] = RAPL_DOMAIN_PP1
,
1766 [PERF_RAPL_PSYS
] = RAPL_DOMAIN_PLATFORM
,
1769 static int rapl_pmu_event_init(struct perf_event
*event
)
1771 struct rapl_package
*pos
, *rp
= NULL
;
1772 u64 cfg
= event
->attr
.config
& RAPL_EVENT_MASK
;
1775 /* Only look at RAPL events */
1776 if (event
->attr
.type
!= event
->pmu
->type
)
1779 /* Check for supported events only */
1780 if (!cfg
|| cfg
>= PERF_RAPL_MAX
)
1786 /* Find out which Package the event belongs to */
1787 list_for_each_entry(pos
, &rapl_packages
, plist
) {
1788 if (is_rp_pmu_cpu(pos
, event
->cpu
)) {
1796 /* Find out which RAPL Domain the event belongs to */
1797 domain
= event_to_domain
[cfg
];
1799 event
->event_caps
|= PERF_EV_CAP_READ_ACTIVE_PKG
;
1800 event
->pmu_private
= rp
; /* Which package */
1801 event
->hw
.flags
= domain
; /* Which domain */
1804 /* Find out the index in rp->domains[] to get domain pointer */
1805 for (idx
= 0; idx
< rp
->nr_domains
; idx
++) {
1806 if (rp
->domains
[idx
].id
== domain
) {
1807 event
->hw
.idx
= idx
;
/* perf .read callback */
static void rapl_pmu_event_read(struct perf_event *event)
{
	rapl_event_update(event);
}
1820 static enum hrtimer_restart
rapl_hrtimer_handle(struct hrtimer
*hrtimer
)
1822 struct rapl_package_pmu_data
*data
=
1823 container_of(hrtimer
, struct rapl_package_pmu_data
, hrtimer
);
1824 struct perf_event
*event
;
1825 unsigned long flags
;
1827 if (!data
->n_active
)
1828 return HRTIMER_NORESTART
;
1830 raw_spin_lock_irqsave(&data
->lock
, flags
);
1832 list_for_each_entry(event
, &data
->active_list
, active_entry
)
1833 rapl_event_update(event
);
1835 raw_spin_unlock_irqrestore(&data
->lock
, flags
);
1837 hrtimer_forward_now(hrtimer
, data
->timer_interval
);
1839 return HRTIMER_RESTART
;
1842 /* PMU sysfs attributes */
1845 * There are no default events, but we need to create "events" group (with
1846 * empty attrs) before updating it with detected events.
1848 static struct attribute
*attrs_empty
[] = {
1852 static struct attribute_group pmu_events_group
= {
1854 .attrs
= attrs_empty
,
1857 static ssize_t
cpumask_show(struct device
*dev
,
1858 struct device_attribute
*attr
, char *buf
)
1860 struct rapl_package
*rp
;
1861 cpumask_var_t cpu_mask
;
1865 if (!alloc_cpumask_var(&cpu_mask
, GFP_KERNEL
))
1870 cpumask_clear(cpu_mask
);
1872 /* Choose a cpu for each RAPL Package */
1873 list_for_each_entry(rp
, &rapl_packages
, plist
) {
1874 cpu
= get_pmu_cpu(rp
);
1875 if (cpu
< nr_cpu_ids
)
1876 cpumask_set_cpu(cpu
, cpu_mask
);
1880 ret
= cpumap_print_to_pagebuf(true, buf
, cpu_mask
);
1882 free_cpumask_var(cpu_mask
);
1887 static DEVICE_ATTR_RO(cpumask
);
1889 static struct attribute
*pmu_cpumask_attrs
[] = {
1890 &dev_attr_cpumask
.attr
,
1894 static struct attribute_group pmu_cpumask_group
= {
1895 .attrs
= pmu_cpumask_attrs
,
1898 PMU_FORMAT_ATTR(event
, "config:0-7");
1899 static struct attribute
*pmu_format_attr
[] = {
1900 &format_attr_event
.attr
,
1904 static struct attribute_group pmu_format_group
= {
1906 .attrs
= pmu_format_attr
,
1909 static const struct attribute_group
*pmu_attr_groups
[] = {
1916 #define RAPL_EVENT_ATTR_STR(_name, v, str) \
1917 static struct perf_pmu_events_attr event_attr_##v = { \
1918 .attr = __ATTR(_name, 0444, perf_event_sysfs_show, NULL), \
1922 RAPL_EVENT_ATTR_STR(energy
-cores
, rapl_cores
, "event=0x01");
1923 RAPL_EVENT_ATTR_STR(energy
-pkg
, rapl_pkg
, "event=0x02");
1924 RAPL_EVENT_ATTR_STR(energy
-ram
, rapl_ram
, "event=0x03");
1925 RAPL_EVENT_ATTR_STR(energy
-gpu
, rapl_gpu
, "event=0x04");
1926 RAPL_EVENT_ATTR_STR(energy
-psys
, rapl_psys
, "event=0x05");
1928 RAPL_EVENT_ATTR_STR(energy
-cores
.unit
, rapl_unit_cores
, "Joules");
1929 RAPL_EVENT_ATTR_STR(energy
-pkg
.unit
, rapl_unit_pkg
, "Joules");
1930 RAPL_EVENT_ATTR_STR(energy
-ram
.unit
, rapl_unit_ram
, "Joules");
1931 RAPL_EVENT_ATTR_STR(energy
-gpu
.unit
, rapl_unit_gpu
, "Joules");
1932 RAPL_EVENT_ATTR_STR(energy
-psys
.unit
, rapl_unit_psys
, "Joules");
1934 RAPL_EVENT_ATTR_STR(energy
-cores
.scale
, rapl_scale_cores
, "2.3283064365386962890625e-10");
1935 RAPL_EVENT_ATTR_STR(energy
-pkg
.scale
, rapl_scale_pkg
, "2.3283064365386962890625e-10");
1936 RAPL_EVENT_ATTR_STR(energy
-ram
.scale
, rapl_scale_ram
, "2.3283064365386962890625e-10");
1937 RAPL_EVENT_ATTR_STR(energy
-gpu
.scale
, rapl_scale_gpu
, "2.3283064365386962890625e-10");
1938 RAPL_EVENT_ATTR_STR(energy
-psys
.scale
, rapl_scale_psys
, "2.3283064365386962890625e-10");
1940 #define RAPL_EVENT_GROUP(_name, domain) \
1941 static struct attribute *pmu_attr_##_name[] = { \
1942 &event_attr_rapl_##_name.attr.attr, \
1943 &event_attr_rapl_unit_##_name.attr.attr, \
1944 &event_attr_rapl_scale_##_name.attr.attr, \
1947 static umode_t is_visible_##_name(struct kobject *kobj, struct attribute *attr, int event) \
1949 return rapl_pmu.domain_map & BIT(domain) ? attr->mode : 0; \
1951 static struct attribute_group pmu_group_##_name = { \
1953 .attrs = pmu_attr_##_name, \
1954 .is_visible = is_visible_##_name, \
1957 RAPL_EVENT_GROUP(cores
, RAPL_DOMAIN_PP0
);
1958 RAPL_EVENT_GROUP(pkg
, RAPL_DOMAIN_PACKAGE
);
1959 RAPL_EVENT_GROUP(ram
, RAPL_DOMAIN_DRAM
);
1960 RAPL_EVENT_GROUP(gpu
, RAPL_DOMAIN_PP1
);
1961 RAPL_EVENT_GROUP(psys
, RAPL_DOMAIN_PLATFORM
);
1963 static const struct attribute_group
*pmu_attr_update
[] = {
1972 static int rapl_pmu_update(struct rapl_package
*rp
)
1976 /* Return if PMU already covers all events supported by current RAPL Package */
1977 if (rapl_pmu
.registered
&& !(rp
->domain_map
& (~rapl_pmu
.domain_map
)))
1980 /* Unregister previous registered PMU */
1981 if (rapl_pmu
.registered
)
1982 perf_pmu_unregister(&rapl_pmu
.pmu
);
1984 rapl_pmu
.registered
= false;
1985 rapl_pmu
.domain_map
|= rp
->domain_map
;
1987 memset(&rapl_pmu
.pmu
, 0, sizeof(struct pmu
));
1988 rapl_pmu
.pmu
.attr_groups
= pmu_attr_groups
;
1989 rapl_pmu
.pmu
.attr_update
= pmu_attr_update
;
1990 rapl_pmu
.pmu
.task_ctx_nr
= perf_invalid_context
;
1991 rapl_pmu
.pmu
.event_init
= rapl_pmu_event_init
;
1992 rapl_pmu
.pmu
.add
= rapl_pmu_event_add
;
1993 rapl_pmu
.pmu
.del
= rapl_pmu_event_del
;
1994 rapl_pmu
.pmu
.start
= rapl_pmu_event_start
;
1995 rapl_pmu
.pmu
.stop
= rapl_pmu_event_stop
;
1996 rapl_pmu
.pmu
.read
= rapl_pmu_event_read
;
1997 rapl_pmu
.pmu
.module
= THIS_MODULE
;
1998 rapl_pmu
.pmu
.capabilities
= PERF_PMU_CAP_NO_EXCLUDE
| PERF_PMU_CAP_NO_INTERRUPT
;
1999 ret
= perf_pmu_register(&rapl_pmu
.pmu
, "power", -1);
2001 pr_info("Failed to register PMU\n");
2005 rapl_pmu
.registered
= true;
2011 int rapl_package_add_pmu(struct rapl_package
*rp
)
2013 struct rapl_package_pmu_data
*data
= &rp
->pmu_data
;
2019 guard(cpus_read_lock
)();
2021 for (idx
= 0; idx
< rp
->nr_domains
; idx
++) {
2022 struct rapl_domain
*rd
= &rp
->domains
[idx
];
2023 int domain
= rd
->id
;
2026 if (!test_bit(domain
, &rp
->domain_map
))
2030 * The RAPL PMU granularity is 2^-32 Joules
2031 * data->scale[]: times of 2^-32 Joules for each ENERGY COUNTER increase
2033 val
= rd
->energy_unit
* (1ULL << 32);
2034 do_div(val
, ENERGY_UNIT_SCALE
* 1000000);
2035 data
->scale
[domain
] = val
;
2037 if (!rapl_pmu
.timer_ms
) {
2038 struct rapl_primitive_info
*rpi
= get_rpi(rp
, ENERGY_COUNTER
);
2041 * Calculate the timer rate:
2042 * Use reference of 200W for scaling the timeout to avoid counter
2045 * max_count = rpi->mask >> rpi->shift + 1
2046 * max_energy_pj = max_count * rd->energy_unit
2047 * max_time_sec = (max_energy_pj / 1000000000) / 200w
2049 * rapl_pmu.timer_ms = max_time_sec * 1000 / 2
2051 val
= (rpi
->mask
>> rpi
->shift
) + 1;
2052 val
*= rd
->energy_unit
;
2053 do_div(val
, 1000000 * 200 * 2);
2054 rapl_pmu
.timer_ms
= val
;
2056 pr_debug("%llu ms overflow timer\n", rapl_pmu
.timer_ms
);
2059 pr_debug("Domain %s: hw unit %lld * 2^-32 Joules\n", rd
->name
, data
->scale
[domain
]);
2062 /* Initialize per package PMU data */
2063 raw_spin_lock_init(&data
->lock
);
2064 INIT_LIST_HEAD(&data
->active_list
);
2065 data
->timer_interval
= ms_to_ktime(rapl_pmu
.timer_ms
);
2066 hrtimer_init(&data
->hrtimer
, CLOCK_MONOTONIC
, HRTIMER_MODE_REL
);
2067 data
->hrtimer
.function
= rapl_hrtimer_handle
;
2069 return rapl_pmu_update(rp
);
2071 EXPORT_SYMBOL_GPL(rapl_package_add_pmu
);
2073 void rapl_package_remove_pmu(struct rapl_package
*rp
)
2075 struct rapl_package
*pos
;
2080 guard(cpus_read_lock
)();
2082 list_for_each_entry(pos
, &rapl_packages
, plist
) {
2083 /* PMU is still needed */
2084 if (pos
->has_pmu
&& pos
!= rp
)
2088 perf_pmu_unregister(&rapl_pmu
.pmu
);
2089 memset(&rapl_pmu
, 0, sizeof(struct rapl_pmu
));
2091 EXPORT_SYMBOL_GPL(rapl_package_remove_pmu
);
2094 /* called from CPU hotplug notifier, hotplug lock held */
2095 void rapl_remove_package_cpuslocked(struct rapl_package
*rp
)
2097 struct rapl_domain
*rd
, *rd_package
= NULL
;
2099 package_power_limit_irq_restore(rp
);
2101 for (rd
= rp
->domains
; rd
< rp
->domains
+ rp
->nr_domains
; rd
++) {
2104 for (i
= POWER_LIMIT1
; i
< NR_POWER_LIMITS
; i
++) {
2105 rapl_write_pl_data(rd
, i
, PL_ENABLE
, 0);
2106 rapl_write_pl_data(rd
, i
, PL_CLAMP
, 0);
2109 if (rd
->id
== RAPL_DOMAIN_PACKAGE
) {
2113 pr_debug("remove package, undo power limit on %s: %s\n",
2114 rp
->name
, rd
->name
);
2115 powercap_unregister_zone(rp
->priv
->control_type
,
2118 /* do parent zone last */
2119 powercap_unregister_zone(rp
->priv
->control_type
,
2120 &rd_package
->power_zone
);
2121 list_del(&rp
->plist
);
2124 EXPORT_SYMBOL_GPL(rapl_remove_package_cpuslocked
);
2126 void rapl_remove_package(struct rapl_package
*rp
)
2128 guard(cpus_read_lock
)();
2129 rapl_remove_package_cpuslocked(rp
);
2131 EXPORT_SYMBOL_GPL(rapl_remove_package
);
2134 * RAPL Package energy counter scope:
2135 * 1. AMD/HYGON platforms use per-PKG package energy counter
2136 * 2. For Intel platforms
2137 * 2.1 CLX-AP platform has per-DIE package energy counter
2138 * 2.2 Other platforms that uses MSR RAPL are single die systems so the
2139 * package energy counter can be considered as per-PKG/per-DIE,
2140 * here it is considered as per-DIE.
2141 * 2.3 New platforms that use TPMI RAPL doesn't care about the
2142 * scope because they are not MSR/CPU based.
2144 #define rapl_msrs_are_pkg_scope() \
2145 (boot_cpu_data.x86_vendor == X86_VENDOR_AMD || \
2146 boot_cpu_data.x86_vendor == X86_VENDOR_HYGON)
2148 /* caller to ensure CPU hotplug lock is held */
2149 struct rapl_package
*rapl_find_package_domain_cpuslocked(int id
, struct rapl_if_priv
*priv
,
2152 struct rapl_package
*rp
;
2156 uid
= rapl_msrs_are_pkg_scope() ?
2157 topology_physical_package_id(id
) : topology_logical_die_id(id
);
2159 pr_err("topology_logical_(package/die)_id() returned a negative value");
2166 list_for_each_entry(rp
, &rapl_packages
, plist
) {
2168 && rp
->priv
->control_type
== priv
->control_type
)
2174 EXPORT_SYMBOL_GPL(rapl_find_package_domain_cpuslocked
);
2176 struct rapl_package
*rapl_find_package_domain(int id
, struct rapl_if_priv
*priv
, bool id_is_cpu
)
2178 guard(cpus_read_lock
)();
2179 return rapl_find_package_domain_cpuslocked(id
, priv
, id_is_cpu
);
2181 EXPORT_SYMBOL_GPL(rapl_find_package_domain
);
2183 /* called from CPU hotplug notifier, hotplug lock held */
2184 struct rapl_package
*rapl_add_package_cpuslocked(int id
, struct rapl_if_priv
*priv
, bool id_is_cpu
)
2186 struct rapl_package
*rp
;
2189 rp
= kzalloc(sizeof(struct rapl_package
), GFP_KERNEL
);
2191 return ERR_PTR(-ENOMEM
);
2194 rp
->id
= rapl_msrs_are_pkg_scope() ?
2195 topology_physical_package_id(id
) : topology_logical_die_id(id
);
2196 if ((int)(rp
->id
) < 0) {
2197 pr_err("topology_logical_(package/die)_id() returned a negative value");
2198 return ERR_PTR(-EINVAL
);
2201 if (!rapl_msrs_are_pkg_scope() && topology_max_dies_per_package() > 1)
2202 snprintf(rp
->name
, PACKAGE_DOMAIN_NAME_LENGTH
, "package-%d-die-%d",
2203 topology_physical_package_id(id
), topology_die_id(id
));
2205 snprintf(rp
->name
, PACKAGE_DOMAIN_NAME_LENGTH
, "package-%d",
2206 topology_physical_package_id(id
));
2210 snprintf(rp
->name
, PACKAGE_DOMAIN_NAME_LENGTH
, "package-%d", id
);
2214 ret
= rapl_config(rp
);
2216 goto err_free_package
;
2218 /* check if the package contains valid domains */
2219 if (rapl_detect_domains(rp
)) {
2221 goto err_free_package
;
2223 ret
= rapl_package_register_powercap(rp
);
2225 INIT_LIST_HEAD(&rp
->plist
);
2226 list_add(&rp
->plist
, &rapl_packages
);
2233 return ERR_PTR(ret
);
2235 EXPORT_SYMBOL_GPL(rapl_add_package_cpuslocked
);
2237 struct rapl_package
*rapl_add_package(int id
, struct rapl_if_priv
*priv
, bool id_is_cpu
)
2239 guard(cpus_read_lock
)();
2240 return rapl_add_package_cpuslocked(id
, priv
, id_is_cpu
);
2242 EXPORT_SYMBOL_GPL(rapl_add_package
);
2244 static void power_limit_state_save(void)
2246 struct rapl_package
*rp
;
2247 struct rapl_domain
*rd
;
2251 list_for_each_entry(rp
, &rapl_packages
, plist
) {
2252 if (!rp
->power_zone
)
2254 rd
= power_zone_to_rapl_domain(rp
->power_zone
);
2255 for (i
= POWER_LIMIT1
; i
< NR_POWER_LIMITS
; i
++) {
2256 ret
= rapl_read_pl_data(rd
, i
, PL_LIMIT
, true,
2257 &rd
->rpl
[i
].last_power_limit
);
2259 rd
->rpl
[i
].last_power_limit
= 0;
2265 static void power_limit_state_restore(void)
2267 struct rapl_package
*rp
;
2268 struct rapl_domain
*rd
;
2272 list_for_each_entry(rp
, &rapl_packages
, plist
) {
2273 if (!rp
->power_zone
)
2275 rd
= power_zone_to_rapl_domain(rp
->power_zone
);
2276 for (i
= POWER_LIMIT1
; i
< NR_POWER_LIMITS
; i
++)
2277 if (rd
->rpl
[i
].last_power_limit
)
2278 rapl_write_pl_data(rd
, i
, PL_LIMIT
,
2279 rd
->rpl
[i
].last_power_limit
);
2284 static int rapl_pm_callback(struct notifier_block
*nb
,
2285 unsigned long mode
, void *_unused
)
2288 case PM_SUSPEND_PREPARE
:
2289 power_limit_state_save();
2291 case PM_POST_SUSPEND
:
2292 power_limit_state_restore();
2298 static struct notifier_block rapl_pm_notifier
= {
2299 .notifier_call
= rapl_pm_callback
,
2302 static struct platform_device
*rapl_msr_platdev
;
2304 static int __init
rapl_init(void)
2306 const struct x86_cpu_id
*id
;
2309 id
= x86_match_cpu(rapl_ids
);
2311 defaults_msr
= (struct rapl_defaults
*)id
->driver_data
;
2313 rapl_msr_platdev
= platform_device_alloc("intel_rapl_msr", 0);
2314 if (!rapl_msr_platdev
)
2317 ret
= platform_device_add(rapl_msr_platdev
);
2319 platform_device_put(rapl_msr_platdev
);
2324 ret
= register_pm_notifier(&rapl_pm_notifier
);
2325 if (ret
&& rapl_msr_platdev
) {
2326 platform_device_del(rapl_msr_platdev
);
2327 platform_device_put(rapl_msr_platdev
);
2333 static void __exit
rapl_exit(void)
2335 platform_device_unregister(rapl_msr_platdev
);
2336 unregister_pm_notifier(&rapl_pm_notifier
);
2339 fs_initcall(rapl_init
);
2340 module_exit(rapl_exit
);
2342 MODULE_DESCRIPTION("Intel Runtime Average Power Limit (RAPL) common code");
2343 MODULE_AUTHOR("Jacob Pan <jacob.jun.pan@intel.com>");
2344 MODULE_LICENSE("GPL v2");