2 * Intel Running Average Power Limit (RAPL) Driver
3 * Copyright (c) 2013, Intel Corporation.
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
9 * This program is distributed in the hope it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
14 * You should have received a copy of the GNU General Public License along with
15 * this program; if not, write to the Free Software Foundation, Inc.
18 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
20 #include <linux/kernel.h>
21 #include <linux/module.h>
22 #include <linux/list.h>
23 #include <linux/types.h>
24 #include <linux/device.h>
25 #include <linux/slab.h>
26 #include <linux/log2.h>
27 #include <linux/bitmap.h>
28 #include <linux/delay.h>
29 #include <linux/sysfs.h>
30 #include <linux/cpu.h>
31 #include <linux/powercap.h>
32 #include <asm/iosf_mbi.h>
34 #include <asm/processor.h>
35 #include <asm/cpu_device_id.h>
36 #include <asm/intel-family.h>
39 #define MSR_PLATFORM_POWER_LIMIT 0x0000065C
41 /* bitmasks for RAPL MSRs, used by primitive access functions */
42 #define ENERGY_STATUS_MASK 0xffffffff
44 #define POWER_LIMIT1_MASK 0x7FFF
45 #define POWER_LIMIT1_ENABLE BIT(15)
46 #define POWER_LIMIT1_CLAMP BIT(16)
48 #define POWER_LIMIT2_MASK (0x7FFFULL<<32)
49 #define POWER_LIMIT2_ENABLE BIT_ULL(47)
50 #define POWER_LIMIT2_CLAMP BIT_ULL(48)
51 #define POWER_PACKAGE_LOCK BIT_ULL(63)
52 #define POWER_PP_LOCK BIT(31)
54 #define TIME_WINDOW1_MASK (0x7FULL<<17)
55 #define TIME_WINDOW2_MASK (0x7FULL<<49)
57 #define POWER_UNIT_OFFSET 0
58 #define POWER_UNIT_MASK 0x0F
60 #define ENERGY_UNIT_OFFSET 0x08
61 #define ENERGY_UNIT_MASK 0x1F00
63 #define TIME_UNIT_OFFSET 0x10
64 #define TIME_UNIT_MASK 0xF0000
66 #define POWER_INFO_MAX_MASK (0x7fffULL<<32)
67 #define POWER_INFO_MIN_MASK (0x7fffULL<<16)
68 #define POWER_INFO_MAX_TIME_WIN_MASK (0x3fULL<<48)
69 #define POWER_INFO_THERMAL_SPEC_MASK 0x7fff
71 #define PERF_STATUS_THROTTLE_TIME_MASK 0xffffffff
72 #define PP_POLICY_MASK 0x1F
74 /* Non HW constants */
75 #define RAPL_PRIMITIVE_DERIVED BIT(1) /* not from raw data */
76 #define RAPL_PRIMITIVE_DUMMY BIT(2)
78 #define TIME_WINDOW_MAX_MSEC 40000
79 #define TIME_WINDOW_MIN_MSEC 250
80 #define ENERGY_UNIT_SCALE 1000 /* scale from driver unit to powercap unit */
82 ARBITRARY_UNIT
, /* no translation */
88 enum rapl_domain_type
{
89 RAPL_DOMAIN_PACKAGE
, /* entire package/socket */
90 RAPL_DOMAIN_PP0
, /* core power plane */
91 RAPL_DOMAIN_PP1
, /* graphics uncore */
92 RAPL_DOMAIN_DRAM
,/* DRAM control_type */
93 RAPL_DOMAIN_PLATFORM
, /* PSys control_type */
97 enum rapl_domain_msr_id
{
98 RAPL_DOMAIN_MSR_LIMIT
,
99 RAPL_DOMAIN_MSR_STATUS
,
100 RAPL_DOMAIN_MSR_PERF
,
101 RAPL_DOMAIN_MSR_POLICY
,
102 RAPL_DOMAIN_MSR_INFO
,
106 /* per domain data, some are optional */
107 enum rapl_primitives
{
113 PL1_ENABLE
, /* power limit 1, aka long term */
114 PL1_CLAMP
, /* allow frequency to go below OS request */
115 PL2_ENABLE
, /* power limit 2, aka short term, instantaneous */
118 TIME_WINDOW1
, /* long term */
119 TIME_WINDOW2
, /* short term */
128 /* below are not raw primitive data */
133 #define NR_RAW_PRIMITIVES (NR_RAPL_PRIMITIVES - 2)
135 /* Can be expanded to include events, etc.*/
136 struct rapl_domain_data
{
137 u64 primitives
[NR_RAPL_PRIMITIVES
];
138 unsigned long timestamp
;
148 #define DOMAIN_STATE_INACTIVE BIT(0)
149 #define DOMAIN_STATE_POWER_LIMIT_SET BIT(1)
150 #define DOMAIN_STATE_BIOS_LOCKED BIT(2)
152 #define NR_POWER_LIMITS (2)
153 struct rapl_power_limit
{
154 struct powercap_zone_constraint
*constraint
;
155 int prim_id
; /* primitive ID used to enable */
156 struct rapl_domain
*domain
;
160 static const char pl1_name
[] = "long_term";
161 static const char pl2_name
[] = "short_term";
166 enum rapl_domain_type id
;
167 int msrs
[RAPL_DOMAIN_MSR_MAX
];
168 struct powercap_zone power_zone
;
169 struct rapl_domain_data rdd
;
170 struct rapl_power_limit rpl
[NR_POWER_LIMITS
];
171 u64 attr_map
; /* track capabilities */
173 unsigned int domain_energy_unit
;
174 struct rapl_package
*rp
;
176 #define power_zone_to_rapl_domain(_zone) \
177 container_of(_zone, struct rapl_domain, power_zone)
180 /* Each physical package contains multiple domains, these are the common
181 * data across RAPL domains within a package.
183 struct rapl_package
{
184 unsigned int id
; /* physical package/socket id */
185 unsigned int nr_domains
;
186 unsigned long domain_map
; /* bit map of active domains */
187 unsigned int power_unit
;
188 unsigned int energy_unit
;
189 unsigned int time_unit
;
190 struct rapl_domain
*domains
; /* array of domains, sized at runtime */
191 struct powercap_zone
*power_zone
; /* keep track of parent zone */
192 unsigned long power_limit_irq
; /* keep track of package power limit
193 * notify interrupt enable status.
195 struct list_head plist
;
196 int lead_cpu
; /* one active cpu per package for access */
197 /* Track active cpus */
198 struct cpumask cpumask
;
201 struct rapl_defaults
{
202 u8 floor_freq_reg_addr
;
203 int (*check_unit
)(struct rapl_package
*rp
, int cpu
);
204 void (*set_floor_freq
)(struct rapl_domain
*rd
, bool mode
);
205 u64 (*compute_time_window
)(struct rapl_package
*rp
, u64 val
,
207 unsigned int dram_domain_energy_unit
;
209 static struct rapl_defaults
*rapl_defaults
;
211 /* Sideband MBI registers */
212 #define IOSF_CPU_POWER_BUDGET_CTL_BYT (0x2)
213 #define IOSF_CPU_POWER_BUDGET_CTL_TNG (0xdf)
215 #define PACKAGE_PLN_INT_SAVED BIT(0)
216 #define MAX_PRIM_NAME (32)
218 /* per domain data. used to describe individual knobs such that access function
219 * can be consolidated into one instead of many inline functions.
221 struct rapl_primitive_info
{
225 enum rapl_domain_msr_id id
;
230 #define PRIMITIVE_INFO_INIT(p, m, s, i, u, f) { \
239 static void rapl_init_domains(struct rapl_package
*rp
);
240 static int rapl_read_data_raw(struct rapl_domain
*rd
,
241 enum rapl_primitives prim
,
242 bool xlate
, u64
*data
);
243 static int rapl_write_data_raw(struct rapl_domain
*rd
,
244 enum rapl_primitives prim
,
245 unsigned long long value
);
246 static u64
rapl_unit_xlate(struct rapl_domain
*rd
,
247 enum unit_type type
, u64 value
,
249 static void package_power_limit_irq_save(struct rapl_package
*rp
);
251 static LIST_HEAD(rapl_packages
); /* guarded by CPU hotplug lock */
253 static const char * const rapl_domain_names
[] = {
261 static struct powercap_control_type
*control_type
; /* PowerCap Controller */
262 static struct rapl_domain
*platform_rapl_domain
; /* Platform (PSys) domain */
264 /* caller to ensure CPU hotplug lock is held */
265 static struct rapl_package
*find_package_by_id(int id
)
267 struct rapl_package
*rp
;
269 list_for_each_entry(rp
, &rapl_packages
, plist
) {
277 static int get_energy_counter(struct powercap_zone
*power_zone
, u64
*energy_raw
)
279 struct rapl_domain
*rd
;
282 /* prevent CPU hotplug, make sure the RAPL domain does not go
283 * away while reading the counter.
286 rd
= power_zone_to_rapl_domain(power_zone
);
288 if (!rapl_read_data_raw(rd
, ENERGY_COUNTER
, true, &energy_now
)) {
289 *energy_raw
= energy_now
;
299 static int get_max_energy_counter(struct powercap_zone
*pcd_dev
, u64
*energy
)
301 struct rapl_domain
*rd
= power_zone_to_rapl_domain(pcd_dev
);
303 *energy
= rapl_unit_xlate(rd
, ENERGY_UNIT
, ENERGY_STATUS_MASK
, 0);
307 static int release_zone(struct powercap_zone
*power_zone
)
309 struct rapl_domain
*rd
= power_zone_to_rapl_domain(power_zone
);
310 struct rapl_package
*rp
= rd
->rp
;
312 /* package zone is the last zone of a package, we can free
313 * memory here since all children have been unregistered.
315 if (rd
->id
== RAPL_DOMAIN_PACKAGE
) {
324 static int find_nr_power_limit(struct rapl_domain
*rd
)
328 for (i
= 0; i
< NR_POWER_LIMITS
; i
++) {
336 static int set_domain_enable(struct powercap_zone
*power_zone
, bool mode
)
338 struct rapl_domain
*rd
= power_zone_to_rapl_domain(power_zone
);
340 if (rd
->state
& DOMAIN_STATE_BIOS_LOCKED
)
344 rapl_write_data_raw(rd
, PL1_ENABLE
, mode
);
345 if (rapl_defaults
->set_floor_freq
)
346 rapl_defaults
->set_floor_freq(rd
, mode
);
352 static int get_domain_enable(struct powercap_zone
*power_zone
, bool *mode
)
354 struct rapl_domain
*rd
= power_zone_to_rapl_domain(power_zone
);
357 if (rd
->state
& DOMAIN_STATE_BIOS_LOCKED
) {
362 if (rapl_read_data_raw(rd
, PL1_ENABLE
, true, &val
)) {
372 /* per RAPL domain ops, in the order of rapl_domain_type */
373 static const struct powercap_zone_ops zone_ops
[] = {
374 /* RAPL_DOMAIN_PACKAGE */
376 .get_energy_uj
= get_energy_counter
,
377 .get_max_energy_range_uj
= get_max_energy_counter
,
378 .release
= release_zone
,
379 .set_enable
= set_domain_enable
,
380 .get_enable
= get_domain_enable
,
382 /* RAPL_DOMAIN_PP0 */
384 .get_energy_uj
= get_energy_counter
,
385 .get_max_energy_range_uj
= get_max_energy_counter
,
386 .release
= release_zone
,
387 .set_enable
= set_domain_enable
,
388 .get_enable
= get_domain_enable
,
390 /* RAPL_DOMAIN_PP1 */
392 .get_energy_uj
= get_energy_counter
,
393 .get_max_energy_range_uj
= get_max_energy_counter
,
394 .release
= release_zone
,
395 .set_enable
= set_domain_enable
,
396 .get_enable
= get_domain_enable
,
398 /* RAPL_DOMAIN_DRAM */
400 .get_energy_uj
= get_energy_counter
,
401 .get_max_energy_range_uj
= get_max_energy_counter
,
402 .release
= release_zone
,
403 .set_enable
= set_domain_enable
,
404 .get_enable
= get_domain_enable
,
406 /* RAPL_DOMAIN_PLATFORM */
408 .get_energy_uj
= get_energy_counter
,
409 .get_max_energy_range_uj
= get_max_energy_counter
,
410 .release
= release_zone
,
411 .set_enable
= set_domain_enable
,
412 .get_enable
= get_domain_enable
,
418 * Constraint index used by powercap can be different than power limit (PL)
419 * index in that some PLs may be missing due to non-existent MSRs. So we
420 * need to convert here by finding the valid PLs only (name populated).
422 static int contraint_to_pl(struct rapl_domain
*rd
, int cid
)
426 for (i
= 0, j
= 0; i
< NR_POWER_LIMITS
; i
++) {
427 if ((rd
->rpl
[i
].name
) && j
++ == cid
) {
428 pr_debug("%s: index %d\n", __func__
, i
);
432 pr_err("Cannot find matching power limit for constraint %d\n", cid
);
437 static int set_power_limit(struct powercap_zone
*power_zone
, int cid
,
440 struct rapl_domain
*rd
;
441 struct rapl_package
*rp
;
446 rd
= power_zone_to_rapl_domain(power_zone
);
447 id
= contraint_to_pl(rd
, cid
);
455 if (rd
->state
& DOMAIN_STATE_BIOS_LOCKED
) {
456 dev_warn(&power_zone
->dev
, "%s locked by BIOS, monitoring only\n",
462 switch (rd
->rpl
[id
].prim_id
) {
464 rapl_write_data_raw(rd
, POWER_LIMIT1
, power_limit
);
467 rapl_write_data_raw(rd
, POWER_LIMIT2
, power_limit
);
473 package_power_limit_irq_save(rp
);
479 static int get_current_power_limit(struct powercap_zone
*power_zone
, int cid
,
482 struct rapl_domain
*rd
;
489 rd
= power_zone_to_rapl_domain(power_zone
);
490 id
= contraint_to_pl(rd
, cid
);
496 switch (rd
->rpl
[id
].prim_id
) {
507 if (rapl_read_data_raw(rd
, prim
, true, &val
))
518 static int set_time_window(struct powercap_zone
*power_zone
, int cid
,
521 struct rapl_domain
*rd
;
526 rd
= power_zone_to_rapl_domain(power_zone
);
527 id
= contraint_to_pl(rd
, cid
);
533 switch (rd
->rpl
[id
].prim_id
) {
535 rapl_write_data_raw(rd
, TIME_WINDOW1
, window
);
538 rapl_write_data_raw(rd
, TIME_WINDOW2
, window
);
549 static int get_time_window(struct powercap_zone
*power_zone
, int cid
, u64
*data
)
551 struct rapl_domain
*rd
;
557 rd
= power_zone_to_rapl_domain(power_zone
);
558 id
= contraint_to_pl(rd
, cid
);
564 switch (rd
->rpl
[id
].prim_id
) {
566 ret
= rapl_read_data_raw(rd
, TIME_WINDOW1
, true, &val
);
569 ret
= rapl_read_data_raw(rd
, TIME_WINDOW2
, true, &val
);
584 static const char *get_constraint_name(struct powercap_zone
*power_zone
, int cid
)
586 struct rapl_domain
*rd
;
589 rd
= power_zone_to_rapl_domain(power_zone
);
590 id
= contraint_to_pl(rd
, cid
);
592 return rd
->rpl
[id
].name
;
598 static int get_max_power(struct powercap_zone
*power_zone
, int id
,
601 struct rapl_domain
*rd
;
607 rd
= power_zone_to_rapl_domain(power_zone
);
608 switch (rd
->rpl
[id
].prim_id
) {
610 prim
= THERMAL_SPEC_POWER
;
619 if (rapl_read_data_raw(rd
, prim
, true, &val
))
629 static const struct powercap_zone_constraint_ops constraint_ops
= {
630 .set_power_limit_uw
= set_power_limit
,
631 .get_power_limit_uw
= get_current_power_limit
,
632 .set_time_window_us
= set_time_window
,
633 .get_time_window_us
= get_time_window
,
634 .get_max_power_uw
= get_max_power
,
635 .get_name
= get_constraint_name
,
638 /* called after domain detection and package level data are set */
639 static void rapl_init_domains(struct rapl_package
*rp
)
642 struct rapl_domain
*rd
= rp
->domains
;
644 for (i
= 0; i
< RAPL_DOMAIN_MAX
; i
++) {
645 unsigned int mask
= rp
->domain_map
& (1 << i
);
647 case BIT(RAPL_DOMAIN_PACKAGE
):
648 rd
->name
= rapl_domain_names
[RAPL_DOMAIN_PACKAGE
];
649 rd
->id
= RAPL_DOMAIN_PACKAGE
;
650 rd
->msrs
[0] = MSR_PKG_POWER_LIMIT
;
651 rd
->msrs
[1] = MSR_PKG_ENERGY_STATUS
;
652 rd
->msrs
[2] = MSR_PKG_PERF_STATUS
;
654 rd
->msrs
[4] = MSR_PKG_POWER_INFO
;
655 rd
->rpl
[0].prim_id
= PL1_ENABLE
;
656 rd
->rpl
[0].name
= pl1_name
;
657 rd
->rpl
[1].prim_id
= PL2_ENABLE
;
658 rd
->rpl
[1].name
= pl2_name
;
660 case BIT(RAPL_DOMAIN_PP0
):
661 rd
->name
= rapl_domain_names
[RAPL_DOMAIN_PP0
];
662 rd
->id
= RAPL_DOMAIN_PP0
;
663 rd
->msrs
[0] = MSR_PP0_POWER_LIMIT
;
664 rd
->msrs
[1] = MSR_PP0_ENERGY_STATUS
;
666 rd
->msrs
[3] = MSR_PP0_POLICY
;
668 rd
->rpl
[0].prim_id
= PL1_ENABLE
;
669 rd
->rpl
[0].name
= pl1_name
;
671 case BIT(RAPL_DOMAIN_PP1
):
672 rd
->name
= rapl_domain_names
[RAPL_DOMAIN_PP1
];
673 rd
->id
= RAPL_DOMAIN_PP1
;
674 rd
->msrs
[0] = MSR_PP1_POWER_LIMIT
;
675 rd
->msrs
[1] = MSR_PP1_ENERGY_STATUS
;
677 rd
->msrs
[3] = MSR_PP1_POLICY
;
679 rd
->rpl
[0].prim_id
= PL1_ENABLE
;
680 rd
->rpl
[0].name
= pl1_name
;
682 case BIT(RAPL_DOMAIN_DRAM
):
683 rd
->name
= rapl_domain_names
[RAPL_DOMAIN_DRAM
];
684 rd
->id
= RAPL_DOMAIN_DRAM
;
685 rd
->msrs
[0] = MSR_DRAM_POWER_LIMIT
;
686 rd
->msrs
[1] = MSR_DRAM_ENERGY_STATUS
;
687 rd
->msrs
[2] = MSR_DRAM_PERF_STATUS
;
689 rd
->msrs
[4] = MSR_DRAM_POWER_INFO
;
690 rd
->rpl
[0].prim_id
= PL1_ENABLE
;
691 rd
->rpl
[0].name
= pl1_name
;
692 rd
->domain_energy_unit
=
693 rapl_defaults
->dram_domain_energy_unit
;
694 if (rd
->domain_energy_unit
)
695 pr_info("DRAM domain energy unit %dpj\n",
696 rd
->domain_energy_unit
);
706 static u64
rapl_unit_xlate(struct rapl_domain
*rd
, enum unit_type type
,
707 u64 value
, int to_raw
)
710 struct rapl_package
*rp
= rd
->rp
;
715 units
= rp
->power_unit
;
718 scale
= ENERGY_UNIT_SCALE
;
719 /* per domain unit takes precedence */
720 if (rd
->domain_energy_unit
)
721 units
= rd
->domain_energy_unit
;
723 units
= rp
->energy_unit
;
726 return rapl_defaults
->compute_time_window(rp
, value
, to_raw
);
733 return div64_u64(value
, units
) * scale
;
737 return div64_u64(value
, scale
);
740 /* in the order of enum rapl_primitives */
741 static struct rapl_primitive_info rpi
[] = {
742 /* name, mask, shift, msr index, unit divisor */
743 PRIMITIVE_INFO_INIT(ENERGY_COUNTER
, ENERGY_STATUS_MASK
, 0,
744 RAPL_DOMAIN_MSR_STATUS
, ENERGY_UNIT
, 0),
745 PRIMITIVE_INFO_INIT(POWER_LIMIT1
, POWER_LIMIT1_MASK
, 0,
746 RAPL_DOMAIN_MSR_LIMIT
, POWER_UNIT
, 0),
747 PRIMITIVE_INFO_INIT(POWER_LIMIT2
, POWER_LIMIT2_MASK
, 32,
748 RAPL_DOMAIN_MSR_LIMIT
, POWER_UNIT
, 0),
749 PRIMITIVE_INFO_INIT(FW_LOCK
, POWER_PP_LOCK
, 31,
750 RAPL_DOMAIN_MSR_LIMIT
, ARBITRARY_UNIT
, 0),
751 PRIMITIVE_INFO_INIT(PL1_ENABLE
, POWER_LIMIT1_ENABLE
, 15,
752 RAPL_DOMAIN_MSR_LIMIT
, ARBITRARY_UNIT
, 0),
753 PRIMITIVE_INFO_INIT(PL1_CLAMP
, POWER_LIMIT1_CLAMP
, 16,
754 RAPL_DOMAIN_MSR_LIMIT
, ARBITRARY_UNIT
, 0),
755 PRIMITIVE_INFO_INIT(PL2_ENABLE
, POWER_LIMIT2_ENABLE
, 47,
756 RAPL_DOMAIN_MSR_LIMIT
, ARBITRARY_UNIT
, 0),
757 PRIMITIVE_INFO_INIT(PL2_CLAMP
, POWER_LIMIT2_CLAMP
, 48,
758 RAPL_DOMAIN_MSR_LIMIT
, ARBITRARY_UNIT
, 0),
759 PRIMITIVE_INFO_INIT(TIME_WINDOW1
, TIME_WINDOW1_MASK
, 17,
760 RAPL_DOMAIN_MSR_LIMIT
, TIME_UNIT
, 0),
761 PRIMITIVE_INFO_INIT(TIME_WINDOW2
, TIME_WINDOW2_MASK
, 49,
762 RAPL_DOMAIN_MSR_LIMIT
, TIME_UNIT
, 0),
763 PRIMITIVE_INFO_INIT(THERMAL_SPEC_POWER
, POWER_INFO_THERMAL_SPEC_MASK
,
764 0, RAPL_DOMAIN_MSR_INFO
, POWER_UNIT
, 0),
765 PRIMITIVE_INFO_INIT(MAX_POWER
, POWER_INFO_MAX_MASK
, 32,
766 RAPL_DOMAIN_MSR_INFO
, POWER_UNIT
, 0),
767 PRIMITIVE_INFO_INIT(MIN_POWER
, POWER_INFO_MIN_MASK
, 16,
768 RAPL_DOMAIN_MSR_INFO
, POWER_UNIT
, 0),
769 PRIMITIVE_INFO_INIT(MAX_TIME_WINDOW
, POWER_INFO_MAX_TIME_WIN_MASK
, 48,
770 RAPL_DOMAIN_MSR_INFO
, TIME_UNIT
, 0),
771 PRIMITIVE_INFO_INIT(THROTTLED_TIME
, PERF_STATUS_THROTTLE_TIME_MASK
, 0,
772 RAPL_DOMAIN_MSR_PERF
, TIME_UNIT
, 0),
773 PRIMITIVE_INFO_INIT(PRIORITY_LEVEL
, PP_POLICY_MASK
, 0,
774 RAPL_DOMAIN_MSR_POLICY
, ARBITRARY_UNIT
, 0),
776 PRIMITIVE_INFO_INIT(AVERAGE_POWER
, 0, 0, 0, POWER_UNIT
,
777 RAPL_PRIMITIVE_DERIVED
),
781 /* Read primitive data based on its related struct rapl_primitive_info.
782 * if xlate flag is set, return translated data based on data units, i.e.
783 * time, energy, and power.
784 * RAPL MSRs are non-architectural and are not laid out consistently across
785 * domains. Here we use primitive info to allow writing consolidated access
787 * For a given primitive, it is processed by MSR mask and shift. Unit conversion
788 * is pre-assigned based on RAPL unit MSRs read at init time.
789 * 63-------------------------- 31--------------------------- 0
791 * | |<- shift ----------------|
792 * 63-------------------------- 31--------------------------- 0
794 static int rapl_read_data_raw(struct rapl_domain
*rd
,
795 enum rapl_primitives prim
,
796 bool xlate
, u64
*data
)
800 struct rapl_primitive_info
*rp
= &rpi
[prim
];
803 if (!rp
->name
|| rp
->flag
& RAPL_PRIMITIVE_DUMMY
)
806 msr
= rd
->msrs
[rp
->id
];
810 cpu
= rd
->rp
->lead_cpu
;
812 /* special-case package domain, which uses a different bit*/
813 if (prim
== FW_LOCK
&& rd
->id
== RAPL_DOMAIN_PACKAGE
) {
814 rp
->mask
= POWER_PACKAGE_LOCK
;
817 /* non-hardware data are collected by the polling thread */
818 if (rp
->flag
& RAPL_PRIMITIVE_DERIVED
) {
819 *data
= rd
->rdd
.primitives
[prim
];
823 if (rdmsrl_safe_on_cpu(cpu
, msr
, &value
)) {
824 pr_debug("failed to read msr 0x%x on cpu %d\n", msr
, cpu
);
828 final
= value
& rp
->mask
;
829 final
= final
>> rp
->shift
;
831 *data
= rapl_unit_xlate(rd
, rp
->unit
, final
, 0);
839 static int msrl_update_safe(u32 msr_no
, u64 clear_mask
, u64 set_mask
)
844 err
= rdmsrl_safe(msr_no
, &val
);
851 err
= wrmsrl_safe(msr_no
, val
);
857 static void msrl_update_func(void *info
)
859 struct msrl_action
*ma
= info
;
861 ma
->err
= msrl_update_safe(ma
->msr_no
, ma
->clear_mask
, ma
->set_mask
);
864 /* Similar use of primitive info in the read counterpart */
865 static int rapl_write_data_raw(struct rapl_domain
*rd
,
866 enum rapl_primitives prim
,
867 unsigned long long value
)
869 struct rapl_primitive_info
*rp
= &rpi
[prim
];
872 struct msrl_action ma
;
875 cpu
= rd
->rp
->lead_cpu
;
876 bits
= rapl_unit_xlate(rd
, rp
->unit
, value
, 1);
880 memset(&ma
, 0, sizeof(ma
));
882 ma
.msr_no
= rd
->msrs
[rp
->id
];
883 ma
.clear_mask
= rp
->mask
;
886 ret
= smp_call_function_single(cpu
, msrl_update_func
, &ma
, 1);
896 * Raw RAPL data stored in MSRs are in certain scales. We need to
897 * convert them into standard units based on the units reported in
898 * the RAPL unit MSRs. This is specific to CPUs as the method to
899 * calculate units differs on different CPUs.
900 * We convert the units to below format based on CPUs.
902 * energy unit: picoJoules : Represented in picoJoules by default
903 * power unit : microWatts : Represented in milliWatts by default
904 * time unit : microseconds: Represented in seconds by default
906 static int rapl_check_unit_core(struct rapl_package
*rp
, int cpu
)
911 if (rdmsrl_safe_on_cpu(cpu
, MSR_RAPL_POWER_UNIT
, &msr_val
)) {
912 pr_err("Failed to read power unit MSR 0x%x on CPU %d, exit.\n",
913 MSR_RAPL_POWER_UNIT
, cpu
);
917 value
= (msr_val
& ENERGY_UNIT_MASK
) >> ENERGY_UNIT_OFFSET
;
918 rp
->energy_unit
= ENERGY_UNIT_SCALE
* 1000000 / (1 << value
);
920 value
= (msr_val
& POWER_UNIT_MASK
) >> POWER_UNIT_OFFSET
;
921 rp
->power_unit
= 1000000 / (1 << value
);
923 value
= (msr_val
& TIME_UNIT_MASK
) >> TIME_UNIT_OFFSET
;
924 rp
->time_unit
= 1000000 / (1 << value
);
926 pr_debug("Core CPU package %d energy=%dpJ, time=%dus, power=%duW\n",
927 rp
->id
, rp
->energy_unit
, rp
->time_unit
, rp
->power_unit
);
932 static int rapl_check_unit_atom(struct rapl_package
*rp
, int cpu
)
937 if (rdmsrl_safe_on_cpu(cpu
, MSR_RAPL_POWER_UNIT
, &msr_val
)) {
938 pr_err("Failed to read power unit MSR 0x%x on CPU %d, exit.\n",
939 MSR_RAPL_POWER_UNIT
, cpu
);
942 value
= (msr_val
& ENERGY_UNIT_MASK
) >> ENERGY_UNIT_OFFSET
;
943 rp
->energy_unit
= ENERGY_UNIT_SCALE
* 1 << value
;
945 value
= (msr_val
& POWER_UNIT_MASK
) >> POWER_UNIT_OFFSET
;
946 rp
->power_unit
= (1 << value
) * 1000;
948 value
= (msr_val
& TIME_UNIT_MASK
) >> TIME_UNIT_OFFSET
;
949 rp
->time_unit
= 1000000 / (1 << value
);
951 pr_debug("Atom package %d energy=%dpJ, time=%dus, power=%duW\n",
952 rp
->id
, rp
->energy_unit
, rp
->time_unit
, rp
->power_unit
);
957 static void power_limit_irq_save_cpu(void *info
)
960 struct rapl_package
*rp
= (struct rapl_package
*)info
;
962 /* save the state of PLN irq mask bit before disabling it */
963 rdmsr_safe(MSR_IA32_PACKAGE_THERM_INTERRUPT
, &l
, &h
);
964 if (!(rp
->power_limit_irq
& PACKAGE_PLN_INT_SAVED
)) {
965 rp
->power_limit_irq
= l
& PACKAGE_THERM_INT_PLN_ENABLE
;
966 rp
->power_limit_irq
|= PACKAGE_PLN_INT_SAVED
;
968 l
&= ~PACKAGE_THERM_INT_PLN_ENABLE
;
969 wrmsr_safe(MSR_IA32_PACKAGE_THERM_INTERRUPT
, l
, h
);
974 * When package power limit is set artificially low by RAPL, LVT
975 * thermal interrupt for package power limit should be ignored
976 * since we are not really exceeding the real limit. The intention
977 * is to avoid excessive interrupts while we are trying to save power.
978 * A useful feature might be routing the package_power_limit interrupt
979 * to userspace via eventfd. Once we have a use case, this is simple
980 * to do by adding an atomic notifier.
983 static void package_power_limit_irq_save(struct rapl_package
*rp
)
985 if (!boot_cpu_has(X86_FEATURE_PTS
) || !boot_cpu_has(X86_FEATURE_PLN
))
988 smp_call_function_single(rp
->lead_cpu
, power_limit_irq_save_cpu
, rp
, 1);
992 * Restore per package power limit interrupt enable state. Called from cpu
993 * hotplug code on package removal.
995 static void package_power_limit_irq_restore(struct rapl_package
*rp
)
999 if (!boot_cpu_has(X86_FEATURE_PTS
) || !boot_cpu_has(X86_FEATURE_PLN
))
1002 /* irq enable state not saved, nothing to restore */
1003 if (!(rp
->power_limit_irq
& PACKAGE_PLN_INT_SAVED
))
1006 rdmsr_safe(MSR_IA32_PACKAGE_THERM_INTERRUPT
, &l
, &h
);
1008 if (rp
->power_limit_irq
& PACKAGE_THERM_INT_PLN_ENABLE
)
1009 l
|= PACKAGE_THERM_INT_PLN_ENABLE
;
1011 l
&= ~PACKAGE_THERM_INT_PLN_ENABLE
;
1013 wrmsr_safe(MSR_IA32_PACKAGE_THERM_INTERRUPT
, l
, h
);
1016 static void set_floor_freq_default(struct rapl_domain
*rd
, bool mode
)
1018 int nr_powerlimit
= find_nr_power_limit(rd
);
1020 /* always enable clamp such that p-state can go below OS requested
1021 * range. Power capping takes priority over guaranteed frequency.
1023 rapl_write_data_raw(rd
, PL1_CLAMP
, mode
);
1025 /* some domains have pl2 */
1026 if (nr_powerlimit
> 1) {
1027 rapl_write_data_raw(rd
, PL2_ENABLE
, mode
);
1028 rapl_write_data_raw(rd
, PL2_CLAMP
, mode
);
1032 static void set_floor_freq_atom(struct rapl_domain
*rd
, bool enable
)
1034 static u32 power_ctrl_orig_val
;
1037 if (!rapl_defaults
->floor_freq_reg_addr
) {
1038 pr_err("Invalid floor frequency config register\n");
1042 if (!power_ctrl_orig_val
)
1043 iosf_mbi_read(BT_MBI_UNIT_PMC
, MBI_CR_READ
,
1044 rapl_defaults
->floor_freq_reg_addr
,
1045 &power_ctrl_orig_val
);
1046 mdata
= power_ctrl_orig_val
;
1048 mdata
&= ~(0x7f << 8);
1051 iosf_mbi_write(BT_MBI_UNIT_PMC
, MBI_CR_WRITE
,
1052 rapl_defaults
->floor_freq_reg_addr
, mdata
);
1055 static u64
rapl_compute_time_window_core(struct rapl_package
*rp
, u64 value
,
1058 u64 f
, y
; /* fraction and exp. used for time unit */
1061 * Special processing based on 2^Y*(1+F/4), refer
1062 * to Intel Software Developer's manual Vol.3B: CH 14.9.3.
1065 f
= (value
& 0x60) >> 5;
1067 value
= (1 << y
) * (4 + f
) * rp
->time_unit
/ 4;
1069 do_div(value
, rp
->time_unit
);
1071 f
= div64_u64(4 * (value
- (1 << y
)), 1 << y
);
1072 value
= (y
& 0x1f) | ((f
& 0x3) << 5);
1077 static u64
rapl_compute_time_window_atom(struct rapl_package
*rp
, u64 value
,
1081 * Atom time unit encoding is straight forward val * time_unit,
1082 * where time_unit defaults to 1 sec. Never 0.
1085 return (value
) ? value
*= rp
->time_unit
: rp
->time_unit
;
1087 value
= div64_u64(value
, rp
->time_unit
);
1092 static const struct rapl_defaults rapl_defaults_core
= {
1093 .floor_freq_reg_addr
= 0,
1094 .check_unit
= rapl_check_unit_core
,
1095 .set_floor_freq
= set_floor_freq_default
,
1096 .compute_time_window
= rapl_compute_time_window_core
,
1099 static const struct rapl_defaults rapl_defaults_hsw_server
= {
1100 .check_unit
= rapl_check_unit_core
,
1101 .set_floor_freq
= set_floor_freq_default
,
1102 .compute_time_window
= rapl_compute_time_window_core
,
1103 .dram_domain_energy_unit
= 15300,
1106 static const struct rapl_defaults rapl_defaults_byt
= {
1107 .floor_freq_reg_addr
= IOSF_CPU_POWER_BUDGET_CTL_BYT
,
1108 .check_unit
= rapl_check_unit_atom
,
1109 .set_floor_freq
= set_floor_freq_atom
,
1110 .compute_time_window
= rapl_compute_time_window_atom
,
1113 static const struct rapl_defaults rapl_defaults_tng
= {
1114 .floor_freq_reg_addr
= IOSF_CPU_POWER_BUDGET_CTL_TNG
,
1115 .check_unit
= rapl_check_unit_atom
,
1116 .set_floor_freq
= set_floor_freq_atom
,
1117 .compute_time_window
= rapl_compute_time_window_atom
,
1120 static const struct rapl_defaults rapl_defaults_ann
= {
1121 .floor_freq_reg_addr
= 0,
1122 .check_unit
= rapl_check_unit_atom
,
1123 .set_floor_freq
= NULL
,
1124 .compute_time_window
= rapl_compute_time_window_atom
,
1127 static const struct rapl_defaults rapl_defaults_cht
= {
1128 .floor_freq_reg_addr
= 0,
1129 .check_unit
= rapl_check_unit_atom
,
1130 .set_floor_freq
= NULL
,
1131 .compute_time_window
= rapl_compute_time_window_atom
,
1134 #define RAPL_CPU(_model, _ops) { \
1135 .vendor = X86_VENDOR_INTEL, \
1138 .driver_data = (kernel_ulong_t)&_ops, \
1141 static const struct x86_cpu_id rapl_ids
[] __initconst
= {
1142 RAPL_CPU(INTEL_FAM6_SANDYBRIDGE
, rapl_defaults_core
),
1143 RAPL_CPU(INTEL_FAM6_SANDYBRIDGE_X
, rapl_defaults_core
),
1145 RAPL_CPU(INTEL_FAM6_IVYBRIDGE
, rapl_defaults_core
),
1146 RAPL_CPU(INTEL_FAM6_IVYBRIDGE_X
, rapl_defaults_core
),
1148 RAPL_CPU(INTEL_FAM6_HASWELL_CORE
, rapl_defaults_core
),
1149 RAPL_CPU(INTEL_FAM6_HASWELL_ULT
, rapl_defaults_core
),
1150 RAPL_CPU(INTEL_FAM6_HASWELL_GT3E
, rapl_defaults_core
),
1151 RAPL_CPU(INTEL_FAM6_HASWELL_X
, rapl_defaults_hsw_server
),
1153 RAPL_CPU(INTEL_FAM6_BROADWELL_CORE
, rapl_defaults_core
),
1154 RAPL_CPU(INTEL_FAM6_BROADWELL_GT3E
, rapl_defaults_core
),
1155 RAPL_CPU(INTEL_FAM6_BROADWELL_XEON_D
, rapl_defaults_core
),
1156 RAPL_CPU(INTEL_FAM6_BROADWELL_X
, rapl_defaults_hsw_server
),
1158 RAPL_CPU(INTEL_FAM6_SKYLAKE_DESKTOP
, rapl_defaults_core
),
1159 RAPL_CPU(INTEL_FAM6_SKYLAKE_MOBILE
, rapl_defaults_core
),
1160 RAPL_CPU(INTEL_FAM6_SKYLAKE_X
, rapl_defaults_hsw_server
),
1161 RAPL_CPU(INTEL_FAM6_KABYLAKE_MOBILE
, rapl_defaults_core
),
1162 RAPL_CPU(INTEL_FAM6_KABYLAKE_DESKTOP
, rapl_defaults_core
),
1164 RAPL_CPU(INTEL_FAM6_ATOM_SILVERMONT1
, rapl_defaults_byt
),
1165 RAPL_CPU(INTEL_FAM6_ATOM_AIRMONT
, rapl_defaults_cht
),
1166 RAPL_CPU(INTEL_FAM6_ATOM_MERRIFIELD
, rapl_defaults_tng
),
1167 RAPL_CPU(INTEL_FAM6_ATOM_MOOREFIELD
, rapl_defaults_ann
),
1168 RAPL_CPU(INTEL_FAM6_ATOM_GOLDMONT
, rapl_defaults_core
),
1169 RAPL_CPU(INTEL_FAM6_ATOM_GEMINI_LAKE
, rapl_defaults_core
),
1170 RAPL_CPU(INTEL_FAM6_ATOM_DENVERTON
, rapl_defaults_core
),
1172 RAPL_CPU(INTEL_FAM6_XEON_PHI_KNL
, rapl_defaults_hsw_server
),
1173 RAPL_CPU(INTEL_FAM6_XEON_PHI_KNM
, rapl_defaults_hsw_server
),
1176 MODULE_DEVICE_TABLE(x86cpu
, rapl_ids
);
1178 /* Read once for all raw primitive data for domains */
1179 static void rapl_update_domain_data(struct rapl_package
*rp
)
1184 for (dmn
= 0; dmn
< rp
->nr_domains
; dmn
++) {
1185 pr_debug("update package %d domain %s data\n", rp
->id
,
1186 rp
->domains
[dmn
].name
);
1187 /* exclude non-raw primitives */
1188 for (prim
= 0; prim
< NR_RAW_PRIMITIVES
; prim
++) {
1189 if (!rapl_read_data_raw(&rp
->domains
[dmn
], prim
,
1190 rpi
[prim
].unit
, &val
))
1191 rp
->domains
[dmn
].rdd
.primitives
[prim
] = val
;
1197 static void rapl_unregister_powercap(void)
1199 if (platform_rapl_domain
) {
1200 powercap_unregister_zone(control_type
,
1201 &platform_rapl_domain
->power_zone
);
1202 kfree(platform_rapl_domain
);
1204 powercap_unregister_control_type(control_type
);
1207 static int rapl_package_register_powercap(struct rapl_package
*rp
)
1209 struct rapl_domain
*rd
;
1210 char dev_name
[17]; /* max domain name = 7 + 1 + 8 for int + 1 for null*/
1211 struct powercap_zone
*power_zone
= NULL
;
1214 /* Update the domain data of the new package */
1215 rapl_update_domain_data(rp
);
1217 /* first we register package domain as the parent zone*/
1218 for (rd
= rp
->domains
; rd
< rp
->domains
+ rp
->nr_domains
; rd
++) {
1219 if (rd
->id
== RAPL_DOMAIN_PACKAGE
) {
1220 nr_pl
= find_nr_power_limit(rd
);
1221 pr_debug("register socket %d package domain %s\n",
1223 memset(dev_name
, 0, sizeof(dev_name
));
1224 snprintf(dev_name
, sizeof(dev_name
), "%s-%d",
1226 power_zone
= powercap_register_zone(&rd
->power_zone
,
1232 if (IS_ERR(power_zone
)) {
1233 pr_debug("failed to register package, %d\n",
1235 return PTR_ERR(power_zone
);
1237 /* track parent zone in per package/socket data */
1238 rp
->power_zone
= power_zone
;
1239 /* done, only one package domain per socket */
1244 pr_err("no package domain found, unknown topology!\n");
1247 /* now register domains as children of the socket/package*/
1248 for (rd
= rp
->domains
; rd
< rp
->domains
+ rp
->nr_domains
; rd
++) {
1249 if (rd
->id
== RAPL_DOMAIN_PACKAGE
)
1251 /* number of power limits per domain varies */
1252 nr_pl
= find_nr_power_limit(rd
);
1253 power_zone
= powercap_register_zone(&rd
->power_zone
,
1254 control_type
, rd
->name
,
1256 &zone_ops
[rd
->id
], nr_pl
,
1259 if (IS_ERR(power_zone
)) {
1260 pr_debug("failed to register power_zone, %d:%s:%s\n",
1261 rp
->id
, rd
->name
, dev_name
);
1262 ret
= PTR_ERR(power_zone
);
1270 * Clean up previously initialized domains within the package if we
1271 * failed after the first domain setup.
1273 while (--rd
>= rp
->domains
) {
1274 pr_debug("unregister package %d domain %s\n", rp
->id
, rd
->name
);
1275 powercap_unregister_zone(control_type
, &rd
->power_zone
);
1281 static int __init
rapl_register_psys(void)
1283 struct rapl_domain
*rd
;
1284 struct powercap_zone
*power_zone
;
1287 if (rdmsrl_safe_on_cpu(0, MSR_PLATFORM_ENERGY_STATUS
, &val
) || !val
)
1290 if (rdmsrl_safe_on_cpu(0, MSR_PLATFORM_POWER_LIMIT
, &val
) || !val
)
1293 rd
= kzalloc(sizeof(*rd
), GFP_KERNEL
);
1297 rd
->name
= rapl_domain_names
[RAPL_DOMAIN_PLATFORM
];
1298 rd
->id
= RAPL_DOMAIN_PLATFORM
;
1299 rd
->msrs
[0] = MSR_PLATFORM_POWER_LIMIT
;
1300 rd
->msrs
[1] = MSR_PLATFORM_ENERGY_STATUS
;
1301 rd
->rpl
[0].prim_id
= PL1_ENABLE
;
1302 rd
->rpl
[0].name
= pl1_name
;
1303 rd
->rpl
[1].prim_id
= PL2_ENABLE
;
1304 rd
->rpl
[1].name
= pl2_name
;
1305 rd
->rp
= find_package_by_id(0);
1307 power_zone
= powercap_register_zone(&rd
->power_zone
, control_type
,
1309 &zone_ops
[RAPL_DOMAIN_PLATFORM
],
1310 2, &constraint_ops
);
1312 if (IS_ERR(power_zone
)) {
1314 return PTR_ERR(power_zone
);
1317 platform_rapl_domain
= rd
;
1322 static int __init
rapl_register_powercap(void)
1324 control_type
= powercap_register_control_type(NULL
, "intel-rapl", NULL
);
1325 if (IS_ERR(control_type
)) {
1326 pr_debug("failed to register powercap control_type.\n");
1327 return PTR_ERR(control_type
);
1332 static int rapl_check_domain(int cpu
, int domain
)
1338 case RAPL_DOMAIN_PACKAGE
:
1339 msr
= MSR_PKG_ENERGY_STATUS
;
1341 case RAPL_DOMAIN_PP0
:
1342 msr
= MSR_PP0_ENERGY_STATUS
;
1344 case RAPL_DOMAIN_PP1
:
1345 msr
= MSR_PP1_ENERGY_STATUS
;
1347 case RAPL_DOMAIN_DRAM
:
1348 msr
= MSR_DRAM_ENERGY_STATUS
;
1350 case RAPL_DOMAIN_PLATFORM
:
1351 /* PSYS(PLATFORM) is not a CPU domain, so avoid printng error */
1354 pr_err("invalid domain id %d\n", domain
);
1357 /* make sure domain counters are available and contains non-zero
1358 * values, otherwise skip it.
1360 if (rdmsrl_safe_on_cpu(cpu
, msr
, &val
) || !val
)
1368 * Check if power limits are available. Two cases when they are not available:
1369 * 1. Locked by BIOS, in this case we still provide read-only access so that
1370 * users can see what limit is set by the BIOS.
1371 * 2. Some CPUs make some domains monitoring only which means PLx MSRs may not
1372 * exist at all. In this case, we do not show the contraints in powercap.
1374 * Called after domains are detected and initialized.
1376 static void rapl_detect_powerlimit(struct rapl_domain
*rd
)
1381 /* check if the domain is locked by BIOS, ignore if MSR doesn't exist */
1382 if (!rapl_read_data_raw(rd
, FW_LOCK
, false, &val64
)) {
1384 pr_info("RAPL package %d domain %s locked by BIOS\n",
1385 rd
->rp
->id
, rd
->name
);
1386 rd
->state
|= DOMAIN_STATE_BIOS_LOCKED
;
1389 /* check if power limit MSRs exists, otherwise domain is monitoring only */
1390 for (i
= 0; i
< NR_POWER_LIMITS
; i
++) {
1391 int prim
= rd
->rpl
[i
].prim_id
;
1392 if (rapl_read_data_raw(rd
, prim
, false, &val64
))
1393 rd
->rpl
[i
].name
= NULL
;
1397 /* Detect active and valid domains for the given CPU, caller must
1398 * ensure the CPU belongs to the targeted package and CPU hotlug is disabled.
1400 static int rapl_detect_domains(struct rapl_package
*rp
, int cpu
)
1402 struct rapl_domain
*rd
;
1405 for (i
= 0; i
< RAPL_DOMAIN_MAX
; i
++) {
1406 /* use physical package id to read counters */
1407 if (!rapl_check_domain(cpu
, i
)) {
1408 rp
->domain_map
|= 1 << i
;
1409 pr_info("Found RAPL domain %s\n", rapl_domain_names
[i
]);
1412 rp
->nr_domains
= bitmap_weight(&rp
->domain_map
, RAPL_DOMAIN_MAX
);
1413 if (!rp
->nr_domains
) {
1414 pr_debug("no valid rapl domains found in package %d\n", rp
->id
);
1417 pr_debug("found %d domains on package %d\n", rp
->nr_domains
, rp
->id
);
1419 rp
->domains
= kcalloc(rp
->nr_domains
+ 1, sizeof(struct rapl_domain
),
1424 rapl_init_domains(rp
);
1426 for (rd
= rp
->domains
; rd
< rp
->domains
+ rp
->nr_domains
; rd
++)
1427 rapl_detect_powerlimit(rd
);
1432 /* called from CPU hotplug notifier, hotplug lock held */
1433 static void rapl_remove_package(struct rapl_package
*rp
)
1435 struct rapl_domain
*rd
, *rd_package
= NULL
;
1437 package_power_limit_irq_restore(rp
);
1439 for (rd
= rp
->domains
; rd
< rp
->domains
+ rp
->nr_domains
; rd
++) {
1440 rapl_write_data_raw(rd
, PL1_ENABLE
, 0);
1441 rapl_write_data_raw(rd
, PL1_CLAMP
, 0);
1442 if (find_nr_power_limit(rd
) > 1) {
1443 rapl_write_data_raw(rd
, PL2_ENABLE
, 0);
1444 rapl_write_data_raw(rd
, PL2_CLAMP
, 0);
1446 if (rd
->id
== RAPL_DOMAIN_PACKAGE
) {
1450 pr_debug("remove package, undo power limit on %d: %s\n",
1452 powercap_unregister_zone(control_type
, &rd
->power_zone
);
1454 /* do parent zone last */
1455 powercap_unregister_zone(control_type
, &rd_package
->power_zone
);
1456 list_del(&rp
->plist
);
1460 /* called from CPU hotplug notifier, hotplug lock held */
1461 static struct rapl_package
*rapl_add_package(int cpu
, int pkgid
)
1463 struct rapl_package
*rp
;
1466 rp
= kzalloc(sizeof(struct rapl_package
), GFP_KERNEL
);
1468 return ERR_PTR(-ENOMEM
);
1470 /* add the new package to the list */
1474 /* check if the package contains valid domains */
1475 if (rapl_detect_domains(rp
, cpu
) ||
1476 rapl_defaults
->check_unit(rp
, cpu
)) {
1478 goto err_free_package
;
1480 ret
= rapl_package_register_powercap(rp
);
1482 INIT_LIST_HEAD(&rp
->plist
);
1483 list_add(&rp
->plist
, &rapl_packages
);
1490 return ERR_PTR(ret
);
1493 /* Handles CPU hotplug on multi-socket systems.
1494 * If a CPU goes online as the first CPU of the physical package
1495 * we add the RAPL package to the system. Similarly, when the last
1496 * CPU of the package is removed, we remove the RAPL package and its
1497 * associated domains. Cooling devices are handled accordingly at
1500 static int rapl_cpu_online(unsigned int cpu
)
1502 int pkgid
= topology_physical_package_id(cpu
);
1503 struct rapl_package
*rp
;
1505 rp
= find_package_by_id(pkgid
);
1507 rp
= rapl_add_package(cpu
, pkgid
);
1511 cpumask_set_cpu(cpu
, &rp
->cpumask
);
1515 static int rapl_cpu_down_prep(unsigned int cpu
)
1517 int pkgid
= topology_physical_package_id(cpu
);
1518 struct rapl_package
*rp
;
1521 rp
= find_package_by_id(pkgid
);
1525 cpumask_clear_cpu(cpu
, &rp
->cpumask
);
1526 lead_cpu
= cpumask_first(&rp
->cpumask
);
1527 if (lead_cpu
>= nr_cpu_ids
)
1528 rapl_remove_package(rp
);
1529 else if (rp
->lead_cpu
== cpu
)
1530 rp
->lead_cpu
= lead_cpu
;
1534 static enum cpuhp_state pcap_rapl_online
;
1536 static int __init
rapl_init(void)
1538 const struct x86_cpu_id
*id
;
1541 id
= x86_match_cpu(rapl_ids
);
1543 pr_err("driver does not support CPU family %d model %d\n",
1544 boot_cpu_data
.x86
, boot_cpu_data
.x86_model
);
1549 rapl_defaults
= (struct rapl_defaults
*)id
->driver_data
;
1551 ret
= rapl_register_powercap();
1555 ret
= cpuhp_setup_state(CPUHP_AP_ONLINE_DYN
, "powercap/rapl:online",
1556 rapl_cpu_online
, rapl_cpu_down_prep
);
1559 pcap_rapl_online
= ret
;
1561 /* Don't bail out if PSys is not supported */
1562 rapl_register_psys();
1566 rapl_unregister_powercap();
1570 static void __exit
rapl_exit(void)
1572 cpuhp_remove_state(pcap_rapl_online
);
1573 rapl_unregister_powercap();
1576 module_init(rapl_init
);
1577 module_exit(rapl_exit
);
1579 MODULE_DESCRIPTION("Driver for Intel RAPL (Running Average Power Limit)");
1580 MODULE_AUTHOR("Jacob Pan <jacob.jun.pan@intel.com>");
1581 MODULE_LICENSE("GPL v2");