// SPDX-License-Identifier: GPL-2.0-only
/*
 * x86 APERF/MPERF KHz calculation for
 * /sys/.../cpufreq/scaling_cur_freq
 *
 * Copyright (C) 2017 Intel Corp.
 * Author: Len Brown <len.brown@intel.com>
 */
#include <linux/cpufreq.h>
#include <linux/delay.h>
#include <linux/ktime.h>
#include <linux/math64.h>
#include <linux/percpu.h>
#include <linux/rcupdate.h>
#include <linux/sched/isolation.h>
#include <linux/sched/topology.h>
#include <linux/smp.h>
#include <linux/syscore_ops.h>

#include <asm/cpu.h>
#include <asm/cpu_device_id.h>
#include <asm/intel-family.h>

#include "cpu.h"

struct aperfmperf {
	seqcount_t	seq;
	unsigned long	last_update;
	u64		acnt;
	u64		mcnt;
	u64		aperf;
	u64		mperf;
};
static DEFINE_PER_CPU_SHARED_ALIGNED(struct aperfmperf, cpu_samples) = {
	.seq = SEQCNT_ZERO(cpu_samples.seq)
};
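/*
 * Snapshot the current APERF/MPERF counter values so that subsequent
 * per-tick delta computations have a baseline to work from.
 */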
static void init_counter_refs(void)
{
	u64 aperf, mperf;

	rdmsrl(MSR_IA32_APERF, aperf);
	rdmsrl(MSR_IA32_MPERF, mperf);

	this_cpu_write(cpu_samples.aperf, aperf);
	this_cpu_write(cpu_samples.mperf, mperf);
}
#if defined(CONFIG_X86_64) && defined(CONFIG_SMP)
/*
 * APERF/MPERF frequency ratio computation.
 *
 * The scheduler wants to do frequency invariant accounting and needs a <1
 * ratio to account for the 'current' frequency, corresponding to
 * freq_curr / freq_max.
 *
 * Since the frequency freq_curr on x86 is controlled by a microcontroller and
 * our P-state setting is little more than a request/hint, we need to observe
 * the effective frequency 'BusyMHz', i.e. the average frequency over a time
 * interval after discarding idle time. This is given by:
 *
 *            BusyMHz = delta_APERF / delta_MPERF * freq_base
 *
 * where freq_base is the max non-turbo P-state.
 *
 * The freq_max term has to be set to a somewhat arbitrary value, because we
 * can't know which turbo states will be available at a given point in time:
 * it all depends on the thermal headroom of the entire package. We set it to
 * the turbo level with 4 cores active.
 *
 * Benchmarks show that's a good compromise between the 1C turbo ratio
 * (freq_curr/freq_max would rarely reach 1) and something close to freq_base,
 * which would ignore the entire turbo range (a conspicuous part, making
 * freq_curr/freq_max always maxed out).
 *
 * An exception to the heuristic above is the Atom uarch, where we choose the
 * highest turbo level for freq_max since Atoms are generally oriented towards
 * power efficiency.
 *
 * Setting freq_max to anything less than the 1C turbo ratio causes the ratio
 * freq_curr / freq_max to eventually grow >1, in which case we clip it to 1.
 */
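/*
 * Worked example (illustrative numbers only, not from any particular CPU):
 * with freq_base = 2000 MHz and a 4C turbo of 3000 MHz, the code below sets
 * arch_turbo_freq_ratio = 3000 * 1024 / 2000 = 1536. A tick interval where
 * delta_APERF / delta_MPERF = 1.2 corresponds to BusyMHz = 2400, and
 * scale_freq_tick() computes freq_scale = 1.2 * 1024 * 1024 / 1536 ~= 819,
 * i.e. roughly 80% of freq_max, matching 2400 / 3000.
 */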
DEFINE_STATIC_KEY_FALSE(arch_scale_freq_key);

static u64 arch_turbo_freq_ratio = SCHED_CAPACITY_SCALE;
static u64 arch_max_freq_ratio = SCHED_CAPACITY_SCALE;

void arch_set_max_freq_ratio(bool turbo_disabled)
{
	arch_max_freq_ratio = turbo_disabled ? SCHED_CAPACITY_SCALE :
					       arch_turbo_freq_ratio;
}
EXPORT_SYMBOL_GPL(arch_set_max_freq_ratio);
static bool __init turbo_disabled(void)
{
	u64 misc_en;
	int err;

	err = rdmsrl_safe(MSR_IA32_MISC_ENABLE, &misc_en);
	if (err)
		return false;

	return (misc_en & MSR_IA32_MISC_ENABLE_TURBO_DISABLE);
}
static bool __init slv_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq)
{
	int err;

	err = rdmsrl_safe(MSR_ATOM_CORE_RATIOS, base_freq);
	if (err)
		return false;

	err = rdmsrl_safe(MSR_ATOM_CORE_TURBO_RATIOS, turbo_freq);
	if (err)
		return false;

	*base_freq = (*base_freq >> 16) & 0x3F;	/* max P state */
	*turbo_freq = *turbo_freq & 0x3F;	/* 1C turbo    */

	return true;
}
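/*
 * CPU models whose MSR_TURBO_RATIO_LIMIT layout differs from the common one
 * handled by core_set_max_freq_ratio() and which therefore need the
 * dedicated decoders below.
 */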
#define X86_MATCH(vfm)					\
	X86_MATCH_VFM_FEATURE(vfm, X86_FEATURE_APERFMPERF, NULL)

static const struct x86_cpu_id has_knl_turbo_ratio_limits[] __initconst = {
	X86_MATCH(INTEL_XEON_PHI_KNL),
	X86_MATCH(INTEL_XEON_PHI_KNM),
	{}
};

static const struct x86_cpu_id has_skx_turbo_ratio_limits[] __initconst = {
	X86_MATCH(INTEL_SKYLAKE_X),
	{}
};

static const struct x86_cpu_id has_glm_turbo_ratio_limits[] __initconst = {
	X86_MATCH(INTEL_ATOM_GOLDMONT),
	X86_MATCH(INTEL_ATOM_GOLDMONT_D),
	X86_MATCH(INTEL_ATOM_GOLDMONT_PLUS),
	{}
};
static bool __init knl_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq,
					  int num_delta_fratio)
{
	int fratio, delta_fratio, found;
	int err, i;
	u64 msr;

	err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq);
	if (err)
		return false;

	*base_freq = (*base_freq >> 8) & 0xFF;	/* max P state */

	err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &msr);
	if (err)
		return false;

	fratio = (msr >> 8) & 0xFF;
	i = 16;
	found = 0;
	do {
		if (found >= num_delta_fratio) {
			*turbo_freq = fratio;
			return true;
		}

		delta_fratio = (msr >> (i + 5)) & 0x7;

		if (delta_fratio) {
			found += 1;
			fratio -= delta_fratio;
		}

		i += 8;
	} while (i < 64);

	return true;
}
static bool __init skx_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq, int size)
{
	u64 ratios, counts;
	u32 group_size;
	int err, i;

	err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq);
	if (err)
		return false;

	*base_freq = (*base_freq >> 8) & 0xFF;	/* max P state */

	err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &ratios);
	if (err)
		return false;

	err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT1, &counts);
	if (err)
		return false;

	for (i = 0; i < 64; i += 8) {
		group_size = (counts >> i) & 0xFF;
		if (group_size >= size) {
			*turbo_freq = (ratios >> i) & 0xFF;
			return true;
		}
	}

	return false;
}
static bool __init core_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq)
{
	u64 msr;
	int err;

	err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq);
	if (err)
		return false;

	err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &msr);
	if (err)
		return false;

	*base_freq = (*base_freq >> 8) & 0xFF;	/* max P state */
	*turbo_freq = (msr >> 24) & 0xFF;	/* 4C turbo    */

	/* The CPU may have fewer than 4 cores */
	if (!*turbo_freq)
		*turbo_freq = msr & 0xFF;	/* 1C turbo    */

	return true;
}
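/*
 * Try the model-specific decoders first (Silvermont, Goldmont, Xeon Phi,
 * Skylake-X), then fall back to the common core layout.
 */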
static bool __init intel_set_max_freq_ratio(void)
{
	u64 base_freq, turbo_freq;
	u64 turbo_ratio;

	if (slv_set_max_freq_ratio(&base_freq, &turbo_freq))
		goto out;

	if (x86_match_cpu(has_glm_turbo_ratio_limits) &&
	    skx_set_max_freq_ratio(&base_freq, &turbo_freq, 1))
		goto out;

	if (x86_match_cpu(has_knl_turbo_ratio_limits) &&
	    knl_set_max_freq_ratio(&base_freq, &turbo_freq, 1))
		goto out;

	if (x86_match_cpu(has_skx_turbo_ratio_limits) &&
	    skx_set_max_freq_ratio(&base_freq, &turbo_freq, 4))
		goto out;

	if (core_set_max_freq_ratio(&base_freq, &turbo_freq))
		goto out;

	return false;

out:
	/*
	 * Some hypervisors advertise X86_FEATURE_APERFMPERF
	 * but then fill all MSRs with zeroes.
	 * Some CPUs have turbo boost but don't declare any turbo ratio
	 * in MSR_TURBO_RATIO_LIMIT.
	 */
	if (!base_freq || !turbo_freq) {
		pr_debug("Couldn't determine cpu base or turbo frequency, necessary for scale-invariant accounting.\n");
		return false;
	}

	turbo_ratio = div_u64(turbo_freq * SCHED_CAPACITY_SCALE, base_freq);
	if (!turbo_ratio) {
		pr_debug("Non-zero turbo and base frequencies led to a 0 ratio.\n");
		return false;
	}

	arch_turbo_freq_ratio = turbo_ratio;
	arch_set_max_freq_ratio(turbo_disabled());

	return true;
}
#ifdef CONFIG_PM_SLEEP
static struct syscore_ops freq_invariance_syscore_ops = {
	.resume = init_counter_refs,
};

static void register_freq_invariance_syscore_ops(void)
{
	register_syscore_ops(&freq_invariance_syscore_ops);
}
#else
static inline void register_freq_invariance_syscore_ops(void) {}
#endif
static void freq_invariance_enable(void)
{
	if (static_branch_unlikely(&arch_scale_freq_key)) {
		WARN_ON_ONCE(1);
		return;
	}
	static_branch_enable_cpuslocked(&arch_scale_freq_key);
	register_freq_invariance_syscore_ops();
	pr_info("Estimated ratio of average max frequency by base frequency (times 1024): %llu\n", arch_max_freq_ratio);
}
void freq_invariance_set_perf_ratio(u64 ratio, bool turbo_disabled)
{
	arch_turbo_freq_ratio = ratio;
	arch_set_max_freq_ratio(turbo_disabled);
	freq_invariance_enable();
}
static void __init bp_init_freq_invariance(void)
{
	if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
		return;

	if (intel_set_max_freq_ratio()) {
		guard(cpus_read_lock)();
		freq_invariance_enable();
	}
}
static void disable_freq_invariance_workfn(struct work_struct *work)
{
	int cpu;

	static_branch_disable(&arch_scale_freq_key);

	/*
	 * Set arch_freq_scale to a default value on all CPUs.
	 * This negates the effect of scaling.
	 */
	for_each_possible_cpu(cpu)
		per_cpu(arch_freq_scale, cpu) = SCHED_CAPACITY_SCALE;
}
static DECLARE_WORK(disable_freq_invariance_work,
		    disable_freq_invariance_workfn);

DEFINE_PER_CPU(unsigned long, arch_freq_scale) = SCHED_CAPACITY_SCALE;
EXPORT_PER_CPU_SYMBOL_GPL(arch_freq_scale);
static DEFINE_STATIC_KEY_FALSE(arch_hybrid_cap_scale_key);

struct arch_hybrid_cpu_scale {
	unsigned long capacity;
	unsigned long freq_ratio;
};

static struct arch_hybrid_cpu_scale __percpu *arch_cpu_scale;
/**
 * arch_enable_hybrid_capacity_scale() - Enable hybrid CPU capacity scaling
 *
 * Allocate memory for per-CPU data used by hybrid CPU capacity scaling,
 * initialize it and set the static key controlling its code paths.
 *
 * Must be called before arch_set_cpu_capacity().
 */
bool arch_enable_hybrid_capacity_scale(void)
{
	int cpu;

	if (static_branch_unlikely(&arch_hybrid_cap_scale_key)) {
		WARN_ONCE(1, "Hybrid CPU capacity scaling already enabled");
		return true;
	}

	arch_cpu_scale = alloc_percpu(struct arch_hybrid_cpu_scale);
	if (!arch_cpu_scale)
		return false;

	for_each_possible_cpu(cpu) {
		per_cpu_ptr(arch_cpu_scale, cpu)->capacity = SCHED_CAPACITY_SCALE;
		per_cpu_ptr(arch_cpu_scale, cpu)->freq_ratio = arch_max_freq_ratio;
	}

	static_branch_enable(&arch_hybrid_cap_scale_key);

	pr_info("Hybrid CPU capacity scaling enabled\n");

	return true;
}
/**
 * arch_set_cpu_capacity() - Set scale-invariance parameters for a CPU
 * @cpu: Target CPU.
 * @cap: Capacity of @cpu at its maximum frequency, relative to @max_cap.
 * @max_cap: System-wide maximum CPU capacity.
 * @cap_freq: Frequency of @cpu corresponding to @cap.
 * @base_freq: Frequency of @cpu at which MPERF counts.
 *
 * The units in which @cap and @max_cap are expressed do not matter, so long
 * as they are consistent, because the former is effectively divided by the
 * latter. Analogously for @cap_freq and @base_freq.
 *
 * After calling this function for all CPUs, call arch_rebuild_sched_domains()
 * to let the scheduler know that capacity-aware scheduling can be used going
 * forward.
 */
void arch_set_cpu_capacity(int cpu, unsigned long cap, unsigned long max_cap,
			   unsigned long cap_freq, unsigned long base_freq)
{
	if (static_branch_likely(&arch_hybrid_cap_scale_key)) {
		WRITE_ONCE(per_cpu_ptr(arch_cpu_scale, cpu)->capacity,
			   div_u64(cap << SCHED_CAPACITY_SHIFT, max_cap));
		WRITE_ONCE(per_cpu_ptr(arch_cpu_scale, cpu)->freq_ratio,
			   div_u64(cap_freq << SCHED_CAPACITY_SHIFT, base_freq));
	} else {
		WARN_ONCE(1, "Hybrid CPU capacity scaling not enabled");
	}
}
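/*
 * Illustrative example (made-up numbers): arch_set_cpu_capacity(cpu, 3, 4,
 * 3800, 2400) stores capacity = 3 * 1024 / 4 = 768 and
 * freq_ratio = 3800 * 1024 / 2400 ~= 1621 for that CPU.
 */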
unsigned long arch_scale_cpu_capacity(int cpu)
{
	if (static_branch_unlikely(&arch_hybrid_cap_scale_key))
		return READ_ONCE(per_cpu_ptr(arch_cpu_scale, cpu)->capacity);

	return SCHED_CAPACITY_SCALE;
}
EXPORT_SYMBOL_GPL(arch_scale_cpu_capacity);
static void scale_freq_tick(u64 acnt, u64 mcnt)
{
	u64 freq_scale, freq_ratio;

	if (!arch_scale_freq_invariant())
		return;

	if (check_shl_overflow(acnt, 2*SCHED_CAPACITY_SHIFT, &acnt))
		goto error;

	if (static_branch_unlikely(&arch_hybrid_cap_scale_key))
		freq_ratio = READ_ONCE(this_cpu_ptr(arch_cpu_scale)->freq_ratio);
	else
		freq_ratio = arch_max_freq_ratio;

	if (check_mul_overflow(mcnt, freq_ratio, &mcnt) || !mcnt)
		goto error;

	/*
	 * freq_scale = (acnt / mcnt) * SCHED_CAPACITY_SCALE^2 / freq_ratio,
	 * i.e. approximately (freq_curr / freq_max) * SCHED_CAPACITY_SCALE.
	 */
	freq_scale = div64_u64(acnt, mcnt);
	if (!freq_scale)
		goto error;

	if (freq_scale > SCHED_CAPACITY_SCALE)
		freq_scale = SCHED_CAPACITY_SCALE;

	this_cpu_write(arch_freq_scale, freq_scale);
	return;

error:
	pr_warn("Scheduler frequency invariance went wobbly, disabling!\n");
	schedule_work(&disable_freq_invariance_work);
}
#else
static inline void bp_init_freq_invariance(void) { }
static inline void scale_freq_tick(u64 acnt, u64 mcnt) { }
#endif /* CONFIG_X86_64 && CONFIG_SMP */
void arch_scale_freq_tick(void)
{
	struct aperfmperf *s = this_cpu_ptr(&cpu_samples);
	u64 acnt, mcnt, aperf, mperf;

	if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF))
		return;

	rdmsrl(MSR_IA32_APERF, aperf);
	rdmsrl(MSR_IA32_MPERF, mperf);
	acnt = aperf - s->aperf;
	mcnt = mperf - s->mperf;

	s->aperf = aperf;
	s->mperf = mperf;

	raw_write_seqcount_begin(&s->seq);
	s->last_update = jiffies;
	s->acnt = acnt;
	s->mcnt = mcnt;
	raw_write_seqcount_end(&s->seq);

	scale_freq_tick(acnt, mcnt);
}
/*
 * Discard samples older than the defined maximum sample age of 20ms. There
 * is no point in sending IPIs in such a case. If the scheduler tick was
 * not running then the CPU is either idle or isolated.
 */
#define MAX_SAMPLE_AGE	((unsigned long)HZ / 50)
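/* For example, HZ = 1000 gives 20 jiffies and HZ = 250 gives 5, both 20ms. */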
unsigned int arch_freq_get_on_cpu(int cpu)
{
	struct aperfmperf *s = per_cpu_ptr(&cpu_samples, cpu);
	unsigned int seq, freq;
	unsigned long last;
	u64 acnt, mcnt;

	if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF))
		goto fallback;

	do {
		seq = raw_read_seqcount_begin(&s->seq);
		last = s->last_update;
		acnt = s->acnt;
		mcnt = s->mcnt;
	} while (read_seqcount_retry(&s->seq, seq));

	/*
	 * Bail on invalid count and when the last update was too long ago,
	 * which covers idle and NOHZ full CPUs.
	 */
	if (!mcnt || (jiffies - last) > MAX_SAMPLE_AGE)
		goto fallback;

	return div64_u64((cpu_khz * acnt), mcnt);

fallback:
	freq = cpufreq_quick_get(cpu);
	return freq ? freq : cpu_khz;
}
static int __init bp_init_aperfmperf(void)
{
	if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF))
		return 0;

	init_counter_refs();
	bp_init_freq_invariance();
	return 0;
}
early_initcall(bp_init_aperfmperf);
void ap_init_aperfmperf(void)
{
	if (cpu_feature_enabled(X86_FEATURE_APERFMPERF))
		init_counter_refs();
}