// SPDX-License-Identifier: GPL-2.0-only
/*
 * Kernel-based Virtual Machine -- Performance Monitoring Unit support
 *
 * Copyright 2015 Red Hat, Inc. and/or its affiliates.
 *
 * Authors:
 *   Avi Kivity   <avi@redhat.com>
 *   Gleb Natapov <gleb@redhat.com>
 *   Wei Huang    <wei@redhat.com>
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/types.h>
#include <linux/kvm_host.h>
#include <linux/perf_event.h>
#include <linux/bsearch.h>
#include <linux/sort.h>
#include <asm/perf_event.h>
#include <asm/cpu_device_id.h>

#include "x86.h"
#include "cpuid.h"
#include "lapic.h"
#include "pmu.h"

/* This is enough to filter the vast majority of currently defined events. */
#define KVM_PMU_EVENT_FILTER_MAX_EVENTS 300

struct x86_pmu_capability __read_mostly kvm_pmu_cap;
EXPORT_SYMBOL_GPL(kvm_pmu_cap);

struct kvm_pmu_emulated_event_selectors __read_mostly kvm_pmu_eventsel;
EXPORT_SYMBOL_GPL(kvm_pmu_eventsel);

/* Precise Distribution of Instructions Retired (PDIR) */
static const struct x86_cpu_id vmx_pebs_pdir_cpu[] = {
	X86_MATCH_VFM(INTEL_ICELAKE_D, NULL),
	X86_MATCH_VFM(INTEL_ICELAKE_X, NULL),
	/* Instruction-Accurate PDIR (PDIR++) */
	X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, NULL),
	{}
};

/* Precise Distribution (PDist) */
static const struct x86_cpu_id vmx_pebs_pdist_cpu[] = {
	X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, NULL),
	{}
};

/* NOTE:
 * - Each perf counter is defined as "struct kvm_pmc";
 * - There are two types of perf counters: general purpose (gp) and fixed.
 *   gp counters are stored in gp_counters[] and fixed counters are stored
 *   in fixed_counters[] respectively.  Both of them are part of "struct
 *   kvm_pmu";
 * - pmu.c understands the difference between gp counters and fixed counters.
 *   However, AMD doesn't support fixed counters;
 * - There are three types of index to access perf counters (PMC):
 *   1. MSR (named msr): For example Intel has MSR_IA32_PERFCTRn and AMD
 *      has MSR_K7_PERFCTRn and, for families 15H and later,
 *      MSR_F15H_PERF_CTRn, where MSR_F15H_PERF_CTR[0-3] are
 *      aliased to MSR_K7_PERFCTRn.
 *   2. MSR Index (named idx): This is normally used by the RDPMC instruction.
 *      For instance, AMD's RDPMC instruction uses 0000_0003h in ECX to access
 *      C001_0007h (MSR_K7_PERFCTR3).  Intel has a similar mechanism, except
 *      that it also supports fixed counters.  idx can be used as an index
 *      into the gp and fixed counters.
 *   3. Global PMC Index (named pmc): pmc is an index specific to PMU
 *      code.  Each pmc, stored in the kvm_pmc.idx field, is unique across
 *      all perf counters (both gp and fixed).  The mapping relationship
 *      between pmc and perf counters is as follows:
 *      * Intel: [0 .. KVM_MAX_NR_INTEL_GP_COUNTERS-1] <=> gp counters
 *               [KVM_FIXED_PMC_BASE_IDX .. KVM_FIXED_PMC_BASE_IDX + 2] <=> fixed
 *      * AMD:   [0 .. AMD64_NUM_COUNTERS-1] and, for families 15H
 *               and later, [0 .. AMD64_NUM_COUNTERS_CORE-1] <=> gp counters
 */

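/*
 * Example of the global PMC index layout described above (current KVM/Intel
 * layout): gp counter 0 has pmc->idx == 0, while the first fixed counter has
 * pmc->idx == KVM_FIXED_PMC_BASE_IDX (32 as of this writing), which is why
 * pmc_get_pebs_precise_level() below special-cases idx 0 and idx 32.
 */
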
static struct kvm_pmu_ops kvm_pmu_ops __read_mostly;

#define KVM_X86_PMU_OP(func)						     \
	DEFINE_STATIC_CALL_NULL(kvm_x86_pmu_##func,			     \
				*(((struct kvm_pmu_ops *)0)->func));
#define KVM_X86_PMU_OP_OPTIONAL KVM_X86_PMU_OP
#include <asm/kvm-x86-pmu-ops.h>

void kvm_pmu_ops_update(const struct kvm_pmu_ops *pmu_ops)
{
	memcpy(&kvm_pmu_ops, pmu_ops, sizeof(kvm_pmu_ops));

#define __KVM_X86_PMU_OP(func) \
	static_call_update(kvm_x86_pmu_##func, kvm_pmu_ops.func);
#define KVM_X86_PMU_OP(func) \
	WARN_ON(!kvm_pmu_ops.func); __KVM_X86_PMU_OP(func)
#define KVM_X86_PMU_OP_OPTIONAL __KVM_X86_PMU_OP
#include <asm/kvm-x86-pmu-ops.h>
#undef __KVM_X86_PMU_OP
}

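/*
 * Record an overflow for @pmc in the guest's global status (or the PEBS
 * buffer-overflow bit for PEBS-enabled counters) and request PMI injection
 * for the owning vCPU unless the PMI should be suppressed.
 */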
static inline void __kvm_perf_overflow(struct kvm_pmc *pmc, bool in_pmi)
{
	struct kvm_pmu *pmu = pmc_to_pmu(pmc);
	bool skip_pmi = false;

	if (pmc->perf_event && pmc->perf_event->attr.precise_ip) {
		if (!in_pmi) {
			/*
			 * TODO: KVM is currently _choosing_ to not generate records
			 * for emulated instructions, avoiding BUFFER_OVF PMI when
			 * there are no records.  Strictly speaking, it should be done
			 * as well in the right context to improve sampling accuracy.
			 */
			skip_pmi = true;
		} else {
			/* Indicate PEBS overflow PMI to guest. */
			skip_pmi = __test_and_set_bit(GLOBAL_STATUS_BUFFER_OVF_BIT,
						      (unsigned long *)&pmu->global_status);
		}
	} else {
		__set_bit(pmc->idx, (unsigned long *)&pmu->global_status);
	}

	if (pmc->intr && !skip_pmi)
		kvm_make_request(KVM_REQ_PMI, pmc->vcpu);
}

static void kvm_perf_overflow(struct perf_event *perf_event,
			      struct perf_sample_data *data,
			      struct pt_regs *regs)
{
	struct kvm_pmc *pmc = perf_event->overflow_handler_context;

	/*
	 * Ignore asynchronous overflow events for counters that are scheduled
	 * to be reprogrammed, e.g. if a PMI for the previous event races with
	 * KVM's handling of a related guest WRMSR.
	 */
	if (test_and_set_bit(pmc->idx, pmc_to_pmu(pmc)->reprogram_pmi))
		return;

	__kvm_perf_overflow(pmc, true);

	kvm_make_request(KVM_REQ_PMU, pmc->vcpu);
}

static u64 pmc_get_pebs_precise_level(struct kvm_pmc *pmc)
{
	/*
	 * For some model specific pebs counters with special capabilities
	 * (PDIR, PDIR++, PDIST), KVM needs to raise the event precise
	 * level to the maximum value (currently 3, backwards compatible)
	 * so that the perf subsystem would assign specific hardware counter
	 * with that capability for vPMC.
	 */
	if ((pmc->idx == 0 && x86_match_cpu(vmx_pebs_pdist_cpu)) ||
	    (pmc->idx == 32 && x86_match_cpu(vmx_pebs_pdir_cpu)))
		return 3;

	/*
	 * The non-zero precision level of the guest event makes the ordinary
	 * guest event become a guest PEBS event and triggers the host
	 * PEBS PMI handler to determine whether the PEBS overflow PMI
	 * comes from the host counters or the guest.
	 */
	return 1;
}

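/*
 * Translate the guest's current counter value into the perf sample period
 * needed for the backing event to overflow exactly when the guest counter
 * would, i.e. after (pmc_bitmask(pmc) + 1 - counter_value) increments.
 */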
static u64 get_sample_period(struct kvm_pmc *pmc, u64 counter_value)
{
	u64 sample_period = (-counter_value) & pmc_bitmask(pmc);

	if (!sample_period)
		sample_period = pmc_bitmask(pmc) + 1;
	return sample_period;
}

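/*
 * Create (or recreate) the host perf_event that backs the vPMC, using the
 * requested type/config, CPL exclusions, and, for PEBS counters, a non-zero
 * precision level.
 */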
static int pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, u64 config,
				 bool exclude_user, bool exclude_kernel,
				 bool intr)
{
	struct kvm_pmu *pmu = pmc_to_pmu(pmc);
	struct perf_event *event;
	struct perf_event_attr attr = {
		.type = type,
		.size = sizeof(attr),
		.pinned = true,
		.exclude_idle = true,
		.exclude_host = 1,
		.exclude_user = exclude_user,
		.exclude_kernel = exclude_kernel,
		.config = config,
	};
	bool pebs = test_bit(pmc->idx, (unsigned long *)&pmu->pebs_enable);

	attr.sample_period = get_sample_period(pmc, pmc->counter);

	if ((attr.config & HSW_IN_TX_CHECKPOINTED) &&
	    (boot_cpu_has(X86_FEATURE_RTM) || boot_cpu_has(X86_FEATURE_HLE))) {
		/*
		 * HSW_IN_TX_CHECKPOINTED is not supported with nonzero
		 * period.  Just clear the sample period so at least
		 * allocating the counter doesn't fail.
		 */
		attr.sample_period = 0;
	}
	if (pebs) {
		/*
		 * For most PEBS hardware events, the difference in the software
		 * precision levels of guest and host PEBS events will not affect
		 * the accuracy of the PEBS profiling result, because the "event IP"
		 * in the PEBS record is calibrated on the guest side.
		 */
		attr.precise_ip = pmc_get_pebs_precise_level(pmc);
	}

	event = perf_event_create_kernel_counter(&attr, -1, current,
						 kvm_perf_overflow, pmc);
	if (IS_ERR(event)) {
		pr_debug_ratelimited("kvm_pmu: event creation failed %ld for pmc->idx = %d\n",
				     PTR_ERR(event), pmc->idx);
		return PTR_ERR(event);
	}

	pmc->perf_event = event;
	pmc_to_pmu(pmc)->event_count++;
	pmc->is_paused = false;
	pmc->intr = intr || pebs;
	return 0;
}

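/*
 * Pause the backing perf_event and fold its count, plus any emulated counts,
 * into pmc->counter.  Returns true if folding in the emulated counts caused
 * the counter to wrap, i.e. if KVM needs to emulate an overflow.
 */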
static bool pmc_pause_counter(struct kvm_pmc *pmc)
{
	u64 counter = pmc->counter;
	u64 prev_counter;

	/* update counter, reset event value to avoid redundant accumulation */
	if (pmc->perf_event && !pmc->is_paused)
		counter += perf_event_pause(pmc->perf_event, true);

	/*
	 * Snapshot the previous counter *after* accumulating state from perf.
	 * If overflow already happened, hardware (via perf) is responsible for
	 * generating a PMI.  KVM just needs to detect overflow on emulated
	 * counter events that haven't yet been processed.
	 */
	prev_counter = counter & pmc_bitmask(pmc);

	counter += pmc->emulated_counter;
	pmc->counter = counter & pmc_bitmask(pmc);

	pmc->emulated_counter = 0;
	pmc->is_paused = true;

	return pmc->counter < prev_counter;
}

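/*
 * Try to re-enable the existing perf_event after a pause; returns false if
 * the sample period can't be updated or if the counter's PEBS configuration
 * no longer matches the event, in which case the caller must recreate the
 * event.
 */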
static bool pmc_resume_counter(struct kvm_pmc *pmc)
{
	if (!pmc->perf_event)
		return false;

	/* recalibrate sample period and check if it's accepted by perf core */
	if (is_sampling_event(pmc->perf_event) &&
	    perf_event_period(pmc->perf_event,
			      get_sample_period(pmc, pmc->counter)))
		return false;

	if (test_bit(pmc->idx, (unsigned long *)&pmc_to_pmu(pmc)->pebs_enable) !=
	    (!!pmc->perf_event->attr.precise_ip))
		return false;

	/* reuse perf_event to serve as pmc_reprogram_counter() does */
	perf_event_enable(pmc->perf_event);
	pmc->is_paused = false;

	return true;
}

static void pmc_release_perf_event(struct kvm_pmc *pmc)
{
	if (pmc->perf_event) {
		perf_event_release_kernel(pmc->perf_event);
		pmc->perf_event = NULL;
		pmc->current_config = 0;
		pmc_to_pmu(pmc)->event_count--;
	}
}

static void pmc_stop_counter(struct kvm_pmc *pmc)
{
	if (pmc->perf_event) {
		pmc->counter = pmc_read_counter(pmc);
		pmc_release_perf_event(pmc);
	}
}

static void pmc_update_sample_period(struct kvm_pmc *pmc)
{
	if (!pmc->perf_event || pmc->is_paused ||
	    !is_sampling_event(pmc->perf_event))
		return;

	perf_event_period(pmc->perf_event,
			  get_sample_period(pmc, pmc->counter));
}

void pmc_write_counter(struct kvm_pmc *pmc, u64 val)
{
	/*
	 * Drop any unconsumed accumulated counts, the WRMSR is a write, not a
	 * read-modify-write.  Adjust the counter value so that its value is
	 * relative to the current count, as reading the current count from
	 * perf is faster than pausing and reprogramming the event in order to
	 * reset it to '0'.  Note, this very sneakily offsets the accumulated
	 * emulated count too, by using pmc_read_counter()!
	 */
	pmc->emulated_counter = 0;
	pmc->counter += val - pmc_read_counter(pmc);
	pmc->counter &= pmc_bitmask(pmc);
	pmc_update_sample_period(pmc);
}
EXPORT_SYMBOL_GPL(pmc_write_counter);

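/*
 * Compare two filter entries under @mask.  The same helper backs both sorting
 * (which keys on the event select and the EXCLUDE bit) and searching within
 * the includes/excludes sub-lists (which keys on the event select only).
 */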
static int filter_cmp(const void *pa, const void *pb, u64 mask)
{
	u64 a = *(u64 *)pa & mask;
	u64 b = *(u64 *)pb & mask;

	return (a > b) - (a < b);
}

static int filter_sort_cmp(const void *pa, const void *pb)
{
	return filter_cmp(pa, pb, (KVM_PMU_MASKED_ENTRY_EVENT_SELECT |
				   KVM_PMU_MASKED_ENTRY_EXCLUDE));
}

/*
 * For the event filter, searching is done on the 'includes' list and
 * 'excludes' list separately rather than on the 'events' list (which
 * has both).  As a result the exclude bit can be ignored.
 */
static int filter_event_cmp(const void *pa, const void *pb)
{
	return filter_cmp(pa, pb, (KVM_PMU_MASKED_ENTRY_EVENT_SELECT));
}

static int find_filter_index(u64 *events, u64 nevents, u64 key)
{
	u64 *fe = bsearch(&key, events, nevents, sizeof(events[0]),
			  filter_event_cmp);

	if (!fe)
		return -1;

	return fe - events;
}

static bool is_filter_entry_match(u64 filter_event, u64 umask)
{
	u64 mask = filter_event >> (KVM_PMU_MASKED_ENTRY_UMASK_MASK_SHIFT - 8);
	u64 match = filter_event & KVM_PMU_MASKED_ENTRY_UMASK_MATCH;

	BUILD_BUG_ON((KVM_PMU_ENCODE_MASKED_ENTRY(0, 0xff, 0, false) >>
		     (KVM_PMU_MASKED_ENTRY_UMASK_MASK_SHIFT - 8)) !=
		     ARCH_PERFMON_EVENTSEL_UMASK);

	return (umask & mask) == match;
}

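/*
 * Return true if any entry in the sorted (sub-)list has the same event select
 * as @eventsel and a umask mask/match pair that accepts the guest's unit mask.
 */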
static bool filter_contains_match(u64 *events, u64 nevents, u64 eventsel)
{
	u64 event_select = eventsel & kvm_pmu_ops.EVENTSEL_EVENT;
	u64 umask = eventsel & ARCH_PERFMON_EVENTSEL_UMASK;
	int i, index;

	index = find_filter_index(events, nevents, event_select);
	if (index < 0)
		return false;

	/*
	 * Entries are sorted by the event select.  Walk the list in both
	 * directions to process all entries with the targeted event select.
	 */
	for (i = index; i < nevents; i++) {
		if (filter_event_cmp(&events[i], &event_select))
			break;

		if (is_filter_entry_match(events[i], umask))
			return true;
	}

	for (i = index - 1; i >= 0; i--) {
		if (filter_event_cmp(&events[i], &event_select))
			break;

		if (is_filter_entry_match(events[i], umask))
			return true;
	}

	return false;
}

static bool is_gp_event_allowed(struct kvm_x86_pmu_event_filter *f,
				u64 eventsel)
{
	if (filter_contains_match(f->includes, f->nr_includes, eventsel) &&
	    !filter_contains_match(f->excludes, f->nr_excludes, eventsel))
		return f->action == KVM_PMU_EVENT_ALLOW;

	return f->action == KVM_PMU_EVENT_DENY;
}

static bool is_fixed_event_allowed(struct kvm_x86_pmu_event_filter *filter,
				   int idx)
{
	int fixed_idx = idx - KVM_FIXED_PMC_BASE_IDX;

	if (filter->action == KVM_PMU_EVENT_DENY &&
	    test_bit(fixed_idx, (ulong *)&filter->fixed_counter_bitmap))
		return false;
	if (filter->action == KVM_PMU_EVENT_ALLOW &&
	    !test_bit(fixed_idx, (ulong *)&filter->fixed_counter_bitmap))
		return false;

	return true;
}

static bool check_pmu_event_filter(struct kvm_pmc *pmc)
{
	struct kvm_x86_pmu_event_filter *filter;
	struct kvm *kvm = pmc->vcpu->kvm;

	filter = srcu_dereference(kvm->arch.pmu_event_filter, &kvm->srcu);
	if (!filter)
		return true;

	if (pmc_is_gp(pmc))
		return is_gp_event_allowed(filter, pmc->eventsel);

	return is_fixed_event_allowed(filter, pmc->idx);
}

static bool pmc_event_is_allowed(struct kvm_pmc *pmc)
{
	return pmc_is_globally_enabled(pmc) && pmc_speculative_in_use(pmc) &&
	       check_pmu_event_filter(pmc);
}

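/*
 * (Re)program the host perf_event backing the vPMC: pause and fold counts,
 * emulate any pending overflow, apply the event filter, translate fixed
 * counter controls into eventsel bits, and either resume the existing event
 * or create a new one with the updated configuration.
 */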
static int reprogram_counter(struct kvm_pmc *pmc)
{
	struct kvm_pmu *pmu = pmc_to_pmu(pmc);
	u64 eventsel = pmc->eventsel;
	u64 new_config = eventsel;
	bool emulate_overflow;
	u8 fixed_ctr_ctrl;

	emulate_overflow = pmc_pause_counter(pmc);

	if (!pmc_event_is_allowed(pmc))
		return 0;

	if (emulate_overflow)
		__kvm_perf_overflow(pmc, false);

	if (eventsel & ARCH_PERFMON_EVENTSEL_PIN_CONTROL)
		printk_once("kvm pmu: pin control bit is ignored\n");

	if (pmc_is_fixed(pmc)) {
		fixed_ctr_ctrl = fixed_ctrl_field(pmu->fixed_ctr_ctrl,
						  pmc->idx - KVM_FIXED_PMC_BASE_IDX);
		if (fixed_ctr_ctrl & INTEL_FIXED_0_KERNEL)
			eventsel |= ARCH_PERFMON_EVENTSEL_OS;
		if (fixed_ctr_ctrl & INTEL_FIXED_0_USER)
			eventsel |= ARCH_PERFMON_EVENTSEL_USR;
		if (fixed_ctr_ctrl & INTEL_FIXED_0_ENABLE_PMI)
			eventsel |= ARCH_PERFMON_EVENTSEL_INT;
		new_config = (u64)fixed_ctr_ctrl;
	}

	if (pmc->current_config == new_config && pmc_resume_counter(pmc))
		return 0;

	pmc_release_perf_event(pmc);

	pmc->current_config = new_config;

	return pmc_reprogram_counter(pmc, PERF_TYPE_RAW,
				     (eventsel & pmu->raw_event_mask),
				     !(eventsel & ARCH_PERFMON_EVENTSEL_USR),
				     !(eventsel & ARCH_PERFMON_EVENTSEL_OS),
				     eventsel & ARCH_PERFMON_EVENTSEL_INT);
}

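/*
 * Handle a pending KVM_REQ_PMU: reprogram all counters whose bit is set in
 * reprogram_pmi, and opportunistically release perf_events for vPMCs that
 * went unused during the previous time slice.
 */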
void kvm_pmu_handle_event(struct kvm_vcpu *vcpu)
{
	DECLARE_BITMAP(bitmap, X86_PMC_IDX_MAX);
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
	struct kvm_pmc *pmc;
	int bit;

	bitmap_copy(bitmap, pmu->reprogram_pmi, X86_PMC_IDX_MAX);

	/*
	 * The reprogramming bitmap can be written asynchronously by something
	 * other than the task that holds vcpu->mutex, take care to clear only
	 * the bits that will actually be processed.
	 */
	BUILD_BUG_ON(sizeof(bitmap) != sizeof(atomic64_t));
	atomic64_andnot(*(s64 *)bitmap, &pmu->__reprogram_pmi);

	kvm_for_each_pmc(pmu, pmc, bit, bitmap) {
		/*
		 * If reprogramming fails, e.g. due to contention, re-set the
		 * reprogram bit, i.e. opportunistically try again on the
		 * next PMU refresh.  Don't make a new request as doing so can
		 * stall the guest if reprogramming repeatedly fails.
		 */
		if (reprogram_counter(pmc))
			set_bit(pmc->idx, pmu->reprogram_pmi);
	}

	/*
	 * Release unused perf_events if the corresponding guest MSRs weren't
	 * accessed during the last vCPU time slice (need_cleanup is set when
	 * the vCPU is scheduled back in).
	 */
	if (unlikely(pmu->need_cleanup))
		kvm_pmu_cleanup(vcpu);
}

int kvm_pmu_check_rdpmc_early(struct kvm_vcpu *vcpu, unsigned int idx)
{
	/*
	 * On Intel, VMX interception has priority over RDPMC exceptions that
	 * aren't already handled by the emulator, i.e. there are no additional
	 * checks needed for Intel PMUs.
	 *
	 * On AMD, _all_ exceptions on RDPMC have priority over SVM intercepts,
	 * i.e. an invalid PMC results in a #GP, not #VMEXIT.
	 */
	if (!kvm_pmu_ops.check_rdpmc_early)
		return 0;

	return kvm_pmu_call(check_rdpmc_early)(vcpu, idx);
}

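/*
 * The VMware backdoor exposes pseudo-PMCs for the host TSC, real time, and
 * apparent time via RDPMC; they are emulated below and bypass the normal
 * RDPMC index decoding.
 */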
bool is_vmware_backdoor_pmc(u32 pmc_idx)
{
	switch (pmc_idx) {
	case VMWARE_BACKDOOR_PMC_HOST_TSC:
	case VMWARE_BACKDOOR_PMC_REAL_TIME:
	case VMWARE_BACKDOOR_PMC_APPARENT_TIME:
		return true;
	}
	return false;
}

static int kvm_pmu_rdpmc_vmware(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
{
	u64 ctr_val;

	switch (idx) {
	case VMWARE_BACKDOOR_PMC_HOST_TSC:
		ctr_val = rdtsc();
		break;
	case VMWARE_BACKDOOR_PMC_REAL_TIME:
		ctr_val = ktime_get_boottime_ns();
		break;
	case VMWARE_BACKDOOR_PMC_APPARENT_TIME:
		ctr_val = ktime_get_boottime_ns() +
			vcpu->kvm->arch.kvmclock_offset;
		break;
	default:
		return 1;
	}

	*data = ctr_val;
	return 0;
}

int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
{
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
	struct kvm_pmc *pmc;
	u64 mask = ~0ull;

	if (!pmu->version)
		return 1;

	if (is_vmware_backdoor_pmc(idx))
		return kvm_pmu_rdpmc_vmware(vcpu, idx, data);

	pmc = kvm_pmu_call(rdpmc_ecx_to_pmc)(vcpu, idx, &mask);
	if (!pmc)
		return 1;

	if (!kvm_is_cr4_bit_set(vcpu, X86_CR4_PCE) &&
	    (kvm_x86_call(get_cpl)(vcpu) != 0) &&
	    kvm_is_cr0_bit_set(vcpu, X86_CR0_PE))
		return 1;

	*data = pmc_read_counter(pmc) & mask;
	return 0;
}

void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu)
{
	if (lapic_in_kernel(vcpu)) {
		kvm_pmu_call(deliver_pmi)(vcpu);
		kvm_apic_local_deliver(vcpu->arch.apic, APIC_LVTPC);
	}
}

bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
{
	switch (msr) {
	case MSR_CORE_PERF_GLOBAL_STATUS:
	case MSR_CORE_PERF_GLOBAL_CTRL:
	case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
		return kvm_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu));
	default:
		break;
	}

	return kvm_pmu_call(msr_idx_to_pmc)(vcpu, msr) ||
	       kvm_pmu_call(is_valid_msr)(vcpu, msr);
}

static void kvm_pmu_mark_pmc_in_use(struct kvm_vcpu *vcpu, u32 msr)
{
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
	struct kvm_pmc *pmc = kvm_pmu_call(msr_idx_to_pmc)(vcpu, msr);

	if (pmc)
		__set_bit(pmc->idx, pmu->pmc_in_use);
}

int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
	u32 msr = msr_info->index;

	switch (msr) {
	case MSR_CORE_PERF_GLOBAL_STATUS:
	case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS:
		msr_info->data = pmu->global_status;
		break;
	case MSR_AMD64_PERF_CNTR_GLOBAL_CTL:
	case MSR_CORE_PERF_GLOBAL_CTRL:
		msr_info->data = pmu->global_ctrl;
		break;
	case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR:
	case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
		msr_info->data = 0;
		break;
	default:
		return kvm_pmu_call(get_msr)(vcpu, msr_info);
	}

	return 0;
}

int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
	u32 msr = msr_info->index;
	u64 data = msr_info->data;
	u64 diff;

	/*
	 * Note, AMD ignores writes to reserved bits and read-only PMU MSRs,
	 * whereas Intel generates #GP on attempts to write reserved/RO MSRs.
	 */
	switch (msr) {
	case MSR_CORE_PERF_GLOBAL_STATUS:
		if (!msr_info->host_initiated)
			return 1; /* RO MSR */
		fallthrough;
	case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS:
		/* Per PPR, Read-only MSR. Writes are ignored. */
		if (!msr_info->host_initiated)
			break;

		if (data & pmu->global_status_rsvd)
			return 1;

		pmu->global_status = data;
		break;
	case MSR_AMD64_PERF_CNTR_GLOBAL_CTL:
		data &= ~pmu->global_ctrl_rsvd;
		fallthrough;
	case MSR_CORE_PERF_GLOBAL_CTRL:
		if (!kvm_valid_perf_global_ctrl(pmu, data))
			return 1;

		if (pmu->global_ctrl != data) {
			diff = pmu->global_ctrl ^ data;
			pmu->global_ctrl = data;
			reprogram_counters(pmu, diff);
		}
		break;
	case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
		/*
		 * GLOBAL_OVF_CTRL, a.k.a. GLOBAL STATUS_RESET, clears bits in
		 * GLOBAL_STATUS, and so the set of reserved bits is the same.
		 */
		if (data & pmu->global_status_rsvd)
			return 1;
		fallthrough;
	case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR:
		if (!msr_info->host_initiated)
			pmu->global_status &= ~data;
		break;
	default:
		kvm_pmu_mark_pmc_in_use(vcpu, msr_info->index);
		return kvm_pmu_call(set_msr)(vcpu, msr_info);
	}

	return 0;
}

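/*
 * Stop all counters, release their perf_events, and zero the per-vCPU PMU
 * state (counters, global controls and status), e.g. prior to realizing a
 * new vPMU model in kvm_pmu_refresh().
 */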
static void kvm_pmu_reset(struct kvm_vcpu *vcpu)
{
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
	struct kvm_pmc *pmc;
	int i;

	pmu->need_cleanup = false;

	bitmap_zero(pmu->reprogram_pmi, X86_PMC_IDX_MAX);

	kvm_for_each_pmc(pmu, pmc, i, pmu->all_valid_pmc_idx) {
		pmc_stop_counter(pmc);
		pmc->counter = 0;
		pmc->emulated_counter = 0;

		if (pmc_is_gp(pmc))
			pmc->eventsel = 0;
	}

	pmu->fixed_ctr_ctrl = pmu->global_ctrl = pmu->global_status = 0;

	kvm_pmu_call(reset)(vcpu);
}

/*
 * Refresh the PMU configuration for the vCPU, e.g. if userspace changes CPUID
 * and/or PERF_CAPABILITIES.
 */
void kvm_pmu_refresh(struct kvm_vcpu *vcpu)
{
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);

	if (KVM_BUG_ON(kvm_vcpu_has_run(vcpu), vcpu->kvm))
		return;

	/*
	 * Stop/release all existing counters/events before realizing the new
	 * vPMU model.
	 */
	kvm_pmu_reset(vcpu);

	pmu->version = 0;
	pmu->nr_arch_gp_counters = 0;
	pmu->nr_arch_fixed_counters = 0;
	pmu->counter_bitmask[KVM_PMC_GP] = 0;
	pmu->counter_bitmask[KVM_PMC_FIXED] = 0;
	pmu->reserved_bits = 0xffffffff00200000ull;
	pmu->raw_event_mask = X86_RAW_EVENT_MASK;
	pmu->global_ctrl_rsvd = ~0ull;
	pmu->global_status_rsvd = ~0ull;
	pmu->fixed_ctr_ctrl_rsvd = ~0ull;
	pmu->pebs_enable_rsvd = ~0ull;
	pmu->pebs_data_cfg_rsvd = ~0ull;
	bitmap_zero(pmu->all_valid_pmc_idx, X86_PMC_IDX_MAX);

	if (!vcpu->kvm->arch.enable_pmu)
		return;

	kvm_pmu_call(refresh)(vcpu);

	/*
	 * At RESET, both Intel and AMD CPUs set all enable bits for general
	 * purpose counters in IA32_PERF_GLOBAL_CTRL (so that software that
	 * was written for v1 PMUs doesn't unknowingly leave GP counters
	 * disabled in the global controls).  Emulate that behavior when
	 * refreshing the PMU so that userspace doesn't need to manually set
	 * PERF_GLOBAL_CTRL.
	 */
	if (kvm_pmu_has_perf_global_ctrl(pmu) && pmu->nr_arch_gp_counters)
		pmu->global_ctrl = GENMASK_ULL(pmu->nr_arch_gp_counters - 1, 0);
}

void kvm_pmu_init(struct kvm_vcpu *vcpu)
{
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);

	memset(pmu, 0, sizeof(*pmu));
	kvm_pmu_call(init)(vcpu);
	kvm_pmu_refresh(vcpu);
}

/* Release perf_events for vPMCs that have been unused for a full time slice. */
void kvm_pmu_cleanup(struct kvm_vcpu *vcpu)
{
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
	struct kvm_pmc *pmc = NULL;
	DECLARE_BITMAP(bitmask, X86_PMC_IDX_MAX);
	int i;

	pmu->need_cleanup = false;

	bitmap_andnot(bitmask, pmu->all_valid_pmc_idx,
		      pmu->pmc_in_use, X86_PMC_IDX_MAX);

	kvm_for_each_pmc(pmu, pmc, i, bitmask) {
		if (pmc->perf_event && !pmc_speculative_in_use(pmc))
			pmc_stop_counter(pmc);
	}

	kvm_pmu_call(cleanup)(vcpu);

	bitmap_zero(pmu->pmc_in_use, X86_PMC_IDX_MAX);
}

void kvm_pmu_destroy(struct kvm_vcpu *vcpu)
{
	kvm_pmu_reset(vcpu);
}

static void kvm_pmu_incr_counter(struct kvm_pmc *pmc)
{
	pmc->emulated_counter++;
	kvm_pmu_request_counter_reprogram(pmc);
}

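/*
 * Check whether the counter is configured to count at the vCPU's current
 * privilege level: OS/USR eventsel bits for gp counters, KERNEL/USER bits in
 * FIXED_CTR_CTRL for fixed counters.
 */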
static inline bool cpl_is_matched(struct kvm_pmc *pmc)
{
	bool select_os, select_user;
	u64 config;

	if (pmc_is_gp(pmc)) {
		config = pmc->eventsel;
		select_os = config & ARCH_PERFMON_EVENTSEL_OS;
		select_user = config & ARCH_PERFMON_EVENTSEL_USR;
	} else {
		config = fixed_ctrl_field(pmc_to_pmu(pmc)->fixed_ctr_ctrl,
					  pmc->idx - KVM_FIXED_PMC_BASE_IDX);
		select_os = config & INTEL_FIXED_0_KERNEL;
		select_user = config & INTEL_FIXED_0_USER;
	}

	/*
	 * Skip the CPL lookup, which isn't free on Intel, if the result will
	 * be the same regardless of the CPL.
	 */
	if (select_os == select_user)
		return select_os;

	return (kvm_x86_call(get_cpl)(pmc->vcpu) == 0) ? select_os : select_user;
}

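/*
 * Increment all globally-enabled, filter-allowed counters that are programmed
 * to count @eventsel at the current CPL, e.g. to account instructions or
 * branches retired for instructions that KVM emulates.
 */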
void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, u64 eventsel)
{
	DECLARE_BITMAP(bitmap, X86_PMC_IDX_MAX);
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
	struct kvm_pmc *pmc;
	int i;

	BUILD_BUG_ON(sizeof(pmu->global_ctrl) * BITS_PER_BYTE != X86_PMC_IDX_MAX);

	if (!kvm_pmu_has_perf_global_ctrl(pmu))
		bitmap_copy(bitmap, pmu->all_valid_pmc_idx, X86_PMC_IDX_MAX);
	else if (!bitmap_and(bitmap, pmu->all_valid_pmc_idx,
			     (unsigned long *)&pmu->global_ctrl, X86_PMC_IDX_MAX))
		return;

	kvm_for_each_pmc(pmu, pmc, i, bitmap) {
		/*
		 * Ignore checks for edge detect (all events currently emulated
		 * by KVM are always rising edges), pin control (unsupported
		 * by modern CPUs), and counter mask and its invert flag (KVM
		 * doesn't emulate multiple events in a single clock cycle).
		 *
		 * Note, the uppermost nibble of AMD's mask overlaps Intel's
		 * IN_TX (bit 32) and IN_TXCP (bit 33), as well as two reserved
		 * bits (bits 35:34).  Checking the "in HLE/RTM transaction"
		 * flags is correct as the vCPU can't be in a transaction if
		 * KVM is emulating an instruction.  Checking the reserved bits
		 * might be wrong if they are defined in the future, but so
		 * could ignoring them, so do the simple thing for now.
		 */
		if (((pmc->eventsel ^ eventsel) & AMD64_RAW_EVENT_MASK_NB) ||
		    !pmc_event_is_allowed(pmc) || !cpl_is_matched(pmc))
			continue;

		kvm_pmu_incr_counter(pmc);
	}
}
EXPORT_SYMBOL_GPL(kvm_pmu_trigger_event);

static bool is_masked_filter_valid(const struct kvm_x86_pmu_event_filter *filter)
{
	u64 mask = kvm_pmu_ops.EVENTSEL_EVENT |
		   KVM_PMU_MASKED_ENTRY_UMASK_MASK |
		   KVM_PMU_MASKED_ENTRY_UMASK_MATCH |
		   KVM_PMU_MASKED_ENTRY_EXCLUDE;
	int i;

	for (i = 0; i < filter->nevents; i++) {
		if (filter->events[i] & ~mask)
			return false;
	}

	return true;
}

static void convert_to_masked_filter(struct kvm_x86_pmu_event_filter *filter)
{
	int i, j;

	for (i = 0, j = 0; i < filter->nevents; i++) {
		/*
		 * Skip events that are impossible to match against a guest
		 * event.  When filtering, only the event select + unit mask
		 * of the guest event is used.  To maintain backwards
		 * compatibility, impossible filters can't be rejected :-(
		 */
		if (filter->events[i] & ~(kvm_pmu_ops.EVENTSEL_EVENT |
					  ARCH_PERFMON_EVENTSEL_UMASK))
			continue;
		/*
		 * Convert userspace events to a common in-kernel event so
		 * only one code path is needed to support both events.  For
		 * the in-kernel events use masked events because they are
		 * flexible enough to handle both cases.  To convert to masked
		 * events all that's needed is to add an "all ones" umask_mask,
		 * (unmasked filter events don't support EXCLUDE).
		 */
		filter->events[j++] = filter->events[i] |
				      (0xFFULL << KVM_PMU_MASKED_ENTRY_UMASK_MASK_SHIFT);
	}

	filter->nevents = j;
}

static int prepare_filter_lists(struct kvm_x86_pmu_event_filter *filter)
{
	int i;

	if (!(filter->flags & KVM_PMU_EVENT_FLAG_MASKED_EVENTS))
		convert_to_masked_filter(filter);
	else if (!is_masked_filter_valid(filter))
		return -EINVAL;

	/*
	 * Sort entries by event select and includes vs. excludes so that all
	 * entries for a given event select can be processed efficiently during
	 * filtering.  The EXCLUDE flag uses a more significant bit than the
	 * event select, and so the sorted list is also effectively split into
	 * includes and excludes sub-lists.
	 */
	sort(&filter->events, filter->nevents, sizeof(filter->events[0]),
	     filter_sort_cmp, NULL);

	i = filter->nevents;
	/* Find the first EXCLUDE event (only supported for masked events). */
	if (filter->flags & KVM_PMU_EVENT_FLAG_MASKED_EVENTS) {
		for (i = 0; i < filter->nevents; i++) {
			if (filter->events[i] & KVM_PMU_MASKED_ENTRY_EXCLUDE)
				break;
		}
	}

	filter->nr_includes = i;
	filter->nr_excludes = filter->nevents - filter->nr_includes;
	filter->includes = filter->events;
	filter->excludes = filter->events + filter->nr_includes;

	return 0;
}

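/*
 * KVM_SET_PMU_EVENT_FILTER: copy the filter from userspace, validate it,
 * convert/sort it into includes and excludes sub-lists, publish it via RCU,
 * and force all vCPUs to reprogram their counters against the new filter.
 */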
int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp)
{
	struct kvm_pmu_event_filter __user *user_filter = argp;
	struct kvm_x86_pmu_event_filter *filter;
	struct kvm_pmu_event_filter tmp;
	struct kvm_vcpu *vcpu;
	unsigned long i;
	size_t size;
	int r;

	if (copy_from_user(&tmp, user_filter, sizeof(tmp)))
		return -EFAULT;

	if (tmp.action != KVM_PMU_EVENT_ALLOW &&
	    tmp.action != KVM_PMU_EVENT_DENY)
		return -EINVAL;

	if (tmp.flags & ~KVM_PMU_EVENT_FLAGS_VALID_MASK)
		return -EINVAL;

	if (tmp.nevents > KVM_PMU_EVENT_FILTER_MAX_EVENTS)
		return -E2BIG;

	size = struct_size(filter, events, tmp.nevents);
	filter = kzalloc(size, GFP_KERNEL_ACCOUNT);
	if (!filter)
		return -ENOMEM;

	filter->action = tmp.action;
	filter->nevents = tmp.nevents;
	filter->fixed_counter_bitmap = tmp.fixed_counter_bitmap;
	filter->flags = tmp.flags;

	r = -EFAULT;
	if (copy_from_user(filter->events, user_filter->events,
			   sizeof(filter->events[0]) * filter->nevents))
		goto cleanup;

	r = prepare_filter_lists(filter);
	if (r)
		goto cleanup;

	mutex_lock(&kvm->lock);
	filter = rcu_replace_pointer(kvm->arch.pmu_event_filter, filter,
				     mutex_is_locked(&kvm->lock));
	mutex_unlock(&kvm->lock);
	synchronize_srcu_expedited(&kvm->srcu);

	BUILD_BUG_ON(sizeof(((struct kvm_pmu *)0)->reprogram_pmi) >
		     sizeof(((struct kvm_pmu *)0)->__reprogram_pmi));

	kvm_for_each_vcpu(i, vcpu, kvm)
		atomic64_set(&vcpu_to_pmu(vcpu)->__reprogram_pmi, -1ull);

	kvm_make_all_cpus_request(kvm, KVM_REQ_PMU);
);