arch/x86/kvm/pmu.c
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3 * Kernel-based Virtual Machine -- Performance Monitoring Unit support
5 * Copyright 2015 Red Hat, Inc. and/or its affiliates.
7 * Authors:
8 * Avi Kivity <avi@redhat.com>
9 * Gleb Natapov <gleb@redhat.com>
10 * Wei Huang <wei@redhat.com>
12 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
14 #include <linux/types.h>
15 #include <linux/kvm_host.h>
16 #include <linux/perf_event.h>
17 #include <linux/bsearch.h>
18 #include <linux/sort.h>
19 #include <asm/perf_event.h>
20 #include <asm/cpu_device_id.h>
21 #include "x86.h"
22 #include "cpuid.h"
23 #include "lapic.h"
24 #include "pmu.h"
26 /* This is enough to filter the vast majority of currently defined events. */
27 #define KVM_PMU_EVENT_FILTER_MAX_EVENTS 300
29 struct x86_pmu_capability __read_mostly kvm_pmu_cap;
30 EXPORT_SYMBOL_GPL(kvm_pmu_cap);
32 struct kvm_pmu_emulated_event_selectors __read_mostly kvm_pmu_eventsel;
33 EXPORT_SYMBOL_GPL(kvm_pmu_eventsel);
35 /* Precise Distribution of Instructions Retired (PDIR) */
36 static const struct x86_cpu_id vmx_pebs_pdir_cpu[] = {
37 X86_MATCH_VFM(INTEL_ICELAKE_D, NULL),
38 X86_MATCH_VFM(INTEL_ICELAKE_X, NULL),
39 /* Instruction-Accurate PDIR (PDIR++) */
40 X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, NULL),
44 /* Precise Distribution (PDist) */
45 static const struct x86_cpu_id vmx_pebs_pdist_cpu[] = {
46 X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, NULL),
50 /* NOTE:
51 * - Each perf counter is defined as "struct kvm_pmc";
52 * - There are two types of perf counters: general purpose (gp) and fixed.
53 * gp counters are stored in gp_counters[] and fixed counters are stored
54 * in fixed_counters[] respectively. Both of them are part of "struct
55 * kvm_pmu";
56 * - pmu.c understands the difference between gp counters and fixed counters.
57 * However, AMD doesn't support fixed counters;
58 * - There are three types of index to access perf counters (PMC):
59 * 1. MSR (named msr): For example Intel has MSR_IA32_PERFCTRn and AMD
60 * has MSR_K7_PERFCTRn and, for families 15H and later,
61 * MSR_F15H_PERF_CTRn, where MSR_F15H_PERF_CTR[0-3] are
62 * aliased to MSR_K7_PERFCTRn.
63 * 2. MSR Index (named idx): This is normally used by the RDPMC instruction.
64 * For instance, the AMD RDPMC instruction uses 0000_0003h in ECX to access
65 * C001_0007h (MSR_K7_PERFCTR3). Intel has a similar mechanism, except
66 * that it also supports fixed counters. idx can be used as an index into
67 * the gp and fixed counters.
68 * 3. Global PMC Index (named pmc): pmc is an index specific to PMU
69 * code. Each pmc, stored in kvm_pmc.idx field, is unique across
70 * all perf counters (both gp and fixed). The mapping relationship
71 * between pmc and perf counters is as follows (see the sketch after this comment block):
72 * * Intel: [0 .. KVM_MAX_NR_INTEL_GP_COUNTERS-1] <=> gp counters
73 * [KVM_FIXED_PMC_BASE_IDX .. KVM_FIXED_PMC_BASE_IDX + 2] <=> fixed
74 * * AMD: [0 .. AMD64_NUM_COUNTERS-1] and, for families 15H
75 * and later, [0 .. AMD64_NUM_COUNTERS_CORE-1] <=> gp counters
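
/*
 * Editor's sketch, not part of pmu.c: a minimal, self-contained illustration of
 * the global pmc index scheme described above. KVM_FIXED_PMC_BASE_IDX is assumed
 * to be 32 here, consistent with the "pmc->idx == 32" check in
 * pmc_get_pebs_precise_level() below; the helper name is invented for the sketch.
 */
#include <assert.h>
#include <stdbool.h>

#define SKETCH_FIXED_PMC_BASE_IDX 32	/* assumed value of KVM_FIXED_PMC_BASE_IDX */

/* Map (counter type, per-type index) to the global index stored in kvm_pmc.idx. */
static int sketch_global_pmc_idx(bool is_fixed, int i)
{
	return is_fixed ? SKETCH_FIXED_PMC_BASE_IDX + i : i;
}

int main(void)
{
	assert(sketch_global_pmc_idx(false, 3) == 3);	/* gp counter 3    */
	assert(sketch_global_pmc_idx(true, 0) == 32);	/* fixed counter 0 */
	assert(sketch_global_pmc_idx(true, 2) == 34);	/* fixed counter 2 */
	return 0;
}
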
78 static struct kvm_pmu_ops kvm_pmu_ops __read_mostly;
80 #define KVM_X86_PMU_OP(func) \
81 DEFINE_STATIC_CALL_NULL(kvm_x86_pmu_##func, \
82 *(((struct kvm_pmu_ops *)0)->func));
83 #define KVM_X86_PMU_OP_OPTIONAL KVM_X86_PMU_OP
84 #include <asm/kvm-x86-pmu-ops.h>
86 void kvm_pmu_ops_update(const struct kvm_pmu_ops *pmu_ops)
88 memcpy(&kvm_pmu_ops, pmu_ops, sizeof(kvm_pmu_ops));
90 #define __KVM_X86_PMU_OP(func) \
91 static_call_update(kvm_x86_pmu_##func, kvm_pmu_ops.func);
92 #define KVM_X86_PMU_OP(func) \
93 WARN_ON(!kvm_pmu_ops.func); __KVM_X86_PMU_OP(func)
94 #define KVM_X86_PMU_OP_OPTIONAL __KVM_X86_PMU_OP
95 #include <asm/kvm-x86-pmu-ops.h>
96 #undef __KVM_X86_PMU_OP
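
/*
 * Editor's sketch, not part of pmu.c: a userspace analog of the ops wiring above.
 * The "*(((struct kvm_pmu_ops *)0)->func)" expression is how the macros name each
 * callback's type; here GNU C typeof extracts the same type for a plain function
 * pointer instead of a static call. All names below are invented for illustration.
 */
#include <stdio.h>

struct sketch_pmu_ops {
	void (*refresh)(int vcpu_id);
};

/* Derive the callback type from the struct member, as the macros above do. */
static typeof(*(((struct sketch_pmu_ops *)0)->refresh)) *sketch_refresh_call;

static void intel_like_refresh(int vcpu_id)
{
	printf("refresh vcpu %d\n", vcpu_id);
}

static void sketch_ops_update(const struct sketch_pmu_ops *ops)
{
	sketch_refresh_call = ops->refresh;	/* analog of static_call_update() */
}

int main(void)
{
	static const struct sketch_pmu_ops ops = { .refresh = intel_like_refresh };

	sketch_ops_update(&ops);
	sketch_refresh_call(0);
	return 0;
}
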
99 static inline void __kvm_perf_overflow(struct kvm_pmc *pmc, bool in_pmi)
101 struct kvm_pmu *pmu = pmc_to_pmu(pmc);
102 bool skip_pmi = false;
104 if (pmc->perf_event && pmc->perf_event->attr.precise_ip) {
105 if (!in_pmi) {
107 * TODO: KVM is currently _choosing_ not to generate records
108 * for emulated instructions, avoiding a BUFFER_OVF PMI when
109 * there are no records. Strictly speaking, they should also be
110 * generated in the right context to improve sampling accuracy.
112 skip_pmi = true;
113 } else {
114 /* Indicate PEBS overflow PMI to guest. */
115 skip_pmi = __test_and_set_bit(GLOBAL_STATUS_BUFFER_OVF_BIT,
116 (unsigned long *)&pmu->global_status);
118 } else {
119 __set_bit(pmc->idx, (unsigned long *)&pmu->global_status);
122 if (pmc->intr && !skip_pmi)
123 kvm_make_request(KVM_REQ_PMI, pmc->vcpu);
126 static void kvm_perf_overflow(struct perf_event *perf_event,
127 struct perf_sample_data *data,
128 struct pt_regs *regs)
130 struct kvm_pmc *pmc = perf_event->overflow_handler_context;
133 * Ignore asynchronous overflow events for counters that are scheduled
134 * to be reprogrammed, e.g. if a PMI for the previous event races with
135 * KVM's handling of a related guest WRMSR.
137 if (test_and_set_bit(pmc->idx, pmc_to_pmu(pmc)->reprogram_pmi))
138 return;
140 __kvm_perf_overflow(pmc, true);
142 kvm_make_request(KVM_REQ_PMU, pmc->vcpu);
145 static u64 pmc_get_pebs_precise_level(struct kvm_pmc *pmc)
148 * For some model-specific PEBS counters with special capabilities
149 * (PDIR, PDIR++, PDist), KVM needs to raise the event's precise
150 * level to the maximum value (currently 3, which is backwards
151 * compatible) so that the perf subsystem assigns a hardware counter
152 * with that capability to the vPMC.
154 if ((pmc->idx == 0 && x86_match_cpu(vmx_pebs_pdist_cpu)) ||
155 (pmc->idx == 32 && x86_match_cpu(vmx_pebs_pdir_cpu)))
156 return 3;
159 * A non-zero precision level turns an ordinary guest event into
160 * a guest PEBS event and triggers the host PEBS PMI handler to
161 * determine whether the PEBS overflow PMI comes from the host
162 * counters or from the guest.
164 return 1;
167 static u64 get_sample_period(struct kvm_pmc *pmc, u64 counter_value)
169 u64 sample_period = (-counter_value) & pmc_bitmask(pmc);
171 if (!sample_period)
172 sample_period = pmc_bitmask(pmc) + 1;
173 return sample_period;
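
/*
 * Editor's sketch, not part of pmu.c: the sample-period arithmetic above, worked
 * through for a hypothetical 48-bit counter, with pmc_bitmask() modeled as an
 * explicit parameter.
 */
#include <assert.h>
#include <stdint.h>

/* Period until the counter next overflows: two's complement of the current
 * value, truncated to the counter width; a zero result means "full range". */
static uint64_t sketch_sample_period(uint64_t counter, uint64_t bitmask)
{
	uint64_t period = (-counter) & bitmask;

	return period ? period : bitmask + 1;
}

int main(void)
{
	const uint64_t mask48 = (1ULL << 48) - 1;

	assert(sketch_sample_period(0xFFFFFFFFFFF0ULL, mask48) == 0x10);	/* 16 events to overflow */
	assert(sketch_sample_period(0, mask48) == (1ULL << 48));		/* empty counter: full range */
	return 0;
}
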
176 static int pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, u64 config,
177 bool exclude_user, bool exclude_kernel,
178 bool intr)
180 struct kvm_pmu *pmu = pmc_to_pmu(pmc);
181 struct perf_event *event;
182 struct perf_event_attr attr = {
183 .type = type,
184 .size = sizeof(attr),
185 .pinned = true,
186 .exclude_idle = true,
187 .exclude_host = 1,
188 .exclude_user = exclude_user,
189 .exclude_kernel = exclude_kernel,
190 .config = config,
192 bool pebs = test_bit(pmc->idx, (unsigned long *)&pmu->pebs_enable);
194 attr.sample_period = get_sample_period(pmc, pmc->counter);
196 if ((attr.config & HSW_IN_TX_CHECKPOINTED) &&
197 (boot_cpu_has(X86_FEATURE_RTM) || boot_cpu_has(X86_FEATURE_HLE))) {
199 * HSW_IN_TX_CHECKPOINTED is not supported with a nonzero
200 * period. Just clear the sample period so that at least
201 * allocating the counter doesn't fail.
203 attr.sample_period = 0;
205 if (pebs) {
207 * For most PEBS hardware events, the difference in the software
208 * precision levels of guest and host PEBS events will not affect
209 * the accuracy of the PEBS profiling result, because the "event IP"
210 * in the PEBS record is calibrated on the guest side.
212 attr.precise_ip = pmc_get_pebs_precise_level(pmc);
215 event = perf_event_create_kernel_counter(&attr, -1, current,
216 kvm_perf_overflow, pmc);
217 if (IS_ERR(event)) {
218 pr_debug_ratelimited("kvm_pmu: event creation failed %ld for pmc->idx = %d\n",
219 PTR_ERR(event), pmc->idx);
220 return PTR_ERR(event);
223 pmc->perf_event = event;
224 pmc_to_pmu(pmc)->event_count++;
225 pmc->is_paused = false;
226 pmc->intr = intr || pebs;
227 return 0;
230 static bool pmc_pause_counter(struct kvm_pmc *pmc)
232 u64 counter = pmc->counter;
233 u64 prev_counter;
235 /* update counter, reset event value to avoid redundant accumulation */
236 if (pmc->perf_event && !pmc->is_paused)
237 counter += perf_event_pause(pmc->perf_event, true);
240 * Snapshot the previous counter *after* accumulating state from perf.
241 * If overflow already happened, hardware (via perf) is responsible for
242 * generating a PMI. KVM just needs to detect overflow on emulated
243 * counter events that haven't yet been processed.
245 prev_counter = counter & pmc_bitmask(pmc);
247 counter += pmc->emulated_counter;
248 pmc->counter = counter & pmc_bitmask(pmc);
250 pmc->emulated_counter = 0;
251 pmc->is_paused = true;
253 return pmc->counter < prev_counter;
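
/*
 * Editor's sketch, not part of pmu.c: the overflow test returned above
 * ("pmc->counter < prev_counter"), with the perf-accumulated count and the
 * emulated count modeled as plain parameters. Overflow caused by emulated events
 * shows up as the truncated sum being smaller than the pre-accumulation snapshot.
 */
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

static bool sketch_emulated_overflow(uint64_t hw_count, uint64_t emulated,
				     uint64_t bitmask)
{
	uint64_t prev = hw_count & bitmask;		/* snapshot before emulated events */
	uint64_t now  = (hw_count + emulated) & bitmask;

	return now < prev;				/* wrapped => emulate a PMI */
}

int main(void)
{
	const uint64_t mask48 = (1ULL << 48) - 1;

	assert(!sketch_emulated_overflow(100, 5, mask48));		/* no wrap */
	assert(sketch_emulated_overflow(mask48 - 2, 5, mask48));	/* wraps past 2^48 - 1 */
	return 0;
}
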
256 static bool pmc_resume_counter(struct kvm_pmc *pmc)
258 if (!pmc->perf_event)
259 return false;
261 /* recalibrate sample period and check if it's accepted by perf core */
262 if (is_sampling_event(pmc->perf_event) &&
263 perf_event_period(pmc->perf_event,
264 get_sample_period(pmc, pmc->counter)))
265 return false;
267 if (test_bit(pmc->idx, (unsigned long *)&pmc_to_pmu(pmc)->pebs_enable) !=
268 (!!pmc->perf_event->attr.precise_ip))
269 return false;
271 /* Reuse the perf_event, serving the same role as one created by pmc_reprogram_counter(). */
272 perf_event_enable(pmc->perf_event);
273 pmc->is_paused = false;
275 return true;
278 static void pmc_release_perf_event(struct kvm_pmc *pmc)
280 if (pmc->perf_event) {
281 perf_event_release_kernel(pmc->perf_event);
282 pmc->perf_event = NULL;
283 pmc->current_config = 0;
284 pmc_to_pmu(pmc)->event_count--;
288 static void pmc_stop_counter(struct kvm_pmc *pmc)
290 if (pmc->perf_event) {
291 pmc->counter = pmc_read_counter(pmc);
292 pmc_release_perf_event(pmc);
296 static void pmc_update_sample_period(struct kvm_pmc *pmc)
298 if (!pmc->perf_event || pmc->is_paused ||
299 !is_sampling_event(pmc->perf_event))
300 return;
302 perf_event_period(pmc->perf_event,
303 get_sample_period(pmc, pmc->counter));
306 void pmc_write_counter(struct kvm_pmc *pmc, u64 val)
309 * Drop any unconsumed accumulated counts, the WRMSR is a write, not a
310 * read-modify-write. Adjust the counter value so that its value is
311 * relative to the current count, as reading the current count from
312 * perf is faster than pausing and reprogramming the event in order to
313 * reset it to '0'. Note, this very sneakily offsets the accumulated
314 * emulated count too, by using pmc_read_counter()!
316 pmc->emulated_counter = 0;
317 pmc->counter += val - pmc_read_counter(pmc);
318 pmc->counter &= pmc_bitmask(pmc);
319 pmc_update_sample_period(pmc);
321 EXPORT_SYMBOL_GPL(pmc_write_counter);
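
/*
 * Editor's sketch, not part of pmu.c: the rebasing arithmetic in
 * pmc_write_counter(), with the live perf_event count modeled as a plain field.
 * The point is that adding "val - <current total>" rebases the stored count so
 * that a subsequent read returns exactly the value the guest wrote. All names
 * are invented for the sketch.
 */
#include <assert.h>
#include <stdint.h>

struct sketch_pmc {
	uint64_t counter;		/* stored count */
	uint64_t emulated_counter;	/* counts from emulated instructions */
	uint64_t live_perf_delta;	/* stand-in for the running perf_event count */
	uint64_t bitmask;
};

static uint64_t sketch_read(const struct sketch_pmc *pmc)
{
	return (pmc->counter + pmc->emulated_counter + pmc->live_perf_delta) &
	       pmc->bitmask;
}

static void sketch_write(struct sketch_pmc *pmc, uint64_t val)
{
	pmc->emulated_counter = 0;			/* drop unconsumed emulated counts */
	pmc->counter += val - sketch_read(pmc);		/* rebase relative to current total */
	pmc->counter &= pmc->bitmask;
}

int main(void)
{
	struct sketch_pmc pmc = {
		.counter = 1000, .emulated_counter = 7,
		.live_perf_delta = 50, .bitmask = (1ULL << 48) - 1,
	};

	sketch_write(&pmc, 123);
	assert(sketch_read(&pmc) == 123);	/* guest reads back exactly what it wrote */
	return 0;
}
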
323 static int filter_cmp(const void *pa, const void *pb, u64 mask)
325 u64 a = *(u64 *)pa & mask;
326 u64 b = *(u64 *)pb & mask;
328 return (a > b) - (a < b);
332 static int filter_sort_cmp(const void *pa, const void *pb)
334 return filter_cmp(pa, pb, (KVM_PMU_MASKED_ENTRY_EVENT_SELECT |
335 KVM_PMU_MASKED_ENTRY_EXCLUDE));
339 * For the event filter, searching is done on the 'includes' list and
340 * 'excludes' list separately rather than on the 'events' list (which
341 * has both). As a result the exclude bit can be ignored.
343 static int filter_event_cmp(const void *pa, const void *pb)
345 return filter_cmp(pa, pb, (KVM_PMU_MASKED_ENTRY_EVENT_SELECT));
348 static int find_filter_index(u64 *events, u64 nevents, u64 key)
350 u64 *fe = bsearch(&key, events, nevents, sizeof(events[0]),
351 filter_event_cmp);
353 if (!fe)
354 return -1;
356 return fe - events;
359 static bool is_filter_entry_match(u64 filter_event, u64 umask)
361 u64 mask = filter_event >> (KVM_PMU_MASKED_ENTRY_UMASK_MASK_SHIFT - 8);
362 u64 match = filter_event & KVM_PMU_MASKED_ENTRY_UMASK_MATCH;
364 BUILD_BUG_ON((KVM_PMU_ENCODE_MASKED_ENTRY(0, 0xff, 0, false) >>
365 (KVM_PMU_MASKED_ENTRY_UMASK_MASK_SHIFT - 8)) !=
366 ARCH_PERFMON_EVENTSEL_UMASK);
368 return (umask & mask) == match;
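
/*
 * Editor's sketch, not part of pmu.c: the "(umask & mask) == match" relation
 * tested above, with the guest unit mask, umask_mask and umask_match passed as
 * plain bytes rather than extracted from a packed filter entry.
 */
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

static bool sketch_umask_match(uint8_t guest_umask, uint8_t umask_mask,
			       uint8_t umask_match)
{
	return (guest_umask & umask_mask) == umask_match;
}

int main(void)
{
	/* mask 0xF0 / match 0xC0: accept any unit mask of the form 0xCx. */
	assert(sketch_umask_match(0xC1, 0xF0, 0xC0));
	assert(sketch_umask_match(0xCF, 0xF0, 0xC0));
	assert(!sketch_umask_match(0x41, 0xF0, 0xC0));

	/* mask 0xFF / match 0x01: exact-match behaviour of a non-masked filter. */
	assert(sketch_umask_match(0x01, 0xFF, 0x01));
	assert(!sketch_umask_match(0x02, 0xFF, 0x01));
	return 0;
}
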
371 static bool filter_contains_match(u64 *events, u64 nevents, u64 eventsel)
373 u64 event_select = eventsel & kvm_pmu_ops.EVENTSEL_EVENT;
374 u64 umask = eventsel & ARCH_PERFMON_EVENTSEL_UMASK;
375 int i, index;
377 index = find_filter_index(events, nevents, event_select);
378 if (index < 0)
379 return false;
382 * Entries are sorted by the event select. Walk the list in both
383 * directions to process all entries with the targeted event select.
385 for (i = index; i < nevents; i++) {
386 if (filter_event_cmp(&events[i], &event_select))
387 break;
389 if (is_filter_entry_match(events[i], umask))
390 return true;
393 for (i = index - 1; i >= 0; i--) {
394 if (filter_event_cmp(&events[i], &event_select))
395 break;
397 if (is_filter_entry_match(events[i], umask))
398 return true;
401 return false;
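
/*
 * Editor's sketch, not part of pmu.c: why the walk above goes in both directions.
 * bsearch() may land on any one of several entries that share an event select,
 * so the code scans outward from the hit until the event select changes. Sketched
 * here over a plain sorted array of invented (event, umask) pairs.
 */
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>

struct sketch_entry { uint16_t event; uint8_t umask_match; };

static int sketch_event_cmp(const void *pa, const void *pb)
{
	uint16_t a = ((const struct sketch_entry *)pa)->event;
	uint16_t b = ((const struct sketch_entry *)pb)->event;

	return (a > b) - (a < b);
}

static bool sketch_contains(const struct sketch_entry *ents, size_t n,
			    uint16_t event, uint8_t umask)
{
	const struct sketch_entry key = { .event = event };
	const struct sketch_entry *hit = bsearch(&key, ents, n, sizeof(*ents),
						 sketch_event_cmp);
	long i, idx;

	if (!hit)
		return false;
	idx = hit - ents;

	for (i = idx; i < (long)n && ents[i].event == event; i++)
		if (ents[i].umask_match == umask)
			return true;
	for (i = idx - 1; i >= 0 && ents[i].event == event; i--)
		if (ents[i].umask_match == umask)
			return true;
	return false;
}

int main(void)
{
	/* Sorted by event; event 0xC0 appears twice with different unit masks. */
	const struct sketch_entry ents[] = {
		{ 0x3C, 0x00 }, { 0xC0, 0x00 }, { 0xC0, 0x01 }, { 0xD1, 0x08 },
	};

	assert(sketch_contains(ents, 4, 0xC0, 0x01));
	assert(!sketch_contains(ents, 4, 0xC0, 0x02));
	assert(!sketch_contains(ents, 4, 0x2E, 0x4F));
	return 0;
}
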
404 static bool is_gp_event_allowed(struct kvm_x86_pmu_event_filter *f,
405 u64 eventsel)
407 if (filter_contains_match(f->includes, f->nr_includes, eventsel) &&
408 !filter_contains_match(f->excludes, f->nr_excludes, eventsel))
409 return f->action == KVM_PMU_EVENT_ALLOW;
411 return f->action == KVM_PMU_EVENT_DENY;
414 static bool is_fixed_event_allowed(struct kvm_x86_pmu_event_filter *filter,
415 int idx)
417 int fixed_idx = idx - KVM_FIXED_PMC_BASE_IDX;
419 if (filter->action == KVM_PMU_EVENT_DENY &&
420 test_bit(fixed_idx, (ulong *)&filter->fixed_counter_bitmap))
421 return false;
422 if (filter->action == KVM_PMU_EVENT_ALLOW &&
423 !test_bit(fixed_idx, (ulong *)&filter->fixed_counter_bitmap))
424 return false;
426 return true;
429 static bool check_pmu_event_filter(struct kvm_pmc *pmc)
431 struct kvm_x86_pmu_event_filter *filter;
432 struct kvm *kvm = pmc->vcpu->kvm;
434 filter = srcu_dereference(kvm->arch.pmu_event_filter, &kvm->srcu);
435 if (!filter)
436 return true;
438 if (pmc_is_gp(pmc))
439 return is_gp_event_allowed(filter, pmc->eventsel);
441 return is_fixed_event_allowed(filter, pmc->idx);
444 static bool pmc_event_is_allowed(struct kvm_pmc *pmc)
446 return pmc_is_globally_enabled(pmc) && pmc_speculative_in_use(pmc) &&
447 check_pmu_event_filter(pmc);
450 static int reprogram_counter(struct kvm_pmc *pmc)
452 struct kvm_pmu *pmu = pmc_to_pmu(pmc);
453 u64 eventsel = pmc->eventsel;
454 u64 new_config = eventsel;
455 bool emulate_overflow;
456 u8 fixed_ctr_ctrl;
458 emulate_overflow = pmc_pause_counter(pmc);
460 if (!pmc_event_is_allowed(pmc))
461 return 0;
463 if (emulate_overflow)
464 __kvm_perf_overflow(pmc, false);
466 if (eventsel & ARCH_PERFMON_EVENTSEL_PIN_CONTROL)
467 printk_once("kvm pmu: pin control bit is ignored\n");
469 if (pmc_is_fixed(pmc)) {
470 fixed_ctr_ctrl = fixed_ctrl_field(pmu->fixed_ctr_ctrl,
471 pmc->idx - KVM_FIXED_PMC_BASE_IDX);
472 if (fixed_ctr_ctrl & INTEL_FIXED_0_KERNEL)
473 eventsel |= ARCH_PERFMON_EVENTSEL_OS;
474 if (fixed_ctr_ctrl & INTEL_FIXED_0_USER)
475 eventsel |= ARCH_PERFMON_EVENTSEL_USR;
476 if (fixed_ctr_ctrl & INTEL_FIXED_0_ENABLE_PMI)
477 eventsel |= ARCH_PERFMON_EVENTSEL_INT;
478 new_config = (u64)fixed_ctr_ctrl;
481 if (pmc->current_config == new_config && pmc_resume_counter(pmc))
482 return 0;
484 pmc_release_perf_event(pmc);
486 pmc->current_config = new_config;
488 return pmc_reprogram_counter(pmc, PERF_TYPE_RAW,
489 (eventsel & pmu->raw_event_mask),
490 !(eventsel & ARCH_PERFMON_EVENTSEL_USR),
491 !(eventsel & ARCH_PERFMON_EVENTSEL_OS),
492 eventsel & ARCH_PERFMON_EVENTSEL_INT);
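
/*
 * Editor's sketch, not part of pmu.c: the fixed-counter control translation
 * above. The nibble layout (bit 0 = kernel, bit 1 = user, bit 3 = PMI enable)
 * and the eventsel bit positions are assumptions restated locally for the
 * sketch; the kernel uses the INTEL_FIXED_0_* and ARCH_PERFMON_EVENTSEL_* macros.
 */
#include <assert.h>
#include <stdint.h>

#define SK_FIXED_KERNEL		(1u << 0)
#define SK_FIXED_USER		(1u << 1)
#define SK_FIXED_PMI		(1u << 3)

#define SK_EVENTSEL_USR		(1ull << 16)
#define SK_EVENTSEL_OS		(1ull << 17)
#define SK_EVENTSEL_INT		(1ull << 20)

/* Extract fixed counter idx's control nibble and fold it into eventsel-style flags. */
static uint64_t sketch_fixed_to_eventsel(uint64_t fixed_ctr_ctrl, int idx)
{
	uint8_t field = (fixed_ctr_ctrl >> (idx * 4)) & 0xf;
	uint64_t eventsel = 0;

	if (field & SK_FIXED_KERNEL)
		eventsel |= SK_EVENTSEL_OS;
	if (field & SK_FIXED_USER)
		eventsel |= SK_EVENTSEL_USR;
	if (field & SK_FIXED_PMI)
		eventsel |= SK_EVENTSEL_INT;
	return eventsel;
}

int main(void)
{
	/* Fixed counter 1 counts kernel+user with PMI: field 0b1011 at bits 4-7. */
	uint64_t ctrl = 0xBull << 4;

	assert(sketch_fixed_to_eventsel(ctrl, 1) ==
	       (SK_EVENTSEL_OS | SK_EVENTSEL_USR | SK_EVENTSEL_INT));
	assert(sketch_fixed_to_eventsel(ctrl, 0) == 0);
	return 0;
}
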
495 void kvm_pmu_handle_event(struct kvm_vcpu *vcpu)
497 DECLARE_BITMAP(bitmap, X86_PMC_IDX_MAX);
498 struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
499 struct kvm_pmc *pmc;
500 int bit;
502 bitmap_copy(bitmap, pmu->reprogram_pmi, X86_PMC_IDX_MAX);
505 * The reprogramming bitmap can be written asynchronously by something
506 * other than the task that holds vcpu->mutex; take care to clear only
507 * the bits that will actually be processed.
509 BUILD_BUG_ON(sizeof(bitmap) != sizeof(atomic64_t));
510 atomic64_andnot(*(s64 *)bitmap, &pmu->__reprogram_pmi);
512 kvm_for_each_pmc(pmu, pmc, bit, bitmap) {
514 * If reprogramming fails, e.g. due to contention, re-set the
515 * reprogram bit, i.e. opportunistically try again on the
516 * next PMU refresh. Don't make a new request as doing so can
517 * stall the guest if reprogramming repeatedly fails.
519 if (reprogram_counter(pmc))
520 set_bit(pmc->idx, pmu->reprogram_pmi);
524 * Release unused perf_events if the corresponding guest MSRs weren't
525 * accessed during the last vCPU time slice (need_cleanup is set when
526 * the vCPU is scheduled back in).
528 if (unlikely(pmu->need_cleanup))
529 kvm_pmu_cleanup(vcpu);
532 int kvm_pmu_check_rdpmc_early(struct kvm_vcpu *vcpu, unsigned int idx)
535 * On Intel, VMX interception has priority over RDPMC exceptions that
536 * aren't already handled by the emulator, i.e. no additional
537 * checks are needed for Intel PMUs.
539 * On AMD, _all_ exceptions on RDPMC have priority over SVM intercepts,
540 * i.e. an invalid PMC results in a #GP, not #VMEXIT.
542 if (!kvm_pmu_ops.check_rdpmc_early)
543 return 0;
545 return kvm_pmu_call(check_rdpmc_early)(vcpu, idx);
548 bool is_vmware_backdoor_pmc(u32 pmc_idx)
550 switch (pmc_idx) {
551 case VMWARE_BACKDOOR_PMC_HOST_TSC:
552 case VMWARE_BACKDOOR_PMC_REAL_TIME:
553 case VMWARE_BACKDOOR_PMC_APPARENT_TIME:
554 return true;
556 return false;
559 static int kvm_pmu_rdpmc_vmware(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
561 u64 ctr_val;
563 switch (idx) {
564 case VMWARE_BACKDOOR_PMC_HOST_TSC:
565 ctr_val = rdtsc();
566 break;
567 case VMWARE_BACKDOOR_PMC_REAL_TIME:
568 ctr_val = ktime_get_boottime_ns();
569 break;
570 case VMWARE_BACKDOOR_PMC_APPARENT_TIME:
571 ctr_val = ktime_get_boottime_ns() +
572 vcpu->kvm->arch.kvmclock_offset;
573 break;
574 default:
575 return 1;
578 *data = ctr_val;
579 return 0;
582 int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
584 struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
585 struct kvm_pmc *pmc;
586 u64 mask = ~0ull;
588 if (!pmu->version)
589 return 1;
591 if (is_vmware_backdoor_pmc(idx))
592 return kvm_pmu_rdpmc_vmware(vcpu, idx, data);
594 pmc = kvm_pmu_call(rdpmc_ecx_to_pmc)(vcpu, idx, &mask);
595 if (!pmc)
596 return 1;
598 if (!kvm_is_cr4_bit_set(vcpu, X86_CR4_PCE) &&
599 (kvm_x86_call(get_cpl)(vcpu) != 0) &&
600 kvm_is_cr0_bit_set(vcpu, X86_CR0_PE))
601 return 1;
603 *data = pmc_read_counter(pmc) & mask;
604 return 0;
607 void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu)
609 if (lapic_in_kernel(vcpu)) {
610 kvm_pmu_call(deliver_pmi)(vcpu);
611 kvm_apic_local_deliver(vcpu->arch.apic, APIC_LVTPC);
615 bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
617 switch (msr) {
618 case MSR_CORE_PERF_GLOBAL_STATUS:
619 case MSR_CORE_PERF_GLOBAL_CTRL:
620 case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
621 return kvm_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu));
622 default:
623 break;
625 return kvm_pmu_call(msr_idx_to_pmc)(vcpu, msr) ||
626 kvm_pmu_call(is_valid_msr)(vcpu, msr);
629 static void kvm_pmu_mark_pmc_in_use(struct kvm_vcpu *vcpu, u32 msr)
631 struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
632 struct kvm_pmc *pmc = kvm_pmu_call(msr_idx_to_pmc)(vcpu, msr);
634 if (pmc)
635 __set_bit(pmc->idx, pmu->pmc_in_use);
638 int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
640 struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
641 u32 msr = msr_info->index;
643 switch (msr) {
644 case MSR_CORE_PERF_GLOBAL_STATUS:
645 case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS:
646 msr_info->data = pmu->global_status;
647 break;
648 case MSR_AMD64_PERF_CNTR_GLOBAL_CTL:
649 case MSR_CORE_PERF_GLOBAL_CTRL:
650 msr_info->data = pmu->global_ctrl;
651 break;
652 case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR:
653 case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
654 msr_info->data = 0;
655 break;
656 default:
657 return kvm_pmu_call(get_msr)(vcpu, msr_info);
660 return 0;
663 int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
665 struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
666 u32 msr = msr_info->index;
667 u64 data = msr_info->data;
668 u64 diff;
671 * Note, AMD ignores writes to reserved bits and read-only PMU MSRs,
672 * whereas Intel generates #GP on attempts to write reserved/RO MSRs.
674 switch (msr) {
675 case MSR_CORE_PERF_GLOBAL_STATUS:
676 if (!msr_info->host_initiated)
677 return 1; /* RO MSR */
678 fallthrough;
679 case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS:
680 /* Per PPR, Read-only MSR. Writes are ignored. */
681 if (!msr_info->host_initiated)
682 break;
684 if (data & pmu->global_status_rsvd)
685 return 1;
687 pmu->global_status = data;
688 break;
689 case MSR_AMD64_PERF_CNTR_GLOBAL_CTL:
690 data &= ~pmu->global_ctrl_rsvd;
691 fallthrough;
692 case MSR_CORE_PERF_GLOBAL_CTRL:
693 if (!kvm_valid_perf_global_ctrl(pmu, data))
694 return 1;
696 if (pmu->global_ctrl != data) {
697 diff = pmu->global_ctrl ^ data;
698 pmu->global_ctrl = data;
699 reprogram_counters(pmu, diff);
701 break;
702 case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
704 * GLOBAL_OVF_CTRL, a.k.a. GLOBAL_STATUS_RESET, clears bits in
705 * GLOBAL_STATUS, and so the set of reserved bits is the same.
707 if (data & pmu->global_status_rsvd)
708 return 1;
709 fallthrough;
710 case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR:
711 if (!msr_info->host_initiated)
712 pmu->global_status &= ~data;
713 break;
714 default:
715 kvm_pmu_mark_pmc_in_use(vcpu, msr_info->index);
716 return kvm_pmu_call(set_msr)(vcpu, msr_info);
719 return 0;
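
/*
 * Editor's sketch, not part of pmu.c: the "diff = old ^ new" idiom above. The XOR
 * isolates the enable bits that actually toggled, which is what
 * reprogram_counters() receives; the sketch merely counts them.
 */
#include <assert.h>
#include <stdint.h>

static int sketch_count_toggled(uint64_t old_ctrl, uint64_t new_ctrl)
{
	uint64_t diff = old_ctrl ^ new_ctrl;
	int n = 0;

	for (; diff; diff &= diff - 1)	/* clear the lowest set bit each iteration */
		n++;
	return n;
}

int main(void)
{
	/* Counters 0-3 enabled -> counters 1-3 enabled: only bit 0 toggled. */
	assert(sketch_count_toggled(0xf, 0xe) == 1);
	/* 0x3 -> 0xc: all four low counters change state. */
	assert(sketch_count_toggled(0x3, 0xc) == 4);
	return 0;
}
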
722 static void kvm_pmu_reset(struct kvm_vcpu *vcpu)
724 struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
725 struct kvm_pmc *pmc;
726 int i;
728 pmu->need_cleanup = false;
730 bitmap_zero(pmu->reprogram_pmi, X86_PMC_IDX_MAX);
732 kvm_for_each_pmc(pmu, pmc, i, pmu->all_valid_pmc_idx) {
733 pmc_stop_counter(pmc);
734 pmc->counter = 0;
735 pmc->emulated_counter = 0;
737 if (pmc_is_gp(pmc))
738 pmc->eventsel = 0;
741 pmu->fixed_ctr_ctrl = pmu->global_ctrl = pmu->global_status = 0;
743 kvm_pmu_call(reset)(vcpu);
748 * Refresh the PMU configuration for the vCPU, e.g. if userspace changes CPUID
749 * and/or PERF_CAPABILITIES.
751 void kvm_pmu_refresh(struct kvm_vcpu *vcpu)
753 struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
755 if (KVM_BUG_ON(kvm_vcpu_has_run(vcpu), vcpu->kvm))
756 return;
759 * Stop/release all existing counters/events before realizing the new
760 * vPMU model.
762 kvm_pmu_reset(vcpu);
764 pmu->version = 0;
765 pmu->nr_arch_gp_counters = 0;
766 pmu->nr_arch_fixed_counters = 0;
767 pmu->counter_bitmask[KVM_PMC_GP] = 0;
768 pmu->counter_bitmask[KVM_PMC_FIXED] = 0;
769 pmu->reserved_bits = 0xffffffff00200000ull;
770 pmu->raw_event_mask = X86_RAW_EVENT_MASK;
771 pmu->global_ctrl_rsvd = ~0ull;
772 pmu->global_status_rsvd = ~0ull;
773 pmu->fixed_ctr_ctrl_rsvd = ~0ull;
774 pmu->pebs_enable_rsvd = ~0ull;
775 pmu->pebs_data_cfg_rsvd = ~0ull;
776 bitmap_zero(pmu->all_valid_pmc_idx, X86_PMC_IDX_MAX);
778 if (!vcpu->kvm->arch.enable_pmu)
779 return;
781 kvm_pmu_call(refresh)(vcpu);
784 * At RESET, both Intel and AMD CPUs set all enable bits for general
785 * purpose counters in IA32_PERF_GLOBAL_CTRL (so that software that
786 * was written for v1 PMUs doesn't unknowingly leave GP counters disabled
787 * in the global controls). Emulate that behavior when refreshing the
788 * PMU so that userspace doesn't need to manually set PERF_GLOBAL_CTRL.
790 if (kvm_pmu_has_perf_global_ctrl(pmu) && pmu->nr_arch_gp_counters)
791 pmu->global_ctrl = GENMASK_ULL(pmu->nr_arch_gp_counters - 1, 0);
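
/*
 * Editor's sketch, not part of pmu.c: the RESET value computed above, with
 * GENMASK_ULL() open-coded so the example is self-contained.
 */
#include <assert.h>
#include <stdint.h>

/* Open-coded GENMASK_ULL(high, low): bits low..high set. */
static uint64_t sketch_genmask_ull(unsigned int high, unsigned int low)
{
	return ((~0ull) >> (63 - high)) & (~0ull << low);
}

int main(void)
{
	/* 8 general purpose counters -> enable bits 0-7 set at vCPU RESET. */
	assert(sketch_genmask_ull(8 - 1, 0) == 0xff);
	/* 4 general purpose counters -> 0xf. */
	assert(sketch_genmask_ull(4 - 1, 0) == 0xf);
	return 0;
}
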
794 void kvm_pmu_init(struct kvm_vcpu *vcpu)
796 struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
798 memset(pmu, 0, sizeof(*pmu));
799 kvm_pmu_call(init)(vcpu);
800 kvm_pmu_refresh(vcpu);
803 /* Release perf_events for vPMCs that have been unused for a full time slice. */
804 void kvm_pmu_cleanup(struct kvm_vcpu *vcpu)
806 struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
807 struct kvm_pmc *pmc = NULL;
808 DECLARE_BITMAP(bitmask, X86_PMC_IDX_MAX);
809 int i;
811 pmu->need_cleanup = false;
813 bitmap_andnot(bitmask, pmu->all_valid_pmc_idx,
814 pmu->pmc_in_use, X86_PMC_IDX_MAX);
816 kvm_for_each_pmc(pmu, pmc, i, bitmask) {
817 if (pmc->perf_event && !pmc_speculative_in_use(pmc))
818 pmc_stop_counter(pmc);
821 kvm_pmu_call(cleanup)(vcpu);
823 bitmap_zero(pmu->pmc_in_use, X86_PMC_IDX_MAX);
826 void kvm_pmu_destroy(struct kvm_vcpu *vcpu)
828 kvm_pmu_reset(vcpu);
831 static void kvm_pmu_incr_counter(struct kvm_pmc *pmc)
833 pmc->emulated_counter++;
834 kvm_pmu_request_counter_reprogram(pmc);
837 static inline bool cpl_is_matched(struct kvm_pmc *pmc)
839 bool select_os, select_user;
840 u64 config;
842 if (pmc_is_gp(pmc)) {
843 config = pmc->eventsel;
844 select_os = config & ARCH_PERFMON_EVENTSEL_OS;
845 select_user = config & ARCH_PERFMON_EVENTSEL_USR;
846 } else {
847 config = fixed_ctrl_field(pmc_to_pmu(pmc)->fixed_ctr_ctrl,
848 pmc->idx - KVM_FIXED_PMC_BASE_IDX);
849 select_os = config & INTEL_FIXED_0_KERNEL;
850 select_user = config & INTEL_FIXED_0_USER;
854 * Skip the CPL lookup, which isn't free on Intel, if the result will
855 * be the same regardless of the CPL.
857 if (select_os == select_user)
858 return select_os;
860 return (kvm_x86_call(get_cpl)(pmc->vcpu) == 0) ? select_os :
861 select_user;
864 void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, u64 eventsel)
866 DECLARE_BITMAP(bitmap, X86_PMC_IDX_MAX);
867 struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
868 struct kvm_pmc *pmc;
869 int i;
871 BUILD_BUG_ON(sizeof(pmu->global_ctrl) * BITS_PER_BYTE != X86_PMC_IDX_MAX);
873 if (!kvm_pmu_has_perf_global_ctrl(pmu))
874 bitmap_copy(bitmap, pmu->all_valid_pmc_idx, X86_PMC_IDX_MAX);
875 else if (!bitmap_and(bitmap, pmu->all_valid_pmc_idx,
876 (unsigned long *)&pmu->global_ctrl, X86_PMC_IDX_MAX))
877 return;
879 kvm_for_each_pmc(pmu, pmc, i, bitmap) {
881 * Ignore checks for edge detect (all events currently emulated
882 * by KVM are always rising edges), pin control (unsupported
883 * by modern CPUs), and counter mask and its invert flag (KVM
884 * doesn't emulate multiple events in a single clock cycle).
886 * Note, the uppermost nibble of AMD's mask overlaps Intel's
887 * IN_TX (bit 32) and IN_TXCP (bit 33), as well as two reserved
888 * bits (bits 35:34). Checking the "in HLE/RTM transaction"
889 * flags is correct as the vCPU can't be in a transaction if
890 * KVM is emulating an instruction. Checking the reserved bits
891 * might be wrong if they are defined in the future, but so
892 * could ignoring them, so do the simple thing for now.
894 if (((pmc->eventsel ^ eventsel) & AMD64_RAW_EVENT_MASK_NB) ||
895 !pmc_event_is_allowed(pmc) || !cpl_is_matched(pmc))
896 continue;
898 kvm_pmu_incr_counter(pmc);
901 EXPORT_SYMBOL_GPL(kvm_pmu_trigger_event);
903 static bool is_masked_filter_valid(const struct kvm_x86_pmu_event_filter *filter)
905 u64 mask = kvm_pmu_ops.EVENTSEL_EVENT |
906 KVM_PMU_MASKED_ENTRY_UMASK_MASK |
907 KVM_PMU_MASKED_ENTRY_UMASK_MATCH |
908 KVM_PMU_MASKED_ENTRY_EXCLUDE;
909 int i;
911 for (i = 0; i < filter->nevents; i++) {
912 if (filter->events[i] & ~mask)
913 return false;
916 return true;
919 static void convert_to_masked_filter(struct kvm_x86_pmu_event_filter *filter)
921 int i, j;
923 for (i = 0, j = 0; i < filter->nevents; i++) {
925 * Skip events that are impossible to match against a guest
926 * event. When filtering, only the event select + unit mask
927 * of the guest event is used. To maintain backwards
928 * compatibility, impossible filters can't be rejected :-(
930 if (filter->events[i] & ~(kvm_pmu_ops.EVENTSEL_EVENT |
931 ARCH_PERFMON_EVENTSEL_UMASK))
932 continue;
934 * Convert userspace events to a common in-kernel event so
935 * only one code path is needed to support both events. For
936 * the in-kernel events use masked events because they are
937 * flexible enough to handle both cases. To convert to masked
938 * events, all that's needed is to add an "all ones" umask_mask
939 * (unmasked filter events don't support EXCLUDE).
941 filter->events[j++] = filter->events[i] |
942 (0xFFULL << KVM_PMU_MASKED_ENTRY_UMASK_MASK_SHIFT);
945 filter->nevents = j;
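
/*
 * Editor's sketch, not part of pmu.c: the conversion above, using an illustrative
 * masked-entry layout (event select in bits 0-7, umask match in bits 8-15, umask
 * mask in the top byte). The real layout is defined by the KVM_PMU_MASKED_ENTRY_*
 * uapi macros; only the "add an all-ones umask_mask" step is the point.
 */
#include <assert.h>
#include <stdint.h>

#define SK_UMASK_MASK_SHIFT	56
#define SK_EVENT_SELECT_MASK	0xffull
#define SK_UMASK_MATCH_MASK	0xff00ull

/* A non-masked filter entry (event select + unit mask) becomes a masked entry
 * that requires the whole unit mask to match: umask_mask = 0xff. */
static uint64_t sketch_to_masked(uint64_t raw_event)
{
	return (raw_event & (SK_EVENT_SELECT_MASK | SK_UMASK_MATCH_MASK)) |
	       (0xffull << SK_UMASK_MASK_SHIFT);
}

int main(void)
{
	uint64_t masked = sketch_to_masked(0x01c0);	/* event 0xc0, unit mask 0x01 */

	assert((masked & SK_EVENT_SELECT_MASK) == 0xc0);
	assert((masked & SK_UMASK_MATCH_MASK) == 0x0100);
	assert((masked >> SK_UMASK_MASK_SHIFT) == 0xff);
	return 0;
}
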
948 static int prepare_filter_lists(struct kvm_x86_pmu_event_filter *filter)
950 int i;
952 if (!(filter->flags & KVM_PMU_EVENT_FLAG_MASKED_EVENTS))
953 convert_to_masked_filter(filter);
954 else if (!is_masked_filter_valid(filter))
955 return -EINVAL;
958 * Sort entries by event select and includes vs. excludes so that all
959 * entries for a given event select can be processed efficiently during
960 * filtering. The EXCLUDE flag uses a more significant bit than the
961 * event select, and so the sorted list is also effectively split into
962 * includes and excludes sub-lists.
964 sort(&filter->events, filter->nevents, sizeof(filter->events[0]),
965 filter_sort_cmp, NULL);
967 i = filter->nevents;
968 /* Find the first EXCLUDE event (only supported for masked events). */
969 if (filter->flags & KVM_PMU_EVENT_FLAG_MASKED_EVENTS) {
970 for (i = 0; i < filter->nevents; i++) {
971 if (filter->events[i] & KVM_PMU_MASKED_ENTRY_EXCLUDE)
972 break;
976 filter->nr_includes = i;
977 filter->nr_excludes = filter->nevents - filter->nr_includes;
978 filter->includes = filter->events;
979 filter->excludes = filter->events + filter->nr_includes;
981 return 0;
984 int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp)
986 struct kvm_pmu_event_filter __user *user_filter = argp;
987 struct kvm_x86_pmu_event_filter *filter;
988 struct kvm_pmu_event_filter tmp;
989 struct kvm_vcpu *vcpu;
990 unsigned long i;
991 size_t size;
992 int r;
994 if (copy_from_user(&tmp, user_filter, sizeof(tmp)))
995 return -EFAULT;
997 if (tmp.action != KVM_PMU_EVENT_ALLOW &&
998 tmp.action != KVM_PMU_EVENT_DENY)
999 return -EINVAL;
1001 if (tmp.flags & ~KVM_PMU_EVENT_FLAGS_VALID_MASK)
1002 return -EINVAL;
1004 if (tmp.nevents > KVM_PMU_EVENT_FILTER_MAX_EVENTS)
1005 return -E2BIG;
1007 size = struct_size(filter, events, tmp.nevents);
1008 filter = kzalloc(size, GFP_KERNEL_ACCOUNT);
1009 if (!filter)
1010 return -ENOMEM;
1012 filter->action = tmp.action;
1013 filter->nevents = tmp.nevents;
1014 filter->fixed_counter_bitmap = tmp.fixed_counter_bitmap;
1015 filter->flags = tmp.flags;
1017 r = -EFAULT;
1018 if (copy_from_user(filter->events, user_filter->events,
1019 sizeof(filter->events[0]) * filter->nevents))
1020 goto cleanup;
1022 r = prepare_filter_lists(filter);
1023 if (r)
1024 goto cleanup;
1026 mutex_lock(&kvm->lock);
1027 filter = rcu_replace_pointer(kvm->arch.pmu_event_filter, filter,
1028 mutex_is_locked(&kvm->lock));
1029 mutex_unlock(&kvm->lock);
1030 synchronize_srcu_expedited(&kvm->srcu);
1032 BUILD_BUG_ON(sizeof(((struct kvm_pmu *)0)->reprogram_pmi) >
1033 sizeof(((struct kvm_pmu *)0)->__reprogram_pmi));
1035 kvm_for_each_vcpu(i, vcpu, kvm)
1036 atomic64_set(&vcpu_to_pmu(vcpu)->__reprogram_pmi, -1ull);
1038 kvm_make_all_cpus_request(kvm, KVM_REQ_PMU);
1040 r = 0;
1041 cleanup:
1042 kfree(filter);
1043 return r;