// SPDX-License-Identifier: GPL-2.0-only
/*
 * Kernel-based Virtual Machine -- Performance Monitoring Unit support
 *
 * Copyright 2015 Red Hat, Inc. and/or its affiliates.
 *
 * Authors:
 *   Avi Kivity   <avi@redhat.com>
 *   Gleb Natapov <gleb@redhat.com>
 *   Wei Huang    <wei@redhat.com>
 */

#include <linux/types.h>
#include <linux/kvm_host.h>
#include <linux/perf_event.h>
#include <asm/perf_event.h>
#include "x86.h"
#include "cpuid.h"
#include "lapic.h"
#include "pmu.h"

/* This is enough to filter the vast majority of currently defined events. */
#define KVM_PMU_EVENT_FILTER_MAX_EVENTS 300

/* NOTE:
 * - Each perf counter is defined as "struct kvm_pmc";
 * - There are two types of perf counters: general purpose (gp) and fixed.
 *   gp counters are stored in gp_counters[] and fixed counters are stored
 *   in fixed_counters[] respectively. Both of them are part of "struct
 *   kvm_pmu";
 * - pmu.c understands the difference between gp counters and fixed counters.
 *   However AMD doesn't support fixed-counters;
 * - There are three types of index to access perf counters (PMC):
 *   1. MSR (named msr): For example Intel has MSR_IA32_PERFCTRn and AMD
 *      has MSR_K7_PERFCTRn.
 *   2. MSR Index (named idx): This normally is used by RDPMC instruction.
 *      For instance AMD RDPMC instruction uses 0000_0003h in ECX to access
 *      C001_0007h (MSR_K7_PERCTR3). Intel has a similar mechanism, except
 *      that it also supports fixed counters. idx can be used as an index to
 *      gp and fixed counters.
 *   3. Global PMC Index (named pmc): pmc is an index specific to PMU
 *      code. Each pmc, stored in kvm_pmc.idx field, is unique across
 *      all perf counters (both gp and fixed). The mapping relationship
 *      between pmc and perf counters is as the following:
 *      * Intel: [0 .. INTEL_PMC_MAX_GENERIC-1] <=> gp counters
 *               [INTEL_PMC_IDX_FIXED .. INTEL_PMC_IDX_FIXED + 2] <=> fixed
 *      * AMD:   [0 .. AMD64_NUM_COUNTERS-1] <=> gp counters
 */

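/*
 * Deferred PMI delivery: runs from irq_work context so that a PMI raised
 * while in NMI context (see kvm_perf_overflow_intr()) can still wake the
 * vCPU and deliver the interrupt safely.
 */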
static void kvm_pmi_trigger_fn(struct irq_work *irq_work)
{
	struct kvm_pmu *pmu = container_of(irq_work, struct kvm_pmu, irq_work);
	struct kvm_vcpu *vcpu = pmu_to_vcpu(pmu);

	kvm_pmu_deliver_pmi(vcpu);
}

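/*
 * Overflow handler for counters programmed without the interrupt flag:
 * mark the counter for reprogramming, record the overflow in
 * global_status and let the KVM_REQ_PMU request handle it on the next
 * vcpu entry.
 */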
static void kvm_perf_overflow(struct perf_event *perf_event,
			      struct perf_sample_data *data,
			      struct pt_regs *regs)
{
	struct kvm_pmc *pmc = perf_event->overflow_handler_context;
	struct kvm_pmu *pmu = pmc_to_pmu(pmc);

	if (!test_and_set_bit(pmc->idx, pmu->reprogram_pmi)) {
		__set_bit(pmc->idx, (unsigned long *)&pmu->global_status);
		kvm_make_request(KVM_REQ_PMU, pmc->vcpu);
	}
}

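/*
 * Overflow handler for counters programmed with the interrupt flag set:
 * does the same bookkeeping as kvm_perf_overflow() and additionally
 * injects a PMI into the guest, either directly or via irq_work (see the
 * comment below).
 */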
static void kvm_perf_overflow_intr(struct perf_event *perf_event,
				   struct perf_sample_data *data,
				   struct pt_regs *regs)
{
	struct kvm_pmc *pmc = perf_event->overflow_handler_context;
	struct kvm_pmu *pmu = pmc_to_pmu(pmc);

	if (!test_and_set_bit(pmc->idx, pmu->reprogram_pmi)) {
		__set_bit(pmc->idx, (unsigned long *)&pmu->global_status);
		kvm_make_request(KVM_REQ_PMU, pmc->vcpu);

		/*
		 * Inject PMI. If vcpu was in a guest mode during NMI PMI
		 * can be ejected on a guest mode re-entry. Otherwise we can't
		 * be sure that vcpu wasn't executing hlt instruction at the
		 * time of vmexit and is not going to re-enter guest mode until
		 * woken up. So we should wake it, but this is impossible from
		 * NMI context. Do it from irq work instead.
		 */
		if (!kvm_is_in_guest())
			irq_work_queue(&pmc_to_pmu(pmc)->irq_work);
		else
			kvm_make_request(KVM_REQ_PMI, pmc->vcpu);
	}
}

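/*
 * Create (or re-create) the host perf_event backing a vPMC. The sample
 * period is set so that the host counter overflows exactly when the
 * guest's counter would, and the overflow handler is chosen based on
 * whether the guest asked for an interrupt on overflow.
 */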
static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
				  unsigned config, bool exclude_user,
				  bool exclude_kernel, bool intr,
				  bool in_tx, bool in_tx_cp)
{
	struct perf_event *event;
	struct perf_event_attr attr = {
		.type = type,
		.size = sizeof(attr),
		.pinned = true,
		.exclude_idle = true,
		.exclude_host = 1,
		.exclude_user = exclude_user,
		.exclude_kernel = exclude_kernel,
		.config = config,
	};

	attr.sample_period = (-pmc->counter) & pmc_bitmask(pmc);

	if (in_tx)
		attr.config |= HSW_IN_TX;
	if (in_tx_cp) {
		/*
		 * HSW_IN_TX_CHECKPOINTED is not supported with nonzero
		 * period. Just clear the sample period so at least
		 * allocating the counter doesn't fail.
		 */
		attr.sample_period = 0;
		attr.config |= HSW_IN_TX_CHECKPOINTED;
	}

	event = perf_event_create_kernel_counter(&attr, -1, current,
						 intr ? kvm_perf_overflow_intr :
						 kvm_perf_overflow, pmc);
	if (IS_ERR(event)) {
		pr_debug_ratelimited("kvm_pmu: event creation failed %ld for pmc->idx = %d\n",
				     PTR_ERR(event), pmc->idx);
		return;
	}

	pmc->perf_event = event;
	pmc_to_pmu(pmc)->event_count++;
	clear_bit(pmc->idx, pmc_to_pmu(pmc)->reprogram_pmi);
}

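/* Stop the backing perf_event and fold its accumulated count into pmc->counter. */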
static void pmc_pause_counter(struct kvm_pmc *pmc)
{
	u64 counter = pmc->counter;

	if (!pmc->perf_event)
		return;

	/* update counter, reset event value to avoid redundant accumulation */
	counter += perf_event_pause(pmc->perf_event, true);
	pmc->counter = counter & pmc_bitmask(pmc);
}

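/*
 * Try to reuse the existing perf_event for the new configuration instead
 * of allocating a fresh one: refresh the sample period and re-enable the
 * event. Returns false if there is no event or perf rejects the period.
 */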
static bool pmc_resume_counter(struct kvm_pmc *pmc)
{
	if (!pmc->perf_event)
		return false;

	/* recalibrate sample period and check if it's accepted by perf core */
	if (perf_event_period(pmc->perf_event,
			      (-pmc->counter) & pmc_bitmask(pmc)))
		return false;

	/* reuse perf_event to serve as pmc_reprogram_counter() does */
	perf_event_enable(pmc->perf_event);

	clear_bit(pmc->idx, (unsigned long *)&pmc_to_pmu(pmc)->reprogram_pmi);

	return true;
}

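/*
 * (Re)program a general purpose counter from its event selector MSR.
 * The event is checked against the VM's PMU event filter, mapped to a
 * generic perf hardware event where possible, and handed to perf either
 * by resuming the existing perf_event or by creating a new one.
 */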
void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
{
	unsigned config, type = PERF_TYPE_RAW;
	u8 event_select, unit_mask;
	struct kvm *kvm = pmc->vcpu->kvm;
	struct kvm_pmu_event_filter *filter;
	int i;
	bool allow_event = true;

	if (eventsel & ARCH_PERFMON_EVENTSEL_PIN_CONTROL)
		printk_once("kvm pmu: pin control bit is ignored\n");

	pmc->eventsel = eventsel;

	pmc_pause_counter(pmc);

	if (!(eventsel & ARCH_PERFMON_EVENTSEL_ENABLE) || !pmc_is_enabled(pmc))
		return;

	filter = srcu_dereference(kvm->arch.pmu_event_filter, &kvm->srcu);
	if (filter) {
		for (i = 0; i < filter->nevents; i++)
			if (filter->events[i] ==
			    (eventsel & AMD64_RAW_EVENT_MASK_NB))
				break;
		if (filter->action == KVM_PMU_EVENT_ALLOW &&
		    i == filter->nevents)
			allow_event = false;
		if (filter->action == KVM_PMU_EVENT_DENY &&
		    i < filter->nevents)
			allow_event = false;
	}
	if (!allow_event)
		return;

	event_select = eventsel & ARCH_PERFMON_EVENTSEL_EVENT;
	unit_mask = (eventsel & ARCH_PERFMON_EVENTSEL_UMASK) >> 8;

	if (!(eventsel & (ARCH_PERFMON_EVENTSEL_EDGE |
			  ARCH_PERFMON_EVENTSEL_INV |
			  ARCH_PERFMON_EVENTSEL_CMASK |
			  HSW_IN_TX |
			  HSW_IN_TX_CHECKPOINTED))) {
		config = kvm_x86_ops->pmu_ops->find_arch_event(pmc_to_pmu(pmc),
							       event_select,
							       unit_mask);
		if (config != PERF_COUNT_HW_MAX)
			type = PERF_TYPE_HARDWARE;
	}

	if (type == PERF_TYPE_RAW)
		config = eventsel & X86_RAW_EVENT_MASK;

	if (pmc->current_config == eventsel && pmc_resume_counter(pmc))
		return;

	pmc_release_perf_event(pmc);

	pmc->current_config = eventsel;
	pmc_reprogram_counter(pmc, type, config,
			      !(eventsel & ARCH_PERFMON_EVENTSEL_USR),
			      !(eventsel & ARCH_PERFMON_EVENTSEL_OS),
			      eventsel & ARCH_PERFMON_EVENTSEL_INT,
			      (eventsel & HSW_IN_TX),
			      (eventsel & HSW_IN_TX_CHECKPOINTED));
}
EXPORT_SYMBOL_GPL(reprogram_gp_counter);

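/*
 * (Re)program a fixed counter from its 4-bit control field: bits 0-1
 * select the CPL filter (OS/user), bit 3 enables the PMI on overflow.
 * The counter is also checked against the fixed counter bitmap of the
 * VM's PMU event filter.
 */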
void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int idx)
{
	unsigned en_field = ctrl & 0x3;
	bool pmi = ctrl & 0x8;
	struct kvm_pmu_event_filter *filter;
	struct kvm *kvm = pmc->vcpu->kvm;

	pmc_pause_counter(pmc);

	if (!en_field || !pmc_is_enabled(pmc))
		return;

	filter = srcu_dereference(kvm->arch.pmu_event_filter, &kvm->srcu);
	if (filter) {
		if (filter->action == KVM_PMU_EVENT_DENY &&
		    test_bit(idx, (ulong *)&filter->fixed_counter_bitmap))
			return;
		if (filter->action == KVM_PMU_EVENT_ALLOW &&
		    !test_bit(idx, (ulong *)&filter->fixed_counter_bitmap))
			return;
	}

	if (pmc->current_config == (u64)ctrl && pmc_resume_counter(pmc))
		return;

	pmc_release_perf_event(pmc);

	pmc->current_config = (u64)ctrl;
	pmc_reprogram_counter(pmc, PERF_TYPE_HARDWARE,
			      kvm_x86_ops->pmu_ops->find_fixed_event(idx),
			      !(en_field & 0x2), /* exclude user */
			      !(en_field & 0x1), /* exclude kernel */
			      pmi, false, false);
}
EXPORT_SYMBOL_GPL(reprogram_fixed_counter);

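/*
 * Reprogram a counter identified by its global PMC index, dispatching to
 * the gp or fixed variant as appropriate.
 */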
void reprogram_counter(struct kvm_pmu *pmu, int pmc_idx)
{
	struct kvm_pmc *pmc = kvm_x86_ops->pmu_ops->pmc_idx_to_pmc(pmu, pmc_idx);

	if (!pmc)
		return;

	if (pmc_is_gp(pmc))
		reprogram_gp_counter(pmc, pmc->eventsel);
	else {
		int idx = pmc_idx - INTEL_PMC_IDX_FIXED;
		u8 ctrl = fixed_ctrl_field(pmu->fixed_ctr_ctrl, idx);

		reprogram_fixed_counter(pmc, ctrl, idx);
	}
}
EXPORT_SYMBOL_GPL(reprogram_counter);

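/*
 * Handle a pending KVM_REQ_PMU request: reprogram every counter whose
 * bit is set in reprogram_pmi, then run the deferred cleanup pass if
 * one was scheduled.
 */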
void kvm_pmu_handle_event(struct kvm_vcpu *vcpu)
{
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
	int bit;

	for_each_set_bit(bit, pmu->reprogram_pmi, X86_PMC_IDX_MAX) {
		struct kvm_pmc *pmc = kvm_x86_ops->pmu_ops->pmc_idx_to_pmc(pmu, bit);

		if (unlikely(!pmc || !pmc->perf_event)) {
			clear_bit(bit, pmu->reprogram_pmi);
			continue;
		}

		reprogram_counter(pmu, bit);
	}

	/*
	 * Unused perf_events are only released if the corresponding MSRs
	 * weren't accessed during the last vCPU time slice. kvm_arch_sched_in
	 * triggers KVM_REQ_PMU if cleanup is needed.
	 */
	if (unlikely(pmu->need_cleanup))
		kvm_pmu_cleanup(vcpu);
}

/* check if idx is a valid index to access PMU */
int kvm_pmu_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx)
{
	return kvm_x86_ops->pmu_ops->is_valid_rdpmc_ecx(vcpu, idx);
}

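/*
 * The VMware backdoor exposes three pseudo-PMCs via RDPMC: host TSC,
 * real time and apparent time. These are recognized and handled here,
 * bypassing the vendor-specific PMU code.
 */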
bool is_vmware_backdoor_pmc(u32 pmc_idx)
{
	switch (pmc_idx) {
	case VMWARE_BACKDOOR_PMC_HOST_TSC:
	case VMWARE_BACKDOOR_PMC_REAL_TIME:
	case VMWARE_BACKDOOR_PMC_APPARENT_TIME:
		return true;
	}
	return false;
}

static int kvm_pmu_rdpmc_vmware(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
{
	u64 ctr_val;

	switch (idx) {
	case VMWARE_BACKDOOR_PMC_HOST_TSC:
		ctr_val = rdtsc();
		break;
	case VMWARE_BACKDOOR_PMC_REAL_TIME:
		ctr_val = ktime_get_boottime_ns();
		break;
	case VMWARE_BACKDOOR_PMC_APPARENT_TIME:
		ctr_val = ktime_get_boottime_ns() +
			vcpu->kvm->arch.kvmclock_offset;
		break;
	default:
		return 1;
	}

	*data = ctr_val;
	return 0;
}

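/*
 * Emulate RDPMC: ECX selects the counter (bit 31 requests a "fast" read),
 * the VMware backdoor pseudo-PMCs are handled first, and the result is
 * masked to 32 bits in fast mode. Returns 0 on success, nonzero on error.
 */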
int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
{
	bool fast_mode = idx & (1u << 31);
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
	struct kvm_pmc *pmc;
	u64 mask = fast_mode ? ~0u : ~0ull;

	if (!pmu->version)
		return 1;

	if (is_vmware_backdoor_pmc(idx))
		return kvm_pmu_rdpmc_vmware(vcpu, idx, data);

	pmc = kvm_x86_ops->pmu_ops->rdpmc_ecx_to_pmc(vcpu, idx, &mask);
	if (!pmc)
		return 1;

	*data = pmc_read_counter(pmc) & mask;
	return 0;
}

void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu)
{
	if (lapic_in_kernel(vcpu))
		kvm_apic_local_deliver(vcpu->arch.apic, APIC_LVTPC);
}

bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
{
	return kvm_x86_ops->pmu_ops->msr_idx_to_pmc(vcpu, msr) ||
		kvm_x86_ops->pmu_ops->is_valid_msr(vcpu, msr);
}

static void kvm_pmu_mark_pmc_in_use(struct kvm_vcpu *vcpu, u32 msr)
{
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
	struct kvm_pmc *pmc = kvm_x86_ops->pmu_ops->msr_idx_to_pmc(vcpu, msr);

	if (pmc)
		__set_bit(pmc->idx, pmu->pmc_in_use);
}

int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data)
{
	return kvm_x86_ops->pmu_ops->get_msr(vcpu, msr, data);
}

int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
	kvm_pmu_mark_pmc_in_use(vcpu, msr_info->index);
	return kvm_x86_ops->pmu_ops->set_msr(vcpu, msr_info);
}

/* refresh PMU settings. This function generally is called when underlying
 * settings are changed (such as changes of PMU CPUID by guest VMs), which
 * should rarely happen.
 */
void kvm_pmu_refresh(struct kvm_vcpu *vcpu)
{
	kvm_x86_ops->pmu_ops->refresh(vcpu);
}

void kvm_pmu_reset(struct kvm_vcpu *vcpu)
{
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);

	irq_work_sync(&pmu->irq_work);
	kvm_x86_ops->pmu_ops->reset(vcpu);
}

void kvm_pmu_init(struct kvm_vcpu *vcpu)
{
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);

	memset(pmu, 0, sizeof(*pmu));
	kvm_x86_ops->pmu_ops->init(vcpu);
	init_irq_work(&pmu->irq_work, kvm_pmi_trigger_fn);
	pmu->event_count = 0;
	pmu->need_cleanup = false;
	kvm_pmu_refresh(vcpu);
}

static inline bool pmc_speculative_in_use(struct kvm_pmc *pmc)
{
	struct kvm_pmu *pmu = pmc_to_pmu(pmc);

	if (pmc_is_fixed(pmc))
		return fixed_ctrl_field(pmu->fixed_ctr_ctrl,
					pmc->idx - INTEL_PMC_IDX_FIXED) & 0x3;

	return pmc->eventsel & ARCH_PERFMON_EVENTSEL_ENABLE;
}

/* Release perf_events for vPMCs that have been unused for a full time slice. */
void kvm_pmu_cleanup(struct kvm_vcpu *vcpu)
{
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
	struct kvm_pmc *pmc = NULL;
	DECLARE_BITMAP(bitmask, X86_PMC_IDX_MAX);
	int i;

	pmu->need_cleanup = false;

	bitmap_andnot(bitmask, pmu->all_valid_pmc_idx,
		      pmu->pmc_in_use, X86_PMC_IDX_MAX);

	for_each_set_bit(i, bitmask, X86_PMC_IDX_MAX) {
		pmc = kvm_x86_ops->pmu_ops->pmc_idx_to_pmc(pmu, i);

		if (pmc && pmc->perf_event && !pmc_speculative_in_use(pmc))
			pmc_stop_counter(pmc);
	}

	bitmap_zero(pmu->pmc_in_use, X86_PMC_IDX_MAX);
}

void kvm_pmu_destroy(struct kvm_vcpu *vcpu)
{
	kvm_pmu_reset(vcpu);
}

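/*
 * KVM_SET_PMU_EVENT_FILTER ioctl: copy the filter from userspace,
 * validate it and publish it under kvm->lock; the SRCU synchronization
 * ensures no vCPU is still using the old filter before it is freed.
 */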
int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp)
{
	struct kvm_pmu_event_filter tmp, *filter;
	size_t size;
	int r;

	if (copy_from_user(&tmp, argp, sizeof(tmp)))
		return -EFAULT;

	if (tmp.action != KVM_PMU_EVENT_ALLOW &&
	    tmp.action != KVM_PMU_EVENT_DENY)
		return -EINVAL;

	if (tmp.flags != 0)
		return -EINVAL;

	if (tmp.nevents > KVM_PMU_EVENT_FILTER_MAX_EVENTS)
		return -E2BIG;

	size = struct_size(filter, events, tmp.nevents);
	filter = kmalloc(size, GFP_KERNEL_ACCOUNT);
	if (!filter)
		return -ENOMEM;

	r = -EFAULT;
	if (copy_from_user(filter, argp, size))
		goto cleanup;

	/* Ensure nevents can't be changed between the user copies. */
	*filter = tmp;

	mutex_lock(&kvm->lock);
	filter = rcu_replace_pointer(kvm->arch.pmu_event_filter, filter,
				     mutex_is_locked(&kvm->lock));
	mutex_unlock(&kvm->lock);

	synchronize_srcu_expedited(&kvm->srcu);
	r = 0;
cleanup:
	kfree(filter);
	return r;
}