/*
 * Kernel-based Virtual Machine -- Performance Monitoring Unit support
 *
 * Copyright 2015 Red Hat, Inc. and/or its affiliates.
 *
 * Authors:
 *   Avi Kivity   <avi@redhat.com>
 *   Gleb Natapov <gleb@redhat.com>
 *   Wei Huang    <wei@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2. See
 * the COPYING file in the top-level directory.
 */

#include <linux/types.h>
#include <linux/kvm_host.h>
#include <linux/perf_event.h>
#include <asm/perf_event.h>
#include "x86.h"
#include "cpuid.h"
#include "lapic.h"
#include "pmu.h"
/* NOTE:
 * - Each perf counter is defined as "struct kvm_pmc";
 * - There are two types of perf counters: general purpose (gp) and fixed.
 *   gp counters are stored in gp_counters[] and fixed counters are stored
 *   in fixed_counters[] respectively. Both of them are part of "struct
 *   kvm_pmu";
 * - pmu.c understands the difference between gp counters and fixed counters.
 *   However, AMD doesn't support fixed counters;
 * - There are three types of index to access perf counters (PMC):
 *   1. MSR (named msr): For example Intel has MSR_IA32_PERFCTRn and AMD
 *      has MSR_K7_PERFCTRn.
 *   2. MSR Index (named idx): This is normally used by the RDPMC instruction.
 *      For instance, the AMD RDPMC instruction uses 0000_0003h in ECX to
 *      access C001_0007h (MSR_K7_PERFCTR3). Intel has a similar mechanism,
 *      except that it also supports fixed counters. idx can be used as an
 *      index into the gp and fixed counters.
 *   3. Global PMC Index (named pmc): pmc is an index specific to the PMU
 *      code. Each pmc, stored in the kvm_pmc.idx field, is unique across
 *      all perf counters (both gp and fixed). The mapping between pmc and
 *      perf counters is as follows:
 *      * Intel: [0 .. INTEL_PMC_MAX_GENERIC-1] <=> gp counters
 *               [INTEL_PMC_IDX_FIXED .. INTEL_PMC_IDX_FIXED + 2] <=> fixed
 *      * AMD:   [0 .. AMD64_NUM_COUNTERS-1] <=> gp counters
 */
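/*
 * Illustrative sketch only (the helper below is hypothetical and not part
 * of this file): how a global pmc index would be classified under the Intel
 * mapping described above. The authoritative lookup is done by the
 * vendor-specific kvm_x86_ops->pmu_ops->pmc_idx_to_pmc() callback.
 *
 *	static bool example_pmc_idx_is_fixed(int pmc_idx)
 *	{
 *		return pmc_idx >= INTEL_PMC_IDX_FIXED;
 *	}
 *
 * For instance, a guest RDPMC with ECX = 0x40000001 (bit 30 selects the
 * fixed-counter range on Intel) refers to fixed counter 1, whose global
 * pmc index is INTEL_PMC_IDX_FIXED + 1.
 */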
static void kvm_pmi_trigger_fn(struct irq_work *irq_work)
{
        struct kvm_pmu *pmu = container_of(irq_work, struct kvm_pmu, irq_work);
        struct kvm_vcpu *vcpu = pmu_to_vcpu(pmu);

        kvm_pmu_deliver_pmi(vcpu);
}
static void kvm_perf_overflow(struct perf_event *perf_event,
                              struct perf_sample_data *data,
                              struct pt_regs *regs)
{
        struct kvm_pmc *pmc = perf_event->overflow_handler_context;
        struct kvm_pmu *pmu = pmc_to_pmu(pmc);

        if (!test_and_set_bit(pmc->idx,
                              (unsigned long *)&pmu->reprogram_pmi)) {
                __set_bit(pmc->idx, (unsigned long *)&pmu->global_status);
                kvm_make_request(KVM_REQ_PMU, pmc->vcpu);
        }
}
static void kvm_perf_overflow_intr(struct perf_event *perf_event,
                                   struct perf_sample_data *data,
                                   struct pt_regs *regs)
{
        struct kvm_pmc *pmc = perf_event->overflow_handler_context;
        struct kvm_pmu *pmu = pmc_to_pmu(pmc);

        if (!test_and_set_bit(pmc->idx,
                              (unsigned long *)&pmu->reprogram_pmi)) {
                __set_bit(pmc->idx, (unsigned long *)&pmu->global_status);
                kvm_make_request(KVM_REQ_PMU, pmc->vcpu);

                /*
                 * Inject PMI. If the vcpu was in guest mode during the NMI,
                 * the PMI can be injected on guest-mode re-entry. Otherwise
                 * we can't be sure that the vcpu wasn't executing the hlt
                 * instruction at the time of the vmexit and is not going to
                 * re-enter guest mode until it is woken up. So we should
                 * wake it, but this is impossible from NMI context. Do it
                 * from irq work instead.
                 */
                if (!kvm_is_in_guest())
                        irq_work_queue(&pmc_to_pmu(pmc)->irq_work);
                else
                        kvm_make_request(KVM_REQ_PMI, pmc->vcpu);
        }
}
static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
                                  unsigned config, bool exclude_user,
                                  bool exclude_kernel, bool intr,
                                  bool in_tx, bool in_tx_cp)
{
        struct perf_event *event;
        struct perf_event_attr attr = {
                .type = type,
                .size = sizeof(attr),
                .pinned = true,
                .exclude_idle = true,
                .exclude_host = 1,
                .exclude_user = exclude_user,
                .exclude_kernel = exclude_kernel,
                .config = config,
        };

        attr.sample_period = (-pmc->counter) & pmc_bitmask(pmc);
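        /*
         * Worked example for the period above (values are illustrative):
         * the guest programs a counter with the negated count it wants
         * before an overflow. With a 48-bit counter currently holding
         * 0xffffffffff00, (-pmc->counter) & pmc_bitmask(pmc) yields 0x100,
         * so the host perf event is asked to overflow after 0x100 more
         * increments, matching the guest's expectation.
         */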
        if (in_tx)
                attr.config |= HSW_IN_TX;
        if (in_tx_cp) {
                /*
                 * HSW_IN_TX_CHECKPOINTED is not supported with a nonzero
                 * period. Just clear the sample period so that at least
                 * allocating the counter doesn't fail.
                 */
                attr.sample_period = 0;
                attr.config |= HSW_IN_TX_CHECKPOINTED;
        }

        event = perf_event_create_kernel_counter(&attr, -1, current,
                                                 intr ? kvm_perf_overflow_intr :
                                                 kvm_perf_overflow, pmc);
        if (IS_ERR(event)) {
                pr_debug_ratelimited("kvm_pmu: event creation failed %ld for pmc->idx = %d\n",
                                     PTR_ERR(event), pmc->idx);
                return;
        }

        pmc->perf_event = event;
        clear_bit(pmc->idx, (unsigned long *)&pmc_to_pmu(pmc)->reprogram_pmi);
}
void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
{
        unsigned config, type = PERF_TYPE_RAW;
        u8 event_select, unit_mask;

        if (eventsel & ARCH_PERFMON_EVENTSEL_PIN_CONTROL)
                printk_once("kvm pmu: pin control bit is ignored\n");

        pmc->eventsel = eventsel;

        pmc_stop_counter(pmc);

        if (!(eventsel & ARCH_PERFMON_EVENTSEL_ENABLE) || !pmc_is_enabled(pmc))
                return;

        event_select = eventsel & ARCH_PERFMON_EVENTSEL_EVENT;
        unit_mask = (eventsel & ARCH_PERFMON_EVENTSEL_UMASK) >> 8;

        if (!(eventsel & (ARCH_PERFMON_EVENTSEL_EDGE |
                          ARCH_PERFMON_EVENTSEL_INV |
                          ARCH_PERFMON_EVENTSEL_CMASK |
                          HSW_IN_TX |
                          HSW_IN_TX_CHECKPOINTED))) {
                config = kvm_x86_ops->pmu_ops->find_arch_event(pmc_to_pmu(pmc),
                                                               event_select,
                                                               unit_mask);
                if (config != PERF_COUNT_HW_MAX)
                        type = PERF_TYPE_HARDWARE;
        }

        if (type == PERF_TYPE_RAW)
                config = eventsel & X86_RAW_EVENT_MASK;

        pmc_reprogram_counter(pmc, type, config,
                              !(eventsel & ARCH_PERFMON_EVENTSEL_USR),
                              !(eventsel & ARCH_PERFMON_EVENTSEL_OS),
                              eventsel & ARCH_PERFMON_EVENTSEL_INT,
                              (eventsel & HSW_IN_TX),
                              (eventsel & HSW_IN_TX_CHECKPOINTED));
}
EXPORT_SYMBOL_GPL(reprogram_gp_counter);
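/*
 * Worked example for the eventsel decoding in reprogram_gp_counter() above
 * (the concrete value is illustrative): a guest write of 0x41412e to a gp
 * event select MSR decodes to event_select 0x2e and unit_mask 0x41, with
 * the ENABLE and USR bits set. 0x2e/0x41 is the architectural "LLC Misses"
 * event, so on Intel find_arch_event() resolves it to a PERF_TYPE_HARDWARE
 * event and the counter ends up counting user-mode last-level-cache misses.
 */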
void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int idx)
{
        unsigned en_field = ctrl & 0x3;
        bool pmi = ctrl & 0x8;

        pmc_stop_counter(pmc);

        if (!en_field || !pmc_is_enabled(pmc))
                return;

        pmc_reprogram_counter(pmc, PERF_TYPE_HARDWARE,
                              kvm_x86_ops->pmu_ops->find_fixed_event(idx),
                              !(en_field & 0x2), /* exclude user */
                              !(en_field & 0x1), /* exclude kernel */
                              pmi, false, false);
}
EXPORT_SYMBOL_GPL(reprogram_fixed_counter);
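/*
 * Worked example for the ctrl decoding in reprogram_fixed_counter() above
 * (the concrete value is illustrative): if the guest writes 0xb0 to
 * MSR_CORE_PERF_FIXED_CTR_CTRL, the 4-bit field for fixed counter 1 is 0xb,
 * so en_field = 0x3 (count in both kernel and user mode) and pmi is set,
 * i.e. the counter is enabled with an overflow interrupt requested.
 */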
void reprogram_counter(struct kvm_pmu *pmu, int pmc_idx)
{
        struct kvm_pmc *pmc = kvm_x86_ops->pmu_ops->pmc_idx_to_pmc(pmu, pmc_idx);

        if (!pmc)
                return;

        if (pmc_is_gp(pmc))
                reprogram_gp_counter(pmc, pmc->eventsel);
        else {
                int idx = pmc_idx - INTEL_PMC_IDX_FIXED;
                u8 ctrl = fixed_ctrl_field(pmu->fixed_ctr_ctrl, idx);

                reprogram_fixed_counter(pmc, ctrl, idx);
        }
}
EXPORT_SYMBOL_GPL(reprogram_counter);
void kvm_pmu_handle_event(struct kvm_vcpu *vcpu)
{
        struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
        u64 bitmask;
        int bit;

        bitmask = pmu->reprogram_pmi;

        for_each_set_bit(bit, (unsigned long *)&bitmask, X86_PMC_IDX_MAX) {
                struct kvm_pmc *pmc = kvm_x86_ops->pmu_ops->pmc_idx_to_pmc(pmu, bit);

                if (unlikely(!pmc || !pmc->perf_event)) {
                        clear_bit(bit, (unsigned long *)&pmu->reprogram_pmi);
                        continue;
                }

                reprogram_counter(pmu, bit);
        }
}
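/*
 * Note on the flow above: the perf overflow callbacks earlier in this file
 * mark a counter in pmu->reprogram_pmi and raise KVM_REQ_PMU from NMI/IRQ
 * context; kvm_pmu_handle_event() then runs in vcpu context when that
 * request is processed on the next guest entry, and does the actual,
 * sleepable reprogramming of those counters.
 */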
/* check if idx is a valid index to access PMU */
int kvm_pmu_is_valid_msr_idx(struct kvm_vcpu *vcpu, unsigned idx)
{
        return kvm_x86_ops->pmu_ops->is_valid_msr_idx(vcpu, idx);
}
int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
{
        bool fast_mode = idx & (1u << 31);
        struct kvm_pmc *pmc;
        u64 ctr_val;

        pmc = kvm_x86_ops->pmu_ops->msr_idx_to_pmc(vcpu, idx);
        if (!pmc)
                return 1;

        ctr_val = pmc_read_counter(pmc);
        if (fast_mode)
                ctr_val = (u32)ctr_val;

        *data = ctr_val;
        return 0;
}
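/*
 * Note on kvm_pmu_rdpmc() above: bit 31 of the guest-supplied index selects
 * the "fast" read mode, in which only the low 32 bits of the counter are
 * returned; otherwise the full counter value is handed back. A non-zero
 * return lets the caller reject the access (typically with #GP) when the
 * index does not resolve to a counter.
 */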
void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu)
{
        if (lapic_in_kernel(vcpu))
                kvm_apic_local_deliver(vcpu->arch.apic, APIC_LVTPC);
}
bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
{
        return kvm_x86_ops->pmu_ops->is_valid_msr(vcpu, msr);
}
int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data)
{
        return kvm_x86_ops->pmu_ops->get_msr(vcpu, msr, data);
}
int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
        return kvm_x86_ops->pmu_ops->set_msr(vcpu, msr_info);
}
/* Refresh PMU settings. This function is generally called when the
 * underlying settings change (such as a change of the PMU CPUID seen by
 * the guest VM), which should rarely happen.
 */
void kvm_pmu_refresh(struct kvm_vcpu *vcpu)
{
        kvm_x86_ops->pmu_ops->refresh(vcpu);
}
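/*
 * For context (based on the surrounding KVM code): the typical caller of
 * kvm_pmu_refresh() is kvm_update_cpuid(), after a guest CPUID update, so
 * that the vendor pmu_ops can re-read the PMU leaf and re-validate the
 * counters exposed to the guest.
 */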
void kvm_pmu_reset(struct kvm_vcpu *vcpu)
{
        struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);

        irq_work_sync(&pmu->irq_work);
        kvm_x86_ops->pmu_ops->reset(vcpu);
}
void kvm_pmu_init(struct kvm_vcpu *vcpu)
{
        struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);

        memset(pmu, 0, sizeof(*pmu));
        kvm_x86_ops->pmu_ops->init(vcpu);
        init_irq_work(&pmu->irq_work, kvm_pmi_trigger_fn);
        kvm_pmu_refresh(vcpu);
}
void kvm_pmu_destroy(struct kvm_vcpu *vcpu)
{
        kvm_pmu_reset(vcpu);
}