/*
 * Performance events x86 architecture code
 *
 *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
 *  Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
 *  Copyright (C) 2009 Jaswinder Singh Rajput
 *  Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
 *  Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra
 *  Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com>
 *  Copyright (C) 2009 Google, Inc., Stephane Eranian
 *
 *  For licensing details see kernel-base/COPYING
 */

#include <linux/perf_event.h>
#include <linux/capability.h>
#include <linux/notifier.h>
#include <linux/hardirq.h>
#include <linux/kprobes.h>
#include <linux/export.h>
#include <linux/init.h>
#include <linux/kdebug.h>
#include <linux/sched/mm.h>
#include <linux/sched/clock.h>
#include <linux/uaccess.h>
#include <linux/slab.h>
#include <linux/cpu.h>
#include <linux/bitops.h>
#include <linux/device.h>
#include <linux/nospec.h>

#include <asm/stacktrace.h>
#include <asm/alternative.h>
#include <asm/mmu_context.h>
#include <asm/tlbflush.h>
#include <asm/timer.h>
#include <asm/unwind.h>

#include "perf_event.h"
struct x86_pmu x86_pmu __read_mostly;

DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = {
	.enabled = 1,
};

DEFINE_STATIC_KEY_FALSE(rdpmc_never_available_key);
DEFINE_STATIC_KEY_FALSE(rdpmc_always_available_key);

u64 __read_mostly hw_cache_event_ids
				[PERF_COUNT_HW_CACHE_MAX]
				[PERF_COUNT_HW_CACHE_OP_MAX]
				[PERF_COUNT_HW_CACHE_RESULT_MAX];
u64 __read_mostly hw_cache_extra_regs
				[PERF_COUNT_HW_CACHE_MAX]
				[PERF_COUNT_HW_CACHE_OP_MAX]
				[PERF_COUNT_HW_CACHE_RESULT_MAX];
/*
 * Propagate event elapsed time into the generic event.
 * Can only be executed on the CPU where the event is active.
 * Returns the delta events processed.
 */
u64 x86_perf_event_update(struct perf_event *event)
{
	struct hw_perf_event *hwc = &event->hw;
	int shift = 64 - x86_pmu.cntval_bits;
	u64 prev_raw_count, new_raw_count;
	int idx = hwc->idx;
	u64 delta;

	if (idx == INTEL_PMC_IDX_FIXED_BTS)
		return 0;

	/*
	 * Careful: an NMI might modify the previous event value.
	 *
	 * Our tactic to handle this is to first atomically read and
	 * exchange a new raw count - then add that new-prev delta
	 * count to the generic event atomically:
	 */
again:
	prev_raw_count = local64_read(&hwc->prev_count);
	rdpmcl(hwc->event_base_rdpmc, new_raw_count);

	if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
			    new_raw_count) != prev_raw_count)
		goto again;

	/*
	 * Now we have the new raw value and have updated the prev
	 * timestamp already. We can now calculate the elapsed delta
	 * (event-)time and add that to the generic event.
	 *
	 * Careful, not all hw sign-extends above the physical width
	 * of the count.
	 */
	delta = (new_raw_count << shift) - (prev_raw_count << shift);
	delta >>= shift;

	local64_add(delta, &event->count);
	local64_sub(delta, &hwc->period_left);

	return new_raw_count;
}
/*
 * Find and validate any extra registers to set up.
 */
static int x86_pmu_extra_regs(u64 config, struct perf_event *event)
{
	struct hw_perf_event_extra *reg;
	struct extra_reg *er;

	reg = &event->hw.extra_reg;

	if (!x86_pmu.extra_regs)
		return 0;

	for (er = x86_pmu.extra_regs; er->msr; er++) {
		if (er->event != (config & er->config_mask))
			continue;
		if (event->attr.config1 & ~er->valid_mask)
			return -EINVAL;
		/* Check if the extra msrs can be safely accessed */
		if (!er->extra_msr_access)
			return -ENXIO;

		reg->idx = er->idx;
		reg->config = event->attr.config1;
		reg->reg = er->msr;
		break;
	}
	return 0;
}
static atomic_t active_events;
static atomic_t pmc_refcount;
static DEFINE_MUTEX(pmc_reserve_mutex);

#ifdef CONFIG_X86_LOCAL_APIC

static bool reserve_pmc_hardware(void)
{
	int i;

	for (i = 0; i < x86_pmu.num_counters; i++) {
		if (!reserve_perfctr_nmi(x86_pmu_event_addr(i)))
			goto perfctr_fail;
	}

	for (i = 0; i < x86_pmu.num_counters; i++) {
		if (!reserve_evntsel_nmi(x86_pmu_config_addr(i)))
			goto eventsel_fail;
	}

	return true;

eventsel_fail:
	for (i--; i >= 0; i--)
		release_evntsel_nmi(x86_pmu_config_addr(i));

	i = x86_pmu.num_counters;

perfctr_fail:
	for (i--; i >= 0; i--)
		release_perfctr_nmi(x86_pmu_event_addr(i));

	return false;
}

static void release_pmc_hardware(void)
{
	int i;

	for (i = 0; i < x86_pmu.num_counters; i++) {
		release_perfctr_nmi(x86_pmu_event_addr(i));
		release_evntsel_nmi(x86_pmu_config_addr(i));
	}
}

#else

static bool reserve_pmc_hardware(void) { return true; }
static void release_pmc_hardware(void) {}

#endif
static bool check_hw_exists(void)
{
	u64 val, val_fail = -1, val_new = ~0;
	int i, reg, reg_fail = -1, ret = 0;
	int bios_fail = 0;
	int reg_safe = -1;

	/*
	 * Check to see if the BIOS enabled any of the counters, if so
	 * complain and bail.
	 */
	for (i = 0; i < x86_pmu.num_counters; i++) {
		reg = x86_pmu_config_addr(i);
		ret = rdmsrl_safe(reg, &val);
		if (ret)
			goto msr_fail;
		if (val & ARCH_PERFMON_EVENTSEL_ENABLE) {
			bios_fail = 1;
			val_fail = val;
			reg_fail = reg;
		} else {
			reg_safe = i;
		}
	}

	if (x86_pmu.num_counters_fixed) {
		reg = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
		ret = rdmsrl_safe(reg, &val);
		if (ret)
			goto msr_fail;
		for (i = 0; i < x86_pmu.num_counters_fixed; i++) {
			if (val & (0x03 << i*4)) {
				bios_fail = 1;
				val_fail = val;
				reg_fail = reg;
			}
		}
	}

	/*
	 * If all the counters are enabled, the below test will always
	 * fail.  The tools will also become useless in this scenario.
	 * Just fail and disable the hardware counters.
	 */
	if (reg_safe == -1) {
		reg = reg_safe;
		goto msr_fail;
	}

	/*
	 * Read the current value, change it and read it back to see if it
	 * matches, this is needed to detect certain hardware emulators
	 * (qemu/kvm) that don't trap on the MSR access and always return 0s.
	 */
	reg = x86_pmu_event_addr(reg_safe);
	if (rdmsrl_safe(reg, &val))
		goto msr_fail;
	val ^= 0xffffUL;
	ret = wrmsrl_safe(reg, val);
	ret |= rdmsrl_safe(reg, &val_new);
	if (ret || val != val_new)
		goto msr_fail;

	/*
	 * We still allow the PMU driver to operate:
	 */
	if (bios_fail) {
		pr_cont("Broken BIOS detected, complain to your hardware vendor.\n");
		pr_err(FW_BUG "the BIOS has corrupted hw-PMU resources (MSR %x is %Lx)\n",
		       reg_fail, val_fail);
	}

	return true;

msr_fail:
	if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) {
		pr_cont("PMU not available due to virtualization, using software events only.\n");
	} else {
		pr_cont("Broken PMU hardware detected, using software events only.\n");
		pr_err("Failed to access perfctr msr (MSR %x is %Lx)\n",
		       reg, val_new);
	}

	return false;
}
static void hw_perf_event_destroy(struct perf_event *event)
{
	x86_release_hardware();
	atomic_dec(&active_events);
}

void hw_perf_lbr_event_destroy(struct perf_event *event)
{
	hw_perf_event_destroy(event);

	/* undo the lbr/bts event accounting */
	x86_del_exclusive(x86_lbr_exclusive_lbr);
}

static inline int x86_pmu_initialized(void)
{
	return x86_pmu.handle_irq != NULL;
}
static inline int
set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event *event)
{
	struct perf_event_attr *attr = &event->attr;
	unsigned int cache_type, cache_op, cache_result;
	u64 config, val;

	config = attr->config;

	cache_type = (config >> 0) & 0xff;
	if (cache_type >= PERF_COUNT_HW_CACHE_MAX)
		return -EINVAL;
	cache_type = array_index_nospec(cache_type, PERF_COUNT_HW_CACHE_MAX);

	cache_op = (config >> 8) & 0xff;
	if (cache_op >= PERF_COUNT_HW_CACHE_OP_MAX)
		return -EINVAL;
	cache_op = array_index_nospec(cache_op, PERF_COUNT_HW_CACHE_OP_MAX);

	cache_result = (config >> 16) & 0xff;
	if (cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX)
		return -EINVAL;
	cache_result = array_index_nospec(cache_result, PERF_COUNT_HW_CACHE_RESULT_MAX);

	val = hw_cache_event_ids[cache_type][cache_op][cache_result];

	if (val == 0)
		return -ENOENT;

	if (val == -1)
		return -EINVAL;

	hwc->config |= val;
	attr->config1 = hw_cache_extra_regs[cache_type][cache_op][cache_result];
	return x86_pmu_extra_regs(val, event);
}
int x86_reserve_hardware(void)
{
	int err = 0;

	if (!atomic_inc_not_zero(&pmc_refcount)) {
		mutex_lock(&pmc_reserve_mutex);
		if (atomic_read(&pmc_refcount) == 0) {
			if (!reserve_pmc_hardware())
				err = -EBUSY;
			else
				reserve_ds_buffers();
		}
		if (!err)
			atomic_inc(&pmc_refcount);
		mutex_unlock(&pmc_reserve_mutex);
	}

	return err;
}

void x86_release_hardware(void)
{
	if (atomic_dec_and_mutex_lock(&pmc_refcount, &pmc_reserve_mutex)) {
		release_pmc_hardware();
		release_ds_buffers();
		mutex_unlock(&pmc_reserve_mutex);
	}
}
/*
 * Check if we can create an event of a certain type (that no conflicting
 * events are present).
 */
int x86_add_exclusive(unsigned int what)
{
	int i;

	/*
	 * When lbr_pt_coexist we allow PT to coexist with either LBR or BTS.
	 * LBR and BTS are still mutually exclusive.
	 */
	if (x86_pmu.lbr_pt_coexist && what == x86_lbr_exclusive_pt)
		goto out;

	if (!atomic_inc_not_zero(&x86_pmu.lbr_exclusive[what])) {
		mutex_lock(&pmc_reserve_mutex);
		for (i = 0; i < ARRAY_SIZE(x86_pmu.lbr_exclusive); i++) {
			if (i != what && atomic_read(&x86_pmu.lbr_exclusive[i]))
				goto fail_unlock;
		}
		atomic_inc(&x86_pmu.lbr_exclusive[what]);
		mutex_unlock(&pmc_reserve_mutex);
	}

out:
	atomic_inc(&active_events);
	return 0;

fail_unlock:
	mutex_unlock(&pmc_reserve_mutex);
	return -EBUSY;
}

void x86_del_exclusive(unsigned int what)
{
	atomic_dec(&active_events);

	/*
	 * See the comment in x86_add_exclusive().
	 */
	if (x86_pmu.lbr_pt_coexist && what == x86_lbr_exclusive_pt)
		return;

	atomic_dec(&x86_pmu.lbr_exclusive[what]);
}
int x86_setup_perfctr(struct perf_event *event)
{
	struct perf_event_attr *attr = &event->attr;
	struct hw_perf_event *hwc = &event->hw;
	u64 config;

	if (!is_sampling_event(event)) {
		hwc->sample_period = x86_pmu.max_period;
		hwc->last_period = hwc->sample_period;
		local64_set(&hwc->period_left, hwc->sample_period);
	}

	if (attr->type == PERF_TYPE_RAW)
		return x86_pmu_extra_regs(event->attr.config, event);

	if (attr->type == PERF_TYPE_HW_CACHE)
		return set_ext_hw_attr(hwc, event);

	if (attr->config >= x86_pmu.max_events)
		return -EINVAL;

	attr->config = array_index_nospec((unsigned long)attr->config, x86_pmu.max_events);

	/*
	 * The generic map:
	 */
	config = x86_pmu.event_map(attr->config);

	if (config == 0)
		return -ENOENT;

	if (config == -1LL)
		return -EINVAL;

	hwc->config |= config;

	return 0;
}
/*
 * Check that branch_sample_type is compatible with the
 * settings needed for precise_ip > 1, which implies
 * using the LBR to capture ALL taken branches at the
 * priv levels of the measurement.
 */
static inline int precise_br_compat(struct perf_event *event)
{
	u64 m = event->attr.branch_sample_type;
	u64 b = 0;

	/* must capture all branches */
	if (!(m & PERF_SAMPLE_BRANCH_ANY))
		return 0;

	m &= PERF_SAMPLE_BRANCH_KERNEL | PERF_SAMPLE_BRANCH_USER;

	if (!event->attr.exclude_user)
		b |= PERF_SAMPLE_BRANCH_USER;

	if (!event->attr.exclude_kernel)
		b |= PERF_SAMPLE_BRANCH_KERNEL;

	/*
	 * ignore PERF_SAMPLE_BRANCH_HV, not supported on x86
	 */
	return m == b;
}

int x86_pmu_max_precise(void)
{
	int precise = 0;

	/* Support for constant skid */
	if (x86_pmu.pebs_active && !x86_pmu.pebs_broken) {
		precise++;

		/* Support for IP fixup */
		if (x86_pmu.lbr_nr || x86_pmu.intel_cap.pebs_format >= 2)
			precise++;

		if (x86_pmu.pebs_prec_dist)
			precise++;
	}
	return precise;
}
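/*
 * Illustration only (user-space sketch, not built as part of this file):
 * the value computed by x86_pmu_max_precise() is exported through the
 * max_precise sysfs attribute defined later in this file; a tool can read
 * it and clamp the precise_ip it requests.  The sysfs path below is the
 * usual location of the cpu PMU device and is an assumption of this sketch.
 */
#if 0
#include <linux/perf_event.h>
#include <stdio.h>
#include <string.h>

static int read_max_precise(void)
{
	int max = 0;
	FILE *f = fopen("/sys/bus/event_source/devices/cpu/caps/max_precise", "r");

	if (f) {
		if (fscanf(f, "%d", &max) != 1)
			max = 0;
		fclose(f);
	}
	return max;
}

static void setup_precise_cycles(struct perf_event_attr *attr)
{
	int max = read_max_precise();

	memset(attr, 0, sizeof(*attr));
	attr->size          = sizeof(*attr);
	attr->type          = PERF_TYPE_HARDWARE;
	attr->config        = PERF_COUNT_HW_CPU_CYCLES;
	attr->sample_period = 100000;		/* PEBS only makes sense when sampling */
	attr->precise_ip    = max > 2 ? 2 : max; /* checked by x86_pmu_hw_config() */
}
#endif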
int x86_pmu_hw_config(struct perf_event *event)
{
	if (event->attr.precise_ip) {
		int precise = x86_pmu_max_precise();

		if (event->attr.precise_ip > precise)
			return -EOPNOTSUPP;

		/* There's no sense in having PEBS for non-sampling events: */
		if (!is_sampling_event(event))
			return -EINVAL;
	}
	/*
	 * check that PEBS LBR correction does not conflict with
	 * whatever the user is asking with attr->branch_sample_type
	 */
	if (event->attr.precise_ip > 1 && x86_pmu.intel_cap.pebs_format < 2) {
		u64 *br_type = &event->attr.branch_sample_type;

		if (has_branch_stack(event)) {
			if (!precise_br_compat(event))
				return -EOPNOTSUPP;

			/* branch_sample_type is compatible */

		} else {
			/*
			 * user did not specify branch_sample_type
			 *
			 * For PEBS fixups, we capture all
			 * the branches at the priv level of the
			 * event.
			 */
			*br_type = PERF_SAMPLE_BRANCH_ANY;

			if (!event->attr.exclude_user)
				*br_type |= PERF_SAMPLE_BRANCH_USER;

			if (!event->attr.exclude_kernel)
				*br_type |= PERF_SAMPLE_BRANCH_KERNEL;
		}
	}

	if (event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_CALL_STACK)
		event->attach_state |= PERF_ATTACH_TASK_DATA;

	/*
	 * Generate PMC IRQs:
	 * (keep 'enabled' bit clear for now)
	 */
	event->hw.config = ARCH_PERFMON_EVENTSEL_INT;

	/*
	 * Count user and OS events unless requested not to
	 */
	if (!event->attr.exclude_user)
		event->hw.config |= ARCH_PERFMON_EVENTSEL_USR;
	if (!event->attr.exclude_kernel)
		event->hw.config |= ARCH_PERFMON_EVENTSEL_OS;

	if (event->attr.type == PERF_TYPE_RAW)
		event->hw.config |= event->attr.config & X86_RAW_EVENT_MASK;

	if (event->attr.sample_period && x86_pmu.limit_period) {
		if (x86_pmu.limit_period(event, event->attr.sample_period) >
				event->attr.sample_period)
			return -EINVAL;
	}

	/* sample_regs_user never supports XMM registers */
	if (unlikely(event->attr.sample_regs_user & PERF_REG_EXTENDED_MASK))
		return -EINVAL;
	/*
	 * Besides the general purpose registers, XMM registers may
	 * be collected in PEBS on some platforms, e.g. Icelake
	 */
	if (unlikely(event->attr.sample_regs_intr & PERF_REG_EXTENDED_MASK)) {
		if (!(event->pmu->capabilities & PERF_PMU_CAP_EXTENDED_REGS))
			return -EINVAL;

		if (!event->attr.precise_ip)
			return -EINVAL;
	}

	return x86_setup_perfctr(event);
}
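/*
 * Illustration only (stand-alone sketch, not built as part of this file):
 * the EVENTSEL layout that x86_pmu_hw_config() assembles — event select in
 * bits 0-7, umask in bits 8-15, plus the USR/OS/INT enable bits.  The mask
 * values below mirror the architectural layout of the ARCH_PERFMON_EVENTSEL_*
 * constants used above and are redefined locally for the example.
 */
#if 0
#include <stdint.h>
#include <stdio.h>

#define EVENTSEL_EVENT	0x000000ffULL
#define EVENTSEL_UMASK	0x0000ff00ULL
#define EVENTSEL_USR	(1ULL << 16)
#define EVENTSEL_OS	(1ULL << 17)
#define EVENTSEL_INT	(1ULL << 20)

static uint64_t build_eventsel(uint8_t event, uint8_t umask,
			       int exclude_user, int exclude_kernel)
{
	uint64_t config = EVENTSEL_INT;		/* always interrupt on overflow */

	config |= event & EVENTSEL_EVENT;
	config |= ((uint64_t)umask << 8) & EVENTSEL_UMASK;

	if (!exclude_user)
		config |= EVENTSEL_USR;
	if (!exclude_kernel)
		config |= EVENTSEL_OS;

	return config;
}

int main(void)
{
	/* event 0x3c, umask 0x00 (unhalted core cycles), count user+kernel */
	printf("config=%#llx\n", (unsigned long long)build_eventsel(0x3c, 0x00, 0, 0));
	return 0;
}
#endif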
/*
 * Setup the hardware configuration for a given attr_type
 */
static int __x86_pmu_event_init(struct perf_event *event)
{
	int err;

	if (!x86_pmu_initialized())
		return -ENODEV;

	err = x86_reserve_hardware();
	if (err)
		return err;

	atomic_inc(&active_events);
	event->destroy = hw_perf_event_destroy;

	event->hw.idx = -1;
	event->hw.last_cpu = -1;
	event->hw.last_tag = ~0ULL;

	/* mark unused */
	event->hw.extra_reg.idx = EXTRA_REG_NONE;
	event->hw.branch_reg.idx = EXTRA_REG_NONE;

	return x86_pmu.hw_config(event);
}
void x86_pmu_disable_all(void)
{
	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
	int idx;

	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
		struct hw_perf_event *hwc = &cpuc->events[idx]->hw;
		u64 val;

		if (!test_bit(idx, cpuc->active_mask))
			continue;
		rdmsrl(x86_pmu_config_addr(idx), val);
		if (!(val & ARCH_PERFMON_EVENTSEL_ENABLE))
			continue;
		val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
		wrmsrl(x86_pmu_config_addr(idx), val);
		if (is_counter_pair(hwc))
			wrmsrl(x86_pmu_config_addr(idx + 1), 0);
	}
}

/*
 * There may be PMI landing after enabled=0. The PMI hitting could be before or
 * after disable_all.
 *
 * If PMI hits before disable_all, the PMU will be disabled in the NMI handler.
 * It will not be re-enabled in the NMI handler again, because enabled=0. After
 * handling the NMI, disable_all will be called, which will not change the
 * state either. If PMI hits after disable_all, the PMU is already disabled
 * before entering NMI handler. The NMI handler will not change the state
 * either.
 *
 * So either situation is harmless.
 */
static void x86_pmu_disable(struct pmu *pmu)
{
	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);

	if (!x86_pmu_initialized())
		return;

	if (!cpuc->enabled)
		return;

	cpuc->n_added = 0;
	cpuc->enabled = 0;
	barrier();

	x86_pmu.disable_all();
}
void x86_pmu_enable_all(int added)
{
	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
	int idx;

	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
		struct hw_perf_event *hwc = &cpuc->events[idx]->hw;

		if (!test_bit(idx, cpuc->active_mask))
			continue;

		__x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE);
	}
}

static struct pmu pmu;

static inline int is_x86_event(struct perf_event *event)
{
	return event->pmu == &pmu;
}

struct pmu *x86_get_pmu(void)
{
	return &pmu;
}
/*
 * Event scheduler state:
 *
 * Assign events iterating over all events and counters, beginning
 * with events with least weights first. Keep the current iterator
 * state in struct sched_state.
 */
struct sched_state {
	int	weight;
	int	event;		/* event index */
	int	counter;	/* counter index */
	int	unassigned;	/* number of events to be assigned left */
	int	nr_gp;		/* number of GP counters used */
	u64	used;
};

/* Total max is X86_PMC_IDX_MAX, but we are O(n!) limited */
#define	SCHED_STATES_MAX	2

struct perf_sched {
	int			max_weight;
	int			max_events;
	int			max_gp;
	int			saved_states;
	struct event_constraint	**constraints;
	struct sched_state	state;
	struct sched_state	saved[SCHED_STATES_MAX];
};
/*
 * Initialize the iterator that runs through all events and counters.
 */
static void perf_sched_init(struct perf_sched *sched, struct event_constraint **constraints,
			    int num, int wmin, int wmax, int gpmax)
{
	int idx;

	memset(sched, 0, sizeof(*sched));
	sched->max_events	= num;
	sched->max_weight	= wmax;
	sched->max_gp		= gpmax;
	sched->constraints	= constraints;

	for (idx = 0; idx < num; idx++) {
		if (constraints[idx]->weight == wmin)
			break;
	}

	sched->state.event	= idx;		/* start with min weight */
	sched->state.weight	= wmin;
	sched->state.unassigned	= num;
}
static void perf_sched_save_state(struct perf_sched *sched)
{
	if (WARN_ON_ONCE(sched->saved_states >= SCHED_STATES_MAX))
		return;

	sched->saved[sched->saved_states] = sched->state;
	sched->saved_states++;
}

static bool perf_sched_restore_state(struct perf_sched *sched)
{
	if (!sched->saved_states)
		return false;

	sched->saved_states--;
	sched->state = sched->saved[sched->saved_states];

	/* this assignment didn't work out */
	/* XXX broken vs EVENT_PAIR */
	sched->state.used &= ~BIT_ULL(sched->state.counter);

	/* try the next one */
	sched->state.counter++;

	return true;
}
/*
 * Select a counter for the current event to schedule. Return true on
 * success.
 */
static bool __perf_sched_find_counter(struct perf_sched *sched)
{
	struct event_constraint *c;
	int idx;

	if (!sched->state.unassigned)
		return false;

	if (sched->state.event >= sched->max_events)
		return false;

	c = sched->constraints[sched->state.event];
	/* Prefer fixed purpose counters */
	if (c->idxmsk64 & (~0ULL << INTEL_PMC_IDX_FIXED)) {
		idx = INTEL_PMC_IDX_FIXED;
		for_each_set_bit_from(idx, c->idxmsk, X86_PMC_IDX_MAX) {
			u64 mask = BIT_ULL(idx);

			if (sched->state.used & mask)
				continue;

			sched->state.used |= mask;
			goto done;
		}
	}

	/* Grab the first unused counter starting with idx */
	idx = sched->state.counter;
	for_each_set_bit_from(idx, c->idxmsk, INTEL_PMC_IDX_FIXED) {
		u64 mask = BIT_ULL(idx);

		if (c->flags & PERF_X86_EVENT_PAIR)
			mask |= mask << 1;

		if (sched->state.used & mask)
			continue;

		if (sched->state.nr_gp++ >= sched->max_gp)
			return false;

		sched->state.used |= mask;
		goto done;
	}

	return false;

done:
	sched->state.counter = idx;

	if (c->overlap)
		perf_sched_save_state(sched);

	return true;
}

static bool perf_sched_find_counter(struct perf_sched *sched)
{
	while (!__perf_sched_find_counter(sched)) {
		if (!perf_sched_restore_state(sched))
			return false;
	}

	return true;
}
/*
 * Go through all unassigned events and find the next one to schedule.
 * Take events with the least weight first. Return true on success.
 */
static bool perf_sched_next_event(struct perf_sched *sched)
{
	struct event_constraint *c;

	if (!sched->state.unassigned || !--sched->state.unassigned)
		return false;

	do {
		/* next event */
		sched->state.event++;
		if (sched->state.event >= sched->max_events) {
			/* next weight */
			sched->state.event = 0;
			sched->state.weight++;
			if (sched->state.weight > sched->max_weight)
				return false;
		}
		c = sched->constraints[sched->state.event];
	} while (c->weight != sched->state.weight);

	sched->state.counter = 0;	/* start with first counter */

	return true;
}

/*
 * Assign a counter for each event.
 */
int perf_assign_events(struct event_constraint **constraints, int n,
			int wmin, int wmax, int gpmax, int *assign)
{
	struct perf_sched sched;

	perf_sched_init(&sched, constraints, n, wmin, wmax, gpmax);

	do {
		if (!perf_sched_find_counter(&sched))
			break;	/* failed */
		if (assign)
			assign[sched.state.event] = sched.state.counter;
	} while (perf_sched_next_event(&sched));

	return sched.state.unassigned;
}
EXPORT_SYMBOL_GPL(perf_assign_events);
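/*
 * Illustration only (stand-alone toy, not built as part of this file):
 * the core idea behind perf_assign_events() — place the most constrained
 * events (lowest constraint weight) first, then give each event the first
 * still-free counter its mask allows.  No backtracking or fixed counters
 * here, so this only demonstrates the ordering, not the full solver.
 */
#if 0
#include <stdint.h>
#include <stdio.h>

#define NUM_COUNTERS 4

static int assign_greedy(const uint64_t *idxmsk, int n, int *assign)
{
	uint64_t used = 0;
	int placed = 0;

	for (int weight = 1; weight <= NUM_COUNTERS; weight++) {
		for (int e = 0; e < n; e++) {
			if (__builtin_popcountll(idxmsk[e]) != weight)
				continue;
			for (int c = 0; c < NUM_COUNTERS; c++) {
				uint64_t bit = 1ULL << c;

				if ((idxmsk[e] & bit) && !(used & bit)) {
					used |= bit;
					assign[e] = c;
					placed++;
					break;
				}
			}
		}
	}
	return n - placed;	/* number of events left unscheduled */
}

int main(void)
{
	/* event 0 may only use counter 2; events 1 and 2 may use any counter */
	uint64_t idxmsk[] = { 0x4, 0xf, 0xf };
	int assign[3] = { -1, -1, -1 };
	int unsched = assign_greedy(idxmsk, 3, assign);

	/* prints "unscheduled=0 assign=[2 0 1]" */
	printf("unscheduled=%d assign=[%d %d %d]\n",
	       unsched, assign[0], assign[1], assign[2]);
	return 0;
}
#endif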
int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
{
	struct event_constraint *c;
	struct perf_event *e;
	int n0, i, wmin, wmax, unsched = 0;
	struct hw_perf_event *hwc;
	u64 used_mask = 0;

	/*
	 * Compute the number of events already present; see x86_pmu_add(),
	 * validate_group() and x86_pmu_commit_txn(). For the former two
	 * cpuc->n_events hasn't been updated yet, while for the latter
	 * cpuc->n_txn contains the number of events added in the current
	 * transaction.
	 */
	n0 = cpuc->n_events;
	if (cpuc->txn_flags & PERF_PMU_TXN_ADD)
		n0 -= cpuc->n_txn;

	if (x86_pmu.start_scheduling)
		x86_pmu.start_scheduling(cpuc);

	for (i = 0, wmin = X86_PMC_IDX_MAX, wmax = 0; i < n; i++) {
		c = cpuc->event_constraint[i];

		/*
		 * Previously scheduled events should have a cached constraint,
		 * while new events should not have one.
		 */
		WARN_ON_ONCE((c && i >= n0) || (!c && i < n0));

		/*
		 * Request constraints for new events; or for those events that
		 * have a dynamic constraint -- for those the constraint can
		 * change due to external factors (sibling state, allow_tfa).
		 */
		if (!c || (c->flags & PERF_X86_EVENT_DYNAMIC)) {
			c = x86_pmu.get_event_constraints(cpuc, i, cpuc->event_list[i]);
			cpuc->event_constraint[i] = c;
		}

		wmin = min(wmin, c->weight);
		wmax = max(wmax, c->weight);
	}

	/*
	 * fastpath, try to reuse previous register
	 */
	for (i = 0; i < n; i++) {
		u64 mask;

		hwc = &cpuc->event_list[i]->hw;
		c = cpuc->event_constraint[i];

		/* never assigned */
		if (hwc->idx == -1)
			break;

		/* constraint still honored */
		if (!test_bit(hwc->idx, c->idxmsk))
			break;

		mask = BIT_ULL(hwc->idx);
		if (is_counter_pair(hwc))
			mask |= mask << 1;

		/* not already used */
		if (used_mask & mask)
			break;

		used_mask |= mask;

		if (assign)
			assign[i] = hwc->idx;
	}

	/* slow path */
	if (i != n) {
		int gpmax = x86_pmu.num_counters;

		/*
		 * Do not allow scheduling of more than half the available
		 * generic counters.
		 *
		 * This helps avoid counter starvation of sibling thread by
		 * ensuring at most half the counters cannot be in exclusive
		 * mode. There are no designated counters for the limits. Any
		 * N/2 counters can be used. This helps with events with
		 * specific counter constraints.
		 */
		if (is_ht_workaround_enabled() && !cpuc->is_fake &&
		    READ_ONCE(cpuc->excl_cntrs->exclusive_present))
			gpmax /= 2;

		/*
		 * Reduce the amount of available counters to allow fitting
		 * the extra Merge events needed by large increment events.
		 */
		if (x86_pmu.flags & PMU_FL_PAIR) {
			gpmax = x86_pmu.num_counters - cpuc->n_pair;
			WARN_ON(gpmax <= 0);
		}

		unsched = perf_assign_events(cpuc->event_constraint, n, wmin,
					     wmax, gpmax, assign);
	}

	/*
	 * In case of success (unsched = 0), mark events as committed,
	 * so we do not put_constraint() in case new events are added
	 * and fail to be scheduled
	 *
	 * We invoke the lower level commit callback to lock the resource
	 *
	 * We do not need to do all of this in case we are called to
	 * validate an event group (assign == NULL)
	 */
	if (!unsched && assign) {
		for (i = 0; i < n; i++) {
			e = cpuc->event_list[i];
			if (x86_pmu.commit_scheduling)
				x86_pmu.commit_scheduling(cpuc, i, assign[i]);
		}
	} else {
		for (i = n0; i < n; i++) {
			e = cpuc->event_list[i];

			/*
			 * release events that failed scheduling
			 */
			if (x86_pmu.put_event_constraints)
				x86_pmu.put_event_constraints(cpuc, e);

			cpuc->event_constraint[i] = NULL;
		}
	}

	if (x86_pmu.stop_scheduling)
		x86_pmu.stop_scheduling(cpuc);

	return unsched ? -EINVAL : 0;
}
/*
 * dogrp: true if must collect siblings events (group)
 * returns total number of events and error code
 */
static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader, bool dogrp)
{
	struct perf_event *event;
	int n, max_count;

	max_count = x86_pmu.num_counters + x86_pmu.num_counters_fixed;

	/* current number of events already accepted */
	n = cpuc->n_events;
	if (!cpuc->n_events)
		cpuc->pebs_output = 0;

	if (!cpuc->is_fake && leader->attr.precise_ip) {
		/*
		 * For PEBS->PT, if !aux_event, the group leader (PT) went
		 * away, the group was broken down and this singleton event
		 * can't schedule any more.
		 */
		if (is_pebs_pt(leader) && !leader->aux_event)
			return -EINVAL;

		/*
		 * pebs_output: 0: no PEBS so far, 1: PT, 2: DS
		 */
		if (cpuc->pebs_output &&
		    cpuc->pebs_output != is_pebs_pt(leader) + 1)
			return -EINVAL;

		cpuc->pebs_output = is_pebs_pt(leader) + 1;
	}

	if (is_x86_event(leader)) {
		if (n >= max_count)
			return -EINVAL;
		cpuc->event_list[n] = leader;
		n++;
		if (is_counter_pair(&leader->hw))
			cpuc->n_pair++;
	}
	if (!dogrp)
		return n;

	for_each_sibling_event(event, leader) {
		if (!is_x86_event(event) ||
		    event->state <= PERF_EVENT_STATE_OFF)
			continue;

		if (n >= max_count)
			return -EINVAL;

		cpuc->event_list[n] = event;
		n++;
		if (is_counter_pair(&event->hw))
			cpuc->n_pair++;
	}
	return n;
}
static inline void x86_assign_hw_event(struct perf_event *event,
				struct cpu_hw_events *cpuc, int i)
{
	struct hw_perf_event *hwc = &event->hw;

	hwc->idx = cpuc->assign[i];
	hwc->last_cpu = smp_processor_id();
	hwc->last_tag = ++cpuc->tags[i];

	if (hwc->idx == INTEL_PMC_IDX_FIXED_BTS) {
		hwc->config_base = 0;
		hwc->event_base	= 0;
	} else if (hwc->idx >= INTEL_PMC_IDX_FIXED) {
		hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
		hwc->event_base = MSR_ARCH_PERFMON_FIXED_CTR0 + (hwc->idx - INTEL_PMC_IDX_FIXED);
		hwc->event_base_rdpmc = (hwc->idx - INTEL_PMC_IDX_FIXED) | 1<<30;
	} else {
		hwc->config_base = x86_pmu_config_addr(hwc->idx);
		hwc->event_base  = x86_pmu_event_addr(hwc->idx);
		hwc->event_base_rdpmc = x86_pmu_rdpmc_index(hwc->idx);
	}
}
/**
 * x86_perf_rdpmc_index - Return PMC counter used for event
 * @event: the perf_event to which the PMC counter was assigned
 *
 * The counter assigned to this performance event may change if interrupts
 * are enabled. This counter should thus never be used while interrupts are
 * enabled. Before this function is used to obtain the assigned counter the
 * event should be checked for validity using, for example,
 * perf_event_read_local(), within the same interrupt disabled section in
 * which this counter is planned to be used.
 *
 * Return: The index of the performance monitoring counter assigned to
 * @event.
 */
int x86_perf_rdpmc_index(struct perf_event *event)
{
	lockdep_assert_irqs_disabled();

	return event->hw.event_base_rdpmc;
}

static inline int match_prev_assignment(struct hw_perf_event *hwc,
					struct cpu_hw_events *cpuc,
					int i)
{
	return hwc->idx == cpuc->assign[i] &&
		hwc->last_cpu == smp_processor_id() &&
		hwc->last_tag == cpuc->tags[i];
}
static void x86_pmu_start(struct perf_event *event, int flags);

static void x86_pmu_enable(struct pmu *pmu)
{
	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
	struct perf_event *event;
	struct hw_perf_event *hwc;
	int i, added = cpuc->n_added;

	if (!x86_pmu_initialized())
		return;

	if (cpuc->enabled)
		return;

	if (cpuc->n_added) {
		int n_running = cpuc->n_events - cpuc->n_added;
		/*
		 * apply assignment obtained either from
		 * hw_perf_group_sched_in() or x86_pmu_enable()
		 *
		 * step1: save events moving to new counters
		 */
		for (i = 0; i < n_running; i++) {
			event = cpuc->event_list[i];
			hwc = &event->hw;

			/*
			 * we can avoid reprogramming counter if:
			 * - assigned same counter as last time
			 * - running on same CPU as last time
			 * - no other event has used the counter since
			 */
			if (hwc->idx == -1 ||
			    match_prev_assignment(hwc, cpuc, i))
				continue;

			/*
			 * Ensure we don't accidentally enable a stopped
			 * counter simply because we rescheduled.
			 */
			if (hwc->state & PERF_HES_STOPPED)
				hwc->state |= PERF_HES_ARCH;

			x86_pmu_stop(event, PERF_EF_UPDATE);
		}

		/*
		 * step2: reprogram moved events into new counters
		 */
		for (i = 0; i < cpuc->n_events; i++) {
			event = cpuc->event_list[i];
			hwc = &event->hw;

			if (!match_prev_assignment(hwc, cpuc, i))
				x86_assign_hw_event(event, cpuc, i);
			else if (i < n_running)
				continue;

			if (hwc->state & PERF_HES_ARCH)
				continue;

			x86_pmu_start(event, PERF_EF_RELOAD);
		}
		cpuc->n_added = 0;
		perf_events_lapic_init();
	}

	cpuc->enabled = 1;
	barrier();

	x86_pmu.enable_all(added);
}

static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left);
/*
 * Set the next IRQ period, based on the hwc->period_left value.
 * To be called with the event disabled in hw:
 */
int x86_perf_event_set_period(struct perf_event *event)
{
	struct hw_perf_event *hwc = &event->hw;
	s64 left = local64_read(&hwc->period_left);
	s64 period = hwc->sample_period;
	int ret = 0, idx = hwc->idx;

	if (idx == INTEL_PMC_IDX_FIXED_BTS)
		return 0;

	/*
	 * If we are way outside a reasonable range then just skip forward:
	 */
	if (unlikely(left <= -period)) {
		left = period;
		local64_set(&hwc->period_left, left);
		hwc->last_period = period;
		ret = 1;
	}

	if (unlikely(left <= 0)) {
		left += period;
		local64_set(&hwc->period_left, left);
		hwc->last_period = period;
		ret = 1;
	}
	/*
	 * Quirk: certain CPUs don't like it if just 1 hw_event is left:
	 */
	if (unlikely(left < 2))
		left = 2;

	if (left > x86_pmu.max_period)
		left = x86_pmu.max_period;

	if (x86_pmu.limit_period)
		left = x86_pmu.limit_period(event, left);

	per_cpu(pmc_prev_left[idx], smp_processor_id()) = left;

	/*
	 * The hw event starts counting from this event offset,
	 * mark it to be able to extract future deltas:
	 */
	local64_set(&hwc->prev_count, (u64)-left);

	wrmsrl(hwc->event_base, (u64)(-left) & x86_pmu.cntval_mask);

	/*
	 * Clear the Merge event counter's upper 16 bits since
	 * we currently declare a 48-bit counter width
	 */
	if (is_counter_pair(hwc))
		wrmsrl(x86_pmu_event_addr(idx + 1), 0);

	/*
	 * Due to erratum on certain CPUs we need
	 * a second write to be sure the register
	 * is updated properly
	 */
	if (x86_pmu.perfctr_second_write) {
		wrmsrl(hwc->event_base,
		       (u64)(-left) & x86_pmu.cntval_mask);
	}

	perf_event_update_userpage(event);

	return ret;
}
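/*
 * Illustration only (stand-alone sketch with a hypothetical 48-bit counter
 * mask, not built as part of this file): why the function above programs
 * (u64)(-left) & cntval_mask — the counter then needs exactly `left`
 * increments to wrap, which is what raises the overflow PMI.
 */
#if 0
#include <stdint.h>
#include <stdio.h>

#define CNTVAL_MASK ((1ULL << 48) - 1)

int main(void)
{
	int64_t  left = 100000;			/* events until the next interrupt */
	uint64_t programmed = (uint64_t)(-left) & CNTVAL_MASK;

	/* simulate `left` counter increments within the 48-bit width */
	uint64_t after = (programmed + (uint64_t)left) & CNTVAL_MASK;

	/* prints programmed=0xffffffe7960, after 100000 increments: 0 (wrapped) */
	printf("programmed=%#llx, after %lld increments: %llu (wrapped)\n",
	       (unsigned long long)programmed, (long long)left,
	       (unsigned long long)after);
	return 0;
}
#endif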
void x86_pmu_enable_event(struct perf_event *event)
{
	if (__this_cpu_read(cpu_hw_events.enabled))
		__x86_pmu_enable_event(&event->hw,
				       ARCH_PERFMON_EVENTSEL_ENABLE);
}

/*
 * Add a single event to the PMU.
 *
 * The event is added to the group of enabled events
 * but only if it can be scheduled with existing events.
 */
static int x86_pmu_add(struct perf_event *event, int flags)
{
	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
	struct hw_perf_event *hwc;
	int assign[X86_PMC_IDX_MAX];
	int n, n0, ret;

	hwc = &event->hw;

	n0 = cpuc->n_events;
	ret = n = collect_events(cpuc, event, false);
	if (ret < 0)
		goto out;

	hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
	if (!(flags & PERF_EF_START))
		hwc->state |= PERF_HES_ARCH;

	/*
	 * If group events scheduling transaction was started,
	 * skip the schedulability test here, it will be performed
	 * at commit time (->commit_txn) as a whole.
	 *
	 * If commit fails, we'll call ->del() on all events
	 * for which ->add() was called.
	 */
	if (cpuc->txn_flags & PERF_PMU_TXN_ADD)
		goto done_collect;

	ret = x86_pmu.schedule_events(cpuc, n, assign);
	if (ret)
		goto out;
	/*
	 * copy new assignment, now we know it is possible
	 * will be used by hw_perf_enable()
	 */
	memcpy(cpuc->assign, assign, n*sizeof(int));

done_collect:
	/*
	 * Commit the collect_events() state. See x86_pmu_del() and
	 * x86_pmu_*_txn().
	 */
	cpuc->n_events = n;
	cpuc->n_added += n - n0;
	cpuc->n_txn += n - n0;

	if (x86_pmu.add) {
		/*
		 * This is before x86_pmu_enable() will call x86_pmu_start(),
		 * so we enable LBRs before an event needs them etc..
		 */
		x86_pmu.add(event);
	}

	ret = 0;
out:
	return ret;
}
static void x86_pmu_start(struct perf_event *event, int flags)
{
	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
	int idx = event->hw.idx;

	if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
		return;

	if (WARN_ON_ONCE(idx == -1))
		return;

	if (flags & PERF_EF_RELOAD) {
		WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE));
		x86_perf_event_set_period(event);
	}

	event->hw.state = 0;

	cpuc->events[idx] = event;
	__set_bit(idx, cpuc->active_mask);
	__set_bit(idx, cpuc->running);
	x86_pmu.enable(event);
	perf_event_update_userpage(event);
}
void perf_event_print_debug(void)
{
	u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed;
	u64 pebs, debugctl;
	struct cpu_hw_events *cpuc;
	unsigned long flags;
	int cpu, idx;

	if (!x86_pmu.num_counters)
		return;

	local_irq_save(flags);

	cpu = smp_processor_id();
	cpuc = &per_cpu(cpu_hw_events, cpu);

	if (x86_pmu.version >= 2) {
		rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
		rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
		rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);
		rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed);

		pr_info("CPU#%d: ctrl:       %016llx\n", cpu, ctrl);
		pr_info("CPU#%d: status:     %016llx\n", cpu, status);
		pr_info("CPU#%d: overflow:   %016llx\n", cpu, overflow);
		pr_info("CPU#%d: fixed:      %016llx\n", cpu, fixed);
		if (x86_pmu.pebs_constraints) {
			rdmsrl(MSR_IA32_PEBS_ENABLE, pebs);
			pr_info("CPU#%d: pebs:       %016llx\n", cpu, pebs);
		}
		if (x86_pmu.lbr_nr) {
			rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
			pr_info("CPU#%d: debugctl:   %016llx\n", cpu, debugctl);
		}
	}
	pr_info("CPU#%d: active:     %016llx\n", cpu, *(u64 *)cpuc->active_mask);

	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
		rdmsrl(x86_pmu_config_addr(idx), pmc_ctrl);
		rdmsrl(x86_pmu_event_addr(idx), pmc_count);

		prev_left = per_cpu(pmc_prev_left[idx], cpu);

		pr_info("CPU#%d:   gen-PMC%d ctrl:  %016llx\n",
			cpu, idx, pmc_ctrl);
		pr_info("CPU#%d:   gen-PMC%d count: %016llx\n",
			cpu, idx, pmc_count);
		pr_info("CPU#%d:   gen-PMC%d left:  %016llx\n",
			cpu, idx, prev_left);
	}
	for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
		rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count);

		pr_info("CPU#%d: fixed-PMC%d count: %016llx\n",
			cpu, idx, pmc_count);
	}
	local_irq_restore(flags);
}
void x86_pmu_stop(struct perf_event *event, int flags)
{
	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
	struct hw_perf_event *hwc = &event->hw;

	if (test_bit(hwc->idx, cpuc->active_mask)) {
		x86_pmu.disable(event);
		__clear_bit(hwc->idx, cpuc->active_mask);
		cpuc->events[hwc->idx] = NULL;
		WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
		hwc->state |= PERF_HES_STOPPED;
	}

	if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
		/*
		 * Drain the remaining delta count out of an event
		 * that we are disabling:
		 */
		x86_perf_event_update(event);
		hwc->state |= PERF_HES_UPTODATE;
	}
}
static void x86_pmu_del(struct perf_event *event, int flags)
{
	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
	int i;

	/*
	 * If we're called during a txn, we only need to undo x86_pmu.add.
	 * The events never got scheduled and ->cancel_txn will truncate
	 * the event_list.
	 *
	 * XXX assumes any ->del() called during a TXN will only be on
	 * an event added during that same TXN.
	 */
	if (cpuc->txn_flags & PERF_PMU_TXN_ADD)
		goto do_del;

	/*
	 * Not a TXN, therefore cleanup properly.
	 */
	x86_pmu_stop(event, PERF_EF_UPDATE);

	for (i = 0; i < cpuc->n_events; i++) {
		if (event == cpuc->event_list[i])
			break;
	}

	if (WARN_ON_ONCE(i == cpuc->n_events)) /* called ->del() without ->add() ? */
		return;

	/* If we have a newly added event; make sure to decrease n_added. */
	if (i >= cpuc->n_events - cpuc->n_added)
		--cpuc->n_added;

	if (x86_pmu.put_event_constraints)
		x86_pmu.put_event_constraints(cpuc, event);

	/* Delete the array entry. */
	while (++i < cpuc->n_events) {
		cpuc->event_list[i-1] = cpuc->event_list[i];
		cpuc->event_constraint[i-1] = cpuc->event_constraint[i];
	}
	cpuc->event_constraint[i-1] = NULL;
	--cpuc->n_events;

	perf_event_update_userpage(event);

do_del:
	if (x86_pmu.del) {
		/*
		 * This is after x86_pmu_stop(); so we disable LBRs after any
		 * event can need them etc..
		 */
		x86_pmu.del(event);
	}
}
int x86_pmu_handle_irq(struct pt_regs *regs)
{
	struct perf_sample_data data;
	struct cpu_hw_events *cpuc;
	struct perf_event *event;
	int idx, handled = 0;
	u64 val;

	cpuc = this_cpu_ptr(&cpu_hw_events);

	/*
	 * Some chipsets need to unmask the LVTPC in a particular spot
	 * inside the nmi handler.  As a result, the unmasking was pushed
	 * into all the nmi handlers.
	 *
	 * This generic handler doesn't seem to have any issues where the
	 * unmasking occurs so it was left at the top.
	 */
	apic_write(APIC_LVTPC, APIC_DM_NMI);

	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
		if (!test_bit(idx, cpuc->active_mask))
			continue;

		event = cpuc->events[idx];

		val = x86_perf_event_update(event);
		if (val & (1ULL << (x86_pmu.cntval_bits - 1)))
			continue;

		/*
		 * event overflow
		 */
		handled++;
		perf_sample_data_init(&data, 0, event->hw.last_period);

		if (!x86_perf_event_set_period(event))
			continue;

		if (perf_event_overflow(event, &data, regs))
			x86_pmu_stop(event, 0);
	}

	if (handled)
		inc_irq_stat(apic_perf_irqs);

	return handled;
}

void perf_events_lapic_init(void)
{
	if (!x86_pmu.apic || !x86_pmu_initialized())
		return;

	/*
	 * Always use NMI for PMU
	 */
	apic_write(APIC_LVTPC, APIC_DM_NMI);
}

static int
perf_event_nmi_handler(unsigned int cmd, struct pt_regs *regs)
{
	u64 start_clock;
	u64 finish_clock;
	int ret;

	/*
	 * All PMUs/events that share this PMI handler should make sure to
	 * increment active_events for their events.
	 */
	if (!atomic_read(&active_events))
		return NMI_DONE;

	start_clock = sched_clock();
	ret = x86_pmu.handle_irq(regs);
	finish_clock = sched_clock();

	perf_sample_event_took(finish_clock - start_clock);

	return ret;
}
NOKPROBE_SYMBOL(perf_event_nmi_handler);
struct event_constraint emptyconstraint;
struct event_constraint unconstrained;

static int x86_pmu_prepare_cpu(unsigned int cpu)
{
	struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
	int i;

	for (i = 0 ; i < X86_PERF_KFREE_MAX; i++)
		cpuc->kfree_on_online[i] = NULL;
	if (x86_pmu.cpu_prepare)
		return x86_pmu.cpu_prepare(cpu);
	return 0;
}

static int x86_pmu_dead_cpu(unsigned int cpu)
{
	if (x86_pmu.cpu_dead)
		x86_pmu.cpu_dead(cpu);
	return 0;
}

static int x86_pmu_online_cpu(unsigned int cpu)
{
	struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
	int i;

	for (i = 0 ; i < X86_PERF_KFREE_MAX; i++) {
		kfree(cpuc->kfree_on_online[i]);
		cpuc->kfree_on_online[i] = NULL;
	}
	return 0;
}

static int x86_pmu_starting_cpu(unsigned int cpu)
{
	if (x86_pmu.cpu_starting)
		x86_pmu.cpu_starting(cpu);
	return 0;
}

static int x86_pmu_dying_cpu(unsigned int cpu)
{
	if (x86_pmu.cpu_dying)
		x86_pmu.cpu_dying(cpu);
	return 0;
}
pmu_check_apic(void)
1670 if (boot_cpu_has(X86_FEATURE_APIC
))
1674 pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n");
1675 pr_info("no hardware sampling interrupt available.\n");
1678 * If we have a PMU initialized but no APIC
1679 * interrupts, we cannot sample hardware
1680 * events (user-space has to fall back and
1681 * sample via a hrtimer based software event):
1683 pmu
.capabilities
|= PERF_PMU_CAP_NO_INTERRUPT
;
static struct attribute_group x86_pmu_format_group __ro_after_init = {
	.name  = "format",
	.attrs = NULL,
};

ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr, char *page)
{
	struct perf_pmu_events_attr *pmu_attr =
		container_of(attr, struct perf_pmu_events_attr, attr);
	u64 config = 0;

	if (pmu_attr->id < x86_pmu.max_events)
		config = x86_pmu.event_map(pmu_attr->id);

	/* string trumps id */
	if (pmu_attr->event_str)
		return sprintf(page, "%s", pmu_attr->event_str);

	return x86_pmu.events_sysfs_show(page, config);
}
EXPORT_SYMBOL_GPL(events_sysfs_show);
ssize_t events_ht_sysfs_show(struct device *dev, struct device_attribute *attr,
			     char *page)
{
	struct perf_pmu_events_ht_attr *pmu_attr =
		container_of(attr, struct perf_pmu_events_ht_attr, attr);

	/*
	 * Report conditional events depending on Hyper-Threading.
	 *
	 * This is overly conservative as usually the HT special
	 * handling is not needed if the other CPU thread is idle.
	 *
	 * Note this does not (and cannot) handle the case when thread
	 * siblings are invisible, for example with virtualization
	 * if they are owned by some other guest.  The user tool
	 * has to re-read when a thread sibling gets onlined later.
	 */
	return sprintf(page, "%s",
			topology_max_smt_threads() > 1 ?
			pmu_attr->event_str_ht :
			pmu_attr->event_str_noht);
}
EVENT_ATTR(cpu-cycles,			CPU_CYCLES		);
EVENT_ATTR(instructions,		INSTRUCTIONS		);
EVENT_ATTR(cache-references,		CACHE_REFERENCES	);
EVENT_ATTR(cache-misses,		CACHE_MISSES		);
EVENT_ATTR(branch-instructions,		BRANCH_INSTRUCTIONS	);
EVENT_ATTR(branch-misses,		BRANCH_MISSES		);
EVENT_ATTR(bus-cycles,			BUS_CYCLES		);
EVENT_ATTR(stalled-cycles-frontend,	STALLED_CYCLES_FRONTEND	);
EVENT_ATTR(stalled-cycles-backend,	STALLED_CYCLES_BACKEND	);
EVENT_ATTR(ref-cycles,			REF_CPU_CYCLES		);

static struct attribute *empty_attrs;

static struct attribute *events_attr[] = {
	EVENT_PTR(CPU_CYCLES),
	EVENT_PTR(INSTRUCTIONS),
	EVENT_PTR(CACHE_REFERENCES),
	EVENT_PTR(CACHE_MISSES),
	EVENT_PTR(BRANCH_INSTRUCTIONS),
	EVENT_PTR(BRANCH_MISSES),
	EVENT_PTR(BUS_CYCLES),
	EVENT_PTR(STALLED_CYCLES_FRONTEND),
	EVENT_PTR(STALLED_CYCLES_BACKEND),
	EVENT_PTR(REF_CPU_CYCLES),
	NULL,
};
/*
 * Remove all undefined events (x86_pmu.event_map(id) == 0)
 * out of events_attr attributes.
 */
static umode_t
is_visible(struct kobject *kobj, struct attribute *attr, int idx)
{
	struct perf_pmu_events_attr *pmu_attr;

	if (idx >= x86_pmu.max_events)
		return 0;

	pmu_attr = container_of(attr, struct perf_pmu_events_attr, attr.attr);
	/* str trumps id */
	return pmu_attr->event_str || x86_pmu.event_map(idx) ? attr->mode : 0;
}

static struct attribute_group x86_pmu_events_group __ro_after_init = {
	.name       = "events",
	.attrs      = events_attr,
	.is_visible = is_visible,
};
ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event)
{
	u64 umask  = (config & ARCH_PERFMON_EVENTSEL_UMASK) >> 8;
	u64 cmask  = (config & ARCH_PERFMON_EVENTSEL_CMASK) >> 24;
	bool edge  = (config & ARCH_PERFMON_EVENTSEL_EDGE);
	bool pc    = (config & ARCH_PERFMON_EVENTSEL_PIN_CONTROL);
	bool any   = (config & ARCH_PERFMON_EVENTSEL_ANY);
	bool inv   = (config & ARCH_PERFMON_EVENTSEL_INV);
	ssize_t ret;

	/*
	 * We have whole page size to spend and just little data
	 * to write, so we can safely use sprintf.
	 */
	ret = sprintf(page, "event=0x%02llx", event);

	if (umask)
		ret += sprintf(page + ret, ",umask=0x%02llx", umask);

	if (edge)
		ret += sprintf(page + ret, ",edge");

	if (pc)
		ret += sprintf(page + ret, ",pc");

	if (any)
		ret += sprintf(page + ret, ",any");

	if (inv)
		ret += sprintf(page + ret, ",inv");

	if (cmask)
		ret += sprintf(page + ret, ",cmask=0x%02llx", cmask);

	ret += sprintf(page + ret, "\n");

	return ret;
}
static struct attribute_group x86_pmu_attr_group;
static struct attribute_group x86_pmu_caps_group;

static int __init init_hw_perf_events(void)
{
	struct x86_pmu_quirk *quirk;
	int err;

	pr_info("Performance Events: ");

	switch (boot_cpu_data.x86_vendor) {
	case X86_VENDOR_INTEL:
		err = intel_pmu_init();
		break;
	case X86_VENDOR_AMD:
		err = amd_pmu_init();
		break;
	case X86_VENDOR_HYGON:
		err = amd_pmu_init();
		x86_pmu.name = "HYGON";
		break;
	default:
		err = -ENOTSUPP;
	}
	if (err != 0) {
		pr_cont("no PMU driver, software events only.\n");
		return 0;
	}

	pmu_check_apic();

	/* sanity check that the hardware exists or is emulated */
	if (!check_hw_exists())
		return 0;

	pr_cont("%s PMU driver.\n", x86_pmu.name);

	x86_pmu.attr_rdpmc = 1; /* enable userspace RDPMC usage by default */

	for (quirk = x86_pmu.quirks; quirk; quirk = quirk->next)
		quirk->func();

	if (!x86_pmu.intel_ctrl)
		x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1;

	perf_events_lapic_init();
	register_nmi_handler(NMI_LOCAL, perf_event_nmi_handler, 0, "PMI");

	unconstrained = (struct event_constraint)
		__EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_counters) - 1,
				   0, x86_pmu.num_counters, 0, 0);

	x86_pmu_format_group.attrs = x86_pmu.format_attrs;

	if (!x86_pmu.events_sysfs_show)
		x86_pmu_events_group.attrs = &empty_attrs;

	pmu.attr_update = x86_pmu.attr_update;

	pr_info("... version:                %d\n",     x86_pmu.version);
	pr_info("... bit width:              %d\n",     x86_pmu.cntval_bits);
	pr_info("... generic registers:      %d\n",     x86_pmu.num_counters);
	pr_info("... value mask:             %016Lx\n", x86_pmu.cntval_mask);
	pr_info("... max period:             %016Lx\n", x86_pmu.max_period);
	pr_info("... fixed-purpose events:   %d\n",     x86_pmu.num_counters_fixed);
	pr_info("... event mask:             %016Lx\n", x86_pmu.intel_ctrl);

	/*
	 * Install callbacks. Core will call them for each online
	 * cpu.
	 */
	err = cpuhp_setup_state(CPUHP_PERF_X86_PREPARE, "perf/x86:prepare",
				x86_pmu_prepare_cpu, x86_pmu_dead_cpu);
	if (err)
		return err;

	err = cpuhp_setup_state(CPUHP_AP_PERF_X86_STARTING,
				"perf/x86:starting", x86_pmu_starting_cpu,
				x86_pmu_dying_cpu);
	if (err)
		goto out;

	err = cpuhp_setup_state(CPUHP_AP_PERF_X86_ONLINE, "perf/x86:online",
				x86_pmu_online_cpu, NULL);
	if (err)
		goto out1;

	err = perf_pmu_register(&pmu, "cpu", PERF_TYPE_RAW);
	if (err)
		goto out2;

	return 0;

out2:
	cpuhp_remove_state(CPUHP_AP_PERF_X86_ONLINE);
out1:
	cpuhp_remove_state(CPUHP_AP_PERF_X86_STARTING);
out:
	cpuhp_remove_state(CPUHP_PERF_X86_PREPARE);
	return err;
}
early_initcall(init_hw_perf_events);
static inline void x86_pmu_read(struct perf_event *event)
{
	if (x86_pmu.read)
		return x86_pmu.read(event);
	x86_perf_event_update(event);
}
/*
 * Start group events scheduling transaction
 * Set the flag to make pmu::enable() not perform the
 * schedulability test, it will be performed at commit time
 *
 * We only support PERF_PMU_TXN_ADD transactions. Save the
 * transaction flags but otherwise ignore non-PERF_PMU_TXN_ADD
 * transactions.
 */
static void x86_pmu_start_txn(struct pmu *pmu, unsigned int txn_flags)
{
	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);

	WARN_ON_ONCE(cpuc->txn_flags);		/* txn already in flight */

	cpuc->txn_flags = txn_flags;
	if (txn_flags & ~PERF_PMU_TXN_ADD)
		return;

	perf_pmu_disable(pmu);
	__this_cpu_write(cpu_hw_events.n_txn, 0);
}
/*
 * Stop group events scheduling transaction
 * Clear the flag and pmu::enable() will perform the
 * schedulability test.
 */
static void x86_pmu_cancel_txn(struct pmu *pmu)
{
	unsigned int txn_flags;
	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);

	WARN_ON_ONCE(!cpuc->txn_flags);	/* no txn in flight */

	txn_flags = cpuc->txn_flags;
	cpuc->txn_flags = 0;
	if (txn_flags & ~PERF_PMU_TXN_ADD)
		return;

	/*
	 * Truncate collected array by the number of events added in this
	 * transaction. See x86_pmu_add() and x86_pmu_*_txn().
	 */
	__this_cpu_sub(cpu_hw_events.n_added, __this_cpu_read(cpu_hw_events.n_txn));
	__this_cpu_sub(cpu_hw_events.n_events, __this_cpu_read(cpu_hw_events.n_txn));
	perf_pmu_enable(pmu);
}
/*
 * Commit group events scheduling transaction
 * Perform the group schedulability test as a whole
 * Return 0 if success
 *
 * Does not cancel the transaction on failure; expects the caller to do this.
 */
static int x86_pmu_commit_txn(struct pmu *pmu)
{
	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
	int assign[X86_PMC_IDX_MAX];
	int n, ret;

	WARN_ON_ONCE(!cpuc->txn_flags);	/* no txn in flight */

	if (cpuc->txn_flags & ~PERF_PMU_TXN_ADD) {
		cpuc->txn_flags = 0;
		return 0;
	}

	n = cpuc->n_events;

	if (!x86_pmu_initialized())
		return -EAGAIN;

	ret = x86_pmu.schedule_events(cpuc, n, assign);
	if (ret)
		return ret;

	/*
	 * copy new assignment, now we know it is possible
	 * will be used by hw_perf_enable()
	 */
	memcpy(cpuc->assign, assign, n*sizeof(int));

	cpuc->txn_flags = 0;
	perf_pmu_enable(pmu);
	return 0;
}
/*
 * a fake_cpuc is used to validate event groups. Due to
 * the extra reg logic, we need to also allocate a fake
 * per_core and per_cpu structure. Otherwise, group events
 * using extra reg may conflict without the kernel being
 * able to catch this when the last event gets added to
 * the group.
 */
static void free_fake_cpuc(struct cpu_hw_events *cpuc)
{
	intel_cpuc_finish(cpuc);
	kfree(cpuc);
}

static struct cpu_hw_events *allocate_fake_cpuc(void)
{
	struct cpu_hw_events *cpuc;
	int cpu = raw_smp_processor_id();

	cpuc = kzalloc(sizeof(*cpuc), GFP_KERNEL);
	if (!cpuc)
		return ERR_PTR(-ENOMEM);
	cpuc->is_fake = 1;

	if (intel_cpuc_prepare(cpuc, cpu))
		goto error;

	return cpuc;

error:
	free_fake_cpuc(cpuc);
	return ERR_PTR(-ENOMEM);
}
/*
 * validate that we can schedule this event
 */
static int validate_event(struct perf_event *event)
{
	struct cpu_hw_events *fake_cpuc;
	struct event_constraint *c;
	int ret = 0;

	fake_cpuc = allocate_fake_cpuc();
	if (IS_ERR(fake_cpuc))
		return PTR_ERR(fake_cpuc);

	c = x86_pmu.get_event_constraints(fake_cpuc, 0, event);

	if (!c || !c->weight)
		ret = -EINVAL;

	if (x86_pmu.put_event_constraints)
		x86_pmu.put_event_constraints(fake_cpuc, event);

	free_fake_cpuc(fake_cpuc);

	return ret;
}
/*
 * validate a single event group
 *
 * validation includes:
 *	- check events are compatible with each other
 *	- events do not compete for the same counter
 *	- number of events <= number of counters
 *
 * validation ensures the group can be loaded onto the
 * PMU if it was the only group available.
 */
static int validate_group(struct perf_event *event)
{
	struct perf_event *leader = event->group_leader;
	struct cpu_hw_events *fake_cpuc;
	int ret = -EINVAL, n;

	fake_cpuc = allocate_fake_cpuc();
	if (IS_ERR(fake_cpuc))
		return PTR_ERR(fake_cpuc);
	/*
	 * the event is not yet connected with its
	 * siblings therefore we must first collect
	 * existing siblings, then add the new event
	 * before we can simulate the scheduling
	 */
	n = collect_events(fake_cpuc, leader, true);
	if (n < 0)
		goto out;

	fake_cpuc->n_events = n;
	n = collect_events(fake_cpuc, event, false);
	if (n < 0)
		goto out;

	fake_cpuc->n_events = 0;
	ret = x86_pmu.schedule_events(fake_cpuc, n, NULL);

out:
	free_fake_cpuc(fake_cpuc);
	return ret;
}
static int x86_pmu_event_init(struct perf_event *event)
{
	struct pmu *tmp;
	int err;

	switch (event->attr.type) {
	case PERF_TYPE_RAW:
	case PERF_TYPE_HARDWARE:
	case PERF_TYPE_HW_CACHE:
		break;

	default:
		return -ENOENT;
	}

	err = __x86_pmu_event_init(event);
	if (!err) {
		/*
		 * we temporarily connect event to its pmu
		 * such that validate_group() can classify
		 * it as an x86 event using is_x86_event()
		 */
		tmp = event->pmu;
		event->pmu = &pmu;

		if (event->group_leader != event)
			err = validate_group(event);
		else
			err = validate_event(event);

		event->pmu = tmp;
	}
	if (err) {
		if (event->destroy)
			event->destroy(event);
	}

	if (READ_ONCE(x86_pmu.attr_rdpmc) &&
	    !(event->hw.flags & PERF_X86_EVENT_LARGE_PEBS))
		event->hw.flags |= PERF_X86_EVENT_RDPMC_ALLOWED;

	return err;
}
static void refresh_pce(void *ignored)
{
	load_mm_cr4_irqsoff(this_cpu_read(cpu_tlbstate.loaded_mm));
}

static void x86_pmu_event_mapped(struct perf_event *event, struct mm_struct *mm)
{
	if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED))
		return;

	/*
	 * This function relies on not being called concurrently in two
	 * tasks in the same mm.  Otherwise one task could observe
	 * perf_rdpmc_allowed > 1 and return all the way back to
	 * userspace with CR4.PCE clear while another task is still
	 * doing on_each_cpu_mask() to propagate CR4.PCE.
	 *
	 * For now, this can't happen because all callers hold mmap_sem
	 * for write.  If this changes, we'll need a different solution.
	 */
	lockdep_assert_held_write(&mm->mmap_sem);

	if (atomic_inc_return(&mm->context.perf_rdpmc_allowed) == 1)
		on_each_cpu_mask(mm_cpumask(mm), refresh_pce, NULL, 1);
}

static void x86_pmu_event_unmapped(struct perf_event *event, struct mm_struct *mm)
{
	if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED))
		return;

	if (atomic_dec_and_test(&mm->context.perf_rdpmc_allowed))
		on_each_cpu_mask(mm_cpumask(mm), refresh_pce, NULL, 1);
}

static int x86_pmu_event_idx(struct perf_event *event)
{
	int idx = event->hw.idx;

	if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED))
		return 0;

	if (x86_pmu.num_counters_fixed && idx >= INTEL_PMC_IDX_FIXED) {
		idx -= INTEL_PMC_IDX_FIXED;
		idx |= 1 << 30;
	}

	return idx + 1;
}
static ssize_t get_attr_rdpmc(struct device *cdev,
			      struct device_attribute *attr,
			      char *buf)
{
	return snprintf(buf, 40, "%d\n", x86_pmu.attr_rdpmc);
}

static ssize_t set_attr_rdpmc(struct device *cdev,
			      struct device_attribute *attr,
			      const char *buf, size_t count)
{
	unsigned long val;
	ssize_t ret;

	ret = kstrtoul(buf, 0, &val);
	if (ret)
		return ret;

	if (val > 2)
		return -EINVAL;

	if (x86_pmu.attr_rdpmc_broken)
		return -ENOTSUPP;

	if (val != x86_pmu.attr_rdpmc) {
		/*
		 * Changing into or out of never available or always available,
		 * aka perf-event-bypassing mode. This path is extremely slow,
		 * but only root can trigger it, so it's okay.
		 */
		if (val == 0)
			static_branch_inc(&rdpmc_never_available_key);
		else if (x86_pmu.attr_rdpmc == 0)
			static_branch_dec(&rdpmc_never_available_key);

		if (val == 2)
			static_branch_inc(&rdpmc_always_available_key);
		else if (x86_pmu.attr_rdpmc == 2)
			static_branch_dec(&rdpmc_always_available_key);

		on_each_cpu(refresh_pce, NULL, 1);
		x86_pmu.attr_rdpmc = val;
	}

	return count;
}

static DEVICE_ATTR(rdpmc, S_IRUSR | S_IWUSR, get_attr_rdpmc, set_attr_rdpmc);
static struct attribute *x86_pmu_attrs[] = {
	&dev_attr_rdpmc.attr,
	NULL,
};

static struct attribute_group x86_pmu_attr_group __ro_after_init = {
	.attrs = x86_pmu_attrs,
};

static ssize_t max_precise_show(struct device *cdev,
				struct device_attribute *attr,
				char *buf)
{
	return snprintf(buf, PAGE_SIZE, "%d\n", x86_pmu_max_precise());
}

static DEVICE_ATTR_RO(max_precise);

static struct attribute *x86_pmu_caps_attrs[] = {
	&dev_attr_max_precise.attr,
	NULL
};

static struct attribute_group x86_pmu_caps_group __ro_after_init = {
	.name  = "caps",
	.attrs = x86_pmu_caps_attrs,
};

static const struct attribute_group *x86_pmu_attr_groups[] = {
	&x86_pmu_attr_group,
	&x86_pmu_format_group,
	&x86_pmu_events_group,
	&x86_pmu_caps_group,
	NULL,
};
static void x86_pmu_sched_task(struct perf_event_context *ctx, bool sched_in)
{
	if (x86_pmu.sched_task)
		x86_pmu.sched_task(ctx, sched_in);
}

static void x86_pmu_swap_task_ctx(struct perf_event_context *prev,
				  struct perf_event_context *next)
{
	if (x86_pmu.swap_task_ctx)
		x86_pmu.swap_task_ctx(prev, next);
}

void perf_check_microcode(void)
{
	if (x86_pmu.check_microcode)
		x86_pmu.check_microcode();
}

static int x86_pmu_check_period(struct perf_event *event, u64 value)
{
	if (x86_pmu.check_period && x86_pmu.check_period(event, value))
		return -EINVAL;

	if (value && x86_pmu.limit_period) {
		if (x86_pmu.limit_period(event, value) > value)
			return -EINVAL;
	}

	return 0;
}

static int x86_pmu_aux_output_match(struct perf_event *event)
{
	if (!(pmu.capabilities & PERF_PMU_CAP_AUX_OUTPUT))
		return 0;

	if (x86_pmu.aux_output_match)
		return x86_pmu.aux_output_match(event);

	return 0;
}
static struct pmu pmu = {
	.pmu_enable		= x86_pmu_enable,
	.pmu_disable		= x86_pmu_disable,

	.attr_groups		= x86_pmu_attr_groups,

	.event_init		= x86_pmu_event_init,

	.event_mapped		= x86_pmu_event_mapped,
	.event_unmapped		= x86_pmu_event_unmapped,

	.add			= x86_pmu_add,
	.del			= x86_pmu_del,
	.start			= x86_pmu_start,
	.stop			= x86_pmu_stop,
	.read			= x86_pmu_read,

	.start_txn		= x86_pmu_start_txn,
	.cancel_txn		= x86_pmu_cancel_txn,
	.commit_txn		= x86_pmu_commit_txn,

	.event_idx		= x86_pmu_event_idx,
	.sched_task		= x86_pmu_sched_task,
	.task_ctx_size		= sizeof(struct x86_perf_task_context),
	.swap_task_ctx		= x86_pmu_swap_task_ctx,
	.check_period		= x86_pmu_check_period,

	.aux_output_match	= x86_pmu_aux_output_match,
};
void arch_perf_update_userpage(struct perf_event *event,
			       struct perf_event_mmap_page *userpg, u64 now)
{
	struct cyc2ns_data data;
	u64 offset;

	userpg->cap_user_time = 0;
	userpg->cap_user_time_zero = 0;
	userpg->cap_user_rdpmc =
		!!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED);
	userpg->pmc_width = x86_pmu.cntval_bits;

	if (!using_native_sched_clock() || !sched_clock_stable())
		return;

	cyc2ns_read_begin(&data);

	offset = data.cyc2ns_offset + __sched_clock_offset;

	/*
	 * Internal timekeeping for enabled/running/stopped times
	 * is always in the local_clock domain.
	 */
	userpg->cap_user_time = 1;
	userpg->time_mult = data.cyc2ns_mul;
	userpg->time_shift = data.cyc2ns_shift;
	userpg->time_offset = offset - now;

	/*
	 * cap_user_time_zero doesn't make sense when we're using a different
	 * time base for the records.
	 */
	if (!event->attr.use_clockid) {
		userpg->cap_user_time_zero = 1;
		userpg->time_zero = offset;
	}

	cyc2ns_read_end();
}
/*
 * Determine whether the regs were taken from an irq/exception handler rather
 * than from perf_arch_fetch_caller_regs().
 */
static bool perf_hw_regs(struct pt_regs *regs)
{
	return regs->flags & X86_EFLAGS_FIXED;
}

void
perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs)
{
	struct unwind_state state;
	unsigned long addr;

	if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
		/* TODO: We don't support guest os callchain now */
		return;
	}

	if (perf_callchain_store(entry, regs->ip))
		return;

	if (perf_hw_regs(regs))
		unwind_start(&state, current, regs, NULL);
	else
		unwind_start(&state, current, NULL, (void *)regs->sp);

	for (; !unwind_done(&state); unwind_next_frame(&state)) {
		addr = unwind_get_return_address(&state);
		if (!addr || perf_callchain_store(entry, addr))
			return;
	}
}
static inline int
valid_user_frame(const void __user *fp, unsigned long size)
{
	return (__range_not_ok(fp, size, TASK_SIZE) == 0);
}

static unsigned long get_segment_base(unsigned int segment)
{
	struct desc_struct *desc;
	unsigned int idx = segment >> 3;

	if ((segment & SEGMENT_TI_MASK) == SEGMENT_LDT) {
#ifdef CONFIG_MODIFY_LDT_SYSCALL
		struct ldt_struct *ldt;

		/* IRQs are off, so this synchronizes with smp_store_release */
		ldt = READ_ONCE(current->active_mm->context.ldt);
		if (!ldt || idx >= ldt->nr_entries)
			return 0;

		desc = &ldt->entries[idx];
#else
		return 0;
#endif
	} else {
		if (idx >= GDT_ENTRIES)
			return 0;

		desc = raw_cpu_ptr(gdt_page.gdt) + idx;
	}

	return get_desc_base(desc);
}
#ifdef CONFIG_IA32_EMULATION

#include <linux/compat.h>

static inline int
perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry_ctx *entry)
{
	/* 32-bit process in 64-bit kernel. */
	unsigned long ss_base, cs_base;
	struct stack_frame_ia32 frame;
	const void __user *fp;

	if (!test_thread_flag(TIF_IA32))
		return 0;

	cs_base = get_segment_base(regs->cs);
	ss_base = get_segment_base(regs->ss);

	fp = compat_ptr(ss_base + regs->bp);
	pagefault_disable();
	while (entry->nr < entry->max_stack) {
		unsigned long bytes;

		frame.next_frame     = 0;
		frame.return_address = 0;

		if (!valid_user_frame(fp, sizeof(frame)))
			break;

		bytes = __copy_from_user_nmi(&frame.next_frame, fp, 4);
		if (bytes != 0)
			break;
		bytes = __copy_from_user_nmi(&frame.return_address, fp+4, 4);
		if (bytes != 0)
			break;

		perf_callchain_store(entry, cs_base + frame.return_address);
		fp = compat_ptr(ss_base + frame.next_frame);
	}
	pagefault_enable();
	return 1;
}
#else
static inline int
perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry_ctx *entry)
{
	return 0;
}
#endif
void
perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs)
{
	struct stack_frame frame;
	const unsigned long __user *fp;

	if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
		/* TODO: We don't support guest os callchain now */
		return;
	}

	/*
	 * We don't know what to do with VM86 stacks.. ignore them for now.
	 */
	if (regs->flags & (X86_VM_MASK | PERF_EFLAGS_VM))
		return;

	fp = (unsigned long __user *)regs->bp;

	perf_callchain_store(entry, regs->ip);

	if (!nmi_uaccess_okay())
		return;

	if (perf_callchain_user32(regs, entry))
		return;

	pagefault_disable();
	while (entry->nr < entry->max_stack) {
		unsigned long bytes;

		frame.next_frame     = NULL;
		frame.return_address = 0;

		if (!valid_user_frame(fp, sizeof(frame)))
			break;

		bytes = __copy_from_user_nmi(&frame.next_frame, fp, sizeof(*fp));
		if (bytes != 0)
			break;
		bytes = __copy_from_user_nmi(&frame.return_address, fp + 1, sizeof(*fp));
		if (bytes != 0)
			break;

		perf_callchain_store(entry, frame.return_address);
		fp = (void __user *)frame.next_frame;
	}
	pagefault_enable();
}
/*
 * Deal with code segment offsets for the various execution modes:
 *
 *   VM86 - the good olde 16 bit days, where the linear address is
 *          20 bits and we use regs->ip + 0x10 * regs->cs.
 *
 *   IA32 - Where we need to look at GDT/LDT segment descriptor tables
 *          to figure out what the 32bit base address is.
 *
 *    X32 - has TIF_X32 set, but is running in x86_64
 *
 * X86_64 - CS,DS,SS,ES are all zero based.
 */
static unsigned long code_segment_base(struct pt_regs *regs)
{
	/*
	 * For IA32 we look at the GDT/LDT segment base to convert the
	 * effective IP to a linear address.
	 */
#ifdef CONFIG_X86_32
	/*
	 * If we are in VM86 mode, add the segment offset to convert to a
	 * linear address.
	 */
	if (regs->flags & X86_VM_MASK)
		return 0x10 * regs->cs;

	if (user_mode(regs) && regs->cs != __USER_CS)
		return get_segment_base(regs->cs);
#else
	if (user_mode(regs) && !user_64bit_mode(regs) &&
	    regs->cs != __USER32_CS)
		return get_segment_base(regs->cs);
#endif
	return 0;
}
unsigned long perf_instruction_pointer(struct pt_regs *regs)
{
	if (perf_guest_cbs && perf_guest_cbs->is_in_guest())
		return perf_guest_cbs->get_guest_ip();

	return regs->ip + code_segment_base(regs);
}

unsigned long perf_misc_flags(struct pt_regs *regs)
{
	int misc = 0;

	if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
		if (perf_guest_cbs->is_user_mode())
			misc |= PERF_RECORD_MISC_GUEST_USER;
		else
			misc |= PERF_RECORD_MISC_GUEST_KERNEL;
	} else {
		if (user_mode(regs))
			misc |= PERF_RECORD_MISC_USER;
		else
			misc |= PERF_RECORD_MISC_KERNEL;
	}

	if (regs->flags & PERF_EFLAGS_EXACT)
		misc |= PERF_RECORD_MISC_EXACT_IP;

	return misc;
}
void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap)
{
	cap->version		= x86_pmu.version;
	cap->num_counters_gp	= x86_pmu.num_counters;
	cap->num_counters_fixed	= x86_pmu.num_counters_fixed;
	cap->bit_width_gp	= x86_pmu.cntval_bits;
	cap->bit_width_fixed	= x86_pmu.cntval_bits;
	cap->events_mask	= (unsigned int)x86_pmu.events_maskl;
	cap->events_mask_len	= x86_pmu.events_mask_len;
}
EXPORT_SYMBOL_GPL(perf_get_x86_pmu_capability);