PM / yenta: Split resume into early and late parts (rev. 4)
[linux/fpc-iii.git] / kernel / perf_counter.c
blobb1dc4684e66ac2b8fc046951d8024d3606c6385c
1 /*
2 * Performance counter core code
4 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
5 * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
6 * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
7 * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
9 * For licensing details see kernel-base/COPYING
12 #include <linux/fs.h>
13 #include <linux/mm.h>
14 #include <linux/cpu.h>
15 #include <linux/smp.h>
16 #include <linux/file.h>
17 #include <linux/poll.h>
18 #include <linux/sysfs.h>
19 #include <linux/dcache.h>
20 #include <linux/percpu.h>
21 #include <linux/ptrace.h>
22 #include <linux/vmstat.h>
23 #include <linux/hardirq.h>
24 #include <linux/rculist.h>
25 #include <linux/uaccess.h>
26 #include <linux/syscalls.h>
27 #include <linux/anon_inodes.h>
28 #include <linux/kernel_stat.h>
29 #include <linux/perf_counter.h>
31 #include <asm/irq_regs.h>
34 * Each CPU has a list of per CPU counters:
36 DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);
38 int perf_max_counters __read_mostly = 1;
39 static int perf_reserved_percpu __read_mostly;
40 static int perf_overcommit __read_mostly = 1;
42 static atomic_t nr_counters __read_mostly;
43 static atomic_t nr_mmap_counters __read_mostly;
44 static atomic_t nr_comm_counters __read_mostly;
45 static atomic_t nr_task_counters __read_mostly;
48 * perf counter paranoia level:
49 * 0 - not paranoid
50 * 1 - disallow cpu counters to unpriv
51 * 2 - disallow kernel profiling to unpriv
53 int sysctl_perf_counter_paranoid __read_mostly = 1;
55 static inline bool perf_paranoid_cpu(void)
57 return sysctl_perf_counter_paranoid > 0;
60 static inline bool perf_paranoid_kernel(void)
62 return sysctl_perf_counter_paranoid > 1;
65 int sysctl_perf_counter_mlock __read_mostly = 512; /* 'free' kb per user */
68 * max perf counter sample rate
70 int sysctl_perf_counter_sample_rate __read_mostly = 100000;
72 static atomic64_t perf_counter_id;
75 * Lock for (sysadmin-configurable) counter reservations:
77 static DEFINE_SPINLOCK(perf_resource_lock);
80 * Architecture provided APIs - weak aliases:
82 extern __weak const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
84 return NULL;
87 void __weak hw_perf_disable(void) { barrier(); }
88 void __weak hw_perf_enable(void) { barrier(); }
90 void __weak hw_perf_counter_setup(int cpu) { barrier(); }
91 void __weak hw_perf_counter_setup_online(int cpu) { barrier(); }
93 int __weak
94 hw_perf_group_sched_in(struct perf_counter *group_leader,
95 struct perf_cpu_context *cpuctx,
96 struct perf_counter_context *ctx, int cpu)
98 return 0;
101 void __weak perf_counter_print_debug(void) { }
103 static DEFINE_PER_CPU(int, disable_count);
105 void __perf_disable(void)
107 __get_cpu_var(disable_count)++;
110 bool __perf_enable(void)
112 return !--__get_cpu_var(disable_count);
115 void perf_disable(void)
117 __perf_disable();
118 hw_perf_disable();
121 void perf_enable(void)
123 if (__perf_enable())
124 hw_perf_enable();
127 static void get_ctx(struct perf_counter_context *ctx)
129 WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
132 static void free_ctx(struct rcu_head *head)
134 struct perf_counter_context *ctx;
136 ctx = container_of(head, struct perf_counter_context, rcu_head);
137 kfree(ctx);
140 static void put_ctx(struct perf_counter_context *ctx)
142 if (atomic_dec_and_test(&ctx->refcount)) {
143 if (ctx->parent_ctx)
144 put_ctx(ctx->parent_ctx);
145 if (ctx->task)
146 put_task_struct(ctx->task);
147 call_rcu(&ctx->rcu_head, free_ctx);
151 static void unclone_ctx(struct perf_counter_context *ctx)
153 if (ctx->parent_ctx) {
154 put_ctx(ctx->parent_ctx);
155 ctx->parent_ctx = NULL;
160 * If we inherit counters we want to return the parent counter id
161 * to userspace.
163 static u64 primary_counter_id(struct perf_counter *counter)
165 u64 id = counter->id;
167 if (counter->parent)
168 id = counter->parent->id;
170 return id;
174 * Get the perf_counter_context for a task and lock it.
175 * This has to cope with with the fact that until it is locked,
176 * the context could get moved to another task.
178 static struct perf_counter_context *
179 perf_lock_task_context(struct task_struct *task, unsigned long *flags)
181 struct perf_counter_context *ctx;
183 rcu_read_lock();
184 retry:
185 ctx = rcu_dereference(task->perf_counter_ctxp);
186 if (ctx) {
188 * If this context is a clone of another, it might
189 * get swapped for another underneath us by
190 * perf_counter_task_sched_out, though the
191 * rcu_read_lock() protects us from any context
192 * getting freed. Lock the context and check if it
193 * got swapped before we could get the lock, and retry
194 * if so. If we locked the right context, then it
195 * can't get swapped on us any more.
197 spin_lock_irqsave(&ctx->lock, *flags);
198 if (ctx != rcu_dereference(task->perf_counter_ctxp)) {
199 spin_unlock_irqrestore(&ctx->lock, *flags);
200 goto retry;
203 if (!atomic_inc_not_zero(&ctx->refcount)) {
204 spin_unlock_irqrestore(&ctx->lock, *flags);
205 ctx = NULL;
208 rcu_read_unlock();
209 return ctx;
213 * Get the context for a task and increment its pin_count so it
214 * can't get swapped to another task. This also increments its
215 * reference count so that the context can't get freed.
217 static struct perf_counter_context *perf_pin_task_context(struct task_struct *task)
219 struct perf_counter_context *ctx;
220 unsigned long flags;
222 ctx = perf_lock_task_context(task, &flags);
223 if (ctx) {
224 ++ctx->pin_count;
225 spin_unlock_irqrestore(&ctx->lock, flags);
227 return ctx;
230 static void perf_unpin_context(struct perf_counter_context *ctx)
232 unsigned long flags;
234 spin_lock_irqsave(&ctx->lock, flags);
235 --ctx->pin_count;
236 spin_unlock_irqrestore(&ctx->lock, flags);
237 put_ctx(ctx);
241 * Add a counter from the lists for its context.
242 * Must be called with ctx->mutex and ctx->lock held.
244 static void
245 list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
247 struct perf_counter *group_leader = counter->group_leader;
250 * Depending on whether it is a standalone or sibling counter,
251 * add it straight to the context's counter list, or to the group
252 * leader's sibling list:
254 if (group_leader == counter)
255 list_add_tail(&counter->list_entry, &ctx->counter_list);
256 else {
257 list_add_tail(&counter->list_entry, &group_leader->sibling_list);
258 group_leader->nr_siblings++;
261 list_add_rcu(&counter->event_entry, &ctx->event_list);
262 ctx->nr_counters++;
263 if (counter->attr.inherit_stat)
264 ctx->nr_stat++;
268 * Remove a counter from the lists for its context.
269 * Must be called with ctx->mutex and ctx->lock held.
271 static void
272 list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
274 struct perf_counter *sibling, *tmp;
276 if (list_empty(&counter->list_entry))
277 return;
278 ctx->nr_counters--;
279 if (counter->attr.inherit_stat)
280 ctx->nr_stat--;
282 list_del_init(&counter->list_entry);
283 list_del_rcu(&counter->event_entry);
285 if (counter->group_leader != counter)
286 counter->group_leader->nr_siblings--;
289 * If this was a group counter with sibling counters then
290 * upgrade the siblings to singleton counters by adding them
291 * to the context list directly:
293 list_for_each_entry_safe(sibling, tmp,
294 &counter->sibling_list, list_entry) {
296 list_move_tail(&sibling->list_entry, &ctx->counter_list);
297 sibling->group_leader = sibling;
301 static void
302 counter_sched_out(struct perf_counter *counter,
303 struct perf_cpu_context *cpuctx,
304 struct perf_counter_context *ctx)
306 if (counter->state != PERF_COUNTER_STATE_ACTIVE)
307 return;
309 counter->state = PERF_COUNTER_STATE_INACTIVE;
310 if (counter->pending_disable) {
311 counter->pending_disable = 0;
312 counter->state = PERF_COUNTER_STATE_OFF;
314 counter->tstamp_stopped = ctx->time;
315 counter->pmu->disable(counter);
316 counter->oncpu = -1;
318 if (!is_software_counter(counter))
319 cpuctx->active_oncpu--;
320 ctx->nr_active--;
321 if (counter->attr.exclusive || !cpuctx->active_oncpu)
322 cpuctx->exclusive = 0;
325 static void
326 group_sched_out(struct perf_counter *group_counter,
327 struct perf_cpu_context *cpuctx,
328 struct perf_counter_context *ctx)
330 struct perf_counter *counter;
332 if (group_counter->state != PERF_COUNTER_STATE_ACTIVE)
333 return;
335 counter_sched_out(group_counter, cpuctx, ctx);
338 * Schedule out siblings (if any):
340 list_for_each_entry(counter, &group_counter->sibling_list, list_entry)
341 counter_sched_out(counter, cpuctx, ctx);
343 if (group_counter->attr.exclusive)
344 cpuctx->exclusive = 0;
348 * Cross CPU call to remove a performance counter
350 * We disable the counter on the hardware level first. After that we
351 * remove it from the context list.
353 static void __perf_counter_remove_from_context(void *info)
355 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
356 struct perf_counter *counter = info;
357 struct perf_counter_context *ctx = counter->ctx;
360 * If this is a task context, we need to check whether it is
361 * the current task context of this cpu. If not it has been
362 * scheduled out before the smp call arrived.
364 if (ctx->task && cpuctx->task_ctx != ctx)
365 return;
367 spin_lock(&ctx->lock);
369 * Protect the list operation against NMI by disabling the
370 * counters on a global level.
372 perf_disable();
374 counter_sched_out(counter, cpuctx, ctx);
376 list_del_counter(counter, ctx);
378 if (!ctx->task) {
380 * Allow more per task counters with respect to the
381 * reservation:
383 cpuctx->max_pertask =
384 min(perf_max_counters - ctx->nr_counters,
385 perf_max_counters - perf_reserved_percpu);
388 perf_enable();
389 spin_unlock(&ctx->lock);
394 * Remove the counter from a task's (or a CPU's) list of counters.
396 * Must be called with ctx->mutex held.
398 * CPU counters are removed with a smp call. For task counters we only
399 * call when the task is on a CPU.
401 * If counter->ctx is a cloned context, callers must make sure that
402 * every task struct that counter->ctx->task could possibly point to
403 * remains valid. This is OK when called from perf_release since
404 * that only calls us on the top-level context, which can't be a clone.
405 * When called from perf_counter_exit_task, it's OK because the
406 * context has been detached from its task.
408 static void perf_counter_remove_from_context(struct perf_counter *counter)
410 struct perf_counter_context *ctx = counter->ctx;
411 struct task_struct *task = ctx->task;
413 if (!task) {
415 * Per cpu counters are removed via an smp call and
416 * the removal is always sucessful.
418 smp_call_function_single(counter->cpu,
419 __perf_counter_remove_from_context,
420 counter, 1);
421 return;
424 retry:
425 task_oncpu_function_call(task, __perf_counter_remove_from_context,
426 counter);
428 spin_lock_irq(&ctx->lock);
430 * If the context is active we need to retry the smp call.
432 if (ctx->nr_active && !list_empty(&counter->list_entry)) {
433 spin_unlock_irq(&ctx->lock);
434 goto retry;
438 * The lock prevents that this context is scheduled in so we
439 * can remove the counter safely, if the call above did not
440 * succeed.
442 if (!list_empty(&counter->list_entry)) {
443 list_del_counter(counter, ctx);
445 spin_unlock_irq(&ctx->lock);
448 static inline u64 perf_clock(void)
450 return cpu_clock(smp_processor_id());
454 * Update the record of the current time in a context.
456 static void update_context_time(struct perf_counter_context *ctx)
458 u64 now = perf_clock();
460 ctx->time += now - ctx->timestamp;
461 ctx->timestamp = now;
465 * Update the total_time_enabled and total_time_running fields for a counter.
467 static void update_counter_times(struct perf_counter *counter)
469 struct perf_counter_context *ctx = counter->ctx;
470 u64 run_end;
472 if (counter->state < PERF_COUNTER_STATE_INACTIVE ||
473 counter->group_leader->state < PERF_COUNTER_STATE_INACTIVE)
474 return;
476 counter->total_time_enabled = ctx->time - counter->tstamp_enabled;
478 if (counter->state == PERF_COUNTER_STATE_INACTIVE)
479 run_end = counter->tstamp_stopped;
480 else
481 run_end = ctx->time;
483 counter->total_time_running = run_end - counter->tstamp_running;
487 * Update total_time_enabled and total_time_running for all counters in a group.
489 static void update_group_times(struct perf_counter *leader)
491 struct perf_counter *counter;
493 update_counter_times(leader);
494 list_for_each_entry(counter, &leader->sibling_list, list_entry)
495 update_counter_times(counter);
499 * Cross CPU call to disable a performance counter
501 static void __perf_counter_disable(void *info)
503 struct perf_counter *counter = info;
504 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
505 struct perf_counter_context *ctx = counter->ctx;
508 * If this is a per-task counter, need to check whether this
509 * counter's task is the current task on this cpu.
511 if (ctx->task && cpuctx->task_ctx != ctx)
512 return;
514 spin_lock(&ctx->lock);
517 * If the counter is on, turn it off.
518 * If it is in error state, leave it in error state.
520 if (counter->state >= PERF_COUNTER_STATE_INACTIVE) {
521 update_context_time(ctx);
522 update_group_times(counter);
523 if (counter == counter->group_leader)
524 group_sched_out(counter, cpuctx, ctx);
525 else
526 counter_sched_out(counter, cpuctx, ctx);
527 counter->state = PERF_COUNTER_STATE_OFF;
530 spin_unlock(&ctx->lock);
534 * Disable a counter.
536 * If counter->ctx is a cloned context, callers must make sure that
537 * every task struct that counter->ctx->task could possibly point to
538 * remains valid. This condition is satisifed when called through
539 * perf_counter_for_each_child or perf_counter_for_each because they
540 * hold the top-level counter's child_mutex, so any descendant that
541 * goes to exit will block in sync_child_counter.
542 * When called from perf_pending_counter it's OK because counter->ctx
543 * is the current context on this CPU and preemption is disabled,
544 * hence we can't get into perf_counter_task_sched_out for this context.
546 static void perf_counter_disable(struct perf_counter *counter)
548 struct perf_counter_context *ctx = counter->ctx;
549 struct task_struct *task = ctx->task;
551 if (!task) {
553 * Disable the counter on the cpu that it's on
555 smp_call_function_single(counter->cpu, __perf_counter_disable,
556 counter, 1);
557 return;
560 retry:
561 task_oncpu_function_call(task, __perf_counter_disable, counter);
563 spin_lock_irq(&ctx->lock);
565 * If the counter is still active, we need to retry the cross-call.
567 if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
568 spin_unlock_irq(&ctx->lock);
569 goto retry;
573 * Since we have the lock this context can't be scheduled
574 * in, so we can change the state safely.
576 if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
577 update_group_times(counter);
578 counter->state = PERF_COUNTER_STATE_OFF;
581 spin_unlock_irq(&ctx->lock);
584 static int
585 counter_sched_in(struct perf_counter *counter,
586 struct perf_cpu_context *cpuctx,
587 struct perf_counter_context *ctx,
588 int cpu)
590 if (counter->state <= PERF_COUNTER_STATE_OFF)
591 return 0;
593 counter->state = PERF_COUNTER_STATE_ACTIVE;
594 counter->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */
596 * The new state must be visible before we turn it on in the hardware:
598 smp_wmb();
600 if (counter->pmu->enable(counter)) {
601 counter->state = PERF_COUNTER_STATE_INACTIVE;
602 counter->oncpu = -1;
603 return -EAGAIN;
606 counter->tstamp_running += ctx->time - counter->tstamp_stopped;
608 if (!is_software_counter(counter))
609 cpuctx->active_oncpu++;
610 ctx->nr_active++;
612 if (counter->attr.exclusive)
613 cpuctx->exclusive = 1;
615 return 0;
618 static int
619 group_sched_in(struct perf_counter *group_counter,
620 struct perf_cpu_context *cpuctx,
621 struct perf_counter_context *ctx,
622 int cpu)
624 struct perf_counter *counter, *partial_group;
625 int ret;
627 if (group_counter->state == PERF_COUNTER_STATE_OFF)
628 return 0;
630 ret = hw_perf_group_sched_in(group_counter, cpuctx, ctx, cpu);
631 if (ret)
632 return ret < 0 ? ret : 0;
634 if (counter_sched_in(group_counter, cpuctx, ctx, cpu))
635 return -EAGAIN;
638 * Schedule in siblings as one group (if any):
640 list_for_each_entry(counter, &group_counter->sibling_list, list_entry) {
641 if (counter_sched_in(counter, cpuctx, ctx, cpu)) {
642 partial_group = counter;
643 goto group_error;
647 return 0;
649 group_error:
651 * Groups can be scheduled in as one unit only, so undo any
652 * partial group before returning:
654 list_for_each_entry(counter, &group_counter->sibling_list, list_entry) {
655 if (counter == partial_group)
656 break;
657 counter_sched_out(counter, cpuctx, ctx);
659 counter_sched_out(group_counter, cpuctx, ctx);
661 return -EAGAIN;
665 * Return 1 for a group consisting entirely of software counters,
666 * 0 if the group contains any hardware counters.
668 static int is_software_only_group(struct perf_counter *leader)
670 struct perf_counter *counter;
672 if (!is_software_counter(leader))
673 return 0;
675 list_for_each_entry(counter, &leader->sibling_list, list_entry)
676 if (!is_software_counter(counter))
677 return 0;
679 return 1;
683 * Work out whether we can put this counter group on the CPU now.
685 static int group_can_go_on(struct perf_counter *counter,
686 struct perf_cpu_context *cpuctx,
687 int can_add_hw)
690 * Groups consisting entirely of software counters can always go on.
692 if (is_software_only_group(counter))
693 return 1;
695 * If an exclusive group is already on, no other hardware
696 * counters can go on.
698 if (cpuctx->exclusive)
699 return 0;
701 * If this group is exclusive and there are already
702 * counters on the CPU, it can't go on.
704 if (counter->attr.exclusive && cpuctx->active_oncpu)
705 return 0;
707 * Otherwise, try to add it if all previous groups were able
708 * to go on.
710 return can_add_hw;
713 static void add_counter_to_ctx(struct perf_counter *counter,
714 struct perf_counter_context *ctx)
716 list_add_counter(counter, ctx);
717 counter->tstamp_enabled = ctx->time;
718 counter->tstamp_running = ctx->time;
719 counter->tstamp_stopped = ctx->time;
723 * Cross CPU call to install and enable a performance counter
725 * Must be called with ctx->mutex held
727 static void __perf_install_in_context(void *info)
729 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
730 struct perf_counter *counter = info;
731 struct perf_counter_context *ctx = counter->ctx;
732 struct perf_counter *leader = counter->group_leader;
733 int cpu = smp_processor_id();
734 int err;
737 * If this is a task context, we need to check whether it is
738 * the current task context of this cpu. If not it has been
739 * scheduled out before the smp call arrived.
740 * Or possibly this is the right context but it isn't
741 * on this cpu because it had no counters.
743 if (ctx->task && cpuctx->task_ctx != ctx) {
744 if (cpuctx->task_ctx || ctx->task != current)
745 return;
746 cpuctx->task_ctx = ctx;
749 spin_lock(&ctx->lock);
750 ctx->is_active = 1;
751 update_context_time(ctx);
754 * Protect the list operation against NMI by disabling the
755 * counters on a global level. NOP for non NMI based counters.
757 perf_disable();
759 add_counter_to_ctx(counter, ctx);
762 * Don't put the counter on if it is disabled or if
763 * it is in a group and the group isn't on.
765 if (counter->state != PERF_COUNTER_STATE_INACTIVE ||
766 (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE))
767 goto unlock;
770 * An exclusive counter can't go on if there are already active
771 * hardware counters, and no hardware counter can go on if there
772 * is already an exclusive counter on.
774 if (!group_can_go_on(counter, cpuctx, 1))
775 err = -EEXIST;
776 else
777 err = counter_sched_in(counter, cpuctx, ctx, cpu);
779 if (err) {
781 * This counter couldn't go on. If it is in a group
782 * then we have to pull the whole group off.
783 * If the counter group is pinned then put it in error state.
785 if (leader != counter)
786 group_sched_out(leader, cpuctx, ctx);
787 if (leader->attr.pinned) {
788 update_group_times(leader);
789 leader->state = PERF_COUNTER_STATE_ERROR;
793 if (!err && !ctx->task && cpuctx->max_pertask)
794 cpuctx->max_pertask--;
796 unlock:
797 perf_enable();
799 spin_unlock(&ctx->lock);
803 * Attach a performance counter to a context
805 * First we add the counter to the list with the hardware enable bit
806 * in counter->hw_config cleared.
808 * If the counter is attached to a task which is on a CPU we use a smp
809 * call to enable it in the task context. The task might have been
810 * scheduled away, but we check this in the smp call again.
812 * Must be called with ctx->mutex held.
814 static void
815 perf_install_in_context(struct perf_counter_context *ctx,
816 struct perf_counter *counter,
817 int cpu)
819 struct task_struct *task = ctx->task;
821 if (!task) {
823 * Per cpu counters are installed via an smp call and
824 * the install is always sucessful.
826 smp_call_function_single(cpu, __perf_install_in_context,
827 counter, 1);
828 return;
831 retry:
832 task_oncpu_function_call(task, __perf_install_in_context,
833 counter);
835 spin_lock_irq(&ctx->lock);
837 * we need to retry the smp call.
839 if (ctx->is_active && list_empty(&counter->list_entry)) {
840 spin_unlock_irq(&ctx->lock);
841 goto retry;
845 * The lock prevents that this context is scheduled in so we
846 * can add the counter safely, if it the call above did not
847 * succeed.
849 if (list_empty(&counter->list_entry))
850 add_counter_to_ctx(counter, ctx);
851 spin_unlock_irq(&ctx->lock);
855 * Put a counter into inactive state and update time fields.
856 * Enabling the leader of a group effectively enables all
857 * the group members that aren't explicitly disabled, so we
858 * have to update their ->tstamp_enabled also.
859 * Note: this works for group members as well as group leaders
860 * since the non-leader members' sibling_lists will be empty.
862 static void __perf_counter_mark_enabled(struct perf_counter *counter,
863 struct perf_counter_context *ctx)
865 struct perf_counter *sub;
867 counter->state = PERF_COUNTER_STATE_INACTIVE;
868 counter->tstamp_enabled = ctx->time - counter->total_time_enabled;
869 list_for_each_entry(sub, &counter->sibling_list, list_entry)
870 if (sub->state >= PERF_COUNTER_STATE_INACTIVE)
871 sub->tstamp_enabled =
872 ctx->time - sub->total_time_enabled;
876 * Cross CPU call to enable a performance counter
878 static void __perf_counter_enable(void *info)
880 struct perf_counter *counter = info;
881 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
882 struct perf_counter_context *ctx = counter->ctx;
883 struct perf_counter *leader = counter->group_leader;
884 int err;
887 * If this is a per-task counter, need to check whether this
888 * counter's task is the current task on this cpu.
890 if (ctx->task && cpuctx->task_ctx != ctx) {
891 if (cpuctx->task_ctx || ctx->task != current)
892 return;
893 cpuctx->task_ctx = ctx;
896 spin_lock(&ctx->lock);
897 ctx->is_active = 1;
898 update_context_time(ctx);
900 if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
901 goto unlock;
902 __perf_counter_mark_enabled(counter, ctx);
905 * If the counter is in a group and isn't the group leader,
906 * then don't put it on unless the group is on.
908 if (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE)
909 goto unlock;
911 if (!group_can_go_on(counter, cpuctx, 1)) {
912 err = -EEXIST;
913 } else {
914 perf_disable();
915 if (counter == leader)
916 err = group_sched_in(counter, cpuctx, ctx,
917 smp_processor_id());
918 else
919 err = counter_sched_in(counter, cpuctx, ctx,
920 smp_processor_id());
921 perf_enable();
924 if (err) {
926 * If this counter can't go on and it's part of a
927 * group, then the whole group has to come off.
929 if (leader != counter)
930 group_sched_out(leader, cpuctx, ctx);
931 if (leader->attr.pinned) {
932 update_group_times(leader);
933 leader->state = PERF_COUNTER_STATE_ERROR;
937 unlock:
938 spin_unlock(&ctx->lock);
942 * Enable a counter.
944 * If counter->ctx is a cloned context, callers must make sure that
945 * every task struct that counter->ctx->task could possibly point to
946 * remains valid. This condition is satisfied when called through
947 * perf_counter_for_each_child or perf_counter_for_each as described
948 * for perf_counter_disable.
950 static void perf_counter_enable(struct perf_counter *counter)
952 struct perf_counter_context *ctx = counter->ctx;
953 struct task_struct *task = ctx->task;
955 if (!task) {
957 * Enable the counter on the cpu that it's on
959 smp_call_function_single(counter->cpu, __perf_counter_enable,
960 counter, 1);
961 return;
964 spin_lock_irq(&ctx->lock);
965 if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
966 goto out;
969 * If the counter is in error state, clear that first.
970 * That way, if we see the counter in error state below, we
971 * know that it has gone back into error state, as distinct
972 * from the task having been scheduled away before the
973 * cross-call arrived.
975 if (counter->state == PERF_COUNTER_STATE_ERROR)
976 counter->state = PERF_COUNTER_STATE_OFF;
978 retry:
979 spin_unlock_irq(&ctx->lock);
980 task_oncpu_function_call(task, __perf_counter_enable, counter);
982 spin_lock_irq(&ctx->lock);
985 * If the context is active and the counter is still off,
986 * we need to retry the cross-call.
988 if (ctx->is_active && counter->state == PERF_COUNTER_STATE_OFF)
989 goto retry;
992 * Since we have the lock this context can't be scheduled
993 * in, so we can change the state safely.
995 if (counter->state == PERF_COUNTER_STATE_OFF)
996 __perf_counter_mark_enabled(counter, ctx);
998 out:
999 spin_unlock_irq(&ctx->lock);
1002 static int perf_counter_refresh(struct perf_counter *counter, int refresh)
1005 * not supported on inherited counters
1007 if (counter->attr.inherit)
1008 return -EINVAL;
1010 atomic_add(refresh, &counter->event_limit);
1011 perf_counter_enable(counter);
1013 return 0;
1016 void __perf_counter_sched_out(struct perf_counter_context *ctx,
1017 struct perf_cpu_context *cpuctx)
1019 struct perf_counter *counter;
1021 spin_lock(&ctx->lock);
1022 ctx->is_active = 0;
1023 if (likely(!ctx->nr_counters))
1024 goto out;
1025 update_context_time(ctx);
1027 perf_disable();
1028 if (ctx->nr_active) {
1029 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
1030 if (counter != counter->group_leader)
1031 counter_sched_out(counter, cpuctx, ctx);
1032 else
1033 group_sched_out(counter, cpuctx, ctx);
1036 perf_enable();
1037 out:
1038 spin_unlock(&ctx->lock);
1042 * Test whether two contexts are equivalent, i.e. whether they
1043 * have both been cloned from the same version of the same context
1044 * and they both have the same number of enabled counters.
1045 * If the number of enabled counters is the same, then the set
1046 * of enabled counters should be the same, because these are both
1047 * inherited contexts, therefore we can't access individual counters
1048 * in them directly with an fd; we can only enable/disable all
1049 * counters via prctl, or enable/disable all counters in a family
1050 * via ioctl, which will have the same effect on both contexts.
1052 static int context_equiv(struct perf_counter_context *ctx1,
1053 struct perf_counter_context *ctx2)
1055 return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx
1056 && ctx1->parent_gen == ctx2->parent_gen
1057 && !ctx1->pin_count && !ctx2->pin_count;
1060 static void __perf_counter_read(void *counter);
1062 static void __perf_counter_sync_stat(struct perf_counter *counter,
1063 struct perf_counter *next_counter)
1065 u64 value;
1067 if (!counter->attr.inherit_stat)
1068 return;
1071 * Update the counter value, we cannot use perf_counter_read()
1072 * because we're in the middle of a context switch and have IRQs
1073 * disabled, which upsets smp_call_function_single(), however
1074 * we know the counter must be on the current CPU, therefore we
1075 * don't need to use it.
1077 switch (counter->state) {
1078 case PERF_COUNTER_STATE_ACTIVE:
1079 __perf_counter_read(counter);
1080 break;
1082 case PERF_COUNTER_STATE_INACTIVE:
1083 update_counter_times(counter);
1084 break;
1086 default:
1087 break;
1091 * In order to keep per-task stats reliable we need to flip the counter
1092 * values when we flip the contexts.
1094 value = atomic64_read(&next_counter->count);
1095 value = atomic64_xchg(&counter->count, value);
1096 atomic64_set(&next_counter->count, value);
1098 swap(counter->total_time_enabled, next_counter->total_time_enabled);
1099 swap(counter->total_time_running, next_counter->total_time_running);
1102 * Since we swizzled the values, update the user visible data too.
1104 perf_counter_update_userpage(counter);
1105 perf_counter_update_userpage(next_counter);
1108 #define list_next_entry(pos, member) \
1109 list_entry(pos->member.next, typeof(*pos), member)
1111 static void perf_counter_sync_stat(struct perf_counter_context *ctx,
1112 struct perf_counter_context *next_ctx)
1114 struct perf_counter *counter, *next_counter;
1116 if (!ctx->nr_stat)
1117 return;
1119 counter = list_first_entry(&ctx->event_list,
1120 struct perf_counter, event_entry);
1122 next_counter = list_first_entry(&next_ctx->event_list,
1123 struct perf_counter, event_entry);
1125 while (&counter->event_entry != &ctx->event_list &&
1126 &next_counter->event_entry != &next_ctx->event_list) {
1128 __perf_counter_sync_stat(counter, next_counter);
1130 counter = list_next_entry(counter, event_entry);
1131 next_counter = list_next_entry(next_counter, event_entry);
1136 * Called from scheduler to remove the counters of the current task,
1137 * with interrupts disabled.
1139 * We stop each counter and update the counter value in counter->count.
1141 * This does not protect us against NMI, but disable()
1142 * sets the disabled bit in the control field of counter _before_
1143 * accessing the counter control register. If a NMI hits, then it will
1144 * not restart the counter.
1146 void perf_counter_task_sched_out(struct task_struct *task,
1147 struct task_struct *next, int cpu)
1149 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
1150 struct perf_counter_context *ctx = task->perf_counter_ctxp;
1151 struct perf_counter_context *next_ctx;
1152 struct perf_counter_context *parent;
1153 struct pt_regs *regs;
1154 int do_switch = 1;
1156 regs = task_pt_regs(task);
1157 perf_swcounter_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, regs, 0);
1159 if (likely(!ctx || !cpuctx->task_ctx))
1160 return;
1162 update_context_time(ctx);
1164 rcu_read_lock();
1165 parent = rcu_dereference(ctx->parent_ctx);
1166 next_ctx = next->perf_counter_ctxp;
1167 if (parent && next_ctx &&
1168 rcu_dereference(next_ctx->parent_ctx) == parent) {
1170 * Looks like the two contexts are clones, so we might be
1171 * able to optimize the context switch. We lock both
1172 * contexts and check that they are clones under the
1173 * lock (including re-checking that neither has been
1174 * uncloned in the meantime). It doesn't matter which
1175 * order we take the locks because no other cpu could
1176 * be trying to lock both of these tasks.
1178 spin_lock(&ctx->lock);
1179 spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
1180 if (context_equiv(ctx, next_ctx)) {
1182 * XXX do we need a memory barrier of sorts
1183 * wrt to rcu_dereference() of perf_counter_ctxp
1185 task->perf_counter_ctxp = next_ctx;
1186 next->perf_counter_ctxp = ctx;
1187 ctx->task = next;
1188 next_ctx->task = task;
1189 do_switch = 0;
1191 perf_counter_sync_stat(ctx, next_ctx);
1193 spin_unlock(&next_ctx->lock);
1194 spin_unlock(&ctx->lock);
1196 rcu_read_unlock();
1198 if (do_switch) {
1199 __perf_counter_sched_out(ctx, cpuctx);
1200 cpuctx->task_ctx = NULL;
1205 * Called with IRQs disabled
1207 static void __perf_counter_task_sched_out(struct perf_counter_context *ctx)
1209 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1211 if (!cpuctx->task_ctx)
1212 return;
1214 if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
1215 return;
1217 __perf_counter_sched_out(ctx, cpuctx);
1218 cpuctx->task_ctx = NULL;
1222 * Called with IRQs disabled
1224 static void perf_counter_cpu_sched_out(struct perf_cpu_context *cpuctx)
1226 __perf_counter_sched_out(&cpuctx->ctx, cpuctx);
1229 static void
1230 __perf_counter_sched_in(struct perf_counter_context *ctx,
1231 struct perf_cpu_context *cpuctx, int cpu)
1233 struct perf_counter *counter;
1234 int can_add_hw = 1;
1236 spin_lock(&ctx->lock);
1237 ctx->is_active = 1;
1238 if (likely(!ctx->nr_counters))
1239 goto out;
1241 ctx->timestamp = perf_clock();
1243 perf_disable();
1246 * First go through the list and put on any pinned groups
1247 * in order to give them the best chance of going on.
1249 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
1250 if (counter->state <= PERF_COUNTER_STATE_OFF ||
1251 !counter->attr.pinned)
1252 continue;
1253 if (counter->cpu != -1 && counter->cpu != cpu)
1254 continue;
1256 if (counter != counter->group_leader)
1257 counter_sched_in(counter, cpuctx, ctx, cpu);
1258 else {
1259 if (group_can_go_on(counter, cpuctx, 1))
1260 group_sched_in(counter, cpuctx, ctx, cpu);
1264 * If this pinned group hasn't been scheduled,
1265 * put it in error state.
1267 if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
1268 update_group_times(counter);
1269 counter->state = PERF_COUNTER_STATE_ERROR;
1273 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
1275 * Ignore counters in OFF or ERROR state, and
1276 * ignore pinned counters since we did them already.
1278 if (counter->state <= PERF_COUNTER_STATE_OFF ||
1279 counter->attr.pinned)
1280 continue;
1283 * Listen to the 'cpu' scheduling filter constraint
1284 * of counters:
1286 if (counter->cpu != -1 && counter->cpu != cpu)
1287 continue;
1289 if (counter != counter->group_leader) {
1290 if (counter_sched_in(counter, cpuctx, ctx, cpu))
1291 can_add_hw = 0;
1292 } else {
1293 if (group_can_go_on(counter, cpuctx, can_add_hw)) {
1294 if (group_sched_in(counter, cpuctx, ctx, cpu))
1295 can_add_hw = 0;
1299 perf_enable();
1300 out:
1301 spin_unlock(&ctx->lock);
1305 * Called from scheduler to add the counters of the current task
1306 * with interrupts disabled.
1308 * We restore the counter value and then enable it.
1310 * This does not protect us against NMI, but enable()
1311 * sets the enabled bit in the control field of counter _before_
1312 * accessing the counter control register. If a NMI hits, then it will
1313 * keep the counter running.
1315 void perf_counter_task_sched_in(struct task_struct *task, int cpu)
1317 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
1318 struct perf_counter_context *ctx = task->perf_counter_ctxp;
1320 if (likely(!ctx))
1321 return;
1322 if (cpuctx->task_ctx == ctx)
1323 return;
1324 __perf_counter_sched_in(ctx, cpuctx, cpu);
1325 cpuctx->task_ctx = ctx;
1328 static void perf_counter_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu)
1330 struct perf_counter_context *ctx = &cpuctx->ctx;
1332 __perf_counter_sched_in(ctx, cpuctx, cpu);
1335 #define MAX_INTERRUPTS (~0ULL)
1337 static void perf_log_throttle(struct perf_counter *counter, int enable);
1339 static void perf_adjust_period(struct perf_counter *counter, u64 events)
1341 struct hw_perf_counter *hwc = &counter->hw;
1342 u64 period, sample_period;
1343 s64 delta;
1345 events *= hwc->sample_period;
1346 period = div64_u64(events, counter->attr.sample_freq);
1348 delta = (s64)(period - hwc->sample_period);
1349 delta = (delta + 7) / 8; /* low pass filter */
1351 sample_period = hwc->sample_period + delta;
1353 if (!sample_period)
1354 sample_period = 1;
1356 hwc->sample_period = sample_period;
1359 static void perf_ctx_adjust_freq(struct perf_counter_context *ctx)
1361 struct perf_counter *counter;
1362 struct hw_perf_counter *hwc;
1363 u64 interrupts, freq;
1365 spin_lock(&ctx->lock);
1366 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
1367 if (counter->state != PERF_COUNTER_STATE_ACTIVE)
1368 continue;
1370 hwc = &counter->hw;
1372 interrupts = hwc->interrupts;
1373 hwc->interrupts = 0;
1376 * unthrottle counters on the tick
1378 if (interrupts == MAX_INTERRUPTS) {
1379 perf_log_throttle(counter, 1);
1380 counter->pmu->unthrottle(counter);
1381 interrupts = 2*sysctl_perf_counter_sample_rate/HZ;
1384 if (!counter->attr.freq || !counter->attr.sample_freq)
1385 continue;
1388 * if the specified freq < HZ then we need to skip ticks
1390 if (counter->attr.sample_freq < HZ) {
1391 freq = counter->attr.sample_freq;
1393 hwc->freq_count += freq;
1394 hwc->freq_interrupts += interrupts;
1396 if (hwc->freq_count < HZ)
1397 continue;
1399 interrupts = hwc->freq_interrupts;
1400 hwc->freq_interrupts = 0;
1401 hwc->freq_count -= HZ;
1402 } else
1403 freq = HZ;
1405 perf_adjust_period(counter, freq * interrupts);
1408 * In order to avoid being stalled by an (accidental) huge
1409 * sample period, force reset the sample period if we didn't
1410 * get any events in this freq period.
1412 if (!interrupts) {
1413 perf_disable();
1414 counter->pmu->disable(counter);
1415 atomic64_set(&hwc->period_left, 0);
1416 counter->pmu->enable(counter);
1417 perf_enable();
1420 spin_unlock(&ctx->lock);
1424 * Round-robin a context's counters:
1426 static void rotate_ctx(struct perf_counter_context *ctx)
1428 struct perf_counter *counter;
1430 if (!ctx->nr_counters)
1431 return;
1433 spin_lock(&ctx->lock);
1435 * Rotate the first entry last (works just fine for group counters too):
1437 perf_disable();
1438 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
1439 list_move_tail(&counter->list_entry, &ctx->counter_list);
1440 break;
1442 perf_enable();
1444 spin_unlock(&ctx->lock);
1447 void perf_counter_task_tick(struct task_struct *curr, int cpu)
1449 struct perf_cpu_context *cpuctx;
1450 struct perf_counter_context *ctx;
1452 if (!atomic_read(&nr_counters))
1453 return;
1455 cpuctx = &per_cpu(perf_cpu_context, cpu);
1456 ctx = curr->perf_counter_ctxp;
1458 perf_ctx_adjust_freq(&cpuctx->ctx);
1459 if (ctx)
1460 perf_ctx_adjust_freq(ctx);
1462 perf_counter_cpu_sched_out(cpuctx);
1463 if (ctx)
1464 __perf_counter_task_sched_out(ctx);
1466 rotate_ctx(&cpuctx->ctx);
1467 if (ctx)
1468 rotate_ctx(ctx);
1470 perf_counter_cpu_sched_in(cpuctx, cpu);
1471 if (ctx)
1472 perf_counter_task_sched_in(curr, cpu);
1476 * Enable all of a task's counters that have been marked enable-on-exec.
1477 * This expects task == current.
1479 static void perf_counter_enable_on_exec(struct task_struct *task)
1481 struct perf_counter_context *ctx;
1482 struct perf_counter *counter;
1483 unsigned long flags;
1484 int enabled = 0;
1486 local_irq_save(flags);
1487 ctx = task->perf_counter_ctxp;
1488 if (!ctx || !ctx->nr_counters)
1489 goto out;
1491 __perf_counter_task_sched_out(ctx);
1493 spin_lock(&ctx->lock);
1495 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
1496 if (!counter->attr.enable_on_exec)
1497 continue;
1498 counter->attr.enable_on_exec = 0;
1499 if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
1500 continue;
1501 __perf_counter_mark_enabled(counter, ctx);
1502 enabled = 1;
1506 * Unclone this context if we enabled any counter.
1508 if (enabled)
1509 unclone_ctx(ctx);
1511 spin_unlock(&ctx->lock);
1513 perf_counter_task_sched_in(task, smp_processor_id());
1514 out:
1515 local_irq_restore(flags);
1519 * Cross CPU call to read the hardware counter
1521 static void __perf_counter_read(void *info)
1523 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1524 struct perf_counter *counter = info;
1525 struct perf_counter_context *ctx = counter->ctx;
1526 unsigned long flags;
1529 * If this is a task context, we need to check whether it is
1530 * the current task context of this cpu. If not it has been
1531 * scheduled out before the smp call arrived. In that case
1532 * counter->count would have been updated to a recent sample
1533 * when the counter was scheduled out.
1535 if (ctx->task && cpuctx->task_ctx != ctx)
1536 return;
1538 local_irq_save(flags);
1539 if (ctx->is_active)
1540 update_context_time(ctx);
1541 counter->pmu->read(counter);
1542 update_counter_times(counter);
1543 local_irq_restore(flags);
1546 static u64 perf_counter_read(struct perf_counter *counter)
1549 * If counter is enabled and currently active on a CPU, update the
1550 * value in the counter structure:
1552 if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
1553 smp_call_function_single(counter->oncpu,
1554 __perf_counter_read, counter, 1);
1555 } else if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
1556 update_counter_times(counter);
1559 return atomic64_read(&counter->count);
1563 * Initialize the perf_counter context in a task_struct:
1565 static void
1566 __perf_counter_init_context(struct perf_counter_context *ctx,
1567 struct task_struct *task)
1569 memset(ctx, 0, sizeof(*ctx));
1570 spin_lock_init(&ctx->lock);
1571 mutex_init(&ctx->mutex);
1572 INIT_LIST_HEAD(&ctx->counter_list);
1573 INIT_LIST_HEAD(&ctx->event_list);
1574 atomic_set(&ctx->refcount, 1);
1575 ctx->task = task;
1578 static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
1580 struct perf_counter_context *ctx;
1581 struct perf_cpu_context *cpuctx;
1582 struct task_struct *task;
1583 unsigned long flags;
1584 int err;
1587 * If cpu is not a wildcard then this is a percpu counter:
1589 if (cpu != -1) {
1590 /* Must be root to operate on a CPU counter: */
1591 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
1592 return ERR_PTR(-EACCES);
1594 if (cpu < 0 || cpu > num_possible_cpus())
1595 return ERR_PTR(-EINVAL);
1598 * We could be clever and allow to attach a counter to an
1599 * offline CPU and activate it when the CPU comes up, but
1600 * that's for later.
1602 if (!cpu_isset(cpu, cpu_online_map))
1603 return ERR_PTR(-ENODEV);
1605 cpuctx = &per_cpu(perf_cpu_context, cpu);
1606 ctx = &cpuctx->ctx;
1607 get_ctx(ctx);
1609 return ctx;
1612 rcu_read_lock();
1613 if (!pid)
1614 task = current;
1615 else
1616 task = find_task_by_vpid(pid);
1617 if (task)
1618 get_task_struct(task);
1619 rcu_read_unlock();
1621 if (!task)
1622 return ERR_PTR(-ESRCH);
1625 * Can't attach counters to a dying task.
1627 err = -ESRCH;
1628 if (task->flags & PF_EXITING)
1629 goto errout;
1631 /* Reuse ptrace permission checks for now. */
1632 err = -EACCES;
1633 if (!ptrace_may_access(task, PTRACE_MODE_READ))
1634 goto errout;
1636 retry:
1637 ctx = perf_lock_task_context(task, &flags);
1638 if (ctx) {
1639 unclone_ctx(ctx);
1640 spin_unlock_irqrestore(&ctx->lock, flags);
1643 if (!ctx) {
1644 ctx = kmalloc(sizeof(struct perf_counter_context), GFP_KERNEL);
1645 err = -ENOMEM;
1646 if (!ctx)
1647 goto errout;
1648 __perf_counter_init_context(ctx, task);
1649 get_ctx(ctx);
1650 if (cmpxchg(&task->perf_counter_ctxp, NULL, ctx)) {
1652 * We raced with some other task; use
1653 * the context they set.
1655 kfree(ctx);
1656 goto retry;
1658 get_task_struct(task);
1661 put_task_struct(task);
1662 return ctx;
1664 errout:
1665 put_task_struct(task);
1666 return ERR_PTR(err);
1669 static void free_counter_rcu(struct rcu_head *head)
1671 struct perf_counter *counter;
1673 counter = container_of(head, struct perf_counter, rcu_head);
1674 if (counter->ns)
1675 put_pid_ns(counter->ns);
1676 kfree(counter);
1679 static void perf_pending_sync(struct perf_counter *counter);
1681 static void free_counter(struct perf_counter *counter)
1683 perf_pending_sync(counter);
1685 if (!counter->parent) {
1686 atomic_dec(&nr_counters);
1687 if (counter->attr.mmap)
1688 atomic_dec(&nr_mmap_counters);
1689 if (counter->attr.comm)
1690 atomic_dec(&nr_comm_counters);
1691 if (counter->attr.task)
1692 atomic_dec(&nr_task_counters);
1695 if (counter->destroy)
1696 counter->destroy(counter);
1698 put_ctx(counter->ctx);
1699 call_rcu(&counter->rcu_head, free_counter_rcu);
1703 * Called when the last reference to the file is gone.
1705 static int perf_release(struct inode *inode, struct file *file)
1707 struct perf_counter *counter = file->private_data;
1708 struct perf_counter_context *ctx = counter->ctx;
1710 file->private_data = NULL;
1712 WARN_ON_ONCE(ctx->parent_ctx);
1713 mutex_lock(&ctx->mutex);
1714 perf_counter_remove_from_context(counter);
1715 mutex_unlock(&ctx->mutex);
1717 mutex_lock(&counter->owner->perf_counter_mutex);
1718 list_del_init(&counter->owner_entry);
1719 mutex_unlock(&counter->owner->perf_counter_mutex);
1720 put_task_struct(counter->owner);
1722 free_counter(counter);
1724 return 0;
1727 static int perf_counter_read_size(struct perf_counter *counter)
1729 int entry = sizeof(u64); /* value */
1730 int size = 0;
1731 int nr = 1;
1733 if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
1734 size += sizeof(u64);
1736 if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1737 size += sizeof(u64);
1739 if (counter->attr.read_format & PERF_FORMAT_ID)
1740 entry += sizeof(u64);
1742 if (counter->attr.read_format & PERF_FORMAT_GROUP) {
1743 nr += counter->group_leader->nr_siblings;
1744 size += sizeof(u64);
1747 size += entry * nr;
1749 return size;
1752 static u64 perf_counter_read_value(struct perf_counter *counter)
1754 struct perf_counter *child;
1755 u64 total = 0;
1757 total += perf_counter_read(counter);
1758 list_for_each_entry(child, &counter->child_list, child_list)
1759 total += perf_counter_read(child);
1761 return total;
1764 static int perf_counter_read_entry(struct perf_counter *counter,
1765 u64 read_format, char __user *buf)
1767 int n = 0, count = 0;
1768 u64 values[2];
1770 values[n++] = perf_counter_read_value(counter);
1771 if (read_format & PERF_FORMAT_ID)
1772 values[n++] = primary_counter_id(counter);
1774 count = n * sizeof(u64);
1776 if (copy_to_user(buf, values, count))
1777 return -EFAULT;
1779 return count;
1782 static int perf_counter_read_group(struct perf_counter *counter,
1783 u64 read_format, char __user *buf)
1785 struct perf_counter *leader = counter->group_leader, *sub;
1786 int n = 0, size = 0, err = -EFAULT;
1787 u64 values[3];
1789 values[n++] = 1 + leader->nr_siblings;
1790 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
1791 values[n++] = leader->total_time_enabled +
1792 atomic64_read(&leader->child_total_time_enabled);
1794 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
1795 values[n++] = leader->total_time_running +
1796 atomic64_read(&leader->child_total_time_running);
1799 size = n * sizeof(u64);
1801 if (copy_to_user(buf, values, size))
1802 return -EFAULT;
1804 err = perf_counter_read_entry(leader, read_format, buf + size);
1805 if (err < 0)
1806 return err;
1808 size += err;
1810 list_for_each_entry(sub, &leader->sibling_list, list_entry) {
1811 err = perf_counter_read_entry(sub, read_format,
1812 buf + size);
1813 if (err < 0)
1814 return err;
1816 size += err;
1819 return size;
1822 static int perf_counter_read_one(struct perf_counter *counter,
1823 u64 read_format, char __user *buf)
1825 u64 values[4];
1826 int n = 0;
1828 values[n++] = perf_counter_read_value(counter);
1829 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
1830 values[n++] = counter->total_time_enabled +
1831 atomic64_read(&counter->child_total_time_enabled);
1833 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
1834 values[n++] = counter->total_time_running +
1835 atomic64_read(&counter->child_total_time_running);
1837 if (read_format & PERF_FORMAT_ID)
1838 values[n++] = primary_counter_id(counter);
1840 if (copy_to_user(buf, values, n * sizeof(u64)))
1841 return -EFAULT;
1843 return n * sizeof(u64);
1847 * Read the performance counter - simple non blocking version for now
1849 static ssize_t
1850 perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
1852 u64 read_format = counter->attr.read_format;
1853 int ret;
1856 * Return end-of-file for a read on a counter that is in
1857 * error state (i.e. because it was pinned but it couldn't be
1858 * scheduled on to the CPU at some point).
1860 if (counter->state == PERF_COUNTER_STATE_ERROR)
1861 return 0;
1863 if (count < perf_counter_read_size(counter))
1864 return -ENOSPC;
1866 WARN_ON_ONCE(counter->ctx->parent_ctx);
1867 mutex_lock(&counter->child_mutex);
1868 if (read_format & PERF_FORMAT_GROUP)
1869 ret = perf_counter_read_group(counter, read_format, buf);
1870 else
1871 ret = perf_counter_read_one(counter, read_format, buf);
1872 mutex_unlock(&counter->child_mutex);
1874 return ret;
1877 static ssize_t
1878 perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
1880 struct perf_counter *counter = file->private_data;
1882 return perf_read_hw(counter, buf, count);
1885 static unsigned int perf_poll(struct file *file, poll_table *wait)
1887 struct perf_counter *counter = file->private_data;
1888 struct perf_mmap_data *data;
1889 unsigned int events = POLL_HUP;
1891 rcu_read_lock();
1892 data = rcu_dereference(counter->data);
1893 if (data)
1894 events = atomic_xchg(&data->poll, 0);
1895 rcu_read_unlock();
1897 poll_wait(file, &counter->waitq, wait);
1899 return events;
1902 static void perf_counter_reset(struct perf_counter *counter)
1904 (void)perf_counter_read(counter);
1905 atomic64_set(&counter->count, 0);
1906 perf_counter_update_userpage(counter);
1910 * Holding the top-level counter's child_mutex means that any
1911 * descendant process that has inherited this counter will block
1912 * in sync_child_counter if it goes to exit, thus satisfying the
1913 * task existence requirements of perf_counter_enable/disable.
1915 static void perf_counter_for_each_child(struct perf_counter *counter,
1916 void (*func)(struct perf_counter *))
1918 struct perf_counter *child;
1920 WARN_ON_ONCE(counter->ctx->parent_ctx);
1921 mutex_lock(&counter->child_mutex);
1922 func(counter);
1923 list_for_each_entry(child, &counter->child_list, child_list)
1924 func(child);
1925 mutex_unlock(&counter->child_mutex);
1928 static void perf_counter_for_each(struct perf_counter *counter,
1929 void (*func)(struct perf_counter *))
1931 struct perf_counter_context *ctx = counter->ctx;
1932 struct perf_counter *sibling;
1934 WARN_ON_ONCE(ctx->parent_ctx);
1935 mutex_lock(&ctx->mutex);
1936 counter = counter->group_leader;
1938 perf_counter_for_each_child(counter, func);
1939 func(counter);
1940 list_for_each_entry(sibling, &counter->sibling_list, list_entry)
1941 perf_counter_for_each_child(counter, func);
1942 mutex_unlock(&ctx->mutex);
1945 static int perf_counter_period(struct perf_counter *counter, u64 __user *arg)
1947 struct perf_counter_context *ctx = counter->ctx;
1948 unsigned long size;
1949 int ret = 0;
1950 u64 value;
1952 if (!counter->attr.sample_period)
1953 return -EINVAL;
1955 size = copy_from_user(&value, arg, sizeof(value));
1956 if (size != sizeof(value))
1957 return -EFAULT;
1959 if (!value)
1960 return -EINVAL;
1962 spin_lock_irq(&ctx->lock);
1963 if (counter->attr.freq) {
1964 if (value > sysctl_perf_counter_sample_rate) {
1965 ret = -EINVAL;
1966 goto unlock;
1969 counter->attr.sample_freq = value;
1970 } else {
1971 counter->attr.sample_period = value;
1972 counter->hw.sample_period = value;
1974 unlock:
1975 spin_unlock_irq(&ctx->lock);
1977 return ret;
1980 static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
1982 struct perf_counter *counter = file->private_data;
1983 void (*func)(struct perf_counter *);
1984 u32 flags = arg;
1986 switch (cmd) {
1987 case PERF_COUNTER_IOC_ENABLE:
1988 func = perf_counter_enable;
1989 break;
1990 case PERF_COUNTER_IOC_DISABLE:
1991 func = perf_counter_disable;
1992 break;
1993 case PERF_COUNTER_IOC_RESET:
1994 func = perf_counter_reset;
1995 break;
1997 case PERF_COUNTER_IOC_REFRESH:
1998 return perf_counter_refresh(counter, arg);
2000 case PERF_COUNTER_IOC_PERIOD:
2001 return perf_counter_period(counter, (u64 __user *)arg);
2003 default:
2004 return -ENOTTY;
2007 if (flags & PERF_IOC_FLAG_GROUP)
2008 perf_counter_for_each(counter, func);
2009 else
2010 perf_counter_for_each_child(counter, func);
2012 return 0;
2015 int perf_counter_task_enable(void)
2017 struct perf_counter *counter;
2019 mutex_lock(&current->perf_counter_mutex);
2020 list_for_each_entry(counter, &current->perf_counter_list, owner_entry)
2021 perf_counter_for_each_child(counter, perf_counter_enable);
2022 mutex_unlock(&current->perf_counter_mutex);
2024 return 0;
2027 int perf_counter_task_disable(void)
2029 struct perf_counter *counter;
2031 mutex_lock(&current->perf_counter_mutex);
2032 list_for_each_entry(counter, &current->perf_counter_list, owner_entry)
2033 perf_counter_for_each_child(counter, perf_counter_disable);
2034 mutex_unlock(&current->perf_counter_mutex);
2036 return 0;
2039 #ifndef PERF_COUNTER_INDEX_OFFSET
2040 # define PERF_COUNTER_INDEX_OFFSET 0
2041 #endif
2043 static int perf_counter_index(struct perf_counter *counter)
2045 if (counter->state != PERF_COUNTER_STATE_ACTIVE)
2046 return 0;
2048 return counter->hw.idx + 1 - PERF_COUNTER_INDEX_OFFSET;
2052 * Callers need to ensure there can be no nesting of this function, otherwise
2053 * the seqlock logic goes bad. We can not serialize this because the arch
2054 * code calls this from NMI context.
2056 void perf_counter_update_userpage(struct perf_counter *counter)
2058 struct perf_counter_mmap_page *userpg;
2059 struct perf_mmap_data *data;
2061 rcu_read_lock();
2062 data = rcu_dereference(counter->data);
2063 if (!data)
2064 goto unlock;
2066 userpg = data->user_page;
2069 * Disable preemption so as to not let the corresponding user-space
2070 * spin too long if we get preempted.
2072 preempt_disable();
2073 ++userpg->lock;
2074 barrier();
2075 userpg->index = perf_counter_index(counter);
2076 userpg->offset = atomic64_read(&counter->count);
2077 if (counter->state == PERF_COUNTER_STATE_ACTIVE)
2078 userpg->offset -= atomic64_read(&counter->hw.prev_count);
2080 userpg->time_enabled = counter->total_time_enabled +
2081 atomic64_read(&counter->child_total_time_enabled);
2083 userpg->time_running = counter->total_time_running +
2084 atomic64_read(&counter->child_total_time_running);
2086 barrier();
2087 ++userpg->lock;
2088 preempt_enable();
2089 unlock:
2090 rcu_read_unlock();
2093 static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
2095 struct perf_counter *counter = vma->vm_file->private_data;
2096 struct perf_mmap_data *data;
2097 int ret = VM_FAULT_SIGBUS;
2099 if (vmf->flags & FAULT_FLAG_MKWRITE) {
2100 if (vmf->pgoff == 0)
2101 ret = 0;
2102 return ret;
2105 rcu_read_lock();
2106 data = rcu_dereference(counter->data);
2107 if (!data)
2108 goto unlock;
2110 if (vmf->pgoff == 0) {
2111 vmf->page = virt_to_page(data->user_page);
2112 } else {
2113 int nr = vmf->pgoff - 1;
2115 if ((unsigned)nr > data->nr_pages)
2116 goto unlock;
2118 if (vmf->flags & FAULT_FLAG_WRITE)
2119 goto unlock;
2121 vmf->page = virt_to_page(data->data_pages[nr]);
2124 get_page(vmf->page);
2125 vmf->page->mapping = vma->vm_file->f_mapping;
2126 vmf->page->index = vmf->pgoff;
2128 ret = 0;
2129 unlock:
2130 rcu_read_unlock();
2132 return ret;
2135 static int perf_mmap_data_alloc(struct perf_counter *counter, int nr_pages)
2137 struct perf_mmap_data *data;
2138 unsigned long size;
2139 int i;
2141 WARN_ON(atomic_read(&counter->mmap_count));
2143 size = sizeof(struct perf_mmap_data);
2144 size += nr_pages * sizeof(void *);
2146 data = kzalloc(size, GFP_KERNEL);
2147 if (!data)
2148 goto fail;
2150 data->user_page = (void *)get_zeroed_page(GFP_KERNEL);
2151 if (!data->user_page)
2152 goto fail_user_page;
2154 for (i = 0; i < nr_pages; i++) {
2155 data->data_pages[i] = (void *)get_zeroed_page(GFP_KERNEL);
2156 if (!data->data_pages[i])
2157 goto fail_data_pages;
2160 data->nr_pages = nr_pages;
2161 atomic_set(&data->lock, -1);
2163 rcu_assign_pointer(counter->data, data);
2165 return 0;
2167 fail_data_pages:
2168 for (i--; i >= 0; i--)
2169 free_page((unsigned long)data->data_pages[i]);
2171 free_page((unsigned long)data->user_page);
2173 fail_user_page:
2174 kfree(data);
2176 fail:
2177 return -ENOMEM;
2180 static void perf_mmap_free_page(unsigned long addr)
2182 struct page *page = virt_to_page((void *)addr);
2184 page->mapping = NULL;
2185 __free_page(page);
2188 static void __perf_mmap_data_free(struct rcu_head *rcu_head)
2190 struct perf_mmap_data *data;
2191 int i;
2193 data = container_of(rcu_head, struct perf_mmap_data, rcu_head);
2195 perf_mmap_free_page((unsigned long)data->user_page);
2196 for (i = 0; i < data->nr_pages; i++)
2197 perf_mmap_free_page((unsigned long)data->data_pages[i]);
2199 kfree(data);
2202 static void perf_mmap_data_free(struct perf_counter *counter)
2204 struct perf_mmap_data *data = counter->data;
2206 WARN_ON(atomic_read(&counter->mmap_count));
2208 rcu_assign_pointer(counter->data, NULL);
2209 call_rcu(&data->rcu_head, __perf_mmap_data_free);
2212 static void perf_mmap_open(struct vm_area_struct *vma)
2214 struct perf_counter *counter = vma->vm_file->private_data;
2216 atomic_inc(&counter->mmap_count);
2219 static void perf_mmap_close(struct vm_area_struct *vma)
2221 struct perf_counter *counter = vma->vm_file->private_data;
2223 WARN_ON_ONCE(counter->ctx->parent_ctx);
2224 if (atomic_dec_and_mutex_lock(&counter->mmap_count, &counter->mmap_mutex)) {
2225 struct user_struct *user = current_user();
2227 atomic_long_sub(counter->data->nr_pages + 1, &user->locked_vm);
2228 vma->vm_mm->locked_vm -= counter->data->nr_locked;
2229 perf_mmap_data_free(counter);
2230 mutex_unlock(&counter->mmap_mutex);
2234 static struct vm_operations_struct perf_mmap_vmops = {
2235 .open = perf_mmap_open,
2236 .close = perf_mmap_close,
2237 .fault = perf_mmap_fault,
2238 .page_mkwrite = perf_mmap_fault,
2241 static int perf_mmap(struct file *file, struct vm_area_struct *vma)
2243 struct perf_counter *counter = file->private_data;
2244 unsigned long user_locked, user_lock_limit;
2245 struct user_struct *user = current_user();
2246 unsigned long locked, lock_limit;
2247 unsigned long vma_size;
2248 unsigned long nr_pages;
2249 long user_extra, extra;
2250 int ret = 0;
2252 if (!(vma->vm_flags & VM_SHARED))
2253 return -EINVAL;
2255 vma_size = vma->vm_end - vma->vm_start;
2256 nr_pages = (vma_size / PAGE_SIZE) - 1;
2259 * If we have data pages ensure they're a power-of-two number, so we
2260 * can do bitmasks instead of modulo.
2262 if (nr_pages != 0 && !is_power_of_2(nr_pages))
2263 return -EINVAL;
2265 if (vma_size != PAGE_SIZE * (1 + nr_pages))
2266 return -EINVAL;
2268 if (vma->vm_pgoff != 0)
2269 return -EINVAL;
2271 WARN_ON_ONCE(counter->ctx->parent_ctx);
2272 mutex_lock(&counter->mmap_mutex);
2273 if (atomic_inc_not_zero(&counter->mmap_count)) {
2274 if (nr_pages != counter->data->nr_pages)
2275 ret = -EINVAL;
2276 goto unlock;
2279 user_extra = nr_pages + 1;
2280 user_lock_limit = sysctl_perf_counter_mlock >> (PAGE_SHIFT - 10);
2283 * Increase the limit linearly with more CPUs:
2285 user_lock_limit *= num_online_cpus();
2287 user_locked = atomic_long_read(&user->locked_vm) + user_extra;
2289 extra = 0;
2290 if (user_locked > user_lock_limit)
2291 extra = user_locked - user_lock_limit;
2293 lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
2294 lock_limit >>= PAGE_SHIFT;
2295 locked = vma->vm_mm->locked_vm + extra;
2297 if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) {
2298 ret = -EPERM;
2299 goto unlock;
2302 WARN_ON(counter->data);
2303 ret = perf_mmap_data_alloc(counter, nr_pages);
2304 if (ret)
2305 goto unlock;
2307 atomic_set(&counter->mmap_count, 1);
2308 atomic_long_add(user_extra, &user->locked_vm);
2309 vma->vm_mm->locked_vm += extra;
2310 counter->data->nr_locked = extra;
2311 if (vma->vm_flags & VM_WRITE)
2312 counter->data->writable = 1;
2314 unlock:
2315 mutex_unlock(&counter->mmap_mutex);
2317 vma->vm_flags |= VM_RESERVED;
2318 vma->vm_ops = &perf_mmap_vmops;
2320 return ret;
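/*
 * From user-space the checks above mean: map the control page plus a
 * power-of-two number of data pages, MAP_SHARED, at offset 0.  A hedged
 * sketch, with counter_fd an fd returned by sys_perf_counter_open() and
 * 128 data pages picked arbitrarily:
 *
 *	size_t page_size = sysconf(_SC_PAGESIZE);
 *	void *base = mmap(NULL, (1 + 128) * page_size,
 *			  PROT_READ | PROT_WRITE, MAP_SHARED, counter_fd, 0);
 *
 * A private mapping, a non power-of-two data area or a non-zero offset
 * is rejected with -EINVAL above; mapping PROT_WRITE additionally marks
 * the buffer writable so the kernel honours data_tail.
 */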
2323 static int perf_fasync(int fd, struct file *filp, int on)
2325 struct inode *inode = filp->f_path.dentry->d_inode;
2326 struct perf_counter *counter = filp->private_data;
2327 int retval;
2329 mutex_lock(&inode->i_mutex);
2330 retval = fasync_helper(fd, filp, on, &counter->fasync);
2331 mutex_unlock(&inode->i_mutex);
2333 if (retval < 0)
2334 return retval;
2336 return 0;
2339 static const struct file_operations perf_fops = {
2340 .release = perf_release,
2341 .read = perf_read,
2342 .poll = perf_poll,
2343 .unlocked_ioctl = perf_ioctl,
2344 .compat_ioctl = perf_ioctl,
2345 .mmap = perf_mmap,
2346 .fasync = perf_fasync,
2350 * Perf counter wakeup
2352 * If there's data, ensure we set the poll() state and publish everything
2353 * to user-space before waking everybody up.
2356 void perf_counter_wakeup(struct perf_counter *counter)
2358 wake_up_all(&counter->waitq);
2360 if (counter->pending_kill) {
2361 kill_fasync(&counter->fasync, SIGIO, counter->pending_kill);
2362 counter->pending_kill = 0;
2367 * Pending wakeups
2369 * Handle the case where we need to wake up from NMI (or rq->lock) context.
2371 * The NMI bit means we cannot possibly take locks. Therefore, maintain a
2372 * single linked list and use cmpxchg() to add entries lockless.
2375 static void perf_pending_counter(struct perf_pending_entry *entry)
2377 struct perf_counter *counter = container_of(entry,
2378 struct perf_counter, pending);
2380 if (counter->pending_disable) {
2381 counter->pending_disable = 0;
2382 __perf_counter_disable(counter);
2385 if (counter->pending_wakeup) {
2386 counter->pending_wakeup = 0;
2387 perf_counter_wakeup(counter);
2391 #define PENDING_TAIL ((struct perf_pending_entry *)-1UL)
2393 static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_head) = {
2394 PENDING_TAIL,
2397 static void perf_pending_queue(struct perf_pending_entry *entry,
2398 void (*func)(struct perf_pending_entry *))
2400 struct perf_pending_entry **head;
2402 if (cmpxchg(&entry->next, NULL, PENDING_TAIL) != NULL)
2403 return;
2405 entry->func = func;
2407 head = &get_cpu_var(perf_pending_head);
2409 do {
2410 entry->next = *head;
2411 } while (cmpxchg(head, entry->next, entry) != entry->next);
2413 set_perf_counter_pending();
2415 put_cpu_var(perf_pending_head);
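/*
 * Note on the queueing above: an entry whose ->next is NULL is not
 * queued; the first cmpxchg() claims it so the same entry cannot be
 * queued twice, and the loop then pushes it onto the per-cpu stack
 * whose end is marked by PENDING_TAIL.  __perf_pending_run() below
 * xchg()s the whole stack off in one go, walks it and resets ->next to
 * NULL so the entry can be reused.
 */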
2418 static int __perf_pending_run(void)
2420 struct perf_pending_entry *list;
2421 int nr = 0;
2423 list = xchg(&__get_cpu_var(perf_pending_head), PENDING_TAIL);
2424 while (list != PENDING_TAIL) {
2425 void (*func)(struct perf_pending_entry *);
2426 struct perf_pending_entry *entry = list;
2428 list = list->next;
2430 func = entry->func;
2431 entry->next = NULL;
2433 * Ensure we observe the unqueue before we issue the wakeup,
2434 * so that we won't be waiting forever.
2435 * -- see perf_not_pending().
2437 smp_wmb();
2439 func(entry);
2440 nr++;
2443 return nr;
2446 static inline int perf_not_pending(struct perf_counter *counter)
2449 * If we flush on whatever cpu we run, there is a chance we don't
2450 * need to wait.
2452 get_cpu();
2453 __perf_pending_run();
2454 put_cpu();
2457 * Ensure we see the proper queue state before going to sleep
2458 * so that we do not miss the wakeup. -- see __perf_pending_run()
2460 smp_rmb();
2461 return counter->pending.next == NULL;
2464 static void perf_pending_sync(struct perf_counter *counter)
2466 wait_event(counter->waitq, perf_not_pending(counter));
2469 void perf_counter_do_pending(void)
2471 __perf_pending_run();
2475 * Callchain support -- arch specific
2478 __weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
2480 return NULL;
2484 * Output
2487 struct perf_output_handle {
2488 struct perf_counter *counter;
2489 struct perf_mmap_data *data;
2490 unsigned long head;
2491 unsigned long offset;
2492 int nmi;
2493 int sample;
2494 int locked;
2495 unsigned long flags;
2498 static bool perf_output_space(struct perf_mmap_data *data,
2499 unsigned int offset, unsigned int head)
2501 unsigned long tail;
2502 unsigned long mask;
2504 if (!data->writable)
2505 return true;
2507 mask = (data->nr_pages << PAGE_SHIFT) - 1;
2509 * Userspace could choose to issue a mb() before updating the tail
2510 * pointer, so that all reads will be completed before the write is
2511 * issued.
2513 tail = ACCESS_ONCE(data->user_page->data_tail);
2514 smp_rmb();
2516 offset = (offset - tail) & mask;
2517 head = (head - tail) & mask;
2519 if ((int)(head - offset) < 0)
2520 return false;
2522 return true;
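/*
 * In other words, a reservation only succeeds while the distance from
 * data_tail to the new head fits in the buffer, so the writer never
 * overwrites records the reader has not consumed yet.  For example,
 * with 8 data pages (mask = 32767), tail = 1000 and offset = 33000
 * there are 768 bytes free: head = 33400 gives 32400 - 32000 >= 0 and
 * is accepted, while head = 34000 gives 232 - 32000 < 0 and is
 * rejected (the caller then accounts a lost record instead).
 */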
2525 static void perf_output_wakeup(struct perf_output_handle *handle)
2527 atomic_set(&handle->data->poll, POLL_IN);
2529 if (handle->nmi) {
2530 handle->counter->pending_wakeup = 1;
2531 perf_pending_queue(&handle->counter->pending,
2532 perf_pending_counter);
2533 } else
2534 perf_counter_wakeup(handle->counter);
2538 * Curious locking construct.
2540 * We need to ensure a later event doesn't publish a head when a former
2541 * event isn't done writing. However since we need to deal with NMIs we
2542 * cannot fully serialize things.
2544 * What we do is serialize between CPUs so we only have to deal with NMI
2545 * nesting on a single CPU.
2547 * We only publish the head (and generate a wakeup) when the outer-most
2548 * event completes.
2550 static void perf_output_lock(struct perf_output_handle *handle)
2552 struct perf_mmap_data *data = handle->data;
2553 int cpu;
2555 handle->locked = 0;
2557 local_irq_save(handle->flags);
2558 cpu = smp_processor_id();
2560 if (in_nmi() && atomic_read(&data->lock) == cpu)
2561 return;
2563 while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
2564 cpu_relax();
2566 handle->locked = 1;
2569 static void perf_output_unlock(struct perf_output_handle *handle)
2571 struct perf_mmap_data *data = handle->data;
2572 unsigned long head;
2573 int cpu;
2575 data->done_head = data->head;
2577 if (!handle->locked)
2578 goto out;
2580 again:
2582 * The xchg implies a full barrier that ensures all writes are done
2583 * before we publish the new head, matched by a rmb() in userspace when
2584 * reading this position.
2586 while ((head = atomic_long_xchg(&data->done_head, 0)))
2587 data->user_page->data_head = head;
2590 * NMI can happen here, which means we can miss a done_head update.
2593 cpu = atomic_xchg(&data->lock, -1);
2594 WARN_ON_ONCE(cpu != smp_processor_id());
2597 * Therefore we have to validate we did not indeed do so.
2599 if (unlikely(atomic_long_read(&data->done_head))) {
2601 * Since we had it locked, we can lock it again.
2603 while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
2604 cpu_relax();
2606 goto again;
2609 if (atomic_xchg(&data->wakeup, 0))
2610 perf_output_wakeup(handle);
2611 out:
2612 local_irq_restore(handle->flags);
2615 static void perf_output_copy(struct perf_output_handle *handle,
2616 const void *buf, unsigned int len)
2618 unsigned int pages_mask;
2619 unsigned int offset;
2620 unsigned int size;
2621 void **pages;
2623 offset = handle->offset;
2624 pages_mask = handle->data->nr_pages - 1;
2625 pages = handle->data->data_pages;
2627 do {
2628 unsigned int page_offset;
2629 int nr;
2631 nr = (offset >> PAGE_SHIFT) & pages_mask;
2632 page_offset = offset & (PAGE_SIZE - 1);
2633 size = min_t(unsigned int, PAGE_SIZE - page_offset, len);
2635 memcpy(pages[nr] + page_offset, buf, size);
2637 len -= size;
2638 buf += size;
2639 offset += size;
2640 } while (len);
2642 handle->offset = offset;
2645 * Check we didn't copy past our reservation window, taking the
2646 * possible unsigned int wrap into account.
2648 WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0);
2651 #define perf_output_put(handle, x) \
2652 perf_output_copy((handle), &(x), sizeof(x))
2654 static int perf_output_begin(struct perf_output_handle *handle,
2655 struct perf_counter *counter, unsigned int size,
2656 int nmi, int sample)
2658 struct perf_mmap_data *data;
2659 unsigned int offset, head;
2660 int have_lost;
2661 struct {
2662 struct perf_event_header header;
2663 u64 id;
2664 u64 lost;
2665 } lost_event;
2668 * For inherited counters we send all the output towards the parent.
2670 if (counter->parent)
2671 counter = counter->parent;
2673 rcu_read_lock();
2674 data = rcu_dereference(counter->data);
2675 if (!data)
2676 goto out;
2678 handle->data = data;
2679 handle->counter = counter;
2680 handle->nmi = nmi;
2681 handle->sample = sample;
2683 if (!data->nr_pages)
2684 goto fail;
2686 have_lost = atomic_read(&data->lost);
2687 if (have_lost)
2688 size += sizeof(lost_event);
2690 perf_output_lock(handle);
2692 do {
2693 offset = head = atomic_long_read(&data->head);
2694 head += size;
2695 if (unlikely(!perf_output_space(data, offset, head)))
2696 goto fail;
2697 } while (atomic_long_cmpxchg(&data->head, offset, head) != offset);
2699 handle->offset = offset;
2700 handle->head = head;
2702 if ((offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT))
2703 atomic_set(&data->wakeup, 1);
2705 if (have_lost) {
2706 lost_event.header.type = PERF_EVENT_LOST;
2707 lost_event.header.misc = 0;
2708 lost_event.header.size = sizeof(lost_event);
2709 lost_event.id = counter->id;
2710 lost_event.lost = atomic_xchg(&data->lost, 0);
2712 perf_output_put(handle, lost_event);
2715 return 0;
2717 fail:
2718 atomic_inc(&data->lost);
2719 perf_output_unlock(handle);
2720 out:
2721 rcu_read_unlock();
2723 return -ENOSPC;
2726 static void perf_output_end(struct perf_output_handle *handle)
2728 struct perf_counter *counter = handle->counter;
2729 struct perf_mmap_data *data = handle->data;
2731 int wakeup_events = counter->attr.wakeup_events;
2733 if (handle->sample && wakeup_events) {
2734 int events = atomic_inc_return(&data->events);
2735 if (events >= wakeup_events) {
2736 atomic_sub(wakeup_events, &data->events);
2737 atomic_set(&data->wakeup, 1);
2741 perf_output_unlock(handle);
2742 rcu_read_unlock();
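/*
 * Taken together, the output path is: perf_output_begin() reserves
 * space by advancing data->head in a cmpxchg() loop (emitting a
 * PERF_EVENT_LOST record first if earlier reservations failed),
 * perf_output_copy()/perf_output_put() fill the reserved window, and
 * perf_output_end() publishes the new head to user-space and raises
 * the wakeup flag once enough data or events have accumulated.
 */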
2745 static u32 perf_counter_pid(struct perf_counter *counter, struct task_struct *p)
2748 * only top level counters have the pid namespace they were created in
2750 if (counter->parent)
2751 counter = counter->parent;
2753 return task_tgid_nr_ns(p, counter->ns);
2756 static u32 perf_counter_tid(struct perf_counter *counter, struct task_struct *p)
2759 * only top level counters have the pid namespace they were created in
2761 if (counter->parent)
2762 counter = counter->parent;
2764 return task_pid_nr_ns(p, counter->ns);
2767 static void perf_output_read_one(struct perf_output_handle *handle,
2768 struct perf_counter *counter)
2770 u64 read_format = counter->attr.read_format;
2771 u64 values[4];
2772 int n = 0;
2774 values[n++] = atomic64_read(&counter->count);
2775 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
2776 values[n++] = counter->total_time_enabled +
2777 atomic64_read(&counter->child_total_time_enabled);
2779 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
2780 values[n++] = counter->total_time_running +
2781 atomic64_read(&counter->child_total_time_running);
2783 if (read_format & PERF_FORMAT_ID)
2784 values[n++] = primary_counter_id(counter);
2786 perf_output_copy(handle, values, n * sizeof(u64));
2790 * XXX PERF_FORMAT_GROUP vs inherited counters seems difficult.
2792 static void perf_output_read_group(struct perf_output_handle *handle,
2793 struct perf_counter *counter)
2795 struct perf_counter *leader = counter->group_leader, *sub;
2796 u64 read_format = counter->attr.read_format;
2797 u64 values[5];
2798 int n = 0;
2800 values[n++] = 1 + leader->nr_siblings;
2802 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
2803 values[n++] = leader->total_time_enabled;
2805 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
2806 values[n++] = leader->total_time_running;
2808 if (leader != counter)
2809 leader->pmu->read(leader);
2811 values[n++] = atomic64_read(&leader->count);
2812 if (read_format & PERF_FORMAT_ID)
2813 values[n++] = primary_counter_id(leader);
2815 perf_output_copy(handle, values, n * sizeof(u64));
2817 list_for_each_entry(sub, &leader->sibling_list, list_entry) {
2818 n = 0;
2820 if (sub != counter)
2821 sub->pmu->read(sub);
2823 values[n++] = atomic64_read(&sub->count);
2824 if (read_format & PERF_FORMAT_ID)
2825 values[n++] = primary_counter_id(sub);
2827 perf_output_copy(handle, values, n * sizeof(u64));
2831 static void perf_output_read(struct perf_output_handle *handle,
2832 struct perf_counter *counter)
2834 if (counter->attr.read_format & PERF_FORMAT_GROUP)
2835 perf_output_read_group(handle, counter);
2836 else
2837 perf_output_read_one(handle, counter);
2840 void perf_counter_output(struct perf_counter *counter, int nmi,
2841 struct perf_sample_data *data)
2843 int ret;
2844 u64 sample_type = counter->attr.sample_type;
2845 struct perf_output_handle handle;
2846 struct perf_event_header header;
2847 u64 ip;
2848 struct {
2849 u32 pid, tid;
2850 } tid_entry;
2851 struct perf_callchain_entry *callchain = NULL;
2852 int callchain_size = 0;
2853 u64 time;
2854 struct {
2855 u32 cpu, reserved;
2856 } cpu_entry;
2858 header.type = PERF_EVENT_SAMPLE;
2859 header.size = sizeof(header);
2861 header.misc = 0;
2862 header.misc |= perf_misc_flags(data->regs);
2864 if (sample_type & PERF_SAMPLE_IP) {
2865 ip = perf_instruction_pointer(data->regs);
2866 header.size += sizeof(ip);
2869 if (sample_type & PERF_SAMPLE_TID) {
2870 /* namespace issues */
2871 tid_entry.pid = perf_counter_pid(counter, current);
2872 tid_entry.tid = perf_counter_tid(counter, current);
2874 header.size += sizeof(tid_entry);
2877 if (sample_type & PERF_SAMPLE_TIME) {
2879 * Maybe do better on x86 and provide cpu_clock_nmi()
2881 time = sched_clock();
2883 header.size += sizeof(u64);
2886 if (sample_type & PERF_SAMPLE_ADDR)
2887 header.size += sizeof(u64);
2889 if (sample_type & PERF_SAMPLE_ID)
2890 header.size += sizeof(u64);
2892 if (sample_type & PERF_SAMPLE_STREAM_ID)
2893 header.size += sizeof(u64);
2895 if (sample_type & PERF_SAMPLE_CPU) {
2896 header.size += sizeof(cpu_entry);
2898 cpu_entry.cpu = raw_smp_processor_id();
2899 cpu_entry.reserved = 0;
2902 if (sample_type & PERF_SAMPLE_PERIOD)
2903 header.size += sizeof(u64);
2905 if (sample_type & PERF_SAMPLE_READ)
2906 header.size += perf_counter_read_size(counter);
2908 if (sample_type & PERF_SAMPLE_CALLCHAIN) {
2909 callchain = perf_callchain(data->regs);
2911 if (callchain) {
2912 callchain_size = (1 + callchain->nr) * sizeof(u64);
2913 header.size += callchain_size;
2914 } else
2915 header.size += sizeof(u64);
2918 if (sample_type & PERF_SAMPLE_RAW) {
2919 int size = sizeof(u32);
2921 if (data->raw)
2922 size += data->raw->size;
2923 else
2924 size += sizeof(u32);
2926 WARN_ON_ONCE(size & (sizeof(u64)-1));
2927 header.size += size;
2930 ret = perf_output_begin(&handle, counter, header.size, nmi, 1);
2931 if (ret)
2932 return;
2934 perf_output_put(&handle, header);
2936 if (sample_type & PERF_SAMPLE_IP)
2937 perf_output_put(&handle, ip);
2939 if (sample_type & PERF_SAMPLE_TID)
2940 perf_output_put(&handle, tid_entry);
2942 if (sample_type & PERF_SAMPLE_TIME)
2943 perf_output_put(&handle, time);
2945 if (sample_type & PERF_SAMPLE_ADDR)
2946 perf_output_put(&handle, data->addr);
2948 if (sample_type & PERF_SAMPLE_ID) {
2949 u64 id = primary_counter_id(counter);
2951 perf_output_put(&handle, id);
2954 if (sample_type & PERF_SAMPLE_STREAM_ID)
2955 perf_output_put(&handle, counter->id);
2957 if (sample_type & PERF_SAMPLE_CPU)
2958 perf_output_put(&handle, cpu_entry);
2960 if (sample_type & PERF_SAMPLE_PERIOD)
2961 perf_output_put(&handle, data->period);
2963 if (sample_type & PERF_SAMPLE_READ)
2964 perf_output_read(&handle, counter);
2966 if (sample_type & PERF_SAMPLE_CALLCHAIN) {
2967 if (callchain)
2968 perf_output_copy(&handle, callchain, callchain_size);
2969 else {
2970 u64 nr = 0;
2971 perf_output_put(&handle, nr);
2975 if (sample_type & PERF_SAMPLE_RAW) {
2976 if (data->raw) {
2977 perf_output_put(&handle, data->raw->size);
2978 perf_output_copy(&handle, data->raw->data, data->raw->size);
2979 } else {
2980 struct {
2981 u32 size;
2982 u32 data;
2983 } raw = {
2984 .size = sizeof(u32),
2985 .data = 0,
2987 perf_output_put(&handle, raw);
2991 perf_output_end(&handle);
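/*
 * The resulting PERF_EVENT_SAMPLE record is laid out in exactly the
 * order the fields are emitted above, each gated by its sample_type
 * bit: header, ip, {pid, tid}, time, addr, id, stream_id,
 * {cpu, reserved}, period, the read block, the callchain (or a zero
 * length) and finally the size-prefixed raw data, which is expected to
 * keep the record u64 aligned.
 */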
2995 * read event
2998 struct perf_read_event {
2999 struct perf_event_header header;
3001 u32 pid;
3002 u32 tid;
3005 static void
3006 perf_counter_read_event(struct perf_counter *counter,
3007 struct task_struct *task)
3009 struct perf_output_handle handle;
3010 struct perf_read_event event = {
3011 .header = {
3012 .type = PERF_EVENT_READ,
3013 .misc = 0,
3014 .size = sizeof(event) + perf_counter_read_size(counter),
3016 .pid = perf_counter_pid(counter, task),
3017 .tid = perf_counter_tid(counter, task),
3019 int ret;
3021 ret = perf_output_begin(&handle, counter, event.header.size, 0, 0);
3022 if (ret)
3023 return;
3025 perf_output_put(&handle, event);
3026 perf_output_read(&handle, counter);
3028 perf_output_end(&handle);
3032 * task tracking -- fork/exit
3034 * enabled by: attr.comm | attr.mmap | attr.task
3037 struct perf_task_event {
3038 struct task_struct *task;
3039 struct perf_counter_context *task_ctx;
3041 struct {
3042 struct perf_event_header header;
3044 u32 pid;
3045 u32 ppid;
3046 u32 tid;
3047 u32 ptid;
3048 } event;
3051 static void perf_counter_task_output(struct perf_counter *counter,
3052 struct perf_task_event *task_event)
3054 struct perf_output_handle handle;
3055 int size = task_event->event.header.size;
3056 struct task_struct *task = task_event->task;
3057 int ret = perf_output_begin(&handle, counter, size, 0, 0);
3059 if (ret)
3060 return;
3062 task_event->event.pid = perf_counter_pid(counter, task);
3063 task_event->event.ppid = perf_counter_pid(counter, current);
3065 task_event->event.tid = perf_counter_tid(counter, task);
3066 task_event->event.ptid = perf_counter_tid(counter, current);
3068 perf_output_put(&handle, task_event->event);
3069 perf_output_end(&handle);
3072 static int perf_counter_task_match(struct perf_counter *counter)
3074 if (counter->attr.comm || counter->attr.mmap || counter->attr.task)
3075 return 1;
3077 return 0;
3080 static void perf_counter_task_ctx(struct perf_counter_context *ctx,
3081 struct perf_task_event *task_event)
3083 struct perf_counter *counter;
3085 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
3086 return;
3088 rcu_read_lock();
3089 list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
3090 if (perf_counter_task_match(counter))
3091 perf_counter_task_output(counter, task_event);
3093 rcu_read_unlock();
3096 static void perf_counter_task_event(struct perf_task_event *task_event)
3098 struct perf_cpu_context *cpuctx;
3099 struct perf_counter_context *ctx = task_event->task_ctx;
3101 cpuctx = &get_cpu_var(perf_cpu_context);
3102 perf_counter_task_ctx(&cpuctx->ctx, task_event);
3103 put_cpu_var(perf_cpu_context);
3105 rcu_read_lock();
3106 if (!ctx)
3107 ctx = rcu_dereference(task_event->task->perf_counter_ctxp);
3108 if (ctx)
3109 perf_counter_task_ctx(ctx, task_event);
3110 rcu_read_unlock();
3113 static void perf_counter_task(struct task_struct *task,
3114 struct perf_counter_context *task_ctx,
3115 int new)
3117 struct perf_task_event task_event;
3119 if (!atomic_read(&nr_comm_counters) &&
3120 !atomic_read(&nr_mmap_counters) &&
3121 !atomic_read(&nr_task_counters))
3122 return;
3124 task_event = (struct perf_task_event){
3125 .task = task,
3126 .task_ctx = task_ctx,
3127 .event = {
3128 .header = {
3129 .type = new ? PERF_EVENT_FORK : PERF_EVENT_EXIT,
3130 .misc = 0,
3131 .size = sizeof(task_event.event),
3133 /* .pid */
3134 /* .ppid */
3135 /* .tid */
3136 /* .ptid */
3140 perf_counter_task_event(&task_event);
3143 void perf_counter_fork(struct task_struct *task)
3145 perf_counter_task(task, NULL, 1);
3149 * comm tracking
3152 struct perf_comm_event {
3153 struct task_struct *task;
3154 char *comm;
3155 int comm_size;
3157 struct {
3158 struct perf_event_header header;
3160 u32 pid;
3161 u32 tid;
3162 } event;
3165 static void perf_counter_comm_output(struct perf_counter *counter,
3166 struct perf_comm_event *comm_event)
3168 struct perf_output_handle handle;
3169 int size = comm_event->event.header.size;
3170 int ret = perf_output_begin(&handle, counter, size, 0, 0);
3172 if (ret)
3173 return;
3175 comm_event->event.pid = perf_counter_pid(counter, comm_event->task);
3176 comm_event->event.tid = perf_counter_tid(counter, comm_event->task);
3178 perf_output_put(&handle, comm_event->event);
3179 perf_output_copy(&handle, comm_event->comm,
3180 comm_event->comm_size);
3181 perf_output_end(&handle);
3184 static int perf_counter_comm_match(struct perf_counter *counter)
3186 if (counter->attr.comm)
3187 return 1;
3189 return 0;
3192 static void perf_counter_comm_ctx(struct perf_counter_context *ctx,
3193 struct perf_comm_event *comm_event)
3195 struct perf_counter *counter;
3197 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
3198 return;
3200 rcu_read_lock();
3201 list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
3202 if (perf_counter_comm_match(counter))
3203 perf_counter_comm_output(counter, comm_event);
3205 rcu_read_unlock();
3208 static void perf_counter_comm_event(struct perf_comm_event *comm_event)
3210 struct perf_cpu_context *cpuctx;
3211 struct perf_counter_context *ctx;
3212 unsigned int size;
3213 char comm[TASK_COMM_LEN];
3215 memset(comm, 0, sizeof(comm));
3216 strncpy(comm, comm_event->task->comm, sizeof(comm));
3217 size = ALIGN(strlen(comm)+1, sizeof(u64));
3219 comm_event->comm = comm;
3220 comm_event->comm_size = size;
3222 comm_event->event.header.size = sizeof(comm_event->event) + size;
3224 cpuctx = &get_cpu_var(perf_cpu_context);
3225 perf_counter_comm_ctx(&cpuctx->ctx, comm_event);
3226 put_cpu_var(perf_cpu_context);
3228 rcu_read_lock();
3230 * doesn't really matter which of the child contexts the
3231 * event ends up in.
3233 ctx = rcu_dereference(current->perf_counter_ctxp);
3234 if (ctx)
3235 perf_counter_comm_ctx(ctx, comm_event);
3236 rcu_read_unlock();
3239 void perf_counter_comm(struct task_struct *task)
3241 struct perf_comm_event comm_event;
3243 if (task->perf_counter_ctxp)
3244 perf_counter_enable_on_exec(task);
3246 if (!atomic_read(&nr_comm_counters))
3247 return;
3249 comm_event = (struct perf_comm_event){
3250 .task = task,
3251 /* .comm */
3252 /* .comm_size */
3253 .event = {
3254 .header = {
3255 .type = PERF_EVENT_COMM,
3256 .misc = 0,
3257 /* .size */
3259 /* .pid */
3260 /* .tid */
3264 perf_counter_comm_event(&comm_event);
3268 * mmap tracking
3271 struct perf_mmap_event {
3272 struct vm_area_struct *vma;
3274 const char *file_name;
3275 int file_size;
3277 struct {
3278 struct perf_event_header header;
3280 u32 pid;
3281 u32 tid;
3282 u64 start;
3283 u64 len;
3284 u64 pgoff;
3285 } event;
3288 static void perf_counter_mmap_output(struct perf_counter *counter,
3289 struct perf_mmap_event *mmap_event)
3291 struct perf_output_handle handle;
3292 int size = mmap_event->event.header.size;
3293 int ret = perf_output_begin(&handle, counter, size, 0, 0);
3295 if (ret)
3296 return;
3298 mmap_event->event.pid = perf_counter_pid(counter, current);
3299 mmap_event->event.tid = perf_counter_tid(counter, current);
3301 perf_output_put(&handle, mmap_event->event);
3302 perf_output_copy(&handle, mmap_event->file_name,
3303 mmap_event->file_size);
3304 perf_output_end(&handle);
3307 static int perf_counter_mmap_match(struct perf_counter *counter,
3308 struct perf_mmap_event *mmap_event)
3310 if (counter->attr.mmap)
3311 return 1;
3313 return 0;
3316 static void perf_counter_mmap_ctx(struct perf_counter_context *ctx,
3317 struct perf_mmap_event *mmap_event)
3319 struct perf_counter *counter;
3321 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
3322 return;
3324 rcu_read_lock();
3325 list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
3326 if (perf_counter_mmap_match(counter, mmap_event))
3327 perf_counter_mmap_output(counter, mmap_event);
3329 rcu_read_unlock();
3332 static void perf_counter_mmap_event(struct perf_mmap_event *mmap_event)
3334 struct perf_cpu_context *cpuctx;
3335 struct perf_counter_context *ctx;
3336 struct vm_area_struct *vma = mmap_event->vma;
3337 struct file *file = vma->vm_file;
3338 unsigned int size;
3339 char tmp[16];
3340 char *buf = NULL;
3341 const char *name;
3343 memset(tmp, 0, sizeof(tmp));
3345 if (file) {
3347 * d_path works from the end of the buffer backwards, so we
3348 * need to add enough zero bytes after the string to handle
3349 * the 64bit alignment we do later.
3351 buf = kzalloc(PATH_MAX + sizeof(u64), GFP_KERNEL);
3352 if (!buf) {
3353 name = strncpy(tmp, "//enomem", sizeof(tmp));
3354 goto got_name;
3356 name = d_path(&file->f_path, buf, PATH_MAX);
3357 if (IS_ERR(name)) {
3358 name = strncpy(tmp, "//toolong", sizeof(tmp));
3359 goto got_name;
3361 } else {
3362 if (arch_vma_name(mmap_event->vma)) {
3363 name = strncpy(tmp, arch_vma_name(mmap_event->vma),
3364 sizeof(tmp));
3365 goto got_name;
3368 if (!vma->vm_mm) {
3369 name = strncpy(tmp, "[vdso]", sizeof(tmp));
3370 goto got_name;
3373 name = strncpy(tmp, "//anon", sizeof(tmp));
3374 goto got_name;
3377 got_name:
3378 size = ALIGN(strlen(name)+1, sizeof(u64));
3380 mmap_event->file_name = name;
3381 mmap_event->file_size = size;
3383 mmap_event->event.header.size = sizeof(mmap_event->event) + size;
3385 cpuctx = &get_cpu_var(perf_cpu_context);
3386 perf_counter_mmap_ctx(&cpuctx->ctx, mmap_event);
3387 put_cpu_var(perf_cpu_context);
3389 rcu_read_lock();
3391 * doesn't really matter which of the child contexts the
3392 * event ends up in.
3394 ctx = rcu_dereference(current->perf_counter_ctxp);
3395 if (ctx)
3396 perf_counter_mmap_ctx(ctx, mmap_event);
3397 rcu_read_unlock();
3399 kfree(buf);
3402 void __perf_counter_mmap(struct vm_area_struct *vma)
3404 struct perf_mmap_event mmap_event;
3406 if (!atomic_read(&nr_mmap_counters))
3407 return;
3409 mmap_event = (struct perf_mmap_event){
3410 .vma = vma,
3411 /* .file_name */
3412 /* .file_size */
3413 .event = {
3414 .header = {
3415 .type = PERF_EVENT_MMAP,
3416 .misc = 0,
3417 /* .size */
3419 /* .pid */
3420 /* .tid */
3421 .start = vma->vm_start,
3422 .len = vma->vm_end - vma->vm_start,
3423 .pgoff = vma->vm_pgoff,
3427 perf_counter_mmap_event(&mmap_event);
3431 * IRQ throttle logging
3434 static void perf_log_throttle(struct perf_counter *counter, int enable)
3436 struct perf_output_handle handle;
3437 int ret;
3439 struct {
3440 struct perf_event_header header;
3441 u64 time;
3442 u64 id;
3443 u64 stream_id;
3444 } throttle_event = {
3445 .header = {
3446 .type = PERF_EVENT_THROTTLE,
3447 .misc = 0,
3448 .size = sizeof(throttle_event),
3450 .time = sched_clock(),
3451 .id = primary_counter_id(counter),
3452 .stream_id = counter->id,
3455 if (enable)
3456 throttle_event.header.type = PERF_EVENT_UNTHROTTLE;
3458 ret = perf_output_begin(&handle, counter, sizeof(throttle_event), 1, 0);
3459 if (ret)
3460 return;
3462 perf_output_put(&handle, throttle_event);
3463 perf_output_end(&handle);
3467 * Generic counter overflow handling, sampling.
3470 int perf_counter_overflow(struct perf_counter *counter, int nmi,
3471 struct perf_sample_data *data)
3473 int events = atomic_read(&counter->event_limit);
3474 int throttle = counter->pmu->unthrottle != NULL;
3475 struct hw_perf_counter *hwc = &counter->hw;
3476 int ret = 0;
3478 if (!throttle) {
3479 hwc->interrupts++;
3480 } else {
3481 if (hwc->interrupts != MAX_INTERRUPTS) {
3482 hwc->interrupts++;
3483 if (HZ * hwc->interrupts >
3484 (u64)sysctl_perf_counter_sample_rate) {
3485 hwc->interrupts = MAX_INTERRUPTS;
3486 perf_log_throttle(counter, 0);
3487 ret = 1;
3489 } else {
3491 * Keep re-disabling the counter even though we disabled it on
3492 * the previous pass - just in case we raced with a
3493 * sched-in and the counter got enabled again:
3495 ret = 1;
3499 if (counter->attr.freq) {
3500 u64 now = sched_clock();
3501 s64 delta = now - hwc->freq_stamp;
3503 hwc->freq_stamp = now;
3505 if (delta > 0 && delta < TICK_NSEC)
3506 perf_adjust_period(counter, NSEC_PER_SEC / (int)delta);
3510 * XXX event_limit might not quite work as expected on inherited
3511 * counters
3514 counter->pending_kill = POLL_IN;
3515 if (events && atomic_dec_and_test(&counter->event_limit)) {
3516 ret = 1;
3517 counter->pending_kill = POLL_HUP;
3518 if (nmi) {
3519 counter->pending_disable = 1;
3520 perf_pending_queue(&counter->pending,
3521 perf_pending_counter);
3522 } else
3523 perf_counter_disable(counter);
3526 perf_counter_output(counter, nmi, data);
3527 return ret;
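/*
 * The throttle check above amounts to "more than
 * sysctl_perf_counter_sample_rate / HZ interrupts within one tick".
 * For example, with HZ = 1000 and a sample rate limit of 100000 the
 * counter is marked MAX_INTERRUPTS and a throttle event is logged on
 * the 101st interrupt of a tick; returning 1 then tells the caller to
 * stop the counter until it gets unthrottled again.
 */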
3531 * Generic software counter infrastructure
3535 * We directly increment counter->count and keep a second value in
3536 * counter->hw.period_left to count intervals. This period counter
3537 * is kept in the range [-sample_period, 0] so that we can use the
3538 * sign as trigger.
3541 static u64 perf_swcounter_set_period(struct perf_counter *counter)
3543 struct hw_perf_counter *hwc = &counter->hw;
3544 u64 period = hwc->last_period;
3545 u64 nr, offset;
3546 s64 old, val;
3548 hwc->last_period = hwc->sample_period;
3550 again:
3551 old = val = atomic64_read(&hwc->period_left);
3552 if (val < 0)
3553 return 0;
3555 nr = div64_u64(period + val, period);
3556 offset = nr * period;
3557 val -= offset;
3558 if (atomic64_cmpxchg(&hwc->period_left, old, val) != old)
3559 goto again;
3561 return nr;
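/*
 * Example of the arithmetic above with sample_period = 100: if
 * period_left sits at -30 and 50 events are added it becomes +20, so
 * the overflow path runs, nr = (100 + 20) / 100 = 1 period has elapsed
 * and period_left is rewound to 20 - 100 = -80, back inside the
 * [-sample_period, 0] window described above.
 */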
3564 static void perf_swcounter_overflow(struct perf_counter *counter,
3565 int nmi, struct perf_sample_data *data)
3567 struct hw_perf_counter *hwc = &counter->hw;
3568 u64 overflow;
3570 data->period = counter->hw.last_period;
3571 overflow = perf_swcounter_set_period(counter);
3573 if (hwc->interrupts == MAX_INTERRUPTS)
3574 return;
3576 for (; overflow; overflow--) {
3577 if (perf_counter_overflow(counter, nmi, data)) {
3579 * We inhibit the overflow from happening when
3580 * hwc->interrupts == MAX_INTERRUPTS.
3582 break;
3587 static void perf_swcounter_unthrottle(struct perf_counter *counter)
3590 * Nothing to do, we already reset hwc->interrupts.
3594 static void perf_swcounter_add(struct perf_counter *counter, u64 nr,
3595 int nmi, struct perf_sample_data *data)
3597 struct hw_perf_counter *hwc = &counter->hw;
3599 atomic64_add(nr, &counter->count);
3601 if (!hwc->sample_period)
3602 return;
3604 if (!data->regs)
3605 return;
3607 if (!atomic64_add_negative(nr, &hwc->period_left))
3608 perf_swcounter_overflow(counter, nmi, data);
3611 static int perf_swcounter_is_counting(struct perf_counter *counter)
3614 * The counter is active, we're good!
3616 if (counter->state == PERF_COUNTER_STATE_ACTIVE)
3617 return 1;
3620 * The counter is off/error, not counting.
3622 if (counter->state != PERF_COUNTER_STATE_INACTIVE)
3623 return 0;
3626 * The counter is inactive, if the context is active
3627 * we're part of a group that didn't make it on the 'pmu',
3628 * not counting.
3630 if (counter->ctx->is_active)
3631 return 0;
3634 * We're inactive and the context is too, this means the
3635 * task is scheduled out, we're counting events that happen
3636 * to us, like migration events.
3638 return 1;
3641 static int perf_swcounter_match(struct perf_counter *counter,
3642 enum perf_type_id type,
3643 u32 event, struct pt_regs *regs)
3645 if (!perf_swcounter_is_counting(counter))
3646 return 0;
3648 if (counter->attr.type != type)
3649 return 0;
3650 if (counter->attr.config != event)
3651 return 0;
3653 if (regs) {
3654 if (counter->attr.exclude_user && user_mode(regs))
3655 return 0;
3657 if (counter->attr.exclude_kernel && !user_mode(regs))
3658 return 0;
3661 return 1;
3664 static void perf_swcounter_ctx_event(struct perf_counter_context *ctx,
3665 enum perf_type_id type,
3666 u32 event, u64 nr, int nmi,
3667 struct perf_sample_data *data)
3669 struct perf_counter *counter;
3671 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
3672 return;
3674 rcu_read_lock();
3675 list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
3676 if (perf_swcounter_match(counter, type, event, data->regs))
3677 perf_swcounter_add(counter, nr, nmi, data);
3679 rcu_read_unlock();
3682 static int *perf_swcounter_recursion_context(struct perf_cpu_context *cpuctx)
3684 if (in_nmi())
3685 return &cpuctx->recursion[3];
3687 if (in_irq())
3688 return &cpuctx->recursion[2];
3690 if (in_softirq())
3691 return &cpuctx->recursion[1];
3693 return &cpuctx->recursion[0];
3696 static void do_perf_swcounter_event(enum perf_type_id type, u32 event,
3697 u64 nr, int nmi,
3698 struct perf_sample_data *data)
3700 struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
3701 int *recursion = perf_swcounter_recursion_context(cpuctx);
3702 struct perf_counter_context *ctx;
3704 if (*recursion)
3705 goto out;
3707 (*recursion)++;
3708 barrier();
3710 perf_swcounter_ctx_event(&cpuctx->ctx, type, event,
3711 nr, nmi, data);
3712 rcu_read_lock();
3714 * doesn't really matter which of the child contexts the
3715 * event ends up in.
3717 ctx = rcu_dereference(current->perf_counter_ctxp);
3718 if (ctx)
3719 perf_swcounter_ctx_event(ctx, type, event, nr, nmi, data);
3720 rcu_read_unlock();
3722 barrier();
3723 (*recursion)--;
3725 out:
3726 put_cpu_var(perf_cpu_context);
3729 void __perf_swcounter_event(u32 event, u64 nr, int nmi,
3730 struct pt_regs *regs, u64 addr)
3732 struct perf_sample_data data = {
3733 .regs = regs,
3734 .addr = addr,
3737 do_perf_swcounter_event(PERF_TYPE_SOFTWARE, event, nr, nmi, &data);
3740 static void perf_swcounter_read(struct perf_counter *counter)
3744 static int perf_swcounter_enable(struct perf_counter *counter)
3746 struct hw_perf_counter *hwc = &counter->hw;
3748 if (hwc->sample_period) {
3749 hwc->last_period = hwc->sample_period;
3750 perf_swcounter_set_period(counter);
3752 return 0;
3755 static void perf_swcounter_disable(struct perf_counter *counter)
3759 static const struct pmu perf_ops_generic = {
3760 .enable = perf_swcounter_enable,
3761 .disable = perf_swcounter_disable,
3762 .read = perf_swcounter_read,
3763 .unthrottle = perf_swcounter_unthrottle,
3767 * hrtimer based swcounter callback
3770 static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
3772 enum hrtimer_restart ret = HRTIMER_RESTART;
3773 struct perf_sample_data data;
3774 struct perf_counter *counter;
3775 u64 period;
3777 counter = container_of(hrtimer, struct perf_counter, hw.hrtimer);
3778 counter->pmu->read(counter);
3780 data.addr = 0;
3781 data.regs = get_irq_regs();
3783 * In case we exclude kernel IPs or are somehow not in interrupt
3784 * context, provide the next best thing, the user IP.
3786 if ((counter->attr.exclude_kernel || !data.regs) &&
3787 !counter->attr.exclude_user)
3788 data.regs = task_pt_regs(current);
3790 if (data.regs) {
3791 if (perf_counter_overflow(counter, 0, &data))
3792 ret = HRTIMER_NORESTART;
3795 period = max_t(u64, 10000, counter->hw.sample_period);
3796 hrtimer_forward_now(hrtimer, ns_to_ktime(period));
3798 return ret;
3802 * Software counter: cpu wall time clock
3805 static void cpu_clock_perf_counter_update(struct perf_counter *counter)
3807 int cpu = raw_smp_processor_id();
3808 s64 prev;
3809 u64 now;
3811 now = cpu_clock(cpu);
3812 prev = atomic64_read(&counter->hw.prev_count);
3813 atomic64_set(&counter->hw.prev_count, now);
3814 atomic64_add(now - prev, &counter->count);
3817 static int cpu_clock_perf_counter_enable(struct perf_counter *counter)
3819 struct hw_perf_counter *hwc = &counter->hw;
3820 int cpu = raw_smp_processor_id();
3822 atomic64_set(&hwc->prev_count, cpu_clock(cpu));
3823 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
3824 hwc->hrtimer.function = perf_swcounter_hrtimer;
3825 if (hwc->sample_period) {
3826 u64 period = max_t(u64, 10000, hwc->sample_period);
3827 __hrtimer_start_range_ns(&hwc->hrtimer,
3828 ns_to_ktime(period), 0,
3829 HRTIMER_MODE_REL, 0);
3832 return 0;
3835 static void cpu_clock_perf_counter_disable(struct perf_counter *counter)
3837 if (counter->hw.sample_period)
3838 hrtimer_cancel(&counter->hw.hrtimer);
3839 cpu_clock_perf_counter_update(counter);
3842 static void cpu_clock_perf_counter_read(struct perf_counter *counter)
3844 cpu_clock_perf_counter_update(counter);
3847 static const struct pmu perf_ops_cpu_clock = {
3848 .enable = cpu_clock_perf_counter_enable,
3849 .disable = cpu_clock_perf_counter_disable,
3850 .read = cpu_clock_perf_counter_read,
3854 * Software counter: task time clock
3857 static void task_clock_perf_counter_update(struct perf_counter *counter, u64 now)
3859 u64 prev;
3860 s64 delta;
3862 prev = atomic64_xchg(&counter->hw.prev_count, now);
3863 delta = now - prev;
3864 atomic64_add(delta, &counter->count);
3867 static int task_clock_perf_counter_enable(struct perf_counter *counter)
3869 struct hw_perf_counter *hwc = &counter->hw;
3870 u64 now;
3872 now = counter->ctx->time;
3874 atomic64_set(&hwc->prev_count, now);
3875 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
3876 hwc->hrtimer.function = perf_swcounter_hrtimer;
3877 if (hwc->sample_period) {
3878 u64 period = max_t(u64, 10000, hwc->sample_period);
3879 __hrtimer_start_range_ns(&hwc->hrtimer,
3880 ns_to_ktime(period), 0,
3881 HRTIMER_MODE_REL, 0);
3884 return 0;
3887 static void task_clock_perf_counter_disable(struct perf_counter *counter)
3889 if (counter->hw.sample_period)
3890 hrtimer_cancel(&counter->hw.hrtimer);
3891 task_clock_perf_counter_update(counter, counter->ctx->time);
3895 static void task_clock_perf_counter_read(struct perf_counter *counter)
3897 u64 time;
3899 if (!in_nmi()) {
3900 update_context_time(counter->ctx);
3901 time = counter->ctx->time;
3902 } else {
3903 u64 now = perf_clock();
3904 u64 delta = now - counter->ctx->timestamp;
3905 time = counter->ctx->time + delta;
3908 task_clock_perf_counter_update(counter, time);
3911 static const struct pmu perf_ops_task_clock = {
3912 .enable = task_clock_perf_counter_enable,
3913 .disable = task_clock_perf_counter_disable,
3914 .read = task_clock_perf_counter_read,
3917 #ifdef CONFIG_EVENT_PROFILE
3918 void perf_tpcounter_event(int event_id, u64 addr, u64 count, void *record,
3919 int entry_size)
3921 struct perf_raw_record raw = {
3922 .size = entry_size,
3923 .data = record,
3926 struct perf_sample_data data = {
3927 .regs = get_irq_regs(),
3928 .addr = addr,
3929 .raw = &raw,
3932 if (!data.regs)
3933 data.regs = task_pt_regs(current);
3935 do_perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, count, 1, &data);
3937 EXPORT_SYMBOL_GPL(perf_tpcounter_event);
3939 extern int ftrace_profile_enable(int);
3940 extern void ftrace_profile_disable(int);
3942 static void tp_perf_counter_destroy(struct perf_counter *counter)
3944 ftrace_profile_disable(counter->attr.config);
3947 static const struct pmu *tp_perf_counter_init(struct perf_counter *counter)
3950 * Raw tracepoint data is a severe data leak; only allow root to
3951 * have these.
3953 if ((counter->attr.sample_type & PERF_SAMPLE_RAW) &&
3954 !capable(CAP_SYS_ADMIN))
3955 return ERR_PTR(-EPERM);
3957 if (ftrace_profile_enable(counter->attr.config))
3958 return NULL;
3960 counter->destroy = tp_perf_counter_destroy;
3962 return &perf_ops_generic;
3964 #else
3965 static const struct pmu *tp_perf_counter_init(struct perf_counter *counter)
3967 return NULL;
3969 #endif
3971 atomic_t perf_swcounter_enabled[PERF_COUNT_SW_MAX];
3973 static void sw_perf_counter_destroy(struct perf_counter *counter)
3975 u64 event = counter->attr.config;
3977 WARN_ON(counter->parent);
3979 atomic_dec(&perf_swcounter_enabled[event]);
3982 static const struct pmu *sw_perf_counter_init(struct perf_counter *counter)
3984 const struct pmu *pmu = NULL;
3985 u64 event = counter->attr.config;
3988 * Software counters (currently) can't in general distinguish
3989 * between user, kernel and hypervisor events.
3990 * However, context switches and cpu migrations are considered
3991 * to be kernel events, and page faults are never hypervisor
3992 * events.
3994 switch (event) {
3995 case PERF_COUNT_SW_CPU_CLOCK:
3996 pmu = &perf_ops_cpu_clock;
3998 break;
3999 case PERF_COUNT_SW_TASK_CLOCK:
4001 * If the user instantiates this as a per-cpu counter,
4002 * use the cpu_clock counter instead.
4004 if (counter->ctx->task)
4005 pmu = &perf_ops_task_clock;
4006 else
4007 pmu = &perf_ops_cpu_clock;
4009 break;
4010 case PERF_COUNT_SW_PAGE_FAULTS:
4011 case PERF_COUNT_SW_PAGE_FAULTS_MIN:
4012 case PERF_COUNT_SW_PAGE_FAULTS_MAJ:
4013 case PERF_COUNT_SW_CONTEXT_SWITCHES:
4014 case PERF_COUNT_SW_CPU_MIGRATIONS:
4015 if (!counter->parent) {
4016 atomic_inc(&perf_swcounter_enabled[event]);
4017 counter->destroy = sw_perf_counter_destroy;
4019 pmu = &perf_ops_generic;
4020 break;
4023 return pmu;
4027 * Allocate and initialize a counter structure
4029 static struct perf_counter *
4030 perf_counter_alloc(struct perf_counter_attr *attr,
4031 int cpu,
4032 struct perf_counter_context *ctx,
4033 struct perf_counter *group_leader,
4034 struct perf_counter *parent_counter,
4035 gfp_t gfpflags)
4037 const struct pmu *pmu;
4038 struct perf_counter *counter;
4039 struct hw_perf_counter *hwc;
4040 long err;
4042 counter = kzalloc(sizeof(*counter), gfpflags);
4043 if (!counter)
4044 return ERR_PTR(-ENOMEM);
4047 * Single counters are their own group leaders, with an
4048 * empty sibling list:
4050 if (!group_leader)
4051 group_leader = counter;
4053 mutex_init(&counter->child_mutex);
4054 INIT_LIST_HEAD(&counter->child_list);
4056 INIT_LIST_HEAD(&counter->list_entry);
4057 INIT_LIST_HEAD(&counter->event_entry);
4058 INIT_LIST_HEAD(&counter->sibling_list);
4059 init_waitqueue_head(&counter->waitq);
4061 mutex_init(&counter->mmap_mutex);
4063 counter->cpu = cpu;
4064 counter->attr = *attr;
4065 counter->group_leader = group_leader;
4066 counter->pmu = NULL;
4067 counter->ctx = ctx;
4068 counter->oncpu = -1;
4070 counter->parent = parent_counter;
4072 counter->ns = get_pid_ns(current->nsproxy->pid_ns);
4073 counter->id = atomic64_inc_return(&perf_counter_id);
4075 counter->state = PERF_COUNTER_STATE_INACTIVE;
4077 if (attr->disabled)
4078 counter->state = PERF_COUNTER_STATE_OFF;
4080 pmu = NULL;
4082 hwc = &counter->hw;
4083 hwc->sample_period = attr->sample_period;
4084 if (attr->freq && attr->sample_freq)
4085 hwc->sample_period = 1;
4086 hwc->last_period = hwc->sample_period;
4088 atomic64_set(&hwc->period_left, hwc->sample_period);
4091 * we currently do not support PERF_FORMAT_GROUP on inherited counters
4093 if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
4094 goto done;
4096 switch (attr->type) {
4097 case PERF_TYPE_RAW:
4098 case PERF_TYPE_HARDWARE:
4099 case PERF_TYPE_HW_CACHE:
4100 pmu = hw_perf_counter_init(counter);
4101 break;
4103 case PERF_TYPE_SOFTWARE:
4104 pmu = sw_perf_counter_init(counter);
4105 break;
4107 case PERF_TYPE_TRACEPOINT:
4108 pmu = tp_perf_counter_init(counter);
4109 break;
4111 default:
4112 break;
4114 done:
4115 err = 0;
4116 if (!pmu)
4117 err = -EINVAL;
4118 else if (IS_ERR(pmu))
4119 err = PTR_ERR(pmu);
4121 if (err) {
4122 if (counter->ns)
4123 put_pid_ns(counter->ns);
4124 kfree(counter);
4125 return ERR_PTR(err);
4128 counter->pmu = pmu;
4130 if (!counter->parent) {
4131 atomic_inc(&nr_counters);
4132 if (counter->attr.mmap)
4133 atomic_inc(&nr_mmap_counters);
4134 if (counter->attr.comm)
4135 atomic_inc(&nr_comm_counters);
4136 if (counter->attr.task)
4137 atomic_inc(&nr_task_counters);
4140 return counter;
4143 static int perf_copy_attr(struct perf_counter_attr __user *uattr,
4144 struct perf_counter_attr *attr)
4146 u32 size;
4147 int ret;
4149 if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0))
4150 return -EFAULT;
4153 * zero the full structure, so that a short copy will be nice.
4155 memset(attr, 0, sizeof(*attr));
4157 ret = get_user(size, &uattr->size);
4158 if (ret)
4159 return ret;
4161 if (size > PAGE_SIZE) /* silly large */
4162 goto err_size;
4164 if (!size) /* abi compat */
4165 size = PERF_ATTR_SIZE_VER0;
4167 if (size < PERF_ATTR_SIZE_VER0)
4168 goto err_size;
4171 * If we're handed a bigger struct than we know of,
4172 * ensure all the unknown bits are 0 - i.e. new
4173 * user-space does not rely on any kernel feature
4174 * extensions we don't know about yet.
4176 if (size > sizeof(*attr)) {
4177 unsigned char __user *addr;
4178 unsigned char __user *end;
4179 unsigned char val;
4181 addr = (void __user *)uattr + sizeof(*attr);
4182 end = (void __user *)uattr + size;
4184 for (; addr < end; addr++) {
4185 ret = get_user(val, addr);
4186 if (ret)
4187 return ret;
4188 if (val)
4189 goto err_size;
4191 size = sizeof(*attr);
4194 ret = copy_from_user(attr, uattr, size);
4195 if (ret)
4196 return -EFAULT;
4199 * If the type exists, the corresponding creation will verify
4200 * the attr->config.
4202 if (attr->type >= PERF_TYPE_MAX)
4203 return -EINVAL;
4205 if (attr->__reserved_1 || attr->__reserved_2 || attr->__reserved_3)
4206 return -EINVAL;
4208 if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
4209 return -EINVAL;
4211 if (attr->read_format & ~(PERF_FORMAT_MAX-1))
4212 return -EINVAL;
4214 out:
4215 return ret;
4217 err_size:
4218 put_user(sizeof(*attr), &uattr->size);
4219 ret = -E2BIG;
4220 goto out;
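/*
 * The net effect of perf_copy_attr() is forward and backward ABI
 * compatibility keyed off attr->size: an older, smaller structure is
 * zero-extended to the current layout, while a newer, larger one is
 * only accepted if every byte beyond what this kernel knows about is
 * zero -- otherwise user-space gets -E2BIG, with the size this kernel
 * does understand written back into uattr->size.
 */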
4224 * sys_perf_counter_open - open a performance counter, associate it to a task/cpu
4226 * @attr_uptr: event type attributes for monitoring/sampling
4227 * @pid: target pid
4228 * @cpu: target cpu
4229 * @group_fd: group leader counter fd
4231 SYSCALL_DEFINE5(perf_counter_open,
4232 struct perf_counter_attr __user *, attr_uptr,
4233 pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
4235 struct perf_counter *counter, *group_leader;
4236 struct perf_counter_attr attr;
4237 struct perf_counter_context *ctx;
4238 struct file *counter_file = NULL;
4239 struct file *group_file = NULL;
4240 int fput_needed = 0;
4241 int fput_needed2 = 0;
4242 int ret;
4244 /* for future expandability... */
4245 if (flags)
4246 return -EINVAL;
4248 ret = perf_copy_attr(attr_uptr, &attr);
4249 if (ret)
4250 return ret;
4252 if (!attr.exclude_kernel) {
4253 if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
4254 return -EACCES;
4257 if (attr.freq) {
4258 if (attr.sample_freq > sysctl_perf_counter_sample_rate)
4259 return -EINVAL;
4263 * Get the target context (task or percpu):
4265 ctx = find_get_context(pid, cpu);
4266 if (IS_ERR(ctx))
4267 return PTR_ERR(ctx);
4270 * Look up the group leader (we will attach this counter to it):
4272 group_leader = NULL;
4273 if (group_fd != -1) {
4274 ret = -EINVAL;
4275 group_file = fget_light(group_fd, &fput_needed);
4276 if (!group_file)
4277 goto err_put_context;
4278 if (group_file->f_op != &perf_fops)
4279 goto err_put_context;
4281 group_leader = group_file->private_data;
4283 * Do not allow a recursive hierarchy (this new sibling
4284 * becoming part of another group-sibling):
4286 if (group_leader->group_leader != group_leader)
4287 goto err_put_context;
4289 * Do not allow to attach to a group in a different
4290 * task or CPU context:
4292 if (group_leader->ctx != ctx)
4293 goto err_put_context;
4295 * Only a group leader can be exclusive or pinned
4297 if (attr.exclusive || attr.pinned)
4298 goto err_put_context;
4301 counter = perf_counter_alloc(&attr, cpu, ctx, group_leader,
4302 NULL, GFP_KERNEL);
4303 ret = PTR_ERR(counter);
4304 if (IS_ERR(counter))
4305 goto err_put_context;
4307 ret = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0);
4308 if (ret < 0)
4309 goto err_free_put_context;
4311 counter_file = fget_light(ret, &fput_needed2);
4312 if (!counter_file)
4313 goto err_free_put_context;
4315 counter->filp = counter_file;
4316 WARN_ON_ONCE(ctx->parent_ctx);
4317 mutex_lock(&ctx->mutex);
4318 perf_install_in_context(ctx, counter, cpu);
4319 ++ctx->generation;
4320 mutex_unlock(&ctx->mutex);
4322 counter->owner = current;
4323 get_task_struct(current);
4324 mutex_lock(&current->perf_counter_mutex);
4325 list_add_tail(&counter->owner_entry, &current->perf_counter_list);
4326 mutex_unlock(&current->perf_counter_mutex);
4328 fput_light(counter_file, fput_needed2);
4330 out_fput:
4331 fput_light(group_file, fput_needed);
4333 return ret;
4335 err_free_put_context:
4336 kfree(counter);
4338 err_put_context:
4339 put_ctx(ctx);
4341 goto out_fput;
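/*
 * A hedged user-space sketch of the call sequence above, assuming the
 * architecture defines __NR_perf_counter_open and that <sys/syscall.h>
 * and <linux/perf_counter.h> are included; pid 0 means the calling
 * task, cpu -1 means any cpu, group_fd -1 means no group leader and
 * flags must be 0 (error handling omitted):
 *
 *	struct perf_counter_attr attr = {
 *		.type	= PERF_TYPE_SOFTWARE,
 *		.config	= PERF_COUNT_SW_CPU_CLOCK,
 *		.size	= sizeof(attr),
 *	};
 *	unsigned long long count;
 *	int fd;
 *
 *	fd = syscall(__NR_perf_counter_open, &attr, 0, -1, -1, 0);
 *	read(fd, &count, sizeof(count));
 *
 * The returned fd supports read(), poll(), ioctl(), fasync() and
 * mmap() through perf_fops above.
 */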
4345 * inherit a counter from parent task to child task:
4347 static struct perf_counter *
4348 inherit_counter(struct perf_counter *parent_counter,
4349 struct task_struct *parent,
4350 struct perf_counter_context *parent_ctx,
4351 struct task_struct *child,
4352 struct perf_counter *group_leader,
4353 struct perf_counter_context *child_ctx)
4355 struct perf_counter *child_counter;
4358 * Instead of creating recursive hierarchies of counters,
4359 * we link inherited counters back to the original parent,
4360 * which is guaranteed to have a filp that we use as the reference
4361 * count:
4363 if (parent_counter->parent)
4364 parent_counter = parent_counter->parent;
4366 child_counter = perf_counter_alloc(&parent_counter->attr,
4367 parent_counter->cpu, child_ctx,
4368 group_leader, parent_counter,
4369 GFP_KERNEL);
4370 if (IS_ERR(child_counter))
4371 return child_counter;
4372 get_ctx(child_ctx);
4375 * Make the child state follow the state of the parent counter,
4376 * not its attr.disabled bit. We hold the parent's mutex,
4377 * so we won't race with perf_counter_{en, dis}able_family.
4379 if (parent_counter->state >= PERF_COUNTER_STATE_INACTIVE)
4380 child_counter->state = PERF_COUNTER_STATE_INACTIVE;
4381 else
4382 child_counter->state = PERF_COUNTER_STATE_OFF;
4384 if (parent_counter->attr.freq)
4385 child_counter->hw.sample_period = parent_counter->hw.sample_period;
4388 * Link it up in the child's context:
4390 add_counter_to_ctx(child_counter, child_ctx);
4393 * Get a reference to the parent filp - we will fput it
4394 * when the child counter exits. This is safe to do because
4395 * we are in the parent and we know that the filp still
4396 * exists and has a nonzero count:
4398 atomic_long_inc(&parent_counter->filp->f_count);
4401 * Link this into the parent counter's child list
4403 WARN_ON_ONCE(parent_counter->ctx->parent_ctx);
4404 mutex_lock(&parent_counter->child_mutex);
4405 list_add_tail(&child_counter->child_list, &parent_counter->child_list);
4406 mutex_unlock(&parent_counter->child_mutex);
4408 return child_counter;
4411 static int inherit_group(struct perf_counter *parent_counter,
4412 struct task_struct *parent,
4413 struct perf_counter_context *parent_ctx,
4414 struct task_struct *child,
4415 struct perf_counter_context *child_ctx)
4417 struct perf_counter *leader;
4418 struct perf_counter *sub;
4419 struct perf_counter *child_ctr;
4421 leader = inherit_counter(parent_counter, parent, parent_ctx,
4422 child, NULL, child_ctx);
4423 if (IS_ERR(leader))
4424 return PTR_ERR(leader);
4425 list_for_each_entry(sub, &parent_counter->sibling_list, list_entry) {
4426 child_ctr = inherit_counter(sub, parent, parent_ctx,
4427 child, leader, child_ctx);
4428 if (IS_ERR(child_ctr))
4429 return PTR_ERR(child_ctr);
4431 return 0;
4434 static void sync_child_counter(struct perf_counter *child_counter,
4435 struct task_struct *child)
4437 struct perf_counter *parent_counter = child_counter->parent;
4438 u64 child_val;
4440 if (child_counter->attr.inherit_stat)
4441 perf_counter_read_event(child_counter, child);
4443 child_val = atomic64_read(&child_counter->count);
4446 * Add back the child's count to the parent's count:
4448 atomic64_add(child_val, &parent_counter->count);
4449 atomic64_add(child_counter->total_time_enabled,
4450 &parent_counter->child_total_time_enabled);
4451 atomic64_add(child_counter->total_time_running,
4452 &parent_counter->child_total_time_running);
4455 * Remove this counter from the parent's list
4457 WARN_ON_ONCE(parent_counter->ctx->parent_ctx);
4458 mutex_lock(&parent_counter->child_mutex);
4459 list_del_init(&child_counter->child_list);
4460 mutex_unlock(&parent_counter->child_mutex);
4463 * Release the parent counter, if this was the last
4464 * reference to it.
4466 fput(parent_counter->filp);
4469 static void
4470 __perf_counter_exit_task(struct perf_counter *child_counter,
4471 struct perf_counter_context *child_ctx,
4472 struct task_struct *child)
4474 struct perf_counter *parent_counter;
4476 update_counter_times(child_counter);
4477 perf_counter_remove_from_context(child_counter);
4479 parent_counter = child_counter->parent;
4481 * It can happen that parent exits first, and has counters
4482 * that are still around due to the child reference. These
4483 * counters need to be zapped - but otherwise linger.
4485 if (parent_counter) {
4486 sync_child_counter(child_counter, child);
4487 free_counter(child_counter);
4492 * When a child task exits, feed back counter values to parent counters.
4494 void perf_counter_exit_task(struct task_struct *child)
4496 struct perf_counter *child_counter, *tmp;
4497 struct perf_counter_context *child_ctx;
4498 unsigned long flags;
4500 if (likely(!child->perf_counter_ctxp)) {
4501 perf_counter_task(child, NULL, 0);
4502 return;
4505 local_irq_save(flags);
4507 * We can't reschedule here because interrupts are disabled,
4508 * and either child is current or it is a task that can't be
4509 * scheduled, so we are now safe from rescheduling changing
4510 * our context.
4512 child_ctx = child->perf_counter_ctxp;
4513 __perf_counter_task_sched_out(child_ctx);
4516 * Take the context lock here so that if find_get_context is
4517 * reading child->perf_counter_ctxp, we wait until it has
4518 * incremented the context's refcount before we do put_ctx below.
4520 spin_lock(&child_ctx->lock);
4521 child->perf_counter_ctxp = NULL;
4523 * If this context is a clone; unclone it so it can't get
4524 * swapped to another process while we're removing all
4525 * the counters from it.
4527 unclone_ctx(child_ctx);
4528 spin_unlock_irqrestore(&child_ctx->lock, flags);
4531 * Report the task dead after unscheduling the counters so that we
4532 * won't get any samples after PERF_EVENT_EXIT. We can however still
4533 * get a few PERF_EVENT_READ events.
4534 */
4535 perf_counter_task(child, child_ctx, 0);
4537 /*
4538 * We can recurse on the same lock type through:
4539 *
4540 * __perf_counter_exit_task()
4541 * sync_child_counter()
4542 * fput(parent_counter->filp)
4543 * perf_release()
4544 * mutex_lock(&ctx->mutex)
4545 *
4546 * But since it's the parent context it won't be the same instance.
4547 */
4548 mutex_lock_nested(&child_ctx->mutex, SINGLE_DEPTH_NESTING);
4550 again:
4551 list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list,
4552 list_entry)
4553 __perf_counter_exit_task(child_counter, child_ctx, child);
4555 /*
4556 * If the last counter was a group counter, it will have appended all
4557 * its siblings to the list, but we obtained 'tmp' before that which
4558 * will still point to the list head terminating the iteration.
4559 */
4560 if (!list_empty(&child_ctx->counter_list))
4561 goto again;
4563 mutex_unlock(&child_ctx->mutex);
4565 put_ctx(child_ctx);
4566 }
4568 /*
4569 * Free an unexposed, unused context as created by inheritance in
4570 * perf_counter_init_task() below, used by fork() in case of failure.
4571 */
4572 void perf_counter_free_task(struct task_struct *task)
4573 {
4574 struct perf_counter_context *ctx = task->perf_counter_ctxp;
4575 struct perf_counter *counter, *tmp;
4577 if (!ctx)
4578 return;
4580 mutex_lock(&ctx->mutex);
4581 again:
4582 list_for_each_entry_safe(counter, tmp, &ctx->counter_list, list_entry) {
4583 struct perf_counter *parent = counter->parent;
4585 if (WARN_ON_ONCE(!parent))
4586 continue;
4588 mutex_lock(&parent->child_mutex);
4589 list_del_init(&counter->child_list);
4590 mutex_unlock(&parent->child_mutex);
4592 fput(parent->filp);
4594 list_del_counter(counter, ctx);
4595 free_counter(counter);
4596 }
4598 if (!list_empty(&ctx->counter_list))
4599 goto again;
4601 mutex_unlock(&ctx->mutex);
4603 put_ctx(ctx);
4604 }
4606 /*
4607 * Initialize the perf_counter context in task_struct
4608 */
4609 int perf_counter_init_task(struct task_struct *child)
4610 {
4611 struct perf_counter_context *child_ctx, *parent_ctx;
4612 struct perf_counter_context *cloned_ctx;
4613 struct perf_counter *counter;
4614 struct task_struct *parent = current;
4615 int inherited_all = 1;
4616 int ret = 0;
4618 child->perf_counter_ctxp = NULL;
4620 mutex_init(&child->perf_counter_mutex);
4621 INIT_LIST_HEAD(&child->perf_counter_list);
4623 if (likely(!parent->perf_counter_ctxp))
4624 return 0;
4626 /*
4627 * This is executed from the parent task context, so inherit
4628 * counters that have been marked for cloning.
4629 * First allocate and initialize a context for the child.
4630 */
4632 child_ctx = kmalloc(sizeof(struct perf_counter_context), GFP_KERNEL);
4633 if (!child_ctx)
4634 return -ENOMEM;
4636 __perf_counter_init_context(child_ctx, child);
4637 child->perf_counter_ctxp = child_ctx;
4638 get_task_struct(child);
4640 /*
4641 * If the parent's context is a clone, pin it so it won't get
4642 * swapped under us.
4643 */
4644 parent_ctx = perf_pin_task_context(parent);
4646 /*
4647 * No need to check if parent_ctx != NULL here; since we saw
4648 * it non-NULL earlier, the only reason for it to become NULL
4649 * is if we exit, and since we're currently in the middle of
4650 * a fork we can't be exiting at the same time.
4651 */
4653 /*
4654 * Lock the parent list. No need to lock the child - not PID
4655 * hashed yet and not running, so nobody can access it.
4656 */
4657 mutex_lock(&parent_ctx->mutex);
4659 /*
4660 * We don't have to disable NMIs - we are only looking at
4661 * the list, not manipulating it:
4662 */
4663 list_for_each_entry_rcu(counter, &parent_ctx->event_list, event_entry) {
4664 if (counter != counter->group_leader)
4665 continue;
4667 if (!counter->attr.inherit) {
4668 inherited_all = 0;
4669 continue;
4670 }
4672 ret = inherit_group(counter, parent, parent_ctx,
4673 child, child_ctx);
4674 if (ret) {
4675 inherited_all = 0;
4676 break;
4677 }
4678 }
4680 if (inherited_all) {
4681 /*
4682 * Mark the child context as a clone of the parent
4683 * context, or of whatever the parent is a clone of.
4684 * Note that if the parent is a clone, it could get
4685 * uncloned at any point, but that doesn't matter
4686 * because the list of counters and the generation
4687 * count can't have changed since we took the mutex.
4688 */
4689 cloned_ctx = rcu_dereference(parent_ctx->parent_ctx);
4690 if (cloned_ctx) {
4691 child_ctx->parent_ctx = cloned_ctx;
4692 child_ctx->parent_gen = parent_ctx->parent_gen;
4693 } else {
4694 child_ctx->parent_ctx = parent_ctx;
4695 child_ctx->parent_gen = parent_ctx->generation;
4696 }
4697 get_ctx(child_ctx->parent_ctx);
4698 }
4700 mutex_unlock(&parent_ctx->mutex);
4702 perf_unpin_context(parent_ctx);
4704 return ret;
4705 }
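All of the inheritance machinery above is what backs the attr.inherit (and attr.inherit_stat) bits seen by userspace: counters marked for cloning are duplicated into each forked child and their values fed back on exit. The following userspace sketch is only an illustration of that behaviour, not part of this file; it assumes headers of this vintage, i.e. that <linux/perf_counter.h> exposes struct perf_counter_attr and PERF_COUNT_HW_INSTRUCTIONS, and that the architecture defines __NR_perf_counter_open.

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <sys/syscall.h>
#include <linux/perf_counter.h>

/* There was no libc wrapper for this syscall; tools/perf used syscall(). */
static int sys_perf_counter_open(struct perf_counter_attr *attr, pid_t pid,
				 int cpu, int group_fd, unsigned long flags)
{
	return syscall(__NR_perf_counter_open, attr, pid, cpu, group_fd, flags);
}

int main(void)
{
	struct perf_counter_attr attr;
	unsigned long long count;
	volatile int i;
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_HARDWARE;
	attr.config = PERF_COUNT_HW_INSTRUCTIONS;
	attr.inherit = 1;	/* clone the counter into forked children */
	attr.inherit_stat = 1;	/* fold child values back when they exit */

	fd = sys_perf_counter_open(&attr, 0 /* this task */, -1, -1, 0);
	if (fd < 0) {
		perror("perf_counter_open");
		return 1;
	}

	if (fork() == 0) {	/* child runs with an inherited counter */
		for (i = 0; i < 1000000; i++)
			;
		_exit(0);
	}
	wait(NULL);

	/* With the default read_format, read() returns a single u64 that
	 * already includes the exited child's contribution. */
	if (read(fd, &count, sizeof(count)) == sizeof(count))
		printf("instructions (parent + child): %llu\n", count);

	close(fd);
	return 0;
}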
4707 static void __cpuinit perf_counter_init_cpu(int cpu)
4708 {
4709 struct perf_cpu_context *cpuctx;
4711 cpuctx = &per_cpu(perf_cpu_context, cpu);
4712 __perf_counter_init_context(&cpuctx->ctx, NULL);
4714 spin_lock(&perf_resource_lock);
4715 cpuctx->max_pertask = perf_max_counters - perf_reserved_percpu;
4716 spin_unlock(&perf_resource_lock);
4718 hw_perf_counter_setup(cpu);
4719 }
4721 #ifdef CONFIG_HOTPLUG_CPU
4722 static void __perf_counter_exit_cpu(void *info)
4723 {
4724 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
4725 struct perf_counter_context *ctx = &cpuctx->ctx;
4726 struct perf_counter *counter, *tmp;
4728 list_for_each_entry_safe(counter, tmp, &ctx->counter_list, list_entry)
4729 __perf_counter_remove_from_context(counter);
4730 }
4731 static void perf_counter_exit_cpu(int cpu)
4732 {
4733 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
4734 struct perf_counter_context *ctx = &cpuctx->ctx;
4736 mutex_lock(&ctx->mutex);
4737 smp_call_function_single(cpu, __perf_counter_exit_cpu, NULL, 1);
4738 mutex_unlock(&ctx->mutex);
4739 }
4740 #else
4741 static inline void perf_counter_exit_cpu(int cpu) { }
4742 #endif
4744 static int __cpuinit
4745 perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
4746 {
4747 unsigned int cpu = (long)hcpu;
4749 switch (action) {
4751 case CPU_UP_PREPARE:
4752 case CPU_UP_PREPARE_FROZEN:
4753 perf_counter_init_cpu(cpu);
4754 break;
4756 case CPU_ONLINE:
4757 case CPU_ONLINE_FROZEN:
4758 hw_perf_counter_setup_online(cpu);
4759 break;
4761 case CPU_DOWN_PREPARE:
4762 case CPU_DOWN_PREPARE_FROZEN:
4763 perf_counter_exit_cpu(cpu);
4764 break;
4766 default:
4767 break;
4768 }
4770 return NOTIFY_OK;
4771 }
4773 /*
4774 * This has to have a higher priority than migration_notifier in sched.c.
4775 */
4776 static struct notifier_block __cpuinitdata perf_cpu_nb = {
4777 .notifier_call = perf_cpu_notify,
4778 .priority = 20,
4779 };
4781 void __init perf_counter_init(void)
4782 {
4783 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
4784 (void *)(long)smp_processor_id());
4785 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE,
4786 (void *)(long)smp_processor_id());
4787 register_cpu_notifier(&perf_cpu_nb);
4788 }
4790 static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf)
4791 {
4792 return sprintf(buf, "%d\n", perf_reserved_percpu);
4793 }
4795 static ssize_t
4796 perf_set_reserve_percpu(struct sysdev_class *class,
4797 const char *buf,
4798 size_t count)
4799 {
4800 struct perf_cpu_context *cpuctx;
4801 unsigned long val;
4802 int err, cpu, mpt;
4804 err = strict_strtoul(buf, 10, &val);
4805 if (err)
4806 return err;
4807 if (val > perf_max_counters)
4808 return -EINVAL;
4810 spin_lock(&perf_resource_lock);
4811 perf_reserved_percpu = val;
4812 for_each_online_cpu(cpu) {
4813 cpuctx = &per_cpu(perf_cpu_context, cpu);
4814 spin_lock_irq(&cpuctx->ctx.lock);
4815 mpt = min(perf_max_counters - cpuctx->ctx.nr_counters,
4816 perf_max_counters - perf_reserved_percpu);
4817 cpuctx->max_pertask = mpt;
4818 spin_unlock_irq(&cpuctx->ctx.lock);
4819 }
4820 spin_unlock(&perf_resource_lock);
4822 return count;
4823 }
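The store handler above recomputes each online CPU's max_pertask as the smaller of the counters still free on that CPU and the counters not reserved system-wide. A standalone sketch of that clamp with made-up numbers (all values here are hypothetical, chosen only to show the arithmetic):

#include <stdio.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))

int main(void)
{
	int max_counters = 8;		/* hypothetical hardware limit */
	int nr_counters_on_cpu = 3;	/* counters already on this CPU */
	int reserved_percpu = 2;	/* value just written to reserve_percpu */

	int mpt = MIN(max_counters - nr_counters_on_cpu,
		      max_counters - reserved_percpu);

	printf("max_pertask = min(%d, %d) = %d\n",
	       max_counters - nr_counters_on_cpu,
	       max_counters - reserved_percpu, mpt);
	return 0;
}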
4825 static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf)
4826 {
4827 return sprintf(buf, "%d\n", perf_overcommit);
4828 }
4830 static ssize_t
4831 perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count)
4832 {
4833 unsigned long val;
4834 int err;
4836 err = strict_strtoul(buf, 10, &val);
4837 if (err)
4838 return err;
4839 if (val > 1)
4840 return -EINVAL;
4842 spin_lock(&perf_resource_lock);
4843 perf_overcommit = val;
4844 spin_unlock(&perf_resource_lock);
4846 return count;
4847 }
4849 static SYSDEV_CLASS_ATTR(
4850 reserve_percpu,
4851 0644,
4852 perf_show_reserve_percpu,
4853 perf_set_reserve_percpu
4854 );
4856 static SYSDEV_CLASS_ATTR(
4857 overcommit,
4858 0644,
4859 perf_show_overcommit,
4860 perf_set_overcommit
4861 );
4863 static struct attribute *perfclass_attrs[] = {
4864 &attr_reserve_percpu.attr,
4865 &attr_overcommit.attr,
4866 NULL
4867 };
4869 static struct attribute_group perfclass_attr_group = {
4870 .attrs = perfclass_attrs,
4871 .name = "perf_counters",
4872 };
4874 static int __init perf_counter_sysfs_init(void)
4875 {
4876 return sysfs_create_group(&cpu_sysdev_class.kset.kobj,
4877 &perfclass_attr_group);
4878 }
4879 device_initcall(perf_counter_sysfs_init);
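Once this initcall has run, the two class attributes defined above should show up under the cpu sysdev class; on a typical system that would be /sys/devices/system/cpu/perf_counters/reserve_percpu and .../overcommit (the exact path is an assumption based on cpu_sysdev_class, not something this file spells out). A minimal userspace sketch for inspecting and updating reserve_percpu:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

/* Assumed location of the "perf_counters" attribute group created above. */
#define RESERVE_PATH "/sys/devices/system/cpu/perf_counters/reserve_percpu"

int main(void)
{
	char buf[16];
	ssize_t n;
	int fd;

	fd = open(RESERVE_PATH, O_RDWR);
	if (fd < 0) {
		perror(RESERVE_PATH);
		return 1;
	}

	n = read(fd, buf, sizeof(buf) - 1);
	if (n > 0) {
		buf[n] = '\0';
		printf("reserve_percpu is currently %s", buf);
	}

	/* Reserve one counter slot per CPU; perf_set_reserve_percpu()
	 * rejects values larger than perf_max_counters with -EINVAL. */
	lseek(fd, 0, SEEK_SET);
	if (write(fd, "1\n", 2) != 2)
		perror("write");

	close(fd);
	return 0;
}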