// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2021, Microsoft Corporation.
 *
 * Beau Belgrave <beaub@linux.microsoft.com>
 */

#include <linux/bitmap.h>
#include <linux/cdev.h>
#include <linux/hashtable.h>
#include <linux/list.h>
#include <linux/uio.h>
#include <linux/ioctl.h>
#include <linux/jhash.h>
#include <linux/refcount.h>
#include <linux/trace_events.h>
#include <linux/tracefs.h>
#include <linux/types.h>
#include <linux/uaccess.h>
#include <linux/highmem.h>
#include <linux/init.h>
#include <linux/user_events.h>
#include "trace_dynevent.h"
#include "trace_output.h"
#define USER_EVENTS_PREFIX_LEN (sizeof(USER_EVENTS_PREFIX)-1)

#define FIELD_DEPTH_TYPE 0
#define FIELD_DEPTH_NAME 1
#define FIELD_DEPTH_SIZE 2

/* Limit how long an event name plus args can be within the subsystem. */
#define MAX_EVENT_DESC 512
#define EVENT_NAME(user_event) ((user_event)->reg_name)
#define EVENT_TP_NAME(user_event) ((user_event)->tracepoint.name)
#define MAX_FIELD_ARRAY_SIZE 1024

/*
 * Internal bits (kernel side only) to keep track of connected probes:
 * These are used when status is requested in text form about an event. These
 * bits are compared against an internal byte on the event to determine which
 * probes to print out to the user.
 *
 * These do not reflect the mapped bytes between the user and kernel space.
 */
#define EVENT_STATUS_FTRACE BIT(0)
#define EVENT_STATUS_PERF BIT(1)
#define EVENT_STATUS_OTHER BIT(7)
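
/*
 * Illustrative example (editorial addition, not from the original source):
 * an event attached to both an ftrace probe and a perf probe carries a
 * status byte of (EVENT_STATUS_FTRACE | EVENT_STATUS_PERF) == 0x03, which
 * user_seq_show() renders as " Used by ftrace perf". A probe that is
 * neither ftrace nor perf is reported via EVENT_STATUS_OTHER.
 */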
/*
 * Stores the system name, tables, and locks for a group of events. This
 * allows isolation for events by various means.
 */
struct user_event_group {
	char *system_name;
	char *system_multi_name;
	struct hlist_node node;
	struct mutex reg_mutex;
	DECLARE_HASHTABLE(register_table, 8);
	/* ID that moves forward within the group for multi-event names */
	u64 multi_id;
};

/* Group for init_user_ns mapping, top-most group */
static struct user_event_group *init_group;

/* Max allowed events for the whole system */
static unsigned int max_user_events = 32768;

/* Current number of events on the whole system */
static unsigned int current_user_events;
/*
 * Stores per-event properties, as users register events
 * within a file a user_event might be created if it does not
 * already exist. These are globally used and their lifetime
 * is tied to the refcnt member. These cannot go away until the
 * refcnt reaches one.
 */
struct user_event {
	struct user_event_group *group;
	char *reg_name;
	struct tracepoint tracepoint;
	struct trace_event_call call;
	struct trace_event_class class;
	struct dyn_event devent;
	struct hlist_node node;
	struct list_head fields;
	struct list_head validators;
	struct work_struct put_work;
	refcount_t refcnt;
	int min_size;
	int reg_flags;
	char status;
};
/*
 * Stores per-mm/event properties that enable an address to be
 * updated properly for each task. As tasks are forked, we use
 * these to track enablement sites that are tied to an event.
 */
struct user_event_enabler {
	struct list_head mm_enablers_link;
	struct user_event *event;
	unsigned long addr;

	/* Track enable bit, flags, etc. Aligned for bitops. */
	unsigned long values;
};
/* Bits 0-5 are for the bit to update upon enable/disable (0-63 allowed) */
#define ENABLE_VAL_BIT_MASK 0x3F

/* Bit 6 is for faulting status of enablement */
#define ENABLE_VAL_FAULTING_BIT 6

/* Bit 7 is for freeing status of enablement */
#define ENABLE_VAL_FREEING_BIT 7

/* Bit 8 is for marking 32-bit on 64-bit */
#define ENABLE_VAL_32_ON_64_BIT 8

#define ENABLE_VAL_COMPAT_MASK (1 << ENABLE_VAL_32_ON_64_BIT)

/* Only duplicate the bit and compat values */
#define ENABLE_VAL_DUP_MASK (ENABLE_VAL_BIT_MASK | ENABLE_VAL_COMPAT_MASK)

#define ENABLE_BITOPS(e) (&(e)->values)

#define ENABLE_BIT(e) ((int)((e)->values & ENABLE_VAL_BIT_MASK))

#define EVENT_MULTI_FORMAT(f) ((f) & USER_EVENT_REG_MULTI_FORMAT)
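
/*
 * Illustrative sketch (editorial addition, not from the original source):
 * for an enabler registered with enable_bit == 5 for a 4-byte value on a
 * 64-bit kernel, the values word is laid out as:
 *
 *   values = 5                               (bits 0-5: bit to flip)
 *          | (1 << ENABLE_VAL_32_ON_64_BIT)  (bit 8: 32-on-64 compat marker)
 *
 * ENABLE_BIT() then yields 5, and ENABLE_VAL_DUP_MASK preserves exactly
 * these two pieces when enablers are duplicated across fork() in
 * user_event_enabler_dup().
 */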
/* Used for asynchronous faulting in of pages */
struct user_event_enabler_fault {
	struct work_struct work;
	struct user_event_mm *mm;
	struct user_event_enabler *enabler;
	int attempt;
};

static struct kmem_cache *fault_cache;

/* Global list of memory descriptors using user_events */
static LIST_HEAD(user_event_mms);
static DEFINE_SPINLOCK(user_event_mms_lock);
/*
 * Stores per-file events references, as users register events
 * within a file this structure is modified and freed via RCU.
 * The lifetime of this struct is tied to the lifetime of the file.
 * These are not shared and only accessible by the file that created it.
 */
struct user_event_refs {
	struct rcu_head rcu;
	int count;
	struct user_event *events[];
};

struct user_event_file_info {
	struct user_event_group *group;
	struct user_event_refs *refs;
};

#define VALIDATOR_ENSURE_NULL (1 << 0)
#define VALIDATOR_REL (1 << 1)

struct user_event_validator {
	struct list_head user_event_link;
	int offset;
	int flags;
};
static inline void align_addr_bit(unsigned long *addr, int *bit,
				  unsigned long *flags)
{
	if (IS_ALIGNED(*addr, sizeof(long))) {
#ifdef __BIG_ENDIAN
		/* 32 bit on BE 64 bit requires a 32 bit offset when aligned. */
		if (test_bit(ENABLE_VAL_32_ON_64_BIT, flags))
			*bit += 32;
#endif
		return;
	}

	*addr = ALIGN_DOWN(*addr, sizeof(long));

	/*
	 * We only support 32 and 64 bit values. The only time we need
	 * to align is a 32 bit value on a 64 bit kernel, which on LE
	 * is always 32 bits, and on BE requires no change when unaligned.
	 */
#ifdef __LITTLE_ENDIAN
	*bit += 32;
#endif
}
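
/*
 * Worked example (editorial addition, not from the original source): on a
 * little-endian 64-bit kernel, a 32-bit enabler at uaddr 0x1004 with bit 3
 * is not long-aligned, so the address is rounded down to 0x1000 and the bit
 * becomes 35, which addresses the same bit within the full aligned word.
 */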
typedef void (*user_event_func_t) (struct user_event *user, struct iov_iter *i,
				   void *tpdata, bool *faulted);

static int user_event_parse(struct user_event_group *group, char *name,
			    char *args, char *flags,
			    struct user_event **newuser, int reg_flags);

static struct user_event_mm *user_event_mm_get(struct user_event_mm *mm);
static struct user_event_mm *user_event_mm_get_all(struct user_event *user);
static void user_event_mm_put(struct user_event_mm *mm);
static int destroy_user_event(struct user_event *user);
static bool user_fields_match(struct user_event *user, int argc,
			      const char **argv);
static u32 user_event_key(char *name)
{
	return jhash(name, strlen(name), 0);
}
static bool user_event_capable(u16 reg_flags)
{
	/* Persistent events require CAP_PERFMON / CAP_SYS_ADMIN */
	if (reg_flags & USER_EVENT_REG_PERSIST) {
		if (!perfmon_capable())
			return false;
	}

	return true;
}
static struct user_event *user_event_get(struct user_event *user)
{
	refcount_inc(&user->refcnt);

	return user;
}
static void delayed_destroy_user_event(struct work_struct *work)
{
	struct user_event *user = container_of(
		work, struct user_event, put_work);

	mutex_lock(&event_mutex);

	if (!refcount_dec_and_test(&user->refcnt))
		goto out;

	if (destroy_user_event(user)) {
		/*
		 * The only reason this would fail here is if we cannot
		 * update the visibility of the event. In this case the
		 * event stays in the hashtable, waiting for someone to
		 * attempt to delete it later.
		 */
		pr_warn("user_events: Unable to delete event\n");
		refcount_set(&user->refcnt, 1);
	}
out:
	mutex_unlock(&event_mutex);
}
static void user_event_put(struct user_event *user, bool locked)
{
	bool delete;

	/*
	 * When the event is not enabled for auto-delete there will always
	 * be at least 1 reference to the event. During the event creation
	 * we initially set the refcnt to 2 to achieve this. In those cases
	 * the caller must acquire event_mutex and after decrement check if
	 * the refcnt is 1, meaning this is the last reference. When auto
	 * delete is enabled, there will only be 1 ref, i.e. the refcnt will
	 * only be set to 1 during creation to allow the below checks to go
	 * through upon the last put. The last put must always be done with
	 * the event mutex held.
	 */
	if (!locked) {
		lockdep_assert_not_held(&event_mutex);
		delete = refcount_dec_and_mutex_lock(&user->refcnt, &event_mutex);
	} else {
		lockdep_assert_held(&event_mutex);
		delete = refcount_dec_and_test(&user->refcnt);
	}

	if (!delete)
		return;
	/*
	 * We now have the event_mutex in all cases, which ensures that
	 * no new references will be taken until event_mutex is released.
	 * New references come through find_user_event(), which requires
	 * the event_mutex to be held.
	 */
	if (user->reg_flags & USER_EVENT_REG_PERSIST) {
		/* We should not get here when persist flag is set */
		pr_alert("BUG: Auto-delete engaged on persistent event\n");
		goto out;
	}
	/*
	 * Unfortunately we have to attempt the actual destroy in a work
	 * queue. This is because not all cases handle a trace_event_call
	 * being removed within the class->reg() operation for unregister.
	 */
	INIT_WORK(&user->put_work, delayed_destroy_user_event);
	/*
	 * Since the event is still in the hashtable, we have to re-inc
	 * the ref count to 1. This count will be decremented and checked
	 * in the work queue to ensure it's still the last ref. This is
	 * needed because a user-process could register the same event in
	 * between the time of event_mutex release and the work queue
	 * running the delayed destroy. If we removed the item now from
	 * the hashtable, this would result in a timing window where a
	 * user process would fail a register because the trace_event_call
	 * register would fail in the tracing layers.
	 */
	refcount_set(&user->refcnt, 1);
	if (WARN_ON_ONCE(!schedule_work(&user->put_work))) {
		/*
		 * If we fail we must wait for an admin to attempt delete or
		 * another register/close of the event, whichever is first.
		 */
		pr_warn("user_events: Unable to queue delayed destroy\n");
	}
out:
	/* Ensure we unlock event_mutex if we did not hold it on entry */
	if (!locked)
		mutex_unlock(&event_mutex);
}
337 static void user_event_group_destroy(struct user_event_group
*group
)
339 kfree(group
->system_name
);
340 kfree(group
->system_multi_name
);
344 static char *user_event_group_system_name(void)
347 int len
= sizeof(USER_EVENTS_SYSTEM
) + 1;
349 system_name
= kmalloc(len
, GFP_KERNEL
);
354 snprintf(system_name
, len
, "%s", USER_EVENTS_SYSTEM
);
359 static char *user_event_group_system_multi_name(void)
361 return kstrdup(USER_EVENTS_MULTI_SYSTEM
, GFP_KERNEL
);
364 static struct user_event_group
*current_user_event_group(void)
369 static struct user_event_group
*user_event_group_create(void)
371 struct user_event_group
*group
;
373 group
= kzalloc(sizeof(*group
), GFP_KERNEL
);
378 group
->system_name
= user_event_group_system_name();
380 if (!group
->system_name
)
383 group
->system_multi_name
= user_event_group_system_multi_name();
385 if (!group
->system_multi_name
)
388 mutex_init(&group
->reg_mutex
);
389 hash_init(group
->register_table
);
394 user_event_group_destroy(group
);
399 static void user_event_enabler_destroy(struct user_event_enabler
*enabler
,
402 list_del_rcu(&enabler
->mm_enablers_link
);
404 /* No longer tracking the event via the enabler */
405 user_event_put(enabler
->event
, locked
);
410 static int user_event_mm_fault_in(struct user_event_mm
*mm
, unsigned long uaddr
,
417 * Normally this is low, ensure that it cannot be taken advantage of by
418 * bad user processes to cause excessive looping.
423 mmap_read_lock(mm
->mm
);
425 /* Ensure MM has tasks, cannot use after exit_mm() */
426 if (refcount_read(&mm
->tasks
) == 0) {
431 ret
= fixup_user_fault(mm
->mm
, uaddr
, FAULT_FLAG_WRITE
| FAULT_FLAG_REMOTE
,
434 mmap_read_unlock(mm
->mm
);
439 static int user_event_enabler_write(struct user_event_mm
*mm
,
440 struct user_event_enabler
*enabler
,
441 bool fixup_fault
, int *attempt
);
443 static void user_event_enabler_fault_fixup(struct work_struct
*work
)
445 struct user_event_enabler_fault
*fault
= container_of(
446 work
, struct user_event_enabler_fault
, work
);
447 struct user_event_enabler
*enabler
= fault
->enabler
;
448 struct user_event_mm
*mm
= fault
->mm
;
449 unsigned long uaddr
= enabler
->addr
;
450 int attempt
= fault
->attempt
;
453 ret
= user_event_mm_fault_in(mm
, uaddr
, attempt
);
455 if (ret
&& ret
!= -ENOENT
) {
456 struct user_event
*user
= enabler
->event
;
458 pr_warn("user_events: Fault for mm: 0x%pK @ 0x%llx event: %s\n",
459 mm
->mm
, (unsigned long long)uaddr
, EVENT_NAME(user
));
462 /* Prevent state changes from racing */
463 mutex_lock(&event_mutex
);
465 /* User asked for enabler to be removed during fault */
466 if (test_bit(ENABLE_VAL_FREEING_BIT
, ENABLE_BITOPS(enabler
))) {
467 user_event_enabler_destroy(enabler
, true);
	/*
	 * If we managed to get the page, re-issue the write. We do not
	 * want to get into a possible infinite loop, which is why we only
	 * attempt again directly if the page came in. If we couldn't get
	 * the page here, then we will try again the next time the event is
	 * enabled/disabled.
	 */
478 clear_bit(ENABLE_VAL_FAULTING_BIT
, ENABLE_BITOPS(enabler
));
481 mmap_read_lock(mm
->mm
);
482 user_event_enabler_write(mm
, enabler
, true, &attempt
);
483 mmap_read_unlock(mm
->mm
);
486 mutex_unlock(&event_mutex
);
488 /* In all cases we no longer need the mm or fault */
489 user_event_mm_put(mm
);
490 kmem_cache_free(fault_cache
, fault
);
493 static bool user_event_enabler_queue_fault(struct user_event_mm
*mm
,
494 struct user_event_enabler
*enabler
,
497 struct user_event_enabler_fault
*fault
;
499 fault
= kmem_cache_zalloc(fault_cache
, GFP_NOWAIT
| __GFP_NOWARN
);
504 INIT_WORK(&fault
->work
, user_event_enabler_fault_fixup
);
505 fault
->mm
= user_event_mm_get(mm
);
506 fault
->enabler
= enabler
;
507 fault
->attempt
= attempt
;
509 /* Don't try to queue in again while we have a pending fault */
510 set_bit(ENABLE_VAL_FAULTING_BIT
, ENABLE_BITOPS(enabler
));
512 if (!schedule_work(&fault
->work
)) {
513 /* Allow another attempt later */
514 clear_bit(ENABLE_VAL_FAULTING_BIT
, ENABLE_BITOPS(enabler
));
516 user_event_mm_put(mm
);
517 kmem_cache_free(fault_cache
, fault
);
525 static int user_event_enabler_write(struct user_event_mm
*mm
,
526 struct user_event_enabler
*enabler
,
527 bool fixup_fault
, int *attempt
)
529 unsigned long uaddr
= enabler
->addr
;
533 int bit
= ENABLE_BIT(enabler
);
536 lockdep_assert_held(&event_mutex
);
537 mmap_assert_locked(mm
->mm
);
541 /* Ensure MM has tasks, cannot use after exit_mm() */
542 if (refcount_read(&mm
->tasks
) == 0)
545 if (unlikely(test_bit(ENABLE_VAL_FAULTING_BIT
, ENABLE_BITOPS(enabler
)) ||
546 test_bit(ENABLE_VAL_FREEING_BIT
, ENABLE_BITOPS(enabler
))))
549 align_addr_bit(&uaddr
, &bit
, ENABLE_BITOPS(enabler
));
551 ret
= pin_user_pages_remote(mm
->mm
, uaddr
, 1, FOLL_WRITE
| FOLL_NOFAULT
,
554 if (unlikely(ret
<= 0)) {
558 if (!user_event_enabler_queue_fault(mm
, enabler
, *attempt
))
559 pr_warn("user_events: Unable to queue fault handler\n");
564 kaddr
= kmap_local_page(page
);
565 ptr
= kaddr
+ (uaddr
& ~PAGE_MASK
);
567 /* Update bit atomically, user tracers must be atomic as well */
568 if (enabler
->event
&& enabler
->event
->status
)
574 unpin_user_pages_dirty_lock(&page
, 1, true);
579 static bool user_event_enabler_exists(struct user_event_mm
*mm
,
580 unsigned long uaddr
, unsigned char bit
)
582 struct user_event_enabler
*enabler
;
584 list_for_each_entry(enabler
, &mm
->enablers
, mm_enablers_link
) {
585 if (enabler
->addr
== uaddr
&& ENABLE_BIT(enabler
) == bit
)
592 static void user_event_enabler_update(struct user_event
*user
)
594 struct user_event_enabler
*enabler
;
595 struct user_event_mm
*next
;
596 struct user_event_mm
*mm
;
599 lockdep_assert_held(&event_mutex
);
	/*
	 * We need to build a one-shot list of all the mms that have an
	 * enabler for the user_event passed in. This list is only valid
	 * while holding the event_mutex. The only reason for this is that
	 * the global mm list is RCU protected and we use methods which
	 * can wait (mmap_read_lock and pin_user_pages_remote).
	 *
	 * NOTE: user_event_mm_get_all() increments the ref count of each
	 * mm that is added to the list to prevent removal timing windows.
	 * We must always put each mm after they are used, which may wait.
	 */
612 mm
= user_event_mm_get_all(user
);
616 mmap_read_lock(mm
->mm
);
618 list_for_each_entry(enabler
, &mm
->enablers
, mm_enablers_link
) {
619 if (enabler
->event
== user
) {
621 user_event_enabler_write(mm
, enabler
, true, &attempt
);
625 mmap_read_unlock(mm
->mm
);
626 user_event_mm_put(mm
);
631 static bool user_event_enabler_dup(struct user_event_enabler
*orig
,
632 struct user_event_mm
*mm
)
634 struct user_event_enabler
*enabler
;
636 /* Skip pending frees */
637 if (unlikely(test_bit(ENABLE_VAL_FREEING_BIT
, ENABLE_BITOPS(orig
))))
640 enabler
= kzalloc(sizeof(*enabler
), GFP_NOWAIT
| __GFP_ACCOUNT
);
645 enabler
->event
= user_event_get(orig
->event
);
646 enabler
->addr
= orig
->addr
;
648 /* Only dup part of value (ignore future flags, etc) */
649 enabler
->values
= orig
->values
& ENABLE_VAL_DUP_MASK
;
651 /* Enablers not exposed yet, RCU not required */
652 list_add(&enabler
->mm_enablers_link
, &mm
->enablers
);
657 static struct user_event_mm
*user_event_mm_get(struct user_event_mm
*mm
)
659 refcount_inc(&mm
->refcnt
);
664 static struct user_event_mm
*user_event_mm_get_all(struct user_event
*user
)
666 struct user_event_mm
*found
= NULL
;
667 struct user_event_enabler
*enabler
;
668 struct user_event_mm
*mm
;
671 * We use the mm->next field to build a one-shot list from the global
672 * RCU protected list. To build this list the event_mutex must be held.
673 * This lets us build a list without requiring allocs that could fail
674 * when user based events are most wanted for diagnostics.
676 lockdep_assert_held(&event_mutex
);
	/*
	 * We do not want to block fork/exec while enablements are being
	 * updated, so we use RCU to walk the current tasks that have used
	 * user_events ABI for 1 or more events. Each enabler found in each
	 * task that matches the event being updated has a write to reflect
	 * the kernel state back into the process. Waits/faults must not occur
	 * during this. So we scan the list under RCU for all the mm that have
	 * the event within it. This is needed because mmap_read_lock() can wait.
	 * Each user mm returned has a ref inc to handle remove RCU races.
	 */
690 list_for_each_entry_rcu(mm
, &user_event_mms
, mms_link
) {
691 list_for_each_entry_rcu(enabler
, &mm
->enablers
, mm_enablers_link
) {
692 if (enabler
->event
== user
) {
694 found
= user_event_mm_get(mm
);
705 static struct user_event_mm
*user_event_mm_alloc(struct task_struct
*t
)
707 struct user_event_mm
*user_mm
;
709 user_mm
= kzalloc(sizeof(*user_mm
), GFP_KERNEL_ACCOUNT
);
715 INIT_LIST_HEAD(&user_mm
->enablers
);
716 refcount_set(&user_mm
->refcnt
, 1);
717 refcount_set(&user_mm
->tasks
, 1);
720 * The lifetime of the memory descriptor can slightly outlast
721 * the task lifetime if a ref to the user_event_mm is taken
722 * between list_del_rcu() and call_rcu(). Therefore we need
723 * to take a reference to it to ensure it can live this long
724 * under this corner case. This can also occur in clones that
725 * outlast the parent.
732 static void user_event_mm_attach(struct user_event_mm
*user_mm
, struct task_struct
*t
)
736 spin_lock_irqsave(&user_event_mms_lock
, flags
);
737 list_add_rcu(&user_mm
->mms_link
, &user_event_mms
);
738 spin_unlock_irqrestore(&user_event_mms_lock
, flags
);
740 t
->user_event_mm
= user_mm
;
743 static struct user_event_mm
*current_user_event_mm(void)
745 struct user_event_mm
*user_mm
= current
->user_event_mm
;
750 user_mm
= user_event_mm_alloc(current
);
755 user_event_mm_attach(user_mm
, current
);
757 refcount_inc(&user_mm
->refcnt
);
762 static void user_event_mm_destroy(struct user_event_mm
*mm
)
764 struct user_event_enabler
*enabler
, *next
;
766 list_for_each_entry_safe(enabler
, next
, &mm
->enablers
, mm_enablers_link
)
767 user_event_enabler_destroy(enabler
, false);
773 static void user_event_mm_put(struct user_event_mm
*mm
)
775 if (mm
&& refcount_dec_and_test(&mm
->refcnt
))
776 user_event_mm_destroy(mm
);
779 static void delayed_user_event_mm_put(struct work_struct
*work
)
781 struct user_event_mm
*mm
;
783 mm
= container_of(to_rcu_work(work
), struct user_event_mm
, put_rwork
);
784 user_event_mm_put(mm
);
787 void user_event_mm_remove(struct task_struct
*t
)
789 struct user_event_mm
*mm
;
794 mm
= t
->user_event_mm
;
795 t
->user_event_mm
= NULL
;
797 /* Clone will increment the tasks, only remove if last clone */
798 if (!refcount_dec_and_test(&mm
->tasks
))
801 /* Remove the mm from the list, so it can no longer be enabled */
802 spin_lock_irqsave(&user_event_mms_lock
, flags
);
803 list_del_rcu(&mm
->mms_link
);
804 spin_unlock_irqrestore(&user_event_mms_lock
, flags
);
807 * We need to wait for currently occurring writes to stop within
808 * the mm. This is required since exit_mm() snaps the current rss
809 * stats and clears them. On the final mmdrop(), check_mm() will
810 * report a bug if these increment.
812 * All writes/pins are done under mmap_read lock, take the write
813 * lock to ensure in-progress faults have completed. Faults that
814 * are pending but yet to run will check the task count and skip
815 * the fault since the mm is going away.
817 mmap_write_lock(mm
->mm
);
818 mmap_write_unlock(mm
->mm
);
821 * Put for mm must be done after RCU delay to handle new refs in
822 * between the list_del_rcu() and now. This ensures any get refs
823 * during rcu_read_lock() are accounted for during list removal.
826 * ---------------------------------------------------------------
827 * user_event_mm_remove() | rcu_read_lock();
828 * list_del_rcu() | list_for_each_entry_rcu();
829 * call_rcu() | refcount_inc();
830 * . | rcu_read_unlock();
831 * schedule_work() | .
832 * user_event_mm_put() | .
834 * mmdrop() cannot be called in the softirq context of call_rcu()
835 * so we use a work queue after call_rcu() to run within.
837 INIT_RCU_WORK(&mm
->put_rwork
, delayed_user_event_mm_put
);
838 queue_rcu_work(system_wq
, &mm
->put_rwork
);
841 void user_event_mm_dup(struct task_struct
*t
, struct user_event_mm
*old_mm
)
843 struct user_event_mm
*mm
= user_event_mm_alloc(t
);
844 struct user_event_enabler
*enabler
;
851 list_for_each_entry_rcu(enabler
, &old_mm
->enablers
, mm_enablers_link
) {
852 if (!user_event_enabler_dup(enabler
, mm
))
858 user_event_mm_attach(mm
, t
);
862 user_event_mm_destroy(mm
);
865 static bool current_user_event_enabler_exists(unsigned long uaddr
,
868 struct user_event_mm
*user_mm
= current_user_event_mm();
874 exists
= user_event_enabler_exists(user_mm
, uaddr
, bit
);
876 user_event_mm_put(user_mm
);
881 static struct user_event_enabler
882 *user_event_enabler_create(struct user_reg
*reg
, struct user_event
*user
,
885 struct user_event_enabler
*enabler
;
886 struct user_event_mm
*user_mm
;
887 unsigned long uaddr
= (unsigned long)reg
->enable_addr
;
890 user_mm
= current_user_event_mm();
895 enabler
= kzalloc(sizeof(*enabler
), GFP_KERNEL_ACCOUNT
);
900 enabler
->event
= user
;
901 enabler
->addr
= uaddr
;
902 enabler
->values
= reg
->enable_bit
;
904 #if BITS_PER_LONG >= 64
905 if (reg
->enable_size
== 4)
906 set_bit(ENABLE_VAL_32_ON_64_BIT
, ENABLE_BITOPS(enabler
));
910 /* Prevents state changes from racing with new enablers */
911 mutex_lock(&event_mutex
);
913 /* Attempt to reflect the current state within the process */
914 mmap_read_lock(user_mm
->mm
);
915 *write_result
= user_event_enabler_write(user_mm
, enabler
, false,
917 mmap_read_unlock(user_mm
->mm
);
	/*
	 * If the write works, then we will track the enabler. A ref to the
	 * underlying user_event is held by the enabler to prevent it going
	 * away while the enabler is still in use by a process. The ref is
	 * removed when the enabler is destroyed. This means an event cannot
	 * be forcefully deleted from the system until all tasks using it
	 * exit or run exec(), which includes forks and clones.
	 */
927 if (!*write_result
) {
928 user_event_get(user
);
929 list_add_rcu(&enabler
->mm_enablers_link
, &user_mm
->enablers
);
932 mutex_unlock(&event_mutex
);
935 /* Attempt to fault-in and retry if it worked */
936 if (!user_event_mm_fault_in(user_mm
, uaddr
, attempt
))
943 user_event_mm_put(user_mm
);
948 static __always_inline __must_check
949 bool user_event_last_ref(struct user_event
*user
)
953 if (user
->reg_flags
& USER_EVENT_REG_PERSIST
)
956 return refcount_read(&user
->refcnt
) == last
;
959 static __always_inline __must_check
960 size_t copy_nofault(void *addr
, size_t bytes
, struct iov_iter
*i
)
966 ret
= copy_from_iter_nocache(addr
, bytes
, i
);
973 static struct list_head
*user_event_get_fields(struct trace_event_call
*call
)
975 struct user_event
*user
= (struct user_event
*)call
->data
;
977 return &user
->fields
;
/*
 * Parses a register command for user_events
 * Format: event_name[:FLAG1[,FLAG2...]] [field1[;field2...]]
 *
 * Example event named 'test' with a 20 char 'msg' field with an unsigned int
 * 'id' field after:
 * test char[20] msg;unsigned int id
 *
 * NOTE: Offsets are from the user data perspective, they are not from the
 * trace_entry/buffer perspective. We automatically add the common properties
 * sizes to the offset for the user.
 *
 * Upon success user_event has its ref count increased by 1.
 */
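
/*
 * Illustrative walk-through (editorial addition, not from the original
 * source): for the raw command "test char[20] msg;unsigned int id",
 * user_event_parse_cmd() splits at the first space, giving the name "test"
 * and the args "char[20] msg;unsigned int id"; a ':' suffix on the name,
 * if present, would be split off as the (currently unused) text flags.
 */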
994 static int user_event_parse_cmd(struct user_event_group
*group
,
995 char *raw_command
, struct user_event
**newuser
,
998 char *name
= raw_command
;
999 char *args
= strpbrk(name
, " ");
1005 flags
= strpbrk(name
, ":");
1010 return user_event_parse(group
, name
, args
, flags
, newuser
, reg_flags
);
1013 static int user_field_array_size(const char *type
)
1015 const char *start
= strchr(type
, '[');
1023 if (strscpy(val
, start
+ 1, sizeof(val
)) <= 0)
1026 bracket
= strchr(val
, ']');
1033 if (kstrtouint(val
, 0, &size
))
1036 if (size
> MAX_FIELD_ARRAY_SIZE
)
1042 static int user_field_size(const char *type
)
	/* long is not allowed from a user, since it's ambiguous in size */
1045 if (strcmp(type
, "s64") == 0)
1047 if (strcmp(type
, "u64") == 0)
1049 if (strcmp(type
, "s32") == 0)
1051 if (strcmp(type
, "u32") == 0)
1053 if (strcmp(type
, "int") == 0)
1055 if (strcmp(type
, "unsigned int") == 0)
1056 return sizeof(unsigned int);
1057 if (strcmp(type
, "s16") == 0)
1059 if (strcmp(type
, "u16") == 0)
1061 if (strcmp(type
, "short") == 0)
1062 return sizeof(short);
1063 if (strcmp(type
, "unsigned short") == 0)
1064 return sizeof(unsigned short);
1065 if (strcmp(type
, "s8") == 0)
1067 if (strcmp(type
, "u8") == 0)
1069 if (strcmp(type
, "char") == 0)
1070 return sizeof(char);
1071 if (strcmp(type
, "unsigned char") == 0)
1072 return sizeof(unsigned char);
1073 if (str_has_prefix(type
, "char["))
1074 return user_field_array_size(type
);
1075 if (str_has_prefix(type
, "unsigned char["))
1076 return user_field_array_size(type
);
1077 if (str_has_prefix(type
, "__data_loc "))
1079 if (str_has_prefix(type
, "__rel_loc "))
	/* Unknown basic type, error */
1086 static void user_event_destroy_validators(struct user_event
*user
)
1088 struct user_event_validator
*validator
, *next
;
1089 struct list_head
*head
= &user
->validators
;
1091 list_for_each_entry_safe(validator
, next
, head
, user_event_link
) {
1092 list_del(&validator
->user_event_link
);
1097 static void user_event_destroy_fields(struct user_event
*user
)
1099 struct ftrace_event_field
*field
, *next
;
1100 struct list_head
*head
= &user
->fields
;
1102 list_for_each_entry_safe(field
, next
, head
, link
) {
1103 list_del(&field
->link
);
1108 static int user_event_add_field(struct user_event
*user
, const char *type
,
1109 const char *name
, int offset
, int size
,
1110 int is_signed
, int filter_type
)
1112 struct user_event_validator
*validator
;
1113 struct ftrace_event_field
*field
;
1114 int validator_flags
= 0;
1116 field
= kmalloc(sizeof(*field
), GFP_KERNEL_ACCOUNT
);
1121 if (str_has_prefix(type
, "__data_loc "))
1124 if (str_has_prefix(type
, "__rel_loc ")) {
1125 validator_flags
|= VALIDATOR_REL
;
1132 if (strstr(type
, "char") != NULL
)
1133 validator_flags
|= VALIDATOR_ENSURE_NULL
;
1135 validator
= kmalloc(sizeof(*validator
), GFP_KERNEL_ACCOUNT
);
1142 validator
->flags
= validator_flags
;
1143 validator
->offset
= offset
;
1145 /* Want sequential access when validating */
1146 list_add_tail(&validator
->user_event_link
, &user
->validators
);
1151 field
->offset
= offset
;
1153 field
->is_signed
= is_signed
;
1154 field
->filter_type
= filter_type
;
1156 if (filter_type
== FILTER_OTHER
)
1157 field
->filter_type
= filter_assign_type(type
);
1159 list_add(&field
->link
, &user
->fields
);
	/*
	 * Minimum size required from user writes; this does not include
	 * the size of trace_entry (common fields).
	 */
1165 user
->min_size
= (offset
+ size
) - sizeof(struct trace_entry
);
/*
 * Parses the values of a field within the description
 * Format: type name [size]
 */
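
/*
 * Illustrative examples (editorial addition, not from the original source):
 *   "u32 count"             - a 4-byte unsigned field named count
 *   "char[20] msg"          - a fixed 20-byte character array
 *   "struct mystruct st 32" - an opaque struct field with an explicit
 *                             size of 32 bytes
 */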
1174 static int user_event_parse_field(char *field
, struct user_event
*user
,
1177 char *part
, *type
, *name
;
1178 u32 depth
= 0, saved_offset
= *offset
;
1179 int len
, size
= -EINVAL
;
1180 bool is_struct
= false;
1182 field
= skip_spaces(field
);
1187 /* Handle types that have a space within */
1188 len
= str_has_prefix(field
, "unsigned ");
1192 len
= str_has_prefix(field
, "struct ");
1198 len
= str_has_prefix(field
, "__data_loc unsigned ");
1202 len
= str_has_prefix(field
, "__data_loc ");
1206 len
= str_has_prefix(field
, "__rel_loc unsigned ");
1210 len
= str_has_prefix(field
, "__rel_loc ");
1217 field
= strpbrk(field
+ len
, " ");
1227 while ((part
= strsep(&field
, " ")) != NULL
) {
1229 case FIELD_DEPTH_TYPE
:
1232 case FIELD_DEPTH_NAME
:
1235 case FIELD_DEPTH_SIZE
:
1239 if (kstrtou32(part
, 10, &size
))
1247 if (depth
< FIELD_DEPTH_SIZE
|| !name
)
1250 if (depth
== FIELD_DEPTH_SIZE
)
1251 size
= user_field_size(type
);
1259 *offset
= saved_offset
+ size
;
1261 return user_event_add_field(user
, type
, name
, saved_offset
, size
,
1262 type
[0] != 'u', FILTER_OTHER
);
1265 static int user_event_parse_fields(struct user_event
*user
, char *args
)
1268 u32 offset
= sizeof(struct trace_entry
);
1274 while ((field
= strsep(&args
, ";")) != NULL
) {
1275 ret
= user_event_parse_field(field
, user
, &offset
);
1284 static struct trace_event_fields user_event_fields_array
[1];
1286 static const char *user_field_format(const char *type
)
1288 if (strcmp(type
, "s64") == 0)
1290 if (strcmp(type
, "u64") == 0)
1292 if (strcmp(type
, "s32") == 0)
1294 if (strcmp(type
, "u32") == 0)
1296 if (strcmp(type
, "int") == 0)
1298 if (strcmp(type
, "unsigned int") == 0)
1300 if (strcmp(type
, "s16") == 0)
1302 if (strcmp(type
, "u16") == 0)
1304 if (strcmp(type
, "short") == 0)
1306 if (strcmp(type
, "unsigned short") == 0)
1308 if (strcmp(type
, "s8") == 0)
1310 if (strcmp(type
, "u8") == 0)
1312 if (strcmp(type
, "char") == 0)
1314 if (strcmp(type
, "unsigned char") == 0)
1316 if (strstr(type
, "char[") != NULL
)
	/* Unknown, likely a struct; allow it and treat as 64-bit */
1323 static bool user_field_is_dyn_string(const char *type
, const char **str_func
)
1325 if (str_has_prefix(type
, "__data_loc ")) {
1326 *str_func
= "__get_str";
1330 if (str_has_prefix(type
, "__rel_loc ")) {
1331 *str_func
= "__get_rel_str";
1337 return strstr(type
, "char") != NULL
;
1340 #define LEN_OR_ZERO (len ? len - pos : 0)
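
/*
 * Editorial note (not from the original source): LEN_OR_ZERO lets the
 * formatting helpers below be called twice with the same logic, a minimal
 * sketch of the pattern being:
 *
 *   len = user_event_set_print_fmt(user, NULL, 0);   measure only
 *   buf = kmalloc(len, GFP_KERNEL_ACCOUNT);          allocate exact size
 *   user_event_set_print_fmt(user, buf, len);        fill for real
 *
 * When len is 0 every snprintf() writes nothing and only the would-be
 * length is accumulated in pos.
 */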
1341 static int user_dyn_field_set_string(int argc
, const char **argv
, int *iout
,
1342 char *buf
, int len
, bool *colon
)
1344 int pos
= 0, i
= *iout
;
1348 for (; i
< argc
; ++i
) {
1350 pos
+= snprintf(buf
+ pos
, LEN_OR_ZERO
, " ");
1352 pos
+= snprintf(buf
+ pos
, LEN_OR_ZERO
, "%s", argv
[i
]);
1354 if (strchr(argv
[i
], ';')) {
1361 /* Actual set, advance i */
1368 static int user_field_set_string(struct ftrace_event_field
*field
,
1369 char *buf
, int len
, bool colon
)
1373 pos
+= snprintf(buf
+ pos
, LEN_OR_ZERO
, "%s", field
->type
);
1374 pos
+= snprintf(buf
+ pos
, LEN_OR_ZERO
, " ");
1375 pos
+= snprintf(buf
+ pos
, LEN_OR_ZERO
, "%s", field
->name
);
1377 if (str_has_prefix(field
->type
, "struct "))
1378 pos
+= snprintf(buf
+ pos
, LEN_OR_ZERO
, " %d", field
->size
);
1381 pos
+= snprintf(buf
+ pos
, LEN_OR_ZERO
, ";");
1386 static int user_event_set_print_fmt(struct user_event
*user
, char *buf
, int len
)
1388 struct ftrace_event_field
*field
;
1389 struct list_head
*head
= &user
->fields
;
1390 int pos
= 0, depth
= 0;
1391 const char *str_func
;
1393 pos
+= snprintf(buf
+ pos
, LEN_OR_ZERO
, "\"");
1395 list_for_each_entry_reverse(field
, head
, link
) {
1397 pos
+= snprintf(buf
+ pos
, LEN_OR_ZERO
, " ");
1399 pos
+= snprintf(buf
+ pos
, LEN_OR_ZERO
, "%s=%s",
1400 field
->name
, user_field_format(field
->type
));
1405 pos
+= snprintf(buf
+ pos
, LEN_OR_ZERO
, "\"");
1407 list_for_each_entry_reverse(field
, head
, link
) {
1408 if (user_field_is_dyn_string(field
->type
, &str_func
))
1409 pos
+= snprintf(buf
+ pos
, LEN_OR_ZERO
,
1410 ", %s(%s)", str_func
, field
->name
);
1412 pos
+= snprintf(buf
+ pos
, LEN_OR_ZERO
,
1413 ", REC->%s", field
->name
);
1420 static int user_event_create_print_fmt(struct user_event
*user
)
1425 len
= user_event_set_print_fmt(user
, NULL
, 0);
1427 print_fmt
= kmalloc(len
, GFP_KERNEL_ACCOUNT
);
1432 user_event_set_print_fmt(user
, print_fmt
, len
);
1434 user
->call
.print_fmt
= print_fmt
;
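
/*
 * Illustrative result (editorial addition, not from the original source):
 * for an event registered as "test char[20] msg;unsigned int id", the
 * generated print_fmt is roughly:
 *
 *   "msg=%s id=%u", REC->msg, REC->id
 *
 * Dynamic string fields (__data_loc/__rel_loc char arrays) would instead be
 * referenced through __get_str()/__get_rel_str().
 */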
1439 static enum print_line_t
user_event_print_trace(struct trace_iterator
*iter
,
1441 struct trace_event
*event
)
1443 return print_event_fields(iter
, event
);
1446 static struct trace_event_functions user_event_funcs
= {
1447 .trace
= user_event_print_trace
,
1450 static int user_event_set_call_visible(struct user_event
*user
, bool visible
)
1453 const struct cred
*old_cred
;
1456 cred
= prepare_creds();
1462 * While by default tracefs is locked down, systems can be configured
1463 * to allow user_event files to be less locked down. The extreme case
1464 * being "other" has read/write access to user_events_data/status.
1466 * When not locked down, processes may not have permissions to
1467 * add/remove calls themselves to tracefs. We need to temporarily
1468 * switch to root file permission to allow for this scenario.
1470 cred
->fsuid
= GLOBAL_ROOT_UID
;
1472 old_cred
= override_creds(cred
);
1475 ret
= trace_add_event_call(&user
->call
);
1477 ret
= trace_remove_event_call(&user
->call
);
1479 revert_creds(old_cred
);
1485 static int destroy_user_event(struct user_event
*user
)
1489 lockdep_assert_held(&event_mutex
);
1491 /* Must destroy fields before call removal */
1492 user_event_destroy_fields(user
);
1494 ret
= user_event_set_call_visible(user
, false);
1499 dyn_event_remove(&user
->devent
);
1500 hash_del(&user
->node
);
1502 user_event_destroy_validators(user
);
1504 /* If we have different names, both must be freed */
1505 if (EVENT_NAME(user
) != EVENT_TP_NAME(user
))
1506 kfree(EVENT_TP_NAME(user
));
1508 kfree(user
->call
.print_fmt
);
1509 kfree(EVENT_NAME(user
));
1512 if (current_user_events
> 0)
1513 current_user_events
--;
1515 pr_alert("BUG: Bad current_user_events\n");
1520 static struct user_event
*find_user_event(struct user_event_group
*group
,
1521 char *name
, int argc
, const char **argv
,
1522 u32 flags
, u32
*outkey
)
1524 struct user_event
*user
;
1525 u32 key
= user_event_key(name
);
1529 hash_for_each_possible(group
->register_table
, user
, node
, key
) {
1531 * Single-format events shouldn't return multi-format
1532 * events. Callers expect the underlying tracepoint to match
1533 * the name exactly in these cases. Only check like-formats.
1535 if (EVENT_MULTI_FORMAT(flags
) != EVENT_MULTI_FORMAT(user
->reg_flags
))
1538 if (strcmp(EVENT_NAME(user
), name
))
1541 if (user_fields_match(user
, argc
, argv
))
1542 return user_event_get(user
);
1544 /* Scan others if this is a multi-format event */
1545 if (EVENT_MULTI_FORMAT(flags
))
1548 return ERR_PTR(-EADDRINUSE
);
1554 static int user_event_validate(struct user_event
*user
, void *data
, int len
)
1556 struct list_head
*head
= &user
->validators
;
1557 struct user_event_validator
*validator
;
1558 void *pos
, *end
= data
+ len
;
1559 u32 loc
, offset
, size
;
1561 list_for_each_entry(validator
, head
, user_event_link
) {
1562 pos
= data
+ validator
->offset
;
1564 /* Already done min_size check, no bounds check here */
1566 offset
= loc
& 0xffff;
1569 if (likely(validator
->flags
& VALIDATOR_REL
))
1570 pos
+= offset
+ sizeof(loc
);
1572 pos
= data
+ offset
;
1576 if (unlikely(pos
> end
))
1579 if (likely(validator
->flags
& VALIDATOR_ENSURE_NULL
))
1580 if (unlikely(*(char *)(pos
- 1) != '\0'))
1588 * Writes the user supplied payload out to a trace file.
1590 static void user_event_ftrace(struct user_event
*user
, struct iov_iter
*i
,
1591 void *tpdata
, bool *faulted
)
1593 struct trace_event_file
*file
;
1594 struct trace_entry
*entry
;
1595 struct trace_event_buffer event_buffer
;
1596 size_t size
= sizeof(*entry
) + i
->count
;
1598 file
= (struct trace_event_file
*)tpdata
;
1601 !(file
->flags
& EVENT_FILE_FL_ENABLED
) ||
1602 trace_trigger_soft_disabled(file
))
1605 /* Allocates and fills trace_entry, + 1 of this is data payload */
1606 entry
= trace_event_buffer_reserve(&event_buffer
, file
, size
);
1608 if (unlikely(!entry
))
1611 if (unlikely(i
->count
!= 0 && !copy_nofault(entry
+ 1, i
->count
, i
)))
1614 if (!list_empty(&user
->validators
) &&
1615 unlikely(user_event_validate(user
, entry
, size
)))
1618 trace_event_buffer_commit(&event_buffer
);
1623 __trace_event_discard_commit(event_buffer
.buffer
,
1624 event_buffer
.event
);
1627 #ifdef CONFIG_PERF_EVENTS
1629 * Writes the user supplied payload out to perf ring buffer.
1631 static void user_event_perf(struct user_event
*user
, struct iov_iter
*i
,
1632 void *tpdata
, bool *faulted
)
1634 struct hlist_head
*perf_head
;
1636 perf_head
= this_cpu_ptr(user
->call
.perf_events
);
1638 if (perf_head
&& !hlist_empty(perf_head
)) {
1639 struct trace_entry
*perf_entry
;
1640 struct pt_regs
*regs
;
1641 size_t size
= sizeof(*perf_entry
) + i
->count
;
1644 perf_entry
= perf_trace_buf_alloc(ALIGN(size
, 8),
1647 if (unlikely(!perf_entry
))
1650 perf_fetch_caller_regs(regs
);
1652 if (unlikely(i
->count
!= 0 && !copy_nofault(perf_entry
+ 1, i
->count
, i
)))
1655 if (!list_empty(&user
->validators
) &&
1656 unlikely(user_event_validate(user
, perf_entry
, size
)))
1659 perf_trace_buf_submit(perf_entry
, size
, context
,
1660 user
->call
.event
.type
, 1, regs
,
1666 perf_swevent_put_recursion_context(context
);
1672 * Update the enabled bit among all user processes.
1674 static void update_enable_bit_for(struct user_event
*user
)
1676 struct tracepoint
*tp
= &user
->tracepoint
;
1679 if (atomic_read(&tp
->key
.enabled
) > 0) {
1680 struct tracepoint_func
*probe_func_ptr
;
1681 user_event_func_t probe_func
;
1683 rcu_read_lock_sched();
1685 probe_func_ptr
= rcu_dereference_sched(tp
->funcs
);
1687 if (probe_func_ptr
) {
1689 probe_func
= probe_func_ptr
->func
;
1691 if (probe_func
== user_event_ftrace
)
1692 status
|= EVENT_STATUS_FTRACE
;
1693 #ifdef CONFIG_PERF_EVENTS
1694 else if (probe_func
== user_event_perf
)
1695 status
|= EVENT_STATUS_PERF
;
1698 status
|= EVENT_STATUS_OTHER
;
1699 } while ((++probe_func_ptr
)->func
);
1702 rcu_read_unlock_sched();
1705 user
->status
= status
;
1707 user_event_enabler_update(user
);
1711 * Register callback for our events from tracing sub-systems.
1713 static int user_event_reg(struct trace_event_call
*call
,
1714 enum trace_reg type
,
1717 struct user_event
*user
= (struct user_event
*)call
->data
;
1724 case TRACE_REG_REGISTER
:
1725 ret
= tracepoint_probe_register(call
->tp
,
1732 case TRACE_REG_UNREGISTER
:
1733 tracepoint_probe_unregister(call
->tp
,
1738 #ifdef CONFIG_PERF_EVENTS
1739 case TRACE_REG_PERF_REGISTER
:
1740 ret
= tracepoint_probe_register(call
->tp
,
1741 call
->class->perf_probe
,
1747 case TRACE_REG_PERF_UNREGISTER
:
1748 tracepoint_probe_unregister(call
->tp
,
1749 call
->class->perf_probe
,
1753 case TRACE_REG_PERF_OPEN
:
1754 case TRACE_REG_PERF_CLOSE
:
1755 case TRACE_REG_PERF_ADD
:
1756 case TRACE_REG_PERF_DEL
:
1763 user_event_get(user
);
1764 update_enable_bit_for(user
);
1767 update_enable_bit_for(user
);
1768 user_event_put(user
, true);
1772 static int user_event_create(const char *raw_command
)
1774 struct user_event_group
*group
;
1775 struct user_event
*user
;
1779 if (!str_has_prefix(raw_command
, USER_EVENTS_PREFIX
))
1782 raw_command
+= USER_EVENTS_PREFIX_LEN
;
1783 raw_command
= skip_spaces(raw_command
);
1785 name
= kstrdup(raw_command
, GFP_KERNEL_ACCOUNT
);
1790 group
= current_user_event_group();
1797 mutex_lock(&group
->reg_mutex
);
1799 /* Dyn events persist, otherwise they would cleanup immediately */
1800 ret
= user_event_parse_cmd(group
, name
, &user
, USER_EVENT_REG_PERSIST
);
1803 user_event_put(user
, false);
1805 mutex_unlock(&group
->reg_mutex
);
1813 static int user_event_show(struct seq_file
*m
, struct dyn_event
*ev
)
1815 struct user_event
*user
= container_of(ev
, struct user_event
, devent
);
1816 struct ftrace_event_field
*field
;
1817 struct list_head
*head
;
1820 seq_printf(m
, "%s%s", USER_EVENTS_PREFIX
, EVENT_NAME(user
));
1822 head
= trace_get_fields(&user
->call
);
1824 list_for_each_entry_reverse(field
, head
, link
) {
1830 seq_printf(m
, "%s %s", field
->type
, field
->name
);
1832 if (str_has_prefix(field
->type
, "struct "))
1833 seq_printf(m
, " %d", field
->size
);
1843 static bool user_event_is_busy(struct dyn_event
*ev
)
1845 struct user_event
*user
= container_of(ev
, struct user_event
, devent
);
1847 return !user_event_last_ref(user
);
1850 static int user_event_free(struct dyn_event
*ev
)
1852 struct user_event
*user
= container_of(ev
, struct user_event
, devent
);
1854 if (!user_event_last_ref(user
))
1857 if (!user_event_capable(user
->reg_flags
))
1860 return destroy_user_event(user
);
1863 static bool user_field_match(struct ftrace_event_field
*field
, int argc
,
1864 const char **argv
, int *iout
)
1866 char *field_name
= NULL
, *dyn_field_name
= NULL
;
1867 bool colon
= false, match
= false;
1873 dyn_len
= user_dyn_field_set_string(argc
, argv
, iout
, dyn_field_name
,
1876 len
= user_field_set_string(field
, field_name
, 0, colon
);
1881 dyn_field_name
= kmalloc(dyn_len
, GFP_KERNEL
);
1882 field_name
= kmalloc(len
, GFP_KERNEL
);
1884 if (!dyn_field_name
|| !field_name
)
1887 user_dyn_field_set_string(argc
, argv
, iout
, dyn_field_name
,
1890 user_field_set_string(field
, field_name
, len
, colon
);
1892 match
= strcmp(dyn_field_name
, field_name
) == 0;
1894 kfree(dyn_field_name
);
1900 static bool user_fields_match(struct user_event
*user
, int argc
,
1903 struct ftrace_event_field
*field
;
1904 struct list_head
*head
= &user
->fields
;
1908 return list_empty(head
);
1910 list_for_each_entry_reverse(field
, head
, link
) {
1911 if (!user_field_match(field
, argc
, argv
, &i
))
1921 static bool user_event_match(const char *system
, const char *event
,
1922 int argc
, const char **argv
, struct dyn_event
*ev
)
1924 struct user_event
*user
= container_of(ev
, struct user_event
, devent
);
1927 match
= strcmp(EVENT_NAME(user
), event
) == 0;
1929 if (match
&& system
) {
1930 match
= strcmp(system
, user
->group
->system_name
) == 0 ||
1931 strcmp(system
, user
->group
->system_multi_name
) == 0;
1935 match
= user_fields_match(user
, argc
, argv
);
1940 static struct dyn_event_operations user_event_dops
= {
1941 .create
= user_event_create
,
1942 .show
= user_event_show
,
1943 .is_busy
= user_event_is_busy
,
1944 .free
= user_event_free
,
1945 .match
= user_event_match
,
1948 static int user_event_trace_register(struct user_event
*user
)
1952 ret
= register_trace_event(&user
->call
.event
);
1957 ret
= user_event_set_call_visible(user
, true);
1960 unregister_trace_event(&user
->call
.event
);
1965 static int user_event_set_tp_name(struct user_event
*user
)
1967 lockdep_assert_held(&user
->group
->reg_mutex
);
1969 if (EVENT_MULTI_FORMAT(user
->reg_flags
)) {
1972 multi_name
= kasprintf(GFP_KERNEL_ACCOUNT
, "%s.%llx",
1973 user
->reg_name
, user
->group
->multi_id
);
1978 user
->call
.name
= multi_name
;
1979 user
->tracepoint
.name
= multi_name
;
1981 /* Inc to ensure unique multi-event name next time */
1982 user
->group
->multi_id
++;
1984 /* Non Multi-format uses register name */
1985 user
->call
.name
= user
->reg_name
;
1986 user
->tracepoint
.name
= user
->reg_name
;
1993 * Counts how many ';' without a trailing space are in the args.
1995 static int count_semis_no_space(char *args
)
1999 while ((args
= strchr(args
, ';'))) {
2002 if (!isspace(*args
))
2010 * Copies the arguments while ensuring all ';' have a trailing space.
2012 static char *insert_space_after_semis(char *args
, int count
)
2017 len
= strlen(args
) + count
;
2018 fixed
= kmalloc(len
+ 1, GFP_KERNEL
);
2025 /* Insert a space after ';' if there is no trailing space. */
2029 if (*pos
++ == ';' && !isspace(*args
))
2038 static char **user_event_argv_split(char *args
, int *argc
)
2044 /* Count how many ';' without a trailing space */
2045 count
= count_semis_no_space(args
);
2047 /* No fixup is required */
2049 return argv_split(GFP_KERNEL
, args
, argc
);
2051 /* We must fixup 'field;field' to 'field; field' */
2052 fixed
= insert_space_after_semis(args
, count
);
2057 /* We do a normal split afterwards */
2058 split
= argv_split(GFP_KERNEL
, fixed
, argc
);
2060 /* We can free since argv_split makes a copy */
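	/*
	 * Illustrative example (editorial addition, not from the original
	 * source): "char[20] msg;unsigned int id" contains one ';' without a
	 * trailing space, so it is first rewritten to
	 * "char[20] msg; unsigned int id" and argv_split() then yields
	 * { "char[20]", "msg;", "unsigned", "int", "id" }.
	 */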
2067 * Parses the event name, arguments and flags then registers if successful.
2068 * The name buffer lifetime is owned by this method for success cases only.
2069 * Upon success the returned user_event has its ref count increased by 1.
2071 static int user_event_parse(struct user_event_group
*group
, char *name
,
2072 char *args
, char *flags
,
2073 struct user_event
**newuser
, int reg_flags
)
2075 struct user_event
*user
;
2081 /* Currently don't support any text based flags */
2085 if (!user_event_capable(reg_flags
))
2089 argv
= user_event_argv_split(args
, &argc
);
2095 /* Prevent dyn_event from racing */
2096 mutex_lock(&event_mutex
);
2097 user
= find_user_event(group
, name
, argc
, (const char **)argv
,
2099 mutex_unlock(&event_mutex
);
2105 return PTR_ERR(user
);
2110 * Name is allocated by caller, free it since it already exists.
2111 * Caller only worries about failure cases for freeing.
2118 user
= kzalloc(sizeof(*user
), GFP_KERNEL_ACCOUNT
);
2123 INIT_LIST_HEAD(&user
->class.fields
);
2124 INIT_LIST_HEAD(&user
->fields
);
2125 INIT_LIST_HEAD(&user
->validators
);
2127 user
->group
= group
;
2128 user
->reg_name
= name
;
2129 user
->reg_flags
= reg_flags
;
2131 ret
= user_event_set_tp_name(user
);
2136 ret
= user_event_parse_fields(user
, args
);
2141 ret
= user_event_create_print_fmt(user
);
2146 user
->call
.data
= user
;
2147 user
->call
.class = &user
->class;
2148 user
->call
.flags
= TRACE_EVENT_FL_TRACEPOINT
;
2149 user
->call
.tp
= &user
->tracepoint
;
2150 user
->call
.event
.funcs
= &user_event_funcs
;
2152 if (EVENT_MULTI_FORMAT(user
->reg_flags
))
2153 user
->class.system
= group
->system_multi_name
;
2155 user
->class.system
= group
->system_name
;
2157 user
->class.fields_array
= user_event_fields_array
;
2158 user
->class.get_fields
= user_event_get_fields
;
2159 user
->class.reg
= user_event_reg
;
2160 user
->class.probe
= user_event_ftrace
;
2161 #ifdef CONFIG_PERF_EVENTS
2162 user
->class.perf_probe
= user_event_perf
;
2165 mutex_lock(&event_mutex
);
2167 if (current_user_events
>= max_user_events
) {
2172 ret
= user_event_trace_register(user
);
2177 if (user
->reg_flags
& USER_EVENT_REG_PERSIST
) {
2178 /* Ensure we track self ref and caller ref (2) */
2179 refcount_set(&user
->refcnt
, 2);
2181 /* Ensure we track only caller ref (1) */
2182 refcount_set(&user
->refcnt
, 1);
2185 dyn_event_init(&user
->devent
, &user_event_dops
);
2186 dyn_event_add(&user
->devent
, &user
->call
);
2187 hash_add(group
->register_table
, &user
->node
, key
);
2188 current_user_events
++;
2190 mutex_unlock(&event_mutex
);
2195 mutex_unlock(&event_mutex
);
2197 user_event_destroy_fields(user
);
2198 user_event_destroy_validators(user
);
2199 kfree(user
->call
.print_fmt
);
2201 /* Caller frees reg_name on error, but not multi-name */
2202 if (EVENT_NAME(user
) != EVENT_TP_NAME(user
))
2203 kfree(EVENT_TP_NAME(user
));
2210 * Deletes previously created events if they are no longer being used.
2212 static int delete_user_event(struct user_event_group
*group
, char *name
)
2214 struct user_event
*user
;
2215 struct hlist_node
*tmp
;
2216 u32 key
= user_event_key(name
);
2219 /* Attempt to delete all event(s) with the name passed in */
2220 hash_for_each_possible_safe(group
->register_table
, user
, tmp
, node
, key
) {
2221 if (strcmp(EVENT_NAME(user
), name
))
2224 if (!user_event_last_ref(user
))
2227 if (!user_event_capable(user
->reg_flags
))
2230 ret
= destroy_user_event(user
);
2240 * Validates the user payload and writes via iterator.
2242 static ssize_t
user_events_write_core(struct file
*file
, struct iov_iter
*i
)
2244 struct user_event_file_info
*info
= file
->private_data
;
2245 struct user_event_refs
*refs
;
2246 struct user_event
*user
= NULL
;
2247 struct tracepoint
*tp
;
2248 ssize_t ret
= i
->count
;
2251 if (unlikely(copy_from_iter(&idx
, sizeof(idx
), i
) != sizeof(idx
)))
2257 rcu_read_lock_sched();
2259 refs
= rcu_dereference_sched(info
->refs
);
2262 * The refs->events array is protected by RCU, and new items may be
2263 * added. But the user retrieved from indexing into the events array
2264 * shall be immutable while the file is opened.
2266 if (likely(refs
&& idx
< refs
->count
))
2267 user
= refs
->events
[idx
];
2269 rcu_read_unlock_sched();
2271 if (unlikely(user
== NULL
))
2274 if (unlikely(i
->count
< user
->min_size
))
2277 tp
= &user
->tracepoint
;
2280 * It's possible key.enabled disables after this check, however
2281 * we don't mind if a few events are included in this condition.
2283 if (likely(atomic_read(&tp
->key
.enabled
) > 0)) {
2284 struct tracepoint_func
*probe_func_ptr
;
2285 user_event_func_t probe_func
;
2286 struct iov_iter copy
;
2290 if (unlikely(fault_in_iov_iter_readable(i
, i
->count
)))
2295 rcu_read_lock_sched();
2297 probe_func_ptr
= rcu_dereference_sched(tp
->funcs
);
2299 if (probe_func_ptr
) {
2302 probe_func
= probe_func_ptr
->func
;
2303 tpdata
= probe_func_ptr
->data
;
2304 probe_func(user
, ©
, tpdata
, &faulted
);
2305 } while ((++probe_func_ptr
)->func
);
2308 rcu_read_unlock_sched();
2310 if (unlikely(faulted
))
2318 static int user_events_open(struct inode
*node
, struct file
*file
)
2320 struct user_event_group
*group
;
2321 struct user_event_file_info
*info
;
2323 group
= current_user_event_group();
2328 info
= kzalloc(sizeof(*info
), GFP_KERNEL_ACCOUNT
);
2333 info
->group
= group
;
2335 file
->private_data
= info
;
2340 static ssize_t
user_events_write(struct file
*file
, const char __user
*ubuf
,
2341 size_t count
, loff_t
*ppos
)
2345 if (unlikely(*ppos
!= 0))
2348 if (unlikely(import_ubuf(ITER_SOURCE
, (char __user
*)ubuf
, count
, &i
)))
2351 return user_events_write_core(file
, &i
);
2354 static ssize_t
user_events_write_iter(struct kiocb
*kp
, struct iov_iter
*i
)
2356 return user_events_write_core(kp
->ki_filp
, i
);
2359 static int user_events_ref_add(struct user_event_file_info
*info
,
2360 struct user_event
*user
)
2362 struct user_event_group
*group
= info
->group
;
2363 struct user_event_refs
*refs
, *new_refs
;
2364 int i
, size
, count
= 0;
2366 refs
= rcu_dereference_protected(info
->refs
,
2367 lockdep_is_held(&group
->reg_mutex
));
2370 count
= refs
->count
;
2372 for (i
= 0; i
< count
; ++i
)
2373 if (refs
->events
[i
] == user
)
2377 size
= struct_size(refs
, events
, count
+ 1);
2379 new_refs
= kzalloc(size
, GFP_KERNEL_ACCOUNT
);
2384 new_refs
->count
= count
+ 1;
2386 for (i
= 0; i
< count
; ++i
)
2387 new_refs
->events
[i
] = refs
->events
[i
];
2389 new_refs
->events
[i
] = user_event_get(user
);
2391 rcu_assign_pointer(info
->refs
, new_refs
);
2394 kfree_rcu(refs
, rcu
);
2399 static long user_reg_get(struct user_reg __user
*ureg
, struct user_reg
*kreg
)
2404 ret
= get_user(size
, &ureg
->size
);
2409 if (size
> PAGE_SIZE
)
2412 if (size
< offsetofend(struct user_reg
, write_index
))
2415 ret
= copy_struct_from_user(kreg
, sizeof(*kreg
), ureg
, size
);
2420 /* Ensure only valid flags */
2421 if (kreg
->flags
& ~(USER_EVENT_REG_MAX
-1))
2424 /* Ensure supported size */
2425 switch (kreg
->enable_size
) {
2429 #if BITS_PER_LONG >= 64
2438 /* Ensure natural alignment */
2439 if (kreg
->enable_addr
% kreg
->enable_size
)
2442 /* Ensure bit range for size */
2443 if (kreg
->enable_bit
> (kreg
->enable_size
* BITS_PER_BYTE
) - 1)
2446 /* Ensure accessible */
2447 if (!access_ok((const void __user
*)(uintptr_t)kreg
->enable_addr
,
2457 * Registers a user_event on behalf of a user process.
2459 static long user_events_ioctl_reg(struct user_event_file_info
*info
,
2462 struct user_reg __user
*ureg
= (struct user_reg __user
*)uarg
;
2463 struct user_reg reg
;
2464 struct user_event
*user
;
2465 struct user_event_enabler
*enabler
;
2470 ret
= user_reg_get(ureg
, ®
);
	/*
	 * Prevent users from using the same address and bit multiple times
	 * within the same mm address space. This can cause unexpected behavior
	 * for user processes and is far easier to debug if it is explicitly
	 * an error upon registering.
	 */
2481 if (current_user_event_enabler_exists((unsigned long)reg
.enable_addr
,
2485 name
= strndup_user((const char __user
*)(uintptr_t)reg
.name_args
,
2489 ret
= PTR_ERR(name
);
2493 ret
= user_event_parse_cmd(info
->group
, name
, &user
, reg
.flags
);
2500 ret
= user_events_ref_add(info
, user
);
2502 /* No longer need parse ref, ref_add either worked or not */
2503 user_event_put(user
, false);
2505 /* Positive number is index and valid */
	/*
	 * user_events_ref_add succeeded:
	 * At this point we have a user_event, and its lifetime is bound by the
	 * reference count, not this file. If anything fails, the user_event
	 * still has a reference until the file is released. During release
	 * any remaining references (from user_events_ref_add) are decremented.
	 *
	 * Attempt to create an enabler, which too has a lifetime tied in the
	 * same way for the event. Once the task that caused the enabler to be
	 * created exits or issues exec() then the enablers it has created
	 * will be destroyed and the ref to the event will be decremented.
	 */
2521 enabler
= user_event_enabler_create(®
, user
, &write_result
);
2526 /* Write failed/faulted, give error back to caller */
2528 return write_result
;
2530 put_user((u32
)ret
, &ureg
->write_index
);
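
	/*
	 * Minimal userspace sketch (editorial addition, not from the original
	 * source; it assumes the DIAG_IOCSREG ioctl and struct user_reg layout
	 * from <linux/user_events.h>):
	 *
	 *   struct user_reg reg = {0};
	 *   uint32_t enabled = 0;
	 *
	 *   reg.size = sizeof(reg);
	 *   reg.enable_bit = 31;
	 *   reg.enable_size = sizeof(enabled);
	 *   reg.enable_addr = (uint64_t)(uintptr_t)&enabled;
	 *   reg.name_args = (uint64_t)(uintptr_t)"test char[20] msg;unsigned int id";
	 *
	 *   // fd is an open handle to tracefs user_events_data
	 *   if (ioctl(fd, DIAG_IOCSREG, &reg) == -1)
	 *       return -1;
	 *
	 *   // reg.write_index now holds the index to prefix each write() with;
	 *   // the kernel flips bit 31 of 'enabled' whenever a tracer attaches.
	 *
	 * Note that enable_addr must be naturally aligned for enable_size and
	 * enable_bit must fit within enable_size * 8 bits, as checked in
	 * user_reg_get() above.
	 */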
2536 * Deletes a user_event on behalf of a user process.
2538 static long user_events_ioctl_del(struct user_event_file_info
*info
,
2541 void __user
*ubuf
= (void __user
*)uarg
;
2545 name
= strndup_user(ubuf
, MAX_EVENT_DESC
);
2548 return PTR_ERR(name
);
2550 /* event_mutex prevents dyn_event from racing */
2551 mutex_lock(&event_mutex
);
2552 ret
= delete_user_event(info
->group
, name
);
2553 mutex_unlock(&event_mutex
);
2560 static long user_unreg_get(struct user_unreg __user
*ureg
,
2561 struct user_unreg
*kreg
)
2566 ret
= get_user(size
, &ureg
->size
);
2571 if (size
> PAGE_SIZE
)
2574 if (size
< offsetofend(struct user_unreg
, disable_addr
))
2577 ret
= copy_struct_from_user(kreg
, sizeof(*kreg
), ureg
, size
);
2579 /* Ensure no reserved values, since we don't support any yet */
2580 if (kreg
->__reserved
|| kreg
->__reserved2
)
2586 static int user_event_mm_clear_bit(struct user_event_mm
*user_mm
,
2587 unsigned long uaddr
, unsigned char bit
,
2588 unsigned long flags
)
2590 struct user_event_enabler enabler
;
2594 memset(&enabler
, 0, sizeof(enabler
));
2595 enabler
.addr
= uaddr
;
2596 enabler
.values
= bit
| flags
;
2598 /* Prevents state changes from racing with new enablers */
2599 mutex_lock(&event_mutex
);
2601 /* Force the bit to be cleared, since no event is attached */
2602 mmap_read_lock(user_mm
->mm
);
2603 result
= user_event_enabler_write(user_mm
, &enabler
, false, &attempt
);
2604 mmap_read_unlock(user_mm
->mm
);
2606 mutex_unlock(&event_mutex
);
2609 /* Attempt to fault-in and retry if it worked */
2610 if (!user_event_mm_fault_in(user_mm
, uaddr
, attempt
))
2618 * Unregisters an enablement address/bit within a task/user mm.
2620 static long user_events_ioctl_unreg(unsigned long uarg
)
2622 struct user_unreg __user
*ureg
= (struct user_unreg __user
*)uarg
;
2623 struct user_event_mm
*mm
= current
->user_event_mm
;
2624 struct user_event_enabler
*enabler
, *next
;
2625 struct user_unreg reg
;
2626 unsigned long flags
;
2629 ret
= user_unreg_get(ureg
, ®
);
	/*
	 * The freeing and faulting flags are used to indicate if the enabler
	 * is in use at all. When faulting is set, a page fault is occurring
	 * asynchronously. If freeing is set during that async fault, the
	 * enabler will be destroyed there. If no async fault is happening,
	 * we can destroy it now since we hold the event_mutex during these
	 * checks.
	 */
2647 mutex_lock(&event_mutex
);
2649 list_for_each_entry_safe(enabler
, next
, &mm
->enablers
, mm_enablers_link
) {
2650 if (enabler
->addr
== reg
.disable_addr
&&
2651 ENABLE_BIT(enabler
) == reg
.disable_bit
) {
2652 set_bit(ENABLE_VAL_FREEING_BIT
, ENABLE_BITOPS(enabler
));
2654 /* We must keep compat flags for the clear */
2655 flags
|= enabler
->values
& ENABLE_VAL_COMPAT_MASK
;
2657 if (!test_bit(ENABLE_VAL_FAULTING_BIT
, ENABLE_BITOPS(enabler
)))
2658 user_event_enabler_destroy(enabler
, true);
2660 /* Removed at least one */
2665 mutex_unlock(&event_mutex
);
2667 /* Ensure bit is now cleared for user, regardless of event status */
2669 ret
= user_event_mm_clear_bit(mm
, reg
.disable_addr
,
2670 reg
.disable_bit
, flags
);
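
	/*
	 * Minimal userspace sketch (editorial addition, not from the original
	 * source; it assumes the struct user_unreg layout from
	 * <linux/user_events.h>):
	 *
	 *   struct user_unreg unreg = {0};
	 *
	 *   unreg.size = sizeof(unreg);
	 *   unreg.disable_bit = 31;
	 *   unreg.disable_addr = (uint64_t)(uintptr_t)&enabled;
	 *
	 *   ioctl(fd, DIAG_IOCSUNREG, &unreg);
	 *
	 * This removes the matching enabler(s) for the calling mm and then
	 * clears the bit in user memory via user_event_mm_clear_bit() above,
	 * regardless of the event's enabled status.
	 */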
2676 * Handles the ioctl from user mode to register or alter operations.
2678 static long user_events_ioctl(struct file
*file
, unsigned int cmd
,
2681 struct user_event_file_info
*info
= file
->private_data
;
2682 struct user_event_group
*group
= info
->group
;
2687 mutex_lock(&group
->reg_mutex
);
2688 ret
= user_events_ioctl_reg(info
, uarg
);
2689 mutex_unlock(&group
->reg_mutex
);
2693 mutex_lock(&group
->reg_mutex
);
2694 ret
= user_events_ioctl_del(info
, uarg
);
2695 mutex_unlock(&group
->reg_mutex
);
2698 case DIAG_IOCSUNREG
:
2699 mutex_lock(&group
->reg_mutex
);
2700 ret
= user_events_ioctl_unreg(uarg
);
2701 mutex_unlock(&group
->reg_mutex
);
2709 * Handles the final close of the file from user mode.
2711 static int user_events_release(struct inode
*node
, struct file
*file
)
2713 struct user_event_file_info
*info
= file
->private_data
;
2714 struct user_event_group
*group
;
2715 struct user_event_refs
*refs
;
2721 group
= info
->group
;
2724 * Ensure refs cannot change under any situation by taking the
2725 * register mutex during the final freeing of the references.
2727 mutex_lock(&group
->reg_mutex
);
2735 * The lifetime of refs has reached an end, it's tied to this file.
2736 * The underlying user_events are ref counted, and cannot be freed.
2737 * After this decrement, the user_events may be freed elsewhere.
2739 for (i
= 0; i
< refs
->count
; ++i
)
2740 user_event_put(refs
->events
[i
], false);
2743 file
->private_data
= NULL
;
2745 mutex_unlock(&group
->reg_mutex
);
2753 static const struct file_operations user_data_fops
= {
2754 .open
= user_events_open
,
2755 .write
= user_events_write
,
2756 .write_iter
= user_events_write_iter
,
2757 .unlocked_ioctl
= user_events_ioctl
,
2758 .release
= user_events_release
,
2761 static void *user_seq_start(struct seq_file
*m
, loff_t
*pos
)
2769 static void *user_seq_next(struct seq_file
*m
, void *p
, loff_t
*pos
)
2775 static void user_seq_stop(struct seq_file
*m
, void *p
)
2779 static int user_seq_show(struct seq_file
*m
, void *p
)
2781 struct user_event_group
*group
= m
->private;
2782 struct user_event
*user
;
2784 int i
, active
= 0, busy
= 0;
2789 mutex_lock(&group
->reg_mutex
);
2791 hash_for_each(group
->register_table
, i
, user
, node
) {
2792 status
= user
->status
;
2794 seq_printf(m
, "%s", EVENT_TP_NAME(user
));
2800 seq_puts(m
, " Used by");
2801 if (status
& EVENT_STATUS_FTRACE
)
2802 seq_puts(m
, " ftrace");
2803 if (status
& EVENT_STATUS_PERF
)
2804 seq_puts(m
, " perf");
2805 if (status
& EVENT_STATUS_OTHER
)
2806 seq_puts(m
, " other");
2814 mutex_unlock(&group
->reg_mutex
);
2817 seq_printf(m
, "Active: %d\n", active
);
2818 seq_printf(m
, "Busy: %d\n", busy
);
2823 static const struct seq_operations user_seq_ops
= {
2824 .start
= user_seq_start
,
2825 .next
= user_seq_next
,
2826 .stop
= user_seq_stop
,
2827 .show
= user_seq_show
,
2830 static int user_status_open(struct inode
*node
, struct file
*file
)
2832 struct user_event_group
*group
;
2835 group
= current_user_event_group();
2840 ret
= seq_open(file
, &user_seq_ops
);
2843 /* Chain group to seq_file */
2844 struct seq_file
*m
= file
->private_data
;
2852 static const struct file_operations user_status_fops
= {
2853 .open
= user_status_open
,
2855 .llseek
= seq_lseek
,
2856 .release
= seq_release
,
2860 * Creates a set of tracefs files to allow user mode interactions.
2862 static int create_user_tracefs(void)
2864 struct dentry
*edata
, *emmap
;
2866 edata
= tracefs_create_file("user_events_data", TRACE_MODE_WRITE
,
2867 NULL
, NULL
, &user_data_fops
);
2870 pr_warn("Could not create tracefs 'user_events_data' entry\n");
2874 emmap
= tracefs_create_file("user_events_status", TRACE_MODE_READ
,
2875 NULL
, NULL
, &user_status_fops
);
2878 tracefs_remove(edata
);
		pr_warn("Could not create tracefs 'user_events_status' entry\n");
2888 static int set_max_user_events_sysctl(const struct ctl_table
*table
, int write
,
2889 void *buffer
, size_t *lenp
, loff_t
*ppos
)
2893 mutex_lock(&event_mutex
);
2895 ret
= proc_douintvec(table
, write
, buffer
, lenp
, ppos
);
2897 mutex_unlock(&event_mutex
);
2902 static struct ctl_table user_event_sysctls
[] = {
2904 .procname
= "user_events_max",
2905 .data
= &max_user_events
,
2906 .maxlen
= sizeof(unsigned int),
2908 .proc_handler
= set_max_user_events_sysctl
,
2912 static int __init
trace_events_user_init(void)
2916 fault_cache
= KMEM_CACHE(user_event_enabler_fault
, 0);
2921 init_group
= user_event_group_create();
2924 kmem_cache_destroy(fault_cache
);
2928 ret
= create_user_tracefs();
2931 pr_warn("user_events could not register with tracefs\n");
2932 user_event_group_destroy(init_group
);
2933 kmem_cache_destroy(fault_cache
);
2938 if (dyn_event_register(&user_event_dops
))
2939 pr_warn("user_events could not register with dyn_events\n");
2941 register_sysctl_init("kernel", user_event_sysctls
);
2946 fs_initcall(trace_events_user_init
);