// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2011-2015 PLUMgrid, http://plumgrid.com
 * Copyright (c) 2016 Facebook
 */
#include <linux/kernel.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/bpf.h>
#include <linux/bpf_perf_event.h>
#include <linux/filter.h>
#include <linux/uaccess.h>
#include <linux/ctype.h>
#include <linux/kprobes.h>
#include <linux/syscalls.h>
#include <linux/error-injection.h>

#include "trace_probe.h"
#ifdef CONFIG_MODULES
struct bpf_trace_module {
    struct module *module;
    struct list_head list;
};

static LIST_HEAD(bpf_trace_modules);
static DEFINE_MUTEX(bpf_module_mutex);

static struct bpf_raw_event_map *bpf_get_raw_tracepoint_module(const char *name)
{
    struct bpf_raw_event_map *btp, *ret = NULL;
    struct bpf_trace_module *btm;
    unsigned int i;

    mutex_lock(&bpf_module_mutex);
    list_for_each_entry(btm, &bpf_trace_modules, list) {
        for (i = 0; i < btm->module->num_bpf_raw_events; ++i) {
            btp = &btm->module->bpf_raw_events[i];
            if (!strcmp(btp->tp->name, name)) {
                if (try_module_get(btm->module))
                    ret = btp;
                goto out;
            }
        }
    }
out:
    mutex_unlock(&bpf_module_mutex);
    return ret;
}
#else
static struct bpf_raw_event_map *bpf_get_raw_tracepoint_module(const char *name)
{
    return NULL;
}
#endif /* CONFIG_MODULES */
u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
u64 bpf_get_stack(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
/**
 * trace_call_bpf - invoke BPF program
 * @call: tracepoint event
 * @ctx: opaque context pointer
 *
 * kprobe handlers execute BPF programs via this helper.
 * Can be used from static tracepoints in the future.
 *
 * Return: BPF programs always return an integer which is interpreted by
 * the kprobe handler as:
 * 0 - return from kprobe (event is filtered out)
 * 1 - store kprobe event into ring buffer
 * Other values are reserved and currently alias to 1
 */
unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx)
{
    unsigned int ret;

    if (in_nmi()) /* not supported yet */
        return 1;

    preempt_disable();

    if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) {
        /*
         * since some bpf program is already running on this cpu,
         * don't call into another bpf program (same or different)
         * and don't send kprobe event into ring-buffer,
         * so return zero here
         */
        ret = 0;
        goto out;
    }

    /*
     * Instead of moving rcu_read_lock/rcu_dereference/rcu_read_unlock
     * to all call sites, we did a bpf_prog_array_valid() there to check
     * whether call->prog_array is empty or not, which is
     * a heuristic to speed up execution.
     *
     * If bpf_prog_array_valid() fetched prog_array was
     * non-NULL, we go into trace_call_bpf() and do the actual
     * proper rcu_dereference() under RCU lock.
     * If it turns out that prog_array is NULL then, we bail out.
     * For the opposite, if the bpf_prog_array_valid() fetched pointer
     * was NULL, you'll skip the prog_array with the risk of missing
     * out on events when it was updated in between this and the
     * rcu_dereference(), which is an accepted risk.
     */
    ret = BPF_PROG_RUN_ARRAY_CHECK(call->prog_array, ctx, BPF_PROG_RUN);

 out:
    __this_cpu_dec(bpf_prog_active);
    preempt_enable();

    return ret;
}
EXPORT_SYMBOL_GPL(trace_call_bpf);
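/*
 * For illustration (not part of this file): a kprobe dispatcher such as
 * kprobe_perf_func() is expected to honour the return value roughly like
 *
 *	if (bpf_prog_array_valid(call) && !trace_call_bpf(call, regs))
 *		return;		// event filtered out by the program
 *	// otherwise go on and record the kprobe event
 */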
#ifdef CONFIG_BPF_KPROBE_OVERRIDE
BPF_CALL_2(bpf_override_return, struct pt_regs *, regs, unsigned long, rc)
{
    regs_set_return_value(regs, rc);
    override_function_with_return(regs);
    return 0;
}

static const struct bpf_func_proto bpf_override_return_proto = {
    .func       = bpf_override_return,
    .ret_type   = RET_INTEGER,
    .arg1_type  = ARG_PTR_TO_CTX,
    .arg2_type  = ARG_ANYTHING,
};
#endif
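/*
 * For illustration (not part of this file): with CONFIG_BPF_KPROBE_OVERRIDE a
 * kprobe program attached to a function on the error-injection opt-in list
 * can force its return value, roughly
 *
 *	SEC("kprobe/should_failslab")
 *	int inject(struct pt_regs *ctx)
 *	{
 *		bpf_override_return(ctx, -ENOMEM);
 *		return 0;
 *	}
 *
 * The attach path additionally enforces the opt-in list, see
 * perf_event_attach_bpf_prog() further down.
 */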
BPF_CALL_3(bpf_probe_read, void *, dst, u32, size, const void *, unsafe_ptr)
{
    int ret;

    ret = probe_kernel_read(dst, unsafe_ptr, size);
    if (unlikely(ret < 0))
        memset(dst, 0, size);

    return ret;
}

static const struct bpf_func_proto bpf_probe_read_proto = {
    .func       = bpf_probe_read,
    .ret_type   = RET_INTEGER,
    .arg1_type  = ARG_PTR_TO_UNINIT_MEM,
    .arg2_type  = ARG_CONST_SIZE_OR_ZERO,
    .arg3_type  = ARG_ANYTHING,
};
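/*
 * For illustration (not part of this file): a tracing program typically pairs
 * this helper with a small stack buffer, roughly
 *
 *	struct task_struct *task = (void *)bpf_get_current_task();
 *	int prio;
 *	bpf_probe_read(&prio, sizeof(prio), &task->prio);
 *
 * On failure the destination is zeroed above, so a program that ignores the
 * return code never works on uninitialized data.
 */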
BPF_CALL_3(bpf_probe_write_user, void *, unsafe_ptr, const void *, src,
       u32, size)
{
    /*
     * Ensure we're in user context which is safe for the helper to
     * run. This helper has no business in a kthread.
     *
     * access_ok() should prevent writing to non-user memory, but in
     * some situations (nommu, temporary switch, etc) access_ok() does
     * not provide enough validation, hence the check on KERNEL_DS.
     *
     * nmi_uaccess_okay() ensures the probe is not run in an interim
     * state, when the task or mm are switched. This is specifically
     * required to prevent the use of temporary mm.
     */

    if (unlikely(in_interrupt() ||
             current->flags & (PF_KTHREAD | PF_EXITING)))
        return -EPERM;
    if (unlikely(uaccess_kernel()))
        return -EPERM;
    if (unlikely(!nmi_uaccess_okay()))
        return -EPERM;
    if (!access_ok(unsafe_ptr, size))
        return -EPERM;

    return probe_kernel_write(unsafe_ptr, src, size);
}

static const struct bpf_func_proto bpf_probe_write_user_proto = {
    .func       = bpf_probe_write_user,
    .ret_type   = RET_INTEGER,
    .arg1_type  = ARG_ANYTHING,
    .arg2_type  = ARG_PTR_TO_MEM,
    .arg3_type  = ARG_CONST_SIZE,
};

static const struct bpf_func_proto *bpf_get_probe_write_proto(void)
{
    pr_warn_ratelimited("%s[%d] is installing a program with bpf_probe_write_user helper that may corrupt user memory!",
                current->comm, task_pid_nr(current));

    return &bpf_probe_write_user_proto;
}
/*
 * Only limited trace_printk() conversion specifiers allowed:
 * %d %i %u %x %ld %li %lu %lx %lld %lli %llu %llx %p %s
 */
BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1,
       u64, arg2, u64, arg3)
{
    bool str_seen = false;
    int mod[3] = {};
    int fmt_cnt = 0;
    u64 unsafe_addr;
    char buf[64];
    int i;

    /*
     * bpf_check()->check_func_arg()->check_stack_boundary()
     * guarantees that fmt points to bpf program stack,
     * fmt_size bytes of it were initialized and fmt_size > 0
     */
    if (fmt[--fmt_size] != 0)
        return -EINVAL;

    /* check format string for allowed specifiers */
    for (i = 0; i < fmt_size; i++) {
        if ((!isprint(fmt[i]) && !isspace(fmt[i])) || !isascii(fmt[i]))
            return -EINVAL;

        if (fmt[i] != '%')
            continue;

        /* fmt[i] != 0 && fmt[last] == 0, so we can access fmt[i + 1] */
        i++;
        if (fmt[i] == 'l') {
            mod[fmt_cnt]++;
            i++;
        } else if (fmt[i] == 'p' || fmt[i] == 's') {
            mod[fmt_cnt]++;
            /* disallow any further format extensions */
            if (fmt[i + 1] != 0 &&
                !isspace(fmt[i + 1]) &&
                !ispunct(fmt[i + 1]))
                return -EINVAL;
            fmt_cnt++;
            if (fmt[i] == 's') {
                if (str_seen)
                    /* allow only one '%s' per fmt string */
                    return -EINVAL;
                str_seen = true;

                switch (fmt_cnt) {
                case 1:
                    unsafe_addr = arg1;
                    arg1 = (long) buf;
                    break;
                case 2:
                    unsafe_addr = arg2;
                    arg2 = (long) buf;
                    break;
                case 3:
                    unsafe_addr = arg3;
                    arg3 = (long) buf;
                    break;
                }
                buf[0] = 0;
                strncpy_from_unsafe(buf,
                            (void *) (long) unsafe_addr,
                            sizeof(buf));
            }
            continue;
        }

        if (fmt[i] == 'l') {
            mod[fmt_cnt]++;
            i++;
        }

        if (fmt[i] != 'i' && fmt[i] != 'd' &&
            fmt[i] != 'u' && fmt[i] != 'x')
            return -EINVAL;
        fmt_cnt++;
    }
/* Horrid workaround for getting va_list handling working with different
 * argument type combinations generically for 32 and 64 bit archs.
 */
#define __BPF_TP_EMIT() __BPF_ARG3_TP()
#define __BPF_TP(...)                                               \
    __trace_printk(0 /* Fake ip */,                                 \
               fmt, ##__VA_ARGS__)

#define __BPF_ARG1_TP(...)                                          \
    ((mod[0] == 2 || (mod[0] == 1 && __BITS_PER_LONG == 64))        \
      ? __BPF_TP(arg1, ##__VA_ARGS__)                               \
      : ((mod[0] == 1 || (mod[0] == 0 && __BITS_PER_LONG == 32))    \
          ? __BPF_TP((long)arg1, ##__VA_ARGS__)                     \
          : __BPF_TP((u32)arg1, ##__VA_ARGS__)))

#define __BPF_ARG2_TP(...)                                          \
    ((mod[1] == 2 || (mod[1] == 1 && __BITS_PER_LONG == 64))        \
      ? __BPF_ARG1_TP(arg2, ##__VA_ARGS__)                          \
      : ((mod[1] == 1 || (mod[1] == 0 && __BITS_PER_LONG == 32))    \
          ? __BPF_ARG1_TP((long)arg2, ##__VA_ARGS__)                \
          : __BPF_ARG1_TP((u32)arg2, ##__VA_ARGS__)))

#define __BPF_ARG3_TP(...)                                          \
    ((mod[2] == 2 || (mod[2] == 1 && __BITS_PER_LONG == 64))        \
      ? __BPF_ARG2_TP(arg3, ##__VA_ARGS__)                          \
      : ((mod[2] == 1 || (mod[2] == 0 && __BITS_PER_LONG == 32))    \
          ? __BPF_ARG2_TP((long)arg3, ##__VA_ARGS__)                \
          : __BPF_ARG2_TP((u32)arg3, ##__VA_ARGS__)))

    return __BPF_TP_EMIT();
}
static const struct bpf_func_proto bpf_trace_printk_proto = {
    .func       = bpf_trace_printk,
    .ret_type   = RET_INTEGER,
    .arg1_type  = ARG_PTR_TO_MEM,
    .arg2_type  = ARG_CONST_SIZE,
};
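/*
 * For illustration (not part of this file): from a BPF program the usual
 * call looks roughly like
 *
 *	char fmt[] = "pid %d comm %s\n";
 *	bpf_trace_printk(fmt, sizeof(fmt), pid, comm);
 *
 * Only the specifiers listed above are accepted, at most three arguments can
 * be passed and at most one of them may be a %s string.
 */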
const struct bpf_func_proto *bpf_get_trace_printk_proto(void)
{
    /*
     * this program might be calling bpf_trace_printk,
     * so allocate per-cpu printk buffers
     */
    trace_printk_init_buffers();

    return &bpf_trace_printk_proto;
}
static __always_inline int
get_map_perf_counter(struct bpf_map *map, u64 flags,
             u64 *value, u64 *enabled, u64 *running)
{
    struct bpf_array *array = container_of(map, struct bpf_array, map);
    unsigned int cpu = smp_processor_id();
    u64 index = flags & BPF_F_INDEX_MASK;
    struct bpf_event_entry *ee;

    if (unlikely(flags & ~(BPF_F_INDEX_MASK)))
        return -EINVAL;
    if (index == BPF_F_CURRENT_CPU)
        index = cpu;
    if (unlikely(index >= array->map.max_entries))
        return -E2BIG;

    ee = READ_ONCE(array->ptrs[index]);
    if (!ee)
        return -ENOENT;

    return perf_event_read_local(ee->event, value, enabled, running);
}
BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags)
{
    u64 value = 0;
    int err;

    err = get_map_perf_counter(map, flags, &value, NULL, NULL);
    /*
     * this api is ugly since we miss [-22..-2] range of valid
     * counter values, but that's uapi
     */
    if (err)
        return err;
    return value;
}

static const struct bpf_func_proto bpf_perf_event_read_proto = {
    .func       = bpf_perf_event_read,
    .ret_type   = RET_INTEGER,
    .arg1_type  = ARG_CONST_MAP_PTR,
    .arg2_type  = ARG_ANYTHING,
};
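/*
 * For illustration (not part of this file): because bpf_perf_event_read()
 * folds the error code and the counter value into one u64, programs that need
 * to tell errors apart reliably should prefer bpf_perf_event_read_value()
 * below, roughly
 *
 *	struct bpf_perf_event_value v = {};
 *	if (!bpf_perf_event_read_value(&counters, BPF_F_CURRENT_CPU,
 *				       &v, sizeof(v)))
 *		total += v.counter;
 *
 * ('counters' being a hypothetical BPF_MAP_TYPE_PERF_EVENT_ARRAY map.)
 */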
BPF_CALL_4(bpf_perf_event_read_value, struct bpf_map *, map, u64, flags,
       struct bpf_perf_event_value *, buf, u32, size)
{
    int err = -EINVAL;

    if (unlikely(size != sizeof(struct bpf_perf_event_value)))
        goto clear;
    err = get_map_perf_counter(map, flags, &buf->counter, &buf->enabled,
                   &buf->running);
    if (unlikely(err))
        goto clear;
    return 0;
clear:
    memset(buf, 0, size);
    return err;
}

static const struct bpf_func_proto bpf_perf_event_read_value_proto = {
    .func       = bpf_perf_event_read_value,
    .ret_type   = RET_INTEGER,
    .arg1_type  = ARG_CONST_MAP_PTR,
    .arg2_type  = ARG_ANYTHING,
    .arg3_type  = ARG_PTR_TO_UNINIT_MEM,
    .arg4_type  = ARG_CONST_SIZE,
};
static __always_inline u64
__bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map,
            u64 flags, struct perf_sample_data *sd)
{
    struct bpf_array *array = container_of(map, struct bpf_array, map);
    unsigned int cpu = smp_processor_id();
    u64 index = flags & BPF_F_INDEX_MASK;
    struct bpf_event_entry *ee;
    struct perf_event *event;

    if (index == BPF_F_CURRENT_CPU)
        index = cpu;
    if (unlikely(index >= array->map.max_entries))
        return -E2BIG;

    ee = READ_ONCE(array->ptrs[index]);
    if (!ee)
        return -ENOENT;

    event = ee->event;
    if (unlikely(event->attr.type != PERF_TYPE_SOFTWARE ||
             event->attr.config != PERF_COUNT_SW_BPF_OUTPUT))
        return -EINVAL;

    if (unlikely(event->oncpu != cpu))
        return -EOPNOTSUPP;

    return perf_event_output(event, sd, regs);
}
/*
 * Support executing tracepoints in normal, irq, and nmi context that each call
 * bpf_perf_event_output
 */
struct bpf_trace_sample_data {
    struct perf_sample_data sds[3];
};

static DEFINE_PER_CPU(struct bpf_trace_sample_data, bpf_trace_sds);
static DEFINE_PER_CPU(int, bpf_trace_nest_level);
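/*
 * Note: the three sample slots above line up with the three contexts named in
 * the comment (task, irq, nmi); bpf_trace_nest_level selects which
 * perf_sample_data slot the current invocation may use, so a program firing
 * in irq context does not clobber the sample of the task-context program it
 * interrupted.
 */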
BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map,
       u64, flags, void *, data, u64, size)
{
    struct bpf_trace_sample_data *sds = this_cpu_ptr(&bpf_trace_sds);
    int nest_level = this_cpu_inc_return(bpf_trace_nest_level);
    struct perf_raw_record raw = {
        .frag = {
            .size = size,
            .data = data,
        },
    };
    struct perf_sample_data *sd;
    int err;

    if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(sds->sds))) {
        err = -EBUSY;
        goto out;
    }

    sd = &sds->sds[nest_level - 1];

    if (unlikely(flags & ~(BPF_F_INDEX_MASK))) {
        err = -EINVAL;
        goto out;
    }

    perf_sample_data_init(sd, 0, 0);
    sd->raw = &raw;

    err = __bpf_perf_event_output(regs, map, flags, sd);

out:
    this_cpu_dec(bpf_trace_nest_level);
    return err;
}
static const struct bpf_func_proto bpf_perf_event_output_proto = {
    .func       = bpf_perf_event_output,
    .ret_type   = RET_INTEGER,
    .arg1_type  = ARG_PTR_TO_CTX,
    .arg2_type  = ARG_CONST_MAP_PTR,
    .arg3_type  = ARG_ANYTHING,
    .arg4_type  = ARG_PTR_TO_MEM,
    .arg5_type  = ARG_CONST_SIZE_OR_ZERO,
};
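/*
 * For illustration (not part of this file): from a kprobe program this is the
 * canonical way to push a sample to user space, roughly
 *
 *	struct event e = { .pid = bpf_get_current_pid_tgid() >> 32 };
 *	bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, &e, sizeof(e));
 *
 * ('events' being a hypothetical BPF_MAP_TYPE_PERF_EVENT_ARRAY map.)
 */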
static DEFINE_PER_CPU(struct pt_regs, bpf_pt_regs);
static DEFINE_PER_CPU(struct perf_sample_data, bpf_misc_sd);

u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
             void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy)
{
    struct perf_sample_data *sd = this_cpu_ptr(&bpf_misc_sd);
    struct pt_regs *regs = this_cpu_ptr(&bpf_pt_regs);
    struct perf_raw_frag frag = {
        .copy   = ctx_copy,
        .size   = ctx_size,
        .data   = ctx,
    };
    struct perf_raw_record raw = {
        .frag = {
            {
                .next   = ctx_size ? &frag : NULL,
            },
            .size   = meta_size,
            .data   = meta,
        },
    };

    perf_fetch_caller_regs(regs);
    perf_sample_data_init(sd, 0, 0);
    sd->raw = &raw;

    return __bpf_perf_event_output(regs, map, flags, sd);
}
BPF_CALL_0(bpf_get_current_task)
{
    return (long) current;
}

static const struct bpf_func_proto bpf_get_current_task_proto = {
    .func       = bpf_get_current_task,
    .ret_type   = RET_INTEGER,
};
BPF_CALL_2(bpf_current_task_under_cgroup, struct bpf_map *, map, u32, idx)
{
    struct bpf_array *array = container_of(map, struct bpf_array, map);
    struct cgroup *cgrp;

    if (unlikely(idx >= array->map.max_entries))
        return -E2BIG;

    cgrp = READ_ONCE(array->ptrs[idx]);
    if (unlikely(!cgrp))
        return -EAGAIN;

    return task_under_cgroup_hierarchy(current, cgrp);
}

static const struct bpf_func_proto bpf_current_task_under_cgroup_proto = {
    .func       = bpf_current_task_under_cgroup,
    .ret_type   = RET_INTEGER,
    .arg1_type  = ARG_CONST_MAP_PTR,
    .arg2_type  = ARG_ANYTHING,
};
BPF_CALL_3(bpf_probe_read_str, void *, dst, u32, size,
       const void *, unsafe_ptr)
{
    int ret;

    /*
     * The strncpy_from_unsafe() call will likely not fill the entire
     * buffer, but that's okay in this circumstance as we're probing
     * arbitrary memory anyway similar to bpf_probe_read() and might
     * as well probe the stack. Thus, memory is explicitly cleared
     * only in error case, so that improper users ignoring return
     * code altogether don't copy garbage; otherwise length of string
     * is returned that can be used for bpf_perf_event_output() et al.
     */
    ret = strncpy_from_unsafe(dst, unsafe_ptr, size);
    if (unlikely(ret < 0))
        memset(dst, 0, size);

    return ret;
}

static const struct bpf_func_proto bpf_probe_read_str_proto = {
    .func       = bpf_probe_read_str,
    .ret_type   = RET_INTEGER,
    .arg1_type  = ARG_PTR_TO_UNINIT_MEM,
    .arg2_type  = ARG_CONST_SIZE_OR_ZERO,
    .arg3_type  = ARG_ANYTHING,
};
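/*
 * For illustration (not part of this file): the returned length (which counts
 * the trailing NUL) lets a program emit only the bytes it actually read,
 * roughly
 *
 *	char name[64];
 *	int len = bpf_probe_read_str(name, sizeof(name), name_ptr);
 *	if (len > 0)
 *		bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU,
 *				      name, len);
 *
 * ('events' and 'name_ptr' are hypothetical.)
 */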
static const struct bpf_func_proto *
tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
    switch (func_id) {
    case BPF_FUNC_map_lookup_elem:
        return &bpf_map_lookup_elem_proto;
    case BPF_FUNC_map_update_elem:
        return &bpf_map_update_elem_proto;
    case BPF_FUNC_map_delete_elem:
        return &bpf_map_delete_elem_proto;
    case BPF_FUNC_map_push_elem:
        return &bpf_map_push_elem_proto;
    case BPF_FUNC_map_pop_elem:
        return &bpf_map_pop_elem_proto;
    case BPF_FUNC_map_peek_elem:
        return &bpf_map_peek_elem_proto;
    case BPF_FUNC_probe_read:
        return &bpf_probe_read_proto;
    case BPF_FUNC_ktime_get_ns:
        return &bpf_ktime_get_ns_proto;
    case BPF_FUNC_tail_call:
        return &bpf_tail_call_proto;
    case BPF_FUNC_get_current_pid_tgid:
        return &bpf_get_current_pid_tgid_proto;
    case BPF_FUNC_get_current_task:
        return &bpf_get_current_task_proto;
    case BPF_FUNC_get_current_uid_gid:
        return &bpf_get_current_uid_gid_proto;
    case BPF_FUNC_get_current_comm:
        return &bpf_get_current_comm_proto;
    case BPF_FUNC_trace_printk:
        return bpf_get_trace_printk_proto();
    case BPF_FUNC_get_smp_processor_id:
        return &bpf_get_smp_processor_id_proto;
    case BPF_FUNC_get_numa_node_id:
        return &bpf_get_numa_node_id_proto;
    case BPF_FUNC_perf_event_read:
        return &bpf_perf_event_read_proto;
    case BPF_FUNC_probe_write_user:
        return bpf_get_probe_write_proto();
    case BPF_FUNC_current_task_under_cgroup:
        return &bpf_current_task_under_cgroup_proto;
    case BPF_FUNC_get_prandom_u32:
        return &bpf_get_prandom_u32_proto;
    case BPF_FUNC_probe_read_str:
        return &bpf_probe_read_str_proto;
#ifdef CONFIG_CGROUPS
    case BPF_FUNC_get_current_cgroup_id:
        return &bpf_get_current_cgroup_id_proto;
#endif
    default:
        return NULL;
    }
}
static const struct bpf_func_proto *
kprobe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
    switch (func_id) {
    case BPF_FUNC_perf_event_output:
        return &bpf_perf_event_output_proto;
    case BPF_FUNC_get_stackid:
        return &bpf_get_stackid_proto;
    case BPF_FUNC_get_stack:
        return &bpf_get_stack_proto;
    case BPF_FUNC_perf_event_read_value:
        return &bpf_perf_event_read_value_proto;
#ifdef CONFIG_BPF_KPROBE_OVERRIDE
    case BPF_FUNC_override_return:
        return &bpf_override_return_proto;
#endif
    default:
        return tracing_func_proto(func_id, prog);
    }
}
/* bpf+kprobe programs can access fields of 'struct pt_regs' */
static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type type,
                    const struct bpf_prog *prog,
                    struct bpf_insn_access_aux *info)
{
    if (off < 0 || off >= sizeof(struct pt_regs))
        return false;
    if (type != BPF_READ)
        return false;
    if (off % size != 0)
        return false;
    /*
     * Assertion for 32 bit to make sure last 8 byte access
     * (BPF_DW) to the last 4 byte member is disallowed.
     */
    if (off + size > sizeof(struct pt_regs))
        return false;

    return true;
}

const struct bpf_verifier_ops kprobe_verifier_ops = {
    .get_func_proto     = kprobe_prog_func_proto,
    .is_valid_access    = kprobe_prog_is_valid_access,
};

const struct bpf_prog_ops kprobe_prog_ops = {
};
BPF_CALL_5(bpf_perf_event_output_tp, void *, tp_buff, struct bpf_map *, map,
       u64, flags, void *, data, u64, size)
{
    struct pt_regs *regs = *(struct pt_regs **)tp_buff;

    /*
     * r1 points to perf tracepoint buffer where first 8 bytes are hidden
     * from bpf program and contain a pointer to 'struct pt_regs'. Fetch it
     * from there and call the same bpf_perf_event_output() helper inline.
     */
    return ____bpf_perf_event_output(regs, map, flags, data, size);
}

static const struct bpf_func_proto bpf_perf_event_output_proto_tp = {
    .func       = bpf_perf_event_output_tp,
    .ret_type   = RET_INTEGER,
    .arg1_type  = ARG_PTR_TO_CTX,
    .arg2_type  = ARG_CONST_MAP_PTR,
    .arg3_type  = ARG_ANYTHING,
    .arg4_type  = ARG_PTR_TO_MEM,
    .arg5_type  = ARG_CONST_SIZE_OR_ZERO,
};
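/*
 * Note: the tracepoint buffer handed to these *_tp helpers starts with a
 * hidden pointer-sized slot holding the saved 'struct pt_regs *', followed by
 * the tracepoint record that the program may read (tp_prog_is_valid_access()
 * below rejects offsets smaller than sizeof(void *) for this reason). That is
 * why '*(struct pt_regs **)tp_buff' recovers the registers here.
 */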
BPF_CALL_3(bpf_get_stackid_tp, void *, tp_buff, struct bpf_map *, map,
       u64, flags)
{
    struct pt_regs *regs = *(struct pt_regs **)tp_buff;

    /*
     * Same comment as in bpf_perf_event_output_tp(), only that this time
     * the other helper's function body cannot be inlined due to being
     * external, thus we need to call raw helper function.
     */
    return bpf_get_stackid((unsigned long) regs, (unsigned long) map,
                   flags, 0, 0);
}

static const struct bpf_func_proto bpf_get_stackid_proto_tp = {
    .func       = bpf_get_stackid_tp,
    .ret_type   = RET_INTEGER,
    .arg1_type  = ARG_PTR_TO_CTX,
    .arg2_type  = ARG_CONST_MAP_PTR,
    .arg3_type  = ARG_ANYTHING,
};
BPF_CALL_4(bpf_get_stack_tp, void *, tp_buff, void *, buf, u32, size,
       u64, flags)
{
    struct pt_regs *regs = *(struct pt_regs **)tp_buff;

    return bpf_get_stack((unsigned long) regs, (unsigned long) buf,
                 (unsigned long) size, flags, 0);
}

static const struct bpf_func_proto bpf_get_stack_proto_tp = {
    .func       = bpf_get_stack_tp,
    .ret_type   = RET_INTEGER,
    .arg1_type  = ARG_PTR_TO_CTX,
    .arg2_type  = ARG_PTR_TO_UNINIT_MEM,
    .arg3_type  = ARG_CONST_SIZE_OR_ZERO,
    .arg4_type  = ARG_ANYTHING,
};
static const struct bpf_func_proto *
tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
    switch (func_id) {
    case BPF_FUNC_perf_event_output:
        return &bpf_perf_event_output_proto_tp;
    case BPF_FUNC_get_stackid:
        return &bpf_get_stackid_proto_tp;
    case BPF_FUNC_get_stack:
        return &bpf_get_stack_proto_tp;
    default:
        return tracing_func_proto(func_id, prog);
    }
}
static bool tp_prog_is_valid_access(int off, int size, enum bpf_access_type type,
                    const struct bpf_prog *prog,
                    struct bpf_insn_access_aux *info)
{
    if (off < sizeof(void *) || off >= PERF_MAX_TRACE_SIZE)
        return false;
    if (type != BPF_READ)
        return false;
    if (off % size != 0)
        return false;

    BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(__u64));
    return true;
}

const struct bpf_verifier_ops tracepoint_verifier_ops = {
    .get_func_proto     = tp_prog_func_proto,
    .is_valid_access    = tp_prog_is_valid_access,
};

const struct bpf_prog_ops tracepoint_prog_ops = {
};
BPF_CALL_3(bpf_perf_prog_read_value, struct bpf_perf_event_data_kern *, ctx,
       struct bpf_perf_event_value *, buf, u32, size)
{
    int err = -EINVAL;

    if (unlikely(size != sizeof(struct bpf_perf_event_value)))
        goto clear;
    err = perf_event_read_local(ctx->event, &buf->counter, &buf->enabled,
                    &buf->running);
    if (unlikely(err))
        goto clear;
    return 0;
clear:
    memset(buf, 0, size);
    return err;
}

static const struct bpf_func_proto bpf_perf_prog_read_value_proto = {
    .func       = bpf_perf_prog_read_value,
    .ret_type   = RET_INTEGER,
    .arg1_type  = ARG_PTR_TO_CTX,
    .arg2_type  = ARG_PTR_TO_UNINIT_MEM,
    .arg3_type  = ARG_CONST_SIZE,
};
static const struct bpf_func_proto *
pe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
    switch (func_id) {
    case BPF_FUNC_perf_event_output:
        return &bpf_perf_event_output_proto_tp;
    case BPF_FUNC_get_stackid:
        return &bpf_get_stackid_proto_tp;
    case BPF_FUNC_get_stack:
        return &bpf_get_stack_proto_tp;
    case BPF_FUNC_perf_prog_read_value:
        return &bpf_perf_prog_read_value_proto;
    default:
        return tracing_func_proto(func_id, prog);
    }
}
/*
 * bpf_raw_tp_regs are separate from bpf_pt_regs used from skb/xdp
 * to avoid potential recursive reuse issue when/if tracepoints are added
 * inside bpf_*_event_output, bpf_get_stackid and/or bpf_get_stack.
 *
 * Since raw tracepoints run despite bpf_prog_active, support concurrent usage
 * in normal, irq, and nmi context.
 */
struct bpf_raw_tp_regs {
    struct pt_regs regs[3];
};
static DEFINE_PER_CPU(struct bpf_raw_tp_regs, bpf_raw_tp_regs);
static DEFINE_PER_CPU(int, bpf_raw_tp_nest_level);
static struct pt_regs *get_bpf_raw_tp_regs(void)
{
    struct bpf_raw_tp_regs *tp_regs = this_cpu_ptr(&bpf_raw_tp_regs);
    int nest_level = this_cpu_inc_return(bpf_raw_tp_nest_level);

    if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(tp_regs->regs))) {
        this_cpu_dec(bpf_raw_tp_nest_level);
        return ERR_PTR(-EBUSY);
    }

    return &tp_regs->regs[nest_level - 1];
}

static void put_bpf_raw_tp_regs(void)
{
    this_cpu_dec(bpf_raw_tp_nest_level);
}
BPF_CALL_5(bpf_perf_event_output_raw_tp, struct bpf_raw_tracepoint_args *, args,
       struct bpf_map *, map, u64, flags, void *, data, u64, size)
{
    struct pt_regs *regs = get_bpf_raw_tp_regs();
    int ret;

    if (IS_ERR(regs))
        return PTR_ERR(regs);

    perf_fetch_caller_regs(regs);
    ret = ____bpf_perf_event_output(regs, map, flags, data, size);

    put_bpf_raw_tp_regs();
    return ret;
}

static const struct bpf_func_proto bpf_perf_event_output_proto_raw_tp = {
    .func       = bpf_perf_event_output_raw_tp,
    .ret_type   = RET_INTEGER,
    .arg1_type  = ARG_PTR_TO_CTX,
    .arg2_type  = ARG_CONST_MAP_PTR,
    .arg3_type  = ARG_ANYTHING,
    .arg4_type  = ARG_PTR_TO_MEM,
    .arg5_type  = ARG_CONST_SIZE_OR_ZERO,
};
BPF_CALL_3(bpf_get_stackid_raw_tp, struct bpf_raw_tracepoint_args *, args,
       struct bpf_map *, map, u64, flags)
{
    struct pt_regs *regs = get_bpf_raw_tp_regs();
    int ret;

    if (IS_ERR(regs))
        return PTR_ERR(regs);

    perf_fetch_caller_regs(regs);
    /* similar to bpf_perf_event_output_tp, but pt_regs fetched differently */
    ret = bpf_get_stackid((unsigned long) regs, (unsigned long) map,
                  flags, 0, 0);
    put_bpf_raw_tp_regs();
    return ret;
}

static const struct bpf_func_proto bpf_get_stackid_proto_raw_tp = {
    .func       = bpf_get_stackid_raw_tp,
    .ret_type   = RET_INTEGER,
    .arg1_type  = ARG_PTR_TO_CTX,
    .arg2_type  = ARG_CONST_MAP_PTR,
    .arg3_type  = ARG_ANYTHING,
};
BPF_CALL_4(bpf_get_stack_raw_tp, struct bpf_raw_tracepoint_args *, args,
       void *, buf, u32, size, u64, flags)
{
    struct pt_regs *regs = get_bpf_raw_tp_regs();
    int ret;

    if (IS_ERR(regs))
        return PTR_ERR(regs);

    perf_fetch_caller_regs(regs);
    ret = bpf_get_stack((unsigned long) regs, (unsigned long) buf,
                (unsigned long) size, flags, 0);
    put_bpf_raw_tp_regs();
    return ret;
}

static const struct bpf_func_proto bpf_get_stack_proto_raw_tp = {
    .func       = bpf_get_stack_raw_tp,
    .ret_type   = RET_INTEGER,
    .arg1_type  = ARG_PTR_TO_CTX,
    .arg2_type  = ARG_PTR_TO_MEM,
    .arg3_type  = ARG_CONST_SIZE_OR_ZERO,
    .arg4_type  = ARG_ANYTHING,
};
static const struct bpf_func_proto *
raw_tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
    switch (func_id) {
    case BPF_FUNC_perf_event_output:
        return &bpf_perf_event_output_proto_raw_tp;
    case BPF_FUNC_get_stackid:
        return &bpf_get_stackid_proto_raw_tp;
    case BPF_FUNC_get_stack:
        return &bpf_get_stack_proto_raw_tp;
    default:
        return tracing_func_proto(func_id, prog);
    }
}
static bool raw_tp_prog_is_valid_access(int off, int size,
                    enum bpf_access_type type,
                    const struct bpf_prog *prog,
                    struct bpf_insn_access_aux *info)
{
    /* largest tracepoint in the kernel has 12 args */
    if (off < 0 || off >= sizeof(__u64) * 12)
        return false;
    if (type != BPF_READ)
        return false;
    if (off % size != 0)
        return false;

    return true;
}

const struct bpf_verifier_ops raw_tracepoint_verifier_ops = {
    .get_func_proto     = raw_tp_prog_func_proto,
    .is_valid_access    = raw_tp_prog_is_valid_access,
};

const struct bpf_prog_ops raw_tracepoint_prog_ops = {
};
static bool raw_tp_writable_prog_is_valid_access(int off, int size,
                         enum bpf_access_type type,
                         const struct bpf_prog *prog,
                         struct bpf_insn_access_aux *info)
{
    if (off == 0) {
        if (size != sizeof(u64) || type != BPF_READ)
            return false;
        info->reg_type = PTR_TO_TP_BUFFER;
    }
    return raw_tp_prog_is_valid_access(off, size, type, prog, info);
}

const struct bpf_verifier_ops raw_tracepoint_writable_verifier_ops = {
    .get_func_proto     = raw_tp_prog_func_proto,
    .is_valid_access    = raw_tp_writable_prog_is_valid_access,
};

const struct bpf_prog_ops raw_tracepoint_writable_prog_ops = {
};
static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type,
                    const struct bpf_prog *prog,
                    struct bpf_insn_access_aux *info)
{
    const int size_u64 = sizeof(u64);

    if (off < 0 || off >= sizeof(struct bpf_perf_event_data))
        return false;
    if (type != BPF_READ)
        return false;
    if (off % size != 0) {
        if (sizeof(unsigned long) != 4)
            return false;
        if (size != 8)
            return false;
        if (off % size != 4)
            return false;
    }

    switch (off) {
    case bpf_ctx_range(struct bpf_perf_event_data, sample_period):
        bpf_ctx_record_field_size(info, size_u64);
        if (!bpf_ctx_narrow_access_ok(off, size, size_u64))
            return false;
        break;
    case bpf_ctx_range(struct bpf_perf_event_data, addr):
        bpf_ctx_record_field_size(info, size_u64);
        if (!bpf_ctx_narrow_access_ok(off, size, size_u64))
            return false;
        break;
    default:
        if (size != sizeof(long))
            return false;
    }

    return true;
}
static u32 pe_prog_convert_ctx_access(enum bpf_access_type type,
                      const struct bpf_insn *si,
                      struct bpf_insn *insn_buf,
                      struct bpf_prog *prog, u32 *target_size)
{
    struct bpf_insn *insn = insn_buf;

    switch (si->off) {
    case offsetof(struct bpf_perf_event_data, sample_period):
        *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern,
                               data), si->dst_reg, si->src_reg,
                      offsetof(struct bpf_perf_event_data_kern, data));
        *insn++ = BPF_LDX_MEM(BPF_DW, si->dst_reg, si->dst_reg,
                      bpf_target_off(struct perf_sample_data, period, 8,
                             target_size));
        break;
    case offsetof(struct bpf_perf_event_data, addr):
        *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern,
                               data), si->dst_reg, si->src_reg,
                      offsetof(struct bpf_perf_event_data_kern, data));
        *insn++ = BPF_LDX_MEM(BPF_DW, si->dst_reg, si->dst_reg,
                      bpf_target_off(struct perf_sample_data, addr, 8,
                             target_size));
        break;
    default:
        *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern,
                               regs), si->dst_reg, si->src_reg,
                      offsetof(struct bpf_perf_event_data_kern, regs));
        *insn++ = BPF_LDX_MEM(BPF_SIZEOF(long), si->dst_reg, si->dst_reg,
                      si->off);
        break;
    }

    return insn - insn_buf;
}
const struct bpf_verifier_ops perf_event_verifier_ops = {
    .get_func_proto     = pe_prog_func_proto,
    .is_valid_access    = pe_prog_is_valid_access,
    .convert_ctx_access = pe_prog_convert_ctx_access,
};

const struct bpf_prog_ops perf_event_prog_ops = {
};

static DEFINE_MUTEX(bpf_event_mutex);
#define BPF_TRACE_MAX_PROGS 64
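/*
 * Note: this caps how many programs may be attached to one perf event;
 * perf_event_attach_bpf_prog() below fails with -E2BIG once
 * bpf_prog_array_length() reaches this value, and it also bounds the ids
 * array sized in perf_event_query_prog_array().
 */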
int perf_event_attach_bpf_prog(struct perf_event *event,
                   struct bpf_prog *prog)
{
    struct bpf_prog_array __rcu *old_array;
    struct bpf_prog_array *new_array;
    int ret = -EEXIST;

    /*
     * Kprobe override only works if they are on the function entry,
     * and only if they are on the opt-in list.
     */
    if (prog->kprobe_override &&
        (!trace_kprobe_on_func_entry(event->tp_event) ||
         !trace_kprobe_error_injectable(event->tp_event)))
        return -EINVAL;

    mutex_lock(&bpf_event_mutex);

    if (event->prog)
        goto unlock;

    old_array = event->tp_event->prog_array;
    if (old_array &&
        bpf_prog_array_length(old_array) >= BPF_TRACE_MAX_PROGS) {
        ret = -E2BIG;
        goto unlock;
    }

    ret = bpf_prog_array_copy(old_array, NULL, prog, &new_array);
    if (ret < 0)
        goto unlock;

    /* set the new array to event->tp_event and set event->prog */
    event->prog = prog;
    rcu_assign_pointer(event->tp_event->prog_array, new_array);
    bpf_prog_array_free(old_array);

unlock:
    mutex_unlock(&bpf_event_mutex);
    return ret;
}
void perf_event_detach_bpf_prog(struct perf_event *event)
{
    struct bpf_prog_array __rcu *old_array;
    struct bpf_prog_array *new_array;
    int ret;

    mutex_lock(&bpf_event_mutex);

    if (!event->prog)
        goto unlock;

    old_array = event->tp_event->prog_array;
    ret = bpf_prog_array_copy(old_array, event->prog, NULL, &new_array);
    if (ret == -ENOENT)
        goto unlock;
    if (ret < 0) {
        bpf_prog_array_delete_safe(old_array, event->prog);
    } else {
        rcu_assign_pointer(event->tp_event->prog_array, new_array);
        bpf_prog_array_free(old_array);
    }

    bpf_prog_put(event->prog);
    event->prog = NULL;

unlock:
    mutex_unlock(&bpf_event_mutex);
}
int perf_event_query_prog_array(struct perf_event *event, void __user *info)
{
    struct perf_event_query_bpf __user *uquery = info;
    struct perf_event_query_bpf query = {};
    u32 *ids, prog_cnt, ids_len;
    int ret;

    if (!capable(CAP_SYS_ADMIN))
        return -EPERM;
    if (event->attr.type != PERF_TYPE_TRACEPOINT)
        return -EINVAL;
    if (copy_from_user(&query, uquery, sizeof(query)))
        return -EFAULT;

    ids_len = query.ids_len;
    if (ids_len > BPF_TRACE_MAX_PROGS)
        return -E2BIG;
    ids = kcalloc(ids_len, sizeof(u32), GFP_USER | __GFP_NOWARN);
    if (!ids)
        return -ENOMEM;
    /*
     * The above kcalloc returns ZERO_SIZE_PTR when ids_len = 0, which
     * is required when user only wants to check for uquery->prog_cnt.
     * There is no need to check for it since the case is handled
     * gracefully in bpf_prog_array_copy_info.
     */

    mutex_lock(&bpf_event_mutex);
    ret = bpf_prog_array_copy_info(event->tp_event->prog_array,
                       ids,
                       ids_len,
                       &prog_cnt);
    mutex_unlock(&bpf_event_mutex);

    if (copy_to_user(&uquery->prog_cnt, &prog_cnt, sizeof(prog_cnt)) ||
        copy_to_user(uquery->ids, ids, ids_len * sizeof(u32)))
        ret = -EFAULT;

    kfree(ids);
    return ret;
}
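/*
 * For illustration (not part of this file): user space reaches the query
 * above through the PERF_EVENT_IOC_QUERY_BPF ioctl, roughly
 *
 *	struct perf_event_query_bpf *q =
 *		calloc(1, sizeof(*q) + 64 * sizeof(__u32));
 *	q->ids_len = 64;
 *	ioctl(perf_fd, PERF_EVENT_IOC_QUERY_BPF, q);
 *	// q->prog_cnt now holds the number of attached programs,
 *	// q->ids[] their program ids
 */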
extern struct bpf_raw_event_map __start__bpf_raw_tp[];
extern struct bpf_raw_event_map __stop__bpf_raw_tp[];

struct bpf_raw_event_map *bpf_get_raw_tracepoint(const char *name)
{
    struct bpf_raw_event_map *btp = __start__bpf_raw_tp;

    for (; btp < __stop__bpf_raw_tp; btp++) {
        if (!strcmp(btp->tp->name, name))
            return btp;
    }

    return bpf_get_raw_tracepoint_module(name);
}

void bpf_put_raw_tracepoint(struct bpf_raw_event_map *btp)
{
    struct module *mod = __module_address((unsigned long)btp);

    if (mod)
        module_put(mod);
}
static __always_inline
void __bpf_trace_run(struct bpf_prog *prog, u64 *args)
{
    rcu_read_lock();
    preempt_disable();
    (void) BPF_PROG_RUN(prog, args);
    preempt_enable();
    rcu_read_unlock();
}
#define UNPACK(...)            __VA_ARGS__
#define REPEAT_1(FN, DL, X, ...)    FN(X)
#define REPEAT_2(FN, DL, X, ...)    FN(X) UNPACK DL REPEAT_1(FN, DL, __VA_ARGS__)
#define REPEAT_3(FN, DL, X, ...)    FN(X) UNPACK DL REPEAT_2(FN, DL, __VA_ARGS__)
#define REPEAT_4(FN, DL, X, ...)    FN(X) UNPACK DL REPEAT_3(FN, DL, __VA_ARGS__)
#define REPEAT_5(FN, DL, X, ...)    FN(X) UNPACK DL REPEAT_4(FN, DL, __VA_ARGS__)
#define REPEAT_6(FN, DL, X, ...)    FN(X) UNPACK DL REPEAT_5(FN, DL, __VA_ARGS__)
#define REPEAT_7(FN, DL, X, ...)    FN(X) UNPACK DL REPEAT_6(FN, DL, __VA_ARGS__)
#define REPEAT_8(FN, DL, X, ...)    FN(X) UNPACK DL REPEAT_7(FN, DL, __VA_ARGS__)
#define REPEAT_9(FN, DL, X, ...)    FN(X) UNPACK DL REPEAT_8(FN, DL, __VA_ARGS__)
#define REPEAT_10(FN, DL, X, ...)   FN(X) UNPACK DL REPEAT_9(FN, DL, __VA_ARGS__)
#define REPEAT_11(FN, DL, X, ...)   FN(X) UNPACK DL REPEAT_10(FN, DL, __VA_ARGS__)
#define REPEAT_12(FN, DL, X, ...)   FN(X) UNPACK DL REPEAT_11(FN, DL, __VA_ARGS__)
#define REPEAT(X, FN, DL, ...)      REPEAT_##X(FN, DL, __VA_ARGS__)

#define SARG(X)     u64 arg##X
#define COPY(X)     args[X] = arg##X

#define __DL_COM    (,)
#define __DL_SEM    (;)

#define __SEQ_0_11  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11
#define BPF_TRACE_DEFN_x(x)                         \
    void bpf_trace_run##x(struct bpf_prog *prog,            \
                  REPEAT(x, SARG, __DL_COM, __SEQ_0_11))    \
    {                                   \
        u64 args[x];                            \
        REPEAT(x, COPY, __DL_SEM, __SEQ_0_11);          \
        __bpf_trace_run(prog, args);                \
    }                                   \
    EXPORT_SYMBOL_GPL(bpf_trace_run##x)
BPF_TRACE_DEFN_x(1);
BPF_TRACE_DEFN_x(2);
BPF_TRACE_DEFN_x(3);
BPF_TRACE_DEFN_x(4);
BPF_TRACE_DEFN_x(5);
BPF_TRACE_DEFN_x(6);
BPF_TRACE_DEFN_x(7);
BPF_TRACE_DEFN_x(8);
BPF_TRACE_DEFN_x(9);
BPF_TRACE_DEFN_x(10);
BPF_TRACE_DEFN_x(11);
BPF_TRACE_DEFN_x(12);
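/*
 * For illustration (not part of this file), BPF_TRACE_DEFN_x(2) above expands
 * to roughly
 *
 *	void bpf_trace_run2(struct bpf_prog *prog, u64 arg0, u64 arg1)
 *	{
 *		u64 args[2];
 *		args[0] = arg0; args[1] = arg1;
 *		__bpf_trace_run(prog, args);
 *	}
 *	EXPORT_SYMBOL_GPL(bpf_trace_run2);
 *
 * i.e. one exported trampoline per arity that marshals the raw tracepoint
 * arguments into the u64 array the program sees as its context.
 */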
static int __bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog)
{
    struct tracepoint *tp = btp->tp;

    /*
     * check that program doesn't access arguments beyond what's
     * available in this tracepoint
     */
    if (prog->aux->max_ctx_offset > btp->num_args * sizeof(u64))
        return -EINVAL;

    if (prog->aux->max_tp_access > btp->writable_size)
        return -EINVAL;

    return tracepoint_probe_register(tp, (void *)btp->bpf_func, prog);
}

int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog)
{
    return __bpf_probe_register(btp, prog);
}

int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *prog)
{
    return tracepoint_probe_unregister(btp->tp, (void *)btp->bpf_func, prog);
}
int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id,
                u32 *fd_type, const char **buf,
                u64 *probe_offset, u64 *probe_addr)
{
    bool is_tracepoint, is_syscall_tp;
    struct bpf_prog *prog;
    int flags, err = 0;

    prog = event->prog;
    if (!prog)
        return -ENOENT;

    /* not supporting BPF_PROG_TYPE_PERF_EVENT yet */
    if (prog->type == BPF_PROG_TYPE_PERF_EVENT)
        return -EOPNOTSUPP;

    *prog_id = prog->aux->id;
    flags = event->tp_event->flags;
    is_tracepoint = flags & TRACE_EVENT_FL_TRACEPOINT;
    is_syscall_tp = is_syscall_trace_event(event->tp_event);

    if (is_tracepoint || is_syscall_tp) {
        *buf = is_tracepoint ? event->tp_event->tp->name
                     : event->tp_event->name;
        *fd_type = BPF_FD_TYPE_TRACEPOINT;
        *probe_offset = 0x0;
        *probe_addr = 0x0;
    } else {
        /* kprobe/uprobe */
        err = -EOPNOTSUPP;
#ifdef CONFIG_KPROBE_EVENTS
        if (flags & TRACE_EVENT_FL_KPROBE)
            err = bpf_get_kprobe_info(event, fd_type, buf,
                          probe_offset, probe_addr,
                          event->attr.type == PERF_TYPE_TRACEPOINT);
#endif
#ifdef CONFIG_UPROBE_EVENTS
        if (flags & TRACE_EVENT_FL_UPROBE)
            err = bpf_get_uprobe_info(event, fd_type, buf,
                          probe_offset,
                          event->attr.type == PERF_TYPE_TRACEPOINT);
#endif
    }

    return err;
}
#ifdef CONFIG_MODULES
static int bpf_event_notify(struct notifier_block *nb, unsigned long op,
                void *module)
{
    struct bpf_trace_module *btm, *tmp;
    struct module *mod = module;

    if (mod->num_bpf_raw_events == 0 ||
        (op != MODULE_STATE_COMING && op != MODULE_STATE_GOING))
        return 0;

    mutex_lock(&bpf_module_mutex);

    switch (op) {
    case MODULE_STATE_COMING:
        btm = kzalloc(sizeof(*btm), GFP_KERNEL);
        if (btm) {
            btm->module = module;
            list_add(&btm->list, &bpf_trace_modules);
        }
        break;
    case MODULE_STATE_GOING:
        list_for_each_entry_safe(btm, tmp, &bpf_trace_modules, list) {
            if (btm->module == module) {
                list_del(&btm->list);
                kfree(btm);
                break;
            }
        }
        break;
    }

    mutex_unlock(&bpf_module_mutex);

    return 0;
}

static struct notifier_block bpf_module_nb = {
    .notifier_call = bpf_event_notify,
};

static int __init bpf_event_init(void)
{
    register_module_notifier(&bpf_module_nb);
    return 0;
}

fs_initcall(bpf_event_init);
#endif /* CONFIG_MODULES */