/* SPDX-License-Identifier: GPL-2.0 */
/**
 * Generic event filter for sampling events in BPF.
 *
 * The BPF program is fixed; it just reads filter expressions from the
 * 'filters' map and compares the sample data in order to reject samples
 * that don't match.  Each filter expression contains a sample flag (term)
 * to compare, an operation (==, >=, and so on) and a value.
 *
 * Note that each entry has an array of filter expressions and it only
 * succeeds when all of the expressions are satisfied.  Logical OR is
 * supported through a GROUP operation, which is satisfied when any of its
 * member expressions evaluates to true.  Nested GROUP operations are not
 * allowed for now.
 *
 * To support non-root users, the filters map can be loaded and pinned in the
 * BPF filesystem by root (perf record --setup-filter pin).  Then each user
 * gets a new entry in the shared filters map to fill with filter expressions,
 * and the BPF program finds the filter using (task-id, event-id) as a key.
 *
 * The pinned BPF object (shared for regular users) has these maps:
 *
 *   event_hash:  each instance's event->id  -->  representative event id
 *   idx_hash:    (task id (tgid), event id) -->  index into 'filters'
 *   filters:     index                      -->  perf_bpf_filter_entry[]
 *                                                (.op, .part, .term, .value)
 *
 * Root users skip the two hash maps, and the BPF program compares the filter
 * entries against each sample in a loop.
 *
 * This is used for per-task use cases, while system-wide profiling (normally
 * from the root user) uses its own copy of the program and the maps so that
 * it can proceed even if many non-root users are using the filters at the
 * same time.  In that case the filters map has a single entry and there is
 * no need to use the hash maps to get the index (key) of the filters map
 * (IOW it is always 0).
 *
 * The BPF program returns 1 to accept the sample or 0 to drop it.
 * The 'dropped' map keeps the number of samples dropped by the filter and
 * reports them as lost samples.
 */
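/*
 * Typical usage (an illustrative sketch, not verbatim documentation):
 *
 *   # perf record --setup-filter pin                        (root, once)
 *   $ perf record -e cycles --filter 'period > 1000' -- ./workload
 *
 * The non-root run fills its own entry in the pinned 'filters' map and the
 * shared BPF program filters its samples using that entry.
 */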
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/ioctl.h>
#include <sys/stat.h>

#include <bpf/bpf.h>
#include <linux/err.h>
#include <linux/list.h>
#include <linux/zalloc.h>
#include <api/fs/fs.h>
#include <internal/xyarray.h>
#include <perf/threadmap.h>

#include "util/debug.h"
#include "util/evsel.h"
#include "util/target.h"

#include "util/bpf-filter.h"
#include <util/bpf-filter-flex.h>
#include <util/bpf-filter-bison.h>

#include "bpf_skel/sample-filter.h"
#include "bpf_skel/sample_filter.skel.h"
#define FD(e, x, y) (*(int *)xyarray__entry(e->core.fd, x, y))

#define __PERF_SAMPLE_TYPE(tt, st, opt) { tt, #st, opt }
#define PERF_SAMPLE_TYPE(_st, opt) __PERF_SAMPLE_TYPE(PBF_TERM_##_st, PERF_SAMPLE_##_st, opt)
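/*
 * For example, PERF_SAMPLE_TYPE(ADDR, "-d") expands to
 * { PBF_TERM_ADDR, "PERF_SAMPLE_ADDR", "-d" }.
 */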
/* Index in the pinned 'filters' map.  Should be released after use. */
struct pinned_filter_idx {
	struct list_head list;
	struct evsel *evsel;
	u64 event_id;
	int hash_idx;
};

static LIST_HEAD(pinned_filters);
static const struct perf_sample_info {
	enum perf_bpf_filter_term type;
	const char *name;
	const char *option;
} sample_table[] = {
	/* default sample flags */
	PERF_SAMPLE_TYPE(IP, NULL),
	PERF_SAMPLE_TYPE(TID, NULL),
	PERF_SAMPLE_TYPE(PERIOD, NULL),
	/* flags mostly set by default, but still have options */
	PERF_SAMPLE_TYPE(ID, "--sample-identifier"),
	PERF_SAMPLE_TYPE(CPU, "--sample-cpu"),
	PERF_SAMPLE_TYPE(TIME, "-T"),
	/* optional sample flags */
	PERF_SAMPLE_TYPE(ADDR, "-d"),
	PERF_SAMPLE_TYPE(DATA_SRC, "-d"),
	PERF_SAMPLE_TYPE(PHYS_ADDR, "--phys-data"),
	PERF_SAMPLE_TYPE(WEIGHT, "-W"),
	PERF_SAMPLE_TYPE(WEIGHT_STRUCT, "-W"),
	PERF_SAMPLE_TYPE(TRANSACTION, "--transaction"),
	PERF_SAMPLE_TYPE(CODE_PAGE_SIZE, "--code-page-size"),
	PERF_SAMPLE_TYPE(DATA_PAGE_SIZE, "--data-page-size"),
	PERF_SAMPLE_TYPE(CGROUP, "--all-cgroups"),
};
static int get_pinned_fd(const char *name);
static const struct perf_sample_info *get_sample_info(enum perf_bpf_filter_term type)
{
	size_t i;

	for (i = 0; i < ARRAY_SIZE(sample_table); i++) {
		if (sample_table[i].type == type)
			return &sample_table[i];
	}
	return NULL;
}
static int check_sample_flags(struct evsel *evsel, struct perf_bpf_filter_expr *expr)
{
	const struct perf_sample_info *info;

	if (expr->term >= PBF_TERM_SAMPLE_START && expr->term <= PBF_TERM_SAMPLE_END &&
	    (evsel->core.attr.sample_type & (1 << (expr->term - PBF_TERM_SAMPLE_START))))
		return 0;

	if (expr->term == PBF_TERM_UID || expr->term == PBF_TERM_GID) {
		/* Not dependent on the sample_type as computed from a BPF helper. */
		return 0;
	}

	if (expr->op == PBF_OP_GROUP_BEGIN) {
		struct perf_bpf_filter_expr *group;

		list_for_each_entry(group, &expr->groups, list) {
			if (check_sample_flags(evsel, group) < 0)
				return -1;
		}
		return 0;
	}

	info = get_sample_info(expr->term);
	if (info == NULL) {
		pr_err("Error: %s event does not have sample flags %d\n",
		       evsel__name(evsel), expr->term);
		return -1;
	}

	pr_err("Error: %s event does not have %s\n", evsel__name(evsel), info->name);
	pr_err(" Hint: please add %s option to perf record\n", info->option);
	return -1;
}
static int get_filter_entries(struct evsel *evsel, struct perf_bpf_filter_entry *entry)
{
	int i = 0;
	struct perf_bpf_filter_expr *expr;

	list_for_each_entry(expr, &evsel->bpf_filters, list) {
		if (check_sample_flags(evsel, expr) < 0)
			return -EINVAL;

		if (i == MAX_FILTERS)
			return -E2BIG;

		entry[i].op = expr->op;
		entry[i].part = expr->part;
		entry[i].term = expr->term;
		entry[i].value = expr->val;
		i++;

		if (expr->op == PBF_OP_GROUP_BEGIN) {
			struct perf_bpf_filter_expr *group;

			list_for_each_entry(group, &expr->groups, list) {
				if (i == MAX_FILTERS)
					return -E2BIG;

				entry[i].op = group->op;
				entry[i].part = group->part;
				entry[i].term = group->term;
				entry[i].value = group->val;
				i++;
			}

			if (i == MAX_FILTERS)
				return -E2BIG;

			entry[i].op = PBF_OP_GROUP_END;
			i++;
		}
	}

	if (i < MAX_FILTERS) {
		/* to terminate the loop early */
		entry[i].op = PBF_OP_DONE;
		i++;
	}
	return 0;
}
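/*
 * A worked sketch of the flattening above, assuming perf's filter syntax
 * where ',' is a logical AND and '||' builds an OR group:
 *
 *   "period > 100, ip > 0x8000 || addr > 0x1000"
 *
 * becomes an entry array roughly like:
 *
 *   [0] { .op = PBF_OP_GT,          .term = PBF_TERM_PERIOD, .value = 100    }
 *   [1] { .op = PBF_OP_GROUP_BEGIN, ...                                      }
 *   [2] { .op = PBF_OP_GT,          .term = PBF_TERM_IP,     .value = 0x8000 }
 *   [3] { .op = PBF_OP_GT,          .term = PBF_TERM_ADDR,   .value = 0x1000 }
 *   [4] { .op = PBF_OP_GROUP_END                                             }
 *   [5] { .op = PBF_OP_DONE                                                  }
 */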
static int convert_to_tgid(int tid)
{
	char path[128];
	char *buf, *p, *q;
	int tgid;
	size_t len;

	scnprintf(path, sizeof(path), "%d/status", tid);
	if (procfs__read_str(path, &buf, &len) < 0)
		return -1;

	p = strstr(buf, "Tgid:");
	if (p == NULL) {
		free(buf);
		return -1;
	}

	tgid = strtol(p + 6, &q, 0);
	free(buf);
	return tgid;
}
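/*
 * E.g. /proc/<tid>/status has a line like "Tgid:\t1234"; "p + 6" skips the
 * five-character "Tgid:" tag plus the following tab before strtol().
 */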
/*
 * The event might be closed already so we cannot get the list of ids using FD
 * like in create_event_hash() below; let's iterate the event_hash map and
 * delete all entries that have the event id as a key.
 */
static void destroy_event_hash(u64 event_id)
{
	int fd;
	u64 key, *prev_key = NULL;
	int num = 0, alloced = 32;
	u64 *ids = calloc(alloced, sizeof(*ids));

	if (ids == NULL)
		return;

	fd = get_pinned_fd("event_hash");
	if (fd < 0) {
		pr_debug("cannot get fd for 'event_hash' map\n");
		free(ids);
		return;
	}

	/* Iterate the whole map to collect keys for the event id. */
	while (!bpf_map_get_next_key(fd, prev_key, &key)) {
		u64 id;

		if (bpf_map_lookup_elem(fd, &key, &id) == 0 && id == event_id) {
			if (num == alloced) {
				void *tmp;

				alloced *= 2;
				tmp = realloc(ids, alloced * sizeof(*ids));
				if (tmp == NULL)
					break;

				ids = tmp;
			}
			ids[num++] = key;
		}

		prev_key = &key;
	}

	for (int i = 0; i < num; i++)
		bpf_map_delete_elem(fd, &ids[i]);

	free(ids);
	close(fd);
}
/*
 * Return a representative id if ok, or 0 for failures.
 *
 * The perf_event->id is good for this, but an evsel would have multiple
 * instances for CPUs and tasks.  So pick up the first id and set up a hash
 * from the id of each instance to the representative id (the first one).
 */
static u64 create_event_hash(struct evsel *evsel)
{
	int x, y, fd;
	u64 the_id = 0, id;

	fd = get_pinned_fd("event_hash");
	if (fd < 0) {
		pr_err("cannot get fd for 'event_hash' map\n");
		return 0;
	}

	for (x = 0; x < xyarray__max_x(evsel->core.fd); x++) {
		for (y = 0; y < xyarray__max_y(evsel->core.fd); y++) {
			int ret = ioctl(FD(evsel, x, y), PERF_EVENT_IOC_ID, &id);

			if (ret < 0) {
				pr_err("Failed to get the event id\n");
				if (the_id)
					destroy_event_hash(the_id);
				the_id = 0;
				goto out;
			}

			if (the_id == 0)
				the_id = id;

			bpf_map_update_elem(fd, &id, &the_id, BPF_ANY);
		}
	}

out:
	close(fd);
	return the_id;
}
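/*
 * Illustrative numbers: an evsel opened on four CPUs may get ids
 * { 13, 14, 15, 16 }; the map then holds 13->13, 14->13, 15->13 and
 * 16->13, and 13 is the representative event id used elsewhere.
 */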
static void destroy_idx_hash(struct pinned_filter_idx *pfi)
{
	int fd, nr;
	struct perf_thread_map *threads;

	fd = get_pinned_fd("filters");
	bpf_map_delete_elem(fd, &pfi->hash_idx);
	close(fd);

	if (pfi->event_id)
		destroy_event_hash(pfi->event_id);

	threads = perf_evsel__threads(&pfi->evsel->core);
	if (threads == NULL)
		return;

	fd = get_pinned_fd("idx_hash");
	nr = perf_thread_map__nr(threads);
	for (int i = 0; i < nr; i++) {
		/* The target task might be dead already, just try the pid */
		struct idx_hash_key key = {
			.evt_id = pfi->event_id,
			.tgid = perf_thread_map__pid(threads, i),
		};

		bpf_map_delete_elem(fd, &key);
	}
	close(fd);
}
/* Maintain a hashmap from (tgid, event-id) to filter index */
static int create_idx_hash(struct evsel *evsel, struct perf_bpf_filter_entry *entry)
{
	int filter_idx;
	int fd, nr;
	u64 event_id;
	struct pinned_filter_idx *pfi = NULL;
	struct perf_thread_map *threads;

	fd = get_pinned_fd("filters");
	if (fd < 0) {
		pr_err("cannot get fd for 'filters' map\n");
		return fd;
	}

	/* Find the first available entry in the filters map */
	for (filter_idx = 0; filter_idx < MAX_FILTERS; filter_idx++) {
		if (bpf_map_update_elem(fd, &filter_idx, entry, BPF_NOEXIST) == 0)
			break;
	}
	close(fd);

	if (filter_idx == MAX_FILTERS) {
		pr_err("Too many users for the filter map\n");
		return -EBUSY;
	}

	pfi = zalloc(sizeof(*pfi));
	if (pfi == NULL) {
		pr_err("Cannot save pinned filter index\n");
		goto err;
	}

	pfi->evsel = evsel;
	pfi->hash_idx = filter_idx;

	event_id = create_event_hash(evsel);
	if (event_id == 0) {
		pr_err("Cannot update the event hash\n");
		goto err;
	}

	pfi->event_id = event_id;

	threads = perf_evsel__threads(&evsel->core);
	if (threads == NULL) {
		pr_err("Cannot get the thread list of the event\n");
		goto err;
	}

	/* save the index to a hash map */
	fd = get_pinned_fd("idx_hash");
	if (fd < 0) {
		pr_err("cannot get fd for 'idx_hash' map\n");
		goto err;
	}

	nr = perf_thread_map__nr(threads);
	for (int i = 0; i < nr; i++) {
		int pid = perf_thread_map__pid(threads, i);
		int tgid;
		struct idx_hash_key key = {
			.evt_id = event_id,
		};

		/* it actually needs tgid, let's get tgid from /proc. */
		tgid = convert_to_tgid(pid);
		if (tgid < 0) {
			/* the thread may be dead, ignore. */
			continue;
		}
		key.tgid = tgid;

		if (bpf_map_update_elem(fd, &key, &filter_idx, BPF_ANY) < 0) {
			pr_err("Failed to update the idx_hash\n");
			close(fd);
			goto err;
		}
		pr_debug("bpf-filter: idx_hash (task=%d,%s) -> %d\n",
			 tgid, evsel__name(evsel), filter_idx);
	}
	close(fd);

	list_add(&pfi->list, &pinned_filters);
	return filter_idx;

err:
	if (pfi)
		destroy_idx_hash(pfi);
	free(pfi);
	return -1;
}
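/*
 * Attach the sample filter to an evsel.  For per-task, non-root sessions the
 * object pinned by root (perf record --setup-filter pin) is reused via the
 * hash maps above; otherwise a private skeleton is loaded and its
 * single-entry 'filters' map is used directly.
 */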
int perf_bpf_filter__prepare(struct evsel *evsel, struct target *target)
{
	int i, x, y, fd, ret;
	struct sample_filter_bpf *skel = NULL;
	struct bpf_program *prog;
	struct bpf_link *link;
	struct perf_bpf_filter_entry *entry;
	bool needs_idx_hash = !target__has_cpu(target) && !target->uid_str;

	entry = calloc(MAX_FILTERS, sizeof(*entry));
	if (entry == NULL)
		return -1;

	ret = get_filter_entries(evsel, entry);
	if (ret < 0) {
		pr_err("Failed to process filter entries\n");
		goto err;
	}

	if (needs_idx_hash && geteuid() != 0) {
		int zero = 0;

		/* The filters map is shared among other processes */
		ret = create_idx_hash(evsel, entry);
		if (ret < 0)
			goto err;

		fd = get_pinned_fd("dropped");
		if (fd < 0) {
			ret = fd;
			goto err;
		}

		/* Reset the lost count */
		bpf_map_update_elem(fd, &ret, &zero, BPF_ANY);
		close(fd);

		fd = get_pinned_fd("perf_sample_filter");
		if (fd < 0) {
			ret = fd;
			goto err;
		}

		for (x = 0; x < xyarray__max_x(evsel->core.fd); x++) {
			for (y = 0; y < xyarray__max_y(evsel->core.fd); y++) {
				ret = ioctl(FD(evsel, x, y), PERF_EVENT_IOC_SET_BPF, fd);
				if (ret < 0) {
					pr_err("Failed to attach perf sample-filter\n");
					close(fd);
					goto err;
				}
			}
		}

		close(fd);
		free(entry);
		return 0;
	}

	skel = sample_filter_bpf__open_and_load();
	if (!skel) {
		ret = -1;
		pr_err("Failed to load perf sample-filter BPF skeleton\n");
		goto err;
	}

	i = 0;
	fd = bpf_map__fd(skel->maps.filters);

	/* The filters map has only one entry in this case */
	if (bpf_map_update_elem(fd, &i, entry, BPF_ANY) < 0) {
		ret = -1;
		pr_err("Failed to update the filter map\n");
		goto err;
	}

	prog = skel->progs.perf_sample_filter;
	for (x = 0; x < xyarray__max_x(evsel->core.fd); x++) {
		for (y = 0; y < xyarray__max_y(evsel->core.fd); y++) {
			link = bpf_program__attach_perf_event(prog, FD(evsel, x, y));
			if (IS_ERR(link)) {
				pr_err("Failed to attach perf sample-filter program\n");
				ret = PTR_ERR(link);
				goto err;
			}
		}
	}
	free(entry);
	evsel->bpf_skel = skel;
	return 0;

err:
	free(entry);
	if (!list_empty(&pinned_filters)) {
		struct pinned_filter_idx *pfi, *tmp;

		list_for_each_entry_safe(pfi, tmp, &pinned_filters, list) {
			destroy_idx_hash(pfi);
			list_del(&pfi->list);
			free(pfi);
		}
	}
	sample_filter_bpf__destroy(skel);
	return ret;
}
int perf_bpf_filter__destroy(struct evsel *evsel)
{
	struct perf_bpf_filter_expr *expr, *tmp;
	struct pinned_filter_idx *pfi, *pos;

	list_for_each_entry_safe(expr, tmp, &evsel->bpf_filters, list) {
		list_del(&expr->list);
		free(expr);
	}
	sample_filter_bpf__destroy(evsel->bpf_skel);

	list_for_each_entry_safe(pfi, pos, &pinned_filters, list) {
		destroy_idx_hash(pfi);
		list_del(&pfi->list);
		free(pfi);
	}
	return 0;
}
u64 perf_bpf_filter__lost_count(struct evsel *evsel)
{
	u64 count = 0;

	if (list_empty(&evsel->bpf_filters))
		return 0;

	if (!list_empty(&pinned_filters)) {
		int fd = get_pinned_fd("dropped");
		struct pinned_filter_idx *pfi;

		if (fd < 0)
			return 0;

		list_for_each_entry(pfi, &pinned_filters, list) {
			if (pfi->evsel != evsel)
				continue;

			bpf_map_lookup_elem(fd, &pfi->hash_idx, &count);
			break;
		}
		close(fd);
	} else if (evsel->bpf_skel) {
		struct sample_filter_bpf *skel = evsel->bpf_skel;
		int fd = bpf_map__fd(skel->maps.dropped);
		int idx = 0;

		bpf_map_lookup_elem(fd, &idx, &count);
	}

	return count;
}
struct perf_bpf_filter_expr *perf_bpf_filter_expr__new(enum perf_bpf_filter_term term,
						       int part,
						       enum perf_bpf_filter_op op,
						       unsigned long val)
{
	struct perf_bpf_filter_expr *expr;

	expr = malloc(sizeof(*expr));
	if (expr != NULL) {
		expr->term = term;
		expr->part = part;
		expr->op = op;
		expr->val = val;
		INIT_LIST_HEAD(&expr->groups);
	}
	return expr;
}
int perf_bpf_filter__parse(struct list_head *expr_head, const char *str)
{
	YY_BUFFER_STATE buffer;
	int ret;

	buffer = perf_bpf_filter__scan_string(str);

	ret = perf_bpf_filter_parse(expr_head);

	perf_bpf_filter__flush_buffer(buffer);
	perf_bpf_filter__delete_buffer(buffer);
	perf_bpf_filter_lex_destroy();

	return ret;
}
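/*
 * A minimal caller sketch (hypothetical; perf itself passes
 * &evsel->bpf_filters):
 *
 *   LIST_HEAD(head);
 *
 *   if (perf_bpf_filter__parse(&head, "period > 1000") == 0)
 *       ;  // head now holds the parsed perf_bpf_filter_expr nodes
 */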
int perf_bpf_filter__pin(void)
{
	struct sample_filter_bpf *skel;
	char *path = NULL;
	int dir_fd, ret = -1;

	skel = sample_filter_bpf__open();
	if (!skel) {
		pr_err("Failed to open perf sample-filter BPF skeleton\n");
		goto err;
	}

	/* pinned program will use pid-hash */
	bpf_map__set_max_entries(skel->maps.filters, MAX_FILTERS);
	bpf_map__set_max_entries(skel->maps.event_hash, MAX_EVT_HASH);
	bpf_map__set_max_entries(skel->maps.idx_hash, MAX_IDX_HASH);
	bpf_map__set_max_entries(skel->maps.dropped, MAX_FILTERS);
	skel->rodata->use_idx_hash = 1;

	if (sample_filter_bpf__load(skel) < 0) {
		pr_err("Failed to load perf sample-filter BPF skeleton\n");
		goto err;
	}

	if (asprintf(&path, "%s/fs/bpf/%s", sysfs__mountpoint(),
		     PERF_BPF_FILTER_PIN_PATH) < 0) {
		pr_err("Failed to allocate pathname in the BPF-fs\n");
		goto err;
	}

	ret = bpf_object__pin(skel->obj, path);
	if (ret < 0) {
		pr_err("Failed to pin BPF filter objects\n");
		goto err;
	}

	/* setup access permissions for the pinned objects */
	dir_fd = open(path, O_PATH);
	if (dir_fd < 0) {
		bpf_object__unpin(skel->obj, path);
		ret = dir_fd;
		goto err;
	}

	/* BPF-fs root has the sticky bit */
	if (fchmodat(dir_fd, "..", 01755, 0) < 0) {
		pr_debug("chmod for BPF-fs failed\n");
		ret = -1;
		goto err_close;
	}

	/* perf_filter directory */
	if (fchmodat(dir_fd, ".", 0755, 0) < 0) {
		pr_debug("chmod for perf_filter directory failed\n");
		ret = -1;
		goto err_close;
	}

	/* programs need write permission for some reason */
	if (fchmodat(dir_fd, "perf_sample_filter", 0777, 0) < 0) {
		pr_debug("chmod for perf_sample_filter failed\n");
		ret = -1;
		goto err_close;
	}
	if (fchmodat(dir_fd, "filters", 0666, 0) < 0) {
		pr_debug("chmod for filters failed\n");
		ret = -1;
		goto err_close;
	}
	if (fchmodat(dir_fd, "event_hash", 0666, 0) < 0) {
		pr_debug("chmod for event_hash failed\n");
		ret = -1;
		goto err_close;
	}
	if (fchmodat(dir_fd, "idx_hash", 0666, 0) < 0) {
		pr_debug("chmod for idx_hash failed\n");
		ret = -1;
		goto err_close;
	}
	if (fchmodat(dir_fd, "dropped", 0666, 0) < 0) {
		pr_debug("chmod for dropped failed\n");
		ret = -1;
		goto err_close;
	}

err_close:
	close(dir_fd);

err:
	free(path);
	sample_filter_bpf__destroy(skel);
	return ret;
}
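/*
 * After a successful pin, the BPF filesystem (typically /sys/fs/bpf, assuming
 * PERF_BPF_FILTER_PIN_PATH is "perf_filter") contains:
 *
 *   perf_filter/perf_sample_filter  (mode 0777)
 *   perf_filter/filters             (mode 0666)
 *   perf_filter/event_hash          (mode 0666)
 *   perf_filter/idx_hash            (mode 0666)
 *   perf_filter/dropped             (mode 0666)
 */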
int perf_bpf_filter__unpin(void)
{
	struct sample_filter_bpf *skel;
	char *path = NULL;
	int ret = -1;

	skel = sample_filter_bpf__open_and_load();
	if (!skel) {
		pr_err("Failed to open perf sample-filter BPF skeleton\n");
		goto err;
	}

	if (asprintf(&path, "%s/fs/bpf/%s", sysfs__mountpoint(),
		     PERF_BPF_FILTER_PIN_PATH) < 0) {
		pr_err("Failed to allocate pathname in the BPF-fs\n");
		goto err;
	}

	ret = bpf_object__unpin(skel->obj, path);

err:
	free(path);
	sample_filter_bpf__destroy(skel);
	return ret;
}
static int get_pinned_fd(const char *name)
{
	char *path = NULL;
	int fd;

	if (asprintf(&path, "%s/fs/bpf/%s/%s", sysfs__mountpoint(),
		     PERF_BPF_FILTER_PIN_PATH, name) < 0)
		return -1;

	fd = bpf_obj_get(path);

	free(path);
	return fd;
}