// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2020 Facebook */

#include <linux/fs.h>
#include <linux/anon_inodes.h>
#include <linux/filter.h>
#include <linux/bpf.h>
#include <linux/rcupdate_trace.h>
struct bpf_iter_target_info {
	struct list_head list;
	const struct bpf_iter_reg *reg_info;
	u32 btf_id;	/* cached value */
};

struct bpf_iter_link {
	struct bpf_link link;
	struct bpf_iter_aux_info aux;
	struct bpf_iter_target_info *tinfo;
};

struct bpf_iter_priv_data {
	struct bpf_iter_target_info *tinfo;
	const struct bpf_iter_seq_info *seq_info;
	struct bpf_prog *prog;
	u64 session_id;
	u64 seq_num;
	bool done_stop;
	u8 target_private[] __aligned(8);
};
static struct list_head targets = LIST_HEAD_INIT(targets);
static DEFINE_MUTEX(targets_mutex);

/* protect bpf_iter_link changes */
static DEFINE_MUTEX(link_mutex);

/* incremented on every opened seq_file */
static atomic64_t session_id;
static int prepare_seq_file(struct file *file, struct bpf_iter_link *link,
			    const struct bpf_iter_seq_info *seq_info);
static void bpf_iter_inc_seq_num(struct seq_file *seq)
{
	struct bpf_iter_priv_data *iter_priv;

	iter_priv = container_of(seq->private, struct bpf_iter_priv_data,
				 target_private);
	iter_priv->seq_num++;
}
static void bpf_iter_dec_seq_num(struct seq_file *seq)
{
	struct bpf_iter_priv_data *iter_priv;

	iter_priv = container_of(seq->private, struct bpf_iter_priv_data,
				 target_private);
	iter_priv->seq_num--;
}
static void bpf_iter_done_stop(struct seq_file *seq)
{
	struct bpf_iter_priv_data *iter_priv;

	iter_priv = container_of(seq->private, struct bpf_iter_priv_data,
				 target_private);
	iter_priv->done_stop = true;
}
static inline bool bpf_iter_target_support_resched(const struct bpf_iter_target_info *tinfo)
{
	return tinfo->reg_info->feature & BPF_ITER_RESCHED;
}
static bool bpf_iter_support_resched(struct seq_file *seq)
{
	struct bpf_iter_priv_data *iter_priv;

	iter_priv = container_of(seq->private, struct bpf_iter_priv_data,
				 target_private);
	return bpf_iter_target_support_resched(iter_priv->tinfo);
}
/* maximum visited objects before bailing out */
#define MAX_ITER_OBJECTS	1000000
/* bpf_seq_read, a customized and simpler version for bpf iterator.
 * The following are differences from seq_read():
 *  . fixed buffer size (PAGE_SIZE)
 *  . assuming NULL ->llseek()
 *  . stop() may call bpf program, handling potential overflow there
 */
static ssize_t bpf_seq_read(struct file *file, char __user *buf, size_t size,
			    loff_t *ppos)
{
	struct seq_file *seq = file->private_data;
	size_t n, offs, copied = 0;
	int err = 0, num_objs = 0;
	bool can_resched;
	void *p;

	mutex_lock(&seq->lock);

	if (!seq->buf) {
		seq->size = PAGE_SIZE << 3;
		seq->buf = kvmalloc(seq->size, GFP_KERNEL);
		if (!seq->buf) {
			err = -ENOMEM;
			goto done;
		}
	}

	if (seq->count) {
		n = min(seq->count, size);
		err = copy_to_user(buf, seq->buf + seq->from, n);
		if (err) {
			err = -EFAULT;
			goto done;
		}
		seq->count -= n;
		seq->from += n;
		copied = n;
		goto done;
	}

	seq->from = 0;
	p = seq->op->start(seq, &seq->index);
	if (!p)
		goto stop;
	if (IS_ERR(p)) {
		err = PTR_ERR(p);
		seq->op->stop(seq, p);
		seq->count = 0;
		goto done;
	}

	err = seq->op->show(seq, p);
	if (err > 0) {
		/* object is skipped, decrease seq_num, so next
		 * valid object can reuse the same seq_num.
		 */
		bpf_iter_dec_seq_num(seq);
		seq->count = 0;
	} else if (err < 0 || seq_has_overflowed(seq)) {
		if (!err)
			err = -E2BIG;
		seq->op->stop(seq, p);
		seq->count = 0;
		goto done;
	}

	can_resched = bpf_iter_support_resched(seq);
	while (1) {
		loff_t pos = seq->index;

		num_objs++;
		offs = seq->count;
		p = seq->op->next(seq, p, &seq->index);
		if (pos == seq->index) {
			pr_info_ratelimited("buggy seq_file .next function %ps "
					    "did not update position index\n",
					    seq->op->next);
			seq->index++;
		}

		if (IS_ERR_OR_NULL(p))
			break;

		/* got a valid next object, increase seq_num */
		bpf_iter_inc_seq_num(seq);

		if (seq->count >= size)
			break;

		if (num_objs >= MAX_ITER_OBJECTS) {
			if (offs == 0) {
				err = -EAGAIN;
				seq->op->stop(seq, p);
				goto done;
			}
			break;
		}

		err = seq->op->show(seq, p);
		if (err > 0) {
			bpf_iter_dec_seq_num(seq);
			seq->count = offs;
		} else if (err < 0 || seq_has_overflowed(seq)) {
			seq->count = offs;
			if (offs == 0) {
				if (!err)
					err = -E2BIG;
				seq->op->stop(seq, p);
				goto done;
			}
			break;
		}

		if (can_resched)
			cond_resched();
	}
stop:
	offs = seq->count;
	if (IS_ERR(p)) {
		seq->op->stop(seq, NULL);
		err = PTR_ERR(p);
		goto done;
	}
	/* bpf program called if !p */
	seq->op->stop(seq, p);
	if (!p) {
		if (!seq_has_overflowed(seq)) {
			bpf_iter_done_stop(seq);
		} else {
			seq->count = offs;
			if (offs == 0) {
				err = -E2BIG;
				goto done;
			}
		}
	}

	n = min(seq->count, size);
	err = copy_to_user(buf, seq->buf, n);
	if (err) {
		err = -EFAULT;
		goto done;
	}
	copied = n;
	seq->count -= n;
	seq->from = n;
done:
	if (!copied)
		copied = err;
	else
		*ppos += copied;
	mutex_unlock(&seq->lock);
	return copied;
}
static const struct bpf_iter_seq_info *
__get_seq_info(struct bpf_iter_link *link)
{
	const struct bpf_iter_seq_info *seq_info;

	if (link->aux.map) {
		seq_info = link->aux.map->ops->iter_seq_info;
		if (seq_info)
			return seq_info;
	}

	return link->tinfo->reg_info->seq_info;
}
static int iter_open(struct inode *inode, struct file *file)
{
	struct bpf_iter_link *link = inode->i_private;

	return prepare_seq_file(file, link, __get_seq_info(link));
}
static int iter_release(struct inode *inode, struct file *file)
{
	struct bpf_iter_priv_data *iter_priv;
	struct seq_file *seq;

	seq = file->private_data;
	if (!seq)
		return 0;

	iter_priv = container_of(seq->private, struct bpf_iter_priv_data,
				 target_private);

	if (iter_priv->seq_info->fini_seq_private)
		iter_priv->seq_info->fini_seq_private(seq->private);

	bpf_prog_put(iter_priv->prog);
	seq->private = iter_priv;

	return seq_release_private(inode, file);
}
const struct file_operations bpf_iter_fops = {
	.open		= iter_open,
	.read		= bpf_seq_read,
	.release	= iter_release,
};
/* The argument reg_info will be cached in bpf_iter_target_info.
 * The common practice is to declare the target reg_info as
 * a const static variable and pass it as an argument to
 * bpf_iter_reg_target().
 */
int bpf_iter_reg_target(const struct bpf_iter_reg *reg_info)
{
	struct bpf_iter_target_info *tinfo;

	tinfo = kzalloc(sizeof(*tinfo), GFP_KERNEL);
	if (!tinfo)
		return -ENOMEM;

	tinfo->reg_info = reg_info;
	INIT_LIST_HEAD(&tinfo->list);

	mutex_lock(&targets_mutex);
	list_add(&tinfo->list, &targets);
	mutex_unlock(&targets_mutex);

	return 0;
}
void bpf_iter_unreg_target(const struct bpf_iter_reg *reg_info)
{
	struct bpf_iter_target_info *tinfo;
	bool found = false;

	mutex_lock(&targets_mutex);
	list_for_each_entry(tinfo, &targets, list) {
		if (reg_info == tinfo->reg_info) {
			list_del(&tinfo->list);
			kfree(tinfo);
			found = true;
			break;
		}
	}
	mutex_unlock(&targets_mutex);

	WARN_ON(found == false);
}
static void cache_btf_id(struct bpf_iter_target_info *tinfo,
			 struct bpf_prog *prog)
{
	tinfo->btf_id = prog->aux->attach_btf_id;
}
bool bpf_iter_prog_supported(struct bpf_prog *prog)
{
	const char *attach_fname = prog->aux->attach_func_name;
	struct bpf_iter_target_info *tinfo = NULL, *iter;
	u32 prog_btf_id = prog->aux->attach_btf_id;
	const char *prefix = BPF_ITER_FUNC_PREFIX;
	int prefix_len = strlen(prefix);

	if (strncmp(attach_fname, prefix, prefix_len))
		return false;

	mutex_lock(&targets_mutex);
	list_for_each_entry(iter, &targets, list) {
		if (iter->btf_id && iter->btf_id == prog_btf_id) {
			tinfo = iter;
			break;
		}
		if (!strcmp(attach_fname + prefix_len, iter->reg_info->target)) {
			cache_btf_id(iter, prog);
			tinfo = iter;
			break;
		}
	}
	mutex_unlock(&targets_mutex);

	if (tinfo) {
		prog->aux->ctx_arg_info_size = tinfo->reg_info->ctx_arg_info_size;
		prog->aux->ctx_arg_info = tinfo->reg_info->ctx_arg_info;
	}

	return tinfo != NULL;
}
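
/* Illustrative sketch, not part of this file: the matching above means a
 * program for the "task" target has attach_func_name "bpf_iter_task"
 * (BPF_ITER_FUNC_PREFIX + target name), which libbpf expresses as
 * SEC("iter/task") in a separate BPF object file.  The program body is a
 * plausible example, not code from this repository; returning 1 instead
 * of 0 would ask bpf_iter_run_prog() below to retry the same object.
 */
#if 0
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

char _license[] SEC("license") = "GPL";

SEC("iter/task")
int dump_task(struct bpf_iter__task *ctx)
{
	struct seq_file *seq = ctx->meta->seq;
	struct task_struct *task = ctx->task;

	if (!task)	/* NULL on the final stop() call */
		return 0;

	BPF_SEQ_PRINTF(seq, "%8d %s\n", task->pid, task->comm);
	return 0;
}
#endif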
const struct bpf_func_proto *
bpf_iter_get_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
	const struct bpf_iter_target_info *tinfo;
	const struct bpf_func_proto *fn = NULL;

	mutex_lock(&targets_mutex);
	list_for_each_entry(tinfo, &targets, list) {
		if (tinfo->btf_id == prog->aux->attach_btf_id) {
			const struct bpf_iter_reg *reg_info;

			reg_info = tinfo->reg_info;
			if (reg_info->get_func_proto)
				fn = reg_info->get_func_proto(func_id, prog);
			break;
		}
	}
	mutex_unlock(&targets_mutex);

	return fn;
}
static void bpf_iter_link_release(struct bpf_link *link)
{
	struct bpf_iter_link *iter_link =
		container_of(link, struct bpf_iter_link, link);

	if (iter_link->tinfo->reg_info->detach_target)
		iter_link->tinfo->reg_info->detach_target(&iter_link->aux);
}
static void bpf_iter_link_dealloc(struct bpf_link *link)
{
	struct bpf_iter_link *iter_link =
		container_of(link, struct bpf_iter_link, link);

	kfree(iter_link);
}
static int bpf_iter_link_replace(struct bpf_link *link,
				 struct bpf_prog *new_prog,
				 struct bpf_prog *old_prog)
{
	int ret = 0;

	mutex_lock(&link_mutex);
	if (old_prog && link->prog != old_prog) {
		ret = -EPERM;
		goto out_unlock;
	}

	if (link->prog->type != new_prog->type ||
	    link->prog->expected_attach_type != new_prog->expected_attach_type ||
	    link->prog->aux->attach_btf_id != new_prog->aux->attach_btf_id) {
		ret = -EINVAL;
		goto out_unlock;
	}

	old_prog = xchg(&link->prog, new_prog);
	bpf_prog_put(old_prog);

out_unlock:
	mutex_unlock(&link_mutex);
	return ret;
}
static void bpf_iter_link_show_fdinfo(const struct bpf_link *link,
				      struct seq_file *seq)
{
	struct bpf_iter_link *iter_link =
		container_of(link, struct bpf_iter_link, link);
	bpf_iter_show_fdinfo_t show_fdinfo;

	seq_printf(seq,
		   "target_name:\t%s\n",
		   iter_link->tinfo->reg_info->target);

	show_fdinfo = iter_link->tinfo->reg_info->show_fdinfo;
	if (show_fdinfo)
		show_fdinfo(&iter_link->aux, seq);
}
static int bpf_iter_link_fill_link_info(const struct bpf_link *link,
					struct bpf_link_info *info)
{
	struct bpf_iter_link *iter_link =
		container_of(link, struct bpf_iter_link, link);
	char __user *ubuf = u64_to_user_ptr(info->iter.target_name);
	bpf_iter_fill_link_info_t fill_link_info;
	u32 ulen = info->iter.target_name_len;
	const char *target_name;
	u32 target_len;

	if (!ulen ^ !ubuf)
		return -EINVAL;

	target_name = iter_link->tinfo->reg_info->target;
	target_len = strlen(target_name);
	info->iter.target_name_len = target_len + 1;

	if (ubuf) {
		if (ulen >= target_len + 1) {
			if (copy_to_user(ubuf, target_name, target_len + 1))
				return -EFAULT;
		} else {
			char zero = '\0';

			if (copy_to_user(ubuf, target_name, ulen - 1))
				return -EFAULT;
			if (put_user(zero, ubuf + ulen - 1))
				return -EFAULT;
			return -ENOSPC;
		}
	}

	fill_link_info = iter_link->tinfo->reg_info->fill_link_info;
	if (fill_link_info)
		return fill_link_info(&iter_link->aux, info);

	return 0;
}
static const struct bpf_link_ops bpf_iter_link_lops = {
	.release = bpf_iter_link_release,
	.dealloc = bpf_iter_link_dealloc,
	.update_prog = bpf_iter_link_replace,
	.show_fdinfo = bpf_iter_link_show_fdinfo,
	.fill_link_info = bpf_iter_link_fill_link_info,
};
bool bpf_link_is_iter(struct bpf_link *link)
{
	return link->ops == &bpf_iter_link_lops;
}
int bpf_iter_link_attach(const union bpf_attr *attr, bpfptr_t uattr,
			 struct bpf_prog *prog)
{
	struct bpf_iter_target_info *tinfo = NULL, *iter;
	struct bpf_link_primer link_primer;
	union bpf_iter_link_info linfo;
	struct bpf_iter_link *link;
	u32 prog_btf_id, linfo_len;
	bpfptr_t ulinfo;
	int err;

	if (attr->link_create.target_fd || attr->link_create.flags)
		return -EINVAL;

	memset(&linfo, 0, sizeof(union bpf_iter_link_info));

	ulinfo = make_bpfptr(attr->link_create.iter_info, uattr.is_kernel);
	linfo_len = attr->link_create.iter_info_len;
	if (bpfptr_is_null(ulinfo) ^ !linfo_len)
		return -EINVAL;

	if (!bpfptr_is_null(ulinfo)) {
		err = bpf_check_uarg_tail_zero(ulinfo, sizeof(linfo),
					       linfo_len);
		if (err)
			return err;
		linfo_len = min_t(u32, linfo_len, sizeof(linfo));
		if (copy_from_bpfptr(&linfo, ulinfo, linfo_len))
			return -EFAULT;
	}

	prog_btf_id = prog->aux->attach_btf_id;
	mutex_lock(&targets_mutex);
	list_for_each_entry(iter, &targets, list) {
		if (iter->btf_id == prog_btf_id) {
			tinfo = iter;
			break;
		}
	}
	mutex_unlock(&targets_mutex);
	if (!tinfo)
		return -ENOENT;

	/* Only allow sleepable program for resched-able iterator */
	if (prog->sleepable && !bpf_iter_target_support_resched(tinfo))
		return -EINVAL;

	link = kzalloc(sizeof(*link), GFP_USER | __GFP_NOWARN);
	if (!link)
		return -ENOMEM;

	bpf_link_init(&link->link, BPF_LINK_TYPE_ITER, &bpf_iter_link_lops, prog);
	link->tinfo = tinfo;

	err = bpf_link_prime(&link->link, &link_primer);
	if (err) {
		kfree(link);
		return err;
	}

	if (tinfo->reg_info->attach_target) {
		err = tinfo->reg_info->attach_target(prog, &linfo, &link->aux);
		if (err) {
			bpf_link_cleanup(&link_primer);
			return err;
		}
	}

	return bpf_link_settle(&link_primer);
}
static void init_seq_meta(struct bpf_iter_priv_data *priv_data,
			  struct bpf_iter_target_info *tinfo,
			  const struct bpf_iter_seq_info *seq_info,
			  struct bpf_prog *prog)
{
	priv_data->tinfo = tinfo;
	priv_data->seq_info = seq_info;
	priv_data->prog = prog;
	priv_data->session_id = atomic64_inc_return(&session_id);
	priv_data->seq_num = 0;
	priv_data->done_stop = false;
}
static int prepare_seq_file(struct file *file, struct bpf_iter_link *link,
			    const struct bpf_iter_seq_info *seq_info)
{
	struct bpf_iter_priv_data *priv_data;
	struct bpf_iter_target_info *tinfo;
	struct bpf_prog *prog;
	u32 total_priv_dsize;
	struct seq_file *seq;
	int err = 0;

	mutex_lock(&link_mutex);
	prog = link->link.prog;
	bpf_prog_inc(prog);
	mutex_unlock(&link_mutex);

	tinfo = link->tinfo;
	total_priv_dsize = offsetof(struct bpf_iter_priv_data, target_private) +
			   seq_info->seq_priv_size;
	priv_data = __seq_open_private(file, seq_info->seq_ops,
				       total_priv_dsize);
	if (!priv_data) {
		err = -ENOMEM;
		goto release_prog;
	}

	if (seq_info->init_seq_private) {
		err = seq_info->init_seq_private(priv_data->target_private, &link->aux);
		if (err)
			goto release_seq_file;
	}

	init_seq_meta(priv_data, tinfo, seq_info, prog);
	seq = file->private_data;
	seq->private = priv_data->target_private;

	return 0;

release_seq_file:
	seq_release_private(file->f_inode, file);
	file->private_data = NULL;
release_prog:
	bpf_prog_put(prog);
	return err;
}
int bpf_iter_new_fd(struct bpf_link *link)
{
	struct bpf_iter_link *iter_link;
	struct file *file;
	unsigned int flags;
	int err, fd;

	if (link->ops != &bpf_iter_link_lops)
		return -EINVAL;

	flags = O_RDONLY | O_CLOEXEC;
	fd = get_unused_fd_flags(flags);
	if (fd < 0)
		return fd;

	file = anon_inode_getfile("bpf_iter", &bpf_iter_fops, NULL, flags);
	if (IS_ERR(file)) {
		err = PTR_ERR(file);
		goto free_fd;
	}

	iter_link = container_of(link, struct bpf_iter_link, link);
	err = prepare_seq_file(file, iter_link, __get_seq_info(iter_link));
	if (err)
		goto free_file;

	fd_install(fd, file);
	return fd;

free_file:
	fput(file);
free_fd:
	put_unused_fd(fd);
	return err;
}
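
/* Illustrative sketch, not part of this file: the userspace side of the
 * paths above, using libbpf.  "skel->progs.dump_task" is a hypothetical
 * skeleton handle for the BPF program sketched earlier; the libbpf calls
 * themselves are real: bpf_program__attach_iter() ends up in
 * bpf_iter_link_attach(), bpf_iter_create() ends up in bpf_iter_new_fd(),
 * and the subsequent read() is served by bpf_seq_read().
 */
#if 0
	struct bpf_link *link;
	char buf[4096];
	ssize_t n;
	int iter_fd;

	link = bpf_program__attach_iter(skel->progs.dump_task, NULL);
	iter_fd = bpf_iter_create(bpf_link__fd(link));
	while ((n = read(iter_fd, buf, sizeof(buf))) > 0)
		write(STDOUT_FILENO, buf, n);
	close(iter_fd);
	bpf_link__destroy(link);
#endif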
struct bpf_prog *bpf_iter_get_info(struct bpf_iter_meta *meta, bool in_stop)
{
	struct bpf_iter_priv_data *iter_priv;
	struct seq_file *seq;
	void *seq_priv;

	seq = meta->seq;
	if (seq->file->f_op != &bpf_iter_fops)
		return NULL;

	seq_priv = seq->private;
	iter_priv = container_of(seq_priv, struct bpf_iter_priv_data,
				 target_private);

	if (in_stop && iter_priv->done_stop)
		return NULL;

	meta->session_id = iter_priv->session_id;
	meta->seq_num = iter_priv->seq_num;

	return iter_priv->prog;
}
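
/* Illustrative sketch, not part of this file: how a BPF iterator program
 * (in its own BPF object) can use the meta fields filled in above.
 * seq_num restarts at 0 for every new read session, so comparing it to 0
 * is a common way to print a header exactly once.  "iter/bpf_map" is a
 * real target; the program body is only an example.
 */
#if 0
SEC("iter/bpf_map")
int dump_bpf_map(struct bpf_iter__bpf_map *ctx)
{
	struct seq_file *seq = ctx->meta->seq;
	__u64 seq_num = ctx->meta->seq_num;
	struct bpf_map *map = ctx->map;

	if (!map)
		return 0;

	if (seq_num == 0)	/* first object of this read session */
		BPF_SEQ_PRINTF(seq, "      id\n");

	BPF_SEQ_PRINTF(seq, "%8u\n", map->id);
	return 0;
}
#endif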
int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx)
{
	struct bpf_run_ctx run_ctx, *old_run_ctx;
	int ret;

	if (prog->sleepable) {
		rcu_read_lock_trace();
		migrate_disable();
		might_fault();
		old_run_ctx = bpf_set_run_ctx(&run_ctx);
		ret = bpf_prog_run(prog, ctx);
		bpf_reset_run_ctx(old_run_ctx);
		migrate_enable();
		rcu_read_unlock_trace();
	} else {
		rcu_read_lock();
		migrate_disable();
		old_run_ctx = bpf_set_run_ctx(&run_ctx);
		ret = bpf_prog_run(prog, ctx);
		bpf_reset_run_ctx(old_run_ctx);
		migrate_enable();
		rcu_read_unlock();
	}

	/* bpf program can only return 0 or 1:
	 *  0 : okay
	 *  1 : retry the same object
	 * The bpf_iter_run_prog() return value
	 * will be seq_ops->show() return value.
	 */
	return ret == 0 ? 0 : -EAGAIN;
}
BPF_CALL_4(bpf_for_each_map_elem, struct bpf_map *, map, void *, callback_fn,
	   void *, callback_ctx, u64, flags)
{
	return map->ops->map_for_each_callback(map, callback_fn, callback_ctx, flags);
}

const struct bpf_func_proto bpf_for_each_map_elem_proto = {
	.func		= bpf_for_each_map_elem,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_CONST_MAP_PTR,
	.arg2_type	= ARG_PTR_TO_FUNC,
	.arg3_type	= ARG_PTR_TO_STACK_OR_NULL,
	.arg4_type	= ARG_ANYTHING,
};
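
/* Illustrative sketch, not part of this file: how a BPF program (in its
 * own object) drives the helper above.  "arraymap" and the callback are
 * made up for the sketch; the callback signature (map, key, value, ctx)
 * and the 0-continue / 1-stop return convention are the real contract
 * enforced by the verifier and the map's map_for_each_callback.
 */
#if 0
struct {
	__uint(type, BPF_MAP_TYPE_ARRAY);
	__uint(max_entries, 16);
	__type(key, __u32);
	__type(value, __u64);
} arraymap SEC(".maps");

struct cb_ctx {
	__u64 sum;
};

static __u64 sum_elem(struct bpf_map *map, __u32 *key, __u64 *val,
		      struct cb_ctx *data)
{
	data->sum += *val;
	return 0;	/* 0 - continue, 1 - stop iterating */
}

SEC("tc")
int sum_array(struct __sk_buff *skb)
{
	struct cb_ctx data = {};

	bpf_for_each_map_elem(&arraymap, sum_elem, &data, 0);
	return 0;
}
#endif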
BPF_CALL_4(bpf_loop, u32, nr_loops, void *, callback_fn, void *, callback_ctx,
	   u64, flags)
{
	bpf_callback_t callback = (bpf_callback_t)callback_fn;
	u64 ret;
	u32 i;

	/* Note: these safety checks are also verified when bpf_loop
	 * is inlined, be careful to modify this code in sync. See
	 * function verifier.c:inline_bpf_loop.
	 */
	if (flags)
		return -EINVAL;
	if (nr_loops > BPF_MAX_LOOPS)
		return -E2BIG;

	for (i = 0; i < nr_loops; i++) {
		ret = callback((u64)i, (u64)(long)callback_ctx, 0, 0, 0);
		/* return value: 0 - continue, 1 - stop and return */
		if (ret)
			return i + 1;
	}

	return i;
}

const struct bpf_func_proto bpf_loop_proto = {
	.func		= bpf_loop,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_ANYTHING,
	.arg2_type	= ARG_PTR_TO_FUNC,
	.arg3_type	= ARG_PTR_TO_STACK_OR_NULL,
	.arg4_type	= ARG_ANYTHING,
};
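
/* Illustrative sketch, not part of this file: BPF-side use of bpf_loop()
 * from a separate BPF object.  The callback receives the iteration index
 * and the callback_ctx pointer and returns 0 to continue or 1 to stop,
 * matching the loop body implemented above.
 */
#if 0
static int add_index(__u32 index, void *ctx)
{
	__u64 *sum = ctx;

	*sum += index;
	return 0;	/* 0 - continue, 1 - stop and return */
}

SEC("raw_tp/sys_enter")
int compute_sum(const void *ctx)
{
	__u64 sum = 0;

	/* runs add_index() up to 100 times; the helper returns the
	 * number of iterations performed
	 */
	bpf_loop(100, add_index, &sum, 0);
	return 0;
}
#endif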
struct bpf_iter_num_kern {
	int cur; /* current value, inclusive */
	int end; /* final value, exclusive */
} __aligned(8);

__bpf_kfunc_start_defs();
__bpf_kfunc int bpf_iter_num_new(struct bpf_iter_num *it, int start, int end)
{
	struct bpf_iter_num_kern *s = (void *)it;

	BUILD_BUG_ON(sizeof(struct bpf_iter_num_kern) != sizeof(struct bpf_iter_num));
	BUILD_BUG_ON(__alignof__(struct bpf_iter_num_kern) != __alignof__(struct bpf_iter_num));

	/* start == end is legit, it's an empty range and we'll just get NULL
	 * on first (and any subsequent) bpf_iter_num_next() call
	 */
	if (start > end) {
		s->cur = s->end = 0;
		return -EINVAL;
	}

	/* avoid overflows, e.g., if start == INT_MIN and end == INT_MAX */
	if ((s64)end - (s64)start > BPF_MAX_LOOPS) {
		s->cur = s->end = 0;
		return -E2BIG;
	}

	/* user will call bpf_iter_num_next() first,
	 * which will set s->cur to exactly start value;
	 * underflow shouldn't matter
	 */
	s->cur = start - 1;
	s->end = end;

	return 0;
}
__bpf_kfunc int *bpf_iter_num_next(struct bpf_iter_num *it)
{
	struct bpf_iter_num_kern *s = (void *)it;

	/* check failed initialization or if we are done (same behavior);
	 * need to be careful about overflow, so convert to s64 for checks,
	 * e.g., if s->cur == s->end == INT_MAX, we can't just do
	 * s->cur + 1 >= s->end
	 */
	if ((s64)(s->cur + 1) >= s->end) {
		s->cur = s->end = 0;
		return NULL;
	}

	s->cur++;

	return &s->cur;
}
__bpf_kfunc void bpf_iter_num_destroy(struct bpf_iter_num *it)
{
	struct bpf_iter_num_kern *s = (void *)it;

	s->cur = s->end = 0;
}

__bpf_kfunc_end_defs();
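
/* Illustrative sketch, not part of this file: BPF-side use of the numbers
 * iterator kfuncs above from a separate BPF object (on the BPF side they
 * are typically declared as __ksym externs).  bpf_iter_num_next() returns
 * a pointer to the current value, or NULL once the [start, end) range is
 * exhausted, which is what terminates the loop below.
 */
#if 0
SEC("raw_tp/sys_enter")
int sum_first_ten(const void *ctx)
{
	struct bpf_iter_num it;
	int *v, sum = 0;

	bpf_iter_num_new(&it, 0, 10);		/* iterate 0..9 */
	while ((v = bpf_iter_num_next(&it)))
		sum += *v;
	bpf_iter_num_destroy(&it);		/* always clean up the iterator */

	return sum;
}
#endif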