// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2022 Google */
#include <linux/bpf.h>
#include <linux/btf_ids.h>
#include <linux/cgroup.h>
#include <linux/kernel.h>
#include <linux/seq_file.h>

#include "../cgroup/cgroup-internal.h"  /* cgroup_mutex and cgroup_is_dead */
/* cgroup_iter provides four modes of traversal to the cgroup hierarchy.
 *
 *  1. Walk the descendants of a cgroup in pre-order.
 *  2. Walk the descendants of a cgroup in post-order.
 *  3. Walk the ancestors of a cgroup.
 *  4. Show the given cgroup only.
 *
 * For walking descendants, cgroup_iter can walk in either pre-order or
 * post-order. For walking ancestors, the iter walks up from a cgroup to
 * the root.
 *
 * The iter program can terminate the walk early by returning 1. Walk
 * continues if prog returns 0.
 *
 * The prog can check (seq->num == 0) to determine whether this is
 * the first element. The prog may also be passed a NULL cgroup,
 * which means the walk has completed and the prog has a chance to
 * do post-processing, such as outputting an epilogue.
 *
 * Note: the iter_prog is called with cgroup_mutex held.
 *
 * Currently only one session is supported, which means, depending on the
 * volume of data bpf program intends to send to user space, the number
 * of cgroups that can be walked is limited. For example, given the current
 * buffer size is 8 * PAGE_SIZE, if the program sends 64B data for each
 * cgroup, assuming PAGE_SIZE is 4kb, the total number of cgroups that can
 * be walked is 512. This is a limitation of cgroup_iter. If the output data
 * is larger than the kernel buffer size, after all data in the kernel buffer
 * is consumed by user space, the subsequent read() syscall will signal
 * EOPNOTSUPP. In order to work around, the user may have to update their
 * program to reduce the volume of data sent to output. For example, skip
 * some uninteresting cgroups.
 */
45 struct bpf_iter__cgroup
{
46 __bpf_md_ptr(struct bpf_iter_meta
*, meta
);
47 __bpf_md_ptr(struct cgroup
*, cgroup
);
50 struct cgroup_iter_priv
{
51 struct cgroup_subsys_state
*start_css
;
57 static void *cgroup_iter_seq_start(struct seq_file
*seq
, loff_t
*pos
)
59 struct cgroup_iter_priv
*p
= seq
->private;
63 /* cgroup_iter doesn't support read across multiple sessions. */
68 /* Haven't visited all, but because cgroup_mutex has dropped,
69 * return -EOPNOTSUPP to indicate incomplete iteration.
71 return ERR_PTR(-EOPNOTSUPP
);
76 p
->visited_all
= false;
77 if (p
->order
== BPF_CGROUP_ITER_DESCENDANTS_PRE
)
78 return css_next_descendant_pre(NULL
, p
->start_css
);
79 else if (p
->order
== BPF_CGROUP_ITER_DESCENDANTS_POST
)
80 return css_next_descendant_post(NULL
, p
->start_css
);
81 else /* BPF_CGROUP_ITER_SELF_ONLY and BPF_CGROUP_ITER_ANCESTORS_UP */
/* Forward declaration: stop() needs to invoke the prog once more with a
 * NULL css for post-processing.
 */
static int __cgroup_iter_seq_show(struct seq_file *seq,
				  struct cgroup_subsys_state *css, int in_stop);
88 static void cgroup_iter_seq_stop(struct seq_file
*seq
, void *v
)
90 struct cgroup_iter_priv
*p
= seq
->private;
94 /* pass NULL to the prog for post-processing */
96 __cgroup_iter_seq_show(seq
, NULL
, true);
97 p
->visited_all
= true;
101 static void *cgroup_iter_seq_next(struct seq_file
*seq
, void *v
, loff_t
*pos
)
103 struct cgroup_subsys_state
*curr
= (struct cgroup_subsys_state
*)v
;
104 struct cgroup_iter_priv
*p
= seq
->private;
110 if (p
->order
== BPF_CGROUP_ITER_DESCENDANTS_PRE
)
111 return css_next_descendant_pre(curr
, p
->start_css
);
112 else if (p
->order
== BPF_CGROUP_ITER_DESCENDANTS_POST
)
113 return css_next_descendant_post(curr
, p
->start_css
);
114 else if (p
->order
== BPF_CGROUP_ITER_ANCESTORS_UP
)
116 else /* BPF_CGROUP_ITER_SELF_ONLY */
120 static int __cgroup_iter_seq_show(struct seq_file
*seq
,
121 struct cgroup_subsys_state
*css
, int in_stop
)
123 struct cgroup_iter_priv
*p
= seq
->private;
124 struct bpf_iter__cgroup ctx
;
125 struct bpf_iter_meta meta
;
126 struct bpf_prog
*prog
;
129 /* cgroup is dead, skip this element */
130 if (css
&& cgroup_is_dead(css
->cgroup
))
134 ctx
.cgroup
= css
? css
->cgroup
: NULL
;
136 prog
= bpf_iter_get_info(&meta
, in_stop
);
138 ret
= bpf_iter_run_prog(prog
, &ctx
);
140 /* if prog returns > 0, terminate after this element. */
147 static int cgroup_iter_seq_show(struct seq_file
*seq
, void *v
)
149 return __cgroup_iter_seq_show(seq
, (struct cgroup_subsys_state
*)v
,
153 static const struct seq_operations cgroup_iter_seq_ops
= {
154 .start
= cgroup_iter_seq_start
,
155 .next
= cgroup_iter_seq_next
,
156 .stop
= cgroup_iter_seq_stop
,
157 .show
= cgroup_iter_seq_show
,
160 BTF_ID_LIST_GLOBAL_SINGLE(bpf_cgroup_btf_id
, struct, cgroup
)
162 static int cgroup_iter_seq_init(void *priv
, struct bpf_iter_aux_info
*aux
)
164 struct cgroup_iter_priv
*p
= (struct cgroup_iter_priv
*)priv
;
165 struct cgroup
*cgrp
= aux
->cgroup
.start
;
167 /* bpf_iter_attach_cgroup() has already acquired an extra reference
168 * for the start cgroup, but the reference may be released after
169 * cgroup_iter_seq_init(), so acquire another reference for the
172 p
->start_css
= &cgrp
->self
;
173 css_get(p
->start_css
);
174 p
->terminate
= false;
175 p
->visited_all
= false;
176 p
->order
= aux
->cgroup
.order
;
180 static void cgroup_iter_seq_fini(void *priv
)
182 struct cgroup_iter_priv
*p
= (struct cgroup_iter_priv
*)priv
;
184 css_put(p
->start_css
);
187 static const struct bpf_iter_seq_info cgroup_iter_seq_info
= {
188 .seq_ops
= &cgroup_iter_seq_ops
,
189 .init_seq_private
= cgroup_iter_seq_init
,
190 .fini_seq_private
= cgroup_iter_seq_fini
,
191 .seq_priv_size
= sizeof(struct cgroup_iter_priv
),
194 static int bpf_iter_attach_cgroup(struct bpf_prog
*prog
,
195 union bpf_iter_link_info
*linfo
,
196 struct bpf_iter_aux_info
*aux
)
198 int fd
= linfo
->cgroup
.cgroup_fd
;
199 u64 id
= linfo
->cgroup
.cgroup_id
;
200 int order
= linfo
->cgroup
.order
;
203 if (order
!= BPF_CGROUP_ITER_DESCENDANTS_PRE
&&
204 order
!= BPF_CGROUP_ITER_DESCENDANTS_POST
&&
205 order
!= BPF_CGROUP_ITER_ANCESTORS_UP
&&
206 order
!= BPF_CGROUP_ITER_SELF_ONLY
)
213 cgrp
= cgroup_v1v2_get_from_fd(fd
);
215 cgrp
= cgroup_get_from_id(id
);
216 else /* walk the entire hierarchy by default. */
217 cgrp
= cgroup_get_from_path("/");
220 return PTR_ERR(cgrp
);
222 aux
->cgroup
.start
= cgrp
;
223 aux
->cgroup
.order
= order
;
227 static void bpf_iter_detach_cgroup(struct bpf_iter_aux_info
*aux
)
229 cgroup_put(aux
->cgroup
.start
);
232 static void bpf_iter_cgroup_show_fdinfo(const struct bpf_iter_aux_info
*aux
,
233 struct seq_file
*seq
)
237 buf
= kzalloc(PATH_MAX
, GFP_KERNEL
);
239 seq_puts(seq
, "cgroup_path:\t<unknown>\n");
243 /* If cgroup_path_ns() fails, buf will be an empty string, cgroup_path
244 * will print nothing.
246 * Path is in the calling process's cgroup namespace.
248 cgroup_path_ns(aux
->cgroup
.start
, buf
, PATH_MAX
,
249 current
->nsproxy
->cgroup_ns
);
250 seq_printf(seq
, "cgroup_path:\t%s\n", buf
);
254 if (aux
->cgroup
.order
== BPF_CGROUP_ITER_DESCENDANTS_PRE
)
255 seq_puts(seq
, "order: descendants_pre\n");
256 else if (aux
->cgroup
.order
== BPF_CGROUP_ITER_DESCENDANTS_POST
)
257 seq_puts(seq
, "order: descendants_post\n");
258 else if (aux
->cgroup
.order
== BPF_CGROUP_ITER_ANCESTORS_UP
)
259 seq_puts(seq
, "order: ancestors_up\n");
260 else /* BPF_CGROUP_ITER_SELF_ONLY */
261 seq_puts(seq
, "order: self_only\n");
264 static int bpf_iter_cgroup_fill_link_info(const struct bpf_iter_aux_info
*aux
,
265 struct bpf_link_info
*info
)
267 info
->iter
.cgroup
.order
= aux
->cgroup
.order
;
268 info
->iter
.cgroup
.cgroup_id
= cgroup_id(aux
->cgroup
.start
);
272 DEFINE_BPF_ITER_FUNC(cgroup
, struct bpf_iter_meta
*meta
,
273 struct cgroup
*cgroup
)
275 static struct bpf_iter_reg bpf_cgroup_reg_info
= {
277 .feature
= BPF_ITER_RESCHED
,
278 .attach_target
= bpf_iter_attach_cgroup
,
279 .detach_target
= bpf_iter_detach_cgroup
,
280 .show_fdinfo
= bpf_iter_cgroup_show_fdinfo
,
281 .fill_link_info
= bpf_iter_cgroup_fill_link_info
,
282 .ctx_arg_info_size
= 1,
284 { offsetof(struct bpf_iter__cgroup
, cgroup
),
285 PTR_TO_BTF_ID_OR_NULL
| PTR_TRUSTED
},
287 .seq_info
= &cgroup_iter_seq_info
,
290 static int __init
bpf_cgroup_iter_init(void)
292 bpf_cgroup_reg_info
.ctx_arg_info
[0].btf_id
= bpf_cgroup_btf_id
[0];
293 return bpf_iter_reg_target(&bpf_cgroup_reg_info
);
late_initcall(bpf_cgroup_iter_init);
298 struct bpf_iter_css
{
300 } __attribute__((aligned(8)));
/* Kernel-side state of the open-coded css iterator, overlaid on
 * struct bpf_iter_css.
 */
struct bpf_iter_css_kern {
	struct cgroup_subsys_state *start;	/* anchor of the walk */
	struct cgroup_subsys_state *pos;	/* current position, NULL before first next() */
	unsigned int flags;			/* BPF_CGROUP_ITER_* order (read as kit->flags) */
} __attribute__((aligned(8)));
__bpf_kfunc_start_defs();
310 __bpf_kfunc
int bpf_iter_css_new(struct bpf_iter_css
*it
,
311 struct cgroup_subsys_state
*start
, unsigned int flags
)
313 struct bpf_iter_css_kern
*kit
= (void *)it
;
315 BUILD_BUG_ON(sizeof(struct bpf_iter_css_kern
) > sizeof(struct bpf_iter_css
));
316 BUILD_BUG_ON(__alignof__(struct bpf_iter_css_kern
) != __alignof__(struct bpf_iter_css
));
320 case BPF_CGROUP_ITER_DESCENDANTS_PRE
:
321 case BPF_CGROUP_ITER_DESCENDANTS_POST
:
322 case BPF_CGROUP_ITER_ANCESTORS_UP
:
334 __bpf_kfunc
struct cgroup_subsys_state
*bpf_iter_css_next(struct bpf_iter_css
*it
)
336 struct bpf_iter_css_kern
*kit
= (void *)it
;
341 switch (kit
->flags
) {
342 case BPF_CGROUP_ITER_DESCENDANTS_PRE
:
343 kit
->pos
= css_next_descendant_pre(kit
->pos
, kit
->start
);
345 case BPF_CGROUP_ITER_DESCENDANTS_POST
:
346 kit
->pos
= css_next_descendant_post(kit
->pos
, kit
->start
);
348 case BPF_CGROUP_ITER_ANCESTORS_UP
:
349 kit
->pos
= kit
->pos
? kit
->pos
->parent
: kit
->start
;
355 __bpf_kfunc
void bpf_iter_css_destroy(struct bpf_iter_css
*it
)
__bpf_kfunc_end_defs();