//SPDX-License-Identifier: GPL-2.0
#include <linux/bpf-cgroup.h>
#include <linux/bpf.h>
#include <linux/btf.h>
#include <linux/bug.h>
#include <linux/filter.h>
#include <linux/mm.h>
#include <linux/rbtree.h>
#include <linux/slab.h>
#include <uapi/linux/btf.h>
DEFINE_PER_CPU(struct bpf_cgroup_storage*,
	       bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]);
#ifdef CONFIG_CGROUP_BPF

#include "../cgroup/cgroup-internal.h"

#define LOCAL_STORAGE_CREATE_FLAG_MASK \
	(BPF_F_NUMA_NODE | BPF_F_ACCESS_MASK)
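
/* A cgroup storage map keeps one bpf_cgroup_storage element per attached
 * cgroup (and, depending on the key format, per attach type). Elements sit
 * in an rb-tree for lookup and on a list for iteration; both are protected
 * by the map's lock.
 */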
struct bpf_cgroup_storage_map {
	struct bpf_map map;

	spinlock_t lock;
	struct rb_root root;
	struct list_head list;
};
static struct bpf_cgroup_storage_map *map_to_storage(struct bpf_map *map)
{
	return container_of(map, struct bpf_cgroup_storage_map, map);
}
static bool attach_type_isolated(const struct bpf_map *map)
{
	return map->key_size == sizeof(struct bpf_cgroup_storage_key);
}
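
/* Keys come in two formats: a full struct bpf_cgroup_storage_key (cgroup
 * inode id plus attach type) or a bare __u64 cgroup inode id.
 * attach_type_isolated() above distinguishes them by key_size; the
 * comparator below orders rb-tree elements accordingly.
 */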
static int bpf_cgroup_storage_key_cmp(const struct bpf_cgroup_storage_map *map,
				      const void *_key1, const void *_key2)
{
	if (attach_type_isolated(&map->map)) {
		const struct bpf_cgroup_storage_key *key1 = _key1;
		const struct bpf_cgroup_storage_key *key2 = _key2;

		if (key1->cgroup_inode_id < key2->cgroup_inode_id)
			return -1;
		else if (key1->cgroup_inode_id > key2->cgroup_inode_id)
			return 1;
		else if (key1->attach_type < key2->attach_type)
			return -1;
		else if (key1->attach_type > key2->attach_type)
			return 1;
	} else {
		const __u64 *cgroup_inode_id1 = _key1;
		const __u64 *cgroup_inode_id2 = _key2;

		if (*cgroup_inode_id1 < *cgroup_inode_id2)
			return -1;
		else if (*cgroup_inode_id1 > *cgroup_inode_id2)
			return 1;
	}

	return 0;
}
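
/* Walk the rb-tree for the element matching @key. @locked tells the helper
 * whether the caller already holds map->lock, so the lock is not taken (or
 * released) twice.
 */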
struct bpf_cgroup_storage *
cgroup_storage_lookup(struct bpf_cgroup_storage_map *map,
		      void *key, bool locked)
{
	struct rb_root *root = &map->root;
	struct rb_node *node;

	if (!locked)
		spin_lock_bh(&map->lock);

	node = root->rb_node;
	while (node) {
		struct bpf_cgroup_storage *storage;

		storage = container_of(node, struct bpf_cgroup_storage, node);

		switch (bpf_cgroup_storage_key_cmp(map, key, &storage->key)) {
		case -1:
			node = node->rb_left;
			break;
		case 1:
			node = node->rb_right;
			break;
		default:
			if (!locked)
				spin_unlock_bh(&map->lock);
			return storage;
		}
	}

	if (!locked)
		spin_unlock_bh(&map->lock);

	return NULL;
}
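
/* Insert @storage into the rb-tree; returns -EEXIST if an element with the
 * same key is already present. The caller must hold map->lock.
 */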
static int cgroup_storage_insert(struct bpf_cgroup_storage_map *map,
				 struct bpf_cgroup_storage *storage)
{
	struct rb_root *root = &map->root;
	struct rb_node **new = &(root->rb_node), *parent = NULL;

	while (*new) {
		struct bpf_cgroup_storage *this;

		this = container_of(*new, struct bpf_cgroup_storage, node);

		parent = *new;
		switch (bpf_cgroup_storage_key_cmp(map, &storage->key, &this->key)) {
		case -1:
			new = &((*new)->rb_left);
			break;
		case 1:
			new = &((*new)->rb_right);
			break;
		default:
			return -EEXIST;
		}
	}

	rb_link_node(&storage->node, parent, new);
	rb_insert_color(&storage->node, root);

	return 0;
}
static void *cgroup_storage_lookup_elem(struct bpf_map *_map, void *key)
{
	struct bpf_cgroup_storage_map *map = map_to_storage(_map);
	struct bpf_cgroup_storage *storage;

	storage = cgroup_storage_lookup(map, key, false);
	if (!storage)
		return NULL;

	return &READ_ONCE(storage->buf)->data[0];
}
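
/* Update of the shared storage either patches the existing buffer under the
 * map value's spin lock (BPF_F_LOCK) or swaps in a freshly allocated buffer
 * with xchg() and frees the old one after an RCU grace period.
 */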
static int cgroup_storage_update_elem(struct bpf_map *map, void *key,
				      void *value, u64 flags)
{
	struct bpf_cgroup_storage *storage;
	struct bpf_storage_buffer *new;

	if (unlikely(flags & ~(BPF_F_LOCK | BPF_EXIST)))
		return -EINVAL;

	if (unlikely((flags & BPF_F_LOCK) &&
		     !map_value_has_spin_lock(map)))
		return -EINVAL;

	storage = cgroup_storage_lookup((struct bpf_cgroup_storage_map *)map,
					key, false);
	if (!storage)
		return -ENOENT;

	if (flags & BPF_F_LOCK) {
		copy_map_value_locked(map, storage->buf->data, value, false);
		return 0;
	}

	new = kmalloc_node(sizeof(struct bpf_storage_buffer) +
			   map->value_size,
			   __GFP_ZERO | GFP_ATOMIC | __GFP_NOWARN,
			   map->numa_node);
	if (!new)
		return -ENOMEM;

	memcpy(&new->data[0], value, map->value_size);
	check_and_init_map_lock(map, new->data);

	new = xchg(&storage->buf, new);
	kfree_rcu(new, rcu);

	return 0;
}
int bpf_percpu_cgroup_storage_copy(struct bpf_map *_map, void *key,
				   void *value)
{
	struct bpf_cgroup_storage_map *map = map_to_storage(_map);
	struct bpf_cgroup_storage *storage;
	int cpu, off = 0;
	u32 size;

	rcu_read_lock();
	storage = cgroup_storage_lookup(map, key, false);
	if (!storage) {
		rcu_read_unlock();
		return -ENOENT;
	}

	/* per_cpu areas are zero-filled and bpf programs can only
	 * access 'value_size' of them, so copying rounded areas
	 * will not leak any kernel data
	 */
	size = round_up(_map->value_size, 8);
	for_each_possible_cpu(cpu) {
		bpf_long_memcpy(value + off,
				per_cpu_ptr(storage->percpu_buf, cpu), size);
		off += size;
	}
	rcu_read_unlock();
	return 0;
}
int bpf_percpu_cgroup_storage_update(struct bpf_map *_map, void *key,
				     void *value, u64 map_flags)
{
	struct bpf_cgroup_storage_map *map = map_to_storage(_map);
	struct bpf_cgroup_storage *storage;
	int cpu, off = 0;
	u32 size;

	if (map_flags != BPF_ANY && map_flags != BPF_EXIST)
		return -EINVAL;

	rcu_read_lock();
	storage = cgroup_storage_lookup(map, key, false);
	if (!storage) {
		rcu_read_unlock();
		return -ENOENT;
	}

	/* the user space will provide round_up(value_size, 8) bytes that
	 * will be copied into per-cpu area. bpf programs can only access
	 * value_size of it. During lookup the same extra bytes will be
	 * returned or zeros which were zero-filled by percpu_alloc,
	 * so no kernel data leaks possible
	 */
	size = round_up(_map->value_size, 8);
	for_each_possible_cpu(cpu) {
		bpf_long_memcpy(per_cpu_ptr(storage->percpu_buf, cpu),
				value + off, size);
		off += size;
	}
	rcu_read_unlock();
	return 0;
}
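
/* get_next_key callback for the bpf syscall: walks the elements in
 * map->list order under map->lock and writes the next key back in the
 * format (full key or bare cgroup id) the map was created with.
 */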
static int cgroup_storage_get_next_key(struct bpf_map *_map, void *key,
				       void *_next_key)
{
	struct bpf_cgroup_storage_map *map = map_to_storage(_map);
	struct bpf_cgroup_storage *storage;

	spin_lock_bh(&map->lock);

	if (list_empty(&map->list))
		goto enoent;

	if (key) {
		storage = cgroup_storage_lookup(map, key, true);
		if (!storage)
			goto enoent;

		storage = list_next_entry(storage, list_map);
		if (!storage)
			goto enoent;
	} else {
		storage = list_first_entry(&map->list,
					   struct bpf_cgroup_storage, list_map);
	}

	spin_unlock_bh(&map->lock);

	if (attach_type_isolated(&map->map)) {
		struct bpf_cgroup_storage_key *next = _next_key;

		*next = storage->key;
	} else {
		__u64 *next = _next_key;

		*next = storage->key.cgroup_inode_id;
	}
	return 0;

enoent:
	spin_unlock_bh(&map->lock);
	return -ENOENT;
}
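
/* Map creation: key_size must match one of the two supported key formats,
 * value_size is capped at PAGE_SIZE, and max_entries must be 0 because
 * elements are created implicitly when programs are attached to cgroups.
 */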
static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr)
{
	int numa_node = bpf_map_attr_numa_node(attr);
	struct bpf_cgroup_storage_map *map;
	struct bpf_map_memory mem;
	int ret;

	if (attr->key_size != sizeof(struct bpf_cgroup_storage_key) &&
	    attr->key_size != sizeof(__u64))
		return ERR_PTR(-EINVAL);

	if (attr->value_size == 0)
		return ERR_PTR(-EINVAL);

	if (attr->value_size > PAGE_SIZE)
		return ERR_PTR(-E2BIG);

	if (attr->map_flags & ~LOCAL_STORAGE_CREATE_FLAG_MASK ||
	    !bpf_map_flags_access_ok(attr->map_flags))
		return ERR_PTR(-EINVAL);

	if (attr->max_entries)
		/* max_entries is not used and enforced to be 0 */
		return ERR_PTR(-EINVAL);

	ret = bpf_map_charge_init(&mem, sizeof(struct bpf_cgroup_storage_map));
	if (ret < 0)
		return ERR_PTR(ret);

	map = kmalloc_node(sizeof(struct bpf_cgroup_storage_map),
			   __GFP_ZERO | GFP_USER, numa_node);
	if (!map) {
		bpf_map_charge_finish(&mem);
		return ERR_PTR(-ENOMEM);
	}

	bpf_map_charge_move(&map->map.memory, &mem);

	/* copy mandatory map attributes */
	bpf_map_init_from_attr(&map->map, attr);

	spin_lock_init(&map->lock);
	map->root = RB_ROOT;
	INIT_LIST_HEAD(&map->list);

	return &map->map;
}
static void cgroup_storage_map_free(struct bpf_map *_map)
{
	struct bpf_cgroup_storage_map *map = map_to_storage(_map);
	struct list_head *storages = &map->list;
	struct bpf_cgroup_storage *storage, *stmp;

	mutex_lock(&cgroup_mutex);

	list_for_each_entry_safe(storage, stmp, storages, list_map) {
		bpf_cgroup_storage_unlink(storage);
		bpf_cgroup_storage_free(storage);
	}

	mutex_unlock(&cgroup_mutex);

	WARN_ON(!RB_EMPTY_ROOT(&map->root));
	WARN_ON(!list_empty(&map->list));

	kfree(map);
}
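
/* Elements are bound to cgroup attachments, so they cannot be deleted
 * through the map API; they go away when the attachment does.
 */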
static int cgroup_storage_delete_elem(struct bpf_map *map, void *key)
{
	return -EINVAL;
}
static int cgroup_storage_check_btf(const struct bpf_map *map,
				    const struct btf *btf,
				    const struct btf_type *key_type,
				    const struct btf_type *value_type)
{
	if (attach_type_isolated(map)) {
		struct btf_member *m;
		u32 offset, size;

		/* Key is expected to be of struct bpf_cgroup_storage_key type,
		 * which is:
		 * struct bpf_cgroup_storage_key {
		 *	__u64	cgroup_inode_id;
		 *	__u32	attach_type;
		 * };
		 *
		 * Key_type must be a structure with two fields.
		 */
		if (BTF_INFO_KIND(key_type->info) != BTF_KIND_STRUCT ||
		    BTF_INFO_VLEN(key_type->info) != 2)
			return -EINVAL;

		/*
		 * The first field must be a 64 bit integer at 0 offset.
		 */
		m = (struct btf_member *)(key_type + 1);
		size = sizeof_field(struct bpf_cgroup_storage_key, cgroup_inode_id);
		if (!btf_member_is_reg_int(btf, key_type, m, 0, size))
			return -EINVAL;

		/*
		 * The second field must be a 32 bit integer at 64 bit offset.
		 */
		m++;
		offset = offsetof(struct bpf_cgroup_storage_key, attach_type);
		size = sizeof_field(struct bpf_cgroup_storage_key, attach_type);
		if (!btf_member_is_reg_int(btf, key_type, m, offset, size))
			return -EINVAL;
	} else {
		u32 int_data;

		/*
		 * Key is expected to be u64, which stores the cgroup_inode_id
		 */
		if (BTF_INFO_KIND(key_type->info) != BTF_KIND_INT)
			return -EINVAL;

		int_data = *(u32 *)(key_type + 1);
		if (BTF_INT_BITS(int_data) != 64 || BTF_INT_OFFSET(int_data))
			return -EINVAL;
	}

	return 0;
}
static void cgroup_storage_seq_show_elem(struct bpf_map *map, void *key,
					 struct seq_file *m)
{
	enum bpf_cgroup_storage_type stype = cgroup_storage_type(map);
	struct bpf_cgroup_storage *storage;
	int cpu;

	rcu_read_lock();
	storage = cgroup_storage_lookup(map_to_storage(map), key, false);
	if (!storage) {
		rcu_read_unlock();
		return;
	}

	btf_type_seq_show(map->btf, map->btf_key_type_id, key, m);
	stype = cgroup_storage_type(map);
	if (stype == BPF_CGROUP_STORAGE_SHARED) {
		seq_puts(m, ": ");
		btf_type_seq_show(map->btf, map->btf_value_type_id,
				  &READ_ONCE(storage->buf)->data[0], m);
		seq_puts(m, "\n");
	} else {
		seq_puts(m, ": {\n");
		for_each_possible_cpu(cpu) {
			seq_printf(m, "\tcpu%d: ", cpu);
			btf_type_seq_show(map->btf, map->btf_value_type_id,
					  per_cpu_ptr(storage->percpu_buf, cpu),
					  m);
			seq_puts(m, "\n");
		}
		seq_puts(m, "}\n");
	}
	rcu_read_unlock();
}
static int cgroup_storage_map_btf_id;
const struct bpf_map_ops cgroup_storage_map_ops = {
	.map_alloc = cgroup_storage_map_alloc,
	.map_free = cgroup_storage_map_free,
	.map_get_next_key = cgroup_storage_get_next_key,
	.map_lookup_elem = cgroup_storage_lookup_elem,
	.map_update_elem = cgroup_storage_update_elem,
	.map_delete_elem = cgroup_storage_delete_elem,
	.map_check_btf = cgroup_storage_check_btf,
	.map_seq_show_elem = cgroup_storage_seq_show_elem,
	.map_btf_name = "bpf_cgroup_storage_map",
	.map_btf_id = &cgroup_storage_map_btf_id,
};
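
/* A program may reference at most one cgroup storage map per storage type
 * (shared and per-cpu); remember the map in the program's aux data.
 */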
int bpf_cgroup_storage_assign(struct bpf_prog_aux *aux, struct bpf_map *_map)
{
	enum bpf_cgroup_storage_type stype = cgroup_storage_type(_map);

	if (aux->cgroup_storage[stype] &&
	    aux->cgroup_storage[stype] != _map)
		return -EBUSY;

	aux->cgroup_storage[stype] = _map;
	return 0;
}
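
/* Compute the allocation size of one storage element and report, via
 * @pages, how many pages to charge against the map's memlock limit.
 */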
static size_t bpf_cgroup_storage_calculate_size(struct bpf_map *map, u32 *pages)
{
	size_t size;

	if (cgroup_storage_type(map) == BPF_CGROUP_STORAGE_SHARED) {
		size = sizeof(struct bpf_storage_buffer) + map->value_size;
		*pages = round_up(sizeof(struct bpf_cgroup_storage) + size,
				  PAGE_SIZE) >> PAGE_SHIFT;
	} else {
		size = map->value_size;
		*pages = round_up(round_up(size, 8) * num_possible_cpus(),
				  PAGE_SIZE) >> PAGE_SHIFT;
	}

	return size;
}
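
/* Allocate a storage element for @prog's map of type @stype. The memory is
 * charged against the map's memlock limit and uncharged again on failure.
 */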
struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog,
					enum bpf_cgroup_storage_type stype)
{
	struct bpf_cgroup_storage *storage;
	struct bpf_map *map;
	gfp_t flags;
	size_t size;
	u32 pages;

	map = prog->aux->cgroup_storage[stype];
	if (!map)
		return NULL;

	size = bpf_cgroup_storage_calculate_size(map, &pages);

	if (bpf_map_charge_memlock(map, pages))
		return ERR_PTR(-EPERM);

	storage = kmalloc_node(sizeof(struct bpf_cgroup_storage),
			       __GFP_ZERO | GFP_USER, map->numa_node);
	if (!storage)
		goto enomem;

	flags = __GFP_ZERO | GFP_USER;

	if (stype == BPF_CGROUP_STORAGE_SHARED) {
		storage->buf = kmalloc_node(size, flags, map->numa_node);
		if (!storage->buf)
			goto enomem;
		check_and_init_map_lock(map, storage->buf->data);
	} else {
		storage->percpu_buf = __alloc_percpu_gfp(size, 8, flags);
		if (!storage->percpu_buf)
			goto enomem;
	}

	storage->map = (struct bpf_cgroup_storage_map *)map;

	return storage;

enomem:
	bpf_map_uncharge_memlock(map, pages);
	kfree(storage);
	return ERR_PTR(-ENOMEM);
}
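
/* Freeing is deferred through RCU so that bpf programs holding a pointer
 * obtained under rcu_read_lock() do not see the buffer vanish under them.
 */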
static void free_shared_cgroup_storage_rcu(struct rcu_head *rcu)
{
	struct bpf_cgroup_storage *storage =
		container_of(rcu, struct bpf_cgroup_storage, rcu);

	kfree(storage->buf);
	kfree(storage);
}
static void free_percpu_cgroup_storage_rcu(struct rcu_head *rcu)
{
	struct bpf_cgroup_storage *storage =
		container_of(rcu, struct bpf_cgroup_storage, rcu);

	free_percpu(storage->percpu_buf);
	kfree(storage);
}
void bpf_cgroup_storage_free(struct bpf_cgroup_storage *storage)
{
	enum bpf_cgroup_storage_type stype;
	struct bpf_map *map;
	u32 pages;

	if (!storage)
		return;

	map = &storage->map->map;

	bpf_cgroup_storage_calculate_size(map, &pages);
	bpf_map_uncharge_memlock(map, pages);

	stype = cgroup_storage_type(map);
	if (stype == BPF_CGROUP_STORAGE_SHARED)
		call_rcu(&storage->rcu, free_shared_cgroup_storage_rcu);
	else
		call_rcu(&storage->rcu, free_percpu_cgroup_storage_rcu);
}
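
/* Link/unlink tie a storage element to a cgroup and its map: the element is
 * keyed by (cgroup id, attach type), inserted into the map's rb-tree and
 * added to both the map's and the cgroup's storage lists under map->lock.
 */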
void bpf_cgroup_storage_link(struct bpf_cgroup_storage *storage,
			     struct cgroup *cgroup,
			     enum bpf_attach_type type)
{
	struct bpf_cgroup_storage_map *map;

	if (!storage)
		return;

	storage->key.attach_type = type;
	storage->key.cgroup_inode_id = cgroup_id(cgroup);

	map = storage->map;

	spin_lock_bh(&map->lock);
	WARN_ON(cgroup_storage_insert(map, storage));
	list_add(&storage->list_map, &map->list);
	list_add(&storage->list_cg, &cgroup->bpf.storages);
	spin_unlock_bh(&map->lock);
}
void bpf_cgroup_storage_unlink(struct bpf_cgroup_storage *storage)
{
	struct bpf_cgroup_storage_map *map;
	struct rb_root *root;

	if (!storage)
		return;

	map = storage->map;

	spin_lock_bh(&map->lock);
	root = &map->root;
	rb_erase(&storage->node, root);

	list_del(&storage->list_map);
	list_del(&storage->list_cg);
	spin_unlock_bh(&map->lock);
}

#endif