/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
 * Copyright (c) 2016,2017 Facebook
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 */
#include <linux/bpf.h>
#include <linux/err.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/filter.h>
#include <linux/perf_event.h>

#include "map_in_map.h"

#define ARRAY_CREATE_FLAG_MASK \
	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)

static void bpf_array_free_percpu(struct bpf_array *array)
{
	int i;

	for (i = 0; i < array->map.max_entries; i++) {
		free_percpu(array->pptrs[i]);
		cond_resched();
	}
}

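/* Allocate one zero-filled per-cpu region per array slot; on failure, undo
 * the partial allocation so the caller only has to free the array itself.
 */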
static int bpf_array_alloc_percpu(struct bpf_array *array)
{
	void __percpu *ptr;
	int i;

	for (i = 0; i < array->map.max_entries; i++) {
		ptr = __alloc_percpu_gfp(array->elem_size, 8,
					 GFP_USER | __GFP_NOWARN);
		if (!ptr) {
			bpf_array_free_percpu(array);
			return -ENOMEM;
		}
		array->pptrs[i] = ptr;
		cond_resched();
	}

	return 0;
}

/* Called from syscall */
static int array_map_alloc_check(union bpf_attr *attr)
{
	bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
	int numa_node = bpf_map_attr_numa_node(attr);

	/* check sanity of attributes */
	if (attr->max_entries == 0 || attr->key_size != 4 ||
	    attr->value_size == 0 ||
	    attr->map_flags & ~ARRAY_CREATE_FLAG_MASK ||
	    (percpu && numa_node != NUMA_NO_NODE))
		return -EINVAL;

	if (attr->value_size > KMALLOC_MAX_SIZE)
		/* if value_size is bigger, the user space won't be able to
		 * access the elements.
		 */
		return -E2BIG;

	return 0;
}

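/* Size, charge and allocate the flat value array (or the per-cpu pointer
 * table), rounding max_entries up to a power of two for unprivileged users
 * so that lookups can be masked with index_mask.
 */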
static struct bpf_map *array_map_alloc(union bpf_attr *attr)
{
	bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
	int ret, numa_node = bpf_map_attr_numa_node(attr);
	u32 elem_size, index_mask, max_entries;
	bool unpriv = !capable(CAP_SYS_ADMIN);
	u64 cost, array_size, mask64;
	struct bpf_array *array;

	elem_size = round_up(attr->value_size, 8);

	max_entries = attr->max_entries;

	/* On 32 bit archs roundup_pow_of_two() with max_entries that has
	 * upper most bit set in u32 space is undefined behavior due to
	 * resulting 1U << 32, so do it manually here in u64 space.
	 */
	mask64 = fls_long(max_entries - 1);
	mask64 = 1ULL << mask64;
	mask64 -= 1;

	index_mask = mask64;
	if (unpriv) {
		/* round up array size to nearest power of 2,
		 * since cpu will speculate within index_mask limits
		 */
		max_entries = index_mask + 1;
		/* Check for overflows. */
		if (max_entries < attr->max_entries)
			return ERR_PTR(-E2BIG);
	}

	array_size = sizeof(*array);
	if (percpu)
		array_size += (u64) max_entries * sizeof(void *);
	else
		array_size += (u64) max_entries * elem_size;

	/* make sure there is no u32 overflow later in round_up() */
	cost = array_size;
	if (cost >= U32_MAX - PAGE_SIZE)
		return ERR_PTR(-ENOMEM);
	if (percpu) {
		cost += (u64)attr->max_entries * elem_size * num_possible_cpus();
		if (cost >= U32_MAX - PAGE_SIZE)
			return ERR_PTR(-ENOMEM);
	}
	cost = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;

	ret = bpf_map_precharge_memlock(cost);
	if (ret < 0)
		return ERR_PTR(ret);

	/* allocate all map elements and zero-initialize them */
	array = bpf_map_area_alloc(array_size, numa_node);
	if (!array)
		return ERR_PTR(-ENOMEM);
	array->index_mask = index_mask;
	array->map.unpriv_array = unpriv;

	/* copy mandatory map attributes */
	bpf_map_init_from_attr(&array->map, attr);
	array->map.pages = cost;
	array->elem_size = elem_size;

	if (percpu && bpf_array_alloc_percpu(array)) {
		bpf_map_area_free(array);
		return ERR_PTR(-ENOMEM);
	}

	return &array->map;
}

/* Called from syscall or from eBPF program */
static void *array_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	u32 index = *(u32 *)key;

	if (unlikely(index >= array->map.max_entries))
		return NULL;

	return array->value + array->elem_size * (index & array->index_mask);
}

/* emit BPF instructions equivalent to C code of array_map_lookup_elem() */
static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	struct bpf_insn *insn = insn_buf;
	u32 elem_size = round_up(map->value_size, 8);
	const int ret = BPF_REG_0;
	const int map_ptr = BPF_REG_1;
	const int index = BPF_REG_2;

	*insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value));
	*insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0);
	if (map->unpriv_array) {
		*insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 4);
		*insn++ = BPF_ALU32_IMM(BPF_AND, ret, array->index_mask);
	} else {
		*insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 3);
	}

	if (is_power_of_2(elem_size)) {
		*insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(elem_size));
	} else {
		*insn++ = BPF_ALU64_IMM(BPF_MUL, ret, elem_size);
	}
	*insn++ = BPF_ALU64_REG(BPF_ADD, ret, map_ptr);
	*insn++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1);
	*insn++ = BPF_MOV64_IMM(ret, 0);
	return insn - insn_buf;
}

/* Called from eBPF program */
static void *percpu_array_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	u32 index = *(u32 *)key;

	if (unlikely(index >= array->map.max_entries))
		return NULL;

	return this_cpu_ptr(array->pptrs[index & array->index_mask]);
}

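/* Called from syscall: copy the value for 'key' from every possible CPU into
 * one flat user buffer, round_up(value_size, 8) bytes per CPU.
 */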
int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	u32 index = *(u32 *)key;
	void __percpu *pptr;
	int cpu, off = 0;
	u32 size;

	if (unlikely(index >= array->map.max_entries))
		return -ENOENT;

	/* per_cpu areas are zero-filled and bpf programs can only
	 * access 'value_size' of them, so copying rounded areas
	 * will not leak any kernel data
	 */
	size = round_up(map->value_size, 8);
	rcu_read_lock();
	pptr = array->pptrs[index & array->index_mask];
	for_each_possible_cpu(cpu) {
		bpf_long_memcpy(value + off, per_cpu_ptr(pptr, cpu), size);
		off += size;
	}
	rcu_read_unlock();
	return 0;
}

/* Called from syscall */
static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	u32 index = key ? *(u32 *)key : U32_MAX;
	u32 *next = (u32 *)next_key;

	if (index >= array->map.max_entries) {
		*next = 0;
		return 0;
	}

	if (index == array->map.max_entries - 1)
		return -ENOENT;

	*next = index + 1;
	return 0;
}

/* Called from syscall or from eBPF program */
static int array_map_update_elem(struct bpf_map *map, void *key, void *value,
				 u64 map_flags)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	u32 index = *(u32 *)key;

	if (unlikely(map_flags > BPF_EXIST))
		/* unknown flags */
		return -EINVAL;

	if (unlikely(index >= array->map.max_entries))
		/* all elements were pre-allocated, cannot insert a new one */
		return -E2BIG;

	if (unlikely(map_flags == BPF_NOEXIST))
		/* all elements already exist */
		return -EEXIST;

	if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
		memcpy(this_cpu_ptr(array->pptrs[index & array->index_mask]),
		       value, map->value_size);
	else
		memcpy(array->value +
		       array->elem_size * (index & array->index_mask),
		       value, map->value_size);
	return 0;
}

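/* Called from syscall: write the user-supplied value into the per-cpu region
 * of every possible CPU for the given slot.
 */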
int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value,
			    u64 map_flags)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	u32 index = *(u32 *)key;
	void __percpu *pptr;
	int cpu, off = 0;
	u32 size;

	if (unlikely(map_flags > BPF_EXIST))
		/* unknown flags */
		return -EINVAL;

	if (unlikely(index >= array->map.max_entries))
		/* all elements were pre-allocated, cannot insert a new one */
		return -E2BIG;

	if (unlikely(map_flags == BPF_NOEXIST))
		/* all elements already exist */
		return -EEXIST;

	/* the user space will provide round_up(value_size, 8) bytes that
	 * will be copied into per-cpu area. bpf programs can only access
	 * value_size of it. During lookup the same extra bytes will be
	 * returned or zeros which were zero-filled by percpu_alloc,
	 * so no kernel data leaks possible
	 */
	size = round_up(map->value_size, 8);
	rcu_read_lock();
	pptr = array->pptrs[index & array->index_mask];
	for_each_possible_cpu(cpu) {
		bpf_long_memcpy(per_cpu_ptr(pptr, cpu), value + off, size);
		off += size;
	}
	rcu_read_unlock();
	return 0;
}

/* Called from syscall or from eBPF program */
static int array_map_delete_elem(struct bpf_map *map, void *key)
{
	return -EINVAL;
}

/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
static void array_map_free(struct bpf_map *map)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);

	/* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
	 * so the programs (can be more than one that used this map) were
	 * disconnected from events. Wait for outstanding programs to complete
	 * and free the array
	 */
	synchronize_rcu();

	if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
		bpf_array_free_percpu(array);

	bpf_map_area_free(array);
}

const struct bpf_map_ops array_map_ops = {
	.map_alloc_check = array_map_alloc_check,
	.map_alloc = array_map_alloc,
	.map_free = array_map_free,
	.map_get_next_key = array_map_get_next_key,
	.map_lookup_elem = array_map_lookup_elem,
	.map_update_elem = array_map_update_elem,
	.map_delete_elem = array_map_delete_elem,
	.map_gen_lookup = array_map_gen_lookup,
};

const struct bpf_map_ops percpu_array_map_ops = {
	.map_alloc_check = array_map_alloc_check,
	.map_alloc = array_map_alloc,
	.map_free = array_map_free,
	.map_get_next_key = array_map_get_next_key,
	.map_lookup_elem = percpu_array_map_lookup_elem,
	.map_update_elem = array_map_update_elem,
	.map_delete_elem = array_map_delete_elem,
};

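/* fd array maps (prog, perf-event, cgroup and map-in-map arrays) store
 * kernel object pointers resolved from user-supplied file descriptors
 * rather than plain values.
 */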
static int fd_array_map_alloc_check(union bpf_attr *attr)
{
	/* only file descriptors can be stored in this type of map */
	if (attr->value_size != sizeof(u32))
		return -EINVAL;
	return array_map_alloc_check(attr);
}

static void fd_array_map_free(struct bpf_map *map)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	int i;

	synchronize_rcu();

	/* make sure it's empty */
	for (i = 0; i < array->map.max_entries; i++)
		BUG_ON(array->ptrs[i] != NULL);

	bpf_map_area_free(array);
}

static void *fd_array_map_lookup_elem(struct bpf_map *map, void *key)
{
	return NULL;
}

/* only called from syscall */
int bpf_fd_array_map_lookup_elem(struct bpf_map *map, void *key, u32 *value)
{
	void **elem, *ptr;
	int ret = 0;

	if (!map->ops->map_fd_sys_lookup_elem)
		return -ENOTSUPP;

	rcu_read_lock();
	elem = array_map_lookup_elem(map, key);
	if (elem && (ptr = READ_ONCE(*elem)))
		*value = map->ops->map_fd_sys_lookup_elem(ptr);
	else
		ret = -ENOENT;
	rcu_read_unlock();

	return ret;
}

/* only called from syscall */
int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file,
				 void *key, void *value, u64 map_flags)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	void *new_ptr, *old_ptr;
	u32 index = *(u32 *)key, ufd;

	if (map_flags != BPF_ANY)
		return -EINVAL;

	if (index >= array->map.max_entries)
		return -E2BIG;

	ufd = *(u32 *)value;
	new_ptr = map->ops->map_fd_get_ptr(map, map_file, ufd);
	if (IS_ERR(new_ptr))
		return PTR_ERR(new_ptr);

	old_ptr = xchg(array->ptrs + index, new_ptr);
	if (old_ptr)
		map->ops->map_fd_put_ptr(old_ptr);

	return 0;
}

static int fd_array_map_delete_elem(struct bpf_map *map, void *key)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	void *old_ptr;
	u32 index = *(u32 *)key;

	if (index >= array->map.max_entries)
		return -E2BIG;

	old_ptr = xchg(array->ptrs + index, NULL);
	if (old_ptr) {
		map->ops->map_fd_put_ptr(old_ptr);
		return 0;
	} else {
		return -ENOENT;
	}
}

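/* Resolve a program fd to a bpf_prog reference and check that it is
 * compatible with the programs already stored in this prog array.
 */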
static void *prog_fd_array_get_ptr(struct bpf_map *map,
				   struct file *map_file, int fd)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	struct bpf_prog *prog = bpf_prog_get(fd);

	if (IS_ERR(prog))
		return prog;

	if (!bpf_prog_array_compatible(array, prog)) {
		bpf_prog_put(prog);
		return ERR_PTR(-EINVAL);
	}

	return prog;
}

static void prog_fd_array_put_ptr(void *ptr)
{
	bpf_prog_put(ptr);
}

static u32 prog_fd_array_sys_lookup_elem(void *ptr)
{
	return ((struct bpf_prog *)ptr)->aux->id;
}

/* decrement refcnt of all bpf_progs that are stored in this map */
void bpf_fd_array_map_clear(struct bpf_map *map)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	int i;

	for (i = 0; i < array->map.max_entries; i++)
		fd_array_map_delete_elem(map, &i);
}

const struct bpf_map_ops prog_array_map_ops = {
	.map_alloc_check = fd_array_map_alloc_check,
	.map_alloc = array_map_alloc,
	.map_free = fd_array_map_free,
	.map_get_next_key = array_map_get_next_key,
	.map_lookup_elem = fd_array_map_lookup_elem,
	.map_delete_elem = fd_array_map_delete_elem,
	.map_fd_get_ptr = prog_fd_array_get_ptr,
	.map_fd_put_ptr = prog_fd_array_put_ptr,
	.map_fd_sys_lookup_elem = prog_fd_array_sys_lookup_elem,
};

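/* Wrap a perf event file plus the map file that references it, so the entry
 * can be released via RCU when it is replaced or the map goes away.
 */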
static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file,
						   struct file *map_file)
{
	struct bpf_event_entry *ee;

	ee = kzalloc(sizeof(*ee), GFP_ATOMIC);
	if (ee) {
		ee->event = perf_file->private_data;
		ee->perf_file = perf_file;
		ee->map_file = map_file;
	}

	return ee;
}

static void __bpf_event_entry_free(struct rcu_head *rcu)
{
	struct bpf_event_entry *ee;

	ee = container_of(rcu, struct bpf_event_entry, rcu);
	fput(ee->perf_file);
	kfree(ee);
}

static void bpf_event_entry_free_rcu(struct bpf_event_entry *ee)
{
	call_rcu(&ee->rcu, __bpf_event_entry_free);
}

static void *perf_event_fd_array_get_ptr(struct bpf_map *map,
					 struct file *map_file, int fd)
{
	struct bpf_event_entry *ee;
	struct perf_event *event;
	struct file *perf_file;
	u64 value;

	perf_file = perf_event_get(fd);
	if (IS_ERR(perf_file))
		return perf_file;

	ee = ERR_PTR(-EOPNOTSUPP);
	event = perf_file->private_data;
	if (perf_event_read_local(event, &value, NULL, NULL) == -EOPNOTSUPP)
		goto err_out;

	ee = bpf_event_entry_gen(perf_file, map_file);
	if (ee)
		return ee;
	ee = ERR_PTR(-ENOMEM);
err_out:
	fput(perf_file);
	return ee;
}

static void perf_event_fd_array_put_ptr(void *ptr)
{
	bpf_event_entry_free_rcu(ptr);
}

static void perf_event_fd_array_release(struct bpf_map *map,
					struct file *map_file)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	struct bpf_event_entry *ee;
	int i;

	rcu_read_lock();
	for (i = 0; i < array->map.max_entries; i++) {
		ee = READ_ONCE(array->ptrs[i]);
		if (ee && ee->map_file == map_file)
			fd_array_map_delete_elem(map, &i);
	}
	rcu_read_unlock();
}

const struct bpf_map_ops perf_event_array_map_ops = {
	.map_alloc_check = fd_array_map_alloc_check,
	.map_alloc = array_map_alloc,
	.map_free = fd_array_map_free,
	.map_get_next_key = array_map_get_next_key,
	.map_lookup_elem = fd_array_map_lookup_elem,
	.map_delete_elem = fd_array_map_delete_elem,
	.map_fd_get_ptr = perf_event_fd_array_get_ptr,
	.map_fd_put_ptr = perf_event_fd_array_put_ptr,
	.map_release = perf_event_fd_array_release,
};

#ifdef CONFIG_CGROUPS
static void *cgroup_fd_array_get_ptr(struct bpf_map *map,
				     struct file *map_file /* not used */,
				     int fd)
{
	return cgroup_get_from_fd(fd);
}

static void cgroup_fd_array_put_ptr(void *ptr)
{
	/* cgroup_put free cgrp after a rcu grace period */
	cgroup_put(ptr);
}

static void cgroup_fd_array_free(struct bpf_map *map)
{
	bpf_fd_array_map_clear(map);
	fd_array_map_free(map);
}

const struct bpf_map_ops cgroup_array_map_ops = {
	.map_alloc_check = fd_array_map_alloc_check,
	.map_alloc = array_map_alloc,
	.map_free = cgroup_fd_array_free,
	.map_get_next_key = array_map_get_next_key,
	.map_lookup_elem = fd_array_map_lookup_elem,
	.map_delete_elem = fd_array_map_delete_elem,
	.map_fd_get_ptr = cgroup_fd_array_get_ptr,
	.map_fd_put_ptr = cgroup_fd_array_put_ptr,
};
#endif

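/* Array of maps: each element holds a pointer to an inner map whose
 * attributes must match the inner_map_fd given at creation time.
 */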
static struct bpf_map *array_of_map_alloc(union bpf_attr *attr)
{
	struct bpf_map *map, *inner_map_meta;

	inner_map_meta = bpf_map_meta_alloc(attr->inner_map_fd);
	if (IS_ERR(inner_map_meta))
		return inner_map_meta;

	map = array_map_alloc(attr);
	if (IS_ERR(map)) {
		bpf_map_meta_free(inner_map_meta);
		return map;
	}

	map->inner_map_meta = inner_map_meta;

	return map;
}

static void array_of_map_free(struct bpf_map *map)
{
	/* map->inner_map_meta is only accessed by syscall which
	 * is protected by fdget/fdput.
	 */
	bpf_map_meta_free(map->inner_map_meta);
	bpf_fd_array_map_clear(map);
	fd_array_map_free(map);
}

static void *array_of_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct bpf_map **inner_map = array_map_lookup_elem(map, key);

	if (!inner_map)
		return NULL;

	return READ_ONCE(*inner_map);
}

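/* Inline lookup for array-of-maps: same index computation as
 * array_map_gen_lookup(), plus a load of the inner map pointer and a
 * NULL check on the result.
 */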
static u32 array_of_map_gen_lookup(struct bpf_map *map,
				   struct bpf_insn *insn_buf)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	u32 elem_size = round_up(map->value_size, 8);
	struct bpf_insn *insn = insn_buf;
	const int ret = BPF_REG_0;
	const int map_ptr = BPF_REG_1;
	const int index = BPF_REG_2;

	*insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value));
	*insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0);
	if (map->unpriv_array) {
		*insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 6);
		*insn++ = BPF_ALU32_IMM(BPF_AND, ret, array->index_mask);
	} else {
		*insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 5);
	}
	if (is_power_of_2(elem_size))
		*insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(elem_size));
	else
		*insn++ = BPF_ALU64_IMM(BPF_MUL, ret, elem_size);
	*insn++ = BPF_ALU64_REG(BPF_ADD, ret, map_ptr);
	*insn++ = BPF_LDX_MEM(BPF_DW, ret, ret, 0);
	*insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 1);
	*insn++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1);
	*insn++ = BPF_MOV64_IMM(ret, 0);

	return insn - insn_buf;
}

const struct bpf_map_ops array_of_maps_map_ops = {
	.map_alloc_check = fd_array_map_alloc_check,
	.map_alloc = array_of_map_alloc,
	.map_free = array_of_map_free,
	.map_get_next_key = array_map_get_next_key,
	.map_lookup_elem = array_of_map_lookup_elem,
	.map_delete_elem = fd_array_map_delete_elem,
	.map_fd_get_ptr = bpf_map_fd_get_ptr,
	.map_fd_put_ptr = bpf_map_fd_put_ptr,
	.map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem,
	.map_gen_lookup = array_of_map_gen_lookup,
};