/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
 * Copyright (c) 2016,2017 Facebook
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 */
#include <linux/bpf.h>
#include <linux/err.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/filter.h>
#include <linux/perf_event.h>

#include "map_in_map.h"

#define ARRAY_CREATE_FLAG_MASK \
	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)

static void bpf_array_free_percpu(struct bpf_array *array)
{
	int i;

	for (i = 0; i < array->map.max_entries; i++)
		free_percpu(array->pptrs[i]);
}

static int bpf_array_alloc_percpu(struct bpf_array *array)
{
	void __percpu *ptr;
	int i;

	for (i = 0; i < array->map.max_entries; i++) {
		ptr = __alloc_percpu_gfp(array->elem_size, 8,
					 GFP_USER | __GFP_NOWARN);
		if (!ptr) {
			bpf_array_free_percpu(array);
			return -ENOMEM;
		}
		array->pptrs[i] = ptr;
	}

	return 0;
}

/* Called from syscall */
static struct bpf_map *array_map_alloc(union bpf_attr *attr)
{
	bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
	int numa_node = bpf_map_attr_numa_node(attr);
	struct bpf_array *array;
	u64 array_size;
	u32 elem_size;

	/* check sanity of attributes */
	if (attr->max_entries == 0 || attr->key_size != 4 ||
	    attr->value_size == 0 ||
	    attr->map_flags & ~ARRAY_CREATE_FLAG_MASK ||
	    (percpu && numa_node != NUMA_NO_NODE))
		return ERR_PTR(-EINVAL);

	if (attr->value_size > KMALLOC_MAX_SIZE)
		/* if value_size is bigger, the user space won't be able to
		 * access the elements.
		 */
		return ERR_PTR(-E2BIG);

	elem_size = round_up(attr->value_size, 8);

	array_size = sizeof(*array);
	if (percpu)
		array_size += (u64) attr->max_entries * sizeof(void *);
	else
		array_size += (u64) attr->max_entries * elem_size;

	/* make sure there is no u32 overflow later in round_up() */
	if (array_size >= U32_MAX - PAGE_SIZE)
		return ERR_PTR(-ENOMEM);

	/* allocate all map elements and zero-initialize them */
	array = bpf_map_area_alloc(array_size, numa_node);
	if (!array)
		return ERR_PTR(-ENOMEM);

	/* copy mandatory map attributes */
	array->map.map_type = attr->map_type;
	array->map.key_size = attr->key_size;
	array->map.value_size = attr->value_size;
	array->map.max_entries = attr->max_entries;
	array->map.map_flags = attr->map_flags;
	array->map.numa_node = numa_node;
	array->elem_size = elem_size;

	if (!percpu)
		goto out;

	array_size += (u64) attr->max_entries * elem_size * num_possible_cpus();

	if (array_size >= U32_MAX - PAGE_SIZE ||
	    bpf_array_alloc_percpu(array)) {
		bpf_map_area_free(array);
		return ERR_PTR(-ENOMEM);
	}
out:
	array->map.pages = round_up(array_size, PAGE_SIZE) >> PAGE_SHIFT;

	return &array->map;
}
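
/* Illustrative note (added commentary, not from the original source): the
 * sizing above assumes the layout of struct bpf_array declared in
 * include/linux/bpf.h, roughly
 *
 *	struct bpf_array {
 *		struct bpf_map map;
 *		u32 elem_size;
 *		...
 *		union {
 *			char value[] __aligned(8);		// plain array values
 *			void *ptrs[] __aligned(8);		// fd-array entries
 *			void __percpu *pptrs[] __aligned(8);	// percpu pointers
 *		};
 *	};
 *
 * so a single bpf_map_area_alloc() covers the header plus either
 * max_entries rounded-up values or max_entries pointers.
 */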

/* Called from syscall or from eBPF program */
static void *array_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	u32 index = *(u32 *)key;

	if (unlikely(index >= array->map.max_entries))
		return NULL;

	return array->value + array->elem_size * index;
}

/* emit BPF instructions equivalent to C code of array_map_lookup_elem() */
static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
{
	struct bpf_insn *insn = insn_buf;
	u32 elem_size = round_up(map->value_size, 8);
	const int ret = BPF_REG_0;
	const int map_ptr = BPF_REG_1;
	const int index = BPF_REG_2;

	*insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value));
	*insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0);
	*insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 3);

	if (is_power_of_2(elem_size)) {
		*insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(elem_size));
	} else {
		*insn++ = BPF_ALU64_IMM(BPF_MUL, ret, elem_size);
	}
	*insn++ = BPF_ALU64_REG(BPF_ADD, ret, map_ptr);
	*insn++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1);
	*insn++ = BPF_MOV64_IMM(ret, 0);
	return insn - insn_buf;
}
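
/* Illustrative reading of the sequence emitted above (added commentary):
 * with r1 = map pointer and r2 = key pointer it computes
 *
 *	base  = r1 + offsetof(struct bpf_array, value);
 *	index = *(u32 *)r2;
 *	if (index >= map->max_entries)
 *		return NULL;
 *	return base + index * elem_size;
 *
 * i.e. the same logic as array_map_lookup_elem(), which the verifier can
 * patch in place of a bpf_map_lookup_elem() helper call.
 */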

/* Called from eBPF program */
static void *percpu_array_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	u32 index = *(u32 *)key;

	if (unlikely(index >= array->map.max_entries))
		return NULL;

	return this_cpu_ptr(array->pptrs[index]);
}

int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	u32 index = *(u32 *)key;
	void __percpu *pptr;
	int cpu, off = 0;
	u32 size;

	if (unlikely(index >= array->map.max_entries))
		return -ENOENT;

	/* per_cpu areas are zero-filled and bpf programs can only
	 * access 'value_size' of them, so copying rounded areas
	 * will not leak any kernel data
	 */
	size = round_up(map->value_size, 8);
	rcu_read_lock();
	pptr = array->pptrs[index];
	for_each_possible_cpu(cpu) {
		bpf_long_memcpy(value + off, per_cpu_ptr(pptr, cpu), size);
		off += size;
	}
	rcu_read_unlock();
	return 0;
}
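
/* Sketch of the user-space contract implemented above (hypothetical snippet,
 * not part of this file): a BPF_MAP_LOOKUP_ELEM on a BPF_MAP_TYPE_PERCPU_ARRAY
 * fills one round_up(value_size, 8) stride per possible CPU, e.g.
 *
 *	__u32 stride = (value_size + 7) & ~7u;
 *	void *buf = malloc((size_t)stride * nr_possible_cpus);
 *	// after the lookup syscall, CPU c's copy sits at buf + c * stride
 */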

/* Called from syscall */
static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	u32 index = key ? *(u32 *)key : U32_MAX;
	u32 *next = (u32 *)next_key;

	if (index >= array->map.max_entries) {
		*next = 0;
		return 0;
	}

	if (index == array->map.max_entries - 1)
		return -ENOENT;

	*next = index + 1;
	return 0;
}

/* Called from syscall or from eBPF program */
static int array_map_update_elem(struct bpf_map *map, void *key, void *value,
				 u64 map_flags)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	u32 index = *(u32 *)key;

	if (unlikely(map_flags > BPF_EXIST))
		/* unknown flags */
		return -EINVAL;

	if (unlikely(index >= array->map.max_entries))
		/* all elements were pre-allocated, cannot insert a new one */
		return -E2BIG;

	if (unlikely(map_flags == BPF_NOEXIST))
		/* all elements already exist */
		return -EEXIST;

	if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
		memcpy(this_cpu_ptr(array->pptrs[index]),
		       value, map->value_size);
	else
		memcpy(array->value + array->elem_size * index,
		       value, map->value_size);
	return 0;
}

int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value,
			    u64 map_flags)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	u32 index = *(u32 *)key;
	void __percpu *pptr;
	int cpu, off = 0;
	u32 size;

	if (unlikely(map_flags > BPF_EXIST))
		/* unknown flags */
		return -EINVAL;

	if (unlikely(index >= array->map.max_entries))
		/* all elements were pre-allocated, cannot insert a new one */
		return -E2BIG;

	if (unlikely(map_flags == BPF_NOEXIST))
		/* all elements already exist */
		return -EEXIST;

	/* the user space will provide round_up(value_size, 8) bytes that
	 * will be copied into per-cpu area. bpf programs can only access
	 * value_size of it. During lookup the same extra bytes will be
	 * returned or zeros which were zero-filled by percpu_alloc,
	 * so no kernel data leaks possible
	 */
	size = round_up(map->value_size, 8);
	rcu_read_lock();
	pptr = array->pptrs[index];
	for_each_possible_cpu(cpu) {
		bpf_long_memcpy(per_cpu_ptr(pptr, cpu), value + off, size);
		off += size;
	}
	rcu_read_unlock();
	return 0;
}

/* Called from syscall or from eBPF program */
static int array_map_delete_elem(struct bpf_map *map, void *key)
{
	return -EINVAL;
}

/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
static void array_map_free(struct bpf_map *map)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);

	/* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
	 * so the programs (can be more than one that used this map) were
	 * disconnected from events. Wait for outstanding programs to complete
	 * and free the array
	 */
	synchronize_rcu();

	if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
		bpf_array_free_percpu(array);

	bpf_map_area_free(array);
}

const struct bpf_map_ops array_map_ops = {
	.map_alloc = array_map_alloc,
	.map_free = array_map_free,
	.map_get_next_key = array_map_get_next_key,
	.map_lookup_elem = array_map_lookup_elem,
	.map_update_elem = array_map_update_elem,
	.map_delete_elem = array_map_delete_elem,
	.map_gen_lookup = array_map_gen_lookup,
};

const struct bpf_map_ops percpu_array_map_ops = {
	.map_alloc = array_map_alloc,
	.map_free = array_map_free,
	.map_get_next_key = array_map_get_next_key,
	.map_lookup_elem = percpu_array_map_lookup_elem,
	.map_update_elem = array_map_update_elem,
	.map_delete_elem = array_map_delete_elem,
};

static struct bpf_map *fd_array_map_alloc(union bpf_attr *attr)
{
	/* only file descriptors can be stored in this type of map */
	if (attr->value_size != sizeof(u32))
		return ERR_PTR(-EINVAL);
	return array_map_alloc(attr);
}

static void fd_array_map_free(struct bpf_map *map)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	int i;

	synchronize_rcu();

	/* make sure it's empty */
	for (i = 0; i < array->map.max_entries; i++)
		BUG_ON(array->ptrs[i] != NULL);

	bpf_map_area_free(array);
}

static void *fd_array_map_lookup_elem(struct bpf_map *map, void *key)
{
	return NULL;
}

/* only called from syscall */
int bpf_fd_array_map_lookup_elem(struct bpf_map *map, void *key, u32 *value)
{
	void **elem, *ptr;
	int ret = 0;

	if (!map->ops->map_fd_sys_lookup_elem)
		return -ENOTSUPP;

	rcu_read_lock();
	elem = array_map_lookup_elem(map, key);
	if (elem && (ptr = READ_ONCE(*elem)))
		*value = map->ops->map_fd_sys_lookup_elem(ptr);
	else
		ret = -ENOENT;
	rcu_read_unlock();

	return ret;
}

/* only called from syscall */
int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file,
				 void *key, void *value, u64 map_flags)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	void *new_ptr, *old_ptr;
	u32 index = *(u32 *)key, ufd;

	if (map_flags != BPF_ANY)
		return -EINVAL;

	if (index >= array->map.max_entries)
		return -E2BIG;

	ufd = *(u32 *)value;
	new_ptr = map->ops->map_fd_get_ptr(map, map_file, ufd);
	if (IS_ERR(new_ptr))
		return PTR_ERR(new_ptr);

	old_ptr = xchg(array->ptrs + index, new_ptr);
	if (old_ptr)
		map->ops->map_fd_put_ptr(old_ptr);

	return 0;
}

static int fd_array_map_delete_elem(struct bpf_map *map, void *key)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	void *old_ptr;
	u32 index = *(u32 *)key;

	if (index >= array->map.max_entries)
		return -E2BIG;

	old_ptr = xchg(array->ptrs + index, NULL);
	if (old_ptr) {
		map->ops->map_fd_put_ptr(old_ptr);
		return 0;
	} else {
		return -ENOENT;
	}
}

static void *prog_fd_array_get_ptr(struct bpf_map *map,
				   struct file *map_file, int fd)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	struct bpf_prog *prog = bpf_prog_get(fd);

	if (IS_ERR(prog))
		return prog;

	if (!bpf_prog_array_compatible(array, prog)) {
		bpf_prog_put(prog);
		return ERR_PTR(-EINVAL);
	}

	return prog;
}

static void prog_fd_array_put_ptr(void *ptr)
{
	bpf_prog_put(ptr);
}

static u32 prog_fd_array_sys_lookup_elem(void *ptr)
{
	return ((struct bpf_prog *)ptr)->aux->id;
}

/* decrement refcnt of all bpf_progs that are stored in this map */
void bpf_fd_array_map_clear(struct bpf_map *map)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	int i;

	for (i = 0; i < array->map.max_entries; i++)
		fd_array_map_delete_elem(map, &i);
}
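
/* Note (added commentary): clearing each slot goes through
 * fd_array_map_delete_elem() -> map_fd_put_ptr(), which for prog arrays is
 * prog_fd_array_put_ptr() -> bpf_prog_put(), dropping the reference taken
 * when the program fd was stored. Prog arrays are the maps indexed by the
 * bpf_tail_call() helper from eBPF programs.
 */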

const struct bpf_map_ops prog_array_map_ops = {
	.map_alloc = fd_array_map_alloc,
	.map_free = fd_array_map_free,
	.map_get_next_key = array_map_get_next_key,
	.map_lookup_elem = fd_array_map_lookup_elem,
	.map_delete_elem = fd_array_map_delete_elem,
	.map_fd_get_ptr = prog_fd_array_get_ptr,
	.map_fd_put_ptr = prog_fd_array_put_ptr,
	.map_fd_sys_lookup_elem = prog_fd_array_sys_lookup_elem,
};

static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file,
						   struct file *map_file)
{
	struct bpf_event_entry *ee;

	ee = kzalloc(sizeof(*ee), GFP_ATOMIC);
	if (ee) {
		ee->event = perf_file->private_data;
		ee->perf_file = perf_file;
		ee->map_file = map_file;
	}

	return ee;
}

static void __bpf_event_entry_free(struct rcu_head *rcu)
{
	struct bpf_event_entry *ee;

	ee = container_of(rcu, struct bpf_event_entry, rcu);
	fput(ee->perf_file);
	kfree(ee);
}

static void bpf_event_entry_free_rcu(struct bpf_event_entry *ee)
{
	call_rcu(&ee->rcu, __bpf_event_entry_free);
}

static void *perf_event_fd_array_get_ptr(struct bpf_map *map,
					 struct file *map_file, int fd)
{
	struct bpf_event_entry *ee;
	struct perf_event *event;
	struct file *perf_file;
	u64 value;

	perf_file = perf_event_get(fd);
	if (IS_ERR(perf_file))
		return perf_file;

	ee = ERR_PTR(-EOPNOTSUPP);
	event = perf_file->private_data;
	if (perf_event_read_local(event, &value, NULL, NULL) == -EOPNOTSUPP)
		goto err_out;

	ee = bpf_event_entry_gen(perf_file, map_file);
	if (ee)
		return ee;
	ee = ERR_PTR(-ENOMEM);
err_out:
	fput(perf_file);
	return ee;
}
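
/* Note (added commentary): the perf_event_read_local() probe above only
 * rejects events for which a local read is not supported (-EOPNOTSUPP);
 * any other return value still allows the event to be stored in the map.
 */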

static void perf_event_fd_array_put_ptr(void *ptr)
{
	bpf_event_entry_free_rcu(ptr);
}

static void perf_event_fd_array_release(struct bpf_map *map,
					struct file *map_file)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	struct bpf_event_entry *ee;
	int i;

	rcu_read_lock();
	for (i = 0; i < array->map.max_entries; i++) {
		ee = READ_ONCE(array->ptrs[i]);
		if (ee && ee->map_file == map_file)
			fd_array_map_delete_elem(map, &i);
	}
	rcu_read_unlock();
}

const struct bpf_map_ops perf_event_array_map_ops = {
	.map_alloc = fd_array_map_alloc,
	.map_free = fd_array_map_free,
	.map_get_next_key = array_map_get_next_key,
	.map_lookup_elem = fd_array_map_lookup_elem,
	.map_delete_elem = fd_array_map_delete_elem,
	.map_fd_get_ptr = perf_event_fd_array_get_ptr,
	.map_fd_put_ptr = perf_event_fd_array_put_ptr,
	.map_release = perf_event_fd_array_release,
};

#ifdef CONFIG_CGROUPS
static void *cgroup_fd_array_get_ptr(struct bpf_map *map,
				     struct file *map_file /* not used */,
				     int fd)
{
	return cgroup_get_from_fd(fd);
}

static void cgroup_fd_array_put_ptr(void *ptr)
{
	/* cgroup_put free cgrp after a rcu grace period */
	cgroup_put(ptr);
}

static void cgroup_fd_array_free(struct bpf_map *map)
{
	bpf_fd_array_map_clear(map);
	fd_array_map_free(map);
}

const struct bpf_map_ops cgroup_array_map_ops = {
	.map_alloc = fd_array_map_alloc,
	.map_free = cgroup_fd_array_free,
	.map_get_next_key = array_map_get_next_key,
	.map_lookup_elem = fd_array_map_lookup_elem,
	.map_delete_elem = fd_array_map_delete_elem,
	.map_fd_get_ptr = cgroup_fd_array_get_ptr,
	.map_fd_put_ptr = cgroup_fd_array_put_ptr,
};
#endif

static struct bpf_map *array_of_map_alloc(union bpf_attr *attr)
{
	struct bpf_map *map, *inner_map_meta;

	inner_map_meta = bpf_map_meta_alloc(attr->inner_map_fd);
	if (IS_ERR(inner_map_meta))
		return inner_map_meta;

	map = fd_array_map_alloc(attr);
	if (IS_ERR(map)) {
		bpf_map_meta_free(inner_map_meta);
		return map;
	}

	map->inner_map_meta = inner_map_meta;

	return map;
}

static void array_of_map_free(struct bpf_map *map)
{
	/* map->inner_map_meta is only accessed by syscall which
	 * is protected by fdget/fdput.
	 */
	bpf_map_meta_free(map->inner_map_meta);
	bpf_fd_array_map_clear(map);
	fd_array_map_free(map);
}

static void *array_of_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct bpf_map **inner_map = array_map_lookup_elem(map, key);

	if (!inner_map)
		return NULL;

	return READ_ONCE(*inner_map);
}

static u32 array_of_map_gen_lookup(struct bpf_map *map,
				   struct bpf_insn *insn_buf)
{
	u32 elem_size = round_up(map->value_size, 8);
	struct bpf_insn *insn = insn_buf;
	const int ret = BPF_REG_0;
	const int map_ptr = BPF_REG_1;
	const int index = BPF_REG_2;

	*insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value));
	*insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0);
	*insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 5);
	if (is_power_of_2(elem_size))
		*insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(elem_size));
	else
		*insn++ = BPF_ALU64_IMM(BPF_MUL, ret, elem_size);
	*insn++ = BPF_ALU64_REG(BPF_ADD, ret, map_ptr);
	*insn++ = BPF_LDX_MEM(BPF_DW, ret, ret, 0);
	*insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 1);
	*insn++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1);
	*insn++ = BPF_MOV64_IMM(ret, 0);

	return insn - insn_buf;
}
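
/* Note (added commentary): compared with array_map_gen_lookup() above, this
 * sequence adds a BPF_LDX_MEM(BPF_DW, ...) to load the stored inner-map
 * pointer and a JEQ-against-zero check on it, which is why the bounds-check
 * jump skips 5 instructions here instead of 3.
 */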

const struct bpf_map_ops array_of_maps_map_ops = {
	.map_alloc = array_of_map_alloc,
	.map_free = array_of_map_free,
	.map_get_next_key = array_map_get_next_key,
	.map_lookup_elem = array_of_map_lookup_elem,
	.map_delete_elem = fd_array_map_delete_elem,
	.map_fd_get_ptr = bpf_map_fd_get_ptr,
	.map_fd_put_ptr = bpf_map_fd_put_ptr,
	.map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem,
	.map_gen_lookup = array_of_map_gen_lookup,
};