/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
 * Copyright (c) 2016 Facebook
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 */
#include <linux/bpf.h>
#include <linux/jhash.h>
#include <linux/filter.h>
#include <linux/vmalloc.h>
#include "percpu_freelist.h"

struct bucket {
	struct hlist_head head;
	raw_spinlock_t lock;
};

struct bpf_htab {
	struct bpf_map map;
	struct bucket *buckets;
	void *elems;
	struct pcpu_freelist freelist;
	atomic_t count;		/* number of elements in this hashtable */
	u32 n_buckets;		/* number of hash buckets */
	u32 elem_size;		/* size of each element in bytes */
};

/* each htab element is struct htab_elem + key + value */
struct htab_elem {
	union {
		struct hlist_node hash_node;
		struct bpf_htab *htab;
		struct pcpu_freelist_node fnode;
	};
	struct rcu_head rcu;
	u32 hash;
	char key[0] __aligned(8);
};
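
/* Element layout (a sketch inferred from the code below): the raw key bytes
 * start at 'key' and are padded to 8 bytes.  For a regular hash map the value
 * bytes follow that padding directly; for BPF_MAP_TYPE_PERCPU_HASH the slot
 * after the key instead holds a void __percpu * pointing at the per-cpu value
 * area, which the two helpers below store and fetch.
 */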

static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size,
				     void __percpu *pptr)
{
	*(void __percpu **)(l->key + key_size) = pptr;
}

static inline void __percpu *htab_elem_get_ptr(struct htab_elem *l, u32 key_size)
{
	return *(void __percpu **)(l->key + key_size);
}

static struct htab_elem *get_htab_elem(struct bpf_htab *htab, int i)
{
	return (struct htab_elem *) (htab->elems + i * htab->elem_size);
}

static void htab_free_elems(struct bpf_htab *htab)
{
	int i;

	if (htab->map.map_type != BPF_MAP_TYPE_PERCPU_HASH)
		goto free_elems;

	for (i = 0; i < htab->map.max_entries; i++) {
		void __percpu *pptr;

		pptr = htab_elem_get_ptr(get_htab_elem(htab, i),
					 htab->map.key_size);
		free_percpu(pptr);
	}

free_elems:
	vfree(htab->elems);
}
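
/* All elements are allocated up front in one vzalloc'ed array and handed out
 * through a per-cpu freelist, so that later update/delete calls only move
 * elements between the hash table and the freelist instead of calling the
 * allocator.  The intent, as far as the code shows, is to keep updates usable
 * from contexts where kmalloc is not safe; BPF_F_NO_PREALLOC opts out of this
 * scheme.
 */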
static int prealloc_elems_and_freelist(struct bpf_htab *htab)
{
	int err = -ENOMEM, i;

	htab->elems = vzalloc(htab->elem_size * htab->map.max_entries);
	if (!htab->elems)
		return -ENOMEM;

	if (htab->map.map_type != BPF_MAP_TYPE_PERCPU_HASH)
		goto skip_percpu_elems;

	for (i = 0; i < htab->map.max_entries; i++) {
		u32 size = round_up(htab->map.value_size, 8);
		void __percpu *pptr;

		pptr = __alloc_percpu_gfp(size, 8, GFP_USER | __GFP_NOWARN);
		if (!pptr)
			goto free_elems;
		htab_elem_set_ptr(get_htab_elem(htab, i), htab->map.key_size,
				  pptr);
	}

skip_percpu_elems:
	err = pcpu_freelist_init(&htab->freelist);
	if (err)
		goto free_elems;

	pcpu_freelist_populate(&htab->freelist, htab->elems, htab->elem_size,
			       htab->map.max_entries);
	return 0;

free_elems:
	htab_free_elems(htab);
	return err;
}

/* Called from syscall */
static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
{
	bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_HASH;
	struct bpf_htab *htab;
	int err, i;
	u64 cost;

	if (attr->map_flags & ~BPF_F_NO_PREALLOC)
		/* reserved bits should not be used */
		return ERR_PTR(-EINVAL);

	htab = kzalloc(sizeof(*htab), GFP_USER);
	if (!htab)
		return ERR_PTR(-ENOMEM);

	/* mandatory map attributes */
	htab->map.map_type = attr->map_type;
	htab->map.key_size = attr->key_size;
	htab->map.value_size = attr->value_size;
	htab->map.max_entries = attr->max_entries;
	htab->map.map_flags = attr->map_flags;

	/* check sanity of attributes.
	 * value_size == 0 may be allowed in the future to use map as a set
	 */
	err = -EINVAL;
	if (htab->map.max_entries == 0 || htab->map.key_size == 0 ||
	    htab->map.value_size == 0)
		goto free_htab;

	/* hash table size must be power of 2 */
	htab->n_buckets = roundup_pow_of_two(htab->map.max_entries);

	err = -E2BIG;
	if (htab->map.key_size > MAX_BPF_STACK)
		/* eBPF programs initialize keys on stack, so they cannot be
		 * larger than max stack size
		 */
		goto free_htab;

	if (htab->map.value_size >= (1 << (KMALLOC_SHIFT_MAX - 1)) -
	    MAX_BPF_STACK - sizeof(struct htab_elem))
		/* if value_size is bigger, the user space won't be able to
		 * access the elements via bpf syscall. This check also makes
		 * sure that the elem_size doesn't overflow and it's
		 * kmalloc-able later in htab_map_update_elem()
		 */
		goto free_htab;

	if (percpu && round_up(htab->map.value_size, 8) > PCPU_MIN_UNIT_SIZE)
		/* make sure the size for pcpu_alloc() is reasonable */
		goto free_htab;

	htab->elem_size = sizeof(struct htab_elem) +
			  round_up(htab->map.key_size, 8);
	if (percpu)
		htab->elem_size += sizeof(void *);
	else
		htab->elem_size += round_up(htab->map.value_size, 8);
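
	/* For example (illustrative numbers, not from this file): with
	 * key_size = 4 and value_size = 6, a regular hash map element takes
	 * sizeof(struct htab_elem) + round_up(4, 8) + round_up(6, 8) =
	 * sizeof(struct htab_elem) + 16 bytes, while a per-cpu map keeps only
	 * a pointer here and stores the round_up(6, 8) value bytes per CPU in
	 * the separately allocated per-cpu area.
	 */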

	/* prevent zero size kmalloc and check for u32 overflow */
	if (htab->n_buckets == 0 ||
	    htab->n_buckets > U32_MAX / sizeof(struct bucket))
		goto free_htab;

	cost = (u64) htab->n_buckets * sizeof(struct bucket) +
	       (u64) htab->elem_size * htab->map.max_entries;

	if (percpu)
		cost += (u64) round_up(htab->map.value_size, 8) *
			num_possible_cpus() * htab->map.max_entries;

	if (cost >= U32_MAX - PAGE_SIZE)
		/* make sure page count doesn't overflow */
		goto free_htab;

	htab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;

	/* if map size is larger than memlock limit, reject it early */
	err = bpf_map_precharge_memlock(htab->map.pages);
	if (err)
		goto free_htab;
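
	/* The bucket array is tried with kmalloc first; __GFP_NOWARN keeps
	 * large requests quiet and the code falls back to vmalloc, which is
	 * why the common error path below uses kvfree().
	 */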
	err = -ENOMEM;
	htab->buckets = kmalloc_array(htab->n_buckets, sizeof(struct bucket),
				      GFP_USER | __GFP_NOWARN);

	if (!htab->buckets) {
		htab->buckets = vmalloc(htab->n_buckets * sizeof(struct bucket));
		if (!htab->buckets)
			goto free_htab;
	}

	for (i = 0; i < htab->n_buckets; i++) {
		INIT_HLIST_HEAD(&htab->buckets[i].head);
		raw_spin_lock_init(&htab->buckets[i].lock);
	}

	if (!(attr->map_flags & BPF_F_NO_PREALLOC)) {
		err = prealloc_elems_and_freelist(htab);
		if (err)
			goto free_buckets;
	}

	return &htab->map;

free_buckets:
	kvfree(htab->buckets);
free_htab:
	kfree(htab);
	return ERR_PTR(err);
}
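
/* n_buckets is always a power of two, so masking the jhash result with
 * (n_buckets - 1) below selects a bucket without a modulo operation.
 */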
static inline u32 htab_map_hash(const void *key, u32 key_len)
{
	return jhash(key, key_len, 0);
}

static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash)
{
	return &htab->buckets[hash & (htab->n_buckets - 1)];
}

static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash)
{
	return &__select_bucket(htab, hash)->head;
}
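
/* Lookups walk the bucket's hlist under RCU only; writers serialize on the
 * per-bucket raw spinlock, so readers never block on writers.
 */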
static struct htab_elem *lookup_elem_raw(struct hlist_head *head, u32 hash,
					 void *key, u32 key_size)
{
	struct htab_elem *l;

	hlist_for_each_entry_rcu(l, head, hash_node)
		if (l->hash == hash && !memcmp(&l->key, key, key_size))
			return l;

	return NULL;
}

/* Called from syscall or from eBPF program */
static void *__htab_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct hlist_head *head;
	struct htab_elem *l;
	u32 hash, key_size;

	/* Must be called with rcu_read_lock. */
	WARN_ON_ONCE(!rcu_read_lock_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size);

	head = select_bucket(htab, hash);

	l = lookup_elem_raw(head, hash, key, key_size);

	return l;
}
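
/* The map_lookup_elem operation below hands back a pointer to the value bytes,
 * i.e. l->key plus the 8-byte aligned key size, which is what programs see
 * from bpf_map_lookup_elem().
 */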
static void *htab_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct htab_elem *l = __htab_map_lookup_elem(map, key);

	if (l)
		return l->key + round_up(map->key_size, 8);

	return NULL;
}

/* Called from syscall */
static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct hlist_head *head;
	struct htab_elem *l, *next_l;
	u32 hash, key_size;
	int i;

	WARN_ON_ONCE(!rcu_read_lock_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size);

	head = select_bucket(htab, hash);

	/* lookup the key */
	l = lookup_elem_raw(head, hash, key, key_size);

	if (!l) {
		i = 0;
		goto find_first_elem;
	}

	/* key was found, get next key in the same bucket */
	next_l = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(&l->hash_node)),
				  struct htab_elem, hash_node);

	if (next_l) {
		/* if next elem in this hash list is non-zero, just return it */
		memcpy(next_key, next_l->key, key_size);
		return 0;
	}

	/* no more elements in this hash list, go to the next bucket */
	i = hash & (htab->n_buckets - 1);
	i++;

find_first_elem:
	/* iterate over buckets */
	for (; i < htab->n_buckets; i++) {
		head = select_bucket(htab, i);

		/* pick first element in the bucket */
		next_l = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),
					  struct htab_elem, hash_node);
		if (next_l) {
			/* if it's not empty, just return it */
			memcpy(next_key, next_l->key, key_size);
			return 0;
		}
	}

	/* iterated over all buckets and all elements */
	return -ENOENT;
}
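
/* Freeing has two flavours: htab_elem_free() releases the per-cpu value area
 * (percpu maps only) and the element itself, while free_htab_elem() decides
 * whether the element goes back on the freelist (preallocated maps) or is
 * freed after an RCU grace period.
 */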
static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l)
{
	if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH)
		free_percpu(htab_elem_get_ptr(l, htab->map.key_size));
	kfree(l);
}

static void htab_elem_free_rcu(struct rcu_head *head)
{
	struct htab_elem *l = container_of(head, struct htab_elem, rcu);
	struct bpf_htab *htab = l->htab;

	/* must increment bpf_prog_active to avoid kprobe+bpf triggering while
	 * we're calling kfree, otherwise deadlock is possible if kprobes
	 * are placed somewhere inside of slub
	 */
	preempt_disable();
	__this_cpu_inc(bpf_prog_active);
	htab_elem_free(htab, l);
	__this_cpu_dec(bpf_prog_active);
	preempt_enable();
}

static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
{
	if (!(htab->map.map_flags & BPF_F_NO_PREALLOC)) {
		pcpu_freelist_push(&htab->freelist, &l->fnode);
	} else {
		atomic_dec(&htab->count);
		l->htab = htab;
		call_rcu(&l->rcu, htab_elem_free_rcu);
	}
}
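
/* Allocation mirrors the two map flavours: preallocated maps pop an element
 * off the freelist (an empty freelist means the map is full, hence -E2BIG),
 * while BPF_F_NO_PREALLOC maps bump the element count and kmalloc with
 * GFP_ATOMIC, since this can run with IRQs disabled under the bucket lock.
 */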
static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
					 void *value, u32 key_size, u32 hash,
					 bool percpu, bool onallcpus)
{
	u32 size = htab->map.value_size;
	bool prealloc = !(htab->map.map_flags & BPF_F_NO_PREALLOC);
	struct htab_elem *l_new;
	void __percpu *pptr;

	if (prealloc) {
		l_new = (struct htab_elem *)pcpu_freelist_pop(&htab->freelist);
		if (!l_new)
			return ERR_PTR(-E2BIG);
	} else {
		if (atomic_inc_return(&htab->count) > htab->map.max_entries) {
			atomic_dec(&htab->count);
			return ERR_PTR(-E2BIG);
		}
		l_new = kmalloc(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN);
		if (!l_new)
			return ERR_PTR(-ENOMEM);
	}

	memcpy(l_new->key, key, key_size);
	if (percpu) {
		/* round up value_size to 8 bytes */
		size = round_up(size, 8);

		if (prealloc) {
			pptr = htab_elem_get_ptr(l_new, key_size);
		} else {
			/* alloc_percpu zero-fills */
			pptr = __alloc_percpu_gfp(size, 8,
						  GFP_ATOMIC | __GFP_NOWARN);
			if (!pptr) {
				kfree(l_new);
				return ERR_PTR(-ENOMEM);
			}
		}

		if (!onallcpus) {
			/* copy true value_size bytes */
			memcpy(this_cpu_ptr(pptr), value, htab->map.value_size);
		} else {
			int off = 0, cpu;

			for_each_possible_cpu(cpu) {
				bpf_long_memcpy(per_cpu_ptr(pptr, cpu),
						value + off, size);
				off += size;
			}
		}
		if (!prealloc)
			htab_elem_set_ptr(l_new, key_size, pptr);
	} else {
		memcpy(l_new->key + round_up(key_size, 8), value, size);
	}

	l_new->hash = hash;
	return l_new;
}
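
/* Update flags from the bpf uapi: BPF_ANY creates or updates, BPF_NOEXIST
 * only creates (fails with -EEXIST if the key is present), BPF_EXIST only
 * updates (fails with -ENOENT if it is not).
 */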
static int check_flags(struct bpf_htab *htab, struct htab_elem *l_old,
		       u64 map_flags)
{
	if (l_old && map_flags == BPF_NOEXIST)
		/* elem already exists */
		return -EEXIST;

	if (!l_old && map_flags == BPF_EXIST)
		/* elem doesn't exist, cannot update it */
		return -ENOENT;

	return 0;
}

/* Called from syscall or from eBPF program */
static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
				u64 map_flags)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct htab_elem *l_new = NULL, *l_old;
	struct hlist_head *head;
	unsigned long flags;
	struct bucket *b;
	u32 key_size, hash;
	int ret;

	if (unlikely(map_flags > BPF_EXIST))
		/* unknown flags */
		return -EINVAL;

	WARN_ON_ONCE(!rcu_read_lock_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size);

	b = __select_bucket(htab, hash);
	head = &b->head;

	/* bpf_map_update_elem() can be called in_irq() */
	raw_spin_lock_irqsave(&b->lock, flags);

	l_old = lookup_elem_raw(head, hash, key, key_size);

	ret = check_flags(htab, l_old, map_flags);
	if (ret)
		goto err;

	l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false);
	if (IS_ERR(l_new)) {
		/* all pre-allocated elements are in use or memory exhausted */
		ret = PTR_ERR(l_new);
		goto err;
	}

	/* add new element to the head of the list, so that
	 * concurrent search will find it before old elem
	 */
	hlist_add_head_rcu(&l_new->hash_node, head);
	if (l_old) {
		hlist_del_rcu(&l_old->hash_node);
		free_htab_elem(htab, l_old);
	}
	ret = 0;
err:
	raw_spin_unlock_irqrestore(&b->lock, flags);
	return ret;
}
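
/* The percpu update helper is shared by the program path (onallcpus == false,
 * 'value' holds a single value copied to the current CPU) and the syscall path
 * (onallcpus == true, 'value' holds one round_up(value_size, 8) sized copy per
 * possible CPU, laid out back to back).
 */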
static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
					 void *value, u64 map_flags,
					 bool onallcpus)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct htab_elem *l_new = NULL, *l_old;
	struct hlist_head *head;
	unsigned long flags;
	struct bucket *b;
	u32 key_size, hash;
	int ret;

	if (unlikely(map_flags > BPF_EXIST))
		/* unknown flags */
		return -EINVAL;

	WARN_ON_ONCE(!rcu_read_lock_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size);

	b = __select_bucket(htab, hash);
	head = &b->head;

	/* bpf_map_update_elem() can be called in_irq() */
	raw_spin_lock_irqsave(&b->lock, flags);

	l_old = lookup_elem_raw(head, hash, key, key_size);

	ret = check_flags(htab, l_old, map_flags);
	if (ret)
		goto err;

	if (l_old) {
		void __percpu *pptr = htab_elem_get_ptr(l_old, key_size);
		u32 size = htab->map.value_size;

		/* per-cpu hash map can update value in-place */
		if (!onallcpus) {
			memcpy(this_cpu_ptr(pptr), value, size);
		} else {
			int off = 0, cpu;

			size = round_up(size, 8);
			for_each_possible_cpu(cpu) {
				bpf_long_memcpy(per_cpu_ptr(pptr, cpu),
						value + off, size);
				off += size;
			}
		}
	} else {
		l_new = alloc_htab_elem(htab, key, value, key_size,
					hash, true, onallcpus);
		if (IS_ERR(l_new)) {
			ret = PTR_ERR(l_new);
			goto err;
		}
		hlist_add_head_rcu(&l_new->hash_node, head);
	}
	ret = 0;
err:
	raw_spin_unlock_irqrestore(&b->lock, flags);
	return ret;
}

static int htab_percpu_map_update_elem(struct bpf_map *map, void *key,
				       void *value, u64 map_flags)
{
	return __htab_percpu_map_update_elem(map, key, value, map_flags, false);
}

/* Called from syscall or from eBPF program */
static int htab_map_delete_elem(struct bpf_map *map, void *key)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct hlist_head *head;
	struct bucket *b;
	struct htab_elem *l;
	unsigned long flags;
	u32 hash, key_size;
	int ret = -ENOENT;

	WARN_ON_ONCE(!rcu_read_lock_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size);
	b = __select_bucket(htab, hash);
	head = &b->head;

	raw_spin_lock_irqsave(&b->lock, flags);

	l = lookup_elem_raw(head, hash, key, key_size);

	if (l) {
		hlist_del_rcu(&l->hash_node);
		free_htab_elem(htab, l);
		ret = 0;
	}

	raw_spin_unlock_irqrestore(&b->lock, flags);
	return ret;
}

static void delete_all_elements(struct bpf_htab *htab)
{
	int i;

	for (i = 0; i < htab->n_buckets; i++) {
		struct hlist_head *head = select_bucket(htab, i);
		struct hlist_node *n;
		struct htab_elem *l;

		hlist_for_each_entry_safe(l, n, head, hash_node) {
			hlist_del_rcu(&l->hash_node);
			htab_elem_free(htab, l);
		}
	}
}

/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
static void htab_map_free(struct bpf_map *map)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);

	/* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
	 * so the programs (can be more than one that used this map) were
	 * disconnected from events. Wait for outstanding critical sections in
	 * these programs to complete
	 */
	synchronize_rcu();

	/* some of free_htab_elem() callbacks for elements of this map may
	 * not have executed. Wait for them.
	 */
	rcu_barrier();
	if (htab->map.map_flags & BPF_F_NO_PREALLOC) {
		delete_all_elements(htab);
	} else {
		htab_free_elems(htab);
		pcpu_freelist_destroy(&htab->freelist);
	}
	kvfree(htab->buckets);
	kfree(htab);
}

static const struct bpf_map_ops htab_ops = {
	.map_alloc = htab_map_alloc,
	.map_free = htab_map_free,
	.map_get_next_key = htab_map_get_next_key,
	.map_lookup_elem = htab_map_lookup_elem,
	.map_update_elem = htab_map_update_elem,
	.map_delete_elem = htab_map_delete_elem,
};

static struct bpf_map_type_list htab_type __read_mostly = {
	.ops = &htab_ops,
	.type = BPF_MAP_TYPE_HASH,
};

/* Called from eBPF program */
static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct htab_elem *l = __htab_map_lookup_elem(map, key);

	if (l)
		return this_cpu_ptr(htab_elem_get_ptr(l, map->key_size));
	else
		return NULL;
}
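
/* Syscall-side helpers: bpf_percpu_hash_copy() gives userspace one
 * round_up(value_size, 8) sized slot per possible CPU, and
 * bpf_percpu_hash_update() consumes values in the same layout.
 */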
int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value)
{
	struct htab_elem *l;
	void __percpu *pptr;
	int ret = -ENOENT;
	int cpu, off = 0;
	u32 size;

	/* per_cpu areas are zero-filled and bpf programs can only
	 * access 'value_size' of them, so copying rounded areas
	 * will not leak any kernel data
	 */
	size = round_up(map->value_size, 8);
	rcu_read_lock();
	l = __htab_map_lookup_elem(map, key);
	if (!l)
		goto out;
	pptr = htab_elem_get_ptr(l, map->key_size);
	for_each_possible_cpu(cpu) {
		bpf_long_memcpy(value + off,
				per_cpu_ptr(pptr, cpu), size);
		off += size;
	}
	ret = 0;
out:
	rcu_read_unlock();
	return ret;
}

int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value,
			   u64 map_flags)
{
	int ret;

	rcu_read_lock();
	ret = __htab_percpu_map_update_elem(map, key, value, map_flags, true);
	rcu_read_unlock();

	return ret;
}

static const struct bpf_map_ops htab_percpu_ops = {
	.map_alloc = htab_map_alloc,
	.map_free = htab_map_free,
	.map_get_next_key = htab_map_get_next_key,
	.map_lookup_elem = htab_percpu_map_lookup_elem,
	.map_update_elem = htab_percpu_map_update_elem,
	.map_delete_elem = htab_map_delete_elem,
};

static struct bpf_map_type_list htab_percpu_type __read_mostly = {
	.ops = &htab_percpu_ops,
	.type = BPF_MAP_TYPE_PERCPU_HASH,
};

static int __init register_htab_map(void)
{
	bpf_register_map_type(&htab_type);
	bpf_register_map_type(&htab_percpu_type);
	return 0;
}
late_initcall(register_htab_map);