// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
 * Copyright (c) 2016 Facebook
 */
#include <linux/jhash.h>
#include <linux/filter.h>
#include <linux/rculist_nulls.h>
#include <linux/rcupdate_wait.h>
#include <linux/random.h>
#include <uapi/linux/btf.h>
#include <linux/rcupdate_trace.h>
#include <linux/btf_ids.h>
#include "percpu_freelist.h"
#include "bpf_lru_list.h"
#include "map_in_map.h"
#include <linux/bpf_mem_alloc.h>

#define HTAB_CREATE_FLAG_MASK						\
	(BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE |	\
	 BPF_F_ACCESS_MASK | BPF_F_ZERO_SEED)
#define BATCH_OPS(_name)			\
	.map_lookup_batch =			\
	_name##_map_lookup_batch,		\
	.map_lookup_and_delete_batch =		\
	_name##_map_lookup_and_delete_batch,	\
	.map_update_batch =			\
	generic_map_update_batch,		\
	.map_delete_batch =			\
	generic_map_delete_batch
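/* Illustrative note (not from the original source): BATCH_OPS(htab_lru), as
 * used in the bpf_map_ops initializers later in this file, expands to the
 * four batch callbacks, e.g.:
 *
 *	.map_lookup_batch = htab_lru_map_lookup_batch,
 *	.map_lookup_and_delete_batch = htab_lru_map_lookup_and_delete_batch,
 *	.map_update_batch = generic_map_update_batch,
 *	.map_delete_batch = generic_map_delete_batch
 */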
/* The bucket lock has two protection scopes:
 *
 * 1) Serializing concurrent operations from BPF programs on different
 *    CPUs
 *
 * 2) Serializing concurrent operations from BPF programs and sys_bpf()
 *
 * BPF programs can execute in any context including perf, kprobes and
 * tracing. As there are almost no limits where perf, kprobes and tracing
 * can be invoked from, the lock operations need to be protected against
 * deadlocks. Deadlocks can be caused by recursion and by an invocation in
 * the lock held section when functions which acquire this lock are invoked
 * from sys_bpf(). BPF recursion is prevented by incrementing the per CPU
 * variable bpf_prog_active, which prevents BPF programs attached to perf
 * events, kprobes and tracing from being invoked before the prior invocation
 * from one of these contexts has completed. sys_bpf() uses the same mechanism
 * by pinning the task to the current CPU and incrementing the recursion
 * protection across the map operation.
 *
 * This has subtle implications on PREEMPT_RT. PREEMPT_RT forbids certain
 * operations like memory allocations (even with GFP_ATOMIC) from atomic
 * contexts. This is required because even with GFP_ATOMIC the memory
 * allocator calls into code paths which acquire locks with long held lock
 * sections. To ensure the deterministic behaviour these locks are regular
 * spinlocks, which are converted to 'sleepable' spinlocks on RT. The only
 * true atomic contexts on an RT kernel are the low level hardware
 * handling, scheduling, low level interrupt handling, NMIs etc. None of
 * these contexts should ever do memory allocations.
 *
 * As regular device interrupt handlers and soft interrupts are forced into
 * thread context, the existing code which does
 *   spin_lock*(); alloc(GFP_ATOMIC); spin_unlock*();
 * just works.
 *
 * In theory the BPF locks could be converted to regular spinlocks as well,
 * but the bucket locks and percpu_freelist locks can be taken from
 * arbitrary contexts (perf, kprobes, tracepoints) which are required to be
 * atomic contexts even on RT. Before the introduction of bpf_mem_alloc,
 * it was only safe to use raw spinlocks for preallocated hash maps on an RT
 * kernel, because there was no memory allocation within the lock held
 * sections. However, now that the hash map has been fully converted to use
 * bpf_mem_alloc, memory allocation for non-preallocated hash maps is
 * non-synchronous, so it is safe to always use raw spinlocks for the bucket
 * lock.
 */
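/* Illustrative sketch (not from the original source) of the recursion
 * protection described above: if a tracing BPF program fires on a CPU that
 * is already inside a bucket operation on the same lock slot, its map update
 * is rejected instead of deadlocking:
 *
 *	htab_lock_bucket()              // outer operation, slot count 0 -> 1
 *	  raw_spin_lock(&b->raw_lock);
 *	  ... tracing prog runs here and calls htab_lock_bucket() again:
 *	      the slot count would become 2, so it backs off with -EBUSY ...
 *	htab_unlock_bucket()            // slot count 1 -> 0
 */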
struct bucket {
	struct hlist_nulls_head head;
	raw_spinlock_t raw_lock;
};

#define HASHTAB_MAP_LOCK_COUNT 8
#define HASHTAB_MAP_LOCK_MASK (HASHTAB_MAP_LOCK_COUNT - 1)

struct bpf_htab {
	struct bpf_map map;
	struct bpf_mem_alloc ma;
	struct bpf_mem_alloc pcpu_ma;
	struct bucket *buckets;
	void *elems;
	union {
		struct pcpu_freelist freelist;
		struct bpf_lru lru;
	};
	struct htab_elem *__percpu *extra_elems;
	/* number of elements in non-preallocated hashtable is kept
	 * in either pcount or count
	 */
	struct percpu_counter pcount;
	atomic_t count;
	bool use_percpu_counter;
	u32 n_buckets;	/* number of hash buckets */
	u32 elem_size;	/* size of each element in bytes */
	u32 hashrnd;
	struct lock_class_key lockdep_key;
	int __percpu *map_locked[HASHTAB_MAP_LOCK_COUNT];
};
/* each htab element is struct htab_elem + key + value */
struct htab_elem {
	union {
		struct hlist_nulls_node hash_node;
		struct {
			void *padding;
			union {
				struct pcpu_freelist_node fnode;
				struct htab_elem *batch_flink;
			};
		};
	};
	union {
		/* pointer to per-cpu pointer */
		void *ptr_to_pptr;
		struct bpf_lru_node lru_node;
	};
	u32 hash;
	char key[] __aligned(8);
};
static inline bool htab_is_prealloc(const struct bpf_htab *htab)
{
	return !(htab->map.map_flags & BPF_F_NO_PREALLOC);
}
static void htab_init_buckets(struct bpf_htab *htab)
{
	unsigned int i;

	for (i = 0; i < htab->n_buckets; i++) {
		INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i);
		raw_spin_lock_init(&htab->buckets[i].raw_lock);
		lockdep_set_class(&htab->buckets[i].raw_lock,
				  &htab->lockdep_key);
		cond_resched();
	}
}
static inline int htab_lock_bucket(const struct bpf_htab *htab,
				   struct bucket *b, u32 hash,
				   unsigned long *pflags)
{
	unsigned long flags;

	hash = hash & min_t(u32, HASHTAB_MAP_LOCK_MASK, htab->n_buckets - 1);

	preempt_disable();
	local_irq_save(flags);
	if (unlikely(__this_cpu_inc_return(*(htab->map_locked[hash])) != 1)) {
		__this_cpu_dec(*(htab->map_locked[hash]));
		local_irq_restore(flags);
		preempt_enable();
		return -EBUSY;
	}

	raw_spin_lock(&b->raw_lock);
	*pflags = flags;

	return 0;
}
static inline void htab_unlock_bucket(const struct bpf_htab *htab,
				      struct bucket *b, u32 hash,
				      unsigned long flags)
{
	hash = hash & min_t(u32, HASHTAB_MAP_LOCK_MASK, htab->n_buckets - 1);
	raw_spin_unlock(&b->raw_lock);
	__this_cpu_dec(*(htab->map_locked[hash]));
	local_irq_restore(flags);
	preempt_enable();
}
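/* Usage sketch (not from the original source): the update/delete paths below
 * pair these helpers around the bucket critical section, e.g.:
 *
 *	ret = htab_lock_bucket(htab, b, hash, &flags);
 *	if (ret)
 *		return ret;	(-EBUSY: re-entered on this CPU)
 *	l = lookup_elem_raw(head, hash, key, key_size);
 *	...
 *	htab_unlock_bucket(htab, b, hash, flags);
 */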
static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node);
static bool htab_is_lru(const struct bpf_htab *htab)
{
	return htab->map.map_type == BPF_MAP_TYPE_LRU_HASH ||
	       htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH;
}

static bool htab_is_percpu(const struct bpf_htab *htab)
{
	return htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH ||
	       htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH;
}
static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size,
				     void __percpu *pptr)
{
	*(void __percpu **)(l->key + key_size) = pptr;
}

static inline void __percpu *htab_elem_get_ptr(struct htab_elem *l, u32 key_size)
{
	return *(void __percpu **)(l->key + key_size);
}

static void *fd_htab_map_get_ptr(const struct bpf_map *map, struct htab_elem *l)
{
	return *(void **)(l->key + roundup(map->key_size, 8));
}
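/* Illustrative layout note (not from the original source): each element is
 * "struct htab_elem + key + value" as noted above struct htab_elem. For
 * regular maps the value lives at l->key + round_up(key_size, 8); for
 * per-CPU maps that slot instead holds a void __percpu * which the two
 * helpers above store and fetch, with the real values in per-CPU memory.
 */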
static struct htab_elem *get_htab_elem(struct bpf_htab *htab, int i)
{
	return (struct htab_elem *) (htab->elems + i * (u64)htab->elem_size);
}

static bool htab_has_extra_elems(struct bpf_htab *htab)
{
	return !htab_is_percpu(htab) && !htab_is_lru(htab);
}
static void htab_free_prealloced_timers_and_wq(struct bpf_htab *htab)
{
	u32 num_entries = htab->map.max_entries;
	int i;

	if (htab_has_extra_elems(htab))
		num_entries += num_possible_cpus();

	for (i = 0; i < num_entries; i++) {
		struct htab_elem *elem;

		elem = get_htab_elem(htab, i);
		if (btf_record_has_field(htab->map.record, BPF_TIMER))
			bpf_obj_free_timer(htab->map.record,
					   elem->key + round_up(htab->map.key_size, 8));
		if (btf_record_has_field(htab->map.record, BPF_WORKQUEUE))
			bpf_obj_free_workqueue(htab->map.record,
					       elem->key + round_up(htab->map.key_size, 8));
		cond_resched();
	}
}
static void htab_free_prealloced_fields(struct bpf_htab *htab)
{
	u32 num_entries = htab->map.max_entries;
	int i;

	if (IS_ERR_OR_NULL(htab->map.record))
		return;
	if (htab_has_extra_elems(htab))
		num_entries += num_possible_cpus();
	for (i = 0; i < num_entries; i++) {
		struct htab_elem *elem;

		elem = get_htab_elem(htab, i);
		if (htab_is_percpu(htab)) {
			void __percpu *pptr = htab_elem_get_ptr(elem, htab->map.key_size);
			int cpu;

			for_each_possible_cpu(cpu) {
				bpf_obj_free_fields(htab->map.record, per_cpu_ptr(pptr, cpu));
				cond_resched();
			}
		} else {
			bpf_obj_free_fields(htab->map.record, elem->key + round_up(htab->map.key_size, 8));
			cond_resched();
		}
		cond_resched();
	}
}
static void htab_free_elems(struct bpf_htab *htab)
{
	int i;

	if (!htab_is_percpu(htab))
		goto free_elems;

	for (i = 0; i < htab->map.max_entries; i++) {
		void __percpu *pptr;

		pptr = htab_elem_get_ptr(get_htab_elem(htab, i),
					 htab->map.key_size);
		free_percpu(pptr);
		cond_resched();
	}
free_elems:
	bpf_map_area_free(htab->elems);
}
/* The LRU list has a lock (lru_lock). Each htab bucket has a lock
 * (bucket_lock). If both locks need to be acquired together, the lock
 * order is always lru_lock -> bucket_lock and this only happens in
 * bpf_lru_list.c logic. For example, certain code paths of
 * bpf_lru_pop_free(), which is called by prealloc_lru_pop(),
 * will acquire lru_lock first, followed by bucket_lock.
 *
 * In hashtab.c, to avoid deadlock, acquiring bucket_lock followed by
 * lru_lock is not allowed. In such cases, bucket_lock needs to be
 * released first before acquiring lru_lock.
 */
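/* Illustrative sketch of the rule above (not from the original source):
 *
 *	allowed (bpf_lru_list.c):	forbidden (hashtab.c):
 *	  lock(lru_lock)		  lock(bucket_lock)
 *	    lock(bucket_lock)		    lock(lru_lock)   <-- would invert
 *
 * which is why e.g. htab_lru_map_update_elem() drops the bucket lock before
 * calling htab_lru_push_free().
 */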
static struct htab_elem *prealloc_lru_pop(struct bpf_htab *htab, void *key,
					  u32 hash)
{
	struct bpf_lru_node *node = bpf_lru_pop_free(&htab->lru, hash);
	struct htab_elem *l;

	if (node) {
		bpf_map_inc_elem_count(&htab->map);
		l = container_of(node, struct htab_elem, lru_node);
		memcpy(l->key, key, htab->map.key_size);
		return l;
	}

	return NULL;
}
static int prealloc_init(struct bpf_htab *htab)
{
	u32 num_entries = htab->map.max_entries;
	int err = -ENOMEM, i;

	if (htab_has_extra_elems(htab))
		num_entries += num_possible_cpus();

	htab->elems = bpf_map_area_alloc((u64)htab->elem_size * num_entries,
					 htab->map.numa_node);
	if (!htab->elems)
		return -ENOMEM;

	if (!htab_is_percpu(htab))
		goto skip_percpu_elems;

	for (i = 0; i < num_entries; i++) {
		u32 size = round_up(htab->map.value_size, 8);
		void __percpu *pptr;

		pptr = bpf_map_alloc_percpu(&htab->map, size, 8,
					    GFP_USER | __GFP_NOWARN);
		if (!pptr)
			goto free_elems;
		htab_elem_set_ptr(get_htab_elem(htab, i), htab->map.key_size,
				  pptr);
		cond_resched();
	}

skip_percpu_elems:
	if (htab_is_lru(htab))
		err = bpf_lru_init(&htab->lru,
				   htab->map.map_flags & BPF_F_NO_COMMON_LRU,
				   offsetof(struct htab_elem, hash) -
				   offsetof(struct htab_elem, lru_node),
				   htab_lru_map_delete_node,
				   htab);
	else
		err = pcpu_freelist_init(&htab->freelist);

	if (err)
		goto free_elems;

	if (htab_is_lru(htab))
		bpf_lru_populate(&htab->lru, htab->elems,
				 offsetof(struct htab_elem, lru_node),
				 htab->elem_size, num_entries);
	else
		pcpu_freelist_populate(&htab->freelist,
				       htab->elems + offsetof(struct htab_elem, fnode),
				       htab->elem_size, num_entries);

	return 0;

free_elems:
	htab_free_elems(htab);
	return err;
}
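/* Design note (not from the original source): with preallocation, all
 * num_entries elements live in one contiguous bpf_map_area of
 * num_entries * elem_size bytes. They are then threaded onto either the LRU
 * lists or the per-CPU freelist via the lru_node / fnode offsets passed to
 * the populate helpers above, so no element is ever allocated inside the
 * bucket lock held section.
 */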
static void prealloc_destroy(struct bpf_htab *htab)
{
	htab_free_elems(htab);

	if (htab_is_lru(htab))
		bpf_lru_destroy(&htab->lru);
	else
		pcpu_freelist_destroy(&htab->freelist);
}
static int alloc_extra_elems(struct bpf_htab *htab)
{
	struct htab_elem *__percpu *pptr, *l_new;
	struct pcpu_freelist_node *l;
	int cpu;

	pptr = bpf_map_alloc_percpu(&htab->map, sizeof(struct htab_elem *), 8,
				    GFP_USER | __GFP_NOWARN);
	if (!pptr)
		return -ENOMEM;

	for_each_possible_cpu(cpu) {
		l = pcpu_freelist_pop(&htab->freelist);
		/* pop will succeed, since prealloc_init()
		 * preallocated extra num_possible_cpus elements
		 */
		l_new = container_of(l, struct htab_elem, fnode);
		*per_cpu_ptr(pptr, cpu) = l_new;
	}
	htab->extra_elems = pptr;
	return 0;
}
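/* Design note (not from the original source): extra_elems gives each CPU one
 * spare preallocated element, so that an update which replaces an existing
 * key can swap elements without a freelist pop/push; see the comment in
 * alloc_htab_elem() ("use per-cpu extra elems to avoid freelist_pop/push").
 */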
413 /* Called from syscall */
414 static int htab_map_alloc_check(union bpf_attr
*attr
)
416 bool percpu
= (attr
->map_type
== BPF_MAP_TYPE_PERCPU_HASH
||
417 attr
->map_type
== BPF_MAP_TYPE_LRU_PERCPU_HASH
);
418 bool lru
= (attr
->map_type
== BPF_MAP_TYPE_LRU_HASH
||
419 attr
->map_type
== BPF_MAP_TYPE_LRU_PERCPU_HASH
);
420 /* percpu_lru means each cpu has its own LRU list.
421 * it is different from BPF_MAP_TYPE_PERCPU_HASH where
422 * the map's value itself is percpu. percpu_lru has
423 * nothing to do with the map's value.
425 bool percpu_lru
= (attr
->map_flags
& BPF_F_NO_COMMON_LRU
);
426 bool prealloc
= !(attr
->map_flags
& BPF_F_NO_PREALLOC
);
427 bool zero_seed
= (attr
->map_flags
& BPF_F_ZERO_SEED
);
428 int numa_node
= bpf_map_attr_numa_node(attr
);
430 BUILD_BUG_ON(offsetof(struct htab_elem
, fnode
.next
) !=
431 offsetof(struct htab_elem
, hash_node
.pprev
));
433 if (zero_seed
&& !capable(CAP_SYS_ADMIN
))
434 /* Guard against local DoS, and discourage production use. */
437 if (attr
->map_flags
& ~HTAB_CREATE_FLAG_MASK
||
438 !bpf_map_flags_access_ok(attr
->map_flags
))
441 if (!lru
&& percpu_lru
)
444 if (lru
&& !prealloc
)
447 if (numa_node
!= NUMA_NO_NODE
&& (percpu
|| percpu_lru
))
450 /* check sanity of attributes.
451 * value_size == 0 may be allowed in the future to use map as a set
453 if (attr
->max_entries
== 0 || attr
->key_size
== 0 ||
454 attr
->value_size
== 0)
457 if ((u64
)attr
->key_size
+ attr
->value_size
>= KMALLOC_MAX_SIZE
-
458 sizeof(struct htab_elem
))
459 /* if key_size + value_size is bigger, the user space won't be
460 * able to access the elements via bpf syscall. This check
461 * also makes sure that the elem_size doesn't overflow and it's
462 * kmalloc-able later in htab_map_update_elem()
465 /* percpu map value size is bound by PCPU_MIN_UNIT_SIZE */
466 if (percpu
&& round_up(attr
->value_size
, 8) > PCPU_MIN_UNIT_SIZE
)
472 static struct bpf_map
*htab_map_alloc(union bpf_attr
*attr
)
474 bool percpu
= (attr
->map_type
== BPF_MAP_TYPE_PERCPU_HASH
||
475 attr
->map_type
== BPF_MAP_TYPE_LRU_PERCPU_HASH
);
476 bool lru
= (attr
->map_type
== BPF_MAP_TYPE_LRU_HASH
||
477 attr
->map_type
== BPF_MAP_TYPE_LRU_PERCPU_HASH
);
478 /* percpu_lru means each cpu has its own LRU list.
479 * it is different from BPF_MAP_TYPE_PERCPU_HASH where
480 * the map's value itself is percpu. percpu_lru has
481 * nothing to do with the map's value.
483 bool percpu_lru
= (attr
->map_flags
& BPF_F_NO_COMMON_LRU
);
484 bool prealloc
= !(attr
->map_flags
& BPF_F_NO_PREALLOC
);
485 struct bpf_htab
*htab
;
488 htab
= bpf_map_area_alloc(sizeof(*htab
), NUMA_NO_NODE
);
490 return ERR_PTR(-ENOMEM
);
492 lockdep_register_key(&htab
->lockdep_key
);
494 bpf_map_init_from_attr(&htab
->map
, attr
);
		/* ensure each CPU's LRU list has >= 1 element.
		 * Since we are at it, make each LRU list have the same
		 * number of elements.
		 */
501 htab
->map
.max_entries
= roundup(attr
->max_entries
,
502 num_possible_cpus());
503 if (htab
->map
.max_entries
< attr
->max_entries
)
504 htab
->map
.max_entries
= rounddown(attr
->max_entries
,
505 num_possible_cpus());
508 /* hash table size must be power of 2; roundup_pow_of_two() can overflow
509 * into UB on 32-bit arches, so check that first
512 if (htab
->map
.max_entries
> 1UL << 31)
515 htab
->n_buckets
= roundup_pow_of_two(htab
->map
.max_entries
);
517 htab
->elem_size
= sizeof(struct htab_elem
) +
518 round_up(htab
->map
.key_size
, 8);
520 htab
->elem_size
+= sizeof(void *);
522 htab
->elem_size
+= round_up(htab
->map
.value_size
, 8);
524 /* check for u32 overflow */
525 if (htab
->n_buckets
> U32_MAX
/ sizeof(struct bucket
))
528 err
= bpf_map_init_elem_count(&htab
->map
);
533 htab
->buckets
= bpf_map_area_alloc(htab
->n_buckets
*
534 sizeof(struct bucket
),
535 htab
->map
.numa_node
);
537 goto free_elem_count
;
539 for (i
= 0; i
< HASHTAB_MAP_LOCK_COUNT
; i
++) {
540 htab
->map_locked
[i
] = bpf_map_alloc_percpu(&htab
->map
,
544 if (!htab
->map_locked
[i
])
545 goto free_map_locked
;
548 if (htab
->map
.map_flags
& BPF_F_ZERO_SEED
)
551 htab
->hashrnd
= get_random_u32();
553 htab_init_buckets(htab
);
	/* compute_batch_value() computes batch value as num_online_cpus() * 2
	 * and __percpu_counter_compare() needs
	 * htab->max_entries - cur_number_of_elems to be more than batch * num_online_cpus()
	 * for percpu_counter to be faster than atomic_t. In practice the average bpf
	 * hash map size is 10k, which means that a system with 64 cpus will fill
	 * the hashmap to 20% of 10k before percpu_counter becomes ineffective. Therefore
	 * define our own batch count as 32, so that a 10k hash map can be filled up to 80%:
	 * 10k - 8k > 32 _batch_ * 64 _cpus_
	 * and __percpu_counter_compare() will still be fast. At that point hash map
	 * collisions will dominate its performance anyway. Assume that a hash map filled
	 * to 50+% isn't going to be O(1) and use the following formula to choose
	 * between percpu_counter and atomic_t.
	 */
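	/* Worked example (not from the original source): for a map with
	 * max_entries = 10240 on a 64-CPU system, 10240 / 2 = 5120 is greater
	 * than 64 * 32 = 2048, so use_percpu_counter is set below; a map with
	 * max_entries = 1024 on the same system (512 <= 2048) keeps atomic_t.
	 */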
568 #define PERCPU_COUNTER_BATCH 32
569 if (attr
->max_entries
/ 2 > num_online_cpus() * PERCPU_COUNTER_BATCH
)
570 htab
->use_percpu_counter
= true;
572 if (htab
->use_percpu_counter
) {
573 err
= percpu_counter_init(&htab
->pcount
, 0, GFP_KERNEL
);
575 goto free_map_locked
;
579 err
= prealloc_init(htab
);
581 goto free_map_locked
;
583 if (!percpu
&& !lru
) {
584 /* lru itself can remove the least used element, so
585 * there is no need for an extra elem during map_update.
587 err
= alloc_extra_elems(htab
);
592 err
= bpf_mem_alloc_init(&htab
->ma
, htab
->elem_size
, false);
594 goto free_map_locked
;
596 err
= bpf_mem_alloc_init(&htab
->pcpu_ma
,
597 round_up(htab
->map
.value_size
, 8), true);
599 goto free_map_locked
;
606 prealloc_destroy(htab
);
608 if (htab
->use_percpu_counter
)
609 percpu_counter_destroy(&htab
->pcount
);
610 for (i
= 0; i
< HASHTAB_MAP_LOCK_COUNT
; i
++)
611 free_percpu(htab
->map_locked
[i
]);
612 bpf_map_area_free(htab
->buckets
);
613 bpf_mem_alloc_destroy(&htab
->pcpu_ma
);
614 bpf_mem_alloc_destroy(&htab
->ma
);
616 bpf_map_free_elem_count(&htab
->map
);
618 lockdep_unregister_key(&htab
->lockdep_key
);
619 bpf_map_area_free(htab
);
623 static inline u32
htab_map_hash(const void *key
, u32 key_len
, u32 hashrnd
)
625 if (likely(key_len
% 4 == 0))
626 return jhash2(key
, key_len
/ 4, hashrnd
);
627 return jhash(key
, key_len
, hashrnd
);
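/* Illustrative note (not from the original source): for the common case of a
 * key size that is a multiple of 4 (e.g. a 16-byte key), the hash above is
 * jhash2(key, 4, hashrnd) over 32-bit words; other key sizes fall back to
 * the byte-wise jhash().
 */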
630 static inline struct bucket
*__select_bucket(struct bpf_htab
*htab
, u32 hash
)
632 return &htab
->buckets
[hash
& (htab
->n_buckets
- 1)];
635 static inline struct hlist_nulls_head
*select_bucket(struct bpf_htab
*htab
, u32 hash
)
637 return &__select_bucket(htab
, hash
)->head
;
640 /* this lookup function can only be called with bucket lock taken */
641 static struct htab_elem
*lookup_elem_raw(struct hlist_nulls_head
*head
, u32 hash
,
642 void *key
, u32 key_size
)
644 struct hlist_nulls_node
*n
;
647 hlist_nulls_for_each_entry_rcu(l
, n
, head
, hash_node
)
648 if (l
->hash
== hash
&& !memcmp(&l
->key
, key
, key_size
))
/* can be called without the bucket lock. It will repeat the loop in
 * the unlikely event when elements moved from one bucket into another
 * while the linked list is being walked
 */
658 static struct htab_elem
*lookup_nulls_elem_raw(struct hlist_nulls_head
*head
,
660 u32 key_size
, u32 n_buckets
)
662 struct hlist_nulls_node
*n
;
666 hlist_nulls_for_each_entry_rcu(l
, n
, head
, hash_node
)
667 if (l
->hash
== hash
&& !memcmp(&l
->key
, key
, key_size
))
670 if (unlikely(get_nulls_value(n
) != (hash
& (n_buckets
- 1))))
676 /* Called from syscall or from eBPF program directly, so
677 * arguments have to match bpf_map_lookup_elem() exactly.
678 * The return value is adjusted by BPF instructions
679 * in htab_map_gen_lookup().
681 static void *__htab_map_lookup_elem(struct bpf_map
*map
, void *key
)
683 struct bpf_htab
*htab
= container_of(map
, struct bpf_htab
, map
);
684 struct hlist_nulls_head
*head
;
688 WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
689 !rcu_read_lock_bh_held());
691 key_size
= map
->key_size
;
693 hash
= htab_map_hash(key
, key_size
, htab
->hashrnd
);
695 head
= select_bucket(htab
, hash
);
697 l
= lookup_nulls_elem_raw(head
, hash
, key
, key_size
, htab
->n_buckets
);
702 static void *htab_map_lookup_elem(struct bpf_map
*map
, void *key
)
704 struct htab_elem
*l
= __htab_map_lookup_elem(map
, key
);
707 return l
->key
+ round_up(map
->key_size
, 8);
712 /* inline bpf_map_lookup_elem() call.
715 * bpf_map_lookup_elem
716 * map->ops->map_lookup_elem
717 * htab_map_lookup_elem
718 * __htab_map_lookup_elem
721 * __htab_map_lookup_elem
723 static int htab_map_gen_lookup(struct bpf_map
*map
, struct bpf_insn
*insn_buf
)
725 struct bpf_insn
*insn
= insn_buf
;
726 const int ret
= BPF_REG_0
;
728 BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem
,
729 (void *(*)(struct bpf_map
*map
, void *key
))NULL
));
730 *insn
++ = BPF_EMIT_CALL(__htab_map_lookup_elem
);
731 *insn
++ = BPF_JMP_IMM(BPF_JEQ
, ret
, 0, 1);
732 *insn
++ = BPF_ALU64_IMM(BPF_ADD
, ret
,
733 offsetof(struct htab_elem
, key
) +
734 round_up(map
->key_size
, 8));
735 return insn
- insn_buf
;
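/* Illustrative note (not from the original source): the instructions emitted
 * above behave roughly like
 *
 *	r0 = __htab_map_lookup_elem(map, key);
 *	if (r0 != 0)
 *		r0 += offsetof(struct htab_elem, key) + round_up(key_size, 8);
 *
 * so a hit returns a pointer to the value and a miss returns NULL, matching
 * what bpf_map_lookup_elem() callers expect.
 */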
738 static __always_inline
void *__htab_lru_map_lookup_elem(struct bpf_map
*map
,
739 void *key
, const bool mark
)
741 struct htab_elem
*l
= __htab_map_lookup_elem(map
, key
);
745 bpf_lru_node_set_ref(&l
->lru_node
);
746 return l
->key
+ round_up(map
->key_size
, 8);
752 static void *htab_lru_map_lookup_elem(struct bpf_map
*map
, void *key
)
754 return __htab_lru_map_lookup_elem(map
, key
, true);
757 static void *htab_lru_map_lookup_elem_sys(struct bpf_map
*map
, void *key
)
759 return __htab_lru_map_lookup_elem(map
, key
, false);
762 static int htab_lru_map_gen_lookup(struct bpf_map
*map
,
763 struct bpf_insn
*insn_buf
)
765 struct bpf_insn
*insn
= insn_buf
;
766 const int ret
= BPF_REG_0
;
767 const int ref_reg
= BPF_REG_1
;
769 BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem
,
770 (void *(*)(struct bpf_map
*map
, void *key
))NULL
));
771 *insn
++ = BPF_EMIT_CALL(__htab_map_lookup_elem
);
772 *insn
++ = BPF_JMP_IMM(BPF_JEQ
, ret
, 0, 4);
773 *insn
++ = BPF_LDX_MEM(BPF_B
, ref_reg
, ret
,
774 offsetof(struct htab_elem
, lru_node
) +
775 offsetof(struct bpf_lru_node
, ref
));
776 *insn
++ = BPF_JMP_IMM(BPF_JNE
, ref_reg
, 0, 1);
777 *insn
++ = BPF_ST_MEM(BPF_B
, ret
,
778 offsetof(struct htab_elem
, lru_node
) +
779 offsetof(struct bpf_lru_node
, ref
),
781 *insn
++ = BPF_ALU64_IMM(BPF_ADD
, ret
,
782 offsetof(struct htab_elem
, key
) +
783 round_up(map
->key_size
, 8));
784 return insn
- insn_buf
;
787 static void check_and_free_fields(struct bpf_htab
*htab
,
788 struct htab_elem
*elem
)
790 if (htab_is_percpu(htab
)) {
791 void __percpu
*pptr
= htab_elem_get_ptr(elem
, htab
->map
.key_size
);
794 for_each_possible_cpu(cpu
)
795 bpf_obj_free_fields(htab
->map
.record
, per_cpu_ptr(pptr
, cpu
));
797 void *map_value
= elem
->key
+ round_up(htab
->map
.key_size
, 8);
799 bpf_obj_free_fields(htab
->map
.record
, map_value
);
803 /* It is called from the bpf_lru_list when the LRU needs to delete
804 * older elements from the htab.
806 static bool htab_lru_map_delete_node(void *arg
, struct bpf_lru_node
*node
)
808 struct bpf_htab
*htab
= arg
;
809 struct htab_elem
*l
= NULL
, *tgt_l
;
810 struct hlist_nulls_head
*head
;
811 struct hlist_nulls_node
*n
;
816 tgt_l
= container_of(node
, struct htab_elem
, lru_node
);
817 b
= __select_bucket(htab
, tgt_l
->hash
);
820 ret
= htab_lock_bucket(htab
, b
, tgt_l
->hash
, &flags
);
824 hlist_nulls_for_each_entry_rcu(l
, n
, head
, hash_node
)
826 hlist_nulls_del_rcu(&l
->hash_node
);
827 check_and_free_fields(htab
, l
);
828 bpf_map_dec_elem_count(&htab
->map
);
832 htab_unlock_bucket(htab
, b
, tgt_l
->hash
, flags
);
837 /* Called from syscall */
838 static int htab_map_get_next_key(struct bpf_map
*map
, void *key
, void *next_key
)
840 struct bpf_htab
*htab
= container_of(map
, struct bpf_htab
, map
);
841 struct hlist_nulls_head
*head
;
842 struct htab_elem
*l
, *next_l
;
846 WARN_ON_ONCE(!rcu_read_lock_held());
848 key_size
= map
->key_size
;
851 goto find_first_elem
;
853 hash
= htab_map_hash(key
, key_size
, htab
->hashrnd
);
855 head
= select_bucket(htab
, hash
);
858 l
= lookup_nulls_elem_raw(head
, hash
, key
, key_size
, htab
->n_buckets
);
861 goto find_first_elem
;
863 /* key was found, get next key in the same bucket */
864 next_l
= hlist_nulls_entry_safe(rcu_dereference_raw(hlist_nulls_next_rcu(&l
->hash_node
)),
865 struct htab_elem
, hash_node
);
868 /* if next elem in this hash list is non-zero, just return it */
869 memcpy(next_key
, next_l
->key
, key_size
);
873 /* no more elements in this hash list, go to the next bucket */
874 i
= hash
& (htab
->n_buckets
- 1);
878 /* iterate over buckets */
879 for (; i
< htab
->n_buckets
; i
++) {
880 head
= select_bucket(htab
, i
);
882 /* pick first element in the bucket */
883 next_l
= hlist_nulls_entry_safe(rcu_dereference_raw(hlist_nulls_first_rcu(head
)),
884 struct htab_elem
, hash_node
);
886 /* if it's not empty, just return it */
887 memcpy(next_key
, next_l
->key
, key_size
);
892 /* iterated over all buckets and all elements */
896 static void htab_elem_free(struct bpf_htab
*htab
, struct htab_elem
*l
)
898 check_and_free_fields(htab
, l
);
901 if (htab
->map
.map_type
== BPF_MAP_TYPE_PERCPU_HASH
)
902 bpf_mem_cache_free(&htab
->pcpu_ma
, l
->ptr_to_pptr
);
903 bpf_mem_cache_free(&htab
->ma
, l
);
907 static void htab_put_fd_value(struct bpf_htab
*htab
, struct htab_elem
*l
)
909 struct bpf_map
*map
= &htab
->map
;
912 if (map
->ops
->map_fd_put_ptr
) {
913 ptr
= fd_htab_map_get_ptr(map
, l
);
914 map
->ops
->map_fd_put_ptr(map
, ptr
, true);
918 static bool is_map_full(struct bpf_htab
*htab
)
920 if (htab
->use_percpu_counter
)
921 return __percpu_counter_compare(&htab
->pcount
, htab
->map
.max_entries
,
922 PERCPU_COUNTER_BATCH
) >= 0;
923 return atomic_read(&htab
->count
) >= htab
->map
.max_entries
;
926 static void inc_elem_count(struct bpf_htab
*htab
)
928 bpf_map_inc_elem_count(&htab
->map
);
930 if (htab
->use_percpu_counter
)
931 percpu_counter_add_batch(&htab
->pcount
, 1, PERCPU_COUNTER_BATCH
);
933 atomic_inc(&htab
->count
);
936 static void dec_elem_count(struct bpf_htab
*htab
)
938 bpf_map_dec_elem_count(&htab
->map
);
940 if (htab
->use_percpu_counter
)
941 percpu_counter_add_batch(&htab
->pcount
, -1, PERCPU_COUNTER_BATCH
);
943 atomic_dec(&htab
->count
);
947 static void free_htab_elem(struct bpf_htab
*htab
, struct htab_elem
*l
)
949 htab_put_fd_value(htab
, l
);
951 if (htab_is_prealloc(htab
)) {
952 bpf_map_dec_elem_count(&htab
->map
);
953 check_and_free_fields(htab
, l
);
954 pcpu_freelist_push(&htab
->freelist
, &l
->fnode
);
956 dec_elem_count(htab
);
957 htab_elem_free(htab
, l
);
961 static void pcpu_copy_value(struct bpf_htab
*htab
, void __percpu
*pptr
,
962 void *value
, bool onallcpus
)
965 /* copy true value_size bytes */
966 copy_map_value(&htab
->map
, this_cpu_ptr(pptr
), value
);
968 u32 size
= round_up(htab
->map
.value_size
, 8);
971 for_each_possible_cpu(cpu
) {
972 copy_map_value_long(&htab
->map
, per_cpu_ptr(pptr
, cpu
), value
+ off
);
978 static void pcpu_init_value(struct bpf_htab
*htab
, void __percpu
*pptr
,
979 void *value
, bool onallcpus
)
	/* When not setting the initial value on all cpus, zero-fill element
	 * values for the other cpus. Otherwise, the bpf program has no way to
	 * ensure known initial values for cpus other than the current one
	 * (onallcpus is always false when coming from a bpf prog).
	 */
987 int current_cpu
= raw_smp_processor_id();
990 for_each_possible_cpu(cpu
) {
991 if (cpu
== current_cpu
)
992 copy_map_value_long(&htab
->map
, per_cpu_ptr(pptr
, cpu
), value
);
993 else /* Since elem is preallocated, we cannot touch special fields */
994 zero_map_value(&htab
->map
, per_cpu_ptr(pptr
, cpu
));
997 pcpu_copy_value(htab
, pptr
, value
, onallcpus
);
1001 static bool fd_htab_map_needs_adjust(const struct bpf_htab
*htab
)
1003 return htab
->map
.map_type
== BPF_MAP_TYPE_HASH_OF_MAPS
&&
1004 BITS_PER_LONG
== 64;
1007 static struct htab_elem
*alloc_htab_elem(struct bpf_htab
*htab
, void *key
,
1008 void *value
, u32 key_size
, u32 hash
,
1009 bool percpu
, bool onallcpus
,
1010 struct htab_elem
*old_elem
)
1012 u32 size
= htab
->map
.value_size
;
1013 bool prealloc
= htab_is_prealloc(htab
);
1014 struct htab_elem
*l_new
, **pl_new
;
1015 void __percpu
*pptr
;
1019 /* if we're updating the existing element,
1020 * use per-cpu extra elems to avoid freelist_pop/push
1022 pl_new
= this_cpu_ptr(htab
->extra_elems
);
1026 struct pcpu_freelist_node
*l
;
1028 l
= __pcpu_freelist_pop(&htab
->freelist
);
1030 return ERR_PTR(-E2BIG
);
1031 l_new
= container_of(l
, struct htab_elem
, fnode
);
1032 bpf_map_inc_elem_count(&htab
->map
);
1035 if (is_map_full(htab
))
1037 /* when map is full and update() is replacing
1038 * old element, it's ok to allocate, since
1039 * old element will be freed immediately.
1040 * Otherwise return an error
1042 return ERR_PTR(-E2BIG
);
1043 inc_elem_count(htab
);
1044 l_new
= bpf_mem_cache_alloc(&htab
->ma
);
1046 l_new
= ERR_PTR(-ENOMEM
);
1051 memcpy(l_new
->key
, key
, key_size
);
1054 pptr
= htab_elem_get_ptr(l_new
, key_size
);
1056 /* alloc_percpu zero-fills */
1057 void *ptr
= bpf_mem_cache_alloc(&htab
->pcpu_ma
);
1060 bpf_mem_cache_free(&htab
->ma
, l_new
);
1061 l_new
= ERR_PTR(-ENOMEM
);
1064 l_new
->ptr_to_pptr
= ptr
;
1065 pptr
= *(void __percpu
**)ptr
;
1068 pcpu_init_value(htab
, pptr
, value
, onallcpus
);
1071 htab_elem_set_ptr(l_new
, key_size
, pptr
);
1072 } else if (fd_htab_map_needs_adjust(htab
)) {
1073 size
= round_up(size
, 8);
1074 memcpy(l_new
->key
+ round_up(key_size
, 8), value
, size
);
1076 copy_map_value(&htab
->map
,
1077 l_new
->key
+ round_up(key_size
, 8),
1084 dec_elem_count(htab
);
1088 static int check_flags(struct bpf_htab
*htab
, struct htab_elem
*l_old
,
1091 if (l_old
&& (map_flags
& ~BPF_F_LOCK
) == BPF_NOEXIST
)
1092 /* elem already exists */
1095 if (!l_old
&& (map_flags
& ~BPF_F_LOCK
) == BPF_EXIST
)
1096 /* elem doesn't exist, cannot update it */
1102 /* Called from syscall or from eBPF program */
1103 static long htab_map_update_elem(struct bpf_map
*map
, void *key
, void *value
,
1106 struct bpf_htab
*htab
= container_of(map
, struct bpf_htab
, map
);
1107 struct htab_elem
*l_new
= NULL
, *l_old
;
1108 struct hlist_nulls_head
*head
;
1109 unsigned long flags
;
1115 if (unlikely((map_flags
& ~BPF_F_LOCK
) > BPF_EXIST
))
1119 WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
1120 !rcu_read_lock_bh_held());
1122 key_size
= map
->key_size
;
1124 hash
= htab_map_hash(key
, key_size
, htab
->hashrnd
);
1126 b
= __select_bucket(htab
, hash
);
1129 if (unlikely(map_flags
& BPF_F_LOCK
)) {
1130 if (unlikely(!btf_record_has_field(map
->record
, BPF_SPIN_LOCK
)))
1132 /* find an element without taking the bucket lock */
1133 l_old
= lookup_nulls_elem_raw(head
, hash
, key
, key_size
,
1135 ret
= check_flags(htab
, l_old
, map_flags
);
1139 /* grab the element lock and update value in place */
1140 copy_map_value_locked(map
,
1141 l_old
->key
+ round_up(key_size
, 8),
1145 /* fall through, grab the bucket lock and lookup again.
1146 * 99.9% chance that the element won't be found,
1147 * but second lookup under lock has to be done.
1151 ret
= htab_lock_bucket(htab
, b
, hash
, &flags
);
1155 l_old
= lookup_elem_raw(head
, hash
, key
, key_size
);
1157 ret
= check_flags(htab
, l_old
, map_flags
);
1161 if (unlikely(l_old
&& (map_flags
& BPF_F_LOCK
))) {
1162 /* first lookup without the bucket lock didn't find the element,
1163 * but second lookup with the bucket lock found it.
1164 * This case is highly unlikely, but has to be dealt with:
1165 * grab the element lock in addition to the bucket lock
1166 * and update element in place
1168 copy_map_value_locked(map
,
1169 l_old
->key
+ round_up(key_size
, 8),
1175 l_new
= alloc_htab_elem(htab
, key
, value
, key_size
, hash
, false, false,
1177 if (IS_ERR(l_new
)) {
1178 /* all pre-allocated elements are in use or memory exhausted */
1179 ret
= PTR_ERR(l_new
);
1183 /* add new element to the head of the list, so that
1184 * concurrent search will find it before old elem
1186 hlist_nulls_add_head_rcu(&l_new
->hash_node
, head
);
1188 hlist_nulls_del_rcu(&l_old
->hash_node
);
1190 /* l_old has already been stashed in htab->extra_elems, free
1191 * its special fields before it is available for reuse. Also
1192 * save the old map pointer in htab of maps before unlock
1193 * and release it after unlock.
1196 if (htab_is_prealloc(htab
)) {
1197 if (map
->ops
->map_fd_put_ptr
)
1198 old_map_ptr
= fd_htab_map_get_ptr(map
, l_old
);
1199 check_and_free_fields(htab
, l_old
);
1202 htab_unlock_bucket(htab
, b
, hash
, flags
);
1205 map
->ops
->map_fd_put_ptr(map
, old_map_ptr
, true);
1206 if (!htab_is_prealloc(htab
))
1207 free_htab_elem(htab
, l_old
);
1211 htab_unlock_bucket(htab
, b
, hash
, flags
);
1215 static void htab_lru_push_free(struct bpf_htab
*htab
, struct htab_elem
*elem
)
1217 check_and_free_fields(htab
, elem
);
1218 bpf_map_dec_elem_count(&htab
->map
);
1219 bpf_lru_push_free(&htab
->lru
, &elem
->lru_node
);
1222 static long htab_lru_map_update_elem(struct bpf_map
*map
, void *key
, void *value
,
1225 struct bpf_htab
*htab
= container_of(map
, struct bpf_htab
, map
);
1226 struct htab_elem
*l_new
, *l_old
= NULL
;
1227 struct hlist_nulls_head
*head
;
1228 unsigned long flags
;
1233 if (unlikely(map_flags
> BPF_EXIST
))
1237 WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
1238 !rcu_read_lock_bh_held());
1240 key_size
= map
->key_size
;
1242 hash
= htab_map_hash(key
, key_size
, htab
->hashrnd
);
1244 b
= __select_bucket(htab
, hash
);
1247 /* For LRU, we need to alloc before taking bucket's
1248 * spinlock because getting free nodes from LRU may need
1249 * to remove older elements from htab and this removal
1250 * operation will need a bucket lock.
1252 l_new
= prealloc_lru_pop(htab
, key
, hash
);
1255 copy_map_value(&htab
->map
,
1256 l_new
->key
+ round_up(map
->key_size
, 8), value
);
1258 ret
= htab_lock_bucket(htab
, b
, hash
, &flags
);
1260 goto err_lock_bucket
;
1262 l_old
= lookup_elem_raw(head
, hash
, key
, key_size
);
1264 ret
= check_flags(htab
, l_old
, map_flags
);
1268 /* add new element to the head of the list, so that
1269 * concurrent search will find it before old elem
1271 hlist_nulls_add_head_rcu(&l_new
->hash_node
, head
);
1273 bpf_lru_node_set_ref(&l_new
->lru_node
);
1274 hlist_nulls_del_rcu(&l_old
->hash_node
);
1279 htab_unlock_bucket(htab
, b
, hash
, flags
);
1283 htab_lru_push_free(htab
, l_new
);
1285 htab_lru_push_free(htab
, l_old
);
1290 static long __htab_percpu_map_update_elem(struct bpf_map
*map
, void *key
,
1291 void *value
, u64 map_flags
,
1294 struct bpf_htab
*htab
= container_of(map
, struct bpf_htab
, map
);
1295 struct htab_elem
*l_new
= NULL
, *l_old
;
1296 struct hlist_nulls_head
*head
;
1297 unsigned long flags
;
1302 if (unlikely(map_flags
> BPF_EXIST
))
1306 WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
1307 !rcu_read_lock_bh_held());
1309 key_size
= map
->key_size
;
1311 hash
= htab_map_hash(key
, key_size
, htab
->hashrnd
);
1313 b
= __select_bucket(htab
, hash
);
1316 ret
= htab_lock_bucket(htab
, b
, hash
, &flags
);
1320 l_old
= lookup_elem_raw(head
, hash
, key
, key_size
);
1322 ret
= check_flags(htab
, l_old
, map_flags
);
1327 /* per-cpu hash map can update value in-place */
1328 pcpu_copy_value(htab
, htab_elem_get_ptr(l_old
, key_size
),
1331 l_new
= alloc_htab_elem(htab
, key
, value
, key_size
,
1332 hash
, true, onallcpus
, NULL
);
1333 if (IS_ERR(l_new
)) {
1334 ret
= PTR_ERR(l_new
);
1337 hlist_nulls_add_head_rcu(&l_new
->hash_node
, head
);
1341 htab_unlock_bucket(htab
, b
, hash
, flags
);
1345 static long __htab_lru_percpu_map_update_elem(struct bpf_map
*map
, void *key
,
1346 void *value
, u64 map_flags
,
1349 struct bpf_htab
*htab
= container_of(map
, struct bpf_htab
, map
);
1350 struct htab_elem
*l_new
= NULL
, *l_old
;
1351 struct hlist_nulls_head
*head
;
1352 unsigned long flags
;
1357 if (unlikely(map_flags
> BPF_EXIST
))
1361 WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
1362 !rcu_read_lock_bh_held());
1364 key_size
= map
->key_size
;
1366 hash
= htab_map_hash(key
, key_size
, htab
->hashrnd
);
1368 b
= __select_bucket(htab
, hash
);
1371 /* For LRU, we need to alloc before taking bucket's
1372 * spinlock because LRU's elem alloc may need
1373 * to remove older elem from htab and this removal
1374 * operation will need a bucket lock.
1376 if (map_flags
!= BPF_EXIST
) {
1377 l_new
= prealloc_lru_pop(htab
, key
, hash
);
1382 ret
= htab_lock_bucket(htab
, b
, hash
, &flags
);
1384 goto err_lock_bucket
;
1386 l_old
= lookup_elem_raw(head
, hash
, key
, key_size
);
1388 ret
= check_flags(htab
, l_old
, map_flags
);
1393 bpf_lru_node_set_ref(&l_old
->lru_node
);
1395 /* per-cpu hash map can update value in-place */
1396 pcpu_copy_value(htab
, htab_elem_get_ptr(l_old
, key_size
),
1399 pcpu_init_value(htab
, htab_elem_get_ptr(l_new
, key_size
),
1401 hlist_nulls_add_head_rcu(&l_new
->hash_node
, head
);
1406 htab_unlock_bucket(htab
, b
, hash
, flags
);
1409 bpf_map_dec_elem_count(&htab
->map
);
1410 bpf_lru_push_free(&htab
->lru
, &l_new
->lru_node
);
1415 static long htab_percpu_map_update_elem(struct bpf_map
*map
, void *key
,
1416 void *value
, u64 map_flags
)
1418 return __htab_percpu_map_update_elem(map
, key
, value
, map_flags
, false);
1421 static long htab_lru_percpu_map_update_elem(struct bpf_map
*map
, void *key
,
1422 void *value
, u64 map_flags
)
1424 return __htab_lru_percpu_map_update_elem(map
, key
, value
, map_flags
,
1428 /* Called from syscall or from eBPF program */
1429 static long htab_map_delete_elem(struct bpf_map
*map
, void *key
)
1431 struct bpf_htab
*htab
= container_of(map
, struct bpf_htab
, map
);
1432 struct hlist_nulls_head
*head
;
1434 struct htab_elem
*l
;
1435 unsigned long flags
;
1439 WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
1440 !rcu_read_lock_bh_held());
1442 key_size
= map
->key_size
;
1444 hash
= htab_map_hash(key
, key_size
, htab
->hashrnd
);
1445 b
= __select_bucket(htab
, hash
);
1448 ret
= htab_lock_bucket(htab
, b
, hash
, &flags
);
1452 l
= lookup_elem_raw(head
, hash
, key
, key_size
);
1454 hlist_nulls_del_rcu(&l
->hash_node
);
1458 htab_unlock_bucket(htab
, b
, hash
, flags
);
1461 free_htab_elem(htab
, l
);
1465 static long htab_lru_map_delete_elem(struct bpf_map
*map
, void *key
)
1467 struct bpf_htab
*htab
= container_of(map
, struct bpf_htab
, map
);
1468 struct hlist_nulls_head
*head
;
1470 struct htab_elem
*l
;
1471 unsigned long flags
;
1475 WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
1476 !rcu_read_lock_bh_held());
1478 key_size
= map
->key_size
;
1480 hash
= htab_map_hash(key
, key_size
, htab
->hashrnd
);
1481 b
= __select_bucket(htab
, hash
);
1484 ret
= htab_lock_bucket(htab
, b
, hash
, &flags
);
1488 l
= lookup_elem_raw(head
, hash
, key
, key_size
);
1491 hlist_nulls_del_rcu(&l
->hash_node
);
1495 htab_unlock_bucket(htab
, b
, hash
, flags
);
1497 htab_lru_push_free(htab
, l
);
1501 static void delete_all_elements(struct bpf_htab
*htab
)
1505 /* It's called from a worker thread, so disable migration here,
1506 * since bpf_mem_cache_free() relies on that.
1509 for (i
= 0; i
< htab
->n_buckets
; i
++) {
1510 struct hlist_nulls_head
*head
= select_bucket(htab
, i
);
1511 struct hlist_nulls_node
*n
;
1512 struct htab_elem
*l
;
1514 hlist_nulls_for_each_entry_safe(l
, n
, head
, hash_node
) {
1515 hlist_nulls_del_rcu(&l
->hash_node
);
1516 htab_elem_free(htab
, l
);
1523 static void htab_free_malloced_timers_and_wq(struct bpf_htab
*htab
)
1528 for (i
= 0; i
< htab
->n_buckets
; i
++) {
1529 struct hlist_nulls_head
*head
= select_bucket(htab
, i
);
1530 struct hlist_nulls_node
*n
;
1531 struct htab_elem
*l
;
1533 hlist_nulls_for_each_entry(l
, n
, head
, hash_node
) {
1534 /* We only free timer on uref dropping to zero */
1535 if (btf_record_has_field(htab
->map
.record
, BPF_TIMER
))
1536 bpf_obj_free_timer(htab
->map
.record
,
1537 l
->key
+ round_up(htab
->map
.key_size
, 8));
1538 if (btf_record_has_field(htab
->map
.record
, BPF_WORKQUEUE
))
1539 bpf_obj_free_workqueue(htab
->map
.record
,
1540 l
->key
+ round_up(htab
->map
.key_size
, 8));
1547 static void htab_map_free_timers_and_wq(struct bpf_map
*map
)
1549 struct bpf_htab
*htab
= container_of(map
, struct bpf_htab
, map
);
1551 /* We only free timer and workqueue on uref dropping to zero */
1552 if (btf_record_has_field(htab
->map
.record
, BPF_TIMER
| BPF_WORKQUEUE
)) {
1553 if (!htab_is_prealloc(htab
))
1554 htab_free_malloced_timers_and_wq(htab
);
1556 htab_free_prealloced_timers_and_wq(htab
);
1560 /* Called when map->refcnt goes to zero, either from workqueue or from syscall */
1561 static void htab_map_free(struct bpf_map
*map
)
1563 struct bpf_htab
*htab
= container_of(map
, struct bpf_htab
, map
);
1566 /* bpf_free_used_maps() or close(map_fd) will trigger this map_free callback.
1567 * bpf_free_used_maps() is called after bpf prog is no longer executing.
1568 * There is no need to synchronize_rcu() here to protect map elements.
1571 /* htab no longer uses call_rcu() directly. bpf_mem_alloc does it
1572 * underneath and is responsible for waiting for callbacks to finish
1573 * during bpf_mem_alloc_destroy().
1575 if (!htab_is_prealloc(htab
)) {
1576 delete_all_elements(htab
);
1578 htab_free_prealloced_fields(htab
);
1579 prealloc_destroy(htab
);
1582 bpf_map_free_elem_count(map
);
1583 free_percpu(htab
->extra_elems
);
1584 bpf_map_area_free(htab
->buckets
);
1585 bpf_mem_alloc_destroy(&htab
->pcpu_ma
);
1586 bpf_mem_alloc_destroy(&htab
->ma
);
1587 if (htab
->use_percpu_counter
)
1588 percpu_counter_destroy(&htab
->pcount
);
1589 for (i
= 0; i
< HASHTAB_MAP_LOCK_COUNT
; i
++)
1590 free_percpu(htab
->map_locked
[i
]);
1591 lockdep_unregister_key(&htab
->lockdep_key
);
1592 bpf_map_area_free(htab
);
1595 static void htab_map_seq_show_elem(struct bpf_map
*map
, void *key
,
1602 value
= htab_map_lookup_elem(map
, key
);
1608 btf_type_seq_show(map
->btf
, map
->btf_key_type_id
, key
, m
);
1610 btf_type_seq_show(map
->btf
, map
->btf_value_type_id
, value
, m
);
1616 static int __htab_map_lookup_and_delete_elem(struct bpf_map
*map
, void *key
,
1617 void *value
, bool is_lru_map
,
1618 bool is_percpu
, u64 flags
)
1620 struct bpf_htab
*htab
= container_of(map
, struct bpf_htab
, map
);
1621 struct hlist_nulls_head
*head
;
1622 unsigned long bflags
;
1623 struct htab_elem
*l
;
1628 key_size
= map
->key_size
;
1630 hash
= htab_map_hash(key
, key_size
, htab
->hashrnd
);
1631 b
= __select_bucket(htab
, hash
);
1634 ret
= htab_lock_bucket(htab
, b
, hash
, &bflags
);
1638 l
= lookup_elem_raw(head
, hash
, key
, key_size
);
1643 u32 roundup_value_size
= round_up(map
->value_size
, 8);
1644 void __percpu
*pptr
;
1647 pptr
= htab_elem_get_ptr(l
, key_size
);
1648 for_each_possible_cpu(cpu
) {
1649 copy_map_value_long(&htab
->map
, value
+ off
, per_cpu_ptr(pptr
, cpu
));
1650 check_and_init_map_value(&htab
->map
, value
+ off
);
1651 off
+= roundup_value_size
;
1654 u32 roundup_key_size
= round_up(map
->key_size
, 8);
1656 if (flags
& BPF_F_LOCK
)
1657 copy_map_value_locked(map
, value
, l
->key
+
1661 copy_map_value(map
, value
, l
->key
+
1663 /* Zeroing special fields in the temp buffer */
1664 check_and_init_map_value(map
, value
);
1667 hlist_nulls_del_rcu(&l
->hash_node
);
1669 free_htab_elem(htab
, l
);
1672 htab_unlock_bucket(htab
, b
, hash
, bflags
);
1674 if (is_lru_map
&& l
)
1675 htab_lru_push_free(htab
, l
);
1680 static int htab_map_lookup_and_delete_elem(struct bpf_map
*map
, void *key
,
1681 void *value
, u64 flags
)
1683 return __htab_map_lookup_and_delete_elem(map
, key
, value
, false, false,
1687 static int htab_percpu_map_lookup_and_delete_elem(struct bpf_map
*map
,
1688 void *key
, void *value
,
1691 return __htab_map_lookup_and_delete_elem(map
, key
, value
, false, true,
1695 static int htab_lru_map_lookup_and_delete_elem(struct bpf_map
*map
, void *key
,
1696 void *value
, u64 flags
)
1698 return __htab_map_lookup_and_delete_elem(map
, key
, value
, true, false,
1702 static int htab_lru_percpu_map_lookup_and_delete_elem(struct bpf_map
*map
,
1703 void *key
, void *value
,
1706 return __htab_map_lookup_and_delete_elem(map
, key
, value
, true, true,
1711 __htab_map_lookup_and_delete_batch(struct bpf_map
*map
,
1712 const union bpf_attr
*attr
,
1713 union bpf_attr __user
*uattr
,
1714 bool do_delete
, bool is_lru_map
,
1717 struct bpf_htab
*htab
= container_of(map
, struct bpf_htab
, map
);
1718 u32 bucket_cnt
, total
, key_size
, value_size
, roundup_key_size
;
1719 void *keys
= NULL
, *values
= NULL
, *value
, *dst_key
, *dst_val
;
1720 void __user
*uvalues
= u64_to_user_ptr(attr
->batch
.values
);
1721 void __user
*ukeys
= u64_to_user_ptr(attr
->batch
.keys
);
1722 void __user
*ubatch
= u64_to_user_ptr(attr
->batch
.in_batch
);
1723 u32 batch
, max_count
, size
, bucket_size
, map_id
;
1724 struct htab_elem
*node_to_free
= NULL
;
1725 u64 elem_map_flags
, map_flags
;
1726 struct hlist_nulls_head
*head
;
1727 struct hlist_nulls_node
*n
;
1728 unsigned long flags
= 0;
1729 bool locked
= false;
1730 struct htab_elem
*l
;
1734 elem_map_flags
= attr
->batch
.elem_flags
;
1735 if ((elem_map_flags
& ~BPF_F_LOCK
) ||
1736 ((elem_map_flags
& BPF_F_LOCK
) && !btf_record_has_field(map
->record
, BPF_SPIN_LOCK
)))
1739 map_flags
= attr
->batch
.flags
;
1743 max_count
= attr
->batch
.count
;
1747 if (put_user(0, &uattr
->batch
.count
))
1751 if (ubatch
&& copy_from_user(&batch
, ubatch
, sizeof(batch
)))
1754 if (batch
>= htab
->n_buckets
)
1757 key_size
= htab
->map
.key_size
;
1758 roundup_key_size
= round_up(htab
->map
.key_size
, 8);
1759 value_size
= htab
->map
.value_size
;
1760 size
= round_up(value_size
, 8);
1762 value_size
= size
* num_possible_cpus();
1764 /* while experimenting with hash tables with sizes ranging from 10 to
1765 * 1000, it was observed that a bucket can have up to 5 entries.
1770 /* We cannot do copy_from_user or copy_to_user inside
1771 * the rcu_read_lock. Allocate enough space here.
1773 keys
= kvmalloc_array(key_size
, bucket_size
, GFP_USER
| __GFP_NOWARN
);
1774 values
= kvmalloc_array(value_size
, bucket_size
, GFP_USER
| __GFP_NOWARN
);
1775 if (!keys
|| !values
) {
1781 bpf_disable_instrumentation();
1786 b
= &htab
->buckets
[batch
];
1788 /* do not grab the lock unless need it (bucket_cnt > 0). */
1790 ret
= htab_lock_bucket(htab
, b
, batch
, &flags
);
1793 bpf_enable_instrumentation();
1799 hlist_nulls_for_each_entry_rcu(l
, n
, head
, hash_node
)
1802 if (bucket_cnt
&& !locked
) {
1807 if (bucket_cnt
> (max_count
- total
)) {
		/* Note that since bucket_cnt > 0 here, it is implicit
		 * that the lock was grabbed, so release it.
		 */
1813 htab_unlock_bucket(htab
, b
, batch
, flags
);
1815 bpf_enable_instrumentation();
1819 if (bucket_cnt
> bucket_size
) {
1820 bucket_size
= bucket_cnt
;
		/* Note that since bucket_cnt > 0 here, it is implicit
		 * that the lock was grabbed, so release it.
		 */
1824 htab_unlock_bucket(htab
, b
, batch
, flags
);
1826 bpf_enable_instrumentation();
1832 /* Next block is only safe to run if you have grabbed the lock */
1836 hlist_nulls_for_each_entry_safe(l
, n
, head
, hash_node
) {
1837 memcpy(dst_key
, l
->key
, key_size
);
1841 void __percpu
*pptr
;
1843 pptr
= htab_elem_get_ptr(l
, map
->key_size
);
1844 for_each_possible_cpu(cpu
) {
1845 copy_map_value_long(&htab
->map
, dst_val
+ off
, per_cpu_ptr(pptr
, cpu
));
1846 check_and_init_map_value(&htab
->map
, dst_val
+ off
);
1850 value
= l
->key
+ roundup_key_size
;
1851 if (map
->map_type
== BPF_MAP_TYPE_HASH_OF_MAPS
) {
1852 struct bpf_map
**inner_map
= value
;
1854 /* Actual value is the id of the inner map */
1855 map_id
= map
->ops
->map_fd_sys_lookup_elem(*inner_map
);
1859 if (elem_map_flags
& BPF_F_LOCK
)
1860 copy_map_value_locked(map
, dst_val
, value
,
1863 copy_map_value(map
, dst_val
, value
);
1864 /* Zeroing special fields in the temp buffer */
1865 check_and_init_map_value(map
, dst_val
);
1868 hlist_nulls_del_rcu(&l
->hash_node
);
1870 /* bpf_lru_push_free() will acquire lru_lock, which
1871 * may cause deadlock. See comments in function
1872 * prealloc_lru_pop(). Let us do bpf_lru_push_free()
1873 * after releasing the bucket lock.
1875 * For htab of maps, htab_put_fd_value() in
1876 * free_htab_elem() may acquire a spinlock with bucket
1877 * lock being held and it violates the lock rule, so
1878 * invoke free_htab_elem() after unlock as well.
1880 l
->batch_flink
= node_to_free
;
1883 dst_key
+= key_size
;
1884 dst_val
+= value_size
;
1887 htab_unlock_bucket(htab
, b
, batch
, flags
);
1890 while (node_to_free
) {
1892 node_to_free
= node_to_free
->batch_flink
;
1894 htab_lru_push_free(htab
, l
);
1896 free_htab_elem(htab
, l
);
1900 /* If we are not copying data, we can go to next bucket and avoid
1901 * unlocking the rcu.
1903 if (!bucket_cnt
&& (batch
+ 1 < htab
->n_buckets
)) {
1909 bpf_enable_instrumentation();
1910 if (bucket_cnt
&& (copy_to_user(ukeys
+ total
* key_size
, keys
,
1911 key_size
* bucket_cnt
) ||
1912 copy_to_user(uvalues
+ total
* value_size
, values
,
1913 value_size
* bucket_cnt
))) {
1918 total
+= bucket_cnt
;
1920 if (batch
>= htab
->n_buckets
) {
1930 /* copy # of entries and next batch */
1931 ubatch
= u64_to_user_ptr(attr
->batch
.out_batch
);
1932 if (copy_to_user(ubatch
, &batch
, sizeof(batch
)) ||
1933 put_user(total
, &uattr
->batch
.count
))
1943 htab_percpu_map_lookup_batch(struct bpf_map
*map
, const union bpf_attr
*attr
,
1944 union bpf_attr __user
*uattr
)
1946 return __htab_map_lookup_and_delete_batch(map
, attr
, uattr
, false,
1951 htab_percpu_map_lookup_and_delete_batch(struct bpf_map
*map
,
1952 const union bpf_attr
*attr
,
1953 union bpf_attr __user
*uattr
)
1955 return __htab_map_lookup_and_delete_batch(map
, attr
, uattr
, true,
1960 htab_map_lookup_batch(struct bpf_map
*map
, const union bpf_attr
*attr
,
1961 union bpf_attr __user
*uattr
)
1963 return __htab_map_lookup_and_delete_batch(map
, attr
, uattr
, false,
1968 htab_map_lookup_and_delete_batch(struct bpf_map
*map
,
1969 const union bpf_attr
*attr
,
1970 union bpf_attr __user
*uattr
)
1972 return __htab_map_lookup_and_delete_batch(map
, attr
, uattr
, true,
1977 htab_lru_percpu_map_lookup_batch(struct bpf_map
*map
,
1978 const union bpf_attr
*attr
,
1979 union bpf_attr __user
*uattr
)
1981 return __htab_map_lookup_and_delete_batch(map
, attr
, uattr
, false,
1986 htab_lru_percpu_map_lookup_and_delete_batch(struct bpf_map
*map
,
1987 const union bpf_attr
*attr
,
1988 union bpf_attr __user
*uattr
)
1990 return __htab_map_lookup_and_delete_batch(map
, attr
, uattr
, true,
1995 htab_lru_map_lookup_batch(struct bpf_map
*map
, const union bpf_attr
*attr
,
1996 union bpf_attr __user
*uattr
)
1998 return __htab_map_lookup_and_delete_batch(map
, attr
, uattr
, false,
2003 htab_lru_map_lookup_and_delete_batch(struct bpf_map
*map
,
2004 const union bpf_attr
*attr
,
2005 union bpf_attr __user
*uattr
)
2007 return __htab_map_lookup_and_delete_batch(map
, attr
, uattr
, true,
2011 struct bpf_iter_seq_hash_map_info
{
2012 struct bpf_map
*map
;
2013 struct bpf_htab
*htab
;
2014 void *percpu_value_buf
; // non-zero means percpu hash
2019 static struct htab_elem
*
2020 bpf_hash_map_seq_find_next(struct bpf_iter_seq_hash_map_info
*info
,
2021 struct htab_elem
*prev_elem
)
2023 const struct bpf_htab
*htab
= info
->htab
;
2024 u32 skip_elems
= info
->skip_elems
;
2025 u32 bucket_id
= info
->bucket_id
;
2026 struct hlist_nulls_head
*head
;
2027 struct hlist_nulls_node
*n
;
2028 struct htab_elem
*elem
;
2032 if (bucket_id
>= htab
->n_buckets
)
2035 /* try to find next elem in the same bucket */
2037 /* no update/deletion on this bucket, prev_elem should be still valid
2038 * and we won't skip elements.
2040 n
= rcu_dereference_raw(hlist_nulls_next_rcu(&prev_elem
->hash_node
));
2041 elem
= hlist_nulls_entry_safe(n
, struct htab_elem
, hash_node
);
2045 /* not found, unlock and go to the next bucket */
2046 b
= &htab
->buckets
[bucket_id
++];
2051 for (i
= bucket_id
; i
< htab
->n_buckets
; i
++) {
2052 b
= &htab
->buckets
[i
];
2057 hlist_nulls_for_each_entry_rcu(elem
, n
, head
, hash_node
) {
2058 if (count
>= skip_elems
) {
2059 info
->bucket_id
= i
;
2060 info
->skip_elems
= count
;
2070 info
->bucket_id
= i
;
2071 info
->skip_elems
= 0;
2075 static void *bpf_hash_map_seq_start(struct seq_file
*seq
, loff_t
*pos
)
2077 struct bpf_iter_seq_hash_map_info
*info
= seq
->private;
2078 struct htab_elem
*elem
;
2080 elem
= bpf_hash_map_seq_find_next(info
, NULL
);
2089 static void *bpf_hash_map_seq_next(struct seq_file
*seq
, void *v
, loff_t
*pos
)
2091 struct bpf_iter_seq_hash_map_info
*info
= seq
->private;
2095 return bpf_hash_map_seq_find_next(info
, v
);
2098 static int __bpf_hash_map_seq_show(struct seq_file
*seq
, struct htab_elem
*elem
)
2100 struct bpf_iter_seq_hash_map_info
*info
= seq
->private;
2101 u32 roundup_key_size
, roundup_value_size
;
2102 struct bpf_iter__bpf_map_elem ctx
= {};
2103 struct bpf_map
*map
= info
->map
;
2104 struct bpf_iter_meta meta
;
2105 int ret
= 0, off
= 0, cpu
;
2106 struct bpf_prog
*prog
;
2107 void __percpu
*pptr
;
2110 prog
= bpf_iter_get_info(&meta
, elem
== NULL
);
2113 ctx
.map
= info
->map
;
2115 roundup_key_size
= round_up(map
->key_size
, 8);
2116 ctx
.key
= elem
->key
;
2117 if (!info
->percpu_value_buf
) {
2118 ctx
.value
= elem
->key
+ roundup_key_size
;
2120 roundup_value_size
= round_up(map
->value_size
, 8);
2121 pptr
= htab_elem_get_ptr(elem
, map
->key_size
);
2122 for_each_possible_cpu(cpu
) {
2123 copy_map_value_long(map
, info
->percpu_value_buf
+ off
,
2124 per_cpu_ptr(pptr
, cpu
));
2125 check_and_init_map_value(map
, info
->percpu_value_buf
+ off
);
2126 off
+= roundup_value_size
;
2128 ctx
.value
= info
->percpu_value_buf
;
2131 ret
= bpf_iter_run_prog(prog
, &ctx
);
2137 static int bpf_hash_map_seq_show(struct seq_file
*seq
, void *v
)
2139 return __bpf_hash_map_seq_show(seq
, v
);
2142 static void bpf_hash_map_seq_stop(struct seq_file
*seq
, void *v
)
2145 (void)__bpf_hash_map_seq_show(seq
, NULL
);
2150 static int bpf_iter_init_hash_map(void *priv_data
,
2151 struct bpf_iter_aux_info
*aux
)
2153 struct bpf_iter_seq_hash_map_info
*seq_info
= priv_data
;
2154 struct bpf_map
*map
= aux
->map
;
2158 if (map
->map_type
== BPF_MAP_TYPE_PERCPU_HASH
||
2159 map
->map_type
== BPF_MAP_TYPE_LRU_PERCPU_HASH
) {
2160 buf_size
= round_up(map
->value_size
, 8) * num_possible_cpus();
2161 value_buf
= kmalloc(buf_size
, GFP_USER
| __GFP_NOWARN
);
2165 seq_info
->percpu_value_buf
= value_buf
;
2168 bpf_map_inc_with_uref(map
);
2169 seq_info
->map
= map
;
2170 seq_info
->htab
= container_of(map
, struct bpf_htab
, map
);
2174 static void bpf_iter_fini_hash_map(void *priv_data
)
2176 struct bpf_iter_seq_hash_map_info
*seq_info
= priv_data
;
2178 bpf_map_put_with_uref(seq_info
->map
);
2179 kfree(seq_info
->percpu_value_buf
);
2182 static const struct seq_operations bpf_hash_map_seq_ops
= {
2183 .start
= bpf_hash_map_seq_start
,
2184 .next
= bpf_hash_map_seq_next
,
2185 .stop
= bpf_hash_map_seq_stop
,
2186 .show
= bpf_hash_map_seq_show
,
2189 static const struct bpf_iter_seq_info iter_seq_info
= {
2190 .seq_ops
= &bpf_hash_map_seq_ops
,
2191 .init_seq_private
= bpf_iter_init_hash_map
,
2192 .fini_seq_private
= bpf_iter_fini_hash_map
,
2193 .seq_priv_size
= sizeof(struct bpf_iter_seq_hash_map_info
),
2196 static long bpf_for_each_hash_elem(struct bpf_map
*map
, bpf_callback_t callback_fn
,
2197 void *callback_ctx
, u64 flags
)
2199 struct bpf_htab
*htab
= container_of(map
, struct bpf_htab
, map
);
2200 struct hlist_nulls_head
*head
;
2201 struct hlist_nulls_node
*n
;
2202 struct htab_elem
*elem
;
2203 u32 roundup_key_size
;
2204 int i
, num_elems
= 0;
2205 void __percpu
*pptr
;
2214 is_percpu
= htab_is_percpu(htab
);
2216 roundup_key_size
= round_up(map
->key_size
, 8);
2217 /* disable migration so percpu value prepared here will be the
2218 * same as the one seen by the bpf program with bpf_map_lookup_elem().
2222 for (i
= 0; i
< htab
->n_buckets
; i
++) {
2223 b
= &htab
->buckets
[i
];
2226 hlist_nulls_for_each_entry_rcu(elem
, n
, head
, hash_node
) {
2229 /* current cpu value for percpu map */
2230 pptr
= htab_elem_get_ptr(elem
, map
->key_size
);
2231 val
= this_cpu_ptr(pptr
);
2233 val
= elem
->key
+ roundup_key_size
;
2236 ret
= callback_fn((u64
)(long)map
, (u64
)(long)key
,
2237 (u64
)(long)val
, (u64
)(long)callback_ctx
, 0);
2238 /* return value: 0 - continue, 1 - stop and return */
static u64 htab_map_mem_usage(const struct bpf_map *map)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	u32 value_size = round_up(htab->map.value_size, 8);
	bool prealloc = htab_is_prealloc(htab);
	bool percpu = htab_is_percpu(htab);
	bool lru = htab_is_lru(htab);
	u64 num_entries;
	u64 usage = sizeof(struct bpf_htab);

	usage += sizeof(struct bucket) * htab->n_buckets;
	usage += sizeof(int) * num_possible_cpus() * HASHTAB_MAP_LOCK_COUNT;
	if (prealloc) {
		num_entries = map->max_entries;
		if (htab_has_extra_elems(htab))
			num_entries += num_possible_cpus();

		usage += htab->elem_size * num_entries;

		if (percpu)
			usage += value_size * num_possible_cpus() * num_entries;
		else if (!lru)
			usage += sizeof(struct htab_elem *) * num_possible_cpus();
	} else {
#define LLIST_NODE_SZ sizeof(struct llist_node)

		num_entries = htab->use_percpu_counter ?
				      percpu_counter_sum(&htab->pcount) :
				      atomic_read(&htab->count);
		usage += (htab->elem_size + LLIST_NODE_SZ) * num_entries;
		if (percpu) {
			usage += (LLIST_NODE_SZ + sizeof(void *)) * num_entries;
			usage += value_size * num_possible_cpus() * num_entries;
		}
	}
	return usage;
}

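/* Worked example (illustrative only; struct sizes depend on config and
 * architecture, and elem_size here is an assumed value): a preallocated,
 * non-per-CPU, non-LRU hash map with max_entries = 1000 (so n_buckets =
 * 1024), elem_size = 48 and 4 possible CPUs is charged roughly
 *
 *	sizeof(struct bpf_htab)
 *	+ 1024 * sizeof(struct bucket)		// bucket array
 *	+ 4 * 8 * sizeof(int)			// per-CPU map_locked counters
 *	+ (1000 + 4) * 48			// prealloc'd elems + per-CPU extra elems
 *	+ 4 * sizeof(struct htab_elem *)	// extra_elems pointer array
 */
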
BTF_ID_LIST_SINGLE(htab_map_btf_ids, struct, bpf_htab)
const struct bpf_map_ops htab_map_ops = {
	.map_meta_equal = bpf_map_meta_equal,
	.map_alloc_check = htab_map_alloc_check,
	.map_alloc = htab_map_alloc,
	.map_free = htab_map_free,
	.map_get_next_key = htab_map_get_next_key,
	.map_release_uref = htab_map_free_timers_and_wq,
	.map_lookup_elem = htab_map_lookup_elem,
	.map_lookup_and_delete_elem = htab_map_lookup_and_delete_elem,
	.map_update_elem = htab_map_update_elem,
	.map_delete_elem = htab_map_delete_elem,
	.map_gen_lookup = htab_map_gen_lookup,
	.map_seq_show_elem = htab_map_seq_show_elem,
	.map_set_for_each_callback_args = map_set_for_each_callback_args,
	.map_for_each_callback = bpf_for_each_hash_elem,
	.map_mem_usage = htab_map_mem_usage,
	BATCH_OPS(htab),
	.map_btf_id = &htab_map_btf_ids[0],
	.iter_seq_info = &iter_seq_info,
};

const struct bpf_map_ops htab_lru_map_ops = {
	.map_meta_equal = bpf_map_meta_equal,
	.map_alloc_check = htab_map_alloc_check,
	.map_alloc = htab_map_alloc,
	.map_free = htab_map_free,
	.map_get_next_key = htab_map_get_next_key,
	.map_release_uref = htab_map_free_timers_and_wq,
	.map_lookup_elem = htab_lru_map_lookup_elem,
	.map_lookup_and_delete_elem = htab_lru_map_lookup_and_delete_elem,
	.map_lookup_elem_sys_only = htab_lru_map_lookup_elem_sys,
	.map_update_elem = htab_lru_map_update_elem,
	.map_delete_elem = htab_lru_map_delete_elem,
	.map_gen_lookup = htab_lru_map_gen_lookup,
	.map_seq_show_elem = htab_map_seq_show_elem,
	.map_set_for_each_callback_args = map_set_for_each_callback_args,
	.map_for_each_callback = bpf_for_each_hash_elem,
	.map_mem_usage = htab_map_mem_usage,
	BATCH_OPS(htab_lru),
	.map_btf_id = &htab_map_btf_ids[0],
	.iter_seq_info = &iter_seq_info,
};

/* Called from eBPF program */
static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct htab_elem *l = __htab_map_lookup_elem(map, key);

	if (l)
		return this_cpu_ptr(htab_elem_get_ptr(l, map->key_size));
	else
		return NULL;
}

/* inline bpf_map_lookup_elem() call for per-CPU hashmap */
static int htab_percpu_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
{
	struct bpf_insn *insn = insn_buf;

	if (!bpf_jit_supports_percpu_insn())
		return -EOPNOTSUPP;

	BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem,
		     (void *(*)(struct bpf_map *map, void *key))NULL));
	*insn++ = BPF_EMIT_CALL(__htab_map_lookup_elem);
	*insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 3);
	*insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_0,
				offsetof(struct htab_elem, key) + map->key_size);
	*insn++ = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0);
	*insn++ = BPF_MOV64_PERCPU_REG(BPF_REG_0, BPF_REG_0);

	return insn - insn_buf;
}

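/* Illustrative note: the instruction sequence emitted above is roughly the
 * inline form of
 *
 *	l = __htab_map_lookup_elem(map, key);
 *	if (!l)
 *		return NULL;
 *	pptr = *(void __percpu **)((void *)l +
 *				   offsetof(struct htab_elem, key) + map->key_size);
 *	return this_cpu_ptr(pptr);
 *
 * with the final this_cpu_ptr() conversion performed by the JIT via
 * BPF_MOV64_PERCPU_REG (hence the bpf_jit_supports_percpu_insn() check).
 */
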
static void *htab_percpu_map_lookup_percpu_elem(struct bpf_map *map, void *key, u32 cpu)
{
	struct htab_elem *l;

	if (cpu >= nr_cpu_ids)
		return NULL;

	l = __htab_map_lookup_elem(map, key);
	if (l)
		return per_cpu_ptr(htab_elem_get_ptr(l, map->key_size), cpu);
	else
		return NULL;
}

static void *htab_lru_percpu_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct htab_elem *l = __htab_map_lookup_elem(map, key);

	if (l) {
		bpf_lru_node_set_ref(&l->lru_node);
		return this_cpu_ptr(htab_elem_get_ptr(l, map->key_size));
	}

	return NULL;
}

static void *htab_lru_percpu_map_lookup_percpu_elem(struct bpf_map *map, void *key, u32 cpu)
{
	struct htab_elem *l;

	if (cpu >= nr_cpu_ids)
		return NULL;

	l = __htab_map_lookup_elem(map, key);
	if (l) {
		bpf_lru_node_set_ref(&l->lru_node);
		return per_cpu_ptr(htab_elem_get_ptr(l, map->key_size), cpu);
	}

	return NULL;
}

int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value)
{
	struct htab_elem *l;
	void __percpu *pptr;
	int ret = -ENOENT;
	int cpu, off = 0;
	u32 size;

	/* per_cpu areas are zero-filled and bpf programs can only
	 * access 'value_size' of them, so copying rounded areas
	 * will not leak any kernel data
	 */
	size = round_up(map->value_size, 8);
	rcu_read_lock();
	l = __htab_map_lookup_elem(map, key);
	if (!l)
		goto out;
	/* We do not mark LRU map element here in order to not mess up
	 * eviction heuristics when user space does a map walk.
	 */
	pptr = htab_elem_get_ptr(l, map->key_size);
	for_each_possible_cpu(cpu) {
		copy_map_value_long(map, value + off, per_cpu_ptr(pptr, cpu));
		check_and_init_map_value(map, value + off);
		off += size;
	}
	ret = 0;
out:
	rcu_read_unlock();
	return ret;
}

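/* Illustrative userspace counterpart, not part of this file: reading one key
 * of a per-CPU hash map via the syscall path that ends up in
 * bpf_percpu_hash_copy(). The map fd, the u32 key and the u64 value type are
 * assumptions; the buffer must hold one 8-byte-rounded value per possible
 * CPU.
 *
 *	#include <errno.h>
 *	#include <stdio.h>
 *	#include <stdlib.h>
 *	#include <bpf/bpf.h>
 *	#include <bpf/libbpf.h>
 *
 *	int read_percpu_value(int map_fd, __u32 key)
 *	{
 *		int ncpus = libbpf_num_possible_cpus();
 *		__u64 *values;
 *		int err, cpu;
 *
 *		if (ncpus < 0)
 *			return ncpus;
 *		values = calloc(ncpus, sizeof(*values));
 *		if (!values)
 *			return -ENOMEM;
 *		err = bpf_map_lookup_elem(map_fd, &key, values);
 *		if (!err)
 *			for (cpu = 0; cpu < ncpus; cpu++)
 *				printf("cpu%d: %llu\n", cpu,
 *				       (unsigned long long)values[cpu]);
 *		free(values);
 *		return err;
 *	}
 */
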
int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value,
			   u64 map_flags)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	int ret;

	rcu_read_lock();
	if (htab_is_lru(htab))
		ret = __htab_lru_percpu_map_update_elem(map, key, value,
							map_flags, true);
	else
		ret = __htab_percpu_map_update_elem(map, key, value, map_flags,
						    true);
	rcu_read_unlock();

	return ret;
}

static void htab_percpu_map_seq_show_elem(struct bpf_map *map, void *key,
					  struct seq_file *m)
{
	struct htab_elem *l;
	void __percpu *pptr;
	int cpu;

	rcu_read_lock();

	l = __htab_map_lookup_elem(map, key);
	if (!l) {
		rcu_read_unlock();
		return;
	}

	btf_type_seq_show(map->btf, map->btf_key_type_id, key, m);
	seq_puts(m, ": {\n");
	pptr = htab_elem_get_ptr(l, map->key_size);
	for_each_possible_cpu(cpu) {
		seq_printf(m, "\tcpu%d: ", cpu);
		btf_type_seq_show(map->btf, map->btf_value_type_id,
				  per_cpu_ptr(pptr, cpu), m);
		seq_puts(m, "\n");
	}
	seq_puts(m, "}\n");

	rcu_read_unlock();
}

const struct bpf_map_ops htab_percpu_map_ops = {
	.map_meta_equal = bpf_map_meta_equal,
	.map_alloc_check = htab_map_alloc_check,
	.map_alloc = htab_map_alloc,
	.map_free = htab_map_free,
	.map_get_next_key = htab_map_get_next_key,
	.map_lookup_elem = htab_percpu_map_lookup_elem,
	.map_gen_lookup = htab_percpu_map_gen_lookup,
	.map_lookup_and_delete_elem = htab_percpu_map_lookup_and_delete_elem,
	.map_update_elem = htab_percpu_map_update_elem,
	.map_delete_elem = htab_map_delete_elem,
	.map_lookup_percpu_elem = htab_percpu_map_lookup_percpu_elem,
	.map_seq_show_elem = htab_percpu_map_seq_show_elem,
	.map_set_for_each_callback_args = map_set_for_each_callback_args,
	.map_for_each_callback = bpf_for_each_hash_elem,
	.map_mem_usage = htab_map_mem_usage,
	BATCH_OPS(htab_percpu),
	.map_btf_id = &htab_map_btf_ids[0],
	.iter_seq_info = &iter_seq_info,
};

const struct bpf_map_ops htab_lru_percpu_map_ops = {
	.map_meta_equal = bpf_map_meta_equal,
	.map_alloc_check = htab_map_alloc_check,
	.map_alloc = htab_map_alloc,
	.map_free = htab_map_free,
	.map_get_next_key = htab_map_get_next_key,
	.map_lookup_elem = htab_lru_percpu_map_lookup_elem,
	.map_lookup_and_delete_elem = htab_lru_percpu_map_lookup_and_delete_elem,
	.map_update_elem = htab_lru_percpu_map_update_elem,
	.map_delete_elem = htab_lru_map_delete_elem,
	.map_lookup_percpu_elem = htab_lru_percpu_map_lookup_percpu_elem,
	.map_seq_show_elem = htab_percpu_map_seq_show_elem,
	.map_set_for_each_callback_args = map_set_for_each_callback_args,
	.map_for_each_callback = bpf_for_each_hash_elem,
	.map_mem_usage = htab_map_mem_usage,
	BATCH_OPS(htab_lru_percpu),
	.map_btf_id = &htab_map_btf_ids[0],
	.iter_seq_info = &iter_seq_info,
};

static int fd_htab_map_alloc_check(union bpf_attr *attr)
{
	if (attr->value_size != sizeof(u32))
		return -EINVAL;
	return htab_map_alloc_check(attr);
}

static void fd_htab_map_free(struct bpf_map *map)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct hlist_nulls_node *n;
	struct hlist_nulls_head *head;
	struct htab_elem *l;
	int i;

	for (i = 0; i < htab->n_buckets; i++) {
		head = select_bucket(htab, i);

		hlist_nulls_for_each_entry_safe(l, n, head, hash_node) {
			void *ptr = fd_htab_map_get_ptr(map, l);

			map->ops->map_fd_put_ptr(map, ptr, false);
		}
	}
}

/* only called from syscall */
int bpf_fd_htab_map_lookup_elem(struct bpf_map *map, void *key, u32 *value)
{
	void **ptr;
	int ret = 0;

	if (!map->ops->map_fd_sys_lookup_elem)
		return -ENOTSUPP;

	rcu_read_lock();
	ptr = htab_map_lookup_elem(map, key);
	if (ptr)
		*value = map->ops->map_fd_sys_lookup_elem(READ_ONCE(*ptr));
	else
		ret = -ENOENT;
	rcu_read_unlock();

	return ret;
}

/* only called from syscall */
int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file,
				void *key, void *value, u64 map_flags)
{
	void *ptr;
	int ret;
	u32 ufd = *(u32 *)value;

	ptr = map->ops->map_fd_get_ptr(map, map_file, ufd);
	if (IS_ERR(ptr))
		return PTR_ERR(ptr);

	/* The htab bucket lock is always held during update operations in fd
	 * htab map, and the following rcu_read_lock() is only used to avoid
	 * the WARN_ON_ONCE in htab_map_update_elem().
	 */
	rcu_read_lock();
	ret = htab_map_update_elem(map, key, &ptr, map_flags);
	rcu_read_unlock();
	if (ret)
		map->ops->map_fd_put_ptr(map, ptr, false);

	return ret;
}

static struct bpf_map *htab_of_map_alloc(union bpf_attr *attr)
{
	struct bpf_map *map, *inner_map_meta;

	inner_map_meta = bpf_map_meta_alloc(attr->inner_map_fd);
	if (IS_ERR(inner_map_meta))
		return inner_map_meta;

	map = htab_map_alloc(attr);
	if (IS_ERR(map)) {
		bpf_map_meta_free(inner_map_meta);
		return map;
	}

	map->inner_map_meta = inner_map_meta;

	return map;
}

static void *htab_of_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct bpf_map **inner_map = htab_map_lookup_elem(map, key);

	if (!inner_map)
		return NULL;

	return READ_ONCE(*inner_map);
}

static int htab_of_map_gen_lookup(struct bpf_map *map,
				  struct bpf_insn *insn_buf)
{
	struct bpf_insn *insn = insn_buf;
	const int ret = BPF_REG_0;

	BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem,
		     (void *(*)(struct bpf_map *map, void *key))NULL));
	*insn++ = BPF_EMIT_CALL(__htab_map_lookup_elem);
	*insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 2);
	*insn++ = BPF_ALU64_IMM(BPF_ADD, ret,
				offsetof(struct htab_elem, key) +
				round_up(map->key_size, 8));
	*insn++ = BPF_LDX_MEM(BPF_DW, ret, ret, 0);

	return insn - insn_buf;
}

static void htab_of_map_free(struct bpf_map *map)
{
	bpf_map_meta_free(map->inner_map_meta);
	fd_htab_map_free(map);
}

const struct bpf_map_ops htab_of_maps_map_ops = {
	.map_alloc_check = fd_htab_map_alloc_check,
	.map_alloc = htab_of_map_alloc,
	.map_free = htab_of_map_free,
	.map_get_next_key = htab_map_get_next_key,
	.map_lookup_elem = htab_of_map_lookup_elem,
	.map_delete_elem = htab_map_delete_elem,
	.map_fd_get_ptr = bpf_map_fd_get_ptr,
	.map_fd_put_ptr = bpf_map_fd_put_ptr,
	.map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem,
	.map_gen_lookup = htab_of_map_gen_lookup,
	.map_check_btf = map_check_no_btf,
	.map_mem_usage = htab_map_mem_usage,
	BATCH_OPS(htab),
	.map_btf_id = &htab_map_btf_ids[0],
};

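/* Illustrative sketch, not part of this file: declaring and using a
 * BPF_MAP_TYPE_HASH_OF_MAPS map, which is what htab_of_maps_map_ops backs.
 * The map names, key/value types and the program section are assumptions for
 * the example; userspace stores inner-map FDs as the outer values, while the
 * program receives inner-map pointers from the lookup above.
 *
 *	struct inner_array {
 *		__uint(type, BPF_MAP_TYPE_ARRAY);
 *		__uint(max_entries, 1);
 *		__type(key, u32);
 *		__type(value, u64);
 *	} inner_map1 SEC(".maps");
 *
 *	struct {
 *		__uint(type, BPF_MAP_TYPE_HASH_OF_MAPS);
 *		__uint(max_entries, 16);
 *		__uint(key_size, sizeof(u32));
 *		__array(values, struct inner_array);
 *	} outer_map SEC(".maps") = {
 *		.values = { [0] = &inner_map1 },
 *	};
 *
 *	SEC("tp/syscalls/sys_enter_getpid")
 *	int use_inner(void *ctx)
 *	{
 *		u32 outer_key = 0, inner_key = 0;
 *		void *inner;
 *		u64 *val;
 *
 *		inner = bpf_map_lookup_elem(&outer_map, &outer_key);
 *		if (!inner)
 *			return 0;
 *		val = bpf_map_lookup_elem(inner, &inner_key);
 *		if (val)
 *			__sync_fetch_and_add(val, 1);
 *		return 0;
 *	}
 */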