// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
 * Copyright (c) 2016 Facebook
 */
#include <linux/bpf.h>
#include <linux/btf.h>
#include <linux/jhash.h>
#include <linux/filter.h>
#include <linux/rculist_nulls.h>
#include <linux/random.h>
#include <uapi/linux/btf.h>
#include <linux/rcupdate_trace.h>
#include "percpu_freelist.h"
#include "bpf_lru_list.h"
#include "map_in_map.h"
#define HTAB_CREATE_FLAG_MASK						\
	(BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE |	\
	 BPF_F_ACCESS_MASK | BPF_F_ZERO_SEED)
#define BATCH_OPS(_name)			\
	.map_lookup_batch =			\
	_name##_map_lookup_batch,		\
	.map_lookup_and_delete_batch =		\
	_name##_map_lookup_and_delete_batch,	\
	.map_update_batch =			\
	generic_map_update_batch,		\
	.map_delete_batch =			\
	generic_map_delete_batch
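/*
 * For reference, given the macro above, BATCH_OPS(htab) expands to the
 * following designated initializers inside a struct bpf_map_ops definition:
 *
 *	.map_lookup_batch = htab_map_lookup_batch,
 *	.map_lookup_and_delete_batch = htab_map_lookup_and_delete_batch,
 *	.map_update_batch = generic_map_update_batch,
 *	.map_delete_batch = generic_map_delete_batch
 */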
/*
 * The bucket lock has two protection scopes:
 *
 * 1) Serializing concurrent operations from BPF programs on different
 *    CPUs
 *
 * 2) Serializing concurrent operations from BPF programs and sys_bpf()
 *
 * BPF programs can execute in any context including perf, kprobes and
 * tracing. As there are almost no limits where perf, kprobes and tracing
 * can be invoked from, the lock operations need to be protected against
 * deadlocks. Deadlocks can be caused by recursion and by an invocation in
 * the lock held section when functions which acquire this lock are invoked
 * from sys_bpf(). BPF recursion is prevented by incrementing the per CPU
 * variable bpf_prog_active, which prevents BPF programs attached to perf
 * events, kprobes and tracing from being invoked before the prior
 * invocation from one of these contexts has completed. sys_bpf() uses the
 * same mechanism by pinning the task to the current CPU and incrementing
 * the recursion protection across the map operation.
 *
 * This has subtle implications on PREEMPT_RT. PREEMPT_RT forbids certain
 * operations like memory allocations (even with GFP_ATOMIC) from atomic
 * contexts. This is required because even with GFP_ATOMIC the memory
 * allocator calls into code paths which acquire locks with long held lock
 * sections. To ensure the deterministic behaviour these locks are regular
 * spinlocks, which are converted to 'sleepable' spinlocks on RT. The only
 * true atomic contexts on an RT kernel are the low level hardware
 * handling, scheduling, low level interrupt handling, NMIs etc. None of
 * these contexts should ever do memory allocations.
 *
 * As regular device interrupt handlers and soft interrupts are forced into
 * thread context, the existing code which does
 *   spin_lock*(); alloc(GFP_ATOMIC); spin_unlock*();
 * just works.
 *
 * In theory the BPF locks could be converted to regular spinlocks as well,
 * but the bucket locks and percpu_freelist locks can be taken from
 * arbitrary contexts (perf, kprobes, tracepoints) which are required to be
 * atomic contexts even on RT. These mechanisms require preallocated maps,
 * so there is no need to invoke memory allocations within the lock held
 * sections.
 *
 * BPF maps which need dynamic allocation are only used from (forced)
 * thread context on RT and can therefore use regular spinlocks which in
 * turn allows invoking memory allocations from the lock held section.
 *
 * On a non RT kernel this distinction is neither possible nor required.
 * spinlock maps to raw_spinlock and the extra code is optimized out by the
 * compiler.
 */
struct bucket {
	struct hlist_nulls_head head;
	union {
		raw_spinlock_t raw_lock;
		spinlock_t     lock;
	};
};

#define HASHTAB_MAP_LOCK_COUNT 8
#define HASHTAB_MAP_LOCK_MASK (HASHTAB_MAP_LOCK_COUNT - 1)
struct bpf_htab {
	struct bpf_map map;
	struct bucket *buckets;
	void *elems;
	union {
		struct pcpu_freelist freelist;
		struct bpf_lru lru;
	};
	struct htab_elem *__percpu *extra_elems;
	atomic_t count;	/* number of elements in this hashtable */
	u32 n_buckets;	/* number of hash buckets */
	u32 elem_size;	/* size of each element in bytes */
	u32 hashrnd;
	struct lock_class_key lockdep_key;
	int __percpu *map_locked[HASHTAB_MAP_LOCK_COUNT];
};
/* each htab element is struct htab_elem + key + value */
struct htab_elem {
	union {
		struct hlist_nulls_node hash_node;
		struct {
			void *padding;
			union {
				struct bpf_htab *htab;
				struct pcpu_freelist_node fnode;
				struct htab_elem *batch_flink;
			};
		};
	};
	union {
		struct rcu_head rcu;
		struct bpf_lru_node lru_node;
	};
	u32 hash;
	char key[] __aligned(8);
};
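/*
 * Layout summary derived from the helpers below: for regular maps the
 * value lives directly behind the key, at l->key + round_up(key_size, 8);
 * for per-cpu maps that slot instead holds a void __percpu * which
 * htab_elem_set_ptr()/htab_elem_get_ptr() store and fetch, while the
 * actual values live in per-cpu memory.
 */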
static inline bool htab_is_prealloc(const struct bpf_htab *htab)
{
	return !(htab->map.map_flags & BPF_F_NO_PREALLOC);
}

static inline bool htab_use_raw_lock(const struct bpf_htab *htab)
{
	return (!IS_ENABLED(CONFIG_PREEMPT_RT) || htab_is_prealloc(htab));
}
static void htab_init_buckets(struct bpf_htab *htab)
{
	unsigned int i;

	for (i = 0; i < htab->n_buckets; i++) {
		INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i);
		if (htab_use_raw_lock(htab)) {
			raw_spin_lock_init(&htab->buckets[i].raw_lock);
			lockdep_set_class(&htab->buckets[i].raw_lock,
					  &htab->lockdep_key);
		} else {
			spin_lock_init(&htab->buckets[i].lock);
			lockdep_set_class(&htab->buckets[i].lock,
					  &htab->lockdep_key);
		}
		cond_resched();
	}
}
static inline int htab_lock_bucket(const struct bpf_htab *htab,
				   struct bucket *b, u32 hash,
				   unsigned long *pflags)
{
	unsigned long flags;

	hash = hash & HASHTAB_MAP_LOCK_MASK;

	migrate_disable();
	if (unlikely(__this_cpu_inc_return(*(htab->map_locked[hash])) != 1)) {
		__this_cpu_dec(*(htab->map_locked[hash]));
		migrate_enable();
		return -EBUSY;
	}

	if (htab_use_raw_lock(htab))
		raw_spin_lock_irqsave(&b->raw_lock, flags);
	else
		spin_lock_irqsave(&b->lock, flags);
	*pflags = flags;

	return 0;
}

static inline void htab_unlock_bucket(const struct bpf_htab *htab,
				      struct bucket *b, u32 hash,
				      unsigned long flags)
{
	hash = hash & HASHTAB_MAP_LOCK_MASK;
	if (htab_use_raw_lock(htab))
		raw_spin_unlock_irqrestore(&b->raw_lock, flags);
	else
		spin_unlock_irqrestore(&b->lock, flags);
	__this_cpu_dec(*(htab->map_locked[hash]));
	migrate_enable();
}
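/*
 * Sketch of the caller pattern for the two helpers above, as used by the
 * update/delete paths later in this file:
 *
 *	unsigned long flags;
 *	int ret;
 *
 *	ret = htab_lock_bucket(htab, b, hash, &flags);
 *	if (ret)
 *		return ret;
 *	... modify the bucket's hlist_nulls under the lock ...
 *	htab_unlock_bucket(htab, b, hash, flags);
 *
 * A non-zero return means the per-CPU map_locked counter detected a
 * re-entrant map operation on this CPU and the lock was not taken.
 */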
static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node);

static bool htab_is_lru(const struct bpf_htab *htab)
{
	return htab->map.map_type == BPF_MAP_TYPE_LRU_HASH ||
		htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH;
}

static bool htab_is_percpu(const struct bpf_htab *htab)
{
	return htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH ||
		htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH;
}
static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size,
				     void __percpu *pptr)
{
	*(void __percpu **)(l->key + key_size) = pptr;
}

static inline void __percpu *htab_elem_get_ptr(struct htab_elem *l, u32 key_size)
{
	return *(void __percpu **)(l->key + key_size);
}

static void *fd_htab_map_get_ptr(const struct bpf_map *map, struct htab_elem *l)
{
	return *(void **)(l->key + roundup(map->key_size, 8));
}
static struct htab_elem *get_htab_elem(struct bpf_htab *htab, int i)
{
	return (struct htab_elem *) (htab->elems + i * (u64)htab->elem_size);
}

static void htab_free_elems(struct bpf_htab *htab)
{
	int i;

	if (!htab_is_percpu(htab))
		goto free_elems;

	for (i = 0; i < htab->map.max_entries; i++) {
		void __percpu *pptr;

		pptr = htab_elem_get_ptr(get_htab_elem(htab, i),
					 htab->map.key_size);
		free_percpu(pptr);
		cond_resched();
	}
free_elems:
	bpf_map_area_free(htab->elems);
}
/* The LRU list has a lock (lru_lock). Each htab bucket has a lock
 * (bucket_lock). If both locks need to be acquired together, the lock
 * order is always lru_lock -> bucket_lock and this only happens in
 * bpf_lru_list.c logic. For example, certain code path of
 * bpf_lru_pop_free(), which is called by function prealloc_lru_pop(),
 * will acquire lru_lock first followed by acquiring bucket_lock.
 *
 * In hashtab.c, to avoid deadlock, lock acquisition of
 * bucket_lock followed by lru_lock is not allowed. In such cases,
 * bucket_lock needs to be released first before acquiring lru_lock.
 */
static struct htab_elem *prealloc_lru_pop(struct bpf_htab *htab, void *key,
					  u32 hash)
{
	struct bpf_lru_node *node = bpf_lru_pop_free(&htab->lru, hash);
	struct htab_elem *l;

	if (node) {
		l = container_of(node, struct htab_elem, lru_node);
		memcpy(l->key, key, htab->map.key_size);
		return l;
	}

	return NULL;
}
static int prealloc_init(struct bpf_htab *htab)
{
	u32 num_entries = htab->map.max_entries;
	int err = -ENOMEM, i;

	if (!htab_is_percpu(htab) && !htab_is_lru(htab))
		num_entries += num_possible_cpus();

	htab->elems = bpf_map_area_alloc((u64)htab->elem_size * num_entries,
					 htab->map.numa_node);
	if (!htab->elems)
		return -ENOMEM;

	if (!htab_is_percpu(htab))
		goto skip_percpu_elems;

	for (i = 0; i < num_entries; i++) {
		u32 size = round_up(htab->map.value_size, 8);
		void __percpu *pptr;

		pptr = bpf_map_alloc_percpu(&htab->map, size, 8,
					    GFP_USER | __GFP_NOWARN);
		if (!pptr)
			goto free_elems;
		htab_elem_set_ptr(get_htab_elem(htab, i), htab->map.key_size,
				  pptr);
		cond_resched();
	}

skip_percpu_elems:
	if (htab_is_lru(htab))
		err = bpf_lru_init(&htab->lru,
				   htab->map.map_flags & BPF_F_NO_COMMON_LRU,
				   offsetof(struct htab_elem, hash) -
				   offsetof(struct htab_elem, lru_node),
				   htab_lru_map_delete_node,
				   htab);
	else
		err = pcpu_freelist_init(&htab->freelist);

	if (err)
		goto free_elems;

	if (htab_is_lru(htab))
		bpf_lru_populate(&htab->lru, htab->elems,
				 offsetof(struct htab_elem, lru_node),
				 htab->elem_size, num_entries);
	else
		pcpu_freelist_populate(&htab->freelist,
				       htab->elems + offsetof(struct htab_elem, fnode),
				       htab->elem_size, num_entries);

	return 0;

free_elems:
	htab_free_elems(htab);
	return err;
}
static void prealloc_destroy(struct bpf_htab *htab)
{
	htab_free_elems(htab);

	if (htab_is_lru(htab))
		bpf_lru_destroy(&htab->lru);
	else
		pcpu_freelist_destroy(&htab->freelist);
}
static int alloc_extra_elems(struct bpf_htab *htab)
{
	struct htab_elem *__percpu *pptr, *l_new;
	struct pcpu_freelist_node *l;
	int cpu;

	pptr = bpf_map_alloc_percpu(&htab->map, sizeof(struct htab_elem *), 8,
				    GFP_USER | __GFP_NOWARN);
	if (!pptr)
		return -ENOMEM;

	for_each_possible_cpu(cpu) {
		l = pcpu_freelist_pop(&htab->freelist);
		/* pop will succeed, since prealloc_init()
		 * preallocated extra num_possible_cpus elements
		 */
		l_new = container_of(l, struct htab_elem, fnode);
		*per_cpu_ptr(pptr, cpu) = l_new;
	}
	htab->extra_elems = pptr;
	return 0;
}
/* Called from syscall */
static int htab_map_alloc_check(union bpf_attr *attr)
{
	bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
		       attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
	bool lru = (attr->map_type == BPF_MAP_TYPE_LRU_HASH ||
		    attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
	/* percpu_lru means each cpu has its own LRU list.
	 * it is different from BPF_MAP_TYPE_PERCPU_HASH where
	 * the map's value itself is percpu.  percpu_lru has
	 * nothing to do with the map's value.
	 */
	bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU);
	bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC);
	bool zero_seed = (attr->map_flags & BPF_F_ZERO_SEED);
	int numa_node = bpf_map_attr_numa_node(attr);

	BUILD_BUG_ON(offsetof(struct htab_elem, htab) !=
		     offsetof(struct htab_elem, hash_node.pprev));
	BUILD_BUG_ON(offsetof(struct htab_elem, fnode.next) !=
		     offsetof(struct htab_elem, hash_node.pprev));

	if (lru && !bpf_capable())
		/* LRU implementation is much more complicated than other
		 * maps.  Hence, limit to CAP_BPF.
		 */
		return -EPERM;

	if (zero_seed && !capable(CAP_SYS_ADMIN))
		/* Guard against local DoS, and discourage production use. */
		return -EPERM;

	if (attr->map_flags & ~HTAB_CREATE_FLAG_MASK ||
	    !bpf_map_flags_access_ok(attr->map_flags))
		return -EINVAL;

	if (!lru && percpu_lru)
		return -EINVAL;

	if (lru && !prealloc)
		return -ENOTSUPP;

	if (numa_node != NUMA_NO_NODE && (percpu || percpu_lru))
		return -EINVAL;

	/* check sanity of attributes.
	 * value_size == 0 may be allowed in the future to use map as a set
	 */
	if (attr->max_entries == 0 || attr->key_size == 0 ||
	    attr->value_size == 0)
		return -EINVAL;

	if ((u64)attr->key_size + attr->value_size >= KMALLOC_MAX_SIZE -
	   sizeof(struct htab_elem))
		/* if key_size + value_size is bigger, the user space won't be
		 * able to access the elements via bpf syscall. This check
		 * also makes sure that the elem_size doesn't overflow and it's
		 * kmalloc-able later in htab_map_update_elem()
		 */
		return -E2BIG;

	return 0;
}
static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
{
	bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
		       attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
	bool lru = (attr->map_type == BPF_MAP_TYPE_LRU_HASH ||
		    attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
	/* percpu_lru means each cpu has its own LRU list.
	 * it is different from BPF_MAP_TYPE_PERCPU_HASH where
	 * the map's value itself is percpu.  percpu_lru has
	 * nothing to do with the map's value.
	 */
	bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU);
	bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC);
	struct bpf_htab *htab;
	int err, i;

	htab = kzalloc(sizeof(*htab), GFP_USER | __GFP_ACCOUNT);
	if (!htab)
		return ERR_PTR(-ENOMEM);

	lockdep_register_key(&htab->lockdep_key);

	bpf_map_init_from_attr(&htab->map, attr);

	if (percpu_lru) {
		/* ensure each CPU's lru list has >=1 elements.
		 * since we are at it, make each lru list has the same
		 * number of elements.
		 */
		htab->map.max_entries = roundup(attr->max_entries,
						num_possible_cpus());
		if (htab->map.max_entries < attr->max_entries)
			htab->map.max_entries = rounddown(attr->max_entries,
							  num_possible_cpus());
	}

	/* hash table size must be power of 2 */
	htab->n_buckets = roundup_pow_of_two(htab->map.max_entries);

	htab->elem_size = sizeof(struct htab_elem) +
			  round_up(htab->map.key_size, 8);
	if (percpu)
		htab->elem_size += sizeof(void *);
	else
		htab->elem_size += round_up(htab->map.value_size, 8);

	err = -E2BIG;
	/* prevent zero size kmalloc and check for u32 overflow */
	if (htab->n_buckets == 0 ||
	    htab->n_buckets > U32_MAX / sizeof(struct bucket))
		goto free_htab;

	err = -ENOMEM;
	htab->buckets = bpf_map_area_alloc(htab->n_buckets *
					   sizeof(struct bucket),
					   htab->map.numa_node);
	if (!htab->buckets)
		goto free_htab;

	for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++) {
		htab->map_locked[i] = bpf_map_alloc_percpu(&htab->map,
							   sizeof(int),
							   sizeof(int),
							   GFP_USER);
		if (!htab->map_locked[i])
			goto free_map_locked;
	}

	if (htab->map.map_flags & BPF_F_ZERO_SEED)
		htab->hashrnd = 0;
	else
		htab->hashrnd = get_random_int();

	htab_init_buckets(htab);

	if (prealloc) {
		err = prealloc_init(htab);
		if (err)
			goto free_map_locked;

		if (!percpu && !lru) {
			/* lru itself can remove the least used element, so
			 * there is no need for an extra elem during map_update.
			 */
			err = alloc_extra_elems(htab);
			if (err)
				goto free_prealloc;
		}
	}

	return &htab->map;

free_prealloc:
	prealloc_destroy(htab);
free_map_locked:
	for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++)
		free_percpu(htab->map_locked[i]);
	bpf_map_area_free(htab->buckets);
free_htab:
	lockdep_unregister_key(&htab->lockdep_key);
	kfree(htab);
	return ERR_PTR(err);
}
static inline u32 htab_map_hash(const void *key, u32 key_len, u32 hashrnd)
{
	return jhash(key, key_len, hashrnd);
}

static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash)
{
	return &htab->buckets[hash & (htab->n_buckets - 1)];
}

static inline struct hlist_nulls_head *select_bucket(struct bpf_htab *htab, u32 hash)
{
	return &__select_bucket(htab, hash)->head;
}
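/*
 * Note on the indexing above: htab_map_alloc() rounds n_buckets up to a
 * power of two, so "hash & (htab->n_buckets - 1)" is equivalent to
 * "hash % htab->n_buckets" without a division. E.g. with n_buckets == 8,
 * a hash of 13 (0b1101) selects bucket 5 (0b101).
 */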
/* this lookup function can only be called with bucket lock taken */
static struct htab_elem *lookup_elem_raw(struct hlist_nulls_head *head, u32 hash,
					 void *key, u32 key_size)
{
	struct hlist_nulls_node *n;
	struct htab_elem *l;

	hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
		if (l->hash == hash && !memcmp(&l->key, key, key_size))
			return l;

	return NULL;
}

/* can be called without bucket lock. it will repeat the loop in
 * the unlikely event when elements moved from one bucket into another
 * while link list is being walked
 */
static struct htab_elem *lookup_nulls_elem_raw(struct hlist_nulls_head *head,
					       u32 hash, void *key,
					       u32 key_size, u32 n_buckets)
{
	struct hlist_nulls_node *n;
	struct htab_elem *l;

again:
	hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
		if (l->hash == hash && !memcmp(&l->key, key, key_size))
			return l;

	if (unlikely(get_nulls_value(n) != (hash & (n_buckets - 1))))
		goto again;

	return NULL;
}
/* Called from syscall or from eBPF program directly, so
 * arguments have to match bpf_map_lookup_elem() exactly.
 * The return value is adjusted by BPF instructions
 * in htab_map_gen_lookup().
 */
static void *__htab_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct hlist_nulls_head *head;
	struct htab_elem *l;
	u32 hash, key_size;

	WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size, htab->hashrnd);

	head = select_bucket(htab, hash);

	l = lookup_nulls_elem_raw(head, hash, key, key_size, htab->n_buckets);

	return l;
}

static void *htab_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct htab_elem *l = __htab_map_lookup_elem(map, key);

	if (l)
		return l->key + round_up(map->key_size, 8);

	return NULL;
}
/* inline bpf_map_lookup_elem() call.
 * Instead of:
 * bpf_prog
 *   bpf_map_lookup_elem
 *     map->ops->map_lookup_elem
 *       htab_map_lookup_elem
 *         __htab_map_lookup_elem
 * do:
 * bpf_prog
 *   __htab_map_lookup_elem
 */
static int htab_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
{
	struct bpf_insn *insn = insn_buf;
	const int ret = BPF_REG_0;

	BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem,
		     (void *(*)(struct bpf_map *map, void *key))NULL));
	*insn++ = BPF_EMIT_CALL(BPF_CAST_CALL(__htab_map_lookup_elem));
	*insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 1);
	*insn++ = BPF_ALU64_IMM(BPF_ADD, ret,
				offsetof(struct htab_elem, key) +
				round_up(map->key_size, 8));
	return insn - insn_buf;
}
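/*
 * In plain terms, the sequence emitted above behaves like the following
 * pseudo-C once patched into the calling program:
 *
 *	elem = __htab_map_lookup_elem(map, key);
 *	if (elem != NULL)
 *		elem = (void *)elem + offsetof(struct htab_elem, key) +
 *		       round_up(map->key_size, 8);  // now points at the value
 *	return elem;
 */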
static __always_inline void *__htab_lru_map_lookup_elem(struct bpf_map *map,
							void *key, const bool mark)
{
	struct htab_elem *l = __htab_map_lookup_elem(map, key);

	if (l) {
		if (mark)
			bpf_lru_node_set_ref(&l->lru_node);
		return l->key + round_up(map->key_size, 8);
	}

	return NULL;
}

static void *htab_lru_map_lookup_elem(struct bpf_map *map, void *key)
{
	return __htab_lru_map_lookup_elem(map, key, true);
}

static void *htab_lru_map_lookup_elem_sys(struct bpf_map *map, void *key)
{
	return __htab_lru_map_lookup_elem(map, key, false);
}
static int htab_lru_map_gen_lookup(struct bpf_map *map,
				   struct bpf_insn *insn_buf)
{
	struct bpf_insn *insn = insn_buf;
	const int ret = BPF_REG_0;
	const int ref_reg = BPF_REG_1;

	BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem,
		     (void *(*)(struct bpf_map *map, void *key))NULL));
	*insn++ = BPF_EMIT_CALL(BPF_CAST_CALL(__htab_map_lookup_elem));
	*insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 4);
	*insn++ = BPF_LDX_MEM(BPF_B, ref_reg, ret,
			      offsetof(struct htab_elem, lru_node) +
			      offsetof(struct bpf_lru_node, ref));
	*insn++ = BPF_JMP_IMM(BPF_JNE, ref_reg, 0, 1);
	*insn++ = BPF_ST_MEM(BPF_B, ret,
			     offsetof(struct htab_elem, lru_node) +
			     offsetof(struct bpf_lru_node, ref),
			     1);
	*insn++ = BPF_ALU64_IMM(BPF_ADD, ret,
				offsetof(struct htab_elem, key) +
				round_up(map->key_size, 8));
	return insn - insn_buf;
}
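/*
 * Compared with htab_map_gen_lookup() above, the extra three instructions
 * load lru_node.ref and, only if it is still 0, store 1 into it. This is
 * the inlined equivalent of the bpf_lru_node_set_ref() call done by
 * __htab_lru_map_lookup_elem(): a found element is marked as recently
 * referenced before the pointer is adjusted to the value.
 */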
/* It is called from the bpf_lru_list when the LRU needs to delete
 * older elements from the htab.
 */
static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node)
{
	struct bpf_htab *htab = (struct bpf_htab *)arg;
	struct htab_elem *l = NULL, *tgt_l;
	struct hlist_nulls_head *head;
	struct hlist_nulls_node *n;
	unsigned long flags;
	struct bucket *b;
	int ret;

	tgt_l = container_of(node, struct htab_elem, lru_node);
	b = __select_bucket(htab, tgt_l->hash);
	head = &b->head;

	ret = htab_lock_bucket(htab, b, tgt_l->hash, &flags);
	if (ret)
		return false;

	hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
		if (l == tgt_l) {
			hlist_nulls_del_rcu(&l->hash_node);
			break;
		}

	htab_unlock_bucket(htab, b, tgt_l->hash, flags);

	return l == tgt_l;
}
/* Called from syscall */
static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct hlist_nulls_head *head;
	struct htab_elem *l, *next_l;
	u32 hash, key_size;
	int i = 0;

	WARN_ON_ONCE(!rcu_read_lock_held());

	key_size = map->key_size;

	if (!key)
		goto find_first_elem;

	hash = htab_map_hash(key, key_size, htab->hashrnd);

	head = select_bucket(htab, hash);

	/* lookup the key */
	l = lookup_nulls_elem_raw(head, hash, key, key_size, htab->n_buckets);

	if (!l)
		goto find_first_elem;

	/* key was found, get next key in the same bucket */
	next_l = hlist_nulls_entry_safe(rcu_dereference_raw(hlist_nulls_next_rcu(&l->hash_node)),
				  struct htab_elem, hash_node);

	if (next_l) {
		/* if next elem in this hash list is non-zero, just return it */
		memcpy(next_key, next_l->key, key_size);
		return 0;
	}

	/* no more elements in this hash list, go to the next bucket */
	i = hash & (htab->n_buckets - 1);
	i++;

find_first_elem:
	/* iterate over buckets */
	for (; i < htab->n_buckets; i++) {
		head = select_bucket(htab, i);

		/* pick first element in the bucket */
		next_l = hlist_nulls_entry_safe(rcu_dereference_raw(hlist_nulls_first_rcu(head)),
					  struct htab_elem, hash_node);
		if (next_l) {
			/* if it's not empty, just return it */
			memcpy(next_key, next_l->key, key_size);
			return 0;
		}
	}

	/* iterated over all buckets and all elements */
	return -ENOENT;
}
static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l)
{
	if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH)
		free_percpu(htab_elem_get_ptr(l, htab->map.key_size));
	kfree(l);
}

static void htab_elem_free_rcu(struct rcu_head *head)
{
	struct htab_elem *l = container_of(head, struct htab_elem, rcu);
	struct bpf_htab *htab = l->htab;

	htab_elem_free(htab, l);
}
static void htab_put_fd_value(struct bpf_htab *htab, struct htab_elem *l)
{
	struct bpf_map *map = &htab->map;
	void *ptr;

	if (map->ops->map_fd_put_ptr) {
		ptr = fd_htab_map_get_ptr(map, l);
		map->ops->map_fd_put_ptr(ptr);
	}
}

static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
{
	htab_put_fd_value(htab, l);

	if (htab_is_prealloc(htab)) {
		__pcpu_freelist_push(&htab->freelist, &l->fnode);
	} else {
		atomic_dec(&htab->count);
		l->htab = htab;
		call_rcu(&l->rcu, htab_elem_free_rcu);
	}
}
static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr,
			    void *value, bool onallcpus)
{
	if (!onallcpus) {
		/* copy true value_size bytes */
		memcpy(this_cpu_ptr(pptr), value, htab->map.value_size);
	} else {
		u32 size = round_up(htab->map.value_size, 8);
		int off = 0, cpu;

		for_each_possible_cpu(cpu) {
			bpf_long_memcpy(per_cpu_ptr(pptr, cpu),
					value + off, size);
			off += size;
		}
	}
}

static void pcpu_init_value(struct bpf_htab *htab, void __percpu *pptr,
			    void *value, bool onallcpus)
{
	/* When using prealloc and not setting the initial value on all cpus,
	 * zero-fill element values for other cpus (just as what happens when
	 * not using prealloc). Otherwise, bpf program has no way to ensure
	 * known initial values for cpus other than current one
	 * (onallcpus=false always when coming from bpf prog).
	 */
	if (htab_is_prealloc(htab) && !onallcpus) {
		u32 size = round_up(htab->map.value_size, 8);
		int current_cpu = raw_smp_processor_id();
		int cpu;

		for_each_possible_cpu(cpu) {
			if (cpu == current_cpu)
				bpf_long_memcpy(per_cpu_ptr(pptr, cpu), value,
						size);
			else
				memset(per_cpu_ptr(pptr, cpu), 0, size);
		}
	} else {
		pcpu_copy_value(htab, pptr, value, onallcpus);
	}
}
static bool fd_htab_map_needs_adjust(const struct bpf_htab *htab)
{
	return htab->map.map_type == BPF_MAP_TYPE_HASH_OF_MAPS &&
	       BITS_PER_LONG == 64;
}
static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
					 void *value, u32 key_size, u32 hash,
					 bool percpu, bool onallcpus,
					 struct htab_elem *old_elem)
{
	u32 size = htab->map.value_size;
	bool prealloc = htab_is_prealloc(htab);
	struct htab_elem *l_new, **pl_new;
	void __percpu *pptr;

	if (prealloc) {
		if (old_elem) {
			/* if we're updating the existing element,
			 * use per-cpu extra elems to avoid freelist_pop/push
			 */
			pl_new = this_cpu_ptr(htab->extra_elems);
			l_new = *pl_new;
			htab_put_fd_value(htab, old_elem);
			*pl_new = old_elem;
		} else {
			struct pcpu_freelist_node *l;

			l = __pcpu_freelist_pop(&htab->freelist);
			if (!l)
				return ERR_PTR(-E2BIG);
			l_new = container_of(l, struct htab_elem, fnode);
		}
	} else {
		if (atomic_inc_return(&htab->count) > htab->map.max_entries)
			if (!old_elem) {
				/* when map is full and update() is replacing
				 * old element, it's ok to allocate, since
				 * old element will be freed immediately.
				 * Otherwise return an error
				 */
				l_new = ERR_PTR(-E2BIG);
				goto dec_count;
			}
		l_new = bpf_map_kmalloc_node(&htab->map, htab->elem_size,
					     GFP_ATOMIC | __GFP_NOWARN,
					     htab->map.numa_node);
		if (!l_new) {
			l_new = ERR_PTR(-ENOMEM);
			goto dec_count;
		}
		check_and_init_map_lock(&htab->map,
					l_new->key + round_up(key_size, 8));
	}

	memcpy(l_new->key, key, key_size);
	if (percpu) {
		size = round_up(size, 8);
		if (prealloc) {
			pptr = htab_elem_get_ptr(l_new, key_size);
		} else {
			/* alloc_percpu zero-fills */
			pptr = bpf_map_alloc_percpu(&htab->map, size, 8,
						    GFP_ATOMIC | __GFP_NOWARN);
			if (!pptr) {
				kfree(l_new);
				l_new = ERR_PTR(-ENOMEM);
				goto dec_count;
			}
		}

		pcpu_init_value(htab, pptr, value, onallcpus);

		if (!prealloc)
			htab_elem_set_ptr(l_new, key_size, pptr);
	} else if (fd_htab_map_needs_adjust(htab)) {
		size = round_up(size, 8);
		memcpy(l_new->key + round_up(key_size, 8), value, size);
	} else {
		copy_map_value(&htab->map,
			       l_new->key + round_up(key_size, 8),
			       value);
	}

	l_new->hash = hash;
	return l_new;
dec_count:
	atomic_dec(&htab->count);
	return l_new;
}
static int check_flags(struct bpf_htab *htab, struct htab_elem *l_old,
		       u64 map_flags)
{
	if (l_old && (map_flags & ~BPF_F_LOCK) == BPF_NOEXIST)
		/* elem already exists */
		return -EEXIST;

	if (!l_old && (map_flags & ~BPF_F_LOCK) == BPF_EXIST)
		/* elem doesn't exist, cannot update it */
		return -ENOENT;

	return 0;
}
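/*
 * Example of the flag semantics checked above: updating key K with
 * BPF_NOEXIST while K is already present fails with -EEXIST, updating K
 * with BPF_EXIST while K is absent fails with -ENOENT, and BPF_ANY (0)
 * accepts both cases. BPF_F_LOCK is masked out first because it only
 * selects locked value copying, not existence semantics.
 */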
/* Called from syscall or from eBPF program */
static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
				u64 map_flags)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct htab_elem *l_new = NULL, *l_old;
	struct hlist_nulls_head *head;
	unsigned long flags;
	struct bucket *b;
	u32 key_size, hash;
	int ret;

	if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST))
		/* unknown flags */
		return -EINVAL;

	WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size, htab->hashrnd);

	b = __select_bucket(htab, hash);
	head = &b->head;

	if (unlikely(map_flags & BPF_F_LOCK)) {
		if (unlikely(!map_value_has_spin_lock(map)))
			return -EINVAL;
		/* find an element without taking the bucket lock */
		l_old = lookup_nulls_elem_raw(head, hash, key, key_size,
					      htab->n_buckets);
		ret = check_flags(htab, l_old, map_flags);
		if (ret)
			return ret;
		if (l_old) {
			/* grab the element lock and update value in place */
			copy_map_value_locked(map,
					      l_old->key + round_up(key_size, 8),
					      value, false);
			return 0;
		}
		/* fall through, grab the bucket lock and lookup again.
		 * 99.9% chance that the element won't be found,
		 * but second lookup under lock has to be done.
		 */
	}

	ret = htab_lock_bucket(htab, b, hash, &flags);
	if (ret)
		return ret;

	l_old = lookup_elem_raw(head, hash, key, key_size);

	ret = check_flags(htab, l_old, map_flags);
	if (ret)
		goto err;

	if (unlikely(l_old && (map_flags & BPF_F_LOCK))) {
		/* first lookup without the bucket lock didn't find the element,
		 * but second lookup with the bucket lock found it.
		 * This case is highly unlikely, but has to be dealt with:
		 * grab the element lock in addition to the bucket lock
		 * and update element in place
		 */
		copy_map_value_locked(map,
				      l_old->key + round_up(key_size, 8),
				      value, false);
		ret = 0;
		goto err;
	}

	l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false,
				l_old);
	if (IS_ERR(l_new)) {
		/* all pre-allocated elements are in use or memory exhausted */
		ret = PTR_ERR(l_new);
		goto err;
	}

	/* add new element to the head of the list, so that
	 * concurrent search will find it before old elem
	 */
	hlist_nulls_add_head_rcu(&l_new->hash_node, head);
	if (l_old) {
		hlist_nulls_del_rcu(&l_old->hash_node);
		if (!htab_is_prealloc(htab))
			free_htab_elem(htab, l_old);
	}
	ret = 0;
err:
	htab_unlock_bucket(htab, b, hash, flags);
	return ret;
}
static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value,
				    u64 map_flags)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct htab_elem *l_new, *l_old = NULL;
	struct hlist_nulls_head *head;
	unsigned long flags;
	struct bucket *b;
	u32 key_size, hash;
	int ret;

	if (unlikely(map_flags > BPF_EXIST))
		/* unknown flags */
		return -EINVAL;

	WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size, htab->hashrnd);

	b = __select_bucket(htab, hash);
	head = &b->head;

	/* For LRU, we need to alloc before taking bucket's
	 * spinlock because getting free nodes from LRU may need
	 * to remove older elements from htab and this removal
	 * operation will need a bucket lock.
	 */
	l_new = prealloc_lru_pop(htab, key, hash);
	if (!l_new)
		return -ENOMEM;
	memcpy(l_new->key + round_up(map->key_size, 8), value, map->value_size);

	ret = htab_lock_bucket(htab, b, hash, &flags);
	if (ret)
		return ret;

	l_old = lookup_elem_raw(head, hash, key, key_size);

	ret = check_flags(htab, l_old, map_flags);
	if (ret)
		goto err;

	/* add new element to the head of the list, so that
	 * concurrent search will find it before old elem
	 */
	hlist_nulls_add_head_rcu(&l_new->hash_node, head);
	if (l_old) {
		bpf_lru_node_set_ref(&l_new->lru_node);
		hlist_nulls_del_rcu(&l_old->hash_node);
	}
	ret = 0;

err:
	htab_unlock_bucket(htab, b, hash, flags);

	if (ret)
		bpf_lru_push_free(&htab->lru, &l_new->lru_node);
	else if (l_old)
		bpf_lru_push_free(&htab->lru, &l_old->lru_node);

	return ret;
}
static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
					 void *value, u64 map_flags,
					 bool onallcpus)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct htab_elem *l_new = NULL, *l_old;
	struct hlist_nulls_head *head;
	unsigned long flags;
	struct bucket *b;
	u32 key_size, hash;
	int ret;

	if (unlikely(map_flags > BPF_EXIST))
		/* unknown flags */
		return -EINVAL;

	WARN_ON_ONCE(!rcu_read_lock_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size, htab->hashrnd);

	b = __select_bucket(htab, hash);
	head = &b->head;

	ret = htab_lock_bucket(htab, b, hash, &flags);
	if (ret)
		return ret;

	l_old = lookup_elem_raw(head, hash, key, key_size);

	ret = check_flags(htab, l_old, map_flags);
	if (ret)
		goto err;

	if (l_old) {
		/* per-cpu hash map can update value in-place */
		pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size),
				value, onallcpus);
	} else {
		l_new = alloc_htab_elem(htab, key, value, key_size,
					hash, true, onallcpus, NULL);
		if (IS_ERR(l_new)) {
			ret = PTR_ERR(l_new);
			goto err;
		}
		hlist_nulls_add_head_rcu(&l_new->hash_node, head);
	}
	ret = 0;
err:
	htab_unlock_bucket(htab, b, hash, flags);
	return ret;
}
static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
					     void *value, u64 map_flags,
					     bool onallcpus)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct htab_elem *l_new = NULL, *l_old;
	struct hlist_nulls_head *head;
	unsigned long flags;
	struct bucket *b;
	u32 key_size, hash;
	int ret;

	if (unlikely(map_flags > BPF_EXIST))
		/* unknown flags */
		return -EINVAL;

	WARN_ON_ONCE(!rcu_read_lock_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size, htab->hashrnd);

	b = __select_bucket(htab, hash);
	head = &b->head;

	/* For LRU, we need to alloc before taking bucket's
	 * spinlock because LRU's elem alloc may need
	 * to remove older elem from htab and this removal
	 * operation will need a bucket lock.
	 */
	if (map_flags != BPF_EXIST) {
		l_new = prealloc_lru_pop(htab, key, hash);
		if (!l_new)
			return -ENOMEM;
	}

	ret = htab_lock_bucket(htab, b, hash, &flags);
	if (ret)
		return ret;

	l_old = lookup_elem_raw(head, hash, key, key_size);

	ret = check_flags(htab, l_old, map_flags);
	if (ret)
		goto err;

	if (l_old) {
		bpf_lru_node_set_ref(&l_old->lru_node);

		/* per-cpu hash map can update value in-place */
		pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size),
				value, onallcpus);
	} else {
		pcpu_init_value(htab, htab_elem_get_ptr(l_new, key_size),
				value, onallcpus);
		hlist_nulls_add_head_rcu(&l_new->hash_node, head);
		l_new = NULL;
	}
	ret = 0;
err:
	htab_unlock_bucket(htab, b, hash, flags);
	if (l_new)
		bpf_lru_push_free(&htab->lru, &l_new->lru_node);
	return ret;
}
static int htab_percpu_map_update_elem(struct bpf_map *map, void *key,
				       void *value, u64 map_flags)
{
	return __htab_percpu_map_update_elem(map, key, value, map_flags, false);
}

static int htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
					   void *value, u64 map_flags)
{
	return __htab_lru_percpu_map_update_elem(map, key, value, map_flags,
						 false);
}
/* Called from syscall or from eBPF program */
static int htab_map_delete_elem(struct bpf_map *map, void *key)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct hlist_nulls_head *head;
	struct bucket *b;
	struct htab_elem *l;
	unsigned long flags;
	u32 hash, key_size;
	int ret;

	WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size, htab->hashrnd);
	b = __select_bucket(htab, hash);
	head = &b->head;

	ret = htab_lock_bucket(htab, b, hash, &flags);
	if (ret)
		return ret;

	l = lookup_elem_raw(head, hash, key, key_size);

	if (l) {
		hlist_nulls_del_rcu(&l->hash_node);
		free_htab_elem(htab, l);
	} else {
		ret = -ENOENT;
	}

	htab_unlock_bucket(htab, b, hash, flags);
	return ret;
}
static int htab_lru_map_delete_elem(struct bpf_map *map, void *key)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct hlist_nulls_head *head;
	struct bucket *b;
	struct htab_elem *l;
	unsigned long flags;
	u32 hash, key_size;
	int ret;

	WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size, htab->hashrnd);
	b = __select_bucket(htab, hash);
	head = &b->head;

	ret = htab_lock_bucket(htab, b, hash, &flags);
	if (ret)
		return ret;

	l = lookup_elem_raw(head, hash, key, key_size);

	if (l)
		hlist_nulls_del_rcu(&l->hash_node);
	else
		ret = -ENOENT;

	htab_unlock_bucket(htab, b, hash, flags);
	if (l)
		bpf_lru_push_free(&htab->lru, &l->lru_node);
	return ret;
}
static void delete_all_elements(struct bpf_htab *htab)
{
	int i;

	for (i = 0; i < htab->n_buckets; i++) {
		struct hlist_nulls_head *head = select_bucket(htab, i);
		struct hlist_nulls_node *n;
		struct htab_elem *l;

		hlist_nulls_for_each_entry_safe(l, n, head, hash_node) {
			hlist_nulls_del_rcu(&l->hash_node);
			htab_elem_free(htab, l);
		}
	}
}
/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
static void htab_map_free(struct bpf_map *map)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	int i;

	/* bpf_free_used_maps() or close(map_fd) will trigger this map_free callback.
	 * bpf_free_used_maps() is called after bpf prog is no longer executing.
	 * There is no need to synchronize_rcu() here to protect map elements.
	 */

	/* some of free_htab_elem() callbacks for elements of this map may
	 * not have executed. Wait for them.
	 */
	rcu_barrier();
	if (!htab_is_prealloc(htab))
		delete_all_elements(htab);
	else
		prealloc_destroy(htab);

	free_percpu(htab->extra_elems);
	bpf_map_area_free(htab->buckets);
	for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++)
		free_percpu(htab->map_locked[i]);
	lockdep_unregister_key(&htab->lockdep_key);
	kfree(htab);
}
static void htab_map_seq_show_elem(struct bpf_map *map, void *key,
				   struct seq_file *m)
{
	void *value;

	rcu_read_lock();

	value = htab_map_lookup_elem(map, key);
	if (!value) {
		rcu_read_unlock();
		return;
	}

	btf_type_seq_show(map->btf, map->btf_key_type_id, key, m);
	seq_puts(m, ": ");
	btf_type_seq_show(map->btf, map->btf_value_type_id, value, m);
	seq_puts(m, "\n");

	rcu_read_unlock();
}
static int
__htab_map_lookup_and_delete_batch(struct bpf_map *map,
				   const union bpf_attr *attr,
				   union bpf_attr __user *uattr,
				   bool do_delete, bool is_lru_map,
				   bool is_percpu)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	u32 bucket_cnt, total, key_size, value_size, roundup_key_size;
	void *keys = NULL, *values = NULL, *value, *dst_key, *dst_val;
	void __user *uvalues = u64_to_user_ptr(attr->batch.values);
	void __user *ukeys = u64_to_user_ptr(attr->batch.keys);
	void __user *ubatch = u64_to_user_ptr(attr->batch.in_batch);
	u32 batch, max_count, size, bucket_size;
	struct htab_elem *node_to_free = NULL;
	u64 elem_map_flags, map_flags;
	struct hlist_nulls_head *head;
	struct hlist_nulls_node *n;
	unsigned long flags = 0;
	bool locked = false;
	struct htab_elem *l;
	struct bucket *b;
	int ret = 0;

	elem_map_flags = attr->batch.elem_flags;
	if ((elem_map_flags & ~BPF_F_LOCK) ||
	    ((elem_map_flags & BPF_F_LOCK) && !map_value_has_spin_lock(map)))
		return -EINVAL;

	map_flags = attr->batch.flags;
	if (map_flags)
		return -EINVAL;

	max_count = attr->batch.count;
	if (!max_count)
		return 0;

	if (put_user(0, &uattr->batch.count))
		return -EFAULT;

	batch = 0;
	if (ubatch && copy_from_user(&batch, ubatch, sizeof(batch)))
		return -EFAULT;

	if (batch >= htab->n_buckets)
		return -ENOENT;

	key_size = htab->map.key_size;
	roundup_key_size = round_up(htab->map.key_size, 8);
	value_size = htab->map.value_size;
	size = round_up(value_size, 8);
	if (is_percpu)
		value_size = size * num_possible_cpus();
	total = 0;
	/* while experimenting with hash tables with sizes ranging from 10 to
	 * 1000, it was observed that a bucket can have upto 5 entries.
	 */
	bucket_size = 5;

alloc:
	/* We cannot do copy_from_user or copy_to_user inside
	 * the rcu_read_lock. Allocate enough space here.
	 */
	keys = kvmalloc(key_size * bucket_size, GFP_USER | __GFP_NOWARN);
	values = kvmalloc(value_size * bucket_size, GFP_USER | __GFP_NOWARN);
	if (!keys || !values) {
		ret = -ENOMEM;
		goto after_loop;
	}

again:
	bpf_disable_instrumentation();
	rcu_read_lock();
again_nocopy:
	dst_key = keys;
	dst_val = values;
	b = &htab->buckets[batch];
	head = &b->head;
	/* do not grab the lock unless need it (bucket_cnt > 0). */
	if (locked) {
		ret = htab_lock_bucket(htab, b, batch, &flags);
		if (ret)
			goto after_loop;
	}

	bucket_cnt = 0;
	hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
		bucket_cnt++;

	if (bucket_cnt && !locked) {
		locked = true;
		goto again_nocopy;
	}

	if (bucket_cnt > (max_count - total)) {
		if (total == 0)
			ret = -ENOSPC;
		/* Note that since bucket_cnt > 0 here, it is implicit
		 * that the locked was grabbed, so release it.
		 */
		htab_unlock_bucket(htab, b, batch, flags);
		rcu_read_unlock();
		bpf_enable_instrumentation();
		goto after_loop;
	}

	if (bucket_cnt > bucket_size) {
		bucket_size = bucket_cnt;
		/* Note that since bucket_cnt > 0 here, it is implicit
		 * that the locked was grabbed, so release it.
		 */
		htab_unlock_bucket(htab, b, batch, flags);
		rcu_read_unlock();
		bpf_enable_instrumentation();
		kvfree(keys);
		kvfree(values);
		goto alloc;
	}

	/* Next block is only safe to run if you have grabbed the lock */
	if (!locked)
		goto next_batch;

	hlist_nulls_for_each_entry_safe(l, n, head, hash_node) {
		memcpy(dst_key, l->key, key_size);

		if (is_percpu) {
			int off = 0, cpu;
			void __percpu *pptr;

			pptr = htab_elem_get_ptr(l, map->key_size);
			for_each_possible_cpu(cpu) {
				bpf_long_memcpy(dst_val + off,
						per_cpu_ptr(pptr, cpu), size);
				off += size;
			}
		} else {
			value = l->key + roundup_key_size;
			if (elem_map_flags & BPF_F_LOCK)
				copy_map_value_locked(map, dst_val, value,
						      true);
			else
				copy_map_value(map, dst_val, value);
			check_and_init_map_lock(map, dst_val);
		}
		if (do_delete) {
			hlist_nulls_del_rcu(&l->hash_node);

			/* bpf_lru_push_free() will acquire lru_lock, which
			 * may cause deadlock. See comments in function
			 * prealloc_lru_pop(). Let us do bpf_lru_push_free()
			 * after releasing the bucket lock.
			 */
			if (is_lru_map) {
				l->batch_flink = node_to_free;
				node_to_free = l;
			} else {
				free_htab_elem(htab, l);
			}
		}
		dst_key += key_size;
		dst_val += value_size;
	}

	htab_unlock_bucket(htab, b, batch, flags);
	locked = false;

	while (node_to_free) {
		l = node_to_free;
		node_to_free = node_to_free->batch_flink;
		bpf_lru_push_free(&htab->lru, &l->lru_node);
	}

next_batch:
	/* If we are not copying data, we can go to next bucket and avoid
	 * unlocking the rcu.
	 */
	if (!bucket_cnt && (batch + 1 < htab->n_buckets)) {
		batch++;
		goto again_nocopy;
	}

	rcu_read_unlock();
	bpf_enable_instrumentation();
	if (bucket_cnt && (copy_to_user(ukeys + total * key_size, keys,
	    key_size * bucket_cnt) ||
	    copy_to_user(uvalues + total * value_size, values,
	    value_size * bucket_cnt))) {
		ret = -EFAULT;
		goto after_loop;
	}

	total += bucket_cnt;
	batch++;
	if (batch >= htab->n_buckets) {
		ret = -ENOENT;
		goto after_loop;
	}
	goto again;

after_loop:
	if (ret == -EFAULT)
		goto out;

	/* copy # of entries and next batch */
	ubatch = u64_to_user_ptr(attr->batch.out_batch);
	if (copy_to_user(ubatch, &batch, sizeof(batch)) ||
	    put_user(total, &uattr->batch.count))
		ret = -EFAULT;

out:
	kvfree(keys);
	kvfree(values);
	return ret;
}
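/*
 * Batch protocol summary (as implemented above): user space starts with
 * in_batch absent (or *in_batch == 0), supplies buffers for up to
 * batch.count keys/values, and gets back the number of copied entries in
 * batch.count plus an opaque cursor in out_batch (here: the next bucket
 * index). -ENOENT signals that the last bucket was reached, -ENOSPC that
 * a single bucket holds more entries than the remaining buffer space.
 * Passing out_batch back as in_batch resumes the walk.
 */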
static int
htab_percpu_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr,
			     union bpf_attr __user *uattr)
{
	return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
						  false, true);
}

static int
htab_percpu_map_lookup_and_delete_batch(struct bpf_map *map,
					const union bpf_attr *attr,
					union bpf_attr __user *uattr)
{
	return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
						  false, true);
}

static int
htab_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr,
		      union bpf_attr __user *uattr)
{
	return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
						  false, false);
}

static int
htab_map_lookup_and_delete_batch(struct bpf_map *map,
				 const union bpf_attr *attr,
				 union bpf_attr __user *uattr)
{
	return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
						  false, false);
}

static int
htab_lru_percpu_map_lookup_batch(struct bpf_map *map,
				 const union bpf_attr *attr,
				 union bpf_attr __user *uattr)
{
	return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
						  true, true);
}

static int
htab_lru_percpu_map_lookup_and_delete_batch(struct bpf_map *map,
					    const union bpf_attr *attr,
					    union bpf_attr __user *uattr)
{
	return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
						  true, true);
}

static int
htab_lru_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr,
			  union bpf_attr __user *uattr)
{
	return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
						  true, false);
}

static int
htab_lru_map_lookup_and_delete_batch(struct bpf_map *map,
				     const union bpf_attr *attr,
				     union bpf_attr __user *uattr)
{
	return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
						  true, false);
}
struct bpf_iter_seq_hash_map_info {
	struct bpf_map *map;
	struct bpf_htab *htab;
	void *percpu_value_buf; // non-zero means percpu hash
	u32 bucket_id;
	u32 skip_elems;
};
static struct htab_elem *
bpf_hash_map_seq_find_next(struct bpf_iter_seq_hash_map_info *info,
			   struct htab_elem *prev_elem)
{
	const struct bpf_htab *htab = info->htab;
	u32 skip_elems = info->skip_elems;
	u32 bucket_id = info->bucket_id;
	struct hlist_nulls_head *head;
	struct hlist_nulls_node *n;
	struct htab_elem *elem;
	struct bucket *b;
	u32 i, count;

	if (bucket_id >= htab->n_buckets)
		return NULL;

	/* try to find next elem in the same bucket */
	if (prev_elem) {
		/* no update/deletion on this bucket, prev_elem should be still valid
		 * and we won't skip elements.
		 */
		n = rcu_dereference_raw(hlist_nulls_next_rcu(&prev_elem->hash_node));
		elem = hlist_nulls_entry_safe(n, struct htab_elem, hash_node);
		if (elem)
			return elem;

		/* not found, unlock and go to the next bucket */
		b = &htab->buckets[bucket_id++];
		rcu_read_unlock();
		skip_elems = 0;
	}

	for (i = bucket_id; i < htab->n_buckets; i++) {
		b = &htab->buckets[i];
		rcu_read_lock();

		count = 0;
		head = &b->head;
		hlist_nulls_for_each_entry_rcu(elem, n, head, hash_node) {
			if (count >= skip_elems) {
				info->bucket_id = i;
				info->skip_elems = count;
				return elem;
			}
			count++;
		}

		rcu_read_unlock();
		skip_elems = 0;
	}

	info->bucket_id = i;
	info->skip_elems = 0;
	return NULL;
}
static void *bpf_hash_map_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct bpf_iter_seq_hash_map_info *info = seq->private;
	struct htab_elem *elem;

	elem = bpf_hash_map_seq_find_next(info, NULL);
	if (!elem)
		return NULL;

	if (*pos == 0)
		++*pos;
	return elem;
}

static void *bpf_hash_map_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct bpf_iter_seq_hash_map_info *info = seq->private;

	++*pos;
	++info->skip_elems;
	return bpf_hash_map_seq_find_next(info, v);
}
static int __bpf_hash_map_seq_show(struct seq_file *seq, struct htab_elem *elem)
{
	struct bpf_iter_seq_hash_map_info *info = seq->private;
	u32 roundup_key_size, roundup_value_size;
	struct bpf_iter__bpf_map_elem ctx = {};
	struct bpf_map *map = info->map;
	struct bpf_iter_meta meta;
	int ret = 0, off = 0, cpu;
	struct bpf_prog *prog;
	void __percpu *pptr;

	meta.seq = seq;
	prog = bpf_iter_get_info(&meta, elem == NULL);
	if (prog) {
		ctx.meta = &meta;
		ctx.map = info->map;
		if (elem) {
			roundup_key_size = round_up(map->key_size, 8);
			ctx.key = elem->key;
			if (!info->percpu_value_buf) {
				ctx.value = elem->key + roundup_key_size;
			} else {
				roundup_value_size = round_up(map->value_size, 8);
				pptr = htab_elem_get_ptr(elem, map->key_size);
				for_each_possible_cpu(cpu) {
					bpf_long_memcpy(info->percpu_value_buf + off,
							per_cpu_ptr(pptr, cpu),
							roundup_value_size);
					off += roundup_value_size;
				}
				ctx.value = info->percpu_value_buf;
			}
		}
		ret = bpf_iter_run_prog(prog, &ctx);
	}

	return ret;
}

static int bpf_hash_map_seq_show(struct seq_file *seq, void *v)
{
	return __bpf_hash_map_seq_show(seq, v);
}

static void bpf_hash_map_seq_stop(struct seq_file *seq, void *v)
{
	if (!v)
		(void)__bpf_hash_map_seq_show(seq, NULL);
	else
		rcu_read_unlock();
}
static int bpf_iter_init_hash_map(void *priv_data,
				  struct bpf_iter_aux_info *aux)
{
	struct bpf_iter_seq_hash_map_info *seq_info = priv_data;
	struct bpf_map *map = aux->map;
	void *value_buf;
	u32 buf_size;

	if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
	    map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
		buf_size = round_up(map->value_size, 8) * num_possible_cpus();
		value_buf = kmalloc(buf_size, GFP_USER | __GFP_NOWARN);
		if (!value_buf)
			return -ENOMEM;

		seq_info->percpu_value_buf = value_buf;
	}

	seq_info->map = map;
	seq_info->htab = container_of(map, struct bpf_htab, map);
	return 0;
}

static void bpf_iter_fini_hash_map(void *priv_data)
{
	struct bpf_iter_seq_hash_map_info *seq_info = priv_data;

	kfree(seq_info->percpu_value_buf);
}
static const struct seq_operations bpf_hash_map_seq_ops = {
	.start	= bpf_hash_map_seq_start,
	.next	= bpf_hash_map_seq_next,
	.stop	= bpf_hash_map_seq_stop,
	.show	= bpf_hash_map_seq_show,
};

static const struct bpf_iter_seq_info iter_seq_info = {
	.seq_ops		= &bpf_hash_map_seq_ops,
	.init_seq_private	= bpf_iter_init_hash_map,
	.fini_seq_private	= bpf_iter_fini_hash_map,
	.seq_priv_size		= sizeof(struct bpf_iter_seq_hash_map_info),
};
static int htab_map_btf_id;
const struct bpf_map_ops htab_map_ops = {
	.map_meta_equal = bpf_map_meta_equal,
	.map_alloc_check = htab_map_alloc_check,
	.map_alloc = htab_map_alloc,
	.map_free = htab_map_free,
	.map_get_next_key = htab_map_get_next_key,
	.map_lookup_elem = htab_map_lookup_elem,
	.map_update_elem = htab_map_update_elem,
	.map_delete_elem = htab_map_delete_elem,
	.map_gen_lookup = htab_map_gen_lookup,
	.map_seq_show_elem = htab_map_seq_show_elem,
	BATCH_OPS(htab),
	.map_btf_name = "bpf_htab",
	.map_btf_id = &htab_map_btf_id,
	.iter_seq_info = &iter_seq_info,
};

static int htab_lru_map_btf_id;
const struct bpf_map_ops htab_lru_map_ops = {
	.map_meta_equal = bpf_map_meta_equal,
	.map_alloc_check = htab_map_alloc_check,
	.map_alloc = htab_map_alloc,
	.map_free = htab_map_free,
	.map_get_next_key = htab_map_get_next_key,
	.map_lookup_elem = htab_lru_map_lookup_elem,
	.map_lookup_elem_sys_only = htab_lru_map_lookup_elem_sys,
	.map_update_elem = htab_lru_map_update_elem,
	.map_delete_elem = htab_lru_map_delete_elem,
	.map_gen_lookup = htab_lru_map_gen_lookup,
	.map_seq_show_elem = htab_map_seq_show_elem,
	BATCH_OPS(htab_lru),
	.map_btf_name = "bpf_htab",
	.map_btf_id = &htab_lru_map_btf_id,
	.iter_seq_info = &iter_seq_info,
};
/* Called from eBPF program */
static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct htab_elem *l = __htab_map_lookup_elem(map, key);

	if (l)
		return this_cpu_ptr(htab_elem_get_ptr(l, map->key_size));
	else
		return NULL;
}

static void *htab_lru_percpu_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct htab_elem *l = __htab_map_lookup_elem(map, key);

	if (l) {
		bpf_lru_node_set_ref(&l->lru_node);
		return this_cpu_ptr(htab_elem_get_ptr(l, map->key_size));
	}

	return NULL;
}
int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value)
{
	struct htab_elem *l;
	void __percpu *pptr;
	int ret = -ENOENT;
	int cpu, off = 0;
	u32 size;

	/* per_cpu areas are zero-filled and bpf programs can only
	 * access 'value_size' of them, so copying rounded areas
	 * will not leak any kernel data
	 */
	size = round_up(map->value_size, 8);
	rcu_read_lock();
	l = __htab_map_lookup_elem(map, key);
	if (!l)
		goto out;
	/* We do not mark LRU map element here in order to not mess up
	 * eviction heuristics when user space does a map walk.
	 */
	pptr = htab_elem_get_ptr(l, map->key_size);
	for_each_possible_cpu(cpu) {
		bpf_long_memcpy(value + off,
				per_cpu_ptr(pptr, cpu), size);
		off += size;
	}
	ret = 0;
out:
	rcu_read_unlock();
	return ret;
}
int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value,
			   u64 map_flags)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	int ret;

	rcu_read_lock();
	if (htab_is_lru(htab))
		ret = __htab_lru_percpu_map_update_elem(map, key, value,
							map_flags, true);
	else
		ret = __htab_percpu_map_update_elem(map, key, value, map_flags,
						    true);
	rcu_read_unlock();

	return ret;
}
static void htab_percpu_map_seq_show_elem(struct bpf_map *map, void *key,
					  struct seq_file *m)
{
	struct htab_elem *l;
	void __percpu *pptr;
	int cpu;

	rcu_read_lock();

	l = __htab_map_lookup_elem(map, key);
	if (!l) {
		rcu_read_unlock();
		return;
	}

	btf_type_seq_show(map->btf, map->btf_key_type_id, key, m);
	seq_puts(m, ": {\n");
	pptr = htab_elem_get_ptr(l, map->key_size);
	for_each_possible_cpu(cpu) {
		seq_printf(m, "\tcpu%d: ", cpu);
		btf_type_seq_show(map->btf, map->btf_value_type_id,
				  per_cpu_ptr(pptr, cpu), m);
		seq_puts(m, "\n");
	}
	seq_puts(m, "}\n");

	rcu_read_unlock();
}
static int htab_percpu_map_btf_id;
const struct bpf_map_ops htab_percpu_map_ops = {
	.map_meta_equal = bpf_map_meta_equal,
	.map_alloc_check = htab_map_alloc_check,
	.map_alloc = htab_map_alloc,
	.map_free = htab_map_free,
	.map_get_next_key = htab_map_get_next_key,
	.map_lookup_elem = htab_percpu_map_lookup_elem,
	.map_update_elem = htab_percpu_map_update_elem,
	.map_delete_elem = htab_map_delete_elem,
	.map_seq_show_elem = htab_percpu_map_seq_show_elem,
	BATCH_OPS(htab_percpu),
	.map_btf_name = "bpf_htab",
	.map_btf_id = &htab_percpu_map_btf_id,
	.iter_seq_info = &iter_seq_info,
};

static int htab_lru_percpu_map_btf_id;
const struct bpf_map_ops htab_lru_percpu_map_ops = {
	.map_meta_equal = bpf_map_meta_equal,
	.map_alloc_check = htab_map_alloc_check,
	.map_alloc = htab_map_alloc,
	.map_free = htab_map_free,
	.map_get_next_key = htab_map_get_next_key,
	.map_lookup_elem = htab_lru_percpu_map_lookup_elem,
	.map_update_elem = htab_lru_percpu_map_update_elem,
	.map_delete_elem = htab_lru_map_delete_elem,
	.map_seq_show_elem = htab_percpu_map_seq_show_elem,
	BATCH_OPS(htab_lru_percpu),
	.map_btf_name = "bpf_htab",
	.map_btf_id = &htab_lru_percpu_map_btf_id,
	.iter_seq_info = &iter_seq_info,
};
static int fd_htab_map_alloc_check(union bpf_attr *attr)
{
	if (attr->value_size != sizeof(u32))
		return -EINVAL;
	return htab_map_alloc_check(attr);
}
static void fd_htab_map_free(struct bpf_map *map)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct hlist_nulls_node *n;
	struct hlist_nulls_head *head;
	struct htab_elem *l;
	int i;

	for (i = 0; i < htab->n_buckets; i++) {
		head = select_bucket(htab, i);

		hlist_nulls_for_each_entry_safe(l, n, head, hash_node) {
			void *ptr = fd_htab_map_get_ptr(map, l);

			map->ops->map_fd_put_ptr(ptr);
		}
	}

	htab_map_free(map);
}
/* only called from syscall */
int bpf_fd_htab_map_lookup_elem(struct bpf_map *map, void *key, u32 *value)
{
	void **ptr;
	int ret = 0;

	if (!map->ops->map_fd_sys_lookup_elem)
		return -ENOTSUPP;

	rcu_read_lock();
	ptr = htab_map_lookup_elem(map, key);
	if (ptr)
		*value = map->ops->map_fd_sys_lookup_elem(READ_ONCE(*ptr));
	else
		ret = -ENOENT;
	rcu_read_unlock();

	return ret;
}
/* only called from syscall */
int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file,
				void *key, void *value, u64 map_flags)
{
	void *ptr;
	int ret;
	u32 ufd = *(u32 *)value;

	ptr = map->ops->map_fd_get_ptr(map, map_file, ufd);
	if (IS_ERR(ptr))
		return PTR_ERR(ptr);

	ret = htab_map_update_elem(map, key, &ptr, map_flags);
	if (ret)
		map->ops->map_fd_put_ptr(ptr);

	return ret;
}
static struct bpf_map *htab_of_map_alloc(union bpf_attr *attr)
{
	struct bpf_map *map, *inner_map_meta;

	inner_map_meta = bpf_map_meta_alloc(attr->inner_map_fd);
	if (IS_ERR(inner_map_meta))
		return inner_map_meta;

	map = htab_map_alloc(attr);
	if (IS_ERR(map)) {
		bpf_map_meta_free(inner_map_meta);
		return map;
	}

	map->inner_map_meta = inner_map_meta;

	return map;
}
static void *htab_of_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct bpf_map **inner_map  = htab_map_lookup_elem(map, key);

	if (!inner_map)
		return NULL;

	return READ_ONCE(*inner_map);
}
static int htab_of_map_gen_lookup(struct bpf_map *map,
				  struct bpf_insn *insn_buf)
{
	struct bpf_insn *insn = insn_buf;
	const int ret = BPF_REG_0;

	BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem,
		     (void *(*)(struct bpf_map *map, void *key))NULL));
	*insn++ = BPF_EMIT_CALL(BPF_CAST_CALL(__htab_map_lookup_elem));
	*insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 2);
	*insn++ = BPF_ALU64_IMM(BPF_ADD, ret,
				offsetof(struct htab_elem, key) +
				round_up(map->key_size, 8));
	*insn++ = BPF_LDX_MEM(BPF_DW, ret, ret, 0);

	return insn - insn_buf;
}
static void htab_of_map_free(struct bpf_map *map)
{
	bpf_map_meta_free(map->inner_map_meta);
	fd_htab_map_free(map);
}
static int htab_of_maps_map_btf_id;
const struct bpf_map_ops htab_of_maps_map_ops = {
	.map_alloc_check = fd_htab_map_alloc_check,
	.map_alloc = htab_of_map_alloc,
	.map_free = htab_of_map_free,
	.map_get_next_key = htab_map_get_next_key,
	.map_lookup_elem = htab_of_map_lookup_elem,
	.map_delete_elem = htab_map_delete_elem,
	.map_fd_get_ptr = bpf_map_fd_get_ptr,
	.map_fd_put_ptr = bpf_map_fd_put_ptr,
	.map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem,
	.map_gen_lookup = htab_of_map_gen_lookup,
	.map_check_btf = map_check_no_btf,
	.map_btf_name = "bpf_htab",
	.map_btf_id = &htab_of_maps_map_btf_id,
};