/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 */
#include <linux/bpf.h>
#include <linux/jhash.h>
#include <linux/filter.h>
#include <linux/vmalloc.h>
struct bucket {
	struct hlist_head head;
	raw_spinlock_t lock;
};

struct bpf_htab {
	struct bpf_map map;
	struct bucket *buckets;
	atomic_t count;		/* number of elements in this hashtable */
	u32 n_buckets;		/* number of hash buckets */
	u32 elem_size;		/* size of each element in bytes */
};

/* each htab element is struct htab_elem + key + value */
struct htab_elem {
	struct hlist_node hash_node;
	struct rcu_head rcu;
	u32 hash;
	char key[0] __aligned(8);
};
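/* Layout note: the key starts at an 8-byte aligned offset right after
 * struct htab_elem, and the value is stored at key + round_up(key_size, 8).
 * Lookup returns a pointer straight into that value area.
 */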
/* Called from syscall */
static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
{
	struct bpf_htab *htab;
	int err, i;

	htab = kzalloc(sizeof(*htab), GFP_USER);
	if (!htab)
		return ERR_PTR(-ENOMEM);
	/* mandatory map attributes */
	htab->map.key_size = attr->key_size;
	htab->map.value_size = attr->value_size;
	htab->map.max_entries = attr->max_entries;
	/* check sanity of attributes.
	 * value_size == 0 may be allowed in the future to use map as a set
	 */
	err = -EINVAL;
	if (htab->map.max_entries == 0 || htab->map.key_size == 0 ||
	    htab->map.value_size == 0)
		goto free_htab;

	/* hash table size must be power of 2 */
	htab->n_buckets = roundup_pow_of_two(htab->map.max_entries);
	err = -E2BIG;
	if (htab->map.key_size > MAX_BPF_STACK)
		/* eBPF programs initialize keys on stack, so they cannot be
		 * larger than max stack size
		 */
		goto free_htab;

	if (htab->map.value_size >= (1 << (KMALLOC_SHIFT_MAX - 1)) -
	    MAX_BPF_STACK - sizeof(struct htab_elem))
		/* if value_size is bigger, the user space won't be able to
		 * access the elements via bpf syscall. This check also makes
		 * sure that the elem_size doesn't overflow and it's
		 * kmalloc-able later in htab_map_update_elem()
		 */
		goto free_htab;
	htab->elem_size = sizeof(struct htab_elem) +
			  round_up(htab->map.key_size, 8) +
			  htab->map.value_size;
	/* prevent zero size kmalloc and check for u32 overflow */
	if (htab->n_buckets == 0 ||
	    htab->n_buckets > U32_MAX / sizeof(struct bucket))
		goto free_htab;

	if ((u64) htab->n_buckets * sizeof(struct bucket) +
	    (u64) htab->elem_size * htab->map.max_entries >=
	    U32_MAX - PAGE_SIZE)
		/* make sure page count doesn't overflow */
		goto free_htab;
	htab->map.pages = round_up(htab->n_buckets * sizeof(struct bucket) +
				   htab->elem_size * htab->map.max_entries,
				   PAGE_SIZE) >> PAGE_SHIFT;
	err = -ENOMEM;
	htab->buckets = kmalloc_array(htab->n_buckets, sizeof(struct bucket),
				      GFP_USER | __GFP_NOWARN);

	if (!htab->buckets) {
		htab->buckets = vmalloc(htab->n_buckets * sizeof(struct bucket));
		if (!htab->buckets)
			goto free_htab;
	}
	for (i = 0; i < htab->n_buckets; i++) {
		INIT_HLIST_HEAD(&htab->buckets[i].head);
		raw_spin_lock_init(&htab->buckets[i].lock);
	}

	atomic_set(&htab->count, 0);

	return &htab->map;

free_htab:
	kfree(htab);
	return ERR_PTR(err);
}
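/* Sizing example (illustrative values): with key_size = 4, value_size = 8 and
 * max_entries = 1000, n_buckets is rounded up to 1024 and each element takes
 * sizeof(struct htab_elem) + 8 + 8 bytes; map.pages covers the bucket array
 * plus room for max_entries such elements, rounded up to whole pages.
 */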
static inline u32 htab_map_hash(const void *key, u32 key_len)
{
	return jhash(key, key_len, 0);
}
static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash)
{
	return &htab->buckets[hash & (htab->n_buckets - 1)];
}
static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash)
{
	return &__select_bucket(htab, hash)->head;
}
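/* Since n_buckets is a power of two, masking with (n_buckets - 1) picks the
 * bucket without a modulo, e.g. with n_buckets = 1024: bucket = hash & 1023.
 */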
static struct htab_elem *lookup_elem_raw(struct hlist_head *head, u32 hash,
					 void *key, u32 key_size)
{
	struct htab_elem *l;

	hlist_for_each_entry_rcu(l, head, hash_node)
		if (l->hash == hash && !memcmp(&l->key, key, key_size))
			return l;

	return NULL;
}
/* Called from syscall or from eBPF program */
static void *htab_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct hlist_head *head;
	struct htab_elem *l;
	u32 hash, key_size;

	/* Must be called with rcu_read_lock. */
	WARN_ON_ONCE(!rcu_read_lock_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size);

	head = select_bucket(htab, hash);

	l = lookup_elem_raw(head, hash, key, key_size);

	if (l)
		return l->key + round_up(map->key_size, 8);

	return NULL;
}
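/* The pointer returned above points at the value area of the element
 * (key + round_up(key_size, 8)).  It stays valid only for as long as the
 * caller's RCU read-side critical section lasts, since a concurrent update
 * or delete frees the element via kfree_rcu().
 */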
/* Called from syscall */
static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct hlist_head *head;
	struct htab_elem *l, *next_l;
	u32 hash, key_size;
	int i;

	WARN_ON_ONCE(!rcu_read_lock_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size);

	head = select_bucket(htab, hash);

	/* lookup the key */
	l = lookup_elem_raw(head, hash, key, key_size);

	if (!l) {
		i = 0;
		goto find_first_elem;
	}

	/* key was found, get next key in the same bucket */
	next_l = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(&l->hash_node)),
				  struct htab_elem, hash_node);

	if (next_l) {
		/* if next elem in this hash list is non-zero, just return it */
		memcpy(next_key, next_l->key, key_size);
		return 0;
	}

	/* no more elements in this hash list, go to the next bucket */
	i = hash & (htab->n_buckets - 1);
	i++;

find_first_elem:
	/* iterate over buckets */
	for (; i < htab->n_buckets; i++) {
		head = select_bucket(htab, i);

		/* pick first element in the bucket */
		next_l = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),
					  struct htab_elem, hash_node);
		if (next_l) {
			/* if it's not empty, just return it */
			memcpy(next_key, next_l->key, key_size);
			return 0;
		}
	}

	/* iterated over all buckets and all elements */
	return -ENOENT;
}
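/* Iteration order here is bucket order, then position within each bucket's
 * list; it is unrelated to insertion order.  If the supplied key is not
 * found, the walk falls through to find_first_elem and restarts from the
 * first bucket.
 */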
/* Called from syscall or from eBPF program */
static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
				u64 map_flags)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct htab_elem *l_new, *l_old;
	struct hlist_head *head;
	struct bucket *b;
	unsigned long flags;
	u32 key_size;
	int ret;

	if (map_flags > BPF_EXIST)
		/* unknown flags */
		return -EINVAL;

	WARN_ON_ONCE(!rcu_read_lock_held());

	/* allocate new element outside of lock */
	l_new = kmalloc(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN);
	if (!l_new)
		return -ENOMEM;

	key_size = map->key_size;

	memcpy(l_new->key, key, key_size);
	memcpy(l_new->key + round_up(key_size, 8), value, map->value_size);

	l_new->hash = htab_map_hash(l_new->key, key_size);
	b = __select_bucket(htab, l_new->hash);
	head = &b->head;

	/* bpf_map_update_elem() can be called in_irq() */
	raw_spin_lock_irqsave(&b->lock, flags);

	l_old = lookup_elem_raw(head, l_new->hash, key, key_size);

	if (!l_old && unlikely(atomic_read(&htab->count) >= map->max_entries)) {
		/* if elem with this 'key' doesn't exist and we've reached
		 * max_entries limit, fail insertion of new elem
		 */
		ret = -E2BIG;
		goto err;
	}

	if (l_old && map_flags == BPF_NOEXIST) {
		/* elem already exists */
		ret = -EEXIST;
		goto err;
	}

	if (!l_old && map_flags == BPF_EXIST) {
		/* elem doesn't exist, cannot update it */
		ret = -ENOENT;
		goto err;
	}

	/* add new element to the head of the list, so that concurrent
	 * search will find it before old elem
	 */
	hlist_add_head_rcu(&l_new->hash_node, head);
	if (l_old) {
		hlist_del_rcu(&l_old->hash_node);
		kfree_rcu(l_old, rcu);
	} else {
		atomic_inc(&htab->count);
	}
	raw_spin_unlock_irqrestore(&b->lock, flags);

	return 0;
err:
	raw_spin_unlock_irqrestore(&b->lock, flags);
	kfree(l_new);
	return ret;
}
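/* Update scheme: the replacement element is built in full outside the bucket
 * lock, then linked at the head of the list before the old one is unlinked,
 * so concurrent RCU readers always see either the old or the new value.  The
 * per-bucket raw spinlock only serializes writers touching the same bucket;
 * the displaced element is freed after a grace period via kfree_rcu().
 */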
/* Called from syscall or from eBPF program */
static int htab_map_delete_elem(struct bpf_map *map, void *key)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct hlist_head *head;
	struct bucket *b;
	struct htab_elem *l;
	unsigned long flags;
	u32 hash, key_size;
	int ret = -ENOENT;

	WARN_ON_ONCE(!rcu_read_lock_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size);
	b = __select_bucket(htab, hash);
	head = &b->head;

	raw_spin_lock_irqsave(&b->lock, flags);

	l = lookup_elem_raw(head, hash, key, key_size);

	if (l) {
		hlist_del_rcu(&l->hash_node);
		atomic_dec(&htab->count);
		kfree_rcu(l, rcu);
		ret = 0;
	}

	raw_spin_unlock_irqrestore(&b->lock, flags);
	return ret;
}
static void delete_all_elements(struct bpf_htab *htab)
{
	int i;

	for (i = 0; i < htab->n_buckets; i++) {
		struct hlist_head *head = select_bucket(htab, i);
		struct hlist_node *n;
		struct htab_elem *l;

		hlist_for_each_entry_safe(l, n, head, hash_node) {
			hlist_del_rcu(&l->hash_node);
			atomic_dec(&htab->count);
			kfree(l);
		}
	}
}
/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
static void htab_map_free(struct bpf_map *map)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);

	/* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
	 * so the programs (can be more than one that used this map) were
	 * disconnected from events. Wait for outstanding critical sections in
	 * these programs to complete
	 */
	synchronize_rcu();

	/* some of kfree_rcu() callbacks for elements of this map may not have
	 * executed. It's ok. Proceed to free residual elements and map itself
	 */
	delete_all_elements(htab);
	kvfree(htab->buckets);
	kfree(htab);
}
static const struct bpf_map_ops htab_ops = {
	.map_alloc = htab_map_alloc,
	.map_free = htab_map_free,
	.map_get_next_key = htab_map_get_next_key,
	.map_lookup_elem = htab_map_lookup_elem,
	.map_update_elem = htab_map_update_elem,
	.map_delete_elem = htab_map_delete_elem,
};

static struct bpf_map_type_list htab_type __read_mostly = {
	.ops = &htab_ops,
	.type = BPF_MAP_TYPE_HASH,
};

static int __init register_htab_map(void)
{
	bpf_register_map_type(&htab_type);
	return 0;
}
late_initcall(register_htab_map);
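/* Illustrative user-space sketch (not part of this file): creating a hash map
 * of this type through the raw bpf(2) syscall.  Field names come from union
 * bpf_attr in <linux/bpf.h>; the key/value sizes and max_entries below are
 * arbitrary example values.
 *
 *	union bpf_attr attr = {
 *		.map_type    = BPF_MAP_TYPE_HASH,
 *		.key_size    = 4,
 *		.value_size  = 8,
 *		.max_entries = 1024,
 *	};
 *	int map_fd = syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
 *
 * Element operations (BPF_MAP_UPDATE_ELEM, BPF_MAP_LOOKUP_ELEM,
 * BPF_MAP_DELETE_ELEM, BPF_MAP_GET_NEXT_KEY) go through the same syscall with
 * attr.map_fd set to map_fd and attr.key / attr.value pointing to user
 * buffers, and end up in the htab_ops callbacks registered above.
 */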