/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 */
#include <linux/bpf.h>
#include <linux/jhash.h>
#include <linux/filter.h>
#include <linux/vmalloc.h>
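
/* run-time state of one hash map: the generic bpf_map header, the bucket
 * array, a per-map lock and element bookkeeping
 */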
struct bpf_htab {
	struct bpf_map map;
	struct hlist_head *buckets;
	raw_spinlock_t lock;
	u32 count;	/* number of elements in this hashtable */
	u32 n_buckets;	/* number of hash buckets */
	u32 elem_size;	/* size of each element in bytes */
};
/* each htab element is struct htab_elem + key + value */
struct htab_elem {
	struct hlist_node hash_node;
	struct rcu_head rcu;
	u32 hash;
	char key[0] __aligned(8);
};
/* Called from syscall */
static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
{
	struct bpf_htab *htab;
	int err, i;

	htab = kzalloc(sizeof(*htab), GFP_USER);
	if (!htab)
		return ERR_PTR(-ENOMEM);
	/* mandatory map attributes */
	htab->map.key_size = attr->key_size;
	htab->map.value_size = attr->value_size;
	htab->map.max_entries = attr->max_entries;
	/* check sanity of attributes.
	 * value_size == 0 may be allowed in the future to use map as a set
	 */
	err = -EINVAL;
	if (htab->map.max_entries == 0 || htab->map.key_size == 0 ||
	    htab->map.value_size == 0)
		goto free_htab;
	/* hash table size must be power of 2 */
	htab->n_buckets = roundup_pow_of_two(htab->map.max_entries);
	err = -E2BIG;
	if (htab->map.key_size > MAX_BPF_STACK)
		/* eBPF programs initialize keys on stack, so they cannot be
		 * larger than max stack size
		 */
		goto free_htab;

	if (htab->map.value_size >= (1 << (KMALLOC_SHIFT_MAX - 1)) -
	    MAX_BPF_STACK - sizeof(struct htab_elem))
		/* if value_size is bigger, the user space won't be able to
		 * access the elements via bpf syscall. This check also makes
		 * sure that the elem_size doesn't overflow and it's
		 * kmalloc-able later in htab_map_update_elem()
		 */
		goto free_htab;
	htab->elem_size = sizeof(struct htab_elem) +
			  round_up(htab->map.key_size, 8) +
			  htab->map.value_size;
	/* prevent zero size kmalloc and check for u32 overflow */
	if (htab->n_buckets == 0 ||
	    htab->n_buckets > U32_MAX / sizeof(struct hlist_head))
		goto free_htab;

	if ((u64) htab->n_buckets * sizeof(struct hlist_head) +
	    (u64) htab->elem_size * htab->map.max_entries >=
	    U32_MAX - PAGE_SIZE)
		/* make sure page count doesn't overflow */
		goto free_htab;
	htab->map.pages = round_up(htab->n_buckets * sizeof(struct hlist_head) +
				   htab->elem_size * htab->map.max_entries,
				   PAGE_SIZE) >> PAGE_SHIFT;
	err = -ENOMEM;
	htab->buckets = kmalloc_array(htab->n_buckets, sizeof(struct hlist_head),
				      GFP_USER | __GFP_NOWARN);

	if (!htab->buckets) {
		htab->buckets = vmalloc(htab->n_buckets * sizeof(struct hlist_head));
		if (!htab->buckets)
			goto free_htab;
	}
	for (i = 0; i < htab->n_buckets; i++)
		INIT_HLIST_HEAD(&htab->buckets[i]);

	raw_spin_lock_init(&htab->lock);
	htab->count = 0;

	return &htab->map;

free_htab:
	kfree(htab);
	return ERR_PTR(err);
}
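
/* hash the raw key bytes; a fixed seed of 0 keeps the bucket choice
 * deterministic for a given key
 */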
static inline u32 htab_map_hash(const void *key, u32 key_len)
{
	return jhash(key, key_len, 0);
}
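
/* n_buckets is a power of two, so masking with (n_buckets - 1) maps the hash
 * to a valid bucket index
 */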
static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash)
{
	return &htab->buckets[hash & (htab->n_buckets - 1)];
}
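
/* walk one bucket under RCU and return the element whose hash and key both
 * match, or NULL if no such element exists
 */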
static struct htab_elem *lookup_elem_raw(struct hlist_head *head, u32 hash,
					 void *key, u32 key_size)
{
	struct htab_elem *l;

	hlist_for_each_entry_rcu(l, head, hash_node)
		if (l->hash == hash && !memcmp(&l->key, key, key_size))
			return l;

	return NULL;
}
/* Called from syscall or from eBPF program */
static void *htab_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct hlist_head *head;
	struct htab_elem *l;
	u32 hash, key_size;

	/* Must be called with rcu_read_lock. */
	WARN_ON_ONCE(!rcu_read_lock_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size);

	head = select_bucket(htab, hash);

	l = lookup_elem_raw(head, hash, key, key_size);

	if (l)
		return l->key + round_up(map->key_size, 8);

	return NULL;
}
/* Called from syscall */
static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct hlist_head *head;
	struct htab_elem *l, *next_l;
	u32 hash, key_size;
	int i;

	WARN_ON_ONCE(!rcu_read_lock_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size);

	head = select_bucket(htab, hash);

	/* lookup the key */
	l = lookup_elem_raw(head, hash, key, key_size);

	if (!l) {
		i = 0;
		goto find_first_elem;
	}

	/* key was found, get next key in the same bucket */
	next_l = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(&l->hash_node)),
				  struct htab_elem, hash_node);

	if (next_l) {
		/* if next elem in this hash list is non-zero, just return it */
		memcpy(next_key, next_l->key, key_size);
		return 0;
	}

	/* no more elements in this hash list, go to the next bucket */
	i = hash & (htab->n_buckets - 1);
	i++;

find_first_elem:
	/* iterate over buckets */
	for (; i < htab->n_buckets; i++) {
		head = select_bucket(htab, i);

		/* pick first element in the bucket */
		next_l = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),
					  struct htab_elem, hash_node);
		if (next_l) {
			/* if it's not empty, just return it */
			memcpy(next_key, next_l->key, key_size);
			return 0;
		}
	}

	/* iterated over all buckets and all elements */
	return -ENOENT;
}
/* Called from syscall or from eBPF program */
static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
				u64 map_flags)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct htab_elem *l_new, *l_old;
	struct hlist_head *head;
	unsigned long flags;
	u32 key_size;
	int ret;

	if (map_flags > BPF_EXIST)
		/* unknown flags */
		return -EINVAL;

	WARN_ON_ONCE(!rcu_read_lock_held());

	/* allocate new element outside of lock */
	l_new = kmalloc(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN);
	if (!l_new)
		return -ENOMEM;

	key_size = map->key_size;

	memcpy(l_new->key, key, key_size);
	memcpy(l_new->key + round_up(key_size, 8), value, map->value_size);

	l_new->hash = htab_map_hash(l_new->key, key_size);

	/* bpf_map_update_elem() can be called in_irq() */
	raw_spin_lock_irqsave(&htab->lock, flags);

	head = select_bucket(htab, l_new->hash);

	l_old = lookup_elem_raw(head, l_new->hash, key, key_size);

	if (!l_old && unlikely(htab->count >= map->max_entries)) {
		/* if elem with this 'key' doesn't exist and we've reached
		 * max_entries limit, fail insertion of new elem
		 */
		ret = -E2BIG;
		goto err;
	}

	if (l_old && map_flags == BPF_NOEXIST) {
		/* elem already exists */
		ret = -EEXIST;
		goto err;
	}

	if (!l_old && map_flags == BPF_EXIST) {
		/* elem doesn't exist, cannot update it */
		ret = -ENOENT;
		goto err;
	}

	/* add new element to the head of the list, so that concurrent
	 * search will find it before old elem
	 */
	hlist_add_head_rcu(&l_new->hash_node, head);
	if (l_old) {
		hlist_del_rcu(&l_old->hash_node);
		kfree_rcu(l_old, rcu);
	} else {
		htab->count++;
	}
	raw_spin_unlock_irqrestore(&htab->lock, flags);

	return 0;
err:
	raw_spin_unlock_irqrestore(&htab->lock, flags);
	kfree(l_new);
	return ret;
}
/* Called from syscall or from eBPF program */
static int htab_map_delete_elem(struct bpf_map *map, void *key)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct hlist_head *head;
	struct htab_elem *l;
	unsigned long flags;
	u32 hash, key_size;
	int ret = -ENOENT;

	WARN_ON_ONCE(!rcu_read_lock_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size);

	raw_spin_lock_irqsave(&htab->lock, flags);

	head = select_bucket(htab, hash);

	l = lookup_elem_raw(head, hash, key, key_size);

	if (l) {
		hlist_del_rcu(&l->hash_node);
		htab->count--;
		kfree_rcu(l, rcu);
		ret = 0;
	}

	raw_spin_unlock_irqrestore(&htab->lock, flags);
	return ret;
}
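
/* called only on map teardown after an RCU grace period has elapsed, so
 * elements can be freed immediately with kfree() instead of kfree_rcu()
 */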
static void delete_all_elements(struct bpf_htab *htab)
{
	int i;

	for (i = 0; i < htab->n_buckets; i++) {
		struct hlist_head *head = select_bucket(htab, i);
		struct hlist_node *n;
		struct htab_elem *l;

		hlist_for_each_entry_safe(l, n, head, hash_node) {
			hlist_del_rcu(&l->hash_node);
			htab->count--;
			kfree(l);
		}
	}
}
/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
static void htab_map_free(struct bpf_map *map)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);

	/* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
	 * so the programs (can be more than one that used this map) were
	 * disconnected from events. Wait for outstanding critical sections in
	 * these programs to complete
	 */
	synchronize_rcu();

	/* some of kfree_rcu() callbacks for elements of this map may not have
	 * executed. It's ok. Proceed to free residual elements and map itself
	 */
	delete_all_elements(htab);
	kvfree(htab->buckets);
	kfree(htab);
}
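
/* operations table plugged into the generic bpf map infrastructure */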
static const struct bpf_map_ops htab_ops = {
	.map_alloc = htab_map_alloc,
	.map_free = htab_map_free,
	.map_get_next_key = htab_map_get_next_key,
	.map_lookup_elem = htab_map_lookup_elem,
	.map_update_elem = htab_map_update_elem,
	.map_delete_elem = htab_map_delete_elem,
};
static struct bpf_map_type_list htab_type __read_mostly = {
	.ops = &htab_ops,
	.type = BPF_MAP_TYPE_HASH,
};
static int __init register_htab_map(void)
{
	bpf_register_map_type(&htab_type);
	return 0;
}
late_initcall(register_htab_map);
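
/* Illustrative userspace sketch (not part of this file; the field values are
 * only examples): a map of this type is created through the bpf(2) syscall,
 * e.g.
 *
 *	union bpf_attr attr = {
 *		.map_type    = BPF_MAP_TYPE_HASH,
 *		.key_size    = sizeof(__u32),
 *		.value_size  = sizeof(__u64),
 *		.max_entries = 1024,
 *	};
 *	int map_fd = syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
 *
 * which reaches htab_map_alloc() above through the registered htab_ops.
 */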