1 /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
3 * This program is free software; you can redistribute it and/or
4 * modify it under the terms of version 2 of the GNU General Public
5 * License as published by the Free Software Foundation.
7 * This program is distributed in the hope that it will be useful, but
8 * WITHOUT ANY WARRANTY; without even the implied warranty of
9 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
10 * General Public License for more details.
12 #include <linux/bpf.h>
13 #include <linux/jhash.h>
14 #include <linux/filter.h>
15 #include <linux/vmalloc.h>
19 struct hlist_head
*buckets
;
21 u32 count
; /* number of elements in this hashtable */
22 u32 n_buckets
; /* number of hash buckets */
23 u32 elem_size
; /* size of each element in bytes */
26 /* each htab element is struct htab_elem + key + value */
28 struct hlist_node hash_node
;
31 char key
[0] __aligned(8);
34 /* Called from syscall */
35 static struct bpf_map
*htab_map_alloc(union bpf_attr
*attr
)
37 struct bpf_htab
*htab
;
40 htab
= kzalloc(sizeof(*htab
), GFP_USER
);
42 return ERR_PTR(-ENOMEM
);
44 /* mandatory map attributes */
45 htab
->map
.key_size
= attr
->key_size
;
46 htab
->map
.value_size
= attr
->value_size
;
47 htab
->map
.max_entries
= attr
->max_entries
;
49 /* check sanity of attributes.
50 * value_size == 0 may be allowed in the future to use map as a set
53 if (htab
->map
.max_entries
== 0 || htab
->map
.key_size
== 0 ||
54 htab
->map
.value_size
== 0)
57 /* hash table size must be power of 2 */
58 htab
->n_buckets
= roundup_pow_of_two(htab
->map
.max_entries
);
61 if (htab
->map
.key_size
> MAX_BPF_STACK
)
62 /* eBPF programs initialize keys on stack, so they cannot be
63 * larger than max stack size
68 /* prevent zero size kmalloc and check for u32 overflow */
69 if (htab
->n_buckets
== 0 ||
70 htab
->n_buckets
> U32_MAX
/ sizeof(struct hlist_head
))
73 htab
->buckets
= kmalloc_array(htab
->n_buckets
, sizeof(struct hlist_head
),
74 GFP_USER
| __GFP_NOWARN
);
77 htab
->buckets
= vmalloc(htab
->n_buckets
* sizeof(struct hlist_head
));
82 for (i
= 0; i
< htab
->n_buckets
; i
++)
83 INIT_HLIST_HEAD(&htab
->buckets
[i
]);
85 spin_lock_init(&htab
->lock
);
88 htab
->elem_size
= sizeof(struct htab_elem
) +
89 round_up(htab
->map
.key_size
, 8) +
98 static inline u32
htab_map_hash(const void *key
, u32 key_len
)
100 return jhash(key
, key_len
, 0);
103 static inline struct hlist_head
*select_bucket(struct bpf_htab
*htab
, u32 hash
)
105 return &htab
->buckets
[hash
& (htab
->n_buckets
- 1)];
108 static struct htab_elem
*lookup_elem_raw(struct hlist_head
*head
, u32 hash
,
109 void *key
, u32 key_size
)
113 hlist_for_each_entry_rcu(l
, head
, hash_node
)
114 if (l
->hash
== hash
&& !memcmp(&l
->key
, key
, key_size
))
120 /* Called from syscall or from eBPF program */
121 static void *htab_map_lookup_elem(struct bpf_map
*map
, void *key
)
123 struct bpf_htab
*htab
= container_of(map
, struct bpf_htab
, map
);
124 struct hlist_head
*head
;
128 /* Must be called with rcu_read_lock. */
129 WARN_ON_ONCE(!rcu_read_lock_held());
131 key_size
= map
->key_size
;
133 hash
= htab_map_hash(key
, key_size
);
135 head
= select_bucket(htab
, hash
);
137 l
= lookup_elem_raw(head
, hash
, key
, key_size
);
140 return l
->key
+ round_up(map
->key_size
, 8);
145 /* Called from syscall */
146 static int htab_map_get_next_key(struct bpf_map
*map
, void *key
, void *next_key
)
148 struct bpf_htab
*htab
= container_of(map
, struct bpf_htab
, map
);
149 struct hlist_head
*head
;
150 struct htab_elem
*l
, *next_l
;
154 WARN_ON_ONCE(!rcu_read_lock_held());
156 key_size
= map
->key_size
;
158 hash
= htab_map_hash(key
, key_size
);
160 head
= select_bucket(htab
, hash
);
163 l
= lookup_elem_raw(head
, hash
, key
, key_size
);
167 goto find_first_elem
;
170 /* key was found, get next key in the same bucket */
171 next_l
= hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(&l
->hash_node
)),
172 struct htab_elem
, hash_node
);
175 /* if next elem in this hash list is non-zero, just return it */
176 memcpy(next_key
, next_l
->key
, key_size
);
180 /* no more elements in this hash list, go to the next bucket */
181 i
= hash
& (htab
->n_buckets
- 1);
185 /* iterate over buckets */
186 for (; i
< htab
->n_buckets
; i
++) {
187 head
= select_bucket(htab
, i
);
189 /* pick first element in the bucket */
190 next_l
= hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head
)),
191 struct htab_elem
, hash_node
);
193 /* if it's not empty, just return it */
194 memcpy(next_key
, next_l
->key
, key_size
);
199 /* itereated over all buckets and all elements */
203 /* Called from syscall or from eBPF program */
204 static int htab_map_update_elem(struct bpf_map
*map
, void *key
, void *value
,
207 struct bpf_htab
*htab
= container_of(map
, struct bpf_htab
, map
);
208 struct htab_elem
*l_new
, *l_old
;
209 struct hlist_head
*head
;
214 if (map_flags
> BPF_EXIST
)
218 WARN_ON_ONCE(!rcu_read_lock_held());
220 /* allocate new element outside of lock */
221 l_new
= kmalloc(htab
->elem_size
, GFP_ATOMIC
);
225 key_size
= map
->key_size
;
227 memcpy(l_new
->key
, key
, key_size
);
228 memcpy(l_new
->key
+ round_up(key_size
, 8), value
, map
->value_size
);
230 l_new
->hash
= htab_map_hash(l_new
->key
, key_size
);
232 /* bpf_map_update_elem() can be called in_irq() */
233 spin_lock_irqsave(&htab
->lock
, flags
);
235 head
= select_bucket(htab
, l_new
->hash
);
237 l_old
= lookup_elem_raw(head
, l_new
->hash
, key
, key_size
);
239 if (!l_old
&& unlikely(htab
->count
>= map
->max_entries
)) {
240 /* if elem with this 'key' doesn't exist and we've reached
241 * max_entries limit, fail insertion of new elem
247 if (l_old
&& map_flags
== BPF_NOEXIST
) {
248 /* elem already exists */
253 if (!l_old
&& map_flags
== BPF_EXIST
) {
254 /* elem doesn't exist, cannot update it */
259 /* add new element to the head of the list, so that concurrent
260 * search will find it before old elem
262 hlist_add_head_rcu(&l_new
->hash_node
, head
);
264 hlist_del_rcu(&l_old
->hash_node
);
265 kfree_rcu(l_old
, rcu
);
269 spin_unlock_irqrestore(&htab
->lock
, flags
);
273 spin_unlock_irqrestore(&htab
->lock
, flags
);
278 /* Called from syscall or from eBPF program */
279 static int htab_map_delete_elem(struct bpf_map
*map
, void *key
)
281 struct bpf_htab
*htab
= container_of(map
, struct bpf_htab
, map
);
282 struct hlist_head
*head
;
288 WARN_ON_ONCE(!rcu_read_lock_held());
290 key_size
= map
->key_size
;
292 hash
= htab_map_hash(key
, key_size
);
294 spin_lock_irqsave(&htab
->lock
, flags
);
296 head
= select_bucket(htab
, hash
);
298 l
= lookup_elem_raw(head
, hash
, key
, key_size
);
301 hlist_del_rcu(&l
->hash_node
);
307 spin_unlock_irqrestore(&htab
->lock
, flags
);
311 static void delete_all_elements(struct bpf_htab
*htab
)
315 for (i
= 0; i
< htab
->n_buckets
; i
++) {
316 struct hlist_head
*head
= select_bucket(htab
, i
);
317 struct hlist_node
*n
;
320 hlist_for_each_entry_safe(l
, n
, head
, hash_node
) {
321 hlist_del_rcu(&l
->hash_node
);
328 /* Called when map->refcnt goes to zero, either from workqueue or from syscall */
329 static void htab_map_free(struct bpf_map
*map
)
331 struct bpf_htab
*htab
= container_of(map
, struct bpf_htab
, map
);
333 /* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
334 * so the programs (can be more than one that used this map) were
335 * disconnected from events. Wait for outstanding critical sections in
336 * these programs to complete
340 /* some of kfree_rcu() callbacks for elements of this map may not have
341 * executed. It's ok. Proceed to free residual elements and map itself
343 delete_all_elements(htab
);
344 kvfree(htab
->buckets
);
348 static const struct bpf_map_ops htab_ops
= {
349 .map_alloc
= htab_map_alloc
,
350 .map_free
= htab_map_free
,
351 .map_get_next_key
= htab_map_get_next_key
,
352 .map_lookup_elem
= htab_map_lookup_elem
,
353 .map_update_elem
= htab_map_update_elem
,
354 .map_delete_elem
= htab_map_delete_elem
,
357 static struct bpf_map_type_list htab_type __read_mostly
= {
359 .type
= BPF_MAP_TYPE_HASH
,
362 static int __init
register_htab_map(void)
364 bpf_register_map_type(&htab_type
);
367 late_initcall(register_htab_map
);