// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2019 Facebook  */
#include <linux/rculist.h>
#include <linux/list.h>
#include <linux/hash.h>
#include <linux/types.h>
#include <linux/spinlock.h>
#include <linux/bpf.h>
#include <linux/btf_ids.h>
#include <linux/bpf_local_storage.h>
#include <net/sock.h>
#include <uapi/linux/sock_diag.h>
#include <uapi/linux/btf.h>

#define BPF_LOCAL_STORAGE_CREATE_FLAG_MASK (BPF_F_NO_PREALLOC | BPF_F_CLONE)
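
/* Context note (an assumption about callers outside this file):
 * BPF_F_NO_PREALLOC is mandatory for local storage maps, while BPF_F_CLONE
 * is merely accepted here; only an owner-specific front end such as
 * sk-storage is expected to act on it when the owner is cloned.
 */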

static struct bpf_local_storage_map_bucket *
select_bucket(struct bpf_local_storage_map *smap,
	      struct bpf_local_storage_elem *selem)
{
	return &smap->buckets[hash_ptr(selem, smap->bucket_log)];
}

static int mem_charge(struct bpf_local_storage_map *smap, void *owner, u32 size)
{
	struct bpf_map *map = &smap->map;

	if (!map->ops->map_local_storage_charge)
		return 0;

	return map->ops->map_local_storage_charge(smap, owner, size);
}

static void mem_uncharge(struct bpf_local_storage_map *smap, void *owner,
			 u32 size)
{
	struct bpf_map *map = &smap->map;

	if (map->ops->map_local_storage_uncharge)
		map->ops->map_local_storage_uncharge(smap, owner, size);
}
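
/* mem_charge()/mem_uncharge() only delegate accounting to the owner-specific
 * map ops.  As an illustration (an assumption about callers outside this
 * file), the sk-storage front end charges the bytes against the socket's
 * optmem budget; a front end without such a budget simply omits the
 * callbacks, in which case mem_charge() succeeds unconditionally.
 */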

static struct bpf_local_storage __rcu **
owner_storage(struct bpf_local_storage_map *smap, void *owner)
{
	struct bpf_map *map = &smap->map;

	return map->ops->map_owner_storage_ptr(owner);
}

static bool selem_linked_to_storage(const struct bpf_local_storage_elem *selem)
{
	return !hlist_unhashed(&selem->snode);
}

static bool selem_linked_to_map(const struct bpf_local_storage_elem *selem)
{
	return !hlist_unhashed(&selem->map_node);
}

struct bpf_local_storage_elem *
bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
		void *value, bool charge_mem)
{
	struct bpf_local_storage_elem *selem;

	if (charge_mem && mem_charge(smap, owner, smap->elem_size))
		return NULL;

	selem = bpf_map_kzalloc(&smap->map, smap->elem_size,
				GFP_ATOMIC | __GFP_NOWARN);
	if (selem) {
		if (value)
			memcpy(SDATA(selem)->data, value, smap->map.value_size);
		return selem;
	}

	if (charge_mem)
		mem_uncharge(smap, owner, smap->elem_size);

	return NULL;
}

/* local_storage->lock must be held and selem->local_storage == local_storage.
 * The caller must ensure selem->smap is still valid to be
 * dereferenced for its smap->elem_size and smap->cache_idx.
 */
bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage,
				     struct bpf_local_storage_elem *selem,
				     bool uncharge_mem)
{
	struct bpf_local_storage_map *smap;
	bool free_local_storage;
	void *owner;

	smap = rcu_dereference(SDATA(selem)->smap);
	owner = local_storage->owner;

	/* All uncharging on the owner must be done first.
	 * The owner may be freed once the last selem is unlinked
	 * from local_storage.
	 */
	if (uncharge_mem)
		mem_uncharge(smap, owner, smap->elem_size);

	free_local_storage = hlist_is_singular_node(&selem->snode,
						    &local_storage->list);
	if (free_local_storage) {
		mem_uncharge(smap, owner, sizeof(struct bpf_local_storage));
		local_storage->owner = NULL;

		/* After this RCU_INIT, owner may be freed and cannot be used */
		RCU_INIT_POINTER(*owner_storage(smap, owner), NULL);

		/* local_storage is not freed now.  local_storage->lock is
		 * still held and raw_spin_unlock_bh(&local_storage->lock)
		 * will be done by the caller.
		 *
		 * Although the unlock will be done under
		 * rcu_read_lock(), it is more intuitive to
		 * read if kfree_rcu(local_storage, rcu) is done
		 * after the raw_spin_unlock_bh(&local_storage->lock).
		 *
		 * Hence, a "bool free_local_storage" is returned
		 * to the caller which then calls the kfree_rcu()
		 * after unlock.
		 */
	}
	hlist_del_init_rcu(&selem->snode);
	if (rcu_access_pointer(local_storage->cache[smap->cache_idx]) ==
	    SDATA(selem))
		RCU_INIT_POINTER(local_storage->cache[smap->cache_idx], NULL);

	kfree_rcu(selem, rcu);

	return free_local_storage;
}

static void __bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem)
{
	struct bpf_local_storage *local_storage;
	bool free_local_storage = false;

	if (unlikely(!selem_linked_to_storage(selem)))
		/* selem has already been unlinked from sk */
		return;

	local_storage = rcu_dereference(selem->local_storage);
	raw_spin_lock_bh(&local_storage->lock);
	if (likely(selem_linked_to_storage(selem)))
		free_local_storage = bpf_selem_unlink_storage_nolock(
			local_storage, selem, true);
	raw_spin_unlock_bh(&local_storage->lock);

	if (free_local_storage)
		kfree_rcu(local_storage, rcu);
}

void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage,
				   struct bpf_local_storage_elem *selem)
{
	RCU_INIT_POINTER(selem->local_storage, local_storage);
	hlist_add_head_rcu(&selem->snode, &local_storage->list);
}

void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem)
{
	struct bpf_local_storage_map *smap;
	struct bpf_local_storage_map_bucket *b;

	if (unlikely(!selem_linked_to_map(selem)))
		/* selem has already been unlinked from smap */
		return;

	smap = rcu_dereference(SDATA(selem)->smap);
	b = select_bucket(smap, selem);
	raw_spin_lock_bh(&b->lock);
	if (likely(selem_linked_to_map(selem)))
		hlist_del_init_rcu(&selem->map_node);
	raw_spin_unlock_bh(&b->lock);
}

void bpf_selem_link_map(struct bpf_local_storage_map *smap,
			struct bpf_local_storage_elem *selem)
{
	struct bpf_local_storage_map_bucket *b = select_bucket(smap, selem);

	raw_spin_lock_bh(&b->lock);
	RCU_INIT_POINTER(SDATA(selem)->smap, smap);
	hlist_add_head_rcu(&selem->map_node, &b->list);
	raw_spin_unlock_bh(&b->lock);
}

void bpf_selem_unlink(struct bpf_local_storage_elem *selem)
{
	/* Always unlink from map before unlinking from local_storage
	 * because selem will be freed after successfully unlinked from
	 * the local_storage.
	 */
	bpf_selem_unlink_map(selem);
	__bpf_selem_unlink_storage(selem);
}

struct bpf_local_storage_data *
bpf_local_storage_lookup(struct bpf_local_storage *local_storage,
			 struct bpf_local_storage_map *smap,
			 bool cacheit_lockit)
{
	struct bpf_local_storage_data *sdata;
	struct bpf_local_storage_elem *selem;

	/* Fast path (cache hit) */
	sdata = rcu_dereference(local_storage->cache[smap->cache_idx]);
	if (sdata && rcu_access_pointer(sdata->smap) == smap)
		return sdata;

	/* Slow path (cache miss) */
	hlist_for_each_entry_rcu(selem, &local_storage->list, snode)
		if (rcu_access_pointer(SDATA(selem)->smap) == smap)
			break;

	if (!selem)
		return NULL;

	sdata = SDATA(selem);
	if (cacheit_lockit) {
		/* spinlock is needed to avoid racing with the
		 * parallel delete.  Otherwise, publishing an already
		 * deleted sdata to the cache will become a use-after-free
		 * problem in the next bpf_local_storage_lookup().
		 */
		raw_spin_lock_bh(&local_storage->lock);
		if (selem_linked_to_storage(selem))
			rcu_assign_pointer(local_storage->cache[smap->cache_idx],
					   sdata);
		raw_spin_unlock_bh(&local_storage->lock);
	}

	return sdata;
}
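
/* Illustrative caller (a sketch only, not part of this file's API): an
 * owner-specific front end is expected to resolve its own storage pointer
 * under RCU and then call bpf_local_storage_lookup().  Loosely modeled on
 * the sk-storage front end; the wrapper name and shape below are assumptions.
 *
 *	static struct bpf_local_storage_data *
 *	example_sk_lookup(struct sock *sk, struct bpf_map *map)
 *	{
 *		struct bpf_local_storage *sk_storage;
 *
 *		sk_storage = rcu_dereference(sk->sk_bpf_storage);
 *		if (!sk_storage)
 *			return NULL;
 *		return bpf_local_storage_lookup(sk_storage,
 *						(struct bpf_local_storage_map *)map,
 *						true);
 *	}
 */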

static int check_flags(const struct bpf_local_storage_data *old_sdata,
		       u64 map_flags)
{
	if (old_sdata && (map_flags & ~BPF_F_LOCK) == BPF_NOEXIST)
		/* elem already exists */
		return -EEXIST;

	if (!old_sdata && (map_flags & ~BPF_F_LOCK) == BPF_EXIST)
		/* elem doesn't exist, cannot update it */
		return -ENOENT;

	return 0;
}

int bpf_local_storage_alloc(void *owner,
			    struct bpf_local_storage_map *smap,
			    struct bpf_local_storage_elem *first_selem)
{
	struct bpf_local_storage *prev_storage, *storage;
	struct bpf_local_storage **owner_storage_ptr;
	int err;

	err = mem_charge(smap, owner, sizeof(*storage));
	if (err)
		return err;

	storage = bpf_map_kzalloc(&smap->map, sizeof(*storage),
				  GFP_ATOMIC | __GFP_NOWARN);
	if (!storage) {
		err = -ENOMEM;
		goto uncharge;
	}

	INIT_HLIST_HEAD(&storage->list);
	raw_spin_lock_init(&storage->lock);
	storage->owner = owner;

	bpf_selem_link_storage_nolock(storage, first_selem);
	bpf_selem_link_map(smap, first_selem);

	owner_storage_ptr =
		(struct bpf_local_storage **)owner_storage(smap, owner);
	/* Publish storage to the owner.
	 * Instead of using any lock of the kernel object (i.e. owner),
	 * cmpxchg will work with any kernel object regardless what
	 * the running context is, bh, irq...etc.
	 *
	 * From now on, the owner->storage pointer (e.g. sk->sk_bpf_storage)
	 * is protected by the storage->lock.  Hence, when freeing
	 * the owner->storage, the storage->lock must be held before
	 * setting owner->storage ptr to NULL.
	 */
	prev_storage = cmpxchg(owner_storage_ptr, NULL, storage);
	if (unlikely(prev_storage)) {
		bpf_selem_unlink_map(first_selem);
		err = -EAGAIN;
		goto uncharge;

		/* Note that even first_selem was linked to smap's
		 * bucket->list, first_selem can be freed immediately
		 * (instead of kfree_rcu) because
		 * bpf_local_storage_map_free() does a
		 * synchronize_rcu() before walking the bucket->list.
		 * Hence, no one is accessing selem from the
		 * bucket->list under rcu_read_lock().
		 */
	}

	return 0;

uncharge:
	kfree(storage);
	mem_uncharge(smap, owner, sizeof(*storage));
	return err;
}

/* sk cannot be going away because it is linking new elem
 * to sk->sk_bpf_storage. (i.e. sk->sk_refcnt cannot be 0).
 * Otherwise, it will become a leak (and other memory issues
 * during map destruction).
 */
struct bpf_local_storage_data *
bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
			 void *value, u64 map_flags)
{
	struct bpf_local_storage_data *old_sdata = NULL;
	struct bpf_local_storage_elem *selem;
	struct bpf_local_storage *local_storage;
	int err;

	/* BPF_EXIST and BPF_NOEXIST cannot be both set */
	if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST) ||
	    /* BPF_F_LOCK can only be used in a value with spin_lock */
	    unlikely((map_flags & BPF_F_LOCK) &&
		     !map_value_has_spin_lock(&smap->map)))
		return ERR_PTR(-EINVAL);

	local_storage = rcu_dereference(*owner_storage(smap, owner));
	if (!local_storage || hlist_empty(&local_storage->list)) {
		/* Very first elem for the owner */
		err = check_flags(NULL, map_flags);
		if (err)
			return ERR_PTR(err);

		selem = bpf_selem_alloc(smap, owner, value, true);
		if (!selem)
			return ERR_PTR(-ENOMEM);

		err = bpf_local_storage_alloc(owner, smap, selem);
		if (err) {
			kfree(selem);
			mem_uncharge(smap, owner, smap->elem_size);
			return ERR_PTR(err);
		}

		return SDATA(selem);
	}

	if ((map_flags & BPF_F_LOCK) && !(map_flags & BPF_NOEXIST)) {
		/* Hoping to find an old_sdata to do inline update
		 * such that it can avoid taking the local_storage->lock
		 * and changing the lists.
		 */
		old_sdata =
			bpf_local_storage_lookup(local_storage, smap, false);
		err = check_flags(old_sdata, map_flags);
		if (err)
			return ERR_PTR(err);
		if (old_sdata && selem_linked_to_storage(SELEM(old_sdata))) {
			copy_map_value_locked(&smap->map, old_sdata->data,
					      value, false);
			return old_sdata;
		}
	}

	raw_spin_lock_bh(&local_storage->lock);

	/* Recheck local_storage->list under local_storage->lock */
	if (unlikely(hlist_empty(&local_storage->list))) {
		/* A parallel del is happening and local_storage is going
		 * away.  It has just been checked before, so very
		 * unlikely.  Return instead of retry to keep things
		 * simple.
		 */
		err = -EAGAIN;
		goto unlock_err;
	}

	old_sdata = bpf_local_storage_lookup(local_storage, smap, false);
	err = check_flags(old_sdata, map_flags);
	if (err)
		goto unlock_err;

	if (old_sdata && (map_flags & BPF_F_LOCK)) {
		copy_map_value_locked(&smap->map, old_sdata->data, value,
				      false);
		selem = SELEM(old_sdata);
		goto unlock;
	}

	/* local_storage->lock is held.  Hence, we are sure
	 * we can unlink and uncharge the old_sdata successfully
	 * later.  Hence, instead of charging the new selem now
	 * and then uncharge the old selem later (which may cause
	 * a potential but unnecessary charge failure), avoid taking
	 * a charge at all here (the "!old_sdata" check) and the
	 * old_sdata will not be uncharged later during
	 * bpf_selem_unlink_storage_nolock().
	 */
	selem = bpf_selem_alloc(smap, owner, value, !old_sdata);
	if (!selem) {
		err = -ENOMEM;
		goto unlock_err;
	}

	/* First, link the new selem to the map */
	bpf_selem_link_map(smap, selem);

	/* Second, link (and publish) the new selem to local_storage */
	bpf_selem_link_storage_nolock(local_storage, selem);

	/* Third, remove old selem, SELEM(old_sdata) */
	if (old_sdata) {
		bpf_selem_unlink_map(SELEM(old_sdata));
		bpf_selem_unlink_storage_nolock(local_storage, SELEM(old_sdata),
						false);
	}

unlock:
	raw_spin_unlock_bh(&local_storage->lock);
	return SDATA(selem);

unlock_err:
	raw_spin_unlock_bh(&local_storage->lock);
	return ERR_PTR(err);
}
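
/* Usage sketch (an illustration, not code in this file): a front end's
 * map_update_elem handler is expected to translate its key into the owner
 * object and call bpf_local_storage_update() under rcu_read_lock(), e.g.
 *
 *	sdata = bpf_local_storage_update(sk,
 *					 (struct bpf_local_storage_map *)map,
 *					 value, map_flags);
 *	return PTR_ERR_OR_ZERO(sdata);
 *
 * The socket-based example above mirrors the sk-storage front end; other
 * owners (e.g. inodes) are assumed to follow the same pattern.
 */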

u16 bpf_local_storage_cache_idx_get(struct bpf_local_storage_cache *cache)
{
	u64 min_usage = U64_MAX;
	u16 i, res = 0;

	spin_lock(&cache->idx_lock);

	for (i = 0; i < BPF_LOCAL_STORAGE_CACHE_SIZE; i++) {
		if (cache->idx_usage_counts[i] < min_usage) {
			min_usage = cache->idx_usage_counts[i];
			res = i;

			/* Found a free cache_idx */
			if (!min_usage)
				break;
		}
	}
	cache->idx_usage_counts[res]++;

	spin_unlock(&cache->idx_lock);

	return res;
}
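
/* The cache_idx picked above is stored in smap->cache_idx by the
 * owner-specific map_alloc and handed back through
 * bpf_local_storage_cache_idx_free() from the matching map_free.  The
 * pairing below is a sketch of the assumed call sites, not something
 * enforced here:
 *
 *	smap->cache_idx = bpf_local_storage_cache_idx_get(&cache);
 *	...
 *	bpf_local_storage_cache_idx_free(&cache, smap->cache_idx);
 */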
void bpf_local_storage_cache_idx_free(struct bpf_local_storage_cache *cache,
				      u16 idx)
{
	spin_lock(&cache->idx_lock);
	cache->idx_usage_counts[idx]--;
	spin_unlock(&cache->idx_lock);
}

void bpf_local_storage_map_free(struct bpf_local_storage_map *smap)
{
	struct bpf_local_storage_elem *selem;
	struct bpf_local_storage_map_bucket *b;
	unsigned int i;

	/* Note that this map might be concurrently cloned from
	 * bpf_sk_storage_clone. Wait for any existing bpf_sk_storage_clone
	 * RCU read section to finish before proceeding. New RCU
	 * read sections should be prevented via bpf_map_inc_not_zero.
	 */
	synchronize_rcu();

	/* bpf prog and the userspace can no longer access this map
	 * now.  No new selem (of this map) can be added
	 * to the owner->storage or to the map bucket's list.
	 *
	 * The elem of this map can be cleaned up here
	 * or when the storage is freed e.g.
	 * by bpf_sk_storage_free() during __sk_destruct().
	 */
	for (i = 0; i < (1U << smap->bucket_log); i++) {
		b = &smap->buckets[i];

		rcu_read_lock();
		/* No one is adding to b->list now */
		while ((selem = hlist_entry_safe(
				rcu_dereference_raw(hlist_first_rcu(&b->list)),
				struct bpf_local_storage_elem, map_node))) {
			bpf_selem_unlink(selem);
			cond_resched_rcu();
		}
		rcu_read_unlock();
	}

	/* While freeing the storage we may still need to access the map.
	 *
	 * e.g. when bpf_sk_storage_free() has unlinked selem from the map
	 * which then made the above while((selem = ...)) loop
	 * exit immediately.
	 *
	 * However, while freeing the storage one still needs to access the
	 * smap->elem_size to do the uncharging in
	 * bpf_selem_unlink_storage_nolock().
	 *
	 * Hence, wait another rcu grace period for the storage to be freed.
	 */
	synchronize_rcu();

	kvfree(smap->buckets);
	kfree(smap);
}

int bpf_local_storage_map_alloc_check(union bpf_attr *attr)
{
	if (attr->map_flags & ~BPF_LOCAL_STORAGE_CREATE_FLAG_MASK ||
	    !(attr->map_flags & BPF_F_NO_PREALLOC) ||
	    attr->max_entries ||
	    attr->key_size != sizeof(int) || !attr->value_size ||
	    /* Enforce BTF for userspace sk dumping */
	    !attr->btf_key_type_id || !attr->btf_value_type_id)
		return -EINVAL;

	if (!bpf_capable())
		return -EPERM;

	if (attr->value_size > BPF_LOCAL_STORAGE_MAX_VALUE_SIZE)
		return -E2BIG;

	return 0;
}

struct bpf_local_storage_map *bpf_local_storage_map_alloc(union bpf_attr *attr)
{
	struct bpf_local_storage_map *smap;
	unsigned int i;
	u32 nbuckets;

	smap = kzalloc(sizeof(*smap), GFP_USER | __GFP_NOWARN | __GFP_ACCOUNT);
	if (!smap)
		return ERR_PTR(-ENOMEM);
	bpf_map_init_from_attr(&smap->map, attr);

	nbuckets = roundup_pow_of_two(num_possible_cpus());
	/* Use at least 2 buckets, select_bucket() is undefined behavior with 1 bucket */
	nbuckets = max_t(u32, 2, nbuckets);
	smap->bucket_log = ilog2(nbuckets);

	smap->buckets = kvcalloc(sizeof(*smap->buckets), nbuckets,
				 GFP_USER | __GFP_NOWARN | __GFP_ACCOUNT);
	if (!smap->buckets) {
		kfree(smap);
		return ERR_PTR(-ENOMEM);
	}

	for (i = 0; i < nbuckets; i++) {
		INIT_HLIST_HEAD(&smap->buckets[i].list);
		raw_spin_lock_init(&smap->buckets[i].lock);
	}

	smap->elem_size =
		sizeof(struct bpf_local_storage_elem) + attr->value_size;

	return smap;
}
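
/* A front end's map_alloc is assumed to wrap the helper above roughly as
 * follows (a sketch; the cache variable is illustrative):
 *
 *	smap = bpf_local_storage_map_alloc(attr);
 *	if (IS_ERR(smap))
 *		return ERR_CAST(smap);
 *	smap->cache_idx = bpf_local_storage_cache_idx_get(&cache);
 *	return &smap->map;
 */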

int bpf_local_storage_map_check_btf(const struct bpf_map *map,
				    const struct btf *btf,
				    const struct btf_type *key_type,
				    const struct btf_type *value_type)
{
	u32 int_data;

	if (BTF_INFO_KIND(key_type->info) != BTF_KIND_INT)
		return -EINVAL;

	int_data = *(u32 *)(key_type + 1);
	if (BTF_INT_BITS(int_data) != 32 || BTF_INT_OFFSET(int_data))
		return -EINVAL;

	return 0;
}