net/core/bpf_sk_storage.c
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2019 Facebook */
#include <linux/rculist.h>
#include <linux/list.h>
#include <linux/hash.h>
#include <linux/types.h>
#include <linux/spinlock.h>
#include <linux/bpf.h>
#include <net/bpf_sk_storage.h>
#include <net/sock.h>
#include <uapi/linux/btf.h>

static atomic_t cache_idx;

struct bucket {
	struct hlist_head list;
	raw_spinlock_t lock;
};

/* The map is not the primary owner of a bpf_sk_storage_elem.
 * Instead, the sk->sk_bpf_storage is.
 *
 * The map (bpf_sk_storage_map) is for two purposes:
 * 1. Define the size of the "sk local storage".  It is
 *    the map's value_size.
 *
 * 2. Maintain a list to keep track of all elems such
 *    that they can be cleaned up during the map destruction.
 *
 * When a bpf local storage is being looked up for a
 * particular sk, the "bpf_map" pointer is actually used
 * as the "key" to search in the list of elem in
 * sk->sk_bpf_storage.
 *
 * Hence, consider sk->sk_bpf_storage as a mini-map
 * with the "bpf_map" pointer as the searching key.
 */
struct bpf_sk_storage_map {
	struct bpf_map map;
	/* Lookup elem does not require accessing the map.
	 *
	 * Updating/Deleting requires a bucket lock to
	 * link/unlink the elem from the map.  Having
	 * multiple buckets reduces lock contention.
	 */
	struct bucket *buckets;
	u32 bucket_log;
	u16 elem_size;
	u16 cache_idx;
};

struct bpf_sk_storage_data {
	/* smap is used as the searching key when looking up
	 * from sk->sk_bpf_storage.
	 *
	 * Put it in the same cacheline as the data to minimize
	 * the number of cacheline accesses during the cache hit case.
	 */
	struct bpf_sk_storage_map __rcu *smap;
	u8 data[0] __aligned(8);
};

/* Linked to bpf_sk_storage and bpf_sk_storage_map */
struct bpf_sk_storage_elem {
	struct hlist_node map_node;	/* Linked to bpf_sk_storage_map */
	struct hlist_node snode;	/* Linked to bpf_sk_storage */
	struct bpf_sk_storage __rcu *sk_storage;
	struct rcu_head rcu;
	/* 8 bytes hole */
	/* The data is stored in another cacheline to minimize
	 * the number of cacheline accesses during a cache hit.
	 */
	struct bpf_sk_storage_data sdata ____cacheline_aligned;
};

#define SELEM(_SDATA) container_of((_SDATA), struct bpf_sk_storage_elem, sdata)
#define SDATA(_SELEM) (&(_SELEM)->sdata)
#define BPF_SK_STORAGE_CACHE_SIZE	16

struct bpf_sk_storage {
	struct bpf_sk_storage_data __rcu *cache[BPF_SK_STORAGE_CACHE_SIZE];
	struct hlist_head list;	/* List of bpf_sk_storage_elem */
	struct sock *sk;	/* The sk that owns the above "list" of
				 * bpf_sk_storage_elem.
				 */
	struct rcu_head rcu;
	raw_spinlock_t lock;	/* Protect adding/removing from the "list" */
};

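/* Map a selem to one of the map's buckets by hashing the selem pointer. */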
static struct bucket *select_bucket(struct bpf_sk_storage_map *smap,
				    struct bpf_sk_storage_elem *selem)
{
	return &smap->buckets[hash_ptr(selem, smap->bucket_log)];
}

static int omem_charge(struct sock *sk, unsigned int size)
{
	/* same check as in sock_kmalloc() */
	if (size <= sysctl_optmem_max &&
	    atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
		atomic_add(size, &sk->sk_omem_alloc);
		return 0;
	}

	return -ENOMEM;
}

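/* A selem is "linked" to its sk (through snode) and to its map's bucket
 * (through map_node).  hlist_unhashed() tells whether each link is still
 * in place.
 */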
static bool selem_linked_to_sk(const struct bpf_sk_storage_elem *selem)
{
	return !hlist_unhashed(&selem->snode);
}

static bool selem_linked_to_map(const struct bpf_sk_storage_elem *selem)
{
	return !hlist_unhashed(&selem->map_node);
}

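/* Allocate a selem for @smap and optionally charge its size against
 * sk->sk_omem_alloc.  The map value, if provided, is copied into the
 * new selem.  Returns NULL on charge or allocation failure.
 */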
static struct bpf_sk_storage_elem *selem_alloc(struct bpf_sk_storage_map *smap,
					       struct sock *sk, void *value,
					       bool charge_omem)
{
	struct bpf_sk_storage_elem *selem;

	if (charge_omem && omem_charge(sk, smap->elem_size))
		return NULL;

	selem = kzalloc(smap->elem_size, GFP_ATOMIC | __GFP_NOWARN);
	if (selem) {
		if (value)
			memcpy(SDATA(selem)->data, value, smap->map.value_size);
		return selem;
	}

	if (charge_omem)
		atomic_sub(smap->elem_size, &sk->sk_omem_alloc);

	return NULL;
}

/* sk_storage->lock must be held and selem->sk_storage == sk_storage.
 * The caller must ensure selem->smap is still valid to be
 * dereferenced for its smap->elem_size and smap->cache_idx.
 */
static bool __selem_unlink_sk(struct bpf_sk_storage *sk_storage,
			      struct bpf_sk_storage_elem *selem,
			      bool uncharge_omem)
{
	struct bpf_sk_storage_map *smap;
	bool free_sk_storage;
	struct sock *sk;

	smap = rcu_dereference(SDATA(selem)->smap);
	sk = sk_storage->sk;

	/* All uncharging on sk->sk_omem_alloc must be done first.
	 * sk may be freed once the last selem is unlinked from sk_storage.
	 */
	if (uncharge_omem)
		atomic_sub(smap->elem_size, &sk->sk_omem_alloc);

	free_sk_storage = hlist_is_singular_node(&selem->snode,
						 &sk_storage->list);
	if (free_sk_storage) {
		atomic_sub(sizeof(struct bpf_sk_storage), &sk->sk_omem_alloc);
		sk_storage->sk = NULL;
		/* After this RCU_INIT, sk may be freed and cannot be used */
		RCU_INIT_POINTER(sk->sk_bpf_storage, NULL);

		/* sk_storage is not freed now.  sk_storage->lock is
		 * still held and raw_spin_unlock_bh(&sk_storage->lock)
		 * will be done by the caller.
		 *
		 * Although the unlock will be done under
		 * rcu_read_lock(), it is more intuitive to
		 * read if kfree_rcu(sk_storage, rcu) is done
		 * after the raw_spin_unlock_bh(&sk_storage->lock).
		 *
		 * Hence, a "bool free_sk_storage" is returned
		 * to the caller which then calls the kfree_rcu()
		 * after unlock.
		 */
	}
	hlist_del_init_rcu(&selem->snode);
	if (rcu_access_pointer(sk_storage->cache[smap->cache_idx]) ==
	    SDATA(selem))
		RCU_INIT_POINTER(sk_storage->cache[smap->cache_idx], NULL);

	kfree_rcu(selem, rcu);

	return free_sk_storage;
}

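/* Unlink selem from its sk_storage under sk_storage->lock.  If it was the
 * last selem, the now-empty sk_storage is freed after an RCU grace period.
 */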
static void selem_unlink_sk(struct bpf_sk_storage_elem *selem)
{
	struct bpf_sk_storage *sk_storage;
	bool free_sk_storage = false;

	if (unlikely(!selem_linked_to_sk(selem)))
		/* selem has already been unlinked from sk */
		return;

	sk_storage = rcu_dereference(selem->sk_storage);
	raw_spin_lock_bh(&sk_storage->lock);
	if (likely(selem_linked_to_sk(selem)))
		free_sk_storage = __selem_unlink_sk(sk_storage, selem, true);
	raw_spin_unlock_bh(&sk_storage->lock);

	if (free_sk_storage)
		kfree_rcu(sk_storage, rcu);
}

/* sk_storage->lock must be held and sk_storage->list cannot be empty */
static void __selem_link_sk(struct bpf_sk_storage *sk_storage,
			    struct bpf_sk_storage_elem *selem)
{
	RCU_INIT_POINTER(selem->sk_storage, sk_storage);
	hlist_add_head(&selem->snode, &sk_storage->list);
}

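/* Unlink selem from its map's bucket list under the bucket lock. */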
static void selem_unlink_map(struct bpf_sk_storage_elem *selem)
{
	struct bpf_sk_storage_map *smap;
	struct bucket *b;

	if (unlikely(!selem_linked_to_map(selem)))
		/* selem has already been unlinked from smap */
		return;

	smap = rcu_dereference(SDATA(selem)->smap);
	b = select_bucket(smap, selem);
	raw_spin_lock_bh(&b->lock);
	if (likely(selem_linked_to_map(selem)))
		hlist_del_init_rcu(&selem->map_node);
	raw_spin_unlock_bh(&b->lock);
}

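/* Record the owning smap in selem and link it into the map's bucket list. */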
static void selem_link_map(struct bpf_sk_storage_map *smap,
			   struct bpf_sk_storage_elem *selem)
{
	struct bucket *b = select_bucket(smap, selem);

	raw_spin_lock_bh(&b->lock);
	RCU_INIT_POINTER(SDATA(selem)->smap, smap);
	hlist_add_head_rcu(&selem->map_node, &b->list);
	raw_spin_unlock_bh(&b->lock);
}

static void selem_unlink(struct bpf_sk_storage_elem *selem)
{
	/* Always unlink from map before unlinking from sk_storage
	 * because selem will be freed after it has been successfully
	 * unlinked from the sk_storage.
	 */
	selem_unlink_map(selem);
	selem_unlink_sk(selem);
}

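/* Look up the sdata belonging to @smap in @sk_storage.  Try the per-map
 * cache slot first; on a miss, walk sk_storage->list and, if cacheit_lockit
 * is set, re-populate the cache slot under sk_storage->lock.
 */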
static struct bpf_sk_storage_data *
__sk_storage_lookup(struct bpf_sk_storage *sk_storage,
		    struct bpf_sk_storage_map *smap,
		    bool cacheit_lockit)
{
	struct bpf_sk_storage_data *sdata;
	struct bpf_sk_storage_elem *selem;

	/* Fast path (cache hit) */
	sdata = rcu_dereference(sk_storage->cache[smap->cache_idx]);
	if (sdata && rcu_access_pointer(sdata->smap) == smap)
		return sdata;

	/* Slow path (cache miss) */
	hlist_for_each_entry_rcu(selem, &sk_storage->list, snode)
		if (rcu_access_pointer(SDATA(selem)->smap) == smap)
			break;

	if (!selem)
		return NULL;

	sdata = SDATA(selem);
	if (cacheit_lockit) {
		/* spinlock is needed to avoid racing with the
		 * parallel delete.  Otherwise, publishing an already
		 * deleted sdata to the cache will become a use-after-free
		 * problem in the next __sk_storage_lookup().
		 */
		raw_spin_lock_bh(&sk_storage->lock);
		if (selem_linked_to_sk(selem))
			rcu_assign_pointer(sk_storage->cache[smap->cache_idx],
					   sdata);
		raw_spin_unlock_bh(&sk_storage->lock);
	}

	return sdata;
}

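/* Look up the sdata of @map in sk->sk_bpf_storage, if the sk has any. */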
static struct bpf_sk_storage_data *
sk_storage_lookup(struct sock *sk, struct bpf_map *map, bool cacheit_lockit)
{
	struct bpf_sk_storage *sk_storage;
	struct bpf_sk_storage_map *smap;

	sk_storage = rcu_dereference(sk->sk_bpf_storage);
	if (!sk_storage)
		return NULL;

	smap = (struct bpf_sk_storage_map *)map;
	return __sk_storage_lookup(sk_storage, smap, cacheit_lockit);
}

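/* Sanity-check BPF_NOEXIST/BPF_EXIST in map_flags against whether an
 * old element already exists.
 */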
static int check_flags(const struct bpf_sk_storage_data *old_sdata,
		       u64 map_flags)
{
	if (old_sdata && (map_flags & ~BPF_F_LOCK) == BPF_NOEXIST)
		/* elem already exists */
		return -EEXIST;

	if (!old_sdata && (map_flags & ~BPF_F_LOCK) == BPF_EXIST)
		/* elem doesn't exist, cannot update it */
		return -ENOENT;

	return 0;
}

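/* Allocate sk->sk_bpf_storage for the very first selem of this sk and
 * publish it atomically, since sk->sk_lock cannot be acquired here.
 */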
static int sk_storage_alloc(struct sock *sk,
			    struct bpf_sk_storage_map *smap,
			    struct bpf_sk_storage_elem *first_selem)
{
	struct bpf_sk_storage *prev_sk_storage, *sk_storage;
	int err;

	err = omem_charge(sk, sizeof(*sk_storage));
	if (err)
		return err;

	sk_storage = kzalloc(sizeof(*sk_storage), GFP_ATOMIC | __GFP_NOWARN);
	if (!sk_storage) {
		err = -ENOMEM;
		goto uncharge;
	}
	INIT_HLIST_HEAD(&sk_storage->list);
	raw_spin_lock_init(&sk_storage->lock);
	sk_storage->sk = sk;

	__selem_link_sk(sk_storage, first_selem);
	selem_link_map(smap, first_selem);
	/* Publish sk_storage to sk.  sk->sk_lock cannot be acquired.
	 * Hence, an atomic op is used to set sk->sk_bpf_storage
	 * from NULL to the newly allocated sk_storage ptr.
	 *
	 * From now on, the sk->sk_bpf_storage pointer is protected
	 * by the sk_storage->lock.  Hence, when freeing
	 * the sk->sk_bpf_storage, the sk_storage->lock must
	 * be held before setting sk->sk_bpf_storage to NULL.
	 */
	prev_sk_storage = cmpxchg((struct bpf_sk_storage **)&sk->sk_bpf_storage,
				  NULL, sk_storage);
	if (unlikely(prev_sk_storage)) {
		selem_unlink_map(first_selem);
		err = -EAGAIN;
		goto uncharge;

		/* Note that even though first_selem was linked to smap's
		 * bucket->list, first_selem can be freed immediately
		 * (instead of kfree_rcu) because
		 * bpf_sk_storage_map_free() does a
		 * synchronize_rcu() before walking the bucket->list.
		 * Hence, no one is accessing selem from the
		 * bucket->list under rcu_read_lock().
		 */
	}

	return 0;

uncharge:
	kfree(sk_storage);
	atomic_sub(sizeof(*sk_storage), &sk->sk_omem_alloc);
	return err;
}

/* sk cannot be going away because we are linking a new elem
 * to its sk->sk_bpf_storage (i.e. sk->sk_refcnt cannot be 0).
 * Otherwise, the new elem would become a leak (and cause other
 * memory issues during map destruction).
 */
static struct bpf_sk_storage_data *sk_storage_update(struct sock *sk,
						     struct bpf_map *map,
						     void *value,
						     u64 map_flags)
{
	struct bpf_sk_storage_data *old_sdata = NULL;
	struct bpf_sk_storage_elem *selem;
	struct bpf_sk_storage *sk_storage;
	struct bpf_sk_storage_map *smap;
	int err;

	/* BPF_EXIST and BPF_NOEXIST cannot be both set */
	if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST) ||
	    /* BPF_F_LOCK can only be used in a value with spin_lock */
	    unlikely((map_flags & BPF_F_LOCK) && !map_value_has_spin_lock(map)))
		return ERR_PTR(-EINVAL);

	smap = (struct bpf_sk_storage_map *)map;
	sk_storage = rcu_dereference(sk->sk_bpf_storage);
	if (!sk_storage || hlist_empty(&sk_storage->list)) {
		/* Very first elem for this sk */
		err = check_flags(NULL, map_flags);
		if (err)
			return ERR_PTR(err);

		selem = selem_alloc(smap, sk, value, true);
		if (!selem)
			return ERR_PTR(-ENOMEM);

		err = sk_storage_alloc(sk, smap, selem);
		if (err) {
			kfree(selem);
			atomic_sub(smap->elem_size, &sk->sk_omem_alloc);
			return ERR_PTR(err);
		}

		return SDATA(selem);
	}

	if ((map_flags & BPF_F_LOCK) && !(map_flags & BPF_NOEXIST)) {
		/* Hoping to find an old_sdata to do an inline update
		 * such that it can avoid taking the sk_storage->lock
		 * and changing the lists.
		 */
		old_sdata = __sk_storage_lookup(sk_storage, smap, false);
		err = check_flags(old_sdata, map_flags);
		if (err)
			return ERR_PTR(err);
		if (old_sdata && selem_linked_to_sk(SELEM(old_sdata))) {
			copy_map_value_locked(map, old_sdata->data,
					      value, false);
			return old_sdata;
		}
	}

	raw_spin_lock_bh(&sk_storage->lock);

	/* Recheck sk_storage->list under sk_storage->lock */
	if (unlikely(hlist_empty(&sk_storage->list))) {
		/* A parallel del is happening and sk_storage is going
		 * away.  It has just been checked before, so very
		 * unlikely.  Return instead of retry to keep things
		 * simple.
		 */
		err = -EAGAIN;
		goto unlock_err;
	}

	old_sdata = __sk_storage_lookup(sk_storage, smap, false);
	err = check_flags(old_sdata, map_flags);
	if (err)
		goto unlock_err;

	if (old_sdata && (map_flags & BPF_F_LOCK)) {
		copy_map_value_locked(map, old_sdata->data, value, false);
		selem = SELEM(old_sdata);
		goto unlock;
	}

	/* sk_storage->lock is held.  Hence, we are sure
	 * we can unlink and uncharge the old_sdata successfully
	 * later.  Hence, instead of charging the new selem now
	 * and then uncharging the old selem later (which may cause
	 * a potential but unnecessary charge failure), avoid taking
	 * a charge at all here (the "!old_sdata" check) and the
	 * old_sdata will not be uncharged later during __selem_unlink_sk().
	 */
	selem = selem_alloc(smap, sk, value, !old_sdata);
	if (!selem) {
		err = -ENOMEM;
		goto unlock_err;
	}

	/* First, link the new selem to the map */
	selem_link_map(smap, selem);

	/* Second, link (and publish) the new selem to sk_storage */
	__selem_link_sk(sk_storage, selem);

	/* Third, remove old selem, SELEM(old_sdata) */
	if (old_sdata) {
		selem_unlink_map(SELEM(old_sdata));
		__selem_unlink_sk(sk_storage, SELEM(old_sdata), false);
	}

unlock:
	raw_spin_unlock_bh(&sk_storage->lock);
	return SDATA(selem);

unlock_err:
	raw_spin_unlock_bh(&sk_storage->lock);
	return ERR_PTR(err);
}

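/* Delete the sdata of @map from sk->sk_bpf_storage, if it exists. */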
static int sk_storage_delete(struct sock *sk, struct bpf_map *map)
{
	struct bpf_sk_storage_data *sdata;

	sdata = sk_storage_lookup(sk, map, false);
	if (!sdata)
		return -ENOENT;

	selem_unlink(SELEM(sdata));

	return 0;
}

/* Called by __sk_destruct() */
void bpf_sk_storage_free(struct sock *sk)
{
	struct bpf_sk_storage_elem *selem;
	struct bpf_sk_storage *sk_storage;
	bool free_sk_storage = false;
	struct hlist_node *n;

	rcu_read_lock();
	sk_storage = rcu_dereference(sk->sk_bpf_storage);
	if (!sk_storage) {
		rcu_read_unlock();
		return;
	}

	/* Neither the bpf_prog nor the bpf-map's syscall
	 * could be modifying the sk_storage->list now.
	 * Thus, no elem can be added-to or deleted-from the
	 * sk_storage->list by the bpf_prog or by the bpf-map's syscall.
	 *
	 * It races only with bpf_sk_storage_map_free() when
	 * unlinking an elem from the sk_storage->list and
	 * the map's bucket->list.
	 */
	raw_spin_lock_bh(&sk_storage->lock);
	hlist_for_each_entry_safe(selem, n, &sk_storage->list, snode) {
		/* Always unlink from map before unlinking from
		 * sk_storage.
		 */
		selem_unlink_map(selem);
		free_sk_storage = __selem_unlink_sk(sk_storage, selem, true);
	}
	raw_spin_unlock_bh(&sk_storage->lock);
	rcu_read_unlock();

	if (free_sk_storage)
		kfree_rcu(sk_storage, rcu);
}

static void bpf_sk_storage_map_free(struct bpf_map *map)
{
	struct bpf_sk_storage_elem *selem;
	struct bpf_sk_storage_map *smap;
	struct bucket *b;
	unsigned int i;

	smap = (struct bpf_sk_storage_map *)map;

	synchronize_rcu();

	/* bpf prog and the userspace can no longer access this map
	 * now.  No new selem (of this map) can be added
	 * to the sk->sk_bpf_storage or to the map bucket's list.
	 *
	 * The elem of this map can be cleaned up here
	 * or
	 * by bpf_sk_storage_free() during __sk_destruct().
	 */
	for (i = 0; i < (1U << smap->bucket_log); i++) {
		b = &smap->buckets[i];

		rcu_read_lock();
		/* No one is adding to b->list now */
		while ((selem = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(&b->list)),
						 struct bpf_sk_storage_elem,
						 map_node))) {
			selem_unlink(selem);
			cond_resched_rcu();
		}
		rcu_read_unlock();
	}

	/* bpf_sk_storage_free() may still need to access the map.
	 * e.g. bpf_sk_storage_free() has unlinked selem from the map
	 * which then made the above while((selem = ...)) loop
	 * exit immediately.
	 *
	 * However, the bpf_sk_storage_free() still needs to access
	 * the smap->elem_size to do the uncharging in
	 * __selem_unlink_sk().
	 *
	 * Hence, wait another rcu grace period for the
	 * bpf_sk_storage_free() to finish.
	 */
	synchronize_rcu();

	kvfree(smap->buckets);
	kfree(map);
}

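/* Map attributes: no preallocation, max_entries unused, a 4-byte key,
 * BTF-described key/value types, and a value_size small enough for
 * kmalloc() and the u16 elem_size.
 */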
static int bpf_sk_storage_map_alloc_check(union bpf_attr *attr)
{
	if (attr->map_flags != BPF_F_NO_PREALLOC || attr->max_entries ||
	    attr->key_size != sizeof(int) || !attr->value_size ||
	    /* Enforce BTF for userspace sk dumping */
	    !attr->btf_key_type_id || !attr->btf_value_type_id)
		return -EINVAL;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	if (attr->value_size >= KMALLOC_MAX_SIZE -
	    MAX_BPF_STACK - sizeof(struct bpf_sk_storage_elem) ||
	    /* U16_MAX is much more than enough for sk local storage
	     * considering a tcp_sock is ~2k.
	     */
	    attr->value_size > U16_MAX - sizeof(struct bpf_sk_storage_elem))
		return -E2BIG;

	return 0;
}

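/* Create the map: size the bucket table by the number of possible CPUs
 * and assign this map a slot in the per-sk lookup cache.
 */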
static struct bpf_map *bpf_sk_storage_map_alloc(union bpf_attr *attr)
{
	struct bpf_sk_storage_map *smap;
	unsigned int i;
	u32 nbuckets;
	u64 cost;

	smap = kzalloc(sizeof(*smap), GFP_USER | __GFP_NOWARN);
	if (!smap)
		return ERR_PTR(-ENOMEM);
	bpf_map_init_from_attr(&smap->map, attr);

	/* Use at least 2 buckets, select_bucket() is undefined behavior with 1 bucket */
	smap->bucket_log = max_t(u32, 1, ilog2(roundup_pow_of_two(num_possible_cpus())));
	nbuckets = 1U << smap->bucket_log;
	smap->buckets = kvcalloc(sizeof(*smap->buckets), nbuckets,
				 GFP_USER | __GFP_NOWARN);
	if (!smap->buckets) {
		kfree(smap);
		return ERR_PTR(-ENOMEM);
	}
	cost = sizeof(*smap->buckets) * nbuckets + sizeof(*smap);

	for (i = 0; i < nbuckets; i++) {
		INIT_HLIST_HEAD(&smap->buckets[i].list);
		raw_spin_lock_init(&smap->buckets[i].lock);
	}

	smap->elem_size = sizeof(struct bpf_sk_storage_elem) + attr->value_size;
	smap->cache_idx = (unsigned int)atomic_inc_return(&cache_idx) %
		BPF_SK_STORAGE_CACHE_SIZE;
	smap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;

	return &smap->map;
}

static int notsupp_get_next_key(struct bpf_map *map, void *key,
				void *next_key)
{
	return -ENOTSUPP;
}

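/* The BTF key type must be a plain 32-bit integer (the socket fd). */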
static int bpf_sk_storage_map_check_btf(const struct bpf_map *map,
					const struct btf *btf,
					const struct btf_type *key_type,
					const struct btf_type *value_type)
{
	u32 int_data;

	if (BTF_INFO_KIND(key_type->info) != BTF_KIND_INT)
		return -EINVAL;

	int_data = *(u32 *)(key_type + 1);
	if (BTF_INT_BITS(int_data) != 32 || BTF_INT_OFFSET(int_data))
		return -EINVAL;

	return 0;
}

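/* Syscall (bpf_map_lookup/update/delete_elem) path: the key is a socket fd,
 * which is resolved to a sock with sockfd_lookup() before calling the
 * common sk_storage helpers.
 */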
static void *bpf_fd_sk_storage_lookup_elem(struct bpf_map *map, void *key)
{
	struct bpf_sk_storage_data *sdata;
	struct socket *sock;
	int fd, err;

	fd = *(int *)key;
	sock = sockfd_lookup(fd, &err);
	if (sock) {
		sdata = sk_storage_lookup(sock->sk, map, true);
		sockfd_put(sock);
		return sdata ? sdata->data : NULL;
	}

	return ERR_PTR(err);
}

static int bpf_fd_sk_storage_update_elem(struct bpf_map *map, void *key,
					 void *value, u64 map_flags)
{
	struct bpf_sk_storage_data *sdata;
	struct socket *sock;
	int fd, err;

	fd = *(int *)key;
	sock = sockfd_lookup(fd, &err);
	if (sock) {
		sdata = sk_storage_update(sock->sk, map, value, map_flags);
		sockfd_put(sock);
		return PTR_ERR_OR_ZERO(sdata);
	}

	return err;
}

static int bpf_fd_sk_storage_delete_elem(struct bpf_map *map, void *key)
{
	struct socket *sock;
	int fd, err;

	fd = *(int *)key;
	sock = sockfd_lookup(fd, &err);
	if (sock) {
		err = sk_storage_delete(sock->sk, map);
		sockfd_put(sock);
		return err;
	}

	return err;
}

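/* BPF helper: return a pointer to the sdata of @map for @sk, optionally
 * creating the storage when BPF_SK_STORAGE_GET_F_CREATE is passed.
 */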
BPF_CALL_4(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk,
	   void *, value, u64, flags)
{
	struct bpf_sk_storage_data *sdata;

	if (flags > BPF_SK_STORAGE_GET_F_CREATE)
		return (unsigned long)NULL;

	sdata = sk_storage_lookup(sk, map, true);
	if (sdata)
		return (unsigned long)sdata->data;

	if (flags == BPF_SK_STORAGE_GET_F_CREATE &&
	    /* Cannot add new elem to a going away sk.
	     * Otherwise, the new elem may become a leak
	     * (and also other memory issues during map
	     * destruction).
	     */
	    refcount_inc_not_zero(&sk->sk_refcnt)) {
		sdata = sk_storage_update(sk, map, value, BPF_NOEXIST);
		/* sk must be a fullsock (guaranteed by verifier),
		 * so sock_gen_put() is unnecessary.
		 */
		sock_put(sk);
		return IS_ERR(sdata) ?
			(unsigned long)NULL : (unsigned long)sdata->data;
	}

	return (unsigned long)NULL;
}

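/* BPF helper: delete the sdata of @map from @sk. */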
BPF_CALL_2(bpf_sk_storage_delete, struct bpf_map *, map, struct sock *, sk)
{
	if (refcount_inc_not_zero(&sk->sk_refcnt)) {
		int err;

		err = sk_storage_delete(sk, map);
		sock_put(sk);
		return err;
	}

	return -ENOENT;
}

const struct bpf_map_ops sk_storage_map_ops = {
	.map_alloc_check = bpf_sk_storage_map_alloc_check,
	.map_alloc = bpf_sk_storage_map_alloc,
	.map_free = bpf_sk_storage_map_free,
	.map_get_next_key = notsupp_get_next_key,
	.map_lookup_elem = bpf_fd_sk_storage_lookup_elem,
	.map_update_elem = bpf_fd_sk_storage_update_elem,
	.map_delete_elem = bpf_fd_sk_storage_delete_elem,
	.map_check_btf = bpf_sk_storage_map_check_btf,
};

const struct bpf_func_proto bpf_sk_storage_get_proto = {
	.func = bpf_sk_storage_get,
	.gpl_only = false,
	.ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
	.arg1_type = ARG_CONST_MAP_PTR,
	.arg2_type = ARG_PTR_TO_SOCKET,
	.arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL,
	.arg4_type = ARG_ANYTHING,
};

const struct bpf_func_proto bpf_sk_storage_delete_proto = {
	.func = bpf_sk_storage_delete,
	.gpl_only = false,
	.ret_type = RET_INTEGER,
	.arg1_type = ARG_CONST_MAP_PTR,
	.arg2_type = ARG_PTR_TO_SOCKET,
};