/* SPDX-License-Identifier: GPL-2.0
 *
 * Author: Jesper Dangaard Brouer <netoptimizer@brouer.com>
 * Copyright (C) 2016 Red Hat, Inc.
 */

#include <linux/error-injection.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/device.h>

#include <net/netdev_rx_queue.h>
#include <net/page_pool/helpers.h>

#include <linux/dma-direction.h>
#include <linux/dma-mapping.h>
#include <linux/page-flags.h>
#include <linux/mm.h> /* for put_page() */
#include <linux/poison.h>
#include <linux/ethtool.h>
#include <linux/netdevice.h>

#include <trace/events/page_pool.h>

#include "mp_dmabuf_devmem.h"
#include "netmem_priv.h"
#include "page_pool_priv.h"
DEFINE_STATIC_KEY_FALSE(page_pool_mem_providers);

#define DEFER_TIME (msecs_to_jiffies(1000))
#define DEFER_WARN_INTERVAL (60 * HZ)

#define BIAS_MAX        (LONG_MAX >> 1)

#ifdef CONFIG_PAGE_POOL_STATS
static DEFINE_PER_CPU(struct page_pool_recycle_stats, pp_system_recycle_stats);
/* alloc_stat_inc is intended to be used in softirq context */
#define alloc_stat_inc(pool, __stat)    (pool->alloc_stats.__stat++)
/* recycle_stat_inc is safe to use when preemption is possible. */
#define recycle_stat_inc(pool, __stat)                                  \
        do {                                                            \
                struct page_pool_recycle_stats __percpu *s = pool->recycle_stats; \
                this_cpu_inc(s->__stat);                                \
        } while (0)

#define recycle_stat_add(pool, __stat, val)                             \
        do {                                                            \
                struct page_pool_recycle_stats __percpu *s = pool->recycle_stats; \
                this_cpu_add(s->__stat, val);                           \
        } while (0)

static const char pp_stats[][ETH_GSTRING_LEN] = {
        "rx_pp_alloc_fast",
        "rx_pp_alloc_slow",
        "rx_pp_alloc_slow_ho",
        "rx_pp_alloc_empty",
        "rx_pp_alloc_refill",
        "rx_pp_alloc_waive",
        "rx_pp_recycle_cached",
        "rx_pp_recycle_cache_full",
        "rx_pp_recycle_ring",
        "rx_pp_recycle_ring_full",
        "rx_pp_recycle_released_ref",
};
/**
 * page_pool_get_stats() - fetch page pool stats
 * @pool: pool from which page was allocated
 * @stats: struct page_pool_stats to fill in
 *
 * Retrieve statistics about the page_pool. This API is only available
 * if the kernel has been configured with ``CONFIG_PAGE_POOL_STATS=y``.
 * The caller passes in a pointer to a caller-allocated struct
 * page_pool_stats, which is filled in. The caller can then report
 * those stats to the user (perhaps via ethtool, debugfs, etc.).
 */
bool page_pool_get_stats(const struct page_pool *pool,
                         struct page_pool_stats *stats)
{
        int cpu = 0;

        if (!stats)
                return false;

        /* The caller is responsible for initializing stats. */
        stats->alloc_stats.fast += pool->alloc_stats.fast;
        stats->alloc_stats.slow += pool->alloc_stats.slow;
        stats->alloc_stats.slow_high_order += pool->alloc_stats.slow_high_order;
        stats->alloc_stats.empty += pool->alloc_stats.empty;
        stats->alloc_stats.refill += pool->alloc_stats.refill;
        stats->alloc_stats.waive += pool->alloc_stats.waive;

        for_each_possible_cpu(cpu) {
                const struct page_pool_recycle_stats *pcpu =
                        per_cpu_ptr(pool->recycle_stats, cpu);

                stats->recycle_stats.cached += pcpu->cached;
                stats->recycle_stats.cache_full += pcpu->cache_full;
                stats->recycle_stats.ring += pcpu->ring;
                stats->recycle_stats.ring_full += pcpu->ring_full;
                stats->recycle_stats.released_refcnt += pcpu->released_refcnt;
        }

        return true;
}
EXPORT_SYMBOL(page_pool_get_stats);
u8 *page_pool_ethtool_stats_get_strings(u8 *data)
{
        int i;

        for (i = 0; i < ARRAY_SIZE(pp_stats); i++) {
                memcpy(data, pp_stats[i], ETH_GSTRING_LEN);
                data += ETH_GSTRING_LEN;
        }

        return data;
}
EXPORT_SYMBOL(page_pool_ethtool_stats_get_strings);
int page_pool_ethtool_stats_get_count(void)
{
        return ARRAY_SIZE(pp_stats);
}
EXPORT_SYMBOL(page_pool_ethtool_stats_get_count);
u64 *page_pool_ethtool_stats_get(u64 *data, const void *stats)
{
        const struct page_pool_stats *pool_stats = stats;

        *data++ = pool_stats->alloc_stats.fast;
        *data++ = pool_stats->alloc_stats.slow;
        *data++ = pool_stats->alloc_stats.slow_high_order;
        *data++ = pool_stats->alloc_stats.empty;
        *data++ = pool_stats->alloc_stats.refill;
        *data++ = pool_stats->alloc_stats.waive;
        *data++ = pool_stats->recycle_stats.cached;
        *data++ = pool_stats->recycle_stats.cache_full;
        *data++ = pool_stats->recycle_stats.ring;
        *data++ = pool_stats->recycle_stats.ring_full;
        *data++ = pool_stats->recycle_stats.released_refcnt;

        return data;
}
EXPORT_SYMBOL(page_pool_ethtool_stats_get);
#else
#define alloc_stat_inc(pool, __stat)
#define recycle_stat_inc(pool, __stat)
#define recycle_stat_add(pool, __stat, val)
#endif
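
/*
 * Usage sketch (hypothetical driver code, not part of this file): wiring the
 * stats helpers above into a driver's ethtool callbacks. "priv->pool" and the
 * callback names are illustrative assumptions only.
 *
 *	static void drv_get_strings(struct net_device *dev, u32 sset, u8 *data)
 *	{
 *		if (sset == ETH_SS_STATS)
 *			data = page_pool_ethtool_stats_get_strings(data);
 *	}
 *
 *	static void drv_get_ethtool_stats(struct net_device *dev,
 *					  struct ethtool_stats *es, u64 *data)
 *	{
 *		struct drv_priv *priv = netdev_priv(dev);
 *		struct page_pool_stats stats = {};
 *
 *		if (page_pool_get_stats(priv->pool, &stats))
 *			data = page_pool_ethtool_stats_get(data, &stats);
 *	}
 */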
static bool page_pool_producer_lock(struct page_pool *pool)
        __acquires(&pool->ring.producer_lock)
{
        bool in_softirq = in_softirq();

        if (in_softirq)
                spin_lock(&pool->ring.producer_lock);
        else
                spin_lock_bh(&pool->ring.producer_lock);

        return in_softirq;
}

static void page_pool_producer_unlock(struct page_pool *pool,
                                      bool in_softirq)
        __releases(&pool->ring.producer_lock)
{
        if (in_softirq)
                spin_unlock(&pool->ring.producer_lock);
        else
                spin_unlock_bh(&pool->ring.producer_lock);
}
static void page_pool_struct_check(void)
{
        CACHELINE_ASSERT_GROUP_MEMBER(struct page_pool, frag, frag_users);
        CACHELINE_ASSERT_GROUP_MEMBER(struct page_pool, frag, frag_page);
        CACHELINE_ASSERT_GROUP_MEMBER(struct page_pool, frag, frag_offset);
        CACHELINE_ASSERT_GROUP_SIZE(struct page_pool, frag,
                                    PAGE_POOL_FRAG_GROUP_ALIGN);
}
static int page_pool_init(struct page_pool *pool,
                          const struct page_pool_params *params,
                          int cpuid)
{
        unsigned int ring_qsize = 1024; /* Default */
        struct netdev_rx_queue *rxq;
        int err;

        page_pool_struct_check();

        memcpy(&pool->p, &params->fast, sizeof(pool->p));
        memcpy(&pool->slow, &params->slow, sizeof(pool->slow));

        pool->cpuid = cpuid;

        /* Validate only known flags were used */
        if (pool->slow.flags & ~PP_FLAG_ALL)
                return -EINVAL;

        if (pool->p.pool_size)
                ring_qsize = pool->p.pool_size;

        /* Sanity limit mem that can be pinned down */
        if (ring_qsize > 32768)
                return -E2BIG;

        /* DMA direction is either DMA_FROM_DEVICE or DMA_BIDIRECTIONAL.
         * DMA_BIDIRECTIONAL is for allowing page used for DMA sending,
         * which is the XDP_TX use-case.
         */
        if (pool->slow.flags & PP_FLAG_DMA_MAP) {
                if ((pool->p.dma_dir != DMA_FROM_DEVICE) &&
                    (pool->p.dma_dir != DMA_BIDIRECTIONAL))
                        return -EINVAL;

                pool->dma_map = true;
        }

        if (pool->slow.flags & PP_FLAG_DMA_SYNC_DEV) {
                /* In order to request DMA-sync-for-device the page
                 * needs to be mapped
                 */
                if (!(pool->slow.flags & PP_FLAG_DMA_MAP))
                        return -EINVAL;

                if (!pool->p.max_len)
                        return -EINVAL;

                pool->dma_sync = true;

                /* pool->p.offset has to be set according to the address
                 * offset used by the DMA engine to start copying rx data
                 */
        }

        pool->has_init_callback = !!pool->slow.init_callback;

#ifdef CONFIG_PAGE_POOL_STATS
        if (!(pool->slow.flags & PP_FLAG_SYSTEM_POOL)) {
                pool->recycle_stats = alloc_percpu(struct page_pool_recycle_stats);
                if (!pool->recycle_stats)
                        return -ENOMEM;
        } else {
                /* For system page pool instance we use a singular stats object
                 * instead of allocating a separate percpu variable for each
                 * (also percpu) page pool instance.
                 */
                pool->recycle_stats = &pp_system_recycle_stats;
                pool->system = true;
        }
#endif

        if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0) {
#ifdef CONFIG_PAGE_POOL_STATS
                if (!pool->system)
                        free_percpu(pool->recycle_stats);
#endif
                return -ENOMEM;
        }

        atomic_set(&pool->pages_state_release_cnt, 0);

        /* Driver calling page_pool_create() also call page_pool_destroy() */
        refcount_set(&pool->user_cnt, 1);

        if (pool->dma_map)
                get_device(pool->p.dev);

        if (pool->slow.flags & PP_FLAG_ALLOW_UNREADABLE_NETMEM) {
                /* We rely on rtnl_lock()ing to make sure netdev_rx_queue
                 * configuration doesn't change while we're initializing
                 * the page_pool.
                 */
                rxq = __netif_get_rx_queue(pool->slow.netdev,
                                           pool->slow.queue_idx);
                pool->mp_priv = rxq->mp_params.mp_priv;
        }

        if (pool->mp_priv) {
                err = mp_dmabuf_devmem_init(pool);
                if (err) {
                        pr_warn("%s() mem-provider init failed %d\n", __func__,
                                err);
                        goto free_ptr_ring;
                }

                static_branch_inc(&page_pool_mem_providers);
        }

        return 0;

free_ptr_ring:
        ptr_ring_cleanup(&pool->ring, NULL);
#ifdef CONFIG_PAGE_POOL_STATS
        if (!pool->system)
                free_percpu(pool->recycle_stats);
#endif
        return err;
}
static void page_pool_uninit(struct page_pool *pool)
{
        ptr_ring_cleanup(&pool->ring, NULL);

        if (pool->dma_map)
                put_device(pool->p.dev);

#ifdef CONFIG_PAGE_POOL_STATS
        if (!pool->system)
                free_percpu(pool->recycle_stats);
#endif
}
/**
 * page_pool_create_percpu() - create a page pool for a given cpu.
 * @params: parameters, see struct page_pool_params
 * @cpuid: cpu identifier
 */
struct page_pool *
page_pool_create_percpu(const struct page_pool_params *params, int cpuid)
{
        struct page_pool *pool;
        int err;

        pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, params->nid);
        if (!pool)
                return ERR_PTR(-ENOMEM);

        err = page_pool_init(pool, params, cpuid);
        if (err < 0)
                goto err_free;

        err = page_pool_list(pool);
        if (err)
                goto err_uninit;

        return pool;

err_uninit:
        page_pool_uninit(pool);
err_free:
        pr_warn("%s() gave up with errno %d\n", __func__, err);
        kfree(pool);
        return ERR_PTR(err);
}
EXPORT_SYMBOL(page_pool_create_percpu);
/**
 * page_pool_create() - create a page pool
 * @params: parameters, see struct page_pool_params
 */
struct page_pool *page_pool_create(const struct page_pool_params *params)
{
        return page_pool_create_percpu(params, -1);
}
EXPORT_SYMBOL(page_pool_create);
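
/*
 * Creation sketch (hypothetical driver RX setup, not part of this file):
 * build the params on the stack and check the result with IS_ERR(). All
 * field values below are illustrative assumptions.
 *
 *	struct page_pool_params pp_params = {
 *		.flags		= PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV,
 *		.order		= 0,
 *		.pool_size	= 1024,
 *		.nid		= NUMA_NO_NODE,
 *		.dev		= priv->dev,
 *		.napi		= &priv->napi,
 *		.dma_dir	= DMA_FROM_DEVICE,
 *		.max_len	= PAGE_SIZE,
 *		.offset		= 0,
 *	};
 *	struct page_pool *pool = page_pool_create(&pp_params);
 *
 *	if (IS_ERR(pool))
 *		return PTR_ERR(pool);
 */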
static void page_pool_return_page(struct page_pool *pool, netmem_ref netmem);
static noinline netmem_ref page_pool_refill_alloc_cache(struct page_pool *pool)
{
        struct ptr_ring *r = &pool->ring;
        netmem_ref netmem;
        int pref_nid; /* preferred NUMA node */

        /* Quicker fallback, avoid locks when ring is empty */
        if (__ptr_ring_empty(r)) {
                alloc_stat_inc(pool, empty);
                return 0;
        }

        /* Softirq guarantee CPU and thus NUMA node is stable. This,
         * assumes CPU refilling driver RX-ring will also run RX-NAPI.
         */
#ifdef CONFIG_NUMA
        pref_nid = (pool->p.nid == NUMA_NO_NODE) ? numa_mem_id() : pool->p.nid;
#else
        /* Ignore pool->p.nid setting if !CONFIG_NUMA, helps compiler */
        pref_nid = numa_mem_id(); /* will be zero like page_to_nid() */
#endif

        /* Refill alloc array, but only if NUMA match */
        do {
                netmem = (__force netmem_ref)__ptr_ring_consume(r);
                if (unlikely(!netmem))
                        break;

                if (likely(netmem_is_pref_nid(netmem, pref_nid))) {
                        pool->alloc.cache[pool->alloc.count++] = netmem;
                } else {
                        /* NUMA mismatch;
                         * (1) release 1 page to page-allocator and
                         * (2) break out to fallthrough to alloc_pages_node.
                         * This limits stress on the page buddy allocator.
                         */
                        page_pool_return_page(pool, netmem);
                        alloc_stat_inc(pool, waive);
                        netmem = 0;
                        break;
                }
        } while (pool->alloc.count < PP_ALLOC_CACHE_REFILL);

        /* Return last page */
        if (likely(pool->alloc.count > 0)) {
                netmem = pool->alloc.cache[--pool->alloc.count];
                alloc_stat_inc(pool, refill);
        }

        return netmem;
}
static netmem_ref __page_pool_get_cached(struct page_pool *pool)
{
        netmem_ref netmem;

        /* Caller MUST guarantee safe non-concurrent access, e.g. softirq */
        if (likely(pool->alloc.count)) {
                /* Fast-path */
                netmem = pool->alloc.cache[--pool->alloc.count];
                alloc_stat_inc(pool, fast);
        } else {
                netmem = page_pool_refill_alloc_cache(pool);
        }

        return netmem;
}
static void __page_pool_dma_sync_for_device(const struct page_pool *pool,
                                            netmem_ref netmem,
                                            u32 dma_sync_size)
{
#if defined(CONFIG_HAS_DMA) && defined(CONFIG_DMA_NEED_SYNC)
        dma_addr_t dma_addr = page_pool_get_dma_addr_netmem(netmem);

        dma_sync_size = min(dma_sync_size, pool->p.max_len);
        __dma_sync_single_for_device(pool->p.dev, dma_addr + pool->p.offset,
                                     dma_sync_size, pool->p.dma_dir);
#endif
}

static __always_inline void
page_pool_dma_sync_for_device(const struct page_pool *pool,
                              netmem_ref netmem,
                              u32 dma_sync_size)
{
        if (pool->dma_sync && dma_dev_need_sync(pool->p.dev))
                __page_pool_dma_sync_for_device(pool, netmem, dma_sync_size);
}
static bool page_pool_dma_map(struct page_pool *pool, netmem_ref netmem)
{
        dma_addr_t dma;

        /* Setup DMA mapping: use 'struct page' area for storing DMA-addr
         * since dma_addr_t can be either 32 or 64 bits and does not always fit
         * into page private data (i.e 32bit cpu with 64bit DMA caps)
         * This mapping is kept for lifetime of page, until leaving pool.
         */
        dma = dma_map_page_attrs(pool->p.dev, netmem_to_page(netmem), 0,
                                 (PAGE_SIZE << pool->p.order), pool->p.dma_dir,
                                 DMA_ATTR_SKIP_CPU_SYNC |
                                         DMA_ATTR_WEAK_ORDERING);
        if (dma_mapping_error(pool->p.dev, dma))
                return false;

        if (page_pool_set_dma_addr_netmem(netmem, dma))
                goto unmap_failed;

        page_pool_dma_sync_for_device(pool, netmem, pool->p.max_len);

        return true;

unmap_failed:
        WARN_ONCE(1, "unexpected DMA address, please report to netdev@");
        dma_unmap_page_attrs(pool->p.dev, dma,
                             PAGE_SIZE << pool->p.order, pool->p.dma_dir,
                             DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING);
        return false;
}
static struct page *__page_pool_alloc_page_order(struct page_pool *pool,
                                                 gfp_t gfp)
{
        struct page *page;

        gfp |= __GFP_COMP;
        page = alloc_pages_node(pool->p.nid, gfp, pool->p.order);
        if (unlikely(!page))
                return NULL;

        if (pool->dma_map && unlikely(!page_pool_dma_map(pool, page_to_netmem(page)))) {
                put_page(page);
                return NULL;
        }

        alloc_stat_inc(pool, slow_high_order);
        page_pool_set_pp_info(pool, page_to_netmem(page));

        /* Track how many pages are held 'in-flight' */
        pool->pages_state_hold_cnt++;
        trace_page_pool_state_hold(pool, page_to_netmem(page),
                                   pool->pages_state_hold_cnt);
        return page;
}
static noinline netmem_ref __page_pool_alloc_pages_slow(struct page_pool *pool,
                                                         gfp_t gfp)
{
        const int bulk = PP_ALLOC_CACHE_REFILL;
        unsigned int pp_order = pool->p.order;
        bool dma_map = pool->dma_map;
        netmem_ref netmem;
        int i, nr_pages;

        /* Don't support bulk alloc for high-order pages */
        if (unlikely(pp_order))
                return page_to_netmem(__page_pool_alloc_page_order(pool, gfp));

        /* Unnecessary as alloc cache is empty, but guarantees zero count */
        if (unlikely(pool->alloc.count > 0))
                return pool->alloc.cache[--pool->alloc.count];

        /* Mark empty alloc.cache slots "empty" for alloc_pages_bulk_array */
        memset(&pool->alloc.cache, 0, sizeof(void *) * bulk);

        nr_pages = alloc_pages_bulk_array_node(gfp, pool->p.nid, bulk,
                                               (struct page **)pool->alloc.cache);
        if (unlikely(!nr_pages))
                return 0;

        /* Pages have been filled into alloc.cache array, but count is zero and
         * page element have not been (possibly) DMA mapped.
         */
        for (i = 0; i < nr_pages; i++) {
                netmem = pool->alloc.cache[i];
                if (dma_map && unlikely(!page_pool_dma_map(pool, netmem))) {
                        put_page(netmem_to_page(netmem));
                        continue;
                }

                page_pool_set_pp_info(pool, netmem);
                pool->alloc.cache[pool->alloc.count++] = netmem;
                /* Track how many pages are held 'in-flight' */
                pool->pages_state_hold_cnt++;
                trace_page_pool_state_hold(pool, netmem,
                                           pool->pages_state_hold_cnt);
        }

        /* Return last page */
        if (likely(pool->alloc.count > 0)) {
                netmem = pool->alloc.cache[--pool->alloc.count];
                alloc_stat_inc(pool, slow);
        } else {
                netmem = 0;
        }

        /* When the page has just been allocated it should/must have refcnt 1. */
        return netmem;
}
/* For using page_pool to replace alloc_pages() API calls, but provide
 * a synchronization guarantee for the allocation side.
 */
netmem_ref page_pool_alloc_netmem(struct page_pool *pool, gfp_t gfp)
{
        netmem_ref netmem;

        /* Fast-path: Get a page from cache */
        netmem = __page_pool_get_cached(pool);
        if (netmem)
                return netmem;

        /* Slow-path: cache empty, do real allocation */
        if (static_branch_unlikely(&page_pool_mem_providers) && pool->mp_priv)
                netmem = mp_dmabuf_devmem_alloc_netmems(pool, gfp);
        else
                netmem = __page_pool_alloc_pages_slow(pool, gfp);
        return netmem;
}
EXPORT_SYMBOL(page_pool_alloc_netmem);

struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp)
{
        return netmem_to_page(page_pool_alloc_netmem(pool, gfp));
}
EXPORT_SYMBOL(page_pool_alloc_pages);
ALLOW_ERROR_INJECTION(page_pool_alloc_pages, NULL);
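
/*
 * Allocation sketch (hypothetical driver RX-refill code, not part of this
 * file): pages are typically pulled in NAPI context with GFP_ATOMIC and the
 * pre-mapped DMA address is read back via a helper. "priv" and "rx_desc"
 * are illustrative assumptions.
 *
 *	struct page *page = page_pool_alloc_pages(priv->pool, GFP_ATOMIC);
 *
 *	if (unlikely(!page))
 *		return -ENOMEM;
 *	rx_desc->addr = page_pool_get_dma_addr(page) + priv->rx_headroom;
 */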
/* Calculate distance between two u32 values, valid if distance is below 2^(31)
 * https://en.wikipedia.org/wiki/Serial_number_arithmetic#General_Solution
 */
#define _distance(a, b) (s32)((a) - (b))
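
/*
 * Illustration of the signed cast (not from this file): with hold_cnt having
 * wrapped around to 2 and release_cnt still at U32_MAX - 1,
 * _distance(2, U32_MAX - 1) evaluates to 4, i.e. four pages in flight,
 * whereas the plain unsigned difference would be huge.
 */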
s32 page_pool_inflight(const struct page_pool *pool, bool strict)
{
        u32 release_cnt = atomic_read(&pool->pages_state_release_cnt);
        u32 hold_cnt = READ_ONCE(pool->pages_state_hold_cnt);
        s32 inflight;

        inflight = _distance(hold_cnt, release_cnt);

        if (strict) {
                trace_page_pool_release(pool, inflight, hold_cnt, release_cnt);
                WARN(inflight < 0, "Negative(%d) inflight packet-pages",
                     inflight);
        } else {
                inflight = max(0, inflight);
        }

        return inflight;
}
void page_pool_set_pp_info(struct page_pool *pool, netmem_ref netmem)
{
        netmem_set_pp(netmem, pool);
        netmem_or_pp_magic(netmem, PP_SIGNATURE);

        /* Ensuring all pages have been split into one fragment initially:
         * page_pool_set_pp_info() is only called once for every page when it
         * is allocated from the page allocator and page_pool_fragment_page()
         * is dirtying the same cache line as the page->pp_magic above, so
         * the overhead is negligible.
         */
        page_pool_fragment_netmem(netmem, 1);
        if (pool->has_init_callback)
                pool->slow.init_callback(netmem, pool->slow.init_arg);
}

void page_pool_clear_pp_info(netmem_ref netmem)
{
        netmem_clear_pp_magic(netmem);
        netmem_set_pp(netmem, NULL);
}
static __always_inline void __page_pool_release_page_dma(struct page_pool *pool,
                                                          netmem_ref netmem)
{
        dma_addr_t dma;

        if (!pool->dma_map)
                /* Always account for inflight pages, even if we didn't
                 * map them
                 */
                return;

        dma = page_pool_get_dma_addr_netmem(netmem);

        /* When page is unmapped, it cannot be returned to our pool */
        dma_unmap_page_attrs(pool->p.dev, dma,
                             PAGE_SIZE << pool->p.order, pool->p.dma_dir,
                             DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING);
        page_pool_set_dma_addr_netmem(netmem, 0);
}
/* Disconnects a page (from a page_pool). API users can have a need
 * to disconnect a page (from a page_pool), to allow it to be used as
 * a regular page (that will eventually be returned to the normal
 * page-allocator via put_page).
 */
void page_pool_return_page(struct page_pool *pool, netmem_ref netmem)
{
        int count;
        bool put;

        put = true;
        if (static_branch_unlikely(&page_pool_mem_providers) && pool->mp_priv)
                put = mp_dmabuf_devmem_release_page(pool, netmem);
        else
                __page_pool_release_page_dma(pool, netmem);

        /* This may be the last page returned, releasing the pool, so
         * it is not safe to reference pool afterwards.
         */
        count = atomic_inc_return_relaxed(&pool->pages_state_release_cnt);
        trace_page_pool_state_release(pool, netmem, count);

        if (put) {
                page_pool_clear_pp_info(netmem);
                put_page(netmem_to_page(netmem));
        }
        /* An optimization would be to call __free_pages(page, pool->p.order)
         * knowing page is not part of page-cache (thus avoiding a
         * __page_cache_release() call).
         */
}
static bool page_pool_recycle_in_ring(struct page_pool *pool, netmem_ref netmem)
{
        int ret;

        /* BH protection not needed if current is softirq */
        if (in_softirq())
                ret = ptr_ring_produce(&pool->ring, (__force void *)netmem);
        else
                ret = ptr_ring_produce_bh(&pool->ring, (__force void *)netmem);

        if (!ret) {
                recycle_stat_inc(pool, ring);
                return true;
        }

        return false;
}
/* Only allow direct recycling in special circumstances, into the
 * alloc side cache. E.g. during RX-NAPI processing for XDP_DROP use-case.
 *
 * Caller must provide appropriate safe context.
 */
static bool page_pool_recycle_in_cache(netmem_ref netmem,
                                       struct page_pool *pool)
{
        if (unlikely(pool->alloc.count == PP_ALLOC_CACHE_SIZE)) {
                recycle_stat_inc(pool, cache_full);
                return false;
        }

        /* Caller MUST have verified/know (page_ref_count(page) == 1) */
        pool->alloc.cache[pool->alloc.count++] = netmem;
        recycle_stat_inc(pool, cached);
        return true;
}
static bool __page_pool_page_can_be_recycled(netmem_ref netmem)
{
        return netmem_is_net_iov(netmem) ||
               (page_ref_count(netmem_to_page(netmem)) == 1 &&
                !page_is_pfmemalloc(netmem_to_page(netmem)));
}
/* If the page refcnt == 1, this will try to recycle the page.
 * If pool->dma_sync is set, we'll try to sync the DMA area for
 * the configured size min(dma_sync_size, pool->max_len).
 * If the page refcnt != 1, then the page will be returned to memory
 * subsystem.
 */
static __always_inline netmem_ref
__page_pool_put_page(struct page_pool *pool, netmem_ref netmem,
                     unsigned int dma_sync_size, bool allow_direct)
{
        lockdep_assert_no_hardirq();

        /* This allocator is optimized for the XDP mode that uses
         * one-frame-per-page, but has fallbacks that act like the
         * regular page allocator APIs.
         *
         * refcnt == 1 means page_pool owns page, and can recycle it.
         *
         * A page is NOT reusable when it was allocated while the system
         * was under some pressure. (page_is_pfmemalloc)
         */
        if (likely(__page_pool_page_can_be_recycled(netmem))) {
                /* Read barrier done in page_ref_count / READ_ONCE */

                page_pool_dma_sync_for_device(pool, netmem, dma_sync_size);

                if (allow_direct && page_pool_recycle_in_cache(netmem, pool))
                        return 0;

                /* Page found as candidate for recycling */
                return netmem;
        }

        /* Fallback/non-XDP mode: the API user has an elevated refcnt.
         *
         * Many drivers split up the page into fragments, and some
         * want to keep doing this to save memory and do refcnt based
         * recycling. Support this use case too, to ease drivers
         * switching between XDP/non-XDP.
         *
         * In case page_pool maintains the DMA mapping, the API user must
         * call page_pool_put_page() once. In this elevated refcnt
         * case, the DMA is unmapped/released, as the driver is likely
         * doing refcnt based recycle tricks, meaning another process
         * will be invoking put_page.
         */
        recycle_stat_inc(pool, released_refcnt);
        page_pool_return_page(pool, netmem);

        return 0;
}
static bool page_pool_napi_local(const struct page_pool *pool)
{
        const struct napi_struct *napi;
        u32 cpuid;

        if (unlikely(!in_softirq()))
                return false;

        /* Allow direct recycle if we have reasons to believe that we are
         * in the same context as the consumer would run, so there's
         * no possible race.
         * __page_pool_put_page() makes sure we're not in hardirq context
         * and interrupts are enabled prior to accessing the cache.
         */
        cpuid = smp_processor_id();
        if (READ_ONCE(pool->cpuid) == cpuid)
                return true;

        napi = READ_ONCE(pool->p.napi);

        return napi && READ_ONCE(napi->list_owner) == cpuid;
}
void page_pool_put_unrefed_netmem(struct page_pool *pool, netmem_ref netmem,
                                  unsigned int dma_sync_size, bool allow_direct)
{
        if (!allow_direct)
                allow_direct = page_pool_napi_local(pool);

        netmem =
                __page_pool_put_page(pool, netmem, dma_sync_size, allow_direct);
        if (netmem && !page_pool_recycle_in_ring(pool, netmem)) {
                /* Cache full, fallback to free pages */
                recycle_stat_inc(pool, ring_full);
                page_pool_return_page(pool, netmem);
        }
}
EXPORT_SYMBOL(page_pool_put_unrefed_netmem);
void page_pool_put_unrefed_page(struct page_pool *pool, struct page *page,
                                unsigned int dma_sync_size, bool allow_direct)
{
        page_pool_put_unrefed_netmem(pool, page_to_netmem(page), dma_sync_size,
                                     allow_direct);
}
EXPORT_SYMBOL(page_pool_put_unrefed_page);
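
/*
 * Recycling sketch (hypothetical driver code, not part of this file): on an
 * XDP_DROP path the page can go straight back into the lockless alloc cache
 * since we are in NAPI/softirq context. "pool" and "page" are illustrative.
 *
 *	// Full page, no partial DMA sync needed, direct recycle allowed:
 *	page_pool_put_full_page(pool, page, true);
 */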
/**
 * page_pool_put_page_bulk() - release references on multiple pages
 * @pool: pool from which pages were allocated
 * @data: array holding page pointers
 * @count: number of pages in @data
 *
 * Tries to refill a number of pages into the ptr_ring cache holding the
 * ptr_ring producer lock. If the ptr_ring is full, page_pool_put_page_bulk()
 * will release leftover pages to the page allocator.
 * page_pool_put_page_bulk() is suitable to be run inside the driver NAPI tx
 * completion loop for the XDP_REDIRECT use case.
 *
 * Please note the caller must not use the data area after running
 * page_pool_put_page_bulk(), as this function overwrites it.
 */
void page_pool_put_page_bulk(struct page_pool *pool, void **data,
                             int count)
{
        int i, bulk_len = 0;
        bool allow_direct;
        bool in_softirq;

        allow_direct = page_pool_napi_local(pool);

        for (i = 0; i < count; i++) {
                netmem_ref netmem = page_to_netmem(virt_to_head_page(data[i]));

                /* It is not the last user for the page frag case */
                if (!page_pool_is_last_ref(netmem))
                        continue;

                netmem = __page_pool_put_page(pool, netmem, -1, allow_direct);
                /* Approved for bulk recycling in ptr_ring cache */
                if (netmem)
                        data[bulk_len++] = (__force void *)netmem;
        }

        if (!bulk_len)
                return;

        /* Bulk producer into ptr_ring page_pool cache */
        in_softirq = page_pool_producer_lock(pool);
        for (i = 0; i < bulk_len; i++) {
                if (__ptr_ring_produce(&pool->ring, data[i])) {
                        /* ring full */
                        recycle_stat_inc(pool, ring_full);
                        break;
                }
        }
        recycle_stat_add(pool, ring, i);
        page_pool_producer_unlock(pool, in_softirq);

        /* Hopefully all pages were returned into the ptr_ring */
        if (likely(i == bulk_len))
                return;

        /* ptr_ring cache full, free remaining pages outside producer lock
         * since put_page() with refcnt == 1 can be an expensive operation
         */
        for (; i < bulk_len; i++)
                page_pool_return_page(pool, (__force netmem_ref)data[i]);
}
EXPORT_SYMBOL(page_pool_put_page_bulk);
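
/*
 * Bulk-release sketch (hypothetical XDP TX-completion code, not part of this
 * file): collect page virtual addresses into a small array and release them
 * in one call. "next_completed_page()" and the array size are illustrative
 * assumptions.
 *
 *	void *data[16];
 *	int n = 0;
 *	struct page *page;
 *
 *	while (n < ARRAY_SIZE(data) && (page = next_completed_page(priv)))
 *		data[n++] = page_address(page);
 *	page_pool_put_page_bulk(priv->pool, data, n);
 */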
static netmem_ref page_pool_drain_frag(struct page_pool *pool,
                                       netmem_ref netmem)
{
        long drain_count = BIAS_MAX - pool->frag_users;

        /* Some user is still using the page frag */
        if (likely(page_pool_unref_netmem(netmem, drain_count)))
                return 0;

        if (__page_pool_page_can_be_recycled(netmem)) {
                page_pool_dma_sync_for_device(pool, netmem, -1);
                return netmem;
        }

        page_pool_return_page(pool, netmem);
        return 0;
}
static void page_pool_free_frag(struct page_pool *pool)
{
        long drain_count = BIAS_MAX - pool->frag_users;
        netmem_ref netmem = pool->frag_page;

        pool->frag_page = 0;

        if (!netmem || page_pool_unref_netmem(netmem, drain_count))
                return;

        page_pool_return_page(pool, netmem);
}
netmem_ref page_pool_alloc_frag_netmem(struct page_pool *pool,
                                       unsigned int *offset, unsigned int size,
                                       gfp_t gfp)
{
        unsigned int max_size = PAGE_SIZE << pool->p.order;
        netmem_ref netmem = pool->frag_page;

        if (WARN_ON(size > max_size))
                return 0;

        size = ALIGN(size, dma_get_cache_alignment());
        *offset = pool->frag_offset;

        if (netmem && *offset + size > max_size) {
                netmem = page_pool_drain_frag(pool, netmem);
                if (netmem) {
                        alloc_stat_inc(pool, fast);
                        goto frag_reset;
                }
        }

        if (!netmem) {
                netmem = page_pool_alloc_netmem(pool, gfp);
                if (unlikely(!netmem)) {
                        pool->frag_page = 0;
                        return 0;
                }

                pool->frag_page = netmem;

frag_reset:
                pool->frag_users = 1;
                *offset = 0;
                pool->frag_offset = size;
                page_pool_fragment_netmem(netmem, BIAS_MAX);
                return netmem;
        }

        pool->frag_users++;
        pool->frag_offset = *offset + size;
        alloc_stat_inc(pool, fast);
        return netmem;
}
EXPORT_SYMBOL(page_pool_alloc_frag_netmem);
struct page *page_pool_alloc_frag(struct page_pool *pool, unsigned int *offset,
                                  unsigned int size, gfp_t gfp)
{
        return netmem_to_page(page_pool_alloc_frag_netmem(pool, offset, size,
                                                          gfp));
}
EXPORT_SYMBOL(page_pool_alloc_frag);
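
/*
 * Frag-allocation sketch (hypothetical driver code, not part of this file):
 * carving small RX buffers out of a shared page. The 2048-byte size is an
 * illustrative assumption.
 *
 *	unsigned int offset;
 *	struct page *page;
 *
 *	page = page_pool_alloc_frag(priv->pool, &offset, 2048, GFP_ATOMIC);
 *	if (unlikely(!page))
 *		return -ENOMEM;
 *	rx_desc->addr = page_pool_get_dma_addr(page) + offset;
 */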
static void page_pool_empty_ring(struct page_pool *pool)
{
        netmem_ref netmem;

        /* Empty recycle ring */
        while ((netmem = (__force netmem_ref)ptr_ring_consume_bh(&pool->ring))) {
                /* Verify the refcnt invariant of cached pages */
                if (!(netmem_ref_count(netmem) == 1))
                        pr_crit("%s() page_pool refcnt %d violation\n",
                                __func__, netmem_ref_count(netmem));

                page_pool_return_page(pool, netmem);
        }
}
static void __page_pool_destroy(struct page_pool *pool)
{
        if (pool->disconnect)
                pool->disconnect(pool);

        page_pool_unlist(pool);
        page_pool_uninit(pool);

        if (pool->mp_priv) {
                mp_dmabuf_devmem_destroy(pool);
                static_branch_dec(&page_pool_mem_providers);
        }

        kfree(pool);
}
static void page_pool_empty_alloc_cache_once(struct page_pool *pool)
{
        netmem_ref netmem;

        if (pool->destroy_cnt)
                return;

        /* Empty alloc cache, assume caller made sure this is
         * no-longer in use, and page_pool_alloc_pages() cannot be
         * called concurrently.
         */
        while (pool->alloc.count) {
                netmem = pool->alloc.cache[--pool->alloc.count];
                page_pool_return_page(pool, netmem);
        }
}
static void page_pool_scrub(struct page_pool *pool)
{
        page_pool_empty_alloc_cache_once(pool);
        pool->destroy_cnt++;

        /* No more consumers should exist, but producers could still
         * be in-flight.
         */
        page_pool_empty_ring(pool);
}
static int page_pool_release(struct page_pool *pool)
{
        int inflight;

        page_pool_scrub(pool);
        inflight = page_pool_inflight(pool, true);
        if (!inflight)
                __page_pool_destroy(pool);

        return inflight;
}
static void page_pool_release_retry(struct work_struct *wq)
{
        struct delayed_work *dwq = to_delayed_work(wq);
        struct page_pool *pool = container_of(dwq, typeof(*pool), release_dw);
        void *netdev;
        int inflight;

        inflight = page_pool_release(pool);
        if (inflight <= 0)
                return;

        /* Periodic warning for page pools the user can't see */
        netdev = READ_ONCE(pool->slow.netdev);
        if (time_after_eq(jiffies, pool->defer_warn) &&
            (!netdev || netdev == NET_PTR_POISON)) {
                int sec = (s32)((u32)jiffies - (u32)pool->defer_start) / HZ;

                pr_warn("%s() stalled pool shutdown: id %u, %d inflight %d sec\n",
                        __func__, pool->user.id, inflight, sec);
                pool->defer_warn = jiffies + DEFER_WARN_INTERVAL;
        }

        /* Still not ready to be disconnected, retry later */
        schedule_delayed_work(&pool->release_dw, DEFER_TIME);
}
void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *),
                           const struct xdp_mem_info *mem)
{
        refcount_inc(&pool->user_cnt);
        pool->disconnect = disconnect;
        pool->xdp_mem_id = mem->id;
}
void page_pool_disable_direct_recycling(struct page_pool *pool)
{
        /* Disable direct recycling based on pool->cpuid.
         * Paired with READ_ONCE() in page_pool_napi_local().
         */
        WRITE_ONCE(pool->cpuid, -1);

        if (!pool->p.napi)
                return;

        /* To avoid races with recycling and additional barriers make sure
         * pool and NAPI are unlinked when NAPI is disabled.
         */
        WARN_ON(!test_bit(NAPI_STATE_SCHED, &pool->p.napi->state));
        WARN_ON(READ_ONCE(pool->p.napi->list_owner) != -1);

        WRITE_ONCE(pool->p.napi, NULL);
}
EXPORT_SYMBOL(page_pool_disable_direct_recycling);
void page_pool_destroy(struct page_pool *pool)
{
        if (!pool)
                return;

        if (!page_pool_put(pool))
                return;

        page_pool_disable_direct_recycling(pool);
        page_pool_free_frag(pool);

        if (!page_pool_release(pool))
                return;

        page_pool_detached(pool);
        pool->defer_start = jiffies;
        pool->defer_warn  = jiffies + DEFER_WARN_INTERVAL;

        INIT_DELAYED_WORK(&pool->release_dw, page_pool_release_retry);
        schedule_delayed_work(&pool->release_dw, DEFER_TIME);
}
EXPORT_SYMBOL(page_pool_destroy);
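
/*
 * Teardown sketch (hypothetical driver close path, not part of this file):
 * the driver simply drops its reference; if pages are still inflight, the
 * delayed work above keeps retrying (and warning) until they all come back.
 *
 *	page_pool_destroy(priv->pool);
 *	priv->pool = NULL;
 */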
/* Caller must provide appropriate safe context, e.g. NAPI. */
void page_pool_update_nid(struct page_pool *pool, int new_nid)
{
        netmem_ref netmem;

        trace_page_pool_update_nid(pool, new_nid);
        pool->p.nid = new_nid;

        /* Flush pool alloc cache, as refill will check NUMA node */
        while (pool->alloc.count) {
                netmem = pool->alloc.cache[--pool->alloc.count];
                page_pool_return_page(pool, netmem);
        }
}
EXPORT_SYMBOL(page_pool_update_nid);