/* SPDX-License-Identifier: GPL-2.0
 *
 * page_pool.c
 *	Author:	Jesper Dangaard Brouer <netoptimizer@brouer.com>
 *	Copyright (C) 2016 Red Hat, Inc.
 */

#include <linux/error-injection.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/device.h>

#include <net/netdev_rx_queue.h>
#include <net/page_pool/helpers.h>
#include <net/xdp.h>

#include <linux/dma-direction.h>
#include <linux/dma-mapping.h>
#include <linux/page-flags.h>
#include <linux/mm.h> /* for put_page() */
#include <linux/poison.h>
#include <linux/ethtool.h>
#include <linux/netdevice.h>

#include <trace/events/page_pool.h>

#include "mp_dmabuf_devmem.h"
#include "netmem_priv.h"
#include "page_pool_priv.h"

DEFINE_STATIC_KEY_FALSE(page_pool_mem_providers);

#define DEFER_TIME (msecs_to_jiffies(1000))
#define DEFER_WARN_INTERVAL (60 * HZ)

#define BIAS_MAX	(LONG_MAX >> 1)

#ifdef CONFIG_PAGE_POOL_STATS
static DEFINE_PER_CPU(struct page_pool_recycle_stats, pp_system_recycle_stats);

/* alloc_stat_inc is intended to be used in softirq context */
#define alloc_stat_inc(pool, __stat)	(pool->alloc_stats.__stat++)
/* recycle_stat_inc is safe to use when preemption is possible. */
#define recycle_stat_inc(pool, __stat)						\
	do {									\
		struct page_pool_recycle_stats __percpu *s = pool->recycle_stats; \
		this_cpu_inc(s->__stat);					\
	} while (0)

#define recycle_stat_add(pool, __stat, val)					\
	do {									\
		struct page_pool_recycle_stats __percpu *s = pool->recycle_stats; \
		this_cpu_add(s->__stat, val);					\
	} while (0)

static const char pp_stats[][ETH_GSTRING_LEN] = {
	"rx_pp_alloc_fast",
	"rx_pp_alloc_slow",
	"rx_pp_alloc_slow_ho",
	"rx_pp_alloc_empty",
	"rx_pp_alloc_refill",
	"rx_pp_alloc_waive",
	"rx_pp_recycle_cached",
	"rx_pp_recycle_cache_full",
	"rx_pp_recycle_ring",
	"rx_pp_recycle_ring_full",
	"rx_pp_recycle_released_ref",
};

/**
 * page_pool_get_stats() - fetch page pool stats
 * @pool:	pool from which page was allocated
 * @stats:	struct page_pool_stats to fill in
 *
 * Retrieve statistics about the page_pool. This API is only available
 * if the kernel has been configured with ``CONFIG_PAGE_POOL_STATS=y``.
 * A pointer to a caller allocated struct page_pool_stats structure
 * is passed to this API which is filled in. The caller can then report
 * those stats to the user (perhaps via ethtool, debugfs, etc.).
 */
bool page_pool_get_stats(const struct page_pool *pool,
			 struct page_pool_stats *stats)
{
	int cpu = 0;

	if (!stats)
		return false;

	/* The caller is responsible for initializing stats. */
	stats->alloc_stats.fast += pool->alloc_stats.fast;
	stats->alloc_stats.slow += pool->alloc_stats.slow;
	stats->alloc_stats.slow_high_order += pool->alloc_stats.slow_high_order;
	stats->alloc_stats.empty += pool->alloc_stats.empty;
	stats->alloc_stats.refill += pool->alloc_stats.refill;
	stats->alloc_stats.waive += pool->alloc_stats.waive;

	for_each_possible_cpu(cpu) {
		const struct page_pool_recycle_stats *pcpu =
			per_cpu_ptr(pool->recycle_stats, cpu);

		stats->recycle_stats.cached += pcpu->cached;
		stats->recycle_stats.cache_full += pcpu->cache_full;
		stats->recycle_stats.ring += pcpu->ring;
		stats->recycle_stats.ring_full += pcpu->ring_full;
		stats->recycle_stats.released_refcnt += pcpu->released_refcnt;
	}

	return true;
}
EXPORT_SYMBOL(page_pool_get_stats);

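/* Usage sketch (illustrative, not part of this file): a driver can sum
 * stats across its RX pools into one caller-zeroed structure. The
 * priv/rxq names below are hypothetical.
 *
 *	struct page_pool_stats stats = {};
 *	int i;
 *
 *	for (i = 0; i < priv->num_rx_queues; i++)
 *		page_pool_get_stats(priv->rxq[i].page_pool, &stats);
 *
 * The accumulated values can then be reported via ethtool or debugfs.
 */
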
u8 *page_pool_ethtool_stats_get_strings(u8 *data)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(pp_stats); i++) {
		memcpy(data, pp_stats[i], ETH_GSTRING_LEN);
		data += ETH_GSTRING_LEN;
	}

	return data;
}
EXPORT_SYMBOL(page_pool_ethtool_stats_get_strings);

int page_pool_ethtool_stats_get_count(void)
{
	return ARRAY_SIZE(pp_stats);
}
EXPORT_SYMBOL(page_pool_ethtool_stats_get_count);

u64 *page_pool_ethtool_stats_get(u64 *data, const void *stats)
{
	const struct page_pool_stats *pool_stats = stats;

	*data++ = pool_stats->alloc_stats.fast;
	*data++ = pool_stats->alloc_stats.slow;
	*data++ = pool_stats->alloc_stats.slow_high_order;
	*data++ = pool_stats->alloc_stats.empty;
	*data++ = pool_stats->alloc_stats.refill;
	*data++ = pool_stats->alloc_stats.waive;
	*data++ = pool_stats->recycle_stats.cached;
	*data++ = pool_stats->recycle_stats.cache_full;
	*data++ = pool_stats->recycle_stats.ring;
	*data++ = pool_stats->recycle_stats.ring_full;
	*data++ = pool_stats->recycle_stats.released_refcnt;

	return data;
}
EXPORT_SYMBOL(page_pool_ethtool_stats_get);

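/* Usage sketch (illustrative): the three helpers above slot into a
 * driver's ethtool callbacks. The mydrv_* functions and priv->pool are
 * hypothetical; only the page_pool_ethtool_* calls are real.
 *
 *	static int mydrv_get_sset_count(struct net_device *dev, int sset)
 *	{
 *		if (sset == ETH_SS_STATS)
 *			return page_pool_ethtool_stats_get_count();
 *		return -EOPNOTSUPP;
 *	}
 *
 *	static void mydrv_get_strings(struct net_device *dev, u32 sset, u8 *data)
 *	{
 *		if (sset == ETH_SS_STATS)
 *			data = page_pool_ethtool_stats_get_strings(data);
 *	}
 *
 *	static void mydrv_get_ethtool_stats(struct net_device *dev,
 *					    struct ethtool_stats *st, u64 *data)
 *	{
 *		struct mydrv_priv *priv = netdev_priv(dev);
 *		struct page_pool_stats stats = {};
 *
 *		page_pool_get_stats(priv->pool, &stats);
 *		data = page_pool_ethtool_stats_get(data, &stats);
 *	}
 */
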
#else
#define alloc_stat_inc(pool, __stat)
#define recycle_stat_inc(pool, __stat)
#define recycle_stat_add(pool, __stat, val)
#endif

static bool page_pool_producer_lock(struct page_pool *pool)
	__acquires(&pool->ring.producer_lock)
{
	bool in_softirq = in_softirq();

	if (in_softirq)
		spin_lock(&pool->ring.producer_lock);
	else
		spin_lock_bh(&pool->ring.producer_lock);

	return in_softirq;
}

static void page_pool_producer_unlock(struct page_pool *pool,
				      bool in_softirq)
	__releases(&pool->ring.producer_lock)
{
	if (in_softirq)
		spin_unlock(&pool->ring.producer_lock);
	else
		spin_unlock_bh(&pool->ring.producer_lock);
}

static void page_pool_struct_check(void)
{
	CACHELINE_ASSERT_GROUP_MEMBER(struct page_pool, frag, frag_users);
	CACHELINE_ASSERT_GROUP_MEMBER(struct page_pool, frag, frag_page);
	CACHELINE_ASSERT_GROUP_MEMBER(struct page_pool, frag, frag_offset);
	CACHELINE_ASSERT_GROUP_SIZE(struct page_pool, frag,
				    PAGE_POOL_FRAG_GROUP_ALIGN);
}

static int page_pool_init(struct page_pool *pool,
			  const struct page_pool_params *params,
			  int cpuid)
{
	unsigned int ring_qsize = 1024; /* Default */
	struct netdev_rx_queue *rxq;
	int err;

	page_pool_struct_check();

	memcpy(&pool->p, &params->fast, sizeof(pool->p));
	memcpy(&pool->slow, &params->slow, sizeof(pool->slow));

	pool->cpuid = cpuid;
	pool->dma_sync_for_cpu = true;

	/* Validate only known flags were used */
	if (pool->slow.flags & ~PP_FLAG_ALL)
		return -EINVAL;

	if (pool->p.pool_size)
		ring_qsize = pool->p.pool_size;

	/* Sanity limit mem that can be pinned down */
	if (ring_qsize > 32768)
		return -E2BIG;

	/* DMA direction is either DMA_FROM_DEVICE or DMA_BIDIRECTIONAL.
	 * DMA_BIDIRECTIONAL allows the page to also be used for DMA
	 * transmit, which is the XDP_TX use-case.
	 */
	if (pool->slow.flags & PP_FLAG_DMA_MAP) {
		if ((pool->p.dma_dir != DMA_FROM_DEVICE) &&
		    (pool->p.dma_dir != DMA_BIDIRECTIONAL))
			return -EINVAL;

		pool->dma_map = true;
	}

	if (pool->slow.flags & PP_FLAG_DMA_SYNC_DEV) {
		/* In order to request DMA-sync-for-device the page
		 * needs to be mapped
		 */
		if (!(pool->slow.flags & PP_FLAG_DMA_MAP))
			return -EINVAL;

		if (!pool->p.max_len)
			return -EINVAL;

		pool->dma_sync = true;

		/* pool->p.offset has to be set according to the address
		 * offset used by the DMA engine to start copying rx data
		 */
	}

	pool->has_init_callback = !!pool->slow.init_callback;

#ifdef CONFIG_PAGE_POOL_STATS
	if (!(pool->slow.flags & PP_FLAG_SYSTEM_POOL)) {
		pool->recycle_stats = alloc_percpu(struct page_pool_recycle_stats);
		if (!pool->recycle_stats)
			return -ENOMEM;
	} else {
		/* For system page pool instance we use a singular stats object
		 * instead of allocating a separate percpu variable for each
		 * (also percpu) page pool instance.
		 */
		pool->recycle_stats = &pp_system_recycle_stats;
		pool->system = true;
	}
#endif

	if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0) {
#ifdef CONFIG_PAGE_POOL_STATS
		if (!pool->system)
			free_percpu(pool->recycle_stats);
#endif
		return -ENOMEM;
	}

	atomic_set(&pool->pages_state_release_cnt, 0);

	/* A driver calling page_pool_create() must also call page_pool_destroy() */
	refcount_set(&pool->user_cnt, 1);

	if (pool->dma_map)
		get_device(pool->p.dev);

	if (pool->slow.flags & PP_FLAG_ALLOW_UNREADABLE_NETMEM) {
		/* We rely on rtnl_lock()ing to make sure netdev_rx_queue
		 * configuration doesn't change while we're initializing
		 * the page_pool.
		 */
		ASSERT_RTNL();
		rxq = __netif_get_rx_queue(pool->slow.netdev,
					   pool->slow.queue_idx);
		pool->mp_priv = rxq->mp_params.mp_priv;
	}

	if (pool->mp_priv) {
		if (!pool->dma_map || !pool->dma_sync)
			return -EOPNOTSUPP;

		err = mp_dmabuf_devmem_init(pool);
		if (err) {
			pr_warn("%s() mem-provider init failed %d\n", __func__,
				err);
			goto free_ptr_ring;
		}

		static_branch_inc(&page_pool_mem_providers);
	}

	return 0;

free_ptr_ring:
	ptr_ring_cleanup(&pool->ring, NULL);
#ifdef CONFIG_PAGE_POOL_STATS
	if (!pool->system)
		free_percpu(pool->recycle_stats);
#endif
	return err;
}

static void page_pool_uninit(struct page_pool *pool)
{
	ptr_ring_cleanup(&pool->ring, NULL);

	if (pool->dma_map)
		put_device(pool->p.dev);

#ifdef CONFIG_PAGE_POOL_STATS
	if (!pool->system)
		free_percpu(pool->recycle_stats);
#endif
}

/**
 * page_pool_create_percpu() - create a page pool for a given cpu.
 * @params: parameters, see struct page_pool_params
 * @cpuid: cpu identifier
 */
struct page_pool *
page_pool_create_percpu(const struct page_pool_params *params, int cpuid)
{
	struct page_pool *pool;
	int err;

	pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, params->nid);
	if (!pool)
		return ERR_PTR(-ENOMEM);

	err = page_pool_init(pool, params, cpuid);
	if (err < 0)
		goto err_free;

	err = page_pool_list(pool);
	if (err)
		goto err_uninit;

	return pool;

err_uninit:
	page_pool_uninit(pool);
err_free:
	pr_warn("%s() gave up with errno %d\n", __func__, err);
	kfree(pool);
	return ERR_PTR(err);
}
EXPORT_SYMBOL(page_pool_create_percpu);

/**
 * page_pool_create() - create a page pool
 * @params: parameters, see struct page_pool_params
 */
struct page_pool *page_pool_create(const struct page_pool_params *params)
{
	return page_pool_create_percpu(params, -1);
}
EXPORT_SYMBOL(page_pool_create);

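/* Creation sketch (illustrative): typical RX setup in a driver probe or
 * queue-open path. The field values are examples, not requirements;
 * pdev/rxq are hypothetical driver state.
 *
 *	struct page_pool_params pp_params = {
 *		.flags		= PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV,
 *		.order		= 0,
 *		.pool_size	= rx_ring_size,
 *		.nid		= NUMA_NO_NODE,
 *		.dev		= &pdev->dev,
 *		.napi		= &rxq->napi,
 *		.dma_dir	= DMA_FROM_DEVICE,
 *		.max_len	= PAGE_SIZE,
 *		.offset		= XDP_PACKET_HEADROOM,
 *	};
 *	struct page_pool *pool = page_pool_create(&pp_params);
 *
 *	if (IS_ERR(pool))
 *		return PTR_ERR(pool);
 */
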
static void page_pool_return_page(struct page_pool *pool, netmem_ref netmem);

static noinline netmem_ref page_pool_refill_alloc_cache(struct page_pool *pool)
{
	struct ptr_ring *r = &pool->ring;
	netmem_ref netmem;
	int pref_nid; /* preferred NUMA node */

	/* Quicker fallback, avoid locks when ring is empty */
	if (__ptr_ring_empty(r)) {
		alloc_stat_inc(pool, empty);
		return 0;
	}

	/* Softirq guarantees the CPU and thus the NUMA node are stable.
	 * This assumes the CPU refilling the driver RX-ring also runs RX-NAPI.
	 */
#ifdef CONFIG_NUMA
	pref_nid = (pool->p.nid == NUMA_NO_NODE) ? numa_mem_id() : pool->p.nid;
#else
	/* Ignore pool->p.nid setting if !CONFIG_NUMA, helps compiler */
	pref_nid = numa_mem_id(); /* will be zero like page_to_nid() */
#endif

	/* Refill alloc array, but only if NUMA match */
	do {
		netmem = (__force netmem_ref)__ptr_ring_consume(r);
		if (unlikely(!netmem))
			break;

		if (likely(netmem_is_pref_nid(netmem, pref_nid))) {
			pool->alloc.cache[pool->alloc.count++] = netmem;
		} else {
			/* NUMA mismatch;
			 * (1) release 1 page to page-allocator and
			 * (2) break out to fall through to alloc_pages_node.
			 * This limits stress on the page buddy allocator.
			 */
			page_pool_return_page(pool, netmem);
			alloc_stat_inc(pool, waive);
			netmem = 0;
			break;
		}
	} while (pool->alloc.count < PP_ALLOC_CACHE_REFILL);

	/* Return last page */
	if (likely(pool->alloc.count > 0)) {
		netmem = pool->alloc.cache[--pool->alloc.count];
		alloc_stat_inc(pool, refill);
	}

	return netmem;
}

/* fast path */
static netmem_ref __page_pool_get_cached(struct page_pool *pool)
{
	netmem_ref netmem;

	/* Caller MUST guarantee safe non-concurrent access, e.g. softirq */
	if (likely(pool->alloc.count)) {
		/* Fast-path */
		netmem = pool->alloc.cache[--pool->alloc.count];
		alloc_stat_inc(pool, fast);
	} else {
		netmem = page_pool_refill_alloc_cache(pool);
	}

	return netmem;
}

static void __page_pool_dma_sync_for_device(const struct page_pool *pool,
					    netmem_ref netmem,
					    u32 dma_sync_size)
{
#if defined(CONFIG_HAS_DMA) && defined(CONFIG_DMA_NEED_SYNC)
	dma_addr_t dma_addr = page_pool_get_dma_addr_netmem(netmem);

	dma_sync_size = min(dma_sync_size, pool->p.max_len);
	__dma_sync_single_for_device(pool->p.dev, dma_addr + pool->p.offset,
				     dma_sync_size, pool->p.dma_dir);
#endif
}

static __always_inline void
page_pool_dma_sync_for_device(const struct page_pool *pool,
			      netmem_ref netmem,
			      u32 dma_sync_size)
{
	if (pool->dma_sync && dma_dev_need_sync(pool->p.dev))
		__page_pool_dma_sync_for_device(pool, netmem, dma_sync_size);
}

static bool page_pool_dma_map(struct page_pool *pool, netmem_ref netmem)
{
	dma_addr_t dma;

	/* Setup DMA mapping: use 'struct page' area for storing DMA-addr
	 * since dma_addr_t can be either 32 or 64 bits and does not always fit
	 * into page private data (i.e. a 32-bit CPU with 64-bit DMA caps).
	 * This mapping is kept for the lifetime of the page, until it leaves
	 * the pool.
	 */
	dma = dma_map_page_attrs(pool->p.dev, netmem_to_page(netmem), 0,
				 (PAGE_SIZE << pool->p.order), pool->p.dma_dir,
				 DMA_ATTR_SKIP_CPU_SYNC |
					 DMA_ATTR_WEAK_ORDERING);
	if (dma_mapping_error(pool->p.dev, dma))
		return false;

	if (page_pool_set_dma_addr_netmem(netmem, dma))
		goto unmap_failed;

	page_pool_dma_sync_for_device(pool, netmem, pool->p.max_len);

	return true;

unmap_failed:
	WARN_ONCE(1, "unexpected DMA address, please report to netdev@");
	dma_unmap_page_attrs(pool->p.dev, dma,
			     PAGE_SIZE << pool->p.order, pool->p.dma_dir,
			     DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING);
	return false;
}

static struct page *__page_pool_alloc_page_order(struct page_pool *pool,
						 gfp_t gfp)
{
	struct page *page;

	gfp |= __GFP_COMP;
	page = alloc_pages_node(pool->p.nid, gfp, pool->p.order);
	if (unlikely(!page))
		return NULL;

	if (pool->dma_map && unlikely(!page_pool_dma_map(pool, page_to_netmem(page)))) {
		put_page(page);
		return NULL;
	}

	alloc_stat_inc(pool, slow_high_order);
	page_pool_set_pp_info(pool, page_to_netmem(page));

	/* Track how many pages are held 'in-flight' */
	pool->pages_state_hold_cnt++;
	trace_page_pool_state_hold(pool, page_to_netmem(page),
				   pool->pages_state_hold_cnt);
	return page;
}

/* slow path */
static noinline netmem_ref __page_pool_alloc_pages_slow(struct page_pool *pool,
							gfp_t gfp)
{
	const int bulk = PP_ALLOC_CACHE_REFILL;
	unsigned int pp_order = pool->p.order;
	bool dma_map = pool->dma_map;
	netmem_ref netmem;
	int i, nr_pages;

	/* Don't support bulk alloc for high-order pages */
	if (unlikely(pp_order))
		return page_to_netmem(__page_pool_alloc_page_order(pool, gfp));

	/* Unnecessary as alloc cache is empty, but guarantees zero count */
	if (unlikely(pool->alloc.count > 0))
		return pool->alloc.cache[--pool->alloc.count];

	/* Mark empty alloc.cache slots "empty" for alloc_pages_bulk */
	memset(&pool->alloc.cache, 0, sizeof(void *) * bulk);

	nr_pages = alloc_pages_bulk_node(gfp, pool->p.nid, bulk,
					 (struct page **)pool->alloc.cache);
	if (unlikely(!nr_pages))
		return 0;

	/* Pages have been filled into the alloc.cache array, but count is
	 * zero and the page elements have not been (possibly) DMA mapped.
	 */
	for (i = 0; i < nr_pages; i++) {
		netmem = pool->alloc.cache[i];
		if (dma_map && unlikely(!page_pool_dma_map(pool, netmem))) {
			put_page(netmem_to_page(netmem));
			continue;
		}

		page_pool_set_pp_info(pool, netmem);
		pool->alloc.cache[pool->alloc.count++] = netmem;
		/* Track how many pages are held 'in-flight' */
		pool->pages_state_hold_cnt++;
		trace_page_pool_state_hold(pool, netmem,
					   pool->pages_state_hold_cnt);
	}

	/* Return last page */
	if (likely(pool->alloc.count > 0)) {
		netmem = pool->alloc.cache[--pool->alloc.count];
		alloc_stat_inc(pool, slow);
	} else {
		netmem = 0;
	}

	/* A page that was just allocated should/must have refcnt 1. */
	return netmem;
}

/* page_pool is meant to replace direct alloc_pages() API calls, while
 * providing a synchronization guarantee for the allocation side.
 */
netmem_ref page_pool_alloc_netmems(struct page_pool *pool, gfp_t gfp)
{
	netmem_ref netmem;

	/* Fast-path: Get a page from cache */
	netmem = __page_pool_get_cached(pool);
	if (netmem)
		return netmem;

	/* Slow-path: cache empty, do real allocation */
	if (static_branch_unlikely(&page_pool_mem_providers) && pool->mp_priv)
		netmem = mp_dmabuf_devmem_alloc_netmems(pool, gfp);
	else
		netmem = __page_pool_alloc_pages_slow(pool, gfp);
	return netmem;
}
EXPORT_SYMBOL(page_pool_alloc_netmems);
ALLOW_ERROR_INJECTION(page_pool_alloc_netmems, NULL);

struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp)
{
	return netmem_to_page(page_pool_alloc_netmems(pool, gfp));
}
EXPORT_SYMBOL(page_pool_alloc_pages);

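/* Allocation sketch (illustrative): refilling an RX ring from softirq
 * context. page_pool_dev_alloc_pages() is the GFP_ATOMIC convenience
 * wrapper from <net/page_pool/helpers.h>; rxq/desc are hypothetical.
 *
 *	struct page *page = page_pool_dev_alloc_pages(rxq->page_pool);
 *
 *	if (!page)
 *		return -ENOMEM;
 *	desc->addr = page_pool_get_dma_addr(page) + rxq->headroom;
 */
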
/* Calculate distance between two u32 values, valid if distance is below 2^(31)
 * https://en.wikipedia.org/wiki/Serial_number_arithmetic#General_Solution
 */
#define _distance(a, b)	(s32)((a) - (b))

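/* Worked example (illustrative): if hold_cnt has wrapped around to 5
 * while release_cnt is still 0xfffffffe, the u32 subtraction
 * 5 - 0xfffffffe wraps to 7, and the s32 cast keeps it positive:
 * 7 pages are in-flight. Only if the true distance exceeded 2^31
 * would the result be misinterpreted.
 */
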
s32 page_pool_inflight(const struct page_pool *pool, bool strict)
{
	u32 release_cnt = atomic_read(&pool->pages_state_release_cnt);
	u32 hold_cnt = READ_ONCE(pool->pages_state_hold_cnt);
	s32 inflight;

	inflight = _distance(hold_cnt, release_cnt);

	if (strict) {
		trace_page_pool_release(pool, inflight, hold_cnt, release_cnt);
		WARN(inflight < 0, "Negative(%d) inflight packet-pages",
		     inflight);
	} else {
		inflight = max(0, inflight);
	}

	return inflight;
}

void page_pool_set_pp_info(struct page_pool *pool, netmem_ref netmem)
{
	netmem_set_pp(netmem, pool);
	netmem_or_pp_magic(netmem, PP_SIGNATURE);

	/* Ensuring all pages have been split into one fragment initially:
	 * page_pool_set_pp_info() is only called once for every page when it
	 * is allocated from the page allocator and page_pool_fragment_page()
	 * is dirtying the same cache line as the page->pp_magic above, so
	 * the overhead is negligible.
	 */
	page_pool_fragment_netmem(netmem, 1);
	if (pool->has_init_callback)
		pool->slow.init_callback(netmem, pool->slow.init_arg);
}

void page_pool_clear_pp_info(netmem_ref netmem)
{
	netmem_clear_pp_magic(netmem);
	netmem_set_pp(netmem, NULL);
}

static __always_inline void __page_pool_release_page_dma(struct page_pool *pool,
							 netmem_ref netmem)
{
	dma_addr_t dma;

	if (!pool->dma_map)
		/* Always account for inflight pages, even if we didn't
		 * map them
		 */
		return;

	dma = page_pool_get_dma_addr_netmem(netmem);

	/* When page is unmapped, it cannot be returned to our pool */
	dma_unmap_page_attrs(pool->p.dev, dma,
			     PAGE_SIZE << pool->p.order, pool->p.dma_dir,
			     DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING);
	page_pool_set_dma_addr_netmem(netmem, 0);
}

/* Disconnects a page from a page_pool. API users can have a need to
 * disconnect a page, to allow it to be used as a regular page (that will
 * eventually be returned to the normal page-allocator via put_page).
 */
void page_pool_return_page(struct page_pool *pool, netmem_ref netmem)
{
	int count;
	bool put;

	put = true;
	if (static_branch_unlikely(&page_pool_mem_providers) && pool->mp_priv)
		put = mp_dmabuf_devmem_release_page(pool, netmem);
	else
		__page_pool_release_page_dma(pool, netmem);

	/* This may be the last page returned, releasing the pool, so
	 * it is not safe to reference pool afterwards.
	 */
	count = atomic_inc_return_relaxed(&pool->pages_state_release_cnt);
	trace_page_pool_state_release(pool, netmem, count);

	if (put) {
		page_pool_clear_pp_info(netmem);
		put_page(netmem_to_page(netmem));
	}
	/* An optimization would be to call __free_pages(page, pool->p.order)
	 * knowing page is not part of page-cache (thus avoiding a
	 * __page_cache_release() call).
	 */
}

static bool page_pool_recycle_in_ring(struct page_pool *pool, netmem_ref netmem)
{
	int ret;

	/* BH protection not needed if current is softirq */
	if (in_softirq())
		ret = ptr_ring_produce(&pool->ring, (__force void *)netmem);
	else
		ret = ptr_ring_produce_bh(&pool->ring, (__force void *)netmem);

	if (!ret) {
		recycle_stat_inc(pool, ring);
		return true;
	}

	return false;
}

/* Only allow direct recycling in special circumstances, into the
 * alloc side cache. E.g. during RX-NAPI processing for XDP_DROP use-case.
 *
 * Caller must provide appropriate safe context.
 */
static bool page_pool_recycle_in_cache(netmem_ref netmem,
				       struct page_pool *pool)
{
	if (unlikely(pool->alloc.count == PP_ALLOC_CACHE_SIZE)) {
		recycle_stat_inc(pool, cache_full);
		return false;
	}

	/* Caller MUST have verified/know (page_ref_count(page) == 1) */
	pool->alloc.cache[pool->alloc.count++] = netmem;
	recycle_stat_inc(pool, cached);
	return true;
}

static bool __page_pool_page_can_be_recycled(netmem_ref netmem)
{
	return netmem_is_net_iov(netmem) ||
	       (page_ref_count(netmem_to_page(netmem)) == 1 &&
		!page_is_pfmemalloc(netmem_to_page(netmem)));
}

/* If the page refcnt == 1, this will try to recycle the page.
 * If pool->dma_sync is set, we'll try to sync the DMA area for
 * the configured size min(dma_sync_size, pool->max_len).
 * If the page refcnt != 1, then the page will be returned to the
 * memory subsystem.
 */
static __always_inline netmem_ref
__page_pool_put_page(struct page_pool *pool, netmem_ref netmem,
		     unsigned int dma_sync_size, bool allow_direct)
{
	lockdep_assert_no_hardirq();

	/* This allocator is optimized for the XDP mode that uses
	 * one-frame-per-page, but has fallbacks that act like the
	 * regular page allocator APIs.
	 *
	 * refcnt == 1 means page_pool owns the page, and can recycle it.
	 *
	 * A page is NOT reusable when it was allocated while the system
	 * was under pressure (page_is_pfmemalloc).
	 */
	if (likely(__page_pool_page_can_be_recycled(netmem))) {
		/* Read barrier done in page_ref_count / READ_ONCE */

		page_pool_dma_sync_for_device(pool, netmem, dma_sync_size);

		if (allow_direct && page_pool_recycle_in_cache(netmem, pool))
			return 0;

		/* Page found as candidate for recycling */
		return netmem;
	}

	/* Fallback/non-XDP mode: API user has an elevated refcnt.
	 *
	 * Many drivers split up the page into fragments, and some
	 * want to keep doing this to save memory and do refcnt based
	 * recycling. Support this use case too, to ease drivers
	 * switching between XDP/non-XDP.
	 *
	 * In case page_pool maintains the DMA mapping, the API user must
	 * call page_pool_put_page() once. In this elevated refcnt case,
	 * the DMA is unmapped/released, as the driver is likely doing
	 * refcnt based recycle tricks, meaning another process will be
	 * invoking put_page.
	 */
	recycle_stat_inc(pool, released_refcnt);
	page_pool_return_page(pool, netmem);

	return 0;
}

static bool page_pool_napi_local(const struct page_pool *pool)
{
	const struct napi_struct *napi;
	u32 cpuid;

	if (unlikely(!in_softirq()))
		return false;

	/* Allow direct recycle if we have reasons to believe that we are
	 * in the same context as the consumer would run, so there's
	 * no possible race.
	 * __page_pool_put_page() makes sure we're not in hardirq context
	 * and interrupts are enabled prior to accessing the cache.
	 */
	cpuid = smp_processor_id();
	if (READ_ONCE(pool->cpuid) == cpuid)
		return true;

	napi = READ_ONCE(pool->p.napi);

	return napi && READ_ONCE(napi->list_owner) == cpuid;
}

void page_pool_put_unrefed_netmem(struct page_pool *pool, netmem_ref netmem,
				  unsigned int dma_sync_size, bool allow_direct)
{
	if (!allow_direct)
		allow_direct = page_pool_napi_local(pool);

	netmem =
		__page_pool_put_page(pool, netmem, dma_sync_size, allow_direct);
	if (netmem && !page_pool_recycle_in_ring(pool, netmem)) {
		/* Cache full, fallback to free pages */
		recycle_stat_inc(pool, ring_full);
		page_pool_return_page(pool, netmem);
	}
}
EXPORT_SYMBOL(page_pool_put_unrefed_netmem);

void page_pool_put_unrefed_page(struct page_pool *pool, struct page *page,
				unsigned int dma_sync_size, bool allow_direct)
{
	page_pool_put_unrefed_netmem(pool, page_to_netmem(page), dma_sync_size,
				     allow_direct);
}
EXPORT_SYMBOL(page_pool_put_unrefed_page);

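/* Recycling sketch (illustrative): a driver returning a page it no
 * longer needs, e.g. on XDP_DROP inside its NAPI poll loop. Passing
 * allow_direct = true is only safe from the pool's NAPI/CPU context;
 * page_pool_put_full_page() comes from <net/page_pool/helpers.h>.
 *
 *	page_pool_put_full_page(rxq->page_pool, page, true);
 */
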
static void page_pool_recycle_ring_bulk(struct page_pool *pool,
					netmem_ref *bulk,
					u32 bulk_len)
{
	bool in_softirq;
	u32 i;

	/* Bulk produce into ptr_ring page_pool cache */
	in_softirq = page_pool_producer_lock(pool);

	for (i = 0; i < bulk_len; i++) {
		if (__ptr_ring_produce(&pool->ring, (__force void *)bulk[i])) {
			/* ring full */
			recycle_stat_inc(pool, ring_full);
			break;
		}
	}

	page_pool_producer_unlock(pool, in_softirq);
	recycle_stat_add(pool, ring, i);

	/* Hopefully all pages were returned into ptr_ring */
	if (likely(i == bulk_len))
		return;

	/* ptr_ring cache is full, free remaining pages outside producer lock
	 * since put_page() with refcnt == 1 can be an expensive operation.
	 */
	for (; i < bulk_len; i++)
		page_pool_return_page(pool, bulk[i]);
}

/**
 * page_pool_put_netmem_bulk() - release references on multiple netmems
 * @data:	array holding netmem references
 * @count:	number of entries in @data
 *
 * Tries to refill a number of netmems into the ptr_ring cache while holding
 * the ptr_ring producer lock. If the ptr_ring is full,
 * page_pool_put_netmem_bulk() will release leftover netmems to the memory
 * provider. page_pool_put_netmem_bulk() is suitable to be run inside the
 * driver NAPI tx completion loop for the XDP_REDIRECT use case.
 *
 * Please note the caller must not use the data area after running
 * page_pool_put_netmem_bulk(), as this function overwrites it.
 */
void page_pool_put_netmem_bulk(netmem_ref *data, u32 count)
{
	u32 bulk_len = 0;

	for (u32 i = 0; i < count; i++) {
		netmem_ref netmem = netmem_compound_head(data[i]);

		if (page_pool_unref_and_test(netmem))
			data[bulk_len++] = netmem;
	}

	count = bulk_len;
	while (count) {
		netmem_ref bulk[XDP_BULK_QUEUE_SIZE];
		struct page_pool *pool = NULL;
		bool allow_direct;
		u32 foreign = 0;

		bulk_len = 0;

		for (u32 i = 0; i < count; i++) {
			struct page_pool *netmem_pp;
			netmem_ref netmem = data[i];

			netmem_pp = netmem_get_pp(netmem);
			if (unlikely(!pool)) {
				pool = netmem_pp;
				allow_direct = page_pool_napi_local(pool);
			} else if (netmem_pp != pool) {
				/* If the netmem belongs to a different
				 * page_pool, save it for another round.
				 */
				data[foreign++] = netmem;
				continue;
			}

			netmem = __page_pool_put_page(pool, netmem, -1,
						      allow_direct);
			/* Approved for bulk recycling in ptr_ring cache */
			if (netmem)
				bulk[bulk_len++] = netmem;
		}

		if (bulk_len)
			page_pool_recycle_ring_bulk(pool, bulk, bulk_len);

		count = foreign;
	}
}
EXPORT_SYMBOL(page_pool_put_netmem_bulk);

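/* Bulk sketch (illustrative): a TX completion loop can batch netmems
 * and release them in one call, amortizing the producer lock. The array
 * may hold netmems from different pools; leftovers are handled in extra
 * rounds internally. mydrv_next_completed_netmem() is hypothetical.
 *
 *	netmem_ref bulk[XDP_BULK_QUEUE_SIZE];
 *	netmem_ref netmem;
 *	u32 n = 0;
 *
 *	while (n < XDP_BULK_QUEUE_SIZE &&
 *	       (netmem = mydrv_next_completed_netmem(txq)))
 *		bulk[n++] = netmem;
 *	page_pool_put_netmem_bulk(bulk, n);
 */
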
static netmem_ref page_pool_drain_frag(struct page_pool *pool,
				       netmem_ref netmem)
{
	long drain_count = BIAS_MAX - pool->frag_users;

	/* Some user is still using the page frag */
	if (likely(page_pool_unref_netmem(netmem, drain_count)))
		return 0;

	if (__page_pool_page_can_be_recycled(netmem)) {
		page_pool_dma_sync_for_device(pool, netmem, -1);
		return netmem;
	}

	page_pool_return_page(pool, netmem);
	return 0;
}

static void page_pool_free_frag(struct page_pool *pool)
{
	long drain_count = BIAS_MAX - pool->frag_users;
	netmem_ref netmem = pool->frag_page;

	pool->frag_page = 0;

	if (!netmem || page_pool_unref_netmem(netmem, drain_count))
		return;

	page_pool_return_page(pool, netmem);
}

netmem_ref page_pool_alloc_frag_netmem(struct page_pool *pool,
				       unsigned int *offset, unsigned int size,
				       gfp_t gfp)
{
	unsigned int max_size = PAGE_SIZE << pool->p.order;
	netmem_ref netmem = pool->frag_page;

	if (WARN_ON(size > max_size))
		return 0;

	size = ALIGN(size, dma_get_cache_alignment());
	*offset = pool->frag_offset;

	if (netmem && *offset + size > max_size) {
		netmem = page_pool_drain_frag(pool, netmem);
		if (netmem) {
			recycle_stat_inc(pool, cached);
			alloc_stat_inc(pool, fast);
			goto frag_reset;
		}
	}

	if (!netmem) {
		netmem = page_pool_alloc_netmems(pool, gfp);
		if (unlikely(!netmem)) {
			pool->frag_page = 0;
			return 0;
		}

		pool->frag_page = netmem;

frag_reset:
		pool->frag_users = 1;
		*offset = 0;
		pool->frag_offset = size;
		page_pool_fragment_netmem(netmem, BIAS_MAX);
		return netmem;
	}

	pool->frag_users++;
	pool->frag_offset = *offset + size;
	return netmem;
}
EXPORT_SYMBOL(page_pool_alloc_frag_netmem);

struct page *page_pool_alloc_frag(struct page_pool *pool, unsigned int *offset,
				  unsigned int size, gfp_t gfp)
{
	return netmem_to_page(page_pool_alloc_frag_netmem(pool, offset, size,
							  gfp));
}
EXPORT_SYMBOL(page_pool_alloc_frag);

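/* Fragment sketch (illustrative): carving two RX buffers out of one
 * page. Each fragment holds a reference; the page only recycles after
 * all fragments are returned (e.g. via page_pool_put_page()) and the
 * fragment bias drains.
 *
 *	unsigned int offset;
 *	struct page *page;
 *
 *	page = page_pool_alloc_frag(pool, &offset, 2048, GFP_ATOMIC);
 *	// first caller typically gets offset == 0; on a 4K page the
 *	// next call returns the same page with offset == 2048
 */
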
static void page_pool_empty_ring(struct page_pool *pool)
{
	netmem_ref netmem;

	/* Empty recycle ring */
	while ((netmem = (__force netmem_ref)ptr_ring_consume_bh(&pool->ring))) {
		/* Verify the refcnt invariant of cached pages */
		if (!(netmem_ref_count(netmem) == 1))
			pr_crit("%s() page_pool refcnt %d violation\n",
				__func__, netmem_ref_count(netmem));

		page_pool_return_page(pool, netmem);
	}
}

static void __page_pool_destroy(struct page_pool *pool)
{
	if (pool->disconnect)
		pool->disconnect(pool);

	page_pool_unlist(pool);
	page_pool_uninit(pool);

	if (pool->mp_priv) {
		mp_dmabuf_devmem_destroy(pool);
		static_branch_dec(&page_pool_mem_providers);
	}

	kfree(pool);
}

static void page_pool_empty_alloc_cache_once(struct page_pool *pool)
{
	netmem_ref netmem;

	if (pool->destroy_cnt)
		return;

	/* Empty alloc cache, assume caller made sure this is
	 * no longer in use, and page_pool_alloc_pages() cannot be
	 * called concurrently.
	 */
	while (pool->alloc.count) {
		netmem = pool->alloc.cache[--pool->alloc.count];
		page_pool_return_page(pool, netmem);
	}
}

static void page_pool_scrub(struct page_pool *pool)
{
	page_pool_empty_alloc_cache_once(pool);
	pool->destroy_cnt++;

	/* No more consumers should exist, but producers could still
	 * be in-flight.
	 */
	page_pool_empty_ring(pool);
}

static int page_pool_release(struct page_pool *pool)
{
	int inflight;

	page_pool_scrub(pool);
	inflight = page_pool_inflight(pool, true);
	if (!inflight)
		__page_pool_destroy(pool);

	return inflight;
}

static void page_pool_release_retry(struct work_struct *wq)
{
	struct delayed_work *dwq = to_delayed_work(wq);
	struct page_pool *pool = container_of(dwq, typeof(*pool), release_dw);
	void *netdev;
	int inflight;

	inflight = page_pool_release(pool);
	if (!inflight)
		return;

	/* Periodic warning for page pools the user can't see */
	netdev = READ_ONCE(pool->slow.netdev);
	if (time_after_eq(jiffies, pool->defer_warn) &&
	    (!netdev || netdev == NET_PTR_POISON)) {
		int sec = (s32)((u32)jiffies - (u32)pool->defer_start) / HZ;

		pr_warn("%s() stalled pool shutdown: id %u, %d inflight %d sec\n",
			__func__, pool->user.id, inflight, sec);
		pool->defer_warn = jiffies + DEFER_WARN_INTERVAL;
	}

	/* Still not ready to be disconnected, retry later */
	schedule_delayed_work(&pool->release_dw, DEFER_TIME);
}

void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *),
			   const struct xdp_mem_info *mem)
{
	refcount_inc(&pool->user_cnt);
	pool->disconnect = disconnect;
	pool->xdp_mem_id = mem->id;
}

void page_pool_disable_direct_recycling(struct page_pool *pool)
{
	/* Disable direct recycling based on pool->cpuid.
	 * Paired with READ_ONCE() in page_pool_napi_local().
	 */
	WRITE_ONCE(pool->cpuid, -1);

	if (!pool->p.napi)
		return;

	/* To avoid races with recycling and additional barriers make sure
	 * pool and NAPI are unlinked when NAPI is disabled.
	 */
	WARN_ON(!test_bit(NAPI_STATE_SCHED, &pool->p.napi->state));
	WARN_ON(READ_ONCE(pool->p.napi->list_owner) != -1);

	mutex_lock(&page_pools_lock);
	WRITE_ONCE(pool->p.napi, NULL);
	mutex_unlock(&page_pools_lock);
}
EXPORT_SYMBOL(page_pool_disable_direct_recycling);

void page_pool_destroy(struct page_pool *pool)
{
	if (!pool)
		return;

	if (!page_pool_put(pool))
		return;

	page_pool_disable_direct_recycling(pool);
	page_pool_free_frag(pool);

	if (!page_pool_release(pool))
		return;

	page_pool_detached(pool);
	pool->defer_start = jiffies;
	pool->defer_warn  = jiffies + DEFER_WARN_INTERVAL;

	INIT_DELAYED_WORK(&pool->release_dw, page_pool_release_retry);
	schedule_delayed_work(&pool->release_dw, DEFER_TIME);
}
EXPORT_SYMBOL(page_pool_destroy);

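/* Teardown sketch (illustrative): shutdown order matters. A driver
 * typically stops allocations, unregisters the XDP memory model (which
 * drops the user_cnt reference taken via page_pool_use_xdp_mem()), and
 * then calls page_pool_destroy(), which defers final release via the
 * delayed work above until all in-flight pages have returned.
 *
 *	xdp_rxq_info_unreg(&rxq->xdp_rxq);
 *	page_pool_destroy(rxq->page_pool);
 */
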
/* Caller must provide appropriate safe context, e.g. NAPI. */
void page_pool_update_nid(struct page_pool *pool, int new_nid)
{
	netmem_ref netmem;

	trace_page_pool_update_nid(pool, new_nid);
	pool->p.nid = new_nid;

	/* Flush pool alloc cache, as refill will check NUMA node */
	while (pool->alloc.count) {
		netmem = pool->alloc.cache[--pool->alloc.count];
		page_pool_return_page(pool, netmem);
	}
}
EXPORT_SYMBOL(page_pool_update_nid);
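
/* Usage sketch (illustrative): a driver reacting to an IRQ affinity
 * change can re-home the pool from its NAPI poll context, e.g.:
 *
 *	page_pool_update_nid(rxq->page_pool, numa_mem_id());
 */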