mm/memcontrol.c

   1 // SPDX-License-Identifier: GPL-2.0-or-later
   2 /* memcontrol.c - Memory Controller
   3  *
   4  * Copyright IBM Corporation, 2007
   5  * Author Balbir Singh <balbir@linux.vnet.ibm.com>
   6  *
   7  * Copyright 2007 OpenVZ SWsoft Inc
   8  * Author: Pavel Emelianov <xemul@openvz.org>
   9  *
  10  * Memory thresholds
  11  * Copyright (C) 2009 Nokia Corporation
  12  * Author: Kirill A. Shutemov
  13  *
  14  * Kernel Memory Controller
  15  * Copyright (C) 2012 Parallels Inc. and Google Inc.
  16  * Authors: Glauber Costa and Suleiman Souhlal
  17  *
  18  * Native page reclaim
  19  * Charge lifetime sanitation
  20  * Lockless page tracking & accounting
  21  * Unified hierarchy configuration model
  22  * Copyright (C) 2015 Red Hat, Inc., Johannes Weiner
  23  *
  24  * Per memcg lru locking
  25  * Copyright (C) 2020 Alibaba, Inc, Alex Shi
  26  */
  27
  28 #include <linux/cgroup-defs.h>
  29 #include <linux/page_counter.h>
  30 #include <linux/memcontrol.h>
  31 #include <linux/cgroup.h>
  32 #include <linux/sched/mm.h>
  33 #include <linux/shmem_fs.h>
  34 #include <linux/hugetlb.h>
  35 #include <linux/pagemap.h>
  36 #include <linux/pagevec.h>
  37 #include <linux/vm_event_item.h>
  38 #include <linux/smp.h>
  39 #include <linux/page-flags.h>
  40 #include <linux/backing-dev.h>
  41 #include <linux/bit_spinlock.h>
  42 #include <linux/rcupdate.h>
  43 #include <linux/limits.h>
  44 #include <linux/export.h>
  45 #include <linux/list.h>
  46 #include <linux/mutex.h>
  47 #include <linux/rbtree.h>
  48 #include <linux/slab.h>
  49 #include <linux/swapops.h>
  50 #include <linux/spinlock.h>
  51 #include <linux/fs.h>
  52 #include <linux/seq_file.h>
  53 #include <linux/parser.h>
  54 #include <linux/vmpressure.h>
  55 #include <linux/memremap.h>
  56 #include <linux/mm_inline.h>
  57 #include <linux/swap_cgroup.h>
  58 #include <linux/cpu.h>
  59 #include <linux/oom.h>
  60 #include <linux/lockdep.h>
  61 #include <linux/resume_user_mode.h>
  62 #include <linux/psi.h>
  63 #include <linux/seq_buf.h>
  64 #include <linux/sched/isolation.h>
  65 #include <linux/kmemleak.h>
  66 #include "internal.h"
  67 #include <net/sock.h>
  68 #include <net/ip.h>
  69 #include "slab.h"
  70 #include "memcontrol-v1.h"
  71
  72 #include <linux/uaccess.h>
  73
  74 #define CREATE_TRACE_POINTS
  75 #include <trace/events/memcg.h>
  76 #undef CREATE_TRACE_POINTS
  77
  78 #include <trace/events/vmscan.h>
  79
   /* cgroup subsystem descriptor of the memory controller; exported to modules. */
   struct cgroup_subsys memory_cgrp_subsys __read_mostly;
   EXPORT_SYMBOL(memory_cgrp_subsys);

   /*
    * The root memory cgroup. Code in this file falls back to it whenever no
    * more specific memcg is available (e.g. a NULL memcg argument).
    */
   struct mem_cgroup *root_mem_cgroup __read_mostly;

   /* Active memory cgroup to use from an interrupt context */
   DEFINE_PER_CPU(struct mem_cgroup *, int_active_memcg);
   EXPORT_PER_CPU_SYMBOL_GPL(int_active_memcg);

   /*
    * NOTE(review): the three flags below are __ro_after_init; they are
    * presumably set while boot options are parsed - the setter is not visible
    * in this part of the file.
    */

   /* Socket memory accounting disabled? */
   static bool cgroup_memory_nosocket __ro_after_init;

   /* Kernel memory accounting disabled? Reported by mem_cgroup_kmem_disabled(). */
   static bool cgroup_memory_nokmem __ro_after_init;

   /* BPF memory accounting disabled? */
   static bool cgroup_memory_nobpf __ro_after_init;

   #ifdef CONFIG_CGROUP_WRITEBACK
   /* NOTE(review): waiters and wakers of this queue are not visible here. */
   static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq);
   #endif
 101
 102 static inline bool task_is_dying(void)
 103 {
 104         return tsk_is_oom_victim(current) || fatal_signal_pending(current) ||
 105                 (current->flags & PF_EXITING);
 106 }
 107
 108 /* Some nice accessors for the vmpressure. */
 109 struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
 110 {
 111         if (!memcg)
 112                 memcg = root_mem_cgroup;
 113         return &memcg->vmpressure;
 114 }
 115
 116 struct mem_cgroup *vmpressure_to_memcg(struct vmpressure *vmpr)
 117 {
 118         return container_of(vmpr, struct mem_cgroup, vmpressure);
 119 }
 120
  /* NOTE(review): no user of SEQ_BUF_SIZE is visible in this part of the file. */
  #define SEQ_BUF_SIZE SZ_4K
  /* Bit number and the matching mask. NOTE(review): users are not visible here. */
  #define CURRENT_OBJCG_UPDATE_BIT 0
  #define CURRENT_OBJCG_UPDATE_FLAG (1UL << CURRENT_OBJCG_UPDATE_BIT)

  /*
   * Taken around every obj_cgroup list manipulation in this file: unlinking in
   * obj_cgroup_release() and reparenting in memcg_reparent_objcgs().
   */
  static DEFINE_SPINLOCK(objcg_lock);
 126
 127 bool mem_cgroup_kmem_disabled(void)
 128 {
 129         return cgroup_memory_nokmem;
 130 }
 131
 132 static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg,
 133                                       unsigned int nr_pages);
 134
 135 static void obj_cgroup_release(struct percpu_ref *ref)
 136 {
 137         struct obj_cgroup *objcg = container_of(ref, struct obj_cgroup, refcnt);
 138         unsigned int nr_bytes;
 139         unsigned int nr_pages;
 140         unsigned long flags;
 141
 142         /*
 143          * At this point all allocated objects are freed, and
 144          * objcg->nr_charged_bytes can't have an arbitrary byte value.
 145          * However, it can be PAGE_SIZE or (x * PAGE_SIZE).
 146          *
 147          * The following sequence can lead to it:
 148          * 1) CPU0: objcg == stock->cached_objcg
 149          * 2) CPU1: we do a small allocation (e.g. 92 bytes),
 150          *          PAGE_SIZE bytes are charged
 151          * 3) CPU1: a process from another memcg is allocating something,
 152          *          the stock if flushed,
 153          *          objcg->nr_charged_bytes = PAGE_SIZE - 92
 154          * 5) CPU0: we do release this object,
 155          *          92 bytes are added to stock->nr_bytes
 156          * 6) CPU0: stock is flushed,
 157          *          92 bytes are added to objcg->nr_charged_bytes
 158          *
 159          * In the result, nr_charged_bytes == PAGE_SIZE.
 160          * This page will be uncharged in obj_cgroup_release().
 161          */
 162         nr_bytes = atomic_read(&objcg->nr_charged_bytes);
 163         WARN_ON_ONCE(nr_bytes & (PAGE_SIZE - 1));
 164         nr_pages = nr_bytes >> PAGE_SHIFT;
 165
 166         if (nr_pages)
 167                 obj_cgroup_uncharge_pages(objcg, nr_pages);
 168
 169         spin_lock_irqsave(&objcg_lock, flags);
 170         list_del(&objcg->list);
 171         spin_unlock_irqrestore(&objcg_lock, flags);
 172
 173         percpu_ref_exit(ref);
 174         kfree_rcu(objcg, rcu);
 175 }
 176
 177 static struct obj_cgroup *obj_cgroup_alloc(void)
 178 {
 179         struct obj_cgroup *objcg;
 180         int ret;
 181
 182         objcg = kzalloc(sizeof(struct obj_cgroup), GFP_KERNEL);
 183         if (!objcg)
 184                 return NULL;
 185
 186         ret = percpu_ref_init(&objcg->refcnt, obj_cgroup_release, 0,
 187                               GFP_KERNEL);
 188         if (ret) {
 189                 kfree(objcg);
 190                 return NULL;
 191         }
 192         INIT_LIST_HEAD(&objcg->list);
 193         return objcg;
 194 }
 195
 196 static void memcg_reparent_objcgs(struct mem_cgroup *memcg,
 197                                   struct mem_cgroup *parent)
 198 {
 199         struct obj_cgroup *objcg, *iter;
 200
 201         objcg = rcu_replace_pointer(memcg->objcg, NULL, true);
 202
 203         spin_lock_irq(&objcg_lock);
 204
 205         /* 1) Ready to reparent active objcg. */
 206         list_add(&objcg->list, &memcg->objcg_list);
 207         /* 2) Reparent active objcg and already reparented objcgs to parent. */
 208         list_for_each_entry(iter, &memcg->objcg_list, list)
 209                 WRITE_ONCE(iter->memcg, parent);
 210         /* 3) Move already reparented objcgs to the parent's list */
 211         list_splice(&memcg->objcg_list, &parent->objcg_list);
 212
 213         spin_unlock_irq(&objcg_lock);
 214
 215         percpu_ref_kill(&objcg->refcnt);
 216 }
 217
  /*
   * Many calls to the cache allocation functions are expected to be inlined
   * by the compiler. Because the calls to memcg_slab_post_alloc_hook() are
   * conditional on this static branch, modules that call kmem_cache_alloc()
   * and the like need to be able to see this symbol as well.
   */
  DEFINE_STATIC_KEY_FALSE(memcg_kmem_online_key);
  EXPORT_SYMBOL(memcg_kmem_online_key);

  /*
   * Static key, initially false, exported to modules.
   * NOTE(review): presumably enabled when BPF memory accounting is in use;
   * the enabling site is not visible in this part of the file.
   */
  DEFINE_STATIC_KEY_FALSE(memcg_bpf_enabled_key);
  EXPORT_SYMBOL(memcg_bpf_enabled_key);
 229
 230 /**
 231  * mem_cgroup_css_from_folio - css of the memcg associated with a folio
 232  * @folio: folio of interest
 233  *
 234  * If memcg is bound to the default hierarchy, css of the memcg associated
 235  * with @folio is returned.  The returned css remains associated with @folio
 236  * until it is released.
 237  *
 238  * If memcg is bound to a traditional hierarchy, the css of root_mem_cgroup
 239  * is returned.
 240  */
 241 struct cgroup_subsys_state *mem_cgroup_css_from_folio(struct folio *folio)
 242 {
 243         struct mem_cgroup *memcg = folio_memcg(folio);
 244
 245         if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
 246                 memcg = root_mem_cgroup;
 247
 248         return &memcg->css;
 249 }
 250
 251 /**
 252  * page_cgroup_ino - return inode number of the memcg a page is charged to
 253  * @page: the page
 254  *
 255  * Look up the closest online ancestor of the memory cgroup @page is charged to
 256  * and return its inode number or 0 if @page is not charged to any cgroup. It
 257  * is safe to call this function without holding a reference to @page.
 258  *
 259  * Note, this function is inherently racy, because there is nothing to prevent
 260  * the cgroup inode from getting torn down and potentially reallocated a moment
 261  * after page_cgroup_ino() returns, so it only should be used by callers that
 262  * do not care (such as procfs interfaces).
 263  */
 264 ino_t page_cgroup_ino(struct page *page)
 265 {
 266         struct mem_cgroup *memcg;
 267         unsigned long ino = 0;
 268
 269         rcu_read_lock();
 270         /* page_folio() is racy here, but the entire function is racy anyway */
 271         memcg = folio_memcg_check(page_folio(page));
 272
 273         while (memcg && !(memcg->css.flags & CSS_ONLINE))
 274                 memcg = parent_mem_cgroup(memcg);
 275         if (memcg)
 276                 ino = cgroup_ino(memcg->css.cgroup);
 277         rcu_read_unlock();
 278         return ino;
 279 }
 280
  /*
   * Subset of node_stat_item for memcg stats.
   *
   * The position of an item in this array is the compact slot that
   * init_memcg_stats() assigns to it; these slots come first, before the
   * memcg-only items.
   */
  static const unsigned int memcg_node_stat_items[] = {
          NR_INACTIVE_ANON,
          NR_ACTIVE_ANON,
          NR_INACTIVE_FILE,
          NR_ACTIVE_FILE,
          NR_UNEVICTABLE,
          NR_SLAB_RECLAIMABLE_B,
          NR_SLAB_UNRECLAIMABLE_B,
          WORKINGSET_REFAULT_ANON,
          WORKINGSET_REFAULT_FILE,
          WORKINGSET_ACTIVATE_ANON,
          WORKINGSET_ACTIVATE_FILE,
          WORKINGSET_RESTORE_ANON,
          WORKINGSET_RESTORE_FILE,
          WORKINGSET_NODERECLAIM,
          NR_ANON_MAPPED,
          NR_FILE_MAPPED,
          NR_FILE_PAGES,
          NR_FILE_DIRTY,
          NR_WRITEBACK,
          NR_SHMEM,
          NR_SHMEM_THPS,
          NR_FILE_THPS,
          NR_ANON_THPS,
          NR_KERNEL_STACK_KB,
          NR_PAGETABLE,
          NR_SECONDARY_PAGETABLE,
  #ifdef CONFIG_SWAP
          NR_SWAPCACHE,
  #endif
  #ifdef CONFIG_NUMA_BALANCING
          PGPROMOTE_SUCCESS,
  #endif
          PGDEMOTE_KSWAPD,
          PGDEMOTE_DIRECT,
          PGDEMOTE_KHUGEPAGED,
  #ifdef CONFIG_HUGETLB_PAGE
          NR_HUGETLB,
  #endif
  };
 322
  /*
   * memcg-only stat items. init_memcg_stats() gives them the slots that
   * follow the node stat items.
   */
  static const unsigned int memcg_stat_items[] = {
          MEMCG_SWAP,
          MEMCG_SOCK,
          MEMCG_PERCPU_B,
          MEMCG_VMALLOC,
          MEMCG_KMEM,
          MEMCG_ZSWAP_B,
          MEMCG_ZSWAPPED,
  };

  /* Number of slots used by node stat items. */
  #define NR_MEMCG_NODE_STAT_ITEMS ARRAY_SIZE(memcg_node_stat_items)
  /* Total number of per-memcg state slots: node items plus memcg-only items. */
  #define MEMCG_VMSTAT_SIZE (NR_MEMCG_NODE_STAT_ITEMS + \
                             ARRAY_SIZE(memcg_stat_items))
  /*
   * True for a slot that was never assigned: the index tables are pre-filled
   * with U8_MAX before the tracked items get their slots.
   */
  #define BAD_STAT_IDX(index) ((u32)(index) >= U8_MAX)
  /* Stat item number -> compact slot, or U8_MAX when the item is not tracked. */
  static u8 mem_cgroup_stats_index[MEMCG_NR_STAT] __read_mostly;
 338
 339 static void init_memcg_stats(void)
 340 {
 341         u8 i, j = 0;
 342
 343         BUILD_BUG_ON(MEMCG_NR_STAT >= U8_MAX);
 344
 345         memset(mem_cgroup_stats_index, U8_MAX, sizeof(mem_cgroup_stats_index));
 346
 347         for (i = 0; i < NR_MEMCG_NODE_STAT_ITEMS; ++i, ++j)
 348                 mem_cgroup_stats_index[memcg_node_stat_items[i]] = j;
 349
 350         for (i = 0; i < ARRAY_SIZE(memcg_stat_items); ++i, ++j)
 351                 mem_cgroup_stats_index[memcg_stat_items[i]] = j;
 352 }
 353
 354 static inline int memcg_stats_index(int idx)
 355 {
 356         return mem_cgroup_stats_index[idx];
 357 }
 358
  /*
   * Per-CPU lruvec counters. Arrays are indexed by the compact slot returned
   * by memcg_stats_index(); only node stat items have a slot here.
   */
  struct lruvec_stats_percpu {
          /* Local (CPU and cgroup) state */
          long state[NR_MEMCG_NODE_STAT_ITEMS];

          /* Delta calculation for lockless upward propagation */
          long state_prev[NR_MEMCG_NODE_STAT_ITEMS];
  };
 366
  /*
   * Aggregated lruvec counters, read by lruvec_page_state() and
   * lruvec_page_state_local(). Arrays are indexed by the compact slot
   * returned by memcg_stats_index().
   */
  struct lruvec_stats {
          /* Aggregated (CPU and subtree) state */
          long state[NR_MEMCG_NODE_STAT_ITEMS];

          /* Non-hierarchical (CPU aggregated) state */
          long state_local[NR_MEMCG_NODE_STAT_ITEMS];

          /* Pending child counts during tree propagation */
          long state_pending[NR_MEMCG_NODE_STAT_ITEMS];
  };
 377
 378 unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx)
 379 {
 380         struct mem_cgroup_per_node *pn;
 381         long x;
 382         int i;
 383
 384         if (mem_cgroup_disabled())
 385                 return node_page_state(lruvec_pgdat(lruvec), idx);
 386
 387         i = memcg_stats_index(idx);
 388         if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx))
 389                 return 0;
 390
 391         pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
 392         x = READ_ONCE(pn->lruvec_stats->state[i]);
 393 #ifdef CONFIG_SMP
 394         if (x < 0)
 395                 x = 0;
 396 #endif
 397         return x;
 398 }
 399
 400 unsigned long lruvec_page_state_local(struct lruvec *lruvec,
 401                                       enum node_stat_item idx)
 402 {
 403         struct mem_cgroup_per_node *pn;
 404         long x;
 405         int i;
 406
 407         if (mem_cgroup_disabled())
 408                 return node_page_state(lruvec_pgdat(lruvec), idx);
 409
 410         i = memcg_stats_index(idx);
 411         if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx))
 412                 return 0;
 413
 414         pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
 415         x = READ_ONCE(pn->lruvec_stats->state_local[i]);
 416 #ifdef CONFIG_SMP
 417         if (x < 0)
 418                 x = 0;
 419 #endif
 420         return x;
 421 }
 422
  /*
   * Subset of vm_event_item to report for memcg event stats.
   *
   * The position of an event in this array is the compact slot that
   * init_memcg_events() assigns to it.
   */
  static const unsigned int memcg_vm_event_stat[] = {
  #ifdef CONFIG_MEMCG_V1
          PGPGIN,
          PGPGOUT,
  #endif
          PSWPIN,
          PSWPOUT,
          PGSCAN_KSWAPD,
          PGSCAN_DIRECT,
          PGSCAN_KHUGEPAGED,
          PGSTEAL_KSWAPD,
          PGSTEAL_DIRECT,
          PGSTEAL_KHUGEPAGED,
          PGFAULT,
          PGMAJFAULT,
          PGREFILL,
          PGACTIVATE,
          PGDEACTIVATE,
          PGLAZYFREE,
          PGLAZYFREED,
  #ifdef CONFIG_SWAP
          SWPIN_ZERO,
          SWPOUT_ZERO,
  #endif
  #ifdef CONFIG_ZSWAP
          ZSWPIN,
          ZSWPOUT,
          ZSWPWB,
  #endif
  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
          THP_FAULT_ALLOC,
          THP_COLLAPSE_ALLOC,
          THP_SWPOUT,
          THP_SWPOUT_FALLBACK,
  #endif
  #ifdef CONFIG_NUMA_BALANCING
          NUMA_PAGE_MIGRATE,
          NUMA_PTE_UPDATES,
          NUMA_HINT_FAULTS,
  #endif
  };

  /* Number of per-memcg event slots. */
  #define NR_MEMCG_EVENTS ARRAY_SIZE(memcg_vm_event_stat)
  /* vm_event_item -> compact slot, or U8_MAX when the event is not tracked. */
  static u8 mem_cgroup_events_index[NR_VM_EVENT_ITEMS] __read_mostly;
 468
 469 static void init_memcg_events(void)
 470 {
 471         u8 i;
 472
 473         BUILD_BUG_ON(NR_VM_EVENT_ITEMS >= U8_MAX);
 474
 475         memset(mem_cgroup_events_index, U8_MAX,
 476                sizeof(mem_cgroup_events_index));
 477
 478         for (i = 0; i < NR_MEMCG_EVENTS; ++i)
 479                 mem_cgroup_events_index[memcg_vm_event_stat[i]] = i;
 480 }
 481
 482 static inline int memcg_events_index(enum vm_event_item idx)
 483 {
 484         return mem_cgroup_events_index[idx];
 485 }
 486
  /*
   * Per-CPU, per-memcg statistics. state[] is indexed by the slot from
   * memcg_stats_index(), events[] by the slot from memcg_events_index().
   */
  struct memcg_vmstats_percpu {
          /* Stats updates since the last flush */
          unsigned int                    stats_updates;

          /* Cached pointers for fast iteration in memcg_rstat_updated() */
          struct memcg_vmstats_percpu     *parent;
          struct memcg_vmstats            *vmstats;

          /* The above should fit a single cacheline for memcg_rstat_updated() */

          /* Local (CPU and cgroup) page state & events */
          long                    state[MEMCG_VMSTAT_SIZE];
          unsigned long           events[NR_MEMCG_EVENTS];

          /* Delta calculation for lockless upward propagation */
          long                    state_prev[MEMCG_VMSTAT_SIZE];
          unsigned long           events_prev[NR_MEMCG_EVENTS];
  } ____cacheline_aligned;
 505
  /*
   * Aggregated per-memcg statistics, as read by memcg_page_state(),
   * memcg_events() and their _local variants. Indexed like the per-CPU
   * arrays in struct memcg_vmstats_percpu.
   */
  struct memcg_vmstats {
          /* Aggregated (CPU and subtree) page state & events */
          long                    state[MEMCG_VMSTAT_SIZE];
          unsigned long           events[NR_MEMCG_EVENTS];

          /* Non-hierarchical (CPU aggregated) page state & events */
          long                    state_local[MEMCG_VMSTAT_SIZE];
          unsigned long           events_local[NR_MEMCG_EVENTS];

          /* Pending child counts during tree propagation */
          long                    state_pending[MEMCG_VMSTAT_SIZE];
          unsigned long           events_pending[NR_MEMCG_EVENTS];

          /*
           * Stats updates since the last flush; fed in batches by
           * memcg_rstat_updated() and compared against the flush threshold
           * in memcg_vmstats_needs_flush().
           */
          atomic64_t              stats_updates;
  };
 522
  /*
   * memcg and lruvec stats flushing
   *
   * Many codepaths leading to stats update or read are performance sensitive and
   * adding stats flushing in such codepaths is not desirable. So, to optimize the
   * flushing the kernel does:
   *
   * 1) Periodically and asynchronously flush the stats every 2 seconds to not let
   *    rstat update tree grow unbounded.
   *
   * 2) Flush the stats synchronously on reader side only when there are more than
   *    (MEMCG_CHARGE_BATCH * nr_cpus) update events. This optimization lets the
   *    stats be out of sync by at most (MEMCG_CHARGE_BATCH * nr_cpus) updates,
   *    but only for 2 seconds due to (1).
   */
  static void flush_memcg_stats_dwork(struct work_struct *w);
  static DECLARE_DEFERRABLE_WORK(stats_flush_dwork, flush_memcg_stats_dwork);
  /* jiffies_64 value recorded whenever a flush of the root memcg is started. */
  static u64 flush_last_time;

  /* Period of the asynchronous flush: 2 seconds, expressed in jiffies. */
  #define FLUSH_TIME (2UL*HZ)
 543
  /*
   * Accessors to ensure that preemption is disabled on PREEMPT_RT because it
   * can not rely on this as part of an acquired spinlock_t lock. These
   * functions are never used in hardirq context on PREEMPT_RT and therefore
   * disabling preemption is sufficient.
   *
   * This variant additionally carries a debug check that interrupts are
   * disabled.
   */
  static void memcg_stats_lock(void)
  {
          preempt_disable_nested();
          VM_WARN_ON_IRQS_ENABLED();
  }
 555
  /*
   * Same as memcg_stats_lock() but without the interrupts-disabled debug
   * check, for callers that perform their own context checks.
   */
  static void __memcg_stats_lock(void)
  {
          preempt_disable_nested();
  }
 560
  /* Counterpart of memcg_stats_lock() and __memcg_stats_lock(). */
  static void memcg_stats_unlock(void)
  {
          preempt_enable_nested();
  }
 565
 566
 567 static bool memcg_vmstats_needs_flush(struct memcg_vmstats *vmstats)
 568 {
 569         return atomic64_read(&vmstats->stats_updates) >
 570                 MEMCG_CHARGE_BATCH * num_online_cpus();
 571 }
 572
 573 static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val)
 574 {
 575         struct memcg_vmstats_percpu *statc;
 576         int cpu = smp_processor_id();
 577         unsigned int stats_updates;
 578
 579         if (!val)
 580                 return;
 581
 582         cgroup_rstat_updated(memcg->css.cgroup, cpu);
 583         statc = this_cpu_ptr(memcg->vmstats_percpu);
 584         for (; statc; statc = statc->parent) {
 585                 stats_updates = READ_ONCE(statc->stats_updates) + abs(val);
 586                 WRITE_ONCE(statc->stats_updates, stats_updates);
 587                 if (stats_updates < MEMCG_CHARGE_BATCH)
 588                         continue;
 589
 590                 /*
 591                  * If @memcg is already flush-able, increasing stats_updates is
 592                  * redundant. Avoid the overhead of the atomic update.
 593                  */
 594                 if (!memcg_vmstats_needs_flush(statc->vmstats))
 595                         atomic64_add(stats_updates,
 596                                      &statc->vmstats->stats_updates);
 597                 WRITE_ONCE(statc->stats_updates, 0);
 598         }
 599 }
 600
 601 static void __mem_cgroup_flush_stats(struct mem_cgroup *memcg, bool force)
 602 {
 603         bool needs_flush = memcg_vmstats_needs_flush(memcg->vmstats);
 604
 605         trace_memcg_flush_stats(memcg, atomic64_read(&memcg->vmstats->stats_updates),
 606                 force, needs_flush);
 607
 608         if (!force && !needs_flush)
 609                 return;
 610
 611         if (mem_cgroup_is_root(memcg))
 612                 WRITE_ONCE(flush_last_time, jiffies_64);
 613
 614         cgroup_rstat_flush(memcg->css.cgroup);
 615 }
 616
 617 /*
 618  * mem_cgroup_flush_stats - flush the stats of a memory cgroup subtree
 619  * @memcg: root of the subtree to flush
 620  *
 621  * Flushing is serialized by the underlying global rstat lock. There is also a
 622  * minimum amount of work to be done even if there are no stat updates to flush.
 623  * Hence, we only flush the stats if the updates delta exceeds a threshold. This
 624  * avoids unnecessary work and contention on the underlying lock.
 625  */
 626 void mem_cgroup_flush_stats(struct mem_cgroup *memcg)
 627 {
 628         if (mem_cgroup_disabled())
 629                 return;
 630
 631         if (!memcg)
 632                 memcg = root_mem_cgroup;
 633
 634         __mem_cgroup_flush_stats(memcg, false);
 635 }
 636
 637 void mem_cgroup_flush_stats_ratelimited(struct mem_cgroup *memcg)
 638 {
 639         /* Only flush if the periodic flusher is one full cycle late */
 640         if (time_after64(jiffies_64, READ_ONCE(flush_last_time) + 2*FLUSH_TIME))
 641                 mem_cgroup_flush_stats(memcg);
 642 }
 643
  /*
   * Work function of stats_flush_dwork: force a flush starting at the root
   * memcg, then queue itself again on system_unbound_wq to run FLUSH_TIME
   * later.
   */
  static void flush_memcg_stats_dwork(struct work_struct *w)
  {
          /*
           * Deliberately ignore memcg_vmstats_needs_flush() here so that flushing
           * in latency-sensitive paths is as cheap as possible.
           */
          __mem_cgroup_flush_stats(root_mem_cgroup, true);
          queue_delayed_work(system_unbound_wq, &stats_flush_dwork, FLUSH_TIME);
  }
 653
 654 unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx)
 655 {
 656         long x;
 657         int i = memcg_stats_index(idx);
 658
 659         if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx))
 660                 return 0;
 661
 662         x = READ_ONCE(memcg->vmstats->state[i]);
 663 #ifdef CONFIG_SMP
 664         if (x < 0)
 665                 x = 0;
 666 #endif
 667         return x;
 668 }
 669
 670 static int memcg_page_state_unit(int item);
 671
 672 /*
 673  * Normalize the value passed into memcg_rstat_updated() to be in pages. Round
 674  * up non-zero sub-page updates to 1 page as zero page updates are ignored.
 675  */
 676 static int memcg_state_val_in_pages(int idx, int val)
 677 {
 678         int unit = memcg_page_state_unit(idx);
 679
 680         if (!val || unit == PAGE_SIZE)
 681                 return val;
 682         else
 683                 return max(val * unit / PAGE_SIZE, 1UL);
 684 }
 685
 686 /**
 687  * __mod_memcg_state - update cgroup memory statistics
 688  * @memcg: the memory cgroup
 689  * @idx: the stat item - can be enum memcg_stat_item or enum node_stat_item
 690  * @val: delta to add to the counter, can be negative
 691  */
 692 void __mod_memcg_state(struct mem_cgroup *memcg, enum memcg_stat_item idx,
 693                        int val)
 694 {
 695         int i = memcg_stats_index(idx);
 696
 697         if (mem_cgroup_disabled())
 698                 return;
 699
 700         if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx))
 701                 return;
 702
 703         __this_cpu_add(memcg->vmstats_percpu->state[i], val);
 704         val = memcg_state_val_in_pages(idx, val);
 705         memcg_rstat_updated(memcg, val);
 706         trace_mod_memcg_state(memcg, idx, val);
 707 }
 708
 709 /* idx can be of type enum memcg_stat_item or node_stat_item. */
 710 unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx)
 711 {
 712         long x;
 713         int i = memcg_stats_index(idx);
 714
 715         if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx))
 716                 return 0;
 717
 718         x = READ_ONCE(memcg->vmstats->state_local[i]);
 719 #ifdef CONFIG_SMP
 720         if (x < 0)
 721                 x = 0;
 722 #endif
 723         return x;
 724 }
 725
 726 static void __mod_memcg_lruvec_state(struct lruvec *lruvec,
 727                                      enum node_stat_item idx,
 728                                      int val)
 729 {
 730         struct mem_cgroup_per_node *pn;
 731         struct mem_cgroup *memcg;
 732         int i = memcg_stats_index(idx);
 733
 734         if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx))
 735                 return;
 736
 737         pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
 738         memcg = pn->memcg;
 739
 740         /*
 741          * The caller from rmap relies on disabled preemption because they never
 742          * update their counter from in-interrupt context. For these two
 743          * counters we check that the update is never performed from an
 744          * interrupt context while other caller need to have disabled interrupt.
 745          */
 746         __memcg_stats_lock();
 747         if (IS_ENABLED(CONFIG_DEBUG_VM)) {
 748                 switch (idx) {
 749                 case NR_ANON_MAPPED:
 750                 case NR_FILE_MAPPED:
 751                 case NR_ANON_THPS:
 752                         WARN_ON_ONCE(!in_task());
 753                         break;
 754                 default:
 755                         VM_WARN_ON_IRQS_ENABLED();
 756                 }
 757         }
 758
 759         /* Update memcg */
 760         __this_cpu_add(memcg->vmstats_percpu->state[i], val);
 761
 762         /* Update lruvec */
 763         __this_cpu_add(pn->lruvec_stats_percpu->state[i], val);
 764
 765         val = memcg_state_val_in_pages(idx, val);
 766         memcg_rstat_updated(memcg, val);
 767         trace_mod_memcg_lruvec_state(memcg, idx, val);
 768         memcg_stats_unlock();
 769 }
 770
 771 /**
 772  * __mod_lruvec_state - update lruvec memory statistics
 773  * @lruvec: the lruvec
 774  * @idx: the stat item
 775  * @val: delta to add to the counter, can be negative
 776  *
 777  * The lruvec is the intersection of the NUMA node and a cgroup. This
 778  * function updates the all three counters that are affected by a
 779  * change of state at this level: per-node, per-cgroup, per-lruvec.
 780  */
 781 void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
 782                         int val)
 783 {
 784         /* Update node */
 785         __mod_node_page_state(lruvec_pgdat(lruvec), idx, val);
 786
 787         /* Update memcg and lruvec */
 788         if (!mem_cgroup_disabled())
 789                 __mod_memcg_lruvec_state(lruvec, idx, val);
 790 }
 791
 792 void __lruvec_stat_mod_folio(struct folio *folio, enum node_stat_item idx,
 793                              int val)
 794 {
 795         struct mem_cgroup *memcg;
 796         pg_data_t *pgdat = folio_pgdat(folio);
 797         struct lruvec *lruvec;
 798
 799         rcu_read_lock();
 800         memcg = folio_memcg(folio);
 801         /* Untracked pages have no memcg, no lruvec. Update only the node */
 802         if (!memcg) {
 803                 rcu_read_unlock();
 804                 __mod_node_page_state(pgdat, idx, val);
 805                 return;
 806         }
 807
 808         lruvec = mem_cgroup_lruvec(memcg, pgdat);
 809         __mod_lruvec_state(lruvec, idx, val);
 810         rcu_read_unlock();
 811 }
 812 EXPORT_SYMBOL(__lruvec_stat_mod_folio);
 813
 814 void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val)
 815 {
 816         pg_data_t *pgdat = page_pgdat(virt_to_page(p));
 817         struct mem_cgroup *memcg;
 818         struct lruvec *lruvec;
 819
 820         rcu_read_lock();
 821         memcg = mem_cgroup_from_slab_obj(p);
 822
 823         /*
 824          * Untracked pages have no memcg, no lruvec. Update only the
 825          * node. If we reparent the slab objects to the root memcg,
 826          * when we free the slab object, we need to update the per-memcg
 827          * vmstats to keep it correct for the root memcg.
 828          */
 829         if (!memcg) {
 830                 __mod_node_page_state(pgdat, idx, val);
 831         } else {
 832                 lruvec = mem_cgroup_lruvec(memcg, pgdat);
 833                 __mod_lruvec_state(lruvec, idx, val);
 834         }
 835         rcu_read_unlock();
 836 }
 837
 838 /**
 839  * __count_memcg_events - account VM events in a cgroup
 840  * @memcg: the memory cgroup
 841  * @idx: the event item
 842  * @count: the number of events that occurred
 843  */
 844 void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
 845                           unsigned long count)
 846 {
 847         int i = memcg_events_index(idx);
 848
 849         if (mem_cgroup_disabled())
 850                 return;
 851
 852         if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx))
 853                 return;
 854
 855         memcg_stats_lock();
 856         __this_cpu_add(memcg->vmstats_percpu->events[i], count);
 857         memcg_rstat_updated(memcg, count);
 858         trace_count_memcg_events(memcg, idx, count);
 859         memcg_stats_unlock();
 860 }
 861
 862 unsigned long memcg_events(struct mem_cgroup *memcg, int event)
 863 {
 864         int i = memcg_events_index(event);
 865
 866         if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, event))
 867                 return 0;
 868
 869         return READ_ONCE(memcg->vmstats->events[i]);
 870 }
 871
 872 unsigned long memcg_events_local(struct mem_cgroup *memcg, int event)
 873 {
 874         int i = memcg_events_index(event);
 875
 876         if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, event))
 877                 return 0;
 878
 879         return READ_ONCE(memcg->vmstats->events_local[i]);
 880 }
 881
 882 struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
 883 {
 884         /*
 885          * mm_update_next_owner() may clear mm->owner to NULL
 886          * if it races with swapoff, page migration, etc.
 887          * So this can be called with p == NULL.
 888          */
 889         if (unlikely(!p))
 890                 return NULL;
 891
 892         return mem_cgroup_from_css(task_css(p, memory_cgrp_id));
 893 }
 894 EXPORT_SYMBOL(mem_cgroup_from_task);
 895
 896 static __always_inline struct mem_cgroup *active_memcg(void)
 897 {
 898         if (!in_task())
 899                 return this_cpu_read(int_active_memcg);
 900         else
 901                 return current->active_memcg;
 902 }
 903
 904 /**
 905  * get_mem_cgroup_from_mm: Obtain a reference on given mm_struct's memcg.
 906  * @mm: mm from which memcg should be extracted. It can be NULL.
 907  *
 908  * Obtain a reference on mm->memcg and returns it if successful. If mm
 909  * is NULL, then the memcg is chosen as follows:
 910  * 1) The active memcg, if set.
 911  * 2) current->mm->memcg, if available
 912  * 3) root memcg
 913  * If mem_cgroup is disabled, NULL is returned.
 914  */
 915 struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
 916 {
 917         struct mem_cgroup *memcg;
 918
 919         if (mem_cgroup_disabled())
 920                 return NULL;
 921
 922         /*
 923          * Page cache insertions can happen without an
 924          * actual mm context, e.g. during disk probing
 925          * on boot, loopback IO, acct() writes etc.
 926          *
 927          * No need to css_get on root memcg as the reference
 928          * counting is disabled on the root level in the
 929          * cgroup core. See CSS_NO_REF.
 930          */
 931         if (unlikely(!mm)) {
 932                 memcg = active_memcg();
 933                 if (unlikely(memcg)) {
 934                         /* remote memcg must hold a ref */
 935                         css_get(&memcg->css);
 936                         return memcg;
 937                 }
 938                 mm = current->mm;
 939                 if (unlikely(!mm))
 940                         return root_mem_cgroup;
 941         }
 942
 943         rcu_read_lock();
 944         do {
 945                 memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
 946                 if (unlikely(!memcg))
 947                         memcg = root_mem_cgroup;
 948         } while (!css_tryget(&memcg->css));
 949         rcu_read_unlock();
 950         return memcg;
 951 }
 952 EXPORT_SYMBOL(get_mem_cgroup_from_mm);
 953
 954 /**
 955  * get_mem_cgroup_from_current - Obtain a reference on current task's memcg.
 956  */
 957 struct mem_cgroup *get_mem_cgroup_from_current(void)
 958 {
 959         struct mem_cgroup *memcg;
 960
 961         if (mem_cgroup_disabled())
 962                 return NULL;
 963
 964 again:
 965         rcu_read_lock();
 966         memcg = mem_cgroup_from_task(current);
 967         if (!css_tryget(&memcg->css)) {
 968                 rcu_read_unlock();
 969                 goto again;
 970         }
 971         rcu_read_unlock();
 972         return memcg;
 973 }
 974
 975 /**
 976  * get_mem_cgroup_from_folio - Obtain a reference on a given folio's memcg.
 977  * @folio: folio from which memcg should be extracted.
 978  */
 979 struct mem_cgroup *get_mem_cgroup_from_folio(struct folio *folio)
 980 {
 981         struct mem_cgroup *memcg = folio_memcg(folio);
 982
 983         if (mem_cgroup_disabled())
 984                 return NULL;
 985
 986         rcu_read_lock();
 987         if (!memcg || WARN_ON_ONCE(!css_tryget(&memcg->css)))
 988                 memcg = root_mem_cgroup;
 989         rcu_read_unlock();
 990         return memcg;
 991 }
 992
 993 /**
 994  * mem_cgroup_iter - iterate over memory cgroup hierarchy
 995  * @root: hierarchy root
 996  * @prev: previously returned memcg, NULL on first invocation
 997  * @reclaim: cookie for shared reclaim walks, NULL for full walks
 998  *
 999  * Returns references to children of the hierarchy below @root, or
1000  * @root itself, or %NULL after a full round-trip.
1001  *
1002  * Caller must pass the return value in @prev on subsequent
1003  * invocations for reference counting, or use mem_cgroup_iter_break()
1004  * to cancel a hierarchy walk before the round-trip is complete.
1005  *
1006  * Reclaimers can specify a node in @reclaim to divide up the memcgs
1007  * in the hierarchy among all concurrent reclaimers operating on the
1008  * same node.
1009  */
1010 struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
1011                                    struct mem_cgroup *prev,
1012                                    struct mem_cgroup_reclaim_cookie *reclaim)
1013 {
1014         struct mem_cgroup_reclaim_iter *iter;
1015         struct cgroup_subsys_state *css;
1016         struct mem_cgroup *pos;
1017         struct mem_cgroup *next;
1018
1019         if (mem_cgroup_disabled())
1020                 return NULL;
1021
1022         if (!root)
1023                 root = root_mem_cgroup;
1024
1025         rcu_read_lock();
1026 restart:
1027         next = NULL;
1028
1029         if (reclaim) {
1030                 int gen;
1031                 int nid = reclaim->pgdat->node_id;
1032
1033                 iter = &root->nodeinfo[nid]->iter;
1034                 gen = atomic_read(&iter->generation);
1035
1036                 /*
1037                  * On start, join the current reclaim iteration cycle.
1038                  * Exit when a concurrent walker completes it.
1039                  */
1040                 if (!prev)
1041                         reclaim->generation = gen;
1042                 else if (reclaim->generation != gen)
1043                         goto out_unlock;
1044
1045                 pos = READ_ONCE(iter->position);
1046         } else
1047                 pos = prev;
1048
1049         css = pos ? &pos->css : NULL;
1050
1051         while ((css = css_next_descendant_pre(css, &root->css))) {
1052                 /*
1053                  * Verify the css and acquire a reference.  The root
1054                  * is provided by the caller, so we know it's alive
1055                  * and kicking, and don't take an extra reference.
1056                  */
1057                 if (css == &root->css || css_tryget(css))
1058                         break;
1059         }
1060
1061         next = mem_cgroup_from_css(css);
1062
1063         if (reclaim) {
1064                 /*
1065                  * The position could have already been updated by a competing
1066                  * thread, so check that the value hasn't changed since we read
1067                  * it to avoid reclaiming from the same cgroup twice.
1068                  */
1069                 if (cmpxchg(&iter->position, pos, next) != pos) {
1070                         if (css && css != &root->css)
1071                                 css_put(css);
1072                         goto restart;
1073                 }
1074
1075                 if (!next) {
1076                         atomic_inc(&iter->generation);
1077
1078                         /*
1079                          * Reclaimers share the hierarchy walk, and a
1080                          * new one might jump in right at the end of
1081                          * the hierarchy - make sure they see at least
1082                          * one group and restart from the beginning.
1083                          */
1084                         if (!prev)
1085                                 goto restart;
1086                 }
1087         }
1088
1089 out_unlock:
1090         rcu_read_unlock();
1091         if (prev && prev != root)
1092                 css_put(&prev->css);
1093
1094         return next;
1095 }
1096
1097 /**
1098  * mem_cgroup_iter_break - abort a hierarchy walk prematurely
1099  * @root: hierarchy root
1100  * @prev: last visited hierarchy member as returned by mem_cgroup_iter()
1101  */
1102 void mem_cgroup_iter_break(struct mem_cgroup *root,
1103                            struct mem_cgroup *prev)
1104 {
1105         if (!root)
1106                 root = root_mem_cgroup;
1107         if (prev && prev != root)
1108                 css_put(&prev->css);
1109 }
1110
1111 static void __invalidate_reclaim_iterators(struct mem_cgroup *from,
1112                                         struct mem_cgroup *dead_memcg)
1113 {
1114         struct mem_cgroup_reclaim_iter *iter;
1115         struct mem_cgroup_per_node *mz;
1116         int nid;
1117
1118         for_each_node(nid) {
1119                 mz = from->nodeinfo[nid];
1120                 iter = &mz->iter;
1121                 cmpxchg(&iter->position, dead_memcg, NULL);
1122         }
1123 }
1124
1125 static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
1126 {
1127         struct mem_cgroup *memcg = dead_memcg;
1128         struct mem_cgroup *last;
1129
1130         do {
1131                 __invalidate_reclaim_iterators(memcg, dead_memcg);
1132                 last = memcg;
1133         } while ((memcg = parent_mem_cgroup(memcg)));
1134
1135         /*
1136          * When cgroup1 non-hierarchy mode is used,
1137          * parent_mem_cgroup() does not walk all the way up to the
1138          * cgroup root (root_mem_cgroup). So we have to handle
1139          * dead_memcg from cgroup root separately.
1140          */
1141         if (!mem_cgroup_is_root(last))
1142                 __invalidate_reclaim_iterators(root_mem_cgroup,
1143                                                 dead_memcg);
1144 }
1145
1146 /**
1147  * mem_cgroup_scan_tasks - iterate over tasks of a memory cgroup hierarchy
1148  * @memcg: hierarchy root
1149  * @fn: function to call for each task
1150  * @arg: argument passed to @fn
1151  *
1152  * This function iterates over tasks attached to @memcg or to any of its
1153  * descendants and calls @fn for each task. If @fn returns a non-zero
1154  * value, the function breaks the iteration loop. Otherwise, it will iterate
1155  * over all tasks and return 0.
1156  *
1157  * This function must not be called for the root memory cgroup.
1158  */
1159 void mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
1160                            int (*fn)(struct task_struct *, void *), void *arg)
1161 {
1162         struct mem_cgroup *iter;
1163         int ret = 0;
1164
1165         BUG_ON(mem_cgroup_is_root(memcg));
1166
1167         for_each_mem_cgroup_tree(iter, memcg) {
1168                 struct css_task_iter it;
1169                 struct task_struct *task;
1170
1171                 css_task_iter_start(&iter->css, CSS_TASK_ITER_PROCS, &it);
1172                 while (!ret && (task = css_task_iter_next(&it)))
1173                         ret = fn(task, arg);
1174                 css_task_iter_end(&it);
1175                 if (ret) {
1176                         mem_cgroup_iter_break(memcg, iter);
1177                         break;
1178                 }
1179         }
1180 }
1181
#ifdef CONFIG_DEBUG_VM
/*
 * Debug check that @lruvec belongs to @folio's memcg. A folio without a
 * memcg must sit on a root memcg lruvec.
 */
void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio)
{
	struct mem_cgroup *memcg;

	if (mem_cgroup_disabled())
		return;

	memcg = folio_memcg(folio);
	if (memcg) {
		VM_BUG_ON_FOLIO(lruvec_memcg(lruvec) != memcg, folio);
		return;
	}

	VM_BUG_ON_FOLIO(!mem_cgroup_is_root(lruvec_memcg(lruvec)), folio);
}
#endif
1198
1199 /**
1200  * folio_lruvec_lock - Lock the lruvec for a folio.
1201  * @folio: Pointer to the folio.
1202  *
1203  * These functions are safe to use under any of the following conditions:
1204  * - folio locked
1205  * - folio_test_lru false
1206  * - folio frozen (refcount of 0)
1207  *
1208  * Return: The lruvec this folio is on with its lock held.
1209  */
1210 struct lruvec *folio_lruvec_lock(struct folio *folio)
1211 {
1212         struct lruvec *lruvec = folio_lruvec(folio);
1213
1214         spin_lock(&lruvec->lru_lock);
1215         lruvec_memcg_debug(lruvec, folio);
1216
1217         return lruvec;
1218 }
1219
1220 /**
1221  * folio_lruvec_lock_irq - Lock the lruvec for a folio.
1222  * @folio: Pointer to the folio.
1223  *
1224  * These functions are safe to use under any of the following conditions:
1225  * - folio locked
1226  * - folio_test_lru false
1227  * - folio frozen (refcount of 0)
1228  *
1229  * Return: The lruvec this folio is on with its lock held and interrupts
1230  * disabled.
1231  */
1232 struct lruvec *folio_lruvec_lock_irq(struct folio *folio)
1233 {
1234         struct lruvec *lruvec = folio_lruvec(folio);
1235
1236         spin_lock_irq(&lruvec->lru_lock);
1237         lruvec_memcg_debug(lruvec, folio);
1238
1239         return lruvec;
1240 }
1241
1242 /**
1243  * folio_lruvec_lock_irqsave - Lock the lruvec for a folio.
1244  * @folio: Pointer to the folio.
1245  * @flags: Pointer to irqsave flags.
1246  *
1247  * These functions are safe to use under any of the following conditions:
1248  * - folio locked
1249  * - folio_test_lru false
1250  * - folio frozen (refcount of 0)
1251  *
1252  * Return: The lruvec this folio is on with its lock held and interrupts
1253  * disabled.
1254  */
1255 struct lruvec *folio_lruvec_lock_irqsave(struct folio *folio,
1256                 unsigned long *flags)
1257 {
1258         struct lruvec *lruvec = folio_lruvec(folio);
1259
1260         spin_lock_irqsave(&lruvec->lru_lock, *flags);
1261         lruvec_memcg_debug(lruvec, folio);
1262
1263         return lruvec;
1264 }
1265
1266 /**
1267  * mem_cgroup_update_lru_size - account for adding or removing an lru page
1268  * @lruvec: mem_cgroup per zone lru vector
1269  * @lru: index of lru list the page is sitting on
1270  * @zid: zone id of the accounted pages
1271  * @nr_pages: positive when adding or negative when removing
1272  *
1273  * This function must be called under lru_lock, just before a page is added
1274  * to or just after a page is removed from an lru list.
1275  */
1276 void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
1277                                 int zid, int nr_pages)
1278 {
1279         struct mem_cgroup_per_node *mz;
1280         unsigned long *lru_size;
1281         long size;
1282
1283         if (mem_cgroup_disabled())
1284                 return;
1285
1286         mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
1287         lru_size = &mz->lru_zone_size[zid][lru];
1288
1289         if (nr_pages < 0)
1290                 *lru_size += nr_pages;
1291
1292         size = *lru_size;
1293         if (WARN_ONCE(size < 0,
1294                 "%s(%p, %d, %d): lru_size %ld\n",
1295                 __func__, lruvec, lru, nr_pages, size)) {
1296                 VM_BUG_ON(1);
1297                 *lru_size = 0;
1298         }
1299
1300         if (nr_pages > 0)
1301                 *lru_size += nr_pages;
1302 }
1303
1304 /**
1305  * mem_cgroup_margin - calculate chargeable space of a memory cgroup
1306  * @memcg: the memory cgroup
1307  *
1308  * Returns the maximum amount of memory @mem can be charged with, in
1309  * pages.
1310  */
1311 static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
1312 {
1313         unsigned long margin = 0;
1314         unsigned long count;
1315         unsigned long limit;
1316
1317         count = page_counter_read(&memcg->memory);
1318         limit = READ_ONCE(memcg->memory.max);
1319         if (count < limit)
1320                 margin = limit - count;
1321
1322         if (do_memsw_account()) {
1323                 count = page_counter_read(&memcg->memsw);
1324                 limit = READ_ONCE(memcg->memsw.max);
1325                 if (count < limit)
1326                         margin = min(margin, limit - count);
1327                 else
1328                         margin = 0;
1329         }
1330
1331         return margin;
1332 }
1333
1334 struct memory_stat {
1335         const char *name;
1336         unsigned int idx;
1337 };
1338
1339 static const struct memory_stat memory_stats[] = {
1340         { "anon",                       NR_ANON_MAPPED                  },
1341         { "file",                       NR_FILE_PAGES                   },
1342         { "kernel",                     MEMCG_KMEM                      },
1343         { "kernel_stack",               NR_KERNEL_STACK_KB              },
1344         { "pagetables",                 NR_PAGETABLE                    },
1345         { "sec_pagetables",             NR_SECONDARY_PAGETABLE          },
1346         { "percpu",                     MEMCG_PERCPU_B                  },
1347         { "sock",                       MEMCG_SOCK                      },
1348         { "vmalloc",                    MEMCG_VMALLOC                   },
1349         { "shmem",                      NR_SHMEM                        },
1350 #ifdef CONFIG_ZSWAP
1351         { "zswap",                      MEMCG_ZSWAP_B                   },
1352         { "zswapped",                   MEMCG_ZSWAPPED                  },
1353 #endif
1354         { "file_mapped",                NR_FILE_MAPPED                  },
1355         { "file_dirty",                 NR_FILE_DIRTY                   },
1356         { "file_writeback",             NR_WRITEBACK                    },
1357 #ifdef CONFIG_SWAP
1358         { "swapcached",                 NR_SWAPCACHE                    },
1359 #endif
1360 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
1361         { "anon_thp",                   NR_ANON_THPS                    },
1362         { "file_thp",                   NR_FILE_THPS                    },
1363         { "shmem_thp",                  NR_SHMEM_THPS                   },
1364 #endif
1365         { "inactive_anon",              NR_INACTIVE_ANON                },
1366         { "active_anon",                NR_ACTIVE_ANON                  },
1367         { "inactive_file",              NR_INACTIVE_FILE                },
1368         { "active_file",                NR_ACTIVE_FILE                  },
1369         { "unevictable",                NR_UNEVICTABLE                  },
1370         { "slab_reclaimable",           NR_SLAB_RECLAIMABLE_B           },
1371         { "slab_unreclaimable",         NR_SLAB_UNRECLAIMABLE_B         },
1372 #ifdef CONFIG_HUGETLB_PAGE
1373         { "hugetlb",                    NR_HUGETLB                      },
1374 #endif
1375
1376         /* The memory events */
1377         { "workingset_refault_anon",    WORKINGSET_REFAULT_ANON         },
1378         { "workingset_refault_file",    WORKINGSET_REFAULT_FILE         },
1379         { "workingset_activate_anon",   WORKINGSET_ACTIVATE_ANON        },
1380         { "workingset_activate_file",   WORKINGSET_ACTIVATE_FILE        },
1381         { "workingset_restore_anon",    WORKINGSET_RESTORE_ANON         },
1382         { "workingset_restore_file",    WORKINGSET_RESTORE_FILE         },
1383         { "workingset_nodereclaim",     WORKINGSET_NODERECLAIM          },
1384
1385         { "pgdemote_kswapd",            PGDEMOTE_KSWAPD         },
1386         { "pgdemote_direct",            PGDEMOTE_DIRECT         },
1387         { "pgdemote_khugepaged",        PGDEMOTE_KHUGEPAGED     },
1388 #ifdef CONFIG_NUMA_BALANCING
1389         { "pgpromote_success",          PGPROMOTE_SUCCESS       },
1390 #endif
1391 };
1392
1393 /* The actual unit of the state item, not the same as the output unit */
1394 static int memcg_page_state_unit(int item)
1395 {
1396         switch (item) {
1397         case MEMCG_PERCPU_B:
1398         case MEMCG_ZSWAP_B:
1399         case NR_SLAB_RECLAIMABLE_B:
1400         case NR_SLAB_UNRECLAIMABLE_B:
1401                 return 1;
1402         case NR_KERNEL_STACK_KB:
1403                 return SZ_1K;
1404         default:
1405                 return PAGE_SIZE;
1406         }
1407 }
1408
1409 /* Translate stat items to the correct unit for memory.stat output */
1410 static int memcg_page_state_output_unit(int item)
1411 {
1412         /*
1413          * Workingset state is actually in pages, but we export it to userspace
1414          * as a scalar count of events, so special case it here.
1415          *
1416          * Demotion and promotion activities are exported in pages, consistent
1417          * with their global counterparts.
1418          */
1419         switch (item) {
1420         case WORKINGSET_REFAULT_ANON:
1421         case WORKINGSET_REFAULT_FILE:
1422         case WORKINGSET_ACTIVATE_ANON:
1423         case WORKINGSET_ACTIVATE_FILE:
1424         case WORKINGSET_RESTORE_ANON:
1425         case WORKINGSET_RESTORE_FILE:
1426         case WORKINGSET_NODERECLAIM:
1427         case PGDEMOTE_KSWAPD:
1428         case PGDEMOTE_DIRECT:
1429         case PGDEMOTE_KHUGEPAGED:
1430 #ifdef CONFIG_NUMA_BALANCING
1431         case PGPROMOTE_SUCCESS:
1432 #endif
1433                 return 1;
1434         default:
1435                 return memcg_page_state_unit(item);
1436         }
1437 }
1438
/* Hierarchical value of stat @item for @memcg, scaled to its output unit. */
unsigned long memcg_page_state_output(struct mem_cgroup *memcg, int item)
{
	unsigned long raw = memcg_page_state(memcg, item);

	return raw * memcg_page_state_output_unit(item);
}
1444
/* Local value of stat @item for @memcg, scaled to its output unit. */
unsigned long memcg_page_state_local_output(struct mem_cgroup *memcg, int item)
{
	unsigned long raw = memcg_page_state_local(memcg, item);

	return raw * memcg_page_state_output_unit(item);
}
1450
1451 static void memcg_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
1452 {
1453         int i;
1454
1455         /*
1456          * Provide statistics on the state of the memory subsystem as
1457          * well as cumulative event counters that show past behavior.
1458          *
1459          * This list is ordered following a combination of these gradients:
1460          * 1) generic big picture -> specifics and details
1461          * 2) reflecting userspace activity -> reflecting kernel heuristics
1462          *
1463          * Current memory state:
1464          */
1465         mem_cgroup_flush_stats(memcg);
1466
1467         for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
1468                 u64 size;
1469
1470 #ifdef CONFIG_HUGETLB_PAGE
1471                 if (unlikely(memory_stats[i].idx == NR_HUGETLB) &&
1472                     !(cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING))
1473                         continue;
1474 #endif
1475                 size = memcg_page_state_output(memcg, memory_stats[i].idx);
1476                 seq_buf_printf(s, "%s %llu\n", memory_stats[i].name, size);
1477
1478                 if (unlikely(memory_stats[i].idx == NR_SLAB_UNRECLAIMABLE_B)) {
1479                         size += memcg_page_state_output(memcg,
1480                                                         NR_SLAB_RECLAIMABLE_B);
1481                         seq_buf_printf(s, "slab %llu\n", size);
1482                 }
1483         }
1484
1485         /* Accumulated memory events */
1486         seq_buf_printf(s, "pgscan %lu\n",
1487                        memcg_events(memcg, PGSCAN_KSWAPD) +
1488                        memcg_events(memcg, PGSCAN_DIRECT) +
1489                        memcg_events(memcg, PGSCAN_KHUGEPAGED));
1490         seq_buf_printf(s, "pgsteal %lu\n",
1491                        memcg_events(memcg, PGSTEAL_KSWAPD) +
1492                        memcg_events(memcg, PGSTEAL_DIRECT) +
1493                        memcg_events(memcg, PGSTEAL_KHUGEPAGED));
1494
1495         for (i = 0; i < ARRAY_SIZE(memcg_vm_event_stat); i++) {
1496 #ifdef CONFIG_MEMCG_V1
1497                 if (memcg_vm_event_stat[i] == PGPGIN ||
1498                     memcg_vm_event_stat[i] == PGPGOUT)
1499                         continue;
1500 #endif
1501                 seq_buf_printf(s, "%s %lu\n",
1502                                vm_event_name(memcg_vm_event_stat[i]),
1503                                memcg_events(memcg, memcg_vm_event_stat[i]));
1504         }
1505 }
1506
1507 static void memory_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
1508 {
1509         if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
1510                 memcg_stat_format(memcg, s);
1511         else
1512                 memcg1_stat_format(memcg, s);
1513         if (seq_buf_has_overflowed(s))
1514                 pr_warn("%s: Warning, stat buffer overflow, please report\n", __func__);
1515 }
1516
1517 /**
1518  * mem_cgroup_print_oom_context: Print OOM information relevant to
1519  * memory controller.
1520  * @memcg: The memory cgroup that went over limit
1521  * @p: Task that is going to be killed
1522  *
1523  * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is
1524  * enabled
1525  */
1526 void mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *p)
1527 {
1528         rcu_read_lock();
1529
1530         if (memcg) {
1531                 pr_cont(",oom_memcg=");
1532                 pr_cont_cgroup_path(memcg->css.cgroup);
1533         } else
1534                 pr_cont(",global_oom");
1535         if (p) {
1536                 pr_cont(",task_memcg=");
1537                 pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id));
1538         }
1539         rcu_read_unlock();
1540 }
1541
1542 /**
1543  * mem_cgroup_print_oom_meminfo: Print OOM memory information relevant to
1544  * memory controller.
1545  * @memcg: The memory cgroup that went over limit
1546  */
1547 void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg)
1548 {
1549         /* Use static buffer, for the caller is holding oom_lock. */
1550         static char buf[SEQ_BUF_SIZE];
1551         struct seq_buf s;
1552
1553         lockdep_assert_held(&oom_lock);
1554
1555         pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n",
1556                 K((u64)page_counter_read(&memcg->memory)),
1557                 K((u64)READ_ONCE(memcg->memory.max)), memcg->memory.failcnt);
1558         if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
1559                 pr_info("swap: usage %llukB, limit %llukB, failcnt %lu\n",
1560                         K((u64)page_counter_read(&memcg->swap)),
1561                         K((u64)READ_ONCE(memcg->swap.max)), memcg->swap.failcnt);
1562 #ifdef CONFIG_MEMCG_V1
1563         else {
1564                 pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n",
1565                         K((u64)page_counter_read(&memcg->memsw)),
1566                         K((u64)memcg->memsw.max), memcg->memsw.failcnt);
1567                 pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n",
1568                         K((u64)page_counter_read(&memcg->kmem)),
1569                         K((u64)memcg->kmem.max), memcg->kmem.failcnt);
1570         }
1571 #endif
1572
1573         pr_info("Memory cgroup stats for ");
1574         pr_cont_cgroup_path(memcg->css.cgroup);
1575         pr_cont(":");
1576         seq_buf_init(&s, buf, SEQ_BUF_SIZE);
1577         memory_stat_format(memcg, &s);
1578         seq_buf_do_printk(&s, KERN_INFO);
1579 }
1580
1581 /*
1582  * Return the memory (and swap, if configured) limit for a memcg.
1583  */
1584 unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg)
1585 {
1586         unsigned long max = READ_ONCE(memcg->memory.max);
1587
1588         if (do_memsw_account()) {
1589                 if (mem_cgroup_swappiness(memcg)) {
1590                         /* Calculate swap excess capacity from memsw limit */
1591                         unsigned long swap = READ_ONCE(memcg->memsw.max) - max;
1592
1593                         max += min(swap, (unsigned long)total_swap_pages);
1594                 }
1595         } else {
1596                 if (mem_cgroup_swappiness(memcg))
1597                         max += min(READ_ONCE(memcg->swap.max),
1598                                    (unsigned long)total_swap_pages);
1599         }
1600         return max;
1601 }
1602
1603 unsigned long mem_cgroup_size(struct mem_cgroup *memcg)
1604 {
1605         return page_counter_read(&memcg->memory);
1606 }
1607
1608 static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
1609                                      int order)
1610 {
1611         struct oom_control oc = {
1612                 .zonelist = NULL,
1613                 .nodemask = NULL,
1614                 .memcg = memcg,
1615                 .gfp_mask = gfp_mask,
1616                 .order = order,
1617         };
1618         bool ret = true;
1619
1620         if (mutex_lock_killable(&oom_lock))
1621                 return true;
1622
1623         if (mem_cgroup_margin(memcg) >= (1 << order))
1624                 goto unlock;
1625
1626         /*
1627          * A few threads which were not waiting at mutex_lock_killable() can
1628          * fail to bail out. Therefore, check again after holding oom_lock.
1629          */
1630         ret = task_is_dying() || out_of_memory(&oc);
1631
1632 unlock:
1633         mutex_unlock(&oom_lock);
1634         return ret;
1635 }
1636
1637 /*
1638  * Returns true if successfully killed one or more processes. Though in some
1639  * corner cases it can return true even without killing any process.
1640  */
1641 static bool mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
1642 {
1643         bool locked, ret;
1644
1645         if (order > PAGE_ALLOC_COSTLY_ORDER)
1646                 return false;
1647
1648         memcg_memory_event(memcg, MEMCG_OOM);
1649
1650         if (!memcg1_oom_prepare(memcg, &locked))
1651                 return false;
1652
1653         ret = mem_cgroup_out_of_memory(memcg, mask, order);
1654
1655         memcg1_oom_finish(memcg, locked);
1656
1657         return ret;
1658 }
1659
1660 /**
1661  * mem_cgroup_get_oom_group - get a memory cgroup to clean up after OOM
1662  * @victim: task to be killed by the OOM killer
1663  * @oom_domain: memcg in case of memcg OOM, NULL in case of system-wide OOM
1664  *
1665  * Returns a pointer to a memory cgroup, which has to be cleaned up
1666  * by killing all belonging OOM-killable tasks.
1667  *
1668  * Caller has to call mem_cgroup_put() on the returned non-NULL memcg.
1669  */
1670 struct mem_cgroup *mem_cgroup_get_oom_group(struct task_struct *victim,
1671                                             struct mem_cgroup *oom_domain)
1672 {
1673         struct mem_cgroup *oom_group = NULL;
1674         struct mem_cgroup *memcg;
1675
1676         if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
1677                 return NULL;
1678
1679         if (!oom_domain)
1680                 oom_domain = root_mem_cgroup;
1681
1682         rcu_read_lock();
1683
1684         memcg = mem_cgroup_from_task(victim);
1685         if (mem_cgroup_is_root(memcg))
1686                 goto out;
1687
1688         /*
1689          * If the victim task has been asynchronously moved to a different
1690          * memory cgroup, we might end up killing tasks outside oom_domain.
1691          * In this case it's better to ignore memory.group.oom.
1692          */
1693         if (unlikely(!mem_cgroup_is_descendant(memcg, oom_domain)))
1694                 goto out;
1695
1696         /*
1697          * Traverse the memory cgroup hierarchy from the victim task's
1698          * cgroup up to the OOMing cgroup (or root) to find the
1699          * highest-level memory cgroup with oom.group set.
1700          */
1701         for (; memcg; memcg = parent_mem_cgroup(memcg)) {
1702                 if (READ_ONCE(memcg->oom_group))
1703                         oom_group = memcg;
1704
1705                 if (memcg == oom_domain)
1706                         break;
1707         }
1708
1709         if (oom_group)
1710                 css_get(&oom_group->css);
1711 out:
1712         rcu_read_unlock();
1713
1714         return oom_group;
1715 }
1716
1717 void mem_cgroup_print_oom_group(struct mem_cgroup *memcg)
1718 {
1719         pr_info("Tasks in ");
1720         pr_cont_cgroup_path(memcg->css.cgroup);
1721         pr_cont(" are going to be killed due to memory.oom.group set\n");
1722 }
1723
/*
 * Per-CPU cache ("stock") of charges. The page half (cached/nr_pages) is
 * consumed by consume_stock(), topped up by __refill_stock() and handed
 * back to the page counters by drain_stock().
 */
struct memcg_stock_pcp {
        /* Taken with local_lock_irqsave() around accesses to the local stock. */
        local_lock_t stock_lock;
        /*
         * memcg the stocked pages belong to, or NULL when nothing is cached;
         * a css reference is held while set (see __refill_stock() and
         * drain_stock()). This is never the root cgroup.
         */
        struct mem_cgroup *cached; /* this never be root cgroup */
        /* Number of pages already charged to @cached but not yet used. */
        unsigned int nr_pages;

        /*
         * Object-level stock state. It is handled by drain_obj_stock() and
         * obj_stock_flush_required(), whose bodies are outside this part of
         * the file; presumably byte-granular charges and pending slab stat
         * deltas for @cached_objcg on node @cached_pgdat -- confirm there.
         */
        struct obj_cgroup *cached_objcg;
        struct pglist_data *cached_pgdat;
        unsigned int nr_bytes;
        int nr_slab_reclaimable_b;
        int nr_slab_unreclaimable_b;

        /* Work item drain_all_stock() queues on remote CPUs to drain them. */
        struct work_struct work;
        /* Bit field; only bit FLUSHING_CACHED_CHARGE is used in this section. */
        unsigned long flags;
        /*
         * Set by drain_all_stock() before draining/queueing, cleared by
         * drain_local_stock() once the stock has been drained.
         */
#define FLUSHING_CACHED_CHARGE  0
};
/*
 * The per-CPU stock instances. Only stock_lock is explicitly initialised
 * here; the remaining fields of this static object start out zeroed.
 */
static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock) = {
        .stock_lock = INIT_LOCAL_LOCK(stock_lock),
};
/*
 * Taken with mutex_trylock() in drain_all_stock() so that a second
 * system-wide drain backs off instead of queueing more work.
 */
static DEFINE_MUTEX(percpu_charge_mutex);

/* Object-stock helpers defined later in the file but needed by the drain paths. */
static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock);
static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
                                     struct mem_cgroup *root_memcg);
1747
1748 /**
1749  * consume_stock: Try to consume stocked charge on this cpu.
1750  * @memcg: memcg to consume from.
1751  * @nr_pages: how many pages to charge.
1752  *
1753  * The charges will only happen if @memcg matches the current cpu's memcg
1754  * stock, and at least @nr_pages are available in that stock.  Failure to
1755  * service an allocation will refill the stock.
1756  *
1757  * returns true if successful, false otherwise.
1758  */
1759 static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
1760 {
1761         struct memcg_stock_pcp *stock;
1762         unsigned int stock_pages;
1763         unsigned long flags;
1764         bool ret = false;
1765
1766         if (nr_pages > MEMCG_CHARGE_BATCH)
1767                 return ret;
1768
1769         local_lock_irqsave(&memcg_stock.stock_lock, flags);
1770
1771         stock = this_cpu_ptr(&memcg_stock);
1772         stock_pages = READ_ONCE(stock->nr_pages);
1773         if (memcg == READ_ONCE(stock->cached) && stock_pages >= nr_pages) {
1774                 WRITE_ONCE(stock->nr_pages, stock_pages - nr_pages);
1775                 ret = true;
1776         }
1777
1778         local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
1779
1780         return ret;
1781 }
1782
1783 /*
1784  * Returns stocks cached in percpu and reset cached information.
1785  */
1786 static void drain_stock(struct memcg_stock_pcp *stock)
1787 {
1788         unsigned int stock_pages = READ_ONCE(stock->nr_pages);
1789         struct mem_cgroup *old = READ_ONCE(stock->cached);
1790
1791         if (!old)
1792                 return;
1793
1794         if (stock_pages) {
1795                 page_counter_uncharge(&old->memory, stock_pages);
1796                 if (do_memsw_account())
1797                         page_counter_uncharge(&old->memsw, stock_pages);
1798
1799                 WRITE_ONCE(stock->nr_pages, 0);
1800         }
1801
1802         css_put(&old->css);
1803         WRITE_ONCE(stock->cached, NULL);
1804 }
1805
1806 static void drain_local_stock(struct work_struct *dummy)
1807 {
1808         struct memcg_stock_pcp *stock;
1809         struct obj_cgroup *old = NULL;
1810         unsigned long flags;
1811
1812         /*
1813          * The only protection from cpu hotplug (memcg_hotplug_cpu_dead) vs.
1814          * drain_stock races is that we always operate on local CPU stock
1815          * here with IRQ disabled
1816          */
1817         local_lock_irqsave(&memcg_stock.stock_lock, flags);
1818
1819         stock = this_cpu_ptr(&memcg_stock);
1820         old = drain_obj_stock(stock);
1821         drain_stock(stock);
1822         clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
1823
1824         local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
1825         obj_cgroup_put(old);
1826 }
1827
1828 /*
1829  * Cache charges(val) to local per_cpu area.
1830  * This will be consumed by consume_stock() function, later.
1831  */
1832 static void __refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
1833 {
1834         struct memcg_stock_pcp *stock;
1835         unsigned int stock_pages;
1836
1837         stock = this_cpu_ptr(&memcg_stock);
1838         if (READ_ONCE(stock->cached) != memcg) { /* reset if necessary */
1839                 drain_stock(stock);
1840                 css_get(&memcg->css);
1841                 WRITE_ONCE(stock->cached, memcg);
1842         }
1843         stock_pages = READ_ONCE(stock->nr_pages) + nr_pages;
1844         WRITE_ONCE(stock->nr_pages, stock_pages);
1845
1846         if (stock_pages > MEMCG_CHARGE_BATCH)
1847                 drain_stock(stock);
1848 }
1849
1850 static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
1851 {
1852         unsigned long flags;
1853
1854         local_lock_irqsave(&memcg_stock.stock_lock, flags);
1855         __refill_stock(memcg, nr_pages);
1856         local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
1857 }
1858
1859 /*
1860  * Drains all per-CPU charge caches for given root_memcg resp. subtree
1861  * of the hierarchy under it.
1862  */
1863 void drain_all_stock(struct mem_cgroup *root_memcg)
1864 {
1865         int cpu, curcpu;
1866
1867         /* If someone's already draining, avoid adding running more workers. */
1868         if (!mutex_trylock(&percpu_charge_mutex))
1869                 return;
1870         /*
1871          * Notify other cpus that system-wide "drain" is running
1872          * We do not care about races with the cpu hotplug because cpu down
1873          * as well as workers from this path always operate on the local
1874          * per-cpu data. CPU up doesn't touch memcg_stock at all.
1875          */
1876         migrate_disable();
1877         curcpu = smp_processor_id();
1878         for_each_online_cpu(cpu) {
1879                 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
1880                 struct mem_cgroup *memcg;
1881                 bool flush = false;
1882
1883                 rcu_read_lock();
1884                 memcg = READ_ONCE(stock->cached);
1885                 if (memcg && READ_ONCE(stock->nr_pages) &&
1886                     mem_cgroup_is_descendant(memcg, root_memcg))
1887                         flush = true;
1888                 else if (obj_stock_flush_required(stock, root_memcg))
1889                         flush = true;
1890                 rcu_read_unlock();
1891
1892                 if (flush &&
1893                     !test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
1894                         if (cpu == curcpu)
1895                                 drain_local_stock(&stock->work);
1896                         else if (!cpu_is_isolated(cpu))
1897                                 schedule_work_on(cpu, &stock->work);
1898                 }
1899         }
1900         migrate_enable();
1901         mutex_unlock(&percpu_charge_mutex);
1902 }
1903
1904 static int memcg_hotplug_cpu_dead(unsigned int cpu)
1905 {
1906         struct memcg_stock_pcp *stock;
1907
1908         stock = &per_cpu(memcg_stock, cpu);
1909         drain_stock(stock);
1910
1911         return 0;
1912 }
1913
1914 static unsigned long reclaim_high(struct mem_cgroup *memcg,
1915                                   unsigned int nr_pages,
1916                                   gfp_t gfp_mask)
1917 {
1918         unsigned long nr_reclaimed = 0;
1919
1920         do {
1921                 unsigned long pflags;
1922
1923                 if (page_counter_read(&memcg->memory) <=
1924                     READ_ONCE(memcg->memory.high))
1925                         continue;
1926
1927                 memcg_memory_event(memcg, MEMCG_HIGH);
1928
1929                 psi_memstall_enter(&pflags);
1930                 nr_reclaimed += try_to_free_mem_cgroup_pages(memcg, nr_pages,
1931                                                         gfp_mask,
1932                                                         MEMCG_RECLAIM_MAY_SWAP,
1933                                                         NULL);
1934                 psi_memstall_leave(&pflags);
1935         } while ((memcg = parent_mem_cgroup(memcg)) &&
1936                  !mem_cgroup_is_root(memcg));
1937
1938         return nr_reclaimed;
1939 }
1940
1941 static void high_work_func(struct work_struct *work)
1942 {
1943         struct mem_cgroup *memcg;
1944
1945         memcg = container_of(work, struct mem_cgroup, high_work);
1946         reclaim_high(memcg, MEMCG_CHARGE_BATCH, GFP_KERNEL);
1947 }
1948
/*
 * Clamp the maximum sleep time per allocation batch to 2 seconds. This is
 * enough to still cause a significant slowdown in most cases, while still
 * allowing diagnostics and tracing to proceed without becoming stuck.
 *
 * The value is in jiffies (2 * HZ). mem_cgroup_handle_over_high() applies
 * it with min() to the sum of the memory and swap penalties.
 */
#define MEMCG_MAX_HIGH_DELAY_JIFFIES (2UL*HZ)
1955
/*
 * When calculating the delay, we use these either side of the exponentiation to
 * maintain precision and scale to a reasonable number of jiffies (see the table
 * below).
 *
 * - MEMCG_DELAY_PRECISION_SHIFT: Extra precision bits while translating the
 *   overage ratio to a delay. calculate_overage() shifts the overage left by
 *   this amount before dividing by the high limit, and
 *   calculate_high_delay() shifts the squared result right by it again.
 * - MEMCG_DELAY_SCALING_SHIFT: The number of bits to scale down the
 *   proposed penalty in order to reduce to a reasonable number of jiffies, and
 *   to produce a reasonable delay curve.
 *
 * MEMCG_DELAY_SCALING_SHIFT just happens to be a number that produces a
 * reasonable delay curve compared to precision-adjusted overage, not
 * penalising heavily at first, but still making sure that growth beyond the
 * limit penalises misbehaving cgroups by slowing them down exponentially. For
 * example, with a high of 100 megabytes:
 *
 *  +-------+------------------------+
 *  | usage | time to allocate in ms |
 *  +-------+------------------------+
 *  | 100M  |                      0 |
 *  | 101M  |                      6 |
 *  | 102M  |                     25 |
 *  | 103M  |                     57 |
 *  | 104M  |                    102 |
 *  | 105M  |                    159 |
 *  | 106M  |                    230 |
 *  | 107M  |                    313 |
 *  | 108M  |                    409 |
 *  | 109M  |                    518 |
 *  | 110M  |                    639 |
 *  | 111M  |                    774 |
 *  | 112M  |                    921 |
 *  | 113M  |                   1081 |
 *  | 114M  |                   1254 |
 *  | 115M  |                   1439 |
 *  | 116M  |                   1638 |
 *  | 117M  |                   1849 |
 *  | 118M  |                   2000 |
 *  | 119M  |                   2000 |
 *  | 120M  |                   2000 |
 *  +-------+------------------------+
 */
 #define MEMCG_DELAY_PRECISION_SHIFT 20
 #define MEMCG_DELAY_SCALING_SHIFT 14
2001
2002 static u64 calculate_overage(unsigned long usage, unsigned long high)
2003 {
2004         u64 overage;
2005
2006         if (usage <= high)
2007                 return 0;
2008
2009         /*
2010          * Prevent division by 0 in overage calculation by acting as if
2011          * it was a threshold of 1 page
2012          */
2013         high = max(high, 1UL);
2014
2015         overage = usage - high;
2016         overage <<= MEMCG_DELAY_PRECISION_SHIFT;
2017         return div64_u64(overage, high);
2018 }
2019
2020 static u64 mem_find_max_overage(struct mem_cgroup *memcg)
2021 {
2022         u64 overage, max_overage = 0;
2023
2024         do {
2025                 overage = calculate_overage(page_counter_read(&memcg->memory),
2026                                             READ_ONCE(memcg->memory.high));
2027                 max_overage = max(overage, max_overage);
2028         } while ((memcg = parent_mem_cgroup(memcg)) &&
2029                  !mem_cgroup_is_root(memcg));
2030
2031         return max_overage;
2032 }
2033
2034 static u64 swap_find_max_overage(struct mem_cgroup *memcg)
2035 {
2036         u64 overage, max_overage = 0;
2037
2038         do {
2039                 overage = calculate_overage(page_counter_read(&memcg->swap),
2040                                             READ_ONCE(memcg->swap.high));
2041                 if (overage)
2042                         memcg_memory_event(memcg, MEMCG_SWAP_HIGH);
2043                 max_overage = max(overage, max_overage);
2044         } while ((memcg = parent_mem_cgroup(memcg)) &&
2045                  !mem_cgroup_is_root(memcg));
2046
2047         return max_overage;
2048 }
2049
2050 /*
2051  * Get the number of jiffies that we should penalise a mischievous cgroup which
2052  * is exceeding its memory.high by checking both it and its ancestors.
2053  */
2054 static unsigned long calculate_high_delay(struct mem_cgroup *memcg,
2055                                           unsigned int nr_pages,
2056                                           u64 max_overage)
2057 {
2058         unsigned long penalty_jiffies;
2059
2060         if (!max_overage)
2061                 return 0;
2062
2063         /*
2064          * We use overage compared to memory.high to calculate the number of
2065          * jiffies to sleep (penalty_jiffies). Ideally this value should be
2066          * fairly lenient on small overages, and increasingly harsh when the
2067          * memcg in question makes it clear that it has no intention of stopping
2068          * its crazy behaviour, so we exponentially increase the delay based on
2069          * overage amount.
2070          */
2071         penalty_jiffies = max_overage * max_overage * HZ;
2072         penalty_jiffies >>= MEMCG_DELAY_PRECISION_SHIFT;
2073         penalty_jiffies >>= MEMCG_DELAY_SCALING_SHIFT;
2074
2075         /*
2076          * Factor in the task's own contribution to the overage, such that four
2077          * N-sized allocations are throttled approximately the same as one
2078          * 4N-sized allocation.
2079          *
2080          * MEMCG_CHARGE_BATCH pages is nominal, so work out how much smaller or
2081          * larger the current charge patch is than that.
2082          */
2083         return penalty_jiffies * nr_pages / MEMCG_CHARGE_BATCH;
2084 }
2085
/*
 * Reclaims memory over the high limit. Called directly from
 * try_charge_memcg() (context permitting), as well as from the userland
 * return path where reclaim is always able to block.
 *
 * @gfp_mask: allocation flags handed on to reclaim_high().
 *
 * Works on the pages the current task has accumulated in
 * current->memcg_nr_pages_over_high; returns immediately when that is 0.
 * After reclaim, if the hierarchy is still sufficiently over its high
 * limits and reclaim stopped making progress, the task is put to sleep
 * for a penalty period of at most MEMCG_MAX_HIGH_DELAY_JIFFIES.
 */
void mem_cgroup_handle_over_high(gfp_t gfp_mask)
{
        unsigned long penalty_jiffies;
        unsigned long pflags;
        unsigned long nr_reclaimed;
        unsigned int nr_pages = current->memcg_nr_pages_over_high;
        int nr_retries = MAX_RECLAIM_RETRIES;
        struct mem_cgroup *memcg;
        bool in_retry = false;

        if (likely(!nr_pages))
                return;

        /* The reference taken here is dropped at the "out" label. */
        memcg = get_mem_cgroup_from_mm(current->mm);
        /* The debt is being handled now; start accumulating afresh. */
        current->memcg_nr_pages_over_high = 0;

retry_reclaim:
        /*
         * Bail if the task is already exiting. Unlike memory.max,
         * memory.high enforcement isn't as strict, and there is no
         * OOM killer involved, which means the excess could already
         * be much bigger (and still growing) than it could for
         * memory.max; the dying task could get stuck in fruitless
         * reclaim for a long time, which isn't desirable.
         */
        if (task_is_dying())
                goto out;

        /*
         * The allocating task should reclaim at least the batch size, but for
         * subsequent retries we only want to do what's necessary to prevent oom
         * or breaching resource isolation.
         *
         * This is distinct from memory.max or page allocator behaviour because
         * memory.high is currently batched, whereas memory.max and the page
         * allocator run every time an allocation is made.
         */
        nr_reclaimed = reclaim_high(memcg,
                                    in_retry ? SWAP_CLUSTER_MAX : nr_pages,
                                    gfp_mask);

        /*
         * memory.high is breached and reclaim is unable to keep up. Throttle
         * allocators proactively to slow down excessive growth.
         *
         * The memory and swap penalties are computed separately from the
         * worst overage in the hierarchy and then added together.
         */
        penalty_jiffies = calculate_high_delay(memcg, nr_pages,
                                               mem_find_max_overage(memcg));

        penalty_jiffies += calculate_high_delay(memcg, nr_pages,
                                                swap_find_max_overage(memcg));

        /*
         * Clamp the max delay per usermode return so as to still keep the
         * application moving forwards and also permit diagnostics, albeit
         * extremely slowly.
         */
        penalty_jiffies = min(penalty_jiffies, MEMCG_MAX_HIGH_DELAY_JIFFIES);

        /*
         * Don't sleep if the amount of jiffies this memcg owes us is so low
         * that it's not even worth doing, in an attempt to be nice to those who
         * go only a small amount over their memory.high value and maybe haven't
         * been aggressively reclaimed enough yet.
         */
        if (penalty_jiffies <= HZ / 100)
                goto out;

        /*
         * If reclaim is making forward progress but we're still over
         * memory.high, we want to encourage that rather than doing allocator
         * throttling.
         *
         * nr_retries is only consumed by rounds that reclaimed nothing, so
         * the loop ends after MAX_RECLAIM_RETRIES fruitless retries.
         */
        if (nr_reclaimed || nr_retries--) {
                in_retry = true;
                goto retry_reclaim;
        }

        /*
         * Reclaim didn't manage to push usage below the limit, slow
         * this allocating task down.
         *
         * If we exit early, we're guaranteed to die (since
         * schedule_timeout_killable sets TASK_KILLABLE). This means we don't
         * need to account for any ill-begotten jiffies to pay them off later.
         */
        psi_memstall_enter(&pflags);
        schedule_timeout_killable(penalty_jiffies);
        psi_memstall_leave(&pflags);

out:
        css_put(&memcg->css);
}
2183
/*
 * try_charge_memcg - charge @nr_pages to @memcg, reclaiming if needed.
 * @memcg: memcg to charge.
 * @gfp_mask: allocation flags; they decide whether blocking reclaim is
 *            allowed and how failure is treated (__GFP_NORETRY,
 *            __GFP_RETRY_MAYFAIL, __GFP_NOFAIL, __GFP_HIGH).
 * @nr_pages: number of pages to charge.
 *
 * The charge is first attempted from the per-CPU stock, then against the
 * page counters with a batch of at least MEMCG_CHARGE_BATCH pages (the
 * surplus is stocked), then with exactly @nr_pages. If that fails as well
 * the function reclaims, drains the per-CPU stocks, retries and finally
 * invokes the memcg OOM handling.
 *
 * Returns 0 when the pages were charged -- including the "force" case in
 * which the counters are charged past the limit -- or -ENOMEM.
 */
int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
                     unsigned int nr_pages)
{
        unsigned int batch = max(MEMCG_CHARGE_BATCH, nr_pages);
        int nr_retries = MAX_RECLAIM_RETRIES;
        struct mem_cgroup *mem_over_limit;
        struct page_counter *counter;
        unsigned long nr_reclaimed;
        bool passed_oom = false;
        unsigned int reclaim_options = MEMCG_RECLAIM_MAY_SWAP;
        bool drained = false;
        bool raised_max_event = false;
        unsigned long pflags;

retry:
        /* Fast path: serve the charge from this CPU's stock. */
        if (consume_stock(memcg, nr_pages))
                return 0;

        /*
         * Charge memsw first (when it is accounted), then memory. If the
         * memory charge fails the memsw charge is rolled back. Either way
         * mem_over_limit ends up as the memcg owning the counter that
         * refused the charge.
         */
        if (!do_memsw_account() ||
            page_counter_try_charge(&memcg->memsw, batch, &counter)) {
                if (page_counter_try_charge(&memcg->memory, batch, &counter))
                        goto done_restock;
                if (do_memsw_account())
                        page_counter_uncharge(&memcg->memsw, batch);
                mem_over_limit = mem_cgroup_from_counter(counter, memory);
        } else {
                /* memsw refused the charge: reclaim without MAY_SWAP. */
                mem_over_limit = mem_cgroup_from_counter(counter, memsw);
                reclaim_options &= ~MEMCG_RECLAIM_MAY_SWAP;
        }

        /*
         * The batched charge failed; retry with exactly @nr_pages before
         * resorting to reclaim.
         */
        if (batch > nr_pages) {
                batch = nr_pages;
                goto retry;
        }

        /*
         * Prevent unbounded recursion when reclaim operations need to
         * allocate memory. This might exceed the limits temporarily,
         * but we prefer facilitating memory reclaim and getting back
         * under the limit over triggering OOM kills in these cases.
         */
        if (unlikely(current->flags & PF_MEMALLOC))
                goto force;

        if (unlikely(task_in_memcg_oom(current)))
                goto nomem;

        /* Reclaim below needs to be able to block. */
        if (!gfpflags_allow_blocking(gfp_mask))
                goto nomem;

        memcg_memory_event(mem_over_limit, MEMCG_MAX);
        raised_max_event = true;

        psi_memstall_enter(&pflags);
        nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
                                                    gfp_mask, reclaim_options, NULL);
        psi_memstall_leave(&pflags);

        /* Enough room now below the limit: try the charge again. */
        if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
                goto retry;

        /* Drain the per-CPU stocks of the over-limit subtree, but only once. */
        if (!drained) {
                drain_all_stock(mem_over_limit);
                drained = true;
                goto retry;
        }

        if (gfp_mask & __GFP_NORETRY)
                goto nomem;
        /*
         * Even though the limit is exceeded at this point, reclaim
         * may have been able to free some pages.  Retry the charge
         * before killing the task.
         *
         * Only for regular pages, though: huge pages are rather
         * unlikely to succeed so close to the limit, and we fall back
         * to regular pages anyway in case of failure.
         */
        if (nr_reclaimed && nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER))
                goto retry;

        if (nr_retries--)
                goto retry;

        if (gfp_mask & __GFP_RETRY_MAYFAIL)
                goto nomem;

        /* Avoid endless loop for tasks bypassed by the oom killer */
        if (passed_oom && task_is_dying())
                goto nomem;

        /*
         * keep retrying as long as the memcg oom killer is able to make
         * a forward progress or bypass the charge if the oom killer
         * couldn't make any progress.
         */
        if (mem_cgroup_oom(mem_over_limit, gfp_mask,
                           get_order(nr_pages * PAGE_SIZE))) {
                passed_oom = true;
                nr_retries = MAX_RECLAIM_RETRIES;
                goto retry;
        }
nomem:
        /*
         * Memcg doesn't have a dedicated reserve for atomic
         * allocations. But like the global atomic pool, we need to
         * put the burden of reclaim on regular allocation requests
         * and let these go through as privileged allocations.
         */
        if (!(gfp_mask & (__GFP_NOFAIL | __GFP_HIGH)))
                return -ENOMEM;
force:
        /*
         * If the allocation has to be enforced, don't forget to raise
         * a MEMCG_MAX event.
         */
        if (!raised_max_event)
                memcg_memory_event(mem_over_limit, MEMCG_MAX);

        /*
         * The allocation either can't fail or will lead to more memory
         * being freed very soon.  Allow memory usage go over the limit
         * temporarily by force charging it.
         */
        page_counter_charge(&memcg->memory, nr_pages);
        if (do_memsw_account())
                page_counter_charge(&memcg->memsw, nr_pages);

        return 0;

done_restock:
        /* @batch pages were charged; park the surplus in the per-CPU stock. */
        if (batch > nr_pages)
                refill_stock(memcg, batch - nr_pages);

        /*
         * If the hierarchy is above the normal consumption range, schedule
         * reclaim on returning to userland.  We can perform reclaim here
         * if __GFP_RECLAIM but let's always punt for simplicity and so that
         * GFP_KERNEL can consistently be used during reclaim.  @memcg is
         * not recorded as it most likely matches current's and won't
         * change in the meantime.  As high limit is checked again before
         * reclaim, the cost of mismatch is negligible.
         */
        do {
                bool mem_high, swap_high;

                mem_high = page_counter_read(&memcg->memory) >
                        READ_ONCE(memcg->memory.high);
                swap_high = page_counter_read(&memcg->swap) >
                        READ_ONCE(memcg->swap.high);

                /* Don't bother a random interrupted task */
                if (!in_task()) {
                        if (mem_high) {
                                schedule_work(&memcg->high_work);
                                break;
                        }
                        /* Not over memory.high at this level: check the parent. */
                        continue;
                }

                if (mem_high || swap_high) {
                        /*
                         * The allocating tasks in this cgroup will need to do
                         * reclaim or be throttled to prevent further growth
                         * of the memory or swap footprints.
                         *
                         * Target some best-effort fairness between the tasks,
                         * and distribute reclaim work and delay penalties
                         * based on how much each task is actually allocating.
                         */
                        current->memcg_nr_pages_over_high += batch;
                        set_notify_resume(current);
                        break;
                }
        } while ((memcg = parent_mem_cgroup(memcg)));

        /*
         * Reclaim is set up above to be called from the userland
         * return path. But also attempt synchronous reclaim to avoid
         * excessive overrun while the task is still inside the
         * kernel. If this is successful, the return path will see it
         * when it rechecks the overage and simply bail out.
         */
        if (current->memcg_nr_pages_over_high > MEMCG_CHARGE_BATCH &&
            !(current->flags & PF_MEMALLOC) &&
            gfpflags_allow_blocking(gfp_mask))
                mem_cgroup_handle_over_high(gfp_mask);
        return 0;
}
2373
2374 /**
2375  * mem_cgroup_cancel_charge() - cancel an uncommitted try_charge() call.
2376  * @memcg: memcg previously charged.
2377  * @nr_pages: number of pages previously charged.
2378  */
2379 void mem_cgroup_cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
2380 {
2381         if (mem_cgroup_is_root(memcg))
2382                 return;
2383
2384         page_counter_uncharge(&memcg->memory, nr_pages);
2385         if (do_memsw_account())
2386                 page_counter_uncharge(&memcg->memsw, nr_pages);
2387 }
2388
2389 static void commit_charge(struct folio *folio, struct mem_cgroup *memcg)
2390 {
2391         VM_BUG_ON_FOLIO(folio_memcg_charged(folio), folio);
2392         /*
2393          * Any of the following ensures page's memcg stability:
2394          *
2395          * - the page lock
2396          * - LRU isolation
2397          * - exclusive reference
2398          */
2399         folio->memcg_data = (unsigned long)memcg;
2400 }
2401
/**
 * mem_cgroup_commit_charge - commit a previously successful try_charge().
 * @folio: folio to commit the charge to.
 * @memcg: memcg previously charged.
 *
 * Takes a css reference on @memcg, records @memcg as the owner of @folio
 * via commit_charge() and then runs the cgroup v1 hook
 * memcg1_commit_charge().
 */
void mem_cgroup_commit_charge(struct folio *folio, struct mem_cgroup *memcg)
{
        /* Reference taken before the folio starts pointing at @memcg. */
        css_get(&memcg->css);
        commit_charge(folio, memcg);
        memcg1_commit_charge(folio, memcg);
}
2413
2414 static inline void __mod_objcg_mlstate(struct obj_cgroup *objcg,
2415                                        struct pglist_data *pgdat,
2416                                        enum node_stat_item idx, int nr)
2417 {
2418         struct mem_cgroup *memcg;
2419         struct lruvec *lruvec;
2420
2421         rcu_read_lock();
2422         memcg = obj_cgroup_memcg(objcg);
2423         lruvec = mem_cgroup_lruvec(memcg, pgdat);
2424         __mod_memcg_lruvec_state(lruvec, idx, nr);
2425         rcu_read_unlock();
2426 }
2427
2428 static __always_inline
2429 struct mem_cgroup *mem_cgroup_from_obj_folio(struct folio *folio, void *p)
2430 {
2431         /*
2432          * Slab objects are accounted individually, not per-page.
2433          * Memcg membership data for each individual object is saved in
2434          * slab->obj_exts.
2435          */
2436         if (folio_test_slab(folio)) {
2437                 struct slabobj_ext *obj_exts;
2438                 struct slab *slab;
2439                 unsigned int off;
2440
2441                 slab = folio_slab(folio);
2442                 obj_exts = slab_obj_exts(slab);
2443                 if (!obj_exts)
2444                         return NULL;
2445
2446                 off = obj_to_index(slab->slab_cache, slab, p);
2447                 if (obj_exts[off].objcg)
2448                         return obj_cgroup_memcg(obj_exts[off].objcg);
2449
2450                 return NULL;
2451         }
2452
2453         /*
2454          * folio_memcg_check() is used here, because in theory we can encounter
2455          * a folio where the slab flag has been cleared already, but
2456          * slab->obj_exts has not been freed yet
2457          * folio_memcg_check() will guarantee that a proper memory
2458          * cgroup pointer or NULL will be returned.
2459          */
2460         return folio_memcg_check(folio);
2461 }
2462
2463 /*
2464  * Returns a pointer to the memory cgroup to which the kernel object is charged.
2465  * It is not suitable for objects allocated using vmalloc().
2466  *
2467  * A passed kernel object must be a slab object or a generic kernel page.
2468  *
2469  * The caller must ensure the memcg lifetime, e.g. by taking rcu_read_lock(),
2470  * cgroup_mutex, etc.
2471  */
2472 struct mem_cgroup *mem_cgroup_from_slab_obj(void *p)
2473 {
2474         if (mem_cgroup_disabled())
2475                 return NULL;
2476
2477         return mem_cgroup_from_obj_folio(virt_to_folio(p), p);
2478 }
2479
2480 static struct obj_cgroup *__get_obj_cgroup_from_memcg(struct mem_cgroup *memcg)
2481 {
2482         struct obj_cgroup *objcg = NULL;
2483
2484         for (; !mem_cgroup_is_root(memcg); memcg = parent_mem_cgroup(memcg)) {
2485                 objcg = rcu_dereference(memcg->objcg);
2486                 if (likely(objcg && obj_cgroup_tryget(objcg)))
2487                         break;
2488                 objcg = NULL;
2489         }
2490         return objcg;
2491 }
2492
/*
 * Refresh current->objcg. current_obj_cgroup() calls this when it finds
 * CURRENT_OBJCG_UPDATE_FLAG set in the stored pointer.
 *
 * The old pointer is swapped out and its reference dropped, a new objcg
 * is looked up for the task's current memcg and then installed with
 * try_cmpxchg(). If the pointer was changed concurrently, the freshly
 * obtained reference is dropped and the procedure is repeated.
 *
 * Returns the objcg now stored in current->objcg; NULL for tasks without
 * an mm, for kernel threads, or when no objcg could be obtained.
 */
static struct obj_cgroup *current_objcg_update(void)
{
        struct mem_cgroup *memcg;
        struct obj_cgroup *old, *objcg = NULL;

        do {
                /* Atomically drop the update bit. */
                old = xchg(&current->objcg, NULL);
                if (old) {
                        /* Strip the flag bit to recover the real pointer. */
                        old = (struct obj_cgroup *)
                                ((unsigned long)old & ~CURRENT_OBJCG_UPDATE_FLAG);
                        obj_cgroup_put(old);

                        /* The cmpxchg below expects to find NULL stored. */
                        old = NULL;
                }

                /* If new objcg is NULL, no reason for the second atomic update. */
                if (!current->mm || (current->flags & PF_KTHREAD))
                        return NULL;

                /*
                 * Release the objcg pointer from the previous iteration,
                 * if try_cmpxchg() below fails.
                 */
                if (unlikely(objcg)) {
                        obj_cgroup_put(objcg);
                        objcg = NULL;
                }

                /*
                 * Obtain the new objcg pointer. The current task can be
                 * asynchronously moved to another memcg and the previous
                 * memcg can be offlined. So let's get the memcg pointer
                 * and try get a reference to objcg under a rcu read lock.
                 */

                rcu_read_lock();
                memcg = mem_cgroup_from_task(current);
                objcg = __get_obj_cgroup_from_memcg(memcg);
                rcu_read_unlock();

                /*
                 * Try set up a new objcg pointer atomically. If it
                 * fails, it means the update flag was set concurrently, so
                 * the whole procedure should be repeated.
                 */
        } while (!try_cmpxchg(&current->objcg, &old, objcg));

        return objcg;
}
2543
2544 __always_inline struct obj_cgroup *current_obj_cgroup(void)
2545 {
2546         struct mem_cgroup *memcg;
2547         struct obj_cgroup *objcg;
2548
2549         if (in_task()) {
2550                 memcg = current->active_memcg;
2551                 if (unlikely(memcg))
2552                         goto from_memcg;
2553
2554                 objcg = READ_ONCE(current->objcg);
2555                 if (unlikely((unsigned long)objcg & CURRENT_OBJCG_UPDATE_FLAG))
2556                         objcg = current_objcg_update();
2557                 /*
2558                  * Objcg reference is kept by the task, so it's safe
2559                  * to use the objcg by the current task.
2560                  */
2561                 return objcg;
2562         }
2563
2564         memcg = this_cpu_read(int_active_memcg);
2565         if (unlikely(memcg))
2566                 goto from_memcg;
2567
2568         return NULL;
2569
2570 from_memcg:
2571         objcg = NULL;
2572         for (; !mem_cgroup_is_root(memcg); memcg = parent_mem_cgroup(memcg)) {
2573                 /*
2574                  * Memcg pointer is protected by scope (see set_active_memcg())
2575                  * and is pinning the corresponding objcg, so objcg can't go
2576                  * away and can be used within the scope without any additional
2577                  * protection.
2578                  */
2579                 objcg = rcu_dereference_check(memcg->objcg, 1);
2580                 if (likely(objcg))
2581                         break;
2582         }
2583
2584         return objcg;
2585 }
2586
2587 struct obj_cgroup *get_obj_cgroup_from_folio(struct folio *folio)
2588 {
2589         struct obj_cgroup *objcg;
2590
2591         if (!memcg_kmem_online())
2592                 return NULL;
2593
2594         if (folio_memcg_kmem(folio)) {
2595                 objcg = __folio_objcg(folio);
2596                 obj_cgroup_get(objcg);
2597         } else {
2598                 struct mem_cgroup *memcg;
2599
2600                 rcu_read_lock();
2601                 memcg = __folio_memcg(folio);
2602                 if (memcg)
2603                         objcg = __get_obj_cgroup_from_memcg(memcg);
2604                 else
2605                         objcg = NULL;
2606                 rcu_read_unlock();
2607         }
2608         return objcg;
2609 }
2610
2611 /*
2612  * obj_cgroup_uncharge_pages: uncharge a number of kernel pages from a objcg
2613  * @objcg: object cgroup to uncharge
2614  * @nr_pages: number of pages to uncharge
2615  */
2616 static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg,
2617                                       unsigned int nr_pages)
2618 {
2619         struct mem_cgroup *memcg;
2620
2621         memcg = get_mem_cgroup_from_objcg(objcg);
2622
2623         mod_memcg_state(memcg, MEMCG_KMEM, -nr_pages);
2624         memcg1_account_kmem(memcg, -nr_pages);
2625         refill_stock(memcg, nr_pages);
2626
2627         css_put(&memcg->css);
2628 }
2629
2630 /*
2631  * obj_cgroup_charge_pages: charge a number of kernel pages to a objcg
2632  * @objcg: object cgroup to charge
2633  * @gfp: reclaim mode
2634  * @nr_pages: number of pages to charge
2635  *
2636  * Returns 0 on success, an error code on failure.
2637  */
2638 static int obj_cgroup_charge_pages(struct obj_cgroup *objcg, gfp_t gfp,
2639                                    unsigned int nr_pages)
2640 {
2641         struct mem_cgroup *memcg;
2642         int ret;
2643
2644         memcg = get_mem_cgroup_from_objcg(objcg);
2645
2646         ret = try_charge_memcg(memcg, gfp, nr_pages);
2647         if (ret)
2648                 goto out;
2649
2650         mod_memcg_state(memcg, MEMCG_KMEM, nr_pages);
2651         memcg1_account_kmem(memcg, nr_pages);
2652 out:
2653         css_put(&memcg->css);
2654
2655         return ret;
2656 }
2657
2658 /**
2659  * __memcg_kmem_charge_page: charge a kmem page to the current memory cgroup
2660  * @page: page to charge
2661  * @gfp: reclaim mode
2662  * @order: allocation order
2663  *
2664  * Returns 0 on success, an error code on failure.
2665  */
2666 int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order)
2667 {
2668         struct obj_cgroup *objcg;
2669         int ret = 0;
2670
2671         objcg = current_obj_cgroup();
2672         if (objcg) {
2673                 ret = obj_cgroup_charge_pages(objcg, gfp, 1 << order);
2674                 if (!ret) {
2675                         obj_cgroup_get(objcg);
2676                         page->memcg_data = (unsigned long)objcg |
2677                                 MEMCG_DATA_KMEM;
2678                         return 0;
2679                 }
2680         }
2681         return ret;
2682 }
2683
2684 /**
2685  * __memcg_kmem_uncharge_page: uncharge a kmem page
2686  * @page: page to uncharge
2687  * @order: allocation order
2688  */
2689 void __memcg_kmem_uncharge_page(struct page *page, int order)
2690 {
2691         struct folio *folio = page_folio(page);
2692         struct obj_cgroup *objcg;
2693         unsigned int nr_pages = 1 << order;
2694
2695         if (!folio_memcg_kmem(folio))
2696                 return;
2697
2698         objcg = __folio_objcg(folio);
2699         obj_cgroup_uncharge_pages(objcg, nr_pages);
2700         folio->memcg_data = 0;
2701         obj_cgroup_put(objcg);
2702 }
2703
/*
 * Account @nr bytes of slab vmstat for @objcg on node @pgdat, going through
 * the per-CPU memcg_stock cache.
 *
 * @idx selects the counter: NR_SLAB_RECLAIMABLE_B uses the reclaimable
 * slot, any other value uses the unreclaimable slot. The update is kept in
 * the stock and only pushed out with __mod_objcg_mlstate() once the cached
 * value exceeds PAGE_SIZE in magnitude, or when the cached objcg or pgdat
 * changes.
 *
 * The stock is manipulated under memcg_stock.stock_lock taken with
 * local_lock_irqsave().
 */
static void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
                     enum node_stat_item idx, int nr)
{
        struct memcg_stock_pcp *stock;
        struct obj_cgroup *old = NULL;
        unsigned long flags;
        int *bytes;

        local_lock_irqsave(&memcg_stock.stock_lock, flags);
        stock = this_cpu_ptr(&memcg_stock);

        /*
         * Save vmstat data in stock and skip vmstat array update unless
         * accumulating over a page of vmstat data or when pgdat or idx
         * changes.
         */
        if (READ_ONCE(stock->cached_objcg) != objcg) {
                /* Another objcg is cached: flush it and take over the stock. */
                old = drain_obj_stock(stock);
                obj_cgroup_get(objcg);
                /* Pull in any bytes parked in objcg->nr_charged_bytes. */
                stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes)
                                ? atomic_xchg(&objcg->nr_charged_bytes, 0) : 0;
                WRITE_ONCE(stock->cached_objcg, objcg);
                stock->cached_pgdat = pgdat;
        } else if (stock->cached_pgdat != pgdat) {
                /* Flush the existing cached vmstat data */
                struct pglist_data *oldpg = stock->cached_pgdat;

                if (stock->nr_slab_reclaimable_b) {
                        __mod_objcg_mlstate(objcg, oldpg, NR_SLAB_RECLAIMABLE_B,
                                          stock->nr_slab_reclaimable_b);
                        stock->nr_slab_reclaimable_b = 0;
                }
                if (stock->nr_slab_unreclaimable_b) {
                        __mod_objcg_mlstate(objcg, oldpg, NR_SLAB_UNRECLAIMABLE_B,
                                          stock->nr_slab_unreclaimable_b);
                        stock->nr_slab_unreclaimable_b = 0;
                }
                stock->cached_pgdat = pgdat;
        }

        /* Pick the cached counter that matches @idx. */
        bytes = (idx == NR_SLAB_RECLAIMABLE_B) ? &stock->nr_slab_reclaimable_b
                                               : &stock->nr_slab_unreclaimable_b;
        /*
         * Even for large object >= PAGE_SIZE, the vmstat data will still be
         * cached locally at least once before pushing it out.
         */
        if (!*bytes) {
                *bytes = nr;
                nr = 0;
        } else {
                *bytes += nr;
                if (abs(*bytes) > PAGE_SIZE) {
                        /* Cached delta grew past a page: push all of it out. */
                        nr = *bytes;
                        *bytes = 0;
                } else {
                        nr = 0;
                }
        }
        if (nr)
                __mod_objcg_mlstate(objcg, pgdat, idx, nr);

        local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
        /* Drop the replaced objcg (if any) only after releasing stock_lock. */
        obj_cgroup_put(old);
}
2768
2769 static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
2770 {
2771         struct memcg_stock_pcp *stock;
2772         unsigned long flags;
2773         bool ret = false;
2774
2775         local_lock_irqsave(&memcg_stock.stock_lock, flags);
2776
2777         stock = this_cpu_ptr(&memcg_stock);
2778         if (objcg == READ_ONCE(stock->cached_objcg) && stock->nr_bytes >= nr_bytes) {
2779                 stock->nr_bytes -= nr_bytes;
2780                 ret = true;
2781         }
2782
2783         local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
2784
2785         return ret;
2786 }
2787
/*
 * Flush everything @stock caches on behalf of its cached objcg and clear
 * stock->cached_objcg.
 *
 * Whole pages held in stock->nr_bytes are removed from the kmem accounting
 * of the objcg's memcg and passed to __refill_stock(); the sub-page
 * remainder is added to objcg->nr_charged_bytes. Cached slab vmstat deltas
 * are pushed out through __mod_objcg_mlstate().
 *
 * Returns the previously cached objcg, or NULL if the stock had none. The
 * caller must release it with obj_cgroup_put() outside of stock_lock.
 */
static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock)
{
        struct obj_cgroup *old = READ_ONCE(stock->cached_objcg);

        if (!old)
                return NULL;

        if (stock->nr_bytes) {
                /* Split the cached bytes into whole pages and a remainder. */
                unsigned int nr_pages = stock->nr_bytes >> PAGE_SHIFT;
                unsigned int nr_bytes = stock->nr_bytes & (PAGE_SIZE - 1);

                if (nr_pages) {
                        struct mem_cgroup *memcg;

                        memcg = get_mem_cgroup_from_objcg(old);

                        mod_memcg_state(memcg, MEMCG_KMEM, -nr_pages);
                        memcg1_account_kmem(memcg, -nr_pages);
                        __refill_stock(memcg, nr_pages);

                        css_put(&memcg->css);
                }

                /*
                 * The leftover is flushed to the centralized per-memcg value.
                 * On the next attempt to refill obj stock it will be moved
                 * to a per-cpu stock (probably, on an other CPU), see
                 * refill_obj_stock().
                 *
                 * How often it's flushed is a trade-off between the memory
                 * limit enforcement accuracy and potential CPU contention,
                 * so it might be changed in the future.
                 */
                atomic_add(nr_bytes, &old->nr_charged_bytes);
                stock->nr_bytes = 0;
        }

        /*
         * Flush the vmstat data in current stock
         */
        if (stock->nr_slab_reclaimable_b || stock->nr_slab_unreclaimable_b) {
                if (stock->nr_slab_reclaimable_b) {
                        __mod_objcg_mlstate(old, stock->cached_pgdat,
                                          NR_SLAB_RECLAIMABLE_B,
                                          stock->nr_slab_reclaimable_b);
                        stock->nr_slab_reclaimable_b = 0;
                }
                if (stock->nr_slab_unreclaimable_b) {
                        __mod_objcg_mlstate(old, stock->cached_pgdat,
                                          NR_SLAB_UNRECLAIMABLE_B,
                                          stock->nr_slab_unreclaimable_b);
                        stock->nr_slab_unreclaimable_b = 0;
                }
                stock->cached_pgdat = NULL;
        }

        WRITE_ONCE(stock->cached_objcg, NULL);
        /*
         * The `old' object needs to be released by the caller via
         * obj_cgroup_put() outside of memcg_stock_pcp::stock_lock.
         */
        return old;
}
2851
2852 static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
2853                                      struct mem_cgroup *root_memcg)
2854 {
2855         struct obj_cgroup *objcg = READ_ONCE(stock->cached_objcg);
2856         struct mem_cgroup *memcg;
2857
2858         if (objcg) {
2859                 memcg = obj_cgroup_memcg(objcg);
2860                 if (memcg && mem_cgroup_is_descendant(memcg, root_memcg))
2861                         return true;
2862         }
2863
2864         return false;
2865 }
2866
/*
 * Add @nr_bytes of pre-charged bytes for @objcg to this CPU's stock.
 *
 * If the stock caches a different objcg it is drained first and switched
 * over to @objcg, picking up whatever is parked in objcg->nr_charged_bytes.
 *
 * When @allow_uncharge is true (or the cached objcg was just switched) and
 * the stock holds more than PAGE_SIZE bytes, the whole pages are uncharged
 * via obj_cgroup_uncharge_pages() after stock_lock has been released, and
 * only the sub-page remainder stays in the stock.
 */
static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes,
                             bool allow_uncharge)
{
        struct memcg_stock_pcp *stock;
        struct obj_cgroup *old = NULL;
        unsigned long flags;
        unsigned int nr_pages = 0;

        local_lock_irqsave(&memcg_stock.stock_lock, flags);

        stock = this_cpu_ptr(&memcg_stock);
        if (READ_ONCE(stock->cached_objcg) != objcg) { /* reset if necessary */
                old = drain_obj_stock(stock);
                obj_cgroup_get(objcg);
                WRITE_ONCE(stock->cached_objcg, objcg);
                stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes)
                                ? atomic_xchg(&objcg->nr_charged_bytes, 0) : 0;
                allow_uncharge = true;  /* Allow uncharge when objcg changes */
        }
        stock->nr_bytes += nr_bytes;

        if (allow_uncharge && (stock->nr_bytes > PAGE_SIZE)) {
                /* Keep only the sub-page remainder in the stock. */
                nr_pages = stock->nr_bytes >> PAGE_SHIFT;
                stock->nr_bytes &= (PAGE_SIZE - 1);
        }

        local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
        /* Drop the replaced objcg (if any) only after releasing stock_lock. */
        obj_cgroup_put(old);

        if (nr_pages)
                obj_cgroup_uncharge_pages(objcg, nr_pages);
}
2899
2900 int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size)
2901 {
2902         unsigned int nr_pages, nr_bytes;
2903         int ret;
2904
2905         if (consume_obj_stock(objcg, size))
2906                 return 0;
2907
2908         /*
2909          * In theory, objcg->nr_charged_bytes can have enough
2910          * pre-charged bytes to satisfy the allocation. However,
2911          * flushing objcg->nr_charged_bytes requires two atomic
2912          * operations, and objcg->nr_charged_bytes can't be big.
2913          * The shared objcg->nr_charged_bytes can also become a
2914          * performance bottleneck if all tasks of the same memcg are
2915          * trying to update it. So it's better to ignore it and try
2916          * grab some new pages. The stock's nr_bytes will be flushed to
2917          * objcg->nr_charged_bytes later on when objcg changes.
2918          *
2919          * The stock's nr_bytes may contain enough pre-charged bytes
2920          * to allow one less page from being charged, but we can't rely
2921          * on the pre-charged bytes not being changed outside of
2922          * consume_obj_stock() or refill_obj_stock(). So ignore those
2923          * pre-charged bytes as well when charging pages. To avoid a
2924          * page uncharge right after a page charge, we set the
2925          * allow_uncharge flag to false when calling refill_obj_stock()
2926          * to temporarily allow the pre-charged bytes to exceed the page
2927          * size limit. The maximum reachable value of the pre-charged
2928          * bytes is (sizeof(object) + PAGE_SIZE - 2) if there is no data
2929          * race.
2930          */
2931         nr_pages = size >> PAGE_SHIFT;
2932         nr_bytes = size & (PAGE_SIZE - 1);
2933
2934         if (nr_bytes)
2935                 nr_pages += 1;
2936
2937         ret = obj_cgroup_charge_pages(objcg, gfp, nr_pages);
2938         if (!ret && nr_bytes)
2939                 refill_obj_stock(objcg, PAGE_SIZE - nr_bytes, false);
2940
2941         return ret;
2942 }
2943
/*
 * Uncharge @size bytes from @objcg by returning them to the per-CPU stock
 * through refill_obj_stock(), with page uncharging allowed.
 */
void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size)
{
        refill_obj_stock(objcg, size, true);
}
2948
2949 static inline size_t obj_full_size(struct kmem_cache *s)
2950 {
2951         /*
2952          * For each accounted object there is an extra space which is used
2953          * to store obj_cgroup membership. Charge it too.
2954          */
2955         return s->size + sizeof(struct obj_cgroup *);
2956 }
2957
2958 bool __memcg_slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru,
2959                                   gfp_t flags, size_t size, void **p)
2960 {
2961         struct obj_cgroup *objcg;
2962         struct slab *slab;
2963         unsigned long off;
2964         size_t i;
2965
2966         /*
2967          * The obtained objcg pointer is safe to use within the current scope,
2968          * defined by current task or set_active_memcg() pair.
2969          * obj_cgroup_get() is used to get a permanent reference.
2970          */
2971         objcg = current_obj_cgroup();
2972         if (!objcg)
2973                 return true;
2974
2975         /*
2976          * slab_alloc_node() avoids the NULL check, so we might be called with a
2977          * single NULL object. kmem_cache_alloc_bulk() aborts if it can't fill
2978          * the whole requested size.
2979          * return success as there's nothing to free back
2980          */
2981         if (unlikely(*p == NULL))
2982                 return true;
2983
2984         flags &= gfp_allowed_mask;
2985
2986         if (lru) {
2987                 int ret;
2988                 struct mem_cgroup *memcg;
2989
2990                 memcg = get_mem_cgroup_from_objcg(objcg);
2991                 ret = memcg_list_lru_alloc(memcg, lru, flags);
2992                 css_put(&memcg->css);
2993
2994                 if (ret)
2995                         return false;
2996         }
2997
2998         if (obj_cgroup_charge(objcg, flags, size * obj_full_size(s)))
2999                 return false;
3000
3001         for (i = 0; i < size; i++) {
3002                 slab = virt_to_slab(p[i]);
3003
3004                 if (!slab_obj_exts(slab) &&
3005                     alloc_slab_obj_exts(slab, s, flags, false)) {
3006                         obj_cgroup_uncharge(objcg, obj_full_size(s));
3007                         continue;
3008                 }
3009
3010                 off = obj_to_index(s, slab, p[i]);
3011                 obj_cgroup_get(objcg);
3012                 slab_obj_exts(slab)[off].objcg = objcg;
3013                 mod_objcg_state(objcg, slab_pgdat(slab),
3014                                 cache_vmstat_idx(s), obj_full_size(s));
3015         }
3016
3017         return true;
3018 }
3019
3020 void __memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab,
3021                             void **p, int objects, struct slabobj_ext *obj_exts)
3022 {
3023         for (int i = 0; i < objects; i++) {
3024                 struct obj_cgroup *objcg;
3025                 unsigned int off;
3026
3027                 off = obj_to_index(s, slab, p[i]);
3028                 objcg = obj_exts[off].objcg;
3029                 if (!objcg)
3030                         continue;
3031
3032                 obj_exts[off].objcg = NULL;
3033                 obj_cgroup_uncharge(objcg, obj_full_size(s));
3034                 mod_objcg_state(objcg, slab_pgdat(slab), cache_vmstat_idx(s),
3035                                 -obj_full_size(s));
3036                 obj_cgroup_put(objcg);
3037         }
3038 }
3039
3040 /*
3041  * Because folio_memcg(head) is not set on tails, set it now.
3042  */
3043 void split_page_memcg(struct page *head, int old_order, int new_order)
3044 {
3045         struct folio *folio = page_folio(head);
3046         int i;
3047         unsigned int old_nr = 1 << old_order;
3048         unsigned int new_nr = 1 << new_order;
3049
3050         if (mem_cgroup_disabled() || !folio_memcg_charged(folio))
3051                 return;
3052
3053         for (i = new_nr; i < old_nr; i += new_nr)
3054                 folio_page(folio, i)->memcg_data = folio->memcg_data;
3055
3056         if (folio_memcg_kmem(folio))
3057                 obj_cgroup_get_many(__folio_objcg(folio), old_nr / new_nr - 1);
3058         else
3059                 css_get_many(&folio_memcg(folio)->css, old_nr / new_nr - 1);
3060 }
3061
3062 unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
3063 {
3064         unsigned long val;
3065
3066         if (mem_cgroup_is_root(memcg)) {
3067                 /*
3068                  * Approximate root's usage from global state. This isn't
3069                  * perfect, but the root usage was always an approximation.
3070                  */
3071                 val = global_node_page_state(NR_FILE_PAGES) +
3072                         global_node_page_state(NR_ANON_MAPPED);
3073                 if (swap)
3074                         val += total_swap_pages - get_nr_swap_pages();
3075         } else {
3076                 if (!swap)
3077                         val = page_counter_read(&memcg->memory);
3078                 else
3079                         val = page_counter_read(&memcg->memsw);
3080         }
3081         return val;
3082 }
3083
3084 static int memcg_online_kmem(struct mem_cgroup *memcg)
3085 {
3086         struct obj_cgroup *objcg;
3087
3088         if (mem_cgroup_kmem_disabled())
3089                 return 0;
3090
3091         if (unlikely(mem_cgroup_is_root(memcg)))
3092                 return 0;
3093
3094         objcg = obj_cgroup_alloc();
3095         if (!objcg)
3096                 return -ENOMEM;
3097
3098         objcg->memcg = memcg;
3099         rcu_assign_pointer(memcg->objcg, objcg);
3100         obj_cgroup_get(objcg);
3101         memcg->orig_objcg = objcg;
3102
3103         static_branch_enable(&memcg_kmem_online_key);
3104
3105         memcg->kmemcg_id = memcg->id.id;
3106
3107         return 0;
3108 }
3109
3110 static void memcg_offline_kmem(struct mem_cgroup *memcg)
3111 {
3112         struct mem_cgroup *parent;
3113
3114         if (mem_cgroup_kmem_disabled())
3115                 return;
3116
3117         if (unlikely(mem_cgroup_is_root(memcg)))
3118                 return;
3119
3120         parent = parent_mem_cgroup(memcg);
3121         if (!parent)
3122                 parent = root_mem_cgroup;
3123
3124         memcg_reparent_list_lrus(memcg, parent);
3125
3126         /*
3127          * Objcg's reparenting must be after list_lru's, make sure list_lru
3128          * helpers won't use parent's list_lru until child is drained.
3129          */
3130         memcg_reparent_objcgs(memcg, parent);
3131 }
3132
3133 #ifdef CONFIG_CGROUP_WRITEBACK
3134
3135 #include <trace/events/writeback.h>
3136
/*
 * Initialise @memcg's cgwb_domain with wb_domain_init(), passing @gfp
 * through, and return its result.
 */
static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
{
        return wb_domain_init(&memcg->cgwb_domain, gfp);
}
3141
/* Tear down @memcg's cgwb_domain with wb_domain_exit(). */
static void memcg_wb_domain_exit(struct mem_cgroup *memcg)
{
        wb_domain_exit(&memcg->cgwb_domain);
}
3146
/* Forward a size change of @memcg to wb_domain_size_changed(). */
static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg)
{
        wb_domain_size_changed(&memcg->cgwb_domain);
}
3151
3152 struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb)
3153 {
3154         struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
3155
3156         if (!memcg->css.parent)
3157                 return NULL;
3158
3159         return &memcg->cgwb_domain;
3160 }
3161
3162 /**
3163  * mem_cgroup_wb_stats - retrieve writeback related stats from its memcg
3164  * @wb: bdi_writeback in question
3165  * @pfilepages: out parameter for number of file pages
3166  * @pheadroom: out parameter for number of allocatable pages according to memcg
3167  * @pdirty: out parameter for number of dirty pages
3168  * @pwriteback: out parameter for number of pages under writeback
3169  *
3170  * Determine the numbers of file, headroom, dirty, and writeback pages in
3171  * @wb's memcg.  File, dirty and writeback are self-explanatory.  Headroom
3172  * is a bit more involved.
3173  *
3174  * A memcg's headroom is "min(max, high) - used".  In the hierarchy, the
3175  * headroom is calculated as the lowest headroom of itself and the
3176  * ancestors.  Note that this doesn't consider the actual amount of
3177  * available memory in the system.  The caller should further cap
3178  * *@pheadroom accordingly.
3179  */
3180 void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
3181                          unsigned long *pheadroom, unsigned long *pdirty,
3182                          unsigned long *pwriteback)
3183 {
3184         struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
3185         struct mem_cgroup *parent;
3186
3187         mem_cgroup_flush_stats_ratelimited(memcg);
3188
3189         *pdirty = memcg_page_state(memcg, NR_FILE_DIRTY);
3190         *pwriteback = memcg_page_state(memcg, NR_WRITEBACK);
3191         *pfilepages = memcg_page_state(memcg, NR_INACTIVE_FILE) +
3192                         memcg_page_state(memcg, NR_ACTIVE_FILE);
3193
3194         *pheadroom = PAGE_COUNTER_MAX;
3195         while ((parent = parent_mem_cgroup(memcg))) {
3196                 unsigned long ceiling = min(READ_ONCE(memcg->memory.max),
3197                                             READ_ONCE(memcg->memory.high));
3198                 unsigned long used = page_counter_read(&memcg->memory);
3199
3200                 *pheadroom = min(*pheadroom, ceiling - min(ceiling, used));
3201                 memcg = parent;
3202         }
3203 }
3204
3205 /*
3206  * Foreign dirty flushing
3207  *
3208  * There's an inherent mismatch between memcg and writeback.  The former
3209  * tracks ownership per-page while the latter per-inode.  This was a
3210  * deliberate design decision because honoring per-page ownership in the
3211  * writeback path is complicated, may lead to higher CPU and IO overheads
3212  * and deemed unnecessary given that write-sharing an inode across
3213  * different cgroups isn't a common use-case.
3214  *
3215  * Combined with inode majority-writer ownership switching, this works well
3216  * enough in most cases but there are some pathological cases.  For
3217  * example, let's say there are two cgroups A and B which keep writing to
3218  * different but confined parts of the same inode.  B owns the inode and
3219  * A's memory is limited far below B's.  A's dirty ratio can rise enough to
3220  * trigger balance_dirty_pages() sleeps but B's can be low enough to avoid
3221  * triggering background writeback.  A will be slowed down without a way to
3222  * make writeback of the dirty pages happen.
3223  *
3224  * Conditions like the above can lead to a cgroup getting repeatedly and
3225  * severely throttled after making some progress after each
3226  * dirty_expire_interval while the underlying IO device is almost
3227  * completely idle.
3228  *
3229  * Solving this problem completely requires matching the ownership tracking
3230  * granularities between memcg and writeback in either direction.  However,
3231  * the more egregious behaviors can be avoided by simply remembering the
3232  * most recent foreign dirtying events and initiating remote flushes on
3233  * them when local writeback isn't enough to keep the memory clean enough.
3234  *
 * The following two functions implement such a mechanism.  When a foreign
3236  * page - a page whose memcg and writeback ownerships don't match - is
3237  * dirtied, mem_cgroup_track_foreign_dirty() records the inode owning
3238  * bdi_writeback on the page owning memcg.  When balance_dirty_pages()
3239  * decides that the memcg needs to sleep due to high dirty ratio, it calls
3240  * mem_cgroup_flush_foreign() which queues writeback on the recorded
3241  * foreign bdi_writebacks which haven't expired.  Both the numbers of
3242  * recorded bdi_writebacks and concurrent in-flight foreign writebacks are
3243  * limited to MEMCG_CGWB_FRN_CNT.
3244  *
3245  * The mechanism only remembers IDs and doesn't hold any object references.
3246  * As being wrong occasionally doesn't matter, updates and accesses to the
3247  * records are lockless and racy.
3248  */
/*
 * Record @wb, identified by its bdi id and memcg css id, in one of the
 * cgwb_frn[] slots of @folio's memcg. An existing slot for @wb is reused;
 * otherwise the oldest slot that isn't being written out is replaced. If
 * neither exists, nothing is recorded.
 */
void mem_cgroup_track_foreign_dirty_slowpath(struct folio *folio,
                                             struct bdi_writeback *wb)
{
        struct mem_cgroup *memcg = folio_memcg(folio);
        struct memcg_cgwb_frn *frn;
        u64 now = get_jiffies_64();
        u64 oldest_at = now;
        int oldest = -1;
        int i;

        trace_track_foreign_dirty(folio, wb);

        /*
         * Pick the slot to use.  If there is already a slot for @wb, keep
         * using it.  If not replace the oldest one which isn't being
         * written out.
         */
        for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) {
                frn = &memcg->cgwb_frn[i];
                if (frn->bdi_id == wb->bdi->id &&
                    frn->memcg_id == wb->memcg_css->id)
                        break;
                /* Track the oldest slot whose done.cnt reads as 1. */
                if (time_before64(frn->at, oldest_at) &&
                    atomic_read(&frn->done.cnt) == 1) {
                        oldest = i;
                        oldest_at = frn->at;
                }
        }

        if (i < MEMCG_CGWB_FRN_CNT) {
                /*
                 * Re-using an existing one.  Update timestamp lazily to
                 * avoid making the cacheline hot.  We want them to be
                 * reasonably up-to-date and significantly shorter than
                 * dirty_expire_interval as that's what expires the record.
                 * Use the shorter of 1s and dirty_expire_interval / 8.
                 */
                /*
                 * NOTE(review): the "* 10" before msecs_to_jiffies() suggests
                 * dirty_expire_interval is kept in centiseconds - confirm.
                 */
                unsigned long update_intv =
                        min_t(unsigned long, HZ,
                              msecs_to_jiffies(dirty_expire_interval * 10) / 8);

                if (time_before64(frn->at, now - update_intv))
                        frn->at = now;
        } else if (oldest >= 0) {
                /* replace the oldest free one */
                frn = &memcg->cgwb_frn[oldest];
                frn->bdi_id = wb->bdi->id;
                frn->memcg_id = wb->memcg_css->id;
                frn->at = now;
        }
}
3300
3301 /* issue foreign writeback flushes for recorded foreign dirtying events */
3302 void mem_cgroup_flush_foreign(struct bdi_writeback *wb)
3303 {
3304         struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
3305         unsigned long intv = msecs_to_jiffies(dirty_expire_interval * 10);
3306         u64 now = jiffies_64;
3307         int i;
3308
3309         for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) {
3310                 struct memcg_cgwb_frn *frn = &memcg->cgwb_frn[i];
3311
3312                 /*
3313                  * If the record is older than dirty_expire_interval,
3314                  * writeback on it has already started.  No need to kick it
3315                  * off again.  Also, don't start a new one if there's
3316                  * already one in flight.
3317                  */
3318                 if (time_after64(frn->at, now - intv) &&
3319                     atomic_read(&frn->done.cnt) == 1) {
3320                         frn->at = 0;
3321                         trace_flush_foreign(wb, frn->bdi_id, frn->memcg_id);
3322                         cgroup_writeback_by_id(frn->bdi_id, frn->memcg_id,
3323                                                WB_REASON_FOREIGN_FLUSH,
3324                                                &frn->done);
3325                 }
3326         }
3327 }
3328
3329 #else   /* CONFIG_CGROUP_WRITEBACK */
3330
3331 static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
3332 {
3333         return 0;
3334 }
3335
/* !CONFIG_CGROUP_WRITEBACK stub: nothing to tear down. */
static void memcg_wb_domain_exit(struct mem_cgroup *memcg)
{
}
3339
/* !CONFIG_CGROUP_WRITEBACK stub: limit changes need no writeback bookkeeping. */
static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg)
{
}
3343
3344 #endif  /* CONFIG_CGROUP_WRITEBACK */
3345
3346 /*
3347  * Private memory cgroup IDR
3348  *
3349  * Swap-out records and page cache shadow entries need to store memcg
3350  * references in constrained space, so we maintain an ID space that is
3351  * limited to 16 bit (MEM_CGROUP_ID_MAX), limiting the total number of
3352  * memory-controlled cgroups to 64k.
3353  *
3354  * However, there usually are many references to the offline CSS after
3355  * the cgroup has been destroyed, such as page cache or reclaimable
3356  * slab objects, that don't need to hang on to the ID. We want to keep
3357  * those dead CSS from occupying IDs, or we might quickly exhaust the
3358  * relatively small ID space and prevent the creation of new cgroups
 * even when there are far fewer than 64k cgroups - possibly none.
3360  *
3361  * Maintain a private 16-bit ID space for memcg, and allow the ID to
3362  * be freed and recycled when it's no longer needed, which is usually
3363  * when the CSS is offlined.
3364  *
 * The only exceptions to that are records of swapped out tmpfs/shmem
3366  * pages that need to be attributed to live ancestors on swapin. But
3367  * those references are manageable from userspace.
3368  */
3369
/* Largest ID representable in MEM_CGROUP_ID_SHIFT bits. */
#define MEM_CGROUP_ID_MAX	((1UL << MEM_CGROUP_ID_SHIFT) - 1)
/* Backing store of the private memcg ID space (allocating XArray, 1-based). */
static DEFINE_XARRAY_ALLOC1(mem_cgroup_ids);
3372
3373 static void mem_cgroup_id_remove(struct mem_cgroup *memcg)
3374 {
3375         if (memcg->id.id > 0) {
3376                 xa_erase(&mem_cgroup_ids, memcg->id.id);
3377                 memcg->id.id = 0;
3378         }
3379 }
3380
3381 void __maybe_unused mem_cgroup_id_get_many(struct mem_cgroup *memcg,
3382                                            unsigned int n)
3383 {
3384         refcount_add(n, &memcg->id.ref);
3385 }
3386
3387 void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n)
3388 {
3389         if (refcount_sub_and_test(n, &memcg->id.ref)) {
3390                 mem_cgroup_id_remove(memcg);
3391
3392                 /* Memcg ID pins CSS */
3393                 css_put(&memcg->css);
3394         }
3395 }
3396
/* Drop a single reference on the ID of @memcg. */
static inline void mem_cgroup_id_put(struct mem_cgroup *memcg)
{
	mem_cgroup_id_put_many(memcg, 1);
}
3401
3402 /**
3403  * mem_cgroup_from_id - look up a memcg from a memcg id
3404  * @id: the memcg id to look up
3405  *
3406  * Caller must hold rcu_read_lock().
3407  */
3408 struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
3409 {
3410         WARN_ON_ONCE(!rcu_read_lock_held());
3411         return xa_load(&mem_cgroup_ids, id);
3412 }
3413
3414 #ifdef CONFIG_SHRINKER_DEBUG
3415 struct mem_cgroup *mem_cgroup_get_from_ino(unsigned long ino)
3416 {
3417         struct cgroup *cgrp;
3418         struct cgroup_subsys_state *css;
3419         struct mem_cgroup *memcg;
3420
3421         cgrp = cgroup_get_from_id(ino);
3422         if (IS_ERR(cgrp))
3423                 return ERR_CAST(cgrp);
3424
3425         css = cgroup_get_e_css(cgrp, &memory_cgrp_subsys);
3426         if (css)
3427                 memcg = container_of(css, struct mem_cgroup, css);
3428         else
3429                 memcg = ERR_PTR(-ENOENT);
3430
3431         cgroup_put(cgrp);
3432
3433         return memcg;
3434 }
3435 #endif
3436
3437 static bool alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
3438 {
3439         struct mem_cgroup_per_node *pn;
3440
3441         pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, node);
3442         if (!pn)
3443                 return false;
3444
3445         pn->lruvec_stats = kzalloc_node(sizeof(struct lruvec_stats),
3446                                         GFP_KERNEL_ACCOUNT, node);
3447         if (!pn->lruvec_stats)
3448                 goto fail;
3449
3450         pn->lruvec_stats_percpu = alloc_percpu_gfp(struct lruvec_stats_percpu,
3451                                                    GFP_KERNEL_ACCOUNT);
3452         if (!pn->lruvec_stats_percpu)
3453                 goto fail;
3454
3455         lruvec_init(&pn->lruvec);
3456         pn->memcg = memcg;
3457
3458         memcg->nodeinfo[node] = pn;
3459         return true;
3460 fail:
3461         kfree(pn->lruvec_stats);
3462         kfree(pn);
3463         return false;
3464 }
3465
3466 static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
3467 {
3468         struct mem_cgroup_per_node *pn = memcg->nodeinfo[node];
3469
3470         if (!pn)
3471                 return;
3472
3473         free_percpu(pn->lruvec_stats_percpu);
3474         kfree(pn->lruvec_stats);
3475         kfree(pn);
3476 }
3477
3478 static void __mem_cgroup_free(struct mem_cgroup *memcg)
3479 {
3480         int node;
3481
3482         obj_cgroup_put(memcg->orig_objcg);
3483
3484         for_each_node(node)
3485                 free_mem_cgroup_per_node_info(memcg, node);
3486         memcg1_free_events(memcg);
3487         kfree(memcg->vmstats);
3488         free_percpu(memcg->vmstats_percpu);
3489         kfree(memcg);
3490 }
3491
/*
 * Tear down the lru_gen and writeback-domain state of @memcg, then free
 * the memcg itself.
 */
static void mem_cgroup_free(struct mem_cgroup *memcg)
{
	lru_gen_exit_memcg(memcg);
	memcg_wb_domain_exit(memcg);

	__mem_cgroup_free(memcg);
}
3498
3499 static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent)
3500 {
3501         struct memcg_vmstats_percpu *statc, *pstatc;
3502         struct mem_cgroup *memcg;
3503         int node, cpu;
3504         int __maybe_unused i;
3505         long error;
3506
3507         memcg = kzalloc(struct_size(memcg, nodeinfo, nr_node_ids), GFP_KERNEL);
3508         if (!memcg)
3509                 return ERR_PTR(-ENOMEM);
3510
3511         error = xa_alloc(&mem_cgroup_ids, &memcg->id.id, NULL,
3512                          XA_LIMIT(1, MEM_CGROUP_ID_MAX), GFP_KERNEL);
3513         if (error)
3514                 goto fail;
3515         error = -ENOMEM;
3516
3517         memcg->vmstats = kzalloc(sizeof(struct memcg_vmstats),
3518                                  GFP_KERNEL_ACCOUNT);
3519         if (!memcg->vmstats)
3520                 goto fail;
3521
3522         memcg->vmstats_percpu = alloc_percpu_gfp(struct memcg_vmstats_percpu,
3523                                                  GFP_KERNEL_ACCOUNT);
3524         if (!memcg->vmstats_percpu)
3525                 goto fail;
3526
3527         if (!memcg1_alloc_events(memcg))
3528                 goto fail;
3529
3530         for_each_possible_cpu(cpu) {
3531                 if (parent)
3532                         pstatc = per_cpu_ptr(parent->vmstats_percpu, cpu);
3533                 statc = per_cpu_ptr(memcg->vmstats_percpu, cpu);
3534                 statc->parent = parent ? pstatc : NULL;
3535                 statc->vmstats = memcg->vmstats;
3536         }
3537
3538         for_each_node(node)
3539                 if (!alloc_mem_cgroup_per_node_info(memcg, node))
3540                         goto fail;
3541
3542         if (memcg_wb_domain_init(memcg, GFP_KERNEL))
3543                 goto fail;
3544
3545         INIT_WORK(&memcg->high_work, high_work_func);
3546         vmpressure_init(&memcg->vmpressure);
3547         INIT_LIST_HEAD(&memcg->memory_peaks);
3548         INIT_LIST_HEAD(&memcg->swap_peaks);
3549         spin_lock_init(&memcg->peaks_lock);
3550         memcg->socket_pressure = jiffies;
3551         memcg1_memcg_init(memcg);
3552         memcg->kmemcg_id = -1;
3553         INIT_LIST_HEAD(&memcg->objcg_list);
3554 #ifdef CONFIG_CGROUP_WRITEBACK
3555         INIT_LIST_HEAD(&memcg->cgwb_list);
3556         for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
3557                 memcg->cgwb_frn[i].done =
3558                         __WB_COMPLETION_INIT(&memcg_cgwb_frn_waitq);
3559 #endif
3560 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
3561         spin_lock_init(&memcg->deferred_split_queue.split_queue_lock);
3562         INIT_LIST_HEAD(&memcg->deferred_split_queue.split_queue);
3563         memcg->deferred_split_queue.split_queue_len = 0;
3564 #endif
3565         lru_gen_init_memcg(memcg);
3566         return memcg;
3567 fail:
3568         mem_cgroup_id_remove(memcg);
3569         __mem_cgroup_free(memcg);
3570         return ERR_PTR(error);
3571 }
3572
3573 static struct cgroup_subsys_state * __ref
3574 mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
3575 {
3576         struct mem_cgroup *parent = mem_cgroup_from_css(parent_css);
3577         struct mem_cgroup *memcg, *old_memcg;
3578
3579         old_memcg = set_active_memcg(parent);
3580         memcg = mem_cgroup_alloc(parent);
3581         set_active_memcg(old_memcg);
3582         if (IS_ERR(memcg))
3583                 return ERR_CAST(memcg);
3584
3585         page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX);
3586         memcg1_soft_limit_reset(memcg);
3587 #ifdef CONFIG_ZSWAP
3588         memcg->zswap_max = PAGE_COUNTER_MAX;
3589         WRITE_ONCE(memcg->zswap_writeback, true);
3590 #endif
3591         page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX);
3592         if (parent) {
3593                 WRITE_ONCE(memcg->swappiness, mem_cgroup_swappiness(parent));
3594
3595                 page_counter_init(&memcg->memory, &parent->memory, true);
3596                 page_counter_init(&memcg->swap, &parent->swap, false);
3597 #ifdef CONFIG_MEMCG_V1
3598                 WRITE_ONCE(memcg->oom_kill_disable, READ_ONCE(parent->oom_kill_disable));
3599                 page_counter_init(&memcg->kmem, &parent->kmem, false);
3600                 page_counter_init(&memcg->tcpmem, &parent->tcpmem, false);
3601 #endif
3602         } else {
3603                 init_memcg_stats();
3604                 init_memcg_events();
3605                 page_counter_init(&memcg->memory, NULL, true);
3606                 page_counter_init(&memcg->swap, NULL, false);
3607 #ifdef CONFIG_MEMCG_V1
3608                 page_counter_init(&memcg->kmem, NULL, false);
3609                 page_counter_init(&memcg->tcpmem, NULL, false);
3610 #endif
3611                 root_mem_cgroup = memcg;
3612                 return &memcg->css;
3613         }
3614
3615         if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
3616                 static_branch_inc(&memcg_sockets_enabled_key);
3617
3618         if (!cgroup_memory_nobpf)
3619                 static_branch_inc(&memcg_bpf_enabled_key);
3620
3621         return &memcg->css;
3622 }
3623
3624 static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
3625 {
3626         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3627
3628         if (memcg_online_kmem(memcg))
3629                 goto remove_id;
3630
3631         /*
3632          * A memcg must be visible for expand_shrinker_info()
3633          * by the time the maps are allocated. So, we allocate maps
3634          * here, when for_each_mem_cgroup() can't skip it.
3635          */
3636         if (alloc_shrinker_info(memcg))
3637                 goto offline_kmem;
3638
3639         if (unlikely(mem_cgroup_is_root(memcg)) && !mem_cgroup_disabled())
3640                 queue_delayed_work(system_unbound_wq, &stats_flush_dwork,
3641                                    FLUSH_TIME);
3642         lru_gen_online_memcg(memcg);
3643
3644         /* Online state pins memcg ID, memcg ID pins CSS */
3645         refcount_set(&memcg->id.ref, 1);
3646         css_get(css);
3647
3648         /*
3649          * Ensure mem_cgroup_from_id() works once we're fully online.
3650          *
3651          * We could do this earlier and require callers to filter with
3652          * css_tryget_online(). But right now there are no users that
3653          * need earlier access, and the workingset code relies on the
3654          * cgroup tree linkage (mem_cgroup_get_nr_swap_pages()). So
3655          * publish it here at the end of onlining. This matches the
3656          * regular ID destruction during offlining.
3657          */
3658         xa_store(&mem_cgroup_ids, memcg->id.id, memcg, GFP_KERNEL);
3659
3660         return 0;
3661 offline_kmem:
3662         memcg_offline_kmem(memcg);
3663 remove_id:
3664         mem_cgroup_id_remove(memcg);
3665         return -ENOMEM;
3666 }
3667
3668 static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
3669 {
3670         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3671
3672         memcg1_css_offline(memcg);
3673
3674         page_counter_set_min(&memcg->memory, 0);
3675         page_counter_set_low(&memcg->memory, 0);
3676
3677         zswap_memcg_offline_cleanup(memcg);
3678
3679         memcg_offline_kmem(memcg);
3680         reparent_shrinker_deferred(memcg);
3681         wb_memcg_offline(memcg);
3682         lru_gen_offline_memcg(memcg);
3683
3684         drain_all_stock(memcg);
3685
3686         mem_cgroup_id_put(memcg);
3687 }
3688
/*
 * css_released callback: invalidate the reclaim iterators for this memcg
 * and release its lru_gen state.
 */
static void mem_cgroup_css_released(struct cgroup_subsys_state *css)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(css);

	invalidate_reclaim_iterators(memcg);
	lru_gen_release_memcg(memcg);
}
3696
3697 static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
3698 {
3699         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3700         int __maybe_unused i;
3701
3702 #ifdef CONFIG_CGROUP_WRITEBACK
3703         for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
3704                 wb_wait_for_completion(&memcg->cgwb_frn[i].done);
3705 #endif
3706         if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
3707                 static_branch_dec(&memcg_sockets_enabled_key);
3708
3709         if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && memcg1_tcpmem_active(memcg))
3710                 static_branch_dec(&memcg_sockets_enabled_key);
3711
3712         if (!cgroup_memory_nobpf)
3713                 static_branch_dec(&memcg_bpf_enabled_key);
3714
3715         vmpressure_cleanup(&memcg->vmpressure);
3716         cancel_work_sync(&memcg->high_work);
3717         memcg1_remove_from_trees(memcg);
3718         free_shrinker_info(memcg);
3719         mem_cgroup_free(memcg);
3720 }
3721
3722 /**
3723  * mem_cgroup_css_reset - reset the states of a mem_cgroup
3724  * @css: the target css
3725  *
3726  * Reset the states of the mem_cgroup associated with @css.  This is
3727  * invoked when the userland requests disabling on the default hierarchy
3728  * but the memcg is pinned through dependency.  The memcg should stop
3729  * applying policies and should revert to the vanilla state as it may be
3730  * made visible again.
3731  *
3732  * The current implementation only resets the essential configurations.
3733  * This needs to be expanded to cover all the visible parts.
3734  */
3735 static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
3736 {
3737         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3738
3739         page_counter_set_max(&memcg->memory, PAGE_COUNTER_MAX);
3740         page_counter_set_max(&memcg->swap, PAGE_COUNTER_MAX);
3741 #ifdef CONFIG_MEMCG_V1
3742         page_counter_set_max(&memcg->kmem, PAGE_COUNTER_MAX);
3743         page_counter_set_max(&memcg->tcpmem, PAGE_COUNTER_MAX);
3744 #endif
3745         page_counter_set_min(&memcg->memory, 0);
3746         page_counter_set_low(&memcg->memory, 0);
3747         page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX);
3748         memcg1_soft_limit_reset(memcg);
3749         page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX);
3750         memcg_wb_domain_size_changed(memcg);
3751 }
3752
/*
 * Bundle of counter arrays handed to mem_cgroup_stat_aggregate(). All
 * arrays are indexed 0..size-1.
 */
struct aggregate_control {
	/* aggregated counters (summed over CPUs and the subtree) */
	long *aggregate;
	/* non-hierarchical counters (summed over CPUs only) */
	long *local;
	/* counts from children still pending during tree propagation */
	long *pending;
	/* the parent's pending counters; may be NULL */
	long *ppending;
	/* per-cpu counters that are to be aggregated */
	long *cstat;
	/* per-cpu counter values as of the last aggregation */
	long *cstat_prev;
	/* number of entries in each of the arrays above */
	int size;
};
3769
3770 static void mem_cgroup_stat_aggregate(struct aggregate_control *ac)
3771 {
3772         int i;
3773         long delta, delta_cpu, v;
3774
3775         for (i = 0; i < ac->size; i++) {
3776                 /*
3777                  * Collect the aggregated propagation counts of groups
3778                  * below us. We're in a per-cpu loop here and this is
3779                  * a global counter, so the first cycle will get them.
3780                  */
3781                 delta = ac->pending[i];
3782                 if (delta)
3783                         ac->pending[i] = 0;
3784
3785                 /* Add CPU changes on this level since the last flush */
3786                 delta_cpu = 0;
3787                 v = READ_ONCE(ac->cstat[i]);
3788                 if (v != ac->cstat_prev[i]) {
3789                         delta_cpu = v - ac->cstat_prev[i];
3790                         delta += delta_cpu;
3791                         ac->cstat_prev[i] = v;
3792                 }
3793
3794                 /* Aggregate counts on this level and propagate upwards */
3795                 if (delta_cpu)
3796                         ac->local[i] += delta_cpu;
3797
3798                 if (delta) {
3799                         ac->aggregate[i] += delta;
3800                         if (ac->ppending)
3801                                 ac->ppending[i] += delta;
3802                 }
3803         }
3804 }
3805
3806 static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
3807 {
3808         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3809         struct mem_cgroup *parent = parent_mem_cgroup(memcg);
3810         struct memcg_vmstats_percpu *statc;
3811         struct aggregate_control ac;
3812         int nid;
3813
3814         statc = per_cpu_ptr(memcg->vmstats_percpu, cpu);
3815
3816         ac = (struct aggregate_control) {
3817                 .aggregate = memcg->vmstats->state,
3818                 .local = memcg->vmstats->state_local,
3819                 .pending = memcg->vmstats->state_pending,
3820                 .ppending = parent ? parent->vmstats->state_pending : NULL,
3821                 .cstat = statc->state,
3822                 .cstat_prev = statc->state_prev,
3823                 .size = MEMCG_VMSTAT_SIZE,
3824         };
3825         mem_cgroup_stat_aggregate(&ac);
3826
3827         ac = (struct aggregate_control) {
3828                 .aggregate = memcg->vmstats->events,
3829                 .local = memcg->vmstats->events_local,
3830                 .pending = memcg->vmstats->events_pending,
3831                 .ppending = parent ? parent->vmstats->events_pending : NULL,
3832                 .cstat = statc->events,
3833                 .cstat_prev = statc->events_prev,
3834                 .size = NR_MEMCG_EVENTS,
3835         };
3836         mem_cgroup_stat_aggregate(&ac);
3837
3838         for_each_node_state(nid, N_MEMORY) {
3839                 struct mem_cgroup_per_node *pn = memcg->nodeinfo[nid];
3840                 struct lruvec_stats *lstats = pn->lruvec_stats;
3841                 struct lruvec_stats *plstats = NULL;
3842                 struct lruvec_stats_percpu *lstatc;
3843
3844                 if (parent)
3845                         plstats = parent->nodeinfo[nid]->lruvec_stats;
3846
3847                 lstatc = per_cpu_ptr(pn->lruvec_stats_percpu, cpu);
3848
3849                 ac = (struct aggregate_control) {
3850                         .aggregate = lstats->state,
3851                         .local = lstats->state_local,
3852                         .pending = lstats->state_pending,
3853                         .ppending = plstats ? plstats->state_pending : NULL,
3854                         .cstat = lstatc->state,
3855                         .cstat_prev = lstatc->state_prev,
3856                         .size = NR_MEMCG_NODE_STAT_ITEMS,
3857                 };
3858                 mem_cgroup_stat_aggregate(&ac);
3859
3860         }
3861         WRITE_ONCE(statc->stats_updates, 0);
3862         /* We are in a per-cpu loop here, only do the atomic write once */
3863         if (atomic64_read(&memcg->vmstats->stats_updates))
3864                 atomic64_set(&memcg->vmstats->stats_updates, 0);
3865 }
3866
3867 static void mem_cgroup_fork(struct task_struct *task)
3868 {
3869         /*
3870          * Set the update flag to cause task->objcg to be initialized lazily
3871          * on the first allocation. It can be done without any synchronization
3872          * because it's always performed on the current task, so does
3873          * current_objcg_update().
3874          */
3875         task->objcg = (struct obj_cgroup *)CURRENT_OBJCG_UPDATE_FLAG;
3876 }
3877
3878 static void mem_cgroup_exit(struct task_struct *task)
3879 {
3880         struct obj_cgroup *objcg = task->objcg;
3881
3882         objcg = (struct obj_cgroup *)
3883                 ((unsigned long)objcg & ~CURRENT_OBJCG_UPDATE_FLAG);
3884         obj_cgroup_put(objcg);
3885
3886         /*
3887          * Some kernel allocations can happen after this point,
3888          * but let's ignore them. It can be done without any synchronization
3889          * because it's always performed on the current task, so does
3890          * current_objcg_update().
3891          */
3892         task->objcg = NULL;
3893 }
3894
#ifdef CONFIG_LRU_GEN
/*
 * Migrate the mm of the first thread-group leader in @tset, provided that
 * leader still has an mm and is the owner of it.
 */
static void mem_cgroup_lru_gen_attach(struct cgroup_taskset *tset)
{
	struct cgroup_subsys_state *css;
	struct task_struct *leader;

	/* stop at the first leader, if there is one at all */
	cgroup_taskset_for_each_leader(leader, css, tset)
		break;

	if (!leader)
		return;

	task_lock(leader);
	if (leader->mm && READ_ONCE(leader->mm->owner) == leader)
		lru_gen_migrate_mm(leader->mm);
	task_unlock(leader);
}
#else
/* !CONFIG_LRU_GEN stub: nothing to migrate. */
static void mem_cgroup_lru_gen_attach(struct cgroup_taskset *tset)
{
}
#endif /* CONFIG_LRU_GEN */
3916
3917 static void mem_cgroup_kmem_attach(struct cgroup_taskset *tset)
3918 {
3919         struct task_struct *task;
3920         struct cgroup_subsys_state *css;
3921
3922         cgroup_taskset_for_each(task, css, tset) {
3923                 /* atomically set the update bit */
3924                 set_bit(CURRENT_OBJCG_UPDATE_BIT, (unsigned long *)&task->objcg);
3925         }
3926 }
3927
/* Attach callback: run the lru_gen and kmem attach handling for @tset. */
static void mem_cgroup_attach(struct cgroup_taskset *tset)
{
	mem_cgroup_lru_gen_attach(tset);
	mem_cgroup_kmem_attach(tset);
}
3933
3934 static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value)
3935 {
3936         if (value == PAGE_COUNTER_MAX)
3937                 seq_puts(m, "max\n");
3938         else
3939                 seq_printf(m, "%llu\n", (u64)value * PAGE_SIZE);
3940
3941         return 0;
3942 }
3943
3944 static u64 memory_current_read(struct cgroup_subsys_state *css,
3945                                struct cftype *cft)
3946 {
3947         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3948
3949         return (u64)page_counter_read(&memcg->memory) * PAGE_SIZE;
3950 }
3951
3952 #define OFP_PEAK_UNSET (((-1UL)))
3953
3954 static int peak_show(struct seq_file *sf, void *v, struct page_counter *pc)
3955 {
3956         struct cgroup_of_peak *ofp = of_peak(sf->private);
3957         u64 fd_peak = READ_ONCE(ofp->value), peak;
3958
3959         /* User wants global or local peak? */
3960         if (fd_peak == OFP_PEAK_UNSET)
3961                 peak = pc->watermark;
3962         else
3963                 peak = max(fd_peak, READ_ONCE(pc->local_watermark));
3964
3965         seq_printf(sf, "%llu\n", peak * PAGE_SIZE);
3966         return 0;
3967 }
3968
3969 static int memory_peak_show(struct seq_file *sf, void *v)
3970 {
3971         struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf));
3972
3973         return peak_show(sf, v, &memcg->memory);
3974 }
3975
3976 static int peak_open(struct kernfs_open_file *of)
3977 {
3978         struct cgroup_of_peak *ofp = of_peak(of);
3979
3980         ofp->value = OFP_PEAK_UNSET;
3981         return 0;
3982 }
3983
3984 static void peak_release(struct kernfs_open_file *of)
3985 {
3986         struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
3987         struct cgroup_of_peak *ofp = of_peak(of);
3988
3989         if (ofp->value == OFP_PEAK_UNSET) {
3990                 /* fast path (no writes on this fd) */
3991                 return;
3992         }
3993         spin_lock(&memcg->peaks_lock);
3994         list_del(&ofp->list);
3995         spin_unlock(&memcg->peaks_lock);
3996 }
3997
3998 static ssize_t peak_write(struct kernfs_open_file *of, char *buf, size_t nbytes,
3999                           loff_t off, struct page_counter *pc,
4000                           struct list_head *watchers)
4001 {
4002         unsigned long usage;
4003         struct cgroup_of_peak *peer_ctx;
4004         struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
4005         struct cgroup_of_peak *ofp = of_peak(of);
4006
4007         spin_lock(&memcg->peaks_lock);
4008
4009         usage = page_counter_read(pc);
4010         WRITE_ONCE(pc->local_watermark, usage);
4011
4012         list_for_each_entry(peer_ctx, watchers, list)
4013                 if (usage > peer_ctx->value)
4014                         WRITE_ONCE(peer_ctx->value, usage);
4015
4016         /* initial write, register watcher */
4017         if (ofp->value == -1)
4018                 list_add(&ofp->list, watchers);
4019
4020         WRITE_ONCE(ofp->value, usage);
4021         spin_unlock(&memcg->peaks_lock);
4022
4023         return nbytes;
4024 }
4025
4026 static ssize_t memory_peak_write(struct kernfs_open_file *of, char *buf,
4027                                  size_t nbytes, loff_t off)
4028 {
4029         struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
4030
4031         return peak_write(of, buf, nbytes, off, &memcg->memory,
4032                           &memcg->memory_peaks);
4033 }
4034
4035 #undef OFP_PEAK_UNSET
4036
4037 static int memory_min_show(struct seq_file *m, void *v)
4038 {
4039         return seq_puts_memcg_tunable(m,
4040                 READ_ONCE(mem_cgroup_from_seq(m)->memory.min));
4041 }
4042
4043 static ssize_t memory_min_write(struct kernfs_open_file *of,
4044                                 char *buf, size_t nbytes, loff_t off)
4045 {
4046         struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
4047         unsigned long min;
4048         int err;
4049
4050         buf = strstrip(buf);
4051         err = page_counter_memparse(buf, "max", &min);
4052         if (err)
4053                 return err;
4054
4055         page_counter_set_min(&memcg->memory, min);
4056
4057         return nbytes;
4058 }
4059
4060 static int memory_low_show(struct seq_file *m, void *v)
4061 {
4062         return seq_puts_memcg_tunable(m,
4063                 READ_ONCE(mem_cgroup_from_seq(m)->memory.low));
4064 }
4065
4066 static ssize_t memory_low_write(struct kernfs_open_file *of,
4067                                 char *buf, size_t nbytes, loff_t off)
4068 {
4069         struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
4070         unsigned long low;
4071         int err;
4072
4073         buf = strstrip(buf);
4074         err = page_counter_memparse(buf, "max", &low);
4075         if (err)
4076                 return err;
4077
4078         page_counter_set_low(&memcg->memory, low);
4079
4080         return nbytes;
4081 }
4082
4083 static int memory_high_show(struct seq_file *m, void *v)
4084 {
4085         return seq_puts_memcg_tunable(m,
4086                 READ_ONCE(mem_cgroup_from_seq(m)->memory.high));
4087 }
4088
4089 static ssize_t memory_high_write(struct kernfs_open_file *of,
4090                                  char *buf, size_t nbytes, loff_t off)
4091 {
4092         struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
4093         unsigned int nr_retries = MAX_RECLAIM_RETRIES;
4094         bool drained = false;
4095         unsigned long high;
4096         int err;
4097
4098         buf = strstrip(buf);
4099         err = page_counter_memparse(buf, "max", &high);
4100         if (err)
4101                 return err;
4102
4103         page_counter_set_high(&memcg->memory, high);
4104
4105         for (;;) {
4106                 unsigned long nr_pages = page_counter_read(&memcg->memory);
4107                 unsigned long reclaimed;
4108
4109                 if (nr_pages <= high)
4110                         break;
4111
4112                 if (signal_pending(current))
4113                         break;
4114
4115                 if (!drained) {
4116                         drain_all_stock(memcg);
4117                         drained = true;
4118                         continue;
4119                 }
4120
4121                 reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_pages - high,
4122                                         GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP, NULL);
4123
4124                 if (!reclaimed && !nr_retries--)
4125                         break;
4126         }
4127
4128         memcg_wb_domain_size_changed(memcg);
4129         return nbytes;
4130 }
4131
4132 static int memory_max_show(struct seq_file *m, void *v)
4133 {
4134         return seq_puts_memcg_tunable(m,
4135                 READ_ONCE(mem_cgroup_from_seq(m)->memory.max));
4136 }
4137
4138 static ssize_t memory_max_write(struct kernfs_open_file *of,
4139                                 char *buf, size_t nbytes, loff_t off)
4140 {
4141         struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
4142         unsigned int nr_reclaims = MAX_RECLAIM_RETRIES;
4143         bool drained = false;
4144         unsigned long max;
4145         int err;
4146
4147         buf = strstrip(buf);
4148         err = page_counter_memparse(buf, "max", &max);
4149         if (err)
4150                 return err;
4151
4152         xchg(&memcg->memory.max, max);
4153
4154         for (;;) {
4155                 unsigned long nr_pages = page_counter_read(&memcg->memory);
4156
4157                 if (nr_pages <= max)
4158                         break;
4159
4160                 if (signal_pending(current))
4161                         break;
4162
4163                 if (!drained) {
4164                         drain_all_stock(memcg);
4165                         drained = true;
4166                         continue;
4167                 }
4168
4169                 if (nr_reclaims) {
4170                         if (!try_to_free_mem_cgroup_pages(memcg, nr_pages - max,
4171                                         GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP, NULL))
4172                                 nr_reclaims--;
4173                         continue;
4174                 }
4175
4176                 memcg_memory_event(memcg, MEMCG_OOM);
4177                 if (!mem_cgroup_out_of_memory(memcg, GFP_KERNEL, 0))
4178                         break;
4179         }
4180
4181         memcg_wb_domain_size_changed(memcg);
4182         return nbytes;
4183 }
4184
4185 /*
4186  * Note: don't forget to update the 'samples/cgroup/memcg_event_listener'
4187  * if any new events become available.
4188  */
4189 static void __memory_events_show(struct seq_file *m, atomic_long_t *events)
4190 {
4191         seq_printf(m, "low %lu\n", atomic_long_read(&events[MEMCG_LOW]));
4192         seq_printf(m, "high %lu\n", atomic_long_read(&events[MEMCG_HIGH]));
4193         seq_printf(m, "max %lu\n", atomic_long_read(&events[MEMCG_MAX]));
4194         seq_printf(m, "oom %lu\n", atomic_long_read(&events[MEMCG_OOM]));
4195         seq_printf(m, "oom_kill %lu\n",
4196                    atomic_long_read(&events[MEMCG_OOM_KILL]));
4197         seq_printf(m, "oom_group_kill %lu\n",
4198                    atomic_long_read(&events[MEMCG_OOM_GROUP_KILL]));
4199 }
4200
4201 static int memory_events_show(struct seq_file *m, void *v)
4202 {
4203         struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
4204
4205         __memory_events_show(m, memcg->memory_events);
4206         return 0;
4207 }
4208
4209 static int memory_events_local_show(struct seq_file *m, void *v)
4210 {
4211         struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
4212
4213         __memory_events_show(m, memcg->memory_events_local);
4214         return 0;
4215 }
4216
4217 int memory_stat_show(struct seq_file *m, void *v)
4218 {
4219         struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
4220         char *buf = kmalloc(SEQ_BUF_SIZE, GFP_KERNEL);
4221         struct seq_buf s;
4222
4223         if (!buf)
4224                 return -ENOMEM;
4225         seq_buf_init(&s, buf, SEQ_BUF_SIZE);
4226         memory_stat_format(memcg, &s);
4227         seq_puts(m, buf);
4228         kfree(buf);
4229         return 0;
4230 }
4231
4232 #ifdef CONFIG_NUMA
/* Return @lruvec's counter for @item scaled by the item's output unit. */
static inline unsigned long lruvec_page_state_output(struct lruvec *lruvec,
                                                     int item)
{
        unsigned long count = lruvec_page_state(lruvec, item);

        return count * memcg_page_state_output_unit(item);
}
4239
4240 static int memory_numa_stat_show(struct seq_file *m, void *v)
4241 {
4242         int i;
4243         struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
4244
4245         mem_cgroup_flush_stats(memcg);
4246
4247         for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
4248                 int nid;
4249
4250                 if (memory_stats[i].idx >= NR_VM_NODE_STAT_ITEMS)
4251                         continue;
4252
4253                 seq_printf(m, "%s", memory_stats[i].name);
4254                 for_each_node_state(nid, N_MEMORY) {
4255                         u64 size;
4256                         struct lruvec *lruvec;
4257
4258                         lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
4259                         size = lruvec_page_state_output(lruvec,
4260                                                         memory_stats[i].idx);
4261                         seq_printf(m, " N%d=%llu", nid, size);
4262                 }
4263                 seq_putc(m, '\n');
4264         }
4265
4266         return 0;
4267 }
4268 #endif
4269
4270 static int memory_oom_group_show(struct seq_file *m, void *v)
4271 {
4272         struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
4273
4274         seq_printf(m, "%d\n", READ_ONCE(memcg->oom_group));
4275
4276         return 0;
4277 }
4278
4279 static ssize_t memory_oom_group_write(struct kernfs_open_file *of,
4280                                       char *buf, size_t nbytes, loff_t off)
4281 {
4282         struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
4283         int ret, oom_group;
4284
4285         buf = strstrip(buf);
4286         if (!buf)
4287                 return -EINVAL;
4288
4289         ret = kstrtoint(buf, 0, &oom_group);
4290         if (ret)
4291                 return ret;
4292
4293         if (oom_group != 0 && oom_group != 1)
4294                 return -EINVAL;
4295
4296         WRITE_ONCE(memcg->oom_group, oom_group);
4297
4298         return nbytes;
4299 }
4300
/* Token ids for the optional arguments parsed by memory_reclaim(). */
enum {
        MEMORY_RECLAIM_SWAPPINESS = 0,
        MEMORY_RECLAIM_NULL,
};
4305
/*
 * match_token() table used by memory_reclaim(): maps "swappiness=<int>" to
 * MEMORY_RECLAIM_SWAPPINESS. The NULL pattern terminates the table.
 */
static const match_table_t tokens = {
        { MEMORY_RECLAIM_SWAPPINESS, "swappiness=%d"},
        { MEMORY_RECLAIM_NULL, NULL },
};
4310
/*
 * Write handler for the "reclaim" control file: reclaim the requested amount
 * of memory from the memcg.
 *
 * Input is a size understood by memparse(), optionally followed by
 * space-separated options. The only option in the tokens table is
 * "swappiness=<int>", which must lie in [MIN_SWAPPINESS, MAX_SWAPPINESS].
 *
 * Returns @nbytes once at least the requested number of pages has been
 * reclaimed, -EINVAL on malformed input, -EINTR if a signal is pending, or
 * -EAGAIN when reclaim makes no progress and the retry budget is exhausted.
 * The -EINTR and -EAGAIN returns can occur after some pages were reclaimed.
 */
static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf,
                              size_t nbytes, loff_t off)
{
        struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
        unsigned int nr_retries = MAX_RECLAIM_RETRIES;
        unsigned long nr_to_reclaim, nr_reclaimed = 0;
        int swappiness = -1;
        unsigned int reclaim_options;
        char *old_buf, *start;
        substring_t args[MAX_OPT_ARGS];

        buf = strstrip(buf);

        /* memparse() advances buf; no movement means no number was parsed. */
        old_buf = buf;
        nr_to_reclaim = memparse(buf, &buf) / PAGE_SIZE;
        if (buf == old_buf)
                return -EINVAL;

        buf = strstrip(buf);

        /* Parse the remaining space-separated options; empty tokens are skipped. */
        while ((start = strsep(&buf, " ")) != NULL) {
                if (!strlen(start))
                        continue;
                switch (match_token(start, tokens, args)) {
                case MEMORY_RECLAIM_SWAPPINESS:
                        if (match_int(&args[0], &swappiness))
                                return -EINVAL;
                        if (swappiness < MIN_SWAPPINESS || swappiness > MAX_SWAPPINESS)
                                return -EINVAL;
                        break;
                default:
                        return -EINVAL;
                }
        }

        reclaim_options = MEMCG_RECLAIM_MAY_SWAP | MEMCG_RECLAIM_PROACTIVE;
        while (nr_reclaimed < nr_to_reclaim) {
                /* Will converge on zero, but reclaim enforces a minimum */
                unsigned long batch_size = (nr_to_reclaim - nr_reclaimed) / 4;
                unsigned long reclaimed;

                if (signal_pending(current))
                        return -EINTR;

                /*
                 * This is the final attempt, drain percpu lru caches in the
                 * hope of introducing more evictable pages for
                 * try_to_free_mem_cgroup_pages().
                 */
                if (!nr_retries)
                        lru_add_drain_all();

                /* swappiness == -1 means the option was not given: pass NULL. */
                reclaimed = try_to_free_mem_cgroup_pages(memcg,
                                        batch_size, GFP_KERNEL,
                                        reclaim_options,
                                        swappiness == -1 ? NULL : &swappiness);

                /*
                 * The retry counter is only consumed by passes that reclaim
                 * nothing; give up once it has already reached zero.
                 */
                if (!reclaimed && !nr_retries--)
                        return -EAGAIN;

                nr_reclaimed += reclaimed;
        }

        return nbytes;
}
4376
/*
 * Control files of the memory controller, installed through
 * memory_cgrp_subsys.dfl_cftypes. Each entry names a file and wires up its
 * show/read/write handlers. The empty entry terminates the table.
 */
static struct cftype memory_files[] = {
        {
                .name = "current",
                .flags = CFTYPE_NOT_ON_ROOT,
                .read_u64 = memory_current_read,
        },
        {
                .name = "peak",
                .flags = CFTYPE_NOT_ON_ROOT,
                .open = peak_open,
                .release = peak_release,
                .seq_show = memory_peak_show,
                .write = memory_peak_write,
        },
        {
                .name = "min",
                .flags = CFTYPE_NOT_ON_ROOT,
                .seq_show = memory_min_show,
                .write = memory_min_write,
        },
        {
                .name = "low",
                .flags = CFTYPE_NOT_ON_ROOT,
                .seq_show = memory_low_show,
                .write = memory_low_write,
        },
        {
                .name = "high",
                .flags = CFTYPE_NOT_ON_ROOT,
                .seq_show = memory_high_show,
                .write = memory_high_write,
        },
        {
                .name = "max",
                .flags = CFTYPE_NOT_ON_ROOT,
                .seq_show = memory_max_show,
                .write = memory_max_write,
        },
        {
                .name = "events",
                .flags = CFTYPE_NOT_ON_ROOT,
                .file_offset = offsetof(struct mem_cgroup, events_file),
                .seq_show = memory_events_show,
        },
        {
                .name = "events.local",
                .flags = CFTYPE_NOT_ON_ROOT,
                .file_offset = offsetof(struct mem_cgroup, events_local_file),
                .seq_show = memory_events_local_show,
        },
        {
                .name = "stat",
                .seq_show = memory_stat_show,
        },
#ifdef CONFIG_NUMA
        {
                .name = "numa_stat",
                .seq_show = memory_numa_stat_show,
        },
#endif
        {
                .name = "oom.group",
                .flags = CFTYPE_NOT_ON_ROOT | CFTYPE_NS_DELEGATABLE,
                .seq_show = memory_oom_group_show,
                .write = memory_oom_group_write,
        },
        {
                .name = "reclaim",
                .flags = CFTYPE_NS_DELEGATABLE,
                .write = memory_reclaim,
        },
        { }     /* terminate */
};
4450
/*
 * cgroup subsystem descriptor for the memory controller: css lifecycle
 * callbacks, task attach/fork/exit hooks and the control file tables.
 * The legacy (v1) file table is only present with CONFIG_MEMCG_V1.
 */
struct cgroup_subsys memory_cgrp_subsys = {
        .css_alloc = mem_cgroup_css_alloc,
        .css_online = mem_cgroup_css_online,
        .css_offline = mem_cgroup_css_offline,
        .css_released = mem_cgroup_css_released,
        .css_free = mem_cgroup_css_free,
        .css_reset = mem_cgroup_css_reset,
        .css_rstat_flush = mem_cgroup_css_rstat_flush,
        .attach = mem_cgroup_attach,
        .fork = mem_cgroup_fork,
        .exit = mem_cgroup_exit,
        .dfl_cftypes = memory_files,
#ifdef CONFIG_MEMCG_V1
        .legacy_cftypes = mem_cgroup_legacy_files,
#endif
        .early_init = 0,
};
4468
4469 /**
4470  * mem_cgroup_calculate_protection - check if memory consumption is in the normal range
4471  * @root: the top ancestor of the sub-tree being checked
4472  * @memcg: the memory cgroup to check
4473  *
4474  * WARNING: This function is not stateless! It can only be used as part
4475  *          of a top-down tree iteration, not for isolated queries.
4476  */
4477 void mem_cgroup_calculate_protection(struct mem_cgroup *root,
4478                                      struct mem_cgroup *memcg)
4479 {
4480         bool recursive_protection =
4481                 cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT;
4482
4483         if (mem_cgroup_disabled())
4484                 return;
4485
4486         if (!root)
4487                 root = root_mem_cgroup;
4488
4489         page_counter_calculate_protection(&root->memory, &memcg->memory, recursive_protection);
4490 }
4491
4492 static int charge_memcg(struct folio *folio, struct mem_cgroup *memcg,
4493                         gfp_t gfp)
4494 {
4495         int ret;
4496
4497         ret = try_charge(memcg, gfp, folio_nr_pages(folio));
4498         if (ret)
4499                 goto out;
4500
4501         mem_cgroup_commit_charge(folio, memcg);
4502 out:
4503         return ret;
4504 }
4505
4506 int __mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp)
4507 {
4508         struct mem_cgroup *memcg;
4509         int ret;
4510
4511         memcg = get_mem_cgroup_from_mm(mm);
4512         ret = charge_memcg(folio, memcg, gfp);
4513         css_put(&memcg->css);
4514
4515         return ret;
4516 }
4517
4518 /**
4519  * mem_cgroup_hugetlb_try_charge - try to charge the memcg for a hugetlb folio
4520  * @memcg: memcg to charge.
4521  * @gfp: reclaim mode.
4522  * @nr_pages: number of pages to charge.
4523  *
4524  * This function is called when allocating a huge page folio to determine if
4525  * the memcg has the capacity for it. It does not commit the charge yet,
4526  * as the hugetlb folio itself has not been obtained from the hugetlb pool.
4527  *
4528  * Once we have obtained the hugetlb folio, we can call
4529  * mem_cgroup_commit_charge() to commit the charge. If we fail to obtain the
4530  * folio, we should instead call mem_cgroup_cancel_charge() to undo the effect
4531  * of try_charge().
4532  *
4533  * Returns 0 on success. Otherwise, an error code is returned.
4534  */
4535 int mem_cgroup_hugetlb_try_charge(struct mem_cgroup *memcg, gfp_t gfp,
4536                         long nr_pages)
4537 {
4538         /*
4539          * If hugetlb memcg charging is not enabled, do not fail hugetlb allocation,
4540          * but do not attempt to commit charge later (or cancel on error) either.
4541          */
4542         if (mem_cgroup_disabled() || !memcg ||
4543                 !cgroup_subsys_on_dfl(memory_cgrp_subsys) ||
4544                 !(cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING))
4545                 return -EOPNOTSUPP;
4546
4547         if (try_charge(memcg, gfp, nr_pages))
4548                 return -ENOMEM;
4549
4550         return 0;
4551 }
4552
4553 /**
4554  * mem_cgroup_swapin_charge_folio - Charge a newly allocated folio for swapin.
4555  * @folio: folio to charge.
4556  * @mm: mm context of the victim
4557  * @gfp: reclaim mode
4558  * @entry: swap entry for which the folio is allocated
4559  *
4560  * This function charges a folio allocated for swapin. Please call this before
4561  * adding the folio to the swapcache.
4562  *
4563  * Returns 0 on success. Otherwise, an error code is returned.
4564  */
4565 int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm,
4566                                   gfp_t gfp, swp_entry_t entry)
4567 {
4568         struct mem_cgroup *memcg;
4569         unsigned short id;
4570         int ret;
4571
4572         if (mem_cgroup_disabled())
4573                 return 0;
4574
4575         id = lookup_swap_cgroup_id(entry);
4576         rcu_read_lock();
4577         memcg = mem_cgroup_from_id(id);
4578         if (!memcg || !css_tryget_online(&memcg->css))
4579                 memcg = get_mem_cgroup_from_mm(mm);
4580         rcu_read_unlock();
4581
4582         ret = charge_memcg(folio, memcg, gfp);
4583
4584         css_put(&memcg->css);
4585         return ret;
4586 }
4587
4588 /*
4589  * mem_cgroup_swapin_uncharge_swap - uncharge swap slot
4590  * @entry: the first swap entry for which the pages are charged
4591  * @nr_pages: number of pages which will be uncharged
4592  *
4593  * Call this function after successfully adding the charged page to swapcache.
4594  *
4595  * Note: This function assumes the page for which swap slot is being uncharged
4596  * is order 0 page.
4597  */
4598 void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry, unsigned int nr_pages)
4599 {
4600         /*
4601          * Cgroup1's unified memory+swap counter has been charged with the
4602          * new swapcache page, finish the transfer by uncharging the swap
4603          * slot. The swap slot would also get uncharged when it dies, but
4604          * it can stick around indefinitely and we'd count the page twice
4605          * the entire time.
4606          *
4607          * Cgroup2 has separate resource counters for memory and swap,
4608          * so this is a non-issue here. Memory and swap charge lifetimes
4609          * correspond 1:1 to page and swap slot lifetimes: we charge the
4610          * page to memory here, and uncharge swap when the slot is freed.
4611          */
4612         if (!mem_cgroup_disabled() && do_memsw_account()) {
4613                 /*
4614                  * The swap entry might not get freed for a long time,
4615                  * let's not wait for it.  The page already received a
4616                  * memory+swap charge, drop the swap entry duplicate.
4617                  */
4618                 mem_cgroup_uncharge_swap(entry, nr_pages);
4619         }
4620 }
4621
/*
 * Accumulator used to batch the uncharging of consecutive folios that belong
 * to the same memcg; filled by uncharge_folio(), flushed by uncharge_batch().
 */
struct uncharge_gather {
        /* memcg the gathered folios belong to; NULL while nothing is gathered */
        struct mem_cgroup *memcg;
        /* pages to uncharge from the memory (and memsw) page counters */
        unsigned long nr_memory;
        /* number of non-kmem folios gathered (one per folio, not per page) */
        unsigned long pgpgout;
        /* pages of kmem folios gathered; also included in nr_memory */
        unsigned long nr_kmem;
        /* node id of the folio that started the current batch */
        int nid;
};
4629
4630 static inline void uncharge_gather_clear(struct uncharge_gather *ug)
4631 {
4632         memset(ug, 0, sizeof(*ug));
4633 }
4634
4635 static void uncharge_batch(const struct uncharge_gather *ug)
4636 {
4637         if (ug->nr_memory) {
4638                 page_counter_uncharge(&ug->memcg->memory, ug->nr_memory);
4639                 if (do_memsw_account())
4640                         page_counter_uncharge(&ug->memcg->memsw, ug->nr_memory);
4641                 if (ug->nr_kmem) {
4642                         mod_memcg_state(ug->memcg, MEMCG_KMEM, -ug->nr_kmem);
4643                         memcg1_account_kmem(ug->memcg, -ug->nr_kmem);
4644                 }
4645                 memcg1_oom_recover(ug->memcg);
4646         }
4647
4648         memcg1_uncharge_batch(ug->memcg, ug->pgpgout, ug->nr_memory, ug->nid);
4649
4650         /* drop reference from uncharge_folio */
4651         css_put(&ug->memcg->css);
4652 }
4653
/*
 * Add one folio's charge to the gather @ug and clear the folio's memcg_data.
 *
 * If the folio belongs to a different memcg than the one currently being
 * gathered, the pending batch is flushed with uncharge_batch() first and a
 * new batch is started for the folio's memcg. A folio without a memcg is
 * left untouched. The counters themselves are only updated when the batch is
 * flushed.
 */
static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug)
{
        long nr_pages;
        struct mem_cgroup *memcg;
        struct obj_cgroup *objcg;

        VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);

        /*
         * Nobody should be changing or seriously looking at
         * folio memcg or objcg at this point, we have fully
         * exclusive access to the folio.
         */
        if (folio_memcg_kmem(folio)) {
                objcg = __folio_objcg(folio);
                /*
                 * This get matches the put at the end of the function and
                 * kmem pages do not hold memcg references anymore.
                 */
                memcg = get_mem_cgroup_from_objcg(objcg);
        } else {
                memcg = __folio_memcg(folio);
        }

        if (!memcg)
                return;

        /* Different owner than the current batch: flush and start over. */
        if (ug->memcg != memcg) {
                if (ug->memcg) {
                        uncharge_batch(ug);
                        uncharge_gather_clear(ug);
                }
                ug->memcg = memcg;
                ug->nid = folio_nid(folio);

                /* pairs with css_put in uncharge_batch */
                css_get(&memcg->css);
        }

        nr_pages = folio_nr_pages(folio);

        if (folio_memcg_kmem(folio)) {
                ug->nr_memory += nr_pages;
                ug->nr_kmem += nr_pages;

                folio->memcg_data = 0;
                obj_cgroup_put(objcg);
        } else {
                /* LRU pages aren't accounted at the root level */
                if (!mem_cgroup_is_root(memcg))
                        ug->nr_memory += nr_pages;
                ug->pgpgout++;

                WARN_ON_ONCE(folio_unqueue_deferred_split(folio));
                folio->memcg_data = 0;
        }

        /*
         * Drop one css reference now that memcg_data is cleared. For kmem
         * folios this is the reference taken above; for the other folios it
         * is presumably the one held by the folio's charge - confirm against
         * commit_charge() callers.
         */
        css_put(&memcg->css);
}
4713
4714 void __mem_cgroup_uncharge(struct folio *folio)
4715 {
4716         struct uncharge_gather ug;
4717
4718         /* Don't touch folio->lru of any random page, pre-check: */
4719         if (!folio_memcg_charged(folio))
4720                 return;
4721
4722         uncharge_gather_clear(&ug);
4723         uncharge_folio(folio, &ug);
4724         uncharge_batch(&ug);
4725 }
4726
4727 void __mem_cgroup_uncharge_folios(struct folio_batch *folios)
4728 {
4729         struct uncharge_gather ug;
4730         unsigned int i;
4731
4732         uncharge_gather_clear(&ug);
4733         for (i = 0; i < folios->nr; i++)
4734                 uncharge_folio(folios->folios[i], &ug);
4735         if (ug.memcg)
4736                 uncharge_batch(&ug);
4737 }
4738
4739 /**
4740  * mem_cgroup_replace_folio - Charge a folio's replacement.
4741  * @old: Currently circulating folio.
4742  * @new: Replacement folio.
4743  *
4744  * Charge @new as a replacement folio for @old. @old will
4745  * be uncharged upon free.
4746  *
4747  * Both folios must be locked, @new->mapping must be set up.
4748  */
4749 void mem_cgroup_replace_folio(struct folio *old, struct folio *new)
4750 {
4751         struct mem_cgroup *memcg;
4752         long nr_pages = folio_nr_pages(new);
4753
4754         VM_BUG_ON_FOLIO(!folio_test_locked(old), old);
4755         VM_BUG_ON_FOLIO(!folio_test_locked(new), new);
4756         VM_BUG_ON_FOLIO(folio_test_anon(old) != folio_test_anon(new), new);
4757         VM_BUG_ON_FOLIO(folio_nr_pages(old) != nr_pages, new);
4758
4759         if (mem_cgroup_disabled())
4760                 return;
4761
4762         /* Page cache replacement: new folio already charged? */
4763         if (folio_memcg_charged(new))
4764                 return;
4765
4766         memcg = folio_memcg(old);
4767         VM_WARN_ON_ONCE_FOLIO(!memcg, old);
4768         if (!memcg)
4769                 return;
4770
4771         /* Force-charge the new page. The old one will be freed soon */
4772         if (!mem_cgroup_is_root(memcg)) {
4773                 page_counter_charge(&memcg->memory, nr_pages);
4774                 if (do_memsw_account())
4775                         page_counter_charge(&memcg->memsw, nr_pages);
4776         }
4777
4778         css_get(&memcg->css);
4779         commit_charge(new, memcg);
4780         memcg1_commit_charge(new, memcg);
4781 }
4782
/**
 * mem_cgroup_migrate - Transfer the memcg data from the old to the new folio.
 * @old: Currently circulating folio.
 * @new: Replacement folio.
 *
 * Transfer the memcg data from the old folio to the new folio for migration.
 * The old folio's data info will be cleared. Note that the memory counters
 * will remain unchanged throughout the process.
 *
 * Both folios must be locked, @new->mapping must be set up.
 */
void mem_cgroup_migrate(struct folio *old, struct folio *new)
{
        struct mem_cgroup *memcg;

        VM_BUG_ON_FOLIO(!folio_test_locked(old), old);
        VM_BUG_ON_FOLIO(!folio_test_locked(new), new);
        VM_BUG_ON_FOLIO(folio_test_anon(old) != folio_test_anon(new), new);
        VM_BUG_ON_FOLIO(folio_nr_pages(old) != folio_nr_pages(new), new);
        VM_BUG_ON_FOLIO(folio_test_lru(old), old);

        if (mem_cgroup_disabled())
                return;

        memcg = folio_memcg(old);
        /*
         * Note that it is normal to see !memcg for a hugetlb folio.
         * E.g., it could have been allocated when memory_hugetlb_accounting
         * was not selected.
         */
        VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(old) && !memcg, old);
        if (!memcg)
                return;

        /* Transfer the charge and the css ref */
        commit_charge(new, memcg);

        /* Warning should never happen, so don't worry about refcount non-0 */
        WARN_ON_ONCE(folio_unqueue_deferred_split(old));
        /* The old folio no longer owns the charge. */
        old->memcg_data = 0;
}
4824
/*
 * Static key, initially false, exported for use by modules. Presumably this
 * backs the mem_cgroup_sockets_enabled test used by the socket hooks in this
 * file - confirm against the header.
 */
DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key);
EXPORT_SYMBOL(memcg_sockets_enabled_key);
4827
4828 void mem_cgroup_sk_alloc(struct sock *sk)
4829 {
4830         struct mem_cgroup *memcg;
4831
4832         if (!mem_cgroup_sockets_enabled)
4833                 return;
4834
4835         /* Do not associate the sock with unrelated interrupted task's memcg. */
4836         if (!in_task())
4837                 return;
4838
4839         rcu_read_lock();
4840         memcg = mem_cgroup_from_task(current);
4841         if (mem_cgroup_is_root(memcg))
4842                 goto out;
4843         if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !memcg1_tcpmem_active(memcg))
4844                 goto out;
4845         if (css_tryget(&memcg->css))
4846                 sk->sk_memcg = memcg;
4847 out:
4848         rcu_read_unlock();
4849 }
4850
4851 void mem_cgroup_sk_free(struct sock *sk)
4852 {
4853         if (sk->sk_memcg)
4854                 css_put(&sk->sk_memcg->css);
4855 }
4856
4857 /**
4858  * mem_cgroup_charge_skmem - charge socket memory
4859  * @memcg: memcg to charge
4860  * @nr_pages: number of pages to charge
4861  * @gfp_mask: reclaim mode
4862  *
4863  * Charges @nr_pages to @memcg. Returns %true if the charge fit within
4864  * @memcg's configured limit, %false if it doesn't.
4865  */
4866 bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages,
4867                              gfp_t gfp_mask)
4868 {
4869         if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
4870                 return memcg1_charge_skmem(memcg, nr_pages, gfp_mask);
4871
4872         if (try_charge(memcg, gfp_mask, nr_pages) == 0) {
4873                 mod_memcg_state(memcg, MEMCG_SOCK, nr_pages);
4874                 return true;
4875         }
4876
4877         return false;
4878 }
4879
4880 /**
4881  * mem_cgroup_uncharge_skmem - uncharge socket memory
4882  * @memcg: memcg to uncharge
4883  * @nr_pages: number of pages to uncharge
4884  */
4885 void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)
4886 {
4887         if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
4888                 memcg1_uncharge_skmem(memcg, nr_pages);
4889                 return;
4890         }
4891
4892         mod_memcg_state(memcg, MEMCG_SOCK, -nr_pages);
4893
4894         refill_stock(memcg, nr_pages);
4895 }
4896
4897 static int __init cgroup_memory(char *s)
4898 {
4899         char *token;
4900
4901         while ((token = strsep(&s, ",")) != NULL) {
4902                 if (!*token)
4903                         continue;
4904                 if (!strcmp(token, "nosocket"))
4905                         cgroup_memory_nosocket = true;
4906                 if (!strcmp(token, "nokmem"))
4907                         cgroup_memory_nokmem = true;
4908                 if (!strcmp(token, "nobpf"))
4909                         cgroup_memory_nobpf = true;
4910         }
4911         return 1;
4912 }
4913 __setup("cgroup.memory=", cgroup_memory);
4914
/*
 * subsys_initcall() for memory controller.
 *
 * Some parts like memcg_hotplug_cpu_dead() have to be initialized from this
 * context because of lock dependencies (cgroup_lock -> cpu hotplug) but
 * basically everything that doesn't depend on a specific mem_cgroup structure
 * should be initialized from here.
 *
 * Always returns 0.
 */
static int __init mem_cgroup_init(void)
{
        int cpu;

        /*
         * Currently s32 type (can refer to struct batched_lruvec_stat) is
         * used for per-memcg-per-cpu caching of per-node statistics. In order
         * to work fine, we should make sure that the overfill threshold can't
         * exceed S32_MAX / PAGE_SIZE.
         */
        BUILD_BUG_ON(MEMCG_CHARGE_BATCH > S32_MAX / PAGE_SIZE);

        /* Register memcg_hotplug_cpu_dead as the only hotplug callback. */
        cpuhp_setup_state_nocalls(CPUHP_MM_MEMCQ_DEAD, "mm/memctrl:dead", NULL,
                                  memcg_hotplug_cpu_dead);

        /* Give every possible CPU's memcg_stock a drain work item. */
        for_each_possible_cpu(cpu)
                INIT_WORK(&per_cpu_ptr(&memcg_stock, cpu)->work,
                          drain_local_stock);

        return 0;
}
subsys_initcall(mem_cgroup_init);
4945
4946 #ifdef CONFIG_SWAP
4947 static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg)
4948 {
4949         while (!refcount_inc_not_zero(&memcg->id.ref)) {
4950                 /*
4951                  * The root cgroup cannot be destroyed, so it's refcount must
4952                  * always be >= 1.
4953                  */
4954                 if (WARN_ON_ONCE(mem_cgroup_is_root(memcg))) {
4955                         VM_BUG_ON(1);
4956                         break;
4957                 }
4958                 memcg = parent_mem_cgroup(memcg);
4959                 if (!memcg)
4960                         memcg = root_mem_cgroup;
4961         }
4962         return memcg;
4963 }
4964
/**
 * mem_cgroup_swapout - transfer a memsw charge to swap
 * @folio: folio whose memsw charge to transfer
 * @entry: swap entry to move the charge to
 *
 * Transfer the memsw charge of @folio to @entry.
 *
 * Does nothing when the controller is disabled, when do_memsw_account() is
 * false, or when the folio has no memcg.
 */
void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry)
{
        struct mem_cgroup *memcg, *swap_memcg;
        unsigned int nr_entries;
        unsigned short oldid;

        VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
        VM_BUG_ON_FOLIO(folio_ref_count(folio), folio);

        if (mem_cgroup_disabled())
                return;

        if (!do_memsw_account())
                return;

        memcg = folio_memcg(folio);

        VM_WARN_ON_ONCE_FOLIO(!memcg, folio);
        if (!memcg)
                return;

        /*
         * In case the memcg owning these pages has been offlined and doesn't
         * have an ID allocated to it anymore, charge the closest online
         * ancestor for the swap instead and transfer the memory+swap charge.
         */
        swap_memcg = mem_cgroup_id_get_online(memcg);
        nr_entries = folio_nr_pages(folio);
        /* Get references for the tail pages, too */
        if (nr_entries > 1)
                mem_cgroup_id_get_many(swap_memcg, nr_entries - 1);
        /* Record the owner for the swap entries; none may be recorded yet. */
        oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg),
                                   nr_entries);
        VM_BUG_ON_FOLIO(oldid, folio);
        mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries);

        folio_unqueue_deferred_split(folio);
        folio->memcg_data = 0;

        /* The pages leave memory: drop the memory charge (never held by root). */
        if (!mem_cgroup_is_root(memcg))
                page_counter_uncharge(&memcg->memory, nr_entries);

        /*
         * The swap charge went to an ancestor: move the memsw charge from
         * the original owner to that ancestor as well.
         */
        if (memcg != swap_memcg) {
                if (!mem_cgroup_is_root(swap_memcg))
                        page_counter_charge(&swap_memcg->memsw, nr_entries);
                page_counter_uncharge(&memcg->memsw, nr_entries);
        }

        memcg1_swapout(folio, memcg);
        /*
         * memcg_data was cleared above; presumably this drops the css
         * reference the folio's charge was holding - confirm.
         */
        css_put(&memcg->css);
}
5023
/**
 * __mem_cgroup_try_charge_swap - try charging swap space for a folio
 * @folio: folio being added to swap
 * @entry: swap entry to charge
 *
 * Try to charge @folio's memcg for the swap space at @entry.
 *
 * Nothing is charged, and 0 is returned, when do_memsw_account() is true,
 * when the folio has no memcg, or when @entry is empty (the latter is
 * recorded as a MEMCG_SWAP_FAIL event).
 *
 * Returns 0 on success, -ENOMEM on failure.
 */
int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry)
{
        unsigned int nr_pages = folio_nr_pages(folio);
        struct page_counter *counter;
        struct mem_cgroup *memcg;
        unsigned short oldid;

        if (do_memsw_account())
                return 0;

        memcg = folio_memcg(folio);

        VM_WARN_ON_ONCE_FOLIO(!memcg, folio);
        if (!memcg)
                return 0;

        if (!entry.val) {
                memcg_memory_event(memcg, MEMCG_SWAP_FAIL);
                return 0;
        }

        /*
         * From here on memcg is the memcg an ID reference was taken on,
         * which may be an ancestor of the folio's memcg.
         */
        memcg = mem_cgroup_id_get_online(memcg);

        /* The root memcg's swap counter is never charged. */
        if (!mem_cgroup_is_root(memcg) &&
            !page_counter_try_charge(&memcg->swap, nr_pages, &counter)) {
                memcg_memory_event(memcg, MEMCG_SWAP_MAX);
                memcg_memory_event(memcg, MEMCG_SWAP_FAIL);
                /* Undo the ID reference taken above. */
                mem_cgroup_id_put(memcg);
                return -ENOMEM;
        }

        /* Get references for the tail pages, too */
        if (nr_pages > 1)
                mem_cgroup_id_get_many(memcg, nr_pages - 1);
        oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg), nr_pages);
        VM_BUG_ON_FOLIO(oldid, folio);
        mod_memcg_state(memcg, MEMCG_SWAP, nr_pages);

        return 0;
}
5073
5074 /**
5075  * __mem_cgroup_uncharge_swap - uncharge swap space
5076  * @entry: swap entry to uncharge
5077  * @nr_pages: the amount of swap space to uncharge
5078  */
5079 void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages)
5080 {
5081         struct mem_cgroup *memcg;
5082         unsigned short id;
5083
5084         id = swap_cgroup_record(entry, 0, nr_pages);
5085         rcu_read_lock();
5086         memcg = mem_cgroup_from_id(id);
5087         if (memcg) {
5088                 if (!mem_cgroup_is_root(memcg)) {
5089                         if (do_memsw_account())
5090                                 page_counter_uncharge(&memcg->memsw, nr_pages);
5091                         else
5092                                 page_counter_uncharge(&memcg->swap, nr_pages);
5093                 }
5094                 mod_memcg_state(memcg, MEMCG_SWAP, -nr_pages);
5095                 mem_cgroup_id_put_many(memcg, nr_pages);
5096         }
5097         rcu_read_unlock();
5098 }
5099
5100 long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg)
5101 {
5102         long nr_swap_pages = get_nr_swap_pages();
5103
5104         if (mem_cgroup_disabled() || do_memsw_account())
5105                 return nr_swap_pages;
5106         for (; !mem_cgroup_is_root(memcg); memcg = parent_mem_cgroup(memcg))
5107                 nr_swap_pages = min_t(long, nr_swap_pages,
5108                                       READ_ONCE(memcg->swap.max) -
5109                                       page_counter_read(&memcg->swap));
5110         return nr_swap_pages;
5111 }
5112
5113 bool mem_cgroup_swap_full(struct folio *folio)
5114 {
5115         struct mem_cgroup *memcg;
5116
5117         VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
5118
5119         if (vm_swap_full())
5120                 return true;
5121         if (do_memsw_account())
5122                 return false;
5123
5124         memcg = folio_memcg(folio);
5125         if (!memcg)
5126                 return false;
5127
5128         for (; !mem_cgroup_is_root(memcg); memcg = parent_mem_cgroup(memcg)) {
5129                 unsigned long usage = page_counter_read(&memcg->swap);
5130
5131                 if (usage * 2 >= READ_ONCE(memcg->swap.high) ||
5132                     usage * 2 >= READ_ONCE(memcg->swap.max))
5133                         return true;
5134         }
5135
5136         return false;
5137 }
5138
5139 static int __init setup_swap_account(char *s)
5140 {
5141         bool res;
5142
5143         if (!kstrtobool(s, &res) && !res)
5144                 pr_warn_once("The swapaccount=0 commandline option is deprecated "
5145                              "in favor of configuring swap control via cgroupfs. "
5146                              "Please report your usecase to linux-mm@kvack.org if you "
5147                              "depend on this functionality.\n");
5148         return 1;
5149 }
5150 __setup("swapaccount=", setup_swap_account);
5151
5152 static u64 swap_current_read(struct cgroup_subsys_state *css,
5153                              struct cftype *cft)
5154 {
5155         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5156
5157         return (u64)page_counter_read(&memcg->swap) * PAGE_SIZE;
5158 }
5159
5160 static int swap_peak_show(struct seq_file *sf, void *v)
5161 {
5162         struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf));
5163
5164         return peak_show(sf, v, &memcg->swap);
5165 }
5166
5167 static ssize_t swap_peak_write(struct kernfs_open_file *of, char *buf,
5168                                size_t nbytes, loff_t off)
5169 {
5170         struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
5171
5172         return peak_write(of, buf, nbytes, off, &memcg->swap,
5173                           &memcg->swap_peaks);
5174 }
5175
5176 static int swap_high_show(struct seq_file *m, void *v)
5177 {
5178         return seq_puts_memcg_tunable(m,
5179                 READ_ONCE(mem_cgroup_from_seq(m)->swap.high));
5180 }
5181
5182 static ssize_t swap_high_write(struct kernfs_open_file *of,
5183                                char *buf, size_t nbytes, loff_t off)
5184 {
5185         struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
5186         unsigned long high;
5187         int err;
5188
5189         buf = strstrip(buf);
5190         err = page_counter_memparse(buf, "max", &high);
5191         if (err)
5192                 return err;
5193
5194         page_counter_set_high(&memcg->swap, high);
5195
5196         return nbytes;
5197 }
5198
5199 static int swap_max_show(struct seq_file *m, void *v)
5200 {
5201         return seq_puts_memcg_tunable(m,
5202                 READ_ONCE(mem_cgroup_from_seq(m)->swap.max));
5203 }
5204
5205 static ssize_t swap_max_write(struct kernfs_open_file *of,
5206                               char *buf, size_t nbytes, loff_t off)
5207 {
5208         struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
5209         unsigned long max;
5210         int err;
5211
5212         buf = strstrip(buf);
5213         err = page_counter_memparse(buf, "max", &max);
5214         if (err)
5215                 return err;
5216
5217         xchg(&memcg->swap.max, max);
5218
5219         return nbytes;
5220 }
5221
5222 static int swap_events_show(struct seq_file *m, void *v)
5223 {
5224         struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
5225
5226         seq_printf(m, "high %lu\n",
5227                    atomic_long_read(&memcg->memory_events[MEMCG_SWAP_HIGH]));
5228         seq_printf(m, "max %lu\n",
5229                    atomic_long_read(&memcg->memory_events[MEMCG_SWAP_MAX]));
5230         seq_printf(m, "fail %lu\n",
5231                    atomic_long_read(&memcg->memory_events[MEMCG_SWAP_FAIL]));
5232
5233         return 0;
5234 }
5235
5236 static struct cftype swap_files[] = {
5237         {
5238                 .name = "swap.current",
5239                 .flags = CFTYPE_NOT_ON_ROOT,
5240                 .read_u64 = swap_current_read,
5241         },
5242         {
5243                 .name = "swap.high",
5244                 .flags = CFTYPE_NOT_ON_ROOT,
5245                 .seq_show = swap_high_show,
5246                 .write = swap_high_write,
5247         },
5248         {
5249                 .name = "swap.max",
5250                 .flags = CFTYPE_NOT_ON_ROOT,
5251                 .seq_show = swap_max_show,
5252                 .write = swap_max_write,
5253         },
5254         {
5255                 .name = "swap.peak",
5256                 .flags = CFTYPE_NOT_ON_ROOT,
5257                 .open = peak_open,
5258                 .release = peak_release,
5259                 .seq_show = swap_peak_show,
5260                 .write = swap_peak_write,
5261         },
5262         {
5263                 .name = "swap.events",
5264                 .flags = CFTYPE_NOT_ON_ROOT,
5265                 .file_offset = offsetof(struct mem_cgroup, swap_events_file),
5266                 .seq_show = swap_events_show,
5267         },
5268         { }     /* terminate */
5269 };
5270
5271 #ifdef CONFIG_ZSWAP
5272 /**
5273  * obj_cgroup_may_zswap - check if this cgroup can zswap
5274  * @objcg: the object cgroup
5275  *
5276  * Check if the hierarchical zswap limit has been reached.
5277  *
5278  * This doesn't check for specific headroom, and it is not atomic
5279  * either. But with zswap, the size of the allocation is only known
5280  * once compression has occurred, and this optimistic pre-check avoids
5281  * spending cycles on compression when there is already no room left
5282  * or zswap is disabled altogether somewhere in the hierarchy.
5283  */
5284 bool obj_cgroup_may_zswap(struct obj_cgroup *objcg)
5285 {
5286         struct mem_cgroup *memcg, *original_memcg;
5287         bool ret = true;
5288
5289         if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
5290                 return true;
5291
5292         original_memcg = get_mem_cgroup_from_objcg(objcg);
5293         for (memcg = original_memcg; !mem_cgroup_is_root(memcg);
5294              memcg = parent_mem_cgroup(memcg)) {
5295                 unsigned long max = READ_ONCE(memcg->zswap_max);
5296                 unsigned long pages;
5297
5298                 if (max == PAGE_COUNTER_MAX)
5299                         continue;
5300                 if (max == 0) {
5301                         ret = false;
5302                         break;
5303                 }
5304
5305                 /* Force flush to get accurate stats for charging */
5306                 __mem_cgroup_flush_stats(memcg, true);
5307                 pages = memcg_page_state(memcg, MEMCG_ZSWAP_B) / PAGE_SIZE;
5308                 if (pages < max)
5309                         continue;
5310                 ret = false;
5311                 break;
5312         }
5313         mem_cgroup_put(original_memcg);
5314         return ret;
5315 }
5316
5317 /**
5318  * obj_cgroup_charge_zswap - charge compression backend memory
5319  * @objcg: the object cgroup
5320  * @size: size of compressed object
5321  *
5322  * This forces the charge after obj_cgroup_may_zswap() allowed
5323  * compression and storage in zwap for this cgroup to go ahead.
5324  */
5325 void obj_cgroup_charge_zswap(struct obj_cgroup *objcg, size_t size)
5326 {
5327         struct mem_cgroup *memcg;
5328
5329         if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
5330                 return;
5331
5332         VM_WARN_ON_ONCE(!(current->flags & PF_MEMALLOC));
5333
5334         /* PF_MEMALLOC context, charging must succeed */
5335         if (obj_cgroup_charge(objcg, GFP_KERNEL, size))
5336                 VM_WARN_ON_ONCE(1);
5337
5338         rcu_read_lock();
5339         memcg = obj_cgroup_memcg(objcg);
5340         mod_memcg_state(memcg, MEMCG_ZSWAP_B, size);
5341         mod_memcg_state(memcg, MEMCG_ZSWAPPED, 1);
5342         rcu_read_unlock();
5343 }
5344
5345 /**
5346  * obj_cgroup_uncharge_zswap - uncharge compression backend memory
5347  * @objcg: the object cgroup
5348  * @size: size of compressed object
5349  *
5350  * Uncharges zswap memory on page in.
5351  */
5352 void obj_cgroup_uncharge_zswap(struct obj_cgroup *objcg, size_t size)
5353 {
5354         struct mem_cgroup *memcg;
5355
5356         if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
5357                 return;
5358
5359         obj_cgroup_uncharge(objcg, size);
5360
5361         rcu_read_lock();
5362         memcg = obj_cgroup_memcg(objcg);
5363         mod_memcg_state(memcg, MEMCG_ZSWAP_B, -size);
5364         mod_memcg_state(memcg, MEMCG_ZSWAPPED, -1);
5365         rcu_read_unlock();
5366 }
5367
5368 bool mem_cgroup_zswap_writeback_enabled(struct mem_cgroup *memcg)
5369 {
5370         /* if zswap is disabled, do not block pages going to the swapping device */
5371         if (!zswap_is_enabled())
5372                 return true;
5373
5374         for (; memcg; memcg = parent_mem_cgroup(memcg))
5375                 if (!READ_ONCE(memcg->zswap_writeback))
5376                         return false;
5377
5378         return true;
5379 }
5380
5381 static u64 zswap_current_read(struct cgroup_subsys_state *css,
5382                               struct cftype *cft)
5383 {
5384         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5385
5386         mem_cgroup_flush_stats(memcg);
5387         return memcg_page_state(memcg, MEMCG_ZSWAP_B);
5388 }
5389
5390 static int zswap_max_show(struct seq_file *m, void *v)
5391 {
5392         return seq_puts_memcg_tunable(m,
5393                 READ_ONCE(mem_cgroup_from_seq(m)->zswap_max));
5394 }
5395
5396 static ssize_t zswap_max_write(struct kernfs_open_file *of,
5397                                char *buf, size_t nbytes, loff_t off)
5398 {
5399         struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
5400         unsigned long max;
5401         int err;
5402
5403         buf = strstrip(buf);
5404         err = page_counter_memparse(buf, "max", &max);
5405         if (err)
5406                 return err;
5407
5408         xchg(&memcg->zswap_max, max);
5409
5410         return nbytes;
5411 }
5412
5413 static int zswap_writeback_show(struct seq_file *m, void *v)
5414 {
5415         struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
5416
5417         seq_printf(m, "%d\n", READ_ONCE(memcg->zswap_writeback));
5418         return 0;
5419 }
5420
5421 static ssize_t zswap_writeback_write(struct kernfs_open_file *of,
5422                                 char *buf, size_t nbytes, loff_t off)
5423 {
5424         struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
5425         int zswap_writeback;
5426         ssize_t parse_ret = kstrtoint(strstrip(buf), 0, &zswap_writeback);
5427
5428         if (parse_ret)
5429                 return parse_ret;
5430
5431         if (zswap_writeback != 0 && zswap_writeback != 1)
5432                 return -EINVAL;
5433
5434         WRITE_ONCE(memcg->zswap_writeback, zswap_writeback);
5435         return nbytes;
5436 }
5437
5438 static struct cftype zswap_files[] = {
5439         {
5440                 .name = "zswap.current",
5441                 .flags = CFTYPE_NOT_ON_ROOT,
5442                 .read_u64 = zswap_current_read,
5443         },
5444         {
5445                 .name = "zswap.max",
5446                 .flags = CFTYPE_NOT_ON_ROOT,
5447                 .seq_show = zswap_max_show,
5448                 .write = zswap_max_write,
5449         },
5450         {
5451                 .name = "zswap.writeback",
5452                 .seq_show = zswap_writeback_show,
5453                 .write = zswap_writeback_write,
5454         },
5455         { }     /* terminate */
5456 };
5457 #endif /* CONFIG_ZSWAP */
5458
5459 static int __init mem_cgroup_swap_init(void)
5460 {
5461         if (mem_cgroup_disabled())
5462                 return 0;
5463
5464         WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys, swap_files));
5465 #ifdef CONFIG_MEMCG_V1
5466         WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, memsw_files));
5467 #endif
5468 #ifdef CONFIG_ZSWAP
5469         WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys, zswap_files));
5470 #endif
5471         return 0;
5472 }
5473 subsys_initcall(mem_cgroup_swap_init);
5474
5475 #endif /* CONFIG_SWAP */