// SPDX-License-Identifier: GPL-2.0-or-later
/* memcontrol.c - Memory Controller
 *
 * Copyright IBM Corporation, 2007
 * Author Balbir Singh <balbir@linux.vnet.ibm.com>
 *
 * Copyright 2007 OpenVZ SWsoft Inc
 * Author: Pavel Emelianov <xemul@openvz.org>
 *
 * Copyright (C) 2009 Nokia Corporation
 * Author: Kirill A. Shutemov
 *
 * Kernel Memory Controller
 * Copyright (C) 2012 Parallels Inc. and Google Inc.
 * Authors: Glauber Costa and Suleiman Souhlal
 *
 * Charge lifetime sanitation
 * Lockless page tracking & accounting
 * Unified hierarchy configuration model
 * Copyright (C) 2015 Red Hat, Inc., Johannes Weiner
 *
 * Per memcg lru locking
 * Copyright (C) 2020 Alibaba, Inc, Alex Shi
 */
#include <linux/cgroup-defs.h>
#include <linux/page_counter.h>
#include <linux/memcontrol.h>
#include <linux/cgroup.h>
#include <linux/sched/mm.h>
#include <linux/shmem_fs.h>
#include <linux/hugetlb.h>
#include <linux/pagemap.h>
#include <linux/pagevec.h>
#include <linux/vm_event_item.h>
#include <linux/smp.h>
#include <linux/page-flags.h>
#include <linux/backing-dev.h>
#include <linux/bit_spinlock.h>
#include <linux/rcupdate.h>
#include <linux/limits.h>
#include <linux/export.h>
#include <linux/list.h>
#include <linux/mutex.h>
#include <linux/rbtree.h>
#include <linux/slab.h>
#include <linux/swapops.h>
#include <linux/spinlock.h>
#include <linux/seq_file.h>
#include <linux/parser.h>
#include <linux/vmpressure.h>
#include <linux/memremap.h>
#include <linux/mm_inline.h>
#include <linux/swap_cgroup.h>
#include <linux/cpu.h>
#include <linux/oom.h>
#include <linux/lockdep.h>
#include <linux/resume_user_mode.h>
#include <linux/psi.h>
#include <linux/seq_buf.h>
#include <linux/sched/isolation.h>
#include <linux/kmemleak.h>
#include "memcontrol-v1.h"

#include <linux/uaccess.h>

#define CREATE_TRACE_POINTS
#include <trace/events/memcg.h>
#undef CREATE_TRACE_POINTS

#include <trace/events/vmscan.h>
struct cgroup_subsys memory_cgrp_subsys __read_mostly;
EXPORT_SYMBOL(memory_cgrp_subsys);

struct mem_cgroup *root_mem_cgroup __read_mostly;

/* Active memory cgroup to use from an interrupt context */
DEFINE_PER_CPU(struct mem_cgroup *, int_active_memcg);
EXPORT_PER_CPU_SYMBOL_GPL(int_active_memcg);

/* Socket memory accounting disabled? */
static bool cgroup_memory_nosocket __ro_after_init;

/* Kernel memory accounting disabled? */
static bool cgroup_memory_nokmem __ro_after_init;

/* BPF memory accounting disabled? */
static bool cgroup_memory_nobpf __ro_after_init;

#ifdef CONFIG_CGROUP_WRITEBACK
static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq);
#endif
static inline bool task_is_dying(void)
{
	return tsk_is_oom_victim(current) || fatal_signal_pending(current) ||
		(current->flags & PF_EXITING);
}
/* Some nice accessors for the vmpressure. */
struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
{
	if (!memcg)
		memcg = root_mem_cgroup;
	return &memcg->vmpressure;
}

struct mem_cgroup *vmpressure_to_memcg(struct vmpressure *vmpr)
{
	return container_of(vmpr, struct mem_cgroup, vmpressure);
}
#define SEQ_BUF_SIZE SZ_4K

#define CURRENT_OBJCG_UPDATE_BIT 0
#define CURRENT_OBJCG_UPDATE_FLAG (1UL << CURRENT_OBJCG_UPDATE_BIT)

static DEFINE_SPINLOCK(objcg_lock);

bool mem_cgroup_kmem_disabled(void)
{
	return cgroup_memory_nokmem;
}
static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg,
				      unsigned int nr_pages);

static void obj_cgroup_release(struct percpu_ref *ref)
{
	struct obj_cgroup *objcg = container_of(ref, struct obj_cgroup, refcnt);
	unsigned int nr_bytes;
	unsigned int nr_pages;
	unsigned long flags;

	/*
	 * At this point all allocated objects are freed, and
	 * objcg->nr_charged_bytes can't have an arbitrary byte value.
	 * However, it can be PAGE_SIZE or (x * PAGE_SIZE).
	 *
	 * The following sequence can lead to it:
	 * 1) CPU0: objcg == stock->cached_objcg
	 * 2) CPU1: we do a small allocation (e.g. 92 bytes),
	 *          PAGE_SIZE bytes are charged
	 * 3) CPU1: a process from another memcg is allocating something,
	 *          the stock is flushed,
	 *          objcg->nr_charged_bytes = PAGE_SIZE - 92
	 * 4) CPU0: we release this object,
	 *          92 bytes are added to stock->nr_bytes
	 * 5) CPU0: stock is flushed,
	 *          92 bytes are added to objcg->nr_charged_bytes
	 *
	 * As a result, nr_charged_bytes == PAGE_SIZE.
	 * This page will be uncharged in obj_cgroup_release().
	 */
	nr_bytes = atomic_read(&objcg->nr_charged_bytes);
	WARN_ON_ONCE(nr_bytes & (PAGE_SIZE - 1));
	nr_pages = nr_bytes >> PAGE_SHIFT;

	if (nr_pages)
		obj_cgroup_uncharge_pages(objcg, nr_pages);

	spin_lock_irqsave(&objcg_lock, flags);
	list_del(&objcg->list);
	spin_unlock_irqrestore(&objcg_lock, flags);

	percpu_ref_exit(ref);
	kfree_rcu(objcg, rcu);
}
static struct obj_cgroup *obj_cgroup_alloc(void)
{
	struct obj_cgroup *objcg;
	int ret;

	objcg = kzalloc(sizeof(struct obj_cgroup), GFP_KERNEL);
	if (!objcg)
		return NULL;

	ret = percpu_ref_init(&objcg->refcnt, obj_cgroup_release, 0,
			      GFP_KERNEL);
	if (ret) {
		kfree(objcg);
		return NULL;
	}
	INIT_LIST_HEAD(&objcg->list);
	return objcg;
}
static void memcg_reparent_objcgs(struct mem_cgroup *memcg,
				  struct mem_cgroup *parent)
{
	struct obj_cgroup *objcg, *iter;

	objcg = rcu_replace_pointer(memcg->objcg, NULL, true);

	spin_lock_irq(&objcg_lock);

	/* 1) Ready to reparent active objcg. */
	list_add(&objcg->list, &memcg->objcg_list);
	/* 2) Reparent active objcg and already reparented objcgs to parent. */
	list_for_each_entry(iter, &memcg->objcg_list, list)
		WRITE_ONCE(iter->memcg, parent);
	/* 3) Move already reparented objcgs to the parent's list */
	list_splice(&memcg->objcg_list, &parent->objcg_list);

	spin_unlock_irq(&objcg_lock);

	percpu_ref_kill(&objcg->refcnt);
}
/*
 * A lot of the calls to the cache allocation functions are expected to be
 * inlined by the compiler. Since the calls to memcg_slab_post_alloc_hook() are
 * conditional to this static branch, we'll have to allow modules that do
 * kmem_cache_alloc and the like to see this symbol as well.
 */
DEFINE_STATIC_KEY_FALSE(memcg_kmem_online_key);
EXPORT_SYMBOL(memcg_kmem_online_key);

DEFINE_STATIC_KEY_FALSE(memcg_bpf_enabled_key);
EXPORT_SYMBOL(memcg_bpf_enabled_key);
/**
 * mem_cgroup_css_from_folio - css of the memcg associated with a folio
 * @folio: folio of interest
 *
 * If memcg is bound to the default hierarchy, css of the memcg associated
 * with @folio is returned. The returned css remains associated with @folio
 * until it is released.
 *
 * If memcg is bound to a traditional hierarchy, the css of root_mem_cgroup
 * is returned.
 */
struct cgroup_subsys_state *mem_cgroup_css_from_folio(struct folio *folio)
{
	struct mem_cgroup *memcg = folio_memcg(folio);

	if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
		memcg = root_mem_cgroup;

	return &memcg->css;
}
/**
 * page_cgroup_ino - return inode number of the memcg a page is charged to
 * @page: the page
 *
 * Look up the closest online ancestor of the memory cgroup @page is charged to
 * and return its inode number or 0 if @page is not charged to any cgroup. It
 * is safe to call this function without holding a reference to @page.
 *
 * Note, this function is inherently racy, because there is nothing to prevent
 * the cgroup inode from getting torn down and potentially reallocated a moment
 * after page_cgroup_ino() returns, so it only should be used by callers that
 * do not care (such as procfs interfaces).
 */
ino_t page_cgroup_ino(struct page *page)
{
	struct mem_cgroup *memcg;
	unsigned long ino = 0;

	rcu_read_lock();
	/* page_folio() is racy here, but the entire function is racy anyway */
	memcg = folio_memcg_check(page_folio(page));

	while (memcg && !(memcg->css.flags & CSS_ONLINE))
		memcg = parent_mem_cgroup(memcg);
	if (memcg)
		ino = cgroup_ino(memcg->css.cgroup);
	rcu_read_unlock();
	return ino;
}
/* Subset of node_stat_item for memcg stats */
static const unsigned int memcg_node_stat_items[] = {
	NR_INACTIVE_ANON,
	NR_ACTIVE_ANON,
	NR_INACTIVE_FILE,
	NR_ACTIVE_FILE,
	NR_UNEVICTABLE,
	NR_SLAB_RECLAIMABLE_B,
	NR_SLAB_UNRECLAIMABLE_B,
	WORKINGSET_REFAULT_ANON,
	WORKINGSET_REFAULT_FILE,
	WORKINGSET_ACTIVATE_ANON,
	WORKINGSET_ACTIVATE_FILE,
	WORKINGSET_RESTORE_ANON,
	WORKINGSET_RESTORE_FILE,
	WORKINGSET_NODERECLAIM,
	NR_ANON_MAPPED,
	NR_FILE_MAPPED,
	NR_FILE_PAGES,
	NR_FILE_DIRTY,
	NR_WRITEBACK,
	NR_SHMEM,
	NR_SHMEM_THPS,
	NR_FILE_THPS,
	NR_ANON_THPS,
	NR_KERNEL_STACK_KB,
	NR_PAGETABLE,
	NR_SECONDARY_PAGETABLE,
	NR_SWAPCACHE,
#ifdef CONFIG_NUMA_BALANCING
	PGPROMOTE_SUCCESS,
#endif
	PGDEMOTE_KSWAPD,
	PGDEMOTE_DIRECT,
	PGDEMOTE_KHUGEPAGED,
#ifdef CONFIG_HUGETLB_PAGE
	NR_HUGETLB,
#endif
};
static const unsigned int memcg_stat_items[] = {
	MEMCG_SWAP,
	MEMCG_SOCK,
	MEMCG_PERCPU_B,
	MEMCG_VMALLOC,
	MEMCG_KMEM,
	MEMCG_ZSWAP_B,
	MEMCG_ZSWAPPED,
};

#define NR_MEMCG_NODE_STAT_ITEMS ARRAY_SIZE(memcg_node_stat_items)
#define MEMCG_VMSTAT_SIZE (NR_MEMCG_NODE_STAT_ITEMS + \
			   ARRAY_SIZE(memcg_stat_items))
#define BAD_STAT_IDX(index) ((u32)(index) >= U8_MAX)
static u8 mem_cgroup_stats_index[MEMCG_NR_STAT] __read_mostly;
static void init_memcg_stats(void)
{
	u8 i, j = 0;

	BUILD_BUG_ON(MEMCG_NR_STAT >= U8_MAX);

	memset(mem_cgroup_stats_index, U8_MAX, sizeof(mem_cgroup_stats_index));

	for (i = 0; i < NR_MEMCG_NODE_STAT_ITEMS; ++i, ++j)
		mem_cgroup_stats_index[memcg_node_stat_items[i]] = j;

	for (i = 0; i < ARRAY_SIZE(memcg_stat_items); ++i, ++j)
		mem_cgroup_stats_index[memcg_stat_items[i]] = j;
}
static inline int memcg_stats_index(int idx)
{
	return mem_cgroup_stats_index[idx];
}
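/*
 * The tables above remap the sparse global stat item ids onto dense per-memcg
 * array slots: mem_cgroup_stats_index[NR_SLAB_RECLAIMABLE_B], for instance,
 * holds the small slot used by the state[] arrays below, while items that are
 * not tracked per-memcg keep the U8_MAX filler, which BAD_STAT_IDX() then
 * catches in the accessors.
 */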
struct lruvec_stats_percpu {
	/* Local (CPU and cgroup) state */
	long state[NR_MEMCG_NODE_STAT_ITEMS];

	/* Delta calculation for lockless upward propagation */
	long state_prev[NR_MEMCG_NODE_STAT_ITEMS];
};

struct lruvec_stats {
	/* Aggregated (CPU and subtree) state */
	long state[NR_MEMCG_NODE_STAT_ITEMS];

	/* Non-hierarchical (CPU aggregated) state */
	long state_local[NR_MEMCG_NODE_STAT_ITEMS];

	/* Pending child counts during tree propagation */
	long state_pending[NR_MEMCG_NODE_STAT_ITEMS];
};
unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx)
{
	struct mem_cgroup_per_node *pn;
	long x;
	int i;

	if (mem_cgroup_disabled())
		return node_page_state(lruvec_pgdat(lruvec), idx);

	i = memcg_stats_index(idx);
	if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx))
		return 0;

	pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
	x = READ_ONCE(pn->lruvec_stats->state[i]);
	return x;
}
unsigned long lruvec_page_state_local(struct lruvec *lruvec,
				      enum node_stat_item idx)
{
	struct mem_cgroup_per_node *pn;
	long x;
	int i;

	if (mem_cgroup_disabled())
		return node_page_state(lruvec_pgdat(lruvec), idx);

	i = memcg_stats_index(idx);
	if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx))
		return 0;

	pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
	x = READ_ONCE(pn->lruvec_stats->state_local[i]);
	return x;
}
/* Subset of vm_event_item to report for memcg event stats */
static const unsigned int memcg_vm_event_stat[] = {
#ifdef CONFIG_MEMCG_V1
	PGPGIN,
	PGPGOUT,
#endif
	PGSCAN_KSWAPD,
	PGSCAN_DIRECT,
	PGSCAN_KHUGEPAGED,
	PGSTEAL_KSWAPD,
	PGSTEAL_DIRECT,
	PGSTEAL_KHUGEPAGED,
	PGFAULT,
	PGMAJFAULT,
	PGREFILL,
	PGACTIVATE,
	PGDEACTIVATE,
	PGLAZYFREE,
	PGLAZYFREED,
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	THP_FAULT_ALLOC,
	THP_COLLAPSE_ALLOC,
#endif
#ifdef CONFIG_NUMA_BALANCING
	NUMA_PAGE_MIGRATE,
	NUMA_PTE_UPDATES,
	NUMA_HINT_FAULTS,
#endif
};
#define NR_MEMCG_EVENTS ARRAY_SIZE(memcg_vm_event_stat)
static u8 mem_cgroup_events_index[NR_VM_EVENT_ITEMS] __read_mostly;
static void init_memcg_events(void)
{
	u8 i;

	BUILD_BUG_ON(NR_VM_EVENT_ITEMS >= U8_MAX);

	memset(mem_cgroup_events_index, U8_MAX,
	       sizeof(mem_cgroup_events_index));

	for (i = 0; i < NR_MEMCG_EVENTS; ++i)
		mem_cgroup_events_index[memcg_vm_event_stat[i]] = i;
}
static inline int memcg_events_index(enum vm_event_item idx)
{
	return mem_cgroup_events_index[idx];
}
struct memcg_vmstats_percpu {
	/* Stats updates since the last flush */
	unsigned int stats_updates;

	/* Cached pointers for fast iteration in memcg_rstat_updated() */
	struct memcg_vmstats_percpu *parent;
	struct memcg_vmstats *vmstats;

	/* The above should fit a single cacheline for memcg_rstat_updated() */

	/* Local (CPU and cgroup) page state & events */
	long state[MEMCG_VMSTAT_SIZE];
	unsigned long events[NR_MEMCG_EVENTS];

	/* Delta calculation for lockless upward propagation */
	long state_prev[MEMCG_VMSTAT_SIZE];
	unsigned long events_prev[NR_MEMCG_EVENTS];
} ____cacheline_aligned;
struct memcg_vmstats {
	/* Aggregated (CPU and subtree) page state & events */
	long state[MEMCG_VMSTAT_SIZE];
	unsigned long events[NR_MEMCG_EVENTS];

	/* Non-hierarchical (CPU aggregated) page state & events */
	long state_local[MEMCG_VMSTAT_SIZE];
	unsigned long events_local[NR_MEMCG_EVENTS];

	/* Pending child counts during tree propagation */
	long state_pending[MEMCG_VMSTAT_SIZE];
	unsigned long events_pending[NR_MEMCG_EVENTS];

	/* Stats updates since the last flush */
	atomic64_t stats_updates;
};
/*
 * memcg and lruvec stats flushing
 *
 * Many codepaths leading to stats update or read are performance sensitive and
 * adding stats flushing in such codepaths is not desirable. So, to optimize the
 * flushing the kernel does:
 *
 * 1) Periodically and asynchronously flush the stats every 2 seconds to not let
 *    the rstat update tree grow unbounded.
 *
 * 2) Flush the stats synchronously on the reader side only when there are more
 *    than (MEMCG_CHARGE_BATCH * nr_cpus) update events. This optimization can
 *    let the stats be out of sync by at most (MEMCG_CHARGE_BATCH * nr_cpus),
 *    but only for 2 seconds due to (1).
 */
static void flush_memcg_stats_dwork(struct work_struct *w);
static DECLARE_DEFERRABLE_WORK(stats_flush_dwork, flush_memcg_stats_dwork);
static u64 flush_last_time;

#define FLUSH_TIME (2UL*HZ)
/*
 * Accessors to ensure that preemption is disabled on PREEMPT_RT because the
 * callers can not rely on an acquired spinlock_t to do that for them. These
 * functions are never used in hardirq context on PREEMPT_RT and therefore
 * disabling preemption is sufficient.
 */
static void memcg_stats_lock(void)
{
	preempt_disable_nested();
	VM_WARN_ON_IRQS_ENABLED();
}

static void __memcg_stats_lock(void)
{
	preempt_disable_nested();
}

static void memcg_stats_unlock(void)
{
	preempt_enable_nested();
}
static bool memcg_vmstats_needs_flush(struct memcg_vmstats *vmstats)
{
	return atomic64_read(&vmstats->stats_updates) >
		MEMCG_CHARGE_BATCH * num_online_cpus();
}
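/*
 * For a rough sense of scale (assuming MEMCG_CHARGE_BATCH is 64): on a 64-CPU
 * machine a subtree becomes flush-worthy once it has accumulated more than
 * 64 * 64 = 4096 pending page-sized updates, and even then the staleness is
 * bounded in time by the 2 second periodic flush above.
 */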
static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val)
{
	struct memcg_vmstats_percpu *statc;
	int cpu = smp_processor_id();
	unsigned int stats_updates;

	if (!val)
		return;

	cgroup_rstat_updated(memcg->css.cgroup, cpu);
	statc = this_cpu_ptr(memcg->vmstats_percpu);
	for (; statc; statc = statc->parent) {
		stats_updates = READ_ONCE(statc->stats_updates) + abs(val);
		WRITE_ONCE(statc->stats_updates, stats_updates);
		if (stats_updates < MEMCG_CHARGE_BATCH)
			continue;

		/*
		 * If @memcg is already flushable, increasing stats_updates is
		 * redundant. Avoid the overhead of the atomic update.
		 */
		if (!memcg_vmstats_needs_flush(statc->vmstats))
			atomic64_add(stats_updates,
				     &statc->vmstats->stats_updates);
		WRITE_ONCE(statc->stats_updates, 0);
	}
}
static void __mem_cgroup_flush_stats(struct mem_cgroup *memcg, bool force)
{
	bool needs_flush = memcg_vmstats_needs_flush(memcg->vmstats);

	trace_memcg_flush_stats(memcg, atomic64_read(&memcg->vmstats->stats_updates),
		force, needs_flush);

	if (!force && !needs_flush)
		return;

	if (mem_cgroup_is_root(memcg))
		WRITE_ONCE(flush_last_time, jiffies_64);

	cgroup_rstat_flush(memcg->css.cgroup);
}
/**
 * mem_cgroup_flush_stats - flush the stats of a memory cgroup subtree
 * @memcg: root of the subtree to flush
 *
 * Flushing is serialized by the underlying global rstat lock. There is also a
 * minimum amount of work to be done even if there are no stat updates to flush.
 * Hence, we only flush the stats if the updates delta exceeds a threshold. This
 * avoids unnecessary work and contention on the underlying lock.
 */
void mem_cgroup_flush_stats(struct mem_cgroup *memcg)
{
	if (mem_cgroup_disabled())
		return;

	if (!memcg)
		memcg = root_mem_cgroup;

	__mem_cgroup_flush_stats(memcg, false);
}
void mem_cgroup_flush_stats_ratelimited(struct mem_cgroup *memcg)
{
	/* Only flush if the periodic flusher is one full cycle late */
	if (time_after64(jiffies_64, READ_ONCE(flush_last_time) + 2*FLUSH_TIME))
		mem_cgroup_flush_stats(memcg);
}
static void flush_memcg_stats_dwork(struct work_struct *w)
{
	/*
	 * Deliberately ignore memcg_vmstats_needs_flush() here so that flushing
	 * in latency-sensitive paths is as cheap as possible.
	 */
	__mem_cgroup_flush_stats(root_mem_cgroup, true);
	queue_delayed_work(system_unbound_wq, &stats_flush_dwork, FLUSH_TIME);
}
unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx)
{
	long x;
	int i = memcg_stats_index(idx);

	if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx))
		return 0;

	x = READ_ONCE(memcg->vmstats->state[i]);
	return x;
}
static int memcg_page_state_unit(int item);

/*
 * Normalize the value passed into memcg_rstat_updated() to be in pages. Round
 * up non-zero sub-page updates to 1 page as zero page updates are ignored.
 */
static int memcg_state_val_in_pages(int idx, int val)
{
	int unit = memcg_page_state_unit(idx);

	if (!val || unit == PAGE_SIZE)
		return val;
	else
		return max(val * unit / PAGE_SIZE, 1UL);
}
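/*
 * For illustration, with 4K pages: a slab update of +8192 bytes (unit 1)
 * is counted as 2 pages, a +100 byte update rounds up to 1 page, and a
 * page-sized item is passed through unchanged.
 */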
/**
 * __mod_memcg_state - update cgroup memory statistics
 * @memcg: the memory cgroup
 * @idx: the stat item - can be enum memcg_stat_item or enum node_stat_item
 * @val: delta to add to the counter, can be negative
 */
void __mod_memcg_state(struct mem_cgroup *memcg, enum memcg_stat_item idx,
		       int val)
{
	int i = memcg_stats_index(idx);

	if (mem_cgroup_disabled())
		return;

	if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx))
		return;

	__this_cpu_add(memcg->vmstats_percpu->state[i], val);
	val = memcg_state_val_in_pages(idx, val);
	memcg_rstat_updated(memcg, val);
	trace_mod_memcg_state(memcg, idx, val);
}
/* idx can be of type enum memcg_stat_item or node_stat_item. */
unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx)
{
	long x;
	int i = memcg_stats_index(idx);

	if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx))
		return 0;

	x = READ_ONCE(memcg->vmstats->state_local[i]);
	return x;
}
static void __mod_memcg_lruvec_state(struct lruvec *lruvec,
				     enum node_stat_item idx,
				     int val)
{
	struct mem_cgroup_per_node *pn;
	struct mem_cgroup *memcg;
	int i = memcg_stats_index(idx);

	if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx))
		return;

	pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
	memcg = pn->memcg;

	/*
	 * The callers from rmap rely on disabled preemption because they never
	 * update their counter from in-interrupt context. For these two
	 * counters we check that the update is never performed from an
	 * interrupt context while other callers need to have interrupts
	 * disabled.
	 */
	__memcg_stats_lock();
	if (IS_ENABLED(CONFIG_DEBUG_VM)) {
		switch (idx) {
		case NR_ANON_MAPPED:
		case NR_FILE_MAPPED:
			WARN_ON_ONCE(!in_task());
			break;
		default:
			VM_WARN_ON_IRQS_ENABLED();
		}
	}

	/* Update memcg */
	__this_cpu_add(memcg->vmstats_percpu->state[i], val);

	/* Update lruvec */
	__this_cpu_add(pn->lruvec_stats_percpu->state[i], val);

	val = memcg_state_val_in_pages(idx, val);
	memcg_rstat_updated(memcg, val);
	trace_mod_memcg_lruvec_state(memcg, idx, val);
	memcg_stats_unlock();
}
/**
 * __mod_lruvec_state - update lruvec memory statistics
 * @lruvec: the lruvec
 * @idx: the stat item
 * @val: delta to add to the counter, can be negative
 *
 * The lruvec is the intersection of the NUMA node and a cgroup. This
 * function updates all three counters that are affected by a
 * change of state at this level: per-node, per-cgroup, per-lruvec.
 */
void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
			int val)
{
	/* Update node */
	__mod_node_page_state(lruvec_pgdat(lruvec), idx, val);

	/* Update memcg and lruvec */
	if (!mem_cgroup_disabled())
		__mod_memcg_lruvec_state(lruvec, idx, val);
}
void __lruvec_stat_mod_folio(struct folio *folio, enum node_stat_item idx,
			     int val)
{
	struct mem_cgroup *memcg;
	pg_data_t *pgdat = folio_pgdat(folio);
	struct lruvec *lruvec;

	rcu_read_lock();
	memcg = folio_memcg(folio);
	/* Untracked pages have no memcg, no lruvec. Update only the node */
	if (!memcg) {
		rcu_read_unlock();
		__mod_node_page_state(pgdat, idx, val);
		return;
	}

	lruvec = mem_cgroup_lruvec(memcg, pgdat);
	__mod_lruvec_state(lruvec, idx, val);
	rcu_read_unlock();
}
EXPORT_SYMBOL(__lruvec_stat_mod_folio);
void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val)
{
	pg_data_t *pgdat = page_pgdat(virt_to_page(p));
	struct mem_cgroup *memcg;
	struct lruvec *lruvec;

	rcu_read_lock();
	memcg = mem_cgroup_from_slab_obj(p);

	/*
	 * Untracked pages have no memcg, no lruvec. Update only the
	 * node. If we reparent the slab objects to the root memcg,
	 * when we free the slab object, we need to update the per-memcg
	 * vmstats to keep it correct for the root memcg.
	 */
	if (!memcg) {
		__mod_node_page_state(pgdat, idx, val);
	} else {
		lruvec = mem_cgroup_lruvec(memcg, pgdat);
		__mod_lruvec_state(lruvec, idx, val);
	}
	rcu_read_unlock();
}
/**
 * __count_memcg_events - account VM events in a cgroup
 * @memcg: the memory cgroup
 * @idx: the event item
 * @count: the number of events that occurred
 */
void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
			  unsigned long count)
{
	int i = memcg_events_index(idx);

	if (mem_cgroup_disabled())
		return;

	if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx))
		return;

	memcg_stats_lock();
	__this_cpu_add(memcg->vmstats_percpu->events[i], count);
	memcg_rstat_updated(memcg, count);
	trace_count_memcg_events(memcg, idx, count);
	memcg_stats_unlock();
}
unsigned long memcg_events(struct mem_cgroup *memcg, int event)
{
	int i = memcg_events_index(event);

	if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, event))
		return 0;

	return READ_ONCE(memcg->vmstats->events[i]);
}
unsigned long memcg_events_local(struct mem_cgroup *memcg, int event)
{
	int i = memcg_events_index(event);

	if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, event))
		return 0;

	return READ_ONCE(memcg->vmstats->events_local[i]);
}
struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
{
	/*
	 * mm_update_next_owner() may clear mm->owner to NULL
	 * if it races with swapoff, page migration, etc.
	 * So this can be called with p == NULL.
	 */
	if (unlikely(!p))
		return NULL;

	return mem_cgroup_from_css(task_css(p, memory_cgrp_id));
}
EXPORT_SYMBOL(mem_cgroup_from_task);
static __always_inline struct mem_cgroup *active_memcg(void)
{
	if (!in_task())
		return this_cpu_read(int_active_memcg);
	else
		return current->active_memcg;
}
/**
 * get_mem_cgroup_from_mm: Obtain a reference on given mm_struct's memcg.
 * @mm: mm from which memcg should be extracted. It can be NULL.
 *
 * Obtain a reference on mm->memcg and return it if successful. If mm
 * is NULL, then the memcg is chosen as follows:
 * 1) The active memcg, if set.
 * 2) current->mm->memcg, if available
 * 3) root memcg
 * If mem_cgroup is disabled, NULL is returned.
 */
struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
{
	struct mem_cgroup *memcg;

	if (mem_cgroup_disabled())
		return NULL;

	/*
	 * Page cache insertions can happen without an
	 * actual mm context, e.g. during disk probing
	 * on boot, loopback IO, acct() writes etc.
	 *
	 * No need to css_get on root memcg as the reference
	 * counting is disabled on the root level in the
	 * cgroup core. See CSS_NO_REF.
	 */
	if (unlikely(!mm)) {
		memcg = active_memcg();
		if (unlikely(memcg)) {
			/* remote memcg must hold a ref */
			css_get(&memcg->css);
			return memcg;
		}
		mm = current->mm;
		if (unlikely(!mm))
			return root_mem_cgroup;
	}

	rcu_read_lock();
	do {
		memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
		if (unlikely(!memcg))
			memcg = root_mem_cgroup;
	} while (!css_tryget(&memcg->css));
	rcu_read_unlock();
	return memcg;
}
EXPORT_SYMBOL(get_mem_cgroup_from_mm);
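/*
 * A typical caller pairs the lookup with a css_put() once the charge (or
 * other use) is done, roughly:
 *
 *	memcg = get_mem_cgroup_from_mm(current->mm);
 *	... charge against or inspect memcg ...
 *	css_put(&memcg->css);
 */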
/**
 * get_mem_cgroup_from_current - Obtain a reference on current task's memcg.
 */
struct mem_cgroup *get_mem_cgroup_from_current(void)
{
	struct mem_cgroup *memcg;

	if (mem_cgroup_disabled())
		return NULL;

again:
	rcu_read_lock();
	memcg = mem_cgroup_from_task(current);
	if (!css_tryget(&memcg->css)) {
		rcu_read_unlock();
		goto again;
	}
	rcu_read_unlock();
	return memcg;
}
/**
 * get_mem_cgroup_from_folio - Obtain a reference on a given folio's memcg.
 * @folio: folio from which memcg should be extracted.
 */
struct mem_cgroup *get_mem_cgroup_from_folio(struct folio *folio)
{
	struct mem_cgroup *memcg = folio_memcg(folio);

	if (mem_cgroup_disabled())
		return NULL;

	rcu_read_lock();
	if (!memcg || WARN_ON_ONCE(!css_tryget(&memcg->css)))
		memcg = root_mem_cgroup;
	rcu_read_unlock();
	return memcg;
}
/**
 * mem_cgroup_iter - iterate over memory cgroup hierarchy
 * @root: hierarchy root
 * @prev: previously returned memcg, NULL on first invocation
 * @reclaim: cookie for shared reclaim walks, NULL for full walks
 *
 * Returns references to children of the hierarchy below @root, or
 * @root itself, or %NULL after a full round-trip.
 *
 * Caller must pass the return value in @prev on subsequent
 * invocations for reference counting, or use mem_cgroup_iter_break()
 * to cancel a hierarchy walk before the round-trip is complete.
 *
 * Reclaimers can specify a node in @reclaim to divide up the memcgs
 * in the hierarchy among all concurrent reclaimers operating on the
 * same node.
 */
struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
				   struct mem_cgroup *prev,
				   struct mem_cgroup_reclaim_cookie *reclaim)
{
	struct mem_cgroup_reclaim_iter *iter;
	struct cgroup_subsys_state *css;
	struct mem_cgroup *pos;
	struct mem_cgroup *next;

	if (mem_cgroup_disabled())
		return NULL;

	if (!root)
		root = root_mem_cgroup;

	rcu_read_lock();
restart:
	next = NULL;

	if (reclaim) {
		int gen;
		int nid = reclaim->pgdat->node_id;

		iter = &root->nodeinfo[nid]->iter;
		gen = atomic_read(&iter->generation);

		/*
		 * On start, join the current reclaim iteration cycle.
		 * Exit when a concurrent walker completes it.
		 */
		if (!prev)
			reclaim->generation = gen;
		else if (reclaim->generation != gen)
			goto out_unlock;

		pos = READ_ONCE(iter->position);
	} else
		pos = prev;

	css = pos ? &pos->css : NULL;

	while ((css = css_next_descendant_pre(css, &root->css))) {
		/*
		 * Verify the css and acquire a reference. The root
		 * is provided by the caller, so we know it's alive
		 * and kicking, and don't take an extra reference.
		 */
		if (css == &root->css || css_tryget(css))
			break;
	}

	next = mem_cgroup_from_css(css);

	if (reclaim) {
		/*
		 * The position could have already been updated by a competing
		 * thread, so check that the value hasn't changed since we read
		 * it to avoid reclaiming from the same cgroup twice.
		 */
		if (cmpxchg(&iter->position, pos, next) != pos) {
			if (css && css != &root->css)
				css_put(css);
			goto restart;
		}

		if (!next) {
			atomic_inc(&iter->generation);

			/*
			 * Reclaimers share the hierarchy walk, and a
			 * new one might jump in right at the end of
			 * the hierarchy - make sure they see at least
			 * one group and restart from the beginning.
			 */
			if (!prev)
				goto restart;
		}
	}

out_unlock:
	rcu_read_unlock();
	if (prev && prev != root)
		css_put(&prev->css);

	return next;
}
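/*
 * A full (non-reclaim) walk over a subtree therefore looks roughly like:
 *
 *	struct mem_cgroup *iter = mem_cgroup_iter(root, NULL, NULL);
 *
 *	while (iter) {
 *		... use iter ...
 *		iter = mem_cgroup_iter(root, iter, NULL);
 *	}
 *
 * with mem_cgroup_iter_break(root, iter) used to drop the reference when
 * bailing out early.
 */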
/**
 * mem_cgroup_iter_break - abort a hierarchy walk prematurely
 * @root: hierarchy root
 * @prev: last visited hierarchy member as returned by mem_cgroup_iter()
 */
void mem_cgroup_iter_break(struct mem_cgroup *root,
			   struct mem_cgroup *prev)
{
	if (!root)
		root = root_mem_cgroup;
	if (prev && prev != root)
		css_put(&prev->css);
}
static void __invalidate_reclaim_iterators(struct mem_cgroup *from,
					   struct mem_cgroup *dead_memcg)
{
	struct mem_cgroup_reclaim_iter *iter;
	struct mem_cgroup_per_node *mz;
	int nid;

	for_each_node(nid) {
		mz = from->nodeinfo[nid];
		iter = &mz->iter;
		cmpxchg(&iter->position, dead_memcg, NULL);
	}
}
static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
{
	struct mem_cgroup *memcg = dead_memcg;
	struct mem_cgroup *last;

	do {
		__invalidate_reclaim_iterators(memcg, dead_memcg);
		last = memcg;
	} while ((memcg = parent_mem_cgroup(memcg)));

	/*
	 * When cgroup1 non-hierarchy mode is used,
	 * parent_mem_cgroup() does not walk all the way up to the
	 * cgroup root (root_mem_cgroup). So we have to handle
	 * dead_memcg from cgroup root separately.
	 */
	if (!mem_cgroup_is_root(last))
		__invalidate_reclaim_iterators(root_mem_cgroup,
					       dead_memcg);
}
/**
 * mem_cgroup_scan_tasks - iterate over tasks of a memory cgroup hierarchy
 * @memcg: hierarchy root
 * @fn: function to call for each task
 * @arg: argument passed to @fn
 *
 * This function iterates over tasks attached to @memcg or to any of its
 * descendants and calls @fn for each task. If @fn returns a non-zero
 * value, the function breaks the iteration loop. Otherwise, it will iterate
 * over all tasks and return 0.
 *
 * This function must not be called for the root memory cgroup.
 */
void mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
			   int (*fn)(struct task_struct *, void *), void *arg)
{
	struct mem_cgroup *iter;
	int ret = 0;

	BUG_ON(mem_cgroup_is_root(memcg));

	for_each_mem_cgroup_tree(iter, memcg) {
		struct css_task_iter it;
		struct task_struct *task;

		css_task_iter_start(&iter->css, CSS_TASK_ITER_PROCS, &it);
		while (!ret && (task = css_task_iter_next(&it)))
			ret = fn(task, arg);
		css_task_iter_end(&it);
		if (ret) {
			mem_cgroup_iter_break(memcg, iter);
			break;
		}
	}
}
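/*
 * The @fn callback has the usual "return non-zero to stop" shape. A purely
 * illustrative sketch of such a callback (count_task() is hypothetical and
 * not defined in this file):
 *
 *	static int count_task(struct task_struct *task, void *arg)
 *	{
 *		(*(unsigned int *)arg)++;
 *		return 0;
 *	}
 */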
#ifdef CONFIG_DEBUG_VM
void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio)
{
	struct mem_cgroup *memcg;

	if (mem_cgroup_disabled())
		return;

	memcg = folio_memcg(folio);

	if (!memcg)
		VM_BUG_ON_FOLIO(!mem_cgroup_is_root(lruvec_memcg(lruvec)), folio);
	else
		VM_BUG_ON_FOLIO(lruvec_memcg(lruvec) != memcg, folio);
}
#endif
/**
 * folio_lruvec_lock - Lock the lruvec for a folio.
 * @folio: Pointer to the folio.
 *
 * These functions are safe to use under any of the following conditions:
 * - folio locked
 * - folio_test_lru false
 * - folio frozen (refcount of 0)
 *
 * Return: The lruvec this folio is on with its lock held.
 */
struct lruvec *folio_lruvec_lock(struct folio *folio)
{
	struct lruvec *lruvec = folio_lruvec(folio);

	spin_lock(&lruvec->lru_lock);
	lruvec_memcg_debug(lruvec, folio);

	return lruvec;
}
/**
 * folio_lruvec_lock_irq - Lock the lruvec for a folio.
 * @folio: Pointer to the folio.
 *
 * These functions are safe to use under any of the following conditions:
 * - folio locked
 * - folio_test_lru false
 * - folio frozen (refcount of 0)
 *
 * Return: The lruvec this folio is on with its lock held and interrupts
 * disabled.
 */
struct lruvec *folio_lruvec_lock_irq(struct folio *folio)
{
	struct lruvec *lruvec = folio_lruvec(folio);

	spin_lock_irq(&lruvec->lru_lock);
	lruvec_memcg_debug(lruvec, folio);

	return lruvec;
}
/**
 * folio_lruvec_lock_irqsave - Lock the lruvec for a folio.
 * @folio: Pointer to the folio.
 * @flags: Pointer to irqsave flags.
 *
 * These functions are safe to use under any of the following conditions:
 * - folio locked
 * - folio_test_lru false
 * - folio frozen (refcount of 0)
 *
 * Return: The lruvec this folio is on with its lock held and interrupts
 * disabled.
 */
struct lruvec *folio_lruvec_lock_irqsave(struct folio *folio,
					 unsigned long *flags)
{
	struct lruvec *lruvec = folio_lruvec(folio);

	spin_lock_irqsave(&lruvec->lru_lock, *flags);
	lruvec_memcg_debug(lruvec, folio);

	return lruvec;
}
/**
 * mem_cgroup_update_lru_size - account for adding or removing an lru page
 * @lruvec: mem_cgroup per zone lru vector
 * @lru: index of lru list the page is sitting on
 * @zid: zone id of the accounted pages
 * @nr_pages: positive when adding or negative when removing
 *
 * This function must be called under lru_lock, just before a page is added
 * to or just after a page is removed from an lru list.
 */
void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
				int zid, int nr_pages)
{
	struct mem_cgroup_per_node *mz;
	unsigned long *lru_size;
	long size;

	if (mem_cgroup_disabled())
		return;

	mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
	lru_size = &mz->lru_zone_size[zid][lru];

	if (nr_pages < 0)
		*lru_size += nr_pages;

	size = *lru_size;
	if (WARN_ONCE(size < 0,
		"%s(%p, %d, %d): lru_size %ld\n",
		__func__, lruvec, lru, nr_pages, size)) {
		VM_BUG_ON(1);
		*lru_size = 0;
	}

	if (nr_pages > 0)
		*lru_size += nr_pages;
}
/**
 * mem_cgroup_margin - calculate chargeable space of a memory cgroup
 * @memcg: the memory cgroup
 *
 * Returns the maximum amount of memory @memcg can be charged with, in
 * pages.
 */
static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
{
	unsigned long margin = 0;
	unsigned long count;
	unsigned long limit;

	count = page_counter_read(&memcg->memory);
	limit = READ_ONCE(memcg->memory.max);
	if (count < limit)
		margin = limit - count;

	if (do_memsw_account()) {
		count = page_counter_read(&memcg->memsw);
		limit = READ_ONCE(memcg->memsw.max);
		if (count < limit)
			margin = min(margin, limit - count);
		else
			margin = 0;
	}

	return margin;
}
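/*
 * E.g. with memory.max = 1000 pages and usage = 900, the margin is 100;
 * if memsw accounting is enabled with memsw.max = 1200 and memsw usage =
 * 1150, the combined margin shrinks to min(100, 50) = 50 pages.
 */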
struct memory_stat {
	const char *name;
	unsigned int idx;
};

static const struct memory_stat memory_stats[] = {
	{ "anon", NR_ANON_MAPPED },
	{ "file", NR_FILE_PAGES },
	{ "kernel", MEMCG_KMEM },
	{ "kernel_stack", NR_KERNEL_STACK_KB },
	{ "pagetables", NR_PAGETABLE },
	{ "sec_pagetables", NR_SECONDARY_PAGETABLE },
	{ "percpu", MEMCG_PERCPU_B },
	{ "sock", MEMCG_SOCK },
	{ "vmalloc", MEMCG_VMALLOC },
	{ "shmem", NR_SHMEM },
	{ "zswap", MEMCG_ZSWAP_B },
	{ "zswapped", MEMCG_ZSWAPPED },
	{ "file_mapped", NR_FILE_MAPPED },
	{ "file_dirty", NR_FILE_DIRTY },
	{ "file_writeback", NR_WRITEBACK },
	{ "swapcached", NR_SWAPCACHE },
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	{ "anon_thp", NR_ANON_THPS },
	{ "file_thp", NR_FILE_THPS },
	{ "shmem_thp", NR_SHMEM_THPS },
#endif
	{ "inactive_anon", NR_INACTIVE_ANON },
	{ "active_anon", NR_ACTIVE_ANON },
	{ "inactive_file", NR_INACTIVE_FILE },
	{ "active_file", NR_ACTIVE_FILE },
	{ "unevictable", NR_UNEVICTABLE },
	{ "slab_reclaimable", NR_SLAB_RECLAIMABLE_B },
	{ "slab_unreclaimable", NR_SLAB_UNRECLAIMABLE_B },
#ifdef CONFIG_HUGETLB_PAGE
	{ "hugetlb", NR_HUGETLB },
#endif

	/* The memory events */
	{ "workingset_refault_anon", WORKINGSET_REFAULT_ANON },
	{ "workingset_refault_file", WORKINGSET_REFAULT_FILE },
	{ "workingset_activate_anon", WORKINGSET_ACTIVATE_ANON },
	{ "workingset_activate_file", WORKINGSET_ACTIVATE_FILE },
	{ "workingset_restore_anon", WORKINGSET_RESTORE_ANON },
	{ "workingset_restore_file", WORKINGSET_RESTORE_FILE },
	{ "workingset_nodereclaim", WORKINGSET_NODERECLAIM },

	{ "pgdemote_kswapd", PGDEMOTE_KSWAPD },
	{ "pgdemote_direct", PGDEMOTE_DIRECT },
	{ "pgdemote_khugepaged", PGDEMOTE_KHUGEPAGED },
#ifdef CONFIG_NUMA_BALANCING
	{ "pgpromote_success", PGPROMOTE_SUCCESS },
#endif
};
/* The actual unit of the state item, not the same as the output unit */
static int memcg_page_state_unit(int item)
{
	switch (item) {
	case MEMCG_PERCPU_B:
	case MEMCG_ZSWAP_B:
	case NR_SLAB_RECLAIMABLE_B:
	case NR_SLAB_UNRECLAIMABLE_B:
		return 1;
	case NR_KERNEL_STACK_KB:
		return SZ_1K;
	default:
		return PAGE_SIZE;
	}
}

/* Translate stat items to the correct unit for memory.stat output */
static int memcg_page_state_output_unit(int item)
{
	/*
	 * Workingset state is actually in pages, but we export it to userspace
	 * as a scalar count of events, so special case it here.
	 *
	 * Demotion and promotion activities are exported in pages, consistent
	 * with their global counterparts.
	 */
	switch (item) {
	case WORKINGSET_REFAULT_ANON:
	case WORKINGSET_REFAULT_FILE:
	case WORKINGSET_ACTIVATE_ANON:
	case WORKINGSET_ACTIVATE_FILE:
	case WORKINGSET_RESTORE_ANON:
	case WORKINGSET_RESTORE_FILE:
	case WORKINGSET_NODERECLAIM:
	case PGDEMOTE_KSWAPD:
	case PGDEMOTE_DIRECT:
	case PGDEMOTE_KHUGEPAGED:
#ifdef CONFIG_NUMA_BALANCING
	case PGPROMOTE_SUCCESS:
#endif
		return 1;
	default:
		return memcg_page_state_unit(item);
	}
}
unsigned long memcg_page_state_output(struct mem_cgroup *memcg, int item)
{
	return memcg_page_state(memcg, item) *
		memcg_page_state_output_unit(item);
}

unsigned long memcg_page_state_local_output(struct mem_cgroup *memcg, int item)
{
	return memcg_page_state_local(memcg, item) *
		memcg_page_state_output_unit(item);
}
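/*
 * Put together, a memory.stat value is the raw counter times its output
 * unit: kernel_stack is stored in KB and printed as bytes (times SZ_1K),
 * slab counters are already bytes (times 1), ordinary per-page items are
 * printed as bytes (times PAGE_SIZE), and the workingset and demotion /
 * promotion counters stay raw event/page counts (times 1).
 */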
static void memcg_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
{
	int i;

	/*
	 * Provide statistics on the state of the memory subsystem as
	 * well as cumulative event counters that show past behavior.
	 *
	 * This list is ordered following a combination of these gradients:
	 * 1) generic big picture -> specifics and details
	 * 2) reflecting userspace activity -> reflecting kernel heuristics
	 *
	 * Current memory state:
	 */
	mem_cgroup_flush_stats(memcg);

	for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
		u64 size;

#ifdef CONFIG_HUGETLB_PAGE
		if (unlikely(memory_stats[i].idx == NR_HUGETLB) &&
		    !(cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING))
			continue;
#endif
		size = memcg_page_state_output(memcg, memory_stats[i].idx);
		seq_buf_printf(s, "%s %llu\n", memory_stats[i].name, size);

		if (unlikely(memory_stats[i].idx == NR_SLAB_UNRECLAIMABLE_B)) {
			size += memcg_page_state_output(memcg,
							NR_SLAB_RECLAIMABLE_B);
			seq_buf_printf(s, "slab %llu\n", size);
		}
	}

	/* Accumulated memory events */
	seq_buf_printf(s, "pgscan %lu\n",
		       memcg_events(memcg, PGSCAN_KSWAPD) +
		       memcg_events(memcg, PGSCAN_DIRECT) +
		       memcg_events(memcg, PGSCAN_KHUGEPAGED));
	seq_buf_printf(s, "pgsteal %lu\n",
		       memcg_events(memcg, PGSTEAL_KSWAPD) +
		       memcg_events(memcg, PGSTEAL_DIRECT) +
		       memcg_events(memcg, PGSTEAL_KHUGEPAGED));

	for (i = 0; i < ARRAY_SIZE(memcg_vm_event_stat); i++) {
#ifdef CONFIG_MEMCG_V1
		if (memcg_vm_event_stat[i] == PGPGIN ||
		    memcg_vm_event_stat[i] == PGPGOUT)
			continue;
#endif
		seq_buf_printf(s, "%s %lu\n",
			       vm_event_name(memcg_vm_event_stat[i]),
			       memcg_events(memcg, memcg_vm_event_stat[i]));
	}
}
static void memory_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
{
	if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
		memcg_stat_format(memcg, s);
	else
		memcg1_stat_format(memcg, s);
	if (seq_buf_has_overflowed(s))
		pr_warn("%s: Warning, stat buffer overflow, please report\n", __func__);
}
/**
 * mem_cgroup_print_oom_context: Print OOM information relevant to
 * memory controller.
 * @memcg: The memory cgroup that went over limit
 * @p: Task that is going to be killed
 *
 * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is
 * enabled.
 */
void mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *p)
{
	rcu_read_lock();

	if (memcg) {
		pr_cont(",oom_memcg=");
		pr_cont_cgroup_path(memcg->css.cgroup);
	} else
		pr_cont(",global_oom");
	if (p) {
		pr_cont(",task_memcg=");
		pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id));
	}
	rcu_read_unlock();
}
/**
 * mem_cgroup_print_oom_meminfo: Print OOM memory information relevant to
 * memory controller.
 * @memcg: The memory cgroup that went over limit
 */
void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg)
{
	/* Use static buffer, for the caller is holding oom_lock. */
	static char buf[SEQ_BUF_SIZE];
	struct seq_buf s;

	lockdep_assert_held(&oom_lock);

	pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n",
		K((u64)page_counter_read(&memcg->memory)),
		K((u64)READ_ONCE(memcg->memory.max)), memcg->memory.failcnt);
	if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
		pr_info("swap: usage %llukB, limit %llukB, failcnt %lu\n",
			K((u64)page_counter_read(&memcg->swap)),
			K((u64)READ_ONCE(memcg->swap.max)), memcg->swap.failcnt);
#ifdef CONFIG_MEMCG_V1
	else {
		pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n",
			K((u64)page_counter_read(&memcg->memsw)),
			K((u64)memcg->memsw.max), memcg->memsw.failcnt);
		pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n",
			K((u64)page_counter_read(&memcg->kmem)),
			K((u64)memcg->kmem.max), memcg->kmem.failcnt);
	}
#endif

	pr_info("Memory cgroup stats for ");
	pr_cont_cgroup_path(memcg->css.cgroup);
	pr_cont(":");
	seq_buf_init(&s, buf, SEQ_BUF_SIZE);
	memory_stat_format(memcg, &s);
	seq_buf_do_printk(&s, KERN_INFO);
}
/*
 * Return the memory (and swap, if configured) limit for a memcg.
 */
unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg)
{
	unsigned long max = READ_ONCE(memcg->memory.max);

	if (do_memsw_account()) {
		if (mem_cgroup_swappiness(memcg)) {
			/* Calculate swap excess capacity from memsw limit */
			unsigned long swap = READ_ONCE(memcg->memsw.max) - max;

			max += min(swap, (unsigned long)total_swap_pages);
		}
	} else {
		if (mem_cgroup_swappiness(memcg))
			max += min(READ_ONCE(memcg->swap.max),
				   (unsigned long)total_swap_pages);
	}
	return max;
}

unsigned long mem_cgroup_size(struct mem_cgroup *memcg)
{
	return page_counter_read(&memcg->memory);
}
static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
				     int order)
{
	struct oom_control oc = {
		.zonelist = NULL,
		.nodemask = NULL,
		.memcg = memcg,
		.gfp_mask = gfp_mask,
		.order = order,
	};
	bool ret = true;

	if (mutex_lock_killable(&oom_lock))
		return true;

	if (mem_cgroup_margin(memcg) >= (1 << order))
		goto unlock;

	/*
	 * A few threads which were not waiting at mutex_lock_killable() can
	 * fail to bail out. Therefore, check again after holding oom_lock.
	 */
	ret = task_is_dying() || out_of_memory(&oc);

unlock:
	mutex_unlock(&oom_lock);
	return ret;
}
/*
 * Returns true if successfully killed one or more processes. Though in some
 * corner cases it can return true even without killing any process.
 */
static bool mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
{
	bool locked, ret;

	if (order > PAGE_ALLOC_COSTLY_ORDER)
		return false;

	memcg_memory_event(memcg, MEMCG_OOM);

	if (!memcg1_oom_prepare(memcg, &locked))
		return false;

	ret = mem_cgroup_out_of_memory(memcg, mask, order);

	memcg1_oom_finish(memcg, locked);

	return ret;
}
/**
 * mem_cgroup_get_oom_group - get a memory cgroup to clean up after OOM
 * @victim: task to be killed by the OOM killer
 * @oom_domain: memcg in case of memcg OOM, NULL in case of system-wide OOM
 *
 * Returns a pointer to a memory cgroup, which has to be cleaned up
 * by killing all belonging OOM-killable tasks.
 *
 * Caller has to call mem_cgroup_put() on the returned non-NULL memcg.
 */
struct mem_cgroup *mem_cgroup_get_oom_group(struct task_struct *victim,
					    struct mem_cgroup *oom_domain)
{
	struct mem_cgroup *oom_group = NULL;
	struct mem_cgroup *memcg;

	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
		return NULL;

	if (!oom_domain)
		oom_domain = root_mem_cgroup;

	rcu_read_lock();

	memcg = mem_cgroup_from_task(victim);
	if (mem_cgroup_is_root(memcg))
		goto out;

	/*
	 * If the victim task has been asynchronously moved to a different
	 * memory cgroup, we might end up killing tasks outside oom_domain.
	 * In this case it's better to ignore memory.group.oom.
	 */
	if (unlikely(!mem_cgroup_is_descendant(memcg, oom_domain)))
		goto out;

	/*
	 * Traverse the memory cgroup hierarchy from the victim task's
	 * cgroup up to the OOMing cgroup (or root) to find the
	 * highest-level memory cgroup with oom.group set.
	 */
	for (; memcg; memcg = parent_mem_cgroup(memcg)) {
		if (READ_ONCE(memcg->oom_group))
			oom_group = memcg;

		if (memcg == oom_domain)
			break;
	}

	if (oom_group)
		css_get(&oom_group->css);
out:
	rcu_read_unlock();

	return oom_group;
}
void mem_cgroup_print_oom_group(struct mem_cgroup *memcg)
{
	pr_info("Tasks in ");
	pr_cont_cgroup_path(memcg->css.cgroup);
	pr_cont(" are going to be killed due to memory.oom.group set\n");
}
struct memcg_stock_pcp {
	local_lock_t stock_lock;
	struct mem_cgroup *cached; /* this is never the root cgroup */
	unsigned int nr_pages;

	struct obj_cgroup *cached_objcg;
	struct pglist_data *cached_pgdat;
	unsigned int nr_bytes;
	int nr_slab_reclaimable_b;
	int nr_slab_unreclaimable_b;

	struct work_struct work;
	unsigned long flags;
#define FLUSHING_CACHED_CHARGE	0
};

static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock) = {
	.stock_lock = INIT_LOCAL_LOCK(stock_lock),
};
static DEFINE_MUTEX(percpu_charge_mutex);
static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock);
static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
				     struct mem_cgroup *root_memcg);
/**
 * consume_stock: Try to consume stocked charge on this cpu.
 * @memcg: memcg to consume from.
 * @nr_pages: how many pages to charge.
 *
 * The charges will only happen if @memcg matches the current cpu's memcg
 * stock, and at least @nr_pages are available in that stock. Failure to
 * service an allocation will refill the stock.
 *
 * returns true if successful, false otherwise.
 */
static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
{
	struct memcg_stock_pcp *stock;
	unsigned int stock_pages;
	unsigned long flags;
	bool ret = false;

	if (nr_pages > MEMCG_CHARGE_BATCH)
		return ret;

	local_lock_irqsave(&memcg_stock.stock_lock, flags);

	stock = this_cpu_ptr(&memcg_stock);
	stock_pages = READ_ONCE(stock->nr_pages);
	if (memcg == READ_ONCE(stock->cached) && stock_pages >= nr_pages) {
		WRITE_ONCE(stock->nr_pages, stock_pages - nr_pages);
		ret = true;
	}

	local_unlock_irqrestore(&memcg_stock.stock_lock, flags);

	return ret;
}
/*
 * Returns stocks cached in percpu and reset cached information.
 */
static void drain_stock(struct memcg_stock_pcp *stock)
{
	unsigned int stock_pages = READ_ONCE(stock->nr_pages);
	struct mem_cgroup *old = READ_ONCE(stock->cached);

	if (!old)
		return;

	if (stock_pages) {
		page_counter_uncharge(&old->memory, stock_pages);
		if (do_memsw_account())
			page_counter_uncharge(&old->memsw, stock_pages);

		WRITE_ONCE(stock->nr_pages, 0);
	}

	css_put(&old->css);
	WRITE_ONCE(stock->cached, NULL);
}
static void drain_local_stock(struct work_struct *dummy)
{
	struct memcg_stock_pcp *stock;
	struct obj_cgroup *old = NULL;
	unsigned long flags;

	/*
	 * The only protection from cpu hotplug (memcg_hotplug_cpu_dead) vs.
	 * drain_stock races is that we always operate on local CPU stock
	 * here with IRQ disabled.
	 */
	local_lock_irqsave(&memcg_stock.stock_lock, flags);

	stock = this_cpu_ptr(&memcg_stock);
	old = drain_obj_stock(stock);
	drain_stock(stock);
	clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);

	local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
	obj_cgroup_put(old);
}
/*
 * Cache charges(val) to local per_cpu area.
 * This will be consumed by consume_stock() function, later.
 */
static void __refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
{
	struct memcg_stock_pcp *stock;
	unsigned int stock_pages;

	stock = this_cpu_ptr(&memcg_stock);
	if (READ_ONCE(stock->cached) != memcg) { /* reset if necessary */
		drain_stock(stock);
		css_get(&memcg->css);
		WRITE_ONCE(stock->cached, memcg);
	}
	stock_pages = READ_ONCE(stock->nr_pages) + nr_pages;
	WRITE_ONCE(stock->nr_pages, stock_pages);

	if (stock_pages > MEMCG_CHARGE_BATCH)
		drain_stock(stock);
}
static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
{
	unsigned long flags;

	local_lock_irqsave(&memcg_stock.stock_lock, flags);
	__refill_stock(memcg, nr_pages);
	local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
}
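/*
 * In other words, the per-CPU stock acts as a small cache of pre-charged
 * pages: try_charge_memcg() first tries consume_stock(), any surplus from a
 * batched charge is parked here via refill_stock(), and drain_stock() /
 * drain_all_stock() hand the cached charge back to the page counters when
 * the cache goes stale or the subtree needs its headroom back under pressure.
 */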
/*
 * Drains all per-CPU charge caches for given root_memcg resp. subtree
 * of the hierarchy under it.
 */
void drain_all_stock(struct mem_cgroup *root_memcg)
{
	int cpu, curcpu;

	/* If someone's already draining, avoid adding more running workers. */
	if (!mutex_trylock(&percpu_charge_mutex))
		return;
	/*
	 * Notify other cpus that system-wide "drain" is running
	 * We do not care about races with the cpu hotplug because cpu down
	 * as well as workers from this path always operate on the local
	 * per-cpu data. CPU up doesn't touch memcg_stock at all.
	 */
	migrate_disable();
	curcpu = smp_processor_id();
	for_each_online_cpu(cpu) {
		struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
		struct mem_cgroup *memcg;
		bool flush = false;

		rcu_read_lock();
		memcg = READ_ONCE(stock->cached);
		if (memcg && READ_ONCE(stock->nr_pages) &&
		    mem_cgroup_is_descendant(memcg, root_memcg))
			flush = true;
		else if (obj_stock_flush_required(stock, root_memcg))
			flush = true;
		rcu_read_unlock();

		if (flush &&
		    !test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
			if (cpu == curcpu)
				drain_local_stock(&stock->work);
			else if (!cpu_is_isolated(cpu))
				schedule_work_on(cpu, &stock->work);
		}
	}
	migrate_enable();
	mutex_unlock(&percpu_charge_mutex);
}
static int memcg_hotplug_cpu_dead(unsigned int cpu)
{
	struct memcg_stock_pcp *stock;

	stock = &per_cpu(memcg_stock, cpu);
	drain_stock(stock);

	return 0;
}
static unsigned long reclaim_high(struct mem_cgroup *memcg,
				  unsigned int nr_pages,
				  gfp_t gfp_mask)
{
	unsigned long nr_reclaimed = 0;

	do {
		unsigned long pflags;

		if (page_counter_read(&memcg->memory) <=
		    READ_ONCE(memcg->memory.high))
			continue;

		memcg_memory_event(memcg, MEMCG_HIGH);

		psi_memstall_enter(&pflags);
		nr_reclaimed += try_to_free_mem_cgroup_pages(memcg, nr_pages,
							gfp_mask,
							MEMCG_RECLAIM_MAY_SWAP,
							NULL);
		psi_memstall_leave(&pflags);
	} while ((memcg = parent_mem_cgroup(memcg)) &&
		 !mem_cgroup_is_root(memcg));

	return nr_reclaimed;
}
static void high_work_func(struct work_struct *work)
{
	struct mem_cgroup *memcg;

	memcg = container_of(work, struct mem_cgroup, high_work);
	reclaim_high(memcg, MEMCG_CHARGE_BATCH, GFP_KERNEL);
}
/*
 * Clamp the maximum sleep time per allocation batch to 2 seconds. This is
 * enough to still cause a significant slowdown in most cases, while still
 * allowing diagnostics and tracing to proceed without becoming stuck.
 */
#define MEMCG_MAX_HIGH_DELAY_JIFFIES (2UL*HZ)

/*
 * When calculating the delay, we use these on either side of the exponentiation
 * to maintain precision and scale to a reasonable number of jiffies (see the
 * table below).
 *
 * - MEMCG_DELAY_PRECISION_SHIFT: Extra precision bits while translating the
 *   overage ratio to a delay.
 * - MEMCG_DELAY_SCALING_SHIFT: The number of bits to scale down the
 *   proposed penalty in order to reduce to a reasonable number of jiffies, and
 *   to produce a reasonable delay curve.
 *
 * MEMCG_DELAY_SCALING_SHIFT just happens to be a number that produces a
 * reasonable delay curve compared to precision-adjusted overage, not
 * penalising heavily at first, but still making sure that growth beyond the
 * limit penalises misbehaving cgroups by slowing them down exponentially. For
 * example, with a high of 100 megabytes:
 *
 *  +-------+------------------------+
 *  | usage | time to allocate in ms |
 *  +-------+------------------------+
 */
#define MEMCG_DELAY_PRECISION_SHIFT 20
#define MEMCG_DELAY_SCALING_SHIFT 14
static u64 calculate_overage(unsigned long usage, unsigned long high)
{
	u64 overage;

	if (usage <= high)
		return 0;

	/*
	 * Prevent division by 0 in overage calculation by acting as if
	 * it was a threshold of 1 page
	 */
	high = max(high, 1UL);

	overage = usage - high;
	overage <<= MEMCG_DELAY_PRECISION_SHIFT;
	return div64_u64(overage, high);
}
static u64 mem_find_max_overage(struct mem_cgroup *memcg)
{
	u64 overage, max_overage = 0;

	do {
		overage = calculate_overage(page_counter_read(&memcg->memory),
					    READ_ONCE(memcg->memory.high));
		max_overage = max(overage, max_overage);
	} while ((memcg = parent_mem_cgroup(memcg)) &&
		 !mem_cgroup_is_root(memcg));

	return max_overage;
}
static u64 swap_find_max_overage(struct mem_cgroup *memcg)
{
	u64 overage, max_overage = 0;

	do {
		overage = calculate_overage(page_counter_read(&memcg->swap),
					    READ_ONCE(memcg->swap.high));
		if (overage)
			memcg_memory_event(memcg, MEMCG_SWAP_HIGH);
		max_overage = max(overage, max_overage);
	} while ((memcg = parent_mem_cgroup(memcg)) &&
		 !mem_cgroup_is_root(memcg));

	return max_overage;
}
/*
 * Get the number of jiffies that we should penalise a mischievous cgroup which
 * is exceeding its memory.high by checking both it and its ancestors.
 */
static unsigned long calculate_high_delay(struct mem_cgroup *memcg,
					  unsigned int nr_pages,
					  u64 max_overage)
{
	unsigned long penalty_jiffies;

	if (!max_overage)
		return 0;

	/*
	 * We use overage compared to memory.high to calculate the number of
	 * jiffies to sleep (penalty_jiffies). Ideally this value should be
	 * fairly lenient on small overages, and increasingly harsh when the
	 * memcg in question makes it clear that it has no intention of stopping
	 * its crazy behaviour, so we exponentially increase the delay based on
	 * overage amount.
	 */
	penalty_jiffies = max_overage * max_overage * HZ;
	penalty_jiffies >>= MEMCG_DELAY_PRECISION_SHIFT;
	penalty_jiffies >>= MEMCG_DELAY_SCALING_SHIFT;

	/*
	 * Factor in the task's own contribution to the overage, such that four
	 * N-sized allocations are throttled approximately the same as one
	 * 4N-sized allocation.
	 *
	 * MEMCG_CHARGE_BATCH pages is nominal, so work out how much smaller or
	 * larger the current charge batch is than that.
	 */
	return penalty_jiffies * nr_pages / MEMCG_CHARGE_BATCH;
}
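/*
 * Worked example (assuming HZ == 1000): with memory.high = 100M and usage at
 * 110M, calculate_overage() gives (10M << 20) / 100M ~= 104857. Squaring that,
 * multiplying by HZ and shifting right by 20 + 14 bits comes to roughly 640
 * jiffies (~640ms) for a charge of MEMCG_CHARGE_BATCH pages, before the
 * MEMCG_MAX_HIGH_DELAY_JIFFIES clamp is applied by the caller below.
 */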
/*
 * Reclaims memory over the high limit. Called directly from
 * try_charge() (context permitting), as well as from the userland
 * return path where reclaim is always able to block.
 */
void mem_cgroup_handle_over_high(gfp_t gfp_mask)
{
	unsigned long penalty_jiffies;
	unsigned long pflags;
	unsigned long nr_reclaimed;
	unsigned int nr_pages = current->memcg_nr_pages_over_high;
	int nr_retries = MAX_RECLAIM_RETRIES;
	struct mem_cgroup *memcg;
	bool in_retry = false;

	if (likely(!nr_pages))
		return;

	memcg = get_mem_cgroup_from_mm(current->mm);
	current->memcg_nr_pages_over_high = 0;

retry_reclaim:
	/*
	 * Bail if the task is already exiting. Unlike memory.max,
	 * memory.high enforcement isn't as strict, and there is no
	 * OOM killer involved, which means the excess could already
	 * be much bigger (and still growing) than it could for
	 * memory.max; the dying task could get stuck in fruitless
	 * reclaim for a long time, which isn't desirable.
	 */
	if (task_is_dying())
		goto out;

	/*
	 * The allocating task should reclaim at least the batch size, but for
	 * subsequent retries we only want to do what's necessary to prevent oom
	 * or breaching resource isolation.
	 *
	 * This is distinct from memory.max or page allocator behaviour because
	 * memory.high is currently batched, whereas memory.max and the page
	 * allocator run every time an allocation is made.
	 */
	nr_reclaimed = reclaim_high(memcg,
				    in_retry ? SWAP_CLUSTER_MAX : nr_pages,
				    gfp_mask);

	/*
	 * memory.high is breached and reclaim is unable to keep up. Throttle
	 * allocators proactively to slow down excessive growth.
	 */
	penalty_jiffies = calculate_high_delay(memcg, nr_pages,
					       mem_find_max_overage(memcg));

	penalty_jiffies += calculate_high_delay(memcg, nr_pages,
						swap_find_max_overage(memcg));

	/*
	 * Clamp the max delay per usermode return so as to still keep the
	 * application moving forwards and also permit diagnostics, albeit
	 * extremely slowly.
	 */
	penalty_jiffies = min(penalty_jiffies, MEMCG_MAX_HIGH_DELAY_JIFFIES);

	/*
	 * Don't sleep if the amount of jiffies this memcg owes us is so low
	 * that it's not even worth doing, in an attempt to be nice to those who
	 * go only a small amount over their memory.high value and maybe haven't
	 * been aggressively reclaimed enough yet.
	 */
	if (penalty_jiffies <= HZ / 100)
		goto out;

	/*
	 * If reclaim is making forward progress but we're still over
	 * memory.high, we want to encourage that rather than doing allocator
	 * throttling.
	 */
	if (nr_reclaimed || nr_retries--) {
		in_retry = true;
		goto retry_reclaim;
	}

	/*
	 * Reclaim didn't manage to push usage below the limit, slow
	 * this allocating task down.
	 *
	 * If we exit early, we're guaranteed to die (since
	 * schedule_timeout_killable sets TASK_KILLABLE). This means we don't
	 * need to account for any ill-begotten jiffies to pay them off later.
	 */
	psi_memstall_enter(&pflags);
	schedule_timeout_killable(penalty_jiffies);
	psi_memstall_leave(&pflags);

out:
	css_put(&memcg->css);
}
int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
		     unsigned int nr_pages)
{
	unsigned int batch = max(MEMCG_CHARGE_BATCH, nr_pages);
	int nr_retries = MAX_RECLAIM_RETRIES;
	struct mem_cgroup *mem_over_limit;
	struct page_counter *counter;
	unsigned long nr_reclaimed;
	bool passed_oom = false;
	unsigned int reclaim_options = MEMCG_RECLAIM_MAY_SWAP;
	bool drained = false;
	bool raised_max_event = false;
	unsigned long pflags;

retry:
	if (consume_stock(memcg, nr_pages))
		return 0;

	if (!do_memsw_account() ||
	    page_counter_try_charge(&memcg->memsw, batch, &counter)) {
		if (page_counter_try_charge(&memcg->memory, batch, &counter))
			goto done_restock;
		if (do_memsw_account())
			page_counter_uncharge(&memcg->memsw, batch);
		mem_over_limit = mem_cgroup_from_counter(counter, memory);
	} else {
		mem_over_limit = mem_cgroup_from_counter(counter, memsw);
		reclaim_options &= ~MEMCG_RECLAIM_MAY_SWAP;
	}

	if (batch > nr_pages) {
		batch = nr_pages;
		goto retry;
	}

	/*
	 * Prevent unbounded recursion when reclaim operations need to
	 * allocate memory. This might exceed the limits temporarily,
	 * but we prefer facilitating memory reclaim and getting back
	 * under the limit over triggering OOM kills in these cases.
	 */
	if (unlikely(current->flags & PF_MEMALLOC))
		goto force;

	if (unlikely(task_in_memcg_oom(current)))
		goto nomem;

	if (!gfpflags_allow_blocking(gfp_mask))
		goto nomem;

	memcg_memory_event(mem_over_limit, MEMCG_MAX);
	raised_max_event = true;

	psi_memstall_enter(&pflags);
	nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
						    gfp_mask, reclaim_options, NULL);
	psi_memstall_leave(&pflags);

	if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
		goto retry;

	if (!drained) {
		drain_all_stock(mem_over_limit);
		drained = true;
		goto retry;
	}

	if (gfp_mask & __GFP_NORETRY)
		goto nomem;
	/*
	 * Even though the limit is exceeded at this point, reclaim
	 * may have been able to free some pages. Retry the charge
	 * before killing the task.
	 *
	 * Only for regular pages, though: huge pages are rather
	 * unlikely to succeed so close to the limit, and we fall back
	 * to regular pages anyway in case of failure.
	 */
	if (nr_reclaimed && nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER))
		goto retry;

	if (nr_retries--)
		goto retry;

	if (gfp_mask & __GFP_RETRY_MAYFAIL)
		goto nomem;

	/* Avoid endless loop for tasks bypassed by the oom killer */
	if (passed_oom && task_is_dying())
		goto nomem;

	/*
	 * keep retrying as long as the memcg oom killer is able to make
	 * a forward progress or bypass the charge if the oom killer
	 * couldn't make any progress.
	 */
	if (mem_cgroup_oom(mem_over_limit, gfp_mask,
			   get_order(nr_pages * PAGE_SIZE))) {
		passed_oom = true;
		nr_retries = MAX_RECLAIM_RETRIES;
		goto retry;
	}
nomem:
	/*
	 * Memcg doesn't have a dedicated reserve for atomic
	 * allocations. But like the global atomic pool, we need to
	 * put the burden of reclaim on regular allocation requests
	 * and let these go through as privileged allocations.
	 */
	if (!(gfp_mask & (__GFP_NOFAIL | __GFP_HIGH)))
		return -ENOMEM;
force:
	/*
	 * If the allocation has to be enforced, don't forget to raise
	 * a MEMCG_MAX event.
	 */
	if (!raised_max_event)
		memcg_memory_event(mem_over_limit, MEMCG_MAX);

	/*
	 * The allocation either can't fail or will lead to more memory
	 * being freed very soon. Allow memory usage to go over the limit
	 * temporarily by force charging it.
	 */
	page_counter_charge(&memcg->memory, nr_pages);
	if (do_memsw_account())
		page_counter_charge(&memcg->memsw, nr_pages);

	return 0;

done_restock:
	if (batch > nr_pages)
		refill_stock(memcg, batch - nr_pages);

	/*
	 * If the hierarchy is above the normal consumption range, schedule
	 * reclaim on returning to userland. We can perform reclaim here
	 * if __GFP_RECLAIM but let's always punt for simplicity and so that
	 * GFP_KERNEL can consistently be used during reclaim. @memcg is
	 * not recorded as it most likely matches current's and won't
	 * change in the meantime. As high limit is checked again before
	 * reclaim, the cost of mismatch is negligible.
	 */
	do {
		bool mem_high, swap_high;

		mem_high = page_counter_read(&memcg->memory) >
			READ_ONCE(memcg->memory.high);
		swap_high = page_counter_read(&memcg->swap) >
			READ_ONCE(memcg->swap.high);

		/* Don't bother a random interrupted task */
		if (!in_task()) {
			if (mem_high) {
				schedule_work(&memcg->high_work);
				break;
			}
			continue;
		}

		if (mem_high || swap_high) {
			/*
			 * The allocating tasks in this cgroup will need to do
			 * reclaim or be throttled to prevent further growth
			 * of the memory or swap footprints.
			 *
			 * Target some best-effort fairness between the tasks,
			 * and distribute reclaim work and delay penalties
			 * based on how much each task is actually allocating.
			 */
			current->memcg_nr_pages_over_high += batch;
			set_notify_resume(current);
			break;
		}
	} while ((memcg = parent_mem_cgroup(memcg)));

	/*
	 * Reclaim is set up above to be called from the userland
	 * return path. But also attempt synchronous reclaim to avoid
	 * excessive overrun while the task is still inside the
	 * kernel. If this is successful, the return path will see it
	 * when it rechecks the overage and simply bail out.
	 */
	if (current->memcg_nr_pages_over_high > MEMCG_CHARGE_BATCH &&
	    !(current->flags & PF_MEMALLOC) &&
	    gfpflags_allow_blocking(gfp_mask))
		mem_cgroup_handle_over_high(gfp_mask);

	return 0;
}
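/*
 * Illustrative sketch (not part of the kernel source): the batching performed
 * by try_charge_memcg() above. A single-page charge debits a full batch from
 * the page counters and parks the surplus in the per-cpu stock, so subsequent
 * charges on this CPU can be served without touching the counters. The batch
 * size of 64 pages mirrors MEMCG_CHARGE_BATCH but is an assumption here.
 */
#if 0
static void example_charge_batching(unsigned int nr_pages)
{
	unsigned int batch = max(64u, nr_pages);	/* assumed MEMCG_CHARGE_BATCH */
	unsigned int surplus = batch - nr_pages;	/* e.g. 63 for a 1-page charge */

	/*
	 * Conceptually: page_counter_try_charge(&memcg->memory, batch, ...)
	 * followed by refill_stock(memcg, surplus) in done_restock.
	 */
	(void)surplus;
}
#endif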
/**
 * mem_cgroup_cancel_charge() - cancel an uncommitted try_charge() call.
 * @memcg: memcg previously charged.
 * @nr_pages: number of pages previously charged.
 */
void mem_cgroup_cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
{
	if (mem_cgroup_is_root(memcg))
		return;

	page_counter_uncharge(&memcg->memory, nr_pages);
	if (do_memsw_account())
		page_counter_uncharge(&memcg->memsw, nr_pages);
}

static void commit_charge(struct folio *folio, struct mem_cgroup *memcg)
{
	VM_BUG_ON_FOLIO(folio_memcg_charged(folio), folio);
	/*
	 * Any of the following ensures page's memcg stability:
	 *
	 * - the page lock
	 * - LRU isolation
	 * - exclusive reference
	 */
	folio->memcg_data = (unsigned long)memcg;
}

/**
 * mem_cgroup_commit_charge - commit a previously successful try_charge().
 * @folio: folio to commit the charge to.
 * @memcg: memcg previously charged.
 */
void mem_cgroup_commit_charge(struct folio *folio, struct mem_cgroup *memcg)
{
	css_get(&memcg->css);
	commit_charge(folio, memcg);
	memcg1_commit_charge(folio, memcg);
}
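/*
 * Illustrative sketch (hypothetical caller, not part of this file): the
 * two-step protocol the helpers above implement. Charge the memcg first,
 * then either commit the charge to the folio that was eventually obtained
 * or cancel it if the allocation fell through. example_alloc_folio() is a
 * made-up placeholder for whatever actually produces the folio.
 */
#if 0
static int example_two_step_charge(struct mem_cgroup *memcg, gfp_t gfp,
				   unsigned int nr_pages)
{
	struct folio *folio;

	if (try_charge(memcg, gfp, nr_pages))
		return -ENOMEM;

	folio = example_alloc_folio(nr_pages);	/* hypothetical helper */
	if (!folio) {
		mem_cgroup_cancel_charge(memcg, nr_pages);
		return -ENOMEM;
	}

	mem_cgroup_commit_charge(folio, memcg);
	return 0;
}
#endif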
2414 static inline void __mod_objcg_mlstate(struct obj_cgroup
*objcg
,
2415 struct pglist_data
*pgdat
,
2416 enum node_stat_item idx
, int nr
)
2418 struct mem_cgroup
*memcg
;
2419 struct lruvec
*lruvec
;
2422 memcg
= obj_cgroup_memcg(objcg
);
2423 lruvec
= mem_cgroup_lruvec(memcg
, pgdat
);
2424 __mod_memcg_lruvec_state(lruvec
, idx
, nr
);
2428 static __always_inline
2429 struct mem_cgroup
*mem_cgroup_from_obj_folio(struct folio
*folio
, void *p
)
2432 * Slab objects are accounted individually, not per-page.
2433 * Memcg membership data for each individual object is saved in
2436 if (folio_test_slab(folio
)) {
2437 struct slabobj_ext
*obj_exts
;
2441 slab
= folio_slab(folio
);
2442 obj_exts
= slab_obj_exts(slab
);
2446 off
= obj_to_index(slab
->slab_cache
, slab
, p
);
2447 if (obj_exts
[off
].objcg
)
2448 return obj_cgroup_memcg(obj_exts
[off
].objcg
);
2454 * folio_memcg_check() is used here, because in theory we can encounter
2455 * a folio where the slab flag has been cleared already, but
2456 * slab->obj_exts has not been freed yet
2457 * folio_memcg_check() will guarantee that a proper memory
2458 * cgroup pointer or NULL will be returned.
2460 return folio_memcg_check(folio
);
2464 * Returns a pointer to the memory cgroup to which the kernel object is charged.
2465 * It is not suitable for objects allocated using vmalloc().
2467 * A passed kernel object must be a slab object or a generic kernel page.
2469 * The caller must ensure the memcg lifetime, e.g. by taking rcu_read_lock(),
2470 * cgroup_mutex, etc.
2472 struct mem_cgroup
*mem_cgroup_from_slab_obj(void *p
)
2474 if (mem_cgroup_disabled())
2477 return mem_cgroup_from_obj_folio(virt_to_folio(p
), p
);
2480 static struct obj_cgroup
*__get_obj_cgroup_from_memcg(struct mem_cgroup
*memcg
)
2482 struct obj_cgroup
*objcg
= NULL
;
2484 for (; !mem_cgroup_is_root(memcg
); memcg
= parent_mem_cgroup(memcg
)) {
2485 objcg
= rcu_dereference(memcg
->objcg
);
2486 if (likely(objcg
&& obj_cgroup_tryget(objcg
)))
2493 static struct obj_cgroup
*current_objcg_update(void)
2495 struct mem_cgroup
*memcg
;
2496 struct obj_cgroup
*old
, *objcg
= NULL
;
2499 /* Atomically drop the update bit. */
2500 old
= xchg(¤t
->objcg
, NULL
);
2502 old
= (struct obj_cgroup
*)
2503 ((unsigned long)old
& ~CURRENT_OBJCG_UPDATE_FLAG
);
2504 obj_cgroup_put(old
);
2509 /* If new objcg is NULL, no reason for the second atomic update. */
2510 if (!current
->mm
|| (current
->flags
& PF_KTHREAD
))
2514 * Release the objcg pointer from the previous iteration,
2515 * if try_cmpxcg() below fails.
2517 if (unlikely(objcg
)) {
2518 obj_cgroup_put(objcg
);
2523 * Obtain the new objcg pointer. The current task can be
2524 * asynchronously moved to another memcg and the previous
2525 * memcg can be offlined. So let's get the memcg pointer
2526 * and try get a reference to objcg under a rcu read lock.
2530 memcg
= mem_cgroup_from_task(current
);
2531 objcg
= __get_obj_cgroup_from_memcg(memcg
);
2535 * Try set up a new objcg pointer atomically. If it
2536 * fails, it means the update flag was set concurrently, so
2537 * the whole procedure should be repeated.
2539 } while (!try_cmpxchg(¤t
->objcg
, &old
, objcg
));
2544 __always_inline
struct obj_cgroup
*current_obj_cgroup(void)
2546 struct mem_cgroup
*memcg
;
2547 struct obj_cgroup
*objcg
;
2550 memcg
= current
->active_memcg
;
2551 if (unlikely(memcg
))
2554 objcg
= READ_ONCE(current
->objcg
);
2555 if (unlikely((unsigned long)objcg
& CURRENT_OBJCG_UPDATE_FLAG
))
2556 objcg
= current_objcg_update();
2558 * Objcg reference is kept by the task, so it's safe
2559 * to use the objcg by the current task.
2564 memcg
= this_cpu_read(int_active_memcg
);
2565 if (unlikely(memcg
))
2572 for (; !mem_cgroup_is_root(memcg
); memcg
= parent_mem_cgroup(memcg
)) {
2574 * Memcg pointer is protected by scope (see set_active_memcg())
2575 * and is pinning the corresponding objcg, so objcg can't go
2576 * away and can be used within the scope without any additional
2579 objcg
= rcu_dereference_check(memcg
->objcg
, 1);
2587 struct obj_cgroup
*get_obj_cgroup_from_folio(struct folio
*folio
)
2589 struct obj_cgroup
*objcg
;
2591 if (!memcg_kmem_online())
2594 if (folio_memcg_kmem(folio
)) {
2595 objcg
= __folio_objcg(folio
);
2596 obj_cgroup_get(objcg
);
2598 struct mem_cgroup
*memcg
;
2601 memcg
= __folio_memcg(folio
);
2603 objcg
= __get_obj_cgroup_from_memcg(memcg
);
2612 * obj_cgroup_uncharge_pages: uncharge a number of kernel pages from a objcg
2613 * @objcg: object cgroup to uncharge
2614 * @nr_pages: number of pages to uncharge
2616 static void obj_cgroup_uncharge_pages(struct obj_cgroup
*objcg
,
2617 unsigned int nr_pages
)
2619 struct mem_cgroup
*memcg
;
2621 memcg
= get_mem_cgroup_from_objcg(objcg
);
2623 mod_memcg_state(memcg
, MEMCG_KMEM
, -nr_pages
);
2624 memcg1_account_kmem(memcg
, -nr_pages
);
2625 refill_stock(memcg
, nr_pages
);
2627 css_put(&memcg
->css
);
2631 * obj_cgroup_charge_pages: charge a number of kernel pages to a objcg
2632 * @objcg: object cgroup to charge
2633 * @gfp: reclaim mode
2634 * @nr_pages: number of pages to charge
2636 * Returns 0 on success, an error code on failure.
2638 static int obj_cgroup_charge_pages(struct obj_cgroup
*objcg
, gfp_t gfp
,
2639 unsigned int nr_pages
)
2641 struct mem_cgroup
*memcg
;
2644 memcg
= get_mem_cgroup_from_objcg(objcg
);
2646 ret
= try_charge_memcg(memcg
, gfp
, nr_pages
);
2650 mod_memcg_state(memcg
, MEMCG_KMEM
, nr_pages
);
2651 memcg1_account_kmem(memcg
, nr_pages
);
2653 css_put(&memcg
->css
);
2659 * __memcg_kmem_charge_page: charge a kmem page to the current memory cgroup
2660 * @page: page to charge
2661 * @gfp: reclaim mode
2662 * @order: allocation order
2664 * Returns 0 on success, an error code on failure.
2666 int __memcg_kmem_charge_page(struct page
*page
, gfp_t gfp
, int order
)
2668 struct obj_cgroup
*objcg
;
2671 objcg
= current_obj_cgroup();
2673 ret
= obj_cgroup_charge_pages(objcg
, gfp
, 1 << order
);
2675 obj_cgroup_get(objcg
);
2676 page
->memcg_data
= (unsigned long)objcg
|
2685 * __memcg_kmem_uncharge_page: uncharge a kmem page
2686 * @page: page to uncharge
2687 * @order: allocation order
2689 void __memcg_kmem_uncharge_page(struct page
*page
, int order
)
2691 struct folio
*folio
= page_folio(page
);
2692 struct obj_cgroup
*objcg
;
2693 unsigned int nr_pages
= 1 << order
;
2695 if (!folio_memcg_kmem(folio
))
2698 objcg
= __folio_objcg(folio
);
2699 obj_cgroup_uncharge_pages(objcg
, nr_pages
);
2700 folio
->memcg_data
= 0;
2701 obj_cgroup_put(objcg
);
2704 static void mod_objcg_state(struct obj_cgroup
*objcg
, struct pglist_data
*pgdat
,
2705 enum node_stat_item idx
, int nr
)
2707 struct memcg_stock_pcp
*stock
;
2708 struct obj_cgroup
*old
= NULL
;
2709 unsigned long flags
;
2712 local_lock_irqsave(&memcg_stock
.stock_lock
, flags
);
2713 stock
= this_cpu_ptr(&memcg_stock
);
2716 * Save vmstat data in stock and skip vmstat array update unless
2717 * accumulating over a page of vmstat data or when pgdat or idx
2720 if (READ_ONCE(stock
->cached_objcg
) != objcg
) {
2721 old
= drain_obj_stock(stock
);
2722 obj_cgroup_get(objcg
);
2723 stock
->nr_bytes
= atomic_read(&objcg
->nr_charged_bytes
)
2724 ? atomic_xchg(&objcg
->nr_charged_bytes
, 0) : 0;
2725 WRITE_ONCE(stock
->cached_objcg
, objcg
);
2726 stock
->cached_pgdat
= pgdat
;
2727 } else if (stock
->cached_pgdat
!= pgdat
) {
2728 /* Flush the existing cached vmstat data */
2729 struct pglist_data
*oldpg
= stock
->cached_pgdat
;
2731 if (stock
->nr_slab_reclaimable_b
) {
2732 __mod_objcg_mlstate(objcg
, oldpg
, NR_SLAB_RECLAIMABLE_B
,
2733 stock
->nr_slab_reclaimable_b
);
2734 stock
->nr_slab_reclaimable_b
= 0;
2736 if (stock
->nr_slab_unreclaimable_b
) {
2737 __mod_objcg_mlstate(objcg
, oldpg
, NR_SLAB_UNRECLAIMABLE_B
,
2738 stock
->nr_slab_unreclaimable_b
);
2739 stock
->nr_slab_unreclaimable_b
= 0;
2741 stock
->cached_pgdat
= pgdat
;
2744 bytes
= (idx
== NR_SLAB_RECLAIMABLE_B
) ? &stock
->nr_slab_reclaimable_b
2745 : &stock
->nr_slab_unreclaimable_b
;
2747 * Even for large object >= PAGE_SIZE, the vmstat data will still be
2748 * cached locally at least once before pushing it out.
2755 if (abs(*bytes
) > PAGE_SIZE
) {
2763 __mod_objcg_mlstate(objcg
, pgdat
, idx
, nr
);
2765 local_unlock_irqrestore(&memcg_stock
.stock_lock
, flags
);
2766 obj_cgroup_put(old
);
2769 static bool consume_obj_stock(struct obj_cgroup
*objcg
, unsigned int nr_bytes
)
2771 struct memcg_stock_pcp
*stock
;
2772 unsigned long flags
;
2775 local_lock_irqsave(&memcg_stock
.stock_lock
, flags
);
2777 stock
= this_cpu_ptr(&memcg_stock
);
2778 if (objcg
== READ_ONCE(stock
->cached_objcg
) && stock
->nr_bytes
>= nr_bytes
) {
2779 stock
->nr_bytes
-= nr_bytes
;
2783 local_unlock_irqrestore(&memcg_stock
.stock_lock
, flags
);
2788 static struct obj_cgroup
*drain_obj_stock(struct memcg_stock_pcp
*stock
)
2790 struct obj_cgroup
*old
= READ_ONCE(stock
->cached_objcg
);
2795 if (stock
->nr_bytes
) {
2796 unsigned int nr_pages
= stock
->nr_bytes
>> PAGE_SHIFT
;
2797 unsigned int nr_bytes
= stock
->nr_bytes
& (PAGE_SIZE
- 1);
2800 struct mem_cgroup
*memcg
;
2802 memcg
= get_mem_cgroup_from_objcg(old
);
2804 mod_memcg_state(memcg
, MEMCG_KMEM
, -nr_pages
);
2805 memcg1_account_kmem(memcg
, -nr_pages
);
2806 __refill_stock(memcg
, nr_pages
);
2808 css_put(&memcg
->css
);
2812 * The leftover is flushed to the centralized per-memcg value.
2813 * On the next attempt to refill obj stock it will be moved
2814 * to a per-cpu stock (probably, on an other CPU), see
2815 * refill_obj_stock().
2817 * How often it's flushed is a trade-off between the memory
2818 * limit enforcement accuracy and potential CPU contention,
2819 * so it might be changed in the future.
2821 atomic_add(nr_bytes
, &old
->nr_charged_bytes
);
2822 stock
->nr_bytes
= 0;
2826 * Flush the vmstat data in current stock
2828 if (stock
->nr_slab_reclaimable_b
|| stock
->nr_slab_unreclaimable_b
) {
2829 if (stock
->nr_slab_reclaimable_b
) {
2830 __mod_objcg_mlstate(old
, stock
->cached_pgdat
,
2831 NR_SLAB_RECLAIMABLE_B
,
2832 stock
->nr_slab_reclaimable_b
);
2833 stock
->nr_slab_reclaimable_b
= 0;
2835 if (stock
->nr_slab_unreclaimable_b
) {
2836 __mod_objcg_mlstate(old
, stock
->cached_pgdat
,
2837 NR_SLAB_UNRECLAIMABLE_B
,
2838 stock
->nr_slab_unreclaimable_b
);
2839 stock
->nr_slab_unreclaimable_b
= 0;
2841 stock
->cached_pgdat
= NULL
;
2844 WRITE_ONCE(stock
->cached_objcg
, NULL
);
2846 * The `old' objects needs to be released by the caller via
2847 * obj_cgroup_put() outside of memcg_stock_pcp::stock_lock.
2852 static bool obj_stock_flush_required(struct memcg_stock_pcp
*stock
,
2853 struct mem_cgroup
*root_memcg
)
2855 struct obj_cgroup
*objcg
= READ_ONCE(stock
->cached_objcg
);
2856 struct mem_cgroup
*memcg
;
2859 memcg
= obj_cgroup_memcg(objcg
);
2860 if (memcg
&& mem_cgroup_is_descendant(memcg
, root_memcg
))
2867 static void refill_obj_stock(struct obj_cgroup
*objcg
, unsigned int nr_bytes
,
2868 bool allow_uncharge
)
2870 struct memcg_stock_pcp
*stock
;
2871 struct obj_cgroup
*old
= NULL
;
2872 unsigned long flags
;
2873 unsigned int nr_pages
= 0;
2875 local_lock_irqsave(&memcg_stock
.stock_lock
, flags
);
2877 stock
= this_cpu_ptr(&memcg_stock
);
2878 if (READ_ONCE(stock
->cached_objcg
) != objcg
) { /* reset if necessary */
2879 old
= drain_obj_stock(stock
);
2880 obj_cgroup_get(objcg
);
2881 WRITE_ONCE(stock
->cached_objcg
, objcg
);
2882 stock
->nr_bytes
= atomic_read(&objcg
->nr_charged_bytes
)
2883 ? atomic_xchg(&objcg
->nr_charged_bytes
, 0) : 0;
2884 allow_uncharge
= true; /* Allow uncharge when objcg changes */
2886 stock
->nr_bytes
+= nr_bytes
;
2888 if (allow_uncharge
&& (stock
->nr_bytes
> PAGE_SIZE
)) {
2889 nr_pages
= stock
->nr_bytes
>> PAGE_SHIFT
;
2890 stock
->nr_bytes
&= (PAGE_SIZE
- 1);
2893 local_unlock_irqrestore(&memcg_stock
.stock_lock
, flags
);
2894 obj_cgroup_put(old
);
2897 obj_cgroup_uncharge_pages(objcg
, nr_pages
);
}

int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size)
{
	unsigned int nr_pages, nr_bytes;
	int ret;

	if (consume_obj_stock(objcg, size))
		return 0;

	/*
	 * In theory, objcg->nr_charged_bytes can have enough
	 * pre-charged bytes to satisfy the allocation. However,
	 * flushing objcg->nr_charged_bytes requires two atomic
	 * operations, and objcg->nr_charged_bytes can't be big.
	 * The shared objcg->nr_charged_bytes can also become a
	 * performance bottleneck if all tasks of the same memcg are
	 * trying to update it. So it's better to ignore it and try
	 * grab some new pages. The stock's nr_bytes will be flushed to
	 * objcg->nr_charged_bytes later on when objcg changes.
	 *
	 * The stock's nr_bytes may contain enough pre-charged bytes
	 * to allow one less page from being charged, but we can't rely
	 * on the pre-charged bytes not being changed outside of
	 * consume_obj_stock() or refill_obj_stock(). So ignore those
	 * pre-charged bytes as well when charging pages. To avoid a
	 * page uncharge right after a page charge, we set the
	 * allow_uncharge flag to false when calling refill_obj_stock()
	 * to temporarily allow the pre-charged bytes to exceed the page
	 * size limit. The maximum reachable value of the pre-charged
	 * bytes is (sizeof(object) + PAGE_SIZE - 2) if there is no data
	 * race.
	 */
	nr_pages = size >> PAGE_SHIFT;
	nr_bytes = size & (PAGE_SIZE - 1);

	if (nr_bytes)
		nr_pages += 1;

	ret = obj_cgroup_charge_pages(objcg, gfp, nr_pages);
	if (!ret && nr_bytes)
		refill_obj_stock(objcg, PAGE_SIZE - nr_bytes, false);

	return ret;
}
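/*
 * Illustrative sketch (not part of this file): how obj_cgroup_charge() splits
 * a byte-sized request. For example, a 5000-byte charge on a system with 4K
 * pages charges two full pages and refills the 904-byte remainder's
 * complement into the per-cpu byte stock.
 */
#if 0
static void example_byte_split(size_t size)
{
	unsigned int nr_pages = size >> PAGE_SHIFT;	/* 5000 >> 12 == 1        */
	unsigned int nr_bytes = size & (PAGE_SIZE - 1);	/* 5000 & 4095 == 904     */

	if (nr_bytes)
		nr_pages += 1;				/* round up: charge 2 pages */

	/*
	 * Conceptually: obj_cgroup_charge_pages(objcg, gfp, nr_pages) and then
	 * refill_obj_stock(objcg, PAGE_SIZE - nr_bytes, false) banks the
	 * 3192 unused bytes of the second page for later sub-page charges.
	 */
	(void)nr_pages;
}
#endif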
void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size)
{
	refill_obj_stock(objcg, size, true);
}

static inline size_t obj_full_size(struct kmem_cache *s)
{
	/*
	 * For each accounted object there is an extra space which is used
	 * to store obj_cgroup membership. Charge it too.
	 */
	return s->size + sizeof(struct obj_cgroup *);
}
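/*
 * Illustrative sketch (not part of this file): the accounted footprint of a
 * slab object includes the obj_cgroup pointer used for membership tracking,
 * so on a 64-bit kernel a 64-byte object is charged as 72 bytes.
 */
#if 0
static size_t example_accounted_size(size_t object_size)
{
	return object_size + sizeof(struct obj_cgroup *);	/* 64 + 8 == 72 */
}
#endif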
2958 bool __memcg_slab_post_alloc_hook(struct kmem_cache
*s
, struct list_lru
*lru
,
2959 gfp_t flags
, size_t size
, void **p
)
2961 struct obj_cgroup
*objcg
;
2967 * The obtained objcg pointer is safe to use within the current scope,
2968 * defined by current task or set_active_memcg() pair.
2969 * obj_cgroup_get() is used to get a permanent reference.
2971 objcg
= current_obj_cgroup();
2976 * slab_alloc_node() avoids the NULL check, so we might be called with a
2977 * single NULL object. kmem_cache_alloc_bulk() aborts if it can't fill
2978 * the whole requested size.
2979 * return success as there's nothing to free back
2981 if (unlikely(*p
== NULL
))
2984 flags
&= gfp_allowed_mask
;
2988 struct mem_cgroup
*memcg
;
2990 memcg
= get_mem_cgroup_from_objcg(objcg
);
2991 ret
= memcg_list_lru_alloc(memcg
, lru
, flags
);
2992 css_put(&memcg
->css
);
2998 if (obj_cgroup_charge(objcg
, flags
, size
* obj_full_size(s
)))
3001 for (i
= 0; i
< size
; i
++) {
3002 slab
= virt_to_slab(p
[i
]);
3004 if (!slab_obj_exts(slab
) &&
3005 alloc_slab_obj_exts(slab
, s
, flags
, false)) {
3006 obj_cgroup_uncharge(objcg
, obj_full_size(s
));
3010 off
= obj_to_index(s
, slab
, p
[i
]);
3011 obj_cgroup_get(objcg
);
3012 slab_obj_exts(slab
)[off
].objcg
= objcg
;
3013 mod_objcg_state(objcg
, slab_pgdat(slab
),
3014 cache_vmstat_idx(s
), obj_full_size(s
));
3020 void __memcg_slab_free_hook(struct kmem_cache
*s
, struct slab
*slab
,
3021 void **p
, int objects
, struct slabobj_ext
*obj_exts
)
3023 for (int i
= 0; i
< objects
; i
++) {
3024 struct obj_cgroup
*objcg
;
3027 off
= obj_to_index(s
, slab
, p
[i
]);
3028 objcg
= obj_exts
[off
].objcg
;
3032 obj_exts
[off
].objcg
= NULL
;
3033 obj_cgroup_uncharge(objcg
, obj_full_size(s
));
3034 mod_objcg_state(objcg
, slab_pgdat(slab
), cache_vmstat_idx(s
),
3036 obj_cgroup_put(objcg
);
	}
}

/*
 * Because folio_memcg(head) is not set on tails, set it now.
 */
void split_page_memcg(struct page *head, int old_order, int new_order)
{
	struct folio *folio = page_folio(head);
	int i;
	unsigned int old_nr = 1 << old_order;
	unsigned int new_nr = 1 << new_order;

	if (mem_cgroup_disabled() || !folio_memcg_charged(folio))
		return;

	for (i = new_nr; i < old_nr; i += new_nr)
		folio_page(folio, i)->memcg_data = folio->memcg_data;

	if (folio_memcg_kmem(folio))
		obj_cgroup_get_many(__folio_objcg(folio), old_nr / new_nr - 1);
	else
		css_get_many(&folio_memcg(folio)->css, old_nr / new_nr - 1);
}
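/*
 * Illustrative sketch (not part of this file): the reference bookkeeping done
 * by split_page_memcg() above. Splitting an order-9 folio into order-0 pages
 * yields 512 independent pieces, so 511 additional objcg or css references
 * are taken to cover the newly independent tail pages.
 */
#if 0
static unsigned int example_split_refs(int old_order, int new_order)
{
	unsigned int old_nr = 1 << old_order;	/* 512 for order-9 */
	unsigned int new_nr = 1 << new_order;	/* 1 for order-0   */

	return old_nr / new_nr - 1;		/* 511 extra references */
}
#endif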
3062 unsigned long mem_cgroup_usage(struct mem_cgroup
*memcg
, bool swap
)
3066 if (mem_cgroup_is_root(memcg
)) {
3068 * Approximate root's usage from global state. This isn't
3069 * perfect, but the root usage was always an approximation.
3071 val
= global_node_page_state(NR_FILE_PAGES
) +
3072 global_node_page_state(NR_ANON_MAPPED
);
3074 val
+= total_swap_pages
- get_nr_swap_pages();
3077 val
= page_counter_read(&memcg
->memory
);
3079 val
= page_counter_read(&memcg
->memsw
);
3084 static int memcg_online_kmem(struct mem_cgroup
*memcg
)
3086 struct obj_cgroup
*objcg
;
3088 if (mem_cgroup_kmem_disabled())
3091 if (unlikely(mem_cgroup_is_root(memcg
)))
3094 objcg
= obj_cgroup_alloc();
3098 objcg
->memcg
= memcg
;
3099 rcu_assign_pointer(memcg
->objcg
, objcg
);
3100 obj_cgroup_get(objcg
);
3101 memcg
->orig_objcg
= objcg
;
3103 static_branch_enable(&memcg_kmem_online_key
);
3105 memcg
->kmemcg_id
= memcg
->id
.id
;
3110 static void memcg_offline_kmem(struct mem_cgroup
*memcg
)
3112 struct mem_cgroup
*parent
;
3114 if (mem_cgroup_kmem_disabled())
3117 if (unlikely(mem_cgroup_is_root(memcg
)))
3120 parent
= parent_mem_cgroup(memcg
);
3122 parent
= root_mem_cgroup
;
3124 memcg_reparent_list_lrus(memcg
, parent
);
3127 * Objcg's reparenting must be after list_lru's, make sure list_lru
3128 * helpers won't use parent's list_lru until child is drained.
3130 memcg_reparent_objcgs(memcg
, parent
);
3133 #ifdef CONFIG_CGROUP_WRITEBACK
3135 #include <trace/events/writeback.h>
3137 static int memcg_wb_domain_init(struct mem_cgroup
*memcg
, gfp_t gfp
)
3139 return wb_domain_init(&memcg
->cgwb_domain
, gfp
);
3142 static void memcg_wb_domain_exit(struct mem_cgroup
*memcg
)
3144 wb_domain_exit(&memcg
->cgwb_domain
);
3147 static void memcg_wb_domain_size_changed(struct mem_cgroup
*memcg
)
3149 wb_domain_size_changed(&memcg
->cgwb_domain
);
3152 struct wb_domain
*mem_cgroup_wb_domain(struct bdi_writeback
*wb
)
3154 struct mem_cgroup
*memcg
= mem_cgroup_from_css(wb
->memcg_css
);
3156 if (!memcg
->css
.parent
)
3159 return &memcg
->cgwb_domain
;
}

/**
 * mem_cgroup_wb_stats - retrieve writeback related stats from its memcg
 * @wb: bdi_writeback in question
 * @pfilepages: out parameter for number of file pages
 * @pheadroom: out parameter for number of allocatable pages according to memcg
 * @pdirty: out parameter for number of dirty pages
 * @pwriteback: out parameter for number of pages under writeback
 *
 * Determine the numbers of file, headroom, dirty, and writeback pages in
 * @wb's memcg. File, dirty and writeback are self-explanatory. Headroom
 * is a bit more involved.
 *
 * A memcg's headroom is "min(max, high) - used". In the hierarchy, the
 * headroom is calculated as the lowest headroom of itself and the
 * ancestors. Note that this doesn't consider the actual amount of
 * available memory in the system. The caller should further cap
 * *@pheadroom accordingly.
 */
void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
			 unsigned long *pheadroom, unsigned long *pdirty,
			 unsigned long *pwriteback)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
	struct mem_cgroup *parent;

	mem_cgroup_flush_stats_ratelimited(memcg);

	*pdirty = memcg_page_state(memcg, NR_FILE_DIRTY);
	*pwriteback = memcg_page_state(memcg, NR_WRITEBACK);
	*pfilepages = memcg_page_state(memcg, NR_INACTIVE_FILE) +
			memcg_page_state(memcg, NR_ACTIVE_FILE);

	*pheadroom = PAGE_COUNTER_MAX;
	while ((parent = parent_mem_cgroup(memcg))) {
		unsigned long ceiling = min(READ_ONCE(memcg->memory.max),
					    READ_ONCE(memcg->memory.high));
		unsigned long used = page_counter_read(&memcg->memory);

		*pheadroom = min(*pheadroom, ceiling - min(ceiling, used));
		memcg = parent;
	}
}
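/*
 * Illustrative sketch (not part of this file): the headroom computed above.
 * Each level contributes min(max, high) - used, and the writeback domain
 * sees the tightest value along the ancestry. The page counts below are
 * made-up 4K-page figures.
 */
#if 0
static unsigned long example_wb_headroom(void)
{
	unsigned long headroom = PAGE_COUNTER_MAX;

	/* child:  max = 1G, high = 512M, used = 384M -> 128M of headroom */
	headroom = min(headroom, min(262144ul, 131072ul) - 98304ul);
	/* parent: max = high = 2G, used = 1.5G       -> 512M of headroom */
	headroom = min(headroom, 524288ul - 393216ul);

	return headroom;	/* 32768 pages == 128M, the child is the limit */
}
#endif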
/*
 * Foreign dirty flushing
3208 * There's an inherent mismatch between memcg and writeback. The former
3209 * tracks ownership per-page while the latter per-inode. This was a
3210 * deliberate design decision because honoring per-page ownership in the
3211 * writeback path is complicated, may lead to higher CPU and IO overheads
3212 * and deemed unnecessary given that write-sharing an inode across
3213 * different cgroups isn't a common use-case.
3215 * Combined with inode majority-writer ownership switching, this works well
3216 * enough in most cases but there are some pathological cases. For
3217 * example, let's say there are two cgroups A and B which keep writing to
3218 * different but confined parts of the same inode. B owns the inode and
3219 * A's memory is limited far below B's. A's dirty ratio can rise enough to
3220 * trigger balance_dirty_pages() sleeps but B's can be low enough to avoid
3221 * triggering background writeback. A will be slowed down without a way to
3222 * make writeback of the dirty pages happen.
3224 * Conditions like the above can lead to a cgroup getting repeatedly and
3225 * severely throttled after making some progress after each
3226 * dirty_expire_interval while the underlying IO device is almost
3229 * Solving this problem completely requires matching the ownership tracking
3230 * granularities between memcg and writeback in either direction. However,
3231 * the more egregious behaviors can be avoided by simply remembering the
3232 * most recent foreign dirtying events and initiating remote flushes on
3233 * them when local writeback isn't enough to keep the memory clean enough.
3235 * The following two functions implement such mechanism. When a foreign
3236 * page - a page whose memcg and writeback ownerships don't match - is
3237 * dirtied, mem_cgroup_track_foreign_dirty() records the inode owning
3238 * bdi_writeback on the page owning memcg. When balance_dirty_pages()
3239 * decides that the memcg needs to sleep due to high dirty ratio, it calls
3240 * mem_cgroup_flush_foreign() which queues writeback on the recorded
3241 * foreign bdi_writebacks which haven't expired. Both the numbers of
3242 * recorded bdi_writebacks and concurrent in-flight foreign writebacks are
3243 * limited to MEMCG_CGWB_FRN_CNT.
3245 * The mechanism only remembers IDs and doesn't hold any object references.
3246 * As being wrong occasionally doesn't matter, updates and accesses to the
3247 * records are lockless and racy.
3249 void mem_cgroup_track_foreign_dirty_slowpath(struct folio
*folio
,
3250 struct bdi_writeback
*wb
)
3252 struct mem_cgroup
*memcg
= folio_memcg(folio
);
3253 struct memcg_cgwb_frn
*frn
;
3254 u64 now
= get_jiffies_64();
3255 u64 oldest_at
= now
;
3259 trace_track_foreign_dirty(folio
, wb
);
3262 * Pick the slot to use. If there is already a slot for @wb, keep
3263 * using it. If not replace the oldest one which isn't being
3266 for (i
= 0; i
< MEMCG_CGWB_FRN_CNT
; i
++) {
3267 frn
= &memcg
->cgwb_frn
[i
];
3268 if (frn
->bdi_id
== wb
->bdi
->id
&&
3269 frn
->memcg_id
== wb
->memcg_css
->id
)
3271 if (time_before64(frn
->at
, oldest_at
) &&
3272 atomic_read(&frn
->done
.cnt
) == 1) {
3274 oldest_at
= frn
->at
;
3278 if (i
< MEMCG_CGWB_FRN_CNT
) {
3280 * Re-using an existing one. Update timestamp lazily to
3281 * avoid making the cacheline hot. We want them to be
3282 * reasonably up-to-date and significantly shorter than
3283 * dirty_expire_interval as that's what expires the record.
3284 * Use the shorter of 1s and dirty_expire_interval / 8.
3286 unsigned long update_intv
=
3287 min_t(unsigned long, HZ
,
3288 msecs_to_jiffies(dirty_expire_interval
* 10) / 8);
3290 if (time_before64(frn
->at
, now
- update_intv
))
3292 } else if (oldest
>= 0) {
3293 /* replace the oldest free one */
3294 frn
= &memcg
->cgwb_frn
[oldest
];
3295 frn
->bdi_id
= wb
->bdi
->id
;
3296 frn
->memcg_id
= wb
->memcg_css
->id
;
3301 /* issue foreign writeback flushes for recorded foreign dirtying events */
3302 void mem_cgroup_flush_foreign(struct bdi_writeback
*wb
)
3304 struct mem_cgroup
*memcg
= mem_cgroup_from_css(wb
->memcg_css
);
3305 unsigned long intv
= msecs_to_jiffies(dirty_expire_interval
* 10);
3306 u64 now
= jiffies_64
;
3309 for (i
= 0; i
< MEMCG_CGWB_FRN_CNT
; i
++) {
3310 struct memcg_cgwb_frn
*frn
= &memcg
->cgwb_frn
[i
];
3313 * If the record is older than dirty_expire_interval,
3314 * writeback on it has already started. No need to kick it
3315 * off again. Also, don't start a new one if there's
3316 * already one in flight.
3318 if (time_after64(frn
->at
, now
- intv
) &&
3319 atomic_read(&frn
->done
.cnt
) == 1) {
3321 trace_flush_foreign(wb
, frn
->bdi_id
, frn
->memcg_id
);
3322 cgroup_writeback_by_id(frn
->bdi_id
, frn
->memcg_id
,
3323 WB_REASON_FOREIGN_FLUSH
,
3329 #else /* CONFIG_CGROUP_WRITEBACK */
3331 static int memcg_wb_domain_init(struct mem_cgroup
*memcg
, gfp_t gfp
)
3336 static void memcg_wb_domain_exit(struct mem_cgroup
*memcg
)
3340 static void memcg_wb_domain_size_changed(struct mem_cgroup
*memcg
)
3344 #endif /* CONFIG_CGROUP_WRITEBACK */
3347 * Private memory cgroup IDR
3349 * Swap-out records and page cache shadow entries need to store memcg
3350 * references in constrained space, so we maintain an ID space that is
3351 * limited to 16 bit (MEM_CGROUP_ID_MAX), limiting the total number of
3352 * memory-controlled cgroups to 64k.
3354 * However, there usually are many references to the offline CSS after
3355 * the cgroup has been destroyed, such as page cache or reclaimable
3356 * slab objects, that don't need to hang on to the ID. We want to keep
3357 * those dead CSS from occupying IDs, or we might quickly exhaust the
3358 * relatively small ID space and prevent the creation of new cgroups
3359 * even when there are much fewer than 64k cgroups - possibly none.
3361 * Maintain a private 16-bit ID space for memcg, and allow the ID to
3362 * be freed and recycled when it's no longer needed, which is usually
3363 * when the CSS is offlined.
3365 * The only exception to that are records of swapped out tmpfs/shmem
3366 * pages that need to be attributed to live ancestors on swapin. But
3367 * those references are manageable from userspace.
3370 #define MEM_CGROUP_ID_MAX ((1UL << MEM_CGROUP_ID_SHIFT) - 1)
3371 static DEFINE_XARRAY_ALLOC1(mem_cgroup_ids
);
3373 static void mem_cgroup_id_remove(struct mem_cgroup
*memcg
)
3375 if (memcg
->id
.id
> 0) {
3376 xa_erase(&mem_cgroup_ids
, memcg
->id
.id
);
3381 void __maybe_unused
mem_cgroup_id_get_many(struct mem_cgroup
*memcg
,
3384 refcount_add(n
, &memcg
->id
.ref
);
3387 void mem_cgroup_id_put_many(struct mem_cgroup
*memcg
, unsigned int n
)
3389 if (refcount_sub_and_test(n
, &memcg
->id
.ref
)) {
3390 mem_cgroup_id_remove(memcg
);
3392 /* Memcg ID pins CSS */
3393 css_put(&memcg
->css
);
3397 static inline void mem_cgroup_id_put(struct mem_cgroup
*memcg
)
3399 mem_cgroup_id_put_many(memcg
, 1);
3403 * mem_cgroup_from_id - look up a memcg from a memcg id
3404 * @id: the memcg id to look up
3406 * Caller must hold rcu_read_lock().
3408 struct mem_cgroup
*mem_cgroup_from_id(unsigned short id
)
3410 WARN_ON_ONCE(!rcu_read_lock_held());
3411 return xa_load(&mem_cgroup_ids
, id
);
3414 #ifdef CONFIG_SHRINKER_DEBUG
3415 struct mem_cgroup
*mem_cgroup_get_from_ino(unsigned long ino
)
3417 struct cgroup
*cgrp
;
3418 struct cgroup_subsys_state
*css
;
3419 struct mem_cgroup
*memcg
;
3421 cgrp
= cgroup_get_from_id(ino
);
3423 return ERR_CAST(cgrp
);
3425 css
= cgroup_get_e_css(cgrp
, &memory_cgrp_subsys
);
3427 memcg
= container_of(css
, struct mem_cgroup
, css
);
3429 memcg
= ERR_PTR(-ENOENT
);
3437 static bool alloc_mem_cgroup_per_node_info(struct mem_cgroup
*memcg
, int node
)
3439 struct mem_cgroup_per_node
*pn
;
3441 pn
= kzalloc_node(sizeof(*pn
), GFP_KERNEL
, node
);
3445 pn
->lruvec_stats
= kzalloc_node(sizeof(struct lruvec_stats
),
3446 GFP_KERNEL_ACCOUNT
, node
);
3447 if (!pn
->lruvec_stats
)
3450 pn
->lruvec_stats_percpu
= alloc_percpu_gfp(struct lruvec_stats_percpu
,
3451 GFP_KERNEL_ACCOUNT
);
3452 if (!pn
->lruvec_stats_percpu
)
3455 lruvec_init(&pn
->lruvec
);
3458 memcg
->nodeinfo
[node
] = pn
;
3461 kfree(pn
->lruvec_stats
);
3466 static void free_mem_cgroup_per_node_info(struct mem_cgroup
*memcg
, int node
)
3468 struct mem_cgroup_per_node
*pn
= memcg
->nodeinfo
[node
];
3473 free_percpu(pn
->lruvec_stats_percpu
);
3474 kfree(pn
->lruvec_stats
);
3478 static void __mem_cgroup_free(struct mem_cgroup
*memcg
)
3482 obj_cgroup_put(memcg
->orig_objcg
);
3485 free_mem_cgroup_per_node_info(memcg
, node
);
3486 memcg1_free_events(memcg
);
3487 kfree(memcg
->vmstats
);
3488 free_percpu(memcg
->vmstats_percpu
);
3492 static void mem_cgroup_free(struct mem_cgroup
*memcg
)
3494 lru_gen_exit_memcg(memcg
);
3495 memcg_wb_domain_exit(memcg
);
3496 __mem_cgroup_free(memcg
);
3499 static struct mem_cgroup
*mem_cgroup_alloc(struct mem_cgroup
*parent
)
3501 struct memcg_vmstats_percpu
*statc
, *pstatc
;
3502 struct mem_cgroup
*memcg
;
3504 int __maybe_unused i
;
3507 memcg
= kzalloc(struct_size(memcg
, nodeinfo
, nr_node_ids
), GFP_KERNEL
);
3509 return ERR_PTR(-ENOMEM
);
3511 error
= xa_alloc(&mem_cgroup_ids
, &memcg
->id
.id
, NULL
,
3512 XA_LIMIT(1, MEM_CGROUP_ID_MAX
), GFP_KERNEL
);
3517 memcg
->vmstats
= kzalloc(sizeof(struct memcg_vmstats
),
3518 GFP_KERNEL_ACCOUNT
);
3519 if (!memcg
->vmstats
)
3522 memcg
->vmstats_percpu
= alloc_percpu_gfp(struct memcg_vmstats_percpu
,
3523 GFP_KERNEL_ACCOUNT
);
3524 if (!memcg
->vmstats_percpu
)
3527 if (!memcg1_alloc_events(memcg
))
3530 for_each_possible_cpu(cpu
) {
3532 pstatc
= per_cpu_ptr(parent
->vmstats_percpu
, cpu
);
3533 statc
= per_cpu_ptr(memcg
->vmstats_percpu
, cpu
);
3534 statc
->parent
= parent
? pstatc
: NULL
;
3535 statc
->vmstats
= memcg
->vmstats
;
3539 if (!alloc_mem_cgroup_per_node_info(memcg
, node
))
3542 if (memcg_wb_domain_init(memcg
, GFP_KERNEL
))
3545 INIT_WORK(&memcg
->high_work
, high_work_func
);
3546 vmpressure_init(&memcg
->vmpressure
);
3547 INIT_LIST_HEAD(&memcg
->memory_peaks
);
3548 INIT_LIST_HEAD(&memcg
->swap_peaks
);
3549 spin_lock_init(&memcg
->peaks_lock
);
3550 memcg
->socket_pressure
= jiffies
;
3551 memcg1_memcg_init(memcg
);
3552 memcg
->kmemcg_id
= -1;
3553 INIT_LIST_HEAD(&memcg
->objcg_list
);
3554 #ifdef CONFIG_CGROUP_WRITEBACK
3555 INIT_LIST_HEAD(&memcg
->cgwb_list
);
3556 for (i
= 0; i
< MEMCG_CGWB_FRN_CNT
; i
++)
3557 memcg
->cgwb_frn
[i
].done
=
3558 __WB_COMPLETION_INIT(&memcg_cgwb_frn_waitq
);
3560 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
3561 spin_lock_init(&memcg
->deferred_split_queue
.split_queue_lock
);
3562 INIT_LIST_HEAD(&memcg
->deferred_split_queue
.split_queue
);
3563 memcg
->deferred_split_queue
.split_queue_len
= 0;
3565 lru_gen_init_memcg(memcg
);
3568 mem_cgroup_id_remove(memcg
);
3569 __mem_cgroup_free(memcg
);
3570 return ERR_PTR(error
);
3573 static struct cgroup_subsys_state
* __ref
3574 mem_cgroup_css_alloc(struct cgroup_subsys_state
*parent_css
)
3576 struct mem_cgroup
*parent
= mem_cgroup_from_css(parent_css
);
3577 struct mem_cgroup
*memcg
, *old_memcg
;
3579 old_memcg
= set_active_memcg(parent
);
3580 memcg
= mem_cgroup_alloc(parent
);
3581 set_active_memcg(old_memcg
);
3583 return ERR_CAST(memcg
);
3585 page_counter_set_high(&memcg
->memory
, PAGE_COUNTER_MAX
);
3586 memcg1_soft_limit_reset(memcg
);
3588 memcg
->zswap_max
= PAGE_COUNTER_MAX
;
3589 WRITE_ONCE(memcg
->zswap_writeback
, true);
3591 page_counter_set_high(&memcg
->swap
, PAGE_COUNTER_MAX
);
3593 WRITE_ONCE(memcg
->swappiness
, mem_cgroup_swappiness(parent
));
3595 page_counter_init(&memcg
->memory
, &parent
->memory
, true);
3596 page_counter_init(&memcg
->swap
, &parent
->swap
, false);
3597 #ifdef CONFIG_MEMCG_V1
3598 WRITE_ONCE(memcg
->oom_kill_disable
, READ_ONCE(parent
->oom_kill_disable
));
3599 page_counter_init(&memcg
->kmem
, &parent
->kmem
, false);
3600 page_counter_init(&memcg
->tcpmem
, &parent
->tcpmem
, false);
3604 init_memcg_events();
3605 page_counter_init(&memcg
->memory
, NULL
, true);
3606 page_counter_init(&memcg
->swap
, NULL
, false);
3607 #ifdef CONFIG_MEMCG_V1
3608 page_counter_init(&memcg
->kmem
, NULL
, false);
3609 page_counter_init(&memcg
->tcpmem
, NULL
, false);
3611 root_mem_cgroup
= memcg
;
3615 if (cgroup_subsys_on_dfl(memory_cgrp_subsys
) && !cgroup_memory_nosocket
)
3616 static_branch_inc(&memcg_sockets_enabled_key
);
3618 if (!cgroup_memory_nobpf
)
3619 static_branch_inc(&memcg_bpf_enabled_key
);
3624 static int mem_cgroup_css_online(struct cgroup_subsys_state
*css
)
3626 struct mem_cgroup
*memcg
= mem_cgroup_from_css(css
);
3628 if (memcg_online_kmem(memcg
))
3632 * A memcg must be visible for expand_shrinker_info()
3633 * by the time the maps are allocated. So, we allocate maps
3634 * here, when for_each_mem_cgroup() can't skip it.
3636 if (alloc_shrinker_info(memcg
))
3639 if (unlikely(mem_cgroup_is_root(memcg
)) && !mem_cgroup_disabled())
3640 queue_delayed_work(system_unbound_wq
, &stats_flush_dwork
,
3642 lru_gen_online_memcg(memcg
);
3644 /* Online state pins memcg ID, memcg ID pins CSS */
3645 refcount_set(&memcg
->id
.ref
, 1);
3649 * Ensure mem_cgroup_from_id() works once we're fully online.
3651 * We could do this earlier and require callers to filter with
3652 * css_tryget_online(). But right now there are no users that
3653 * need earlier access, and the workingset code relies on the
3654 * cgroup tree linkage (mem_cgroup_get_nr_swap_pages()). So
3655 * publish it here at the end of onlining. This matches the
3656 * regular ID destruction during offlining.
3658 xa_store(&mem_cgroup_ids
, memcg
->id
.id
, memcg
, GFP_KERNEL
);
3662 memcg_offline_kmem(memcg
);
3664 mem_cgroup_id_remove(memcg
);
3668 static void mem_cgroup_css_offline(struct cgroup_subsys_state
*css
)
3670 struct mem_cgroup
*memcg
= mem_cgroup_from_css(css
);
3672 memcg1_css_offline(memcg
);
3674 page_counter_set_min(&memcg
->memory
, 0);
3675 page_counter_set_low(&memcg
->memory
, 0);
3677 zswap_memcg_offline_cleanup(memcg
);
3679 memcg_offline_kmem(memcg
);
3680 reparent_shrinker_deferred(memcg
);
3681 wb_memcg_offline(memcg
);
3682 lru_gen_offline_memcg(memcg
);
3684 drain_all_stock(memcg
);
3686 mem_cgroup_id_put(memcg
);
3689 static void mem_cgroup_css_released(struct cgroup_subsys_state
*css
)
3691 struct mem_cgroup
*memcg
= mem_cgroup_from_css(css
);
3693 invalidate_reclaim_iterators(memcg
);
3694 lru_gen_release_memcg(memcg
);
3697 static void mem_cgroup_css_free(struct cgroup_subsys_state
*css
)
3699 struct mem_cgroup
*memcg
= mem_cgroup_from_css(css
);
3700 int __maybe_unused i
;
3702 #ifdef CONFIG_CGROUP_WRITEBACK
3703 for (i
= 0; i
< MEMCG_CGWB_FRN_CNT
; i
++)
3704 wb_wait_for_completion(&memcg
->cgwb_frn
[i
].done
);
3706 if (cgroup_subsys_on_dfl(memory_cgrp_subsys
) && !cgroup_memory_nosocket
)
3707 static_branch_dec(&memcg_sockets_enabled_key
);
3709 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys
) && memcg1_tcpmem_active(memcg
))
3710 static_branch_dec(&memcg_sockets_enabled_key
);
3712 if (!cgroup_memory_nobpf
)
3713 static_branch_dec(&memcg_bpf_enabled_key
);
3715 vmpressure_cleanup(&memcg
->vmpressure
);
3716 cancel_work_sync(&memcg
->high_work
);
3717 memcg1_remove_from_trees(memcg
);
3718 free_shrinker_info(memcg
);
3719 mem_cgroup_free(memcg
);
3723 * mem_cgroup_css_reset - reset the states of a mem_cgroup
3724 * @css: the target css
3726 * Reset the states of the mem_cgroup associated with @css. This is
3727 * invoked when the userland requests disabling on the default hierarchy
3728 * but the memcg is pinned through dependency. The memcg should stop
3729 * applying policies and should revert to the vanilla state as it may be
3730 * made visible again.
3732 * The current implementation only resets the essential configurations.
3733 * This needs to be expanded to cover all the visible parts.
3735 static void mem_cgroup_css_reset(struct cgroup_subsys_state
*css
)
3737 struct mem_cgroup
*memcg
= mem_cgroup_from_css(css
);
3739 page_counter_set_max(&memcg
->memory
, PAGE_COUNTER_MAX
);
3740 page_counter_set_max(&memcg
->swap
, PAGE_COUNTER_MAX
);
3741 #ifdef CONFIG_MEMCG_V1
3742 page_counter_set_max(&memcg
->kmem
, PAGE_COUNTER_MAX
);
3743 page_counter_set_max(&memcg
->tcpmem
, PAGE_COUNTER_MAX
);
3745 page_counter_set_min(&memcg
->memory
, 0);
3746 page_counter_set_low(&memcg
->memory
, 0);
3747 page_counter_set_high(&memcg
->memory
, PAGE_COUNTER_MAX
);
3748 memcg1_soft_limit_reset(memcg
);
3749 page_counter_set_high(&memcg
->swap
, PAGE_COUNTER_MAX
);
3750 memcg_wb_domain_size_changed(memcg
);
3753 struct aggregate_control
{
3754 /* pointer to the aggregated (CPU and subtree aggregated) counters */
3756 /* pointer to the non-hierarchichal (CPU aggregated) counters */
3758 /* pointer to the pending child counters during tree propagation */
3760 /* pointer to the parent's pending counters, could be NULL */
3762 /* pointer to the percpu counters to be aggregated */
3764 /* pointer to the percpu counters of the last aggregation*/
3766 /* size of the above counters */
3770 static void mem_cgroup_stat_aggregate(struct aggregate_control
*ac
)
3773 long delta
, delta_cpu
, v
;
3775 for (i
= 0; i
< ac
->size
; i
++) {
3777 * Collect the aggregated propagation counts of groups
3778 * below us. We're in a per-cpu loop here and this is
3779 * a global counter, so the first cycle will get them.
3781 delta
= ac
->pending
[i
];
3785 /* Add CPU changes on this level since the last flush */
3787 v
= READ_ONCE(ac
->cstat
[i
]);
3788 if (v
!= ac
->cstat_prev
[i
]) {
3789 delta_cpu
= v
- ac
->cstat_prev
[i
];
3791 ac
->cstat_prev
[i
] = v
;
3794 /* Aggregate counts on this level and propagate upwards */
3796 ac
->local
[i
] += delta_cpu
;
3799 ac
->aggregate
[i
] += delta
;
3801 ac
->ppending
[i
] += delta
;
3806 static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state
*css
, int cpu
)
3808 struct mem_cgroup
*memcg
= mem_cgroup_from_css(css
);
3809 struct mem_cgroup
*parent
= parent_mem_cgroup(memcg
);
3810 struct memcg_vmstats_percpu
*statc
;
3811 struct aggregate_control ac
;
3814 statc
= per_cpu_ptr(memcg
->vmstats_percpu
, cpu
);
3816 ac
= (struct aggregate_control
) {
3817 .aggregate
= memcg
->vmstats
->state
,
3818 .local
= memcg
->vmstats
->state_local
,
3819 .pending
= memcg
->vmstats
->state_pending
,
3820 .ppending
= parent
? parent
->vmstats
->state_pending
: NULL
,
3821 .cstat
= statc
->state
,
3822 .cstat_prev
= statc
->state_prev
,
3823 .size
= MEMCG_VMSTAT_SIZE
,
3825 mem_cgroup_stat_aggregate(&ac
);
3827 ac
= (struct aggregate_control
) {
3828 .aggregate
= memcg
->vmstats
->events
,
3829 .local
= memcg
->vmstats
->events_local
,
3830 .pending
= memcg
->vmstats
->events_pending
,
3831 .ppending
= parent
? parent
->vmstats
->events_pending
: NULL
,
3832 .cstat
= statc
->events
,
3833 .cstat_prev
= statc
->events_prev
,
3834 .size
= NR_MEMCG_EVENTS
,
3836 mem_cgroup_stat_aggregate(&ac
);
3838 for_each_node_state(nid
, N_MEMORY
) {
3839 struct mem_cgroup_per_node
*pn
= memcg
->nodeinfo
[nid
];
3840 struct lruvec_stats
*lstats
= pn
->lruvec_stats
;
3841 struct lruvec_stats
*plstats
= NULL
;
3842 struct lruvec_stats_percpu
*lstatc
;
3845 plstats
= parent
->nodeinfo
[nid
]->lruvec_stats
;
3847 lstatc
= per_cpu_ptr(pn
->lruvec_stats_percpu
, cpu
);
3849 ac
= (struct aggregate_control
) {
3850 .aggregate
= lstats
->state
,
3851 .local
= lstats
->state_local
,
3852 .pending
= lstats
->state_pending
,
3853 .ppending
= plstats
? plstats
->state_pending
: NULL
,
3854 .cstat
= lstatc
->state
,
3855 .cstat_prev
= lstatc
->state_prev
,
3856 .size
= NR_MEMCG_NODE_STAT_ITEMS
,
3858 mem_cgroup_stat_aggregate(&ac
);
3861 WRITE_ONCE(statc
->stats_updates
, 0);
3862 /* We are in a per-cpu loop here, only do the atomic write once */
3863 if (atomic64_read(&memcg
->vmstats
->stats_updates
))
3864 atomic64_set(&memcg
->vmstats
->stats_updates
, 0);
3867 static void mem_cgroup_fork(struct task_struct
*task
)
3870 * Set the update flag to cause task->objcg to be initialized lazily
3871 * on the first allocation. It can be done without any synchronization
3872 * because it's always performed on the current task, so does
3873 * current_objcg_update().
3875 task
->objcg
= (struct obj_cgroup
*)CURRENT_OBJCG_UPDATE_FLAG
;
3878 static void mem_cgroup_exit(struct task_struct
*task
)
3880 struct obj_cgroup
*objcg
= task
->objcg
;
3882 objcg
= (struct obj_cgroup
*)
3883 ((unsigned long)objcg
& ~CURRENT_OBJCG_UPDATE_FLAG
);
3884 obj_cgroup_put(objcg
);
3887 * Some kernel allocations can happen after this point,
3888 * but let's ignore them. It can be done without any synchronization
3889 * because it's always performed on the current task, so does
3890 * current_objcg_update().
3895 #ifdef CONFIG_LRU_GEN
3896 static void mem_cgroup_lru_gen_attach(struct cgroup_taskset
*tset
)
3898 struct task_struct
*task
;
3899 struct cgroup_subsys_state
*css
;
3901 /* find the first leader if there is any */
3902 cgroup_taskset_for_each_leader(task
, css
, tset
)
3909 if (task
->mm
&& READ_ONCE(task
->mm
->owner
) == task
)
3910 lru_gen_migrate_mm(task
->mm
);
3914 static void mem_cgroup_lru_gen_attach(struct cgroup_taskset
*tset
) {}
3915 #endif /* CONFIG_LRU_GEN */
3917 static void mem_cgroup_kmem_attach(struct cgroup_taskset
*tset
)
3919 struct task_struct
*task
;
3920 struct cgroup_subsys_state
*css
;
3922 cgroup_taskset_for_each(task
, css
, tset
) {
3923 /* atomically set the update bit */
3924 set_bit(CURRENT_OBJCG_UPDATE_BIT
, (unsigned long *)&task
->objcg
);
3928 static void mem_cgroup_attach(struct cgroup_taskset
*tset
)
3930 mem_cgroup_lru_gen_attach(tset
);
3931 mem_cgroup_kmem_attach(tset
);
3934 static int seq_puts_memcg_tunable(struct seq_file
*m
, unsigned long value
)
3936 if (value
== PAGE_COUNTER_MAX
)
3937 seq_puts(m
, "max\n");
3939 seq_printf(m
, "%llu\n", (u64
)value
* PAGE_SIZE
);
3944 static u64
memory_current_read(struct cgroup_subsys_state
*css
,
3947 struct mem_cgroup
*memcg
= mem_cgroup_from_css(css
);
3949 return (u64
)page_counter_read(&memcg
->memory
) * PAGE_SIZE
;
3952 #define OFP_PEAK_UNSET (((-1UL)))
3954 static int peak_show(struct seq_file
*sf
, void *v
, struct page_counter
*pc
)
3956 struct cgroup_of_peak
*ofp
= of_peak(sf
->private);
3957 u64 fd_peak
= READ_ONCE(ofp
->value
), peak
;
3959 /* User wants global or local peak? */
3960 if (fd_peak
== OFP_PEAK_UNSET
)
3961 peak
= pc
->watermark
;
3963 peak
= max(fd_peak
, READ_ONCE(pc
->local_watermark
));
3965 seq_printf(sf
, "%llu\n", peak
* PAGE_SIZE
);
3969 static int memory_peak_show(struct seq_file
*sf
, void *v
)
3971 struct mem_cgroup
*memcg
= mem_cgroup_from_css(seq_css(sf
));
3973 return peak_show(sf
, v
, &memcg
->memory
);
3976 static int peak_open(struct kernfs_open_file
*of
)
3978 struct cgroup_of_peak
*ofp
= of_peak(of
);
3980 ofp
->value
= OFP_PEAK_UNSET
;
3984 static void peak_release(struct kernfs_open_file
*of
)
3986 struct mem_cgroup
*memcg
= mem_cgroup_from_css(of_css(of
));
3987 struct cgroup_of_peak
*ofp
= of_peak(of
);
3989 if (ofp
->value
== OFP_PEAK_UNSET
) {
3990 /* fast path (no writes on this fd) */
3993 spin_lock(&memcg
->peaks_lock
);
3994 list_del(&ofp
->list
);
3995 spin_unlock(&memcg
->peaks_lock
);
}

static ssize_t peak_write(struct kernfs_open_file *of, char *buf, size_t nbytes,
			  loff_t off, struct page_counter *pc,
			  struct list_head *watchers)
{
	unsigned long usage;
	struct cgroup_of_peak *peer_ctx;
	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
	struct cgroup_of_peak *ofp = of_peak(of);

	spin_lock(&memcg->peaks_lock);

	usage = page_counter_read(pc);
	WRITE_ONCE(pc->local_watermark, usage);

	list_for_each_entry(peer_ctx, watchers, list)
		if (usage > peer_ctx->value)
			WRITE_ONCE(peer_ctx->value, usage);

	/* initial write, register watcher */
	if (ofp->value == -1)
		list_add(&ofp->list, watchers);

	WRITE_ONCE(ofp->value, usage);
	spin_unlock(&memcg->peaks_lock);

	return nbytes;
}

static ssize_t memory_peak_write(struct kernfs_open_file *of, char *buf,
				 size_t nbytes, loff_t off)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));

	return peak_write(of, buf, nbytes, off, &memcg->memory,
			  &memcg->memory_peaks);
}

#undef OFP_PEAK_UNSET
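/*
 * Illustrative userspace sketch (assumptions: a cgroup v2 mount at
 * /sys/fs/cgroup and a cgroup named "example"): per the peak_write() handler
 * above, writing any non-empty string to memory.peak resets the watermark
 * tracked for this file descriptor, so a later read on the same fd reports
 * the peak usage observed since the write rather than the all-time peak.
 */
#if 0
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[64];
	ssize_t n;
	int fd = open("/sys/fs/cgroup/example/memory.peak", O_RDWR);

	if (fd < 0)
		return 1;
	write(fd, "reset\n", 6);		/* any non-empty string resets */
	/* ... run the workload being measured ... */
	n = pread(fd, buf, sizeof(buf) - 1, 0);
	if (n > 0) {
		buf[n] = '\0';
		printf("peak since reset: %s", buf);
	}
	close(fd);
	return 0;
}
#endif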
4037 static int memory_min_show(struct seq_file
*m
, void *v
)
4039 return seq_puts_memcg_tunable(m
,
4040 READ_ONCE(mem_cgroup_from_seq(m
)->memory
.min
));
4043 static ssize_t
memory_min_write(struct kernfs_open_file
*of
,
4044 char *buf
, size_t nbytes
, loff_t off
)
4046 struct mem_cgroup
*memcg
= mem_cgroup_from_css(of_css(of
));
4050 buf
= strstrip(buf
);
4051 err
= page_counter_memparse(buf
, "max", &min
);
4055 page_counter_set_min(&memcg
->memory
, min
);
4060 static int memory_low_show(struct seq_file
*m
, void *v
)
4062 return seq_puts_memcg_tunable(m
,
4063 READ_ONCE(mem_cgroup_from_seq(m
)->memory
.low
));
4066 static ssize_t
memory_low_write(struct kernfs_open_file
*of
,
4067 char *buf
, size_t nbytes
, loff_t off
)
4069 struct mem_cgroup
*memcg
= mem_cgroup_from_css(of_css(of
));
4073 buf
= strstrip(buf
);
4074 err
= page_counter_memparse(buf
, "max", &low
);
4078 page_counter_set_low(&memcg
->memory
, low
);
4083 static int memory_high_show(struct seq_file
*m
, void *v
)
4085 return seq_puts_memcg_tunable(m
,
4086 READ_ONCE(mem_cgroup_from_seq(m
)->memory
.high
));
4089 static ssize_t
memory_high_write(struct kernfs_open_file
*of
,
4090 char *buf
, size_t nbytes
, loff_t off
)
4092 struct mem_cgroup
*memcg
= mem_cgroup_from_css(of_css(of
));
4093 unsigned int nr_retries
= MAX_RECLAIM_RETRIES
;
4094 bool drained
= false;
4098 buf
= strstrip(buf
);
4099 err
= page_counter_memparse(buf
, "max", &high
);
4103 page_counter_set_high(&memcg
->memory
, high
);
4106 unsigned long nr_pages
= page_counter_read(&memcg
->memory
);
4107 unsigned long reclaimed
;
4109 if (nr_pages
<= high
)
4112 if (signal_pending(current
))
4116 drain_all_stock(memcg
);
4121 reclaimed
= try_to_free_mem_cgroup_pages(memcg
, nr_pages
- high
,
4122 GFP_KERNEL
, MEMCG_RECLAIM_MAY_SWAP
, NULL
);
4124 if (!reclaimed
&& !nr_retries
--)
4128 memcg_wb_domain_size_changed(memcg
);
	return nbytes;
}

static int memory_max_show(struct seq_file *m, void *v)
{
	return seq_puts_memcg_tunable(m,
		READ_ONCE(mem_cgroup_from_seq(m)->memory.max));
}

static ssize_t memory_max_write(struct kernfs_open_file *of,
				char *buf, size_t nbytes, loff_t off)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
	unsigned int nr_reclaims = MAX_RECLAIM_RETRIES;
	bool drained = false;
	unsigned long max;
	int err;

	buf = strstrip(buf);
	err = page_counter_memparse(buf, "max", &max);
	if (err)
		return err;

	xchg(&memcg->memory.max, max);

	for (;;) {
		unsigned long nr_pages = page_counter_read(&memcg->memory);

		if (nr_pages <= max)
			break;

		if (signal_pending(current))
			break;

		if (!drained) {
			drain_all_stock(memcg);
			drained = true;
			continue;
		}

		if (nr_reclaims) {
			if (!try_to_free_mem_cgroup_pages(memcg, nr_pages - max,
					GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP, NULL))
				nr_reclaims--;
			continue;
		}

		memcg_memory_event(memcg, MEMCG_OOM);
		if (!mem_cgroup_out_of_memory(memcg, GFP_KERNEL, 0))
			break;
	}

	memcg_wb_domain_size_changed(memcg);
	return nbytes;
}
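/*
 * Illustrative userspace sketch (assumes a delegated cgroup at
 * /sys/fs/cgroup/example): setting a hard limit through memory.max. The
 * handler above then reclaims and, as a last resort, OOM-kills until the
 * cgroup fits under the new limit.
 */
#if 0
#include <fcntl.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char *limit = "536870912\n";	/* 512M in bytes; "max" removes the limit */
	int fd = open("/sys/fs/cgroup/example/memory.max", O_WRONLY);

	if (fd < 0)
		return 1;
	if (write(fd, limit, strlen(limit)) < 0)
		return 1;
	close(fd);
	return 0;
}
#endif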
/*
 * Note: don't forget to update the 'samples/cgroup/memcg_event_listener'
 * if any new events become available.
 */
4189 static void __memory_events_show(struct seq_file
*m
, atomic_long_t
*events
)
4191 seq_printf(m
, "low %lu\n", atomic_long_read(&events
[MEMCG_LOW
]));
4192 seq_printf(m
, "high %lu\n", atomic_long_read(&events
[MEMCG_HIGH
]));
4193 seq_printf(m
, "max %lu\n", atomic_long_read(&events
[MEMCG_MAX
]));
4194 seq_printf(m
, "oom %lu\n", atomic_long_read(&events
[MEMCG_OOM
]));
4195 seq_printf(m
, "oom_kill %lu\n",
4196 atomic_long_read(&events
[MEMCG_OOM_KILL
]));
4197 seq_printf(m
, "oom_group_kill %lu\n",
4198 atomic_long_read(&events
[MEMCG_OOM_GROUP_KILL
]));
4201 static int memory_events_show(struct seq_file
*m
, void *v
)
4203 struct mem_cgroup
*memcg
= mem_cgroup_from_seq(m
);
4205 __memory_events_show(m
, memcg
->memory_events
);
4209 static int memory_events_local_show(struct seq_file
*m
, void *v
)
4211 struct mem_cgroup
*memcg
= mem_cgroup_from_seq(m
);
4213 __memory_events_show(m
, memcg
->memory_events_local
);
4217 int memory_stat_show(struct seq_file
*m
, void *v
)
4219 struct mem_cgroup
*memcg
= mem_cgroup_from_seq(m
);
4220 char *buf
= kmalloc(SEQ_BUF_SIZE
, GFP_KERNEL
);
4225 seq_buf_init(&s
, buf
, SEQ_BUF_SIZE
);
4226 memory_stat_format(memcg
, &s
);
4233 static inline unsigned long lruvec_page_state_output(struct lruvec
*lruvec
,
4236 return lruvec_page_state(lruvec
, item
) *
4237 memcg_page_state_output_unit(item
);
4240 static int memory_numa_stat_show(struct seq_file
*m
, void *v
)
4243 struct mem_cgroup
*memcg
= mem_cgroup_from_seq(m
);
4245 mem_cgroup_flush_stats(memcg
);
4247 for (i
= 0; i
< ARRAY_SIZE(memory_stats
); i
++) {
4250 if (memory_stats
[i
].idx
>= NR_VM_NODE_STAT_ITEMS
)
4253 seq_printf(m
, "%s", memory_stats
[i
].name
);
4254 for_each_node_state(nid
, N_MEMORY
) {
4256 struct lruvec
*lruvec
;
4258 lruvec
= mem_cgroup_lruvec(memcg
, NODE_DATA(nid
));
4259 size
= lruvec_page_state_output(lruvec
,
4260 memory_stats
[i
].idx
);
4261 seq_printf(m
, " N%d=%llu", nid
, size
);
4270 static int memory_oom_group_show(struct seq_file
*m
, void *v
)
4272 struct mem_cgroup
*memcg
= mem_cgroup_from_seq(m
);
4274 seq_printf(m
, "%d\n", READ_ONCE(memcg
->oom_group
));
4279 static ssize_t
memory_oom_group_write(struct kernfs_open_file
*of
,
4280 char *buf
, size_t nbytes
, loff_t off
)
4282 struct mem_cgroup
*memcg
= mem_cgroup_from_css(of_css(of
));
4285 buf
= strstrip(buf
);
4289 ret
= kstrtoint(buf
, 0, &oom_group
);
4293 if (oom_group
!= 0 && oom_group
!= 1)
4296 WRITE_ONCE(memcg
->oom_group
, oom_group
);
	return nbytes;
}

enum {
	MEMORY_RECLAIM_SWAPPINESS = 0,
	MEMORY_RECLAIM_NULL,
};

static const match_table_t tokens = {
	{ MEMORY_RECLAIM_SWAPPINESS, "swappiness=%d"},
	{ MEMORY_RECLAIM_NULL, NULL },
};

static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf,
			      size_t nbytes, loff_t off)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
	unsigned int nr_retries = MAX_RECLAIM_RETRIES;
	unsigned long nr_to_reclaim, nr_reclaimed = 0;
	int swappiness = -1;
	unsigned int reclaim_options;
	char *old_buf, *start;
	substring_t args[MAX_OPT_ARGS];

	buf = strstrip(buf);

	old_buf = buf;
	nr_to_reclaim = memparse(buf, &buf) / PAGE_SIZE;
	if (buf == old_buf)
		return -EINVAL;

	buf = strstrip(buf);

	while ((start = strsep(&buf, " ")) != NULL) {
		if (!strlen(start))
			continue;
		switch (match_token(start, tokens, args)) {
		case MEMORY_RECLAIM_SWAPPINESS:
			if (match_int(&args[0], &swappiness))
				return -EINVAL;
			if (swappiness < MIN_SWAPPINESS || swappiness > MAX_SWAPPINESS)
				return -EINVAL;
			break;
		default:
			return -EINVAL;
		}
	}

	reclaim_options = MEMCG_RECLAIM_MAY_SWAP | MEMCG_RECLAIM_PROACTIVE;
	while (nr_reclaimed < nr_to_reclaim) {
		/* Will converge on zero, but reclaim enforces a minimum */
		unsigned long batch_size = (nr_to_reclaim - nr_reclaimed) / 4;
		unsigned long reclaimed;

		if (signal_pending(current))
			return -EINTR;

		/*
		 * This is the final attempt, drain percpu lru caches in the
		 * hope of introducing more evictable pages for
		 * try_to_free_mem_cgroup_pages().
		 */
		if (!nr_retries)
			lru_add_drain_all();

		reclaimed = try_to_free_mem_cgroup_pages(memcg,
					batch_size, GFP_KERNEL,
					reclaim_options,
					swappiness == -1 ? NULL : &swappiness);

		if (!reclaimed && !nr_retries--)
			return -EAGAIN;

		nr_reclaimed += reclaimed;
	}

	return nbytes;
}
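/*
 * Illustrative userspace sketch (assumes a cgroup at /sys/fs/cgroup/example):
 * proactive reclaim through memory.reclaim as parsed above. The size accepts
 * K/M/G suffixes via memparse(), and the optional swappiness= argument biases
 * the anon/file balance for this request only.
 */
#if 0
#include <fcntl.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char *req = "256M swappiness=0\n";	/* strongly prefer file pages */
	int fd = open("/sys/fs/cgroup/example/memory.reclaim", O_WRONLY);

	if (fd < 0)
		return 1;
	/* the write fails (EAGAIN) if the kernel cannot reclaim the full amount */
	if (write(fd, req, strlen(req)) < 0)
		return 1;
	close(fd);
	return 0;
}
#endif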
static struct cftype memory_files[] = {
	{
		.name = "current",
		.flags = CFTYPE_NOT_ON_ROOT,
		.read_u64 = memory_current_read,
	},
	{
		.name = "peak",
		.flags = CFTYPE_NOT_ON_ROOT,
		.open = peak_open,
		.release = peak_release,
		.seq_show = memory_peak_show,
		.write = memory_peak_write,
	},
	{
		.name = "min",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = memory_min_show,
		.write = memory_min_write,
	},
	{
		.name = "low",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = memory_low_show,
		.write = memory_low_write,
	},
	{
		.name = "high",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = memory_high_show,
		.write = memory_high_write,
	},
	{
		.name = "max",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = memory_max_show,
		.write = memory_max_write,
	},
	{
		.name = "events",
		.flags = CFTYPE_NOT_ON_ROOT,
		.file_offset = offsetof(struct mem_cgroup, events_file),
		.seq_show = memory_events_show,
	},
	{
		.name = "events.local",
		.flags = CFTYPE_NOT_ON_ROOT,
		.file_offset = offsetof(struct mem_cgroup, events_local_file),
		.seq_show = memory_events_local_show,
	},
	{
		.name = "stat",
		.seq_show = memory_stat_show,
	},
	{
		.name = "numa_stat",
		.seq_show = memory_numa_stat_show,
	},
	{
		.name = "oom.group",
		.flags = CFTYPE_NOT_ON_ROOT | CFTYPE_NS_DELEGATABLE,
		.seq_show = memory_oom_group_show,
		.write = memory_oom_group_write,
	},
	{
		.name = "reclaim",
		.flags = CFTYPE_NS_DELEGATABLE,
		.write = memory_reclaim,
	},
	{ } /* terminate */
};
struct cgroup_subsys memory_cgrp_subsys = {
	.css_alloc = mem_cgroup_css_alloc,
	.css_online = mem_cgroup_css_online,
	.css_offline = mem_cgroup_css_offline,
	.css_released = mem_cgroup_css_released,
	.css_free = mem_cgroup_css_free,
	.css_reset = mem_cgroup_css_reset,
	.css_rstat_flush = mem_cgroup_css_rstat_flush,
	.attach = mem_cgroup_attach,
	.fork = mem_cgroup_fork,
	.exit = mem_cgroup_exit,
	.dfl_cftypes = memory_files,
#ifdef CONFIG_MEMCG_V1
	.legacy_cftypes = mem_cgroup_legacy_files,
#endif
	.early_init = 0,
};
/**
 * mem_cgroup_calculate_protection - check if memory consumption is in the normal range
 * @root: the top ancestor of the sub-tree being checked
 * @memcg: the memory cgroup to check
 *
 * WARNING: This function is not stateless! It can only be used as part
 *          of a top-down tree iteration, not for isolated queries.
 */
void mem_cgroup_calculate_protection(struct mem_cgroup *root,
				     struct mem_cgroup *memcg)
{
	bool recursive_protection =
		cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT;

	if (mem_cgroup_disabled())
		return;

	if (!root)
		root = root_mem_cgroup;

	page_counter_calculate_protection(&root->memory, &memcg->memory, recursive_protection);
}
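/*
 * Illustrative sketch of the intended top-down usage (simplified, not the
 * actual reclaim code; mem_cgroup_iter() and mem_cgroup_below_min() are the
 * assumed companion APIs):
 *
 *	struct mem_cgroup *memcg = NULL;
 *
 *	while ((memcg = mem_cgroup_iter(root, memcg, NULL))) {
 *		mem_cgroup_calculate_protection(root, memcg);
 *		if (mem_cgroup_below_min(root, memcg))
 *			continue;	// hard-protected, skip
 *		...reclaim from memcg...
 *	}
 */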
static int charge_memcg(struct folio *folio, struct mem_cgroup *memcg,
			gfp_t gfp)
{
	int ret;

	ret = try_charge(memcg, gfp, folio_nr_pages(folio));
	if (ret)
		goto out;

	mem_cgroup_commit_charge(folio, memcg);
out:
	return ret;
}

int __mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp)
{
	struct mem_cgroup *memcg;
	int ret;

	memcg = get_mem_cgroup_from_mm(mm);
	ret = charge_memcg(folio, memcg, gfp);
	css_put(&memcg->css);

	return ret;
}
/**
 * mem_cgroup_hugetlb_try_charge - try to charge the memcg for a hugetlb folio
 * @memcg: memcg to charge.
 * @gfp: reclaim mode.
 * @nr_pages: number of pages to charge.
 *
 * This function is called when allocating a huge page folio to determine if
 * the memcg has the capacity for it. It does not commit the charge yet,
 * as the hugetlb folio itself has not been obtained from the hugetlb pool.
 *
 * Once we have obtained the hugetlb folio, we can call
 * mem_cgroup_commit_charge() to commit the charge. If we fail to obtain the
 * folio, we should instead call mem_cgroup_cancel_charge() to undo the effect
 * of try_charge().
 *
 * Returns 0 on success. Otherwise, an error code is returned.
 */
int mem_cgroup_hugetlb_try_charge(struct mem_cgroup *memcg, gfp_t gfp,
			long nr_pages)
{
	/*
	 * If hugetlb memcg charging is not enabled, do not fail hugetlb allocation,
	 * but do not attempt to commit charge later (or cancel on error) either.
	 */
	if (mem_cgroup_disabled() || !memcg ||
		!cgroup_subsys_on_dfl(memory_cgrp_subsys) ||
		!(cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING))
		return -EOPNOTSUPP;

	if (try_charge(memcg, gfp, nr_pages))
		return -ENOMEM;

	return 0;
}
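/*
 * Illustrative caller sketch (hypothetical and simplified, not the actual
 * hugetlb code; dequeue_folio() stands in for however the folio is obtained):
 *
 *	ret = mem_cgroup_hugetlb_try_charge(memcg, GFP_KERNEL, nr_pages);
 *	if (ret == -ENOMEM)
 *		return NULL;			// over the limit, fail
 *	folio = dequeue_folio();		// hypothetical helper
 *	if (ret)				// -EOPNOTSUPP: not accounted
 *		return folio;
 *	if (folio)
 *		mem_cgroup_commit_charge(folio, memcg);
 *	else
 *		mem_cgroup_cancel_charge(memcg, nr_pages);
 */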
/**
 * mem_cgroup_swapin_charge_folio - Charge a newly allocated folio for swapin.
 * @folio: folio to charge.
 * @mm: mm context of the victim
 * @gfp: reclaim mode
 * @entry: swap entry for which the folio is allocated
 *
 * This function charges a folio allocated for swapin. Please call this before
 * adding the folio to the swapcache.
 *
 * Returns 0 on success. Otherwise, an error code is returned.
 */
int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm,
				  gfp_t gfp, swp_entry_t entry)
{
	struct mem_cgroup *memcg;
	unsigned short id;
	int ret;

	if (mem_cgroup_disabled())
		return 0;

	id = lookup_swap_cgroup_id(entry);
	rcu_read_lock();
	memcg = mem_cgroup_from_id(id);
	if (!memcg || !css_tryget_online(&memcg->css))
		memcg = get_mem_cgroup_from_mm(mm);
	rcu_read_unlock();

	ret = charge_memcg(folio, memcg, gfp);

	css_put(&memcg->css);
	return ret;
}
/*
 * mem_cgroup_swapin_uncharge_swap - uncharge swap slot
 * @entry: the first swap entry for which the pages are charged
 * @nr_pages: number of pages which will be uncharged
 *
 * Call this function after successfully adding the charged page to swapcache.
 *
 * Note: This function assumes the page for which swap slot is being uncharged
 * has no memcg associated with it.
 */
void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry, unsigned int nr_pages)
{
	/*
	 * Cgroup1's unified memory+swap counter has been charged with the
	 * new swapcache page, finish the transfer by uncharging the swap
	 * slot. The swap slot would also get uncharged when it dies, but
	 * it can stick around indefinitely and we'd count the page twice
	 * the second time.
	 *
	 * Cgroup2 has separate resource counters for memory and swap,
	 * so this is a non-issue here. Memory and swap charge lifetimes
	 * correspond 1:1 to page and swap slot lifetimes: we charge the
	 * page to memory here, and uncharge swap when the slot is freed.
	 */
	if (!mem_cgroup_disabled() && do_memsw_account()) {
		/*
		 * The swap entry might not get freed for a long time,
		 * let's not wait for it. The page already received a
		 * memory+swap charge, drop the swap entry duplicate.
		 */
		mem_cgroup_uncharge_swap(entry, nr_pages);
	}
}
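/*
 * Illustrative swapin ordering sketch (hypothetical caller, heavily
 * simplified; the real logic lives in the swap code):
 *
 *	folio = ...allocate a folio for @entry...;
 *	if (mem_cgroup_swapin_charge_folio(folio, mm, GFP_KERNEL, entry))
 *		goto fail;
 *	...add the folio to the swap cache...;
 *	mem_cgroup_swapin_uncharge_swap(entry, folio_nr_pages(folio));
 */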
struct uncharge_gather {
	struct mem_cgroup *memcg;
	unsigned long nr_memory;
	unsigned long pgpgout;
	unsigned long nr_kmem;
	int nid;
};

static inline void uncharge_gather_clear(struct uncharge_gather *ug)
{
	memset(ug, 0, sizeof(*ug));
}

static void uncharge_batch(const struct uncharge_gather *ug)
{
	if (ug->nr_memory) {
		page_counter_uncharge(&ug->memcg->memory, ug->nr_memory);
		if (do_memsw_account())
			page_counter_uncharge(&ug->memcg->memsw, ug->nr_memory);
		if (ug->nr_kmem) {
			mod_memcg_state(ug->memcg, MEMCG_KMEM, -ug->nr_kmem);
			memcg1_account_kmem(ug->memcg, -ug->nr_kmem);
		}
		memcg1_oom_recover(ug->memcg);
	}

	memcg1_uncharge_batch(ug->memcg, ug->pgpgout, ug->nr_memory, ug->nid);

	/* drop reference from uncharge_folio */
	css_put(&ug->memcg->css);
}
static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug)
{
	long nr_pages;
	struct mem_cgroup *memcg;
	struct obj_cgroup *objcg;

	VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);

	/*
	 * Nobody should be changing or seriously looking at
	 * folio memcg or objcg at this point, we have fully
	 * exclusive access to the folio.
	 */
	if (folio_memcg_kmem(folio)) {
		objcg = __folio_objcg(folio);
		/*
		 * This get matches the put at the end of the function and
		 * kmem pages do not hold memcg references anymore.
		 */
		memcg = get_mem_cgroup_from_objcg(objcg);
	} else {
		memcg = __folio_memcg(folio);
	}

	if (!memcg)
		return;

	if (ug->memcg != memcg) {
		if (ug->memcg) {
			uncharge_batch(ug);
			uncharge_gather_clear(ug);
		}
		ug->memcg = memcg;
		ug->nid = folio_nid(folio);

		/* pairs with css_put in uncharge_batch */
		css_get(&memcg->css);
	}

	nr_pages = folio_nr_pages(folio);

	if (folio_memcg_kmem(folio)) {
		ug->nr_memory += nr_pages;
		ug->nr_kmem += nr_pages;

		folio->memcg_data = 0;
		obj_cgroup_put(objcg);
	} else {
		/* LRU pages aren't accounted at the root level */
		if (!mem_cgroup_is_root(memcg))
			ug->nr_memory += nr_pages;
		ug->pgpgout++;

		WARN_ON_ONCE(folio_unqueue_deferred_split(folio));
		folio->memcg_data = 0;
	}

	css_put(&memcg->css);
}
void __mem_cgroup_uncharge(struct folio *folio)
{
	struct uncharge_gather ug;

	/* Don't touch folio->lru of any random page, pre-check: */
	if (!folio_memcg_charged(folio))
		return;

	uncharge_gather_clear(&ug);
	uncharge_folio(folio, &ug);
	uncharge_batch(&ug);
}

void __mem_cgroup_uncharge_folios(struct folio_batch *folios)
{
	struct uncharge_gather ug;
	unsigned int i;

	uncharge_gather_clear(&ug);
	for (i = 0; i < folios->nr; i++)
		uncharge_folio(folios->folios[i], &ug);

	if (ug.memcg)
		uncharge_batch(&ug);
}
/**
 * mem_cgroup_replace_folio - Charge a folio's replacement.
 * @old: Currently circulating folio.
 * @new: Replacement folio.
 *
 * Charge @new as a replacement folio for @old. @old will
 * be uncharged upon free.
 *
 * Both folios must be locked, @new->mapping must be set up.
 */
void mem_cgroup_replace_folio(struct folio *old, struct folio *new)
{
	struct mem_cgroup *memcg;
	long nr_pages = folio_nr_pages(new);

	VM_BUG_ON_FOLIO(!folio_test_locked(old), old);
	VM_BUG_ON_FOLIO(!folio_test_locked(new), new);
	VM_BUG_ON_FOLIO(folio_test_anon(old) != folio_test_anon(new), new);
	VM_BUG_ON_FOLIO(folio_nr_pages(old) != nr_pages, new);

	if (mem_cgroup_disabled())
		return;

	/* Page cache replacement: new folio already charged? */
	if (folio_memcg_charged(new))
		return;

	memcg = folio_memcg(old);
	VM_WARN_ON_ONCE_FOLIO(!memcg, old);
	if (!memcg)
		return;

	/* Force-charge the new page. The old one will be freed soon */
	if (!mem_cgroup_is_root(memcg)) {
		page_counter_charge(&memcg->memory, nr_pages);
		if (do_memsw_account())
			page_counter_charge(&memcg->memsw, nr_pages);
	}

	css_get(&memcg->css);
	commit_charge(new, memcg);
	memcg1_commit_charge(new, memcg);
}
/**
 * mem_cgroup_migrate - Transfer the memcg data from the old to the new folio.
 * @old: Currently circulating folio.
 * @new: Replacement folio.
 *
 * Transfer the memcg data from the old folio to the new folio for migration.
 * The old folio's data info will be cleared. Note that the memory counters
 * will remain unchanged throughout the process.
 *
 * Both folios must be locked, @new->mapping must be set up.
 */
void mem_cgroup_migrate(struct folio *old, struct folio *new)
{
	struct mem_cgroup *memcg;

	VM_BUG_ON_FOLIO(!folio_test_locked(old), old);
	VM_BUG_ON_FOLIO(!folio_test_locked(new), new);
	VM_BUG_ON_FOLIO(folio_test_anon(old) != folio_test_anon(new), new);
	VM_BUG_ON_FOLIO(folio_nr_pages(old) != folio_nr_pages(new), new);
	VM_BUG_ON_FOLIO(folio_test_lru(old), old);

	if (mem_cgroup_disabled())
		return;

	memcg = folio_memcg(old);
	/*
	 * Note that it is normal to see !memcg for a hugetlb folio.
	 * For e.g, it could have been allocated when memory_hugetlb_accounting
	 * was not selected.
	 */
	VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(old) && !memcg, old);
	if (!memcg)
		return;

	/* Transfer the charge and the css ref */
	commit_charge(new, memcg);

	/* Warning should never happen, so don't worry about refcount non-0 */
	WARN_ON_ONCE(folio_unqueue_deferred_split(old));
	old->memcg_data = 0;
}
DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key);
EXPORT_SYMBOL(memcg_sockets_enabled_key);

void mem_cgroup_sk_alloc(struct sock *sk)
{
	struct mem_cgroup *memcg;

	if (!mem_cgroup_sockets_enabled)
		return;

	/* Do not associate the sock with unrelated interrupted task's memcg. */
	if (!in_task())
		return;

	rcu_read_lock();
	memcg = mem_cgroup_from_task(current);
	if (mem_cgroup_is_root(memcg))
		goto out;
	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !memcg1_tcpmem_active(memcg))
		goto out;
	if (css_tryget(&memcg->css))
		sk->sk_memcg = memcg;
out:
	rcu_read_unlock();
}

void mem_cgroup_sk_free(struct sock *sk)
{
	if (sk->sk_memcg)
		css_put(&sk->sk_memcg->css);
}
/**
 * mem_cgroup_charge_skmem - charge socket memory
 * @memcg: memcg to charge
 * @nr_pages: number of pages to charge
 * @gfp_mask: reclaim mode
 *
 * Charges @nr_pages to @memcg. Returns %true if the charge fit within
 * @memcg's configured limit, %false if it doesn't.
 */
bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages,
			     gfp_t gfp_mask)
{
	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
		return memcg1_charge_skmem(memcg, nr_pages, gfp_mask);

	if (try_charge(memcg, gfp_mask, nr_pages) == 0) {
		mod_memcg_state(memcg, MEMCG_SOCK, nr_pages);
		return true;
	}

	return false;
}

/**
 * mem_cgroup_uncharge_skmem - uncharge socket memory
 * @memcg: memcg to uncharge
 * @nr_pages: number of pages to uncharge
 */
void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)
{
	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
		memcg1_uncharge_skmem(memcg, nr_pages);
		return;
	}

	mod_memcg_state(memcg, MEMCG_SOCK, -nr_pages);

	refill_stock(memcg, nr_pages);
}
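/*
 * Illustrative sketch of how a networking caller might pair these two
 * (hypothetical and simplified; the real callers live in net/):
 *
 *	if (mem_cgroup_charge_skmem(sk->sk_memcg, nr_pages, gfp)) {
 *		...proceed, the pages are accounted to the socket's memcg...
 *	} else {
 *		...the charge did not fit within the limit; back off...
 *	}
 *	...
 *	mem_cgroup_uncharge_skmem(sk->sk_memcg, nr_pages);
 */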
static int __init cgroup_memory(char *s)
{
	char *token;

	while ((token = strsep(&s, ",")) != NULL) {
		if (!*token)
			continue;
		if (!strcmp(token, "nosocket"))
			cgroup_memory_nosocket = true;
		if (!strcmp(token, "nokmem"))
			cgroup_memory_nokmem = true;
		if (!strcmp(token, "nobpf"))
			cgroup_memory_nobpf = true;
	}
	return 1;
}
__setup("cgroup.memory=", cgroup_memory);
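/*
 * Example (kernel command line, illustrative only): booting with
 *
 *	cgroup.memory=nokmem,nosocket
 *
 * disables kernel memory and socket memory accounting. Tokens may be
 * combined in any order, separated by commas.
 */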
/*
 * subsys_initcall() for memory controller.
 *
 * Some parts like memcg_hotplug_cpu_dead() have to be initialized from this
 * context because of lock dependencies (cgroup_lock -> cpu hotplug) but
 * basically everything that doesn't depend on a specific mem_cgroup structure
 * should be initialized from here.
 */
static int __init mem_cgroup_init(void)
{
	int cpu;

	/*
	 * Currently s32 type (can refer to struct batched_lruvec_stat) is
	 * used for per-memcg-per-cpu caching of per-node statistics. In order
	 * to work fine, we should make sure that the overfill threshold can't
	 * exceed S32_MAX / PAGE_SIZE.
	 */
	BUILD_BUG_ON(MEMCG_CHARGE_BATCH > S32_MAX / PAGE_SIZE);

	cpuhp_setup_state_nocalls(CPUHP_MM_MEMCQ_DEAD, "mm/memctrl:dead", NULL,
				  memcg_hotplug_cpu_dead);

	for_each_possible_cpu(cpu)
		INIT_WORK(&per_cpu_ptr(&memcg_stock, cpu)->work,
			  drain_local_stock);

	return 0;
}
subsys_initcall(mem_cgroup_init);
#ifdef CONFIG_SWAP
static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg)
{
	while (!refcount_inc_not_zero(&memcg->id.ref)) {
		/*
		 * The root cgroup cannot be destroyed, so its refcount must
		 * always be >= 1.
		 */
		if (WARN_ON_ONCE(mem_cgroup_is_root(memcg))) {
			VM_BUG_ON(1);
			break;
		}
		memcg = parent_mem_cgroup(memcg);
		if (!memcg)
			memcg = root_mem_cgroup;
	}
	return memcg;
}
/**
 * mem_cgroup_swapout - transfer a memsw charge to swap
 * @folio: folio whose memsw charge to transfer
 * @entry: swap entry to move the charge to
 *
 * Transfer the memsw charge of @folio to @entry.
 */
void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry)
{
	struct mem_cgroup *memcg, *swap_memcg;
	unsigned int nr_entries;
	unsigned short oldid;

	VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
	VM_BUG_ON_FOLIO(folio_ref_count(folio), folio);

	if (mem_cgroup_disabled())
		return;

	if (!do_memsw_account())
		return;

	memcg = folio_memcg(folio);

	VM_WARN_ON_ONCE_FOLIO(!memcg, folio);
	if (!memcg)
		return;

	/*
	 * In case the memcg owning these pages has been offlined and doesn't
	 * have an ID allocated to it anymore, charge the closest online
	 * ancestor for the swap instead and transfer the memory+swap charge.
	 */
	swap_memcg = mem_cgroup_id_get_online(memcg);
	nr_entries = folio_nr_pages(folio);
	/* Get references for the tail pages, too */
	if (nr_entries > 1)
		mem_cgroup_id_get_many(swap_memcg, nr_entries - 1);
	oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg),
				   nr_entries);
	VM_BUG_ON_FOLIO(oldid, folio);
	mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries);

	folio_unqueue_deferred_split(folio);
	folio->memcg_data = 0;

	if (!mem_cgroup_is_root(memcg))
		page_counter_uncharge(&memcg->memory, nr_entries);

	if (memcg != swap_memcg) {
		if (!mem_cgroup_is_root(swap_memcg))
			page_counter_charge(&swap_memcg->memsw, nr_entries);
		page_counter_uncharge(&memcg->memsw, nr_entries);
	}

	memcg1_swapout(folio, memcg);
	css_put(&memcg->css);
}
/**
 * __mem_cgroup_try_charge_swap - try charging swap space for a folio
 * @folio: folio being added to swap
 * @entry: swap entry to charge
 *
 * Try to charge @folio's memcg for the swap space at @entry.
 *
 * Returns 0 on success, -ENOMEM on failure.
 */
int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry)
{
	unsigned int nr_pages = folio_nr_pages(folio);
	struct page_counter *counter;
	struct mem_cgroup *memcg;
	unsigned short oldid;

	if (do_memsw_account())
		return 0;

	memcg = folio_memcg(folio);

	VM_WARN_ON_ONCE_FOLIO(!memcg, folio);
	if (!memcg)
		return 0;

	if (!entry.val) {
		memcg_memory_event(memcg, MEMCG_SWAP_FAIL);
		return 0;
	}

	memcg = mem_cgroup_id_get_online(memcg);

	if (!mem_cgroup_is_root(memcg) &&
	    !page_counter_try_charge(&memcg->swap, nr_pages, &counter)) {
		memcg_memory_event(memcg, MEMCG_SWAP_MAX);
		memcg_memory_event(memcg, MEMCG_SWAP_FAIL);
		mem_cgroup_id_put(memcg);
		return -ENOMEM;
	}

	/* Get references for the tail pages, too */
	if (nr_pages > 1)
		mem_cgroup_id_get_many(memcg, nr_pages - 1);
	oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg), nr_pages);
	VM_BUG_ON_FOLIO(oldid, folio);
	mod_memcg_state(memcg, MEMCG_SWAP, nr_pages);

	return 0;
}
/*
 * __mem_cgroup_uncharge_swap - uncharge swap space
 * @entry: swap entry to uncharge
 * @nr_pages: the amount of swap space to uncharge
 */
void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages)
{
	struct mem_cgroup *memcg;
	unsigned short id;

	id = swap_cgroup_record(entry, 0, nr_pages);
	rcu_read_lock();
	memcg = mem_cgroup_from_id(id);
	if (memcg) {
		if (!mem_cgroup_is_root(memcg)) {
			if (do_memsw_account())
				page_counter_uncharge(&memcg->memsw, nr_pages);
			else
				page_counter_uncharge(&memcg->swap, nr_pages);
		}
		mod_memcg_state(memcg, MEMCG_SWAP, -nr_pages);
		mem_cgroup_id_put_many(memcg, nr_pages);
	}
	rcu_read_unlock();
}
long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg)
{
	long nr_swap_pages = get_nr_swap_pages();

	if (mem_cgroup_disabled() || do_memsw_account())
		return nr_swap_pages;
	for (; !mem_cgroup_is_root(memcg); memcg = parent_mem_cgroup(memcg))
		nr_swap_pages = min_t(long, nr_swap_pages,
				      READ_ONCE(memcg->swap.max) -
				      page_counter_read(&memcg->swap));
	return nr_swap_pages;
}
bool mem_cgroup_swap_full(struct folio *folio)
{
	struct mem_cgroup *memcg;

	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);

	if (vm_swap_full())
		return true;
	if (do_memsw_account())
		return false;

	memcg = folio_memcg(folio);
	if (!memcg)
		return false;

	for (; !mem_cgroup_is_root(memcg); memcg = parent_mem_cgroup(memcg)) {
		unsigned long usage = page_counter_read(&memcg->swap);

		if (usage * 2 >= READ_ONCE(memcg->swap.high) ||
		    usage * 2 >= READ_ONCE(memcg->swap.max))
			return true;
	}

	return false;
}
static int __init setup_swap_account(char *s)
{
	bool res;

	if (!kstrtobool(s, &res) && !res)
		pr_warn_once("The swapaccount=0 commandline option is deprecated "
			     "in favor of configuring swap control via cgroupfs. "
			     "Please report your usecase to linux-mm@kvack.org if you "
			     "depend on this functionality.\n");
	return 1;
}
__setup("swapaccount=", setup_swap_account);
static u64 swap_current_read(struct cgroup_subsys_state *css,
			     struct cftype *cft)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(css);

	return (u64)page_counter_read(&memcg->swap) * PAGE_SIZE;
}
static int swap_peak_show(struct seq_file *sf, void *v)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf));

	return peak_show(sf, v, &memcg->swap);
}

static ssize_t swap_peak_write(struct kernfs_open_file *of, char *buf,
			       size_t nbytes, loff_t off)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));

	return peak_write(of, buf, nbytes, off, &memcg->swap,
			  &memcg->swap_peaks);
}
static int swap_high_show(struct seq_file *m, void *v)
{
	return seq_puts_memcg_tunable(m,
		READ_ONCE(mem_cgroup_from_seq(m)->swap.high));
}

static ssize_t swap_high_write(struct kernfs_open_file *of,
			       char *buf, size_t nbytes, loff_t off)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
	unsigned long high;
	int err;

	buf = strstrip(buf);
	err = page_counter_memparse(buf, "max", &high);
	if (err)
		return err;

	page_counter_set_high(&memcg->swap, high);

	return nbytes;
}

static int swap_max_show(struct seq_file *m, void *v)
{
	return seq_puts_memcg_tunable(m,
		READ_ONCE(mem_cgroup_from_seq(m)->swap.max));
}

static ssize_t swap_max_write(struct kernfs_open_file *of,
			      char *buf, size_t nbytes, loff_t off)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
	unsigned long max;
	int err;

	buf = strstrip(buf);
	err = page_counter_memparse(buf, "max", &max);
	if (err)
		return err;

	xchg(&memcg->swap.max, max);

	return nbytes;
}
static int swap_events_show(struct seq_file *m, void *v)
{
	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);

	seq_printf(m, "high %lu\n",
		   atomic_long_read(&memcg->memory_events[MEMCG_SWAP_HIGH]));
	seq_printf(m, "max %lu\n",
		   atomic_long_read(&memcg->memory_events[MEMCG_SWAP_MAX]));
	seq_printf(m, "fail %lu\n",
		   atomic_long_read(&memcg->memory_events[MEMCG_SWAP_FAIL]));

	return 0;
}
static struct cftype swap_files[] = {
	{
		.name = "swap.current",
		.flags = CFTYPE_NOT_ON_ROOT,
		.read_u64 = swap_current_read,
	},
	{
		.name = "swap.high",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = swap_high_show,
		.write = swap_high_write,
	},
	{
		.name = "swap.max",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = swap_max_show,
		.write = swap_max_write,
	},
	{
		.name = "swap.peak",
		.flags = CFTYPE_NOT_ON_ROOT,
		.open = peak_open,
		.release = peak_release,
		.seq_show = swap_peak_show,
		.write = swap_peak_write,
	},
	{
		.name = "swap.events",
		.flags = CFTYPE_NOT_ON_ROOT,
		.file_offset = offsetof(struct mem_cgroup, swap_events_file),
		.seq_show = swap_events_show,
	},
	{ } /* terminate */
};
#ifdef CONFIG_ZSWAP
/**
 * obj_cgroup_may_zswap - check if this cgroup can zswap
 * @objcg: the object cgroup
 *
 * Check if the hierarchical zswap limit has been reached.
 *
 * This doesn't check for specific headroom, and it is not atomic
 * either. But with zswap, the size of the allocation is only known
 * once compression has occurred, and this optimistic pre-check avoids
 * spending cycles on compression when there is already no room left
 * or zswap is disabled altogether somewhere in the hierarchy.
 */
bool obj_cgroup_may_zswap(struct obj_cgroup *objcg)
{
	struct mem_cgroup *memcg, *original_memcg;
	bool ret = true;

	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
		return true;

	original_memcg = get_mem_cgroup_from_objcg(objcg);
	for (memcg = original_memcg; !mem_cgroup_is_root(memcg);
	     memcg = parent_mem_cgroup(memcg)) {
		unsigned long max = READ_ONCE(memcg->zswap_max);
		unsigned long pages;

		if (max == PAGE_COUNTER_MAX)
			continue;
		if (max == 0) {
			ret = false;
			break;
		}

		/* Force flush to get accurate stats for charging */
		__mem_cgroup_flush_stats(memcg, true);
		pages = memcg_page_state(memcg, MEMCG_ZSWAP_B) / PAGE_SIZE;
		if (pages < max)
			continue;
		ret = false;
		break;
	}
	mem_cgroup_put(original_memcg);
	return ret;
}
/**
 * obj_cgroup_charge_zswap - charge compression backend memory
 * @objcg: the object cgroup
 * @size: size of compressed object
 *
 * This forces the charge after obj_cgroup_may_zswap() allowed
 * compression and storage in zswap for this cgroup to go ahead.
 */
void obj_cgroup_charge_zswap(struct obj_cgroup *objcg, size_t size)
{
	struct mem_cgroup *memcg;

	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
		return;

	VM_WARN_ON_ONCE(!(current->flags & PF_MEMALLOC));

	/* PF_MEMALLOC context, charging must succeed */
	if (obj_cgroup_charge(objcg, GFP_KERNEL, size))
		VM_WARN_ON_ONCE(1);

	rcu_read_lock();
	memcg = obj_cgroup_memcg(objcg);
	mod_memcg_state(memcg, MEMCG_ZSWAP_B, size);
	mod_memcg_state(memcg, MEMCG_ZSWAPPED, 1);
	rcu_read_unlock();
}
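/*
 * Illustrative ordering sketch for a zswap store (hypothetical caller,
 * heavily simplified; the real logic lives in mm/zswap.c):
 *
 *	if (!obj_cgroup_may_zswap(objcg))
 *		goto reject;		// skip compression, no room anyway
 *	...compress the page, learn the compressed size...;
 *	obj_cgroup_charge_zswap(objcg, compressed_size);
 *	...
 *	obj_cgroup_uncharge_zswap(objcg, compressed_size);	// on page-in/free
 */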
/**
 * obj_cgroup_uncharge_zswap - uncharge compression backend memory
 * @objcg: the object cgroup
 * @size: size of compressed object
 *
 * Uncharges zswap memory on page in.
 */
void obj_cgroup_uncharge_zswap(struct obj_cgroup *objcg, size_t size)
{
	struct mem_cgroup *memcg;

	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
		return;

	obj_cgroup_uncharge(objcg, size);

	rcu_read_lock();
	memcg = obj_cgroup_memcg(objcg);
	mod_memcg_state(memcg, MEMCG_ZSWAP_B, -size);
	mod_memcg_state(memcg, MEMCG_ZSWAPPED, -1);
	rcu_read_unlock();
}
bool mem_cgroup_zswap_writeback_enabled(struct mem_cgroup *memcg)
{
	/* if zswap is disabled, do not block pages going to the swapping device */
	if (!zswap_is_enabled())
		return true;

	for (; memcg; memcg = parent_mem_cgroup(memcg))
		if (!READ_ONCE(memcg->zswap_writeback))
			return false;

	return true;
}
static u64 zswap_current_read(struct cgroup_subsys_state *css,
			      struct cftype *cft)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(css);

	mem_cgroup_flush_stats(memcg);
	return memcg_page_state(memcg, MEMCG_ZSWAP_B);
}
static int zswap_max_show(struct seq_file *m, void *v)
{
	return seq_puts_memcg_tunable(m,
		READ_ONCE(mem_cgroup_from_seq(m)->zswap_max));
}

static ssize_t zswap_max_write(struct kernfs_open_file *of,
			       char *buf, size_t nbytes, loff_t off)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
	unsigned long max;
	int err;

	buf = strstrip(buf);
	err = page_counter_memparse(buf, "max", &max);
	if (err)
		return err;

	xchg(&memcg->zswap_max, max);

	return nbytes;
}
static int zswap_writeback_show(struct seq_file *m, void *v)
{
	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);

	seq_printf(m, "%d\n", READ_ONCE(memcg->zswap_writeback));
	return 0;
}

static ssize_t zswap_writeback_write(struct kernfs_open_file *of,
				char *buf, size_t nbytes, loff_t off)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
	int zswap_writeback;
	ssize_t parse_ret = kstrtoint(strstrip(buf), 0, &zswap_writeback);

	if (parse_ret)
		return parse_ret;

	if (zswap_writeback != 0 && zswap_writeback != 1)
		return -EINVAL;

	WRITE_ONCE(memcg->zswap_writeback, zswap_writeback);
	return nbytes;
}
static struct cftype zswap_files[] = {
	{
		.name = "zswap.current",
		.flags = CFTYPE_NOT_ON_ROOT,
		.read_u64 = zswap_current_read,
	},
	{
		.name = "zswap.max",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = zswap_max_show,
		.write = zswap_max_write,
	},
	{
		.name = "zswap.writeback",
		.seq_show = zswap_writeback_show,
		.write = zswap_writeback_write,
	},
	{ } /* terminate */
};
#endif /* CONFIG_ZSWAP */
static int __init mem_cgroup_swap_init(void)
{
	if (mem_cgroup_disabled())
		return 0;

	WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys, swap_files));
#ifdef CONFIG_MEMCG_V1
	WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, memsw_files));
#endif
#ifdef CONFIG_ZSWAP
	WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys, zswap_files));
#endif
	return 0;
}
subsys_initcall(mem_cgroup_swap_init);

#endif /* CONFIG_SWAP */