// SPDX-License-Identifier: GPL-2.0-or-later

#include <linux/memcontrol.h>
#include <linux/swap.h>
#include <linux/mm_inline.h>
#include <linux/pagewalk.h>
#include <linux/backing-dev.h>
#include <linux/swap_cgroup.h>
#include <linux/eventfd.h>
#include <linux/poll.h>
#include <linux/sort.h>
#include <linux/file.h>
#include <linux/seq_buf.h>

#include "memcontrol-v1.h"
/*
 * Cgroups above their limits are maintained in a RB-Tree, independent of
 * their hierarchy representation.
 */

struct mem_cgroup_tree_per_node {
	struct rb_root rb_root;
	struct rb_node *rb_rightmost;
	spinlock_t lock;
};

struct mem_cgroup_tree {
	struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
};

static struct mem_cgroup_tree soft_limit_tree __read_mostly;

/*
 * Maximum loops in mem_cgroup_soft_reclaim(), used for soft
 * limit reclaim to prevent infinite loops, if they ever occur.
 */
#define MEM_CGROUP_MAX_RECLAIM_LOOPS		100
#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS	2
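/*
 * Worked example (editor's addition, illustrative only): the per-node tree
 * is keyed by usage_in_excess, so a larger excess sorts further to the
 * right and rb_rightmost always points at the memcg that is furthest over
 * its soft limit on that node. With three memcgs whose excess is 10, 50
 * and 200 pages, soft limit reclaim picks the 200-page one first via
 * mem_cgroup_largest_soft_limit_node() below.
 */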
struct mem_cgroup_eventfd_list {
	struct list_head list;
	struct eventfd_ctx *eventfd;
};
/*
 * cgroup_event represents events which userspace wants to receive.
 */
struct mem_cgroup_event {
	/*
	 * memcg which the event belongs to.
	 */
	struct mem_cgroup *memcg;
	/*
	 * eventfd to signal userspace about the event.
	 */
	struct eventfd_ctx *eventfd;
	/*
	 * Each of these stored in a list by the cgroup.
	 */
	struct list_head list;
	/*
	 * register_event() callback will be used to add a new userspace
	 * waiter for changes related to this event. Use eventfd_signal()
	 * on eventfd to send notification to userspace.
	 */
	int (*register_event)(struct mem_cgroup *memcg,
			      struct eventfd_ctx *eventfd, const char *args);
	/*
	 * unregister_event() callback will be called when userspace closes
	 * the eventfd or on cgroup removal. This callback must be set
	 * if you want to provide notification functionality.
	 */
	void (*unregister_event)(struct mem_cgroup *memcg,
				 struct eventfd_ctx *eventfd);
	/*
	 * All fields below are needed to unregister the event when
	 * userspace closes the eventfd.
	 */
	poll_table pt;
	wait_queue_head_t *wqh;
	wait_queue_entry_t wait;
	struct work_struct remove;
};
#define MEMFILE_PRIVATE(x, val)	((x) << 16 | (val))
#define MEMFILE_TYPE(val)	((val) >> 16 & 0xffff)
#define MEMFILE_ATTR(val)	((val) & 0xffff)
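/*
 * Illustrative sketch (editor's addition, not from the original file): the
 * MEMFILE_* helpers above pack a counter type and a resource attribute into
 * a single cftype ->private value, e.g.:
 *
 *	int priv = MEMFILE_PRIVATE(_MEM, RES_USAGE);
 *
 *	MEMFILE_TYPE(priv);	// == _MEM, taken from bits 16..31
 *	MEMFILE_ATTR(priv);	// == RES_USAGE, taken from bits 0..15
 */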
#ifdef CONFIG_LOCKDEP
static struct lockdep_map memcg_oom_lock_dep_map = {
	.name = "memcg_oom_lock",
};
#endif

DEFINE_SPINLOCK(memcg_oom_lock);
static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz,
					 struct mem_cgroup_tree_per_node *mctz,
					 unsigned long new_usage_in_excess)
{
	struct rb_node **p = &mctz->rb_root.rb_node;
	struct rb_node *parent = NULL;
	struct mem_cgroup_per_node *mz_node;
	bool rightmost = true;

	if (mz->on_tree)
		return;

	mz->usage_in_excess = new_usage_in_excess;
	if (!mz->usage_in_excess)
		return;
	while (*p) {
		parent = *p;
		mz_node = rb_entry(parent, struct mem_cgroup_per_node,
					tree_node);
		if (mz->usage_in_excess < mz_node->usage_in_excess) {
			p = &(*p)->rb_left;
			rightmost = false;
		} else {
			p = &(*p)->rb_right;
		}
	}

	if (rightmost)
		mctz->rb_rightmost = &mz->tree_node;

	rb_link_node(&mz->tree_node, parent, p);
	rb_insert_color(&mz->tree_node, &mctz->rb_root);
	mz->on_tree = true;
}
static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
					 struct mem_cgroup_tree_per_node *mctz)
{
	if (!mz->on_tree)
		return;

	if (&mz->tree_node == mctz->rb_rightmost)
		mctz->rb_rightmost = rb_prev(&mz->tree_node);

	rb_erase(&mz->tree_node, &mctz->rb_root);
	mz->on_tree = false;
}
static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
				       struct mem_cgroup_tree_per_node *mctz)
{
	unsigned long flags;

	spin_lock_irqsave(&mctz->lock, flags);
	__mem_cgroup_remove_exceeded(mz, mctz);
	spin_unlock_irqrestore(&mctz->lock, flags);
}
static unsigned long soft_limit_excess(struct mem_cgroup *memcg)
{
	unsigned long nr_pages = page_counter_read(&memcg->memory);
	unsigned long soft_limit = READ_ONCE(memcg->soft_limit);
	unsigned long excess = 0;

	if (nr_pages > soft_limit)
		excess = nr_pages - soft_limit;

	return excess;
}
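/*
 * Worked example (editor's addition): with memory usage at 300 pages and
 * memory.soft_limit_in_bytes set to the equivalent of 200 pages,
 * soft_limit_excess() returns 100; at or below the soft limit it returns 0,
 * which later means "no tree ops" on the soft limit tree.
 */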
static void memcg1_update_tree(struct mem_cgroup *memcg, int nid)
{
	unsigned long excess;
	struct mem_cgroup_per_node *mz;
	struct mem_cgroup_tree_per_node *mctz;

	if (lru_gen_enabled()) {
		if (soft_limit_excess(memcg))
			lru_gen_soft_reclaim(memcg, nid);
		return;
	}

	mctz = soft_limit_tree.rb_tree_per_node[nid];
	if (!mctz)
		return;
	/*
	 * Necessary to update all ancestors when hierarchy is used,
	 * because their event counter is not touched.
	 */
	for (; memcg; memcg = parent_mem_cgroup(memcg)) {
		mz = memcg->nodeinfo[nid];
		excess = soft_limit_excess(memcg);
		/*
		 * We have to update the tree if mz is on RB-tree or
		 * mem is over its softlimit.
		 */
		if (excess || mz->on_tree) {
			unsigned long flags;

			spin_lock_irqsave(&mctz->lock, flags);
			/* if on-tree, remove it */
			if (mz->on_tree)
				__mem_cgroup_remove_exceeded(mz, mctz);
			/*
			 * Insert again. mz->usage_in_excess will be updated.
			 * If excess is 0, no tree ops.
			 */
			__mem_cgroup_insert_exceeded(mz, mctz, excess);
			spin_unlock_irqrestore(&mctz->lock, flags);
		}
	}
}
void memcg1_remove_from_trees(struct mem_cgroup *memcg)
{
	struct mem_cgroup_tree_per_node *mctz;
	struct mem_cgroup_per_node *mz;
	int nid;

	for_each_node(nid) {
		mz = memcg->nodeinfo[nid];
		mctz = soft_limit_tree.rb_tree_per_node[nid];
		if (mctz)
			mem_cgroup_remove_exceeded(mz, mctz);
	}
}
static struct mem_cgroup_per_node *
__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
{
	struct mem_cgroup_per_node *mz;

retry:
	mz = NULL;
	if (!mctz->rb_rightmost)
		goto done;		/* Nothing to reclaim from */

	mz = rb_entry(mctz->rb_rightmost,
		      struct mem_cgroup_per_node, tree_node);
	/*
	 * Remove the node now but someone else can add it back,
	 * we will add it back at the end of reclaim to its correct
	 * position in the tree.
	 */
	__mem_cgroup_remove_exceeded(mz, mctz);
	if (!soft_limit_excess(mz->memcg) ||
	    !css_tryget(&mz->memcg->css))
		goto retry;
done:
	return mz;
}
static struct mem_cgroup_per_node *
mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
{
	struct mem_cgroup_per_node *mz;

	spin_lock_irq(&mctz->lock);
	mz = __mem_cgroup_largest_soft_limit_node(mctz);
	spin_unlock_irq(&mctz->lock);
	return mz;
}
static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
				   pg_data_t *pgdat,
				   gfp_t gfp_mask,
				   unsigned long *total_scanned)
{
	struct mem_cgroup *victim = NULL;
	int total = 0;
	int loop = 0;
	unsigned long excess;
	unsigned long nr_scanned;
	struct mem_cgroup_reclaim_cookie reclaim = {
		.pgdat = pgdat,
	};

	excess = soft_limit_excess(root_memcg);

	while (1) {
		victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
		if (!victim) {
			loop++;
			if (loop >= 2) {
				/*
				 * If we have not been able to reclaim
				 * anything, it might be because there are
				 * no reclaimable pages under this hierarchy.
				 */
				if (!total)
					break;
				/*
				 * We want to do more targeted reclaim.
				 * excess >> 2 is not too excessive, so we don't
				 * reclaim too much, nor too little, so we don't
				 * keep coming back to reclaim from this cgroup.
				 */
				if (total >= (excess >> 2) ||
					(loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
					break;
			}
			continue;
		}
		total += mem_cgroup_shrink_node(victim, gfp_mask, false,
						pgdat, &nr_scanned);
		*total_scanned += nr_scanned;
		if (!soft_limit_excess(root_memcg))
			break;
	}
	mem_cgroup_iter_break(root_memcg, victim);
	return total;
}
unsigned long memcg1_soft_limit_reclaim(pg_data_t *pgdat, int order,
					gfp_t gfp_mask,
					unsigned long *total_scanned)
{
	unsigned long nr_reclaimed = 0;
	struct mem_cgroup_per_node *mz, *next_mz = NULL;
	unsigned long reclaimed;
	int loop = 0;
	struct mem_cgroup_tree_per_node *mctz;
	unsigned long excess;

	if (lru_gen_enabled())
		return 0;

	if (order > 0)
		return 0;

	mctz = soft_limit_tree.rb_tree_per_node[pgdat->node_id];

	/*
	 * Do not even bother to check the largest node if the root
	 * is empty. Do it lockless to prevent lock bouncing. Races
	 * are acceptable as soft limit is best effort anyway.
	 */
	if (!mctz || RB_EMPTY_ROOT(&mctz->rb_root))
		return 0;

	/*
	 * This loop can run a while, especially if mem_cgroups continuously
	 * keep exceeding their soft limit and putting the system under
	 * pressure.
	 */
	do {
		if (next_mz)
			mz = next_mz;
		else
			mz = mem_cgroup_largest_soft_limit_node(mctz);
		if (!mz)
			break;

		reclaimed = mem_cgroup_soft_reclaim(mz->memcg, pgdat,
						    gfp_mask, total_scanned);
		nr_reclaimed += reclaimed;
		spin_lock_irq(&mctz->lock);

		/*
		 * If we failed to reclaim anything from this memory cgroup
		 * it is time to move on to the next cgroup.
		 */
		next_mz = NULL;
		if (!reclaimed)
			next_mz = __mem_cgroup_largest_soft_limit_node(mctz);

		excess = soft_limit_excess(mz->memcg);
		/*
		 * One school of thought says that we should not add
		 * back the node to the tree if reclaim returns 0.
		 * But our reclaim could return 0, simply because due
		 * to priority we are exposing a smaller subset of
		 * memory to reclaim from. Consider this as a longer
		 * term TODO.
		 */
		/* If excess == 0, no tree ops */
		__mem_cgroup_insert_exceeded(mz, mctz, excess);
		spin_unlock_irq(&mctz->lock);
		css_put(&mz->memcg->css);
		loop++;
		/*
		 * Could not reclaim anything and there are no more
		 * mem cgroups to try or we seem to be looping without
		 * reclaiming anything.
		 */
		if (!nr_reclaimed &&
			(next_mz == NULL ||
			loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
			break;
	} while (!nr_reclaimed);
	if (next_mz)
		css_put(&next_mz->memcg->css);
	return nr_reclaimed;
}
static u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css,
				       struct cftype *cft)
{
	return 0;
}

#ifdef CONFIG_MMU
static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
					struct cftype *cft, u64 val)
{
	pr_warn_once("Cgroup memory moving (move_charge_at_immigrate) is deprecated. "
		     "Please report your usecase to linux-mm@kvack.org if you "
		     "depend on this functionality.\n");
	return 0;
}
#else
static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
					struct cftype *cft, u64 val)
{
	return -ENOSYS;
}
#endif
static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
{
	struct mem_cgroup_threshold_ary *t;
	unsigned long usage;
	int i;

	rcu_read_lock();
	if (!swap)
		t = rcu_dereference(memcg->thresholds.primary);
	else
		t = rcu_dereference(memcg->memsw_thresholds.primary);

	if (!t)
		goto unlock;

	usage = mem_cgroup_usage(memcg, swap);

	/*
	 * current_threshold points to threshold just below or equal to usage.
	 * If it's not true, a threshold was crossed after last
	 * call of __mem_cgroup_threshold().
	 */
	i = t->current_threshold;

	/*
	 * Iterate backward over array of thresholds starting from
	 * current_threshold and check if a threshold is crossed.
	 * If none of thresholds below usage is crossed, we read
	 * only one element of the array here.
	 */
	for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--)
		eventfd_signal(t->entries[i].eventfd);

	/* i = current_threshold + 1 */
	i++;

	/*
	 * Iterate forward over array of thresholds starting from
	 * current_threshold+1 and check if a threshold is crossed.
	 * If none of thresholds above usage is crossed, we read
	 * only one element of the array here.
	 */
	for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++)
		eventfd_signal(t->entries[i].eventfd);

	/* Update current_threshold */
	t->current_threshold = i - 1;
unlock:
	rcu_read_unlock();
}
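/*
 * Worked example (editor's addition): with a sorted thresholds array of
 * {4M, 8M, 16M} and current_threshold at the 8M entry, a usage rise to 20M
 * makes the forward loop above signal the 16M eventfd and leaves
 * current_threshold at 16M; a later drop to 3M makes the backward loop
 * signal the 8M and 4M eventfds and moves current_threshold below the
 * first entry (-1).
 */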
static void mem_cgroup_threshold(struct mem_cgroup *memcg)
{
	while (memcg) {
		__mem_cgroup_threshold(memcg, false);
		if (do_memsw_account())
			__mem_cgroup_threshold(memcg, true);

		memcg = parent_mem_cgroup(memcg);
	}
}
/* Cgroup1: threshold notifications & softlimit tree updates */
struct memcg1_events_percpu {
	unsigned long nr_page_events;
	unsigned long targets[MEM_CGROUP_NTARGETS];
};
static void memcg1_charge_statistics(struct mem_cgroup *memcg, int nr_pages)
{
	/* pagein of a big page is an event. So, ignore page size */
	if (nr_pages > 0)
		__count_memcg_events(memcg, PGPGIN, 1);
	else {
		__count_memcg_events(memcg, PGPGOUT, 1);
		nr_pages = -nr_pages; /* for event */
	}

	__this_cpu_add(memcg->events_percpu->nr_page_events, nr_pages);
}
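/*
 * Editor's note (illustrative): a charge of a 512-page THP counts as a
 * single PGPGIN event and an uncharge as a single PGPGOUT event, while
 * nr_page_events always advances by the absolute page count (512 in both
 * cases), which is what the ratelimiting below is based on.
 */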
#define THRESHOLDS_EVENTS_TARGET 128
#define SOFTLIMIT_EVENTS_TARGET 1024

static bool memcg1_event_ratelimit(struct mem_cgroup *memcg,
				enum mem_cgroup_events_target target)
{
	unsigned long val, next;

	val = __this_cpu_read(memcg->events_percpu->nr_page_events);
	next = __this_cpu_read(memcg->events_percpu->targets[target]);
	/* from time_after() in jiffies.h */
	if ((long)(next - val) < 0) {
		switch (target) {
		case MEM_CGROUP_TARGET_THRESH:
			next = val + THRESHOLDS_EVENTS_TARGET;
			break;
		case MEM_CGROUP_TARGET_SOFTLIMIT:
			next = val + SOFTLIMIT_EVENTS_TARGET;
			break;
		default:
			break;
		}
		__this_cpu_write(memcg->events_percpu->targets[target], next);
		return true;
	}
	return false;
}
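/*
 * Worked example (editor's addition): with nr_page_events at 1000 and the
 * MEM_CGROUP_TARGET_THRESH target previously set to 900, (long)(900 - 1000)
 * is negative, so the check fires, the target is bumped to 1128
 * (1000 + THRESHOLDS_EVENTS_TARGET) and true is returned; until another 128
 * page events accumulate the function keeps returning false.
 */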
/*
 * Check events in order.
 */
static void memcg1_check_events(struct mem_cgroup *memcg, int nid)
{
	if (IS_ENABLED(CONFIG_PREEMPT_RT))
		return;

	/* threshold event is triggered in finer grain than soft limit */
	if (unlikely(memcg1_event_ratelimit(memcg,
						MEM_CGROUP_TARGET_THRESH))) {
		bool do_softlimit;

		do_softlimit = memcg1_event_ratelimit(memcg,
						MEM_CGROUP_TARGET_SOFTLIMIT);
		mem_cgroup_threshold(memcg);
		if (unlikely(do_softlimit))
			memcg1_update_tree(memcg, nid);
	}
}
void memcg1_commit_charge(struct folio *folio, struct mem_cgroup *memcg)
{
	unsigned long flags;

	local_irq_save(flags);
	memcg1_charge_statistics(memcg, folio_nr_pages(folio));
	memcg1_check_events(memcg, folio_nid(folio));
	local_irq_restore(flags);
}
void memcg1_swapout(struct folio *folio, struct mem_cgroup *memcg)
{
	/*
	 * Interrupts should be disabled here because the caller holds the
	 * i_pages lock which is taken with interrupts-off. It is
	 * important here to have the interrupts disabled because it is the
	 * only synchronisation we have for updating the per-CPU variables.
	 */
	preempt_disable_nested();
	VM_WARN_ON_IRQS_ENABLED();
	memcg1_charge_statistics(memcg, -folio_nr_pages(folio));
	preempt_enable_nested();
	memcg1_check_events(memcg, folio_nid(folio));
}
void memcg1_uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
			   unsigned long nr_memory, int nid)
{
	unsigned long flags;

	local_irq_save(flags);
	__count_memcg_events(memcg, PGPGOUT, pgpgout);
	__this_cpu_add(memcg->events_percpu->nr_page_events, nr_memory);
	memcg1_check_events(memcg, nid);
	local_irq_restore(flags);
}
static int compare_thresholds(const void *a, const void *b)
{
	const struct mem_cgroup_threshold *_a = a;
	const struct mem_cgroup_threshold *_b = b;

	if (_a->threshold > _b->threshold)
		return 1;

	if (_a->threshold < _b->threshold)
		return -1;

	return 0;
}
static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)
{
	struct mem_cgroup_eventfd_list *ev;

	spin_lock(&memcg_oom_lock);

	list_for_each_entry(ev, &memcg->oom_notify, list)
		eventfd_signal(ev->eventfd);

	spin_unlock(&memcg_oom_lock);
	return 0;
}

static void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
{
	struct mem_cgroup *iter;

	for_each_mem_cgroup_tree(iter, memcg)
		mem_cgroup_oom_notify_cb(iter);
}
static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
	struct eventfd_ctx *eventfd, const char *args, enum res_type type)
{
	struct mem_cgroup_thresholds *thresholds;
	struct mem_cgroup_threshold_ary *new;
	unsigned long threshold;
	unsigned long usage;
	int i, size, ret;

	ret = page_counter_memparse(args, "-1", &threshold);
	if (ret)
		return ret;

	mutex_lock(&memcg->thresholds_lock);

	if (type == _MEM) {
		thresholds = &memcg->thresholds;
		usage = mem_cgroup_usage(memcg, false);
	} else if (type == _MEMSWAP) {
		thresholds = &memcg->memsw_thresholds;
		usage = mem_cgroup_usage(memcg, true);
	} else
		BUG();

	/* Check if a threshold crossed before adding a new one */
	if (thresholds->primary)
		__mem_cgroup_threshold(memcg, type == _MEMSWAP);

	size = thresholds->primary ? thresholds->primary->size + 1 : 1;

	/* Allocate memory for new array of thresholds */
	new = kmalloc(struct_size(new, entries, size), GFP_KERNEL);
	if (!new) {
		ret = -ENOMEM;
		goto unlock;
	}
	new->size = size;

	/* Copy thresholds (if any) to new array */
	if (thresholds->primary)
		memcpy(new->entries, thresholds->primary->entries,
		       flex_array_size(new, entries, size - 1));

	/* Add new threshold */
	new->entries[size - 1].eventfd = eventfd;
	new->entries[size - 1].threshold = threshold;

	/* Sort thresholds. Registering of new threshold isn't time-critical */
	sort(new->entries, size, sizeof(*new->entries),
			compare_thresholds, NULL);

	/* Find current threshold */
	new->current_threshold = -1;
	for (i = 0; i < size; i++) {
		if (new->entries[i].threshold <= usage) {
			/*
			 * new->current_threshold will not be used until
			 * rcu_assign_pointer(), so it's safe to increment
			 * it here.
			 */
			++new->current_threshold;
		} else
			break;
	}

	/* Free old spare buffer and save old primary buffer as spare */
	kfree(thresholds->spare);
	thresholds->spare = thresholds->primary;

	rcu_assign_pointer(thresholds->primary, new);

	/* To be sure that nobody uses thresholds */
	synchronize_rcu();

unlock:
	mutex_unlock(&memcg->thresholds_lock);

	return ret;
}
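/*
 * Illustrative userspace sketch (editor's addition, hypothetical descriptor
 * names and values): a usage threshold is registered by writing
 * "<eventfd> <fd of memory.usage_in_bytes> <threshold>" into this cgroup's
 * cgroup.event_control, after which the eventfd becomes readable whenever
 * the threshold is crossed:
 *
 *	uint64_t counter;
 *	char buf[64];
 *	int efd = eventfd(0, 0);
 *	int cfd = open("memory.usage_in_bytes", O_RDONLY);
 *	int ctl = open("cgroup.event_control", O_WRONLY);
 *
 *	snprintf(buf, sizeof(buf), "%d %d %llu", efd, cfd, 100ULL << 20);
 *	write(ctl, buf, strlen(buf));		// arm a 100M usage threshold
 *	read(efd, &counter, sizeof(counter));	// blocks until crossed
 */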
static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
	struct eventfd_ctx *eventfd, const char *args)
{
	return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM);
}

static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg,
	struct eventfd_ctx *eventfd, const char *args)
{
	return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP);
}
static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
	struct eventfd_ctx *eventfd, enum res_type type)
{
	struct mem_cgroup_thresholds *thresholds;
	struct mem_cgroup_threshold_ary *new;
	unsigned long usage;
	int i, j, size, entries;

	mutex_lock(&memcg->thresholds_lock);

	if (type == _MEM) {
		thresholds = &memcg->thresholds;
		usage = mem_cgroup_usage(memcg, false);
	} else if (type == _MEMSWAP) {
		thresholds = &memcg->memsw_thresholds;
		usage = mem_cgroup_usage(memcg, true);
	} else
		BUG();

	if (!thresholds->primary)
		goto unlock;

	/* Check if a threshold crossed before removing */
	__mem_cgroup_threshold(memcg, type == _MEMSWAP);

	/* Calculate new number of threshold */
	size = entries = 0;
	for (i = 0; i < thresholds->primary->size; i++) {
		if (thresholds->primary->entries[i].eventfd != eventfd)
			size++;
		else
			entries++;
	}

	new = thresholds->spare;

	/* If no items related to eventfd have been cleared, nothing to do */
	if (!entries)
		goto unlock;

	/* Set thresholds array to NULL if we don't have thresholds */
	if (!size) {
		kfree(new);
		new = NULL;
		goto swap_buffers;
	}

	new->size = size;

	/* Copy thresholds and find current threshold */
	new->current_threshold = -1;
	for (i = 0, j = 0; i < thresholds->primary->size; i++) {
		if (thresholds->primary->entries[i].eventfd == eventfd)
			continue;

		new->entries[j] = thresholds->primary->entries[i];
		if (new->entries[j].threshold <= usage) {
			/*
			 * new->current_threshold will not be used
			 * until rcu_assign_pointer(), so it's safe to increment
			 * it here.
			 */
			++new->current_threshold;
		}
		j++;
	}

swap_buffers:
	/* Swap primary and spare array */
	thresholds->spare = thresholds->primary;

	rcu_assign_pointer(thresholds->primary, new);

	/* To be sure that nobody uses thresholds */
	synchronize_rcu();

	/* If all events are unregistered, free the spare array */
	if (!size) {
		kfree(thresholds->spare);
		thresholds->spare = NULL;
	}
unlock:
	mutex_unlock(&memcg->thresholds_lock);
}
static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
	struct eventfd_ctx *eventfd)
{
	return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM);
}

static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
	struct eventfd_ctx *eventfd)
{
	return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP);
}
static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg,
	struct eventfd_ctx *eventfd, const char *args)
{
	struct mem_cgroup_eventfd_list *event;

	event = kmalloc(sizeof(*event),	GFP_KERNEL);
	if (!event)
		return -ENOMEM;

	spin_lock(&memcg_oom_lock);

	event->eventfd = eventfd;
	list_add(&event->list, &memcg->oom_notify);

	/* already in OOM ? */
	if (memcg->under_oom)
		eventfd_signal(eventfd);
	spin_unlock(&memcg_oom_lock);

	return 0;
}

static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg,
	struct eventfd_ctx *eventfd)
{
	struct mem_cgroup_eventfd_list *ev, *tmp;

	spin_lock(&memcg_oom_lock);

	list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) {
		if (ev->eventfd == eventfd) {
			list_del(&ev->list);
			kfree(ev);
		}
	}

	spin_unlock(&memcg_oom_lock);
}
/*
 * DO NOT USE IN NEW FILES.
 *
 * "cgroup.event_control" implementation.
 *
 * This is way over-engineered. It tries to support fully configurable
 * events for each user. Such level of flexibility is completely
 * unnecessary especially in the light of the planned unified hierarchy.
 *
 * Please deprecate this and replace with something simpler if at all
 * possible.
 */
/*
 * Unregister event and free resources.
 *
 * Gets called from workqueue.
 */
static void memcg_event_remove(struct work_struct *work)
{
	struct mem_cgroup_event *event =
		container_of(work, struct mem_cgroup_event, remove);
	struct mem_cgroup *memcg = event->memcg;

	remove_wait_queue(event->wqh, &event->wait);

	event->unregister_event(memcg, event->eventfd);

	/* Notify userspace the event is going away. */
	eventfd_signal(event->eventfd);

	eventfd_ctx_put(event->eventfd);
	kfree(event);
	css_put(&memcg->css);
}
/*
 * Gets called on EPOLLHUP on eventfd when user closes it.
 *
 * Called with wqh->lock held and interrupts disabled.
 */
static int memcg_event_wake(wait_queue_entry_t *wait, unsigned mode,
			    int sync, void *key)
{
	struct mem_cgroup_event *event =
		container_of(wait, struct mem_cgroup_event, wait);
	struct mem_cgroup *memcg = event->memcg;
	__poll_t flags = key_to_poll(key);

	if (flags & EPOLLHUP) {
		/*
		 * If the event has been detached at cgroup removal, we
		 * can simply return knowing the other side will cleanup
		 * for us.
		 *
		 * We can't race against event freeing since the other
		 * side will require wqh->lock via remove_wait_queue(),
		 * which we hold.
		 */
		spin_lock(&memcg->event_list_lock);
		if (!list_empty(&event->list)) {
			list_del_init(&event->list);
			/*
			 * We are in atomic context, but cgroup_event_remove()
			 * may sleep, so we have to call it in workqueue.
			 */
			schedule_work(&event->remove);
		}
		spin_unlock(&memcg->event_list_lock);
	}

	return 0;
}
static void memcg_event_ptable_queue_proc(struct file *file,
		wait_queue_head_t *wqh, poll_table *pt)
{
	struct mem_cgroup_event *event =
		container_of(pt, struct mem_cgroup_event, pt);

	event->wqh = wqh;
	add_wait_queue(wqh, &event->wait);
}
/*
 * DO NOT USE IN NEW FILES.
 *
 * Parse input and register new cgroup event handler.
 *
 * Input must be in format '<event_fd> <control_fd> <args>'.
 * Interpretation of args is defined by control file implementation.
 */
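/*
 * Illustrative sketch (editor's addition, hypothetical descriptor values):
 * to be notified of OOM events, for example, userspace writes the eventfd
 * and the fd of this cgroup's memory.oom_control into cgroup.event_control:
 *
 *	char buf[32];
 *
 *	snprintf(buf, sizeof(buf), "%d %d", efd, oom_control_fd);
 *	write(event_control_fd, buf, strlen(buf));
 *
 * memory.usage_in_bytes and memory.memsw.usage_in_bytes additionally take
 * a threshold in bytes as <args>.
 */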
static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
					 char *buf, size_t nbytes, loff_t off)
{
	struct cgroup_subsys_state *css = of_css(of);
	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
	struct mem_cgroup_event *event;
	struct cgroup_subsys_state *cfile_css;
	unsigned int efd, cfd;
	struct dentry *cdentry;
	const char *name;
	char *endp;
	int ret;

	if (IS_ENABLED(CONFIG_PREEMPT_RT))
		return -EOPNOTSUPP;

	buf = strstrip(buf);

	efd = simple_strtoul(buf, &endp, 10);
	if (*endp != ' ')
		return -EINVAL;
	buf = endp + 1;

	cfd = simple_strtoul(buf, &endp, 10);
	if (*endp == '\0')
		buf = endp;
	else if (*endp == ' ')
		buf = endp + 1;
	else
		return -EINVAL;

	CLASS(fd, efile)(efd);
	if (fd_empty(efile))
		return -EBADF;

	CLASS(fd, cfile)(cfd);

	event = kzalloc(sizeof(*event), GFP_KERNEL);
	if (!event)
		return -ENOMEM;

	event->memcg = memcg;
	INIT_LIST_HEAD(&event->list);
	init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc);
	init_waitqueue_func_entry(&event->wait, memcg_event_wake);
	INIT_WORK(&event->remove, memcg_event_remove);

	event->eventfd = eventfd_ctx_fileget(fd_file(efile));
	if (IS_ERR(event->eventfd)) {
		ret = PTR_ERR(event->eventfd);
		goto out_kfree;
	}

	if (fd_empty(cfile)) {
		ret = -EBADF;
		goto out_put_eventfd;
	}

	/* the process need read permission on control file */
	/* AV: shouldn't we check that it's been opened for read instead? */
	ret = file_permission(fd_file(cfile), MAY_READ);
	if (ret < 0)
		goto out_put_eventfd;

	/*
	 * The control file must be a regular cgroup1 file. As a regular cgroup
	 * file can't be renamed, it's safe to access its name afterwards.
	 */
	cdentry = fd_file(cfile)->f_path.dentry;
	if (cdentry->d_sb->s_type != &cgroup_fs_type || !d_is_reg(cdentry)) {
		ret = -EINVAL;
		goto out_put_eventfd;
	}

	/*
	 * Determine the event callbacks and set them in @event. This used
	 * to be done via struct cftype but cgroup core no longer knows
	 * about these events. The following is crude but the whole thing
	 * is for compatibility anyway.
	 *
	 * DO NOT ADD NEW FILES.
	 */
	name = cdentry->d_name.name;

	if (!strcmp(name, "memory.usage_in_bytes")) {
		event->register_event = mem_cgroup_usage_register_event;
		event->unregister_event = mem_cgroup_usage_unregister_event;
	} else if (!strcmp(name, "memory.oom_control")) {
		pr_warn_once("oom_control is deprecated and will be removed. "
			     "Please report your usecase to linux-mm@kvack.org "
			     "if you depend on this functionality.\n");
		event->register_event = mem_cgroup_oom_register_event;
		event->unregister_event = mem_cgroup_oom_unregister_event;
	} else if (!strcmp(name, "memory.pressure_level")) {
		pr_warn_once("pressure_level is deprecated and will be removed. "
			     "Please report your usecase to linux-mm@kvack.org "
			     "if you depend on this functionality.\n");
		event->register_event = vmpressure_register_event;
		event->unregister_event = vmpressure_unregister_event;
	} else if (!strcmp(name, "memory.memsw.usage_in_bytes")) {
		event->register_event = memsw_cgroup_usage_register_event;
		event->unregister_event = memsw_cgroup_usage_unregister_event;
	} else {
		ret = -EINVAL;
		goto out_put_eventfd;
	}

	/*
	 * Verify @cfile should belong to @css. Also, remaining events are
	 * automatically removed on cgroup destruction but the removal is
	 * asynchronous, so take an extra ref on @css.
	 */
	cfile_css = css_tryget_online_from_dir(cdentry->d_parent,
					       &memory_cgrp_subsys);
	ret = -EINVAL;
	if (IS_ERR(cfile_css))
		goto out_put_eventfd;
	if (cfile_css != css)
		goto out_put_css;

	ret = event->register_event(memcg, event->eventfd, buf);
	if (ret)
		goto out_put_css;

	vfs_poll(fd_file(efile), &event->pt);

	spin_lock_irq(&memcg->event_list_lock);
	list_add(&event->list, &memcg->event_list);
	spin_unlock_irq(&memcg->event_list_lock);

	return nbytes;

out_put_css:
	css_put(cfile_css);
out_put_eventfd:
	eventfd_ctx_put(event->eventfd);
out_kfree:
	kfree(event);
	return ret;
}
void memcg1_memcg_init(struct mem_cgroup *memcg)
{
	INIT_LIST_HEAD(&memcg->oom_notify);
	mutex_init(&memcg->thresholds_lock);
	INIT_LIST_HEAD(&memcg->event_list);
	spin_lock_init(&memcg->event_list_lock);
}
void memcg1_css_offline(struct mem_cgroup *memcg)
{
	struct mem_cgroup_event *event, *tmp;

	/*
	 * Unregister events and notify userspace.
	 * Notify userspace about cgroup removing only after rmdir of cgroup
	 * directory to avoid race between userspace and kernelspace.
	 */
	spin_lock_irq(&memcg->event_list_lock);
	list_for_each_entry_safe(event, tmp, &memcg->event_list, list) {
		list_del_init(&event->list);
		schedule_work(&event->remove);
	}
	spin_unlock_irq(&memcg->event_list_lock);
}
/*
 * Check whether the OOM killer is already running under our hierarchy.
 * If someone is running, return false.
 */
static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg)
{
	struct mem_cgroup *iter, *failed = NULL;
	bool locked = true;

	spin_lock(&memcg_oom_lock);

	for_each_mem_cgroup_tree(iter, memcg) {
		if (iter->oom_lock) {
			/*
			 * this subtree of our hierarchy is already locked
			 * so we cannot give a lock.
			 */
			locked = false;
			failed = iter;
			mem_cgroup_iter_break(memcg, iter);
			break;
		} else
			iter->oom_lock = true;
	}

	if (failed) {
		/*
		 * OK, we failed to lock the whole subtree so we have
		 * to clean up what we set up to the failing subtree
		 */
		for_each_mem_cgroup_tree(iter, memcg) {
			if (iter == failed) {
				mem_cgroup_iter_break(memcg, iter);
				break;
			}
			iter->oom_lock = false;
		}
	} else
		mutex_acquire(&memcg_oom_lock_dep_map, 0, 1, _RET_IP_);

	spin_unlock(&memcg_oom_lock);

	return locked;
}
static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
{
	struct mem_cgroup *iter;

	spin_lock(&memcg_oom_lock);
	mutex_release(&memcg_oom_lock_dep_map, _RET_IP_);
	for_each_mem_cgroup_tree(iter, memcg)
		iter->oom_lock = false;
	spin_unlock(&memcg_oom_lock);
}
static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
{
	struct mem_cgroup *iter;

	spin_lock(&memcg_oom_lock);
	for_each_mem_cgroup_tree(iter, memcg)
		iter->under_oom++;
	spin_unlock(&memcg_oom_lock);
}

static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
{
	struct mem_cgroup *iter;

	/*
	 * Be careful about under_oom underflows because a child memcg
	 * could have been added after mem_cgroup_mark_under_oom.
	 */
	spin_lock(&memcg_oom_lock);
	for_each_mem_cgroup_tree(iter, memcg)
		if (iter->under_oom > 0)
			iter->under_oom--;
	spin_unlock(&memcg_oom_lock);
}
static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);

struct oom_wait_info {
	struct mem_cgroup *memcg;
	wait_queue_entry_t wait;
};

static int memcg_oom_wake_function(wait_queue_entry_t *wait,
	unsigned mode, int sync, void *arg)
{
	struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg;
	struct mem_cgroup *oom_wait_memcg;
	struct oom_wait_info *oom_wait_info;

	oom_wait_info = container_of(wait, struct oom_wait_info, wait);
	oom_wait_memcg = oom_wait_info->memcg;

	if (!mem_cgroup_is_descendant(wake_memcg, oom_wait_memcg) &&
	    !mem_cgroup_is_descendant(oom_wait_memcg, wake_memcg))
		return 0;
	return autoremove_wake_function(wait, mode, sync, arg);
}
void memcg1_oom_recover(struct mem_cgroup *memcg)
{
	/*
	 * For the following lockless ->under_oom test, the only required
	 * guarantee is that it must see the state asserted by an OOM when
	 * this function is called as a result of userland actions
	 * triggered by the notification of the OOM. This is trivially
	 * achieved by invoking mem_cgroup_mark_under_oom() before
	 * triggering notification.
	 */
	if (memcg && memcg->under_oom)
		__wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
}
/**
 * mem_cgroup_oom_synchronize - complete memcg OOM handling
 * @handle: actually kill/wait or just clean up the OOM state
 *
 * This has to be called at the end of a page fault if the memcg OOM
 * handler was enabled.
 *
 * Memcg supports userspace OOM handling where failed allocations must
 * sleep on a waitqueue until the userspace task resolves the
 * situation. Sleeping directly in the charge context with all kinds
 * of locks held is not a good idea, instead we remember an OOM state
 * in the task and mem_cgroup_oom_synchronize() has to be called at
 * the end of the page fault to complete the OOM handling.
 *
 * Returns %true if an ongoing memcg OOM situation was detected and
 * completed, %false otherwise.
 */
bool mem_cgroup_oom_synchronize(bool handle)
{
	struct mem_cgroup *memcg = current->memcg_in_oom;
	struct oom_wait_info owait;
	bool locked;

	/* OOM is global, do not handle */
	if (!memcg)
		return false;

	if (!handle)
		goto cleanup;

	owait.memcg = memcg;
	owait.wait.flags = 0;
	owait.wait.func = memcg_oom_wake_function;
	owait.wait.private = current;
	INIT_LIST_HEAD(&owait.wait.entry);

	prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
	mem_cgroup_mark_under_oom(memcg);

	locked = mem_cgroup_oom_trylock(memcg);

	if (locked)
		mem_cgroup_oom_notify(memcg);

	schedule();
	mem_cgroup_unmark_under_oom(memcg);
	finish_wait(&memcg_oom_waitq, &owait.wait);

	if (locked)
		mem_cgroup_oom_unlock(memcg);
cleanup:
	current->memcg_in_oom = NULL;
	css_put(&memcg->css);
	return true;
}
bool memcg1_oom_prepare(struct mem_cgroup *memcg, bool *locked)
{
	/*
	 * We are in the middle of the charge context here, so we
	 * don't want to block when potentially sitting on a callstack
	 * that holds all kinds of filesystem and mm locks.
	 *
	 * cgroup1 allows disabling the OOM killer and waiting for outside
	 * handling until the charge can succeed; remember the context and put
	 * the task to sleep at the end of the page fault when all locks are
	 * released.
	 *
	 * On the other hand, the in-kernel OOM killer allows for an async victim
	 * memory reclaim (oom_reaper) and that means that we are not solely
	 * relying on the oom victim to make a forward progress and we can
	 * invoke the oom killer here.
	 *
	 * Please note that mem_cgroup_out_of_memory might fail to find a
	 * victim and then we have to bail out from the charge path.
	 */
	if (READ_ONCE(memcg->oom_kill_disable)) {
		if (current->in_user_fault) {
			css_get(&memcg->css);
			current->memcg_in_oom = memcg;
		}
		return false;
	}

	mem_cgroup_mark_under_oom(memcg);

	*locked = mem_cgroup_oom_trylock(memcg);

	if (*locked)
		mem_cgroup_oom_notify(memcg);

	mem_cgroup_unmark_under_oom(memcg);

	return true;
}

void memcg1_oom_finish(struct mem_cgroup *memcg, bool locked)
{
	if (locked)
		mem_cgroup_oom_unlock(memcg);
}
static DEFINE_MUTEX(memcg_max_mutex);

static int mem_cgroup_resize_max(struct mem_cgroup *memcg,
				 unsigned long max, bool memsw)
{
	bool enlarge = false;
	bool drained = false;
	int ret;
	bool limits_invariant;
	struct page_counter *counter = memsw ? &memcg->memsw : &memcg->memory;

	do {
		if (signal_pending(current)) {
			ret = -EINTR;
			break;
		}

		mutex_lock(&memcg_max_mutex);
		/*
		 * Make sure that the new limit (memsw or memory limit) doesn't
		 * break our basic invariant rule memory.max <= memsw.max.
		 */
		limits_invariant = memsw ? max >= READ_ONCE(memcg->memory.max) :
					   max <= memcg->memsw.max;
		if (!limits_invariant) {
			mutex_unlock(&memcg_max_mutex);
			ret = -EINVAL;
			break;
		}
		if (max > counter->max)
			enlarge = true;
		ret = page_counter_set_max(counter, max);
		mutex_unlock(&memcg_max_mutex);

		if (!ret)
			break;

		if (!drained) {
			drain_all_stock(memcg);
			drained = true;
			continue;
		}

		if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL,
				memsw ? 0 : MEMCG_RECLAIM_MAY_SWAP, NULL)) {
			ret = -EBUSY;
			break;
		}
	} while (true);

	if (!ret && enlarge)
		memcg1_oom_recover(memcg);

	return ret;
}
/*
 * Reclaims as many pages from the given memcg as possible.
 *
 * Caller is responsible for holding css reference for memcg.
 */
static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
{
	int nr_retries = MAX_RECLAIM_RETRIES;

	/* we call try-to-free pages to make this cgroup empty */
	lru_add_drain_all();

	drain_all_stock(memcg);

	/* try to free all pages in this cgroup */
	while (nr_retries && page_counter_read(&memcg->memory)) {
		if (signal_pending(current))
			return -EINTR;

		if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL,
						  MEMCG_RECLAIM_MAY_SWAP, NULL))
			nr_retries--;
	}

	return 0;
}
static ssize_t mem_cgroup_force_empty_write(struct kernfs_open_file *of,
					    char *buf, size_t nbytes,
					    loff_t off)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));

	if (mem_cgroup_is_root(memcg))
		return -EINVAL;
	return mem_cgroup_force_empty(memcg) ?: nbytes;
}
static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css,
				     struct cftype *cft)
{
	return 1;
}

static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
				      struct cftype *cft, u64 val)
{
	if (val == 1)
		return 0;

	pr_warn_once("Non-hierarchical mode is deprecated. "
		     "Please report your usecase to linux-mm@kvack.org if you "
		     "depend on this functionality.\n");

	return -EINVAL;
}
static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
			       struct cftype *cft)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
	struct page_counter *counter;

	switch (MEMFILE_TYPE(cft->private)) {
	case _MEM:
		counter = &memcg->memory;
		break;
	case _MEMSWAP:
		counter = &memcg->memsw;
		break;
	case _KMEM:
		counter = &memcg->kmem;
		break;
	case _TCP:
		counter = &memcg->tcpmem;
		break;
	default:
		BUG();
	}

	switch (MEMFILE_ATTR(cft->private)) {
	case RES_USAGE:
		if (counter == &memcg->memory)
			return (u64)mem_cgroup_usage(memcg, false) * PAGE_SIZE;
		if (counter == &memcg->memsw)
			return (u64)mem_cgroup_usage(memcg, true) * PAGE_SIZE;
		return (u64)page_counter_read(counter) * PAGE_SIZE;
	case RES_LIMIT:
		return (u64)counter->max * PAGE_SIZE;
	case RES_MAX_USAGE:
		return (u64)counter->watermark * PAGE_SIZE;
	case RES_FAILCNT:
		return counter->failcnt;
	case RES_SOFT_LIMIT:
		return (u64)READ_ONCE(memcg->soft_limit) * PAGE_SIZE;
	default:
		BUG();
	}
}
/*
 * This function doesn't do anything useful. Its only job is to provide a read
 * handler for a file so that cgroup_file_mode() will add read permissions.
 */
static int mem_cgroup_dummy_seq_show(__always_unused struct seq_file *m,
				     __always_unused void *v)
{
	return 0;
}
static int memcg_update_tcp_max(struct mem_cgroup *memcg, unsigned long max)
{
	int ret;

	mutex_lock(&memcg_max_mutex);

	ret = page_counter_set_max(&memcg->tcpmem, max);
	if (ret)
		goto out;

	if (!memcg->tcpmem_active) {
		/*
		 * The active flag needs to be written after the static_key
		 * update. This is what guarantees that the socket activation
		 * function is the last one to run. See mem_cgroup_sk_alloc()
		 * for details, and note that we don't mark any socket as
		 * belonging to this memcg until that flag is up.
		 *
		 * We need to do this, because static_keys will span multiple
		 * sites, but we can't control their order. If we mark a socket
		 * as accounted, but the accounting functions are not patched in
		 * yet, we'll lose accounting.
		 *
		 * We never race with the readers in mem_cgroup_sk_alloc(),
		 * because when this value changes, the code to process it is
		 * not patched in yet.
		 */
		static_branch_inc(&memcg_sockets_enabled_key);
		memcg->tcpmem_active = true;
	}
out:
	mutex_unlock(&memcg_max_mutex);
	return ret;
}
/*
 * The user of this function is...
 * RES_LIMIT.
 */
static ssize_t mem_cgroup_write(struct kernfs_open_file *of,
				char *buf, size_t nbytes, loff_t off)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
	unsigned long nr_pages;
	int ret;

	buf = strstrip(buf);
	ret = page_counter_memparse(buf, "-1", &nr_pages);
	if (ret)
		return ret;

	switch (MEMFILE_ATTR(of_cft(of)->private)) {
	case RES_LIMIT:
		if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */
			ret = -EINVAL;
			break;
		}
		switch (MEMFILE_TYPE(of_cft(of)->private)) {
		case _MEM:
			ret = mem_cgroup_resize_max(memcg, nr_pages, false);
			break;
		case _MEMSWAP:
			ret = mem_cgroup_resize_max(memcg, nr_pages, true);
			break;
		case _KMEM:
			pr_warn_once("kmem.limit_in_bytes is deprecated and will be removed. "
				     "Writing any value to this file has no effect. "
				     "Please report your usecase to linux-mm@kvack.org if you "
				     "depend on this functionality.\n");
			ret = 0;
			break;
		case _TCP:
			pr_warn_once("kmem.tcp.limit_in_bytes is deprecated and will be removed. "
				     "Please report your usecase to linux-mm@kvack.org if you "
				     "depend on this functionality.\n");
			ret = memcg_update_tcp_max(memcg, nr_pages);
			break;
		}
		break;
	case RES_SOFT_LIMIT:
		if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
			ret = -EOPNOTSUPP;
		} else {
			pr_warn_once("soft_limit_in_bytes is deprecated and will be removed. "
				     "Please report your usecase to linux-mm@kvack.org if you "
				     "depend on this functionality.\n");
			WRITE_ONCE(memcg->soft_limit, nr_pages);
			ret = 0;
		}
		break;
	}
	return ret ?: nbytes;
}
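/*
 * Illustrative example (editor's addition): writing the string "100M" to
 * memory.limit_in_bytes reaches this handler with RES_LIMIT/_MEM in
 * ->private; page_counter_memparse() turns "100M" into a page count and
 * mem_cgroup_resize_max() then reclaims as needed to fit the new limit.
 * Writing "-1" sets the counter to unlimited (PAGE_COUNTER_MAX).
 */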
static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf,
				size_t nbytes, loff_t off)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
	struct page_counter *counter;

	switch (MEMFILE_TYPE(of_cft(of)->private)) {
	case _MEM:
		counter = &memcg->memory;
		break;
	case _MEMSWAP:
		counter = &memcg->memsw;
		break;
	case _KMEM:
		counter = &memcg->kmem;
		break;
	case _TCP:
		counter = &memcg->tcpmem;
		break;
	default:
		BUG();
	}

	switch (MEMFILE_ATTR(of_cft(of)->private)) {
	case RES_MAX_USAGE:
		page_counter_reset_watermark(counter);
		break;
	case RES_FAILCNT:
		counter->failcnt = 0;
		break;
	default:
		BUG();
	}

	return nbytes;
}
#ifdef CONFIG_NUMA

#define LRU_ALL_FILE (BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE))
#define LRU_ALL_ANON (BIT(LRU_INACTIVE_ANON) | BIT(LRU_ACTIVE_ANON))
#define LRU_ALL	     ((1 << NR_LRU_LISTS) - 1)

static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
				int nid, unsigned int lru_mask, bool tree)
{
	struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
	unsigned long nr = 0;
	enum lru_list lru;

	VM_BUG_ON((unsigned)nid >= nr_node_ids);

	for_each_lru(lru) {
		if (!(BIT(lru) & lru_mask))
			continue;
		if (tree)
			nr += lruvec_page_state(lruvec, NR_LRU_BASE + lru);
		else
			nr += lruvec_page_state_local(lruvec, NR_LRU_BASE + lru);
	}
	return nr;
}

static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
					     unsigned int lru_mask,
					     bool tree)
{
	unsigned long nr = 0;
	enum lru_list lru;

	for_each_lru(lru) {
		if (!(BIT(lru) & lru_mask))
			continue;
		if (tree)
			nr += memcg_page_state(memcg, NR_LRU_BASE + lru);
		else
			nr += memcg_page_state_local(memcg, NR_LRU_BASE + lru);
	}
	return nr;
}

static int memcg_numa_stat_show(struct seq_file *m, void *v)
{
	struct numa_stat {
		const char *name;
		unsigned int lru_mask;
	};

	static const struct numa_stat stats[] = {
		{ "total", LRU_ALL },
		{ "file", LRU_ALL_FILE },
		{ "anon", LRU_ALL_ANON },
		{ "unevictable", BIT(LRU_UNEVICTABLE) },
	};
	const struct numa_stat *stat;
	int nid;
	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);

	mem_cgroup_flush_stats(memcg);

	for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
		seq_printf(m, "%s=%lu", stat->name,
			   mem_cgroup_nr_lru_pages(memcg, stat->lru_mask,
						   false));
		for_each_node_state(nid, N_MEMORY)
			seq_printf(m, " N%d=%lu", nid,
				   mem_cgroup_node_nr_lru_pages(memcg, nid,
							stat->lru_mask, false));
		seq_putc(m, '\n');
	}

	for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
		seq_printf(m, "hierarchical_%s=%lu", stat->name,
			   mem_cgroup_nr_lru_pages(memcg, stat->lru_mask,
						   true));
		for_each_node_state(nid, N_MEMORY)
			seq_printf(m, " N%d=%lu", nid,
				   mem_cgroup_node_nr_lru_pages(memcg, nid,
							stat->lru_mask, true));
		seq_putc(m, '\n');
	}

	return 0;
}
#endif /* CONFIG_NUMA */
static const unsigned int memcg1_stats[] = {
	NR_FILE_PAGES,
	NR_ANON_MAPPED,
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	NR_ANON_THPS,
#endif
	NR_SHMEM,
	NR_FILE_MAPPED,
	NR_FILE_DIRTY,
	NR_WRITEBACK,
	WORKINGSET_REFAULT_ANON,
	WORKINGSET_REFAULT_FILE,
	MEMCG_SWAP,
};

static const char *const memcg1_stat_names[] = {
	"cache",
	"rss",
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	"rss_huge",
#endif
	"shmem",
	"mapped_file",
	"dirty",
	"writeback",
	"workingset_refault_anon",
	"workingset_refault_file",
	"swap",
};

/* Universal VM events cgroup1 shows, original sort order */
static const unsigned int memcg1_events[] = {
	PGPGIN,
	PGPGOUT,
	PGFAULT,
	PGMAJFAULT,
};
void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
{
	unsigned long memory, memsw;
	struct mem_cgroup *mi;
	unsigned int i;

	BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats));

	mem_cgroup_flush_stats(memcg);

	for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
		unsigned long nr;

		nr = memcg_page_state_local_output(memcg, memcg1_stats[i]);
		seq_buf_printf(s, "%s %lu\n", memcg1_stat_names[i], nr);
	}

	for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
		seq_buf_printf(s, "%s %lu\n", vm_event_name(memcg1_events[i]),
			       memcg_events_local(memcg, memcg1_events[i]));

	for (i = 0; i < NR_LRU_LISTS; i++)
		seq_buf_printf(s, "%s %lu\n", lru_list_name(i),
			       memcg_page_state_local(memcg, NR_LRU_BASE + i) *
			       PAGE_SIZE);

	/* Hierarchical information */
	memory = memsw = PAGE_COUNTER_MAX;
	for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) {
		memory = min(memory, READ_ONCE(mi->memory.max));
		memsw = min(memsw, READ_ONCE(mi->memsw.max));
	}
	seq_buf_printf(s, "hierarchical_memory_limit %llu\n",
		       (u64)memory * PAGE_SIZE);
	seq_buf_printf(s, "hierarchical_memsw_limit %llu\n",
		       (u64)memsw * PAGE_SIZE);

	for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
		unsigned long nr;

		nr = memcg_page_state_output(memcg, memcg1_stats[i]);
		seq_buf_printf(s, "total_%s %llu\n", memcg1_stat_names[i],
			       (u64)nr);
	}

	for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
		seq_buf_printf(s, "total_%s %llu\n",
			       vm_event_name(memcg1_events[i]),
			       (u64)memcg_events(memcg, memcg1_events[i]));

	for (i = 0; i < NR_LRU_LISTS; i++)
		seq_buf_printf(s, "total_%s %llu\n", lru_list_name(i),
			       (u64)memcg_page_state(memcg, NR_LRU_BASE + i) *
			       PAGE_SIZE);

#ifdef CONFIG_DEBUG_VM
	{
		pg_data_t *pgdat;
		struct mem_cgroup_per_node *mz;
		unsigned long anon_cost = 0;
		unsigned long file_cost = 0;

		for_each_online_pgdat(pgdat) {
			mz = memcg->nodeinfo[pgdat->node_id];

			anon_cost += mz->lruvec.anon_cost;
			file_cost += mz->lruvec.file_cost;
		}
		seq_buf_printf(s, "anon_cost %lu\n", anon_cost);
		seq_buf_printf(s, "file_cost %lu\n", file_cost);
	}
#endif
}
static u64 mem_cgroup_swappiness_read(struct cgroup_subsys_state *css,
				      struct cftype *cft)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(css);

	return mem_cgroup_swappiness(memcg);
}

static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css,
				       struct cftype *cft, u64 val)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(css);

	if (val > MAX_SWAPPINESS)
		return -EINVAL;

	if (!mem_cgroup_is_root(memcg))
		WRITE_ONCE(memcg->swappiness, val);
	else
		WRITE_ONCE(vm_swappiness, val);

	return 0;
}
static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v)
{
	struct mem_cgroup *memcg = mem_cgroup_from_seq(sf);

	seq_printf(sf, "oom_kill_disable %d\n", READ_ONCE(memcg->oom_kill_disable));
	seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom);
	seq_printf(sf, "oom_kill %lu\n",
		   atomic_long_read(&memcg->memory_events[MEMCG_OOM_KILL]));
	return 0;
}

static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css,
	struct cftype *cft, u64 val)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(css);

	pr_warn_once("oom_control is deprecated and will be removed. "
		     "Please report your usecase to linux-mm@kvack.org if you "
		     "depend on this functionality.\n");

	/* cannot set to root cgroup and only 0 and 1 are allowed */
	if (mem_cgroup_is_root(memcg) || !((val == 0) || (val == 1)))
		return -EINVAL;

	WRITE_ONCE(memcg->oom_kill_disable, val);
	if (!val)
		memcg1_oom_recover(memcg);

	return 0;
}
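/*
 * Editor's note (illustrative): reading memory.oom_control produces three
 * lines built by the seq_printf() calls above, e.g.
 *
 *	oom_kill_disable 0
 *	under_oom 0
 *	oom_kill 3
 *
 * and writing "1" disables the in-kernel OOM killer for this cgroup so that
 * a registered eventfd (see memcg_write_event_control()) can handle OOM
 * from userspace.
 */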
#ifdef CONFIG_SLUB_DEBUG
static int mem_cgroup_slab_show(struct seq_file *m, void *p)
{
	/*
	 * Deprecated.
	 * Please, take a look at tools/cgroup/memcg_slabinfo.py .
	 */
	return 0;
}
#endif
struct cftype mem_cgroup_legacy_files[] = {
	{
		.name = "usage_in_bytes",
		.private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
		.read_u64 = mem_cgroup_read_u64,
	},
	{
		.name = "max_usage_in_bytes",
		.private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
		.write = mem_cgroup_reset,
		.read_u64 = mem_cgroup_read_u64,
	},
	{
		.name = "limit_in_bytes",
		.private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
		.write = mem_cgroup_write,
		.read_u64 = mem_cgroup_read_u64,
	},
	{
		.name = "soft_limit_in_bytes",
		.private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
		.write = mem_cgroup_write,
		.read_u64 = mem_cgroup_read_u64,
	},
	{
		.name = "failcnt",
		.private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
		.write = mem_cgroup_reset,
		.read_u64 = mem_cgroup_read_u64,
	},
	{
		.name = "stat",
		.seq_show = memory_stat_show,
	},
	{
		.name = "force_empty",
		.write = mem_cgroup_force_empty_write,
	},
	{
		.name = "use_hierarchy",
		.write_u64 = mem_cgroup_hierarchy_write,
		.read_u64 = mem_cgroup_hierarchy_read,
	},
	{
		.name = "cgroup.event_control",		/* XXX: for compat */
		.write = memcg_write_event_control,
		.flags = CFTYPE_NO_PREFIX | CFTYPE_WORLD_WRITABLE,
	},
	{
		.name = "swappiness",
		.read_u64 = mem_cgroup_swappiness_read,
		.write_u64 = mem_cgroup_swappiness_write,
	},
	{
		.name = "move_charge_at_immigrate",
		.read_u64 = mem_cgroup_move_charge_read,
		.write_u64 = mem_cgroup_move_charge_write,
	},
	{
		.name = "oom_control",
		.seq_show = mem_cgroup_oom_control_read,
		.write_u64 = mem_cgroup_oom_control_write,
	},
	{
		.name = "pressure_level",
		.seq_show = mem_cgroup_dummy_seq_show,
	},
#ifdef CONFIG_NUMA
	{
		.name = "numa_stat",
		.seq_show = memcg_numa_stat_show,
	},
#endif
	{
		.name = "kmem.limit_in_bytes",
		.private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT),
		.write = mem_cgroup_write,
		.read_u64 = mem_cgroup_read_u64,
	},
	{
		.name = "kmem.usage_in_bytes",
		.private = MEMFILE_PRIVATE(_KMEM, RES_USAGE),
		.read_u64 = mem_cgroup_read_u64,
	},
	{
		.name = "kmem.failcnt",
		.private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT),
		.write = mem_cgroup_reset,
		.read_u64 = mem_cgroup_read_u64,
	},
	{
		.name = "kmem.max_usage_in_bytes",
		.private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE),
		.write = mem_cgroup_reset,
		.read_u64 = mem_cgroup_read_u64,
	},
#ifdef CONFIG_SLUB_DEBUG
	{
		.name = "kmem.slabinfo",
		.seq_show = mem_cgroup_slab_show,
	},
#endif
	{
		.name = "kmem.tcp.limit_in_bytes",
		.private = MEMFILE_PRIVATE(_TCP, RES_LIMIT),
		.write = mem_cgroup_write,
		.read_u64 = mem_cgroup_read_u64,
	},
	{
		.name = "kmem.tcp.usage_in_bytes",
		.private = MEMFILE_PRIVATE(_TCP, RES_USAGE),
		.read_u64 = mem_cgroup_read_u64,
	},
	{
		.name = "kmem.tcp.failcnt",
		.private = MEMFILE_PRIVATE(_TCP, RES_FAILCNT),
		.write = mem_cgroup_reset,
		.read_u64 = mem_cgroup_read_u64,
	},
	{
		.name = "kmem.tcp.max_usage_in_bytes",
		.private = MEMFILE_PRIVATE(_TCP, RES_MAX_USAGE),
		.write = mem_cgroup_reset,
		.read_u64 = mem_cgroup_read_u64,
	},
	{ },	/* terminate */
};
struct cftype memsw_files[] = {
	{
		.name = "memsw.usage_in_bytes",
		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
		.read_u64 = mem_cgroup_read_u64,
	},
	{
		.name = "memsw.max_usage_in_bytes",
		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
		.write = mem_cgroup_reset,
		.read_u64 = mem_cgroup_read_u64,
	},
	{
		.name = "memsw.limit_in_bytes",
		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
		.write = mem_cgroup_write,
		.read_u64 = mem_cgroup_read_u64,
	},
	{
		.name = "memsw.failcnt",
		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
		.write = mem_cgroup_reset,
		.read_u64 = mem_cgroup_read_u64,
	},
	{ },	/* terminate */
};
void memcg1_account_kmem(struct mem_cgroup *memcg, int nr_pages)
{
	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
		if (nr_pages > 0)
			page_counter_charge(&memcg->kmem, nr_pages);
		else
			page_counter_uncharge(&memcg->kmem, -nr_pages);
	}
}
bool memcg1_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages,
			 gfp_t gfp_mask)
{
	struct page_counter *fail;

	if (page_counter_try_charge(&memcg->tcpmem, nr_pages, &fail)) {
		memcg->tcpmem_pressure = 0;
		return true;
	}
	memcg->tcpmem_pressure = 1;
	if (gfp_mask & __GFP_NOFAIL) {
		page_counter_charge(&memcg->tcpmem, nr_pages);
		return true;
	}
	return false;
}
*memcg
)
2092 memcg
->events_percpu
= alloc_percpu_gfp(struct memcg1_events_percpu
,
2093 GFP_KERNEL_ACCOUNT
);
2094 return !!memcg
->events_percpu
;
2097 void memcg1_free_events(struct mem_cgroup
*memcg
)
2099 if (memcg
->events_percpu
)
2100 free_percpu(memcg
->events_percpu
);
static int __init memcg1_init(void)
{
	int node;

	for_each_node(node) {
		struct mem_cgroup_tree_per_node *rtpn;

		rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, node);

		rtpn->rb_root = RB_ROOT;
		rtpn->rb_rightmost = NULL;
		spin_lock_init(&rtpn->lock);
		soft_limit_tree.rb_tree_per_node[node] = rtpn;
	}

	return 0;
}
subsys_initcall(memcg1_init);