// SPDX-License-Identifier: GPL-2.0
#include <linux/memcontrol.h>
#include <linux/rwsem.h>
#include <linux/shrinker.h>
#include <linux/rculist.h>
#include <trace/events/vmscan.h>

#include "internal.h"

LIST_HEAD(shrinker_list);
DEFINE_MUTEX(shrinker_mutex);

#ifdef CONFIG_MEMCG
static int shrinker_nr_max;
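
/*
 * For reference, a rough sketch of the per-memcg bookkeeping used below; the
 * authoritative definitions live in include/linux/shrinker.h. Each
 * shrinker_info_unit tracks SHRINKER_UNIT_BITS shrinkers with a "work
 * pending" bitmap plus one deferred count per shrinker, and shrinker_info
 * carries a flexible array of pointers to such units:
 *
 *	struct shrinker_info_unit {
 *		atomic_long_t nr_deferred[SHRINKER_UNIT_BITS];
 *		DECLARE_BITMAP(map, SHRINKER_UNIT_BITS);
 *	};
 *
 *	struct shrinker_info {
 *		struct rcu_head rcu;
 *		int map_nr_max;
 *		struct shrinker_info_unit *unit[];
 *	};
 *
 * shrinker_nr_max is the current upper bound on shrinker ids, kept as a
 * multiple of SHRINKER_UNIT_BITS by expand_shrinker_info().
 */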

static inline int shrinker_unit_size(int nr_items)
{
	return (DIV_ROUND_UP(nr_items, SHRINKER_UNIT_BITS) * sizeof(struct shrinker_info_unit *));
}

static inline void shrinker_unit_free(struct shrinker_info *info, int start)
{
	struct shrinker_info_unit **unit;
	int nr, i;

	if (!info)
		return;

	unit = info->unit;
	nr = DIV_ROUND_UP(info->map_nr_max, SHRINKER_UNIT_BITS);

	for (i = start; i < nr; i++) {
		if (!unit[i])
			break;

		kfree(unit[i]);
		unit[i] = NULL;
	}
}

static inline int shrinker_unit_alloc(struct shrinker_info *new,
				      struct shrinker_info *old, int nid)
{
	struct shrinker_info_unit *unit;
	int nr = DIV_ROUND_UP(new->map_nr_max, SHRINKER_UNIT_BITS);
	int start = old ? DIV_ROUND_UP(old->map_nr_max, SHRINKER_UNIT_BITS) : 0;
	int i;

	for (i = start; i < nr; i++) {
		unit = kzalloc_node(sizeof(*unit), GFP_KERNEL, nid);
		if (!unit) {
			shrinker_unit_free(new, start);
			return -ENOMEM;
		}

		new->unit[i] = unit;
	}

	return 0;
}

void free_shrinker_info(struct mem_cgroup *memcg)
{
	struct mem_cgroup_per_node *pn;
	struct shrinker_info *info;
	int nid;

	for_each_node(nid) {
		pn = memcg->nodeinfo[nid];
		info = rcu_dereference_protected(pn->shrinker_info, true);
		shrinker_unit_free(info, 0);
		kvfree(info);
		rcu_assign_pointer(pn->shrinker_info, NULL);
	}
}

int alloc_shrinker_info(struct mem_cgroup *memcg)
{
	int nid, ret = 0;
	int array_size = 0;

	mutex_lock(&shrinker_mutex);
	array_size = shrinker_unit_size(shrinker_nr_max);
	for_each_node(nid) {
		struct shrinker_info *info = kvzalloc_node(sizeof(*info) + array_size,
							   GFP_KERNEL, nid);
		if (!info)
			goto err;
		info->map_nr_max = shrinker_nr_max;
		if (shrinker_unit_alloc(info, NULL, nid)) {
			kvfree(info);
			goto err;
		}
		rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_info, info);
	}
	mutex_unlock(&shrinker_mutex);

	return ret;

err:
	mutex_unlock(&shrinker_mutex);
	free_shrinker_info(memcg);
	return -ENOMEM;
}

static struct shrinker_info *shrinker_info_protected(struct mem_cgroup *memcg,
						     int nid)
{
	return rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_info,
					 lockdep_is_held(&shrinker_mutex));
}

static int expand_one_shrinker_info(struct mem_cgroup *memcg, int new_size,
				    int old_size, int new_nr_max)
{
	struct shrinker_info *new, *old;
	struct mem_cgroup_per_node *pn;
	int nid;

	for_each_node(nid) {
		pn = memcg->nodeinfo[nid];
		old = shrinker_info_protected(memcg, nid);
		/* Not yet online memcg */
		if (!old)
			return 0;

		/* Already expanded this shrinker_info */
		if (new_nr_max <= old->map_nr_max)
			continue;

		new = kvzalloc_node(sizeof(*new) + new_size, GFP_KERNEL, nid);
		if (!new)
			return -ENOMEM;

		new->map_nr_max = new_nr_max;

		memcpy(new->unit, old->unit, old_size);
		if (shrinker_unit_alloc(new, old, nid)) {
			kvfree(new);
			return -ENOMEM;
		}

		rcu_assign_pointer(pn->shrinker_info, new);
		kvfree_rcu(old, rcu);
	}

	return 0;
}

static int expand_shrinker_info(int new_id)
{
	int ret = 0;
	int new_nr_max = round_up(new_id + 1, SHRINKER_UNIT_BITS);
	int new_size, old_size = 0;
	struct mem_cgroup *memcg;

	if (!root_mem_cgroup)
		goto out;

	lockdep_assert_held(&shrinker_mutex);

	new_size = shrinker_unit_size(new_nr_max);
	old_size = shrinker_unit_size(shrinker_nr_max);

	memcg = mem_cgroup_iter(NULL, NULL, NULL);
	do {
		ret = expand_one_shrinker_info(memcg, new_size, old_size,
					       new_nr_max);
		if (ret) {
			mem_cgroup_iter_break(NULL, memcg);
			goto out;
		}
	} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
out:
	if (!ret)
		shrinker_nr_max = new_nr_max;

	return ret;
}

static inline int shrinker_id_to_index(int shrinker_id)
{
	return shrinker_id / SHRINKER_UNIT_BITS;
}

static inline int shrinker_id_to_offset(int shrinker_id)
{
	return shrinker_id % SHRINKER_UNIT_BITS;
}

static inline int calc_shrinker_id(int index, int offset)
{
	return index * SHRINKER_UNIT_BITS + offset;
}
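
/*
 * A quick worked example of the id <-> (index, offset) mapping, assuming
 * SHRINKER_UNIT_BITS == 64 (BITS_PER_LONG on a 64-bit kernel):
 *
 *	shrinker_id_to_index(70)  == 70 / 64 == 1
 *	shrinker_id_to_offset(70) == 70 % 64 == 6
 *	calc_shrinker_id(1, 6)    == 1 * 64 + 6 == 70
 *
 * i.e. shrinker id 70 lives in bit 6 of info->unit[1]->map, and its deferred
 * count lives in info->unit[1]->nr_deferred[6].
 */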

void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id)
{
	if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) {
		struct shrinker_info *info;
		struct shrinker_info_unit *unit;

		rcu_read_lock();
		info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info);
		unit = info->unit[shrinker_id_to_index(shrinker_id)];
		if (!WARN_ON_ONCE(shrinker_id >= info->map_nr_max)) {
			/* Pairs with smp mb in shrink_slab() */
			smp_mb__before_atomic();
			set_bit(shrinker_id_to_offset(shrinker_id), unit->map);
		}
		rcu_read_unlock();
	}
}
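
/*
 * A minimal sketch of how a memcg-aware cache is expected to use
 * set_shrinker_bit(): when an object is added to a previously empty per-memcg
 * list (list_lru does this internally), set the bit for the owning shrinker
 * so that shrink_slab_memcg() will visit it on the next reclaim pass. The
 * names below (my_cache_add, my_shrinker) are illustrative, not kernel APIs:
 *
 *	static void my_cache_add(struct my_cache *cache, struct my_obj *obj,
 *				 struct mem_cgroup *memcg, int nid)
 *	{
 *		bool was_empty = list_empty(&cache->list[nid]);
 *
 *		list_add_tail(&obj->node, &cache->list[nid]);
 *		if (was_empty)
 *			set_shrinker_bit(memcg, nid, my_shrinker->id);
 *	}
 */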

static DEFINE_IDR(shrinker_idr);

static int shrinker_memcg_alloc(struct shrinker *shrinker)
{
	int id, ret = -ENOMEM;

	if (mem_cgroup_disabled())
		return -ENOSYS;

	mutex_lock(&shrinker_mutex);
	id = idr_alloc(&shrinker_idr, shrinker, 0, 0, GFP_KERNEL);
	if (id < 0)
		goto unlock;

	if (id >= shrinker_nr_max) {
		if (expand_shrinker_info(id)) {
			idr_remove(&shrinker_idr, id);
			goto unlock;
		}
	}
	shrinker->id = id;
	ret = 0;
unlock:
	mutex_unlock(&shrinker_mutex);
	return ret;
}

static void shrinker_memcg_remove(struct shrinker *shrinker)
{
	int id = shrinker->id;

	BUG_ON(id < 0);

	lockdep_assert_held(&shrinker_mutex);

	idr_remove(&shrinker_idr, id);
}

static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker,
				   struct mem_cgroup *memcg)
{
	struct shrinker_info *info;
	struct shrinker_info_unit *unit;
	long nr_deferred;

	rcu_read_lock();
	info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info);
	unit = info->unit[shrinker_id_to_index(shrinker->id)];
	nr_deferred = atomic_long_xchg(&unit->nr_deferred[shrinker_id_to_offset(shrinker->id)], 0);
	rcu_read_unlock();

	return nr_deferred;
}

static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker,
				  struct mem_cgroup *memcg)
{
	struct shrinker_info *info;
	struct shrinker_info_unit *unit;
	long nr_deferred;

	rcu_read_lock();
	info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info);
	unit = info->unit[shrinker_id_to_index(shrinker->id)];
	nr_deferred =
		atomic_long_add_return(nr, &unit->nr_deferred[shrinker_id_to_offset(shrinker->id)]);
	rcu_read_unlock();

	return nr_deferred;
}

void reparent_shrinker_deferred(struct mem_cgroup *memcg)
{
	int nid, index, offset;
	long nr;
	struct mem_cgroup *parent;
	struct shrinker_info *child_info, *parent_info;
	struct shrinker_info_unit *child_unit, *parent_unit;

	parent = parent_mem_cgroup(memcg);
	if (!parent)
		parent = root_mem_cgroup;

	/* Prevent from concurrent shrinker_info expand */
	mutex_lock(&shrinker_mutex);
	for_each_node(nid) {
		child_info = shrinker_info_protected(memcg, nid);
		parent_info = shrinker_info_protected(parent, nid);
		for (index = 0; index < shrinker_id_to_index(child_info->map_nr_max); index++) {
			child_unit = child_info->unit[index];
			parent_unit = parent_info->unit[index];
			for (offset = 0; offset < SHRINKER_UNIT_BITS; offset++) {
				nr = atomic_long_read(&child_unit->nr_deferred[offset]);
				atomic_long_add(nr, &parent_unit->nr_deferred[offset]);
			}
		}
	}
	mutex_unlock(&shrinker_mutex);
}

#else /* !CONFIG_MEMCG */
static int shrinker_memcg_alloc(struct shrinker *shrinker)
{
	return -ENOSYS;
}

static void shrinker_memcg_remove(struct shrinker *shrinker)
{
}

static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker,
				   struct mem_cgroup *memcg)
{
	return 0;
}

static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker,
				  struct mem_cgroup *memcg)
{
	return 0;
}
#endif /* CONFIG_MEMCG */

static long xchg_nr_deferred(struct shrinker *shrinker,
			     struct shrink_control *sc)
{
	int nid = sc->nid;

	if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
		nid = 0;

	if (sc->memcg &&
	    (shrinker->flags & SHRINKER_MEMCG_AWARE))
		return xchg_nr_deferred_memcg(nid, shrinker,
					      sc->memcg);

	return atomic_long_xchg(&shrinker->nr_deferred[nid], 0);
}

static long add_nr_deferred(long nr, struct shrinker *shrinker,
			    struct shrink_control *sc)
{
	int nid = sc->nid;

	if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
		nid = 0;

	if (sc->memcg &&
	    (shrinker->flags & SHRINKER_MEMCG_AWARE))
		return add_nr_deferred_memcg(nr, nid, shrinker,
					     sc->memcg);

	return atomic_long_add_return(nr, &shrinker->nr_deferred[nid]);
}

#define SHRINK_BATCH 128

static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
				    struct shrinker *shrinker, int priority)
{
	unsigned long freed = 0;
	unsigned long long delta;
	long total_scan;
	long freeable;
	long nr;
	long new_nr;
	long batch_size = shrinker->batch ? shrinker->batch
					  : SHRINK_BATCH;
	long scanned = 0, next_deferred;

	freeable = shrinker->count_objects(shrinker, shrinkctl);
	if (freeable == 0 || freeable == SHRINK_EMPTY)
		return freeable;

	/*
	 * copy the current shrinker scan count into a local variable
	 * and zero it so that other concurrent shrinker invocations
	 * don't also do this scanning work.
	 */
	nr = xchg_nr_deferred(shrinker, shrinkctl);

	if (shrinker->seeks) {
		delta = freeable >> priority;
		delta *= 4;
		do_div(delta, shrinker->seeks);
	} else {
		/*
		 * These objects don't require any IO to create. Trim
		 * them aggressively under memory pressure to keep
		 * them from causing refetches in the IO caches.
		 */
		delta = freeable / 2;
	}

	total_scan = nr >> priority;
	total_scan += delta;
	total_scan = min(total_scan, (2 * freeable));

	trace_mm_shrink_slab_start(shrinker, shrinkctl, nr,
				   freeable, delta, total_scan, priority);

	/*
	 * Normally, we should not scan less than batch_size objects in one
	 * pass to avoid too frequent shrinker calls, but if the slab has less
	 * than batch_size objects in total and we are really tight on memory,
	 * we will try to reclaim all available objects, otherwise we can end
	 * up failing allocations although there are plenty of reclaimable
	 * objects spread over several slabs with usage less than the
	 * batch_size.
	 *
	 * We detect the "tight on memory" situations by looking at the total
	 * number of objects we want to scan (total_scan). If it is greater
	 * than the total number of objects on slab (freeable), we must be
	 * scanning at high prio and therefore should try to reclaim as much as
	 * possible.
	 */
	while (total_scan >= batch_size ||
	       total_scan >= freeable) {
		unsigned long ret;
		unsigned long nr_to_scan = min(batch_size, total_scan);

		shrinkctl->nr_to_scan = nr_to_scan;
		shrinkctl->nr_scanned = nr_to_scan;
		ret = shrinker->scan_objects(shrinker, shrinkctl);
		if (ret == SHRINK_STOP)
			break;
		freed += ret;

		count_vm_events(SLABS_SCANNED, shrinkctl->nr_scanned);
		total_scan -= shrinkctl->nr_scanned;
		scanned += shrinkctl->nr_scanned;

		cond_resched();
	}

	/*
	 * The deferred work is increased by any new work (delta) that wasn't
	 * done, decreased by old deferred work that was done now.
	 *
	 * And it is capped to two times of the freeable items.
	 */
	next_deferred = max_t(long, (nr + delta - scanned), 0);
	next_deferred = min(next_deferred, (2 * freeable));

	/*
	 * move the unused scan count back into the shrinker in a
	 * manner that handles concurrent updates.
	 */
	new_nr = add_nr_deferred(next_deferred, shrinker, shrinkctl);

	trace_mm_shrink_slab_end(shrinker, shrinkctl->nid, freed, nr, new_nr, total_scan);
	return freed;
}
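
/*
 * A worked example of the scan-target arithmetic above (illustrative numbers,
 * not taken from any particular workload): with freeable == 10000 objects,
 * priority == DEF_PRIORITY (12), seeks == DEFAULT_SEEKS (2) and no deferred
 * work (nr == 0):
 *
 *	delta      = (10000 >> 12) * 4 / 2 = 2 * 4 / 2 = 4
 *	total_scan = (0 >> 12) + 4         = 4
 *
 * 4 is far below SHRINK_BATCH (128) and below freeable, so nothing is scanned
 * on this pass; the 4 objects are stored back via add_nr_deferred() and get
 * added to the work of a later, higher-pressure pass. At priority 0 the same
 * cache would get delta = 10000 * 4 / 2 = 20000, which hits the 2 * freeable
 * cap on total_scan.
 */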

#ifdef CONFIG_MEMCG
static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
			struct mem_cgroup *memcg, int priority)
{
	struct shrinker_info *info;
	unsigned long ret, freed = 0;
	int offset, index = 0;

	if (!mem_cgroup_online(memcg))
		return 0;

	/*
	 * lockless algorithm of memcg shrink.
	 *
	 * The shrinker_info may be freed asynchronously via RCU in the
	 * expand_one_shrinker_info(), so the rcu_read_lock() needs to be used
	 * to ensure the existence of the shrinker_info.
	 *
	 * The shrinker_info_unit is never freed unless its corresponding memcg
	 * is destroyed. Here we already hold the refcount of memcg, so the
	 * memcg will not be destroyed, and of course shrinker_info_unit will
	 * not be freed.
	 *
	 * So in the memcg shrink:
	 * step 1: use rcu_read_lock() to guarantee existence of the
	 *         shrinker_info.
	 * step 2: after getting shrinker_info_unit we can safely release the
	 *         RCU lock.
	 * step 3: traverse the bitmap and calculate shrinker_id
	 * step 4: use rcu_read_lock() to guarantee existence of the shrinker.
	 * step 5: use shrinker_id to find the shrinker, then use
	 *         shrinker_try_get() to guarantee existence of the shrinker,
	 *         then we can release the RCU lock to do do_shrink_slab() that
	 *         may sleep.
	 * step 6: do shrinker_put() paired with step 5 to put the refcount,
	 *         if the refcount reaches 0, then wake up the waiter in
	 *         shrinker_free() by calling complete().
	 *         Note: here is different from the global shrink, we don't
	 *               need to acquire the RCU lock to guarantee existence of
	 *               the shrinker, because we don't need to use this
	 *               shrinker to traverse the next shrinker in the bitmap.
	 * step 7: we have already exited the read-side of rcu critical section
	 *         before calling do_shrink_slab(), the shrinker_info may be
	 *         released in expand_one_shrinker_info(), so go back to step 1
	 *         to reacquire the shrinker_info.
	 */
again:
	rcu_read_lock();
	info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info);
	if (unlikely(!info))
		goto unlock;

	if (index < shrinker_id_to_index(info->map_nr_max)) {
		struct shrinker_info_unit *unit;

		unit = info->unit[index];

		rcu_read_unlock();

		for_each_set_bit(offset, unit->map, SHRINKER_UNIT_BITS) {
			struct shrink_control sc = {
				.gfp_mask = gfp_mask,
				.nid = nid,
				.memcg = memcg,
			};
			struct shrinker *shrinker;
			int shrinker_id = calc_shrinker_id(index, offset);

			rcu_read_lock();
			shrinker = idr_find(&shrinker_idr, shrinker_id);
			if (unlikely(!shrinker || !shrinker_try_get(shrinker))) {
				clear_bit(offset, unit->map);
				rcu_read_unlock();
				continue;
			}
			rcu_read_unlock();

			/* Call non-slab shrinkers even though kmem is disabled */
			if (!memcg_kmem_online() &&
			    !(shrinker->flags & SHRINKER_NONSLAB))
				continue;

			ret = do_shrink_slab(&sc, shrinker, priority);
			if (ret == SHRINK_EMPTY) {
				clear_bit(offset, unit->map);
				/*
				 * After the shrinker reported that it had no objects to
				 * free, but before we cleared the corresponding bit in
				 * the memcg shrinker map, a new object might have been
				 * added. To make sure, we have the bit set in this
				 * case, we invoke the shrinker one more time and reset
				 * the bit if it reports that it is not empty anymore.
				 * The memory barrier here pairs with the barrier in
				 * set_shrinker_bit():
				 *
				 * list_lru_add()     shrink_slab_memcg()
				 *   list_add_tail()    clear_bit()
				 *   <MB>               <MB>
				 *   set_bit()          do_shrink_slab()
				 */
				smp_mb__after_atomic();
				ret = do_shrink_slab(&sc, shrinker, priority);
				if (ret == SHRINK_EMPTY)
					ret = 0;
				else
					set_shrinker_bit(memcg, nid, shrinker_id);
			}
			freed += ret;
			shrinker_put(shrinker);
		}

		index++;
		goto again;
	}
unlock:
	rcu_read_unlock();
	return freed;
}

#else /* !CONFIG_MEMCG */
static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
			struct mem_cgroup *memcg, int priority)
{
	return 0;
}
#endif /* CONFIG_MEMCG */

/**
 * shrink_slab - shrink slab caches
 * @gfp_mask: allocation context
 * @nid: node whose slab caches to target
 * @memcg: memory cgroup whose slab caches to target
 * @priority: the reclaim priority
 *
 * Call the shrink functions to age shrinkable caches.
 *
 * @nid is passed along to shrinkers with SHRINKER_NUMA_AWARE set,
 * unaware shrinkers will receive a node id of 0 instead.
 *
 * @memcg specifies the memory cgroup to target. Unaware shrinkers
 * are called only if it is the root cgroup.
 *
 * @priority is sc->priority, we take the number of objects and >> by priority
 * in order to get the scan target.
 *
 * Returns the number of reclaimed slab objects.
 */
unsigned long shrink_slab(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg,
			  int priority)
{
	unsigned long ret, freed = 0;
	struct shrinker *shrinker;

	/*
	 * The root memcg might be allocated even though memcg is disabled
	 * via "cgroup_disable=memory" boot parameter. This could make
	 * mem_cgroup_is_root() return false, then just run memcg slab
	 * shrink, but skip global shrink. This may result in premature
	 * oom.
	 */
	if (!mem_cgroup_disabled() && !mem_cgroup_is_root(memcg))
		return shrink_slab_memcg(gfp_mask, nid, memcg, priority);

	/*
	 * lockless algorithm of global shrink.
	 *
	 * In the unregistration step, the shrinker will be freed asynchronously
	 * via RCU after its refcount reaches 0. So both rcu_read_lock() and
	 * shrinker_try_get() can be used to ensure the existence of the shrinker.
	 *
	 * So in the global shrink:
	 * step 1: use rcu_read_lock() to guarantee existence of the shrinker
	 *         and the validity of the shrinker_list walk.
	 * step 2: use shrinker_try_get() to try get the refcount, if successful,
	 *         then the existence of the shrinker can also be guaranteed,
	 *         so we can release the RCU lock to do do_shrink_slab() that
	 *         may sleep.
	 * step 3: *MUST* reacquire the RCU lock before calling shrinker_put(),
	 *         which ensures that neither this shrinker nor the next shrinker
	 *         will be freed in the next traversal operation.
	 * step 4: do shrinker_put() paired with step 2 to put the refcount,
	 *         if the refcount reaches 0, then wake up the waiter in
	 *         shrinker_free() by calling complete().
	 */
	rcu_read_lock();
	list_for_each_entry_rcu(shrinker, &shrinker_list, list) {
		struct shrink_control sc = {
			.gfp_mask = gfp_mask,
			.nid = nid,
			.memcg = memcg,
		};

		if (!shrinker_try_get(shrinker))
			continue;

		rcu_read_unlock();

		ret = do_shrink_slab(&sc, shrinker, priority);
		if (ret == SHRINK_EMPTY)
			ret = 0;
		freed += ret;

		rcu_read_lock();
		shrinker_put(shrinker);
	}

	rcu_read_unlock();
	cond_resched();
	return freed;
}

struct shrinker *shrinker_alloc(unsigned int flags, const char *fmt, ...)
{
	struct shrinker *shrinker;
	unsigned int size;
	va_list ap;
	int err;

	shrinker = kzalloc(sizeof(struct shrinker), GFP_KERNEL);
	if (!shrinker)
		return NULL;

	va_start(ap, fmt);
	err = shrinker_debugfs_name_alloc(shrinker, fmt, ap);
	va_end(ap);
	if (err)
		goto err_name;

	shrinker->flags = flags | SHRINKER_ALLOCATED;
	shrinker->seeks = DEFAULT_SEEKS;

	if (flags & SHRINKER_MEMCG_AWARE) {
		err = shrinker_memcg_alloc(shrinker);
		if (err == -ENOSYS) {
			/* Memcg is not supported, fallback to non-memcg-aware shrinker. */
			shrinker->flags &= ~SHRINKER_MEMCG_AWARE;
			goto non_memcg;
		}

		if (err)
			goto err_flags;

		return shrinker;
	}

non_memcg:
	/*
	 * The nr_deferred is available on per memcg level for memcg aware
	 * shrinkers, so only allocate nr_deferred in the following cases:
	 *  - non-memcg-aware shrinkers
	 *  - !CONFIG_MEMCG
	 *  - memcg is disabled by kernel command line
	 */
	size = sizeof(*shrinker->nr_deferred);
	if (flags & SHRINKER_NUMA_AWARE)
		size *= nr_node_ids;

	shrinker->nr_deferred = kzalloc(size, GFP_KERNEL);
	if (!shrinker->nr_deferred)
		goto err_flags;

	return shrinker;

err_flags:
	shrinker_debugfs_name_free(shrinker);
err_name:
	kfree(shrinker);
	return NULL;
}
EXPORT_SYMBOL_GPL(shrinker_alloc);

void shrinker_register(struct shrinker *shrinker)
{
	if (unlikely(!(shrinker->flags & SHRINKER_ALLOCATED))) {
		pr_warn("Must use shrinker_alloc() to dynamically allocate the shrinker");
		return;
	}

	mutex_lock(&shrinker_mutex);
	list_add_tail_rcu(&shrinker->list, &shrinker_list);
	shrinker->flags |= SHRINKER_REGISTERED;
	shrinker_debugfs_add(shrinker);
	mutex_unlock(&shrinker_mutex);

	init_completion(&shrinker->done);
	/*
	 * Now the shrinker is fully set up, take the first reference to it to
	 * indicate that lookup operations are now allowed to use it via
	 * shrinker_try_get().
	 */
	refcount_set(&shrinker->refcount, 1);
}
EXPORT_SYMBOL_GPL(shrinker_register);

static void shrinker_free_rcu_cb(struct rcu_head *head)
{
	struct shrinker *shrinker = container_of(head, struct shrinker, rcu);

	kfree(shrinker->nr_deferred);
	kfree(shrinker);
}

void shrinker_free(struct shrinker *shrinker)
{
	struct dentry *debugfs_entry = NULL;
	int debugfs_id;

	if (!shrinker)
		return;

	if (shrinker->flags & SHRINKER_REGISTERED) {
		/* drop the initial refcount */
		shrinker_put(shrinker);
		/*
		 * Wait for all lookups of the shrinker to complete, after that,
		 * no shrinker is running or will run again, then we can safely
		 * free it asynchronously via RCU and safely free the structure
		 * where the shrinker is located, such as super_block etc.
		 */
		wait_for_completion(&shrinker->done);
	}

	mutex_lock(&shrinker_mutex);
	if (shrinker->flags & SHRINKER_REGISTERED) {
		/*
		 * Now we can safely remove it from the shrinker_list and then
		 * free it.
		 */
		list_del_rcu(&shrinker->list);
		debugfs_entry = shrinker_debugfs_detach(shrinker, &debugfs_id);
		shrinker->flags &= ~SHRINKER_REGISTERED;
	}

	shrinker_debugfs_name_free(shrinker);

	if (shrinker->flags & SHRINKER_MEMCG_AWARE)
		shrinker_memcg_remove(shrinker);
	mutex_unlock(&shrinker_mutex);

	if (debugfs_entry)
		shrinker_debugfs_remove(debugfs_entry, debugfs_id);

	call_rcu(&shrinker->rcu, shrinker_free_rcu_cb);
}
EXPORT_SYMBOL_GPL(shrinker_free);
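
/*
 * A minimal sketch of the intended allocate/register/free lifecycle from a
 * shrinker user's point of view, assuming hypothetical my_count()/my_scan()
 * callbacks and a my_cache owner structure (none of these names are kernel
 * APIs):
 *
 *	static unsigned long my_count(struct shrinker *s, struct shrink_control *sc)
 *	{
 *		unsigned long nr = my_cache_count(s->private_data, sc);
 *
 *		return nr ? nr : SHRINK_EMPTY;
 *	}
 *
 *	static unsigned long my_scan(struct shrinker *s, struct shrink_control *sc)
 *	{
 *		return my_cache_reclaim(s->private_data, sc->nr_to_scan);
 *	}
 *
 *	int my_cache_init(struct my_cache *cache)
 *	{
 *		cache->shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE |
 *						 SHRINKER_MEMCG_AWARE, "my-cache");
 *		if (!cache->shrinker)
 *			return -ENOMEM;
 *
 *		cache->shrinker->count_objects = my_count;
 *		cache->shrinker->scan_objects = my_scan;
 *		cache->shrinker->private_data = cache;
 *
 *		shrinker_register(cache->shrinker);
 *		return 0;
 *	}
 *
 *	void my_cache_exit(struct my_cache *cache)
 *	{
 *		shrinker_free(cache->shrinker);
 *	}
 */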