// SPDX-License-Identifier: GPL-2.0-or-later

#include "cpuset-internal.h"

/*
 * Legacy hierarchy call to cgroup_transfer_tasks() is handled asynchronously
 */
struct cpuset_remove_tasks_struct {
	struct work_struct work;
	struct cpuset *cs;
};

/*
 * Frequency meter - How fast is some event occurring?
 *
 * These routines manage a digitally filtered, constant time based,
 * event frequency meter.  There are four routines:
 *    fmeter_init() - initialize a frequency meter.
 *    fmeter_markevent() - called each time the event happens.
 *    fmeter_getrate() - returns the recent rate of such events.
 *    fmeter_update() - internal routine used to update fmeter.
 *
 * A common data structure is passed to each of these routines,
 * which is used to keep track of the state required to manage the
 * frequency meter and its digital filter.
 *
 * The filter works on the number of events marked per unit time.
 * The filter is single-pole low-pass recursive (IIR).  The time unit
 * is 1 second.  Arithmetic is done using 32-bit integers scaled to
 * simulate 3 decimal digits of precision (multiplied by 1000).
 *
 * With an FM_COEF of 933, and a time base of 1 second, the filter
 * has a half-life of 10 seconds, meaning that if the events quit
 * happening, then the rate returned from fmeter_getrate()
 * will be cut in half each 10 seconds, until it converges to zero.
 *
 * It is not worth doing a real infinitely recursive filter.  If more
 * than FM_MAXTICKS ticks have elapsed since the last filter event,
 * just compute FM_MAXTICKS ticks worth, by which point the level
 * will be stable.
 *
 * Limit the count of unprocessed events to FM_MAXCNT, so as to avoid
 * arithmetic overflow in the fmeter_update() routine.
 *
 * Given the simple 32 bit integer arithmetic used, this meter works
 * best for reporting rates between one per millisecond (msec) and
 * one per 32 (approx) seconds.  At constant rates faster than one
 * per msec it maxes out at values just under 1,000,000.  At constant
 * rates between one per msec, and one per second it will stabilize
 * to a value N*1000, where N is the rate of events per second.
 * At constant rates between one per second and one per 32 seconds,
 * it will be choppy, moving up on the seconds that have an event,
 * and then decaying until the next event.  At rates slower than
 * about one in 32 seconds, it decays all the way back to zero between
 * each event.
 */

#define FM_COEF 933		/* coefficient for half-life of 10 secs */
#define FM_MAXTICKS ((u32)99)	/* useless computing more ticks than this */
#define FM_MAXCNT 1000000	/* limit cnt to avoid overflow */
#define FM_SCALE 1000		/* faux fixed point scale */

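/*
 * Worked example of the half-life claim above: every elapsed second the
 * level is multiplied by FM_COEF / FM_SCALE = 0.933, and 0.933^10 ~= 0.50,
 * so ten idle seconds cut the reported rate roughly in half.
 */
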
/* Initialize a frequency meter */
void fmeter_init(struct fmeter *fmp)
{
	fmp->cnt = 0;
	fmp->val = 0;
	fmp->time = 0;
	spin_lock_init(&fmp->lock);
}

/* Internal meter update - process cnt events and update value */
static void fmeter_update(struct fmeter *fmp)
{
	time64_t now;
	u32 ticks;

	now = ktime_get_seconds();
	ticks = now - fmp->time;

	if (ticks == 0)
		return;

	ticks = min(FM_MAXTICKS, ticks);
	while (ticks-- > 0)
		fmp->val = (FM_COEF * fmp->val) / FM_SCALE;
	fmp->time = now;

	fmp->val += ((FM_SCALE - FM_COEF) * fmp->cnt) / FM_SCALE;
	fmp->cnt = 0;
}

/* Process any previous ticks, then bump cnt by one (times scale). */
static void fmeter_markevent(struct fmeter *fmp)
{
	spin_lock(&fmp->lock);
	fmeter_update(fmp);
	fmp->cnt = min(FM_MAXCNT, fmp->cnt + FM_SCALE);
	spin_unlock(&fmp->lock);
}

/* Process any previous ticks, then return current value. */
static int fmeter_getrate(struct fmeter *fmp)
{
	int val;

	spin_lock(&fmp->lock);
	fmeter_update(fmp);
	val = fmp->val;
	spin_unlock(&fmp->lock);
	return val;
}

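/*
 * Illustrative user-space sketch, kept out of the build: a minimal model of
 * the same single-pole IIR meter, showing the behaviour described in the
 * comment block above (a steady rate of N events per second settles near
 * N * 1000).  The names and constants below mirror the FM_* values but are
 * example-only, not part of the kernel interface.
 */
#if 0
#include <stdio.h>

struct demo_fmeter {
	int cnt;	/* unprocessed events, scaled by 1000 */
	int val;	/* filtered rate, scaled by 1000 */
};

/* one filter tick: decay the old level, then fold in the new events */
static void demo_tick(struct demo_fmeter *f)
{
	f->val = (933 * f->val) / 1000;
	f->val += ((1000 - 933) * f->cnt) / 1000;
	f->cnt = 0;
}

int main(void)
{
	struct demo_fmeter f = { 0, 0 };
	int sec;

	for (sec = 0; sec < 60; sec++) {
		f.cnt += 5 * 1000;	/* five events in this second */
		demo_tick(&f);
	}
	printf("rate = %d\n", f.val);	/* prints a value close to 5000 */
	return 0;
}
#endif
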
/*
 * Collection of memory_pressure is suppressed unless
 * this flag is enabled by writing "1" to the special
 * cpuset file 'memory_pressure_enabled' in the root cpuset.
 */

int cpuset_memory_pressure_enabled __read_mostly;

/*
 * __cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims.
 *
 * Keep a running average of the rate of synchronous (direct)
 * page reclaim efforts initiated by tasks in each cpuset.
 *
 * This represents the rate at which some task in the cpuset
 * ran low on memory on all nodes it was allowed to use, and
 * had to enter the kernel's page reclaim code in an effort to
 * create more free memory by tossing clean pages or swapping
 * or writing dirty pages.
 *
 * Display to user space in the per-cpuset read-only file
 * "memory_pressure".  Value displayed is an integer
 * representing the recent rate of entry into the synchronous
 * (direct) page reclaim by any task attached to the cpuset.
 */

void __cpuset_memory_pressure_bump(void)
{
	rcu_read_lock();
	fmeter_markevent(&task_cs(current)->fmeter);
	rcu_read_unlock();
}

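/*
 * Illustrative user-space sketch, kept out of the build: reading the
 * per-cpuset "memory_pressure" file and converting the value back to
 * reclaims per second.  The path below assumes the legacy cpuset
 * controller is mounted at /sys/fs/cgroup/cpuset with the default
 * "cpuset." prefix, and that memory_pressure_enabled has been set in
 * the root cpuset; adjust both for the local system.
 */
#if 0
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/sys/fs/cgroup/cpuset/cpuset.memory_pressure", "r");
	int rate;

	if (!f)
		return 1;
	if (fscanf(f, "%d", &rate) == 1)
		/* the value is scaled by FM_SCALE: 1000 == one reclaim/sec */
		printf("direct reclaims: ~%d.%03d per second\n",
		       rate / 1000, rate % 1000);
	fclose(f);
	return 0;
}
#endif
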
static int update_relax_domain_level(struct cpuset *cs, s64 val)
{
#ifdef CONFIG_SMP
	if (val < -1 || val > sched_domain_level_max + 1)
		return -EINVAL;
#endif

	if (val != cs->relax_domain_level) {
		cs->relax_domain_level = val;
		if (!cpumask_empty(cs->cpus_allowed) &&
		    is_sched_load_balance(cs))
			rebuild_sched_domains_locked();
	}

	return 0;
}

static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
			    s64 val)
{
	struct cpuset *cs = css_cs(css);
	cpuset_filetype_t type = cft->private;
	int retval = -ENODEV;

	cpus_read_lock();
	cpuset_lock();
	if (!is_cpuset_online(cs))
		goto out_unlock;

	switch (type) {
	case FILE_SCHED_RELAX_DOMAIN_LEVEL:
		retval = update_relax_domain_level(cs, val);
		break;
	default:
		retval = -EINVAL;
		break;
	}
out_unlock:
	cpuset_unlock();
	cpus_read_unlock();
	return retval;
}

static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft)
{
	struct cpuset *cs = css_cs(css);
	cpuset_filetype_t type = cft->private;

	switch (type) {
	case FILE_SCHED_RELAX_DOMAIN_LEVEL:
		return cs->relax_domain_level;
	default:
		BUG();
	}

	/* Unreachable but makes gcc happy */
	return 0;
}

/*
 * update task's spread flag if cpuset's page/slab spread flag is set
 *
 * Call with callback_lock or cpuset_mutex held. The check can be skipped
 * if on default hierarchy.
 */
void cpuset1_update_task_spread_flags(struct cpuset *cs,
					struct task_struct *tsk)
{
	if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys))
		return;

	if (is_spread_page(cs))
		task_set_spread_page(tsk);
	else
		task_clear_spread_page(tsk);

	if (is_spread_slab(cs))
		task_set_spread_slab(tsk);
	else
		task_clear_spread_slab(tsk);
}

/**
 * cpuset1_update_tasks_flags - update the spread flags of tasks in the cpuset.
 * @cs: the cpuset in which each task's spread flags needs to be changed
 *
 * Iterate through each task of @cs updating its spread flags.  As this
 * function is called with cpuset_mutex held, cpuset membership stays
 * stable.
 */
void cpuset1_update_tasks_flags(struct cpuset *cs)
{
	struct css_task_iter it;
	struct task_struct *task;

	css_task_iter_start(&cs->css, 0, &it);
	while ((task = css_task_iter_next(&it)))
		cpuset1_update_task_spread_flags(cs, task);
	css_task_iter_end(&it);
}

/*
 * If CPU and/or memory hotplug handlers, below, unplug any CPUs
 * or memory nodes, we need to walk over the cpuset hierarchy,
 * removing that CPU or node from all cpusets.  If this removes the
 * last CPU or node from a cpuset, then move the tasks in the empty
 * cpuset to its next-highest non-empty parent.
 */
static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
{
	struct cpuset *parent;

	/*
	 * Find its next-highest non-empty parent (the top cpuset
	 * has online cpus, so it can't be empty).
	 */
	parent = parent_cs(cs);
	while (cpumask_empty(parent->cpus_allowed) ||
			nodes_empty(parent->mems_allowed))
		parent = parent_cs(parent);

	if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) {
		pr_err("cpuset: failed to transfer tasks out of empty cpuset ");
		pr_cont_cgroup_name(cs->css.cgroup);
		pr_cont("\n");
	}
}

static void cpuset_migrate_tasks_workfn(struct work_struct *work)
{
	struct cpuset_remove_tasks_struct *s;

	s = container_of(work, struct cpuset_remove_tasks_struct, work);
	remove_tasks_in_empty_cpuset(s->cs);
	css_put(&s->cs->css);
	kfree(s);
}

void cpuset1_hotplug_update_tasks(struct cpuset *cs,
			struct cpumask *new_cpus, nodemask_t *new_mems,
			bool cpus_updated, bool mems_updated)
{
	bool is_empty;

	cpuset_callback_lock_irq();
	cpumask_copy(cs->cpus_allowed, new_cpus);
	cpumask_copy(cs->effective_cpus, new_cpus);
	cs->mems_allowed = *new_mems;
	cs->effective_mems = *new_mems;
	cpuset_callback_unlock_irq();

	/*
	 * Don't call cpuset_update_tasks_cpumask() if the cpuset becomes empty,
	 * as the tasks will be migrated to an ancestor.
	 */
	if (cpus_updated && !cpumask_empty(cs->cpus_allowed))
		cpuset_update_tasks_cpumask(cs, new_cpus);
	if (mems_updated && !nodes_empty(cs->mems_allowed))
		cpuset_update_tasks_nodemask(cs);

	is_empty = cpumask_empty(cs->cpus_allowed) ||
		   nodes_empty(cs->mems_allowed);

	/*
	 * Move tasks to the nearest ancestor with execution resources.
	 * This is a full cgroup operation that will also call back into
	 * cpuset, so execute it asynchronously via workqueue.
	 */
	if (is_empty && cs->css.cgroup->nr_populated_csets &&
	    css_tryget_online(&cs->css)) {
		struct cpuset_remove_tasks_struct *s;

		s = kzalloc(sizeof(*s), GFP_KERNEL);
		if (WARN_ON_ONCE(!s)) {
			css_put(&cs->css);
			return;
		}

		s->cs = cs;
		INIT_WORK(&s->work, cpuset_migrate_tasks_workfn);
		schedule_work(&s->work);
	}
}

/*
 * is_cpuset_subset(p, q) - Is cpuset p a subset of cpuset q?
 *
 * One cpuset is a subset of another if all its allowed CPUs and
 * Memory Nodes are a subset of the other, and its exclusive flags
 * are only set if the other's are set.  Call holding cpuset_mutex.
 */
static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
{
	/* on the boolean flag tests, "x <= y" reads as "x implies y" */
	return	cpumask_subset(p->cpus_allowed, q->cpus_allowed) &&
		nodes_subset(p->mems_allowed, q->mems_allowed) &&
		is_cpu_exclusive(p) <= is_cpu_exclusive(q) &&
		is_mem_exclusive(p) <= is_mem_exclusive(q);
}

/*
 * cpuset1_validate_change() - Validate conditions specific to legacy (v1)
 *                             behavior.
 */
int cpuset1_validate_change(struct cpuset *cur, struct cpuset *trial)
{
	struct cgroup_subsys_state *css;
	struct cpuset *c, *par;
	int ret;

	WARN_ON_ONCE(!rcu_read_lock_held());

	/* Each of our child cpusets must be a subset of us */
	ret = -EBUSY;
	cpuset_for_each_child(c, css, cur)
		if (!is_cpuset_subset(c, trial))
			goto out;

	/* On legacy hierarchy, we must be a subset of our parent cpuset. */
	ret = -EACCES;
	par = parent_cs(cur);
	if (par && !is_cpuset_subset(trial, par))
		goto out;

	ret = 0;
out:
	return ret;
}

static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft)
{
	struct cpuset *cs = css_cs(css);
	cpuset_filetype_t type = cft->private;

	switch (type) {
	case FILE_CPU_EXCLUSIVE:
		return is_cpu_exclusive(cs);
	case FILE_MEM_EXCLUSIVE:
		return is_mem_exclusive(cs);
	case FILE_MEM_HARDWALL:
		return is_mem_hardwall(cs);
	case FILE_SCHED_LOAD_BALANCE:
		return is_sched_load_balance(cs);
	case FILE_MEMORY_MIGRATE:
		return is_memory_migrate(cs);
	case FILE_MEMORY_PRESSURE_ENABLED:
		return cpuset_memory_pressure_enabled;
	case FILE_MEMORY_PRESSURE:
		return fmeter_getrate(&cs->fmeter);
	case FILE_SPREAD_PAGE:
		return is_spread_page(cs);
	case FILE_SPREAD_SLAB:
		return is_spread_slab(cs);
	default:
		BUG();
	}

	/* Unreachable but makes gcc happy */
	return 0;
}

static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
			    u64 val)
{
	struct cpuset *cs = css_cs(css);
	cpuset_filetype_t type = cft->private;
	int retval = 0;

	cpus_read_lock();
	cpuset_lock();
	if (!is_cpuset_online(cs)) {
		retval = -ENODEV;
		goto out_unlock;
	}

	switch (type) {
	case FILE_CPU_EXCLUSIVE:
		retval = cpuset_update_flag(CS_CPU_EXCLUSIVE, cs, val);
		break;
	case FILE_MEM_EXCLUSIVE:
		retval = cpuset_update_flag(CS_MEM_EXCLUSIVE, cs, val);
		break;
	case FILE_MEM_HARDWALL:
		retval = cpuset_update_flag(CS_MEM_HARDWALL, cs, val);
		break;
	case FILE_SCHED_LOAD_BALANCE:
		retval = cpuset_update_flag(CS_SCHED_LOAD_BALANCE, cs, val);
		break;
	case FILE_MEMORY_MIGRATE:
		retval = cpuset_update_flag(CS_MEMORY_MIGRATE, cs, val);
		break;
	case FILE_MEMORY_PRESSURE_ENABLED:
		cpuset_memory_pressure_enabled = !!val;
		break;
	case FILE_SPREAD_PAGE:
		retval = cpuset_update_flag(CS_SPREAD_PAGE, cs, val);
		break;
	case FILE_SPREAD_SLAB:
		retval = cpuset_update_flag(CS_SPREAD_SLAB, cs, val);
		break;
	default:
		retval = -EINVAL;
		break;
	}
out_unlock:
	cpuset_unlock();
	cpus_read_unlock();
	return retval;
}

/*
 * for the common functions, 'private' gives the type of file
 */

struct cftype cpuset1_files[] = {
	{
		.name = "cpus",
		.seq_show = cpuset_common_seq_show,
		.write = cpuset_write_resmask,
		.max_write_len = (100U + 6 * NR_CPUS),
		.private = FILE_CPULIST,
	},

	{
		.name = "mems",
		.seq_show = cpuset_common_seq_show,
		.write = cpuset_write_resmask,
		.max_write_len = (100U + 6 * MAX_NUMNODES),
		.private = FILE_MEMLIST,
	},

	{
		.name = "effective_cpus",
		.seq_show = cpuset_common_seq_show,
		.private = FILE_EFFECTIVE_CPULIST,
	},

	{
		.name = "effective_mems",
		.seq_show = cpuset_common_seq_show,
		.private = FILE_EFFECTIVE_MEMLIST,
	},

	{
		.name = "cpu_exclusive",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_CPU_EXCLUSIVE,
	},

	{
		.name = "mem_exclusive",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_MEM_EXCLUSIVE,
	},

	{
		.name = "mem_hardwall",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_MEM_HARDWALL,
	},

	{
		.name = "sched_load_balance",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_SCHED_LOAD_BALANCE,
	},

	{
		.name = "sched_relax_domain_level",
		.read_s64 = cpuset_read_s64,
		.write_s64 = cpuset_write_s64,
		.private = FILE_SCHED_RELAX_DOMAIN_LEVEL,
	},

	{
		.name = "memory_migrate",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_MEMORY_MIGRATE,
	},

	{
		.name = "memory_pressure",
		.read_u64 = cpuset_read_u64,
		.private = FILE_MEMORY_PRESSURE,
	},

	{
		.name = "memory_spread_page",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_SPREAD_PAGE,
	},

	{
		/* obsolete, may be removed in the future */
		.name = "memory_spread_slab",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_SPREAD_SLAB,
	},

	{
		.name = "memory_pressure_enabled",
		.flags = CFTYPE_ONLY_ON_ROOT,
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_MEMORY_PRESSURE_ENABLED,
	},

	{ }	/* terminate */
};