4 * Processor and Memory placement constraints for sets of tasks.
6 * Copyright (C) 2003 BULL SA.
7 * Copyright (C) 2004-2007 Silicon Graphics, Inc.
8 * Copyright (C) 2006 Google, Inc
10 * Portions derived from Patrick Mochel's sysfs code.
11 * sysfs is Copyright (c) 2001-3 Patrick Mochel
13 * 2003-10-10 Written by Simon Derr.
14 * 2003-10-22 Updates by Stephen Hemminger.
15 * 2004 May-July Rework by Paul Jackson.
16 * 2006 Rework by Paul Menage to use generic cgroups
17 * 2008 Rework of the scheduler domains and CPU hotplug handling
20 * This file is subject to the terms and conditions of the GNU General Public
21 * License. See the file COPYING in the main directory of the Linux
22 * distribution for more details.
25 #include <linux/cpu.h>
26 #include <linux/cpumask.h>
27 #include <linux/cpuset.h>
28 #include <linux/err.h>
29 #include <linux/errno.h>
30 #include <linux/file.h>
32 #include <linux/init.h>
33 #include <linux/interrupt.h>
34 #include <linux/kernel.h>
35 #include <linux/kmod.h>
36 #include <linux/list.h>
37 #include <linux/mempolicy.h>
39 #include <linux/memory.h>
40 #include <linux/export.h>
41 #include <linux/mount.h>
42 #include <linux/fs_context.h>
43 #include <linux/namei.h>
44 #include <linux/pagemap.h>
45 #include <linux/proc_fs.h>
46 #include <linux/rcupdate.h>
47 #include <linux/sched.h>
48 #include <linux/sched/mm.h>
49 #include <linux/sched/task.h>
50 #include <linux/seq_file.h>
51 #include <linux/security.h>
52 #include <linux/slab.h>
53 #include <linux/spinlock.h>
54 #include <linux/stat.h>
55 #include <linux/string.h>
56 #include <linux/time.h>
57 #include <linux/time64.h>
58 #include <linux/backing-dev.h>
59 #include <linux/sort.h>
60 #include <linux/oom.h>
61 #include <linux/sched/isolation.h>
62 #include <linux/uaccess.h>
63 #include <linux/atomic.h>
64 #include <linux/mutex.h>
65 #include <linux/cgroup.h>
66 #include <linux/wait.h>
/*
 * Static branch keys for cpusets; both start out false.
 * cpusets_enabled_key's reference count is read by nr_cpusets().
 */
DEFINE_STATIC_KEY_FALSE(cpusets_pre_enable_key);
DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key);
71 /* See "Frequency meter" comments, below. */
74 int cnt
; /* unprocessed events count */
75 int val
; /* most recent output value */
76 time64_t time
; /* clock (secs) when val computed */
77 spinlock_t lock
; /* guards read or write of above */
81 struct cgroup_subsys_state css
;
83 unsigned long flags
; /* "unsigned long" so bitops work */
86 * On default hierarchy:
88 * The user-configured masks can only be changed by writing to
89 * cpuset.cpus and cpuset.mems, and won't be limited by the
92 * The effective masks are the real masks that apply to the tasks
93 * in the cpuset. They may be changed if the configured masks are
94 * changed or hotplug happens.
96 * effective_mask == configured_mask & parent's effective_mask,
97 * and if it ends up empty, it will inherit the parent's mask.
100 * On legacy hierarchy:
102 * The user-configured masks are always the same with effective masks.
105 /* user-configured CPUs and Memory Nodes allow to tasks */
106 cpumask_var_t cpus_allowed
;
107 nodemask_t mems_allowed
;
109 /* effective CPUs and Memory Nodes allow to tasks */
110 cpumask_var_t effective_cpus
;
111 nodemask_t effective_mems
;
114 * CPUs allocated to child sub-partitions (default hierarchy only)
115 * - CPUs granted by the parent = effective_cpus U subparts_cpus
116 * - effective_cpus and subparts_cpus are mutually exclusive.
118 * effective_cpus contains only onlined CPUs, but subparts_cpus
119 * may have offlined ones.
121 cpumask_var_t subparts_cpus
;
124 * This is old Memory Nodes tasks took on.
126 * - top_cpuset.old_mems_allowed is initialized to mems_allowed.
127 * - A new cpuset's old_mems_allowed is initialized when some
128 * task is moved into it.
129 * - old_mems_allowed is used in cpuset_migrate_mm() when we change
130 * cpuset.mems_allowed and have tasks' nodemask updated, and
131 * then old_mems_allowed is updated to mems_allowed.
133 nodemask_t old_mems_allowed
;
135 struct fmeter fmeter
; /* memory_pressure filter */
138 * Tasks are being attached to this cpuset. Used to prevent
139 * zeroing cpus/mems_allowed between ->can_attach() and ->attach().
141 int attach_in_progress
;
143 /* partition number for rebuild_sched_domains() */
146 /* for custom sched domain */
147 int relax_domain_level
;
149 /* number of CPUs in subparts_cpus */
150 int nr_subparts_cpus
;
152 /* partition root state */
153 int partition_root_state
;
156 * Default hierarchy only:
157 * use_parent_ecpus - set if using parent's effective_cpus
158 * child_ecpus_count - # of children with use_parent_ecpus set
160 int use_parent_ecpus
;
161 int child_ecpus_count
;
165 * Partition root states:
167 * 0 - not a partition root
171 * -1 - invalid partition root
172 * None of the cpus in cpus_allowed can be put into the parent's
173 * subparts_cpus. In this case, the cpuset is not a real partition
174 * root anymore. However, the CPU_EXCLUSIVE bit will still be set
175 * and the cpuset can be restored back to a partition root if the
176 * parent cpuset can give more CPUs back to this child cpuset.
178 #define PRS_DISABLED 0
179 #define PRS_ENABLED 1
183 * Temporary cpumasks for working with partitions that are passed among
184 * functions to avoid memory allocation in inner functions.
187 cpumask_var_t addmask
, delmask
; /* For partition root */
188 cpumask_var_t new_cpus
; /* For update_cpumasks_hier() */
191 static inline struct cpuset
*css_cs(struct cgroup_subsys_state
*css
)
193 return css
? container_of(css
, struct cpuset
, css
) : NULL
;
196 /* Retrieve the cpuset for a task */
197 static inline struct cpuset
*task_cs(struct task_struct
*task
)
199 return css_cs(task_css(task
, cpuset_cgrp_id
));
202 static inline struct cpuset
*parent_cs(struct cpuset
*cs
)
204 return css_cs(cs
->css
.parent
);
207 /* bits in struct cpuset flags field */
214 CS_SCHED_LOAD_BALANCE
,
219 /* convenient tests for these bits */
220 static inline bool is_cpuset_online(struct cpuset
*cs
)
222 return test_bit(CS_ONLINE
, &cs
->flags
) && !css_is_dying(&cs
->css
);
225 static inline int is_cpu_exclusive(const struct cpuset
*cs
)
227 return test_bit(CS_CPU_EXCLUSIVE
, &cs
->flags
);
230 static inline int is_mem_exclusive(const struct cpuset
*cs
)
232 return test_bit(CS_MEM_EXCLUSIVE
, &cs
->flags
);
235 static inline int is_mem_hardwall(const struct cpuset
*cs
)
237 return test_bit(CS_MEM_HARDWALL
, &cs
->flags
);
240 static inline int is_sched_load_balance(const struct cpuset
*cs
)
242 return test_bit(CS_SCHED_LOAD_BALANCE
, &cs
->flags
);
245 static inline int is_memory_migrate(const struct cpuset
*cs
)
247 return test_bit(CS_MEMORY_MIGRATE
, &cs
->flags
);
250 static inline int is_spread_page(const struct cpuset
*cs
)
252 return test_bit(CS_SPREAD_PAGE
, &cs
->flags
);
255 static inline int is_spread_slab(const struct cpuset
*cs
)
257 return test_bit(CS_SPREAD_SLAB
, &cs
->flags
);
260 static inline int is_partition_root(const struct cpuset
*cs
)
262 return cs
->partition_root_state
> 0;
265 static struct cpuset top_cpuset
= {
266 .flags
= ((1 << CS_ONLINE
) | (1 << CS_CPU_EXCLUSIVE
) |
267 (1 << CS_MEM_EXCLUSIVE
)),
268 .partition_root_state
= PRS_ENABLED
,
272 * cpuset_for_each_child - traverse online children of a cpuset
273 * @child_cs: loop cursor pointing to the current child
274 * @pos_css: used for iteration
275 * @parent_cs: target cpuset to walk children of
277 * Walk @child_cs through the online children of @parent_cs. Must be used
278 * with RCU read locked.
/*
 * Iterate @pos_css over the children of @parent_cs and run the loop body
 * only for children whose cpuset (assigned to @child_cs) is online.
 */
#define cpuset_for_each_child(child_cs, pos_css, parent_cs)		\
	css_for_each_child((pos_css), &(parent_cs)->css)		\
		if (is_cpuset_online(((child_cs) = css_cs((pos_css)))))
285 * cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants
286 * @des_cs: loop cursor pointing to the current descendant
287 * @pos_css: used for iteration
288 * @root_cs: target cpuset whose descendants are walked
290 * Walk @des_cs through the online descendants of @root_cs. Must be used
291 * with RCU read locked. The caller may modify @pos_css by calling
292 * css_rightmost_descendant() to skip subtree. @root_cs is included in the
293 * iteration and the first node to be visited.
/*
 * Pre-order iterate @pos_css over the descendants of @root_cs and run the
 * loop body only for those whose cpuset (assigned to @des_cs) is online.
 */
#define cpuset_for_each_descendant_pre(des_cs, pos_css, root_cs)	\
	css_for_each_descendant_pre((pos_css), &(root_cs)->css)	\
		if (is_cpuset_online(((des_cs) = css_cs((pos_css)))))
300 * There are two global locks guarding cpuset structures - cpuset_mutex and
301 * callback_lock. We also require taking task_lock() when dereferencing a
302 * task's cpuset pointer. See "The task_lock() exception", at the end of this
305 * A task must hold both locks to modify cpusets. If a task holds
306 * cpuset_mutex, then it blocks others wanting that mutex, ensuring that it
307 * is the only task able to also acquire callback_lock and be able to
308 * modify cpusets. It can perform various checks on the cpuset structure
309 * first, knowing nothing will change. It can also allocate memory while
310 * just holding cpuset_mutex. While it is performing these checks, various
311 * callback routines can briefly acquire callback_lock to query cpusets.
312 * Once it is ready to make the changes, it takes callback_lock, blocking
315 * Calls to the kernel memory allocator can not be made while holding
316 * callback_lock, as that would risk double tripping on callback_lock
317 * from one of the callbacks into the cpuset code from within
320 * If a task is only holding callback_lock, then it has read-only
323 * Now, the task_struct fields mems_allowed and mempolicy may be changed
324 * by other task, we use alloc_lock in the task_struct fields to protect
327 * The cpuset_common_file_read() handlers only hold callback_lock across
328 * small pieces of code, such as when reading out possibly multi-word
329 * cpumasks and nodemasks.
331 * Accessing a task's cpuset should be done in accordance with the
332 * guidelines for accessing subsystem state in kernel/cgroup.c
/*
 * The two global locks guarding cpuset structures: a task must hold both
 * to modify cpusets, while callback_lock alone gives read-only access.
 */
static DEFINE_MUTEX(cpuset_mutex);
static DEFINE_SPINLOCK(callback_lock);

/* NOTE(review): presumably the workqueue used for mm migration work - confirm. */
static struct workqueue_struct *cpuset_migrate_mm_wq;
341 * CPU / memory hotplug is handled asynchronously.
/* Work item whose handler is cpuset_hotplug_workfn(). */
static void cpuset_hotplug_workfn(struct work_struct *work);
static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn);

/* NOTE(review): presumably waited on while attaches are in flight - confirm. */
static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq);
349 * Cgroup v2 behavior is used when on default hierarchy or the
350 * cgroup_v2_mode flag is set.
352 static inline bool is_in_v2_mode(void)
354 return cgroup_subsys_on_dfl(cpuset_cgrp_subsys
) ||
355 (cpuset_cgrp_subsys
.root
->flags
& CGRP_ROOT_CPUSET_V2_MODE
);
359 * This is ugly, but preserves the userspace API for existing cpuset
360 * users. If someone tries to mount the "cpuset" filesystem, we
361 * silently switch it to mount "cgroup" instead
363 static int cpuset_get_tree(struct fs_context
*fc
)
365 struct file_system_type
*cgroup_fs
;
366 struct fs_context
*new_fc
;
369 cgroup_fs
= get_fs_type("cgroup");
373 new_fc
= fs_context_for_mount(cgroup_fs
, fc
->sb_flags
);
374 if (IS_ERR(new_fc
)) {
375 ret
= PTR_ERR(new_fc
);
377 static const char agent_path
[] = "/sbin/cpuset_release_agent";
378 ret
= vfs_parse_fs_string(new_fc
, "cpuset", NULL
, 0);
380 ret
= vfs_parse_fs_string(new_fc
, "noprefix", NULL
, 0);
382 ret
= vfs_parse_fs_string(new_fc
, "release_agent",
383 agent_path
, sizeof(agent_path
) - 1);
385 ret
= vfs_get_tree(new_fc
);
386 if (!ret
) { /* steal the result */
387 fc
->root
= new_fc
->root
;
390 put_fs_context(new_fc
);
392 put_filesystem(cgroup_fs
);
396 static const struct fs_context_operations cpuset_fs_context_ops
= {
397 .get_tree
= cpuset_get_tree
,
400 static int cpuset_init_fs_context(struct fs_context
*fc
)
402 fc
->ops
= &cpuset_fs_context_ops
;
406 static struct file_system_type cpuset_fs_type
= {
408 .init_fs_context
= cpuset_init_fs_context
,
412 * Return in pmask the portion of a cpusets's cpus_allowed that
413 * are online. If none are online, walk up the cpuset hierarchy
414 * until we find one that does have some online cpus.
416 * One way or another, we guarantee to return some non-empty subset
417 * of cpu_online_mask.
419 * Call with callback_lock or cpuset_mutex held.
421 static void guarantee_online_cpus(struct cpuset
*cs
, struct cpumask
*pmask
)
423 while (!cpumask_intersects(cs
->effective_cpus
, cpu_online_mask
)) {
427 * The top cpuset doesn't have any online cpu as a
428 * consequence of a race between cpuset_hotplug_work
429 * and cpu hotplug notifier. But we know the top
430 * cpuset's effective_cpus is on its way to be
431 * identical to cpu_online_mask.
433 cpumask_copy(pmask
, cpu_online_mask
);
437 cpumask_and(pmask
, cs
->effective_cpus
, cpu_online_mask
);
441 * Return in *pmask the portion of a cpusets's mems_allowed that
442 * are online, with memory. If none are online with memory, walk
443 * up the cpuset hierarchy until we find one that does have some
444 * online mems. The top cpuset always has some mems online.
446 * One way or another, we guarantee to return some non-empty subset
447 * of node_states[N_MEMORY].
449 * Call with callback_lock or cpuset_mutex held.
451 static void guarantee_online_mems(struct cpuset
*cs
, nodemask_t
*pmask
)
453 while (!nodes_intersects(cs
->effective_mems
, node_states
[N_MEMORY
]))
455 nodes_and(*pmask
, cs
->effective_mems
, node_states
[N_MEMORY
]);
459 * update task's spread flag if cpuset's page/slab spread flag is set
461 * Call with callback_lock or cpuset_mutex held.
/*
 * cpuset_update_task_spread_flag - mirror a cpuset's spread flags onto a task.
 * @cs: the cpuset whose page/slab spread flags are read
 * @tsk: the task whose spread flags are updated
 *
 * Each task flag is set when the matching cpuset flag is set and cleared
 * otherwise, so the task ends up matching @cs for both flags.
 */
static void cpuset_update_task_spread_flag(struct cpuset *cs,
					struct task_struct *tsk)
{
	/* Set and clear must be mutually exclusive, or the flag never sticks. */
	if (is_spread_page(cs))
		task_set_spread_page(tsk);
	else
		task_clear_spread_page(tsk);

	if (is_spread_slab(cs))
		task_set_spread_slab(tsk);
	else
		task_clear_spread_slab(tsk);
}
478 * is_cpuset_subset(p, q) - Is cpuset p a subset of cpuset q?
480 * One cpuset is a subset of another if all its allowed CPUs and
481 * Memory Nodes are a subset of the other, and its exclusive flags
482 * are only set if the other's are set. Call holding cpuset_mutex.
485 static int is_cpuset_subset(const struct cpuset
*p
, const struct cpuset
*q
)
487 return cpumask_subset(p
->cpus_allowed
, q
->cpus_allowed
) &&
488 nodes_subset(p
->mems_allowed
, q
->mems_allowed
) &&
489 is_cpu_exclusive(p
) <= is_cpu_exclusive(q
) &&
490 is_mem_exclusive(p
) <= is_mem_exclusive(q
);
494 * alloc_cpumasks - allocate three cpumasks for cpuset
495 * @cs: the cpuset that have cpumasks to be allocated.
496 * @tmp: the tmpmasks structure pointer
497 * Return: 0 if successful, -ENOMEM otherwise.
499 * Only one of the two input arguments should be non-NULL.
501 static inline int alloc_cpumasks(struct cpuset
*cs
, struct tmpmasks
*tmp
)
503 cpumask_var_t
*pmask1
, *pmask2
, *pmask3
;
506 pmask1
= &cs
->cpus_allowed
;
507 pmask2
= &cs
->effective_cpus
;
508 pmask3
= &cs
->subparts_cpus
;
510 pmask1
= &tmp
->new_cpus
;
511 pmask2
= &tmp
->addmask
;
512 pmask3
= &tmp
->delmask
;
515 if (!zalloc_cpumask_var(pmask1
, GFP_KERNEL
))
518 if (!zalloc_cpumask_var(pmask2
, GFP_KERNEL
))
521 if (!zalloc_cpumask_var(pmask3
, GFP_KERNEL
))
527 free_cpumask_var(*pmask2
);
529 free_cpumask_var(*pmask1
);
534 * free_cpumasks - free cpumasks in a tmpmasks structure
535 * @cs: the cpuset that have cpumasks to be free.
536 * @tmp: the tmpmasks structure pointer
538 static inline void free_cpumasks(struct cpuset
*cs
, struct tmpmasks
*tmp
)
541 free_cpumask_var(cs
->cpus_allowed
);
542 free_cpumask_var(cs
->effective_cpus
);
543 free_cpumask_var(cs
->subparts_cpus
);
546 free_cpumask_var(tmp
->new_cpus
);
547 free_cpumask_var(tmp
->addmask
);
548 free_cpumask_var(tmp
->delmask
);
553 * alloc_trial_cpuset - allocate a trial cpuset
554 * @cs: the cpuset that the trial cpuset duplicates
556 static struct cpuset
*alloc_trial_cpuset(struct cpuset
*cs
)
558 struct cpuset
*trial
;
560 trial
= kmemdup(cs
, sizeof(*cs
), GFP_KERNEL
);
564 if (alloc_cpumasks(trial
, NULL
)) {
569 cpumask_copy(trial
->cpus_allowed
, cs
->cpus_allowed
);
570 cpumask_copy(trial
->effective_cpus
, cs
->effective_cpus
);
575 * free_cpuset - free the cpuset
576 * @cs: the cpuset to be freed
578 static inline void free_cpuset(struct cpuset
*cs
)
580 free_cpumasks(cs
, NULL
);
585 * validate_change() - Used to validate that any proposed cpuset change
586 * follows the structural rules for cpusets.
588 * If we replaced the flag and mask values of the current cpuset
589 * (cur) with those values in the trial cpuset (trial), would
590 * our various subset and exclusive rules still be valid? Presumes
593 * 'cur' is the address of an actual, in-use cpuset. Operations
594 * such as list traversal that depend on the actual address of the
595 * cpuset in the list must use cur below, not trial.
597 * 'trial' is the address of bulk structure copy of cur, with
598 * perhaps one or more of the fields cpus_allowed, mems_allowed,
599 * or flags changed to new, trial values.
601 * Return 0 if valid, -errno if not.
604 static int validate_change(struct cpuset
*cur
, struct cpuset
*trial
)
606 struct cgroup_subsys_state
*css
;
607 struct cpuset
*c
, *par
;
612 /* Each of our child cpusets must be a subset of us */
614 cpuset_for_each_child(c
, css
, cur
)
615 if (!is_cpuset_subset(c
, trial
))
618 /* Remaining checks don't apply to root cpuset */
620 if (cur
== &top_cpuset
)
623 par
= parent_cs(cur
);
625 /* On legacy hierarchy, we must be a subset of our parent cpuset. */
627 if (!is_in_v2_mode() && !is_cpuset_subset(trial
, par
))
631 * If either I or some sibling (!= me) is exclusive, we can't
635 cpuset_for_each_child(c
, css
, par
) {
636 if ((is_cpu_exclusive(trial
) || is_cpu_exclusive(c
)) &&
638 cpumask_intersects(trial
->cpus_allowed
, c
->cpus_allowed
))
640 if ((is_mem_exclusive(trial
) || is_mem_exclusive(c
)) &&
642 nodes_intersects(trial
->mems_allowed
, c
->mems_allowed
))
647 * Cpusets with tasks - existing or newly being attached - can't
648 * be changed to have empty cpus_allowed or mems_allowed.
651 if ((cgroup_is_populated(cur
->css
.cgroup
) || cur
->attach_in_progress
)) {
652 if (!cpumask_empty(cur
->cpus_allowed
) &&
653 cpumask_empty(trial
->cpus_allowed
))
655 if (!nodes_empty(cur
->mems_allowed
) &&
656 nodes_empty(trial
->mems_allowed
))
661 * We can't shrink if we won't have enough room for SCHED_DEADLINE
665 if (is_cpu_exclusive(cur
) &&
666 !cpuset_cpumask_can_shrink(cur
->cpus_allowed
,
667 trial
->cpus_allowed
))
678 * Helper routine for generate_sched_domains().
679 * Do cpusets a, b have overlapping effective cpus_allowed masks?
681 static int cpusets_overlap(struct cpuset
*a
, struct cpuset
*b
)
683 return cpumask_intersects(a
->effective_cpus
, b
->effective_cpus
);
687 update_domain_attr(struct sched_domain_attr
*dattr
, struct cpuset
*c
)
689 if (dattr
->relax_domain_level
< c
->relax_domain_level
)
690 dattr
->relax_domain_level
= c
->relax_domain_level
;
694 static void update_domain_attr_tree(struct sched_domain_attr
*dattr
,
695 struct cpuset
*root_cs
)
698 struct cgroup_subsys_state
*pos_css
;
701 cpuset_for_each_descendant_pre(cp
, pos_css
, root_cs
) {
702 /* skip the whole subtree if @cp doesn't have any CPU */
703 if (cpumask_empty(cp
->cpus_allowed
)) {
704 pos_css
= css_rightmost_descendant(pos_css
);
708 if (is_sched_load_balance(cp
))
709 update_domain_attr(dattr
, cp
);
714 /* Must be called with cpuset_mutex held. */
715 static inline int nr_cpusets(void)
717 /* jump label reference count + the top-level cpuset */
718 return static_key_count(&cpusets_enabled_key
.key
) + 1;
722 * generate_sched_domains()
724 * This function builds a partial partition of the systems CPUs
725 * A 'partial partition' is a set of non-overlapping subsets whose
726 * union is a subset of that set.
727 * The output of this function needs to be passed to kernel/sched/core.c
728 * partition_sched_domains() routine, which will rebuild the scheduler's
729 * load balancing domains (sched domains) as specified by that partial
732 * See "What is sched_load_balance" in Documentation/cgroup-v1/cpusets.txt
733 * for a background explanation of this.
735 * Does not return errors, on the theory that the callers of this
736 * routine would rather not worry about failures to rebuild sched
737 * domains when operating in the severe memory shortage situations
738 * that could cause allocation failures below.
740 * Must be called with cpuset_mutex held.
742 * The three key local variables below are:
743 * cp - cpuset pointer, used (together with pos_css) to perform a
744 * top-down scan of all cpusets. For our purposes, rebuilding
745 * the schedulers sched domains, we can ignore !is_sched_load_
747 * csa - (for CpuSet Array) Array of pointers to all the cpusets
748 * that need to be load balanced, for convenient iterative
749 * access by the subsequent code that finds the best partition,
750 * i.e the set of domains (subsets) of CPUs such that the
751 * cpus_allowed of every cpuset marked is_sched_load_balance
752 * is a subset of one of these domains, while there are as
753 * many such domains as possible, each as small as possible.
754 * doms - Conversion of 'csa' to an array of cpumasks, for passing to
755 * the kernel/sched/core.c routine partition_sched_domains() in a
756 * convenient format, that can be easily compared to the prior
757 * value to determine what partition elements (sched domains)
758 * were changed (added or removed.)
760 * Finding the best partition (set of domains):
761 * The triple nested loops below over i, j, k scan over the
762 * load balanced cpusets (using the array of cpuset pointers in
763 * csa[]) looking for pairs of cpusets that have overlapping
764 * cpus_allowed, but which don't have the same 'pn' partition
765 * number and gives them in the same partition number. It keeps
766 * looping on the 'restart' label until it can no longer find
769 * The union of the cpus_allowed masks from the set of
770 * all cpusets having the same 'pn' value then form the one
771 * element of the partition (one sched domain) to be passed to
772 * partition_sched_domains().
774 static int generate_sched_domains(cpumask_var_t
**domains
,
775 struct sched_domain_attr
**attributes
)
777 struct cpuset
*cp
; /* top-down scan of cpusets */
778 struct cpuset
**csa
; /* array of all cpuset ptrs */
779 int csn
; /* how many cpuset ptrs in csa so far */
780 int i
, j
, k
; /* indices for partition finding loops */
781 cpumask_var_t
*doms
; /* resulting partition; i.e. sched domains */
782 struct sched_domain_attr
*dattr
; /* attributes for custom domains */
783 int ndoms
= 0; /* number of sched domains in result */
784 int nslot
; /* next empty doms[] struct cpumask slot */
785 struct cgroup_subsys_state
*pos_css
;
786 bool root_load_balance
= is_sched_load_balance(&top_cpuset
);
792 /* Special case for the 99% of systems with one, full, sched domain */
793 if (root_load_balance
&& !top_cpuset
.nr_subparts_cpus
) {
795 doms
= alloc_sched_domains(ndoms
);
799 dattr
= kmalloc(sizeof(struct sched_domain_attr
), GFP_KERNEL
);
801 *dattr
= SD_ATTR_INIT
;
802 update_domain_attr_tree(dattr
, &top_cpuset
);
804 cpumask_and(doms
[0], top_cpuset
.effective_cpus
,
805 housekeeping_cpumask(HK_FLAG_DOMAIN
));
810 csa
= kmalloc_array(nr_cpusets(), sizeof(cp
), GFP_KERNEL
);
816 if (root_load_balance
)
817 csa
[csn
++] = &top_cpuset
;
818 cpuset_for_each_descendant_pre(cp
, pos_css
, &top_cpuset
) {
819 if (cp
== &top_cpuset
)
822 * Continue traversing beyond @cp iff @cp has some CPUs and
823 * isn't load balancing. The former is obvious. The
824 * latter: All child cpusets contain a subset of the
825 * parent's cpus, so just skip them, and then we call
826 * update_domain_attr_tree() to calc relax_domain_level of
827 * the corresponding sched domain.
829 * If root is load-balancing, we can skip @cp if it
830 * is a subset of the root's effective_cpus.
832 if (!cpumask_empty(cp
->cpus_allowed
) &&
833 !(is_sched_load_balance(cp
) &&
834 cpumask_intersects(cp
->cpus_allowed
,
835 housekeeping_cpumask(HK_FLAG_DOMAIN
))))
838 if (root_load_balance
&&
839 cpumask_subset(cp
->cpus_allowed
, top_cpuset
.effective_cpus
))
842 if (is_sched_load_balance(cp
))
845 /* skip @cp's subtree if not a partition root */
846 if (!is_partition_root(cp
))
847 pos_css
= css_rightmost_descendant(pos_css
);
851 for (i
= 0; i
< csn
; i
++)
856 /* Find the best partition (set of sched domains) */
857 for (i
= 0; i
< csn
; i
++) {
858 struct cpuset
*a
= csa
[i
];
861 for (j
= 0; j
< csn
; j
++) {
862 struct cpuset
*b
= csa
[j
];
865 if (apn
!= bpn
&& cpusets_overlap(a
, b
)) {
866 for (k
= 0; k
< csn
; k
++) {
867 struct cpuset
*c
= csa
[k
];
872 ndoms
--; /* one less element */
879 * Now we know how many domains to create.
880 * Convert <csn, csa> to <ndoms, doms> and populate cpu masks.
882 doms
= alloc_sched_domains(ndoms
);
887 * The rest of the code, including the scheduler, can deal with
888 * dattr==NULL case. No need to abort if alloc fails.
890 dattr
= kmalloc_array(ndoms
, sizeof(struct sched_domain_attr
),
893 for (nslot
= 0, i
= 0; i
< csn
; i
++) {
894 struct cpuset
*a
= csa
[i
];
899 /* Skip completed partitions */
905 if (nslot
== ndoms
) {
906 static int warnings
= 10;
908 pr_warn("rebuild_sched_domains confused: nslot %d, ndoms %d, csn %d, i %d, apn %d\n",
909 nslot
, ndoms
, csn
, i
, apn
);
917 *(dattr
+ nslot
) = SD_ATTR_INIT
;
918 for (j
= i
; j
< csn
; j
++) {
919 struct cpuset
*b
= csa
[j
];
922 cpumask_or(dp
, dp
, b
->effective_cpus
);
923 cpumask_and(dp
, dp
, housekeeping_cpumask(HK_FLAG_DOMAIN
));
925 update_domain_attr_tree(dattr
+ nslot
, b
);
927 /* Done with this partition */
933 BUG_ON(nslot
!= ndoms
);
939 * Fallback to the default domain if kmalloc() failed.
940 * See comments in partition_sched_domains().
951 * Rebuild scheduler domains.
953 * If the flag 'sched_load_balance' of any cpuset with non-empty
954 * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset
955 * which has that flag enabled, or if any cpuset with a non-empty
956 * 'cpus' is removed, then call this routine to rebuild the
957 * scheduler's dynamic sched domains.
959 * Call with cpuset_mutex held. Takes get_online_cpus().
961 static void rebuild_sched_domains_locked(void)
963 struct sched_domain_attr
*attr
;
967 lockdep_assert_held(&cpuset_mutex
);
971 * We have raced with CPU hotplug. Don't do anything to avoid
972 * passing doms with offlined cpu to partition_sched_domains().
973 * Anyways, hotplug work item will rebuild sched domains.
975 if (!top_cpuset
.nr_subparts_cpus
&&
976 !cpumask_equal(top_cpuset
.effective_cpus
, cpu_active_mask
))
979 if (top_cpuset
.nr_subparts_cpus
&&
980 !cpumask_subset(top_cpuset
.effective_cpus
, cpu_active_mask
))
983 /* Generate domain masks and attrs */
984 ndoms
= generate_sched_domains(&doms
, &attr
);
986 /* Have scheduler rebuild the domains */
987 partition_sched_domains(ndoms
, doms
, attr
);
991 #else /* !CONFIG_SMP */
/* !CONFIG_SMP variant: intentionally empty. */
static void rebuild_sched_domains_locked(void)
{
}
995 #endif /* CONFIG_SMP */
997 void rebuild_sched_domains(void)
999 mutex_lock(&cpuset_mutex
);
1000 rebuild_sched_domains_locked();
1001 mutex_unlock(&cpuset_mutex
);
1005 * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
1006 * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
1008 * Iterate through each task of @cs updating its cpus_allowed to the
1009 * effective cpuset's. As this function is called with cpuset_mutex held,
1010 * cpuset membership stays stable.
1012 static void update_tasks_cpumask(struct cpuset
*cs
)
1014 struct css_task_iter it
;
1015 struct task_struct
*task
;
1017 css_task_iter_start(&cs
->css
, 0, &it
);
1018 while ((task
= css_task_iter_next(&it
)))
1019 set_cpus_allowed_ptr(task
, cs
->effective_cpus
);
1020 css_task_iter_end(&it
);
1024 * compute_effective_cpumask - Compute the effective cpumask of the cpuset
1025 * @new_cpus: the temp variable for the new effective_cpus mask
1026 * @cs: the cpuset that needs its new effective_cpus mask recomputed
1027 * @parent: the parent cpuset
1029 * If the parent has subpartition CPUs, include them in the list of
1030 * allowable CPUs in computing the new effective_cpus mask. Since offlined
1031 * CPUs are not removed from subparts_cpus, we have to use cpu_active_mask
1032 * to mask those out.
1034 static void compute_effective_cpumask(struct cpumask
*new_cpus
,
1035 struct cpuset
*cs
, struct cpuset
*parent
)
1037 if (parent
->nr_subparts_cpus
) {
1038 cpumask_or(new_cpus
, parent
->effective_cpus
,
1039 parent
->subparts_cpus
);
1040 cpumask_and(new_cpus
, new_cpus
, cs
->cpus_allowed
);
1041 cpumask_and(new_cpus
, new_cpus
, cpu_active_mask
);
1043 cpumask_and(new_cpus
, cs
->cpus_allowed
, parent
->effective_cpus
);
1048 * Commands for update_parent_subparts_cpumask
1051 partcmd_enable
, /* Enable partition root */
1052 partcmd_disable
, /* Disable partition root */
1053 partcmd_update
, /* Update parent's subparts_cpus */
1057 * update_parent_subparts_cpumask - update subparts_cpus mask of parent cpuset
1058 * @cpuset: The cpuset that requests change in partition root state
1059 * @cmd: Partition root state change command
1060 * @newmask: Optional new cpumask for partcmd_update
1061 * @tmp: Temporary addmask and delmask
1062 * Return: 0, 1 or an error code
1064 * For partcmd_enable, the cpuset is being transformed from a non-partition
1065 * root to a partition root. The cpus_allowed mask of the given cpuset will
1066 * be put into parent's subparts_cpus and taken away from parent's
1067 * effective_cpus. The function will return 0 if all the CPUs listed in
1068 * cpus_allowed can be granted or an error code will be returned.
1070 * For partcmd_disable, the cpuset is being transformed from a partition
1071 * root back to a non-partition root. Any CPUs in cpus_allowed that are in
1072 * parent's subparts_cpus will be taken away from that cpumask and put back
1073 * into parent's effective_cpus. 0 should always be returned.
1075 * For partcmd_update, if the optional newmask is specified, the cpu
1076 * list is to be changed from cpus_allowed to newmask. Otherwise,
1077 * cpus_allowed is assumed to remain the same. The cpuset should either
1078 * be a partition root or an invalid partition root. The partition root
1079 * state may change if newmask is NULL and none of the requested CPUs can
1080 * be granted by the parent. The function will return 1 if changes to
1081 * parent's subparts_cpus and effective_cpus happen or 0 otherwise.
1082 * Error code should only be returned when newmask is non-NULL.
1084 * The partcmd_enable and partcmd_disable commands are used by
1085 * update_prstate(). The partcmd_update command is used by
1086 * update_cpumasks_hier() with newmask NULL and update_cpumask() with
1089 * The checking is more strict when enabling partition root than the
1090 * other two commands.
1092 * Because of the implicit cpu exclusive nature of a partition root,
1093 * cpumask changes that violates the cpu exclusivity rule will not be
1094 * permitted when checked by validate_change(). The validate_change()
1095 * function will also prevent any changes to the cpu list if it is not
1096 * a superset of children's cpu lists.
1098 static int update_parent_subparts_cpumask(struct cpuset
*cpuset
, int cmd
,
1099 struct cpumask
*newmask
,
1100 struct tmpmasks
*tmp
)
1102 struct cpuset
*parent
= parent_cs(cpuset
);
1103 int adding
; /* Moving cpus from effective_cpus to subparts_cpus */
1104 int deleting
; /* Moving cpus from subparts_cpus to effective_cpus */
1105 bool part_error
= false; /* Partition error? */
1107 lockdep_assert_held(&cpuset_mutex
);
1110 * The parent must be a partition root.
1111 * The new cpumask, if present, or the current cpus_allowed must
1114 if (!is_partition_root(parent
) ||
1115 (newmask
&& cpumask_empty(newmask
)) ||
1116 (!newmask
&& cpumask_empty(cpuset
->cpus_allowed
)))
1120 * Enabling/disabling partition root is not allowed if there are
1123 if ((cmd
!= partcmd_update
) && css_has_online_children(&cpuset
->css
))
1127 * Enabling partition root is not allowed if not all the CPUs
1128 * can be granted from parent's effective_cpus or at least one
1129 * CPU will be left after that.
1131 if ((cmd
== partcmd_enable
) &&
1132 (!cpumask_subset(cpuset
->cpus_allowed
, parent
->effective_cpus
) ||
1133 cpumask_equal(cpuset
->cpus_allowed
, parent
->effective_cpus
)))
1137 * A cpumask update cannot make parent's effective_cpus become empty.
1139 adding
= deleting
= false;
1140 if (cmd
== partcmd_enable
) {
1141 cpumask_copy(tmp
->addmask
, cpuset
->cpus_allowed
);
1143 } else if (cmd
== partcmd_disable
) {
1144 deleting
= cpumask_and(tmp
->delmask
, cpuset
->cpus_allowed
,
1145 parent
->subparts_cpus
);
1146 } else if (newmask
) {
1148 * partcmd_update with newmask:
1150 * delmask = cpus_allowed & ~newmask & parent->subparts_cpus
1151 * addmask = newmask & parent->effective_cpus
1152 * & ~parent->subparts_cpus
1154 cpumask_andnot(tmp
->delmask
, cpuset
->cpus_allowed
, newmask
);
1155 deleting
= cpumask_and(tmp
->delmask
, tmp
->delmask
,
1156 parent
->subparts_cpus
);
1158 cpumask_and(tmp
->addmask
, newmask
, parent
->effective_cpus
);
1159 adding
= cpumask_andnot(tmp
->addmask
, tmp
->addmask
,
1160 parent
->subparts_cpus
);
1162 * Return error if the new effective_cpus could become empty.
1165 cpumask_equal(parent
->effective_cpus
, tmp
->addmask
)) {
1169 * As some of the CPUs in subparts_cpus might have
1170 * been offlined, we need to compute the real delmask
1173 if (!cpumask_and(tmp
->addmask
, tmp
->delmask
,
1176 cpumask_copy(tmp
->addmask
, parent
->effective_cpus
);
1180 * partcmd_update w/o newmask:
1182 * addmask = cpus_allowed & parent->effective_cpus
1184 * Note that parent's subparts_cpus may have been
1185 * pre-shrunk in case there is a change in the cpu list.
1186 * So no deletion is needed.
1188 adding
= cpumask_and(tmp
->addmask
, cpuset
->cpus_allowed
,
1189 parent
->effective_cpus
);
1190 part_error
= cpumask_equal(tmp
->addmask
,
1191 parent
->effective_cpus
);
1194 if (cmd
== partcmd_update
) {
1195 int prev_prs
= cpuset
->partition_root_state
;
1198 * Check for possible transition between PRS_ENABLED
1201 switch (cpuset
->partition_root_state
) {
1204 cpuset
->partition_root_state
= PRS_ERROR
;
1208 cpuset
->partition_root_state
= PRS_ENABLED
;
1212 * Set part_error if previously in invalid state.
1214 part_error
= (prev_prs
== PRS_ERROR
);
1217 if (!part_error
&& (cpuset
->partition_root_state
== PRS_ERROR
))
1218 return 0; /* Nothing need to be done */
1220 if (cpuset
->partition_root_state
== PRS_ERROR
) {
1222 * Remove all its cpus from parent's subparts_cpus.
1225 deleting
= cpumask_and(tmp
->delmask
, cpuset
->cpus_allowed
,
1226 parent
->subparts_cpus
);
1229 if (!adding
&& !deleting
)
1233 * Change the parent's subparts_cpus.
1234 * Newly added CPUs will be removed from effective_cpus and
1235 * newly deleted ones will be added back to effective_cpus.
1237 spin_lock_irq(&callback_lock
);
1239 cpumask_or(parent
->subparts_cpus
,
1240 parent
->subparts_cpus
, tmp
->addmask
);
1241 cpumask_andnot(parent
->effective_cpus
,
1242 parent
->effective_cpus
, tmp
->addmask
);
1245 cpumask_andnot(parent
->subparts_cpus
,
1246 parent
->subparts_cpus
, tmp
->delmask
);
1248 * Some of the CPUs in subparts_cpus might have been offlined.
1250 cpumask_and(tmp
->delmask
, tmp
->delmask
, cpu_active_mask
);
1251 cpumask_or(parent
->effective_cpus
,
1252 parent
->effective_cpus
, tmp
->delmask
);
1255 parent
->nr_subparts_cpus
= cpumask_weight(parent
->subparts_cpus
);
1256 spin_unlock_irq(&callback_lock
);
1258 return cmd
== partcmd_update
;
1262 * update_cpumasks_hier - Update effective cpumasks and tasks in the subtree
1263 * @cs: the cpuset to consider
1264 * @tmp: temp variables for calculating effective_cpus & partition setup
1266 * When configured cpumask is changed, the effective cpumasks of this cpuset
1267 * and all its descendants need to be updated.
1269 * On legacy hierarchy, effective_cpus will be the same as cpus_allowed.
1271 * Called with cpuset_mutex held
1273 static void update_cpumasks_hier(struct cpuset
*cs
, struct tmpmasks
*tmp
)
1276 struct cgroup_subsys_state
*pos_css
;
1277 bool need_rebuild_sched_domains
= false;
1280 cpuset_for_each_descendant_pre(cp
, pos_css
, cs
) {
1281 struct cpuset
*parent
= parent_cs(cp
);
1283 compute_effective_cpumask(tmp
->new_cpus
, cp
, parent
);
1286 * If it becomes empty, inherit the effective mask of the
1287 * parent, which is guaranteed to have some CPUs.
1289 if (is_in_v2_mode() && cpumask_empty(tmp
->new_cpus
)) {
1290 cpumask_copy(tmp
->new_cpus
, parent
->effective_cpus
);
1291 if (!cp
->use_parent_ecpus
) {
1292 cp
->use_parent_ecpus
= true;
1293 parent
->child_ecpus_count
++;
1295 } else if (cp
->use_parent_ecpus
) {
1296 cp
->use_parent_ecpus
= false;
1297 WARN_ON_ONCE(!parent
->child_ecpus_count
);
1298 parent
->child_ecpus_count
--;
1302 * Skip the whole subtree if the cpumask remains the same
1303 * and has no partition root state.
1305 if (!cp
->partition_root_state
&&
1306 cpumask_equal(tmp
->new_cpus
, cp
->effective_cpus
)) {
1307 pos_css
= css_rightmost_descendant(pos_css
);
1312 * update_parent_subparts_cpumask() should have been called
1313 * for cs already in update_cpumask(). We should also call
1314 * update_tasks_cpumask() again for tasks in the parent
1315 * cpuset if the parent's subparts_cpus changes.
1317 if ((cp
!= cs
) && cp
->partition_root_state
) {
1318 switch (parent
->partition_root_state
) {
1321 * If parent is not a partition root or an
1322 * invalid partition root, clear the partition root
1323 * state and the CS_CPU_EXCLUSIVE flag.
1325 WARN_ON_ONCE(cp
->partition_root_state
1327 cp
->partition_root_state
= 0;
1330 * clear_bit() is an atomic operation and
1331 * readers aren't interested in the state
1332 * of CS_CPU_EXCLUSIVE anyway. So we can
1333 * just update the flag without holding
1334 * the callback_lock.
1336 clear_bit(CS_CPU_EXCLUSIVE
, &cp
->flags
);
1340 if (update_parent_subparts_cpumask(cp
, partcmd_update
, NULL
, tmp
))
1341 update_tasks_cpumask(parent
);
1346 * When parent is invalid, it has to be too.
1348 cp
->partition_root_state
= PRS_ERROR
;
1349 if (cp
->nr_subparts_cpus
) {
1350 cp
->nr_subparts_cpus
= 0;
1351 cpumask_clear(cp
->subparts_cpus
);
1357 if (!css_tryget_online(&cp
->css
))
1361 spin_lock_irq(&callback_lock
);
1363 cpumask_copy(cp
->effective_cpus
, tmp
->new_cpus
);
1364 if (cp
->nr_subparts_cpus
&&
1365 (cp
->partition_root_state
!= PRS_ENABLED
)) {
1366 cp
->nr_subparts_cpus
= 0;
1367 cpumask_clear(cp
->subparts_cpus
);
1368 } else if (cp
->nr_subparts_cpus
) {
1370 * Make sure that effective_cpus & subparts_cpus
1371 * are mutually exclusive.
1373 * In the unlikely event that effective_cpus
1374 * becomes empty, we clear cp->nr_subparts_cpus and
1375 * let its child partition roots compete for
1378 cpumask_andnot(cp
->effective_cpus
, cp
->effective_cpus
,
1380 if (cpumask_empty(cp
->effective_cpus
)) {
1381 cpumask_copy(cp
->effective_cpus
, tmp
->new_cpus
);
1382 cpumask_clear(cp
->subparts_cpus
);
1383 cp
->nr_subparts_cpus
= 0;
1384 } else if (!cpumask_subset(cp
->subparts_cpus
,
1386 cpumask_andnot(cp
->subparts_cpus
,
1387 cp
->subparts_cpus
, tmp
->new_cpus
);
1388 cp
->nr_subparts_cpus
1389 = cpumask_weight(cp
->subparts_cpus
);
1392 spin_unlock_irq(&callback_lock
);
1394 WARN_ON(!is_in_v2_mode() &&
1395 !cpumask_equal(cp
->cpus_allowed
, cp
->effective_cpus
));
1397 update_tasks_cpumask(cp
);
1400 * On legacy hierarchy, if the effective cpumask of any non-
1401 * empty cpuset is changed, we need to rebuild sched domains.
1402 * On default hierarchy, the cpuset needs to be a partition
1405 if (!cpumask_empty(cp
->cpus_allowed
) &&
1406 is_sched_load_balance(cp
) &&
1407 (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys
) ||
1408 is_partition_root(cp
)))
1409 need_rebuild_sched_domains
= true;
1416 if (need_rebuild_sched_domains
)
1417 rebuild_sched_domains_locked();
1421 * update_sibling_cpumasks - Update siblings cpumasks
1422 * @parent: Parent cpuset
1423 * @cs: Current cpuset
1424 * @tmp: Temp variables
1426 static void update_sibling_cpumasks(struct cpuset
*parent
, struct cpuset
*cs
,
1427 struct tmpmasks
*tmp
)
1429 struct cpuset
*sibling
;
1430 struct cgroup_subsys_state
*pos_css
;
1433 * Check all its siblings and call update_cpumasks_hier()
1434 * if their use_parent_ecpus flag is set in order for them
1435 * to use the right effective_cpus value.
1438 cpuset_for_each_child(sibling
, pos_css
, parent
) {
1441 if (!sibling
->use_parent_ecpus
)
1444 update_cpumasks_hier(sibling
, tmp
);
1450 * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it
1451 * @cs: the cpuset to consider
1452 * @trialcs: trial cpuset
1453 * @buf: buffer of cpu numbers written to this cpuset
1455 static int update_cpumask(struct cpuset
*cs
, struct cpuset
*trialcs
,
1459 struct tmpmasks tmp
;
1461 /* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */
1462 if (cs
== &top_cpuset
)
1466 * An empty cpus_allowed is ok only if the cpuset has no tasks.
1467 * Since cpulist_parse() fails on an empty mask, we special case
1468 * that parsing. The validate_change() call ensures that cpusets
1469 * with tasks have cpus.
1472 cpumask_clear(trialcs
->cpus_allowed
);
1474 retval
= cpulist_parse(buf
, trialcs
->cpus_allowed
);
1478 if (!cpumask_subset(trialcs
->cpus_allowed
,
1479 top_cpuset
.cpus_allowed
))
1483 /* Nothing to do if the cpus didn't change */
1484 if (cpumask_equal(cs
->cpus_allowed
, trialcs
->cpus_allowed
))
1487 retval
= validate_change(cs
, trialcs
);
1491 #ifdef CONFIG_CPUMASK_OFFSTACK
1493 * Use the cpumasks in trialcs for tmpmasks when they are pointers
1494 * to allocated cpumasks.
1496 tmp
.addmask
= trialcs
->subparts_cpus
;
1497 tmp
.delmask
= trialcs
->effective_cpus
;
1498 tmp
.new_cpus
= trialcs
->cpus_allowed
;
1501 if (cs
->partition_root_state
) {
1502 /* Cpumask of a partition root cannot be empty */
1503 if (cpumask_empty(trialcs
->cpus_allowed
))
1505 if (update_parent_subparts_cpumask(cs
, partcmd_update
,
1506 trialcs
->cpus_allowed
, &tmp
) < 0)
1510 spin_lock_irq(&callback_lock
);
1511 cpumask_copy(cs
->cpus_allowed
, trialcs
->cpus_allowed
);
1514 * Make sure that subparts_cpus is a subset of cpus_allowed.
1516 if (cs
->nr_subparts_cpus
) {
1517 cpumask_andnot(cs
->subparts_cpus
, cs
->subparts_cpus
,
1519 cs
->nr_subparts_cpus
= cpumask_weight(cs
->subparts_cpus
);
1521 spin_unlock_irq(&callback_lock
);
1523 update_cpumasks_hier(cs
, &tmp
);
1525 if (cs
->partition_root_state
) {
1526 struct cpuset
*parent
= parent_cs(cs
);
1529 * For partition root, update the cpumasks of sibling
1530 * cpusets if they use parent's effective_cpus.
1532 if (parent
->child_ecpus_count
)
1533 update_sibling_cpumasks(parent
, cs
, &tmp
);
1539 * Migrate memory region from one set of nodes to another. This is
1540 * performed asynchronously as it can be called from process migration path
1541 * holding locks involved in process management. All mm migrations are
1542 * performed in the queued order and can be waited for by flushing
1543 * cpuset_migrate_mm_wq.
1546 struct cpuset_migrate_mm_work
{
1547 struct work_struct work
;
1548 struct mm_struct
*mm
;
1553 static void cpuset_migrate_mm_workfn(struct work_struct
*work
)
1555 struct cpuset_migrate_mm_work
*mwork
=
1556 container_of(work
, struct cpuset_migrate_mm_work
, work
);
1558 /* on a wq worker, no need to worry about %current's mems_allowed */
1559 do_migrate_pages(mwork
->mm
, &mwork
->from
, &mwork
->to
, MPOL_MF_MOVE_ALL
);
1564 static void cpuset_migrate_mm(struct mm_struct
*mm
, const nodemask_t
*from
,
1565 const nodemask_t
*to
)
1567 struct cpuset_migrate_mm_work
*mwork
;
1569 mwork
= kzalloc(sizeof(*mwork
), GFP_KERNEL
);
1572 mwork
->from
= *from
;
1574 INIT_WORK(&mwork
->work
, cpuset_migrate_mm_workfn
);
1575 queue_work(cpuset_migrate_mm_wq
, &mwork
->work
);
1581 static void cpuset_post_attach(void)
1583 flush_workqueue(cpuset_migrate_mm_wq
);
1587 * cpuset_change_task_nodemask - change task's mems_allowed and mempolicy
1588 * @tsk: the task to change
1589 * @newmems: the new set of nodes the task's mems_allowed is set to
1591 * We use the mems_allowed_seq seqlock to safely update both tsk->mems_allowed
1592 * and rebind the task's mempolicy, if any. If the task is allocating in
1593 * parallel, it might temporarily see an empty intersection, which results in
1594 * a seqlock check and retry before OOM or allocation failure.
1596 static void cpuset_change_task_nodemask(struct task_struct
*tsk
,
1597 nodemask_t
*newmems
)
1601 local_irq_disable();
1602 write_seqcount_begin(&tsk
->mems_allowed_seq
);
1604 nodes_or(tsk
->mems_allowed
, tsk
->mems_allowed
, *newmems
);
1605 mpol_rebind_task(tsk
, newmems
);
1606 tsk
->mems_allowed
= *newmems
;
1608 write_seqcount_end(&tsk
->mems_allowed_seq
);
1614 static void *cpuset_being_rebound
;
1617 * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset.
1618 * @cs: the cpuset in which each task's mems_allowed mask needs to be changed
1620 * Iterate through each task of @cs updating its mems_allowed to the
1621 * effective cpuset's. As this function is called with cpuset_mutex held,
1622 * cpuset membership stays stable.
1624 static void update_tasks_nodemask(struct cpuset
*cs
)
1626 static nodemask_t newmems
; /* protected by cpuset_mutex */
1627 struct css_task_iter it
;
1628 struct task_struct
*task
;
1630 cpuset_being_rebound
= cs
; /* causes mpol_dup() rebind */
1632 guarantee_online_mems(cs
, &newmems
);
1635 * The mpol_rebind_mm() call takes mmap_sem, which we couldn't
1636 * take while holding tasklist_lock. Forks can happen - the
1637 * mpol_dup() cpuset_being_rebound check will catch such forks,
1638 * and rebind their vma mempolicies too. Because we still hold
1639 * the global cpuset_mutex, we know that no other rebind effort
1640 * will be contending for the global variable cpuset_being_rebound.
1641 * It's ok if we rebind the same mm twice; mpol_rebind_mm()
1642 * is idempotent. Also migrate pages in each mm to new nodes.
1644 css_task_iter_start(&cs
->css
, 0, &it
);
1645 while ((task
= css_task_iter_next(&it
))) {
1646 struct mm_struct
*mm
;
1649 cpuset_change_task_nodemask(task
, &newmems
);
1651 mm
= get_task_mm(task
);
1655 migrate
= is_memory_migrate(cs
);
1657 mpol_rebind_mm(mm
, &cs
->mems_allowed
);
1659 cpuset_migrate_mm(mm
, &cs
->old_mems_allowed
, &newmems
);
1663 css_task_iter_end(&it
);
1666 * All the tasks' nodemasks have been updated, update
1667 * cs->old_mems_allowed.
1669 cs
->old_mems_allowed
= newmems
;
1671 /* We're done rebinding vmas to this cpuset's new mems_allowed. */
1672 cpuset_being_rebound
= NULL
;
1676 * update_nodemasks_hier - Update effective nodemasks and tasks in the subtree
1677 * @cs: the cpuset to consider
1678 * @new_mems: a temp variable for calculating new effective_mems
1680 * When configured nodemask is changed, the effective nodemasks of this cpuset
1681 * and all its descendants need to be updated.
1683 * On legacy hierarchy, effective_mems will be the same as mems_allowed.
1685 * Called with cpuset_mutex held
1687 static void update_nodemasks_hier(struct cpuset
*cs
, nodemask_t
*new_mems
)
1690 struct cgroup_subsys_state
*pos_css
;
1693 cpuset_for_each_descendant_pre(cp
, pos_css
, cs
) {
1694 struct cpuset
*parent
= parent_cs(cp
);
1696 nodes_and(*new_mems
, cp
->mems_allowed
, parent
->effective_mems
);
1699 * If it becomes empty, inherit the effective mask of the
1700 * parent, which is guaranteed to have some MEMs.
1702 if (is_in_v2_mode() && nodes_empty(*new_mems
))
1703 *new_mems
= parent
->effective_mems
;
1705 /* Skip the whole subtree if the nodemask remains the same. */
1706 if (nodes_equal(*new_mems
, cp
->effective_mems
)) {
1707 pos_css
= css_rightmost_descendant(pos_css
);
1711 if (!css_tryget_online(&cp
->css
))
1715 spin_lock_irq(&callback_lock
);
1716 cp
->effective_mems
= *new_mems
;
1717 spin_unlock_irq(&callback_lock
);
1719 WARN_ON(!is_in_v2_mode() &&
1720 !nodes_equal(cp
->mems_allowed
, cp
->effective_mems
));
1722 update_tasks_nodemask(cp
);
1731 * Handle user request to change the 'mems' memory placement
1732 * of a cpuset. Needs to validate the request, update the
1733 * cpusets mems_allowed, and for each task in the cpuset,
1734 * update mems_allowed and rebind task's mempolicy and any vma
1735 * mempolicies and if the cpuset is marked 'memory_migrate',
1736 * migrate the tasks pages to the new memory.
1738 * Call with cpuset_mutex held. May take callback_lock during call.
1739 * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
1740 * lock each such tasks mm->mmap_sem, scan its vma's and rebind
1741 * their mempolicies to the cpusets new mems_allowed.
1743 static int update_nodemask(struct cpuset
*cs
, struct cpuset
*trialcs
,
1749 * top_cpuset.mems_allowed tracks node_states[N_MEMORY];
1752 if (cs
== &top_cpuset
) {
1758 * An empty mems_allowed is ok iff there are no tasks in the cpuset.
1759 * Since nodelist_parse() fails on an empty mask, we special case
1760 * that parsing. The validate_change() call ensures that cpusets
1761 * with tasks have memory.
1764 nodes_clear(trialcs
->mems_allowed
);
1766 retval
= nodelist_parse(buf
, trialcs
->mems_allowed
);
1770 if (!nodes_subset(trialcs
->mems_allowed
,
1771 top_cpuset
.mems_allowed
)) {
1777 if (nodes_equal(cs
->mems_allowed
, trialcs
->mems_allowed
)) {
1778 retval
= 0; /* Too easy - nothing to do */
1781 retval
= validate_change(cs
, trialcs
);
1785 spin_lock_irq(&callback_lock
);
1786 cs
->mems_allowed
= trialcs
->mems_allowed
;
1787 spin_unlock_irq(&callback_lock
);
1789 /* use trialcs->mems_allowed as a temp variable */
1790 update_nodemasks_hier(cs
, &trialcs
->mems_allowed
);
1795 bool current_cpuset_is_being_rebound(void)
1800 ret
= task_cs(current
) == cpuset_being_rebound
;
1806 static int update_relax_domain_level(struct cpuset
*cs
, s64 val
)
1809 if (val
< -1 || val
>= sched_domain_level_max
)
1813 if (val
!= cs
->relax_domain_level
) {
1814 cs
->relax_domain_level
= val
;
1815 if (!cpumask_empty(cs
->cpus_allowed
) &&
1816 is_sched_load_balance(cs
))
1817 rebuild_sched_domains_locked();
1824 * update_tasks_flags - update the spread flags of tasks in the cpuset.
1825 * @cs: the cpuset in which each task's spread flags needs to be changed
1827 * Iterate through each task of @cs updating its spread flags. As this
1828 * function is called with cpuset_mutex held, cpuset membership stays
1831 static void update_tasks_flags(struct cpuset
*cs
)
1833 struct css_task_iter it
;
1834 struct task_struct
*task
;
1836 css_task_iter_start(&cs
->css
, 0, &it
);
1837 while ((task
= css_task_iter_next(&it
)))
1838 cpuset_update_task_spread_flag(cs
, task
);
1839 css_task_iter_end(&it
);
1843 * update_flag - read a 0 or a 1 in a file and update associated flag
1844 * bit: the bit to update (see cpuset_flagbits_t)
1845 * cs: the cpuset to update
1846 * turning_on: whether the flag is being set or cleared
1848 * Call with cpuset_mutex held.
1851 static int update_flag(cpuset_flagbits_t bit
, struct cpuset
*cs
,
1854 struct cpuset
*trialcs
;
1855 int balance_flag_changed
;
1856 int spread_flag_changed
;
1859 trialcs
= alloc_trial_cpuset(cs
);
1864 set_bit(bit
, &trialcs
->flags
);
1866 clear_bit(bit
, &trialcs
->flags
);
1868 err
= validate_change(cs
, trialcs
);
1872 balance_flag_changed
= (is_sched_load_balance(cs
) !=
1873 is_sched_load_balance(trialcs
));
1875 spread_flag_changed
= ((is_spread_slab(cs
) != is_spread_slab(trialcs
))
1876 || (is_spread_page(cs
) != is_spread_page(trialcs
)));
1878 spin_lock_irq(&callback_lock
);
1879 cs
->flags
= trialcs
->flags
;
1880 spin_unlock_irq(&callback_lock
);
1882 if (!cpumask_empty(trialcs
->cpus_allowed
) && balance_flag_changed
)
1883 rebuild_sched_domains_locked();
1885 if (spread_flag_changed
)
1886 update_tasks_flags(cs
);
1888 free_cpuset(trialcs
);
1893 * update_prstate - update partition_root_state
1894 * cs: the cpuset to update
1895 * val: 0 - disabled, 1 - enabled
1897 * Call with cpuset_mutex held.
1899 static int update_prstate(struct cpuset
*cs
, int val
)
1902 struct cpuset
*parent
= parent_cs(cs
);
1903 struct tmpmasks tmp
;
1905 if ((val
!= 0) && (val
!= 1))
1907 if (val
== cs
->partition_root_state
)
1911 * Cannot force a partial or invalid partition root to a full
1914 if (val
&& cs
->partition_root_state
)
1917 if (alloc_cpumasks(NULL
, &tmp
))
1921 if (!cs
->partition_root_state
) {
1923 * Turning on partition root requires setting the
1924 * CS_CPU_EXCLUSIVE bit implicitly as well and cpus_allowed
1927 if (cpumask_empty(cs
->cpus_allowed
))
1930 err
= update_flag(CS_CPU_EXCLUSIVE
, cs
, 1);
1934 err
= update_parent_subparts_cpumask(cs
, partcmd_enable
,
1937 update_flag(CS_CPU_EXCLUSIVE
, cs
, 0);
1940 cs
->partition_root_state
= PRS_ENABLED
;
1943 * Turning off partition root will clear the
1944 * CS_CPU_EXCLUSIVE bit.
1946 if (cs
->partition_root_state
== PRS_ERROR
) {
1947 cs
->partition_root_state
= 0;
1948 update_flag(CS_CPU_EXCLUSIVE
, cs
, 0);
1953 err
= update_parent_subparts_cpumask(cs
, partcmd_disable
,
1958 cs
->partition_root_state
= 0;
1960 /* Turning off CS_CPU_EXCLUSIVE will not return error */
1961 update_flag(CS_CPU_EXCLUSIVE
, cs
, 0);
1965 * Update cpumask of parent's tasks except when it is the top
1966 * cpuset as some system daemons cannot be mapped to other CPUs.
1968 if (parent
!= &top_cpuset
)
1969 update_tasks_cpumask(parent
);
1971 if (parent
->child_ecpus_count
)
1972 update_sibling_cpumasks(parent
, cs
, &tmp
);
1974 rebuild_sched_domains_locked();
1976 free_cpumasks(NULL
, &tmp
);
1981 * Frequency meter - How fast is some event occurring?
1983 * These routines manage a digitally filtered, constant time based,
1984 * event frequency meter. There are four routines:
1985 * fmeter_init() - initialize a frequency meter.
1986 * fmeter_markevent() - called each time the event happens.
1987 * fmeter_getrate() - returns the recent rate of such events.
1988 * fmeter_update() - internal routine used to update fmeter.
1990 * A common data structure is passed to each of these routines,
1991 * which is used to keep track of the state required to manage the
1992 * frequency meter and its digital filter.
1994 * The filter works on the number of events marked per unit time.
1995 * The filter is single-pole low-pass recursive (IIR). The time unit
1996 * is 1 second. Arithmetic is done using 32-bit integers scaled to
1997 * simulate 3 decimal digits of precision (multiplied by 1000).
1999 * With an FM_COEF of 933, and a time base of 1 second, the filter
2000 * has a half-life of 10 seconds, meaning that if the events quit
2001 * happening, then the rate returned from the fmeter_getrate()
2002 * will be cut in half each 10 seconds, until it converges to zero.
2004 * It is not worth doing a real infinitely recursive filter. If more
2005 * than FM_MAXTICKS ticks have elapsed since the last filter event,
2006 * just compute FM_MAXTICKS ticks worth, by which point the level
2009 * Limit the count of unprocessed events to FM_MAXCNT, so as to avoid
2010 * arithmetic overflow in the fmeter_update() routine.
2012 * Given the simple 32 bit integer arithmetic used, this meter works
2013 * best for reporting rates between one per millisecond (msec) and
2014 * one per 32 (approx) seconds. At constant rates faster than one
2015 * per msec it maxes out at values just under 1,000,000. At constant
2016 * rates between one per msec, and one per second it will stabilize
2017 * to a value N*1000, where N is the rate of events per second.
2018 * At constant rates between one per second and one per 32 seconds,
2019 * it will be choppy, moving up on the seconds that have an event,
2020 * and then decaying until the next event. At rates slower than
2021 * about one in 32 seconds, it decays all the way back to zero between
2025 #define FM_COEF 933 /* coefficient for half-life of 10 secs */
2026 #define FM_MAXTICKS ((u32)99) /* useless computing more ticks than this */
2027 #define FM_MAXCNT 1000000 /* limit cnt to avoid overflow */
2028 #define FM_SCALE 1000 /* faux fixed point scale */
2030 /* Initialize a frequency meter */
2031 static void fmeter_init(struct fmeter
*fmp
)
2036 spin_lock_init(&fmp
->lock
);
2039 /* Internal meter update - process cnt events and update value */
2040 static void fmeter_update(struct fmeter
*fmp
)
2045 now
= ktime_get_seconds();
2046 ticks
= now
- fmp
->time
;
2051 ticks
= min(FM_MAXTICKS
, ticks
);
2053 fmp
->val
= (FM_COEF
* fmp
->val
) / FM_SCALE
;
2056 fmp
->val
+= ((FM_SCALE
- FM_COEF
) * fmp
->cnt
) / FM_SCALE
;
2060 /* Process any previous ticks, then bump cnt by one (times scale). */
2061 static void fmeter_markevent(struct fmeter
*fmp
)
2063 spin_lock(&fmp
->lock
);
2065 fmp
->cnt
= min(FM_MAXCNT
, fmp
->cnt
+ FM_SCALE
);
2066 spin_unlock(&fmp
->lock
);
2069 /* Process any previous ticks, then return current value. */
2070 static int fmeter_getrate(struct fmeter
*fmp
)
2074 spin_lock(&fmp
->lock
);
2077 spin_unlock(&fmp
->lock
);
2081 static struct cpuset
*cpuset_attach_old_cs
;
2083 /* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */
2084 static int cpuset_can_attach(struct cgroup_taskset
*tset
)
2086 struct cgroup_subsys_state
*css
;
2088 struct task_struct
*task
;
2091 /* used later by cpuset_attach() */
2092 cpuset_attach_old_cs
= task_cs(cgroup_taskset_first(tset
, &css
));
2095 mutex_lock(&cpuset_mutex
);
2097 /* allow moving tasks into an empty cpuset if on default hierarchy */
2099 if (!is_in_v2_mode() &&
2100 (cpumask_empty(cs
->cpus_allowed
) || nodes_empty(cs
->mems_allowed
)))
2103 cgroup_taskset_for_each(task
, css
, tset
) {
2104 ret
= task_can_attach(task
, cs
->cpus_allowed
);
2107 ret
= security_task_setscheduler(task
);
2113 * Mark attach is in progress. This makes validate_change() fail
2114 * changes which zero cpus/mems_allowed.
2116 cs
->attach_in_progress
++;
2119 mutex_unlock(&cpuset_mutex
);
2123 static void cpuset_cancel_attach(struct cgroup_taskset
*tset
)
2125 struct cgroup_subsys_state
*css
;
2127 cgroup_taskset_first(tset
, &css
);
2129 mutex_lock(&cpuset_mutex
);
2130 css_cs(css
)->attach_in_progress
--;
2131 mutex_unlock(&cpuset_mutex
);
2135 * Protected by cpuset_mutex. cpus_attach is used only by cpuset_attach()
2136 * but we can't allocate it dynamically there. Define it global and
2137 * allocate from cpuset_init().
2139 static cpumask_var_t cpus_attach
;
2141 static void cpuset_attach(struct cgroup_taskset
*tset
)
2143 /* static buf protected by cpuset_mutex */
2144 static nodemask_t cpuset_attach_nodemask_to
;
2145 struct task_struct
*task
;
2146 struct task_struct
*leader
;
2147 struct cgroup_subsys_state
*css
;
2149 struct cpuset
*oldcs
= cpuset_attach_old_cs
;
2151 cgroup_taskset_first(tset
, &css
);
2154 mutex_lock(&cpuset_mutex
);
2156 /* prepare for attach */
2157 if (cs
== &top_cpuset
)
2158 cpumask_copy(cpus_attach
, cpu_possible_mask
);
2160 guarantee_online_cpus(cs
, cpus_attach
);
2162 guarantee_online_mems(cs
, &cpuset_attach_nodemask_to
);
2164 cgroup_taskset_for_each(task
, css
, tset
) {
2166 * can_attach beforehand should guarantee that this doesn't
2167 * fail. TODO: have a better way to handle failure here
2169 WARN_ON_ONCE(set_cpus_allowed_ptr(task
, cpus_attach
));
2171 cpuset_change_task_nodemask(task
, &cpuset_attach_nodemask_to
);
2172 cpuset_update_task_spread_flag(cs
, task
);
2176 * Change mm for all threadgroup leaders. This is expensive and may
2177 * sleep and should be moved outside migration path proper.
2179 cpuset_attach_nodemask_to
= cs
->effective_mems
;
2180 cgroup_taskset_for_each_leader(leader
, css
, tset
) {
2181 struct mm_struct
*mm
= get_task_mm(leader
);
2184 mpol_rebind_mm(mm
, &cpuset_attach_nodemask_to
);
2187 * old_mems_allowed is the same as mems_allowed
2188 * here, except if this task is being moved
2189 * automatically due to hotplug. In that case
2190 * @mems_allowed has been updated and is empty, so
2191 * @old_mems_allowed is the right nodesets that we
2194 if (is_memory_migrate(cs
))
2195 cpuset_migrate_mm(mm
, &oldcs
->old_mems_allowed
,
2196 &cpuset_attach_nodemask_to
);
2202 cs
->old_mems_allowed
= cpuset_attach_nodemask_to
;
2204 cs
->attach_in_progress
--;
2205 if (!cs
->attach_in_progress
)
2206 wake_up(&cpuset_attach_wq
);
2208 mutex_unlock(&cpuset_mutex
);
2211 /* The various types of files and directories in a cpuset file system */
2214 FILE_MEMORY_MIGRATE
,
2217 FILE_EFFECTIVE_CPULIST
,
2218 FILE_EFFECTIVE_MEMLIST
,
2219 FILE_SUBPARTS_CPULIST
,
2223 FILE_SCHED_LOAD_BALANCE
,
2224 FILE_PARTITION_ROOT
,
2225 FILE_SCHED_RELAX_DOMAIN_LEVEL
,
2226 FILE_MEMORY_PRESSURE_ENABLED
,
2227 FILE_MEMORY_PRESSURE
,
2230 } cpuset_filetype_t
;
2232 static int cpuset_write_u64(struct cgroup_subsys_state
*css
, struct cftype
*cft
,
2235 struct cpuset
*cs
= css_cs(css
);
2236 cpuset_filetype_t type
= cft
->private;
2239 mutex_lock(&cpuset_mutex
);
2240 if (!is_cpuset_online(cs
)) {
2246 case FILE_CPU_EXCLUSIVE
:
2247 retval
= update_flag(CS_CPU_EXCLUSIVE
, cs
, val
);
2249 case FILE_MEM_EXCLUSIVE
:
2250 retval
= update_flag(CS_MEM_EXCLUSIVE
, cs
, val
);
2252 case FILE_MEM_HARDWALL
:
2253 retval
= update_flag(CS_MEM_HARDWALL
, cs
, val
);
2255 case FILE_SCHED_LOAD_BALANCE
:
2256 retval
= update_flag(CS_SCHED_LOAD_BALANCE
, cs
, val
);
2258 case FILE_MEMORY_MIGRATE
:
2259 retval
= update_flag(CS_MEMORY_MIGRATE
, cs
, val
);
2261 case FILE_MEMORY_PRESSURE_ENABLED
:
2262 cpuset_memory_pressure_enabled
= !!val
;
2264 case FILE_SPREAD_PAGE
:
2265 retval
= update_flag(CS_SPREAD_PAGE
, cs
, val
);
2267 case FILE_SPREAD_SLAB
:
2268 retval
= update_flag(CS_SPREAD_SLAB
, cs
, val
);
2275 mutex_unlock(&cpuset_mutex
);
2279 static int cpuset_write_s64(struct cgroup_subsys_state
*css
, struct cftype
*cft
,
2282 struct cpuset
*cs
= css_cs(css
);
2283 cpuset_filetype_t type
= cft
->private;
2284 int retval
= -ENODEV
;
2286 mutex_lock(&cpuset_mutex
);
2287 if (!is_cpuset_online(cs
))
2291 case FILE_SCHED_RELAX_DOMAIN_LEVEL
:
2292 retval
= update_relax_domain_level(cs
, val
);
2299 mutex_unlock(&cpuset_mutex
);
2304 * Common handling for a write to a "cpus" or "mems" file.
2306 static ssize_t
cpuset_write_resmask(struct kernfs_open_file
*of
,
2307 char *buf
, size_t nbytes
, loff_t off
)
2309 struct cpuset
*cs
= css_cs(of_css(of
));
2310 struct cpuset
*trialcs
;
2311 int retval
= -ENODEV
;
2313 buf
= strstrip(buf
);
2316 * CPU or memory hotunplug may leave @cs w/o any execution
2317 * resources, in which case the hotplug code asynchronously updates
2318 * configuration and transfers all tasks to the nearest ancestor
2319 * which can execute.
2321 * As writes to "cpus" or "mems" may restore @cs's execution
2322 * resources, wait for the previously scheduled operations before
2323 * proceeding, so that we don't keep removing tasks added
2324 * after execution capability is restored.
2326 * cpuset_hotplug_work calls back into cgroup core via
2327 * cgroup_transfer_tasks() and waiting for it from a cgroupfs
2328 * operation like this one can lead to a deadlock through kernfs
2329 * active_ref protection. Let's break the protection. Losing the
2330 * protection is okay as we check whether @cs is online after
2331 * grabbing cpuset_mutex anyway. This only happens on the legacy
2335 kernfs_break_active_protection(of
->kn
);
2336 flush_work(&cpuset_hotplug_work
);
2338 mutex_lock(&cpuset_mutex
);
2339 if (!is_cpuset_online(cs
))
2342 trialcs
= alloc_trial_cpuset(cs
);
2348 switch (of_cft(of
)->private) {
2350 retval
= update_cpumask(cs
, trialcs
, buf
);
2353 retval
= update_nodemask(cs
, trialcs
, buf
);
2360 free_cpuset(trialcs
);
2362 mutex_unlock(&cpuset_mutex
);
2363 kernfs_unbreak_active_protection(of
->kn
);
2365 flush_workqueue(cpuset_migrate_mm_wq
);
2366 return retval
?: nbytes
;
2370 * These ascii lists should be read in a single call, by using a user
2371 * buffer large enough to hold the entire map. If read in smaller
2372 * chunks, there is no guarantee of atomicity. Since the display format
2373 * used, list of ranges of sequential numbers, is variable length,
2374 * and since these maps can change value dynamically, one could read
2375 * gibberish by doing partial reads while a list was changing.
2377 static int cpuset_common_seq_show(struct seq_file
*sf
, void *v
)
2379 struct cpuset
*cs
= css_cs(seq_css(sf
));
2380 cpuset_filetype_t type
= seq_cft(sf
)->private;
2383 spin_lock_irq(&callback_lock
);
2387 seq_printf(sf
, "%*pbl\n", cpumask_pr_args(cs
->cpus_allowed
));
2390 seq_printf(sf
, "%*pbl\n", nodemask_pr_args(&cs
->mems_allowed
));
2392 case FILE_EFFECTIVE_CPULIST
:
2393 seq_printf(sf
, "%*pbl\n", cpumask_pr_args(cs
->effective_cpus
));
2395 case FILE_EFFECTIVE_MEMLIST
:
2396 seq_printf(sf
, "%*pbl\n", nodemask_pr_args(&cs
->effective_mems
));
2398 case FILE_SUBPARTS_CPULIST
:
2399 seq_printf(sf
, "%*pbl\n", cpumask_pr_args(cs
->subparts_cpus
));
2405 spin_unlock_irq(&callback_lock
);
2409 static u64
cpuset_read_u64(struct cgroup_subsys_state
*css
, struct cftype
*cft
)
2411 struct cpuset
*cs
= css_cs(css
);
2412 cpuset_filetype_t type
= cft
->private;
2414 case FILE_CPU_EXCLUSIVE
:
2415 return is_cpu_exclusive(cs
);
2416 case FILE_MEM_EXCLUSIVE
:
2417 return is_mem_exclusive(cs
);
2418 case FILE_MEM_HARDWALL
:
2419 return is_mem_hardwall(cs
);
2420 case FILE_SCHED_LOAD_BALANCE
:
2421 return is_sched_load_balance(cs
);
2422 case FILE_MEMORY_MIGRATE
:
2423 return is_memory_migrate(cs
);
2424 case FILE_MEMORY_PRESSURE_ENABLED
:
2425 return cpuset_memory_pressure_enabled
;
2426 case FILE_MEMORY_PRESSURE
:
2427 return fmeter_getrate(&cs
->fmeter
);
2428 case FILE_SPREAD_PAGE
:
2429 return is_spread_page(cs
);
2430 case FILE_SPREAD_SLAB
:
2431 return is_spread_slab(cs
);
2436 /* Unreachable but makes gcc happy */
2440 static s64
cpuset_read_s64(struct cgroup_subsys_state
*css
, struct cftype
*cft
)
2442 struct cpuset
*cs
= css_cs(css
);
2443 cpuset_filetype_t type
= cft
->private;
2445 case FILE_SCHED_RELAX_DOMAIN_LEVEL
:
2446 return cs
->relax_domain_level
;
2451 /* Unrechable but makes gcc happy */
2455 static int sched_partition_show(struct seq_file
*seq
, void *v
)
2457 struct cpuset
*cs
= css_cs(seq_css(seq
));
2459 switch (cs
->partition_root_state
) {
2461 seq_puts(seq
, "root\n");
2464 seq_puts(seq
, "member\n");
2467 seq_puts(seq
, "root invalid\n");
2473 static ssize_t
sched_partition_write(struct kernfs_open_file
*of
, char *buf
,
2474 size_t nbytes
, loff_t off
)
2476 struct cpuset
*cs
= css_cs(of_css(of
));
2478 int retval
= -ENODEV
;
2480 buf
= strstrip(buf
);
2483 * Convert "root" to ENABLED, and convert "member" to DISABLED.
2485 if (!strcmp(buf
, "root"))
2487 else if (!strcmp(buf
, "member"))
2493 mutex_lock(&cpuset_mutex
);
2494 if (!is_cpuset_online(cs
))
2497 retval
= update_prstate(cs
, val
);
2499 mutex_unlock(&cpuset_mutex
);
2501 return retval
?: nbytes
;
2505 * for the common functions, 'private' gives the type of file
2508 static struct cftype legacy_files
[] = {
2511 .seq_show
= cpuset_common_seq_show
,
2512 .write
= cpuset_write_resmask
,
2513 .max_write_len
= (100U + 6 * NR_CPUS
),
2514 .private = FILE_CPULIST
,
2519 .seq_show
= cpuset_common_seq_show
,
2520 .write
= cpuset_write_resmask
,
2521 .max_write_len
= (100U + 6 * MAX_NUMNODES
),
2522 .private = FILE_MEMLIST
,
2526 .name
= "effective_cpus",
2527 .seq_show
= cpuset_common_seq_show
,
2528 .private = FILE_EFFECTIVE_CPULIST
,
2532 .name
= "effective_mems",
2533 .seq_show
= cpuset_common_seq_show
,
2534 .private = FILE_EFFECTIVE_MEMLIST
,
2538 .name
= "cpu_exclusive",
2539 .read_u64
= cpuset_read_u64
,
2540 .write_u64
= cpuset_write_u64
,
2541 .private = FILE_CPU_EXCLUSIVE
,
2545 .name
= "mem_exclusive",
2546 .read_u64
= cpuset_read_u64
,
2547 .write_u64
= cpuset_write_u64
,
2548 .private = FILE_MEM_EXCLUSIVE
,
2552 .name
= "mem_hardwall",
2553 .read_u64
= cpuset_read_u64
,
2554 .write_u64
= cpuset_write_u64
,
2555 .private = FILE_MEM_HARDWALL
,
2559 .name
= "sched_load_balance",
2560 .read_u64
= cpuset_read_u64
,
2561 .write_u64
= cpuset_write_u64
,
2562 .private = FILE_SCHED_LOAD_BALANCE
,
2566 .name
= "sched_relax_domain_level",
2567 .read_s64
= cpuset_read_s64
,
2568 .write_s64
= cpuset_write_s64
,
2569 .private = FILE_SCHED_RELAX_DOMAIN_LEVEL
,
2573 .name
= "memory_migrate",
2574 .read_u64
= cpuset_read_u64
,
2575 .write_u64
= cpuset_write_u64
,
2576 .private = FILE_MEMORY_MIGRATE
,
2580 .name
= "memory_pressure",
2581 .read_u64
= cpuset_read_u64
,
2582 .private = FILE_MEMORY_PRESSURE
,
2586 .name
= "memory_spread_page",
2587 .read_u64
= cpuset_read_u64
,
2588 .write_u64
= cpuset_write_u64
,
2589 .private = FILE_SPREAD_PAGE
,
2593 .name
= "memory_spread_slab",
2594 .read_u64
= cpuset_read_u64
,
2595 .write_u64
= cpuset_write_u64
,
2596 .private = FILE_SPREAD_SLAB
,
2600 .name
= "memory_pressure_enabled",
2601 .flags
= CFTYPE_ONLY_ON_ROOT
,
2602 .read_u64
= cpuset_read_u64
,
2603 .write_u64
= cpuset_write_u64
,
2604 .private = FILE_MEMORY_PRESSURE_ENABLED
,
2611 * This is currently a minimal set for the default hierarchy. It can be
2612 * expanded later on by migrating more features and control files from v1.
2614 static struct cftype dfl_files
[] = {
2617 .seq_show
= cpuset_common_seq_show
,
2618 .write
= cpuset_write_resmask
,
2619 .max_write_len
= (100U + 6 * NR_CPUS
),
2620 .private = FILE_CPULIST
,
2621 .flags
= CFTYPE_NOT_ON_ROOT
,
2626 .seq_show
= cpuset_common_seq_show
,
2627 .write
= cpuset_write_resmask
,
2628 .max_write_len
= (100U + 6 * MAX_NUMNODES
),
2629 .private = FILE_MEMLIST
,
2630 .flags
= CFTYPE_NOT_ON_ROOT
,
2634 .name
= "cpus.effective",
2635 .seq_show
= cpuset_common_seq_show
,
2636 .private = FILE_EFFECTIVE_CPULIST
,
2640 .name
= "mems.effective",
2641 .seq_show
= cpuset_common_seq_show
,
2642 .private = FILE_EFFECTIVE_MEMLIST
,
2646 .name
= "cpus.partition",
2647 .seq_show
= sched_partition_show
,
2648 .write
= sched_partition_write
,
2649 .private = FILE_PARTITION_ROOT
,
2650 .flags
= CFTYPE_NOT_ON_ROOT
,
2654 .name
= "cpus.subpartitions",
2655 .seq_show
= cpuset_common_seq_show
,
2656 .private = FILE_SUBPARTS_CPULIST
,
2657 .flags
= CFTYPE_DEBUG
,
2665 * cpuset_css_alloc - allocate a cpuset css
2666 * cgrp: control group that the new cpuset will be part of
2669 static struct cgroup_subsys_state
*
2670 cpuset_css_alloc(struct cgroup_subsys_state
*parent_css
)
2675 return &top_cpuset
.css
;
2677 cs
= kzalloc(sizeof(*cs
), GFP_KERNEL
);
2679 return ERR_PTR(-ENOMEM
);
2681 if (alloc_cpumasks(cs
, NULL
)) {
2683 return ERR_PTR(-ENOMEM
);
2686 set_bit(CS_SCHED_LOAD_BALANCE
, &cs
->flags
);
2687 nodes_clear(cs
->mems_allowed
);
2688 nodes_clear(cs
->effective_mems
);
2689 fmeter_init(&cs
->fmeter
);
2690 cs
->relax_domain_level
= -1;
2695 static int cpuset_css_online(struct cgroup_subsys_state
*css
)
2697 struct cpuset
*cs
= css_cs(css
);
2698 struct cpuset
*parent
= parent_cs(cs
);
2699 struct cpuset
*tmp_cs
;
2700 struct cgroup_subsys_state
*pos_css
;
2705 mutex_lock(&cpuset_mutex
);
2707 set_bit(CS_ONLINE
, &cs
->flags
);
2708 if (is_spread_page(parent
))
2709 set_bit(CS_SPREAD_PAGE
, &cs
->flags
);
2710 if (is_spread_slab(parent
))
2711 set_bit(CS_SPREAD_SLAB
, &cs
->flags
);
2715 spin_lock_irq(&callback_lock
);
2716 if (is_in_v2_mode()) {
2717 cpumask_copy(cs
->effective_cpus
, parent
->effective_cpus
);
2718 cs
->effective_mems
= parent
->effective_mems
;
2719 cs
->use_parent_ecpus
= true;
2720 parent
->child_ecpus_count
++;
2722 spin_unlock_irq(&callback_lock
);
2724 if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN
, &css
->cgroup
->flags
))
2728 * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is
2729 * set. This flag handling is implemented in cgroup core for
2730 * histrical reasons - the flag may be specified during mount.
2732 * Currently, if any sibling cpusets have exclusive cpus or mem, we
2733 * refuse to clone the configuration - thereby refusing the task to
2734 * be entered, and as a result refusing the sys_unshare() or
2735 * clone() which initiated it. If this becomes a problem for some
2736 * users who wish to allow that scenario, then this could be
2737 * changed to grant parent->cpus_allowed-sibling_cpus_exclusive
2738 * (and likewise for mems) to the new cgroup.
2741 cpuset_for_each_child(tmp_cs
, pos_css
, parent
) {
2742 if (is_mem_exclusive(tmp_cs
) || is_cpu_exclusive(tmp_cs
)) {
2749 spin_lock_irq(&callback_lock
);
2750 cs
->mems_allowed
= parent
->mems_allowed
;
2751 cs
->effective_mems
= parent
->mems_allowed
;
2752 cpumask_copy(cs
->cpus_allowed
, parent
->cpus_allowed
);
2753 cpumask_copy(cs
->effective_cpus
, parent
->cpus_allowed
);
2754 spin_unlock_irq(&callback_lock
);
2756 mutex_unlock(&cpuset_mutex
);
2761 * If the cpuset being removed has its flag 'sched_load_balance'
2762 * enabled, then simulate turning sched_load_balance off, which
2763 * will call rebuild_sched_domains_locked(). That is not needed
2764 * in the default hierarchy where only changes in partition
2765 * will cause repartitioning.
2767 * If the cpuset has the 'sched.partition' flag enabled, simulate
2768 * turning 'sched.partition" off.
2771 static void cpuset_css_offline(struct cgroup_subsys_state
*css
)
2773 struct cpuset
*cs
= css_cs(css
);
2775 mutex_lock(&cpuset_mutex
);
2777 if (is_partition_root(cs
))
2778 update_prstate(cs
, 0);
2780 if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys
) &&
2781 is_sched_load_balance(cs
))
2782 update_flag(CS_SCHED_LOAD_BALANCE
, cs
, 0);
2784 if (cs
->use_parent_ecpus
) {
2785 struct cpuset
*parent
= parent_cs(cs
);
2787 cs
->use_parent_ecpus
= false;
2788 parent
->child_ecpus_count
--;
2792 clear_bit(CS_ONLINE
, &cs
->flags
);
2794 mutex_unlock(&cpuset_mutex
);
/* Release the cpuset that embeds @css. */
static void cpuset_css_free(struct cgroup_subsys_state *css)
{
	struct cpuset *cs = css_cs(css);

	free_cpuset(cs);
}
2804 static void cpuset_bind(struct cgroup_subsys_state
*root_css
)
2806 mutex_lock(&cpuset_mutex
);
2807 spin_lock_irq(&callback_lock
);
2809 if (is_in_v2_mode()) {
2810 cpumask_copy(top_cpuset
.cpus_allowed
, cpu_possible_mask
);
2811 top_cpuset
.mems_allowed
= node_possible_map
;
2813 cpumask_copy(top_cpuset
.cpus_allowed
,
2814 top_cpuset
.effective_cpus
);
2815 top_cpuset
.mems_allowed
= top_cpuset
.effective_mems
;
2818 spin_unlock_irq(&callback_lock
);
2819 mutex_unlock(&cpuset_mutex
);
2823 * Make sure the new task conform to the current state of its parent,
2824 * which could have been changed by cpuset just after it inherits the
2825 * state from the parent and before it sits on the cgroup's task list.
2827 static void cpuset_fork(struct task_struct
*task
)
2829 if (task_css_is_root(task
, cpuset_cgrp_id
))
2832 set_cpus_allowed_ptr(task
, ¤t
->cpus_allowed
);
2833 task
->mems_allowed
= current
->mems_allowed
;
2836 struct cgroup_subsys cpuset_cgrp_subsys
= {
2837 .css_alloc
= cpuset_css_alloc
,
2838 .css_online
= cpuset_css_online
,
2839 .css_offline
= cpuset_css_offline
,
2840 .css_free
= cpuset_css_free
,
2841 .can_attach
= cpuset_can_attach
,
2842 .cancel_attach
= cpuset_cancel_attach
,
2843 .attach
= cpuset_attach
,
2844 .post_attach
= cpuset_post_attach
,
2845 .bind
= cpuset_bind
,
2846 .fork
= cpuset_fork
,
2847 .legacy_cftypes
= legacy_files
,
2848 .dfl_cftypes
= dfl_files
,
2854 * cpuset_init - initialize cpusets at system boot
2856 * Description: Initialize top_cpuset and the cpuset internal file system,
2859 int __init
cpuset_init(void)
2863 BUG_ON(!alloc_cpumask_var(&top_cpuset
.cpus_allowed
, GFP_KERNEL
));
2864 BUG_ON(!alloc_cpumask_var(&top_cpuset
.effective_cpus
, GFP_KERNEL
));
2865 BUG_ON(!zalloc_cpumask_var(&top_cpuset
.subparts_cpus
, GFP_KERNEL
));
2867 cpumask_setall(top_cpuset
.cpus_allowed
);
2868 nodes_setall(top_cpuset
.mems_allowed
);
2869 cpumask_setall(top_cpuset
.effective_cpus
);
2870 nodes_setall(top_cpuset
.effective_mems
);
2872 fmeter_init(&top_cpuset
.fmeter
);
2873 set_bit(CS_SCHED_LOAD_BALANCE
, &top_cpuset
.flags
);
2874 top_cpuset
.relax_domain_level
= -1;
2876 err
= register_filesystem(&cpuset_fs_type
);
2880 BUG_ON(!alloc_cpumask_var(&cpus_attach
, GFP_KERNEL
));
2886 * If CPU and/or memory hotplug handlers, below, unplug any CPUs
2887 * or memory nodes, we need to walk over the cpuset hierarchy,
2888 * removing that CPU or node from all cpusets. If this removes the
2889 * last CPU or node from a cpuset, then move the tasks in the empty
2890 * cpuset to its next-highest non-empty parent.
2892 static void remove_tasks_in_empty_cpuset(struct cpuset
*cs
)
2894 struct cpuset
*parent
;
2897 * Find its next-highest non-empty parent, (top cpuset
2898 * has online cpus, so can't be empty).
2900 parent
= parent_cs(cs
);
2901 while (cpumask_empty(parent
->cpus_allowed
) ||
2902 nodes_empty(parent
->mems_allowed
))
2903 parent
= parent_cs(parent
);
2905 if (cgroup_transfer_tasks(parent
->css
.cgroup
, cs
->css
.cgroup
)) {
2906 pr_err("cpuset: failed to transfer tasks out of empty cpuset ");
2907 pr_cont_cgroup_name(cs
->css
.cgroup
);
2913 hotplug_update_tasks_legacy(struct cpuset
*cs
,
2914 struct cpumask
*new_cpus
, nodemask_t
*new_mems
,
2915 bool cpus_updated
, bool mems_updated
)
2919 spin_lock_irq(&callback_lock
);
2920 cpumask_copy(cs
->cpus_allowed
, new_cpus
);
2921 cpumask_copy(cs
->effective_cpus
, new_cpus
);
2922 cs
->mems_allowed
= *new_mems
;
2923 cs
->effective_mems
= *new_mems
;
2924 spin_unlock_irq(&callback_lock
);
2927 * Don't call update_tasks_cpumask() if the cpuset becomes empty,
2928 * as the tasks will be migratecd to an ancestor.
2930 if (cpus_updated
&& !cpumask_empty(cs
->cpus_allowed
))
2931 update_tasks_cpumask(cs
);
2932 if (mems_updated
&& !nodes_empty(cs
->mems_allowed
))
2933 update_tasks_nodemask(cs
);
2935 is_empty
= cpumask_empty(cs
->cpus_allowed
) ||
2936 nodes_empty(cs
->mems_allowed
);
2938 mutex_unlock(&cpuset_mutex
);
2941 * Move tasks to the nearest ancestor with execution resources,
2942 * This is full cgroup operation which will also call back into
2943 * cpuset. Should be done outside any lock.
2946 remove_tasks_in_empty_cpuset(cs
);
2948 mutex_lock(&cpuset_mutex
);
2952 hotplug_update_tasks(struct cpuset
*cs
,
2953 struct cpumask
*new_cpus
, nodemask_t
*new_mems
,
2954 bool cpus_updated
, bool mems_updated
)
2956 if (cpumask_empty(new_cpus
))
2957 cpumask_copy(new_cpus
, parent_cs(cs
)->effective_cpus
);
2958 if (nodes_empty(*new_mems
))
2959 *new_mems
= parent_cs(cs
)->effective_mems
;
2961 spin_lock_irq(&callback_lock
);
2962 cpumask_copy(cs
->effective_cpus
, new_cpus
);
2963 cs
->effective_mems
= *new_mems
;
2964 spin_unlock_irq(&callback_lock
);
2967 update_tasks_cpumask(cs
);
2969 update_tasks_nodemask(cs
);
2972 static bool force_rebuild
;
2974 void cpuset_force_rebuild(void)
2976 force_rebuild
= true;
2980 * cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug
2981 * @cs: cpuset in interest
2982 * @tmp: the tmpmasks structure pointer
2984 * Compare @cs's cpu and mem masks against top_cpuset and if some have gone
2985 * offline, update @cs accordingly. If @cs ends up with no CPU or memory,
2986 * all its tasks are moved to the nearest ancestor with both resources.
2988 static void cpuset_hotplug_update_tasks(struct cpuset
*cs
, struct tmpmasks
*tmp
)
2990 static cpumask_t new_cpus
;
2991 static nodemask_t new_mems
;
2994 struct cpuset
*parent
;
2996 wait_event(cpuset_attach_wq
, cs
->attach_in_progress
== 0);
2998 mutex_lock(&cpuset_mutex
);
3001 * We have raced with task attaching. We wait until attaching
3002 * is finished, so we won't attach a task to an empty cpuset.
3004 if (cs
->attach_in_progress
) {
3005 mutex_unlock(&cpuset_mutex
);
3009 parent
= parent_cs(cs
);
3010 compute_effective_cpumask(&new_cpus
, cs
, parent
);
3011 nodes_and(new_mems
, cs
->mems_allowed
, parent
->effective_mems
);
3013 if (cs
->nr_subparts_cpus
)
3015 * Make sure that CPUs allocated to child partitions
3016 * do not show up in effective_cpus.
3018 cpumask_andnot(&new_cpus
, &new_cpus
, cs
->subparts_cpus
);
3020 if (!tmp
|| !cs
->partition_root_state
)
3024 * In the unlikely event that a partition root has empty
3025 * effective_cpus or its parent becomes erroneous, we have to
3026 * transition it to the erroneous state.
3028 if (is_partition_root(cs
) && (cpumask_empty(&new_cpus
) ||
3029 (parent
->partition_root_state
== PRS_ERROR
))) {
3030 if (cs
->nr_subparts_cpus
) {
3031 cs
->nr_subparts_cpus
= 0;
3032 cpumask_clear(cs
->subparts_cpus
);
3033 compute_effective_cpumask(&new_cpus
, cs
, parent
);
3037 * If the effective_cpus is empty because the child
3038 * partitions take away all the CPUs, we can keep
3039 * the current partition and let the child partitions
3040 * fight for available CPUs.
3042 if ((parent
->partition_root_state
== PRS_ERROR
) ||
3043 cpumask_empty(&new_cpus
)) {
3044 update_parent_subparts_cpumask(cs
, partcmd_disable
,
3046 cs
->partition_root_state
= PRS_ERROR
;
3048 cpuset_force_rebuild();
3052 * On the other hand, an erroneous partition root may be transitioned
3053 * back to a regular one or a partition root with no CPU allocated
3054 * from the parent may change to erroneous.
3056 if (is_partition_root(parent
) &&
3057 ((cs
->partition_root_state
== PRS_ERROR
) ||
3058 !cpumask_intersects(&new_cpus
, parent
->subparts_cpus
)) &&
3059 update_parent_subparts_cpumask(cs
, partcmd_update
, NULL
, tmp
))
3060 cpuset_force_rebuild();
3063 cpus_updated
= !cpumask_equal(&new_cpus
, cs
->effective_cpus
);
3064 mems_updated
= !nodes_equal(new_mems
, cs
->effective_mems
);
3066 if (is_in_v2_mode())
3067 hotplug_update_tasks(cs
, &new_cpus
, &new_mems
,
3068 cpus_updated
, mems_updated
);
3070 hotplug_update_tasks_legacy(cs
, &new_cpus
, &new_mems
,
3071 cpus_updated
, mems_updated
);
3073 mutex_unlock(&cpuset_mutex
);
3077 * cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset
3079 * This function is called after either CPU or memory configuration has
3080 * changed and updates cpuset accordingly. The top_cpuset is always
3081 * synchronized to cpu_active_mask and N_MEMORY, which is necessary in
3082 * order to make cpusets transparent (of no affect) on systems that are
3083 * actively using CPU hotplug but making no active use of cpusets.
3085 * Non-root cpusets are only affected by offlining. If any CPUs or memory
3086 * nodes have been taken down, cpuset_hotplug_update_tasks() is invoked on
3089 * Note that CPU offlining during suspend is ignored. We don't modify
3090 * cpusets across suspend/resume cycles at all.
3092 static void cpuset_hotplug_workfn(struct work_struct
*work
)
3094 static cpumask_t new_cpus
;
3095 static nodemask_t new_mems
;
3096 bool cpus_updated
, mems_updated
;
3097 bool on_dfl
= is_in_v2_mode();
3098 struct tmpmasks tmp
, *ptmp
= NULL
;
3100 if (on_dfl
&& !alloc_cpumasks(NULL
, &tmp
))
3103 mutex_lock(&cpuset_mutex
);
3105 /* fetch the available cpus/mems and find out which changed how */
3106 cpumask_copy(&new_cpus
, cpu_active_mask
);
3107 new_mems
= node_states
[N_MEMORY
];
3110 * If subparts_cpus is populated, it is likely that the check below
3111 * will produce a false positive on cpus_updated when the cpu list
3112 * isn't changed. It is extra work, but it is better to be safe.
3114 cpus_updated
= !cpumask_equal(top_cpuset
.effective_cpus
, &new_cpus
);
3115 mems_updated
= !nodes_equal(top_cpuset
.effective_mems
, new_mems
);
3117 /* synchronize cpus_allowed to cpu_active_mask */
3119 spin_lock_irq(&callback_lock
);
3121 cpumask_copy(top_cpuset
.cpus_allowed
, &new_cpus
);
3123 * Make sure that CPUs allocated to child partitions
3124 * do not show up in effective_cpus. If no CPU is left,
3125 * we clear the subparts_cpus & let the child partitions
3126 * fight for the CPUs again.
3128 if (top_cpuset
.nr_subparts_cpus
) {
3129 if (cpumask_subset(&new_cpus
,
3130 top_cpuset
.subparts_cpus
)) {
3131 top_cpuset
.nr_subparts_cpus
= 0;
3132 cpumask_clear(top_cpuset
.subparts_cpus
);
3134 cpumask_andnot(&new_cpus
, &new_cpus
,
3135 top_cpuset
.subparts_cpus
);
3138 cpumask_copy(top_cpuset
.effective_cpus
, &new_cpus
);
3139 spin_unlock_irq(&callback_lock
);
3140 /* we don't mess with cpumasks of tasks in top_cpuset */
3143 /* synchronize mems_allowed to N_MEMORY */
3145 spin_lock_irq(&callback_lock
);
3147 top_cpuset
.mems_allowed
= new_mems
;
3148 top_cpuset
.effective_mems
= new_mems
;
3149 spin_unlock_irq(&callback_lock
);
3150 update_tasks_nodemask(&top_cpuset
);
3153 mutex_unlock(&cpuset_mutex
);
3155 /* if cpus or mems changed, we need to propagate to descendants */
3156 if (cpus_updated
|| mems_updated
) {
3158 struct cgroup_subsys_state
*pos_css
;
3161 cpuset_for_each_descendant_pre(cs
, pos_css
, &top_cpuset
) {
3162 if (cs
== &top_cpuset
|| !css_tryget_online(&cs
->css
))
3166 cpuset_hotplug_update_tasks(cs
, ptmp
);
3174 /* rebuild sched domains if cpus_allowed has changed */
3175 if (cpus_updated
|| force_rebuild
) {
3176 force_rebuild
= false;
3177 rebuild_sched_domains();
3180 free_cpumasks(NULL
, ptmp
);
3183 void cpuset_update_active_cpus(void)
3186 * We're inside cpu hotplug critical region which usually nests
3187 * inside cgroup synchronization. Bounce actual hotplug processing
3188 * to a work item to avoid reverse locking order.
3190 schedule_work(&cpuset_hotplug_work
);
3193 void cpuset_wait_for_hotplug(void)
3195 flush_work(&cpuset_hotplug_work
);
3199 * Keep top_cpuset.mems_allowed tracking node_states[N_MEMORY].
3200 * Call this routine anytime after node_states[N_MEMORY] changes.
3201 * See cpuset_update_active_cpus() for CPU hotplug handling.
3203 static int cpuset_track_online_nodes(struct notifier_block
*self
,
3204 unsigned long action
, void *arg
)
3206 schedule_work(&cpuset_hotplug_work
);
3210 static struct notifier_block cpuset_track_online_nodes_nb
= {
3211 .notifier_call
= cpuset_track_online_nodes
,
3212 .priority
= 10, /* ??! */
3216 * cpuset_init_smp - initialize cpus_allowed
3218 * Description: Finish top cpuset after cpu, node maps are initialized
3220 void __init
cpuset_init_smp(void)
3222 cpumask_copy(top_cpuset
.cpus_allowed
, cpu_active_mask
);
3223 top_cpuset
.mems_allowed
= node_states
[N_MEMORY
];
3224 top_cpuset
.old_mems_allowed
= top_cpuset
.mems_allowed
;
3226 cpumask_copy(top_cpuset
.effective_cpus
, cpu_active_mask
);
3227 top_cpuset
.effective_mems
= node_states
[N_MEMORY
];
3229 register_hotmemory_notifier(&cpuset_track_online_nodes_nb
);
3231 cpuset_migrate_mm_wq
= alloc_ordered_workqueue("cpuset_migrate_mm", 0);
3232 BUG_ON(!cpuset_migrate_mm_wq
);
3236 * cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset.
3237 * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed.
3238 * @pmask: pointer to struct cpumask variable to receive cpus_allowed set.
3240 * Description: Returns the cpumask_var_t cpus_allowed of the cpuset
3241 * attached to the specified @tsk. Guaranteed to return some non-empty
3242 * subset of cpu_online_mask, even if this means going outside the
3246 void cpuset_cpus_allowed(struct task_struct
*tsk
, struct cpumask
*pmask
)
3248 unsigned long flags
;
3250 spin_lock_irqsave(&callback_lock
, flags
);
3252 guarantee_online_cpus(task_cs(tsk
), pmask
);
3254 spin_unlock_irqrestore(&callback_lock
, flags
);
3258 * cpuset_cpus_allowed_fallback - final fallback before complete catastrophe.
3259 * @tsk: pointer to task_struct with which the scheduler is struggling
3261 * Description: In the case that the scheduler cannot find an allowed cpu in
3262 * tsk->cpus_allowed, we fall back to task_cs(tsk)->cpus_allowed. In legacy
3263 * mode however, this value is the same as task_cs(tsk)->effective_cpus,
3264 * which will not contain a sane cpumask during cases such as cpu hotplugging.
3265 * This is the absolute last resort for the scheduler and it is only used if
3266 * _every_ other avenue has been traveled.
3269 void cpuset_cpus_allowed_fallback(struct task_struct
*tsk
)
3272 do_set_cpus_allowed(tsk
, is_in_v2_mode() ?
3273 task_cs(tsk
)->cpus_allowed
: cpu_possible_mask
);
3277 * We own tsk->cpus_allowed, nobody can change it under us.
3279 * But we used cs && cs->cpus_allowed lockless and thus can
3280 * race with cgroup_attach_task() or update_cpumask() and get
3281 * the wrong tsk->cpus_allowed. However, both cases imply the
3282 * subsequent cpuset_change_cpumask()->set_cpus_allowed_ptr()
3283 * which takes task_rq_lock().
3285 * If we are called after it dropped the lock we must see all
3286 * changes in tsk_cs()->cpus_allowed. Otherwise we can temporary
3287 * set any mask even if it is not right from task_cs() pov,
3288 * the pending set_cpus_allowed_ptr() will fix things.
3290 * select_fallback_rq() will fix things ups and set cpu_possible_mask
3295 void __init
cpuset_init_current_mems_allowed(void)
3297 nodes_setall(current
->mems_allowed
);
3301 * cpuset_mems_allowed - return mems_allowed mask from a tasks cpuset.
3302 * @tsk: pointer to task_struct from which to obtain cpuset->mems_allowed.
3304 * Description: Returns the nodemask_t mems_allowed of the cpuset
3305 * attached to the specified @tsk. Guaranteed to return some non-empty
3306 * subset of node_states[N_MEMORY], even if this means going outside the
3310 nodemask_t
cpuset_mems_allowed(struct task_struct
*tsk
)
3313 unsigned long flags
;
3315 spin_lock_irqsave(&callback_lock
, flags
);
3317 guarantee_online_mems(task_cs(tsk
), &mask
);
3319 spin_unlock_irqrestore(&callback_lock
, flags
);
3325 * cpuset_nodemask_valid_mems_allowed - check nodemask vs. curremt mems_allowed
3326 * @nodemask: the nodemask to be checked
3328 * Are any of the nodes in the nodemask allowed in current->mems_allowed?
3330 int cpuset_nodemask_valid_mems_allowed(nodemask_t
*nodemask
)
3332 return nodes_intersects(*nodemask
, current
->mems_allowed
);
/*
 * nearest_hardwall_ancestor() - Returns the nearest mem_exclusive or
 * mem_hardwall ancestor to the specified cpuset.  Call holding
 * callback_lock.  If no ancestor is mem_exclusive or mem_hardwall
 * (an unusual configuration), then returns the root cpuset.
 */
static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
{
	while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && parent_cs(cs))
		cs = parent_cs(cs);
	return cs;
}
3349 * cpuset_node_allowed - Can we allocate on a memory node?
3350 * @node: is this an allowed node?
3351 * @gfp_mask: memory allocation flags
3353 * If we're in interrupt, yes, we can always allocate. If @node is set in
3354 * current's mems_allowed, yes. If it's not a __GFP_HARDWALL request and this
3355 * node is set in the nearest hardwalled cpuset ancestor to current's cpuset,
3356 * yes. If current has access to memory reserves as an oom victim, yes.
3359 * GFP_USER allocations are marked with the __GFP_HARDWALL bit,
3360 * and do not allow allocations outside the current tasks cpuset
3361 * unless the task has been OOM killed.
3362 * GFP_KERNEL allocations are not so marked, so can escape to the
3363 * nearest enclosing hardwalled ancestor cpuset.
3365 * Scanning up parent cpusets requires callback_lock. The
3366 * __alloc_pages() routine only calls here with __GFP_HARDWALL bit
3367 * _not_ set if it's a GFP_KERNEL allocation, and all nodes in the
3368 * current tasks mems_allowed came up empty on the first pass over
3369 * the zonelist. So only GFP_KERNEL allocations, if all nodes in the
3370 * cpuset are short of memory, might require taking the callback_lock.
3372 * The first call here from mm/page_alloc:get_page_from_freelist()
3373 * has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets,
3374 * so no allocation on a node outside the cpuset is allowed (unless
3375 * in interrupt, of course).
3377 * The second pass through get_page_from_freelist() doesn't even call
3378 * here for GFP_ATOMIC calls. For those calls, the __alloc_pages()
3379 * variable 'wait' is not set, and the bit ALLOC_CPUSET is not set
3380 * in alloc_flags. That logic and the checks below have the combined
3382 * in_interrupt - any node ok (current task context irrelevant)
3383 * GFP_ATOMIC - any node ok
3384 * tsk_is_oom_victim - any node ok
3385 * GFP_KERNEL - any node in enclosing hardwalled cpuset ok
3386 * GFP_USER - only nodes in current tasks mems allowed ok.
3388 bool __cpuset_node_allowed(int node
, gfp_t gfp_mask
)
3390 struct cpuset
*cs
; /* current cpuset ancestors */
3391 int allowed
; /* is allocation in zone z allowed? */
3392 unsigned long flags
;
3396 if (node_isset(node
, current
->mems_allowed
))
3399 * Allow tasks that have access to memory reserves because they have
3400 * been OOM killed to get memory anywhere.
3402 if (unlikely(tsk_is_oom_victim(current
)))
3404 if (gfp_mask
& __GFP_HARDWALL
) /* If hardwall request, stop here */
3407 if (current
->flags
& PF_EXITING
) /* Let dying task have memory */
3410 /* Not hardwall and node outside mems_allowed: scan up cpusets */
3411 spin_lock_irqsave(&callback_lock
, flags
);
3414 cs
= nearest_hardwall_ancestor(task_cs(current
));
3415 allowed
= node_isset(node
, cs
->mems_allowed
);
3418 spin_unlock_irqrestore(&callback_lock
, flags
);
3423 * cpuset_mem_spread_node() - On which node to begin search for a file page
3424 * cpuset_slab_spread_node() - On which node to begin search for a slab page
3426 * If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for
3427 * tasks in a cpuset with is_spread_page or is_spread_slab set),
3428 * and if the memory allocation used cpuset_mem_spread_node()
3429 * to determine on which node to start looking, as it will for
3430 * certain page cache or slab cache pages such as used for file
3431 * system buffers and inode caches, then instead of starting on the
3432 * local node to look for a free page, rather spread the starting
3433 * node around the tasks mems_allowed nodes.
3435 * We don't have to worry about the returned node being offline
3436 * because "it can't happen", and even if it did, it would be ok.
3438 * The routines calling guarantee_online_mems() are careful to
3439 * only set nodes in task->mems_allowed that are online. So it
3440 * should not be possible for the following code to return an
3441 * offline node. But if it did, that would be ok, as this routine
3442 * is not returning the node where the allocation must be, only
3443 * the node where the search should start. The zonelist passed to
3444 * __alloc_pages() will include all nodes. If the slab allocator
3445 * is passed an offline node, it will fall back to the local node.
3446 * See kmem_cache_alloc_node().
3449 static int cpuset_spread_node(int *rotor
)
3451 return *rotor
= next_node_in(*rotor
, current
->mems_allowed
);
3454 int cpuset_mem_spread_node(void)
3456 if (current
->cpuset_mem_spread_rotor
== NUMA_NO_NODE
)
3457 current
->cpuset_mem_spread_rotor
=
3458 node_random(¤t
->mems_allowed
);
3460 return cpuset_spread_node(¤t
->cpuset_mem_spread_rotor
);
3463 int cpuset_slab_spread_node(void)
3465 if (current
->cpuset_slab_spread_rotor
== NUMA_NO_NODE
)
3466 current
->cpuset_slab_spread_rotor
=
3467 node_random(¤t
->mems_allowed
);
3469 return cpuset_spread_node(¤t
->cpuset_slab_spread_rotor
);
3472 EXPORT_SYMBOL_GPL(cpuset_mem_spread_node
);
3475 * cpuset_mems_allowed_intersects - Does @tsk1's mems_allowed intersect @tsk2's?
3476 * @tsk1: pointer to task_struct of some task.
3477 * @tsk2: pointer to task_struct of some other task.
3479 * Description: Return true if @tsk1's mems_allowed intersects the
3480 * mems_allowed of @tsk2. Used by the OOM killer to determine if
3481 * one of the task's memory usage might impact the memory available
3485 int cpuset_mems_allowed_intersects(const struct task_struct
*tsk1
,
3486 const struct task_struct
*tsk2
)
3488 return nodes_intersects(tsk1
->mems_allowed
, tsk2
->mems_allowed
);
3492 * cpuset_print_current_mems_allowed - prints current's cpuset and mems_allowed
3494 * Description: Prints current's name, cpuset name, and cached copy of its
3495 * mems_allowed to the kernel log.
3497 void cpuset_print_current_mems_allowed(void)
3499 struct cgroup
*cgrp
;
3503 cgrp
= task_cs(current
)->css
.cgroup
;
3504 pr_cont(",cpuset=");
3505 pr_cont_cgroup_name(cgrp
);
3506 pr_cont(",mems_allowed=%*pbl",
3507 nodemask_pr_args(¤t
->mems_allowed
));
3513 * Collection of memory_pressure is suppressed unless
3514 * this flag is enabled by writing "1" to the special
3515 * cpuset file 'memory_pressure_enabled' in the root cpuset.
3518 int cpuset_memory_pressure_enabled __read_mostly
;
3521 * cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims.
3523 * Keep a running average of the rate of synchronous (direct)
3524 * page reclaim efforts initiated by tasks in each cpuset.
3526 * This represents the rate at which some task in the cpuset
3527 * ran low on memory on all nodes it was allowed to use, and
3528 * had to enter the kernels page reclaim code in an effort to
3529 * create more free memory by tossing clean pages or swapping
3530 * or writing dirty pages.
3532 * Display to user space in the per-cpuset read-only file
3533 * "memory_pressure". Value displayed is an integer
3534 * representing the recent rate of entry into the synchronous
3535 * (direct) page reclaim by any task attached to the cpuset.
3538 void __cpuset_memory_pressure_bump(void)
3541 fmeter_markevent(&task_cs(current
)->fmeter
);
#ifdef CONFIG_PROC_PID_CPUSET
/*
 * proc_cpuset_show()
 *  - Print the task's cpuset path into a seq_file, followed by a newline.
 *  - Used for /proc/<pid>/cpuset.
 *  - The path is resolved relative to current's cgroup namespace
 *    (current->nsproxy->cgroup_ns), i.e. that of the reader, not of @tsk.
 *  - @ns and @pid are not used by this function.
 *
 * Return: 0 on success, -ENOMEM if the path buffer cannot be allocated,
 * -ENAMETOOLONG if the path does not fit in PATH_MAX bytes, or the negative
 * value returned by cgroup_path_ns().
 *
 * NOTE(review): only part of this body survived in the listing it was
 * recovered from.  The local declarations, the allocation check, css_put(),
 * the seq_file output and the cleanup labels are reconstructed and must be
 * confirmed against the tree.  The previous comment stated that cpuset_mutex
 * is taken here; nothing in the visible code does so, so that claim has been
 * dropped.
 */
int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns,
		     struct pid *pid, struct task_struct *tsk)
{
	char *buf;
	struct cgroup_subsys_state *css;
	int retval;

	retval = -ENOMEM;
	buf = kmalloc(PATH_MAX, GFP_KERNEL);
	if (!buf)
		goto out;

	/* Hold the css only for as long as its cgroup path is being built. */
	css = task_get_css(tsk, cpuset_cgrp_id);
	retval = cgroup_path_ns(css->cgroup, buf, PATH_MAX,
				current->nsproxy->cgroup_ns);
	css_put(css);

	/* A result of PATH_MAX or more is reported as "name too long". */
	if (retval >= PATH_MAX)
		retval = -ENAMETOOLONG;
	if (retval < 0)
		goto out_free;

	seq_puts(m, buf);
	seq_putc(m, '\n');
	retval = 0;
out_free:
	kfree(buf);
out:
	return retval;
}
#endif /* CONFIG_PROC_PID_CPUSET */

3585 /* Display task mems_allowed in /proc/<pid>/status file. */
3586 void cpuset_task_status_allowed(struct seq_file
*m
, struct task_struct
*task
)
3588 seq_printf(m
, "Mems_allowed:\t%*pb\n",
3589 nodemask_pr_args(&task
->mems_allowed
));
3590 seq_printf(m
, "Mems_allowed_list:\t%*pbl\n",
3591 nodemask_pr_args(&task
->mems_allowed
));