commit 827b86ad1dd21feed4c0b99faf6059f245f7dadb
Author: Tejun Heo <tj@kernel.org>
Date:   Fri Mar 11 07:31:23 2016 -0500

sched: Misc preps for cgroup unified hierarchy interface

Make the following changes in preparation for the cpu controller
interface implementation for the unified hierarchy. This patch
doesn't cause any functional differences.

* s/cpu_stats_show()/cpu_cfs_stats_show()/

* s/cpu_files/cpu_legacy_files/

* Separate out cpuacct_stats_read() from cpuacct_stats_show(). While
  at it, make the @val array u64 for consistency.
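
The new cpuacct_stats_read() takes @val as a pointer to the whole
array rather than a decayed element pointer, so sizeof(*val) still
names the full array size. A minimal standalone sketch of the idiom
(editorial illustration; the names are not from the patch):

	#include <stdio.h>
	#include <string.h>

	#define NSTATS 2

	/* @val points at the whole array, so sizeof(*val) is
	 * NSTATS * sizeof(unsigned long long), not a pointer's size. */
	static void stats_read(unsigned long long (*val)[NSTATS])
	{
		memset(val, 0, sizeof(*val));	/* zeroes the entire array */
		(*val)[0] += 10;
		(*val)[1] += 20;
	}

	int main(void)
	{
		unsigned long long val[NSTATS];

		stats_read(&val);
		printf("%llu %llu\n", val[0], val[1]);	/* prints "10 20" */
		return 0;
	}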

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Li Zefan <lizefan@huawei.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 3b31fc05a0f1..a1b95e83fa87 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7174,7 +7174,7 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
 	return ret;
 }

-static int cpu_stats_show(struct seq_file *sf, void *v)
+static int cpu_cfs_stats_show(struct seq_file *sf, void *v)
 {
 	struct task_group *tg = css_tg(seq_css(sf));
 	struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
@@ -7214,7 +7214,7 @@ static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
 }
 #endif /* CONFIG_RT_GROUP_SCHED */

-static struct cftype cpu_files[] = {
+static struct cftype cpu_legacy_files[] = {
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	{
 		.name = "shares",
@@ -7235,7 +7235,7 @@ static struct cftype cpu_files[] = {
 	},
 	{
 		.name = "stat",
-		.seq_show = cpu_stats_show,
+		.seq_show = cpu_cfs_stats_show,
 	},
 #endif
 #ifdef CONFIG_RT_GROUP_SCHED
@@ -7261,7 +7261,7 @@ struct cgroup_subsys cpu_cgrp_subsys = {
 	.fork		= cpu_cgroup_fork,
 	.can_attach	= cpu_cgroup_can_attach,
 	.attach		= cpu_cgroup_attach,
-	.legacy_cftypes	= cpu_files,
+	.legacy_cftypes	= cpu_legacy_files,
 	.early_init	= true,
 };

diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index f95ab29a45d0..6151c23f722f 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -276,26 +276,33 @@ static int cpuacct_all_seq_show(struct seq_file *m, void *V)
 	return 0;
 }

-static int cpuacct_stats_show(struct seq_file *sf, void *v)
+static void cpuacct_stats_read(struct cpuacct *ca,
+			       u64 (*val)[CPUACCT_STAT_NSTATS])
 {
-	struct cpuacct *ca = css_ca(seq_css(sf));
-	s64 val[CPUACCT_STAT_NSTATS];
-	int stat;
 	int cpu;

-	memset(val, 0, sizeof(val));
+	memset(val, 0, sizeof(*val));
+
 	for_each_possible_cpu(cpu) {
 		u64 *cpustat = per_cpu_ptr(ca->cpustat, cpu)->cpustat;

-		val[CPUACCT_STAT_USER]   += cpustat[CPUTIME_USER];
-		val[CPUACCT_STAT_USER]   += cpustat[CPUTIME_NICE];
-		val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_SYSTEM];
-		val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_IRQ];
-		val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_SOFTIRQ];
+		(*val)[CPUACCT_STAT_USER]   += cpustat[CPUTIME_USER];
+		(*val)[CPUACCT_STAT_USER]   += cpustat[CPUTIME_NICE];
+		(*val)[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_SYSTEM];
+		(*val)[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_IRQ];
+		(*val)[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_SOFTIRQ];
 	}
+}
+
+static int cpuacct_stats_show(struct seq_file *sf, void *v)
+{
+	u64 val[CPUACCT_STAT_NSTATS];
+	int stat;
+
+	cpuacct_stats_read(css_ca(seq_css(sf)), &val);
+
 	for (stat = 0; stat < CPUACCT_STAT_NSTATS; stat++) {
-		seq_printf(sf, "%s %lld\n",
+		seq_printf(sf, "%s %llu\n",
 			   cpuacct_stat_desc[stat],
 			   (long long)nsec_to_clock_t(val[stat]));
 	}

commit fdb64d002b3a223ce4bb11aa4448a42050470052
Author: Tejun Heo <tj@kernel.org>
Date:   Fri Mar 11 07:31:23 2016 -0500

sched: Implement interface for cgroup unified hierarchy

While the cpu controller doesn't have any functional problems, there
are a couple of interface issues which can be addressed in the v2
interface:

* cpuacct being a separate controller. This separation is artificial
  and rather pointless as demonstrated by most use cases co-mounting
  the two controllers. It also forces certain information to be
  exposed twice.

* Use of different time units. Writable control knobs use
  microseconds, some stat fields use nanoseconds while other cpuacct
  stat fields use centiseconds.

* Control knobs which can't be used in the root cgroup still show up
  in the root cgroup.

* Control knob names and semantics aren't consistent with other
  controllers.

This patchset implements the cpu controller's interface on the unified
hierarchy, which adheres to the controller file conventions described
in Documentation/cgroups/unified-hierarchy.txt. Overall, the
following changes are made.

* cpuacct is implicitly enabled and disabled by cpu, and its information
  is reported through "cpu.stat", which now uses microseconds for all
  time durations. All time duration fields now have "_usec" appended
  to them for clarity. While this doesn't solve the double accounting
  immediately, once the majority of users switch to v2, cpu can directly
  account and report the relevant stats and cpuacct can be disabled on
  the unified hierarchy.

  Note that cpuacct.usage_percpu is currently not included in
  "cpu.stat". If this information is actually called for, it can be
  added later.

* "cpu.shares" is replaced with "cpu.weight" and operates on the
  standard scale defined by CGROUP_WEIGHT_MIN/DFL/MAX (1, 100, 10000).
  The weight is scaled to the scheduler weight so that 100 maps to 1024
  and the ratio relationship is preserved - if the weight is W and its
  scaled value is S, W / 100 == S / 1024. While the mapped range is a
  bit smaller than the original scheduler weight range, the dead zones
  on both sides are relatively small and cover a wider range than the
  nice value mappings. This file doesn't make sense in the root
  cgroup and isn't created on the root. (A usage sketch for these
  knobs follows this list.)

* "cpu.cfs_quota_us" and "cpu.cfs_period_us" are replaced by "cpu.max"
  which contains both quota and period.

* "cpu.rt_runtime_us" and "cpu.rt_period_us" are replaced by
  "cpu.rt.max" which contains both runtime and period.

v2: cpu_stats_show() was incorrectly using CONFIG_FAIR_GROUP_SCHED for
    CFS bandwidth stats and also using raw division for u64. Use
    CONFIG_CFS_BANDWIDTH and do_div() instead.

    The semantics of "cpu.rt.max" is not fully decided yet. Dropped
    for now.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Li Zefan <lizefan@huawei.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a1b95e83fa87..f01d56e58a1b 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7253,6 +7253,139 @@ static struct cftype cpu_legacy_files[] = {
 	{ }	/* terminate */
 };

+static int cpu_stats_show(struct seq_file *sf, void *v)
+{
+	cpuacct_cpu_stats_show(sf);
+
+#ifdef CONFIG_CFS_BANDWIDTH
+	{
+		struct task_group *tg = css_tg(seq_css(sf));
+		struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
+		u64 throttled_usec;
+
+		throttled_usec = cfs_b->throttled_time;
+		do_div(throttled_usec, NSEC_PER_USEC);
+
+		seq_printf(sf, "nr_periods %d\n"
+			   "nr_throttled %d\n"
+			   "throttled_usec %llu\n",
+			   cfs_b->nr_periods, cfs_b->nr_throttled,
+			   throttled_usec);
+	}
+#endif
+	return 0;
+}
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+static u64 cpu_weight_read_u64(struct cgroup_subsys_state *css,
+			       struct cftype *cft)
+{
+	struct task_group *tg = css_tg(css);
+	u64 weight = scale_load_down(tg->shares);
+
+	return DIV_ROUND_CLOSEST_ULL(weight * CGROUP_WEIGHT_DFL, 1024);
+}
+
+static int cpu_weight_write_u64(struct cgroup_subsys_state *css,
+				struct cftype *cftype, u64 weight)
+{
+	/*
+	 * cgroup weight knobs should use the common MIN, DFL and MAX
+	 * values which are 1, 100 and 10000 respectively. While it loses
+	 * a bit of range on both ends, it maps pretty well onto the shares
+	 * value used by scheduler and the round-trip conversions preserve
+	 * the original value over the entire range.
+	 */
+	if (weight < CGROUP_WEIGHT_MIN || weight > CGROUP_WEIGHT_MAX)
+		return -ERANGE;
+
+	weight = DIV_ROUND_CLOSEST_ULL(weight * 1024, CGROUP_WEIGHT_DFL);
+
+	return sched_group_set_shares(css_tg(css), scale_load(weight));
+}
+#endif
+
+static void __maybe_unused cpu_period_quota_print(struct seq_file *sf,
+						  long period, long quota)
+{
+	if (quota < 0)
+		seq_puts(sf, "max");
+	else
+		seq_printf(sf, "%ld", quota);
+
+	seq_printf(sf, " %ld\n", period);
+}
+
+/* caller should put the current value in *@periodp before calling */
+static int __maybe_unused cpu_period_quota_parse(char *buf,
+						 u64 *periodp, u64 *quotap)
+{
+	char tok[21];	/* U64_MAX */
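+	/* (21 = the 20 decimal digits of U64_MAX plus the terminating NUL) */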
+
+	if (!sscanf(buf, "%s %llu", tok, periodp))
+		return -EINVAL;
+
+	*periodp *= NSEC_PER_USEC;
+
+	if (sscanf(tok, "%llu", quotap))
+		*quotap *= NSEC_PER_USEC;
+	else if (!strcmp(tok, "max"))
+		*quotap = RUNTIME_INF;
+	else
+		return -EINVAL;
+
+	return 0;
+}
+
+#ifdef CONFIG_CFS_BANDWIDTH
+static int cpu_max_show(struct seq_file *sf, void *v)
+{
+	struct task_group *tg = css_tg(seq_css(sf));
+
+	cpu_period_quota_print(sf, tg_get_cfs_period(tg), tg_get_cfs_quota(tg));
+	return 0;
+}
+
+static ssize_t cpu_max_write(struct kernfs_open_file *of,
+			     char *buf, size_t nbytes, loff_t off)
+{
+	struct task_group *tg = css_tg(of_css(of));
+	u64 period = tg_get_cfs_period(tg);
+	u64 quota;
+	int ret;
+
+	ret = cpu_period_quota_parse(buf, &period, &quota);
+	if (!ret)
+		ret = tg_set_cfs_bandwidth(tg, period, quota);
+	return ret ?: nbytes;
+}
+#endif
+
+static struct cftype cpu_files[] = {
+	{
+		.name = "stat",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.seq_show = cpu_stats_show,
+	},
+#ifdef CONFIG_FAIR_GROUP_SCHED
+	{
+		.name = "weight",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.read_u64 = cpu_weight_read_u64,
+		.write_u64 = cpu_weight_write_u64,
+	},
+#endif
+#ifdef CONFIG_CFS_BANDWIDTH
+	{
+		.name = "max",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.seq_show = cpu_max_show,
+		.write = cpu_max_write,
+	},
+#endif
+	{ }	/* terminate */
+};
+
 struct cgroup_subsys cpu_cgrp_subsys = {
 	.css_alloc	= cpu_cgroup_css_alloc,
 	.css_online	= cpu_cgroup_css_online,
@@ -7262,7 +7395,15 @@ struct cgroup_subsys cpu_cgrp_subsys = {
 	.can_attach	= cpu_cgroup_can_attach,
 	.attach		= cpu_cgroup_attach,
 	.legacy_cftypes	= cpu_legacy_files,
+	.dfl_cftypes	= cpu_files,
 	.early_init	= true,
+#ifdef CONFIG_CGROUP_CPUACCT
+	/*
+	 * cpuacct is enabled together with cpu on the unified hierarchy
+	 * and its stats are reported through "cpu.stat".
+	 */
+	.depends_on	= 1 << cpuacct_cgrp_id,
+#endif
 };

 #endif /* CONFIG_CGROUP_SCHED */
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index 6151c23f722f..fc1cf13c3af1 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -347,6 +347,31 @@ static struct cftype files[] = {
 	{ }	/* terminate */
 };

+/* used to print cpuacct stats in cpu.stat on the unified hierarchy */
+void cpuacct_cpu_stats_show(struct seq_file *sf)
+{
+	struct cgroup_subsys_state *css;
+	u64 usage, val[CPUACCT_STAT_NSTATS];
+
+	css = cgroup_get_e_css(seq_css(sf)->cgroup, &cpuacct_cgrp_subsys);
+
+	usage = cpuusage_read(css, seq_cft(sf));
+	cpuacct_stats_read(css_ca(css), &val);
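+
+	/*
+	 * cpustat[] counts in scheduler ticks here (the "centiseconds"
+	 * noted in the changelog); TICK_NSEC converts the sums to
+	 * nanoseconds and do_div() by NSEC_PER_USEC yields the
+	 * microsecond values reported in "cpu.stat".
+	 */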
+	val[CPUACCT_STAT_USER] *= TICK_NSEC;
+	val[CPUACCT_STAT_SYSTEM] *= TICK_NSEC;
+	do_div(usage, NSEC_PER_USEC);
+	do_div(val[CPUACCT_STAT_USER], NSEC_PER_USEC);
+	do_div(val[CPUACCT_STAT_SYSTEM], NSEC_PER_USEC);
+
+	seq_printf(sf, "usage_usec %llu\n"
+		   "user_usec %llu\n"
+		   "system_usec %llu\n",
+		   usage, val[CPUACCT_STAT_USER], val[CPUACCT_STAT_SYSTEM]);
+
+	css_put(css);
+}
+
 /*
  * charge this task's execution time to its accounting group.

diff --git a/kernel/sched/cpuacct.h b/kernel/sched/cpuacct.h
index ba72807c73d4..ddf7af466d35 100644
--- a/kernel/sched/cpuacct.h
+++ b/kernel/sched/cpuacct.h
@@ -1,6 +1,7 @@
 #ifdef CONFIG_CGROUP_CPUACCT

 extern void cpuacct_charge(struct task_struct *tsk, u64 cputime);
 extern void cpuacct_account_field(struct task_struct *tsk, int index, u64 val);
+extern void cpuacct_cpu_stats_show(struct seq_file *sf);

 #else

@@ -14,4 +15,8 @@ cpuacct_account_field(struct task_struct *tsk, int index, u64 val)
 {
 }

+static inline void cpuacct_cpu_stats_show(struct seq_file *sf)
+{
+}
+
 #endif

commit 8dde150866b8c433216105c50b7e889d5242d583
Author: Tejun Heo <tj@kernel.org>
Date:   Fri Aug 5 12:41:01 2016 -0400

cgroup: add documentation regarding CPU controller cgroup v2 support

Signed-off-by: Tejun Heo <tj@kernel.org>

diff --git a/Documentation/cgroup-v2-cpu.txt b/Documentation/cgroup-v2-cpu.txt
new file mode 100644
index 000000000000..1ed7032d4472
--- /dev/null
+++ b/Documentation/cgroup-v2-cpu.txt
+CPU Controller on Control Group v2
+
+August, 2016		Tejun Heo <tj@kernel.org>
+
+
+While most controllers have support for cgroup v2 now, the CPU
+controller support is not upstream yet due to objections from the
+scheduler maintainers on the basic designs of cgroup v2. This
+document explains the current situation as well as an interim
+solution, and details the disagreements and arguments. The latest
+version of this document can be found at the following URL.
+
+ https://git.kernel.org/cgit/linux/kernel/git/tj/cgroup.git/tree/Documentation/cgroup-v2-cpu.txt?h=cgroup-v2-cpu
+
+This document was posted to the linux-kernel and cgroup mailing lists.
+Unfortunately, no consensus was reached as of Oct, 2016. The thread
+can be found at the following URL.
+
+ http://lkml.kernel.org/r/20160805170752.GK2542@mtj.duckdns.org
+
+
+CONTENTS
+
+1. Current Situation and Interim Solution
+2. Disagreements and Arguments
+  2-1. Contentious Restrictions
+    2-1-1. Process Granularity
+    2-1-2. No Internal Process Constraint
+  2-2. Impact on CPU Controller
+    2-2-1. Impact of Process Granularity
+    2-2-2. Impact of No Internal Process Constraint
+  2-3. Arguments for cgroup v2
+3. Way Forward
+4. References
+
+
+1. Current Situation and Interim Solution
+
+All objections from the scheduler maintainers apply to the cgroup v2
+core design, and there are no known objections to the specifics of the
+CPU controller cgroup v2 interface. The only blocked part is the
+changes to expose the CPU controller interface on cgroup v2, which
+comprise the following two patches:
+
+ [1] sched: Misc preps for cgroup unified hierarchy interface
+ [2] sched: Implement interface for cgroup unified hierarchy
+
+The necessary changes are superficial and implement the interface
+files on cgroup v2. The combined diffstat is as follows.
+
+ kernel/sched/core.c    | 149 +++++++++++++++++++++++++++++++++++++++++++++--
+ kernel/sched/cpuacct.c |  57 ++++++++++++------
+ kernel/sched/cpuacct.h |   5 +
+ 3 files changed, 189 insertions(+), 22 deletions(-)
+
+The patches are easy to apply and forward-port. The following git
+branch will always carry the two patches on top of the latest release
+of the upstream kernel.
+
+ git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup.git/cgroup-v2-cpu
+
+There are also versioned branches going back to v4.4.
+
+ git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup.git/cgroup-v2-cpu-$KERNEL_VER
+
+While it's difficult to tell whether the CPU controller support will
+be merged, there are crucial resource control features in cgroup v2
+that are only possible due to the design choices that are being
+objected to, and every effort will be made to ease enabling the CPU
+controller cgroup v2 support out-of-tree for parties which choose to.
+
+
+2. Disagreements and Arguments
+
+There have been several lengthy discussion threads [3][4] on LKML
+around the structural constraints of cgroup v2. The two that affect
+the CPU controller are process granularity and the no-internal-process
+constraint. Both arise primarily from the need for a common resource
+domain definition across different resources.
+
+The common resource domain is a powerful concept in cgroup v2 that
+allows controllers to make basic assumptions about the structural
+organization of processes and controllers inside the cgroup hierarchy,
+and thus solve problems spanning multiple types of resources. The
+prime example for this is page cache writeback: dirty page cache is
+regulated through throttling buffered writers based on memory
+availability, and initiating batched write-outs to the disk based on
+IO capacity. Tracking and controlling writeback inside a cgroup thus
+requires the direct cooperation of the memory and the IO controller.
+
+This easily extends to other areas, such as CPU cycles consumed while
+performing memory reclaim or IO encryption.
+
+
+2-1. Contentious Restrictions
+
+For controllers of different resources to work together, they must
+agree on a common organization. This uniform model across controllers
+imposes two contentious restrictions on the CPU controller: process
+granularity and the no-internal-process constraint.
+
+
+  2-1-1. Process Granularity
+
+  For memory, because an address space is shared between all threads
+  of a process, the terminal consumer is a process, not a thread.
+  Separating the threads of a single process into different memory
+  control domains doesn't make semantic sense. cgroup v2 ensures
+  that all controllers can agree on the same organization by requiring
+  that threads of the same process belong to the same cgroup.
+
+  There are other reasons to enforce process granularity. One
+  important one is isolating system-level management operations from
+  in-process application operations. The cgroup interface, being a
+  virtual filesystem, is very unfit for multiple independent
+  operations taking place at the same time, as most operations have to
+  be multi-step and there is no way to synchronize multiple accessors.
+  See also [5] Documentation/cgroup-v2.txt, "R-2. Thread Granularity".
+
+
+  2-1-2. No Internal Process Constraint
+
+  cgroup v2 does not allow processes to belong to any cgroup which has
+  child cgroups when resource controllers are enabled on it (the
+  notable exception being the root cgroup itself). This is because,
+  for some resources, a resource domain (cgroup) is not directly
+  comparable to the terminal consumer (process/task) of said resource,
+  and so putting the two into a sibling relationship isn't meaningful.
+
+  - Differing Control Parameters and Capabilities
+
+    A cgroup controller has different resource control parameters and
+    capabilities from a terminal consumer, be that a task or process.
+    There are a couple of cases where a cgroup control knob can be
+    mapped to a per-task or per-process API, but they are exceptions
+    and the mappings aren't obvious even in those cases.
+
+    For example, task priorities (also known as nice values) set
+    through setpriority(2) are mapped to the CPU controller
+    "cpu.shares" values. However, how exactly the two ranges map and
+    even the fact that they map to each other at all are not obvious.
+    (The sketch below illustrates the actual mapping.)
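+
+    As a concrete illustration (an editorial sketch, not part of the
+    original document): the scheduler maps each nice level to a weight
+    via its sched_prio_to_weight[] table, and the proposed "cpu.weight"
+    rescales the same weights onto the common 1..10000 cgroup scale.
+    Approximately:
+
+	/* excerpted values from the scheduler's nice-to-weight table */
+	static const int nice_to_weight[] = {
+		/* -20 */ 88761,
+		/* -10 */  9548,
+		/*   0 */  1024,	/* the default, i.e. cpu.weight 100 */
+		/*  10 */   110,
+		/*  19 */    15,
+	};
+
+	/* cpu.weight equivalent of a scheduler weight, per W/100 == S/1024 */
+	static int scheduler_weight_to_cgroup_weight(int weight)
+	{
+		return (weight * 100 + 512) / 1024;	/* rounded */
+	}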
+
+    The situation gets further muddled when considering other resource
+    types and control knobs. IO priorities set through ioprio_set(2)
+    cannot be mapped to IO controller weights, and most cgroup resource
+    control knobs, including the bandwidth control knobs of the CPU
+    controller, don't have counterparts in the terminal consumers.
+
+  - Anonymous Resource Consumption
+
+    For CPU, every time slice consumed from inside a cgroup, which
+    comprises most but not all of the CPU time consumed by the cgroup,
+    can be clearly attributed to a specific task or process. Because
+    these two types of entities are directly comparable as consumers
+    of CPU time, it's theoretically possible to mix tasks and cgroups
+    on the same tree levels and let them directly compete for the time
+    quota available to their common ancestor.
+
+    However, the same can't be said for resource types like memory or
+    IO: the memory consumed by the page cache, for example, can be
+    tracked on a per-cgroup level, but due to mismatches in lifetimes
+    of involved objects (page cache can persist long after processes
+    are gone), shared usages and the implementation overhead of
+    tracking persistent state, it can no longer be attributed to
+    individual processes after instantiation. Consequently, any IO
+    incurred by page cache writeback can be attributed to a cgroup,
+    but not to the individual consumers inside the cgroup.
+
+    For memory and IO, this makes a resource domain (cgroup) an object
+    of a fundamentally different type than a terminal consumer
+    (process). A process can't be a first class object in the resource
+    distribution graph, as its total resource consumption can't be
+    described without the containing resource domain.
+
+  Disallowing processes in internal cgroups avoids competition between
+  cgroups and processes which cannot be meaningfully defined for these
+  resources. All resource control takes place among cgroups, and a
+  terminal consumer interacts with the containing cgroup the same way
+  it would interact with the system in the absence of cgroups.
+
+  The root cgroup is exempt from this constraint, which is in line with
+  how the root cgroup is handled in general - it's excluded from cgroup
+  resource accounting and control.
+
+Enforcing process granularity and the no-internal-process constraint
+allows all controllers to be on the same footing in terms of the
+resource distribution hierarchy.
+
+
+2-2. Impact on CPU Controller
+
+As indicated earlier, the CPU controller's resource distribution graph
+is the simplest. Every schedulable resource consumption can be
+attributed to a specific task. In addition, for weight based control,
+the per-task priority set through setpriority(2) can be translated to
+and from a per-cgroup weight. As such, the CPU controller can treat a
+task and a cgroup symmetrically, allowing support for any tree layout
+of cgroups and tasks. Both process granularity and the no internal
+process constraint restrict how the CPU controller can be used.
+
+
+  2-2-1. Impact of Process Granularity
+
+  Process granularity prevents tasks belonging to the same process
+  from being assigned to different cgroups. It was pointed out [6]
+  that this excludes the valid use case of hierarchical CPU
+  distribution within a process.
+
+  To address this issue, the rgroup (resource group) [7][8][9]
+  interface, an extension of the existing setpriority(2) API, was
+  proposed, which is in line with other programmable priority
+  mechanisms and eliminates the risk of in-application configuration
+  and system configuration stepping on each other's toes.
+  Unfortunately, the proposal quickly turned into discussions around
+  cgroup v2 design decisions [4] and no consensus could be reached.
+
+
+  2-2-2. Impact of No Internal Process Constraint
+
+  The no internal process constraint disallows tasks from competing
+  directly against cgroups. Here is an excerpt from Peter Zijlstra
+  pointing out the issue [10] - R, L and A are cgroups; t1, t2, t3 and
+  t4 are tasks:
+
+	    R
+	  / | \
+	 t1 t2 A
+	      / \
+	     t3  t4
+
+  Is fundamentally different from:
+
+	     R
+	    / \
+	   L   A
+	  / \ / \
+	 t1 t2 t3 t4
+
+  Because if in the first hierarchy you add a task (t5) to R, all of
+  its A will run at 1/4th of total bandwidth where before it had
+  1/3rd, whereas with the second example, if you add our t5 to L, A
+  doesn't get any less bandwidth.
+
+  It is true that the trees are semantically different from each other
+  and the symmetric handling of tasks and cgroups is aesthetically
+  pleasing. However, it isn't clear what the practical usefulness of
+  a layout with direct competition between tasks and cgroups would be,
+  considering that the number and behavior of tasks are controlled by
+  each application while cgroups primarily deal with system level
+  resource distribution; changes in the number of active threads would
+  directly impact resource distribution. Real world use cases of such
+  layouts could not be established during the discussions.
+
+
+2-3. Arguments for cgroup v2
+
+There are strong demands for comprehensive hierarchical resource
+control across all major resources, and establishing a common resource
+hierarchy is an essential step. As with most engineering decisions,
+the common resource hierarchy definition comes with trade-offs. With
+cgroup v2, the trade-offs are in the form of structural constraints
+which, among other things, restrict the CPU controller's space of
+possible configurations.
+
+However, even with the restrictions, cgroup v2, in combination with
+rgroup, covers most of the identified real world use cases while
+enabling new important use cases of resource control across multiple
+resource types that were fundamentally broken previously.
+
+Furthermore, for resource control, treating resource domains as
+objects of a different type from terminal consumers has important
+advantages - it can account for resource consumption which is not
+tied to any specific terminal consumer, be that a task or process,
+and allows decoupling resource distribution controls from
+in-application APIs. Even the CPU controller may benefit from it, as
+the kernel can consume a significant amount of CPU cycles in interrupt
+context or in tasks shared across multiple resource domains (e.g.
+softirq).
+
+Finally, it's important to note that enabling cgroup v2 support for
+the CPU controller doesn't block use cases which require the features
+which are not available on cgroup v2. Unlikely as it is, should
+anybody actually rely on the CPU controller's symmetric handling of
+tasks and cgroups, backward compatibility is and will be maintained by
+being able to disconnect the controller from the cgroup v2 hierarchy
+and use it standalone. This also holds for cpuset, which is often
+used in highly customized configurations which might be a poor fit for
+common resource domains.
+
+The required changes are minimal, the benefits for the target use
+cases are critical and obvious, and use cases which have to use v1 can
+continue to do so.
+
+
+3. Way Forward
+
+cgroup v2 primarily aims to solve the problem of comprehensive
+hierarchical resource control across all major computing resources,
+which is one of the core problems of modern server infrastructure
+engineering. The trade-offs that cgroup v2 took are the results of
+pursuing that goal and gaining a better understanding of the nature of
+resource control in the process.
+
+I believe that real world usages will prove cgroup v2's model right,
+considering the crucial pieces of comprehensive resource control that
+cannot be implemented without common resource domains. This is not to
+say that cgroup v2 is fixed in stone and can't be updated; if there is
+an approach which better serves both comprehensive resource control
+and the CPU controller's flexibility, we will surely move towards
+that. It goes without saying that discussions around such an approach
+should consider practical aspects of resource control as a whole
+rather than focusing solely on a particular controller.
+
+Until such consensus can be reached, the CPU controller cgroup v2
+support will be maintained out of the mainline kernel in an easily
+accessible form. If there is anything cgroup developers can do to
+ease the pain, please feel free to contact us on the cgroup mailing
+list at cgroups@vger.kernel.org.
+
+
+4. References
+
+[1] http://lkml.kernel.org/r/20160105164834.GE5995@mtj.duckdns.org
+    [PATCH 1/2] sched: Misc preps for cgroup unified hierarchy interface
+    Tejun Heo <tj@kernel.org>
+
+[2] http://lkml.kernel.org/r/20160105164852.GF5995@mtj.duckdns.org
+    [PATCH 2/2] sched: Implement interface for cgroup unified hierarchy
+    Tejun Heo <tj@kernel.org>
+
+[3] http://lkml.kernel.org/r/1438641689-14655-4-git-send-email-tj@kernel.org
+    [PATCH 3/3] sched: Implement interface for cgroup unified hierarchy
+    Tejun Heo <tj@kernel.org>
+
+[4] http://lkml.kernel.org/r/20160407064549.GH3430@twins.programming.kicks-ass.net
+    Re: [PATCHSET RFC cgroup/for-4.6] cgroup, sched: implement resource group and PRIO_RGRP
+    Peter Zijlstra <peterz@infradead.org>
+
+[5] https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/tree/Documentation/cgroup-v2.txt
+    Tejun Heo <tj@kernel.org>
+
+[6] http://lkml.kernel.org/r/CAPM31RJNy3jgG=DYe6GO=wyL4BPPxwUm1f2S6YXacQmo7viFZA@mail.gmail.com
+    Re: [PATCH 3/3] sched: Implement interface for cgroup unified hierarchy
+    Paul Turner <pjt@google.com>
+
+[7] http://lkml.kernel.org/r/20160105154503.GC5995@mtj.duckdns.org
+    [RFD] cgroup: thread granularity support for cpu controller
+    Tejun Heo <tj@kernel.org>
+
+[8] http://lkml.kernel.org/r/1457710888-31182-1-git-send-email-tj@kernel.org
+    [PATCHSET RFC cgroup/for-4.6] cgroup, sched: implement resource group and PRIO_RGRP
+    Tejun Heo <tj@kernel.org>
+
+[9] http://lkml.kernel.org/r/20160311160522.GA24046@htj.duckdns.org
+    Example program for PRIO_RGRP
+    Tejun Heo <tj@kernel.org>
+
+[10] http://lkml.kernel.org/r/20160407082810.GN3430@twins.programming.kicks-ass.net
+     Re: [PATCHSET RFC cgroup/for-4.6] cgroup, sched: implement resource group and PRIO_RGRP
+     Peter Zijlstra <peterz@infradead.org>