cryptodev: 1.11 -> 1.12
[NixPkgs.git] / pkgs / os-specific / linux / kernel / cpu-cgroup-v2-patches / 4.4.patch
blob8f2418c9efce92e09102268fd33faa673ff878b4
1 commit e7cae741f6d645ac68fe8823ca6ef45dbbf6891b
2 Author: Tejun Heo <tj@kernel.org>
3 Date: Fri Mar 11 07:31:23 2016 -0500
5 sched: Misc preps for cgroup unified hierarchy interface
7 Make the following changes in preparation for the cpu controller
8 interface implementation for the unified hierarchy. This patch
9 doesn't cause any functional differences.
11 * s/cpu_stats_show()/cpu_cfs_stats_show()/
13 * s/cpu_files/cpu_legacy_files/
15 * Separate out cpuacct_stats_read() from cpuacct_stats_show(). While
16 at it, remove pointless cpuacct_stat_desc[] array.
18 Signed-off-by: Tejun Heo <tj@kernel.org>
19 Cc: Ingo Molnar <mingo@redhat.com>
20 Cc: Peter Zijlstra <peterz@infradead.org>
21 Cc: Li Zefan <lizefan@huawei.com>
22 Cc: Johannes Weiner <hannes@cmpxchg.org>
24 diff --git a/kernel/sched/core.c b/kernel/sched/core.c
25 index 732e993..77f3ddd 100644
26 --- a/kernel/sched/core.c
27 +++ b/kernel/sched/core.c
28 @@ -8512,7 +8512,7 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
29 return ret;
32 -static int cpu_stats_show(struct seq_file *sf, void *v)
33 +static int cpu_cfs_stats_show(struct seq_file *sf, void *v)
35 struct task_group *tg = css_tg(seq_css(sf));
36 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
37 @@ -8552,7 +8552,7 @@ static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
39 #endif /* CONFIG_RT_GROUP_SCHED */
41 -static struct cftype cpu_files[] = {
42 +static struct cftype cpu_legacy_files[] = {
43 #ifdef CONFIG_FAIR_GROUP_SCHED
45 .name = "shares",
46 @@ -8573,7 +8573,7 @@ static struct cftype cpu_files[] = {
49 .name = "stat",
50 - .seq_show = cpu_stats_show,
51 + .seq_show = cpu_cfs_stats_show,
53 #endif
54 #ifdef CONFIG_RT_GROUP_SCHED
55 @@ -8599,7 +8599,7 @@ struct cgroup_subsys cpu_cgrp_subsys = {
56 .fork = cpu_cgroup_fork,
57 .can_attach = cpu_cgroup_can_attach,
58 .attach = cpu_cgroup_attach,
59 - .legacy_cftypes = cpu_files,
60 + .legacy_cftypes = cpu_legacy_files,
61 .early_init = 1,
64 diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
65 index dd7cbb5..42b2dd5 100644
66 --- a/kernel/sched/cpuacct.c
67 +++ b/kernel/sched/cpuacct.c
68 @@ -177,36 +177,33 @@ static int cpuacct_percpu_seq_show(struct seq_file *m, void *V)
69 return 0;
72 -static const char * const cpuacct_stat_desc[] = {
73 - [CPUACCT_STAT_USER] = "user",
74 - [CPUACCT_STAT_SYSTEM] = "system",
75 -};
77 -static int cpuacct_stats_show(struct seq_file *sf, void *v)
78 +static void cpuacct_stats_read(struct cpuacct *ca, u64 *userp, u64 *sysp)
80 - struct cpuacct *ca = css_ca(seq_css(sf));
81 int cpu;
82 - s64 val = 0;
84 + *userp = 0;
85 for_each_online_cpu(cpu) {
86 struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
87 - val += kcpustat->cpustat[CPUTIME_USER];
88 - val += kcpustat->cpustat[CPUTIME_NICE];
89 + *userp += kcpustat->cpustat[CPUTIME_USER];
90 + *userp += kcpustat->cpustat[CPUTIME_NICE];
92 - val = cputime64_to_clock_t(val);
93 - seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[CPUACCT_STAT_USER], val);
95 - val = 0;
96 + *sysp = 0;
97 for_each_online_cpu(cpu) {
98 struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
99 - val += kcpustat->cpustat[CPUTIME_SYSTEM];
100 - val += kcpustat->cpustat[CPUTIME_IRQ];
101 - val += kcpustat->cpustat[CPUTIME_SOFTIRQ];
102 + *sysp += kcpustat->cpustat[CPUTIME_SYSTEM];
103 + *sysp += kcpustat->cpustat[CPUTIME_IRQ];
104 + *sysp += kcpustat->cpustat[CPUTIME_SOFTIRQ];
108 - val = cputime64_to_clock_t(val);
109 - seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val);
110 +static int cpuacct_stats_show(struct seq_file *sf, void *v)
112 + cputime64_t user, sys;
114 + cpuacct_stats_read(css_ca(seq_css(sf)), &user, &sys);
115 + seq_printf(sf, "user %lld\n", cputime64_to_clock_t(user));
116 + seq_printf(sf, "system %lld\n", cputime64_to_clock_t(sys));
117 return 0;
121 commit 1bb33e8a69f089f2d3f58a0e681d4ff352e11c97
122 Author: Tejun Heo <tj@kernel.org>
123 Date: Fri Mar 11 07:31:23 2016 -0500
125 sched: Implement interface for cgroup unified hierarchy
127 While the cpu controller doesn't have any functional problems, there
128 are a couple interface issues which can be addressed in the v2
129 interface.
131 * cpuacct being a separate controller. This separation is artificial
132 and rather pointless as demonstrated by most use cases co-mounting
133 the two controllers. It also forces certain information to be
134 accounted twice.
136 * Use of different time units. Writable control knobs use
137 microseconds, some stat fields use nanoseconds while other cpuacct
138 stat fields use centiseconds.
140 * Control knobs which can't be used in the root cgroup still show up
141 in the root.
143 * Control knob names and semantics aren't consistent with other
144 controllers.
146 This patchset implements cpu controller's interface on the unified
147 hierarchy which adheres to the controller file conventions described
148 in Documentation/cgroups/unified-hierarchy.txt. Overall, the
149 following changes are made.
151 * cpuacct is implicitly enabled and disabled by cpu and its information
152 is reported through "cpu.stat" which now uses microseconds for all
153 time durations. All time duration fields now have "_usec" appended
154 to them for clarity. While this doesn't solve the double accounting
155 immediately, once majority of users switch to v2, cpu can directly
156 account and report the relevant stats and cpuacct can be disabled on
157 the unified hierarchy.
159 Note that cpuacct.usage_percpu is currently not included in
160 "cpu.stat". If this information is actually called for, it can be
161 added later.
163 * "cpu.shares" is replaced with "cpu.weight" and operates on the
164 standard scale defined by CGROUP_WEIGHT_MIN/DFL/MAX (1, 100, 10000).
165 The weight is scaled to scheduler weight so that 100 maps to 1024
166 and the ratio relationship is preserved - if weight is W and its
167 scaled value is S, W / 100 == S / 1024. While the mapped range is a
168 bit smaller than the original scheduler weight range, the dead zones
169 on both sides are relatively small and cover a wider range than the
170 nice value mappings. This file doesn't make sense in the root
171 cgroup and isn't created on root.
173 * "cpu.cfs_quota_us" and "cpu.cfs_period_us" are replaced by "cpu.max"
174 which contains both quota and period.
176 * "cpu.rt_runtime_us" and "cpu.rt_period_us" are replaced by
177 "cpu.rt.max" which contains both runtime and period.
179 v2: cpu_stats_show() was incorrectly using CONFIG_FAIR_GROUP_SCHED for
180 CFS bandwidth stats and also using raw division for u64. Use
181 CONFIG_CFS_BANDWIDTH and do_div() instead.
183 The semantics of "cpu.rt.max" is not fully decided yet. Dropped
184 for now.
186 Signed-off-by: Tejun Heo <tj@kernel.org>
187 Cc: Ingo Molnar <mingo@redhat.com>
188 Cc: Peter Zijlstra <peterz@infradead.org>
189 Cc: Li Zefan <lizefan@huawei.com>
190 Cc: Johannes Weiner <hannes@cmpxchg.org>
192 diff --git a/kernel/sched/core.c b/kernel/sched/core.c
193 index 77f3ddd..7aafe63 100644
194 --- a/kernel/sched/core.c
195 +++ b/kernel/sched/core.c
196 @@ -8591,6 +8591,139 @@ static struct cftype cpu_legacy_files[] = {
197 { } /* terminate */
200 +static int cpu_stats_show(struct seq_file *sf, void *v)
202 + cpuacct_cpu_stats_show(sf);
204 +#ifdef CONFIG_CFS_BANDWIDTH
206 + struct task_group *tg = css_tg(seq_css(sf));
207 + struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
208 + u64 throttled_usec;
210 + throttled_usec = cfs_b->throttled_time;
211 + do_div(throttled_usec, NSEC_PER_USEC);
213 + seq_printf(sf, "nr_periods %d\n"
214 + "nr_throttled %d\n"
215 + "throttled_usec %llu\n",
216 + cfs_b->nr_periods, cfs_b->nr_throttled,
217 + throttled_usec);
219 +#endif
220 + return 0;
223 +#ifdef CONFIG_FAIR_GROUP_SCHED
224 +static u64 cpu_weight_read_u64(struct cgroup_subsys_state *css,
225 + struct cftype *cft)
227 + struct task_group *tg = css_tg(css);
228 + u64 weight = scale_load_down(tg->shares);
230 + return DIV_ROUND_CLOSEST_ULL(weight * CGROUP_WEIGHT_DFL, 1024);
233 +static int cpu_weight_write_u64(struct cgroup_subsys_state *css,
234 + struct cftype *cftype, u64 weight)
236 + /*
237 + * cgroup weight knobs should use the common MIN, DFL and MAX
238 + * values which are 1, 100 and 10000 respectively. While it loses
239 + * a bit of range on both ends, it maps pretty well onto the shares
240 + * value used by scheduler and the round-trip conversions preserve
241 + * the original value over the entire range.
242 + */
243 + if (weight < CGROUP_WEIGHT_MIN || weight > CGROUP_WEIGHT_MAX)
244 + return -ERANGE;
246 + weight = DIV_ROUND_CLOSEST_ULL(weight * 1024, CGROUP_WEIGHT_DFL);
248 + return sched_group_set_shares(css_tg(css), scale_load(weight));
250 +#endif
252 +static void __maybe_unused cpu_period_quota_print(struct seq_file *sf,
253 + long period, long quota)
255 + if (quota < 0)
256 + seq_puts(sf, "max");
257 + else
258 + seq_printf(sf, "%ld", quota);
260 + seq_printf(sf, " %ld\n", period);
263 +/* caller should put the current value in *@periodp before calling */
264 +static int __maybe_unused cpu_period_quota_parse(char *buf,
265 + u64 *periodp, u64 *quotap)
267 + char tok[21]; /* U64_MAX */
269 + if (!sscanf(buf, "%s %llu", tok, periodp))
270 + return -EINVAL;
272 + *periodp *= NSEC_PER_USEC;
274 + if (sscanf(tok, "%llu", quotap))
275 + *quotap *= NSEC_PER_USEC;
276 + else if (!strcmp(tok, "max"))
277 + *quotap = RUNTIME_INF;
278 + else
279 + return -EINVAL;
281 + return 0;
284 +#ifdef CONFIG_CFS_BANDWIDTH
285 +static int cpu_max_show(struct seq_file *sf, void *v)
287 + struct task_group *tg = css_tg(seq_css(sf));
289 + cpu_period_quota_print(sf, tg_get_cfs_period(tg), tg_get_cfs_quota(tg));
290 + return 0;
293 +static ssize_t cpu_max_write(struct kernfs_open_file *of,
294 + char *buf, size_t nbytes, loff_t off)
296 + struct task_group *tg = css_tg(of_css(of));
297 + u64 period = tg_get_cfs_period(tg);
298 + u64 quota;
299 + int ret;
301 + ret = cpu_period_quota_parse(buf, &period, &quota);
302 + if (!ret)
303 + ret = tg_set_cfs_bandwidth(tg, period, quota);
304 + return ret ?: nbytes;
306 +#endif
308 +static struct cftype cpu_files[] = {
310 + .name = "stat",
311 + .flags = CFTYPE_NOT_ON_ROOT,
312 + .seq_show = cpu_stats_show,
313 + },
314 +#ifdef CONFIG_FAIR_GROUP_SCHED
316 + .name = "weight",
317 + .flags = CFTYPE_NOT_ON_ROOT,
318 + .read_u64 = cpu_weight_read_u64,
319 + .write_u64 = cpu_weight_write_u64,
320 + },
321 +#endif
322 +#ifdef CONFIG_CFS_BANDWIDTH
324 + .name = "max",
325 + .flags = CFTYPE_NOT_ON_ROOT,
326 + .seq_show = cpu_max_show,
327 + .write = cpu_max_write,
328 + },
329 +#endif
330 + { } /* terminate */
333 struct cgroup_subsys cpu_cgrp_subsys = {
334 .css_alloc = cpu_cgroup_css_alloc,
335 .css_free = cpu_cgroup_css_free,
336 @@ -8600,7 +8733,15 @@ struct cgroup_subsys cpu_cgrp_subsys = {
337 .can_attach = cpu_cgroup_can_attach,
338 .attach = cpu_cgroup_attach,
339 .legacy_cftypes = cpu_legacy_files,
340 + .dfl_cftypes = cpu_files,
341 .early_init = 1,
342 +#ifdef CONFIG_CGROUP_CPUACCT
343 + /*
344 + * cpuacct is enabled together with cpu on the unified hierarchy
345 + * and its stats are reported through "cpu.stat".
346 + */
347 + .depends_on = 1 << cpuacct_cgrp_id,
348 +#endif
351 #endif /* CONFIG_CGROUP_SCHED */
352 diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
353 index 42b2dd5..b4d32a6 100644
354 --- a/kernel/sched/cpuacct.c
355 +++ b/kernel/sched/cpuacct.c
356 @@ -224,6 +224,30 @@ static struct cftype files[] = {
357 { } /* terminate */
360 +/* used to print cpuacct stats in cpu.stat on the unified hierarchy */
361 +void cpuacct_cpu_stats_show(struct seq_file *sf)
363 + struct cgroup_subsys_state *css;
364 + u64 usage, user, sys;
366 + css = cgroup_get_e_css(seq_css(sf)->cgroup, &cpuacct_cgrp_subsys);
368 + usage = cpuusage_read(css, seq_cft(sf));
369 + cpuacct_stats_read(css_ca(css), &user, &sys);
371 + user *= TICK_NSEC;
372 + sys *= TICK_NSEC;
373 + do_div(usage, NSEC_PER_USEC);
374 + do_div(user, NSEC_PER_USEC);
375 + do_div(sys, NSEC_PER_USEC);
377 + seq_printf(sf, "usage_usec %llu\n"
378 + "user_usec %llu\n"
379 + "system_usec %llu\n", usage, user, sys);
381 + css_put(css);
385 * charge this task's execution time to its accounting group.
387 diff --git a/kernel/sched/cpuacct.h b/kernel/sched/cpuacct.h
388 index ed60562..44eace9 100644
389 --- a/kernel/sched/cpuacct.h
390 +++ b/kernel/sched/cpuacct.h
391 @@ -2,6 +2,7 @@
393 extern void cpuacct_charge(struct task_struct *tsk, u64 cputime);
394 extern void cpuacct_account_field(struct task_struct *p, int index, u64 val);
395 +extern void cpuacct_cpu_stats_show(struct seq_file *sf);
397 #else
399 @@ -14,4 +15,8 @@ cpuacct_account_field(struct task_struct *p, int index, u64 val)
403 +static inline void cpuacct_cpu_stats_show(struct seq_file *sf)
407 #endif