1 #include "cgroup-internal.h"
3 #include <linux/sched/cputime.h>
5 static DEFINE_MUTEX(cgroup_stat_mutex
);
6 static DEFINE_PER_CPU(raw_spinlock_t
, cgroup_cpu_stat_lock
);
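/*
 * cgroup_stat_mutex serializes stat flushing, while each CPU's
 * cgroup_cpu_stat_lock protects that CPU's updated_children/updated_next
 * lists (see cgroup_cpu_stat_updated() and cgroup_cpu_stat_pop_updated()
 * below).
 */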
static struct cgroup_cpu_stat *cgroup_cpu_stat(struct cgroup *cgrp, int cpu)
{
	return per_cpu_ptr(cgrp->cpu_stat, cpu);
}
/**
 * cgroup_cpu_stat_updated - keep track of updated cpu_stat
 * @cgrp: target cgroup
 * @cpu: cpu on which cpu_stat was updated
 *
 * @cgrp's cpu_stat on @cpu was updated.  Put it on the parent's matching
 * cpu_stat->updated_children list.  See the comment on top of
 * cgroup_cpu_stat definition for details.
 */
static void cgroup_cpu_stat_updated(struct cgroup *cgrp, int cpu)
{
	raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_cpu_stat_lock, cpu);
	struct cgroup *parent;
	unsigned long flags;

	/*
	 * Speculative already-on-list test.  This may race leading to
	 * temporary inaccuracies, which is fine.
	 *
	 * Because @parent's updated_children is terminated with @parent
	 * instead of NULL, we can tell whether @cgrp is on the list by
	 * testing the next pointer for NULL.
	 */
	if (cgroup_cpu_stat(cgrp, cpu)->updated_next)
		return;

	raw_spin_lock_irqsave(cpu_lock, flags);

	/* put @cgrp and all ancestors on the corresponding updated lists */
	for (parent = cgroup_parent(cgrp); parent;
	     cgrp = parent, parent = cgroup_parent(cgrp)) {
		struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu);
		struct cgroup_cpu_stat *pcstat = cgroup_cpu_stat(parent, cpu);

		/*
		 * Both additions and removals are bottom-up.  If a cgroup
		 * is already in the tree, all ancestors are.
		 */
		if (cstat->updated_next)
			break;

		cstat->updated_next = pcstat->updated_children;
		pcstat->updated_children = cgrp;
	}

	raw_spin_unlock_irqrestore(cpu_lock, flags);
}
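/*
 * Illustration: with a hierarchy root <- A <- B, an update on B for a
 * given CPU leaves that CPU's lists as
 *
 *	root->updated_children == A,	A->updated_next == root
 *	A->updated_children == B,	B->updated_next == A
 *
 * i.e. each updated cgroup sits on its parent's singly linked list,
 * terminated by the parent itself rather than NULL.
 */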
/**
 * cgroup_cpu_stat_pop_updated - iterate and dismantle cpu_stat updated tree
 * @pos: current position
 * @root: root of the tree to traverse
 * @cpu: target cpu
 *
 * Walks the updated cpu_stat tree on @cpu from @root.  %NULL @pos starts
 * the traversal and %NULL return indicates the end.  During traversal,
 * each returned cgroup is unlinked from the tree.  Must be called with the
 * matching cgroup_cpu_stat_lock held.
 *
 * The only ordering guarantee is that, for a parent and a child pair
 * covered by a given traversal, if a child is visited, its parent is
 * guaranteed to be visited afterwards.
 */
static struct cgroup *cgroup_cpu_stat_pop_updated(struct cgroup *pos,
						  struct cgroup *root, int cpu)
{
	struct cgroup_cpu_stat *cstat;
	struct cgroup *parent;

	if (pos == root)
		return NULL;

	/*
	 * We're going to walk down to the first leaf and visit/remove it.
	 * We can pick any unvisited node as the starting point.
	 */
	if (!pos)
		pos = root;
	else
		pos = cgroup_parent(pos);

	/* walk down to the first leaf */
	while (true) {
		cstat = cgroup_cpu_stat(pos, cpu);
		if (cstat->updated_children == pos)
			break;
		pos = cstat->updated_children;
	}

	/*
	 * Unlink @pos from the tree.  As the updated_children list is
	 * singly linked, we have to walk it to find the removal point.
	 * However, due to the way we traverse, @pos will be the first
	 * child in most cases.  The only exception is @root.
	 */
	parent = cgroup_parent(pos);
	if (parent && cstat->updated_next) {
		struct cgroup_cpu_stat *pcstat = cgroup_cpu_stat(parent, cpu);
		struct cgroup_cpu_stat *ncstat;
		struct cgroup **nextp;

		nextp = &pcstat->updated_children;
		while (true) {
			ncstat = cgroup_cpu_stat(*nextp, cpu);
			if (*nextp == pos)
				break;

			WARN_ON_ONCE(*nextp == parent);
			nextp = &ncstat->updated_next;
		}

		*nextp = cstat->updated_next;
		cstat->updated_next = NULL;
	}

	return pos;
}
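/*
 * Continuing the illustration above (root <- A <- B, all updated):
 * starting from pos == NULL, successive cgroup_cpu_stat_pop_updated()
 * calls return B, then A, then root, then NULL -- a child is always
 * popped before its parent.
 */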
static void cgroup_stat_accumulate(struct cgroup_stat *dst_stat,
				   struct cgroup_stat *src_stat)
{
	dst_stat->cputime.utime += src_stat->cputime.utime;
	dst_stat->cputime.stime += src_stat->cputime.stime;
	dst_stat->cputime.sum_exec_runtime += src_stat->cputime.sum_exec_runtime;
}
static void cgroup_cpu_stat_flush_one(struct cgroup *cgrp, int cpu)
{
	struct cgroup *parent = cgroup_parent(cgrp);
	struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu);
	struct task_cputime *last_cputime = &cstat->last_cputime;
	struct task_cputime cputime;
	struct cgroup_stat delta;
	unsigned int seq;

	lockdep_assert_held(&cgroup_stat_mutex);

	/* fetch the current per-cpu values */
	do {
		seq = __u64_stats_fetch_begin(&cstat->sync);
		cputime = cstat->cputime;
	} while (__u64_stats_fetch_retry(&cstat->sync, seq));
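	/*
	 * The seqcount retry above pairs with u64_stats_update_begin/end()
	 * in the accounting path: on 32-bit kernels it guards against torn
	 * reads of the 64-bit counters, on 64-bit it compiles away.
	 */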
	/* accumulate the deltas to propagate */
	delta.cputime.utime = cputime.utime - last_cputime->utime;
	delta.cputime.stime = cputime.stime - last_cputime->stime;
	delta.cputime.sum_exec_runtime = cputime.sum_exec_runtime -
					 last_cputime->sum_exec_runtime;
	*last_cputime = cputime;

	/* transfer the pending stat into delta */
	cgroup_stat_accumulate(&delta, &cgrp->pending_stat);
	memset(&cgrp->pending_stat, 0, sizeof(cgrp->pending_stat));

	/* propagate delta into the global stat and the parent's pending */
	cgroup_stat_accumulate(&cgrp->stat, &delta);
	if (parent)
		cgroup_stat_accumulate(&parent->pending_stat, &delta);
}
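/*
 * Example of the propagation above: if B (root <- A <- B) ran for 1ms
 * since the last flush, flushing B adds 1ms to B->stat and to
 * A->pending_stat; because children are popped before their parents, the
 * subsequent flush of A moves that pending 1ms into A->stat and
 * root->pending_stat within the same pass.
 */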
/* see cgroup_stat_flush() */
static void cgroup_stat_flush_locked(struct cgroup *cgrp)
{
	int cpu;

	lockdep_assert_held(&cgroup_stat_mutex);

	for_each_possible_cpu(cpu) {
		raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_cpu_stat_lock,
						       cpu);
		struct cgroup *pos = NULL;

		raw_spin_lock_irq(cpu_lock);
		while ((pos = cgroup_cpu_stat_pop_updated(pos, cgrp, cpu)))
			cgroup_cpu_stat_flush_one(pos, cpu);
		raw_spin_unlock_irq(cpu_lock);
	}
}
/**
 * cgroup_stat_flush - flush stats in @cgrp's subtree
 * @cgrp: target cgroup
 *
 * Collect all per-cpu stats in @cgrp's subtree into the global counters
 * and propagate them upwards.  After this function returns, all cgroups in
 * the subtree have up-to-date ->stat.
 *
 * This also gets all cgroups in the subtree including @cgrp off the
 * ->updated_children lists.
 */
void cgroup_stat_flush(struct cgroup *cgrp)
{
	mutex_lock(&cgroup_stat_mutex);
	cgroup_stat_flush_locked(cgrp);
	mutex_unlock(&cgroup_stat_mutex);
}
static struct cgroup_cpu_stat *cgroup_cpu_stat_account_begin(struct cgroup *cgrp)
{
	struct cgroup_cpu_stat *cstat;

	cstat = get_cpu_ptr(cgrp->cpu_stat);
	u64_stats_update_begin(&cstat->sync);

	return cstat;
}
static void cgroup_cpu_stat_account_end(struct cgroup *cgrp,
					struct cgroup_cpu_stat *cstat)
{
	u64_stats_update_end(&cstat->sync);
	cgroup_cpu_stat_updated(cgrp, smp_processor_id());
	put_cpu_ptr(cgrp->cpu_stat);
}
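/*
 * The begin/end pair above runs with preemption disabled -- get_cpu_ptr()
 * in _begin() and put_cpu_ptr() in _end() -- so the accounting helpers
 * below always update the local CPU's cgroup_cpu_stat and the
 * smp_processor_id() call in _end() is stable.
 */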
void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec)
{
	struct cgroup_cpu_stat *cstat;

	cstat = cgroup_cpu_stat_account_begin(cgrp);
	cstat->cputime.sum_exec_runtime += delta_exec;
	cgroup_cpu_stat_account_end(cgrp, cstat);
}
void __cgroup_account_cputime_field(struct cgroup *cgrp,
				    enum cpu_usage_stat index, u64 delta_exec)
{
	struct cgroup_cpu_stat *cstat;

	cstat = cgroup_cpu_stat_account_begin(cgrp);

	switch (index) {
	case CPUTIME_USER:
	case CPUTIME_NICE:
		cstat->cputime.utime += delta_exec;
		break;
	case CPUTIME_SYSTEM:
	case CPUTIME_IRQ:
	case CPUTIME_SOFTIRQ:
		cstat->cputime.stime += delta_exec;
		break;
	default:
		break;
	}

	cgroup_cpu_stat_account_end(cgrp, cstat);
}
void cgroup_stat_show_cputime(struct seq_file *seq)
{
	struct cgroup *cgrp = seq_css(seq)->cgroup;
	u64 usage, utime, stime;

	if (!cgroup_parent(cgrp))
		return;

	mutex_lock(&cgroup_stat_mutex);

	cgroup_stat_flush_locked(cgrp);

	usage = cgrp->stat.cputime.sum_exec_runtime;
	cputime_adjust(&cgrp->stat.cputime, &cgrp->stat.prev_cputime,
		       &utime, &stime);

	mutex_unlock(&cgroup_stat_mutex);

	do_div(usage, NSEC_PER_USEC);
	do_div(utime, NSEC_PER_USEC);
	do_div(stime, NSEC_PER_USEC);

	seq_printf(seq, "usage_usec %llu\n"
		   "user_usec %llu\n"
		   "system_usec %llu\n",
		   usage, utime, stime);
}
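/*
 * cgroup_stat_show_cputime() emits, e.g. (illustrative values, in
 * microseconds):
 *
 *	usage_usec 10525
 *	user_usec 7342
 *	system_usec 3183
 */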
int cgroup_stat_init(struct cgroup *cgrp)
{
	int cpu;

	/* the root cgrp has cpu_stat preallocated */
	if (!cgrp->cpu_stat) {
		cgrp->cpu_stat = alloc_percpu(struct cgroup_cpu_stat);
		if (!cgrp->cpu_stat)
			return -ENOMEM;
	}

	/* ->updated_children list is self terminated */
	for_each_possible_cpu(cpu) {
		struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu);

		cstat->updated_children = cgrp;
		u64_stats_init(&cstat->sync);
	}

	prev_cputime_init(&cgrp->stat.prev_cputime);

	return 0;
}
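/*
 * cgroup_stat_exit() below flushes first so that any counters still
 * sitting in the per-cpu structures are folded into the ancestors'
 * ->stat/->pending_stat before the per-cpu area is freed; the WARNs then
 * verify that the flush took the cgroup off every per-cpu updated list.
 */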
void cgroup_stat_exit(struct cgroup *cgrp)
{
	int cpu;

	cgroup_stat_flush(cgrp);

	/* sanity checks */
	for_each_possible_cpu(cpu) {
		struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu);

		if (WARN_ON_ONCE(cstat->updated_children != cgrp) ||
		    WARN_ON_ONCE(cstat->updated_next))
			return;
	}

	free_percpu(cgrp->cpu_stat);
	cgrp->cpu_stat = NULL;
}
void __init cgroup_stat_boot(void)
{
	int cpu;

	for_each_possible_cpu(cpu)
		raw_spin_lock_init(per_cpu_ptr(&cgroup_cpu_stat_lock, cpu));

	BUG_ON(cgroup_stat_init(&cgrp_dfl_root.cgrp));
}