1 // SPDX-License-Identifier: GPL-2.0-only
2 #include "cgroup-internal.h"
4 #include <linux/sched/cputime.h>
8 #include <linux/btf_ids.h>
10 #include <trace/events/cgroup.h>
12 static DEFINE_SPINLOCK(cgroup_rstat_lock
);
13 static DEFINE_PER_CPU(raw_spinlock_t
, cgroup_rstat_cpu_lock
);
15 static void cgroup_base_stat_flush(struct cgroup
*cgrp
, int cpu
);
17 static struct cgroup_rstat_cpu
*cgroup_rstat_cpu(struct cgroup
*cgrp
, int cpu
)
19 return per_cpu_ptr(cgrp
->rstat_cpu
, cpu
);
23 * Helper functions for rstat per CPU lock (cgroup_rstat_cpu_lock).
25 * This makes it easier to diagnose locking issues and contention in
26 * production environments. The parameter @fast_path determine the
27 * tracepoints being added, allowing us to diagnose "flush" related
28 * operations without handling high-frequency fast-path "update" events.
30 static __always_inline
31 unsigned long _cgroup_rstat_cpu_lock(raw_spinlock_t
*cpu_lock
, int cpu
,
32 struct cgroup
*cgrp
, const bool fast_path
)
38 * The _irqsave() is needed because cgroup_rstat_lock is
39 * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring
40 * this lock with the _irq() suffix only disables interrupts on
41 * a non-PREEMPT_RT kernel. The raw_spinlock_t below disables
42 * interrupts on both configurations. The _irqsave() ensures
43 * that interrupts are always disabled and later restored.
45 contended
= !raw_spin_trylock_irqsave(cpu_lock
, flags
);
48 trace_cgroup_rstat_cpu_lock_contended_fastpath(cgrp
, cpu
, contended
);
50 trace_cgroup_rstat_cpu_lock_contended(cgrp
, cpu
, contended
);
52 raw_spin_lock_irqsave(cpu_lock
, flags
);
56 trace_cgroup_rstat_cpu_locked_fastpath(cgrp
, cpu
, contended
);
58 trace_cgroup_rstat_cpu_locked(cgrp
, cpu
, contended
);
63 static __always_inline
64 void _cgroup_rstat_cpu_unlock(raw_spinlock_t
*cpu_lock
, int cpu
,
65 struct cgroup
*cgrp
, unsigned long flags
,
69 trace_cgroup_rstat_cpu_unlock_fastpath(cgrp
, cpu
, false);
71 trace_cgroup_rstat_cpu_unlock(cgrp
, cpu
, false);
73 raw_spin_unlock_irqrestore(cpu_lock
, flags
);
77 * cgroup_rstat_updated - keep track of updated rstat_cpu
78 * @cgrp: target cgroup
79 * @cpu: cpu on which rstat_cpu was updated
81 * @cgrp's rstat_cpu on @cpu was updated. Put it on the parent's matching
82 * rstat_cpu->updated_children list. See the comment on top of
83 * cgroup_rstat_cpu definition for details.
85 __bpf_kfunc
void cgroup_rstat_updated(struct cgroup
*cgrp
, int cpu
)
87 raw_spinlock_t
*cpu_lock
= per_cpu_ptr(&cgroup_rstat_cpu_lock
, cpu
);
91 * Speculative already-on-list test. This may race leading to
92 * temporary inaccuracies, which is fine.
94 * Because @parent's updated_children is terminated with @parent
95 * instead of NULL, we can tell whether @cgrp is on the list by
96 * testing the next pointer for NULL.
98 if (data_race(cgroup_rstat_cpu(cgrp
, cpu
)->updated_next
))
101 flags
= _cgroup_rstat_cpu_lock(cpu_lock
, cpu
, cgrp
, true);
103 /* put @cgrp and all ancestors on the corresponding updated lists */
105 struct cgroup_rstat_cpu
*rstatc
= cgroup_rstat_cpu(cgrp
, cpu
);
106 struct cgroup
*parent
= cgroup_parent(cgrp
);
107 struct cgroup_rstat_cpu
*prstatc
;
110 * Both additions and removals are bottom-up. If a cgroup
111 * is already in the tree, all ancestors are.
113 if (rstatc
->updated_next
)
116 /* Root has no parent to link it to, but mark it busy */
118 rstatc
->updated_next
= cgrp
;
122 prstatc
= cgroup_rstat_cpu(parent
, cpu
);
123 rstatc
->updated_next
= prstatc
->updated_children
;
124 prstatc
->updated_children
= cgrp
;
129 _cgroup_rstat_cpu_unlock(cpu_lock
, cpu
, cgrp
, flags
, true);
133 * cgroup_rstat_push_children - push children cgroups into the given list
134 * @head: current head of the list (= subtree root)
135 * @child: first child of the root
137 * Return: A new singly linked list of cgroups to be flush
139 * Iteratively traverse down the cgroup_rstat_cpu updated tree level by
140 * level and push all the parents first before their next level children
141 * into a singly linked list built from the tail backward like "pushing"
142 * cgroups into a stack. The root is pushed by the caller.
144 static struct cgroup
*cgroup_rstat_push_children(struct cgroup
*head
,
145 struct cgroup
*child
, int cpu
)
147 struct cgroup
*chead
= child
; /* Head of child cgroup level */
148 struct cgroup
*ghead
= NULL
; /* Head of grandchild cgroup level */
149 struct cgroup
*parent
, *grandchild
;
150 struct cgroup_rstat_cpu
*crstatc
;
152 child
->rstat_flush_next
= NULL
;
157 chead
= child
->rstat_flush_next
;
158 parent
= cgroup_parent(child
);
160 /* updated_next is parent cgroup terminated */
161 while (child
!= parent
) {
162 child
->rstat_flush_next
= head
;
164 crstatc
= cgroup_rstat_cpu(child
, cpu
);
165 grandchild
= crstatc
->updated_children
;
166 if (grandchild
!= child
) {
167 /* Push the grand child to the next level */
168 crstatc
->updated_children
= child
;
169 grandchild
->rstat_flush_next
= ghead
;
172 child
= crstatc
->updated_next
;
173 crstatc
->updated_next
= NULL
;
186 * cgroup_rstat_updated_list - return a list of updated cgroups to be flushed
187 * @root: root of the cgroup subtree to traverse
189 * Return: A singly linked list of cgroups to be flushed
191 * Walks the updated rstat_cpu tree on @cpu from @root. During traversal,
192 * each returned cgroup is unlinked from the updated tree.
194 * The only ordering guarantee is that, for a parent and a child pair
195 * covered by a given traversal, the child is before its parent in
198 * Note that updated_children is self terminated and points to a list of
199 * child cgroups if not empty. Whereas updated_next is like a sibling link
200 * within the children list and terminated by the parent cgroup. An exception
201 * here is the cgroup root whose updated_next can be self terminated.
203 static struct cgroup
*cgroup_rstat_updated_list(struct cgroup
*root
, int cpu
)
205 raw_spinlock_t
*cpu_lock
= per_cpu_ptr(&cgroup_rstat_cpu_lock
, cpu
);
206 struct cgroup_rstat_cpu
*rstatc
= cgroup_rstat_cpu(root
, cpu
);
207 struct cgroup
*head
= NULL
, *parent
, *child
;
210 flags
= _cgroup_rstat_cpu_lock(cpu_lock
, cpu
, root
, false);
212 /* Return NULL if this subtree is not on-list */
213 if (!rstatc
->updated_next
)
217 * Unlink @root from its parent. As the updated_children list is
218 * singly linked, we have to walk it to find the removal point.
220 parent
= cgroup_parent(root
);
222 struct cgroup_rstat_cpu
*prstatc
;
223 struct cgroup
**nextp
;
225 prstatc
= cgroup_rstat_cpu(parent
, cpu
);
226 nextp
= &prstatc
->updated_children
;
227 while (*nextp
!= root
) {
228 struct cgroup_rstat_cpu
*nrstatc
;
230 nrstatc
= cgroup_rstat_cpu(*nextp
, cpu
);
231 WARN_ON_ONCE(*nextp
== parent
);
232 nextp
= &nrstatc
->updated_next
;
234 *nextp
= rstatc
->updated_next
;
237 rstatc
->updated_next
= NULL
;
239 /* Push @root to the list first before pushing the children */
241 root
->rstat_flush_next
= NULL
;
242 child
= rstatc
->updated_children
;
243 rstatc
->updated_children
= root
;
245 head
= cgroup_rstat_push_children(head
, child
, cpu
);
247 _cgroup_rstat_cpu_unlock(cpu_lock
, cpu
, root
, flags
, false);
252 * A hook for bpf stat collectors to attach to and flush their stats.
253 * Together with providing bpf kfuncs for cgroup_rstat_updated() and
254 * cgroup_rstat_flush(), this enables a complete workflow where bpf progs that
255 * collect cgroup stats can integrate with rstat for efficient flushing.
257 * A static noinline declaration here could cause the compiler to optimize away
258 * the function. A global noinline declaration will keep the definition, but may
259 * optimize away the callsite. Therefore, __weak is needed to ensure that the
260 * call is still emitted, by telling the compiler that we don't know what the
261 * function might eventually be.
266 __weak noinline
void bpf_rstat_flush(struct cgroup
*cgrp
,
267 struct cgroup
*parent
, int cpu
)
274 * Helper functions for locking cgroup_rstat_lock.
276 * This makes it easier to diagnose locking issues and contention in
277 * production environments. The parameter @cpu_in_loop indicate lock
278 * was released and re-taken when collection data from the CPUs. The
279 * value -1 is used when obtaining the main lock else this is the CPU
280 * number processed last.
282 static inline void __cgroup_rstat_lock(struct cgroup
*cgrp
, int cpu_in_loop
)
283 __acquires(&cgroup_rstat_lock
)
287 contended
= !spin_trylock_irq(&cgroup_rstat_lock
);
289 trace_cgroup_rstat_lock_contended(cgrp
, cpu_in_loop
, contended
);
290 spin_lock_irq(&cgroup_rstat_lock
);
292 trace_cgroup_rstat_locked(cgrp
, cpu_in_loop
, contended
);
295 static inline void __cgroup_rstat_unlock(struct cgroup
*cgrp
, int cpu_in_loop
)
296 __releases(&cgroup_rstat_lock
)
298 trace_cgroup_rstat_unlock(cgrp
, cpu_in_loop
, false);
299 spin_unlock_irq(&cgroup_rstat_lock
);
302 /* see cgroup_rstat_flush() */
303 static void cgroup_rstat_flush_locked(struct cgroup
*cgrp
)
304 __releases(&cgroup_rstat_lock
) __acquires(&cgroup_rstat_lock
)
308 lockdep_assert_held(&cgroup_rstat_lock
);
310 for_each_possible_cpu(cpu
) {
311 struct cgroup
*pos
= cgroup_rstat_updated_list(cgrp
, cpu
);
313 for (; pos
; pos
= pos
->rstat_flush_next
) {
314 struct cgroup_subsys_state
*css
;
316 cgroup_base_stat_flush(pos
, cpu
);
317 bpf_rstat_flush(pos
, cgroup_parent(pos
), cpu
);
320 list_for_each_entry_rcu(css
, &pos
->rstat_css_list
,
322 css
->ss
->css_rstat_flush(css
, cpu
);
326 /* play nice and yield if necessary */
327 if (need_resched() || spin_needbreak(&cgroup_rstat_lock
)) {
328 __cgroup_rstat_unlock(cgrp
, cpu
);
331 __cgroup_rstat_lock(cgrp
, cpu
);
337 * cgroup_rstat_flush - flush stats in @cgrp's subtree
338 * @cgrp: target cgroup
340 * Collect all per-cpu stats in @cgrp's subtree into the global counters
341 * and propagate them upwards. After this function returns, all cgroups in
342 * the subtree have up-to-date ->stat.
344 * This also gets all cgroups in the subtree including @cgrp off the
345 * ->updated_children lists.
347 * This function may block.
349 __bpf_kfunc
void cgroup_rstat_flush(struct cgroup
*cgrp
)
353 __cgroup_rstat_lock(cgrp
, -1);
354 cgroup_rstat_flush_locked(cgrp
);
355 __cgroup_rstat_unlock(cgrp
, -1);
359 * cgroup_rstat_flush_hold - flush stats in @cgrp's subtree and hold
360 * @cgrp: target cgroup
362 * Flush stats in @cgrp's subtree and prevent further flushes. Must be
363 * paired with cgroup_rstat_flush_release().
365 * This function may block.
367 void cgroup_rstat_flush_hold(struct cgroup
*cgrp
)
368 __acquires(&cgroup_rstat_lock
)
371 __cgroup_rstat_lock(cgrp
, -1);
372 cgroup_rstat_flush_locked(cgrp
);
376 * cgroup_rstat_flush_release - release cgroup_rstat_flush_hold()
377 * @cgrp: cgroup used by tracepoint
379 void cgroup_rstat_flush_release(struct cgroup
*cgrp
)
380 __releases(&cgroup_rstat_lock
)
382 __cgroup_rstat_unlock(cgrp
, -1);
385 int cgroup_rstat_init(struct cgroup
*cgrp
)
389 /* the root cgrp has rstat_cpu preallocated */
390 if (!cgrp
->rstat_cpu
) {
391 cgrp
->rstat_cpu
= alloc_percpu(struct cgroup_rstat_cpu
);
392 if (!cgrp
->rstat_cpu
)
396 /* ->updated_children list is self terminated */
397 for_each_possible_cpu(cpu
) {
398 struct cgroup_rstat_cpu
*rstatc
= cgroup_rstat_cpu(cgrp
, cpu
);
400 rstatc
->updated_children
= cgrp
;
401 u64_stats_init(&rstatc
->bsync
);
407 void cgroup_rstat_exit(struct cgroup
*cgrp
)
411 cgroup_rstat_flush(cgrp
);
414 for_each_possible_cpu(cpu
) {
415 struct cgroup_rstat_cpu
*rstatc
= cgroup_rstat_cpu(cgrp
, cpu
);
417 if (WARN_ON_ONCE(rstatc
->updated_children
!= cgrp
) ||
418 WARN_ON_ONCE(rstatc
->updated_next
))
422 free_percpu(cgrp
->rstat_cpu
);
423 cgrp
->rstat_cpu
= NULL
;
426 void __init
cgroup_rstat_boot(void)
430 for_each_possible_cpu(cpu
)
431 raw_spin_lock_init(per_cpu_ptr(&cgroup_rstat_cpu_lock
, cpu
));
435 * Functions for cgroup basic resource statistics implemented on top of
438 static void cgroup_base_stat_add(struct cgroup_base_stat
*dst_bstat
,
439 struct cgroup_base_stat
*src_bstat
)
441 dst_bstat
->cputime
.utime
+= src_bstat
->cputime
.utime
;
442 dst_bstat
->cputime
.stime
+= src_bstat
->cputime
.stime
;
443 dst_bstat
->cputime
.sum_exec_runtime
+= src_bstat
->cputime
.sum_exec_runtime
;
444 #ifdef CONFIG_SCHED_CORE
445 dst_bstat
->forceidle_sum
+= src_bstat
->forceidle_sum
;
447 dst_bstat
->ntime
+= src_bstat
->ntime
;
450 static void cgroup_base_stat_sub(struct cgroup_base_stat
*dst_bstat
,
451 struct cgroup_base_stat
*src_bstat
)
453 dst_bstat
->cputime
.utime
-= src_bstat
->cputime
.utime
;
454 dst_bstat
->cputime
.stime
-= src_bstat
->cputime
.stime
;
455 dst_bstat
->cputime
.sum_exec_runtime
-= src_bstat
->cputime
.sum_exec_runtime
;
456 #ifdef CONFIG_SCHED_CORE
457 dst_bstat
->forceidle_sum
-= src_bstat
->forceidle_sum
;
459 dst_bstat
->ntime
-= src_bstat
->ntime
;
462 static void cgroup_base_stat_flush(struct cgroup
*cgrp
, int cpu
)
464 struct cgroup_rstat_cpu
*rstatc
= cgroup_rstat_cpu(cgrp
, cpu
);
465 struct cgroup
*parent
= cgroup_parent(cgrp
);
466 struct cgroup_rstat_cpu
*prstatc
;
467 struct cgroup_base_stat delta
;
470 /* Root-level stats are sourced from system-wide CPU stats */
474 /* fetch the current per-cpu values */
476 seq
= __u64_stats_fetch_begin(&rstatc
->bsync
);
477 delta
= rstatc
->bstat
;
478 } while (__u64_stats_fetch_retry(&rstatc
->bsync
, seq
));
480 /* propagate per-cpu delta to cgroup and per-cpu global statistics */
481 cgroup_base_stat_sub(&delta
, &rstatc
->last_bstat
);
482 cgroup_base_stat_add(&cgrp
->bstat
, &delta
);
483 cgroup_base_stat_add(&rstatc
->last_bstat
, &delta
);
484 cgroup_base_stat_add(&rstatc
->subtree_bstat
, &delta
);
486 /* propagate cgroup and per-cpu global delta to parent (unless that's root) */
487 if (cgroup_parent(parent
)) {
489 cgroup_base_stat_sub(&delta
, &cgrp
->last_bstat
);
490 cgroup_base_stat_add(&parent
->bstat
, &delta
);
491 cgroup_base_stat_add(&cgrp
->last_bstat
, &delta
);
493 delta
= rstatc
->subtree_bstat
;
494 prstatc
= cgroup_rstat_cpu(parent
, cpu
);
495 cgroup_base_stat_sub(&delta
, &rstatc
->last_subtree_bstat
);
496 cgroup_base_stat_add(&prstatc
->subtree_bstat
, &delta
);
497 cgroup_base_stat_add(&rstatc
->last_subtree_bstat
, &delta
);
501 static struct cgroup_rstat_cpu
*
502 cgroup_base_stat_cputime_account_begin(struct cgroup
*cgrp
, unsigned long *flags
)
504 struct cgroup_rstat_cpu
*rstatc
;
506 rstatc
= get_cpu_ptr(cgrp
->rstat_cpu
);
507 *flags
= u64_stats_update_begin_irqsave(&rstatc
->bsync
);
511 static void cgroup_base_stat_cputime_account_end(struct cgroup
*cgrp
,
512 struct cgroup_rstat_cpu
*rstatc
,
515 u64_stats_update_end_irqrestore(&rstatc
->bsync
, flags
);
516 cgroup_rstat_updated(cgrp
, smp_processor_id());
520 void __cgroup_account_cputime(struct cgroup
*cgrp
, u64 delta_exec
)
522 struct cgroup_rstat_cpu
*rstatc
;
525 rstatc
= cgroup_base_stat_cputime_account_begin(cgrp
, &flags
);
526 rstatc
->bstat
.cputime
.sum_exec_runtime
+= delta_exec
;
527 cgroup_base_stat_cputime_account_end(cgrp
, rstatc
, flags
);
530 void __cgroup_account_cputime_field(struct cgroup
*cgrp
,
531 enum cpu_usage_stat index
, u64 delta_exec
)
533 struct cgroup_rstat_cpu
*rstatc
;
536 rstatc
= cgroup_base_stat_cputime_account_begin(cgrp
, &flags
);
540 rstatc
->bstat
.ntime
+= delta_exec
;
543 rstatc
->bstat
.cputime
.utime
+= delta_exec
;
547 case CPUTIME_SOFTIRQ
:
548 rstatc
->bstat
.cputime
.stime
+= delta_exec
;
550 #ifdef CONFIG_SCHED_CORE
551 case CPUTIME_FORCEIDLE
:
552 rstatc
->bstat
.forceidle_sum
+= delta_exec
;
559 cgroup_base_stat_cputime_account_end(cgrp
, rstatc
, flags
);
563 * compute the cputime for the root cgroup by getting the per cpu data
564 * at a global level, then categorizing the fields in a manner consistent
565 * with how it is done by __cgroup_account_cputime_field for each bit of
566 * cpu time attributed to a cgroup.
568 static void root_cgroup_cputime(struct cgroup_base_stat
*bstat
)
570 struct task_cputime
*cputime
= &bstat
->cputime
;
573 memset(bstat
, 0, sizeof(*bstat
));
574 for_each_possible_cpu(i
) {
575 struct kernel_cpustat kcpustat
;
576 u64
*cpustat
= kcpustat
.cpustat
;
580 kcpustat_cpu_fetch(&kcpustat
, i
);
582 user
+= cpustat
[CPUTIME_USER
];
583 user
+= cpustat
[CPUTIME_NICE
];
584 cputime
->utime
+= user
;
586 sys
+= cpustat
[CPUTIME_SYSTEM
];
587 sys
+= cpustat
[CPUTIME_IRQ
];
588 sys
+= cpustat
[CPUTIME_SOFTIRQ
];
589 cputime
->stime
+= sys
;
591 cputime
->sum_exec_runtime
+= user
;
592 cputime
->sum_exec_runtime
+= sys
;
593 cputime
->sum_exec_runtime
+= cpustat
[CPUTIME_STEAL
];
595 #ifdef CONFIG_SCHED_CORE
596 bstat
->forceidle_sum
+= cpustat
[CPUTIME_FORCEIDLE
];
598 bstat
->ntime
+= cpustat
[CPUTIME_NICE
];
/*
 * Print @bstat's force-idle time in microseconds to @seq. Compiles to a
 * no-op without CONFIG_SCHED_CORE.
 */
static void cgroup_force_idle_show(struct seq_file *seq, struct cgroup_base_stat *bstat)
{
#ifdef CONFIG_SCHED_CORE
	u64 forceidle_time = bstat->forceidle_sum;

	do_div(forceidle_time, NSEC_PER_USEC);
	seq_printf(seq, "core_sched.force_idle_usec %llu\n", forceidle_time);
#endif
}
613 void cgroup_base_stat_cputime_show(struct seq_file
*seq
)
615 struct cgroup
*cgrp
= seq_css(seq
)->cgroup
;
616 u64 usage
, utime
, stime
, ntime
;
618 if (cgroup_parent(cgrp
)) {
619 cgroup_rstat_flush_hold(cgrp
);
620 usage
= cgrp
->bstat
.cputime
.sum_exec_runtime
;
621 cputime_adjust(&cgrp
->bstat
.cputime
, &cgrp
->prev_cputime
,
623 ntime
= cgrp
->bstat
.ntime
;
624 cgroup_rstat_flush_release(cgrp
);
626 /* cgrp->bstat of root is not actually used, reuse it */
627 root_cgroup_cputime(&cgrp
->bstat
);
628 usage
= cgrp
->bstat
.cputime
.sum_exec_runtime
;
629 utime
= cgrp
->bstat
.cputime
.utime
;
630 stime
= cgrp
->bstat
.cputime
.stime
;
631 ntime
= cgrp
->bstat
.ntime
;
634 do_div(usage
, NSEC_PER_USEC
);
635 do_div(utime
, NSEC_PER_USEC
);
636 do_div(stime
, NSEC_PER_USEC
);
637 do_div(ntime
, NSEC_PER_USEC
);
639 seq_printf(seq
, "usage_usec %llu\n"
643 usage
, utime
, stime
, ntime
);
645 cgroup_force_idle_show(seq
, &cgrp
->bstat
);
648 /* Add bpf kfuncs for cgroup_rstat_updated() and cgroup_rstat_flush() */
649 BTF_KFUNCS_START(bpf_rstat_kfunc_ids
)
650 BTF_ID_FLAGS(func
, cgroup_rstat_updated
)
651 BTF_ID_FLAGS(func
, cgroup_rstat_flush
, KF_SLEEPABLE
)
652 BTF_KFUNCS_END(bpf_rstat_kfunc_ids
)
654 static const struct btf_kfunc_id_set bpf_rstat_kfunc_set
= {
655 .owner
= THIS_MODULE
,
656 .set
= &bpf_rstat_kfunc_ids
,
659 static int __init
bpf_rstat_kfunc_init(void)
661 return register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING
,
662 &bpf_rstat_kfunc_set
);
664 late_initcall(bpf_rstat_kfunc_init
);