1 // SPDX-License-Identifier: GPL-2.0-only
3 * Simple CPU accounting cgroup controller
7 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
10 * There are no locks covering percpu hardirq/softirq time.
11 * They are only modified in vtime_account, on corresponding CPU
12 * with interrupts disabled. So, writes are safe.
13 * They are read and saved off onto struct rq in update_rq_clock().
14 * This may result in another CPU reading this CPU's irq time and can
15 * race with irq/vtime_account on this CPU. We would either get old
16 * or new value with a side effect of accounting a slice of irq time to wrong
17 * task when irq is in progress while we read rq->clock. That is a worthy
18 * compromise in place of having locks on each irq in account_system_time.
/*
 * Per-CPU struct irqtime instance used by the IRQ time accounting helpers
 * below (they reach it through this_cpu_ptr(&cpu_irqtime)).
 */
20 DEFINE_PER_CPU(struct irqtime
, cpu_irqtime
);
/*
 * File-local switch for IRQ time accounting: set to 1 by
 * enable_sched_clock_irqtime() and to 0 by disable_sched_clock_irqtime().
 */
22 static int sched_clock_irqtime
;
/* Switch IRQ time accounting on by setting sched_clock_irqtime to 1. */
24 void enable_sched_clock_irqtime(void)
26 sched_clock_irqtime
= 1;
/* Switch IRQ time accounting off by setting sched_clock_irqtime to 0. */
29 void disable_sched_clock_irqtime(void)
31 sched_clock_irqtime
= 0;
/*
 * Charge @delta of IRQ time.
 * @irqtime: the irqtime record to update
 * @delta:   amount of time to add
 * @idx:     cpustat slot of the current CPU that receives the time
 *
 * Adds @delta to this CPU's cpustat[@idx] and to @irqtime's total and
 * tick_delta counters. The three additions are bracketed by
 * u64_stats_update_begin()/u64_stats_update_end() on irqtime->sync.
 */
34 static void irqtime_account_delta(struct irqtime
*irqtime
, u64 delta
,
35 enum cpu_usage_stat idx
)
37 u64
*cpustat
= kcpustat_this_cpu
->cpustat
;
39 u64_stats_update_begin(&irqtime
->sync
);
40 cpustat
[idx
] += delta
;
41 irqtime
->total
+= delta
;
42 irqtime
->tick_delta
+= delta
;
43 u64_stats_update_end(&irqtime
->sync
);
47 * Called after incrementing preempt_count on {soft,}irq_enter
48 * and before decrementing preempt_count on {soft,}irq_exit.
/*
 * Account the time elapsed since the last IRQ-time sample on this CPU.
 * @curr:   task compared against this CPU's ksoftirqd thread
 * @offset: value subtracted from preempt_count() before testing its bits
 *
 * The elapsed time is sched_clock_cpu() of the current CPU minus
 * irqtime->irq_start_time, and irq_start_time is advanced by that amount.
 * It is charged to CPUTIME_IRQ when the adjusted preempt count has
 * HARDIRQ_MASK bits set, or to CPUTIME_SOFTIRQ when it has SOFTIRQ_OFFSET
 * set and @curr is not this CPU's ksoftirqd. In any other case nothing is
 * charged by the visible code.
 *
 * NOTE(review): this listing omits the declarations of pc, delta and cpu
 * and the statement guarded by the sched_clock_irqtime check (presumably
 * an early return) -- confirm against the full source.
 */
50 void irqtime_account_irq(struct task_struct
*curr
, unsigned int offset
)
52 struct irqtime
*irqtime
= this_cpu_ptr(&cpu_irqtime
);
57 if (!sched_clock_irqtime
)
60 cpu
= smp_processor_id();
61 delta
= sched_clock_cpu(cpu
) - irqtime
->irq_start_time
;
62 irqtime
->irq_start_time
+= delta
;
63 pc
= preempt_count() - offset
;
66 * We do not account for softirq time from ksoftirqd here.
67 * We want to continue accounting softirq time to ksoftirqd thread
68 * in that case, so as not to confuse scheduler with a special task
69 * that do not consume any time, but still wants to run.
71 if (pc
& HARDIRQ_MASK
)
72 irqtime_account_delta(irqtime
, delta
, CPUTIME_IRQ
);
73 else if ((pc
& SOFTIRQ_OFFSET
) && curr
!= this_cpu_ksoftirqd())
74 irqtime_account_delta(irqtime
, delta
, CPUTIME_SOFTIRQ
);
/*
 * Consume pending IRQ time from this CPU's irqtime->tick_delta.
 * @maxtime: upper bound on the amount to consume
 *
 * Computes delta = min(tick_delta, maxtime) and subtracts it from
 * tick_delta.
 *
 * NOTE(review): the declaration of delta and the return statement are not
 * visible in this listing; presumably delta is returned -- confirm.
 */
77 static u64
irqtime_tick_accounted(u64 maxtime
)
79 struct irqtime
*irqtime
= this_cpu_ptr(&cpu_irqtime
);
82 delta
= min(irqtime
->tick_delta
, maxtime
);
83 irqtime
->tick_delta
-= delta
;
88 #else /* CONFIG_IRQ_TIME_ACCOUNTING */
/*
 * !CONFIG_IRQ_TIME_ACCOUNTING variants: sched_clock_irqtime becomes the
 * constant 0 and irqtime_tick_accounted() takes an argument it names
 * "dummy".
 *
 * NOTE(review): the stub's body is not visible in this listing; presumably
 * it returns 0 -- confirm.
 */
90 #define sched_clock_irqtime (0)
92 static u64
irqtime_tick_accounted(u64 dummy
)
97 #endif /* !CONFIG_IRQ_TIME_ACCOUNTING */
/*
 * Add a cputime amount to a cpustat field for @p.
 * @p:     task the time belongs to
 * @index: cpustat slot to update
 *
 * The amount (tmp) is first added to this CPU's kernel_cpustat.cpustat[]
 * slot and then handed to cgroup_account_cputime_field() for @p.
 *
 * NOTE(review): the parameter declaring tmp is not visible in this listing.
 */
99 static inline void task_group_account_field(struct task_struct
*p
, int index
,
103 * Since all updates are sure to touch the root cgroup, we
104 * get ourselves ahead and touch it first. If the root cgroup
105 * is the only cgroup, then nothing else should be necessary.
108 __this_cpu_add(kernel_cpustat
.cpustat
[index
], tmp
);
110 cgroup_account_cputime_field(p
, index
, tmp
);
114 * Account user CPU time to a process.
115 * @p: the process that the CPU time gets accounted to
116 * @cputime: the CPU time spent in user space since the last update
/*
 * Visible steps: charge @cputime to the thread group through
 * account_group_user_time(), pick CPUTIME_NICE when task_nice(p) > 0 and
 * CPUTIME_USER otherwise, add the time to that cpustat field through
 * task_group_account_field(), then call acct_account_cputime(p).
 *
 * NOTE(review): the declaration of index and the statement following the
 * "Add user time to process" remark are not visible in this listing.
 */
118 void account_user_time(struct task_struct
*p
, u64 cputime
)
122 /* Add user time to process. */
124 account_group_user_time(p
, cputime
);
126 index
= (task_nice(p
) > 0) ? CPUTIME_NICE
: CPUTIME_USER
;
128 /* Add user time to cpustat. */
129 task_group_account_field(p
, index
, cputime
);
131 /* Account for user time used */
132 acct_account_cputime(p
);
136 * Account guest CPU time to a process.
137 * @p: the process that the CPU time gets accounted to
138 * @cputime: the CPU time spent in virtual machine since the last update
/*
 * Visible steps: charge @cputime to the thread group through
 * account_group_user_time(), then update this CPU's cpustat directly.
 * Under task_nice(p) > 0 the time is added to CPUTIME_NICE and
 * CPUTIME_GUEST_NICE; the CPUTIME_USER and CPUTIME_GUEST additions follow.
 *
 * NOTE(review): the separator between the two pairs of additions
 * (presumably an else branch) and the per-task updates after the
 * "Add guest time to process" remark are not visible in this listing.
 */
140 void account_guest_time(struct task_struct
*p
, u64 cputime
)
142 u64
*cpustat
= kcpustat_this_cpu
->cpustat
;
144 /* Add guest time to process. */
146 account_group_user_time(p
, cputime
);
149 /* Add guest time to cpustat. */
150 if (task_nice(p
) > 0) {
151 cpustat
[CPUTIME_NICE
] += cputime
;
152 cpustat
[CPUTIME_GUEST_NICE
] += cputime
;
154 cpustat
[CPUTIME_USER
] += cputime
;
155 cpustat
[CPUTIME_GUEST
] += cputime
;
160 * Account system CPU time to a process and desired cpustat field
161 * @p: the process that the CPU time gets accounted to
162 * @cputime: the CPU time spent in kernel space since the last update
163 * @index: index of the cpustat field that has to be updated
/*
 * Visible steps: charge @cputime to the thread group through
 * account_group_system_time(), add it to the cpustat field selected by
 * @index (an enum cpu_usage_stat value) through task_group_account_field(),
 * then call acct_account_cputime(p).
 *
 * NOTE(review): the statement following the "Add system time to process"
 * remark is not visible in this listing.
 */
165 void account_system_index_time(struct task_struct
*p
,
166 u64 cputime
, enum cpu_usage_stat index
)
168 /* Add system time to process. */
170 account_group_system_time(p
, cputime
);
172 /* Add system time to cpustat. */
173 task_group_account_field(p
, index
, cputime
);
175 /* Account for system time used */
176 acct_account_cputime(p
);
180 * Account system CPU time to a process.
181 * @p: the process that the CPU time gets accounted to
182 * @hardirq_offset: the offset to subtract from hardirq_count()
183 * @cputime: the CPU time spent in kernel space since the last update
/*
 * Classify @cputime of kernel time and account it.
 *
 * When @p has PF_VCPU set and irq_count() - @hardirq_offset is zero the
 * time goes to account_guest_time(). The remaining visible code selects a
 * cpustat index -- CPUTIME_SOFTIRQ when in_serving_softirq(),
 * CPUTIME_SYSTEM as the last assignment -- and passes it to
 * account_system_index_time().
 *
 * NOTE(review): this listing omits the declaration of index, whatever
 * follows the guest-time call (presumably a return), the assignment taken
 * when hardirq_count() - hardirq_offset is non-zero, and the final else.
 */
185 void account_system_time(struct task_struct
*p
, int hardirq_offset
, u64 cputime
)
189 if ((p
->flags
& PF_VCPU
) && (irq_count() - hardirq_offset
== 0)) {
190 account_guest_time(p
, cputime
);
194 if (hardirq_count() - hardirq_offset
)
196 else if (in_serving_softirq())
197 index
= CPUTIME_SOFTIRQ
;
199 index
= CPUTIME_SYSTEM
;
201 account_system_index_time(p
, cputime
, index
);
205 * Account for involuntary wait time.
206 * @cputime: the CPU time spent in involuntary wait
/* Add @cputime to the CPUTIME_STEAL field of the current CPU's cpustat. */
208 void account_steal_time(u64 cputime
)
210 u64
*cpustat
= kcpustat_this_cpu
->cpustat
;
212 cpustat
[CPUTIME_STEAL
] += cputime
;
216 * Account for idle time.
217 * @cputime: the CPU time spent in idle wait
/*
 * Add @cputime to the current CPU's cpustat: to CPUTIME_IOWAIT when this
 * runqueue's nr_iowait counter reads greater than zero, and to CPUTIME_IDLE
 * in the second visible addition.
 *
 * NOTE(review): the keyword separating the two additions (presumably
 * "else") is not visible in this listing -- confirm.
 */
219 void account_idle_time(u64 cputime
)
221 u64
*cpustat
= kcpustat_this_cpu
->cpustat
;
222 struct rq
*rq
= this_rq();
224 if (atomic_read(&rq
->nr_iowait
) > 0)
225 cpustat
[CPUTIME_IOWAIT
] += cputime
;
227 cpustat
[CPUTIME_IDLE
] += cputime
;
231 * When a guest is interrupted for a longer amount of time, missed clock
232 * ticks are not redelivered later. Due to that, this function may on
233 * occasion account more time than the calling functions think elapsed.
/*
 * Account paravirt steal time, bounded by @maxtime.
 * @maxtime: upper bound on the amount of steal time to account
 *
 * Under CONFIG_PARAVIRT, and only when the paravirt_steal_enabled static
 * key is on: take paravirt_steal_clock() for the current CPU, subtract the
 * runqueue's prev_steal_time, clamp the result to @maxtime, account it via
 * account_steal_time() and advance prev_steal_time by the accounted amount.
 *
 * The static-key argument had been corrupted by an HTML entity decode
 * ("&para" turned into a pilcrow sign); it is restored to
 * &paravirt_steal_enabled.
 *
 * NOTE(review): the declaration of steal and the return statements are not
 * visible in this listing.
 */
235 static __always_inline u64
steal_account_process_time(u64 maxtime
)
237 #ifdef CONFIG_PARAVIRT
238 if (static_key_false(&paravirt_steal_enabled
)) {
241 steal
= paravirt_steal_clock(smp_processor_id());
242 steal
-= this_rq()->prev_steal_time
;
243 steal
= min(steal
, maxtime
);
244 account_steal_time(steal
);
245 this_rq()->prev_steal_time
+= steal
;
254 * Account how much elapsed time was spent in steal, irq, or softirq time.
/*
 * Sum the steal and IRQ time to be accounted, bounded by @max.
 *
 * Asserts that interrupts are disabled, takes the steal time from
 * steal_account_process_time(max), then adds irqtime_tick_accounted() of
 * whatever remains of @max.
 *
 * NOTE(review): the declaration of accounted, any condition guarding the
 * second call and the return statement are not visible in this listing.
 */
256 static inline u64
account_other_time(u64 max
)
260 lockdep_assert_irqs_disabled();
262 accounted
= steal_account_process_time(max
);
265 accounted
+= irqtime_tick_accounted(max
- accounted
);
/*
 * Return @t's se.sum_exec_runtime with a plain read.
 *
 * NOTE(review): a second definition with the same name follows, so the two
 * are presumably selected by a preprocessor conditional that is not visible
 * in this listing.
 */
271 static inline u64
read_sum_exec_runtime(struct task_struct
*t
)
273 return t
->se
.sum_exec_runtime
;
/*
 * Read @t's se.sum_exec_runtime into ns between task_rq_lock() and
 * task_rq_unlock() on the task's runqueue.
 *
 * NOTE(review): the declarations of ns, rq and rf and the return statement
 * are not visible in this listing.
 */
276 static u64
read_sum_exec_runtime(struct task_struct
*t
)
282 rq
= task_rq_lock(t
, &rf
);
283 ns
= t
->se
.sum_exec_runtime
;
284 task_rq_unlock(rq
, t
, &rf
);
291 * Accumulate raw cputime values of dead tasks (sig->[us]time) and live
292 * tasks (sum on group iteration) belonging to @tsk's group.
/*
 * Fill @times with the cputime totals of @tsk's thread group.
 *
 * If current belongs to the same thread group, task_sched_runtime(current)
 * is called first and its result discarded. The totals are then seeded from
 * sig->utime, sig->stime and sig->sum_sched_runtime, and every thread
 * visited by for_each_thread() adds its task_cputime() values and its
 * read_sum_exec_runtime(). The read runs under
 * read_seqbegin_or_lock_irqsave() on sig->stats_lock, is repeated while
 * need_seqretry() reports a retry, and ends with done_seqretry_irqrestore().
 *
 * NOTE(review): the declarations of utime, stime and flags, the initial
 * value of seq/nextseq and the start of the retry loop are not visible in
 * this listing.
 */
294 void thread_group_cputime(struct task_struct
*tsk
, struct task_cputime
*times
)
296 struct signal_struct
*sig
= tsk
->signal
;
298 struct task_struct
*t
;
299 unsigned int seq
, nextseq
;
303 * Update current task runtime to account pending time since last
304 * scheduler action or thread_group_cputime() call. This thread group
305 * might have other running tasks on different CPUs, but updating
306 * their runtime can affect syscall performance, so we skip account
307 * those pending times and rely only on values updated on tick or
308 * other scheduler action.
310 if (same_thread_group(current
, tsk
))
311 (void) task_sched_runtime(current
);
314 /* Attempt a lockless read on the first round. */
318 flags
= read_seqbegin_or_lock_irqsave(&sig
->stats_lock
, &seq
);
319 times
->utime
= sig
->utime
;
320 times
->stime
= sig
->stime
;
321 times
->sum_exec_runtime
= sig
->sum_sched_runtime
;
323 for_each_thread(tsk
, t
) {
324 task_cputime(t
, &utime
, &stime
);
325 times
->utime
+= utime
;
326 times
->stime
+= stime
;
327 times
->sum_exec_runtime
+= read_sum_exec_runtime(t
);
329 /* If lockless access failed, take the lock. */
331 } while (need_seqretry(&sig
->stats_lock
, seq
));
332 done_seqretry_irqrestore(&sig
->stats_lock
, seq
, flags
);
336 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
338 * Account a tick to a process and cpustat
339 * @p: the process that the CPU time gets accounted to
340 * @user_tick: is the tick from userspace
341 * @rq: the pointer to rq
343 * Tick demultiplexing follows the order
344 * - pending hardirq update
345 * - pending softirq update
349 * - check for guest_time
350 * - else account as system_time
352 * Check for hardirq is done both for system and user time as there is
353 * no timer going off while we are on hardirq and hence we may never get an
354 * opportunity to update it solely in system time.
355 * p->stime and friends are only updated on system time and not on irq
356 * softirq as those do not count in task exec_runtime any more.
/*
 * Account ticks worth of time (cputime = TICK_NSEC * ticks) when IRQ time
 * accounting is in use.
 *
 * The steal/IRQ time returned by account_other_time(ULONG_MAX) is compared
 * against cputime first. The time is then routed by the visible chain:
 *   - @p is this CPU's ksoftirqd: account_system_index_time(CPUTIME_SOFTIRQ)
 *   - @user_tick is set:          account_user_time()
 *   - @p is this runqueue's idle: account_idle_time()
 *   - @p has PF_VCPU set:         account_guest_time()
 *   - last visible call:          account_system_index_time(CPUTIME_SYSTEM)
 *
 * NOTE(review): the end of the parameter list (ticks), the statements that
 * follow the "other >= cputime" test and the final else are not visible in
 * this listing.
 */
358 static void irqtime_account_process_tick(struct task_struct
*p
, int user_tick
,
361 u64 other
, cputime
= TICK_NSEC
* ticks
;
364 * When returning from idle, many ticks can get accounted at
365 * once, including some ticks of steal, irq, and softirq time.
366 * Subtract those ticks from the amount of time accounted to
367 * idle, or potentially user or system time. Due to rounding,
368 * other time can exceed ticks occasionally.
370 other
= account_other_time(ULONG_MAX
);
371 if (other
>= cputime
)
376 if (this_cpu_ksoftirqd() == p
) {
378 * ksoftirqd time do not get accounted in cpu_softirq_time.
379 * So, we have to handle it separately here.
380 * Also, p->stime needs to be updated for ksoftirqd.
382 account_system_index_time(p
, cputime
, CPUTIME_SOFTIRQ
);
383 } else if (user_tick
) {
384 account_user_time(p
, cputime
);
385 } else if (p
== this_rq()->idle
) {
386 account_idle_time(cputime
);
387 } else if (p
->flags
& PF_VCPU
) { /* System time or guest time */
388 account_guest_time(p
, cputime
);
390 account_system_index_time(p
, cputime
, CPUTIME_SYSTEM
);
/*
 * Account @ticks ticks by calling irqtime_account_process_tick() for
 * current with user_tick set to 0.
 */
394 static void irqtime_account_idle_ticks(int ticks
)
396 irqtime_account_process_tick(current
, 0, ticks
);
398 #else /* CONFIG_IRQ_TIME_ACCOUNTING */
/*
 * !CONFIG_IRQ_TIME_ACCOUNTING stubs: irqtime_account_idle_ticks() has an
 * empty body.
 *
 * NOTE(review): the rest of the irqtime_account_process_tick() stub (end of
 * its parameter list and its body) is not visible in this listing.
 */
399 static inline void irqtime_account_idle_ticks(int ticks
) { }
400 static inline void irqtime_account_process_tick(struct task_struct
*p
, int user_tick
,
402 #endif /* CONFIG_IRQ_TIME_ACCOUNTING */
405 * Use precise platform statistics if available:
407 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
409 # ifndef __ARCH_HAS_VTIME_TASK_SWITCH
/*
 * Default vtime_task_switch(), built when the architecture does not define
 * __ARCH_HAS_VTIME_TASK_SWITCH. Calls vtime_account_idle() when @prev is
 * the idle task; vtime_account_kernel() and arch_vtime_task_switch() are
 * the other visible calls.
 *
 * NOTE(review): the keyword between the idle and kernel calls (presumably
 * "else") and any statements around arch_vtime_task_switch() are not
 * visible in this listing.
 */
410 void vtime_task_switch(struct task_struct
*prev
)
412 if (is_idle_task(prev
))
413 vtime_account_idle(prev
);
415 vtime_account_kernel(prev
);
418 arch_vtime_task_switch(prev
);
/*
 * Dispatch vtime accounting for @tsk based on preempt_count() - @offset:
 * vtime_account_hardirq() when HARDIRQ_OFFSET is set,
 * vtime_account_softirq() when SOFTIRQ_OFFSET is set, vtime_account_idle()
 * in a branch that requires CONFIG_HAVE_VIRT_CPU_ACCOUNTING_IDLE to be
 * disabled, and vtime_account_kernel() as the last visible call.
 *
 * NOTE(review): the second operand of the idle condition and the final
 * else are not visible in this listing.
 */
422 void vtime_account_irq(struct task_struct
*tsk
, unsigned int offset
)
424 unsigned int pc
= preempt_count() - offset
;
426 if (pc
& HARDIRQ_OFFSET
) {
427 vtime_account_hardirq(tsk
);
428 } else if (pc
& SOFTIRQ_OFFSET
) {
429 vtime_account_softirq(tsk
);
430 } else if (!IS_ENABLED(CONFIG_HAVE_VIRT_CPU_ACCOUNTING_IDLE
) &&
432 vtime_account_idle(tsk
);
434 vtime_account_kernel(tsk
);
/*
 * cputime_adjust() as built under CONFIG_VIRT_CPU_ACCOUNTING_NATIVE.
 *
 * NOTE(review): only the first two parameters are visible in this listing;
 * the remaining parameters and the body are missing.
 */
438 void cputime_adjust(struct task_cputime
*curr
, struct prev_cputime
*prev
,
/*
 * task_cputime_adjusted() as built under CONFIG_VIRT_CPU_ACCOUNTING_NATIVE,
 * exported with EXPORT_SYMBOL_GPL.
 *
 * NOTE(review): the body is not visible in this listing.
 */
445 void task_cputime_adjusted(struct task_struct
*p
, u64
*ut
, u64
*st
)
450 EXPORT_SYMBOL_GPL(task_cputime_adjusted
);
/*
 * thread_group_cputime_adjusted() as built under
 * CONFIG_VIRT_CPU_ACCOUNTING_NATIVE: collects the thread group totals of @p
 * into a local struct task_cputime via thread_group_cputime().
 *
 * NOTE(review): the statements that store into @ut and @st are not visible
 * in this listing.
 */
452 void thread_group_cputime_adjusted(struct task_struct
*p
, u64
*ut
, u64
*st
)
454 struct task_cputime cputime
;
456 thread_group_cputime(p
, &cputime
);
462 #else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE: */
465 * Account a single tick of CPU time.
466 * @p: the process that the CPU time gets accounted to
467 * @user_tick: indicates if the tick is a user or a system tick
/*
 * Tick-based accounting entry point.
 *
 * Visible steps: test vtime_accounting_enabled_this_cpu(); when
 * sched_clock_irqtime is set hand a single tick to
 * irqtime_account_process_tick(); otherwise take the steal time from
 * steal_account_process_time(ULONG_MAX), compare it with cputime, and
 * account the time through account_user_time(), account_system_time()
 * (when @p is not this runqueue's idle task or irq_count() differs from
 * HARDIRQ_OFFSET) or account_idle_time().
 *
 * NOTE(review): the declarations of cputime and steal, the statements
 * guarded by the first test and by "steal >= cputime", the condition in
 * front of account_user_time() and the final else are not visible in this
 * listing.
 */
469 void account_process_tick(struct task_struct
*p
, int user_tick
)
473 if (vtime_accounting_enabled_this_cpu())
476 if (sched_clock_irqtime
) {
477 irqtime_account_process_tick(p
, user_tick
, 1);
482 steal
= steal_account_process_time(ULONG_MAX
);
484 if (steal
>= cputime
)
490 account_user_time(p
, cputime
);
491 else if ((p
!= this_rq()->idle
) || (irq_count() != HARDIRQ_OFFSET
))
492 account_system_time(p
, HARDIRQ_OFFSET
, cputime
);
494 account_idle_time(cputime
);
498 * Account multiple ticks of idle time.
499 * @ticks: number of idle ticks to account
/*
 * Account @ticks ticks as idle time.
 *
 * When sched_clock_irqtime is set the work is delegated to
 * irqtime_account_idle_ticks(). Otherwise cputime = ticks * TICK_NSEC, the
 * steal time from steal_account_process_time(ULONG_MAX) is compared with
 * it, and account_idle_time() receives cputime.
 *
 * NOTE(review): the declarations of cputime and steal, what follows the
 * delegation (presumably a return) and the statements around the
 * "steal >= cputime" test are not visible in this listing.
 */
501 void account_idle_ticks(unsigned long ticks
)
505 if (sched_clock_irqtime
) {
506 irqtime_account_idle_ticks(ticks
);
510 cputime
= ticks
* TICK_NSEC
;
511 steal
= steal_account_process_time(ULONG_MAX
);
513 if (steal
>= cputime
)
517 account_idle_time(cputime
);
521 * Adjust tick based cputime random precision against scheduler runtime
524 * Tick based cputime accounting depends on random scheduling timeslices of a
525 * task to be interrupted or not by the timer. Depending on these
526 * circumstances, the number of these interrupts may be over or
527 * under-optimistic, matching the real user and system cputime with a variable
530 * Fix this by scaling these tick based values against the total runtime
531 * accounted by the CFS scheduler.
533 * This code provides the following guarantees:
535 * stime + utime == rtime
536 * stime_i+1 >= stime_i, utime_i+1 >= utime_i
538 * Assuming that rtime_i+1 >= rtime_i.
/*
 * Scale tick-based stime/utime against the runtime in
 * @curr->sum_exec_runtime.
 *
 * All work happens under prev->lock (raw_spin_lock_irqsave). Visible steps:
 * read rtime, compare prev->stime + prev->utime with it, rescale stime as
 * stime * rtime / (stime + utime) with mul_u64_u64_div_u64(), then derive
 * utime = rtime - stime, with checks that compare stime against
 * prev->stime and utime against prev->utime.
 *
 * NOTE(review): the remaining parameters, the declaration of flags, the
 * loads of stime/utime, the statements guarded by the comparisons, any
 * labels, and the stores to prev and to the output pointers are not
 * visible in this listing.
 */
540 void cputime_adjust(struct task_cputime
*curr
, struct prev_cputime
*prev
,
543 u64 rtime
, stime
, utime
;
546 /* Serialize concurrent callers such that we can honour our guarantees */
547 raw_spin_lock_irqsave(&prev
->lock
, flags
);
548 rtime
= curr
->sum_exec_runtime
;
551 * This is possible under two circumstances:
552 * - rtime isn't monotonic after all (a bug);
553 * - we got reordered by the lock.
555 * In both cases this acts as a filter such that the rest of the code
556 * can assume it is monotonic regardless of anything else.
558 if (prev
->stime
+ prev
->utime
>= rtime
)
565 * If either stime or utime are 0, assume all runtime is userspace.
566 * Once a task gets some ticks, the monotonicy code at 'update:'
567 * will ensure things converge to the observed ratio.
579 stime
= mul_u64_u64_div_u64(stime
, rtime
, stime
+ utime
);
583 * Make sure stime doesn't go backwards; this preserves monotonicity
584 * for utime because rtime is monotonic.
586 * utime_i+1 = rtime_i+1 - stime_i
587 * = rtime_i+1 - (rtime_i - utime_i)
588 * = (rtime_i+1 - rtime_i) + utime_i
591 if (stime
< prev
->stime
)
593 utime
= rtime
- stime
;
596 * Make sure utime doesn't go backwards; this still preserves
597 * monotonicity for stime, analogous argument to above.
599 if (utime
< prev
->utime
) {
601 stime
= rtime
- utime
;
609 raw_spin_unlock_irqrestore(&prev
->lock
, flags
);
/*
 * Report adjusted user/system time of task @p in *@ut and *@st.
 *
 * Builds a struct task_cputime whose sum_exec_runtime is
 * p->se.sum_exec_runtime, fills its utime/stime with task_cputime(), and
 * passes it with p->prev_cputime to cputime_adjust(). Exported with
 * EXPORT_SYMBOL_GPL.
 */
612 void task_cputime_adjusted(struct task_struct
*p
, u64
*ut
, u64
*st
)
614 struct task_cputime cputime
= {
615 .sum_exec_runtime
= p
->se
.sum_exec_runtime
,
618 task_cputime(p
, &cputime
.utime
, &cputime
.stime
);
619 cputime_adjust(&cputime
, &p
->prev_cputime
, ut
, st
);
621 EXPORT_SYMBOL_GPL(task_cputime_adjusted
);
/*
 * Report adjusted user/system time of @p's thread group in *@ut and *@st:
 * thread_group_cputime() gathers the totals, and cputime_adjust() scales
 * them using p->signal->prev_cputime.
 */
623 void thread_group_cputime_adjusted(struct task_struct
*p
, u64
*ut
, u64
*st
)
625 struct task_cputime cputime
;
627 thread_group_cputime(p
, &cputime
);
628 cputime_adjust(&cputime
, &p
->signal
->prev_cputime
, ut
, st
);
630 #endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
632 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
/*
 * Return sched_clock() minus @vtime->starttime.
 *
 * NOTE(review): the statement taken when the clock reads below starttime is
 * not visible in this listing (presumably it returns 0) -- confirm.
 */
633 static u64
vtime_delta(struct vtime
*vtime
)
635 unsigned long long clock
;
637 clock
= sched_clock();
638 if (clock
< vtime
->starttime
)
641 return clock
- vtime
->starttime
;
/*
 * Take the pending vtime delta and advance @vtime->starttime past it.
 *
 * delta comes from vtime_delta(); account_other_time(delta) accounts the
 * steal/IRQ share of it; a one-time warning fires if the state is
 * VTIME_INACTIVE; starttime is advanced by delta; the return value is delta
 * minus the amount accounted by account_other_time().
 *
 * NOTE(review): the declaration of other is not visible in this listing.
 */
644 static u64
get_vtime_delta(struct vtime
*vtime
)
646 u64 delta
= vtime_delta(vtime
);
650 * Unlike tick based timing, vtime based timing never has lost
651 * ticks, and no need for steal time accounting to make up for
652 * lost ticks. Vtime accounts a rounded version of actual
653 * elapsed time. Limit account_other_time to prevent rounding
654 * errors from causing elapsed vtime to go negative.
656 other
= account_other_time(delta
);
657 WARN_ON_ONCE(vtime
->state
== VTIME_INACTIVE
);
658 vtime
->starttime
+= delta
;
660 return delta
- other
;
/*
 * Accumulate the pending delta into vtime->stime and, once it reaches
 * TICK_NSEC, pass the accumulated value to account_system_time() with
 * irq_count() as the hardirq offset.
 *
 * NOTE(review): the second parameter's declaration and whatever follows the
 * account_system_time() call are not visible in this listing.
 */
663 static void vtime_account_system(struct task_struct
*tsk
,
666 vtime
->stime
+= get_vtime_delta(vtime
);
667 if (vtime
->stime
>= TICK_NSEC
) {
668 account_system_time(tsk
, irq_count(), vtime
->stime
);
/*
 * Accumulate the pending delta into vtime->gtime and, once it reaches
 * TICK_NSEC, pass the accumulated value to account_guest_time().
 *
 * NOTE(review): the second parameter's declaration and whatever follows the
 * account_guest_time() call are not visible in this listing.
 */
673 static void vtime_account_guest(struct task_struct
*tsk
,
676 vtime
->gtime
+= get_vtime_delta(vtime
);
677 if (vtime
->gtime
>= TICK_NSEC
) {
678 account_guest_time(tsk
, vtime
->gtime
);
/*
 * Account pending kernel-side time: vtime_account_guest() when the state is
 * VTIME_GUEST, with vtime_account_system() as the other visible call.
 *
 * NOTE(review): the second parameter's declaration and the keyword between
 * the two calls (presumably "else") are not visible in this listing.
 */
683 static void __vtime_account_kernel(struct task_struct
*tsk
,
686 /* We might have scheduled out from guest path */
687 if (vtime
->state
== VTIME_GUEST
)
688 vtime_account_guest(tsk
, vtime
);
690 vtime_account_system(tsk
, vtime
);
/*
 * Account @tsk's pending kernel time: tests vtime_delta() first, then runs
 * __vtime_account_kernel() inside a write_seqcount_begin()/
 * write_seqcount_end() section on vtime->seqcount.
 *
 * NOTE(review): the statement guarded by the zero-delta test (presumably an
 * early return) is not visible in this listing.
 */
693 void vtime_account_kernel(struct task_struct
*tsk
)
695 struct vtime
*vtime
= &tsk
->vtime
;
697 if (!vtime_delta(vtime
))
700 write_seqcount_begin(&vtime
->seqcount
);
701 __vtime_account_kernel(tsk
, vtime
);
702 write_seqcount_end(&vtime
->seqcount
);
/*
 * Inside a seqcount write section on @tsk's vtime: flush pending system
 * time with vtime_account_system(), then set the state to VTIME_USER.
 */
705 void vtime_user_enter(struct task_struct
*tsk
)
707 struct vtime
*vtime
= &tsk
->vtime
;
709 write_seqcount_begin(&vtime
->seqcount
);
710 vtime_account_system(tsk
, vtime
);
711 vtime
->state
= VTIME_USER
;
712 write_seqcount_end(&vtime
->seqcount
);
/*
 * Inside a seqcount write section on @tsk's vtime: add the pending delta to
 * vtime->utime, pass the accumulated value to account_user_time() once it
 * reaches TICK_NSEC, then set the state to VTIME_SYS.
 *
 * NOTE(review): whatever follows the account_user_time() call inside the
 * if block is not visible in this listing.
 */
715 void vtime_user_exit(struct task_struct
*tsk
)
717 struct vtime
*vtime
= &tsk
->vtime
;
719 write_seqcount_begin(&vtime
->seqcount
);
720 vtime
->utime
+= get_vtime_delta(vtime
);
721 if (vtime
->utime
>= TICK_NSEC
) {
722 account_user_time(tsk
, vtime
->utime
);
725 vtime
->state
= VTIME_SYS
;
726 write_seqcount_end(&vtime
->seqcount
);
/*
 * Inside a seqcount write section on @tsk's vtime: flush pending system
 * time with vtime_account_system(), set PF_VCPU in tsk->flags and set the
 * state to VTIME_GUEST. Exported with EXPORT_SYMBOL_GPL.
 */
729 void vtime_guest_enter(struct task_struct
*tsk
)
731 struct vtime
*vtime
= &tsk
->vtime
;
733 * The flags must be updated under the lock with
734 * the vtime_starttime flush and update.
735 * That enforces a right ordering and update sequence
736 * synchronization against the reader (task_gtime())
737 * that can thus safely catch up with a tickless delta.
739 write_seqcount_begin(&vtime
->seqcount
);
740 vtime_account_system(tsk
, vtime
);
741 tsk
->flags
|= PF_VCPU
;
742 vtime
->state
= VTIME_GUEST
;
743 write_seqcount_end(&vtime
->seqcount
);
745 EXPORT_SYMBOL_GPL(vtime_guest_enter
);
/*
 * Inside a seqcount write section on @tsk's vtime: flush pending guest time
 * with vtime_account_guest(), clear PF_VCPU in tsk->flags and set the state
 * to VTIME_SYS. Exported with EXPORT_SYMBOL_GPL.
 */
747 void vtime_guest_exit(struct task_struct
*tsk
)
749 struct vtime
*vtime
= &tsk
->vtime
;
751 write_seqcount_begin(&vtime
->seqcount
);
752 vtime_account_guest(tsk
, vtime
);
753 tsk
->flags
&= ~PF_VCPU
;
754 vtime
->state
= VTIME_SYS
;
755 write_seqcount_end(&vtime
->seqcount
);
757 EXPORT_SYMBOL_GPL(vtime_guest_exit
);
/* Account @tsk's pending vtime delta as idle time via account_idle_time(). */
759 void vtime_account_idle(struct task_struct
*tsk
)
761 account_idle_time(get_vtime_delta(&tsk
->vtime
));
/*
 * Hand vtime accounting over from @prev to current.
 *
 * First seqcount write section, on @prev's vtime: account the pending time
 * (vtime_account_idle() when the state is VTIME_IDLE, with
 * __vtime_account_kernel() as the other visible call) and mark the state
 * VTIME_INACTIVE.
 *
 * Second seqcount write section, on current's vtime: choose the new state
 * (VTIME_IDLE for the idle task, VTIME_GUEST when PF_VCPU is set,
 * VTIME_SYS as the last assignment), restart starttime from sched_clock()
 * and record the current CPU in vtime->cpu.
 *
 * The pointer re-assignment had been corrupted by an HTML entity decode
 * ("&curren" turned into a currency sign); it is restored to
 * &current->vtime.
 *
 * NOTE(review): the "else" keywords between the alternative calls and
 * assignments are not visible in this listing.
 */
764 void vtime_task_switch_generic(struct task_struct
*prev
)
766 struct vtime
*vtime
= &prev
->vtime
;
768 write_seqcount_begin(&vtime
->seqcount
);
769 if (vtime
->state
== VTIME_IDLE
)
770 vtime_account_idle(prev
);
772 __vtime_account_kernel(prev
, vtime
);
773 vtime
->state
= VTIME_INACTIVE
;
775 write_seqcount_end(&vtime
->seqcount
);
777 vtime
= &current
->vtime
;
779 write_seqcount_begin(&vtime
->seqcount
);
780 if (is_idle_task(current
))
781 vtime
->state
= VTIME_IDLE
;
782 else if (current
->flags
& PF_VCPU
)
783 vtime
->state
= VTIME_GUEST
;
785 vtime
->state
= VTIME_SYS
;
786 vtime
->starttime
= sched_clock();
787 vtime
->cpu
= smp_processor_id();
788 write_seqcount_end(&vtime
->seqcount
);
/*
 * Initialise the vtime of task @t: with local interrupts saved/disabled and
 * inside a seqcount write section, set the state to VTIME_IDLE and restart
 * starttime from sched_clock().
 *
 * NOTE(review): the declaration of flags and any use of @cpu are not
 * visible in this listing.
 */
791 void vtime_init_idle(struct task_struct
*t
, int cpu
)
793 struct vtime
*vtime
= &t
->vtime
;
796 local_irq_save(flags
);
797 write_seqcount_begin(&vtime
->seqcount
);
798 vtime
->state
= VTIME_IDLE
;
799 vtime
->starttime
= sched_clock();
801 write_seqcount_end(&vtime
->seqcount
);
802 local_irq_restore(flags
);
/*
 * Compute guest time for task @t under a seqcount read/retry loop on its
 * vtime. When the state is VTIME_GUEST, vtime->gtime plus the pending
 * vtime_delta() is added to gtime.
 *
 * NOTE(review): the declarations of seq and gtime, the value gtime starts
 * from, the statement guarded by the vtime_accounting_enabled() test, the
 * start of the loop and the return statement are not visible in this
 * listing.
 */
805 u64
task_gtime(struct task_struct
*t
)
807 struct vtime
*vtime
= &t
->vtime
;
811 if (!vtime_accounting_enabled())
815 seq
= read_seqcount_begin(&vtime
->seqcount
);
818 if (vtime
->state
== VTIME_GUEST
)
819 gtime
+= vtime
->gtime
+ vtime_delta(vtime
);
821 } while (read_seqcount_retry(&vtime
->seqcount
, seq
));
827 * Fetch cputime raw values from fields of task_struct and
828 * add up the pending nohz execution time since the last
/*
 * Report user/system time of task @t in *@utime and *@stime, including
 * pending vtime, under a seqcount read/retry loop on its vtime.
 *
 * Visible steps: states below VTIME_SYS are tested first; then the pending
 * vtime_delta() is computed and vtime->stime + delta is added to *@stime
 * when the state is VTIME_SYS, with vtime->utime + delta added to *@utime
 * in the other visible addition.
 *
 * NOTE(review): the declarations of seq and delta, the branch taken when
 * vtime accounting is disabled, the initial loads of *@utime/*@stime, the
 * statement guarded by the "state < VTIME_SYS" test and the "else" between
 * the two additions are not visible in this listing.
 */
831 void task_cputime(struct task_struct
*t
, u64
*utime
, u64
*stime
)
833 struct vtime
*vtime
= &t
->vtime
;
837 if (!vtime_accounting_enabled()) {
844 seq
= read_seqcount_begin(&vtime
->seqcount
);
849 /* Task is sleeping or idle, nothing to add */
850 if (vtime
->state
< VTIME_SYS
)
853 delta
= vtime_delta(vtime
);
856 * Task runs either in user (including guest) or kernel space,
857 * add pending nohz time to the right place.
859 if (vtime
->state
== VTIME_SYS
)
860 *stime
+= vtime
->stime
+ delta
;
862 *utime
+= vtime
->utime
+ delta
;
863 } while (read_seqcount_retry(&vtime
->seqcount
, seq
));
/*
 * Read @vtime->state once (READ_ONCE) and check it for use by a remote
 * reader on behalf of @cpu. Two conditions are tested: vtime->cpu differs
 * from both @cpu and -1, and the state is VTIME_INACTIVE.
 *
 * NOTE(review): the statements guarded by both tests and the final return
 * are not visible in this listing. Callers in this file store the result in
 * "state", so presumably the state is returned on success -- confirm.
 */
866 static int vtime_state_fetch(struct vtime
*vtime
, int cpu
)
868 int state
= READ_ONCE(vtime
->state
);
871 * We raced against a context switch, fetch the
872 * kcpustat task again.
874 if (vtime
->cpu
!= cpu
&& vtime
->cpu
!= -1)
878 * Two possible things here:
879 * 1) We are seeing the scheduling out task (prev) or any past one.
880 * 2) We are seeing the scheduling in task (next) but it hasn't
881 * passed though vtime_task_switch() yet so the pending
882 * cputime of the prev task may not be flushed yet.
884 * Case 1) is ok but 2) is not. So wait for a safe VTIME state.
886 if (state
== VTIME_INACTIVE
)
/*
 * Pending user-side time of @vtime: utime plus the pending delta when the
 * state is VTIME_USER, gtime plus the pending delta when it is VTIME_GUEST.
 *
 * NOTE(review): the return for any other state is not visible in this
 * listing (presumably 0) -- confirm.
 */
892 static u64
kcpustat_user_vtime(struct vtime
*vtime
)
894 if (vtime
->state
== VTIME_USER
)
895 return vtime
->utime
+ vtime_delta(vtime
);
896 else if (vtime
->state
== VTIME_GUEST
)
897 return vtime
->gtime
+ vtime_delta(vtime
);
/*
 * Compute one cpustat field including the pending vtime of @tsk, under a
 * seqcount read/retry loop on tsk->vtime.
 *
 * Each pass fetches the state with vtime_state_fetch(), seeds *val from
 * cpustat[@usage], and then adds pending time depending on the field:
 *   - state VTIME_SYS:                       vtime->stime + delta
 *   - task_nice() <= 0 / > 0:                kcpustat_user_vtime()
 *   - state VTIME_GUEST and task_nice() <= 0: vtime->gtime + delta
 *   - CPUTIME_GUEST_NICE, state VTIME_GUEST and task_nice() > 0:
 *                                            vtime->gtime + delta
 *
 * NOTE(review): the remaining parameters (cpu, val), the declarations of
 * seq and state, the error handling after vtime_state_fetch(), the switch
 * statement with most of its case labels, and the return value are not
 * visible in this listing.
 */
901 static int kcpustat_field_vtime(u64
*cpustat
,
902 struct task_struct
*tsk
,
903 enum cpu_usage_stat usage
,
906 struct vtime
*vtime
= &tsk
->vtime
;
912 seq
= read_seqcount_begin(&vtime
->seqcount
);
914 state
= vtime_state_fetch(vtime
, cpu
);
918 *val
= cpustat
[usage
];
921 * Nice VS unnice cputime accounting may be inaccurate if
922 * the nice value has changed since the last vtime update.
923 * But proper fix would involve interrupting target on nice
924 * updates which is a no go on nohz_full (although the scheduler
925 * may still interrupt the target if rescheduling is needed...)
929 if (state
== VTIME_SYS
)
930 *val
+= vtime
->stime
+ vtime_delta(vtime
);
933 if (task_nice(tsk
) <= 0)
934 *val
+= kcpustat_user_vtime(vtime
);
937 if (task_nice(tsk
) > 0)
938 *val
+= kcpustat_user_vtime(vtime
);
941 if (state
== VTIME_GUEST
&& task_nice(tsk
) <= 0)
942 *val
+= vtime
->gtime
+ vtime_delta(vtime
);
944 case CPUTIME_GUEST_NICE
:
945 if (state
== VTIME_GUEST
&& task_nice(tsk
) > 0)
946 *val
+= vtime
->gtime
+ vtime_delta(vtime
);
951 } while (read_seqcount_retry(&vtime
->seqcount
, seq
));
/*
 * Return the @usage field of @kcpustat for @cpu.
 *
 * val starts as cpustat[@usage]. After the vtime_accounting_enabled_cpu()
 * test, the task read from rq->curr with rcu_dereference() is passed to
 * kcpustat_field_vtime(), which may update val. A NULL curr triggers a
 * one-time warning and returns the raw cpustat[@usage]. Exported with
 * EXPORT_SYMBOL_GPL.
 *
 * NOTE(review): the declarations of rq and err, the statement guarded by
 * the enabled test, the loop and RCU read-side markers around the lookup,
 * the handling of err and the final return are not visible in this listing.
 */
956 u64
kcpustat_field(struct kernel_cpustat
*kcpustat
,
957 enum cpu_usage_stat usage
, int cpu
)
959 u64
*cpustat
= kcpustat
->cpustat
;
960 u64 val
= cpustat
[usage
];
964 if (!vtime_accounting_enabled_cpu(cpu
))
970 struct task_struct
*curr
;
973 curr
= rcu_dereference(rq
->curr
);
974 if (WARN_ON_ONCE(!curr
)) {
976 return cpustat
[usage
];
979 err
= kcpustat_field_vtime(cpustat
, curr
, usage
, cpu
, &val
);
988 EXPORT_SYMBOL_GPL(kcpustat_field
);
/*
 * Add the pending vtime of @tsk to the cpustat array of @dst, under a
 * seqcount read/retry loop on tsk->vtime.
 *
 * Each pass fetches the state with vtime_state_fetch(), tests for states
 * below VTIME_SYS, then adds the pending time (accumulated value plus
 * vtime_delta()) to:
 *   - CPUTIME_SYSTEM for VTIME_SYS (vtime->stime)
 *   - CPUTIME_NICE when task_nice() > 0, CPUTIME_USER in the other visible
 *     addition, for VTIME_USER (vtime->utime)
 *   - otherwise, after warning once if the state is not VTIME_GUEST:
 *     CPUTIME_GUEST_NICE and CPUTIME_NICE when task_nice() > 0, with
 *     CPUTIME_GUEST and CPUTIME_USER as the other visible pair (vtime->gtime)
 *
 * NOTE(review): the declarations of cpustat, seq, state and delta, any use
 * of @src (presumably copied into @dst), the error handling after
 * vtime_state_fetch(), the statement guarded by "state < VTIME_SYS", the
 * "else" keywords and the return value are not visible in this listing.
 */
990 static int kcpustat_cpu_fetch_vtime(struct kernel_cpustat
*dst
,
991 const struct kernel_cpustat
*src
,
992 struct task_struct
*tsk
, int cpu
)
994 struct vtime
*vtime
= &tsk
->vtime
;
1002 seq
= read_seqcount_begin(&vtime
->seqcount
);
1004 state
= vtime_state_fetch(vtime
, cpu
);
1009 cpustat
= dst
->cpustat
;
1011 /* Task is sleeping, dead or idle, nothing to add */
1012 if (state
< VTIME_SYS
)
1015 delta
= vtime_delta(vtime
);
1018 * Task runs either in user (including guest) or kernel space,
1019 * add pending nohz time to the right place.
1021 if (state
== VTIME_SYS
) {
1022 cpustat
[CPUTIME_SYSTEM
] += vtime
->stime
+ delta
;
1023 } else if (state
== VTIME_USER
) {
1024 if (task_nice(tsk
) > 0)
1025 cpustat
[CPUTIME_NICE
] += vtime
->utime
+ delta
;
1027 cpustat
[CPUTIME_USER
] += vtime
->utime
+ delta
;
1029 WARN_ON_ONCE(state
!= VTIME_GUEST
);
1030 if (task_nice(tsk
) > 0) {
1031 cpustat
[CPUTIME_GUEST_NICE
] += vtime
->gtime
+ delta
;
1032 cpustat
[CPUTIME_NICE
] += vtime
->gtime
+ delta
;
1034 cpustat
[CPUTIME_GUEST
] += vtime
->gtime
+ delta
;
1035 cpustat
[CPUTIME_USER
] += vtime
->gtime
+ delta
;
1038 } while (read_seqcount_retry(&vtime
->seqcount
, seq
));
/*
 * Fill @dst with the cpustat of @cpu.
 *
 * src points at kcpustat_cpu(@cpu). After the
 * vtime_accounting_enabled_cpu() test, the task read from rq->curr with
 * rcu_dereference() is passed to kcpustat_cpu_fetch_vtime() together with
 * @dst and src; a NULL curr triggers a one-time warning. Exported with
 * EXPORT_SYMBOL_GPL.
 *
 * NOTE(review): the declarations of rq and err, the body of the
 * accounting-disabled branch, the loop and RCU read-side markers around the
 * lookup, the body of the NULL-curr branch and the handling of err are not
 * visible in this listing.
 */
1043 void kcpustat_cpu_fetch(struct kernel_cpustat
*dst
, int cpu
)
1045 const struct kernel_cpustat
*src
= &kcpustat_cpu(cpu
);
1049 if (!vtime_accounting_enabled_cpu(cpu
)) {
1057 struct task_struct
*curr
;
1060 curr
= rcu_dereference(rq
->curr
);
1061 if (WARN_ON_ONCE(!curr
)) {
1067 err
= kcpustat_cpu_fetch_vtime(dst
, src
, curr
, cpu
);
1076 EXPORT_SYMBOL_GPL(kcpustat_cpu_fetch
);
1078 #endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */