kernel/sched/cputime.c

   1 // SPDX-License-Identifier: GPL-2.0-only
   2 /*
   3  * Simple CPU accounting cgroup controller
   4  */
   5 #include "sched.h"
   6
   7 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
   8
/*
 * There are no locks covering percpu hardirq/softirq time.
 * They are only modified in irqtime_account_irq(), on the corresponding CPU
 * with interrupts disabled. So, writes are safe.
 * They are read and saved off onto struct rq in update_rq_clock().
 * This may result in another CPU reading this CPU's irq time and can
 * race with irqtime_account_irq() on this CPU. We would either get the old
 * or the new value, with the side effect of accounting a slice of irq time
 * to the wrong task when an irq is in progress while we read rq->clock.
 * That is a worthy compromise in place of having locks on each irq in
 * account_system_time.
 */
  20 DEFINE_PER_CPU(struct irqtime, cpu_irqtime);
  21
  22 static int sched_clock_irqtime;
  23
  24 void enable_sched_clock_irqtime(void)
  25 {
  26         sched_clock_irqtime = 1;
  27 }
  28
  29 void disable_sched_clock_irqtime(void)
  30 {
  31         sched_clock_irqtime = 0;
  32 }
  33
  34 static void irqtime_account_delta(struct irqtime *irqtime, u64 delta,
  35                                   enum cpu_usage_stat idx)
  36 {
  37         u64 *cpustat = kcpustat_this_cpu->cpustat;
  38
  39         u64_stats_update_begin(&irqtime->sync);
  40         cpustat[idx] += delta;
  41         irqtime->total += delta;
  42         irqtime->tick_delta += delta;
  43         u64_stats_update_end(&irqtime->sync);
  44 }
  45
  46 /*
  47  * Called after incrementing preempt_count on {soft,}irq_enter
  48  * and before decrementing preempt_count on {soft,}irq_exit.
  49  */
  50 void irqtime_account_irq(struct task_struct *curr, unsigned int offset)
  51 {
  52         struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime);
  53         unsigned int pc;
  54         s64 delta;
  55         int cpu;
  56
  57         if (!sched_clock_irqtime)
  58                 return;
  59
  60         cpu = smp_processor_id();
  61         delta = sched_clock_cpu(cpu) - irqtime->irq_start_time;
  62         irqtime->irq_start_time += delta;
  63         pc = preempt_count() - offset;
  64
  65         /*
  66          * We do not account for softirq time from ksoftirqd here.
  67          * We want to continue accounting softirq time to ksoftirqd thread
  68          * in that case, so as not to confuse scheduler with a special task
  69          * that do not consume any time, but still wants to run.
  70          */
  71         if (pc & HARDIRQ_MASK)
  72                 irqtime_account_delta(irqtime, delta, CPUTIME_IRQ);
  73         else if ((pc & SOFTIRQ_OFFSET) && curr != this_cpu_ksoftirqd())
  74                 irqtime_account_delta(irqtime, delta, CPUTIME_SOFTIRQ);
  75 }
  76
  77 static u64 irqtime_tick_accounted(u64 maxtime)
  78 {
  79         struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime);
  80         u64 delta;
  81
  82         delta = min(irqtime->tick_delta, maxtime);
  83         irqtime->tick_delta -= delta;
  84
  85         return delta;
  86 }
  87
  88 #else /* CONFIG_IRQ_TIME_ACCOUNTING */
  89
  90 #define sched_clock_irqtime     (0)
  91
  92 static u64 irqtime_tick_accounted(u64 dummy)
  93 {
  94         return 0;
  95 }
  96
  97 #endif /* !CONFIG_IRQ_TIME_ACCOUNTING */
  98
  99 static inline void task_group_account_field(struct task_struct *p, int index,
 100                                             u64 tmp)
 101 {
 102         /*
 103          * Since all updates are sure to touch the root cgroup, we
 104          * get ourselves ahead and touch it first. If the root cgroup
 105          * is the only cgroup, then nothing else should be necessary.
 106          *
 107          */
 108         __this_cpu_add(kernel_cpustat.cpustat[index], tmp);
 109
 110         cgroup_account_cputime_field(p, index, tmp);
 111 }
 112
 113 /*
 114  * Account user CPU time to a process.
 115  * @p: the process that the CPU time gets accounted to
 116  * @cputime: the CPU time spent in user space since the last update
 117  */
 118 void account_user_time(struct task_struct *p, u64 cputime)
 119 {
 120         int index;
 121
 122         /* Add user time to process. */
 123         p->utime += cputime;
 124         account_group_user_time(p, cputime);
 125
 126         index = (task_nice(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;
 127
 128         /* Add user time to cpustat. */
 129         task_group_account_field(p, index, cputime);
 130
 131         /* Account for user time used */
 132         acct_account_cputime(p);
 133 }
 134
 135 /*
 136  * Account guest CPU time to a process.
 137  * @p: the process that the CPU time gets accounted to
 138  * @cputime: the CPU time spent in virtual machine since the last update
 139  */
 140 void account_guest_time(struct task_struct *p, u64 cputime)
 141 {
 142         u64 *cpustat = kcpustat_this_cpu->cpustat;
 143
 144         /* Add guest time to process. */
 145         p->utime += cputime;
 146         account_group_user_time(p, cputime);
 147         p->gtime += cputime;
 148
 149         /* Add guest time to cpustat. */
 150         if (task_nice(p) > 0) {
 151                 cpustat[CPUTIME_NICE] += cputime;
 152                 cpustat[CPUTIME_GUEST_NICE] += cputime;
 153         } else {
 154                 cpustat[CPUTIME_USER] += cputime;
 155                 cpustat[CPUTIME_GUEST] += cputime;
 156         }
 157 }
 158
 159 /*
 160  * Account system CPU time to a process and desired cpustat field
 161  * @p: the process that the CPU time gets accounted to
 162  * @cputime: the CPU time spent in kernel space since the last update
 163  * @index: pointer to cpustat field that has to be updated
 164  */
 165 void account_system_index_time(struct task_struct *p,
 166                                u64 cputime, enum cpu_usage_stat index)
 167 {
 168         /* Add system time to process. */
 169         p->stime += cputime;
 170         account_group_system_time(p, cputime);
 171
 172         /* Add system time to cpustat. */
 173         task_group_account_field(p, index, cputime);
 174
 175         /* Account for system time used */
 176         acct_account_cputime(p);
 177 }
 178
 179 /*
 180  * Account system CPU time to a process.
 181  * @p: the process that the CPU time gets accounted to
 182  * @hardirq_offset: the offset to subtract from hardirq_count()
 183  * @cputime: the CPU time spent in kernel space since the last update
 184  */
 185 void account_system_time(struct task_struct *p, int hardirq_offset, u64 cputime)
 186 {
 187         int index;
 188
 189         if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
 190                 account_guest_time(p, cputime);
 191                 return;
 192         }
 193
 194         if (hardirq_count() - hardirq_offset)
 195                 index = CPUTIME_IRQ;
 196         else if (in_serving_softirq())
 197                 index = CPUTIME_SOFTIRQ;
 198         else
 199                 index = CPUTIME_SYSTEM;
 200
 201         account_system_index_time(p, cputime, index);
 202 }
 203
 204 /*
 205  * Account for involuntary wait time.
 206  * @cputime: the CPU time spent in involuntary wait
 207  */
 208 void account_steal_time(u64 cputime)
 209 {
 210         u64 *cpustat = kcpustat_this_cpu->cpustat;
 211
 212         cpustat[CPUTIME_STEAL] += cputime;
 213 }
 214
 215 /*
 216  * Account for idle time.
 217  * @cputime: the CPU time spent in idle wait
 218  */
 219 void account_idle_time(u64 cputime)
 220 {
 221         u64 *cpustat = kcpustat_this_cpu->cpustat;
 222         struct rq *rq = this_rq();
 223
 224         if (atomic_read(&rq->nr_iowait) > 0)
 225                 cpustat[CPUTIME_IOWAIT] += cputime;
 226         else
 227                 cpustat[CPUTIME_IDLE] += cputime;
 228 }
 229
 230 /*
 231  * When a guest is interrupted for a longer amount of time, missed clock
 232  * ticks are not redelivered later. Due to that, this function may on
 233  * occasion account more time than the calling functions think elapsed.
 234  */
 235 static __always_inline u64 steal_account_process_time(u64 maxtime)
 236 {
 237 #ifdef CONFIG_PARAVIRT
 238         if (static_key_false(&paravirt_steal_enabled)) {
 239                 u64 steal;
 240
 241                 steal = paravirt_steal_clock(smp_processor_id());
 242                 steal -= this_rq()->prev_steal_time;
 243                 steal = min(steal, maxtime);
 244                 account_steal_time(steal);
 245                 this_rq()->prev_steal_time += steal;
 246
 247                 return steal;
 248         }
 249 #endif
 250         return 0;
 251 }
 252
 253 /*
 254  * Account how much elapsed time was spent in steal, irq, or softirq time.
 255  */
 256 static inline u64 account_other_time(u64 max)
 257 {
 258         u64 accounted;
 259
 260         lockdep_assert_irqs_disabled();
 261
 262         accounted = steal_account_process_time(max);
 263
 264         if (accounted < max)
 265                 accounted += irqtime_tick_accounted(max - accounted);
 266
 267         return accounted;
 268 }
 269
 270 #ifdef CONFIG_64BIT
 271 static inline u64 read_sum_exec_runtime(struct task_struct *t)
 272 {
 273         return t->se.sum_exec_runtime;
 274 }
 275 #else
 276 static u64 read_sum_exec_runtime(struct task_struct *t)
 277 {
 278         u64 ns;
 279         struct rq_flags rf;
 280         struct rq *rq;
 281
 282         rq = task_rq_lock(t, &rf);
 283         ns = t->se.sum_exec_runtime;
 284         task_rq_unlock(rq, t, &rf);
 285
 286         return ns;
 287 }
 288 #endif
 289
 290 /*
 291  * Accumulate raw cputime values of dead tasks (sig->[us]time) and live
 292  * tasks (sum on group iteration) belonging to @tsk's group.
 293  */
 294 void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
 295 {
 296         struct signal_struct *sig = tsk->signal;
 297         u64 utime, stime;
 298         struct task_struct *t;
 299         unsigned int seq, nextseq;
 300         unsigned long flags;
 301
 302         /*
 303          * Update current task runtime to account pending time since last
 304          * scheduler action or thread_group_cputime() call. This thread group
 305          * might have other running tasks on different CPUs, but updating
 306          * their runtime can affect syscall performance, so we skip account
 307          * those pending times and rely only on values updated on tick or
 308          * other scheduler action.
 309          */
 310         if (same_thread_group(current, tsk))
 311                 (void) task_sched_runtime(current);
 312
 313         rcu_read_lock();
 314         /* Attempt a lockless read on the first round. */
 315         nextseq = 0;
 316         do {
 317                 seq = nextseq;
 318                 flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq);
 319                 times->utime = sig->utime;
 320                 times->stime = sig->stime;
 321                 times->sum_exec_runtime = sig->sum_sched_runtime;
 322
 323                 for_each_thread(tsk, t) {
 324                         task_cputime(t, &utime, &stime);
 325                         times->utime += utime;
 326                         times->stime += stime;
 327                         times->sum_exec_runtime += read_sum_exec_runtime(t);
 328                 }
 329                 /* If lockless access failed, take the lock. */
 330                 nextseq = 1;
 331         } while (need_seqretry(&sig->stats_lock, seq));
 332         done_seqretry_irqrestore(&sig->stats_lock, seq, flags);
 333         rcu_read_unlock();
 334 }
 335
 336 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
 337 /*
 338  * Account a tick to a process and cpustat
 339  * @p: the process that the CPU time gets accounted to
 340  * @user_tick: is the tick from userspace
 341  * @rq: the pointer to rq
 342  *
 343  * Tick demultiplexing follows the order
 344  * - pending hardirq update
 345  * - pending softirq update
 346  * - user_time
 347  * - idle_time
 348  * - system time
 349  *   - check for guest_time
 350  *   - else account as system_time
 351  *
 352  * Check for hardirq is done both for system and user time as there is
 353  * no timer going off while we are on hardirq and hence we may never get an
 354  * opportunity to update it solely in system time.
 355  * p->stime and friends are only updated on system time and not on irq
 356  * softirq as those do not count in task exec_runtime any more.
 357  */
 358 static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
 359                                          int ticks)
 360 {
 361         u64 other, cputime = TICK_NSEC * ticks;
 362
 363         /*
 364          * When returning from idle, many ticks can get accounted at
 365          * once, including some ticks of steal, irq, and softirq time.
 366          * Subtract those ticks from the amount of time accounted to
 367          * idle, or potentially user or system time. Due to rounding,
 368          * other time can exceed ticks occasionally.
 369          */
 370         other = account_other_time(ULONG_MAX);
 371         if (other >= cputime)
 372                 return;
 373
 374         cputime -= other;
 375
 376         if (this_cpu_ksoftirqd() == p) {
 377                 /*
 378                  * ksoftirqd time do not get accounted in cpu_softirq_time.
 379                  * So, we have to handle it separately here.
 380                  * Also, p->stime needs to be updated for ksoftirqd.
 381                  */
 382                 account_system_index_time(p, cputime, CPUTIME_SOFTIRQ);
 383         } else if (user_tick) {
 384                 account_user_time(p, cputime);
 385         } else if (p == this_rq()->idle) {
 386                 account_idle_time(cputime);
 387         } else if (p->flags & PF_VCPU) { /* System time or guest time */
 388                 account_guest_time(p, cputime);
 389         } else {
 390                 account_system_index_time(p, cputime, CPUTIME_SYSTEM);
 391         }
 392 }
 393
 394 static void irqtime_account_idle_ticks(int ticks)
 395 {
 396         irqtime_account_process_tick(current, 0, ticks);
 397 }
 398 #else /* CONFIG_IRQ_TIME_ACCOUNTING */
 399 static inline void irqtime_account_idle_ticks(int ticks) { }
 400 static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick,
 401                                                 int nr_ticks) { }
 402 #endif /* CONFIG_IRQ_TIME_ACCOUNTING */
 403
 404 /*
 405  * Use precise platform statistics if available:
 406  */
 407 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
 408
 409 # ifndef __ARCH_HAS_VTIME_TASK_SWITCH
 410 void vtime_task_switch(struct task_struct *prev)
 411 {
 412         if (is_idle_task(prev))
 413                 vtime_account_idle(prev);
 414         else
 415                 vtime_account_kernel(prev);
 416
 417         vtime_flush(prev);
 418         arch_vtime_task_switch(prev);
 419 }
 420 # endif
 421
 422 void vtime_account_irq(struct task_struct *tsk, unsigned int offset)
 423 {
 424         unsigned int pc = preempt_count() - offset;
 425
 426         if (pc & HARDIRQ_OFFSET) {
 427                 vtime_account_hardirq(tsk);
 428         } else if (pc & SOFTIRQ_OFFSET) {
 429                 vtime_account_softirq(tsk);
 430         } else if (!IS_ENABLED(CONFIG_HAVE_VIRT_CPU_ACCOUNTING_IDLE) &&
 431                    is_idle_task(tsk)) {
 432                 vtime_account_idle(tsk);
 433         } else {
 434                 vtime_account_kernel(tsk);
 435         }
 436 }
 437
 438 void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev,
 439                     u64 *ut, u64 *st)
 440 {
 441         *ut = curr->utime;
 442         *st = curr->stime;
 443 }
 444
 445 void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
 446 {
 447         *ut = p->utime;
 448         *st = p->stime;
 449 }
 450 EXPORT_SYMBOL_GPL(task_cputime_adjusted);
 451
 452 void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
 453 {
 454         struct task_cputime cputime;
 455
 456         thread_group_cputime(p, &cputime);
 457
 458         *ut = cputime.utime;
 459         *st = cputime.stime;
 460 }
 461
 462 #else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE: */
 463
 464 /*
 465  * Account a single tick of CPU time.
 466  * @p: the process that the CPU time gets accounted to
 467  * @user_tick: indicates if the tick is a user or a system tick
 468  */
 469 void account_process_tick(struct task_struct *p, int user_tick)
 470 {
 471         u64 cputime, steal;
 472
 473         if (vtime_accounting_enabled_this_cpu())
 474                 return;
 475
 476         if (sched_clock_irqtime) {
 477                 irqtime_account_process_tick(p, user_tick, 1);
 478                 return;
 479         }
 480
 481         cputime = TICK_NSEC;
 482         steal = steal_account_process_time(ULONG_MAX);
 483
 484         if (steal >= cputime)
 485                 return;
 486
 487         cputime -= steal;
 488
 489         if (user_tick)
 490                 account_user_time(p, cputime);
 491         else if ((p != this_rq()->idle) || (irq_count() != HARDIRQ_OFFSET))
 492                 account_system_time(p, HARDIRQ_OFFSET, cputime);
 493         else
 494                 account_idle_time(cputime);
 495 }
 496
 497 /*
 498  * Account multiple ticks of idle time.
 499  * @ticks: number of stolen ticks
 500  */
 501 void account_idle_ticks(unsigned long ticks)
 502 {
 503         u64 cputime, steal;
 504
 505         if (sched_clock_irqtime) {
 506                 irqtime_account_idle_ticks(ticks);
 507                 return;
 508         }
 509
 510         cputime = ticks * TICK_NSEC;
 511         steal = steal_account_process_time(ULONG_MAX);
 512
 513         if (steal >= cputime)
 514                 return;
 515
 516         cputime -= steal;
 517         account_idle_time(cputime);
 518 }
 519
 520 /*
 521  * Adjust tick based cputime random precision against scheduler runtime
 522  * accounting.
 523  *
 524  * Tick based cputime accounting depend on random scheduling timeslices of a
 525  * task to be interrupted or not by the timer.  Depending on these
 526  * circumstances, the number of these interrupts may be over or
 527  * under-optimistic, matching the real user and system cputime with a variable
 528  * precision.
 529  *
 530  * Fix this by scaling these tick based values against the total runtime
 531  * accounted by the CFS scheduler.
 532  *
 533  * This code provides the following guarantees:
 534  *
 535  *   stime + utime == rtime
 536  *   stime_i+1 >= stime_i, utime_i+1 >= utime_i
 537  *
 538  * Assuming that rtime_i+1 >= rtime_i.
 539  */
 540 void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev,
 541                     u64 *ut, u64 *st)
 542 {
 543         u64 rtime, stime, utime;
 544         unsigned long flags;
 545
 546         /* Serialize concurrent callers such that we can honour our guarantees */
 547         raw_spin_lock_irqsave(&prev->lock, flags);
 548         rtime = curr->sum_exec_runtime;
 549
 550         /*
 551          * This is possible under two circumstances:
 552          *  - rtime isn't monotonic after all (a bug);
 553          *  - we got reordered by the lock.
 554          *
 555          * In both cases this acts as a filter such that the rest of the code
 556          * can assume it is monotonic regardless of anything else.
 557          */
 558         if (prev->stime + prev->utime >= rtime)
 559                 goto out;
 560
 561         stime = curr->stime;
 562         utime = curr->utime;
 563
 564         /*
 565          * If either stime or utime are 0, assume all runtime is userspace.
 566          * Once a task gets some ticks, the monotonicy code at 'update:'
 567          * will ensure things converge to the observed ratio.
 568          */
 569         if (stime == 0) {
 570                 utime = rtime;
 571                 goto update;
 572         }
 573
 574         if (utime == 0) {
 575                 stime = rtime;
 576                 goto update;
 577         }
 578
 579         stime = mul_u64_u64_div_u64(stime, rtime, stime + utime);
 580
 581 update:
 582         /*
 583          * Make sure stime doesn't go backwards; this preserves monotonicity
 584          * for utime because rtime is monotonic.
 585          *
 586          *  utime_i+1 = rtime_i+1 - stime_i
 587          *            = rtime_i+1 - (rtime_i - utime_i)
 588          *            = (rtime_i+1 - rtime_i) + utime_i
 589          *            >= utime_i
 590          */
 591         if (stime < prev->stime)
 592                 stime = prev->stime;
 593         utime = rtime - stime;
 594
 595         /*
 596          * Make sure utime doesn't go backwards; this still preserves
 597          * monotonicity for stime, analogous argument to above.
 598          */
 599         if (utime < prev->utime) {
 600                 utime = prev->utime;
 601                 stime = rtime - utime;
 602         }
 603
 604         prev->stime = stime;
 605         prev->utime = utime;
 606 out:
 607         *ut = prev->utime;
 608         *st = prev->stime;
 609         raw_spin_unlock_irqrestore(&prev->lock, flags);
 610 }
 611
 612 void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
 613 {
 614         struct task_cputime cputime = {
 615                 .sum_exec_runtime = p->se.sum_exec_runtime,
 616         };
 617
 618         task_cputime(p, &cputime.utime, &cputime.stime);
 619         cputime_adjust(&cputime, &p->prev_cputime, ut, st);
 620 }
 621 EXPORT_SYMBOL_GPL(task_cputime_adjusted);
 622
 623 void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
 624 {
 625         struct task_cputime cputime;
 626
 627         thread_group_cputime(p, &cputime);
 628         cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st);
 629 }
 630 #endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
 631
 632 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
 633 static u64 vtime_delta(struct vtime *vtime)
 634 {
 635         unsigned long long clock;
 636
 637         clock = sched_clock();
 638         if (clock < vtime->starttime)
 639                 return 0;
 640
 641         return clock - vtime->starttime;
 642 }
 643
 644 static u64 get_vtime_delta(struct vtime *vtime)
 645 {
 646         u64 delta = vtime_delta(vtime);
 647         u64 other;
 648
 649         /*
 650          * Unlike tick based timing, vtime based timing never has lost
 651          * ticks, and no need for steal time accounting to make up for
 652          * lost ticks. Vtime accounts a rounded version of actual
 653          * elapsed time. Limit account_other_time to prevent rounding
 654          * errors from causing elapsed vtime to go negative.
 655          */
 656         other = account_other_time(delta);
 657         WARN_ON_ONCE(vtime->state == VTIME_INACTIVE);
 658         vtime->starttime += delta;
 659
 660         return delta - other;
 661 }
 662
 663 static void vtime_account_system(struct task_struct *tsk,
 664                                  struct vtime *vtime)
 665 {
 666         vtime->stime += get_vtime_delta(vtime);
 667         if (vtime->stime >= TICK_NSEC) {
 668                 account_system_time(tsk, irq_count(), vtime->stime);
 669                 vtime->stime = 0;
 670         }
 671 }
 672
 673 static void vtime_account_guest(struct task_struct *tsk,
 674                                 struct vtime *vtime)
 675 {
 676         vtime->gtime += get_vtime_delta(vtime);
 677         if (vtime->gtime >= TICK_NSEC) {
 678                 account_guest_time(tsk, vtime->gtime);
 679                 vtime->gtime = 0;
 680         }
 681 }
 682
 683 static void __vtime_account_kernel(struct task_struct *tsk,
 684                                    struct vtime *vtime)
 685 {
 686         /* We might have scheduled out from guest path */
 687         if (vtime->state == VTIME_GUEST)
 688                 vtime_account_guest(tsk, vtime);
 689         else
 690                 vtime_account_system(tsk, vtime);
 691 }
 692
 693 void vtime_account_kernel(struct task_struct *tsk)
 694 {
 695         struct vtime *vtime = &tsk->vtime;
 696
 697         if (!vtime_delta(vtime))
 698                 return;
 699
 700         write_seqcount_begin(&vtime->seqcount);
 701         __vtime_account_kernel(tsk, vtime);
 702         write_seqcount_end(&vtime->seqcount);
 703 }
 704
 705 void vtime_user_enter(struct task_struct *tsk)
 706 {
 707         struct vtime *vtime = &tsk->vtime;
 708
 709         write_seqcount_begin(&vtime->seqcount);
 710         vtime_account_system(tsk, vtime);
 711         vtime->state = VTIME_USER;
 712         write_seqcount_end(&vtime->seqcount);
 713 }
 714
 715 void vtime_user_exit(struct task_struct *tsk)
 716 {
 717         struct vtime *vtime = &tsk->vtime;
 718
 719         write_seqcount_begin(&vtime->seqcount);
 720         vtime->utime += get_vtime_delta(vtime);
 721         if (vtime->utime >= TICK_NSEC) {
 722                 account_user_time(tsk, vtime->utime);
 723                 vtime->utime = 0;
 724         }
 725         vtime->state = VTIME_SYS;
 726         write_seqcount_end(&vtime->seqcount);
 727 }
 728
 729 void vtime_guest_enter(struct task_struct *tsk)
 730 {
 731         struct vtime *vtime = &tsk->vtime;
 732         /*
 733          * The flags must be updated under the lock with
 734          * the vtime_starttime flush and update.
 735          * That enforces a right ordering and update sequence
 736          * synchronization against the reader (task_gtime())
 737          * that can thus safely catch up with a tickless delta.
 738          */
 739         write_seqcount_begin(&vtime->seqcount);
 740         vtime_account_system(tsk, vtime);
 741         tsk->flags |= PF_VCPU;
 742         vtime->state = VTIME_GUEST;
 743         write_seqcount_end(&vtime->seqcount);
 744 }
 745 EXPORT_SYMBOL_GPL(vtime_guest_enter);
 746
 747 void vtime_guest_exit(struct task_struct *tsk)
 748 {
 749         struct vtime *vtime = &tsk->vtime;
 750
 751         write_seqcount_begin(&vtime->seqcount);
 752         vtime_account_guest(tsk, vtime);
 753         tsk->flags &= ~PF_VCPU;
 754         vtime->state = VTIME_SYS;
 755         write_seqcount_end(&vtime->seqcount);
 756 }
 757 EXPORT_SYMBOL_GPL(vtime_guest_exit);
 758
 759 void vtime_account_idle(struct task_struct *tsk)
 760 {
 761         account_idle_time(get_vtime_delta(&tsk->vtime));
 762 }
 763
 764 void vtime_task_switch_generic(struct task_struct *prev)
 765 {
 766         struct vtime *vtime = &prev->vtime;
 767
 768         write_seqcount_begin(&vtime->seqcount);
 769         if (vtime->state == VTIME_IDLE)
 770                 vtime_account_idle(prev);
 771         else
 772                 __vtime_account_kernel(prev, vtime);
 773         vtime->state = VTIME_INACTIVE;
 774         vtime->cpu = -1;
 775         write_seqcount_end(&vtime->seqcount);
 776
 777         vtime = &current->vtime;
 778
 779         write_seqcount_begin(&vtime->seqcount);
 780         if (is_idle_task(current))
 781                 vtime->state = VTIME_IDLE;
 782         else if (current->flags & PF_VCPU)
 783                 vtime->state = VTIME_GUEST;
 784         else
 785                 vtime->state = VTIME_SYS;
 786         vtime->starttime = sched_clock();
 787         vtime->cpu = smp_processor_id();
 788         write_seqcount_end(&vtime->seqcount);
 789 }
 790
 791 void vtime_init_idle(struct task_struct *t, int cpu)
 792 {
 793         struct vtime *vtime = &t->vtime;
 794         unsigned long flags;
 795
 796         local_irq_save(flags);
 797         write_seqcount_begin(&vtime->seqcount);
 798         vtime->state = VTIME_IDLE;
 799         vtime->starttime = sched_clock();
 800         vtime->cpu = cpu;
 801         write_seqcount_end(&vtime->seqcount);
 802         local_irq_restore(flags);
 803 }
 804
 805 u64 task_gtime(struct task_struct *t)
 806 {
 807         struct vtime *vtime = &t->vtime;
 808         unsigned int seq;
 809         u64 gtime;
 810
 811         if (!vtime_accounting_enabled())
 812                 return t->gtime;
 813
 814         do {
 815                 seq = read_seqcount_begin(&vtime->seqcount);
 816
 817                 gtime = t->gtime;
 818                 if (vtime->state == VTIME_GUEST)
 819                         gtime += vtime->gtime + vtime_delta(vtime);
 820
 821         } while (read_seqcount_retry(&vtime->seqcount, seq));
 822
 823         return gtime;
 824 }
 825
 826 /*
 827  * Fetch cputime raw values from fields of task_struct and
 828  * add up the pending nohz execution time since the last
 829  * cputime snapshot.
 830  */
 831 void task_cputime(struct task_struct *t, u64 *utime, u64 *stime)
 832 {
 833         struct vtime *vtime = &t->vtime;
 834         unsigned int seq;
 835         u64 delta;
 836
 837         if (!vtime_accounting_enabled()) {
 838                 *utime = t->utime;
 839                 *stime = t->stime;
 840                 return;
 841         }
 842
 843         do {
 844                 seq = read_seqcount_begin(&vtime->seqcount);
 845
 846                 *utime = t->utime;
 847                 *stime = t->stime;
 848
 849                 /* Task is sleeping or idle, nothing to add */
 850                 if (vtime->state < VTIME_SYS)
 851                         continue;
 852
 853                 delta = vtime_delta(vtime);
 854
 855                 /*
 856                  * Task runs either in user (including guest) or kernel space,
 857                  * add pending nohz time to the right place.
 858                  */
 859                 if (vtime->state == VTIME_SYS)
 860                         *stime += vtime->stime + delta;
 861                 else
 862                         *utime += vtime->utime + delta;
 863         } while (read_seqcount_retry(&vtime->seqcount, seq));
 864 }
 865
 866 static int vtime_state_fetch(struct vtime *vtime, int cpu)
 867 {
 868         int state = READ_ONCE(vtime->state);
 869
 870         /*
 871          * We raced against a context switch, fetch the
 872          * kcpustat task again.
 873          */
 874         if (vtime->cpu != cpu && vtime->cpu != -1)
 875                 return -EAGAIN;
 876
 877         /*
 878          * Two possible things here:
 879          * 1) We are seeing the scheduling out task (prev) or any past one.
 880          * 2) We are seeing the scheduling in task (next) but it hasn't
 881          *    passed though vtime_task_switch() yet so the pending
 882          *    cputime of the prev task may not be flushed yet.
 883          *
 884          * Case 1) is ok but 2) is not. So wait for a safe VTIME state.
 885          */
 886         if (state == VTIME_INACTIVE)
 887                 return -EAGAIN;
 888
 889         return state;
 890 }
 891
 892 static u64 kcpustat_user_vtime(struct vtime *vtime)
 893 {
 894         if (vtime->state == VTIME_USER)
 895                 return vtime->utime + vtime_delta(vtime);
 896         else if (vtime->state == VTIME_GUEST)
 897                 return vtime->gtime + vtime_delta(vtime);
 898         return 0;
 899 }
 900
 901 static int kcpustat_field_vtime(u64 *cpustat,
 902                                 struct task_struct *tsk,
 903                                 enum cpu_usage_stat usage,
 904                                 int cpu, u64 *val)
 905 {
 906         struct vtime *vtime = &tsk->vtime;
 907         unsigned int seq;
 908
 909         do {
 910                 int state;
 911
 912                 seq = read_seqcount_begin(&vtime->seqcount);
 913
 914                 state = vtime_state_fetch(vtime, cpu);
 915                 if (state < 0)
 916                         return state;
 917
 918                 *val = cpustat[usage];
 919
 920                 /*
 921                  * Nice VS unnice cputime accounting may be inaccurate if
 922                  * the nice value has changed since the last vtime update.
 923                  * But proper fix would involve interrupting target on nice
 924                  * updates which is a no go on nohz_full (although the scheduler
 925                  * may still interrupt the target if rescheduling is needed...)
 926                  */
 927                 switch (usage) {
 928                 case CPUTIME_SYSTEM:
 929                         if (state == VTIME_SYS)
 930                                 *val += vtime->stime + vtime_delta(vtime);
 931                         break;
 932                 case CPUTIME_USER:
 933                         if (task_nice(tsk) <= 0)
 934                                 *val += kcpustat_user_vtime(vtime);
 935                         break;
 936                 case CPUTIME_NICE:
 937                         if (task_nice(tsk) > 0)
 938                                 *val += kcpustat_user_vtime(vtime);
 939                         break;
 940                 case CPUTIME_GUEST:
 941                         if (state == VTIME_GUEST && task_nice(tsk) <= 0)
 942                                 *val += vtime->gtime + vtime_delta(vtime);
 943                         break;
 944                 case CPUTIME_GUEST_NICE:
 945                         if (state == VTIME_GUEST && task_nice(tsk) > 0)
 946                                 *val += vtime->gtime + vtime_delta(vtime);
 947                         break;
 948                 default:
 949                         break;
 950                 }
 951         } while (read_seqcount_retry(&vtime->seqcount, seq));
 952
 953         return 0;
 954 }
 955
 956 u64 kcpustat_field(struct kernel_cpustat *kcpustat,
 957                    enum cpu_usage_stat usage, int cpu)
 958 {
 959         u64 *cpustat = kcpustat->cpustat;
 960         u64 val = cpustat[usage];
 961         struct rq *rq;
 962         int err;
 963
 964         if (!vtime_accounting_enabled_cpu(cpu))
 965                 return val;
 966
 967         rq = cpu_rq(cpu);
 968
 969         for (;;) {
 970                 struct task_struct *curr;
 971
 972                 rcu_read_lock();
 973                 curr = rcu_dereference(rq->curr);
 974                 if (WARN_ON_ONCE(!curr)) {
 975                         rcu_read_unlock();
 976                         return cpustat[usage];
 977                 }
 978
 979                 err = kcpustat_field_vtime(cpustat, curr, usage, cpu, &val);
 980                 rcu_read_unlock();
 981
 982                 if (!err)
 983                         return val;
 984
 985                 cpu_relax();
 986         }
 987 }
 988 EXPORT_SYMBOL_GPL(kcpustat_field);
 989
 990 static int kcpustat_cpu_fetch_vtime(struct kernel_cpustat *dst,
 991                                     const struct kernel_cpustat *src,
 992                                     struct task_struct *tsk, int cpu)
 993 {
 994         struct vtime *vtime = &tsk->vtime;
 995         unsigned int seq;
 996
 997         do {
 998                 u64 *cpustat;
 999                 u64 delta;
1000                 int state;
1001
1002                 seq = read_seqcount_begin(&vtime->seqcount);
1003
1004                 state = vtime_state_fetch(vtime, cpu);
1005                 if (state < 0)
1006                         return state;
1007
1008                 *dst = *src;
1009                 cpustat = dst->cpustat;
1010
1011                 /* Task is sleeping, dead or idle, nothing to add */
1012                 if (state < VTIME_SYS)
1013                         continue;
1014
1015                 delta = vtime_delta(vtime);
1016
1017                 /*
1018                  * Task runs either in user (including guest) or kernel space,
1019                  * add pending nohz time to the right place.
1020                  */
1021                 if (state == VTIME_SYS) {
1022                         cpustat[CPUTIME_SYSTEM] += vtime->stime + delta;
1023                 } else if (state == VTIME_USER) {
1024                         if (task_nice(tsk) > 0)
1025                                 cpustat[CPUTIME_NICE] += vtime->utime + delta;
1026                         else
1027                                 cpustat[CPUTIME_USER] += vtime->utime + delta;
1028                 } else {
1029                         WARN_ON_ONCE(state != VTIME_GUEST);
1030                         if (task_nice(tsk) > 0) {
1031                                 cpustat[CPUTIME_GUEST_NICE] += vtime->gtime + delta;
1032                                 cpustat[CPUTIME_NICE] += vtime->gtime + delta;
1033                         } else {
1034                                 cpustat[CPUTIME_GUEST] += vtime->gtime + delta;
1035                                 cpustat[CPUTIME_USER] += vtime->gtime + delta;
1036                         }
1037                 }
1038         } while (read_seqcount_retry(&vtime->seqcount, seq));
1039
1040         return 0;
1041 }
1042
1043 void kcpustat_cpu_fetch(struct kernel_cpustat *dst, int cpu)
1044 {
1045         const struct kernel_cpustat *src = &kcpustat_cpu(cpu);
1046         struct rq *rq;
1047         int err;
1048
1049         if (!vtime_accounting_enabled_cpu(cpu)) {
1050                 *dst = *src;
1051                 return;
1052         }
1053
1054         rq = cpu_rq(cpu);
1055
1056         for (;;) {
1057                 struct task_struct *curr;
1058
1059                 rcu_read_lock();
1060                 curr = rcu_dereference(rq->curr);
1061                 if (WARN_ON_ONCE(!curr)) {
1062                         rcu_read_unlock();
1063                         *dst = *src;
1064                         return;
1065                 }
1066
1067                 err = kcpustat_cpu_fetch_vtime(dst, src, curr, cpu);
1068                 rcu_read_unlock();
1069
1070                 if (!err)
1071                         return;
1072
1073                 cpu_relax();
1074         }
1075 }
1076 EXPORT_SYMBOL_GPL(kcpustat_cpu_fetch);
1077
1078 #endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */