// SPDX-License-Identifier: GPL-2.0
/*
 * Pressure stall information for CPU, memory and IO
 *
 * Copyright (c) 2018 Facebook, Inc.
 * Author: Johannes Weiner <hannes@cmpxchg.org>
 *
 * Polling support by Suren Baghdasaryan <surenb@google.com>
 * Copyright (c) 2018 Google, Inc.
 *
 * When CPU, memory and IO are contended, tasks experience delays that
 * reduce throughput and introduce latencies into the workload. Memory
 * and IO contention, in addition, can cause a full loss of forward
 * progress in which the CPU goes idle.
 *
 * This code aggregates individual task delays into resource pressure
 * metrics that indicate problems with both workload health and
 * resource utilization.
 *
 *			Model
 *
 * The time in which a task can execute on a CPU is our baseline for
 * productivity. Pressure expresses the amount of time in which this
 * potential cannot be realized due to resource contention.
 *
 * This concept of productivity has two components: the workload and
 * the CPU. To measure the impact of pressure on both, we define two
 * contention states for a resource: SOME and FULL.
 *
 * In the SOME state of a given resource, one or more tasks are
 * delayed on that resource. This affects the workload's ability to
 * perform work, but the CPU may still be executing other tasks.
 *
 * In the FULL state of a given resource, all non-idle tasks are
 * delayed on that resource such that nobody is advancing and the CPU
 * goes idle. This leaves both workload and CPU unproductive.
 *
 *	SOME = nr_delayed_tasks != 0
 *	FULL = nr_delayed_tasks != 0 && nr_productive_tasks == 0
 *
 * What it means for a task to be productive is defined differently
 * for each resource. For IO, productive means a running task. For
 * memory, productive means a running task that isn't a reclaimer. For
 * CPU, productive means an on-CPU task.
 *
 * Naturally, the FULL state doesn't exist for the CPU resource at the
 * system level, but exist at the cgroup level. At the cgroup level,
 * FULL means all non-idle tasks in the cgroup are delayed on the CPU
 * resource which is being used by others outside of the cgroup or
 * throttled by the cgroup cpu.max configuration.
 *
 * The percentage of wall clock time spent in those compound stall
 * states gives pressure numbers between 0 and 100 for each resource,
 * where the SOME percentage indicates workload slowdowns and the FULL
 * percentage indicates reduced CPU utilization:
 *
 *	%SOME = time(SOME) / period
 *	%FULL = time(FULL) / period
 *
 *			Multiple CPUs
 *
 * The more tasks and available CPUs there are, the more work can be
 * performed concurrently. This means that the potential that can go
 * unrealized due to resource contention *also* scales with non-idle
 * tasks and CPUs.
 *
 * Consider a scenario where 257 number crunching tasks are trying to
 * run concurrently on 256 CPUs. If we simply aggregated the task
 * states, we would have to conclude a CPU SOME pressure number of
 * 100%, since *somebody* is waiting on a runqueue at all
 * times. However, that is clearly not the amount of contention the
 * workload is experiencing: only one out of 256 possible execution
 * threads will be contended at any given time, or about 0.4%.
 *
 * Conversely, consider a scenario of 4 tasks and 4 CPUs where at any
 * given time *one* of the tasks is delayed due to a lack of memory.
 * Again, looking purely at the task state would yield a memory FULL
 * pressure number of 0%, since *somebody* is always making forward
 * progress. But again this wouldn't capture the amount of execution
 * potential lost, which is 1 out of 4 CPUs, or 25%.
 *
 * To calculate wasted potential (pressure) with multiple processors,
 * we have to base our calculation on the number of non-idle tasks in
 * conjunction with the number of available CPUs, which is the number
 * of potential execution threads. SOME becomes then the proportion of
 * delayed tasks to possible threads, and FULL is the share of possible
 * threads that are unproductive due to delays:
 *
 *	threads = min(nr_nonidle_tasks, nr_cpus)
 *	   SOME = min(nr_delayed_tasks / threads, 1)
 *	   FULL = (threads - min(nr_productive_tasks, threads)) / threads
 *
 * For the 257 number crunchers on 256 CPUs, this yields:
 *
 *	threads = min(257, 256)
 *	   SOME = min(1 / 256, 1)             = 0.4%
 *	   FULL = (256 - min(256, 256)) / 256 = 0%
 *
 * For the 1 out of 4 memory-delayed tasks, this yields:
 *
 *	threads = min(4, 4)
 *	   SOME = min(1 / 4, 1)               = 25%
 *	   FULL = (4 - min(3, 4)) / 4         = 25%
 *
 * [ Substitute nr_cpus with 1, and you can see that it's a natural
 *   extension of the single-CPU model. ]
 *
 *			Implementation
 *
 * To assess the precise time spent in each such state, we would have
 * to freeze the system on task changes and start/stop the state
 * clocks accordingly. Obviously that doesn't scale in practice.
 *
 * Because the scheduler aims to distribute the compute load evenly
 * among the available CPUs, we can track task state locally to each
 * CPU and, at much lower frequency, extrapolate the global state for
 * the cumulative stall times and the running averages.
 *
 * For each runqueue, we track:
 *
 *	   tSOME[cpu] = time(nr_delayed_tasks[cpu] != 0)
 *	   tFULL[cpu] = time(nr_delayed_tasks[cpu] && !nr_productive_tasks[cpu])
 *	tNONIDLE[cpu] = time(nr_nonidle_tasks[cpu] != 0)
 *
 * and then periodically aggregate:
 *
 *	tNONIDLE = sum(tNONIDLE[i])
 *
 *	   tSOME = sum(tSOME[i] * tNONIDLE[i]) / tNONIDLE
 *	   tFULL = sum(tFULL[i] * tNONIDLE[i]) / tNONIDLE
 *
 *	   %SOME = tSOME / period
 *	   %FULL = tFULL / period
 *
 * This gives us an approximation of pressure that is practical
 * cost-wise, yet way more sensitive and accurate than periodic
 * sampling of the aggregate task states would be.
 */
140 static int psi_bug __read_mostly
;
142 DEFINE_STATIC_KEY_FALSE(psi_disabled
);
143 static DEFINE_STATIC_KEY_TRUE(psi_cgroups_enabled
);
145 #ifdef CONFIG_PSI_DEFAULT_DISABLED
146 static bool psi_enable
;
148 static bool psi_enable
= true;
150 static int __init
setup_psi(char *str
)
152 return kstrtobool(str
, &psi_enable
) == 0;
154 __setup("psi=", setup_psi
);
156 /* Running averages - we need to be higher-res than loadavg */
157 #define PSI_FREQ (2*HZ+1) /* 2 sec intervals */
158 #define EXP_10s 1677 /* 1/exp(2s/10s) as fixed-point */
159 #define EXP_60s 1981 /* 1/exp(2s/60s) */
160 #define EXP_300s 2034 /* 1/exp(2s/300s) */
162 /* PSI trigger definitions */
163 #define WINDOW_MAX_US 10000000 /* Max window size is 10s */
164 #define UPDATES_PER_WINDOW 10 /* 10 updates per window */
166 /* Sampling frequency in nanoseconds */
167 static u64 psi_period __read_mostly
;
169 /* System-level pressure and stall tracking */
170 static DEFINE_PER_CPU(struct psi_group_cpu
, system_group_pcpu
);
171 struct psi_group psi_system
= {
172 .pcpu
= &system_group_pcpu
,
175 static void psi_avgs_work(struct work_struct
*work
);
177 static void poll_timer_fn(struct timer_list
*t
);
179 static void group_init(struct psi_group
*group
)
183 group
->enabled
= true;
184 for_each_possible_cpu(cpu
)
185 seqcount_init(&per_cpu_ptr(group
->pcpu
, cpu
)->seq
);
186 group
->avg_last_update
= sched_clock();
187 group
->avg_next_update
= group
->avg_last_update
+ psi_period
;
188 mutex_init(&group
->avgs_lock
);
190 /* Init avg trigger-related members */
191 INIT_LIST_HEAD(&group
->avg_triggers
);
192 memset(group
->avg_nr_triggers
, 0, sizeof(group
->avg_nr_triggers
));
193 INIT_DELAYED_WORK(&group
->avgs_work
, psi_avgs_work
);
195 /* Init rtpoll trigger-related members */
196 atomic_set(&group
->rtpoll_scheduled
, 0);
197 mutex_init(&group
->rtpoll_trigger_lock
);
198 INIT_LIST_HEAD(&group
->rtpoll_triggers
);
199 group
->rtpoll_min_period
= U32_MAX
;
200 group
->rtpoll_next_update
= ULLONG_MAX
;
201 init_waitqueue_head(&group
->rtpoll_wait
);
202 timer_setup(&group
->rtpoll_timer
, poll_timer_fn
, 0);
203 rcu_assign_pointer(group
->rtpoll_task
, NULL
);
206 void __init
psi_init(void)
209 static_branch_enable(&psi_disabled
);
210 static_branch_disable(&psi_cgroups_enabled
);
214 if (!cgroup_psi_enabled())
215 static_branch_disable(&psi_cgroups_enabled
);
217 psi_period
= jiffies_to_nsecs(PSI_FREQ
);
218 group_init(&psi_system
);
221 static u32
test_states(unsigned int *tasks
, u32 state_mask
)
223 const bool oncpu
= state_mask
& PSI_ONCPU
;
225 if (tasks
[NR_IOWAIT
]) {
226 state_mask
|= BIT(PSI_IO_SOME
);
227 if (!tasks
[NR_RUNNING
])
228 state_mask
|= BIT(PSI_IO_FULL
);
231 if (tasks
[NR_MEMSTALL
]) {
232 state_mask
|= BIT(PSI_MEM_SOME
);
233 if (tasks
[NR_RUNNING
] == tasks
[NR_MEMSTALL_RUNNING
])
234 state_mask
|= BIT(PSI_MEM_FULL
);
237 if (tasks
[NR_RUNNING
] > oncpu
)
238 state_mask
|= BIT(PSI_CPU_SOME
);
240 if (tasks
[NR_RUNNING
] && !oncpu
)
241 state_mask
|= BIT(PSI_CPU_FULL
);
243 if (tasks
[NR_IOWAIT
] || tasks
[NR_MEMSTALL
] || tasks
[NR_RUNNING
])
244 state_mask
|= BIT(PSI_NONIDLE
);
249 static void get_recent_times(struct psi_group
*group
, int cpu
,
250 enum psi_aggregators aggregator
, u32
*times
,
251 u32
*pchanged_states
)
253 struct psi_group_cpu
*groupc
= per_cpu_ptr(group
->pcpu
, cpu
);
254 int current_cpu
= raw_smp_processor_id();
255 unsigned int tasks
[NR_PSI_TASK_COUNTS
];
256 u64 now
, state_start
;
261 *pchanged_states
= 0;
263 /* Snapshot a coherent view of the CPU state */
265 seq
= read_seqcount_begin(&groupc
->seq
);
266 now
= cpu_clock(cpu
);
267 memcpy(times
, groupc
->times
, sizeof(groupc
->times
));
268 state_mask
= groupc
->state_mask
;
269 state_start
= groupc
->state_start
;
270 if (cpu
== current_cpu
)
271 memcpy(tasks
, groupc
->tasks
, sizeof(groupc
->tasks
));
272 } while (read_seqcount_retry(&groupc
->seq
, seq
));
274 /* Calculate state time deltas against the previous snapshot */
275 for (s
= 0; s
< NR_PSI_STATES
; s
++) {
278 * In addition to already concluded states, we also
279 * incorporate currently active states on the CPU,
280 * since states may last for many sampling periods.
282 * This way we keep our delta sampling buckets small
283 * (u32) and our reported pressure close to what's
284 * actually happening.
286 if (state_mask
& (1 << s
))
287 times
[s
] += now
- state_start
;
289 delta
= times
[s
] - groupc
->times_prev
[aggregator
][s
];
290 groupc
->times_prev
[aggregator
][s
] = times
[s
];
294 *pchanged_states
|= (1 << s
);
298 * When collect_percpu_times() from the avgs_work, we don't want to
299 * re-arm avgs_work when all CPUs are IDLE. But the current CPU running
300 * this avgs_work is never IDLE, cause avgs_work can't be shut off.
301 * So for the current CPU, we need to re-arm avgs_work only when
302 * (NR_RUNNING > 1 || NR_IOWAIT > 0 || NR_MEMSTALL > 0), for other CPUs
303 * we can just check PSI_NONIDLE delta.
305 if (current_work() == &group
->avgs_work
.work
) {
308 if (cpu
== current_cpu
)
309 reschedule
= tasks
[NR_RUNNING
] +
311 tasks
[NR_MEMSTALL
] > 1;
313 reschedule
= *pchanged_states
& (1 << PSI_NONIDLE
);
316 *pchanged_states
|= PSI_STATE_RESCHEDULE
;
320 static void calc_avgs(unsigned long avg
[3], int missed_periods
,
321 u64 time
, u64 period
)
325 /* Fill in zeroes for periods of no activity */
326 if (missed_periods
) {
327 avg
[0] = calc_load_n(avg
[0], EXP_10s
, 0, missed_periods
);
328 avg
[1] = calc_load_n(avg
[1], EXP_60s
, 0, missed_periods
);
329 avg
[2] = calc_load_n(avg
[2], EXP_300s
, 0, missed_periods
);
332 /* Sample the most recent active period */
333 pct
= div_u64(time
* 100, period
);
335 avg
[0] = calc_load(avg
[0], EXP_10s
, pct
);
336 avg
[1] = calc_load(avg
[1], EXP_60s
, pct
);
337 avg
[2] = calc_load(avg
[2], EXP_300s
, pct
);
340 static void collect_percpu_times(struct psi_group
*group
,
341 enum psi_aggregators aggregator
,
342 u32
*pchanged_states
)
344 u64 deltas
[NR_PSI_STATES
- 1] = { 0, };
345 unsigned long nonidle_total
= 0;
346 u32 changed_states
= 0;
351 * Collect the per-cpu time buckets and average them into a
352 * single time sample that is normalized to wall clock time.
354 * For averaging, each CPU is weighted by its non-idle time in
355 * the sampling period. This eliminates artifacts from uneven
356 * loading, or even entirely idle CPUs.
358 for_each_possible_cpu(cpu
) {
359 u32 times
[NR_PSI_STATES
];
361 u32 cpu_changed_states
;
363 get_recent_times(group
, cpu
, aggregator
, times
,
364 &cpu_changed_states
);
365 changed_states
|= cpu_changed_states
;
367 nonidle
= nsecs_to_jiffies(times
[PSI_NONIDLE
]);
368 nonidle_total
+= nonidle
;
370 for (s
= 0; s
< PSI_NONIDLE
; s
++)
371 deltas
[s
] += (u64
)times
[s
] * nonidle
;
375 * Integrate the sample into the running statistics that are
376 * reported to userspace: the cumulative stall times and the
379 * Pressure percentages are sampled at PSI_FREQ. We might be
380 * called more often when the user polls more frequently than
381 * that; we might be called less often when there is no task
382 * activity, thus no data, and clock ticks are sporadic. The
383 * below handles both.
387 for (s
= 0; s
< NR_PSI_STATES
- 1; s
++)
388 group
->total
[aggregator
][s
] +=
389 div_u64(deltas
[s
], max(nonidle_total
, 1UL));
392 *pchanged_states
= changed_states
;
395 /* Trigger tracking window manipulations */
396 static void window_reset(struct psi_window
*win
, u64 now
, u64 value
,
399 win
->start_time
= now
;
400 win
->start_value
= value
;
401 win
->prev_growth
= prev_growth
;
405 * PSI growth tracking window update and growth calculation routine.
407 * This approximates a sliding tracking window by interpolating
408 * partially elapsed windows using historical growth data from the
409 * previous intervals. This minimizes memory requirements (by not storing
410 * all the intermediate values in the previous window) and simplifies
411 * the calculations. It works well because PSI signal changes only in
412 * positive direction and over relatively small window sizes the growth
413 * is close to linear.
415 static u64
window_update(struct psi_window
*win
, u64 now
, u64 value
)
420 elapsed
= now
- win
->start_time
;
421 growth
= value
- win
->start_value
;
423 * After each tracking window passes win->start_value and
424 * win->start_time get reset and win->prev_growth stores
425 * the average per-window growth of the previous window.
426 * win->prev_growth is then used to interpolate additional
427 * growth from the previous window assuming it was linear.
429 if (elapsed
> win
->size
)
430 window_reset(win
, now
, value
, growth
);
434 remaining
= win
->size
- elapsed
;
435 growth
+= div64_u64(win
->prev_growth
* remaining
, win
->size
);
441 static void update_triggers(struct psi_group
*group
, u64 now
,
442 enum psi_aggregators aggregator
)
444 struct psi_trigger
*t
;
445 u64
*total
= group
->total
[aggregator
];
446 struct list_head
*triggers
;
447 u64
*aggregator_total
;
449 if (aggregator
== PSI_AVGS
) {
450 triggers
= &group
->avg_triggers
;
451 aggregator_total
= group
->avg_total
;
453 triggers
= &group
->rtpoll_triggers
;
454 aggregator_total
= group
->rtpoll_total
;
458 * On subsequent updates, calculate growth deltas and let
459 * watchers know when their specified thresholds are exceeded.
461 list_for_each_entry(t
, triggers
, node
) {
465 new_stall
= aggregator_total
[t
->state
] != total
[t
->state
];
467 /* Check for stall activity or a previous threshold breach */
468 if (!new_stall
&& !t
->pending_event
)
471 * Check for new stall activity, as well as deferred
472 * events that occurred in the last window after the
473 * trigger had already fired (we want to ratelimit
474 * events without dropping any).
477 /* Calculate growth since last update */
478 growth
= window_update(&t
->win
, now
, total
[t
->state
]);
479 if (!t
->pending_event
) {
480 if (growth
< t
->threshold
)
483 t
->pending_event
= true;
486 /* Limit event signaling to once per window */
487 if (now
< t
->last_event_time
+ t
->win
.size
)
490 /* Generate an event */
491 if (cmpxchg(&t
->event
, 0, 1) == 0) {
493 kernfs_notify(t
->of
->kn
);
495 wake_up_interruptible(&t
->event_wait
);
497 t
->last_event_time
= now
;
498 /* Reset threshold breach flag once event got generated */
499 t
->pending_event
= false;
503 static u64
update_averages(struct psi_group
*group
, u64 now
)
505 unsigned long missed_periods
= 0;
511 expires
= group
->avg_next_update
;
512 if (now
- expires
>= psi_period
)
513 missed_periods
= div_u64(now
- expires
, psi_period
);
516 * The periodic clock tick can get delayed for various
517 * reasons, especially on loaded systems. To avoid clock
518 * drift, we schedule the clock in fixed psi_period intervals.
519 * But the deltas we sample out of the per-cpu buckets above
520 * are based on the actual time elapsing between clock ticks.
522 avg_next_update
= expires
+ ((1 + missed_periods
) * psi_period
);
523 period
= now
- (group
->avg_last_update
+ (missed_periods
* psi_period
));
524 group
->avg_last_update
= now
;
526 for (s
= 0; s
< NR_PSI_STATES
- 1; s
++) {
529 sample
= group
->total
[PSI_AVGS
][s
] - group
->avg_total
[s
];
531 * Due to the lockless sampling of the time buckets,
532 * recorded time deltas can slip into the next period,
533 * which under full pressure can result in samples in
534 * excess of the period length.
536 * We don't want to report non-sensical pressures in
537 * excess of 100%, nor do we want to drop such events
538 * on the floor. Instead we punt any overage into the
539 * future until pressure subsides. By doing this we
540 * don't underreport the occurring pressure curve, we
541 * just report it delayed by one period length.
543 * The error isn't cumulative. As soon as another
544 * delta slips from a period P to P+1, by definition
545 * it frees up its time T in P.
549 group
->avg_total
[s
] += sample
;
550 calc_avgs(group
->avg
[s
], missed_periods
, sample
, period
);
553 return avg_next_update
;
556 static void psi_avgs_work(struct work_struct
*work
)
558 struct delayed_work
*dwork
;
559 struct psi_group
*group
;
563 dwork
= to_delayed_work(work
);
564 group
= container_of(dwork
, struct psi_group
, avgs_work
);
566 mutex_lock(&group
->avgs_lock
);
570 collect_percpu_times(group
, PSI_AVGS
, &changed_states
);
572 * If there is task activity, periodically fold the per-cpu
573 * times and feed samples into the running averages. If things
574 * are idle and there is no data to process, stop the clock.
575 * Once restarted, we'll catch up the running averages in one
576 * go - see calc_avgs() and missed_periods.
578 if (now
>= group
->avg_next_update
) {
579 update_triggers(group
, now
, PSI_AVGS
);
580 group
->avg_next_update
= update_averages(group
, now
);
583 if (changed_states
& PSI_STATE_RESCHEDULE
) {
584 schedule_delayed_work(dwork
, nsecs_to_jiffies(
585 group
->avg_next_update
- now
) + 1);
588 mutex_unlock(&group
->avgs_lock
);
591 static void init_rtpoll_triggers(struct psi_group
*group
, u64 now
)
593 struct psi_trigger
*t
;
595 list_for_each_entry(t
, &group
->rtpoll_triggers
, node
)
596 window_reset(&t
->win
, now
,
597 group
->total
[PSI_POLL
][t
->state
], 0);
598 memcpy(group
->rtpoll_total
, group
->total
[PSI_POLL
],
599 sizeof(group
->rtpoll_total
));
600 group
->rtpoll_next_update
= now
+ group
->rtpoll_min_period
;
603 /* Schedule rtpolling if it's not already scheduled or forced. */
604 static void psi_schedule_rtpoll_work(struct psi_group
*group
, unsigned long delay
,
607 struct task_struct
*task
;
610 * atomic_xchg should be called even when !force to provide a
611 * full memory barrier (see the comment inside psi_rtpoll_work).
613 if (atomic_xchg(&group
->rtpoll_scheduled
, 1) && !force
)
618 task
= rcu_dereference(group
->rtpoll_task
);
620 * kworker might be NULL in case psi_trigger_destroy races with
621 * psi_task_change (hotpath) which can't use locks
624 mod_timer(&group
->rtpoll_timer
, jiffies
+ delay
);
626 atomic_set(&group
->rtpoll_scheduled
, 0);
631 static void psi_rtpoll_work(struct psi_group
*group
)
633 bool force_reschedule
= false;
637 mutex_lock(&group
->rtpoll_trigger_lock
);
641 if (now
> group
->rtpoll_until
) {
643 * We are either about to start or might stop rtpolling if no
644 * state change was recorded. Resetting rtpoll_scheduled leaves
645 * a small window for psi_group_change to sneak in and schedule
646 * an immediate rtpoll_work before we get to rescheduling. One
647 * potential extra wakeup at the end of the rtpolling window
648 * should be negligible and rtpoll_next_update still keeps
649 * updates correctly on schedule.
651 atomic_set(&group
->rtpoll_scheduled
, 0);
653 * A task change can race with the rtpoll worker that is supposed to
654 * report on it. To avoid missing events, ensure ordering between
655 * rtpoll_scheduled and the task state accesses, such that if the
656 * rtpoll worker misses the state update, the task change is
657 * guaranteed to reschedule the rtpoll worker:
660 * atomic_set(rtpoll_scheduled, 0)
666 * if atomic_xchg(rtpoll_scheduled, 1) == 0:
667 * schedule rtpoll worker
669 * The atomic_xchg() implies a full barrier.
673 /* The rtpolling window is not over, keep rescheduling */
674 force_reschedule
= true;
678 collect_percpu_times(group
, PSI_POLL
, &changed_states
);
680 if (changed_states
& group
->rtpoll_states
) {
681 /* Initialize trigger windows when entering rtpolling mode */
682 if (now
> group
->rtpoll_until
)
683 init_rtpoll_triggers(group
, now
);
686 * Keep the monitor active for at least the duration of the
687 * minimum tracking window as long as monitor states are
690 group
->rtpoll_until
= now
+
691 group
->rtpoll_min_period
* UPDATES_PER_WINDOW
;
694 if (now
> group
->rtpoll_until
) {
695 group
->rtpoll_next_update
= ULLONG_MAX
;
699 if (now
>= group
->rtpoll_next_update
) {
700 if (changed_states
& group
->rtpoll_states
) {
701 update_triggers(group
, now
, PSI_POLL
);
702 memcpy(group
->rtpoll_total
, group
->total
[PSI_POLL
],
703 sizeof(group
->rtpoll_total
));
705 group
->rtpoll_next_update
= now
+ group
->rtpoll_min_period
;
708 psi_schedule_rtpoll_work(group
,
709 nsecs_to_jiffies(group
->rtpoll_next_update
- now
) + 1,
713 mutex_unlock(&group
->rtpoll_trigger_lock
);
716 static int psi_rtpoll_worker(void *data
)
718 struct psi_group
*group
= (struct psi_group
*)data
;
720 sched_set_fifo_low(current
);
723 wait_event_interruptible(group
->rtpoll_wait
,
724 atomic_cmpxchg(&group
->rtpoll_wakeup
, 1, 0) ||
725 kthread_should_stop());
726 if (kthread_should_stop())
729 psi_rtpoll_work(group
);
734 static void poll_timer_fn(struct timer_list
*t
)
736 struct psi_group
*group
= from_timer(group
, t
, rtpoll_timer
);
738 atomic_set(&group
->rtpoll_wakeup
, 1);
739 wake_up_interruptible(&group
->rtpoll_wait
);
742 static void record_times(struct psi_group_cpu
*groupc
, u64 now
)
746 delta
= now
- groupc
->state_start
;
747 groupc
->state_start
= now
;
749 if (groupc
->state_mask
& (1 << PSI_IO_SOME
)) {
750 groupc
->times
[PSI_IO_SOME
] += delta
;
751 if (groupc
->state_mask
& (1 << PSI_IO_FULL
))
752 groupc
->times
[PSI_IO_FULL
] += delta
;
755 if (groupc
->state_mask
& (1 << PSI_MEM_SOME
)) {
756 groupc
->times
[PSI_MEM_SOME
] += delta
;
757 if (groupc
->state_mask
& (1 << PSI_MEM_FULL
))
758 groupc
->times
[PSI_MEM_FULL
] += delta
;
761 if (groupc
->state_mask
& (1 << PSI_CPU_SOME
)) {
762 groupc
->times
[PSI_CPU_SOME
] += delta
;
763 if (groupc
->state_mask
& (1 << PSI_CPU_FULL
))
764 groupc
->times
[PSI_CPU_FULL
] += delta
;
767 if (groupc
->state_mask
& (1 << PSI_NONIDLE
))
768 groupc
->times
[PSI_NONIDLE
] += delta
;
771 static void psi_group_change(struct psi_group
*group
, int cpu
,
772 unsigned int clear
, unsigned int set
, u64 now
,
775 struct psi_group_cpu
*groupc
;
779 lockdep_assert_rq_held(cpu_rq(cpu
));
780 groupc
= per_cpu_ptr(group
->pcpu
, cpu
);
783 * First we update the task counts according to the state
784 * change requested through the @clear and @set bits.
786 * Then if the cgroup PSI stats accounting enabled, we
787 * assess the aggregate resource states this CPU's tasks
788 * have been in since the last change, and account any
789 * SOME and FULL time these may have resulted in.
791 write_seqcount_begin(&groupc
->seq
);
794 * Start with TSK_ONCPU, which doesn't have a corresponding
795 * task count - it's just a boolean flag directly encoded in
796 * the state mask. Clear, set, or carry the current state if
797 * no changes are requested.
799 if (unlikely(clear
& TSK_ONCPU
)) {
802 } else if (unlikely(set
& TSK_ONCPU
)) {
803 state_mask
= PSI_ONCPU
;
806 state_mask
= groupc
->state_mask
& PSI_ONCPU
;
810 * The rest of the state mask is calculated based on the task
811 * counts. Update those first, then construct the mask.
813 for (t
= 0, m
= clear
; m
; m
&= ~(1 << t
), t
++) {
816 if (groupc
->tasks
[t
]) {
818 } else if (!psi_bug
) {
819 printk_deferred(KERN_ERR
"psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u] clear=%x set=%x\n",
820 cpu
, t
, groupc
->tasks
[0],
821 groupc
->tasks
[1], groupc
->tasks
[2],
822 groupc
->tasks
[3], clear
, set
);
827 for (t
= 0; set
; set
&= ~(1 << t
), t
++)
831 if (!group
->enabled
) {
833 * On the first group change after disabling PSI, conclude
834 * the current state and flush its time. This is unlikely
835 * to matter to the user, but aggregation (get_recent_times)
836 * may have already incorporated the live state into times_prev;
837 * avoid a delta sample underflow when PSI is later re-enabled.
839 if (unlikely(groupc
->state_mask
& (1 << PSI_NONIDLE
)))
840 record_times(groupc
, now
);
842 groupc
->state_mask
= state_mask
;
844 write_seqcount_end(&groupc
->seq
);
848 state_mask
= test_states(groupc
->tasks
, state_mask
);
851 * Since we care about lost potential, a memstall is FULL
852 * when there are no other working tasks, but also when
853 * the CPU is actively reclaiming and nothing productive
854 * could run even if it were runnable. So when the current
855 * task in a cgroup is in_memstall, the corresponding groupc
856 * on that cpu is in PSI_MEM_FULL state.
858 if (unlikely((state_mask
& PSI_ONCPU
) && cpu_curr(cpu
)->in_memstall
))
859 state_mask
|= (1 << PSI_MEM_FULL
);
861 record_times(groupc
, now
);
863 groupc
->state_mask
= state_mask
;
865 write_seqcount_end(&groupc
->seq
);
867 if (state_mask
& group
->rtpoll_states
)
868 psi_schedule_rtpoll_work(group
, 1, false);
870 if (wake_clock
&& !delayed_work_pending(&group
->avgs_work
))
871 schedule_delayed_work(&group
->avgs_work
, PSI_FREQ
);
874 static inline struct psi_group
*task_psi_group(struct task_struct
*task
)
876 #ifdef CONFIG_CGROUPS
877 if (static_branch_likely(&psi_cgroups_enabled
))
878 return cgroup_psi(task_dfl_cgroup(task
));
883 static void psi_flags_change(struct task_struct
*task
, int clear
, int set
)
885 if (((task
->psi_flags
& set
) ||
886 (task
->psi_flags
& clear
) != clear
) &&
888 printk_deferred(KERN_ERR
"psi: inconsistent task state! task=%d:%s cpu=%d psi_flags=%x clear=%x set=%x\n",
889 task
->pid
, task
->comm
, task_cpu(task
),
890 task
->psi_flags
, clear
, set
);
894 task
->psi_flags
&= ~clear
;
895 task
->psi_flags
|= set
;
898 void psi_task_change(struct task_struct
*task
, int clear
, int set
)
900 int cpu
= task_cpu(task
);
901 struct psi_group
*group
;
907 psi_flags_change(task
, clear
, set
);
909 now
= cpu_clock(cpu
);
911 group
= task_psi_group(task
);
913 psi_group_change(group
, cpu
, clear
, set
, now
, true);
914 } while ((group
= group
->parent
));
917 void psi_task_switch(struct task_struct
*prev
, struct task_struct
*next
,
920 struct psi_group
*group
, *common
= NULL
;
921 int cpu
= task_cpu(prev
);
922 u64 now
= cpu_clock(cpu
);
925 psi_flags_change(next
, 0, TSK_ONCPU
);
927 * Set TSK_ONCPU on @next's cgroups. If @next shares any
928 * ancestors with @prev, those will already have @prev's
929 * TSK_ONCPU bit set, and we can stop the iteration there.
931 group
= task_psi_group(next
);
933 if (per_cpu_ptr(group
->pcpu
, cpu
)->state_mask
&
939 psi_group_change(group
, cpu
, 0, TSK_ONCPU
, now
, true);
940 } while ((group
= group
->parent
));
944 int clear
= TSK_ONCPU
, set
= 0;
945 bool wake_clock
= true;
948 * When we're going to sleep, psi_dequeue() lets us
949 * handle TSK_RUNNING, TSK_MEMSTALL_RUNNING and
950 * TSK_IOWAIT here, where we can combine it with
951 * TSK_ONCPU and save walking common ancestors twice.
954 clear
|= TSK_RUNNING
;
955 if (prev
->in_memstall
)
956 clear
|= TSK_MEMSTALL_RUNNING
;
961 * Periodic aggregation shuts off if there is a period of no
962 * task changes, so we wake it back up if necessary. However,
963 * don't do this if the task change is the aggregation worker
964 * itself going to sleep, or we'll ping-pong forever.
966 if (unlikely((prev
->flags
& PF_WQ_WORKER
) &&
967 wq_worker_last_func(prev
) == psi_avgs_work
))
971 psi_flags_change(prev
, clear
, set
);
973 group
= task_psi_group(prev
);
977 psi_group_change(group
, cpu
, clear
, set
, now
, wake_clock
);
978 } while ((group
= group
->parent
));
981 * TSK_ONCPU is handled up to the common ancestor. If there are
982 * any other differences between the two tasks (e.g. prev goes
983 * to sleep, or only one task is memstall), finish propagating
984 * those differences all the way up to the root.
986 if ((prev
->psi_flags
^ next
->psi_flags
) & ~TSK_ONCPU
) {
988 for (; group
; group
= group
->parent
)
989 psi_group_change(group
, cpu
, clear
, set
, now
, wake_clock
);
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
/*
 * Charge the IRQ time accumulated on @rq since the last call to the
 * PSI_IRQ_FULL bucket of @curr's group and all of its ancestors.
 *
 * @prev: previously running task, may be NULL; nothing is accounted
 *        when it belongs to the same group as @curr
 *
 * The caller must hold @rq's lock.
 */
void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_struct *prev)
{
	int cpu = task_cpu(curr);
	struct psi_group *group;
	struct psi_group_cpu *groupc;
	u64 now, irq;
	s64 delta;

	if (static_branch_likely(&psi_disabled))
		return;

	if (!curr->pid)
		return;

	lockdep_assert_rq_held(rq);
	group = task_psi_group(curr);
	if (prev && task_psi_group(prev) == group)
		return;

	now = cpu_clock(cpu);
	irq = irq_time_read(cpu);
	delta = (s64)(irq - rq->psi_irq_time);
	/* Never feed a negative delta into the unsigned time buckets */
	if (delta < 0)
		return;
	rq->psi_irq_time = irq;

	do {
		if (!group->enabled)
			continue;

		groupc = per_cpu_ptr(group->pcpu, cpu);

		write_seqcount_begin(&groupc->seq);

		record_times(groupc, now);
		groupc->times[PSI_IRQ_FULL] += delta;

		write_seqcount_end(&groupc->seq);

		if (group->rtpoll_states & (1 << PSI_IRQ_FULL))
			psi_schedule_rtpoll_work(group, 1, false);
	} while ((group = group->parent));
}
#endif
1041 * psi_memstall_enter - mark the beginning of a memory stall section
1042 * @flags: flags to handle nested sections
1044 * Marks the calling task as being stalled due to a lack of memory,
1045 * such as waiting for a refault or performing reclaim.
1047 void psi_memstall_enter(unsigned long *flags
)
1052 if (static_branch_likely(&psi_disabled
))
1055 *flags
= current
->in_memstall
;
1059 * in_memstall setting & accounting needs to be atomic wrt
1060 * changes to the task's scheduling state, otherwise we can
1061 * race with CPU migration.
1063 rq
= this_rq_lock_irq(&rf
);
1065 current
->in_memstall
= 1;
1066 psi_task_change(current
, 0, TSK_MEMSTALL
| TSK_MEMSTALL_RUNNING
);
1068 rq_unlock_irq(rq
, &rf
);
1070 EXPORT_SYMBOL_GPL(psi_memstall_enter
);
1073 * psi_memstall_leave - mark the end of an memory stall section
1074 * @flags: flags to handle nested memdelay sections
1076 * Marks the calling task as no longer stalled due to lack of memory.
1078 void psi_memstall_leave(unsigned long *flags
)
1083 if (static_branch_likely(&psi_disabled
))
1089 * in_memstall clearing & accounting needs to be atomic wrt
1090 * changes to the task's scheduling state, otherwise we could
1091 * race with CPU migration.
1093 rq
= this_rq_lock_irq(&rf
);
1095 current
->in_memstall
= 0;
1096 psi_task_change(current
, TSK_MEMSTALL
| TSK_MEMSTALL_RUNNING
, 0);
1098 rq_unlock_irq(rq
, &rf
);
1100 EXPORT_SYMBOL_GPL(psi_memstall_leave
);
1102 #ifdef CONFIG_CGROUPS
1103 int psi_cgroup_alloc(struct cgroup
*cgroup
)
1105 if (!static_branch_likely(&psi_cgroups_enabled
))
1108 cgroup
->psi
= kzalloc(sizeof(struct psi_group
), GFP_KERNEL
);
1112 cgroup
->psi
->pcpu
= alloc_percpu(struct psi_group_cpu
);
1113 if (!cgroup
->psi
->pcpu
) {
1117 group_init(cgroup
->psi
);
1118 cgroup
->psi
->parent
= cgroup_psi(cgroup_parent(cgroup
));
1122 void psi_cgroup_free(struct cgroup
*cgroup
)
1124 if (!static_branch_likely(&psi_cgroups_enabled
))
1127 cancel_delayed_work_sync(&cgroup
->psi
->avgs_work
);
1128 free_percpu(cgroup
->psi
->pcpu
);
1129 /* All triggers must be removed by now */
1130 WARN_ONCE(cgroup
->psi
->rtpoll_states
, "psi: trigger leak\n");
1135 * cgroup_move_task - move task to a different cgroup
1137 * @to: the target css_set
1139 * Move task to a new cgroup and safely migrate its associated stall
1140 * state between the different groups.
1142 * This function acquires the task's rq lock to lock out concurrent
1143 * changes to the task's scheduling state and - in case the task is
1144 * running - concurrent changes to its stall state.
1146 void cgroup_move_task(struct task_struct
*task
, struct css_set
*to
)
1148 unsigned int task_flags
;
1152 if (!static_branch_likely(&psi_cgroups_enabled
)) {
1154 * Lame to do this here, but the scheduler cannot be locked
1155 * from the outside, so we move cgroups from inside sched/.
1157 rcu_assign_pointer(task
->cgroups
, to
);
1161 rq
= task_rq_lock(task
, &rf
);
1164 * We may race with schedule() dropping the rq lock between
1165 * deactivating prev and switching to next. Because the psi
1166 * updates from the deactivation are deferred to the switch
1167 * callback to save cgroup tree updates, the task's scheduling
1168 * state here is not coherent with its psi state:
1170 * schedule() cgroup_move_task()
1174 * psi_dequeue() // defers TSK_RUNNING & TSK_IOWAIT updates
1178 * psi_task_change() // old cgroup
1179 * task->cgroups = to
1180 * psi_task_change() // new cgroup
1183 * psi_sched_switch() // does deferred updates in new cgroup
1185 * Don't rely on the scheduling state. Use psi_flags instead.
1187 task_flags
= task
->psi_flags
;
1190 psi_task_change(task
, task_flags
, 0);
1192 /* See comment above */
1193 rcu_assign_pointer(task
->cgroups
, to
);
1196 psi_task_change(task
, 0, task_flags
);
1198 task_rq_unlock(rq
, task
, &rf
);
1201 void psi_cgroup_restart(struct psi_group
*group
)
1206 * After we disable psi_group->enabled, we don't actually
1207 * stop percpu tasks accounting in each psi_group_cpu,
1208 * instead only stop test_states() loop, record_times()
1209 * and averaging worker, see psi_group_change() for details.
1211 * When disable cgroup PSI, this function has nothing to sync
1212 * since cgroup pressure files are hidden and percpu psi_group_cpu
1213 * would see !psi_group->enabled and only do task accounting.
1215 * When re-enable cgroup PSI, this function use psi_group_change()
1216 * to get correct state mask from test_states() loop on tasks[],
1217 * and restart groupc->state_start from now, use .clear = .set = 0
1218 * here since no task status really changed.
1220 if (!group
->enabled
)
1223 for_each_possible_cpu(cpu
) {
1224 struct rq
*rq
= cpu_rq(cpu
);
1228 rq_lock_irq(rq
, &rf
);
1229 now
= cpu_clock(cpu
);
1230 psi_group_change(group
, cpu
, 0, 0, now
, true);
1231 rq_unlock_irq(rq
, &rf
);
1234 #endif /* CONFIG_CGROUPS */
1236 int psi_show(struct seq_file
*m
, struct psi_group
*group
, enum psi_res res
)
1238 bool only_full
= false;
1242 if (static_branch_likely(&psi_disabled
))
1245 /* Update averages before reporting them */
1246 mutex_lock(&group
->avgs_lock
);
1247 now
= sched_clock();
1248 collect_percpu_times(group
, PSI_AVGS
, NULL
);
1249 if (now
>= group
->avg_next_update
)
1250 group
->avg_next_update
= update_averages(group
, now
);
1251 mutex_unlock(&group
->avgs_lock
);
1253 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
1254 only_full
= res
== PSI_IRQ
;
1257 for (full
= 0; full
< 2 - only_full
; full
++) {
1258 unsigned long avg
[3] = { 0, };
1262 /* CPU FULL is undefined at the system level */
1263 if (!(group
== &psi_system
&& res
== PSI_CPU
&& full
)) {
1264 for (w
= 0; w
< 3; w
++)
1265 avg
[w
] = group
->avg
[res
* 2 + full
][w
];
1266 total
= div_u64(group
->total
[PSI_AVGS
][res
* 2 + full
],
1270 seq_printf(m
, "%s avg10=%lu.%02lu avg60=%lu.%02lu avg300=%lu.%02lu total=%llu\n",
1271 full
|| only_full
? "full" : "some",
1272 LOAD_INT(avg
[0]), LOAD_FRAC(avg
[0]),
1273 LOAD_INT(avg
[1]), LOAD_FRAC(avg
[1]),
1274 LOAD_INT(avg
[2]), LOAD_FRAC(avg
[2]),
1281 struct psi_trigger
*psi_trigger_create(struct psi_group
*group
, char *buf
,
1282 enum psi_res res
, struct file
*file
,
1283 struct kernfs_open_file
*of
)
1285 struct psi_trigger
*t
;
1286 enum psi_states state
;
1291 if (static_branch_likely(&psi_disabled
))
1292 return ERR_PTR(-EOPNOTSUPP
);
1295 * Checking the privilege here on file->f_cred implies that a privileged user
1296 * could open the file and delegate the write to an unprivileged one.
1298 privileged
= cap_raised(file
->f_cred
->cap_effective
, CAP_SYS_RESOURCE
);
1300 if (sscanf(buf
, "some %u %u", &threshold_us
, &window_us
) == 2)
1301 state
= PSI_IO_SOME
+ res
* 2;
1302 else if (sscanf(buf
, "full %u %u", &threshold_us
, &window_us
) == 2)
1303 state
= PSI_IO_FULL
+ res
* 2;
1305 return ERR_PTR(-EINVAL
);
1307 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
1308 if (res
== PSI_IRQ
&& --state
!= PSI_IRQ_FULL
)
1309 return ERR_PTR(-EINVAL
);
1312 if (state
>= PSI_NONIDLE
)
1313 return ERR_PTR(-EINVAL
);
1315 if (window_us
== 0 || window_us
> WINDOW_MAX_US
)
1316 return ERR_PTR(-EINVAL
);
1319 * Unprivileged users can only use 2s windows so that averages aggregation
1320 * work is used, and no RT threads need to be spawned.
1322 if (!privileged
&& window_us
% 2000000)
1323 return ERR_PTR(-EINVAL
);
1325 /* Check threshold */
1326 if (threshold_us
== 0 || threshold_us
> window_us
)
1327 return ERR_PTR(-EINVAL
);
1329 t
= kmalloc(sizeof(*t
), GFP_KERNEL
);
1331 return ERR_PTR(-ENOMEM
);
1335 t
->threshold
= threshold_us
* NSEC_PER_USEC
;
1336 t
->win
.size
= window_us
* NSEC_PER_USEC
;
1337 window_reset(&t
->win
, sched_clock(),
1338 group
->total
[PSI_POLL
][t
->state
], 0);
1341 t
->last_event_time
= 0;
1344 init_waitqueue_head(&t
->event_wait
);
1345 t
->pending_event
= false;
1346 t
->aggregator
= privileged
? PSI_POLL
: PSI_AVGS
;
1349 mutex_lock(&group
->rtpoll_trigger_lock
);
1351 if (!rcu_access_pointer(group
->rtpoll_task
)) {
1352 struct task_struct
*task
;
1354 task
= kthread_create(psi_rtpoll_worker
, group
, "psimon");
1357 mutex_unlock(&group
->rtpoll_trigger_lock
);
1358 return ERR_CAST(task
);
1360 atomic_set(&group
->rtpoll_wakeup
, 0);
1361 wake_up_process(task
);
1362 rcu_assign_pointer(group
->rtpoll_task
, task
);
1365 list_add(&t
->node
, &group
->rtpoll_triggers
);
1366 group
->rtpoll_min_period
= min(group
->rtpoll_min_period
,
1367 div_u64(t
->win
.size
, UPDATES_PER_WINDOW
));
1368 group
->rtpoll_nr_triggers
[t
->state
]++;
1369 group
->rtpoll_states
|= (1 << t
->state
);
1371 mutex_unlock(&group
->rtpoll_trigger_lock
);
1373 mutex_lock(&group
->avgs_lock
);
1375 list_add(&t
->node
, &group
->avg_triggers
);
1376 group
->avg_nr_triggers
[t
->state
]++;
1378 mutex_unlock(&group
->avgs_lock
);
1383 void psi_trigger_destroy(struct psi_trigger
*t
)
1385 struct psi_group
*group
;
1386 struct task_struct
*task_to_destroy
= NULL
;
1389 * We do not check psi_disabled since it might have been disabled after
1390 * the trigger got created.
1397 * Wakeup waiters to stop polling and clear the queue to prevent it from
1398 * being accessed later. Can happen if cgroup is deleted from under a
1402 kernfs_notify(t
->of
->kn
);
1404 wake_up_interruptible(&t
->event_wait
);
1406 if (t
->aggregator
== PSI_AVGS
) {
1407 mutex_lock(&group
->avgs_lock
);
1408 if (!list_empty(&t
->node
)) {
1410 group
->avg_nr_triggers
[t
->state
]--;
1412 mutex_unlock(&group
->avgs_lock
);
1414 mutex_lock(&group
->rtpoll_trigger_lock
);
1415 if (!list_empty(&t
->node
)) {
1416 struct psi_trigger
*tmp
;
1417 u64 period
= ULLONG_MAX
;
1420 group
->rtpoll_nr_triggers
[t
->state
]--;
1421 if (!group
->rtpoll_nr_triggers
[t
->state
])
1422 group
->rtpoll_states
&= ~(1 << t
->state
);
1424 * Reset min update period for the remaining triggers
1425 * iff the destroying trigger had the min window size.
1427 if (group
->rtpoll_min_period
== div_u64(t
->win
.size
, UPDATES_PER_WINDOW
)) {
1428 list_for_each_entry(tmp
, &group
->rtpoll_triggers
, node
)
1429 period
= min(period
, div_u64(tmp
->win
.size
,
1430 UPDATES_PER_WINDOW
));
1431 group
->rtpoll_min_period
= period
;
1433 /* Destroy rtpoll_task when the last trigger is destroyed */
1434 if (group
->rtpoll_states
== 0) {
1435 group
->rtpoll_until
= 0;
1436 task_to_destroy
= rcu_dereference_protected(
1438 lockdep_is_held(&group
->rtpoll_trigger_lock
));
1439 rcu_assign_pointer(group
->rtpoll_task
, NULL
);
1440 del_timer(&group
->rtpoll_timer
);
1443 mutex_unlock(&group
->rtpoll_trigger_lock
);
1447 * Wait for psi_schedule_rtpoll_work RCU to complete its read-side
1448 * critical section before destroying the trigger and optionally the
1453 * Stop kthread 'psimon' after releasing rtpoll_trigger_lock to prevent
1454 * a deadlock while waiting for psi_rtpoll_work to acquire
1455 * rtpoll_trigger_lock
1457 if (task_to_destroy
) {
1459 * After the RCU grace period has expired, the worker
1460 * can no longer be found through group->rtpoll_task.
1462 kthread_stop(task_to_destroy
);
1463 atomic_set(&group
->rtpoll_scheduled
, 0);
1468 __poll_t
psi_trigger_poll(void **trigger_ptr
,
1469 struct file
*file
, poll_table
*wait
)
1471 __poll_t ret
= DEFAULT_POLLMASK
;
1472 struct psi_trigger
*t
;
1474 if (static_branch_likely(&psi_disabled
))
1475 return DEFAULT_POLLMASK
| EPOLLERR
| EPOLLPRI
;
1477 t
= smp_load_acquire(trigger_ptr
);
1479 return DEFAULT_POLLMASK
| EPOLLERR
| EPOLLPRI
;
1482 kernfs_generic_poll(t
->of
, wait
);
1484 poll_wait(file
, &t
->event_wait
, wait
);
1486 if (cmpxchg(&t
->event
, 1, 0) == 1)
1492 #ifdef CONFIG_PROC_FS
1493 static int psi_io_show(struct seq_file
*m
, void *v
)
1495 return psi_show(m
, &psi_system
, PSI_IO
);
1498 static int psi_memory_show(struct seq_file
*m
, void *v
)
1500 return psi_show(m
, &psi_system
, PSI_MEM
);
1503 static int psi_cpu_show(struct seq_file
*m
, void *v
)
1505 return psi_show(m
, &psi_system
, PSI_CPU
);
1508 static int psi_io_open(struct inode
*inode
, struct file
*file
)
1510 return single_open(file
, psi_io_show
, NULL
);
1513 static int psi_memory_open(struct inode
*inode
, struct file
*file
)
1515 return single_open(file
, psi_memory_show
, NULL
);
1518 static int psi_cpu_open(struct inode
*inode
, struct file
*file
)
1520 return single_open(file
, psi_cpu_show
, NULL
);
1523 static ssize_t
psi_write(struct file
*file
, const char __user
*user_buf
,
1524 size_t nbytes
, enum psi_res res
)
1528 struct seq_file
*seq
;
1529 struct psi_trigger
*new;
1531 if (static_branch_likely(&psi_disabled
))
1537 buf_size
= min(nbytes
, sizeof(buf
));
1538 if (copy_from_user(buf
, user_buf
, buf_size
))
1541 buf
[buf_size
- 1] = '\0';
1543 seq
= file
->private_data
;
1545 /* Take seq->lock to protect seq->private from concurrent writes */
1546 mutex_lock(&seq
->lock
);
1548 /* Allow only one trigger per file descriptor */
1550 mutex_unlock(&seq
->lock
);
1554 new = psi_trigger_create(&psi_system
, buf
, res
, file
, NULL
);
1556 mutex_unlock(&seq
->lock
);
1557 return PTR_ERR(new);
1560 smp_store_release(&seq
->private, new);
1561 mutex_unlock(&seq
->lock
);
1566 static ssize_t
psi_io_write(struct file
*file
, const char __user
*user_buf
,
1567 size_t nbytes
, loff_t
*ppos
)
1569 return psi_write(file
, user_buf
, nbytes
, PSI_IO
);
1572 static ssize_t
psi_memory_write(struct file
*file
, const char __user
*user_buf
,
1573 size_t nbytes
, loff_t
*ppos
)
1575 return psi_write(file
, user_buf
, nbytes
, PSI_MEM
);
1578 static ssize_t
psi_cpu_write(struct file
*file
, const char __user
*user_buf
,
1579 size_t nbytes
, loff_t
*ppos
)
1581 return psi_write(file
, user_buf
, nbytes
, PSI_CPU
);
1584 static __poll_t
psi_fop_poll(struct file
*file
, poll_table
*wait
)
1586 struct seq_file
*seq
= file
->private_data
;
1588 return psi_trigger_poll(&seq
->private, file
, wait
);
1591 static int psi_fop_release(struct inode
*inode
, struct file
*file
)
1593 struct seq_file
*seq
= file
->private_data
;
1595 psi_trigger_destroy(seq
->private);
1596 return single_release(inode
, file
);
1599 static const struct proc_ops psi_io_proc_ops
= {
1600 .proc_open
= psi_io_open
,
1601 .proc_read
= seq_read
,
1602 .proc_lseek
= seq_lseek
,
1603 .proc_write
= psi_io_write
,
1604 .proc_poll
= psi_fop_poll
,
1605 .proc_release
= psi_fop_release
,
1608 static const struct proc_ops psi_memory_proc_ops
= {
1609 .proc_open
= psi_memory_open
,
1610 .proc_read
= seq_read
,
1611 .proc_lseek
= seq_lseek
,
1612 .proc_write
= psi_memory_write
,
1613 .proc_poll
= psi_fop_poll
,
1614 .proc_release
= psi_fop_release
,
1617 static const struct proc_ops psi_cpu_proc_ops
= {
1618 .proc_open
= psi_cpu_open
,
1619 .proc_read
= seq_read
,
1620 .proc_lseek
= seq_lseek
,
1621 .proc_write
= psi_cpu_write
,
1622 .proc_poll
= psi_fop_poll
,
1623 .proc_release
= psi_fop_release
,
1626 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
1627 static int psi_irq_show(struct seq_file
*m
, void *v
)
1629 return psi_show(m
, &psi_system
, PSI_IRQ
);
1632 static int psi_irq_open(struct inode
*inode
, struct file
*file
)
1634 return single_open(file
, psi_irq_show
, NULL
);
1637 static ssize_t
psi_irq_write(struct file
*file
, const char __user
*user_buf
,
1638 size_t nbytes
, loff_t
*ppos
)
1640 return psi_write(file
, user_buf
, nbytes
, PSI_IRQ
);
1643 static const struct proc_ops psi_irq_proc_ops
= {
1644 .proc_open
= psi_irq_open
,
1645 .proc_read
= seq_read
,
1646 .proc_lseek
= seq_lseek
,
1647 .proc_write
= psi_irq_write
,
1648 .proc_poll
= psi_fop_poll
,
1649 .proc_release
= psi_fop_release
,
1653 static int __init
psi_proc_init(void)
1656 proc_mkdir("pressure", NULL
);
1657 proc_create("pressure/io", 0666, NULL
, &psi_io_proc_ops
);
1658 proc_create("pressure/memory", 0666, NULL
, &psi_memory_proc_ops
);
1659 proc_create("pressure/cpu", 0666, NULL
, &psi_cpu_proc_ops
);
1660 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
1661 proc_create("pressure/irq", 0666, NULL
, &psi_irq_proc_ops
);
1666 module_init(psi_proc_init
);
1668 #endif /* CONFIG_PROC_FS */