1 // SPDX-License-Identifier: GPL-2.0
3 * Kernel internal timers
5 * Copyright (C) 1991, 1992 Linus Torvalds
7 * 1997-01-28 Modified by Finn Arne Gangstad to make timers scale better.
9 * 1997-09-10 Updated NTP code according to technical memorandum Jan '96
10 * "A Kernel Model for Precision Timekeeping" by Dave Mills
11 * 1998-12-24 Fixed a xtime SMP race (we need the xtime_lock rw spinlock to
12 * serialize accesses to xtime/lost_ticks).
13 * Copyright (C) 1998 Andrea Arcangeli
14 * 1999-03-10 Improved NTP compatibility by Ulrich Windl
15 * 2002-05-31 Move sys_sysinfo here and make its locking sane, Robert Love
16 * 2000-10-05 Implemented scalable SMP per-CPU timer handling.
17 * Copyright (C) 2000, 2001, 2002 Ingo Molnar
18 * Designed by David S. Miller, Alexey Kuznetsov and Ingo Molnar
21 #include <linux/kernel_stat.h>
22 #include <linux/export.h>
23 #include <linux/interrupt.h>
24 #include <linux/percpu.h>
25 #include <linux/init.h>
27 #include <linux/swap.h>
28 #include <linux/pid_namespace.h>
29 #include <linux/notifier.h>
30 #include <linux/thread_info.h>
31 #include <linux/time.h>
32 #include <linux/jiffies.h>
33 #include <linux/posix-timers.h>
34 #include <linux/cpu.h>
35 #include <linux/syscalls.h>
36 #include <linux/delay.h>
37 #include <linux/tick.h>
38 #include <linux/kallsyms.h>
39 #include <linux/irq_work.h>
40 #include <linux/sched/sysctl.h>
41 #include <linux/sched/nohz.h>
42 #include <linux/sched/debug.h>
43 #include <linux/slab.h>
44 #include <linux/compat.h>
45 #include <linux/random.h>
46 #include <linux/sysctl.h>
48 #include <linux/uaccess.h>
49 #include <asm/unistd.h>
50 #include <asm/div64.h>
51 #include <asm/timex.h>
54 #include "tick-internal.h"
55 #include "timer_migration.h"
57 #define CREATE_TRACE_POINTS
58 #include <trace/events/timer.h>
60 __visible u64 jiffies_64 __cacheline_aligned_in_smp
= INITIAL_JIFFIES
;
62 EXPORT_SYMBOL(jiffies_64
);
65 * The timer wheel has LVL_DEPTH array levels. Each level provides an array of
66 * LVL_SIZE buckets. Each level is driven by its own clock and therefore each
67 * level has a different granularity.
69 * The level granularity is: LVL_CLK_DIV ^ level
70 * The level clock frequency is: HZ / (LVL_CLK_DIV ^ level)
72 * The array level of a newly armed timer depends on the relative expiry
73 * time. The farther the expiry time is away the higher the array level and
74 * therefore the granularity becomes.
76 * Contrary to the original timer wheel implementation, which aims for 'exact'
77 * expiry of the timers, this implementation removes the need for recascading
78 * the timers into the lower array levels. The previous 'classic' timer wheel
79 * implementation of the kernel already violated the 'exact' expiry by adding
80 * slack to the expiry time to provide batched expiration. The granularity
81 * levels provide implicit batching.
83 * This is an optimization of the original timer wheel implementation for the
84 * majority of the timer wheel use cases: timeouts. The vast majority of
85 * timeout timers (networking, disk I/O ...) are canceled before expiry. If
86 * the timeout expires it indicates that normal operation is disturbed, so it
87 * does not matter much whether the timeout comes with a slight delay.
89 * The only exception to this are networking timers with a small expiry
90 * time. They rely on the granularity. Those fit into the first wheel level,
91 * which has HZ granularity.
93 * We don't have cascading anymore. Timers with an expiry time above the
94 * capacity of the last wheel level are force expired at the maximum timeout
95 * value of the last wheel level. From data sampling we know that the maximum
96 * value observed is 5 days (network connection tracking), so this should not
99 * The currently chosen array constants values are a good compromise between
100 * array size and granularity.
102 * This results in the following granularity and range levels:
105 * Level Offset Granularity Range
106 * 0 0 1 ms 0 ms - 63 ms
107 * 1 64 8 ms 64 ms - 511 ms
108 * 2 128 64 ms 512 ms - 4095 ms (512ms - ~4s)
109 * 3 192 512 ms 4096 ms - 32767 ms (~4s - ~32s)
110 * 4 256 4096 ms (~4s) 32768 ms - 262143 ms (~32s - ~4m)
111 * 5 320 32768 ms (~32s) 262144 ms - 2097151 ms (~4m - ~34m)
112 * 6 384 262144 ms (~4m) 2097152 ms - 16777215 ms (~34m - ~4h)
113 * 7 448 2097152 ms (~34m) 16777216 ms - 134217727 ms (~4h - ~1d)
114 * 8 512 16777216 ms (~4h) 134217728 ms - 1073741822 ms (~1d - ~12d)
117 * Level Offset Granularity Range
118 * 0 0 3 ms 0 ms - 210 ms
119 * 1 64 26 ms 213 ms - 1703 ms (213ms - ~1s)
120 * 2 128 213 ms 1706 ms - 13650 ms (~1s - ~13s)
121 * 3 192 1706 ms (~1s) 13653 ms - 109223 ms (~13s - ~1m)
122 * 4 256 13653 ms (~13s) 109226 ms - 873810 ms (~1m - ~14m)
123 * 5 320 109226 ms (~1m) 873813 ms - 6990503 ms (~14m - ~1h)
124 * 6 384 873813 ms (~14m) 6990506 ms - 55924050 ms (~1h - ~15h)
125 * 7 448 6990506 ms (~1h) 55924053 ms - 447392423 ms (~15h - ~5d)
126 * 8 512 55924053 ms (~15h) 447392426 ms - 3579139406 ms (~5d - ~41d)
129 * Level Offset Granularity Range
130 * 0 0 4 ms 0 ms - 255 ms
131 * 1 64 32 ms 256 ms - 2047 ms (256ms - ~2s)
132 * 2 128 256 ms 2048 ms - 16383 ms (~2s - ~16s)
133 * 3 192 2048 ms (~2s) 16384 ms - 131071 ms (~16s - ~2m)
134 * 4 256 16384 ms (~16s) 131072 ms - 1048575 ms (~2m - ~17m)
135 * 5 320 131072 ms (~2m) 1048576 ms - 8388607 ms (~17m - ~2h)
136 * 6 384 1048576 ms (~17m) 8388608 ms - 67108863 ms (~2h - ~18h)
137 * 7 448 8388608 ms (~2h) 67108864 ms - 536870911 ms (~18h - ~6d)
138 * 8 512 67108864 ms (~18h) 536870912 ms - 4294967288 ms (~6d - ~49d)
141 * Level Offset Granularity Range
142 * 0 0 10 ms 0 ms - 630 ms
143 * 1 64 80 ms 640 ms - 5110 ms (640ms - ~5s)
144 * 2 128 640 ms 5120 ms - 40950 ms (~5s - ~40s)
145 * 3 192 5120 ms (~5s) 40960 ms - 327670 ms (~40s - ~5m)
146 * 4 256 40960 ms (~40s) 327680 ms - 2621430 ms (~5m - ~43m)
147 * 5 320 327680 ms (~5m) 2621440 ms - 20971510 ms (~43m - ~5h)
148 * 6 384 2621440 ms (~43m) 20971520 ms - 167772150 ms (~5h - ~1d)
149 * 7 448 20971520 ms (~5h) 167772160 ms - 1342177270 ms (~1d - ~15d)
152 /* Clock divisor for the next level */
153 #define LVL_CLK_SHIFT 3
154 #define LVL_CLK_DIV (1UL << LVL_CLK_SHIFT)
155 #define LVL_CLK_MASK (LVL_CLK_DIV - 1)
156 #define LVL_SHIFT(n) ((n) * LVL_CLK_SHIFT)
157 #define LVL_GRAN(n) (1UL << LVL_SHIFT(n))
160 * The time start value for each level to select the bucket at enqueue
161 * time. We start from the last possible delta of the previous level
162 * so that we can later add an extra LVL_GRAN(n) to n (see calc_index()).
164 #define LVL_START(n) ((LVL_SIZE - 1) << (((n) - 1) * LVL_CLK_SHIFT))
166 /* Size of each clock level */
168 #define LVL_SIZE (1UL << LVL_BITS)
169 #define LVL_MASK (LVL_SIZE - 1)
170 #define LVL_OFFS(n) ((n) * LVL_SIZE)
179 /* The cutoff (max. capacity of the wheel) */
180 #define WHEEL_TIMEOUT_CUTOFF (LVL_START(LVL_DEPTH))
181 #define WHEEL_TIMEOUT_MAX (WHEEL_TIMEOUT_CUTOFF - LVL_GRAN(LVL_DEPTH - 1))
184 * The resulting wheel size. If NOHZ is configured we allocate two
185 * wheels so we have a separate storage for the deferrable timers.
187 #define WHEEL_SIZE (LVL_SIZE * LVL_DEPTH)
189 #ifdef CONFIG_NO_HZ_COMMON
191 * If multiple bases need to be locked, use the base ordering for lock
192 * nesting, i.e. lowest number first.
195 # define BASE_LOCAL 0
196 # define BASE_GLOBAL 1
200 # define BASE_LOCAL 0
201 # define BASE_GLOBAL 0
206 * struct timer_base - Per CPU timer base (number of base depends on config)
207 * @lock: Lock protecting the timer_base
208 * @running_timer: When expiring timers, the lock is dropped. To make
209 * sure not to race against deleting/modifying a
210 * currently running timer, the pointer is set to the
211 * timer, which expires at the moment. If no timer is
212 * running, the pointer is NULL.
213 * @expiry_lock: PREEMPT_RT only: Lock is taken in softirq around
214 * timer expiry callback execution and when trying to
215 * delete a running timer and it wasn't successful in
216 * the first attempt. It prevents priority inversion
217 * when callback was preempted on a remote CPU and a
218 * caller tries to delete the running timer. It also
219 * prevents a live lock, when the task which tries to
220 * delete a timer preempted the softirq thread which
221 * is running the timer callback function.
222 * @timer_waiters: PREEMPT_RT only: Tells, if there is a waiter
223 * waiting for the end of the timer callback function
225 * @clk: clock of the timer base; is updated before enqueue
226 * of a timer; during expiry, it is 1 offset ahead of
227 * jiffies to avoid endless requeuing to current
229 * @next_expiry: expiry value of the first timer; it is updated when
230 * finding the next timer and during enqueue; the
231 * value is not valid, when next_expiry_recalc is set
232 * @cpu: Number of CPU the timer base belongs to
233 * @next_expiry_recalc: States, whether a recalculation of next_expiry is
234 * required. Value is set true, when a timer was
236 * @is_idle: Is set, when timer_base is idle. It is triggered by NOHZ
237 * code. This state is only used in standard
238 * base. Deferrable timers, which are enqueued remotely
239 * never wake up an idle CPU. So no matter of supporting it
241 * @timers_pending: Is set, when a timer is pending in the base. It is only
242 * reliable when next_expiry_recalc is not set.
243 * @pending_map: bitmap of the timer wheel; each bit reflects a
244 * bucket of the wheel. When a bit is set, at least a
245 * single timer is enqueued in the related bucket.
246 * @vectors: Array of lists; Each array member reflects a bucket
247 * of the timer wheel. The list contains all timers
248 * which are enqueued into a specific bucket.
252 struct timer_list
*running_timer
;
253 #ifdef CONFIG_PREEMPT_RT
254 spinlock_t expiry_lock
;
255 atomic_t timer_waiters
;
258 unsigned long next_expiry
;
260 bool next_expiry_recalc
;
263 DECLARE_BITMAP(pending_map
, WHEEL_SIZE
);
264 struct hlist_head vectors
[WHEEL_SIZE
];
265 } ____cacheline_aligned
;
267 static DEFINE_PER_CPU(struct timer_base
, timer_bases
[NR_BASES
]);
269 #ifdef CONFIG_NO_HZ_COMMON
271 static DEFINE_STATIC_KEY_FALSE(timers_nohz_active
);
272 static DEFINE_MUTEX(timer_keys_mutex
);
274 static void timer_update_keys(struct work_struct
*work
);
275 static DECLARE_WORK(timer_update_work
, timer_update_keys
);
278 static unsigned int sysctl_timer_migration
= 1;
280 DEFINE_STATIC_KEY_FALSE(timers_migration_enabled
);
282 static void timers_update_migration(void)
284 if (sysctl_timer_migration
&& tick_nohz_active
)
285 static_branch_enable(&timers_migration_enabled
);
287 static_branch_disable(&timers_migration_enabled
);
291 static int timer_migration_handler(const struct ctl_table
*table
, int write
,
292 void *buffer
, size_t *lenp
, loff_t
*ppos
)
296 mutex_lock(&timer_keys_mutex
);
297 ret
= proc_dointvec_minmax(table
, write
, buffer
, lenp
, ppos
);
299 timers_update_migration();
300 mutex_unlock(&timer_keys_mutex
);
304 static struct ctl_table timer_sysctl
[] = {
306 .procname
= "timer_migration",
307 .data
= &sysctl_timer_migration
,
308 .maxlen
= sizeof(unsigned int),
310 .proc_handler
= timer_migration_handler
,
311 .extra1
= SYSCTL_ZERO
,
312 .extra2
= SYSCTL_ONE
,
316 static int __init
timer_sysctl_init(void)
318 register_sysctl("kernel", timer_sysctl
);
321 device_initcall(timer_sysctl_init
);
322 #endif /* CONFIG_SYSCTL */
323 #else /* CONFIG_SMP */
324 static inline void timers_update_migration(void) { }
325 #endif /* !CONFIG_SMP */
327 static void timer_update_keys(struct work_struct
*work
)
329 mutex_lock(&timer_keys_mutex
);
330 timers_update_migration();
331 static_branch_enable(&timers_nohz_active
);
332 mutex_unlock(&timer_keys_mutex
);
335 void timers_update_nohz(void)
337 schedule_work(&timer_update_work
);
340 static inline bool is_timers_nohz_active(void)
342 return static_branch_unlikely(&timers_nohz_active
);
345 static inline bool is_timers_nohz_active(void) { return false; }
346 #endif /* NO_HZ_COMMON */
348 static unsigned long round_jiffies_common(unsigned long j
, int cpu
,
352 unsigned long original
= j
;
355 * We don't want all cpus firing their timers at once hitting the
356 * same lock or cachelines, so we skew each extra cpu with an extra
357 * 3 jiffies. This 3 jiffies came originally from the mm/ code which
359 * The skew is done by adding 3*cpunr, then round, then subtract this
360 * extra offset again.
367 * If the target jiffy is just after a whole second (which can happen
368 * due to delays of the timer irq, long irq off times etc etc) then
369 * we should round down to the whole second, not up. Use 1/4th second
370 * as cutoff for this rounding as an extreme upper bound for this.
371 * But never round down if @force_up is set.
373 if (rem
< HZ
/4 && !force_up
) /* round down */
378 /* now that we have rounded, subtract the extra skew again */
382 * Make sure j is still in the future. Otherwise return the
385 return time_is_after_jiffies(j
) ? j
: original
;
389 * __round_jiffies - function to round jiffies to a full second
390 * @j: the time in (absolute) jiffies that should be rounded
391 * @cpu: the processor number on which the timeout will happen
393 * __round_jiffies() rounds an absolute time in the future (in jiffies)
394 * up or down to (approximately) full seconds. This is useful for timers
395 * for which the exact time they fire does not matter too much, as long as
396 * they fire approximately every X seconds.
398 * By rounding these timers to whole seconds, all such timers will fire
399 * at the same time, rather than at various times spread out. The goal
400 * of this is to have the CPU wake up less, which saves power.
402 * The exact rounding is skewed for each processor to avoid all
403 * processors firing at the exact same time, which could lead
404 * to lock contention or spurious cache line bouncing.
406 * The return value is the rounded version of the @j parameter.
408 unsigned long __round_jiffies(unsigned long j
, int cpu
)
410 return round_jiffies_common(j
, cpu
, false);
412 EXPORT_SYMBOL_GPL(__round_jiffies
);
415 * __round_jiffies_relative - function to round jiffies to a full second
416 * @j: the time in (relative) jiffies that should be rounded
417 * @cpu: the processor number on which the timeout will happen
419 * __round_jiffies_relative() rounds a time delta in the future (in jiffies)
420 * up or down to (approximately) full seconds. This is useful for timers
421 * for which the exact time they fire does not matter too much, as long as
422 * they fire approximately every X seconds.
424 * By rounding these timers to whole seconds, all such timers will fire
425 * at the same time, rather than at various times spread out. The goal
426 * of this is to have the CPU wake up less, which saves power.
428 * The exact rounding is skewed for each processor to avoid all
429 * processors firing at the exact same time, which could lead
430 * to lock contention or spurious cache line bouncing.
432 * The return value is the rounded version of the @j parameter.
434 unsigned long __round_jiffies_relative(unsigned long j
, int cpu
)
436 unsigned long j0
= jiffies
;
438 /* Use j0 because jiffies might change while we run */
439 return round_jiffies_common(j
+ j0
, cpu
, false) - j0
;
441 EXPORT_SYMBOL_GPL(__round_jiffies_relative
);
444 * round_jiffies - function to round jiffies to a full second
445 * @j: the time in (absolute) jiffies that should be rounded
447 * round_jiffies() rounds an absolute time in the future (in jiffies)
448 * up or down to (approximately) full seconds. This is useful for timers
449 * for which the exact time they fire does not matter too much, as long as
450 * they fire approximately every X seconds.
452 * By rounding these timers to whole seconds, all such timers will fire
453 * at the same time, rather than at various times spread out. The goal
454 * of this is to have the CPU wake up less, which saves power.
456 * The return value is the rounded version of the @j parameter.
458 unsigned long round_jiffies(unsigned long j
)
460 return round_jiffies_common(j
, raw_smp_processor_id(), false);
462 EXPORT_SYMBOL_GPL(round_jiffies
);
465 * round_jiffies_relative - function to round jiffies to a full second
466 * @j: the time in (relative) jiffies that should be rounded
468 * round_jiffies_relative() rounds a time delta in the future (in jiffies)
469 * up or down to (approximately) full seconds. This is useful for timers
470 * for which the exact time they fire does not matter too much, as long as
471 * they fire approximately every X seconds.
473 * By rounding these timers to whole seconds, all such timers will fire
474 * at the same time, rather than at various times spread out. The goal
475 * of this is to have the CPU wake up less, which saves power.
477 * The return value is the rounded version of the @j parameter.
unsigned long round_jiffies_relative(unsigned long j)
{
	/* Delegate to the per-CPU variant with the current CPU id. */
	int cpu = raw_smp_processor_id();

	return __round_jiffies_relative(j, cpu);
}
483 EXPORT_SYMBOL_GPL(round_jiffies_relative
);
486 * __round_jiffies_up - function to round jiffies up to a full second
487 * @j: the time in (absolute) jiffies that should be rounded
488 * @cpu: the processor number on which the timeout will happen
490 * This is the same as __round_jiffies() except that it will never
491 * round down. This is useful for timeouts for which the exact time
492 * of firing does not matter too much, as long as they don't fire too
495 unsigned long __round_jiffies_up(unsigned long j
, int cpu
)
497 return round_jiffies_common(j
, cpu
, true);
499 EXPORT_SYMBOL_GPL(__round_jiffies_up
);
502 * __round_jiffies_up_relative - function to round jiffies up to a full second
503 * @j: the time in (relative) jiffies that should be rounded
504 * @cpu: the processor number on which the timeout will happen
506 * This is the same as __round_jiffies_relative() except that it will never
507 * round down. This is useful for timeouts for which the exact time
508 * of firing does not matter too much, as long as they don't fire too
511 unsigned long __round_jiffies_up_relative(unsigned long j
, int cpu
)
513 unsigned long j0
= jiffies
;
515 /* Use j0 because jiffies might change while we run */
516 return round_jiffies_common(j
+ j0
, cpu
, true) - j0
;
518 EXPORT_SYMBOL_GPL(__round_jiffies_up_relative
);
521 * round_jiffies_up - function to round jiffies up to a full second
522 * @j: the time in (absolute) jiffies that should be rounded
524 * This is the same as round_jiffies() except that it will never
525 * round down. This is useful for timeouts for which the exact time
526 * of firing does not matter too much, as long as they don't fire too
529 unsigned long round_jiffies_up(unsigned long j
)
531 return round_jiffies_common(j
, raw_smp_processor_id(), true);
533 EXPORT_SYMBOL_GPL(round_jiffies_up
);
536 * round_jiffies_up_relative - function to round jiffies up to a full second
537 * @j: the time in (relative) jiffies that should be rounded
539 * This is the same as round_jiffies_relative() except that it will never
540 * round down. This is useful for timeouts for which the exact time
541 * of firing does not matter too much, as long as they don't fire too
unsigned long round_jiffies_up_relative(unsigned long j)
{
	/* Delegate to the per-CPU variant with the current CPU id. */
	int cpu = raw_smp_processor_id();

	return __round_jiffies_up_relative(j, cpu);
}
548 EXPORT_SYMBOL_GPL(round_jiffies_up_relative
);
551 static inline unsigned int timer_get_idx(struct timer_list
*timer
)
553 return (timer
->flags
& TIMER_ARRAYMASK
) >> TIMER_ARRAYSHIFT
;
556 static inline void timer_set_idx(struct timer_list
*timer
, unsigned int idx
)
558 timer
->flags
= (timer
->flags
& ~TIMER_ARRAYMASK
) |
559 idx
<< TIMER_ARRAYSHIFT
;
563 * Helper function to calculate the array index for a given expiry
566 static inline unsigned calc_index(unsigned long expires
, unsigned lvl
,
567 unsigned long *bucket_expiry
)
571 * The timer wheel has to guarantee that a timer does not fire
572 * early. Early expiry can happen due to:
573 * - Timer is armed at the edge of a tick
574 * - Truncation of the expiry time in the outer wheel levels
576 * Round up with level granularity to prevent this.
578 expires
= (expires
>> LVL_SHIFT(lvl
)) + 1;
579 *bucket_expiry
= expires
<< LVL_SHIFT(lvl
);
580 return LVL_OFFS(lvl
) + (expires
& LVL_MASK
);
583 static int calc_wheel_index(unsigned long expires
, unsigned long clk
,
584 unsigned long *bucket_expiry
)
586 unsigned long delta
= expires
- clk
;
589 if (delta
< LVL_START(1)) {
590 idx
= calc_index(expires
, 0, bucket_expiry
);
591 } else if (delta
< LVL_START(2)) {
592 idx
= calc_index(expires
, 1, bucket_expiry
);
593 } else if (delta
< LVL_START(3)) {
594 idx
= calc_index(expires
, 2, bucket_expiry
);
595 } else if (delta
< LVL_START(4)) {
596 idx
= calc_index(expires
, 3, bucket_expiry
);
597 } else if (delta
< LVL_START(5)) {
598 idx
= calc_index(expires
, 4, bucket_expiry
);
599 } else if (delta
< LVL_START(6)) {
600 idx
= calc_index(expires
, 5, bucket_expiry
);
601 } else if (delta
< LVL_START(7)) {
602 idx
= calc_index(expires
, 6, bucket_expiry
);
603 } else if (LVL_DEPTH
> 8 && delta
< LVL_START(8)) {
604 idx
= calc_index(expires
, 7, bucket_expiry
);
605 } else if ((long) delta
< 0) {
606 idx
= clk
& LVL_MASK
;
607 *bucket_expiry
= clk
;
610 * Force expire obscene large timeouts to expire at the
611 * capacity limit of the wheel.
613 if (delta
>= WHEEL_TIMEOUT_CUTOFF
)
614 expires
= clk
+ WHEEL_TIMEOUT_MAX
;
616 idx
= calc_index(expires
, LVL_DEPTH
- 1, bucket_expiry
);
622 trigger_dyntick_cpu(struct timer_base
*base
, struct timer_list
*timer
)
625 * Deferrable timers do not prevent the CPU from entering dynticks and
626 * are not taken into account on the idle/nohz_full path. An IPI when a
627 * new deferrable timer is enqueued will wake up the remote CPU but
628 * nothing will be done with the deferrable timer base. Therefore skip
629 * the remote IPI for deferrable timers completely.
631 if (!is_timers_nohz_active() || timer
->flags
& TIMER_DEFERRABLE
)
635 * We might have to IPI the remote CPU if the base is idle and the
636 * timer is pinned. If it is a non pinned timer, it is only queued
637 * on the remote CPU, when timer was running during queueing. Then
638 * everything is handled by remote CPU anyway. If the other CPU is
639 * on the way to idle then it can't set base->is_idle as we hold
643 WARN_ON_ONCE(!(timer
->flags
& TIMER_PINNED
||
644 tick_nohz_full_cpu(base
->cpu
)));
645 wake_up_nohz_cpu(base
->cpu
);
650 * Enqueue the timer into the hash bucket, mark it pending in
651 * the bitmap, store the index in the timer flags then wake up
652 * the target CPU if needed.
654 static void enqueue_timer(struct timer_base
*base
, struct timer_list
*timer
,
655 unsigned int idx
, unsigned long bucket_expiry
)
658 hlist_add_head(&timer
->entry
, base
->vectors
+ idx
);
659 __set_bit(idx
, base
->pending_map
);
660 timer_set_idx(timer
, idx
);
662 trace_timer_start(timer
, bucket_expiry
);
665 * Check whether this is the new first expiring timer. The
666 * effective expiry time of the timer is required here
667 * (bucket_expiry) instead of timer->expires.
669 if (time_before(bucket_expiry
, base
->next_expiry
)) {
671 * Set the next expiry time and kick the CPU so it
672 * can reevaluate the wheel:
674 WRITE_ONCE(base
->next_expiry
, bucket_expiry
);
675 base
->timers_pending
= true;
676 base
->next_expiry_recalc
= false;
677 trigger_dyntick_cpu(base
, timer
);
681 static void internal_add_timer(struct timer_base
*base
, struct timer_list
*timer
)
683 unsigned long bucket_expiry
;
686 idx
= calc_wheel_index(timer
->expires
, base
->clk
, &bucket_expiry
);
687 enqueue_timer(base
, timer
, idx
, bucket_expiry
);
690 #ifdef CONFIG_DEBUG_OBJECTS_TIMERS
692 static const struct debug_obj_descr timer_debug_descr
;
695 void (*function
)(struct timer_list
*t
);
699 #define TIMER_HINT(fn, container, timr, hintfn) \
702 .offset = offsetof(container, hintfn) - \
703 offsetof(container, timr) \
706 static const struct timer_hint timer_hints
[] = {
707 TIMER_HINT(delayed_work_timer_fn
,
708 struct delayed_work
, timer
, work
.func
),
709 TIMER_HINT(kthread_delayed_work_timer_fn
,
710 struct kthread_delayed_work
, timer
, work
.func
),
713 static void *timer_debug_hint(void *addr
)
715 struct timer_list
*timer
= addr
;
718 for (i
= 0; i
< ARRAY_SIZE(timer_hints
); i
++) {
719 if (timer_hints
[i
].function
== timer
->function
) {
720 void (**fn
)(void) = addr
+ timer_hints
[i
].offset
;
726 return timer
->function
;
729 static bool timer_is_static_object(void *addr
)
731 struct timer_list
*timer
= addr
;
733 return (timer
->entry
.pprev
== NULL
&&
734 timer
->entry
.next
== TIMER_ENTRY_STATIC
);
738 * timer_fixup_init is called when:
739 * - an active object is initialized
741 static bool timer_fixup_init(void *addr
, enum debug_obj_state state
)
743 struct timer_list
*timer
= addr
;
746 case ODEBUG_STATE_ACTIVE
:
747 del_timer_sync(timer
);
748 debug_object_init(timer
, &timer_debug_descr
);
755 /* Stub timer callback for improperly used timers. */
756 static void stub_timer(struct timer_list
*unused
)
762 * timer_fixup_activate is called when:
763 * - an active object is activated
764 * - an unknown non-static object is activated
766 static bool timer_fixup_activate(void *addr
, enum debug_obj_state state
)
768 struct timer_list
*timer
= addr
;
771 case ODEBUG_STATE_NOTAVAILABLE
:
772 timer_setup(timer
, stub_timer
, 0);
775 case ODEBUG_STATE_ACTIVE
:
784 * timer_fixup_free is called when:
785 * - an active object is freed
787 static bool timer_fixup_free(void *addr
, enum debug_obj_state state
)
789 struct timer_list
*timer
= addr
;
792 case ODEBUG_STATE_ACTIVE
:
793 del_timer_sync(timer
);
794 debug_object_free(timer
, &timer_debug_descr
);
802 * timer_fixup_assert_init is called when:
803 * - an untracked/uninit-ed object is found
805 static bool timer_fixup_assert_init(void *addr
, enum debug_obj_state state
)
807 struct timer_list
*timer
= addr
;
810 case ODEBUG_STATE_NOTAVAILABLE
:
811 timer_setup(timer
, stub_timer
, 0);
818 static const struct debug_obj_descr timer_debug_descr
= {
819 .name
= "timer_list",
820 .debug_hint
= timer_debug_hint
,
821 .is_static_object
= timer_is_static_object
,
822 .fixup_init
= timer_fixup_init
,
823 .fixup_activate
= timer_fixup_activate
,
824 .fixup_free
= timer_fixup_free
,
825 .fixup_assert_init
= timer_fixup_assert_init
,
828 static inline void debug_timer_init(struct timer_list
*timer
)
830 debug_object_init(timer
, &timer_debug_descr
);
833 static inline void debug_timer_activate(struct timer_list
*timer
)
835 debug_object_activate(timer
, &timer_debug_descr
);
838 static inline void debug_timer_deactivate(struct timer_list
*timer
)
840 debug_object_deactivate(timer
, &timer_debug_descr
);
843 static inline void debug_timer_assert_init(struct timer_list
*timer
)
845 debug_object_assert_init(timer
, &timer_debug_descr
);
848 static void do_init_timer(struct timer_list
*timer
,
849 void (*func
)(struct timer_list
*),
851 const char *name
, struct lock_class_key
*key
);
853 void init_timer_on_stack_key(struct timer_list
*timer
,
854 void (*func
)(struct timer_list
*),
856 const char *name
, struct lock_class_key
*key
)
858 debug_object_init_on_stack(timer
, &timer_debug_descr
);
859 do_init_timer(timer
, func
, flags
, name
, key
);
861 EXPORT_SYMBOL_GPL(init_timer_on_stack_key
);
863 void destroy_timer_on_stack(struct timer_list
*timer
)
865 debug_object_free(timer
, &timer_debug_descr
);
867 EXPORT_SYMBOL_GPL(destroy_timer_on_stack
);
/* Empty variants of the debug_timer_*() hooks: they do nothing. */
static inline void debug_timer_init(struct timer_list *timer) { }
static inline void debug_timer_activate(struct timer_list *timer) { }
static inline void debug_timer_deactivate(struct timer_list *timer) { }
static inline void debug_timer_assert_init(struct timer_list *timer) { }
/* Timer initialization hook: debug-object bookkeeping, then the tracepoint. */
static inline void debug_init(struct timer_list *timer)
{
	debug_timer_init(timer);
	trace_timer_init(timer);
}
/* Timer deactivation hook: debug-object bookkeeping, then the cancel tracepoint. */
static inline void debug_deactivate(struct timer_list *timer)
{
	debug_timer_deactivate(timer);
	trace_timer_cancel(timer);
}
/* Thin wrapper around debug_timer_assert_init(); no tracepoint is emitted here. */
static inline void debug_assert_init(struct timer_list *timer)
{
	debug_timer_assert_init(timer);
}
893 static void do_init_timer(struct timer_list
*timer
,
894 void (*func
)(struct timer_list
*),
896 const char *name
, struct lock_class_key
*key
)
898 timer
->entry
.pprev
= NULL
;
899 timer
->function
= func
;
900 if (WARN_ON_ONCE(flags
& ~TIMER_INIT_FLAGS
))
901 flags
&= TIMER_INIT_FLAGS
;
902 timer
->flags
= flags
| raw_smp_processor_id();
903 lockdep_init_map(&timer
->lockdep_map
, name
, key
, 0);
907 * init_timer_key - initialize a timer
908 * @timer: the timer to be initialized
909 * @func: timer callback function
910 * @flags: timer flags
911 * @name: name of the timer
912 * @key: lockdep class key of the fake lock used for tracking timer
913 * sync lock dependencies
915 * init_timer_key() must be done to a timer prior to calling *any* of the
916 * other timer functions.
918 void init_timer_key(struct timer_list
*timer
,
919 void (*func
)(struct timer_list
*), unsigned int flags
,
920 const char *name
, struct lock_class_key
*key
)
923 do_init_timer(timer
, func
, flags
, name
, key
);
925 EXPORT_SYMBOL(init_timer_key
);
927 static inline void detach_timer(struct timer_list
*timer
, bool clear_pending
)
929 struct hlist_node
*entry
= &timer
->entry
;
931 debug_deactivate(timer
);
936 entry
->next
= LIST_POISON2
;
939 static int detach_if_pending(struct timer_list
*timer
, struct timer_base
*base
,
942 unsigned idx
= timer_get_idx(timer
);
944 if (!timer_pending(timer
))
947 if (hlist_is_singular_node(&timer
->entry
, base
->vectors
+ idx
)) {
948 __clear_bit(idx
, base
->pending_map
);
949 base
->next_expiry_recalc
= true;
952 detach_timer(timer
, clear_pending
);
956 static inline struct timer_base
*get_timer_cpu_base(u32 tflags
, u32 cpu
)
958 int index
= tflags
& TIMER_PINNED
? BASE_LOCAL
: BASE_GLOBAL
;
959 struct timer_base
*base
;
961 base
= per_cpu_ptr(&timer_bases
[index
], cpu
);
964 * If the timer is deferrable and NO_HZ_COMMON is set then we need
965 * to use the deferrable base.
967 if (IS_ENABLED(CONFIG_NO_HZ_COMMON
) && (tflags
& TIMER_DEFERRABLE
))
968 base
= per_cpu_ptr(&timer_bases
[BASE_DEF
], cpu
);
972 static inline struct timer_base
*get_timer_this_cpu_base(u32 tflags
)
974 int index
= tflags
& TIMER_PINNED
? BASE_LOCAL
: BASE_GLOBAL
;
975 struct timer_base
*base
;
977 base
= this_cpu_ptr(&timer_bases
[index
]);
980 * If the timer is deferrable and NO_HZ_COMMON is set then we need
981 * to use the deferrable base.
983 if (IS_ENABLED(CONFIG_NO_HZ_COMMON
) && (tflags
& TIMER_DEFERRABLE
))
984 base
= this_cpu_ptr(&timer_bases
[BASE_DEF
]);
988 static inline struct timer_base
*get_timer_base(u32 tflags
)
990 return get_timer_cpu_base(tflags
, tflags
& TIMER_CPUMASK
);
993 static inline void __forward_timer_base(struct timer_base
*base
,
997 * Check whether we can forward the base. We can only do that when
998 * @basej is past base->clk otherwise we might rewind base->clk.
1000 if (time_before_eq(basej
, base
->clk
))
1004 * If the next expiry value is > jiffies, then we fast forward to
1005 * jiffies otherwise we forward to the next expiry value.
1007 if (time_after(base
->next_expiry
, basej
)) {
1010 if (WARN_ON_ONCE(time_before(base
->next_expiry
, base
->clk
)))
1012 base
->clk
= base
->next_expiry
;
1017 static inline void forward_timer_base(struct timer_base
*base
)
1019 __forward_timer_base(base
, READ_ONCE(jiffies
));
1023 * We are using hashed locking: Holding per_cpu(timer_bases[x]).lock means
1024 * that all timers which are tied to this base are locked, and the base itself
1027 * So __run_timers/migrate_timers can safely modify all timers which could
1028 * be found in the base->vectors array.
1030 * When a timer is migrating then the TIMER_MIGRATING flag is set and we need
1031 * to wait until the migration is done.
1033 static struct timer_base
*lock_timer_base(struct timer_list
*timer
,
1034 unsigned long *flags
)
1035 __acquires(timer
->base
->lock
)
1038 struct timer_base
*base
;
1042 * We need to use READ_ONCE() here, otherwise the compiler
1043 * might re-read @tf between the check for TIMER_MIGRATING
1046 tf
= READ_ONCE(timer
->flags
);
1048 if (!(tf
& TIMER_MIGRATING
)) {
1049 base
= get_timer_base(tf
);
1050 raw_spin_lock_irqsave(&base
->lock
, *flags
);
1051 if (timer
->flags
== tf
)
1053 raw_spin_unlock_irqrestore(&base
->lock
, *flags
);
/* Option bits for __mod_timer() */
#define MOD_TIMER_PENDING_ONLY		0x01	/* do not activate an inactive timer */
#define MOD_TIMER_REDUCE		0x02	/* only modify if expiry gets earlier */
#define MOD_TIMER_NOTPENDING		0x04	/* skip the pending-timer fast path */
1064 __mod_timer(struct timer_list
*timer
, unsigned long expires
, unsigned int options
)
1066 unsigned long clk
= 0, flags
, bucket_expiry
;
1067 struct timer_base
*base
, *new_base
;
1068 unsigned int idx
= UINT_MAX
;
1071 debug_assert_init(timer
);
1074 * This is a common optimization triggered by the networking code - if
1075 * the timer is re-modified to have the same timeout or ends up in the
1076 * same array bucket then just return:
1078 if (!(options
& MOD_TIMER_NOTPENDING
) && timer_pending(timer
)) {
1080 * The downside of this optimization is that it can result in
1081 * larger granularity than you would get from adding a new
1082 * timer with this expiry.
1084 long diff
= timer
->expires
- expires
;
1088 if (options
& MOD_TIMER_REDUCE
&& diff
<= 0)
1092 * We lock timer base and calculate the bucket index right
1093 * here. If the timer ends up in the same bucket, then we
1094 * just update the expiry time and avoid the whole
1095 * dequeue/enqueue dance.
1097 base
= lock_timer_base(timer
, &flags
);
1099 * Has @timer been shutdown? This needs to be evaluated
1100 * while holding base lock to prevent a race against the
1103 if (!timer
->function
)
1106 forward_timer_base(base
);
1108 if (timer_pending(timer
) && (options
& MOD_TIMER_REDUCE
) &&
1109 time_before_eq(timer
->expires
, expires
)) {
1115 idx
= calc_wheel_index(expires
, clk
, &bucket_expiry
);
1118 * Retrieve and compare the array index of the pending
1119 * timer. If it matches set the expiry to the new value so a
1120 * subsequent call will exit in the expires check above.
1122 if (idx
== timer_get_idx(timer
)) {
1123 if (!(options
& MOD_TIMER_REDUCE
))
1124 timer
->expires
= expires
;
1125 else if (time_after(timer
->expires
, expires
))
1126 timer
->expires
= expires
;
1131 base
= lock_timer_base(timer
, &flags
);
1133 * Has @timer been shutdown? This needs to be evaluated
1134 * while holding base lock to prevent a race against the
1137 if (!timer
->function
)
1140 forward_timer_base(base
);
1143 ret
= detach_if_pending(timer
, base
, false);
1144 if (!ret
&& (options
& MOD_TIMER_PENDING_ONLY
))
1147 new_base
= get_timer_this_cpu_base(timer
->flags
);
1149 if (base
!= new_base
) {
1151 * We are trying to schedule the timer on the new base.
1152 * However we can't change timer's base while it is running,
1153 * otherwise timer_delete_sync() can't detect that the timer's
1154 * handler yet has not finished. This also guarantees that the
1155 * timer is serialized wrt itself.
1157 if (likely(base
->running_timer
!= timer
)) {
1158 /* See the comment in lock_timer_base() */
1159 timer
->flags
|= TIMER_MIGRATING
;
1161 raw_spin_unlock(&base
->lock
);
1163 raw_spin_lock(&base
->lock
);
1164 WRITE_ONCE(timer
->flags
,
1165 (timer
->flags
& ~TIMER_BASEMASK
) | base
->cpu
);
1166 forward_timer_base(base
);
1170 debug_timer_activate(timer
);
1172 timer
->expires
= expires
;
1174 * If 'idx' was calculated above and the base time did not advance
1175 * between calculating 'idx' and possibly switching the base, only
1176 * enqueue_timer() is required. Otherwise we need to (re)calculate
1177 * the wheel index via internal_add_timer().
1179 if (idx
!= UINT_MAX
&& clk
== base
->clk
)
1180 enqueue_timer(base
, timer
, idx
, bucket_expiry
);
1182 internal_add_timer(base
, timer
);
1185 raw_spin_unlock_irqrestore(&base
->lock
, flags
);
1191 * mod_timer_pending - Modify a pending timer's timeout
1192 * @timer: The pending timer to be modified
1193 * @expires: New absolute timeout in jiffies
1195 * mod_timer_pending() is the same for pending timers as mod_timer(), but
1196 * will not activate inactive timers.
1198 * If @timer->function == NULL then the start operation is silently
1202 * * %0 - The timer was inactive and not modified or was in
1203 * shutdown state and the operation was discarded
1204 * * %1 - The timer was active and requeued to expire at @expires
1206 int mod_timer_pending(struct timer_list
*timer
, unsigned long expires
)
1208 return __mod_timer(timer
, expires
, MOD_TIMER_PENDING_ONLY
);
1210 EXPORT_SYMBOL(mod_timer_pending
);
/**
 * mod_timer - Modify a timer's timeout
 * @timer:	The timer to be modified
 * @expires:	New absolute timeout in jiffies
 *
 * mod_timer(timer, expires) is equivalent to:
 *
 *     del_timer(timer); timer->expires = expires; add_timer(timer);
 *
 * mod_timer() is more efficient than the above open coded sequence. In
 * case that the timer is inactive, the del_timer() part is a NOP. The
 * timer is in any case activated with the new expiry time @expires.
 *
 * Note that if there are multiple unserialized concurrent users of the
 * same timer, then mod_timer() is the only safe way to modify the timeout,
 * since add_timer() cannot modify an already running timer.
 *
 * If @timer->function == NULL then the start operation is silently
 * discarded. In this case the return value is 0 and meaningless.
 *
 * Return:
 * * %0 - The timer was inactive and started or was in shutdown
 *	  state and the operation was discarded
 * * %1 - The timer was active and requeued to expire at @expires or
 *	  the timer was active and not modified because @expires did
 *	  not change the effective expiry time
 */
int mod_timer(struct timer_list *timer, unsigned long expires)
{
	return __mod_timer(timer, expires, 0);
}
EXPORT_SYMBOL(mod_timer);
1246 * timer_reduce - Modify a timer's timeout if it would reduce the timeout
1247 * @timer: The timer to be modified
1248 * @expires: New absolute timeout in jiffies
1250 * timer_reduce() is very similar to mod_timer(), except that it will only
1251 * modify an enqueued timer if that would reduce the expiration time. If
1252 * @timer is not enqueued it starts the timer.
1254 * If @timer->function == NULL then the start operation is silently
1258 * * %0 - The timer was inactive and started or was in shutdown
1259 * state and the operation was discarded
1260 * * %1 - The timer was active and requeued to expire at @expires or
1261 * the timer was active and not modified because @expires
1262 * did not change the effective expiry time such that the
1263 * timer would expire earlier than already scheduled
1265 int timer_reduce(struct timer_list
*timer
, unsigned long expires
)
1267 return __mod_timer(timer
, expires
, MOD_TIMER_REDUCE
);
1269 EXPORT_SYMBOL(timer_reduce
);
1272 * add_timer - Start a timer
1273 * @timer: The timer to be started
1275 * Start @timer to expire at @timer->expires in the future. @timer->expires
1276 * is the absolute expiry time measured in 'jiffies'. When the timer expires
1277 * timer->function(timer) will be invoked from soft interrupt context.
1279 * The @timer->expires and @timer->function fields must be set prior
1280 * to calling this function.
1282 * If @timer->function == NULL then the start operation is silently
1285 * If @timer->expires is already in the past @timer will be queued to
1286 * expire at the next timer tick.
1288 * This can only operate on an inactive timer. Attempts to invoke this on
1289 * an active timer are rejected with a warning.
1291 void add_timer(struct timer_list
*timer
)
1293 if (WARN_ON_ONCE(timer_pending(timer
)))
1295 __mod_timer(timer
, timer
->expires
, MOD_TIMER_NOTPENDING
);
1297 EXPORT_SYMBOL(add_timer
);
1300 * add_timer_local() - Start a timer on the local CPU
1301 * @timer: The timer to be started
1303 * Same as add_timer() except that the timer flag TIMER_PINNED is set.
1305 * See add_timer() for further details.
1307 void add_timer_local(struct timer_list
*timer
)
1309 if (WARN_ON_ONCE(timer_pending(timer
)))
1311 timer
->flags
|= TIMER_PINNED
;
1312 __mod_timer(timer
, timer
->expires
, MOD_TIMER_NOTPENDING
);
1314 EXPORT_SYMBOL(add_timer_local
);
1317 * add_timer_global() - Start a timer without TIMER_PINNED flag set
1318 * @timer: The timer to be started
1320 * Same as add_timer() except that the timer flag TIMER_PINNED is unset.
1322 * See add_timer() for further details.
1324 void add_timer_global(struct timer_list
*timer
)
1326 if (WARN_ON_ONCE(timer_pending(timer
)))
1328 timer
->flags
&= ~TIMER_PINNED
;
1329 __mod_timer(timer
, timer
->expires
, MOD_TIMER_NOTPENDING
);
1331 EXPORT_SYMBOL(add_timer_global
);
1334 * add_timer_on - Start a timer on a particular CPU
1335 * @timer: The timer to be started
1336 * @cpu: The CPU to start it on
1338 * Same as add_timer() except that it starts the timer on the given CPU and
1339 * the TIMER_PINNED flag is set. When timer shouldn't be a pinned timer in
1340 * the next round, add_timer_global() should be used instead as it unsets
1341 * the TIMER_PINNED flag.
1343 * See add_timer() for further details.
1345 void add_timer_on(struct timer_list
*timer
, int cpu
)
1347 struct timer_base
*new_base
, *base
;
1348 unsigned long flags
;
1350 debug_assert_init(timer
);
1352 if (WARN_ON_ONCE(timer_pending(timer
)))
1355 /* Make sure timer flags have TIMER_PINNED flag set */
1356 timer
->flags
|= TIMER_PINNED
;
1358 new_base
= get_timer_cpu_base(timer
->flags
, cpu
);
1361 * If @timer was on a different CPU, it should be migrated with the
1362 * old base locked to prevent other operations proceeding with the
1363 * wrong base locked. See lock_timer_base().
1365 base
= lock_timer_base(timer
, &flags
);
1367 * Has @timer been shutdown? This needs to be evaluated while
1368 * holding base lock to prevent a race against the shutdown code.
1370 if (!timer
->function
)
1373 if (base
!= new_base
) {
1374 timer
->flags
|= TIMER_MIGRATING
;
1376 raw_spin_unlock(&base
->lock
);
1378 raw_spin_lock(&base
->lock
);
1379 WRITE_ONCE(timer
->flags
,
1380 (timer
->flags
& ~TIMER_BASEMASK
) | cpu
);
1382 forward_timer_base(base
);
1384 debug_timer_activate(timer
);
1385 internal_add_timer(base
, timer
);
1387 raw_spin_unlock_irqrestore(&base
->lock
, flags
);
1389 EXPORT_SYMBOL_GPL(add_timer_on
);
1392 * __timer_delete - Internal function: Deactivate a timer
1393 * @timer: The timer to be deactivated
1394 * @shutdown: If true, this indicates that the timer is about to be
1395 * shutdown permanently.
1397 * If @shutdown is true then @timer->function is set to NULL under the
1398 * timer base lock which prevents further rearming of the time. In that
1399 * case any attempt to rearm @timer after this function returns will be
1403 * * %0 - The timer was not pending
1404 * * %1 - The timer was pending and deactivated
1406 static int __timer_delete(struct timer_list
*timer
, bool shutdown
)
1408 struct timer_base
*base
;
1409 unsigned long flags
;
1412 debug_assert_init(timer
);
1415 * If @shutdown is set then the lock has to be taken whether the
1416 * timer is pending or not to protect against a concurrent rearm
1417 * which might hit between the lockless pending check and the lock
1418 * acquisition. By taking the lock it is ensured that such a newly
1419 * enqueued timer is dequeued and cannot end up with
1420 * timer->function == NULL in the expiry code.
1422 * If timer->function is currently executed, then this makes sure
1423 * that the callback cannot requeue the timer.
1425 if (timer_pending(timer
) || shutdown
) {
1426 base
= lock_timer_base(timer
, &flags
);
1427 ret
= detach_if_pending(timer
, base
, true);
1429 timer
->function
= NULL
;
1430 raw_spin_unlock_irqrestore(&base
->lock
, flags
);
1437 * timer_delete - Deactivate a timer
1438 * @timer: The timer to be deactivated
1440 * The function only deactivates a pending timer, but contrary to
1441 * timer_delete_sync() it does not take into account whether the timer's
1442 * callback function is concurrently executed on a different CPU or not.
1443 * It neither prevents rearming of the timer. If @timer can be rearmed
1444 * concurrently then the return value of this function is meaningless.
1447 * * %0 - The timer was not pending
1448 * * %1 - The timer was pending and deactivated
1450 int timer_delete(struct timer_list
*timer
)
1452 return __timer_delete(timer
, false);
1454 EXPORT_SYMBOL(timer_delete
);
1457 * timer_shutdown - Deactivate a timer and prevent rearming
1458 * @timer: The timer to be deactivated
1460 * The function does not wait for an eventually running timer callback on a
1461 * different CPU but it prevents rearming of the timer. Any attempt to arm
1462 * @timer after this function returns will be silently ignored.
1464 * This function is useful for teardown code and should only be used when
1465 * timer_shutdown_sync() cannot be invoked due to locking or context constraints.
1468 * * %0 - The timer was not pending
1469 * * %1 - The timer was pending
1471 int timer_shutdown(struct timer_list
*timer
)
1473 return __timer_delete(timer
, true);
1475 EXPORT_SYMBOL_GPL(timer_shutdown
);
1478 * __try_to_del_timer_sync - Internal function: Try to deactivate a timer
1479 * @timer: Timer to deactivate
1480 * @shutdown: If true, this indicates that the timer is about to be
1481 * shutdown permanently.
1483 * If @shutdown is true then @timer->function is set to NULL under the
1484 * timer base lock which prevents further rearming of the timer. Any
1485 * attempt to rearm @timer after this function returns will be silently
1488 * This function cannot guarantee that the timer cannot be rearmed
1489 * right after dropping the base lock if @shutdown is false. That
1490 * needs to be prevented by the calling code if necessary.
1493 * * %0 - The timer was not pending
1494 * * %1 - The timer was pending and deactivated
1495 * * %-1 - The timer callback function is running on a different CPU
1497 static int __try_to_del_timer_sync(struct timer_list
*timer
, bool shutdown
)
1499 struct timer_base
*base
;
1500 unsigned long flags
;
1503 debug_assert_init(timer
);
1505 base
= lock_timer_base(timer
, &flags
);
1507 if (base
->running_timer
!= timer
)
1508 ret
= detach_if_pending(timer
, base
, true);
1510 timer
->function
= NULL
;
1512 raw_spin_unlock_irqrestore(&base
->lock
, flags
);
1518 * try_to_del_timer_sync - Try to deactivate a timer
1519 * @timer: Timer to deactivate
1521 * This function tries to deactivate a timer. On success the timer is not
1522 * queued and the timer callback function is not running on any CPU.
1524 * This function does not guarantee that the timer cannot be rearmed right
1525 * after dropping the base lock. That needs to be prevented by the calling
1526 * code if necessary.
1529 * * %0 - The timer was not pending
1530 * * %1 - The timer was pending and deactivated
1531 * * %-1 - The timer callback function is running on a different CPU
1533 int try_to_del_timer_sync(struct timer_list
*timer
)
1535 return __try_to_del_timer_sync(timer
, false);
1537 EXPORT_SYMBOL(try_to_del_timer_sync
);
#ifdef CONFIG_PREEMPT_RT
/* Initialize the per-base expiry lock */
static __init void timer_base_init_expiry_lock(struct timer_base *base)
{
	spin_lock_init(&base->expiry_lock);
}

/* Acquire base->expiry_lock */
static inline void timer_base_lock_expiry(struct timer_base *base)
{
	spin_lock(&base->expiry_lock);
}

/* Release base->expiry_lock */
static inline void timer_base_unlock_expiry(struct timer_base *base)
{
	spin_unlock(&base->expiry_lock);
}

/*
 * The counterpart to del_timer_wait_running().
 *
 * If there is a waiter for base->expiry_lock, then it was waiting for the
 * timer callback to finish. Drop expiry_lock and reacquire it. That allows
 * the waiter to acquire the lock and make progress.
 */
static void timer_sync_wait_running(struct timer_base *base)
	__releases(&base->lock) __releases(&base->expiry_lock)
	__acquires(&base->expiry_lock) __acquires(&base->lock)
{
	if (atomic_read(&base->timer_waiters)) {
		raw_spin_unlock_irq(&base->lock);
		spin_unlock(&base->expiry_lock);
		spin_lock(&base->expiry_lock);
		raw_spin_lock_irq(&base->lock);
	}
}

/*
 * This function is called on PREEMPT_RT kernels when the fast path
 * deletion of a timer failed because the timer callback function was
 * running.
 *
 * This prevents priority inversion, if the softirq thread on a remote CPU
 * got preempted, and it prevents a live lock when the task which tries to
 * delete a timer preempted the softirq thread running the timer callback
 * function.
 */
static void del_timer_wait_running(struct timer_list *timer)
{
	u32 tf;

	tf = READ_ONCE(timer->flags);
	if (!(tf & (TIMER_MIGRATING | TIMER_IRQSAFE))) {
		struct timer_base *base = get_timer_base(tf);

		/*
		 * Mark the base as contended and grab the expiry lock,
		 * which is held by the softirq across the timer
		 * callback. Drop the lock immediately so the softirq can
		 * expire the next timer. In theory the timer could already
		 * be running again, but that's more than unlikely and just
		 * causes another wait loop.
		 */
		atomic_inc(&base->timer_waiters);
		spin_lock_bh(&base->expiry_lock);
		atomic_dec(&base->timer_waiters);
		spin_unlock_bh(&base->expiry_lock);
	}
}
#else
/* Without PREEMPT_RT the expiry lock helpers are empty stubs */
static inline void timer_base_init_expiry_lock(struct timer_base *base) { }
static inline void timer_base_lock_expiry(struct timer_base *base) { }
static inline void timer_base_unlock_expiry(struct timer_base *base) { }
static inline void timer_sync_wait_running(struct timer_base *base) { }
static inline void del_timer_wait_running(struct timer_list *timer) { }
#endif
1615 * __timer_delete_sync - Internal function: Deactivate a timer and wait
1616 * for the handler to finish.
1617 * @timer: The timer to be deactivated
1618 * @shutdown: If true, @timer->function will be set to NULL under the
1619 * timer base lock which prevents rearming of @timer
1621 * If @shutdown is not set the timer can be rearmed later. If the timer can
1622 * be rearmed concurrently, i.e. after dropping the base lock then the
1623 * return value is meaningless.
1625 * If @shutdown is set then @timer->function is set to NULL under timer
1626 * base lock which prevents rearming of the timer. Any attempt to rearm
1627 * a shutdown timer is silently ignored.
1629 * If the timer should be reused after shutdown it has to be initialized
1633 * * %0 - The timer was not pending
1634 * * %1 - The timer was pending and deactivated
1636 static int __timer_delete_sync(struct timer_list
*timer
, bool shutdown
)
1640 #ifdef CONFIG_LOCKDEP
1641 unsigned long flags
;
1644 * If lockdep gives a backtrace here, please reference
1645 * the synchronization rules above.
1647 local_irq_save(flags
);
1648 lock_map_acquire(&timer
->lockdep_map
);
1649 lock_map_release(&timer
->lockdep_map
);
1650 local_irq_restore(flags
);
1653 * don't use it in hardirq context, because it
1654 * could lead to deadlock.
1656 WARN_ON(in_hardirq() && !(timer
->flags
& TIMER_IRQSAFE
));
1659 * Must be able to sleep on PREEMPT_RT because of the slowpath in
1660 * del_timer_wait_running().
1662 if (IS_ENABLED(CONFIG_PREEMPT_RT
) && !(timer
->flags
& TIMER_IRQSAFE
))
1663 lockdep_assert_preemption_enabled();
1666 ret
= __try_to_del_timer_sync(timer
, shutdown
);
1668 if (unlikely(ret
< 0)) {
1669 del_timer_wait_running(timer
);
1678 * timer_delete_sync - Deactivate a timer and wait for the handler to finish.
1679 * @timer: The timer to be deactivated
1681 * Synchronization rules: Callers must prevent restarting of the timer,
1682 * otherwise this function is meaningless. It must not be called from
1683 * interrupt contexts unless the timer is an irqsafe one. The caller must
1684 * not hold locks which would prevent completion of the timer's callback
1685 * function. The timer's handler must not call add_timer_on(). Upon exit
1686 * the timer is not queued and the handler is not running on any CPU.
1688 * For !irqsafe timers, the caller must not hold locks that are held in
1689 * interrupt context. Even if the lock has nothing to do with the timer in
1690 * question. Here's why::
1696 * base->running_timer = mytimer;
1697 * spin_lock_irq(somelock);
1699 * spin_lock(somelock);
1700 * timer_delete_sync(mytimer);
1701 * while (base->running_timer == mytimer);
1703 * Now timer_delete_sync() will never return and never release somelock.
1704 * The interrupt on the other CPU is waiting to grab somelock but it has
1705 * interrupted the softirq that CPU0 is waiting to finish.
1707 * This function cannot guarantee that the timer is not rearmed again by
1708 * some concurrent or preempting code, right after it dropped the base
1709 * lock. If there is the possibility of a concurrent rearm then the return
1710 * value of the function is meaningless.
1712 * If such a guarantee is needed, e.g. for teardown situations then use
1713 * timer_shutdown_sync() instead.
1716 * * %0 - The timer was not pending
1717 * * %1 - The timer was pending and deactivated
1719 int timer_delete_sync(struct timer_list
*timer
)
1721 return __timer_delete_sync(timer
, false);
1723 EXPORT_SYMBOL(timer_delete_sync
);
1726 * timer_shutdown_sync - Shutdown a timer and prevent rearming
1727 * @timer: The timer to be shutdown
1729 * When the function returns it is guaranteed that:
1730 * - @timer is not queued
1731 * - The callback function of @timer is not running
1732 * - @timer cannot be enqueued again. Any attempt to rearm
1733 * @timer is silently ignored.
1735 * See timer_delete_sync() for synchronization rules.
1737 * This function is useful for final teardown of an infrastructure where
1738 * the timer is subject to a circular dependency problem.
1740 * A common pattern for this is a timer and a workqueue where the timer can
1741 * schedule work and work can arm the timer. On shutdown the workqueue must
1742 * be destroyed and the timer must be prevented from rearming. Unless the
1743 * code has conditionals like 'if (mything->in_shutdown)' to prevent that
1744 * there is no way to get this correct with timer_delete_sync().
1746 * timer_shutdown_sync() is solving the problem. The correct ordering of
1747 * calls in this case is:
1749 * timer_shutdown_sync(&mything->timer);
1750 * workqueue_destroy(&mything->workqueue);
1752 * After this 'mything' can be safely freed.
1754 * This obviously implies that the timer is not required to be functional
1755 * for the rest of the shutdown operation.
1758 * * %0 - The timer was not pending
1759 * * %1 - The timer was pending
1761 int timer_shutdown_sync(struct timer_list
*timer
)
1763 return __timer_delete_sync(timer
, true);
1765 EXPORT_SYMBOL_GPL(timer_shutdown_sync
);
1767 static void call_timer_fn(struct timer_list
*timer
,
1768 void (*fn
)(struct timer_list
*),
1769 unsigned long baseclk
)
1771 int count
= preempt_count();
1773 #ifdef CONFIG_LOCKDEP
1775 * It is permissible to free the timer from inside the
1776 * function that is called from it, this we need to take into
1777 * account for lockdep too. To avoid bogus "held lock freed"
1778 * warnings as well as problems when looking into
1779 * timer->lockdep_map, make a copy and use that here.
1781 struct lockdep_map lockdep_map
;
1783 lockdep_copy_map(&lockdep_map
, &timer
->lockdep_map
);
1786 * Couple the lock chain with the lock chain at
1787 * timer_delete_sync() by acquiring the lock_map around the fn()
1788 * call here and in timer_delete_sync().
1790 lock_map_acquire(&lockdep_map
);
1792 trace_timer_expire_entry(timer
, baseclk
);
1794 trace_timer_expire_exit(timer
);
1796 lock_map_release(&lockdep_map
);
1798 if (count
!= preempt_count()) {
1799 WARN_ONCE(1, "timer: %pS preempt leak: %08x -> %08x\n",
1800 fn
, count
, preempt_count());
1802 * Restore the preempt count. That gives us a decent
1803 * chance to survive and extract information. If the
1804 * callback kept a lock held, bad luck, but not worse
1805 * than the BUG() we had.
1807 preempt_count_set(count
);
1811 static void expire_timers(struct timer_base
*base
, struct hlist_head
*head
)
1814 * This value is required only for tracing. base->clk was
1815 * incremented directly before expire_timers was called. But expiry
1816 * is related to the old base->clk value.
1818 unsigned long baseclk
= base
->clk
- 1;
1820 while (!hlist_empty(head
)) {
1821 struct timer_list
*timer
;
1822 void (*fn
)(struct timer_list
*);
1824 timer
= hlist_entry(head
->first
, struct timer_list
, entry
);
1826 base
->running_timer
= timer
;
1827 detach_timer(timer
, true);
1829 fn
= timer
->function
;
1831 if (WARN_ON_ONCE(!fn
)) {
1832 /* Should never happen. Emphasis on should! */
1833 base
->running_timer
= NULL
;
1837 if (timer
->flags
& TIMER_IRQSAFE
) {
1838 raw_spin_unlock(&base
->lock
);
1839 call_timer_fn(timer
, fn
, baseclk
);
1840 raw_spin_lock(&base
->lock
);
1841 base
->running_timer
= NULL
;
1843 raw_spin_unlock_irq(&base
->lock
);
1844 call_timer_fn(timer
, fn
, baseclk
);
1845 raw_spin_lock_irq(&base
->lock
);
1846 base
->running_timer
= NULL
;
1847 timer_sync_wait_running(base
);
1852 static int collect_expired_timers(struct timer_base
*base
,
1853 struct hlist_head
*heads
)
1855 unsigned long clk
= base
->clk
= base
->next_expiry
;
1856 struct hlist_head
*vec
;
1860 for (i
= 0; i
< LVL_DEPTH
; i
++) {
1861 idx
= (clk
& LVL_MASK
) + i
* LVL_SIZE
;
1863 if (__test_and_clear_bit(idx
, base
->pending_map
)) {
1864 vec
= base
->vectors
+ idx
;
1865 hlist_move_list(vec
, heads
++);
1868 /* Is it time to look at the next level? */
1869 if (clk
& LVL_CLK_MASK
)
1871 /* Shift clock for the next level granularity */
1872 clk
>>= LVL_CLK_SHIFT
;
1878 * Find the next pending bucket of a level. Search from level start (@offset)
1879 * + @clk upwards and if nothing there, search from start of the level
1880 * (@offset) up to @offset + clk.
1882 static int next_pending_bucket(struct timer_base
*base
, unsigned offset
,
1885 unsigned pos
, start
= offset
+ clk
;
1886 unsigned end
= offset
+ LVL_SIZE
;
1888 pos
= find_next_bit(base
->pending_map
, end
, start
);
1892 pos
= find_next_bit(base
->pending_map
, start
, offset
);
1893 return pos
< start
? pos
+ LVL_SIZE
- start
: -1;
1897 * Search the first expiring timer in the various clock levels. Caller must
1900 * Store next expiry time in base->next_expiry.
1902 static void timer_recalc_next_expiry(struct timer_base
*base
)
1904 unsigned long clk
, next
, adj
;
1905 unsigned lvl
, offset
= 0;
1907 next
= base
->clk
+ NEXT_TIMER_MAX_DELTA
;
1909 for (lvl
= 0; lvl
< LVL_DEPTH
; lvl
++, offset
+= LVL_SIZE
) {
1910 int pos
= next_pending_bucket(base
, offset
, clk
& LVL_MASK
);
1911 unsigned long lvl_clk
= clk
& LVL_CLK_MASK
;
1914 unsigned long tmp
= clk
+ (unsigned long) pos
;
1916 tmp
<<= LVL_SHIFT(lvl
);
1917 if (time_before(tmp
, next
))
1921 * If the next expiration happens before we reach
1922 * the next level, no need to check further.
1924 if (pos
<= ((LVL_CLK_DIV
- lvl_clk
) & LVL_CLK_MASK
))
1928 * Clock for the next level. If the current level clock lower
1929 * bits are zero, we look at the next level as is. If not we
1930 * need to advance it by one because that's going to be the
1931 * next expiring bucket in that level. base->clk is the next
1932 * expiring jiffy. So in case of:
1934 * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0
1937 * we have to look at all levels @index 0. With
1939 * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0
1942 * LVL0 has the next expiring bucket @index 2. The upper
1943 * levels have the next expiring bucket @index 1.
1945 * In case that the propagation wraps the next level the same
1948 * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0
1951 * So after looking at LVL0 we get:
1953 * LVL5 LVL4 LVL3 LVL2 LVL1
1956 * So no propagation from LVL1 to LVL2 because that happened
1957 * with the add already, but then we need to propagate further
1958 * from LVL2 to LVL3.
1960 * So the simple check whether the lower bits of the current
1961 * level are 0 or not is sufficient for all cases.
1963 adj
= lvl_clk
? 1 : 0;
1964 clk
>>= LVL_CLK_SHIFT
;
1968 WRITE_ONCE(base
->next_expiry
, next
);
1969 base
->next_expiry_recalc
= false;
1970 base
->timers_pending
= !(next
== base
->clk
+ NEXT_TIMER_MAX_DELTA
);
1973 #ifdef CONFIG_NO_HZ_COMMON
1975 * Check, if the next hrtimer event is before the next timer wheel
1978 static u64
cmp_next_hrtimer_event(u64 basem
, u64 expires
)
1980 u64 nextevt
= hrtimer_get_next_event();
1983 * If high resolution timers are enabled
1984 * hrtimer_get_next_event() returns KTIME_MAX.
1986 if (expires
<= nextevt
)
1990 * If the next timer is already expired, return the tick base
1991 * time so the tick is fired immediately.
1993 if (nextevt
<= basem
)
1997 * Round up to the next jiffy. High resolution timers are
1998 * off, so the hrtimers are expired in the tick and we need to
1999 * make sure that this tick really expires the timer to avoid
2000 * a ping pong of the nohz stop code.
2002 * Use DIV_ROUND_UP_ULL to prevent gcc calling __divdi3
2004 return DIV_ROUND_UP_ULL(nextevt
, TICK_NSEC
) * TICK_NSEC
;
2007 static unsigned long next_timer_interrupt(struct timer_base
*base
,
2008 unsigned long basej
)
2010 if (base
->next_expiry_recalc
)
2011 timer_recalc_next_expiry(base
);
2014 * Move next_expiry for the empty base into the future to prevent an
2015 * unnecessary raise of the timer softirq when the next_expiry value
2016 * will be reached even if there is no timer pending.
2018 * This update is also required to make timer_base::next_expiry values
2019 * easy comparable to find out which base holds the first pending timer.
2021 if (!base
->timers_pending
)
2022 WRITE_ONCE(base
->next_expiry
, basej
+ NEXT_TIMER_MAX_DELTA
);
2024 return base
->next_expiry
;
2027 static unsigned long fetch_next_timer_interrupt(unsigned long basej
, u64 basem
,
2028 struct timer_base
*base_local
,
2029 struct timer_base
*base_global
,
2030 struct timer_events
*tevt
)
2032 unsigned long nextevt
, nextevt_local
, nextevt_global
;
2035 nextevt_local
= next_timer_interrupt(base_local
, basej
);
2036 nextevt_global
= next_timer_interrupt(base_global
, basej
);
2038 local_first
= time_before_eq(nextevt_local
, nextevt_global
);
2040 nextevt
= local_first
? nextevt_local
: nextevt_global
;
2043 * If the @nextevt is at max. one tick away, use @nextevt and store
2044 * it in the local expiry value. The next global event is irrelevant in
2045 * this case and can be left as KTIME_MAX.
2047 if (time_before_eq(nextevt
, basej
+ 1)) {
2048 /* If we missed a tick already, force 0 delta */
2049 if (time_before(nextevt
, basej
))
2051 tevt
->local
= basem
+ (u64
)(nextevt
- basej
) * TICK_NSEC
;
2054 * This is required for the remote check only but it doesn't
2055 * hurt, when it is done for both call sites:
2057 * * The remote callers will only take care of the global timers
2058 * as local timers will be handled by CPU itself. When not
2059 * updating tevt->global with the already missed first global
2060 * timer, it is possible that it will be missed completely.
2062 * * The local callers will ignore the tevt->global anyway, when
2063 * nextevt is max. one tick away.
2066 tevt
->global
= tevt
->local
;
2071 * Update tevt.* values:
2073 * If the local queue expires first, then the global event can be
2074 * ignored. If the global queue is empty, nothing to do either.
2076 if (!local_first
&& base_global
->timers_pending
)
2077 tevt
->global
= basem
+ (u64
)(nextevt_global
- basej
) * TICK_NSEC
;
2079 if (base_local
->timers_pending
)
2080 tevt
->local
= basem
+ (u64
)(nextevt_local
- basej
) * TICK_NSEC
;
2087 * fetch_next_timer_interrupt_remote() - Store next timers into @tevt
2088 * @basej: base time jiffies
2089 * @basem: base time clock monotonic
2090 * @tevt: Pointer to the storage for the expiry values
2093 * Stores the next pending local and global timer expiry values in the
2094 * struct pointed to by @tevt. If a queue is empty the corresponding
2095 * field is set to KTIME_MAX. If local event expires before global
2096 * event, global event is set to KTIME_MAX as well.
2098 * Caller needs to make sure timer base locks are held (use
2099 * timer_lock_remote_bases() for this purpose).
2101 void fetch_next_timer_interrupt_remote(unsigned long basej
, u64 basem
,
2102 struct timer_events
*tevt
,
2105 struct timer_base
*base_local
, *base_global
;
2107 /* Preset local / global events */
2108 tevt
->local
= tevt
->global
= KTIME_MAX
;
2110 base_local
= per_cpu_ptr(&timer_bases
[BASE_LOCAL
], cpu
);
2111 base_global
= per_cpu_ptr(&timer_bases
[BASE_GLOBAL
], cpu
);
2113 lockdep_assert_held(&base_local
->lock
);
2114 lockdep_assert_held(&base_global
->lock
);
2116 fetch_next_timer_interrupt(basej
, basem
, base_local
, base_global
, tevt
);
2120 * timer_unlock_remote_bases - unlock timer bases of cpu
2123 * Unlocks the remote timer bases.
2125 void timer_unlock_remote_bases(unsigned int cpu
)
2126 __releases(timer_bases
[BASE_LOCAL
]->lock
)
2127 __releases(timer_bases
[BASE_GLOBAL
]->lock
)
2129 struct timer_base
*base_local
, *base_global
;
2131 base_local
= per_cpu_ptr(&timer_bases
[BASE_LOCAL
], cpu
);
2132 base_global
= per_cpu_ptr(&timer_bases
[BASE_GLOBAL
], cpu
);
2134 raw_spin_unlock(&base_global
->lock
);
2135 raw_spin_unlock(&base_local
->lock
);
2139 * timer_lock_remote_bases - lock timer bases of cpu
2142 * Locks the remote timer bases.
2144 void timer_lock_remote_bases(unsigned int cpu
)
2145 __acquires(timer_bases
[BASE_LOCAL
]->lock
)
2146 __acquires(timer_bases
[BASE_GLOBAL
]->lock
)
2148 struct timer_base
*base_local
, *base_global
;
2150 base_local
= per_cpu_ptr(&timer_bases
[BASE_LOCAL
], cpu
);
2151 base_global
= per_cpu_ptr(&timer_bases
[BASE_GLOBAL
], cpu
);
2153 lockdep_assert_irqs_disabled();
2155 raw_spin_lock(&base_local
->lock
);
2156 raw_spin_lock_nested(&base_global
->lock
, SINGLE_DEPTH_NESTING
);
2160 * timer_base_is_idle() - Return whether timer base is set idle
2162 * Returns value of local timer base is_idle value.
2164 bool timer_base_is_idle(void)
2166 return __this_cpu_read(timer_bases
[BASE_LOCAL
].is_idle
);
2169 static void __run_timer_base(struct timer_base
*base
);
2172 * timer_expire_remote() - expire global timers of cpu
2175 * Expire timers of global base of remote CPU.
2177 void timer_expire_remote(unsigned int cpu
)
2179 struct timer_base
*base
= per_cpu_ptr(&timer_bases
[BASE_GLOBAL
], cpu
);
2181 __run_timer_base(base
);
2184 static void timer_use_tmigr(unsigned long basej
, u64 basem
,
2185 unsigned long *nextevt
, bool *tick_stop_path
,
2186 bool timer_base_idle
, struct timer_events
*tevt
)
2190 if (timer_base_idle
)
2191 next_tmigr
= tmigr_cpu_new_timer(tevt
->global
);
2192 else if (tick_stop_path
)
2193 next_tmigr
= tmigr_cpu_deactivate(tevt
->global
);
2195 next_tmigr
= tmigr_quick_check(tevt
->global
);
2198 * If the CPU is the last going idle in timer migration hierarchy, make
2199 * sure the CPU will wake up in time to handle remote timers.
2200 * next_tmigr == KTIME_MAX if other CPUs are still active.
2202 if (next_tmigr
< tevt
->local
) {
2205 /* If we missed a tick already, force 0 delta */
2206 if (next_tmigr
< basem
)
2209 tmp
= div_u64(next_tmigr
- basem
, TICK_NSEC
);
2211 *nextevt
= basej
+ (unsigned long)tmp
;
2212 tevt
->local
= next_tmigr
;
2216 static void timer_use_tmigr(unsigned long basej
, u64 basem
,
2217 unsigned long *nextevt
, bool *tick_stop_path
,
2218 bool timer_base_idle
, struct timer_events
*tevt
)
2221 * Make sure first event is written into tevt->local to not miss a
2222 * timer on !SMP systems.
2224 tevt
->local
= min_t(u64
, tevt
->local
, tevt
->global
);
2226 # endif /* CONFIG_SMP */
/*
 * __get_next_timer_interrupt() - common worker behind
 * get_next_timer_interrupt() and timer_base_try_to_set_idle().
 *
 * Takes this CPU's BASE_LOCAL and BASE_GLOBAL locks, fetches the next expiry
 * via fetch_next_timer_interrupt(), optionally consults the timer migration
 * hierarchy, forwards both bases to @basej and returns
 * cmp_next_hrtimer_event(basem, tevt.local).
 *
 * NOTE(review): the parameter list is cut off after @basem in this view; the
 * `idle` used below is presumably a trailing `bool *idle` parameter (callers
 * pass NULL or a bool pointer) - confirm against the full file.
 */
2228 static inline u64
__get_next_timer_interrupt(unsigned long basej
, u64 basem
,
/* Both expiry values start out as "nothing pending". */
2231 struct timer_events tevt
= { .local
= KTIME_MAX
, .global
= KTIME_MAX
};
2232 struct timer_base
*base_local
, *base_global
;
2233 unsigned long nextevt
;
2234 bool idle_is_possible
;
2237 * When the CPU is offline, the tick is cancelled and nothing is supposed
2238 * to try to stop it.
/* NOTE(review): the body of this bail-out branch is not visible in this view. */
2240 if (WARN_ON_ONCE(cpu_is_offline(smp_processor_id()))) {
2246 base_local
= this_cpu_ptr(&timer_bases
[BASE_LOCAL
]);
2247 base_global
= this_cpu_ptr(&timer_bases
[BASE_GLOBAL
]);
/* Lock order: local base first, then global base with nesting annotation. */
2249 raw_spin_lock(&base_local
->lock
);
2250 raw_spin_lock_nested(&base_global
->lock
, SINGLE_DEPTH_NESTING
);
2252 nextevt
= fetch_next_timer_interrupt(basej
, basem
, base_local
,
2253 base_global
, &tevt
);
2256 * If the next event is only one jiffy ahead there is no need to call
2257 * timer migration hierarchy related functions. The value for the next
2258 * global timer in @tevt struct equals then KTIME_MAX. This is also
2259 * true, when the timer base is idle.
2261 * The proper timer migration hierarchy function depends on the callsite
2262 * and whether timer base is idle or not. @nextevt will be updated when
2263 * this CPU needs to handle the first timer migration hierarchy
2264 * event. See timer_use_tmigr() for detailed information.
/* The idle pointer doubles as the tick_stop_path argument of timer_use_tmigr(). */
2266 idle_is_possible
= time_after(nextevt
, basej
+ 1);
2267 if (idle_is_possible
)
2268 timer_use_tmigr(basej
, basem
, &nextevt
, idle
,
2269 base_local
->is_idle
, &tevt
);
2272 * We have a fresh next event. Check whether we can forward the
2275 __forward_timer_base(base_local
, basej
);
2276 __forward_timer_base(base_global
, basej
);
2279 * Set base->is_idle only when caller is timer_base_try_to_set_idle()
2283 * Bases are idle if the next event is more than a tick
2284 * away. Caution: @nextevt could have changed by enqueueing a
2285 * global timer into timer migration hierarchy. Therefore a new
2286 * check is required here.
2288 * If the base is marked idle then any timer add operation must
2289 * forward the base clk itself to keep granularity small. This
2290 * idle logic is only maintained for the BASE_LOCAL and
2291 * BASE_GLOBAL base, deferrable timers may still see large
2292 * granularity skew (by design).
2294 if (!base_local
->is_idle
&& time_after(nextevt
, basej
+ 1)) {
2295 base_local
->is_idle
= true;
2297 * Global timers queued locally while running in a task
2298 * in nohz_full mode need a self-IPI to kick reprogramming
2301 if (tick_nohz_full_cpu(base_local
->cpu
))
2302 base_global
->is_idle
= true;
2303 trace_timer_base_idle(true, base_local
->cpu
);
/*
 * NOTE(review): get_next_timer_interrupt() passes NULL for idle, so this
 * store has to sit under an `if (idle)` guard that is not visible here.
 */
2305 *idle
= base_local
->is_idle
;
2308 * When timer base is not set idle, undo the effect of
2309 * tmigr_cpu_deactivate() to prevent inconsistent states - active
2310 * timer base but inactive timer migration hierarchy.
2312 * When timer base was already marked idle, nothing will be
2315 if (!base_local
->is_idle
&& idle_is_possible
)
2316 tmigr_cpu_activate();
/* Unlock in reverse order of acquisition. */
2319 raw_spin_unlock(&base_global
->lock
);
2320 raw_spin_unlock(&base_local
->lock
);
/* Only the local expiry value is handed to cmp_next_hrtimer_event(). */
2322 return cmp_next_hrtimer_event(basem
, tevt
.local
);
2326 * get_next_timer_interrupt() - return the time (clock mono) of the next timer
2327 * @basej: base time jiffies
2328 * @basem: base time clock monotonic
2330 * Returns the tick aligned clock monotonic time of the next pending timer or
2331 * KTIME_MAX if no timer is pending. If timer of global base was queued into
2332 * timer migration hierarchy, first global timer is not taken into account. If
2333 * it was the last CPU of timer migration hierarchy going idle, first global
2334 * event is taken into account.
2336 u64
get_next_timer_interrupt(unsigned long basej
, u64 basem
)
2338 return __get_next_timer_interrupt(basej
, basem
, NULL
);
2342 * timer_base_try_to_set_idle() - Try to set the idle state of the timer bases
2343 * @basej: base time jiffies
2344 * @basem: base time clock monotonic
2345 * @idle: pointer to store the value of timer_base->is_idle on return;
2346 * *idle contains the information whether tick was already stopped
2348 * Returns the tick aligned clock monotonic time of the next pending timer or
2349 * KTIME_MAX if no timer is pending. When tick was already stopped KTIME_MAX is
2352 u64
timer_base_try_to_set_idle(unsigned long basej
, u64 basem
, bool *idle
)
2357 return __get_next_timer_interrupt(basej
, basem
, idle
);
2361 * timer_clear_idle - Clear the idle state of the timer base
2363 * Called with interrupts disabled
2365 void timer_clear_idle(void)
2368 * We do this unlocked. The worst outcome is a remote pinned timer
2369 * enqueue sending a pointless IPI, but taking the lock would just
2370 * make the window for sending the IPI a few instructions smaller
2371 * for the cost of taking the lock in the exit from idle
2372 * path. Required for BASE_LOCAL only.
2374 __this_cpu_write(timer_bases
[BASE_LOCAL
].is_idle
, false);
2375 if (tick_nohz_full_cpu(smp_processor_id()))
2376 __this_cpu_write(timer_bases
[BASE_GLOBAL
].is_idle
, false);
2377 trace_timer_base_idle(false, smp_processor_id());
2379 /* Activate without holding the timer_base->lock */
2380 tmigr_cpu_activate();
/*
 * Expires timers of @base for as long as jiffies has reached both base->clk
 * and base->next_expiry. Must be called with base->lock held.
 */
2385 * __run_timers - run all expired timers (if any) on this CPU.
2386 * @base: the timer vector to be processed.
2388 static inline void __run_timers(struct timer_base
*base
)
/* One list head per wheel level, filled by collect_expired_timers(). */
2390 struct hlist_head heads
[LVL_DEPTH
];
/* NOTE(review): the declaration of `levels` is not visible in this view. */
2393 lockdep_assert_held(&base
->lock
);
/*
 * NOTE(review): the statement guarded by this check is not visible here;
 * presumably an early return while a timer callback is running - confirm.
 */
2395 if (base
->running_timer
)
2398 while (time_after_eq(jiffies
, base
->clk
) &&
2399 time_after_eq(jiffies
, base
->next_expiry
)) {
2400 levels
= collect_expired_timers(base
, heads
);
2402 * The two possible reasons for not finding any expired
2403 * timer at this clk are that all matching timers have been
2404 * dequeued or no timer has been queued since
2405 * base::next_expiry was set to base::clk +
2406 * NEXT_TIMER_MAX_DELTA.
2408 WARN_ON_ONCE(!levels
&& !base
->next_expiry_recalc
2409 && base
->timers_pending
);
2411 * While executing timers, base->clk is set 1 offset ahead of
2412 * jiffies to avoid endless requeuing to current jiffies.
/* NOTE(review): the statement advancing base->clk is not visible in this view. */
2415 timer_recalc_next_expiry(base
);
/*
 * NOTE(review): heads + levels only makes sense inside a loop that walks
 * `levels` down; that loop header is not visible here.
 */
2418 expire_timers(base
, heads
+ levels
);
2422 static void __run_timer_base(struct timer_base
*base
)
2424 /* Can race against a remote CPU updating next_expiry under the lock */
2425 if (time_before(jiffies
, READ_ONCE(base
->next_expiry
)))
2428 timer_base_lock_expiry(base
);
2429 raw_spin_lock_irq(&base
->lock
);
2431 raw_spin_unlock_irq(&base
->lock
);
2432 timer_base_unlock_expiry(base
);
2435 static void run_timer_base(int index
)
2437 struct timer_base
*base
= this_cpu_ptr(&timer_bases
[index
]);
2439 __run_timer_base(base
);
2443 * This function runs timers and the timer-tq in bottom half context.
2445 static __latent_entropy
void run_timer_softirq(void)
2447 run_timer_base(BASE_LOCAL
);
2448 if (IS_ENABLED(CONFIG_NO_HZ_COMMON
)) {
2449 run_timer_base(BASE_GLOBAL
);
2450 run_timer_base(BASE_DEF
);
2452 if (is_timers_nohz_active())
2453 tmigr_handle_remote();
2458 * Called by the local, per-CPU timer interrupt on SMP.
2460 static void run_local_timers(void)
2462 struct timer_base
*base
= this_cpu_ptr(&timer_bases
[BASE_LOCAL
]);
2464 hrtimer_run_queues();
2466 for (int i
= 0; i
< NR_BASES
; i
++, base
++) {
2468 * Raise the softirq only if required.
2470 * timer_base::next_expiry can be written by a remote CPU while
2471 * holding the lock. If this write happens at the same time than
2472 * the lockless local read, sanity checker could complain about
2475 * There are two possible situations where
2476 * timer_base::next_expiry is written by a remote CPU:
2478 * 1. Remote CPU expires global timers of this CPU and updates
2479 * timer_base::next_expiry of BASE_GLOBAL afterwards in
2480 * next_timer_interrupt() or timer_recalc_next_expiry(). The
2481 * worst outcome is a superfluous raise of the timer softirq
2482 * when the not yet updated value is read.
2484 * 2. A new first pinned timer is enqueued by a remote CPU
2485 * and therefore timer_base::next_expiry of BASE_LOCAL is
2486 * updated. When this update is missed, this isn't a
2487 * problem, as an IPI is executed nevertheless when the CPU
2488 * was idle before. When the CPU wasn't idle but the update
2489 * is missed, then the timer would expire one jiffy late -
2492 * Those unlikely corner cases where the worst outcome is only a
2493 * one jiffy delay or a superfluous raise of the softirq are
2494 * not that expensive as doing the check always while holding
2497 * Possible remote writers are using WRITE_ONCE(). Local reader
2498 * uses therefore READ_ONCE().
2500 if (time_after_eq(jiffies
, READ_ONCE(base
->next_expiry
)) ||
2501 (i
== BASE_DEF
&& tmigr_requires_handle_remote())) {
2502 raise_timer_softirq(TIMER_SOFTIRQ
);
/*
 * Per-tick accounting entry point: charges the tick to the current task,
 * notifies RCU and, when CONFIG_POSIX_TIMERS is enabled, runs the POSIX CPU
 * timers.
 */
2509 * Called from the timer interrupt handler to charge one tick to the current
2510 * process. user_tick is 1 if the tick is user time, 0 for system.
2512 void update_process_times(int user_tick
)
2514 struct task_struct
*p
= current
;
2516 /* Note: this timer irq context must be accounted for as well. */
2517 account_process_tick(p
, user_tick
);
/* RCU gets the same user/system distinction as the accounting above. */
2519 rcu_sched_clock_irq(user_tick
);
/*
 * NOTE(review): neither the body of this conditional section nor its #endif
 * is visible in this view - confirm against the full file.
 */
2520 #ifdef CONFIG_IRQ_WORK
/* Compile-time switch: the call is dropped when POSIX timers are disabled. */
2525 if (IS_ENABLED(CONFIG_POSIX_TIMERS
))
2526 run_posix_cpu_timers();
2529 #ifdef CONFIG_HOTPLUG_CPU
2530 static void migrate_timer_list(struct timer_base
*new_base
, struct hlist_head
*head
)
2532 struct timer_list
*timer
;
2533 int cpu
= new_base
->cpu
;
2535 while (!hlist_empty(head
)) {
2536 timer
= hlist_entry(head
->first
, struct timer_list
, entry
);
2537 detach_timer(timer
, false);
2538 timer
->flags
= (timer
->flags
& ~TIMER_BASEMASK
) | cpu
;
2539 internal_add_timer(new_base
, timer
);
2543 int timers_prepare_cpu(unsigned int cpu
)
2545 struct timer_base
*base
;
2548 for (b
= 0; b
< NR_BASES
; b
++) {
2549 base
= per_cpu_ptr(&timer_bases
[b
], cpu
);
2550 base
->clk
= jiffies
;
2551 base
->next_expiry
= base
->clk
+ NEXT_TIMER_MAX_DELTA
;
2552 base
->next_expiry_recalc
= false;
2553 base
->timers_pending
= false;
2554 base
->is_idle
= false;
2559 int timers_dead_cpu(unsigned int cpu
)
2561 struct timer_base
*old_base
;
2562 struct timer_base
*new_base
;
2565 for (b
= 0; b
< NR_BASES
; b
++) {
2566 old_base
= per_cpu_ptr(&timer_bases
[b
], cpu
);
2567 new_base
= get_cpu_ptr(&timer_bases
[b
]);
2569 * The caller is globally serialized and nobody else
2570 * takes two locks at once, deadlock is not possible.
2572 raw_spin_lock_irq(&new_base
->lock
);
2573 raw_spin_lock_nested(&old_base
->lock
, SINGLE_DEPTH_NESTING
);
2576 * The current CPUs base clock might be stale. Update it
2577 * before moving the timers over.
2579 forward_timer_base(new_base
);
2581 WARN_ON_ONCE(old_base
->running_timer
);
2582 old_base
->running_timer
= NULL
;
2584 for (i
= 0; i
< WHEEL_SIZE
; i
++)
2585 migrate_timer_list(new_base
, old_base
->vectors
+ i
);
2587 raw_spin_unlock(&old_base
->lock
);
2588 raw_spin_unlock_irq(&new_base
->lock
);
2589 put_cpu_ptr(&timer_bases
);
2594 #endif /* CONFIG_HOTPLUG_CPU */
2596 static void __init
init_timer_cpu(int cpu
)
2598 struct timer_base
*base
;
2601 for (i
= 0; i
< NR_BASES
; i
++) {
2602 base
= per_cpu_ptr(&timer_bases
[i
], cpu
);
2604 raw_spin_lock_init(&base
->lock
);
2605 base
->clk
= jiffies
;
2606 base
->next_expiry
= base
->clk
+ NEXT_TIMER_MAX_DELTA
;
2607 timer_base_init_expiry_lock(base
);
2611 static void __init
init_timer_cpus(void)
2615 for_each_possible_cpu(cpu
)
2616 init_timer_cpu(cpu
);
2619 void __init
init_timers(void)
2622 posix_cputimers_init_work();
2623 open_softirq(TIMER_SOFTIRQ
, run_timer_softirq
);