// SPDX-License-Identifier: GPL-2.0+
/*
 * Read-Copy Update mechanism for mutual exclusion (tree-based version)
 *
 * Copyright IBM Corporation, 2008
 *
 * Authors: Dipankar Sarma <dipankar@in.ibm.com>
 *	    Manfred Spraul <manfred@colorfullife.com>
 *	    Paul E. McKenney <paulmck@linux.ibm.com>
 *
 * Based on the original work by Paul McKenney <paulmck@linux.ibm.com>
 * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
 *
 * For detailed explanation of Read-Copy Update mechanism see -
 *	Documentation/RCU
 */

#define pr_fmt(fmt) "rcu: " fmt
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/spinlock.h>
#include <linux/smp.h>
#include <linux/rcupdate_wait.h>
#include <linux/interrupt.h>
#include <linux/sched.h>
#include <linux/sched/debug.h>
#include <linux/nmi.h>
#include <linux/atomic.h>
#include <linux/bitops.h>
#include <linux/export.h>
#include <linux/completion.h>
#include <linux/kmemleak.h>
#include <linux/moduleparam.h>
#include <linux/panic.h>
#include <linux/panic_notifier.h>
#include <linux/percpu.h>
#include <linux/notifier.h>
#include <linux/cpu.h>
#include <linux/mutex.h>
#include <linux/time.h>
#include <linux/kernel_stat.h>
#include <linux/wait.h>
#include <linux/kthread.h>
#include <uapi/linux/sched/types.h>
#include <linux/prefetch.h>
#include <linux/delay.h>
#include <linux/random.h>
#include <linux/trace_events.h>
#include <linux/suspend.h>
#include <linux/ftrace.h>
#include <linux/tick.h>
#include <linux/sysrq.h>
#include <linux/kprobes.h>
#include <linux/gfp.h>
#include <linux/oom.h>
#include <linux/smpboot.h>
#include <linux/jiffies.h>
#include <linux/slab.h>
#include <linux/sched/isolation.h>
#include <linux/sched/clock.h>
#include <linux/vmalloc.h>
#include <linux/kasan.h>
#include <linux/context_tracking.h>
#include "../time/tick-internal.h"

#include "tree.h"
#include "rcu.h"
#include "rcu_segcblist.h"
#ifdef MODULE_PARAM_PREFIX
#undef MODULE_PARAM_PREFIX
#endif
#define MODULE_PARAM_PREFIX "rcutree."
/* Data structures. */

static void rcu_sr_normal_gp_cleanup_work(struct work_struct *);

static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, rcu_data) = {
	.gpwrap = true,
};
static struct rcu_state rcu_state = {
	.level = { &rcu_state.node[0] },
	.gp_state = RCU_GP_IDLE,
	.gp_seq = (0UL - 300UL) << RCU_SEQ_CTR_SHIFT,
	.barrier_mutex = __MUTEX_INITIALIZER(rcu_state.barrier_mutex),
	.barrier_lock = __RAW_SPIN_LOCK_UNLOCKED(rcu_state.barrier_lock),
	.name = RCU_NAME,
	.abbr = RCU_ABBR,
	.exp_mutex = __MUTEX_INITIALIZER(rcu_state.exp_mutex),
	.exp_wake_mutex = __MUTEX_INITIALIZER(rcu_state.exp_wake_mutex),
	.ofl_lock = __ARCH_SPIN_LOCK_UNLOCKED,
	.srs_cleanup_work = __WORK_INITIALIZER(rcu_state.srs_cleanup_work,
		rcu_sr_normal_gp_cleanup_work),
	.srs_cleanups_pending = ATOMIC_INIT(0),
#ifdef CONFIG_RCU_NOCB_CPU
	.nocb_mutex = __MUTEX_INITIALIZER(rcu_state.nocb_mutex),
#endif
};
/* Dump rcu_node combining tree at boot to verify correct setup. */
static bool dump_tree;
module_param(dump_tree, bool, 0444);
/* By default, use RCU_SOFTIRQ instead of rcuc kthreads. */
static bool use_softirq = !IS_ENABLED(CONFIG_PREEMPT_RT);
#ifndef CONFIG_PREEMPT_RT
module_param(use_softirq, bool, 0444);
#endif
/* Control rcu_node-tree auto-balancing at boot time. */
static bool rcu_fanout_exact;
module_param(rcu_fanout_exact, bool, 0444);
/* Increase (but not decrease) the RCU_FANOUT_LEAF at boot time. */
static int rcu_fanout_leaf = RCU_FANOUT_LEAF;
module_param(rcu_fanout_leaf, int, 0444);
int rcu_num_lvls __read_mostly = RCU_NUM_LVLS;
/* Number of rcu_nodes at specified level. */
int num_rcu_lvl[] = NUM_RCU_LVL_INIT;
int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */
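
/*
 * Illustrative note (not part of the original source): because of the
 * MODULE_PARAM_PREFIX defined above, the boot-time parameters in this file
 * are set on the kernel command line with an "rcutree." prefix, for example:
 *
 *	rcutree.dump_tree=1 rcutree.rcu_fanout_leaf=16 rcutree.use_softirq=0
 *
 * Parameters registered with mode 0444 can only be set at boot, but are
 * typically still visible under /sys/module/rcutree/parameters/.
 */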
/*
 * The rcu_scheduler_active variable is initialized to the value
 * RCU_SCHEDULER_INACTIVE and transitions to RCU_SCHEDULER_INIT just before
 * the first task is spawned.  So when this variable is RCU_SCHEDULER_INACTIVE,
 * RCU can assume that there is but one task, allowing RCU to (for example)
 * optimize synchronize_rcu() to a simple barrier().  When this variable
 * is RCU_SCHEDULER_INIT, RCU must actually do all the hard work required
 * to detect real grace periods.  This variable is also used to suppress
 * boot-time false positives from lockdep-RCU error checking.  Finally, it
 * transitions from RCU_SCHEDULER_INIT to RCU_SCHEDULER_RUNNING after RCU
 * is fully initialized, including all of its kthreads having been spawned.
 */
int rcu_scheduler_active __read_mostly;
EXPORT_SYMBOL_GPL(rcu_scheduler_active);
/*
 * The rcu_scheduler_fully_active variable transitions from zero to one
 * during the early_initcall() processing, which is after the scheduler
 * is capable of creating new tasks.  So RCU processing (for example,
 * creating tasks for RCU priority boosting) must be delayed until after
 * rcu_scheduler_fully_active transitions from zero to one.  We also
 * currently delay invocation of any RCU callbacks until after this point.
 *
 * It might later prove better for people registering RCU callbacks during
 * early boot to take responsibility for these callbacks, but one step at
 * a time.
 */
static int rcu_scheduler_fully_active __read_mostly;
static void rcu_report_qs_rnp(unsigned long mask, struct rcu_node *rnp,
			      unsigned long gps, unsigned long flags);
static struct task_struct *rcu_boost_task(struct rcu_node *rnp);
static void invoke_rcu_core(void);
static void rcu_report_exp_rdp(struct rcu_data *rdp);
static void sync_sched_exp_online_cleanup(int cpu);
static void check_cb_ovld_locked(struct rcu_data *rdp, struct rcu_node *rnp);
static bool rcu_rdp_is_offloaded(struct rcu_data *rdp);
static bool rcu_rdp_cpu_online(struct rcu_data *rdp);
static bool rcu_init_invoked(void);
static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf);
static void rcu_init_new_rnp(struct rcu_node *rnp_leaf);
/*
 * rcuc/rcub/rcuop kthread realtime priority. The "rcuop"
 * real-time priority (enabling/disabling) is controlled by
 * the extra CONFIG_RCU_NOCB_CPU_CB_BOOST configuration.
 */
static int kthread_prio = IS_ENABLED(CONFIG_RCU_BOOST) ? 1 : 0;
module_param(kthread_prio, int, 0444);
/* Delay in jiffies for grace-period initialization delays, debug only. */

static int gp_preinit_delay;
module_param(gp_preinit_delay, int, 0444);
static int gp_init_delay;
module_param(gp_init_delay, int, 0444);
static int gp_cleanup_delay;
module_param(gp_cleanup_delay, int, 0444);
static int nohz_full_patience_delay;
module_param(nohz_full_patience_delay, int, 0444);
static int nohz_full_patience_delay_jiffies;

// Add delay to rcu_read_unlock() for strict grace periods.
static int rcu_unlock_delay;
#ifdef CONFIG_RCU_STRICT_GRACE_PERIOD
module_param(rcu_unlock_delay, int, 0444);
#endif
/*
 * This rcu parameter is runtime-read-only. It reflects
 * a minimum allowed number of objects which can be cached
 * per-CPU. Object size is equal to one page. This value
 * can be changed at boot time.
 */
static int rcu_min_cached_objs = 5;
module_param(rcu_min_cached_objs, int, 0444);

// A page shrinker can ask for pages to be freed to make them
// available for other parts of the system. This usually happens
// under low memory conditions, and in that case we should also
// defer page-cache filling for a short time period.
//
// The default value is 5 seconds, which is long enough to reduce
// interference with the shrinker while it asks other systems to
// drain their caches.
static int rcu_delay_page_cache_fill_msec = 5000;
module_param(rcu_delay_page_cache_fill_msec, int, 0444);
/* Retrieve RCU kthreads priority for rcutorture */
int rcu_get_gp_kthreads_prio(void)
{
	return kthread_prio;
}
EXPORT_SYMBOL_GPL(rcu_get_gp_kthreads_prio);
/*
 * Number of grace periods between delays, normalized by the duration of
 * the delay.  The longer the delay, the more the grace periods between
 * each delay.  The reason for this normalization is that it means that,
 * for non-zero delays, the overall slowdown of grace periods is constant
 * regardless of the duration of the delay.  This arrangement balances
 * the need for long delays to increase some race probabilities with the
 * need for fast grace periods to increase other race probabilities.
 */
#define PER_RCU_NODE_PERIOD 3	/* Number of grace periods between delays for debugging. */
/*
 * Return true if an RCU grace period is in progress.  The READ_ONCE()s
 * permit this function to be invoked without holding the root rcu_node
 * structure's ->lock, but of course results can be subject to change.
 */
static int rcu_gp_in_progress(void)
{
	return rcu_seq_state(rcu_seq_current(&rcu_state.gp_seq));
}
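
/*
 * Illustrative sketch (not part of the original source): rcu_state.gp_seq
 * packs a grace-period counter and a state field into a single unsigned
 * long.  The low bits (masked by RCU_SEQ_STATE_MASK) hold the state, which
 * is non-zero while a grace period is in progress, and the remaining bits,
 * starting at RCU_SEQ_CTR_SHIFT, hold the counter, roughly:
 *
 *	seq = READ_ONCE(rcu_state.gp_seq);
 *	ctr = seq >> RCU_SEQ_CTR_SHIFT;		// number of grace periods so far
 *	state = seq & RCU_SEQ_STATE_MASK;	// zero means idle
 *
 * which is why rcu_gp_in_progress() above reduces to a single
 * rcu_seq_state(rcu_seq_current()) test.
 */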
/*
 * Return the number of callbacks queued on the specified CPU.
 * Handles both the nocbs and normal cases.
 */
static long rcu_get_n_cbs_cpu(int cpu)
{
	struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);

	if (rcu_segcblist_is_enabled(&rdp->cblist))
		return rcu_segcblist_n_cbs(&rdp->cblist);
	return 0;
}
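
/*
 * Illustrative example (not part of the original source): the callbacks
 * counted above are typically queued via call_rcu(), for instance from a
 * hypothetical user such as:
 *
 *	static void example_free_cb(struct rcu_head *rhp)
 *	{
 *		struct example *p = container_of(rhp, struct example, rh);
 *
 *		kfree(p);
 *	}
 *	...
 *	call_rcu(&p->rh, example_free_cb);
 *
 * Each such callback lands in the queueing CPU's ->cblist, which is what
 * rcu_get_n_cbs_cpu() reports for both the normal and the nocb cases.
 */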
/**
 * rcu_softirq_qs - Provide a set of RCU quiescent states in softirq processing
 *
 * Mark a quiescent state for RCU, Tasks RCU, and Tasks Trace RCU.
 * This is a special-purpose function to be used in the softirq
 * infrastructure and perhaps the occasional long-running softirq
 * handler.
 *
 * Note that from RCU's viewpoint, a call to rcu_softirq_qs() is
 * equivalent to momentarily completely enabling preemption.  For
 * example, given this code::
 *
 *	local_bh_disable();
 *	do_something();
 *	rcu_softirq_qs();  // A
 *	do_something_else();
 *	local_bh_enable();  // B
 *
 * A call to synchronize_rcu() that began concurrently with the
 * call to do_something() would be guaranteed to wait only until
 * execution reached statement A.  Without that rcu_softirq_qs(),
 * that same synchronize_rcu() would instead be guaranteed to wait
 * until execution reached statement B.
 */
void rcu_softirq_qs(void)
{
	RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) ||
			 lock_is_held(&rcu_lock_map) ||
			 lock_is_held(&rcu_sched_lock_map),
			 "Illegal rcu_softirq_qs() in RCU read-side critical section");
	rcu_qs();
	rcu_preempt_deferred_qs(current);
	rcu_tasks_qs(current, false);
}
/*
 * Reset the current CPU's RCU_WATCHING counter to indicate that the
 * newly onlined CPU is no longer in an extended quiescent state.
 * This will either leave the counter unchanged, or increment it
 * to the next non-quiescent value.
 *
 * The non-atomic test/increment sequence works because the upper bits
 * of the ->state variable are manipulated only by the corresponding CPU,
 * or when the corresponding CPU is offline.
 */
static void rcu_watching_online(void)
{
	if (ct_rcu_watching() & CT_RCU_WATCHING)
		return;
	ct_state_inc(CT_RCU_WATCHING);
}
/*
 * Return true if the snapshot returned from ct_rcu_watching()
 * indicates that RCU is in an extended quiescent state.
 */
static bool rcu_watching_snap_in_eqs(int snap)
{
	return !(snap & CT_RCU_WATCHING);
}
/**
 * rcu_watching_snap_stopped_since() - Has RCU stopped watching a given CPU
 * since the specified @snap?
 *
 * @rdp: The rcu_data corresponding to the CPU for which to check EQS.
 * @snap: rcu_watching snapshot taken when the CPU wasn't in an EQS.
 *
 * Returns true if the CPU corresponding to @rdp has spent some time in an
 * extended quiescent state since @snap. Note that this doesn't check if it
 * /still/ is in an EQS, just that it went through one since @snap.
 *
 * This is meant to be used in a loop waiting for a CPU to go through an EQS.
 */
static bool rcu_watching_snap_stopped_since(struct rcu_data *rdp, int snap)
{
	/*
	 * The first failing snapshot is already ordered against the accesses
	 * performed by the remote CPU after it exits idle.
	 *
	 * The second snapshot therefore only needs to order against accesses
	 * performed by the remote CPU prior to entering idle and therefore can
	 * rely solely on acquire semantics.
	 */
	if (WARN_ON_ONCE(rcu_watching_snap_in_eqs(snap)))
		return true;

	return snap != ct_rcu_watching_cpu_acquire(rdp->cpu);
}
/*
 * Return true if the referenced integer is zero while the specified
 * CPU remains within a single extended quiescent state.
 */
bool rcu_watching_zero_in_eqs(int cpu, int *vp)
{
	int snap;

	// If not quiescent, force back to earlier extended quiescent state.
	snap = ct_rcu_watching_cpu(cpu) & ~CT_RCU_WATCHING;
	smp_rmb(); // Order CT state and *vp reads.
	if (READ_ONCE(*vp))
		return false;  // Non-zero, so report failure;
	smp_rmb(); // Order *vp read and CT state re-read.

	// If still in the same extended quiescent state, we are good!
	return snap == ct_rcu_watching_cpu(cpu);
}
/*
 * Let the RCU core know that this CPU has gone through the scheduler,
 * which is a quiescent state.  This is called when the need for a
 * quiescent state is urgent, so we burn an atomic operation and full
 * memory barriers to let the RCU core know about it, regardless of what
 * this CPU might (or might not) do in the near future.
 *
 * We inform the RCU core by emulating a zero-duration dyntick-idle period.
 *
 * The caller must have disabled interrupts and must not be idle.
 */
notrace void rcu_momentary_eqs(void)
{
	int seq;

	raw_cpu_write(rcu_data.rcu_need_heavy_qs, false);
	seq = ct_state_inc(2 * CT_RCU_WATCHING);
	/* It is illegal to call this from idle state. */
	WARN_ON_ONCE(!(seq & CT_RCU_WATCHING));
	rcu_preempt_deferred_qs(current);
}
EXPORT_SYMBOL_GPL(rcu_momentary_eqs);
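
/*
 * Illustrative sketch (not part of the original source) of a valid calling
 * context for rcu_momentary_eqs(): a non-idle task with interrupts
 * disabled, checking the urgency flag so that the atomic operation stays
 * off the common fast path:
 *
 *	local_irq_save(flags);
 *	...
 *	if (unlikely(raw_cpu_read(rcu_data.rcu_need_heavy_qs)))
 *		rcu_momentary_eqs();
 *	...
 *	local_irq_restore(flags);
 *
 * Real callers such as rcu_note_context_switch() use a similar guard.
 */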
/*
 * rcu_is_cpu_rrupt_from_idle - see if 'interrupted' from idle
 *
 * If the current CPU is idle and running at a first-level (not nested)
 * interrupt, or directly, from idle, return true.
 *
 * The caller must have at least disabled IRQs.
 */
static int rcu_is_cpu_rrupt_from_idle(void)
{
	long nesting;

	/*
	 * Usually called from the tick; but also used from smp_function_call()
	 * for expedited grace periods. This latter can result in running from
	 * the idle task, instead of an actual IPI.
	 */
	lockdep_assert_irqs_disabled();

	/* Check for counter underflows */
	RCU_LOCKDEP_WARN(ct_nesting() < 0,
			 "RCU nesting counter underflow!");
	RCU_LOCKDEP_WARN(ct_nmi_nesting() <= 0,
			 "RCU nmi_nesting counter underflow/zero!");

	/* Are we at first interrupt nesting level? */
	nesting = ct_nmi_nesting();
	if (nesting > 1)
		return false;

	/*
	 * If we're not in an interrupt, we must be in the idle task!
	 */
	WARN_ON_ONCE(!nesting && !is_idle_task(current));

	/* Does CPU appear to be idle from an RCU standpoint? */
	return ct_nesting() == 0;
}
#define DEFAULT_RCU_BLIMIT (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) ? 1000 : 10)
				// Maximum callbacks per rcu_do_batch ...
#define DEFAULT_MAX_RCU_BLIMIT 10000 // ... even during callback flood.
static long blimit = DEFAULT_RCU_BLIMIT;
#define DEFAULT_RCU_QHIMARK 10000 // If this many pending, ignore blimit.
static long qhimark = DEFAULT_RCU_QHIMARK;
#define DEFAULT_RCU_QLOMARK 100   // Once only this many pending, use blimit.
static long qlowmark = DEFAULT_RCU_QLOMARK;
#define DEFAULT_RCU_QOVLD_MULT 2
#define DEFAULT_RCU_QOVLD (DEFAULT_RCU_QOVLD_MULT * DEFAULT_RCU_QHIMARK)
static long qovld = DEFAULT_RCU_QOVLD; // If this many pending, hammer QS.
static long qovld_calc = -1;	  // No pre-initialization lock acquisitions!

module_param(blimit, long, 0444);
module_param(qhimark, long, 0444);
module_param(qlowmark, long, 0444);
module_param(qovld, long, 0444);
static ulong jiffies_till_first_fqs = IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) ? 0 : ULONG_MAX;
static ulong jiffies_till_next_fqs = ULONG_MAX;
static bool rcu_kick_kthreads;
static int rcu_divisor = 7;
module_param(rcu_divisor, int, 0644);

/* Force an exit from rcu_do_batch() after 3 milliseconds. */
static long rcu_resched_ns = 3 * NSEC_PER_MSEC;
module_param(rcu_resched_ns, long, 0644);

/*
 * How long the grace period must be before we start recruiting
 * quiescent-state help from rcu_note_context_switch().
 */
static ulong jiffies_till_sched_qs = ULONG_MAX;
module_param(jiffies_till_sched_qs, ulong, 0444);
static ulong jiffies_to_sched_qs; /* See adjust_jiffies_till_sched_qs(). */
module_param(jiffies_to_sched_qs, ulong, 0444); /* Display only! */
/*
 * Make sure that we give the grace-period kthread time to detect any
 * idle CPUs before taking active measures to force quiescent states.
 * However, don't go below 100 milliseconds, adjusted upwards for really
 * large systems.
 */
static void adjust_jiffies_till_sched_qs(void)
{
	unsigned long j;

	/* If jiffies_till_sched_qs was specified, respect the request. */
	if (jiffies_till_sched_qs != ULONG_MAX) {
		WRITE_ONCE(jiffies_to_sched_qs, jiffies_till_sched_qs);
		return;
	}
	/* Otherwise, set to third fqs scan, but bound below on large system. */
	j = READ_ONCE(jiffies_till_first_fqs) +
		      2 * READ_ONCE(jiffies_till_next_fqs);
	if (j < HZ / 10 + nr_cpu_ids / RCU_JIFFIES_FQS_DIV)
		j = HZ / 10 + nr_cpu_ids / RCU_JIFFIES_FQS_DIV;
	pr_info("RCU calculated value of scheduler-enlistment delay is %ld jiffies.\n", j);
	WRITE_ONCE(jiffies_to_sched_qs, j);
}
static int param_set_first_fqs_jiffies(const char *val, const struct kernel_param *kp)
{
	ulong j;
	int ret = kstrtoul(val, 0, &j);

	if (!ret) {
		WRITE_ONCE(*(ulong *)kp->arg, (j > HZ) ? HZ : j);
		adjust_jiffies_till_sched_qs();
	}
	return ret;
}

static int param_set_next_fqs_jiffies(const char *val, const struct kernel_param *kp)
{
	ulong j;
	int ret = kstrtoul(val, 0, &j);

	if (!ret) {
		WRITE_ONCE(*(ulong *)kp->arg, (j > HZ) ? HZ : (j ?: 1));
		adjust_jiffies_till_sched_qs();
	}
	return ret;
}
static const struct kernel_param_ops first_fqs_jiffies_ops = {
	.set = param_set_first_fqs_jiffies,
	.get = param_get_ulong,
};

static const struct kernel_param_ops next_fqs_jiffies_ops = {
	.set = param_set_next_fqs_jiffies,
	.get = param_get_ulong,
};

module_param_cb(jiffies_till_first_fqs, &first_fqs_jiffies_ops, &jiffies_till_first_fqs, 0644);
module_param_cb(jiffies_till_next_fqs, &next_fqs_jiffies_ops, &jiffies_till_next_fqs, 0644);
module_param(rcu_kick_kthreads, bool, 0644);
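
/*
 * Illustrative note (not part of the original source): the two ->set
 * handlers above clamp whatever value the administrator supplies, roughly:
 *
 *	// jiffies_till_first_fqs: anything above HZ becomes HZ
 *	WRITE_ONCE(jiffies_till_first_fqs, (j > HZ) ? HZ : j);
 *	// jiffies_till_next_fqs: additionally never allowed to be zero
 *	WRITE_ONCE(jiffies_till_next_fqs, (j > HZ) ? HZ : (j ?: 1));
 *
 * Because these parameters are registered with mode 0644, they may also be
 * updated at runtime through /sys/module/rcutree/parameters/, with the same
 * clamping applied and jiffies_to_sched_qs recomputed afterwards.
 */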
static void force_qs_rnp(int (*f)(struct rcu_data *rdp));
static int rcu_pending(int user);
/*
 * Return the number of RCU GPs completed thus far for debug & stats.
 */
unsigned long rcu_get_gp_seq(void)
{
	return READ_ONCE(rcu_state.gp_seq);
}
EXPORT_SYMBOL_GPL(rcu_get_gp_seq);
/*
 * Return the number of RCU expedited batches completed thus far for
 * debug & stats.  Odd numbers mean that a batch is in progress, even
 * numbers mean idle.  The value returned will thus be roughly double
 * the cumulative batches since boot.
 */
unsigned long rcu_exp_batches_completed(void)
{
	return rcu_state.expedited_sequence;
}
EXPORT_SYMBOL_GPL(rcu_exp_batches_completed);
/*
 * Return the root node of the rcu_state structure.
 */
static struct rcu_node *rcu_get_root(void)
{
	return &rcu_state.node[0];
}
/*
 * Send along grace-period-related data for rcutorture diagnostics.
 */
void rcutorture_get_gp_data(int *flags, unsigned long *gp_seq)
{
	*flags = READ_ONCE(rcu_state.gp_flags);
	*gp_seq = rcu_seq_current(&rcu_state.gp_seq);
}
EXPORT_SYMBOL_GPL(rcutorture_get_gp_data);
#if defined(CONFIG_NO_HZ_FULL) && (!defined(CONFIG_GENERIC_ENTRY) || !defined(CONFIG_KVM_XFER_TO_GUEST_WORK))
/*
 * An empty function that will trigger a reschedule on
 * IRQ tail once IRQs get re-enabled on userspace/guest resume.
 */
static void late_wakeup_func(struct irq_work *work)
{
}

static DEFINE_PER_CPU(struct irq_work, late_wakeup_work) =
	IRQ_WORK_INIT(late_wakeup_func);

/*
 * If either:
 *
 * 1) the task is about to enter guest mode and $ARCH doesn't support KVM generic work
 * 2) the task is about to enter user mode and $ARCH doesn't support generic entry.
 *
 * In these cases the late RCU wake ups aren't supported in the resched loops and our
 * last resort is to fire a local irq_work that will trigger a reschedule once IRQs
 * get re-enabled again.
 */
noinstr void rcu_irq_work_resched(void)
{
	struct rcu_data *rdp = this_cpu_ptr(&rcu_data);

	if (IS_ENABLED(CONFIG_GENERIC_ENTRY) && !(current->flags & PF_VCPU))
		return;

	if (IS_ENABLED(CONFIG_KVM_XFER_TO_GUEST_WORK) && (current->flags & PF_VCPU))
		return;

	instrumentation_begin();
	if (do_nocb_deferred_wakeup(rdp) && need_resched()) {
		irq_work_queue(this_cpu_ptr(&late_wakeup_work));
	}
	instrumentation_end();
}
#endif /* #if defined(CONFIG_NO_HZ_FULL) && (!defined(CONFIG_GENERIC_ENTRY) || !defined(CONFIG_KVM_XFER_TO_GUEST_WORK)) */
#ifdef CONFIG_PROVE_RCU
/**
 * rcu_irq_exit_check_preempt - Validate that scheduling is possible
 */
void rcu_irq_exit_check_preempt(void)
{
	lockdep_assert_irqs_disabled();

	RCU_LOCKDEP_WARN(ct_nesting() <= 0,
			 "RCU nesting counter underflow/zero!");
	RCU_LOCKDEP_WARN(ct_nmi_nesting() !=
			 CT_NESTING_IRQ_NONIDLE,
			 "Bad RCU nmi_nesting counter\n");
	RCU_LOCKDEP_WARN(!rcu_is_watching_curr_cpu(),
			 "RCU in extended quiescent state!");
}
#endif /* #ifdef CONFIG_PROVE_RCU */
#ifdef CONFIG_NO_HZ_FULL
/**
 * __rcu_irq_enter_check_tick - Enable scheduler tick on CPU if RCU needs it.
 *
 * The scheduler tick is not normally enabled when CPUs enter the kernel
 * from nohz_full userspace execution.  After all, nohz_full userspace
 * execution is an RCU quiescent state and the time executing in the kernel
 * is quite short.  Except of course when it isn't.  And it is not hard to
 * cause a large system to spend tens of seconds or even minutes looping
 * in the kernel, which can cause a number of problems, including RCU CPU
 * stall warnings.
 *
 * Therefore, if a nohz_full CPU fails to report a quiescent state
 * in a timely manner, the RCU grace-period kthread sets that CPU's
 * ->rcu_urgent_qs flag with the expectation that the next interrupt or
 * exception will invoke this function, which will turn on the scheduler
 * tick, which will enable RCU to detect that CPU's quiescent states,
 * for example, due to cond_resched() calls in CONFIG_PREEMPT=n kernels.
 * The tick will be disabled once a quiescent state is reported for
 * this CPU.
 *
 * Of course, in carefully tuned systems, there might never be an
 * interrupt or exception.  In that case, the RCU grace-period kthread
 * will eventually cause one to happen.  However, in less carefully
 * controlled environments, this function allows RCU to get what it
 * needs without creating otherwise useless interruptions.
 */
void __rcu_irq_enter_check_tick(void)
{
	struct rcu_data *rdp = this_cpu_ptr(&rcu_data);

	// If we're here from NMI there's nothing to do.
	if (in_nmi())
		return;

	RCU_LOCKDEP_WARN(!rcu_is_watching_curr_cpu(),
			 "Illegal rcu_irq_enter_check_tick() from extended quiescent state");

	if (!tick_nohz_full_cpu(rdp->cpu) ||
	    !READ_ONCE(rdp->rcu_urgent_qs) ||
	    READ_ONCE(rdp->rcu_forced_tick)) {
		// RCU doesn't need nohz_full help from this CPU, or it is
		// already getting that help.
		return;
	}

	// We get here only when not in an extended quiescent state and
	// from interrupts (as opposed to NMIs).  Therefore, (1) RCU is
	// already watching and (2) The fact that we are in an interrupt
	// handler and that the rcu_node lock is an irq-disabled lock
	// prevents self-deadlock.  So we can safely recheck under the lock.
	// Note that the nohz_full state currently cannot change.
	raw_spin_lock_rcu_node(rdp->mynode);
	if (READ_ONCE(rdp->rcu_urgent_qs) && !rdp->rcu_forced_tick) {
		// A nohz_full CPU is in the kernel and RCU needs a
		// quiescent state.  Turn on the tick!
		WRITE_ONCE(rdp->rcu_forced_tick, true);
		tick_dep_set_cpu(rdp->cpu, TICK_DEP_BIT_RCU);
	}
	raw_spin_unlock_rcu_node(rdp->mynode);
}
NOKPROBE_SYMBOL(__rcu_irq_enter_check_tick);
#endif /* CONFIG_NO_HZ_FULL */
/*
 * Check to see if any future non-offloaded RCU-related work will need
 * to be done by the current CPU, even if none need be done immediately,
 * returning 1 if so.  This function is part of the RCU implementation;
 * it is -not- an exported member of the RCU API.  This is used by
 * the idle-entry code to figure out whether it is safe to disable the
 * scheduler-clock interrupt.
 *
 * Just check whether or not this CPU has non-offloaded RCU callbacks
 * queued.
 */
int rcu_needs_cpu(void)
{
	return !rcu_segcblist_empty(&this_cpu_ptr(&rcu_data)->cblist) &&
		!rcu_rdp_is_offloaded(this_cpu_ptr(&rcu_data));
}
/*
 * If any sort of urgency was applied to the current CPU (for example,
 * the scheduler-clock interrupt was enabled on a nohz_full CPU) in order
 * to get to a quiescent state, disable it.
 */
static void rcu_disable_urgency_upon_qs(struct rcu_data *rdp)
{
	raw_lockdep_assert_held_rcu_node(rdp->mynode);
	WRITE_ONCE(rdp->rcu_urgent_qs, false);
	WRITE_ONCE(rdp->rcu_need_heavy_qs, false);
	if (tick_nohz_full_cpu(rdp->cpu) && rdp->rcu_forced_tick) {
		tick_dep_clear_cpu(rdp->cpu, TICK_DEP_BIT_RCU);
		WRITE_ONCE(rdp->rcu_forced_tick, false);
	}
}
/**
 * rcu_is_watching - RCU read-side critical sections permitted on current CPU?
 *
 * Return @true if RCU is watching the running CPU and @false otherwise.
 * A @true return means that this CPU can safely enter RCU read-side
 * critical sections.
 *
 * Although calls to rcu_is_watching() from most parts of the kernel
 * will return @true, there are important exceptions.  For example, if the
 * current CPU is deep within its idle loop, in kernel entry/exit code,
 * or offline, rcu_is_watching() will return @false.
 *
 * Make notrace because it can be called by the internal functions of
 * ftrace, and making this notrace removes unnecessary recursion calls.
 */
notrace bool rcu_is_watching(void)
{
	bool ret;

	preempt_disable_notrace();
	ret = rcu_is_watching_curr_cpu();
	preempt_enable_notrace();
	return ret;
}
EXPORT_SYMBOL_GPL(rcu_is_watching);
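
/*
 * Illustrative example (not part of the original source): code that might
 * run from idle or early entry paths can use rcu_is_watching() to verify
 * that an RCU read-side critical section is currently legal, for example:
 *
 *	RCU_LOCKDEP_WARN(!rcu_is_watching(),
 *			 "example: RCU used while RCU is not watching");
 *	rcu_read_lock();
 *	...
 *	rcu_read_unlock();
 *
 * The warning string above is illustrative only.
 */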
/*
 * If a holdout task is actually running, request an urgent quiescent
 * state from its CPU.  This is unsynchronized, so migrations can cause
 * the request to go to the wrong CPU.  Which is OK, all that will happen
 * is that the CPU's next context switch will be a bit slower and next
 * time around this task will generate another request.
 */
void rcu_request_urgent_qs_task(struct task_struct *t)
{
	int cpu;

	barrier();
	cpu = task_cpu(t);
	if (!task_curr(t))
		return; /* This task is not running on that CPU. */
	smp_store_release(per_cpu_ptr(&rcu_data.rcu_urgent_qs, cpu), true);
}
/*
 * When trying to report a quiescent state on behalf of some other CPU,
 * it is our responsibility to check for and handle potential overflow
 * of the rcu_node ->gp_seq counter with respect to the rcu_data counters.
 * After all, the CPU might be in deep idle state, and thus executing no
 * code whatsoever.
 */
static void rcu_gpnum_ovf(struct rcu_node *rnp, struct rcu_data *rdp)
{
	raw_lockdep_assert_held_rcu_node(rnp);
	if (ULONG_CMP_LT(rcu_seq_current(&rdp->gp_seq) + ULONG_MAX / 4,
			 rnp->gp_seq))
		WRITE_ONCE(rdp->gpwrap, true);
	if (ULONG_CMP_LT(rdp->rcu_iw_gp_seq + ULONG_MAX / 4, rnp->gp_seq))
		rdp->rcu_iw_gp_seq = rnp->gp_seq + ULONG_MAX / 4;
}
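
/*
 * Illustrative sketch (not part of the original source): ULONG_CMP_LT()
 * used above is a wrap-tolerant "less than", defined along the lines of:
 *
 *	ULONG_CMP_LT(a, b) == (ULONG_MAX / 2 < (b) - (a))
 *
 * so adding ULONG_MAX / 4 to the rcu_data counters and comparing against
 * rnp->gp_seq flags ->gpwrap well before the unsigned counter could alias
 * a stale snapshot.
 */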
/*
 * Snapshot the specified CPU's RCU_WATCHING counter so that we can later
 * credit them with an implicit quiescent state.  Return 1 if this CPU
 * is in dynticks idle mode, which is an extended quiescent state.
 */
static int rcu_watching_snap_save(struct rcu_data *rdp)
{
	/*
	 * Full ordering between remote CPU's post idle accesses and updater's
	 * accesses prior to current GP (and also the started GP sequence number)
	 * is enforced by rcu_seq_start() implicit barrier and even further by
	 * smp_mb__after_unlock_lock() barriers chained all the way throughout the
	 * rnp locking tree since rcu_gp_init() and up to the current leaf rnp
	 * locking.
	 *
	 * Ordering between remote CPU's pre idle accesses and post grace period
	 * updater's accesses is enforced by the below acquire semantic.
	 */
	rdp->watching_snap = ct_rcu_watching_cpu_acquire(rdp->cpu);
	if (rcu_watching_snap_in_eqs(rdp->watching_snap)) {
		trace_rcu_fqs(rcu_state.name, rdp->gp_seq, rdp->cpu, TPS("dti"));
		rcu_gpnum_ovf(rdp->mynode, rdp);
		return 1;
	}
	return 0;
}
/*
 * Returns positive if the specified CPU has passed through a quiescent state
 * by virtue of being in or having passed through a dynticks idle state since
 * the last call to rcu_watching_snap_save() for this same CPU, or by
 * virtue of having been offline.
 *
 * Returns negative if the specified CPU needs a force resched.
 *
 * Returns zero otherwise.
 */
static int rcu_watching_snap_recheck(struct rcu_data *rdp)
{
	unsigned long jtsq;
	int ret = 0;
	struct rcu_node *rnp = rdp->mynode;

	/*
	 * If the CPU passed through or entered a dynticks idle phase with
	 * no active irq/NMI handlers, then we can safely pretend that the CPU
	 * already acknowledged the request to pass through a quiescent
	 * state.  Either way, that CPU cannot possibly be in an RCU
	 * read-side critical section that started before the beginning
	 * of the current RCU grace period.
	 */
	if (rcu_watching_snap_stopped_since(rdp, rdp->watching_snap)) {
		trace_rcu_fqs(rcu_state.name, rdp->gp_seq, rdp->cpu, TPS("dti"));
		rcu_gpnum_ovf(rnp, rdp);
		return 1;
	}

	/*
	 * Complain if a CPU that is considered to be offline from RCU's
	 * perspective has not yet reported a quiescent state.  After all,
	 * the offline CPU should have reported a quiescent state during
	 * the CPU-offline process, or, failing that, by rcu_gp_init()
	 * if it ran concurrently with either the CPU going offline or the
	 * last task on a leaf rcu_node structure exiting its RCU read-side
	 * critical section while all CPUs corresponding to that structure
	 * are offline.  This added warning detects bugs in any of these
	 * code paths.
	 *
	 * The rcu_node structure's ->lock is held here, which excludes
	 * the relevant portions of the CPU-hotplug code, the grace-period
	 * initialization code, and the rcu_read_unlock() code paths.
	 *
	 * For more detail, please refer to the "Hotplug CPU" section
	 * of RCU's Requirements documentation.
	 */
	if (WARN_ON_ONCE(!rcu_rdp_cpu_online(rdp))) {
		struct rcu_node *rnp1;

		pr_info("%s: grp: %d-%d level: %d ->gp_seq %ld ->completedqs %ld\n",
			__func__, rnp->grplo, rnp->grphi, rnp->level,
			(long)rnp->gp_seq, (long)rnp->completedqs);
		for (rnp1 = rnp; rnp1; rnp1 = rnp1->parent)
			pr_info("%s: %d:%d ->qsmask %#lx ->qsmaskinit %#lx ->qsmaskinitnext %#lx ->rcu_gp_init_mask %#lx\n",
				__func__, rnp1->grplo, rnp1->grphi, rnp1->qsmask, rnp1->qsmaskinit, rnp1->qsmaskinitnext, rnp1->rcu_gp_init_mask);
		pr_info("%s %d: %c online: %ld(%d) offline: %ld(%d)\n",
			__func__, rdp->cpu, ".o"[rcu_rdp_cpu_online(rdp)],
			(long)rdp->rcu_onl_gp_seq, rdp->rcu_onl_gp_state,
			(long)rdp->rcu_ofl_gp_seq, rdp->rcu_ofl_gp_state);
		return 1; /* Break things loose after complaining. */
	}

	/*
	 * A CPU running for an extended time within the kernel can
	 * delay RCU grace periods: (1) At age jiffies_to_sched_qs,
	 * set .rcu_urgent_qs, (2) At age 2*jiffies_to_sched_qs, set
	 * both .rcu_need_heavy_qs and .rcu_urgent_qs.  Note that the
	 * unsynchronized assignments to the per-CPU rcu_need_heavy_qs
	 * variable are safe because the assignments are repeated if this
	 * CPU failed to pass through a quiescent state.  This code
	 * also checks .jiffies_resched in case jiffies_to_sched_qs
	 * is set way high.
	 */
	jtsq = READ_ONCE(jiffies_to_sched_qs);
	if (!READ_ONCE(rdp->rcu_need_heavy_qs) &&
	    (time_after(jiffies, rcu_state.gp_start + jtsq * 2) ||
	     time_after(jiffies, rcu_state.jiffies_resched) ||
	     rcu_state.cbovld)) {
		WRITE_ONCE(rdp->rcu_need_heavy_qs, true);
		/* Store rcu_need_heavy_qs before rcu_urgent_qs. */
		smp_store_release(&rdp->rcu_urgent_qs, true);
	} else if (time_after(jiffies, rcu_state.gp_start + jtsq)) {
		WRITE_ONCE(rdp->rcu_urgent_qs, true);
	}

	/*
	 * NO_HZ_FULL CPUs can run in-kernel without rcu_sched_clock_irq!
	 * The above code handles this, but only for straight cond_resched().
	 * And some in-kernel loops check need_resched() before calling
	 * cond_resched(), which defeats the above code for CPUs that are
	 * running in-kernel with scheduling-clock interrupts disabled.
	 * So hit them over the head with the resched_cpu() hammer!
	 */
	if (tick_nohz_full_cpu(rdp->cpu) &&
	    (time_after(jiffies, READ_ONCE(rdp->last_fqs_resched) + jtsq * 3) ||
	     rcu_state.cbovld)) {
		WRITE_ONCE(rdp->rcu_urgent_qs, true);
		WRITE_ONCE(rdp->last_fqs_resched, jiffies);
		ret = -1;
	}

	/*
	 * If more than halfway to RCU CPU stall-warning time, invoke
	 * resched_cpu() more frequently to try to loosen things up a bit.
	 * Also check to see if the CPU is getting hammered with interrupts,
	 * but only once per grace period, just to keep the IPIs down to
	 * a dull roar.
	 */
	if (time_after(jiffies, rcu_state.jiffies_resched)) {
		if (time_after(jiffies,
			       READ_ONCE(rdp->last_fqs_resched) + jtsq)) {
			WRITE_ONCE(rdp->last_fqs_resched, jiffies);
			ret = -1;
		}
		if (IS_ENABLED(CONFIG_IRQ_WORK) &&
		    !rdp->rcu_iw_pending && rdp->rcu_iw_gp_seq != rnp->gp_seq &&
		    (rnp->ffmask & rdp->grpmask)) {
			rdp->rcu_iw_pending = true;
			rdp->rcu_iw_gp_seq = rnp->gp_seq;
			irq_work_queue_on(&rdp->rcu_iw, rdp->cpu);
		}

		if (rcu_cpu_stall_cputime && rdp->snap_record.gp_seq != rdp->gp_seq) {
			int cpu = rdp->cpu;
			struct rcu_snap_record *rsrp;
			struct kernel_cpustat *kcsp;

			kcsp = &kcpustat_cpu(cpu);

			rsrp = &rdp->snap_record;
			rsrp->cputime_irq     = kcpustat_field(kcsp, CPUTIME_IRQ, cpu);
			rsrp->cputime_softirq = kcpustat_field(kcsp, CPUTIME_SOFTIRQ, cpu);
			rsrp->cputime_system  = kcpustat_field(kcsp, CPUTIME_SYSTEM, cpu);
			rsrp->nr_hardirqs = kstat_cpu_irqs_sum(rdp->cpu);
			rsrp->nr_softirqs = kstat_cpu_softirqs_sum(rdp->cpu);
			rsrp->nr_csw = nr_context_switches_cpu(rdp->cpu);
			rsrp->jiffies = jiffies;
			rsrp->gp_seq = rdp->gp_seq;
		}
	}

	return ret;
}
/* Trace-event wrapper function for trace_rcu_future_grace_period.  */
static void trace_rcu_this_gp(struct rcu_node *rnp, struct rcu_data *rdp,
			      unsigned long gp_seq_req, const char *s)
{
	trace_rcu_future_grace_period(rcu_state.name, READ_ONCE(rnp->gp_seq),
				      gp_seq_req, rnp->level,
				      rnp->grplo, rnp->grphi, s);
}
/*
 * rcu_start_this_gp - Request the start of a particular grace period
 * @rnp_start: The leaf node of the CPU from which to start.
 * @rdp: The rcu_data corresponding to the CPU from which to start.
 * @gp_seq_req: The gp_seq of the grace period to start.
 *
 * Start the specified grace period, as needed to handle newly arrived
 * callbacks.  The required future grace periods are recorded in each
 * rcu_node structure's ->gp_seq_needed field.  Returns true if there
 * is reason to awaken the grace-period kthread.
 *
 * The caller must hold the specified rcu_node structure's ->lock, which
 * is why the caller is responsible for waking the grace-period kthread.
 *
 * Returns true if the GP thread needs to be awakened else false.
 */
static bool rcu_start_this_gp(struct rcu_node *rnp_start, struct rcu_data *rdp,
			      unsigned long gp_seq_req)
{
	bool ret = false;
	struct rcu_node *rnp;

	/*
	 * Use funnel locking to either acquire the root rcu_node
	 * structure's lock or bail out if the need for this grace period
	 * has already been recorded -- or if that grace period has in
	 * fact already started.  If there is already a grace period in
	 * progress in a non-leaf node, no recording is needed because the
	 * end of the grace period will scan the leaf rcu_node structures.
	 * Note that rnp_start->lock must not be released.
	 */
	raw_lockdep_assert_held_rcu_node(rnp_start);
	trace_rcu_this_gp(rnp_start, rdp, gp_seq_req, TPS("Startleaf"));
	for (rnp = rnp_start; 1; rnp = rnp->parent) {
		if (rnp != rnp_start)
			raw_spin_lock_rcu_node(rnp);
		if (ULONG_CMP_GE(rnp->gp_seq_needed, gp_seq_req) ||
		    rcu_seq_started(&rnp->gp_seq, gp_seq_req) ||
		    (rnp != rnp_start &&
		     rcu_seq_state(rcu_seq_current(&rnp->gp_seq)))) {
			trace_rcu_this_gp(rnp, rdp, gp_seq_req,
					  TPS("Prestarted"));
			goto unlock_out;
		}
		WRITE_ONCE(rnp->gp_seq_needed, gp_seq_req);
		if (rcu_seq_state(rcu_seq_current(&rnp->gp_seq))) {
			/*
			 * We just marked the leaf or internal node, and a
			 * grace period is in progress, which means that
			 * rcu_gp_cleanup() will see the marking.  Bail to
			 * reduce contention.
			 */
			trace_rcu_this_gp(rnp_start, rdp, gp_seq_req,
					  TPS("Startedleaf"));
			goto unlock_out;
		}
		if (rnp != rnp_start && rnp->parent != NULL)
			raw_spin_unlock_rcu_node(rnp);
		if (!rnp->parent)
			break;  /* At root, and perhaps also leaf. */
	}

	/* If GP already in progress, just leave, otherwise start one. */
	if (rcu_gp_in_progress()) {
		trace_rcu_this_gp(rnp, rdp, gp_seq_req, TPS("Startedleafroot"));
		goto unlock_out;
	}
	trace_rcu_this_gp(rnp, rdp, gp_seq_req, TPS("Startedroot"));
	WRITE_ONCE(rcu_state.gp_flags, rcu_state.gp_flags | RCU_GP_FLAG_INIT);
	WRITE_ONCE(rcu_state.gp_req_activity, jiffies);
	if (!READ_ONCE(rcu_state.gp_kthread)) {
		trace_rcu_this_gp(rnp, rdp, gp_seq_req, TPS("NoGPkthread"));
		goto unlock_out;
	}
	trace_rcu_grace_period(rcu_state.name, data_race(rcu_state.gp_seq), TPS("newreq"));
	ret = true;  /* Caller must wake GP kthread. */
unlock_out:
	/* Push furthest requested GP to leaf node and rcu_data structure. */
	if (ULONG_CMP_LT(gp_seq_req, rnp->gp_seq_needed)) {
		WRITE_ONCE(rnp_start->gp_seq_needed, rnp->gp_seq_needed);
		WRITE_ONCE(rdp->gp_seq_needed, rnp->gp_seq_needed);
	}
	if (rnp != rnp_start)
		raw_spin_unlock_rcu_node(rnp);
	return ret;
}
/*
 * Clean up any old requests for the just-ended grace period.  Also return
 * whether any additional grace periods have been requested.
 */
static bool rcu_future_gp_cleanup(struct rcu_node *rnp)
{
	bool needmore;
	struct rcu_data *rdp = this_cpu_ptr(&rcu_data);

	needmore = ULONG_CMP_LT(rnp->gp_seq, rnp->gp_seq_needed);
	if (!needmore)
		rnp->gp_seq_needed = rnp->gp_seq; /* Avoid counter wrap. */
	trace_rcu_this_gp(rnp, rdp, rnp->gp_seq,
			  needmore ? TPS("CleanupMore") : TPS("Cleanup"));
	return needmore;
}
static void swake_up_one_online_ipi(void *arg)
{
	struct swait_queue_head *wqh = arg;

	swake_up_one(wqh);
}

static void swake_up_one_online(struct swait_queue_head *wqh)
{
	int cpu = get_cpu();

	/*
	 * If called from rcutree_report_cpu_starting(), wake up
	 * is dangerous that late in the CPU-down hotplug process. The
	 * scheduler might queue an ignored hrtimer. Defer the wake up
	 * to an online CPU instead.
	 */
	if (unlikely(cpu_is_offline(cpu))) {
		int target;

		target = cpumask_any_and(housekeeping_cpumask(HK_TYPE_RCU),
					 cpu_online_mask);

		smp_call_function_single(target, swake_up_one_online_ipi,
					 wqh, 0);
		put_cpu();
	} else {
		put_cpu();
		swake_up_one(wqh);
	}
}
/*
 * Awaken the grace-period kthread.  Don't do a self-awaken (unless in an
 * interrupt or softirq handler, in which case we just might immediately
 * sleep upon return, resulting in a grace-period hang), and don't bother
 * awakening when there is nothing for the grace-period kthread to do
 * (as in several CPUs raced to awaken, we lost), and finally don't try
 * to awaken a kthread that has not yet been created.  If all those checks
 * are passed, track some debug information and awaken.
 *
 * So why do the self-wakeup when in an interrupt or softirq handler
 * in the grace-period kthread's context?  Because the kthread might have
 * been interrupted just as it was going to sleep, and just after the final
 * pre-sleep check of the awaken condition.  In this case, a wakeup really
 * is required, and is therefore supplied.
 */
static void rcu_gp_kthread_wake(void)
{
	struct task_struct *t = READ_ONCE(rcu_state.gp_kthread);

	if ((current == t && !in_hardirq() && !in_serving_softirq()) ||
	    !READ_ONCE(rcu_state.gp_flags) || !t)
		return;

	WRITE_ONCE(rcu_state.gp_wake_time, jiffies);
	WRITE_ONCE(rcu_state.gp_wake_seq, READ_ONCE(rcu_state.gp_seq));
	swake_up_one_online(&rcu_state.gp_wq);
}
/*
 * If there is room, assign a ->gp_seq number to any callbacks on this
 * CPU that have not already been assigned.  Also accelerate any callbacks
 * that were previously assigned a ->gp_seq number that has since proven
 * to be too conservative, which can happen if callbacks get assigned a
 * ->gp_seq number while RCU is idle, but with reference to a non-root
 * rcu_node structure.  This function is idempotent, so it does not hurt
 * to call it repeatedly.  Returns a flag saying that we should awaken
 * the RCU grace-period kthread.
 *
 * The caller must hold rnp->lock with interrupts disabled.
 */
static bool rcu_accelerate_cbs(struct rcu_node *rnp, struct rcu_data *rdp)
{
	unsigned long gp_seq_req;
	bool ret = false;

	rcu_lockdep_assert_cblist_protected(rdp);
	raw_lockdep_assert_held_rcu_node(rnp);

	/* If no pending (not yet ready to invoke) callbacks, nothing to do. */
	if (!rcu_segcblist_pend_cbs(&rdp->cblist))
		return false;

	trace_rcu_segcb_stats(&rdp->cblist, TPS("SegCbPreAcc"));

	/*
	 * Callbacks are often registered with incomplete grace-period
	 * information.  Something about the fact that getting exact
	 * information requires acquiring a global lock...  RCU therefore
	 * makes a conservative estimate of the grace period number at which
	 * a given callback will become ready to invoke.  The following
	 * code checks this estimate and improves it when possible, thus
	 * accelerating callback invocation to an earlier grace-period
	 * number.
	 */
	gp_seq_req = rcu_seq_snap(&rcu_state.gp_seq);
	if (rcu_segcblist_accelerate(&rdp->cblist, gp_seq_req))
		ret = rcu_start_this_gp(rnp, rdp, gp_seq_req);

	/* Trace depending on how much we were able to accelerate. */
	if (rcu_segcblist_restempty(&rdp->cblist, RCU_WAIT_TAIL))
		trace_rcu_grace_period(rcu_state.name, gp_seq_req, TPS("AccWaitCB"));
	else
		trace_rcu_grace_period(rcu_state.name, gp_seq_req, TPS("AccReadyCB"));

	trace_rcu_segcb_stats(&rdp->cblist, TPS("SegCbPostAcc"));

	return ret;
}
/*
 * Similar to rcu_accelerate_cbs(), but does not require that the leaf
 * rcu_node structure's ->lock be held.  It consults the cached value
 * of ->gp_seq_needed in the rcu_data structure, and if that indicates
 * that a new grace-period request be made, invokes rcu_accelerate_cbs()
 * while holding the leaf rcu_node structure's ->lock.
 */
static void rcu_accelerate_cbs_unlocked(struct rcu_node *rnp,
					struct rcu_data *rdp)
{
	unsigned long c;
	bool needwake;

	rcu_lockdep_assert_cblist_protected(rdp);
	c = rcu_seq_snap(&rcu_state.gp_seq);
	if (!READ_ONCE(rdp->gpwrap) && ULONG_CMP_GE(rdp->gp_seq_needed, c)) {
		/* Old request still live, so mark recent callbacks. */
		(void)rcu_segcblist_accelerate(&rdp->cblist, c);
		return;
	}
	raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
	needwake = rcu_accelerate_cbs(rnp, rdp);
	raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */
	if (needwake)
		rcu_gp_kthread_wake();
}
/*
 * Move any callbacks whose grace period has completed to the
 * RCU_DONE_TAIL sublist, then compact the remaining sublists and
 * assign ->gp_seq numbers to any callbacks in the RCU_NEXT_TAIL
 * sublist.  This function is idempotent, so it does not hurt to
 * invoke it repeatedly.  As long as it is not invoked -too- often...
 * Returns true if the RCU grace-period kthread needs to be awakened.
 *
 * The caller must hold rnp->lock with interrupts disabled.
 */
static bool rcu_advance_cbs(struct rcu_node *rnp, struct rcu_data *rdp)
{
	rcu_lockdep_assert_cblist_protected(rdp);
	raw_lockdep_assert_held_rcu_node(rnp);

	/* If no pending (not yet ready to invoke) callbacks, nothing to do. */
	if (!rcu_segcblist_pend_cbs(&rdp->cblist))
		return false;

	/*
	 * Find all callbacks whose ->gp_seq numbers indicate that they
	 * are ready to invoke, and put them into the RCU_DONE_TAIL sublist.
	 */
	rcu_segcblist_advance(&rdp->cblist, rnp->gp_seq);

	/* Classify any remaining callbacks. */
	return rcu_accelerate_cbs(rnp, rdp);
}
/*
 * Move and classify callbacks, but only if doing so won't require
 * that the RCU grace-period kthread be awakened.
 */
static void __maybe_unused rcu_advance_cbs_nowake(struct rcu_node *rnp,
						  struct rcu_data *rdp)
{
	rcu_lockdep_assert_cblist_protected(rdp);
	if (!rcu_seq_state(rcu_seq_current(&rnp->gp_seq)) || !raw_spin_trylock_rcu_node(rnp))
		return;
	// The grace period cannot end while we hold the rcu_node lock.
	if (rcu_seq_state(rcu_seq_current(&rnp->gp_seq)))
		WARN_ON_ONCE(rcu_advance_cbs(rnp, rdp));
	raw_spin_unlock_rcu_node(rnp);
}
/*
 * In CONFIG_RCU_STRICT_GRACE_PERIOD=y kernels, attempt to generate a
 * quiescent state.  This is intended to be invoked when the CPU notices
 * a new grace period.
 */
static void rcu_strict_gp_check_qs(void)
{
	if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)) {
		rcu_read_lock();
		rcu_read_unlock();
	}
}
/*
 * Update CPU-local rcu_data state to record the beginnings and ends of
 * grace periods.  The caller must hold the ->lock of the leaf rcu_node
 * structure corresponding to the current CPU, and must have irqs disabled.
 * Returns true if the grace-period kthread needs to be awakened.
 */
static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp)
{
	bool ret = false;
	bool need_qs;
	const bool offloaded = rcu_rdp_is_offloaded(rdp);

	raw_lockdep_assert_held_rcu_node(rnp);

	if (rdp->gp_seq == rnp->gp_seq)
		return false; /* Nothing to do. */

	/* Handle the ends of any preceding grace periods first. */
	if (rcu_seq_completed_gp(rdp->gp_seq, rnp->gp_seq) ||
	    unlikely(READ_ONCE(rdp->gpwrap))) {
		if (!offloaded)
			ret = rcu_advance_cbs(rnp, rdp); /* Advance CBs. */
		rdp->core_needs_qs = false;
		trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("cpuend"));
	} else {
		if (!offloaded)
			ret = rcu_accelerate_cbs(rnp, rdp); /* Recent CBs. */
		if (rdp->core_needs_qs)
			rdp->core_needs_qs = !!(rnp->qsmask & rdp->grpmask);
	}

	/* Now handle the beginnings of any new-to-this-CPU grace periods. */
	if (rcu_seq_new_gp(rdp->gp_seq, rnp->gp_seq) ||
	    unlikely(READ_ONCE(rdp->gpwrap))) {
		/*
		 * If the current grace period is waiting for this CPU,
		 * set up to detect a quiescent state, otherwise don't
		 * go looking for one.
		 */
		trace_rcu_grace_period(rcu_state.name, rnp->gp_seq, TPS("cpustart"));
		need_qs = !!(rnp->qsmask & rdp->grpmask);
		rdp->cpu_no_qs.b.norm = need_qs;
		rdp->core_needs_qs = need_qs;
		zero_cpu_stall_ticks(rdp);
	}
	rdp->gp_seq = rnp->gp_seq; /* Remember new grace-period state. */
	if (ULONG_CMP_LT(rdp->gp_seq_needed, rnp->gp_seq_needed) || rdp->gpwrap)
		WRITE_ONCE(rdp->gp_seq_needed, rnp->gp_seq_needed);
	if (IS_ENABLED(CONFIG_PROVE_RCU) && READ_ONCE(rdp->gpwrap))
		WRITE_ONCE(rdp->last_sched_clock, jiffies);
	WRITE_ONCE(rdp->gpwrap, false);
	rcu_gpnum_ovf(rnp, rdp);
	return ret;
}
*rdp
)
1316 unsigned long flags
;
1318 struct rcu_node
*rnp
;
1320 local_irq_save(flags
);
1322 if ((rdp
->gp_seq
== rcu_seq_current(&rnp
->gp_seq
) &&
1323 !unlikely(READ_ONCE(rdp
->gpwrap
))) || /* w/out lock. */
1324 !raw_spin_trylock_rcu_node(rnp
)) { /* irqs already off, so later. */
1325 local_irq_restore(flags
);
1328 needwake
= __note_gp_changes(rnp
, rdp
);
1329 raw_spin_unlock_irqrestore_rcu_node(rnp
, flags
);
1330 rcu_strict_gp_check_qs();
1332 rcu_gp_kthread_wake();
static atomic_t *rcu_gp_slow_suppress;

/* Register a counter to suppress debugging grace-period delays. */
void rcu_gp_slow_register(atomic_t *rgssp)
{
	WARN_ON_ONCE(rcu_gp_slow_suppress);

	WRITE_ONCE(rcu_gp_slow_suppress, rgssp);
}
EXPORT_SYMBOL_GPL(rcu_gp_slow_register);

/* Unregister a counter, with NULL for not caring which. */
void rcu_gp_slow_unregister(atomic_t *rgssp)
{
	WARN_ON_ONCE(rgssp && rgssp != rcu_gp_slow_suppress && rcu_gp_slow_suppress != NULL);

	WRITE_ONCE(rcu_gp_slow_suppress, NULL);
}
EXPORT_SYMBOL_GPL(rcu_gp_slow_unregister);
static bool rcu_gp_slow_is_suppressed(void)
{
	atomic_t *rgssp = READ_ONCE(rcu_gp_slow_suppress);

	return rgssp && atomic_read(rgssp);
}

static void rcu_gp_slow(int delay)
{
	if (!rcu_gp_slow_is_suppressed() && delay > 0 &&
	    !(rcu_seq_ctr(rcu_state.gp_seq) % (rcu_num_nodes * PER_RCU_NODE_PERIOD * delay)))
		schedule_timeout_idle(delay);
}
static unsigned long sleep_duration;

/* Allow rcutorture to stall the grace-period kthread. */
void rcu_gp_set_torture_wait(int duration)
{
	if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST) && duration > 0)
		WRITE_ONCE(sleep_duration, duration);
}
EXPORT_SYMBOL_GPL(rcu_gp_set_torture_wait);

/* Actually implement the aforementioned wait. */
static void rcu_gp_torture_wait(void)
{
	unsigned long duration;

	if (!IS_ENABLED(CONFIG_RCU_TORTURE_TEST))
		return;
	duration = xchg(&sleep_duration, 0UL);
	if (duration > 0) {
		pr_alert("%s: Waiting %lu jiffies\n", __func__, duration);
		schedule_timeout_idle(duration);
		pr_alert("%s: Wait complete\n", __func__);
	}
}
/*
 * Handler for on_each_cpu() to invoke the target CPU's RCU core
 * processing.
 */
static void rcu_strict_gp_boundary(void *unused)
{
	invoke_rcu_core();
}
1404 static void rcu_poll_gp_seq_start(unsigned long *snap
)
1406 struct rcu_node
*rnp
= rcu_get_root();
1408 if (rcu_scheduler_active
!= RCU_SCHEDULER_INACTIVE
)
1409 raw_lockdep_assert_held_rcu_node(rnp
);
1411 // If RCU was idle, note beginning of GP.
1412 if (!rcu_seq_state(rcu_state
.gp_seq_polled
))
1413 rcu_seq_start(&rcu_state
.gp_seq_polled
);
1415 // Either way, record current state.
1416 *snap
= rcu_state
.gp_seq_polled
;
// Make the polled API aware of the end of a grace period.
static void rcu_poll_gp_seq_end(unsigned long *snap)
{
	struct rcu_node *rnp = rcu_get_root();

	if (rcu_scheduler_active != RCU_SCHEDULER_INACTIVE)
		raw_lockdep_assert_held_rcu_node(rnp);

	// If the previously noted GP is still in effect, record the
	// end of that GP.  Either way, zero counter to avoid counter-wrap
	// problems.
	if (*snap && *snap == rcu_state.gp_seq_polled) {
		rcu_seq_end(&rcu_state.gp_seq_polled);
		rcu_state.gp_seq_polled_snap = 0;
		rcu_state.gp_seq_polled_exp_snap = 0;
	} else {
		WRITE_ONCE(rcu_state.gp_seq_polled, 0);
	}
}
1440 // where caller does not hold the root rcu_node structure's lock.
1441 static void rcu_poll_gp_seq_start_unlocked(unsigned long *snap
)
1443 unsigned long flags
;
1444 struct rcu_node
*rnp
= rcu_get_root();
1446 if (rcu_init_invoked()) {
1447 if (rcu_scheduler_active
!= RCU_SCHEDULER_INACTIVE
)
1448 lockdep_assert_irqs_enabled();
1449 raw_spin_lock_irqsave_rcu_node(rnp
, flags
);
1451 rcu_poll_gp_seq_start(snap
);
1452 if (rcu_init_invoked())
1453 raw_spin_unlock_irqrestore_rcu_node(rnp
, flags
);
// Make the polled API aware of the end of a grace period, but where
// caller does not hold the root rcu_node structure's lock.
static void rcu_poll_gp_seq_end_unlocked(unsigned long *snap)
{
	unsigned long flags;
	struct rcu_node *rnp = rcu_get_root();

	if (rcu_init_invoked()) {
		if (rcu_scheduler_active != RCU_SCHEDULER_INACTIVE)
			lockdep_assert_irqs_enabled();
		raw_spin_lock_irqsave_rcu_node(rnp, flags);
	}
	rcu_poll_gp_seq_end(snap);
	if (rcu_init_invoked())
		raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
}
/*
 * There is a single llist, which is used for handling
 * synchronize_rcu() users' enqueued rcu_synchronize nodes.
 * Within this llist, there are two tail pointers:
 *
 * wait tail: Tracks the set of nodes, which need to
 *            wait for the current GP to complete.
 * done tail: Tracks the set of nodes, for which grace
 *            period has elapsed. These nodes processing
 *            will be done as part of the cleanup work
 *            execution by a kworker.
 *
 * At every grace period init, a new wait node is added
 * to the llist. This wait node is used as wait tail
 * for this new grace period. Given that there are a fixed
 * number of wait nodes, if all wait nodes are in use
 * (which can happen when kworker callback processing
 * is delayed) and an additional grace period is requested,
 * it means that the system is slow in processing callbacks.
 *
 * TODO: If slow processing is detected, a first node
 * in the llist should be used as a wait-tail for this
 * grace period, therefore users which should wait due
 * to a slow process are handled by _this_ grace period
 * and not next.
 *
 * Below is an illustration of how the done and wait
 * tail pointers move from one set of rcu_synchronize nodes
 * to the other, as grace periods start and finish and
 * nodes are processed by kworker.
 *
 *
 * a. Initial llist callbacks list:
 *
 * +----------+           +--------+          +-------+
 * |          |           |        |          |       |
 * |   head   |---------> |   cb2  |--------->| cb1   |
 * |          |           |        |          |       |
 * +----------+           +--------+          +-------+
 *
 *
 * b. New GP1 Start:
 *
 *                    WAIT TAIL
 *                      |
 *                      v
 * +----------+     +--------+      +--------+        +-------+
 * |          |     |        |      |        |        |       |
 * |   head   ------> wait   |------>   cb2  |------> |  cb1  |
 * |          |     | head1  |      |        |        |       |
 * +----------+     +--------+      +--------+        +-------+
 *
 *
 * c. GP1 completion:
 *
 * WAIT_TAIL == DONE_TAIL
 *
 *                   DONE TAIL
 *                     |
 *                     v
 * +----------+     +--------+      +--------+        +-------+
 * |          |     |        |      |        |        |       |
 * |   head   ------> wait   |------>   cb2  |------> |  cb1  |
 * |          |     | head1  |      |        |        |       |
 * +----------+     +--------+      +--------+        +-------+
 *
 *
 * d. New callbacks and GP2 start:
 *
 *                    WAIT TAIL                          DONE TAIL
 *                      |                                   |
 *                      v                                   v
 * +----------+     +------+    +------+    +------+    +-----+    +-----+    +-----+
 * |          |     |      |    |      |    |      |    |     |    |     |    |     |
 * |   head   ------> wait |--->|  cb4 |--->| cb3  |--->|wait |--->| cb2 |--->| cb1 |
 * |          |     | head2|    |      |    |      |    |head1|    |     |    |     |
 * +----------+     +------+    +------+    +------+    +-----+    +-----+    +-----+
 *
 *
 * e. GP2 completion:
 *
 * WAIT_TAIL == DONE_TAIL
 *
 *                   DONE TAIL
 *                      |
 *                      v
 * +----------+     +------+    +------+    +------+    +-----+    +-----+    +-----+
 * |          |     |      |    |      |    |      |    |     |    |     |    |     |
 * |   head   ------> wait |--->|  cb4 |--->| cb3  |--->|wait |--->| cb2 |--->| cb1 |
 * |          |     | head2|    |      |    |      |    |head1|    |     |    |     |
 * +----------+     +------+    +------+    +------+    +-----+    +-----+    +-----+
 *
 *
 * While the llist state transitions from d to e, a kworker
 * can start executing rcu_sr_normal_gp_cleanup_work() and
 * can observe either the old done tail (@c) or the new
 * done tail (@e). So, done tail updates and reads need
 * to use the rel-acq semantics. If the concurrent kworker
 * observes the old done tail, the newly queued work
 * execution will process the updated done tail. If the
 * concurrent kworker observes the new done tail, then
 * the newly queued work will skip processing the done
 * tail, as workqueue semantics guarantees that the new
 * work is executed only after the previous one completes.
 *
 * f. kworker callbacks processing complete:
 *
 *                   DONE TAIL
 *                     |
 *                     v
 * +----------+     +--------+
 * |          |     |        |
 * |   head   ------> wait   |
 * |          |     | head1  |
 * +----------+     +--------+
 *
 */
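
/*
 * Illustrative sketch (not part of the original source): the list above is
 * a standard lock-free llist, so producers and consumers follow the usual
 * pattern, roughly:
 *
 *	// producer (a synchronize_rcu() caller queueing its node):
 *	llist_add((struct llist_node *)&rs->head, &rcu_state.srs_next);
 *
 *	// consumer (GP kthread or cleanup kworker):
 *	llist_for_each_safe(rcu, next, head)
 *		process(rcu);
 *
 * Only the wait/done tail bookkeeping described above is RCU-specific; the
 * process() helper here is hypothetical.
 */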
static bool rcu_sr_is_wait_head(struct llist_node *node)
{
	return &(rcu_state.srs_wait_nodes)[0].node <= node &&
		node <= &(rcu_state.srs_wait_nodes)[SR_NORMAL_GP_WAIT_HEAD_MAX - 1].node;
}
static struct llist_node *rcu_sr_get_wait_head(void)
{
	struct sr_wait_node *sr_wn;
	int i;

	for (i = 0; i < SR_NORMAL_GP_WAIT_HEAD_MAX; i++) {
		sr_wn = &(rcu_state.srs_wait_nodes)[i];

		if (!atomic_cmpxchg_acquire(&sr_wn->inuse, 0, 1))
			return &sr_wn->node;
	}

	return NULL;
}
static void rcu_sr_put_wait_head(struct llist_node *node)
{
	struct sr_wait_node *sr_wn = container_of(node, struct sr_wait_node, node);

	atomic_set_release(&sr_wn->inuse, 0);
}
/* Disabled by default. */
static int rcu_normal_wake_from_gp;
module_param(rcu_normal_wake_from_gp, int, 0644);
static struct workqueue_struct *sync_wq;
static void rcu_sr_normal_complete(struct llist_node *node)
{
	struct rcu_synchronize *rs = container_of(
		(struct rcu_head *) node, struct rcu_synchronize, head);
	unsigned long oldstate = (unsigned long) rs->head.func;

	WARN_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) &&
		!poll_state_synchronize_rcu(oldstate),
		"A full grace period is not passed yet: %lu",
		rcu_seq_diff(get_state_synchronize_rcu(), oldstate));

	/* Finally. */
	complete(&rs->completion);
}
static void rcu_sr_normal_gp_cleanup_work(struct work_struct *work)
{
	struct llist_node *done, *rcu, *next, *head;

	/*
	 * This work execution can potentially execute
	 * while a new done tail is being updated by
	 * grace period kthread in rcu_sr_normal_gp_cleanup().
	 * So, read and updates of done tail need to
	 * follow acq-rel semantics.
	 *
	 * Given that wq semantics guarantees that a single work
	 * cannot execute concurrently by multiple kworkers,
	 * the done tail list manipulations are protected here.
	 */
	done = smp_load_acquire(&rcu_state.srs_done_tail);
	if (WARN_ON_ONCE(!done))
		return;

	WARN_ON_ONCE(!rcu_sr_is_wait_head(done));
	head = done->next;
	done->next = NULL;

	/*
	 * The dummy node, which is pointed to by the
	 * done tail which is acq-read above is not removed
	 * here. This allows lockless additions of new
	 * rcu_synchronize nodes in rcu_sr_normal_add_req(),
	 * while the cleanup work executes. The dummy
	 * node is removed, in next round of cleanup
	 * work execution.
	 */
	llist_for_each_safe(rcu, next, head) {
		if (!rcu_sr_is_wait_head(rcu)) {
			rcu_sr_normal_complete(rcu);
			continue;
		}

		rcu_sr_put_wait_head(rcu);
	}

	/* Order list manipulations with atomic access. */
	atomic_dec_return_release(&rcu_state.srs_cleanups_pending);
}
/*
 * Helper function for rcu_gp_cleanup().
 */
static void rcu_sr_normal_gp_cleanup(void)
{
	struct llist_node *wait_tail, *next = NULL, *rcu = NULL;
	int done = 0;

	wait_tail = rcu_state.srs_wait_tail;
	if (wait_tail == NULL)
		return;

	rcu_state.srs_wait_tail = NULL;
	ASSERT_EXCLUSIVE_WRITER(rcu_state.srs_wait_tail);
	WARN_ON_ONCE(!rcu_sr_is_wait_head(wait_tail));

	/*
	 * Process (a) and (d) cases. See the illustration above.
	 */
	llist_for_each_safe(rcu, next, wait_tail->next) {
		if (rcu_sr_is_wait_head(rcu))
			break;

		rcu_sr_normal_complete(rcu);
		// This could be the last node, so keep wait_tail->next current.
		wait_tail->next = next;

		if (++done == SR_MAX_USERS_WAKE_FROM_GP)
			break;
	}

	/*
	 * Fast path: no more users to process except putting the second-to-last
	 * wait head, and only if there are no in-flight workers. If workers are
	 * in flight, they will remove the last wait head themselves.
	 *
	 * Note that the ACQUIRE orders atomic access with list manipulation.
	 */
	if (wait_tail->next && wait_tail->next->next == NULL &&
	    rcu_sr_is_wait_head(wait_tail->next) &&
	    !atomic_read_acquire(&rcu_state.srs_cleanups_pending)) {
		rcu_sr_put_wait_head(wait_tail->next);
		wait_tail->next = NULL;
	}

	/* Concurrent sr_normal_gp_cleanup work might observe this update. */
	ASSERT_EXCLUSIVE_WRITER(rcu_state.srs_done_tail);
	smp_store_release(&rcu_state.srs_done_tail, wait_tail);

	/*
	 * Schedule a work item to perform final processing of outstanding
	 * users (if any are left) and to release the wait-heads added by
	 * rcu_sr_normal_gp_init().
	 */
	if (wait_tail->next) {
		atomic_inc(&rcu_state.srs_cleanups_pending);
		if (!queue_work(sync_wq, &rcu_state.srs_cleanup_work))
			atomic_dec(&rcu_state.srs_cleanups_pending);
	}
}
/*
 * Helper function for rcu_gp_init().
 */
static bool rcu_sr_normal_gp_init(void)
{
	struct llist_node *first;
	struct llist_node *wait_head;
	bool start_new_poll = false;

	first = READ_ONCE(rcu_state.srs_next.first);
	if (!first || rcu_sr_is_wait_head(first))
		return start_new_poll;

	wait_head = rcu_sr_get_wait_head();
	if (!wait_head) {
		// Kick another GP to retry.
		start_new_poll = true;
		return start_new_poll;
	}

	/* Inject a wait-dummy-node. */
	llist_add(wait_head, &rcu_state.srs_next);

	/*
	 * The waiting list of rcu_synchronize nodes should be empty at this
	 * step, since the GP kthread, via rcu_gp_init() -> gp_cleanup(),
	 * rolls it over. If it is not, that is a bug, so warn the user.
	 */
	WARN_ON_ONCE(rcu_state.srs_wait_tail != NULL);
	rcu_state.srs_wait_tail = wait_head;
	ASSERT_EXCLUSIVE_WRITER(rcu_state.srs_wait_tail);

	return start_new_poll;
}

static void rcu_sr_normal_add_req(struct rcu_synchronize *rs)
{
	llist_add((struct llist_node *) &rs->head, &rcu_state.srs_next);
}
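/*
 * Roughly, a synchronize_rcu() caller that takes this path queues itself
 * and then sleeps on the completion; a simplified sketch (not the exact
 * code elsewhere in this file) looks like:
 *
 *	struct rcu_synchronize rs;
 *
 *	init_rcu_head_on_stack(&rs.head);
 *	init_completion(&rs.completion);
 *
 *	// Stash the GP cookie in ->func so rcu_sr_normal_complete() can
 *	// later verify that a full grace period has really elapsed.
 *	rs.head.func = (rcu_callback_t) get_state_synchronize_rcu();
 *	rcu_sr_normal_add_req(&rs);
 *
 *	// Kick a grace period if none is in flight, then wait.
 *	(void) start_poll_synchronize_rcu();
 *	wait_for_completion(&rs.completion);
 *	destroy_rcu_head_on_stack(&rs.head);
 */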
1794 * Initialize a new grace period. Return false if no grace period required.
1796 static noinline_for_stack
bool rcu_gp_init(void)
1798 unsigned long flags
;
1799 unsigned long oldmask
;
1801 struct rcu_data
*rdp
;
1802 struct rcu_node
*rnp
= rcu_get_root();
1803 bool start_new_poll
;
1805 WRITE_ONCE(rcu_state
.gp_activity
, jiffies
);
1806 raw_spin_lock_irq_rcu_node(rnp
);
1807 if (!rcu_state
.gp_flags
) {
1808 /* Spurious wakeup, tell caller to go back to sleep. */
1809 raw_spin_unlock_irq_rcu_node(rnp
);
1812 WRITE_ONCE(rcu_state
.gp_flags
, 0); /* Clear all flags: New GP. */
1814 if (WARN_ON_ONCE(rcu_gp_in_progress())) {
1816 * Grace period already in progress, don't start another.
1817 * Not supposed to be able to happen.
1819 raw_spin_unlock_irq_rcu_node(rnp
);
1823 /* Advance to a new grace period and initialize state. */
1824 record_gp_stall_check_time();
1825 /* Record GP times before starting GP, hence rcu_seq_start(). */
1826 rcu_seq_start(&rcu_state
.gp_seq
);
1827 ASSERT_EXCLUSIVE_WRITER(rcu_state
.gp_seq
);
1828 start_new_poll
= rcu_sr_normal_gp_init();
1829 trace_rcu_grace_period(rcu_state
.name
, rcu_state
.gp_seq
, TPS("start"));
1830 rcu_poll_gp_seq_start(&rcu_state
.gp_seq_polled_snap
);
1831 raw_spin_unlock_irq_rcu_node(rnp
);
	/*
	 * "start_new_poll" is set to true only when this GP is not able
	 * to handle anything and there are outstanding users. That happens
	 * when rcu_sr_normal_gp_init() was not able to insert a dummy
	 * separator into the llist because no dummy-nodes were left.
	 *
	 * The number of dummy-nodes is fixed, so it is possible to run out
	 * of them, in which case a new poll request is started to retry.
	 * This is rare and means that the system is processing callbacks
	 * slowly.
	 */
	if (start_new_poll)
		(void) start_poll_synchronize_rcu();
1847 * Apply per-leaf buffered online and offline operations to
1848 * the rcu_node tree. Note that this new grace period need not
1849 * wait for subsequent online CPUs, and that RCU hooks in the CPU
1850 * offlining path, when combined with checks in this function,
1851 * will handle CPUs that are currently going offline or that will
1852 * go offline later. Please also refer to "Hotplug CPU" section
1853 * of RCU's Requirements documentation.
1855 WRITE_ONCE(rcu_state
.gp_state
, RCU_GP_ONOFF
);
1856 /* Exclude CPU hotplug operations. */
1857 rcu_for_each_leaf_node(rnp
) {
1858 local_irq_disable();
1859 arch_spin_lock(&rcu_state
.ofl_lock
);
1860 raw_spin_lock_rcu_node(rnp
);
1861 if (rnp
->qsmaskinit
== rnp
->qsmaskinitnext
&&
1862 !rnp
->wait_blkd_tasks
) {
1863 /* Nothing to do on this leaf rcu_node structure. */
1864 raw_spin_unlock_rcu_node(rnp
);
1865 arch_spin_unlock(&rcu_state
.ofl_lock
);
1870 /* Record old state, apply changes to ->qsmaskinit field. */
1871 oldmask
= rnp
->qsmaskinit
;
1872 rnp
->qsmaskinit
= rnp
->qsmaskinitnext
;
1874 /* If zero-ness of ->qsmaskinit changed, propagate up tree. */
1875 if (!oldmask
!= !rnp
->qsmaskinit
) {
1876 if (!oldmask
) { /* First online CPU for rcu_node. */
1877 if (!rnp
->wait_blkd_tasks
) /* Ever offline? */
1878 rcu_init_new_rnp(rnp
);
1879 } else if (rcu_preempt_has_tasks(rnp
)) {
1880 rnp
->wait_blkd_tasks
= true; /* blocked tasks */
1881 } else { /* Last offline CPU and can propagate. */
1882 rcu_cleanup_dead_rnp(rnp
);
1887 * If all waited-on tasks from prior grace period are
1888 * done, and if all this rcu_node structure's CPUs are
1889 * still offline, propagate up the rcu_node tree and
1890 * clear ->wait_blkd_tasks. Otherwise, if one of this
1891 * rcu_node structure's CPUs has since come back online,
1892 * simply clear ->wait_blkd_tasks.
1894 if (rnp
->wait_blkd_tasks
&&
1895 (!rcu_preempt_has_tasks(rnp
) || rnp
->qsmaskinit
)) {
1896 rnp
->wait_blkd_tasks
= false;
1897 if (!rnp
->qsmaskinit
)
1898 rcu_cleanup_dead_rnp(rnp
);
1901 raw_spin_unlock_rcu_node(rnp
);
1902 arch_spin_unlock(&rcu_state
.ofl_lock
);
1905 rcu_gp_slow(gp_preinit_delay
); /* Races with CPU hotplug. */
1908 * Set the quiescent-state-needed bits in all the rcu_node
1909 * structures for all currently online CPUs in breadth-first
1910 * order, starting from the root rcu_node structure, relying on the
1911 * layout of the tree within the rcu_state.node[] array. Note that
1912 * other CPUs will access only the leaves of the hierarchy, thus
1913 * seeing that no grace period is in progress, at least until the
1914 * corresponding leaf node has been initialized.
1916 * The grace period cannot complete until the initialization
1917 * process finishes, because this kthread handles both.
1919 WRITE_ONCE(rcu_state
.gp_state
, RCU_GP_INIT
);
1920 rcu_for_each_node_breadth_first(rnp
) {
1921 rcu_gp_slow(gp_init_delay
);
1922 raw_spin_lock_irqsave_rcu_node(rnp
, flags
);
1923 rdp
= this_cpu_ptr(&rcu_data
);
1924 rcu_preempt_check_blocked_tasks(rnp
);
1925 rnp
->qsmask
= rnp
->qsmaskinit
;
1926 WRITE_ONCE(rnp
->gp_seq
, rcu_state
.gp_seq
);
1927 if (rnp
== rdp
->mynode
)
1928 (void)__note_gp_changes(rnp
, rdp
);
1929 rcu_preempt_boost_start_gp(rnp
);
1930 trace_rcu_grace_period_init(rcu_state
.name
, rnp
->gp_seq
,
1931 rnp
->level
, rnp
->grplo
,
1932 rnp
->grphi
, rnp
->qsmask
);
1933 /* Quiescent states for tasks on any now-offline CPUs. */
1934 mask
= rnp
->qsmask
& ~rnp
->qsmaskinitnext
;
1935 rnp
->rcu_gp_init_mask
= mask
;
1936 if ((mask
|| rnp
->wait_blkd_tasks
) && rcu_is_leaf_node(rnp
))
1937 rcu_report_qs_rnp(mask
, rnp
, rnp
->gp_seq
, flags
);
1939 raw_spin_unlock_irq_rcu_node(rnp
);
1940 cond_resched_tasks_rcu_qs();
1941 WRITE_ONCE(rcu_state
.gp_activity
, jiffies
);
1944 // If strict, make all CPUs aware of new grace period.
1945 if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD
))
1946 on_each_cpu(rcu_strict_gp_boundary
, NULL
, 0);
1952 * Helper function for swait_event_idle_exclusive() wakeup at force-quiescent-state
1955 static bool rcu_gp_fqs_check_wake(int *gfp
)
1957 struct rcu_node
*rnp
= rcu_get_root();
1959 // If under overload conditions, force an immediate FQS scan.
1960 if (*gfp
& RCU_GP_FLAG_OVLD
)
1963 // Someone like call_rcu() requested a force-quiescent-state scan.
1964 *gfp
= READ_ONCE(rcu_state
.gp_flags
);
1965 if (*gfp
& RCU_GP_FLAG_FQS
)
1968 // The current grace period has completed.
1969 if (!READ_ONCE(rnp
->qsmask
) && !rcu_preempt_blocked_readers_cgp(rnp
))
1976 * Do one round of quiescent-state forcing.
1978 static void rcu_gp_fqs(bool first_time
)
1980 int nr_fqs
= READ_ONCE(rcu_state
.nr_fqs_jiffies_stall
);
1981 struct rcu_node
*rnp
= rcu_get_root();
1983 WRITE_ONCE(rcu_state
.gp_activity
, jiffies
);
1984 WRITE_ONCE(rcu_state
.n_force_qs
, rcu_state
.n_force_qs
+ 1);
1986 WARN_ON_ONCE(nr_fqs
> 3);
1987 /* Only countdown nr_fqs for stall purposes if jiffies moves. */
1990 WRITE_ONCE(rcu_state
.jiffies_stall
,
1991 jiffies
+ rcu_jiffies_till_stall_check());
1993 WRITE_ONCE(rcu_state
.nr_fqs_jiffies_stall
, --nr_fqs
);
1997 /* Collect dyntick-idle snapshots. */
1998 force_qs_rnp(rcu_watching_snap_save
);
2000 /* Handle dyntick-idle and offline CPUs. */
2001 force_qs_rnp(rcu_watching_snap_recheck
);
2003 /* Clear flag to prevent immediate re-entry. */
2004 if (READ_ONCE(rcu_state
.gp_flags
) & RCU_GP_FLAG_FQS
) {
2005 raw_spin_lock_irq_rcu_node(rnp
);
2006 WRITE_ONCE(rcu_state
.gp_flags
, rcu_state
.gp_flags
& ~RCU_GP_FLAG_FQS
);
2007 raw_spin_unlock_irq_rcu_node(rnp
);
2012 * Loop doing repeated quiescent-state forcing until the grace period ends.
2014 static noinline_for_stack
void rcu_gp_fqs_loop(void)
2016 bool first_gp_fqs
= true;
2020 struct rcu_node
*rnp
= rcu_get_root();
2022 j
= READ_ONCE(jiffies_till_first_fqs
);
2023 if (rcu_state
.cbovld
)
2024 gf
= RCU_GP_FLAG_OVLD
;
2027 if (rcu_state
.cbovld
) {
2032 if (!ret
|| time_before(jiffies
+ j
, rcu_state
.jiffies_force_qs
)) {
2033 WRITE_ONCE(rcu_state
.jiffies_force_qs
, jiffies
+ j
);
2035 * jiffies_force_qs before RCU_GP_WAIT_FQS state
2036 * update; required for stall checks.
2039 WRITE_ONCE(rcu_state
.jiffies_kick_kthreads
,
2040 jiffies
+ (j
? 3 * j
: 2));
2042 trace_rcu_grace_period(rcu_state
.name
, rcu_state
.gp_seq
,
2044 WRITE_ONCE(rcu_state
.gp_state
, RCU_GP_WAIT_FQS
);
2045 (void)swait_event_idle_timeout_exclusive(rcu_state
.gp_wq
,
2046 rcu_gp_fqs_check_wake(&gf
), j
);
2047 rcu_gp_torture_wait();
2048 WRITE_ONCE(rcu_state
.gp_state
, RCU_GP_DOING_FQS
);
2049 /* Locking provides needed memory barriers. */
		/*
		 * If the root rcu_node structure indicates that the grace
		 * period has ended, exit the loop. The
		 * rcu_preempt_blocked_readers_cgp(rnp) check is required only
		 * for single-node rcu_node trees, because readers blocking the
		 * current grace period are queued only on leaf rcu_node
		 * structures. For multi-node trees, checking the root node's
		 * ->qsmask suffices, because a given root node's ->qsmask bit
		 * is cleared only when all CPUs and tasks from the
		 * corresponding leaf nodes have passed through their quiescent
		 * state.
		 */
		if (!READ_ONCE(rnp->qsmask) &&
		    !rcu_preempt_blocked_readers_cgp(rnp))
			break;
2062 /* If time for quiescent-state forcing, do it. */
2063 if (!time_after(rcu_state
.jiffies_force_qs
, jiffies
) ||
2064 (gf
& (RCU_GP_FLAG_FQS
| RCU_GP_FLAG_OVLD
))) {
2065 trace_rcu_grace_period(rcu_state
.name
, rcu_state
.gp_seq
,
2067 rcu_gp_fqs(first_gp_fqs
);
2070 first_gp_fqs
= false;
2071 gf
= rcu_state
.cbovld
? RCU_GP_FLAG_OVLD
: 0;
2073 trace_rcu_grace_period(rcu_state
.name
, rcu_state
.gp_seq
,
2075 cond_resched_tasks_rcu_qs();
2076 WRITE_ONCE(rcu_state
.gp_activity
, jiffies
);
2077 ret
= 0; /* Force full wait till next FQS. */
2078 j
= READ_ONCE(jiffies_till_next_fqs
);
2080 /* Deal with stray signal. */
2081 cond_resched_tasks_rcu_qs();
2082 WRITE_ONCE(rcu_state
.gp_activity
, jiffies
);
2083 WARN_ON(signal_pending(current
));
2084 trace_rcu_grace_period(rcu_state
.name
, rcu_state
.gp_seq
,
2086 ret
= 1; /* Keep old FQS timing. */
2088 if (time_after(jiffies
, rcu_state
.jiffies_force_qs
))
2091 j
= rcu_state
.jiffies_force_qs
- j
;
2098 * Clean up after the old grace period.
2100 static noinline
void rcu_gp_cleanup(void)
2103 bool needgp
= false;
2104 unsigned long gp_duration
;
2105 unsigned long new_gp_seq
;
2107 struct rcu_data
*rdp
;
2108 struct rcu_node
*rnp
= rcu_get_root();
2109 struct swait_queue_head
*sq
;
2111 WRITE_ONCE(rcu_state
.gp_activity
, jiffies
);
2112 raw_spin_lock_irq_rcu_node(rnp
);
2113 rcu_state
.gp_end
= jiffies
;
2114 gp_duration
= rcu_state
.gp_end
- rcu_state
.gp_start
;
2115 if (gp_duration
> rcu_state
.gp_max
)
2116 rcu_state
.gp_max
= gp_duration
;
2119 * We know the grace period is complete, but to everyone else
2120 * it appears to still be ongoing. But it is also the case
2121 * that to everyone else it looks like there is nothing that
2122 * they can do to advance the grace period. It is therefore
2123 * safe for us to drop the lock in order to mark the grace
2124 * period as completed in all of the rcu_node structures.
2126 rcu_poll_gp_seq_end(&rcu_state
.gp_seq_polled_snap
);
2127 raw_spin_unlock_irq_rcu_node(rnp
);
2130 * Propagate new ->gp_seq value to rcu_node structures so that
2131 * other CPUs don't have to wait until the start of the next grace
2132 * period to process their callbacks. This also avoids some nasty
2133 * RCU grace-period initialization races by forcing the end of
2134 * the current grace period to be completely recorded in all of
2135 * the rcu_node structures before the beginning of the next grace
2136 * period is recorded in any of the rcu_node structures.
2138 new_gp_seq
= rcu_state
.gp_seq
;
2139 rcu_seq_end(&new_gp_seq
);
2140 rcu_for_each_node_breadth_first(rnp
) {
2141 raw_spin_lock_irq_rcu_node(rnp
);
2142 if (WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp
)))
2143 dump_blkd_tasks(rnp
, 10);
2144 WARN_ON_ONCE(rnp
->qsmask
);
2145 WRITE_ONCE(rnp
->gp_seq
, new_gp_seq
);
2147 smp_mb(); // Order against failing poll_state_synchronize_rcu_full().
2148 rdp
= this_cpu_ptr(&rcu_data
);
2149 if (rnp
== rdp
->mynode
)
2150 needgp
= __note_gp_changes(rnp
, rdp
) || needgp
;
2151 /* smp_mb() provided by prior unlock-lock pair. */
2152 needgp
= rcu_future_gp_cleanup(rnp
) || needgp
;
2153 // Reset overload indication for CPUs no longer overloaded
2154 if (rcu_is_leaf_node(rnp
))
2155 for_each_leaf_node_cpu_mask(rnp
, cpu
, rnp
->cbovldmask
) {
2156 rdp
= per_cpu_ptr(&rcu_data
, cpu
);
2157 check_cb_ovld_locked(rdp
, rnp
);
2159 sq
= rcu_nocb_gp_get(rnp
);
2160 raw_spin_unlock_irq_rcu_node(rnp
);
2161 rcu_nocb_gp_cleanup(sq
);
2162 cond_resched_tasks_rcu_qs();
2163 WRITE_ONCE(rcu_state
.gp_activity
, jiffies
);
2164 rcu_gp_slow(gp_cleanup_delay
);
2166 rnp
= rcu_get_root();
2167 raw_spin_lock_irq_rcu_node(rnp
); /* GP before ->gp_seq update. */
2169 /* Declare grace period done, trace first to use old GP number. */
2170 trace_rcu_grace_period(rcu_state
.name
, rcu_state
.gp_seq
, TPS("end"));
2171 rcu_seq_end(&rcu_state
.gp_seq
);
2172 ASSERT_EXCLUSIVE_WRITER(rcu_state
.gp_seq
);
2173 WRITE_ONCE(rcu_state
.gp_state
, RCU_GP_IDLE
);
2174 /* Check for GP requests since above loop. */
2175 rdp
= this_cpu_ptr(&rcu_data
);
2176 if (!needgp
&& ULONG_CMP_LT(rnp
->gp_seq
, rnp
->gp_seq_needed
)) {
2177 trace_rcu_this_gp(rnp
, rdp
, rnp
->gp_seq_needed
,
2178 TPS("CleanupMore"));
2181 /* Advance CBs to reduce false positives below. */
2182 offloaded
= rcu_rdp_is_offloaded(rdp
);
2183 if ((offloaded
|| !rcu_accelerate_cbs(rnp
, rdp
)) && needgp
) {
		// We get here if a grace period was needed ("needgp")
		// and the above call to rcu_accelerate_cbs() did not set
		// the RCU_GP_FLAG_INIT bit in ->gp_flags (which records
		// the need for another grace period). The purpose
		// of the "offloaded" check is to avoid invoking
		// rcu_accelerate_cbs() on an offloaded CPU because we do not
		// hold the ->nocb_lock needed to safely access an offloaded
		// ->cblist. We do not want to acquire that lock because
		// it can be heavily contended during callback floods.
2195 WRITE_ONCE(rcu_state
.gp_flags
, RCU_GP_FLAG_INIT
);
2196 WRITE_ONCE(rcu_state
.gp_req_activity
, jiffies
);
2197 trace_rcu_grace_period(rcu_state
.name
, rcu_state
.gp_seq
, TPS("newreq"));
2200 // We get here either if there is no need for an
2201 // additional grace period or if rcu_accelerate_cbs() has
2202 // already set the RCU_GP_FLAG_INIT bit in ->gp_flags.
2203 // So all we need to do is to clear all of the other
2206 WRITE_ONCE(rcu_state
.gp_flags
, rcu_state
.gp_flags
& RCU_GP_FLAG_INIT
);
2208 raw_spin_unlock_irq_rcu_node(rnp
);
2210 // Make synchronize_rcu() users aware of the end of old grace period.
2211 rcu_sr_normal_gp_cleanup();
2213 // If strict, make all CPUs aware of the end of the old grace period.
2214 if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD
))
2215 on_each_cpu(rcu_strict_gp_boundary
, NULL
, 0);
2219 * Body of kthread that handles grace periods.
2221 static int __noreturn
rcu_gp_kthread(void *unused
)
2223 rcu_bind_gp_kthread();
2226 /* Handle grace-period start. */
2228 trace_rcu_grace_period(rcu_state
.name
, rcu_state
.gp_seq
,
2230 WRITE_ONCE(rcu_state
.gp_state
, RCU_GP_WAIT_GPS
);
2231 swait_event_idle_exclusive(rcu_state
.gp_wq
,
2232 READ_ONCE(rcu_state
.gp_flags
) &
2234 rcu_gp_torture_wait();
2235 WRITE_ONCE(rcu_state
.gp_state
, RCU_GP_DONE_GPS
);
2236 /* Locking provides needed memory barrier. */
2239 cond_resched_tasks_rcu_qs();
2240 WRITE_ONCE(rcu_state
.gp_activity
, jiffies
);
2241 WARN_ON(signal_pending(current
));
2242 trace_rcu_grace_period(rcu_state
.name
, rcu_state
.gp_seq
,
2246 /* Handle quiescent-state forcing. */
2249 /* Handle grace-period end. */
2250 WRITE_ONCE(rcu_state
.gp_state
, RCU_GP_CLEANUP
);
2252 WRITE_ONCE(rcu_state
.gp_state
, RCU_GP_CLEANED
);
2257 * Report a full set of quiescent states to the rcu_state data structure.
2258 * Invoke rcu_gp_kthread_wake() to awaken the grace-period kthread if
2259 * another grace period is required. Whether we wake the grace-period
2260 * kthread or it awakens itself for the next round of quiescent-state
2261 * forcing, that kthread will clean up after the just-completed grace
2262 * period. Note that the caller must hold rnp->lock, which is released
2265 static void rcu_report_qs_rsp(unsigned long flags
)
2266 __releases(rcu_get_root()->lock
)
2268 raw_lockdep_assert_held_rcu_node(rcu_get_root());
2269 WARN_ON_ONCE(!rcu_gp_in_progress());
2270 WRITE_ONCE(rcu_state
.gp_flags
, rcu_state
.gp_flags
| RCU_GP_FLAG_FQS
);
2271 raw_spin_unlock_irqrestore_rcu_node(rcu_get_root(), flags
);
2272 rcu_gp_kthread_wake();
2276 * Similar to rcu_report_qs_rdp(), for which it is a helper function.
2277 * Allows quiescent states for a group of CPUs to be reported at one go
2278 * to the specified rcu_node structure, though all the CPUs in the group
2279 * must be represented by the same rcu_node structure (which need not be a
2280 * leaf rcu_node structure, though it often will be). The gps parameter
2281 * is the grace-period snapshot, which means that the quiescent states
2282 * are valid only if rnp->gp_seq is equal to gps. That structure's lock
2283 * must be held upon entry, and it is released before return.
2285 * As a special case, if mask is zero, the bit-already-cleared check is
2286 * disabled. This allows propagating quiescent state due to resumed tasks
2287 * during grace-period initialization.
2289 static void rcu_report_qs_rnp(unsigned long mask
, struct rcu_node
*rnp
,
2290 unsigned long gps
, unsigned long flags
)
2291 __releases(rnp
->lock
)
2293 unsigned long oldmask
= 0;
2294 struct rcu_node
*rnp_c
;
2296 raw_lockdep_assert_held_rcu_node(rnp
);
2298 /* Walk up the rcu_node hierarchy. */
2300 if ((!(rnp
->qsmask
& mask
) && mask
) || rnp
->gp_seq
!= gps
) {
2303 * Our bit has already been cleared, or the
2304 * relevant grace period is already over, so done.
2306 raw_spin_unlock_irqrestore_rcu_node(rnp
, flags
);
2309 WARN_ON_ONCE(oldmask
); /* Any child must be all zeroed! */
2310 WARN_ON_ONCE(!rcu_is_leaf_node(rnp
) &&
2311 rcu_preempt_blocked_readers_cgp(rnp
));
2312 WRITE_ONCE(rnp
->qsmask
, rnp
->qsmask
& ~mask
);
2313 trace_rcu_quiescent_state_report(rcu_state
.name
, rnp
->gp_seq
,
2314 mask
, rnp
->qsmask
, rnp
->level
,
2315 rnp
->grplo
, rnp
->grphi
,
2317 if (rnp
->qsmask
!= 0 || rcu_preempt_blocked_readers_cgp(rnp
)) {
2319 /* Other bits still set at this level, so done. */
2320 raw_spin_unlock_irqrestore_rcu_node(rnp
, flags
);
2323 rnp
->completedqs
= rnp
->gp_seq
;
2324 mask
= rnp
->grpmask
;
2325 if (rnp
->parent
== NULL
) {
2327 /* No more levels. Exit loop holding root lock. */
2331 raw_spin_unlock_irqrestore_rcu_node(rnp
, flags
);
2334 raw_spin_lock_irqsave_rcu_node(rnp
, flags
);
2335 oldmask
= READ_ONCE(rnp_c
->qsmask
);
2339 * Get here if we are the last CPU to pass through a quiescent
2340 * state for this grace period. Invoke rcu_report_qs_rsp()
2341 * to clean up and start the next grace period if one is needed.
2343 rcu_report_qs_rsp(flags
); /* releases rnp->lock. */
2347 * Record a quiescent state for all tasks that were previously queued
2348 * on the specified rcu_node structure and that were blocking the current
2349 * RCU grace period. The caller must hold the corresponding rnp->lock with
2350 * irqs disabled, and this lock is released upon return, but irqs remain
2353 static void __maybe_unused
2354 rcu_report_unblock_qs_rnp(struct rcu_node
*rnp
, unsigned long flags
)
2355 __releases(rnp
->lock
)
2359 struct rcu_node
*rnp_p
;
2361 raw_lockdep_assert_held_rcu_node(rnp
);
2362 if (WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPT_RCU
)) ||
2363 WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp
)) ||
2365 raw_spin_unlock_irqrestore_rcu_node(rnp
, flags
);
2366 return; /* Still need more quiescent states! */
2369 rnp
->completedqs
= rnp
->gp_seq
;
2370 rnp_p
= rnp
->parent
;
2371 if (rnp_p
== NULL
) {
2373 * Only one rcu_node structure in the tree, so don't
2374 * try to report up to its nonexistent parent!
2376 rcu_report_qs_rsp(flags
);
2380 /* Report up the rest of the hierarchy, tracking current ->gp_seq. */
2382 mask
= rnp
->grpmask
;
2383 raw_spin_unlock_rcu_node(rnp
); /* irqs remain disabled. */
2384 raw_spin_lock_rcu_node(rnp_p
); /* irqs already disabled. */
2385 rcu_report_qs_rnp(mask
, rnp_p
, gps
, flags
);
2389 * Record a quiescent state for the specified CPU to that CPU's rcu_data
2390 * structure. This must be called from the specified CPU.
2393 rcu_report_qs_rdp(struct rcu_data
*rdp
)
2395 unsigned long flags
;
2397 struct rcu_node
*rnp
;
2399 WARN_ON_ONCE(rdp
->cpu
!= smp_processor_id());
2401 raw_spin_lock_irqsave_rcu_node(rnp
, flags
);
2402 if (rdp
->cpu_no_qs
.b
.norm
|| rdp
->gp_seq
!= rnp
->gp_seq
||
2406 * The grace period in which this quiescent state was
2407 * recorded has ended, so don't report it upwards.
2408 * We will instead need a new quiescent state that lies
2409 * within the current grace period.
2411 rdp
->cpu_no_qs
.b
.norm
= true; /* need qs for new gp. */
2412 raw_spin_unlock_irqrestore_rcu_node(rnp
, flags
);
2415 mask
= rdp
->grpmask
;
2416 rdp
->core_needs_qs
= false;
2417 if ((rnp
->qsmask
& mask
) == 0) {
2418 raw_spin_unlock_irqrestore_rcu_node(rnp
, flags
);
2421 * This GP can't end until cpu checks in, so all of our
2422 * callbacks can be processed during the next GP.
2424 * NOCB kthreads have their own way to deal with that...
2426 if (!rcu_rdp_is_offloaded(rdp
)) {
2428 * The current GP has not yet ended, so it
2429 * should not be possible for rcu_accelerate_cbs()
2430 * to return true. So complain, but don't awaken.
2432 WARN_ON_ONCE(rcu_accelerate_cbs(rnp
, rdp
));
2435 rcu_disable_urgency_upon_qs(rdp
);
2436 rcu_report_qs_rnp(mask
, rnp
, rnp
->gp_seq
, flags
);
2437 /* ^^^ Released rnp->lock */
2442 * Check to see if there is a new grace period of which this CPU
2443 * is not yet aware, and if so, set up local rcu_data state for it.
2444 * Otherwise, see if this CPU has just passed through its first
2445 * quiescent state for this grace period, and record that fact if so.
2448 rcu_check_quiescent_state(struct rcu_data
*rdp
)
2450 /* Check for grace-period ends and beginnings. */
2451 note_gp_changes(rdp
);
2454 * Does this CPU still need to do its part for current grace period?
2455 * If no, return and let the other CPUs do their part as well.
2457 if (!rdp
->core_needs_qs
)
2461 * Was there a quiescent state since the beginning of the grace
2462 * period? If no, then exit and wait for the next call.
2464 if (rdp
->cpu_no_qs
.b
.norm
)
2468 * Tell RCU we are done (but rcu_report_qs_rdp() will be the
2471 rcu_report_qs_rdp(rdp
);
/* Return true if callback-invocation time limit exceeded. */
static bool rcu_do_batch_check_time(long count, long tlimit,
				    bool jlimit_check, unsigned long jlimit)
{
	// Invoke local_clock() only once per 32 consecutive callbacks.
	return unlikely(tlimit) &&
	       (!likely(count & 31) ||
		(IS_ENABLED(CONFIG_RCU_DOUBLE_CHECK_CB_TIME) &&
		 jlimit_check && time_after(jiffies, jlimit))) &&
	       local_clock() >= tlimit;
}
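/*
 * The "count & 31" trick above is a general pattern: pay for a clock read
 * on only one iteration in 32. A standalone sketch in plain C (names and
 * the clock source are illustrative, not the kernel's local_clock()):
 *
 *	#include <stdbool.h>
 *	#include <time.h>
 *
 *	static bool over_budget(long count, long long deadline_ns)
 *	{
 *		struct timespec ts;
 *
 *		if (count & 31)		// Cheap path for 31 of 32 iterations.
 *			return false;
 *		clock_gettime(CLOCK_MONOTONIC, &ts);
 *		return ts.tv_sec * 1000000000LL + ts.tv_nsec >= deadline_ns;
 *	}
 */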
2487 * Invoke any RCU callbacks that have made it to the end of their grace
2488 * period. Throttle as specified by rdp->blimit.
2490 static void rcu_do_batch(struct rcu_data
*rdp
)
2495 bool __maybe_unused empty
;
2496 unsigned long flags
;
2497 unsigned long jlimit
;
2498 bool jlimit_check
= false;
2500 struct rcu_cblist rcl
= RCU_CBLIST_INITIALIZER(rcl
);
2501 struct rcu_head
*rhp
;
2504 /* If no callbacks are ready, just return. */
2505 if (!rcu_segcblist_ready_cbs(&rdp
->cblist
)) {
2506 trace_rcu_batch_start(rcu_state
.name
,
2507 rcu_segcblist_n_cbs(&rdp
->cblist
), 0);
2508 trace_rcu_batch_end(rcu_state
.name
, 0,
2509 !rcu_segcblist_empty(&rdp
->cblist
),
2510 need_resched(), is_idle_task(current
),
2511 rcu_is_callbacks_kthread(rdp
));
2516 * Extract the list of ready callbacks, disabling IRQs to prevent
2517 * races with call_rcu() from interrupt handlers. Leave the
2518 * callback counts, as rcu_barrier() needs to be conservative.
2520 * Callbacks execution is fully ordered against preceding grace period
2521 * completion (materialized by rnp->gp_seq update) thanks to the
2522 * smp_mb__after_unlock_lock() upon node locking required for callbacks
2523 * advancing. In NOCB mode this ordering is then further relayed through
2524 * the nocb locking that protects both callbacks advancing and extraction.
2526 rcu_nocb_lock_irqsave(rdp
, flags
);
2527 WARN_ON_ONCE(cpu_is_offline(smp_processor_id()));
2528 pending
= rcu_segcblist_get_seglen(&rdp
->cblist
, RCU_DONE_TAIL
);
2529 div
= READ_ONCE(rcu_divisor
);
2530 div
= div
< 0 ? 7 : div
> sizeof(long) * 8 - 2 ? sizeof(long) * 8 - 2 : div
;
2531 bl
= max(rdp
->blimit
, pending
>> div
);
2532 if ((in_serving_softirq() || rdp
->rcu_cpu_kthread_status
== RCU_KTHREAD_RUNNING
) &&
2533 (IS_ENABLED(CONFIG_RCU_DOUBLE_CHECK_CB_TIME
) || unlikely(bl
> 100))) {
2534 const long npj
= NSEC_PER_SEC
/ HZ
;
2535 long rrn
= READ_ONCE(rcu_resched_ns
);
2537 rrn
= rrn
< NSEC_PER_MSEC
? NSEC_PER_MSEC
: rrn
> NSEC_PER_SEC
? NSEC_PER_SEC
: rrn
;
2538 tlimit
= local_clock() + rrn
;
2539 jlimit
= jiffies
+ (rrn
+ npj
+ 1) / npj
;
2540 jlimit_check
= true;
2542 trace_rcu_batch_start(rcu_state
.name
,
2543 rcu_segcblist_n_cbs(&rdp
->cblist
), bl
);
2544 rcu_segcblist_extract_done_cbs(&rdp
->cblist
, &rcl
);
2545 if (rcu_rdp_is_offloaded(rdp
))
2546 rdp
->qlen_last_fqs_check
= rcu_segcblist_n_cbs(&rdp
->cblist
);
2548 trace_rcu_segcb_stats(&rdp
->cblist
, TPS("SegCbDequeued"));
2549 rcu_nocb_unlock_irqrestore(rdp
, flags
);
2551 /* Invoke callbacks. */
2552 tick_dep_set_task(current
, TICK_DEP_BIT_RCU
);
2553 rhp
= rcu_cblist_dequeue(&rcl
);
2555 for (; rhp
; rhp
= rcu_cblist_dequeue(&rcl
)) {
2559 debug_rcu_head_unqueue(rhp
);
2561 rcu_lock_acquire(&rcu_callback_map
);
2562 trace_rcu_invoke_callback(rcu_state
.name
, rhp
);
2565 debug_rcu_head_callback(rhp
);
2566 WRITE_ONCE(rhp
->func
, (rcu_callback_t
)0L);
2569 rcu_lock_release(&rcu_callback_map
);
2572 * Stop only if limit reached and CPU has something to do.
2574 if (in_serving_softirq()) {
2575 if (count
>= bl
&& (need_resched() || !is_idle_task(current
)))
2578 * Make sure we don't spend too much time here and deprive other
2579 * softirq vectors of CPU cycles.
2581 if (rcu_do_batch_check_time(count
, tlimit
, jlimit_check
, jlimit
))
2584 // In rcuc/rcuoc context, so no worries about
2585 // depriving other softirq vectors of CPU cycles.
2587 lockdep_assert_irqs_enabled();
2588 cond_resched_tasks_rcu_qs();
2589 lockdep_assert_irqs_enabled();
2591 // But rcuc kthreads can delay quiescent-state
2592 // reporting, so check time limits for them.
2593 if (rdp
->rcu_cpu_kthread_status
== RCU_KTHREAD_RUNNING
&&
2594 rcu_do_batch_check_time(count
, tlimit
, jlimit_check
, jlimit
)) {
2595 rdp
->rcu_cpu_has_work
= 1;
2601 rcu_nocb_lock_irqsave(rdp
, flags
);
2602 rdp
->n_cbs_invoked
+= count
;
2603 trace_rcu_batch_end(rcu_state
.name
, count
, !!rcl
.head
, need_resched(),
2604 is_idle_task(current
), rcu_is_callbacks_kthread(rdp
));
2606 /* Update counts and requeue any remaining callbacks. */
2607 rcu_segcblist_insert_done_cbs(&rdp
->cblist
, &rcl
);
2608 rcu_segcblist_add_len(&rdp
->cblist
, -count
);
2610 /* Reinstate batch limit if we have worked down the excess. */
2611 count
= rcu_segcblist_n_cbs(&rdp
->cblist
);
2612 if (rdp
->blimit
>= DEFAULT_MAX_RCU_BLIMIT
&& count
<= qlowmark
)
2613 rdp
->blimit
= blimit
;
2615 /* Reset ->qlen_last_fqs_check trigger if enough CBs have drained. */
2616 if (count
== 0 && rdp
->qlen_last_fqs_check
!= 0) {
2617 rdp
->qlen_last_fqs_check
= 0;
2618 rdp
->n_force_qs_snap
= READ_ONCE(rcu_state
.n_force_qs
);
2619 } else if (count
< rdp
->qlen_last_fqs_check
- qhimark
)
2620 rdp
->qlen_last_fqs_check
= count
;
2623 * The following usually indicates a double call_rcu(). To track
2624 * this down, try building with CONFIG_DEBUG_OBJECTS_RCU_HEAD=y.
2626 empty
= rcu_segcblist_empty(&rdp
->cblist
);
2627 WARN_ON_ONCE(count
== 0 && !empty
);
2628 WARN_ON_ONCE(!IS_ENABLED(CONFIG_RCU_NOCB_CPU
) &&
2629 count
!= 0 && empty
);
2630 WARN_ON_ONCE(count
== 0 && rcu_segcblist_n_segment_cbs(&rdp
->cblist
) != 0);
2631 WARN_ON_ONCE(!empty
&& rcu_segcblist_n_segment_cbs(&rdp
->cblist
) == 0);
2633 rcu_nocb_unlock_irqrestore(rdp
, flags
);
2635 tick_dep_clear_task(current
, TICK_DEP_BIT_RCU
);
2639 * This function is invoked from each scheduling-clock interrupt,
2640 * and checks to see if this CPU is in a non-context-switch quiescent
2641 * state, for example, user mode or idle loop. It also schedules RCU
2642 * core processing. If the current grace period has gone on too long,
2643 * it will ask the scheduler to manufacture a context switch for the sole
2644 * purpose of providing the needed quiescent state.
2646 void rcu_sched_clock_irq(int user
)
2650 if (IS_ENABLED(CONFIG_PROVE_RCU
)) {
2652 WARN_ON_ONCE(time_before(j
, __this_cpu_read(rcu_data
.last_sched_clock
)));
2653 __this_cpu_write(rcu_data
.last_sched_clock
, j
);
2655 trace_rcu_utilization(TPS("Start scheduler-tick"));
2656 lockdep_assert_irqs_disabled();
2657 raw_cpu_inc(rcu_data
.ticks_this_gp
);
2658 /* The load-acquire pairs with the store-release setting to true. */
2659 if (smp_load_acquire(this_cpu_ptr(&rcu_data
.rcu_urgent_qs
))) {
2660 /* Idle and userspace execution already are quiescent states. */
2661 if (!rcu_is_cpu_rrupt_from_idle() && !user
) {
2662 set_tsk_need_resched(current
);
2663 set_preempt_need_resched();
2665 __this_cpu_write(rcu_data
.rcu_urgent_qs
, false);
2667 rcu_flavor_sched_clock_irq(user
);
2668 if (rcu_pending(user
))
2670 if (user
|| rcu_is_cpu_rrupt_from_idle())
2671 rcu_note_voluntary_context_switch(current
);
2672 lockdep_assert_irqs_disabled();
2674 trace_rcu_utilization(TPS("End scheduler-tick"));
2678 * Scan the leaf rcu_node structures. For each structure on which all
2679 * CPUs have reported a quiescent state and on which there are tasks
2680 * blocking the current grace period, initiate RCU priority boosting.
2681 * Otherwise, invoke the specified function to check dyntick state for
2682 * each CPU that has not yet reported a quiescent state.
2684 static void force_qs_rnp(int (*f
)(struct rcu_data
*rdp
))
2687 unsigned long flags
;
2688 struct rcu_node
*rnp
;
2690 rcu_state
.cbovld
= rcu_state
.cbovldnext
;
2691 rcu_state
.cbovldnext
= false;
2692 rcu_for_each_leaf_node(rnp
) {
2693 unsigned long mask
= 0;
2694 unsigned long rsmask
= 0;
2696 cond_resched_tasks_rcu_qs();
2697 raw_spin_lock_irqsave_rcu_node(rnp
, flags
);
2698 rcu_state
.cbovldnext
|= !!rnp
->cbovldmask
;
2699 if (rnp
->qsmask
== 0) {
2700 if (rcu_preempt_blocked_readers_cgp(rnp
)) {
2702 * No point in scanning bits because they
2703 * are all zero. But we might need to
2704 * priority-boost blocked readers.
2706 rcu_initiate_boost(rnp
, flags
);
2707 /* rcu_initiate_boost() releases rnp->lock */
2710 raw_spin_unlock_irqrestore_rcu_node(rnp
, flags
);
2713 for_each_leaf_node_cpu_mask(rnp
, cpu
, rnp
->qsmask
) {
2714 struct rcu_data
*rdp
;
2717 rdp
= per_cpu_ptr(&rcu_data
, cpu
);
2720 mask
|= rdp
->grpmask
;
2721 rcu_disable_urgency_upon_qs(rdp
);
2724 rsmask
|= rdp
->grpmask
;
2727 /* Idle/offline CPUs, report (releases rnp->lock). */
2728 rcu_report_qs_rnp(mask
, rnp
, rnp
->gp_seq
, flags
);
2730 /* Nothing to do here, so just drop the lock. */
2731 raw_spin_unlock_irqrestore_rcu_node(rnp
, flags
);
2734 for_each_leaf_node_cpu_mask(rnp
, cpu
, rsmask
)
2740 * Force quiescent states on reluctant CPUs, and also detect which
2741 * CPUs are in dyntick-idle mode.
2743 void rcu_force_quiescent_state(void)
2745 unsigned long flags
;
2747 struct rcu_node
*rnp
;
2748 struct rcu_node
*rnp_old
= NULL
;
2750 if (!rcu_gp_in_progress())
2752 /* Funnel through hierarchy to reduce memory contention. */
2753 rnp
= raw_cpu_read(rcu_data
.mynode
);
2754 for (; rnp
!= NULL
; rnp
= rnp
->parent
) {
2755 ret
= (READ_ONCE(rcu_state
.gp_flags
) & RCU_GP_FLAG_FQS
) ||
2756 !raw_spin_trylock(&rnp
->fqslock
);
2757 if (rnp_old
!= NULL
)
2758 raw_spin_unlock(&rnp_old
->fqslock
);
2763 /* rnp_old == rcu_get_root(), rnp == NULL. */
2765 /* Reached the root of the rcu_node tree, acquire lock. */
2766 raw_spin_lock_irqsave_rcu_node(rnp_old
, flags
);
2767 raw_spin_unlock(&rnp_old
->fqslock
);
2768 if (READ_ONCE(rcu_state
.gp_flags
) & RCU_GP_FLAG_FQS
) {
2769 raw_spin_unlock_irqrestore_rcu_node(rnp_old
, flags
);
2770 return; /* Someone beat us to it. */
2772 WRITE_ONCE(rcu_state
.gp_flags
, rcu_state
.gp_flags
| RCU_GP_FLAG_FQS
);
2773 raw_spin_unlock_irqrestore_rcu_node(rnp_old
, flags
);
2774 rcu_gp_kthread_wake();
2776 EXPORT_SYMBOL_GPL(rcu_force_quiescent_state
);
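/*
 * The loop above is an instance of funnel locking: walk from a leaf toward
 * the root, holding at most one trylock at a time, and bail out as soon as
 * a level is contended so that at most one CPU reaches the root lock. A
 * simplified userspace analog (types and names invented for illustration):
 *
 *	#include <pthread.h>
 *	#include <stddef.h>
 *
 *	struct tnode {
 *		pthread_mutex_t lock;
 *		struct tnode *parent;
 *	};
 *
 *	// Returns the root with its lock held, or NULL if another thread
 *	// is already funnelling up along an overlapping path.
 *	static struct tnode *funnel_to_root(struct tnode *leaf)
 *	{
 *		struct tnode *held = NULL;
 *
 *		for (struct tnode *n = leaf; n; n = n->parent) {
 *			int failed = pthread_mutex_trylock(&n->lock);
 *
 *			if (held)
 *				pthread_mutex_unlock(&held->lock);
 *			if (failed)
 *				return NULL;
 *			held = n;
 *		}
 *		return held;	// Caller unlocks when done.
 *	}
 */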
2778 // Workqueue handler for an RCU reader for kernels enforcing struct RCU
2780 static void strict_work_handler(struct work_struct
*work
)
2786 /* Perform RCU core processing work for the current CPU. */
2787 static __latent_entropy
void rcu_core(void)
2789 unsigned long flags
;
2790 struct rcu_data
*rdp
= raw_cpu_ptr(&rcu_data
);
2791 struct rcu_node
*rnp
= rdp
->mynode
;
2793 if (cpu_is_offline(smp_processor_id()))
2795 trace_rcu_utilization(TPS("Start RCU core"));
2796 WARN_ON_ONCE(!rdp
->beenonline
);
2798 /* Report any deferred quiescent states if preemption enabled. */
2799 if (IS_ENABLED(CONFIG_PREEMPT_COUNT
) && (!(preempt_count() & PREEMPT_MASK
))) {
2800 rcu_preempt_deferred_qs(current
);
2801 } else if (rcu_preempt_need_deferred_qs(current
)) {
2802 set_tsk_need_resched(current
);
2803 set_preempt_need_resched();
2806 /* Update RCU state based on any recent quiescent states. */
2807 rcu_check_quiescent_state(rdp
);
2809 /* No grace period and unregistered callbacks? */
2810 if (!rcu_gp_in_progress() &&
2811 rcu_segcblist_is_enabled(&rdp
->cblist
) && !rcu_rdp_is_offloaded(rdp
)) {
2812 local_irq_save(flags
);
2813 if (!rcu_segcblist_restempty(&rdp
->cblist
, RCU_NEXT_READY_TAIL
))
2814 rcu_accelerate_cbs_unlocked(rnp
, rdp
);
2815 local_irq_restore(flags
);
2818 rcu_check_gp_start_stall(rnp
, rdp
, rcu_jiffies_till_stall_check());
2820 /* If there are callbacks ready, invoke them. */
2821 if (!rcu_rdp_is_offloaded(rdp
) && rcu_segcblist_ready_cbs(&rdp
->cblist
) &&
2822 likely(READ_ONCE(rcu_scheduler_fully_active
))) {
2824 /* Re-invoke RCU core processing if there are callbacks remaining. */
2825 if (rcu_segcblist_ready_cbs(&rdp
->cblist
))
2829 /* Do any needed deferred wakeups of rcuo kthreads. */
2830 do_nocb_deferred_wakeup(rdp
);
2831 trace_rcu_utilization(TPS("End RCU core"));
2833 // If strict GPs, schedule an RCU reader in a clean environment.
2834 if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD
))
2835 queue_work_on(rdp
->cpu
, rcu_gp_wq
, &rdp
->strict_work
);
2838 static void rcu_core_si(void)
2843 static void rcu_wake_cond(struct task_struct
*t
, int status
)
2846 * If the thread is yielding, only wake it when this
2847 * is invoked from idle
2849 if (t
&& (status
!= RCU_KTHREAD_YIELDING
|| is_idle_task(current
)))
2853 static void invoke_rcu_core_kthread(void)
2855 struct task_struct
*t
;
2856 unsigned long flags
;
2858 local_irq_save(flags
);
2859 __this_cpu_write(rcu_data
.rcu_cpu_has_work
, 1);
2860 t
= __this_cpu_read(rcu_data
.rcu_cpu_kthread_task
);
2861 if (t
!= NULL
&& t
!= current
)
2862 rcu_wake_cond(t
, __this_cpu_read(rcu_data
.rcu_cpu_kthread_status
));
2863 local_irq_restore(flags
);
2867 * Wake up this CPU's rcuc kthread to do RCU core processing.
2869 static void invoke_rcu_core(void)
2871 if (!cpu_online(smp_processor_id()))
2874 raise_softirq(RCU_SOFTIRQ
);
2876 invoke_rcu_core_kthread();
2879 static void rcu_cpu_kthread_park(unsigned int cpu
)
2881 per_cpu(rcu_data
.rcu_cpu_kthread_status
, cpu
) = RCU_KTHREAD_OFFCPU
;
2884 static int rcu_cpu_kthread_should_run(unsigned int cpu
)
2886 return __this_cpu_read(rcu_data
.rcu_cpu_has_work
);
2890 * Per-CPU kernel thread that invokes RCU callbacks. This replaces
2891 * the RCU softirq used in configurations of RCU that do not support RCU
2892 * priority boosting.
2894 static void rcu_cpu_kthread(unsigned int cpu
)
2896 unsigned int *statusp
= this_cpu_ptr(&rcu_data
.rcu_cpu_kthread_status
);
2897 char work
, *workp
= this_cpu_ptr(&rcu_data
.rcu_cpu_has_work
);
2898 unsigned long *j
= this_cpu_ptr(&rcu_data
.rcuc_activity
);
2901 trace_rcu_utilization(TPS("Start CPU kthread@rcu_run"));
2902 for (spincnt
= 0; spincnt
< 10; spincnt
++) {
2903 WRITE_ONCE(*j
, jiffies
);
2905 *statusp
= RCU_KTHREAD_RUNNING
;
2906 local_irq_disable();
2908 WRITE_ONCE(*workp
, 0);
2913 if (!READ_ONCE(*workp
)) {
2914 trace_rcu_utilization(TPS("End CPU kthread@rcu_wait"));
2915 *statusp
= RCU_KTHREAD_WAITING
;
2919 *statusp
= RCU_KTHREAD_YIELDING
;
2920 trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield"));
2921 schedule_timeout_idle(2);
2922 trace_rcu_utilization(TPS("End CPU kthread@rcu_yield"));
2923 *statusp
= RCU_KTHREAD_WAITING
;
2924 WRITE_ONCE(*j
, jiffies
);
2927 static struct smp_hotplug_thread rcu_cpu_thread_spec
= {
2928 .store
= &rcu_data
.rcu_cpu_kthread_task
,
2929 .thread_should_run
= rcu_cpu_kthread_should_run
,
2930 .thread_fn
= rcu_cpu_kthread
,
2931 .thread_comm
= "rcuc/%u",
2932 .setup
= rcu_cpu_kthread_setup
,
2933 .park
= rcu_cpu_kthread_park
,
2937 * Spawn per-CPU RCU core processing kthreads.
2939 static int __init
rcu_spawn_core_kthreads(void)
2943 for_each_possible_cpu(cpu
)
2944 per_cpu(rcu_data
.rcu_cpu_has_work
, cpu
) = 0;
2947 WARN_ONCE(smpboot_register_percpu_thread(&rcu_cpu_thread_spec
),
2948 "%s: Could not start rcuc kthread, OOM is now expected behavior\n", __func__
);
2952 static void rcutree_enqueue(struct rcu_data
*rdp
, struct rcu_head
*head
, rcu_callback_t func
)
2954 rcu_segcblist_enqueue(&rdp
->cblist
, head
);
2955 if (__is_kvfree_rcu_offset((unsigned long)func
))
2956 trace_rcu_kvfree_callback(rcu_state
.name
, head
,
2957 (unsigned long)func
,
2958 rcu_segcblist_n_cbs(&rdp
->cblist
));
2960 trace_rcu_callback(rcu_state
.name
, head
,
2961 rcu_segcblist_n_cbs(&rdp
->cblist
));
2962 trace_rcu_segcb_stats(&rdp
->cblist
, TPS("SegCBQueued"));
2966 * Handle any core-RCU processing required by a call_rcu() invocation.
2968 static void call_rcu_core(struct rcu_data
*rdp
, struct rcu_head
*head
,
2969 rcu_callback_t func
, unsigned long flags
)
2971 rcutree_enqueue(rdp
, head
, func
);
2973 * If called from an extended quiescent state, invoke the RCU
2974 * core in order to force a re-evaluation of RCU's idleness.
2976 if (!rcu_is_watching())
2979 /* If interrupts were disabled or CPU offline, don't invoke RCU core. */
2980 if (irqs_disabled_flags(flags
) || cpu_is_offline(smp_processor_id()))
2984 * Force the grace period if too many callbacks or too long waiting.
2985 * Enforce hysteresis, and don't invoke rcu_force_quiescent_state()
2986 * if some other CPU has recently done so. Also, don't bother
2987 * invoking rcu_force_quiescent_state() if the newly enqueued callback
2988 * is the only one waiting for a grace period to complete.
2990 if (unlikely(rcu_segcblist_n_cbs(&rdp
->cblist
) >
2991 rdp
->qlen_last_fqs_check
+ qhimark
)) {
2993 /* Are we ignoring a completed grace period? */
2994 note_gp_changes(rdp
);
2996 /* Start a new grace period if one not already started. */
2997 if (!rcu_gp_in_progress()) {
2998 rcu_accelerate_cbs_unlocked(rdp
->mynode
, rdp
);
3000 /* Give the grace period a kick. */
3001 rdp
->blimit
= DEFAULT_MAX_RCU_BLIMIT
;
3002 if (READ_ONCE(rcu_state
.n_force_qs
) == rdp
->n_force_qs_snap
&&
3003 rcu_segcblist_first_pend_cb(&rdp
->cblist
) != head
)
3004 rcu_force_quiescent_state();
3005 rdp
->n_force_qs_snap
= READ_ONCE(rcu_state
.n_force_qs
);
3006 rdp
->qlen_last_fqs_check
= rcu_segcblist_n_cbs(&rdp
->cblist
);
/*
 * RCU callback function to leak a callback.
 */
static void rcu_leak_callback(struct rcu_head *rhp)
{
}
/*
 * Check and if necessary update the leaf rcu_node structure's
 * ->cbovldmask bit corresponding to the current CPU based on that CPU's
 * number of queued RCU callbacks. The caller must hold the leaf rcu_node
 * structure's ->lock.
 */
static void check_cb_ovld_locked(struct rcu_data *rdp, struct rcu_node *rnp)
{
	raw_lockdep_assert_held_rcu_node(rnp);
	if (qovld_calc <= 0)
		return; // Early boot and wildcard value set.
	if (rcu_segcblist_n_cbs(&rdp->cblist) >= qovld_calc)
		WRITE_ONCE(rnp->cbovldmask, rnp->cbovldmask | rdp->grpmask);
	else
		WRITE_ONCE(rnp->cbovldmask, rnp->cbovldmask & ~rdp->grpmask);
}
3036 * Check and if necessary update the leaf rcu_node structure's
3037 * ->cbovldmask bit corresponding to the current CPU based on that CPU's
3038 * number of queued RCU callbacks. No locks need be held, but the
3039 * caller must have disabled interrupts.
3041 * Note that this function ignores the possibility that there are a lot
3042 * of callbacks all of which have already seen the end of their respective
3043 * grace periods. This omission is due to the need for no-CBs CPUs to
3044 * be holding ->nocb_lock to do this check, which is too heavy for a
3045 * common-case operation.
static void check_cb_ovld(struct rcu_data *rdp)
{
	struct rcu_node *const rnp = rdp->mynode;

	if (qovld_calc <= 0 ||
	    ((rcu_segcblist_n_cbs(&rdp->cblist) >= qovld_calc) ==
	     !!(READ_ONCE(rnp->cbovldmask) & rdp->grpmask)))
		return; // Early boot wildcard value or already set correctly.
	raw_spin_lock_rcu_node(rnp);
	check_cb_ovld_locked(rdp, rnp);
	raw_spin_unlock_rcu_node(rnp);
}
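/*
 * The check above avoids the rcu_node lock entirely whenever the published
 * ->cbovldmask bit already matches the locally computed overload state. A
 * userspace sketch of the same "test before locking" pattern (names and
 * types invented for illustration):
 *
 *	#include <pthread.h>
 *	#include <stdatomic.h>
 *	#include <stdbool.h>
 *
 *	static atomic_ulong ovld_mask;
 *	static pthread_mutex_t node_lock = PTHREAD_MUTEX_INITIALIZER;
 *
 *	static void update_ovld_bit(unsigned long my_bit, bool overloaded)
 *	{
 *		bool bit_set = atomic_load(&ovld_mask) & my_bit;
 *
 *		if (bit_set == overloaded)
 *			return;		// Common case: nothing to change.
 *		pthread_mutex_lock(&node_lock);
 *		if (overloaded)
 *			atomic_fetch_or(&ovld_mask, my_bit);
 *		else
 *			atomic_fetch_and(&ovld_mask, ~my_bit);
 *		pthread_mutex_unlock(&node_lock);
 *	}
 */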
3061 __call_rcu_common(struct rcu_head
*head
, rcu_callback_t func
, bool lazy_in
)
3063 static atomic_t doublefrees
;
3064 unsigned long flags
;
3066 struct rcu_data
*rdp
;
3068 /* Misaligned rcu_head! */
3069 WARN_ON_ONCE((unsigned long)head
& (sizeof(void *) - 1));
3071 if (debug_rcu_head_queue(head
)) {
3073 * Probable double call_rcu(), so leak the callback.
3074 * Use rcu:rcu_callback trace event to find the previous
3075 * time callback was passed to call_rcu().
3077 if (atomic_inc_return(&doublefrees
) < 4) {
3078 pr_err("%s(): Double-freed CB %p->%pS()!!! ", __func__
, head
, head
->func
);
3081 WRITE_ONCE(head
->func
, rcu_leak_callback
);
3086 kasan_record_aux_stack_noalloc(head
);
3087 local_irq_save(flags
);
3088 rdp
= this_cpu_ptr(&rcu_data
);
3089 lazy
= lazy_in
&& !rcu_async_should_hurry();
3091 /* Add the callback to our list. */
3092 if (unlikely(!rcu_segcblist_is_enabled(&rdp
->cblist
))) {
3093 // This can trigger due to call_rcu() from offline CPU:
3094 WARN_ON_ONCE(rcu_scheduler_active
!= RCU_SCHEDULER_INACTIVE
);
3095 WARN_ON_ONCE(!rcu_is_watching());
3096 // Very early boot, before rcu_init(). Initialize if needed
3097 // and then drop through to queue the callback.
3098 if (rcu_segcblist_empty(&rdp
->cblist
))
3099 rcu_segcblist_init(&rdp
->cblist
);
3104 if (unlikely(rcu_rdp_is_offloaded(rdp
)))
3105 call_rcu_nocb(rdp
, head
, func
, flags
, lazy
);
3107 call_rcu_core(rdp
, head
, func
, flags
);
3108 local_irq_restore(flags
);
3111 #ifdef CONFIG_RCU_LAZY
3112 static bool enable_rcu_lazy __read_mostly
= !IS_ENABLED(CONFIG_RCU_LAZY_DEFAULT_OFF
);
3113 module_param(enable_rcu_lazy
, bool, 0444);
3116 * call_rcu_hurry() - Queue RCU callback for invocation after grace period, and
3117 * flush all lazy callbacks (including the new one) to the main ->cblist while
3120 * @head: structure to be used for queueing the RCU updates.
3121 * @func: actual callback function to be invoked after the grace period
3123 * The callback function will be invoked some time after a full grace
3124 * period elapses, in other words after all pre-existing RCU read-side
3125 * critical sections have completed.
3127 * Use this API instead of call_rcu() if you don't want the callback to be
3128 * invoked after very long periods of time, which can happen on systems without
3129 * memory pressure and on systems which are lightly loaded or mostly idle.
 * This function will cause callbacks to be invoked sooner rather than later
 * at the expense of extra power. Other than that, this function is identical to, and
3132 * reuses call_rcu()'s logic. Refer to call_rcu() for more details about memory
3133 * ordering and other functionality.
3135 void call_rcu_hurry(struct rcu_head
*head
, rcu_callback_t func
)
3137 __call_rcu_common(head
, func
, false);
3139 EXPORT_SYMBOL_GPL(call_rcu_hurry
);
3141 #define enable_rcu_lazy false
3145 * call_rcu() - Queue an RCU callback for invocation after a grace period.
3146 * By default the callbacks are 'lazy' and are kept hidden from the main
3147 * ->cblist to prevent starting of grace periods too soon.
3148 * If you desire grace periods to start very soon, use call_rcu_hurry().
3150 * @head: structure to be used for queueing the RCU updates.
3151 * @func: actual callback function to be invoked after the grace period
3153 * The callback function will be invoked some time after a full grace
3154 * period elapses, in other words after all pre-existing RCU read-side
3155 * critical sections have completed. However, the callback function
3156 * might well execute concurrently with RCU read-side critical sections
3157 * that started after call_rcu() was invoked.
3159 * RCU read-side critical sections are delimited by rcu_read_lock()
3160 * and rcu_read_unlock(), and may be nested. In addition, but only in
3161 * v5.0 and later, regions of code across which interrupts, preemption,
3162 * or softirqs have been disabled also serve as RCU read-side critical
3163 * sections. This includes hardware interrupt handlers, softirq handlers,
3166 * Note that all CPUs must agree that the grace period extended beyond
 * all pre-existing RCU read-side critical sections. On systems with more
3168 * than one CPU, this means that when "func()" is invoked, each CPU is
3169 * guaranteed to have executed a full memory barrier since the end of its
3170 * last RCU read-side critical section whose beginning preceded the call
3171 * to call_rcu(). It also means that each CPU executing an RCU read-side
3172 * critical section that continues beyond the start of "func()" must have
3173 * executed a memory barrier after the call_rcu() but before the beginning
3174 * of that RCU read-side critical section. Note that these guarantees
3175 * include CPUs that are offline, idle, or executing in user mode, as
3176 * well as CPUs that are executing in the kernel.
3178 * Furthermore, if CPU A invoked call_rcu() and CPU B invoked the
3179 * resulting RCU callback function "func()", then both CPU A and CPU B are
3180 * guaranteed to execute a full memory barrier during the time interval
3181 * between the call to call_rcu() and the invocation of "func()" -- even
3182 * if CPU A and CPU B are the same CPU (but again only if the system has
3183 * more than one CPU).
3185 * Implementation of these memory-ordering guarantees is described here:
3186 * Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst.
3188 void call_rcu(struct rcu_head
*head
, rcu_callback_t func
)
3190 __call_rcu_common(head
, func
, enable_rcu_lazy
);
3192 EXPORT_SYMBOL_GPL(call_rcu
);
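/*
 * Typical usage: embed an rcu_head in the protected structure and free it
 * from the callback once the grace period has elapsed. A minimal sketch
 * ("struct foo" and "free_foo_cb" are made-up names):
 *
 *	struct foo {
 *		int data;
 *		struct rcu_head rcu;
 *	};
 *
 *	static void free_foo_cb(struct rcu_head *rhp)
 *	{
 *		struct foo *fp = container_of(rhp, struct foo, rcu);
 *
 *		kfree(fp);
 *	}
 *
 *	// After unlinking fp from all RCU-protected data structures:
 *	call_rcu(&fp->rcu, free_foo_cb);
 *
 * The kvfree_rcu()/kfree_rcu() machinery below handles the common
 * free-only case without an explicit callback.
 */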
3194 /* Maximum number of jiffies to wait before draining a batch. */
3195 #define KFREE_DRAIN_JIFFIES (5 * HZ)
3196 #define KFREE_N_BATCHES 2
3197 #define FREE_N_CHANNELS 2
3200 * struct kvfree_rcu_bulk_data - single block to store kvfree_rcu() pointers
3201 * @list: List node. All blocks are linked between each other
3202 * @gp_snap: Snapshot of RCU state for objects placed to this bulk
3203 * @nr_records: Number of active pointers in the array
3204 * @records: Array of the kvfree_rcu() pointers
3206 struct kvfree_rcu_bulk_data
{
3207 struct list_head list
;
3208 struct rcu_gp_oldstate gp_snap
;
3209 unsigned long nr_records
;
3210 void *records
[] __counted_by(nr_records
);
3214 * This macro defines how many entries the "records" array
3215 * will contain. It is based on the fact that the size of
3216 * kvfree_rcu_bulk_data structure becomes exactly one page.
3218 #define KVFREE_BULK_MAX_ENTR \
3219 ((PAGE_SIZE - sizeof(struct kvfree_rcu_bulk_data)) / sizeof(void *))
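/*
 * As a worked example (values depend on the architecture and config): with
 * a 4 KiB page, 8-byte pointers, and a header of roughly 40 bytes (16 for
 * the list_head, 16 for the two-field grace-period snapshot, 8 for
 * nr_records), this comes to (4096 - 40) / 8 = 507 pointers per block.
 */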
3222 * struct kfree_rcu_cpu_work - single batch of kfree_rcu() requests
3223 * @rcu_work: Let queue_rcu_work() invoke workqueue handler after grace period
3224 * @head_free: List of kfree_rcu() objects waiting for a grace period
3225 * @head_free_gp_snap: Grace-period snapshot to check for attempted premature frees.
3226 * @bulk_head_free: Bulk-List of kvfree_rcu() objects waiting for a grace period
3227 * @krcp: Pointer to @kfree_rcu_cpu structure
3230 struct kfree_rcu_cpu_work
{
3231 struct rcu_work rcu_work
;
3232 struct rcu_head
*head_free
;
3233 struct rcu_gp_oldstate head_free_gp_snap
;
3234 struct list_head bulk_head_free
[FREE_N_CHANNELS
];
3235 struct kfree_rcu_cpu
*krcp
;
3239 * struct kfree_rcu_cpu - batch up kfree_rcu() requests for RCU grace period
3240 * @head: List of kfree_rcu() objects not yet waiting for a grace period
3241 * @head_gp_snap: Snapshot of RCU state for objects placed to "@head"
3242 * @bulk_head: Bulk-List of kvfree_rcu() objects not yet waiting for a grace period
3243 * @krw_arr: Array of batches of kfree_rcu() objects waiting for a grace period
3244 * @lock: Synchronize access to this structure
3245 * @monitor_work: Promote @head to @head_free after KFREE_DRAIN_JIFFIES
3246 * @initialized: The @rcu_work fields have been initialized
3247 * @head_count: Number of objects in rcu_head singular list
3248 * @bulk_count: Number of objects in bulk-list
 * @bkvcache:
 *	A simple cache list that contains objects for reuse purposes.
 *	In order to save some per-cpu space the list is singular.
 *	Even though it is lockless, access has to be protected by the
 *	per-cpu lock.
3254 * @page_cache_work: A work to refill the cache when it is empty
3255 * @backoff_page_cache_fill: Delay cache refills
3256 * @work_in_progress: Indicates that page_cache_work is running
3257 * @hrtimer: A hrtimer for scheduling a page_cache_work
3258 * @nr_bkv_objs: number of allocated objects at @bkvcache.
3260 * This is a per-CPU structure. The reason that it is not included in
3261 * the rcu_data structure is to permit this code to be extracted from
3262 * the RCU files. Such extraction could allow further optimization of
3263 * the interactions with the slab allocators.
3265 struct kfree_rcu_cpu
{
3266 // Objects queued on a linked list
3267 // through their rcu_head structures.
3268 struct rcu_head
*head
;
3269 unsigned long head_gp_snap
;
3270 atomic_t head_count
;
3272 // Objects queued on a bulk-list.
3273 struct list_head bulk_head
[FREE_N_CHANNELS
];
3274 atomic_t bulk_count
[FREE_N_CHANNELS
];
3276 struct kfree_rcu_cpu_work krw_arr
[KFREE_N_BATCHES
];
3277 raw_spinlock_t lock
;
3278 struct delayed_work monitor_work
;
3281 struct delayed_work page_cache_work
;
3282 atomic_t backoff_page_cache_fill
;
3283 atomic_t work_in_progress
;
3284 struct hrtimer hrtimer
;
3286 struct llist_head bkvcache
;
3290 static DEFINE_PER_CPU(struct kfree_rcu_cpu
, krc
) = {
3291 .lock
= __RAW_SPIN_LOCK_UNLOCKED(krc
.lock
),
static __always_inline void
debug_rcu_bhead_unqueue(struct kvfree_rcu_bulk_data *bhead)
{
#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
	int i;

	for (i = 0; i < bhead->nr_records; i++)
		debug_rcu_head_unqueue((struct rcu_head *)(bhead->records[i]));
#endif
}
static inline struct kfree_rcu_cpu *
krc_this_cpu_lock(unsigned long *flags)
{
	struct kfree_rcu_cpu *krcp;

	local_irq_save(*flags);	// For safely calling this_cpu_ptr().
	krcp = this_cpu_ptr(&krc);
	raw_spin_lock(&krcp->lock);

	return krcp;
}

static inline void
krc_this_cpu_unlock(struct kfree_rcu_cpu *krcp, unsigned long flags)
{
	raw_spin_unlock_irqrestore(&krcp->lock, flags);
}
static inline struct kvfree_rcu_bulk_data *
get_cached_bnode(struct kfree_rcu_cpu *krcp)
{
	if (!krcp->nr_bkv_objs)
		return NULL;

	WRITE_ONCE(krcp->nr_bkv_objs, krcp->nr_bkv_objs - 1);
	return (struct kvfree_rcu_bulk_data *)
		llist_del_first(&krcp->bkvcache);
}

static inline bool
put_cached_bnode(struct kfree_rcu_cpu *krcp,
		 struct kvfree_rcu_bulk_data *bnode)
{
	// Check the limit.
	if (krcp->nr_bkv_objs >= rcu_min_cached_objs)
		return false;

	llist_add((struct llist_node *) bnode, &krcp->bkvcache);
	WRITE_ONCE(krcp->nr_bkv_objs, krcp->nr_bkv_objs + 1);
	return true;
}
3348 drain_page_cache(struct kfree_rcu_cpu
*krcp
)
3350 unsigned long flags
;
3351 struct llist_node
*page_list
, *pos
, *n
;
3354 if (!rcu_min_cached_objs
)
3357 raw_spin_lock_irqsave(&krcp
->lock
, flags
);
3358 page_list
= llist_del_all(&krcp
->bkvcache
);
3359 WRITE_ONCE(krcp
->nr_bkv_objs
, 0);
3360 raw_spin_unlock_irqrestore(&krcp
->lock
, flags
);
3362 llist_for_each_safe(pos
, n
, page_list
) {
3363 free_page((unsigned long)pos
);
static void
kvfree_rcu_bulk(struct kfree_rcu_cpu *krcp,
	struct kvfree_rcu_bulk_data *bnode, int idx)
{
	unsigned long flags;
	int i;

	if (!WARN_ON_ONCE(!poll_state_synchronize_rcu_full(&bnode->gp_snap))) {
		debug_rcu_bhead_unqueue(bnode);
		rcu_lock_acquire(&rcu_callback_map);
		if (idx == 0) { // kmalloc() / kfree().
			trace_rcu_invoke_kfree_bulk_callback(
				rcu_state.name, bnode->nr_records,
				bnode->records);

			kfree_bulk(bnode->nr_records, bnode->records);
		} else { // vmalloc() / vfree().
			for (i = 0; i < bnode->nr_records; i++) {
				trace_rcu_invoke_kvfree_callback(
					rcu_state.name, bnode->records[i], 0);

				vfree(bnode->records[i]);
			}
		}
		rcu_lock_release(&rcu_callback_map);
	}

	raw_spin_lock_irqsave(&krcp->lock, flags);
	if (put_cached_bnode(krcp, bnode))
		bnode = NULL;
	raw_spin_unlock_irqrestore(&krcp->lock, flags);

	if (bnode)
		free_page((unsigned long) bnode);

	cond_resched_tasks_rcu_qs();
}

static void
kvfree_rcu_list(struct rcu_head *head)
{
	struct rcu_head *next;

	for (; head; head = next) {
		void *ptr = (void *) head->func;
		unsigned long offset = (void *) head - ptr;

		next = head->next;
		debug_rcu_head_unqueue((struct rcu_head *)ptr);
		rcu_lock_acquire(&rcu_callback_map);
		trace_rcu_invoke_kvfree_callback(rcu_state.name, head, offset);

		if (!WARN_ON_ONCE(!__is_kvfree_rcu_offset(offset)))
			kvfree(ptr);

		rcu_lock_release(&rcu_callback_map);
		cond_resched_tasks_rcu_qs();
	}
}

/*
 * This function is invoked in workqueue context after a grace period.
 * It frees all the objects queued on ->bulk_head_free or ->head_free.
 */
static void kfree_rcu_work(struct work_struct *work)
{
	unsigned long flags;
	struct kvfree_rcu_bulk_data *bnode, *n;
	struct list_head bulk_head[FREE_N_CHANNELS];
	struct rcu_head *head;
	struct kfree_rcu_cpu *krcp;
	struct kfree_rcu_cpu_work *krwp;
	struct rcu_gp_oldstate head_gp_snap;
	int i;

	krwp = container_of(to_rcu_work(work),
			    struct kfree_rcu_cpu_work, rcu_work);
	krcp = krwp->krcp;

	raw_spin_lock_irqsave(&krcp->lock, flags);
	// Channels 1 and 2.
	for (i = 0; i < FREE_N_CHANNELS; i++)
		list_replace_init(&krwp->bulk_head_free[i], &bulk_head[i]);

	// Channel 3.
	head = krwp->head_free;
	krwp->head_free = NULL;
	head_gp_snap = krwp->head_free_gp_snap;
	raw_spin_unlock_irqrestore(&krcp->lock, flags);

	// Handle the first two channels.
	for (i = 0; i < FREE_N_CHANNELS; i++) {
		// Start from the tail page, so a GP is likely passed for it.
		list_for_each_entry_safe(bnode, n, &bulk_head[i], list)
			kvfree_rcu_bulk(krcp, bnode, i);
	}

	/*
	 * This is used when the "bulk" path can not be used for the
	 * double-argument of kvfree_rcu().  This happens when the
	 * page-cache is empty, which means that objects are instead
	 * queued on a linked list through their rcu_head structures.
	 * This list is named "Channel 3".
	 */
	if (head && !WARN_ON_ONCE(!poll_state_synchronize_rcu_full(&head_gp_snap)))
		kvfree_rcu_list(head);
}
static bool
need_offload_krc(struct kfree_rcu_cpu *krcp)
{
	int i;

	for (i = 0; i < FREE_N_CHANNELS; i++)
		if (!list_empty(&krcp->bulk_head[i]))
			return true;

	return !!READ_ONCE(krcp->head);
}

static bool
need_wait_for_krwp_work(struct kfree_rcu_cpu_work *krwp)
{
	int i;

	for (i = 0; i < FREE_N_CHANNELS; i++)
		if (!list_empty(&krwp->bulk_head_free[i]))
			return true;

	return !!krwp->head_free;
}

static int krc_count(struct kfree_rcu_cpu *krcp)
{
	int sum = atomic_read(&krcp->head_count);
	int i;

	for (i = 0; i < FREE_N_CHANNELS; i++)
		sum += atomic_read(&krcp->bulk_count[i]);

	return sum;
}

static void
__schedule_delayed_monitor_work(struct kfree_rcu_cpu *krcp)
{
	long delay, delay_left;

	delay = krc_count(krcp) >= KVFREE_BULK_MAX_ENTR ? 1:KFREE_DRAIN_JIFFIES;
	if (delayed_work_pending(&krcp->monitor_work)) {
		delay_left = krcp->monitor_work.timer.expires - jiffies;
		if (delay < delay_left)
			mod_delayed_work(system_unbound_wq, &krcp->monitor_work, delay);
		return;
	}
	queue_delayed_work(system_unbound_wq, &krcp->monitor_work, delay);
}

static void
schedule_delayed_monitor_work(struct kfree_rcu_cpu *krcp)
{
	unsigned long flags;

	raw_spin_lock_irqsave(&krcp->lock, flags);
	__schedule_delayed_monitor_work(krcp);
	raw_spin_unlock_irqrestore(&krcp->lock, flags);
}
static void
kvfree_rcu_drain_ready(struct kfree_rcu_cpu *krcp)
{
	struct list_head bulk_ready[FREE_N_CHANNELS];
	struct kvfree_rcu_bulk_data *bnode, *n;
	struct rcu_head *head_ready = NULL;
	unsigned long flags;
	int i;

	raw_spin_lock_irqsave(&krcp->lock, flags);
	for (i = 0; i < FREE_N_CHANNELS; i++) {
		INIT_LIST_HEAD(&bulk_ready[i]);

		list_for_each_entry_safe_reverse(bnode, n, &krcp->bulk_head[i], list) {
			if (!poll_state_synchronize_rcu_full(&bnode->gp_snap))
				break;

			atomic_sub(bnode->nr_records, &krcp->bulk_count[i]);
			list_move(&bnode->list, &bulk_ready[i]);
		}
	}

	if (krcp->head && poll_state_synchronize_rcu(krcp->head_gp_snap)) {
		head_ready = krcp->head;
		atomic_set(&krcp->head_count, 0);
		WRITE_ONCE(krcp->head, NULL);
	}
	raw_spin_unlock_irqrestore(&krcp->lock, flags);

	for (i = 0; i < FREE_N_CHANNELS; i++) {
		list_for_each_entry_safe(bnode, n, &bulk_ready[i], list)
			kvfree_rcu_bulk(krcp, bnode, i);
	}

	if (head_ready)
		kvfree_rcu_list(head_ready);
}

/*
 * Return: %true if a work is queued, %false otherwise.
 */
static bool
kvfree_rcu_queue_batch(struct kfree_rcu_cpu *krcp)
{
	unsigned long flags;
	bool queued = false;
	int i, j;

	raw_spin_lock_irqsave(&krcp->lock, flags);

	// Attempt to start a new batch.
	for (i = 0; i < KFREE_N_BATCHES; i++) {
		struct kfree_rcu_cpu_work *krwp = &(krcp->krw_arr[i]);

		// Try to detach bulk_head or head and attach it, but only when
		// all channels are free. A busy channel means that this krwp
		// still has in-flight RCU work handling its free business.
		if (need_wait_for_krwp_work(krwp))
			continue;

		// kvfree_rcu_drain_ready() might handle this krcp, if so give up.
		if (need_offload_krc(krcp)) {
			// Channel 1 corresponds to the SLAB-pointer bulk path.
			// Channel 2 corresponds to vmalloc-pointer bulk path.
			for (j = 0; j < FREE_N_CHANNELS; j++) {
				if (list_empty(&krwp->bulk_head_free[j])) {
					atomic_set(&krcp->bulk_count[j], 0);
					list_replace_init(&krcp->bulk_head[j],
						&krwp->bulk_head_free[j]);
				}
			}

			// Channel 3 corresponds to both SLAB and vmalloc
			// objects queued on the linked list.
			if (!krwp->head_free) {
				krwp->head_free = krcp->head;
				get_state_synchronize_rcu_full(&krwp->head_free_gp_snap);
				atomic_set(&krcp->head_count, 0);
				WRITE_ONCE(krcp->head, NULL);
			}

			// One work handles one batch, covering all three "free
			// channels". Break out of the loop because this CPU is
			// now done, thus queuing an RCU work always succeeds here.
			queued = queue_rcu_work(system_unbound_wq, &krwp->rcu_work);
			WARN_ON_ONCE(!queued);
			break;
		}
	}

	raw_spin_unlock_irqrestore(&krcp->lock, flags);
	return queued;
}

/*
 * This function is invoked after the KFREE_DRAIN_JIFFIES timeout.
 */
static void kfree_rcu_monitor(struct work_struct *work)
{
	struct kfree_rcu_cpu *krcp = container_of(work,
		struct kfree_rcu_cpu, monitor_work.work);

	// Drain ready for reclaim.
	kvfree_rcu_drain_ready(krcp);

	// Queue a batch for the rest.
	kvfree_rcu_queue_batch(krcp);

	// If there is nothing left to detach, our job here is done. If at
	// least one channel is still busy because previous batches are
	// still in progress, rearm the work to repeat the attempt.
	if (need_offload_krc(krcp))
		schedule_delayed_monitor_work(krcp);
}
static enum hrtimer_restart
schedule_page_work_fn(struct hrtimer *t)
{
	struct kfree_rcu_cpu *krcp =
		container_of(t, struct kfree_rcu_cpu, hrtimer);

	queue_delayed_work(system_highpri_wq, &krcp->page_cache_work, 0);
	return HRTIMER_NORESTART;
}
static void fill_page_cache_func(struct work_struct *work)
{
	struct kvfree_rcu_bulk_data *bnode;
	struct kfree_rcu_cpu *krcp =
		container_of(work, struct kfree_rcu_cpu,
			page_cache_work.work);
	unsigned long flags;
	int nr_pages;
	bool pushed;
	int i;

	nr_pages = atomic_read(&krcp->backoff_page_cache_fill) ?
		1 : rcu_min_cached_objs;

	for (i = READ_ONCE(krcp->nr_bkv_objs); i < nr_pages; i++) {
		bnode = (struct kvfree_rcu_bulk_data *)
			__get_free_page(GFP_KERNEL | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);

		if (!bnode)
			break;

		raw_spin_lock_irqsave(&krcp->lock, flags);
		pushed = put_cached_bnode(krcp, bnode);
		raw_spin_unlock_irqrestore(&krcp->lock, flags);

		if (!pushed) {
			free_page((unsigned long) bnode);
			break;
		}
	}

	atomic_set(&krcp->work_in_progress, 0);
	atomic_set(&krcp->backoff_page_cache_fill, 0);
}

static void
run_page_cache_worker(struct kfree_rcu_cpu *krcp)
{
	// If cache disabled, bail out.
	if (!rcu_min_cached_objs)
		return;

	if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING &&
			!atomic_xchg(&krcp->work_in_progress, 1)) {
		if (atomic_read(&krcp->backoff_page_cache_fill)) {
			queue_delayed_work(system_unbound_wq,
				&krcp->page_cache_work,
					msecs_to_jiffies(rcu_delay_page_cache_fill_msec));
		} else {
			hrtimer_init(&krcp->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
			krcp->hrtimer.function = schedule_page_work_fn;
			hrtimer_start(&krcp->hrtimer, 0, HRTIMER_MODE_REL);
		}
	}
}

// Record ptr in a page managed by krcp, with the pre-krc_this_cpu_lock()
// state specified by flags.  If can_alloc is true, the caller must
// be schedulable and not be holding any locks or mutexes that might be
// acquired by the memory allocator or anything that it might invoke.
// Returns true if ptr was successfully recorded, else the caller must
// use a fallback.
static bool
add_ptr_to_bulk_krc_lock(struct kfree_rcu_cpu **krcp,
	unsigned long *flags, void *ptr, bool can_alloc)
{
	struct kvfree_rcu_bulk_data *bnode;
	int idx;

	*krcp = krc_this_cpu_lock(flags);
	if (unlikely(!(*krcp)->initialized))
		return false;

	idx = !!is_vmalloc_addr(ptr);
	bnode = list_first_entry_or_null(&(*krcp)->bulk_head[idx],
		struct kvfree_rcu_bulk_data, list);

	/* Check if a new block is required. */
	if (!bnode || bnode->nr_records == KVFREE_BULK_MAX_ENTR) {
		bnode = get_cached_bnode(*krcp);
		if (!bnode && can_alloc) {
			krc_this_cpu_unlock(*krcp, *flags);

			// __GFP_NORETRY - allows a light-weight direct reclaim,
			// which is acceptable here because it minimizes how often
			// this fallback path is hit. It also forbids invoking the
			// OOM killer, which is likewise beneficial since we are
			// about to release memory soon anyway.
			//
			// __GFP_NOMEMALLOC - prevents this allocation from consuming
			// the memory reserves. Please note that a fallback path exists.
			//
			// __GFP_NOWARN - the allocation is expected to fail under
			// low-memory or high-memory-pressure conditions, so do not
			// warn about it.
			bnode = (struct kvfree_rcu_bulk_data *)
				__get_free_page(GFP_KERNEL | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
			raw_spin_lock_irqsave(&(*krcp)->lock, *flags);
		}

		if (!bnode)
			return false;

		// Initialize the new block and attach it.
		bnode->nr_records = 0;
		list_add(&bnode->list, &(*krcp)->bulk_head[idx]);
	}

	// Finally insert and update the GP for this page.
	bnode->nr_records++;
	bnode->records[bnode->nr_records - 1] = ptr;
	get_state_synchronize_rcu_full(&bnode->gp_snap);
	atomic_inc(&(*krcp)->bulk_count[idx]);

	return true;
}
/*
 * Queue a request for lazy invocation of the appropriate free routine
 * after a grace period.  Please note that three paths are maintained,
 * two for the common case using arrays of pointers and a third one that
 * is used only when the main paths cannot be used, for example, due to
 * memory pressure.
 *
 * Each kvfree_call_rcu() request is added to a batch. The batch will be drained
 * every KFREE_DRAIN_JIFFIES number of jiffies. All the objects in the batch will
 * be freed in workqueue context. Batching requests together reduces the number
 * of grace periods during heavy kfree_rcu()/kvfree_rcu() load.
 */
void kvfree_call_rcu(struct rcu_head *head, void *ptr)
{
	unsigned long flags;
	struct kfree_rcu_cpu *krcp;
	bool success;

	/*
	 * Please note there is a limitation for the head-less
	 * variant, that is why there is a clear rule for such
	 * objects: it can be used from might_sleep() context
	 * only. For other places please embed an rcu_head to
	 * your data.
	 */
	if (!head)
		might_sleep();

	// Queue the object but don't yet schedule the batch.
	if (debug_rcu_head_queue(ptr)) {
		// Probable double kfree_rcu(), just leak.
		WARN_ONCE(1, "%s(): Double-freed call. rcu_head %p\n",
			  __func__, head);

		// Mark as success and leave.
		return;
	}

	kasan_record_aux_stack_noalloc(ptr);
	success = add_ptr_to_bulk_krc_lock(&krcp, &flags, ptr, !head);
	if (!success) {
		run_page_cache_worker(krcp);

		if (!head)
			// Inline if kvfree_rcu(one_arg) call.
			goto unlock_return;

		head->func = ptr;
		head->next = krcp->head;
		WRITE_ONCE(krcp->head, head);
		atomic_inc(&krcp->head_count);

		// Take a snapshot for this krcp.
		krcp->head_gp_snap = get_state_synchronize_rcu();
		success = true;
	}

	/*
	 * The kvfree_rcu() caller considers the pointer freed at this point
	 * and likely removes any references to it. Since the actual slab
	 * freeing (and kmemleak_free()) is deferred, tell kmemleak to ignore
	 * this object (no scanning or false positives reporting).
	 */
	kmemleak_ignore(ptr);

	// Set timer to drain after KFREE_DRAIN_JIFFIES.
	if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING)
		__schedule_delayed_monitor_work(krcp);

unlock_return:
	krc_this_cpu_unlock(krcp, flags);

	/*
	 * Inline kvfree() after synchronize_rcu(). We can do
	 * it from might_sleep() context only, so the current
	 * CPU can pass the QS state.
	 */
	if (!success) {
		debug_rcu_head_unqueue((struct rcu_head *) ptr);
		synchronize_rcu();
		kvfree(ptr);
	}
}
EXPORT_SYMBOL_GPL(kvfree_call_rcu);
/**
 * kvfree_rcu_barrier - Wait until all in-flight kvfree_rcu() complete.
 *
 * Note that the single-argument form of kvfree_rcu() has a slow path that
 * invokes synchronize_rcu() and then frees the pointer, all before the
 * function returns.  Therefore, for any single-argument call that will
 * result in a kfree() to a cache that is to be destroyed during module
 * exit, it is the developer's responsibility to ensure that all such
 * calls have returned before the call to kmem_cache_destroy().
 */
void kvfree_rcu_barrier(void)
{
	struct kfree_rcu_cpu_work *krwp;
	struct kfree_rcu_cpu *krcp;
	bool queued;
	int i, cpu;

	/*
	 * Firstly we detach objects and queue them over an RCU-batch
	 * for all CPUs. Finally queued works are flushed for each CPU.
	 *
	 * Please note: if there are outstanding batches for a particular
	 * CPU, those have to be finished first, followed by queuing a new one.
	 */
	for_each_possible_cpu(cpu) {
		krcp = per_cpu_ptr(&krc, cpu);

		/*
		 * Check if this CPU has any objects which have been queued for a
		 * new GP completion. If not (meaning there is nothing to detach),
		 * we are done with it. If any batch is pending/running for this
		 * "krcp", the per-CPU flush_rcu_work() below waits for its
		 * completion (see the last step).
		 */
		if (!need_offload_krc(krcp))
			continue;

		while (1) {
			/*
			 * If we are not able to queue a new RCU work it means:
			 * - batches for this CPU are still in flight which should
			 *   be flushed first and then repeat;
			 * - no objects to detach, because of concurrency.
			 */
			queued = kvfree_rcu_queue_batch(krcp);

			/*
			 * Bail out, if there is no need to offload this "krcp"
			 * anymore. As noted earlier it can run concurrently.
			 */
			if (queued || !need_offload_krc(krcp))
				break;

			/* There are ongoing batches. */
			for (i = 0; i < KFREE_N_BATCHES; i++) {
				krwp = &(krcp->krw_arr[i]);
				flush_rcu_work(&krwp->rcu_work);
			}
		}
	}

	/*
	 * Now we guarantee that all objects are flushed.
	 */
	for_each_possible_cpu(cpu) {
		krcp = per_cpu_ptr(&krc, cpu);

		/*
		 * A monitor work can drain ready-to-reclaim objects
		 * directly. Wait for its completion if running or pending.
		 */
		cancel_delayed_work_sync(&krcp->monitor_work);

		for (i = 0; i < KFREE_N_BATCHES; i++) {
			krwp = &(krcp->krw_arr[i]);
			flush_rcu_work(&krwp->rcu_work);
		}
	}
}
EXPORT_SYMBOL_GPL(kvfree_rcu_barrier);
static unsigned long
kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
{
	int cpu;
	unsigned long count = 0;

	/* Snapshot count of all CPUs */
	for_each_possible_cpu(cpu) {
		struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);

		count += krc_count(krcp);
		count += READ_ONCE(krcp->nr_bkv_objs);
		atomic_set(&krcp->backoff_page_cache_fill, 1);
	}

	return count == 0 ? SHRINK_EMPTY : count;
}

static unsigned long
kfree_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
{
	int cpu;
	int freed = 0;

	for_each_possible_cpu(cpu) {
		int count;
		struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);

		count = krc_count(krcp);
		count += drain_page_cache(krcp);
		kfree_rcu_monitor(&krcp->monitor_work.work);

		sc->nr_to_scan -= count;
		freed += count;

		if (sc->nr_to_scan <= 0)
			break;
	}

	return freed == 0 ? SHRINK_STOP : freed;
}

void __init kfree_rcu_scheduler_running(void)
{
	int cpu;

	for_each_possible_cpu(cpu) {
		struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);

		if (need_offload_krc(krcp))
			schedule_delayed_monitor_work(krcp);
	}
}
/*
 * During early boot, any blocking grace-period wait automatically
 * implies a grace period.
 *
 * Later on, this could in theory be the case for kernels built with
 * CONFIG_SMP=y && CONFIG_PREEMPTION=y running on a single CPU, but this
 * is not a common case.  Furthermore, this optimization would cause
 * the rcu_gp_oldstate structure to expand by 50%, so this potential
 * grace-period optimization is ignored once the scheduler is running.
 */
static int rcu_blocking_is_gp(void)
{
	if (rcu_scheduler_active != RCU_SCHEDULER_INACTIVE) {
		might_sleep();
		return false;
	}
	return true;
}

/*
 * Helper function for the synchronize_rcu() API.
 */
static void synchronize_rcu_normal(void)
{
	struct rcu_synchronize rs;

	trace_rcu_sr_normal(rcu_state.name, &rs.head, TPS("request"));

	if (!READ_ONCE(rcu_normal_wake_from_gp)) {
		wait_rcu_gp(call_rcu_hurry);
		goto trace_complete_out;
	}

	init_rcu_head_on_stack(&rs.head);
	init_completion(&rs.completion);

	/*
	 * This code might be preempted, therefore take a GP
	 * snapshot before adding a request.
	 */
	if (IS_ENABLED(CONFIG_PROVE_RCU))
		rs.head.func = (void *) get_state_synchronize_rcu();

	rcu_sr_normal_add_req(&rs);

	/* Kick a GP and start waiting. */
	(void) start_poll_synchronize_rcu();

	/* Now we can wait. */
	wait_for_completion(&rs.completion);
	destroy_rcu_head_on_stack(&rs.head);

trace_complete_out:
	trace_rcu_sr_normal(rcu_state.name, &rs.head, TPS("complete"));
}
/**
 * synchronize_rcu - wait until a grace period has elapsed.
 *
 * Control will return to the caller some time after a full grace
 * period has elapsed, in other words after all currently executing RCU
 * read-side critical sections have completed.  Note, however, that
 * upon return from synchronize_rcu(), the caller might well be executing
 * concurrently with new RCU read-side critical sections that began while
 * synchronize_rcu() was waiting.
 *
 * RCU read-side critical sections are delimited by rcu_read_lock()
 * and rcu_read_unlock(), and may be nested.  In addition, but only in
 * v5.0 and later, regions of code across which interrupts, preemption,
 * or softirqs have been disabled also serve as RCU read-side critical
 * sections.  This includes hardware interrupt handlers, softirq handlers,
 * and NMI handlers.
 *
 * Note that this guarantee implies further memory-ordering guarantees.
 * On systems with more than one CPU, when synchronize_rcu() returns,
 * each CPU is guaranteed to have executed a full memory barrier since
 * the end of its last RCU read-side critical section whose beginning
 * preceded the call to synchronize_rcu().  In addition, each CPU having
 * an RCU read-side critical section that extends beyond the return from
 * synchronize_rcu() is guaranteed to have executed a full memory barrier
 * after the beginning of synchronize_rcu() and before the beginning of
 * that RCU read-side critical section.  Note that these guarantees include
 * CPUs that are offline, idle, or executing in user mode, as well as CPUs
 * that are executing in the kernel.
 *
 * Furthermore, if CPU A invoked synchronize_rcu(), which returned
 * to its caller on CPU B, then both CPU A and CPU B are guaranteed
 * to have executed a full memory barrier during the execution of
 * synchronize_rcu() -- even if CPU A and CPU B are the same CPU (but
 * again only if the system has more than one CPU).
 *
 * Implementation of these memory-ordering guarantees is described here:
 * Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst.
 */
void synchronize_rcu(void)
{
	unsigned long flags;
	struct rcu_node *rnp;

	RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) ||
			 lock_is_held(&rcu_lock_map) ||
			 lock_is_held(&rcu_sched_lock_map),
			 "Illegal synchronize_rcu() in RCU read-side critical section");
	if (!rcu_blocking_is_gp()) {
		if (rcu_gp_is_expedited())
			synchronize_rcu_expedited();
		else
			synchronize_rcu_normal();
		return;
	}

	// Context allows vacuous grace periods.
	// Note well that this code runs with !PREEMPT && !SMP.
	// In addition, all code that advances grace periods runs at
	// process level.  Therefore, this normal GP overlaps with other
	// normal GPs only by being fully nested within them, which allows
	// reuse of ->gp_seq_polled_snap.
	rcu_poll_gp_seq_start_unlocked(&rcu_state.gp_seq_polled_snap);
	rcu_poll_gp_seq_end_unlocked(&rcu_state.gp_seq_polled_snap);

	// Update the normal grace-period counters to record
	// this grace period, but only those used by the boot CPU.
	// The rcu_scheduler_starting() will take care of the rest of
	// these counters.
	local_irq_save(flags);
	WARN_ON_ONCE(num_online_cpus() > 1);
	rcu_state.gp_seq += (1 << RCU_SEQ_CTR_SHIFT);
	for (rnp = this_cpu_ptr(&rcu_data)->mynode; rnp; rnp = rnp->parent)
		rnp->gp_seq_needed = rnp->gp_seq = rcu_state.gp_seq;
	local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(synchronize_rcu);
/**
 * get_completed_synchronize_rcu_full - Return a full pre-completed polled state cookie
 * @rgosp: Place to put state cookie
 *
 * Stores into @rgosp a value that will always be treated by functions
 * like poll_state_synchronize_rcu_full() as a cookie whose grace period
 * has already completed.
 */
void get_completed_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp)
{
	rgosp->rgos_norm = RCU_GET_STATE_COMPLETED;
	rgosp->rgos_exp = RCU_GET_STATE_COMPLETED;
}
EXPORT_SYMBOL_GPL(get_completed_synchronize_rcu_full);

/**
 * get_state_synchronize_rcu - Snapshot current RCU state
 *
 * Returns a cookie that is used by a later call to cond_synchronize_rcu()
 * or poll_state_synchronize_rcu() to determine whether or not a full
 * grace period has elapsed in the meantime.
 */
unsigned long get_state_synchronize_rcu(void)
{
	/*
	 * Any prior manipulation of RCU-protected data must happen
	 * before the load from ->gp_seq.
	 */
	smp_mb();  /* ^^^ */
	return rcu_seq_snap(&rcu_state.gp_seq_polled);
}
EXPORT_SYMBOL_GPL(get_state_synchronize_rcu);

/**
 * get_state_synchronize_rcu_full - Snapshot RCU state, both normal and expedited
 * @rgosp: location to place combined normal/expedited grace-period state
 *
 * Places the normal and expedited grace-period states in @rgosp.  This
 * state value can be passed to a later call to cond_synchronize_rcu_full()
 * or poll_state_synchronize_rcu_full() to determine whether or not a
 * grace period (whether normal or expedited) has elapsed in the meantime.
 * The rcu_gp_oldstate structure takes up twice the memory of an unsigned
 * long, but is guaranteed to see all grace periods.  In contrast, the
 * combined state occupies less memory, but can sometimes fail to take
 * grace periods into account.
 *
 * This does not guarantee that the needed grace period will actually
 * start.
 */
void get_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp)
{
	struct rcu_node *rnp = rcu_get_root();

	/*
	 * Any prior manipulation of RCU-protected data must happen
	 * before the loads from ->gp_seq and ->expedited_sequence.
	 */
	smp_mb();  /* ^^^ */
	rgosp->rgos_norm = rcu_seq_snap(&rnp->gp_seq);
	rgosp->rgos_exp = rcu_seq_snap(&rcu_state.expedited_sequence);
}
EXPORT_SYMBOL_GPL(get_state_synchronize_rcu_full);
/*
 * Helper function for start_poll_synchronize_rcu() and
 * start_poll_synchronize_rcu_full().
 */
static void start_poll_synchronize_rcu_common(void)
{
	unsigned long flags;
	bool needwake;
	struct rcu_data *rdp;
	struct rcu_node *rnp;

	local_irq_save(flags);
	rdp = this_cpu_ptr(&rcu_data);
	rnp = rdp->mynode;
	raw_spin_lock_rcu_node(rnp); // irqs already disabled.
	// Note it is possible for a grace period to have elapsed between
	// the above call to get_state_synchronize_rcu() and the below call
	// to rcu_seq_snap.  This is OK, the worst that happens is that we
	// get a grace period that no one needed.  These accesses are ordered
	// by smp_mb(), and we are accessing them in the opposite order
	// from which they are updated at grace-period start, as required.
	needwake = rcu_start_this_gp(rnp, rdp, rcu_seq_snap(&rcu_state.gp_seq));
	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
	if (needwake)
		rcu_gp_kthread_wake();
}

/**
 * start_poll_synchronize_rcu - Snapshot and start RCU grace period
 *
 * Returns a cookie that is used by a later call to cond_synchronize_rcu()
 * or poll_state_synchronize_rcu() to determine whether or not a full
 * grace period has elapsed in the meantime.  If the needed grace period
 * is not already slated to start, notifies RCU core of the need for that
 * grace period.
 */
unsigned long start_poll_synchronize_rcu(void)
{
	unsigned long gp_seq = get_state_synchronize_rcu();

	start_poll_synchronize_rcu_common();
	return gp_seq;
}
EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu);
/**
 * start_poll_synchronize_rcu_full - Take a full snapshot and start RCU grace period
 * @rgosp: value from get_state_synchronize_rcu_full() or start_poll_synchronize_rcu_full()
 *
 * Places the normal and expedited grace-period states in *@rgosp.  This
 * state value can be passed to a later call to cond_synchronize_rcu_full()
 * or poll_state_synchronize_rcu_full() to determine whether or not a
 * grace period (whether normal or expedited) has elapsed in the meantime.
 * If the needed grace period is not already slated to start, notifies
 * RCU core of the need for that grace period.
 */
void start_poll_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp)
{
	get_state_synchronize_rcu_full(rgosp);

	start_poll_synchronize_rcu_common();
}
EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu_full);
/**
 * poll_state_synchronize_rcu - Has the specified RCU grace period completed?
 * @oldstate: value from get_state_synchronize_rcu() or start_poll_synchronize_rcu()
 *
 * If a full RCU grace period has elapsed since the earlier call from
 * which @oldstate was obtained, return @true, otherwise return @false.
 * If @false is returned, it is the caller's responsibility to invoke this
 * function later on until it does return @true.  Alternatively, the caller
 * can explicitly wait for a grace period, for example, by passing @oldstate
 * to either cond_synchronize_rcu() or cond_synchronize_rcu_expedited()
 * on the one hand or by directly invoking either synchronize_rcu() or
 * synchronize_rcu_expedited() on the other.
 *
 * Yes, this function does not take counter wrap into account.
 * But counter wrap is harmless.  If the counter wraps, we have waited for
 * more than a billion grace periods (and way more on a 64-bit system!).
 * Those needing to keep old state values for very long time periods
 * (many hours even on 32-bit systems) should check them occasionally and
 * either refresh them or set a flag indicating that the grace period has
 * completed.  Alternatively, they can use get_completed_synchronize_rcu()
 * to get a guaranteed-completed grace-period state.
 *
 * In addition, because oldstate compresses the grace-period state for
 * both normal and expedited grace periods into a single unsigned long,
 * it can miss a grace period when synchronize_rcu() runs concurrently
 * with synchronize_rcu_expedited().  If this is unacceptable, please
 * instead use the _full() variant of these polling APIs.
 *
 * This function provides the same memory-ordering guarantees that
 * would be provided by a synchronize_rcu() that was invoked at the call
 * to the function that provided @oldstate, and that returned at the end
 * of this function.
 */
bool poll_state_synchronize_rcu(unsigned long oldstate)
{
	if (oldstate == RCU_GET_STATE_COMPLETED ||
	    rcu_seq_done_exact(&rcu_state.gp_seq_polled, oldstate)) {
		smp_mb(); /* Ensure GP ends before subsequent accesses. */
		return true;
	}
	return false;
}
EXPORT_SYMBOL_GPL(poll_state_synchronize_rcu);
/**
 * poll_state_synchronize_rcu_full - Has the specified RCU grace period completed?
 * @rgosp: value from get_state_synchronize_rcu_full() or start_poll_synchronize_rcu_full()
 *
 * If a full RCU grace period has elapsed since the earlier call from
 * which *rgosp was obtained, return @true, otherwise return @false.
 * If @false is returned, it is the caller's responsibility to invoke this
 * function later on until it does return @true.  Alternatively, the caller
 * can explicitly wait for a grace period, for example, by passing @rgosp
 * to cond_synchronize_rcu() or by directly invoking synchronize_rcu().
 *
 * Yes, this function does not take counter wrap into account.
 * But counter wrap is harmless.  If the counter wraps, we have waited
 * for more than a billion grace periods (and way more on a 64-bit
 * system!).  Those needing to keep rcu_gp_oldstate values for very
 * long time periods (many hours even on 32-bit systems) should check
 * them occasionally and either refresh them or set a flag indicating
 * that the grace period has completed.  Alternatively, they can use
 * get_completed_synchronize_rcu_full() to get a guaranteed-completed
 * grace-period state.
 *
 * This function provides the same memory-ordering guarantees that would
 * be provided by a synchronize_rcu() that was invoked at the call to
 * the function that provided @rgosp, and that returned at the end of this
 * function.  And this guarantee requires that the root rcu_node structure's
 * ->gp_seq field be checked instead of that of the rcu_state structure.
 * The problem is that the just-ending grace-period's callbacks can be
 * invoked between the time that the root rcu_node structure's ->gp_seq
 * field is updated and the time that the rcu_state structure's ->gp_seq
 * field is updated.  Therefore, if a single synchronize_rcu() is to
 * cause a subsequent poll_state_synchronize_rcu_full() to return @true,
 * then the root rcu_node structure is the one that needs to be polled.
 */
bool poll_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp)
{
	struct rcu_node *rnp = rcu_get_root();

	smp_mb(); // Order against root rcu_node structure grace-period cleanup.
	if (rgosp->rgos_norm == RCU_GET_STATE_COMPLETED ||
	    rcu_seq_done_exact(&rnp->gp_seq, rgosp->rgos_norm) ||
	    rgosp->rgos_exp == RCU_GET_STATE_COMPLETED ||
	    rcu_seq_done_exact(&rcu_state.expedited_sequence, rgosp->rgos_exp)) {
		smp_mb(); /* Ensure GP ends before subsequent accesses. */
		return true;
	}
	return false;
}
EXPORT_SYMBOL_GPL(poll_state_synchronize_rcu_full);
/**
 * cond_synchronize_rcu - Conditionally wait for an RCU grace period
 * @oldstate: value from get_state_synchronize_rcu(), start_poll_synchronize_rcu(), or start_poll_synchronize_rcu_expedited()
 *
 * If a full RCU grace period has elapsed since the earlier call to
 * get_state_synchronize_rcu() or start_poll_synchronize_rcu(), just return.
 * Otherwise, invoke synchronize_rcu() to wait for a full grace period.
 *
 * Yes, this function does not take counter wrap into account.
 * But counter wrap is harmless.  If the counter wraps, we have waited for
 * more than 2 billion grace periods (and way more on a 64-bit system!),
 * so waiting for a couple of additional grace periods should be just fine.
 *
 * This function provides the same memory-ordering guarantees that
 * would be provided by a synchronize_rcu() that was invoked at the call
 * to the function that provided @oldstate and that returned at the end
 * of this function.
 */
void cond_synchronize_rcu(unsigned long oldstate)
{
	if (!poll_state_synchronize_rcu(oldstate))
		synchronize_rcu();
}
EXPORT_SYMBOL_GPL(cond_synchronize_rcu);
/**
 * cond_synchronize_rcu_full - Conditionally wait for an RCU grace period
 * @rgosp: value from get_state_synchronize_rcu_full(), start_poll_synchronize_rcu_full(), or start_poll_synchronize_rcu_expedited_full()
 *
 * If a full RCU grace period has elapsed since the call to
 * get_state_synchronize_rcu_full(), start_poll_synchronize_rcu_full(),
 * or start_poll_synchronize_rcu_expedited_full() from which @rgosp was
 * obtained, just return.  Otherwise, invoke synchronize_rcu() to wait
 * for a full grace period.
 *
 * Yes, this function does not take counter wrap into account.
 * But counter wrap is harmless.  If the counter wraps, we have waited for
 * more than 2 billion grace periods (and way more on a 64-bit system!),
 * so waiting for a couple of additional grace periods should be just fine.
 *
 * This function provides the same memory-ordering guarantees that
 * would be provided by a synchronize_rcu() that was invoked at the call
 * to the function that provided @rgosp and that returned at the end of
 * this function.
 */
void cond_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp)
{
	if (!poll_state_synchronize_rcu_full(rgosp))
		synchronize_rcu();
}
EXPORT_SYMBOL_GPL(cond_synchronize_rcu_full);

/*
 * Check to see if there is any immediate RCU-related work to be done by
 * the current CPU, returning 1 if so and zero otherwise.  The checks are
 * in order of increasing expense: checks that can be carried out against
 * CPU-local state are performed first.  However, we must check for CPU
 * stalls first, else we might not get a chance.
 */
static int rcu_pending(int user)
{
	bool gp_in_progress;
	struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
	struct rcu_node *rnp = rdp->mynode;

	lockdep_assert_irqs_disabled();

	/* Check for CPU stalls, if enabled. */
	check_cpu_stall(rdp);

	/* Does this CPU need a deferred NOCB wakeup? */
	if (rcu_nocb_need_deferred_wakeup(rdp, RCU_NOCB_WAKE))
		return 1;

	/* Is this a nohz_full CPU in userspace or idle?  (Ignore RCU if so.) */
	gp_in_progress = rcu_gp_in_progress();
	if ((user || rcu_is_cpu_rrupt_from_idle() ||
	     (gp_in_progress &&
	      time_before(jiffies, READ_ONCE(rcu_state.gp_start) +
			  nohz_full_patience_delay_jiffies))) &&
	    rcu_nohz_full_cpu())
		return 0;

	/* Is the RCU core waiting for a quiescent state from this CPU? */
	if (rdp->core_needs_qs && !rdp->cpu_no_qs.b.norm && gp_in_progress)
		return 1;

	/* Does this CPU have callbacks ready to invoke? */
	if (!rcu_rdp_is_offloaded(rdp) &&
	    rcu_segcblist_ready_cbs(&rdp->cblist))
		return 1;

	/* Has RCU gone idle with this CPU needing another grace period? */
	if (!gp_in_progress && rcu_segcblist_is_enabled(&rdp->cblist) &&
	    !rcu_rdp_is_offloaded(rdp) &&
	    !rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL))
		return 1;

	/* Have RCU grace period completed or started? */
	if (rcu_seq_current(&rnp->gp_seq) != rdp->gp_seq ||
	    unlikely(READ_ONCE(rdp->gpwrap))) /* outside lock */
		return 1;

	/* Nothing to do. */
	return 0;
}
/*
 * Helper function for rcu_barrier() tracing.  If tracing is disabled,
 * the compiler is expected to optimize this away.
 */
static void rcu_barrier_trace(const char *s, int cpu, unsigned long done)
{
	trace_rcu_barrier(rcu_state.name, s, cpu,
			  atomic_read(&rcu_state.barrier_cpu_count), done);
}

/*
 * RCU callback function for rcu_barrier().  If we are last, wake
 * up the task executing rcu_barrier().
 *
 * Note that the value of rcu_state.barrier_sequence must be captured
 * before the atomic_dec_and_test().  Otherwise, if this CPU is not last,
 * other CPUs might count the value down to zero before this CPU gets
 * around to invoking rcu_barrier_trace(), which might result in bogus
 * data from the next instance of rcu_barrier().
 */
static void rcu_barrier_callback(struct rcu_head *rhp)
{
	unsigned long __maybe_unused s = rcu_state.barrier_sequence;

	rhp->next = rhp;	// Mark the callback as having been invoked.
	if (atomic_dec_and_test(&rcu_state.barrier_cpu_count)) {
		rcu_barrier_trace(TPS("LastCB"), -1, s);
		complete(&rcu_state.barrier_completion);
	} else {
		rcu_barrier_trace(TPS("CB"), -1, s);
	}
}
/*
 * If needed, entrain an rcu_barrier() callback on rdp->cblist.
 */
static void rcu_barrier_entrain(struct rcu_data *rdp)
{
	unsigned long gseq = READ_ONCE(rcu_state.barrier_sequence);
	unsigned long lseq = READ_ONCE(rdp->barrier_seq_snap);
	bool wake_nocb = false;
	bool was_alldone = false;

	lockdep_assert_held(&rcu_state.barrier_lock);
	if (rcu_seq_state(lseq) || !rcu_seq_state(gseq) || rcu_seq_ctr(lseq) != rcu_seq_ctr(gseq))
		return;
	rcu_barrier_trace(TPS("IRQ"), -1, rcu_state.barrier_sequence);
	rdp->barrier_head.func = rcu_barrier_callback;
	debug_rcu_head_queue(&rdp->barrier_head);
	rcu_nocb_lock(rdp);
	/*
	 * Flush bypass and wakeup rcuog if we add callbacks to an empty regular
	 * queue. This way we don't wait for the bypass timer, which can take
	 * seconds if it's fully lazy.
	 */
	was_alldone = rcu_rdp_is_offloaded(rdp) && !rcu_segcblist_pend_cbs(&rdp->cblist);
	WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies, false));
	wake_nocb = was_alldone && rcu_segcblist_pend_cbs(&rdp->cblist);
	if (rcu_segcblist_entrain(&rdp->cblist, &rdp->barrier_head)) {
		atomic_inc(&rcu_state.barrier_cpu_count);
	} else {
		debug_rcu_head_unqueue(&rdp->barrier_head);
		rcu_barrier_trace(TPS("IRQNQ"), -1, rcu_state.barrier_sequence);
	}
	rcu_nocb_unlock(rdp);
	if (wake_nocb)
		wake_nocb_gp(rdp, false);
	smp_store_release(&rdp->barrier_seq_snap, gseq);
}
/*
 * Called with preemption disabled, and from cross-cpu IRQ context.
 */
static void rcu_barrier_handler(void *cpu_in)
{
	uintptr_t cpu = (uintptr_t)cpu_in;
	struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);

	lockdep_assert_irqs_disabled();
	WARN_ON_ONCE(cpu != rdp->cpu);
	WARN_ON_ONCE(cpu != smp_processor_id());
	raw_spin_lock(&rcu_state.barrier_lock);
	rcu_barrier_entrain(rdp);
	raw_spin_unlock(&rcu_state.barrier_lock);
}
/**
 * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete.
 *
 * Note that this primitive does not necessarily wait for an RCU grace period
 * to complete.  For example, if there are no RCU callbacks queued anywhere
 * in the system, then rcu_barrier() is within its rights to return
 * immediately, without waiting for anything, much less an RCU grace period.
 */
void rcu_barrier(void)
{
	uintptr_t cpu;
	unsigned long flags;
	unsigned long gseq;
	struct rcu_data *rdp;
	unsigned long s = rcu_seq_snap(&rcu_state.barrier_sequence);

	rcu_barrier_trace(TPS("Begin"), -1, s);

	/* Take mutex to serialize concurrent rcu_barrier() requests. */
	mutex_lock(&rcu_state.barrier_mutex);

	/* Did someone else do our work for us? */
	if (rcu_seq_done(&rcu_state.barrier_sequence, s)) {
		rcu_barrier_trace(TPS("EarlyExit"), -1, rcu_state.barrier_sequence);
		smp_mb(); /* caller's subsequent code after above check. */
		mutex_unlock(&rcu_state.barrier_mutex);
		return;
	}

	/* Mark the start of the barrier operation. */
	raw_spin_lock_irqsave(&rcu_state.barrier_lock, flags);
	rcu_seq_start(&rcu_state.barrier_sequence);
	gseq = rcu_state.barrier_sequence;
	rcu_barrier_trace(TPS("Inc1"), -1, rcu_state.barrier_sequence);

	/*
	 * Initialize the count to two rather than to zero in order
	 * to avoid a too-soon return to zero in case of an immediate
	 * invocation of the just-enqueued callback (or preemption of
	 * this task).  Exclude CPU-hotplug operations to ensure that no
	 * offline non-offloaded CPU has callbacks queued.
	 */
	init_completion(&rcu_state.barrier_completion);
	atomic_set(&rcu_state.barrier_cpu_count, 2);
	raw_spin_unlock_irqrestore(&rcu_state.barrier_lock, flags);

	/*
	 * Force each CPU with callbacks to register a new callback.
	 * When that callback is invoked, we will know that all of the
	 * corresponding CPU's preceding callbacks have been invoked.
	 */
	for_each_possible_cpu(cpu) {
		rdp = per_cpu_ptr(&rcu_data, cpu);
retry:
		if (smp_load_acquire(&rdp->barrier_seq_snap) == gseq)
			continue;
		raw_spin_lock_irqsave(&rcu_state.barrier_lock, flags);
		if (!rcu_segcblist_n_cbs(&rdp->cblist)) {
			WRITE_ONCE(rdp->barrier_seq_snap, gseq);
			raw_spin_unlock_irqrestore(&rcu_state.barrier_lock, flags);
			rcu_barrier_trace(TPS("NQ"), cpu, rcu_state.barrier_sequence);
			continue;
		}
		if (!rcu_rdp_cpu_online(rdp)) {
			rcu_barrier_entrain(rdp);
			WARN_ON_ONCE(READ_ONCE(rdp->barrier_seq_snap) != gseq);
			raw_spin_unlock_irqrestore(&rcu_state.barrier_lock, flags);
			rcu_barrier_trace(TPS("OfflineNoCBQ"), cpu, rcu_state.barrier_sequence);
			continue;
		}
		raw_spin_unlock_irqrestore(&rcu_state.barrier_lock, flags);
		if (smp_call_function_single(cpu, rcu_barrier_handler, (void *)cpu, 1)) {
			schedule_timeout_uninterruptible(1);
			goto retry;
		}
		WARN_ON_ONCE(READ_ONCE(rdp->barrier_seq_snap) != gseq);
		rcu_barrier_trace(TPS("OnlineQ"), cpu, rcu_state.barrier_sequence);
	}

	/*
	 * Now that we have an rcu_barrier_callback() callback on each
	 * CPU, and thus each counted, remove the initial count.
	 */
	if (atomic_sub_and_test(2, &rcu_state.barrier_cpu_count))
		complete(&rcu_state.barrier_completion);

	/* Wait for all rcu_barrier_callback() callbacks to be invoked. */
	wait_for_completion(&rcu_state.barrier_completion);

	/* Mark the end of the barrier operation. */
	rcu_barrier_trace(TPS("Inc2"), -1, rcu_state.barrier_sequence);
	rcu_seq_end(&rcu_state.barrier_sequence);
	gseq = rcu_state.barrier_sequence;
	for_each_possible_cpu(cpu) {
		rdp = per_cpu_ptr(&rcu_data, cpu);

		WRITE_ONCE(rdp->barrier_seq_snap, gseq);
	}

	/* Other rcu_barrier() invocations can now safely proceed. */
	mutex_unlock(&rcu_state.barrier_mutex);
}
EXPORT_SYMBOL_GPL(rcu_barrier);
static unsigned long rcu_barrier_last_throttle;

/**
 * rcu_barrier_throttled - Do rcu_barrier(), but limit to one per second
 *
 * This can be thought of as guard rails around rcu_barrier() that
 * permit unrestricted userspace use, at least assuming the hardware's
 * try_cmpxchg() is robust.  There will be at most one call per second to
 * rcu_barrier() system-wide from use of this function, which means that
 * callers might needlessly wait a second or three.
 *
 * This is intended for use by test suites to avoid OOM by flushing RCU
 * callbacks from the previous test before starting the next.  See the
 * rcutree.do_rcu_barrier module parameter for more information.
 *
 * Why not simply make rcu_barrier() more scalable?  That might be
 * the eventual endpoint, but let's keep it simple for the time being.
 * Note that the module parameter infrastructure serializes calls to a
 * given .set() function, but should concurrent .set() invocation ever be
 * possible, we are ready!
 */
static void rcu_barrier_throttled(void)
{
	unsigned long j = jiffies;
	unsigned long old = READ_ONCE(rcu_barrier_last_throttle);
	unsigned long s = rcu_seq_snap(&rcu_state.barrier_sequence);

	while (time_in_range(j, old, old + HZ / 16) ||
	       !try_cmpxchg(&rcu_barrier_last_throttle, &old, j)) {
		schedule_timeout_idle(HZ / 16);
		if (rcu_seq_done(&rcu_state.barrier_sequence, s)) {
			smp_mb(); /* caller's subsequent code after above check. */
			return;
		}
		j = jiffies;
		old = READ_ONCE(rcu_barrier_last_throttle);
	}
	rcu_barrier();
}
/*
 * Invoke rcu_barrier_throttled() when a rcutree.do_rcu_barrier
 * request arrives.  We insist on a true value to allow for possible
 * future expansion.
 */
static int param_set_do_rcu_barrier(const char *val, const struct kernel_param *kp)
{
	bool b;
	int ret;

	if (rcu_scheduler_active != RCU_SCHEDULER_RUNNING)
		return -EAGAIN;
	ret = kstrtobool(val, &b);
	if (!ret && b) {
		atomic_inc((atomic_t *)kp->arg);
		rcu_barrier_throttled();
		atomic_dec((atomic_t *)kp->arg);
	}
	return ret;
}

/*
 * Output the number of outstanding rcutree.do_rcu_barrier requests.
 */
static int param_get_do_rcu_barrier(char *buffer, const struct kernel_param *kp)
{
	return sprintf(buffer, "%d\n", atomic_read((atomic_t *)kp->arg));
}

static const struct kernel_param_ops do_rcu_barrier_ops = {
	.set = param_set_do_rcu_barrier,
	.get = param_get_do_rcu_barrier,
};
static atomic_t do_rcu_barrier;
module_param_cb(do_rcu_barrier, &do_rcu_barrier_ops, &do_rcu_barrier, 0644);
/*
 * Compute the mask of online CPUs for the specified rcu_node structure.
 * This will not be stable unless the rcu_node structure's ->lock is
 * held, but the bit corresponding to the current CPU will be stable
 * in most contexts.
 */
static unsigned long rcu_rnp_online_cpus(struct rcu_node *rnp)
{
	return READ_ONCE(rnp->qsmaskinitnext);
}

/*
 * Is the CPU corresponding to the specified rcu_data structure online
 * from RCU's perspective?  This perspective is given by that structure's
 * ->qsmaskinitnext field rather than by the global cpu_online_mask.
 */
static bool rcu_rdp_cpu_online(struct rcu_data *rdp)
{
	return !!(rdp->grpmask & rcu_rnp_online_cpus(rdp->mynode));
}

bool rcu_cpu_online(int cpu)
{
	struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);

	return rcu_rdp_cpu_online(rdp);
}
#if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU)

/*
 * Is the current CPU online as far as RCU is concerned?
 *
 * Disable preemption to avoid false positives that could otherwise
 * happen due to the current CPU number being sampled, this task being
 * preempted, its old CPU being taken offline, resuming on some other CPU,
 * then determining that its old CPU is now offline.
 *
 * Disable checking if in an NMI handler because we cannot safely
 * report errors from NMI handlers anyway.  In addition, it is OK to use
 * RCU on an offline processor during initial boot, hence the check for
 * rcu_scheduler_fully_active.
 */
bool rcu_lockdep_current_cpu_online(void)
{
	struct rcu_data *rdp;
	bool ret = false;

	if (in_nmi() || !rcu_scheduler_fully_active)
		return true;
	preempt_disable_notrace();
	rdp = this_cpu_ptr(&rcu_data);
	/*
	 * Strictly, we care here about the case where the current CPU is
	 * in rcutree_report_cpu_starting() and thus has an excuse for rdp->grpmask
	 * not being up to date. So arch_spin_is_locked() might have a
	 * false positive if it's held by some *other* CPU, but that's
	 * OK because that just means a false *negative* on the warning.
	 */
	if (rcu_rdp_cpu_online(rdp) || arch_spin_is_locked(&rcu_state.ofl_lock))
		ret = true;
	preempt_enable_notrace();
	return ret;
}
EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online);

#endif /* #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) */

// Has rcu_init() been invoked?  This is used (for example) to determine
// whether spinlocks may be acquired safely.
static bool rcu_init_invoked(void)
{
	return !!READ_ONCE(rcu_state.n_online_cpus);
}
/*
 * All CPUs for the specified rcu_node structure have gone offline,
 * and all tasks that were preempted within an RCU read-side critical
 * section while running on one of those CPUs have since exited their RCU
 * read-side critical section.  Some other CPU is reporting this fact with
 * the specified rcu_node structure's ->lock held and interrupts disabled.
 * This function therefore goes up the tree of rcu_node structures,
 * clearing the corresponding bits in the ->qsmaskinit fields.  Note that
 * the leaf rcu_node structure's ->qsmaskinit field has already been
 * updated.
 *
 * This function does check that the specified rcu_node structure has
 * all CPUs offline and no blocked tasks, so it is OK to invoke it
 * prematurely.  That said, invoking it after the fact will cost you
 * a needless lock acquisition.  So once it has done its work, don't
 * invoke it again.
 */
static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf)
{
	long mask;
	struct rcu_node *rnp = rnp_leaf;

	raw_lockdep_assert_held_rcu_node(rnp_leaf);
	if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) ||
	    WARN_ON_ONCE(rnp_leaf->qsmaskinit) ||
	    WARN_ON_ONCE(rcu_preempt_has_tasks(rnp_leaf)))
		return;
	for (;;) {
		mask = rnp->grpmask;
		rnp = rnp->parent;
		if (!rnp)
			break;
		raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
		rnp->qsmaskinit &= ~mask;
		/* Between grace periods, so better already be zero! */
		WARN_ON_ONCE(rnp->qsmask);
		if (rnp->qsmaskinit) {
			raw_spin_unlock_rcu_node(rnp);
			/* irqs remain disabled. */
			return;
		}
		raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */
	}
}

/*
 * Propagate ->qsmaskinit bits up the rcu_node tree to account for the
 * first CPU in a given leaf rcu_node structure coming online.  The caller
 * must hold the corresponding leaf rcu_node ->lock with interrupts
 * disabled.
 */
static void rcu_init_new_rnp(struct rcu_node *rnp_leaf)
{
	long mask;
	long oldmask;
	struct rcu_node *rnp = rnp_leaf;

	raw_lockdep_assert_held_rcu_node(rnp_leaf);
	WARN_ON_ONCE(rnp->wait_blkd_tasks);
	for (;;) {
		mask = rnp->grpmask;
		rnp = rnp->parent;
		if (!rnp)
			return;
		raw_spin_lock_rcu_node(rnp); /* Interrupts already disabled. */
		oldmask = rnp->qsmaskinit;
		rnp->qsmaskinit |= mask;
		raw_spin_unlock_rcu_node(rnp); /* Interrupts remain disabled. */
		if (oldmask)
			return;
	}
}

/*
 * Do boot-time initialization of a CPU's per-CPU RCU data.
 */
static void __init
rcu_boot_init_percpu_data(int cpu)
{
	struct context_tracking *ct = this_cpu_ptr(&context_tracking);
	struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);

	/* Set up local state, ensuring consistent view of global state. */
	rdp->grpmask = leaf_node_cpu_bit(rdp->mynode, cpu);
	INIT_WORK(&rdp->strict_work, strict_work_handler);
	WARN_ON_ONCE(ct->nesting != 1);
	WARN_ON_ONCE(rcu_watching_snap_in_eqs(ct_rcu_watching_cpu(cpu)));
	rdp->barrier_seq_snap = rcu_state.barrier_sequence;
	rdp->rcu_ofl_gp_seq = rcu_state.gp_seq;
	rdp->rcu_ofl_gp_state = RCU_GP_CLEANED;
	rdp->rcu_onl_gp_seq = rcu_state.gp_seq;
	rdp->rcu_onl_gp_state = RCU_GP_CLEANED;
	rdp->last_sched_clock = jiffies;
	rdp->cpu = cpu;
	rcu_boot_init_nocb_percpu_data(rdp);
}
struct kthread_worker *rcu_exp_gp_kworker;

static void rcu_spawn_exp_par_gp_kworker(struct rcu_node *rnp)
{
	struct kthread_worker *kworker;
	const char *name = "rcu_exp_par_gp_kthread_worker/%d";
	struct sched_param param = { .sched_priority = kthread_prio };
	int rnp_index = rnp - rcu_get_root();

	if (rnp->exp_kworker)
		return;

	kworker = kthread_create_worker(0, name, rnp_index);
	if (IS_ERR_OR_NULL(kworker)) {
		pr_err("Failed to create par gp kworker on %d/%d\n",
		       rnp->grplo, rnp->grphi);
		return;
	}
	WRITE_ONCE(rnp->exp_kworker, kworker);

	if (IS_ENABLED(CONFIG_RCU_EXP_KTHREAD))
		sched_setscheduler_nocheck(kworker->task, SCHED_FIFO, &param);
}

static struct task_struct *rcu_exp_par_gp_task(struct rcu_node *rnp)
{
	struct kthread_worker *kworker = READ_ONCE(rnp->exp_kworker);

	if (!kworker)
		return NULL;

	return kworker->task;
}
static void __init rcu_start_exp_gp_kworker(void)
{
	const char *name = "rcu_exp_gp_kthread_worker";
	struct sched_param param = { .sched_priority = kthread_prio };

	rcu_exp_gp_kworker = kthread_create_worker(0, name);
	if (IS_ERR_OR_NULL(rcu_exp_gp_kworker)) {
		pr_err("Failed to create %s!\n", name);
		rcu_exp_gp_kworker = NULL;
		return;
	}

	if (IS_ENABLED(CONFIG_RCU_EXP_KTHREAD))
		sched_setscheduler_nocheck(rcu_exp_gp_kworker->task, SCHED_FIFO, &param);
}

static void rcu_spawn_rnp_kthreads(struct rcu_node *rnp)
{
	if (rcu_scheduler_fully_active) {
		mutex_lock(&rnp->kthread_mutex);
		rcu_spawn_one_boost_kthread(rnp);
		rcu_spawn_exp_par_gp_kworker(rnp);
		mutex_unlock(&rnp->kthread_mutex);
	}
}
/*
 * Invoked early in the CPU-online process, when pretty much all services
 * are available.  The incoming CPU is not present.
 *
 * Initializes a CPU's per-CPU RCU data.  Note that only one online or
 * offline event can be happening at a given time.  Note also that we can
 * accept some slop in the rsp->gp_seq access due to the fact that this
 * CPU cannot possibly have any non-offloaded RCU callbacks in flight yet.
 * And any offloaded callbacks are being numbered elsewhere.
 */
int rcutree_prepare_cpu(unsigned int cpu)
{
	unsigned long flags;
	struct context_tracking *ct = per_cpu_ptr(&context_tracking, cpu);
	struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
	struct rcu_node *rnp = rcu_get_root();

	/* Set up local state, ensuring consistent view of global state. */
	raw_spin_lock_irqsave_rcu_node(rnp, flags);
	rdp->qlen_last_fqs_check = 0;
	rdp->n_force_qs_snap = READ_ONCE(rcu_state.n_force_qs);
	rdp->blimit = blimit;
	ct->nesting = 1;	/* CPU not up, no tearing. */
	raw_spin_unlock_rcu_node(rnp);		/* irqs remain disabled. */

	/*
	 * Only non-NOCB CPUs that didn't have early-boot callbacks need to be
	 * (re-)initialized.
	 */
	if (!rcu_segcblist_is_enabled(&rdp->cblist))
		rcu_segcblist_init(&rdp->cblist);	/* Re-enable callbacks. */

	/*
	 * Add CPU to leaf rcu_node pending-online bitmask.  Any needed
	 * propagation up the rcu_node tree will happen at the beginning
	 * of the next grace period.
	 */
	rnp = rdp->mynode;
	raw_spin_lock_rcu_node(rnp);		/* irqs already disabled. */
	rdp->gp_seq = READ_ONCE(rnp->gp_seq);
	rdp->gp_seq_needed = rdp->gp_seq;
	rdp->cpu_no_qs.b.norm = true;
	rdp->core_needs_qs = false;
	rdp->rcu_iw_pending = false;
	rdp->rcu_iw = IRQ_WORK_INIT_HARD(rcu_iw_handler);
	rdp->rcu_iw_gp_seq = rdp->gp_seq - 1;
	trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("cpuonl"));
	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
	rcu_spawn_rnp_kthreads(rnp);
	rcu_spawn_cpu_nocb_kthread(cpu);
	ASSERT_EXCLUSIVE_WRITER(rcu_state.n_online_cpus);
	WRITE_ONCE(rcu_state.n_online_cpus, rcu_state.n_online_cpus + 1);

	return 0;
}
/*
 * Update kthreads affinity during CPU-hotplug changes.
 *
 * Set the per-rcu_node kthread's affinity to cover all CPUs that are
 * served by the rcu_node in question.  The CPU hotplug lock is still
 * held, so the value of rnp->qsmaskinit will be stable.
 *
 * We don't include outgoingcpu in the affinity set, use -1 if there is
 * no outgoing CPU.  If there are no CPUs left in the affinity set,
 * this function allows the kthread to execute on any CPU.
 *
 * Any future concurrent calls are serialized via ->kthread_mutex.
 */
static void rcutree_affinity_setting(unsigned int cpu, int outgoingcpu)
{
	cpumask_var_t cm;
	unsigned long mask;
	struct rcu_data *rdp;
	struct rcu_node *rnp;
	struct task_struct *task_boost, *task_exp;

	rdp = per_cpu_ptr(&rcu_data, cpu);
	rnp = rdp->mynode;

	task_boost = rcu_boost_task(rnp);
	task_exp = rcu_exp_par_gp_task(rnp);

	/*
	 * If CPU is the boot one, those tasks are created later from early
	 * initcall since kthreadd must be created first.
	 */
	if (!task_boost && !task_exp)
		return;

	if (!zalloc_cpumask_var(&cm, GFP_KERNEL))
		return;

	mutex_lock(&rnp->kthread_mutex);
	mask = rcu_rnp_online_cpus(rnp);
	for_each_leaf_node_possible_cpu(rnp, cpu)
		if ((mask & leaf_node_cpu_bit(rnp, cpu)) &&
		    cpu != outgoingcpu)
			cpumask_set_cpu(cpu, cm);
	cpumask_and(cm, cm, housekeeping_cpumask(HK_TYPE_RCU));
	if (cpumask_empty(cm)) {
		cpumask_copy(cm, housekeeping_cpumask(HK_TYPE_RCU));
		if (outgoingcpu >= 0)
			cpumask_clear_cpu(outgoingcpu, cm);
	}

	if (task_exp)
		set_cpus_allowed_ptr(task_exp, cm);

	if (task_boost)
		set_cpus_allowed_ptr(task_boost, cm);

	mutex_unlock(&rnp->kthread_mutex);

	free_cpumask_var(cm);
}
/*
 * Has the specified (known valid) CPU ever been fully online?
 */
bool rcu_cpu_beenfullyonline(int cpu)
{
	struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);

	return smp_load_acquire(&rdp->beenonline);
}

/*
 * Near the end of the CPU-online process.  Pretty much all services
 * enabled, and the CPU is now very much alive.
 */
int rcutree_online_cpu(unsigned int cpu)
{
	unsigned long flags;
	struct rcu_data *rdp;
	struct rcu_node *rnp;

	rdp = per_cpu_ptr(&rcu_data, cpu);
	rnp = rdp->mynode;
	raw_spin_lock_irqsave_rcu_node(rnp, flags);
	rnp->ffmask |= rdp->grpmask;
	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
	if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE)
		return 0; /* Too early in boot for scheduler work. */
	sync_sched_exp_online_cleanup(cpu);
	rcutree_affinity_setting(cpu, -1);

	// Stop-machine done, so allow nohz_full to disable tick.
	tick_dep_clear(TICK_DEP_BIT_RCU);
	return 0;
}
/*
 * Mark the specified CPU as being online so that subsequent grace periods
 * (both expedited and normal) will wait on it.  Note that this means that
 * incoming CPUs are not allowed to use RCU read-side critical sections
 * until this function is called.  Failing to observe this restriction
 * will result in lockdep splats.
 *
 * Note that this function is special in that it is invoked directly
 * from the incoming CPU rather than from the cpuhp_step mechanism.
 * This is because this function must be invoked at a precise location.
 * This incoming CPU must not have enabled interrupts yet.
 *
 * This mirrors the effects of rcutree_report_cpu_dead().
 */
void rcutree_report_cpu_starting(unsigned int cpu)
{
	unsigned long mask;
	struct rcu_data *rdp;
	struct rcu_node *rnp;
	bool newcpu;

	lockdep_assert_irqs_disabled();
	rdp = per_cpu_ptr(&rcu_data, cpu);
	if (rdp->cpu_started)
		return;
	rdp->cpu_started = true;

	rnp = rdp->mynode;
	mask = rdp->grpmask;
	arch_spin_lock(&rcu_state.ofl_lock);
	rcu_watching_online();
	raw_spin_lock(&rcu_state.barrier_lock);
	raw_spin_lock_rcu_node(rnp);
	WRITE_ONCE(rnp->qsmaskinitnext, rnp->qsmaskinitnext | mask);
	raw_spin_unlock(&rcu_state.barrier_lock);
	newcpu = !(rnp->expmaskinitnext & mask);
	rnp->expmaskinitnext |= mask;
	/* Allow lockless access for expedited grace periods. */
	smp_store_release(&rcu_state.ncpus, rcu_state.ncpus + newcpu); /* ^^^ */
	ASSERT_EXCLUSIVE_WRITER(rcu_state.ncpus);
	rcu_gpnum_ovf(rnp, rdp); /* Offline-induced counter wrap? */
	rdp->rcu_onl_gp_seq = READ_ONCE(rcu_state.gp_seq);
	rdp->rcu_onl_gp_state = READ_ONCE(rcu_state.gp_state);

	/* An incoming CPU should never be blocking a grace period. */
	if (WARN_ON_ONCE(rnp->qsmask & mask)) { /* RCU waiting on incoming CPU? */
		/* rcu_report_qs_rnp() *really* wants some flags to restore */
		unsigned long flags;

		local_irq_save(flags);
		rcu_disable_urgency_upon_qs(rdp);
		/* Report QS -after- changing ->qsmaskinitnext! */
		rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags);
	} else {
		raw_spin_unlock_rcu_node(rnp);
	}
	arch_spin_unlock(&rcu_state.ofl_lock);
	smp_store_release(&rdp->beenonline, true);
	smp_mb(); /* Ensure RCU read-side usage follows above initialization. */
}
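/*
 * Ordering sketch only (hypothetical function name): architecture
 * bring-up code is expected to invoke rcutree_report_cpu_starting()
 * with interrupts still disabled and before the incoming CPU enters
 * any RCU read-side critical section.
 */
#if 0
static void example_secondary_start(unsigned int cpu)
{
	/* ... low-level bring-up, interrupts still disabled ... */
	rcutree_report_cpu_starting(cpu);	/* Must precede any rcu_read_lock(). */
	rcu_read_lock();			/* Now permitted. */
	/* ... */
	rcu_read_unlock();
}
#endif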
/*
 * The outgoing CPU has no further need of RCU, so remove it from
 * the rcu_node tree's ->qsmaskinitnext bit masks.
 *
 * Note that this function is special in that it is invoked directly
 * from the outgoing CPU rather than from the cpuhp_step mechanism.
 * This is because this function must be invoked at a precise location.
 *
 * This mirrors the effect of rcutree_report_cpu_starting().
 */
void rcutree_report_cpu_dead(void)
{
	unsigned long flags;
	unsigned long mask;
	struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
	struct rcu_node *rnp = rdp->mynode;  /* Outgoing CPU's rdp & rnp. */

	/*
	 * IRQs must be disabled from now on and until the CPU dies, or an
	 * interrupt may introduce a new read-side critical section while the
	 * CPU is actually off the QS masks.
	 */
	lockdep_assert_irqs_disabled();
	// Do any dangling deferred wakeups.
	do_nocb_deferred_wakeup(rdp);

	rcu_preempt_deferred_qs(current);

	/* Remove outgoing CPU from mask in the leaf rcu_node structure. */
	mask = rdp->grpmask;
	arch_spin_lock(&rcu_state.ofl_lock);
	raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */
	rdp->rcu_ofl_gp_seq = READ_ONCE(rcu_state.gp_seq);
	rdp->rcu_ofl_gp_state = READ_ONCE(rcu_state.gp_state);
	if (rnp->qsmask & mask) { /* RCU waiting on outgoing CPU? */
		/* Report quiescent state -before- changing ->qsmaskinitnext! */
		rcu_disable_urgency_upon_qs(rdp);
		rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags);
		raw_spin_lock_irqsave_rcu_node(rnp, flags);
	}
	WRITE_ONCE(rnp->qsmaskinitnext, rnp->qsmaskinitnext & ~mask);
	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
	arch_spin_unlock(&rcu_state.ofl_lock);
	rdp->cpu_started = false;
}
#ifdef CONFIG_HOTPLUG_CPU
/*
 * The outgoing CPU has just passed through the dying-idle state, and we
 * are being invoked from the CPU that was IPIed to continue the offline
 * operation.  Migrate the outgoing CPU's callbacks to the current CPU.
 */
void rcutree_migrate_callbacks(int cpu)
{
	unsigned long flags;
	struct rcu_data *my_rdp;
	struct rcu_node *my_rnp;
	struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
	bool needwake;

	if (rcu_rdp_is_offloaded(rdp))
		return;

	raw_spin_lock_irqsave(&rcu_state.barrier_lock, flags);
	if (rcu_segcblist_empty(&rdp->cblist)) {
		raw_spin_unlock_irqrestore(&rcu_state.barrier_lock, flags);
		return;  /* No callbacks to migrate. */
	}

	WARN_ON_ONCE(rcu_rdp_cpu_online(rdp));
	rcu_barrier_entrain(rdp);
	my_rdp = this_cpu_ptr(&rcu_data);
	my_rnp = my_rdp->mynode;
	rcu_nocb_lock(my_rdp); /* irqs already disabled. */
	WARN_ON_ONCE(!rcu_nocb_flush_bypass(my_rdp, NULL, jiffies, false));
	raw_spin_lock_rcu_node(my_rnp); /* irqs already disabled. */
	/* Leverage recent GPs and set GP for new callbacks. */
	needwake = rcu_advance_cbs(my_rnp, rdp) ||
		   rcu_advance_cbs(my_rnp, my_rdp);
	rcu_segcblist_merge(&my_rdp->cblist, &rdp->cblist);
	raw_spin_unlock(&rcu_state.barrier_lock); /* irqs remain disabled. */
	needwake = needwake || rcu_advance_cbs(my_rnp, my_rdp);
	rcu_segcblist_disable(&rdp->cblist);
	WARN_ON_ONCE(rcu_segcblist_empty(&my_rdp->cblist) != !rcu_segcblist_n_cbs(&my_rdp->cblist));
	check_cb_ovld_locked(my_rdp, my_rnp);
	if (rcu_rdp_is_offloaded(my_rdp)) {
		raw_spin_unlock_rcu_node(my_rnp); /* irqs remain disabled. */
		__call_rcu_nocb_wake(my_rdp, true, flags);
	} else {
		rcu_nocb_unlock(my_rdp); /* irqs remain disabled. */
		raw_spin_unlock_rcu_node(my_rnp); /* irqs remain disabled. */
	}
	local_irq_restore(flags);
	if (needwake)
		rcu_gp_kthread_wake();
	lockdep_assert_irqs_enabled();
	WARN_ONCE(rcu_segcblist_n_cbs(&rdp->cblist) != 0 ||
		  !rcu_segcblist_empty(&rdp->cblist),
		  "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, 1stCB=%p\n",
		  cpu, rcu_segcblist_n_cbs(&rdp->cblist),
		  rcu_segcblist_first_cb(&rdp->cblist));
}
/*
 * The CPU has been completely removed, and some other CPU is reporting
 * this fact from process context.  Do the remainder of the cleanup.
 * There can only be one CPU hotplug operation at a time, so no need for
 * explicit locking.
 */
int rcutree_dead_cpu(unsigned int cpu)
{
	ASSERT_EXCLUSIVE_WRITER(rcu_state.n_online_cpus);
	WRITE_ONCE(rcu_state.n_online_cpus, rcu_state.n_online_cpus - 1);
	// Stop-machine done, so allow nohz_full to disable tick.
	tick_dep_clear(TICK_DEP_BIT_RCU);
	return 0;
}
/*
 * Near the end of the offline process.  Trace the fact that this CPU
 * is going offline.
 */
int rcutree_dying_cpu(unsigned int cpu)
{
	bool blkd;
	struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
	struct rcu_node *rnp = rdp->mynode;

	blkd = !!(READ_ONCE(rnp->qsmask) & rdp->grpmask);
	trace_rcu_grace_period(rcu_state.name, READ_ONCE(rnp->gp_seq),
			       blkd ? TPS("cpuofl-bgp") : TPS("cpuofl"));
	return 0;
}
/*
 * Near the beginning of the process.  The CPU is still very much alive
 * with pretty much all services enabled.
 */
int rcutree_offline_cpu(unsigned int cpu)
{
	unsigned long flags;
	struct rcu_data *rdp;
	struct rcu_node *rnp;

	rdp = per_cpu_ptr(&rcu_data, cpu);
	rnp = rdp->mynode;
	raw_spin_lock_irqsave_rcu_node(rnp, flags);
	rnp->ffmask &= ~rdp->grpmask;
	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);

	rcutree_affinity_setting(cpu, cpu);

	// nohz_full CPUs need the tick for stop-machine to work quickly
	tick_dep_set(TICK_DEP_BIT_RCU);
	return 0;
}
#endif /* #ifdef CONFIG_HOTPLUG_CPU */
/*
 * On non-huge systems, use expedited RCU grace periods to make suspend
 * and hibernation run faster.
 */
static int rcu_pm_notify(struct notifier_block *self,
			 unsigned long action, void *hcpu)
{
	switch (action) {
	case PM_HIBERNATION_PREPARE:
	case PM_SUSPEND_PREPARE:
		rcu_async_hurry();
		rcu_expedite_gp();
		break;
	case PM_POST_HIBERNATION:
	case PM_POST_SUSPEND:
		rcu_unexpedite_gp();
		rcu_async_relax();
		break;
	default:
		break;
	}
	return NOTIFY_OK;
}
/*
 * Spawn the kthreads that handle RCU's grace periods.
 */
static int __init rcu_spawn_gp_kthread(void)
{
	unsigned long flags;
	struct rcu_node *rnp;
	struct sched_param sp;
	struct task_struct *t;
	struct rcu_data *rdp = this_cpu_ptr(&rcu_data);

	rcu_scheduler_fully_active = 1;
	t = kthread_create(rcu_gp_kthread, NULL, "%s", rcu_state.name);
	if (WARN_ONCE(IS_ERR(t), "%s: Could not start grace-period kthread, OOM is now expected behavior\n", __func__))
		return 0;
	if (kthread_prio) {
		sp.sched_priority = kthread_prio;
		sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
	}
	rnp = rcu_get_root();
	raw_spin_lock_irqsave_rcu_node(rnp, flags);
	WRITE_ONCE(rcu_state.gp_activity, jiffies);
	WRITE_ONCE(rcu_state.gp_req_activity, jiffies);
	// Reset .gp_activity and .gp_req_activity before setting .gp_kthread.
	smp_store_release(&rcu_state.gp_kthread, t);  /* ^^^ */
	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
	wake_up_process(t);
	/* This is a pre-SMP initcall, we expect a single CPU */
	WARN_ON(num_online_cpus() > 1);
	/*
	 * Those kthreads couldn't be created on rcu_init() -> rcutree_prepare_cpu()
	 * due to rcu_scheduler_fully_active.
	 */
	rcu_spawn_cpu_nocb_kthread(smp_processor_id());
	rcu_spawn_rnp_kthreads(rdp->mynode);
	rcu_spawn_core_kthreads();
	/* Create kthread worker for expedited GPs */
	rcu_start_exp_gp_kworker();
	return 0;
}
early_initcall(rcu_spawn_gp_kthread);
/*
 * This function is invoked towards the end of the scheduler's
 * initialization process.  Before this is called, the idle task might
 * contain synchronous grace-period primitives (during which time, this idle
 * task is booting the system, and such primitives are no-ops).  After this
 * function is called, any synchronous grace-period primitives are run as
 * expedited, with the requesting task driving the grace period forward.
 * A later core_initcall() rcu_set_runtime_mode() will switch to full
 * runtime RCU functionality.
 */
void rcu_scheduler_starting(void)
{
	unsigned long flags;
	struct rcu_node *rnp;

	WARN_ON(num_online_cpus() != 1);
	WARN_ON(nr_context_switches() > 0);
	rcu_test_sync_prims();

	// Fix up the ->gp_seq counters.
	local_irq_save(flags);
	rcu_for_each_node_breadth_first(rnp)
		rnp->gp_seq_needed = rnp->gp_seq = rcu_state.gp_seq;
	local_irq_restore(flags);

	// Switch out of early boot mode.
	rcu_scheduler_active = RCU_SCHEDULER_INIT;
	rcu_test_sync_prims();
}
/*
 * Helper function for rcu_init() that initializes the rcu_state structure.
 */
static void __init rcu_init_one(void)
{
	static const char * const buf[] = RCU_NODE_NAME_INIT;
	static const char * const fqs[] = RCU_FQS_NAME_INIT;
	static struct lock_class_key rcu_node_class[RCU_NUM_LVLS];
	static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS];

	int levelspread[RCU_NUM_LVLS];		/* kids/node in each level. */
	int cpustride = 1;
	int i;
	int j;
	struct rcu_node *rnp;

	BUILD_BUG_ON(RCU_NUM_LVLS > ARRAY_SIZE(buf));  /* Fix buf[] init! */

	/* Silence gcc 4.8 false positive about array index out of range. */
	if (rcu_num_lvls <= 0 || rcu_num_lvls > RCU_NUM_LVLS)
		panic("rcu_init_one: rcu_num_lvls out of range");

	/* Initialize the level-tracking arrays. */

	for (i = 1; i < rcu_num_lvls; i++)
		rcu_state.level[i] =
			rcu_state.level[i - 1] + num_rcu_lvl[i - 1];
	rcu_init_levelspread(levelspread, num_rcu_lvl);

	/* Initialize the elements themselves, starting from the leaves. */

	for (i = rcu_num_lvls - 1; i >= 0; i--) {
		cpustride *= levelspread[i];
		rnp = rcu_state.level[i];
		for (j = 0; j < num_rcu_lvl[i]; j++, rnp++) {
			raw_spin_lock_init(&ACCESS_PRIVATE(rnp, lock));
			lockdep_set_class_and_name(&ACCESS_PRIVATE(rnp, lock),
						   &rcu_node_class[i], buf[i]);
			raw_spin_lock_init(&rnp->fqslock);
			lockdep_set_class_and_name(&rnp->fqslock,
						   &rcu_fqs_class[i], fqs[i]);
			rnp->gp_seq = rcu_state.gp_seq;
			rnp->gp_seq_needed = rcu_state.gp_seq;
			rnp->completedqs = rcu_state.gp_seq;
			rnp->qsmask = 0;
			rnp->qsmaskinit = 0;
			rnp->grplo = j * cpustride;
			rnp->grphi = (j + 1) * cpustride - 1;
			if (rnp->grphi >= nr_cpu_ids)
				rnp->grphi = nr_cpu_ids - 1;
			if (i == 0) {
				rnp->grpnum = 0;
				rnp->grpmask = 0;
				rnp->parent = NULL;
			} else {
				rnp->grpnum = j % levelspread[i - 1];
				rnp->grpmask = BIT(rnp->grpnum);
				rnp->parent = rcu_state.level[i - 1] +
					      j / levelspread[i - 1];
			}
			rnp->level = i;
			INIT_LIST_HEAD(&rnp->blkd_tasks);
			rcu_init_one_nocb(rnp);
			init_waitqueue_head(&rnp->exp_wq[0]);
			init_waitqueue_head(&rnp->exp_wq[1]);
			init_waitqueue_head(&rnp->exp_wq[2]);
			init_waitqueue_head(&rnp->exp_wq[3]);
			spin_lock_init(&rnp->exp_lock);
			mutex_init(&rnp->kthread_mutex);
			raw_spin_lock_init(&rnp->exp_poll_lock);
			rnp->exp_seq_poll_rq = RCU_GET_STATE_COMPLETED;
			INIT_WORK(&rnp->exp_poll_wq, sync_rcu_do_polled_gp);
		}
	}

	init_swait_queue_head(&rcu_state.gp_wq);
	init_swait_queue_head(&rcu_state.expedited_wq);
	rnp = rcu_first_leaf_node();
	for_each_possible_cpu(i) {
		while (i > rnp->grphi)
			rnp++;
		per_cpu_ptr(&rcu_data, i)->mynode = rnp;
		per_cpu_ptr(&rcu_data, i)->barrier_head.next =
			&per_cpu_ptr(&rcu_data, i)->barrier_head;
		rcu_boot_init_percpu_data(i);
	}
}
/*
 * Force priority from the kernel command-line into range.
 */
static void __init sanitize_kthread_prio(void)
{
	int kthread_prio_in = kthread_prio;

	if (IS_ENABLED(CONFIG_RCU_BOOST) && kthread_prio < 2
	    && IS_BUILTIN(CONFIG_RCU_TORTURE_TEST))
		kthread_prio = 2;
	else if (IS_ENABLED(CONFIG_RCU_BOOST) && kthread_prio < 1)
		kthread_prio = 1;
	else if (kthread_prio < 0)
		kthread_prio = 0;
	else if (kthread_prio > 99)
		kthread_prio = 99;

	if (kthread_prio != kthread_prio_in)
		pr_alert("%s: Limited prio to %d from %d\n",
			 __func__, kthread_prio, kthread_prio_in);
}
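/*
 * Example of the clamping above (assumed command line): booting with
 * rcutree.kthread_prio=150 is limited to 99 by the final test, while a
 * negative value is raised to 0 (or to 1 when CONFIG_RCU_BOOST=y, or
 * to 2 when RCU torture testing is also built in).  In each case the
 * pr_alert() reports the adjustment.
 */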
/*
 * Compute the rcu_node tree geometry from kernel parameters.  This cannot
 * replace the definitions in tree.h because those are needed to size
 * the ->node array in the rcu_state structure.
 */
void rcu_init_geometry(void)
{
	ulong d;
	int i;
	static unsigned long old_nr_cpu_ids;
	int rcu_capacity[RCU_NUM_LVLS];
	static bool initialized;

	if (initialized) {
		/*
		 * Warn if setup_nr_cpu_ids() had not yet been invoked,
		 * unless nr_cpus_ids == NR_CPUS, in which case who cares?
		 */
		WARN_ON_ONCE(old_nr_cpu_ids != nr_cpu_ids);
		return;
	}

	old_nr_cpu_ids = nr_cpu_ids;
	initialized = true;

	/*
	 * Initialize any unspecified boot parameters.
	 * The default values of jiffies_till_first_fqs and
	 * jiffies_till_next_fqs are set to the RCU_JIFFIES_TILL_FORCE_QS
	 * value, which is a function of HZ, then adding one for each
	 * RCU_JIFFIES_FQS_DIV CPUs that might be on the system.
	 */
	d = RCU_JIFFIES_TILL_FORCE_QS + nr_cpu_ids / RCU_JIFFIES_FQS_DIV;
	if (jiffies_till_first_fqs == ULONG_MAX)
		jiffies_till_first_fqs = d;
	if (jiffies_till_next_fqs == ULONG_MAX)
		jiffies_till_next_fqs = d;
	adjust_jiffies_till_sched_qs();

	/* If the compile-time values are accurate, just leave. */
	if (rcu_fanout_leaf == RCU_FANOUT_LEAF &&
	    nr_cpu_ids == NR_CPUS)
		return;
	pr_info("Adjusting geometry for rcu_fanout_leaf=%d, nr_cpu_ids=%u\n",
		rcu_fanout_leaf, nr_cpu_ids);

	/*
	 * The boot-time rcu_fanout_leaf parameter must be at least two
	 * and cannot exceed the number of bits in the rcu_node masks.
	 * Complain and fall back to the compile-time values if this
	 * limit is exceeded.
	 */
	if (rcu_fanout_leaf < 2 || rcu_fanout_leaf > BITS_PER_LONG) {
		rcu_fanout_leaf = RCU_FANOUT_LEAF;
		WARN_ON(1);
		return;
	}

	/*
	 * Compute number of nodes that can be handled by an rcu_node tree
	 * with the given number of levels.
	 */
	rcu_capacity[0] = rcu_fanout_leaf;
	for (i = 1; i < RCU_NUM_LVLS; i++)
		rcu_capacity[i] = rcu_capacity[i - 1] * RCU_FANOUT;

	/*
	 * The tree must be able to accommodate the configured number of CPUs.
	 * If this limit is exceeded, fall back to the compile-time values.
	 */
	if (nr_cpu_ids > rcu_capacity[RCU_NUM_LVLS - 1]) {
		rcu_fanout_leaf = RCU_FANOUT_LEAF;
		WARN_ON(1);
		return;
	}

	/* Calculate the number of levels in the tree. */
	for (i = 0; nr_cpu_ids > rcu_capacity[i]; i++) {
	}
	rcu_num_lvls = i + 1;

	/* Calculate the number of rcu_nodes at each level of the tree. */
	for (i = 0; i < rcu_num_lvls; i++) {
		int cap = rcu_capacity[(rcu_num_lvls - 1) - i];

		num_rcu_lvl[i] = DIV_ROUND_UP(nr_cpu_ids, cap);
	}

	/* Calculate the total number of rcu_node structures. */
	rcu_num_nodes = 0;
	for (i = 0; i < rcu_num_lvls; i++)
		rcu_num_nodes += num_rcu_lvl[i];
}
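/*
 * Worked example (assumed values): with RCU_FANOUT = 64,
 * rcu_fanout_leaf = 16, and nr_cpu_ids = 96, the code above yields
 * rcu_capacity[] = { 16, 1024, ... }, hence rcu_num_lvls = 2,
 * num_rcu_lvl[] = { DIV_ROUND_UP(96, 1024) = 1,
 * DIV_ROUND_UP(96, 16) = 6 }, and rcu_num_nodes = 7: one root
 * rcu_node fanning out to six leaf rcu_node structures.
 */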
/*
 * Dump out the structure of the rcu_node combining tree associated
 * with the rcu_state structure.
 */
static void __init rcu_dump_rcu_node_tree(void)
{
	int level = 0;
	struct rcu_node *rnp;

	pr_info("rcu_node tree layout dump\n");
	pr_info(" ");
	rcu_for_each_node_breadth_first(rnp) {
		if (rnp->level != level) {
			pr_cont("\n");
			pr_info(" ");
			level = rnp->level;
		}
		pr_cont("%d:%d ^%d  ", rnp->grplo, rnp->grphi, rnp->grpnum);
	}
	pr_cont("\n");
}
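/*
 * Illustrative output (assuming the 96-CPU, two-level example above):
 *
 *   rcu: rcu_node tree layout dump
 *   rcu:  0:95 ^0
 *   rcu:  0:15 ^0  16:31 ^1  32:47 ^2  48:63 ^3  64:79 ^4  80:95 ^5
 */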
struct workqueue_struct *rcu_gp_wq;
static void __init kfree_rcu_batch_init(void)
{
	int cpu;
	int i, j;
	struct shrinker *kfree_rcu_shrinker;

	/* Clamp it to [0:100] seconds interval. */
	if (rcu_delay_page_cache_fill_msec < 0 ||
		rcu_delay_page_cache_fill_msec > 100 * MSEC_PER_SEC) {

		rcu_delay_page_cache_fill_msec =
			clamp(rcu_delay_page_cache_fill_msec, 0,
				(int) (100 * MSEC_PER_SEC));

		pr_info("Adjusting rcutree.rcu_delay_page_cache_fill_msec to %d ms.\n",
			rcu_delay_page_cache_fill_msec);
	}

	for_each_possible_cpu(cpu) {
		struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);

		for (i = 0; i < KFREE_N_BATCHES; i++) {
			INIT_RCU_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work);
			krcp->krw_arr[i].krcp = krcp;

			for (j = 0; j < FREE_N_CHANNELS; j++)
				INIT_LIST_HEAD(&krcp->krw_arr[i].bulk_head_free[j]);
		}

		for (i = 0; i < FREE_N_CHANNELS; i++)
			INIT_LIST_HEAD(&krcp->bulk_head[i]);

		INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor);
		INIT_DELAYED_WORK(&krcp->page_cache_work, fill_page_cache_func);
		krcp->initialized = true;
	}

	kfree_rcu_shrinker = shrinker_alloc(0, "rcu-kfree");
	if (!kfree_rcu_shrinker) {
		pr_err("Failed to allocate kfree_rcu() shrinker!\n");
		return;
	}

	kfree_rcu_shrinker->count_objects = kfree_rcu_shrink_count;
	kfree_rcu_shrinker->scan_objects = kfree_rcu_shrink_scan;

	shrinker_register(kfree_rcu_shrinker);
}
void __init rcu_init(void)
{
	int cpu = smp_processor_id();

	rcu_early_boot_tests();

	kfree_rcu_batch_init();
	rcu_bootup_announce();
	sanitize_kthread_prio();
	rcu_init_geometry();
	rcu_init_one();
	if (dump_tree)
		rcu_dump_rcu_node_tree();
	if (use_softirq)
		open_softirq(RCU_SOFTIRQ, rcu_core_si);

	/*
	 * We don't need protection against CPU-hotplug here because
	 * this is called early in boot, before either interrupts
	 * or the scheduler are operational.
	 */
	pm_notifier(rcu_pm_notify, 0);
	WARN_ON(num_online_cpus() > 1); // Only one CPU this early in boot.
	rcutree_prepare_cpu(cpu);
	rcutree_report_cpu_starting(cpu);
	rcutree_online_cpu(cpu);

	/* Create workqueue for Tree SRCU and for expedited GPs. */
	rcu_gp_wq = alloc_workqueue("rcu_gp", WQ_MEM_RECLAIM, 0);
	WARN_ON(!rcu_gp_wq);

	sync_wq = alloc_workqueue("sync_wq", WQ_MEM_RECLAIM, 0);
	WARN_ON(!sync_wq);

	/* Fill in default value for rcutree.qovld boot parameter. */
	/* -After- the rcu_node ->lock fields are initialized! */
	if (qovld < 0)
		qovld_calc = DEFAULT_RCU_QOVLD_MULT * qhimark;
	else
		qovld_calc = qovld;

	// Kick-start in case any polled grace periods started early.
	(void)start_poll_synchronize_rcu_expedited();

	rcu_test_sync_prims();

	tasks_cblist_init_generic();
}

#include "tree_stall.h"
#include "tree_exp.h"
#include "tree_nocb.h"
#include "tree_plugin.h"