Diffstat (limited to 'kernel/rcu')
-rw-r--r--  kernel/rcu/Kconfig           242
-rw-r--r--  kernel/rcu/Kconfig.debug      82
-rw-r--r--  kernel/rcu/Makefile            5
-rw-r--r--  kernel/rcu/rcu.h             430
-rw-r--r--  kernel/rcu/rcu_segcblist.c   505
-rw-r--r--  kernel/rcu/rcu_segcblist.h   164
-rw-r--r--  kernel/rcu/rcuperf.c         129
-rw-r--r--  kernel/rcu/rcutorture.c       36
-rw-r--r--  kernel/rcu/srcu.c            656
-rw-r--r--  kernel/rcu/srcutiny.c        195
-rw-r--r--  kernel/rcu/srcutree.c       1227
-rw-r--r--  kernel/rcu/tiny.c             54
-rw-r--r--  kernel/rcu/tiny_plugin.h     132
-rw-r--r--  kernel/rcu/tree.c            901
-rw-r--r--  kernel/rcu/tree.h            272
-rw-r--r--  kernel/rcu/tree_exp.h         27
-rw-r--r--  kernel/rcu/tree_plugin.h     633
-rw-r--r--  kernel/rcu/tree_trace.c      494
-rw-r--r--  kernel/rcu/update.c          120
19 files changed, 3650 insertions, 2654 deletions
diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig
new file mode 100644
index 000000000000..be90c945063f
--- /dev/null
+++ b/kernel/rcu/Kconfig
@@ -0,0 +1,242 @@
+#
+# RCU-related configuration options
+#
+
+menu "RCU Subsystem"
+
+config TREE_RCU
+ bool
+ default y if !PREEMPT && SMP
+ help
+ This option selects the RCU implementation that is
+ designed for very large SMP systems with hundreds or
+ thousands of CPUs. It also scales down nicely to
+ smaller systems.
+
+config PREEMPT_RCU
+ bool
+ default y if PREEMPT
+ help
+ This option selects the RCU implementation that is
+ designed for very large SMP systems with hundreds or
+ thousands of CPUs, but for which real-time response
+ is also required. It also scales down nicely to
+ smaller systems.
+
+ Select this option if you are unsure.
+
+config TINY_RCU
+ bool
+ default y if !PREEMPT && !SMP
+ help
+ This option selects the RCU implementation that is
+ designed for UP systems for which real-time response
+ is not required. This option greatly reduces the
+ memory footprint of RCU.
+
+config RCU_EXPERT
+ bool "Make expert-level adjustments to RCU configuration"
+ default n
+ help
+ This option needs to be enabled if you wish to make
+ expert-level adjustments to RCU configuration. By default,
+ no such adjustments can be made, which has the often-beneficial
+ side-effect of preventing "make oldconfig" from asking you all
+ sorts of detailed questions about how you would like numerous
+ obscure RCU options to be set up.
+
+ Say Y if you need to make expert-level adjustments to RCU.
+
+ Say N if you are unsure.
+
+config SRCU
+ bool
+ help
+ This option selects the sleepable version of RCU. This version
+ permits arbitrary sleeping or blocking within RCU read-side critical
+ sections.
+
+config TINY_SRCU
+ bool
+ default y if SRCU && TINY_RCU
+ help
+ This option selects the single-CPU non-preemptible version of SRCU.
+
+config TREE_SRCU
+ bool
+ default y if SRCU && !TINY_RCU
+ help
+ This option selects the full-fledged version of SRCU.
+
+config TASKS_RCU
+ bool
+ default n
+ select SRCU
+ help
+ This option enables a task-based RCU implementation that uses
+ only voluntary context switch (not preemption!), idle, and
+ user-mode execution as quiescent states.
+
+config RCU_STALL_COMMON
+ def_bool ( TREE_RCU || PREEMPT_RCU )
+ help
+ This option enables RCU CPU stall code that is common between
+ the TINY and TREE variants of RCU. The purpose is to allow
+ the tiny variants to disable RCU CPU stall warnings, while
+ making these warnings mandatory for the tree variants.
+
+config RCU_NEED_SEGCBLIST
+ def_bool ( TREE_RCU || PREEMPT_RCU || TREE_SRCU )
+
+config CONTEXT_TRACKING
+ bool
+
+config CONTEXT_TRACKING_FORCE
+ bool "Force context tracking"
+ depends on CONTEXT_TRACKING
+ default y if !NO_HZ_FULL
+ help
+ The major prerequisite for full dynticks to work is support
+ for the context tracking subsystem. But there are also
+ other dependencies to be met in order to make full
+ dynticks work.
+
+ This option is meant for testing when an arch implements the
+ context tracking backend but doesn't yet fulfill all the
+ requirements to make the full dynticks feature work.
+ Without the full dynticks, there is no way to test the support
+ for context tracking and the subsystems that rely on it: RCU
+ userspace extended quiescent state and tickless cputime
+ accounting. This option copes with the absence of the full
+ dynticks subsystem by forcing the context tracking on all
+ CPUs in the system.
+
+ Say Y only if you're working on the development of an
+ architecture backend for the context tracking.
+
+ Say N otherwise; this option brings overhead that you
+ don't want in production.
+
+
+config RCU_FANOUT
+ int "Tree-based hierarchical RCU fanout value"
+ range 2 64 if 64BIT
+ range 2 32 if !64BIT
+ depends on (TREE_RCU || PREEMPT_RCU) && RCU_EXPERT
+ default 64 if 64BIT
+ default 32 if !64BIT
+ help
+ This option controls the fanout of hierarchical implementations
+ of RCU, allowing RCU to work efficiently on machines with
+ large numbers of CPUs. This value must be at least the fourth
+ root of NR_CPUS, which allows NR_CPUS to be insanely large.
+ The default value of RCU_FANOUT should be used for production
+ systems, but if you are stress-testing the RCU implementation
+ itself, small RCU_FANOUT values allow you to test large-system
+ code paths on small(er) systems.
+
+ Select a specific number if testing RCU itself.
+ Take the default if unsure.
+
+config RCU_FANOUT_LEAF
+ int "Tree-based hierarchical RCU leaf-level fanout value"
+ range 2 64 if 64BIT
+ range 2 32 if !64BIT
+ depends on (TREE_RCU || PREEMPT_RCU) && RCU_EXPERT
+ default 16
+ help
+ This option controls the leaf-level fanout of hierarchical
+ implementations of RCU, and allows trading off cache misses
+ against lock contention. Systems that synchronize their
+ scheduling-clock interrupts for energy-efficiency reasons will
+ want the default because the smaller leaf-level fanout keeps
+ lock contention levels acceptably low. Very large systems
+ (hundreds or thousands of CPUs) will instead want to set this
+ value to the maximum value possible in order to reduce the
+ number of cache misses incurred during RCU's grace-period
+ initialization. These systems tend to run CPU-bound, so they
+ are not helped by synchronized interrupts and instead tend to
+ skew them, which reduces lock contention enough that large
+ leaf-level fanouts work well. That said, setting leaf-level
+ fanout to a large number will likely cause problematic
+ lock contention on the leaf-level rcu_node structures unless
+ you boot with the skew_tick kernel parameter.
+
+ Select a specific number if testing RCU itself.
+
+ Select the maximum permissible value for large systems, but
+ please understand that you may also need to set the skew_tick
+ kernel boot parameter to avoid contention on the rcu_node
+ structure's locks.
+
+ Take the default if unsure.
+
+config RCU_FAST_NO_HZ
+ bool "Accelerate last non-dyntick-idle CPU's grace periods"
+ depends on NO_HZ_COMMON && SMP && RCU_EXPERT
+ default n
+ help
+ This option permits CPUs to enter dynticks-idle state even if
+ they have RCU callbacks queued, and prevents RCU from waking
+ these CPUs up more than roughly once every four jiffies (by
+ default, you can adjust this using the rcutree.rcu_idle_gp_delay
+ parameter), thus improving energy efficiency. On the other
+ hand, this option increases the duration of RCU grace periods,
+ for example, slowing down synchronize_rcu().
+
+ Say Y if energy efficiency is critically important, and you
+ don't care about increased grace-period durations.
+
+ Say N if you are unsure.
+
+config RCU_BOOST
+ bool "Enable RCU priority boosting"
+ depends on RT_MUTEXES && PREEMPT_RCU && RCU_EXPERT
+ default n
+ help
+ This option boosts the priority of preempted RCU readers that
+ block the current preemptible RCU grace period for too long.
+ This option also prevents heavy loads from blocking RCU
+ callback invocation for all flavors of RCU.
+
+ Say Y here if you are working with real-time apps or heavy loads.
+ Say N here if you are unsure.
+
+config RCU_BOOST_DELAY
+ int "Milliseconds to delay boosting after RCU grace-period start"
+ range 0 3000
+ depends on RCU_BOOST
+ default 500
+ help
+ This option specifies the time to wait after the beginning of
+ a given grace period before priority-boosting preempted RCU
+ readers blocking that grace period. Note that any RCU reader
+ blocking an expedited RCU grace period is boosted immediately.
+
+ Accept the default if unsure.
+
+config RCU_NOCB_CPU
+ bool "Offload RCU callback processing from boot-selected CPUs"
+ depends on TREE_RCU || PREEMPT_RCU
+ depends on RCU_EXPERT || NO_HZ_FULL
+ default n
+ help
+ Use this option to reduce OS jitter for aggressive HPC or
+ real-time workloads. It can also be used to offload RCU
+ callback invocation to energy-efficient CPUs in battery-powered
+ asymmetric multiprocessors.
+
+ This option offloads callback invocation from the set of
+ CPUs specified at boot time by the rcu_nocbs parameter.
+ For each such CPU, a kthread ("rcuox/N") will be created to
+ invoke callbacks, where the "N" is the CPU being offloaded,
+ and where the "x" is "b" for RCU-bh, "p" for RCU-preempt, and
+ "s" for RCU-sched. Nothing prevents this kthread from running
+ on the specified CPUs, but (1) the kthreads may be preempted
+ between each callback, and (2) affinity or cgroups can be used
+ to force the kthreads to run on whatever set of CPUs is desired.
+
+ Say Y here if you want to help to debug reduced OS jitter.
+ Say N here if you are unsure.
+
+endmenu # "RCU Subsystem"
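
For context on the RCU_FANOUT and RCU_FANOUT_LEAF help text above, the
"fourth root of NR_CPUS" requirement follows from the tree having one leaf
level plus up to three interior levels. The sketch below is an illustrative
user-space calculation using the default 64BIT values; it is not kernel code
and is not part of this patch.

#include <stdio.h>

int main(void)
{
	long leaf_fanout = 16;	/* default RCU_FANOUT_LEAF */
	long fanout = 64;	/* default RCU_FANOUT on 64BIT */

	/* One leaf level plus up to three interior levels. */
	long capacity = leaf_fanout * fanout * fanout * fanout;

	printf("CPUs covered by a four-level tree: %ld\n", capacity); /* 4194304 */
	return 0;
}

With these defaults, a four-level tree covers far more CPUs than any current
NR_CPUS, which is why only testers of RCU itself normally change the fanouts.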
diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug
new file mode 100644
index 000000000000..0ec7d1d33a14
--- /dev/null
+++ b/kernel/rcu/Kconfig.debug
@@ -0,0 +1,82 @@
+#
+# RCU-related debugging configuration options
+#
+
+menu "RCU Debugging"
+
+config PROVE_RCU
+ def_bool PROVE_LOCKING
+
+config TORTURE_TEST
+ tristate
+ default n
+
+config RCU_PERF_TEST
+ tristate "performance tests for RCU"
+ depends on DEBUG_KERNEL
+ select TORTURE_TEST
+ select SRCU
+ select TASKS_RCU
+ default n
+ help
+ This option provides a kernel module that runs performance
+ tests on the RCU infrastructure. The kernel module may be built
+ after the fact on the running kernel to be tested, if desired.
+
+ Say Y here if you want RCU performance tests to be built into
+ the kernel.
+ Say M if you want the RCU performance tests to build as a module.
+ Say N if you are unsure.
+
+config RCU_TORTURE_TEST
+ tristate "torture tests for RCU"
+ depends on DEBUG_KERNEL
+ select TORTURE_TEST
+ select SRCU
+ select TASKS_RCU
+ default n
+ help
+ This option provides a kernel module that runs torture tests
+ on the RCU infrastructure. The kernel module may be built
+ after the fact on the running kernel to be tested, if desired.
+
+ Say Y here if you want RCU torture tests to be built into
+ the kernel.
+ Say M if you want the RCU torture tests to build as a module.
+ Say N if you are unsure.
+
+config RCU_CPU_STALL_TIMEOUT
+ int "RCU CPU stall timeout in seconds"
+ depends on RCU_STALL_COMMON
+ range 3 300
+ default 21
+ help
+ If a given RCU grace period extends more than the specified
+ number of seconds, a CPU stall warning is printed. If the
+ RCU grace period persists, additional CPU stall warnings are
+ printed at more widely spaced intervals.
+
+config RCU_TRACE
+ bool "Enable tracing for RCU"
+ depends on DEBUG_KERNEL
+ default y if TREE_RCU
+ select TRACE_CLOCK
+ help
+ This option enables additional tracepoints for ftrace-style
+ event tracing.
+
+ Say Y here if you want to enable RCU tracing.
+ Say N if you are unsure.
+
+config RCU_EQS_DEBUG
+ bool "Provide debugging asserts for adding NO_HZ support to an arch"
+ depends on DEBUG_KERNEL
+ help
+ This option provides consistency checks in RCU's handling of
+ NO_HZ. These checks have proven quite helpful in detecting
+ bugs in arch-specific NO_HZ code.
+
+ Say N here if you need ultimate kernel/user switch latencies.
+ Say Y if you are unsure.
+
+endmenu # "RCU Debugging"
diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile
index 18dfc485225c..13c0fc852767 100644
--- a/kernel/rcu/Makefile
+++ b/kernel/rcu/Makefile
@@ -3,10 +3,11 @@
KCOV_INSTRUMENT := n
obj-y += update.o sync.o
-obj-$(CONFIG_SRCU) += srcu.o
+obj-$(CONFIG_TREE_SRCU) += srcutree.o
+obj-$(CONFIG_TINY_SRCU) += srcutiny.o
obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
obj-$(CONFIG_RCU_PERF_TEST) += rcuperf.o
obj-$(CONFIG_TREE_RCU) += tree.o
obj-$(CONFIG_PREEMPT_RCU) += tree.o
-obj-$(CONFIG_TREE_RCU_TRACE) += tree_trace.o
obj-$(CONFIG_TINY_RCU) += tiny.o
+obj-$(CONFIG_RCU_NEED_SEGCBLIST) += rcu_segcblist.o
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 0d6ff3e471be..808b8c85f626 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -56,6 +56,83 @@
#define DYNTICK_TASK_EXIT_IDLE (DYNTICK_TASK_NEST_VALUE + \
DYNTICK_TASK_FLAG)
+
+/*
+ * Grace-period counter management.
+ */
+
+#define RCU_SEQ_CTR_SHIFT 2
+#define RCU_SEQ_STATE_MASK ((1 << RCU_SEQ_CTR_SHIFT) - 1)
+
+/*
+ * Return the counter portion of a sequence number previously returned
+ * by rcu_seq_snap() or rcu_seq_current().
+ */
+static inline unsigned long rcu_seq_ctr(unsigned long s)
+{
+ return s >> RCU_SEQ_CTR_SHIFT;
+}
+
+/*
+ * Return the state portion of a sequence number previously returned
+ * by rcu_seq_snap() or rcu_seq_current().
+ */
+static inline int rcu_seq_state(unsigned long s)
+{
+ return s & RCU_SEQ_STATE_MASK;
+}
+
+/*
+ * Set the state portion of the pointed-to sequence number.
+ * The caller is responsible for preventing conflicting updates.
+ */
+static inline void rcu_seq_set_state(unsigned long *sp, int newstate)
+{
+ WARN_ON_ONCE(newstate & ~RCU_SEQ_STATE_MASK);
+ WRITE_ONCE(*sp, (*sp & ~RCU_SEQ_STATE_MASK) + newstate);
+}
+
+/* Adjust sequence number for start of update-side operation. */
+static inline void rcu_seq_start(unsigned long *sp)
+{
+ WRITE_ONCE(*sp, *sp + 1);
+ smp_mb(); /* Ensure update-side operation after counter increment. */
+ WARN_ON_ONCE(rcu_seq_state(*sp) != 1);
+}
+
+/* Adjust sequence number for end of update-side operation. */
+static inline void rcu_seq_end(unsigned long *sp)
+{
+ smp_mb(); /* Ensure update-side operation before counter increment. */
+ WARN_ON_ONCE(!rcu_seq_state(*sp));
+ WRITE_ONCE(*sp, (*sp | RCU_SEQ_STATE_MASK) + 1);
+}
+
+/* Take a snapshot of the update side's sequence number. */
+static inline unsigned long rcu_seq_snap(unsigned long *sp)
+{
+ unsigned long s;
+
+ s = (READ_ONCE(*sp) + 2 * RCU_SEQ_STATE_MASK + 1) & ~RCU_SEQ_STATE_MASK;
+ smp_mb(); /* Above access must not bleed into critical section. */
+ return s;
+}
+
+/* Return the current value of the update side's sequence number, no ordering. */
+static inline unsigned long rcu_seq_current(unsigned long *sp)
+{
+ return READ_ONCE(*sp);
+}
+
+/*
+ * Given a snapshot from rcu_seq_snap(), determine whether or not a
+ * full update-side operation has occurred.
+ */
+static inline bool rcu_seq_done(unsigned long *sp, unsigned long s)
+{
+ return ULONG_CMP_GE(READ_ONCE(*sp), s);
+}
+
/*
* debug_rcu_head_queue()/debug_rcu_head_unqueue() are used internally
* by call_rcu() and rcu callback execution, and are therefore not part of the
@@ -109,12 +186,12 @@ static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head)
rcu_lock_acquire(&rcu_callback_map);
if (__is_kfree_rcu_offset(offset)) {
- RCU_TRACE(trace_rcu_invoke_kfree_callback(rn, head, offset));
+ RCU_TRACE(trace_rcu_invoke_kfree_callback(rn, head, offset);)
kfree((void *)head - offset);
rcu_lock_release(&rcu_callback_map);
return true;
} else {
- RCU_TRACE(trace_rcu_invoke_callback(rn, head));
+ RCU_TRACE(trace_rcu_invoke_callback(rn, head);)
head->func(head);
rcu_lock_release(&rcu_callback_map);
return false;
@@ -135,6 +212,18 @@ int rcu_jiffies_till_stall_check(void);
*/
#define TPS(x) tracepoint_string(x)
+/*
+ * Dump the ftrace buffer, but only one time per callsite per boot.
+ */
+#define rcu_ftrace_dump(oops_dump_mode) \
+do { \
+ static atomic_t ___rfd_beenhere = ATOMIC_INIT(0); \
+ \
+ if (!atomic_read(&___rfd_beenhere) && \
+ !atomic_xchg(&___rfd_beenhere, 1)) \
+ ftrace_dump(oops_dump_mode); \
+} while (0)
+
void rcu_early_boot_tests(void);
void rcu_test_sync_prims(void);
@@ -144,4 +233,341 @@ void rcu_test_sync_prims(void);
*/
extern void resched_cpu(int cpu);
+#if defined(SRCU) || !defined(TINY_RCU)
+
+#include <linux/rcu_node_tree.h>
+
+extern int rcu_num_lvls;
+extern int num_rcu_lvl[];
+extern int rcu_num_nodes;
+static bool rcu_fanout_exact;
+static int rcu_fanout_leaf;
+
+/*
+ * Compute the per-level fanout, either using the exact fanout specified
+ * or balancing the tree, depending on the rcu_fanout_exact boot parameter.
+ */
+static inline void rcu_init_levelspread(int *levelspread, const int *levelcnt)
+{
+ int i;
+
+ if (rcu_fanout_exact) {
+ levelspread[rcu_num_lvls - 1] = rcu_fanout_leaf;
+ for (i = rcu_num_lvls - 2; i >= 0; i--)
+ levelspread[i] = RCU_FANOUT;
+ } else {
+ int ccur;
+ int cprv;
+
+ cprv = nr_cpu_ids;
+ for (i = rcu_num_lvls - 1; i >= 0; i--) {
+ ccur = levelcnt[i];
+ levelspread[i] = (cprv + ccur - 1) / ccur;
+ cprv = ccur;
+ }
+ }
+}
+
+/*
+ * Do a full breadth-first scan of the rcu_node structures for the
+ * specified rcu_state structure.
+ */
+#define rcu_for_each_node_breadth_first(rsp, rnp) \
+ for ((rnp) = &(rsp)->node[0]; \
+ (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++)
+
+/*
+ * Do a breadth-first scan of the non-leaf rcu_node structures for the
+ * specified rcu_state structure. Note that if there is a singleton
+ * rcu_node tree with but one rcu_node structure, this loop is a no-op.
+ */
+#define rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) \
+ for ((rnp) = &(rsp)->node[0]; \
+ (rnp) < (rsp)->level[rcu_num_lvls - 1]; (rnp)++)
+
+/*
+ * Scan the leaves of the rcu_node hierarchy for the specified rcu_state
+ * structure. Note that if there is a singleton rcu_node tree with but
+ * one rcu_node structure, this loop -will- visit the rcu_node structure.
+ * It is still a leaf node, even if it is also the root node.
+ */
+#define rcu_for_each_leaf_node(rsp, rnp) \
+ for ((rnp) = (rsp)->level[rcu_num_lvls - 1]; \
+ (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++)
+
+/*
+ * Iterate over all possible CPUs in a leaf RCU node.
+ */
+#define for_each_leaf_node_possible_cpu(rnp, cpu) \
+ for ((cpu) = cpumask_next(rnp->grplo - 1, cpu_possible_mask); \
+ cpu <= rnp->grphi; \
+ cpu = cpumask_next((cpu), cpu_possible_mask))
+
+/*
+ * Wrappers for the rcu_node::lock acquire and release.
+ *
+ * Because the rcu_nodes form a tree, the tree traversal locking will observe
+ * different lock values; this in turn means that an UNLOCK of one level
+ * followed by a LOCK of another level does not imply a full memory barrier,
+ * and most importantly, transitivity is lost.
+ *
+ * In order to restore full ordering between tree levels, augment the regular
+ * lock acquire functions with smp_mb__after_unlock_lock().
+ *
+ * Because ->lock of struct rcu_node is a __private field, one should use
+ * these wrappers rather than directly calling raw_spin_{lock,unlock}* on ->lock.
+ */
+#define raw_spin_lock_rcu_node(p) \
+do { \
+ raw_spin_lock(&ACCESS_PRIVATE(p, lock)); \
+ smp_mb__after_unlock_lock(); \
+} while (0)
+
+#define raw_spin_unlock_rcu_node(p) raw_spin_unlock(&ACCESS_PRIVATE(p, lock))
+
+#define raw_spin_lock_irq_rcu_node(p) \
+do { \
+ raw_spin_lock_irq(&ACCESS_PRIVATE(p, lock)); \
+ smp_mb__after_unlock_lock(); \
+} while (0)
+
+#define raw_spin_unlock_irq_rcu_node(p) \
+ raw_spin_unlock_irq(&ACCESS_PRIVATE(p, lock))
+
+#define raw_spin_lock_irqsave_rcu_node(p, flags) \
+do { \
+ raw_spin_lock_irqsave(&ACCESS_PRIVATE(p, lock), flags); \
+ smp_mb__after_unlock_lock(); \
+} while (0)
+
+#define raw_spin_unlock_irqrestore_rcu_node(p, flags) \
+ raw_spin_unlock_irqrestore(&ACCESS_PRIVATE(p, lock), flags)
+
+#define raw_spin_trylock_rcu_node(p) \
+({ \
+ bool ___locked = raw_spin_trylock(&ACCESS_PRIVATE(p, lock)); \
+ \
+ if (___locked) \
+ smp_mb__after_unlock_lock(); \
+ ___locked; \
+})
+
+#endif /* #if defined(SRCU) || !defined(TINY_RCU) */
+
+#ifdef CONFIG_TINY_RCU
+/* Tiny RCU doesn't expedite, as its purpose in life is instead to be tiny. */
+static inline bool rcu_gp_is_normal(void) /* Internal RCU use. */
+{
+ return true;
+}
+static inline bool rcu_gp_is_expedited(void) /* Internal RCU use. */
+{
+ return false;
+}
+
+static inline void rcu_expedite_gp(void)
+{
+}
+
+static inline void rcu_unexpedite_gp(void)
+{
+}
+#else /* #ifdef CONFIG_TINY_RCU */
+bool rcu_gp_is_normal(void); /* Internal RCU use. */
+bool rcu_gp_is_expedited(void); /* Internal RCU use. */
+void rcu_expedite_gp(void);
+void rcu_unexpedite_gp(void);
+void rcupdate_announce_bootup_oddness(void);
+#endif /* #else #ifdef CONFIG_TINY_RCU */
+
+#define RCU_SCHEDULER_INACTIVE 0
+#define RCU_SCHEDULER_INIT 1
+#define RCU_SCHEDULER_RUNNING 2
+
+#ifdef CONFIG_TINY_RCU
+static inline void rcu_request_urgent_qs_task(struct task_struct *t) { }
+#else /* #ifdef CONFIG_TINY_RCU */
+void rcu_request_urgent_qs_task(struct task_struct *t);
+#endif /* #else #ifdef CONFIG_TINY_RCU */
+
+enum rcutorture_type {
+ RCU_FLAVOR,
+ RCU_BH_FLAVOR,
+ RCU_SCHED_FLAVOR,
+ RCU_TASKS_FLAVOR,
+ SRCU_FLAVOR,
+ INVALID_RCU_FLAVOR
+};
+
+#if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU)
+void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags,
+ unsigned long *gpnum, unsigned long *completed);
+void rcutorture_record_test_transition(void);
+void rcutorture_record_progress(unsigned long vernum);
+void do_trace_rcu_torture_read(const char *rcutorturename,
+ struct rcu_head *rhp,
+ unsigned long secs,
+ unsigned long c_old,
+ unsigned long c);
+#else
+static inline void rcutorture_get_gp_data(enum rcutorture_type test_type,
+ int *flags,
+ unsigned long *gpnum,
+ unsigned long *completed)
+{
+ *flags = 0;
+ *gpnum = 0;
+ *completed = 0;
+}
+static inline void rcutorture_record_test_transition(void)
+{
+}
+static inline void rcutorture_record_progress(unsigned long vernum)
+{
+}
+#ifdef CONFIG_RCU_TRACE
+void do_trace_rcu_torture_read(const char *rcutorturename,
+ struct rcu_head *rhp,
+ unsigned long secs,
+ unsigned long c_old,
+ unsigned long c);
+#else
+#define do_trace_rcu_torture_read(rcutorturename, rhp, secs, c_old, c) \
+ do { } while (0)
+#endif
+#endif
+
+#ifdef CONFIG_TINY_SRCU
+
+static inline void srcutorture_get_gp_data(enum rcutorture_type test_type,
+ struct srcu_struct *sp, int *flags,
+ unsigned long *gpnum,
+ unsigned long *completed)
+{
+ if (test_type != SRCU_FLAVOR)
+ return;
+ *flags = 0;
+ *completed = sp->srcu_idx;
+ *gpnum = *completed;
+}
+
+#elif defined(CONFIG_TREE_SRCU)
+
+void srcutorture_get_gp_data(enum rcutorture_type test_type,
+ struct srcu_struct *sp, int *flags,
+ unsigned long *gpnum, unsigned long *completed);
+
+#endif
+
+#ifdef CONFIG_TINY_RCU
+
+/*
+ * Return the number of grace periods started.
+ */
+static inline unsigned long rcu_batches_started(void)
+{
+ return 0;
+}
+
+/*
+ * Return the number of bottom-half grace periods started.
+ */
+static inline unsigned long rcu_batches_started_bh(void)
+{
+ return 0;
+}
+
+/*
+ * Return the number of sched grace periods started.
+ */
+static inline unsigned long rcu_batches_started_sched(void)
+{
+ return 0;
+}
+
+/*
+ * Return the number of grace periods completed.
+ */
+static inline unsigned long rcu_batches_completed(void)
+{
+ return 0;
+}
+
+/*
+ * Return the number of bottom-half grace periods completed.
+ */
+static inline unsigned long rcu_batches_completed_bh(void)
+{
+ return 0;
+}
+
+/*
+ * Return the number of sched grace periods completed.
+ */
+static inline unsigned long rcu_batches_completed_sched(void)
+{
+ return 0;
+}
+
+/*
+ * Return the number of expedited grace periods completed.
+ */
+static inline unsigned long rcu_exp_batches_completed(void)
+{
+ return 0;
+}
+
+/*
+ * Return the number of expedited sched grace periods completed.
+ */
+static inline unsigned long rcu_exp_batches_completed_sched(void)
+{
+ return 0;
+}
+
+static inline unsigned long srcu_batches_completed(struct srcu_struct *sp)
+{
+ return 0;
+}
+
+static inline void rcu_force_quiescent_state(void)
+{
+}
+
+static inline void rcu_bh_force_quiescent_state(void)
+{
+}
+
+static inline void rcu_sched_force_quiescent_state(void)
+{
+}
+
+static inline void show_rcu_gp_kthreads(void)
+{
+}
+
+#else /* #ifdef CONFIG_TINY_RCU */
+extern unsigned long rcutorture_testseq;
+extern unsigned long rcutorture_vernum;
+unsigned long rcu_batches_started(void);
+unsigned long rcu_batches_started_bh(void);
+unsigned long rcu_batches_started_sched(void);
+unsigned long rcu_batches_completed(void);
+unsigned long rcu_batches_completed_bh(void);
+unsigned long rcu_batches_completed_sched(void);
+unsigned long rcu_exp_batches_completed(void);
+unsigned long rcu_exp_batches_completed_sched(void);
+unsigned long srcu_batches_completed(struct srcu_struct *sp);
+void show_rcu_gp_kthreads(void);
+void rcu_force_quiescent_state(void);
+void rcu_bh_force_quiescent_state(void);
+void rcu_sched_force_quiescent_state(void);
+#endif /* #else #ifdef CONFIG_TINY_RCU */
+
+#ifdef CONFIG_RCU_NOCB_CPU
+bool rcu_is_nocb_cpu(int cpu);
+#else
+static inline bool rcu_is_nocb_cpu(int cpu) { return false; }
+#endif
+
#endif /* __LINUX_RCU_H */
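
The rcu_seq_*() helpers added to rcu.h above compose into the usual
start/end versus snapshot/poll pattern. The following is a minimal sketch of
that pattern, assuming a single hypothetical sequence counter shared by an
updater and a polling waiter; it is illustrative only and not part of the
patch.

static unsigned long example_gp_seq;

/* Update side: bracket one grace-period-like operation. */
static void example_updater(void)
{
	rcu_seq_start(&example_gp_seq);	/* State field becomes nonzero. */
	/* ... update-side processing ... */
	rcu_seq_end(&example_gp_seq);	/* Counter advances, state returns to zero. */
}

/* Wait side: take a snapshot once, then poll until that operation is done. */
static bool example_poll(unsigned long *snap, bool first)
{
	if (first)
		*snap = rcu_seq_snap(&example_gp_seq);
	return rcu_seq_done(&example_gp_seq, *snap);
}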
diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c
new file mode 100644
index 000000000000..2b62a38b080f
--- /dev/null
+++ b/kernel/rcu/rcu_segcblist.c
@@ -0,0 +1,505 @@
+/*
+ * RCU segmented callback lists, function definitions
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, you can access it online at
+ * http://www.gnu.org/licenses/gpl-2.0.html.
+ *
+ * Copyright IBM Corporation, 2017
+ *
+ * Authors: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
+ */
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/interrupt.h>
+
+#include "rcu_segcblist.h"
+
+/* Initialize simple callback list. */
+void rcu_cblist_init(struct rcu_cblist *rclp)
+{
+ rclp->head = NULL;
+ rclp->tail = &rclp->head;
+ rclp->len = 0;
+ rclp->len_lazy = 0;
+}
+
+/*
+ * Debug function to actually count the number of callbacks.
+ * If the number exceeds the limit specified, return -1.
+ */
+long rcu_cblist_count_cbs(struct rcu_cblist *rclp, long lim)
+{
+ int cnt = 0;
+ struct rcu_head **rhpp = &rclp->head;
+
+ for (;;) {
+ if (!*rhpp)
+ return cnt;
+ if (++cnt > lim)
+ return -1;
+ rhpp = &(*rhpp)->next;
+ }
+}
+
+/*
+ * Dequeue the oldest rcu_head structure from the specified callback
+ * list. This function assumes that the callback is non-lazy, but
+ * the caller can later invoke rcu_cblist_dequeued_lazy() if it
+ * finds otherwise (and if it cares about laziness). This allows
+ * different users to have different ways of determining laziness.
+ */
+struct rcu_head *rcu_cblist_dequeue(struct rcu_cblist *rclp)
+{
+ struct rcu_head *rhp;
+
+ rhp = rclp->head;
+ if (!rhp)
+ return NULL;
+ rclp->len--;
+ rclp->head = rhp->next;
+ if (!rclp->head)
+ rclp->tail = &rclp->head;
+ return rhp;
+}
+
+/*
+ * Initialize an rcu_segcblist structure.
+ */
+void rcu_segcblist_init(struct rcu_segcblist *rsclp)
+{
+ int i;
+
+ BUILD_BUG_ON(RCU_NEXT_TAIL + 1 != ARRAY_SIZE(rsclp->gp_seq));
+ BUILD_BUG_ON(ARRAY_SIZE(rsclp->tails) != ARRAY_SIZE(rsclp->gp_seq));
+ rsclp->head = NULL;
+ for (i = 0; i < RCU_CBLIST_NSEGS; i++)
+ rsclp->tails[i] = &rsclp->head;
+ rsclp->len = 0;
+ rsclp->len_lazy = 0;
+}
+
+/*
+ * Disable the specified rcu_segcblist structure, so that callbacks can
+ * no longer be posted to it. This structure must be empty.
+ */
+void rcu_segcblist_disable(struct rcu_segcblist *rsclp)
+{
+ WARN_ON_ONCE(!rcu_segcblist_empty(rsclp));
+ WARN_ON_ONCE(rcu_segcblist_n_cbs(rsclp));
+ WARN_ON_ONCE(rcu_segcblist_n_lazy_cbs(rsclp));
+ rsclp->tails[RCU_NEXT_TAIL] = NULL;
+}
+
+/*
+ * Is the specified segment of the specified rcu_segcblist structure
+ * empty of callbacks?
+ */
+bool rcu_segcblist_segempty(struct rcu_segcblist *rsclp, int seg)
+{
+ if (seg == RCU_DONE_TAIL)
+ return &rsclp->head == rsclp->tails[RCU_DONE_TAIL];
+ return rsclp->tails[seg - 1] == rsclp->tails[seg];
+}
+
+/*
+ * Does the specified rcu_segcblist structure contain callbacks that
+ * are ready to be invoked?
+ */
+bool rcu_segcblist_ready_cbs(struct rcu_segcblist *rsclp)
+{
+ return rcu_segcblist_is_enabled(rsclp) &&
+ &rsclp->head != rsclp->tails[RCU_DONE_TAIL];
+}
+
+/*
+ * Does the specified rcu_segcblist structure contain callbacks that
+ * are still pending, that is, not yet ready to be invoked?
+ */
+bool rcu_segcblist_pend_cbs(struct rcu_segcblist *rsclp)
+{
+ return rcu_segcblist_is_enabled(rsclp) &&
+ !rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL);
+}
+
+/*
+ * Dequeue and return the first ready-to-invoke callback. If there
+ * are no ready-to-invoke callbacks, return NULL. Disables interrupts
+ * to avoid interference. Does not protect from interference from other
+ * CPUs or tasks.
+ */
+struct rcu_head *rcu_segcblist_dequeue(struct rcu_segcblist *rsclp)
+{
+ unsigned long flags;
+ int i;
+ struct rcu_head *rhp;
+
+ local_irq_save(flags);
+ if (!rcu_segcblist_ready_cbs(rsclp)) {
+ local_irq_restore(flags);
+ return NULL;
+ }
+ rhp = rsclp->head;
+ BUG_ON(!rhp);
+ rsclp->head = rhp->next;
+ for (i = RCU_DONE_TAIL; i < RCU_CBLIST_NSEGS; i++) {
+ if (rsclp->tails[i] != &rhp->next)
+ break;
+ rsclp->tails[i] = &rsclp->head;
+ }
+ smp_mb(); /* Dequeue before decrement for rcu_barrier(). */
+ WRITE_ONCE(rsclp->len, rsclp->len - 1);
+ local_irq_restore(flags);
+ return rhp;
+}
+
+/*
+ * Account for the fact that a previously dequeued callback turned out
+ * to be marked as lazy.
+ */
+void rcu_segcblist_dequeued_lazy(struct rcu_segcblist *rsclp)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ rsclp->len_lazy--;
+ local_irq_restore(flags);
+}
+
+/*
+ * Return a pointer to the first callback in the specified rcu_segcblist
+ * structure. This is useful for diagnostics.
+ */
+struct rcu_head *rcu_segcblist_first_cb(struct rcu_segcblist *rsclp)
+{
+ if (rcu_segcblist_is_enabled(rsclp))
+ return rsclp->head;
+ return NULL;
+}
+
+/*
+ * Return a pointer to the first pending callback in the specified
+ * rcu_segcblist structure. This is useful just after posting a given
+ * callback -- if that callback is the first pending callback, then
+ * you cannot rely on someone else having already started up the required
+ * grace period.
+ */
+struct rcu_head *rcu_segcblist_first_pend_cb(struct rcu_segcblist *rsclp)
+{
+ if (rcu_segcblist_is_enabled(rsclp))
+ return *rsclp->tails[RCU_DONE_TAIL];
+ return NULL;
+}
+
+/*
+ * Does the specified rcu_segcblist structure contain callbacks that
+ * have not yet been processed beyond having been posted, that is,
+ * does it contain callbacks in its last segment?
+ */
+bool rcu_segcblist_new_cbs(struct rcu_segcblist *rsclp)
+{
+ return rcu_segcblist_is_enabled(rsclp) &&
+ !rcu_segcblist_restempty(rsclp, RCU_NEXT_READY_TAIL);
+}
+
+/*
+ * Enqueue the specified callback onto the specified rcu_segcblist
+ * structure, updating accounting as needed. Note that the ->len
+ * field may be accessed locklessly, hence the WRITE_ONCE().
+ * The ->len field is used by rcu_barrier() and friends to determine
+ * if it must post a callback on this structure, and it is OK
+ * for rcu_barrier() to sometimes post callbacks needlessly, but
+ * absolutely not OK for it to ever miss posting a callback.
+ */
+void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp,
+ struct rcu_head *rhp, bool lazy)
+{
+ WRITE_ONCE(rsclp->len, rsclp->len + 1); /* ->len sampled locklessly. */
+ if (lazy)
+ rsclp->len_lazy++;
+ smp_mb(); /* Ensure counts are updated before callback is enqueued. */
+ rhp->next = NULL;
+ *rsclp->tails[RCU_NEXT_TAIL] = rhp;
+ rsclp->tails[RCU_NEXT_TAIL] = &rhp->next;
+}
+
+/*
+ * Entrain the specified callback onto the specified rcu_segcblist at
+ * the end of the last non-empty segment. If the entire rcu_segcblist
+ * is empty, make no change, but return false.
+ *
+ * This is intended for use by rcu_barrier()-like primitives, -not-
+ * for normal grace-period use. IMPORTANT: The callback you enqueue
+ * will wait for all prior callbacks, NOT necessarily for a grace
+ * period. You have been warned.
+ */
+bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp,
+ struct rcu_head *rhp, bool lazy)
+{
+ int i;
+
+ if (rcu_segcblist_n_cbs(rsclp) == 0)
+ return false;
+ WRITE_ONCE(rsclp->len, rsclp->len + 1);
+ if (lazy)
+ rsclp->len_lazy++;
+ smp_mb(); /* Ensure counts are updated before callback is entrained. */
+ rhp->next = NULL;
+ for (i = RCU_NEXT_TAIL; i > RCU_DONE_TAIL; i--)
+ if (rsclp->tails[i] != rsclp->tails[i - 1])
+ break;
+ *rsclp->tails[i] = rhp;
+ for (; i <= RCU_NEXT_TAIL; i++)
+ rsclp->tails[i] = &rhp->next;
+ return true;
+}
+
+/*
+ * Extract only the counts from the specified rcu_segcblist structure,
+ * and place them in the specified rcu_cblist structure. This function
+ * supports both callback orphaning and invocation, hence the separation
+ * of counts and callbacks. (Callbacks ready for invocation must be
+ * orphaned and adopted separately from pending callbacks, but counts
+ * apply to all callbacks. Locking must be used to make sure that
+ * both orphaned-callbacks lists are consistent.)
+ */
+void rcu_segcblist_extract_count(struct rcu_segcblist *rsclp,
+ struct rcu_cblist *rclp)
+{
+ rclp->len_lazy += rsclp->len_lazy;
+ rclp->len += rsclp->len;
+ rsclp->len_lazy = 0;
+ WRITE_ONCE(rsclp->len, 0); /* ->len sampled locklessly. */
+}
+
+/*
+ * Extract only those callbacks ready to be invoked from the specified
+ * rcu_segcblist structure and place them in the specified rcu_cblist
+ * structure.
+ */
+void rcu_segcblist_extract_done_cbs(struct rcu_segcblist *rsclp,
+ struct rcu_cblist *rclp)
+{
+ int i;
+
+ if (!rcu_segcblist_ready_cbs(rsclp))
+ return; /* Nothing to do. */
+ *rclp->tail = rsclp->head;
+ rsclp->head = *rsclp->tails[RCU_DONE_TAIL];
+ *rsclp->tails[RCU_DONE_TAIL] = NULL;
+ rclp->tail = rsclp->tails[RCU_DONE_TAIL];
+ for (i = RCU_CBLIST_NSEGS - 1; i >= RCU_DONE_TAIL; i--)
+ if (rsclp->tails[i] == rsclp->tails[RCU_DONE_TAIL])
+ rsclp->tails[i] = &rsclp->head;
+}
+
+/*
+ * Extract only those callbacks still pending (not yet ready to be
+ * invoked) from the specified rcu_segcblist structure and place them in
+ * the specified rcu_cblist structure. Note that this loses information
+ * about any callbacks that might have been partway done waiting for
+ * their grace period. Too bad! They will have to start over.
+ */
+void rcu_segcblist_extract_pend_cbs(struct rcu_segcblist *rsclp,
+ struct rcu_cblist *rclp)
+{
+ int i;
+
+ if (!rcu_segcblist_pend_cbs(rsclp))
+ return; /* Nothing to do. */
+ *rclp->tail = *rsclp->tails[RCU_DONE_TAIL];
+ rclp->tail = rsclp->tails[RCU_NEXT_TAIL];
+ *rsclp->tails[RCU_DONE_TAIL] = NULL;
+ for (i = RCU_DONE_TAIL + 1; i < RCU_CBLIST_NSEGS; i++)
+ rsclp->tails[i] = rsclp->tails[RCU_DONE_TAIL];
+}
+
+/*
+ * Insert counts from the specified rcu_cblist structure in the
+ * specified rcu_segcblist structure.
+ */
+void rcu_segcblist_insert_count(struct rcu_segcblist *rsclp,
+ struct rcu_cblist *rclp)
+{
+ rsclp->len_lazy += rclp->len_lazy;
+ /* ->len sampled locklessly. */
+ WRITE_ONCE(rsclp->len, rsclp->len + rclp->len);
+ rclp->len_lazy = 0;
+ rclp->len = 0;
+}
+
+/*
+ * Move callbacks from the specified rcu_cblist to the beginning of the
+ * done-callbacks segment of the specified rcu_segcblist.
+ */
+void rcu_segcblist_insert_done_cbs(struct rcu_segcblist *rsclp,
+ struct rcu_cblist *rclp)
+{
+ int i;
+
+ if (!rclp->head)
+ return; /* No callbacks to move. */
+ *rclp->tail = rsclp->head;
+ rsclp->head = rclp->head;
+ for (i = RCU_DONE_TAIL; i < RCU_CBLIST_NSEGS; i++)
+ if (&rsclp->head == rsclp->tails[i])
+ rsclp->tails[i] = rclp->tail;
+ else
+ break;
+ rclp->head = NULL;
+ rclp->tail = &rclp->head;
+}
+
+/*
+ * Move callbacks from the specified rcu_cblist to the end of the
+ * new-callbacks segment of the specified rcu_segcblist.
+ */
+void rcu_segcblist_insert_pend_cbs(struct rcu_segcblist *rsclp,
+ struct rcu_cblist *rclp)
+{
+ if (!rclp->head)
+ return; /* Nothing to do. */
+ *rsclp->tails[RCU_NEXT_TAIL] = rclp->head;
+ rsclp->tails[RCU_NEXT_TAIL] = rclp->tail;
+ rclp->head = NULL;
+ rclp->tail = &rclp->head;
+}
+
+/*
+ * Advance the callbacks in the specified rcu_segcblist structure based
+ * on the current value passed in for the grace-period counter.
+ */
+void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq)
+{
+ int i, j;
+
+ WARN_ON_ONCE(!rcu_segcblist_is_enabled(rsclp));
+ if (rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL))
+ return;
+
+ /*
+ * Find all callbacks whose ->gp_seq numbers indicate that they
+ * are ready to invoke, and put them into the RCU_DONE_TAIL segment.
+ */
+ for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) {
+ if (ULONG_CMP_LT(seq, rsclp->gp_seq[i]))
+ break;
+ rsclp->tails[RCU_DONE_TAIL] = rsclp->tails[i];
+ }
+
+ /* If no callbacks moved, nothing more need be done. */
+ if (i == RCU_WAIT_TAIL)
+ return;
+
+ /* Clean up tail pointers that might have been misordered above. */
+ for (j = RCU_WAIT_TAIL; j < i; j++)
+ rsclp->tails[j] = rsclp->tails[RCU_DONE_TAIL];
+
+ /*
+ * Callbacks moved, so clean up the misordered ->tails[] pointers
+ * that now point into the middle of the list of ready-to-invoke
+ * callbacks. The overall effect is to copy down the later pointers
+ * into the gap that was created by the now-ready segments.
+ */
+ for (j = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++, j++) {
+ if (rsclp->tails[j] == rsclp->tails[RCU_NEXT_TAIL])
+ break; /* No more callbacks. */
+ rsclp->tails[j] = rsclp->tails[i];
+ rsclp->gp_seq[j] = rsclp->gp_seq[i];
+ }
+}
+
+/*
+ * "Accelerate" callbacks based on more-accurate grace-period information.
+ * The reason for this is that RCU does not synchronize the beginnings and
+ * ends of grace periods, and that callbacks are posted locally. This in
+ * turn means that the callbacks must be labelled conservatively early
+ * on, as getting exact information would degrade both performance and
+ * scalability. When more accurate grace-period information becomes
+ * available, previously posted callbacks can be "accelerated", marking
+ * them to complete at the end of the earlier grace period.
+ *
+ * This function operates on an rcu_segcblist structure, and also the
+ * grace-period sequence number seq at which new callbacks would become
+ * ready to invoke. Returns true if there are callbacks that won't be
+ * ready to invoke until seq, false otherwise.
+ */
+bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq)
+{
+ int i;
+
+ WARN_ON_ONCE(!rcu_segcblist_is_enabled(rsclp));
+ if (rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL))
+ return false;
+
+ /*
+ * Find the segment preceding the oldest segment of callbacks
+ * whose ->gp_seq[] completion is at or after that passed in via
+ * "seq", skipping any empty segments. This oldest segment, along
+ * with any later segments, can be merged in with any newly arrived
+ * callbacks in the RCU_NEXT_TAIL segment, and assigned "seq"
+ * as their ->gp_seq[] grace-period completion sequence number.
+ */
+ for (i = RCU_NEXT_READY_TAIL; i > RCU_DONE_TAIL; i--)
+ if (rsclp->tails[i] != rsclp->tails[i - 1] &&
+ ULONG_CMP_LT(rsclp->gp_seq[i], seq))
+ break;
+
+ /*
+ * If all the segments contain callbacks that correspond to
+ * earlier grace-period sequence numbers than "seq", leave.
+ * Assuming that the rcu_segcblist structure has enough
+ * segments in its arrays, this can only happen if some of
+ * the non-done segments contain callbacks that really are
+ * ready to invoke. This situation will get straightened
+ * out by the next call to rcu_segcblist_advance().
+ *
+ * Also advance to the oldest segment of callbacks whose
+ * ->gp_seq[] completion is at or after that passed in via "seq",
+ * skipping any empty segments.
+ */
+ if (++i >= RCU_NEXT_TAIL)
+ return false;
+
+ /*
+ * Merge all later callbacks, including newly arrived callbacks,
+ * into the segment located by the for-loop above. Assign "seq"
+ * as the ->gp_seq[] value in order to correctly handle the case
+ * where there were no pending callbacks in the rcu_segcblist
+ * structure other than in the RCU_NEXT_TAIL segment.
+ */
+ for (; i < RCU_NEXT_TAIL; i++) {
+ rsclp->tails[i] = rsclp->tails[RCU_NEXT_TAIL];
+ rsclp->gp_seq[i] = seq;
+ }
+ return true;
+}
+
+/*
+ * Scan the specified rcu_segcblist structure for callbacks that need
+ * a grace period later than the one specified by "seq". We don't look
+ * at the RCU_DONE_TAIL or RCU_NEXT_TAIL segments because they don't
+ * have a grace-period sequence number.
+ */
+bool rcu_segcblist_future_gp_needed(struct rcu_segcblist *rsclp,
+ unsigned long seq)
+{
+ int i;
+
+ for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++)
+ if (rsclp->tails[i - 1] != rsclp->tails[i] &&
+ ULONG_CMP_LT(seq, rsclp->gp_seq[i]))
+ return true;
+ return false;
+}
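
A minimal sketch of how the functions above are intended to compose:
callbacks are enqueued into the RCU_NEXT_TAIL segment, "accelerated" once a
grace-period number is known, "advanced" as grace periods complete, and
finally dequeued for invocation. The caller and grace-period numbers below
are hypothetical; the real users arrive with tree.c and srcutree.c in this
series.

static void example_segcblist_lifecycle(struct rcu_segcblist *rsclp,
					struct rcu_head *rhp,
					unsigned long gp_seq_assigned,
					unsigned long gp_seq_completed)
{
	struct rcu_head *next;

	rcu_segcblist_init(rsclp);				/* Empty, enabled list. */
	rcu_segcblist_enqueue(rsclp, rhp, false);		/* Newly posted, non-lazy. */
	rcu_segcblist_accelerate(rsclp, gp_seq_assigned);	/* Label with a known GP. */
	rcu_segcblist_advance(rsclp, gp_seq_completed);		/* That GP has now ended. */
	while ((next = rcu_segcblist_dequeue(rsclp)) != NULL)
		next->func(next);				/* Invoke ready callbacks. */
}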
diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h
new file mode 100644
index 000000000000..6e36e36478cd
--- /dev/null
+++ b/kernel/rcu/rcu_segcblist.h
@@ -0,0 +1,164 @@
+/*
+ * RCU segmented callback lists, internal-to-rcu header file
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, you can access it online at
+ * http://www.gnu.org/licenses/gpl-2.0.html.
+ *
+ * Copyright IBM Corporation, 2017
+ *
+ * Authors: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
+ */
+
+#include <linux/rcu_segcblist.h>
+
+/*
+ * Account for the fact that a previously dequeued callback turned out
+ * to be marked as lazy.
+ */
+static inline void rcu_cblist_dequeued_lazy(struct rcu_cblist *rclp)
+{
+ rclp->len_lazy--;
+}
+
+/*
+ * Interim function to return rcu_cblist head pointer. Longer term, the
+ * rcu_cblist will be used more pervasively, removing the need for this
+ * function.
+ */
+static inline struct rcu_head *rcu_cblist_head(struct rcu_cblist *rclp)
+{
+ return rclp->head;
+}
+
+/*
+ * Interim function to return rcu_cblist tail pointer. Longer term, the
+ * rcu_cblist will be used more pervasively, removing the need for this
+ * function.
+ */
+static inline struct rcu_head **rcu_cblist_tail(struct rcu_cblist *rclp)
+{
+ WARN_ON_ONCE(!rclp->head);
+ return rclp->tail;
+}
+
+void rcu_cblist_init(struct rcu_cblist *rclp);
+long rcu_cblist_count_cbs(struct rcu_cblist *rclp, long lim);
+struct rcu_head *rcu_cblist_dequeue(struct rcu_cblist *rclp);
+
+/*
+ * Is the specified rcu_segcblist structure empty?
+ *
+ * But careful! The fact that the ->head field is NULL does not
+ * necessarily imply that there are no callbacks associated with
+ * this structure. When callbacks are being invoked, they are
+ * removed as a group. If callback invocation must be preempted,
+ * the remaining callbacks will be added back to the list. Either
+ * way, the counts are updated later.
+ *
+ * So it is often the case that rcu_segcblist_n_cbs() should be used
+ * instead.
+ */
+static inline bool rcu_segcblist_empty(struct rcu_segcblist *rsclp)
+{
+ return !rsclp->head;
+}
+
+/* Return number of callbacks in segmented callback list. */
+static inline long rcu_segcblist_n_cbs(struct rcu_segcblist *rsclp)
+{
+ return READ_ONCE(rsclp->len);
+}
+
+/* Return number of lazy callbacks in segmented callback list. */
+static inline long rcu_segcblist_n_lazy_cbs(struct rcu_segcblist *rsclp)
+{
+ return rsclp->len_lazy;
+}
+
+/* Return number of non-lazy callbacks in segmented callback list. */
+static inline long rcu_segcblist_n_nonlazy_cbs(struct rcu_segcblist *rsclp)
+{
+ return rsclp->len - rsclp->len_lazy;
+}
+
+/*
+ * Is the specified rcu_segcblist enabled, for example, not corresponding
+ * to an offline or callback-offloaded CPU?
+ */
+static inline bool rcu_segcblist_is_enabled(struct rcu_segcblist *rsclp)
+{
+ return !!rsclp->tails[RCU_NEXT_TAIL];
+}
+
+/*
+ * Are all segments following the specified segment of the specified
+ * rcu_segcblist structure empty of callbacks? (The specified
+ * segment might well contain callbacks.)
+ */
+static inline bool rcu_segcblist_restempty(struct rcu_segcblist *rsclp, int seg)
+{
+ return !*rsclp->tails[seg];
+}
+
+/*
+ * Interim function to return rcu_segcblist head pointer. Longer term, the
+ * rcu_segcblist will be used more pervasively, removing the need for this
+ * function.
+ */
+static inline struct rcu_head *rcu_segcblist_head(struct rcu_segcblist *rsclp)
+{
+ return rsclp->head;
+}
+
+/*
+ * Interim function to return rcu_segcblist tail pointer. Longer term, the
+ * rcu_segcblist will be used more pervasively, removing the need for this
+ * function.
+ */
+static inline struct rcu_head **rcu_segcblist_tail(struct rcu_segcblist *rsclp)
+{
+ WARN_ON_ONCE(rcu_segcblist_empty(rsclp));
+ return rsclp->tails[RCU_NEXT_TAIL];
+}
+
+void rcu_segcblist_init(struct rcu_segcblist *rsclp);
+void rcu_segcblist_disable(struct rcu_segcblist *rsclp);
+bool rcu_segcblist_segempty(struct rcu_segcblist *rsclp, int seg);
+bool rcu_segcblist_ready_cbs(struct rcu_segcblist *rsclp);
+bool rcu_segcblist_pend_cbs(struct rcu_segcblist *rsclp);
+struct rcu_head *rcu_segcblist_dequeue(struct rcu_segcblist *rsclp);
+void rcu_segcblist_dequeued_lazy(struct rcu_segcblist *rsclp);
+struct rcu_head *rcu_segcblist_first_cb(struct rcu_segcblist *rsclp);
+struct rcu_head *rcu_segcblist_first_pend_cb(struct rcu_segcblist *rsclp);
+bool rcu_segcblist_new_cbs(struct rcu_segcblist *rsclp);
+void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp,
+ struct rcu_head *rhp, bool lazy);
+bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp,
+ struct rcu_head *rhp, bool lazy);
+void rcu_segcblist_extract_count(struct rcu_segcblist *rsclp,
+ struct rcu_cblist *rclp);
+void rcu_segcblist_extract_done_cbs(struct rcu_segcblist *rsclp,
+ struct rcu_cblist *rclp);
+void rcu_segcblist_extract_pend_cbs(struct rcu_segcblist *rsclp,
+ struct rcu_cblist *rclp);
+void rcu_segcblist_insert_count(struct rcu_segcblist *rsclp,
+ struct rcu_cblist *rclp);
+void rcu_segcblist_insert_done_cbs(struct rcu_segcblist *rsclp,
+ struct rcu_cblist *rclp);
+void rcu_segcblist_insert_pend_cbs(struct rcu_segcblist *rsclp,
+ struct rcu_cblist *rclp);
+void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq);
+bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq);
+bool rcu_segcblist_future_gp_needed(struct rcu_segcblist *rsclp,
+ unsigned long seq);
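
As the comment above rcu_segcblist_empty() notes, a NULL ->head does not
guarantee that no callbacks are outstanding, because callbacks being invoked
are removed as a group and the counts are updated afterwards. A brief
illustrative sketch of the resulting rule of thumb, with a hypothetical
caller:

/* Hypothetical rcu_barrier()-style check: trust the count, not ->head. */
static bool example_cpu_has_cbs(struct rcu_segcblist *rsclp)
{
	/*
	 * ->head can be transiently NULL while previously extracted
	 * callbacks are still being invoked, so rcu_segcblist_empty()
	 * could report "empty" too early; the ->len-based count is the
	 * reliable test for outstanding callbacks.
	 */
	return rcu_segcblist_n_cbs(rsclp) != 0;
}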
diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c
index a4a86fb47e4a..3cc18110b612 100644
--- a/kernel/rcu/rcuperf.c
+++ b/kernel/rcu/rcuperf.c
@@ -48,6 +48,8 @@
#include <linux/torture.h>
#include <linux/vmalloc.h>
+#include "rcu.h"
+
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.vnet.ibm.com>");
@@ -59,12 +61,16 @@ MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.vnet.ibm.com>");
#define VERBOSE_PERFOUT_ERRSTRING(s) \
do { if (verbose) pr_alert("%s" PERF_FLAG "!!! %s\n", perf_type, s); } while (0)
+torture_param(bool, gp_async, false, "Use asynchronous GP wait primitives");
+torture_param(int, gp_async_max, 1000, "Max # outstanding waits per reader");
torture_param(bool, gp_exp, false, "Use expedited GP wait primitives");
torture_param(int, holdoff, 10, "Holdoff time before test start (s)");
-torture_param(int, nreaders, -1, "Number of RCU reader threads");
+torture_param(int, nreaders, 0, "Number of RCU reader threads");
torture_param(int, nwriters, -1, "Number of RCU updater threads");
-torture_param(bool, shutdown, false, "Shutdown at end of performance tests.");
+torture_param(bool, shutdown, !IS_ENABLED(MODULE),
+ "Shutdown at end of performance tests.");
torture_param(bool, verbose, true, "Enable verbose debugging printk()s");
+torture_param(int, writer_holdoff, 0, "Holdoff (us) between GPs, zero to disable");
static char *perf_type = "rcu";
module_param(perf_type, charp, 0444);
@@ -86,13 +92,16 @@ static u64 t_rcu_perf_writer_started;
static u64 t_rcu_perf_writer_finished;
static unsigned long b_rcu_perf_writer_started;
static unsigned long b_rcu_perf_writer_finished;
+static DEFINE_PER_CPU(atomic_t, n_async_inflight);
static int rcu_perf_writer_state;
#define RTWS_INIT 0
-#define RTWS_EXP_SYNC 1
-#define RTWS_SYNC 2
-#define RTWS_IDLE 2
-#define RTWS_STOPPING 3
+#define RTWS_ASYNC 1
+#define RTWS_BARRIER 2
+#define RTWS_EXP_SYNC 3
+#define RTWS_SYNC 4
+#define RTWS_IDLE 5
+#define RTWS_STOPPING 6
#define MAX_MEAS 10000
#define MIN_MEAS 100
@@ -114,6 +123,8 @@ struct rcu_perf_ops {
unsigned long (*started)(void);
unsigned long (*completed)(void);
unsigned long (*exp_completed)(void);
+ void (*async)(struct rcu_head *head, rcu_callback_t func);
+ void (*gp_barrier)(void);
void (*sync)(void);
void (*exp_sync)(void);
const char *name;
@@ -153,6 +164,8 @@ static struct rcu_perf_ops rcu_ops = {
.started = rcu_batches_started,
.completed = rcu_batches_completed,
.exp_completed = rcu_exp_batches_completed,
+ .async = call_rcu,
+ .gp_barrier = rcu_barrier,
.sync = synchronize_rcu,
.exp_sync = synchronize_rcu_expedited,
.name = "rcu"
@@ -181,6 +194,8 @@ static struct rcu_perf_ops rcu_bh_ops = {
.started = rcu_batches_started_bh,
.completed = rcu_batches_completed_bh,
.exp_completed = rcu_exp_batches_completed_sched,
+ .async = call_rcu_bh,
+ .gp_barrier = rcu_barrier_bh,
.sync = synchronize_rcu_bh,
.exp_sync = synchronize_rcu_bh_expedited,
.name = "rcu_bh"
@@ -208,6 +223,16 @@ static unsigned long srcu_perf_completed(void)
return srcu_batches_completed(srcu_ctlp);
}
+static void srcu_call_rcu(struct rcu_head *head, rcu_callback_t func)
+{
+ call_srcu(srcu_ctlp, head, func);
+}
+
+static void srcu_rcu_barrier(void)
+{
+ srcu_barrier(srcu_ctlp);
+}
+
static void srcu_perf_synchronize(void)
{
synchronize_srcu(srcu_ctlp);
@@ -226,11 +251,42 @@ static struct rcu_perf_ops srcu_ops = {
.started = NULL,
.completed = srcu_perf_completed,
.exp_completed = srcu_perf_completed,
+ .async = srcu_call_rcu,
+ .gp_barrier = srcu_rcu_barrier,
.sync = srcu_perf_synchronize,
.exp_sync = srcu_perf_synchronize_expedited,
.name = "srcu"
};
+static struct srcu_struct srcud;
+
+static void srcu_sync_perf_init(void)
+{
+ srcu_ctlp = &srcud;
+ init_srcu_struct(srcu_ctlp);
+}
+
+static void srcu_sync_perf_cleanup(void)
+{
+ cleanup_srcu_struct(srcu_ctlp);
+}
+
+static struct rcu_perf_ops srcud_ops = {
+ .ptype = SRCU_FLAVOR,
+ .init = srcu_sync_perf_init,
+ .cleanup = srcu_sync_perf_cleanup,
+ .readlock = srcu_perf_read_lock,
+ .readunlock = srcu_perf_read_unlock,
+ .started = NULL,
+ .completed = srcu_perf_completed,
+ .exp_completed = srcu_perf_completed,
+ .async = srcu_call_rcu,
+ .gp_barrier = srcu_rcu_barrier,
+ .sync = srcu_perf_synchronize,
+ .exp_sync = srcu_perf_synchronize_expedited,
+ .name = "srcud"
+};
+
/*
* Definitions for sched perf testing.
*/
@@ -254,6 +310,8 @@ static struct rcu_perf_ops sched_ops = {
.started = rcu_batches_started_sched,
.completed = rcu_batches_completed_sched,
.exp_completed = rcu_exp_batches_completed_sched,
+ .async = call_rcu_sched,
+ .gp_barrier = rcu_barrier_sched,
.sync = synchronize_sched,
.exp_sync = synchronize_sched_expedited,
.name = "sched"
@@ -281,6 +339,8 @@ static struct rcu_perf_ops tasks_ops = {
.readunlock = tasks_perf_read_unlock,
.started = rcu_no_completed,
.completed = rcu_no_completed,
+ .async = call_rcu_tasks,
+ .gp_barrier = rcu_barrier_tasks,
.sync = synchronize_rcu_tasks,
.exp_sync = synchronize_rcu_tasks,
.name = "tasks"
@@ -344,6 +404,15 @@ rcu_perf_reader(void *arg)
}
/*
+ * Callback function for asynchronous grace periods from rcu_perf_writer().
+ */
+static void rcu_perf_async_cb(struct rcu_head *rhp)
+{
+ atomic_dec(this_cpu_ptr(&n_async_inflight));
+ kfree(rhp);
+}
+
+/*
* RCU perf writer kthread. Repeatedly does a grace period.
*/
static int
@@ -352,6 +421,7 @@ rcu_perf_writer(void *arg)
int i = 0;
int i_max;
long me = (long)arg;
+ struct rcu_head *rhp = NULL;
struct sched_param sp;
bool started = false, done = false, alldone = false;
u64 t;
@@ -380,9 +450,27 @@ rcu_perf_writer(void *arg)
}
do {
+ if (writer_holdoff)
+ udelay(writer_holdoff);
wdp = &wdpp[i];
*wdp = ktime_get_mono_fast_ns();
- if (gp_exp) {
+ if (gp_async) {
+retry:
+ if (!rhp)
+ rhp = kmalloc(sizeof(*rhp), GFP_KERNEL);
+ if (rhp && atomic_read(this_cpu_ptr(&n_async_inflight)) < gp_async_max) {
+ rcu_perf_writer_state = RTWS_ASYNC;
+ atomic_inc(this_cpu_ptr(&n_async_inflight));
+ cur_ops->async(rhp, rcu_perf_async_cb);
+ rhp = NULL;
+ } else if (!kthread_should_stop()) {
+ rcu_perf_writer_state = RTWS_BARRIER;
+ cur_ops->gp_barrier();
+ goto retry;
+ } else {
+ kfree(rhp); /* Because we are stopping. */
+ }
+ } else if (gp_exp) {
rcu_perf_writer_state = RTWS_EXP_SYNC;
cur_ops->exp_sync();
} else {
@@ -429,6 +517,10 @@ rcu_perf_writer(void *arg)
i++;
rcu_perf_wait_shutdown();
} while (!torture_must_stop());
+ if (gp_async) {
+ rcu_perf_writer_state = RTWS_BARRIER;
+ cur_ops->gp_barrier();
+ }
rcu_perf_writer_state = RTWS_STOPPING;
writer_n_durations[me] = i_max;
torture_kthread_stopping("rcu_perf_writer");
@@ -452,6 +544,17 @@ rcu_perf_cleanup(void)
u64 *wdp;
u64 *wdpp;
+ /*
+ * Would like warning at start, but everything is expedited
+ * during the mid-boot phase, so have to wait till the end.
+ */
+ if (rcu_gp_is_expedited() && !rcu_gp_is_normal() && !gp_exp)
+ VERBOSE_PERFOUT_ERRSTRING("All grace periods expedited, no normal ones to measure!");
+ if (rcu_gp_is_normal() && gp_exp)
+ VERBOSE_PERFOUT_ERRSTRING("All grace periods normal, no expedited ones to measure!");
+ if (gp_exp && gp_async)
+ VERBOSE_PERFOUT_ERRSTRING("No expedited async GPs, so went with async!");
+
if (torture_cleanup_begin())
return;
@@ -554,7 +657,7 @@ rcu_perf_init(void)
long i;
int firsterr = 0;
static struct rcu_perf_ops *perf_ops[] = {
- &rcu_ops, &rcu_bh_ops, &srcu_ops, &sched_ops,
+ &rcu_ops, &rcu_bh_ops, &srcu_ops, &srcud_ops, &sched_ops,
RCUPERF_TASKS_OPS
};
@@ -624,16 +727,6 @@ rcu_perf_init(void)
firsterr = -ENOMEM;
goto unwind;
}
- if (rcu_gp_is_expedited() && !rcu_gp_is_normal() && !gp_exp) {
- VERBOSE_PERFOUT_ERRSTRING("All grace periods expedited, no normal ones to measure!");
- firsterr = -EINVAL;
- goto unwind;
- }
- if (rcu_gp_is_normal() && gp_exp) {
- VERBOSE_PERFOUT_ERRSTRING("All grace periods normal, no expedited ones to measure!");
- firsterr = -EINVAL;
- goto unwind;
- }
for (i = 0; i < nrealwriters; i++) {
writer_durations[i] =
kcalloc(MAX_MEAS, sizeof(*writer_durations[i]),
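
The gp_async path added to rcu_perf_writer() above throttles asynchronous
grace-period requests with a per-CPU in-flight counter and drains them with
the flavor's barrier operation once the limit is reached. A simplified sketch
of that pattern follows; the kmalloc() retry and shutdown handling from the
real loop are omitted, and the helper itself is hypothetical.

static void example_async_gp(struct rcu_perf_ops *ops, struct rcu_head *rhp)
{
	if (atomic_read(this_cpu_ptr(&n_async_inflight)) < gp_async_max) {
		atomic_inc(this_cpu_ptr(&n_async_inflight));
		ops->async(rhp, rcu_perf_async_cb);	/* e.g. call_rcu(). */
	} else {
		ops->gp_barrier();	/* Wait for all in-flight callbacks. */
	}
}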
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index cccc417a8135..b8f7f8ce8575 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -52,6 +52,8 @@
#include <linux/torture.h>
#include <linux/vmalloc.h>
+#include "rcu.h"
+
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@joshtriplett.org>");
@@ -559,19 +561,22 @@ static void srcu_torture_barrier(void)
static void srcu_torture_stats(void)
{
- int cpu;
- int idx = srcu_ctlp->completed & 0x1;
+ int __maybe_unused cpu;
+ int idx;
- pr_alert("%s%s per-CPU(idx=%d):",
+#ifdef CONFIG_TREE_SRCU
+ idx = srcu_ctlp->srcu_idx & 0x1;
+ pr_alert("%s%s Tree SRCU per-CPU(idx=%d):",
torture_type, TORTURE_FLAG, idx);
for_each_possible_cpu(cpu) {
unsigned long l0, l1;
unsigned long u0, u1;
long c0, c1;
- struct srcu_array *counts = per_cpu_ptr(srcu_ctlp->per_cpu_ref, cpu);
+ struct srcu_data *counts;
- u0 = counts->unlock_count[!idx];
- u1 = counts->unlock_count[idx];
+ counts = per_cpu_ptr(srcu_ctlp->sda, cpu);
+ u0 = counts->srcu_unlock_count[!idx];
+ u1 = counts->srcu_unlock_count[idx];
/*
* Make sure that a lock is always counted if the corresponding
@@ -579,14 +584,21 @@ static void srcu_torture_stats(void)
*/
smp_rmb();
- l0 = counts->lock_count[!idx];
- l1 = counts->lock_count[idx];
+ l0 = counts->srcu_lock_count[!idx];
+ l1 = counts->srcu_lock_count[idx];
c0 = l0 - u0;
c1 = l1 - u1;
pr_cont(" %d(%ld,%ld)", cpu, c0, c1);
}
pr_cont("\n");
+#elif defined(CONFIG_TINY_SRCU)
+ idx = READ_ONCE(srcu_ctlp->srcu_idx) & 0x1;
+ pr_alert("%s%s Tiny SRCU per-CPU(idx=%d): (%hd,%hd)\n",
+ torture_type, TORTURE_FLAG, idx,
+ READ_ONCE(srcu_ctlp->srcu_lock_nesting[!idx]),
+ READ_ONCE(srcu_ctlp->srcu_lock_nesting[idx]));
+#endif
}
static void srcu_torture_synchronize_expedited(void)
@@ -1333,12 +1345,14 @@ rcu_torture_stats_print(void)
cur_ops->stats();
if (rtcv_snap == rcu_torture_current_version &&
rcu_torture_current != NULL) {
- int __maybe_unused flags;
- unsigned long __maybe_unused gpnum;
- unsigned long __maybe_unused completed;
+ int __maybe_unused flags = 0;
+ unsigned long __maybe_unused gpnum = 0;
+ unsigned long __maybe_unused completed = 0;
rcutorture_get_gp_data(cur_ops->ttype,
&flags, &gpnum, &completed);
+ srcutorture_get_gp_data(cur_ops->ttype, srcu_ctlp,
+ &flags, &gpnum, &completed);
wtp = READ_ONCE(writer_task);
pr_alert("??? Writer stall state %s(%d) g%lu c%lu f%#x ->state %#lx\n",
rcu_torture_writer_state_getname(),
diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c
deleted file mode 100644
index ef3bcfb15b39..000000000000
--- a/kernel/rcu/srcu.c
+++ /dev/null
@@ -1,656 +0,0 @@
-/*
- * Sleepable Read-Copy Update mechanism for mutual exclusion.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, you can access it online at
- * http://www.gnu.org/licenses/gpl-2.0.html.
- *
- * Copyright (C) IBM Corporation, 2006
- * Copyright (C) Fujitsu, 2012
- *
- * Author: Paul McKenney <paulmck@us.ibm.com>
- * Lai Jiangshan <laijs@cn.fujitsu.com>
- *
- * For detailed explanation of Read-Copy Update mechanism see -
- * Documentation/RCU/ *.txt
- *
- */
-
-#include <linux/export.h>
-#include <linux/mutex.h>
-#include <linux/percpu.h>
-#include <linux/preempt.h>
-#include <linux/rcupdate_wait.h>
-#include <linux/sched.h>
-#include <linux/smp.h>
-#include <linux/delay.h>
-#include <linux/srcu.h>
-
-#include "rcu.h"
-
-/*
- * Initialize an rcu_batch structure to empty.
- */
-static inline void rcu_batch_init(struct rcu_batch *b)
-{
- b->head = NULL;
- b->tail = &b->head;
-}
-
-/*
- * Enqueue a callback onto the tail of the specified rcu_batch structure.
- */
-static inline void rcu_batch_queue(struct rcu_batch *b, struct rcu_head *head)
-{
- *b->tail = head;
- b->tail = &head->next;
-}
-
-/*
- * Is the specified rcu_batch structure empty?
- */
-static inline bool rcu_batch_empty(struct rcu_batch *b)
-{
- return b->tail == &b->head;
-}
-
-/*
- * Remove the callback at the head of the specified rcu_batch structure
- * and return a pointer to it, or return NULL if the structure is empty.
- */
-static inline struct rcu_head *rcu_batch_dequeue(struct rcu_batch *b)
-{
- struct rcu_head *head;
-
- if (rcu_batch_empty(b))
- return NULL;
-
- head = b->head;
- b->head = head->next;
- if (b->tail == &head->next)
- rcu_batch_init(b);
-
- return head;
-}
-
-/*
- * Move all callbacks from the rcu_batch structure specified by "from" to
- * the structure specified by "to".
- */
-static inline void rcu_batch_move(struct rcu_batch *to, struct rcu_batch *from)
-{
- if (!rcu_batch_empty(from)) {
- *to->tail = from->head;
- to->tail = from->tail;
- rcu_batch_init(from);
- }
-}
-
-static int init_srcu_struct_fields(struct srcu_struct *sp)
-{
- sp->completed = 0;
- spin_lock_init(&sp->queue_lock);
- sp->running = false;
- rcu_batch_init(&sp->batch_queue);
- rcu_batch_init(&sp->batch_check0);
- rcu_batch_init(&sp->batch_check1);
- rcu_batch_init(&sp->batch_done);
- INIT_DELAYED_WORK(&sp->work, process_srcu);
- sp->per_cpu_ref = alloc_percpu(struct srcu_array);
- return sp->per_cpu_ref ? 0 : -ENOMEM;
-}
-
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-
-int __init_srcu_struct(struct srcu_struct *sp, const char *name,
- struct lock_class_key *key)
-{
- /* Don't re-initialize a lock while it is held. */
- debug_check_no_locks_freed((void *)sp, sizeof(*sp));
- lockdep_init_map(&sp->dep_map, name, key, 0);
- return init_srcu_struct_fields(sp);
-}
-EXPORT_SYMBOL_GPL(__init_srcu_struct);
-
-#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
-
-/**
- * init_srcu_struct - initialize a sleep-RCU structure
- * @sp: structure to initialize.
- *
- * Must invoke this on a given srcu_struct before passing that srcu_struct
- * to any other function. Each srcu_struct represents a separate domain
- * of SRCU protection.
- */
-int init_srcu_struct(struct srcu_struct *sp)
-{
- return init_srcu_struct_fields(sp);
-}
-EXPORT_SYMBOL_GPL(init_srcu_struct);
-
-#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
-
-/*
- * Returns approximate total of the readers' ->lock_count[] values for the
- * rank of per-CPU counters specified by idx.
- */
-static unsigned long srcu_readers_lock_idx(struct srcu_struct *sp, int idx)
-{
- int cpu;
- unsigned long sum = 0;
-
- for_each_possible_cpu(cpu) {
- struct srcu_array *cpuc = per_cpu_ptr(sp->per_cpu_ref, cpu);
-
- sum += READ_ONCE(cpuc->lock_count[idx]);
- }
- return sum;
-}
-
-/*
- * Returns approximate total of the readers' ->unlock_count[] values for the
- * rank of per-CPU counters specified by idx.
- */
-static unsigned long srcu_readers_unlock_idx(struct srcu_struct *sp, int idx)
-{
- int cpu;
- unsigned long sum = 0;
-
- for_each_possible_cpu(cpu) {
- struct srcu_array *cpuc = per_cpu_ptr(sp->per_cpu_ref, cpu);
-
- sum += READ_ONCE(cpuc->unlock_count[idx]);
- }
- return sum;
-}
-
-/*
- * Return true if the number of pre-existing readers is determined to
- * be zero.
- */
-static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx)
-{
- unsigned long unlocks;
-
- unlocks = srcu_readers_unlock_idx(sp, idx);
-
- /*
- * Make sure that a lock is always counted if the corresponding unlock
- * is counted. Needs to be a smp_mb() as the read side may contain a
- * read from a variable that is written to before the synchronize_srcu()
- * in the write side. In this case smp_mb()s A and B act like the store
- * buffering pattern.
- *
- * This smp_mb() also pairs with smp_mb() C to prevent accesses after the
- * synchronize_srcu() from being executed before the grace period ends.
- */
- smp_mb(); /* A */
-
- /*
- * If the locks are the same as the unlocks, then there must have
- * been no readers on this index at some time in between. This does not
- * mean that there are no more readers, as one could have read the
- * current index but not have incremented the lock counter yet.
- *
- * Possible bug: There is no guarantee that there haven't been ULONG_MAX
- * increments of ->lock_count[] since the unlocks were counted, meaning
- * that this could return true even if there are still active readers.
- * Since there are no memory barriers around srcu_flip(), the CPU is not
- * required to increment ->completed before running
- * srcu_readers_unlock_idx(), which means that there could be an
- * arbitrarily large number of critical sections that execute after
- * srcu_readers_unlock_idx() but use the old value of ->completed.
- */
- return srcu_readers_lock_idx(sp, idx) == unlocks;
-}
-
-/**
- * srcu_readers_active - returns true if there are readers. and false
- * otherwise
- * @sp: which srcu_struct to count active readers (holding srcu_read_lock).
- *
- * Note that this is not an atomic primitive, and can therefore suffer
- * severe errors when invoked on an active srcu_struct. That said, it
- * can be useful as an error check at cleanup time.
- */
-static bool srcu_readers_active(struct srcu_struct *sp)
-{
- int cpu;
- unsigned long sum = 0;
-
- for_each_possible_cpu(cpu) {
- struct srcu_array *cpuc = per_cpu_ptr(sp->per_cpu_ref, cpu);
-
- sum += READ_ONCE(cpuc->lock_count[0]);
- sum += READ_ONCE(cpuc->lock_count[1]);
- sum -= READ_ONCE(cpuc->unlock_count[0]);
- sum -= READ_ONCE(cpuc->unlock_count[1]);
- }
- return sum;
-}
-
-/**
- * cleanup_srcu_struct - deconstruct a sleep-RCU structure
- * @sp: structure to clean up.
- *
- * Must invoke this after you are finished using a given srcu_struct that
- * was initialized via init_srcu_struct(), else you leak memory.
- */
-void cleanup_srcu_struct(struct srcu_struct *sp)
-{
- if (WARN_ON(srcu_readers_active(sp)))
- return; /* Leakage unless caller handles error. */
- free_percpu(sp->per_cpu_ref);
- sp->per_cpu_ref = NULL;
-}
-EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
-
-/*
- * Counts the new reader in the appropriate per-CPU element of the
- * srcu_struct. Must be called from process context.
- * Returns an index that must be passed to the matching srcu_read_unlock().
- */
-int __srcu_read_lock(struct srcu_struct *sp)
-{
- int idx;
-
- idx = READ_ONCE(sp->completed) & 0x1;
- __this_cpu_inc(sp->per_cpu_ref->lock_count[idx]);
- smp_mb(); /* B */ /* Avoid leaking the critical section. */
- return idx;
-}
-EXPORT_SYMBOL_GPL(__srcu_read_lock);
-
-/*
- * Removes the count for the old reader from the appropriate per-CPU
- * element of the srcu_struct. Note that this may well be a different
- * CPU than that which was incremented by the corresponding srcu_read_lock().
- * Must be called from process context.
- */
-void __srcu_read_unlock(struct srcu_struct *sp, int idx)
-{
- smp_mb(); /* C */ /* Avoid leaking the critical section. */
- this_cpu_inc(sp->per_cpu_ref->unlock_count[idx]);
-}
-EXPORT_SYMBOL_GPL(__srcu_read_unlock);
-
-/*
- * We use an adaptive strategy for synchronize_srcu() and especially for
- * synchronize_srcu_expedited(). We spin for a fixed time period
- * (defined below) to allow SRCU readers to exit their read-side critical
- * sections. If there are still some readers after 10 microseconds,
- * we repeatedly block for 1-millisecond time periods. This approach
- * has done well in testing, so there is no need for a config parameter.
- */
-#define SRCU_RETRY_CHECK_DELAY 5
-#define SYNCHRONIZE_SRCU_TRYCOUNT 2
-#define SYNCHRONIZE_SRCU_EXP_TRYCOUNT 12
-
-/*
- * @@@ Wait until all pre-existing readers complete. Such readers
- * will have used the index specified by "idx".
- * the caller should ensures the ->completed is not changed while checking
- * and idx = (->completed & 1) ^ 1
- */
-static bool try_check_zero(struct srcu_struct *sp, int idx, int trycount)
-{
- for (;;) {
- if (srcu_readers_active_idx_check(sp, idx))
- return true;
- if (--trycount <= 0)
- return false;
- udelay(SRCU_RETRY_CHECK_DELAY);
- }
-}
-
-/*
- * Increment the ->completed counter so that future SRCU readers will
- * use the other rank of the ->(un)lock_count[] arrays. This allows
- * us to wait for pre-existing readers in a starvation-free manner.
- */
-static void srcu_flip(struct srcu_struct *sp)
-{
- WRITE_ONCE(sp->completed, sp->completed + 1);
-
- /*
- * Ensure that if the updater misses an __srcu_read_unlock()
- * increment, that task's next __srcu_read_lock() will see the
- * above counter update. Note that both this memory barrier
- * and the one in srcu_readers_active_idx_check() provide the
- * guarantee for __srcu_read_lock().
- */
- smp_mb(); /* D */ /* Pairs with C. */
-}
-
-/*
- * Enqueue an SRCU callback on the specified srcu_struct structure,
- * initiating grace-period processing if it is not already running.
- *
- * Note that all CPUs must agree that the grace period extended beyond
- * all pre-existing SRCU read-side critical section. On systems with
- * more than one CPU, this means that when "func()" is invoked, each CPU
- * is guaranteed to have executed a full memory barrier since the end of
- * its last corresponding SRCU read-side critical section whose beginning
- * preceded the call to call_rcu(). It also means that each CPU executing
- * an SRCU read-side critical section that continues beyond the start of
- * "func()" must have executed a memory barrier after the call_rcu()
- * but before the beginning of that SRCU read-side critical section.
- * Note that these guarantees include CPUs that are offline, idle, or
- * executing in user mode, as well as CPUs that are executing in the kernel.
- *
- * Furthermore, if CPU A invoked call_rcu() and CPU B invoked the
- * resulting SRCU callback function "func()", then both CPU A and CPU
- * B are guaranteed to execute a full memory barrier during the time
- * interval between the call to call_rcu() and the invocation of "func()".
- * This guarantee applies even if CPU A and CPU B are the same CPU (but
- * again only if the system has more than one CPU).
- *
- * Of course, these guarantees apply only for invocations of call_srcu(),
- * srcu_read_lock(), and srcu_read_unlock() that are all passed the same
- * srcu_struct structure.
- */
-void call_srcu(struct srcu_struct *sp, struct rcu_head *head,
- rcu_callback_t func)
-{
- unsigned long flags;
-
- head->next = NULL;
- head->func = func;
- spin_lock_irqsave(&sp->queue_lock, flags);
- smp_mb__after_unlock_lock(); /* Caller's prior accesses before GP. */
- rcu_batch_queue(&sp->batch_queue, head);
- if (!sp->running) {
- sp->running = true;
- queue_delayed_work(system_power_efficient_wq, &sp->work, 0);
- }
- spin_unlock_irqrestore(&sp->queue_lock, flags);
-}
-EXPORT_SYMBOL_GPL(call_srcu);
-
-static void srcu_advance_batches(struct srcu_struct *sp, int trycount);
-static void srcu_reschedule(struct srcu_struct *sp);
-
-/*
- * Helper function for synchronize_srcu() and synchronize_srcu_expedited().
- */
-static void __synchronize_srcu(struct srcu_struct *sp, int trycount)
-{
- struct rcu_synchronize rcu;
- struct rcu_head *head = &rcu.head;
- bool done = false;
-
- RCU_LOCKDEP_WARN(lock_is_held(&sp->dep_map) ||
- lock_is_held(&rcu_bh_lock_map) ||
- lock_is_held(&rcu_lock_map) ||
- lock_is_held(&rcu_sched_lock_map),
- "Illegal synchronize_srcu() in same-type SRCU (or in RCU) read-side critical section");
-
- might_sleep();
- init_completion(&rcu.completion);
-
- head->next = NULL;
- head->func = wakeme_after_rcu;
- spin_lock_irq(&sp->queue_lock);
- smp_mb__after_unlock_lock(); /* Caller's prior accesses before GP. */
- if (!sp->running) {
- /* steal the processing owner */
- sp->running = true;
- rcu_batch_queue(&sp->batch_check0, head);
- spin_unlock_irq(&sp->queue_lock);
-
- srcu_advance_batches(sp, trycount);
- if (!rcu_batch_empty(&sp->batch_done)) {
- BUG_ON(sp->batch_done.head != head);
- rcu_batch_dequeue(&sp->batch_done);
- done = true;
- }
- /* give the processing owner to work_struct */
- srcu_reschedule(sp);
- } else {
- rcu_batch_queue(&sp->batch_queue, head);
- spin_unlock_irq(&sp->queue_lock);
- }
-
- if (!done) {
- wait_for_completion(&rcu.completion);
- smp_mb(); /* Caller's later accesses after GP. */
- }
-
-}
-
-/**
- * synchronize_srcu - wait for prior SRCU read-side critical-section completion
- * @sp: srcu_struct with which to synchronize.
- *
- * Wait for the count to drain to zero of both indexes. To avoid the
- * possible starvation of synchronize_srcu(), it waits for the count of
- * the index=((->completed & 1) ^ 1) to drain to zero at first,
- * and then flip the completed and wait for the count of the other index.
- *
- * Can block; must be called from process context.
- *
- * Note that it is illegal to call synchronize_srcu() from the corresponding
- * SRCU read-side critical section; doing so will result in deadlock.
- * However, it is perfectly legal to call synchronize_srcu() on one
- * srcu_struct from some other srcu_struct's read-side critical section,
- * as long as the resulting graph of srcu_structs is acyclic.
- *
- * There are memory-ordering constraints implied by synchronize_srcu().
- * On systems with more than one CPU, when synchronize_srcu() returns,
- * each CPU is guaranteed to have executed a full memory barrier since
- * the end of its last corresponding SRCU-sched read-side critical section
- * whose beginning preceded the call to synchronize_srcu(). In addition,
- * each CPU having an SRCU read-side critical section that extends beyond
- * the return from synchronize_srcu() is guaranteed to have executed a
- * full memory barrier after the beginning of synchronize_srcu() and before
- * the beginning of that SRCU read-side critical section. Note that these
- * guarantees include CPUs that are offline, idle, or executing in user mode,
- * as well as CPUs that are executing in the kernel.
- *
- * Furthermore, if CPU A invoked synchronize_srcu(), which returned
- * to its caller on CPU B, then both CPU A and CPU B are guaranteed
- * to have executed a full memory barrier during the execution of
- * synchronize_srcu(). This guarantee applies even if CPU A and CPU B
- * are the same CPU, but again only if the system has more than one CPU.
- *
- * Of course, these memory-ordering guarantees apply only when
- * synchronize_srcu(), srcu_read_lock(), and srcu_read_unlock() are
- * passed the same srcu_struct structure.
- */
-void synchronize_srcu(struct srcu_struct *sp)
-{
- __synchronize_srcu(sp, (rcu_gp_is_expedited() && !rcu_gp_is_normal())
- ? SYNCHRONIZE_SRCU_EXP_TRYCOUNT
- : SYNCHRONIZE_SRCU_TRYCOUNT);
-}
-EXPORT_SYMBOL_GPL(synchronize_srcu);
-
-/**
- * synchronize_srcu_expedited - Brute-force SRCU grace period
- * @sp: srcu_struct with which to synchronize.
- *
- * Wait for an SRCU grace period to elapse, but be more aggressive about
- * spinning rather than blocking when waiting.
- *
- * Note that synchronize_srcu_expedited() has the same deadlock and
- * memory-ordering properties as does synchronize_srcu().
- */
-void synchronize_srcu_expedited(struct srcu_struct *sp)
-{
- __synchronize_srcu(sp, SYNCHRONIZE_SRCU_EXP_TRYCOUNT);
-}
-EXPORT_SYMBOL_GPL(synchronize_srcu_expedited);
-
-/**
- * srcu_barrier - Wait until all in-flight call_srcu() callbacks complete.
- * @sp: srcu_struct on which to wait for in-flight callbacks.
- */
-void srcu_barrier(struct srcu_struct *sp)
-{
- synchronize_srcu(sp);
-}
-EXPORT_SYMBOL_GPL(srcu_barrier);
-
-/**
- * srcu_batches_completed - return batches completed.
- * @sp: srcu_struct on which to report batch completion.
- *
- * Report the number of batches, correlated with, but not necessarily
- * precisely the same as, the number of grace periods that have elapsed.
- */
-unsigned long srcu_batches_completed(struct srcu_struct *sp)
-{
- return sp->completed;
-}
-EXPORT_SYMBOL_GPL(srcu_batches_completed);
-
-#define SRCU_CALLBACK_BATCH 10
-#define SRCU_INTERVAL 1
-
-/*
- * Move any new SRCU callbacks to the first stage of the SRCU grace
- * period pipeline.
- */
-static void srcu_collect_new(struct srcu_struct *sp)
-{
- if (!rcu_batch_empty(&sp->batch_queue)) {
- spin_lock_irq(&sp->queue_lock);
- rcu_batch_move(&sp->batch_check0, &sp->batch_queue);
- spin_unlock_irq(&sp->queue_lock);
- }
-}
-
-/*
- * Core SRCU state machine. Advance callbacks from ->batch_check0 to
- * ->batch_check1 and then to ->batch_done as readers drain.
- */
-static void srcu_advance_batches(struct srcu_struct *sp, int trycount)
-{
- int idx = 1 ^ (sp->completed & 1);
-
- /*
- * Because readers might be delayed for an extended period after
- * fetching ->completed for their index, at any point in time there
- * might well be readers using both idx=0 and idx=1. We therefore
- * need to wait for readers to clear from both index values before
- * invoking a callback.
- */
-
- if (rcu_batch_empty(&sp->batch_check0) &&
- rcu_batch_empty(&sp->batch_check1))
- return; /* no callbacks need to be advanced */
-
- if (!try_check_zero(sp, idx, trycount))
- return; /* failed to advance, will try after SRCU_INTERVAL */
-
- /*
- * The callbacks in ->batch_check1 have already done with their
- * first zero check and flip back when they were enqueued on
- * ->batch_check0 in a previous invocation of srcu_advance_batches().
- * (Presumably try_check_zero() returned false during that
- * invocation, leaving the callbacks stranded on ->batch_check1.)
- * They are therefore ready to invoke, so move them to ->batch_done.
- */
- rcu_batch_move(&sp->batch_done, &sp->batch_check1);
-
- if (rcu_batch_empty(&sp->batch_check0))
- return; /* no callbacks need to be advanced */
- srcu_flip(sp);
-
- /*
- * The callbacks in ->batch_check0 just finished their
- * first check zero and flip, so move them to ->batch_check1
- * for future checking on the other idx.
- */
- rcu_batch_move(&sp->batch_check1, &sp->batch_check0);
-
- /*
- * SRCU read-side critical sections are normally short, so check
- * at least twice in quick succession after a flip.
- */
- trycount = trycount < 2 ? 2 : trycount;
- if (!try_check_zero(sp, idx^1, trycount))
- return; /* failed to advance, will try after SRCU_INTERVAL */
-
- /*
- * The callbacks in ->batch_check1 have now waited for all
- * pre-existing readers using both idx values. They are therefore
- * ready to invoke, so move them to ->batch_done.
- */
- rcu_batch_move(&sp->batch_done, &sp->batch_check1);
-}
-
-/*
- * Invoke a limited number of SRCU callbacks that have passed through
- * their grace period. If there are more to do, SRCU will reschedule
- * the workqueue. Note that needed memory barriers have been executed
- * in this task's context by srcu_readers_active_idx_check().
- */
-static void srcu_invoke_callbacks(struct srcu_struct *sp)
-{
- int i;
- struct rcu_head *head;
-
- for (i = 0; i < SRCU_CALLBACK_BATCH; i++) {
- head = rcu_batch_dequeue(&sp->batch_done);
- if (!head)
- break;
- local_bh_disable();
- head->func(head);
- local_bh_enable();
- }
-}
-
-/*
- * Finished one round of SRCU grace period. Start another if there are
- * more SRCU callbacks queued, otherwise put SRCU into not-running state.
- */
-static void srcu_reschedule(struct srcu_struct *sp)
-{
- bool pending = true;
-
- if (rcu_batch_empty(&sp->batch_done) &&
- rcu_batch_empty(&sp->batch_check1) &&
- rcu_batch_empty(&sp->batch_check0) &&
- rcu_batch_empty(&sp->batch_queue)) {
- spin_lock_irq(&sp->queue_lock);
- if (rcu_batch_empty(&sp->batch_done) &&
- rcu_batch_empty(&sp->batch_check1) &&
- rcu_batch_empty(&sp->batch_check0) &&
- rcu_batch_empty(&sp->batch_queue)) {
- sp->running = false;
- pending = false;
- }
- spin_unlock_irq(&sp->queue_lock);
- }
-
- if (pending)
- queue_delayed_work(system_power_efficient_wq,
- &sp->work, SRCU_INTERVAL);
-}
-
-/*
- * This is the work-queue function that handles SRCU grace periods.
- */
-void process_srcu(struct work_struct *work)
-{
- struct srcu_struct *sp;
-
- sp = container_of(work, struct srcu_struct, work.work);
-
- srcu_collect_new(sp);
- srcu_advance_batches(sp, 1);
- srcu_invoke_callbacks(sp);
- srcu_reschedule(sp);
-}
-EXPORT_SYMBOL_GPL(process_srcu);
diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c
new file mode 100644
index 000000000000..1a1c1047d2ed
--- /dev/null
+++ b/kernel/rcu/srcutiny.c
@@ -0,0 +1,195 @@
+/*
+ * Sleepable Read-Copy Update mechanism for mutual exclusion,
+ * tiny version for non-preemptible single-CPU use.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, you can access it online at
+ * http://www.gnu.org/licenses/gpl-2.0.html.
+ *
+ * Copyright (C) IBM Corporation, 2017
+ *
+ * Author: Paul McKenney <paulmck@us.ibm.com>
+ */
+
+#include <linux/export.h>
+#include <linux/mutex.h>
+#include <linux/preempt.h>
+#include <linux/rcupdate_wait.h>
+#include <linux/sched.h>
+#include <linux/delay.h>
+#include <linux/srcu.h>
+
+#include <linux/rcu_node_tree.h>
+#include "rcu_segcblist.h"
+#include "rcu.h"
+
+static int init_srcu_struct_fields(struct srcu_struct *sp)
+{
+ sp->srcu_lock_nesting[0] = 0;
+ sp->srcu_lock_nesting[1] = 0;
+ init_swait_queue_head(&sp->srcu_wq);
+ sp->srcu_cb_head = NULL;
+ sp->srcu_cb_tail = &sp->srcu_cb_head;
+ sp->srcu_gp_running = false;
+ sp->srcu_gp_waiting = false;
+ sp->srcu_idx = 0;
+ INIT_WORK(&sp->srcu_work, srcu_drive_gp);
+ return 0;
+}
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+
+int __init_srcu_struct(struct srcu_struct *sp, const char *name,
+ struct lock_class_key *key)
+{
+ /* Don't re-initialize a lock while it is held. */
+ debug_check_no_locks_freed((void *)sp, sizeof(*sp));
+ lockdep_init_map(&sp->dep_map, name, key, 0);
+ return init_srcu_struct_fields(sp);
+}
+EXPORT_SYMBOL_GPL(__init_srcu_struct);
+
+#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
+
+/**
+ * init_srcu_struct - initialize a sleep-RCU structure
+ * @sp: structure to initialize.
+ *
+ * Must invoke this on a given srcu_struct before passing that srcu_struct
+ * to any other function. Each srcu_struct represents a separate domain
+ * of SRCU protection.
+ */
+int init_srcu_struct(struct srcu_struct *sp)
+{
+ return init_srcu_struct_fields(sp);
+}
+EXPORT_SYMBOL_GPL(init_srcu_struct);
+
+#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
+
+/**
+ * cleanup_srcu_struct - deconstruct a sleep-RCU structure
+ * @sp: structure to clean up.
+ *
+ * Must invoke this after you are finished using a given srcu_struct that
+ * was initialized via init_srcu_struct(), else you leak memory.
+ */
+void cleanup_srcu_struct(struct srcu_struct *sp)
+{
+ WARN_ON(sp->srcu_lock_nesting[0] || sp->srcu_lock_nesting[1]);
+ flush_work(&sp->srcu_work);
+ WARN_ON(sp->srcu_gp_running);
+ WARN_ON(sp->srcu_gp_waiting);
+ WARN_ON(sp->srcu_cb_head);
+ WARN_ON(&sp->srcu_cb_head != sp->srcu_cb_tail);
+}
+EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
+
+/*
+ * Removes the count for the old reader from the appropriate element of
+ * the srcu_struct.
+ */
+void __srcu_read_unlock(struct srcu_struct *sp, int idx)
+{
+ int newval = sp->srcu_lock_nesting[idx] - 1;
+
+ WRITE_ONCE(sp->srcu_lock_nesting[idx], newval);
+ if (!newval && READ_ONCE(sp->srcu_gp_waiting))
+ swake_up(&sp->srcu_wq);
+}
+EXPORT_SYMBOL_GPL(__srcu_read_unlock);
+
+/*
+ * Workqueue handler to drive one grace period and invoke any callbacks
+ * that become ready as a result. Single-CPU and !PREEMPT operation
+ * means that we get away with murder on synchronization. ;-)
+ */
+void srcu_drive_gp(struct work_struct *wp)
+{
+ int idx;
+ struct rcu_head *lh;
+ struct rcu_head *rhp;
+ struct srcu_struct *sp;
+
+ sp = container_of(wp, struct srcu_struct, srcu_work);
+ if (sp->srcu_gp_running || !READ_ONCE(sp->srcu_cb_head))
+ return; /* Already running or nothing to do. */
+
+ /* Remove recently arrived callbacks and wait for readers. */
+ WRITE_ONCE(sp->srcu_gp_running, true);
+ local_irq_disable();
+ lh = sp->srcu_cb_head;
+ sp->srcu_cb_head = NULL;
+ sp->srcu_cb_tail = &sp->srcu_cb_head;
+ local_irq_enable();
+ idx = sp->srcu_idx;
+ WRITE_ONCE(sp->srcu_idx, !sp->srcu_idx);
+ WRITE_ONCE(sp->srcu_gp_waiting, true); /* srcu_read_unlock() wakes! */
+ swait_event(sp->srcu_wq, !READ_ONCE(sp->srcu_lock_nesting[idx]));
+ WRITE_ONCE(sp->srcu_gp_waiting, false); /* srcu_read_unlock() cheap. */
+
+ /* Invoke the callbacks we removed above. */
+ while (lh) {
+ rhp = lh;
+ lh = lh->next;
+ local_bh_disable();
+ rhp->func(rhp);
+ local_bh_enable();
+ }
+
+ /*
+ * Enable rescheduling, and if there are more callbacks,
+ * reschedule ourselves. This can race with a call_srcu()
+ * at interrupt level, but the ->srcu_gp_running checks will
+ * straighten that out.
+ */
+ WRITE_ONCE(sp->srcu_gp_running, false);
+ if (READ_ONCE(sp->srcu_cb_head))
+ schedule_work(&sp->srcu_work);
+}
+EXPORT_SYMBOL_GPL(srcu_drive_gp);
+
+/*
+ * Enqueue an SRCU callback on the specified srcu_struct structure,
+ * initiating grace-period processing if it is not already running.
+ */
+void call_srcu(struct srcu_struct *sp, struct rcu_head *rhp,
+ rcu_callback_t func)
+{
+ unsigned long flags;
+
+ rhp->func = func;
+ rhp->next = NULL;
+ local_irq_save(flags);
+ *sp->srcu_cb_tail = rhp;
+ sp->srcu_cb_tail = &rhp->next;
+ local_irq_restore(flags);
+ if (!READ_ONCE(sp->srcu_gp_running))
+ schedule_work(&sp->srcu_work);
+}
+EXPORT_SYMBOL_GPL(call_srcu);
+
+/*
+ * synchronize_srcu - wait for prior SRCU read-side critical-section completion
+ */
+void synchronize_srcu(struct srcu_struct *sp)
+{
+ struct rcu_synchronize rs;
+
+ init_rcu_head_on_stack(&rs.head);
+ init_completion(&rs.completion);
+ call_srcu(sp, &rs.head, wakeme_after_rcu);
+ wait_for_completion(&rs.completion);
+ destroy_rcu_head_on_stack(&rs.head);
+}
+EXPORT_SYMBOL_GPL(synchronize_srcu);
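[Editorial aside: a minimal usage sketch of the SRCU API that this tiny implementation provides. Illustrative only, not part of this patch; "struct foo", "gp", and "foo_srcu" are hypothetical names.]

	/* Illustrative sketch, not part of this patch. */
	struct foo {
		int val;
	};

	static struct foo __rcu *gp;		/* Hypothetical SRCU-protected pointer. */
	static struct srcu_struct foo_srcu;	/* init_srcu_struct(&foo_srcu) before first use. */

	static int foo_reader(void)
	{
		struct foo *p;
		int idx, val = -1;

		idx = srcu_read_lock(&foo_srcu);
		p = srcu_dereference(gp, &foo_srcu);
		if (p)
			val = p->val;		/* Readers may sleep here; this is SRCU. */
		srcu_read_unlock(&foo_srcu, idx);
		return val;
	}

	static void foo_update(struct foo *newp)
	{
		struct foo *oldp = rcu_dereference_protected(gp, 1);

		rcu_assign_pointer(gp, newp);
		synchronize_srcu(&foo_srcu);	/* Wait for pre-existing readers. */
		kfree(oldp);
	}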
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
new file mode 100644
index 000000000000..d0ca524bf042
--- /dev/null
+++ b/kernel/rcu/srcutree.c
@@ -0,0 +1,1227 @@
+/*
+ * Sleepable Read-Copy Update mechanism for mutual exclusion.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, you can access it online at
+ * http://www.gnu.org/licenses/gpl-2.0.html.
+ *
+ * Copyright (C) IBM Corporation, 2006
+ * Copyright (C) Fujitsu, 2012
+ *
+ * Author: Paul McKenney <paulmck@us.ibm.com>
+ * Lai Jiangshan <laijs@cn.fujitsu.com>
+ *
+ * For detailed explanation of Read-Copy Update mechanism see -
+ * Documentation/RCU/ *.txt
+ *
+ */
+
+#include <linux/export.h>
+#include <linux/mutex.h>
+#include <linux/percpu.h>
+#include <linux/preempt.h>
+#include <linux/rcupdate_wait.h>
+#include <linux/sched.h>
+#include <linux/smp.h>
+#include <linux/delay.h>
+#include <linux/module.h>
+#include <linux/srcu.h>
+
+#include "rcu.h"
+#include "rcu_segcblist.h"
+
+/* Holdoff in nanoseconds for auto-expediting. */
+#define DEFAULT_SRCU_EXP_HOLDOFF (25 * 1000)
+static ulong exp_holdoff = DEFAULT_SRCU_EXP_HOLDOFF;
+module_param(exp_holdoff, ulong, 0444);
+
+/* Overflow-check frequency. N bits roughly says every 2**N grace periods. */
+static ulong counter_wrap_check = (ULONG_MAX >> 2);
+module_param(counter_wrap_check, ulong, 0444);
+
+static void srcu_invoke_callbacks(struct work_struct *work);
+static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay);
+
+/*
+ * Initialize SRCU combining tree. Note that statically allocated
+ * srcu_struct structures might already have srcu_read_lock() and
+ * srcu_read_unlock() running against them. So if the is_static parameter
+ * is set, don't initialize ->srcu_lock_count[] and ->srcu_unlock_count[].
+ */
+static void init_srcu_struct_nodes(struct srcu_struct *sp, bool is_static)
+{
+ int cpu;
+ int i;
+ int level = 0;
+ int levelspread[RCU_NUM_LVLS];
+ struct srcu_data *sdp;
+ struct srcu_node *snp;
+ struct srcu_node *snp_first;
+
+ /* Work out the overall tree geometry. */
+ sp->level[0] = &sp->node[0];
+ for (i = 1; i < rcu_num_lvls; i++)
+ sp->level[i] = sp->level[i - 1] + num_rcu_lvl[i - 1];
+ rcu_init_levelspread(levelspread, num_rcu_lvl);
+
+ /* Each pass through this loop initializes one srcu_node structure. */
+ rcu_for_each_node_breadth_first(sp, snp) {
+ raw_spin_lock_init(&ACCESS_PRIVATE(snp, lock));
+ WARN_ON_ONCE(ARRAY_SIZE(snp->srcu_have_cbs) !=
+ ARRAY_SIZE(snp->srcu_data_have_cbs));
+ for (i = 0; i < ARRAY_SIZE(snp->srcu_have_cbs); i++) {
+ snp->srcu_have_cbs[i] = 0;
+ snp->srcu_data_have_cbs[i] = 0;
+ }
+ snp->srcu_gp_seq_needed_exp = 0;
+ snp->grplo = -1;
+ snp->grphi = -1;
+ if (snp == &sp->node[0]) {
+ /* Root node, special case. */
+ snp->srcu_parent = NULL;
+ continue;
+ }
+
+ /* Non-root node. */
+ if (snp == sp->level[level + 1])
+ level++;
+ snp->srcu_parent = sp->level[level - 1] +
+ (snp - sp->level[level]) /
+ levelspread[level - 1];
+ }
+
+ /*
+ * Initialize the per-CPU srcu_data array, which feeds into the
+ * leaves of the srcu_node tree.
+ */
+ WARN_ON_ONCE(ARRAY_SIZE(sdp->srcu_lock_count) !=
+ ARRAY_SIZE(sdp->srcu_unlock_count));
+ level = rcu_num_lvls - 1;
+ snp_first = sp->level[level];
+ for_each_possible_cpu(cpu) {
+ sdp = per_cpu_ptr(sp->sda, cpu);
+ raw_spin_lock_init(&ACCESS_PRIVATE(sdp, lock));
+ rcu_segcblist_init(&sdp->srcu_cblist);
+ sdp->srcu_cblist_invoking = false;
+ sdp->srcu_gp_seq_needed = sp->srcu_gp_seq;
+ sdp->srcu_gp_seq_needed_exp = sp->srcu_gp_seq;
+ sdp->mynode = &snp_first[cpu / levelspread[level]];
+ for (snp = sdp->mynode; snp != NULL; snp = snp->srcu_parent) {
+ if (snp->grplo < 0)
+ snp->grplo = cpu;
+ snp->grphi = cpu;
+ }
+ sdp->cpu = cpu;
+ INIT_DELAYED_WORK(&sdp->work, srcu_invoke_callbacks);
+ sdp->sp = sp;
+ sdp->grpmask = 1 << (cpu - sdp->mynode->grplo);
+ if (is_static)
+ continue;
+
+ /* Dynamically allocated, better be no srcu_read_locks()! */
+ for (i = 0; i < ARRAY_SIZE(sdp->srcu_lock_count); i++) {
+ sdp->srcu_lock_count[i] = 0;
+ sdp->srcu_unlock_count[i] = 0;
+ }
+ }
+}
+
+/*
+ * Initialize non-compile-time initialized fields, including the
+ * associated srcu_node and srcu_data structures. The is_static
+ * parameter is passed through to init_srcu_struct_nodes(), and
+ * also tells us that ->sda has already been wired up to srcu_data.
+ */
+static int init_srcu_struct_fields(struct srcu_struct *sp, bool is_static)
+{
+ mutex_init(&sp->srcu_cb_mutex);
+ mutex_init(&sp->srcu_gp_mutex);
+ sp->srcu_idx = 0;
+ sp->srcu_gp_seq = 0;
+ sp->srcu_barrier_seq = 0;
+ mutex_init(&sp->srcu_barrier_mutex);
+ atomic_set(&sp->srcu_barrier_cpu_cnt, 0);
+ INIT_DELAYED_WORK(&sp->work, process_srcu);
+ if (!is_static)
+ sp->sda = alloc_percpu(struct srcu_data);
+ init_srcu_struct_nodes(sp, is_static);
+ sp->srcu_gp_seq_needed_exp = 0;
+ sp->srcu_last_gp_end = ktime_get_mono_fast_ns();
+ smp_store_release(&sp->srcu_gp_seq_needed, 0); /* Init done. */
+ return sp->sda ? 0 : -ENOMEM;
+}
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+
+int __init_srcu_struct(struct srcu_struct *sp, const char *name,
+ struct lock_class_key *key)
+{
+ /* Don't re-initialize a lock while it is held. */
+ debug_check_no_locks_freed((void *)sp, sizeof(*sp));
+ lockdep_init_map(&sp->dep_map, name, key, 0);
+ raw_spin_lock_init(&ACCESS_PRIVATE(sp, lock));
+ return init_srcu_struct_fields(sp, false);
+}
+EXPORT_SYMBOL_GPL(__init_srcu_struct);
+
+#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
+
+/**
+ * init_srcu_struct - initialize a sleep-RCU structure
+ * @sp: structure to initialize.
+ *
+ * Must invoke this on a given srcu_struct before passing that srcu_struct
+ * to any other function. Each srcu_struct represents a separate domain
+ * of SRCU protection.
+ */
+int init_srcu_struct(struct srcu_struct *sp)
+{
+ raw_spin_lock_init(&ACCESS_PRIVATE(sp, lock));
+ return init_srcu_struct_fields(sp, false);
+}
+EXPORT_SYMBOL_GPL(init_srcu_struct);
+
+#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
+
+/*
+ * First-use initialization of statically allocated srcu_struct
+ * structure. Wiring up the combining tree is more than can be
+ * done with compile-time initialization, so this check is added
+ * to each update-side SRCU primitive. Use sp->lock, which -is-
+ * compile-time initialized, to resolve races involving multiple
+ * CPUs trying to garner first-use privileges.
+ */
+static void check_init_srcu_struct(struct srcu_struct *sp)
+{
+ unsigned long flags;
+
+ WARN_ON_ONCE(rcu_scheduler_active == RCU_SCHEDULER_INIT);
+ /* The smp_load_acquire() pairs with the smp_store_release(). */
+ if (!rcu_seq_state(smp_load_acquire(&sp->srcu_gp_seq_needed))) /*^^^*/
+ return; /* Already initialized. */
+ raw_spin_lock_irqsave_rcu_node(sp, flags);
+ if (!rcu_seq_state(sp->srcu_gp_seq_needed)) {
+ raw_spin_unlock_irqrestore_rcu_node(sp, flags);
+ return;
+ }
+ init_srcu_struct_fields(sp, true);
+ raw_spin_unlock_irqrestore_rcu_node(sp, flags);
+}
+
+/*
+ * Returns approximate total of the readers' ->srcu_lock_count[] values
+ * for the rank of per-CPU counters specified by idx.
+ */
+static unsigned long srcu_readers_lock_idx(struct srcu_struct *sp, int idx)
+{
+ int cpu;
+ unsigned long sum = 0;
+
+ for_each_possible_cpu(cpu) {
+ struct srcu_data *cpuc = per_cpu_ptr(sp->sda, cpu);
+
+ sum += READ_ONCE(cpuc->srcu_lock_count[idx]);
+ }
+ return sum;
+}
+
+/*
+ * Returns approximate total of the readers' ->srcu_unlock_count[] values
+ * for the rank of per-CPU counters specified by idx.
+ */
+static unsigned long srcu_readers_unlock_idx(struct srcu_struct *sp, int idx)
+{
+ int cpu;
+ unsigned long sum = 0;
+
+ for_each_possible_cpu(cpu) {
+ struct srcu_data *cpuc = per_cpu_ptr(sp->sda, cpu);
+
+ sum += READ_ONCE(cpuc->srcu_unlock_count[idx]);
+ }
+ return sum;
+}
+
+/*
+ * Return true if the number of pre-existing readers is determined to
+ * be zero.
+ */
+static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx)
+{
+ unsigned long unlocks;
+
+ unlocks = srcu_readers_unlock_idx(sp, idx);
+
+ /*
+ * Make sure that a lock is always counted if the corresponding
+ * unlock is counted. Needs to be a smp_mb() as the read side may
+ * contain a read from a variable that is written to before the
+ * synchronize_srcu() in the write side. In this case smp_mb()s
+ * A and B act like the store buffering pattern.
+ *
+ * This smp_mb() also pairs with smp_mb() C to prevent accesses
+ * after the synchronize_srcu() from being executed before the
+ * grace period ends.
+ */
+ smp_mb(); /* A */
+
+ /*
+ * If the locks are the same as the unlocks, then there must have
+ * been no readers on this index at some time in between. This does
+ * not mean that there are no more readers, as one could have read
+ * the current index but not have incremented the lock counter yet.
+ *
+ * So suppose that the updater is preempted here for so long
+ * that more than ULONG_MAX non-nested readers come and go in
+ * the meantime. It turns out that this cannot result in overflow
+ * because if a reader modifies its unlock count after we read it
+ * above, then that reader's next load of ->srcu_idx is guaranteed
+ * to get the new value, which will cause it to operate on the
+ * other bank of counters, where it cannot contribute to the
+ * overflow of these counters. This means that there is a maximum
+ * of 2*NR_CPUS increments, which cannot overflow given current
+ * systems, especially not on 64-bit systems.
+ *
+ * OK, how about nesting? This does impose a limit on nesting
+ * of floor(ULONG_MAX/NR_CPUS/2), which should be sufficient,
+ * especially on 64-bit systems.
+ */
+ return srcu_readers_lock_idx(sp, idx) == unlocks;
+}
+
+/**
+ * srcu_readers_active - returns true if there are readers, and false
+ * otherwise
+ * @sp: which srcu_struct to count active readers (holding srcu_read_lock).
+ *
+ * Note that this is not an atomic primitive, and can therefore suffer
+ * severe errors when invoked on an active srcu_struct. That said, it
+ * can be useful as an error check at cleanup time.
+ */
+static bool srcu_readers_active(struct srcu_struct *sp)
+{
+ int cpu;
+ unsigned long sum = 0;
+
+ for_each_possible_cpu(cpu) {
+ struct srcu_data *cpuc = per_cpu_ptr(sp->sda, cpu);
+
+ sum += READ_ONCE(cpuc->srcu_lock_count[0]);
+ sum += READ_ONCE(cpuc->srcu_lock_count[1]);
+ sum -= READ_ONCE(cpuc->srcu_unlock_count[0]);
+ sum -= READ_ONCE(cpuc->srcu_unlock_count[1]);
+ }
+ return sum;
+}
+
+#define SRCU_INTERVAL 1
+
+/*
+ * Return grace-period delay, zero if there are expedited grace
+ * periods pending, SRCU_INTERVAL otherwise.
+ */
+static unsigned long srcu_get_delay(struct srcu_struct *sp)
+{
+ if (ULONG_CMP_LT(READ_ONCE(sp->srcu_gp_seq),
+ READ_ONCE(sp->srcu_gp_seq_needed_exp)))
+ return 0;
+ return SRCU_INTERVAL;
+}
+
+/**
+ * cleanup_srcu_struct - deconstruct a sleep-RCU structure
+ * @sp: structure to clean up.
+ *
+ * Must invoke this after you are finished using a given srcu_struct that
+ * was initialized via init_srcu_struct(), else you leak memory.
+ */
+void cleanup_srcu_struct(struct srcu_struct *sp)
+{
+ int cpu;
+
+ if (WARN_ON(!srcu_get_delay(sp)))
+ return; /* Leakage unless caller handles error. */
+ if (WARN_ON(srcu_readers_active(sp)))
+ return; /* Leakage unless caller handles error. */
+ flush_delayed_work(&sp->work);
+ for_each_possible_cpu(cpu)
+ flush_delayed_work(&per_cpu_ptr(sp->sda, cpu)->work);
+ if (WARN_ON(rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) != SRCU_STATE_IDLE) ||
+ WARN_ON(srcu_readers_active(sp))) {
+ pr_info("cleanup_srcu_struct: Active srcu_struct %p state: %d\n", sp, rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)));
+ return; /* Caller forgot to stop doing call_srcu()? */
+ }
+ free_percpu(sp->sda);
+ sp->sda = NULL;
+}
+EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
+
+/*
+ * Counts the new reader in the appropriate per-CPU element of the
+ * srcu_struct.
+ * Returns an index that must be passed to the matching srcu_read_unlock().
+ */
+int __srcu_read_lock(struct srcu_struct *sp)
+{
+ int idx;
+
+ idx = READ_ONCE(sp->srcu_idx) & 0x1;
+ this_cpu_inc(sp->sda->srcu_lock_count[idx]);
+ smp_mb(); /* B */ /* Avoid leaking the critical section. */
+ return idx;
+}
+EXPORT_SYMBOL_GPL(__srcu_read_lock);
+
+/*
+ * Removes the count for the old reader from the appropriate per-CPU
+ * element of the srcu_struct. Note that this may well be a different
+ * CPU than that which was incremented by the corresponding srcu_read_lock().
+ */
+void __srcu_read_unlock(struct srcu_struct *sp, int idx)
+{
+ smp_mb(); /* C */ /* Avoid leaking the critical section. */
+ this_cpu_inc(sp->sda->srcu_unlock_count[idx]);
+}
+EXPORT_SYMBOL_GPL(__srcu_read_unlock);
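[Editorial aside: as the comments above note, a reader may sleep between lock and unlock and may therefore unlock on a different CPU than the one whose ->srcu_lock_count[] it incremented; the grace-period machinery only compares per-index sums across all CPUs. A hedged sketch of such a reader follows; illustrative only, the sleep duration is arbitrary and <linux/delay.h> is assumed.]

	static void sleepy_reader(struct srcu_struct *sp)
	{
		int idx;

		idx = srcu_read_lock(sp);	/* Bumps this CPU's ->srcu_lock_count[idx]. */
		msleep(10);			/* May migrate to another CPU while asleep. */
		srcu_read_unlock(sp, idx);	/* Bumps that CPU's ->srcu_unlock_count[idx]. */
	}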
+
+/*
+ * We use an adaptive strategy for synchronize_srcu() and especially for
+ * synchronize_srcu_expedited(). We spin for a fixed time period
+ * (defined below) to allow SRCU readers to exit their read-side critical
+ * sections. If there are still some readers after a few microseconds,
+ * we repeatedly block for 1-millisecond time periods.
+ */
+#define SRCU_RETRY_CHECK_DELAY 5
+
+/*
+ * Start an SRCU grace period.
+ */
+static void srcu_gp_start(struct srcu_struct *sp)
+{
+ struct srcu_data *sdp = this_cpu_ptr(sp->sda);
+ int state;
+
+ lockdep_assert_held(&sp->lock);
+ WARN_ON_ONCE(ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed));
+ rcu_segcblist_advance(&sdp->srcu_cblist,
+ rcu_seq_current(&sp->srcu_gp_seq));
+ (void)rcu_segcblist_accelerate(&sdp->srcu_cblist,
+ rcu_seq_snap(&sp->srcu_gp_seq));
+ smp_mb(); /* Order prior store to ->srcu_gp_seq_needed vs. GP start. */
+ rcu_seq_start(&sp->srcu_gp_seq);
+ state = rcu_seq_state(READ_ONCE(sp->srcu_gp_seq));
+ WARN_ON_ONCE(state != SRCU_STATE_SCAN1);
+}
+
+/*
+ * Track online CPUs to guide callback workqueue placement.
+ */
+DEFINE_PER_CPU(bool, srcu_online);
+
+void srcu_online_cpu(unsigned int cpu)
+{
+ WRITE_ONCE(per_cpu(srcu_online, cpu), true);
+}
+
+void srcu_offline_cpu(unsigned int cpu)
+{
+ WRITE_ONCE(per_cpu(srcu_online, cpu), false);
+}
+
+/*
+ * Place the workqueue handler on the specified CPU if online, otherwise
+ * just run it wherever. This is useful for placing workqueue handlers
+ * that are to invoke the specified CPU's callbacks.
+ */
+static bool srcu_queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
+ struct delayed_work *dwork,
+ unsigned long delay)
+{
+ bool ret;
+
+ preempt_disable();
+ if (READ_ONCE(per_cpu(srcu_online, cpu)))
+ ret = queue_delayed_work_on(cpu, wq, dwork, delay);
+ else
+ ret = queue_delayed_work(wq, dwork, delay);
+ preempt_enable();
+ return ret;
+}
+
+/*
+ * Schedule callback invocation for the specified srcu_data structure,
+ * if possible, on the corresponding CPU.
+ */
+static void srcu_schedule_cbs_sdp(struct srcu_data *sdp, unsigned long delay)
+{
+ srcu_queue_delayed_work_on(sdp->cpu, system_power_efficient_wq,
+ &sdp->work, delay);
+}
+
+/*
+ * Schedule callback invocation for all srcu_data structures associated
+ * with the specified srcu_node structure that have callbacks for the
+ * just-completed grace period, the one corresponding to idx. If possible,
+ * schedule this invocation on the corresponding CPUs.
+ */
+static void srcu_schedule_cbs_snp(struct srcu_struct *sp, struct srcu_node *snp,
+ unsigned long mask, unsigned long delay)
+{
+ int cpu;
+
+ for (cpu = snp->grplo; cpu <= snp->grphi; cpu++) {
+ if (!(mask & (1 << (cpu - snp->grplo))))
+ continue;
+ srcu_schedule_cbs_sdp(per_cpu_ptr(sp->sda, cpu), delay);
+ }
+}
+
+/*
+ * Note the end of an SRCU grace period. Initiates callback invocation
+ * and starts a new grace period if needed.
+ *
+ * The ->srcu_cb_mutex acquisition does not protect any data, but
+ * instead prevents more than one grace period from starting while we
+ * are initiating callback invocation. This allows the ->srcu_have_cbs[]
+ * array to have a finite number of elements.
+ */
+static void srcu_gp_end(struct srcu_struct *sp)
+{
+ unsigned long cbdelay;
+ bool cbs;
+ int cpu;
+ unsigned long flags;
+ unsigned long gpseq;
+ int idx;
+ int idxnext;
+ unsigned long mask;
+ struct srcu_data *sdp;
+ struct srcu_node *snp;
+
+ /* Prevent more than one additional grace period. */
+ mutex_lock(&sp->srcu_cb_mutex);
+
+ /* End the current grace period. */
+ raw_spin_lock_irq_rcu_node(sp);
+ idx = rcu_seq_state(sp->srcu_gp_seq);
+ WARN_ON_ONCE(idx != SRCU_STATE_SCAN2);
+ cbdelay = srcu_get_delay(sp);
+ sp->srcu_last_gp_end = ktime_get_mono_fast_ns();
+ rcu_seq_end(&sp->srcu_gp_seq);
+ gpseq = rcu_seq_current(&sp->srcu_gp_seq);
+ if (ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, gpseq))
+ sp->srcu_gp_seq_needed_exp = gpseq;
+ raw_spin_unlock_irq_rcu_node(sp);
+ mutex_unlock(&sp->srcu_gp_mutex);
+ /* A new grace period can start at this point. But only one. */
+
+ /* Initiate callback invocation as needed. */
+ idx = rcu_seq_ctr(gpseq) % ARRAY_SIZE(snp->srcu_have_cbs);
+ idxnext = (idx + 1) % ARRAY_SIZE(snp->srcu_have_cbs);
+ rcu_for_each_node_breadth_first(sp, snp) {
+ raw_spin_lock_irq_rcu_node(snp);
+ cbs = false;
+ if (snp >= sp->level[rcu_num_lvls - 1])
+ cbs = snp->srcu_have_cbs[idx] == gpseq;
+ snp->srcu_have_cbs[idx] = gpseq;
+ rcu_seq_set_state(&snp->srcu_have_cbs[idx], 1);
+ if (ULONG_CMP_LT(snp->srcu_gp_seq_needed_exp, gpseq))
+ snp->srcu_gp_seq_needed_exp = gpseq;
+ mask = snp->srcu_data_have_cbs[idx];
+ snp->srcu_data_have_cbs[idx] = 0;
+ raw_spin_unlock_irq_rcu_node(snp);
+ if (cbs)
+ srcu_schedule_cbs_snp(sp, snp, mask, cbdelay);
+
+ /* Occasionally prevent srcu_data counter wrap. */
+ if (!(gpseq & counter_wrap_check))
+ for (cpu = snp->grplo; cpu <= snp->grphi; cpu++) {
+ sdp = per_cpu_ptr(sp->sda, cpu);
+ raw_spin_lock_irqsave_rcu_node(sdp, flags);
+ if (ULONG_CMP_GE(gpseq,
+ sdp->srcu_gp_seq_needed + 100))
+ sdp->srcu_gp_seq_needed = gpseq;
+ raw_spin_unlock_irqrestore_rcu_node(sdp, flags);
+ }
+ }
+
+ /* Callback initiation done, allow grace periods after next. */
+ mutex_unlock(&sp->srcu_cb_mutex);
+
+ /* Start a new grace period if needed. */
+ raw_spin_lock_irq_rcu_node(sp);
+ gpseq = rcu_seq_current(&sp->srcu_gp_seq);
+ if (!rcu_seq_state(gpseq) &&
+ ULONG_CMP_LT(gpseq, sp->srcu_gp_seq_needed)) {
+ srcu_gp_start(sp);
+ raw_spin_unlock_irq_rcu_node(sp);
+ /* Throttle expedited grace periods: Should be rare! */
+ srcu_reschedule(sp, rcu_seq_ctr(gpseq) & 0x3ff
+ ? 0 : SRCU_INTERVAL);
+ } else {
+ raw_spin_unlock_irq_rcu_node(sp);
+ }
+}
+
+/*
+ * Funnel-locking scheme to scalably mediate many concurrent expedited
+ * grace-period requests. This function is invoked for the first known
+ * expedited request for a grace period that has already been requested,
+ * but without expediting. To start a completely new grace period,
+ * whether expedited or not, use srcu_funnel_gp_start() instead.
+ */
+static void srcu_funnel_exp_start(struct srcu_struct *sp, struct srcu_node *snp,
+ unsigned long s)
+{
+ unsigned long flags;
+
+ for (; snp != NULL; snp = snp->srcu_parent) {
+ if (rcu_seq_done(&sp->srcu_gp_seq, s) ||
+ ULONG_CMP_GE(READ_ONCE(snp->srcu_gp_seq_needed_exp), s))
+ return;
+ raw_spin_lock_irqsave_rcu_node(snp, flags);
+ if (ULONG_CMP_GE(snp->srcu_gp_seq_needed_exp, s)) {
+ raw_spin_unlock_irqrestore_rcu_node(snp, flags);
+ return;
+ }
+ WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s);
+ raw_spin_unlock_irqrestore_rcu_node(snp, flags);
+ }
+ raw_spin_lock_irqsave_rcu_node(sp, flags);
+ if (!ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, s))
+ sp->srcu_gp_seq_needed_exp = s;
+ raw_spin_unlock_irqrestore_rcu_node(sp, flags);
+}
+
+/*
+ * Funnel-locking scheme to scalably mediate many concurrent grace-period
+ * requests. The winner has to do the work of actually starting grace
+ * period s. Losers must either ensure that their desired grace-period
+ * number is recorded on at least their leaf srcu_node structure, or they
+ * must take steps to invoke their own callbacks.
+ */
+static void srcu_funnel_gp_start(struct srcu_struct *sp, struct srcu_data *sdp,
+ unsigned long s, bool do_norm)
+{
+ unsigned long flags;
+ int idx = rcu_seq_ctr(s) % ARRAY_SIZE(sdp->mynode->srcu_have_cbs);
+ struct srcu_node *snp = sdp->mynode;
+ unsigned long snp_seq;
+
+ /* Each pass through the loop does one level of the srcu_node tree. */
+ for (; snp != NULL; snp = snp->srcu_parent) {
+ if (rcu_seq_done(&sp->srcu_gp_seq, s) && snp != sdp->mynode)
+ return; /* GP already done and CBs recorded. */
+ raw_spin_lock_irqsave_rcu_node(snp, flags);
+ if (ULONG_CMP_GE(snp->srcu_have_cbs[idx], s)) {
+ snp_seq = snp->srcu_have_cbs[idx];
+ if (snp == sdp->mynode && snp_seq == s)
+ snp->srcu_data_have_cbs[idx] |= sdp->grpmask;
+ raw_spin_unlock_irqrestore_rcu_node(snp, flags);
+ if (snp == sdp->mynode && snp_seq != s) {
+ srcu_schedule_cbs_sdp(sdp, do_norm
+ ? SRCU_INTERVAL
+ : 0);
+ return;
+ }
+ if (!do_norm)
+ srcu_funnel_exp_start(sp, snp, s);
+ return;
+ }
+ snp->srcu_have_cbs[idx] = s;
+ if (snp == sdp->mynode)
+ snp->srcu_data_have_cbs[idx] |= sdp->grpmask;
+ if (!do_norm && ULONG_CMP_LT(snp->srcu_gp_seq_needed_exp, s))
+ snp->srcu_gp_seq_needed_exp = s;
+ raw_spin_unlock_irqrestore_rcu_node(snp, flags);
+ }
+
+ /* Top of tree, must ensure the grace period will be started. */
+ raw_spin_lock_irqsave_rcu_node(sp, flags);
+ if (ULONG_CMP_LT(sp->srcu_gp_seq_needed, s)) {
+ /*
+ * Record need for grace period s. Pair with load
+ * acquire setting up for initialization.
+ */
+ smp_store_release(&sp->srcu_gp_seq_needed, s); /*^^^*/
+ }
+ if (!do_norm && ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, s))
+ sp->srcu_gp_seq_needed_exp = s;
+
+ /* If grace period not already done and none in progress, start it. */
+ if (!rcu_seq_done(&sp->srcu_gp_seq, s) &&
+ rcu_seq_state(sp->srcu_gp_seq) == SRCU_STATE_IDLE) {
+ WARN_ON_ONCE(ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed));
+ srcu_gp_start(sp);
+ queue_delayed_work(system_power_efficient_wq, &sp->work,
+ srcu_get_delay(sp));
+ }
+ raw_spin_unlock_irqrestore_rcu_node(sp, flags);
+}
+
+/*
+ * Wait until all readers counted by array index idx complete, but
+ * loop an additional time if there is an expedited grace period pending.
+ * The caller must ensure that ->srcu_idx is not changed while checking.
+ */
+static bool try_check_zero(struct srcu_struct *sp, int idx, int trycount)
+{
+ for (;;) {
+ if (srcu_readers_active_idx_check(sp, idx))
+ return true;
+ if (--trycount + !srcu_get_delay(sp) <= 0)
+ return false;
+ udelay(SRCU_RETRY_CHECK_DELAY);
+ }
+}
+
+/*
+ * Increment the ->srcu_idx counter so that future SRCU readers will
+ * use the other rank of the ->srcu_(un)lock_count[] arrays. This allows
+ * us to wait for pre-existing readers in a starvation-free manner.
+ */
+static void srcu_flip(struct srcu_struct *sp)
+{
+ /*
+ * Ensure that if this updater saw a given reader's increment
+ * from __srcu_read_lock(), that reader was using an old value
+ * of ->srcu_idx. Also ensure that if a given reader sees the
+ * new value of ->srcu_idx, this updater's earlier scans cannot
+ * have seen that reader's increments (which is OK, because this
+ * grace period need not wait on that reader).
+ */
+ smp_mb(); /* E */ /* Pairs with B and C. */
+
+ WRITE_ONCE(sp->srcu_idx, sp->srcu_idx + 1);
+
+ /*
+ * Ensure that if the updater misses an __srcu_read_unlock()
+ * increment, that task's next __srcu_read_lock() will see the
+ * above counter update. Note that both this memory barrier
+ * and the one in srcu_readers_active_idx_check() provide the
+ * guarantee for __srcu_read_lock().
+ */
+ smp_mb(); /* D */ /* Pairs with C. */
+}
+
+/*
+ * If SRCU is likely idle, return true, otherwise return false.
+ *
+ * Note that it is OK for several concurrent from-idle requests for a
+ * new grace period to specify expediting, because they will all end up
+ * requesting the same grace period anyhow. So no loss.
+ *
+ * Note also that if any CPU (including the current one) is still invoking
+ * callbacks, this function will nevertheless say "idle". This is not
+ * ideal, but the overhead of checking all CPUs' callback lists is even
+ * less ideal, especially on large systems. Furthermore, the wakeup
+ * can happen before the callback is fully removed, so we have no choice
+ * but to accept this type of error.
+ *
+ * This function is also subject to counter-wrap errors, but let's face
+ * it, if this function was preempted for enough time for the counters
+ * to wrap, it really doesn't matter whether or not we expedite the grace
+ * period. The extra overhead of a needlessly expedited grace period is
+ * negligible when amortized over that time period, and the extra latency
+ * of a needlessly non-expedited grace period is similarly negligible.
+ */
+static bool srcu_might_be_idle(struct srcu_struct *sp)
+{
+ unsigned long curseq;
+ unsigned long flags;
+ struct srcu_data *sdp;
+ unsigned long t;
+
+ /* If the local srcu_data structure has callbacks, not idle. */
+ local_irq_save(flags);
+ sdp = this_cpu_ptr(sp->sda);
+ if (rcu_segcblist_pend_cbs(&sdp->srcu_cblist)) {
+ local_irq_restore(flags);
+ return false; /* Callbacks already present, so not idle. */
+ }
+ local_irq_restore(flags);
+
+ /*
+	 * No local callbacks, so probabilistically probe global state.
+	 * Exact information would require acquiring locks, which would
+	 * kill scalability, hence the probabilistic nature of the probe.
+ */
+
+ /* First, see if enough time has passed since the last GP. */
+ t = ktime_get_mono_fast_ns();
+ if (exp_holdoff == 0 ||
+ time_in_range_open(t, sp->srcu_last_gp_end,
+ sp->srcu_last_gp_end + exp_holdoff))
+ return false; /* Too soon after last GP. */
+
+ /* Next, check for probable idleness. */
+ curseq = rcu_seq_current(&sp->srcu_gp_seq);
+ smp_mb(); /* Order ->srcu_gp_seq with ->srcu_gp_seq_needed. */
+ if (ULONG_CMP_LT(curseq, READ_ONCE(sp->srcu_gp_seq_needed)))
+ return false; /* Grace period in progress, so not idle. */
+ smp_mb(); /* Order ->srcu_gp_seq with prior access. */
+ if (curseq != rcu_seq_current(&sp->srcu_gp_seq))
+ return false; /* GP # changed, so not idle. */
+ return true; /* With reasonable probability, idle! */
+}
+
+/*
+ * SRCU callback function to leak a callback.
+ */
+static void srcu_leak_callback(struct rcu_head *rhp)
+{
+}
+
+/*
+ * Enqueue an SRCU callback on the srcu_data structure associated with
+ * the current CPU and the specified srcu_struct structure, initiating
+ * grace-period processing if it is not already running.
+ *
+ * Note that all CPUs must agree that the grace period extended beyond
+ * all pre-existing SRCU read-side critical sections. On systems with
+ * more than one CPU, this means that when "func()" is invoked, each CPU
+ * is guaranteed to have executed a full memory barrier since the end of
+ * its last corresponding SRCU read-side critical section whose beginning
+ * preceded the call to call_srcu(). It also means that each CPU executing
+ * an SRCU read-side critical section that continues beyond the start of
+ * "func()" must have executed a memory barrier after the call_srcu()
+ * but before the beginning of that SRCU read-side critical section.
+ * Note that these guarantees include CPUs that are offline, idle, or
+ * executing in user mode, as well as CPUs that are executing in the kernel.
+ *
+ * Furthermore, if CPU A invoked call_srcu() and CPU B invoked the
+ * resulting SRCU callback function "func()", then both CPU A and CPU
+ * B are guaranteed to execute a full memory barrier during the time
+ * interval between the call to call_srcu() and the invocation of "func()".
+ * This guarantee applies even if CPU A and CPU B are the same CPU (but
+ * again only if the system has more than one CPU).
+ *
+ * Of course, these guarantees apply only for invocations of call_srcu(),
+ * srcu_read_lock(), and srcu_read_unlock() that are all passed the same
+ * srcu_struct structure.
+ */
+void __call_srcu(struct srcu_struct *sp, struct rcu_head *rhp,
+ rcu_callback_t func, bool do_norm)
+{
+ unsigned long flags;
+ bool needexp = false;
+ bool needgp = false;
+ unsigned long s;
+ struct srcu_data *sdp;
+
+ check_init_srcu_struct(sp);
+ if (debug_rcu_head_queue(rhp)) {
+ /* Probable double call_srcu(), so leak the callback. */
+ WRITE_ONCE(rhp->func, srcu_leak_callback);
+ WARN_ONCE(1, "call_srcu(): Leaked duplicate callback\n");
+ return;
+ }
+ rhp->func = func;
+ local_irq_save(flags);
+ sdp = this_cpu_ptr(sp->sda);
+ raw_spin_lock_rcu_node(sdp);
+ rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp, false);
+ rcu_segcblist_advance(&sdp->srcu_cblist,
+ rcu_seq_current(&sp->srcu_gp_seq));
+ s = rcu_seq_snap(&sp->srcu_gp_seq);
+ (void)rcu_segcblist_accelerate(&sdp->srcu_cblist, s);
+ if (ULONG_CMP_LT(sdp->srcu_gp_seq_needed, s)) {
+ sdp->srcu_gp_seq_needed = s;
+ needgp = true;
+ }
+ if (!do_norm && ULONG_CMP_LT(sdp->srcu_gp_seq_needed_exp, s)) {
+ sdp->srcu_gp_seq_needed_exp = s;
+ needexp = true;
+ }
+ raw_spin_unlock_irqrestore_rcu_node(sdp, flags);
+ if (needgp)
+ srcu_funnel_gp_start(sp, sdp, s, do_norm);
+ else if (needexp)
+ srcu_funnel_exp_start(sp, sdp->mynode, s);
+}
+
+/**
+ * call_srcu() - Queue a callback for invocation after an SRCU grace period
+ * @sp: srcu_struct on which to queue the callback
+ * @rhp: structure to be used for queueing the SRCU callback.
+ * @func: function to be invoked after the SRCU grace period
+ *
+ * The callback function will be invoked some time after a full SRCU
+ * grace period elapses, in other words after all pre-existing SRCU
+ * read-side critical sections have completed. However, the callback
+ * function might well execute concurrently with other SRCU read-side
+ * critical sections that started after call_srcu() was invoked. SRCU
+ * read-side critical sections are delimited by srcu_read_lock() and
+ * srcu_read_unlock(), and may be nested.
+ *
+ * The callback will be invoked from process context, but must nevertheless
+ * be fast and must not block.
+ */
+void call_srcu(struct srcu_struct *sp, struct rcu_head *rhp,
+ rcu_callback_t func)
+{
+ __call_srcu(sp, rhp, func, true);
+}
+EXPORT_SYMBOL_GPL(call_srcu);
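For illustration, a minimal call_srcu() usage sketch along the lines described
above. The struct foo, the foo_srcu domain, and the foo_reclaim()/foo_retire()
helpers are hypothetical names chosen for the example; they are not part of
this patch.

#include <linux/slab.h>
#include <linux/srcu.h>

struct foo {
	int data;
	struct rcu_head rh;
};

DEFINE_STATIC_SRCU(foo_srcu);	/* hypothetical SRCU domain for the example */

/* Runs in process context after a full SRCU grace period; must not block. */
static void foo_reclaim(struct rcu_head *rhp)
{
	kfree(container_of(rhp, struct foo, rh));
}

/* Defer freeing of @fp until all pre-existing SRCU readers have finished. */
static void foo_retire(struct foo *fp)
{
	call_srcu(&foo_srcu, &fp->rh, foo_reclaim);
}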
+
+/*
+ * Helper function for synchronize_srcu() and synchronize_srcu_expedited().
+ */
+static void __synchronize_srcu(struct srcu_struct *sp, bool do_norm)
+{
+ struct rcu_synchronize rcu;
+
+ RCU_LOCKDEP_WARN(lock_is_held(&sp->dep_map) ||
+ lock_is_held(&rcu_bh_lock_map) ||
+ lock_is_held(&rcu_lock_map) ||
+ lock_is_held(&rcu_sched_lock_map),
+ "Illegal synchronize_srcu() in same-type SRCU (or in RCU) read-side critical section");
+
+ if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE)
+ return;
+ might_sleep();
+ check_init_srcu_struct(sp);
+ init_completion(&rcu.completion);
+ init_rcu_head_on_stack(&rcu.head);
+ __call_srcu(sp, &rcu.head, wakeme_after_rcu, do_norm);
+ wait_for_completion(&rcu.completion);
+ destroy_rcu_head_on_stack(&rcu.head);
+}
+
+/**
+ * synchronize_srcu_expedited - Brute-force SRCU grace period
+ * @sp: srcu_struct with which to synchronize.
+ *
+ * Wait for an SRCU grace period to elapse, but be more aggressive about
+ * spinning rather than blocking when waiting.
+ *
+ * Note that synchronize_srcu_expedited() has the same deadlock and
+ * memory-ordering properties as does synchronize_srcu().
+ */
+void synchronize_srcu_expedited(struct srcu_struct *sp)
+{
+ __synchronize_srcu(sp, rcu_gp_is_normal());
+}
+EXPORT_SYMBOL_GPL(synchronize_srcu_expedited);
+
+/**
+ * synchronize_srcu - wait for prior SRCU read-side critical-section completion
+ * @sp: srcu_struct with which to synchronize.
+ *
+ * Wait for the counts of both indexes to drain to zero. To avoid
+ * possible starvation of synchronize_srcu(), it first waits for the
+ * count of index=((->srcu_idx & 1) ^ 1) to drain to zero, and then
+ * flips ->srcu_idx and waits for the count of the other index to drain.
+ *
+ * Can block; must be called from process context.
+ *
+ * Note that it is illegal to call synchronize_srcu() from the corresponding
+ * SRCU read-side critical section; doing so will result in deadlock.
+ * However, it is perfectly legal to call synchronize_srcu() on one
+ * srcu_struct from some other srcu_struct's read-side critical section,
+ * as long as the resulting graph of srcu_structs is acyclic.
+ *
+ * There are memory-ordering constraints implied by synchronize_srcu().
+ * On systems with more than one CPU, when synchronize_srcu() returns,
+ * each CPU is guaranteed to have executed a full memory barrier since
+ * the end of its last corresponding SRCU read-side critical section
+ * whose beginning preceded the call to synchronize_srcu(). In addition,
+ * each CPU having an SRCU read-side critical section that extends beyond
+ * the return from synchronize_srcu() is guaranteed to have executed a
+ * full memory barrier after the beginning of synchronize_srcu() and before
+ * the beginning of that SRCU read-side critical section. Note that these
+ * guarantees include CPUs that are offline, idle, or executing in user mode,
+ * as well as CPUs that are executing in the kernel.
+ *
+ * Furthermore, if CPU A invoked synchronize_srcu(), which returned
+ * to its caller on CPU B, then both CPU A and CPU B are guaranteed
+ * to have executed a full memory barrier during the execution of
+ * synchronize_srcu(). This guarantee applies even if CPU A and CPU B
+ * are the same CPU, but again only if the system has more than one CPU.
+ *
+ * Of course, these memory-ordering guarantees apply only when
+ * synchronize_srcu(), srcu_read_lock(), and srcu_read_unlock() are
+ * passed the same srcu_struct structure.
+ *
+ * If SRCU is likely idle, expedite the first request. This semantic
+ * was provided by Classic SRCU, and is relied upon by its users, so TREE
+ * SRCU must also provide it. Note that detecting idleness is heuristic
+ * and subject to both false positives and negatives.
+ */
+void synchronize_srcu(struct srcu_struct *sp)
+{
+ if (srcu_might_be_idle(sp) || rcu_gp_is_expedited())
+ synchronize_srcu_expedited(sp);
+ else
+ __synchronize_srcu(sp, true);
+}
+EXPORT_SYMBOL_GPL(synchronize_srcu);
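A hedged sketch of the reader/updater pairing that the ordering guarantees
above are aimed at, reusing the hypothetical struct foo and foo_srcu domain
from the earlier example; foo_ptr, foo_read(), and foo_update() are likewise
illustrative names rather than anything defined by this patch.

#include <linux/slab.h>
#include <linux/srcu.h>

static struct foo __rcu *foo_ptr;	/* hypothetical published pointer */

static int foo_read(void)
{
	struct foo *p;
	int idx, ret = -1;

	idx = srcu_read_lock(&foo_srcu);
	p = srcu_dereference(foo_ptr, &foo_srcu);
	if (p)
		ret = p->data;		/* readers may also sleep here under SRCU */
	srcu_read_unlock(&foo_srcu, idx);
	return ret;
}

static void foo_update(struct foo *newp)
{
	struct foo *oldp = rcu_dereference_protected(foo_ptr, 1);

	rcu_assign_pointer(foo_ptr, newp);
	synchronize_srcu(&foo_srcu);	/* wait for all pre-existing readers */
	kfree(oldp);
}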
+
+/*
+ * Callback function for srcu_barrier() use.
+ */
+static void srcu_barrier_cb(struct rcu_head *rhp)
+{
+ struct srcu_data *sdp;
+ struct srcu_struct *sp;
+
+ sdp = container_of(rhp, struct srcu_data, srcu_barrier_head);
+ sp = sdp->sp;
+ if (atomic_dec_and_test(&sp->srcu_barrier_cpu_cnt))
+ complete(&sp->srcu_barrier_completion);
+}
+
+/**
+ * srcu_barrier - Wait until all in-flight call_srcu() callbacks complete.
+ * @sp: srcu_struct on which to wait for in-flight callbacks.
+ */
+void srcu_barrier(struct srcu_struct *sp)
+{
+ int cpu;
+ struct srcu_data *sdp;
+ unsigned long s = rcu_seq_snap(&sp->srcu_barrier_seq);
+
+ check_init_srcu_struct(sp);
+ mutex_lock(&sp->srcu_barrier_mutex);
+ if (rcu_seq_done(&sp->srcu_barrier_seq, s)) {
+ smp_mb(); /* Force ordering following return. */
+ mutex_unlock(&sp->srcu_barrier_mutex);
+ return; /* Someone else did our work for us. */
+ }
+ rcu_seq_start(&sp->srcu_barrier_seq);
+ init_completion(&sp->srcu_barrier_completion);
+
+ /* Initial count prevents reaching zero until all CBs are posted. */
+ atomic_set(&sp->srcu_barrier_cpu_cnt, 1);
+
+ /*
+ * Each pass through this loop enqueues a callback, but only
+ * on CPUs already having callbacks enqueued. Note that if
+ * a CPU already has callbacks enqueued, it must have already
+ * registered the need for a future grace period, so all we
+ * need do is enqueue a callback that will use the same
+ * grace period as the last callback already in the queue.
+ */
+ for_each_possible_cpu(cpu) {
+ sdp = per_cpu_ptr(sp->sda, cpu);
+ raw_spin_lock_irq_rcu_node(sdp);
+ atomic_inc(&sp->srcu_barrier_cpu_cnt);
+ sdp->srcu_barrier_head.func = srcu_barrier_cb;
+ debug_rcu_head_queue(&sdp->srcu_barrier_head);
+ if (!rcu_segcblist_entrain(&sdp->srcu_cblist,
+ &sdp->srcu_barrier_head, 0)) {
+ debug_rcu_head_unqueue(&sdp->srcu_barrier_head);
+ atomic_dec(&sp->srcu_barrier_cpu_cnt);
+ }
+ raw_spin_unlock_irq_rcu_node(sdp);
+ }
+
+ /* Remove the initial count, at which point reaching zero can happen. */
+ if (atomic_dec_and_test(&sp->srcu_barrier_cpu_cnt))
+ complete(&sp->srcu_barrier_completion);
+ wait_for_completion(&sp->srcu_barrier_completion);
+
+ rcu_seq_end(&sp->srcu_barrier_seq);
+ mutex_unlock(&sp->srcu_barrier_mutex);
+}
+EXPORT_SYMBOL_GPL(srcu_barrier);
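srcu_barrier() is typically used at teardown time: once the caller has ensured
that no further call_srcu() invocations can occur, it drains the callbacks
already in flight so that the srcu_struct can safely be cleaned up. A minimal
sketch, assuming a hypothetical foo_dyn_srcu domain that is dynamically
initialized rather than statically defined:

static struct srcu_struct foo_dyn_srcu;	/* hypothetical dynamic domain */

static int foo_setup(void)
{
	return init_srcu_struct(&foo_dyn_srcu);
}

static void foo_teardown(void)
{
	/* No new call_srcu() invocations can occur past this point. */
	srcu_barrier(&foo_dyn_srcu);		/* wait for in-flight callbacks */
	cleanup_srcu_struct(&foo_dyn_srcu);	/* then release the domain's resources */
}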
+
+/**
+ * srcu_batches_completed - return batches completed.
+ * @sp: srcu_struct on which to report batch completion.
+ *
+ * Report the number of batches, correlated with, but not necessarily
+ * precisely the same as, the number of grace periods that have elapsed.
+ */
+unsigned long srcu_batches_completed(struct srcu_struct *sp)
+{
+ return sp->srcu_idx;
+}
+EXPORT_SYMBOL_GPL(srcu_batches_completed);
+
+/*
+ * Core SRCU state machine. Push state bits of ->srcu_gp_seq
+ * to SRCU_STATE_SCAN2, and invoke srcu_gp_end() when scan has
+ * completed in that state.
+ */
+static void srcu_advance_state(struct srcu_struct *sp)
+{
+ int idx;
+
+ mutex_lock(&sp->srcu_gp_mutex);
+
+ /*
+ * Because readers might be delayed for an extended period after
+ * fetching ->srcu_idx for their index, at any point in time there
+ * might well be readers using both idx=0 and idx=1. We therefore
+ * need to wait for readers to clear from both index values before
+ * invoking a callback.
+ *
+ * The load-acquire ensures that we see the accesses performed
+ * by the prior grace period.
+ */
+ idx = rcu_seq_state(smp_load_acquire(&sp->srcu_gp_seq)); /* ^^^ */
+ if (idx == SRCU_STATE_IDLE) {
+ raw_spin_lock_irq_rcu_node(sp);
+ if (ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)) {
+ WARN_ON_ONCE(rcu_seq_state(sp->srcu_gp_seq));
+ raw_spin_unlock_irq_rcu_node(sp);
+ mutex_unlock(&sp->srcu_gp_mutex);
+ return;
+ }
+ idx = rcu_seq_state(READ_ONCE(sp->srcu_gp_seq));
+ if (idx == SRCU_STATE_IDLE)
+ srcu_gp_start(sp);
+ raw_spin_unlock_irq_rcu_node(sp);
+ if (idx != SRCU_STATE_IDLE) {
+ mutex_unlock(&sp->srcu_gp_mutex);
+ return; /* Someone else started the grace period. */
+ }
+ }
+
+ if (rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) == SRCU_STATE_SCAN1) {
+ idx = 1 ^ (sp->srcu_idx & 1);
+ if (!try_check_zero(sp, idx, 1)) {
+ mutex_unlock(&sp->srcu_gp_mutex);
+ return; /* readers present, retry later. */
+ }
+ srcu_flip(sp);
+ rcu_seq_set_state(&sp->srcu_gp_seq, SRCU_STATE_SCAN2);
+ }
+
+ if (rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) == SRCU_STATE_SCAN2) {
+
+ /*
+ * SRCU read-side critical sections are normally short,
+ * so check at least twice in quick succession after a flip.
+ */
+ idx = 1 ^ (sp->srcu_idx & 1);
+ if (!try_check_zero(sp, idx, 2)) {
+ mutex_unlock(&sp->srcu_gp_mutex);
+ return; /* readers present, retry later. */
+ }
+ srcu_gp_end(sp); /* Releases ->srcu_gp_mutex. */
+ }
+}
+
+/*
+ * Invoke a limited number of SRCU callbacks that have passed through
+ * their grace period. If there are more to do, SRCU will reschedule
+ * the workqueue. Note that needed memory barriers have been executed
+ * in this task's context by srcu_readers_active_idx_check().
+ */
+static void srcu_invoke_callbacks(struct work_struct *work)
+{
+ bool more;
+ struct rcu_cblist ready_cbs;
+ struct rcu_head *rhp;
+ struct srcu_data *sdp;
+ struct srcu_struct *sp;
+
+ sdp = container_of(work, struct srcu_data, work.work);
+ sp = sdp->sp;
+ rcu_cblist_init(&ready_cbs);
+ raw_spin_lock_irq_rcu_node(sdp);
+ rcu_segcblist_advance(&sdp->srcu_cblist,
+ rcu_seq_current(&sp->srcu_gp_seq));
+ if (sdp->srcu_cblist_invoking ||
+ !rcu_segcblist_ready_cbs(&sdp->srcu_cblist)) {
+ raw_spin_unlock_irq_rcu_node(sdp);
+ return; /* Someone else on the job or nothing to do. */
+ }
+
+ /* We are on the job! Extract and invoke ready callbacks. */
+ sdp->srcu_cblist_invoking = true;
+ rcu_segcblist_extract_done_cbs(&sdp->srcu_cblist, &ready_cbs);
+ raw_spin_unlock_irq_rcu_node(sdp);
+ rhp = rcu_cblist_dequeue(&ready_cbs);
+ for (; rhp != NULL; rhp = rcu_cblist_dequeue(&ready_cbs)) {
+ debug_rcu_head_unqueue(rhp);
+ local_bh_disable();
+ rhp->func(rhp);
+ local_bh_enable();
+ }
+
+ /*
+ * Update counts, accelerate new callbacks, and if needed,
+ * schedule another round of callback invocation.
+ */
+ raw_spin_lock_irq_rcu_node(sdp);
+ rcu_segcblist_insert_count(&sdp->srcu_cblist, &ready_cbs);
+ (void)rcu_segcblist_accelerate(&sdp->srcu_cblist,
+ rcu_seq_snap(&sp->srcu_gp_seq));
+ sdp->srcu_cblist_invoking = false;
+ more = rcu_segcblist_ready_cbs(&sdp->srcu_cblist);
+ raw_spin_unlock_irq_rcu_node(sdp);
+ if (more)
+ srcu_schedule_cbs_sdp(sdp, 0);
+}
+
+/*
+ * Finished one round of SRCU grace period. Start another if there are
+ * more SRCU callbacks queued, otherwise put SRCU into not-running state.
+ */
+static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay)
+{
+ bool pushgp = true;
+
+ raw_spin_lock_irq_rcu_node(sp);
+ if (ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)) {
+ if (!WARN_ON_ONCE(rcu_seq_state(sp->srcu_gp_seq))) {
+ /* All requests fulfilled, time to go idle. */
+ pushgp = false;
+ }
+ } else if (!rcu_seq_state(sp->srcu_gp_seq)) {
+ /* Outstanding request and no GP. Start one. */
+ srcu_gp_start(sp);
+ }
+ raw_spin_unlock_irq_rcu_node(sp);
+
+ if (pushgp)
+ queue_delayed_work(system_power_efficient_wq, &sp->work, delay);
+}
+
+/*
+ * This is the work-queue function that handles SRCU grace periods.
+ */
+void process_srcu(struct work_struct *work)
+{
+ struct srcu_struct *sp;
+
+ sp = container_of(work, struct srcu_struct, work.work);
+
+ srcu_advance_state(sp);
+ srcu_reschedule(sp, srcu_get_delay(sp));
+}
+EXPORT_SYMBOL_GPL(process_srcu);
+
+void srcutorture_get_gp_data(enum rcutorture_type test_type,
+ struct srcu_struct *sp, int *flags,
+ unsigned long *gpnum, unsigned long *completed)
+{
+ if (test_type != SRCU_FLAVOR)
+ return;
+ *flags = 0;
+ *completed = rcu_seq_ctr(sp->srcu_gp_seq);
+ *gpnum = rcu_seq_ctr(sp->srcu_gp_seq_needed);
+}
+EXPORT_SYMBOL_GPL(srcutorture_get_gp_data);
+
+static int __init srcu_bootup_announce(void)
+{
+ pr_info("Hierarchical SRCU implementation.\n");
+ if (exp_holdoff != DEFAULT_SRCU_EXP_HOLDOFF)
+ pr_info("\tNon-default auto-expedite holdoff of %lu ns.\n", exp_holdoff);
+ return 0;
+}
+early_initcall(srcu_bootup_announce);
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index 6ad330dbbae2..f8488965250f 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -35,15 +35,26 @@
#include <linux/time.h>
#include <linux/cpu.h>
#include <linux/prefetch.h>
-#include <linux/trace_events.h>
#include "rcu.h"
-/* Forward declarations for tiny_plugin.h. */
-struct rcu_ctrlblk;
-static void __call_rcu(struct rcu_head *head,
- rcu_callback_t func,
- struct rcu_ctrlblk *rcp);
+/* Global control variables for rcupdate callback mechanism. */
+struct rcu_ctrlblk {
+ struct rcu_head *rcucblist; /* List of pending callbacks (CBs). */
+ struct rcu_head **donetail; /* ->next pointer of last "done" CB. */
+ struct rcu_head **curtail; /* ->next pointer of last CB. */
+};
+
+/* Definition for rcupdate control block. */
+static struct rcu_ctrlblk rcu_sched_ctrlblk = {
+ .donetail = &rcu_sched_ctrlblk.rcucblist,
+ .curtail = &rcu_sched_ctrlblk.rcucblist,
+};
+
+static struct rcu_ctrlblk rcu_bh_ctrlblk = {
+ .donetail = &rcu_bh_ctrlblk.rcucblist,
+ .curtail = &rcu_bh_ctrlblk.rcucblist,
+};
#include "tiny_plugin.h"
@@ -59,19 +70,6 @@ void rcu_barrier_sched(void)
}
EXPORT_SYMBOL(rcu_barrier_sched);
-#if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE)
-
-/*
- * Test whether RCU thinks that the current CPU is idle.
- */
-bool notrace __rcu_is_watching(void)
-{
- return true;
-}
-EXPORT_SYMBOL(__rcu_is_watching);
-
-#endif /* defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) */
-
/*
* Helper function for rcu_sched_qs() and rcu_bh_qs().
* Also irqs are disabled to avoid confusion due to interrupt handlers
@@ -79,7 +77,6 @@ EXPORT_SYMBOL(__rcu_is_watching);
*/
static int rcu_qsctr_help(struct rcu_ctrlblk *rcp)
{
- RCU_TRACE(reset_cpu_stall_ticks(rcp));
if (rcp->donetail != rcp->curtail) {
rcp->donetail = rcp->curtail;
return 1;
@@ -125,7 +122,6 @@ void rcu_bh_qs(void)
*/
void rcu_check_callbacks(int user)
{
- RCU_TRACE(check_cpu_stalls());
if (user)
rcu_sched_qs();
else if (!in_softirq())
@@ -140,10 +136,8 @@ void rcu_check_callbacks(int user)
*/
static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
{
- const char *rn = NULL;
struct rcu_head *next, *list;
unsigned long flags;
- RCU_TRACE(int cb_count = 0);
/* Move the ready-to-invoke callbacks to a local list. */
local_irq_save(flags);
@@ -152,7 +146,6 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
local_irq_restore(flags);
return;
}
- RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, rcp->qlen, -1));
list = rcp->rcucblist;
rcp->rcucblist = *rcp->donetail;
*rcp->donetail = NULL;
@@ -162,22 +155,15 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
local_irq_restore(flags);
/* Invoke the callbacks on the local list. */
- RCU_TRACE(rn = rcp->name);
while (list) {
next = list->next;
prefetch(next);
debug_rcu_head_unqueue(list);
local_bh_disable();
- __rcu_reclaim(rn, list);
+ __rcu_reclaim("", list);
local_bh_enable();
list = next;
- RCU_TRACE(cb_count++);
}
- RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count));
- RCU_TRACE(trace_rcu_batch_end(rcp->name,
- cb_count, 0, need_resched(),
- is_idle_task(current),
- false));
}
static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused)
@@ -221,7 +207,6 @@ static void __call_rcu(struct rcu_head *head,
local_irq_save(flags);
*rcp->curtail = head;
rcp->curtail = &head->next;
- RCU_TRACE(rcp->qlen++);
local_irq_restore(flags);
if (unlikely(is_idle_task(current))) {
@@ -254,8 +239,5 @@ EXPORT_SYMBOL_GPL(call_rcu_bh);
void __init rcu_init(void)
{
open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
- RCU_TRACE(reset_cpu_stall_ticks(&rcu_sched_ctrlblk));
- RCU_TRACE(reset_cpu_stall_ticks(&rcu_bh_ctrlblk));
-
rcu_early_boot_tests();
}
diff --git a/kernel/rcu/tiny_plugin.h b/kernel/rcu/tiny_plugin.h
index c64b827ecbca..f0a01b2a3062 100644
--- a/kernel/rcu/tiny_plugin.h
+++ b/kernel/rcu/tiny_plugin.h
@@ -22,37 +22,7 @@
* Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
*/
-#include <linux/kthread.h>
-#include <linux/init.h>
-#include <linux/debugfs.h>
-#include <linux/seq_file.h>
-
-/* Global control variables for rcupdate callback mechanism. */
-struct rcu_ctrlblk {
- struct rcu_head *rcucblist; /* List of pending callbacks (CBs). */
- struct rcu_head **donetail; /* ->next pointer of last "done" CB. */
- struct rcu_head **curtail; /* ->next pointer of last CB. */
- RCU_TRACE(long qlen); /* Number of pending CBs. */
- RCU_TRACE(unsigned long gp_start); /* Start time for stalls. */
- RCU_TRACE(unsigned long ticks_this_gp); /* Statistic for stalls. */
- RCU_TRACE(unsigned long jiffies_stall); /* Jiffies at next stall. */
- RCU_TRACE(const char *name); /* Name of RCU type. */
-};
-
-/* Definition for rcupdate control block. */
-static struct rcu_ctrlblk rcu_sched_ctrlblk = {
- .donetail = &rcu_sched_ctrlblk.rcucblist,
- .curtail = &rcu_sched_ctrlblk.rcucblist,
- RCU_TRACE(.name = "rcu_sched")
-};
-
-static struct rcu_ctrlblk rcu_bh_ctrlblk = {
- .donetail = &rcu_bh_ctrlblk.rcucblist,
- .curtail = &rcu_bh_ctrlblk.rcucblist,
- RCU_TRACE(.name = "rcu_bh")
-};
-
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
+#if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU)
#include <linux/kernel_stat.h>
int rcu_scheduler_active __read_mostly;
@@ -65,105 +35,13 @@ EXPORT_SYMBOL_GPL(rcu_scheduler_active);
* to RCU_SCHEDULER_RUNNING, skipping the RCU_SCHEDULER_INIT stage.
* The reason for this is that Tiny RCU does not need kthreads, so does
* not have to care about the fact that the scheduler is half-initialized
- * at a certain phase of the boot process.
+ * at a certain phase of the boot process. Unless SRCU is in the mix.
*/
void __init rcu_scheduler_starting(void)
{
WARN_ON(nr_context_switches() > 0);
- rcu_scheduler_active = RCU_SCHEDULER_RUNNING;
-}
-
-#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
-
-#ifdef CONFIG_RCU_TRACE
-
-static void rcu_trace_sub_qlen(struct rcu_ctrlblk *rcp, int n)
-{
- unsigned long flags;
-
- local_irq_save(flags);
- rcp->qlen -= n;
- local_irq_restore(flags);
-}
-
-/*
- * Dump statistics for TINY_RCU, such as they are.
- */
-static int show_tiny_stats(struct seq_file *m, void *unused)
-{
- seq_printf(m, "rcu_sched: qlen: %ld\n", rcu_sched_ctrlblk.qlen);
- seq_printf(m, "rcu_bh: qlen: %ld\n", rcu_bh_ctrlblk.qlen);
- return 0;
-}
-
-static int show_tiny_stats_open(struct inode *inode, struct file *file)
-{
- return single_open(file, show_tiny_stats, NULL);
-}
-
-static const struct file_operations show_tiny_stats_fops = {
- .owner = THIS_MODULE,
- .open = show_tiny_stats_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
-
-static struct dentry *rcudir;
-
-static int __init rcutiny_trace_init(void)
-{
- struct dentry *retval;
-
- rcudir = debugfs_create_dir("rcu", NULL);
- if (!rcudir)
- goto free_out;
- retval = debugfs_create_file("rcudata", 0444, rcudir,
- NULL, &show_tiny_stats_fops);
- if (!retval)
- goto free_out;
- return 0;
-free_out:
- debugfs_remove_recursive(rcudir);
- return 1;
-}
-device_initcall(rcutiny_trace_init);
-
-static void check_cpu_stall(struct rcu_ctrlblk *rcp)
-{
- unsigned long j;
- unsigned long js;
-
- if (rcu_cpu_stall_suppress)
- return;
- rcp->ticks_this_gp++;
- j = jiffies;
- js = READ_ONCE(rcp->jiffies_stall);
- if (rcp->rcucblist && ULONG_CMP_GE(j, js)) {
- pr_err("INFO: %s stall on CPU (%lu ticks this GP) idle=%llx (t=%lu jiffies q=%ld)\n",
- rcp->name, rcp->ticks_this_gp, DYNTICK_TASK_EXIT_IDLE,
- jiffies - rcp->gp_start, rcp->qlen);
- dump_stack();
- WRITE_ONCE(rcp->jiffies_stall,
- jiffies + 3 * rcu_jiffies_till_stall_check() + 3);
- } else if (ULONG_CMP_GE(j, js)) {
- WRITE_ONCE(rcp->jiffies_stall,
- jiffies + rcu_jiffies_till_stall_check());
- }
-}
-
-static void reset_cpu_stall_ticks(struct rcu_ctrlblk *rcp)
-{
- rcp->ticks_this_gp = 0;
- rcp->gp_start = jiffies;
- WRITE_ONCE(rcp->jiffies_stall,
- jiffies + rcu_jiffies_till_stall_check());
-}
-
-static void check_cpu_stalls(void)
-{
- RCU_TRACE(check_cpu_stall(&rcu_bh_ctrlblk));
- RCU_TRACE(check_cpu_stall(&rcu_sched_ctrlblk));
+ rcu_scheduler_active = IS_ENABLED(CONFIG_SRCU)
+ ? RCU_SCHEDULER_INIT : RCU_SCHEDULER_RUNNING;
}
-#endif /* #ifdef CONFIG_RCU_TRACE */
+#endif /* #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU) */
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index a6dcf3bd244f..51d4c3acf32d 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -98,8 +98,8 @@ struct rcu_state sname##_state = { \
.gpnum = 0UL - 300UL, \
.completed = 0UL - 300UL, \
.orphan_lock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.orphan_lock), \
- .orphan_nxttail = &sname##_state.orphan_nxtlist, \
- .orphan_donetail = &sname##_state.orphan_donelist, \
+ .orphan_pend = RCU_CBLIST_INITIALIZER(sname##_state.orphan_pend), \
+ .orphan_done = RCU_CBLIST_INITIALIZER(sname##_state.orphan_done), \
.barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \
.name = RCU_STATE_NAME(sname), \
.abbr = sabbr, \
@@ -124,7 +124,7 @@ static int rcu_fanout_leaf = RCU_FANOUT_LEAF;
module_param(rcu_fanout_leaf, int, 0444);
int rcu_num_lvls __read_mostly = RCU_NUM_LVLS;
/* Number of rcu_nodes at specified level. */
-static int num_rcu_lvl[] = NUM_RCU_LVL_INIT;
+int num_rcu_lvl[] = NUM_RCU_LVL_INIT;
int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */
/* panic() on RCU Stall sysctl. */
int sysctl_panic_on_rcu_stall __read_mostly;
@@ -168,39 +168,21 @@ static void rcu_report_exp_rdp(struct rcu_state *rsp,
static void sync_sched_exp_online_cleanup(int cpu);
/* rcuc/rcub kthread realtime priority */
-#ifdef CONFIG_RCU_KTHREAD_PRIO
-static int kthread_prio = CONFIG_RCU_KTHREAD_PRIO;
-#else /* #ifdef CONFIG_RCU_KTHREAD_PRIO */
static int kthread_prio = IS_ENABLED(CONFIG_RCU_BOOST) ? 1 : 0;
-#endif /* #else #ifdef CONFIG_RCU_KTHREAD_PRIO */
module_param(kthread_prio, int, 0644);
/* Delay in jiffies for grace-period initialization delays, debug only. */
-#ifdef CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT
-static int gp_preinit_delay = CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT_DELAY;
-module_param(gp_preinit_delay, int, 0644);
-#else /* #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT */
-static const int gp_preinit_delay;
-#endif /* #else #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT */
-
-#ifdef CONFIG_RCU_TORTURE_TEST_SLOW_INIT
-static int gp_init_delay = CONFIG_RCU_TORTURE_TEST_SLOW_INIT_DELAY;
-module_param(gp_init_delay, int, 0644);
-#else /* #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_INIT */
-static const int gp_init_delay;
-#endif /* #else #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_INIT */
-
-#ifdef CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP
-static int gp_cleanup_delay = CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP_DELAY;
-module_param(gp_cleanup_delay, int, 0644);
-#else /* #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP */
-static const int gp_cleanup_delay;
-#endif /* #else #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP */
+static int gp_preinit_delay;
+module_param(gp_preinit_delay, int, 0444);
+static int gp_init_delay;
+module_param(gp_init_delay, int, 0444);
+static int gp_cleanup_delay;
+module_param(gp_cleanup_delay, int, 0444);
/*
* Number of grace periods between delays, normalized by the duration of
- * the delay. The longer the the delay, the more the grace periods between
+ * the delay. The longer the delay, the more the grace periods between
* each delay. The reason for this normalization is that it means that,
* for non-zero delays, the overall slowdown of grace periods is constant
* regardless of the duration of the delay. This arrangement balances
@@ -250,6 +232,7 @@ static int rcu_gp_in_progress(struct rcu_state *rsp)
*/
void rcu_sched_qs(void)
{
+ RCU_LOCKDEP_WARN(preemptible(), "rcu_sched_qs() invoked with preemption enabled!!!");
if (!__this_cpu_read(rcu_sched_data.cpu_no_qs.s))
return;
trace_rcu_grace_period(TPS("rcu_sched"),
@@ -265,6 +248,7 @@ void rcu_sched_qs(void)
void rcu_bh_qs(void)
{
+ RCU_LOCKDEP_WARN(preemptible(), "rcu_bh_qs() invoked with preemption enabled!!!");
if (__this_cpu_read(rcu_bh_data.cpu_no_qs.s)) {
trace_rcu_grace_period(TPS("rcu_bh"),
__this_cpu_read(rcu_bh_data.gpnum),
@@ -273,15 +257,19 @@ void rcu_bh_qs(void)
}
}
-static DEFINE_PER_CPU(int, rcu_sched_qs_mask);
+/*
+ * Steal a bit from the bottom of ->dynticks for idle entry/exit
+ * control. Initially this is for TLB flushing.
+ */
+#define RCU_DYNTICK_CTRL_MASK 0x1
+#define RCU_DYNTICK_CTRL_CTR (RCU_DYNTICK_CTRL_MASK + 1)
+#ifndef rcu_eqs_special_exit
+#define rcu_eqs_special_exit() do { } while (0)
+#endif
static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
.dynticks_nesting = DYNTICK_TASK_EXIT_IDLE,
- .dynticks = ATOMIC_INIT(1),
-#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
- .dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE,
- .dynticks_idle = ATOMIC_INIT(1),
-#endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
+ .dynticks = ATOMIC_INIT(RCU_DYNTICK_CTRL_CTR),
};
/*
@@ -305,15 +293,20 @@ bool rcu_irq_enter_disabled(void)
static void rcu_dynticks_eqs_enter(void)
{
struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
- int special;
+ int seq;
/*
- * CPUs seeing atomic_inc_return() must see prior RCU read-side
+ * CPUs seeing atomic_add_return() must see prior RCU read-side
* critical sections, and we also must force ordering with the
* next idle sojourn.
*/
- special = atomic_inc_return(&rdtp->dynticks);
- WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && special & 0x1);
+ seq = atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdtp->dynticks);
+ /* Better be in an extended quiescent state! */
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
+ (seq & RCU_DYNTICK_CTRL_CTR));
+ /* Better not have special action (TLB flush) pending! */
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
+ (seq & RCU_DYNTICK_CTRL_MASK));
}
/*
@@ -323,15 +316,22 @@ static void rcu_dynticks_eqs_enter(void)
static void rcu_dynticks_eqs_exit(void)
{
struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
- int special;
+ int seq;
/*
- * CPUs seeing atomic_inc_return() must see prior idle sojourns,
+ * CPUs seeing atomic_add_return() must see prior idle sojourns,
* and we also must force ordering with the next RCU read-side
* critical section.
*/
- special = atomic_inc_return(&rdtp->dynticks);
- WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !(special & 0x1));
+ seq = atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdtp->dynticks);
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
+ !(seq & RCU_DYNTICK_CTRL_CTR));
+ if (seq & RCU_DYNTICK_CTRL_MASK) {
+ atomic_andnot(RCU_DYNTICK_CTRL_MASK, &rdtp->dynticks);
+ smp_mb__after_atomic(); /* _exit after clearing mask. */
+ /* Prefer duplicate flushes to losing a flush. */
+ rcu_eqs_special_exit();
+ }
}
/*
@@ -348,9 +348,9 @@ static void rcu_dynticks_eqs_online(void)
{
struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
- if (atomic_read(&rdtp->dynticks) & 0x1)
+ if (atomic_read(&rdtp->dynticks) & RCU_DYNTICK_CTRL_CTR)
return;
- atomic_add(0x1, &rdtp->dynticks);
+ atomic_add(RCU_DYNTICK_CTRL_CTR, &rdtp->dynticks);
}
/*
@@ -362,7 +362,7 @@ bool rcu_dynticks_curr_cpu_in_eqs(void)
{
struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
- return !(atomic_read(&rdtp->dynticks) & 0x1);
+ return !(atomic_read(&rdtp->dynticks) & RCU_DYNTICK_CTRL_CTR);
}
/*
@@ -373,7 +373,7 @@ int rcu_dynticks_snap(struct rcu_dynticks *rdtp)
{
int snap = atomic_add_return(0, &rdtp->dynticks);
- return snap;
+ return snap & ~RCU_DYNTICK_CTRL_MASK;
}
/*
@@ -382,7 +382,7 @@ int rcu_dynticks_snap(struct rcu_dynticks *rdtp)
*/
static bool rcu_dynticks_in_eqs(int snap)
{
- return !(snap & 0x1);
+ return !(snap & RCU_DYNTICK_CTRL_CTR);
}
/*
@@ -402,14 +402,34 @@ static bool rcu_dynticks_in_eqs_since(struct rcu_dynticks *rdtp, int snap)
static void rcu_dynticks_momentary_idle(void)
{
struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
- int special = atomic_add_return(2, &rdtp->dynticks);
+ int special = atomic_add_return(2 * RCU_DYNTICK_CTRL_CTR,
+ &rdtp->dynticks);
/* It is illegal to call this from idle state. */
- WARN_ON_ONCE(!(special & 0x1));
+ WARN_ON_ONCE(!(special & RCU_DYNTICK_CTRL_CTR));
}
-DEFINE_PER_CPU_SHARED_ALIGNED(unsigned long, rcu_qs_ctr);
-EXPORT_PER_CPU_SYMBOL_GPL(rcu_qs_ctr);
+/*
+ * Set the special (bottom) bit of the specified CPU so that it
+ * will take special action (such as flushing its TLB) on the
+ * next exit from an extended quiescent state. Returns true if
+ * the bit was successfully set, or false if the CPU was not in
+ * an extended quiescent state.
+ */
+bool rcu_eqs_special_set(int cpu)
+{
+ int old;
+ int new;
+ struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
+
+ do {
+ old = atomic_read(&rdtp->dynticks);
+ if (old & RCU_DYNTICK_CTRL_CTR)
+ return false;
+ new = old | RCU_DYNTICK_CTRL_MASK;
+ } while (atomic_cmpxchg(&rdtp->dynticks, old, new) != old);
+ return true;
+}
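The intended consumer of rcu_eqs_special_set() is code that wants to avoid
interrupting idle CPUs for per-CPU maintenance work such as TLB flushes. A
hedged sketch, assuming a hypothetical foo_flush_local() handler and an
architecture that implements rcu_eqs_special_exit() to perform the deferred
flush:

static void foo_flush_local(void *unused)
{
	/* Hypothetical per-CPU flush work, also run from rcu_eqs_special_exit(). */
}

static void foo_flush_cpu(int cpu)
{
	/*
	 * If the CPU is in an extended quiescent state, the special bit
	 * defers the flush to its next EQS exit; otherwise interrupt it now.
	 */
	if (!rcu_eqs_special_set(cpu))
		smp_call_function_single(cpu, foo_flush_local, NULL, 1);
}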
/*
* Let the RCU core know that this CPU has gone through the scheduler,
@@ -418,44 +438,14 @@ EXPORT_PER_CPU_SYMBOL_GPL(rcu_qs_ctr);
* memory barriers to let the RCU core know about it, regardless of what
* this CPU might (or might not) do in the near future.
*
- * We inform the RCU core by emulating a zero-duration dyntick-idle
- * period, which we in turn do by incrementing the ->dynticks counter
- * by two.
+ * We inform the RCU core by emulating a zero-duration dyntick-idle period.
*
* The caller must have disabled interrupts.
*/
static void rcu_momentary_dyntick_idle(void)
{
- struct rcu_data *rdp;
- int resched_mask;
- struct rcu_state *rsp;
-
- /*
- * Yes, we can lose flag-setting operations. This is OK, because
- * the flag will be set again after some delay.
- */
- resched_mask = raw_cpu_read(rcu_sched_qs_mask);
- raw_cpu_write(rcu_sched_qs_mask, 0);
-
- /* Find the flavor that needs a quiescent state. */
- for_each_rcu_flavor(rsp) {
- rdp = raw_cpu_ptr(rsp->rda);
- if (!(resched_mask & rsp->flavor_mask))
- continue;
- smp_mb(); /* rcu_sched_qs_mask before cond_resched_completed. */
- if (READ_ONCE(rdp->mynode->completed) !=
- READ_ONCE(rdp->cond_resched_completed))
- continue;
-
- /*
- * Pretend to be momentarily idle for the quiescent state.
- * This allows the grace-period kthread to record the
- * quiescent state, with no need for this CPU to do anything
- * further.
- */
- rcu_dynticks_momentary_idle();
- break;
- }
+ raw_cpu_write(rcu_dynticks.rcu_need_heavy_qs, false);
+ rcu_dynticks_momentary_idle();
}
/*
@@ -463,14 +453,22 @@ static void rcu_momentary_dyntick_idle(void)
* and requires special handling for preemptible RCU.
* The caller must have disabled interrupts.
*/
-void rcu_note_context_switch(void)
+void rcu_note_context_switch(bool preempt)
{
barrier(); /* Avoid RCU read-side critical sections leaking down. */
trace_rcu_utilization(TPS("Start context switch"));
rcu_sched_qs();
- rcu_preempt_note_context_switch();
- if (unlikely(raw_cpu_read(rcu_sched_qs_mask)))
+ rcu_preempt_note_context_switch(preempt);
+ /* Load rcu_urgent_qs before other flags. */
+ if (!smp_load_acquire(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs)))
+ goto out;
+ this_cpu_write(rcu_dynticks.rcu_urgent_qs, false);
+ if (unlikely(raw_cpu_read(rcu_dynticks.rcu_need_heavy_qs)))
rcu_momentary_dyntick_idle();
+ this_cpu_inc(rcu_dynticks.rcu_qs_ctr);
+ if (!preempt)
+ rcu_note_voluntary_context_switch_lite(current);
+out:
trace_rcu_utilization(TPS("End context switch"));
barrier(); /* Avoid RCU read-side critical sections leaking up. */
}
@@ -493,35 +491,35 @@ void rcu_all_qs(void)
{
unsigned long flags;
+ if (!raw_cpu_read(rcu_dynticks.rcu_urgent_qs))
+ return;
+ preempt_disable();
+ /* Load rcu_urgent_qs before other flags. */
+ if (!smp_load_acquire(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs))) {
+ preempt_enable();
+ return;
+ }
+ this_cpu_write(rcu_dynticks.rcu_urgent_qs, false);
barrier(); /* Avoid RCU read-side critical sections leaking down. */
- if (unlikely(raw_cpu_read(rcu_sched_qs_mask))) {
+ if (unlikely(raw_cpu_read(rcu_dynticks.rcu_need_heavy_qs))) {
local_irq_save(flags);
rcu_momentary_dyntick_idle();
local_irq_restore(flags);
}
- if (unlikely(raw_cpu_read(rcu_sched_data.cpu_no_qs.b.exp))) {
- /*
- * Yes, we just checked a per-CPU variable with preemption
- * enabled, so we might be migrated to some other CPU at
- * this point. That is OK because in that case, the
- * migration will supply the needed quiescent state.
- * We might end up needlessly disabling preemption and
- * invoking rcu_sched_qs() on the destination CPU, but
- * the probability and cost are both quite low, so this
- * should not be a problem in practice.
- */
- preempt_disable();
+ if (unlikely(raw_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)))
rcu_sched_qs();
- preempt_enable();
- }
- this_cpu_inc(rcu_qs_ctr);
+ this_cpu_inc(rcu_dynticks.rcu_qs_ctr);
barrier(); /* Avoid RCU read-side critical sections leaking up. */
+ preempt_enable();
}
EXPORT_SYMBOL_GPL(rcu_all_qs);
-static long blimit = 10; /* Maximum callbacks per rcu_do_batch. */
-static long qhimark = 10000; /* If this many pending, ignore blimit. */
-static long qlowmark = 100; /* Once only this many pending, use blimit. */
+#define DEFAULT_RCU_BLIMIT 10 /* Maximum callbacks per rcu_do_batch. */
+static long blimit = DEFAULT_RCU_BLIMIT;
+#define DEFAULT_RCU_QHIMARK 10000 /* If this many pending, ignore blimit. */
+static long qhimark = DEFAULT_RCU_QHIMARK;
+#define DEFAULT_RCU_QLOMARK 100 /* Once only this many pending, use blimit. */
+static long qlowmark = DEFAULT_RCU_QLOMARK;
module_param(blimit, long, 0444);
module_param(qhimark, long, 0444);
@@ -544,10 +542,7 @@ module_param(jiffies_till_sched_qs, ulong, 0644);
static bool rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
struct rcu_data *rdp);
-static void force_qs_rnp(struct rcu_state *rsp,
- int (*f)(struct rcu_data *rsp, bool *isidle,
- unsigned long *maxj),
- bool *isidle, unsigned long *maxj);
+static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *rsp));
static void force_quiescent_state(struct rcu_state *rsp);
static int rcu_pending(void);
@@ -704,15 +699,11 @@ void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags,
default:
break;
}
- if (rsp != NULL) {
- *flags = READ_ONCE(rsp->gp_flags);
- *gpnum = READ_ONCE(rsp->gpnum);
- *completed = READ_ONCE(rsp->completed);
+ if (rsp == NULL)
return;
- }
- *flags = 0;
- *gpnum = 0;
- *completed = 0;
+ *flags = READ_ONCE(rsp->gp_flags);
+ *gpnum = READ_ONCE(rsp->gpnum);
+ *completed = READ_ONCE(rsp->completed);
}
EXPORT_SYMBOL_GPL(rcutorture_get_gp_data);
@@ -728,16 +719,6 @@ void rcutorture_record_progress(unsigned long vernum)
EXPORT_SYMBOL_GPL(rcutorture_record_progress);
/*
- * Does the CPU have callbacks ready to be invoked?
- */
-static int
-cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp)
-{
- return &rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL] &&
- rdp->nxttail[RCU_NEXT_TAIL] != NULL;
-}
-
-/*
* Return the root node of the specified rcu_state structure.
*/
static struct rcu_node *rcu_get_root(struct rcu_state *rsp)
@@ -756,6 +737,7 @@ static int rcu_future_needs_gp(struct rcu_state *rsp)
int idx = (READ_ONCE(rnp->completed) + 1) & 0x1;
int *fp = &rnp->need_future_gp[idx];
+ RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_future_needs_gp() invoked with irqs enabled!!!");
return READ_ONCE(*fp);
}
@@ -767,21 +749,18 @@ static int rcu_future_needs_gp(struct rcu_state *rsp)
static bool
cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
{
- int i;
-
+ RCU_LOCKDEP_WARN(!irqs_disabled(), "cpu_needs_another_gp() invoked with irqs enabled!!!");
if (rcu_gp_in_progress(rsp))
return false; /* No, a grace period is already in progress. */
if (rcu_future_needs_gp(rsp))
return true; /* Yes, a no-CBs CPU needs one. */
- if (!rdp->nxttail[RCU_NEXT_TAIL])
+ if (!rcu_segcblist_is_enabled(&rdp->cblist))
return false; /* No, this is a no-CBs (or offline) CPU. */
- if (*rdp->nxttail[RCU_NEXT_READY_TAIL])
+ if (!rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL))
return true; /* Yes, CPU has newly registered callbacks. */
- for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++)
- if (rdp->nxttail[i - 1] != rdp->nxttail[i] &&
- ULONG_CMP_LT(READ_ONCE(rsp->completed),
- rdp->nxtcompleted[i]))
- return true; /* Yes, CBs for future grace period. */
+ if (rcu_segcblist_future_gp_needed(&rdp->cblist,
+ READ_ONCE(rsp->completed)))
+ return true; /* Yes, CBs for future grace period. */
return false; /* No grace period needed. */
}
@@ -797,6 +776,7 @@ static void rcu_eqs_enter_common(bool user)
struct rcu_data *rdp;
struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
+ RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_eqs_enter_common() invoked with irqs enabled!!!");
trace_rcu_dyntick(TPS("Start"), rdtp->dynticks_nesting, 0);
if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
!user && !is_idle_task(current)) {
@@ -867,7 +847,6 @@ void rcu_idle_enter(void)
local_irq_save(flags);
rcu_eqs_enter(false);
- rcu_sysidle_enter(0);
local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(rcu_idle_enter);
@@ -917,7 +896,6 @@ void rcu_irq_exit(void)
trace_rcu_dyntick(TPS("--="), rdtp->dynticks_nesting, rdtp->dynticks_nesting - 1);
rdtp->dynticks_nesting--;
}
- rcu_sysidle_enter(1);
}
/*
@@ -970,6 +948,7 @@ static void rcu_eqs_exit(bool user)
struct rcu_dynticks *rdtp;
long long oldval;
+ RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_eqs_exit() invoked with irqs enabled!!!");
rdtp = this_cpu_ptr(&rcu_dynticks);
oldval = rdtp->dynticks_nesting;
WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && oldval < 0);
@@ -998,7 +977,6 @@ void rcu_idle_exit(void)
local_irq_save(flags);
rcu_eqs_exit(false);
- rcu_sysidle_exit(0);
local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(rcu_idle_exit);
@@ -1050,7 +1028,6 @@ void rcu_irq_enter(void)
trace_rcu_dyntick(TPS("++="), oldval, rdtp->dynticks_nesting);
else
rcu_eqs_exit_common(oldval, true);
- rcu_sysidle_exit(1);
}
/*
@@ -1133,22 +1110,11 @@ void rcu_nmi_exit(void)
}
/**
- * __rcu_is_watching - are RCU read-side critical sections safe?
- *
- * Return true if RCU is watching the running CPU, which means that
- * this CPU can safely enter RCU read-side critical sections. Unlike
- * rcu_is_watching(), the caller of __rcu_is_watching() must have at
- * least disabled preemption.
- */
-bool notrace __rcu_is_watching(void)
-{
- return !rcu_dynticks_curr_cpu_in_eqs();
-}
-
-/**
* rcu_is_watching - see if RCU thinks that the current CPU is idle
*
- * If the current CPU is in its idle loop and is neither in an interrupt
+ * Return true if RCU is watching the running CPU, which means that this
+ * CPU can safely enter RCU read-side critical sections. In other words,
+ * if the current CPU is in its idle loop and is neither in an interrupt
* or NMI handler, return true.
*/
bool notrace rcu_is_watching(void)
@@ -1156,12 +1122,30 @@ bool notrace rcu_is_watching(void)
bool ret;
preempt_disable_notrace();
- ret = __rcu_is_watching();
+ ret = !rcu_dynticks_curr_cpu_in_eqs();
preempt_enable_notrace();
return ret;
}
EXPORT_SYMBOL_GPL(rcu_is_watching);
+/*
+ * If a holdout task is actually running, request an urgent quiescent
+ * state from its CPU. This is unsynchronized, so migrations can cause
+ * the request to go to the wrong CPU. Which is OK, all that will happen
+ * is that the CPU's next context switch will be a bit slower and next
+ * time around this task will generate another request.
+ */
+void rcu_request_urgent_qs_task(struct task_struct *t)
+{
+ int cpu;
+
+ barrier();
+ cpu = task_cpu(t);
+ if (!task_curr(t))
+ return; /* This task is not running on that CPU. */
+ smp_store_release(per_cpu_ptr(&rcu_dynticks.rcu_urgent_qs, cpu), true);
+}
+
#if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU)
/*
@@ -1222,11 +1206,9 @@ static int rcu_is_cpu_rrupt_from_idle(void)
* credit them with an implicit quiescent state. Return 1 if this CPU
* is in dynticks idle mode, which is an extended quiescent state.
*/
-static int dyntick_save_progress_counter(struct rcu_data *rdp,
- bool *isidle, unsigned long *maxj)
+static int dyntick_save_progress_counter(struct rcu_data *rdp)
{
rdp->dynticks_snap = rcu_dynticks_snap(rdp->dynticks);
- rcu_sysidle_check_cpu(rdp, isidle, maxj);
if (rcu_dynticks_in_eqs(rdp->dynticks_snap)) {
trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti"));
if (ULONG_CMP_LT(READ_ONCE(rdp->gpnum) + ULONG_MAX / 4,
@@ -1243,11 +1225,11 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp,
* idle state since the last call to dyntick_save_progress_counter()
* for this same CPU, or by virtue of having been offline.
*/
-static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
- bool *isidle, unsigned long *maxj)
+static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
{
unsigned long jtsq;
- int *rcrmp;
+ bool *rnhqp;
+ bool *ruqp;
unsigned long rjtsc;
struct rcu_node *rnp;
@@ -1283,11 +1265,15 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
* might not be the case for nohz_full CPUs looping in the kernel.
*/
rnp = rdp->mynode;
+ ruqp = per_cpu_ptr(&rcu_dynticks.rcu_urgent_qs, rdp->cpu);
if (time_after(jiffies, rdp->rsp->gp_start + jtsq) &&
- READ_ONCE(rdp->rcu_qs_ctr_snap) != per_cpu(rcu_qs_ctr, rdp->cpu) &&
+ READ_ONCE(rdp->rcu_qs_ctr_snap) != per_cpu(rcu_dynticks.rcu_qs_ctr, rdp->cpu) &&
READ_ONCE(rdp->gpnum) == rnp->gpnum && !rdp->gpwrap) {
trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("rqc"));
return 1;
+ } else {
+ /* Load rcu_qs_ctr before store to rcu_urgent_qs. */
+ smp_store_release(ruqp, true);
}
/* Check for the CPU being offline. */
@@ -1304,7 +1290,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
* in-kernel CPU-bound tasks cannot advance grace periods.
* So if the grace period is old enough, make the CPU pay attention.
* Note that the unsynchronized assignments to the per-CPU
- * rcu_sched_qs_mask variable are safe. Yes, setting of
+ * rcu_need_heavy_qs variable are safe. Yes, setting of
* bits can be lost, but they will be set again on the next
* force-quiescent-state pass. So lost bit sets do not result
* in incorrect behavior, merely in a grace period lasting
@@ -1318,16 +1304,13 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
* is set too high, we override with half of the RCU CPU stall
* warning delay.
*/
- rcrmp = &per_cpu(rcu_sched_qs_mask, rdp->cpu);
- if (time_after(jiffies, rdp->rsp->gp_start + jtsq) ||
- time_after(jiffies, rdp->rsp->jiffies_resched)) {
- if (!(READ_ONCE(*rcrmp) & rdp->rsp->flavor_mask)) {
- WRITE_ONCE(rdp->cond_resched_completed,
- READ_ONCE(rdp->mynode->completed));
- smp_mb(); /* ->cond_resched_completed before *rcrmp. */
- WRITE_ONCE(*rcrmp,
- READ_ONCE(*rcrmp) + rdp->rsp->flavor_mask);
- }
+ rnhqp = &per_cpu(rcu_dynticks.rcu_need_heavy_qs, rdp->cpu);
+ if (!READ_ONCE(*rnhqp) &&
+ (time_after(jiffies, rdp->rsp->gp_start + jtsq) ||
+ time_after(jiffies, rdp->rsp->jiffies_resched))) {
+ WRITE_ONCE(*rnhqp, true);
+ /* Store rcu_need_heavy_qs before rcu_urgent_qs. */
+ smp_store_release(ruqp, true);
rdp->rsp->jiffies_resched += 5; /* Re-enable beating. */
}
@@ -1487,7 +1470,8 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum)
print_cpu_stall_info_end();
for_each_possible_cpu(cpu)
- totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen;
+ totqlen += rcu_segcblist_n_cbs(&per_cpu_ptr(rsp->rda,
+ cpu)->cblist);
pr_cont("(detected by %d, t=%ld jiffies, g=%ld, c=%ld, q=%lu)\n",
smp_processor_id(), (long)(jiffies - rsp->gp_start),
(long)rsp->gpnum, (long)rsp->completed, totqlen);
@@ -1541,7 +1525,8 @@ static void print_cpu_stall(struct rcu_state *rsp)
print_cpu_stall_info(rsp, smp_processor_id());
print_cpu_stall_info_end();
for_each_possible_cpu(cpu)
- totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen;
+ totqlen += rcu_segcblist_n_cbs(&per_cpu_ptr(rsp->rda,
+ cpu)->cblist);
pr_cont(" (t=%lu jiffies g=%ld c=%ld q=%lu)\n",
jiffies - rsp->gp_start,
(long)rsp->gpnum, (long)rsp->completed, totqlen);
@@ -1644,30 +1629,6 @@ void rcu_cpu_stall_reset(void)
}
/*
- * Initialize the specified rcu_data structure's default callback list
- * to empty. The default callback list is the one that is not used by
- * no-callbacks CPUs.
- */
-static void init_default_callback_list(struct rcu_data *rdp)
-{
- int i;
-
- rdp->nxtlist = NULL;
- for (i = 0; i < RCU_NEXT_SIZE; i++)
- rdp->nxttail[i] = &rdp->nxtlist;
-}
-
-/*
- * Initialize the specified rcu_data structure's callback list to empty.
- */
-static void init_callback_list(struct rcu_data *rdp)
-{
- if (init_nocb_callback_list(rdp))
- return;
- init_default_callback_list(rdp);
-}
-
-/*
* Determine the value that ->completed will have at the end of the
* next subsequent grace period. This is used to tag callbacks so that
* a CPU can invoke callbacks in a timely fashion even if that CPU has
@@ -1679,6 +1640,8 @@ static void init_callback_list(struct rcu_data *rdp)
static unsigned long rcu_cbs_completed(struct rcu_state *rsp,
struct rcu_node *rnp)
{
+ lockdep_assert_held(&rnp->lock);
+
/*
* If RCU is idle, we just wait for the next grace period.
* But we can only be sure that RCU is idle if we are looking
@@ -1721,10 +1684,11 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp,
unsigned long *c_out)
{
unsigned long c;
- int i;
bool ret = false;
struct rcu_node *rnp_root = rcu_get_root(rdp->rsp);
+ lockdep_assert_held(&rnp->lock);
+
/*
* Pick up grace-period number for new callbacks. If this
* grace period is already marked as needed, return to the caller.
@@ -1767,13 +1731,11 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp,
/*
* Get a new grace-period number. If there really is no grace
* period in progress, it will be smaller than the one we obtained
- * earlier. Adjust callbacks as needed. Note that even no-CBs
- * CPUs have a ->nxtcompleted[] array, so no no-CBs checks needed.
+ * earlier. Adjust callbacks as needed.
*/
c = rcu_cbs_completed(rdp->rsp, rnp_root);
- for (i = RCU_DONE_TAIL; i < RCU_NEXT_TAIL; i++)
- if (ULONG_CMP_LT(c, rdp->nxtcompleted[i]))
- rdp->nxtcompleted[i] = c;
+ if (!rcu_is_nocb_cpu(rdp->cpu))
+ (void)rcu_segcblist_accelerate(&rdp->cblist, c);
/*
* If the needed for the required grace period is already
@@ -1805,9 +1767,7 @@ out:
/*
* Clean up any old requests for the just-ended grace period. Also return
- * whether any additional grace periods have been requested. Also invoke
- * rcu_nocb_gp_cleanup() in order to wake up any no-callbacks kthreads
- * waiting for this grace period to complete.
+ * whether any additional grace periods have been requested.
*/
static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
{
@@ -1853,57 +1813,29 @@ static void rcu_gp_kthread_wake(struct rcu_state *rsp)
static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
struct rcu_data *rdp)
{
- unsigned long c;
- int i;
- bool ret;
+ bool ret = false;
- /* If the CPU has no callbacks, nothing to do. */
- if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL])
- return false;
+ lockdep_assert_held(&rnp->lock);
- /*
- * Starting from the sublist containing the callbacks most
- * recently assigned a ->completed number and working down, find the
- * first sublist that is not assignable to an upcoming grace period.
- * Such a sublist has something in it (first two tests) and has
- * a ->completed number assigned that will complete sooner than
- * the ->completed number for newly arrived callbacks (last test).
- *
- * The key point is that any later sublist can be assigned the
- * same ->completed number as the newly arrived callbacks, which
- * means that the callbacks in any of these later sublist can be
- * grouped into a single sublist, whether or not they have already
- * been assigned a ->completed number.
- */
- c = rcu_cbs_completed(rsp, rnp);
- for (i = RCU_NEXT_TAIL - 1; i > RCU_DONE_TAIL; i--)
- if (rdp->nxttail[i] != rdp->nxttail[i - 1] &&
- !ULONG_CMP_GE(rdp->nxtcompleted[i], c))
- break;
-
- /*
- * If there are no sublist for unassigned callbacks, leave.
- * At the same time, advance "i" one sublist, so that "i" will
- * index into the sublist where all the remaining callbacks should
- * be grouped into.
- */
- if (++i >= RCU_NEXT_TAIL)
+ /* If no pending (not yet ready to invoke) callbacks, nothing to do. */
+ if (!rcu_segcblist_pend_cbs(&rdp->cblist))
return false;
/*
- * Assign all subsequent callbacks' ->completed number to the next
- * full grace period and group them all in the sublist initially
- * indexed by "i".
+ * Callbacks are often registered with incomplete grace-period
+ * information. Something about the fact that getting exact
+ * information requires acquiring a global lock... RCU therefore
+ * makes a conservative estimate of the grace period number at which
+ * a given callback will become ready to invoke. The following
+ * code checks this estimate and improves it when possible, thus
+ * accelerating callback invocation to an earlier grace-period
+ * number.
*/
- for (; i <= RCU_NEXT_TAIL; i++) {
- rdp->nxttail[i] = rdp->nxttail[RCU_NEXT_TAIL];
- rdp->nxtcompleted[i] = c;
- }
- /* Record any needed additional grace periods. */
- ret = rcu_start_future_gp(rnp, rdp, NULL);
+ if (rcu_segcblist_accelerate(&rdp->cblist, rcu_cbs_completed(rsp, rnp)))
+ ret = rcu_start_future_gp(rnp, rdp, NULL);
/* Trace depending on how much we were able to accelerate. */
- if (!*rdp->nxttail[RCU_WAIT_TAIL])
+ if (rcu_segcblist_restempty(&rdp->cblist, RCU_WAIT_TAIL))
trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccWaitCB"));
else
trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccReadyCB"));
@@ -1923,32 +1855,17 @@ static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
static bool rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
struct rcu_data *rdp)
{
- int i, j;
+ lockdep_assert_held(&rnp->lock);
- /* If the CPU has no callbacks, nothing to do. */
- if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL])
+ /* If no pending (not yet ready to invoke) callbacks, nothing to do. */
+ if (!rcu_segcblist_pend_cbs(&rdp->cblist))
return false;
/*
* Find all callbacks whose ->completed numbers indicate that they
* are ready to invoke, and put them into the RCU_DONE_TAIL sublist.
*/
- for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) {
- if (ULONG_CMP_LT(rnp->completed, rdp->nxtcompleted[i]))
- break;
- rdp->nxttail[RCU_DONE_TAIL] = rdp->nxttail[i];
- }
- /* Clean up any sublist tail pointers that were misordered above. */
- for (j = RCU_WAIT_TAIL; j < i; j++)
- rdp->nxttail[j] = rdp->nxttail[RCU_DONE_TAIL];
-
- /* Copy down callbacks to fill in empty sublists. */
- for (j = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++, j++) {
- if (rdp->nxttail[j] == rdp->nxttail[RCU_NEXT_TAIL])
- break;
- rdp->nxttail[j] = rdp->nxttail[i];
- rdp->nxtcompleted[j] = rdp->nxtcompleted[i];
- }
+ rcu_segcblist_advance(&rdp->cblist, rnp->completed);
/* Classify any remaining callbacks. */
return rcu_accelerate_cbs(rsp, rnp, rdp);
@@ -1966,6 +1883,8 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp,
bool ret;
bool need_gp;
+ lockdep_assert_held(&rnp->lock);
+
/* Handle the ends of any preceding grace periods first. */
if (rdp->completed == rnp->completed &&
!unlikely(READ_ONCE(rdp->gpwrap))) {
@@ -1993,7 +1912,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp,
trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpustart"));
need_gp = !!(rnp->qsmask & rdp->grpmask);
rdp->cpu_no_qs.b.norm = need_gp;
- rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr);
+ rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_dynticks.rcu_qs_ctr);
rdp->core_needs_qs = need_gp;
zero_cpu_stall_ticks(rdp);
WRITE_ONCE(rdp->gpwrap, false);
@@ -2172,25 +2091,16 @@ static bool rcu_gp_fqs_check_wake(struct rcu_state *rsp, int *gfp)
*/
static void rcu_gp_fqs(struct rcu_state *rsp, bool first_time)
{
- bool isidle = false;
- unsigned long maxj;
struct rcu_node *rnp = rcu_get_root(rsp);
WRITE_ONCE(rsp->gp_activity, jiffies);
rsp->n_force_qs++;
if (first_time) {
/* Collect dyntick-idle snapshots. */
- if (is_sysidle_rcu_state(rsp)) {
- isidle = true;
- maxj = jiffies - ULONG_MAX / 4;
- }
- force_qs_rnp(rsp, dyntick_save_progress_counter,
- &isidle, &maxj);
- rcu_sysidle_report_gp(rsp, isidle, maxj);
+ force_qs_rnp(rsp, dyntick_save_progress_counter);
} else {
/* Handle dyntick-idle and offline CPUs. */
- isidle = true;
- force_qs_rnp(rsp, rcu_implicit_dynticks_qs, &isidle, &maxj);
+ force_qs_rnp(rsp, rcu_implicit_dynticks_qs);
}
/* Clear flag to prevent immediate re-entry. */
if (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) {
@@ -2398,6 +2308,7 @@ static bool
rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
struct rcu_data *rdp)
{
+ lockdep_assert_held(&rnp->lock);
if (!rsp->gp_kthread || !cpu_needs_another_gp(rsp, rdp)) {
/*
* Either we have not yet spawned the grace-period
@@ -2459,6 +2370,7 @@ static bool rcu_start_gp(struct rcu_state *rsp)
static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
__releases(rcu_get_root(rsp)->lock)
{
+ lockdep_assert_held(&rcu_get_root(rsp)->lock);
WARN_ON_ONCE(!rcu_gp_in_progress(rsp));
WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS);
raw_spin_unlock_irqrestore_rcu_node(rcu_get_root(rsp), flags);
@@ -2483,6 +2395,8 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
unsigned long oldmask = 0;
struct rcu_node *rnp_c;
+ lockdep_assert_held(&rnp->lock);
+
/* Walk up the rcu_node hierarchy. */
for (;;) {
if (!(rnp->qsmask & mask) || rnp->gpnum != gps) {
@@ -2543,6 +2457,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_state *rsp,
unsigned long mask;
struct rcu_node *rnp_p;
+ lockdep_assert_held(&rnp->lock);
if (rcu_state_p == &rcu_sched_state || rsp != rcu_state_p ||
rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) {
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
@@ -2591,7 +2506,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
* within the current grace period.
*/
rdp->cpu_no_qs.b.norm = true; /* need qs for new gp. */
- rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr);
+ rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_dynticks.rcu_qs_ctr);
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
return;
}
@@ -2656,6 +2571,8 @@ static void
rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
struct rcu_node *rnp, struct rcu_data *rdp)
{
+ lockdep_assert_held(&rsp->orphan_lock);
+
/* No-CBs CPUs do not have orphanable callbacks. */
if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) || rcu_is_nocb_cpu(rdp->cpu))
return;
@@ -2665,13 +2582,8 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
* because _rcu_barrier() excludes CPU-hotplug operations, so it
* cannot be running now. Thus no memory barrier is required.
*/
- if (rdp->nxtlist != NULL) {
- rsp->qlen_lazy += rdp->qlen_lazy;
- rsp->qlen += rdp->qlen;
- rdp->n_cbs_orphaned += rdp->qlen;
- rdp->qlen_lazy = 0;
- WRITE_ONCE(rdp->qlen, 0);
- }
+ rdp->n_cbs_orphaned += rcu_segcblist_n_cbs(&rdp->cblist);
+ rcu_segcblist_extract_count(&rdp->cblist, &rsp->orphan_done);
/*
* Next, move those callbacks still needing a grace period to
@@ -2679,31 +2591,18 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
* Some of the callbacks might have gone partway through a grace
* period, but that is too bad. They get to start over because we
* cannot assume that grace periods are synchronized across CPUs.
- * We don't bother updating the ->nxttail[] array yet, instead
- * we just reset the whole thing later on.
*/
- if (*rdp->nxttail[RCU_DONE_TAIL] != NULL) {
- *rsp->orphan_nxttail = *rdp->nxttail[RCU_DONE_TAIL];
- rsp->orphan_nxttail = rdp->nxttail[RCU_NEXT_TAIL];
- *rdp->nxttail[RCU_DONE_TAIL] = NULL;
- }
+ rcu_segcblist_extract_pend_cbs(&rdp->cblist, &rsp->orphan_pend);
/*
* Then move the ready-to-invoke callbacks to the orphanage,
* where some other CPU will pick them up. These will not be
* required to pass though another grace period: They are done.
*/
- if (rdp->nxtlist != NULL) {
- *rsp->orphan_donetail = rdp->nxtlist;
- rsp->orphan_donetail = rdp->nxttail[RCU_DONE_TAIL];
- }
+ rcu_segcblist_extract_done_cbs(&rdp->cblist, &rsp->orphan_done);
- /*
- * Finally, initialize the rcu_data structure's list to empty and
- * disallow further callbacks on this CPU.
- */
- init_callback_list(rdp);
- rdp->nxttail[RCU_NEXT_TAIL] = NULL;
+ /* Finally, disallow further callbacks on this CPU. */
+ rcu_segcblist_disable(&rdp->cblist);
}
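
For orientation, a minimal sketch of the two list types these new helpers operate on, using only what is visible in this patch (->head, ->tail, ->len, ->len_lazy, and the DONE/WAIT/NEXT_READY/NEXT segments); the authoritative definitions live in kernel/rcu/rcu_segcblist.h, so treat this as an approximation:

/* Unsegmented list: used for the orphanage and for extracted batches. */
struct rcu_cblist_sketch {
	struct rcu_head *head;		/* First callback, NULL when empty. */
	struct rcu_head **tail;		/* ->next field of the last callback. */
	long len;			/* Callback count (also used as a delta). */
	long len_lazy;			/* Lazy (kfree-only) callbacks among them. */
};

/*
 * Segmented per-CPU list: a single ->next chain cut into segments by
 * tail pointers, replacing the old ->nxtlist/->nxttail[] arrangement:
 *
 *   [head, *tails[RCU_DONE_TAIL])              ready to invoke
 *   [*tails[RCU_DONE_TAIL], RCU_WAIT_TAIL)     waiting for the current GP
 *   [RCU_WAIT_TAIL, RCU_NEXT_READY_TAIL)       arrived before current GP ended
 *   [RCU_NEXT_READY_TAIL, RCU_NEXT_TAIL)       may have arrived later
 */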
/*
@@ -2712,22 +2611,20 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
*/
static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags)
{
- int i;
struct rcu_data *rdp = raw_cpu_ptr(rsp->rda);
+ lockdep_assert_held(&rsp->orphan_lock);
+
/* No-CBs CPUs are handled specially. */
if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) ||
rcu_nocb_adopt_orphan_cbs(rsp, rdp, flags))
return;
/* Do the accounting first. */
- rdp->qlen_lazy += rsp->qlen_lazy;
- rdp->qlen += rsp->qlen;
- rdp->n_cbs_adopted += rsp->qlen;
- if (rsp->qlen_lazy != rsp->qlen)
+ rdp->n_cbs_adopted += rsp->orphan_done.len;
+ if (rsp->orphan_done.len_lazy != rsp->orphan_done.len)
rcu_idle_count_callbacks_posted();
- rsp->qlen_lazy = 0;
- rsp->qlen = 0;
+ rcu_segcblist_insert_count(&rdp->cblist, &rsp->orphan_done);
/*
* We do not need a memory barrier here because the only way we
@@ -2735,24 +2632,13 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags)
* we are the task doing the rcu_barrier().
*/
- /* First adopt the ready-to-invoke callbacks. */
- if (rsp->orphan_donelist != NULL) {
- *rsp->orphan_donetail = *rdp->nxttail[RCU_DONE_TAIL];
- *rdp->nxttail[RCU_DONE_TAIL] = rsp->orphan_donelist;
- for (i = RCU_NEXT_SIZE - 1; i >= RCU_DONE_TAIL; i--)
- if (rdp->nxttail[i] == rdp->nxttail[RCU_DONE_TAIL])
- rdp->nxttail[i] = rsp->orphan_donetail;
- rsp->orphan_donelist = NULL;
- rsp->orphan_donetail = &rsp->orphan_donelist;
- }
-
- /* And then adopt the callbacks that still need a grace period. */
- if (rsp->orphan_nxtlist != NULL) {
- *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_nxtlist;
- rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_nxttail;
- rsp->orphan_nxtlist = NULL;
- rsp->orphan_nxttail = &rsp->orphan_nxtlist;
- }
+ /* First adopt the ready-to-invoke callbacks, then adopt those that */
+ /* are still waiting for a grace period. */
+ rcu_segcblist_insert_done_cbs(&rdp->cblist, &rsp->orphan_done);
+ WARN_ON_ONCE(rsp->orphan_done.head);
+ rcu_segcblist_insert_pend_cbs(&rdp->cblist, &rsp->orphan_pend);
+ WARN_ON_ONCE(rsp->orphan_pend.head);
+ WARN_ON_ONCE(rcu_segcblist_empty(&rdp->cblist) !=
+ !rcu_segcblist_n_cbs(&rdp->cblist));
}
/*
@@ -2760,14 +2646,14 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags)
*/
static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
{
- RCU_TRACE(unsigned long mask);
- RCU_TRACE(struct rcu_data *rdp = this_cpu_ptr(rsp->rda));
- RCU_TRACE(struct rcu_node *rnp = rdp->mynode);
+ RCU_TRACE(unsigned long mask;)
+ RCU_TRACE(struct rcu_data *rdp = this_cpu_ptr(rsp->rda);)
+ RCU_TRACE(struct rcu_node *rnp = rdp->mynode;)
if (!IS_ENABLED(CONFIG_HOTPLUG_CPU))
return;
- RCU_TRACE(mask = rdp->grpmask);
+ RCU_TRACE(mask = rdp->grpmask;)
trace_rcu_grace_period(rsp->name,
rnp->gpnum + 1 - !!(rnp->qsmask & mask),
TPS("cpuofl"));
@@ -2795,6 +2681,7 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf)
long mask;
struct rcu_node *rnp = rnp_leaf;
+ lockdep_assert_held(&rnp->lock);
if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) ||
rnp->qsmaskinit || rcu_preempt_has_tasks(rnp))
return;
@@ -2840,9 +2727,11 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
rcu_adopt_orphan_cbs(rsp, flags);
raw_spin_unlock_irqrestore(&rsp->orphan_lock, flags);
- WARN_ONCE(rdp->qlen != 0 || rdp->nxtlist != NULL,
- "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, nxtlist=%p\n",
- cpu, rdp->qlen, rdp->nxtlist);
+ WARN_ONCE(rcu_segcblist_n_cbs(&rdp->cblist) != 0 ||
+ !rcu_segcblist_empty(&rdp->cblist),
+ "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, 1stCB=%p\n",
+ cpu, rcu_segcblist_n_cbs(&rdp->cblist),
+ rcu_segcblist_first_cb(&rdp->cblist));
}
/*
@@ -2852,14 +2741,17 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
{
unsigned long flags;
- struct rcu_head *next, *list, **tail;
- long bl, count, count_lazy;
- int i;
+ struct rcu_head *rhp;
+ struct rcu_cblist rcl = RCU_CBLIST_INITIALIZER(rcl);
+ long bl, count;
/* If no callbacks are ready, just return. */
- if (!cpu_has_callbacks_ready_to_invoke(rdp)) {
- trace_rcu_batch_start(rsp->name, rdp->qlen_lazy, rdp->qlen, 0);
- trace_rcu_batch_end(rsp->name, 0, !!READ_ONCE(rdp->nxtlist),
+ if (!rcu_segcblist_ready_cbs(&rdp->cblist)) {
+ trace_rcu_batch_start(rsp->name,
+ rcu_segcblist_n_lazy_cbs(&rdp->cblist),
+ rcu_segcblist_n_cbs(&rdp->cblist), 0);
+ trace_rcu_batch_end(rsp->name, 0,
+ !rcu_segcblist_empty(&rdp->cblist),
need_resched(), is_idle_task(current),
rcu_is_callbacks_kthread());
return;
@@ -2867,73 +2759,61 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
/*
* Extract the list of ready callbacks, disabling to prevent
- * races with call_rcu() from interrupt handlers.
+ * races with call_rcu() from interrupt handlers. Leave the
+ * callback counts, as rcu_barrier() needs to be conservative.
*/
local_irq_save(flags);
WARN_ON_ONCE(cpu_is_offline(smp_processor_id()));
bl = rdp->blimit;
- trace_rcu_batch_start(rsp->name, rdp->qlen_lazy, rdp->qlen, bl);
- list = rdp->nxtlist;
- rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL];
- *rdp->nxttail[RCU_DONE_TAIL] = NULL;
- tail = rdp->nxttail[RCU_DONE_TAIL];
- for (i = RCU_NEXT_SIZE - 1; i >= 0; i--)
- if (rdp->nxttail[i] == rdp->nxttail[RCU_DONE_TAIL])
- rdp->nxttail[i] = &rdp->nxtlist;
+ trace_rcu_batch_start(rsp->name, rcu_segcblist_n_lazy_cbs(&rdp->cblist),
+ rcu_segcblist_n_cbs(&rdp->cblist), bl);
+ rcu_segcblist_extract_done_cbs(&rdp->cblist, &rcl);
local_irq_restore(flags);
/* Invoke callbacks. */
- count = count_lazy = 0;
- while (list) {
- next = list->next;
- prefetch(next);
- debug_rcu_head_unqueue(list);
- if (__rcu_reclaim(rsp->name, list))
- count_lazy++;
- list = next;
- /* Stop only if limit reached and CPU has something to do. */
- if (++count >= bl &&
+ rhp = rcu_cblist_dequeue(&rcl);
+ for (; rhp; rhp = rcu_cblist_dequeue(&rcl)) {
+ debug_rcu_head_unqueue(rhp);
+ if (__rcu_reclaim(rsp->name, rhp))
+ rcu_cblist_dequeued_lazy(&rcl);
+ /*
+ * Stop only if limit reached and CPU has something to do.
+ * Note: The rcl structure counts down from zero.
+ */
+ if (-rcl.len >= bl &&
(need_resched() ||
(!is_idle_task(current) && !rcu_is_callbacks_kthread())))
break;
}
local_irq_save(flags);
- trace_rcu_batch_end(rsp->name, count, !!list, need_resched(),
- is_idle_task(current),
- rcu_is_callbacks_kthread());
-
- /* Update count, and requeue any remaining callbacks. */
- if (list != NULL) {
- *tail = rdp->nxtlist;
- rdp->nxtlist = list;
- for (i = 0; i < RCU_NEXT_SIZE; i++)
- if (&rdp->nxtlist == rdp->nxttail[i])
- rdp->nxttail[i] = tail;
- else
- break;
- }
+ count = -rcl.len;
+ trace_rcu_batch_end(rsp->name, count, !!rcl.head, need_resched(),
+ is_idle_task(current), rcu_is_callbacks_kthread());
+
+ /* Update counts and requeue any remaining callbacks. */
+ rcu_segcblist_insert_done_cbs(&rdp->cblist, &rcl);
smp_mb(); /* List handling before counting for rcu_barrier(). */
- rdp->qlen_lazy -= count_lazy;
- WRITE_ONCE(rdp->qlen, rdp->qlen - count);
rdp->n_cbs_invoked += count;
+ rcu_segcblist_insert_count(&rdp->cblist, &rcl);
/* Reinstate batch limit if we have worked down the excess. */
- if (rdp->blimit == LONG_MAX && rdp->qlen <= qlowmark)
+ count = rcu_segcblist_n_cbs(&rdp->cblist);
+ if (rdp->blimit == LONG_MAX && count <= qlowmark)
rdp->blimit = blimit;
/* Reset ->qlen_last_fqs_check trigger if enough CBs have drained. */
- if (rdp->qlen == 0 && rdp->qlen_last_fqs_check != 0) {
+ if (count == 0 && rdp->qlen_last_fqs_check != 0) {
rdp->qlen_last_fqs_check = 0;
rdp->n_force_qs_snap = rsp->n_force_qs;
- } else if (rdp->qlen < rdp->qlen_last_fqs_check - qhimark)
- rdp->qlen_last_fqs_check = rdp->qlen;
- WARN_ON_ONCE((rdp->nxtlist == NULL) != (rdp->qlen == 0));
+ } else if (count < rdp->qlen_last_fqs_check - qhimark)
+ rdp->qlen_last_fqs_check = count;
+ WARN_ON_ONCE(rcu_segcblist_empty(&rdp->cblist) != (count == 0));
local_irq_restore(flags);
/* Re-invoke RCU core processing if there are callbacks remaining. */
- if (cpu_has_callbacks_ready_to_invoke(rdp))
+ if (rcu_segcblist_ready_cbs(&rdp->cblist))
invoke_rcu_core();
}
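
A short worked trace of the counting convention flagged above ("counts down from zero"), on the assumption that the extraction leaves the per-CPU counts in place and that each rcu_cblist_dequeue() decrements rcl.len:

/*
 * Hypothetical run with five ready callbacks, of which three are invoked
 * before the batch limit bl stops the loop:
 *
 *   after rcu_segcblist_extract_done_cbs():  rcl.len == 0, cblist len == 5
 *   after three dequeue/invoke iterations:   rcl.len == -3, so -rcl.len == 3
 *   rcu_segcblist_insert_done_cbs():         requeues the two leftovers
 *   rcu_segcblist_insert_count():            adds rcl.len (-3), cblist len == 2
 *
 * Deferring the count update until after invocation keeps rcu_barrier()'s
 * view conservative while callbacks are in flight.
 */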
@@ -2992,10 +2872,7 @@ void rcu_check_callbacks(int user)
*
* The caller must have suppressed start of new grace periods.
*/
-static void force_qs_rnp(struct rcu_state *rsp,
- int (*f)(struct rcu_data *rsp, bool *isidle,
- unsigned long *maxj),
- bool *isidle, unsigned long *maxj)
+static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *rsp))
{
int cpu;
unsigned long flags;
@@ -3034,7 +2911,7 @@ static void force_qs_rnp(struct rcu_state *rsp,
for_each_leaf_node_possible_cpu(rnp, cpu) {
unsigned long bit = leaf_node_cpu_bit(rnp, cpu);
if ((rnp->qsmask & bit) != 0) {
- if (f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj))
+ if (f(per_cpu_ptr(rsp->rda, cpu)))
mask |= bit;
}
}
@@ -3099,7 +2976,7 @@ __rcu_process_callbacks(struct rcu_state *rsp)
bool needwake;
struct rcu_data *rdp = raw_cpu_ptr(rsp->rda);
- WARN_ON_ONCE(rdp->beenonline == 0);
+ WARN_ON_ONCE(!rdp->beenonline);
/* Update RCU state based on any recent quiescent states. */
rcu_check_quiescent_state(rsp, rdp);
@@ -3117,7 +2994,7 @@ __rcu_process_callbacks(struct rcu_state *rsp)
}
/* If there are callbacks ready, invoke them. */
- if (cpu_has_callbacks_ready_to_invoke(rdp))
+ if (rcu_segcblist_ready_cbs(&rdp->cblist))
invoke_rcu_callbacks(rsp, rdp);
/* Do any needed deferred wakeups of rcuo kthreads. */
@@ -3189,7 +3066,8 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp,
* invoking force_quiescent_state() if the newly enqueued callback
* is the only one waiting for a grace period to complete.
*/
- if (unlikely(rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) {
+ if (unlikely(rcu_segcblist_n_cbs(&rdp->cblist) >
+ rdp->qlen_last_fqs_check + qhimark)) {
/* Are we ignoring a completed grace period? */
note_gp_changes(rsp, rdp);
@@ -3207,10 +3085,10 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp,
/* Give the grace period a kick. */
rdp->blimit = LONG_MAX;
if (rsp->n_force_qs == rdp->n_force_qs_snap &&
- *rdp->nxttail[RCU_DONE_TAIL] != head)
+ rcu_segcblist_first_pend_cb(&rdp->cblist) != head)
force_quiescent_state(rsp);
rdp->n_force_qs_snap = rsp->n_force_qs;
- rdp->qlen_last_fqs_check = rdp->qlen;
+ rdp->qlen_last_fqs_check = rcu_segcblist_n_cbs(&rdp->cblist);
}
}
}
@@ -3239,9 +3117,14 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func,
WARN_ON_ONCE((unsigned long)head & (sizeof(void *) - 1));
if (debug_rcu_head_queue(head)) {
- /* Probable double call_rcu(), so leak the callback. */
+ /*
+ * Probable double call_rcu(), so leak the callback.
+ * Use rcu:rcu_callback trace event to find the previous
+ * time callback was passed to __call_rcu().
+ */
+ WARN_ONCE(1, "__call_rcu(): Double-freed CB %p->%pF()!!!\n",
+ head, head->func);
WRITE_ONCE(head->func, rcu_leak_callback);
- WARN_ONCE(1, "__call_rcu(): Leaked duplicate callback\n");
return;
}
head->func = func;
@@ -3250,7 +3133,7 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func,
rdp = this_cpu_ptr(rsp->rda);
/* Add the callback to our list. */
- if (unlikely(rdp->nxttail[RCU_NEXT_TAIL] == NULL) || cpu != -1) {
+ if (unlikely(!rcu_segcblist_is_enabled(&rdp->cblist)) || cpu != -1) {
int offline;
if (cpu != -1)
@@ -3269,31 +3152,45 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func,
*/
BUG_ON(cpu != -1);
WARN_ON_ONCE(!rcu_is_watching());
- if (!likely(rdp->nxtlist))
- init_default_callback_list(rdp);
+ if (rcu_segcblist_empty(&rdp->cblist))
+ rcu_segcblist_init(&rdp->cblist);
}
- WRITE_ONCE(rdp->qlen, rdp->qlen + 1);
- if (lazy)
- rdp->qlen_lazy++;
- else
+ rcu_segcblist_enqueue(&rdp->cblist, head, lazy);
+ if (!lazy)
rcu_idle_count_callbacks_posted();
- smp_mb(); /* Count before adding callback for rcu_barrier(). */
- *rdp->nxttail[RCU_NEXT_TAIL] = head;
- rdp->nxttail[RCU_NEXT_TAIL] = &head->next;
if (__is_kfree_rcu_offset((unsigned long)func))
trace_rcu_kfree_callback(rsp->name, head, (unsigned long)func,
- rdp->qlen_lazy, rdp->qlen);
+ rcu_segcblist_n_lazy_cbs(&rdp->cblist),
+ rcu_segcblist_n_cbs(&rdp->cblist));
else
- trace_rcu_callback(rsp->name, head, rdp->qlen_lazy, rdp->qlen);
+ trace_rcu_callback(rsp->name, head,
+ rcu_segcblist_n_lazy_cbs(&rdp->cblist),
+ rcu_segcblist_n_cbs(&rdp->cblist));
/* Go handle any RCU core processing required. */
__call_rcu_core(rsp, rdp, head, flags);
local_irq_restore(flags);
}
-/*
- * Queue an RCU-sched callback for invocation after a grace period.
+/**
+ * call_rcu_sched() - Queue an RCU callback for invocation after a sched grace period.
+ * @head: structure to be used for queueing the RCU updates.
+ * @func: actual callback function to be invoked after the grace period
+ *
+ * The callback function will be invoked some time after a full grace
+ * period elapses, in other words after all currently executing RCU
+ * read-side critical sections have completed. call_rcu_sched() assumes
+ * that the read-side critical sections end on enabling of preemption
+ * or on voluntary preemption.
+ * RCU read-side critical sections are delimited by:
+ * - rcu_read_lock_sched() and rcu_read_unlock_sched(), OR
+ * - anything that disables preemption.
+ *
+ * These may be nested.
+ *
+ * See the description of call_rcu() for more detailed information on
+ * memory ordering guarantees.
*/
void call_rcu_sched(struct rcu_head *head, rcu_callback_t func)
{
@@ -3301,8 +3198,26 @@ void call_rcu_sched(struct rcu_head *head, rcu_callback_t func)
}
EXPORT_SYMBOL_GPL(call_rcu_sched);
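
As a usage illustration only (struct foo, foo_slot, and the helpers below are invented names, not part of this patch): a caller embeds the rcu_head in its own structure, reclaims from the callback, and keeps readers inside preemption-disabled regions so that call_rcu_sched() is guaranteed to wait for them:

#include <linux/rcupdate.h>
#include <linux/slab.h>

struct foo {
	int data;
	struct rcu_head rh;
};

static struct foo __rcu *foo_slot;

static void foo_reclaim(struct rcu_head *rhp)
{
	kfree(container_of(rhp, struct foo, rh));	/* runs after the grace period */
}

static void foo_replace(struct foo *newp)
{
	struct foo *old;

	old = rcu_dereference_protected(foo_slot, 1);	/* caller serializes updates */
	rcu_assign_pointer(foo_slot, newp);
	if (old)
		call_rcu_sched(&old->rh, foo_reclaim);
}

static int foo_read(void)
{
	struct foo *p;
	int val = -1;

	rcu_read_lock_sched();		/* any preempt-disabled region works */
	p = rcu_dereference_sched(foo_slot);
	if (p)
		val = p->data;
	rcu_read_unlock_sched();
	return val;
}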
-/*
- * Queue an RCU callback for invocation after a quicker grace period.
+/**
+ * call_rcu_bh() - Queue an RCU callback for invocation after a quicker grace period.
+ * @head: structure to be used for queueing the RCU updates.
+ * @func: actual callback function to be invoked after the grace period
+ *
+ * The callback function will be invoked some time after a full grace
+ * period elapses, in other words after all currently executing RCU
+ * read-side critical sections have completed. call_rcu_bh() assumes
+ * that the read-side critical sections end on completion of a softirq
+ * handler. This means that read-side critical sections in process
+ * context must not be interrupted by softirqs. This interface is to be
+ * used when most of the read-side critical sections are in softirq context.
+ * RCU read-side critical sections are delimited by:
+ * - rcu_read_lock() and rcu_read_unlock(), if in interrupt context.
+ * OR
+ * - rcu_read_lock_bh() and rcu_read_unlock_bh(), if in process context.
+ * These may be nested.
+ *
+ * See the description of call_rcu() for more detailed information on
+ * memory ordering guarantees.
*/
void call_rcu_bh(struct rcu_head *head, rcu_callback_t func)
{
@@ -3378,12 +3293,6 @@ static inline int rcu_blocking_is_gp(void)
* to have executed a full memory barrier during the execution of
* synchronize_sched() -- even if CPU A and CPU B are the same CPU (but
* again only if the system has more than one CPU).
- *
- * This primitive provides the guarantees made by the (now removed)
- * synchronize_kernel() API. In contrast, synchronize_rcu() only
- * guarantees that rcu_read_lock() sections will have completed.
- * In "classic RCU", these two guarantees happen to be one and
- * the same, but can differ in realtime RCU implementations.
*/
void synchronize_sched(void)
{
@@ -3531,41 +3440,6 @@ void cond_synchronize_sched(unsigned long oldstate)
}
EXPORT_SYMBOL_GPL(cond_synchronize_sched);
-/* Adjust sequence number for start of update-side operation. */
-static void rcu_seq_start(unsigned long *sp)
-{
- WRITE_ONCE(*sp, *sp + 1);
- smp_mb(); /* Ensure update-side operation after counter increment. */
- WARN_ON_ONCE(!(*sp & 0x1));
-}
-
-/* Adjust sequence number for end of update-side operation. */
-static void rcu_seq_end(unsigned long *sp)
-{
- smp_mb(); /* Ensure update-side operation before counter increment. */
- WRITE_ONCE(*sp, *sp + 1);
- WARN_ON_ONCE(*sp & 0x1);
-}
-
-/* Take a snapshot of the update side's sequence number. */
-static unsigned long rcu_seq_snap(unsigned long *sp)
-{
- unsigned long s;
-
- s = (READ_ONCE(*sp) + 3) & ~0x1;
- smp_mb(); /* Above access must not bleed into critical section. */
- return s;
-}
-
-/*
- * Given a snapshot from rcu_seq_snap(), determine whether or not a
- * full update-side operation has occurred.
- */
-static bool rcu_seq_done(unsigned long *sp, unsigned long s)
-{
- return ULONG_CMP_GE(READ_ONCE(*sp), s);
-}
-
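
These sequence helpers are deleted from tree.c here but not dropped; the expedited code below now uses rcu_seq_ctr(), so presumably they moved to shared code (rcu.h) for reuse by SRCU. The low-order bit means "update in progress". A worked instance of the arithmetic, taken directly from the removed lines:

/*
 *   *sp == 4 (even, idle):  rcu_seq_snap() -> (4 + 3) & ~0x1 == 6
 *       start: *sp = 5;  end: *sp = 6;  rcu_seq_done(sp, 6) -> true
 *       (one full update suffices)
 *
 *   *sp == 5 (odd, update already running):  snap -> (5 + 3) & ~0x1 == 8
 *       the in-flight update ending (*sp == 6) is not enough; one further
 *       start (7) and end (8) must happen before the snapshot is "done".
 */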
/*
* Check to see if there is any immediate RCU-related work to be done
* by the current CPU, for the specified type of RCU, returning 1 if so.
@@ -3589,7 +3463,7 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
/* Is the RCU core waiting for a quiescent state from this CPU? */
if (rcu_scheduler_fully_active &&
rdp->core_needs_qs && rdp->cpu_no_qs.b.norm &&
- rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) {
+ rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_dynticks.rcu_qs_ctr)) {
rdp->n_rp_core_needs_qs++;
} else if (rdp->core_needs_qs && !rdp->cpu_no_qs.b.norm) {
rdp->n_rp_report_qs++;
@@ -3597,7 +3471,7 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
}
/* Does this CPU have callbacks ready to invoke? */
- if (cpu_has_callbacks_ready_to_invoke(rdp)) {
+ if (rcu_segcblist_ready_cbs(&rdp->cblist)) {
rdp->n_rp_cb_ready++;
return 1;
}
@@ -3661,10 +3535,10 @@ static bool __maybe_unused rcu_cpu_has_callbacks(bool *all_lazy)
for_each_rcu_flavor(rsp) {
rdp = this_cpu_ptr(rsp->rda);
- if (!rdp->nxtlist)
+ if (rcu_segcblist_empty(&rdp->cblist))
continue;
hc = true;
- if (rdp->qlen != rdp->qlen_lazy || !all_lazy) {
+ if (rcu_segcblist_n_nonlazy_cbs(&rdp->cblist) || !all_lazy) {
al = false;
break;
}
@@ -3711,8 +3585,14 @@ static void rcu_barrier_func(void *type)
struct rcu_data *rdp = raw_cpu_ptr(rsp->rda);
_rcu_barrier_trace(rsp, "IRQ", -1, rsp->barrier_sequence);
- atomic_inc(&rsp->barrier_cpu_count);
- rsp->call(&rdp->barrier_head, rcu_barrier_callback);
+ rdp->barrier_head.func = rcu_barrier_callback;
+ debug_rcu_head_queue(&rdp->barrier_head);
+ if (rcu_segcblist_entrain(&rdp->cblist, &rdp->barrier_head, 0)) {
+ atomic_inc(&rsp->barrier_cpu_count);
+ } else {
+ debug_rcu_head_unqueue(&rdp->barrier_head);
+ _rcu_barrier_trace(rsp, "IRQNQ", -1, rsp->barrier_sequence);
+ }
}
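
Entraining the callback, rather than going through rsp->call(), places it behind every callback already queued on this CPU and lets CPUs with nothing queued be skipped entirely. A simplified model of the handshake the counter feeds, assuming rcu_barrier_callback() pairs the increment above with a completion as in _rcu_barrier():

/* Simplified model only; the real logic lives in _rcu_barrier(). */
static void rcu_barrier_callback_model(struct rcu_state *rsp)
{
	/* Runs only after all earlier callbacks on this CPU have run. */
	if (atomic_dec_and_test(&rsp->barrier_cpu_count))
		complete(&rsp->barrier_completion);	/* last CPU finished */
}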
/*
@@ -3773,7 +3653,7 @@ static void _rcu_barrier(struct rcu_state *rsp)
__call_rcu(&rdp->barrier_head,
rcu_barrier_callback, rsp, cpu, 0);
}
- } else if (READ_ONCE(rdp->qlen)) {
+ } else if (rcu_segcblist_n_cbs(&rdp->cblist)) {
_rcu_barrier_trace(rsp, "OnlineQ", cpu,
rsp->barrier_sequence);
smp_call_function_single(cpu, rcu_barrier_func, rsp, 1);
@@ -3831,6 +3711,7 @@ static void rcu_init_new_rnp(struct rcu_node *rnp_leaf)
long mask;
struct rcu_node *rnp = rnp_leaf;
+ lockdep_assert_held(&rnp->lock);
for (;;) {
mask = rnp->grpmask;
rnp = rnp->parent;
@@ -3882,10 +3763,10 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
rdp->qlen_last_fqs_check = 0;
rdp->n_force_qs_snap = rsp->n_force_qs;
rdp->blimit = blimit;
- if (!rdp->nxtlist)
- init_callback_list(rdp); /* Re-enable callbacks on this CPU. */
+ if (rcu_segcblist_empty(&rdp->cblist) && /* No early-boot CBs? */
+ !init_nocb_callback_list(rdp))
+ rcu_segcblist_init(&rdp->cblist); /* Re-enable callbacks. */
rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
- rcu_sysidle_init_percpu_data(rdp->dynticks);
rcu_dynticks_eqs_online();
raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */
@@ -3902,12 +3783,16 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
rdp->gpnum = rnp->completed; /* Make CPU later note any new GP. */
rdp->completed = rnp->completed;
rdp->cpu_no_qs.b.norm = true;
- rdp->rcu_qs_ctr_snap = per_cpu(rcu_qs_ctr, cpu);
+ rdp->rcu_qs_ctr_snap = per_cpu(rcu_dynticks.rcu_qs_ctr, cpu);
rdp->core_needs_qs = false;
trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl"));
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
}
+/*
+ * Invoked early in the CPU-online process, when pretty much all
+ * services are available. The incoming CPU is not present.
+ */
int rcutree_prepare_cpu(unsigned int cpu)
{
struct rcu_state *rsp;
@@ -3921,6 +3806,9 @@ int rcutree_prepare_cpu(unsigned int cpu)
return 0;
}
+/*
+ * Update RCU priority boot kthread affinity for CPU-hotplug changes.
+ */
static void rcutree_affinity_setting(unsigned int cpu, int outgoing)
{
struct rcu_data *rdp = per_cpu_ptr(rcu_state_p->rda, cpu);
@@ -3928,20 +3816,34 @@ static void rcutree_affinity_setting(unsigned int cpu, int outgoing)
rcu_boost_kthread_setaffinity(rdp->mynode, outgoing);
}
+/*
+ * Near the end of the CPU-online process. Pretty much all services
+ * enabled, and the CPU is now very much alive.
+ */
int rcutree_online_cpu(unsigned int cpu)
{
sync_sched_exp_online_cleanup(cpu);
rcutree_affinity_setting(cpu, -1);
+ if (IS_ENABLED(CONFIG_TREE_SRCU))
+ srcu_online_cpu(cpu);
return 0;
}
+/*
+ * Near the beginning of the CPU-offline process. The CPU is still very
+ * much alive with pretty much all services enabled.
+ */
int rcutree_offline_cpu(unsigned int cpu)
{
rcutree_affinity_setting(cpu, cpu);
+ if (IS_ENABLED(CONFIG_TREE_SRCU))
+ srcu_offline_cpu(cpu);
return 0;
}
-
+/*
+ * Near the end of the offline process. We do only tracing here.
+ */
int rcutree_dying_cpu(unsigned int cpu)
{
struct rcu_state *rsp;
@@ -3951,6 +3853,9 @@ int rcutree_dying_cpu(unsigned int cpu)
return 0;
}
+/*
+ * The outgoing CPU is gone and we are running elsewhere.
+ */
int rcutree_dead_cpu(unsigned int cpu)
{
struct rcu_state *rsp;
@@ -3968,6 +3873,10 @@ int rcutree_dead_cpu(unsigned int cpu)
* incoming CPUs are not allowed to use RCU read-side critical sections
* until this function is called. Failing to observe this restriction
* will result in lockdep splats.
+ *
+ * Note that this function is special in that it is invoked directly
+ * from the incoming CPU rather than from the cpuhp_step mechanism.
+ * This is because this function must be invoked at a precise location.
*/
void rcu_cpu_starting(unsigned int cpu)
{
@@ -3993,9 +3902,6 @@ void rcu_cpu_starting(unsigned int cpu)
* The CPU is exiting the idle loop into the arch_cpu_idle_dead()
* function. We now remove it from the rcu_node tree's ->qsmaskinit
* bit masks.
- * The CPU is exiting the idle loop into the arch_cpu_idle_dead()
- * function. We now remove it from the rcu_node tree's ->qsmaskinit
- * bit masks.
*/
static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp)
{
@@ -4011,6 +3917,14 @@ static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp)
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
}
+/*
+ * The outgoing CPU has no further need of RCU, so remove it from
+ * the list of CPUs that RCU must track.
+ *
+ * Note that this function is special in that it is invoked directly
+ * from the outgoing CPU rather than from the cpuhp_step mechanism.
+ * This is because this function must be invoked at a precise location.
+ */
void rcu_report_dead(unsigned int cpu)
{
struct rcu_state *rsp;
@@ -4025,6 +3939,10 @@ void rcu_report_dead(unsigned int cpu)
}
#endif
+/*
+ * On non-huge systems, use expedited RCU grace periods to make suspend
+ * and hibernation run faster.
+ */
static int rcu_pm_notify(struct notifier_block *self,
unsigned long action, void *hcpu)
{
@@ -4095,7 +4013,7 @@ early_initcall(rcu_spawn_gp_kthread);
* task is booting the system, and such primitives are no-ops). After this
* function is called, any synchronous grace-period primitives are run as
* expedited, with the requesting task driving the grace period forward.
- * A later core_initcall() rcu_exp_runtime_mode() will switch to full
+ * A later core_initcall() rcu_set_runtime_mode() will switch to full
* runtime RCU functionality.
*/
void rcu_scheduler_starting(void)
@@ -4108,31 +4026,6 @@ void rcu_scheduler_starting(void)
}
/*
- * Compute the per-level fanout, either using the exact fanout specified
- * or balancing the tree, depending on the rcu_fanout_exact boot parameter.
- */
-static void __init rcu_init_levelspread(int *levelspread, const int *levelcnt)
-{
- int i;
-
- if (rcu_fanout_exact) {
- levelspread[rcu_num_lvls - 1] = rcu_fanout_leaf;
- for (i = rcu_num_lvls - 2; i >= 0; i--)
- levelspread[i] = RCU_FANOUT;
- } else {
- int ccur;
- int cprv;
-
- cprv = nr_cpu_ids;
- for (i = rcu_num_lvls - 1; i >= 0; i--) {
- ccur = levelcnt[i];
- levelspread[i] = (cprv + ccur - 1) / ccur;
- cprv = ccur;
- }
- }
-}
-
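
rcu_init_levelspread() is removed from tree.c while still being called below, so it is presumably now provided by shared code (rcu.h in this series). A worked instance of the balanced branch it implements, assuming nr_cpu_ids == 96 and a two-level tree with levelcnt[] == { 1, 6 }:

/*
 *   i == 1:  levelspread[1] = DIV_ROUND_UP(96, 6) == 16  CPUs per leaf node
 *   i == 0:  levelspread[0] = DIV_ROUND_UP(6, 1)  ==  6  leaves under the root
 *
 * With rcu_fanout_exact set, the leaf level instead gets rcu_fanout_leaf
 * and all interior levels get RCU_FANOUT, regardless of how many CPUs
 * are actually present.
 */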
-/*
* Helper function for rcu_init() that initializes one rcu_state structure.
*/
static void __init rcu_init_one(struct rcu_state *rsp)
@@ -4141,9 +4034,7 @@ static void __init rcu_init_one(struct rcu_state *rsp)
static const char * const fqs[] = RCU_FQS_NAME_INIT;
static struct lock_class_key rcu_node_class[RCU_NUM_LVLS];
static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS];
- static u8 fl_mask = 0x1;
- int levelcnt[RCU_NUM_LVLS]; /* # nodes in each level. */
int levelspread[RCU_NUM_LVLS]; /* kids/node in each level. */
int cpustride = 1;
int i;
@@ -4158,20 +4049,16 @@ static void __init rcu_init_one(struct rcu_state *rsp)
/* Initialize the level-tracking arrays. */
- for (i = 0; i < rcu_num_lvls; i++)
- levelcnt[i] = num_rcu_lvl[i];
for (i = 1; i < rcu_num_lvls; i++)
- rsp->level[i] = rsp->level[i - 1] + levelcnt[i - 1];
- rcu_init_levelspread(levelspread, levelcnt);
- rsp->flavor_mask = fl_mask;
- fl_mask <<= 1;
+ rsp->level[i] = rsp->level[i - 1] + num_rcu_lvl[i - 1];
+ rcu_init_levelspread(levelspread, num_rcu_lvl);
/* Initialize the elements themselves, starting from the leaves. */
for (i = rcu_num_lvls - 1; i >= 0; i--) {
cpustride *= levelspread[i];
rnp = rsp->level[i];
- for (j = 0; j < levelcnt[i]; j++, rnp++) {
+ for (j = 0; j < num_rcu_lvl[i]; j++, rnp++) {
raw_spin_lock_init(&ACCESS_PRIVATE(rnp, lock));
lockdep_set_class_and_name(&ACCESS_PRIVATE(rnp, lock),
&rcu_node_class[i], buf[i]);
@@ -4344,6 +4231,8 @@ void __init rcu_init(void)
for_each_online_cpu(cpu) {
rcutree_prepare_cpu(cpu);
rcu_cpu_starting(cpu);
+ if (IS_ENABLED(CONFIG_TREE_SRCU))
+ srcu_online_cpu(cpu);
}
}
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index ec62a05bfdb3..9af0f31d6847 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -30,80 +30,9 @@
#include <linux/seqlock.h>
#include <linux/swait.h>
#include <linux/stop_machine.h>
+#include <linux/rcu_node_tree.h>
-/*
- * Define shape of hierarchy based on NR_CPUS, CONFIG_RCU_FANOUT, and
- * CONFIG_RCU_FANOUT_LEAF.
- * In theory, it should be possible to add more levels straightforwardly.
- * In practice, this did work well going from three levels to four.
- * Of course, your mileage may vary.
- */
-
-#ifdef CONFIG_RCU_FANOUT
-#define RCU_FANOUT CONFIG_RCU_FANOUT
-#else /* #ifdef CONFIG_RCU_FANOUT */
-# ifdef CONFIG_64BIT
-# define RCU_FANOUT 64
-# else
-# define RCU_FANOUT 32
-# endif
-#endif /* #else #ifdef CONFIG_RCU_FANOUT */
-
-#ifdef CONFIG_RCU_FANOUT_LEAF
-#define RCU_FANOUT_LEAF CONFIG_RCU_FANOUT_LEAF
-#else /* #ifdef CONFIG_RCU_FANOUT_LEAF */
-# ifdef CONFIG_64BIT
-# define RCU_FANOUT_LEAF 64
-# else
-# define RCU_FANOUT_LEAF 32
-# endif
-#endif /* #else #ifdef CONFIG_RCU_FANOUT_LEAF */
-
-#define RCU_FANOUT_1 (RCU_FANOUT_LEAF)
-#define RCU_FANOUT_2 (RCU_FANOUT_1 * RCU_FANOUT)
-#define RCU_FANOUT_3 (RCU_FANOUT_2 * RCU_FANOUT)
-#define RCU_FANOUT_4 (RCU_FANOUT_3 * RCU_FANOUT)
-
-#if NR_CPUS <= RCU_FANOUT_1
-# define RCU_NUM_LVLS 1
-# define NUM_RCU_LVL_0 1
-# define NUM_RCU_NODES NUM_RCU_LVL_0
-# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0 }
-# define RCU_NODE_NAME_INIT { "rcu_node_0" }
-# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0" }
-#elif NR_CPUS <= RCU_FANOUT_2
-# define RCU_NUM_LVLS 2
-# define NUM_RCU_LVL_0 1
-# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
-# define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1)
-# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1 }
-# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1" }
-# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1" }
-#elif NR_CPUS <= RCU_FANOUT_3
-# define RCU_NUM_LVLS 3
-# define NUM_RCU_LVL_0 1
-# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2)
-# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
-# define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2)
-# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2 }
-# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2" }
-# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2" }
-#elif NR_CPUS <= RCU_FANOUT_4
-# define RCU_NUM_LVLS 4
-# define NUM_RCU_LVL_0 1
-# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_3)
-# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2)
-# define NUM_RCU_LVL_3 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
-# define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3)
-# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2, NUM_RCU_LVL_3 }
-# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2", "rcu_node_3" }
-# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2", "rcu_node_fqs_3" }
-#else
-# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS"
-#endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */
-
-extern int rcu_num_lvls;
-extern int rcu_num_nodes;
+#include "rcu_segcblist.h"
/*
* Dynticks per-CPU state.
@@ -113,14 +42,9 @@ struct rcu_dynticks {
/* Process level is worth LLONG_MAX/2. */
int dynticks_nmi_nesting; /* Track NMI nesting level. */
atomic_t dynticks; /* Even value for idle, else odd. */
-#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
- long long dynticks_idle_nesting;
- /* irq/process nesting level from idle. */
- atomic_t dynticks_idle; /* Even value for idle, else odd. */
- /* "Idle" excludes userspace execution. */
- unsigned long dynticks_idle_jiffies;
- /* End of last non-NMI non-idle period. */
-#endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
+ bool rcu_need_heavy_qs; /* GP old, need heavy quiescent state. */
+ unsigned long rcu_qs_ctr; /* Light universal quiescent state ctr. */
+ bool rcu_urgent_qs; /* GP old, need light quiescent state. */
#ifdef CONFIG_RCU_FAST_NO_HZ
bool all_lazy; /* Are all CPU's CBs lazy? */
unsigned long nonlazy_posted;
@@ -228,19 +152,6 @@ struct rcu_node {
/* Number of tasks boosted for expedited GP. */
unsigned long n_normal_boosts;
/* Number of tasks boosted for normal GP. */
- unsigned long n_balk_blkd_tasks;
- /* Refused to boost: no blocked tasks. */
- unsigned long n_balk_exp_gp_tasks;
- /* Refused to boost: nothing blocking GP. */
- unsigned long n_balk_boost_tasks;
- /* Refused to boost: already boosting. */
- unsigned long n_balk_notblocked;
- /* Refused to boost: RCU RS CS still running. */
- unsigned long n_balk_notyet;
- /* Refused to boost: not yet time. */
- unsigned long n_balk_nos;
- /* Refused to boost: not sure why, though. */
- /* This can happen due to race conditions. */
#ifdef CONFIG_RCU_NOCB_CPU
struct swait_queue_head nocb_gp_wq[2];
/* Place for rcu_nocb_kthread() to wait GP. */
@@ -262,41 +173,6 @@ struct rcu_node {
#define leaf_node_cpu_bit(rnp, cpu) (1UL << ((cpu) - (rnp)->grplo))
/*
- * Do a full breadth-first scan of the rcu_node structures for the
- * specified rcu_state structure.
- */
-#define rcu_for_each_node_breadth_first(rsp, rnp) \
- for ((rnp) = &(rsp)->node[0]; \
- (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++)
-
-/*
- * Do a breadth-first scan of the non-leaf rcu_node structures for the
- * specified rcu_state structure. Note that if there is a singleton
- * rcu_node tree with but one rcu_node structure, this loop is a no-op.
- */
-#define rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) \
- for ((rnp) = &(rsp)->node[0]; \
- (rnp) < (rsp)->level[rcu_num_lvls - 1]; (rnp)++)
-
-/*
- * Scan the leaves of the rcu_node hierarchy for the specified rcu_state
- * structure. Note that if there is a singleton rcu_node tree with but
- * one rcu_node structure, this loop -will- visit the rcu_node structure.
- * It is still a leaf node, even if it is also the root node.
- */
-#define rcu_for_each_leaf_node(rsp, rnp) \
- for ((rnp) = (rsp)->level[rcu_num_lvls - 1]; \
- (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++)
-
-/*
- * Iterate over all possible CPUs in a leaf RCU node.
- */
-#define for_each_leaf_node_possible_cpu(rnp, cpu) \
- for ((cpu) = cpumask_next(rnp->grplo - 1, cpu_possible_mask); \
- cpu <= rnp->grphi; \
- cpu = cpumask_next((cpu), cpu_possible_mask))
-
-/*
* Union to allow "aggregate OR" operation on the need for a quiescent
* state by the normal and expedited grace periods.
*/
@@ -336,34 +212,9 @@ struct rcu_data {
/* period it is aware of. */
/* 2) batch handling */
- /*
- * If nxtlist is not NULL, it is partitioned as follows.
- * Any of the partitions might be empty, in which case the
- * pointer to that partition will be equal to the pointer for
- * the following partition. When the list is empty, all of
- * the nxttail elements point to the ->nxtlist pointer itself,
- * which in that case is NULL.
- *
- * [nxtlist, *nxttail[RCU_DONE_TAIL]):
- * Entries that batch # <= ->completed
- * The grace period for these entries has completed, and
- * the other grace-period-completed entries may be moved
- * here temporarily in rcu_process_callbacks().
- * [*nxttail[RCU_DONE_TAIL], *nxttail[RCU_WAIT_TAIL]):
- * Entries that batch # <= ->completed - 1: waiting for current GP
- * [*nxttail[RCU_WAIT_TAIL], *nxttail[RCU_NEXT_READY_TAIL]):
- * Entries known to have arrived before current GP ended
- * [*nxttail[RCU_NEXT_READY_TAIL], *nxttail[RCU_NEXT_TAIL]):
- * Entries that might have arrived after current GP ended
- * Note that the value of *nxttail[RCU_NEXT_TAIL] will
- * always be NULL, as this is the end of the list.
- */
- struct rcu_head *nxtlist;
- struct rcu_head **nxttail[RCU_NEXT_SIZE];
- unsigned long nxtcompleted[RCU_NEXT_SIZE];
- /* grace periods for sublists. */
- long qlen_lazy; /* # of lazy queued callbacks */
- long qlen; /* # of queued callbacks, incl lazy */
+ struct rcu_segcblist cblist; /* Segmented callback list, with */
+ /* different callbacks waiting for */
+ /* different grace periods. */
long qlen_last_fqs_check;
/* qlen at last check for QS forcing */
unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */
@@ -440,9 +291,9 @@ struct rcu_data {
};
/* Values for nocb_defer_wakeup field in struct rcu_data. */
-#define RCU_NOGP_WAKE_NOT 0
-#define RCU_NOGP_WAKE 1
-#define RCU_NOGP_WAKE_FORCE 2
+#define RCU_NOCB_WAKE_NOT 0
+#define RCU_NOCB_WAKE 1
+#define RCU_NOCB_WAKE_FORCE 2
#define RCU_JIFFIES_TILL_FORCE_QS (1 + (HZ > 250) + (HZ > 500))
/* For jiffies_till_first_fqs and */
@@ -482,7 +333,6 @@ struct rcu_state {
struct rcu_node *level[RCU_NUM_LVLS + 1];
/* Hierarchy levels (+1 to */
/* shut bogus gcc warning) */
- u8 flavor_mask; /* bit in flavor mask. */
struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */
call_rcu_func_t call; /* call_rcu() flavor. */
int ncpus; /* # CPUs seen so far. */
@@ -502,14 +352,11 @@ struct rcu_state {
raw_spinlock_t orphan_lock ____cacheline_internodealigned_in_smp;
/* Protect following fields. */
- struct rcu_head *orphan_nxtlist; /* Orphaned callbacks that */
+ struct rcu_cblist orphan_pend; /* Orphaned callbacks that */
/* need a grace period. */
- struct rcu_head **orphan_nxttail; /* Tail of above. */
- struct rcu_head *orphan_donelist; /* Orphaned callbacks that */
+ struct rcu_cblist orphan_done; /* Orphaned callbacks that */
/* are ready to invoke. */
- struct rcu_head **orphan_donetail; /* Tail of above. */
- long qlen_lazy; /* Number of lazy callbacks. */
- long qlen; /* Total number of callbacks. */
+ /* (Contains counts.) */
/* End of fields guarded by orphan_lock. */
struct mutex barrier_mutex; /* Guards barrier fields. */
@@ -596,6 +443,7 @@ extern struct rcu_state rcu_preempt_state;
#endif /* #ifdef CONFIG_PREEMPT_RCU */
int rcu_dynticks_snap(struct rcu_dynticks *rdtp);
+bool rcu_eqs_special_set(int cpu);
#ifdef CONFIG_RCU_BOOST
DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
@@ -608,7 +456,7 @@ DECLARE_PER_CPU(char, rcu_cpu_has_work);
/* Forward declarations for rcutree_plugin.h */
static void rcu_bootup_announce(void);
-static void rcu_preempt_note_context_switch(void);
+static void rcu_preempt_note_context_switch(bool preempt);
static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp);
#ifdef CONFIG_HOTPLUG_CPU
static bool rcu_preempt_has_tasks(struct rcu_node *rnp);
@@ -660,89 +508,17 @@ static void __init rcu_organize_nocb_kthreads(struct rcu_state *rsp);
#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
static void __maybe_unused rcu_kick_nohz_cpu(int cpu);
static bool init_nocb_callback_list(struct rcu_data *rdp);
-static void rcu_sysidle_enter(int irq);
-static void rcu_sysidle_exit(int irq);
-static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle,
- unsigned long *maxj);
-static bool is_sysidle_rcu_state(struct rcu_state *rsp);
-static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle,
- unsigned long maxj);
static void rcu_bind_gp_kthread(void);
-static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp);
static bool rcu_nohz_full_cpu(struct rcu_state *rsp);
static void rcu_dynticks_task_enter(void);
static void rcu_dynticks_task_exit(void);
-#endif /* #ifndef RCU_TREE_NONCORE */
+#ifdef CONFIG_SRCU
+void srcu_online_cpu(unsigned int cpu);
+void srcu_offline_cpu(unsigned int cpu);
+#else /* #ifdef CONFIG_SRCU */
+void srcu_online_cpu(unsigned int cpu) { }
+void srcu_offline_cpu(unsigned int cpu) { }
+#endif /* #else #ifdef CONFIG_SRCU */
-#ifdef CONFIG_RCU_TRACE
-/* Read out queue lengths for tracing. */
-static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll)
-{
-#ifdef CONFIG_RCU_NOCB_CPU
- *ql = atomic_long_read(&rdp->nocb_q_count);
- *qll = atomic_long_read(&rdp->nocb_q_count_lazy);
-#else /* #ifdef CONFIG_RCU_NOCB_CPU */
- *ql = 0;
- *qll = 0;
-#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
-}
-#endif /* #ifdef CONFIG_RCU_TRACE */
-
-/*
- * Wrappers for the rcu_node::lock acquire and release.
- *
- * Because the rcu_nodes form a tree, the tree traversal locking will observe
- * different lock values, this in turn means that an UNLOCK of one level
- * followed by a LOCK of another level does not imply a full memory barrier;
- * and most importantly transitivity is lost.
- *
- * In order to restore full ordering between tree levels, augment the regular
- * lock acquire functions with smp_mb__after_unlock_lock().
- *
- * As ->lock of struct rcu_node is a __private field, therefore one should use
- * these wrappers rather than directly call raw_spin_{lock,unlock}* on ->lock.
- */
-static inline void raw_spin_lock_rcu_node(struct rcu_node *rnp)
-{
- raw_spin_lock(&ACCESS_PRIVATE(rnp, lock));
- smp_mb__after_unlock_lock();
-}
-
-static inline void raw_spin_unlock_rcu_node(struct rcu_node *rnp)
-{
- raw_spin_unlock(&ACCESS_PRIVATE(rnp, lock));
-}
-
-static inline void raw_spin_lock_irq_rcu_node(struct rcu_node *rnp)
-{
- raw_spin_lock_irq(&ACCESS_PRIVATE(rnp, lock));
- smp_mb__after_unlock_lock();
-}
-
-static inline void raw_spin_unlock_irq_rcu_node(struct rcu_node *rnp)
-{
- raw_spin_unlock_irq(&ACCESS_PRIVATE(rnp, lock));
-}
-
-#define raw_spin_lock_irqsave_rcu_node(rnp, flags) \
-do { \
- typecheck(unsigned long, flags); \
- raw_spin_lock_irqsave(&ACCESS_PRIVATE(rnp, lock), flags); \
- smp_mb__after_unlock_lock(); \
-} while (0)
-
-#define raw_spin_unlock_irqrestore_rcu_node(rnp, flags) \
-do { \
- typecheck(unsigned long, flags); \
- raw_spin_unlock_irqrestore(&ACCESS_PRIVATE(rnp, lock), flags); \
-} while (0)
-
-static inline bool raw_spin_trylock_rcu_node(struct rcu_node *rnp)
-{
- bool locked = raw_spin_trylock(&ACCESS_PRIVATE(rnp, lock));
-
- if (locked)
- smp_mb__after_unlock_lock();
- return locked;
-}
+#endif /* #ifndef RCU_TREE_NONCORE */
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index a7b639ccd46e..dd21ca47e4b4 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -147,7 +147,7 @@ static void __maybe_unused sync_exp_reset_tree(struct rcu_state *rsp)
*
* Caller must hold the rcu_state's exp_mutex.
*/
-static int sync_rcu_preempt_exp_done(struct rcu_node *rnp)
+static bool sync_rcu_preempt_exp_done(struct rcu_node *rnp)
{
return rnp->exp_tasks == NULL &&
READ_ONCE(rnp->expmask) == 0;
@@ -292,7 +292,7 @@ static bool exp_funnel_lock(struct rcu_state *rsp, unsigned long s)
trace_rcu_exp_funnel_lock(rsp->name, rnp->level,
rnp->grplo, rnp->grphi,
TPS("wait"));
- wait_event(rnp->exp_wq[(s >> 1) & 0x3],
+ wait_event(rnp->exp_wq[rcu_seq_ctr(s) & 0x3],
sync_exp_work_done(rsp,
&rdp->exp_workdone2, s));
return true;
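
The switch from open-coded "(s >> 1) & 0x3" to rcu_seq_ctr(s) & 0x3 does not change the arithmetic, only where it is defined; assuming rcu_seq_ctr() simply discards the in-progress bit, successive expedited grace periods rotate through the four per-node wait queues:

/*
 *   snapshot s    rcu_seq_ctr(s)    exp_wq[] index (ctr & 0x3)
 *        4              2                    2
 *        6              3                    3
 *        8              4                    0   (wraps around)
 *
 * Spreading waiters over four queues keeps each wake_up_all() cheap
 * without needing one queue per possible outstanding grace period.
 */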
@@ -331,6 +331,8 @@ static void sync_sched_exp_handler(void *data)
return;
}
__this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, true);
+ /* Store .exp before .rcu_urgent_qs. */
+ smp_store_release(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs), true);
resched_cpu(smp_processor_id());
}
@@ -531,7 +533,8 @@ static void rcu_exp_wait_wake(struct rcu_state *rsp, unsigned long s)
rnp->exp_seq_rq = s;
spin_unlock(&rnp->exp_lock);
}
- wake_up_all(&rnp->exp_wq[(rsp->expedited_sequence >> 1) & 0x3]);
+ smp_mb(); /* All above changes before wakeup. */
+ wake_up_all(&rnp->exp_wq[rcu_seq_ctr(rsp->expedited_sequence) & 0x3]);
}
trace_rcu_exp_grace_period(rsp->name, s, TPS("endwake"));
mutex_unlock(&rsp->exp_wake_mutex);
@@ -609,9 +612,9 @@ static void _synchronize_rcu_expedited(struct rcu_state *rsp,
/* Wait for expedited grace period to complete. */
rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id());
rnp = rcu_get_root(rsp);
- wait_event(rnp->exp_wq[(s >> 1) & 0x3],
- sync_exp_work_done(rsp,
- &rdp->exp_workdone0, s));
+ wait_event(rnp->exp_wq[rcu_seq_ctr(s) & 0x3],
+ sync_exp_work_done(rsp, &rdp->exp_workdone0, s));
+ smp_mb(); /* Workqueue actions happen before return. */
/* Let the next expedited grace period start. */
mutex_unlock(&rsp->exp_mutex);
@@ -735,15 +738,3 @@ void synchronize_rcu_expedited(void)
EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
-
-/*
- * Switch to run-time mode once Tree RCU has fully initialized.
- */
-static int __init rcu_exp_runtime_mode(void)
-{
- rcu_test_sync_prims();
- rcu_scheduler_active = RCU_SCHEDULER_RUNNING;
- rcu_test_sync_prims();
- return 0;
-}
-core_initcall(rcu_exp_runtime_mode);
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 0a62a8f1caac..908b309d60d7 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -70,7 +70,7 @@ static bool __read_mostly rcu_nocb_poll; /* Offload kthread are to poll. */
static void __init rcu_bootup_announce_oddness(void)
{
if (IS_ENABLED(CONFIG_RCU_TRACE))
- pr_info("\tRCU debugfs-based tracing is enabled.\n");
+ pr_info("\tRCU event tracing is enabled.\n");
if ((IS_ENABLED(CONFIG_64BIT) && RCU_FANOUT != 64) ||
(!IS_ENABLED(CONFIG_64BIT) && RCU_FANOUT != 32))
pr_info("\tCONFIG_RCU_FANOUT set to non-default value of %d\n",
@@ -90,8 +90,32 @@ static void __init rcu_bootup_announce_oddness(void)
pr_info("\tBoot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf);
if (nr_cpu_ids != NR_CPUS)
pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids);
- if (IS_ENABLED(CONFIG_RCU_BOOST))
- pr_info("\tRCU kthread priority: %d.\n", kthread_prio);
+#ifdef CONFIG_RCU_BOOST
+ pr_info("\tRCU priority boosting: priority %d delay %d ms.\n", kthread_prio, CONFIG_RCU_BOOST_DELAY);
+#endif
+ if (blimit != DEFAULT_RCU_BLIMIT)
+ pr_info("\tBoot-time adjustment of callback invocation limit to %ld.\n", blimit);
+ if (qhimark != DEFAULT_RCU_QHIMARK)
+ pr_info("\tBoot-time adjustment of callback high-water mark to %ld.\n", qhimark);
+ if (qlowmark != DEFAULT_RCU_QLOMARK)
+ pr_info("\tBoot-time adjustment of callback low-water mark to %ld.\n", qlowmark);
+ if (jiffies_till_first_fqs != ULONG_MAX)
+ pr_info("\tBoot-time adjustment of first FQS scan delay to %ld jiffies.\n", jiffies_till_first_fqs);
+ if (jiffies_till_next_fqs != ULONG_MAX)
+ pr_info("\tBoot-time adjustment of subsequent FQS scan delay to %ld jiffies.\n", jiffies_till_next_fqs);
+ if (rcu_kick_kthreads)
+ pr_info("\tKick kthreads if too-long grace period.\n");
+ if (IS_ENABLED(CONFIG_DEBUG_OBJECTS_RCU_HEAD))
+ pr_info("\tRCU callback double-/use-after-free debug enabled.\n");
+ if (gp_preinit_delay)
+ pr_info("\tRCU debug GP pre-init slowdown %d jiffies.\n", gp_preinit_delay);
+ if (gp_init_delay)
+ pr_info("\tRCU debug GP init slowdown %d jiffies.\n", gp_init_delay);
+ if (gp_cleanup_delay)
+ pr_info("\tRCU debug GP init slowdown %d jiffies.\n", gp_cleanup_delay);
+ if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG))
+ pr_info("\tRCU debug extended QS entry/exit.\n");
+ rcupdate_announce_bootup_oddness();
}
#ifdef CONFIG_PREEMPT_RCU
@@ -155,6 +179,8 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp)
(rnp->expmask & rdp->grpmask ? RCU_EXP_BLKD : 0);
struct task_struct *t = current;
+ lockdep_assert_held(&rnp->lock);
+
/*
* Decide where to queue the newly blocked task. In theory,
* this could be an if-statement. In practice, when I tried
@@ -263,6 +289,7 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp)
*/
static void rcu_preempt_qs(void)
{
+ RCU_LOCKDEP_WARN(preemptible(), "rcu_preempt_qs() invoked with preemption enabled!!!\n");
if (__this_cpu_read(rcu_data_p->cpu_no_qs.s)) {
trace_rcu_grace_period(TPS("rcu_preempt"),
__this_cpu_read(rcu_data_p->gpnum),
@@ -286,12 +313,14 @@ static void rcu_preempt_qs(void)
*
* Caller must disable interrupts.
*/
-static void rcu_preempt_note_context_switch(void)
+static void rcu_preempt_note_context_switch(bool preempt)
{
struct task_struct *t = current;
struct rcu_data *rdp;
struct rcu_node *rnp;
+ RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_preempt_note_context_switch() invoked with interrupts enabled!!!\n");
+ WARN_ON_ONCE(!preempt && t->rcu_read_lock_nesting > 0);
if (t->rcu_read_lock_nesting > 0 &&
!t->rcu_read_unlock_special.b.blocked) {
@@ -607,6 +636,7 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp)
*/
static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
{
+ RCU_LOCKDEP_WARN(preemptible(), "rcu_preempt_check_blocked_tasks() invoked with preemption enabled!!!\n");
WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp));
if (rcu_preempt_has_tasks(rnp))
rnp->gp_tasks = rnp->blkd_tasks.next;
@@ -643,8 +673,37 @@ static void rcu_preempt_do_callbacks(void)
#endif /* #ifdef CONFIG_RCU_BOOST */
-/*
- * Queue a preemptible-RCU callback for invocation after a grace period.
+/**
+ * call_rcu() - Queue an RCU callback for invocation after a grace period.
+ * @head: structure to be used for queueing the RCU updates.
+ * @func: actual callback function to be invoked after the grace period
+ *
+ * The callback function will be invoked some time after a full grace
+ * period elapses, in other words after all pre-existing RCU read-side
+ * critical sections have completed. However, the callback function
+ * might well execute concurrently with RCU read-side critical sections
+ * that started after call_rcu() was invoked. RCU read-side critical
+ * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
+ * and may be nested.
+ *
+ * Note that all CPUs must agree that the grace period extended beyond
+ * all pre-existing RCU read-side critical sections. On systems with more
+ * than one CPU, this means that when "func()" is invoked, each CPU is
+ * guaranteed to have executed a full memory barrier since the end of its
+ * last RCU read-side critical section whose beginning preceded the call
+ * to call_rcu(). It also means that each CPU executing an RCU read-side
+ * critical section that continues beyond the start of "func()" must have
+ * executed a memory barrier after the call_rcu() but before the beginning
+ * of that RCU read-side critical section. Note that these guarantees
+ * include CPUs that are offline, idle, or executing in user mode, as
+ * well as CPUs that are executing in the kernel.
+ *
+ * Furthermore, if CPU A invoked call_rcu() and CPU B invoked the
+ * resulting RCU callback function "func()", then both CPU A and CPU B are
+ * guaranteed to execute a full memory barrier during the time interval
+ * between the call to call_rcu() and the invocation of "func()" -- even
+ * if CPU A and CPU B are the same CPU (but again only if the system has
+ * more than one CPU).
*/
void call_rcu(struct rcu_head *head, rcu_callback_t func)
{
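
Purely as a usage illustration (struct item and its helpers are invented for this sketch, not part of the patch): a common pattern unlinks an element under the writer's lock, then hands it to call_rcu() so it is freed only after all pre-existing rcu_read_lock() readers are done with it:

#include <linux/rculist.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct item {
	int key;
	struct list_head node;
	struct rcu_head rh;
};

static LIST_HEAD(item_list);
static DEFINE_SPINLOCK(item_lock);

static void item_free_cb(struct rcu_head *rhp)
{
	kfree(container_of(rhp, struct item, rh));
}

static void item_remove(struct item *it)
{
	spin_lock(&item_lock);
	list_del_rcu(&it->node);		/* readers may still see it... */
	spin_unlock(&item_lock);
	call_rcu(&it->rh, item_free_cb);	/* ...until a grace period ends */
}

static bool item_present(int key)
{
	struct item *it;
	bool found = false;

	rcu_read_lock();
	list_for_each_entry_rcu(it, &item_list, node)
		if (it->key == key) {
			found = true;
			break;
		}
	rcu_read_unlock();
	return found;
}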
@@ -663,8 +722,13 @@ EXPORT_SYMBOL_GPL(call_rcu);
* synchronize_rcu() was waiting. RCU read-side critical sections are
* delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested.
*
- * See the description of synchronize_sched() for more detailed information
- * on memory ordering guarantees.
+ * See the description of synchronize_sched() for more detailed
+ * information on memory-ordering guarantees. However, please note
+ * that -only- the memory-ordering guarantees apply. For example,
+ * synchronize_rcu() is -not- guaranteed to wait on things like code
+ * protected by preempt_disable(), instead, synchronize_rcu() is -only-
+ * guaranteed to wait on RCU read-side critical sections, that is, sections
+ * of code protected by rcu_read_lock().
*/
void synchronize_rcu(void)
{
@@ -738,7 +802,7 @@ static void __init rcu_bootup_announce(void)
* Because preemptible RCU does not exist, we never have to check for
* CPUs being in quiescent states.
*/
-static void rcu_preempt_note_context_switch(void)
+static void rcu_preempt_note_context_switch(bool preempt)
{
}
@@ -835,33 +899,6 @@ void exit_rcu(void)
#include "../locking/rtmutex_common.h"
-#ifdef CONFIG_RCU_TRACE
-
-static void rcu_initiate_boost_trace(struct rcu_node *rnp)
-{
- if (!rcu_preempt_has_tasks(rnp))
- rnp->n_balk_blkd_tasks++;
- else if (rnp->exp_tasks == NULL && rnp->gp_tasks == NULL)
- rnp->n_balk_exp_gp_tasks++;
- else if (rnp->gp_tasks != NULL && rnp->boost_tasks != NULL)
- rnp->n_balk_boost_tasks++;
- else if (rnp->gp_tasks != NULL && rnp->qsmask != 0)
- rnp->n_balk_notblocked++;
- else if (rnp->gp_tasks != NULL &&
- ULONG_CMP_LT(jiffies, rnp->boost_time))
- rnp->n_balk_notyet++;
- else
- rnp->n_balk_nos++;
-}
-
-#else /* #ifdef CONFIG_RCU_TRACE */
-
-static void rcu_initiate_boost_trace(struct rcu_node *rnp)
-{
-}
-
-#endif /* #else #ifdef CONFIG_RCU_TRACE */
-
static void rcu_wake_cond(struct task_struct *t, int status)
{
/*
@@ -992,8 +1029,8 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
{
struct task_struct *t;
+ lockdep_assert_held(&rnp->lock);
if (!rcu_preempt_blocked_readers_cgp(rnp) && rnp->exp_tasks == NULL) {
- rnp->n_balk_exp_gp_tasks++;
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
return;
}
@@ -1009,7 +1046,6 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
if (t)
rcu_wake_cond(t, rnp->boost_kthread_status);
} else {
- rcu_initiate_boost_trace(rnp);
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
}
}
@@ -1260,8 +1296,7 @@ static void rcu_prepare_kthreads(int cpu)
int rcu_needs_cpu(u64 basemono, u64 *nextevt)
{
*nextevt = KTIME_MAX;
- return IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL)
- ? 0 : rcu_cpu_has_callbacks(NULL);
+ return rcu_cpu_has_callbacks(NULL);
}
/*
@@ -1350,10 +1385,10 @@ static bool __maybe_unused rcu_try_advance_all_cbs(void)
*/
if ((rdp->completed != rnp->completed ||
unlikely(READ_ONCE(rdp->gpwrap))) &&
- rdp->nxttail[RCU_DONE_TAIL] != rdp->nxttail[RCU_NEXT_TAIL])
+ rcu_segcblist_pend_cbs(&rdp->cblist))
note_gp_changes(rsp, rdp);
- if (cpu_has_callbacks_ready_to_invoke(rdp))
+ if (rcu_segcblist_ready_cbs(&rdp->cblist))
cbs_ready = true;
}
return cbs_ready;
@@ -1372,10 +1407,7 @@ int rcu_needs_cpu(u64 basemono, u64 *nextevt)
struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
unsigned long dj;
- if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL)) {
- *nextevt = KTIME_MAX;
- return 0;
- }
+ RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_needs_cpu() invoked with irqs enabled!!!");
/* Snapshot to detect later posting of non-lazy callback. */
rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted;
@@ -1424,8 +1456,8 @@ static void rcu_prepare_for_idle(void)
struct rcu_state *rsp;
int tne;
- if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL) ||
- rcu_is_nocb_cpu(smp_processor_id()))
+ RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_prepare_for_idle() invoked with irqs enabled!!!");
+ if (rcu_is_nocb_cpu(smp_processor_id()))
return;
/* Handle nohz enablement switches conservatively. */
@@ -1461,7 +1493,7 @@ static void rcu_prepare_for_idle(void)
rdtp->last_accelerate = jiffies;
for_each_rcu_flavor(rsp) {
rdp = this_cpu_ptr(rsp->rda);
- if (!*rdp->nxttail[RCU_DONE_TAIL])
+ if (!rcu_segcblist_pend_cbs(&rdp->cblist))
continue;
rnp = rdp->mynode;
raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
@@ -1479,8 +1511,8 @@ static void rcu_prepare_for_idle(void)
*/
static void rcu_cleanup_after_idle(void)
{
- if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL) ||
- rcu_is_nocb_cpu(smp_processor_id()))
+ RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_cleanup_after_idle() invoked with irqs enabled!!!");
+ if (rcu_is_nocb_cpu(smp_processor_id()))
return;
if (rcu_try_advance_all_cbs())
invoke_rcu_core();
@@ -1529,7 +1561,7 @@ static void rcu_oom_notify_cpu(void *unused)
for_each_rcu_flavor(rsp) {
rdp = raw_cpu_ptr(rsp->rda);
- if (rdp->qlen_lazy != 0) {
+ if (rcu_segcblist_n_lazy_cbs(&rdp->cblist)) {
atomic_inc(&oom_callback_count);
rsp->call(&rdp->oom_head, rcu_oom_callback);
}
@@ -1709,7 +1741,7 @@ __setup("rcu_nocbs=", rcu_nocb_setup);
static int __init parse_rcu_nocb_poll(char *arg)
{
- rcu_nocb_poll = 1;
+ rcu_nocb_poll = true;
return 0;
}
early_param("rcu_nocb_poll", parse_rcu_nocb_poll);
@@ -1747,7 +1779,6 @@ static void rcu_init_one_nocb(struct rcu_node *rnp)
init_swait_queue_head(&rnp->nocb_gp_wq[1]);
}
-#ifndef CONFIG_RCU_NOCB_CPU_ALL
/* Is the specified CPU a no-CBs CPU? */
bool rcu_is_nocb_cpu(int cpu)
{
@@ -1755,7 +1786,6 @@ bool rcu_is_nocb_cpu(int cpu)
return cpumask_test_cpu(cpu, rcu_nocb_mask);
return false;
}
-#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
/*
* Kick the leader kthread for this NOCB group.
@@ -1769,6 +1799,7 @@ static void wake_nocb_leader(struct rcu_data *rdp, bool force)
if (READ_ONCE(rdp_leader->nocb_leader_sleep) || force) {
/* Prior smp_mb__after_atomic() orders against prior enqueue. */
WRITE_ONCE(rdp_leader->nocb_leader_sleep, false);
+ smp_mb(); /* ->nocb_leader_sleep before swake_up(). */
swake_up(&rdp_leader->nocb_wq);
}
}
@@ -1860,7 +1891,9 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
TPS("WakeEmpty"));
} else {
- rdp->nocb_defer_wakeup = RCU_NOGP_WAKE;
+ WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE);
+ /* Store ->nocb_defer_wakeup before ->rcu_urgent_qs. */
+ smp_store_release(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs), true);
trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
TPS("WakeEmptyIsDeferred"));
}
@@ -1872,7 +1905,9 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
TPS("WakeOvf"));
} else {
- rdp->nocb_defer_wakeup = RCU_NOGP_WAKE_FORCE;
+ WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_FORCE);
+ /* Store ->nocb_defer_wakeup before ->rcu_urgent_qs. */
+ smp_store_release(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs), true);
trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
TPS("WakeOvfIsDeferred"));
}
@@ -1930,30 +1965,26 @@ static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
struct rcu_data *rdp,
unsigned long flags)
{
- long ql = rsp->qlen;
- long qll = rsp->qlen_lazy;
+ long ql = rsp->orphan_done.len;
+ long qll = rsp->orphan_done.len_lazy;
/* If this is not a no-CBs CPU, tell the caller to do it the old way. */
if (!rcu_is_nocb_cpu(smp_processor_id()))
return false;
- rsp->qlen = 0;
- rsp->qlen_lazy = 0;
/* First, enqueue the donelist, if any. This preserves CB ordering. */
- if (rsp->orphan_donelist != NULL) {
- __call_rcu_nocb_enqueue(rdp, rsp->orphan_donelist,
- rsp->orphan_donetail, ql, qll, flags);
- ql = qll = 0;
- rsp->orphan_donelist = NULL;
- rsp->orphan_donetail = &rsp->orphan_donelist;
+ if (rsp->orphan_done.head) {
+ __call_rcu_nocb_enqueue(rdp, rcu_cblist_head(&rsp->orphan_done),
+ rcu_cblist_tail(&rsp->orphan_done),
+ ql, qll, flags);
}
- if (rsp->orphan_nxtlist != NULL) {
- __call_rcu_nocb_enqueue(rdp, rsp->orphan_nxtlist,
- rsp->orphan_nxttail, ql, qll, flags);
- ql = qll = 0;
- rsp->orphan_nxtlist = NULL;
- rsp->orphan_nxttail = &rsp->orphan_nxtlist;
+ if (rsp->orphan_pend.head) {
+ __call_rcu_nocb_enqueue(rdp, rcu_cblist_head(&rsp->orphan_pend),
+ rcu_cblist_tail(&rsp->orphan_pend),
+ ql, qll, flags);
}
+ rcu_cblist_init(&rsp->orphan_done);
+ rcu_cblist_init(&rsp->orphan_pend);
return true;
}
@@ -2023,6 +2054,7 @@ wait_again:
* nocb_gp_head, where they await a grace period.
*/
gotcbs = false;
+ smp_mb(); /* wakeup before ->nocb_head reads. */
for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) {
rdp->nocb_gp_head = READ_ONCE(rdp->nocb_head);
if (!rdp->nocb_gp_head)
@@ -2201,8 +2233,8 @@ static void do_nocb_deferred_wakeup(struct rcu_data *rdp)
if (!rcu_nocb_need_deferred_wakeup(rdp))
return;
ndw = READ_ONCE(rdp->nocb_defer_wakeup);
- WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOGP_WAKE_NOT);
- wake_nocb_leader(rdp, ndw == RCU_NOGP_WAKE_FORCE);
+ WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT);
+ wake_nocb_leader(rdp, ndw == RCU_NOCB_WAKE_FORCE);
trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("DeferredWake"));
}
@@ -2212,10 +2244,6 @@ void __init rcu_init_nohz(void)
bool need_rcu_nocb_mask = true;
struct rcu_state *rsp;
-#ifdef CONFIG_RCU_NOCB_CPU_NONE
- need_rcu_nocb_mask = false;
-#endif /* #ifndef CONFIG_RCU_NOCB_CPU_NONE */
-
#if defined(CONFIG_NO_HZ_FULL)
if (tick_nohz_full_running && cpumask_weight(tick_nohz_full_mask))
need_rcu_nocb_mask = true;
@@ -2231,14 +2259,6 @@ void __init rcu_init_nohz(void)
if (!have_rcu_nocb_mask)
return;
-#ifdef CONFIG_RCU_NOCB_CPU_ZERO
- pr_info("\tOffload RCU callbacks from CPU 0\n");
- cpumask_set_cpu(0, rcu_nocb_mask);
-#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ZERO */
-#ifdef CONFIG_RCU_NOCB_CPU_ALL
- pr_info("\tOffload RCU callbacks from all CPUs\n");
- cpumask_copy(rcu_nocb_mask, cpu_possible_mask);
-#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ALL */
#if defined(CONFIG_NO_HZ_FULL)
if (tick_nohz_full_running)
cpumask_or(rcu_nocb_mask, rcu_nocb_mask, tick_nohz_full_mask);
@@ -2395,16 +2415,16 @@ static bool init_nocb_callback_list(struct rcu_data *rdp)
return false;
/* If there are early-boot callbacks, move them to nocb lists. */
- if (rdp->nxtlist) {
- rdp->nocb_head = rdp->nxtlist;
- rdp->nocb_tail = rdp->nxttail[RCU_NEXT_TAIL];
- atomic_long_set(&rdp->nocb_q_count, rdp->qlen);
- atomic_long_set(&rdp->nocb_q_count_lazy, rdp->qlen_lazy);
- rdp->nxtlist = NULL;
- rdp->qlen = 0;
- rdp->qlen_lazy = 0;
+ if (!rcu_segcblist_empty(&rdp->cblist)) {
+ rdp->nocb_head = rcu_segcblist_head(&rdp->cblist);
+ rdp->nocb_tail = rcu_segcblist_tail(&rdp->cblist);
+ atomic_long_set(&rdp->nocb_q_count,
+ rcu_segcblist_n_cbs(&rdp->cblist));
+ atomic_long_set(&rdp->nocb_q_count_lazy,
+ rcu_segcblist_n_lazy_cbs(&rdp->cblist));
+ rcu_segcblist_init(&rdp->cblist);
}
- rdp->nxttail[RCU_NEXT_TAIL] = NULL;
+ rcu_segcblist_disable(&rdp->cblist);
return true;
}
@@ -2491,421 +2511,6 @@ static void __maybe_unused rcu_kick_nohz_cpu(int cpu)
#endif /* #ifdef CONFIG_NO_HZ_FULL */
}
-
-#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
-
-static int full_sysidle_state; /* Current system-idle state. */
-#define RCU_SYSIDLE_NOT 0 /* Some CPU is not idle. */
-#define RCU_SYSIDLE_SHORT 1 /* All CPUs idle for brief period. */
-#define RCU_SYSIDLE_LONG 2 /* All CPUs idle for long enough. */
-#define RCU_SYSIDLE_FULL 3 /* All CPUs idle, ready for sysidle. */
-#define RCU_SYSIDLE_FULL_NOTED 4 /* Actually entered sysidle state. */
-
-/*
- * Invoked to note exit from irq or task transition to idle. Note that
- * usermode execution does -not- count as idle here! After all, we want
- * to detect full-system idle states, not RCU quiescent states and grace
- * periods. The caller must have disabled interrupts.
- */
-static void rcu_sysidle_enter(int irq)
-{
- unsigned long j;
- struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
-
- /* If there are no nohz_full= CPUs, no need to track this. */
- if (!tick_nohz_full_enabled())
- return;
-
- /* Adjust nesting, check for fully idle. */
- if (irq) {
- rdtp->dynticks_idle_nesting--;
- WARN_ON_ONCE(rdtp->dynticks_idle_nesting < 0);
- if (rdtp->dynticks_idle_nesting != 0)
- return; /* Still not fully idle. */
- } else {
- if ((rdtp->dynticks_idle_nesting & DYNTICK_TASK_NEST_MASK) ==
- DYNTICK_TASK_NEST_VALUE) {
- rdtp->dynticks_idle_nesting = 0;
- } else {
- rdtp->dynticks_idle_nesting -= DYNTICK_TASK_NEST_VALUE;
- WARN_ON_ONCE(rdtp->dynticks_idle_nesting < 0);
- return; /* Still not fully idle. */
- }
- }
-
- /* Record start of fully idle period. */
- j = jiffies;
- WRITE_ONCE(rdtp->dynticks_idle_jiffies, j);
- smp_mb__before_atomic();
- atomic_inc(&rdtp->dynticks_idle);
- smp_mb__after_atomic();
- WARN_ON_ONCE(atomic_read(&rdtp->dynticks_idle) & 0x1);
-}
-
-/*
- * Unconditionally force exit from full system-idle state. This is
- * invoked when a normal CPU exits idle, but must be called separately
- * for the timekeeping CPU (tick_do_timer_cpu). The reason for this
- * is that the timekeeping CPU is permitted to take scheduling-clock
- * interrupts while the system is in system-idle state, and of course
- * rcu_sysidle_exit() has no way of distinguishing a scheduling-clock
- * interrupt from any other type of interrupt.
- */
-void rcu_sysidle_force_exit(void)
-{
- int oldstate = READ_ONCE(full_sysidle_state);
- int newoldstate;
-
- /*
- * Each pass through the following loop attempts to exit full
- * system-idle state. If contention proves to be a problem,
- * a trylock-based contention tree could be used here.
- */
- while (oldstate > RCU_SYSIDLE_SHORT) {
- newoldstate = cmpxchg(&full_sysidle_state,
- oldstate, RCU_SYSIDLE_NOT);
- if (oldstate == newoldstate &&
- oldstate == RCU_SYSIDLE_FULL_NOTED) {
- rcu_kick_nohz_cpu(tick_do_timer_cpu);
- return; /* We cleared it, done! */
- }
- oldstate = newoldstate;
- }
- smp_mb(); /* Order initial oldstate fetch vs. later non-idle work. */
-}
-
-/*
- * Invoked to note entry to irq or task transition from idle. Note that
- * usermode execution does -not- count as idle here! The caller must
- * have disabled interrupts.
- */
-static void rcu_sysidle_exit(int irq)
-{
- struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
-
- /* If there are no nohz_full= CPUs, no need to track this. */
- if (!tick_nohz_full_enabled())
- return;
-
- /* Adjust nesting, check for already non-idle. */
- if (irq) {
- rdtp->dynticks_idle_nesting++;
- WARN_ON_ONCE(rdtp->dynticks_idle_nesting <= 0);
- if (rdtp->dynticks_idle_nesting != 1)
- return; /* Already non-idle. */
- } else {
- /*
- * Allow for irq misnesting. Yes, it really is possible
- * to enter an irq handler then never leave it, and maybe
- * also vice versa. Handle both possibilities.
- */
- if (rdtp->dynticks_idle_nesting & DYNTICK_TASK_NEST_MASK) {
- rdtp->dynticks_idle_nesting += DYNTICK_TASK_NEST_VALUE;
- WARN_ON_ONCE(rdtp->dynticks_idle_nesting <= 0);
- return; /* Already non-idle. */
- } else {
- rdtp->dynticks_idle_nesting = DYNTICK_TASK_EXIT_IDLE;
- }
- }
-
- /* Record end of idle period. */
- smp_mb__before_atomic();
- atomic_inc(&rdtp->dynticks_idle);
- smp_mb__after_atomic();
- WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks_idle) & 0x1));
-
- /*
- * If we are the timekeeping CPU, we are permitted to be non-idle
- * during a system-idle state. This must be the case, because
- * the timekeeping CPU has to take scheduling-clock interrupts
- * during the time that the system is transitioning to full
- * system-idle state. This means that the timekeeping CPU must
- * invoke rcu_sysidle_force_exit() directly if it does anything
- * more than take a scheduling-clock interrupt.
- */
- if (smp_processor_id() == tick_do_timer_cpu)
- return;
-
- /* Update system-idle state: We are clearly no longer fully idle! */
- rcu_sysidle_force_exit();
-}
-
-/*
- * Check to see if the current CPU is idle. Note that usermode execution
- * does not count as idle. The caller must have disabled interrupts,
- * and must be running on tick_do_timer_cpu.
- */
-static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle,
- unsigned long *maxj)
-{
- int cur;
- unsigned long j;
- struct rcu_dynticks *rdtp = rdp->dynticks;
-
- /* If there are no nohz_full= CPUs, don't check system-wide idleness. */
- if (!tick_nohz_full_enabled())
- return;
-
- /*
- * If some other CPU has already reported non-idle, if this is
- * not the flavor of RCU that tracks sysidle state, or if this
- * is an offline or the timekeeping CPU, nothing to do.
- */
- if (!*isidle || rdp->rsp != rcu_state_p ||
- cpu_is_offline(rdp->cpu) || rdp->cpu == tick_do_timer_cpu)
- return;
- /* Verify affinity of current kthread. */
- WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu);
-
- /* Pick up current idle and NMI-nesting counter and check. */
- cur = atomic_read(&rdtp->dynticks_idle);
- if (cur & 0x1) {
- *isidle = false; /* We are not idle! */
- return;
- }
- smp_mb(); /* Read counters before timestamps. */
-
- /* Pick up timestamps. */
- j = READ_ONCE(rdtp->dynticks_idle_jiffies);
- /* If this CPU entered idle more recently, update maxj timestamp. */
- if (ULONG_CMP_LT(*maxj, j))
- *maxj = j;
-}
-
-/*
- * Is this the flavor of RCU that is handling full-system idle?
- */
-static bool is_sysidle_rcu_state(struct rcu_state *rsp)
-{
- return rsp == rcu_state_p;
-}
-
-/*
- * Return a delay in jiffies based on the number of CPUs, rcu_node
- * leaf fanout, and jiffies tick rate. The idea is to allow larger
- * systems more time to transition to full-idle state in order to
- * avoid the cache thrashing that otherwise occur on the state variable.
- * Really small systems (less than a couple of tens of CPUs) should
- * instead use a single global atomically incremented counter, and later
- * versions of this will automatically reconfigure themselves accordingly.
- */
-static unsigned long rcu_sysidle_delay(void)
-{
- if (nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL)
- return 0;
- return DIV_ROUND_UP(nr_cpu_ids * HZ, rcu_fanout_leaf * 1000);
-}
-
-/*
- * Advance the full-system-idle state. This is invoked when all of
- * the non-timekeeping CPUs are idle.
- */
-static void rcu_sysidle(unsigned long j)
-{
- /* Check the current state. */
- switch (READ_ONCE(full_sysidle_state)) {
- case RCU_SYSIDLE_NOT:
-
- /* First time all are idle, so note a short idle period. */
- WRITE_ONCE(full_sysidle_state, RCU_SYSIDLE_SHORT);
- break;
-
- case RCU_SYSIDLE_SHORT:
-
- /*
- * Idle for a bit, time to advance to next state?
- * cmpxchg failure means race with non-idle, let them win.
- */
- if (ULONG_CMP_GE(jiffies, j + rcu_sysidle_delay()))
- (void)cmpxchg(&full_sysidle_state,
- RCU_SYSIDLE_SHORT, RCU_SYSIDLE_LONG);
- break;
-
- case RCU_SYSIDLE_LONG:
-
- /*
- * Do an additional check pass before advancing to full.
- * cmpxchg failure means race with non-idle, let them win.
- */
- if (ULONG_CMP_GE(jiffies, j + rcu_sysidle_delay()))
- (void)cmpxchg(&full_sysidle_state,
- RCU_SYSIDLE_LONG, RCU_SYSIDLE_FULL);
- break;
-
- default:
- break;
- }
-}
-
-/*
- * Found a non-idle non-timekeeping CPU, so kick the system-idle state
- * back to the beginning.
- */
-static void rcu_sysidle_cancel(void)
-{
- smp_mb();
- if (full_sysidle_state > RCU_SYSIDLE_SHORT)
- WRITE_ONCE(full_sysidle_state, RCU_SYSIDLE_NOT);
-}
-
-/*
- * Update the sysidle state based on the results of a force-quiescent-state
- * scan of the CPUs' dyntick-idle state.
- */
-static void rcu_sysidle_report(struct rcu_state *rsp, int isidle,
- unsigned long maxj, bool gpkt)
-{
- if (rsp != rcu_state_p)
- return; /* Wrong flavor, ignore. */
- if (gpkt && nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL)
- return; /* Running state machine from timekeeping CPU. */
- if (isidle)
- rcu_sysidle(maxj); /* More idle! */
- else
- rcu_sysidle_cancel(); /* Idle is over. */
-}
-
-/*
- * Wrapper for rcu_sysidle_report() when called from the grace-period
- * kthread's context.
- */
-static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle,
- unsigned long maxj)
-{
- /* If there are no nohz_full= CPUs, no need to track this. */
- if (!tick_nohz_full_enabled())
- return;
-
- rcu_sysidle_report(rsp, isidle, maxj, true);
-}
-
-/* Callback and function for forcing an RCU grace period. */
-struct rcu_sysidle_head {
- struct rcu_head rh;
- int inuse;
-};
-
-static void rcu_sysidle_cb(struct rcu_head *rhp)
-{
- struct rcu_sysidle_head *rshp;
-
- /*
- * The following memory barrier is needed to replace the
- * memory barriers that would normally be in the memory
- * allocator.
- */
- smp_mb(); /* grace period precedes setting inuse. */
-
- rshp = container_of(rhp, struct rcu_sysidle_head, rh);
- WRITE_ONCE(rshp->inuse, 0);
-}
-
-/*
- * Check to see if the system is fully idle, other than the timekeeping CPU.
- * The caller must have disabled interrupts. This is not intended to be
- * called unless tick_nohz_full_enabled().
- */
-bool rcu_sys_is_idle(void)
-{
- static struct rcu_sysidle_head rsh;
- int rss = READ_ONCE(full_sysidle_state);
-
- if (WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu))
- return false;
-
- /* Handle small-system case by doing a full scan of CPUs. */
- if (nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL) {
- int oldrss = rss - 1;
-
- /*
- * One pass to advance to each state up to _FULL.
- * Give up if any pass fails to advance the state.
- */
- while (rss < RCU_SYSIDLE_FULL && oldrss < rss) {
- int cpu;
- bool isidle = true;
- unsigned long maxj = jiffies - ULONG_MAX / 4;
- struct rcu_data *rdp;
-
- /* Scan all the CPUs looking for nonidle CPUs. */
- for_each_possible_cpu(cpu) {
- rdp = per_cpu_ptr(rcu_state_p->rda, cpu);
- rcu_sysidle_check_cpu(rdp, &isidle, &maxj);
- if (!isidle)
- break;
- }
- rcu_sysidle_report(rcu_state_p, isidle, maxj, false);
- oldrss = rss;
- rss = READ_ONCE(full_sysidle_state);
- }
- }
-
- /* If this is the first observation of an idle period, record it. */
- if (rss == RCU_SYSIDLE_FULL) {
- rss = cmpxchg(&full_sysidle_state,
- RCU_SYSIDLE_FULL, RCU_SYSIDLE_FULL_NOTED);
- return rss == RCU_SYSIDLE_FULL;
- }
-
- smp_mb(); /* ensure rss load happens before later caller actions. */
-
- /* If already fully idle, tell the caller (in case of races). */
- if (rss == RCU_SYSIDLE_FULL_NOTED)
- return true;
-
- /*
- * If we aren't there yet, and a grace period is not in flight,
- * initiate a grace period. Either way, tell the caller that
- * we are not there yet. We use an xchg() rather than an assignment
- * to make up for the memory barriers that would otherwise be
- * provided by the memory allocator.
- */
- if (nr_cpu_ids > CONFIG_NO_HZ_FULL_SYSIDLE_SMALL &&
- !rcu_gp_in_progress(rcu_state_p) &&
- !rsh.inuse && xchg(&rsh.inuse, 1) == 0)
- call_rcu(&rsh.rh, rcu_sysidle_cb);
- return false;
-}
-
-/*
- * Initialize dynticks sysidle state for CPUs coming online.
- */
-static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp)
-{
- rdtp->dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE;
-}
-
-#else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
-
-static void rcu_sysidle_enter(int irq)
-{
-}
-
-static void rcu_sysidle_exit(int irq)
-{
-}
-
-static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle,
- unsigned long *maxj)
-{
-}
-
-static bool is_sysidle_rcu_state(struct rcu_state *rsp)
-{
- return false;
-}
-
-static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle,
- unsigned long maxj)
-{
-}
-
-static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp)
-{
-}
-
-#endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
-
/*
* Is this CPU a NO_HZ_FULL CPU that should ignore RCU so that the
* grace-period kthread will do force_quiescent_state() processing?
@@ -2936,13 +2541,7 @@ static void rcu_bind_gp_kthread(void)
if (!tick_nohz_full_enabled())
return;
-#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
- cpu = tick_do_timer_cpu;
- if (cpu >= 0 && cpu < nr_cpu_ids)
- set_cpus_allowed_ptr(current, cpumask_of(cpu));
-#else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
housekeeping_affine(current);
-#endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
}
/* Record the current task on dyntick-idle entry. */
diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c
deleted file mode 100644
index 8751a748499a..000000000000
--- a/kernel/rcu/tree_trace.c
+++ /dev/null
@@ -1,494 +0,0 @@
-/*
- * Read-Copy Update tracing for hierarchical implementation.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, you can access it online at
- * http://www.gnu.org/licenses/gpl-2.0.html.
- *
- * Copyright IBM Corporation, 2008
- * Author: Paul E. McKenney
- *
- * Papers: http://www.rdrop.com/users/paulmck/RCU
- *
- * For detailed explanation of Read-Copy Update mechanism see -
- * Documentation/RCU
- *
- */
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/spinlock.h>
-#include <linux/smp.h>
-#include <linux/rcupdate.h>
-#include <linux/interrupt.h>
-#include <linux/sched.h>
-#include <linux/atomic.h>
-#include <linux/bitops.h>
-#include <linux/completion.h>
-#include <linux/percpu.h>
-#include <linux/notifier.h>
-#include <linux/cpu.h>
-#include <linux/mutex.h>
-#include <linux/debugfs.h>
-#include <linux/seq_file.h>
-
-#define RCU_TREE_NONCORE
-#include "tree.h"
-
-DECLARE_PER_CPU_SHARED_ALIGNED(unsigned long, rcu_qs_ctr);
-
-static int r_open(struct inode *inode, struct file *file,
- const struct seq_operations *op)
-{
- int ret = seq_open(file, op);
- if (!ret) {
- struct seq_file *m = (struct seq_file *)file->private_data;
- m->private = inode->i_private;
- }
- return ret;
-}
-
-static void *r_start(struct seq_file *m, loff_t *pos)
-{
- struct rcu_state *rsp = (struct rcu_state *)m->private;
- *pos = cpumask_next(*pos - 1, cpu_possible_mask);
- if ((*pos) < nr_cpu_ids)
- return per_cpu_ptr(rsp->rda, *pos);
- return NULL;
-}
-
-static void *r_next(struct seq_file *m, void *v, loff_t *pos)
-{
- (*pos)++;
- return r_start(m, pos);
-}
-
-static void r_stop(struct seq_file *m, void *v)
-{
-}
-
-static int show_rcubarrier(struct seq_file *m, void *v)
-{
- struct rcu_state *rsp = (struct rcu_state *)m->private;
- seq_printf(m, "bcc: %d bseq: %lu\n",
- atomic_read(&rsp->barrier_cpu_count),
- rsp->barrier_sequence);
- return 0;
-}
-
-static int rcubarrier_open(struct inode *inode, struct file *file)
-{
- return single_open(file, show_rcubarrier, inode->i_private);
-}
-
-static const struct file_operations rcubarrier_fops = {
- .owner = THIS_MODULE,
- .open = rcubarrier_open,
- .read = seq_read,
- .llseek = no_llseek,
- .release = single_release,
-};
-
-#ifdef CONFIG_RCU_BOOST
-
-static char convert_kthread_status(unsigned int kthread_status)
-{
- if (kthread_status > RCU_KTHREAD_MAX)
- return '?';
- return "SRWOY"[kthread_status];
-}
-
-#endif /* #ifdef CONFIG_RCU_BOOST */
-
-static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
-{
- long ql, qll;
-
- if (!rdp->beenonline)
- return;
- seq_printf(m, "%3d%cc=%ld g=%ld cnq=%d/%d:%d",
- rdp->cpu,
- cpu_is_offline(rdp->cpu) ? '!' : ' ',
- ulong2long(rdp->completed), ulong2long(rdp->gpnum),
- rdp->cpu_no_qs.b.norm,
- rdp->rcu_qs_ctr_snap == per_cpu(rcu_qs_ctr, rdp->cpu),
- rdp->core_needs_qs);
- seq_printf(m, " dt=%d/%llx/%d df=%lu",
- rcu_dynticks_snap(rdp->dynticks),
- rdp->dynticks->dynticks_nesting,
- rdp->dynticks->dynticks_nmi_nesting,
- rdp->dynticks_fqs);
- seq_printf(m, " of=%lu", rdp->offline_fqs);
- rcu_nocb_q_lengths(rdp, &ql, &qll);
- qll += rdp->qlen_lazy;
- ql += rdp->qlen;
- seq_printf(m, " ql=%ld/%ld qs=%c%c%c%c",
- qll, ql,
- ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] !=
- rdp->nxttail[RCU_NEXT_TAIL]],
- ".R"[rdp->nxttail[RCU_WAIT_TAIL] !=
- rdp->nxttail[RCU_NEXT_READY_TAIL]],
- ".W"[rdp->nxttail[RCU_DONE_TAIL] !=
- rdp->nxttail[RCU_WAIT_TAIL]],
- ".D"[&rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]]);
-#ifdef CONFIG_RCU_BOOST
- seq_printf(m, " kt=%d/%c ktl=%x",
- per_cpu(rcu_cpu_has_work, rdp->cpu),
- convert_kthread_status(per_cpu(rcu_cpu_kthread_status,
- rdp->cpu)),
- per_cpu(rcu_cpu_kthread_loops, rdp->cpu) & 0xffff);
-#endif /* #ifdef CONFIG_RCU_BOOST */
- seq_printf(m, " b=%ld", rdp->blimit);
- seq_printf(m, " ci=%lu nci=%lu co=%lu ca=%lu\n",
- rdp->n_cbs_invoked, rdp->n_nocbs_invoked,
- rdp->n_cbs_orphaned, rdp->n_cbs_adopted);
-}
-
-static int show_rcudata(struct seq_file *m, void *v)
-{
- print_one_rcu_data(m, (struct rcu_data *)v);
- return 0;
-}
-
-static const struct seq_operations rcudate_op = {
- .start = r_start,
- .next = r_next,
- .stop = r_stop,
- .show = show_rcudata,
-};
-
-static int rcudata_open(struct inode *inode, struct file *file)
-{
- return r_open(inode, file, &rcudate_op);
-}
-
-static const struct file_operations rcudata_fops = {
- .owner = THIS_MODULE,
- .open = rcudata_open,
- .read = seq_read,
- .llseek = no_llseek,
- .release = seq_release,
-};
-
-static int show_rcuexp(struct seq_file *m, void *v)
-{
- int cpu;
- struct rcu_state *rsp = (struct rcu_state *)m->private;
- struct rcu_data *rdp;
- unsigned long s0 = 0, s1 = 0, s2 = 0, s3 = 0;
-
- for_each_possible_cpu(cpu) {
- rdp = per_cpu_ptr(rsp->rda, cpu);
- s0 += atomic_long_read(&rdp->exp_workdone0);
- s1 += atomic_long_read(&rdp->exp_workdone1);
- s2 += atomic_long_read(&rdp->exp_workdone2);
- s3 += atomic_long_read(&rdp->exp_workdone3);
- }
- seq_printf(m, "s=%lu wd0=%lu wd1=%lu wd2=%lu wd3=%lu enq=%d sc=%lu\n",
- rsp->expedited_sequence, s0, s1, s2, s3,
- atomic_read(&rsp->expedited_need_qs),
- rsp->expedited_sequence / 2);
- return 0;
-}
-
-static int rcuexp_open(struct inode *inode, struct file *file)
-{
- return single_open(file, show_rcuexp, inode->i_private);
-}
-
-static const struct file_operations rcuexp_fops = {
- .owner = THIS_MODULE,
- .open = rcuexp_open,
- .read = seq_read,
- .llseek = no_llseek,
- .release = single_release,
-};
-
-#ifdef CONFIG_RCU_BOOST
-
-static void print_one_rcu_node_boost(struct seq_file *m, struct rcu_node *rnp)
-{
- seq_printf(m, "%d:%d tasks=%c%c%c%c kt=%c ntb=%lu neb=%lu nnb=%lu ",
- rnp->grplo, rnp->grphi,
- "T."[list_empty(&rnp->blkd_tasks)],
- "N."[!rnp->gp_tasks],
- "E."[!rnp->exp_tasks],
- "B."[!rnp->boost_tasks],
- convert_kthread_status(rnp->boost_kthread_status),
- rnp->n_tasks_boosted, rnp->n_exp_boosts,
- rnp->n_normal_boosts);
- seq_printf(m, "j=%04x bt=%04x\n",
- (int)(jiffies & 0xffff),
- (int)(rnp->boost_time & 0xffff));
- seq_printf(m, " balk: nt=%lu egt=%lu bt=%lu nb=%lu ny=%lu nos=%lu\n",
- rnp->n_balk_blkd_tasks,
- rnp->n_balk_exp_gp_tasks,
- rnp->n_balk_boost_tasks,
- rnp->n_balk_notblocked,
- rnp->n_balk_notyet,
- rnp->n_balk_nos);
-}
-
-static int show_rcu_node_boost(struct seq_file *m, void *unused)
-{
- struct rcu_node *rnp;
-
- rcu_for_each_leaf_node(&rcu_preempt_state, rnp)
- print_one_rcu_node_boost(m, rnp);
- return 0;
-}
-
-static int rcu_node_boost_open(struct inode *inode, struct file *file)
-{
- return single_open(file, show_rcu_node_boost, NULL);
-}
-
-static const struct file_operations rcu_node_boost_fops = {
- .owner = THIS_MODULE,
- .open = rcu_node_boost_open,
- .read = seq_read,
- .llseek = no_llseek,
- .release = single_release,
-};
-
-#endif /* #ifdef CONFIG_RCU_BOOST */
-
-static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
-{
- unsigned long gpnum;
- int level = 0;
- struct rcu_node *rnp;
-
- gpnum = rsp->gpnum;
- seq_printf(m, "c=%ld g=%ld s=%d jfq=%ld j=%x ",
- ulong2long(rsp->completed), ulong2long(gpnum),
- rsp->gp_state,
- (long)(rsp->jiffies_force_qs - jiffies),
- (int)(jiffies & 0xffff));
- seq_printf(m, "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n",
- rsp->n_force_qs, rsp->n_force_qs_ngp,
- rsp->n_force_qs - rsp->n_force_qs_ngp,
- READ_ONCE(rsp->n_force_qs_lh), rsp->qlen_lazy, rsp->qlen);
- for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < rcu_num_nodes; rnp++) {
- if (rnp->level != level) {
- seq_puts(m, "\n");
- level = rnp->level;
- }
- seq_printf(m, "%lx/%lx->%lx %c%c>%c %d:%d ^%d ",
- rnp->qsmask, rnp->qsmaskinit, rnp->qsmaskinitnext,
- ".G"[rnp->gp_tasks != NULL],
- ".E"[rnp->exp_tasks != NULL],
- ".T"[!list_empty(&rnp->blkd_tasks)],
- rnp->grplo, rnp->grphi, rnp->grpnum);
- }
- seq_puts(m, "\n");
-}
-
-static int show_rcuhier(struct seq_file *m, void *v)
-{
- struct rcu_state *rsp = (struct rcu_state *)m->private;
- print_one_rcu_state(m, rsp);
- return 0;
-}
-
-static int rcuhier_open(struct inode *inode, struct file *file)
-{
- return single_open(file, show_rcuhier, inode->i_private);
-}
-
-static const struct file_operations rcuhier_fops = {
- .owner = THIS_MODULE,
- .open = rcuhier_open,
- .read = seq_read,
- .llseek = no_llseek,
- .release = single_release,
-};
-
-static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp)
-{
- unsigned long flags;
- unsigned long completed;
- unsigned long gpnum;
- unsigned long gpage;
- unsigned long gpmax;
- struct rcu_node *rnp = &rsp->node[0];
-
- raw_spin_lock_irqsave_rcu_node(rnp, flags);
- completed = READ_ONCE(rsp->completed);
- gpnum = READ_ONCE(rsp->gpnum);
- if (completed == gpnum)
- gpage = 0;
- else
- gpage = jiffies - rsp->gp_start;
- gpmax = rsp->gp_max;
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
- seq_printf(m, "completed=%ld gpnum=%ld age=%ld max=%ld\n",
- ulong2long(completed), ulong2long(gpnum), gpage, gpmax);
-}
-
-static int show_rcugp(struct seq_file *m, void *v)
-{
- struct rcu_state *rsp = (struct rcu_state *)m->private;
- show_one_rcugp(m, rsp);
- return 0;
-}
-
-static int rcugp_open(struct inode *inode, struct file *file)
-{
- return single_open(file, show_rcugp, inode->i_private);
-}
-
-static const struct file_operations rcugp_fops = {
- .owner = THIS_MODULE,
- .open = rcugp_open,
- .read = seq_read,
- .llseek = no_llseek,
- .release = single_release,
-};
-
-static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp)
-{
- if (!rdp->beenonline)
- return;
- seq_printf(m, "%3d%cnp=%ld ",
- rdp->cpu,
- cpu_is_offline(rdp->cpu) ? '!' : ' ',
- rdp->n_rcu_pending);
- seq_printf(m, "qsp=%ld rpq=%ld cbr=%ld cng=%ld ",
- rdp->n_rp_core_needs_qs,
- rdp->n_rp_report_qs,
- rdp->n_rp_cb_ready,
- rdp->n_rp_cpu_needs_gp);
- seq_printf(m, "gpc=%ld gps=%ld nn=%ld ndw%ld\n",
- rdp->n_rp_gp_completed,
- rdp->n_rp_gp_started,
- rdp->n_rp_nocb_defer_wakeup,
- rdp->n_rp_need_nothing);
-}
-
-static int show_rcu_pending(struct seq_file *m, void *v)
-{
- print_one_rcu_pending(m, (struct rcu_data *)v);
- return 0;
-}
-
-static const struct seq_operations rcu_pending_op = {
- .start = r_start,
- .next = r_next,
- .stop = r_stop,
- .show = show_rcu_pending,
-};
-
-static int rcu_pending_open(struct inode *inode, struct file *file)
-{
- return r_open(inode, file, &rcu_pending_op);
-}
-
-static const struct file_operations rcu_pending_fops = {
- .owner = THIS_MODULE,
- .open = rcu_pending_open,
- .read = seq_read,
- .llseek = no_llseek,
- .release = seq_release,
-};
-
-static int show_rcutorture(struct seq_file *m, void *unused)
-{
- seq_printf(m, "rcutorture test sequence: %lu %s\n",
- rcutorture_testseq >> 1,
- (rcutorture_testseq & 0x1) ? "(test in progress)" : "");
- seq_printf(m, "rcutorture update version number: %lu\n",
- rcutorture_vernum);
- return 0;
-}
-
-static int rcutorture_open(struct inode *inode, struct file *file)
-{
- return single_open(file, show_rcutorture, NULL);
-}
-
-static const struct file_operations rcutorture_fops = {
- .owner = THIS_MODULE,
- .open = rcutorture_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
-
-static struct dentry *rcudir;
-
-static int __init rcutree_trace_init(void)
-{
- struct rcu_state *rsp;
- struct dentry *retval;
- struct dentry *rspdir;
-
- rcudir = debugfs_create_dir("rcu", NULL);
- if (!rcudir)
- goto free_out;
-
- for_each_rcu_flavor(rsp) {
- rspdir = debugfs_create_dir(rsp->name, rcudir);
- if (!rspdir)
- goto free_out;
-
- retval = debugfs_create_file("rcudata", 0444,
- rspdir, rsp, &rcudata_fops);
- if (!retval)
- goto free_out;
-
- retval = debugfs_create_file("rcuexp", 0444,
- rspdir, rsp, &rcuexp_fops);
- if (!retval)
- goto free_out;
-
- retval = debugfs_create_file("rcu_pending", 0444,
- rspdir, rsp, &rcu_pending_fops);
- if (!retval)
- goto free_out;
-
- retval = debugfs_create_file("rcubarrier", 0444,
- rspdir, rsp, &rcubarrier_fops);
- if (!retval)
- goto free_out;
-
-#ifdef CONFIG_RCU_BOOST
- if (rsp == &rcu_preempt_state) {
- retval = debugfs_create_file("rcuboost", 0444,
- rspdir, NULL, &rcu_node_boost_fops);
- if (!retval)
- goto free_out;
- }
-#endif
-
- retval = debugfs_create_file("rcugp", 0444,
- rspdir, rsp, &rcugp_fops);
- if (!retval)
- goto free_out;
-
- retval = debugfs_create_file("rcuhier", 0444,
- rspdir, rsp, &rcuhier_fops);
- if (!retval)
- goto free_out;
- }
-
- retval = debugfs_create_file("rcutorture", 0444, rcudir,
- NULL, &rcutorture_fops);
- if (!retval)
- goto free_out;
- return 0;
-free_out:
- debugfs_remove_recursive(rcudir);
- return 1;
-}
-device_initcall(rcutree_trace_init);
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 55c8530316c7..00e77c470017 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -62,7 +62,9 @@
#define MODULE_PARAM_PREFIX "rcupdate."
#ifndef CONFIG_TINY_RCU
+extern int rcu_expedited; /* from sysctl */
module_param(rcu_expedited, int, 0);
+extern int rcu_normal; /* from sysctl */
module_param(rcu_normal, int, 0);
static int rcu_normal_after_boot;
module_param(rcu_normal_after_boot, int, 0);
@@ -124,7 +126,7 @@ EXPORT_SYMBOL(rcu_read_lock_sched_held);
* non-expedited counterparts? Intended for use within RCU. Note
* that if the user specifies both rcu_expedited and rcu_normal, then
* rcu_normal wins. (Except during the time period during boot from
- * when the first task is spawned until the rcu_exp_runtime_mode()
+ * when the first task is spawned until the rcu_set_runtime_mode()
* core_initcall() is invoked, at which point everything is expedited.)
*/
bool rcu_gp_is_normal(void)
@@ -190,6 +192,39 @@ void rcu_end_inkernel_boot(void)
#endif /* #ifndef CONFIG_TINY_RCU */
+/*
+ * Test each non-SRCU synchronous grace-period wait API. This is
+ * useful just after a change in mode for these primitives, and
+ * during early boot.
+ */
+void rcu_test_sync_prims(void)
+{
+ if (!IS_ENABLED(CONFIG_PROVE_RCU))
+ return;
+ synchronize_rcu();
+ synchronize_rcu_bh();
+ synchronize_sched();
+ synchronize_rcu_expedited();
+ synchronize_rcu_bh_expedited();
+ synchronize_sched_expedited();
+}
+
+#if !defined(CONFIG_TINY_RCU) || defined(CONFIG_SRCU)
+
+/*
+ * Switch to run-time mode once RCU has fully initialized.
+ */
+static int __init rcu_set_runtime_mode(void)
+{
+ rcu_test_sync_prims();
+ rcu_scheduler_active = RCU_SCHEDULER_RUNNING;
+ rcu_test_sync_prims();
+ return 0;
+}
+core_initcall(rcu_set_runtime_mode);
+
+#endif /* #if !defined(CONFIG_TINY_RCU) || defined(CONFIG_SRCU) */
+
#ifdef CONFIG_PREEMPT_RCU
/*
@@ -346,6 +381,7 @@ void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array,
struct rcu_synchronize *rs_array)
{
int i;
+ int j;
/* Initialize and register callbacks for each flavor specified. */
for (i = 0; i < n; i++) {
@@ -357,7 +393,11 @@ void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array,
}
init_rcu_head_on_stack(&rs_array[i].head);
init_completion(&rs_array[i].completion);
- (crcu_array[i])(&rs_array[i].head, wakeme_after_rcu);
+ for (j = 0; j < i; j++)
+ if (crcu_array[j] == crcu_array[i])
+ break;
+ if (j == i)
+ (crcu_array[i])(&rs_array[i].head, wakeme_after_rcu);
}
/* Wait for all callbacks to be invoked. */
@@ -366,7 +406,11 @@ void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array,
(crcu_array[i] == call_rcu ||
crcu_array[i] == call_rcu_bh))
continue;
- wait_for_completion(&rs_array[i].completion);
+ for (j = 0; j < i; j++)
+ if (crcu_array[j] == crcu_array[i])
+ break;
+ if (j == i)
+ wait_for_completion(&rs_array[i].completion);
destroy_rcu_head_on_stack(&rs_array[i].head);
}
}
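For reference, the duplicate-skipping idiom added by the two hunks above can be read in isolation: an array element acts only if no earlier element carries the same function pointer, so passing the same grace-period flavor twice posts (and waits for) only one callback. The stand-alone user-space sketch below uses made-up names and is not kernel code.

#include <stdio.h>

typedef void (*gp_func_t)(void);

static void flavor_a(void) { puts("flavor_a"); }
static void flavor_b(void) { puts("flavor_b"); }

/* Invoke each distinct function once; the first occurrence wins. */
static void invoke_unique(gp_func_t *array, int n)
{
        int i, j;

        for (i = 0; i < n; i++) {
                for (j = 0; j < i; j++)
                        if (array[j] == array[i])
                                break;
                if (j == i)     /* No earlier duplicate found. */
                        array[i]();
        }
}

int main(void)
{
        gp_func_t array[] = { flavor_a, flavor_b, flavor_a };

        invoke_unique(array, 3);        /* Prints each flavor exactly once. */
        return 0;
}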
@@ -527,15 +571,30 @@ static DEFINE_RAW_SPINLOCK(rcu_tasks_cbs_lock);
DEFINE_SRCU(tasks_rcu_exit_srcu);
/* Control stall timeouts. Disable with <= 0, otherwise jiffies till stall. */
-static int rcu_task_stall_timeout __read_mostly = HZ * 60 * 10;
+#define RCU_TASK_STALL_TIMEOUT (HZ * 60 * 10)
+static int rcu_task_stall_timeout __read_mostly = RCU_TASK_STALL_TIMEOUT;
module_param(rcu_task_stall_timeout, int, 0644);
static void rcu_spawn_tasks_kthread(void);
static struct task_struct *rcu_tasks_kthread_ptr;
-/*
- * Post an RCU-tasks callback. First call must be from process context
- * after the scheduler if fully operational.
+/**
+ * call_rcu_tasks() - Queue an RCU callback for a task-based grace period
+ * @rhp: structure to be used for queueing the RCU updates.
+ * @func: actual callback function to be invoked after the grace period
+ *
+ * The callback function will be invoked some time after a full grace
+ * period elapses, in other words after all currently executing RCU
+ * read-side critical sections have completed. call_rcu_tasks() assumes
+ * that the read-side critical sections end at a voluntary context
+ * switch (not a preemption!), entry into idle, or transition to usermode
+ * execution. As such, there are no read-side primitives analogous to
+ * rcu_read_lock() and rcu_read_unlock() because this primitive is intended
+ * to determine that all tasks have passed through a safe state, not so
+ * much for data-structure synchronization.
+ *
+ * See the description of call_rcu() for more detailed information on
+ * memory ordering guarantees.
*/
void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func)
{
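As a usage illustration only (not part of this patch), a subsystem could embed a struct rcu_head in its own structure and have the Tasks-RCU callback free it once every task has passed through a voluntary context switch, idle, or usermode execution. The struct foo and its helpers below are hypothetical, and the sketch assumes <linux/slab.h> and <linux/rcupdate.h>.

struct foo {
        struct rcu_head rh;
        int data;
};

static void foo_free_cb(struct rcu_head *rhp)
{
        struct foo *fp = container_of(rhp, struct foo, rh);

        kfree(fp);
}

static void foo_retire(struct foo *fp)
{
        /* Defer freeing until a task-based grace period has elapsed. */
        call_rcu_tasks(&fp->rh, foo_free_cb);
}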
@@ -632,6 +691,7 @@ static void check_holdout_task(struct task_struct *t,
put_task_struct(t);
return;
}
+ rcu_request_urgent_qs_task(t);
if (!needreport)
return;
if (*firstreport) {
@@ -817,23 +877,23 @@ static void rcu_spawn_tasks_kthread(void)
#endif /* #ifdef CONFIG_TASKS_RCU */
+#ifndef CONFIG_TINY_RCU
+
/*
- * Test each non-SRCU synchronous grace-period wait API. This is
- * useful just after a change in mode for these primitives, and
- * during early boot.
+ * Print any non-default Tasks RCU settings.
*/
-void rcu_test_sync_prims(void)
+static void __init rcu_tasks_bootup_oddness(void)
{
- if (!IS_ENABLED(CONFIG_PROVE_RCU))
- return;
- synchronize_rcu();
- synchronize_rcu_bh();
- synchronize_sched();
- synchronize_rcu_expedited();
- synchronize_rcu_bh_expedited();
- synchronize_sched_expedited();
+#ifdef CONFIG_TASKS_RCU
+ if (rcu_task_stall_timeout != RCU_TASK_STALL_TIMEOUT)
+ pr_info("\tTasks-RCU CPU stall warnings timeout set to %d (rcu_task_stall_timeout).\n", rcu_task_stall_timeout);
+ else
+ pr_info("\tTasks RCU enabled.\n");
+#endif /* #ifdef CONFIG_TASKS_RCU */
}
+#endif /* #ifndef CONFIG_TINY_RCU */
+
#ifdef CONFIG_PROVE_RCU
/*
@@ -918,3 +978,25 @@ late_initcall(rcu_verify_early_boot_tests);
#else
void rcu_early_boot_tests(void) {}
#endif /* CONFIG_PROVE_RCU */
+
+#ifndef CONFIG_TINY_RCU
+
+/*
+ * Print any significant non-default boot-time settings.
+ */
+void __init rcupdate_announce_bootup_oddness(void)
+{
+ if (rcu_normal)
+ pr_info("\tNo expedited grace period (rcu_normal).\n");
+ else if (rcu_normal_after_boot)
+ pr_info("\tNo expedited grace period (rcu_normal_after_boot).\n");
+ else if (rcu_expedited)
+ pr_info("\tAll grace periods are expedited (rcu_expedited).\n");
+ if (rcu_cpu_stall_suppress)
+ pr_info("\tRCU CPU stall warnings suppressed (rcu_cpu_stall_suppress).\n");
+ if (rcu_cpu_stall_timeout != CONFIG_RCU_CPU_STALL_TIMEOUT)
+ pr_info("\tRCU CPU stall warnings timeout set to %d (rcu_cpu_stall_timeout).\n", rcu_cpu_stall_timeout);
+ rcu_tasks_bootup_oddness();
+}
+
+#endif /* #ifndef CONFIG_TINY_RCU */