diff options
author | Boqun Feng <boqun.feng@gmail.com> | 2018-03-09 09:14:51 +0800 |
---|---|---|
committer | Paul E. McKenney <paulmck@linux.vnet.ibm.com> | 2018-05-15 10:26:07 -0700 |
commit | 55ebfce0605a4dce35467dfb1dc1b3ad26fe9f86 (patch) | |
tree | 18e8badda41172f8c3527929fcc8e0e776a3c22b | |
parent | 7be8c56f8f8a58af92f8791c5a09d48e342d7101 (diff) | |
download | talos-obmc-linux-55ebfce0605a4dce35467dfb1dc1b3ad26fe9f86.tar.gz talos-obmc-linux-55ebfce0605a4dce35467dfb1dc1b3ad26fe9f86.zip |
rcu: exp: Protect all sync_rcu_preempt_exp_done() with rcu_node lock
Currently some callsites of sync_rcu_preempt_exp_done() are not called
with the corresponding rcu_node's ->lock held, which could introduces
bugs as per Paul:
o CPU 0 in sync_rcu_preempt_exp_done() reads ->exp_tasks and
sees that it is NULL.
o CPU 1 blocks within an RCU read-side critical section, so
it enqueues the task and points ->exp_tasks at it and
clears CPU 1's bit in ->expmask.
o All other CPUs clear their bits in ->expmask.
o CPU 0 reads ->expmask, sees that it is zero, so incorrectly
concludes that all quiescent states have completed, despite
the fact that ->exp_tasks is non-NULL.
To fix this, sync_rcu_preempt_exp_unlocked() is introduced to replace
lockless callsites of sync_rcu_preempt_exp_done().
Further, a lockdep annotation is added into sync_rcu_preempt_exp_done()
to prevent mis-use in the future.
Signed-off-by: Boqun Feng <boqun.feng@gmail.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Tested-by: Nicholas Piggin <npiggin@gmail.com>
-rw-r--r-- | kernel/rcu/tree_exp.h | 28 |
1 files changed, 25 insertions, 3 deletions
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index d7622cb85aa7..755d0f461f26 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -20,6 +20,8 @@ * Authors: Paul E. McKenney <paulmck@linux.vnet.ibm.com> */ +#include <linux/lockdep.h> + /* * Record the start of an expedited grace period. */ @@ -158,11 +160,31 @@ static void __maybe_unused sync_exp_reset_tree(struct rcu_state *rsp) */ static bool sync_rcu_preempt_exp_done(struct rcu_node *rnp) { + raw_lockdep_assert_held_rcu_node(rnp); + return rnp->exp_tasks == NULL && READ_ONCE(rnp->expmask) == 0; } /* + * Like sync_rcu_preempt_exp_done(), but this function assumes the caller + * doesn't hold the rcu_node's ->lock, and will acquire and release the lock + * itself + */ +static bool sync_rcu_preempt_exp_done_unlocked(struct rcu_node *rnp) +{ + unsigned long flags; + bool ret; + + raw_spin_lock_irqsave_rcu_node(rnp, flags); + ret = sync_rcu_preempt_exp_done(rnp); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + + return ret; +} + + +/* * Report the exit from RCU read-side critical section for the last task * that queued itself during or before the current expedited preemptible-RCU * grace period. This event is reported either to the rcu_node structure on @@ -501,9 +523,9 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) for (;;) { ret = swait_event_timeout( rsp->expedited_wq, - sync_rcu_preempt_exp_done(rnp_root), + sync_rcu_preempt_exp_done_unlocked(rnp_root), jiffies_stall); - if (ret > 0 || sync_rcu_preempt_exp_done(rnp_root)) + if (ret > 0 || sync_rcu_preempt_exp_done_unlocked(rnp_root)) return; WARN_ON(ret < 0); /* workqueues should not be signaled. */ if (rcu_cpu_stall_suppress) @@ -536,7 +558,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) rcu_for_each_node_breadth_first(rsp, rnp) { if (rnp == rnp_root) continue; /* printed unconditionally */ - if (sync_rcu_preempt_exp_done(rnp)) + if (sync_rcu_preempt_exp_done_unlocked(rnp)) continue; pr_cont(" l=%u:%d-%d:%#lx/%c", rnp->level, rnp->grplo, rnp->grphi, |