From 48d07c04b4cc1dc1221965312f58fd84926212fe Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 20 Mar 2019 22:13:33 +0100 Subject: [PATCH 01/61] rcu: Enable elimination of Tree-RCU softirq processing Some workloads need to change kthread priority for RCU core processing without affecting other softirq work. This commit therefore introduces the rcutree.use_softirq kernel boot parameter, which moves the RCU core work from softirq to a per-CPU SCHED_OTHER kthread named rcuc. Use of SCHED_OTHER approach avoids the scalability problems that appeared with the earlier attempt to move RCU core processing to from softirq to kthreads. That said, kernels built with RCU_BOOST=y will run the rcuc kthreads at the RCU-boosting priority. Note that rcutree.use_softirq=0 must be specified to move RCU core processing to the rcuc kthreads: rcutree.use_softirq=1 is the default. Reported-by: Thomas Gleixner Tested-by: Mike Galbraith Signed-off-by: Sebastian Andrzej Siewior [ paulmck: Adjust for invoke_rcu_callbacks() only ever being invoked from RCU core processing, in contrast to softirq->rcuc transition in old mainline RCU priority boosting. ] [ paulmck: Avoid wakeups when scheduler might have invoked rcu_read_unlock() while holding rq or pi locks, also possibly fixing a pre-existing latent bug involving raise_softirq()-induced wakeups. ] Signed-off-by: Paul E. McKenney --- .../admin-guide/kernel-parameters.txt | 6 + kernel/rcu/tree.c | 138 ++++++++++++++++-- kernel/rcu/tree.h | 2 +- kernel/rcu/tree_plugin.h | 136 ++--------------- 4 files changed, 147 insertions(+), 135 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 138f6664b2e2..b96fd15c7316 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -3752,6 +3752,12 @@ the propagation of recent CPU-hotplug changes up the rcu_node combining tree. + rcutree.use_softirq= [KNL] + If set to zero, move all RCU_SOFTIRQ processing to + per-CPU rcuc kthreads. Defaults to a non-zero + value, meaning that RCU_SOFTIRQ is used by default. + Specify rcutree.use_softirq=0 to use rcuc kthreads. + rcutree.rcu_fanout_exact= [KNL] Disable autobalancing of the rcu_node combining tree. This is used by rcutorture, and might diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 980ca3ca643f..8e290163505a 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -51,6 +51,12 @@ #include #include #include +#include +#include +#include +#include +#include +#include "../time/tick-internal.h" #include "tree.h" #include "rcu.h" @@ -92,6 +98,9 @@ struct rcu_state rcu_state = { /* Dump rcu_node combining tree at boot to verify correct setup. */ static bool dump_tree; module_param(dump_tree, bool, 0444); +/* By default, use RCU_SOFTIRQ instead of rcuc kthreads. */ +static bool use_softirq = 1; +module_param(use_softirq, bool, 0444); /* Control rcu_node-tree auto-balancing at boot time. */ static bool rcu_fanout_exact; module_param(rcu_fanout_exact, bool, 0444); @@ -2253,7 +2262,7 @@ void rcu_force_quiescent_state(void) EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); /* Perform RCU core processing work for the current CPU. 
*/ -static __latent_entropy void rcu_core(struct softirq_action *unused) +static __latent_entropy void rcu_core(void) { unsigned long flags; struct rcu_data *rdp = raw_cpu_ptr(&rcu_data); @@ -2295,30 +2304,132 @@ static __latent_entropy void rcu_core(struct softirq_action *unused) trace_rcu_utilization(TPS("End RCU core")); } +static void rcu_core_si(struct softirq_action *h) +{ + rcu_core(); +} + +static void rcu_wake_cond(struct task_struct *t, int status) +{ + /* + * If the thread is yielding, only wake it when this + * is invoked from idle + */ + if (t && (status != RCU_KTHREAD_YIELDING || is_idle_task(current))) + wake_up_process(t); +} + +static void invoke_rcu_core_kthread(void) +{ + struct task_struct *t; + unsigned long flags; + + local_irq_save(flags); + __this_cpu_write(rcu_data.rcu_cpu_has_work, 1); + t = __this_cpu_read(rcu_data.rcu_cpu_kthread_task); + if (t != NULL && t != current) + rcu_wake_cond(t, __this_cpu_read(rcu_data.rcu_cpu_kthread_status)); + local_irq_restore(flags); +} + /* - * Schedule RCU callback invocation. If the running implementation of RCU - * does not support RCU priority boosting, just do a direct call, otherwise - * wake up the per-CPU kernel kthread. Note that because we are running - * on the current CPU with softirqs disabled, the rcu_cpu_kthread_task - * cannot disappear out from under us. + * Do RCU callback invocation. Not that if we are running !use_softirq, + * we are already in the rcuc kthread. If callbacks are offloaded, then + * ->cblist is always empty, so we don't get here. Therefore, we only + * ever need to check for the scheduler being operational (some callbacks + * do wakeups, so we do need the scheduler). */ static void invoke_rcu_callbacks(struct rcu_data *rdp) { if (unlikely(!READ_ONCE(rcu_scheduler_fully_active))) return; - if (likely(!rcu_state.boost)) { - rcu_do_batch(rdp); - return; - } - invoke_rcu_callbacks_kthread(); + rcu_do_batch(rdp); } +/* + * Wake up this CPU's rcuc kthread to do RCU core processing. + */ static void invoke_rcu_core(void) { - if (cpu_online(smp_processor_id())) + if (!cpu_online(smp_processor_id())) + return; + if (use_softirq) raise_softirq(RCU_SOFTIRQ); + else + invoke_rcu_core_kthread(); } +static void rcu_cpu_kthread_park(unsigned int cpu) +{ + per_cpu(rcu_data.rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU; +} + +static int rcu_cpu_kthread_should_run(unsigned int cpu) +{ + return __this_cpu_read(rcu_data.rcu_cpu_has_work); +} + +/* + * Per-CPU kernel thread that invokes RCU callbacks. This replaces + * the RCU softirq used in configurations of RCU that do not support RCU + * priority boosting. 
+ */ +static void rcu_cpu_kthread(unsigned int cpu) +{ + unsigned int *statusp = this_cpu_ptr(&rcu_data.rcu_cpu_kthread_status); + char work, *workp = this_cpu_ptr(&rcu_data.rcu_cpu_has_work); + int spincnt; + + for (spincnt = 0; spincnt < 10; spincnt++) { + trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait")); + local_bh_disable(); + *statusp = RCU_KTHREAD_RUNNING; + local_irq_disable(); + work = *workp; + *workp = 0; + local_irq_enable(); + if (work) + rcu_core(); + local_bh_enable(); + if (*workp == 0) { + trace_rcu_utilization(TPS("End CPU kthread@rcu_wait")); + *statusp = RCU_KTHREAD_WAITING; + return; + } + } + *statusp = RCU_KTHREAD_YIELDING; + trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield")); + schedule_timeout_interruptible(2); + trace_rcu_utilization(TPS("End CPU kthread@rcu_yield")); + *statusp = RCU_KTHREAD_WAITING; +} + +static struct smp_hotplug_thread rcu_cpu_thread_spec = { + .store = &rcu_data.rcu_cpu_kthread_task, + .thread_should_run = rcu_cpu_kthread_should_run, + .thread_fn = rcu_cpu_kthread, + .thread_comm = "rcuc/%u", + .setup = rcu_cpu_kthread_setup, + .park = rcu_cpu_kthread_park, +}; + +/* + * Spawn per-CPU RCU core processing kthreads. + */ +static int __init rcu_spawn_core_kthreads(void) +{ + int cpu; + + for_each_possible_cpu(cpu) + per_cpu(rcu_data.rcu_cpu_has_work, cpu) = 0; + if (!IS_ENABLED(CONFIG_RCU_BOOST) && use_softirq) + return 0; + WARN_ONCE(smpboot_register_percpu_thread(&rcu_cpu_thread_spec), + "%s: Could not start rcuc kthread, OOM is now expected behavior\n", __func__); + return 0; +} +early_initcall(rcu_spawn_core_kthreads); + /* * Handle any core-RCU processing required by a call_rcu() invocation. */ @@ -3355,7 +3466,8 @@ void __init rcu_init(void) rcu_init_one(); if (dump_tree) rcu_dump_rcu_node_tree(); - open_softirq(RCU_SOFTIRQ, rcu_core); + if (use_softirq) + open_softirq(RCU_SOFTIRQ, rcu_core_si); /* * We don't need protection against CPU-hotplug here because diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index e253d11af3c4..a1a72a1ecb02 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -407,8 +407,8 @@ void call_rcu(struct rcu_head *head, rcu_callback_t func); static void dump_blkd_tasks(struct rcu_node *rnp, int ncheck); static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); -static void invoke_rcu_callbacks_kthread(void); static bool rcu_is_callbacks_kthread(void); +static void rcu_cpu_kthread_setup(unsigned int cpu); static void __init rcu_spawn_boost_kthreads(void); static void rcu_prepare_kthreads(int cpu); static void rcu_cleanup_after_idle(void); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 1102765f91fd..21611862e083 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -11,29 +11,7 @@ * Paul E. McKenney */ -#include -#include -#include -#include -#include -#include -#include -#include "../time/tick-internal.h" - -#ifdef CONFIG_RCU_BOOST #include "../locking/rtmutex_common.h" -#else /* #ifdef CONFIG_RCU_BOOST */ - -/* - * Some architectures do not define rt_mutexes, but if !CONFIG_RCU_BOOST, - * all uses are in dead code. Provide a definition to keep the compiler - * happy, but add WARN_ON_ONCE() to complain if used in the wrong place. - * This probably needs to be excluded from -rt builds. 
- */ -#define rt_mutex_owner(a) ({ WARN_ON_ONCE(1); NULL; }) -#define rt_mutex_futex_unlock(x) WARN_ON_ONCE(1) - -#endif /* #else #ifdef CONFIG_RCU_BOOST */ #ifdef CONFIG_RCU_NOCB_CPU static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */ @@ -94,6 +72,8 @@ static void __init rcu_bootup_announce_oddness(void) pr_info("\tRCU debug GP init slowdown %d jiffies.\n", gp_init_delay); if (gp_cleanup_delay) pr_info("\tRCU debug GP init slowdown %d jiffies.\n", gp_cleanup_delay); + if (!use_softirq) + pr_info("\tRCU_SOFTIRQ processing moved to rcuc kthreads.\n"); if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG)) pr_info("\tRCU debug extended QS entry/exit.\n"); rcupdate_announce_bootup_oddness(); @@ -627,7 +607,7 @@ static void rcu_read_unlock_special(struct task_struct *t) if (preempt_bh_were_disabled || irqs_were_disabled) { WRITE_ONCE(t->rcu_read_unlock_special.b.exp_hint, false); /* Need to defer quiescent state until everything is enabled. */ - if (irqs_were_disabled) { + if (irqs_were_disabled && use_softirq) { /* Enabling irqs does not reschedule, so... */ raise_softirq_irqoff(RCU_SOFTIRQ); } else { @@ -944,18 +924,21 @@ dump_blkd_tasks(struct rcu_node *rnp, int ncheck) #endif /* #else #ifdef CONFIG_PREEMPT_RCU */ -#ifdef CONFIG_RCU_BOOST - -static void rcu_wake_cond(struct task_struct *t, int status) +/* + * If boosting, set rcuc kthreads to realtime priority. + */ +static void rcu_cpu_kthread_setup(unsigned int cpu) { - /* - * If the thread is yielding, only wake it when this - * is invoked from idle - */ - if (status != RCU_KTHREAD_YIELDING || is_idle_task(current)) - wake_up_process(t); +#ifdef CONFIG_RCU_BOOST + struct sched_param sp; + + sp.sched_priority = kthread_prio; + sched_setscheduler_nocheck(current, SCHED_FIFO, &sp); +#endif /* #ifdef CONFIG_RCU_BOOST */ } +#ifdef CONFIG_RCU_BOOST + /* * Carry out RCU priority boosting on the task indicated by ->exp_tasks * or ->boost_tasks, advancing the pointer to the next task in the @@ -1090,23 +1073,6 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) } } -/* - * Wake up the per-CPU kthread to invoke RCU callbacks. - */ -static void invoke_rcu_callbacks_kthread(void) -{ - unsigned long flags; - - local_irq_save(flags); - __this_cpu_write(rcu_data.rcu_cpu_has_work, 1); - if (__this_cpu_read(rcu_data.rcu_cpu_kthread_task) != NULL && - current != __this_cpu_read(rcu_data.rcu_cpu_kthread_task)) { - rcu_wake_cond(__this_cpu_read(rcu_data.rcu_cpu_kthread_task), - __this_cpu_read(rcu_data.rcu_cpu_kthread_status)); - } - local_irq_restore(flags); -} - /* * Is the current CPU running the RCU-callbacks kthread? * Caller must have preemption disabled. @@ -1160,59 +1126,6 @@ static int rcu_spawn_one_boost_kthread(struct rcu_node *rnp) return 0; } -static void rcu_cpu_kthread_setup(unsigned int cpu) -{ - struct sched_param sp; - - sp.sched_priority = kthread_prio; - sched_setscheduler_nocheck(current, SCHED_FIFO, &sp); -} - -static void rcu_cpu_kthread_park(unsigned int cpu) -{ - per_cpu(rcu_data.rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU; -} - -static int rcu_cpu_kthread_should_run(unsigned int cpu) -{ - return __this_cpu_read(rcu_data.rcu_cpu_has_work); -} - -/* - * Per-CPU kernel thread that invokes RCU callbacks. This replaces - * the RCU softirq used in configurations of RCU that do not support RCU - * priority boosting. 
- */ -static void rcu_cpu_kthread(unsigned int cpu) -{ - unsigned int *statusp = this_cpu_ptr(&rcu_data.rcu_cpu_kthread_status); - char work, *workp = this_cpu_ptr(&rcu_data.rcu_cpu_has_work); - int spincnt; - - for (spincnt = 0; spincnt < 10; spincnt++) { - trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait")); - local_bh_disable(); - *statusp = RCU_KTHREAD_RUNNING; - local_irq_disable(); - work = *workp; - *workp = 0; - local_irq_enable(); - if (work) - rcu_do_batch(this_cpu_ptr(&rcu_data)); - local_bh_enable(); - if (*workp == 0) { - trace_rcu_utilization(TPS("End CPU kthread@rcu_wait")); - *statusp = RCU_KTHREAD_WAITING; - return; - } - } - *statusp = RCU_KTHREAD_YIELDING; - trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield")); - schedule_timeout_interruptible(2); - trace_rcu_utilization(TPS("End CPU kthread@rcu_yield")); - *statusp = RCU_KTHREAD_WAITING; -} - /* * Set the per-rcu_node kthread's affinity to cover all CPUs that are * served by the rcu_node in question. The CPU hotplug lock is still @@ -1243,27 +1156,13 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) free_cpumask_var(cm); } -static struct smp_hotplug_thread rcu_cpu_thread_spec = { - .store = &rcu_data.rcu_cpu_kthread_task, - .thread_should_run = rcu_cpu_kthread_should_run, - .thread_fn = rcu_cpu_kthread, - .thread_comm = "rcuc/%u", - .setup = rcu_cpu_kthread_setup, - .park = rcu_cpu_kthread_park, -}; - /* * Spawn boost kthreads -- called as soon as the scheduler is running. */ static void __init rcu_spawn_boost_kthreads(void) { struct rcu_node *rnp; - int cpu; - for_each_possible_cpu(cpu) - per_cpu(rcu_data.rcu_cpu_has_work, cpu) = 0; - if (WARN_ONCE(smpboot_register_percpu_thread(&rcu_cpu_thread_spec), "%s: Could not start rcub kthread, OOM is now expected behavior\n", __func__)) - return; rcu_for_each_leaf_node(rnp) (void)rcu_spawn_one_boost_kthread(rnp); } @@ -1286,11 +1185,6 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } -static void invoke_rcu_callbacks_kthread(void) -{ - WARN_ON_ONCE(1); -} - static bool rcu_is_callbacks_kthread(void) { return false; From 23634ebc1d946f19eb112d4455c1d84948875e31 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 24 Mar 2019 15:25:51 -0700 Subject: [PATCH 02/61] rcu: Check for wakeup-safe conditions in rcu_read_unlock_special() When RCU core processing is offloaded from RCU_SOFTIRQ to the rcuc kthreads, a full and unconditional wakeup is required to initiate RCU core processing. In contrast, when RCU core processing is carried out by RCU_SOFTIRQ, a raise_softirq() suffices. Of course, there are situations where raise_softirq() does a full wakeup, but these do not occur with normal usage of rcu_read_unlock(). The reason that full wakeups can be problematic is that the scheduler sometimes invokes rcu_read_unlock() with its pi or rq locks held, which can of course result in deadlock in CONFIG_PREEMPT=y kernels when rcu_read_unlock() invokes the scheduler. Scheduler invocations can happen in the following situations: (1) The just-ended reader has been subjected to RCU priority boosting, in which case rcu_read_unlock() must deboost, (2) Interrupts were disabled across the call to rcu_read_unlock(), so the quiescent state must be deferred, requiring a wakeup of the rcuc kthread corresponding to the current CPU. 
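To illustrate the hazard concretely, here is a rough sketch of the problematic
nesting (not a literal call chain from an actual splat, and assuming
rcutree.use_softirq=0 so that deferring the quiescent state means waking
this CPU's rcuc kthread):

	raw_spin_lock(&rq->lock);	/* scheduler holds an rq or pi lock */
	rcu_read_lock();
	...
	rcu_read_unlock();		/* quiescent state must be deferred */
	  rcu_read_unlock_special();
	    wake_up_process(rcuc);	/* full wakeup of this CPU's rcuc kthread */
	      try_to_wake_up();		/* acquires pi/rq locks: self-deadlock */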
Now, the scheduler may hold one of its locks across rcu_read_unlock() only if preemption has been disabled across the entire RCU read-side critical section, which in the days prior to RCU flavor consolidation meant that rcu_read_unlock() never needed to do wakeups. However, this is no longer the case for any but the first rcu_read_unlock() following a condition (e.g., preempted RCU reader) requiring special rcu_read_unlock() attention. For example, an RCU read-side critical section might be preempted, but preemption might be disabled across the rcu_read_unlock(). The rcu_read_unlock() must defer the quiescent state, and therefore leaves the task queued on its leaf rcu_node structure. If a scheduler interrupt occurs, the scheduler might well invoke rcu_read_unlock() with one of its locks held. However, the preempted task is still queued, so rcu_read_unlock() will attempt to defer the quiescent state once more. When RCU core processing is carried out by RCU_SOFTIRQ, this works just fine: The raise_softirq() function simply sets a bit in a per-CPU mask and the RCU core processing will be undertaken upon return from interrupt. Not so when RCU core processing is carried out by the rcuc kthread: In this case, the required wakeup can result in deadlock. The initial solution to this problem was to use set_tsk_need_resched() and set_preempt_need_resched() to force a future context switch, which allows rcu_preempt_note_context_switch() to report the deferred quiescent state to RCU's core processing. Unfortunately for expedited grace periods, there can be a significant delay between the call for a context switch and the actual context switch. This commit therefore introduces a ->deferred_qs flag to the task_struct structure's rcu_special structure. This flag is initially false, and is set to true by the first call to rcu_read_unlock() requiring special attention, then finally reset back to false when the quiescent state is finally reported. Then rcu_read_unlock() attempts full wakeups only when ->deferred_qs is false, that is, on the first rcu_read_unlock() requiring special attention. Note that a chain of RCU readers linked by some other sort of reader may find that a later rcu_read_unlock() is once again able to do a full wakeup, courtesy of an intervening preemption: rcu_read_lock(); /* preempted */ local_irq_disable(); rcu_read_unlock(); /* Can do full wakeup, sets ->deferred_qs. */ rcu_read_lock(); local_irq_enable(); preempt_disable() rcu_read_unlock(); /* Cannot do full wakeup, ->deferred_qs set. */ rcu_read_lock(); preempt_enable(); /* preempted, >deferred_qs reset. */ local_irq_disable(); rcu_read_unlock(); /* Can again do full wakeup, sets ->deferred_qs. */ Such linked RCU readers do not yet seem to appear in the Linux kernel, and it is probably best if they don't. However, RCU needs to handle them, and some variations on this theme could make even raise_softirq() unsafe due to the possibility of its doing a full wakeup. This commit therefore also avoids invoking raise_softirq() when the ->deferred_qs set flag is set. Signed-off-by: Paul E. McKenney Cc: Sebastian Andrzej Siewior --- include/linux/sched.h | 2 +- kernel/rcu/tree_plugin.h | 19 ++++++++++++++----- 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 11837410690f..942a44c1b8eb 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -565,7 +565,7 @@ union rcu_special { u8 blocked; u8 need_qs; u8 exp_hint; /* Hint for performance. 
*/ - u8 pad; /* No garbage from compiler! */ + u8 deferred_qs; } b; /* Bits. */ u32 s; /* Set of bits. */ }; diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 21611862e083..75110ea75d01 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -455,6 +455,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) local_irq_restore(flags); return; } + t->rcu_read_unlock_special.b.deferred_qs = false; if (special.b.need_qs) { rcu_qs(); t->rcu_read_unlock_special.b.need_qs = false; @@ -605,16 +606,24 @@ static void rcu_read_unlock_special(struct task_struct *t) local_irq_save(flags); irqs_were_disabled = irqs_disabled_flags(flags); if (preempt_bh_were_disabled || irqs_were_disabled) { - WRITE_ONCE(t->rcu_read_unlock_special.b.exp_hint, false); - /* Need to defer quiescent state until everything is enabled. */ - if (irqs_were_disabled && use_softirq) { - /* Enabling irqs does not reschedule, so... */ + t->rcu_read_unlock_special.b.exp_hint = false; + // Need to defer quiescent state until everything is enabled. + if (irqs_were_disabled && use_softirq && + (in_irq() || !t->rcu_read_unlock_special.b.deferred_qs)) { + // Using softirq, safe to awaken, and we get + // no help from enabling irqs, unlike bh/preempt. raise_softirq_irqoff(RCU_SOFTIRQ); + } else if (irqs_were_disabled && !use_softirq && + !t->rcu_read_unlock_special.b.deferred_qs) { + // Safe to awaken and we get no help from enabling + // irqs, unlike bh/preempt. + invoke_rcu_core(); } else { - /* Enabling BH or preempt does reschedule, so... */ + // Enabling BH or preempt does reschedule, so... set_tsk_need_resched(current); set_preempt_need_resched(); } + t->rcu_read_unlock_special.b.deferred_qs = true; local_irq_restore(flags); return; } From 25102de65fdd246eb6801114ce6dfa3a076bb678 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 1 Apr 2019 14:12:50 -0700 Subject: [PATCH 03/61] rcu: Only do rcu_read_unlock_special() wakeups if expedited Currently, rcu_read_unlock_special() will do wakeups whenever it is safe to do so. However, wakeups are expensive, and they are only really needed when the just-ended RCU read-side critical section is blocking an expedited grace period (in which case speed is of the essence) or on a nohz_full CPU (where it might be a good long time before an interrupt arrives). This commit therefore checks for these conditions, and does the expensive wakeups only if doing so would be useful. Note it can be rather expensive to determine whether or not the current task (as opposed to the current CPU) is blocking the current expedited grace period. Doing so requires traversing the ->blkd_tasks list, which can be quite long. This commit therefore cheats: If the current task is on a given ->blkd_tasks list, and some task on that list is blocking the current expedited grace period, the code assumes that the current task is blocking that expedited grace period. Reported-by: Peter Zijlstra Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree_plugin.h | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 75110ea75d01..d15cdab6aeb4 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -606,20 +606,28 @@ static void rcu_read_unlock_special(struct task_struct *t) local_irq_save(flags); irqs_were_disabled = irqs_disabled_flags(flags); if (preempt_bh_were_disabled || irqs_were_disabled) { + bool exp; + struct rcu_data *rdp = this_cpu_ptr(&rcu_data); + struct rcu_node *rnp = rdp->mynode; + t->rcu_read_unlock_special.b.exp_hint = false; + exp = (t->rcu_blocked_node && t->rcu_blocked_node->exp_tasks) || + (rdp->grpmask & rnp->expmask) || + tick_nohz_full_cpu(rdp->cpu); // Need to defer quiescent state until everything is enabled. - if (irqs_were_disabled && use_softirq && + if (exp && irqs_were_disabled && use_softirq && (in_irq() || !t->rcu_read_unlock_special.b.deferred_qs)) { // Using softirq, safe to awaken, and we get // no help from enabling irqs, unlike bh/preempt. raise_softirq_irqoff(RCU_SOFTIRQ); - } else if (irqs_were_disabled && !use_softirq && + } else if (exp && irqs_were_disabled && !use_softirq && !t->rcu_read_unlock_special.b.deferred_qs) { // Safe to awaken and we get no help from enabling // irqs, unlike bh/preempt. invoke_rcu_core(); } else { // Enabling BH or preempt does reschedule, so... + // Also if no expediting or NO_HZ_FULL, slow is OK. set_tsk_need_resched(current); set_preempt_need_resched(); } From 385b599e8c04fa843c4d7f785478827cc512d720 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 1 Apr 2019 15:12:47 -0700 Subject: [PATCH 04/61] rcu: Allow rcu_read_unlock_special() to raise_softirq() if in_irq() When running in an interrupt handler, raise_softirq() and raise_softirq_irqoff() have extremely low overhead: They simply set a bit in a per-CPU mask, which is checked upon exit from that interrupt handler. Therefore, if rcu_read_unlock_special() is invoked within an interrupt handler and RCU_SOFTIRQ is in use, this commit make use of raise_softirq_irqoff() even if there is no expedited grace period in flight and even if this is not a nohz_full CPU. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index d15cdab6aeb4..e1005f5e8094 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -615,7 +615,7 @@ static void rcu_read_unlock_special(struct task_struct *t) (rdp->grpmask & rnp->expmask) || tick_nohz_full_cpu(rdp->cpu); // Need to defer quiescent state until everything is enabled. - if (exp && irqs_were_disabled && use_softirq && + if ((exp || in_irq()) && irqs_were_disabled && use_softirq && (in_irq() || !t->rcu_read_unlock_special.b.deferred_qs)) { // Using softirq, safe to awaken, and we get // no help from enabling irqs, unlike bh/preempt. From 0864f057b050bc6dd68106b3185e02db5140012d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 4 Apr 2019 12:19:25 -0700 Subject: [PATCH 05/61] rcu: Use irq_work to get scheduler's attention in clean context When rcu_read_unlock_special() is invoked with interrupts disabled, is either not in an interrupt handler or is not using RCU_SOFTIRQ, is not the first RCU read-side critical section in the chain, and either there is an expedited grace period in flight or this is a NO_HZ_FULL kernel, the end of the grace period can be unduly delayed. 
The reason for this is that it is not safe to do wakeups in this situation. This commit fixes this problem by using the irq_work subsystem to force a later interrupt handler in a clean environment. Because set_tsk_need_resched(current) and set_preempt_need_resched() are invoked prior to this, the scheduler will force a context switch upon return from this interrupt (though perhaps at the end of any interrupted preempt-disable or BH-disable region of code), which will invoke rcu_note_context_switch() (again in a clean environment), which will in turn give RCU the chance to report the deferred quiescent state. Of course, by then this task might be within another RCU read-side critical section. But that will be detected at that time and reporting will be further deferred to the outermost rcu_read_unlock(). See rcu_preempt_need_deferred_qs() and rcu_preempt_deferred_qs() for more details on the checking. Suggested-by: Peter Zijlstra Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.h | 2 ++ kernel/rcu/tree_plugin.h | 20 ++++++++++++++++++++ 2 files changed, 22 insertions(+) diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index a1a72a1ecb02..21d740f0b8dc 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -161,6 +161,8 @@ struct rcu_data { /* ticks this CPU has handled */ /* during and after the last grace */ /* period it is aware of. */ + struct irq_work defer_qs_iw; /* Obtain later scheduler attention. */ + bool defer_qs_iw_pending; /* Scheduler attention pending? */ /* 2) batch handling */ struct rcu_segcblist cblist; /* Segmented callback list, with */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index e1005f5e8094..58c7853f19e7 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -587,6 +587,17 @@ static void rcu_preempt_deferred_qs(struct task_struct *t) t->rcu_read_lock_nesting += RCU_NEST_BIAS; } +/* + * Minimal handler to give the scheduler a chance to re-evaluate. + */ +static void rcu_preempt_deferred_qs_handler(struct irq_work *iwp) +{ + struct rcu_data *rdp; + + rdp = container_of(iwp, struct rcu_data, defer_qs_iw); + rdp->defer_qs_iw_pending = false; +} + /* * Handle special cases during rcu_read_unlock(), such as needing to * notify RCU core processing or task having blocked during the RCU @@ -630,6 +641,15 @@ static void rcu_read_unlock_special(struct task_struct *t) // Also if no expediting or NO_HZ_FULL, slow is OK. set_tsk_need_resched(current); set_preempt_need_resched(); + if (IS_ENABLED(CONFIG_IRQ_WORK) && + !rdp->defer_qs_iw_pending && exp) { + // Get scheduler to re-evaluate and call hooks. + // If !IRQ_WORK, FQS scan will eventually IPI. + init_irq_work(&rdp->defer_qs_iw, + rcu_preempt_deferred_qs_handler); + rdp->defer_qs_iw_pending = true; + irq_work_queue_on(&rdp->defer_qs_iw, rdp->cpu); + } } t->rcu_read_unlock_special.b.deferred_qs = true; local_irq_restore(flags); From 43e903ad3e0843d03da15d8eaffb5ada22966c76 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 25 Mar 2019 08:36:03 -0700 Subject: [PATCH 06/61] rcu: Inline invoke_rcu_callbacks() into its sole remaining caller This commit saves a few lines of code by inlining invoke_rcu_callbacks() into its sole remaining caller. Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree.c | 20 +++----------------- 1 file changed, 3 insertions(+), 17 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 8e290163505a..7822a2e1370d 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -147,7 +147,6 @@ static void rcu_init_new_rnp(struct rcu_node *rnp_leaf); static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf); static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu); static void invoke_rcu_core(void); -static void invoke_rcu_callbacks(struct rcu_data *rdp); static void rcu_report_exp_rdp(struct rcu_data *rdp); static void sync_sched_exp_online_cleanup(int cpu); @@ -2296,8 +2295,9 @@ static __latent_entropy void rcu_core(void) rcu_check_gp_start_stall(rnp, rdp, rcu_jiffies_till_stall_check()); /* If there are callbacks ready, invoke them. */ - if (rcu_segcblist_ready_cbs(&rdp->cblist)) - invoke_rcu_callbacks(rdp); + if (rcu_segcblist_ready_cbs(&rdp->cblist) && + likely(READ_ONCE(rcu_scheduler_fully_active))) + rcu_do_batch(rdp); /* Do any needed deferred wakeups of rcuo kthreads. */ do_nocb_deferred_wakeup(rdp); @@ -2332,20 +2332,6 @@ static void invoke_rcu_core_kthread(void) local_irq_restore(flags); } -/* - * Do RCU callback invocation. Not that if we are running !use_softirq, - * we are already in the rcuc kthread. If callbacks are offloaded, then - * ->cblist is always empty, so we don't get here. Therefore, we only - * ever need to check for the scheduler being operational (some callbacks - * do wakeups, so we do need the scheduler). - */ -static void invoke_rcu_callbacks(struct rcu_data *rdp) -{ - if (unlikely(!READ_ONCE(rcu_scheduler_fully_active))) - return; - rcu_do_batch(rdp); -} - /* * Wake up this CPU's rcuc kthread to do RCU core processing. */ From b9ad4d6ed18e23b0ff6a824b925a1278625d5345 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 27 Mar 2019 09:09:47 -0700 Subject: [PATCH 07/61] rcu: Avoid self-IPI in sync_rcu_exp_select_node_cpus() Although sync_rcu_exp_select_node_cpus() treats the current CPU as being in a quiescent state, it might well migrate to some other CPU before reaching the smp_call_function_single(), which could then result in an unnecessary simulated self-IPI. This commit therefore instead simply refuses to invoke smp_call_function_single() on the current CPU, which causes the later rcu_report_exp_cpu_mult() to report this CPU's quiescent state with less overhead. This also reduces the rcu_exp_handler() function's state space by removing the direct call that this smp_call_function_single() uses to emulate the requested self-IPI. Signed-off-by: Paul E. McKenney [ paulmck: Use get_cpu() instead of preempt_disable() per Joel Fernandes. ] --- kernel/rcu/tree_exp.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 9c990df880d1..5390618787b6 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -384,7 +384,12 @@ retry_ipi: mask_ofl_test |= mask; continue; } + if (get_cpu() == cpu) { + put_cpu(); + continue; + } ret = smp_call_function_single(cpu, rcu_exp_handler, NULL, 0); + put_cpu(); if (!ret) { mask_ofl_ipi &= ~mask; continue; From e015a341122024198f57d1f0498a776523137e94 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Wed, 27 Mar 2019 10:03:12 -0700 Subject: [PATCH 08/61] rcu: Avoid self-IPI in sync_sched_exp_online_cleanup() The sync_sched_exp_online_cleanup() is invoked at online time to handle the case where the start of an expedited grace period ran concurrently with a CPU being taken offline and then immediately being placed online. It checks to see if RCU needs an expedited quiescent state from the incoming CPU, sending it an IPI if so. However, it is quite possible that sync_sched_exp_online_cleanup() is running on that CPU, in which case it is considerably less overhead to simply request the quiescent state locally instead of simulating a self-IPI. This commit therefore places the last few lines of rcu_exp_handler() into a new rcu_exp_need_qs() function, which is invoked both by rcu_exp_handler() and by sync_sched_exp_online_cleanup() in the self-IPI case. This also reduces the rcu_exp_handler() function's state space by removing the direct call that this smp_call_function_single() uses to emulate the requested self-IPI. This in turn will allow tighter error checking in rcu_is_cpu_rrupt_from_idle(). Signed-off-by: Paul E. McKenney Reviewed-by: Joel Fernandes (Google) --- kernel/rcu/tree_exp.h | 35 +++++++++++++++++++++++++++++------ 1 file changed, 29 insertions(+), 6 deletions(-) diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 5390618787b6..de1b4acf6979 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -699,6 +699,16 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp) #else /* #ifdef CONFIG_PREEMPT_RCU */ +/* Request an expedited quiescent state. */ +static void rcu_exp_need_qs(void) +{ + __this_cpu_write(rcu_data.cpu_no_qs.b.exp, true); + /* Store .exp before .rcu_urgent_qs. */ + smp_store_release(this_cpu_ptr(&rcu_data.rcu_urgent_qs), true); + set_tsk_need_resched(current); + set_preempt_need_resched(); +} + /* Invoked on each online non-idle CPU for expedited quiescent state. */ static void rcu_exp_handler(void *unused) { @@ -714,25 +724,38 @@ static void rcu_exp_handler(void *unused) rcu_report_exp_rdp(this_cpu_ptr(&rcu_data)); return; } - __this_cpu_write(rcu_data.cpu_no_qs.b.exp, true); - /* Store .exp before .rcu_urgent_qs. */ - smp_store_release(this_cpu_ptr(&rcu_data.rcu_urgent_qs), true); - set_tsk_need_resched(current); - set_preempt_need_resched(); + rcu_exp_need_qs(); } /* Send IPI for expedited cleanup if needed at end of CPU-hotplug operation. */ static void sync_sched_exp_online_cleanup(int cpu) { + unsigned long flags; + int my_cpu; struct rcu_data *rdp; int ret; struct rcu_node *rnp; rdp = per_cpu_ptr(&rcu_data, cpu); rnp = rdp->mynode; - if (!(READ_ONCE(rnp->expmask) & rdp->grpmask)) + my_cpu = get_cpu(); + /* Quiescent state either not needed or already requested, leave. */ + if (!(READ_ONCE(rnp->expmask) & rdp->grpmask) || + __this_cpu_read(rcu_data.cpu_no_qs.b.exp)) { + put_cpu(); return; + } + /* Quiescent state needed on current CPU, so set it up locally. */ + if (my_cpu == cpu) { + local_irq_save(flags); + rcu_exp_need_qs(); + local_irq_restore(flags); + put_cpu(); + return; + } + /* Quiescent state needed on some other CPU, send IPI. 
*/ ret = smp_call_function_single(cpu, rcu_exp_handler, NULL, 0); + put_cpu(); WARN_ON_ONCE(ret); } From 71d8d1531e0904e08adf1540e191bd707dfd73da Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Tue, 26 Mar 2019 15:24:08 -0400 Subject: [PATCH 09/61] lockdep: Add assertion to check if in an interrupt In rcu_rrupt_from_idle, we want to check if it is called from within an interrupt, but want to do such checking only for debug builds. lockdep already tracks when we enter an interrupt. Let us expose it as an assertion macro so it can be used to assert this. Suggested-by: Steven Rostedt Cc: kernel-team@android.com Cc: rcu@vger.kernel.org Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- include/linux/lockdep.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 6e2377e6c1d6..e8eef38b2213 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -632,11 +632,18 @@ do { \ "IRQs not disabled as expected\n"); \ } while (0) +#define lockdep_assert_in_irq() do { \ + WARN_ONCE(debug_locks && !current->lockdep_recursion && \ + !current->hardirq_context, \ + "Not in hardirq as expected\n"); \ + } while (0) + #else # define might_lock(lock) do { } while (0) # define might_lock_read(lock) do { } while (0) # define lockdep_assert_irqs_enabled() do { } while (0) # define lockdep_assert_irqs_disabled() do { } while (0) +# define lockdep_assert_in_irq() do { } while (0) #endif #ifdef CONFIG_LOCKDEP From 4494dd58fbb477e54c129c1d8ef477aad433eba0 Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Mon, 22 Apr 2019 12:17:45 -0400 Subject: [PATCH 10/61] tools/memory-model: Prepare for data-race detection This patch makes some slight alterations to linux-kernel.cat in preparation for adding support for data-race detection to the Linux-Kernel Memory Model. The definitions of relations involved in Acquire, Release, and unlock-lock ordering are moved up earlier in the source file. The rmb relation is factored through the new R4rmb class: the class of reads to which rmb will apply. The definition of the fence relation is moved earlier, and it is split up into read- and write-fences (rmb and wmb) and all the others. This should not make any functional changes. Signed-off-by: Alan Stern Reviewed-by: Andrea Parri Signed-off-by: Paul E. McKenney --- tools/memory-model/linux-kernel.cat | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/tools/memory-model/linux-kernel.cat b/tools/memory-model/linux-kernel.cat index 8dcb37835b61..834107022c50 100644 --- a/tools/memory-model/linux-kernel.cat +++ b/tools/memory-model/linux-kernel.cat @@ -24,8 +24,14 @@ include "lock.cat" (* Basic relations *) (*******************) +(* Release Acquire *) +let acq-po = [Acquire] ; po ; [M] +let po-rel = [M] ; po ; [Release] +let po-unlock-rf-lock-po = po ; [UL] ; rf ; [LKR] ; po + (* Fences *) -let rmb = [R \ Noreturn] ; fencerel(Rmb) ; [R \ Noreturn] +let R4rmb = R \ Noreturn (* Reads for which rmb works *) +let rmb = [R4rmb] ; fencerel(Rmb) ; [R4rmb] let wmb = [W] ; fencerel(Wmb) ; [W] let mb = ([M] ; fencerel(Mb) ; [M]) | ([M] ; fencerel(Before-atomic) ; [RMW] ; po? ; [M]) | @@ -34,13 +40,10 @@ let mb = ([M] ; fencerel(Mb) ; [M]) | ([M] ; po ; [UL] ; (co | po) ; [LKW] ; fencerel(After-unlock-lock) ; [M]) let gp = po ; [Sync-rcu | Sync-srcu] ; po? 
- let strong-fence = mb | gp -(* Release Acquire *) -let acq-po = [Acquire] ; po ; [M] -let po-rel = [M] ; po ; [Release] -let po-unlock-rf-lock-po = po ; [UL] ; rf ; [LKR] ; po +let nonrw-fence = strong-fence | po-rel | acq-po +let fence = nonrw-fence | wmb | rmb (**********************************) (* Fundamental coherence ordering *) @@ -63,7 +66,6 @@ let rwdep = (dep | ctrl) ; [W] let overwrite = co | fr let to-w = rwdep | (overwrite & int) let to-r = addr | (dep ; rfi) -let fence = strong-fence | wmb | po-rel | rmb | acq-po let ppo = to-r | to-w | fence | (po-unlock-rf-lock-po & int) (* Propagation: Ordering from release operations and strong fences. *) From d1a84ab190137cc2a980b6979b1f2790d51b2d87 Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Mon, 22 Apr 2019 12:17:58 -0400 Subject: [PATCH 11/61] tools/memory-model: Add definitions of plain and marked accesses This patch adds definitions for marked and plain accesses to the Linux-Kernel Memory Model. It also modifies the definitions of the existing parts of the model (including the cumul-fence, prop, hb, pb, and rb relations) so as to make them apply only to marked accesses. Signed-off-by: Alan Stern Reviewed-by: Andrea Parri Signed-off-by: Paul E. McKenney --- tools/memory-model/linux-kernel.bell | 5 +++++ tools/memory-model/linux-kernel.cat | 16 +++++++++------- 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/tools/memory-model/linux-kernel.bell b/tools/memory-model/linux-kernel.bell index def9131d3d8e..b60eb5a01053 100644 --- a/tools/memory-model/linux-kernel.bell +++ b/tools/memory-model/linux-kernel.bell @@ -76,3 +76,8 @@ flag ~empty rcu-rscs & (po ; [Sync-srcu] ; po) as invalid-sleep (* Validate SRCU dynamic match *) flag ~empty different-values(srcu-rscs) as srcu-bad-nesting + +(* Compute marked and plain memory accesses *) +let Marked = (~M) | IW | Once | Release | Acquire | domain(rmw) | range(rmw) | + LKR | LKW | UL | LF | RL | RU +let Plain = M \ Marked diff --git a/tools/memory-model/linux-kernel.cat b/tools/memory-model/linux-kernel.cat index 834107022c50..ff354e5ffd4b 100644 --- a/tools/memory-model/linux-kernel.cat +++ b/tools/memory-model/linux-kernel.cat @@ -65,19 +65,21 @@ let dep = addr | data let rwdep = (dep | ctrl) ; [W] let overwrite = co | fr let to-w = rwdep | (overwrite & int) -let to-r = addr | (dep ; rfi) +let to-r = addr | (dep ; [Marked] ; rfi) let ppo = to-r | to-w | fence | (po-unlock-rf-lock-po & int) (* Propagation: Ordering from release operations and strong fences. *) -let A-cumul(r) = rfe? ; r -let cumul-fence = A-cumul(strong-fence | po-rel) | wmb | po-unlock-rf-lock-po -let prop = (overwrite & ext)? ; cumul-fence* ; rfe? +let A-cumul(r) = (rfe ; [Marked])? ; r +let cumul-fence = [Marked] ; (A-cumul(strong-fence | po-rel) | wmb | + po-unlock-rf-lock-po) ; [Marked] +let prop = [Marked] ; (overwrite & ext)? ; cumul-fence* ; + [Marked] ; rfe? ; [Marked] (* * Happens Before: Ordering from the passage of time. * No fences needed here for prop because relation confined to one process. *) -let hb = ppo | rfe | ((prop \ id) & int) +let hb = [Marked] ; (ppo | rfe | ((prop \ id) & int)) ; [Marked] acyclic hb as happens-before (****************************************) @@ -85,7 +87,7 @@ acyclic hb as happens-before (****************************************) (* Propagation: Each non-rf link needs a strong fence. 
*) -let pb = prop ; strong-fence ; hb* +let pb = prop ; strong-fence ; hb* ; [Marked] acyclic pb as propagation (*******) @@ -133,7 +135,7 @@ let rec rcu-fence = rcu-gp | srcu-gp | (rcu-fence ; rcu-link ; rcu-fence) (* rb orders instructions just as pb does *) -let rb = prop ; po ; rcu-fence ; po? ; hb* ; pb* +let rb = prop ; po ; rcu-fence ; po? ; hb* ; pb* ; [Marked] irreflexive rb as rcu From 0031e38adf38779acce5737f4905b9f60750b674 Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Mon, 22 Apr 2019 12:18:09 -0400 Subject: [PATCH 12/61] tools/memory-model: Add data-race detection This patch adds data-race detection to the Linux-Kernel Memory Model. As part of this effort, support is added for: compiler barriers (the barrier() function), and a new Preserved Program Order term: (addr ; [Plain] ; wmb) Data races are marked with a special Flag warning in herd. It is not guaranteed that the model will provide accurate predictions when a data race is present. The patch does not include documentation for the data-race detection facility. The basic design has been explained in various emails, and a separate documentation patch will be submitted later. This work is based on an earlier formulation of data races for the LKMM by Andrea Parri. Signed-off-by: Alan Stern Reviewed-by: Andrea Parri Signed-off-by: Paul E. McKenney --- tools/memory-model/linux-kernel.bell | 1 + tools/memory-model/linux-kernel.cat | 50 +++++++++++++++++++++++++++- tools/memory-model/linux-kernel.def | 1 + 3 files changed, 51 insertions(+), 1 deletion(-) diff --git a/tools/memory-model/linux-kernel.bell b/tools/memory-model/linux-kernel.bell index b60eb5a01053..5be86b1025e8 100644 --- a/tools/memory-model/linux-kernel.bell +++ b/tools/memory-model/linux-kernel.bell @@ -24,6 +24,7 @@ instructions RMW[{'once,'acquire,'release}] enum Barriers = 'wmb (*smp_wmb*) || 'rmb (*smp_rmb*) || 'mb (*smp_mb*) || + 'barrier (*barrier*) || 'rcu-lock (*rcu_read_lock*) || 'rcu-unlock (*rcu_read_unlock*) || 'sync-rcu (*synchronize_rcu*) || diff --git a/tools/memory-model/linux-kernel.cat b/tools/memory-model/linux-kernel.cat index ff354e5ffd4b..36d367054811 100644 --- a/tools/memory-model/linux-kernel.cat +++ b/tools/memory-model/linux-kernel.cat @@ -44,6 +44,9 @@ let strong-fence = mb | gp let nonrw-fence = strong-fence | po-rel | acq-po let fence = nonrw-fence | wmb | rmb +let barrier = fencerel(Barrier | Rmb | Wmb | Mb | Sync-rcu | Sync-srcu | + Before-atomic | After-atomic | Acquire | Release) | + (po ; [Release]) | ([Acquire] ; po) (**********************************) (* Fundamental coherence ordering *) @@ -64,7 +67,7 @@ empty rmw & (fre ; coe) as atomic let dep = addr | data let rwdep = (dep | ctrl) ; [W] let overwrite = co | fr -let to-w = rwdep | (overwrite & int) +let to-w = rwdep | (overwrite & int) | (addr ; [Plain] ; wmb) let to-r = addr | (dep ; [Marked] ; rfi) let ppo = to-r | to-w | fence | (po-unlock-rf-lock-po & int) @@ -147,3 +150,48 @@ irreflexive rb as rcu * let xb = hb | pb | rb * acyclic xb as executes-before *) + +(*********************************) +(* Plain accesses and data races *) +(*********************************) + +(* Warn about plain writes and marked accesses in the same region *) +let mixed-accesses = ([Plain & W] ; (po-loc \ barrier) ; [Marked]) | + ([Marked] ; (po-loc \ barrier) ; [Plain & W]) +flag ~empty mixed-accesses as mixed-accesses + +(* Executes-before and visibility *) +let xbstar = (hb | pb | rb)* +let full-fence = strong-fence | (po ; rcu-fence ; po?) +let vis = cumul-fence* ; rfe? 
; [Marked] ; + ((full-fence ; [Marked] ; xbstar) | (xbstar & int)) + +(* Boundaries for lifetimes of plain accesses *) +let w-pre-bounded = [Marked] ; (addr | fence)? +let r-pre-bounded = [Marked] ; (addr | nonrw-fence | + ([R4rmb] ; fencerel(Rmb) ; [~Noreturn]))? +let w-post-bounded = fence? ; [Marked] +let r-post-bounded = (nonrw-fence | ([~Noreturn] ; fencerel(Rmb) ; [R4rmb]))? ; + [Marked] + +(* Visibility and executes-before for plain accesses *) +let ww-vis = w-post-bounded ; vis ; w-pre-bounded +let wr-vis = w-post-bounded ; vis ; r-pre-bounded +let rw-xbstar = r-post-bounded ; xbstar ; w-pre-bounded + +(* Potential races *) +let pre-race = ext & ((Plain * M) | ((M \ IW) * Plain)) + +(* Coherence requirements for plain accesses *) +let wr-incoh = pre-race & rf & rw-xbstar^-1 +let rw-incoh = pre-race & fr & wr-vis^-1 +let ww-incoh = pre-race & co & ww-vis^-1 +empty (wr-incoh | rw-incoh | ww-incoh) as plain-coherence + +(* Actual races *) +let ww-nonrace = ww-vis & ((Marked * W) | rw-xbstar) & ((W * Marked) | wr-vis) +let ww-race = (pre-race & co) \ ww-nonrace +let wr-race = (pre-race & (co? ; rf)) \ wr-vis +let rw-race = (pre-race & fr) \ rw-xbstar + +flag ~empty (ww-race | wr-race | rw-race) as data-race diff --git a/tools/memory-model/linux-kernel.def b/tools/memory-model/linux-kernel.def index 551eeaa389d4..ef0f3c1850de 100644 --- a/tools/memory-model/linux-kernel.def +++ b/tools/memory-model/linux-kernel.def @@ -24,6 +24,7 @@ smp_mb__before_atomic() { __fence{before-atomic}; } smp_mb__after_atomic() { __fence{after-atomic}; } smp_mb__after_spinlock() { __fence{after-spinlock}; } smp_mb__after_unlock_lock() { __fence{after-unlock-lock}; } +barrier() { __fence{barrier}; } // Exchange xchg(X,V) __xchg{mb}(X,V) From eddded80121f2a7bda810f65bf7cb648a709ed11 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Tue, 26 Mar 2019 15:24:09 -0400 Subject: [PATCH 13/61] rcu: Add checks for dynticks counters in rcu_is_cpu_rrupt_from_idle() It would be good to combine the dynticks and dynticks_nesting counters in order to simplify the code. Unfortunately, there are concerns about usermode upcalls appearing to RCU as half of an interrupt, as Byungchul learned [1]. The "half" in "half interrupt" is due to an unpaired rcu_irq_enter(): Normally, each rcu_irq_enter() has a later call to rcu_irq_exit(). Out of an abundance of caution, Paul added warnings [2] in the RCU code which if not fired by 2021 will be interpreted as meaning that this half-interrupt scenario cannot happen any more, thus permitting simplification of this code. In the meantime, this commit makes the following changes: (1) Combining these two counters requires that rcu_rrupt_from_idle() is invoked only from hard-interrupt contexts as discussed here [3]. This commit therefore adds the required lockdep_assert_in_irq() to check this constraint. (2) Furthermore, rcu_rrupt_from_idle() is not explicit about how it is using the counters which can lead to weird future bugs. This commit therefore adds comments indicating the meaning and use of each counter. (3) Lastly, this commit checks for counter underflows as another check that half interrupts don't occur. (Previously, the function would simply return true upon underflow.) All these checks checks are NOOPs if PROVE_LOCKING (and thus PROVE_RCU) are disabled. 
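As a rough illustration of what the new checks expect (a simplification, with
the authoritative bookkeeping living in the rcu_irq_enter()/rcu_nmi_enter()
and rcu_irq_exit() paths), the per-CPU counters look like this in the one case
where rcu_is_cpu_rrupt_from_idle() now returns true:

	/* Scheduling-clock interrupt taken directly from the idle loop: */
	rcu_data.dynticks_nesting == 0		/* no non-idle process-level sojourn */
	rcu_data.dynticks_nmi_nesting == 1	/* exactly one (outermost) irq level */

An interrupt that is nested within another interrupt, or that arrives in
non-idle task context, fails one of these two tests and the function
returns false.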
[1] https://lore.kernel.org/patchwork/patch/952349/ [2] Commit e11ec65cc8d6 ("rcu: Add warning to detect half-interrupts") [3] https://lore.kernel.org/lkml/20190312150514.GB249405@google.com/ Cc: byungchul.park@lge.com Cc: kernel-team@android.com Cc: rcu@vger.kernel.org Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 7822a2e1370d..b9629cf08f94 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -376,16 +376,29 @@ static void __maybe_unused rcu_momentary_dyntick_idle(void) } /** - * rcu_is_cpu_rrupt_from_idle - see if idle or immediately interrupted from idle + * rcu_is_cpu_rrupt_from_idle - see if interrupted from idle * - * If the current CPU is idle or running at a first-level (not nested) + * If the current CPU is idle and running at a first-level (not nested) * interrupt from idle, return true. The caller must have at least * disabled preemption. */ static int rcu_is_cpu_rrupt_from_idle(void) { - return __this_cpu_read(rcu_data.dynticks_nesting) <= 0 && - __this_cpu_read(rcu_data.dynticks_nmi_nesting) <= 1; + /* Called only from within the scheduling-clock interrupt */ + lockdep_assert_in_irq(); + + /* Check for counter underflows */ + RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nesting) < 0, + "RCU dynticks_nesting counter underflow!"); + RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nmi_nesting) <= 0, + "RCU dynticks_nmi_nesting counter underflow/zero!"); + + /* Are we at first interrupt nesting level? */ + if (__this_cpu_read(rcu_data.dynticks_nmi_nesting) != 1) + return false; + + /* Does CPU appear to be idle from an RCU standpoint? */ + return __this_cpu_read(rcu_data.dynticks_nesting) == 0; } #define DEFAULT_RCU_BLIMIT 10 /* Maximum callbacks per rcu_do_batch. */ From 1bb336443cde1154600bd147a45a30baa59c57db Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 27 Mar 2019 15:51:25 -0700 Subject: [PATCH 14/61] rcu: Rename rcu_data's ->deferred_qs to ->exp_deferred_qs The rcu_data structure's ->deferred_qs field is used to indicate that the current CPU is blocking an expedited grace period (perhaps a future one). Given that it is used only for expedited grace periods, its current name is misleading, so this commit renames it to ->exp_deferred_qs. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.h | 2 +- kernel/rcu/tree_exp.h | 8 ++++---- kernel/rcu/tree_plugin.h | 14 +++++++------- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 21d740f0b8dc..7acaf3a62d39 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -154,7 +154,7 @@ struct rcu_data { bool core_needs_qs; /* Core waits for quiesc state. */ bool beenonline; /* CPU online at least once. */ bool gpwrap; /* Possible ->gp_seq wrap. */ - bool deferred_qs; /* This CPU awaiting a deferred QS? */ + bool exp_deferred_qs; /* This CPU awaiting a deferred QS? */ struct rcu_node *mynode; /* This CPU's leaf of hierarchy */ unsigned long grpmask; /* Mask to apply to leaf qsmask. 
*/ unsigned long ticks_this_gp; /* The number of scheduling-clock */ diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index de1b4acf6979..e0c928d04be5 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -250,7 +250,7 @@ static void rcu_report_exp_cpu_mult(struct rcu_node *rnp, */ static void rcu_report_exp_rdp(struct rcu_data *rdp) { - WRITE_ONCE(rdp->deferred_qs, false); + WRITE_ONCE(rdp->exp_deferred_qs, false); rcu_report_exp_cpu_mult(rdp->mynode, rdp->grpmask, true); } @@ -616,7 +616,7 @@ static void rcu_exp_handler(void *unused) rcu_dynticks_curr_cpu_in_eqs()) { rcu_report_exp_rdp(rdp); } else { - rdp->deferred_qs = true; + rdp->exp_deferred_qs = true; set_tsk_need_resched(t); set_preempt_need_resched(); } @@ -638,7 +638,7 @@ static void rcu_exp_handler(void *unused) if (t->rcu_read_lock_nesting > 0) { raw_spin_lock_irqsave_rcu_node(rnp, flags); if (rnp->expmask & rdp->grpmask) { - rdp->deferred_qs = true; + rdp->exp_deferred_qs = true; t->rcu_read_unlock_special.b.exp_hint = true; } raw_spin_unlock_irqrestore_rcu_node(rnp, flags); @@ -661,7 +661,7 @@ static void rcu_exp_handler(void *unused) * * Otherwise, force a context switch after the CPU enables everything. */ - rdp->deferred_qs = true; + rdp->exp_deferred_qs = true; if (!(preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK)) || WARN_ON_ONCE(rcu_dynticks_curr_cpu_in_eqs())) { rcu_preempt_deferred_qs(t); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 58c7853f19e7..1aeb4ae187ce 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -237,10 +237,10 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) * no need to check for a subsequent expedited GP. (Though we are * still in a quiescent state in any case.) */ - if (blkd_state & RCU_EXP_BLKD && rdp->deferred_qs) + if (blkd_state & RCU_EXP_BLKD && rdp->exp_deferred_qs) rcu_report_exp_rdp(rdp); else - WARN_ON_ONCE(rdp->deferred_qs); + WARN_ON_ONCE(rdp->exp_deferred_qs); } /* @@ -337,7 +337,7 @@ void rcu_note_context_switch(bool preempt) * means that we continue to block the current grace period. */ rcu_qs(); - if (rdp->deferred_qs) + if (rdp->exp_deferred_qs) rcu_report_exp_rdp(rdp); trace_rcu_utilization(TPS("End context switch")); barrier(); /* Avoid RCU read-side critical sections leaking up. */ @@ -451,7 +451,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) */ special = t->rcu_read_unlock_special; rdp = this_cpu_ptr(&rcu_data); - if (!special.s && !rdp->deferred_qs) { + if (!special.s && !rdp->exp_deferred_qs) { local_irq_restore(flags); return; } @@ -459,7 +459,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) if (special.b.need_qs) { rcu_qs(); t->rcu_read_unlock_special.b.need_qs = false; - if (!t->rcu_read_unlock_special.s && !rdp->deferred_qs) { + if (!t->rcu_read_unlock_special.s && !rdp->exp_deferred_qs) { local_irq_restore(flags); return; } @@ -471,7 +471,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) * tasks are handled when removing the task from the * blocked-tasks list below. 
*/ - if (rdp->deferred_qs) { + if (rdp->exp_deferred_qs) { rcu_report_exp_rdp(rdp); if (!t->rcu_read_unlock_special.s) { local_irq_restore(flags); @@ -560,7 +560,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) */ static bool rcu_preempt_need_deferred_qs(struct task_struct *t) { - return (__this_cpu_read(rcu_data.deferred_qs) || + return (__this_cpu_read(rcu_data.exp_deferred_qs) || READ_ONCE(t->rcu_read_unlock_special.s)) && t->rcu_read_lock_nesting <= 0; } From f0b635627395223d3c60a3105372b4349e04772f Mon Sep 17 00:00:00 2001 From: Jiang Biao Date: Tue, 23 Apr 2019 09:21:55 +0800 Subject: [PATCH 15/61] rcu: Remove unused rdp local from synchronize_rcu_expedited() Because rdp is initialized but never used in synchronize_rcu_expedited(), this commit removes it. Signed-off-by: Jiang Biao Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_exp.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index e0c928d04be5..8e539710721a 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -793,7 +793,6 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp) */ void synchronize_rcu_expedited(void) { - struct rcu_data *rdp; struct rcu_exp_work rew; struct rcu_node *rnp; unsigned long s; @@ -830,7 +829,6 @@ void synchronize_rcu_expedited(void) } /* Wait for expedited grace period to complete. */ - rdp = per_cpu_ptr(&rcu_data, raw_smp_processor_id()); rnp = rcu_get_root(); wait_event(rnp->exp_wq[rcu_seq_ctr(s) & 0x3], sync_exp_work_done(s)); From de1dbcee433ccff3e0a37698c65b40542c9d4cf1 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Fri, 29 Mar 2019 10:05:55 -0400 Subject: [PATCH 16/61] doc/rcuref: Document real world examples in kernel Document similar real world examples in the kernel corresponding to the second and third code snippets. Also correct an issue in release_referenced() in the code snippet example. Cc: oleg@redhat.com Cc: jannh@google.com Signed-off-by: Joel Fernandes (Google) [ paulmck: Do a bit of wordsmithing. ] Signed-off-by: Paul E. McKenney --- Documentation/RCU/rcuref.txt | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/Documentation/RCU/rcuref.txt b/Documentation/RCU/rcuref.txt index 613033ff2b9b..5e6429d66c24 100644 --- a/Documentation/RCU/rcuref.txt +++ b/Documentation/RCU/rcuref.txt @@ -12,6 +12,7 @@ please read on. Reference counting on elements of lists which are protected by traditional reader/writer spinlocks or semaphores are straightforward: +CODE LISTING A: 1. 2. add() search_and_reference() { { @@ -28,7 +29,8 @@ add() search_and_reference() release_referenced() delete() { { ... write_lock(&list_lock); - atomic_dec(&el->rc, relfunc) ... + if(atomic_dec_and_test(&el->rc)) ... + kfree(el); ... remove_element } write_unlock(&list_lock); ... @@ -44,6 +46,7 @@ search_and_reference() could potentially hold reference to an element which has already been deleted from the list/array. Use atomic_inc_not_zero() in this scenario as follows: +CODE LISTING B: 1. 2. add() search_and_reference() { { @@ -79,6 +82,7 @@ search_and_reference() code path. In such cases, the atomic_dec_and_test() may be moved from delete() to el_free() as follows: +CODE LISTING C: 1. 2. add() search_and_reference() { { @@ -114,6 +118,17 @@ element can therefore safely be freed. This in turn guarantees that if any reader finds the element, that reader may safely acquire a reference without checking the value of the reference counter. 
+A clear advantage of the RCU-based pattern in listing C over the one +in listing B is that any call to search_and_reference() that locates +a given object will succeed in obtaining a reference to that object, +even given a concurrent invocation of delete() for that same object. +Similarly, a clear advantage of both listings B and C over listing A is +that a call to delete() is not delayed even if there are an arbitrarily +large number of calls to search_and_reference() searching for the same +object that delete() was invoked on. Instead, all that is delayed is +the eventual invocation of kfree(), which is usually not a problem on +modern computer systems, even the small ones. + In cases where delete() can sleep, synchronize_rcu() can be called from delete(), so that el_free() can be subsumed into delete as follows: @@ -130,3 +145,7 @@ delete() kfree(el); ... } + +As additional examples in the kernel, the pattern in listing C is used by +reference counting of struct pid, while the pattern in listing B is used by +struct posix_acl. From 588759a39145e18dd808341861d45d44383778b5 Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Sun, 14 Apr 2019 11:11:03 +0800 Subject: [PATCH 17/61] doc: Fixup definition of rcupdate.rcu_task_stall_timeout A positive value of rcupdate.rcu_task_stall_timeout is an interval in seconds rather than jiffies. Signed-off-by: Zhenzhong Duan Signed-off-by: Paul E. McKenney --- Documentation/RCU/stallwarn.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/RCU/stallwarn.txt b/Documentation/RCU/stallwarn.txt index 1ab70c37921f..13e88fc00f01 100644 --- a/Documentation/RCU/stallwarn.txt +++ b/Documentation/RCU/stallwarn.txt @@ -153,7 +153,7 @@ rcupdate.rcu_task_stall_timeout This boot/sysfs parameter controls the RCU-tasks stall warning interval. A value of zero or less suppresses RCU-tasks stall warnings. A positive value sets the stall-warning interval - in jiffies. An RCU-tasks stall warning starts with the line: + in seconds. An RCU-tasks stall warning starts with the line: INFO: rcu_tasks detected stalls on tasks: From cd6d17b4a4646d4bf2568f3a4de13a5a13e2ed28 Mon Sep 17 00:00:00 2001 From: Neeraj Upadhyay Date: Fri, 29 Mar 2019 15:25:52 +0530 Subject: [PATCH 18/61] rcu: Dump specified number of blocked tasks The dump_blkd_tasks() function dumps at most 10 blocked tasks, ignoring the value of the ncheck parameter. This commit therefore substitutes the value of ncheck for the hard-coded value of 10. Because all callers currently pass 10 as the number, this patch does not change behavior, but it is clearly an accident waiting to happen. Signed-off-by: Neeraj Upadhyay Reviewed-by: Mukesh Ojha Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 1102765f91fd..3a9891a74ead 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -760,7 +760,7 @@ dump_blkd_tasks(struct rcu_node *rnp, int ncheck) i = 0; list_for_each(lhp, &rnp->blkd_tasks) { pr_cont(" %p", lhp); - if (++i >= 10) + if (++i >= ncheck) break; } pr_cont("\n"); From 3ae976a7e3e87438b8439a01aeb79d4866b1c444 Mon Sep 17 00:00:00 2001 From: Neeraj Upadhyay Date: Fri, 29 Mar 2019 16:57:08 +0530 Subject: [PATCH 19/61] rcu: Correctly unlock root node in rcu_check_gp_start_stall() On systems whose rcu_node tree has only one node, the rcu_check_gp_start_stall() function's values of rnp and rnp_root will be identical. 
In this case, it clearly does not make sense to release both rnp->lock and rnp_root->lock, but that is exactly what this function does in the last early exit. This commit therefore unlocks only rnp->lock when rnp and rnp_root are equal. Signed-off-by: Neeraj Upadhyay Reviewed-by: Mukesh Ojha Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_stall.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index f65a73a97323..065183391f75 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -630,7 +630,9 @@ static void rcu_check_gp_start_stall(struct rcu_node *rnp, struct rcu_data *rdp, time_before(j, rcu_state.gp_req_activity + gpssdelay) || time_before(j, rcu_state.gp_activity + gpssdelay) || atomic_xchg(&warned, 1)) { - raw_spin_unlock_rcu_node(rnp_root); /* irqs remain disabled. */ + if (rnp_root != rnp) + /* irqs remain disabled. */ + raw_spin_unlock_rcu_node(rnp_root); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; } From 12edff045bc6dd3ab1565cc02fa4841803c2a633 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 9 Apr 2019 07:48:18 -0700 Subject: [PATCH 20/61] rcu: Make kfree_rcu() ignore NULL pointers This commit makes the kfree_rcu() macro's semantics be consistent with the likes of kfree() by adding a check for NULL pointers, so that kfree_rcu(NULL, ...) is a no-op. Reported-by: Andriy Shevchenko Reported-by: Christoph Hellwig Signed-off-by: Paul E. McKenney Reviewed-by: Andriy Shevchenko --- include/linux/rcupdate.h | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 922bb6848813..915460ec0872 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -805,7 +805,7 @@ static inline notrace void rcu_read_unlock_sched_notrace(void) /** * kfree_rcu() - kfree an object after a grace period. * @ptr: pointer to kfree - * @rcu_head: the name of the struct rcu_head within the type of @ptr. + * @rhf: the name of the struct rcu_head within the type of @ptr. * * Many rcu callbacks functions just call kfree() on the base structure. * These functions are trivial, but their size adds up, and furthermore @@ -828,9 +828,13 @@ static inline notrace void rcu_read_unlock_sched_notrace(void) * The BUILD_BUG_ON check must not involve any function calls, hence the * checks are done in macros here. */ -#define kfree_rcu(ptr, rcu_head) \ - __kfree_rcu(&((ptr)->rcu_head), offsetof(typeof(*(ptr)), rcu_head)) - +#define kfree_rcu(ptr, rhf) \ +do { \ + typeof (ptr) ___p = (ptr); \ + \ + if (___p) \ + __kfree_rcu(&((___p)->rhf), offsetof(typeof(*(ptr)), rhf)); \ +} while (0) /* * Place this after a lock-acquisition primitive to guarantee that From d5a9a8c3bc8068f2e5dfba30150ac09b596b461a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 10 Apr 2019 17:01:39 -0700 Subject: [PATCH 21/61] rcu: Set a maximum limit for back-to-back callback invocation Currently, if a CPU has more than 10,000 callbacks pending, it will increase rdp->blimit to LONG_MAX. If you are lucky, LONG_MAX is only about two billion, but this is still a bit too many callbacks to invoke back-to-back while otherwise ignoring the world. This commit therefore sets a maximum limit of DEFAULT_MAX_RCU_BLIMIT, which is set to 10,000, for rdp->blimit. Reported-by: Sebastian Andrzej Siewior Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 980ca3ca643f..f888a76673da 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -380,7 +380,8 @@ static int rcu_is_cpu_rrupt_from_idle(void) __this_cpu_read(rcu_data.dynticks_nmi_nesting) <= 1; } -#define DEFAULT_RCU_BLIMIT 10 /* Maximum callbacks per rcu_do_batch. */ +#define DEFAULT_RCU_BLIMIT 10 /* Maximum callbacks per rcu_do_batch ... */ +#define DEFAULT_MAX_RCU_BLIMIT 10000 /* ... even during callback flood. */ static long blimit = DEFAULT_RCU_BLIMIT; #define DEFAULT_RCU_QHIMARK 10000 /* If this many pending, ignore blimit. */ static long qhimark = DEFAULT_RCU_QHIMARK; @@ -2113,7 +2114,7 @@ static void rcu_do_batch(struct rcu_data *rdp) /* Reinstate batch limit if we have worked down the excess. */ count = rcu_segcblist_n_cbs(&rdp->cblist); - if (rdp->blimit == LONG_MAX && count <= qlowmark) + if (rdp->blimit >= DEFAULT_MAX_RCU_BLIMIT && count <= qlowmark) rdp->blimit = blimit; /* Reset ->qlen_last_fqs_check trigger if enough CBs have drained. */ @@ -2354,7 +2355,7 @@ static void __call_rcu_core(struct rcu_data *rdp, struct rcu_head *head, rcu_accelerate_cbs_unlocked(rdp->mynode, rdp); } else { /* Give the grace period a kick. */ - rdp->blimit = LONG_MAX; + rdp->blimit = DEFAULT_MAX_RCU_BLIMIT; if (rcu_state.n_force_qs == rdp->n_force_qs_snap && rcu_segcblist_first_pend_cb(&rdp->cblist) != head) rcu_force_quiescent_state(); From 714b6904e23e1c37f262a4cd02b34d0f1863e227 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 11 Apr 2019 07:12:18 -0700 Subject: [PATCH 22/61] doc: Remove ".vnet" from paulmck email addresses Signed-off-by: Paul E. McKenney --- Documentation/core-api/circular-buffers.rst | 2 +- Documentation/memory-barriers.txt | 2 +- Documentation/translations/ko_KR/memory-barriers.txt | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Documentation/core-api/circular-buffers.rst b/Documentation/core-api/circular-buffers.rst index 53e51caa3347..50966f66e398 100644 --- a/Documentation/core-api/circular-buffers.rst +++ b/Documentation/core-api/circular-buffers.rst @@ -3,7 +3,7 @@ Circular Buffers ================ :Author: David Howells -:Author: Paul E. McKenney +:Author: Paul E. McKenney Linux provides a number of features that can be used to implement circular diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt index f70ebcdfe592..e4e07c8ab89e 100644 --- a/Documentation/memory-barriers.txt +++ b/Documentation/memory-barriers.txt @@ -3,7 +3,7 @@ ============================ By: David Howells - Paul E. McKenney + Paul E. McKenney Will Deacon Peter Zijlstra diff --git a/Documentation/translations/ko_KR/memory-barriers.txt b/Documentation/translations/ko_KR/memory-barriers.txt index db0b9d8619f1..5f3c74dcad43 100644 --- a/Documentation/translations/ko_KR/memory-barriers.txt +++ b/Documentation/translations/ko_KR/memory-barriers.txt @@ -24,7 +24,7 @@ Documentation/memory-barriers.txt ========================= 저자: David Howells - Paul E. McKenney + Paul E. McKenney Will Deacon Peter Zijlstra From fe15b50cdeeebd9248bf27e3c31278668f08bc04 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Fri, 5 Apr 2019 16:15:00 -0700 Subject: [PATCH 23/61] srcu: Allocate per-CPU data for DEFINE_SRCU() in modules Adding DEFINE_SRCU() or DEFINE_STATIC_SRCU() to a loadable module requires that the size of the reserved region be increased, which is not something we want to be doing all that often. One approach would be to require that loadable modules define an srcu_struct and invoke init_srcu_struct() from their module_init function and cleanup_srcu_struct() from their module_exit function. However, this is more than a bit user unfriendly. This commit therefore creates an ___srcu_struct_ptrs linker section, and pointers to srcu_struct structures created by DEFINE_SRCU() and DEFINE_STATIC_SRCU() within a module are placed into that module's ___srcu_struct_ptrs section. The required init_srcu_struct() and cleanup_srcu_struct() functions are then automatically invoked as needed when that module is loaded and unloaded, thus allowing modules to continue to use DEFINE_SRCU() and DEFINE_STATIC_SRCU() while avoiding the need to increase the size of the reserved region. Many of the algorithms and some of the code was cheerfully cherry-picked from other code making use of linker sections, perhaps most notably from tracepoints. All bugs are nevertheless the sole property of the author. Suggested-by: Mathieu Desnoyers [ paulmck: Use __section() and use "default" in srcu_module_notify()'s "switch" statement as suggested by Joel Fernandes. ] Signed-off-by: Paul E. McKenney Tested-by: Joel Fernandes (Google) --- include/asm-generic/vmlinux.lds.h | 4 ++ include/linux/module.h | 5 +++ include/linux/srcutree.h | 14 +++++-- kernel/module.c | 5 +++ kernel/rcu/srcutree.c | 65 +++++++++++++++++++++++++++++++ 5 files changed, 90 insertions(+), 3 deletions(-) diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 088987e9a3ea..ba1ad39468fc 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -337,6 +337,10 @@ KEEP(*(__tracepoints_ptrs)) /* Tracepoints: pointer array */ \ __stop___tracepoints_ptrs = .; \ *(__tracepoints_strings)/* Tracepoints: strings */ \ + . = ALIGN(8); \ + __start___srcu_struct = .; \ + *(___srcu_struct_ptrs) \ + __end___srcu_struct = .; \ } \ \ .rodata1 : AT(ADDR(.rodata1) - LOAD_OFFSET) { \ diff --git a/include/linux/module.h b/include/linux/module.h index 188998d3dca9..1455812dd325 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -450,6 +451,10 @@ struct module { unsigned int num_tracepoints; tracepoint_ptr_t *tracepoints_ptrs; #endif +#ifdef CONFIG_TREE_SRCU + unsigned int num_srcu_structs; + struct srcu_struct **srcu_struct_ptrs; +#endif #ifdef CONFIG_BPF_EVENTS unsigned int num_bpf_raw_events; struct bpf_raw_event_map *bpf_raw_events; diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index 7f7c8c050f63..8af1824c46a8 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -120,9 +120,17 @@ struct srcu_struct { * * See include/linux/percpu-defs.h for the rules on per-CPU variables. 
*/ -#define __DEFINE_SRCU(name, is_static) \ - static DEFINE_PER_CPU(struct srcu_data, name##_srcu_data);\ - is_static struct srcu_struct name = __SRCU_STRUCT_INIT(name, name##_srcu_data) +#ifdef MODULE +# define __DEFINE_SRCU(name, is_static) \ + is_static struct srcu_struct name; \ + struct srcu_struct *__srcu_struct_##name \ + __section("___srcu_struct_ptrs") = &name +#else +# define __DEFINE_SRCU(name, is_static) \ + static DEFINE_PER_CPU(struct srcu_data, name##_srcu_data); \ + is_static struct srcu_struct name = \ + __SRCU_STRUCT_INIT(name, name##_srcu_data) +#endif #define DEFINE_SRCU(name) __DEFINE_SRCU(name, /* not static */) #define DEFINE_STATIC_SRCU(name) __DEFINE_SRCU(name, static) diff --git a/kernel/module.c b/kernel/module.c index 6e6712b3aaf5..c79a53b629b6 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3095,6 +3095,11 @@ static int find_module_sections(struct module *mod, struct load_info *info) sizeof(*mod->tracepoints_ptrs), &mod->num_tracepoints); #endif +#ifdef CONFIG_TREE_SRCU + mod->srcu_struct_ptrs = section_objs(info, "___srcu_struct_ptrs", + sizeof(*mod->srcu_struct_ptrs), + &mod->num_srcu_structs); +#endif #ifdef CONFIG_BPF_EVENTS mod->bpf_raw_events = section_objs(info, "__bpf_raw_tp_map", sizeof(*mod->bpf_raw_events), diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 9b761e546de8..2ded2614a2f4 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -1310,3 +1310,68 @@ void __init srcu_init(void) queue_work(rcu_gp_wq, &ssp->work.work); } } + +#ifdef CONFIG_MODULES + +/* Initialize any global-scope srcu_struct structures used by this module. */ +static int srcu_module_coming(struct module *mod) +{ + int i; + struct srcu_struct **sspp = mod->srcu_struct_ptrs; + int ret; + + for (i = 0; i < mod->num_srcu_structs; i++) { + ret = init_srcu_struct(*(sspp++)); + if (WARN_ON_ONCE(ret)) + return ret; + } + return 0; +} + +/* Clean up any global-scope srcu_struct structures used by this module. */ +static void srcu_module_going(struct module *mod) +{ + int i; + struct srcu_struct **sspp = mod->srcu_struct_ptrs; + + for (i = 0; i < mod->num_srcu_structs; i++) + cleanup_srcu_struct(*(sspp++)); +} + +/* Handle one module, either coming or going. */ +static int srcu_module_notify(struct notifier_block *self, + unsigned long val, void *data) +{ + struct module *mod = data; + int ret = 0; + + switch (val) { + case MODULE_STATE_COMING: + ret = srcu_module_coming(mod); + break; + case MODULE_STATE_GOING: + srcu_module_going(mod); + break; + default: + break; + } + return ret; +} + +static struct notifier_block srcu_module_nb = { + .notifier_call = srcu_module_notify, + .priority = 0, +}; + +static __init int init_srcu_module_notifier(void) +{ + int ret; + + ret = register_module_notifier(&srcu_module_nb); + if (ret) + pr_warn("Failed to register srcu module notifier\n"); + return ret; +} +late_initcall(init_srcu_module_notifier); + +#endif /* #ifdef CONFIG_MODULES */ From 54e6c11b9e74fd780e52f95c3eed313e0f4e6b7e Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Sun, 7 Apr 2019 20:47:19 -0400 Subject: [PATCH 24/61] srcu: Remove unused vmlinux srcu linker entries The SRCU for modules optimization (commit title "srcu: Allocate per-CPU data for DEFINE_SRCU() in modules") introduced vmlinux linker entries which is unused since it applies only to the built-in vmlinux. So remove it to prevent any space usage due to the 8 byte alignment it added. 
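For context, the per-module mechanism added by that commit comes into play only for code built as a module. The following is a minimal sketch (hypothetical module and names, not taken from that commit) of the module-side usage, with init_srcu_struct() and cleanup_srcu_struct() invoked automatically by the module notifier rather than by the module itself:

	#include <linux/module.h>
	#include <linux/srcu.h>

	/*
	 * Hypothetical example: the pointer to example_srcu is placed in
	 * this module's ___srcu_struct_ptrs section at build time.
	 */
	DEFINE_STATIC_SRCU(example_srcu);

	static int __init example_init(void)
	{
		int idx;

		idx = srcu_read_lock(&example_srcu);
		/* ... read-side access to module data ... */
		srcu_read_unlock(&example_srcu, idx);
		return 0;
	}
	module_init(example_init);

	MODULE_LICENSE("GPL");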
vmlinux.lds.h has no effect on module loading and is not used for building the module object, so the changes were not needed in the first place since the optimization is specific to modules. Tested with SRCU torture_type and rcutorture. Put prints in module loader to confirm it is able to find and initialize the srcu structures. Cc: Josh Triplett Cc: Steven Rostedt Cc: Mathieu Desnoyers Cc: Lai Jiangshan Cc: kernel-team@android.com Cc: paulmck@linux.vnet.ibm.com Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- include/asm-generic/vmlinux.lds.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index ba1ad39468fc..088987e9a3ea 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -337,10 +337,6 @@ KEEP(*(__tracepoints_ptrs)) /* Tracepoints: pointer array */ \ __stop___tracepoints_ptrs = .; \ *(__tracepoints_strings)/* Tracepoints: strings */ \ - . = ALIGN(8); \ - __start___srcu_struct = .; \ - *(___srcu_struct_ptrs) \ - __end___srcu_struct = .; \ } \ \ .rodata1 : AT(ADDR(.rodata1) - LOAD_OFFSET) { \ From 056b89e7e699742cc060ce722d3f26effe51b4aa Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Thu, 11 Apr 2019 16:24:21 -0400 Subject: [PATCH 25/61] module: Make srcu_struct ptr array as read-only Since commit title ("srcu: Allocate per-CPU data for DEFINE_SRCU() in modules"), modules that call DEFINE_{STATIC,}SRCU will have a new array of srcu_struct pointers, which is used by srcu code to initialize and clean up these structures and save valuable per-cpu reserved space. There is no reason for this array of pointers to be writable, and can cause security or other hidden bugs. Mark these are read-only after the module init has completed. Tested with the following diff to ensure array not writable: (diff is a bit reduced to avoid patch command getting confused) a/kernel/module.c b/kernel/module.c -3506,6 +3506,14 static noinline int do_init_module [snip] rcu_assign_pointer(mod->kallsyms, &mod->core_kallsyms); #endif module_enable_ro(mod, true); + + if (mod->srcu_struct_ptrs) { + // Check if srcu_struct_ptrs access is possible + char x = *(char *)mod->srcu_struct_ptrs; + *(char *)mod->srcu_struct_ptrs = 0; + *(char *)mod->srcu_struct_ptrs = x; + } + mod_tree_remove_init(mod); disable_ro_nx(&mod->init_layout); module_arch_freeing_init(mod); Cc: Rasmus Villemoes Cc: paulmck@linux.vnet.ibm.com Cc: rostedt@goodmis.org Cc: mathieu.desnoyers@efficios.com Cc: rcu@vger.kernel.org Cc: kernel-hardening@lists.openwall.com Cc: kernel-team@android.com Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- include/linux/srcutree.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index 8af1824c46a8..9cfcc8a756ae 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -123,7 +123,7 @@ struct srcu_struct { #ifdef MODULE # define __DEFINE_SRCU(name, is_static) \ is_static struct srcu_struct name; \ - struct srcu_struct *__srcu_struct_##name \ + struct srcu_struct * const __srcu_struct_##name \ __section("___srcu_struct_ptrs") = &name #else # define __DEFINE_SRCU(name, is_static) \ From 11b000457f4638cf2a9e6794d31636d2d3174842 Mon Sep 17 00:00:00 2001 From: Jiang Biao Date: Tue, 23 Apr 2019 09:22:56 +0800 Subject: [PATCH 26/61] rcu: Make __call_srcu static Because __call_srcu() is not used outside kernel/rcu/srcutree.c, this commit makes it static. 
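For reference (illustrative sketch only, with hypothetical names), callers outside kernel/rcu/srcutree.c continue to use the public call_srcu() wrapper, which supplies the do_norm flag internally; only the __call_srcu() helper becomes static:

	struct foo {
		struct rcu_head rh;
		/* ... payload ... */
	};

	static void foo_free_cb(struct rcu_head *rhp)
	{
		kfree(container_of(rhp, struct foo, rh));
	}

	/* Queue *fp for freeing after an SRCU grace period. */
	call_srcu(&my_srcu, &fp->rh, foo_free_cb);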
Signed-off-by: Jiang Biao Signed-off-by: Paul E. McKenney --- kernel/rcu/srcutree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 2ded2614a2f4..cf0e886314f2 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -831,8 +831,8 @@ static void srcu_leak_callback(struct rcu_head *rhp) * srcu_read_lock(), and srcu_read_unlock() that are all passed the same * srcu_struct structure. */ -void __call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp, - rcu_callback_t func, bool do_norm) +static void __call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp, + rcu_callback_t func, bool do_norm) { unsigned long flags; int idx; From 95bf33b55ff4465399bad843f1d8d618c8baf1f3 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 23 Apr 2019 14:07:24 +0200 Subject: [PATCH 27/61] rcu/sync: Kill rcu_sync_type/gp_type Now that the RCU flavors have been consolidated, rcu_sync_type makes no sense because none of internal update functions aside from .held() depend on gp_type. This commit therefore removes this field and consolidates the relevant code. Signed-off-by: Oleg Nesterov [ paulmck: Added RCU and RCU-bh checks to rcu_sync_is_idle(). ] [ paulmck: And applied subsequent feedback from Oleg Nesterov. ] Signed-off-by: Paul E. McKenney --- include/linux/percpu-rwsem.h | 2 +- include/linux/rcu_sync.h | 36 +++++++---------------- kernel/locking/percpu-rwsem.c | 2 +- kernel/rcu/sync.c | 55 ++++------------------------------- 4 files changed, 17 insertions(+), 78 deletions(-) diff --git a/include/linux/percpu-rwsem.h b/include/linux/percpu-rwsem.h index 03cb4b6f842e..6887636ea169 100644 --- a/include/linux/percpu-rwsem.h +++ b/include/linux/percpu-rwsem.h @@ -20,7 +20,7 @@ struct percpu_rw_semaphore { #define DEFINE_STATIC_PERCPU_RWSEM(name) \ static DEFINE_PER_CPU(unsigned int, __percpu_rwsem_rc_##name); \ static struct percpu_rw_semaphore name = { \ - .rss = __RCU_SYNC_INITIALIZER(name.rss, RCU_SCHED_SYNC), \ + .rss = __RCU_SYNC_INITIALIZER(name.rss), \ .read_count = &__percpu_rwsem_rc_##name, \ .rw_sem = __RWSEM_INITIALIZER(name.rw_sem), \ .writer = __RCUWAIT_INITIALIZER(name.writer), \ diff --git a/include/linux/rcu_sync.h b/include/linux/rcu_sync.h index 6fc53a1345b3..87971e85519c 100644 --- a/include/linux/rcu_sync.h +++ b/include/linux/rcu_sync.h @@ -13,8 +13,6 @@ #include #include -enum rcu_sync_type { RCU_SYNC, RCU_SCHED_SYNC, RCU_BH_SYNC }; - /* Structure to mediate between updaters and fastpath-using readers. */ struct rcu_sync { int gp_state; @@ -23,52 +21,38 @@ struct rcu_sync { int cb_state; struct rcu_head cb_head; - - enum rcu_sync_type gp_type; }; -extern void rcu_sync_lockdep_assert(struct rcu_sync *); - /** * rcu_sync_is_idle() - Are readers permitted to use their fastpaths? * @rsp: Pointer to rcu_sync structure to use for synchronization * - * Returns true if readers are permitted to use their fastpaths. - * Must be invoked within an RCU read-side critical section whose - * flavor matches that of the rcu_sync struture. + * Returns true if readers are permitted to use their fastpaths. Must be + * invoked within some flavor of RCU read-side critical section. 
*/ static inline bool rcu_sync_is_idle(struct rcu_sync *rsp) { -#ifdef CONFIG_PROVE_RCU - rcu_sync_lockdep_assert(rsp); -#endif + RCU_LOCKDEP_WARN(!rcu_read_lock_held() && + !rcu_read_lock_bh_held() && + !rcu_read_lock_sched_held(), + "suspicious rcu_sync_is_idle() usage"); return !rsp->gp_state; /* GP_IDLE */ } -extern void rcu_sync_init(struct rcu_sync *, enum rcu_sync_type); +extern void rcu_sync_init(struct rcu_sync *); extern void rcu_sync_enter_start(struct rcu_sync *); extern void rcu_sync_enter(struct rcu_sync *); extern void rcu_sync_exit(struct rcu_sync *); extern void rcu_sync_dtor(struct rcu_sync *); -#define __RCU_SYNC_INITIALIZER(name, type) { \ +#define __RCU_SYNC_INITIALIZER(name) { \ .gp_state = 0, \ .gp_count = 0, \ .gp_wait = __WAIT_QUEUE_HEAD_INITIALIZER(name.gp_wait), \ .cb_state = 0, \ - .gp_type = type, \ } -#define __DEFINE_RCU_SYNC(name, type) \ - struct rcu_sync_struct name = __RCU_SYNC_INITIALIZER(name, type) - -#define DEFINE_RCU_SYNC(name) \ - __DEFINE_RCU_SYNC(name, RCU_SYNC) - -#define DEFINE_RCU_SCHED_SYNC(name) \ - __DEFINE_RCU_SYNC(name, RCU_SCHED_SYNC) - -#define DEFINE_RCU_BH_SYNC(name) \ - __DEFINE_RCU_SYNC(name, RCU_BH_SYNC) +#define DEFINE_RCU_SYNC(name) \ + struct rcu_sync name = __RCU_SYNC_INITIALIZER(name) #endif /* _LINUX_RCU_SYNC_H_ */ diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c index f17dad99eec8..48cab93a47fd 100644 --- a/kernel/locking/percpu-rwsem.c +++ b/kernel/locking/percpu-rwsem.c @@ -17,7 +17,7 @@ int __percpu_init_rwsem(struct percpu_rw_semaphore *sem, return -ENOMEM; /* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */ - rcu_sync_init(&sem->rss, RCU_SCHED_SYNC); + rcu_sync_init(&sem->rss); __init_rwsem(&sem->rw_sem, name, rwsem_key); rcuwait_init(&sem->writer); sem->readers_block = 0; diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c index a8304d90573f..ee427e138dad 100644 --- a/kernel/rcu/sync.c +++ b/kernel/rcu/sync.c @@ -10,65 +10,20 @@ #include #include -#ifdef CONFIG_PROVE_RCU -#define __INIT_HELD(func) .held = func, -#else -#define __INIT_HELD(func) -#endif - -static const struct { - void (*sync)(void); - void (*call)(struct rcu_head *, void (*)(struct rcu_head *)); - void (*wait)(void); -#ifdef CONFIG_PROVE_RCU - int (*held)(void); -#endif -} gp_ops[] = { - [RCU_SYNC] = { - .sync = synchronize_rcu, - .call = call_rcu, - .wait = rcu_barrier, - __INIT_HELD(rcu_read_lock_held) - }, - [RCU_SCHED_SYNC] = { - .sync = synchronize_rcu, - .call = call_rcu, - .wait = rcu_barrier, - __INIT_HELD(rcu_read_lock_sched_held) - }, - [RCU_BH_SYNC] = { - .sync = synchronize_rcu, - .call = call_rcu, - .wait = rcu_barrier, - __INIT_HELD(rcu_read_lock_bh_held) - }, -}; - enum { GP_IDLE = 0, GP_PENDING, GP_PASSED }; enum { CB_IDLE = 0, CB_PENDING, CB_REPLAY }; #define rss_lock gp_wait.lock -#ifdef CONFIG_PROVE_RCU -void rcu_sync_lockdep_assert(struct rcu_sync *rsp) -{ - RCU_LOCKDEP_WARN(!gp_ops[rsp->gp_type].held(), - "suspicious rcu_sync_is_idle() usage"); -} - -EXPORT_SYMBOL_GPL(rcu_sync_lockdep_assert); -#endif - /** * rcu_sync_init() - Initialize an rcu_sync structure * @rsp: Pointer to rcu_sync structure to be initialized * @type: Flavor of RCU with which to synchronize rcu_sync structure */ -void rcu_sync_init(struct rcu_sync *rsp, enum rcu_sync_type type) +void rcu_sync_init(struct rcu_sync *rsp) { memset(rsp, 0, sizeof(*rsp)); init_waitqueue_head(&rsp->gp_wait); - rsp->gp_type = type; } /** @@ -114,7 +69,7 @@ void rcu_sync_enter(struct rcu_sync *rsp) WARN_ON_ONCE(need_wait && need_sync); if 
(need_sync) { - gp_ops[rsp->gp_type].sync(); + synchronize_rcu(); rsp->gp_state = GP_PASSED; wake_up_all(&rsp->gp_wait); } else if (need_wait) { @@ -167,7 +122,7 @@ static void rcu_sync_func(struct rcu_head *rhp) * to catch a later GP. */ rsp->cb_state = CB_PENDING; - gp_ops[rsp->gp_type].call(&rsp->cb_head, rcu_sync_func); + call_rcu(&rsp->cb_head, rcu_sync_func); } else { /* * We're at least a GP after rcu_sync_exit(); eveybody will now @@ -195,7 +150,7 @@ void rcu_sync_exit(struct rcu_sync *rsp) if (!--rsp->gp_count) { if (rsp->cb_state == CB_IDLE) { rsp->cb_state = CB_PENDING; - gp_ops[rsp->gp_type].call(&rsp->cb_head, rcu_sync_func); + call_rcu(&rsp->cb_head, rcu_sync_func); } else if (rsp->cb_state == CB_PENDING) { rsp->cb_state = CB_REPLAY; } @@ -220,7 +175,7 @@ void rcu_sync_dtor(struct rcu_sync *rsp) spin_unlock_irq(&rsp->rss_lock); if (cb_state != CB_IDLE) { - gp_ops[rsp->gp_type].wait(); + rcu_barrier(); WARN_ON_ONCE(rsp->cb_state != CB_IDLE); } } From 2bf1acc299c9757932ef8c6edfaacca6d08302b1 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 23 Apr 2019 17:21:02 +0200 Subject: [PATCH 28/61] uprobes: Use DEFINE_STATIC_PERCPU_RWSEM() to initialize dup_mmap_sem Use DEFINE_STATIC_PERCPU_RWSEM() to initialize dup_mmap_sem. Signed-off-by: Oleg Nesterov Reviewed-by: Ingo Molnar Signed-off-by: Paul E. McKenney --- kernel/events/uprobes.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 78f61bfc6b79..97c367f0a9aa 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -46,7 +46,7 @@ static DEFINE_SPINLOCK(uprobes_treelock); /* serialize rbtree access */ static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ]; #define uprobes_mmap_hash(v) (&uprobes_mmap_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ]) -static struct percpu_rw_semaphore dup_mmap_sem; +DEFINE_STATIC_PERCPU_RWSEM(dup_mmap_sem); /* Have a copy of original instruction */ #define UPROBE_COPY_INSN 0 @@ -2302,7 +2302,5 @@ void __init uprobes_init(void) for (i = 0; i < UPROBES_HASH_SZ; i++) mutex_init(&uprobes_mmap_mutex[i]); - BUG_ON(percpu_init_rwsem(&dup_mmap_sem)); - BUG_ON(register_die_notifier(&uprobe_exception_nb)); } From 3f2947b78151ec938dc06aea4ba0e11e56becdff Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 23 Apr 2019 18:32:41 +0200 Subject: [PATCH 29/61] locking/percpu-rwsem: Add DEFINE_PERCPU_RWSEM(), use it to initialize cgroup_threadgroup_rwsem Turn DEFINE_STATIC_PERCPU_RWSEM() into __DEFINE_PERCPU_RWSEM() with the additional "is_static" argument to introduce DEFINE_PERCPU_RWSEM(). Change cgroup.c to use DEFINE_PERCPU_RWSEM(cgroup_threadgroup_rwsem). Signed-off-by: Oleg Nesterov Reviewed-by: Ingo Molnar Signed-off-by: Paul E. 
McKenney --- include/linux/percpu-rwsem.h | 8 ++++++-- kernel/cgroup/cgroup.c | 3 +-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/include/linux/percpu-rwsem.h b/include/linux/percpu-rwsem.h index 6887636ea169..2809b44cbbee 100644 --- a/include/linux/percpu-rwsem.h +++ b/include/linux/percpu-rwsem.h @@ -17,14 +17,18 @@ struct percpu_rw_semaphore { int readers_block; }; -#define DEFINE_STATIC_PERCPU_RWSEM(name) \ +#define __DEFINE_PERCPU_RWSEM(name, is_static) \ static DEFINE_PER_CPU(unsigned int, __percpu_rwsem_rc_##name); \ -static struct percpu_rw_semaphore name = { \ +is_static struct percpu_rw_semaphore name = { \ .rss = __RCU_SYNC_INITIALIZER(name.rss), \ .read_count = &__percpu_rwsem_rc_##name, \ .rw_sem = __RWSEM_INITIALIZER(name.rw_sem), \ .writer = __RCUWAIT_INITIALIZER(name.writer), \ } +#define DEFINE_PERCPU_RWSEM(name) \ + __DEFINE_PERCPU_RWSEM(name, /* not static */) +#define DEFINE_STATIC_PERCPU_RWSEM(name) \ + __DEFINE_PERCPU_RWSEM(name, static) extern int __percpu_down_read(struct percpu_rw_semaphore *, int); extern void __percpu_up_read(struct percpu_rw_semaphore *); diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 217cec4e22c6..b112e93388dc 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -101,7 +101,7 @@ static DEFINE_SPINLOCK(cgroup_idr_lock); */ static DEFINE_SPINLOCK(cgroup_file_kn_lock); -struct percpu_rw_semaphore cgroup_threadgroup_rwsem; +DEFINE_PERCPU_RWSEM(cgroup_threadgroup_rwsem); #define cgroup_assert_mutex_or_rcu_locked() \ RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \ @@ -5616,7 +5616,6 @@ int __init cgroup_init(void) int ssid; BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 16); - BUG_ON(percpu_init_rwsem(&cgroup_threadgroup_rwsem)); BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files)); BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files)); From 89da3b94bb97417ca2c5b0ce3a28643819030247 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 25 Apr 2019 18:50:55 +0200 Subject: [PATCH 30/61] rcu/sync: Simplify the state machine With this patch rcu_sync has a single state variable and the transition rules become really simple: GP_IDLE - owned by the first rcu_sync_enter() which moves it to GP_ENTER - owned by rcu-callback which moves it to GP_PASSED - owned by the last rcu_sync_exit() which moves it to GP_EXIT - and this is the only "nontrivial" state. rcu-callback moves it back to GP_IDLE unless another enter() comes before a GP pass. If rcu-callback is invoked before the next rcu_sync_exit() it must see gp_count incremented by that enter() and set GP_PASSED. Otherwise, if the next rcu_sync_exit() wins the race, it will move it to GP_REPLAY - owned by rcu-callback which moves it to GP_EXIT Signed-off-by: Oleg Nesterov [ paulmck: While here, apply READ_ONCE() and WRITE_ONCE() to ->gp_state. ] [ paulmck: Tweaks to make htmldocs happy. (Reported by kbuild test robot.) ] Signed-off-by: Paul E. 
McKenney --- include/linux/rcu_sync.h | 4 +- kernel/rcu/sync.c | 193 ++++++++++++++++++++++----------------- 2 files changed, 110 insertions(+), 87 deletions(-) diff --git a/include/linux/rcu_sync.h b/include/linux/rcu_sync.h index 87971e85519c..9b83865d24f9 100644 --- a/include/linux/rcu_sync.h +++ b/include/linux/rcu_sync.h @@ -19,7 +19,6 @@ struct rcu_sync { int gp_count; wait_queue_head_t gp_wait; - int cb_state; struct rcu_head cb_head; }; @@ -36,7 +35,7 @@ static inline bool rcu_sync_is_idle(struct rcu_sync *rsp) !rcu_read_lock_bh_held() && !rcu_read_lock_sched_held(), "suspicious rcu_sync_is_idle() usage"); - return !rsp->gp_state; /* GP_IDLE */ + return !READ_ONCE(rsp->gp_state); /* GP_IDLE */ } extern void rcu_sync_init(struct rcu_sync *); @@ -49,7 +48,6 @@ extern void rcu_sync_dtor(struct rcu_sync *); .gp_state = 0, \ .gp_count = 0, \ .gp_wait = __WAIT_QUEUE_HEAD_INITIALIZER(name.gp_wait), \ - .cb_state = 0, \ } #define DEFINE_RCU_SYNC(name) \ diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c index ee427e138dad..d4558ab7a07d 100644 --- a/kernel/rcu/sync.c +++ b/kernel/rcu/sync.c @@ -10,15 +10,13 @@ #include #include -enum { GP_IDLE = 0, GP_PENDING, GP_PASSED }; -enum { CB_IDLE = 0, CB_PENDING, CB_REPLAY }; +enum { GP_IDLE = 0, GP_ENTER, GP_PASSED, GP_EXIT, GP_REPLAY }; #define rss_lock gp_wait.lock /** * rcu_sync_init() - Initialize an rcu_sync structure * @rsp: Pointer to rcu_sync structure to be initialized - * @type: Flavor of RCU with which to synchronize rcu_sync structure */ void rcu_sync_init(struct rcu_sync *rsp) { @@ -41,6 +39,70 @@ void rcu_sync_enter_start(struct rcu_sync *rsp) rsp->gp_state = GP_PASSED; } + +static void rcu_sync_func(struct rcu_head *rhp); + +static void rcu_sync_call(struct rcu_sync *rsp) +{ + call_rcu(&rsp->cb_head, rcu_sync_func); +} + +/** + * rcu_sync_func() - Callback function managing reader access to fastpath + * @rhp: Pointer to rcu_head in rcu_sync structure to use for synchronization + * + * This function is passed to call_rcu() function by rcu_sync_enter() and + * rcu_sync_exit(), so that it is invoked after a grace period following the + * that invocation of enter/exit. + * + * If it is called by rcu_sync_enter() it signals that all the readers were + * switched onto slow path. + * + * If it is called by rcu_sync_exit() it takes action based on events that + * have taken place in the meantime, so that closely spaced rcu_sync_enter() + * and rcu_sync_exit() pairs need not wait for a grace period. + * + * If another rcu_sync_enter() is invoked before the grace period + * ended, reset state to allow the next rcu_sync_exit() to let the + * readers back onto their fastpaths (after a grace period). If both + * another rcu_sync_enter() and its matching rcu_sync_exit() are invoked + * before the grace period ended, re-invoke call_rcu() on behalf of that + * rcu_sync_exit(). Otherwise, set all state back to idle so that readers + * can again use their fastpaths. + */ +static void rcu_sync_func(struct rcu_head *rhp) +{ + struct rcu_sync *rsp = container_of(rhp, struct rcu_sync, cb_head); + unsigned long flags; + + WARN_ON_ONCE(READ_ONCE(rsp->gp_state) == GP_IDLE); + WARN_ON_ONCE(READ_ONCE(rsp->gp_state) == GP_PASSED); + + spin_lock_irqsave(&rsp->rss_lock, flags); + if (rsp->gp_count) { + /* + * We're at least a GP after the GP_IDLE->GP_ENTER transition. 
+ */ + WRITE_ONCE(rsp->gp_state, GP_PASSED); + wake_up_locked(&rsp->gp_wait); + } else if (rsp->gp_state == GP_REPLAY) { + /* + * A new rcu_sync_exit() has happened; requeue the callback to + * catch a later GP. + */ + WRITE_ONCE(rsp->gp_state, GP_EXIT); + rcu_sync_call(rsp); + } else { + /* + * We're at least a GP after the last rcu_sync_exit(); eveybody + * will now have observed the write side critical section. + * Let 'em rip!. + */ + WRITE_ONCE(rsp->gp_state, GP_IDLE); + } + spin_unlock_irqrestore(&rsp->rss_lock, flags); +} + /** * rcu_sync_enter() - Force readers onto slowpath * @rsp: Pointer to rcu_sync structure to use for synchronization @@ -58,84 +120,43 @@ void rcu_sync_enter_start(struct rcu_sync *rsp) */ void rcu_sync_enter(struct rcu_sync *rsp) { - bool need_wait, need_sync; + int gp_state; spin_lock_irq(&rsp->rss_lock); - need_wait = rsp->gp_count++; - need_sync = rsp->gp_state == GP_IDLE; - if (need_sync) - rsp->gp_state = GP_PENDING; + gp_state = rsp->gp_state; + if (gp_state == GP_IDLE) { + WRITE_ONCE(rsp->gp_state, GP_ENTER); + WARN_ON_ONCE(rsp->gp_count); + /* + * Note that we could simply do rcu_sync_call(rsp) here and + * avoid the "if (gp_state == GP_IDLE)" block below. + * + * However, synchronize_rcu() can be faster if rcu_expedited + * or rcu_blocking_is_gp() is true. + * + * Another reason is that we can't wait for rcu callback if + * we are called at early boot time but this shouldn't happen. + */ + } + rsp->gp_count++; spin_unlock_irq(&rsp->rss_lock); - WARN_ON_ONCE(need_wait && need_sync); - if (need_sync) { + if (gp_state == GP_IDLE) { + /* + * See the comment above, this simply does the "synchronous" + * call_rcu(rcu_sync_func) which does GP_ENTER -> GP_PASSED. + */ synchronize_rcu(); - rsp->gp_state = GP_PASSED; - wake_up_all(&rsp->gp_wait); - } else if (need_wait) { - wait_event(rsp->gp_wait, rsp->gp_state == GP_PASSED); - } else { - /* - * Possible when there's a pending CB from a rcu_sync_exit(). - * Nobody has yet been allowed the 'fast' path and thus we can - * avoid doing any sync(). The callback will get 'dropped'. - */ - WARN_ON_ONCE(rsp->gp_state != GP_PASSED); + rcu_sync_func(&rsp->cb_head); + /* Not really needed, wait_event() would see GP_PASSED. */ + return; } + + wait_event(rsp->gp_wait, READ_ONCE(rsp->gp_state) >= GP_PASSED); } /** - * rcu_sync_func() - Callback function managing reader access to fastpath - * @rhp: Pointer to rcu_head in rcu_sync structure to use for synchronization - * - * This function is passed to one of the call_rcu() functions by - * rcu_sync_exit(), so that it is invoked after a grace period following the - * that invocation of rcu_sync_exit(). It takes action based on events that - * have taken place in the meantime, so that closely spaced rcu_sync_enter() - * and rcu_sync_exit() pairs need not wait for a grace period. - * - * If another rcu_sync_enter() is invoked before the grace period - * ended, reset state to allow the next rcu_sync_exit() to let the - * readers back onto their fastpaths (after a grace period). If both - * another rcu_sync_enter() and its matching rcu_sync_exit() are invoked - * before the grace period ended, re-invoke call_rcu() on behalf of that - * rcu_sync_exit(). Otherwise, set all state back to idle so that readers - * can again use their fastpaths. 
- */ -static void rcu_sync_func(struct rcu_head *rhp) -{ - struct rcu_sync *rsp = container_of(rhp, struct rcu_sync, cb_head); - unsigned long flags; - - WARN_ON_ONCE(rsp->gp_state != GP_PASSED); - WARN_ON_ONCE(rsp->cb_state == CB_IDLE); - - spin_lock_irqsave(&rsp->rss_lock, flags); - if (rsp->gp_count) { - /* - * A new rcu_sync_begin() has happened; drop the callback. - */ - rsp->cb_state = CB_IDLE; - } else if (rsp->cb_state == CB_REPLAY) { - /* - * A new rcu_sync_exit() has happened; requeue the callback - * to catch a later GP. - */ - rsp->cb_state = CB_PENDING; - call_rcu(&rsp->cb_head, rcu_sync_func); - } else { - /* - * We're at least a GP after rcu_sync_exit(); eveybody will now - * have observed the write side critical section. Let 'em rip!. - */ - rsp->cb_state = CB_IDLE; - rsp->gp_state = GP_IDLE; - } - spin_unlock_irqrestore(&rsp->rss_lock, flags); -} - -/** - * rcu_sync_exit() - Allow readers back onto fast patch after grace period + * rcu_sync_exit() - Allow readers back onto fast path after grace period * @rsp: Pointer to rcu_sync structure to use for synchronization * * This function is used by updaters who have completed, and can therefore @@ -146,13 +167,16 @@ static void rcu_sync_func(struct rcu_head *rhp) */ void rcu_sync_exit(struct rcu_sync *rsp) { + WARN_ON_ONCE(READ_ONCE(rsp->gp_state) == GP_IDLE); + WARN_ON_ONCE(READ_ONCE(rsp->gp_count) == 0); + spin_lock_irq(&rsp->rss_lock); if (!--rsp->gp_count) { - if (rsp->cb_state == CB_IDLE) { - rsp->cb_state = CB_PENDING; - call_rcu(&rsp->cb_head, rcu_sync_func); - } else if (rsp->cb_state == CB_PENDING) { - rsp->cb_state = CB_REPLAY; + if (rsp->gp_state == GP_PASSED) { + WRITE_ONCE(rsp->gp_state, GP_EXIT); + rcu_sync_call(rsp); + } else if (rsp->gp_state == GP_EXIT) { + WRITE_ONCE(rsp->gp_state, GP_REPLAY); } } spin_unlock_irq(&rsp->rss_lock); @@ -164,18 +188,19 @@ void rcu_sync_exit(struct rcu_sync *rsp) */ void rcu_sync_dtor(struct rcu_sync *rsp) { - int cb_state; + int gp_state; - WARN_ON_ONCE(rsp->gp_count); + WARN_ON_ONCE(READ_ONCE(rsp->gp_count)); + WARN_ON_ONCE(READ_ONCE(rsp->gp_state) == GP_PASSED); spin_lock_irq(&rsp->rss_lock); - if (rsp->cb_state == CB_REPLAY) - rsp->cb_state = CB_PENDING; - cb_state = rsp->cb_state; + if (rsp->gp_state == GP_REPLAY) + WRITE_ONCE(rsp->gp_state, GP_EXIT); + gp_state = rsp->gp_state; spin_unlock_irq(&rsp->rss_lock); - if (cb_state != CB_IDLE) { + if (gp_state != GP_IDLE) { rcu_barrier(); - WARN_ON_ONCE(rsp->cb_state != CB_IDLE); + WARN_ON_ONCE(rsp->gp_state != GP_IDLE); } } From e0e2147c1a6a722f6b2e359c5132a825ecb8a420 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Tue, 26 Mar 2019 15:24:10 -0400 Subject: [PATCH 31/61] rcutorture: Select from only online CPUs The rcutorture jitter.sh script selects a random CPU but does not check if it is offline or online. This leads to taskset errors many times. On my machine, hyper threading is disabled so half the cores are offline causing taskset errors a lot of times. Let us fix this by checking from only the online CPUs on the system. Cc: rcu@vger.kernel.org Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. 
McKenney --- tools/testing/selftests/rcutorture/bin/jitter.sh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/rcutorture/bin/jitter.sh b/tools/testing/selftests/rcutorture/bin/jitter.sh index 435b60933985..81c6e6ab5d83 100755 --- a/tools/testing/selftests/rcutorture/bin/jitter.sh +++ b/tools/testing/selftests/rcutorture/bin/jitter.sh @@ -34,10 +34,11 @@ do exit 0; fi - # Set affinity to randomly selected CPU - cpus=`ls /sys/devices/system/cpu/*/online | + # Set affinity to randomly selected online CPU + cpus=`grep 1 /sys/devices/system/cpu/*/online | sed -e 's,/[^/]*$,,' -e 's/^[^0-9]*//' | grep -v '^0*$'` + cpumask=`awk -v cpus="$cpus" -v me=$me -v n=$n 'BEGIN { srand(n + me + systime()); ncpus = split(cpus, ca); From dd064c35991435efc4da2a6c551601f6e80f2611 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Tue, 26 Mar 2019 15:24:11 -0400 Subject: [PATCH 32/61] rcutorture: Add cpu0 to the set of CPUs to add jitter jitter.sh currently does not add CPU0 to the list of CPUs for adding of jitter. Let us add it to this list even when it is not hot-pluggable. Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- tools/testing/selftests/rcutorture/bin/jitter.sh | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/rcutorture/bin/jitter.sh b/tools/testing/selftests/rcutorture/bin/jitter.sh index 81c6e6ab5d83..dc49a3ba6111 100755 --- a/tools/testing/selftests/rcutorture/bin/jitter.sh +++ b/tools/testing/selftests/rcutorture/bin/jitter.sh @@ -36,8 +36,12 @@ do # Set affinity to randomly selected online CPU cpus=`grep 1 /sys/devices/system/cpu/*/online | - sed -e 's,/[^/]*$,,' -e 's/^[^0-9]*//' | - grep -v '^0*$'` + sed -e 's,/[^/]*$,,' -e 's/^[^0-9]*//'` + + # Do not leave out poor old cpu0 which may not be hot-pluggable + if [ ! -f "/sys/devices/system/cpu/cpu0/online" ]; then + cpus="0 $cpus" + fi cpumask=`awk -v cpus="$cpus" -v me=$me -v n=$n 'BEGIN { srand(n + me + systime()); From 140e53f20b159722903f0c87358bcd809aa9767e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 9 Apr 2019 10:08:18 -0700 Subject: [PATCH 33/61] rcutorture: Add cond_resched() to forward-progress free-up loop The rcu_torture_fwd_prog_cbfree() function frees callbacks used during rcutorture's call_rcu() forward-progress test, but does so in a tight loop. This could cause problems given a very long list of callbacks to be freed, and actual testing produces lists with as many as 25M callbacks. This commit therefore adds a cond_resched() to this loop. While in the area, this commit also rearranges the lock releases to look a bit more sane. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index efaa5b3f4d3f..7906ba2d9dad 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1674,16 +1674,18 @@ static unsigned long rcu_torture_fwd_prog_cbfree(void) for (;;) { spin_lock_irqsave(&rcu_fwd_lock, flags); rfcp = rcu_fwd_cb_head; - if (!rfcp) + if (!rfcp) { + spin_unlock_irqrestore(&rcu_fwd_lock, flags); break; + } rcu_fwd_cb_head = rfcp->rfc_next; if (!rcu_fwd_cb_head) rcu_fwd_cb_tail = &rcu_fwd_cb_head; spin_unlock_irqrestore(&rcu_fwd_lock, flags); kfree(rfcp); freed++; + cond_resched(); } - spin_unlock_irqrestore(&rcu_fwd_lock, flags); return freed; } From e8516c64fe97e27a28fd5bc65b616508ae0020cf Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Tue, 9 Apr 2019 11:06:32 -0700 Subject: [PATCH 34/61] rcutorture: Fix stutter_wait() return value and freelist checks The stutter_wait() function is supposed to return true if it actually waits and false otherwise, but it instead unconditionally returns false. Which hides a bug in rcu_torture_writer() that fails to account for the fact that one of the rcu_tortures[] array elements will normally be referenced by rcu_torture_current, and thus not be on the freelist. This commit therefore corrects the stutter_wait() return value and adds a check for rcu_torture_current to rcu_torture_writer()'s check that things get freed after everything goes quiescent. In addition, this commit causes torture_stutter() to give a bit more than one second (instead of only one jiffy) warning of the end of the stutter interval. Finally, this commit disables long-delay readers and aggressive update-side forward-progress checks while forward-progress testing is in flight. Reported-by: Sebastian Andrzej Siewior Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 16 ++++++++++++---- kernel/torture.c | 17 +++++++++++++---- 2 files changed, 25 insertions(+), 8 deletions(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 7906ba2d9dad..954ac2b98619 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1010,10 +1010,13 @@ rcu_torture_writer(void *arg) !rcu_gp_is_normal(); } rcu_torture_writer_state = RTWS_STUTTER; - if (stutter_wait("rcu_torture_writer")) + if (stutter_wait("rcu_torture_writer") && + !READ_ONCE(rcu_fwd_cb_nodelay)) for (i = 0; i < ARRAY_SIZE(rcu_tortures); i++) - if (list_empty(&rcu_tortures[i].rtort_free)) - WARN_ON_ONCE(1); + if (list_empty(&rcu_tortures[i].rtort_free) && + rcu_access_pointer(rcu_torture_current) != + &rcu_tortures[i]) + WARN(1, "%s: rtort_pipe_count: %d\n", __func__, rcu_tortures[i].rtort_pipe_count); } while (!torture_must_stop()); /* Reset expediting back to unexpedited. */ if (expediting > 0) @@ -1709,6 +1712,8 @@ static void rcu_torture_fwd_prog_nr(int *tested, int *tested_tries) } /* Tight loop containing cond_resched(). */ + WRITE_ONCE(rcu_fwd_cb_nodelay, true); + cur_ops->sync(); /* Later readers see above write. */ if (selfpropcb) { WRITE_ONCE(fcs.stop, 0); cur_ops->call(&fcs.rh, rcu_torture_fwd_prog_cb); @@ -1747,6 +1752,8 @@ static void rcu_torture_fwd_prog_nr(int *tested, int *tested_tries) WARN_ON(READ_ONCE(fcs.stop) != 2); destroy_rcu_head_on_stack(&fcs.rh); } + schedule_timeout_uninterruptible(HZ / 10); /* Let kthreads recover. */ + WRITE_ONCE(rcu_fwd_cb_nodelay, false); } /* Carry out call_rcu() forward-progress testing. */ @@ -1816,7 +1823,6 @@ static void rcu_torture_fwd_prog_cr(void) cur_ops->cb_barrier(); /* Wait for callbacks to be invoked. */ (void)rcu_torture_fwd_prog_cbfree(); - WRITE_ONCE(rcu_fwd_cb_nodelay, false); if (!torture_must_stop() && !READ_ONCE(rcu_fwd_emergency_stop)) { WARN_ON(n_max_gps < MIN_FWD_CBS_LAUNDERED); pr_alert("%s Duration %lu barrier: %lu pending %ld n_launders: %ld n_launders_sa: %ld n_max_gps: %ld n_max_cbs: %ld cver %ld gps %ld\n", @@ -1827,6 +1833,8 @@ static void rcu_torture_fwd_prog_cr(void) n_max_gps, n_max_cbs, cver, gps); rcu_torture_fwd_cb_hist(); } + schedule_timeout_uninterruptible(HZ); /* Let CBs drain. 
*/ + WRITE_ONCE(rcu_fwd_cb_nodelay, false); } diff --git a/kernel/torture.c b/kernel/torture.c index 17b2be9bde12..de0e0ecf88e1 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -578,10 +578,12 @@ static int stutter; bool stutter_wait(const char *title) { int spt; + bool ret = false; cond_resched_tasks_rcu_qs(); spt = READ_ONCE(stutter_pause_test); for (; spt; spt = READ_ONCE(stutter_pause_test)) { + ret = true; if (spt == 1) { schedule_timeout_interruptible(1); } else if (spt == 2) { @@ -592,7 +594,7 @@ bool stutter_wait(const char *title) } torture_shutdown_absorb(title); } - return !!spt; + return ret; } EXPORT_SYMBOL_GPL(stutter_wait); @@ -602,13 +604,20 @@ EXPORT_SYMBOL_GPL(stutter_wait); */ static int torture_stutter(void *arg) { + int wtime; + VERBOSE_TOROUT_STRING("torture_stutter task started"); do { if (!torture_must_stop() && stutter > 1) { - WRITE_ONCE(stutter_pause_test, 1); - schedule_timeout_interruptible(stutter - 1); + wtime = stutter; + if (stutter > HZ + 1) { + WRITE_ONCE(stutter_pause_test, 1); + wtime = stutter - HZ - 1; + schedule_timeout_interruptible(wtime); + wtime = HZ + 1; + } WRITE_ONCE(stutter_pause_test, 2); - schedule_timeout_interruptible(1); + schedule_timeout_interruptible(wtime); } WRITE_ONCE(stutter_pause_test, 0); if (!torture_must_stop()) From ff3bf92d90d396e51eb78c5ecde11a994ab7a179 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 9 Apr 2019 14:44:49 -0700 Subject: [PATCH 35/61] torture: Allow inter-stutter interval to be specified Currently, the inter-stutter interval is the same as the stutter duration, that is, whatever number of jiffies is passed into torture_stutter_init(). This has worked well for quite some time, but the addition of forward-progress testing to rcutorture can delay processes for several seconds, which can triple the time that they are stuttered. This commit therefore adds a second argument to torture_stutter_init() that specifies the inter-stutter interval. While locktorture preserves the current behavior, rcutorture uses the RCU CPU stall warning interval to provide a wider inter-stutter interval. Signed-off-by: Paul E. McKenney --- include/linux/torture.h | 2 +- kernel/locking/locktorture.c | 2 +- kernel/rcu/rcutorture.c | 5 ++++- kernel/torture.c | 6 ++++-- 4 files changed, 10 insertions(+), 5 deletions(-) diff --git a/include/linux/torture.h b/include/linux/torture.h index 23d80db426d7..a620118385bb 100644 --- a/include/linux/torture.h +++ b/include/linux/torture.h @@ -66,7 +66,7 @@ int torture_shutdown_init(int ssecs, void (*cleanup)(void)); /* Task stuttering, which forces load/no-load transitions. */ bool stutter_wait(const char *title); -int torture_stutter_init(int s); +int torture_stutter_init(int s, int sgap); /* Initialization and cleanup. 
*/ bool torture_init_begin(char *ttype, int v); diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 80a463d31a8d..c513031cd7e3 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -975,7 +975,7 @@ static int __init lock_torture_init(void) goto unwind; } if (stutter > 0) { - firsterr = torture_stutter_init(stutter); + firsterr = torture_stutter_init(stutter, stutter); if (firsterr) goto unwind; } diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 954ac2b98619..a16d6abe1715 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -2373,7 +2373,10 @@ rcu_torture_init(void) if (stutter < 0) stutter = 0; if (stutter) { - firsterr = torture_stutter_init(stutter * HZ); + int t; + + t = cur_ops->stall_dur ? cur_ops->stall_dur() : stutter * HZ; + firsterr = torture_stutter_init(stutter * HZ, t); if (firsterr) goto unwind; } diff --git a/kernel/torture.c b/kernel/torture.c index de0e0ecf88e1..a8d9bdfba7c3 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -570,6 +570,7 @@ static void torture_shutdown_cleanup(void) static struct task_struct *stutter_task; static int stutter_pause_test; static int stutter; +static int stutter_gap; /* * Block until the stutter interval ends. This must be called periodically @@ -621,7 +622,7 @@ static int torture_stutter(void *arg) } WRITE_ONCE(stutter_pause_test, 0); if (!torture_must_stop()) - schedule_timeout_interruptible(stutter); + schedule_timeout_interruptible(stutter_gap); torture_shutdown_absorb("torture_stutter"); } while (!torture_must_stop()); torture_kthread_stopping("torture_stutter"); @@ -631,9 +632,10 @@ static int torture_stutter(void *arg) /* * Initialize and kick off the torture_stutter kthread. */ -int torture_stutter_init(const int s) +int torture_stutter_init(const int s, const int sgap) { stutter = s; + stutter_gap = sgap; return torture_create_kthread(torture_stutter, NULL, stutter_task); } EXPORT_SYMBOL_GPL(torture_stutter_init); From 63b29eaed6f57c27639b2954ac1f202409840abf Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 11 Apr 2019 13:31:57 -0700 Subject: [PATCH 36/61] torture: Make kvm-find-errors.sh and kvm-recheck.sh provide exit status This commit causes both kvm-find-errors.sh and kvm-recheck.sh to provide an exit status based on whether or not errors were located. In the case of kvm-recheck.sh, this will be the error status of the last run. This change allows these commands to be used in scripting and Makefiles to automatically report failed rcutorture runs. Signed-off-by: Paul E. McKenney --- tools/testing/selftests/rcutorture/bin/kvm-find-errors.sh | 3 +++ tools/testing/selftests/rcutorture/bin/kvm-recheck.sh | 3 +++ 2 files changed, 6 insertions(+) diff --git a/tools/testing/selftests/rcutorture/bin/kvm-find-errors.sh b/tools/testing/selftests/rcutorture/bin/kvm-find-errors.sh index 8426fe1f15ee..1871d00bccd7 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-find-errors.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-find-errors.sh @@ -11,6 +11,7 @@ # # The "directory" above should end with the date/time directory, for example, # "tools/testing/selftests/rcutorture/res/2018.02.25-14:27:27". +# Returns error status reflecting the success (or not) of the specified run. # # Copyright (C) IBM Corporation, 2018 # @@ -56,6 +57,8 @@ done if test -n "$files" then $editor $files + exit 1 else echo No errors in console logs. 
+ exit 0 fi diff --git a/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh b/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh index 2adde6aaafdb..2297ddc2d4c5 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh @@ -7,6 +7,8 @@ # # Usage: kvm-recheck.sh resdir ... # +# Returns status reflecting the success or not of the last run specified. +# # Copyright (C) IBM Corporation, 2011 # # Authors: Paul E. McKenney @@ -58,3 +60,4 @@ do fi done done +EDITOR=echo kvm-find-errors.sh "${@: -1}" > /dev/null 2>&1 From 2456a8562b296453b4cdce8b6928e3f91e39fefc Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 11 Apr 2019 14:51:13 -0700 Subject: [PATCH 37/61] rcutorture: Provide rudimentary Makefile This commit provides a rudimentary Makefile that runs a 10-minute rcutorture test on scenario TREE01. This must be run on a system capable of spawning virtual machines and with everything installed to permit building Linux kernels. Reported-by: Shuah Khan Signed-off-by: Paul E. McKenney --- tools/testing/selftests/rcutorture/Makefile | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 tools/testing/selftests/rcutorture/Makefile diff --git a/tools/testing/selftests/rcutorture/Makefile b/tools/testing/selftests/rcutorture/Makefile new file mode 100644 index 000000000000..5202dc666206 --- /dev/null +++ b/tools/testing/selftests/rcutorture/Makefile @@ -0,0 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0+ +all: + ( cd ../../../..; tools/testing/selftests/rcutorture/bin/kvm.sh --duration 10 --configs TREE01 ) From 5eabea594b4ce9ba0fbd8618bd3bf01aa9f48af7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 12 Apr 2019 09:02:46 -0700 Subject: [PATCH 38/61] rcutorture: Exempt tasks RCU from timely draining of grace periods After the end of each stutter pause interval, the rcu_torture_writer() kthread checks to be sure that all prior callbacks have completed so that all the test structures have been freed. This works fine except for tasks RCU, in which grace periods can take one good long time. This commit therefore exempts tasks RCU from this check. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index a16d6abe1715..6a4558532eac 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -299,6 +299,7 @@ struct rcu_torture_ops { int irq_capable; int can_boost; int extendables; + int slow_gps; const char *name; }; @@ -667,6 +668,7 @@ static struct rcu_torture_ops tasks_ops = { .fqs = NULL, .stats = NULL, .irq_capable = 1, + .slow_gps = 1, .name = "tasks" }; @@ -1011,7 +1013,8 @@ rcu_torture_writer(void *arg) } rcu_torture_writer_state = RTWS_STUTTER; if (stutter_wait("rcu_torture_writer") && - !READ_ONCE(rcu_fwd_cb_nodelay)) + !READ_ONCE(rcu_fwd_cb_nodelay) && + !cur_ops->slow_gps) for (i = 0; i < ARRAY_SIZE(rcu_tortures); i++) if (list_empty(&rcu_tortures[i].rtort_free) && rcu_access_pointer(rcu_torture_current) != From 52b23be7ee023d7f1b67e7a20443ef352c7b5d2d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 13 Apr 2019 15:41:49 -0700 Subject: [PATCH 39/61] rcutorture: Exempt TREE01 from forward-progress testing Because TREE01 can end up running more vCPUs that physical CPUs, hammering these shortchanged CPUs with tight loops containing call_rcu() invocations seems a bit like overkill. 
This commit therefore exempts TREE01 from rcutorture's forward-progress testing. Signed-off-by: Paul E. McKenney --- tools/testing/selftests/rcutorture/configs/rcu/TREE01.boot | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE01.boot b/tools/testing/selftests/rcutorture/configs/rcu/TREE01.boot index ea47da95374b..d6da9a61d44a 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE01.boot +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE01.boot @@ -3,3 +3,4 @@ rcutree.gp_preinit_delay=3 rcutree.gp_init_delay=3 rcutree.gp_cleanup_delay=3 rcu_nocbs=0 +rcutorture.fwd_progress=0 From ab21f6081f7bc09a0918ef888de795d59a907c1a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 14 Apr 2019 18:30:22 -0700 Subject: [PATCH 40/61] rcutorture: Give the scheduler a chance on PREEMPT && NO_HZ_FULL kernels In PREEMPT kernels, cond_resched() is a no-op. In NO_HZ_FULL kernels, in-kernel execution (such as that of rcutorture's kthreads) might extend indefinitely without the scheduler gaining the aid of a scheduling-clock interrupt. This combination can cause the interaction of an rcutorture forward-progress test and a CPU-hotplug stop_machine operation to make less forward progress than one might like. Additionally, Sebastian Siewior notes that NO_HZ_FULL kernels have a scheduler check upon return to userspace execution, which suggests that in-kernel emulation of tight userspace loops containing system calls doing call_rcu() might also need explicit checks in the PREEMPT && NO_HZ_FULL case. This commit therefore introduces a rcu_torture_fwd_prog_cond_resched() function that explicitly invokes schedule() in such kernels whenever need_resched() returns true, while retaining use of cond_resched() for kernels that are either !PREEMPT or !NO_HZ_FULL. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 6a4558532eac..ef6f6dedf4c4 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1667,6 +1667,17 @@ static void rcu_torture_fwd_cb_cr(struct rcu_head *rhp) spin_unlock_irqrestore(&rcu_fwd_lock, flags); } +// Give the scheduler a chance, even on nohz_full CPUs. +static void rcu_torture_fwd_prog_cond_resched(void) +{ + if (IS_ENABLED(CONFIG_PREEMPT) && IS_ENABLED(CONFIG_NO_HZ_FULL)) { + if (need_resched()) + schedule(); + } else { + cond_resched(); + } +} + /* * Free all callbacks on the rcu_fwd_cb_head list, either because the * test is over or because we hit an OOM event. @@ -1690,7 +1701,7 @@ static unsigned long rcu_torture_fwd_prog_cbfree(void) spin_unlock_irqrestore(&rcu_fwd_lock, flags); kfree(rfcp); freed++; - cond_resched(); + rcu_torture_fwd_prog_cond_resched(); } return freed; } @@ -1734,7 +1745,7 @@ static void rcu_torture_fwd_prog_nr(int *tested, int *tested_tries) udelay(10); cur_ops->readunlock(idx); if (!fwd_progress_need_resched || need_resched()) - cond_resched(); + rcu_torture_fwd_prog_cond_resched(); } (*tested_tries)++; if (!time_before(jiffies, stopat) && @@ -1817,7 +1828,7 @@ static void rcu_torture_fwd_prog_cr(void) rfcp->rfc_gps = 0; } cur_ops->call(&rfcp->rh, rcu_torture_fwd_cb_cr); - cond_resched(); + rcu_torture_fwd_prog_cond_resched(); } stoppedat = jiffies; n_launders_cb_snap = READ_ONCE(n_launders_cb); From 3432d765c59ba026de49bd4f1f0c2adeff0e7a16 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Mon, 15 Apr 2019 14:50:05 -0700 Subject: [PATCH 41/61] rcutorture: Halt forward-progress checks at end of run Once removed, an rcu_torture element can be deferred-freed by a chain of call_rcu() invocations, with each callback invoking another round of call_rcu() until either a fixed number of call_rcu() invocations have been chained or until the test ends. This means that if the test ends, some of the rcu_torture elements will be "stranded" partway through the deferred-free process, which results in false-positive warnings from rcu_torture_writer() due to lack of forward progress should the test end just at the end of a stutter interval. This commit therefore suppresses rcu_torture_writer()'s forward-progress checks when the test ends in order to avoid these false-positive reports.. Reported-by: Sebastian Andrzej Siewior Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index ef6f6dedf4c4..a3f5488a319a 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1014,7 +1014,8 @@ rcu_torture_writer(void *arg) rcu_torture_writer_state = RTWS_STUTTER; if (stutter_wait("rcu_torture_writer") && !READ_ONCE(rcu_fwd_cb_nodelay) && - !cur_ops->slow_gps) + !cur_ops->slow_gps && + !torture_must_stop()) for (i = 0; i < ARRAY_SIZE(rcu_tortures); i++) if (list_empty(&rcu_tortures[i].rtort_free) && rcu_access_pointer(rcu_torture_current) != From c682db558e6eec10a711b0a6bcb8c35fd15f6a39 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 19 Apr 2019 07:38:27 -0700 Subject: [PATCH 42/61] rcutorture: Add trivial RCU implementation I have been showing off a trivial RCU implementation for non-preemptive environments for some time now: #define rcu_read_lock() #define rcu_read_unlock() #define rcu_dereference(p) READ_ONCE(p) #define rcu_assign_pointer(p, v) smp_store_release(&(p), (v)) void synchronize_rcu(void) { int cpu; for_each_online_cpu(cpu) sched_setaffinity(current->pid, cpumask_of(cpu)); } Trivial or not, as the old saying goes, "if it ain't tested, it don't work!". This commit therefore adds a "trivial" flavor to rcutorture and a corresponding TRIVIAL test scenario. This variant does not handle CPU hotplug, which is unconditionally enabled on x86 for post-v5.1-rc3 kernels, which is why the TRIVIAL.boot says "rcutorture.onoff_interval=0". This commit actually does handle CONFIG_PREEMPT=y kernels, but only because it turns back the Linux-kernel clock in order to provide these alternative definitions (or the moral equivalent thereof): #define rcu_read_lock() preempt_disable() #define rcu_read_unlock() preempt_enable() In CONFIG_PREEMPT=n kernels without debugging, these are equivalent to empty macros give or take a compiler barrier. However, the have been successfully tested with actual empty macros as well. Signed-off-by: Paul E. McKenney [ paulmck: Fix symbol issue reported by kbuild test robot . ] [ paulmck: Work around sched_setaffinity() issue noted by Andrea Parri. ] [ paulmck: Add rcutorture.shuffle_interval=0 to TRIVIAL.boot to fix interaction with shuffler task noted by Peter Zijlstra. 
] Tested-by: Andrea Parri --- kernel/rcu/rcu.h | 5 +++ kernel/rcu/rcutorture.c | 45 ++++++++++++++++++- kernel/rcu/update.c | 13 ++++++ .../selftests/rcutorture/configs/rcu/TRIVIAL | 14 ++++++ .../rcutorture/configs/rcu/TRIVIAL.boot | 3 ++ 5 files changed, 79 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/rcutorture/configs/rcu/TRIVIAL create mode 100644 tools/testing/selftests/rcutorture/configs/rcu/TRIVIAL.boot diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 390aab20115e..5290b01de534 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -446,6 +446,7 @@ void rcu_request_urgent_qs_task(struct task_struct *t); enum rcutorture_type { RCU_FLAVOR, RCU_TASKS_FLAVOR, + RCU_TRIVIAL_FLAVOR, SRCU_FLAVOR, INVALID_RCU_FLAVOR }; @@ -479,6 +480,10 @@ void do_trace_rcu_torture_read(const char *rcutorturename, #endif #endif +#if IS_ENABLED(CONFIG_RCU_TORTURE_TEST) || IS_MODULE(CONFIG_RCU_TORTURE_TEST) +long rcutorture_sched_setaffinity(pid_t pid, const struct cpumask *in_mask); +#endif + #ifdef CONFIG_TINY_SRCU static inline void srcutorture_get_gp_data(enum rcutorture_type test_type, diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index a3f5488a319a..6b803fb2f7ca 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -672,6 +672,47 @@ static struct rcu_torture_ops tasks_ops = { .name = "tasks" }; +/* + * Definitions for trivial CONFIG_PREEMPT=n-only torture testing. + * This implementation does not necessarily work well with CPU hotplug. + */ + +static void synchronize_rcu_trivial(void) +{ + int cpu; + + for_each_online_cpu(cpu) { + rcutorture_sched_setaffinity(current->pid, cpumask_of(cpu)); + WARN_ON_ONCE(raw_smp_processor_id() != cpu); + } +} + +static int rcu_torture_read_lock_trivial(void) __acquires(RCU) +{ + preempt_disable(); + return 0; +} + +static void rcu_torture_read_unlock_trivial(int idx) __releases(RCU) +{ + preempt_enable(); +} + +static struct rcu_torture_ops trivial_ops = { + .ttype = RCU_TRIVIAL_FLAVOR, + .init = rcu_sync_torture_init, + .readlock = rcu_torture_read_lock_trivial, + .read_delay = rcu_read_delay, /* just reuse rcu's version. */ + .readunlock = rcu_torture_read_unlock_trivial, + .get_gp_seq = rcu_no_completed, + .sync = synchronize_rcu_trivial, + .exp_sync = synchronize_rcu_trivial, + .fqs = NULL, + .stats = NULL, + .irq_capable = 1, + .name = "trivial" +}; + static unsigned long rcutorture_seq_diff(unsigned long new, unsigned long old) { if (!cur_ops->gp_diff) @@ -1789,6 +1830,8 @@ static void rcu_torture_fwd_prog_cr(void) if (READ_ONCE(rcu_fwd_emergency_stop)) return; /* Get out of the way quickly, no GP wait! */ + if (!cur_ops->call) + return; /* Can't do call_rcu() fwd prog without ->call. */ /* Loop continuously posting RCU callbacks. */ WRITE_ONCE(rcu_fwd_cb_nodelay, true); @@ -2265,7 +2308,7 @@ rcu_torture_init(void) int firsterr = 0; static struct rcu_torture_ops *torture_ops[] = { &rcu_ops, &rcu_busted_ops, &srcu_ops, &srcud_ops, - &busted_srcud_ops, &tasks_ops, + &busted_srcud_ops, &tasks_ops, &trivial_ops, }; if (!torture_init_begin(torture_type, verbose)) diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index c3bf44ba42e5..61df2bf08563 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -423,6 +423,19 @@ EXPORT_SYMBOL_GPL(do_trace_rcu_torture_read); do { } while (0) #endif +#if IS_ENABLED(CONFIG_RCU_TORTURE_TEST) || IS_MODULE(CONFIG_RCU_TORTURE_TEST) +/* Get rcutorture access to sched_setaffinity(). 
*/ +long rcutorture_sched_setaffinity(pid_t pid, const struct cpumask *in_mask) +{ + int ret; + + ret = sched_setaffinity(pid, in_mask); + WARN_ONCE(ret, "%s: sched_setaffinity() returned %d\n", __func__, ret); + return ret; +} +EXPORT_SYMBOL_GPL(rcutorture_sched_setaffinity); +#endif + #ifdef CONFIG_RCU_STALL_COMMON int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */ EXPORT_SYMBOL_GPL(rcu_cpu_stall_suppress); diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TRIVIAL b/tools/testing/selftests/rcutorture/configs/rcu/TRIVIAL new file mode 100644 index 000000000000..4d8eb5bfb6f6 --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/rcu/TRIVIAL @@ -0,0 +1,14 @@ +CONFIG_SMP=y +CONFIG_NR_CPUS=8 +CONFIG_PREEMPT_NONE=y +CONFIG_PREEMPT_VOLUNTARY=n +CONFIG_PREEMPT=n +CONFIG_HZ_PERIODIC=n +CONFIG_NO_HZ_IDLE=y +CONFIG_NO_HZ_FULL=n +CONFIG_HOTPLUG_CPU=n +CONFIG_SUSPEND=n +CONFIG_HIBERNATION=n +CONFIG_DEBUG_LOCK_ALLOC=n +CONFIG_DEBUG_OBJECTS_RCU_HEAD=n +CONFIG_RCU_EXPERT=y diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TRIVIAL.boot b/tools/testing/selftests/rcutorture/configs/rcu/TRIVIAL.boot new file mode 100644 index 000000000000..7017f5f5a55f --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/rcu/TRIVIAL.boot @@ -0,0 +1,3 @@ +rcutorture.torture_type=trivial +rcutorture.onoff_interval=0 +rcutorture.shuffle_interval=0 From a6fda6dab93c2c06ef4b8cb4b9258df6674d2438 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 24 Apr 2019 09:34:46 +0200 Subject: [PATCH 43/61] rcutorture: Tweak kvm options In one of my rcutorture tests the TSC clocksource got marked unstable due to a large difference in the TSC value. I'm not sure if the guest ran for a long time with interrupts disabled or if the host was very busy and didn't schedule the guest for some time. I took a look at the qemu/KVM options and decided to update the options: - Use kvm{32|64} as CPU. We could probably use `host' (like ARM does) for maximum available features but since we don't run any userland I'm not sure if it makes any difference. - Drop the "noapic" option. There is no history of why the APIC was disabled, and I see no reason for it. Once old qemu versions fade away, we can add "x2apic=on,tsc-deadline=on,hypervisor=on,tsc_adjust=on". - Additional config options. They ensure that the kernel knows that it runs as a kvm guest and can use virt devices like the kvm-clock as clocksource. The kvm-clock was the main motivation here. - I didn't add a random HW device. It would make the random device ready earlier (now it doesn't complete the initialisation at all), but I doubt that there is any need for this. Signed-off-by: Sebastian Andrzej Siewior [ paulmck: The world is not quite ready for CONFIG_PARAVIRT_SPINLOCKS=y and x2apic, so they are omitted for the time being. ] Signed-off-by: Paul E. 
McKenney --- tools/testing/selftests/rcutorture/bin/functions.sh | 13 ++++++++++++- .../selftests/rcutorture/configs/rcu/CFcommon | 3 +++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/rcutorture/bin/functions.sh b/tools/testing/selftests/rcutorture/bin/functions.sh index 6bcb8b5b2ff2..c3a49fb4d6f6 100644 --- a/tools/testing/selftests/rcutorture/bin/functions.sh +++ b/tools/testing/selftests/rcutorture/bin/functions.sh @@ -172,7 +172,7 @@ identify_qemu_append () { local console=ttyS0 case "$1" in qemu-system-x86_64|qemu-system-i386) - echo noapic selinux=0 initcall_debug debug + echo selinux=0 initcall_debug debug ;; qemu-system-aarch64) console=ttyAMA0 @@ -191,8 +191,19 @@ identify_qemu_append () { # Output arguments for qemu arguments based on the TORTURE_QEMU_MAC # and TORTURE_QEMU_INTERACTIVE environment variables. identify_qemu_args () { + local KVM_CPU="" + case "$1" in + qemu-system-x86_64) + KVM_CPU=kvm64 + ;; + qemu-system-i386) + KVM_CPU=kvm32 + ;; + esac case "$1" in qemu-system-x86_64|qemu-system-i386) + echo -machine q35,accel=kvm + echo -cpu ${KVM_CPU} ;; qemu-system-aarch64) echo -machine virt,gic-version=host -cpu host diff --git a/tools/testing/selftests/rcutorture/configs/rcu/CFcommon b/tools/testing/selftests/rcutorture/configs/rcu/CFcommon index d2d2a86139db..e19a444a0684 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/CFcommon +++ b/tools/testing/selftests/rcutorture/configs/rcu/CFcommon @@ -1,2 +1,5 @@ CONFIG_RCU_TORTURE_TEST=y CONFIG_PRINTK_TIME=y +CONFIG_HYPERVISOR_GUEST=y +CONFIG_PARAVIRT=y +CONFIG_KVM_GUEST=y From 7dedfd4335f7b795d12191d03626212ff71e52aa Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 24 Apr 2019 04:39:10 -0700 Subject: [PATCH 44/61] torture: Capture qemu output Currently qemu output appears on standard output, but is inaccessible later on. This commit therefore captures this output and causes kvm-recheck.sh to output this output if QEMU gave a non-zero non-137 exit code. (And exit code of 137 indicates that QEMU was killed, in which case we want to know about the hang rather than the fact that QEMU was killed.) Signed-off-by: Paul E. 
McKenney --- tools/testing/selftests/rcutorture/bin/kvm-recheck.sh | 10 +++++++++- .../testing/selftests/rcutorture/bin/kvm-test-1-run.sh | 2 +- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh b/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh index 2297ddc2d4c5..e5edd5198725 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh @@ -30,8 +30,16 @@ do TORTURE_SUITE="`cat $i/../TORTURE_SUITE`" rm -f $i/console.log.*.diags kvm-recheck-${TORTURE_SUITE}.sh $i - if test -f "$i/console.log" + if test -f "$i/qemu-retval" && test "`cat $i/qemu-retval`" -ne 0 && test "`cat $i/qemu-retval`" -ne 137 then + echo QEMU error, output: + cat $i/qemu-output + elif test -f "$i/console.log" + then + if test -f "$i/qemu-retval" && test "`cat $i/qemu-retval`" -eq 137 + then + echo QEMU killed + fi configcheck.sh $i/.config $i/ConfigFragment if test -r $i/Make.oldconfig.err then diff --git a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh index 0eb1ec16d78a..003511494fd9 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh @@ -165,7 +165,7 @@ then fi echo "NOTE: $QEMU either did not run or was interactive" > $resdir/console.log echo $QEMU $qemu_args -m $TORTURE_QEMU_MEM -kernel $KERNEL -append \"$qemu_append $boot_args\" > $resdir/qemu-cmd -( $QEMU $qemu_args -m $TORTURE_QEMU_MEM -kernel $KERNEL -append "$qemu_append $boot_args"& echo $! > $resdir/qemu_pid; wait `cat $resdir/qemu_pid`; echo $? > $resdir/qemu-retval ) & +( $QEMU $qemu_args -m $TORTURE_QEMU_MEM -kernel $KERNEL -append "$qemu_append $boot_args" > $resdir/qemu-output 2>&1 & echo $! > $resdir/qemu_pid; wait `cat $resdir/qemu_pid`; echo $? > $resdir/qemu-retval ) & commandcompleted=0 sleep 10 # Give qemu's pid a chance to reach the file if test -s "$resdir/qemu_pid" From cd6cb7c8a509b1e785fa67e2eb6bd5003198ac39 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 1 May 2019 13:26:11 -0700 Subject: [PATCH 45/61] torture: Add function graph-tracing cheat sheet Signed-off-by: Paul E. McKenney --- tools/testing/selftests/rcutorture/bin/kvm.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh index 8f1e337b9b54..3b4868906794 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm.sh @@ -464,3 +464,5 @@ else fi # Tracing: trace_event=rcu:rcu_grace_period,rcu:rcu_future_grace_period,rcu:rcu_grace_period_init,rcu:rcu_nocb_wake,rcu:rcu_preempt_task,rcu:rcu_unlock_preempted_task,rcu:rcu_quiescent_state_report,rcu:rcu_fqs,rcu:rcu_callback,rcu:rcu_kfree_callback,rcu:rcu_batch_start,rcu:rcu_invoke_callback,rcu:rcu_invoke_kfree_callback,rcu:rcu_batch_end,rcu:rcu_torture_read,rcu:rcu_barrier +# Function-graph tracing: ftrace=function_graph ftrace_graph_filter=sched_setaffinity,migration_cpu_stop +# Also --kconfig "CONFIG_FUNCTION_TRACER=y CONFIG_FUNCTION_GRAPH_TRACER=y" From 6dc82595ef0839a45c26075612f304979b6c92c2 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 10 May 2019 21:31:52 -0700 Subject: [PATCH 46/61] torture: Run kernel build in source directory For historical reasons, rcutorture places its build products in a tools/testing/selftests/rcutorture/b1 directory using the O= kbuild command-line argument. 
However, doing this requires that the source directory be pristine: Not just "make clean" pristine, but instead "make mrproper" (or, equivalently, "make distclean") pristine. Therefore, rcutorture executes a "make mrproper" before each build. Unfortunately, "make mrproper" has the side effect of removing pretty much everything, including tags files and cscope databases, which can be inconvenient to people whose workflow centers around a single source tree. This commit therefore makes rcutorture do the build directly in the source directory, removing the need for "make mrproper". This works because all needed build products are moved to their proper place in the "res" directory immediately after the build completes, so that multiple rcutorture kernels can still run concurrently. Reported-by: Peter Zijlstra Signed-off-by: Paul E. McKenney --- .../selftests/rcutorture/bin/configinit.sh | 36 +++++-------------- .../selftests/rcutorture/bin/kvm-build.sh | 9 +++-- .../rcutorture/bin/kvm-test-1-run.sh | 21 +++++------ tools/testing/selftests/rcutorture/bin/kvm.sh | 3 +- 4 files changed, 22 insertions(+), 47 deletions(-) diff --git a/tools/testing/selftests/rcutorture/bin/configinit.sh b/tools/testing/selftests/rcutorture/bin/configinit.sh index 40359486b3a8..bbeae6f67c36 100755 --- a/tools/testing/selftests/rcutorture/bin/configinit.sh +++ b/tools/testing/selftests/rcutorture/bin/configinit.sh @@ -1,7 +1,7 @@ #!/bin/bash # SPDX-License-Identifier: GPL-2.0+ # -# Usage: configinit.sh config-spec-file build-output-dir results-dir +# Usage: configinit.sh config-spec-file results-dir # # Create a .config file from the spec file. Run from the kernel source tree. # Exits with 0 if all went well, with 1 if all went well but the config @@ -11,10 +11,6 @@ # desired settings, for example, "CONFIG_NO_HZ=y". For best results, # this should be a full pathname. # -# The second argument is a optional path to a build output directory, -# for example, "O=/tmp/foo". If this argument is omitted, the .config -# file will be generated directly in the current directory. -# # Copyright (C) IBM Corporation, 2013 # # Authors: Paul E. McKenney @@ -26,34 +22,20 @@ mkdir $T # Capture config spec file. c=$1 -buildloc=$2 -resdir=$3 -builddir= -if echo $buildloc | grep -q '^O=' -then - builddir=`echo $buildloc | sed -e 's/^O=//'` - if test ! -d $builddir - then - mkdir $builddir - fi -else - echo Bad build directory: \"$buildloc\" - exit 2 -fi +resdir=$2 sed -e 's/^\(CONFIG[0-9A-Z_]*\)=.*$/grep -v "^# \1" |/' < $c > $T/u.sh sed -e 's/^\(CONFIG[0-9A-Z_]*=\).*$/grep -v \1 |/' < $c >> $T/u.sh grep '^grep' < $T/u.sh > $T/upd.sh echo "cat - $c" >> $T/upd.sh -make mrproper -make $buildloc distclean > $resdir/Make.distclean 2>&1 -make $buildloc $TORTURE_DEFCONFIG > $resdir/Make.defconfig.out 2>&1 -mv $builddir/.config $builddir/.config.sav -sh $T/upd.sh < $builddir/.config.sav > $builddir/.config -cp $builddir/.config $builddir/.config.new -yes '' | make $buildloc oldconfig > $resdir/Make.oldconfig.out 2> $resdir/Make.oldconfig.err +make clean > $resdir/Make.clean 2>&1 +make $TORTURE_DEFCONFIG > $resdir/Make.defconfig.out 2>&1 +mv .config .config.sav +sh $T/upd.sh < .config.sav > .config +cp .config .config.new +yes '' | make oldconfig > $resdir/Make.oldconfig.out 2> $resdir/Make.oldconfig.err # verify new config matches specification. 
-configcheck.sh $builddir/.config $c +configcheck.sh .config $c exit 0 diff --git a/tools/testing/selftests/rcutorture/bin/kvm-build.sh b/tools/testing/selftests/rcutorture/bin/kvm-build.sh index c27a0bbb9c02..18d6518504ee 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-build.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-build.sh @@ -3,7 +3,7 @@ # # Build a kvm-ready Linux kernel from the tree in the current directory. # -# Usage: kvm-build.sh config-template build-dir resdir +# Usage: kvm-build.sh config-template resdir # # Copyright (C) IBM Corporation, 2011 # @@ -15,8 +15,7 @@ then echo "kvm-build.sh :$config_template: Not a readable file" exit 1 fi -builddir=${2} -resdir=${3} +resdir=${2} T=${TMPDIR-/tmp}/test-linux.sh.$$ trap 'rm -rf $T' 0 @@ -29,14 +28,14 @@ CONFIG_VIRTIO_PCI=y CONFIG_VIRTIO_CONSOLE=y ___EOF___ -configinit.sh $T/config O=$builddir $resdir +configinit.sh $T/config $resdir retval=$? if test $retval -gt 1 then exit 2 fi ncpus=`cpus2use.sh` -make O=$builddir -j$ncpus $TORTURE_KMAKE_ARG > $resdir/Make.out 2>&1 +make -j$ncpus $TORTURE_KMAKE_ARG > $resdir/Make.out 2>&1 retval=$? if test $retval -ne 0 || grep "rcu[^/]*": < $resdir/Make.out | egrep -q "Stop|Error|error:|warning:" || egrep -q "Stop|Error|error:" < $resdir/Make.out then diff --git a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh index 003511494fd9..27b7b5693ede 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh @@ -36,11 +36,6 @@ config_template=${1} config_dir=`echo $config_template | sed -e 's,/[^/]*$,,'` title=`echo $config_template | sed -e 's/^.*\///'` builddir=${2} -if test -z "$builddir" -o ! -d "$builddir" -o ! -w "$builddir" -then - echo "kvm-test-1-run.sh :$builddir: Not a writable directory, cannot build into it" - exit 1 -fi resdir=${3} if test -z "$resdir" -o ! -d "$resdir" -o ! -w "$resdir" then @@ -85,18 +80,18 @@ then ln -s $base_resdir/.config $resdir # for kvm-recheck.sh # Arch-independent indicator touch $resdir/builtkernel -elif kvm-build.sh $T/Kc2 $builddir $resdir +elif kvm-build.sh $T/Kc2 $resdir then # Had to build a kernel for this test. - QEMU="`identify_qemu $builddir/vmlinux`" + QEMU="`identify_qemu vmlinux`" BOOT_IMAGE="`identify_boot_image $QEMU`" - cp $builddir/vmlinux $resdir - cp $builddir/.config $resdir - cp $builddir/Module.symvers $resdir > /dev/null || : - cp $builddir/System.map $resdir > /dev/null || : + cp vmlinux $resdir + cp .config $resdir + cp Module.symvers $resdir > /dev/null || : + cp System.map $resdir > /dev/null || : if test -n "$BOOT_IMAGE" then - cp $builddir/$BOOT_IMAGE $resdir + cp $BOOT_IMAGE $resdir KERNEL=$resdir/${BOOT_IMAGE##*/} # Arch-independent indicator touch $resdir/builtkernel @@ -107,7 +102,7 @@ then parse-build.sh $resdir/Make.out $title else # Build failed. - cp $builddir/.config $resdir || : + cp .config $resdir || : echo Build failed, not running KVM, see $resdir. 
if test -f $builddir.wait then diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh index 3b4868906794..8c43eb43409b 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm.sh @@ -342,7 +342,7 @@ function dump(first, pastlast, batchnum) print "needqemurun=" jn=1 for (j = first; j < pastlast; j++) { - builddir=KVM "/b1" + builddir=KVM "/b" j - first + 1 cpusr[jn] = cpus[j]; if (cfrep[cf[j]] == "") { cfr[jn] = cf[j]; @@ -358,7 +358,6 @@ function dump(first, pastlast, batchnum) print "echo ", cfr[jn], cpusr[jn] ovf ": Starting build. `date` | tee -a " rd "log"; print "rm -f " builddir ".*"; print "touch " builddir ".wait"; - print "mkdir " builddir " > /dev/null 2>&1 || :"; print "mkdir " rd cfr[jn] " || :"; print "kvm-test-1-run.sh " CONFIGDIR cf[j], builddir, rd cfr[jn], dur " \"" TORTURE_QEMU_ARG "\" \"" TORTURE_BOOTARGS "\" > " rd cfr[jn] "/kvm-test-1-run.sh.out 2>&1 &" print "echo ", cfr[jn], cpusr[jn] ovf ": Waiting for build to complete. `date` | tee -a " rd "log"; From 7225c0777271bb489ad6fb095aa10985ad138a81 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 11 May 2019 17:59:52 -0700 Subject: [PATCH 47/61] torture: Make --cpus override idleness calculations Currently, rcutorture will use relatively few CPUs to build the kernel on a busy system, which is often as it should be. However, if the user has used the --cpus argument to dedicate a specified number of CPUs to this torture test, it would be good if the kernel build also made use of them. This commit therefore changes the cpus2use.sh script to use --cpus when specified and to do the idleness calculations otherwise. Signed-off-by: Paul E. McKenney --- tools/testing/selftests/rcutorture/bin/cpus2use.sh | 5 +++++ tools/testing/selftests/rcutorture/bin/kvm.sh | 3 +++ 2 files changed, 8 insertions(+) diff --git a/tools/testing/selftests/rcutorture/bin/cpus2use.sh b/tools/testing/selftests/rcutorture/bin/cpus2use.sh index ff7102212703..4e9485590c10 100755 --- a/tools/testing/selftests/rcutorture/bin/cpus2use.sh +++ b/tools/testing/selftests/rcutorture/bin/cpus2use.sh @@ -9,6 +9,11 @@ # # Authors: Paul E. 
McKenney +if test -n "$TORTURE_ALLOTED_CPUS" +then + echo $TORTURE_ALLOTED_CPUS + exit 0 +fi ncpus=`grep '^processor' /proc/cpuinfo | wc -l` idlecpus=`mpstat | tail -1 | \ awk -v ncpus=$ncpus '{ print ncpus * ($7 + $NF) / 100 }'` diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh index 8c43eb43409b..ea6289a335f2 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm.sh @@ -24,6 +24,7 @@ dur=$((30*60)) dryrun="" KVM="`pwd`/tools/testing/selftests/rcutorture"; export KVM PATH=${KVM}/bin:$PATH; export PATH +TORTURE_ALLOTED_CPUS="" TORTURE_DEFCONFIG=defconfig TORTURE_BOOT_IMAGE="" TORTURE_INITRD="$KVM/initrd"; export TORTURE_INITRD @@ -89,6 +90,7 @@ do --cpus) checkarg --cpus "(number)" "$#" "$2" '^[0-9]*$' '^--' cpus=$2 + TORTURE_ALLOTED_CPUS="$2" shift ;; --datestamp) @@ -285,6 +287,7 @@ cat << ___EOF___ > $T/script CONFIGFRAG="$CONFIGFRAG"; export CONFIGFRAG KVM="$KVM"; export KVM PATH="$PATH"; export PATH +TORTURE_ALLOTED_CPUS="$TORTURE_ALLOTED_CPUS"; export TORTURE_ALLOTED_CPUS TORTURE_BOOT_IMAGE="$TORTURE_BOOT_IMAGE"; export TORTURE_BOOT_IMAGE TORTURE_BUILDONLY="$TORTURE_BUILDONLY"; export TORTURE_BUILDONLY TORTURE_DEFCONFIG="$TORTURE_DEFCONFIG"; export TORTURE_DEFCONFIG From b93c765fda30cadae6aafb9d32a30f9391dc0b41 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 11 May 2019 20:18:00 -0700 Subject: [PATCH 48/61] torture: Add --trust-make to suppress "make clean" The current rcutorture scripts unconditionally do "make clean", which is a good way of getting the needed testing done despite any imperfections in Makefile dependency tracking. However, this can be a bit irritating when repeatedly running a single scenario after small changes, for example, when debugging a problem that affects only a single scenario. This commit therefore adds a --trust-make argument that suppresses the "make clean". Even when using ccache, this speeds up kernel builds by up to almost an order of magnitude on my laptop. Reported-by: Peter Zijlstra Signed-off-by: Paul E. 
McKenney --- tools/testing/selftests/rcutorture/bin/configinit.sh | 5 ++++- tools/testing/selftests/rcutorture/bin/kvm.sh | 6 ++++++ tools/testing/selftests/rcutorture/bin/parse-build.sh | 2 +- 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/rcutorture/bin/configinit.sh b/tools/testing/selftests/rcutorture/bin/configinit.sh index bbeae6f67c36..93e80a42249a 100755 --- a/tools/testing/selftests/rcutorture/bin/configinit.sh +++ b/tools/testing/selftests/rcutorture/bin/configinit.sh @@ -28,7 +28,10 @@ sed -e 's/^\(CONFIG[0-9A-Z_]*\)=.*$/grep -v "^# \1" |/' < $c > $T/u.sh sed -e 's/^\(CONFIG[0-9A-Z_]*=\).*$/grep -v \1 |/' < $c >> $T/u.sh grep '^grep' < $T/u.sh > $T/upd.sh echo "cat - $c" >> $T/upd.sh -make clean > $resdir/Make.clean 2>&1 +if test -z "$TORTURE_TRUST_MAKE" +then + make clean > $resdir/Make.clean 2>&1 +fi make $TORTURE_DEFCONFIG > $resdir/Make.defconfig.out 2>&1 mv .config .config.sav sh $T/upd.sh < .config.sav > .config diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh index ea6289a335f2..72518580df23 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm.sh @@ -33,6 +33,7 @@ TORTURE_KMAKE_ARG="" TORTURE_QEMU_MEM=512 TORTURE_SHUTDOWN_GRACE=180 TORTURE_SUITE=rcu +TORTURE_TRUST_MAKE="" resdir="" configs="" cpus=0 @@ -63,6 +64,7 @@ usage () { echo " --qemu-cmd qemu-system-..." echo " --results absolute-pathname" echo " --torture rcu" + echo " --trust-make" exit 1 } @@ -175,6 +177,9 @@ do jitter=0 fi ;; + --trust-make) + TORTURE_TRUST_MAKE="y" + ;; *) echo Unknown argument $1 usage @@ -300,6 +305,7 @@ TORTURE_QEMU_MAC="$TORTURE_QEMU_MAC"; export TORTURE_QEMU_MAC TORTURE_QEMU_MEM="$TORTURE_QEMU_MEM"; export TORTURE_QEMU_MEM TORTURE_SHUTDOWN_GRACE="$TORTURE_SHUTDOWN_GRACE"; export TORTURE_SHUTDOWN_GRACE TORTURE_SUITE="$TORTURE_SUITE"; export TORTURE_SUITE +TORTURE_TRUST_MAKE="$TORTURE_TRUST_MAKE"; export TORTURE_TRUST_MAKE if ! test -e $resdir then mkdir -p "$resdir" || : diff --git a/tools/testing/selftests/rcutorture/bin/parse-build.sh b/tools/testing/selftests/rcutorture/bin/parse-build.sh index 0701b3bf6ade..09155c15ea65 100755 --- a/tools/testing/selftests/rcutorture/bin/parse-build.sh +++ b/tools/testing/selftests/rcutorture/bin/parse-build.sh @@ -21,7 +21,7 @@ mkdir $T . functions.sh -if grep -q CC < $F +if grep -q CC < $F || test -n "$TORTURE_TRUST_MAKE" then : else From 34aa34b818407bd475786cf160f7838b7a485e87 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 16 May 2019 16:15:16 -0700 Subject: [PATCH 49/61] rcutorture: Dump trace buffer for callback pipe drain failures Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 6b803fb2f7ca..89be0f492f78 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1060,8 +1060,10 @@ rcu_torture_writer(void *arg) for (i = 0; i < ARRAY_SIZE(rcu_tortures); i++) if (list_empty(&rcu_tortures[i].rtort_free) && rcu_access_pointer(rcu_torture_current) != - &rcu_tortures[i]) + &rcu_tortures[i]) { + rcu_ftrace_dump(DUMP_ALL); WARN(1, "%s: rtort_pipe_count: %d\n", __func__, rcu_tortures[i].rtort_pipe_count); + } } while (!torture_must_stop()); /* Reset expediting back to unexpedited. */ if (expediting > 0) From 8997e6311ed6cb95be34409bc6b4a11c7a84ac35 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Wed, 22 May 2019 08:52:18 -0700 Subject: [PATCH 50/61] torture: Suppress propagating trace_printk() warning When trace_printk() is used, a message including "BUG" is printed to the console, which fools the rcutorture scripting into believing that the corresponding test scenario failed. This commit therefore filters out this particular instance of "BUG", thus avoiding the false-positive test-failure report. Signed-off-by: Paul E. McKenney --- tools/testing/selftests/rcutorture/bin/parse-console.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/rcutorture/bin/parse-console.sh b/tools/testing/selftests/rcutorture/bin/parse-console.sh index 4508373a922f..4bf62d7b1cbc 100755 --- a/tools/testing/selftests/rcutorture/bin/parse-console.sh +++ b/tools/testing/selftests/rcutorture/bin/parse-console.sh @@ -106,6 +106,7 @@ fi | tee -a $file.diags egrep 'Badness|WARNING:|Warn|BUG|===========|Call Trace:|Oops:|detected stalls on CPUs/tasks:|self-detected stall on CPU|Stall ended before state dump start|\?\?\? Writer stall state|rcu_.*kthread starved for' < $file | grep -v 'ODEBUG: ' | +grep -v 'This means that this is a DEBUG kernel and it is' | grep -v 'Warning: unable to open an initial console' > $T.diags if test -s $T.diags then From 354ea05d0276384045fabbfd62ccd2d985defa9e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 25 May 2019 12:36:53 -0700 Subject: [PATCH 51/61] rcutorture: Upper case solves the case of the vanishing NULL pointer Various security techniques can obfuscate pointer printouts on the console. Unfortunately, rcutorture relies on either "null" or all zeroes to identify the last few statistics printouts at the end of the test. These need to be identified because failing to do so will results in false-positive complaints about grace-period hangs. This commit therefore prints the "ver:" in capitals ("VER:") when the RCU-protected pointer has been set to NULL, which causes rcutorture's parse-console.sh script to correctly ignore these lines. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 89be0f492f78..fce4e7e6f502 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1408,8 +1408,9 @@ rcu_torture_stats_print(void) } pr_alert("%s%s ", torture_type, TORTURE_FLAG); - pr_cont("rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d ", + pr_cont("rtc: %p %s: %lu tfle: %d rta: %d rtaf: %d rtf: %d ", rcu_torture_current, + rcu_torture_current ? "ver" : "VER", rcu_torture_current_version, list_empty(&rcu_torture_freelist), atomic_read(&n_rcu_torture_alloc), From 96050c68be33edef18800ad6748f61f81db81a20 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 20 Apr 2019 01:40:54 -0700 Subject: [PATCH 52/61] rcu: Upgrade sync_exp_work_done() to smp_mb() The sync_exp_work_done() function uses smp_mb__before_atomic(), but there is no obvious atomic in the ensuing code. The ordering is absolutely required for grace periods to work correctly, so this commit upgrades the smp_mb__before_atomic() to smp_mb(). Fixes: 6fba2b3767ea ("rcu: Remove deprecated RCU debugfs tracing code") Reported-by: Andrea Parri Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree_exp.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 9c990df880d1..d969650a72c6 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -259,8 +259,7 @@ static bool sync_exp_work_done(unsigned long s) { if (rcu_exp_gp_seq_done(s)) { trace_rcu_exp_grace_period(rcu_state.name, s, TPS("done")); - /* Ensure test happens before caller kfree(). */ - smp_mb__before_atomic(); /* ^^^ */ + smp_mb(); /* Ensure test happens before caller kfree(). */ return true; } return false; From b3119cde1d70d6df1574b9f26d8e087e8e5116b4 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 22 May 2019 10:07:45 -0700 Subject: [PATCH 53/61] rcu: Fix irritating whitespace error in rcu_assign_pointer() Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 915460ec0872..534c05d529b5 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -369,7 +369,7 @@ static inline void rcu_preempt_sleep_check(void) { } #define rcu_assign_pointer(p, v) \ ({ \ uintptr_t _r_a_p__v = (uintptr_t)(v); \ - rcu_check_sparse(p, __rcu); \ + rcu_check_sparse(p, __rcu); \ \ if (__builtin_constant_p(v) && (_r_a_p__v) == (uintptr_t)NULL) \ WRITE_ONCE((p), (typeof(p))(_r_a_p__v)); \ From 6da9f775175e516fc7229ceaa9b54f8f56aa7924 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Tue, 21 May 2019 16:48:43 -0400 Subject: [PATCH 54/61] rcu: Force inlining of rcu_read_lock() When debugging options are turned on, the rcu_read_lock() function might not be inlined. This results in lockdep's print_lock() function printing "rcu_read_lock+0x0/0x70" instead of rcu_read_lock()'s caller. For example: [ 10.579995] ============================= [ 10.584033] WARNING: suspicious RCU usage [ 10.588074] 4.18.0.memcg_v2+ #1 Not tainted [ 10.593162] ----------------------------- [ 10.597203] include/linux/rcupdate.h:281 Illegal context switch in RCU read-side critical section! [ 10.606220] [ 10.606220] other info that might help us debug this: [ 10.606220] [ 10.614280] [ 10.614280] rcu_scheduler_active = 2, debug_locks = 1 [ 10.620853] 3 locks held by systemd/1: [ 10.624632] #0: (____ptrval____) (&type->i_mutex_dir_key#5){.+.+}, at: lookup_slow+0x42/0x70 [ 10.633232] #1: (____ptrval____) (rcu_read_lock){....}, at: rcu_read_lock+0x0/0x70 [ 10.640954] #2: (____ptrval____) (rcu_read_lock){....}, at: rcu_read_lock+0x0/0x70 These "rcu_read_lock+0x0/0x70" strings are not providing any useful information. This commit therefore forces inlining of the rcu_read_lock() function so that rcu_read_lock()'s caller is instead shown. Signed-off-by: Waiman Long Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 534c05d529b5..a8ed624da555 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -588,7 +588,7 @@ static inline void rcu_preempt_sleep_check(void) { } * read-side critical sections may be preempted and they may also block, but * only when acquiring spinlocks that are subject to priority inheritance. 
*/ -static inline void rcu_read_lock(void) +static __always_inline void rcu_read_lock(void) { __rcu_read_lock(); __acquire(RCU); From 9129b017b54dab09eb69b7269026243156e5188e Mon Sep 17 00:00:00 2001 From: Andrea Parri Date: Mon, 27 May 2019 10:49:57 +0200 Subject: [PATCH 55/61] rcu: Don't return a value from rcu_assign_pointer() Quoting Paul [1]: "Given that a quick (and perhaps error-prone) search of the uses of rcu_assign_pointer() in v5.1 didn't find a single use of the return value, let's please instead change the documentation and implementation to eliminate the return value." [1] https://lkml.kernel.org/r/20190523135013.GL28207@linux.ibm.com Signed-off-by: Andrea Parri Cc: "Paul E. McKenney" Cc: Josh Triplett Cc: Steven Rostedt Cc: Mathieu Desnoyers Cc: Lai Jiangshan Cc: Joel Fernandes Cc: rcu@vger.kernel.org Cc: Peter Zijlstra Cc: Will Deacon Cc: Mark Rutland Cc: Matthew Wilcox Cc: Sasha Levin Signed-off-by: Paul E. McKenney --- Documentation/RCU/whatisRCU.txt | 8 ++++---- include/linux/rcupdate.h | 5 ++--- tools/include/linux/rcu.h | 4 ++-- tools/testing/radix-tree/linux/rcupdate.h | 2 +- 4 files changed, 9 insertions(+), 10 deletions(-) diff --git a/Documentation/RCU/whatisRCU.txt b/Documentation/RCU/whatisRCU.txt index 981651a8b65d..7e1a8721637a 100644 --- a/Documentation/RCU/whatisRCU.txt +++ b/Documentation/RCU/whatisRCU.txt @@ -212,7 +212,7 @@ synchronize_rcu() rcu_assign_pointer() - typeof(p) rcu_assign_pointer(p, typeof(p) v); + void rcu_assign_pointer(p, typeof(p) v); Yes, rcu_assign_pointer() -is- implemented as a macro, though it would be cool to be able to declare a function in this manner. @@ -220,9 +220,9 @@ rcu_assign_pointer() The updater uses this function to assign a new value to an RCU-protected pointer, in order to safely communicate the change - in value from the updater to the reader. This function returns - the new value, and also executes any memory-barrier instructions - required for a given CPU architecture. + in value from the updater to the reader. This macro does not + evaluate to an rvalue, but it does execute any memory-barrier + instructions required for a given CPU architecture. Perhaps just as important, it serves to document (1) which pointers are protected by RCU and (2) the point at which a diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index a8ed624da555..0c9b92799abc 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -367,7 +367,7 @@ static inline void rcu_preempt_sleep_check(void) { } * other macros that it invokes. 
*/ #define rcu_assign_pointer(p, v) \ -({ \ +do { \ uintptr_t _r_a_p__v = (uintptr_t)(v); \ rcu_check_sparse(p, __rcu); \ \ @@ -375,8 +375,7 @@ static inline void rcu_preempt_sleep_check(void) { } WRITE_ONCE((p), (typeof(p))(_r_a_p__v)); \ else \ smp_store_release(&p, RCU_INITIALIZER((typeof(p))_r_a_p__v)); \ - _r_a_p__v; \ -}) +} while (0) /** * rcu_swap_protected() - swap an RCU and a regular pointer diff --git a/tools/include/linux/rcu.h b/tools/include/linux/rcu.h index 7d02527e5bce..9554d3fa54f3 100644 --- a/tools/include/linux/rcu.h +++ b/tools/include/linux/rcu.h @@ -19,7 +19,7 @@ static inline bool rcu_is_watching(void) return false; } -#define rcu_assign_pointer(p, v) ((p) = (v)) -#define RCU_INIT_POINTER(p, v) p=(v) +#define rcu_assign_pointer(p, v) do { (p) = (v); } while (0) +#define RCU_INIT_POINTER(p, v) do { (p) = (v); } while (0) #endif diff --git a/tools/testing/radix-tree/linux/rcupdate.h b/tools/testing/radix-tree/linux/rcupdate.h index fd280b070fdb..fed468fb0c78 100644 --- a/tools/testing/radix-tree/linux/rcupdate.h +++ b/tools/testing/radix-tree/linux/rcupdate.h @@ -7,6 +7,6 @@ #define rcu_dereference_raw(p) rcu_dereference(p) #define rcu_dereference_protected(p, cond) rcu_dereference(p) #define rcu_dereference_check(p, cond) rcu_dereference(p) -#define RCU_INIT_POINTER(p, v) (p) = (v) +#define RCU_INIT_POINTER(p, v) do { (p) = (v); } while (0) #endif From 2966f8d440c3d2b6ef6c44093a9f9c42701a077a Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Fri, 3 May 2019 13:13:44 -0400 Subject: [PATCH 56/61] Documentation: atomic_t.txt: Explain ordering provided by smp_mb__{before,after}_atomic() The description of smp_mb__before_atomic() and smp_mb__after_atomic() in Documentation/atomic_t.txt is slightly terse and misleading. It does not clearly state which other instructions are ordered by these barriers. This improves the text to make the actual ordering implications clear, and also to explain how these barriers differ from a RELEASE or ACQUIRE ordering. Signed-off-by: Alan Stern Cc: Jonathan Corbet Cc: Peter Zijlstra Acked-by: Andrea Parri Signed-off-by: Paul E. McKenney --- Documentation/atomic_t.txt | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/Documentation/atomic_t.txt b/Documentation/atomic_t.txt index dca3fb0554db..b3afe69d03a1 100644 --- a/Documentation/atomic_t.txt +++ b/Documentation/atomic_t.txt @@ -187,8 +187,14 @@ The barriers: smp_mb__{before,after}_atomic() -only apply to the RMW ops and can be used to augment/upgrade the ordering -inherent to the used atomic op. These barriers provide a full smp_mb(). +only apply to the RMW atomic ops and can be used to augment/upgrade the +ordering inherent to the op. These barriers act almost like a full smp_mb(): +smp_mb__before_atomic() orders all earlier accesses against the RMW op +itself and all accesses following it, and smp_mb__after_atomic() orders all +later accesses against the RMW op and all accesses preceding it. However, +accesses between the smp_mb__{before,after}_atomic() and the RMW op are not +ordered, so it is advisable to place the barrier right next to the RMW atomic +op whenever possible. These helper barriers exist because architectures have varying implicit ordering on their SMP atomic primitives. For example our TSO architectures @@ -212,7 +218,9 @@ Further, while something like: atomic_dec(&X); is a 'typical' RELEASE pattern, the barrier is strictly stronger than -a RELEASE. 
Similarly for something like: +a RELEASE because it orders preceding instructions against both the read +and write parts of the atomic_dec(), and against all following instructions +as well. Similarly, something like: atomic_inc(&X); smp_mb__after_atomic(); @@ -244,7 +252,8 @@ strictly stronger than ACQUIRE. As illustrated: This should not happen; but a hypothetical atomic_inc_acquire() -- (void)atomic_fetch_inc_acquire() for instance -- would allow the outcome, -since then: +because it would not order the W part of the RMW against the following +WRITE_ONCE. Thus: P1 P2 From 46f52b1fe79d2453467b904600ae4759808c4a44 Mon Sep 17 00:00:00 2001 From: Andrea Parri Date: Tue, 19 Feb 2019 23:55:22 +0100 Subject: [PATCH 57/61] tools/memory-model: Fix comment in MP+poonceonces.litmus The comment should say "Sometimes" for the result. Signed-off-by: Andrea Parri Cc: Will Deacon Cc: Peter Zijlstra Cc: Boqun Feng Cc: Nicholas Piggin Cc: David Howells Cc: Jade Alglave Cc: Luc Maranget Cc: "Paul E. McKenney" Cc: Akira Yokosawa Cc: Daniel Lustig Acked-by: Alan Stern Signed-off-by: Paul E. McKenney --- tools/memory-model/litmus-tests/MP+poonceonces.litmus | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/memory-model/litmus-tests/MP+poonceonces.litmus b/tools/memory-model/litmus-tests/MP+poonceonces.litmus index b2b60b84fb9d..172f0145301c 100644 --- a/tools/memory-model/litmus-tests/MP+poonceonces.litmus +++ b/tools/memory-model/litmus-tests/MP+poonceonces.litmus @@ -1,7 +1,7 @@ C MP+poonceonces (* - * Result: Maybe + * Result: Sometimes * * Can the counter-intuitive message-passing outcome be prevented with * no ordering at all? From 37c600a3cc8a6941d77e853ec4e0e33fffa1046b Mon Sep 17 00:00:00 2001 From: Andrea Parri Date: Tue, 19 Feb 2019 23:55:23 +0100 Subject: [PATCH 58/61] tools/memory-model: Do not use "herd" to refer to "herd7" Use "herd7" in each such reference. Signed-off-by: Andrea Parri Cc: Will Deacon Cc: Peter Zijlstra Cc: Boqun Feng Cc: Nicholas Piggin Cc: David Howells Cc: Jade Alglave Cc: Luc Maranget Cc: "Paul E. McKenney" Cc: Akira Yokosawa Cc: Daniel Lustig Acked-by: Alan Stern Signed-off-by: Paul E. McKenney --- tools/memory-model/litmus-tests/README | 2 +- tools/memory-model/lock.cat | 2 +- tools/memory-model/scripts/README | 4 ++-- tools/memory-model/scripts/checkalllitmus.sh | 2 +- tools/memory-model/scripts/checklitmus.sh | 2 +- tools/memory-model/scripts/parseargs.sh | 2 +- tools/memory-model/scripts/runlitmushist.sh | 2 +- 7 files changed, 8 insertions(+), 8 deletions(-) diff --git a/tools/memory-model/litmus-tests/README b/tools/memory-model/litmus-tests/README index 5ee08f129094..681f9067fa9e 100644 --- a/tools/memory-model/litmus-tests/README +++ b/tools/memory-model/litmus-tests/README @@ -244,7 +244,7 @@ produce the name: Adding the ".litmus" suffix: SB+rfionceonce-poonceonces.litmus The descriptors that describe connections between consecutive accesses -within the cycle through a given litmus test can be provided by the herd +within the cycle through a given litmus test can be provided by the herd7 tool (Rfi, Po, Fre, and so on) or by the linux-kernel.bell file (Once, Release, Acquire, and so on). 
diff --git a/tools/memory-model/lock.cat b/tools/memory-model/lock.cat index a059d1a6d8a2..6b52f365d73a 100644 --- a/tools/memory-model/lock.cat +++ b/tools/memory-model/lock.cat @@ -11,7 +11,7 @@ include "cross.cat" (* - * The lock-related events generated by herd are as follows: + * The lock-related events generated by herd7 are as follows: * * LKR Lock-Read: the read part of a spin_lock() or successful * spin_trylock() read-modify-write event pair diff --git a/tools/memory-model/scripts/README b/tools/memory-model/scripts/README index 29375a1fbbfa..095c7eb36f9f 100644 --- a/tools/memory-model/scripts/README +++ b/tools/memory-model/scripts/README @@ -22,7 +22,7 @@ checklitmushist.sh Run all litmus tests having .litmus.out files from previous initlitmushist.sh or newlitmushist.sh runs, comparing the - herd output to that of the original runs. + herd7 output to that of the original runs. checklitmus.sh @@ -43,7 +43,7 @@ initlitmushist.sh judgelitmus.sh - Given a .litmus file and its .litmus.out herd output, check the + Given a .litmus file and its .litmus.out herd7 output, check the .litmus.out file against the .litmus file's "Result:" comment to judge whether the test ran correctly. Not normally run manually, provided instead for use by other scripts. diff --git a/tools/memory-model/scripts/checkalllitmus.sh b/tools/memory-model/scripts/checkalllitmus.sh index b35fcd61ecf6..3c0c7fbbd223 100755 --- a/tools/memory-model/scripts/checkalllitmus.sh +++ b/tools/memory-model/scripts/checkalllitmus.sh @@ -1,7 +1,7 @@ #!/bin/sh # SPDX-License-Identifier: GPL-2.0+ # -# Run herd tests on all .litmus files in the litmus-tests directory +# Run herd7 tests on all .litmus files in the litmus-tests directory # and check each file's result against a "Result:" comment within that # litmus test. If the verification result does not match that specified # in the litmus test, this script prints an error message prefixed with diff --git a/tools/memory-model/scripts/checklitmus.sh b/tools/memory-model/scripts/checklitmus.sh index dd08801a30b0..11461ed40b5e 100755 --- a/tools/memory-model/scripts/checklitmus.sh +++ b/tools/memory-model/scripts/checklitmus.sh @@ -1,7 +1,7 @@ #!/bin/sh # SPDX-License-Identifier: GPL-2.0+ # -# Run a herd test and invokes judgelitmus.sh to check the result against +# Run a herd7 test and invokes judgelitmus.sh to check the result against # a "Result:" comment within the litmus test. It also outputs verification # results to a file whose name is that of the specified litmus test, but # with ".out" appended. diff --git a/tools/memory-model/scripts/parseargs.sh b/tools/memory-model/scripts/parseargs.sh index 859e1d581e05..40f52080fdbd 100644 --- a/tools/memory-model/scripts/parseargs.sh +++ b/tools/memory-model/scripts/parseargs.sh @@ -91,7 +91,7 @@ do shift ;; --herdopts|--herdopt) - checkarg --destdir "(herd options)" "$#" "$2" '.*' '^--' + checkarg --destdir "(herd7 options)" "$#" "$2" '.*' '^--' LKMM_HERD_OPTIONS="$2" shift ;; diff --git a/tools/memory-model/scripts/runlitmushist.sh b/tools/memory-model/scripts/runlitmushist.sh index e507f5f933d5..6ed376f495bb 100644 --- a/tools/memory-model/scripts/runlitmushist.sh +++ b/tools/memory-model/scripts/runlitmushist.sh @@ -79,7 +79,7 @@ then echo ' ---' Summary: 1>&2 grep '!!!' $T/*.sh.out 1>&2 nfail="`grep '!!!' $T/*.sh.out | wc -l`" - echo 'Number of failed herd runs (e.g., timeout): ' $nfail 1>&2 + echo 'Number of failed herd7 runs (e.g., timeout): ' $nfail 1>&2 exit 1 else echo All runs completed successfully. 
1>&2 From f9de417121001879d92a86960647adb06b5b81bf Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Thu, 20 Jun 2019 11:55:36 -0400 Subject: [PATCH 59/61] tools/memory-model: Expand definition of barrier Commit 66be4e66a7f4 ("rcu: locking and unlocking need to always be at least barriers") added compiler barriers back into rcu_read_lock() and rcu_read_unlock(). Furthermore, srcu_read_lock() and srcu_read_unlock() have always contained compiler barriers. The Linux Kernel Memory Model ought to know about these barriers. This patch adds them into the memory model. Signed-off-by: Alan Stern Acked-by: Andrea Parri Signed-off-by: Paul E. McKenney --- tools/memory-model/linux-kernel.cat | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/memory-model/linux-kernel.cat b/tools/memory-model/linux-kernel.cat index 36d367054811..8b61218c99fd 100644 --- a/tools/memory-model/linux-kernel.cat +++ b/tools/memory-model/linux-kernel.cat @@ -45,7 +45,8 @@ let strong-fence = mb | gp let nonrw-fence = strong-fence | po-rel | acq-po let fence = nonrw-fence | wmb | rmb let barrier = fencerel(Barrier | Rmb | Wmb | Mb | Sync-rcu | Sync-srcu | - Before-atomic | After-atomic | Acquire | Release) | + Before-atomic | After-atomic | Acquire | Release | + Rcu-lock | Rcu-unlock | Srcu-lock | Srcu-unlock) | (po ; [Release]) | ([Acquire] ; po) (**********************************) From 15aa25cbf0ccc4bd63ed6f2a8065decb7f5e6f89 Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Thu, 20 Jun 2019 11:55:46 -0400 Subject: [PATCH 60/61] tools/memory-model: Change definition of rcu-fence The rcu-fence relation in the Linux Kernel Memory Model is not well named. It doesn't act like any other fence relation, in that it does not relate events before a fence to events after that fence. All it does is relate certain RCU events to one another (those that are ordered by the RCU Guarantee); this induces an actual strong-fence-like relation linking events preceding the first RCU event to those following the second. This patch renames rcu-fence, now called rcu-order. It adds a new definition of rcu-fence, something which should have been present all along because it is used in the rb relation. And it modifies the fence and strong-fence relations by making them incorporate the new rcu-fence. As a result of this change, there is no longer any need to define full-fence in the section for detecting data races. It can simply be replaced by the updated strong-fence relation. This change should have no effect on the operation of the memory model. Signed-off-by: Alan Stern Acked-by: Andrea Parri Signed-off-by: Paul E. McKenney --- tools/memory-model/linux-kernel.cat | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/tools/memory-model/linux-kernel.cat b/tools/memory-model/linux-kernel.cat index 8b61218c99fd..ca2f4297b4e6 100644 --- a/tools/memory-model/linux-kernel.cat +++ b/tools/memory-model/linux-kernel.cat @@ -122,24 +122,28 @@ let rcu-link = po? ; hb* ; pb* ; prop ; po (* * Any sequence containing at least as many grace periods as RCU read-side - * critical sections (joined by rcu-link) acts as a generalized strong fence. + * critical sections (joined by rcu-link) induces order like a generalized + * inter-CPU strong fence. * Likewise for SRCU grace periods and read-side critical sections, provided * the synchronize_srcu() and srcu_read_[un]lock() calls refer to the same * struct srcu_struct location. 
*) -let rec rcu-fence = rcu-gp | srcu-gp | +let rec rcu-order = rcu-gp | srcu-gp | (rcu-gp ; rcu-link ; rcu-rscsi) | ((srcu-gp ; rcu-link ; srcu-rscsi) & loc) | (rcu-rscsi ; rcu-link ; rcu-gp) | ((srcu-rscsi ; rcu-link ; srcu-gp) & loc) | - (rcu-gp ; rcu-link ; rcu-fence ; rcu-link ; rcu-rscsi) | - ((srcu-gp ; rcu-link ; rcu-fence ; rcu-link ; srcu-rscsi) & loc) | - (rcu-rscsi ; rcu-link ; rcu-fence ; rcu-link ; rcu-gp) | - ((srcu-rscsi ; rcu-link ; rcu-fence ; rcu-link ; srcu-gp) & loc) | - (rcu-fence ; rcu-link ; rcu-fence) + (rcu-gp ; rcu-link ; rcu-order ; rcu-link ; rcu-rscsi) | + ((srcu-gp ; rcu-link ; rcu-order ; rcu-link ; srcu-rscsi) & loc) | + (rcu-rscsi ; rcu-link ; rcu-order ; rcu-link ; rcu-gp) | + ((srcu-rscsi ; rcu-link ; rcu-order ; rcu-link ; srcu-gp) & loc) | + (rcu-order ; rcu-link ; rcu-order) +let rcu-fence = po ; rcu-order ; po? +let fence = fence | rcu-fence +let strong-fence = strong-fence | rcu-fence (* rb orders instructions just as pb does *) -let rb = prop ; po ; rcu-fence ; po? ; hb* ; pb* ; [Marked] +let rb = prop ; rcu-fence ; hb* ; pb* ; [Marked] irreflexive rb as rcu @@ -163,9 +167,8 @@ flag ~empty mixed-accesses as mixed-accesses (* Executes-before and visibility *) let xbstar = (hb | pb | rb)* -let full-fence = strong-fence | (po ; rcu-fence ; po?) let vis = cumul-fence* ; rfe? ; [Marked] ; - ((full-fence ; [Marked] ; xbstar) | (xbstar & int)) + ((strong-fence ; [Marked] ; xbstar) | (xbstar & int)) (* Boundaries for lifetimes of plain accesses *) let w-pre-bounded = [Marked] ; (addr | fence)? From 4289ee7d5a8343eaddd0986f8fb492868e2f546f Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Thu, 20 Jun 2019 11:55:58 -0400 Subject: [PATCH 61/61] tools/memory-model: Improve data-race detection Herbert Xu recently reported a problem concerning RCU and compiler barriers. In the course of discussing the problem, he put forth a litmus test which illustrated a serious defect in the Linux Kernel Memory Model's data-race-detection code [1]. The defect was that the LKMM assumed visibility and executes-before ordering of plain accesses had to be mediated by marked accesses. In Herbert's litmus test this wasn't so, and the LKMM claimed the litmus test was allowed and contained a data race although neither is true. In fact, plain accesses can be ordered by fences even in the absence of marked accesses. In most cases this doesn't matter, because most fences only order accesses within a single thread. But the rcu-fence relation is different; it can order (and induce visibility between) accesses in different threads -- events which otherwise might be concurrent. This makes it relevant to data-race detection. This patch makes two changes to the memory model to incorporate the new insight: If a store is separated by a fence from another access, the store is necessarily visible to the other access (as reflected in the ww-vis and wr-vis relations). Similarly, if a load is separated by a fence from another access then the load necessarily executes before the other access (as reflected in the rw-xbstar relation). If a store is separated by a strong fence from a marked access then it is necessarily visible to any access that executes after the marked access (as reflected in the ww-vis and wr-vis relations). With these changes, the LKMM gives the desired result for Herbert's litmus test and other related ones [2]. 
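The following minimal litmus test is an illustrative sketch of the pattern described above; it is not one of the tests cited in [1] or [2], and the test name and variable names are invented for this sketch. P0's plain store to x is separated from the marked store to y only by synchronize_rcu(), and P1 plain-loads x inside an RCU read-side critical section after reading y, so any ordering between the plain accesses must come from the RCU grace period acting as a strong fence rather than from a mediating marked access:

C MP+plain-rcu-gp-sketch

(*
 * Illustrative sketch only: the plain accesses to x are ordered (if at
 * all) by the RCU grace period on P0, not by any marked access that
 * mediates between them.
 *)

{}

P0(int *x, int *y)
{
	*x = 1;
	synchronize_rcu();
	WRITE_ONCE(*y, 1);
}

P1(int *x, int *y)
{
	int r0;
	int r1;

	rcu_read_lock();
	r0 = READ_ONCE(*y);
	r1 = *x;
	rcu_read_unlock();
}

exists (1:r0=1 /\ 1:r1=0)

Whether herd7 reports the exists clause as forbidden and the plain accesses as race-free depends on the full rule set above; the tests in [2] are the definitive examples exercising these changes.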
[1] https://lore.kernel.org/lkml/Pine.LNX.4.44L0.1906041026570.1731-100000@iolanthe.rowland.org/ [2] https://github.com/paulmckrcu/litmus/blob/master/manual/plain/C-S-rcunoderef-1.litmus https://github.com/paulmckrcu/litmus/blob/master/manual/plain/C-S-rcunoderef-2.litmus https://github.com/paulmckrcu/litmus/blob/master/manual/plain/C-S-rcunoderef-3.litmus https://github.com/paulmckrcu/litmus/blob/master/manual/plain/C-S-rcunoderef-4.litmus https://github.com/paulmckrcu/litmus/blob/master/manual/plain/strong-vis.litmus Reported-by: Herbert Xu Signed-off-by: Alan Stern Acked-by: Andrea Parri Signed-off-by: Paul E. McKenney Tested-by: Akira Yokosawa --- tools/memory-model/linux-kernel.cat | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tools/memory-model/linux-kernel.cat b/tools/memory-model/linux-kernel.cat index ca2f4297b4e6..ea2ff4b94074 100644 --- a/tools/memory-model/linux-kernel.cat +++ b/tools/memory-model/linux-kernel.cat @@ -179,9 +179,11 @@ let r-post-bounded = (nonrw-fence | ([~Noreturn] ; fencerel(Rmb) ; [R4rmb]))? ; [Marked] (* Visibility and executes-before for plain accesses *) -let ww-vis = w-post-bounded ; vis ; w-pre-bounded -let wr-vis = w-post-bounded ; vis ; r-pre-bounded -let rw-xbstar = r-post-bounded ; xbstar ; w-pre-bounded +let ww-vis = fence | (strong-fence ; xbstar ; w-pre-bounded) | + (w-post-bounded ; vis ; w-pre-bounded) +let wr-vis = fence | (strong-fence ; xbstar ; r-pre-bounded) | + (w-post-bounded ; vis ; r-pre-bounded) +let rw-xbstar = fence | (r-post-bounded ; xbstar ; w-pre-bounded) (* Potential races *) let pre-race = ext & ((Plain * M) | ((M \ IW) * Plain))
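A brief usage note to go with the memory-model patches above, assuming the herdtools7 suite providing herd7 is installed; the litmus-test file name below is just an existing example from the tree:

# Run a single litmus test against the LKMM from tools/memory-model.
cd tools/memory-model
herd7 -conf linux-kernel.cfg litmus-tests/MP+poonceonces.litmus

# Batch-check every litmus test in litmus-tests/ against its "Result:" comment.
scripts/checkalllitmus.sh

The expected verdict for each test lives in its "Result:" comment, which is what judgelitmus.sh compares the herd7 output against.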