Re: scheduler problems in -next (was: Re: [PATCH 6.4 000/227] 6.4.7-rc1 review)

From: Roy Hopkins
Date: Mon Jul 31 2023 - 12:31:12 EST


On Mon, 2023-07-31 at 18:14 +0200, Peter Zijlstra wrote:
> Ha!, I was poking around the same thing. My hack below seems to (so far,
> <20 boots) help things.
>
>
> diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
> index 56c470a489c8..b083b5a30025 100644
> --- a/kernel/rcu/tasks.h
> +++ b/kernel/rcu/tasks.h
> @@ -652,7 +658,11 @@ static void __init rcu_spawn_tasks_kthread_generic(struct rcu_tasks *rtp)
>         t = kthread_run(rcu_tasks_kthread, rtp, "%s_kthread", rtp->kname);
>         if (WARN_ONCE(IS_ERR(t), "%s: Could not start %s grace-period kthread, OOM is now expected behavior\n", __func__, rtp->name))
>                 return;
> -       smp_mb(); /* Ensure others see full kthread. */
> +       for (;;) {
> +               cond_resched();
> +               if (smp_load_acquire(&rtp->kthread_ptr))
> +                       break;
> +       }
>  }
>  
>  #ifndef CONFIG_TINY_RCU

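FWIW, here's my hack, which seems to fix it. It adds a kthread_started flag to struct rcu_tasks that is set as soon as kthread_run() returns, and makes synchronize_rcu_tasks_generic() test that flag instead of kthread_ptr.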

diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index 9b9ce09f8f35..2e76fbfff9c6 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -52,6 +52,7 @@ struct rcu_tasks_percpu {
* @cbs_gbl_lock: Lock protecting callback list.
* @tasks_gp_mutex: Mutex protecting grace period, needed during mid-boot dead zone.
* @kthread_ptr: This flavor's grace-period/callback-invocation kthread.
+ * @kthread_started: Flag that indicates whether kthread has been launched.
* @gp_func: This flavor's grace-period-wait function.
* @gp_state: Grace period's most recent state transition (debugging).
* @gp_sleep: Per-grace-period sleep to prevent CPU-bound looping.
@@ -92,6 +93,7 @@ struct rcu_tasks {
unsigned long n_ipis;
unsigned long n_ipis_fails;
struct task_struct *kthread_ptr;
+ int kthread_started;
rcu_tasks_gp_func_t gp_func;
pregp_func_t pregp_func;
pertask_func_t pertask_func;
@@ -582,7 +584,7 @@ static void synchronize_rcu_tasks_generic(struct rcu_tasks *rtp)
return;

// If the grace-period kthread is running, use it.
- if (READ_ONCE(rtp->kthread_ptr)) {
+ if (READ_ONCE(rtp->kthread_started)) {
wait_rcu_gp(rtp->call_func);
return;
}
@@ -595,6 +597,7 @@ static void __init rcu_spawn_tasks_kthread_generic(struct rcu_tasks *rtp)
struct task_struct *t;

t = kthread_run(rcu_tasks_kthread, rtp, "%s_kthread", rtp->kname);
+ rtp->kthread_started = 1;
if (WARN_ONCE(IS_ERR(t), "%s: Could not start %s grace-period kthread, OOM is now expected behavior\n", __func__, rtp->name))
return;
smp_mb(); /* Ensure others see full kthread. */