[PATCH 1/6] workqueue, sched: Notify workqueue of scheduling of RUNNING and preempted tasks

From: Tejun Heo
Date: Tue May 09 2023 - 23:08:22 EST


When a workqueue kworker goes to sleep, wq_worker_sleeping() is called so
that workqueue can manage concurrency. This patch renames
wq_worker_sleeping() to wq_worker_stopping() and calls it whenever a kworker
is scheduled out, whether it's going to sleep or not.

Workqueue will use the schedule-out events of RUNNING tasks to automatically
detect CPU-hogging work items and exclude them from concurrency management
so that they can't stall other work items.
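
As a rough, hypothetical sketch of how such a schedule-out notification could
be consumed (the actual detection comes later in this series; the threshold
and the ->current_at field below are illustrative, not part of this patch):

	/*
	 * Illustrative only: when a RUNNING worker is scheduled out, check
	 * how long the current work item has been on the CPU and, past a
	 * threshold, take the worker out of concurrency management so that
	 * it can't stall other work items.
	 */
	static void worker_maybe_mark_cpu_hog(struct worker *worker)
	{
		u64 ran = worker->task->se.sum_exec_runtime - worker->current_at;

		if (ran >= cpu_hog_thresh_ns)	/* hypothetical knob */
			worker->flags |= WORKER_CPU_INTENSIVE;
	}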

sched_submit_work() and sched_update_worker() were only called from
schedule(). As workqueue would need to be notified of all scheduling events
including preemptions, make the two functions notrace, add @sched_mode and
call them from all schedule functions. As wq_worker_stopping() is now called
from preemption paths, it's also marked notrace.
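
For reference, the entry points end up invoking the hooks as follows
(summarized from the diff below); @sched_mode tells the workqueue side
whether the schedule-out is voluntary:

	schedule()                 -> sched_submit_work(tsk, SM_NONE)        /* voluntary */
	schedule_rtlock()          -> sched_submit_work(tsk, SM_RTLOCK_WAIT)
	preempt_schedule_common()  -> sched_submit_work(tsk, SM_PREEMPT)
	preempt_schedule_notrace() -> sched_submit_work(tsk, SM_PREEMPT)
	preempt_schedule_irq()     -> sched_submit_work(tsk, SM_PREEMPT)

	/* each path calls the matching sched_update_worker(tsk, sched_mode) on the way out */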

As sched_update_worker() is a noop when called with !SM_NONE, it's debatable
whether it needs to be called from the !SM_NONE paths at all. However, the
extra calls shouldn't add any actual overhead and keeping the symmetry is
possibly less confusing. This can go either way.

The only functional change is that wq_worker_stopping() is now called from
all scheduling paths. As this patch adds early exit conditions which make it
a noop in all of the new invocations, there shouldn't be any behavior change.
While at it, remove the already outdated comment which doesn't cover the
io_wq case.

v2: Lai pointed out that !SM_NONE cases should also be notified. Updated
accordingly.

Signed-off-by: Tejun Heo <tj@xxxxxxxxxx>
Cc: Lai Jiangshan <jiangshanlai@xxxxxxxxx>
Cc: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
---
kernel/sched/core.c | 48 ++++++++++++++++++++++++-------------
kernel/workqueue.c | 23 +++++++++++++-----
kernel/workqueue_internal.h | 2 +-
3 files changed, 49 insertions(+), 24 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 944c3ae39861..ab4317a8e11a 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6692,24 +6692,20 @@ void __noreturn do_task_dead(void)
cpu_relax();
}

-static inline void sched_submit_work(struct task_struct *tsk)
+static inline notrace void sched_submit_work(struct task_struct *tsk,
+ unsigned int sched_mode)
{
- unsigned int task_flags;
+ unsigned int task_flags = tsk->flags;
+ bool voluntary = sched_mode == SM_NONE;

- if (task_is_running(tsk))
+ if (task_flags & PF_WQ_WORKER)
+ wq_worker_stopping(tsk, voluntary);
+
+ if (!voluntary || task_is_running(tsk))
return;

- task_flags = tsk->flags;
- /*
- * If a worker goes to sleep, notify and ask workqueue whether it
- * wants to wake up a task to maintain concurrency.
- */
- if (task_flags & (PF_WQ_WORKER | PF_IO_WORKER)) {
- if (task_flags & PF_WQ_WORKER)
- wq_worker_sleeping(tsk);
- else
- io_wq_worker_sleeping(tsk);
- }
+ if (task_flags & PF_IO_WORKER)
+ io_wq_worker_sleeping(tsk);

/*
* spinlock and rwlock must not flush block requests. This will
@@ -6725,8 +6721,12 @@ static inline void sched_submit_work(struct task_struct *tsk)
blk_flush_plug(tsk->plug, true);
}

-static void sched_update_worker(struct task_struct *tsk)
+static notrace void sched_update_worker(struct task_struct *tsk,
+ unsigned int sched_mode)
{
+ if (sched_mode != SM_NONE)
+ return;
+
if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) {
if (tsk->flags & PF_WQ_WORKER)
wq_worker_running(tsk);
@@ -6739,13 +6739,13 @@ asmlinkage __visible void __sched schedule(void)
{
struct task_struct *tsk = current;

- sched_submit_work(tsk);
+ sched_submit_work(tsk, SM_NONE);
do {
preempt_disable();
__schedule(SM_NONE);
sched_preempt_enable_no_resched();
} while (need_resched());
- sched_update_worker(tsk);
+ sched_update_worker(tsk, SM_NONE);
}
EXPORT_SYMBOL(schedule);

@@ -6808,17 +6808,24 @@ void __sched schedule_preempt_disabled(void)
#ifdef CONFIG_PREEMPT_RT
void __sched notrace schedule_rtlock(void)
{
+ struct task_struct *tsk = current;
+
+ sched_submit_work(tsk, SM_RTLOCK_WAIT);
do {
preempt_disable();
__schedule(SM_RTLOCK_WAIT);
sched_preempt_enable_no_resched();
} while (need_resched());
+ sched_update_worker(tsk, SM_RTLOCK_WAIT);
}
NOKPROBE_SYMBOL(schedule_rtlock);
#endif

static void __sched notrace preempt_schedule_common(void)
{
+ struct task_struct *tsk = current;
+
+ sched_submit_work(tsk, SM_PREEMPT);
do {
/*
* Because the function tracer can trace preempt_count_sub()
@@ -6844,6 +6851,7 @@ static void __sched notrace preempt_schedule_common(void)
* between schedule and now.
*/
} while (need_resched());
+ sched_update_worker(tsk, SM_PREEMPT);
}

#ifdef CONFIG_PREEMPTION
@@ -6901,11 +6909,13 @@ EXPORT_SYMBOL(dynamic_preempt_schedule);
*/
asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
{
+ struct task_struct *tsk = current;
enum ctx_state prev_ctx;

if (likely(!preemptible()))
return;

+ sched_submit_work(tsk, SM_PREEMPT);
do {
/*
* Because the function tracer can trace preempt_count_sub()
@@ -6934,6 +6944,7 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
preempt_latency_stop(1);
preempt_enable_no_resched_notrace();
} while (need_resched());
+ sched_update_worker(tsk, SM_PREEMPT);
}
EXPORT_SYMBOL_GPL(preempt_schedule_notrace);

@@ -6968,11 +6979,13 @@ EXPORT_SYMBOL(dynamic_preempt_schedule_notrace);
*/
asmlinkage __visible void __sched preempt_schedule_irq(void)
{
+ struct task_struct *tsk = current;
enum ctx_state prev_state;

/* Catch callers which need to be fixed */
BUG_ON(preempt_count() || !irqs_disabled());

+ sched_submit_work(tsk, SM_PREEMPT);
prev_state = exception_enter();

do {
@@ -6984,6 +6997,7 @@ asmlinkage __visible void __sched preempt_schedule_irq(void)
} while (need_resched());

exception_exit(prev_state);
+ sched_update_worker(tsk, SM_PREEMPT);
}

int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags,
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 4666a1a92a31..cbcdc11adabd 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -867,7 +867,10 @@ static void wake_up_worker(struct worker_pool *pool)
* wq_worker_running - a worker is running again
* @task: task waking up
*
- * This function is called when a worker returns from schedule()
+ * This function is called when a worker returns from schedule().
+ *
+ * Unlike wq_worker_stopping(), this function is called only from schedule(),
+ * not from other scheduling paths such as preemption, and can be traced safely.
*/
void wq_worker_running(struct task_struct *task)
{
@@ -890,17 +893,25 @@ void wq_worker_running(struct task_struct *task)
}

/**
- * wq_worker_sleeping - a worker is going to sleep
- * @task: task going to sleep
+ * wq_worker_stopping - a worker is stopping
+ * @task: task stopping
+ * @voluntary: being called from schedule()
+ *
+ * This function is called from scheduling paths including schedule() and
+ * preemption when a busy worker is going off the CPU.
*
- * This function is called from schedule() when a busy worker is
- * going to sleep.
+ * As this function may be called from preempt_enable_notrace() and friends when
+ * !@voluntary, it must be notrace and limit reentrancy when @voluntary to avoid
+ * infinite recursions.
*/
-void wq_worker_sleeping(struct task_struct *task)
+void notrace wq_worker_stopping(struct task_struct *task, bool voluntary)
{
struct worker *worker = kthread_data(task);
struct worker_pool *pool;

+ if (!voluntary || task_is_running(task))
+ return;
+
/*
* Rescuers, which may not have all the fields set up like normal
* workers, also reach here, let's not access anything before
diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h
index e00b1204a8e9..c34b876af16d 100644
--- a/kernel/workqueue_internal.h
+++ b/kernel/workqueue_internal.h
@@ -75,7 +75,7 @@ static inline struct worker *current_wq_worker(void)
* sched/ and workqueue.c.
*/
void wq_worker_running(struct task_struct *task);
-void wq_worker_sleeping(struct task_struct *task);
+void wq_worker_stopping(struct task_struct *task, bool voluntary);
work_func_t wq_worker_last_func(struct task_struct *task);

#endif /* _KERNEL_WORKQUEUE_INTERNAL_H */
--
2.40.1