[RFC PATCH 69/86] treewide: io_uring: remove cond_resched()

From: Ankur Arora
Date: Tue Nov 07 2023 - 18:11:10 EST


There are broadly three sets of uses of cond_resched():

1. Calls to cond_resched() out of the goodness of our heart,
otherwise known as avoiding lockup splats.

2. Open coded variants of cond_resched_lock() which call
cond_resched().

3. Retry or error handling loops, where cond_resched() is used as a
quick alternative to spinning in a tight loop.
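
For illustration, here are minimal sketches of the set-1 and set-2
patterns (not taken from io_uring; process_item(), nr_items and lock
are made-up names, assumed declared elsewhere):

    /* Set-1: a long-running loop with an explicit scheduling point. */
    for (i = 0; i < nr_items; i++) {
        process_item(i);
        cond_resched();
    }

    /* Set-2: an open-coded cond_resched_lock(): drop the lock so the
     * scheduling point is reached while preemptible.
     */
    if (need_resched()) {
        spin_unlock(&lock);
        cond_resched();
        spin_lock(&lock);
    }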

When running under a full preemption model, cond_resched() reduces
to a NOP (not even a barrier), so removing it obviously cannot matter.
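
(Roughly, eliding the debug checks and the PREEMPT_DYNAMIC plumbing,
the definition reduces to something like the sketch below; this is a
simplification, not the verbatim kernel source.)

    #ifdef CONFIG_PREEMPTION
    /* Full preemption: nothing to do, the call compiles away. */
    static inline int _cond_resched(void) { return 0; }
    #else
    extern int _cond_resched(void);
    #endif

    #define cond_resched() _cond_resched()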

But considering only voluntary preemption models (say, for code that
has been mostly tested under those), for set-1 and set-2 the
scheduler can now preempt kernel tasks running beyond their time
quanta anywhere they are preemptible() [1], which removes any need
for these explicitly placed scheduling points.

The cond_resched() calls in set-3 are a little more difficult.
To start with, given its NOP character under full preemption, it
never actually saved us from a tight loop.
With voluntary preemption, it's not a NOP, but it might as well be --
for most workloads the scheduler does not have an interminable supply
of runnable tasks on the runqueue.

So, cond_resched() is useful for avoiding softlockup splats, but not
terribly good for error handling. Ideally, these should be replaced
with some kind of timed or event wait.
For now we use cond_resched_stall(), which tries to schedule if
possible, and executes a cpu_relax() if not.
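
Purely as an illustration (io_uring itself has no set-3 uses, as
noted below), such a retry loop would change along these lines;
device_ready() and dev are made-up names, and cond_resched_stall()
is assumed to behave as described above:

    /* Before: cond_resched() standing in for a real backoff. */
    while (!device_ready(dev))
        cond_resched();

    /* After: schedule if something else is runnable, otherwise
     * at least cpu_relax() instead of spinning flat out.
     */
    while (!device_ready(dev))
        cond_resched_stall();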

All of the uses of cond_resched() are from set-1 or set-2.
Remove them.

[1] https://lore.kernel.org/lkml/20231107215742.363031-1-ankur.a.arora@xxxxxxxxxx/

Cc: Jens Axboe <axboe@xxxxxxxxx>
Cc: Pavel Begunkov <asml.silence@xxxxxxxxx>
Signed-off-by: Ankur Arora <ankur.a.arora@xxxxxxxxxx>
---
 io_uring/io-wq.c    |  4 +---
 io_uring/io_uring.c | 21 ++++++++++++---------
 io_uring/kbuf.c     |  2 --
 io_uring/sqpoll.c   |  6 ++++--
 io_uring/tctx.c     |  4 +---
 5 files changed, 18 insertions(+), 19 deletions(-)

diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c
index 522196dfb0ff..fcaf9161be03 100644
--- a/io_uring/io-wq.c
+++ b/io_uring/io-wq.c
@@ -532,10 +532,8 @@ static struct io_wq_work *io_get_next_work(struct io_wq_acct *acct,
static void io_assign_current_work(struct io_worker *worker,
struct io_wq_work *work)
{
- if (work) {
+ if (work)
io_run_task_work();
- cond_resched();
- }

raw_spin_lock(&worker->lock);
worker->cur_work = work;
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 8d1bc6cdfe71..547b7c6bdc68 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -1203,9 +1203,14 @@ static unsigned int handle_tw_list(struct llist_node *node,
node = next;
count++;
if (unlikely(need_resched())) {
+
+ /*
+ * Depending on whether we have PREEMPT_RCU or not, the
+ * mutex_unlock() or percpu_ref_put() should cause us to
+ * reschedule.
+ */
ctx_flush_and_put(*ctx, ts);
*ctx = NULL;
- cond_resched();
}
}

@@ -1611,7 +1616,6 @@ static __cold void io_iopoll_try_reap_events(struct io_ring_ctx *ctx)
*/
if (need_resched()) {
mutex_unlock(&ctx->uring_lock);
- cond_resched();
mutex_lock(&ctx->uring_lock);
}
}
@@ -1977,7 +1981,6 @@ void io_wq_submit_work(struct io_wq_work *work)
break;
if (io_wq_worker_stopped())
break;
- cond_resched();
continue;
}

@@ -2649,7 +2652,6 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
ret = 0;
break;
}
- cond_resched();
} while (1);

if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN))
@@ -3096,8 +3098,12 @@ static __cold void io_ring_exit_work(struct work_struct *work)
if (ctx->flags & IORING_SETUP_DEFER_TASKRUN)
io_move_task_work_from_local(ctx);

+ /*
+ * io_uring_try_cancel_requests() will reschedule when needed
+ * in the mutex_unlock().
+ */
while (io_uring_try_cancel_requests(ctx, NULL, true))
- cond_resched();
+ ;

if (ctx->sq_data) {
struct io_sq_data *sqd = ctx->sq_data;
@@ -3313,7 +3319,6 @@ static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
while (!wq_list_empty(&ctx->iopoll_list)) {
io_iopoll_try_reap_events(ctx);
ret = true;
- cond_resched();
}
}

@@ -3382,10 +3387,8 @@ __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd)
cancel_all);
}

- if (loop) {
- cond_resched();
+ if (loop)
continue;
- }

prepare_to_wait(&tctx->wait, &wait, TASK_INTERRUPTIBLE);
io_run_task_work();
diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c
index 9123138aa9f4..ef94a7c76d9a 100644
--- a/io_uring/kbuf.c
+++ b/io_uring/kbuf.c
@@ -246,7 +246,6 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx,
list_move(&nxt->list, &ctx->io_buffers_cache);
if (++i == nbufs)
return i;
- cond_resched();
}

return i;
@@ -421,7 +420,6 @@ static int io_add_buffers(struct io_ring_ctx *ctx, struct io_provide_buf *pbuf,
buf->bgid = pbuf->bgid;
addr += pbuf->len;
bid++;
- cond_resched();
}

return i ? 0 : -ENOMEM;
diff --git a/io_uring/sqpoll.c b/io_uring/sqpoll.c
index bd6c2c7959a5..b297b7b8047e 100644
--- a/io_uring/sqpoll.c
+++ b/io_uring/sqpoll.c
@@ -212,7 +212,6 @@ static bool io_sqd_handle_event(struct io_sq_data *sqd)
mutex_unlock(&sqd->lock);
if (signal_pending(current))
did_sig = get_signal(&ksig);
- cond_resched();
mutex_lock(&sqd->lock);
}
return did_sig || test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state);
@@ -258,8 +257,11 @@ static int io_sq_thread(void *data)
if (sqt_spin)
timeout = jiffies + sqd->sq_thread_idle;
if (unlikely(need_resched())) {
+ /*
+ * Drop the mutex and reacquire so a reschedule can
+ * happen on unlock.
+ */
mutex_unlock(&sqd->lock);
- cond_resched();
mutex_lock(&sqd->lock);
}
continue;
diff --git a/io_uring/tctx.c b/io_uring/tctx.c
index c043fe93a3f2..1bf58f01e50c 100644
--- a/io_uring/tctx.c
+++ b/io_uring/tctx.c
@@ -181,10 +181,8 @@ __cold void io_uring_clean_tctx(struct io_uring_task *tctx)
struct io_tctx_node *node;
unsigned long index;

- xa_for_each(&tctx->xa, index, node) {
+ xa_for_each(&tctx->xa, index, node)
io_uring_del_tctx_node(index);
- cond_resched();
- }
if (wq) {
/*
* Must be after io_uring_del_tctx_node() (removes nodes under
--
2.31.1