Re: [GIT PULL] bcachefs

From: Jens Axboe
Date: Tue Jun 27 2023 - 13:16:10 EST


On 6/26/23 8:59 PM, Jens Axboe wrote:
> On 6/26/23 8:05 PM, Kent Overstreet wrote:
>> On Mon, Jun 26, 2023 at 07:13:54PM -0600, Jens Axboe wrote:
>>> Doesn't reproduce for me with XFS. The above ktest doesn't work for me
>>> either:
>>
>> It just popped for me on xfs, but it took half an hour or so of looping
>> vs. 30 seconds on bcachefs.
>
> OK, I'll try and leave it running overnight and see if I can get it to
> trigger.

I did manage to reproduce it, and also managed to get bcachefs to run
the test. But I had to add:

diff --git a/check b/check
index 5f9f1a6bec88..6d74bd4933bd 100755
--- a/check
+++ b/check
@@ -283,7 +283,7 @@ while [ $# -gt 0 ]; do
case "$1" in
-\? | -h | --help) usage ;;

- -nfs|-afs|-glusterfs|-cifs|-9p|-fuse|-virtiofs|-pvfs2|-tmpfs|-ubifs)
+ -nfs|-afs|-glusterfs|-cifs|-9p|-fuse|-virtiofs|-pvfs2|-tmpfs|-ubifs|-bcachefs)
FSTYP="${1:1}"
;;
-overlay)

to ktest/tests/xfstests/ and run it with -bcachefs; otherwise it kept
failing because it assumed the filesystem was XFS.

I suspected this was just a timing issue, and it looks like that's
exactly what it is. Looking at the test case, it'll randomly kill -9
fsstress, and if that happens while we have io_uring IO pending, we
can't process those completions via task_work in the exiting task
(current is PF_EXITING). Instead they get pushed to the fallback work,
which runs out of line from a workqueue. If we hit that case AND the
timing is such that the fallback work hasn't run yet, we'll still be
holding a file reference under the mount point and umount will fail
with -EBUSY.
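
To make the ordering concrete, here's a rough userspace sketch of that
sequence. This is not the fsstress test and not a reliable reproducer,
just an illustration; the /mnt/test path, the file name, and the sleep
are placeholders, and error handling is omitted:

/* Sketch only: child keeps buffered io_uring writes in flight, gets
 * SIGKILLed, parent reaps it, then tries to umount right away. */
#include <liburing.h>
#include <fcntl.h>
#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <sys/mount.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	pid_t pid = fork();

	if (pid == 0) {
		/* child: queue buffered writes until killed */
		struct io_uring ring;
		static char buf[4096];
		off_t off = 0;
		int fd;

		io_uring_queue_init(64, &ring, 0);
		fd = open("/mnt/test/file", O_CREAT | O_WRONLY, 0644);
		memset(buf, 0xaa, sizeof(buf));
		for (;;) {
			struct io_uring_sqe *sqe = io_uring_get_sqe(&ring);

			if (!sqe) {
				io_uring_submit(&ring);
				continue;
			}
			io_uring_prep_write(sqe, fd, buf, sizeof(buf), off);
			off += sizeof(buf);
			io_uring_submit(&ring);
		}
	}

	usleep(10000);			/* let the child queue some IO */
	kill(pid, SIGKILL);		/* what the test does to fsstress */
	waitpid(pid, NULL, 0);		/* child reaped, file may still be pinned */
	if (umount("/mnt/test") < 0)	/* can see EBUSY in the window above */
		perror("umount");
	return 0;
}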

As far as I can tell, this can happen with aio as well; it's just harder
to hit. If the fput happens while the task is exiting, then fput will
end up being delayed through a workqueue as well. The test case assumes
that once it's reaped the exit of the killed task that all files are
released, which isn't necessarily true if they are done out-of-line.
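
For reference, the fput() side looks roughly like this. It's paraphrased
and simplified from fs/file_table.c, not the literal code (the
interrupt-context check is left out), but it shows why the release of an
exiting task's files isn't ordered against the parent reaping it:

/*
 * Paraphrased sketch of the final-fput path: once the exiting task has
 * run exit_task_work(), task_work_add() fails and the release is punted
 * to a workqueue instead of running before the task can be reaped.
 */
static void final_fput_sketch(struct file *file)
{
	if (!(current->flags & PF_KTHREAD)) {
		init_task_work(&file->f_rcuhead, ____fput);
		if (!task_work_add(current, &file->f_rcuhead, TWA_RESUME))
			return;	/* runs before returning to userspace */
		/* task already past exit_task_work(): fall through */
	}
	/* deferred: global workqueue, timing not tied to task exit */
	if (llist_add(&file->f_llist, &delayed_fput_list))
		schedule_delayed_work(&delayed_fput_work, 1);
}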

For io_uring specifically, it may make sense to wait on the fallback
work when the task is exiting. The below patch does this, and should fix
the issue. But I'm not fully convinced that this is really needed, as I
do think this can happen without io_uring as well. It just doesn't
trigger right now because the test does buffered IO, and aio completes
buffered IO synchronously. That means there's either no window for aio
to hit it without O_DIRECT, or the window is small enough that it hasn't
been hit.


diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 3bca7a79efda..7abad5cb2131 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -150,7 +150,6 @@ static void io_clean_op(struct io_kiocb *req);
 static void io_queue_sqe(struct io_kiocb *req);
 static void io_move_task_work_from_local(struct io_ring_ctx *ctx);
 static void __io_submit_flush_completions(struct io_ring_ctx *ctx);
-static __cold void io_fallback_tw(struct io_uring_task *tctx);
 
 struct kmem_cache *req_cachep;
 
@@ -1248,6 +1247,49 @@ static inline struct llist_node *io_llist_cmpxchg(struct llist_head *head,
 	return cmpxchg(&head->first, old, new);
 }
 
+#define NR_FALLBACK_CTX	8
+
+static __cold void io_flush_fallback(struct io_ring_ctx **ctxs, int *nr_ctx)
+{
+	int i;
+
+	for (i = 0; i < *nr_ctx; i++) {
+		struct io_ring_ctx *ctx = ctxs[i];
+
+		flush_delayed_work(&ctx->fallback_work);
+		percpu_ref_put(&ctx->refs);
+	}
+	*nr_ctx = 0;
+}
+
+static __cold void io_flush_fallback_add(struct io_ring_ctx *ctx,
+					 struct io_ring_ctx **ctxs, int *nr_ctx)
+{
+	percpu_ref_get(&ctx->refs);
+	ctxs[(*nr_ctx)++] = ctx;
+	if (*nr_ctx == NR_FALLBACK_CTX)
+		io_flush_fallback(ctxs, nr_ctx);
+}
+
+static __cold void io_fallback_tw(struct io_uring_task *tctx, bool sync)
+{
+	struct llist_node *node = llist_del_all(&tctx->task_list);
+	struct io_ring_ctx *ctxs[NR_FALLBACK_CTX];
+	struct io_kiocb *req;
+	int nr_ctx = 0;
+
+	while (node) {
+		req = container_of(node, struct io_kiocb, io_task_work.node);
+		node = node->next;
+		if (sync)
+			io_flush_fallback_add(req->ctx, ctxs, &nr_ctx);
+		if (llist_add(&req->io_task_work.node,
+			      &req->ctx->fallback_llist))
+			schedule_delayed_work(&req->ctx->fallback_work, 1);
+	}
+	io_flush_fallback(ctxs, &nr_ctx);
+}
+
 void tctx_task_work(struct callback_head *cb)
 {
 	struct io_tw_state ts = {};
@@ -1260,7 +1302,7 @@ void tctx_task_work(struct callback_head *cb)
 	unsigned int count = 0;
 
 	if (unlikely(current->flags & PF_EXITING)) {
-		io_fallback_tw(tctx);
+		io_fallback_tw(tctx, true);
 		return;
 	}
 
@@ -1289,20 +1331,6 @@ void tctx_task_work(struct callback_head *cb)
 	trace_io_uring_task_work_run(tctx, count, loops);
 }
 
-static __cold void io_fallback_tw(struct io_uring_task *tctx)
-{
-	struct llist_node *node = llist_del_all(&tctx->task_list);
-	struct io_kiocb *req;
-
-	while (node) {
-		req = container_of(node, struct io_kiocb, io_task_work.node);
-		node = node->next;
-		if (llist_add(&req->io_task_work.node,
-			      &req->ctx->fallback_llist))
-			schedule_delayed_work(&req->ctx->fallback_work, 1);
-	}
-}
-
 static void io_req_local_work_add(struct io_kiocb *req, unsigned flags)
 {
 	struct io_ring_ctx *ctx = req->ctx;
@@ -1377,7 +1405,7 @@ void __io_req_task_work_add(struct io_kiocb *req, unsigned flags)
 	if (likely(!task_work_add(req->task, &tctx->task_work, ctx->notify_method)))
 		return;
 
-	io_fallback_tw(tctx);
+	io_fallback_tw(tctx, false);
 }
 
 static void __cold io_move_task_work_from_local(struct io_ring_ctx *ctx)

--
Jens Axboe