[PATCH v2 6/6] RFC: io_uring getdents: test returning an EOF flag in CQE

From: Dominique Martinet
Date: Wed May 10 2023 - 06:54:33 EST


This turns out to be very slightly faster than an extra call to
getdents, but in practice it doesn't seem to be much of an improvement, as
the trailing getdents will return almost immediately and be absorbed by the
scheduling noise in a find-like context (my "server" is too noisy to
get proper benchmarks out, but results look slightly better with this in
async mode, and almost identical in the NOWAIT path).

If the user is waiting for the end of a single directory, though, it might
be worth it, so I am including the patch for comments.
(in particular I'm not really happy that the flag has become in-out for
vfs_getdents, especially when the getdents64 syscall does not use it,
but I don't see much other way around it)

If this approach is acceptable/wanted then this patch will be split down
further (at least dir_context/vfs_getdents, kernfs, libfs, uring in four
separate commits)

Signed-off-by: Dominique Martinet <asmadeus@xxxxxxxxxxxxx>
---
fs/internal.h | 2 +-
fs/kernfs/dir.c | 1 +
fs/libfs.c | 9 ++++++---
fs/readdir.c | 10 ++++++----
include/linux/fs.h | 2 ++
include/uapi/linux/io_uring.h | 2 ++
io_uring/fs.c | 8 ++++++--
7 files changed, 24 insertions(+), 10 deletions(-)

diff --git a/fs/internal.h b/fs/internal.h
index 0264b001d99a..0b1552c7a870 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -267,4 +267,4 @@ void mnt_idmap_put(struct mnt_idmap *idmap);
struct linux_dirent64;

int vfs_getdents(struct file *file, struct linux_dirent64 __user *dirent,
- unsigned int count, unsigned long flags);
+ unsigned int count, unsigned long *flags);
diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index 5a5b3e7881bf..53a6b4804c34 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -1860,6 +1860,7 @@ static int kernfs_fop_readdir(struct file *file, struct dir_context *ctx)
up_read(&root->kernfs_rwsem);
file->private_data = NULL;
ctx->pos = INT_MAX;
+ ctx->flags |= DIR_CONTEXT_F_EOD;
return 0;
}

diff --git a/fs/libfs.c b/fs/libfs.c
index a3c7e42d90a7..b2a95dadffbd 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -208,10 +208,12 @@ int dcache_readdir(struct file *file, struct dir_context *ctx)
p = &next->d_child;
}
spin_lock(&dentry->d_lock);
- if (next)
+ if (next) {
list_move_tail(&cursor->d_child, &next->d_child);
- else
+ } else {
list_del_init(&cursor->d_child);
+ ctx->flags |= DIR_CONTEXT_F_EOD;
+ }
spin_unlock(&dentry->d_lock);
dput(next);

@@ -1347,7 +1349,8 @@ static loff_t empty_dir_llseek(struct file *file, loff_t offset, int whence)

static int empty_dir_readdir(struct file *file, struct dir_context *ctx)
{
- dir_emit_dots(file, ctx);
+ if (dir_emit_dots(file, ctx))
+ ctx->flags |= DIR_CONTEXT_F_EOD;
return 0;
}

diff --git a/fs/readdir.c b/fs/readdir.c
index 1311b89d75e1..be75a2154b4f 100644
--- a/fs/readdir.c
+++ b/fs/readdir.c
@@ -358,14 +358,14 @@ static bool filldir64(struct dir_context *ctx, const char *name, int namlen,
* @file : pointer to file struct of directory
* @dirent : pointer to user directory structure
* @count : size of buffer
- * @flags : additional dir_context flags
+ * @flags : pointer to additional dir_context flags
*/
int vfs_getdents(struct file *file, struct linux_dirent64 __user *dirent,
- unsigned int count, unsigned long flags)
+ unsigned int count, unsigned long *flags)
{
struct getdents_callback64 buf = {
.ctx.actor = filldir64,
- .ctx.flags = flags,
+ .ctx.flags = flags ? *flags : 0,
.count = count,
.current_dir = dirent
};
@@ -384,6 +384,8 @@ int vfs_getdents(struct file *file, struct linux_dirent64 __user *dirent,
else
error = count - buf.count;
}
+ if (flags)
+ *flags = buf.ctx.flags;
return error;
}

@@ -397,7 +399,7 @@ SYSCALL_DEFINE3(getdents64, unsigned int, fd,
if (!f.file)
return -EBADF;

- error = vfs_getdents(f.file, dirent, count, 0);
+ error = vfs_getdents(f.file, dirent, count, NULL);

fdput_pos(f);
return error;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index f7de2b5ca38e..d1e31bccfb4f 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1723,8 +1723,10 @@ struct dir_context {
* flags for dir_context flags
* DIR_CONTEXT_F_NOWAIT: Request non-blocking iterate
* (requires file->f_mode & FMODE_NOWAIT)
+ * DIR_CONTEXT_F_EOD: Signal directory has been fully iterated, set by the fs
*/
#define DIR_CONTEXT_F_NOWAIT 0x1
+#define DIR_CONTEXT_F_EOD 0x2

/*
* These flags let !MMU mmap() govern direct device mapping vs immediate
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 35d0de18d893..35877132027e 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -381,11 +381,13 @@ struct io_uring_cqe {
* IORING_CQE_F_SOCK_NONEMPTY If set, more data to read after socket recv
* IORING_CQE_F_NOTIF Set for notification CQEs. Can be used to distinct
* them from sends.
+ * IORING_CQE_F_EOF If set, file or directory has reached end of file.
*/
#define IORING_CQE_F_BUFFER (1U << 0)
#define IORING_CQE_F_MORE (1U << 1)
#define IORING_CQE_F_SOCK_NONEMPTY (1U << 2)
#define IORING_CQE_F_NOTIF (1U << 3)
+#define IORING_CQE_F_EOF (1U << 4)

enum {
IORING_CQE_BUFFER_SHIFT = 16,
diff --git a/io_uring/fs.c b/io_uring/fs.c
index b15ec81c1ed2..f6222b0148ef 100644
--- a/io_uring/fs.c
+++ b/io_uring/fs.c
@@ -322,6 +322,7 @@ int io_getdents(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_getdents *gd = io_kiocb_to_cmd(req, struct io_getdents);
unsigned long getdents_flags = 0;
+ u32 cqe_flags = 0;
int ret;

if (issue_flags & IO_URING_F_NONBLOCK) {
@@ -338,13 +339,16 @@ int io_getdents(struct io_kiocb *req, unsigned int issue_flags)
goto out;
}

- ret = vfs_getdents(req->file, gd->dirent, gd->count, getdents_flags);
+ ret = vfs_getdents(req->file, gd->dirent, gd->count, &getdents_flags);
out:
if (ret == -EAGAIN &&
(issue_flags & IO_URING_F_NONBLOCK))
return -EAGAIN;

- io_req_set_res(req, ret, 0);
+ if (getdents_flags & DIR_CONTEXT_F_EOD)
+ cqe_flags |= IORING_CQE_F_EOF;
+
+ io_req_set_res(req, ret, cqe_flags);
return 0;
}


--
2.39.2