Re: [PATCH] fuse: enable larger read buffers for readdir.

From: Miklos Szeredi
Date: Thu Jul 27 2023 - 11:21:32 EST


On Wed, Jul 26, 2023 at 08:25:48PM +0200, Jaco Kroon wrote:

> glibc uses a 128KiB buffer for getdents64, so I'm not sure >128KiB here
> makes sense. Or if these two buffers are even directly related.

Definitely related. See how the dir_emit() works: if the entry doesn't fit in
the userspace buffer, then ->readdir() returns. If entries remain in the fuse
buffer at that time, they are simply discarded. Simply put, making the
buffer too large is going to be a waste of resources and at some point will be
slower than the single page buffer.

Note: discarding received entries is the only possible action if caching is
disabled. With caching enabled it would make sense to pre-fill the cache with
the overflowed entries and use them in the next iteration. However the caching
code is not prepared for using partial caches at the moment.

The best strategy would be to find the optimal buffer size based on the size of
the userspace buffer. Putting that info into struct dir_context doesn't sound
too complicated...

Here's a patch. It doesn't touch readdir. Simply setting the fuse buffer size
to the userspace buffer size should work, the record sizes are similar (fuse's
is slightly larger than libc's, so no overflow should ever happen).

Thanks,
Miklos


---
fs/exportfs/expfs.c | 1 +
fs/overlayfs/readdir.c | 12 +++++++++++-
fs/readdir.c | 29 ++++++++++++++---------------
include/linux/fs.h | 7 +++++++
4 files changed, 33 insertions(+), 16 deletions(-)

--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -286,6 +286,7 @@ static int get_name(const struct path *p
};
struct getdents_callback buffer = {
.ctx.actor = filldir_one,
+ .ctx.count = INT_MAX,
.name = name,
};

--- a/fs/overlayfs/readdir.c
+++ b/fs/overlayfs/readdir.c
@@ -347,6 +347,7 @@ static int ovl_dir_read_merged(struct de
struct path realpath;
struct ovl_readdir_data rdd = {
.ctx.actor = ovl_fill_merge,
+ .ctx.count = INT_MAX,
.dentry = dentry,
.list = list,
.root = root,
@@ -557,6 +558,7 @@ static int ovl_dir_read_impure(const str
struct ovl_cache_entry *p, *n;
struct ovl_readdir_data rdd = {
.ctx.actor = ovl_fill_plain,
+ .ctx.count = INT_MAX,
.list = list,
.root = root,
};
@@ -658,6 +660,7 @@ static bool ovl_fill_real(struct dir_con
struct ovl_readdir_translate *rdt =
container_of(ctx, struct ovl_readdir_translate, ctx);
struct dir_context *orig_ctx = rdt->orig_ctx;
+ bool res;

if (rdt->parent_ino && strcmp(name, "..") == 0) {
ino = rdt->parent_ino;
@@ -672,7 +675,10 @@ static bool ovl_fill_real(struct dir_con
name, namelen, rdt->xinowarn);
}

- return orig_ctx->actor(orig_ctx, name, namelen, offset, ino, d_type);
+ res = orig_ctx->actor(orig_ctx, name, namelen, offset, ino, d_type);
+ ctx->count = orig_ctx->count;
+
+ return res;
}

static bool ovl_is_impure_dir(struct file *file)
@@ -699,6 +705,7 @@ static int ovl_iterate_real(struct file
const struct ovl_layer *lower_layer = ovl_layer_lower(dir);
struct ovl_readdir_translate rdt = {
.ctx.actor = ovl_fill_real,
+ .ctx.count = ctx->count,
.orig_ctx = ctx,
.xinobits = ovl_xino_bits(ofs),
.xinowarn = ovl_xino_warn(ofs),
@@ -1058,6 +1065,7 @@ int ovl_check_d_type_supported(const str
int err;
struct ovl_readdir_data rdd = {
.ctx.actor = ovl_check_d_type,
+ .ctx.count = INT_MAX,
.d_type_supported = false,
};

@@ -1079,6 +1087,7 @@ static int ovl_workdir_cleanup_recurse(s
struct ovl_cache_entry *p;
struct ovl_readdir_data rdd = {
.ctx.actor = ovl_fill_plain,
+ .ctx.count = INT_MAX,
.list = &list,
};
bool incompat = false;
@@ -1163,6 +1172,7 @@ int ovl_indexdir_cleanup(struct ovl_fs *
struct ovl_cache_entry *p;
struct ovl_readdir_data rdd = {
.ctx.actor = ovl_fill_plain,
+ .ctx.count = INT_MAX,
.list = &list,
};

--- a/fs/readdir.c
+++ b/fs/readdir.c
@@ -184,6 +184,7 @@ SYSCALL_DEFINE3(old_readdir, unsigned in
struct fd f = fdget_pos(fd);
struct readdir_callback buf = {
.ctx.actor = fillonedir,
+ .ctx.count = 1, /* Hint to fs: just one entry. */
.dirent = dirent
};

@@ -215,7 +216,6 @@ struct getdents_callback {
struct dir_context ctx;
struct linux_dirent __user * current_dir;
int prev_reclen;
- int count;
int error;
};

@@ -234,7 +234,7 @@ static bool filldir(struct dir_context *
if (unlikely(buf->error))
return false;
buf->error = -EINVAL; /* only used if we fail.. */
- if (reclen > buf->count)
+ if (reclen > ctx->count)
return false;
d_ino = ino;
if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) {
@@ -259,7 +259,7 @@ static bool filldir(struct dir_context *

buf->current_dir = (void __user *)dirent + reclen;
buf->prev_reclen = reclen;
- buf->count -= reclen;
+ ctx->count -= reclen;
return true;
efault_end:
user_write_access_end();
@@ -274,7 +274,7 @@ SYSCALL_DEFINE3(getdents, unsigned int,
struct fd f;
struct getdents_callback buf = {
.ctx.actor = filldir,
- .count = count,
+ .ctx.count = count,
.current_dir = dirent
};
int error;
@@ -293,7 +293,7 @@ SYSCALL_DEFINE3(getdents, unsigned int,
if (put_user(buf.ctx.pos, &lastdirent->d_off))
error = -EFAULT;
else
- error = count - buf.count;
+ error = count - buf.ctx.count;
}
fdput_pos(f);
return error;
@@ -303,7 +303,6 @@ struct getdents_callback64 {
struct dir_context ctx;
struct linux_dirent64 __user * current_dir;
int prev_reclen;
- int count;
int error;
};

@@ -321,7 +320,7 @@ static bool filldir64(struct dir_context
if (unlikely(buf->error))
return false;
buf->error = -EINVAL; /* only used if we fail.. */
- if (reclen > buf->count)
+ if (reclen > ctx->count)
return false;
prev_reclen = buf->prev_reclen;
if (prev_reclen && signal_pending(current))
@@ -341,7 +340,7 @@ static bool filldir64(struct dir_context

buf->prev_reclen = reclen;
buf->current_dir = (void __user *)dirent + reclen;
- buf->count -= reclen;
+ ctx->count -= reclen;
return true;

efault_end:
@@ -357,7 +356,7 @@ SYSCALL_DEFINE3(getdents64, unsigned int
struct fd f;
struct getdents_callback64 buf = {
.ctx.actor = filldir64,
- .count = count,
+ .ctx.count = count,
.current_dir = dirent
};
int error;
@@ -377,7 +376,7 @@ SYSCALL_DEFINE3(getdents64, unsigned int
if (put_user(d_off, &lastdirent->d_off))
error = -EFAULT;
else
- error = count - buf.count;
+ error = count - buf.ctx.count;
}
fdput_pos(f);
return error;
@@ -442,6 +441,7 @@ COMPAT_SYSCALL_DEFINE3(old_readdir, unsi
struct fd f = fdget_pos(fd);
struct compat_readdir_callback buf = {
.ctx.actor = compat_fillonedir,
+ .ctx.count = 1, /* Hint to fs: just one entry. */
.dirent = dirent
};

@@ -467,7 +467,6 @@ struct compat_getdents_callback {
struct dir_context ctx;
struct compat_linux_dirent __user *current_dir;
int prev_reclen;
- int count;
int error;
};

@@ -486,7 +485,7 @@ static bool compat_filldir(struct dir_co
if (unlikely(buf->error))
return false;
buf->error = -EINVAL; /* only used if we fail.. */
- if (reclen > buf->count)
+ if (reclen > ctx->count)
return false;
d_ino = ino;
if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) {
@@ -510,7 +509,7 @@ static bool compat_filldir(struct dir_co

buf->prev_reclen = reclen;
buf->current_dir = (void __user *)dirent + reclen;
- buf->count -= reclen;
+ ctx->count -= reclen;
return true;
efault_end:
user_write_access_end();
@@ -525,8 +524,8 @@ COMPAT_SYSCALL_DEFINE3(getdents, unsigne
struct fd f;
struct compat_getdents_callback buf = {
.ctx.actor = compat_filldir,
+ .ctx.count = count,
.current_dir = dirent,
- .count = count
};
int error;

@@ -544,7 +543,7 @@ COMPAT_SYSCALL_DEFINE3(getdents, unsigne
if (put_user(buf.ctx.pos, &lastdirent->d_off))
error = -EFAULT;
else
- error = count - buf.count;
+ error = count - buf.ctx.count;
}
fdput_pos(f);
return error;
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1719,6 +1719,13 @@ typedef bool (*filldir_t)(struct dir_con
struct dir_context {
filldir_t actor;
loff_t pos;
+ /*
+ * Filesystems MUST NOT MODIFY count, but may use as a hint:
+ * 0 unknown
+ * > 0 space in buffer (assume at least one entry)
+ * INT_MAX unlimited
+ */
+ int count;
};

/*