Re: [RFC PATCH 2/3] add statmnt(2) syscall

From: Miklos Szeredi
Date: Tue Sep 19 2023 - 06:55:12 EST


On Tue, 19 Sept 2023 at 11:07, Christian Brauner <brauner@xxxxxxxxxx> wrote:
>
> On Tue, Sep 19, 2023 at 10:02:17AM +0200, Miklos Szeredi wrote:

> > This is where the thread started. Knowing the size of the buffer is
> > no good, since the needed buffer could change between calls.
>
> The same problem would exist for the single buffer. Realistically, users
> will most often simply use a fixed size PATH_MAX buffer that will cover
> most cases and fallback to allocating a larger buffer in case things go
> awry.

Exactly. A large buffer will work in 99.99% of the cases. Good
quality implementations will deal with the 0.01% as well, but
optimizing that case is nonsense.

> Really, the main things I care about are 64 bit alignment of the whole
> struct, typed __u64 pointers

Okay.

> with __u32 size for mnt_root and mnt_point

Unnecessary if the strings are nul terminated.

> and that we please spell out "mount" and not use "mnt": so statmount
> because the new mount api uses "mount" (move_mount(), mount_setattr(),
> fsmount(), MOUNT_ATTR_*) almost everywhere.

Okay.

Incremental below.

Also pushed to:

git://git.kernel.org/pub/scm/linux/kernel/git/mszeredi/vfs.git#statmount

Thanks,
Miklos


diff --git a/arch/x86/entry/syscalls/syscall_64.tbl
b/arch/x86/entry/syscalls/syscall_64.tbl
index 0d9a47b0ce9b..a1b3ce7d22cc 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -375,8 +375,8 @@
451 common cachestat sys_cachestat
452 common fchmodat2 sys_fchmodat2
453 64 map_shadow_stack sys_map_shadow_stack
-454 common statmnt sys_statmnt
-455 common listmnt sys_listmnt
+454 common statmount sys_statmount
+455 common listmount sys_listmount

#
# Due to a historical design error, certain syscalls are numbered differently
diff --git a/fs/namespace.c b/fs/namespace.c
index 5362b1ffb26f..803003052bfb 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -68,9 +68,8 @@ static u64 event;
static DEFINE_IDA(mnt_id_ida);
static DEFINE_IDA(mnt_group_ida);

-/* Don't allow confusion with mount ID allocated wit IDA */
-#define OLD_MNT_ID_MAX UINT_MAX
-static atomic64_t mnt_id_ctr = ATOMIC64_INIT(OLD_MNT_ID_MAX);
+/* Don't allow confusion with old 32bit mount ID */
+static atomic64_t mnt_id_ctr = ATOMIC64_INIT(1ULL << 32);

static struct hlist_head *mount_hashtable __read_mostly;
static struct hlist_head *mountpoint_hashtable __read_mostly;
@@ -4679,14 +4678,6 @@ SYSCALL_DEFINE5(mount_setattr, int, dfd, const
char __user *, path,
return err;
}

-static bool mnt_id_match(struct mount *mnt, u64 id)
-{
- if (id <= OLD_MNT_ID_MAX)
- return id == mnt->mnt_id;
- else
- return id == mnt->mnt_id_unique;
-}
-
struct vfsmount *lookup_mnt_in_ns(u64 id, struct mnt_namespace *ns)
{
struct mount *mnt;
@@ -4694,7 +4685,7 @@ struct vfsmount *lookup_mnt_in_ns(u64 id, struct
mnt_namespace *ns)

lock_ns_list(ns);
list_for_each_entry(mnt, &ns->list, mnt_list) {
- if (!mnt_is_cursor(mnt) && mnt_id_match(mnt, id)) {
+ if (!mnt_is_cursor(mnt) && id == mnt->mnt_id_unique) {
res = &mnt->mnt;
break;
}
@@ -4747,7 +4738,7 @@ static int stmt_string_seq(struct stmt_state *s,
stmt_func_t func)
}

static void stmt_string(struct stmt_state *s, u64 mask, stmt_func_t func,
- stmt_str_t *str)
+ u64 *str)
{
int ret = s->pos >= s->bufsize ? -EOVERFLOW : 0;
struct statmnt *sm = &s->sm;
@@ -4767,8 +4758,7 @@ static void stmt_string(struct stmt_state *s,
u64 mask, stmt_func_t func,
if (copy_to_user(s->buf + s->pos, seq->buf, seq->count)) {
s->err = -EFAULT;
} else {
- str->off = s->pos;
- str->len = seq->count - 1;
+ *str = (unsigned long) (s->buf + s->pos);
s->pos += seq->count;
}
}
@@ -4899,39 +4889,10 @@ static int stmt_fs_type(struct stmt_state *s)
struct super_block *sb = s->mnt->mnt_sb;

seq_puts(seq, sb->s_type->name);
- if (sb->s_subtype) {
- seq_putc(seq, '.');
- seq_puts(seq, sb->s_subtype);
- }
- return 0;
-}
-
-static int stmt_sb_opts(struct stmt_state *s)
-{
- struct seq_file *seq = &s->seq;
- struct super_block *sb = s->mnt->mnt_sb;
- char *p, *end, *next, *u = seq->buf;
- int err;
-
- if (!sb->s_op->show_options)
- return 0;
-
- err = sb->s_op->show_options(seq, s->mnt->mnt_root);
- if (err || seq_has_overflowed(seq) || !seq->count)
- return err;
-
- end = seq->buf + seq->count;
- *end = '\0';
- for (p = seq->buf + 1; p < end; p = next + 1) {
- next = strchrnul(p, ',');
- *next = '\0';
- u += string_unescape(p, u, 0, UNESCAPE_OCTAL) + 1;
- }
- seq->count = u - 1 - seq->buf;
return 0;
}

-static int do_statmnt(struct stmt_state *s)
+static int do_statmount(struct stmt_state *s)
{
struct statmnt *sm = &s->sm;
struct mount *m = real_mount(s->mnt);
@@ -4946,7 +4907,6 @@ static int do_statmnt(struct stmt_state *s)
stmt_string(s, STMT_MNT_ROOT, stmt_mnt_root, &sm->mnt_root);
stmt_string(s, STMT_MOUNTPOINT, stmt_mountpoint, &sm->mountpoint);
stmt_string(s, STMT_FS_TYPE, stmt_fs_type, &sm->fs_type);
- stmt_string(s, STMT_SB_OPTS, stmt_sb_opts, &sm->sb_opts);

if (s->err)
return s->err;
@@ -4957,7 +4917,7 @@ static int do_statmnt(struct stmt_state *s)
return 0;
}

-SYSCALL_DEFINE5(statmnt, u64, mnt_id,
+SYSCALL_DEFINE5(statmount, u64, mnt_id,
u64, mask, struct statmnt __user *, buf,
size_t, bufsize, unsigned int, flags)
{
@@ -4980,7 +4940,7 @@ SYSCALL_DEFINE5(statmnt, u64, mnt_id,
};

get_fs_root(current->fs, &s.root);
- err = do_statmnt(&s);
+ err = do_statmount(&s);
path_put(&s.root);
}
up_read(&namespace_sem);
@@ -4988,19 +4948,25 @@ SYSCALL_DEFINE5(statmnt, u64, mnt_id,
return err;
}

-static long do_listmnt(struct vfsmount *mnt, u64 __user *buf, size_t bufsize,
- const struct path *root)
+static long do_listmount(struct vfsmount *mnt, u64 __user *buf, size_t bufsize,
+ const struct path *root, unsigned int flags)
{
struct mount *r, *m = real_mount(mnt);
struct path rootmnt = { .mnt = root->mnt, .dentry =
root->mnt->mnt_root };
long ctr = 0;
+ bool reachable_only = true;

- if (!capable(CAP_SYS_ADMIN) &&
- !is_path_reachable(m, mnt->mnt_root, &rootmnt))
- return -EPERM;
+ if (flags & LISTMOUNT_UNREACHABLE) {
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ reachable_only = false;
+ }
+
+ if (reachable_only && !is_path_reachable(m, mnt->mnt_root, &rootmnt))
+ return capable(CAP_SYS_ADMIN) ? 0 : -EPERM;

list_for_each_entry(r, &m->mnt_mounts, mnt_child) {
- if (!capable(CAP_SYS_ADMIN) &&
+ if (reachable_only &&
!is_path_reachable(r, r->mnt.mnt_root, root))
continue;

@@ -5015,14 +4981,14 @@ static long do_listmnt(struct vfsmount *mnt,
u64 __user *buf, size_t bufsize,
return ctr;
}

-SYSCALL_DEFINE4(listmnt, u64, mnt_id, u64 __user *, buf, size_t, bufsize,
+SYSCALL_DEFINE4(listmount, u64, mnt_id, u64 __user *, buf, size_t, bufsize,
unsigned int, flags)
{
struct vfsmount *mnt;
struct path root;
long err;

- if (flags)
+ if (flags & ~LISTMOUNT_UNREACHABLE)
return -EINVAL;

down_read(&namespace_sem);
@@ -5030,7 +4996,7 @@ SYSCALL_DEFINE4(listmnt, u64, mnt_id, u64 __user
*, buf, size_t, bufsize,
err = -ENOENT;
if (mnt) {
get_fs_root(current->fs, &root);
- err = do_listmnt(mnt, buf, bufsize, &root);
+ err = do_listmount(mnt, buf, bufsize, &root, flags);
path_put(&root);
}
up_read(&namespace_sem);
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 5d776cdb6f18..a35fb7b2c842 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -74,6 +74,7 @@ struct landlock_ruleset_attr;
enum landlock_rule_type;
struct cachestat_range;
struct cachestat;
+struct statmnt;

#include <linux/types.h>
#include <linux/aio_abi.h>
@@ -408,11 +409,11 @@ asmlinkage long sys_statfs64(const char __user
*path, size_t sz,
asmlinkage long sys_fstatfs(unsigned int fd, struct statfs __user *buf);
asmlinkage long sys_fstatfs64(unsigned int fd, size_t sz,
struct statfs64 __user *buf);
-asmlinkage long sys_statmnt(u64 mnt_id, u64 mask,
- struct statmnt __user *buf, size_t bufsize,
- unsigned int flags);
-asmlinkage long sys_listmnt(u64 mnt_id, u64 __user *buf, size_t bufsize,
- unsigned int flags);
+asmlinkage long sys_statmount(u64 mnt_id, u64 mask,
+ struct statmnt __user *buf, size_t bufsize,
+ unsigned int flags);
+asmlinkage long sys_listmount(u64 mnt_id, u64 __user *buf, size_t bufsize,
+ unsigned int flags);
asmlinkage long sys_truncate(const char __user *path, long length);
asmlinkage long sys_ftruncate(unsigned int fd, unsigned long length);
#if BITS_PER_LONG == 32
diff --git a/include/uapi/asm-generic/unistd.h
b/include/uapi/asm-generic/unistd.h
index a2b41370f603..8df6a747e21a 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -823,11 +823,11 @@ __SYSCALL(__NR_cachestat, sys_cachestat)
#define __NR_fchmodat2 452
__SYSCALL(__NR_fchmodat2, sys_fchmodat2)

-#define __NR_statmnt 454
-__SYSCALL(__NR_statmnt, sys_statmnt)
+#define __NR_statmount 454
+__SYSCALL(__NR_statmount, sys_statmount)

-#define __NR_listmnt 455
-__SYSCALL(__NR_listmnt, sys_listmnt)
+#define __NR_listmount 455
+__SYSCALL(__NR_listmount, sys_listmount)

#undef __NR_syscalls
#define __NR_syscalls 456
diff --git a/include/uapi/linux/mount.h b/include/uapi/linux/mount.h
index 4ec7308a9259..d98b41024507 100644
--- a/include/uapi/linux/mount.h
+++ b/include/uapi/linux/mount.h
@@ -138,11 +138,6 @@ struct mount_attr {
/* List of all mount_attr versions. */
#define MOUNT_ATTR_SIZE_VER0 32 /* sizeof first published struct */

-struct stmt_str {
- __u32 off;
- __u32 len;
-};
-
struct statmnt {
__u64 mask; /* What results were written [uncond] */
__u32 sb_dev_major; /* Device ID */
@@ -159,11 +154,10 @@ struct statmnt {
__u64 mnt_peer_group; /* ID of shared peer group */
__u64 mnt_master; /* Mount receives propagation from this ID */
__u64 propagate_from; /* Propagation from in current namespace */
- __u64 __spare[20];
- struct stmt_str mnt_root; /* Root of mount relative to
root of fs */
- struct stmt_str mountpoint; /* Mountpoint relative to root
of process */
- struct stmt_str fs_type; /* Filesystem type[.subtype] */
- struct stmt_str sb_opts; /* Super block string options
(nul delimted) */
+ __u64 mnt_root; /* [str] Root of mount relative to root of fs */
+ __u64 mountpoint; /* [str] Mountpoint relative to root
of process */
+ __u64 fs_type; /* [srt] Filesystem type */
+ __u64 __spare[49];
};

#define STMT_SB_BASIC 0x00000001U /* Want/got sb_... */
@@ -172,6 +166,8 @@ struct statmnt {
#define STMT_MNT_ROOT 0x00000008U /* Want/got mnt_root */
#define STMT_MOUNTPOINT 0x00000010U /* Want/got
mountpoint */
#define STMT_FS_TYPE 0x00000020U /* Want/got fs_type */
-#define STMT_SB_OPTS 0x00000040U /* Want/got sb_opts */
+
+/* listmount(2) flags */
+#define LISTMOUNT_UNREACHABLE 0x01 /* List unreachable mounts too */

#endif /* _UAPI_LINUX_MOUNT_H */