Re: Upcoming: Notifications, FS notifications and fsinfo()

From: David Howells
Date: Tue Mar 31 2020 - 15:47:35 EST


Miklos Szeredi <miklos@xxxxxxxxxx> wrote:

> Cool, thanks for testing. Unfortunately the test-fsinfo-perf.c file
> didn't make it into the patch. Can you please refresh and resend?

Oops - I forgot to add it. See attached.

David
---
commit b7239021cb7660bf328bb7fcce05e3a35ce5842b
Author: David Howells <dhowells@xxxxxxxxxx>
Date: Tue Mar 31 14:39:07 2020 +0100

Performance test Miklós's patch vs fsinfo

diff --git a/fs/Makefile b/fs/Makefile
index b6bf2424c7f7..ac0627176db1 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -137,3 +137,4 @@ obj-$(CONFIG_EFIVAR_FS) += efivarfs/
obj-$(CONFIG_EROFS_FS) += erofs/
obj-$(CONFIG_VBOXSF_FS) += vboxsf/
obj-$(CONFIG_ZONEFS_FS) += zonefs/
+obj-y += mountfs/
diff --git a/fs/mount.h b/fs/mount.h
index 063f41bc2e93..89b091fc482f 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -82,6 +82,7 @@ struct mount {
atomic_t mnt_subtree_notifications; /* Number of notifications in subtree */
struct watch_list *mnt_watchers; /* Watches on dentries within this mount */
#endif
+ struct mountfs_entry *mnt_mountfs_entry;
} __randomize_layout;

#define MNT_NS_INTERNAL ERR_PTR(-EINVAL) /* distinct from any mnt_namespace */
@@ -177,3 +178,11 @@ static inline void notify_mount(struct mount *triggered,
{
}
#endif
+
+/* Take / release namespace_sem for reading on behalf of code outside namespace.c. */
+void mnt_namespace_lock_read(void);
+void mnt_namespace_unlock_read(void);
+
+/*
+ * mountfs hooks: create and remove the per-mount bookkeeping entry, and
+ * resolve a mount to its directory inside the internal mountfs instance.
+ */
+void mountfs_create(struct mount *mnt);
+extern void mountfs_remove(struct mount *mnt);
+int mountfs_lookup_internal(struct vfsmount *m, struct path *path);
+
diff --git a/fs/mountfs/Makefile b/fs/mountfs/Makefile
new file mode 100644
index 000000000000..35a65e9a966f
--- /dev/null
+++ b/fs/mountfs/Makefile
@@ -0,0 +1 @@
+obj-y += super.o
diff --git a/fs/mountfs/super.c b/fs/mountfs/super.c
new file mode 100644
index 000000000000..82c01eb6154d
--- /dev/null
+++ b/fs/mountfs/super.c
@@ -0,0 +1,502 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include "../pnode.h"
+#include <linux/fs.h>
+#include <linux/kref.h>
+#include <linux/nsproxy.h>
+#include <linux/fs_struct.h>
+#include <linux/fs_context.h>
+
+/* Value stored in sb->s_magic by mountfs_fill_super(). */
+#define MOUNTFS_SUPER_MAGIC 0x4e756f4d
+
+/* Held around every lookup in, insertion into and removal from mountfs_entries. */
+static DEFINE_SPINLOCK(mountfs_lock);
+/* All registered entries, keyed by mount ID. */
+static struct rb_root mountfs_entries = RB_ROOT;
+/* The kernel-internal mount of mountfs, set up by mountfs_init(). */
+static struct vfsmount *mountfs_mnt __read_mostly;
+
+/*
+ * Per-mount bookkeeping object.  It is reference counted independently of the
+ * mount: mountfs inodes hold references through i_private, so an entry can
+ * outlive the mount it describes (->mnt is then NULL).
+ */
+struct mountfs_entry {
+ struct kref kref; /* freed by mountfs_entry_release() on last put */
+ struct mount *mnt; /* set to NULL by mountfs_remove() */
+ struct rb_node node; /* link in mountfs_entries */
+ int id; /* copy of mnt->mnt_id; the rb-tree key */
+};
+
+/*
+ * Names of the attribute files that appear in every per-mount directory.
+ * The position in this array (plus one) is the attribute's inode-number
+ * offset from the directory, see MOUNTFS_INO().
+ */
+static const char *mountfs_attrs[] = {
+ "root", "mountpoint", "id", "parent", "options", "children",
+ "group", "master", "propagate_from"
+};
+
+/*
+ * Inode number of the directory for mount @id.  Each mount gets a block of
+ * ARRAY_SIZE(mountfs_attrs) + 1 consecutive numbers: the directory first,
+ * then one per attribute.
+ * NOTE(review): @id is not parenthesized, so the cast binds only to the first
+ * token of a compound argument; all visible callers pass a plain identifier
+ * or member access.
+ */
+#define MOUNTFS_INO(id) (((unsigned long) id + 1) * \
+ (ARRAY_SIZE(mountfs_attrs) + 1))
+
+/*
+ * kref release callback: free the mountfs_entry that embeds @kref.
+ * File-local; it is only ever passed to kref_put() from this file.
+ */
+static void mountfs_entry_release(struct kref *kref)
+{
+	kfree(container_of(kref, struct mountfs_entry, kref));
+}
+
+/*
+ * Drop one reference on @entry; the entry is freed when the last reference
+ * goes away.  File-local helper, not declared in any header.
+ */
+static void mountfs_entry_put(struct mountfs_entry *entry)
+{
+	kref_put(&entry->kref, mountfs_entry_release);
+}
+
+/*
+ * Tell whether @entry still refers to a mount and that mount belongs to the
+ * mount namespace of the calling task.  The mount pointer is sampled inside
+ * an RCU read-side section.
+ */
+static bool mountfs_entry_visible(struct mountfs_entry *entry)
+{
+	struct mount *m;
+	bool ret;
+
+	rcu_read_lock();
+	m = rcu_dereference(entry->mnt);
+	ret = m && m->mnt_ns == current->nsproxy->mnt_ns;
+	rcu_read_unlock();
+
+	return ret;
+}
+/*
+ * seq_file ->show() shared by all per-mount attribute files.
+ *
+ * The attribute is chosen by the name of the dentry the file was opened
+ * through; sf->private is the mountfs_entry attached to the inode.  Mount
+ * state is read with namespace_sem held for reading.
+ *
+ * Returns 0 on success, -ENODEV if the entry no longer has a mount or the
+ * mount is not in a namespace, -EIO for an unrecognized attribute name, or
+ * the error reported by ->show_path() / seq_path_root().
+ */
+static int mountfs_attr_show(struct seq_file *sf, void *v)
+{
+	const char *name = sf->file->f_path.dentry->d_name.name;
+	struct mountfs_entry *entry = sf->private;
+	struct mount *mnt;
+	struct vfsmount *m;
+	struct super_block *sb;
+	struct path root;
+	int tmp, err = -ENODEV;
+
+	mnt_namespace_lock_read();
+
+	mnt = entry->mnt;
+	if (!mnt || !mnt->mnt_ns)
+		goto out;
+
+	err = 0;
+	m = &mnt->mnt;
+	sb = m->mnt_sb;
+
+	if (strcmp(name, "root") == 0) {
+		if (sb->s_op->show_path) {
+			err = sb->s_op->show_path(sf, m->mnt_root);
+		} else {
+			seq_dentry(sf, m->mnt_root, " \t\n\\");
+		}
+		seq_putc(sf, '\n');
+	} else if (strcmp(name, "mountpoint") == 0) {
+		struct path mnt_path = { .dentry = m->mnt_root, .mnt = m };
+
+		get_fs_root(current->fs, &root);
+		err = seq_path_root(sf, &mnt_path, &root, " \t\n\\");
+		if (err == SEQ_SKIP) {
+			seq_puts(sf, "(unreachable)");
+			err = 0;
+		}
+		seq_putc(sf, '\n');
+		path_put(&root);
+	} else if (strcmp(name, "id") == 0) {
+		seq_printf(sf, "%i\n", mnt->mnt_id);
+	} else if (strcmp(name, "parent") == 0) {
+		tmp = rcu_dereference(mnt->mnt_parent)->mnt_id;
+		seq_printf(sf, "%i\n", tmp);
+	} else if (strcmp(name, "options") == 0) {
+		int mnt_flags = READ_ONCE(m->mnt_flags);
+
+		seq_puts(sf, mnt_flags & MNT_READONLY ? "ro" : "rw");
+		seq_mnt_opts(sf, mnt_flags);
+		seq_putc(sf, '\n');
+	} else if (strcmp(name, "children") == 0) {
+		struct mount *child;
+		bool first = true;
+
+		/* Comma-separated child IDs; nothing at all if there are none. */
+		list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) {
+			if (!first)
+				seq_putc(sf, ',');
+			else
+				first = false;
+			seq_printf(sf, "%i", child->mnt_id);
+		}
+		if (!first)
+			seq_putc(sf, '\n');
+	} else if (strcmp(name, "group") == 0) {
+		if (IS_MNT_SHARED(mnt))
+			seq_printf(sf, "%i\n", mnt->mnt_group_id);
+	} else if (strcmp(name, "master") == 0) {
+		if (IS_MNT_SLAVE(mnt)) {
+			tmp = rcu_dereference(mnt->mnt_master)->mnt_group_id;
+			seq_printf(sf, "%i\n", tmp);
+		}
+	} else if (strcmp(name, "propagate_from") == 0) {
+		if (IS_MNT_SLAVE(mnt)) {
+			get_fs_root(current->fs, &root);
+			tmp = get_dominating_id(mnt, &root);
+			if (tmp)
+				seq_printf(sf, "%i\n", tmp);
+			/* Drop the reference taken by get_fs_root(). */
+			path_put(&root);
+		}
+	} else {
+		WARN_ON(1);
+		err = -EIO;
+	}
+out:
+	mnt_namespace_unlock_read();
+
+	return err;
+}
+
+/*
+ * ->open() for attribute files: set up a single-record seq_file whose private
+ * data is the mountfs_entry stored in the inode.
+ */
+static int mountfs_attr_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, mountfs_attr_show, inode->i_private);
+}
+
+/* File operations for attribute files; content is produced by mountfs_attr_show(). */
+static const struct file_operations mountfs_attr_fops = {
+ .open = mountfs_attr_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+/* Convert an rb-tree node embedded in a mountfs_entry back to the entry. */
+static struct mountfs_entry *mountfs_node_to_entry(struct rb_node *node)
+{
+ return rb_entry(node, struct mountfs_entry, node);
+}
+
+/*
+ * Walk mountfs_entries looking for @id.
+ *
+ * Returns the link slot the walk stopped at: it holds the matching node if
+ * @id is present, otherwise it is an empty (NULL) slot where a node for @id
+ * can be linked.  *@parent receives the last node visited - the match itself
+ * when found, else the node owning the empty slot - or NULL for an empty
+ * tree.  All visible callers hold mountfs_lock.
+ */
+static struct rb_node **mountfs_find_node(int id, struct rb_node **parent)
+{
+ struct rb_node **link = &mountfs_entries.rb_node;
+
+ *parent = NULL;
+ while (*link) {
+ struct mountfs_entry *entry = mountfs_node_to_entry(*link);
+
+ *parent = *link;
+ if (id < entry->id)
+ link = &entry->node.rb_left;
+ else if (id > entry->id)
+ link = &entry->node.rb_right;
+ else
+ break;
+ }
+ return link;
+}
+
+/*
+ * Allocate a bookkeeping entry for @mnt, index it by mount ID and attach it
+ * to the mount.  Allocation failure and a duplicate ID are both reported
+ * with a warning and leave mnt->mnt_mountfs_entry untouched.
+ */
+void mountfs_create(struct mount *mnt)
+{
+	struct rb_node **slot, *parent;
+	struct mountfs_entry *new;
+
+	new = kzalloc(sizeof(*new), GFP_KERNEL);
+	if (!new) {
+		WARN(1, "failed to allocate mountfs entry");
+		return;
+	}
+	kref_init(&new->kref);
+	new->id = mnt->mnt_id;
+	new->mnt = mnt;
+
+	spin_lock(&mountfs_lock);
+	slot = mountfs_find_node(new->id, &parent);
+	if (WARN_ON(*slot)) {
+		/* An entry with this ID already exists; discard the new one. */
+		kfree(new);
+	} else {
+		rb_link_node(&new->node, parent, slot);
+		rb_insert_color(&new->node, &mountfs_entries);
+		mnt->mnt_mountfs_entry = new;
+	}
+	spin_unlock(&mountfs_lock);
+}
+
+/*
+ * Detach @mnt from mountfs: clear the entry's back pointer, unlink it from
+ * the tree and drop the reference held on behalf of the mount.  Does nothing
+ * if the mount never got an entry.  The entry itself is only freed once all
+ * other references (e.g. from inodes) are gone.
+ */
+void mountfs_remove(struct mount *mnt)
+{
+ struct mountfs_entry *entry = mnt->mnt_mountfs_entry;
+
+ if (!entry)
+ return;
+ spin_lock(&mountfs_lock);
+ entry->mnt = NULL;
+ rb_erase(&entry->node, &mountfs_entries);
+ spin_unlock(&mountfs_lock);
+
+ mountfs_entry_put(entry);
+
+ mnt->mnt_mountfs_entry = NULL;
+}
+
+/*
+ * Look up the entry whose mount ID is spelled by @name.
+ *
+ * Returns the entry with an extra reference held, or NULL if @name is not a
+ * canonical decimal number in [0, INT_MAX], no such entry exists, or the
+ * mount is not visible in the caller's mount namespace.
+ */
+static struct mountfs_entry *mountfs_get_entry(const char *name)
+{
+ struct mountfs_entry *entry = NULL;
+ struct rb_node **link, *dummy;
+ unsigned long mnt_id;
+ char buf[32];
+ int ret;
+
+ ret = kstrtoul(name, 10, &mnt_id);
+ if (ret || mnt_id > INT_MAX)
+ return NULL;
+
+ /* Re-format and compare so only the exact "%lu" spelling is accepted. */
+ snprintf(buf, sizeof(buf), "%lu", mnt_id);
+ if (strcmp(buf, name) != 0)
+ return NULL;
+
+ spin_lock(&mountfs_lock);
+ link = mountfs_find_node(mnt_id, &dummy);
+ if (*link) {
+ entry = mountfs_node_to_entry(*link);
+ if (!mountfs_entry_visible(entry))
+ entry = NULL;
+ else
+ kref_get(&entry->kref);
+ }
+ spin_unlock(&mountfs_lock);
+
+ return entry;
+}
+
+static void mountfs_init_inode(struct inode *inode, umode_t mode);
+
+/*
+ * Create an inode for @entry and splice it onto @dentry.
+ *
+ * @idx is 0 for the per-mount directory and 1..ARRAY_SIZE(mountfs_attrs) for
+ * an attribute file.  The caller passes in one reference on @entry: it is
+ * handed to the new inode (via i_private) or dropped here if the inode
+ * cannot be allocated.
+ *
+ * Returns whatever d_splice_alias() returns, or ERR_PTR(-ENOMEM).
+ */
+static struct dentry *mountfs_lookup_entry(struct dentry *dentry,
+ struct mountfs_entry *entry,
+ int idx)
+{
+ struct inode *inode;
+
+ inode = new_inode(dentry->d_sb);
+ if (!inode) {
+ mountfs_entry_put(entry);
+ return ERR_PTR(-ENOMEM);
+ }
+ inode->i_private = entry;
+ inode->i_ino = MOUNTFS_INO(entry->id) + idx;
+ mountfs_init_inode(inode, idx ? S_IFREG | 0444 : S_IFDIR | 0555);
+ return d_splice_alias(inode, dentry);
+
+}
+
+/*
+ * ->lookup() for both directory levels.
+ *
+ * In a per-mount directory (dir->i_private set) the name must be one of
+ * mountfs_attrs[]; in the root directory it must be the decimal ID of a
+ * mount visible to the caller.  Anything else yields ERR_PTR(-ENOENT).
+ */
+static struct dentry *mountfs_lookup(struct inode *dir, struct dentry *dentry,
+				     unsigned int flags)
+{
+	struct mountfs_entry *entry = dir->i_private;
+	int i = 0;
+
+	if (entry) {
+		for (i = 0; i < ARRAY_SIZE(mountfs_attrs); i++)
+			if (strcmp(mountfs_attrs[i], dentry->d_name.name) == 0)
+				break;
+		/* Unknown attribute name: the file does not exist. */
+		if (i == ARRAY_SIZE(mountfs_attrs))
+			return ERR_PTR(-ENOENT);
+		/* Attribute inode offsets are 1-based; 0 is the directory. */
+		i++;
+		kref_get(&entry->kref);
+	} else {
+		entry = mountfs_get_entry(dentry->d_name.name);
+		if (!entry)
+			return ERR_PTR(-ENOENT);
+	}
+
+	return mountfs_lookup_entry(dentry, entry, i);
+}
+
+/*
+ * ->d_revalidate(): decide whether a cached dentry may still be used.
+ * Returns 1 (valid), 0 (invalid, the mount was removed) or -ENOENT (a mount
+ * directory whose mount is not in the caller's namespace).
+ */
+static int mountfs_d_revalidate(struct dentry *dentry, unsigned int flags)
+{
+ struct mountfs_entry *entry = dentry->d_inode->i_private;
+
+ /* root: valid */
+ if (!entry)
+ return 1;
+
+ /* removed: invalid */
+ if (!entry->mnt)
+ return 0;
+
+ /* attribute or visible in this namespace: valid */
+ if (!d_can_lookup(dentry) || mountfs_entry_visible(entry))
+ return 1;
+
+ /* invisible in this namespace: valid but deny entry */
+ return -ENOENT;
+}
+
+/*
+ * ->iterate_shared() for both directory levels.
+ *
+ * Per-mount directory: positions 2.. map onto mountfs_attrs[0..].  The inode
+ * number reported for attribute index k is MOUNTFS_INO(id) + k + 1, matching
+ * the i_ino that mountfs_lookup() assigns to the same attribute.
+ *
+ * Root directory: the position encodes "mount ID + 2".  Each pass takes
+ * mountfs_lock, picks the next entry visible to the caller, remembers the ID
+ * of the node after it as the resume point, drops the lock and emits the
+ * entry.
+ * NOTE(review): mountfs_find_node() hands back the last node visited, which
+ * for a missing ID may be a node with a smaller ID; confirm resumption is
+ * correct when the remembered ID has been removed in the meantime.
+ */
+static int mountfs_readdir(struct file *file, struct dir_context *ctx)
+{
+	struct rb_node *node;
+	struct mountfs_entry *entry = file_inode(file)->i_private;
+	char name[32];
+	const char *s;
+	unsigned int len, pos, id;
+
+	if (ctx->pos - 2 > INT_MAX || !dir_emit_dots(file, ctx))
+		return 0;
+
+	if (entry) {
+		while (ctx->pos - 2 < ARRAY_SIZE(mountfs_attrs)) {
+			s = mountfs_attrs[ctx->pos - 2];
+			/* pos - 1 is the 1-based attribute index used by lookup. */
+			if (!dir_emit(ctx, s, strlen(s),
+				      MOUNTFS_INO(entry->id) + ctx->pos - 1,
+				      DT_REG))
+				break;
+			ctx->pos++;
+		}
+		return 0;
+	}
+
+	pos = ctx->pos - 2;
+	do {
+		spin_lock(&mountfs_lock);
+		mountfs_find_node(pos, &node);
+		/* Sentinel meaning "no further entry"; ends the outer loop. */
+		pos = 1U + INT_MAX;
+		do {
+			if (!node) {
+				spin_unlock(&mountfs_lock);
+				goto out;
+			}
+			entry = mountfs_node_to_entry(node);
+			node = rb_next(node);
+		} while (!mountfs_entry_visible(entry));
+		if (node)
+			pos = mountfs_node_to_entry(node)->id;
+		id = entry->id;
+		spin_unlock(&mountfs_lock);
+
+		len = snprintf(name, sizeof(name), "%i", id);
+		ctx->pos = id + 2;
+		if (!dir_emit(ctx, name, len, MOUNTFS_INO(id), DT_DIR))
+			return 0;
+	} while (pos <= INT_MAX);
+out:
+	ctx->pos = pos + 2;
+	return 0;
+}
+
+/*
+ * Resolve mount @m to its directory inside the internal mountfs instance.
+ *
+ * On success *@path is set to { mountfs_mnt, dentry } with a reference held
+ * on the dentry, and 0 is returned.  Returns -ENOENT if the mount has no
+ * mountfs entry (entries are not created for SB_KERNMOUNT mounts or for
+ * mnt_clone_internal() clones, and mountfs_create() can fail to allocate),
+ * -ENOMEM if a dentry or inode cannot be allocated, or the error from
+ * mountfs_lookup_entry().
+ * NOTE(review): no reference is taken on mountfs_mnt here; confirm callers
+ * do not expect path->mnt to be pinned.
+ */
+int mountfs_lookup_internal(struct vfsmount *m, struct path *path)
+{
+	char name[32];
+	struct qstr this = { .name = name };
+	struct mount *mnt = real_mount(m);
+	struct mountfs_entry *entry = mnt->mnt_mountfs_entry;
+	struct dentry *dentry, *old, *root = mountfs_mnt->mnt_root;
+
+	/* Without an entry there is nothing to show and nothing to pin. */
+	if (!entry)
+		return -ENOENT;
+
+	this.len = snprintf(name, sizeof(name), "%i", mnt->mnt_id);
+	dentry = d_hash_and_lookup(root, &this);
+	/* A cached dentry for a previous mount that used the same ID is stale. */
+	if (dentry && dentry->d_inode->i_private != entry) {
+		d_invalidate(dentry);
+		dput(dentry);
+		dentry = NULL;
+	}
+	if (!dentry) {
+		dentry = d_alloc(root, &this);
+		if (!dentry)
+			return -ENOMEM;
+
+		/* Reference consumed by mountfs_lookup_entry(). */
+		kref_get(&entry->kref);
+		old = mountfs_lookup_entry(dentry, entry, 0);
+		if (old) {
+			dput(dentry);
+			if (IS_ERR(old))
+				return PTR_ERR(old);
+			dentry = old;
+		}
+	}
+
+	*path = (struct path) { .mnt = mountfs_mnt, .dentry = dentry };
+	return 0;
+}
+
+/* Dentry operations installed as sb->s_d_op by mountfs_fill_super(). */
+static const struct dentry_operations mountfs_dops = {
+ .d_revalidate = mountfs_d_revalidate,
+};
+
+/* Inode operations for directories (root and per-mount). */
+static const struct inode_operations mountfs_iops = {
+ .lookup = mountfs_lookup,
+};
+
+/* File operations for directories (root and per-mount). */
+static const struct file_operations mountfs_fops = {
+ .iterate_shared = mountfs_readdir,
+ .read = generic_read_dir,
+ .llseek = generic_file_llseek,
+};
+
+/*
+ * Fill in mode, timestamps and operations of a fresh mountfs inode.
+ * Regular files become attribute files with a nominal size of one page;
+ * everything else is set up as a directory.
+ */
+static void mountfs_init_inode(struct inode *inode, umode_t mode)
+{
+	inode->i_mode = mode;
+	inode->i_ctime = current_time(inode);
+	inode->i_mtime = inode->i_ctime;
+	inode->i_atime = inode->i_mtime;
+
+	if (!S_ISREG(mode)) {
+		inode->i_op = &mountfs_iops;
+		inode->i_fop = &mountfs_fops;
+		return;
+	}
+
+	inode->i_size = PAGE_SIZE;
+	inode->i_fop = &mountfs_attr_fops;
+}
+
+/*
+ * ->evict_inode(): tear down the inode and drop the entry reference it held
+ * through i_private (the root inode has none).
+ */
+static void mountfs_evict_inode(struct inode *inode)
+{
+ struct mountfs_entry *entry = inode->i_private;
+
+ clear_inode(inode);
+ if (entry)
+ mountfs_entry_put(entry);
+}
+
+/* Superblock operations; inodes are not cached once unused (generic_delete_inode). */
+static const struct super_operations mountfs_sops = {
+ .statfs = simple_statfs,
+ .drop_inode = generic_delete_inode,
+ .evict_inode = mountfs_evict_inode,
+};
+
+/*
+ * Initialize the mountfs superblock and create its root directory.
+ * Returns 0, or -ENOMEM if the root inode or dentry cannot be allocated.
+ */
+static int mountfs_fill_super(struct super_block *sb, struct fs_context *fc)
+{
+ struct inode *root;
+
+ sb->s_iflags |= SB_I_NOEXEC | SB_I_NODEV;
+ sb->s_blocksize = PAGE_SIZE;
+ sb->s_blocksize_bits = PAGE_SHIFT;
+ sb->s_magic = MOUNTFS_SUPER_MAGIC;
+ sb->s_time_gran = 1;
+ sb->s_shrink.seeks = 0;
+ sb->s_op = &mountfs_sops;
+ sb->s_d_op = &mountfs_dops;
+
+ root = new_inode(sb);
+ if (!root)
+ return -ENOMEM;
+
+ root->i_ino = 1;
+ /*
+  * NOTE(review): the root is 0444 while per-mount directories are 0555;
+  * without search permission unprivileged users presumably cannot
+  * traverse it - confirm this is intended.
+  */
+ mountfs_init_inode(root, S_IFDIR | 0444);
+
+ sb->s_root = d_make_root(root);
+ if (!sb->s_root)
+ return -ENOMEM;
+
+ return 0;
+}
+
+/* ->get_tree(): obtain the superblock via get_tree_single(), filled in by mountfs_fill_super(). */
+static int mountfs_get_tree(struct fs_context *fc)
+{
+ return get_tree_single(fc, mountfs_fill_super);
+}
+
+/* fs_context operations; only ->get_tree is provided. */
+static const struct fs_context_operations mountfs_context_ops = {
+ .get_tree = mountfs_get_tree,
+};
+
+/* ->init_fs_context(): install the context ops and mark the context global. */
+static int mountfs_init_fs_context(struct fs_context *fc)
+{
+ fc->ops = &mountfs_context_ops;
+ fc->global = true;
+ return 0;
+}
+
+/* Filesystem type registered under the name "mountfs" by mountfs_init(). */
+static struct file_system_type mountfs_fs_type = {
+ .name = "mountfs",
+ .init_fs_context = mountfs_init_fs_context,
+ .kill_sb = kill_anon_super,
+};
+
+/*
+ * Boot-time setup: register the filesystem type and create the internal
+ * kernel mount stored in mountfs_mnt.  If the mount fails, the type is
+ * unregistered again and the mount error is returned.
+ */
+static int __init mountfs_init(void)
+{
+	int err = register_filesystem(&mountfs_fs_type);
+
+	if (err)
+		return err;
+
+	mountfs_mnt = kern_mount(&mountfs_fs_type);
+	if (!IS_ERR(mountfs_mnt))
+		return 0;
+
+	err = PTR_ERR(mountfs_mnt);
+	unregister_filesystem(&mountfs_fs_type);
+	return err;
+}
+fs_initcall(mountfs_init);
diff --git a/fs/namespace.c b/fs/namespace.c
index 5427e732c1bf..a05a2885a90e 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -962,6 +962,8 @@ struct vfsmount *vfs_create_mount(struct fs_context *fc)

if (fc->sb_flags & SB_KERNMOUNT)
mnt->mnt.mnt_flags = MNT_INTERNAL;
+ else
+ mountfs_create(mnt);

atomic_inc(&fc->root->d_sb->s_active);
mnt->mnt.mnt_sb = fc->root->d_sb;
@@ -1033,7 +1035,7 @@ vfs_submount(const struct dentry *mountpoint, struct file_system_type *type,
}
EXPORT_SYMBOL_GPL(vfs_submount);

-static struct mount *clone_mnt(struct mount *old, struct dentry *root,
+static struct mount *clone_mnt_common(struct mount *old, struct dentry *root,
int flag)
{
struct super_block *sb = old->mnt.mnt_sb;
@@ -1100,6 +1102,17 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,
return ERR_PTR(err);
}

+/*
+ * Clone a mount via clone_mnt_common() and, on success, register the clone
+ * with mountfs.  Callers that must not create a mountfs entry call
+ * clone_mnt_common() directly.
+ */
+static struct mount *clone_mnt(struct mount *old, struct dentry *root,
+ int flag)
+{
+ struct mount *mnt = clone_mnt_common(old, root, flag);
+
+ if (!IS_ERR(mnt))
+ mountfs_create(mnt);
+
+ return mnt;
+}
+
static void cleanup_mnt(struct mount *mnt)
{
struct hlist_node *p;
@@ -1112,6 +1125,7 @@ static void cleanup_mnt(struct mount *mnt)
* so mnt_get_writers() below is safe.
*/
WARN_ON(mnt_get_writers(mnt));
+
if (unlikely(mnt->mnt_pins.first))
mnt_pin_kill(mnt);
hlist_for_each_entry_safe(m, p, &mnt->mnt_stuck_children, mnt_umount) {
@@ -1197,6 +1211,8 @@ static void mntput_no_expire(struct mount *mnt)
unlock_mount_hash();
shrink_dentry_list(&list);

+ mountfs_remove(mnt);
+
if (likely(!(mnt->mnt.mnt_flags & MNT_INTERNAL))) {
struct task_struct *task = current;
if (likely(!(task->flags & PF_KTHREAD))) {
@@ -1263,13 +1279,14 @@ EXPORT_SYMBOL(path_is_mountpoint);
struct vfsmount *mnt_clone_internal(const struct path *path)
{
struct mount *p;
- p = clone_mnt(real_mount(path->mnt), path->dentry, CL_PRIVATE);
+ p = clone_mnt_common(real_mount(path->mnt), path->dentry, CL_PRIVATE);
if (IS_ERR(p))
return ERR_CAST(p);
p->mnt.mnt_flags |= MNT_INTERNAL;
return &p->mnt;
}

+
#ifdef CONFIG_PROC_FS
/* iterator; we want it to have access to namespace_sem, thus here... */
static void *m_start(struct seq_file *m, loff_t *pos)
@@ -1411,6 +1428,16 @@ static inline void namespace_lock(void)
down_write(&namespace_sem);
}

+/* Acquire namespace_sem for reading. */
+void mnt_namespace_lock_read(void)
+{
+ down_read(&namespace_sem);
+}
+
+/* Release a read hold on namespace_sem taken by mnt_namespace_lock_read(). */
+void mnt_namespace_unlock_read(void)
+{
+ up_read(&namespace_sem);
+}
+
enum umount_tree_flags {
UMOUNT_SYNC = 1,
UMOUNT_PROPAGATE = 2,
diff --git a/fs/proc/base.c b/fs/proc/base.c
index c7c64272b0fa..0477f8b51182 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -3092,6 +3092,7 @@ static const struct pid_entry tgid_base_stuff[] = {
DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
DIR("map_files", S_IRUSR|S_IXUSR, proc_map_files_inode_operations, proc_map_files_operations),
DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
+ DIR("fdmount", S_IRUSR|S_IXUSR, proc_fdmount_inode_operations, proc_fdmount_operations),
DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations),
#ifdef CONFIG_NET
DIR("net", S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations),
@@ -3497,6 +3498,7 @@ static const struct inode_operations proc_tid_comm_inode_operations = {
static const struct pid_entry tid_base_stuff[] = {
DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
+ DIR("fdmount", S_IRUSR|S_IXUSR, proc_fdmount_inode_operations, proc_fdmount_operations),
DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations),
#ifdef CONFIG_NET
DIR("net", S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations),
diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index 81882a13212d..94a57e178801 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -361,3 +361,85 @@ const struct file_operations proc_fdinfo_operations = {
.iterate_shared = proc_readfdinfo,
.llseek = generic_file_llseek,
};
+
+/*
+ * proc_get_link callback for /proc/<pid>/fdmount/<fd>.
+ *
+ * Finds the file behind the descriptor encoded in @dentry's inode, pins its
+ * path while the files table lock is held, and resolves the path's mount to
+ * a mountfs directory via mountfs_lookup_internal().
+ *
+ * Returns 0 with *@path filled in, -ENOENT if the task, its files table or
+ * the descriptor is gone, or the error from mountfs_lookup_internal().
+ */
+static int proc_fdmount_link(struct dentry *dentry, struct path *path)
+{
+ struct files_struct *files = NULL;
+ struct task_struct *task;
+ struct path fd_path;
+ int ret = -ENOENT;
+
+ task = get_proc_task(d_inode(dentry));
+ if (task) {
+ files = get_files_struct(task);
+ put_task_struct(task);
+ }
+
+ if (files) {
+ unsigned int fd = proc_fd(d_inode(dentry));
+ struct file *fd_file;
+
+ spin_lock(&files->file_lock);
+ fd_file = fcheck_files(files, fd);
+ if (fd_file) {
+ fd_path = fd_file->f_path;
+ path_get(&fd_path);
+ ret = 0;
+ }
+ spin_unlock(&files->file_lock);
+ put_files_struct(files);
+ }
+ /* fd_path is only initialized (and only used) when ret is 0. */
+ if (!ret) {
+ ret = mountfs_lookup_internal(fd_path.mnt, path);
+ path_put(&fd_path);
+ }
+
+ return ret;
+}
+
+/*
+ * Instantiate one /proc/<pid>/fdmount/<fd> entry as a symlink inode whose
+ * target is produced by proc_fdmount_link().  @ptr points to a struct
+ * fd_data carrying the descriptor number.
+ *
+ * Returns the result of d_splice_alias(), or ERR_PTR(-ENOENT) if the inode
+ * cannot be created.
+ */
+static struct dentry *proc_fdmount_instantiate(struct dentry *dentry,
+ struct task_struct *task, const void *ptr)
+{
+ const struct fd_data *data = ptr;
+ struct proc_inode *ei;
+ struct inode *inode;
+
+ inode = proc_pid_make_inode(dentry->d_sb, task, S_IFLNK | 0400);
+ if (!inode)
+ return ERR_PTR(-ENOENT);
+
+ ei = PROC_I(inode);
+ ei->fd = data->fd;
+
+ inode->i_op = &proc_pid_link_inode_operations;
+ inode->i_size = 64;
+
+ ei->op.proc_get_link = proc_fdmount_link;
+ tid_fd_update_inode(task, inode, 0);
+
+ d_set_d_op(dentry, &tid_fd_dentry_operations);
+ return d_splice_alias(inode, dentry);
+}
+
+/* ->lookup() for the fdmount directory; defers to the common fd lookup helper. */
+static struct dentry *
+proc_lookupfdmount(struct inode *dir, struct dentry *dentry, unsigned int flags)
+{
+ return proc_lookupfd_common(dir, dentry, proc_fdmount_instantiate);
+}
+
+/* ->iterate_shared() for the fdmount directory; defers to the common fd readdir helper. */
+static int proc_readfdmount(struct file *file, struct dir_context *ctx)
+{
+ return proc_readfd_common(file, ctx,
+ proc_fdmount_instantiate);
+}
+
+/* Inode operations of /proc/<pid>/fdmount, referenced from fs/proc/base.c. */
+const struct inode_operations proc_fdmount_inode_operations = {
+ .lookup = proc_lookupfdmount,
+ .setattr = proc_setattr,
+};
+
+/* Directory file operations of /proc/<pid>/fdmount, referenced from fs/proc/base.c. */
+const struct file_operations proc_fdmount_operations = {
+ .read = generic_read_dir,
+ .iterate_shared = proc_readfdmount,
+ .llseek = generic_file_llseek,
+};
diff --git a/fs/proc/fd.h b/fs/proc/fd.h
index f371a602bf58..9e087c833e65 100644
--- a/fs/proc/fd.h
+++ b/fs/proc/fd.h
@@ -10,6 +10,9 @@ extern const struct inode_operations proc_fd_inode_operations;
extern const struct file_operations proc_fdinfo_operations;
extern const struct inode_operations proc_fdinfo_inode_operations;

+extern const struct file_operations proc_fdmount_operations;
+extern const struct inode_operations proc_fdmount_inode_operations;
+
extern int proc_fd_permission(struct inode *inode, int mask);

static inline unsigned int proc_fd(struct inode *inode)
diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c
index 273ee82d8aa9..e634faa9160e 100644
--- a/fs/proc_namespace.c
+++ b/fs/proc_namespace.c
@@ -61,24 +61,6 @@ static int show_sb_opts(struct seq_file *m, struct super_block *sb)
return security_sb_show_options(m, sb);
}

-static void show_mnt_opts(struct seq_file *m, struct vfsmount *mnt)
-{
- static const struct proc_fs_info mnt_info[] = {
- { MNT_NOSUID, ",nosuid" },
- { MNT_NODEV, ",nodev" },
- { MNT_NOEXEC, ",noexec" },
- { MNT_NOATIME, ",noatime" },
- { MNT_NODIRATIME, ",nodiratime" },
- { MNT_RELATIME, ",relatime" },
- { 0, NULL }
- };
- const struct proc_fs_info *fs_infop;
-
- for (fs_infop = mnt_info; fs_infop->flag; fs_infop++) {
- if (mnt->mnt_flags & fs_infop->flag)
- seq_puts(m, fs_infop->str);
- }
-}

static inline void mangle(struct seq_file *m, const char *s)
{
@@ -120,7 +102,7 @@ static int show_vfsmnt(struct seq_file *m, struct vfsmount *mnt)
err = show_sb_opts(m, sb);
if (err)
goto out;
- show_mnt_opts(m, mnt);
+ seq_mnt_opts(m, mnt->mnt_flags);
if (sb->s_op->show_options)
err = sb->s_op->show_options(m, mnt_path.dentry);
seq_puts(m, " 0 0\n");
@@ -153,7 +135,7 @@ static int show_mountinfo(struct seq_file *m, struct vfsmount *mnt)
goto out;

seq_puts(m, mnt->mnt_flags & MNT_READONLY ? " ro" : " rw");
- show_mnt_opts(m, mnt);
+ seq_mnt_opts(m, mnt->mnt_flags);

/* Tagged fields ("foo:X" or "bar") */
if (IS_MNT_SHARED(r))
diff --git a/fs/seq_file.c b/fs/seq_file.c
index 1600034a929b..9726baba1732 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -15,6 +15,7 @@
#include <linux/cred.h>
#include <linux/mm.h>
#include <linux/printk.h>
+#include <linux/mount.h>
#include <linux/string_helpers.h>

#include <linux/uaccess.h>
@@ -548,6 +549,28 @@ int seq_dentry(struct seq_file *m, struct dentry *dentry, const char *esc)
}
EXPORT_SYMBOL(seq_dentry);

+/*
+ * Append the textual form of the per-mount option flags in @mnt_flags to @m.
+ * Each set flag contributes one ",name" token, in table order; the ro/rw
+ * state is not handled here.
+ */
+void seq_mnt_opts(struct seq_file *m, int mnt_flags)
+{
+	static const struct mnt_opt_name {
+		int flag;
+		const char *str;
+	} table[] = {
+		{ MNT_NOSUID, ",nosuid" },
+		{ MNT_NODEV, ",nodev" },
+		{ MNT_NOEXEC, ",noexec" },
+		{ MNT_NOATIME, ",noatime" },
+		{ MNT_NODIRATIME, ",nodiratime" },
+		{ MNT_RELATIME, ",relatime" },
+		{ 0, NULL }
+	};
+	const struct mnt_opt_name *opt;
+
+	/* The table is terminated by the zero-flag sentinel. */
+	for (opt = table; opt->flag; opt++)
+		if (mnt_flags & opt->flag)
+			seq_puts(m, opt->str);
+}
+
static void *single_start(struct seq_file *p, loff_t *pos)
{
return NULL + (*pos == 0);
diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h
index 770c2bf3aa43..9dd7812eb777 100644
--- a/include/linux/seq_file.h
+++ b/include/linux/seq_file.h
@@ -138,6 +138,7 @@ int seq_file_path(struct seq_file *, struct file *, const char *);
int seq_dentry(struct seq_file *, struct dentry *, const char *);
int seq_path_root(struct seq_file *m, const struct path *path,
const struct path *root, const char *esc);
+void seq_mnt_opts(struct seq_file *m, int mnt_flags);

int single_open(struct file *, int (*)(struct seq_file *, void *), void *);
int single_open_size(struct file *, int (*)(struct seq_file *, void *), void *, size_t);
diff --git a/samples/vfs/Makefile b/samples/vfs/Makefile
index 19be60ab950e..78deb8483d27 100644
--- a/samples/vfs/Makefile
+++ b/samples/vfs/Makefile
@@ -4,6 +4,7 @@
hostprogs := \
test-fsinfo \
test-fsmount \
+ test-fsinfo-perf \
test-mntinfo \
test-statx

@@ -12,6 +13,7 @@ always-y := $(hostprogs)
HOSTCFLAGS_test-fsinfo.o += -I$(objtree)/usr/include
HOSTLDLIBS_test-fsinfo += -static -lm
HOSTCFLAGS_test-mntinfo.o += -I$(objtree)/usr/include
+HOSTCFLAGS_test-fsinfo-perf.o += -I$(objtree)/usr/include

HOSTCFLAGS_test-fsmount.o += -I$(objtree)/usr/include
HOSTCFLAGS_test-statx.o += -I$(objtree)/usr/include
diff --git a/samples/vfs/test-fsinfo-perf.c b/samples/vfs/test-fsinfo-perf.c
new file mode 100644
index 000000000000..fba40737f768
--- /dev/null
+++ b/samples/vfs/test-fsinfo-perf.c
@@ -0,0 +1,361 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* Test the fsinfo() system call
+ *
+ * Copyright (C) 2020 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@xxxxxxxxxx)
+ */
+
+#define _GNU_SOURCE
+#define _ATFILE_SOURCE
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <unistd.h>
+#include <ctype.h>
+#include <errno.h>
+#include <time.h>
+#include <math.h>
+#include <fcntl.h>
+#include <sys/syscall.h>
+#include <sys/stat.h>
+#include <sys/mount.h>
+#include <sys/time.h>
+#include <linux/fsinfo.h>
+
+/* Fall back to an invalid syscall number if the headers do not define fsinfo. */
+#ifndef __NR_fsinfo
+#define __NR_fsinfo -1
+#endif
+
+/* Exit with perror(what) if a syscall-style result is -1. */
+#define ERR(ret, what) do { if ((long)(ret) == -1) { perror(what); exit(1); } } while(0)
+/* Exit if an allocation result is NULL. */
+#define OOM(ret) do { if (!(ret)) { perror(NULL); exit(1); } } while(0)
+
+/* Number of test mounts to create below base_path; overridable from argv[2]. */
+static int nr_mounts = 3;
+/* Directory under which the test mounts live; set from argv[1]. */
+static const char *base_path;
+
+/* Thin wrapper that invokes the fsinfo() system call directly via syscall(). */
+static __attribute__((unused))
+ssize_t fsinfo(int dfd, const char *filename,
+ struct fsinfo_params *params, size_t params_size,
+ void *result_buffer, size_t result_buf_size)
+{
+ return syscall(__NR_fsinfo, dfd, filename,
+ params, params_size,
+ result_buffer, result_buf_size);
+}
+
+/*
+ * Call @func once for each test mount, passing its index and its path
+ * "<base_path>/<index>".  Exits with status 2 if the path does not fit in
+ * the local buffer.
+ */
+static void iterate(void (*func)(int i, const char *))
+{
+	char name[4096];
+	int i, len;
+
+	for (i = 0; i < nr_mounts; i++) {
+		/* base_path comes from the command line, so bound the write. */
+		len = snprintf(name, sizeof(name), "%s/%d", base_path, i);
+		if (len < 0 || (size_t)len >= sizeof(name)) {
+			fprintf(stderr, "Path too long: %s/%d\n", base_path, i);
+			exit(2);
+		}
+		func(i, name);
+	}
+}
+
+/* Create directory @path, mount a tmpfs on it and make that mount private. @ix is unused. */
+static void make_mount(int ix, const char *path)
+{
+ ERR(mkdir(path, 0755), "mkdir");
+ ERR(mount("none", path, "tmpfs", 0, NULL), "mount");
+ ERR(mount("none", path, NULL, MS_PRIVATE, NULL), "mount");
+}
+
+/* atexit() handler: lazily detach the mount at base_path, reporting but not failing on error. */
+static void do_umount(void)
+{
+ printf("--- umount ---\n");
+ if (umount2(base_path, MNT_DETACH) == -1)
+ perror("umount");
+}
+
+/* Running total of the mount IDs seen by the current test; reset by main() before each test. */
+static unsigned long sum_mnt_id;
+
+/* Query the mount ID of @path with fsinfo(FSINFO_ATTR_MOUNT_INFO) and add it to sum_mnt_id. */
+static void get_mntid_by_fsinfo(int ix, const char *path)
+{
+ struct fsinfo_mount_info r;
+ struct fsinfo_params params = {
+ .flags = FSINFO_FLAGS_QUERY_PATH,
+ .request = FSINFO_ATTR_MOUNT_INFO,
+ };
+
+ ERR(fsinfo(AT_FDCWD, path, &params, sizeof(params), &r, sizeof(r)),
+ "fsinfo");
+ //printf("[%u] %u\n", ix, r.mnt_id);
+ sum_mnt_id += r.mnt_id;
+}
+
+/*
+ * Obtain the mount ID of @path the procfs way: open it with O_PATH, read
+ * /proc/self/fdinfo/<fd>, find the "mnt_id:" line and parse the number.
+ * The result is added to sum_mnt_id.  Exits with status 3 if the field is
+ * missing or malformed.
+ */
+static void get_mntid_by_proc(int ix, const char *path)
+{
+ unsigned int mnt_id;
+ ssize_t len;
+ char procfile[100], buffer[4096], *p, *nl;
+ int fd, fd2;
+
+ fd = open(path, O_PATH);
+ ERR(fd, "open/path");
+ sprintf(procfile, "/proc/self/fdinfo/%u", fd);
+ fd2 = open(procfile, O_RDONLY);
+ ERR(fd2, "open/proc");
+ len = read(fd2, buffer, sizeof(buffer) - 1);
+ ERR(len, "read");
+ buffer[len] = 0;
+ close(fd2);
+ close(fd);
+
+ /*
+  * Walk the buffer line by line.  "continue" jumps to the loop condition,
+  * which advances p to the next line; the loop ends with p == NULL when
+  * the lines run out, or via "break" with p still set on a match.
+  */
+ p = buffer;
+ do {
+ nl = strchr(p, '\n');
+ if (nl)
+ *nl++ = '\0';
+ else
+ nl = NULL;
+
+ if (strncmp(p, "mnt_id:", 7) != 0)
+ continue;
+ p += 7;
+ while (isblank(*p))
+ p++;
+ /* Have to allow for extra numbers being added to the line */
+ if (sscanf(p, "%u", &mnt_id) != 1) {
+ fprintf(stderr, "Bad format %s\n", procfile);
+ exit(3);
+ }
+ break;
+
+ } while ((p = nl));
+
+ if (!p) {
+ fprintf(stderr, "Missing field %s\n", procfile);
+ exit(3);
+ }
+
+ sum_mnt_id += mnt_id;
+ //printf("[%u] %u\n", ix, mnt_id);
+}
+
+/*
+ * Obtain the mount IDs of all test mounts by mount ID rather than by path:
+ * look up the ID of base_path, fetch the list of its children with
+ * FSINFO_ATTR_MOUNT_CHILDREN, then query FSINFO_ATTR_MOUNT_INFO for each
+ * child by ID.  Every child's ID is added to sum_mnt_id.
+ */
+static void get_mntid_by_fsinfo_2(void)
+{
+	struct fsinfo_mount_child *children, *c, *end;
+	struct fsinfo_mount_info r;
+	struct fsinfo_params params = {
+		.flags = FSINFO_FLAGS_QUERY_PATH,
+		.request = FSINFO_ATTR_MOUNT_INFO,
+	};
+	unsigned int base_mnt_id;
+	size_t s_children, n_children;
+	ssize_t ret;
+	char name[32];
+	int i;
+
+	/* Convert path to mount ID */
+	ERR(fsinfo(AT_FDCWD, base_path, &params, sizeof(params), &r, sizeof(r)),
+	    "fsinfo/base");
+	base_mnt_id = r.mnt_id;
+	//printf("[B] %u\n", base_mnt_id);
+
+	/* Get a list of all the children of this mount ID */
+	s_children = (nr_mounts + 1) * sizeof(*children);
+	children = malloc(s_children);
+	OOM(children);
+
+	params.flags = FSINFO_FLAGS_QUERY_MOUNT;
+	params.request = FSINFO_ATTR_MOUNT_CHILDREN;
+	sprintf(name, "%u", base_mnt_id);
+	ret = fsinfo(AT_FDCWD, name, &params, sizeof(params), children, s_children);
+	ERR(ret, "fsinfo/children");
+
+	/*
+	 * Never walk past the buffer we supplied.
+	 * NOTE(review): presumably fsinfo() can report more data than fits -
+	 * confirm against the fsinfo() documentation.
+	 */
+	if ((size_t)ret < s_children)
+		s_children = ret;
+
+	/* Query each child */
+	n_children = s_children / sizeof(*c);
+	if (n_children > 0)
+		n_children--; // Parent is added at end
+	c = children;
+	end = c + n_children;
+	for (i = 0; c < end; c++, i++) {
+		//printf("[%u] %u\n", i, c->mnt_id);
+		params.flags = FSINFO_FLAGS_QUERY_MOUNT;
+		params.request = FSINFO_ATTR_MOUNT_INFO;
+		sprintf(name, "%u", c->mnt_id);
+		ERR(fsinfo(AT_FDCWD, name, &params, sizeof(params), &r, sizeof(r)),
+		    "fsinfo/child");
+		sum_mnt_id += r.mnt_id;
+	}
+
+	free(children);
+}
+
+/*
+ * Obtain the mount IDs of all test mounts through mountfs (expected to be
+ * mounted on /mnt): read the ID of base_path from /proc/self/fdinfo, read
+ * "<id>/children" for the comma-separated child list, then read "<child>/id"
+ * for every child and add the value to sum_mnt_id.  Exits with status 3 on
+ * any parse error.
+ */
+static void get_mntid_by_mountfs(void)
+{
+	unsigned int base_mnt_id, mnt_id, x;
+	ssize_t len, s_children;
+	size_t children_size;
+	char procfile[100], buffer[100], *children, *p, *q, *nl, *comma;
+	int fd, fd2, mntfd, i;
+
+	/* Start off by reading the mount ID from the base path */
+	fd = open(base_path, O_PATH);
+	ERR(fd, "open/path");
+	snprintf(procfile, sizeof(procfile), "/proc/self/fdinfo/%d", fd);
+	fd2 = open(procfile, O_RDONLY);
+	ERR(fd2, "open/proc");
+	len = read(fd2, buffer, sizeof(buffer) - 1);
+	ERR(len, "read");
+	buffer[len] = 0;
+	close(fd2);
+	close(fd);
+
+	/*
+	 * Scan line by line for "mnt_id:".  "continue" falls through to the
+	 * loop condition, which steps p to the next line (NULL at the end).
+	 */
+	p = buffer;
+	do {
+		nl = strchr(p, '\n');
+		if (nl)
+			*nl++ = '\0';
+		else
+			nl = NULL;
+
+		if (strncmp(p, "mnt_id:", 7) != 0)
+			continue;
+		p += 7;
+		while (isblank((unsigned char)*p))
+			p++;
+		/* Have to allow for extra numbers being added to the line */
+		if (sscanf(p, "%u", &base_mnt_id) != 1) {
+			fprintf(stderr, "Bad format %s\n", procfile);
+			exit(3);
+		}
+		break;
+
+	} while ((p = nl));
+
+	if (!p) {
+		fprintf(stderr, "Missing field %s\n", procfile);
+		exit(3);
+	}
+
+	if (0) printf("[B] %u\n", base_mnt_id);
+
+	mntfd = open("/mnt", O_PATH);
+	ERR(mntfd, "open/mountfs");
+
+	/*
+	 * Get a list of all the children of this mount ID.  Allow 12 bytes
+	 * per child plus the terminating NUL, so the buffer is never empty
+	 * even when nr_mounts is 0.
+	 */
+	children_size = (size_t)(nr_mounts > 0 ? nr_mounts : 0) * 12 + 1;
+	children = malloc(children_size);
+	OOM(children);
+
+	snprintf(procfile, sizeof(procfile), "%u/children", base_mnt_id);
+	fd = openat(mntfd, procfile, O_RDONLY);
+	ERR(fd, "open/children");
+	s_children = read(fd, children, children_size - 1);
+	ERR(s_children, "read/children");
+	close(fd);
+	if (s_children > 0 && children[s_children - 1] == '\n')
+		s_children--;
+	children[s_children] = 0;
+
+	/* Query each child */
+	p = children;
+	i = 0;
+	while (*p) {
+		mnt_id = strtoul(p, &comma, 10);
+		if (*comma) {
+			if (*comma != ',') {
+				fprintf(stderr, "Bad format in mountfs-children\n");
+				exit(3);
+			}
+			comma++;
+		}
+
+		snprintf(procfile, sizeof(procfile), "%u/id", mnt_id);
+		fd = openat(mntfd, procfile, O_RDONLY);
+		ERR(fd, procfile);
+		len = read(fd, buffer, sizeof(buffer) - 1);
+		ERR(len, "read/id");
+		close(fd);
+		if (len > 0 && buffer[len - 1] == '\n')
+			len--;
+		buffer[len] = 0;
+
+		x = strtoul(buffer, &q, 10);
+
+		if (*q) {
+			fprintf(stderr, "Bad format in %s '%s'\n", procfile, buffer);
+			exit(3);
+		}
+
+		if (0) printf("[%u] %u\n", i++, x);
+		sum_mnt_id += x;
+		p = comma;
+	}
+
+	free(children);
+	close(mntfd);
+}
+
+/*
+ * Return the time elapsed from @before to @after in microseconds.
+ * The seconds and microseconds are subtracted first, in a 64-bit type, so
+ * the absolute time is never scaled by a million.
+ */
+static unsigned long duration(struct timeval *before, struct timeval *after)
+{
+	long long sec = (long long)after->tv_sec - (long long)before->tv_sec;
+	long long usec = (long long)after->tv_usec - (long long)before->tv_usec;
+
+	return (unsigned long)(sec * 1000000LL + usec);
+}
+
+/*
+ * Usage: test-fsinfo-perf <path> [nr_mounts]
+ *
+ * Mounts a private tmpfs on <path>, creates nr_mounts tmpfs mounts beneath
+ * it, then times four ways of collecting every mount ID: fsinfo() by path,
+ * fsinfo() by mount ID, /proc/self/fdinfo, and mountfs.  Each test prints
+ * the sum of the IDs it saw; the final line prints the durations and the
+ * procfs time relative to each of the others.  The base mount is detached
+ * at exit.
+ */
+int main(int argc, char **argv)
+{
+ struct timeval f_before, f_after;
+ struct timeval f2_before, f2_after;
+ struct timeval p_before, p_after;
+ struct timeval p2_before, p2_after;
+ const char *path;
+ unsigned long f_dur, f2_dur, p_dur, p2_dur;
+
+ if (argc < 2) {
+ fprintf(stderr, "Format: %s <path> [nr_mounts]\n", argv[0]);
+ exit(2);
+ }
+
+ /* NOTE(review): atoi() gives no error indication; the count is not validated. */
+ if (argc == 3)
+ nr_mounts = atoi(argv[2]);
+
+ path = argv[1];
+ ERR(mount("none", path, "tmpfs", 0, NULL), "mount");
+ ERR(mount("none", path, NULL, MS_PRIVATE, NULL), "mount");
+ base_path = path;
+ /* Registered only after the base mount exists, so there is something to undo. */
+ atexit(do_umount);
+
+ printf("--- make mounts ---\n");
+ iterate(make_mount);
+
+ printf("--- test fsinfo by path ---\n");
+ sum_mnt_id = 0;
+ ERR(gettimeofday(&f_before, NULL), "gettimeofday");
+ iterate(get_mntid_by_fsinfo);
+ ERR(gettimeofday(&f_after, NULL), "gettimeofday");
+ printf("sum(mnt_id) = %lu\n", sum_mnt_id);
+
+ printf("--- test fsinfo by mnt_id ---\n");
+ sum_mnt_id = 0;
+ ERR(gettimeofday(&f2_before, NULL), "gettimeofday");
+ get_mntid_by_fsinfo_2();
+ ERR(gettimeofday(&f2_after, NULL), "gettimeofday");
+ printf("sum(mnt_id) = %lu\n", sum_mnt_id);
+
+ printf("--- test /proc/fdinfo ---\n");
+ sum_mnt_id = 0;
+ ERR(gettimeofday(&p_before, NULL), "gettimeofday");
+ iterate(get_mntid_by_proc);
+ ERR(gettimeofday(&p_after, NULL), "gettimeofday");
+ printf("sum(mnt_id) = %lu\n", sum_mnt_id);
+
+ printf("--- test mountfs ---\n");
+ sum_mnt_id = 0;
+ ERR(gettimeofday(&p2_before, NULL), "gettimeofday");
+ get_mntid_by_mountfs();
+ ERR(gettimeofday(&p2_after, NULL), "gettimeofday");
+ printf("sum(mnt_id) = %lu\n", sum_mnt_id);
+
+ f_dur = duration(&f_before, &f_after);
+ f2_dur = duration(&f2_before, &f2_after);
+ p_dur = duration(&p_before, &p_after);
+ p2_dur = duration(&p2_before, &p2_after);
+ //printf("fsinfo duration %10luus for %d mounts\n", f_dur, nr_mounts);
+ //printf("procfd duration %10luus for %d mounts\n", p_dur, nr_mounts);
+
+ printf("For %7d mounts, f=%10luus f2=%10luus p=%10luus p2=%10luus; p=%.1f*f p=%.1f*f2 p=%.1f*p2\n",
+ nr_mounts, f_dur, f2_dur, p_dur, p2_dur,
+ (double)p_dur / (double)f_dur,
+ (double)p_dur / (double)f2_dur,
+ (double)p_dur / (double)p2_dur);
+ return 0;
+}