Re: [PATCH v2 1/9] mm: Add msharefs filesystem

From: Darrick J. Wong
Date: Thu Jun 30 2022 - 17:53:17 EST


On Wed, Jun 29, 2022 at 04:53:52PM -0600, Khalid Aziz wrote:
> Add a ram-based filesystem that contains page table sharing
> information and files that enable processes to share page tables.
> This patch adds the basic filesystem that can be mounted.
>
> Signed-off-by: Khalid Aziz <khalid.aziz@xxxxxxxxxx>
> ---
> Documentation/filesystems/msharefs.rst | 19 +++++
> include/uapi/linux/magic.h | 1 +
> mm/Makefile | 2 +-
> mm/mshare.c | 103 +++++++++++++++++++++++++
> 4 files changed, 124 insertions(+), 1 deletion(-)
> create mode 100644 Documentation/filesystems/msharefs.rst
> create mode 100644 mm/mshare.c
>
> diff --git a/Documentation/filesystems/msharefs.rst b/Documentation/filesystems/msharefs.rst
> new file mode 100644
> index 000000000000..fd161f67045d
> --- /dev/null
> +++ b/Documentation/filesystems/msharefs.rst
> @@ -0,0 +1,19 @@
> +.. SPDX-License-Identifier: GPL-2.0
> +
> +=====================================================
> +msharefs - a filesystem to support shared page tables
> +=====================================================
> +
> +msharefs is a ram-based filesystem that allows multiple processes to
> +share page table entries for shared pages.
> +
> +msharefs is typically mounted like this::
> +
> + mount -t msharefs none /sys/fs/mshare
> +
> +When a process calls mshare syscall with a name for the shared address
> +range,

You mean creat()?

> a file with the same name is created under msharefs.
> +This file can be opened by another process, if permissions
> +allow, to query the addresses shared under this range. These files are
> +removed by the mshare_unlink syscall and cannot be deleted directly.

Oh?

> +Hence these files are created as immutable files.
> diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h
> index f724129c0425..2a57a6ec6f3e 100644
> --- a/include/uapi/linux/magic.h
> +++ b/include/uapi/linux/magic.h
> @@ -105,5 +105,6 @@
> #define Z3FOLD_MAGIC 0x33
> #define PPC_CMM_MAGIC 0xc7571590
> #define SECRETMEM_MAGIC 0x5345434d /* "SECM" */
> +#define MSHARE_MAGIC 0x4d534852 /* "MSHR" */
>
> #endif /* __LINUX_MAGIC_H__ */
> diff --git a/mm/Makefile b/mm/Makefile
> index 6f9ffa968a1a..51a2ab9080d9 100644
> --- a/mm/Makefile
> +++ b/mm/Makefile
> @@ -37,7 +37,7 @@ CFLAGS_init-mm.o += $(call cc-disable-warning, override-init)
> CFLAGS_init-mm.o += $(call cc-disable-warning, initializer-overrides)
>
> mmu-y := nommu.o
> -mmu-$(CONFIG_MMU) := highmem.o memory.o mincore.o \
> +mmu-$(CONFIG_MMU) := highmem.o memory.o mincore.o mshare.o \
> mlock.o mmap.o mmu_gather.o mprotect.o mremap.o \
> msync.o page_vma_mapped.o pagewalk.o \
> pgtable-generic.o rmap.o vmalloc.o
> diff --git a/mm/mshare.c b/mm/mshare.c
> new file mode 100644
> index 000000000000..c8fab3869bab
> --- /dev/null
> +++ b/mm/mshare.c

Filesystems are usually supposed to live under fs/; is there some reason
to put it in mm/?

I guess shmfs is in mm so maybe this isn't much of an objection.

Also, should this fs be selectable via a Kconfig option?

--D

> @@ -0,0 +1,103 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * Enable cooperating processes to share page tables between
> + * them to reduce the extra memory consumed by multiple copies
> + * of page tables.
> + *
> + * This code adds an in-memory filesystem - msharefs.
> + * msharefs is used to manage page table sharing
> + *
> + *
> + * Copyright (C) 2022 Oracle Corp. All rights reserved.
> + * Author: Khalid Aziz <khalid.aziz@xxxxxxxxxx>
> + *
> + */
> +
> +#include <linux/fs.h>
> +#include <linux/mount.h>
> +#include <linux/syscalls.h>
> +#include <linux/uaccess.h>
> +#include <linux/pseudo_fs.h>
> +#include <linux/fileattr.h>
> +#include <uapi/linux/magic.h>
> +#include <uapi/linux/limits.h>
> +
> +static struct super_block *msharefs_sb;
> +
> +static const struct file_operations msharefs_file_operations = {
> + .open = simple_open,
> + .llseek = no_llseek,
> +};
> +
> +static int
> +msharefs_d_hash(const struct dentry *dentry, struct qstr *qstr)
> +{
> + unsigned long hash = init_name_hash(dentry);
> + const unsigned char *s = qstr->name;
> + unsigned int len = qstr->len;
> +
> + while (len--)
> + hash = partial_name_hash(*s++, hash);
> + qstr->hash = end_name_hash(hash);
> + return 0;
> +}
> +
> +static const struct dentry_operations msharefs_d_ops = {
> + .d_hash = msharefs_d_hash,
> +};
> +
> +static int
> +msharefs_fill_super(struct super_block *sb, struct fs_context *fc)
> +{
> + static const struct tree_descr empty_descr = {""};
> + int err;
> +
> + sb->s_d_op = &msharefs_d_ops;
> + err = simple_fill_super(sb, MSHARE_MAGIC, &empty_descr);
> + if (err)
> + return err;
> +
> + msharefs_sb = sb;
> + return 0;
> +}
> +
> +static int
> +msharefs_get_tree(struct fs_context *fc)
> +{
> + return get_tree_single(fc, msharefs_fill_super);
> +}
> +
> +static const struct fs_context_operations msharefs_context_ops = {
> + .get_tree = msharefs_get_tree,
> +};
> +
> +static int
> +mshare_init_fs_context(struct fs_context *fc)
> +{
> + fc->ops = &msharefs_context_ops;
> + return 0;
> +}
> +
> +static struct file_system_type mshare_fs = {
> + .name = "msharefs",
> + .init_fs_context = mshare_init_fs_context,
> + .kill_sb = kill_litter_super,
> +};
> +
> +static int
> +mshare_init(void)
> +{
> + int ret = 0;
> +
> + ret = sysfs_create_mount_point(fs_kobj, "mshare");
> + if (ret)
> + return ret;
> +
> + ret = register_filesystem(&mshare_fs);
> + if (ret)
> + sysfs_remove_mount_point(fs_kobj, "mshare");
> +
> + return ret;
> +}
> +
> +fs_initcall(mshare_init);
> --
> 2.32.0
>