[PATCH] memfd: support MFD_NOEXEC alongside MFD_EXEC

From: David Rheinsberg
Date: Fri Jul 14 2023 - 07:48:16 EST


Add a new flag for memfd_create() called MFD_NOEXEC, which has the
opposite effect as MFD_EXEC (i.e., it strips the executable bits from
the inode file mode).

The default mode for memfd_create() has historically been to use 0777 as
file modes. The new `MFD_EXEC` flag has now made this explicit, paving
the way to reduce the default to 0666 and thus dropping the executable
bits for security reasons. Additionally, the `MFD_NOEXEC_SEAL` flag has
been added which allows this without changing the default right now.

Unfortunately, `MFD_NOEXEC_SEAL` enforces `MFD_ALLOW_SEALING` and
`F_SEAL_EXEC`, with no alternatives available. This leads to multiple
issues:

* Applications that do not want to allow sealing either have to use
`MFD_EXEC` (which we don't want, unless they actually need the
executable bits), or they must add `F_SEAL_SEAL` immediately on
return of memfd_create(2) with `MFD_NOEXEC_SEAL`, since this
implicitly enables sealing.

Note that we explicitly added `MFD_ALLOW_SEALING` when creating
memfd_create(2), because enabling seals on existing users of shmem
without them knowing about it can easily introduce DoS scenarios. It
is unclear why `MFD_NOEXEC_SEAL` was designed to enable seals, and
this is especially dangerous with `MEMFD_NOEXEC_SCOPE_NOEXEC_SEAL`
set via sysctl, since it will silently enable sealing on every memfd
created.

* Applications that do not want `MFD_EXEC`, but rely on
`F_GET_SEALS` to *NOT* return `F_SEAL_EXEC` have no way of achieving
this other than using `MFD_EXEC` and clearing the executable bits
manually via fchmod(2). Using `MFD_NOEXEC_SEAL` will set
`F_SEAL_EXEC` and thus break existing code that hard-codes the
seal-set.

This is already an issue when sending log-memfds to systemd-journald
today, which verifies the seal-set of the received FD and fails if
unknown seals are set. Hence, you have to use `MFD_EXEC` when
creating memfds for this purpose, even though you really do not need
the executable bits set.

* Applications that want to enable the executable bit later on,
currently have no way to create the memfd without it. They have to
clear the bits immediately after creating it via fchmod(2), or just
leave them set.

By adding MFD_NOEXEC, user-space is now able to clear the executable
bits on all memfds immediately, clearly stating their intention, without
requiring fixups after creating the memfd. In particular, it is highly
useful for existing use-cases that cannot allow new seals to appear on
memfds.

Signed-off-by: David Rheinsberg <david@xxxxxxxxxxxx>
---
include/linux/pid_namespace.h | 4 ++--
include/uapi/linux/memfd.h | 1 +
mm/memfd.c | 29 ++++++++++++++---------------
3 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h
index c758809d5bcf..02f8acf94512 100644
--- a/include/linux/pid_namespace.h
+++ b/include/linux/pid_namespace.h
@@ -19,9 +19,9 @@ struct fs_pin;
#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
/*
* sysctl for vm.memfd_noexec
- * 0: memfd_create() without MFD_EXEC nor MFD_NOEXEC_SEAL
+ * 0: memfd_create() without MFD_EXEC nor MFD_NOEXEC
* acts like MFD_EXEC was set.
- * 1: memfd_create() without MFD_EXEC nor MFD_NOEXEC_SEAL
+ * 1: memfd_create() without MFD_EXEC nor MFD_NOEXEC
* acts like MFD_NOEXEC_SEAL was set.
* 2: memfd_create() without MFD_NOEXEC_SEAL will be
* rejected.
diff --git a/include/uapi/linux/memfd.h b/include/uapi/linux/memfd.h
index 273a4e15dfcf..b9e13bd65817 100644
--- a/include/uapi/linux/memfd.h
+++ b/include/uapi/linux/memfd.h
@@ -12,6 +12,7 @@
#define MFD_NOEXEC_SEAL 0x0008U
/* executable */
#define MFD_EXEC 0x0010U
+#define MFD_NOEXEC 0x0020U

/*
* Huge page size encoding when MFD_HUGETLB is specified, and a huge page
diff --git a/mm/memfd.c b/mm/memfd.c
index e763e76f1106..2afe49368fc5 100644
--- a/mm/memfd.c
+++ b/mm/memfd.c
@@ -266,7 +266,9 @@ long memfd_fcntl(struct file *file, unsigned int cmd, unsigned int arg)
#define MFD_NAME_PREFIX_LEN (sizeof(MFD_NAME_PREFIX) - 1)
#define MFD_NAME_MAX_LEN (NAME_MAX - MFD_NAME_PREFIX_LEN)

-#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB | MFD_NOEXEC_SEAL | MFD_EXEC)
+#define MFD_ALL_FLAGS \
+ (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB | \
+ MFD_NOEXEC_SEAL | MFD_EXEC | MFD_NOEXEC)

SYSCALL_DEFINE2(memfd_create,
const char __user *, uname,
@@ -289,11 +291,13 @@ SYSCALL_DEFINE2(memfd_create,
return -EINVAL;
}

- /* Invalid if both EXEC and NOEXEC_SEAL are set.*/
- if ((flags & MFD_EXEC) && (flags & MFD_NOEXEC_SEAL))
+ if (flags & MFD_NOEXEC_SEAL)
+ flags |= MFD_ALLOW_SEALING | MFD_NOEXEC;
+
+ if ((flags & MFD_EXEC) && (flags & MFD_NOEXEC))
return -EINVAL;

- if (!(flags & (MFD_EXEC | MFD_NOEXEC_SEAL))) {
+ if (!(flags & (MFD_EXEC | MFD_NOEXEC))) {
#ifdef CONFIG_SYSCTL
int sysctl = MEMFD_NOEXEC_SCOPE_EXEC;
struct pid_namespace *ns;
@@ -366,20 +370,15 @@ SYSCALL_DEFINE2(memfd_create,
file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE;
file->f_flags |= O_LARGEFILE;

- if (flags & MFD_NOEXEC_SEAL) {
- struct inode *inode = file_inode(file);
+ if (flags & MFD_NOEXEC)
+ file_inode(file)->i_mode &= ~0111;

- inode->i_mode &= ~0111;
- file_seals = memfd_file_seals_ptr(file);
- if (file_seals) {
+ file_seals = memfd_file_seals_ptr(file);
+ if (file_seals) {
+ if (flags & MFD_ALLOW_SEALING)
*file_seals &= ~F_SEAL_SEAL;
+ if (flags & MFD_NOEXEC_SEAL)
*file_seals |= F_SEAL_EXEC;
- }
- } else if (flags & MFD_ALLOW_SEALING) {
- /* MFD_EXEC and MFD_ALLOW_SEALING are set */
- file_seals = memfd_file_seals_ptr(file);
- if (file_seals)
- *file_seals &= ~F_SEAL_SEAL;
}

fd_install(fd, file);
--
2.41.0