Re: [PATCH v7 2/4] mseal: add mseal syscall

From: Liam R. Howlett
Date: Tue Jan 23 2024 - 13:15:46 EST


* jeffxu@xxxxxxxxxxxx <jeffxu@xxxxxxxxxxxx> [240122 10:29]:
> From: Jeff Xu <jeffxu@xxxxxxxxxxxx>
>
> The new mseal() is an syscall on 64 bit CPU, and with
> following signature:
>
> int mseal(void addr, size_t len, unsigned long flags)
> addr/len: memory range.
> flags: reserved.
>
> mseal() blocks following operations for the given memory range.
>
> 1> Unmapping, moving to another location, and shrinking the size,
> via munmap() and mremap(), can leave an empty space, therefore can
> be replaced with a VMA with a new set of attributes.
>
> 2> Moving or expanding a different VMA into the current location,
> via mremap().
>
> 3> Modifying a VMA via mmap(MAP_FIXED).
>
> 4> Size expansion, via mremap(), does not appear to pose any specific
> risks to sealed VMAs. It is included anyway because the use case is
> unclear. In any case, users can rely on merging to expand a sealed VMA.
>
> 5> mprotect() and pkey_mprotect().
>
> 6> Some destructive madvice() behaviors (e.g. MADV_DONTNEED) for anonymous
> memory, when users don't have write permission to the memory. Those
> behaviors can alter region contents by discarding pages, effectively a
> memset(0) for anonymous memory.
>
> In addition: mmap() has two related changes.
>
> The PROT_SEAL bit in prot field of mmap(). When present, it marks
> the map sealed since creation.
>
> The MAP_SEALABLE bit in the flags field of mmap(). When present, it marks
> the map as sealable. A map created without MAP_SEALABLE will not support
> sealing, i.e. mseal() will fail.
>
> Applications that don't care about sealing will expect their behavior
> unchanged. For those that need sealing support, opt-in by adding
> MAP_SEALABLE in mmap().
>
> I would like to formally acknowledge the valuable contributions
> received during the RFC process, which were instrumental
> in shaping this patch:
>
> Jann Horn: raising awareness and providing valuable insights on the
> destructive madvise operations.
> Linus Torvalds: assisting in defining system call signature and scope.
> Pedro Falcato: suggesting sealing in the mmap().
> Theo de Raadt: sharing the experiences and insights gained from
> implementing mimmutable() in OpenBSD.
>
> Finally, the idea that inspired this patch comes from Stephen Röttger’s
> work in Chrome V8 CFI.
>
> Signed-off-by: Jeff Xu <jeffxu@xxxxxxxxxxxx>
> ---
> include/linux/mm.h | 48 ++++
> include/linux/syscalls.h | 1 +
> include/uapi/asm-generic/mman-common.h | 8 +
> mm/Makefile | 4 +
> mm/madvise.c | 12 +
> mm/mmap.c | 27 ++
> mm/mprotect.c | 10 +
> mm/mremap.c | 31 +++
> mm/mseal.c | 343 +++++++++++++++++++++++++
> 9 files changed, 484 insertions(+)
> create mode 100644 mm/mseal.c
>
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index f5a97dec5169..bdd9a53e9291 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h

None of this can live in mm/internal.h ?

> @@ -328,6 +328,14 @@ extern unsigned int kobjsize(const void *objp);
> #define VM_HIGH_ARCH_5 BIT(VM_HIGH_ARCH_BIT_5)
> #endif /* CONFIG_ARCH_USES_HIGH_VMA_FLAGS */
>
> +#ifdef CONFIG_64BIT
> +/* VM is sealable, in vm_flags */
> +#define VM_SEALABLE _BITUL(63)
> +
> +/* VM is sealed, in vm_flags */
> +#define VM_SEALED _BITUL(62)
> +#endif
> +
> #ifdef CONFIG_ARCH_HAS_PKEYS
> # define VM_PKEY_SHIFT VM_HIGH_ARCH_BIT_0
> # define VM_PKEY_BIT0 VM_HIGH_ARCH_0 /* A protection key is a 4-bit value */
> @@ -4182,4 +4190,44 @@ static inline bool pfn_is_unaccepted_memory(unsigned long pfn)
> return range_contains_unaccepted_memory(paddr, paddr + PAGE_SIZE);
> }
>
> +#ifdef CONFIG_64BIT
> +static inline int can_do_mseal(unsigned long flags)
> +{
> + if (flags)
> + return -EINVAL;
> +
> + return 0;
> +}
> +
> +bool can_modify_mm(struct mm_struct *mm, unsigned long start,
> + unsigned long end);
> +bool can_modify_mm_madv(struct mm_struct *mm, unsigned long start,
> + unsigned long end, int behavior);
> +unsigned long get_mmap_seals(unsigned long prot,
> + unsigned long flags);
> +#else
> +static inline int can_do_mseal(unsigned long flags)
> +{
> + return -EPERM;
> +}
> +
> +static inline bool can_modify_mm(struct mm_struct *mm, unsigned long start,
> + unsigned long end)
> +{
> + return true;
> +}
> +
> +static inline bool can_modify_mm_madv(struct mm_struct *mm, unsigned long start,
> + unsigned long end, int behavior)
> +{
> + return true;
> +}
> +
> +static inline unsigned long get_mmap_seals(unsigned long prot,
> + unsigned long flags)
> +{
> + return 0;
> +}
> +#endif
> +
> #endif /* _LINUX_MM_H */

..

> diff --git a/mm/mmap.c b/mm/mmap.c
> index b78e83d351d2..32bc2179aed0 100644
> --- a/mm/mmap.c
> +++ b/mm/mmap.c
> @@ -1213,6 +1213,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
> {
> struct mm_struct *mm = current->mm;
> int pkey = 0;
> + unsigned long vm_seals;
>
> *populate = 0;
>
> @@ -1233,6 +1234,8 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
> if (flags & MAP_FIXED_NOREPLACE)
> flags |= MAP_FIXED;
>
> + vm_seals = get_mmap_seals(prot, flags);
> +
> if (!(flags & MAP_FIXED))
> addr = round_hint_to_min(addr);
>
> @@ -1261,6 +1264,13 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
> return -EEXIST;
> }
>
> + /*
> + * Check if the address range is sealed for do_mmap().
> + * can_modify_mm assumes we have acquired the lock on MM.
> + */
> + if (!can_modify_mm(mm, addr, addr + len))
> + return -EPERM;
> +

This is called after get_unmapped_area(), so this area is either going
to be MAP_FIXED and return the "hint" addr or it's going to be empty.
You can probably avoid walking the VMAs in the non-FIXED case. This
would remove the overhead of your check in the most common case.

> if (prot == PROT_EXEC) {
> pkey = execute_only_pkey(mm);
> if (pkey < 0)
> @@ -1376,6 +1386,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
> vm_flags |= VM_NORESERVE;
> }
>
> + vm_flags |= vm_seals;
> addr = mmap_region(file, addr, len, vm_flags, pgoff, uf);
> if (!IS_ERR_VALUE(addr) &&
> ((vm_flags & VM_LOCKED) ||
> @@ -2679,6 +2690,14 @@ int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm,
> if (end == start)
> return -EINVAL;
>
> + /*
> + * Check if memory is sealed before arch_unmap.
> + * Prevent unmapping a sealed VMA.
> + * can_modify_mm assumes we have acquired the lock on MM.
> + */
> + if (!can_modify_mm(mm, start, end))
> + return -EPERM;
> +

This function is currently called from mmap_region(), so we are going to
run this check twice as you have it; once in do_mmap() then again in
mma_region() -> do_vmi_munmap(). This effectively doubles your impact
to MAP_FIXED calls.

> /* arch_unmap() might do unmaps itself. */
> arch_unmap(mm, start, end);
>
> @@ -3102,6 +3121,14 @@ int do_vma_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
> {
> struct mm_struct *mm = vma->vm_mm;
>
> + /*
> + * Check if memory is sealed before arch_unmap.
> + * Prevent unmapping a sealed VMA.
> + * can_modify_mm assumes we have acquired the lock on MM.
> + */
> + if (!can_modify_mm(mm, start, end))
> + return -EPERM;
> +

I am sure you've looked at the callers, from what I found there are two:

The brk call uses this function, so it may check more than one VMA in
that path. Will the brk VMAs potentially be msealed? I guess someone
could do that?

The other place this is use is in ipc/shm.c whhere the start/end is just
the vma start/end, so we only really need to check that one vma.

Is there a way to avoid walking the tree for the single known VMA? Does
it make sense to deny mseal writing to brk VMAs?


> arch_unmap(mm, start, end);
> return do_vmi_align_munmap(vmi, vma, mm, start, end, uf, unlock);
> }

..


Ah, I see them now. Yes, this is what I expected to see. Does this not
have any impact on mmap/munmap benchmarks?

> +bool can_modify_mm(struct mm_struct *mm, unsigned long start, unsigned long end)
> +{
> + struct vm_area_struct *vma;
> +
> + VMA_ITERATOR(vmi, mm, start);
> +
> + /* going through each vma to check. */
> + for_each_vma_range(vmi, vma, end) {
> + if (!can_modify_vma(vma))
> + return false;
> + }
> +
> + /* Allow by default. */
> + return true;
> +}
> +
> +/*
> + * Check if the vmas of a memory range are allowed to be modified by madvise.
> + * the memory ranger can have a gap (unallocated memory).
> + * return true, if it is allowed.
> + */
> +bool can_modify_mm_madv(struct mm_struct *mm, unsigned long start, unsigned long end,
> + int behavior)
> +{
> + struct vm_area_struct *vma;
> +
> + VMA_ITERATOR(vmi, mm, start);
> +
> + if (!is_madv_discard(behavior))
> + return true;
> +
> + /* going through each vma to check. */
> + for_each_vma_range(vmi, vma, end)
> + if (is_ro_anon(vma) && !can_modify_vma(vma))
> + return false;
> +
> + /* Allow by default. */
> + return true;
> +}
> +

..

> +static int check_mm_seal(unsigned long start, unsigned long end)
> +{
> + struct vm_area_struct *vma;
> + unsigned long nstart = start;
> +
> + VMA_ITERATOR(vmi, current->mm, start);
> +
> + /* going through each vma to check. */
> + for_each_vma_range(vmi, vma, end) {
> + if (vma->vm_start > nstart)
> + /* unallocated memory found. */
> + return -ENOMEM;

Ah, another potential user for a contiguous iterator of VMAs.

> +
> + if (!can_add_vma_seal(vma))
> + return -EACCES;
> +
> + if (vma->vm_end >= end)
> + return 0;
> +
> + nstart = vma->vm_end;
> + }
> +
> + return -ENOMEM;
> +}
> +
> +/*
> + * Apply sealing.
> + */
> +static int apply_mm_seal(unsigned long start, unsigned long end)
> +{
> + unsigned long nstart;
> + struct vm_area_struct *vma, *prev;
> +
> + VMA_ITERATOR(vmi, current->mm, start);
> +
> + vma = vma_iter_load(&vmi);
> + /*
> + * Note: check_mm_seal should already checked ENOMEM case.
> + * so vma should not be null, same for the other ENOMEM cases.

The start to end is contiguous, right?

> + */
> + prev = vma_prev(&vmi);
> + if (start > vma->vm_start)
> + prev = vma;
> +
> + nstart = start;
> + for_each_vma_range(vmi, vma, end) {
> + int error;
> + unsigned long tmp;
> + vm_flags_t newflags;
> +
> + newflags = vma->vm_flags | VM_SEALED;
> + tmp = vma->vm_end;
> + if (tmp > end)
> + tmp = end;
> + error = mseal_fixup(&vmi, vma, &prev, nstart, tmp, newflags);
> + if (error)
> + return error;

> + tmp = vma_iter_end(&vmi);
> + nstart = tmp;

You set tmp before using it unconditionally to vma->vm_end above, so you
can set nstart = vma_iter_end(&vmi) here. But, also we know the
VMAs are contiguous from your check_mm_seal() call, so we know nstart ==
vma->vm_start on the next loop.

..