[PATCH RFC v2 3/4] mm/ptshare: Create new mm struct for page table sharing

From: Khalid Aziz
Date: Wed Apr 26 2023 - 12:51:37 EST


When a process passes MAP_SHARED_PT flag to mmap(), create a new mm
struct to hold the shareable page table entries for the newly mapped
region. This new mm is not associated with a task. Its lifetime is
until the last shared mapping is deleted. This patch also adds a
new pointer "ptshare_data" to struct address_space which points to
the data structure that will contain pointer to this newly created
mm along with properties of the shared mapping. ptshare_data
maintains a refcount for the shared mapping so that it can be
cleaned up upon last unmap.

Signed-off-by: Khalid Aziz <khalid.aziz@xxxxxxxxxx>
Suggested-by: Matthew Wilcox (Oracle) <willy@xxxxxxxxxxxxx>
---
include/linux/fs.h | 2 +
mm/Makefile | 2 +-
mm/internal.h | 14 +++++
mm/mmap.c | 72 ++++++++++++++++++++++++++
mm/ptshare.c | 126 +++++++++++++++++++++++++++++++++++++++++++++
5 files changed, 215 insertions(+), 1 deletion(-)
create mode 100644 mm/ptshare.c

diff --git a/include/linux/fs.h b/include/linux/fs.h
index c85916e9f7db..db8d3257c712 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -422,6 +422,7 @@ extern const struct address_space_operations empty_aops;
* @private_lock: For use by the owner of the address_space.
* @private_list: For use by the owner of the address_space.
* @private_data: For use by the owner of the address_space.
+ * @ptshare_data: For shared page table use
*/
struct address_space {
struct inode *host;
@@ -443,6 +444,7 @@ struct address_space {
spinlock_t private_lock;
struct list_head private_list;
void *private_data;
+ void *ptshare_data;
} __attribute__((aligned(sizeof(long)))) __randomize_layout;
/*
* On most architectures that alignment is already the case; but
diff --git a/mm/Makefile b/mm/Makefile
index 8e105e5b3e29..d9bb14fdf220 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -40,7 +40,7 @@ mmu-y := nommu.o
mmu-$(CONFIG_MMU) := highmem.o memory.o mincore.o \
mlock.o mmap.o mmu_gather.o mprotect.o mremap.o \
msync.o page_vma_mapped.o pagewalk.o \
- pgtable-generic.o rmap.o vmalloc.o
+ pgtable-generic.o rmap.o vmalloc.o ptshare.o


ifdef CONFIG_CROSS_MEMORY_ATTACH
diff --git a/mm/internal.h b/mm/internal.h
index 4d60d2d5fe19..3efb8738e26f 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -1047,4 +1047,18 @@ static inline bool vma_is_shared(const struct vm_area_struct *vma)
{
return vma->vm_flags & VM_SHARED_PT;
}
+
+/*
+ * mm/ptshare.c
+ */
+struct ptshare_data {
+ struct mm_struct *mm;
+ refcount_t refcnt;
+ unsigned long start;
+ unsigned long size;
+ unsigned long mode;
+};
+int ptshare_new_mm(struct file *file, struct vm_area_struct *vma);
+void ptshare_del_mm(struct vm_area_struct *vm);
+int ptshare_insert_vma(struct mm_struct *mm, struct vm_area_struct *vma);
#endif /* __MM_INTERNAL_H */
diff --git a/mm/mmap.c b/mm/mmap.c
index 8b46d465f8d4..c5e9b7f6de90 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1382,6 +1382,60 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
((vm_flags & VM_LOCKED) ||
(flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE))
*populate = len;
+
+#if VM_SHARED_PT
+ /*
+ * Check if this mapping is a candidate for page table sharing
+ * at PMD level. It is if following conditions hold:
+ * - It is not anonymous mapping
+ * - It is not hugetlbfs mapping (for now)
+ * - flags conatins MAP_SHARED or MAP_SHARED_VALIDATE and
+ * MAP_SHARED_PT
+ * - Start address is aligned to PMD size
+ * - Mapping size is a multiple of PMD size
+ */
+ if (ptshare && file && !is_file_hugepages(file)) {
+ struct vm_area_struct *vma;
+
+ vma = find_vma(mm, addr);
+ if (!((vma->vm_start | vma->vm_end) & (PMD_SIZE - 1))) {
+ struct ptshare_data *info = file->f_mapping->ptshare_data;
+ /*
+ * If this mapping has not been set up for page table
+ * sharing yet, do so by creating a new mm to hold the
+ * shared page tables for this mapping
+ */
+ if (info == NULL) {
+ int ret;
+
+ ret = ptshare_new_mm(file, vma);
+ if (ret < 0)
+ return ret;
+
+ info = file->f_mapping->ptshare_data;
+ ret = ptshare_insert_vma(info->mm, vma);
+ if (ret < 0)
+ addr = ret;
+ else
+ vm_flags_set(vma, VM_SHARED_PT);
+ } else {
+ /*
+ * Page tables will be shared only if the
+ * file is mapped in with the same permissions
+ * across all mappers with same starting
+ * address and size
+ */
+ if (((prot & info->mode) == info->mode) &&
+ (addr == info->start) &&
+ (len == info->size)) {
+ vm_flags_set(vma, VM_SHARED_PT);
+ refcount_inc(&info->refcnt);
+ }
+ }
+ }
+ }
+#endif
+
return addr;
}

@@ -2495,6 +2549,22 @@ int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm,
if (end == start)
return -EINVAL;

+ /*
+ * Check if this vma uses shared page tables
+ */
+ vma = find_vma_intersection(mm, start, end);
+ if (vma && unlikely(vma_is_shared(vma))) {
+ struct ptshare_data *info = NULL;
+
+ if (vma->vm_file && vma->vm_file->f_mapping)
+ info = vma->vm_file->f_mapping->ptshare_data;
+ /* Don't allow partial munmaps */
+ if (info && ((start != info->start) || (len != info->size)))
+ return -EINVAL;
+ ptshare_del_mm(vma);
+ }
+
+
/* arch_unmap() might do unmaps itself. */
arch_unmap(mm, start, end);

@@ -2664,6 +2734,8 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
}
}

+ if (vm_flags & VM_SHARED_PT)
+ vm_flags_set(vma, VM_SHARED_PT);
vm_flags = vma->vm_flags;
} else if (vm_flags & VM_SHARED) {
error = shmem_zero_setup(vma);
diff --git a/mm/ptshare.c b/mm/ptshare.c
new file mode 100644
index 000000000000..f6784268958c
--- /dev/null
+++ b/mm/ptshare.c
@@ -0,0 +1,126 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Share page table entries when possible to reduce the amount of extra
+ * memory consumed by page tables
+ *
+ * Copyright (C) 2022 Oracle Corp. All rights reserved.
+ * Authors: Khalid Aziz <khalid.aziz@xxxxxxxxxx>
+ * Matthew Wilcox <willy@xxxxxxxxxxxxx>
+ */
+
+#include <linux/mm.h>
+#include <linux/fs.h>
+#include <asm/pgalloc.h>
+#include "internal.h"
+
+/*
+ * Create a new mm struct that will hold the shared PTEs. Pointer to
+ * this new mm is stored in the data structure ptshare_data which also
+ * includes a refcount for any current references to PTEs in this new
+ * mm. This refcount is used to determine when the mm struct for shared
+ * PTEs can be deleted.
+ */
+int
+ptshare_new_mm(struct file *file, struct vm_area_struct *vma)
+{
+ struct mm_struct *new_mm;
+ struct ptshare_data *info = NULL;
+ int retval = 0;
+ unsigned long start = vma->vm_start;
+ unsigned long len = vma->vm_end - vma->vm_start;
+
+ new_mm = mm_alloc();
+ if (!new_mm) {
+ retval = -ENOMEM;
+ goto err_free;
+ }
+ new_mm->mmap_base = start;
+ new_mm->task_size = len;
+ if (!new_mm->task_size)
+ new_mm->task_size--;
+
+ info = kzalloc(sizeof(*info), GFP_KERNEL);
+ if (!info) {
+ retval = -ENOMEM;
+ goto err_free;
+ }
+ info->mm = new_mm;
+ info->start = start;
+ info->size = len;
+ refcount_set(&info->refcnt, 1);
+ file->f_mapping->ptshare_data = info;
+
+ return retval;
+
+err_free:
+ if (new_mm)
+ mmput(new_mm);
+ kfree(info);
+ return retval;
+}
+
+/*
+ * insert vma into mm holding shared page tables
+ */
+int
+ptshare_insert_vma(struct mm_struct *mm, struct vm_area_struct *vma)
+{
+ struct vm_area_struct *new_vma;
+ int err = 0;
+
+ new_vma = vm_area_dup(vma);
+ if (!new_vma)
+ return -ENOMEM;
+
+ new_vma->vm_file = NULL;
+ /*
+ * This new vma belongs to host mm, so clear the VM_SHARED_PT
+ * flag on this so we know this is the host vma when we clean
+ * up page tables. Do not use THP for page table shared regions
+ */
+ vm_flags_clear(new_vma, (VM_SHARED | VM_SHARED_PT));
+ vm_flags_set(new_vma, VM_NOHUGEPAGE);
+ new_vma->vm_mm = mm;
+
+ err = insert_vm_struct(mm, new_vma);
+ if (err)
+ return -ENOMEM;
+
+ return err;
+}
+
+/*
+ * Free the mm struct created to hold shared PTEs and associated data
+ * structures
+ */
+static inline void
+free_ptshare_mm(struct ptshare_data *info)
+{
+ mmput(info->mm);
+ kfree(info);
+}
+
+/*
+ * This function is called when a reference to the shared PTEs in mm
+ * struct is dropped. It updates refcount and checks to see if last
+ * reference to the mm struct holding shared PTEs has been dropped. If
+ * so, it cleans up the mm struct and associated data structures
+ */
+void
+ptshare_del_mm(struct vm_area_struct *vma)
+{
+ struct ptshare_data *info;
+ struct file *file = vma->vm_file;
+
+ if (!file || (!file->f_mapping))
+ return;
+ info = file->f_mapping->ptshare_data;
+ WARN_ON(!info);
+ if (!info)
+ return;
+
+ if (refcount_dec_and_test(&info->refcnt)) {
+ free_ptshare_mm(info);
+ file->f_mapping->ptshare_data = NULL;
+ }
+}
--
2.37.2