[RFC PATCH 4/7] mm: introduce pte_refcount for user PTE page table page

From: Qi Zheng
Date: Thu Aug 25 2022 - 06:12:23 EST


The following is the largest user PTE page table memory that
can be allocated by a single user process in a 32-bit and a
64-bit system (assuming 4K page size).

+---------------------------+--------+---------+
| | 32-bit | 64-bit |
+===========================+========+=========+
| user PTE page table pages | 3 MiB | 512 GiB |
+---------------------------+--------+---------+
| user PMD page table pages | 3 KiB | 1 GiB |
+---------------------------+--------+---------+
(for 32-bit, take 3G user address space as an example;
for 64-bit, take 48-bit address width as an example.)

Today, 64-bit servers generally have only a few terabytes of
physical memory, and mapping these memory does not require as
many PTE page tables as above, but in some of the following
scenarios, it is still possible to cause huge page table memory
usage.

1. In order to pursue high performance, applications mostly use
some high-performance user-mode memory allocators, such as
jemalloc or tcmalloc. These memory allocators use
madvise(MADV_DONTNEED or MADV_FREE) to release physical memory,
but neither MADV_DONTNEED nor MADV_FREE will release page table
memory, which may cause huge page table memory as follows:

VIRT: 55t
RES: 590g
VmPTE: 110g

In this case, most of the page table entries are empty. For such
a PTE page where all entries are empty, we call it empty PTE page.

2. The shared zero page scenario mentioned by David Hildenbrand:

Especially the shared zeropage is nasty, because there are
sane use cases that can trigger it. Assume you have a VM
(e.g., QEMU) that inflated the balloon to return free memory
to the hypervisor.

Simply migrating that VM will populate the shared zeropage to
all inflated pages, because migration code ends up reading all
VM memory. Similarly, the guest can just read that memory as
well, for example, when the guest issues kdump itself.

In this case, most of the page table entries are mapped to the shared
zero page. For such a PTE page where all page table entries are mapped
to zero pages, we call it zero PTE page.

The page table entries for both types of PTE pages do not record
"meaningful" information, so we can try to free these PTE pages at
some point (such as when memory pressure is high) to reclaim more
memory.

To quickly identify these two types of pages, we have introduced a
pte_refcount for each PTE page. We put the mapped and zero PTE entry
counter into the pte_refcount of the PTE page. The bitmask has the
following meaning:

- bits 0-9 are mapped PTE entry count
- bits 10-19 are zero PTE entry count

Because the mapping and unmapping of PTE entries are under pte_lock,
there is no concurrent thread to modify pte_refcount, so pte_refcount
can be a non-atomic variable with little performance overhead.

Signed-off-by: Qi Zheng <zhengqi.arch@xxxxxxxxxxxxx>
---
include/linux/mm.h | 2 ++
include/linux/mm_types.h | 1 +
include/linux/pte_ref.h | 23 +++++++++++++
mm/Makefile | 2 +-
mm/pte_ref.c | 72 ++++++++++++++++++++++++++++++++++++++++
5 files changed, 99 insertions(+), 1 deletion(-)
create mode 100644 include/linux/pte_ref.h
create mode 100644 mm/pte_ref.c

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 7898e29bcfb5..23e2f1e75b4b 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -28,6 +28,7 @@
#include <linux/sched.h>
#include <linux/pgtable.h>
#include <linux/kasan.h>
+#include <linux/pte_ref.h>

struct mempolicy;
struct anon_vma;
@@ -2336,6 +2337,7 @@ static inline bool pgtable_pte_page_ctor(struct page *page)
return false;
__SetPageTable(page);
inc_lruvec_page_state(page, NR_PAGETABLE);
+ pte_ref_init(page);
return true;
}

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index c29ab4c0cd5c..da2738f87737 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -153,6 +153,7 @@ struct page {
union {
struct mm_struct *pt_mm; /* x86 pgds only */
atomic_t pt_frag_refcount; /* powerpc */
+ unsigned long pte_refcount; /* only for PTE page */
};
#if ALLOC_SPLIT_PTLOCKS
spinlock_t *ptl;
diff --git a/include/linux/pte_ref.h b/include/linux/pte_ref.h
new file mode 100644
index 000000000000..db14e03e1dff
--- /dev/null
+++ b/include/linux/pte_ref.h
@@ -0,0 +1,23 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2022, ByteDance. All rights reserved.
+ *
+ * Author: Qi Zheng <zhengqi.arch@xxxxxxxxxxxxx>
+ */
+
+#ifndef _LINUX_PTE_REF_H
+#define _LINUX_PTE_REF_H
+
+#ifdef CONFIG_FREE_USER_PTE
+
+void pte_ref_init(pgtable_t pte);
+
+#else /* !CONFIG_FREE_USER_PTE */
+
+static inline void pte_ref_init(pgtable_t pte)
+{
+}
+
+#endif /* CONFIG_FREE_USER_PTE */
+
+#endif /* _LINUX_PTE_REF_H */
diff --git a/mm/Makefile b/mm/Makefile
index 6f9ffa968a1a..f8fa5078a13d 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -54,7 +54,7 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
mm_init.o percpu.o slab_common.o \
compaction.o vmacache.o \
interval_tree.o list_lru.o workingset.o \
- debug.o gup.o mmap_lock.o $(mmu-y)
+ debug.o gup.o mmap_lock.o $(mmu-y) pte_ref.o

# Give 'page_alloc' its own module-parameter namespace
page-alloc-y := page_alloc.o
diff --git a/mm/pte_ref.c b/mm/pte_ref.c
new file mode 100644
index 000000000000..12b27646e88c
--- /dev/null
+++ b/mm/pte_ref.c
@@ -0,0 +1,72 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2022, ByteDance. All rights reserved.
+ *
+ * Author: Qi Zheng <zhengqi.arch@xxxxxxxxxxxxx>
+ */
+#include <linux/pgtable.h>
+#include <linux/pte_ref.h>
+
+#ifdef CONFIG_FREE_USER_PTE
+
+/*
+ * For a PTE page where all entries are empty, we call it empty PTE page. For a
+ * PTE page where all page table entries are mapped to zero pages, we call it
+ * zero PTE page.
+ *
+ * The page table entries for both types of PTE pages do not record "meaningful"
+ * information, so we can try to free these PTE pages at some point (such as
+ * when memory pressure is high) to reclaim more memory.
+ *
+ * We put the mapped and zero PTE entry counter into the pte_refcount of the
+ * PTE page. The bitmask has the following meaning:
+ *
+ * - bits 0-9 are mapped PTE entry count
+ * - bits 10-19 are zero PTE entry count
+ *
+ * Because the mapping and unmapping of PTE entries are under pte_lock, there is
+ * no concurrent thread to modify pte_refcount, so pte_refcount can be a
+ * non-atomic variable with little performance overhead.
+ */
+#define PTE_MAPPED_BITS 10
+#define PTE_ZERO_BITS 10
+
+#define PTE_MAPPED_SHIFT 0
+#define PTE_ZERO_SHIFT (PTE_MAPPED_SHIFT + PTE_MAPPED_BITS)
+
+#define __PTE_REF_MASK(x) ((1UL << (x))-1)
+
+#define PTE_MAPPED_MASK (__PTE_REF_MASK(PTE_MAPPED_BITS) << PTE_MAPPED_SHIFT)
+#define PTE_ZERO_MASK (__PTE_REF_MASK(PTE_ZERO_BITS) << PTE_ZERO_SHIFT)
+
+#define PTE_MAPPED_OFFSET (1UL << PTE_MAPPED_SHIFT)
+#define PTE_ZERO_OFFSET (1UL << PTE_ZERO_SHIFT)
+
+static inline unsigned long pte_refcount(pgtable_t pte)
+{
+ return pte->pte_refcount;
+}
+
+#define pte_mapped_count(pte) \
+ ((pte_refcount(pte) & PTE_MAPPED_MASK) >> PTE_MAPPED_SHIFT)
+#define pte_zero_count(pte) \
+ ((pte_refcount(pte) & PTE_ZERO_MASK) >> PTE_ZERO_SHIFT)
+
+static __always_inline void pte_refcount_add(struct mm_struct *mm,
+ pgtable_t pte, int val)
+{
+ pte->pte_refcount += val;
+}
+
+static __always_inline void pte_refcount_sub(struct mm_struct *mm,
+ pgtable_t pte, int val)
+{
+ pte->pte_refcount -= val;
+}
+
+void pte_ref_init(pgtable_t pte)
+{
+ pte->pte_refcount = 0;
+}
+
+#endif /* CONFIG_FREE_USER_PTE */
--
2.20.1