[PATCH] arm64/mm: add soft dirty page tracking

From: Shivansh Vij
Date: Mon Mar 11 2024 - 21:17:12 EST


Checkpoint-Restore in Userspace (CRIU) needs to be able
to track which memory pages have changed if we want to
enable pre-dumping, which is important for live migration.
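
For context, the userspace side of this feature is the documented
soft-dirty ABI (Documentation/admin-guide/mm/soft-dirty.rst): writing
"4" to /proc/<pid>/clear_refs clears the soft-dirty bits, and bit 55
of each 64-bit /proc/<pid>/pagemap entry reports whether the page has
been written since. A minimal sketch of a checker (error handling
trimmed; not part of this patch):

  #include <fcntl.h>
  #include <stdint.h>
  #include <stdio.h>
  #include <sys/types.h>
  #include <unistd.h>

  /* Returns 1 if the page containing vaddr is soft-dirty, 0 if it
   * is clean, -1 on error. Bit 55 of a pagemap entry is the
   * soft-dirty flag. */
  static int page_soft_dirty(pid_t pid, uintptr_t vaddr)
  {
          char path[64];
          uint64_t entry;
          int fd;

          snprintf(path, sizeof(path), "/proc/%d/pagemap", (int)pid);
          fd = open(path, O_RDONLY);
          if (fd < 0)
                  return -1;
          /* pagemap holds one 8-byte entry per virtual page */
          if (pread(fd, &entry, sizeof(entry),
                    (off_t)(vaddr / sysconf(_SC_PAGESIZE)) *
                            sizeof(entry)) != sizeof(entry)) {
                  close(fd);
                  return -1;
          }
          close(fd);
          return (int)((entry >> 55) & 1);
  }

A pre-dump loop is then essentially: clear the bits, let the task
run, and transfer only the pages whose bit 55 came back set.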

The PTE_DIRTY bit (defined in pgtable-prot.h) is already
used to track software dirty pages, and the PTE_WRITE (DBM)
and PTE_RDONLY bits are used to track hardware dirty pages.
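
For reference, the existing dirty helpers in
arch/arm64/include/asm/pgtable.h look roughly like this (paraphrased
from recent kernels; the exact form varies by version):

  /* HW dirty: DBM (aliased to PTE_WRITE) set while PTE_RDONLY is
   * clear, i.e. hardware made the page writable after a write. */
  #define pte_hw_dirty(pte) (pte_write(pte) && !(pte_val(pte) & PTE_RDONLY))
  /* SW dirty: the kernel dirtied the page via pte_mkdirty(). */
  #define pte_sw_dirty(pte) (!!(pte_val(pte) & PTE_DIRTY))
  #define pte_dirty(pte)    (pte_sw_dirty(pte) || pte_hw_dirty(pte))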

This patch enables full soft dirty page tracking
(including swap PTE support) for arm64 systems, and is
based very closely on the x86 implementation.

It is based on an unfinished patch by
Bin Lu (bin.lu@xxxxxxx) from 2017
(https://patchwork.kernel.org/project/linux-arm-kernel/patch/1512029649-61312-1-git-send-email-bin.lu@xxxxxxx/),
updated for newer 6.x kernels and also tested on various
5.x kernels.

The main difference is that this patch attempts to fix the
bug identified in the original one, where calling
pte_mkclean() on a page would result in
pte_soft_dirty() == false. That behaviour is invalid,
because pte_soft_dirty() should only return false if the
PTE_DIRTY bit is not set and the pte_mksoft_dirty()
function has not been called; the x86 implementation
expects this behaviour as well.
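
Concretely, the sequence that went wrong, shown against the helpers
added below (illustrative only, not part of the diff):

  pte = pte_mksoft_dirty(pte);  /* or pte_mkdirty(), which now also
                                 * sets PTE_SOFT_DIRTY */
  pte = pte_mkclean(pte);       /* clears PTE_DIRTY only */
  /* original 2017 patch: pte_soft_dirty(pte) == false, state lost;
   * with this patch:     pte_soft_dirty(pte) == true, because the
   * dedicated PTE_SOFT_DIRTY bit survives pte_mkclean(). */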

To achieve this, an additional software dirty bit called
PTE_SOFT_DIRTY is defined (in pgtable-prot.h), which is used
exclusively to track soft dirty pages.
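
For orientation, the upper software-defined PTE bits on arm64 are
allocated roughly as follows on the kernels this was tested against
(assignments can shift between versions; only bit 60 is new here):

  /* bit 55: PTE_DIRTY            (software dirty)
   * bit 56: PTE_SPECIAL
   * bit 57: PTE_DEVMAP
   * bit 58: PTE_PROT_NONE
   * bit 59: PMD_PRESENT_INVALID  (see the pgtable-prot.h hunk below)
   * bit 60: PTE_SOFT_DIRTY       (added by this patch)
   */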

This patch also reuses the _PAGE_SWP_SOFT_DIRTY
bit (defined in pgtable.h) from the original patch to add
support for swapped pages, and it extends the tracking to
THP pages (including the MADV_FREE path), since the
corresponding pmd_* helpers have also been implemented.
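
The bit position works out to __SWP_OFFSET_SHIFT +
__SWP_OFFSET_BITS + __SWP_PROT_NONE_BITS = 8 + 50 + 1 = 59, i.e. the
first free bit above PTE_PROT_NONE (bit 58). For context, generic mm
code propagates the flag when a mapping is torn down for swap,
roughly as follows (paraphrasing try_to_unmap_one() in mm/rmap.c):

  swp_pte = swp_entry_to_pte(entry);
  if (pte_soft_dirty(pteval))
          swp_pte = pte_swp_mksoft_dirty(swp_pte);
  set_pte_at(mm, address, pvmw.pte, swp_pte);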

This patch has been tested with CRIU's ZDTM test suite on
5.x and 6.x kernels using the following command:
test/zdtm.py run --page-server --remote-lazy-pages --keep-going --pre 3 -a

Signed-off-by: Shivansh Vij <shivanshvij@xxxxxxxxxxx>
---
arch/arm64/Kconfig | 1 +
arch/arm64/include/asm/pgtable-prot.h | 6 +++
arch/arm64/include/asm/pgtable.h | 54 ++++++++++++++++++++++++++-
3 files changed, 60 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index aa7c1d435139..fe73d4809c7e 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -178,6 +178,7 @@ config ARM64
select HAVE_ARCH_RANDOMIZE_KSTACK_OFFSET
select HAVE_ARCH_SECCOMP_FILTER
+ select HAVE_ARCH_SOFT_DIRTY
select HAVE_ARCH_STACKLEAK
select HAVE_ARCH_THREAD_STRUCT_WHITELIST
select HAVE_ARCH_TRACEHOOK
select HAVE_ARCH_TRANSPARENT_HUGEPAGE
diff --git a/arch/arm64/include/asm/pgtable-prot.h b/arch/arm64/include/asm/pgtable-prot.h
index 483dbfa39c4c..1b4119bbdf01 100644
--- a/arch/arm64/include/asm/pgtable-prot.h
+++ b/arch/arm64/include/asm/pgtable-prot.h
@@ -27,6 +27,12 @@
*/
#define PMD_PRESENT_INVALID (_AT(pteval_t, 1) << 59) /* only when !PMD_SECT_VALID */

+#ifdef CONFIG_MEM_SOFT_DIRTY
+#define PTE_SOFT_DIRTY (_AT(pteval_t, 1) << 60) /* for soft dirty tracking */
+#else
+#define PTE_SOFT_DIRTY 0UL
+#endif /* CONFIG_MEM_SOFT_DIRTY */
+
#define _PROT_DEFAULT (PTE_TYPE_PAGE | PTE_AF | PTE_SHARED)
#define _PROT_SECT_DEFAULT (PMD_TYPE_SECT | PMD_SECT_AF | PMD_SECT_S)

diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 79ce70fbb751..0e699e7d96da 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -198,7 +198,7 @@ static inline pte_t pte_mkclean(pte_t pte)

static inline pte_t pte_mkdirty(pte_t pte)
{
- pte = set_pte_bit(pte, __pgprot(PTE_DIRTY));
+ pte = set_pte_bit(pte, __pgprot(PTE_DIRTY | PTE_SOFT_DIRTY));

if (pte_write(pte))
pte = clear_pte_bit(pte, __pgprot(PTE_RDONLY));
@@ -443,6 +443,29 @@ static inline pgprot_t pte_pgprot(pte_t pte)
return __pgprot(pte_val(pfn_pte(pfn, __pgprot(0))) ^ pte_val(pte));
}

+#ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY
+static inline bool pte_soft_dirty(pte_t pte)
+{
+ return pte_sw_dirty(pte) || (!!(pte_val(pte) & PTE_SOFT_DIRTY));
+}
+
+static inline pte_t pte_mksoft_dirty(pte_t pte)
+{
+ pte = set_pte_bit(pte, __pgprot(PTE_SOFT_DIRTY));
+ return pte;
+}
+
+static inline pte_t pte_clear_soft_dirty(pte_t pte)
+{
+ pte = clear_pte_bit(pte, __pgprot(PTE_SOFT_DIRTY));
+ return pte;
+}
+
+#define pmd_soft_dirty(pmd) pte_soft_dirty(pmd_pte(pmd))
+#define pmd_mksoft_dirty(pmd) pte_pmd(pte_mksoft_dirty(pmd_pte(pmd)))
+#define pmd_clear_soft_dirty(pmd) pte_pmd(pte_clear_soft_dirty(pmd_pte(pmd)))
+#endif /* CONFIG_HAVE_ARCH_SOFT_DIRTY */
+
#ifdef CONFIG_NUMA_BALANCING
/*
* See the comment in include/linux/pgtable.h
@@ -1013,10 +1036,12 @@ static inline pmd_t pmdp_establish(struct vm_area_struct *vma,
* bits 3-7: swap type
* bits 8-57: swap offset
* bit 58: PTE_PROT_NONE (must be zero)
+ * bit 59: _PAGE_SWP_SOFT_DIRTY (swap soft dirty tracking)
*/
#define __SWP_TYPE_SHIFT 3
#define __SWP_TYPE_BITS 5
#define __SWP_OFFSET_BITS 50
+#define __SWP_PROT_NONE_BITS 1
#define __SWP_TYPE_MASK ((1 << __SWP_TYPE_BITS) - 1)
#define __SWP_OFFSET_SHIFT (__SWP_TYPE_BITS + __SWP_TYPE_SHIFT)
#define __SWP_OFFSET_MASK ((1UL << __SWP_OFFSET_BITS) - 1)
@@ -1033,6 +1058,33 @@ static inline pmd_t pmdp_establish(struct vm_area_struct *vma,
#define __swp_entry_to_pmd(swp) __pmd((swp).val)
#endif /* CONFIG_ARCH_ENABLE_THP_MIGRATION */

+#ifdef CONFIG_MEM_SOFT_DIRTY
+#define _PAGE_SWP_SOFT_DIRTY (1UL << (__SWP_OFFSET_SHIFT + __SWP_OFFSET_BITS + __SWP_PROT_NONE_BITS))
+#else
+#define _PAGE_SWP_SOFT_DIRTY 0UL
+#endif /* CONFIG_MEM_SOFT_DIRTY */
+
+#ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY
+static inline bool pte_swp_soft_dirty(pte_t pte)
+{
+ return !!(pte_val(pte) & _PAGE_SWP_SOFT_DIRTY);
+}
+
+static inline pte_t pte_swp_mksoft_dirty(pte_t pte)
+{
+ return __pte(pte_val(pte) | _PAGE_SWP_SOFT_DIRTY);
+}
+
+static inline pte_t pte_swp_clear_soft_dirty(pte_t pte)
+{
+ return __pte(pte_val(pte) & ~_PAGE_SWP_SOFT_DIRTY);
+}
+
+#define pmd_swp_soft_dirty(pmd) pte_swp_soft_dirty(pmd_pte(pmd))
+#define pmd_swp_mksoft_dirty(pmd) pte_pmd(pte_swp_mksoft_dirty(pmd_pte(pmd)))
+#define pmd_swp_clear_soft_dirty(pmd) pte_pmd(pte_swp_clear_soft_dirty(pmd_pte(pmd)))
+#endif /* CONFIG_HAVE_ARCH_SOFT_DIRTY */
+
/*
* Ensure that there are not more swap files than can be encoded in the kernel
* PTEs.
--
2.34.3