[RFC PATCH v1 3/4] mm/memory: Use ptep_get_lockless_norecency() for orig_pte

From: Ryan Roberts
Date: Thu Feb 15 2024 - 07:19:08 EST


Let's convert handle_pte_fault()'s use of ptep_get_lockless() to
ptep_get_lockless_norecency() to save orig_pte.

There are a number of places that follow this model:

orig_pte = ptep_get_lockless(ptep)
...
<lock>
if (!pte_same(orig_pte, ptep_get(ptep)))
// RACE!
...
<unlock>

So we need to be careful to convert all of those to use
pte_same_norecency() so that the access and dirty bits are excluded from
the comparison.

Additionally there are a couple of places that genuinely rely on the
access and dirty bits of orig_pte, but with some careful refactoring, we
can use ptep_get() once we are holding the lock to achieve equivalent
logic.

Signed-off-by: Ryan Roberts <ryan.roberts@xxxxxxx>
---
mm/memory.c | 55 +++++++++++++++++++++++++++++++++--------------------
1 file changed, 34 insertions(+), 21 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index 8e65fa1884f1..075245314ec3 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3014,7 +3014,7 @@ static inline int pte_unmap_same(struct vm_fault *vmf)
#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPTION)
if (sizeof(pte_t) > sizeof(unsigned long)) {
spin_lock(vmf->ptl);
- same = pte_same(ptep_get(vmf->pte), vmf->orig_pte);
+ same = pte_same_norecency(ptep_get(vmf->pte), vmf->orig_pte);
spin_unlock(vmf->ptl);
}
#endif
@@ -3062,11 +3062,14 @@ static inline int __wp_page_copy_user(struct page *dst, struct page *src,
* take a double page fault, so mark it accessed here.
*/
vmf->pte = NULL;
- if (!arch_has_hw_pte_young() && !pte_young(vmf->orig_pte)) {
+ if (!arch_has_hw_pte_young()) {
pte_t entry;

vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl);
- if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte))) {
+ if (likely(vmf->pte))
+ entry = ptep_get(vmf->pte);
+ if (unlikely(!vmf->pte ||
+ !pte_same_norecency(entry, vmf->orig_pte))) {
/*
* Other thread has already handled the fault
* and update local tlb only
@@ -3077,9 +3080,11 @@ static inline int __wp_page_copy_user(struct page *dst, struct page *src,
goto pte_unlock;
}

- entry = pte_mkyoung(vmf->orig_pte);
- if (ptep_set_access_flags(vma, addr, vmf->pte, entry, 0))
- update_mmu_cache_range(vmf, vma, addr, vmf->pte, 1);
+ if (!pte_young(entry)) {
+ entry = pte_mkyoung(entry);
+ if (ptep_set_access_flags(vma, addr, vmf->pte, entry, 0))
+ update_mmu_cache_range(vmf, vma, addr, vmf->pte, 1);
+ }
}

/*
@@ -3094,7 +3099,8 @@ static inline int __wp_page_copy_user(struct page *dst, struct page *src,

/* Re-validate under PTL if the page is still mapped */
vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl);
- if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte))) {
+ if (unlikely(!vmf->pte ||
+ !pte_same_norecency(ptep_get(vmf->pte), vmf->orig_pte))) {
/* The PTE changed under us, update local tlb */
if (vmf->pte)
update_mmu_tlb(vma, addr, vmf->pte);
@@ -3369,14 +3375,17 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
* Re-check the pte - we dropped the lock
*/
vmf->pte = pte_offset_map_lock(mm, vmf->pmd, vmf->address, &vmf->ptl);
- if (likely(vmf->pte && pte_same(ptep_get(vmf->pte), vmf->orig_pte))) {
+ if (likely(vmf->pte))
+ entry = ptep_get(vmf->pte);
+ if (likely(vmf->pte && pte_same_norecency(entry, vmf->orig_pte))) {
if (old_folio) {
if (!folio_test_anon(old_folio)) {
dec_mm_counter(mm, mm_counter_file(old_folio));
inc_mm_counter(mm, MM_ANONPAGES);
}
} else {
- ksm_might_unmap_zero_page(mm, vmf->orig_pte);
+ /* Needs dirty bit so can't use vmf->orig_pte. */
+ ksm_might_unmap_zero_page(mm, entry);
inc_mm_counter(mm, MM_ANONPAGES);
}
flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
@@ -3494,7 +3503,7 @@ static vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf, struct folio *folio
* We might have raced with another page fault while we released the
* pte_offset_map_lock.
*/
- if (!pte_same(ptep_get(vmf->pte), vmf->orig_pte)) {
+ if (!pte_same_norecency(ptep_get(vmf->pte), vmf->orig_pte)) {
update_mmu_tlb(vmf->vma, vmf->address, vmf->pte);
pte_unmap_unlock(vmf->pte, vmf->ptl);
return VM_FAULT_NOPAGE;
@@ -3883,7 +3892,8 @@ static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf)

vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
&vmf->ptl);
- if (likely(vmf->pte && pte_same(ptep_get(vmf->pte), vmf->orig_pte)))
+ if (likely(vmf->pte &&
+ pte_same_norecency(ptep_get(vmf->pte), vmf->orig_pte)))
restore_exclusive_pte(vma, vmf->page, vmf->address, vmf->pte);

if (vmf->pte)
@@ -3928,7 +3938,7 @@ static vm_fault_t pte_marker_clear(struct vm_fault *vmf)
* quickly from a PTE_MARKER_UFFD_WP into PTE_MARKER_POISONED.
* So is_pte_marker() check is not enough to safely drop the pte.
*/
- if (pte_same(vmf->orig_pte, ptep_get(vmf->pte)))
+ if (pte_same_norecency(vmf->orig_pte, ptep_get(vmf->pte)))
pte_clear(vmf->vma->vm_mm, vmf->address, vmf->pte);
pte_unmap_unlock(vmf->pte, vmf->ptl);
return 0;
@@ -4028,8 +4038,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
vmf->address, &vmf->ptl);
if (unlikely(!vmf->pte ||
- !pte_same(ptep_get(vmf->pte),
- vmf->orig_pte)))
+ !pte_same_norecency(ptep_get(vmf->pte),
+ vmf->orig_pte)))
goto unlock;

/*
@@ -4117,7 +4127,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
vmf->address, &vmf->ptl);
if (likely(vmf->pte &&
- pte_same(ptep_get(vmf->pte), vmf->orig_pte)))
+ pte_same_norecency(ptep_get(vmf->pte),
+ vmf->orig_pte)))
ret = VM_FAULT_OOM;
goto unlock;
}
@@ -4187,7 +4198,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
*/
vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
&vmf->ptl);
- if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte)))
+ if (unlikely(!vmf->pte ||
+ !pte_same_norecency(ptep_get(vmf->pte), vmf->orig_pte)))
goto out_nomap;

if (unlikely(!folio_test_uptodate(folio))) {
@@ -4747,7 +4759,7 @@ void set_pte_range(struct vm_fault *vmf, struct folio *folio,
static bool vmf_pte_changed(struct vm_fault *vmf)
{
if (vmf->flags & FAULT_FLAG_ORIG_PTE_VALID)
- return !pte_same(ptep_get(vmf->pte), vmf->orig_pte);
+ return !pte_same_norecency(ptep_get(vmf->pte), vmf->orig_pte);

return !pte_none(ptep_get(vmf->pte));
}
@@ -5125,7 +5137,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
* the pfn may be screwed if the read is non atomic.
*/
spin_lock(vmf->ptl);
- if (unlikely(!pte_same(ptep_get(vmf->pte), vmf->orig_pte))) {
+ if (unlikely(!pte_same_norecency(ptep_get(vmf->pte), vmf->orig_pte))) {
pte_unmap_unlock(vmf->pte, vmf->ptl);
goto out;
}
@@ -5197,7 +5209,8 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
vmf->address, &vmf->ptl);
if (unlikely(!vmf->pte))
goto out;
- if (unlikely(!pte_same(ptep_get(vmf->pte), vmf->orig_pte))) {
+ if (unlikely(!pte_same_norecency(ptep_get(vmf->pte),
+ vmf->orig_pte))) {
pte_unmap_unlock(vmf->pte, vmf->ptl);
goto out;
}
@@ -5343,7 +5356,7 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
vmf->address, &vmf->ptl);
if (unlikely(!vmf->pte))
return 0;
- vmf->orig_pte = ptep_get_lockless(vmf->pte);
+ vmf->orig_pte = ptep_get_lockless_norecency(vmf->pte);
vmf->flags |= FAULT_FLAG_ORIG_PTE_VALID;

if (pte_none(vmf->orig_pte)) {
@@ -5363,7 +5376,7 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)

spin_lock(vmf->ptl);
entry = vmf->orig_pte;
- if (unlikely(!pte_same(ptep_get(vmf->pte), entry))) {
+ if (unlikely(!pte_same_norecency(ptep_get(vmf->pte), entry))) {
update_mmu_tlb(vmf->vma, vmf->address, vmf->pte);
goto unlock;
}
--
2.25.1