Re: [PATCH v12 13/31] mm: cache some VMA fields in the vm_fault structure

From: Jerome Glisse
Date: Mon Apr 22 2019 - 16:07:06 EST


On Tue, Apr 16, 2019 at 03:45:04PM +0200, Laurent Dufour wrote:
> When handling speculative page fault, the vma->vm_flags and
> vma->vm_page_prot fields are read once the page table lock is released. So
> there is no more guarantee that these fields would not change in our back.
> They will be saved in the vm_fault structure before the VMA is checked for
> changes.
>
> In the detail, when we deal with a speculative page fault, the mmap_sem is
> not taken, so parallel VMA's changes can occurred. When a VMA change is
> done which will impact the page fault processing, we assumed that the VMA
> sequence counter will be changed. In the page fault processing, at the
> time the PTE is locked, we checked the VMA sequence counter to detect
> changes done in our back. If no change is detected we can continue further.
> But this doesn't prevent the VMA to not be changed in our back while the
> PTE is locked. So VMA's fields which are used while the PTE is locked must
> be saved to ensure that we are using *static* values. This is important
> since the PTE changes will be made on regards to these VMA fields and they
> need to be consistent. This concerns the vma->vm_flags and
> vma->vm_page_prot VMA fields.
>
> This patch also set the fields in hugetlb_no_page() and
> __collapse_huge_page_swapin even if it is not need for the callee.
>
> Signed-off-by: Laurent Dufour <ldufour@xxxxxxxxxxxxx>

I am unsure about something see below, so you might need to update
that one but it would not change the structure of the patch thus:

Reviewed-by: Jérôme Glisse <jglisse@xxxxxxxxxx>

> ---
> include/linux/mm.h | 10 +++++++--
> mm/huge_memory.c | 6 +++---
> mm/hugetlb.c | 2 ++
> mm/khugepaged.c | 2 ++
> mm/memory.c | 53 ++++++++++++++++++++++++----------------------
> mm/migrate.c | 2 +-
> 6 files changed, 44 insertions(+), 31 deletions(-)
>
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 5d45b7d8718d..f465bb2b049e 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -439,6 +439,12 @@ struct vm_fault {
> * page table to avoid allocation from
> * atomic context.
> */
> + /*
> + * These entries are required when handling speculative page fault.
> + * This way the page handling is done using consistent field values.
> + */
> + unsigned long vma_flags;
> + pgprot_t vma_page_prot;
> };
>
> /* page entry size for vm->huge_fault() */
> @@ -781,9 +787,9 @@ void free_compound_page(struct page *page);
> * pte_mkwrite. But get_user_pages can cause write faults for mappings
> * that do not have writing enabled, when used by access_process_vm.
> */
> -static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
> +static inline pte_t maybe_mkwrite(pte_t pte, unsigned long vma_flags)
> {
> - if (likely(vma->vm_flags & VM_WRITE))
> + if (likely(vma_flags & VM_WRITE))
> pte = pte_mkwrite(pte);
> return pte;
> }
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 823688414d27..865886a689ee 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -1244,8 +1244,8 @@ static vm_fault_t do_huge_pmd_wp_page_fallback(struct vm_fault *vmf,
>
> for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
> pte_t entry;
> - entry = mk_pte(pages[i], vma->vm_page_prot);
> - entry = maybe_mkwrite(pte_mkdirty(entry), vma);
> + entry = mk_pte(pages[i], vmf->vma_page_prot);
> + entry = maybe_mkwrite(pte_mkdirty(entry), vmf->vma_flags);
> memcg = (void *)page_private(pages[i]);
> set_page_private(pages[i], 0);
> page_add_new_anon_rmap(pages[i], vmf->vma, haddr, false);
> @@ -2228,7 +2228,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
> entry = pte_swp_mksoft_dirty(entry);
> } else {
> entry = mk_pte(page + i, READ_ONCE(vma->vm_page_prot));
> - entry = maybe_mkwrite(entry, vma);
> + entry = maybe_mkwrite(entry, vma->vm_flags);
> if (!write)
> entry = pte_wrprotect(entry);
> if (!young)
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index 109f5de82910..13246da4bc50 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -3812,6 +3812,8 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
> .vma = vma,
> .address = haddr,
> .flags = flags,
> + .vma_flags = vma->vm_flags,
> + .vma_page_prot = vma->vm_page_prot,

Shouldn't you use READ_ONCE ? I doubt compiler will do something creative
with struct initialization but as you are using WRITE_ONCE to update those
fields maybe pairing read with READ_ONCE where the mmap_sem is not always
taken might make sense.

> /*
> * Hard to debug if it ends up being
> * used by a callee that assumes
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index 6a0cbca3885e..42469037240a 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -888,6 +888,8 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm,
> .flags = FAULT_FLAG_ALLOW_RETRY,
> .pmd = pmd,
> .pgoff = linear_page_index(vma, address),
> + .vma_flags = vma->vm_flags,
> + .vma_page_prot = vma->vm_page_prot,

Same as above.

[...]

> return VM_FAULT_FALLBACK;
> @@ -3924,6 +3925,8 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
> .flags = flags,
> .pgoff = linear_page_index(vma, address),
> .gfp_mask = __get_fault_gfp_mask(vma),
> + .vma_flags = vma->vm_flags,
> + .vma_page_prot = vma->vm_page_prot,

Same as above

> };
> unsigned int dirty = flags & FAULT_FLAG_WRITE;
> struct mm_struct *mm = vma->vm_mm;
> diff --git a/mm/migrate.c b/mm/migrate.c
> index f2ecc2855a12..a9138093a8e2 100644
> --- a/mm/migrate.c
> +++ b/mm/migrate.c
> @@ -240,7 +240,7 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma,
> */
> entry = pte_to_swp_entry(*pvmw.pte);
> if (is_write_migration_entry(entry))
> - pte = maybe_mkwrite(pte, vma);
> + pte = maybe_mkwrite(pte, vma->vm_flags);
>
> if (unlikely(is_zone_device_page(new))) {
> if (is_device_private_page(new)) {
> --
> 2.21.0
>