Re: [PATCH 18 of 66] add pmd mangling functions to x86

From: Mel Gorman
Date: Thu Nov 18 2010 - 08:05:43 EST


On Wed, Nov 03, 2010 at 04:27:53PM +0100, Andrea Arcangeli wrote:
> From: Andrea Arcangeli <aarcange@xxxxxxxxxx>
>
> Add needed pmd mangling functions with simmetry with their pte counterparts.

symmetry

> pmdp_freeze_flush is the only exception only present on the pmd side and it's
> needed to serialize the VM against split_huge_page, it simply atomically clears
> the present bit in the same way pmdp_clear_flush_young atomically clears the
> accessed bit (and both need to flush the tlb to make it effective, which is
> mandatory to happen synchronously for pmdp_freeze_flush).

I don't see a pmdp_freeze_flush defined in the patch. Did yu mean
pmdp_splitting_flush? Even if it is, it's the splitting bit you are
dealing with which isn't the same as the present bit. I'm missing
something.

>
> Signed-off-by: Andrea Arcangeli <aarcange@xxxxxxxxxx>
> Acked-by: Rik van Riel <riel@xxxxxxxxxx>
> ---
>
> diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
> --- a/arch/x86/include/asm/pgtable.h
> +++ b/arch/x86/include/asm/pgtable.h
> @@ -302,15 +302,15 @@ pmd_t *populate_extra_pmd(unsigned long
> pte_t *populate_extra_pte(unsigned long vaddr);
> #endif /* __ASSEMBLY__ */
>
> +#ifndef __ASSEMBLY__
> +#include <linux/mm_types.h>
> +
> #ifdef CONFIG_X86_32
> # include "pgtable_32.h"
> #else
> # include "pgtable_64.h"
> #endif
>
> -#ifndef __ASSEMBLY__
> -#include <linux/mm_types.h>
> -

Stupid quetion: Why is this move necessary?

> static inline int pte_none(pte_t pte)
> {
> return !pte.pte;
> @@ -353,7 +353,7 @@ static inline unsigned long pmd_page_vad
> * Currently stuck as a macro due to indirect forward reference to
> * linux/mmzone.h's __section_mem_map_addr() definition:
> */
> -#define pmd_page(pmd) pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT)
> +#define pmd_page(pmd) pfn_to_page((pmd_val(pmd) & PTE_PFN_MASK) >> PAGE_SHIFT)
>

Why is it now necessary to use PTE_PFN_MASK?

> /*
> * the pmd page can be thought of an array like this: pmd_t[PTRS_PER_PMD]
> diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
> --- a/arch/x86/include/asm/pgtable_64.h
> +++ b/arch/x86/include/asm/pgtable_64.h
> @@ -59,6 +59,16 @@ static inline void native_set_pte_atomic
> native_set_pte(ptep, pte);
> }
>
> +static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd)
> +{
> + *pmdp = pmd;
> +}
> +
> +static inline void native_pmd_clear(pmd_t *pmd)
> +{
> + native_set_pmd(pmd, native_make_pmd(0));
> +}
> +
> static inline pte_t native_ptep_get_and_clear(pte_t *xp)
> {
> #ifdef CONFIG_SMP
> @@ -72,14 +82,17 @@ static inline pte_t native_ptep_get_and_
> #endif
> }
>
> -static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd)
> +static inline pmd_t native_pmdp_get_and_clear(pmd_t *xp)
> {
> - *pmdp = pmd;
> -}
> -
> -static inline void native_pmd_clear(pmd_t *pmd)
> -{
> - native_set_pmd(pmd, native_make_pmd(0));
> +#ifdef CONFIG_SMP
> + return native_make_pmd(xchg(&xp->pmd, 0));
> +#else
> + /* native_local_pmdp_get_and_clear,
> + but duplicated because of cyclic dependency */
> + pmd_t ret = *xp;
> + native_pmd_clear(xp);
> + return ret;
> +#endif
> }
>
> static inline void native_set_pud(pud_t *pudp, pud_t pud)
> @@ -181,6 +194,98 @@ static inline int pmd_trans_huge(pmd_t p
> }
> #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
>
> +#define mk_pmd(page, pgprot) pfn_pmd(page_to_pfn(page), (pgprot))
> +
> +#define __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS
> +extern int pmdp_set_access_flags(struct vm_area_struct *vma,
> + unsigned long address, pmd_t *pmdp,
> + pmd_t entry, int dirty);
> +
> +#define __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG
> +extern int pmdp_test_and_clear_young(struct vm_area_struct *vma,
> + unsigned long addr, pmd_t *pmdp);
> +
> +#define __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH
> +extern int pmdp_clear_flush_young(struct vm_area_struct *vma,
> + unsigned long address, pmd_t *pmdp);
> +
> +
> +#define __HAVE_ARCH_PMDP_SPLITTING_FLUSH
> +extern void pmdp_splitting_flush(struct vm_area_struct *vma,
> + unsigned long addr, pmd_t *pmdp);
> +
> +#define __HAVE_ARCH_PMD_WRITE
> +static inline int pmd_write(pmd_t pmd)
> +{
> + return pmd_flags(pmd) & _PAGE_RW;
> +}
> +
> +#define __HAVE_ARCH_PMDP_GET_AND_CLEAR
> +static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm, unsigned long addr,
> + pmd_t *pmdp)
> +{
> + pmd_t pmd = native_pmdp_get_and_clear(pmdp);
> + pmd_update(mm, addr, pmdp);
> + return pmd;
> +}
> +
> +#define __HAVE_ARCH_PMDP_SET_WRPROTECT
> +static inline void pmdp_set_wrprotect(struct mm_struct *mm,
> + unsigned long addr, pmd_t *pmdp)
> +{
> + clear_bit(_PAGE_BIT_RW, (unsigned long *)&pmdp->pmd);
> + pmd_update(mm, addr, pmdp);
> +}
> +
> +static inline int pmd_young(pmd_t pmd)
> +{
> + return pmd_flags(pmd) & _PAGE_ACCESSED;
> +}
> +
> +static inline pmd_t pmd_set_flags(pmd_t pmd, pmdval_t set)
> +{
> + pmdval_t v = native_pmd_val(pmd);
> +
> + return native_make_pmd(v | set);
> +}
> +
> +static inline pmd_t pmd_clear_flags(pmd_t pmd, pmdval_t clear)
> +{
> + pmdval_t v = native_pmd_val(pmd);
> +
> + return native_make_pmd(v & ~clear);
> +}
> +
> +static inline pmd_t pmd_mkold(pmd_t pmd)
> +{
> + return pmd_clear_flags(pmd, _PAGE_ACCESSED);
> +}
> +
> +static inline pmd_t pmd_wrprotect(pmd_t pmd)
> +{
> + return pmd_clear_flags(pmd, _PAGE_RW);
> +}
> +
> +static inline pmd_t pmd_mkdirty(pmd_t pmd)
> +{
> + return pmd_set_flags(pmd, _PAGE_DIRTY);
> +}
> +
> +static inline pmd_t pmd_mkhuge(pmd_t pmd)
> +{
> + return pmd_set_flags(pmd, _PAGE_PSE);
> +}
> +
> +static inline pmd_t pmd_mkyoung(pmd_t pmd)
> +{
> + return pmd_set_flags(pmd, _PAGE_ACCESSED);
> +}
> +
> +static inline pmd_t pmd_mkwrite(pmd_t pmd)
> +{
> + return pmd_set_flags(pmd, _PAGE_RW);
> +}
> +
> #endif /* !__ASSEMBLY__ */
>
> #endif /* _ASM_X86_PGTABLE_64_H */
> diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
> --- a/arch/x86/mm/pgtable.c
> +++ b/arch/x86/mm/pgtable.c
> @@ -320,6 +320,25 @@ int ptep_set_access_flags(struct vm_area
> return changed;
> }
>
> +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
> +int pmdp_set_access_flags(struct vm_area_struct *vma,
> + unsigned long address, pmd_t *pmdp,
> + pmd_t entry, int dirty)
> +{
> + int changed = !pmd_same(*pmdp, entry);
> +
> + VM_BUG_ON(address & ~HPAGE_PMD_MASK);
> +
> + if (changed && dirty) {
> + *pmdp = entry;
> + pmd_update_defer(vma->vm_mm, address, pmdp);
> + flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
> + }
> +
> + return changed;
> +}
> +#endif
> +
> int ptep_test_and_clear_young(struct vm_area_struct *vma,
> unsigned long addr, pte_t *ptep)
> {
> @@ -335,6 +354,23 @@ int ptep_test_and_clear_young(struct vm_
> return ret;
> }
>
> +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
> +int pmdp_test_and_clear_young(struct vm_area_struct *vma,
> + unsigned long addr, pmd_t *pmdp)
> +{
> + int ret = 0;
> +
> + if (pmd_young(*pmdp))
> + ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
> + (unsigned long *) &pmdp->pmd);
> +
> + if (ret)
> + pmd_update(vma->vm_mm, addr, pmdp);
> +
> + return ret;
> +}
> +#endif
> +
> int ptep_clear_flush_young(struct vm_area_struct *vma,
> unsigned long address, pte_t *ptep)
> {
> @@ -347,6 +383,36 @@ int ptep_clear_flush_young(struct vm_are
> return young;
> }
>
> +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
> +int pmdp_clear_flush_young(struct vm_area_struct *vma,
> + unsigned long address, pmd_t *pmdp)
> +{
> + int young;
> +
> + VM_BUG_ON(address & ~HPAGE_PMD_MASK);
> +
> + young = pmdp_test_and_clear_young(vma, address, pmdp);
> + if (young)
> + flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
> +
> + return young;
> +}
> +
> +void pmdp_splitting_flush(struct vm_area_struct *vma,
> + unsigned long address, pmd_t *pmdp)
> +{
> + int set;
> + VM_BUG_ON(address & ~HPAGE_PMD_MASK);
> + set = !test_and_set_bit(_PAGE_BIT_SPLITTING,
> + (unsigned long *)&pmdp->pmd);
> + if (set) {
> + pmd_update(vma->vm_mm, address, pmdp);
> + /* need tlb flush only to serialize against gup-fast */
> + flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
> + }
> +}
> +#endif
> +

The implementations look fine but I'm having trouble reconsiling what
the leader says with the patch :(

> /**
> * reserve_top_address - reserves a hole in the top of kernel address space
> * @reserve - size of hole to reserve
>

--
Mel Gorman
Part-time Phd Student Linux Technology Center
University of Limerick IBM Dublin Software Lab
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/