Re: [PATCH v2 2/3] powerpc: get hugetlbpage handling more generic

From: Aneesh Kumar K.V
Date: Mon Sep 19 2016 - 01:46:06 EST


Christophe Leroy <christophe.leroy@xxxxxx> writes:

> Today there are two implementations of hugetlbpages which are managed
> by exclusive #ifdefs:
> * FSL_BOOKE: several directory entries point to the same single hugepage
> * BOOK3S: one upper level directory entry points to a table of hugepages
>
> In preparation of implementation of hugepage support on the 8xx, we
> need a mix of the two above solutions, because the 8xx needs both cases
> depending on the size of pages:
> * In 4k page size mode, each PGD entry covers a 4M bytes area. It means
> that 2 PGD entries will be necessary to cover an 8M hugepage while a
> single PGD entry will cover 8x 512k hugepages.
> * In 16k page size mode, each PGD entry covers a 64M bytes area. It means
> that 8x 8M hugepages will be covered by one PGD entry and 64x 512k
> hugepages will be covered by one PGD entry.
>
> This patch:
> * removes #ifdefs in favor of if/else based on the range sizes
> * merges the two huge_pte_alloc() functions as they are pretty similar
> * merges the two hugetlbpage_init() functions as they are pretty similar
>
> Signed-off-by: Christophe Leroy <christophe.leroy@xxxxxx>
> ---
> v2: This part is new and results from a split of the last patch of the v1
> series into two parts
>
> arch/powerpc/mm/hugetlbpage.c | 189 +++++++++++++++++-------------------------
> 1 file changed, 77 insertions(+), 112 deletions(-)
>
> diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
> index 8a512b1..2119f00 100644
> --- a/arch/powerpc/mm/hugetlbpage.c
> +++ b/arch/powerpc/mm/hugetlbpage.c
> @@ -64,14 +64,16 @@ static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
> {
> struct kmem_cache *cachep;
> pte_t *new;
> -
> -#ifdef CONFIG_PPC_FSL_BOOK3E
> int i;
> - int num_hugepd = 1 << (pshift - pdshift);
> - cachep = hugepte_cache;
> -#else
> - cachep = PGT_CACHE(pdshift - pshift);
> -#endif
> + int num_hugepd;
> +
> + if (pshift >= pdshift) {
> + cachep = hugepte_cache;
> + num_hugepd = 1 << (pshift - pdshift);
> + } else {
> + cachep = PGT_CACHE(pdshift - pshift);
> + num_hugepd = 1;
> + }

Is there a way to hint the likely/unlikely branch based on the page size
selected at build time?



>
> new = kmem_cache_zalloc(cachep, GFP_KERNEL);
>
> @@ -89,7 +91,7 @@ static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
> smp_wmb();
>
> spin_lock(&mm->page_table_lock);
> -#ifdef CONFIG_PPC_FSL_BOOK3E
> +
> /*
> * We have multiple higher-level entries that point to the same
> * actual pte location. Fill in each as we go and backtrack on error.
> @@ -100,8 +102,13 @@ static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
> if (unlikely(!hugepd_none(*hpdp)))
> break;
> else

....

> -#ifdef CONFIG_PPC_FSL_BOOK3E
> struct kmem_cache *hugepte_cache;
> static int __init hugetlbpage_init(void)
> {
> int psize;
>
> - for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
> - unsigned shift;
> -
> - if (!mmu_psize_defs[psize].shift)
> - continue;
> -
> - shift = mmu_psize_to_shift(psize);
> -
> - /* Don't treat normal page sizes as huge... */
> - if (shift != PAGE_SHIFT)
> - if (add_huge_page_size(1ULL << shift) < 0)
> - continue;
> - }
> -
> - /*
> - * Create a kmem cache for hugeptes. The bottom bits in the pte have
> - * size information encoded in them, so align them to allow this
> - */
> - hugepte_cache = kmem_cache_create("hugepte-cache", sizeof(pte_t),
> - HUGEPD_SHIFT_MASK + 1, 0, NULL);
> - if (hugepte_cache == NULL)
> - panic("%s: Unable to create kmem cache for hugeptes\n",
> - __func__);
> -
> - /* Default hpage size = 4M */
> - if (mmu_psize_defs[MMU_PAGE_4M].shift)
> - HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_4M].shift;
> - else
> - panic("%s: Unable to set default huge page size\n", __func__);
> -
> -
> - return 0;
> -}
> -#else
> -static int __init hugetlbpage_init(void)
> -{
> - int psize;
> -
> +#if !defined(CONFIG_PPC_FSL_BOOK3E)
> if (!radix_enabled() && !mmu_has_feature(MMU_FTR_16M_PAGE))
> return -ENODEV;
> -
> +#endif

Do we need that #if? radix_enabled() should evaluate to 0 and the if
condition should be removed at compile time, shouldn't it? Or are you
finding errors with that?


> for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
> unsigned shift;
> unsigned pdshift;
> @@ -860,16 +807,31 @@ static int __init hugetlbpage_init(void)
> * if we have pdshift and shift value same, we don't
> * use pgt cache for hugepd.
> */
> - if (pdshift != shift) {
> + if (pdshift > shift) {
> pgtable_cache_add(pdshift - shift, NULL);
> if (!PGT_CACHE(pdshift - shift))
> panic("hugetlbpage_init(): could not create "
> "pgtable cache for %d bit pagesize\n", shift);
> + } else if (!hugepte_cache) {
> + /*
> + * Create a kmem cache for hugeptes. The bottom bits in
> + * the pte have size information encoded in them, so
> + * align them to allow this
> + */
> + hugepte_cache = kmem_cache_create("hugepte-cache",
> + sizeof(pte_t),
> + HUGEPD_SHIFT_MASK + 1,
> + 0, NULL);
> + if (hugepte_cache == NULL)
> + panic("%s: Unable to create kmem cache "
> + "for hugeptes\n", __func__);
> +


We don't need hugepte_cache for book3s 64K. I guess we will end up
creating one here?

> }
> }
>
> /* Set default large page size. Currently, we pick 16M or 1M
> * depending on what is available
> + * We select 4M on other ones.
> */
> if (mmu_psize_defs[MMU_PAGE_16M].shift)
> HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_16M].shift;
> @@ -877,11 +839,14 @@ static int __init hugetlbpage_init(void)
> HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_1M].shift;
> else if (mmu_psize_defs[MMU_PAGE_2M].shift)
> HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_2M].shift;
> -
> + else if (mmu_psize_defs[MMU_PAGE_4M].shift)
> + HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_4M].shift;
> + else
> + panic("%s: Unable to set default huge page size\n", __func__);
>
> return 0;
> }
> -#endif
> +
> arch_initcall(hugetlbpage_init);
>
> void flush_dcache_icache_hugepage(struct page *page)
> --
> 2.1.0