[RFC PATCH 8/8] powerpc/8xx: Add back support for 8M pages using contiguous PTE entries

From: Christophe Leroy
Date: Mon Mar 25 2024 - 12:45:13 EST


In order to fit better with the standard Linux page table layout, add
support for 8M pages using contiguous PTE entries in standard
page tables. The two page tables covering an 8M area will then each be
populated with 1024 similar entries, and the two PMD entries covering
that area will point to those page tables.

The PMD entries also get a flag telling that they address an 8M page;
this is required for the HW tablewalk assistance.

Signed-off-by: Christophe Leroy <christophe.leroy@xxxxxxxxxx>
---
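(Illustration only, not part of the patch: with 4k base pages an 8M
page spans two PMD slots of 1024 PTEs each. For an 8M page at physical
address pa, the layout set up by this patch is roughly:

	pmdp[0] = __pa(pt0) | _PMD_PRESENT | _PMD_PAGE_8M
	pmdp[1] = __pa(pt1) | _PMD_PRESENT | _PMD_PAGE_8M
	pt0[i]  = pte(pa + i * SZ_4K)		/* i = 0..1023, first 4M */
	pt1[i]  = pte(pa + SZ_4M + i * SZ_4K)	/* second 4M */

Each page table holds 4k-style entries stepped by 4k; only the
_PMD_PAGE_8M flag tells the HW tablewalk it is an 8M page.)
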
arch/powerpc/include/asm/hugetlb.h | 11 ++++-
.../include/asm/nohash/32/hugetlb-8xx.h | 28 +++++++++++-
arch/powerpc/include/asm/nohash/32/pgalloc.h | 2 +
arch/powerpc/include/asm/nohash/32/pte-8xx.h | 43 +++++++++++++++++--
arch/powerpc/include/asm/pgtable.h | 1 +
arch/powerpc/kernel/head_8xx.S | 1 +
arch/powerpc/mm/hugetlbpage.c | 12 +++++-
arch/powerpc/mm/nohash/8xx.c | 31 ++++++++++---
arch/powerpc/mm/nohash/tlb.c | 3 ++
arch/powerpc/mm/pgtable.c | 24 +++++++----
arch/powerpc/mm/pgtable_32.c | 2 +-
11 files changed, 134 insertions(+), 24 deletions(-)

diff --git a/arch/powerpc/include/asm/hugetlb.h b/arch/powerpc/include/asm/hugetlb.h
index a05657e5701b..bd60ea134f8e 100644
--- a/arch/powerpc/include/asm/hugetlb.h
+++ b/arch/powerpc/include/asm/hugetlb.h
@@ -41,7 +41,16 @@ void hugetlb_free_pgd_range(struct mmu_gather *tlb, unsigned long addr,
static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
unsigned long addr, pte_t *ptep)
{
- return __pte(pte_update(mm, addr, ptep, ~0UL, 0, 1));
+ pmd_t *pmdp = (pmd_t *)ptep;
+ pte_t pte;
+
+ if (pmdp == pmd_off(mm, ALIGN_DOWN(addr, SZ_8M))) {
+ pte = __pte(pte_update(mm, addr, pte_offset_kernel(pmdp, 0), ~0UL, 0, 1));
+ pte_update(mm, addr, pte_offset_kernel(pmdp + 1, 0), ~0UL, 0, 1);
+ } else {
+ pte = __pte(pte_update(mm, addr, ptep, ~0UL, 0, 1));
+ }
+ return pte;
}

#define __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH
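
(Illustration, not part of the patch: the comparison against pmd_off()
recognises 8M mappings because huge_pte_alloc() hands back the PMD
entry itself, cast to pte_t *, instead of a PTE pointer. The recurring
check is equivalent to a predicate like the following (hypothetical
name):

	static inline bool ptep_is_8m_pmdp(struct mm_struct *mm,
					   unsigned long addr, pte_t *ptep)
	{
		return (pmd_t *)ptep == pmd_off(mm, ALIGN_DOWN(addr, SZ_8M));
	}

When it matches, both 4M halves of the mapping are updated through
their respective page tables, as above.)
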
diff --git a/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h b/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h
index 178ed9fdd353..1414cfd28987 100644
--- a/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h
+++ b/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h
@@ -15,6 +15,16 @@ static inline int check_and_get_huge_psize(int shift)
return shift_to_mmu_psize(shift);
}

+#define __HAVE_ARCH_HUGE_PTEP_GET
+static inline pte_t huge_ptep_get(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
+{
+ pmd_t *pmdp = (pmd_t *)ptep;
+
+ if (pmdp == pmd_off(mm, ALIGN_DOWN(addr, SZ_8M)))
+ ptep = pte_offset_kernel(pmdp, 0);
+ return ptep_get(ptep);
+}
+
#define __HAVE_ARCH_HUGE_SET_HUGE_PTE_AT
void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep,
pte_t pte, unsigned long sz);
@@ -23,7 +33,14 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep,
static inline void huge_pte_clear(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, unsigned long sz)
{
- pte_update(mm, addr, ptep, ~0UL, 0, 1);
+ pmd_t *pmdp = (pmd_t *)ptep;
+
+ if (pmdp == pmd_off(mm, ALIGN_DOWN(addr, SZ_8M))) {
+ pte_update(mm, addr, pte_offset_kernel(pmdp, 0), ~0UL, 0, 1);
+ pte_update(mm, addr, pte_offset_kernel(pmdp + 1, 0), ~0UL, 0, 1);
+ } else {
+ pte_update(mm, addr, ptep, ~0UL, 0, 1);
+ }
}

#define __HAVE_ARCH_HUGE_PTEP_SET_WRPROTECT
@@ -33,7 +50,14 @@ static inline void huge_ptep_set_wrprotect(struct mm_struct *mm,
unsigned long clr = ~pte_val(pte_wrprotect(__pte(~0)));
unsigned long set = pte_val(pte_wrprotect(__pte(0)));

- pte_update(mm, addr, ptep, clr, set, 1);
+ pmd_t *pmdp = (pmd_t *)ptep;
+
+ if (pmdp == pmd_off(mm, ALIGN_DOWN(addr, SZ_8M))) {
+ pte_update(mm, addr, pte_offset_kernel(pmdp, 0), clr, set, 1);
+ pte_update(mm, addr, pte_offset_kernel(pmdp + 1, 0), clr, set, 1);
+ } else {
+ pte_update(mm, addr, ptep, clr, set, 1);
+ }
}

#ifdef CONFIG_PPC_4K_PAGES
diff --git a/arch/powerpc/include/asm/nohash/32/pgalloc.h b/arch/powerpc/include/asm/nohash/32/pgalloc.h
index 11eac371e7e0..ff4f90cfb461 100644
--- a/arch/powerpc/include/asm/nohash/32/pgalloc.h
+++ b/arch/powerpc/include/asm/nohash/32/pgalloc.h
@@ -14,6 +14,7 @@
#define __pmd_free_tlb(tlb,x,a) do { } while (0)
/* #define pgd_populate(mm, pmd, pte) BUG() */

+#ifndef CONFIG_PPC_8xx
static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmdp,
pte_t *pte)
{
@@ -31,5 +32,6 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmdp,
else
*pmdp = __pmd(__pa(pte_page) | _PMD_USER | _PMD_PRESENT);
}
+#endif

#endif /* _ASM_POWERPC_PGALLOC_32_H */
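
(Not part of the patch: these generic helpers are compiled out on 8xx
because pte-8xx.h below now defines pmd_populate() and
pmd_populate_kernel() on top of pmd_populate_size(), which also knows
how to flag a page table as backing an 8M page.)
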
diff --git a/arch/powerpc/include/asm/nohash/32/pte-8xx.h b/arch/powerpc/include/asm/nohash/32/pte-8xx.h
index 004d7e825af2..b05cc4f87713 100644
--- a/arch/powerpc/include/asm/nohash/32/pte-8xx.h
+++ b/arch/powerpc/include/asm/nohash/32/pte-8xx.h
@@ -129,14 +129,23 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr,
}
#define ptep_set_wrprotect ptep_set_wrprotect

+static pmd_t *pmd_off(struct mm_struct *mm, unsigned long addr);
+static inline pte_t *pte_offset_kernel(pmd_t *pmd, unsigned long address);
+
static inline void __ptep_set_access_flags(struct vm_area_struct *vma, pte_t *ptep,
pte_t entry, unsigned long address, int psize)
{
unsigned long set = pte_val(entry) & (_PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_EXEC);
unsigned long clr = ~pte_val(entry) & _PAGE_RO;
int huge = psize > mmu_virtual_psize ? 1 : 0;
+ pmd_t *pmdp = (pmd_t *)ptep;

- pte_update(vma->vm_mm, address, ptep, clr, set, huge);
+ if (pmdp == pmd_off(vma->vm_mm, ALIGN_DOWN(address, SZ_8M))) {
+ pte_update(vma->vm_mm, address, pte_offset_kernel(pmdp, 0), clr, set, huge);
+ pte_update(vma->vm_mm, address, pte_offset_kernel(pmdp + 1, 0), clr, set, huge);
+ } else {
+ pte_update(vma->vm_mm, address, ptep, clr, set, huge);
+ }

flush_tlb_page(vma, address);
}
@@ -146,6 +155,8 @@ static inline unsigned long pte_leaf_size(pmd_t pmd, pte_t pte)
{
pte_basic_t val = pte_val(pte);

+ if (pmd_val(pmd) & _PMD_PAGE_8M)
+ return SZ_8M;
if (val & _PAGE_HUGE)
return SZ_512K;
if (val & _PAGE_SPS)
@@ -159,14 +170,16 @@ static inline unsigned long pte_leaf_size(pmd_t pmd, pte_t pte)
* On the 8xx, the page tables are a bit special. For 16k pages, we have
* 4 identical entries. For 512k pages, we have 128 entries as if it was
* 4k pages, but they are flagged as 512k pages for the hardware.
- * For other page sizes, we have a single entry in the table.
+ * For 8M pages, we have two page tables of 1024 entries each, filled
+ * as if they mapped 4k pages, but flagged as 8M pages for the
+ * hardware. For 4k pages, we have a single entry in the table.
*/
-static pmd_t *pmd_off(struct mm_struct *mm, unsigned long addr);
-
static inline int number_of_cells_per_pte(pmd_t *pmd, pte_basic_t val, int huge)
{
if (!huge)
return PAGE_SIZE / SZ_4K;
+ else if ((pmd_val(*pmd) & _PMD_PAGE_MASK) == _PMD_PAGE_8M)
+ return SZ_4M / SZ_4K;
else if (IS_ENABLED(CONFIG_PPC_4K_PAGES) && !(val & _PAGE_HUGE))
return SZ_16K / SZ_4K;
else
@@ -209,6 +222,28 @@ static inline pte_t ptep_get(pte_t *ptep)
}
#endif /* CONFIG_PPC_16K_PAGES */

+static inline void pmd_populate_kernel_size(struct mm_struct *mm, pmd_t *pmdp,
+ pte_t *pte, unsigned long sz)
+{
+ if (sz == SZ_8M)
+ *pmdp = __pmd(__pa(pte) | _PMD_PRESENT | _PMD_PAGE_8M);
+ else
+ *pmdp = __pmd(__pa(pte) | _PMD_PRESENT);
+}
+
+static inline void pmd_populate_size(struct mm_struct *mm, pmd_t *pmdp,
+ pgtable_t pte_page, unsigned long sz)
+{
+ if (sz == SZ_8M)
+ *pmdp = __pmd(__pa(pte_page) | _PMD_USER | _PMD_PRESENT | _PMD_PAGE_8M);
+ else
+ *pmdp = __pmd(__pa(pte_page) | _PMD_USER | _PMD_PRESENT);
+}
+#define pmd_populate_size pmd_populate_size
+
+#define pmd_populate(mm, pmdp, pte) pmd_populate_size(mm, pmdp, pte, PAGE_SIZE)
+#define pmd_populate_kernel(mm, pmdp, pte) pmd_populate_kernel_size(mm, pmdp, pte, PAGE_SIZE)
+
#endif

#endif /* __KERNEL__ */
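
(Illustration, not part of the patch: with number_of_cells_per_pte()
returning SZ_4M / SZ_4K for an 8M PMD, a huge pte_update() fans one
value out over a whole page table, stepping the RPN by 4k per cell,
roughly:

	num = number_of_cells_per_pte(pmd, new, huge);	/* 1024 for 8M */
	for (i = 0; i < num; i++, entry++, new += SZ_4K)
		*entry = new;

The callers in hugetlb-8xx.h then repeat the update for the second PMD
slot to cover the full 8M.)
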
diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h
index 239709a2f68e..005dad336565 100644
--- a/arch/powerpc/include/asm/pgtable.h
+++ b/arch/powerpc/include/asm/pgtable.h
@@ -106,6 +106,7 @@ unsigned long vmalloc_to_phys(void *vmalloc_addr);

void pgtable_cache_add(unsigned int shift);

+void __init *early_alloc_pgtable(unsigned long size);
pte_t *early_pte_alloc_kernel(pmd_t *pmdp, unsigned long va);

#if defined(CONFIG_STRICT_KERNEL_RWX) || defined(CONFIG_PPC32)
diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S
index b53af565b132..43919ae0bd11 100644
--- a/arch/powerpc/kernel/head_8xx.S
+++ b/arch/powerpc/kernel/head_8xx.S
@@ -415,6 +415,7 @@ FixupDAR:/* Entry point for dcbx workaround. */
oris r11, r11, (swapper_pg_dir - PAGE_OFFSET)@ha
3:
lwz r11, (swapper_pg_dir-PAGE_OFFSET)@l(r11) /* Get the level 1 entry */
+ rlwinm r11, r11, 0, ~_PMD_PAGE_8M
mtspr SPRN_MD_TWC, r11
mfspr r11, SPRN_MD_TWC
lwz r11, 0(r11) /* Get the pte */
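
(Not part of the patch: the new rlwinm masks _PMD_PAGE_8M out of the
level-1 entry before it is written to MD_TWC, so that the software walk
in FixupDAR indexes the level-2 table as a standard 1024-entry table
instead of having the page-size bits skew the fetch.)
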
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 4e9fbd5b895d..dd29845ce0ce 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -195,7 +195,17 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr, unsigned long sz)
{
- return pte_alloc_huge(mm, pmd_off(mm, addr), addr, sz);
+ pmd_t *pmd = pmd_off(mm, addr);
+
+ if (sz == SZ_512K)
+ return pte_alloc_huge(mm, pmd, addr, sz);
+ if (sz != SZ_8M)
+ return NULL;
+ if (!pte_alloc_huge(mm, pmd, addr, sz))
+ return NULL;
+ if (!pte_alloc_huge(mm, pmd + 1, addr, sz))
+ return NULL;
+ return (pte_t *)pmd;
}
#endif
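
(Illustration, not part of the patch: for sz == SZ_8M both page tables
are allocated up front and the caller gets the PMD entry back, so

	ptep = huge_pte_alloc(mm, vma, addr, SZ_8M);

leaves ptep == (pte_t *)pmd_off(mm, addr), which is exactly what the
pmd_off() comparisons in the hugetlb helpers test for.)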

diff --git a/arch/powerpc/mm/nohash/8xx.c b/arch/powerpc/mm/nohash/8xx.c
index fc10e08bcb85..b416bfc161d4 100644
--- a/arch/powerpc/mm/nohash/8xx.c
+++ b/arch/powerpc/mm/nohash/8xx.c
@@ -54,25 +54,40 @@ static int __ref __early_map_kernel_hugepage(unsigned long va, phys_addr_t pa,
pmd_t *pmdp = pmd_off_k(va);
pte_t *ptep;

- if (WARN_ON(psize != MMU_PAGE_512K))
+ if (WARN_ON(psize != MMU_PAGE_512K && psize != MMU_PAGE_8M))
return -EINVAL;

if (new) {
if (WARN_ON(slab_is_available()))
return -EINVAL;

- ptep = early_pte_alloc_kernel(pmdp, va);
+ if (psize == MMU_PAGE_8M) {
+ if (WARN_ON(!pmd_none(*pmdp) || !pmd_none(*(pmdp + 1))))
+ return -EINVAL;
+
+ ptep = early_alloc_pgtable(PTE_FRAG_SIZE);
+ pmd_populate_kernel_size(&init_mm, pmdp, ptep, SZ_8M);
+
+ ptep = early_alloc_pgtable(PTE_FRAG_SIZE);
+ pmd_populate_kernel_size(&init_mm, pmdp + 1, ptep, SZ_8M);
+
+ ptep = (pte_t *)pmdp;
+ } else {
+ ptep = early_pte_alloc_kernel(pmdp, va);
+ /* The PTE should never be already present */
+ if (WARN_ON(pte_present(*ptep) && pgprot_val(prot)))
+ return -EINVAL;
+ }
} else {
- ptep = pte_offset_kernel(pmdp, va);
+ if (psize == MMU_PAGE_8M)
+ ptep = (pte_t *)pmdp;
+ else
+ ptep = pte_offset_kernel(pmdp, va);
}

if (WARN_ON(!ptep))
return -ENOMEM;

- /* The PTE should never be already present */
- if (new && WARN_ON(pte_present(*ptep) && pgprot_val(prot)))
- return -EINVAL;
-
set_huge_pte_at(&init_mm, va, ptep,
pte_mkhuge(pfn_pte(pa >> PAGE_SHIFT, prot)),
1UL << mmu_psize_to_shift(psize));
@@ -110,6 +125,8 @@ static void mmu_mapin_ram_chunk(unsigned long offset, unsigned long top,

for (; p < ALIGN(p, SZ_8M) && p < top; p += SZ_512K, v += SZ_512K)
__early_map_kernel_hugepage(v, p, prot, MMU_PAGE_512K, new);
+ for (; p < ALIGN_DOWN(top, SZ_8M) && p < top; p += SZ_8M, v += SZ_8M)
+ __early_map_kernel_hugepage(v, p, prot, MMU_PAGE_8M, new);
for (; p < ALIGN_DOWN(top, SZ_512K) && p < top; p += SZ_512K, v += SZ_512K)
__early_map_kernel_hugepage(v, p, prot, MMU_PAGE_512K, new);
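
(Worked example, not from the patch: the three loops map a 512k-aligned
head up to the first 8M boundary, then whole 8M pages, then a 512k
tail. For offset = 0x700000 and top = 0x1900000 this gives 512k pages
at 0x700000 and 0x780000, 8M pages at 0x800000 and 0x1000000, then
512k pages at 0x1800000 and 0x1880000.)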

diff --git a/arch/powerpc/mm/nohash/tlb.c b/arch/powerpc/mm/nohash/tlb.c
index cb2afe39cee5..5ffa0af4328a 100644
--- a/arch/powerpc/mm/nohash/tlb.c
+++ b/arch/powerpc/mm/nohash/tlb.c
@@ -104,6 +104,9 @@ struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT] = {
[MMU_PAGE_512K] = {
.shift = 19,
},
+ [MMU_PAGE_8M] = {
+ .shift = 23,
+ },
};
#endif

diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c
index acdf64c9b93e..59f0d7706d2f 100644
--- a/arch/powerpc/mm/pgtable.c
+++ b/arch/powerpc/mm/pgtable.c
@@ -297,11 +297,8 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma,
}

#if defined(CONFIG_PPC_8xx)
-void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep,
- pte_t pte, unsigned long sz)
+static void __set_huge_pte_at(pmd_t *pmd, pte_t *ptep, pte_basic_t val)
{
- pmd_t *pmd = pmd_off(mm, addr);
- pte_basic_t val;
pte_basic_t *entry = (pte_basic_t *)ptep;
int num, i;

@@ -311,15 +308,26 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep,
*/
VM_WARN_ON(pte_hw_valid(*ptep) && !pte_protnone(*ptep));

- pte = set_pte_filter(pte, addr);
-
- val = pte_val(pte);
-
num = number_of_cells_per_pte(pmd, val, 1);

for (i = 0; i < num; i++, entry++, val += SZ_4K)
*entry = val;
}
+
+void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep,
+ pte_t pte, unsigned long sz)
+{
+ pmd_t *pmdp = pmd_off(mm, addr);
+
+ pte = set_pte_filter(pte, addr);
+
+ if (sz == SZ_8M) {
+ __set_huge_pte_at(pmdp, pte_offset_kernel(pmdp, 0), pte_val(pte));
+ __set_huge_pte_at(pmdp, pte_offset_kernel(pmdp + 1, 0), pte_val(pte) + SZ_4M);
+ } else {
+ __set_huge_pte_at(pmdp, ptep, pte_val(pte));
+ }
+}
#endif
#endif /* CONFIG_HUGETLB_PAGE */
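
(Not part of the patch: number_of_cells_per_pte() returns 1024 here
because huge_pte_alloc() already flagged both PMD entries with
_PMD_PAGE_8M, and __set_huge_pte_at() steps the value by SZ_4K per
cell, so passing pte_val(pte) + SZ_4M for the second page table keeps
the physical range contiguous across the whole 8M.)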

diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c
index face94977cb2..0b1d68ef87cd 100644
--- a/arch/powerpc/mm/pgtable_32.c
+++ b/arch/powerpc/mm/pgtable_32.c
@@ -48,7 +48,7 @@ notrace void __init early_ioremap_init(void)
early_ioremap_setup();
}

-static void __init *early_alloc_pgtable(unsigned long size)
+void __init *early_alloc_pgtable(unsigned long size)
{
void *ptr = memblock_alloc(size, size);

--
2.43.0