Re: [PATCH] LoongArch: mm: Refactor TLB handlers

From: Huacai Chen
Date: Sun Oct 09 2022 - 02:14:13 EST


Queued for loongarch-next, thanks.

Huacai

On Tue, Oct 4, 2022 at 10:02 PM Rui Wang <wangrui@xxxxxxxxxxx> wrote:
>
> This patch simplifies TLB load, store and modify exception handlers:
>
> 1. Reduce the number of instructions, such as ALU/CSR operations and memory accesses.
> 2. Execute tlbsrch only in the fast path.
> 3. Return directly from the fast path for huge pages.
>
> It also fixes a concurrent modification issue in the huge page fast path.
> The issue can occur in the following sequence:
>
> CPU-1 (In TLB exception)              CPU-2 (In THP)
> 1: Load PMD entry (HUGE=1)
> 2: Goto huge path
>                                       3: Store PMD entry (HUGE=0)
> 4: Reload PMD entry (HUGE=0)
> 5: Fill TLB entry (PA is incorrect)
>
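> In essence, the fix is that the huge path must keep working on the PMD
> value it has already loaded and never re-read it after the HUGE bit has
> been tested. A rough C-level sketch of the intended flow (illustrative
> only; the real handler is the assembly below, and fill_huge_tlb() /
> fill_pte_tlb() are hypothetical stand-ins for the entrylo setup code):
>
>         unsigned long entry = READ_ONCE(*pmdp); /* load the PMD entry exactly once */
>
>         if (entry & _PAGE_HUGE) {
>                 /* Fast path: build the TLB entry from 'entry' itself,
>                  * without reloading *pmdp, and return directly. */
>                 fill_huge_tlb(entry);
>                 return;
>         }
>
>         /* Otherwise continue the walk down to the PTE level as before. */
>         fill_pte_tlb(entry);
>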
> This also slightly improves the TLB processing performance:
>
> * Normal pages: 2.15%
> * Huge pages: 1.70%
>
> Test program used for the measurements:
>
> #include <stdio.h>
> #include <stdlib.h>
> #include <unistd.h>
> #include <sys/mman.h>
>
> int main(int argc, char *argv[])
> {
>         size_t page_size;
>         size_t mem_size;
>         size_t off;
>         void *base;
>         int flags;
>         int i;
>
>         if (argc < 2) {
>                 fprintf(stderr, "%s MEM_SIZE [HUGE]\n", argv[0]);
>                 return -1;
>         }
>
>         page_size = sysconf(_SC_PAGESIZE);
>         flags = MAP_PRIVATE | MAP_ANONYMOUS;
>         mem_size = strtoul(argv[1], NULL, 10);
>         if (argc > 2)
>                 flags |= MAP_HUGETLB;
>
>         for (i = 0; i < 10; i++) {
>                 base = mmap(NULL, mem_size, PROT_READ, flags, -1, 0);
>                 if (base == MAP_FAILED) {
>                         fprintf(stderr, "Map memory failed!\n");
>                         return -1;
>                 }
>
>                 for (off = 0; off < mem_size; off += page_size)
>                         *(volatile int *)(base + off);
>
>                 munmap(base, mem_size);
>         }
>
>         return 0;
> }
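>
> To reproduce, the program can be built with something like
> "gcc -O2 tlb_bench.c -o tlb_bench" (the file name is arbitrary) and timed
> for both cases, e.g. "time ./tlb_bench $((1 << 30))" versus
> "time ./tlb_bench $((1 << 30)) huge"; the huge page case needs a hugetlb
> pool reserved, e.g. via /proc/sys/vm/nr_hugepages.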
>
> Signed-off-by: Rui Wang <wangrui@xxxxxxxxxxx>
> ---
> arch/loongarch/mm/tlbex.S | 223 ++++++++++++++++----------------------
> 1 file changed, 93 insertions(+), 130 deletions(-)
>
> diff --git a/arch/loongarch/mm/tlbex.S b/arch/loongarch/mm/tlbex.S
> index 39743337999e..c97bcaad2ff4 100644
> --- a/arch/loongarch/mm/tlbex.S
> +++ b/arch/loongarch/mm/tlbex.S
> @@ -10,6 +10,11 @@
> #include <asm/regdef.h>
> #include <asm/stackframe.h>
>
> +#define PTRS_PER_PGD_BITS (PAGE_SHIFT - 3)
> +#define PTRS_PER_PUD_BITS (PAGE_SHIFT - 3)
> +#define PTRS_PER_PMD_BITS (PAGE_SHIFT - 3)
> +#define PTRS_PER_PTE_BITS (PAGE_SHIFT - 3)
> +
> .macro tlb_do_page_fault, write
> SYM_FUNC_START(tlb_do_page_fault_\write)
> SAVE_ALL
> @@ -52,25 +57,17 @@ SYM_FUNC_START(handle_tlb_load)
>
> vmalloc_done_load:
> /* Get PGD offset in bytes */
> - srli.d t0, t0, PGDIR_SHIFT
> - andi t0, t0, (PTRS_PER_PGD - 1)
> - slli.d t0, t0, 3
> - add.d t1, t1, t0
> + bstrpick.d ra, t0, PTRS_PER_PGD_BITS + PGDIR_SHIFT - 1, PGDIR_SHIFT
> + alsl.d t1, ra, t1, 3
> #if CONFIG_PGTABLE_LEVELS > 3
> - csrrd t0, LOONGARCH_CSR_BADV
> ld.d t1, t1, 0
> - srli.d t0, t0, PUD_SHIFT
> - andi t0, t0, (PTRS_PER_PUD - 1)
> - slli.d t0, t0, 3
> - add.d t1, t1, t0
> + bstrpick.d ra, t0, PTRS_PER_PUD_BITS + PUD_SHIFT - 1, PUD_SHIFT
> + alsl.d t1, ra, t1, 3
> #endif
> #if CONFIG_PGTABLE_LEVELS > 2
> - csrrd t0, LOONGARCH_CSR_BADV
> ld.d t1, t1, 0
> - srli.d t0, t0, PMD_SHIFT
> - andi t0, t0, (PTRS_PER_PMD - 1)
> - slli.d t0, t0, 3
> - add.d t1, t1, t0
> + bstrpick.d ra, t0, PTRS_PER_PMD_BITS + PMD_SHIFT - 1, PMD_SHIFT
> + alsl.d t1, ra, t1, 3
> #endif
> ld.d ra, t1, 0
>
> @@ -79,27 +76,20 @@ vmalloc_done_load:
> * instead contains the tlb pte. Check the PAGE_HUGE bit and
> * see if we need to jump to huge tlb processing.
> */
> - andi t0, ra, _PAGE_HUGE
> - bnez t0, tlb_huge_update_load
> + rotri.d ra, ra, _PAGE_HUGE_SHIFT + 1
> + bltz ra, tlb_huge_update_load
>
> - csrrd t0, LOONGARCH_CSR_BADV
> - srli.d t0, t0, PAGE_SHIFT
> - andi t0, t0, (PTRS_PER_PTE - 1)
> - slli.d t0, t0, _PTE_T_LOG2
> - add.d t1, ra, t0
> + rotri.d ra, ra, 64 - (_PAGE_HUGE_SHIFT + 1)
> + bstrpick.d t0, t0, PTRS_PER_PTE_BITS + PAGE_SHIFT - 1, PAGE_SHIFT
> + alsl.d t1, t0, ra, _PTE_T_LOG2
>
> #ifdef CONFIG_SMP
> smp_pgtable_change_load:
> -#endif
> -#ifdef CONFIG_SMP
> ll.d t0, t1, 0
> #else
> ld.d t0, t1, 0
> #endif
> - tlbsrch
> -
> - srli.d ra, t0, _PAGE_PRESENT_SHIFT
> - andi ra, ra, 1
> + andi ra, t0, _PAGE_PRESENT
> beqz ra, nopage_tlb_load
>
> ori t0, t0, _PAGE_VALID
> @@ -109,8 +99,8 @@ smp_pgtable_change_load:
> #else
> st.d t0, t1, 0
> #endif
> - ori t1, t1, 8
> - xori t1, t1, 8
> + tlbsrch
> + bstrins.d t1, zero, 3, 3
> ld.d t0, t1, 0
> ld.d t1, t1, 8
> csrwr t0, LOONGARCH_CSR_TLBELO0
> @@ -133,23 +123,22 @@ vmalloc_load:
> */
> tlb_huge_update_load:
> #ifdef CONFIG_SMP
> - ll.d t0, t1, 0
> -#else
> - ld.d t0, t1, 0
> + ll.d ra, t1, 0
> #endif
> - srli.d ra, t0, _PAGE_PRESENT_SHIFT
> - andi ra, ra, 1
> - beqz ra, nopage_tlb_load
> - tlbsrch
> + andi t0, ra, _PAGE_PRESENT
> + beqz t0, nopage_tlb_load
>
> - ori t0, t0, _PAGE_VALID
> #ifdef CONFIG_SMP
> + ori t0, ra, _PAGE_VALID
> sc.d t0, t1, 0
> beqz t0, tlb_huge_update_load
> - ld.d t0, t1, 0
> + ori t0, ra, _PAGE_VALID
> #else
> + rotri.d ra, ra, 64 - (_PAGE_HUGE_SHIFT + 1)
> + ori t0, ra, _PAGE_VALID
> st.d t0, t1, 0
> #endif
> + tlbsrch
> addu16i.d t1, zero, -(CSR_TLBIDX_EHINV >> 16)
> addi.d ra, t1, 0
> csrxchg ra, t1, LOONGARCH_CSR_TLBIDX
> @@ -173,9 +162,8 @@ tlb_huge_update_load:
> srli.d t1, t1, (_PAGE_HGLOBAL_SHIFT - _PAGE_GLOBAL_SHIFT)
> or t0, t0, t1
>
> - addi.d ra, t0, 0
> - csrwr t0, LOONGARCH_CSR_TLBELO0
> - addi.d t0, ra, 0
> + move ra, t0
> + csrwr ra, LOONGARCH_CSR_TLBELO0
>
> /* Convert to entrylo1 */
> addi.d t1, zero, 1
> @@ -193,6 +181,11 @@ tlb_huge_update_load:
> addu16i.d t0, zero, (CSR_TLBIDX_PS >> 16)
> addu16i.d t1, zero, (PS_DEFAULT_SIZE << (CSR_TLBIDX_PS_SHIFT - 16))
> csrxchg t1, t0, LOONGARCH_CSR_TLBIDX
> +leave_huge_load:
> + csrrd t0, EXCEPTION_KS0
> + csrrd t1, EXCEPTION_KS1
> + csrrd ra, EXCEPTION_KS2
> + ertn
>
> nopage_tlb_load:
> dbar 0
> @@ -215,26 +208,17 @@ SYM_FUNC_START(handle_tlb_store)
>
> vmalloc_done_store:
> /* Get PGD offset in bytes */
> - srli.d t0, t0, PGDIR_SHIFT
> - andi t0, t0, (PTRS_PER_PGD - 1)
> - slli.d t0, t0, 3
> - add.d t1, t1, t0
> -
> + bstrpick.d ra, t0, PTRS_PER_PGD_BITS + PGDIR_SHIFT - 1, PGDIR_SHIFT
> + alsl.d t1, ra, t1, 3
> #if CONFIG_PGTABLE_LEVELS > 3
> - csrrd t0, LOONGARCH_CSR_BADV
> ld.d t1, t1, 0
> - srli.d t0, t0, PUD_SHIFT
> - andi t0, t0, (PTRS_PER_PUD - 1)
> - slli.d t0, t0, 3
> - add.d t1, t1, t0
> + bstrpick.d ra, t0, PTRS_PER_PUD_BITS + PUD_SHIFT - 1, PUD_SHIFT
> + alsl.d t1, ra, t1, 3
> #endif
> #if CONFIG_PGTABLE_LEVELS > 2
> - csrrd t0, LOONGARCH_CSR_BADV
> ld.d t1, t1, 0
> - srli.d t0, t0, PMD_SHIFT
> - andi t0, t0, (PTRS_PER_PMD - 1)
> - slli.d t0, t0, 3
> - add.d t1, t1, t0
> + bstrpick.d ra, t0, PTRS_PER_PMD_BITS + PMD_SHIFT - 1, PMD_SHIFT
> + alsl.d t1, ra, t1, 3
> #endif
> ld.d ra, t1, 0
>
> @@ -243,28 +227,21 @@ vmalloc_done_store:
> * instead contains the tlb pte. Check the PAGE_HUGE bit and
> * see if we need to jump to huge tlb processing.
> */
> - andi t0, ra, _PAGE_HUGE
> - bnez t0, tlb_huge_update_store
> + rotri.d ra, ra, _PAGE_HUGE_SHIFT + 1
> + bltz ra, tlb_huge_update_store
>
> - csrrd t0, LOONGARCH_CSR_BADV
> - srli.d t0, t0, PAGE_SHIFT
> - andi t0, t0, (PTRS_PER_PTE - 1)
> - slli.d t0, t0, _PTE_T_LOG2
> - add.d t1, ra, t0
> + rotri.d ra, ra, 64 - (_PAGE_HUGE_SHIFT + 1)
> + bstrpick.d t0, t0, PTRS_PER_PTE_BITS + PAGE_SHIFT - 1, PAGE_SHIFT
> + alsl.d t1, t0, ra, _PTE_T_LOG2
>
> #ifdef CONFIG_SMP
> smp_pgtable_change_store:
> -#endif
> -#ifdef CONFIG_SMP
> ll.d t0, t1, 0
> #else
> ld.d t0, t1, 0
> #endif
> - tlbsrch
> -
> - srli.d ra, t0, _PAGE_PRESENT_SHIFT
> - andi ra, ra, ((_PAGE_PRESENT | _PAGE_WRITE) >> _PAGE_PRESENT_SHIFT)
> - xori ra, ra, ((_PAGE_PRESENT | _PAGE_WRITE) >> _PAGE_PRESENT_SHIFT)
> + andi ra, t0, _PAGE_PRESENT | _PAGE_WRITE
> + xori ra, ra, _PAGE_PRESENT | _PAGE_WRITE
> bnez ra, nopage_tlb_store
>
> ori t0, t0, (_PAGE_VALID | _PAGE_DIRTY | _PAGE_MODIFIED)
> @@ -274,9 +251,8 @@ smp_pgtable_change_store:
> #else
> st.d t0, t1, 0
> #endif
> -
> - ori t1, t1, 8
> - xori t1, t1, 8
> + tlbsrch
> + bstrins.d t1, zero, 3, 3
> ld.d t0, t1, 0
> ld.d t1, t1, 8
> csrwr t0, LOONGARCH_CSR_TLBELO0
> @@ -299,25 +275,23 @@ vmalloc_store:
> */
> tlb_huge_update_store:
> #ifdef CONFIG_SMP
> - ll.d t0, t1, 0
> -#else
> - ld.d t0, t1, 0
> + ll.d ra, t1, 0
> #endif
> - srli.d ra, t0, _PAGE_PRESENT_SHIFT
> - andi ra, ra, ((_PAGE_PRESENT | _PAGE_WRITE) >> _PAGE_PRESENT_SHIFT)
> - xori ra, ra, ((_PAGE_PRESENT | _PAGE_WRITE) >> _PAGE_PRESENT_SHIFT)
> - bnez ra, nopage_tlb_store
> -
> - tlbsrch
> - ori t0, t0, (_PAGE_VALID | _PAGE_DIRTY | _PAGE_MODIFIED)
> + andi t0, ra, _PAGE_PRESENT | _PAGE_WRITE
> + xori t0, t0, _PAGE_PRESENT | _PAGE_WRITE
> + bnez t0, nopage_tlb_store
>
> #ifdef CONFIG_SMP
> + ori t0, ra, (_PAGE_VALID | _PAGE_DIRTY | _PAGE_MODIFIED)
> sc.d t0, t1, 0
> beqz t0, tlb_huge_update_store
> - ld.d t0, t1, 0
> + ori t0, ra, (_PAGE_VALID | _PAGE_DIRTY | _PAGE_MODIFIED)
> #else
> + rotri.d ra, ra, 64 - (_PAGE_HUGE_SHIFT + 1)
> + ori t0, ra, (_PAGE_VALID | _PAGE_DIRTY | _PAGE_MODIFIED)
> st.d t0, t1, 0
> #endif
> + tlbsrch
> addu16i.d t1, zero, -(CSR_TLBIDX_EHINV >> 16)
> addi.d ra, t1, 0
> csrxchg ra, t1, LOONGARCH_CSR_TLBIDX
> @@ -340,9 +314,8 @@ tlb_huge_update_store:
> srli.d t1, t1, (_PAGE_HGLOBAL_SHIFT - _PAGE_GLOBAL_SHIFT)
> or t0, t0, t1
>
> - addi.d ra, t0, 0
> - csrwr t0, LOONGARCH_CSR_TLBELO0
> - addi.d t0, ra, 0
> + move ra, t0
> + csrwr ra, LOONGARCH_CSR_TLBELO0
>
> /* Convert to entrylo1 */
> addi.d t1, zero, 1
> @@ -361,6 +334,11 @@ tlb_huge_update_store:
> addu16i.d t0, zero, (CSR_TLBIDX_PS >> 16)
> addu16i.d t1, zero, (PS_DEFAULT_SIZE << (CSR_TLBIDX_PS_SHIFT - 16))
> csrxchg t1, t0, LOONGARCH_CSR_TLBIDX
> +leave_huge_store:
> + csrrd t0, EXCEPTION_KS0
> + csrrd t1, EXCEPTION_KS1
> + csrrd ra, EXCEPTION_KS2
> + ertn
>
> nopage_tlb_store:
> dbar 0
> @@ -383,25 +361,17 @@ SYM_FUNC_START(handle_tlb_modify)
>
> vmalloc_done_modify:
> /* Get PGD offset in bytes */
> - srli.d t0, t0, PGDIR_SHIFT
> - andi t0, t0, (PTRS_PER_PGD - 1)
> - slli.d t0, t0, 3
> - add.d t1, t1, t0
> + bstrpick.d ra, t0, PTRS_PER_PGD_BITS + PGDIR_SHIFT - 1, PGDIR_SHIFT
> + alsl.d t1, ra, t1, 3
> #if CONFIG_PGTABLE_LEVELS > 3
> - csrrd t0, LOONGARCH_CSR_BADV
> ld.d t1, t1, 0
> - srli.d t0, t0, PUD_SHIFT
> - andi t0, t0, (PTRS_PER_PUD - 1)
> - slli.d t0, t0, 3
> - add.d t1, t1, t0
> + bstrpick.d ra, t0, PTRS_PER_PUD_BITS + PUD_SHIFT - 1, PUD_SHIFT
> + alsl.d t1, ra, t1, 3
> #endif
> #if CONFIG_PGTABLE_LEVELS > 2
> - csrrd t0, LOONGARCH_CSR_BADV
> ld.d t1, t1, 0
> - srli.d t0, t0, PMD_SHIFT
> - andi t0, t0, (PTRS_PER_PMD - 1)
> - slli.d t0, t0, 3
> - add.d t1, t1, t0
> + bstrpick.d ra, t0, PTRS_PER_PMD_BITS + PMD_SHIFT - 1, PMD_SHIFT
> + alsl.d t1, ra, t1, 3
> #endif
> ld.d ra, t1, 0
>
> @@ -410,27 +380,20 @@ vmalloc_done_modify:
> * instead contains the tlb pte. Check the PAGE_HUGE bit and
> * see if we need to jump to huge tlb processing.
> */
> - andi t0, ra, _PAGE_HUGE
> - bnez t0, tlb_huge_update_modify
> + rotri.d ra, ra, _PAGE_HUGE_SHIFT + 1
> + bltz ra, tlb_huge_update_modify
>
> - csrrd t0, LOONGARCH_CSR_BADV
> - srli.d t0, t0, PAGE_SHIFT
> - andi t0, t0, (PTRS_PER_PTE - 1)
> - slli.d t0, t0, _PTE_T_LOG2
> - add.d t1, ra, t0
> + rotri.d ra, ra, 64 - (_PAGE_HUGE_SHIFT + 1)
> + bstrpick.d t0, t0, PTRS_PER_PTE_BITS + PAGE_SHIFT - 1, PAGE_SHIFT
> + alsl.d t1, t0, ra, _PTE_T_LOG2
>
> #ifdef CONFIG_SMP
> smp_pgtable_change_modify:
> -#endif
> -#ifdef CONFIG_SMP
> ll.d t0, t1, 0
> #else
> ld.d t0, t1, 0
> #endif
> - tlbsrch
> -
> - srli.d ra, t0, _PAGE_WRITE_SHIFT
> - andi ra, ra, 1
> + andi ra, t0, _PAGE_WRITE
> beqz ra, nopage_tlb_modify
>
> ori t0, t0, (_PAGE_VALID | _PAGE_DIRTY | _PAGE_MODIFIED)
> @@ -440,8 +403,8 @@ smp_pgtable_change_modify:
> #else
> st.d t0, t1, 0
> #endif
> - ori t1, t1, 8
> - xori t1, t1, 8
> + tlbsrch
> + bstrins.d t1, zero, 3, 3
> ld.d t0, t1, 0
> ld.d t1, t1, 8
> csrwr t0, LOONGARCH_CSR_TLBELO0
> @@ -464,23 +427,19 @@ vmalloc_modify:
> */
> tlb_huge_update_modify:
> #ifdef CONFIG_SMP
> - ll.d t0, t1, 0
> -#else
> - ld.d t0, t1, 0
> + ll.d ra, t1, 0
> #endif
> -
> - srli.d ra, t0, _PAGE_WRITE_SHIFT
> - andi ra, ra, 1
> - beqz ra, nopage_tlb_modify
> -
> - tlbsrch
> - ori t0, t0, (_PAGE_VALID | _PAGE_DIRTY | _PAGE_MODIFIED)
> + andi t0, ra, _PAGE_WRITE
> + beqz t0, nopage_tlb_modify
>
> #ifdef CONFIG_SMP
> + ori t0, ra, (_PAGE_VALID | _PAGE_DIRTY | _PAGE_MODIFIED)
> sc.d t0, t1, 0
> beqz t0, tlb_huge_update_modify
> - ld.d t0, t1, 0
> + ori t0, ra, (_PAGE_VALID | _PAGE_DIRTY | _PAGE_MODIFIED)
> #else
> + rotri.d ra, ra, 64 - (_PAGE_HUGE_SHIFT + 1)
> + ori t0, ra, (_PAGE_VALID | _PAGE_DIRTY | _PAGE_MODIFIED)
> st.d t0, t1, 0
> #endif
> /*
> @@ -499,9 +458,8 @@ tlb_huge_update_modify:
> srli.d t1, t1, (_PAGE_HGLOBAL_SHIFT - _PAGE_GLOBAL_SHIFT)
> or t0, t0, t1
>
> - addi.d ra, t0, 0
> - csrwr t0, LOONGARCH_CSR_TLBELO0
> - addi.d t0, ra, 0
> + move ra, t0
> + csrwr ra, LOONGARCH_CSR_TLBELO0
>
> /* Convert to entrylo1 */
> addi.d t1, zero, 1
> @@ -520,6 +478,11 @@ tlb_huge_update_modify:
> addu16i.d t0, zero, (CSR_TLBIDX_PS >> 16)
> addu16i.d t1, zero, (PS_DEFAULT_SIZE << (CSR_TLBIDX_PS_SHIFT - 16))
> csrxchg t1, t0, LOONGARCH_CSR_TLBIDX
> +leave_huge_modify:
> + csrrd t0, EXCEPTION_KS0
> + csrrd t1, EXCEPTION_KS1
> + csrrd ra, EXCEPTION_KS2
> + ertn
>
> nopage_tlb_modify:
> dbar 0
> --
> 2.37.3
>