Re: [PATCHv4 4/8] x86/mm: Handle LAM on context switch

From: Alexander Potapenko
Date: Thu Jun 30 2022 - 04:37:05 EST


On Wed, Jun 22, 2022 at 6:22 PM Kirill A. Shutemov
<kirill.shutemov@xxxxxxxxxxxxxxx> wrote:
>
> Linear Address Masking mode for userspace pointers is encoded in CR3 bits.
> The mode is selected per-thread. Add new thread features to indicate that the
> thread has Linear Address Masking enabled.
>
> switch_mm_irqs_off() now respects these flags and constructs CR3
> accordingly.
>
> The active LAM mode gets recorded in the tlb_state.
>
> Signed-off-by: Kirill A. Shutemov <kirill.shutemov@xxxxxxxxxxxxxxx>
> ---
> arch/x86/include/asm/mmu.h | 1 +
> arch/x86/include/asm/mmu_context.h | 24 +++++++++++
> arch/x86/include/asm/tlbflush.h | 4 ++
> arch/x86/mm/tlb.c | 68 +++++++++++++++++++++++-------
> 4 files changed, 82 insertions(+), 15 deletions(-)
>
> diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h
> index 5d7494631ea9..d150e92163b6 100644
> --- a/arch/x86/include/asm/mmu.h
> +++ b/arch/x86/include/asm/mmu.h
> @@ -40,6 +40,7 @@ typedef struct {
>
> #ifdef CONFIG_X86_64
> unsigned short flags;
> + u64 lam_cr3_mask;
> #endif

Can you please add a comment for this field?

>
> struct mutex lock;
> diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
> index b8d40ddeab00..e6eac047c728 100644
> --- a/arch/x86/include/asm/mmu_context.h
> +++ b/arch/x86/include/asm/mmu_context.h
> @@ -91,6 +91,29 @@ static inline void switch_ldt(struct mm_struct *prev, struct mm_struct *next)
> }
> #endif
>
> +#ifdef CONFIG_X86_64
> +static inline u64 mm_cr3_lam_mask(struct mm_struct *mm)
> +{
> + return mm->context.lam_cr3_mask;
> +}

For the sake of uniformity, can it be either lam_cr3_mask or
cr3_lam_mask everywhere?

>
> diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
> index 4af5579c7ef7..2d70d75e207f 100644
> --- a/arch/x86/include/asm/tlbflush.h
> +++ b/arch/x86/include/asm/tlbflush.h
> @@ -100,6 +100,10 @@ struct tlb_state {
> */
> bool invalidate_other;
>
> +#ifdef CONFIG_X86_64
> + u8 lam;
> +#endif
> +

Comment here as well, please.

> /*
> * Mask that contains TLB_NR_DYN_ASIDS+1 bits to indicate
> * the corresponding user PCID needs a flush next time we
> diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
> index d400b6d9d246..c5c4f76329c2 100644
> --- a/arch/x86/mm/tlb.c
> +++ b/arch/x86/mm/tlb.c
> @@ -154,17 +154,17 @@ static inline u16 user_pcid(u16 asid)
> return ret;
> }
>
> -static inline unsigned long build_cr3(pgd_t *pgd, u16 asid)
> +static inline unsigned long build_cr3(pgd_t *pgd, u16 asid, u64 lam)

I think it's more natural to make `lam` an unsigned long, because cr3
is treated as unsigned long everywhere.

> {
> if (static_cpu_has(X86_FEATURE_PCID)) {
> - return __sme_pa(pgd) | kern_pcid(asid);
> + return __sme_pa(pgd) | kern_pcid(asid) | lam;
> } else {
> VM_WARN_ON_ONCE(asid != 0);
> - return __sme_pa(pgd);
> + return __sme_pa(pgd) | lam;
> }
> }
>
> -static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid)
> +static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid, u64 lam)
ditto

> {
> VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE);
> /*
> @@ -173,7 +173,7 @@ static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid)
> * boot because all CPU's the have same capabilities:
> */
> VM_WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_PCID));
> - return __sme_pa(pgd) | kern_pcid(asid) | CR3_NOFLUSH;
> + return __sme_pa(pgd) | kern_pcid(asid) | lam | CR3_NOFLUSH;
> }
>
> /*
> @@ -274,15 +274,15 @@ static inline void invalidate_user_asid(u16 asid)
> (unsigned long *)this_cpu_ptr(&cpu_tlbstate.user_pcid_flush_mask));
> }
>
> -static void load_new_mm_cr3(pgd_t *pgdir, u16 new_asid, bool need_flush)
> +static void load_new_mm_cr3(pgd_t *pgdir, u16 new_asid, u64 lam, bool need_flush)
> {
> unsigned long new_mm_cr3;
>
> if (need_flush) {
> invalidate_user_asid(new_asid);
> - new_mm_cr3 = build_cr3(pgdir, new_asid);
> + new_mm_cr3 = build_cr3(pgdir, new_asid, lam);
> } else {
> - new_mm_cr3 = build_cr3_noflush(pgdir, new_asid);
> + new_mm_cr3 = build_cr3_noflush(pgdir, new_asid, lam);
> }
>
> /*
> @@ -486,11 +486,38 @@ void cr4_update_pce(void *ignored)
> static inline void cr4_update_pce_mm(struct mm_struct *mm) { }
> #endif
>
> +#ifdef CONFIG_X86_64
> +static inline u64 tlbstate_lam_cr3_mask(void)
Please add comments for these methods.
> +{
> + u64 lam = this_cpu_read(cpu_tlbstate.lam);
> +
> + return lam << X86_CR3_LAM_U57_BIT;
> +}
> +
> +static inline void set_tlbstate_lam_cr3_mask(u64 mask)
> +{
> + this_cpu_write(cpu_tlbstate.lam, mask >> X86_CR3_LAM_U57_BIT);
> +}
> +
> +#else
> +
> +static inline u64 tlbstate_lam_cr3_mask(void)
> +{
> + return 0;
> +}
> +
> +static inline void set_tlbstate_lam_cr3_mask(u64 mask)
> +{
> +}
> +#endif
> +
> void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
> struct task_struct *tsk)
> {
> struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm);
> u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
> + u64 prev_lam = tlbstate_lam_cr3_mask();
> + u64 new_lam = mm_cr3_lam_mask(next);

Ditto.

> bool was_lazy = this_cpu_read(cpu_tlbstate_shared.is_lazy);
> unsigned cpu = smp_processor_id();
> u64 next_tlb_gen;
> @@ -504,6 +531,9 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
> * cpu_tlbstate.loaded_mm) matches next.
> *
> * NB: leave_mm() calls us with prev == NULL and tsk == NULL.
> + *
> + * NB: Initial LAM enabling calls us with prev == next. We must update
> + * CR3 if prev_lam doesn't match the new one.
> */
>
> /* We don't want flush_tlb_func() to run concurrently with us. */
> @@ -520,7 +550,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
> * isn't free.
> */
> #ifdef CONFIG_DEBUG_VM
> - if (WARN_ON_ONCE(__read_cr3() != build_cr3(real_prev->pgd, prev_asid))) {
> + if (WARN_ON_ONCE(__read_cr3() != build_cr3(real_prev->pgd, prev_asid, prev_lam))) {
> /*
> * If we were to BUG here, we'd be very likely to kill
> * the system so hard that we don't see the call trace.
> @@ -551,7 +581,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
> * provides that full memory barrier and core serializing
> * instruction.
> */
> - if (real_prev == next) {
> + if (real_prev == next && prev_lam == new_lam) {

Do we want the warning checks below to only happen if prev_lam==new_lam?

> VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) !=
> next->context.ctx_id);
>
> @@ -622,15 +652,16 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
> barrier();
> }
>
> + set_tlbstate_lam_cr3_mask(new_lam);
> if (need_flush) {
> this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
> this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
> - load_new_mm_cr3(next->pgd, new_asid, true);
> + load_new_mm_cr3(next->pgd, new_asid, new_lam, true);
>
> trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
> } else {
> /* The new ASID is already up to date. */
> - load_new_mm_cr3(next->pgd, new_asid, false);
> + load_new_mm_cr3(next->pgd, new_asid, new_lam, false);
>
> trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, 0);
> }
> @@ -691,6 +722,10 @@ void initialize_tlbstate_and_flush(void)
> /* Assert that CR3 already references the right mm. */
> WARN_ON((cr3 & CR3_ADDR_MASK) != __pa(mm->pgd));
>
> + /* LAM expected to be disabled in CR3 and init_mm */
> + WARN_ON(cr3 & (X86_CR3_LAM_U48 | X86_CR3_LAM_U57));
> + WARN_ON(mm_cr3_lam_mask(&init_mm));
> +
> /*
> * Assert that CR4.PCIDE is set if needed. (CR4.PCIDE initialization
> * doesn't work like other CR4 bits because it can only be set from
> @@ -700,7 +735,7 @@ void initialize_tlbstate_and_flush(void)
> !(cr4_read_shadow() & X86_CR4_PCIDE));
>
> /* Force ASID 0 and force a TLB flush. */
Please update this comment to also mention that LAM is forced off here (lam == 0).

> - write_cr3(build_cr3(mm->pgd, 0));
> + write_cr3(build_cr3(mm->pgd, 0, 0));
>
> /* Reinitialize tlbstate. */
> this_cpu_write(cpu_tlbstate.last_user_mm_spec, LAST_USER_MM_INIT);
> @@ -708,6 +743,7 @@ void initialize_tlbstate_and_flush(void)
> this_cpu_write(cpu_tlbstate.next_asid, 1);
> this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, mm->context.ctx_id);
> this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, tlb_gen);
> + set_tlbstate_lam_cr3_mask(0);
>
> for (i = 1; i < TLB_NR_DYN_ASIDS; i++)
> this_cpu_write(cpu_tlbstate.ctxs[i].ctx_id, 0);
> @@ -1047,8 +1083,10 @@ void flush_tlb_kernel_range(unsigned long start, unsigned long end)
> */
> unsigned long __get_current_cr3_fast(void)
> {
> - unsigned long cr3 = build_cr3(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd,
> - this_cpu_read(cpu_tlbstate.loaded_mm_asid));
> + unsigned long cr3 =
> + build_cr3(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd,
> + this_cpu_read(cpu_tlbstate.loaded_mm_asid),
> + tlbstate_lam_cr3_mask());
>
> /* For now, be very restrictive about when this can be called. */
> VM_WARN_ON(in_nmi() || preemptible());
> --
> 2.35.1
>


--
Alexander Potapenko
Software Engineer

Google Germany GmbH
Erika-Mann-Straße, 33
80636 München

Geschäftsführer: Paul Manicle, Liana Sebastian
Registergericht und -nummer: Hamburg, HRB 86891
Sitz der Gesellschaft: Hamburg

Diese E-Mail ist vertraulich. Falls Sie diese fälschlicherweise
erhalten haben sollten, leiten Sie diese bitte nicht an jemand anderes
weiter, löschen Sie alle Kopien und Anhänge davon und lassen Sie mich
bitte wissen, dass die E-Mail an die falsche Person gesendet wurde.


This e-mail is confidential. If you received this communication by
mistake, please don't forward it to anyone else, please erase all
copies and attachments, and please let me know that it has gone to the
wrong person.