[RFC PATCH 1/3] x86/mm: fix LAM cr3 mask inconsistency during context switch

From: Yosry Ahmed
Date: Thu Mar 07 2024 - 08:42:54 EST


In switch_mm_irqs_off(), we read 'mm->context.lam_cr3_mask' into
'new_lam', which is later passed to load_new_mm_cr3(). However, there is
a call to set_tlbstate_lam_mode() in between that reads
'mm->context.lam_cr3_mask' again and sets 'cpu_tlbstate.lam'
accordingly. If we race with another thread updating
'mm->context.lam_cr3_mask', the value in 'cpu_tlbstate.lam' could end up
different from the value loaded into CR3.
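
For illustration, a simplified sketch of the racy interleaving (the
function names are the real ones, everything else is elided):

    /* CPU context switching */          /* thread enabling LAM */
    new_lam = mm_lam_cr3_mask(next);
                                         mm->context.lam_cr3_mask = X;
    set_tlbstate_lam_mode(next);         /* re-reads the mask, sees X */
    load_new_mm_cr3(..., new_lam, ...);  /* CR3 gets the stale pre-X mask */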

Fix the problem by making set_tlbstate_lam_mode() return the LAM mask
that it wrote to 'cpu_tlbstate.lam', and have switch_mm_irqs_off() use
that mask when writing CR3. Use READ_ONCE() to make sure we read the
mask once and use it consistently.
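
With the fix, both 'cpu_tlbstate.lam' and the CR3 value are derived from
a single snapshot of the mask (sketch mirroring the diff below; the
load_new_mm_cr3() call shown is the existing one in the need_flush
path):

    new_lam = set_tlbstate_lam_mode(next);  /* one READ_ONCE() of the mask */
    ...
    load_new_mm_cr3(next->pgd, new_asid, new_lam, true);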

No practical problems have been observed from this, but it is a recipe
for future problems (e.g. debug warnings in switch_mm_irqs_off() or
__get_current_cr3_fast() could fire). It is unlikely that this can cause
any real issues, since only a single-threaded process can update its own
LAM mask, so the race here can only happen during a context switch
involving a kthread using a borrowed mm. In that case, LAM is unlikely
to matter; if it does, we would need to make sure all CPUs using the mm
are updated before returning to userspace when LAM is enabled -- but we
don't do that.

While we are at it, remove the misleading comment stating that
'new_lam' may not match tlbstate_lam_cr3_mask() if a race occurs. This
can happen without a race: a different thread could simply have enabled
LAM since the last context switch on the current CPU. Replace it with a
hopefully clearer comment above the set_tlbstate_lam_mode() call.

Signed-off-by: Yosry Ahmed <yosryahmed@xxxxxxxxxx>
---
 arch/x86/include/asm/tlbflush.h | 11 +++++++----
 arch/x86/mm/tlb.c               | 17 ++++++++---------
 2 files changed, 15 insertions(+), 13 deletions(-)

diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 25726893c6f4d..a4ddb20f84fe7 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -399,11 +399,13 @@ static inline u64 tlbstate_lam_cr3_mask(void)
 	return lam << X86_CR3_LAM_U57_BIT;
 }
 
-static inline void set_tlbstate_lam_mode(struct mm_struct *mm)
+static inline unsigned long set_tlbstate_lam_mode(struct mm_struct *mm)
 {
-	this_cpu_write(cpu_tlbstate.lam,
-		       mm->context.lam_cr3_mask >> X86_CR3_LAM_U57_BIT);
+	unsigned long lam = READ_ONCE(mm->context.lam_cr3_mask);
+
+	this_cpu_write(cpu_tlbstate.lam, lam >> X86_CR3_LAM_U57_BIT);
 	this_cpu_write(tlbstate_untag_mask, mm->context.untag_mask);
+	return lam;
 }

#else
@@ -413,8 +415,9 @@ static inline u64 tlbstate_lam_cr3_mask(void)
 	return 0;
 }
 
-static inline void set_tlbstate_lam_mode(struct mm_struct *mm)
+static inline unsigned long set_tlbstate_lam_mode(struct mm_struct *mm)
 {
+	return 0;
 }
 #endif
 #endif /* !MODULE */
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 51f9f56941058..2975d3f89a5de 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -503,9 +503,9 @@ void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next,
 {
 	struct mm_struct *prev = this_cpu_read(cpu_tlbstate.loaded_mm);
 	u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
-	unsigned long new_lam = mm_lam_cr3_mask(next);
 	bool was_lazy = this_cpu_read(cpu_tlbstate_shared.is_lazy);
 	unsigned cpu = smp_processor_id();
+	unsigned long new_lam;
 	u64 next_tlb_gen;
 	bool need_flush;
 	u16 new_asid;
@@ -561,11 +561,6 @@ void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next,
 	VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) !=
 		   next->context.ctx_id);
 
-	/*
-	 * If this races with another thread that enables lam, 'new_lam'
-	 * might not match tlbstate_lam_cr3_mask().
-	 */
-
 	/*
 	 * Even in lazy TLB mode, the CPU should stay set in the
 	 * mm_cpumask. The TLB shootdown code can figure out from
@@ -633,7 +628,12 @@ void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next,
 		barrier();
 	}
 
-	set_tlbstate_lam_mode(next);
+	/*
+	 * Even if we are not actually switching mm's, another thread could have
+	 * updated mm->context.lam_cr3_mask. Make sure tlbstate_lam_cr3_mask()
+	 * and the loaded CR3 use the up-to-date mask.
+	 */
+	new_lam = set_tlbstate_lam_mode(next);
 	if (need_flush) {
 		this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
 		this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
@@ -705,7 +705,6 @@ void initialize_tlbstate_and_flush(void)

 	/* LAM expected to be disabled */
 	WARN_ON(cr3 & (X86_CR3_LAM_U48 | X86_CR3_LAM_U57));
-	WARN_ON(mm_lam_cr3_mask(mm));
 
 	/*
 	 * Assert that CR4.PCIDE is set if needed. (CR4.PCIDE initialization
@@ -724,7 +723,7 @@ void initialize_tlbstate_and_flush(void)
 	this_cpu_write(cpu_tlbstate.next_asid, 1);
 	this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, mm->context.ctx_id);
 	this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, tlb_gen);
-	set_tlbstate_lam_mode(mm);
+	WARN_ON(set_tlbstate_lam_mode(mm));
 
 	for (i = 1; i < TLB_NR_DYN_ASIDS; i++)
 		this_cpu_write(cpu_tlbstate.ctxs[i].ctx_id, 0);
--
2.44.0.278.ge034bb2e1d-goog