[RFC PATCH v2 19/20] context_tracking,x86: Add infrastructure to defer kernel TLBI

From: Valentin Schneider
Date: Thu Jul 20 2023 - 12:38:57 EST


Kernel TLB invalidation IPIs are a common source of interference on
NOHZ_FULL CPUs. Since NOHZ_FULL CPUs executing in userspace do not access
any kernel addresses, these invalidations do not need to happen
immediately and can be deferred until the next user->kernel transition.
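
As a sketch of the intended flow (illustration only, not code added by this
patch: the helper name below is made up and the fallback is simplified), the
sender side ends up doing something like the following, using the
ct_set_cpu_work() helper introduced earlier in the series:

	/*
	 * Hypothetical helper: try to defer the flush to @cpu's next
	 * user->kernel transition; if @cpu is currently executing in
	 * the kernel, fall back to the usual IPI.
	 */
	static void flush_tlb_kernel_defer_one(int cpu)
	{
		if (ct_set_cpu_work(cpu, CONTEXT_WORK_TLBI))
			return;

		smp_call_function_single(cpu, do_flush_tlb_all, NULL, 1);
	}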

Rather than make __flush_tlb_all() noinstr, add a minimal noinstr
variant that doesn't try to leverage INVPCID.
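
With PGE available, the noinstr variant relies on __native_tlb_flush_global(),
which at its core is just a pair of CR4 writes (roughly):

	/* Toggling CR4.PGE flushes all TLB entries, global ones included. */
	native_write_cr4(cr4 ^ X86_CR4_PGE);
	/* Restore the original CR4 value (this write flushes again). */
	native_write_cr4(cr4);

Without PGE, flush_tlb_local() falls back to a plain CR3 rewrite, which
flushes the non-global entries.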

FIXME: not fully noinstr compliant
XXX: same issue as with instruction patching: when do we access data that
should be invalidated?

Signed-off-by: Valentin Schneider <vschneid@xxxxxxxxxx>
---
 arch/x86/include/asm/context_tracking_work.h |  4 ++++
 arch/x86/include/asm/tlbflush.h              |  1 +
 arch/x86/mm/tlb.c                            | 17 +++++++++++++++++
 include/linux/context_tracking_state.h       |  4 ++++
 include/linux/context_tracking_work.h        |  2 ++
 5 files changed, 28 insertions(+)

diff --git a/arch/x86/include/asm/context_tracking_work.h b/arch/x86/include/asm/context_tracking_work.h
index 2c66687ce00e2..9d4f021b5a45b 100644
--- a/arch/x86/include/asm/context_tracking_work.h
+++ b/arch/x86/include/asm/context_tracking_work.h
@@ -3,6 +3,7 @@
 #define _ASM_X86_CONTEXT_TRACKING_WORK_H
 
 #include <asm/sync_core.h>
+#include <asm/tlbflush.h>
 
 static __always_inline void arch_context_tracking_work(int work)
 {
@@ -10,6 +11,9 @@ static __always_inline void arch_context_tracking_work(int work)
 	case CONTEXT_WORK_SYNC:
 		sync_core();
 		break;
+	case CONTEXT_WORK_TLBI:
+		__flush_tlb_all_noinstr();
+		break;
 	}
 }
 

diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 80450e1d5385a..323b971987af7 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -17,6 +17,7 @@
 DECLARE_PER_CPU(u64, tlbstate_untag_mask);
 
 void __flush_tlb_all(void);
+void noinstr __flush_tlb_all_noinstr(void);
 
 #define TLB_FLUSH_ALL -1UL
 #define TLB_GENERATION_INVALID 0
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 267acf27480af..631df9189ded4 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -1237,6 +1237,23 @@ void __flush_tlb_all(void)
 }
 EXPORT_SYMBOL_GPL(__flush_tlb_all);
 
+void noinstr __flush_tlb_all_noinstr(void)
+{
+	/*
+	 * This is for invocation in early entry code that cannot be
+	 * instrumented. An RMW to CR4 works for most cases, but relies on
+	 * being able to flip either of the PGE or PCIDE bits. Flipping
+	 * CR4.PCIDE would require also resetting CR3.PCID, so just try with
+	 * CR4.PGE, else do the CR3 write.
+	 *
+	 * TODO: paravirt
+	 */
+	if (cpu_feature_enabled(X86_FEATURE_PGE))
+		__native_tlb_flush_global(this_cpu_read(cpu_tlbstate.cr4));
+	else
+		flush_tlb_local();
+}
+
 void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
 {
 	struct flush_tlb_info *info;
diff --git a/include/linux/context_tracking_state.h b/include/linux/context_tracking_state.h
index 292a0b7c06948..3571c62cbb9cd 100644
--- a/include/linux/context_tracking_state.h
+++ b/include/linux/context_tracking_state.h
@@ -62,6 +62,10 @@ enum ctx_state {
 #define RCU_DYNTICKS_END (CT_STATE_SIZE - 1)
 #define RCU_DYNTICKS_IDX BIT(RCU_DYNTICKS_START)
 
+/*
+ * When CONFIG_CONTEXT_TRACKING_WORK=n, _END is 1 behind _START, which makes
+ * the CONTEXT_WORK size computation below evaluate to 0, which is what we want!
+ */
 #define CONTEXT_WORK_START (CONTEXT_STATE_END + 1)
 #define CONTEXT_WORK_END (RCU_DYNTICKS_START - 1)
 
diff --git a/include/linux/context_tracking_work.h b/include/linux/context_tracking_work.h
index 13fc97b395030..47d5ced39a43a 100644
--- a/include/linux/context_tracking_work.h
+++ b/include/linux/context_tracking_work.h
@@ -6,11 +6,13 @@

 enum {
 	CONTEXT_WORK_SYNC_OFFSET,
+	CONTEXT_WORK_TLBI_OFFSET,
 	CONTEXT_WORK_MAX_OFFSET
 };
 
 enum ct_work {
 	CONTEXT_WORK_SYNC = BIT(CONTEXT_WORK_SYNC_OFFSET),
+	CONTEXT_WORK_TLBI = BIT(CONTEXT_WORK_TLBI_OFFSET),
 	CONTEXT_WORK_MAX = BIT(CONTEXT_WORK_MAX_OFFSET)
 };
 

--
2.31.1