[PATCH v1 8/8] sbm: x86: lazy TLB flushing

From: Petr Tesarik
Date: Wed Feb 14 2024 - 06:38:35 EST


From: Petr Tesarik <petr.tesarik1@xxxxxxxxxxxxxxxxxxx>

Implement lazy TLB flushing in sandbox mode and keep CR4.PGE enabled.

For the transition from sandbox mode to kernel mode:

1. All user page translations (sandbox code and data) are flushed from the
TLB, because their page protection bits do not include _PAGE_GLOBAL.

2. Any kernel page translations remain valid after the transition. The SBM
state page is an exception, because the sandbox page tables map a different
physical page at its address; map it without _PAGE_GLOBAL.

For the transition from kernel mode to sandbox mode:

1. Kernel page translations become stale. However, they were cached from
kernel mappings, which do not permit user access, so any access by code
running in sandbox mode (with CPL 3) causes a protection violation.
Handle the spurious page faults from such accesses, lazily replacing
entries in the TLB.

2. If the TLB contains any user page translations before the switch to
sandbox mode, they are flushed, because their page protection bits do
not include _PAGE_GLOBAL. This ensures that sandbox mode cannot access
user mode pages.

Note that the TLB may keep kernel page translations for addresses which are
never accessed by sandbox mode. They remain valid after returning to kernel
mode.

Signed-off-by: Petr Tesarik <petr.tesarik1@xxxxxxxxxxxxxxxxxxx>
---
arch/x86/entry/entry_64.S | 17 +-----
arch/x86/kernel/sbm/call_64.S | 5 +-
arch/x86/kernel/sbm/core.c | 100 +++++++++++++++++++++++++++++++++-
3 files changed, 102 insertions(+), 20 deletions(-)

diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index e1364115408a..4ba3eea38102 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -632,10 +632,8 @@ SYM_INNER_LABEL(restore_regs_and_return_to_kernel, SYM_L_GLOBAL)
movq PER_CPU_VAR(pcpu_hot + X86_current_task), %rcx
movq TASK_sbm_state(%rcx), %rcx
movq SBM_sbm_cr3(%rcx), %rcx
- movq %cr4, %rax
- andb $~X86_CR4_PGE, %al
- movq %rax, %cr4
movq %rcx, %cr3
+ invlpg x86_sbm_state
orb $3, CS(%rsp)
#endif

@@ -897,9 +895,6 @@ SYM_CODE_START(paranoid_entry)

movq %cr3, %r14
andb $~3, CS+8(%rsp)
- movq %cr4, %rax
- orb $X86_CR4_PGE, %al
- movq %rax, %cr4
movq %rcx, %cr3
jmp .Lparanoid_gsbase
#endif
@@ -1073,9 +1068,6 @@ SYM_CODE_START(error_entry)
jrcxz .Lerror_swapgs

andb $~3, CS+8(%rsp)
- movq %cr4, %rax
- orb $X86_CR4_PGE, %al
- movq %rax, %cr4
movq %rcx, %cr3
jmp .Lerror_entry_done_lfence
#endif
@@ -1281,9 +1273,6 @@ SYM_CODE_START(asm_exc_nmi)
* stack. The code is similar to NMI from user mode.
*/
andb $~3, CS-RIP+8(%rsp)
- movq %cr4, %rdx
- orb $X86_CR4_PGE, %dl
- movq %rdx, %cr4
movq x86_sbm_state + SBM_kernel_cr3, %rdx
movq %rdx, %cr3

@@ -1533,10 +1522,8 @@ end_repeat_nmi:
movq TASK_sbm_state(%rcx), %rcx
jrcxz nmi_no_sbm

- movq %cr4, %rax
- andb $~X86_CR4_PGE, %al
- movq %rax, %cr4
movq %r14, %cr3
+ invlpg x86_sbm_state
#endif

nmi_no_sbm:
diff --git a/arch/x86/kernel/sbm/call_64.S b/arch/x86/kernel/sbm/call_64.S
index 8b2b524c5b46..21edce5666bc 100644
--- a/arch/x86/kernel/sbm/call_64.S
+++ b/arch/x86/kernel/sbm/call_64.S
@@ -10,7 +10,6 @@
#include <linux/linkage.h>
#include <asm/nospec-branch.h>
#include <asm/percpu.h>
-#include <asm/processor-flags.h>
#include <asm/segment.h>

.code64
@@ -75,12 +74,10 @@ SYM_FUNC_START(x86_sbm_exec)
* The NMI handler takes extra care to restore CR3 and CR4.
*/
mov SBM_sbm_cr3(%rdi), %r11
- mov %cr4, %rax
- and $~X86_CR4_PGE, %al
mov %rdx, %rdi /* args */
cli
- mov %rax, %cr4
mov %r11, %cr3
+ invlpg x86_sbm_state
iretq

SYM_INNER_LABEL(x86_sbm_return, SYM_L_GLOBAL)
diff --git a/arch/x86/kernel/sbm/core.c b/arch/x86/kernel/sbm/core.c
index 0ea193550a83..296f1fde3c22 100644
--- a/arch/x86/kernel/sbm/core.c
+++ b/arch/x86/kernel/sbm/core.c
@@ -33,6 +33,11 @@ union {
char page[PAGE_SIZE];
} x86_sbm_state __page_aligned_bss;

+/*
+ * Return @prot with _PAGE_GLOBAL cleared; all other protection bits are
+ * passed through unchanged.
+ */
+static inline pgprot_t pgprot_nonglobal(pgprot_t prot)
+{
+ /* @prot is a by-value copy, so it can be masked in place. */
+ pgprot_val(prot) &= ~_PAGE_GLOBAL;
+
+ return prot;
+}
+
static inline phys_addr_t page_to_ptval(struct page *page)
{
return PFN_PHYS(page_to_pfn(page)) | _PAGE_TABLE;
@@ -287,7 +292,7 @@ int arch_sbm_init(struct sbm *sbm)

BUILD_BUG_ON(sizeof(x86_sbm_state) != PAGE_SIZE);
err = map_page(state, (unsigned long)&x86_sbm_state,
- PHYS_PFN(__pa(state)), PAGE_KERNEL);
+ PHYS_PFN(__pa(state)), pgprot_nonglobal(PAGE_KERNEL));
if (err < 0)
return err;

@@ -379,11 +384,104 @@ int arch_sbm_exec(struct sbm *sbm, sbm_func func, void *args)
return err;
}

+/*
+ * Check whether the page table entry @pte permits the user-mode access
+ * described by the page fault @error_code.
+ *
+ * Returns true if the entry allows the access, in which case the fault can
+ * only have been caused by a stale TLB entry. Returns false if the entry
+ * itself forbids the access: any access to an entry without _PAGE_USER, a
+ * write to a non-writable entry, or an instruction fetch from a
+ * non-executable entry.
+ */
+static bool spurious_sbm_fault_check(unsigned long error_code, pte_t *pte)
+{
+ /*
+ * Sandbox mode runs with CPL 3, so a fault on a supervisor-only entry
+ * (e.g. a page mapped with PAGE_KERNEL protection) is genuine. Treating
+ * it as spurious would restart the faulting instruction forever.
+ */
+ if (!(pte_flags(*pte) & _PAGE_USER))
+ return false;
+
+ if ((error_code & X86_PF_WRITE) && !pte_write(*pte))
+ return false;
+
+ if ((error_code & X86_PF_INSTR) && !pte_exec(*pte))
+ return false;
+
+ return true;
+}
+
+/*
+ * Handle a spurious fault caused by a stale TLB entry.
+ *
+ * Kernel page translations which were cached before the switch to sandbox
+ * mode are not flushed eagerly. An access from sandbox mode (CPL 3) that
+ * hits such a stale entry causes a protection violation. This function
+ * walks the sandbox page tables rooted at @state->sbm_cr3 and checks the
+ * faulting access against the entry which maps @address.
+ *
+ * This allows us to lazily refresh the TLB when increasing the
+ * permissions of a kernel page (RO -> RW or NX -> X). Doing it
+ * eagerly is very expensive since that implies doing a full
+ * cross-processor TLB flush, even if no stale TLB entries exist
+ * on other processors.
+ *
+ * Spurious faults may only occur if the TLB contains an entry with
+ * fewer permissions than the page table entry. Non-present (P = 0)
+ * and reserved bit (R = 1) faults are never spurious.
+ *
+ * Only faults whose error code, apart from the X86_PF_WRITE and
+ * X86_PF_INSTR bits, is exactly X86_PF_USER | X86_PF_PROT are considered.
+ * A non-present entry at any level of the walk means that the fault is
+ * not spurious. A large entry at the P4D, PUD or PMD level is checked as
+ * the final translation for @address.
+ *
+ * There are no security implications to leaving a stale TLB when
+ * increasing the permissions on a page.
+ *
+ * @state: sandbox state; its sbm_cr3 field locates the page table root
+ * @error_code: hardware page fault error code (X86_PF_* bits)
+ * @address: faulting linear address
+ *
+ * Returns true if a spurious fault was handled, false otherwise.
+ *
+ * See Intel Developer's Manual Vol 3 Section 4.10.4.3, bullet 3
+ * (Optional Invalidation).
+ */
+static bool
+spurious_sbm_fault(struct x86_sbm_state *state, unsigned long error_code,
+ unsigned long address)
+{
+ pgd_t *pgd;
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+ bool ret;
+
+ if ((error_code & ~(X86_PF_WRITE | X86_PF_INSTR)) !=
+ (X86_PF_USER | X86_PF_PROT))
+ return false;
+
+ pgd = __va(state->sbm_cr3 & CR3_ADDR_MASK) + pgd_index(address);
+ if (!pgd_present(*pgd))
+ return false;
+
+ p4d = p4d_offset(pgd, address);
+ if (!p4d_present(*p4d))
+ return false;
+
+ if (p4d_large(*p4d))
+ return spurious_sbm_fault_check(error_code, (pte_t *)p4d);
+
+ pud = pud_offset(p4d, address);
+ if (!pud_present(*pud))
+ return false;
+
+ if (pud_large(*pud))
+ return spurious_sbm_fault_check(error_code, (pte_t *)pud);
+
+ pmd = pmd_offset(pud, address);
+ if (!pmd_present(*pmd))
+ return false;
+
+ if (pmd_large(*pmd))
+ return spurious_sbm_fault_check(error_code, (pte_t *)pmd);
+
+ pte = pte_offset_kernel(pmd, address);
+ if (!pte_present(*pte))
+ return false;
+
+ ret = spurious_sbm_fault_check(error_code, pte);
+ if (!ret)
+ return false;
+
+ /*
+ * Make sure we have permissions in PMD.
+ * If not, then there's a bug in the page tables:
+ */
+ ret = spurious_sbm_fault_check(error_code, (pte_t *)pmd);
+ WARN_ONCE(!ret, "PMD has incorrect permission bits\n");
+
+ return ret;
+}
+
void handle_sbm_fault(struct pt_regs *regs, unsigned long error_code,
unsigned long address)
{
struct x86_sbm_state *state = current_thread_info()->sbm_state;

+ if (spurious_sbm_fault(state, error_code, address))
+ return;
+
/*
* Force -EFAULT unless the fault was due to a user-mode instruction
* fetch from the designated return address.
--
2.34.1