[RFC PATCH 47/47] mm: asi: Properly un/mapping task stack from ASI + tlb flush

From: Junaid Shahid
Date: Wed Feb 23 2022 - 00:28:57 EST


From: Ofir Weisse <oweisse@xxxxxxxxxx>

There are several locations where this is important. Especially since a
task_struct might be reused, potentially with a different mm.

1. Map in vcpu_run() @ arch/x86/kvm/x86.c
1. Unmap in release_task_stack() @ kernel/fork.c
2. Unmap in do_exit() @ kernel/exit.c
3. Unmap in begin_new_exec() @ fs/exec.c

Signed-off-by: Ofir Weisse <oweisse@xxxxxxxxxx>


---
arch/x86/include/asm/asi.h | 6 ++++
arch/x86/kvm/x86.c | 6 ++++
arch/x86/mm/asi.c | 59 ++++++++++++++++++++++++++++++++++++++
fs/exec.c | 7 ++++-
include/asm-generic/asi.h | 16 +++++++++--
include/linux/sched.h | 5 ++++
kernel/exit.c | 2 +-
kernel/fork.c | 22 +++++++++++++-
8 files changed, 118 insertions(+), 5 deletions(-)

diff --git a/arch/x86/include/asm/asi.h b/arch/x86/include/asm/asi.h
index 6148e65fb0c2..9d8f43981678 100644
--- a/arch/x86/include/asm/asi.h
+++ b/arch/x86/include/asm/asi.h
@@ -87,6 +87,12 @@ void asi_unmap_user(struct asi *asi, void *va, size_t len);
int asi_fill_pgtbl_pool(struct asi_pgtbl_pool *pool, uint count, gfp_t flags);
void asi_clear_pgtbl_pool(struct asi_pgtbl_pool *pool);

+int asi_map_task_stack(struct task_struct *tsk, struct asi *asi);
+void asi_unmap_task_stack(struct task_struct *tsk);
+void asi_mark_pages_local_nonsensitive(struct page *pages, uint order,
+ struct mm_struct *mm);
+void asi_clear_pages_local_nonsensitive(struct page *pages, uint order);
+
static inline void asi_init_pgtbl_pool(struct asi_pgtbl_pool *pool)
{
pool->pgtbl_list = NULL;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 294f73e9e71e..718104eefaed 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -10122,6 +10122,12 @@ static int vcpu_run(struct kvm_vcpu *vcpu)
vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
vcpu->arch.l1tf_flush_l1d = true;

+ /* We must have current->stack mapped into asi. This function can be
+ * safely called many times, as it will only do the actual mapping once. */
+ r = asi_map_task_stack(current, vcpu->kvm->asi);
+ if (r != 0)
+ return r;
+
for (;;) {
if (kvm_vcpu_running(vcpu)) {
r = vcpu_enter_guest(vcpu);
diff --git a/arch/x86/mm/asi.c b/arch/x86/mm/asi.c
index 7f2aa1823736..a86ac6644a57 100644
--- a/arch/x86/mm/asi.c
+++ b/arch/x86/mm/asi.c
@@ -1029,6 +1029,45 @@ void asi_unmap(struct asi *asi, void *addr, size_t len, bool flush_tlb)
asi_flush_tlb_range(asi, addr, len);
}

+int asi_map_task_stack(struct task_struct *tsk, struct asi *asi)
+{
+ int ret;
+
+ /* If the stack is already mapped to asi - don't need to map it again. */
+ if (tsk->asi_stack_mapped)
+ return 0;
+
+ if (!tsk->mm)
+ return -EINVAL;
+
+ /* If the stack was allocated via the page allocator, we assume the
+ * stack pages were marked with PageNonSensitive, therefore tsk->stack
+ * address is properly aliased. */
+ ret = asi_map(ASI_LOCAL_NONSENSITIVE, tsk->stack, THREAD_SIZE);
+ if (!ret) {
+ tsk->asi_stack_mapped = asi;
+ asi_sync_mapping(asi, tsk->stack, THREAD_SIZE);
+ }
+
+ return ret;
+}
+
+void asi_unmap_task_stack(struct task_struct *tsk)
+{
+ /* No need to unmap if the stack was not mapped to begin with. */
+ if (!tsk->asi_stack_mapped)
+ return;
+
+ if (!tsk->mm)
+ return;
+
+ asi_unmap(ASI_LOCAL_NONSENSITIVE, tsk->stack, THREAD_SIZE,
+ /* flush_tlb = */ true);
+
+ tsk->asi_stack_mapped = NULL;
+}
+
+
void *asi_va(unsigned long pa)
{
struct page *page = pfn_to_page(PHYS_PFN(pa));
@@ -1336,3 +1375,23 @@ void asi_unmap_user(struct asi *asi, void *addr, size_t len)
}
}
EXPORT_SYMBOL_GPL(asi_unmap_user);
+
+void asi_mark_pages_local_nonsensitive(struct page *pages, uint order,
+ struct mm_struct *mm)
+{
+ uint i;
+ for (i = 0; i < (1 << order); i++) {
+ __SetPageLocalNonSensitive(pages + i);
+ pages[i].asi_mm = mm;
+ }
+}
+
+void asi_clear_pages_local_nonsensitive(struct page *pages, uint order)
+{
+ uint i;
+ for (i = 0; i < (1 << order); i++) {
+ __ClearPageLocalNonSensitive(pages + i);
+ pages[i].asi_mm = NULL;
+ }
+}
+
diff --git a/fs/exec.c b/fs/exec.c
index 76f3b433e80d..fb9182cf3f33 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -69,6 +69,7 @@
#include <linux/uaccess.h>
#include <asm/mmu_context.h>
#include <asm/tlb.h>
+#include <asm/asi.h>

#include <trace/events/task.h>
#include "internal.h"
@@ -1238,7 +1239,11 @@ int begin_new_exec(struct linux_binprm * bprm)
struct task_struct *me = current;
int retval;

- /* TODO: (oweisse) unmap the stack from ASI */
+ /* The old mm is about to be released later on in exec_mmap. We are
+ * reusing the task, including its stack which was mapped to
+ * mm->asi_pgd[0]. We need to asi_unmap the stack, so the destructor of
+ * the mm won't complain on "lingering" asi mappings. */
+ asi_unmap_task_stack(current);

/* Once we are committed compute the creds */
retval = bprm_creds_from_file(bprm);
diff --git a/include/asm-generic/asi.h b/include/asm-generic/asi.h
index 2763cb1a974c..6e9a261a2b9d 100644
--- a/include/asm-generic/asi.h
+++ b/include/asm-generic/asi.h
@@ -66,8 +66,13 @@ static inline struct asi *asi_get_target(void) { return NULL; }

static inline struct asi *asi_get_current(void) { return NULL; }

-static inline
-int asi_map_gfp(struct asi *asi, void *addr, size_t len, gfp_t gfp_flags)
+static inline int asi_map_task_stack(struct task_struct *tsk, struct asi *asi)
+{ return 0; }
+
+static inline void asi_unmap_task_stack(struct task_struct *tsk) { }
+
+static inline int asi_map_gfp(struct asi *asi, void *addr, size_t len,
+ gfp_t gfp_flags)
{
return 0;
}
@@ -130,6 +135,13 @@ static inline int asi_load_module(struct module* module) {return 0;}

static inline void asi_unload_module(struct module* module) { }

+static inline
+void asi_mark_pages_local_nonsensitive(struct page *pages, uint order,
+ struct mm_struct *mm) { }
+
+static inline
+void asi_clear_pages_local_nonsensitive(struct page *pages, uint order) { }
+
#endif /* !_ASSEMBLY_ */

#endif /* !CONFIG_ADDRESS_SPACE_ISOLATION */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 78c351e35fec..87ad45e52b19 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -67,6 +67,7 @@ struct sighand_struct;
struct signal_struct;
struct task_delay_info;
struct task_group;
+struct asi;

/*
* Task state bitmask. NOTE! These bits are also
@@ -1470,6 +1471,10 @@ struct task_struct {
int mce_count;
#endif

+#ifdef CONFIG_ADDRESS_SPACE_ISOLATION
+ struct asi *asi_stack_mapped;
+#endif
+
#ifdef CONFIG_KRETPROBES
struct llist_head kretprobe_instances;
#endif
diff --git a/kernel/exit.c b/kernel/exit.c
index ab2749cf6887..f21cc21814d1 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -768,7 +768,7 @@ void __noreturn do_exit(long code)
profile_task_exit(tsk);
kcov_task_exit(tsk);

- /* TODO: (oweisse) unmap the stack from ASI */
+ asi_unmap_task_stack(tsk);

coredump_task_exit(tsk);
ptrace_event(PTRACE_EVENT_EXIT, code);
diff --git a/kernel/fork.c b/kernel/fork.c
index cb147a72372d..876fefc477cb 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -216,7 +216,6 @@ static int free_vm_stack_cache(unsigned int cpu)

static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
{
- /* TODO: (oweisse) Add annotation to map the stack into ASI */
#ifdef CONFIG_VMAP_STACK
void *stack;
int i;
@@ -269,7 +268,16 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
struct page *page = alloc_pages_node(node, THREADINFO_GFP,
THREAD_SIZE_ORDER);

+ /* When marking pages as PageLocalNonSesitive we set the page->mm to be
+ * NULL. We must make sure the flag is cleared from the stack pages
+ * before free_pages is called. Otherwise, page->mm will be accessed
+ * which will reuslt in NULL reference. page_address() below will yield
+ * an aliased address after ASI_LOCAL_MAP, thanks to
+ * PageLocalNonSesitive flag. */
if (likely(page)) {
+ asi_mark_pages_local_nonsensitive(page,
+ THREAD_SIZE_ORDER,
+ NULL);
tsk->stack = kasan_reset_tag(page_address(page));
return tsk->stack;
}
@@ -301,6 +309,14 @@ static inline void free_thread_stack(struct task_struct *tsk)
}
#endif

+ /* We must clear the PageNonSensitive flag before calling free_pages().
+ * Otherwise page->mm (which is NULL) will be accessed, in order to
+ * unmap the pages from ASI. Specifically for the stack, we assume the
+ * pages were already unmapped from ASI before we got here, via
+ * asi_unmap_task_stack(). */
+ asi_clear_pages_local_nonsensitive(virt_to_page(tsk->stack),
+ THREAD_SIZE_ORDER);
+
__free_pages(virt_to_page(tsk->stack), THREAD_SIZE_ORDER);
}
# else
@@ -436,6 +452,7 @@ static void release_task_stack(struct task_struct *tsk)
if (WARN_ON(READ_ONCE(tsk->__state) != TASK_DEAD))
return; /* Better to leak the stack than to free prematurely */

+ asi_unmap_task_stack(tsk);
account_kernel_stack(tsk, -1);
free_thread_stack(tsk);
tsk->stack = NULL;
@@ -916,6 +933,9 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
* functions again.
*/
tsk->stack = stack;
+#ifdef CONFIG_ADDRESS_SPACE_ISOLATION
+ tsk->asi_stack_mapped = NULL;
+#endif
#ifdef CONFIG_VMAP_STACK
tsk->stack_vm_area = stack_vm_area;
#endif
--
2.35.1.473.g83b2b277ed-goog