Re: [PATCH 25/43] x86/mm/kaiser: Unmap kernel from userspace page tables (core patch)

From: Ingo Molnar
Date: Mon Nov 27 2017 - 05:38:08 EST



* Borislav Petkov <bp@xxxxxxxxx> wrote:

> On Fri, Nov 24, 2017 at 06:23:53PM +0100, Ingo Molnar wrote:
> > diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
> > index e1650da01323..d087c3aa0514 100644
> > --- a/arch/x86/entry/calling.h
> > +++ b/arch/x86/entry/calling.h
> > @@ -2,6 +2,7 @@
> > #include <linux/jump_label.h>
> > #include <asm/unwind_hints.h>
> > #include <asm/cpufeatures.h>
> > +#include <asm/page_types.h>
> >
> > /*
> >
> > diff --git a/arch/x86/include/asm/kaiser.h b/arch/x86/include/asm/kaiser.h
> > new file mode 100644
> > index 000000000000..3c2cc71b4058
> > --- /dev/null
> > +++ b/arch/x86/include/asm/kaiser.h
> > @@ -0,0 +1,57 @@
> > +#ifndef _ASM_X86_KAISER_H
> > +#define _ASM_X86_KAISER_H
> > +/*
> > + * Copyright(c) 2017 Intel Corporation. All rights reserved.
> > + *
> > + * This program is free software; you can redistribute it and/or modify
> > + * it under the terms of version 2 of the GNU General Public License as
> > + * published by the Free Software Foundation.
> > + *
> > + * This program is distributed in the hope that it will be useful, but
> > + * WITHOUT ANY WARRANTY; without even the implied warranty of
> > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> > + * General Public License for more details.
> > + *
> > + * Based on work published here: https://github.com/IAIK/KAISER
> > + * Modified by Dave Hansen <dave.hansen@xxxxxxxxx> to actually work.
> > + */
> > +#ifndef __ASSEMBLY__
> > +
> > +#ifdef CONFIG_KAISER
> > +/**
> > + * kaiser_add_mapping - map a kernel range into the user page tables
> > + * @addr: the start address of the range
> > + * @size: the size of the range
> > + * @flags: The mapping flags of the pages
> > + *
> > + * Use this on all data and code that need to be mapped into both
> > + * copies of the page tables. This includes the code that switches
> > + * to/from userspace and all of the hardware structures that are
> > + * virtually-addressed and needed in userspace like the interrupt
> > + * table.
> > + */
> > +extern int kaiser_add_mapping(unsigned long addr, unsigned long size,
> > +			      unsigned long flags);
> > +
> > +/**
> > + * kaiser_remove_mapping - remove a kernel mapping from the user page tables
> > + * @start: the start address of the range
> > + * @size: the size of the range
> > + */
> > +extern void kaiser_remove_mapping(unsigned long start, unsigned long size);
> > +
> > +/**
> > + * kaiser_init - Initialize the shadow mapping
> > + *
> > + * Most parts of the shadow mapping can be mapped at boot
> > + * time. Only per-process things like the thread stacks
> > + * or a new LDT have to be mapped at runtime. These boot-
> > + * time mappings are permanent and never unmapped.
> > + */
> > +extern void kaiser_init(void);
>
> Those externs are not needed.

Well, we typically use them for API definitions, even though technically they are
superfluous.
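
(For illustration - functions in C have external linkage by default, so on a
prototype the extern keyword changes nothing; these two declarations are
interchangeable:

   extern int kaiser_add_mapping(unsigned long addr, unsigned long size,
				 unsigned long flags);

   int kaiser_add_mapping(unsigned long addr, unsigned long size,
			  unsigned long flags);

The extern variant merely makes the "defined elsewhere, this is the API"
intent visible at a glance.)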

> > + * This generates better code than the inline assembly in
> > + * __set_bit().
> > + */
> > +static inline void *ptr_set_bit(void *ptr, int bit)
> > +{
> > +	unsigned long __ptr = (unsigned long)ptr;
> > +
> > +	__ptr |= (1<<bit);
>
> __ptr |= BIT(bit);
>
> Ditto below.

Fixed.
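
(For reference, BIT() - from <linux/bitops.h> - shifts an unsigned long:

   #define BIT(nr)	(1UL << (nr))

Besides reading better, that avoids the undefined behaviour (1<<bit) would
have once 'bit' reaches the bit width of int. Not a concern for page-table
bits, but a good pattern regardless.)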

> > +static inline bool pgdp_maps_userspace(void *__ptr)
> > +{
> > +	unsigned long ptr = (unsigned long)__ptr;
> > +
> > +	return (ptr & ~PAGE_MASK) < (PAGE_SIZE / 2);
>
> This generates
>
> return (ptr & ~(~(((1UL) << 12)-1))) < (((1UL) << 12) / 2);
>
> and if you turn that ~PAGE_MASK into PAGE_SIZE-1 you get
>
> return (ptr & (((1UL) << 12)-1)) < (((1UL) << 12) / 2);
>
> which removes the self-cancelling negation:
>
> return (ptr & (PAGE_SIZE-1)) < (PAGE_SIZE / 2);
>
> The final asm is the same, though.

I left the original version, because it generates the same output - unless you
feel strongly about this?
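
(To make the check concrete: PGD entries are 8 bytes, so the 4K PGD page
holds 512 of them - entries 0-255 map the lower, userspace half of the
virtual address space and entries 256-511 the kernel half. Worked through
for the two boundary entries:

   entry 255: offset = 255 * 8 = 2040, and 2040 < 2048 -> user
   entry 256: offset = 256 * 8 = 2048, and 2048 < 2048 fails -> kernel

So comparing the pointer's offset within the page against PAGE_SIZE/2 is
exactly the user/kernel split.)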

> > +	} else {
> > +		/*
> > +		 * _PAGE_USER was not set in either the PGD being set
> > +		 * or cleared. All kernel PGDs should be
> > +		 * pre-populated so this should never happen after
> > +		 * boot.
> > +		 */
>
> So maybe do:
>
> WARN_ON_ONCE(system_state == SYSTEM_RUNNING);

Indeed, check added. (system_state becomes SYSTEM_RUNNING only once init is
about to run, so the legitimate boot-time pre-population of the kernel PGDs
won't trigger the warning.)

> > static inline void native_set_p4d(p4d_t *p4dp, p4d_t p4d)
> > {
> > +#if defined(CONFIG_KAISER) && !defined(CONFIG_X86_5LEVEL)
> > +	p4dp->pgd = kaiser_set_shadow_pgd(&p4dp->pgd, p4d.pgd);
> > +#else /* CONFIG_KAISER */
>
> No need for that comment.

Fixed.

> > /*
> >  * Note: we only need 6*8 = 48 bytes for the espfix stack, but round
> > @@ -128,6 +129,22 @@ void __init init_espfix_bsp(void)
> > 	pgd = &init_top_pgt[pgd_index(ESPFIX_BASE_ADDR)];
> > 	p4d = p4d_alloc(&init_mm, pgd, ESPFIX_BASE_ADDR);
> > 	p4d_populate(&init_mm, p4d, espfix_pud_page);
>
> <---- newline here.

Added.

> > 	/* Randomize the locations */
> > 	init_espfix_random();
>
> End of part I of the review of this biggy :)

Below is the delta patch - I'm backmerging it to the core patch.

I'll send out the whole series in the next hour or so; people asked for it so
that they can review the latest version.

Thanks,

Ingo

============>

 arch/x86/include/asm/pgtable_64.h | 9 +++++----
 arch/x86/kernel/espfix_64.c       | 1 +
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index c8c0dc1cfe86..8d725fcb921b 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -148,14 +148,14 @@ static inline void *ptr_set_bit(void *ptr, int bit)
 {
 	unsigned long __ptr = (unsigned long)ptr;
 
-	__ptr |= (1<<bit);
+	__ptr |= BIT(bit);
 	return (void *)__ptr;
 }
 static inline void *ptr_clear_bit(void *ptr, int bit)
 {
 	unsigned long __ptr = (unsigned long)ptr;
 
-	__ptr &= ~(1<<bit);
+	__ptr &= ~BIT(bit);
 	return (void *)__ptr;
 }
 
@@ -255,6 +255,7 @@ static inline pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd)
 		 * pre-populated so this should never happen after
 		 * boot.
 		 */
+		WARN_ON_ONCE(system_state == SYSTEM_RUNNING);
 	}
 #endif
 	/* return the copy of the PGD we want the kernel to use: */
@@ -266,7 +267,7 @@ static inline void native_set_p4d(p4d_t *p4dp, p4d_t p4d)
 {
 #if defined(CONFIG_KAISER) && !defined(CONFIG_X86_5LEVEL)
 	p4dp->pgd = kaiser_set_shadow_pgd(&p4dp->pgd, p4d.pgd);
-#else /* CONFIG_KAISER */
+#else
 	*p4dp = p4d;
 #endif
 }
@@ -284,7 +285,7 @@ static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd)
 {
 #ifdef CONFIG_KAISER
 	*pgdp = kaiser_set_shadow_pgd(pgdp, pgd);
-#else /* CONFIG_KAISER */
+#else
 	*pgdp = pgd;
 #endif
 }
diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c
index 8bb116d73aaa..319033f5bbd9 100644
--- a/arch/x86/kernel/espfix_64.c
+++ b/arch/x86/kernel/espfix_64.c
@@ -129,6 +129,7 @@ void __init init_espfix_bsp(void)
 	pgd = &init_top_pgt[pgd_index(ESPFIX_BASE_ADDR)];
 	p4d = p4d_alloc(&init_mm, pgd, ESPFIX_BASE_ADDR);
 	p4d_populate(&init_mm, p4d, espfix_pud_page);
+
 	/*
 	 * Just copy the top-level PGD that is mapping the espfix
 	 * area to ensure it is mapped into the shadow user page