Re: Yet another KPTI regression with 4.14.x series in a VM

From: Thomas Gleixner
Date: Sat Jan 13 2018 - 15:46:07 EST


On Sat, 13 Jan 2018, Andy Lutomirski wrote:
> Trying to inventory this stuff scattered all over the place:
>
> #define PTI_PGTABLE_SWITCH_BIT PAGE_SHIFT
> #define PTI_SWITCH_PGTABLES_MASK (1<<PAGE_SHIFT)
> # define X86_CR3_PTI_SWITCH_BIT 11
> #define PTI_SWITCH_MASK
> (PTI_SWITCH_PGTABLES_MASK|(1<<X86_CR3_PTI_SWITCH_BIT))
>
> Blech. I wouldn't be terribly surprised if I missed a few as well. How about:
>
> PTI_USER_PGTABLE_BIT = PAGE_SHIFT
> PTI_USER_PGTABLE_MASK = 1 << PTI_USER_PGTABLE_BIT
> PTI_USER_PCID_BIT = 11
> PTI_USER_PCID_MASK = 1 << PTI_USER_PCID_BIT
> PTI_USER_PGTABLE_AND_PCID_MASK = PTI_USER_PCID_MASK | PTI_USER_PGTABLE_MASK
>
> This naming would make the apparently buggy code look fishy, as it
> should. I will give this a shot some time soon if no one beats me to
> it.

Well, the thing we tripped over is that we trusted the SDM that bit 11 is
ignored. Seems its not and the AMD APM says that reserved bit should be
cleared. Next time I surely stare into both....

So something like the below should make it clear. I've not done the
alternatives thing yet...

Thanks,

tglx

8<-------------------
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -198,8 +198,11 @@ For 32-bit we have the following convent
* PAGE_TABLE_ISOLATION PGDs are 8k. Flip bit 12 to switch between the two
* halves:
*/
-#define PTI_SWITCH_PGTABLES_MASK (1<<PAGE_SHIFT)
-#define PTI_SWITCH_MASK (PTI_SWITCH_PGTABLES_MASK|(1<<X86_CR3_PTI_SWITCH_BIT))
+#define PTI_USER_PGTABLE_BIT PAGE_SHIFT
+#define PTI_USER_PGTABLE_MASK (1 << PTI_USER_PGTABLE_BIT)
+#define PTI_USER_PCID_BIT X86_CR3_PTI_PCID_USER_BIT
+#define PTI_USER_PCID_MASK (1 << PTI_USER_PCID_BIT)
+#define PTI_USER_PGTABLE_AND_PCID_MASK (PTI_USER_PCID_MASK | PTI_USER_PGTABLE_MASK)

.macro SET_NOFLUSH_BIT reg:req
bts $X86_CR3_PCID_NOFLUSH_BIT, \reg
@@ -208,7 +211,7 @@ For 32-bit we have the following convent
.macro ADJUST_KERNEL_CR3 reg:req
ALTERNATIVE "", "SET_NOFLUSH_BIT \reg", X86_FEATURE_PCID
/* Clear PCID and "PAGE_TABLE_ISOLATION bit", point CR3 at kernel pagetables: */
- andq $(~PTI_SWITCH_MASK), \reg
+ andq $(~PTI_USER_PGTABLE_AND_PCID_MASK), \reg
.endm

.macro SWITCH_TO_KERNEL_CR3 scratch_reg:req
@@ -239,15 +242,18 @@ For 32-bit we have the following convent
/* Flush needed, clear the bit */
btr \scratch_reg, THIS_CPU_user_pcid_flush_mask
movq \scratch_reg2, \scratch_reg
- jmp .Lwrcr3_\@
+ jmp .Lwrcr3_pcid_\@

.Lnoflush_\@:
movq \scratch_reg2, \scratch_reg
SET_NOFLUSH_BIT \scratch_reg

+.Lwcr3_pcid_\@:
+ orq $(PTI_USER_PCID_MASK), \scratch_reg
+
.Lwrcr3_\@:
/* Flip the PGD and ASID to the user version */
- orq $(PTI_SWITCH_MASK), \scratch_reg
+ orq $(PTI_USER_PGTABLE_MASK), \scratch_reg
mov \scratch_reg, %cr3
.Lend_\@:
.endm
@@ -272,7 +278,7 @@ For 32-bit we have the following convent
*
* That indicates a kernel CR3 value, not a user CR3.
*/
- testq $(PTI_SWITCH_MASK), \scratch_reg
+ testq $(PTI_USER_PGTABLE_MASK), \scratch_reg
jz .Ldone_\@

ADJUST_KERNEL_CR3 \scratch_reg
@@ -290,7 +296,7 @@ For 32-bit we have the following convent
* KERNEL pages can always resume with NOFLUSH as we do
* explicit flushes.
*/
- bt $X86_CR3_PTI_SWITCH_BIT, \save_reg
+ bt $PTI_USER_PGTABLE_BIT, \save_reg
jnc .Lnoflush_\@

/*
--- a/arch/x86/include/asm/processor-flags.h
+++ b/arch/x86/include/asm/processor-flags.h
@@ -40,7 +40,7 @@
#define CR3_NOFLUSH BIT_ULL(63)

#ifdef CONFIG_PAGE_TABLE_ISOLATION
-# define X86_CR3_PTI_SWITCH_BIT 11
+# define X86_CR3_PTI_PCID_USER_BIT 11
#endif

#else
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -81,13 +81,13 @@ static inline u16 kern_pcid(u16 asid)
* Make sure that the dynamic ASID space does not confict with the
* bit we are using to switch between user and kernel ASIDs.
*/
- BUILD_BUG_ON(TLB_NR_DYN_ASIDS >= (1 << X86_CR3_PTI_SWITCH_BIT));
+ BUILD_BUG_ON(TLB_NR_DYN_ASIDS >= (1 << X86_CR3_PTI_PCID_USER_BIT));

/*
* The ASID being passed in here should have respected the
* MAX_ASID_AVAILABLE and thus never have the switch bit set.
*/
- VM_WARN_ON_ONCE(asid & (1 << X86_CR3_PTI_SWITCH_BIT));
+ VM_WARN_ON_ONCE(asid & (1 << X86_CR3_PTI_PCID_USER_BIT));
#endif
/*
* The dynamically-assigned ASIDs that get passed in are small
@@ -112,7 +112,7 @@ static inline u16 user_pcid(u16 asid)
{
u16 ret = kern_pcid(asid);
#ifdef CONFIG_PAGE_TABLE_ISOLATION
- ret |= 1 << X86_CR3_PTI_SWITCH_BIT;
+ ret |= 1 << X86_CR3_PTI_PCID_USER_BIT;
#endif
return ret;
}