Re: runtime regression with "x86/mm/pat: Emulate PAT when it is disabled"

From: Toshi Kani
Date: Thu Mar 10 2016 - 17:25:13 EST


On Thu, 2016-03-10 at 22:07 +0100, Borislav Petkov wrote:
> On Thu, Mar 10, 2016 at 01:24:11PM -0700, Toshi Kani wrote:
> > I am not familiar with PPRO_FEATURES,
>
> That's the feature bits of the "qemu32" model, and others, in qemu.
>
> > but shouldn't 'flags' in /proc/cpuinfo show "pat" when X86_FEATURE_PAT
> > is set?
>
> static void early_init_intel(struct cpuinfo_x86 *c)
> ...
>
>         /*
>          * There is a known erratum on Pentium III and Core Solo
>          * and Core Duo CPUs.
>          * " Page with PAT set to WC while associated MTRR is UC
>          *   may consolidate to UC "
>          * Because of this erratum, it is better to stick with
>          * setting WC in MTRR rather than using PAT on these CPUs.
>          *
>          * Enable PAT WC only on P4, Core 2 or later CPUs.
>          */
>         if (c->x86 == 6 && c->x86_model < 15)
>                 clear_cpu_cap(c, X86_FEATURE_PAT);
> ---
>
> which also gives a hint as to how we should fix this: pat_enabled()
> needs to look at that feature bit too:

I see. I will take a look.

> ---
> diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
> index faec01e7a17d..359c30d9a78c 100644
> --- a/arch/x86/mm/pat.c
> +++ b/arch/x86/mm/pat.c
> @@ -56,7 +56,7 @@ early_param("nopat", nopat);
>  
>  bool pat_enabled(void)
>  {
> -	return !!__pat_enabled;
> +	return !!__pat_enabled && static_cpu_has(X86_FEATURE_PAT);
>  }
>  EXPORT_SYMBOL_GPL(pat_enabled);
> ---
>
> Makes sense?

Yes, I agree that pat_enabled() needs to check the PAT feature bit. For some
reason, static_cpu_has(X86_FEATURE_PAT) returns 0 while cpu_has_pat returns
1 in my testing... I need to check this.

> > pat_init() is being called as part of MTRR setup because PAT
> > initialization requires the same CPU rendezvous operation implemented
> > in the MTRR code.
>
> ... which means, PAT depends on MTRR being present.

Yes, and we need more changes to handle this dependency since MTRR does not
call pat_init() when it is disabled.

Attached are the changes I am working on now. I will include your changes, and
send them out once I finish testing. Let me know if you have any
suggestions.

Thanks,
-Toshi
--- Begin Message --- ---
arch/x86/include/asm/pat.h | 1 +
arch/x86/mm/pat.c | 84 +++++++++++++++++++++++++++-----------------
2 files changed, 52 insertions(+), 33 deletions(-)

diff --git a/arch/x86/include/asm/pat.h b/arch/x86/include/asm/pat.h
index ca6c228..016142b 100644
--- a/arch/x86/include/asm/pat.h
+++ b/arch/x86/include/asm/pat.h
@@ -5,6 +5,7 @@
#include <asm/pgtable_types.h>

bool pat_enabled(void);
+void pat_disable(const char *reason);
extern void pat_init(void);
void pat_init_cache_modes(u64);

diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index f4ae536..af2b3d8 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -40,11 +40,19 @@
static bool boot_cpu_done;

static int __read_mostly __pat_enabled = IS_ENABLED(CONFIG_X86_PAT);
+static void pat_disable_init(void);

-static inline void pat_disable(const char *reason)
+void pat_disable(const char *reason)
{
+ if (boot_cpu_done) {
+ pr_info("x86/PAT: PAT cannot be disabled after initialized\n");
+ return;
+ }
+
__pat_enabled = 0;
pr_info("x86/PAT: %s\n", reason);
+
+ pat_disable_init();
}

static int __init nopat(char *str)
@@ -207,9 +215,6 @@ static void pat_bsp_init(u64 pat)
return;
}

- if (!pat_enabled())
- goto done;
-
rdmsrl(MSR_IA32_CR_PAT, tmp_pat);
if (!tmp_pat) {
pat_disable("PAT MSR is 0, disabled.");
@@ -218,15 +223,11 @@ static void pat_bsp_init(u64 pat)

wrmsrl(MSR_IA32_CR_PAT, pat);

-done:
pat_init_cache_modes(pat);
}

static void pat_ap_init(u64 pat)
{
- if (!pat_enabled())
- return;
-
if (!cpu_has_pat) {
/*
* If this happens we are on a secondary CPU, but switched to
@@ -238,38 +239,55 @@ static void pat_ap_init(u64 pat)
wrmsrl(MSR_IA32_CR_PAT, pat);
}

+static void pat_disable_init(void)
+{
+ u64 pat;
+ static int disable_init_done = 0;
+
+ if (disable_init_done)
+ return;
+
+ /*
+ * No PAT. Emulate the PAT table that corresponds to the two
+ * cache bits, PWT (Write Through) and PCD (Cache Disable). This
+ * setup is the same as the BIOS default setup when the system
+ * has PAT but the "nopat" boot option has been specified. This
+ * emulated PAT table is used when MSR_IA32_CR_PAT returns 0.
+ *
+ * PTE encoding:
+ *
+ * PCD
+ * |PWT PAT
+ * || slot
+ * 00 0 WB : _PAGE_CACHE_MODE_WB
+ * 01 1 WT : _PAGE_CACHE_MODE_WT
+ * 10 2 UC-: _PAGE_CACHE_MODE_UC_MINUS
+ * 11 3 UC : _PAGE_CACHE_MODE_UC
+ *
+ * NOTE: When WC or WP is used, it is redirected to UC- per
+ * the default setup in __cachemode2pte_tbl[].
+ */
+ pat = PAT(0, WB) | PAT(1, WT) | PAT(2, UC_MINUS) | PAT(3, UC) |
+ PAT(4, WB) | PAT(5, WT) | PAT(6, UC_MINUS) | PAT(7, UC);
+
+ pat_init_cache_modes(pat);
+
+ disable_init_done = 1;
+}
+
void pat_init(void)
{
u64 pat;
struct cpuinfo_x86 *c = &boot_cpu_data;

if (!pat_enabled()) {
- /*
- * No PAT. Emulate the PAT table that corresponds to the two
- * cache bits, PWT (Write Through) and PCD (Cache Disable). This
- * setup is the same as the BIOS default setup when the system
- * has PAT but the "nopat" boot option has been specified. This
- * emulated PAT table is used when MSR_IA32_CR_PAT returns 0.
- *
- * PTE encoding:
- *
- * PCD
- * |PWT PAT
- * || slot
- * 00 0 WB : _PAGE_CACHE_MODE_WB
- * 01 1 WT : _PAGE_CACHE_MODE_WT
- * 10 2 UC-: _PAGE_CACHE_MODE_UC_MINUS
- * 11 3 UC : _PAGE_CACHE_MODE_UC
- *
- * NOTE: When WC or WP is used, it is redirected to UC- per
- * the default setup in __cachemode2pte_tbl[].
- */
- pat = PAT(0, WB) | PAT(1, WT) | PAT(2, UC_MINUS) | PAT(3, UC) |
- PAT(4, WB) | PAT(5, WT) | PAT(6, UC_MINUS) | PAT(7, UC);
+ pat_disable_init();
+ return;
+ }

- } else if ((c->x86_vendor == X86_VENDOR_INTEL) &&
- (((c->x86 == 0x6) && (c->x86_model <= 0xd)) ||
- ((c->x86 == 0xf) && (c->x86_model <= 0x6)))) {
+ if ((c->x86_vendor == X86_VENDOR_INTEL) &&
+ (((c->x86 == 0x6) && (c->x86_model <= 0xd)) ||
+ ((c->x86 == 0xf) && (c->x86_model <= 0x6)))) {
/*
* PAT support with the lower four entries. Intel Pentium 2,
* 3, M, and 4 are affected by PAT errata, which makes the

--- End Message ---
--- Begin Message --- ---
arch/x86/kernel/cpu/mtrr/generic.c | 24 ++++++++++++++----------
arch/x86/kernel/cpu/mtrr/main.c | 13 ++++++++++++-
arch/x86/kernel/cpu/mtrr/mtrr.h | 1 +
3 files changed, 27 insertions(+), 11 deletions(-)

diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index c870af1..136ae86 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -444,11 +444,24 @@ static void __init print_mtrr_state(void)
pr_debug("TOM2: %016llx aka %lldM\n", mtrr_tom2, mtrr_tom2>>20);
}

+/* PAT setup for BP. We need to go through sync steps here */
+void __init mtrr_bp_pat_init(void)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ prepare_set();
+
+ pat_init();
+
+ post_set();
+ local_irq_restore(flags);
+}
+
/* Grab all of the MTRR state for this CPU into *state */
bool __init get_mtrr_state(void)
{
struct mtrr_var_range *vrs;
- unsigned long flags;
unsigned lo, dummy;
unsigned int i;

@@ -481,15 +494,6 @@ bool __init get_mtrr_state(void)

mtrr_state_set = 1;

- /* PAT setup for BP. We need to go through sync steps here */
- local_irq_save(flags);
- prepare_set();
-
- pat_init();
-
- post_set();
- local_irq_restore(flags);
-
return !!(mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED);
}

diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index 5c3d149..d9e91f1 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -752,6 +752,9 @@ void __init mtrr_bp_init(void)
/* BIOS may override */
__mtrr_enabled = get_mtrr_state();

+ if (mtrr_enabled())
+ mtrr_bp_pat_init();
+
if (mtrr_cleanup(phys_addr)) {
changed_by_mtrr_cleanup = 1;
mtrr_if->set_all();
@@ -759,8 +762,16 @@ void __init mtrr_bp_init(void)
}
}

- if (!mtrr_enabled())
+ if (!mtrr_enabled()) {
pr_info("MTRR: Disabled\n");
+
+ /*
+ * PAT initialization relies on MTRR's rendezvous handler.
+ * Disable PAT until the handler can initialize both features
+ * independently.
+ */
+ pat_disable("PAT disabled by MTRR");
+ }
}

void mtrr_ap_init(void)
diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h
index 951884d..6c7ced0 100644
--- a/arch/x86/kernel/cpu/mtrr/mtrr.h
+++ b/arch/x86/kernel/cpu/mtrr/mtrr.h
@@ -52,6 +52,7 @@ void set_mtrr_prepare_save(struct set_mtrr_context *ctxt);
void fill_mtrr_var_range(unsigned int index,
u32 base_lo, u32 base_hi, u32 mask_lo, u32 mask_hi);
bool get_mtrr_state(void);
+void mtrr_bp_pat_init(void);

extern void set_mtrr_ops(const struct mtrr_ops *ops);


--- End Message ---