[PATCH] thermal: intel: Fix unchecked MSR issue

From: Srinivas Pandruvada
Date: Mon Apr 10 2023 - 13:35:26 EST


Some older processors don't allow BIT(13) and BIT(15) in the current
mask set by "THERM_STATUS_CLEAR_CORE_MASK". This results in:

unchecked MSR access error: WRMSR to 0x19c (tried to
write 0x000000000000aaa8) at rIP: 0xffffffff816f66a6
(throttle_active_work+0xa6/0x1d0)

To avoid unchecked MSR issues, check cpuid for each feature and then
form core mask. Do the same for package mask set by
"THERM_STATUS_CLEAR_PKG_MASK".

Introduce functions thermal_intr_core_clear_mask() and
thermal_intr_pkg_clear_mask() to set core and package mask respectively.
These functions are called during initialization.

Fixes: 6fe1e64b6026 ("thermal: intel: Prevent accidental clearing of HFI status")
Reported-by: Rui Salvaterra <rsalvaterra@xxxxxxxxx>
Link: https://lore.kernel.org/lkml/cdf43fb423368ee3994124a9e8c9b4f8d00712c6.camel@xxxxxxxxxxxxxxx/T/
Tested-by: Rui Salvaterra <rsalvaterra@xxxxxxxxx>
Signed-off-by: Srinivas Pandruvada <srinivas.pandruvada@xxxxxxxxxxxxxxx>
Cc: stable@xxxxxxxxxx # 6.2+
---
drivers/thermal/intel/therm_throt.c | 73 ++++++++++++++++++++++++++---
1 file changed, 66 insertions(+), 7 deletions(-)

diff --git a/drivers/thermal/intel/therm_throt.c b/drivers/thermal/intel/therm_throt.c
index 2e22bb82b738..d5047676f3d2 100644
--- a/drivers/thermal/intel/therm_throt.c
+++ b/drivers/thermal/intel/therm_throt.c
@@ -193,8 +193,67 @@ static const struct attribute_group thermal_attr_group = {
#define THERM_THROT_POLL_INTERVAL HZ
#define THERM_STATUS_PROCHOT_LOG BIT(1)

-#define THERM_STATUS_CLEAR_CORE_MASK (BIT(1) | BIT(3) | BIT(5) | BIT(7) | BIT(9) | BIT(11) | BIT(13) | BIT(15))
-#define THERM_STATUS_CLEAR_PKG_MASK (BIT(1) | BIT(3) | BIT(5) | BIT(7) | BIT(9) | BIT(11))
+static u64 def_therm_core_clear_mask;
+static u64 def_therm_pkg_clear_mask;
+
+static void thermal_intr_core_clear_mask(void)
+{
+ if (def_therm_core_clear_mask)
+ return;
+
+ /*
+ * Reference: Intel SDM Volume 4
+ * "Table 2-2. IA-32 Architectural MSRs", MSR 0x19C
+ * IA32_THERM_STATUS.
+ */
+
+ /*
+ * Bit 1, 3, 5: CPUID.01H:EDX[22] = 1. This driver will not
+ * enable interrupts, when 0 as it checks for X86_FEATURE_ACPI.
+ */
+ def_therm_core_clear_mask = (BIT(1) | BIT(3) | BIT(5));
+
+ /*
+ * Bit 7 and 9: Thermal Threshold #1 and #2 log
+ * If CPUID.01H:ECX[8] = 1
+ */
+ if (boot_cpu_has(X86_FEATURE_TM2))
+ def_therm_core_clear_mask |= (BIT(7) | BIT(9));
+
+ /* Bit 11: Power Limitation log (R/WC0) If CPUID.06H:EAX[4] = 1 */
+ if (boot_cpu_has(X86_FEATURE_PLN))
+ def_therm_core_clear_mask |= BIT(11);
+
+ /*
+ * Bit 13: Current Limit log (R/WC0) If CPUID.06H:EAX[7] = 1
+ * Bit 15: Cross Domain Limit log (R/WC0) If CPUID.06H:EAX[7] = 1
+ */
+ if (boot_cpu_has(X86_FEATURE_HWP))
+ def_therm_core_clear_mask |= (BIT(13) | BIT(15));
+}
+
+static void thermal_intr_pkg_clear_mask(void)
+{
+ if (def_therm_pkg_clear_mask)
+ return;
+
+ /*
+ * Reference: Intel SDM Volume 4
+ * "Table 2-2. IA-32 Architectural MSRs", MSR 0x1B1
+ * IA32_PACKAGE_THERM_STATUS.
+ */
+
+ /* All bits except BIT 26 depends on CPUID.06H: EAX[6] = 1 */
+ if (boot_cpu_has(X86_FEATURE_PTS))
+ def_therm_pkg_clear_mask = (BIT(1) | BIT(3) | BIT(5) | BIT(7) | BIT(9) | BIT(11));
+
+ /*
+ * Intel SDM Volume 2A: Thermal and Power Management Leaf
+ * Bit 26: CPUID.06H: EAX[19] = 1
+ */
+ if (boot_cpu_has(X86_FEATURE_HFI))
+ def_therm_pkg_clear_mask |= BIT(26);
+}

/*
* Clear the bits in package thermal status register for bit = 1
@@ -207,13 +266,10 @@ void thermal_clear_package_intr_status(int level, u64 bit_mask)

if (level == CORE_LEVEL) {
msr = MSR_IA32_THERM_STATUS;
- msr_val = THERM_STATUS_CLEAR_CORE_MASK;
+ msr_val = def_therm_core_clear_mask;
} else {
msr = MSR_IA32_PACKAGE_THERM_STATUS;
- msr_val = THERM_STATUS_CLEAR_PKG_MASK;
- if (boot_cpu_has(X86_FEATURE_HFI))
- msr_val |= BIT(26);
-
+ msr_val = def_therm_pkg_clear_mask;
}

msr_val &= ~bit_mask;
@@ -708,6 +764,9 @@ void intel_init_thermal(struct cpuinfo_x86 *c)
h = THERMAL_APIC_VECTOR | APIC_DM_FIXED | APIC_LVT_MASKED;
apic_write(APIC_LVTTHMR, h);

+ thermal_intr_core_clear_mask();
+ thermal_intr_pkg_clear_mask();
+
rdmsr(MSR_IA32_THERM_INTERRUPT, l, h);
if (cpu_has(c, X86_FEATURE_PLN) && !int_pln_enable)
wrmsr(MSR_IA32_THERM_INTERRUPT,
--
2.39.1