[PATCH] x86/mce: drop cpu_missing since we have more capable mce_missing_cpus

From: Zhaolong Zhang
Date: Mon Nov 08 2021 - 04:00:41 EST


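Drop the cpu_missing flag: mce_missing_cpus already records which CPUs did
not answer the MCE broadcast, so the flag is redundant.

Also move the mce_missing_cpus check into mce_panic(), because we don't want
to lose the missing-CPU information when mca_cfg.tolerant > 1 and no_way_out
is set.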

Signed-off-by: Zhaolong Zhang <zhangzl2013@xxxxxxx>
---
arch/x86/kernel/cpu/mce/core.c | 38 ++++++++++++++++++++--------------
1 file changed, 22 insertions(+), 16 deletions(-)

diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 50a3e455cded..0bb59e68a457 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -99,7 +99,6 @@ struct mca_config mca_cfg __read_mostly = {

static DEFINE_PER_CPU(struct mce, mces_seen);
static unsigned long mce_need_notify;
-static int cpu_missing;

/*
* MCA banks polled by the period polling timer for corrected events.
@@ -253,6 +252,12 @@ static atomic_t mce_panicked;
static int fake_panic;
static atomic_t mce_fake_panicked;

+/*
+ * Track which CPUs entered the MCA broadcast synchronization and which not in
+ * order to print holdouts.
+ */
+static cpumask_t mce_missing_cpus = CPU_MASK_ALL;
+
/* Panic in progress. Enable interrupts and wait for final IPI */
static void wait_for_panic(void)
{
@@ -314,8 +319,13 @@ static void mce_panic(const char *msg, struct mce *final, char *exp)
if (!apei_err)
apei_err = apei_write_mce(final);
}
- if (cpu_missing)
- pr_emerg(HW_ERR "Some CPUs didn't answer in synchronization\n");
+ /*
+ * mce_missing_cpus being equal to cpu_online_mask means it is in its reset state and no timeout has happened.
+ */
+ if (!cpumask_equal(cpu_online_mask, &mce_missing_cpus) &&
+ cpumask_and(&mce_missing_cpus, cpu_online_mask, &mce_missing_cpus))
+ pr_emerg(HW_ERR "CPUs not responding to MCE broadcast (may include false positives): %*pbl\n",
+ cpumask_pr_args(&mce_missing_cpus));
if (exp)
pr_emerg(HW_ERR "Machine check: %s\n", exp);
if (!fake_panic) {
@@ -880,12 +890,6 @@ static atomic_t mce_executing;
*/
static atomic_t mce_callin;

-/*
- * Track which CPUs entered the MCA broadcast synchronization and which not in
- * order to print holdouts.
- */
-static cpumask_t mce_missing_cpus = CPU_MASK_ALL;
-
/*
* Check if a timeout waiting for other CPUs happened.
*/
@@ -904,12 +908,8 @@ static int mce_timed_out(u64 *t, const char *msg)
goto out;
if ((s64)*t < SPINUNIT) {
if (mca_cfg.tolerant <= 1) {
- if (cpumask_and(&mce_missing_cpus, cpu_online_mask, &mce_missing_cpus))
- pr_emerg("CPUs not responding to MCE broadcast (may include false positives): %*pbl\n",
- cpumask_pr_args(&mce_missing_cpus));
mce_panic(msg, NULL, NULL);
}
- cpu_missing = 1;
return 1;
}
*t -= SPINUNIT;
@@ -1079,8 +1079,10 @@ static int mce_end(int order)

if (!timeout)
goto reset;
- if (order < 0)
+ if (order < 0) {
+ timeout = 0;
goto reset;
+ }

/*
* Allow others to run.
@@ -1128,7 +1130,12 @@ static int mce_end(int order)
reset:
atomic_set(&global_nwo, 0);
atomic_set(&mce_callin, 0);
- cpumask_setall(&mce_missing_cpus);
+ /*
+ * Don't reset mce_missing_cpus if a timeout occurred (see mce_timed_out()),
+ * so that mce_panic() can still report the CPUs that did not respond.
+ */
+ if (!((s64)timeout < SPINUNIT))
+ cpumask_setall(&mce_missing_cpus);
barrier();

/*
@@ -2720,7 +2727,6 @@ struct dentry *mce_get_debugfs_dir(void)

static void mce_reset(void)
{
- cpu_missing = 0;
atomic_set(&mce_fake_panicked, 0);
atomic_set(&mce_executing, 0);
atomic_set(&mce_callin, 0);
--
2.27.0