[Patch V1] x86, mce: CPU synchronization for broadcast MCEs is surprised by offline CPUs

From: Ashok Raj
Date: Thu Sep 10 2015 - 19:28:20 EST


Linux supports logical CPU offline, as shown below:

# echo 0 > /sys/devices/system/cpu/cpuX/online

The hardware doesn't know about OS offlining, so it will still
broadcast any MCE to all CPUs in the system, even to a CPU that is
parked in cpu_dead.

mce_start() and mce_end() should use cpu_present_map to count the CPUs
taking part in the rendezvous. An offline CPU is still in the MCE
broadcast domain, so it also executes do_machine_check() and increments
mce_callin. As a result the callin count is always off by the number of
offlined CPUs compared to the number of CPUs the rendezvous expects.
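
For reference, every CPU entering the handler runs a wait loop in
mce_start() that is roughly the following (a simplified sketch of the
upstream logic, not an exact quote):

        int cpus = num_online_cpus();   /* today: only online CPUs counted */

        order = atomic_inc_return(&mce_callin);

        /* wait until every expected CPU has called in */
        while (atomic_read(&mce_callin) != cpus) {
                if (mce_timed_out(&timeout))
                        return -1;
                ndelay(SPINUNIT);
        }

With offlined CPUs also incrementing mce_callin, the counter can pass
num_online_cpus() before every online CPU has observed the match, so
CPUs spin until they hit the timeout path.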

This patch does the following:

- Allow MCE logging from logically offlined CPUs.
- Ensure an offline CPU will not be chosen as the rendezvous master CPU.
- Collect logs from offline CPUs and report them via the rendezvous master.

Signed-off-by: Ashok Raj <ashok.raj@xxxxxxxxx>
Reviewed-by: Tony Luck <tony.luck@xxxxxxxxx>
---
arch/x86/kernel/cpu/mcheck/mce.c | 101 +++++++++++++++++++++++++++++++++++++--
1 file changed, 96 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 69c7e3c..7c6b8b2 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -769,6 +769,63 @@ out:
}

/*
+ * We can't call mce_log() from an offline CPU because it uses RCU for
+ * synchronization (and may call arbitrary driver code via
+ * x86_mce_decoder_chain, which may also be surprised at being called
+ * from an offline CPU). Provide enough buffer space to hold a few
+ * errors that can be picked up later by the rendezvous master. This
+ * is supposed to be really rare, so on overflow we only count the
+ * lost entries instead of growing the buffer.
+ */
+
+#define OFFLINE_CPU_LOG_LEN 16
+
+struct offline_cpu_mce {
+ unsigned short head;
+ unsigned short tail;
+ struct mce mce_log[OFFLINE_CPU_LOG_LEN];
+};
+
+static struct offline_cpu_mce offline_mce;
+static unsigned int offline_mce_overflow;
+
+/*
+ * Add MCEs discovered on an offline CPU; they will be logged later by
+ * the MCE rendezvous master. No lock is required, since MCEs are
+ * processed one CPU at a time, sequenced by the rendezvous master CPU.
+ * Safe to be called only from the MCE handler.
+ */
+static int offline_mce_add(struct mce *m)
+{
+ unsigned next;
+
+ next = (offline_mce.tail + 1) % OFFLINE_CPU_LOG_LEN;
+ if (next == offline_mce.head) {
+ offline_mce_overflow++;
+ return -1;
+ }
+
+ offline_mce.mce_log[offline_mce.tail] = *m;
+ offline_mce.tail = next;
+ return 0;
+}
+
+static int offline_mce_get(struct mce *m)
+{
+ int ret = 0;
+
+ if (offline_mce.head == offline_mce.tail)
+ goto out;
+
+ *m = offline_mce.mce_log[offline_mce.head];
+ offline_mce.head = (offline_mce.head + 1) % OFFLINE_CPU_LOG_LEN;
+
+ ret = 1;
+out:
+ return ret;
+}
+
+/*
* The Monarch's reign. The Monarch is the CPU who entered
* the machine check handler first. It waits for the others to
* raise the exception too and then grades them. When any
@@ -799,13 +856,31 @@ static void mce_reign(void)
int global_worst = 0;
char *msg = NULL;
char *nmsg = NULL;
+ struct mce off_mce;
+
+
+ /*
+ * If there are any MCEs logged by offline CPUs, gather them
+ * and report them via mce_log().
+ */
+ while (offline_mce_get(&off_mce))
+ mce_log(&off_mce);
+
+ if (offline_mce_overflow) {
+ pr_info(HW_ERR "Lost %u errors logged by offline CPUs\n",
+ offline_mce_overflow);
+ offline_mce_overflow = 0;
+ }

/*
* This CPU is the Monarch and the other CPUs have run
* through their handlers.
* Grade the severity of the errors of all the CPUs.
+ * Intel CPUs broadcast MCEs to all CPUs that have been booted,
+ * so even CPUs merely parked by the OS for logical offline
+ * must run through the machine check handler.
*/
- for_each_possible_cpu(cpu) {
+ for_each_present_cpu(cpu) {
int severity = mce_severity(&per_cpu(mces_seen, cpu),
mca_cfg.tolerant,
&nmsg, true);
@@ -841,7 +916,7 @@ static void mce_reign(void)
* Now clear all the mces_seen so that they don't reappear on
* the next mce.
*/
- for_each_possible_cpu(cpu)
+ for_each_present_cpu(cpu)
memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce));
}

@@ -857,8 +932,9 @@ static atomic_t global_nwo;
static int mce_start(int *no_way_out)
{
int order;
- int cpus = num_online_cpus();
+ int cpus = num_present_cpus();
u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;
+ unsigned int this_cpu = smp_processor_id();

if (!timeout)
return -1;
@@ -868,6 +944,16 @@ static int mce_start(int *no_way_out)
* global_nwo should be updated before mce_callin
*/
smp_wmb();
+
+ /*
+ * If this CPU is offline, make sure it won't be elected as the
+ * rendezvous master: wait for another CPU to call in first.
+ */
+ if (cpu_is_offline(this_cpu)) {
+ while (!atomic_read(&mce_callin))
+ ndelay(SPINUNIT);
+ }
+
order = atomic_inc_return(&mce_callin);

/*
@@ -938,7 +1024,7 @@ static int mce_end(int order)

if (order == 1) {
/* CHECKME: Can this race with a parallel hotplug? */
- int cpus = num_online_cpus();
+ int cpus = num_present_cpus();

/*
* Monarch: Wait for everyone to go through their scanning
@@ -1033,6 +1119,8 @@ void do_machine_check(struct pt_regs *regs, long error_code)
int i;
int worst = 0;
int severity;
+ unsigned int cpu = smp_processor_id();
+
/*
* Establish sequential order between the CPUs entering the machine
* check handler.
@@ -1153,7 +1241,10 @@ void do_machine_check(struct pt_regs *regs, long error_code)
if (severity == MCE_AO_SEVERITY && mce_usable_address(&m))
mce_ring_add(m.addr >> PAGE_SHIFT);

- mce_log(&m);
+ if (cpu_is_offline(cpu))
+ offline_mce_add(&m);
+ else
+ mce_log(&m);

if (severity > worst) {
*final = m;
--
2.4.3
