[PATCH 10/10] MCE: Add Action-Required support

From: Luck, Tony
Date: Thu Jun 09 2011 - 17:38:52 EST


From: Tony Luck <tony.luck@xxxxxxxxx>

Implement core MCA recovery. This is used for errors
that happen in the current execution context.

The kernel has to first pass the error information
to a function running on the current process stack.
This is done using task_return_notifier_register().

Only errors that occur in user mode are handled for now. Later
we may be able to handle some kernel cases (e.g. when the
kernel is in copy_*_user()).

Based on some original code by Andi Kleen.

Signed-off-by: Tony Luck <tony.luck@xxxxxxxxx>
---
arch/x86/kernel/cpu/mcheck/mce-severity.c | 35 +++++++-
arch/x86/kernel/cpu/mcheck/mce.c | 118 +++++++++++++++++++++++++++--
2 files changed, 142 insertions(+), 11 deletions(-)

diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c
index 352d16a..fe8a28c 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-severity.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c
@@ -13,6 +13,7 @@
#include <linux/seq_file.h>
#include <linux/init.h>
#include <linux/debugfs.h>
+#include <linux/module.h>
#include <asm/mce.h>

#include "mce-internal.h"
@@ -54,6 +55,9 @@ static struct severity {
{ .mcgmask = x, .mcgres = res, SEV(s), .msg = m, ## r }
#define MASK(x, y, s, m, r...) \
{ .mask = x, .result = y, SEV(s), .msg = m, ## r }
+#define ARMASK(x, y, s, m, r...) /* MASK() fields plus .mcgmask = MCG_STATUS_RIPV, .mcgres = 0 */ \
+ { .mcgmask = MCG_STATUS_RIPV, .mcgres = 0, \
+ .mask = x, .result = y, SEV(s), .msg = m, ## r }
#define MCI_UC_S (MCI_STATUS_UC|MCI_STATUS_S)
#define MCI_UC_SAR (MCI_STATUS_UC|MCI_STATUS_S|MCI_STATUS_AR)
#define MCACOD 0xffff
@@ -67,7 +71,7 @@ static struct severity {
MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, 0, PANIC,
"Neither restart nor error IP"),
MCGMASK(MCG_STATUS_RIPV, 0, PANIC, "In kernel and no restart IP",
- KERNEL),
+ KERNEL, NOSER),
BITCLR(MCI_STATUS_UC, KEEP, "Corrected error", NOSER),

/* ignore OVER for UCNA */
@@ -77,10 +81,16 @@ static struct severity {
"Illegal combination (UCNA with AR=1)", SER),
MASK(MCI_STATUS_S, 0, KEEP, "Non signalled machine check", SER),

- /* AR add known MCACODs here */
MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_OVER|MCI_UC_SAR, PANIC,
"Action required with lost events", SER),
- MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_SAR, PANIC,
+
+	/* known AR MCACODs (0x134, 0x150); code field selected with MCACOD: */
+	ARMASK(MCI_UC_SAR|MCI_STATUS_OVER|MCACOD, MCI_UC_SAR|0x134, AR,
+		"Action required: data load error", SER),
+	ARMASK(MCI_UC_SAR|MCI_STATUS_OVER|MCACOD, MCI_UC_SAR|0x150, AR,
+		"Action required: instruction fetch error", SER),
+
+ ARMASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_SAR, PANIC,
"Action required; unknown MCACOD", SER),

/* known AO MCACODs: */
@@ -89,6 +99,7 @@ static struct severity {
MASK(MCI_UC_SAR|MCI_STATUS_OVER|MCACOD, MCI_UC_S|0x17a, AO,
"Action optional: last level cache writeback error", SER),

+
MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S, SOME,
"Action optional unknown MCACOD", SER),
MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S|MCI_STATUS_OVER, SOME,
@@ -110,6 +121,17 @@ static int error_context(struct mce *m)
return IN_KERNEL;
}

+/* Kernel-context AR stays AR when tolerant >= 2, or when EIPV is set and
+ * search_exception_tables() finds the non-zero m->ip; otherwise panic. */
+static int kernel_ar_recoverable(struct mce *m, int tolerant)
+{
+	int have_ip = (m->mcgstatus & MCG_STATUS_EIPV) && m->ip;
+
+	if (tolerant >= 2 || (have_ip && search_exception_tables(m->ip)))
+		return MCE_AR_SEVERITY;
+	return MCE_PANIC_SEVERITY;
+}
+
int mce_severity(struct mce *a, int tolerant, char **msg)
{
enum context ctx = error_context(a);
@@ -129,9 +151,12 @@ int mce_severity(struct mce *a, int tolerant, char **msg)
if (msg)
*msg = s->msg;
s->covered = 1;
- if (s->sev >= MCE_UC_SEVERITY && ctx == IN_KERNEL) {
- if (panic_on_oops || tolerant < 1)
+ if (ctx == IN_KERNEL) {
+ if (s->sev >= MCE_UC_SEVERITY &&
+ (panic_on_oops || tolerant < 1))
return MCE_PANIC_SEVERITY;
+ if (s->sev == MCE_AR_SEVERITY)
+ return kernel_ar_recoverable(a, tolerant);
}
return s->sev;
}
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 9c72245..a7a8c53 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -80,6 +80,20 @@ static void mce_do_notify(struct user_return_notifier *urn);
static DEFINE_PER_CPU(struct mce_notify, mce_notify);

/*
+ * Task return notifiers are used for "action required"
+ * recovery of tasks - i.e. we prevent return to the task
+ * that encountered the machine check, but we ensure that
+ * we process the error in task context.
+ */
+struct task_notify {
+ struct user_return_notifier urn; /* registered by mce_action_required(); mce_do_task() maps it back with container_of() */
+ unsigned long pfn; /* m->addr >> PAGE_SHIFT, stored by mce_action_required() */
+ atomic_t inuse; /* 0 = free; set to 1 by atomic_cmpxchg() when claimed, reset to 0 in mce_do_task() */
+};
+static struct task_notify task_notifier[NR_CPUS]; /* slot pool searched linearly by mce_action_required() */
+static void mce_do_task(struct user_return_notifier *urn);
+
+/*
* Tolerant levels:
* 0: always panic on uncorrected errors, log corrected errors
* 1: panic or SIGBUS on uncorrected errors, log corrected errors
@@ -975,6 +989,84 @@ static void mce_clear_state(unsigned long *toclear)
}
}

+/* Weak fallback for kernels built without hwpoison: always fails with -1. */
+int __attribute__((weak))
+__memory_failure(unsigned long pfn, int vector, int precount)
+{
+	return -1;
+}
+
+/*
+ * Uncorrected error for current process: claim a free task_notifier[] slot and
+ * arm mce_do_task(). No slot is touched after mce_panic(), should it return.
+ */
+static void mce_action_required(struct mce *m, char *msg, struct pt_regs *regs)
+{
+	char *why = "Too many concurrent errors";	/* used if no slot is free */
+	int i;
+
+	if (!mce_usable_address(m))
+		why = "No address for Action-Required Machine Check";
+	else if (!(m->mcgstatus & MCG_STATUS_EIPV))
+		why = "No EIPV for Action-Required Machine Check";
+	else
+		for (i = 0; i < NR_CPUS; i++) {
+			if (atomic_cmpxchg(&task_notifier[i].inuse, 0, 1))
+				continue;	/* slot already claimed */
+			task_notifier[i].urn.on_user_return = mce_do_task;
+			task_notifier[i].pfn = m->addr >> PAGE_SHIFT;
+			task_return_notifier_register(&task_notifier[i].urn);
+			return;
+		}
+	mce_panic(why, m, msg);
+}
+
+#undef pr_fmt /* from here pr_*() output gets an "MCE: <comm>:<pid> " prefix and a trailing newline */
+#define pr_fmt(x) "MCE: %s:%d " x "\n", current->comm, current->pid
+#define PADDR(x) ((u64)(x) << PAGE_SHIFT) /* widens x to u64, then shifts it left by PAGE_SHIFT */
+
+/*
+ * Recovery was not successful: make sure @me at least gets a SIGBUS.
+ * Nothing is done if one is already pending; @pfn is currently unused.
+ */
+static void ar_fallback(struct task_struct *me, unsigned long pfn)
+{
+	int have_sigbus = signal_pending(me) &&
+			  sigismember(&me->pending.signal, SIGBUS);
+
+	if (!have_sigbus) {
+		/*
+		 * hwpoison did not send a proper SIGBUS, so force a plain one.
+		 * The virtual address is not known here, so no details.
+		 */
+		force_sig(SIGBUS, me);
+		pr_err("Killed due to action-required memory corruption");
+	}
+}
+
+/*
+ * Runs on the process stack for an action-required error. The page is
+ * handed to hwpoison through __memory_failure(); a negative result means
+ * it was not recovered, and ar_fallback() then makes sure SIGBUS is raised.
+ *
+ * Logic changes here should be reflected in kernel_ar_recoverable().
+ */
+static void handle_action_required(unsigned long pfn)
+{
+	pr_err("Uncorrected hardware memory error in user-access at %llx",
+	       PADDR(pfn));
+
+	if (__memory_failure(pfn, MCE_VECTOR, 0) >= 0) {
+		pr_err("Memory error recovered");
+		return;
+	}
+	pr_err("Memory error not recovered");
+	ar_fallback(current, pfn);
+}
+
+#undef pr_fmt /* drop the "MCE: <comm>:<pid> " prefix again */
+#define pr_fmt(x) x /* identity: the format string is passed through unchanged */
+
/*
* The actual machine check handler. This only handles real
* exceptions when something got corrupted coming in through int 18.
@@ -1086,12 +1178,6 @@ void do_machine_check(struct pt_regs *regs, long error_code)
continue;
}

- /*
- * Kill on action required.
- */
- if (severity == MCE_AR_SEVERITY)
- kill_it = 1;
-
mce_read_aux(&m, i);

/*
@@ -1136,6 +1222,15 @@ void do_machine_check(struct pt_regs *regs, long error_code)
mce_panic("Fatal machine check on current CPU", &m, msg);

/*
+ * Do recovery in current process if needed. This has to be delayed
+ * until we're back on the process stack.
+ */
+ if (worst == MCE_AR_SEVERITY) {
+ mce_action_required(&m, msg, regs);
+ kill_it = 0;
+ }
+
+ /*
* If the error seems to be unrecoverable, something should be
* done. Try to kill as little as possible. If we can kill just
* one task, do that. If the user has set the tolerance very
@@ -1194,6 +1289,17 @@ static void mce_do_notify(struct user_return_notifier *urn)
mce_process_ring();
}

+/* Task-return callback: copy the pfn, free the slot, then run recovery. */
+static void mce_do_task(struct user_return_notifier *urn)
+{
+	struct task_notify *slot = container_of(urn, struct task_notify, urn);
+	unsigned long bad_pfn = slot->pfn;	/* read before the slot is freed */
+
+	task_return_notifier_unregister(&slot->urn);
+	atomic_set(&slot->inuse, 0);
+	handle_action_required(bad_pfn);
+}
+
static void mce_process_work(struct work_struct *dummy)
{
mce_process_ring();
--
1.7.3.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/