[PATCH 2/2] UV: NMI: simple dump failover if kdump fails

From: George Beshers
Date: Sat Sep 12 2015 - 23:01:12 EST


Subject: [PATCH 2/2] UV: NMI: simple dump failover if kdump fails

The ability to trigger a kdump using the system NMI command
was added by

commit 12ba6c990fab ("x86/UV: Add kdump to UV NMI handler")
Author: Mike Travis <travis@xxxxxxx>
Date: Mon Sep 23 16:25:03 2013 -0500

This is useful because, when kdump is working, the information it gathers
is more informative than the original per-CPU stack traces or "dump"
option. However, a number of things can go wrong with kdump, and in that
case the stack traces are more useful than nothing.

The two most common reasons for kdump to not be available are:
1) a problem occurs during boot before the kdump service is
started, or
2) the kdump daemon fails to start.
In either case the call to crash_kexec() returns unexpectedly.

When this happens, uv_nmi_kdump() sets the uv_nmi_kexec_failed
flag, which causes the slave CPUs to return to the NMI handler as well.
Upon this unexpected return, the NMI handler reverts to the "dump"
action, which uses show_regs() to obtain a process trace dump for
all the CPUs.

Other minor changes:
The "dump" action now generates both the show_regs() stack trace
and the instruction pointer information, whereas the "ips"
action only shows instruction pointers for non-idle CPUs. The latter
is more like an abbreviated "ps" display.

Convert printk(KERN_DEFAULT ...) calls to pr_info() (pr_alert() for the
"CPUs ignored NMI" message).

Signed-off-by: George Beshers <gbeshers@xxxxxxx>
Signed-off-by: Mike Travis <travis@xxxxxxx>
Cc: Russ Anderson <rja@xxxxxxx>
Cc: Dimitri Sivanich <sivanich@xxxxxxx>
Cc: Hedi Berriche <hedi@xxxxxxx>
Cc: Alex Thorlton <athorlton@xxxxxxx>
Cc: Christoph Lameter <cl@xxxxxxxxx>


diff --git a/arch/x86/platform/uv/uv_nmi.c b/arch/x86/platform/uv/uv_nmi.c
index 5c9f63f..327f21c 100644
--- a/arch/x86/platform/uv/uv_nmi.c
+++ b/arch/x86/platform/uv/uv_nmi.c
@@ -376,38 +376,42 @@ static void uv_nmi_wait(int master)
atomic_read(&uv_nmi_cpus_in_nmi), num_online_cpus());
}

+/* Dump Instruction Pointer header */
static void uv_nmi_dump_cpu_ip_hdr(void)
{
- printk(KERN_DEFAULT
- "\nUV: %4s %6s %-32s %s (Note: PID 0 not listed)\n",
+ pr_info("\nUV: %4s %6s %-32s %s (Note: PID 0 not listed)\n",
"CPU", "PID", "COMMAND", "IP");
}

+/* Dump Instruction Pointer info */
static void uv_nmi_dump_cpu_ip(int cpu, struct pt_regs *regs)
{
- printk(KERN_DEFAULT "UV: %4d %6d %-32.32s ",
- cpu, current->pid, current->comm);
-
+ pr_info("UV: %4d %6d %-32.32s ", cpu, current->pid, current->comm);
printk_address(regs->ip);
}

-/* Dump this cpu's state */
+/*
+ * Dump this CPU's state. If action was set to "kdump" and the crash_kexec
+ * failed, then we provide "dump" as an alternate action. Action "dump" now
+ * also includes the show "ips" (instruction pointers) action whereas the
+ * action "ips" only displays instruction pointers for the non-idle CPU's.
+ * This is an abbreviated form of the "ps" command.
+ */
static void uv_nmi_dump_state_cpu(int cpu, struct pt_regs *regs)
{
const char *dots = " ................................. ";

- if (uv_nmi_action_is("ips")) {
- if (cpu == 0)
- uv_nmi_dump_cpu_ip_hdr();
+ if (cpu == 0)
+ uv_nmi_dump_cpu_ip_hdr();

- if (current->pid != 0)
- uv_nmi_dump_cpu_ip(cpu, regs);
+ if (current->pid != 0 || !uv_nmi_action_is("ips"))
+ uv_nmi_dump_cpu_ip(cpu, regs);

- } else if (uv_nmi_action_is("dump")) {
- printk(KERN_DEFAULT
- "UV:%sNMI process trace for CPU %d\n", dots, cpu);
+ if (uv_nmi_action_is("dump")) {
+ pr_info("UV:%sNMI process trace for CPU %d\n", dots, cpu);
show_regs(regs);
}
+
this_cpu_write(uv_cpu_nmi.state, UV_NMI_STATE_DUMP_DONE);
}

@@ -469,8 +473,7 @@ static void uv_nmi_dump_state(int cpu, struct pt_regs *regs, int master)
uv_nmi_trigger_dump(tcpu);
}
if (ignored)
- printk(KERN_DEFAULT "UV: %d CPUs ignored NMI\n",
- ignored);
+ pr_alert("UV: %d CPUs ignored NMI\n", ignored);

console_loglevel = saved_console_loglevel;
pr_alert("UV: process trace complete\n");
@@ -492,8 +495,9 @@ static void uv_nmi_touch_watchdogs(void)
touch_nmi_watchdog();
}

-#if defined(CONFIG_KEXEC_CORE)
static atomic_t uv_nmi_kexec_failed;
+
+#if defined(CONFIG_KEXEC_CORE)
static void uv_nmi_kdump(int cpu, int master, struct pt_regs *regs)
{
/* Call crash to dump system state */
@@ -502,10 +506,9 @@ static void uv_nmi_kdump(int cpu, int master, struct pt_regs *regs)
crash_kexec(regs);

pr_emerg("UV: crash_kexec unexpectedly returned, ");
+ atomic_set(&uv_nmi_kexec_failed, 1);
if (!kexec_crash_image) {
pr_cont("crash kernel not loaded\n");
- atomic_set(&uv_nmi_kexec_failed, 1);
- uv_nmi_sync_exit(1);
return;
}
pr_cont("kexec busy, stalling cpus while waiting\n");
@@ -514,9 +517,6 @@ static void uv_nmi_kdump(int cpu, int master, struct pt_regs *regs)
/* If crash exec fails the slaves should return, otherwise stall */
while (atomic_read(&uv_nmi_kexec_failed) == 0)
mdelay(10);
-
- /* Crash kernel most likely not loaded, return in an orderly fashion */
- uv_nmi_sync_exit(0);
}

#else /* !CONFIG_KEXEC_CORE */
@@ -524,6 +524,7 @@ static inline void uv_nmi_kdump(int cpu, int master, struct pt_regs *regs)
{
if (master)
pr_err("UV: NMI kdump: KEXEC not supported in this kernel\n");
+ atomic_set(&uv_nmi_kexec_failed, 1);
}
#endif /* !CONFIG_KEXEC_CORE */

@@ -613,9 +614,14 @@ int uv_handle_nmi(unsigned int reason, struct pt_regs *regs)
master = (atomic_read(&uv_nmi_cpu) == cpu);

/* If NMI action is "kdump", then attempt to do it */
- if (uv_nmi_action_is("kdump"))
+ if (uv_nmi_action_is("kdump")) {
uv_nmi_kdump(cpu, master, regs);

+ /* Unexpected return, revert action to "dump" */
+ if (master)
+ strncpy(uv_nmi_action, "dump", strlen(uv_nmi_action));
+ }
+
/* Pause as all cpus enter the NMI handler */
uv_nmi_wait(master);

@@ -640,6 +646,7 @@ int uv_handle_nmi(unsigned int reason, struct pt_regs *regs)
atomic_set(&uv_nmi_cpus_in_nmi, -1);
atomic_set(&uv_nmi_cpu, -1);
atomic_set(&uv_in_nmi, 0);
+ atomic_set(&uv_nmi_kexec_failed, 0);
}

uv_nmi_touch_watchdogs();
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/