[RFC][PATCH] printk: Fixup the nmi printk mess

From: Peter Zijlstra
Date: Wed Jun 10 2015 - 08:55:27 EST


Hi,

I just stumbled upon:

a9edc8809328 ("x86/nmi: Perform a safe NMI stack trace on all CPUs")

Which is not dissimilar to what I've proposed in the past. Except its
squirreled away in some far and dark corner of one arch.

Lets fix that.

---
arch/Kconfig | 4 ++
arch/arm/Kconfig | 1 +
arch/avr32/Kconfig | 1 +
arch/blackfin/Kconfig | 1 +
arch/cris/Kconfig | 1 +
arch/powerpc/Kconfig | 1 +
arch/s390/Kconfig | 1 +
arch/sh/Kconfig | 1 +
arch/sparc/Kconfig | 1 +
arch/tile/Kconfig | 1 +
arch/x86/Kconfig | 1 +
arch/x86/kernel/apic/hw_nmi.c | 86 +----------------------------
include/linux/hardirq.h | 2 +
include/linux/percpu.h | 3 -
include/linux/printk.h | 10 +++-
init/Kconfig | 5 ++
init/main.c | 1 +
kernel/printk/printk.c | 124 ++++++++++++++++++++++++++++++++++++++----
18 files changed, 145 insertions(+), 100 deletions(-)

diff --git a/arch/Kconfig b/arch/Kconfig
index a65eafb24997..ad8db3979e21 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -179,7 +179,11 @@ config HAVE_OPTPROBES
config HAVE_KPROBES_ON_FTRACE
bool

+config HAVE_NMI
+ bool
+
config HAVE_NMI_WATCHDOG
+ depends on HAVE_NMI
bool
#
# An arch should select this if it provides all these things:
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 45df48ba0b12..a26e83699df9 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -61,6 +61,7 @@ config ARM
select HAVE_KRETPROBES if (HAVE_KPROBES)
select HAVE_MEMBLOCK
select HAVE_MOD_ARCH_SPECIFIC if ARM_UNWIND
+ select HAVE_NMI if (!CPU_V7)
select HAVE_OPROFILE if (HAVE_PERF_EVENTS)
select HAVE_OPTPROBES if !THUMB2_KERNEL
select HAVE_PERF_EVENTS
diff --git a/arch/avr32/Kconfig b/arch/avr32/Kconfig
index b6878eb64884..9dc3e2b1180b 100644
--- a/arch/avr32/Kconfig
+++ b/arch/avr32/Kconfig
@@ -17,6 +17,7 @@ config AVR32
select GENERIC_CLOCKEVENTS
select HAVE_MOD_ARCH_SPECIFIC
select MODULES_USE_ELF_RELA
+ select HAVE_NMI
help
AVR32 is a high-performance 32-bit RISC microprocessor core,
designed for cost-sensitive embedded applications, with particular
diff --git a/arch/blackfin/Kconfig b/arch/blackfin/Kconfig
index af76634f8d98..47c0a55acafd 100644
--- a/arch/blackfin/Kconfig
+++ b/arch/blackfin/Kconfig
@@ -40,6 +40,7 @@ config BLACKFIN
select HAVE_MOD_ARCH_SPECIFIC
select MODULES_USE_ELF_RELA
select HAVE_DEBUG_STACKOVERFLOW
+ select HAVE_NMI

config GENERIC_CSUM
def_bool y
diff --git a/arch/cris/Kconfig b/arch/cris/Kconfig
index 0314e325a669..2e053f709c4b 100644
--- a/arch/cris/Kconfig
+++ b/arch/cris/Kconfig
@@ -58,6 +58,7 @@ config CRIS
select CLKSRC_MMIO if ETRAX_ARCH_V32
select GENERIC_CLOCKEVENTS if ETRAX_ARCH_V32
select GENERIC_SCHED_CLOCK if ETRAX_ARCH_V32
+ select HAVE_NMI

config HZ
int
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 190cc48abc0c..079f4fec9586 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -153,6 +153,7 @@ config PPC
select NO_BOOTMEM
select HAVE_GENERIC_RCU_GUP
select HAVE_PERF_EVENTS_NMI if PPC64
+ select HAVE_NMI if PERF_EVENTS

config GENERIC_CSUM
def_bool CPU_LITTLE_ENDIAN
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index b06dc3839268..1b84036437c0 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -152,6 +152,7 @@ config S390
select TTY
select VIRT_CPU_ACCOUNTING
select VIRT_TO_BUS
+ select HAVE_NMI

config SCHED_OMIT_FRAME_POINTER
def_bool y
diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index 50057fed819d..7803b6863c0e 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -44,6 +44,7 @@ config SUPERH
select OLD_SIGSUSPEND
select OLD_SIGACTION
select HAVE_ARCH_AUDITSYSCALL
+ select HAVE_NMI
help
The SuperH is a RISC processor targeted for use in embedded systems
and consumer electronics; it was also used in the Sega Dreamcast
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index e49502acbab4..198abcb6e510 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -80,6 +80,7 @@ config SPARC64
select NO_BOOTMEM
select HAVE_ARCH_AUDITSYSCALL
select ARCH_SUPPORTS_ATOMIC_RMW
+ select HAVE_NMI

config ARCH_DEFCONFIG
string
diff --git a/arch/tile/Kconfig b/arch/tile/Kconfig
index a07e31b50d3f..1503ab5882cd 100644
--- a/arch/tile/Kconfig
+++ b/arch/tile/Kconfig
@@ -28,6 +28,7 @@ config TILE
select HAVE_DEBUG_STACKOVERFLOW
select ARCH_WANT_FRAME_POINTERS
select HAVE_CONTEXT_TRACKING
+ select HAVE_NMI if USE_PMC

# FIXME: investigate whether we need/want these options.
# select HAVE_IOREMAP_PROT
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 228aa35d7e89..67dbc7442499 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -118,6 +118,7 @@ config X86
select HAVE_MEMBLOCK
select HAVE_MEMBLOCK_NODE_MAP
select HAVE_MIXED_BREAKPOINTS_REGS
+ select HAVE_NMI
select HAVE_OPROFILE
select HAVE_OPTPROBES
select HAVE_PCSPKR_PLATFORM
diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c
index 6873ab925d00..90eb8c385e60 100644
--- a/arch/x86/kernel/apic/hw_nmi.c
+++ b/arch/x86/kernel/apic/hw_nmi.c
@@ -18,7 +18,6 @@
#include <linux/nmi.h>
#include <linux/module.h>
#include <linux/delay.h>
-#include <linux/seq_buf.h>

#ifdef CONFIG_HARDLOCKUP_DETECTOR
u64 hw_nmi_get_sample_period(int watchdog_thresh)
@@ -30,35 +29,14 @@ u64 hw_nmi_get_sample_period(int watchdog_thresh)
#ifdef arch_trigger_all_cpu_backtrace
/* For reliability, we're prepared to waste bits here. */
static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly;
-static cpumask_t printtrace_mask;
-
-#define NMI_BUF_SIZE 4096
-
-struct nmi_seq_buf {
- unsigned char buffer[NMI_BUF_SIZE];
- struct seq_buf seq;
-};
-
-/* Safe printing in NMI context */
-static DEFINE_PER_CPU(struct nmi_seq_buf, nmi_print_seq);

/* "in progress" flag of arch_trigger_all_cpu_backtrace */
static unsigned long backtrace_flag;

-static void print_seq_line(struct nmi_seq_buf *s, int start, int end)
-{
- const char *buf = s->buffer + start;
-
- printk("%.*s", (end - start) + 1, buf);
-}
-
void arch_trigger_all_cpu_backtrace(bool include_self)
{
- struct nmi_seq_buf *s;
- int len;
- int cpu;
- int i;
int this_cpu = get_cpu();
+ int i;

if (test_and_set_bit(0, &backtrace_flag)) {
/*
@@ -73,16 +51,6 @@ void arch_trigger_all_cpu_backtrace(bool include_self)
if (!include_self)
cpumask_clear_cpu(this_cpu, to_cpumask(backtrace_mask));

- cpumask_copy(&printtrace_mask, to_cpumask(backtrace_mask));
- /*
- * Set up per_cpu seq_buf buffers that the NMIs running on the other
- * CPUs will write to.
- */
- for_each_cpu(cpu, to_cpumask(backtrace_mask)) {
- s = &per_cpu(nmi_print_seq, cpu);
- seq_buf_init(&s->seq, s->buffer, NMI_BUF_SIZE);
- }
-
if (!cpumask_empty(to_cpumask(backtrace_mask))) {
pr_info("sending NMI to %s CPUs:\n",
(include_self ? "all" : "other"));
@@ -97,58 +65,11 @@ void arch_trigger_all_cpu_backtrace(bool include_self)
touch_softlockup_watchdog();
}

- /*
- * Now that all the NMIs have triggered, we can dump out their
- * back traces safely to the console.
- */
- for_each_cpu(cpu, &printtrace_mask) {
- int last_i = 0;
-
- s = &per_cpu(nmi_print_seq, cpu);
- len = seq_buf_used(&s->seq);
- if (!len)
- continue;
-
- /* Print line by line. */
- for (i = 0; i < len; i++) {
- if (s->buffer[i] == '\n') {
- print_seq_line(s, last_i, i);
- last_i = i + 1;
- }
- }
- /* Check if there was a partial line. */
- if (last_i < len) {
- print_seq_line(s, last_i, len - 1);
- pr_cont("\n");
- }
- }
-
clear_bit(0, &backtrace_flag);
smp_mb__after_atomic();
put_cpu();
}

-/*
- * It is not safe to call printk() directly from NMI handlers.
- * It may be fine if the NMI detected a lock up and we have no choice
- * but to do so, but doing a NMI on all other CPUs to get a back trace
- * can be done with a sysrq-l. We don't want that to lock up, which
- * can happen if the NMI interrupts a printk in progress.
- *
- * Instead, we redirect the vprintk() to this nmi_vprintk() that writes
- * the content into a per cpu seq_buf buffer. Then when the NMIs are
- * all done, we can safely dump the contents of the seq_buf to a printk()
- * from a non NMI context.
- */
-static int nmi_vprintk(const char *fmt, va_list args)
-{
- struct nmi_seq_buf *s = this_cpu_ptr(&nmi_print_seq);
- unsigned int len = seq_buf_used(&s->seq);
-
- seq_buf_vprintf(&s->seq, fmt, args);
- return seq_buf_used(&s->seq) - len;
-}
-
static int
arch_trigger_all_cpu_backtrace_handler(unsigned int cmd, struct pt_regs *regs)
{
@@ -157,13 +78,8 @@ arch_trigger_all_cpu_backtrace_handler(unsigned int cmd, struct pt_regs *regs)
cpu = smp_processor_id();

if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) {
- printk_func_t printk_func_save = this_cpu_read(printk_func);
-
- /* Replace printk to write into the NMI seq */
- this_cpu_write(printk_func, nmi_vprintk);
printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu);
show_regs(regs);
- this_cpu_write(printk_func, printk_func_save);

cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask));
return NMI_HANDLED;
diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
index dfd59d6bc6f0..a477e0766d2e 100644
--- a/include/linux/hardirq.h
+++ b/include/linux/hardirq.h
@@ -67,10 +67,12 @@ extern void irq_exit(void);
preempt_count_add(NMI_OFFSET + HARDIRQ_OFFSET); \
rcu_nmi_enter(); \
trace_hardirq_enter(); \
+ printk_nmi_enter(); \
} while (0)

#define nmi_exit() \
do { \
+ printk_nmi_exit(); \
trace_hardirq_exit(); \
rcu_nmi_exit(); \
BUG_ON(!in_nmi()); \
diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index caebf2a758dc..04c68b9f56f8 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -135,7 +135,4 @@ extern phys_addr_t per_cpu_ptr_to_phys(void *addr);
(typeof(type) __percpu *)__alloc_percpu(sizeof(type), \
__alignof__(type))

-/* To avoid include hell, as printk can not declare this, we declare it here */
-DECLARE_PER_CPU(printk_func_t, printk_func);
-
#endif /* __LINUX_PERCPU_H */
diff --git a/include/linux/printk.h b/include/linux/printk.h
index 9b30871c9149..ebe017327e6e 100644
--- a/include/linux/printk.h
+++ b/include/linux/printk.h
@@ -120,7 +120,15 @@ static inline __printf(1, 2) __cold
void early_printk(const char *s, ...) { }
#endif

-typedef int(*printk_func_t)(const char *fmt, va_list args);
+#ifdef CONFIG_PRINTK_NMI
+extern void printk_init(void);
+extern void printk_nmi_enter(void);
+extern void printk_nmi_exit(void);
+#else
+static inline void printk_init(void) { }
+static inline void printk_nmi_enter(void) { }
+static inline void printk_nmi_exit(void) { }
+#endif /* HAVE_NMI */

#ifdef CONFIG_PRINTK
asmlinkage __printf(5, 0)
diff --git a/init/Kconfig b/init/Kconfig
index b999fa381bf9..fe9a439c8843 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1431,6 +1431,11 @@ config PRINTK
very difficult to diagnose system problems, saying N here is
strongly discouraged.

+config PRINTK_NMI
+ def_bool y
+ depends on PRINTK
+ depends on HAVE_NMI
+
config BUG
bool "BUG() support" if EXPERT
default y
diff --git a/init/main.c b/init/main.c
index 2115055faeac..4a28accaaa98 100644
--- a/init/main.c
+++ b/init/main.c
@@ -586,6 +586,7 @@ asmlinkage __visible void __init start_kernel(void)
timekeeping_init();
time_init();
sched_clock_postinit();
+ printk_init();
perf_event_init();
profile_init();
call_function_init();
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index c099b082cd02..0149c41be95b 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -1821,13 +1821,125 @@ int vprintk_default(const char *fmt, va_list args)
}
EXPORT_SYMBOL_GPL(vprintk_default);

+#ifdef CONFIG_PRINTK_NMI
+
+typedef int(*printk_func_t)(const char *fmt, va_list args);
/*
* This allows printk to be diverted to another function per cpu.
* This is useful for calling printk functions from within NMI
* without worrying about race conditions that can lock up the
* box.
*/
-DEFINE_PER_CPU(printk_func_t, printk_func) = vprintk_default;
+static DEFINE_PER_CPU(printk_func_t, printk_func) = vprintk_default;
+
+#include <linux/seq_buf.h>
+
+struct nmi_seq_buf {
+ struct seq_buf seq;
+ struct irq_work work;
+ unsigned char buffer[PAGE_SIZE -
+ sizeof(struct seq_buf) -
+ sizeof(struct irq_work)];
+};
+
+/* Safe printing in NMI context */
+static DEFINE_PER_CPU(struct nmi_seq_buf, nmi_print_seq);
+
+static void print_seq_line(struct nmi_seq_buf *s, int start, int end)
+{
+ const char *buf = s->buffer + start;
+
+ printk("%.*s", (end - start) + 1, buf);
+}
+
+static void __printk_nmi_flush(struct irq_work *work)
+{
+ struct nmi_seq_buf *s = container_of(work, struct nmi_seq_buf, work);
+ int len, last_i = 0, i = 0;
+
+again:
+ len = seq_buf_used(&s->seq);
+ if (!len)
+ return;
+
+ /* Print line by line. */
+ for (; i < len; i++) {
+ if (s->buffer[i] == '\n') {
+ print_seq_line(s, last_i, i);
+ last_i = i + 1;
+ }
+ }
+ /* Check if there was a partial line. */
+ if (last_i < len) {
+ print_seq_line(s, last_i, len - 1);
+ pr_cont("\n");
+ }
+
+ /*
+ * Another NMI could have come in while we were printing
+ * check if nothing has been added to the buffer.
+ */
+ if (cmpxchg_local(&s->seq.len, len, 0) != len)
+ goto again;
+}
+
+void printk_init(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ struct nmi_seq_buf *s = &per_cpu(nmi_print_seq, cpu);
+
+ init_irq_work(&s->work, __printk_nmi_flush);
+ seq_buf_init(&s->seq, s->buffer, sizeof(s->buffer));
+ }
+}
+
+/*
+ * It is not safe to call printk() directly from NMI handlers.
+ * It may be fine if the NMI detected a lock up and we have no choice
+ * but to do so, but doing a NMI on all other CPUs to get a back trace
+ * can be done with a sysrq-l. We don't want that to lock up, which
+ * can happen if the NMI interrupts a printk in progress.
+ *
+ * Instead, we redirect the vprintk() to this nmi_vprintk() that writes
+ * the content into a per cpu seq_buf buffer. Then when the NMIs are
+ * all done, we can safely dump the contents of the seq_buf to a printk()
+ * from a non NMI context.
+ */
+static int vprintk_nmi(const char *fmt, va_list args)
+{
+ struct nmi_seq_buf *s = this_cpu_ptr(&nmi_print_seq);
+ unsigned int len = seq_buf_used(&s->seq);
+
+ irq_work_queue(&s->work);
+ seq_buf_vprintf(&s->seq, fmt, args);
+ return seq_buf_used(&s->seq) - len;
+}
+
+void printk_nmi_enter(void)
+{
+ this_cpu_write(printk_func, vprintk_nmi);
+}
+
+void printk_nmi_exit(void)
+{
+ this_cpu_write(printk_func, vprintk_default);
+}
+
+static inline int vprintk_func(const char *fmt, va_list args)
+{
+ return this_cpu_read(printk_func)(fmt, args);
+}
+
+#else
+
+static inline int vprintk_func(const char *fmt, va_list args)
+{
+ return vprintk_default(fmt, args);
+}
+
+#endif /* PRINTK_NMI */

/**
* printk - print a kernel message
@@ -1852,21 +1964,11 @@ DEFINE_PER_CPU(printk_func_t, printk_func) = vprintk_default;
*/
asmlinkage __visible int printk(const char *fmt, ...)
{
- printk_func_t vprintk_func;
va_list args;
int r;

va_start(args, fmt);
-
- /*
- * If a caller overrides the per_cpu printk_func, then it needs
- * to disable preemption when calling printk(). Otherwise
- * the printk_func should be set to the default. No need to
- * disable preemption here.
- */
- vprintk_func = this_cpu_read(printk_func);
r = vprintk_func(fmt, args);
-
va_end(args);

return r;
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/