[PATCH] printk/nmi: More reliable approach to use the main log buffer in NMI

From: Petr Mladek
Date: Fri Jun 08 2018 - 05:21:07 EST


The check whether logbuf_lock is available is not safe in
printk_nmi_enter(). It might get blocked by another CPU that
takes the lock and enter NMI later.

The race still would be acceptable if the other CPU could
finish the NMI handler independently. But this is not the case
with nmi_trigger_cpumask_backtrace(). It has to serialize
all NMI handlers to prevent mixed output.

It would be possible to make the decision later in vprintk_emit().
But it would cause mixed output and make
nmi_trigger_cpumask_backtrace() unusable.

Also it would be possible to avoid the serialization and force using
the per-CPU buffers in nmi_cpu_backtrace(). But this problem is generic
and we do not know how many other such deadlocks are possible.

But wait, the motivation to use the main log buffer in NMI was for
flushing the ftrace log in panic(). It can produce hundreds of lines
that do not fit the per-CPU buffers. The good thing is that each
line has its own timestamp and the lines can be sorted later. Therefore
it might be acceptable to risk the mixed output in this case.

This patch introduces new printk_nmi_direct() context. It tries
to use the main log buffer when possible but there is a risk of
mixed messages. It will be used only when flushing the ftrace
buffer.

The special printk_nmi per-CPU buffers will always be used in
all other situations again. As a result, nmi_cpu_backtrace() no
longer needs to be serialized.

Fixes: 719f6a7040f1bdaf96 ("printk: Use the main logbuf in NMI when logbuf_lock is available")
Signed-off-by: Petr Mladek <pmladek@xxxxxxxx>
---
include/linux/printk.h | 4 ++++
kernel/printk/internal.h | 8 ++++++-
kernel/printk/printk.c | 8 ++++++-
kernel/printk/printk_safe.c | 57 ++++++++++++++++++++++++---------------------
kernel/trace/trace.c | 4 +++-
lib/nmi_backtrace.c | 3 ---
6 files changed, 51 insertions(+), 33 deletions(-)

diff --git a/include/linux/printk.h b/include/linux/printk.h
index 6d7e800affd8..872fbdf8df26 100644
--- a/include/linux/printk.h
+++ b/include/linux/printk.h
@@ -148,9 +148,13 @@ void early_printk(const char *s, ...) { }
#ifdef CONFIG_PRINTK_NMI
extern void printk_nmi_enter(void);
extern void printk_nmi_exit(void);
+extern void printk_nmi_direct_enter(void);
+extern void printk_nmi_direct_exit(void);
#else
static inline void printk_nmi_enter(void) { }
static inline void printk_nmi_exit(void) { }
+static inline void printk_nmi_direct_enter(void) { }
+static inline void printk_nmi_direct_exit(void) { }
#endif /* PRINTK_NMI */

#ifdef CONFIG_PRINTK
diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h
index 2a7d04049af4..47db66e41ab4 100644
--- a/kernel/printk/internal.h
+++ b/kernel/printk/internal.h
@@ -19,11 +19,17 @@
#ifdef CONFIG_PRINTK

#define PRINTK_SAFE_CONTEXT_MASK 0x3fffffff
-#define PRINTK_NMI_DEFERRED_CONTEXT_MASK 0x40000000
+#define PRINTK_NMI_DIRECT_CONTEXT_MASK 0x40000000
#define PRINTK_NMI_CONTEXT_MASK 0x80000000

extern raw_spinlock_t logbuf_lock;

+#ifdef CONFIG_PRINTK_NMI
+__printf(1, 0) int vprintk_nmi(const char *fmt, va_list args);
+#else
+static inline __printf(1, 0) int vprintk_nmi(const char *fmt, va_list args) { return 0; }
+#endif
+
__printf(1, 0) int vprintk_default(const char *fmt, va_list args);
__printf(1, 0) int vprintk_deferred(const char *fmt, va_list args);
__printf(1, 0) int vprintk_func(const char *fmt, va_list args);
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 247808333ba4..cf55bece43d9 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -1845,7 +1845,16 @@ asmlinkage int vprintk_emit(int facility, int level,
printk_delay();

/* This stops the holder of console_sem just where we want him */
- logbuf_lock_irqsave(flags);
+ printk_safe_enter_irqsave(flags);
+ if (in_nmi()) {
+ /* Fall back to the per-CPU buffer when the lock is contended. */
+ if (!raw_spin_trylock(&logbuf_lock)) {
+ printed_len = vprintk_nmi(fmt, args);
+ printk_safe_exit_irqrestore(flags);
+ return printed_len;
+ }
+ } else {
+ raw_spin_lock(&logbuf_lock);
+ }
/*
* The printf needs to come first; we need the syslog
* prefix which might be passed-in as a parameter.
diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c
index d7d091309054..bf8e3ebfca8e 100644
--- a/kernel/printk/printk_safe.c
+++ b/kernel/printk/printk_safe.c
@@ -299,40 +299,43 @@ void printk_safe_flush_on_panic(void)
* one writer running. But the buffer might get flushed from another
* CPU, so we need to be careful.
*/
-static __printf(1, 0) int vprintk_nmi(const char *fmt, va_list args)
+__printf(1, 0) int vprintk_nmi(const char *fmt, va_list args)
{
struct printk_safe_seq_buf *s = this_cpu_ptr(&nmi_print_seq);

return printk_safe_log_store(s, fmt, args);
}

-void printk_nmi_enter(void)
+/*
+ * Try to use the main log buffer even in NMI context. It falls
+ * back to vprintk_nmi() when the lock is not available. The decision
+ * is made in vprintk_emit().
+ *
+ * Therefore there is not a risk of a deadlock. But the messages might
+ * get reordered when anyone uses the lock in parallel and only some
+ * messages are stored directly.
+ */
+void printk_nmi_direct_enter(void)
{
- /*
- * The size of the extra per-CPU buffer is limited. Use it only when
- * the main one is locked. If this CPU is not in the safe context,
- * the lock must be taken on another CPU and we could wait for it.
- */
- if ((this_cpu_read(printk_context) & PRINTK_SAFE_CONTEXT_MASK) &&
- raw_spin_is_locked(&logbuf_lock)) {
- this_cpu_or(printk_context, PRINTK_NMI_CONTEXT_MASK);
- } else {
- this_cpu_or(printk_context, PRINTK_NMI_DEFERRED_CONTEXT_MASK);
- }
+ if (!(this_cpu_read(printk_context) & PRINTK_NMI_CONTEXT_MASK))
+ return;
+
+ this_cpu_or(printk_context, PRINTK_NMI_DIRECT_CONTEXT_MASK);
}

-void printk_nmi_exit(void)
+void printk_nmi_direct_exit(void)
{
- this_cpu_and(printk_context,
- ~(PRINTK_NMI_CONTEXT_MASK |
- PRINTK_NMI_DEFERRED_CONTEXT_MASK));
+ this_cpu_and(printk_context, ~PRINTK_NMI_DIRECT_CONTEXT_MASK);
}

-#else
+void printk_nmi_enter(void)
+{
+ this_cpu_or(printk_context, PRINTK_NMI_CONTEXT_MASK);
+}

-static __printf(1, 0) int vprintk_nmi(const char *fmt, va_list args)
+void printk_nmi_exit(void)
{
- return 0;
+ this_cpu_and(printk_context, ~PRINTK_NMI_CONTEXT_MASK);
}

#endif /* CONFIG_PRINTK_NMI */
@@ -363,6 +366,13 @@ void __printk_safe_exit(void)

__printf(1, 0) int vprintk_func(const char *fmt, va_list args)
{
+ /*
+ * Try to use the main logbuf even in NMI. But avoid calling console
+ * drivers that might have their own locks.
+ */
+ if (this_cpu_read(printk_context) & PRINTK_NMI_DIRECT_CONTEXT_MASK)
+ return vprintk_deferred(fmt, args);
+
/* Use extra buffer in NMI when logbuf_lock is taken or in safe mode. */
if (this_cpu_read(printk_context) & PRINTK_NMI_CONTEXT_MASK)
return vprintk_nmi(fmt, args);
@@ -371,13 +381,6 @@ __printf(1, 0) int vprintk_func(const char *fmt, va_list args)
if (this_cpu_read(printk_context) & PRINTK_SAFE_CONTEXT_MASK)
return vprintk_safe(fmt, args);

- /*
- * Use the main logbuf when logbuf_lock is available in NMI.
- * But avoid calling console drivers that might have their own locks.
- */
- if (this_cpu_read(printk_context) & PRINTK_NMI_DEFERRED_CONTEXT_MASK)
- return vprintk_deferred(fmt, args);
-
/* No obstacles. */
return vprintk_default(fmt, args);
}
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 108ce3e1dc13..38b6dbe1709b 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -8279,6 +8279,7 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
tracing_off();

local_irq_save(flags);
+ printk_nmi_direct_enter();

/* Simulate the iterator */
trace_init_global_iter(&iter);
@@ -8358,7 +8359,8 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
for_each_tracing_cpu(cpu) {
atomic_dec(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled);
}
- atomic_dec(&dump_running);
+ atomic_dec(&dump_running);
+ printk_nmi_direct_exit();
local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(ftrace_dump);
diff --git a/lib/nmi_backtrace.c b/lib/nmi_backtrace.c
index 61a6b5aab07e..15ca78e1c7d4 100644
--- a/lib/nmi_backtrace.c
+++ b/lib/nmi_backtrace.c
@@ -87,11 +87,9 @@ void nmi_trigger_cpumask_backtrace(const cpumask_t *mask,

bool nmi_cpu_backtrace(struct pt_regs *regs)
{
- static arch_spinlock_t lock = __ARCH_SPIN_LOCK_UNLOCKED;
int cpu = smp_processor_id();

if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) {
- arch_spin_lock(&lock);
if (regs && cpu_in_idle(instruction_pointer(regs))) {
pr_warn("NMI backtrace for cpu %d skipped: idling at %pS\n",
cpu, (void *)instruction_pointer(regs));
@@ -102,7 +100,6 @@ bool nmi_cpu_backtrace(struct pt_regs *regs)
else
dump_stack();
}
- arch_spin_unlock(&lock);
cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask));
return true;
}
--
2.13.6