Re: perf: perf_fuzzer quickly locks up on 4.15-rc7

From: Peter Zijlstra
Date: Thu Jan 11 2018 - 10:29:55 EST


On Thu, Jan 11, 2018 at 10:13:53AM +0100, Peter Zijlstra wrote:
> On Tue, Jan 09, 2018 at 08:44:23AM -0500, Vince Weaver wrote:
> > On Tue, 9 Jan 2018, Peter Zijlstra wrote:
> >
> > > So remind me again, how are you running that fuzzer? I'm running
> > > ./fast_repro99.sh as root.
> >
> > I'm running ./fast_repro98.sh on a regular haswell machine with paranoid
> > set to "0".
>
> I'm seeing things like:
>
> Cannot open /sys/kernel/tracing/kprobe_events
>
> this is likely caused by me not having anything mounted there. Rostedt
> provided the magic incantation to make that work; I'll go try now.
>
> At least I seem to have cured all lockdep splats.. of course, the moment
> I hit send on this something bad is bound to happen.
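
(Side note for anyone reproducing this: the "paranoid" knob mentioned above is
the perf_event_paranoid sysctl, and the missing /sys/kernel/tracing is almost
certainly just tracefs not being mounted there. Neither the fuzzer's location
nor Rostedt's exact incantation is quoted here, so treat the below as a rough
sketch with illustrative paths, run as root:

	echo 0 > /proc/sys/kernel/perf_event_paranoid
	mount -t tracefs nodev /sys/kernel/tracing
	cd perf_event_tests/fuzzer && ./fast_repro98.sh
)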

So while the IVB-EP (6/62/4) goes wobbly after a while, I also started
fuzzing on my HSW-EX (6/63/4) and there it runs without issue for over
30 minutes.

(I get an unchecked MSR splat from the RAPL pmu, but that is non-fatal)

But so far no weird hangs.. running tip/master + below.

Sadly the only other HSW I have is a laptop, and those are absolutely
useless for debugging (no serial :/), but I'll go see if I can reproduce.

FWIW these test-boxes are running debian/testing

---
Documentation/admin-guide/kernel-parameters.txt | 5 +
arch/x86/events/intel/ds.c | 33 ++++---
kernel/events/core.c | 47 +++++++++-
kernel/locking/lockdep.c | 10 ++
kernel/printk/printk.c | 119 +++++++++++++++++-------
kernel/smp.c | 10 ++
6 files changed, 171 insertions(+), 53 deletions(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 8bc512782bd2..7fa1a59461f7 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -1169,6 +1169,11 @@
parameter will force ia64_sal_cache_flush to call
ia64_pal_cache_flush instead of SAL_CACHE_FLUSH.

+ force_early_printk
+ Forcefully use the early console (as set up by
+ earlyprintk=) for regular printk output, bypassing
+ everything else, including the syslog (dmesg will be empty).
+
forcepae [X86-32]
Forcefully enable Physical Address Extension (PAE).
Many Pentium M systems disable PAE but may have a
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index 8156e47da7ba..18c25ab28557 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -372,10 +372,9 @@ static int alloc_pebs_buffer(int cpu)
static void release_pebs_buffer(int cpu)
{
struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
- struct debug_store *ds = hwev->ds;
void *cea;

- if (!ds || !x86_pmu.pebs)
+ if (!x86_pmu.pebs)
return;

kfree(per_cpu(insn_buffer, cpu));
@@ -384,7 +383,6 @@ static void release_pebs_buffer(int cpu)
/* Clear the fixmap */
cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.pebs_buffer;
ds_clear_cea(cea, x86_pmu.pebs_buffer_size);
- ds->pebs_buffer_base = 0;
dsfree_pages(hwev->ds_pebs_vaddr, x86_pmu.pebs_buffer_size);
hwev->ds_pebs_vaddr = NULL;
}
@@ -419,16 +417,14 @@ static int alloc_bts_buffer(int cpu)
static void release_bts_buffer(int cpu)
{
struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
- struct debug_store *ds = hwev->ds;
void *cea;

- if (!ds || !x86_pmu.bts)
+ if (!x86_pmu.bts)
return;

/* Clear the fixmap */
cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.bts_buffer;
ds_clear_cea(cea, BTS_BUFFER_SIZE);
- ds->bts_buffer_base = 0;
dsfree_pages(hwev->ds_bts_vaddr, BTS_BUFFER_SIZE);
hwev->ds_bts_vaddr = NULL;
}
@@ -454,16 +450,22 @@ void release_ds_buffers(void)
if (!x86_pmu.bts && !x86_pmu.pebs)
return;

- get_online_cpus();
- for_each_online_cpu(cpu)
+ for_each_possible_cpu(cpu)
+ release_ds_buffer(cpu);
+
+ for_each_possible_cpu(cpu) {
+ /*
+ * Again, ignore errors from offline CPUs; they will no longer
+ * observe cpu_hw_events.ds and will not program the DS_AREA when
+ * they come up.
+ */
fini_debug_store_on_cpu(cpu);
+ }

for_each_possible_cpu(cpu) {
release_pebs_buffer(cpu);
release_bts_buffer(cpu);
- release_ds_buffer(cpu);
}
- put_online_cpus();
}

void reserve_ds_buffers(void)
@@ -483,8 +485,6 @@ void reserve_ds_buffers(void)
if (!x86_pmu.pebs)
pebs_err = 1;

- get_online_cpus();
-
for_each_possible_cpu(cpu) {
if (alloc_ds_buffer(cpu)) {
bts_err = 1;
@@ -521,11 +521,14 @@ void reserve_ds_buffers(void)
if (x86_pmu.pebs && !pebs_err)
x86_pmu.pebs_active = 1;

- for_each_online_cpu(cpu)
+ for_each_possible_cpu(cpu) {
+ /*
+ * Ignore wrmsr_on_cpu() errors for offline CPUs; they
+ * will get this call through intel_pmu_cpu_starting().
+ */
init_debug_store_on_cpu(cpu);
+ }
}
-
- put_online_cpus();
}

/*
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 4e1a1bf8d867..fb079be8971a 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1231,6 +1231,10 @@ static void put_ctx(struct perf_event_context *ctx)
* perf_event_context::lock
* perf_event::mmap_mutex
* mmap_sem
+ *
+ * cpu_hotplug_lock
+ * pmus_lock
+ * cpuctx->mutex / perf_event_context::mutex
*/
static struct perf_event_context *
perf_event_ctx_lock_nested(struct perf_event *event, int nesting)
@@ -4196,6 +4200,7 @@ int perf_event_release_kernel(struct perf_event *event)
{
struct perf_event_context *ctx = event->ctx;
struct perf_event *child, *tmp;
+ LIST_HEAD(free_list);

/*
* If we got here through err_file: fput(event_file); we will not have
@@ -4268,8 +4273,7 @@ int perf_event_release_kernel(struct perf_event *event)
struct perf_event, child_list);
if (tmp == child) {
perf_remove_from_context(child, DETACH_GROUP);
- list_del(&child->child_list);
- free_event(child);
+ list_move(&child->child_list, &free_list);
/*
* This matches the refcount bump in inherit_event();
* this can't be the last reference.
@@ -4284,6 +4288,11 @@ int perf_event_release_kernel(struct perf_event *event)
}
mutex_unlock(&event->child_mutex);

+ list_for_each_entry_safe(child, tmp, &free_list, child_list) {
+ list_del(&child->child_list);
+ free_event(child);
+ }
+
no_ctx:
put_event(event); /* Must be the 'last' reference */
return 0;
@@ -8526,6 +8535,29 @@ perf_event_set_addr_filter(struct perf_event *event, char *filter_str)
return ret;
}

+static int
+perf_tracepoint_set_filter(struct perf_event *event, char *filter_str)
+{
+ struct perf_event_context *ctx = event->ctx;
+ int ret;
+
+ /*
+ * Beware, here be dragons!!
+ *
+ * The tracepoint muck will deadlock against ctx->mutex, but the tracepoint
+ * stuff does not actually need it. So temporarily drop ctx->mutex. As per
+ * perf_event_ctx_lock(), we already have a reference on ctx.
+ *
+ * This can result in the event getting moved to a different ctx, but that
+ * does not affect the tracepoint state.
+ */
+ mutex_unlock(&ctx->mutex);
+ ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);
+ mutex_lock(&ctx->mutex);
+
+ return ret;
+}
+
static int perf_event_set_filter(struct perf_event *event, void __user *arg)
{
char *filter_str;
@@ -8542,8 +8574,7 @@ static int perf_event_set_filter(struct perf_event *event, void __user *arg)

if (IS_ENABLED(CONFIG_EVENT_TRACING) &&
event->attr.type == PERF_TYPE_TRACEPOINT)
- ret = ftrace_profile_set_filter(event, event->attr.config,
- filter_str);
+ ret = perf_tracepoint_set_filter(event, filter_str);
else if (has_addr_filter(event))
ret = perf_event_set_addr_filter(event, filter_str);

@@ -9178,7 +9209,13 @@ static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
if (!try_module_get(pmu->module))
return -ENODEV;

- if (event->group_leader != event) {
+ /*
+ * A number of pmu->event_init() methods iterate the sibling_list to,
+ * for example, validate if the group fits on the PMU. Therefore,
+ * if this is a sibling event, acquire the ctx->mutex to protect
+ * the sibling_list.
+ */
+ if (event->group_leader != event && pmu->task_ctx_nr != perf_sw_context) {
/*
* This ctx->mutex can nest when we're called through
* inheritance. See the perf_event_ctx_lock_nested() comment.
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 5fa1324a4f29..fb81dc92aaa6 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -2668,8 +2668,12 @@ check_usage_backwards(struct task_struct *curr, struct held_lock *this,
this, 0, irqclass);
}

+DECLARE_PER_CPU(const struct cpumask *, smp_call_mask);
+
void print_irqtrace_events(struct task_struct *curr)
{
+ const struct cpumask *__mask = this_cpu_read(smp_call_mask);
+
printk("irq event stamp: %u\n", curr->irq_events);
printk("hardirqs last enabled at (%u): [<%p>] %pS\n",
curr->hardirq_enable_event, (void *)curr->hardirq_enable_ip,
@@ -2683,6 +2687,12 @@ void print_irqtrace_events(struct task_struct *curr)
printk("softirqs last disabled at (%u): [<%p>] %pS\n",
curr->softirq_disable_event, (void *)curr->softirq_disable_ip,
(void *)curr->softirq_disable_ip);
+
+ if (__mask) {
+ printk("smp_function_call mask: %*pbl\n", cpumask_pr_args(__mask));
+ } else {
+ printk("smp_function_call mask: (NULL)\n");
+ }
}

static int HARDIRQ_verbose(struct lock_class *class)
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index b9006617710f..c1a4ed17c22c 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -365,6 +365,75 @@ __packed __aligned(4)
#endif
;

+#ifdef CONFIG_EARLY_PRINTK
+struct console *early_console;
+
+static bool __read_mostly force_early_printk;
+
+static int __init force_early_printk_setup(char *str)
+{
+ force_early_printk = true;
+ return 0;
+}
+early_param("force_early_printk", force_early_printk_setup);
+
+static int early_printk_cpu = -1;
+
+static int early_vprintk(const char *fmt, va_list args)
+{
+ int n, cpu, old;
+ char buf[512];
+
+ cpu = get_cpu();
+ /*
+ * Test-and-Set inter-cpu spinlock with recursion.
+ */
+ for (;;) {
+ /*
+ * c-cas to avoid the exclusive bouncing on spin.
+ * Depends on the memory barrier implied by cmpxchg
+ * for ACQUIRE semantics.
+ */
+ old = READ_ONCE(early_printk_cpu);
+ if (old == -1) {
+ old = cmpxchg(&early_printk_cpu, -1, cpu);
+ if (old == -1)
+ break;
+ }
+ /*
+ * Allow recursion for interrupts and the like.
+ */
+ if (old == cpu)
+ break;
+
+ cpu_relax();
+ }
+
+ n = vscnprintf(buf, sizeof(buf), fmt, args);
+ early_console->write(early_console, buf, n);
+
+ /*
+ * Unlock -- in case @old == @cpu, this is a no-op.
+ */
+ smp_store_release(&early_printk_cpu, old);
+ put_cpu();
+
+ return n;
+}
+
+asmlinkage __visible void early_printk(const char *fmt, ...)
+{
+ va_list ap;
+
+ if (!early_console)
+ return;
+
+ va_start(ap, fmt);
+ early_vprintk(fmt, ap);
+ va_end(ap);
+}
+#endif
+
/*
* The logbuf_lock protects kmsg buffer, indices, counters. This can be taken
* within the scheduler's rq lock. It must be released before calling
@@ -1692,6 +1761,16 @@ asmlinkage int vprintk_emit(int facility, int level,
int printed_len;
bool in_sched = false;

+#ifdef CONFIG_KGDB_KDB
+ if (unlikely(kdb_trap_printk && kdb_printf_cpu < 0))
+ return vkdb_printf(KDB_MSGSRC_PRINTK, fmt, args);
+#endif
+
+#ifdef CONFIG_EARLY_PRINTK
+ if (force_early_printk && early_console)
+ return early_vprintk(fmt, args);
+#endif
+
if (level == LOGLEVEL_SCHED) {
level = LOGLEVEL_DEFAULT;
in_sched = true;
@@ -1784,18 +1863,7 @@ EXPORT_SYMBOL(printk_emit);

int vprintk_default(const char *fmt, va_list args)
{
- int r;
-
-#ifdef CONFIG_KGDB_KDB
- /* Allow to pass printk() to kdb but avoid a recursion. */
- if (unlikely(kdb_trap_printk && kdb_printf_cpu < 0)) {
- r = vkdb_printf(KDB_MSGSRC_PRINTK, fmt, args);
- return r;
- }
-#endif
- r = vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, 0, fmt, args);
-
- return r;
+ return vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, 0, fmt, args);
}
EXPORT_SYMBOL_GPL(vprintk_default);

@@ -1826,7 +1894,12 @@ asmlinkage __visible int printk(const char *fmt, ...)
int r;

va_start(args, fmt);
- r = vprintk_func(fmt, args);
+#ifdef CONFIG_EARLY_PRINTK
+ if (force_early_printk && early_console)
+ r = vprintk_default(fmt, args);
+ else
+#endif
+ r = vprintk_func(fmt, args);
va_end(args);

return r;
@@ -1863,26 +1936,6 @@ static bool suppress_message_printing(int level) { return false; }

#endif /* CONFIG_PRINTK */

-#ifdef CONFIG_EARLY_PRINTK
-struct console *early_console;
-
-asmlinkage __visible void early_printk(const char *fmt, ...)
-{
- va_list ap;
- char buf[512];
- int n;
-
- if (!early_console)
- return;
-
- va_start(ap, fmt);
- n = vscnprintf(buf, sizeof(buf), fmt, ap);
- va_end(ap);
-
- early_console->write(early_console, buf, n);
-}
-#endif
-
static int __add_preferred_console(char *name, int idx, char *options,
char *brl_options)
{
diff --git a/kernel/smp.c b/kernel/smp.c
index 084c8b3a2681..d014602e13f7 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -257,6 +257,8 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline)
irq_work_run();
}

+DEFINE_PER_CPU(const struct cpumask *, smp_call_mask);
+
/*
* smp_call_function_single - Run a function on a specific CPU
* @func: The function to run. This must be fast and non-blocking.
@@ -296,11 +298,15 @@ int smp_call_function_single(int cpu, smp_call_func_t func, void *info,
csd_lock(csd);
}

+ this_cpu_write(smp_call_mask, cpumask_of(cpu));
+
err = generic_exec_single(cpu, csd, func, info);

if (wait)
csd_lock_wait(csd);

+ this_cpu_write(smp_call_mask, NULL);
+
put_cpu();

return err;
@@ -457,6 +463,8 @@ void smp_call_function_many(const struct cpumask *mask,
__cpumask_set_cpu(cpu, cfd->cpumask_ipi);
}

+ this_cpu_write(smp_call_mask, cfd->cpumask_ipi);
+
/* Send a message to all CPUs in the map */
arch_send_call_function_ipi_mask(cfd->cpumask_ipi);

@@ -468,6 +476,8 @@ void smp_call_function_many(const struct cpumask *mask,
csd_lock_wait(csd);
}
}
+
+ this_cpu_write(smp_call_mask, NULL);
}
EXPORT_SYMBOL(smp_call_function_many);