[PATCH 06/22] tracing/rb: Convert to hotplug state machine

From: Sebastian Andrzej Siewior
Date: Sat Nov 26 2016 - 18:18:43 EST


Install the callbacks via the state machine. The notifier in struct
ring_buffer is replaced by the multi instance interface. Upon
__ring_buffer_alloc() invocation, cpuhp_state_add_instance() will invoke
the trace_rb_cpu_prepare() on each CPU.

This callback may now fail. This means __ring_buffer_alloc() will fail and
cleanup (like previously) and during a CPU up event this failure will not
allow the CPU to come up.

Cc: Steven Rostedt <rostedt@xxxxxxxxxxx>
Cc: Ingo Molnar <mingo@xxxxxxxxxx>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@xxxxxxxxxxxxx>
---
include/linux/cpuhotplug.h | 1 +
include/linux/ring_buffer.h | 6 ++
kernel/trace/ring_buffer.c | 133 +++++++++++++++-----------------------------
kernel/trace/trace.c | 15 ++++-
4 files changed, 65 insertions(+), 90 deletions(-)

diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index e3771fb959c0..18bcfeb2463e 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -62,6 +62,7 @@ enum cpuhp_state {
CPUHP_TOPOLOGY_PREPARE,
CPUHP_NET_IUCV_PREPARE,
CPUHP_ARM_BL_PREPARE,
+ CPUHP_TRACE_RB_PREPARE,
CPUHP_TIMERS_DEAD,
CPUHP_NOTF_ERR_INJ_PREPARE,
CPUHP_MIPS_SOC_PREPARE,
diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index 4acc552e9279..b6d4568795a7 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -198,4 +198,10 @@ enum ring_buffer_flags {
RB_FL_OVERWRITE = 1 << 0,
};

+#ifdef CONFIG_RING_BUFFER
+int trace_rb_cpu_prepare(unsigned int cpu, struct hlist_node *node);
+#else
+#define trace_rb_cpu_prepare NULL
+#endif
+
#endif /* _LINUX_RING_BUFFER_H */
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 9c143739b8d7..a7a055f167c7 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -479,9 +479,7 @@ struct ring_buffer {

struct ring_buffer_per_cpu **buffers;

-#ifdef CONFIG_HOTPLUG_CPU
- struct notifier_block cpu_notify;
-#endif
+ struct hlist_node node;
u64 (*clock)(void);

struct rb_irq_work irq_work;
@@ -1274,11 +1272,6 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
kfree(cpu_buffer);
}

-#ifdef CONFIG_HOTPLUG_CPU
-static int rb_cpu_notify(struct notifier_block *self,
- unsigned long action, void *hcpu);
-#endif
-
/**
* __ring_buffer_alloc - allocate a new ring_buffer
* @size: the size in bytes per cpu that is needed.
@@ -1296,6 +1289,7 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
long nr_pages;
int bsize;
int cpu;
+ int ret;

/* keep it in its own cache line */
buffer = kzalloc(ALIGN(sizeof(*buffer), cache_line_size()),
@@ -1318,17 +1312,6 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
if (nr_pages < 2)
nr_pages = 2;

- /*
- * In case of non-hotplug cpu, if the ring-buffer is allocated
- * in early initcall, it will not be notified of secondary cpus.
- * In that off case, we need to allocate for all possible cpus.
- */
-#ifdef CONFIG_HOTPLUG_CPU
- cpu_notifier_register_begin();
- cpumask_copy(buffer->cpumask, cpu_online_mask);
-#else
- cpumask_copy(buffer->cpumask, cpu_possible_mask);
-#endif
buffer->cpus = nr_cpu_ids;

bsize = sizeof(void *) * nr_cpu_ids;
@@ -1337,19 +1320,15 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
if (!buffer->buffers)
goto fail_free_cpumask;

- for_each_buffer_cpu(buffer, cpu) {
- buffer->buffers[cpu] =
- rb_allocate_cpu_buffer(buffer, nr_pages, cpu);
- if (!buffer->buffers[cpu])
- goto fail_free_buffers;
- }
+ cpu = raw_smp_processor_id();
+ cpumask_set_cpu(cpu, buffer->cpumask);
+ buffer->buffers[cpu] = rb_allocate_cpu_buffer(buffer, nr_pages, cpu);
+ if (!buffer->buffers[cpu])
+ goto fail_free_buffers;

-#ifdef CONFIG_HOTPLUG_CPU
- buffer->cpu_notify.notifier_call = rb_cpu_notify;
- buffer->cpu_notify.priority = 0;
- __register_cpu_notifier(&buffer->cpu_notify);
- cpu_notifier_register_done();
-#endif
+ ret = cpuhp_state_add_instance(CPUHP_TRACE_RB_PREPARE, &buffer->node);
+ if (ret < 0)
+ goto fail_free_buffers;

mutex_init(&buffer->mutex);

@@ -1364,9 +1343,6 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,

fail_free_cpumask:
free_cpumask_var(buffer->cpumask);
-#ifdef CONFIG_HOTPLUG_CPU
- cpu_notifier_register_done();
-#endif

fail_free_buffer:
kfree(buffer);
@@ -1383,18 +1359,11 @@ ring_buffer_free(struct ring_buffer *buffer)
{
int cpu;

-#ifdef CONFIG_HOTPLUG_CPU
- cpu_notifier_register_begin();
- __unregister_cpu_notifier(&buffer->cpu_notify);
-#endif
+ cpuhp_state_remove_instance(CPUHP_TRACE_RB_PREPARE, &buffer->node);

for_each_buffer_cpu(buffer, cpu)
rb_free_cpu_buffer(buffer->buffers[cpu]);

-#ifdef CONFIG_HOTPLUG_CPU
- cpu_notifier_register_done();
-#endif
-
kfree(buffer->buffers);
free_cpumask_var(buffer->cpumask);

@@ -4633,62 +4602,48 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
}
EXPORT_SYMBOL_GPL(ring_buffer_read_page);

-#ifdef CONFIG_HOTPLUG_CPU
-static int rb_cpu_notify(struct notifier_block *self,
- unsigned long action, void *hcpu)
+/*
+ * We only allocate new buffers, never free them if the CPU goes down.
+ * If we were to free the buffer, then the user would lose any trace that was in
+ * the buffer.
+ */
+int trace_rb_cpu_prepare(unsigned int cpu, struct hlist_node *node)
{
- struct ring_buffer *buffer =
- container_of(self, struct ring_buffer, cpu_notify);
- long cpu = (long)hcpu;
+ struct ring_buffer *buffer;
long nr_pages_same;
int cpu_i;
unsigned long nr_pages;

- switch (action) {
- case CPU_UP_PREPARE:
- case CPU_UP_PREPARE_FROZEN:
- if (cpumask_test_cpu(cpu, buffer->cpumask))
- return NOTIFY_OK;
+ buffer = container_of(node, struct ring_buffer, node);
+ if (cpumask_test_cpu(cpu, buffer->cpumask))
+ return 0;

- nr_pages = 0;
- nr_pages_same = 1;
- /* check if all cpu sizes are same */
- for_each_buffer_cpu(buffer, cpu_i) {
- /* fill in the size from first enabled cpu */
- if (nr_pages == 0)
- nr_pages = buffer->buffers[cpu_i]->nr_pages;
- if (nr_pages != buffer->buffers[cpu_i]->nr_pages) {
- nr_pages_same = 0;
- break;
- }
+ nr_pages = 0;
+ nr_pages_same = 1;
+ /* check if all cpu sizes are same */
+ for_each_buffer_cpu(buffer, cpu_i) {
+ /* fill in the size from first enabled cpu */
+ if (nr_pages == 0)
+ nr_pages = buffer->buffers[cpu_i]->nr_pages;
+ if (nr_pages != buffer->buffers[cpu_i]->nr_pages) {
+ nr_pages_same = 0;
+ break;
}
- /* allocate minimum pages, user can later expand it */
- if (!nr_pages_same)
- nr_pages = 2;
- buffer->buffers[cpu] =
- rb_allocate_cpu_buffer(buffer, nr_pages, cpu);
- if (!buffer->buffers[cpu]) {
- WARN(1, "failed to allocate ring buffer on CPU %ld\n",
- cpu);
- return NOTIFY_OK;
- }
- smp_wmb();
- cpumask_set_cpu(cpu, buffer->cpumask);
- break;
- case CPU_DOWN_PREPARE:
- case CPU_DOWN_PREPARE_FROZEN:
- /*
- * Do nothing.
- * If we were to free the buffer, then the user would
- * lose any trace that was in the buffer.
- */
- break;
- default:
- break;
}
- return NOTIFY_OK;
+ /* allocate minimum pages, user can later expand it */
+ if (!nr_pages_same)
+ nr_pages = 2;
+ buffer->buffers[cpu] =
+ rb_allocate_cpu_buffer(buffer, nr_pages, cpu);
+ if (!buffer->buffers[cpu]) {
+ WARN(1, "failed to allocate ring buffer on CPU %u\n",
+ cpu);
+ return -ENOMEM;
+ }
+ smp_wmb();
+ cpumask_set_cpu(cpu, buffer->cpumask);
+ return 0;
}
-#endif

#ifdef CONFIG_RING_BUFFER_STARTUP_TEST
/*
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 8696ce6bf2f6..465d56febc5b 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -7659,10 +7659,21 @@ __init static int tracer_alloc_buffers(void)

raw_spin_lock_init(&global_trace.start_lock);

+ /*
+ * The prepare callbacks allocates some memory for the ring buffer. We
+ * don't free the buffer if the if the CPU goes down. If we were to free
+ * the buffer, then the user would lose any trace that was in the
+ * buffer. The memory will be removed once the "instance" is removed.
+ */
+ ret = cpuhp_setup_state_multi(CPUHP_TRACE_RB_PREPARE,
+ "trace/RB:preapre", trace_rb_cpu_prepare,
+ NULL);
+ if (ret < 0)
+ goto out_free_cpumask;
/* Used for event triggers */
temp_buffer = ring_buffer_alloc(PAGE_SIZE, RB_FL_OVERWRITE);
if (!temp_buffer)
- goto out_free_cpumask;
+ goto out_rm_hp_state;

if (trace_create_savedcmd() < 0)
goto out_free_temp_buffer;
@@ -7723,6 +7734,8 @@ __init static int tracer_alloc_buffers(void)
free_saved_cmdlines_buffer(savedcmd);
out_free_temp_buffer:
ring_buffer_free(temp_buffer);
+out_rm_hp_state:
+ cpuhp_remove_multi_state(CPUHP_TRACE_RB_PREPARE);
out_free_cpumask:
free_cpumask_var(global_trace.tracing_cpumask);
out_free_buffer_mask:
--
2.10.2