[PATCH] perf/core: introduce context per CPU event list

From: Kan Liang
Date: Wed Nov 09 2016 - 14:05:05 EST


From: Kan Liang <kan.liang@xxxxxxxxx>

The overhead of perf per-process monitoring increases rapidly with the
number of events and the number of CPUs.

Here is some data from an overhead test on a Skylake server with 64
logical CPUs. The elapsed time of AIM7 is used to measure the overhead.
perf record -e $event_list -p $pid #$pid is the pid of AIM7
Elapsed time A: elapsed time of AIM7 running on its own.
Elapsed time B: elapsed time of AIM7 while perf is running in parallel.
Overhead = (B - A) / A
Events#  Overhead (%)
1        0.45%
2        1.20%
4        3.85%
8        15.82%
16       50.24%
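
For instance, the 8-event row means B is roughly 1.16 times A: with
illustrative numbers, A = 100s and B = 115.82s give an overhead of
(115.82 - 100) / 100 = 15.82%.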

perf_iterate_ctx() contributes most of the increased overhead, because it
iterates over the whole RCU list ctx->event_list, and that list grows
rapidly as events are added. For per-process monitoring, each event ends
up being opened once per CPU, so adding one event inserts CPU# entries
into event_list; on this 64-CPU server, 16 events turn into 1024 list
entries. Iterating the whole event list therefore becomes very expensive
on multi-core systems with a large CPU count.

However, iterating the whole RCU list is not always necessary; often only
the events on the current CPU need to be output. A per-CPU event list is
therefore introduced to split the one big RCU list into small per-CPU RCU
lists, which significantly reduces the overhead (see the sketch after the
table below).
Events#  Overhead (%)
1        0.15%
2        1.06%
4        1.85%
8        9.99%
16       17.26%
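
For reference, here is a minimal sketch (not part of the patch itself) of
the per-CPU list layout the diff below implements. The helper names
event_list_alloc()/event_list_add() are illustrative; the per-CPU and
list APIs are the ones the patch itself uses.

#include <linux/percpu.h>
#include <linux/list.h>
#include <linux/rculist.h>
#include <linux/cpumask.h>
#include <linux/errno.h>

/* One list head per possible CPU instead of a single shared list. */
static struct list_head __percpu *event_list;

static int event_list_alloc(void)
{
        int cpu;

        event_list = alloc_percpu(struct list_head);
        if (!event_list)
                return -ENOMEM;

        for_each_possible_cpu(cpu)
                INIT_LIST_HEAD(per_cpu_ptr(event_list, cpu));

        return 0;
}

/* Events bound to no particular CPU (cpu == -1) live on CPU 0's list. */
static void event_list_add(struct list_head *entry, int event_cpu)
{
        struct list_head *head;

        head = per_cpu_ptr(event_list, event_cpu == -1 ? 0 : event_cpu);
        list_add_rcu(entry, head);
}

Readers under rcu_read_lock() then walk only the current CPU's list (plus
CPU 0's list, which also holds the cpu == -1 events) instead of one list
containing every CPU's events.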

Signed-off-by: Kan Liang <kan.liang@xxxxxxxxx>
---
include/linux/perf_event.h | 3 +-
kernel/events/core.c | 177 ++++++++++++++++++++++++++++++++++-----------
2 files changed, 138 insertions(+), 42 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 4741ecd..1222a33 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -734,7 +734,8 @@ struct perf_event_context {
struct list_head active_ctx_list;
struct list_head pinned_groups;
struct list_head flexible_groups;
- struct list_head event_list;
+ struct list_head __percpu *event_list;
+
int nr_events;
int nr_active;
int is_active;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 0e29213..00c12df 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1118,6 +1118,9 @@ static void free_ctx(struct rcu_head *head)

ctx = container_of(head, struct perf_event_context, rcu_head);
kfree(ctx->task_ctx_data);
+
+ free_percpu(ctx->event_list);
+
kfree(ctx);
}

@@ -1461,6 +1464,7 @@ ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
static void
list_add_event(struct perf_event *event, struct perf_event_context *ctx)
{
+ struct list_head *t_list;

lockdep_assert_held(&ctx->lock);

@@ -1483,7 +1487,9 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)

list_update_cgroup_event(event, ctx, true);

- list_add_rcu(&event->event_entry, &ctx->event_list);
+ /* If event CPU is not set, add the event to the list of CPU 0 */
+ t_list = per_cpu_ptr(ctx->event_list, event->cpu == -1 ? 0 : event->cpu);
+ list_add_rcu(&event->event_entry, t_list);
ctx->nr_events++;
if (event->attr.inherit_stat)
ctx->nr_stat++;
@@ -2749,25 +2755,30 @@ static void perf_event_sync_stat(struct perf_event_context *ctx,
struct perf_event_context *next_ctx)
{
struct perf_event *event, *next_event;
+ int cpu;

if (!ctx->nr_stat)
return;

update_context_time(ctx);

- event = list_first_entry(&ctx->event_list,
- struct perf_event, event_entry);
+ for_each_possible_cpu(cpu) {
+ struct list_head *list, *next_list;

- next_event = list_first_entry(&next_ctx->event_list,
- struct perf_event, event_entry);
+ list = per_cpu_ptr(ctx->event_list, cpu);
+ next_list = per_cpu_ptr(next_ctx->event_list, cpu);

- while (&event->event_entry != &ctx->event_list &&
- &next_event->event_entry != &next_ctx->event_list) {
+ event = list_first_entry(list, struct perf_event, event_entry);
+ next_event = list_first_entry(next_list, struct perf_event, event_entry);

- __perf_event_sync_stat(event, next_event);
+ while (&event->event_entry != list &&
+ &next_event->event_entry != next_list) {

- event = list_next_entry(event, event_entry);
- next_event = list_next_entry(next_event, event_entry);
+ __perf_event_sync_stat(event, next_event);
+
+ event = list_next_entry(event, event_entry);
+ next_event = list_next_entry(next_event, event_entry);
+ }
}
}

@@ -3241,7 +3252,9 @@ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
struct perf_event *event;
struct hw_perf_event *hwc;
u64 now, period = TICK_NSEC;
+ struct list_head *t_list;
s64 delta;
+ int cpu;

/*
* only need to iterate over all events iff:
@@ -3254,7 +3267,11 @@ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
raw_spin_lock(&ctx->lock);
perf_pmu_disable(ctx->pmu);

- list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
+ cpu = smp_processor_id();
+again:
+ t_list = per_cpu_ptr(ctx->event_list, cpu);
+
+ list_for_each_entry_rcu(event, t_list, event_entry) {
if (event->state != PERF_EVENT_STATE_ACTIVE)
continue;

@@ -3298,6 +3315,15 @@ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
perf_pmu_enable(event->pmu);
}

+ /*
+ * An event's cpu may be -1; such events are stored on
+ * CPU 0's event_list, so walk that list as well.
+ */
+ if (cpu != 0) {
+ cpu = 0;
+ goto again;
+ }
+
perf_pmu_enable(ctx->pmu);
raw_spin_unlock(&ctx->lock);
}
@@ -3385,6 +3411,12 @@ static int event_enable_on_exec(struct perf_event *event,
return 1;
}

+#define for_each_ctx_event_list(__ctx, __cpu, __list) \
+ for (__cpu = cpumask_first(cpu_possible_mask), \
+ __list = per_cpu_ptr(__ctx->event_list, __cpu); \
+ __list; \
+ __cpu = cpumask_next(__cpu, cpu_possible_mask), \
+ __list = (__cpu < nr_cpu_ids) ? per_cpu_ptr(__ctx->event_list, __cpu) : NULL)
/*
* Enable all of a task's events that have been marked enable-on-exec.
* This expects task == current.
@@ -3394,8 +3426,10 @@ static void perf_event_enable_on_exec(int ctxn)
struct perf_event_context *ctx, *clone_ctx = NULL;
struct perf_cpu_context *cpuctx;
struct perf_event *event;
+ struct list_head *list;
unsigned long flags;
int enabled = 0;
+ int cpu;

local_irq_save(flags);
ctx = current->perf_event_ctxp[ctxn];
@@ -3405,8 +3439,11 @@ static void perf_event_enable_on_exec(int ctxn)
cpuctx = __get_cpu_context(ctx);
perf_ctx_lock(cpuctx, ctx);
ctx_sched_out(ctx, cpuctx, EVENT_TIME);
- list_for_each_entry(event, &ctx->event_list, event_entry)
- enabled |= event_enable_on_exec(event, ctx);
+
+ for_each_ctx_event_list(ctx, cpu, list) {
+ list_for_each_entry(event, list, event_entry)
+ enabled |= event_enable_on_exec(event, ctx);
+ }

/*
* Unclone and reschedule this context if we enabled any event.
@@ -3623,15 +3660,26 @@ static int perf_event_read(struct perf_event *event, bool group)
/*
* Initialize the perf_event context in a task_struct:
*/
-static void __perf_event_init_context(struct perf_event_context *ctx)
+static int __perf_event_init_context(struct perf_event_context *ctx)
{
+ struct list_head *list;
+ int cpu;
+
raw_spin_lock_init(&ctx->lock);
mutex_init(&ctx->mutex);
INIT_LIST_HEAD(&ctx->active_ctx_list);
INIT_LIST_HEAD(&ctx->pinned_groups);
INIT_LIST_HEAD(&ctx->flexible_groups);
- INIT_LIST_HEAD(&ctx->event_list);
+
+ ctx->event_list = alloc_percpu(struct list_head);
+ if (!ctx->event_list)
+ return -ENOMEM;
+ for_each_ctx_event_list(ctx, cpu, list)
+ INIT_LIST_HEAD(list);
+
atomic_set(&ctx->refcount, 1);
+
+ return 0;
}

static struct perf_event_context *
@@ -3643,7 +3691,11 @@ alloc_perf_context(struct pmu *pmu, struct task_struct *task)
if (!ctx)
return NULL;

- __perf_event_init_context(ctx);
+ if (__perf_event_init_context(ctx)) {
+ kfree(ctx);
+ return NULL;
+ }
+
if (task) {
ctx->task = task;
get_task_struct(task);
@@ -3978,13 +4030,17 @@ static bool exclusive_event_installable(struct perf_event *event,
{
struct perf_event *iter_event;
struct pmu *pmu = event->pmu;
+ struct list_head *list;
+ int cpu;

if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
return true;

- list_for_each_entry(iter_event, &ctx->event_list, event_entry) {
- if (exclusive_event_match(iter_event, event))
- return false;
+ for_each_ctx_event_list(ctx, cpu, list) {
+ list_for_each_entry(iter_event, list, event_entry) {
+ if (exclusive_event_match(iter_event, event))
+ return false;
+ }
}

return true;
@@ -6067,16 +6123,29 @@ perf_iterate_ctx(struct perf_event_context *ctx,
void *data, bool all)
{
struct perf_event *event;
+ struct list_head *t_list;
+ int cpu;

- list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
- if (!all) {
+ if (all) {
+ for_each_ctx_event_list(ctx, cpu, t_list) {
+ list_for_each_entry_rcu(event, t_list, event_entry)
+ output(event, data);
+ }
+ } else {
+ cpu = smp_processor_id();
+again:
+ t_list = per_cpu_ptr(ctx->event_list, cpu);
+ list_for_each_entry_rcu(event, t_list, event_entry) {
if (event->state < PERF_EVENT_STATE_INACTIVE)
continue;
if (!event_filter_match(event))
continue;
+ output(event, data);
+ }
+ if (cpu != 0) {
+ cpu = 0;
+ goto again;
}
-
- output(event, data);
}
}

@@ -7605,6 +7674,8 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
{
struct perf_sample_data data;
struct perf_event *event;
+ struct list_head *list;
+ int cpu;

struct perf_raw_record raw = {
.frag = {
@@ -7636,13 +7707,15 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
if (!ctx)
goto unlock;

- list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
- if (event->attr.type != PERF_TYPE_TRACEPOINT)
- continue;
- if (event->attr.config != entry->type)
- continue;
- if (perf_tp_event_match(event, &data, regs))
- perf_swevent_event(event, count, &data, regs);
+ for_each_ctx_event_list(ctx, cpu, list) {
+ list_for_each_entry_rcu(event, list, event_entry) {
+ if (event->attr.type != PERF_TYPE_TRACEPOINT)
+ continue;
+ if (event->attr.config != entry->type)
+ continue;
+ if (perf_tp_event_match(event, &data, regs))
+ perf_swevent_event(event, count, &data, regs);
+ }
}
unlock:
rcu_read_unlock();
@@ -8590,6 +8663,7 @@ static void update_pmu_context(struct pmu *pmu, struct pmu *old_pmu)
static void free_pmu_context(struct pmu *pmu)
{
struct pmu *i;
+ int cpu;

mutex_lock(&pmus_lock);
/*
@@ -8601,7 +8675,12 @@ static void free_pmu_context(struct pmu *pmu)
goto out;
}
}
+ for_each_possible_cpu(cpu) {
+ struct perf_cpu_context *cpuctx;

+ cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
+ free_percpu(cpuctx->ctx.event_list);
+ }
free_percpu(pmu->pmu_cpu_context);
out:
mutex_unlock(&pmus_lock);
@@ -8801,7 +8880,8 @@ int perf_pmu_register(struct pmu *pmu, const char *name, int type)
struct perf_cpu_context *cpuctx;

cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
- __perf_event_init_context(&cpuctx->ctx);
+ if (__perf_event_init_context(&cpuctx->ctx))
+ goto free_pmu_cpu_context;
lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex);
lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
cpuctx->ctx.pmu = pmu;
@@ -8845,6 +8925,11 @@ int perf_pmu_register(struct pmu *pmu, const char *name, int type)

return ret;

+free_pmu_cpu_context:
+ for_each_possible_cpu(cpu)
+ free_percpu(per_cpu_ptr(pmu->pmu_cpu_context, cpu)->ctx.event_list);
+ free_percpu(pmu->pmu_cpu_context);
+
free_dev:
device_del(pmu->dev);
put_device(pmu->dev);
@@ -9969,6 +10052,8 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
struct perf_event_context *src_ctx;
struct perf_event_context *dst_ctx;
struct perf_event *event, *tmp;
+ struct list_head *list;
+ int cpu;
LIST_HEAD(events);

src_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, src_cpu)->ctx;
@@ -9979,12 +10064,14 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
* of swizzling perf_event::ctx.
*/
mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex);
- list_for_each_entry_safe(event, tmp, &src_ctx->event_list,
- event_entry) {
- perf_remove_from_context(event, 0);
- unaccount_event_cpu(event, src_cpu);
- put_ctx(src_ctx);
- list_add(&event->migrate_entry, &events);
+
+ for_each_ctx_event_list(src_ctx, cpu, list) {
+ list_for_each_entry_safe(event, tmp, list, event_entry) {
+ perf_remove_from_context(event, 0);
+ unaccount_event_cpu(event, src_cpu);
+ put_ctx(src_ctx);
+ list_add(&event->migrate_entry, &events);
+ }
}

/*
@@ -10111,6 +10198,8 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
{
struct perf_event_context *child_ctx, *clone_ctx = NULL;
struct perf_event *child_event, *next;
+ struct list_head *list;
+ int cpu;

WARN_ON_ONCE(child != current);

@@ -10160,8 +10249,10 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
*/
perf_event_task(child, child_ctx, 0);

- list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry)
- perf_event_exit_event(child_event, child_ctx, child);
+ for_each_ctx_event_list(child_ctx, cpu, list) {
+ list_for_each_entry_safe(child_event, next, list, event_entry)
+ perf_event_exit_event(child_event, child_ctx, child);
+ }

mutex_unlock(&child_ctx->mutex);

@@ -10611,10 +10702,14 @@ static void __perf_event_exit_context(void *__info)
struct perf_event_context *ctx = __info;
struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
struct perf_event *event;
+ struct list_head *list;
+ int cpu;

raw_spin_lock(&ctx->lock);
- list_for_each_entry(event, &ctx->event_list, event_entry)
- __perf_remove_from_context(event, cpuctx, ctx, (void *)DETACH_GROUP);
+ for_each_ctx_event_list(ctx, cpu, list) {
+ list_for_each_entry(event, list, event_entry)
+ __perf_remove_from_context(event, cpuctx, ctx, (void *)DETACH_GROUP);
+ }
raw_spin_unlock(&ctx->lock);
}

--
2.5.5