[PATCH 6/6] Add PERF_COUNT_SW_RUNNABLE_TASKS

From: highguy
Date: Sun Feb 07 2010 - 06:32:01 EST


From: Stijn Devriendt <HIGHGuY@xxxxxxxxx>

Add a new software event, PERF_COUNT_SW_RUNNABLE_TASKS, which tracks the
number of runnable tasks, either per-cpu or per-task-context. The count is
maintained from activate_task()/deactivate_task() in the scheduler, and the
event supports poll()-based waiting with min/max thresholds so that a waiter
is woken when the runnable-task count crosses the configured bounds.

---
include/linux/perf_event.h | 17 ++++-
include/linux/sched.h | 1 +
kernel/perf_event.c | 180 ++++++++++++++++++++++++++++++++++++++------
kernel/sched.c | 7 ++
4 files changed, 178 insertions(+), 27 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 084f322..10e56f2 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -105,6 +105,7 @@ enum perf_sw_ids {
PERF_COUNT_SW_PAGE_FAULTS_MAJ = 6,
PERF_COUNT_SW_ALIGNMENT_FAULTS = 7,
PERF_COUNT_SW_EMULATION_FAULTS = 8,
+ PERF_COUNT_SW_RUNNABLE_TASKS = 9,

PERF_COUNT_SW_MAX, /* non-ABI */
};
@@ -456,6 +457,7 @@ enum perf_callchain_context {
#include <linux/pid_namespace.h>
#include <linux/workqueue.h>
#include <asm/atomic.h>
+#include <linux/poll.h>

#define PERF_MAX_STACK_DEPTH 255

@@ -519,6 +521,8 @@ struct pmu {
int (*reset) (struct perf_event *event);
void (*wakeup) (struct perf_event *event);
u64 (*read) (struct perf_event *event);
+ void (*init) (struct perf_event *event);
+ unsigned int (*poll) (struct perf_event *event, struct file* file, poll_table *wait);
};

/**
@@ -826,13 +830,20 @@ static inline int is_software_event(struct perf_event *event)

extern atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX];

-extern void __perf_sw_event(u32, s64, int, struct pt_regs *, u64);
+extern void __perf_sw_event(u32, s64, int, struct pt_regs *, u64,
+ struct task_struct* task, int cpu);
+static inline void
+perf_sw_event_target(u32 event_id, s64 nr, int nmi, struct pt_regs *regs,
+ u64 addr, struct task_struct* task, int cpu)
+{
+ if (atomic_read(&perf_swevent_enabled[event_id]))
+ __perf_sw_event(event_id, nr, nmi, regs, addr, task, cpu);
+}

static inline void
perf_sw_event(u32 event_id, s64 nr, int nmi, struct pt_regs *regs, u64 addr)
{
- if (atomic_read(&perf_swevent_enabled[event_id]))
- __perf_sw_event(event_id, nr, nmi, regs, addr);
+ perf_sw_event_target(event_id, nr, nmi, regs, addr, current, smp_processor_id());
}

extern void __perf_event_mmap(struct vm_area_struct *vma);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index f2f842d..dce2213 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -138,6 +138,7 @@ extern int nr_threads;
DECLARE_PER_CPU(unsigned long, process_counts);
extern int nr_processes(void);
extern unsigned long nr_running(void);
+extern unsigned long nr_running_cpu(int cpu);
extern unsigned long nr_uninterruptible(void);
extern unsigned long nr_iowait(void);
extern unsigned long nr_iowait_cpu(void);
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 08885d0..5f4f23d 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -743,6 +743,18 @@ static void add_event_to_ctx(struct perf_event *event,
event->tstamp_stopped = ctx->time;
}

+static void __perf_event_init_event(struct perf_event* event)
+{
+}
+
+static void perf_event_init_event(struct perf_event* event)
+{
+ if (event->pmu->init)
+ event->pmu->init(event);
+ else
+ __perf_event_init_event(event);
+}
+
/*
* Cross CPU call to install and enable a performance event
*
@@ -782,6 +794,8 @@ static void __perf_install_in_context(void *info)

add_event_to_ctx(event, ctx);

+ perf_event_init_event(event);
+
if (event->cpu != -1 && event->cpu != smp_processor_id())
goto unlock;

@@ -1593,7 +1607,7 @@ static u64 perf_event_update(struct perf_event *event)
raw_spin_unlock_irqrestore(&ctx->lock, flags);
}

- return perf_event_read(event);
+ return __perf_event_read(event);
}

/*
@@ -1931,18 +1945,26 @@ perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
return perf_read_hw(event, buf, count);
}

-static unsigned int perf_poll(struct file *file, poll_table *wait)
+static unsigned int __perf_poll(struct perf_event *event, struct file *file, poll_table *wait)
{
- struct perf_event *event = file->private_data;
unsigned int events = atomic_xchg(&event->poll, 0);

+ /*if (events)
+ printk("Events: POLLIN=%u\n", events&POLLIN);*/
+
if (event->attr.threshold)
{
u64 count = perf_event_read(event);
- if (count < event->attr.min_threshold)
+ if (count <= event->attr.max_threshold)
+ {
events |= POLLIN;
- else if (count > event->attr.max_threshold)
+ //printk(KERN_CONT "+");
+ }
+ else //if (count > event->attr.max_threshold)
+ {
events &= ~POLLIN;
+ //printk(KERN_CONT "-");
+ }
}

poll_wait(file, &event->waitq, wait);
@@ -1950,8 +1972,23 @@ static unsigned int perf_poll(struct file *file, poll_table *wait)
return events;
}

-static void perf_event_reset_noop(struct perf_event *event)
+static unsigned int perf_rt_poll(struct perf_event *event, struct file *file, poll_table *wait)
+{
+ return __perf_poll((event->parent ? event->parent : event), file, wait);
+}
+
+static unsigned int perf_poll(struct file* file, poll_table *wait)
+{
+ struct perf_event *event = file->private_data;
+ if (event->pmu->poll)
+ return event->pmu->poll(event, file, wait);
+ else
+ return __perf_poll(event, file, wait);
+}
+
+static int perf_event_reset_noop(struct perf_event *event)
{
+ return 0;
}

static void __perf_event_reset(struct perf_event *event)
@@ -2568,7 +2605,10 @@ void perf_event_wakeup(struct perf_event *event)
if (event->pmu->wakeup)
event->pmu->wakeup(event);
else
+ {
+ atomic_set(&event->poll, POLLIN);
wake_up_all(&event->waitq);
+ }

if (event->pending_kill) {
kill_fasync(&event->fasync, SIGIO, event->pending_kill);
@@ -2719,8 +2759,6 @@ static void __perf_output_wakeup(struct perf_event* event, int nmi)
{
if (event->attr.threshold && perf_event_read(event) > event->attr.max_threshold)
return;
-
- atomic_set(&event->poll, POLLIN);

if (nmi) {
event->pending_wakeup = 1;
@@ -3767,7 +3805,18 @@ int perf_event_overflow(struct perf_event *event, int nmi,

static void perf_event_wakeup_one(struct perf_event *event)
{
- wake_up(&event->waitq);
+ struct perf_event *wakeup_event = event->parent ? event->parent : event;
+ s64 wakeup_count = event->attr.max_threshold - __perf_event_read(wakeup_event);
+
+ if (wakeup_count < 1)
+ wakeup_count = 1;
+
+ atomic_set(&wakeup_event->poll, POLLIN);
+
+ if (event->attr.threshold && wakeup_count == 1)
+ wake_up(&wakeup_event->waitq);
+ else
+ wake_up_all(&wakeup_event->waitq);
}

static u64 __perf_event_add(struct perf_event *event, s64 count)
@@ -3783,7 +3832,7 @@ static u64 perf_event_add(struct perf_event *event, s64 count)
return __perf_event_add(event, count);
}

-static u64 perf_event_add_parent(struct perf_event *event, u64 count)
+static u64 perf_event_add_parent(struct perf_event *event, s64 count)
{
return event->parent ? __perf_event_add(event->parent, count) : __perf_event_add(event, count);
}
@@ -3864,6 +3913,22 @@ static void perf_swevent_add(struct perf_event *event, s64 nr,

perf_event_add(event, nr);

+ BUG_ON(perf_event_read(event) == (u64)-1);
+
+ if (event->attr.config == PERF_COUNT_SW_RUNNABLE_TASKS) {
+ if (event->ctx->task)
+ {
+ }
+ else
+ {
+ if (atomic64_read(&event->count) != nr_running_cpu(event->cpu))
+ {
+ printk("count = %lu <-> nr_running_cpu = %lu", atomic64_read(&event->count), nr_running_cpu(event->cpu));
+ BUG();
+ }
+ }
+ }
+
if (!regs)
return;

@@ -3932,7 +3997,7 @@ static int perf_swevent_match(struct perf_event *event,
struct perf_sample_data *data,
struct pt_regs *regs)
{
- if (event->cpu != -1 && event->cpu != smp_processor_id())
+ if (event->cpu != -1 && event->cpu != smp_processor_id() && event_id != PERF_COUNT_SW_RUNNABLE_TASKS)
return 0;

if (!perf_swevent_is_counting(event))
@@ -4006,27 +4071,27 @@ EXPORT_SYMBOL_GPL(perf_swevent_put_recursion_context);
static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
s64 nr, int nmi,
struct perf_sample_data *data,
- struct pt_regs *regs)
+ struct pt_regs *regs,
+ struct task_struct* task,
+ int cpu)
{
struct perf_cpu_context *cpuctx;
struct perf_event_context *ctx;

- cpuctx = &__get_cpu_var(perf_cpu_context);
+ cpuctx = &per_cpu(perf_cpu_context, cpu);
rcu_read_lock();
perf_swevent_ctx_event(&cpuctx->ctx, type, event_id,
nr, nmi, data, regs);
- /*
- * doesn't really matter which of the child contexts the
- * events ends up in.
- */
- ctx = rcu_dereference(current->perf_event_ctxp);
+
+ ctx = rcu_dereference(task->perf_event_ctxp);
if (ctx)
perf_swevent_ctx_event(ctx, type, event_id, nr, nmi, data, regs);
rcu_read_unlock();
}

void __perf_sw_event(u32 event_id, s64 nr, int nmi,
- struct pt_regs *regs, u64 addr)
+ struct pt_regs *regs, u64 addr,
+ struct task_struct* task, int cpu)
{
struct perf_sample_data data;
int rctx;
@@ -4038,12 +4103,12 @@ void __perf_sw_event(u32 event_id, s64 nr, int nmi,
data.addr = addr;
data.raw = NULL;

- do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, &data, regs);
+ do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, &data, regs, task, cpu);

perf_swevent_put_recursion_context(rctx);
}

-static void perf_swevent_read(struct perf_event *event)
+static void perf_swevent_update(struct perf_event *event)
{
}

@@ -4066,10 +4131,61 @@ static void perf_swevent_disable(struct perf_event *event)
static const struct pmu perf_ops_generic = {
.enable = perf_swevent_enable,
.disable = perf_swevent_disable,
- .update = perf_swevent_read,
+ .update = perf_swevent_update,
.unthrottle = perf_swevent_unthrottle,
};

+static int perf_rt_enable(struct perf_event* event)
+{
+ return 0;
+}
+
+static void perf_rt_init_event(struct perf_event* event)
+{
+ if (event->ctx->task)
+ {
+ perf_event_add(event, event->ctx->task->state == 0);
+ }
+ else
+ atomic64_set(&event->count, nr_running_cpu(smp_processor_id()));
+}
+
+static void perf_rt_disable(struct perf_event* event)
+{
+ /* Nothing to do */
+}
+
+static void perf_rt_unthrottle(struct perf_event* event)
+{
+ /* Nothing to do */
+}
+
+static void perf_rt_update(struct perf_event* event)
+{
+ /* Nothing to do */
+}
+
+static u64 perf_event_read_parent(struct perf_event* event)
+{
+ if (event->parent)
+ return __perf_event_read(event->parent);
+ else
+ return __perf_event_read(event);
+}
+
+static const struct pmu perf_ops_runnable_tasks = {
+ .enable = perf_rt_enable,
+ .disable = perf_rt_disable,
+ .update = perf_rt_update,
+ .unthrottle = perf_rt_unthrottle,
+ .read = perf_event_read_parent,
+ .add = perf_event_add_parent,
+ .reset = perf_event_reset_noop,
+ .wakeup = perf_event_wakeup_one,
+ .init = perf_rt_init_event,
+ .poll = perf_rt_poll,
+};
+
/*
* hrtimer based swevent callback
*/
@@ -4267,7 +4383,7 @@ void perf_tp_event(int event_id, u64 addr, u64 count, void *record,

/* Trace events already protected against recursion */
do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1,
- &data, regs);
+ &data, regs, current, smp_processor_id());
}
EXPORT_SYMBOL_GPL(perf_tp_event);

@@ -4404,6 +4520,13 @@ static void sw_perf_event_destroy(struct perf_event *event)
atomic_dec(&perf_swevent_enabled[event_id]);
}

+static void sw_rt_perf_event_destroy(struct perf_event *event)
+{
+ BUG_ON(event->parent && __perf_event_read(event) != (u64)0);
+ sw_perf_event_destroy(event);
+}
+
+
static const struct pmu *sw_perf_event_init(struct perf_event *event)
{
const struct pmu *pmu = NULL;
@@ -4445,6 +4568,13 @@ static const struct pmu *sw_perf_event_init(struct perf_event *event)
}
pmu = &perf_ops_generic;
break;
+ case PERF_COUNT_SW_RUNNABLE_TASKS:
+ if (!event->parent) {
+ atomic_inc(&perf_swevent_enabled[event_id]);
+ event->destroy = sw_rt_perf_event_destroy;
+ }
+ pmu = &perf_ops_runnable_tasks;
+ break;
}

return pmu;
@@ -4743,7 +4873,7 @@ SYSCALL_DEFINE5(perf_event_open,
return -EACCES;
}

- if (attr.threshold && (attr.freq || attr.watermark))
+ if (attr.threshold && (attr.freq || attr.watermark || attr.min_threshold > attr.max_threshold))
return -EINVAL;

if (attr.freq) {
@@ -4944,6 +5074,8 @@ inherit_event(struct perf_event *parent_event,
*/
add_event_to_ctx(child_event, child_ctx);

+ perf_event_init_event(child_event);
+
/*
* Get a reference to the parent filp - we will fput it
* when the child event exits. This is safe to do because
diff --git a/kernel/sched.c b/kernel/sched.c
index 87f1f47..53c679c 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1967,6 +1967,7 @@ static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)

enqueue_task(rq, p, wakeup);
inc_nr_running(rq);
+ perf_sw_event_target(PERF_COUNT_SW_RUNNABLE_TASKS, 1, 1, task_pt_regs(p), 0, p, cpu_of(rq));
}

/*
@@ -1979,6 +1980,7 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)

dequeue_task(rq, p, sleep);
dec_nr_running(rq);
+ perf_sw_event_target(PERF_COUNT_SW_RUNNABLE_TASKS, -1, 1, task_pt_regs(p), 0, p, cpu_of(rq));
}

/**
@@ -2932,6 +2934,11 @@ unsigned long nr_running(void)
return sum;
}

+unsigned long nr_running_cpu(int cpu)
+{
+ return cpu_rq(cpu)->nr_running;
+}
+
unsigned long nr_uninterruptible(void)
{
unsigned long i, sum = 0;
--
1.6.6

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/