Re: [RFC PATCH 1/1] kernel/events: Introduce IOC_COUNT_RECORDS

From: Arnaldo Carvalho de Melo
Date: Tue Jun 06 2017 - 11:51:57 EST


Em Tue, Jun 06, 2017 at 08:46:28PM +0530, Naveen N. Rao escreveu:
> Many perf sideband events (context switches, namespaces, ...) are useful
> by themselves without the need for subscribing to any overflow events.
> However, it is not possible to subscribe for notifications when such
> records are logged into the ring buffer. Introduce IOC_COUNT_RECORDS as
> a way to request this.
>
> With IOC_COUNT_RECORDS set, IOC_REFRESH takes the number of records
> after which to generate a notification, rather than the number of
> overflow events.

Can you take a look at tools/perf/python/twatch.py?

[acme@jouet linux]$ make O=/tmp/build/perf -C tools/perf install-bin
[root@jouet linux]# export PYTHONPATH=/tmp/build/perf/python/
[root@jouet linux]# python tools/perf/python/twatch.py
cpu: 0, pid: 29860, tid: 29860 { type: exit, pid: 29860, ppid: 29860, tid: 29860, ptid: 29860, time: 117729363047027}
cpu: 0, pid: 29854, tid: 29854 { type: exit, pid: 29854, ppid: 29854, tid: 29854, ptid: 29854, time: 117729363617885}
cpu: 0, pid: 29853, tid: 29853 { type: fork, pid: 29865, ppid: 29853, tid: 29865, ptid: 29853, time: 117729363800225}
cpu: 0, pid: 29865, tid: 29865 { type: comm, pid: 29865, tid: 29865, comm: fixdep }
cpu: 0, pid: 29865, tid: 29865 { type: exit, pid: 29865, ppid: 29865, tid: 29865, ptid: 29865, time: 117729364898505}
cpu: 0, pid: 29853, tid: 29853 { type: fork, pid: 29866, ppid: 29853, tid: 29866, ptid: 29853, time: 117729365022416}
cpu: 0, pid: 29866, tid: 29866 { type: comm, pid: 29866, tid: 29866, comm: rm }
cpu: 0, pid: 29866, tid: 29866 { type: exit, pid: 29866, ppid: 29866, tid: 29866, ptid: 29866, time: 117729365665831}
cpu: 0, pid: 29853, tid: 29853 { type: fork, pid: 29867, ppid: 29853, tid: 29867, ptid: 29853, time: 117729365846030}
cpu: 0, pid: 29867, tid: 29867 { type: comm, pid: 29867, tid: 29867, comm: mv }
cpu: 2, pid: 28218, tid: 28218 { type: exit, pid: 28218, ppid: 28218, tid: 28218, ptid: 28218, time: 117729704900029}
^CTraceback (most recent call last):
  File "tools/perf/python/twatch.py", line 68, in <module>
    main()
  File "tools/perf/python/twatch.py", line 40, in main
    evlist.poll(timeout = -1)
KeyboardInterrupt
[root@jouet linux]#

This uses the python binding to get notifications for such meta
events "synchronously". You can do the same with a C proggie, of course,
using just what we have in the kernel already.

See its changelog comments for examples:

git log tools/perf/python/twatch.py

For instance, what I think you want is in:

[acme@jouet linux]$ git log --oneline -1 cfeb1d90a1b1db96383b48888cb7a5f10ca12e12
cfeb1d90a1b1 perf python: Use attr.watermark in twatch.py
[acme@jouet linux]$ git log --oneline -1 58b32c1b538f2d197ce385d6a314e83f8b787021
58b32c1b538f perf python: Make twatch.py use soft dummy event, freq=0
[acme@jouet linux]$

- Arnaldo

> Signed-off-by: Naveen N. Rao <naveen.n.rao@xxxxxxxxxxxxxxxxxx>
> ---
> include/linux/perf_event.h | 1 +
> include/uapi/linux/perf_event.h | 1 +
> kernel/events/core.c | 16 +++++++++++++++-
> kernel/events/ring_buffer.c | 9 +++++++++
> 4 files changed, 26 insertions(+), 1 deletion(-)
>
> diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
> index 24a635887f28..016f2da2bba7 100644
> --- a/include/linux/perf_event.h
> +++ b/include/linux/perf_event.h
> @@ -683,6 +683,7 @@ struct perf_event {
> struct irq_work pending;
>
> atomic_t event_limit;
> + bool count_records;
>
> /* address range filters */
> struct perf_addr_filters_head addr_filters;
> diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
> index b1c0b187acfe..fb989ac71ded 100644
> --- a/include/uapi/linux/perf_event.h
> +++ b/include/uapi/linux/perf_event.h
> @@ -408,6 +408,7 @@ struct perf_event_attr {
> #define PERF_EVENT_IOC_ID _IOR('$', 7, __u64 *)
> #define PERF_EVENT_IOC_SET_BPF _IOW('$', 8, __u32)
> #define PERF_EVENT_IOC_PAUSE_OUTPUT _IOW('$', 9, __u32)
> +#define PERF_EVENT_IOC_COUNT_RECORDS _IO ('$', 10)
>
> enum perf_event_ioc_flags {
> PERF_IOC_FLAG_GROUP = 1U << 0,
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index 6e75a5c9412d..637064880b36 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -2674,6 +2674,16 @@ void perf_event_addr_filters_sync(struct perf_event *event)
> }
> EXPORT_SYMBOL_GPL(perf_event_addr_filters_sync);
>
> +static int _perf_event_count_records(struct perf_event *event)
> +{
> + if (event->attr.inherit || !is_sampling_event(event))
> + return -EINVAL;
> +
> + event->count_records = 1;
> +
> + return 0;
> +}
> +
> static int _perf_event_refresh(struct perf_event *event, int refresh)
> {
> /*
> @@ -4699,6 +4709,9 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon
> func = _perf_event_reset;
> break;
>
> + case PERF_EVENT_IOC_COUNT_RECORDS:
> + return _perf_event_count_records(event);
> +
> case PERF_EVENT_IOC_REFRESH:
> return _perf_event_refresh(event, arg);
>
> @@ -7342,7 +7355,8 @@ static int __perf_event_overflow(struct perf_event *event,
> */
>
> event->pending_kill = POLL_IN;
> - if (events && atomic_dec_and_test(&event->event_limit)) {
> + if (events && !event->count_records &&
> + atomic_dec_and_test(&event->event_limit)) {
> ret = 1;
> event->pending_kill = POLL_HUP;
>
> diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
> index 2831480c63a2..9b9ca0608fed 100644
> --- a/kernel/events/ring_buffer.c
> +++ b/kernel/events/ring_buffer.c
> @@ -126,6 +126,7 @@ __perf_output_begin(struct perf_output_handle *handle,
> u64 id;
> u64 lost;
> } lost_event;
> + int events = atomic_read(&event->event_limit);
>
> rcu_read_lock();
> /*
> @@ -197,6 +198,14 @@ __perf_output_begin(struct perf_output_handle *handle,
> if (unlikely(head - local_read(&rb->wakeup) > rb->watermark))
> local_add(rb->watermark, &rb->wakeup);
>
> + if (events && event->count_records &&
> + atomic_dec_and_test(&event->event_limit)) {
> + event->pending_kill = POLL_HUP;
> + local_inc(&rb->wakeup);
> +
> + perf_event_disable_inatomic(event);
> + }
> +
> page_shift = PAGE_SHIFT + page_order(rb);
>
> handle->page = (offset >> page_shift) & (rb->nr_pages - 1);
> --
> 2.12.2