[PATCH RFC 2/5] perf: Add fork to the sideband ioctl

From: Adrian Hunter
Date: Fri Apr 14 2023 - 04:23:43 EST


Support the case of output to an active event, returning an error if output
is not possible — that is, if the event is not in the ACTIVE state, does not
match its filters, or is not running on the current CPU. Set
PERF_RECORD_MISC_STATUS_ONLY to differentiate the ioctl status-only sideband
event from a "real" sideband event.

For the status-only event, set the fork parent pid/tid to the real parent
when the task is a thread group leader, or to the thread group leader
otherwise.

Signed-off-by: Adrian Hunter <adrian.hunter@xxxxxxxxx>
---
kernel/events/core.c | 88 ++++++++++++++++++++++++++++++++++++--------
1 file changed, 73 insertions(+), 15 deletions(-)

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 5cbcc6851587..4e76596d3bfb 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -7948,6 +7948,54 @@ perf_iterate_sb(perf_iterate_f output, void *data,
rcu_read_unlock();
}

+typedef int (perf_output_f)(struct perf_event *event, void *data);
+
+static int perf_event_output_sb(struct perf_event *event, perf_output_f output, void *data)
+{
+ int err = -ENOENT;
+
+ preempt_disable();
+
+ if (event->state != PERF_EVENT_STATE_ACTIVE ||
+ !event_filter_match(event) ||
+ READ_ONCE(event->oncpu) != smp_processor_id())
+ goto out;
+
+ err = output(event, data);
+out:
+ preempt_enable();
+ return err;
+}
+
+struct perf_output_f_data {
+ perf_output_f *func;
+ void *data;
+};
+
+void perf_output_f_wrapper(struct perf_event *event, void *data)
+{
+ struct perf_output_f_data *f_data = data;
+
+ f_data->func(event, f_data->data);
+}
+
+static int perf_output_sb(perf_output_f output, void *data,
+ struct perf_event_context *task_ctx,
+ struct perf_event *event)
+{
+ struct perf_output_f_data f_data = {
+ .func = output,
+ .data = data,
+ };
+
+ if (event)
+ return perf_event_output_sb(event, output, data);
+
+ perf_iterate_sb(perf_output_f_wrapper, &f_data, task_ctx);
+
+ return 0;
+}
+
/*
* Clear all file-based filters at exec, they'll have to be
* re-instated when/if these objects are mmapped again.
@@ -8107,8 +8155,7 @@ static int perf_event_task_match(struct perf_event *event)
event->attr.task;
}

-static void perf_event_task_output(struct perf_event *event,
- void *data)
+static int perf_event_task_output(struct perf_event *event, void *data)
{
struct perf_task_event *task_event = data;
struct perf_output_handle handle;
@@ -8117,7 +8164,7 @@ static void perf_event_task_output(struct perf_event *event,
int ret, size = task_event->event_id.header.size;

if (!perf_event_task_match(event))
- return;
+ return -ENOENT;

perf_event_header__init_id(&task_event->event_id.header, &sample, event);

@@ -8134,6 +8181,14 @@ static void perf_event_task_output(struct perf_event *event,
task->real_parent);
task_event->event_id.ptid = perf_event_pid(event,
task->real_parent);
+ } else if (task_event->event_id.header.misc & PERF_RECORD_MISC_STATUS_ONLY) {
+ if (thread_group_leader(task)) {
+ task_event->event_id.ppid = perf_event_pid(event, task->real_parent);
+ task_event->event_id.ptid = perf_event_tid(event, task->real_parent);
+ } else {
+ task_event->event_id.ppid = perf_event_pid(event, task);
+ task_event->event_id.ptid = perf_event_pid(event, task);
+ }
} else { /* PERF_RECORD_FORK */
task_event->event_id.ppid = perf_event_pid(event, current);
task_event->event_id.ptid = perf_event_tid(event, current);
@@ -8148,18 +8203,19 @@ static void perf_event_task_output(struct perf_event *event,
perf_output_end(&handle);
out:
task_event->event_id.header.size = size;
+ return ret;
}

-static void perf_event_task(struct task_struct *task,
- struct perf_event_context *task_ctx,
- int new)
+static int perf_event_task(struct task_struct *task,
+ struct perf_event_context *task_ctx,
+ int new, struct perf_event *event)
{
struct perf_task_event task_event;

if (!atomic_read(&nr_comm_events) &&
!atomic_read(&nr_mmap_events) &&
!atomic_read(&nr_task_events))
- return;
+ return -ENOENT;

task_event = (struct perf_task_event){
.task = task,
@@ -8167,7 +8223,7 @@ static void perf_event_task(struct task_struct *task,
.event_id = {
.header = {
.type = new ? PERF_RECORD_FORK : PERF_RECORD_EXIT,
- .misc = 0,
+ .misc = event ? PERF_RECORD_MISC_STATUS_ONLY : 0,
.size = sizeof(task_event.event_id),
},
/* .pid */
@@ -8178,14 +8234,12 @@ static void perf_event_task(struct task_struct *task,
},
};

- perf_iterate_sb(perf_event_task_output,
- &task_event,
- task_ctx);
+ return perf_output_sb(perf_event_task_output, &task_event, task_ctx, event);
}

void perf_event_fork(struct task_struct *task)
{
- perf_event_task(task, NULL, 1);
+ perf_event_task(task, NULL, 1, NULL);
perf_event_namespaces(task);
}

@@ -12817,7 +12871,11 @@ EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);

static int perf_event_emit_fork(struct perf_event *event, struct task_struct *task)
{
- return -EINVAL;
+ if (!event->attr.comm && !event->attr.mmap && !event->attr.mmap2 &&
+ !event->attr.mmap_data && !event->attr.task)
+ return -EINVAL;
+
+ return perf_event_task(task, NULL, 1, event);
}

static int perf_event_emit_namespaces(struct perf_event *event, struct task_struct *task)
@@ -13115,7 +13173,7 @@ static void perf_event_exit_task_context(struct task_struct *child)
* won't get any samples after PERF_RECORD_EXIT. We can however still
* get a few PERF_RECORD_READ events.
*/
- perf_event_task(child, child_ctx, 0);
+ perf_event_task(child, child_ctx, 0, NULL);

list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry)
perf_event_exit_event(child_event, child_ctx);
@@ -13157,7 +13215,7 @@ void perf_event_exit_task(struct task_struct *child)
* child contexts and sets child->perf_event_ctxp[] to NULL.
* At this point we need to send EXIT events to cpu contexts.
*/
- perf_event_task(child, NULL, 0);
+ perf_event_task(child, NULL, 0, NULL);
}

static void perf_free_event(struct perf_event *event,
--
2.34.1