[PATCH RFC v3 05/13] perf/core: Add *group_leader for perf_event_create_kernel_counter()

From: Dapeng Mi
Date: Tue Aug 22 2023 - 01:05:01 EST


Add a new *group_leader argument to
perf_event_create_kernel_counter(), so that group events can be
created from kernel space just as they can be from user space.

The current perf logic requires a perf event group to be created in order
to handle topdown metrics profiling. To support the topdown metrics
feature in KVM, kernel space also needs the capability to create group
events.

Co-developed-by: Like Xu <likexu@xxxxxxxxxxx>
Signed-off-by: Like Xu <likexu@xxxxxxxxxxx>
Signed-off-by: Dapeng Mi <dapeng1.mi@xxxxxxxxxxxxxxx>
---
arch/x86/kernel/cpu/resctrl/pseudo_lock.c | 4 ++--
arch/x86/kvm/pmu.c | 2 +-
arch/x86/kvm/vmx/pmu_intel.c | 4 ++--
include/linux/perf_event.h | 1 +
kernel/events/core.c | 17 ++++++++++++++++-
kernel/events/hw_breakpoint.c | 4 ++--
kernel/events/hw_breakpoint_test.c | 2 +-
kernel/watchdog_perf.c | 2 +-
8 files changed, 26 insertions(+), 10 deletions(-)

diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
index 458cb7419502..6494b2701204 100644
--- a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
+++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
@@ -952,12 +952,12 @@ static int measure_residency_fn(struct perf_event_attr *miss_attr,
u64 tmp;

miss_event = perf_event_create_kernel_counter(miss_attr, plr->cpu,
- NULL, NULL, NULL);
+ NULL, NULL, NULL, NULL);
if (IS_ERR(miss_event))
goto out;

hit_event = perf_event_create_kernel_counter(hit_attr, plr->cpu,
- NULL, NULL, NULL);
+ NULL, NULL, NULL, NULL);
if (IS_ERR(hit_event))
goto out_miss;

diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index edb89b51b383..760d293f4a4a 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -221,7 +221,7 @@ static int pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, u64 config,
attr.precise_ip = pmc_get_pebs_precise_level(pmc);
}

- event = perf_event_create_kernel_counter(&attr, -1, current,
+ event = perf_event_create_kernel_counter(&attr, -1, current, NULL,
kvm_perf_overflow, pmc);
if (IS_ERR(event)) {
pr_debug_ratelimited("kvm_pmu: event creation failed %ld for pmc->idx = %d\n",
diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
index 044d61aa63dc..9bf80fee34fb 100644
--- a/arch/x86/kvm/vmx/pmu_intel.c
+++ b/arch/x86/kvm/vmx/pmu_intel.c
@@ -302,8 +302,8 @@ int intel_pmu_create_guest_lbr_event(struct kvm_vcpu *vcpu)
return 0;
}

- event = perf_event_create_kernel_counter(&attr, -1,
- current, NULL, NULL);
+ event = perf_event_create_kernel_counter(&attr, -1, current,
+ NULL, NULL, NULL);
if (IS_ERR(event)) {
pr_debug_ratelimited("%s: failed %ld\n",
__func__, PTR_ERR(event));
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 2166a69e3bf2..c182f811f5f8 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1102,6 +1102,7 @@ extern struct perf_event *
perf_event_create_kernel_counter(struct perf_event_attr *attr,
int cpu,
struct task_struct *task,
+ struct perf_event *group_leader,
perf_overflow_handler_t callback,
void *context);
extern void perf_pmu_migrate_context(struct pmu *pmu,
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 15eb82d1a010..a3af2e740dea 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -12754,12 +12754,14 @@ SYSCALL_DEFINE5(perf_event_open,
* @attr: attributes of the counter to create
* @cpu: cpu in which the counter is bound
* @task: task to profile (NULL for percpu)
+ * @group_leader: the group leader event of the created event
* @overflow_handler: callback to trigger when we hit the event
* @context: context data could be used in overflow_handler callback
*/
struct perf_event *
perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
struct task_struct *task,
+ struct perf_event *group_leader,
perf_overflow_handler_t overflow_handler,
void *context)
{
@@ -12767,6 +12769,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
struct perf_event_context *ctx;
struct perf_event *event;
struct pmu *pmu;
+ int move_group = 0;
int err;

/*
@@ -12776,7 +12779,11 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
if (attr->aux_output)
return ERR_PTR(-EINVAL);

- event = perf_event_alloc(attr, cpu, task, NULL, NULL,
+ if (task && group_leader &&
+ group_leader->attr.inherit != attr->inherit)
+ return ERR_PTR(-EINVAL);
+
+ event = perf_event_alloc(attr, cpu, task, group_leader, NULL,
overflow_handler, context, -1);
if (IS_ERR(event)) {
err = PTR_ERR(event);
@@ -12806,6 +12813,11 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
goto err_unlock;
}

+ err = perf_event_group_leader_check(group_leader, event, attr, ctx,
+ &pmu, &move_group);
+ if (err)
+ goto err_unlock;
+
pmu_ctx = find_get_pmu_context(pmu, ctx, event);
if (IS_ERR(pmu_ctx)) {
err = PTR_ERR(pmu_ctx);
@@ -12833,6 +12845,9 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
goto err_pmu_ctx;
}

+ if (move_group)
+ perf_event_move_group(group_leader, pmu_ctx, ctx);
+
perf_install_in_context(ctx, event, event->cpu);
perf_unpin_context(ctx);
mutex_unlock(&ctx->mutex);
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index c3797701339c..65b5b1421e62 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -771,7 +771,7 @@ register_user_hw_breakpoint(struct perf_event_attr *attr,
void *context,
struct task_struct *tsk)
{
- return perf_event_create_kernel_counter(attr, -1, tsk, triggered,
+ return perf_event_create_kernel_counter(attr, -1, tsk, NULL, triggered,
context);
}
EXPORT_SYMBOL_GPL(register_user_hw_breakpoint);
@@ -881,7 +881,7 @@ register_wide_hw_breakpoint(struct perf_event_attr *attr,

cpus_read_lock();
for_each_online_cpu(cpu) {
- bp = perf_event_create_kernel_counter(attr, cpu, NULL,
+ bp = perf_event_create_kernel_counter(attr, cpu, NULL, NULL,
triggered, context);
if (IS_ERR(bp)) {
err = PTR_ERR(bp);
diff --git a/kernel/events/hw_breakpoint_test.c b/kernel/events/hw_breakpoint_test.c
index 2cfeeecf8de9..694db7645676 100644
--- a/kernel/events/hw_breakpoint_test.c
+++ b/kernel/events/hw_breakpoint_test.c
@@ -39,7 +39,7 @@ static struct perf_event *register_test_bp(int cpu, struct task_struct *tsk, int
attr.bp_addr = (unsigned long)&break_vars[idx];
attr.bp_len = HW_BREAKPOINT_LEN_1;
attr.bp_type = HW_BREAKPOINT_RW;
- return perf_event_create_kernel_counter(&attr, cpu, tsk, NULL, NULL);
+ return perf_event_create_kernel_counter(&attr, cpu, tsk, NULL, NULL, NULL);
}

static void unregister_test_bp(struct perf_event **bp)
diff --git a/kernel/watchdog_perf.c b/kernel/watchdog_perf.c
index 8ea00c4a24b2..f8a52c4df079 100644
--- a/kernel/watchdog_perf.c
+++ b/kernel/watchdog_perf.c
@@ -120,7 +120,7 @@ static int hardlockup_detector_event_create(void)
wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh);

/* Try to register using hardware perf events */
- evt = perf_event_create_kernel_counter(wd_attr, cpu, NULL,
+ evt = perf_event_create_kernel_counter(wd_attr, cpu, NULL, NULL,
watchdog_overflow_callback, NULL);
if (IS_ERR(evt)) {
pr_debug("Perf event create on CPU %d failed with %ld\n", cpu,
--
2.34.1