[Patch v4 05/13] perf/core: Add *group_leader for perf_event_create_kernel_counter()

From: Dapeng Mi
Date: Wed Sep 27 2023 - 00:26:23 EST


Add a new argument, *group_leader, to
perf_event_create_kernel_counter(), so that group events can be
created from kernel space just as they can be from user space.

The current perf logic requires that a perf event group be created to
handle topdown metrics profiling. To support the topdown metrics feature
in KVM, kernel space also needs the capability to create group events.

Co-developed-by: Like Xu <likexu@xxxxxxxxxxx>
Signed-off-by: Like Xu <likexu@xxxxxxxxxxx>
Signed-off-by: Dapeng Mi <dapeng1.mi@xxxxxxxxxxxxxxx>
---
arch/arm64/kvm/pmu-emul.c | 2 +-
arch/riscv/kvm/vcpu_pmu.c | 2 +-
arch/x86/kernel/cpu/resctrl/pseudo_lock.c | 4 ++--
arch/x86/kvm/pmu.c | 2 +-
arch/x86/kvm/vmx/pmu_intel.c | 4 ++--
include/linux/perf_event.h | 1 +
kernel/events/core.c | 17 ++++++++++++++++-
kernel/events/hw_breakpoint.c | 4 ++--
kernel/events/hw_breakpoint_test.c | 2 +-
kernel/watchdog_perf.c | 2 +-
10 files changed, 28 insertions(+), 12 deletions(-)

diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c
index 6b066e04dc5d..d31dd91dc081 100644
--- a/arch/arm64/kvm/pmu-emul.c
+++ b/arch/arm64/kvm/pmu-emul.c
@@ -631,7 +631,7 @@ static void kvm_pmu_create_perf_event(struct kvm_pmc *pmc)

attr.sample_period = compute_period(pmc, kvm_pmu_get_pmc_value(pmc));

- event = perf_event_create_kernel_counter(&attr, -1, current,
+ event = perf_event_create_kernel_counter(&attr, -1, current, NULL,
kvm_pmu_perf_overflow, pmc);

if (IS_ERR(event)) {
diff --git a/arch/riscv/kvm/vcpu_pmu.c b/arch/riscv/kvm/vcpu_pmu.c
index 86391a5061dd..74075097cdaa 100644
--- a/arch/riscv/kvm/vcpu_pmu.c
+++ b/arch/riscv/kvm/vcpu_pmu.c
@@ -247,7 +247,7 @@ static int kvm_pmu_create_perf_event(struct kvm_pmc *pmc, struct perf_event_attr
*/
attr->sample_period = kvm_pmu_get_sample_period(pmc);

- event = perf_event_create_kernel_counter(attr, -1, current, NULL, pmc);
+ event = perf_event_create_kernel_counter(attr, -1, current, NULL, NULL, pmc);
if (IS_ERR(event)) {
pr_err("kvm pmu event creation failed for eidx %lx: %ld\n", eidx, PTR_ERR(event));
return PTR_ERR(event);
diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
index 8f559eeae08e..f84b4c8838a6 100644
--- a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
+++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
@@ -966,12 +966,12 @@ static int measure_residency_fn(struct perf_event_attr *miss_attr,
u64 tmp;

miss_event = perf_event_create_kernel_counter(miss_attr, plr->cpu,
- NULL, NULL, NULL);
+ NULL, NULL, NULL, NULL);
if (IS_ERR(miss_event))
goto out;

hit_event = perf_event_create_kernel_counter(hit_attr, plr->cpu,
- NULL, NULL, NULL);
+ NULL, NULL, NULL, NULL);
if (IS_ERR(hit_event))
goto out_miss;

diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index edb89b51b383..760d293f4a4a 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -221,7 +221,7 @@ static int pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, u64 config,
attr.precise_ip = pmc_get_pebs_precise_level(pmc);
}

- event = perf_event_create_kernel_counter(&attr, -1, current,
+ event = perf_event_create_kernel_counter(&attr, -1, current, NULL,
kvm_perf_overflow, pmc);
if (IS_ERR(event)) {
pr_debug_ratelimited("kvm_pmu: event creation failed %ld for pmc->idx = %d\n",
diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
index 044d61aa63dc..9bf80fee34fb 100644
--- a/arch/x86/kvm/vmx/pmu_intel.c
+++ b/arch/x86/kvm/vmx/pmu_intel.c
@@ -302,8 +302,8 @@ int intel_pmu_create_guest_lbr_event(struct kvm_vcpu *vcpu)
return 0;
}

- event = perf_event_create_kernel_counter(&attr, -1,
- current, NULL, NULL);
+ event = perf_event_create_kernel_counter(&attr, -1, current,
+ NULL, NULL, NULL);
if (IS_ERR(event)) {
pr_debug_ratelimited("%s: failed %ld\n",
__func__, PTR_ERR(event));
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index e85cd1c0eaf3..04e12a8e6584 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1102,6 +1102,7 @@ extern struct perf_event *
perf_event_create_kernel_counter(struct perf_event_attr *attr,
int cpu,
struct task_struct *task,
+ struct perf_event *group_leader,
perf_overflow_handler_t callback,
void *context);
extern void perf_pmu_migrate_context(struct pmu *pmu,
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 953e3d3a1664..3cc870d450c5 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -12743,12 +12743,14 @@ SYSCALL_DEFINE5(perf_event_open,
* @attr: attributes of the counter to create
* @cpu: cpu in which the counter is bound
* @task: task to profile (NULL for percpu)
+ * @group_leader: the group leader event of the created event
* @overflow_handler: callback to trigger when we hit the event
* @context: context data could be used in overflow_handler callback
*/
struct perf_event *
perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
struct task_struct *task,
+ struct perf_event *group_leader,
perf_overflow_handler_t overflow_handler,
void *context)
{
@@ -12756,6 +12758,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
struct perf_event_context *ctx;
struct perf_event *event;
struct pmu *pmu;
+ int move_group = 0;
int err;

/*
@@ -12765,7 +12768,11 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
if (attr->aux_output)
return ERR_PTR(-EINVAL);

- event = perf_event_alloc(attr, cpu, task, NULL, NULL,
+ if (task && group_leader &&
+ group_leader->attr.inherit != attr->inherit)
+ return ERR_PTR(-EINVAL);
+
+ event = perf_event_alloc(attr, cpu, task, group_leader, NULL,
overflow_handler, context, -1);
if (IS_ERR(event)) {
err = PTR_ERR(event);
@@ -12795,6 +12802,11 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
goto err_unlock;
}

+ err = perf_event_group_leader_check(group_leader, event, attr, ctx,
+ &pmu, &move_group);
+ if (err)
+ goto err_unlock;
+
pmu_ctx = find_get_pmu_context(pmu, ctx, event);
if (IS_ERR(pmu_ctx)) {
err = PTR_ERR(pmu_ctx);
@@ -12822,6 +12834,9 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
goto err_pmu_ctx;
}

+ if (move_group)
+ perf_event_move_group(group_leader, pmu_ctx, ctx);
+
perf_install_in_context(ctx, event, event->cpu);
perf_unpin_context(ctx);
mutex_unlock(&ctx->mutex);
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index 6c2cb4e4f48d..5e328a684a4d 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -743,7 +743,7 @@ register_user_hw_breakpoint(struct perf_event_attr *attr,
void *context,
struct task_struct *tsk)
{
- return perf_event_create_kernel_counter(attr, -1, tsk, triggered,
+ return perf_event_create_kernel_counter(attr, -1, tsk, NULL, triggered,
context);
}
EXPORT_SYMBOL_GPL(register_user_hw_breakpoint);
@@ -853,7 +853,7 @@ register_wide_hw_breakpoint(struct perf_event_attr *attr,

cpus_read_lock();
for_each_online_cpu(cpu) {
- bp = perf_event_create_kernel_counter(attr, cpu, NULL,
+ bp = perf_event_create_kernel_counter(attr, cpu, NULL, NULL,
triggered, context);
if (IS_ERR(bp)) {
err = PTR_ERR(bp);
diff --git a/kernel/events/hw_breakpoint_test.c b/kernel/events/hw_breakpoint_test.c
index 2cfeeecf8de9..694db7645676 100644
--- a/kernel/events/hw_breakpoint_test.c
+++ b/kernel/events/hw_breakpoint_test.c
@@ -39,7 +39,7 @@ static struct perf_event *register_test_bp(int cpu, struct task_struct *tsk, int
attr.bp_addr = (unsigned long)&break_vars[idx];
attr.bp_len = HW_BREAKPOINT_LEN_1;
attr.bp_type = HW_BREAKPOINT_RW;
- return perf_event_create_kernel_counter(&attr, cpu, tsk, NULL, NULL);
+ return perf_event_create_kernel_counter(&attr, cpu, tsk, NULL, NULL, NULL);
}

static void unregister_test_bp(struct perf_event **bp)
diff --git a/kernel/watchdog_perf.c b/kernel/watchdog_perf.c
index 8ea00c4a24b2..f8a52c4df079 100644
--- a/kernel/watchdog_perf.c
+++ b/kernel/watchdog_perf.c
@@ -120,7 +120,7 @@ static int hardlockup_detector_event_create(void)
wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh);

/* Try to register using hardware perf events */
- evt = perf_event_create_kernel_counter(wd_attr, cpu, NULL,
+ evt = perf_event_create_kernel_counter(wd_attr, cpu, NULL, NULL,
watchdog_overflow_callback, NULL);
if (IS_ERR(evt)) {
pr_debug("Perf event create on CPU %d failed with %ld\n", cpu,
--
2.34.1