Re: [PATCH 02/10] perf/x86: Improve HT workaround GP counter constraint

From: Stephane Eranian
Date: Fri May 22 2015 - 07:24:58 EST


On Fri, May 22, 2015 at 4:21 AM, Peter Zijlstra <peterz@xxxxxxxxxxxxx> wrote:
> On Fri, May 22, 2015 at 03:04:45AM -0700, Stephane Eranian wrote:
>> > + if (is_ht_workaround_enabled() &&
>> > + sched->state.nr_gp_counters++ >= x86_pmu.num_counters / 2)
>> > + return false;
>> > +
>>
>> Has to be > and not >= otherwise:
>
> but it's a post-increment, so the comparison sees 0, 1, 2, ... With > we'd
> only match after 3 gp events.
>
> I'll agree it's not working right though.
>
> FWIW, I currently have the below; which also isn't working right.
>
> It should not enforce the limit when there's no exclusive events being
> scheduled.
>
> It also doesn't break uncore scheduling.
>
> ---
> arch/x86/kernel/cpu/perf_event.c | 31 ++++++++++++++++++++++----
> arch/x86/kernel/cpu/perf_event.h | 10 +++++---
> arch/x86/kernel/cpu/perf_event_intel.c | 28 ++++++-----------------
> arch/x86/kernel/cpu/perf_event_intel_uncore.c | 2 -
> 4 files changed, 43 insertions(+), 28 deletions(-)
>
> --- a/arch/x86/kernel/cpu/perf_event.c
> +++ b/arch/x86/kernel/cpu/perf_event.c
> @@ -611,6 +611,7 @@ struct sched_state {
> int event; /* event index */
> int counter; /* counter index */
> int unassigned; /* number of events to be assigned left */
> + int nr_gp; /* number of GP counters used */
> unsigned long used[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
> };
>
> @@ -620,6 +621,7 @@ struct sched_state {
> struct perf_sched {
> int max_weight;
> int max_events;
> + int max_gp;
> struct event_constraint **constraints;
> struct sched_state state;
> int saved_states;
> @@ -630,13 +632,14 @@ struct perf_sched {
> * Initialize interator that runs through all events and counters.
> */
> static void perf_sched_init(struct perf_sched *sched, struct event_constraint **constraints,
> - int num, int wmin, int wmax)
> + int num, int wmin, int wmax, int gpmax)
> {
> int idx;
>
> memset(sched, 0, sizeof(*sched));
> sched->max_events = num;
> sched->max_weight = wmax;
> + sched->max_gp = gpmax;
> sched->constraints = constraints;
>
> for (idx = 0; idx < num; idx++) {
> @@ -696,6 +699,10 @@ static bool __perf_sched_find_counter(st
> goto done;
> }
> }
> +
> + if (sched->state.nr_gp++ >= sched->max_gp)
> + return false;
> +
> /* Grab the first unused counter starting with idx */
> idx = sched->state.counter;
> for_each_set_bit_from(idx, c->idxmsk, INTEL_PMC_IDX_FIXED) {
> @@ -757,11 +764,11 @@ static bool perf_sched_next_event(struct
> * Assign a counter for each event.
> */
> int perf_assign_events(struct event_constraint **constraints, int n,
> - int wmin, int wmax, int *assign)
> + int wmin, int wmax, int gpmax, int *assign)
> {
> struct perf_sched sched;
>
> - perf_sched_init(&sched, constraints, n, wmin, wmax);
> + perf_sched_init(&sched, constraints, n, wmin, wmax, gpmax);
>
> do {
> if (!perf_sched_find_counter(&sched))
> @@ -821,8 +828,24 @@ int x86_schedule_events(struct cpu_hw_ev
>
> /* slow path */
> if (i != n) {
> + int gpmax = x86_pmu.num_counters / 2;
> +
> + /*
> + * Do not allow scheduling of more than half the available
> + * generic counters.
> + *
> + * This helps avoid counter starvation of the sibling thread by
> + * ensuring that at most half the counters can be in exclusive
> + * mode. There are no designated counters for the limit. Any
> + * N/2 counters can be used. This helps with events with
> + * specific counter constraints.
> + */
> + if (is_ht_workaround_enabled() && !cpuc->is_fake &&
> + READ_ONCE(cpuc->excl_cntrs->exclusive_present))
> + gpmax /= 2;
> +
That's num_counters / 4! gpmax is already initialized to
x86_pmu.num_counters / 2 above, and it is halved again here.
I think you meant: int gpmax = x86_pmu.num_counters;

> unsched = perf_assign_events(cpuc->event_constraint, n, wmin,
> - wmax, assign);
> + wmax, gpmax, assign);
> }
>
> /*
> --- a/arch/x86/kernel/cpu/perf_event.h
> +++ b/arch/x86/kernel/cpu/perf_event.h
> @@ -134,8 +134,6 @@ enum intel_excl_state_type {
> struct intel_excl_states {
> enum intel_excl_state_type init_state[X86_PMC_IDX_MAX];
> enum intel_excl_state_type state[X86_PMC_IDX_MAX];
> - int num_alloc_cntrs;/* #counters allocated */
> - int max_alloc_cntrs;/* max #counters allowed */
> bool sched_started; /* true if scheduling has started */
> };
>
> @@ -144,6 +142,11 @@ struct intel_excl_cntrs {
>
> struct intel_excl_states states[2];
>
> + union {
> + u16 has_exclusive[2];
> + u32 exclusive_present;
> + };
> +
> int refcnt; /* per-core: #HT threads */
> unsigned core_id; /* per-core: core id */
> };
> @@ -176,6 +179,7 @@ struct cpu_hw_events {
> struct perf_event *event_list[X86_PMC_IDX_MAX]; /* in enabled order */
> struct event_constraint *event_constraint[X86_PMC_IDX_MAX];
>
> + int n_excl; /* the number of exclusive events */
>
> unsigned int group_flag;
> int is_fake;
> @@ -719,7 +723,7 @@ static inline void __x86_pmu_enable_even
> void x86_pmu_enable_all(int added);
>
> int perf_assign_events(struct event_constraint **constraints, int n,
> - int wmin, int wmax, int *assign);
> + int wmin, int wmax, int gpmax, int *assign);
> int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign);
>
> void x86_pmu_stop(struct perf_event *event, int flags);
> --- a/arch/x86/kernel/cpu/perf_event_intel.c
> +++ b/arch/x86/kernel/cpu/perf_event_intel.c
> @@ -1923,7 +1923,6 @@ intel_start_scheduling(struct cpu_hw_eve
> xl = &excl_cntrs->states[tid];
>
> xl->sched_started = true;
> - xl->num_alloc_cntrs = 0;
> /*
> * lock shared state until we are done scheduling
> * in stop_event_scheduling()
> @@ -2000,6 +1999,10 @@ intel_get_excl_constraints(struct cpu_hw
> * across HT threads
> */
> is_excl = c->flags & PERF_X86_EVENT_EXCL;
> + if (is_excl) {
> + if (!cpuc->n_excl++)
> + WRITE_ONCE(excl_cntrs->has_exclusive[tid], 1);
> + }
>
> /*
> * xl = state of current HT
> @@ -2008,18 +2011,6 @@ intel_get_excl_constraints(struct cpu_hw
> xl = &excl_cntrs->states[tid];
> xlo = &excl_cntrs->states[o_tid];
>
> - /*
> - * do not allow scheduling of more than max_alloc_cntrs
> - * which is set to half the available generic counters.
> - * this helps avoid counter starvation of sibling thread
> - * by ensuring at most half the counters cannot be in
> - * exclusive mode. There is not designated counters for the
> - * limits. Any N/2 counters can be used. This helps with
> - * events with specifix counter constraints
> - */
> - if (xl->num_alloc_cntrs++ == xl->max_alloc_cntrs)
> - return &emptyconstraint;
> -
> cx = c;
>
> /*
> @@ -2150,6 +2141,10 @@ static void intel_put_excl_constraints(s
>
> xl = &excl_cntrs->states[tid];
> xlo = &excl_cntrs->states[o_tid];
> + if (hwc->flags & PERF_X86_EVENT_EXCL) {
> + if (!--cpuc->n_excl)
> + WRITE_ONCE(excl_cntrs->has_exclusive[tid], 0);
> + }
>
> /*
> * put_constraint may be called from x86_schedule_events()
> @@ -2632,8 +2627,6 @@ static void intel_pmu_cpu_starting(int c
> cpuc->lbr_sel = &cpuc->shared_regs->regs[EXTRA_REG_LBR];
>
> if (x86_pmu.flags & PMU_FL_EXCL_CNTRS) {
> - int h = x86_pmu.num_counters >> 1;
> -
> for_each_cpu(i, topology_thread_cpumask(cpu)) {
> struct intel_excl_cntrs *c;
>
> @@ -2647,11 +2640,6 @@ static void intel_pmu_cpu_starting(int c
> }
> cpuc->excl_cntrs->core_id = core_id;
> cpuc->excl_cntrs->refcnt++;
> - /*
> - * set hard limit to half the number of generic counters
> - */
> - cpuc->excl_cntrs->states[0].max_alloc_cntrs = h;
> - cpuc->excl_cntrs->states[1].max_alloc_cntrs = h;
> }
> }
>
> --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c
> +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
> @@ -394,7 +394,7 @@ static int uncore_assign_events(struct i
> /* slow path */
> if (i != n)
> ret = perf_assign_events(box->event_constraint, n,
> - wmin, wmax, assign);
> + wmin, wmax, n, assign);
>
> if (!assign || ret) {
> for (i = 0; i < n; i++)
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/