Re: [patch] sched: beef up wake_wide()

From: Mike Galbraith
Date: Fri Jul 10 2015 - 01:19:39 EST


On Thu, 2015-07-09 at 15:26 +0200, Peter Zijlstra wrote:
> On Wed, Jul 08, 2015 at 08:13:46AM +0200, Mike Galbraith wrote:
> > static int wake_wide(struct task_struct *p)
> > {
> > + unsigned int waker_flips = current->wakee_flips;
> > + unsigned int wakee_flips = p->wakee_flips;
> > int factor = this_cpu_read(sd_llc_size);
> >
> > + if (waker_flips < wakee_flips)
> > + swap(waker_flips, wakee_flips);
>
> This makes the wakee/waker names useless, the end result is more like
> wakee_flips := client_flips, waker_flips := server_flips.

I settled on master/slave plus hopefully improved comment block.

> > + if (wakee_flips < factor || waker_flips < wakee_flips * factor)
> > + return 0;
>
> I don't get the first condition... why would the client ever flip? It
> only talks to that one server.

(tightening heuristic up a bit by one means or another would be good,
but "if it ain't broke, don't fix it" applies for this patchlet)

> > @@ -5021,14 +5015,17 @@ select_task_rq_fair(struct task_struct *
> > {
> > struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
> > int cpu = smp_processor_id();
> > + int new_cpu = prev_cpu;
> > int want_affine = 0;
> > int sync = wake_flags & WF_SYNC;
> >
> > rcu_read_lock();
> > + if (sd_flag & SD_BALANCE_WAKE) {
> > + want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
> > + if (!want_affine)
> > + goto select_idle;
> > + }
>
> So this preserves/makes worse the bug Morten spotted, even without
> want_affine we should still attempt SD_BALANCE_WAKE if set.

Fixed. wake_wide() may override want_affine as before, want_affine may
override other ->flags as before, but a surviving domain selection now
results in a full balance instead of a select_idle_sibling() call.

sched: beef up wake_wide()

Josef Bacik reported that Facebook sees better performance with their
1:N load (1 dispatch/node, N workers/node) when carrying an old patch
to try very hard to wake to an idle CPU. While looking at wake_wide(),
I noticed that it doesn't pay attention to the wakeup of a many partner
waker, returning 1 only when waking one of its many partners.

Correct that, letting explicit domain flags override the heuristic.

While at it, adjust task_struct bits, we don't need a 64bit counter.

Signed-off-by: Mike Galbraith <umgwanakikbuti@xxxxxxxxx>
Tested-by: Josef Bacik <jbacik@xxxxxx>
---
include/linux/sched.h | 4 +--
kernel/sched/fair.c | 57 ++++++++++++++++++++++----------------------------
2 files changed, 28 insertions(+), 33 deletions(-)

--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1351,9 +1351,9 @@ struct task_struct {
#ifdef CONFIG_SMP
struct llist_node wake_entry;
int on_cpu;
- struct task_struct *last_wakee;
- unsigned long wakee_flips;
+ unsigned int wakee_flips;
unsigned long wakee_flip_decay_ts;
+ struct task_struct *last_wakee;

int wake_cpu;
#endif
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4730,26 +4730,29 @@ static long effective_load(struct task_g

#endif

+/*
+ * Detect M:N waker/wakee relationships via a switching-frequency heuristic.
+ * A waker of many should wake a different task than the one last awakened
+ * at a frequency roughly N times higher than one of its wakees. In order
+ * to determine whether we should let the load spread vs consolodating to
+ * shared cache, we look for a minimum 'flip' frequency of llc_size in one
+ * partner, and a factor of lls_size higher frequency in the other. With
+ * both conditions met, we can be relatively sure that the relationship is
+ * non-monogamous, with partner count exceeding socket size. Waker/wakee
+ * being client/server, worker/dispatcher, interrupt source or whatever is
+ * irrelevant, spread criteria is apparent partner count exceeds socket size.
+ */
static int wake_wide(struct task_struct *p)
{
+ unsigned int master = current->wakee_flips;
+ unsigned int slave = p->wakee_flips;
int factor = this_cpu_read(sd_llc_size);

- /*
- * Yeah, it's the switching-frequency, could means many wakee or
- * rapidly switch, use factor here will just help to automatically
- * adjust the loose-degree, so bigger node will lead to more pull.
- */
- if (p->wakee_flips > factor) {
- /*
- * wakee is somewhat hot, it needs certain amount of cpu
- * resource, so if waker is far more hot, prefer to leave
- * it alone.
- */
- if (current->wakee_flips > (factor * p->wakee_flips))
- return 1;
- }
-
- return 0;
+ if (master < slave)
+ swap(master, slave);
+ if (slave < factor || master < slave * factor)
+ return 0;
+ return 1;
}

static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
@@ -4761,13 +4764,6 @@ static int wake_affine(struct sched_doma
unsigned long weight;
int balanced;

- /*
- * If we wake multiple tasks be careful to not bounce
- * ourselves around too much.
- */
- if (wake_wide(p))
- return 0;
-
idx = sd->wake_idx;
this_cpu = smp_processor_id();
prev_cpu = task_cpu(p);
@@ -5021,12 +5017,12 @@ select_task_rq_fair(struct task_struct *
{
struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
int cpu = smp_processor_id();
- int new_cpu = cpu;
+ int new_cpu = prev_cpu;
int want_affine = 0;
int sync = wake_flags & WF_SYNC;

if (sd_flag & SD_BALANCE_WAKE)
- want_affine = cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
+ want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, tsk_cpus_allowed(p));

rcu_read_lock();
for_each_domain(cpu, tmp) {
@@ -5040,6 +5036,8 @@ select_task_rq_fair(struct task_struct *
if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
affine_sd = tmp;
+ /* Prefer affinity over any other flags */
+ sd = NULL;
break;
}

@@ -5048,12 +5046,10 @@ select_task_rq_fair(struct task_struct *
}

if (affine_sd && cpu != prev_cpu && wake_affine(affine_sd, p, sync))
- prev_cpu = cpu;
+ new_cpu = cpu;

- if (sd_flag & SD_BALANCE_WAKE) {
- new_cpu = select_idle_sibling(p, prev_cpu);
- goto unlock;
- }
+ if ((sd_flag & SD_BALANCE_WAKE) && (!sd || (!(sd->flags & SD_BALANCE_WAKE))))
+ new_cpu = select_idle_sibling(p, new_cpu);

while (sd) {
struct sched_group *group;
@@ -5089,7 +5085,6 @@ select_task_rq_fair(struct task_struct *
}
/* while loop will break here if sd == NULL */
}
-unlock:
rcu_read_unlock();

return new_cpu;


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/