Re: 20% performance drop on PostgreSQL 9.2 from kernel 3.5.3 to 3.6-rc5 on AMD chipsets - bisected

From: Peter Zijlstra
Date: Mon Sep 24 2012 - 13:45:02 EST


On Mon, 2012-09-24 at 18:54 +0200, Peter Zijlstra wrote:
> But let me try and come up with the list thing, I think we've
> actually got that someplace as well.

OK, I'm sure the below can be written better, but my brain is gone for
the day...

---
include/linux/sched.h | 1 +
kernel/sched/core.c | 1 +
kernel/sched/fair.c | 102 +++++++++++++++++++++++++++++++++++---------------
3 files changed, 73 insertions(+), 31 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 0beac68..d72ea68 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -888,6 +888,7 @@ struct sched_group {
atomic_t ref;

unsigned int group_weight;
+ int group_first;
struct sched_group_power *sgp;

/*
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index b38f00e..1177eb1 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5781,6 +5781,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)

do {
sg->group_weight = cpumask_weight(sched_group_cpus(sg));
+ sg->group_first = cpumask_first(sched_group_cpus(sg));
sg = sg->next;
} while (sg != sd->groups);

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6b800a1..601bc38 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2634,50 +2634,90 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
*/
static int select_idle_sibling(struct task_struct *p, int target)
{
- int cpu = smp_processor_id();
- int prev_cpu = task_cpu(p);
- struct sched_domain *sd;
- struct sched_group *sg;
- int i;
+ struct sched_domain *sd_smt, *sd_llc;
+ struct sched_group *sg_smt, *sg_llc;

/*
- * If the task is going to be woken-up on this cpu and if it is
- * already idle, then it is the right target.
+ * If the target is idle, easy peasy, we're done.
*/
- if (target == cpu && idle_cpu(cpu))
- return cpu;
+ if (idle_cpu(target))
+ return target;

/*
- * If the task is going to be woken-up on the cpu where it previously
- * ran and if it is currently idle, then it the right target.
+ * Otherwise, see if there's an idle core in the cache domain.
*/
- if (target == prev_cpu && idle_cpu(prev_cpu))
- return prev_cpu;
+ sd_llc = rcu_dereference(per_cpu(sd_llc, target));
+ sg_llc = sd_llc->groups;
+ do {
+ int candidate = -1;
+
+ sd_smt = rcu_dereference(per_cpu(sd_llc, sg_llc->group_first));
+ for_each_lower_domain(sd_smt) {
+ if (sd_smt->flags & SD_SHARE_CPUPOWER) /* aka. SMT */
+ break;
+ }
+
+ if (!sd_smt) {
+ int cpu = sg_llc->group_first; /* Assume singleton group */
+
+ if (!idle_cpu(cpu))
+ goto next_llc;
+
+ if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
+ goto next_llc;
+
+ return cpu;
+ }
+
+ sg_smt = sd_smt->groups;
+ do {
+ int cpu = sg_smt->group_first; /* Assume singleton group */
+
+ if (!idle_cpu(cpu)) /* core is not idle, skip to next core */
+ goto next_llc;
+
+ if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
+ goto next_smt;
+
+ if (candidate < 0)
+ candidate = cpu;
+
+next_smt:
+ sg_smt = sg_smt->next;
+ } while (sg_smt != sd_smt->groups);
+
+ if (candidate >= 0)
+ return candidate;
+
+next_llc:
+ sg_llc = sg_llc->next;
+ } while (sg_llc != sd_llc->groups);

/*
- * Otherwise, iterate the domains and find an elegible idle cpu.
+ * Failing that, see if there's an idle SMT sibling.
*/
- sd = rcu_dereference(per_cpu(sd_llc, target));
- for_each_lower_domain(sd) {
- sg = sd->groups;
+ sd_smt = rcu_dereference(per_cpu(sd_llc, target));
+ for_each_lower_domain(sd_smt) {
+ if (sd_smt->flags & SD_SHARE_CPUPOWER) /* aka. SMT */
+ break;
+ }
+
+ if (sd_smt) {
+ sg_smt = sd_smt->groups;
do {
- if (!cpumask_intersects(sched_group_cpus(sg),
- tsk_cpus_allowed(p)))
- goto next;
+ int cpu = sg_smt->group_first; /* Assume singleton group */

- for_each_cpu(i, sched_group_cpus(sg)) {
- if (!idle_cpu(i))
- goto next;
- }
+ if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p)) &&
+ idle_cpu(cpu))
+ return cpu;

- target = cpumask_first_and(sched_group_cpus(sg),
- tsk_cpus_allowed(p));
- goto done;
-next:
- sg = sg->next;
- } while (sg != sd->groups);
+ sg_smt = sg_smt->next;
+ } while (sg_smt != sd_smt->groups);
}
-done:
+
+ /*
+ * OK, no idle siblings of any kind, take what we started with.
+ */
return target;
}


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/