[PATCH 4/4] SCHED: Use a 2-d bitmap for searching lowest-pri CPU

From: Gregory Haskins
Date: Wed Nov 21 2007 - 15:13:01 EST


The current code uses a linear algorithm, which causes scaling issues
on larger SMP machines. This patch replaces that algorithm with a
2-dimensional bitmap to reduce latencies in the wake-up path.

In the past, this patch maintained the priorities as a global
property. It now has per-root-domain scope to shield unrelated
domains from unnecessary cache collisions.
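
To illustrate the idea (this is not the kernel code in the patch below), here is
a minimal userspace sketch of the two-level lookup: an outer bitmap of active
priority levels plus a per-level CPU mask. All names and sizes are made up, and
it scans the outer bitmap linearly instead of using find_first_bit():

/*
 * Toy model of the cpupri lookup: pri_active records which priority levels
 * currently hold CPUs; cpus_at[] records which CPUs sit at each level.
 * CPUs are assumed to fit in a single 64-bit word.
 */
#include <stdint.h>
#include <stdio.h>

#define TOY_NR_PRIO 102        /* IDLE, NORMAL, RT0..RT99 */

struct toy_cpupri {
        uint64_t pri_active[(TOY_NR_PRIO + 63) / 64]; /* active levels */
        uint64_t cpus_at[TOY_NR_PRIO];                /* CPU mask per level */
};

/* Return the CPUs at the lowest active level below task_pri that also
 * intersect the task's affinity mask, or 0 if none qualify. */
static uint64_t toy_find(struct toy_cpupri *cp, int task_pri, uint64_t allowed)
{
        for (int idx = 0; idx < task_pri; idx++) {
                if (!(cp->pri_active[idx / 64] & (1ULL << (idx % 64))))
                        continue;              /* no CPUs at this level */
                uint64_t mask = cp->cpus_at[idx] & allowed;
                if (mask)
                        return mask;           /* lowest-pri eligible CPUs */
        }
        return 0;
}

/* Move a CPU from level oldpri (-1 if previously unmapped) to newpri. */
static void toy_set(struct toy_cpupri *cp, int cpu, int oldpri, int newpri)
{
        if (oldpri >= 0) {
                cp->cpus_at[oldpri] &= ~(1ULL << cpu);
                if (!cp->cpus_at[oldpri])
                        cp->pri_active[oldpri / 64] &= ~(1ULL << (oldpri % 64));
        }
        cp->cpus_at[newpri] |= 1ULL << cpu;
        cp->pri_active[newpri / 64] |= 1ULL << (newpri % 64);
}

int main(void)
{
        struct toy_cpupri cp = { { 0 }, { 0 } };

        toy_set(&cp, 0, -1, 0);   /* cpu0 is idle (level 0)          */
        toy_set(&cp, 1, -1, 51);  /* cpu1 runs an RT task (level 51) */

        /* A task at level 61 should be routed to the idle cpu0. */
        printf("candidate mask: %#llx\n",
               (unsigned long long)toy_find(&cp, 61, ~0ULL));
        return 0;
}

The real sched_cpupri.c below follows the same structure, only with cpumask_t,
find_first_bit()/find_next_bit() over pri_active, and a per-vector spinlock so
concurrent cpupri_set() calls stay consistent.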

You may ask yourself why we add logic to find_lowest_cpus() earlier
in the series only to rip it out again in this patch. The answer is that
this patch has been controversial and is not likely to be
accepted (at least in the short term). Therefore, it is in our best
interest to optimize as much of the code prior to this patch as possible,
even if we ultimately rip it out here. That way, the system is still tuned
even if this patch goes to /dev/null.

You may now be asking why we bother including this patch at all. The
answer is that our own independent testing
(http://article.gmane.org/gmane.linux.rt.user/1889) shows that this patch
makes a significant performance improvement (at least for rt latencies) and
doesn't appear to cause any regressions in other dimensions. However,
I also understand the reservations raised by Steve Rostedt et al.
Therefore, I include this patch in the hope that it is useful to
someone, but with the understanding that it is not likely to be accepted
without further demonstration of its benefits.

Signed-off-by: Gregory Haskins <ghaskins@xxxxxxxxxx>
CC: Christoph Lameter <clameter@xxxxxxx>
---

kernel/Makefile | 1
kernel/sched.c | 4 +
kernel/sched_cpupri.c | 166 +++++++++++++++++++++++++++++++++++++++++++++++++
kernel/sched_cpupri.h | 35 ++++++++++
kernel/sched_rt.c | 95 +++++-----------------------
5 files changed, 223 insertions(+), 78 deletions(-)

diff --git a/kernel/Makefile b/kernel/Makefile
index dfa9695..c013a6c 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -57,6 +57,7 @@ obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
obj-$(CONFIG_MARKERS) += marker.o
+obj-$(CONFIG_SMP) += sched_cpupri.o

ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
# According to Alan Modra <alan@xxxxxxxxxxxxxxxx>, the -fno-omit-frame-pointer is
diff --git a/kernel/sched.c b/kernel/sched.c
index 55a2a59..436db58 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -70,6 +70,8 @@
#include <asm/tlb.h>
#include <asm/irq_regs.h>

+#include "sched_cpupri.h"
+
/*
* Scheduler clock - returns current time in nanosec units.
* This is default implementation.
@@ -284,6 +286,7 @@ struct root_domain {
cpumask_t online;
cpumask_t rto_mask;
atomic_t rto_count;
+ struct cpupri cpupri;
};

static struct root_domain def_root_domain;
@@ -5787,6 +5790,7 @@ static void init_rootdomain(struct root_domain *rd, cpumask_t *map)

rd->span = *map;
cpus_and(rd->online, rd->span, cpu_online_map);
+ cpupri_init(&rd->cpupri);
}

static void init_defrootdomain(void)
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c
new file mode 100644
index 0000000..e30d33f
--- /dev/null
+++ b/kernel/sched_cpupri.c
@@ -0,0 +1,166 @@
+/*
+ * kernel/sched_cpupri.c
+ *
+ * CPU priority management
+ *
+ * Copyright (C) 2007 Novell
+ *
+ * Author: Gregory Haskins <ghaskins@xxxxxxxxxx>
+ *
+ * This code tracks the priority of each CPU so that global migration
+ * decisions are easy to calculate. Each CPU can be in a state as follows:
+ *
+ * (INVALID), IDLE, NORMAL, RT1, ... RT99
+ *
+ * going from the lowest priority to the highest. CPUs in the INVALID state
+ * are not eligible for routing. The system maintains this state with
+ * a 2-dimensional bitmap (the first dimension for priority class, the second
+ * for CPUs in that class). Therefore a typical application without affinity
+ * restrictions can find a suitable CPU with O(1) complexity (e.g. two bit
+ * searches). For tasks with affinity restrictions, the algorithm has a
+ * worst case complexity of O(min(102, nr_domcpus)), though the scenario that
+ * yields the worst case search is fairly contrived.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#include "sched_cpupri.h"
+
+/* Convert between a 140-based task->prio and our 102-based cpupri */
+static int convert_prio(int prio)
+{
+ int cpupri;
+
+ if (prio == MAX_PRIO)
+ cpupri = CPUPRI_IDLE;
+ else if (prio >= MAX_RT_PRIO)
+ cpupri = CPUPRI_NORMAL;
+ else
+ cpupri = MAX_RT_PRIO - prio + 1;
+
+ return cpupri;
+}
+
+#define for_each_cpupri_active(array, idx) \
+ for (idx = find_first_bit(array, CPUPRI_NR_PRIORITIES); \
+ idx < CPUPRI_NR_PRIORITIES; \
+ idx = find_next_bit(array, CPUPRI_NR_PRIORITIES, idx+1))
+
+/**
+ * cpupri_find - find the best (lowest-pri) CPU in the system
+ * @cp: The cpupri context
+ * @p: The task
+ * @lowest_mask: A mask to fill in with selected CPUs
+ *
+ * Note: This function returns the recommended CPUs as calculated during the
+ * current invocation. By the time the call returns, the CPUs may have in
+ * fact changed priorities any number of times. While not ideal, it is not
+ * an issue of correctness since the normal rebalancer logic will correct
+ * any discrepancies created by racing against the uncertainty of the current
+ * priority configuration.
+ *
+ * Returns: (int)bool - CPUs were found
+ */
+int cpupri_find(struct cpupri *cp, struct task_struct *p,
+ cpumask_t *lowest_mask)
+{
+ int idx = 0;
+ int task_pri = convert_prio(p->prio);
+
+ for_each_cpupri_active(cp->pri_active, idx) {
+ struct cpupri_vec *vec = &cp->pri_to_cpu[idx];
+ cpumask_t mask;
+
+ if (idx >= task_pri)
+ break;
+
+ cpus_and(mask, p->cpus_allowed, vec->mask);
+
+ if (cpus_empty(mask))
+ continue;
+
+ *lowest_mask = mask;
+ return 1;
+ }
+
+ return 0;
+}
+
+/**
+ * cpupri_set - update the cpu priority setting
+ * @cp: The cpupri context
+ * @cpu: The target cpu
+ * @newpri: The priority (INVALID-RT99) to assign to this CPU
+ *
+ * Note: Assumes cpu_rq(cpu)->lock is locked
+ *
+ * Returns: (void)
+ */
+void cpupri_set(struct cpupri *cp, int cpu, int newpri)
+{
+ int *currpri = &cp->cpu_to_pri[cpu];
+ int oldpri = *currpri;
+ unsigned long flags;
+
+ newpri = convert_prio(newpri);
+
+ if (newpri == oldpri)
+ return;
+
+ /*
+ * If the cpu was currently mapped to a different value, we
+ * first need to unmap the old value
+ */
+ if (likely(oldpri != CPUPRI_INVALID)) {
+ struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri];
+
+ spin_lock_irqsave(&vec->lock, flags);
+
+ cpu_clear(cpu, vec->mask);
+ if (cpus_empty(vec->mask))
+ clear_bit(oldpri, cp->pri_active);
+
+ spin_unlock_irqrestore(&vec->lock, flags);
+ }
+
+ if (likely(newpri != CPUPRI_INVALID)) {
+ struct cpupri_vec *vec = &cp->pri_to_cpu[newpri];
+
+ spin_lock_irqsave(&vec->lock, flags);
+
+ cpu_set(cpu, vec->mask);
+ set_bit(newpri, cp->pri_active);
+
+ spin_unlock_irqrestore(&vec->lock, flags);
+ }
+
+ *currpri = newpri;
+}
+
+/**
+ * cpupri_init - initialize the cpupri structure
+ * @cp: The cpupri context
+ *
+ * Returns: (void)
+ */
+void cpupri_init(struct cpupri *cp)
+{
+ int i;
+
+ memset(cp, 0, sizeof(*cp));
+
+ for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) {
+ struct cpupri_vec *vec = &cp->pri_to_cpu[i];
+
+ spin_lock_init(&vec->lock);
+ cpus_clear(vec->mask);
+ }
+
+ for_each_possible_cpu(i)
+ cp->cpu_to_pri[i] = CPUPRI_INVALID;
+}
+
+
diff --git a/kernel/sched_cpupri.h b/kernel/sched_cpupri.h
new file mode 100644
index 0000000..1392c93
--- /dev/null
+++ b/kernel/sched_cpupri.h
@@ -0,0 +1,35 @@
+#ifndef _LINUX_CPUPRI_H
+#define _LINUX_CPUPRI_H
+
+#include <linux/sched.h>
+
+#define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO + 2)
+#define CPUPRI_NR_PRI_WORDS BITS_TO_LONGS(CPUPRI_NR_PRIORITIES)
+
+#define CPUPRI_INVALID -1
+#define CPUPRI_IDLE 0
+#define CPUPRI_NORMAL 1
+/* values 2-101 are RT priorities 0-99 */
+
+struct cpupri_vec
+{
+ spinlock_t lock;
+ cpumask_t mask;
+};
+
+struct cpupri {
+ struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES];
+ unsigned long pri_active[CPUPRI_NR_PRI_WORDS];
+ int cpu_to_pri[NR_CPUS];
+};
+
+#ifdef CONFIG_SMP
+int cpupri_find(struct cpupri *cp,
+ struct task_struct *p, cpumask_t *lowest_mask);
+void cpupri_set(struct cpupri *cp, int cpu, int pri);
+void cpupri_init(struct cpupri *cp);
+#else
+#define cpupri_init(cp) do { } while (0)
+#endif
+
+#endif /* _LINUX_CPUPRI_H */
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 6a5ac33..41dcbb4 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -73,8 +73,10 @@ static inline void inc_rt_tasks(struct task_struct *p, struct rq *rq)
WARN_ON(!rt_task(p));
rq->rt.rt_nr_running++;
#ifdef CONFIG_SMP
- if (p->prio < rq->rt.highest_prio)
+ if (p->prio < rq->rt.highest_prio) {
rq->rt.highest_prio = p->prio;
+ cpupri_set(&rq->rd->cpupri, rq->cpu, p->prio);
+ }
if (p->nr_cpus_allowed > 1)
rq->rt.rt_nr_migratory++;

@@ -84,6 +86,8 @@ static inline void inc_rt_tasks(struct task_struct *p, struct rq *rq)

static inline void dec_rt_tasks(struct task_struct *p, struct rq *rq)
{
+ int highest_prio = rq->rt.highest_prio;
+
WARN_ON(!rt_task(p));
WARN_ON(!rq->rt.rt_nr_running);
rq->rt.rt_nr_running--;
@@ -103,6 +107,9 @@ static inline void dec_rt_tasks(struct task_struct *p, struct rq *rq)
if (p->nr_cpus_allowed > 1)
rq->rt.rt_nr_migratory--;

+ if (rq->rt.highest_prio != highest_prio)
+ cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio);
+
update_rt_migration(rq);
#endif /* CONFIG_SMP */
}
@@ -295,73 +302,6 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq,

static DEFINE_PER_CPU(cpumask_t, local_cpu_mask);

-static int find_lowest_cpus(struct task_struct *task, cpumask_t *lowest_mask)
-{
- int lowest_prio = -1;
- int lowest_cpu = -1;
- int count = 0;
- int cpu;
-
- cpus_and(*lowest_mask, task_rq(task)->rd->online, task->cpus_allowed);
-
- /*
- * Scan each rq for the lowest prio.
- */
- for_each_cpu_mask(cpu, *lowest_mask) {
- struct rq *rq = cpu_rq(cpu);
-
- /* We look for lowest RT prio or non-rt CPU */
- if (rq->rt.highest_prio >= MAX_RT_PRIO) {
- /*
- * if we already found a low RT queue
- * and now we found this non-rt queue
- * clear the mask and set our bit.
- * Otherwise just return the queue as is
- * and the count==1 will cause the algorithm
- * to use the first bit found.
- */
- if (lowest_cpu != -1) {
- cpus_clear(*lowest_mask);
- cpu_set(rq->cpu, *lowest_mask);
- }
- return 1;
- }
-
- /* no locking for now */
- if ((rq->rt.highest_prio > task->prio)
- && (rq->rt.highest_prio >= lowest_prio)) {
- if (rq->rt.highest_prio > lowest_prio) {
- /* new low - clear old data */
- lowest_prio = rq->rt.highest_prio;
- lowest_cpu = cpu;
- count = 0;
- }
- count++;
- } else
- cpu_clear(cpu, *lowest_mask);
- }
-
- /*
- * Clear out all the set bits that represent
- * runqueues that were of higher prio than
- * the lowest_prio.
- */
- if (lowest_cpu > 0) {
- /*
- * Perhaps we could add another cpumask op to
- * zero out bits. Like cpu_zero_bits(cpumask, nrbits);
- * Then that could be optimized to use memset and such.
- */
- for_each_cpu_mask(cpu, *lowest_mask) {
- if (cpu >= lowest_cpu)
- break;
- cpu_clear(cpu, *lowest_mask);
- }
- }
-
- return count;
-}
-
static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask)
{
int first;
@@ -383,18 +323,13 @@ static int find_lowest_rq(struct task_struct *task)
cpumask_t *lowest_mask = &__get_cpu_var(local_cpu_mask);
int this_cpu = smp_processor_id();
int cpu = task_cpu(task);
- int count = find_lowest_cpus(task, lowest_mask);
+
+ if (task->nr_cpus_allowed == 1)
+ return -1; /* No other targets possible */

- if (!count)
+ if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask))
return -1; /* No targets found */
-
- /*
- * There is no sense in performing an optimal search if only one
- * target is found.
- */
- if (count == 1)
- return first_cpu(*lowest_mask);
-
+
/*
* At this point we have built a mask of cpus representing the
* lowest priority tasks in the system. Now we want to elect
@@ -814,6 +749,8 @@ static void join_domain_rt(struct rq *rq)
{
if (rq->rt.overloaded)
rt_set_overload(rq);
+
+ cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio);
}

/* Assumes rq->lock is held */
@@ -821,6 +758,8 @@ static void leave_domain_rt(struct rq *rq)
{
if (rq->rt.overloaded)
rt_clear_overload(rq);
+
+ cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_INVALID);
}

static void set_curr_task_rt(struct rq *rq)
