[PATCH 3/8] Replace struct prio_array with an RB tree

From: Fabio Checconi
Date: Mon Jun 15 2009 - 15:05:25 EST


To store groups in deadline order the old prio_array is not enough:
a deadline is an arbitrary timestamp, so it cannot be used to index
a fixed array of priority queues. Replace the prio_array with an RB
tree and, to keep things simple, use the same data structure to
store tasks, in priority order.
---
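Note: the tree itself is policy-agnostic; the only policy-specific
piece is the comparison used at insertion time. This patch orders
entities by prio, and storing groups in deadline order (what the
rest of the series needs) should only require a different
rt_entity_before(). A rough sketch of such a comparator, with an
illustrative deadline field that this patch does not introduce:

        static inline int rt_entity_before(struct sched_rt_entity *a,
                                           struct sched_rt_entity *b)
        {
                /*
                 * EDF order: the earlier absolute deadline goes
                 * first. The subtraction keeps the comparison safe
                 * against wraparound of u64 timestamps.
                 */
                return (s64)(a->deadline - b->deadline) < 0;
        }
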
include/linux/init_task.h | 1 -
include/linux/sched.h | 2 +-
kernel/sched.c | 25 ++-----
kernel/sched_rt.c | 157 +++++++++++++++++++++++++++++++--------------
4 files changed, 117 insertions(+), 68 deletions(-)

diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 28b1f30..5c698b4 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -139,7 +139,6 @@ extern struct cred init_cred;
.group_node = LIST_HEAD_INIT(tsk.se.group_node), \
}, \
.rt = { \
- .run_list = LIST_HEAD_INIT(tsk.rt.run_list), \
.time_slice = HZ, \
.nr_cpus_allowed = NR_CPUS, \
}, \
diff --git a/include/linux/sched.h b/include/linux/sched.h
index d08b3f7..a115067 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1128,7 +1128,7 @@ struct sched_entity {
};

struct sched_rt_entity {
- struct list_head run_list;
+ struct rb_node rb_node;
unsigned long timeout;
unsigned int time_slice;
int nr_cpus_allowed;
diff --git a/kernel/sched.c b/kernel/sched.c
index b8d432b..675ea96 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -157,11 +157,11 @@ static inline int task_has_rt_policy(struct task_struct *p)
}

/*
- * This is the priority-queue data structure of the RT scheduling class:
+ * This is the EDF queue data structure of the RT scheduling class:
*/
-struct rt_prio_array {
- DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */
- struct list_head queue[MAX_RT_PRIO];
+struct rt_edf_tree {
+ struct rb_root rb_root;
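+ /* Cached leftmost node: the next entity to run, found in O(1). */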
+ struct rb_node *rb_leftmost;
};

struct rt_bandwidth {
@@ -481,7 +481,7 @@ struct cfs_rq {

/* Real-Time classes' related field in a runqueue: */
struct rt_rq {
- struct rt_prio_array active;
+ struct rt_edf_tree active;
unsigned long rt_nr_running;
#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
struct {
@@ -2595,7 +2595,7 @@ static void __sched_fork(struct task_struct *p)
p->se.wait_max = 0;
#endif

- INIT_LIST_HEAD(&p->rt.run_list);
+ RB_CLEAR_NODE(&p->rt.rb_node);
p->se.on_rq = 0;
INIT_LIST_HEAD(&p->se.group_node);

@@ -9069,16 +9069,7 @@ static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)

static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
{
- struct rt_prio_array *array;
- int i;
-
- array = &rt_rq->active;
- for (i = 0; i < MAX_RT_PRIO; i++) {
- INIT_LIST_HEAD(array->queue + i);
- __clear_bit(i, array->bitmap);
- }
- /* delimiter for bitsearch: */
- __set_bit(MAX_RT_PRIO, array->bitmap);
+ rt_rq->active.rb_root = RB_ROOT;
+ rt_rq->active.rb_leftmost = NULL;

#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
rt_rq->highest_prio.curr = MAX_RT_PRIO;
@@ -9158,7 +9149,7 @@ static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,

rt_se->my_q = rt_rq;
rt_se->parent = parent;
- INIT_LIST_HEAD(&rt_se->run_list);
+ RB_CLEAR_NODE(&rt_se->rb_node);
}
#endif

diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 9bf0d2a..7ae1212 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -136,7 +136,7 @@ void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)

static inline int on_rt_rq(struct sched_rt_entity *rt_se)
{
- return !list_empty(&rt_se->run_list);
+ return !RB_EMPTY_NODE(&rt_se->rb_node);
}

#ifdef CONFIG_RT_GROUP_SCHED
@@ -668,6 +668,14 @@ void dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}

#endif /* CONFIG_SMP */

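+/*
+ * Return the leftmost (highest prio) entity in the tree, or NULL if
+ * the tree is empty. The leftmost pointer is cached, so this is O(1).
+ */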
+static inline struct sched_rt_entity *__rt_edf_first(struct rt_edf_tree *tree)
+{
+ if (!tree->rb_leftmost)
+ return NULL;
+
+ return rb_entry(tree->rb_leftmost, struct sched_rt_entity, rb_node);
+}
+
#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
static void
inc_rt_prio(struct rt_rq *rt_rq, int prio)
@@ -683,6 +691,7 @@ inc_rt_prio(struct rt_rq *rt_rq, int prio)
static void
dec_rt_prio(struct rt_rq *rt_rq, int prio)
{
+ struct sched_rt_entity *rt_se;
int prev_prio = rt_rq->highest_prio.curr;

if (rt_rq->rt_nr_running) {
@@ -694,10 +703,8 @@ dec_rt_prio(struct rt_rq *rt_rq, int prio)
* we may have some recomputation to do
*/
if (prio == prev_prio) {
- struct rt_prio_array *array = &rt_rq->active;
-
- rt_rq->highest_prio.curr =
- sched_find_first_bit(array->bitmap);
+ rt_se = __rt_edf_first(&rt_rq->active);
+ rt_rq->highest_prio.curr = rt_se_prio(rt_se);
}

} else
@@ -772,12 +779,71 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
dec_rt_group(rt_se, rt_rq);
}

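+/*
+ * Order entities by prio. The strict < sends equal-prio entities to
+ * the right subtree, which keeps insertion FIFO within a prio level,
+ * like the old per-prio lists.
+ */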
+static inline int rt_entity_before(struct sched_rt_entity *a,
+ struct sched_rt_entity *b)
+{
+ return rt_se_prio(a) < rt_se_prio(b);
+}
+
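+/*
+ * Classic rb-tree insertion, with one extra: track whether the new
+ * node was always linked to the left, in which case it is the new
+ * leftmost and the cached pointer must be updated.
+ */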
+static void __rt_entity_insert(struct rt_edf_tree *tree,
+ struct sched_rt_entity *rt_se)
+{
+ struct rb_node **link = &tree->rb_root.rb_node;
+ struct rb_node *parent = NULL;
+ struct sched_rt_entity *entry;
+ int leftmost = 1;
+
+ while (*link) {
+ parent = *link;
+ entry = rb_entry(parent, struct sched_rt_entity, rb_node);
+ if (rt_entity_before(rt_se, entry))
+ link = &parent->rb_left;
+ else {
+ link = &parent->rb_right;
+ leftmost = 0;
+ }
+ }
+
+ if (leftmost)
+ tree->rb_leftmost = &rt_se->rb_node;
+
+ rb_link_node(&rt_se->rb_node, parent, link);
+ rb_insert_color(&rt_se->rb_node, &tree->rb_root);
+}
+
+/*
+ * Attempt to insert rt_se as the first element of tree. This is used
+ * when requeueing a task at the head of a rt_rq before a reschedule,
+ * so it makes sense only if rt_se has the highest prio: if it does
+ * not, return 1 and let the caller do an ordered insert instead.
+ */
+static int __rt_entity_insert_head(struct rt_edf_tree *tree,
+ struct sched_rt_entity *rt_se)
+{
+ struct rb_node **link;
+ struct rb_node *parent;
+ struct sched_rt_entity *entry;
+
+ if (tree->rb_leftmost) {
+ entry = __rt_edf_first(tree);
+ if (rt_entity_before(entry, rt_se))
+ return 1;
+
+ /* The old leftmost has no left child; link there. */
+ parent = tree->rb_leftmost;
+ link = &parent->rb_left;
+ } else {
+ /* Empty tree: the new node becomes the root. */
+ parent = NULL;
+ link = &tree->rb_root.rb_node;
+ }
+
+ tree->rb_leftmost = &rt_se->rb_node;
+
+ rb_link_node(&rt_se->rb_node, parent, link);
+ rb_insert_color(&rt_se->rb_node, &tree->rb_root);
+
+ return 0;
+}
+
static void __enqueue_rt_entity(struct sched_rt_entity *rt_se)
{
struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
- struct rt_prio_array *array = &rt_rq->active;
struct rt_rq *group_rq = group_rt_rq(rt_se);
- struct list_head *queue = array->queue + rt_se_prio(rt_se);

/*
* Don't enqueue the group if it's throttled, or when empty.
@@ -788,21 +854,25 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se)
if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running))
return;

- list_add_tail(&rt_se->run_list, queue);
- __set_bit(rt_se_prio(rt_se), array->bitmap);
-
+ __rt_entity_insert(&rt_rq->active, rt_se);
inc_rt_tasks(rt_se, rt_rq);
}

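+/*
+ * Unlink rt_se, fixing up the cached leftmost first (its successor
+ * becomes the new leftmost). RB_CLEAR_NODE marks the entity as not
+ * queued, which is what on_rt_rq() tests.
+ */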
+static void __rt_entity_remove(struct rt_edf_tree *tree,
+ struct sched_rt_entity *rt_se)
+{
+ if (tree->rb_leftmost == &rt_se->rb_node)
+ tree->rb_leftmost = rb_next(&rt_se->rb_node);
+
+ rb_erase(&rt_se->rb_node, &tree->rb_root);
+ RB_CLEAR_NODE(&rt_se->rb_node);
+}
+
static void __dequeue_rt_entity(struct sched_rt_entity *rt_se)
{
struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
- struct rt_prio_array *array = &rt_rq->active;
-
- list_del_init(&rt_se->run_list);
- if (list_empty(array->queue + rt_se_prio(rt_se)))
- __clear_bit(rt_se_prio(rt_se), array->bitmap);

+ __rt_entity_remove(&rt_rq->active, rt_se);
dec_rt_tasks(rt_se, rt_rq);
}

@@ -881,14 +951,13 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
static void
requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int head)
{
- if (on_rt_rq(rt_se)) {
- struct rt_prio_array *array = &rt_rq->active;
- struct list_head *queue = array->queue + rt_se_prio(rt_se);
+ struct rt_edf_tree *tree;

- if (head)
- list_move(&rt_se->run_list, queue);
- else
- list_move_tail(&rt_se->run_list, queue);
+ if (on_rt_rq(rt_se)) {
+ tree = &rt_rq->active;
+ __rt_entity_remove(tree, rt_se);
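+ /*
+ * Requeueing at the tail is a plain ordered insert; requeueing
+ * at the head is tried first and falls back to an ordered
+ * insert if rt_se does not have the highest prio.
+ */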
+ if (!head || __rt_entity_insert_head(tree, rt_se))
+ __rt_entity_insert(tree, rt_se);
}
}

@@ -1000,18 +1069,12 @@ static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int sync
static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
struct rt_rq *rt_rq)
{
- struct rt_prio_array *array = &rt_rq->active;
- struct sched_rt_entity *next = NULL;
- struct list_head *queue;
- int idx;
+ struct rb_node *left = rt_rq->active.rb_leftmost;

- idx = sched_find_first_bit(array->bitmap);
- BUG_ON(idx >= MAX_RT_PRIO);
-
- queue = array->queue + idx;
- next = list_entry(queue->next, struct sched_rt_entity, run_list);
+ if (!left)
+ return NULL;

- return next;
+ return rb_entry(left, struct sched_rt_entity, rb_node);
}

static struct task_struct *_pick_next_task_rt(struct rq *rq)
@@ -1083,31 +1146,27 @@ static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
/* Return the second highest RT task, NULL otherwise */
static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
{
- struct task_struct *next = NULL;
+ struct task_struct *p, *next = NULL;
struct sched_rt_entity *rt_se;
- struct rt_prio_array *array;
struct rt_rq *rt_rq;
- int idx;
+ struct rt_edf_tree *tree;
+ struct rb_node *node;

for_each_leaf_rt_rq(rt_rq, rq) {
- array = &rt_rq->active;
- idx = sched_find_first_bit(array->bitmap);
- next_idx:
- if (idx >= MAX_RT_PRIO)
- continue;
- if (next && next->prio < idx)
- continue;
- list_for_each_entry(rt_se, array->queue + idx, run_list) {
- struct task_struct *p = rt_task_of(rt_se);
+ tree = &rt_rq->active;
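+ /*
+ * Walk from the leftmost node: entities are visited in
+ * nondecreasing prio value, so once the current pick has
+ * a better prio than the node at hand, nothing further
+ * right can win and the scan can stop.
+ */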
+ for (node = tree->rb_leftmost; node; node = rb_next(node)) {
+ rt_se = rb_entry(node, struct sched_rt_entity, rb_node);
+ if (group_rt_rq(rt_se))
+ continue;
+
+ p = rt_task_of(rt_se);
+ if (next && next->prio < p->prio)
+ break;
if (pick_rt_task(rq, p, cpu)) {
next = p;
break;
}
}
- if (!next) {
- idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1);
- goto next_idx;
- }
}

return next;
@@ -1706,7 +1765,7 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
* Requeue to the end of queue if we are not the only element
* on the queue:
*/
- if (p->rt.run_list.prev != p->rt.run_list.next) {
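+ /*
+ * No successor in the tree means p is already the last
+ * entity at its prio, so a tail requeue would change
+ * nothing.
+ */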
+ if (rb_next(&p->rt.rb_node)) {
requeue_task_rt(rq, p, 0);
set_tsk_need_resched(p);
}
--
1.6.2.2
