[RFC/PATCH 2/3] sched: rt time limit

From: Peter Zijlstra
Date: Sun Dec 30 2007 - 19:38:29 EST

Next message: Peter Zijlstra: "[RFC/PATCH 0/3] sched: hrtick and rt group scheduling"
Previous message: Jan Engelhardt: "Re: [PATCH] [MEMSTICK] Initial commit for Sony MemoryStick support"
In reply to: Peter Zijlstra: "[RFC/PATCH 0/3] sched: hrtick and rt group scheduling"
Next in thread: Peter Zijlstra: "[RFC/PATCH 3/3] sched: rt group scheduling"
Messages sorted by: [ date ] [ thread ] [ subject ] [ author ]

Very simple time limit on the realtime scheduling classes.
Allow the rq's realtime class to consume at most sched_rt_ratio (a
fixed-point fraction where 1 << 16 is 100%) of every sched_rt_period
(in ms) slice. If the class exceeds this quota the fair class will
preempt the realtime class.

TODO:
- rt limit vs load-balance
- proper interface

Signed-off-by: Peter Zijlstra <a.p.zijlstra@xxxxxxxxx>
---
include/linux/sched.h | 2 +
kernel/sched.c | 70 +++++++++++++++++++++++++++++++++++---------------
kernel/sched_rt.c | 53 +++++++++++++++++++++++++++++++++++++
kernel/sysctl.c | 18 ++++++++++++
4 files changed, 122 insertions(+), 21 deletions(-)

Index: linux-2.6/include/linux/sched.h
===================================================================
--- linux-2.6.orig/include/linux/sched.h
+++ linux-2.6/include/linux/sched.h
@@ -1531,6 +1531,8 @@ extern unsigned int sysctl_sched_child_r
extern unsigned int sysctl_sched_features;
extern unsigned int sysctl_sched_migration_cost;
extern unsigned int sysctl_sched_nr_migrate;
+extern unsigned int sysctl_sched_rt_period;
+extern unsigned int sysctl_sched_rt_ratio;
#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
extern unsigned int sysctl_sched_min_bal_int_shares;
extern unsigned int sysctl_sched_max_bal_int_shares;
Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -342,13 +342,14 @@ struct cfs_rq {
/* Real-Time classes' related field in a runqueue: */
struct rt_rq {
struct rt_prio_array active;
- int rt_load_balance_idx;
- struct list_head *rt_load_balance_head, *rt_load_balance_curr;
unsigned long rt_nr_running;
+#ifdef CONFIG_SMP
unsigned long rt_nr_migratory;
- /* highest queued rt task prio */
- int highest_prio;
+ int highest_prio; /* highest queued rt task prio */
int overloaded;
+#endif /* CONFIG_SMP */
+ u64 rt_time; /* ns of -rt runtime charged, less the quota repaid per period */
+ u64 rt_throttled; /* 0, or the rq->clock value at which throttling ends */
};

#ifdef CONFIG_SMP
@@ -415,6 +416,7 @@ struct rq {
struct list_head leaf_cfs_rq_list;
#endif
struct rt_rq rt;
+ u64 rt_period_expire;

/*
* This is part of a global counter where only the total sum
@@ -601,6 +603,21 @@ const_debug unsigned int sysctl_sched_fe
const_debug unsigned int sysctl_sched_nr_migrate = 32;

/*
+ * period, in ms, over which we measure -rt task cpu usage.
+ * default: 1s
+ */
+const_debug unsigned int sysctl_sched_rt_period = 1000;
+
+#define SCHED_RT_FRAC_SHIFT 16
+#define SCHED_RT_FRAC (1UL << SCHED_RT_FRAC_SHIFT)
+
+/*
+ * ratio of time -rt tasks may consume, in units of 1/SCHED_RT_FRAC.
+ * default: 100%
+ */
+const_debug unsigned int sysctl_sched_rt_ratio = SCHED_RT_FRAC;
+
+/*
* For kernel-internal use: high-speed (but slightly incorrect) per-cpu
* clock constructed from sched_clock():
*/
@@ -3673,8 +3690,8 @@ void scheduler_tick(void)
rq->clock = next_tick;
rq->tick_timestamp = rq->clock;
update_cpu_load(rq);
- if (curr != rq->idle) /* FIXME: needed? */
- curr->sched_class->task_tick(rq, curr, 0);
+ curr->sched_class->task_tick(rq, curr, 0);
+ update_sched_rt_period(rq);
spin_unlock(&rq->lock);

#ifdef CONFIG_SMP
@@ -7029,6 +7046,29 @@ static void init_cfs_rq(struct cfs_rq *c
cfs_rq->min_vruntime = (u64)(-(1LL << 20));
}

+/* Reset @rt_rq to an empty, unthrottled state; @rq is not used here. */
+static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
+{
+	struct rt_prio_array *prio = &rt_rq->active;
+	int idx;
+
+	for (idx = 0; idx < MAX_RT_PRIO; idx++) {
+		INIT_LIST_HEAD(&prio->queue[idx]);
+		__clear_bit(idx, prio->bitmap);
+	}
+	/* delimiter bit for the bitmap search: */
+	__set_bit(MAX_RT_PRIO, prio->bitmap);
+
+#ifdef CONFIG_SMP
+	rt_rq->overloaded = 0;
+	rt_rq->highest_prio = MAX_RT_PRIO;
+	rt_rq->rt_nr_migratory = 0;
+#endif
+
+	rt_rq->rt_throttled = 0;
+	rt_rq->rt_time = 0;
+}
+
void __init sched_init(void)
{
int highest_cpu = 0;
@@ -7039,7 +7079,6 @@ void __init sched_init(void)
#endif

for_each_possible_cpu(i) {
- struct rt_prio_array *array;
struct rq *rq;

rq = cpu_rq(i);
@@ -7071,6 +7110,8 @@ void __init sched_init(void)
}
init_task_group.shares = init_task_group_load;
#endif
+ init_rt_rq(&rq->rt, rq);
+ rq->rt_period_expire = 0;

for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
rq->cpu_load[j] = 0;
@@ -7083,22 +7124,11 @@ void __init sched_init(void)
rq->cpu = i;
rq->migration_thread = NULL;
INIT_LIST_HEAD(&rq->migration_queue);
- rq->rt.highest_prio = MAX_RT_PRIO;
- rq->rt.overloaded = 0;
rq_attach_root(rq, &def_root_domain);
#endif
init_rq_hrtick(rq);
-
atomic_set(&rq->nr_iowait, 0);
-
- array = &rq->rt.active;
- for (j = 0; j < MAX_RT_PRIO; j++) {
- INIT_LIST_HEAD(array->queue + j);
- __clear_bit(j, array->bitmap);
- }
highest_cpu = i;
- /* delimiter for bitsearch: */
- __set_bit(MAX_RT_PRIO, array->bitmap);
}

set_load_weight(&init_task);
@@ -7270,7 +7300,7 @@ void set_curr_task(int cpu, struct task_
#ifdef CONFIG_SMP
/*
* distribute shares of all task groups among their schedulable entities,
- * to reflect load distrbution across cpus.
+ * to reflect load distribution across cpus.
*/
static int rebalance_shares(struct sched_domain *sd, int this_cpu)
{
@@ -7337,7 +7367,7 @@ static int rebalance_shares(struct sched
* sysctl_sched_max_bal_int_shares represents the maximum interval between
* consecutive calls to rebalance_shares() in the same sched domain.
*
- * These settings allows for the appropriate tradeoff between accuracy of
+ * These settings allow for the appropriate trade-off between accuracy of
* fairness and the associated overhead.
*
*/
Index: linux-2.6/kernel/sched_rt.c
===================================================================
--- linux-2.6.orig/kernel/sched_rt.c
+++ linux-2.6/kernel/sched_rt.c
@@ -45,6 +45,50 @@ static void update_rt_migration(struct r
}
#endif /* CONFIG_SMP */

+/* Return 1 if @rt_rq is (or just became) throttled, 0 otherwise. */
+static int sched_rt_ratio_exceeded(struct rq *rq, struct rt_rq *rt_rq)
+{
+	u64 window_ns, quota_ns;
+
+	/* A ratio of SCHED_RT_FRAC (100%) switches the limit off. */
+	if (sysctl_sched_rt_ratio == SCHED_RT_FRAC)
+		return 0;
+	if (rt_rq->rt_throttled)
+		return 1;
+
+	window_ns = (u64)sysctl_sched_rt_period * NSEC_PER_MSEC;
+	quota_ns = (window_ns * sysctl_sched_rt_ratio) >> SCHED_RT_FRAC_SHIFT;
+	if (rt_rq->rt_time <= quota_ns)
+		return 0;
+
+	/* Over quota: record the clock value at which the throttle lifts. */
+	rt_rq->rt_throttled = rq->clock + window_ns - rt_rq->rt_time;
+	return 1;
+}
+
+/* Replenish the rt quota for elapsed periods and lift an expired throttle. */
+static void update_sched_rt_period(struct rq *rq)
+{
+	/* A period of 0 (reachable via sysctl) would spin forever below. */
+	u64 period = (u64)(sysctl_sched_rt_period ?: 1) * NSEC_PER_MSEC;
+	u64 ratio = (period * sysctl_sched_rt_ratio) >> SCHED_RT_FRAC_SHIFT;
+
+	while (rq->clock > rq->rt_period_expire) {
+		rq->rt.rt_time -= min(rq->rt.rt_time, ratio);
+		rq->rt_period_expire += period;
+	}
+
+	/*
+	 * When the rt throttle is expired, let them rip.
+	 * (XXX: use hrtick when available)
+	 */
+	if (rq->rt.rt_throttled && rq->clock > rq->rt.rt_throttled) {
+		rq->rt.rt_throttled = 0;
+		if (!sched_rt_ratio_exceeded(rq, &rq->rt))
+			resched_task(rq->curr);
+	}
+}
+
/*
* Update the current task's runtime statistics. Skip current tasks that
* are not in our scheduling class.
@@ -66,6 +110,11 @@ static void update_curr_rt(struct rq *rq
curr->se.sum_exec_runtime += delta_exec;
curr->se.exec_start = rq->clock;
cpuacct_charge(curr, delta_exec);
+
+ rq->rt.rt_time += delta_exec;
+ update_sched_rt_period(rq);
+ if (sched_rt_ratio_exceeded(rq, &rq->rt))
+ resched_task(curr);
}

static inline void inc_rt_tasks(struct task_struct *p, struct rq *rq)
@@ -208,8 +257,12 @@ static struct task_struct *pick_next_tas
struct rt_prio_array *array = &rq->rt.active;
struct task_struct *next;
struct list_head *queue;
+ struct rt_rq *rt_rq = &rq->rt;
int idx;

+ if (sched_rt_ratio_exceeded(rq, rt_rq))
+ return NULL;
+
idx = sched_find_first_bit(array->bitmap);
if (idx >= MAX_RT_PRIO)
return NULL;
Index: linux-2.6/kernel/sysctl.c
===================================================================
--- linux-2.6.orig/kernel/sysctl.c
+++ linux-2.6/kernel/sysctl.c
@@ -309,7 +309,23 @@ static struct ctl_table kern_table[] = {
.procname = "sched_nr_migrate",
.data = &sysctl_sched_nr_migrate,
.maxlen = sizeof(unsigned int),
- .mode = 644,
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "sched_rt_period_ms",
+ .data = &sysctl_sched_rt_period,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "sched_rt_ratio",
+ .data = &sysctl_sched_rt_ratio,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
.proc_handler = &proc_dointvec,
},
#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)

--

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Next message: Peter Zijlstra: "[RFC/PATCH 0/3] sched: hrtick and rt group scheduling"
Previous message: Jan Engelhardt: "Re: [PATCH] [MEMSTICK] Initial commit for Sony MemoryStick support"
In reply to: Peter Zijlstra: "[RFC/PATCH 0/3] sched: hrtick and rt group scheduling"
Next in thread: Peter Zijlstra: "[RFC/PATCH 3/3] sched: rt group scheduling"
Messages sorted by: [ date ] [ thread ] [ subject ] [ author ]