--- linux-2.5.66.virgin/kernel/sched.c Thu Mar 27 14:23:58 2003 +++ linux-2.5.66.twiddle/kernel/sched.c Fri Mar 28 18:35:29 2003 @@ -63,17 +63,40 @@ - * Minimum timeslice is 10 msecs, default timeslice is 100 msecs, + * Minimum timeslice is 5 msecs, default timeslice is 100 msecs, * maximum timeslice is 200 msecs. Timeslices get refilled after * they expire. + * + * They are configurable via /proc/sys/sched */ -#define MIN_TIMESLICE ( 10 * HZ / 1000) -#define MAX_TIMESLICE (200 * HZ / 1000) -#define CHILD_PENALTY 50 -#define PARENT_PENALTY 100 -#define EXIT_WEIGHT 3 -#define PRIO_BONUS_RATIO 25 -#define INTERACTIVE_DELTA 2 -#define MAX_SLEEP_AVG (10*HZ) -#define STARVATION_LIMIT (10*HZ) -#define NODE_THRESHOLD 125 + +int min_timeslice = (5 * HZ) / 1000 ? (5 * HZ) / 1000 : 1; /* at least one tick: (5 * HZ) / 1000 is 0 when HZ < 200 */ +int max_timeslice = (200 * HZ) / 1000; +int child_penalty = 50; +int parent_penalty = 100; +int exit_weight = 3; +int prio_bonus_ratio = 25; +int interactive_delta = 2; +int max_sleep_avg = 10 * HZ; +int starvation_limit = 1 * HZ; /* 0 disables the EXPIRED_STARVING() check */ +int node_threshold = 125; +int max_accel_slices = 5; /* Max bonus per activation in slices. */ +int retard_prct_slice = 10; /* Percent of a slice to deduct. */ +int force_switch = 1; + +#define MIN_TIMESLICE (min_timeslice) +#define MAX_TIMESLICE (max_timeslice) +#define CHILD_PENALTY (child_penalty) +#define PARENT_PENALTY (parent_penalty) +#define EXIT_WEIGHT (exit_weight) +#define PRIO_BONUS_RATIO (prio_bonus_ratio) +#define INTERACTIVE_DELTA (interactive_delta) +#define MAX_SLEEP_AVG (max_sleep_avg) +#define STARVATION_LIMIT (starvation_limit) +#define NODE_THRESHOLD (node_threshold) +#define TIMESLICE_GRANULARITY (HZ/20 ?: 1) +#define MAX_ACCELERATION(slice) \ + (max_accel_slices ? max_accel_slices * (slice) : max_sleep_avg) +#define NEGATIVE_FEEDBACK(slice) \ + (retard_prct_slice ? retard_prct_slice * (slice) / 100 : 0) +#define FORCE_SWITCH (force_switch) /* * If a task is 'interactive' then we reinsert it in the active @@ -124,12 +147,17 @@ * task_timeslice() is the interface that is used by the scheduler.
*/ -#define BASE_TIMESLICE(p) (MIN_TIMESLICE + \ - ((MAX_TIMESLICE - MIN_TIMESLICE) * (MAX_PRIO-1-(p)->static_prio)/(MAX_USER_PRIO - 1))) +#define BASE_TIMESLICE(p) \ + (MAX_TIMESLICE * (MAX_PRIO-(p)->static_prio)/MAX_USER_PRIO) static inline unsigned int task_timeslice(task_t *p) { - return BASE_TIMESLICE(p); + unsigned int time_slice = BASE_TIMESLICE(p); + + if (time_slice < MIN_TIMESLICE) + time_slice = MIN_TIMESLICE; + + return time_slice; } /* @@ -347,6 +375,7 @@ if (sleep_time > 0) { int sleep_avg; + int slice = task_timeslice(p); /* * This code gives a bonus to interactive tasks. @@ -355,8 +384,15 @@ * value here, based on ->last_run. The more time a task * spends sleeping, the higher the average gets - and the * higher the priority boost gets as well. + * + * Prevent tasks with an extremely high context switch + * rate from becoming CPU hogs by steadily inflating their + * sleep_avg until they starve legitimate sleepers. */ - sleep_avg = p->sleep_avg + sleep_time; + if (p->sleep_avg > MAX_SLEEP_AVG / 2) + p->sleep_avg -= NEGATIVE_FEEDBACK(slice); + sleep_time %= MAX_ACCELERATION(slice); /* divisor scales with slice when max_accel_slices is set, so slice must be non-zero */ + sleep_avg = sleep_time + p->sleep_avg; /* * 'Overflow' bonus ticks go to the waker as well, so the @@ -1176,10 +1212,8 @@ * load-dependent, as the frequency of array switched decreases with * increasing number of running tasks: */ -#define EXPIRED_STARVING(rq) \ - (STARVATION_LIMIT && ((rq)->expired_timestamp && \ - (jiffies - (rq)->expired_timestamp >= \ - STARVATION_LIMIT * ((rq)->nr_running) + 1))) +#define EXPIRED_STARVING(rq) (STARVATION_LIMIT && (rq)->expired_timestamp && \ + time_after_eq(jiffies, (rq)->expired_timestamp+STARVATION_LIMIT)) /* * This function gets called by the timer code, with HZ frequency.
@@ -1193,6 +1227,10 @@ int cpu = smp_processor_id(); runqueue_t *rq = this_rq(); task_t *p = current; + int array_switch = EXPIRED_STARVING(rq); /* sampled once, before FORCE_SWITCH may arm expired_timestamp below */ + + if (FORCE_SWITCH && !rq->expired_timestamp) + rq->expired_timestamp = jiffies - (STARVATION_LIMIT / 2); if (rcu_pending(cpu)) rcu_check_callbacks(cpu, user_ticks); @@ -1242,23 +1280,52 @@ /* put it at the end of the queue: */ dequeue_task(p, rq->active); - enqueue_task(p, rq->active); + enqueue_task(p, array_switch ? rq->expired : rq->active); + } else if (array_switch) { + if (!rq->expired_timestamp) + rq->expired_timestamp = jiffies; + dequeue_task(p, rq->active); + enqueue_task(p, rq->expired); } goto out; } - if (!--p->time_slice) { + if (unlikely(!--p->time_slice)) { dequeue_task(p, rq->active); set_tsk_need_resched(p); p->prio = effective_prio(p); p->time_slice = task_timeslice(p); p->first_time_slice = 0; - if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) { + if (!TASK_INTERACTIVE(p) || array_switch) { if (!rq->expired_timestamp) rq->expired_timestamp = jiffies; enqueue_task(p, rq->expired); } else enqueue_task(p, rq->active); + goto out; + } + /* + * Prevent a too long timeslice allowing a task to monopolize + * the CPU. We do this by splitting up the timeslice into + * smaller pieces. + * + * Note: this does not mean the task's timeslices expire or + * get lost in any way, they just might be preempted by + * another task of equal priority. (one with higher + * priority would have preempted this task already.) We + * requeue this task to the end of the list on this priority + * level, which is in essence a round-robin of tasks with + * equal priority. + * + * If expired tasks are starving, switch arrays ASAP. + */ + if (!(p->time_slice % TIMESLICE_GRANULARITY) || array_switch) { + dequeue_task(p, rq->active); + set_tsk_need_resched(p); + p->prio = effective_prio(p); + enqueue_task(p, array_switch ? 
rq->expired : rq->active); + if (array_switch && !rq->expired_timestamp) + rq->expired_timestamp = jiffies; } out: spin_unlock(&rq->lock); @@ -1297,8 +1364,8 @@ rq = this_rq(); release_kernel_lock(prev); - prev->last_run = jiffies; spin_lock_irq(&rq->lock); + prev->last_run = jiffies; /* * if entering off of a kernel preemption go straight @@ -1680,7 +1747,7 @@ */ int task_prio(task_t *p) { - return p->prio - MAX_USER_RT_PRIO; + return p->prio - MAX_RT_PRIO; } /** --- linux-2.5.66.virgin/kernel/sysctl.c Thu Mar 27 14:23:58 2003 +++ linux-2.5.66.twiddle/kernel/sysctl.c Fri Mar 28 13:51:31 2003 @@ -57,6 +57,19 @@ extern int cad_pid; extern int pid_max; extern int sysctl_lower_zone_protection; +extern int min_timeslice; +extern int max_timeslice; +extern int child_penalty; +extern int parent_penalty; +extern int exit_weight; +extern int prio_bonus_ratio; +extern int interactive_delta; +extern int max_sleep_avg; +extern int starvation_limit; +extern int node_threshold; +extern int max_accel_slices; +extern int retard_prct_slice; +extern int force_switch; /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ static int maxolduid = 65535; @@ -114,6 +127,7 @@ static ctl_table kern_table[]; static ctl_table vm_table[]; +static ctl_table sched_table[]; #ifdef CONFIG_NET extern ctl_table net_table[]; #endif @@ -158,6 +172,7 @@ {CTL_FS, "fs", NULL, 0, 0555, fs_table}, {CTL_DEBUG, "debug", NULL, 0, 0555, debug_table}, {CTL_DEV, "dev", NULL, 0, 0555, dev_table}, + {CTL_SCHED, "sched", NULL, 0, 0555, sched_table}, {0} }; @@ -360,7 +375,50 @@ static ctl_table dev_table[] = { {0} -}; +}; + +static ctl_table sched_table[] = { + {SCHED_MAX_TIMESLICE, "max_timeslice", &max_timeslice, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, &one, NULL}, + {SCHED_MIN_TIMESLICE, "min_timeslice", &min_timeslice, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, &one, 
NULL}, + {SCHED_CHILD_PENALTY, "child_penalty", &child_penalty, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, &zero, NULL}, + {SCHED_PARENT_PENALTY, "parent_penalty", &parent_penalty, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, &zero, NULL}, + {SCHED_EXIT_WEIGHT, "exit_weight", &exit_weight, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, &zero, NULL}, + {SCHED_PRIO_BONUS_RATIO, "prio_bonus_ratio", &prio_bonus_ratio, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, &zero, NULL}, + {SCHED_INTERACTIVE_DELTA, "interactive_delta", &interactive_delta, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, &zero, NULL}, + {SCHED_MAX_SLEEP_AVG, "max_sleep_avg", &max_sleep_avg, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, &one, NULL}, + {SCHED_STARVATION_LIMIT, "starvation_limit", &starvation_limit, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, &zero, NULL}, + {SCHED_NODE_THRESHOLD, "node_threshold", &node_threshold, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, &one, NULL}, + {SCHED_MAX_ACCELERATION, "max_accel_slices", &max_accel_slices, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, &zero, NULL}, + {SCHED_NEG_FEEDBACK, "retard_prct_slice", &retard_prct_slice, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, &zero, &one_hundred}, + {SCHED_FORCE_SWITCH, "force_switch", &force_switch, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, &zero, &one}, + {0} +}; extern void init_irq_proc (void); --- linux-2.5.66.virgin/include/linux/sysctl.h Fri Mar 7 06:07:40 2003 +++ linux-2.5.66.twiddle/include/linux/sysctl.h Fri Mar 28 13:53:47 2003 @@ -66,7 +66,8 @@ CTL_DEV=7, /* Devices */ CTL_BUS=8, /* Busses */ CTL_ABI=9, /* Binary emulation */ - CTL_CPU=10 /* CPU stuff (speed scaling, etc) */ + 
CTL_CPU=10, /* CPU stuff (speed scaling, etc) */ + CTL_SCHED=11, /* scheduler tunables */ }; /* CTL_BUS names: */ @@ -157,6 +158,22 @@ VM_LOWER_ZONE_PROTECTION=20,/* Amount of protection of lower zones */ }; +/* Tunable scheduler parameters in /proc/sys/sched/ */ +enum { + SCHED_MIN_TIMESLICE=1, /* minimum process timeslice */ + SCHED_MAX_TIMESLICE=2, /* maximum process timeslice */ + SCHED_CHILD_PENALTY=3, /* penalty on fork to child */ + SCHED_PARENT_PENALTY=4, /* penalty on fork to parent */ + SCHED_EXIT_WEIGHT=5, /* penalty to parent of CPU hog child */ + SCHED_PRIO_BONUS_RATIO=6, /* percent of max prio given as bonus */ + SCHED_INTERACTIVE_DELTA=7, /* delta used to scale interactivity */ + SCHED_MAX_SLEEP_AVG=8, /* maximum sleep avg attainable */ + SCHED_STARVATION_LIMIT=9, /* no re-active if expired is starved */ + SCHED_NODE_THRESHOLD=10, /* NUMA node rebalance threshold */ + SCHED_MAX_ACCELERATION=11, /* maximum bonus slices per activation */ + SCHED_NEG_FEEDBACK=12, /* percent slice onus per activation */ + SCHED_FORCE_SWITCH=13, /* force switch to expired array */ +}; /* CTL_NET names: */ enum