--- kernel/sched.c.org	Thu Apr 17 05:38:40 2003
+++ kernel/sched.c	Fri Apr 18 13:35:40 2003
@@ -66,14 +66,15 @@
  */
 #define MIN_TIMESLICE		( 10 * HZ / 1000)
 #define MAX_TIMESLICE		(200 * HZ / 1000)
-#define CHILD_PENALTY		50
+#define CHILD_PENALTY		75
 #define PARENT_PENALTY		100
 #define EXIT_WEIGHT		3
 #define PRIO_BONUS_RATIO	25
 #define INTERACTIVE_DELTA	2
-#define MAX_SLEEP_AVG		(10*HZ)
-#define STARVATION_LIMIT	(10*HZ)
+#define MAX_SLEEP_AVG		(10 * HZ)
+#define STARVATION_LIMIT	(MAX_SLEEP_AVG / 2)
 #define NODE_THRESHOLD		125
+#define TIMESLICE_GRANULARITY	(HZ/20 ?: 1)
 
 /*
  * If a task is 'interactive' then we reinsert it in the active
@@ -124,12 +125,17 @@
  * task_timeslice() is the interface that is used by the scheduler.
  */
 
-#define BASE_TIMESLICE(p) (MIN_TIMESLICE + \
-	((MAX_TIMESLICE - MIN_TIMESLICE) * (MAX_PRIO-1-(p)->static_prio)/(MAX_USER_PRIO - 1)))
+#define BASE_TIMESLICE(p) \
+	(MAX_TIMESLICE * (MAX_PRIO-(p)->static_prio)/MAX_USER_PRIO)
 
 static inline unsigned int task_timeslice(task_t *p)
 {
-	return BASE_TIMESLICE(p);
+	unsigned int time_slice = BASE_TIMESLICE(p);
+
+	if (time_slice < MIN_TIMESLICE)
+		time_slice = MIN_TIMESLICE;
+
+	return time_slice;
 }
 
 /*
@@ -279,6 +285,7 @@
  */
 static inline void dequeue_task(struct task_struct *p, prio_array_t *array)
 {
+	p->array = NULL;
 	array->nr_active--;
 	list_del(&p->run_list);
 	if (list_empty(array->queue + p->prio))
@@ -340,13 +347,16 @@
  * Update all the scheduling statistics stuff. (sleep average
  * calculation, priority modifiers, etc.)
  */
+#define MAY_BACKBOOST \
+	(!in_interrupt() && !TASK_NICE(current) && !TASK_NICE(p))
+
 static inline int activate_task(task_t *p, runqueue_t *rq)
 {
 	long sleep_time = jiffies - p->last_run - 1;
 	int requeue_waker = 0;
 
 	if (sleep_time > 0) {
-		int sleep_avg;
+		int sleep_avg = p->sleep_avg;
 
 		/*
 		 * This code gives a bonus to interactive tasks.
@@ -356,7 +366,7 @@
 		 * spends sleeping, the higher the average gets - and the
 		 * higher the priority boost gets as well.
 		 */
-		sleep_avg = p->sleep_avg + sleep_time;
+		sleep_avg += min(sleep_time, (long) p->time_slice);
 
 		/*
 		 * 'Overflow' bonus ticks go to the waker as well, so the
@@ -364,8 +374,23 @@
 		 * boosting tasks that are related to maximum-interactive
 		 * tasks.
 		 */
-		if (sleep_avg > MAX_SLEEP_AVG)
+		if (sleep_avg > MAX_SLEEP_AVG) {
+			if (MAY_BACKBOOST) {
+#if 0
+				printk(KERN_DEBUG "%lu: %d boosted %d by %d\n",
+					jiffies, p->pid, current->pid, sleep_avg-MAX_SLEEP_AVG);
+#endif
+				sleep_avg += current->sleep_avg - MAX_SLEEP_AVG;
+				if (sleep_avg > MAX_SLEEP_AVG)
+					sleep_avg = MAX_SLEEP_AVG;
+
+				if (current->sleep_avg != sleep_avg) {
+					current->sleep_avg = sleep_avg;
+					requeue_waker = 1;
+				}
+			}
 			sleep_avg = MAX_SLEEP_AVG;
+		}
 		if (p->sleep_avg != sleep_avg) {
 			p->sleep_avg = sleep_avg;
 			p->prio = effective_prio(p);
@@ -381,11 +406,10 @@
  */
 static inline void deactivate_task(struct task_struct *p, runqueue_t *rq)
 {
-	nr_running_dec(rq);
+	dequeue_task(p, p->array);
 	if (p->state == TASK_UNINTERRUPTIBLE)
 		rq->nr_uninterruptible++;
-	dequeue_task(p, p->array);
-	p->array = NULL;
+	nr_running_dec(rq);
 }
 
 /*
@@ -569,7 +593,10 @@
 	 * from forking tasks that are max-interactive.
 	 */
 	current->sleep_avg = current->sleep_avg * PARENT_PENALTY / 100;
-	p->sleep_avg = p->sleep_avg * CHILD_PENALTY / 100;
+	if (likely(current->parent->pid > 1))
+		p->sleep_avg = current->sleep_avg * CHILD_PENALTY / 100;
+	else
+		p->sleep_avg = current->sleep_avg = MAX_SLEEP_AVG;
 	p->prio = effective_prio(p);
 	set_task_cpu(p, smp_processor_id());
 
@@ -596,22 +623,20 @@
  */
 void sched_exit(task_t * p)
 {
-	unsigned long flags;
-
-	local_irq_save(flags);
 	if (p->first_time_slice) {
 		p->parent->time_slice += p->time_slice;
 		if (unlikely(p->parent->time_slice > MAX_TIMESLICE))
 			p->parent->time_slice = MAX_TIMESLICE;
 	}
-	local_irq_restore(flags);
 	/*
 	 * If the child was a (relative-) CPU hog then decrease
 	 * the sleep_avg of the parent as well.
 	 */
-	if (p->sleep_avg < p->parent->sleep_avg)
+#if 0
+	if (p->parent->pid > 1 && p->sleep_avg < p->parent->sleep_avg)
 		p->parent->sleep_avg = (p->parent->sleep_avg * EXIT_WEIGHT +
-			p->sleep_avg) / (EXIT_WEIGHT + 1);
+			p->sleep_avg) / MAX_USER_PRIO;
+#endif
 }
 
 /**
@@ -1161,14 +1186,29 @@
  *
  * To guarantee that this does not starve expired tasks we ignore the
  * interactivity of a task if the first expired task had to wait more
- * than a 'reasonable' amount of time. This deadline timeout is
- * load-dependent, as the frequency of array switched decreases with
- * increasing number of running tasks:
+ * than a 'reasonable' amount of time.
  */
 #define EXPIRED_STARVING(rq) \
-	(STARVATION_LIMIT && ((rq)->expired_timestamp && \
-		(jiffies - (rq)->expired_timestamp >= \
-			STARVATION_LIMIT * ((rq)->nr_running) + 1)))
+	(STARVATION_LIMIT && (rq)->expired_timestamp && \
+		time_after(jiffies, (rq)->expired_timestamp + STARVATION_LIMIT))
+
+/*
+ * Scale the cpu usage penalty for tasks which are not niced to below
+ * zero by their nice value and interactivity.  Simply because a task
+ * has been nice recently does NOT mean it should be allowed 3 seconds
+ * of cpu before it will be expired.
+ */
+static inline void sleep_avg_tick(task_t *p)
+{
+	int bias = 1, nice = TASK_NICE(p);
+
+	if (nice >= 0)
+		bias += p->static_prio - p->prio + nice - INTERACTIVE_DELTA;
+	if (bias <= 0)
+		bias = 1;
+	if (p->sleep_avg >= bias)
+		p->sleep_avg -= bias;
+}
 
 /*
  * This function gets called by the timer code, with HZ frequency.
@@ -1203,12 +1243,13 @@
 	kstat_cpu(cpu).cpustat.user += user_ticks;
 	kstat_cpu(cpu).cpustat.system += sys_ticks;
 
+	spin_lock(&rq->lock);
 	/* Task might have expired already, but not scheduled off yet */
-	if (p->array != rq->active) {
-		set_tsk_need_resched(p);
-		return;
+	if (p->array != rq->active || p->state != TASK_RUNNING) {
+		if (p->state > TASK_UNINTERRUPTIBLE)
+			set_tsk_need_resched(p);
+		goto out;
 	}
-	spin_lock(&rq->lock);
 	/*
 	 * The task was running during this tick - update the
@@ -1217,8 +1258,6 @@
 	 * it possible for interactive tasks to use up their
 	 * timeslices at their highest priority levels.
 	 */
-	if (p->sleep_avg)
-		p->sleep_avg--;
 	if (unlikely(rt_task(p))) {
 		/*
 		 * RR tasks need a special form of timeslice management.
@@ -1235,6 +1274,7 @@
 		}
 		goto out;
 	}
+	sleep_avg_tick(p);
 	if (!--p->time_slice) {
 		dequeue_task(p, rq->active);
 		set_tsk_need_resched(p);
@@ -1248,6 +1288,26 @@
 			enqueue_task(p, rq->expired);
 		} else
 			enqueue_task(p, rq->active);
+		goto out;
+	}
+	/*
+	 * Prevent a too long timeslice allowing a task to monopolize
+	 * the CPU. We do this by splitting up the timeslice into
+	 * smaller pieces.
+	 *
+	 * Note: this does not mean the task's timeslices expire or
+	 * get lost in any way, they just might be preempted by
+	 * another task of equal priority. (one with higher
+	 * priority would have preempted this task already.) We
+	 * requeue this task to the end of the list on this priority
+	 * level, which is in essence a round-robin of tasks with
+	 * equal priority.
+	 */
+	if (!(p->time_slice % TIMESLICE_GRANULARITY)) {
+		dequeue_task(p, rq->active);
+		set_tsk_need_resched(p);
+		p->prio = effective_prio(p);
+		enqueue_task(p, rq->active);
 	}
 out:
 	spin_unlock(&rq->lock);
@@ -1669,7 +1729,7 @@
  */
 int task_prio(task_t *p)
 {
-	return p->prio - MAX_USER_RT_PRIO;
+	return p->prio - MAX_RT_PRIO;
 }
 
 /**
@@ -2525,6 +2585,7 @@
 	rq = this_rq();
 	rq->curr = current;
 	rq->idle = current;
+	current->sleep_avg = MAX_SLEEP_AVG;
 	set_task_cpu(current, smp_processor_id());
 	wake_up_forked_process(current);
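For reference, below is a minimal user-space sketch of the new timeslice calculation introduced by the BASE_TIMESLICE()/task_timeslice() change above. It is illustration only: the constants mirror this patch and 2.5-era sched.c, while HZ=1000 and the nice-to-static_prio mapping (120 + nice) are assumptions made for the example, not something the patch defines.

/*
 * Stand-alone sketch of the patched timeslice calculation.
 * Assumptions (not part of the patch): HZ=1000, static_prio = 120 + nice.
 */
#include <stdio.h>

#define HZ		1000
#define MIN_TIMESLICE	( 10 * HZ / 1000)
#define MAX_TIMESLICE	(200 * HZ / 1000)
#define MAX_RT_PRIO	100
#define MAX_PRIO	(MAX_RT_PRIO + 40)
#define MAX_USER_PRIO	(MAX_PRIO - MAX_RT_PRIO)

/* Same shape as the patched BASE_TIMESLICE(): scale by static priority only. */
static unsigned int base_timeslice(int static_prio)
{
	return MAX_TIMESLICE * (MAX_PRIO - static_prio) / MAX_USER_PRIO;
}

/* Same shape as the patched task_timeslice(): clamp to MIN_TIMESLICE. */
static unsigned int task_timeslice(int static_prio)
{
	unsigned int time_slice = base_timeslice(static_prio);

	if (time_slice < MIN_TIMESLICE)
		time_slice = MIN_TIMESLICE;
	return time_slice;
}

int main(void)
{
	int nice;

	for (nice = -20; nice <= 19; nice++)
		printf("nice %3d -> timeslice %3u ticks\n",
		       nice, task_timeslice(120 + nice));
	return 0;
}

With those assumptions a nice-0 task gets a 100-tick (100ms) slice, nice -20 gets the full 200ms, and the most heavily niced tasks are clamped at the 10ms MIN_TIMESLICE floor instead of being computed from the old MIN_TIMESLICE offset.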