--- kernel/sched.c.org	Thu Apr 17 05:38:40 2003
+++ kernel/sched.c	Fri Apr 18 13:35:40 2003
@@ -66,14 +66,15 @@
  */
 #define MIN_TIMESLICE		( 10 * HZ / 1000)
 #define MAX_TIMESLICE		(200 * HZ / 1000)
-#define CHILD_PENALTY		50
+#define CHILD_PENALTY		75
 #define PARENT_PENALTY		100
 #define EXIT_WEIGHT		3
 #define PRIO_BONUS_RATIO	25
 #define INTERACTIVE_DELTA	2
-#define MAX_SLEEP_AVG		(10*HZ)
-#define STARVATION_LIMIT	(10*HZ)
+#define MAX_SLEEP_AVG		(10 * HZ)
+#define STARVATION_LIMIT	(MAX_SLEEP_AVG / 2)
 #define NODE_THRESHOLD		125
+#define TIMESLICE_GRANULARITY	(HZ/20 ?: 1)
 
 /*
  * If a task is 'interactive' then we reinsert it in the active
@@ -124,12 +125,17 @@
  * task_timeslice() is the interface that is used by the scheduler.
  */
 
-#define BASE_TIMESLICE(p) (MIN_TIMESLICE + \
-	((MAX_TIMESLICE - MIN_TIMESLICE) * (MAX_PRIO-1-(p)->static_prio)/(MAX_USER_PRIO - 1)))
+#define BASE_TIMESLICE(p) \
+	(MAX_TIMESLICE * (MAX_PRIO-(p)->static_prio)/MAX_USER_PRIO)
 
 static inline unsigned int task_timeslice(task_t *p)
 {
-	return BASE_TIMESLICE(p);
+	unsigned int time_slice = BASE_TIMESLICE(p);
+
+	if (time_slice < MIN_TIMESLICE)
+		time_slice = MIN_TIMESLICE;
+
+	return time_slice;
 }
 
 /*
@@ -279,6 +285,7 @@
  */
 static inline void dequeue_task(struct task_struct *p, prio_array_t *array)
 {
+	p->array = NULL;
 	array->nr_active--;
 	list_del(&p->run_list);
 	if (list_empty(array->queue + p->prio))
@@ -340,13 +347,16 @@
  * Update all the scheduling statistics stuff. (sleep average
  * calculation, priority modifiers, etc.)
  */
+#define MAY_BACKBOOST \
+	(!in_interrupt() && !TASK_NICE(current) && !TASK_NICE(p))
+
 static inline int activate_task(task_t *p, runqueue_t *rq)
 {
 	long sleep_time = jiffies - p->last_run - 1;
 	int requeue_waker = 0;
 
 	if (sleep_time > 0) {
-		int sleep_avg;
+		int sleep_avg = p->sleep_avg;
 
 		/*
 		 * This code gives a bonus to interactive tasks.
@@ -356,7 +366,7 @@
 		 * spends sleeping, the higher the average gets - and the
 		 * higher the priority boost gets as well.
 		 */
-		sleep_avg = p->sleep_avg + sleep_time;
+		sleep_avg += min(sleep_time, (long) p->time_slice);
 
 		/*
 		 * 'Overflow' bonus ticks go to the waker as well, so the
@@ -364,8 +374,23 @@
 		 * boosting tasks that are related to maximum-interactive
 		 * tasks.
 		 */
-		if (sleep_avg > MAX_SLEEP_AVG)
+		if (sleep_avg > MAX_SLEEP_AVG) {
+			if (MAY_BACKBOOST) {
+#if 0
+				printk(KERN_DEBUG "%lu: %d boosted %d by %d\n",
+					jiffies, p->pid, current->pid, sleep_avg-MAX_SLEEP_AVG);
+#endif
+				sleep_avg += current->sleep_avg - MAX_SLEEP_AVG;
+				if (sleep_avg > MAX_SLEEP_AVG)
+					sleep_avg = MAX_SLEEP_AVG;
+
+				if (current->sleep_avg != sleep_avg) {
+					current->sleep_avg = sleep_avg;
+					requeue_waker = 1;
+				}
+			}
 			sleep_avg = MAX_SLEEP_AVG;
+		}
 		if (p->sleep_avg != sleep_avg) {
 			p->sleep_avg = sleep_avg;
 			p->prio = effective_prio(p);
@@ -381,11 +406,10 @@
  */
 static inline void deactivate_task(struct task_struct *p, runqueue_t *rq)
 {
-	nr_running_dec(rq);
+	dequeue_task(p, p->array);
 	if (p->state == TASK_UNINTERRUPTIBLE)
 		rq->nr_uninterruptible++;
-	dequeue_task(p, p->array);
-	p->array = NULL;
+	nr_running_dec(rq);
 }
 
 /*
@@ -569,7 +593,10 @@
 	 * from forking tasks that are max-interactive.
 	 */
 	current->sleep_avg = current->sleep_avg * PARENT_PENALTY / 100;
-	p->sleep_avg = p->sleep_avg * CHILD_PENALTY / 100;
+	if (likely(current->parent->pid > 1))
+		p->sleep_avg = current->sleep_avg * CHILD_PENALTY / 100;
+	else
+		p->sleep_avg = current->sleep_avg = MAX_SLEEP_AVG;
 	p->prio = effective_prio(p);
 	set_task_cpu(p, smp_processor_id());
 
@@ -596,22 +623,20 @@
  */
 void sched_exit(task_t * p)
 {
-	unsigned long flags;
-
-	local_irq_save(flags);
 	if (p->first_time_slice) {
 		p->parent->time_slice += p->time_slice;
 		if (unlikely(p->parent->time_slice > MAX_TIMESLICE))
 			p->parent->time_slice = MAX_TIMESLICE;
 	}
-	local_irq_restore(flags);
 	/*
 	 * If the child was a (relative-) CPU hog then decrease
 	 * the sleep_avg of the parent as well.
 	 */
-	if (p->sleep_avg < p->parent->sleep_avg)
+#if 0
+	if (p->parent->pid > 1 && p->sleep_avg < p->parent->sleep_avg)
 		p->parent->sleep_avg = (p->parent->sleep_avg * EXIT_WEIGHT +
-			p->sleep_avg) / (EXIT_WEIGHT + 1);
+			p->sleep_avg) / MAX_USER_PRIO;
+#endif
 }
 
 /**
@@ -1161,14 +1186,29 @@
  *
  * To guarantee that this does not starve expired tasks we ignore the
  * interactivity of a task if the first expired task had to wait more
- * than a 'reasonable' amount of time. This deadline timeout is
- * load-dependent, as the frequency of array switched decreases with
- * increasing number of running tasks:
+ * than a 'reasonable' amount of time.
  */
 #define EXPIRED_STARVING(rq) \
-	(STARVATION_LIMIT && ((rq)->expired_timestamp && \
-		(jiffies - (rq)->expired_timestamp >= \
-			STARVATION_LIMIT * ((rq)->nr_running) + 1)))
+	(STARVATION_LIMIT && (rq)->expired_timestamp && \
+		time_after(jiffies, (rq)->expired_timestamp + STARVATION_LIMIT))
+
+/*
+ * Scale the cpu usage penalty for tasks which are not niced to below
+ * zero by their nice value and interactivity.  Simply because a task
+ * has been nice recently does NOT mean it should be allowed 3 seconds
+ * of cpu before it will be expired.
+ */
+static inline void sleep_avg_tick(task_t *p)
+{
+	int bias = 1, nice = TASK_NICE(p);
+
+	if (nice >= 0)
+		bias += p->static_prio - p->prio + nice - INTERACTIVE_DELTA;
+	if (bias <= 0)
+		bias = 1;
+	if (p->sleep_avg >= bias)
+		p->sleep_avg -= bias;
+}
 
 /*
  * This function gets called by the timer code, with HZ frequency.
@@ -1203,12 +1243,13 @@
 	kstat_cpu(cpu).cpustat.user += user_ticks;
 	kstat_cpu(cpu).cpustat.system += sys_ticks;
 
+	spin_lock(&rq->lock);
 	/* Task might have expired already, but not scheduled off yet */
-	if (p->array != rq->active) {
-		set_tsk_need_resched(p);
-		return;
+	if (p->array != rq->active || p->state != TASK_RUNNING) {
+		if (p->state > TASK_UNINTERRUPTIBLE)
+			set_tsk_need_resched(p);
+		goto out;
 	}
-	spin_lock(&rq->lock);
 	/*
 	 * The task was running during this tick - update the
@@ -1217,8 +1258,6 @@
 	 * it possible for interactive tasks to use up their
 	 * timeslices at their highest priority levels.
 	 */
-	if (p->sleep_avg)
-		p->sleep_avg--;
 	if (unlikely(rt_task(p))) {
 		/*
 		 * RR tasks need a special form of timeslice management.
@@ -1235,6 +1274,7 @@
 		}
 		goto out;
 	}
+	sleep_avg_tick(p);
 	if (!--p->time_slice) {
 		dequeue_task(p, rq->active);
 		set_tsk_need_resched(p);
@@ -1248,6 +1288,26 @@
 			enqueue_task(p, rq->expired);
 		} else
 			enqueue_task(p, rq->active);
+		goto out;
+	}
+	/*
+	 * Prevent a too long timeslice allowing a task to monopolize
+	 * the CPU. We do this by splitting up the timeslice into
+	 * smaller pieces.
+	 *
+	 * Note: this does not mean the task's timeslices expire or
+	 * get lost in any way, they just might be preempted by
+	 * another task of equal priority. (one with higher
+	 * priority would have preempted this task already.) We
+	 * requeue this task to the end of the list on this priority
+	 * level, which is in essence a round-robin of tasks with
+	 * equal priority.
+	 */
+	if (!(p->time_slice % TIMESLICE_GRANULARITY)) {
+		dequeue_task(p, rq->active);
+		set_tsk_need_resched(p);
+		p->prio = effective_prio(p);
+		enqueue_task(p, rq->active);
 	}
 out:
 	spin_unlock(&rq->lock);
@@ -1669,7 +1729,7 @@
  */
 int task_prio(task_t *p)
 {
-	return p->prio - MAX_USER_RT_PRIO;
+	return p->prio - MAX_RT_PRIO;
 }
 
 /**
@@ -2525,6 +2585,7 @@
 	rq = this_rq();
 	rq->curr = current;
 	rq->idle = current;
+	current->sleep_avg = MAX_SLEEP_AVG;
 	set_task_cpu(current, smp_processor_id());
 	wake_up_forked_process(current);
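For reference, below is a minimal user-space sketch of the new timeslice calculation introduced by the BASE_TIMESLICE()/task_timeslice() change above. It is illustration only: the constants mirror this patch and 2.5-era sched.c, while HZ=1000 and the nice-to-static_prio mapping (120 + nice) are assumptions made for the example, not something the patch defines.

/*
 * Stand-alone sketch of the patched timeslice calculation.
 * Assumptions (not part of the patch): HZ=1000, static_prio = 120 + nice.
 */
#include <stdio.h>

#define HZ		1000
#define MIN_TIMESLICE	( 10 * HZ / 1000)
#define MAX_TIMESLICE	(200 * HZ / 1000)
#define MAX_RT_PRIO	100
#define MAX_PRIO	(MAX_RT_PRIO + 40)
#define MAX_USER_PRIO	(MAX_PRIO - MAX_RT_PRIO)

/* Same shape as the patched BASE_TIMESLICE(): scale by static priority only. */
static unsigned int base_timeslice(int static_prio)
{
	return MAX_TIMESLICE * (MAX_PRIO - static_prio) / MAX_USER_PRIO;
}

/* Same shape as the patched task_timeslice(): clamp to MIN_TIMESLICE. */
static unsigned int task_timeslice(int static_prio)
{
	unsigned int time_slice = base_timeslice(static_prio);

	if (time_slice < MIN_TIMESLICE)
		time_slice = MIN_TIMESLICE;
	return time_slice;
}

int main(void)
{
	int nice;

	for (nice = -20; nice <= 19; nice++)
		printf("nice %3d -> timeslice %3u ticks\n",
		       nice, task_timeslice(120 + nice));
	return 0;
}

With those assumptions a nice-0 task gets a 100-tick (100ms) slice, nice -20 gets the full 200ms, and the most heavily niced tasks are clamped at the 10ms MIN_TIMESLICE floor instead of being computed from the old MIN_TIMESLICE offset.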