--- linux-2.6.15/include/linux/sched.h.org	Tue Jan  3 09:26:50 2006
+++ linux-2.6.15/include/linux/sched.h	Sat Jan  7 14:45:37 2006
@@ -701,8 +701,8 @@
 
 	unsigned short ioprio;
 
-	unsigned long sleep_avg;
-	unsigned long long timestamp, last_ran;
+	unsigned long sleep_avg, slice_avg;
+	unsigned long long timestamp, last_ran, last_slice;
 	unsigned long long sched_time;	/* sched_clock time spent running */
 	int activated;
 
--- linux-2.6.15/kernel/sched.c.org	Sat Jan  7 16:22:13 2006
+++ linux-2.6.15/kernel/sched.c	Tue Jan 10 13:59:44 2006
@@ -127,6 +127,20 @@
 	(NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \
 		MAX_SLEEP_AVG)
 
+/*
+ * If a task's sleep_avg strays too far from its slice_avg, the task
+ * is using more cpu than its sleep_avg would indicate.  When such a
+ * disparity is detected, prevent additional sleep time from being
+ * added to the existing imbalance, and increase the rate at which
+ * sleep_avg is consumed.
+ */
+#define SLEEP_AVG_IMBALANCE(p) \
+	((p)->sleep_avg > (p)->slice_avg + (NS_MAX_SLEEP_AVG/10))
+
+#define CPU_PENALTY(p) \
+	(NS_TO_JIFFIES(min((p)->sleep_avg, (p)->slice_avg)) * MAX_BONUS / \
+		MAX_SLEEP_AVG)
+
 #define GRANULARITY	(10 * HZ / 1000 ? : 1)
 
 #ifdef CONFIG_SMP
@@ -744,7 +758,7 @@
 	else
 		sleep_time = (unsigned long)__sleep_time;
 
-	if (likely(sleep_time > 0)) {
+	if (likely(sleep_time > 0 && !SLEEP_AVG_IMBALANCE(p))) {
 		/*
 		 * User tasks that sleep a long time are categorised as
 		 * idle and will get just interactive status to stay active &
@@ -1353,7 +1367,7 @@
 
 out_activate:
 #endif /* CONFIG_SMP */
-	if (old_state == TASK_UNINTERRUPTIBLE) {
+	if (old_state & TASK_UNINTERRUPTIBLE) {
 		rq->nr_uninterruptible--;
 		/*
 		 * Tasks on involuntary sleep don't earn
@@ -1368,7 +1382,7 @@
 	 * sleep is handled in a priority-neutral manner, no priority
 	 * boost and no penalty.)
 	 */
-	if (old_state & TASK_NONINTERACTIVE)
+	if (old_state & TASK_NONINTERACTIVE || SLEEP_AVG_IMBALANCE(p))
 		__activate_task(p, rq);
 	else
 		activate_task(p, rq, cpu == this_cpu);
@@ -1492,6 +1506,8 @@
 	 */
 	p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) *
		CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
+	p->slice_avg = NS_MAX_SLEEP_AVG;
+	p->last_slice = sched_clock();
 
 	p->prio = effective_prio(p);
 
@@ -2679,6 +2695,7 @@
 	if ((p->policy == SCHED_RR) && !--p->time_slice) {
 		p->time_slice = task_timeslice(p);
 		p->first_time_slice = 0;
+		p->last_slice = now;
 		set_tsk_need_resched(p);
 
 		/* put it at the end of the queue: */
@@ -2687,12 +2704,33 @@
 		goto out_unlock;
 	}
 	if (!--p->time_slice) {
+		unsigned long long nsecs = now - p->last_slice;
+		unsigned long idle, ticks;
+		int w = 10;
+
 		dequeue_task(p, rq->active);
 		set_tsk_need_resched(p);
 		p->prio = effective_prio(p);
 		p->time_slice = task_timeslice(p);
 		p->first_time_slice = 0;
 
+		if (nsecs > ~0UL)
+			nsecs = ~0UL;
+		ticks = NS_TO_JIFFIES((unsigned long) nsecs);
+		if (ticks < p->time_slice)
+			ticks = p->time_slice;
+		idle = 100 - (100 * p->time_slice / ticks);
+		p->slice_avg /= NS_MAX_SLEEP_AVG / 100;
+		/*
+		 * If the task is lowering its cpu usage, speed up the
+		 * effect on slice_avg so we don't over-throttle.
+		 */
+		if (idle > p->slice_avg + 10)
+			w -= (100 * p->slice_avg / idle) / 10;
+		p->slice_avg = (w * p->slice_avg + idle) / (w + 1);
+		p->slice_avg *= NS_MAX_SLEEP_AVG / 100;
+		p->last_slice = now;
+
 		if (!rq->expired_timestamp)
 			rq->expired_timestamp = jiffies;
 		if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) {
@@ -2996,7 +3034,7 @@
 	 * Tasks charged proportionately less run_time at high sleep_avg to
 	 * delay them losing their interactive status
 	 */
-	run_time /= (CURRENT_BONUS(prev) ? : 1);
+	run_time /= (CPU_PENALTY(prev) ? : 1);
 
 	spin_lock_irq(&rq->lock);
 
@@ -3010,7 +3048,7 @@
 			unlikely(signal_pending(prev))))
 			prev->state = TASK_RUNNING;
 		else {
-			if (prev->state == TASK_UNINTERRUPTIBLE)
+			if (prev->state & TASK_UNINTERRUPTIBLE)
 				rq->nr_uninterruptible++;
 			deactivate_task(prev, rq);
 		}
@@ -3095,6 +3133,12 @@
 	prev->sleep_avg -= run_time;
 	if ((long)prev->sleep_avg <= 0)
 		prev->sleep_avg = 0;
+	/*
+	 * Enable detection of the beginning of a slice at tick time.
+	 */
+	if (!rt_task(next) && !(next->time_slice % DEF_TIMESLICE))
+		next->last_slice = now;
+
 	prev->timestamp = prev->last_ran = now;
 
 	sched_info_switch(prev, next);
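
For readers following the arithmetic: at each slice expiry the patch converts
the wall-clock time the slice took to consume into an idle percentage and
folds it into slice_avg as an integer weighted moving average (weight w = 10
normally, reduced when cpu usage is falling so the throttle releases
quickly); SLEEP_AVG_IMBALANCE() then denies further sleep credit to any task
whose sleep_avg runs more than NS_MAX_SLEEP_AVG/10 ahead of slice_avg.  The
standalone program below is a minimal userspace sketch of that update, not
kernel code: the constants (a 100-tick slice, NS_MAX_SLEEP_AVG of 1e9 ns) are
illustrative assumptions, and the nsec clamping and NS_TO_JIFFIES conversion
are elided since the input here is already in ticks.

#include <stdio.h>

#define NS_MAX_SLEEP_AVG	1000000000UL	/* assumption: 1s in ns */
#define TIME_SLICE		100UL		/* assumption: ticks per slice */

/*
 * One slice_avg update, mirroring the !--p->time_slice branch of
 * scheduler_tick() above.  'ticks' is the wall-clock duration, in
 * ticks, that the task took to consume its timeslice.
 */
static unsigned long update_slice_avg(unsigned long slice_avg,
				      unsigned long ticks)
{
	unsigned long idle;
	int w = 10;

	if (ticks < TIME_SLICE)
		ticks = TIME_SLICE;
	/* percentage of the interval spent not running */
	idle = 100 - (100 * TIME_SLICE / ticks);
	/* scale slice_avg from ns down to a 0..100 percentage */
	slice_avg /= NS_MAX_SLEEP_AVG / 100;
	/* cpu usage dropping: shrink the weight given to history */
	if (idle > slice_avg + 10)
		w -= (100 * slice_avg / idle) / 10;
	slice_avg = (w * slice_avg + idle) / (w + 1);
	/* scale back up to ns */
	return slice_avg * (NS_MAX_SLEEP_AVG / 100);
}

int main(void)
{
	unsigned long slice_avg = NS_MAX_SLEEP_AVG;	/* value set at fork */
	int i;

	/* cpu hog: each slice consumed in one slice of wall clock, 0% idle */
	for (i = 0; i < 10; i++) {
		slice_avg = update_slice_avg(slice_avg, TIME_SLICE);
		printf("hog  %2d: slice_avg = %lu\n", i, slice_avg);
	}
	/* the task backs off to 90% idle; reduced w speeds recovery */
	for (i = 0; i < 5; i++) {
		slice_avg = update_slice_avg(slice_avg, 10 * TIME_SLICE);
		printf("idle %2d: slice_avg = %lu\n", i, slice_avg);
	}
	return 0;
}

Run, the first loop shows a pure cpu hog decaying slice_avg by roughly 1/11
per expiry, while the second shows w shrinking to about 6 once the task goes
90% idle, so slice_avg climbs back within a few slices -- the
"don't over-throttle" case the in-line comment describes.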