Re: [patch] schedule_timeout()

Andrea Arcangeli (andrea@e-mind.com)
Mon, 19 Oct 1998 23:39:17 +0200 (CEST)


On Mon, 19 Oct 1998, Andrea Arcangeli wrote:

>I should have cleaned my tree fine now. So now I produced a diff that
>works fine here (btw, I just fixed all problems I got yesterday night).

Now I fixed also the timer offset problem (thanks to Finn btw).

I just converted my whole kernel to use schedule_timeout(timeout) and
interruptible_sleep_on_timeout(&wait, TIMEOUT), and I am writing this
while jiffies has just wrapped on my machine (it started from -120*HZ).
Everything seems really stable here.

The only thing that I could eventually do now would be to check that no
bogus device driver sets timer->expires to -1 or to 0. Doing that in the
stock kernel would, in most cases, either delay the timer a _lot_ or
execute it ASAP. With my code it instead depends on the sign of
jiffies. To catch most of these bugs I only need to run in the future
((signed) jiffies < 0).

The conversion to the new _timeout() functions has been trivial most of
the time, and clean in all the other, more clever, cases.

Now I am starting to use my code all the time because it seems really
stable here. I'll let you know if there are any troubles with it.

My new whole latest jiffies update is here:

ftp://e-mind.com/pub/linux/kernel-patches/jiffies-11...

The patch is against linux-2_1_125 + my latest parport update sent to
Linus some week ago + my latest lp/StylusColor patch posted on
linux-kernel some days ago. It should work fine on the stock 2.1.125
though.

To allow more people to complain, I put the most interesting part of
the patch here too:

Index: linux/kernel/sched.c
diff -u linux/kernel/sched.c:1.1.1.1 linux/kernel/sched.c:1.1.1.1.14.13
--- linux/kernel/sched.c:1.1.1.1 Fri Oct 2 19:22:39 1998
+++ linux/kernel/sched.c Mon Oct 19 22:46:45 1998
@@ -90,7 +90,7 @@

extern void mem_use(void);

-unsigned long volatile jiffies=0;
+unsigned long volatile jiffies=JIFFIES_OFFSET;

/*
* Init task must be ok at boot for the ix86 as we will check its signals
@@ -248,7 +248,6 @@
{
struct task_struct * p = (struct task_struct *) __data;

- p->timeout = 0;
wake_up_process(p);
}

@@ -329,11 +328,15 @@
struct timer_list *vec[TVR_SIZE];
};

-static struct timer_vec tv5 = { 0 };
-static struct timer_vec tv4 = { 0 };
-static struct timer_vec tv3 = { 0 };
-static struct timer_vec tv2 = { 0 };
-static struct timer_vec_root tv1 = { 0 };
+static struct timer_vec tv5 = { (((JIFFIES_OFFSET - 1) >>
+ (TVR_BITS + TVN_BITS * 3)) + 1) & TVN_MASK };
+static struct timer_vec tv4 = { (((JIFFIES_OFFSET - 1) >>
+ (TVR_BITS + TVN_BITS * 2)) + 1) & TVN_MASK };
+static struct timer_vec tv3 = { (((JIFFIES_OFFSET - 1) >>
+ (TVR_BITS + TVN_BITS)) + 1) & TVN_MASK };
+static struct timer_vec tv2 = { (((JIFFIES_OFFSET - 1) >>
+ TVR_BITS) + 1) & TVN_MASK };
+static struct timer_vec_root tv1 = { JIFFIES_OFFSET & TVR_MASK };

static struct timer_vec * const tvecs[] = {
(struct timer_vec *)&tv1, &tv2, &tv3, &tv4, &tv5
@@ -341,7 +344,7 @@

#define NOOF_TVECS (sizeof(tvecs) / sizeof(tvecs[0]))

-static unsigned long timer_jiffies = 0;
+static unsigned long timer_jiffies = JIFFIES_OFFSET;

static inline void insert_timer(struct timer_list *timer,
struct timer_list **vec, int idx)
@@ -372,12 +375,12 @@
} else if (idx < 1 << (TVR_BITS + 3 * TVN_BITS)) {
int i = (expires >> (TVR_BITS + 2 * TVN_BITS)) & TVN_MASK;
insert_timer(timer, tv4.vec, i);
- } else if (expires < timer_jiffies) {
+ } else if ((signed long) idx < 0) {
/* can happen if you add a timer with expires == jiffies,
* or you set a timer to go off in the past
*/
insert_timer(timer, tv1.vec, tv1.index);
- } else if (idx < 0xffffffffUL) {
+ } else if (idx <= 0xffffffffUL) {
int i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK;
insert_timer(timer, tv5.vec, i);
} else {
@@ -445,6 +448,50 @@

#endif

+signed long schedule_timeout(signed long timeout)
+{
+ struct timer_list timer;
+ unsigned long expire;
+
+ /*
+ * PARANOID.
+ */
+ if (current->state == TASK_UNINTERRUPTIBLE)
+ printk(KERN_WARNING "schedule_timeout: task not interrutible "
+ "from %p\n", __builtin_return_address(0));
+ if (timeout < 0)
+ {
+ printk(KERN_ERR "schedule_timeout: wrong timeout value %lx "
+ "from %p\n", timeout, __builtin_return_address(0));
+ return timeout;
+ }
+
+ /*
+ * Here we start for real.
+ */
+ if (!timeout)
+ goto normal_schedule;
+
+ expire = timeout + jiffies;
+
+ init_timer(&timer);
+ timer.expires = expire;
+ timer.data = (unsigned long) current;
+ timer.function = process_timeout;
+
+ add_timer(&timer);
+ schedule();
+ del_timer(&timer);
+
+ timeout = expire - jiffies;
+
+ return timeout < 0 ? 0 : timeout;
+
+ normal_schedule:
+ schedule();
+ return 0;
+}
+
/*
* 'schedule()' is the scheduler function. It's a very simple and nice
* scheduler: it's not perfect, but certainly works for most things.
@@ -458,7 +505,6 @@
asmlinkage void schedule(void)
{
struct task_struct * prev, * next;
- unsigned long timeout;
int this_cpu;

prev = current;
@@ -481,16 +527,11 @@
prev->counter = prev->priority;
move_last_runqueue(prev);
}
- timeout = 0;
+
switch (prev->state) {
case TASK_INTERRUPTIBLE:
if (signal_pending(prev))
- goto makerunnable;
- timeout = prev->timeout;
- if (timeout && (timeout <= jiffies)) {
- prev->timeout = 0;
- timeout = 0;
- makerunnable:
+ {
prev->state = TASK_RUNNING;
break;
}
@@ -550,21 +591,9 @@
#endif

if (prev != next) {
- struct timer_list timer;
-
kstat.context_swtch++;
- if (timeout) {
- init_timer(&timer);
- timer.expires = timeout;
- timer.data = (unsigned long) prev;
- timer.function = process_timeout;
- add_timer(&timer);
- }
get_mmu_context(next);
switch_to(prev,next);
-
- if (timeout)
- del_timer(&timer);
}

spin_unlock(&scheduler_lock);
@@ -717,33 +746,55 @@
{
return __do_down(sem,TASK_INTERRUPTIBLE);
}
-

-static void FASTCALL(__sleep_on(struct wait_queue **p, int state));
-static void __sleep_on(struct wait_queue **p, int state)
-{
- unsigned long flags;
+#define SLEEP_ON_VAR \
+ unsigned long flags; \
struct wait_queue wait;

- current->state = state;
- wait.task = current;
- write_lock_irqsave(&waitqueue_lock, flags);
- __add_wait_queue(p, &wait);
+#define SLEEP_ON_HEAD \
+ wait.task = current; \
+ write_lock_irqsave(&waitqueue_lock, flags); \
+ __add_wait_queue(p, &wait); \
write_unlock(&waitqueue_lock);
- schedule();
- write_lock_irq(&waitqueue_lock);
- __remove_wait_queue(p, &wait);
+
+#define SLEEP_ON_TAIL \
+ write_lock_irq(&waitqueue_lock); \
+ __remove_wait_queue(p, &wait); \
write_unlock_irqrestore(&waitqueue_lock, flags);
-}

void interruptible_sleep_on(struct wait_queue **p)
{
- __sleep_on(p,TASK_INTERRUPTIBLE);
+ SLEEP_ON_VAR
+
+ current->state = TASK_INTERRUPTIBLE;
+
+ SLEEP_ON_HEAD
+ schedule();
+ SLEEP_ON_TAIL
}

+long interruptible_sleep_on_timeout(struct wait_queue **p, long timeout)
+{
+ SLEEP_ON_VAR
+
+ current->state = TASK_INTERRUPTIBLE;
+
+ SLEEP_ON_HEAD
+ timeout = schedule_timeout(timeout);
+ SLEEP_ON_TAIL
+
+ return timeout;
+}
+
void sleep_on(struct wait_queue **p)
{
- __sleep_on(p,TASK_UNINTERRUPTIBLE);
+ SLEEP_ON_VAR
+
+ current->state = TASK_UNINTERRUPTIBLE;
+
+ SLEEP_ON_HEAD
+ schedule();
+ SLEEP_ON_TAIL
}

void scheduling_functions_end_here(void) { }
@@ -803,7 +854,7 @@
break;
if (!(mask & timer_active))
continue;
- if (tp->expires > jiffies)
+ if (time_after(tp->expires, jiffies))
continue;
timer_active &= ~mask;
tp->fn();
@@ -1563,16 +1614,14 @@
return 0;
}

- expire = timespec_to_jiffies(&t) + (t.tv_sec || t.tv_nsec) + jiffies;
+ expire = timespec_to_jiffies(&t) + (t.tv_sec || t.tv_nsec);

- current->timeout = expire;
current->state = TASK_INTERRUPTIBLE;
- schedule();
+ expire = schedule_timeout(expire);

- if (expire > jiffies) {
+ if (expire) {
if (rmtp) {
- jiffies_to_timespec(expire - jiffies -
- (expire > jiffies + 1), &t);
+ jiffies_to_timespec(expire, &t);
if (copy_to_user(rmtp, &t, sizeof(struct timespec)))
return -EFAULT;
}
Index: linux/kernel/signal.c
diff -u linux/kernel/signal.c:1.1.1.1 linux/kernel/signal.c:1.1.1.1.12.4
--- linux/kernel/signal.c:1.1.1.1 Fri Oct 2 19:22:39 1998
+++ linux/kernel/signal.c Sun Oct 18 21:50:32 1998
@@ -712,6 +712,7 @@
sigset_t these;
struct timespec ts;
siginfo_t info;
+ long timeout = 0;

/* XXX: Don't preclude handling different sized sigset_t's. */
if (sigsetsize != sizeof(sigset_t))
@@ -738,22 +739,18 @@
if (!sig) {
/* None ready -- temporarily unblock those we're interested
in so that we'll be awakened when they arrive. */
- unsigned long expire;
sigset_t oldblocked = current->blocked;
sigandsets(&current->blocked, &current->blocked, &these);
recalc_sigpending(current);
spin_unlock_irq(&current->sigmask_lock);

- expire = ~0UL;
- if (uts) {
- expire = (timespec_to_jiffies(&ts)
- + (ts.tv_sec || ts.tv_nsec));
- expire += jiffies;
- }
- current->timeout = expire;
+ timeout = MAX_SCHEDULE_TIMEOUT;
+ if (uts)
+ timeout = (timespec_to_jiffies(&ts)
+ + (ts.tv_sec || ts.tv_nsec));

current->state = TASK_INTERRUPTIBLE;
- schedule();
+ timeout = schedule_timeout(timeout);

spin_lock_irq(&current->sigmask_lock);
sig = dequeue_signal(&these, &info);
@@ -770,10 +767,8 @@
}
} else {
ret = -EAGAIN;
- if (current->timeout != 0) {
- current->timeout = 0;
+ if (timeout)
ret = -EINTR;
- }
}

return ret;
Index: linux/fs/select.c
diff -u linux/fs/select.c:1.1.1.1 linux/fs/select.c:1.1.1.1.12.4
--- linux/fs/select.c:1.1.1.1 Fri Oct 2 19:22:35 1998
+++ linux/fs/select.c Mon Oct 19 21:40:15 1998
@@ -124,17 +124,17 @@
#define POLLOUT_SET (POLLWRBAND | POLLWRNORM | POLLOUT | POLLERR)
#define POLLEX_SET (POLLPRI)

-int do_select(int n, fd_set_buffer *fds, unsigned long timeout)
+int do_select(int n, fd_set_buffer *fds, long *timeout)
{
poll_table wait_table, *wait;
- int retval;
- int i;
+ int retval, i;
+ long __timeout = *timeout;

lock_kernel();

wait = NULL;
- current->timeout = timeout;
- if (timeout) {
+ if (__timeout)
+ {
struct poll_table_entry *entry = (struct poll_table_entry *)
__get_free_page(GFP_KERNEL);
if (!entry) {
@@ -190,17 +190,22 @@
}
}
wait = NULL;
- if (retval || !current->timeout || signal_pending(current))
+ if (retval || !__timeout || signal_pending(current))
break;
- schedule();
+ __timeout = schedule_timeout(__timeout);
}
current->state = TASK_RUNNING;

out:
- if (timeout) {
+ if (*timeout) {
free_wait(&wait_table);
free_page((unsigned long) wait_table.entry);
}
+
+ /*
+ * Up-to-date the caller timeout.
+ */
+ *timeout = __timeout;
out_nowait:
unlock_kernel();
return retval;
@@ -218,10 +223,10 @@
sys_select(int n, fd_set *inp, fd_set *outp, fd_set *exp, struct timeval *tvp)
{
fd_set_buffer *fds;
- unsigned long timeout;
+ long timeout;
int ret;

- timeout = ~0UL;
+ timeout = MAX_SCHEDULE_TIMEOUT;
if (tvp) {
time_t sec, usec;

@@ -232,8 +237,6 @@

timeout = ROUND_UP(usec, 1000000/HZ);
timeout += sec * (unsigned long) HZ;
- if (timeout)
- timeout += jiffies + 1;
}

ret = -ENOMEM;
@@ -253,12 +256,11 @@
zero_fd_set(n, fds->res_out);
zero_fd_set(n, fds->res_ex);

- ret = do_select(n, fds, timeout);
+ ret = do_select(n, fds, &timeout);

if (tvp && !(current->personality & STICKY_TIMEOUTS)) {
- unsigned long timeout = current->timeout - jiffies - 1;
time_t sec = 0, usec = 0;
- if ((long) timeout > 0) {
+ if (timeout) {
sec = timeout / HZ;
usec = timeout % HZ;
usec *= (1000000/HZ);
@@ -266,7 +268,6 @@
put_user(sec, &tvp->tv_sec);
put_user(usec, &tvp->tv_usec);
}
- current->timeout = 0;

if (ret < 0)
goto out;
@@ -287,7 +288,8 @@
return ret;
}

-static int do_poll(unsigned int nfds, struct pollfd *fds, poll_table *wait)
+static int do_poll(unsigned int nfds, struct pollfd *fds, poll_table *wait,
+ long timeout)
{
int count = 0;

@@ -321,15 +323,15 @@
}

wait = NULL;
- if (count || !current->timeout || signal_pending(current))
+ if (count || !timeout || signal_pending(current))
break;
- schedule();
+ timeout = schedule_timeout(timeout);
}
current->state = TASK_RUNNING;
return count;
}

-asmlinkage int sys_poll(struct pollfd * ufds, unsigned int nfds, int timeout)
+asmlinkage int sys_poll(struct pollfd * ufds, unsigned int nfds, long timeout)
{
int i, fdcount, err, size;
struct pollfd * fds, *fds1;
@@ -342,9 +344,9 @@
goto out;

if (timeout < 0)
- timeout = 0x7fffffff;
+ timeout = MAX_SCHEDULE_TIMEOUT;
else if (timeout)
- timeout = ((unsigned long)timeout*HZ+999)/1000+jiffies+1;
+ timeout = (timeout*HZ+999)/1000+1;

err = -ENOMEM;
if (timeout) {
@@ -366,9 +368,7 @@
if (copy_from_user(fds, ufds, size))
goto out_fds;

- current->timeout = timeout;
- fdcount = do_poll(nfds, fds, wait);
- current->timeout = 0;
+ fdcount = do_poll(nfds, fds, wait, timeout);

/* OK, now copy the revents fields back to user space. */
fds1 = fds;
Index: linux/include/linux/sched.h
diff -u linux/include/linux/sched.h:1.1.1.1.2.1 linux/include/linux/sched.h:1.1.1.1.2.1.2.10
--- linux/include/linux/sched.h:1.1.1.1.2.1 Mon Oct 5 18:21:46 1998
+++ linux/include/linux/sched.h Mon Oct 19 21:40:17 1998
@@ -23,6 +23,8 @@
#include <linux/capability.h>
#include <linux/securebits.h>

+#define JIFFIES_OFFSET (-120*HZ)
+
/*
* cloning flags:
*/
@@ -119,9 +121,10 @@
extern void show_state(void);
extern void trap_init(void);

+#define MAX_SCHEDULE_TIMEOUT (~0UL >> 1)
+extern signed long FASTCALL(schedule_timeout(signed long timeout));
asmlinkage void schedule(void);

-
/*
* Open file table structure
*/
@@ -258,7 +261,7 @@
struct task_struct **tarray_ptr;

struct wait_queue *wait_chldexit; /* for wait4() */
- unsigned long timeout, policy, rt_priority;
+ unsigned long policy, rt_priority;
unsigned long it_real_value, it_prof_value, it_virt_value;
unsigned long it_real_incr, it_prof_incr, it_virt_incr;
struct timer_list real_timer;
@@ -348,7 +351,7 @@
/* pidhash */ NULL, NULL, \
/* tarray */ &task[0], \
/* chld wait */ NULL, \
-/* timeout */ 0,SCHED_OTHER,0,0,0,0,0,0,0, \
+/* timeout */ SCHED_OTHER,0,0,0,0,0,0,0, \
/* timer */ { NULL, NULL, 0, 0, it_real_fn }, \
/* utime */ {0,0,0,0},0, \
/* per CPU times */ {0, }, {0, }, \
@@ -457,7 +460,10 @@

extern void FASTCALL(__wake_up(struct wait_queue ** p, unsigned int mode));
extern void FASTCALL(sleep_on(struct wait_queue ** p));
+extern void FASTCALL(sleep_on(struct wait_queue ** p));
extern void FASTCALL(interruptible_sleep_on(struct wait_queue ** p));
+extern long FASTCALL(interruptible_sleep_on_timeout(struct wait_queue ** p,
+ signed long timeout));
extern void FASTCALL(wake_up_process(struct task_struct * tsk));

#define wake_up(x) __wake_up((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE)
Index: linux/arch/i386/kernel/time.c
diff -u linux/arch/i386/kernel/time.c:1.1.1.1 linux/arch/i386/kernel/time.c:1.1.1.1.14.3
--- linux/arch/i386/kernel/time.c:1.1.1.1 Fri Oct 2 19:23:36 1998
+++ linux/arch/i386/kernel/time.c Mon Oct 19 03:33:14 1998
@@ -53,6 +53,13 @@
unsigned long high;
} init_timer_cc, last_timer_cc;

+static void __init_timer_cc(void)
+{
+ __asm__("rdtsc"
+ :"=a" (init_timer_cc.low),
+ "=d" (init_timer_cc.high));
+}
+
static unsigned long do_fast_gettimeoffset(void)
{
register unsigned long eax asm("ax");
@@ -68,12 +75,12 @@
*/
static unsigned long cached_quotient=0;

- tmp = jiffies;
+ tmp = jiffies - JIFFIES_OFFSET;

quotient = cached_quotient;
low_timer = last_timer_cc.low;

- if (last_jiffies != tmp) {
+ if (tmp && last_jiffies != tmp) {
last_jiffies = tmp;

/* Get last timer tick in absolute kernel time */
@@ -110,7 +117,7 @@
}

/* Read the time counter */
- __asm__("rdtsc" : "=a" (eax), "=d" (edx));
+ __asm__("rdtsc" : "=a" (eax) : : "edx");

/* .. relative to previous jiffy (32 bits is enough) */
edx = 0;
@@ -191,7 +198,7 @@
* We do this guaranteed double memory access instead of a _p
* postfix in the previous port access. Wheee, hackady hack
*/
- jiffies_t = jiffies;
+ jiffies_t = jiffies - JIFFIES_OFFSET;

count |= inb_p(0x40) << 8;

@@ -380,6 +387,7 @@
static inline void timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
{
do_timer(regs);
+
/*
* In the SMP case we use the local APIC timer interrupt to do the
* profiling, except when we simulate SMP mode on a uniprocessor
@@ -444,6 +452,14 @@
:"=a" (last_timer_cc.low),
"=d" (last_timer_cc.high));
timer_interrupt(irq, NULL, regs);
+
+ if (!(jiffies - JIFFIES_OFFSET))
+ /*
+ * If jiffies has overflowed in this timer_interrupt we must
+ * update the init_timer_cc to make the do_fast_gettimeoffset()
+ * quotient calc still valid. -arca
+ */
+ __init_timer_cc();
}
#endif

@@ -552,9 +568,7 @@
}

/* read Pentium cycle counter */
- __asm__("rdtsc"
- :"=a" (init_timer_cc.low),
- "=d" (init_timer_cc.high));
+ __init_timer_cc();
irq0.handler = pentium_timer_interrupt;
}
#endif
Index: linux/include/linux/poll.h
diff -u linux/include/linux/poll.h:1.1.1.1 linux/include/linux/poll.h:1.1.1.1.12.2
--- linux/include/linux/poll.h:1.1.1.1 Fri Oct 2 19:22:40 1998
+++ linux/include/linux/poll.h Mon Oct 19 01:44:39 1998
@@ -103,7 +103,7 @@
memset(fdset, 0, nr);
}

-extern int do_select(int n, fd_set_buffer *fds, unsigned long timeout);
+extern int do_select(int n, fd_set_buffer *fds, long *timeout);

#endif /* KERNEL */

A little description in words.

The `->timeout' task_struct field is gone. schedule() will not support a
timeout in interruptible scheduling anymore. This is also true for the
interruptible_sleep_on() thing.

If you want to set a timeout now you have to use something like this:

long timeout = HZ;

current->state = TASK_INTERRUPTIBLE;
do {
timeout = schedule_timeout(timeout);
} while (timeout);

or also:

current->state = TASK_INTERRUPTIBLE;
schedule_timeout(HZ);

You are not forced to check the remaining time returned by the function,
of course. You can, for example, check for pending signals instead.

To simplify the use of schedule_timeout() you can also do:

current->state = TASK_INTERRUPTIBLE;
if (schedule_timeout(0))
panic("your compiler is buggy");

this will be equivalent to the right way:

current->state = TASK_INTERRUPTIBLE;
schedule();

At first I avoided allowing timeout = 0, but while doing the kernel
update I noticed that being able to use 0 would be very useful and
would produce cleaner code.

If you need to set the longest timeout possible (as select() does when
you pass it NULL) you can use the MAX_SCHEDULE_TIMEOUT
#define (in sched.h).

current->state = TASK_INTERRUPTIBLE;
schedule_timeout(MAX_SCHEDULE_TIMEOUT);

With this new schedule_timeout() change the scheduling code of Linux will
automagically be fully jiffies wrap compliant. This will also improve the
performance of schedule() in the no-timeout case. While doing the
schedule_timeout() porting I also noticed that all the code that uses the
timeout thing becomes cleaner.

-----------------------------------------------------------------------

Just as schedule() now has schedule_timeout(), for interruptible_sleep_on()
a new interruptible_sleep_on_timeout() has been produced. This is the
difference:

void interruptible_sleep_on(wait_queue)
{
useful things
schedule()
useful things
}

long interruptible_sleep_on_timeout(wait_queue, long timeout)
{
useful things
timeout = schedule_timeout(timeout)
useful things

return timeout;
}

Basically you used to do:

current->timeout = jiffies + HZ;
interruptible_sleep_on(&wait_q);

and now you can directly do:

interruptible_sleep_on_timeout(&wait_q, HZ);

----------------------------------------------------------------------

With my whole patch applied the core of Linux 2.1.125 should be fully
jiffies wrap compliant. Eventually there could still be some buggy driver
that misuses some timer, but it seems that my kernel configuration is safe
from such mistakes. And tons of code still need the *_timeout()
conversion, since I only updated the code I use ;-).

The first idea of a schedule_timeout() thing was originally thought of by
MOLNAR Ingo, btw.

Andrea Arcangeli

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.rutgers.edu
Please read the FAQ at http://www.tux.org/lkml/