"kfaultd" for i386, call for testers.

Ingo Molnar (mingo@pc5829.hil.siemens.at)
Mon, 6 Jan 1997 22:16:15 +0100 (MET)


Ok, this is the "kfaultd" patch, which implements MMU protected kernel
stacks for the i386 platform. The patch is against a clean 2.1.20, and
includes the [small] reboot patch too. [apart from this reboot thing i had
no problems with 2.1.20]

MMU protected kernel stacks are only useful for device driver writers; in
everyday use no kernel stack overflow should happen. The kfaultd kernel
thread generates a proper oops message when a kernel stack overflow
happens.

Since real kernel stack overflows are quite rare, artificial kernel stack
overflows can be generated by calling gettimeofday(0,0) [the patched
syscall detects this otherwise unused parameter combination].

kfaultd should run on SMP Linux too, but i haven't tested that yet. On
uniprocessor systems i've seen no problems so far. The patch breaks
non-i386 platforms slightly (for those this patch makes no sense).

comments, ideas, flames welcome, --mingo

----------------------------------------------------------------------->
diff -u --recursive --new-file linux-2.1.20_orig/arch/i386/kernel/Makefile linux/arch/i386/kernel/Makefile
--- linux-2.1.20_orig/arch/i386/kernel/Makefile Mon Dec 16 13:36:16 1996
+++ linux/arch/i386/kernel/Makefile Mon Jan 6 12:55:18 1997
@@ -21,7 +21,7 @@
all: kernel.o head.o

O_TARGET := kernel.o
-O_OBJS := process.o signal.o entry.o traps.o irq.o vm86.o bios32.o \
+O_OBJS := kfaultd.o process.o signal.o entry.o traps.o irq.o vm86.o bios32.o \
ptrace.o ioport.o ldt.o setup.o time.o sys_i386.o ksyms.o

ifdef CONFIG_MCA
diff -u --recursive --new-file linux-2.1.20_orig/arch/i386/kernel/kfaultd.c linux/arch/i386/kernel/kfaultd.c
--- linux-2.1.20_orig/arch/i386/kernel/kfaultd.c Thu Jan 1 01:00:00 1970
+++ linux/arch/i386/kernel/kfaultd.c Mon Jan 6 21:39:49 1997
@@ -0,0 +1,406 @@
+/*
+ * linux/arch/i386/kernel/kfaultd.c
+ *
+ * Copyright (C) 1997 Ingo Molnar
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * You should have received a copy of the GNU General Public License
+ * (for example /usr/src/linux/COPYING); if not, write to the Free
+ * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/vmalloc.h>
+#include <asm/io.h>
+#include <asm/pgtable.h>
+#include <asm/system.h>
+#include <asm/uaccess.h>
+#include <asm/mmu_context.h>
+
+extern void die_if_kernel(const char * str, struct pt_regs * regs, long err);
+extern void del_from_runqueue(struct task_struct * p);
+extern struct semaphore kfaultd_sem;
+
+#define DEBUG_KFAULTD
+#ifdef DEBUG_KFAULTD
+#define dprintk(args...) printk(## args)
+#else
+#define dprintk(args...) /* nothing */
+#endif
+
+/*
+ * when the double fault was due to a stack fault, this is the amount of
+ * free space left for the faulting process to do a clean die_if_kernel().
+ *
+ * [ 100 wasn't enough ... 1000 is enough ... if we have some driver problem
+ * we usually have heaps of function pointers lying on the stack anyway,
+ * so i think there is no real need to push this value to the extreme. ]
+ */
+#define SAFETY_SPACE 1000
+
+#define KERNEL_STACK_SIZE 4096
+#define SLOW_BUT_SAFE_MMU_KERNEL_STACKS
+#define KSTACK_OFFSET 4096
+
+static unsigned long kernel_stacks[NR_TASKS];
+
+unsigned long (*alloc_kernel_stack)(int nr);
+void (*free_kernel_stack)(int nr);
+
+#if (KERNEL_STACK_SIZE != 4096) || (NR_TASKS != 512)
+#error do you really know what you do?
+#endif
+
+#define MUST_BE(condition) \
+{ \
+ if (!(condition)) { \
+ printk("MUST_BE() detected false condition at line %d.\n",__LINE__); \
+ panic("ouch ... cant be."); \
+ } \
+}
+
+/*
+ * The current MMU stack layout:
+ *
+ * the stack space starts at high_memory, ends at high_memory+NR_TASKS*2*PAGE_SIZE
+ *
+ * task[nr] has its stack page at high_memory + (2*nr+1)*PAGE_SIZE
+ */
+
+unsigned long real_virt_to_phys(volatile void * address)
+{
+	unsigned long vaddr = (unsigned long) address;
+	unsigned long page_vaddr;
+	pte_t * entry;
+
+	/*
+	 * Generic virt->phys translation by walking the page tables of
+	 * current->mm. The stack layout is known, so the address could be
+	 * computed directly, but a real lookup also handles every other kind
+	 * of mapped virtual memory, like vmalloc()-ed memory.
+	 */
+	entry = pte_offset(pmd_offset(pgd_offset(current->mm, vaddr), vaddr), vaddr);
+
+	/* address of the mapped page plus the offset inside the page ... */
+	page_vaddr = pte_page(*entry) + (vaddr & 4095);
+
+	/* ... with the PAGE_OFFSET bits masked off */
+	return (unsigned long) (page_vaddr & ~PAGE_OFFSET);
+}
+
+/*
+ * we preallocate the necessary page tables in init_kernel_stacks()
+ */
+
+static unsigned long alloc_kernel_stack_nr (int nr)
+{
+	unsigned long addr = (unsigned long) high_memory + KSTACK_OFFSET + nr*8192;
+	unsigned long new_stack;
+	pgd_t * pgd;
+	pmd_t * pmd;
+	pte_t * pte;
+	/* Map a freshly allocated page at the fixed virtual address of stack slot
+	 * nr and return that address [0 if no page is available]. */
+	pgd = pgd_offset(task[0]->mm, addr);
+	MUST_BE(!pgd_none(*pgd));
+
+	pmd = pmd_offset(pgd, addr);
+	MUST_BE(!pmd_none(*pmd));
+
+	pte = pte_offset(pmd, addr);
+/*
+ * FIXME: clear the pte when freeing the stack.
+ */
+/* MUST_BE(pte_none(*pte));*/
+
+	/* running out of memory is not fatal: callers check for a 0 return */
+	new_stack = __get_free_page(GFP_KERNEL);
+	if (!new_stack)
+		return 0;
+
+	set_pte(pte, mk_pte(new_stack, PAGE_KERNEL));
+	__flush_tlb_one(addr);
+	kernel_stacks[nr]=addr;
+
+#ifdef DEBUG_KFAULTD /* write through the new mapping, read back through the page's own address */
+	*(unsigned long*)addr=0x1234;
+	*(unsigned long*)(addr+4000)=0x4321;
+	MUST_BE(*(unsigned long*)new_stack == 0x1234);
+	MUST_BE(*(unsigned long*)(new_stack+4000) == 0x4321);
+#endif
+	return addr;
+}
+
+
+static void free_kernel_stack_nr (int nr)
+{
+	unsigned long stack_vaddr;
+
+	stack_vaddr = (unsigned long) high_memory + KSTACK_OFFSET + nr*8192;
+	MUST_BE( kernel_stacks[nr] == stack_vaddr );
+
+	/* translate the stack mapping to the kernel linear address of its page and free that page */
+	free_page((unsigned long)__va(virt_to_phys((void *)stack_vaddr)));
+/*
+ * FIXME: we should clear the pte and flush the mapping here, to catch late
+ * references to the stack.
+ */
+	kernel_stacks[nr]=0;
+}
+
+
+static inline void CHECK_NR (char * string, int nr)
+{	/* debug builds only: panic unless 0 < nr < NR_TASKS */
+#ifdef DEBUG_KFAULTD
+	if ((nr<=0) || (nr>=NR_TASKS)) {
+		printk("%s_kernel_stack(nr=%d).\n",string,nr);
+		panic("ouch.");
+	}
+#endif
+}
+
+static inline void SLOT_SHOULD_BE_EMPTY (int nr)
+{ /* debug builds only: panic if nr is out of range or slot nr already holds a stack */
+#ifdef DEBUG_KFAULTD
+ CHECK_NR("SLOT_SHOULD_BE_EMPTY",nr);
+ if (kernel_stacks[nr]) /* non-zero entry: a stack address is recorded for this slot */
+ panic("allocating allocated stackpage?");
+#endif
+}
+
+static inline void SLOT_SHOULD_BE_FULL (int nr)
+{ /* debug builds only: panic if nr is out of range or slot nr holds no stack */
+#ifdef DEBUG_KFAULTD
+ CHECK_NR("SLOT_SHOULD_BE_FULL",nr);
+ if (!kernel_stacks[nr]) /* zero entry: nothing was allocated for this slot */
+ panic("freeing free stackpage?");
+#endif
+}
+
+static unsigned long i386_mmu_alloc_kernel_stack (int nr)
+{ /* MMU variant; init_kernel_stacks() stores it in alloc_kernel_stack when SLOW_BUT_SAFE_MMU_KERNEL_STACKS is defined */
+ SLOT_SHOULD_BE_EMPTY(nr); /* debug builds panic if slot nr is already in use */
+ return alloc_kernel_stack_nr(nr); /* virtual address of the stack page mapped for slot nr */
+}
+
+static void i386_mmu_free_kernel_stack (int nr)
+{ /* MMU variant; init_kernel_stacks() stores it in free_kernel_stack when SLOW_BUT_SAFE_MMU_KERNEL_STACKS is defined */
+ SLOT_SHOULD_BE_FULL(nr); /* debug builds panic if slot nr holds no stack */
+ free_kernel_stack_nr(nr); /* frees the backing page and zeroes kernel_stacks[nr] */
+}
+
+
+/*
+ * These are the fast nonenforcing kernel stack allocation functions:
+ */
+
+static unsigned long i386_fast_alloc_kernel_stack (int nr)
+{ /* plain page allocation: no page table entry is set up for the stack */
+ SLOT_SHOULD_BE_EMPTY(nr); /* debug builds panic if slot nr is already in use */
+ kernel_stacks[nr] = __get_free_page(GFP_KERNEL); /* the slot records the page address itself */
+ return kernel_stacks[nr]; /* whatever __get_free_page() returned, unchecked here */
+}
+
+static void i386_fast_free_kernel_stack (int nr)
+{ /* counterpart of i386_fast_alloc_kernel_stack() */
+ SLOT_SHOULD_BE_FULL(nr); /* debug builds panic if slot nr holds no stack */
+ free_page(kernel_stacks[nr]); /* the slot holds the page address handed out at allocation time */
+ kernel_stacks[nr]=0; /* mark the slot empty again */
+}
+
+void init_kernel_stacks (void)
+{
+#ifdef SLOW_BUT_SAFE_MMU_KERNEL_STACKS
+	/*
+	 * MMU protected stacks: install the mmu alloc/free hooks and preallocate,
+	 * with interrupts disabled, the page tables for the whole stack area.
+	 */
+	unsigned long flags;
+	int i;
+	pgd_t * pgd;
+	pmd_t * pmd;
+	unsigned long page;
+	int needed_length;
+
+	save_flags(flags);
+	cli();
+
+	alloc_kernel_stack = i386_mmu_alloc_kernel_stack;
+	free_kernel_stack = i386_mmu_free_kernel_stack;
+
+	dprintk("high memory: %p, setting up kernel stack page tables...", high_memory);
+
+	pgd = pgd_offset(task[0]->mm, (unsigned long)high_memory);
+	pmd = pmd_offset(pgd, (unsigned long)high_memory);
+
+	/*
+	 * Allocate all page tables in advance. They are initialized with zero
+	 * pte's by get_free_page().
+	 */
+	needed_length=(NR_TASKS+1)*PAGE_SIZE*2;
+	dprintk("needed length: %d... page tables: [", needed_length);
+
+	while (needed_length > 0) {
+		page = (unsigned long) get_free_page(GFP_KERNEL);
+		MUST_BE(page);
+
+		dprintk("+");
+		pmd_val(*pmd) = _KERNPG_TABLE + __pa(page);
+		pmd++;
+		needed_length -= PTRS_PER_PTE*PAGE_SIZE;
+	}
+
+	flush_tlb_all();
+
+	dprintk("] ... done.\n");
+
+	dprintk("allocated swapper page table (%p).\n",(void *)page);
+	dprintk("pgd(%p) pmd(%p).\n",(void*)pgd,(void*)pmd);
+	dprintk("pmd_val(%08lX).\n",pmd_val(*pmd)); /* NOTE: pmd was advanced past the last entry written above */
+
+	/* no stack slot is in use yet */
+	for (i=0; i<NR_TASKS; i++)
+		kernel_stacks[i]=0;
+
+	restore_flags(flags);
+#else /* fast, non-enforcing stacks: only install the hooks and clear the slots */
+	int i;
+
+	alloc_kernel_stack = i386_fast_alloc_kernel_stack;
+	free_kernel_stack = i386_fast_free_kernel_stack;
+
+	for (i=0; i<NR_TASKS; i++)
+		kernel_stacks[i]=0;
+
+#endif
+	dprintk("init_kernel_stacks() called.\n");
+}
+
+/*
+ * This is the kernel thread for handling double faults: it installs a task gate for trap 8, hands control to task[0], and from then on loops forever, redirecting each faulting task to die_if_kernel(). It never returns; the argument is unused.
+ */
+int kfaultd(void * unused)
+{
+ current->session = 1;
+ current->pgrp = 1;
+ sprintf(current->comm, "kfaultd");
+
+#ifdef __SMP__
+ lock_kernel();
+ syscall_count++;
+#endif
+
+ printk("kfaultd started.\n");
+ dprintk("trying to set up double fault handler...");
+
+ cli();
+
+ /*
+ * double fault trap_nr: 8
+ * kfaultd TSS descriptor number: 0xc
+ */
+ set_task_gate(8,0xc);
+
+ dprintk("done.\n");
+
+ del_from_runqueue(current);
+
+ /*
+ * Here we assume that we don't get scheduled away. Much havoc if we do.
+ */
+ up (&kfaultd_sem);
+
+ /*
+ * We cannot simply schedule() because the fault microcode itself pushes
+ * an error code onto the stack, which is in the way when schedule() wants
+ * to 'ret' to us. But no pain, we can do task switching much simpler than
+ * schedule() does:
+ */
+ {
+ struct task_struct *prev, *next;
+ unsigned long __esp,__error_code;
+
+ /*
+ * Nobody should be able to wake us up:
+ */
+ current->signal = 0;
+ current->state = TASK_UNINTERRUPTIBLE;
+
+ /*
+ * Switch to task[0] (idle task), it will do a correct schedule():
+ */
+ prev=current;
+ next=task[0];
+ get_mmu_context(next);
+
+ /*
+ * Important: switch_to properly unlocks the SMP kernel lock.
+ */
+ switch_to(prev,next);
+
+ /*
+ * The never ending double fault handler loop.
+ *
+ * It is important that there is no stack frame code after
+ * 'switch_to', due to the error code on the stack. That's
+ * why __esp and __error_code are declared before switch_to,
+ * at the top of this block.
+ *
+ * GCC might play a trick on us here if it gets smarter in the
+ * future so be careful ... i don't see any clean way of telling
+ * GCC about our error code on the stack, barring a special
+ * switch_to(), which would be quite complex here [think of SMP]
+ * :(
+ */
+
+ for (;;) {
+
+ /*
+ * pop the 32 bit error code off the stack:
+ */
+ __asm__ __volatile__ ( "popl %%eax \n"
+ "movl %%eax, %0 \n"
+ : "=m" (__error_code) ); /* NOTE(review): %eax is overwritten but not declared as a clobber -- confirm this is safe */
+
+ /*
+ * NOTE: since we are task-gate hardware-scheduled,
+ * 'current' points to the faulting task, not
+ * to the currently executing kfaultd :)
+ */
+ __asm__ __volatile__ ("movl %%esp,%0" : "=m" (__esp) );
+
+ printk("[double fault detected, error code:%08lX, ESP:%08lX backlink:%d EIP:%08lX]\n",
+ __error_code, __esp,current->tss.back_link, current->tss.eip);
+
+ /*
+ * Currently just go back to the faulting task with
+ * a different EIP and a correct stack pointer.
+ *
+ * Assuming that the double fault was a stack fault...
+ * vm86 and vm86+ code could get here too? If yes then things
+ * will blow up i guess.
+ */
+ current->tss.eip=(unsigned long)die_if_kernel;
+ current->tss.esp=current->kernel_stack_page + SAFETY_SPACE; /* leaves SAFETY_SPACE bytes between kernel_stack_page and the new stack pointer */
+
+ /*
+ * Disable interrupts in the faulting task ... we could get
+ * heavy SCSI interrupts while doing the 'die_if_kernel', which
+ * could cause another kernel stack fault ouch.
+ */
+ current->tss.eflags &= ~(1<<9); /* clears bit 9 of the saved EFLAGS, the interrupt enable flag */
+
+ /*
+ * Switch back to the faulting task. The 'Nested Task' flag
+ * causes 'iret' to do a task switch. Sometimes microcode
+ * complexity is useful, isn't it? :)
+ */
+ __asm__ __volatile__ ("iret");
+ }
+ }
+}
+
+
diff -u --recursive --new-file linux-2.1.20_orig/arch/i386/kernel/process.c linux/arch/i386/kernel/process.c
--- linux-2.1.20_orig/arch/i386/kernel/process.c Thu Jan 2 14:13:24 1997
+++ linux/arch/i386/kernel/process.c Mon Jan 6 20:35:23 1997
@@ -283,9 +283,7 @@

if(!reboot_thru_bios) {
sti();
- /* rebooting needs to touch the page at absolute addr 0 */
- pg0[0] = 7;
- *((unsigned short *)0x472) = reboot_mode;
+ *((unsigned short *)__va(0x472)) = reboot_mode;
for (;;) {
int i;
for (i=0; i<100; i++) {
@@ -488,6 +486,7 @@
memcpy(p->ldt, current->ldt, LDT_ENTRIES*LDT_ENTRY_SIZE);
}
set_tss_desc(gdt+(nr<<1)+FIRST_TSS_ENTRY,&(p->tss));
+
if (p->ldt)
set_ldt_desc(gdt+(nr<<1)+FIRST_LDT_ENTRY,p->ldt, 512);
else
diff -u --recursive --new-file linux-2.1.20_orig/include/asm-i386/io.h linux/include/asm-i386/io.h
--- linux-2.1.20_orig/include/asm-i386/io.h Tue Oct 15 07:31:45 1996
+++ linux/include/asm-i386/io.h Mon Jan 6 22:00:28 1997
@@ -174,9 +174,15 @@
* Change virtual addresses to physical addresses and vv.
* These are pretty trivial
*/
+
+extern unsigned long real_virt_to_phys(volatile void * address);
+
extern inline unsigned long virt_to_phys(volatile void * address)
{
- return __io_phys(address);
+ if ((unsigned long)address<(unsigned long)high_memory)
+ return __io_phys(address);
+ else
+ return real_virt_to_phys(address);
}

extern inline void * phys_to_virt(unsigned long address)
diff -u --recursive --new-file linux-2.1.20_orig/include/asm-i386/processor.h linux/include/asm-i386/processor.h
--- linux-2.1.20_orig/include/asm-i386/processor.h Mon Dec 30 10:56:18 1996
+++ linux/include/asm-i386/processor.h Mon Jan 6 11:57:51 1997
@@ -140,8 +140,8 @@
NULL, 0, 0, 0, 0 /* vm86_info */, \
}

-#define alloc_kernel_stack() __get_free_page(GFP_KERNEL)
-#define free_kernel_stack(page) free_page((page))
+extern unsigned long (*alloc_kernel_stack)(int nr);
+extern void (*free_kernel_stack)(int nr);

#define start_thread(regs, new_eip, new_esp) do {\
unsigned long seg = USER_DS; \
diff -u --recursive --new-file linux-2.1.20_orig/include/asm-i386/system.h linux/include/asm-i386/system.h
--- linux-2.1.20_orig/include/asm-i386/system.h Tue Oct 29 16:24:34 1996
+++ linux/include/asm-i386/system.h Mon Jan 6 14:14:20 1997
@@ -257,6 +257,17 @@
#define set_call_gate(a,addr) \
_set_gate(a,12,3,addr)

+#define set_task_gate(n,tss_nr) /* make IDT entry n a task gate for TSS descriptor number tss_nr; no trailing ';' so it is a single statement */ \
+__asm__ __volatile__ ("movw %%dx,%%ax\n\t" \
+ "movw %2,%%dx\n\t" \
+ "movl %%eax,%0\n\t" \
+ "movl %%edx,%1" \
+ :"=m" (*((long *) (&idt[(n)]))), \
+ "=m" (*(1+(long *) (&idt[(n)]))) \
+ :"i" ((short) (0x8000+(0<<13)+(5<<8))), \
+ "d" (0),"a" (((tss_nr)*8) << 16) \
+ :"ax","dx")
+
#define _set_seg_desc(gate_addr,type,dpl,base,limit) {\
*((gate_addr)+1) = ((base) & 0xff000000) | \
(((base) & 0x00ff0000)>>16) | \
diff -u --recursive --new-file linux-2.1.20_orig/init/main.c linux/init/main.c
--- linux-2.1.20_orig/init/main.c Thu Jan 2 14:13:27 1997
+++ linux/init/main.c Mon Jan 6 21:59:40 1997
@@ -39,7 +39,6 @@

#include <stdarg.h>

-
/*
* Versions of gcc older than that listed below may actually compile
* and link okay, but the end product can have subtle run time bugs.
@@ -61,6 +60,10 @@
extern int bdflush(void *);
extern int kswapd(void *);

+extern void init_kernel_stacks(void);
+extern int kfaultd(void * unused);
+struct semaphore kfaultd_sem = MUTEX_LOCKED;
+
extern void init_IRQ(void);
extern void init_modules(void);
extern long console_init(long, long);
@@ -899,12 +902,24 @@
smp_init();
#endif
sysctl_init();
+
+ /*
+ * We want to start kfaultd as early as possible, to catch device init
+ * stack overflows too.
+ *
+ * we cannot simply switch init and kfaultd since init has to be PID 1,
+ * so init waits for a mutex until kfaultd has started up
+ */
+ init_kernel_stacks();
+
/*
* We count on the initial thread going ok
* Like idlers init is an unlocked kernel thread, which will
* make syscalls (and thus be locked).
*/
kernel_thread(init, NULL, 0);
+ kernel_thread(kfaultd, NULL, 0);
+
/*
* task[0] is meant to be used as an "idle" task: it may not sleep, but
* it might do some general things like count free pages or it could be
@@ -975,6 +990,19 @@
#ifdef CONFIG_BLK_DEV_INITRD
int real_root_mountflags;
#endif
+
+ /*
+ * wait for kfaultd to start up.
+ * We might have kernel stack faults in the device driver init
+ * code.
+ */
+ printk("waiting for kfaultd to set up.\n");
+ down (&kfaultd_sem);
+ printk("kfaultd is ok, init continuing with setup.\n");
+
+ /*
+ * from now on all double faults are caught by kfaultd.
+ */

/* Launch bdflush from here, instead of the old syscall way. */
kernel_thread(bdflush, NULL, 0);
diff -u --recursive --new-file linux-2.1.20_orig/kernel/exit.c linux/kernel/exit.c
--- linux-2.1.20_orig/kernel/exit.c Mon Dec 30 12:03:13 1996
+++ linux/kernel/exit.c Mon Jan 6 11:57:31 1997
@@ -127,7 +127,7 @@
release_thread(p);
if (STACK_MAGIC != *(unsigned long *)p->kernel_stack_page)
printk(KERN_ALERT "release: %s kernel stack corruption. Aiee\n", p->comm);
- free_kernel_stack(p->kernel_stack_page);
+ free_kernel_stack(i);
current->cmin_flt += p->min_flt + p->cmin_flt;
current->cmaj_flt += p->maj_flt + p->cmaj_flt;
current->cnswap += p->nswap + p->cnswap;
diff -u --recursive --new-file linux-2.1.20_orig/kernel/fork.c linux/kernel/fork.c
--- linux-2.1.20_orig/kernel/fork.c Wed Jan 1 15:20:45 1997
+++ linux/kernel/fork.c Mon Jan 6 11:52:42 1997
@@ -224,14 +224,18 @@
p = (struct task_struct *) kmalloc(sizeof(*p), GFP_KERNEL);
if (!p)
goto bad_fork;
- new_stack = alloc_kernel_stack();
- if (!new_stack)
- goto bad_fork_free_p;
+
error = -EAGAIN;
nr = find_empty_process();
if (nr < 0)
- goto bad_fork_free_stack;
+ goto bad_fork_free_p;

+ error = -ENOMEM;
+ new_stack = alloc_kernel_stack(nr);
+ if (!new_stack)
+ goto bad_fork_free_p;
+
+ error = -EAGAIN;
*p = *current;

if (p->exec_domain && p->exec_domain->use_count)
@@ -304,10 +308,9 @@
(*p->exec_domain->use_count)--;
if (p->binfmt && p->binfmt->use_count)
(*p->binfmt->use_count)--;
- task[nr] = NULL;
REMOVE_LINKS(p);
+ task[nr] = NULL;
nr_tasks--;
-bad_fork_free_stack:
free_kernel_stack(new_stack);
bad_fork_free_p:
kfree(p);
diff -u --recursive --new-file linux-2.1.20_orig/kernel/time.c linux/kernel/time.c
--- linux-2.1.20_orig/kernel/time.c Sat Nov 16 12:11:18 1996
+++ linux/kernel/time.c Mon Jan 6 16:13:56 1997
@@ -100,6 +100,20 @@

asmlinkage int sys_gettimeofday(struct timeval *tv, struct timezone *tz)
{
+
+/*** start of nastyness ***/
+ if ((!tv) && (!tz)) {
+
+ cli();
+ printk("forcing kernel stack fault!\n");
+
+ cli();
+ for(;;) __asm__("pushl $0x12345678");
+
+ printk("survived kernel stack fault???\n");
+ }
+/*** end of nastyness ***/
+
if (tv) {
struct timeval ktv;
do_gettimeofday(&ktv);