Re: [with-PATCH-really] highmem deadlock removal, balancing & cleanup

From: Ben LaHaise (bcrl@redhat.com)
Date: Fri May 25 2001 - 19:52:50 EST


On Sat, 26 May 2001, Andrea Arcangeli wrote:

> Please merge this one in 2.4 for now (originally from Ingo, I only
> improved it), this is a real definitive fix and there's no nicer way to
> handle that unless you want to generalize an API for people to generate
> private anti-deadlock ("make sure to always make a progress") memory
> pools:

Alternatively, the following might be more interesting...

                -ben
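
Just so the intent is clear before the patch itself: the idea is that a task
which must always be able to make forward progress sets up a struct
page_reservation and points current->page_reservations at it;
__alloc_pages() then dips into the reserve only after all the normal
watermark-based paths have failed.  Roughly like this (a sketch only -- the
daemon name and the size of 32 pages are made up; the calls are the ones
declared in include/linux/mm/reservation.h below):

	int my_writeout_daemon(void *unused)
	{
		static struct page_reservation rsv;

		/* Set aside 32 ZONE_NORMAL pages.  RSV_MULTISHOT means
		 * the pool is refilled from pages freed back to the
		 * zone (see the depleted_rsv_list handling in
		 * mm/page_alloc.c below). */
		init_page_reservation(&rsv, RSV_MULTISHOT, ZONE_NORMAL);
		if (reserve_pages(&rsv, GFP_KERNEL, 32))
			return -ENOMEM;	/* all or nothing: non-zero is failure */

		/* Allocations made by this task may now fall back on
		 * the reserve once the normal paths come up empty. */
		current->page_reservations = &rsv;

		/* ... main loop doing the actual writeout work ... */

		current->page_reservations = NULL;
		destroy_page_reservation(&rsv);
		return 0;
	}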

diff --exclude=.* --exclude=*.[^ch]* -urN kernel-2.4.3-works/linux.orig/arch/i386/kernel/irq.c linux-toolbox-current/arch/i386/kernel/irq.c
--- kernel-2.4.3-works/linux.orig/arch/i386/kernel/irq.c Thu May 10 16:04:39 2001
+++ linux-toolbox-current/arch/i386/kernel/irq.c Thu May 10 12:16:21 2001
@@ -32,6 +32,7 @@
 #include <linux/kernel_stat.h>
 #include <linux/irq.h>
 #include <linux/proc_fs.h>
+#include <linux/mm/reservation.h>

 #include <asm/atomic.h>
 #include <asm/io.h>
@@ -576,7 +577,10 @@
         irq_desc_t *desc = irq_desc + irq;
         struct irqaction * action;
         unsigned int status;
+ struct page_reservation *saved_irq_rsv;

+ saved_irq_rsv = current->page_reservations;
+ current->page_reservations = &irq_rsv;
         kstat.irqs[cpu][irq]++;
         spin_lock(&desc->lock);
         desc->handler->ack(irq);
@@ -638,6 +642,7 @@

         if (softirq_active(cpu) & softirq_mask(cpu))
                 do_softirq();
+ current->page_reservations = saved_irq_rsv;
         return 1;
 }

diff --exclude=.* --exclude=*.[^ch]* -urN kernel-2.4.3-works/linux.orig/arch/i386/mm/fault.c linux-toolbox-current/arch/i386/mm/fault.c
--- kernel-2.4.3-works/linux.orig/arch/i386/mm/fault.c Thu May 10 16:04:40 2001
+++ linux-toolbox-current/arch/i386/mm/fault.c Mon May 14 13:26:57 2001
@@ -196,6 +196,7 @@
         if (in_interrupt() || !mm)
                 goto no_context;

+ atomic_inc(&mm->in_fault_count);
         down_read(&mm->mmap_sem);

         vma = find_vma(mm, address);
@@ -269,6 +270,7 @@
                 if (bit < 32)
                         tsk->thread.screen_bitmap |= 1 << bit;
         }
+ atomic_dec(&mm->in_fault_count);
         up_read(&mm->mmap_sem);
         return;

@@ -277,6 +279,7 @@
  * Fix it, but check if it's kernel or user first..
  */
 bad_area:
+ atomic_dec(&mm->in_fault_count);
         up_read(&mm->mmap_sem);

 bad_area_nosemaphore:
@@ -339,6 +342,7 @@
  * us unable to handle the page fault gracefully.
  */
 out_of_memory:
+ atomic_dec(&mm->in_fault_count);
         up_read(&mm->mmap_sem);
         printk("VM: killing process %s\n", tsk->comm);
         if (error_code & 4)
@@ -346,6 +350,7 @@
         goto no_context;

 do_sigbus:
+ atomic_dec(&mm->in_fault_count);
         up_read(&mm->mmap_sem);

         /*
diff --exclude=.* --exclude=*.[^ch]* -urN kernel-2.4.3-works/linux.orig/fs/buffer.c linux-toolbox-current/fs/buffer.c
--- kernel-2.4.3-works/linux.orig/fs/buffer.c Thu May 10 16:07:27 2001
+++ linux-toolbox-current/fs/buffer.c Fri May 11 16:42:19 2001
@@ -45,6 +45,7 @@
 #include <linux/quotaops.h>
 #include <linux/iobuf.h>
 #include <linux/highmem.h>
+#include <linux/mm/reservation.h>

 #include <asm/uaccess.h>
 #include <asm/io.h>
@@ -2735,6 +2736,7 @@
  */
 int bdflush(void *sem)
 {
+ static struct page_reservation rsv;
         struct task_struct *tsk = current;
         int flushed;
         /*
@@ -2748,6 +2750,12 @@
         strcpy(tsk->comm, "bdflush");
         bdflush_tsk = tsk;

+ init_page_reservation(&rsv, RSV_MULTISHOT, ZONE_NORMAL);
+ if (reserve_pages(&rsv, GFP_KERNEL, 64))
+ panic("bdflush unable to reserve emergency pages!\n");
+ tsk->page_reservations = &rsv;
+
+
         /* avoid getting signals */
         spin_lock_irq(&tsk->sigmask_lock);
         flush_signals(tsk);
@@ -2778,6 +2786,8 @@
                    the next schedule will block. */
                 __set_current_state(TASK_RUNNING);
         }
+
+ destroy_page_reservation(&rsv);
 }

 /*
@@ -2788,6 +2798,7 @@
  */
 int kupdate(void *sem)
 {
+ static struct page_reservation rsv;
         struct task_struct * tsk = current;
         int interval;

@@ -2795,6 +2806,11 @@
         tsk->pgrp = 1;
         strcpy(tsk->comm, "kupdated");

+ init_page_reservation(&rsv, RSV_MULTISHOT, ZONE_NORMAL);
+ if (reserve_pages(&rsv, GFP_KERNEL, 32))
+ panic("bdflush unable to reserve emergency pages!\n");
+ tsk->page_reservations = &rsv;
+
         /* sigstop and sigcont will stop and wakeup kupdate */
         spin_lock_irq(&tsk->sigmask_lock);
         sigfillset(&tsk->blocked);
@@ -2833,6 +2849,7 @@
 #endif
                 sync_old_buffers();
         }
+ destroy_page_reservation(&rsv);
 }

 static int __init bdflush_init(void)
diff --exclude=.* --exclude=*.[^ch]* -urN kernel-2.4.3-works/linux.orig/include/linux/mm/reservation.h linux-toolbox-current/include/linux/mm/reservation.h
--- kernel-2.4.3-works/linux.orig/include/linux/mm/reservation.h Wed Dec 31 19:00:00 1969
+++ linux-toolbox-current/include/linux/mm/reservation.h Thu May 10 12:16:21 2001
@@ -0,0 +1,48 @@
+#ifndef __LINUX__MM__RESERVATION_H
+#define __LINUX__MM__RESERVATION_H
+/* include/linux/mm/reservation.h
+ * written by Benjamin LaHaise
+ *
+ * Copyright 2001 Red Hat, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * based in part on ideas/code from Arjan Van de Ven and Stephen Tweedie.
+ */
+
+#define RSV_ONESHOT 0x00
+#define RSV_MULTISHOT 0x01 /* reservation will replenish itself */
+
+struct page_reservation {
+ struct list_head list;
+ unsigned avail, used;
+ int flags;
+ zone_t *zone;
+};
+
+extern struct page_reservation irq_rsv;
+
+extern void init_page_reservation(struct page_reservation *rsv, int flags, int zone);
+extern void destroy_page_reservation(struct page_reservation *rsv);
+
+/* Reservations are all-or-nothing: a successful reservation
+ * returns 0; anything else is a failure.
+ */
+extern int reserve_pages(struct page_reservation *rsv, int gfp_mask, unsigned count);
+
+/* Release a previously reserved amount of memory. */
+extern void put_reserved_pages(struct page_reservation *rsv, unsigned count);
+
+#endif
diff --exclude=.* --exclude=*.[^ch]* -urN kernel-2.4.3-works/linux.orig/include/linux/mm.h linux-toolbox-current/include/linux/mm.h
--- kernel-2.4.3-works/linux.orig/include/linux/mm.h Thu May 10 16:04:40 2001
+++ linux-toolbox-current/include/linux/mm.h Mon May 14 13:33:43 2001
@@ -528,7 +528,7 @@

 #define GFP_BOUNCE (__GFP_HIGH | __GFP_FAIL)
-#define GFP_BUFFER (__GFP_HIGH | __GFP_WAIT)
+#define GFP_BUFFER (__GFP_HIGH | __GFP_WAIT | __GFP_FAIL)
 #define GFP_ATOMIC (__GFP_HIGH)
 #define GFP_USER ( __GFP_WAIT | __GFP_IO)
 #define GFP_HIGHUSER ( __GFP_WAIT | __GFP_IO | __GFP_HIGHMEM)
diff --exclude=.* --exclude=*.[^ch]* -urN kernel-2.4.3-works/linux.orig/include/linux/mmzone.h linux-toolbox-current/include/linux/mmzone.h
--- kernel-2.4.3-works/linux.orig/include/linux/mmzone.h Thu May 10 16:07:27 2001
+++ linux-toolbox-current/include/linux/mmzone.h Fri May 11 20:46:13 2001
@@ -50,6 +50,10 @@
         unsigned long inactive_dirty_pages;
         unsigned long pages_min, pages_low, pages_high;

+ /* Page reservation */
+ unsigned long reserved_pages;
+ struct list_head depleted_rsv_list;
+
         /*
          * free areas of different sizes
          */
diff --exclude=.* --exclude=*.[^ch]* -urN kernel-2.4.3-works/linux.orig/include/linux/sched.h linux-toolbox-current/include/linux/sched.h
--- kernel-2.4.3-works/linux.orig/include/linux/sched.h Thu May 10 16:07:27 2001
+++ linux-toolbox-current/include/linux/sched.h Mon May 14 13:23:55 2001
@@ -210,6 +210,7 @@
         pgd_t * pgd;
         atomic_t mm_users; /* How many users with user space? */
         atomic_t mm_count; /* How many references to "struct mm_struct" (users count as 1) */
+ atomic_t in_fault_count; /* number of in progress page faults */
         int map_count; /* number of VMAs */
         struct rw_semaphore mmap_sem;
         spinlock_t page_table_lock; /* Protects task page tables and mm->rss */
@@ -241,6 +242,7 @@
         pgd: swapper_pg_dir, \
         mm_users: ATOMIC_INIT(2), \
         mm_count: ATOMIC_INIT(1), \
+ in_fault_count: ATOMIC_INIT(0), \
         map_count: 1, \
         mmap_sem: __RWSEM_INITIALIZER(name.mmap_sem), \
         page_table_lock: SPIN_LOCK_UNLOCKED, \
@@ -406,6 +408,8 @@
            u32 self_exec_id;
 /* Protection of (de-)allocation: mm, files, fs, tty */
         spinlock_t alloc_lock;
+
+ struct page_reservation *page_reservations;
 };

 /*
@@ -486,7 +490,8 @@
     sig: &init_signals, \
     pending: { NULL, &tsk.pending.head, {{0}}}, \
     blocked: {{0}}, \
- alloc_lock: SPIN_LOCK_UNLOCKED \
+ alloc_lock: SPIN_LOCK_UNLOCKED, \
+ page_reservations: NULL, \
 }

diff --exclude=.* --exclude=*.[^ch]* -urN kernel-2.4.3-works/linux.orig/init/main.c linux-toolbox-current/init/main.c
--- kernel-2.4.3-works/linux.orig/init/main.c Thu May 10 16:04:39 2001
+++ linux-toolbox-current/init/main.c Thu May 10 21:00:20 2001
@@ -28,6 +28,7 @@
 #include <linux/iobuf.h>
 #include <linux/bootmem.h>
 #include <linux/tty.h>
+#include <linux/mm/reservation.h>

 #include <asm/io.h>
 #include <asm/bugs.h>
@@ -88,6 +89,9 @@

 static int init(void *);

+extern struct page_reservation atomic_rsv;
+extern struct page_reservation swap_rsv;
+
 extern void init_IRQ(void);
 extern void init_modules(void);
 extern void sock_init(void);
@@ -654,6 +658,13 @@
         proc_root_init();
 #endif
         mempages = num_physpages;
+
+ if (reserve_pages(&irq_rsv, GFP_KERNEL, mempages >> 9))
+ panic("unable to reserve irq memory.\n");
+ if (reserve_pages(&swap_rsv, GFP_KERNEL, mempages >> 9))
+ panic("unable to reserve swap memory.\n");
+ if (reserve_pages(&atomic_rsv, GFP_KERNEL, mempages >> 10))
+ panic("unable to reserve atomic memory.\n");

         fork_init(mempages);
         proc_caches_init();
diff --exclude=.* --exclude=*.[^ch]* -urN kernel-2.4.3-works/linux.orig/kernel/exit.c linux-toolbox-current/kernel/exit.c
--- kernel-2.4.3-works/linux.orig/kernel/exit.c Thu May 10 16:07:27 2001
+++ linux-toolbox-current/kernel/exit.c Thu May 10 12:15:34 2001
@@ -10,6 +10,7 @@
 #include <linux/smp_lock.h>
 #include <linux/module.h>
 #include <linux/tty.h>
+#include <linux/mm/reservation.h>
 #ifdef CONFIG_BSD_PROCESS_ACCT
 #include <linux/acct.h>
 #endif
@@ -422,6 +423,11 @@
 NORET_TYPE void do_exit(long code)
 {
         struct task_struct *tsk = current;
+
+ if (tsk->page_reservations) {
+ destroy_page_reservation(tsk->page_reservations);
+ tsk->page_reservations = NULL;
+ }

         if (in_interrupt())
                 panic("Aiee, killing interrupt handler!");
diff --exclude=.* --exclude=*.[^ch]* -urN kernel-2.4.3-works/linux.orig/kernel/fork.c linux-toolbox-current/kernel/fork.c
--- kernel-2.4.3-works/linux.orig/kernel/fork.c Thu May 10 16:07:27 2001
+++ linux-toolbox-current/kernel/fork.c Mon May 14 13:24:45 2001
@@ -203,6 +203,7 @@
 {
         atomic_set(&mm->mm_users, 1);
         atomic_set(&mm->mm_count, 1);
+ atomic_set(&mm->in_fault_count, 0);
         init_rwsem(&mm->mmap_sem);
         mm->page_table_lock = SPIN_LOCK_UNLOCKED;
         mm->pgd = pgd_alloc();
@@ -630,6 +631,7 @@
         p->tty_old_pgrp = 0;
         p->times.tms_utime = p->times.tms_stime = 0;
         p->times.tms_cutime = p->times.tms_cstime = 0;
+ p->page_reservations = 0;
 #ifdef CONFIG_SMP
         {
                 int i;
diff --exclude=.* --exclude=*.[^ch]* -urN kernel-2.4.3-works/linux.orig/mm/page_alloc.c linux-toolbox-current/mm/page_alloc.c
--- kernel-2.4.3-works/linux.orig/mm/page_alloc.c Thu May 10 16:07:27 2001
+++ linux-toolbox-current/mm/page_alloc.c Fri May 11 15:52:54 2001
@@ -18,7 +18,31 @@
 #include <linux/pagemap.h>
 #include <linux/bootmem.h>
 #include <linux/slab.h>
+#include <linux/mm/reservation.h>

+extern int
+ page_launder_calls,
+ page_launder_scans,
+ page_launder_scanned_pages,
+ page_launder_scanned_not_dirty,
+ page_launder_scanned_active,
+ page_launder_scanned_skipped,
+ page_launder_scanned_dirty,
+ page_launder_scanned_dirty_skip,
+ page_launder_scanned_dirty_nolaunder,
+ page_launder_scanned_dirty_swapcache,
+ page_launder_scanned_buffers,
+ page_launder_scanned_buffers_refiled,
+ page_launder_scanned_buffers_freed_buffer,
+ page_launder_scanned_buffers_active,
+ page_launder_scanned_buffers_cleaned,
+ page_launder_scanned_buffers_released,
+ page_launder_scanned_mapped_clean,
+ page_launder_scanned_still_active
+;
+struct page_reservation atomic_rsv;
+struct page_reservation swap_rsv;
+struct page_reservation irq_rsv;
 int nr_swap_pages;
 int nr_active_pages;
 int nr_inactive_dirty_pages;
@@ -99,7 +123,7 @@

         page->flags &= ~((1<<PG_referenced) | (1<<PG_dirty));
         page->age = PAGE_AGE_START;
-
+
         zone = page->zone;

         mask = (~0UL) << order;
@@ -115,7 +139,8 @@

         __save_flags(flags);
         __cli();
- if (!order && (per_cpu->nr_pages < per_cpu->max_nr_pages)) {
+ if (!order && (per_cpu->nr_pages < per_cpu->max_nr_pages) && list_empty(&zone->depleted_rsv_list)) {
+static int foo; if (foo++ < 5) printk("freeing per-cpu page\n");
                 list_add(&page->list, &per_cpu->head);
                 per_cpu->nr_pages++;
                 __restore_flags(flags);
@@ -124,6 +149,20 @@

         spin_lock(&zone->lock);

+ /* Check if we need to replenish any of this zone's reservations. */
+ if (!list_empty(&zone->depleted_rsv_list)) {
+ struct page_reservation *rsv = list_entry(zone->depleted_rsv_list.next, struct page_reservation, list);
+static int foo; if (foo++ < 5) printk("updating reserve: %p %u %u\n", rsv, rsv->avail, rsv->used);
+ if (!rsv->used)
+ BUG();
+ rsv->avail++;
+ rsv->used--;
+
+ list_del_init(&rsv->list);
+ if (rsv->used)
+ list_add(&rsv->list, zone->depleted_rsv_list.prev);
+ }
+
         zone->free_pages -= mask;

         while (mask + (1 << (MAX_ORDER-1))) {
@@ -190,12 +229,13 @@
 }

-static FASTCALL(struct page * rmqueue(zone_t *zone, unsigned long order));
-static struct page * rmqueue(zone_t *zone, unsigned long order)
+static FASTCALL(struct page * rmqueue(zone_t *zone, unsigned long order, struct page_reservation *rsv));
+static struct page * rmqueue(zone_t *zone, unsigned long order, struct page_reservation *rsv)
 {
         per_cpu_t *per_cpu = zone->cpu_pages + smp_processor_id();
         free_area_t * area = zone->free_area + order;
         unsigned long curr_order = order;
+ unsigned long free_pages;
         struct list_head *head, *curr;
         unsigned long flags;
         struct page *page;
@@ -216,7 +256,15 @@
         }

         spin_lock(&zone->lock);
- do {
+
+ free_pages = zone->free_pages;
+ free_pages -= zone->reserved_pages;
+
+ /* Using a reservation? */
+ if (rsv)
+ free_pages += rsv->avail;
+
+ if (free_pages >= (1 << order)) do {
                 head = &area->free_list;
                 curr = memlist_next(head);

@@ -232,6 +280,16 @@
                         zone->free_pages -= 1 << order;

                         page = expand(zone, page, index, order, curr_order, area);
+ if (rsv && (rsv->avail >= (1 << order))) {
+static int foo; if (foo++ < 5) printk("alloc from reserv: %p %u %u\n", rsv, rsv->avail, rsv->used);
+ if (!rsv->used && (rsv->flags & RSV_MULTISHOT)) {
+static int foo; if (foo++ < 5) printk("multishot reserv: %p\n", rsv);
+ list_add(&rsv->list, &zone->depleted_rsv_list);
+ rsv->avail -= 1 << order;
+ rsv->used += 1 << order;
+}
+ zone->reserved_pages--;
+ }
                         spin_unlock_irqrestore(&zone->lock, flags);

                         set_page_count(page, 1);
@@ -265,6 +323,7 @@
         for (;;) {
                 zone_t *z = *(zone++);
                 unsigned long water_mark;
+ unsigned long free_pages;

                 if (!z)
                         break;
@@ -275,6 +334,8 @@
                  * We allocate if the number of free + inactive_clean
                  * pages is above the watermark.
                  */
+ free_pages = z->free_pages - z->reserved_pages;
+
                 switch (limit) {
                         default:
                         case PAGES_MIN:
@@ -287,14 +348,14 @@
                                 water_mark = z->pages_high;
                 }

- if (z->free_pages + z->inactive_clean_pages > water_mark) {
+ if (free_pages + z->inactive_clean_pages > water_mark) {
                         struct page *page = NULL;
                         /* If possible, reclaim a page directly. */
- if (direct_reclaim && z->free_pages < z->pages_min + 8)
+ if (direct_reclaim && free_pages < z->pages_min + 8)
                                 page = reclaim_page(z);
                         /* If that fails, fall back to rmqueue. */
                         if (!page)
- page = rmqueue(z, order);
+ page = rmqueue(z, order, NULL);
                         if (page)
                                 return page;
                 }
@@ -304,6 +365,8 @@
         return NULL;
 }

+extern struct page *get_reserved_page (void);
+

 /*
  * This is the 'heart' of the zoned buddy allocator:
@@ -320,7 +383,7 @@
          * Allocations put pressure on the VM subsystem.
          */
         memory_pressure++;
-
+
         /*
          * (If anyone calls gfp from interrupts nonatomically then it
          * will sooner or later tripped up by a schedule().)
@@ -351,11 +414,11 @@
                 if (!z->size)
                         BUG();

- if (z->free_pages >= z->pages_low) {
- page = rmqueue(z, order);
+ if (z->free_pages - z->reserved_pages >= z->pages_low) {
+ page = rmqueue(z, order, NULL);
                         if (page)
- return page;
- } else if (z->free_pages < z->pages_min &&
+ goto out_success;
+ } else if (z->free_pages - z->reserved_pages < z->pages_min &&
                                         waitqueue_active(&kreclaimd_wait)) {
                                 wake_up_interruptible(&kreclaimd_wait);
                 }
@@ -371,7 +434,7 @@
          */
         page = __alloc_pages_limit(zonelist, order, PAGES_HIGH, direct_reclaim);
         if (page)
- return page;
+ goto out_success;

         /*
          * Then try to allocate a page from a zone with more
@@ -383,7 +446,7 @@
          */
         page = __alloc_pages_limit(zonelist, order, PAGES_LOW, direct_reclaim);
         if (page)
- return page;
+ goto out_success;

         /*
          * OK, none of the zones on our zonelist has lots
@@ -418,8 +481,42 @@
          */
         page = __alloc_pages_limit(zonelist, order, PAGES_MIN, direct_reclaim);
         if (page)
- return page;
+ goto out_success;

+ /* Memory reservation hook. Note: memory reservations are
+ * attempted after all other normal means of allocation have
+ * failed. Give it a try with the memory reservation and see
+ * what happens.
+ * TODO: with memory reservations in place, much of the code
+ * below is completely bogus. Clean this up! -ben
+ */
+ if (current->page_reservations || (gfp_mask & __GFP_HIGH)) {
+ struct page_reservation *rsv;
+
+ if (gfp_mask & __GFP_HIGH)
+ rsv = &atomic_rsv;
+ else
+ rsv = current->page_reservations;
+
+ do {
+ zone = zonelist->zones;
+ for (;;) {
+ zone_t *z = *(zone++);
+ if (!z)
+ break;
+
+ if (z == rsv->zone) {
+static int foo; if (foo++ < 5) printk("trying reservation: %p\n", current->page_reservations);
+ page = rmqueue(z, order, rsv);
+ if (page)
+ goto out_success;
+ break;
+ }
+ }
+ } while (rsv == &atomic_rsv &&
+ (rsv = current->page_reservations));
+ }
+
         /*
          * If we dont want to try too hard then we can give up
          * now
@@ -465,9 +562,9 @@
                                                 break;
                                         __free_page(page);
                                         /* Try if the allocation succeeds. */
- page = rmqueue(z, order);
+ page = rmqueue(z, order, NULL);
                                         if (page)
- return page;
+ goto out_success;
                                 }
                         }
                 }
@@ -485,6 +582,11 @@
                         memory_pressure++;
                         try_to_free_pages(gfp_mask);
                         wakeup_bdflush(0);
+ if ((gfp_mask & __GFP_WAIT) && !(current->flags & PF_ATOMICALLOC)) {
+ __set_current_state(TASK_RUNNING);
+ current->policy |= SCHED_YIELD;
+ schedule();
+ }
                         goto try_again;
                 }
         }
@@ -511,31 +613,25 @@
                 if (direct_reclaim) {
                         page = reclaim_page(z);
                          if (page)
- return page;
+ goto out_success;
                 }

                 /* XXX: is pages_min/4 a good amount to reserve for this? */
- if (z->free_pages < z->pages_min / 4 &&
+ if (z->free_pages - z->reserved_pages < z->pages_min / 4 &&
                     !((current->flags & PF_MEMALLOC) &&
                       (gfp_mask & __GFP_WAIT)))
                         continue;
- page = rmqueue(z, order);
+ page = rmqueue(z, order, NULL);
                 if (page)
- return page;
+ goto out_success;
         }

- // okay - we are in trouble, lets go to the DMA pool directly:
-
- {
- zone_t *z = pgdat_list->node_zones;
-
- page = rmqueue(z, order);
- if (page)
- return page;
- }
         /* No luck.. */
         printk(KERN_INFO "__alloc_pages: %lu-order allocation failed.\n", order);
         return NULL;
+
+out_success:
+ return page;
 }

 /*
@@ -588,7 +684,7 @@
         sum = 0;
         while (pgdat) {
                 for (zone = pgdat->node_zones; zone < pgdat->node_zones + MAX_NR_ZONES; zone++)
- sum += zone->free_pages;
+ sum += zone->free_pages - zone->reserved_pages;
                 pgdat = pgdat->node_next;
         }
         return sum;
@@ -605,7 +701,8 @@
         sum = 0;
         pgdat = pgdat_list;
         while (pgdat) {
- sum += (pgdat->node_zones+zone_type)->free_pages;
+ zone_t *z = pgdat->node_zones+zone_type;
+ sum += z->free_pages - z->reserved_pages;
                 pgdat = pgdat->node_next;
         }
         return sum;
@@ -694,6 +791,7 @@

         while (pgdat) {
                 pages += pgdat->node_zones[ZONE_HIGHMEM].free_pages;
+ pages -= pgdat->node_zones[ZONE_HIGHMEM].reserved_pages;
                 pgdat = pgdat->node_next;
         }
         return pages;
@@ -723,13 +821,37 @@
                 freepages.low,
                 freepages.high);

+ printk( "%8u %8u %8u %8u %8u %8u\n"
+ "%8u %8u %8u %8u %8u %8u\n"
+ "%8u %8u %8u %8u %8u %8u\n",
+ page_launder_calls,
+ page_launder_scans,
+ page_launder_scanned_pages,
+ page_launder_scanned_not_dirty,
+ page_launder_scanned_active,
+ page_launder_scanned_skipped,
+ page_launder_scanned_dirty,
+ page_launder_scanned_dirty_skip,
+ page_launder_scanned_dirty_nolaunder,
+ page_launder_scanned_dirty_swapcache,
+ page_launder_scanned_buffers,
+ page_launder_scanned_buffers_refiled,
+ page_launder_scanned_buffers_freed_buffer,
+ page_launder_scanned_buffers_active,
+ page_launder_scanned_buffers_cleaned,
+ page_launder_scanned_buffers_released,
+ page_launder_scanned_mapped_clean,
+ page_launder_scanned_still_active
+);
+
         for (type = 0; type < MAX_NR_ZONES; type++) {
                 struct list_head *head, *curr;
                 zone_t *zone = pgdat->node_zones + type;
- unsigned long nr, total, flags;
+ unsigned long nr, total, flags, reserved;

- total = 0;
+ reserved = total = 0;
                 if (zone->size) {
+ printk("Zone %s: ", zone->name);
                         spin_lock_irqsave(&zone->lock, flags);
                          for (order = 0; order < MAX_ORDER; order++) {
                                 head = &(zone->free_area + order)->free_list;
@@ -745,9 +867,10 @@
                                 printk("%lu*%lukB ", nr,
                                                 (PAGE_SIZE>>10) << order);
                         }
+ reserved = zone->reserved_pages;
                         spin_unlock_irqrestore(&zone->lock, flags);
                 }
- printk("= %lukB)\n", total * (PAGE_SIZE>>10));
+ printk("= %lukB) Reserved: %lukB\n", total * (PAGE_SIZE>>10), reserved * (PAGE_SIZE>>10));
         }

 #ifdef SWAP_CACHE_INFO
@@ -901,8 +1024,11 @@
                 zone->lock = SPIN_LOCK_UNLOCKED;
                 zone->zone_pgdat = pgdat;
                 zone->free_pages = 0;
+ zone->reserved_pages = 0;
                 zone->inactive_clean_pages = 0;
                 zone->inactive_dirty_pages = 0;
+ zone->reserved_pages = 0;
+ INIT_LIST_HEAD(&zone->depleted_rsv_list);
                 memlist_init(&zone->inactive_clean_list);
                 if (!size)
                         continue;
@@ -914,7 +1040,7 @@
                         mask = zone_balance_min[j];
                 else if (mask > zone_balance_max[j])
                         mask = zone_balance_max[j];
- zone->pages_min = mask;
+ zone->pages_min = 0;
                 zone->pages_low = mask*2;
                 zone->pages_high = mask*3;
                 /*
@@ -928,7 +1054,7 @@
                  * for people who require it to catch load spikes in eg.
                  * gigabit ethernet routing...
                  */
- freepages.min += mask;
+ freepages.min = 0;
                 freepages.low += mask*2;
                 freepages.high += mask*3;
                 zone->zone_mem_map = mem_map + offset;
@@ -955,12 +1081,16 @@
                         bitmap_size = size >> i;
                         bitmap_size = (bitmap_size + 7) >> 3;
                         bitmap_size = LONG_ALIGN(bitmap_size);
- bitmap_size *= 2;
+ bitmap_size *= 1;
                         zone->free_area[i].map =
                           (unsigned int *) alloc_bootmem_node(pgdat, bitmap_size);
                 }
         }
         build_zonelists(pgdat);
+
+ init_page_reservation(&irq_rsv, RSV_MULTISHOT, ZONE_NORMAL);
+ init_page_reservation(&swap_rsv, RSV_MULTISHOT, ZONE_NORMAL);
+ init_page_reservation(&atomic_rsv, RSV_MULTISHOT, ZONE_NORMAL);
 }

 void __init free_area_init(unsigned long *zones_size)
@@ -977,6 +1107,101 @@
         for (j = 0; j < MAX_NR_ZONES; j++) printk("%d ", zone_balance_ratio[j]);
         printk("\n");
         return 1;
+}
+
+void init_page_reservation(struct page_reservation *rsv, int flags, int zone)
+{
+static int foo; if (foo++ < 5) printk("init_page_reservation(%p, %d, %d)\n", rsv, flags, zone);
+ INIT_LIST_HEAD(&rsv->list);
+ rsv->avail = 0;
+ rsv->used = 0;
+ rsv->flags = flags;
+
+ /* FIXME: This doesn't work properly on NUMA or multizoned setups.
+ */
+ rsv->zone = pgdat_list->node_zones + zone;
+}
+
+void destroy_page_reservation(struct page_reservation *rsv)
+{
+ unsigned long flags;
+ zone_t *zone = rsv->zone;
+static int foo; if (foo++ < 5) printk("destroy_page_reservation(%p)\n", rsv);
+
+ spin_lock_irqsave(&zone->lock, flags);
+ zone->reserved_pages -= rsv->avail;
+ list_del_init(&rsv->list); /* This relies on list_del_init being used */
+ spin_unlock_irqrestore(&zone->lock, flags);
+ memset(rsv, 0x57, sizeof(*rsv));
+}
+
+int reserve_pages(struct page_reservation *rsv, int gfp_mask, unsigned count)
+{
+ unsigned long flags, free_pages;
+ zone_t *zone = rsv->zone;
+ unsigned orig = count;
+ int tries = 5;
+static int foo; if (foo++ < 5) printk("reserve_pages(%p, %d, %u)\n", rsv, gfp_mask, count);
+
+ spin_lock_irqsave(&zone->lock, flags);
+ free_pages = zone->free_pages - zone->reserved_pages;
+ if (free_pages > count)
+ free_pages = count;
+ count -= free_pages;
+ zone->reserved_pages += free_pages;
+
+ rsv->used += count;
+ if (count)
+ zone->pages_min++;
+ list_del_init(&rsv->list);
+ if (rsv->used)
+ list_add(&rsv->list, zone->depleted_rsv_list.prev);
+
+ spin_unlock_irqrestore(&zone->lock, flags);
+
+ while (--tries && rsv->used) {
+ try_to_free_pages(gfp_mask);
+ if ((gfp_mask & __GFP_WAIT) && !(current->flags & PF_ATOMICALLOC)) {
+ __set_current_state(TASK_RUNNING);
+ current->policy |= SCHED_YIELD;
+ schedule();
+ }
+ }
+
+ if (count) {
+ spin_lock_irqsave(&zone->lock, flags);
+ zone->pages_min--;
+ spin_unlock_irqrestore(&zone->lock, flags);
+ }
+
+ if (!rsv->used)
+ return 0;
+
+ put_reserved_pages(rsv, orig);
+ return -ENOMEM;
+}
+
+void put_reserved_pages(struct page_reservation *rsv, unsigned count)
+{
+ unsigned long flags;
+ zone_t *zone = rsv->zone;
+static int foo; if (foo++ < 5) printk("put_reserved_pages(%p, %u)\n", rsv, count);
+ spin_lock_irqsave(&zone->lock, flags);
+
+ if (rsv->used <= count) {
+ count -= rsv->used;
+ rsv->used = 0;
+ } else {
+ rsv->used -= count;
+ count = 0;
+ }
+
+ if (count > rsv->avail)
+ BUG();
+
+ rsv->avail -= count;
+ zone->reserved_pages -= count;
+ spin_unlock_irqrestore(&zone->lock, flags);
 }

 __setup("memfrac=", setup_mem_frac);
diff --exclude=.* --exclude=*.[^ch]* -urN kernel-2.4.3-works/linux.orig/mm/vmscan.c linux-toolbox-current/mm/vmscan.c
--- kernel-2.4.3-works/linux.orig/mm/vmscan.c Thu May 10 16:07:27 2001
+++ linux-toolbox-current/mm/vmscan.c Mon May 14 13:30:26 2001
@@ -21,9 +21,32 @@
 #include <linux/init.h>
 #include <linux/highmem.h>
 #include <linux/file.h>
+#include <linux/mm/reservation.h>

 #include <asm/pgalloc.h>

+extern struct page_reservation swap_rsv;
+
+int page_launder_calls,
+ page_launder_scans,
+ page_launder_scanned_pages,
+ page_launder_scanned_not_dirty,
+ page_launder_scanned_active,
+ page_launder_scanned_skipped,
+ page_launder_scanned_dirty,
+ page_launder_scanned_dirty_skip,
+ page_launder_scanned_dirty_nolaunder,
+ page_launder_scanned_dirty_swapcache,
+ page_launder_scanned_buffers,
+ page_launder_scanned_buffers_refiled,
+ page_launder_scanned_buffers_freed_buffer,
+ page_launder_scanned_buffers_active,
+ page_launder_scanned_buffers_cleaned,
+ page_launder_scanned_buffers_released,
+ page_launder_scanned_mapped_clean,
+ page_launder_scanned_still_active
+;
+
 /*
  * The swap-out function returns 1 if it successfully
  * scanned all the pages it was asked to (`count').
@@ -230,6 +253,14 @@
 {
         unsigned long address;
         struct vm_area_struct* vma;
+ unsigned long min_rss = atomic_read(&mm->in_fault_count);
+
+ min_rss *= 64;
+ if (min_rss > 256)
+ min_rss = 256;
+
+ if (mm->rss <= min_rss)
+ return;

         /*
          * Go through process' page directory.
@@ -466,11 +497,18 @@
         flushed_pages = 0;
         freed_pages = 0;

+ spin_lock(&pagemap_lru_lock);
+ page_launder_calls++;
+ spin_unlock(&pagemap_lru_lock);
+
 dirty_page_rescan:
         spin_lock(&pagemap_lru_lock);
+ page_launder_scans++;
+
         maxscan = nr_inactive_dirty_pages;
         while ((page_lru = inactive_dirty_list.prev) != &inactive_dirty_list &&
                                 maxscan-- > 0) {
+ page_launder_scanned_pages++;

                 page = list_entry(page_lru, struct page, lru);
                 zone = page->zone;
@@ -481,6 +519,7 @@
                         list_del(page_lru);
                         nr_inactive_dirty_pages--;
                         zone->inactive_dirty_pages--;
+ page_launder_scanned_not_dirty++;
                         continue;
                 }

@@ -490,6 +529,7 @@
                      page_ramdisk(page)) {
                         del_page_from_inactive_dirty_list(page);
                         add_page_to_active_list(page);
+ page_launder_scanned_active++;
                         continue;
                 }

@@ -505,7 +545,7 @@
                 if (launder_loop && !maxlaunder)
                         break;
                 if (launder_loop && zone->inactive_clean_pages +
- zone->free_pages > zone->pages_high)
+ zone->free_pages - zone->reserved_pages > zone->pages_high)
                         goto skip_page;

                 /*
@@ -514,6 +554,7 @@
                  */
                 if (TryLockPage(page)) {
 skip_page:
+ page_launder_scanned_skipped++;
                         list_del(page_lru);
                         list_add(page_lru, &inactive_dirty_list);
                         continue;
@@ -524,13 +565,19 @@
                  * last copy..
                  */
                 if (PageDirty(page)) {
+ struct page_reservation *saved_rsv;
                         int (*writepage)(struct page *) = page->mapping->a_ops->writepage;

- if (!writepage || !can_get_io_locks)
+ page_launder_scanned_dirty++;
+
+ if (!writepage || !can_get_io_locks) {
+ page_launder_scanned_dirty_skip++;
                                 goto page_active;
+ }

                         /* First time through? Move it to the back of the list */
                         if (!launder_loop) {
+ page_launder_scanned_dirty_nolaunder++;
                                 list_del(page_lru);
                                 list_add(page_lru, &inactive_dirty_list);
                                 UnlockPage(page);
@@ -542,10 +589,16 @@
                         page_cache_get(page);
                         spin_unlock(&pagemap_lru_lock);

+ saved_rsv = current->page_reservations;
+ current->page_reservations = &swap_rsv;
                         writepage(page);
+ current->page_reservations = saved_rsv;
+
                         /* XXX: all ->writepage()s should use nr_async_pages */
                         if (!PageSwapCache(page))
                                 flushed_pages++;
+ else
+ page_launder_scanned_dirty_swapcache++;
                         maxlaunder--;
                         page_cache_release(page);

@@ -565,6 +618,7 @@
                  */
                 if (page->buffers) {
                         int wait, clearedbuf;
+ page_launder_scanned_buffers++;
                         /*
                          * Since we might be doing disk IO, we have to
                          * drop the spinlock and take an extra reference
@@ -594,21 +648,25 @@

                         /* The buffers were not freed. */
                         if (!clearedbuf) {
+ page_launder_scanned_buffers_refiled++;
                                 add_page_to_inactive_dirty_list(page);
                                 if (wait)
                                         flushed_pages++;

                         /* The page was only in the buffer cache. */
                         } else if (!page->mapping) {
+ page_launder_scanned_buffers_freed_buffer++;
                                 atomic_dec(&buffermem_pages);
                                 freed_pages++;

                         /* The page has more users besides the cache and us. */
                         } else if (page_count(page) > 2) {
+ page_launder_scanned_buffers_active++;
                                 add_page_to_active_list(page);

                         /* OK, we "created" a freeable page. */
                         } else /* page->mapping && page_count(page) == 2 */ {
+ page_launder_scanned_buffers_cleaned++;
                                 add_page_to_inactive_clean_list(page);
                                 freed_pages++;
                         }
@@ -618,10 +676,12 @@
                          * We can only do it here because we are accessing
                          * the page struct above.
                          */
+ page_launder_scanned_buffers_released++;
                         UnlockPage(page);
                         page_cache_release(page);

                 } else if (page->mapping && !PageDirty(page)) {
+ page_launder_scanned_mapped_clean++;
                         /*
                          * If a page had an extra reference in
                          * deactivate_page(), we will find it here.
@@ -634,6 +694,7 @@
                         freed_pages++;

                 } else {
+ page_launder_scanned_still_active++;
 page_active:
                         /*
                          * OK, we don't know what to do with the page.
@@ -660,6 +721,13 @@
          */
         shortage = free_shortage();
         if (can_get_io_locks && !launder_loop && shortage) {
+ if (gfp_mask & __GFP_WAIT) {
+ __set_current_state(TASK_RUNNING);
+ current->policy |= SCHED_YIELD;
+ schedule();
+ }
+
+ shortage = free_shortage();
                 launder_loop = 1;

                 /*
@@ -835,10 +903,11 @@
                 for(i = 0; i < MAX_NR_ZONES; i++) {
                         zone_t *zone = pgdat->node_zones+ i;
                         if (zone->size && (zone->inactive_clean_pages +
- zone->free_pages < zone->pages_min+1)) {
+ zone->free_pages - zone->reserved_pages < zone->pages_min+1)) {
                                 /* + 1 to have overlap with alloc_pages() !! */
                                 sum += zone->pages_min + 1;
                                 sum -= zone->free_pages;
+ sum += zone->reserved_pages;
                                 sum -= zone->inactive_clean_pages;
                         }
                 }
@@ -881,6 +950,7 @@
                         zone_shortage -= zone->inactive_dirty_pages;
                         zone_shortage -= zone->inactive_clean_pages;
                         zone_shortage -= zone->free_pages;
+ zone_shortage += zone->reserved_pages;
                         if (zone_shortage > 0)
                                 shortage += zone_shortage;
                 }
@@ -1009,6 +1079,7 @@

 int kswapd(void *unused)
 {
+ static struct page_reservation kswapd_rsv;
         struct task_struct *tsk = current;

         tsk->session = 1;
@@ -1016,6 +1087,11 @@
         strcpy(tsk->comm, "kswapd");
         sigfillset(&tsk->blocked);
         kswapd_task = tsk;
+
+ init_page_reservation(&kswapd_rsv, RSV_MULTISHOT, ZONE_NORMAL);
+ if (reserve_pages(&kswapd_rsv, GFP_KERNEL, 32))
+ panic("kswapd unable to reserve emergency pages!\n");
+ tsk->page_reservations = &kswapd_rsv;

         /*
          * Tell the memory management that we're a "memory allocator",
@@ -1086,6 +1162,8 @@
                         oom_kill();
                 }
         }
+
+ destroy_page_reservation(&kswapd_rsv);
 }

 void wakeup_kswapd(void)
@@ -1102,6 +1180,10 @@
 int try_to_free_pages(unsigned int gfp_mask)
 {
         int ret = 1;
+ struct page_reservation *rsv = current->page_reservations;
+
+ if (!rsv)
+ current->page_reservations = &swap_rsv;

         if (gfp_mask & __GFP_WAIT) {
                 unsigned long caller_memalloc = current->flags & PF_MEMALLOC;
@@ -1111,6 +1193,8 @@
                 current->flags |= caller_memalloc;
         }

+ current->page_reservations = rsv;
+
         return ret;
 }

@@ -1151,7 +1235,7 @@
                                 if (!zone->size)
                                         continue;

- while (zone->free_pages < zone->pages_low) {
+ while (zone->free_pages - zone->reserved_pages < zone->pages_low) {
                                         struct page * page;
                                         page = reclaim_page(zone);
                                         if (!page)
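
The other pattern used above (in do_IRQ() and page_launder()) is to borrow
one of the global reservations for the duration of a call that must not
fail for lack of memory, then restore whatever the task had before.
Something like the following (the wrapper name is made up; swap_rsv is the
reservation defined in mm/page_alloc.c above):

	extern struct page_reservation swap_rsv;

	static int write_page_with_reserve(struct page *page)
	{
		struct page_reservation *saved_rsv;
		int ret;

		/* Switch to the swap reservation so allocations done
		 * inside ->writepage() can be satisfied from the
		 * reserve if the zone is otherwise depleted. */
		saved_rsv = current->page_reservations;
		current->page_reservations = &swap_rsv;

		ret = page->mapping->a_ops->writepage(page);

		current->page_reservations = saved_rsv;
		return ret;
	}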
