Re: [patch] Re: bug in ATOMIC swapout (make_request()) hided without oom-arca

Andrea Arcangeli (andrea@e-mind.com)
Thu, 8 Oct 1998 21:05:07 +0200 (CEST)


On Thu, 8 Oct 1998, Linus Torvalds wrote:

>
>
>On Thu, 8 Oct 1998, Andrea Arcangeli wrote:
>>
>> And btw, I think it's not a great win to be able to swap out
>> asynchronously inside an irq handler, and right now it never happens
>> anyway because kswapd locks the machine first (with the stock kernel,
>> if do_try_to_free_page() is called non-atomically you should see at
>> least the kmem_cache_reap() _KERN_ERR_).
>
>The whole reason you have this problem is that you completely broke
>page_alloc.c by not looking at __GFP_WAIT correctly. Undo those broken
>changes and I can at least look at the patch..

Whoops, stupid me! I had the problem right under my eyes all along...

Sorry for the wasted time.

This patch at least fixes the __get_free_pages() bug I had introduced...

I also removed GFP_KSWAPD since _I_ don't need more pressure from
kswapd (with 32 or 64 Mbyte of RAM). If kswapd's pressure is not
enough on high-end machines we can start doing _many_ things...
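
To make the allocator change easier to read, this is roughly what
__get_free_pages() ends up looking like with the patch applied (just a
cleaned-up sketch of the page_alloc.c hunk below, no names or behaviour
beyond what is already in the diff):

unsigned long __get_free_pages(int gfp_mask, unsigned long order)
{
	unsigned long flags;
	int again = 0;
	int wait = gfp_mask & __GFP_WAIT;

	if (order >= NR_MEM_LISTS)
		goto nopage;

	/* sleeping allocations are never legal from irq context */
	if (wait && in_interrupt()) {
		printk("gfp called nonatomically from interrupt %p\n",
		       __builtin_return_address(0));
		goto nopage;
	}
 again:
	spin_lock_irqsave(&page_alloc_lock, flags);
	/* RMQUEUE() returns from the function with the page on success */
	RMQUEUE(order, (gfp_mask & GFP_DMA));
	spin_unlock_irqrestore(&page_alloc_lock, flags);

	/*
	 * First failure on a __GFP_WAIT allocation: try to free some
	 * memory synchronously and retry exactly once.  If
	 * try_to_free_pages() could not free anything we are really
	 * OOM and fall through to return 0.
	 */
	if (!again && wait) {
		again = 1;
		if (try_to_free_pages(gfp_mask, SWAP_CLUSTER_MAX))
			goto again;
	}
 nopage:
	return 0;
}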

Andrea[s] Arcangeli

Index: linux/mm/filemap.c
diff -u linux/mm/filemap.c:1.3 linux/mm/filemap.c:1.3.2.6
--- linux/mm/filemap.c:1.3 Sun Oct 4 16:27:52 1998
+++ linux/mm/filemap.c Wed Oct 7 22:43:39 1998
@@ -153,7 +153,7 @@
} while (tmp != bh);

/* Refuse to swap out all buffer pages */
- if ((buffermem >> PAGE_SHIFT) * 100 < (buffer_mem.min_percent * num_physpages))
+ if (buffer_under_min())
goto next;
}

@@ -174,7 +174,7 @@
age_page(page);
if (page->age)
break;
- if (page_cache_size * 100 < (page_cache.min_percent * num_physpages))
+ if (pgcache_under_min())
break;
if (PageSwapCache(page)) {
delete_from_swap_cache(page);
Index: linux/mm/page_alloc.c
diff -u linux/mm/page_alloc.c:1.1.1.1 linux/mm/page_alloc.c:1.1.1.1.10.3
--- linux/mm/page_alloc.c:1.1.1.1 Fri Oct 2 19:22:39 1998
+++ linux/mm/page_alloc.c Thu Oct 8 20:43:11 1998
@@ -237,45 +237,29 @@
unsigned long __get_free_pages(int gfp_mask, unsigned long order)
{
unsigned long flags;
+ int again = 0;
+ int wait = gfp_mask & __GFP_WAIT;

if (order >= NR_MEM_LISTS)
goto nopage;

- if (gfp_mask & __GFP_WAIT) {
- if (in_interrupt()) {
- static int count = 0;
- if (++count < 5) {
- printk("gfp called nonatomically from interrupt %p\n",
- __builtin_return_address(0));
- }
- goto nopage;
- }
-
- if (freepages.min > nr_free_pages) {
- int freed;
- freed = try_to_free_pages(gfp_mask, SWAP_CLUSTER_MAX);
- /*
- * Low priority (user) allocations must not
- * succeed if we didn't have enough memory
- * and we couldn't get more..
- */
- if (!freed && !(gfp_mask & (__GFP_MED | __GFP_HIGH)))
- goto nopage;
- }
+ if (wait && in_interrupt()) {
+ printk("gfp called nonatomically from interrupt %p\n",
+ __builtin_return_address(0));
+ goto nopage;
}
+ again:
spin_lock_irqsave(&page_alloc_lock, flags);
RMQUEUE(order, (gfp_mask & GFP_DMA));
spin_unlock_irqrestore(&page_alloc_lock, flags);
+
+ if (!again && wait)
+ {
+ again = 1;
+ if (try_to_free_pages(gfp_mask, SWAP_CLUSTER_MAX))
+ goto again;
+ }

- /*
- * If we failed to find anything, we'll return NULL, but we'll
- * wake up kswapd _now_ ad even wait for it synchronously if
- * we can.. This way we'll at least make some forward progress
- * over time.
- */
- wake_up(&kswapd_wait);
- if (gfp_mask & __GFP_WAIT)
- schedule();
nopage:
return 0;
}
Index: linux/mm/vmscan.c
diff -u linux/mm/vmscan.c:1.4 linux/mm/vmscan.c:1.4.2.14
--- linux/mm/vmscan.c:1.4 Sun Oct 4 16:27:52 1998
+++ linux/mm/vmscan.c Thu Oct 8 20:53:23 1998
@@ -447,40 +447,43 @@
static int do_try_to_free_page(int gfp_mask)
{
static int state = 0;
- int i=6;
- int stop;
+ int from_prio, to_prio;

/* Always trim SLAB caches when memory gets low. */
kmem_cache_reap(gfp_mask);

/* We try harder if we are waiting .. */
- stop = 3;
if (gfp_mask & __GFP_WAIT)
- stop = 0;
+ {
+ from_prio = 3;
+ to_prio = 0;
+ } else {
+ from_prio = 6;
+ to_prio = 3;
+ }

- if (((buffermem >> PAGE_SHIFT) * 100 > buffer_mem.borrow_percent * num_physpages)
- || (page_cache_size * 100 > page_cache.borrow_percent * num_physpages))
- shrink_mmap(i, gfp_mask);
+ if (buffer_over_borrow() || pgcache_over_borrow())
+ state = 0;

switch (state) {
do {
case 0:
- if (shrink_mmap(i, gfp_mask))
+ if (shrink_mmap(from_prio, gfp_mask))
return 1;
state = 1;
case 1:
- if (shm_swap(i, gfp_mask))
+ if (shm_swap(from_prio, gfp_mask))
return 1;
state = 2;
case 2:
- if (swap_out(i, gfp_mask))
+ if (swap_out(from_prio, gfp_mask))
return 1;
state = 3;
case 3:
- shrink_dcache_memory(i, gfp_mask);
+ shrink_dcache_memory(from_prio, gfp_mask);
state = 0;
- i--;
- } while ((i - stop) >= 0);
+ from_prio--;
+ } while (from_prio >= to_prio);
}
return 0;
}
@@ -524,7 +527,6 @@
lock_kernel();

/* Give kswapd a realtime priority. */
- current->policy = SCHED_FIFO;
current->rt_priority = 32; /* Fixme --- we need to standardise our
namings for POSIX.4 realtime scheduling
priorities. */
@@ -546,12 +548,16 @@
init_swap_timer();
add_wait_queue(&kswapd_wait, &wait);
while (1) {
- int tries;
+ int tries, free_memory, count;

current->state = TASK_INTERRUPTIBLE;
flush_signals(current);
run_task_queue(&tq_disk);
+ timer_active |= 1<<SWAP_TIMER;
+ current->policy = SCHED_FIFO;
schedule();
+ current->policy = SCHED_OTHER;
+ timer_active &= ~(1<<SWAP_TIMER);
swapstats.wakeups++;

/*
@@ -570,12 +576,21 @@
* woken up more often and the rate will be even
* higher).
*/
- tries = pager_daemon.tries_base;
- tries >>= 4*free_memory_available();
+ free_memory = free_memory_available();

- do {
- do_try_to_free_page(0);
+ if (free_memory == 2)
+ continue;
+ tries = pager_daemon.tries_base >> (free_memory + 2);
+
+ for (count = 0; count < tries; count++)
+ {
/*
+ * If we can't free a single page we won't be able
+ * to free tries pages. -arca
+ */
+ if (!do_try_to_free_page(0))
+ break;
+ /*
* Syncing large chunks is faster than swapping
* synchronously (less head movement). -- Rik.
*/
@@ -583,7 +598,7 @@
run_task_queue(&tq_disk);
if (free_memory_available() > 1)
break;
- } while (--tries > 0);
+ }
}
/* As if we could ever get here - maybe we want to make this killable */
remove_wait_queue(&kswapd_wait, &wait);
@@ -598,22 +613,22 @@
*
* The "PF_MEMALLOC" flag protects us against recursion:
* if we need more memory as part of a swap-out effort we
- * will just silently return "success" to tell the page
- * allocator to accept the allocation.
+ * will just silently return "fail" to tell the page
+ * allocator that we are OOM.
*/
int try_to_free_pages(unsigned int gfp_mask, int count)
{
- int retval = 1;
+ int retval = 0;

lock_kernel();
if (!(current->flags & PF_MEMALLOC)) {
current->flags |= PF_MEMALLOC;
- do {
+ while (count--)
+ {
retval = do_try_to_free_page(gfp_mask);
if (!retval)
break;
- count--;
- } while (count > 0);
+ }
current->flags &= ~PF_MEMALLOC;
}
unlock_kernel();
@@ -649,11 +664,11 @@
}

if ((long) (now - want) >= 0) {
- if (want_wakeup || (num_physpages * buffer_mem.max_percent) < (buffermem >> PAGE_SHIFT) * 100
- || (num_physpages * page_cache.max_percent < page_cache_size * 100)) {
+ if (want_wakeup || buffer_over_max() || pgcache_over_max())
+ {
/* Set the next wake-up time */
next_swap_jiffies = now + swapout_interval;
- wake_up(&kswapd_wait);
+ kswapd_wakeup();
}
}
timer_active |= (1<<SWAP_TIMER);
Index: linux/include/linux/mm.h
diff -u linux/include/linux/mm.h:1.3 linux/include/linux/mm.h:1.3.2.6
--- linux/include/linux/mm.h:1.3 Sun Oct 4 16:27:49 1998
+++ linux/include/linux/mm.h Thu Oct 8 20:43:13 1998
@@ -380,6 +380,36 @@
return vma;
}

+extern __inline__ void kswapd_wakeup(void)
+{
+ wake_up(&kswapd_wait);
+}
+
+#define buffer_under_min() ((buffermem >> PAGE_SHIFT) * 100 < \
+ buffer_mem.min_percent * num_physpages)
+#define buffer_under_borrow() ((buffermem >> PAGE_SHIFT) * 100 < \
+ buffer_mem.borrow_percent * num_physpages)
+#define buffer_under_max() ((buffermem >> PAGE_SHIFT) * 100 < \
+ buffer_mem.max_percent * num_physpages)
+#define buffer_over_min() ((buffermem >> PAGE_SHIFT) * 100 > \
+ buffer_mem.min_percent * num_physpages)
+#define buffer_over_borrow() ((buffermem >> PAGE_SHIFT) * 100 > \
+ buffer_mem.borrow_percent * num_physpages)
+#define buffer_over_max() ((buffermem >> PAGE_SHIFT) * 100 > \
+ buffer_mem.max_percent * num_physpages)
+#define pgcache_under_min() (page_cache_size * 100 < \
+ page_cache.min_percent * num_physpages)
+#define pgcache_under_borrow() (page_cache_size * 100 < \
+ page_cache.borrow_percent * num_physpages)
+#define pgcache_under_max() (page_cache_size * 100 < \
+ page_cache.max_percent * num_physpages)
+#define pgcache_over_min() (page_cache_size * 100 > \
+ page_cache.min_percent * num_physpages)
+#define pgcache_over_borrow() (page_cache_size * 100 > \
+ page_cache.borrow_percent * num_physpages)
+#define pgcache_over_max() (page_cache_size * 100 > \
+ page_cache.max_percent * num_physpages)
+
#endif /* __KERNEL__ */

#endif
Index: linux/kernel/fork.c
diff -u linux/kernel/fork.c:1.2 linux/kernel/fork.c:1.2.2.2
--- linux/kernel/fork.c:1.2 Mon Oct 5 01:08:12 1998
+++ linux/kernel/fork.c Thu Oct 8 14:41:11 1998
@@ -296,6 +296,8 @@
exit_mmap(mm);
free_page_tables(mm);
kmem_cache_free(mm_cachep, mm);
+ if (!free_memory_available())
+ kswapd_wakeup();
}
}
