[patch] Re: bug in ATOMIC swapout (make_request()) hidden without oom-arca

Andrea Arcangeli (andrea@e-mind.com)
Thu, 8 Oct 1998 19:02:46 +0200 (CEST)


On Thu, 8 Oct 1998, Linus Torvalds wrote:

>
>
>On Thu, 8 Oct 1998, Andrea Arcangeli wrote:
>>
>> There's a bug in the swapout code if we can't sleep (if __GFP_WAIT is
>> not set).
>
>If __GFP_WAIT is not set, then we shouldn't ever have gotten to the
>swapout code in the first place.

And btw, I don't think it's a great win to be able to swap out asynchronously
inside an irq handler, and right now it never happens anyway because kswapd
locks the machine first (with the stock kernel, if do_try_to_free_page() were
called atomically you would at least see the kmem_cache_reap() _KERN_ERR_).

We still need to be able to rw_swap_page() asynchronously, though: it allows
kswapd to swap out fast, so the code for async swapout is still needed and
makes sense.

I think I have fixed the problem properly:

I added a new GFP_KSWAPD that is __GFP_KSWAPD | __GFP_WAIT. This way we
reject, in swap_out() and shm_swap(), every attempt to swap out from an
ATOMIC context, while still being able to distinguish between async and sync
swapout. try_to_free_pages() needs the sync approach because the pages must be
freed before retrying to get a page; _only_ kswapd still needs to swap out
asynchronously. The distinction is made in this line:

rw_swap_page(WRITE, entry, (char *) page,
(gfp_mask & GFP_KSWAPD) != GFP_KSWAPD);

And kswapd now simply calls do_try_to_free_page(GFP_KSWAPD).
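
To make the logic explicit, here is the mask arithmetic in isolation (a
user-space sketch, not kernel code; the __GFP_WAIT value and the
GFP_ATOMIC/GFP_KERNEL compositions are assumed from the stock 2.1 headers,
only __GFP_KSWAPD is new in the patch):

#include <stdio.h>

/* Sketch only: numeric values assumed from the stock 2.1 mm.h, except
 * __GFP_KSWAPD, which this patch adds. */
#define __GFP_WAIT    0x01
#define __GFP_MED     0x04
#define __GFP_HIGH    0x08
#define __GFP_KSWAPD  0x40

#define GFP_ATOMIC    (__GFP_HIGH)                 /* no __GFP_WAIT: atomic */
#define GFP_KERNEL    (__GFP_MED | __GFP_WAIT)     /* assumed composition */
#define GFP_KSWAPD    (__GFP_KSWAPD | __GFP_WAIT)  /* added by the patch */

/* swap_out()/shm_swap() now bail out early for atomic callers. */
static int may_swapout(int gfp_mask)
{
	return (gfp_mask & __GFP_WAIT) != 0;
}

/* The expression passed as rw_swap_page()'s "wait" argument: it is zero
 * (asynchronous write) only when both bits of GFP_KSWAPD are set, i.e.
 * only for kswapd; every other sleeping caller gets a synchronous write. */
static int swap_wait(int gfp_mask)
{
	return (gfp_mask & GFP_KSWAPD) != GFP_KSWAPD;
}

int main(void)
{
	printf("GFP_ATOMIC: may_swapout=%d\n", may_swapout(GFP_ATOMIC));
	printf("GFP_KERNEL: may_swapout=%d wait=%d\n",
	       may_swapout(GFP_KERNEL), swap_wait(GFP_KERNEL));
	printf("GFP_KSWAPD: may_swapout=%d wait=%d\n",
	       may_swapout(GFP_KSWAPD), swap_wait(GFP_KSWAPD));
	return 0;
}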

The side effect of this change is that kswapd is now much more aggressive,
since it will run at do_try_to_free_page() priorities 3 to 0 (instead of 6
to 3), but this can only be better: everything still seems to work fine with
respect to the deadlock problem, and being more aggressive in kswapd makes a
lot of sense because I now put kswapd to sleep at the first failure of
do_try_to_free_page() (so Rik should be happier).
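
To see concretely why the lower priority numbers mean more work per pass:
shm_swap()'s per-pass counter is shm_rss >> prio (visible in the ipc/shm.c
context below), so a toy user-space calculation with a made-up shm_rss
compares the old 6..3 sweep with the new 3..0 one:

#include <stdio.h>

/* Toy illustration only: `shm_rss' is a made-up value; the point is just
 * that a counter of the form (rss >> prio) grows quickly as prio drops. */
int main(void)
{
	unsigned long shm_rss = 20000;
	int prio;

	printf("old sweep (prio 6..3):");
	for (prio = 6; prio >= 3; prio--)
		printf(" %lu", shm_rss >> prio);
	printf("\nnew sweep (prio 3..0):");
	for (prio = 3; prio >= 0; prio--)
		printf(" %lu", shm_rss >> prio);
	printf("\n");
	return 0;
}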

The make_request() Oops obviously disappeared.

This is the new oom-10 patch, with the swapout-if-not-atomic bugfix described
above included (it seems to work fine here).

Andrea[s] Arcangeli

Index: linux/mm/filemap.c
diff -u linux/mm/filemap.c:1.3 linux/mm/filemap.c:1.3.2.6
--- linux/mm/filemap.c:1.3 Sun Oct 4 16:27:52 1998
+++ linux/mm/filemap.c Wed Oct 7 22:43:39 1998
@@ -153,7 +153,7 @@
} while (tmp != bh);

/* Refuse to swap out all buffer pages */
- if ((buffermem >> PAGE_SHIFT) * 100 < (buffer_mem.min_percent * num_physpages))
+ if (buffer_under_min())
goto next;
}

@@ -174,7 +174,7 @@
age_page(page);
if (page->age)
break;
- if (page_cache_size * 100 < (page_cache.min_percent * num_physpages))
+ if (pgcache_under_min())
break;
if (PageSwapCache(page)) {
delete_from_swap_cache(page);
Index: linux/mm/page_alloc.c
diff -u linux/mm/page_alloc.c:1.1.1.1 linux/mm/page_alloc.c:1.1.1.1.10.2
--- linux/mm/page_alloc.c:1.1.1.1 Fri Oct 2 19:22:39 1998
+++ linux/mm/page_alloc.c Tue Oct 6 21:14:37 1998
@@ -237,11 +237,12 @@
unsigned long __get_free_pages(int gfp_mask, unsigned long order)
{
unsigned long flags;
+ int again = 0;

if (order >= NR_MEM_LISTS)
goto nopage;

- if (gfp_mask & __GFP_WAIT) {
+ if (gfp_mask & __GFP_WAIT)
if (in_interrupt()) {
static int count = 0;
if (++count < 5) {
@@ -249,33 +250,19 @@
__builtin_return_address(0));
}
goto nopage;
- }
-
- if (freepages.min > nr_free_pages) {
- int freed;
- freed = try_to_free_pages(gfp_mask, SWAP_CLUSTER_MAX);
- /*
- * Low priority (user) allocations must not
- * succeed if we didn't have enough memory
- * and we couldn't get more..
- */
- if (!freed && !(gfp_mask & (__GFP_MED | __GFP_HIGH)))
- goto nopage;
}
- }
+ again:
spin_lock_irqsave(&page_alloc_lock, flags);
RMQUEUE(order, (gfp_mask & GFP_DMA));
spin_unlock_irqrestore(&page_alloc_lock, flags);

- /*
- * If we failed to find anything, we'll return NULL, but we'll
- * wake up kswapd _now_ ad even wait for it synchronously if
- * we can.. This way we'll at least make some forward progress
- * over time.
- */
- wake_up(&kswapd_wait);
- if (gfp_mask & __GFP_WAIT)
- schedule();
+ if (!again && nr_free_pages < freepages.min)
+ {
+ again = 1;
+ if (try_to_free_pages(gfp_mask, SWAP_CLUSTER_MAX))
+ goto again;
+ }
+
nopage:
return 0;
}
Index: linux/mm/vmscan.c
diff -u linux/mm/vmscan.c:1.4 linux/mm/vmscan.c:1.4.2.12
--- linux/mm/vmscan.c:1.4 Sun Oct 4 16:27:52 1998
+++ linux/mm/vmscan.c Thu Oct 8 18:24:37 1998
@@ -188,7 +188,8 @@
have been careful not to stall until here */
set_bit(PG_locked, &page_map->flags);
/* OK, do a physical write to swap. */
- rw_swap_page(WRITE, entry, (char *) page, (gfp_mask & __GFP_WAIT));
+ rw_swap_page(WRITE, entry, (char *) page,
+ (gfp_mask & GFP_KSWAPD) != GFP_KSWAPD);
}
/* Now we can free the current physical page. We also
* free up the swap cache if this is the last use of the
@@ -374,6 +375,12 @@
struct task_struct * p, * pbest;
int counter, assign, max_cnt;

+ /*
+ * Don't waste time if there's no swap or we can't swapout. -arca
+ */
+ if (!(gfp_mask & __GFP_WAIT) || !nr_swap_pages)
+ return 0;
+
/*
* We make one or two passes through the task list, indexed by
* assign = {0, 1}:
@@ -447,40 +454,42 @@
static int do_try_to_free_page(int gfp_mask)
{
static int state = 0;
- int i=6;
- int stop;
+ int from_prio, to_prio;

- /* Always trim SLAB caches when memory gets low. */
- kmem_cache_reap(gfp_mask);
-
/* We try harder if we are waiting .. */
- stop = 3;
if (gfp_mask & __GFP_WAIT)
- stop = 0;
+ {
+ /* Trim SLAB caches when memory gets low and we can wait. */
+ kmem_cache_reap(gfp_mask);
+ from_prio = 3;
+ to_prio = 0;
+ } else {
+ from_prio = 6;
+ to_prio = 3;
+ }

- if (((buffermem >> PAGE_SHIFT) * 100 > buffer_mem.borrow_percent * num_physpages)
- || (page_cache_size * 100 > page_cache.borrow_percent * num_physpages))
- shrink_mmap(i, gfp_mask);
+ if (buffer_over_borrow() || pgcache_over_borrow())
+ state = 0;

switch (state) {
do {
case 0:
- if (shrink_mmap(i, gfp_mask))
+ if (shrink_mmap(from_prio, gfp_mask))
return 1;
state = 1;
case 1:
- if (shm_swap(i, gfp_mask))
+ if (shm_swap(from_prio, gfp_mask))
return 1;
state = 2;
case 2:
- if (swap_out(i, gfp_mask))
+ if (swap_out(from_prio, gfp_mask))
return 1;
state = 3;
case 3:
- shrink_dcache_memory(i, gfp_mask);
+ shrink_dcache_memory(from_prio, gfp_mask);
state = 0;
- i--;
- } while ((i - stop) >= 0);
+ from_prio--;
+ } while (from_prio >= to_prio);
}
return 0;
}
@@ -524,7 +533,6 @@
lock_kernel();

/* Give kswapd a realtime priority. */
- current->policy = SCHED_FIFO;
current->rt_priority = 32; /* Fixme --- we need to standardise our
namings for POSIX.4 realtime scheduling
priorities. */
@@ -546,12 +554,16 @@
init_swap_timer();
add_wait_queue(&kswapd_wait, &wait);
while (1) {
- int tries;
+ int tries, free_memory, count;

current->state = TASK_INTERRUPTIBLE;
flush_signals(current);
run_task_queue(&tq_disk);
+ timer_active |= 1<<SWAP_TIMER;
+ current->policy = SCHED_FIFO;
schedule();
+ current->policy = SCHED_OTHER;
+ timer_active &= ~(1<<SWAP_TIMER);
swapstats.wakeups++;

/*
@@ -570,11 +582,20 @@
* woken up more often and the rate will be even
* higher).
*/
- tries = pager_daemon.tries_base;
- tries >>= 4*free_memory_available();
+ free_memory = free_memory_available();

- do {
- do_try_to_free_page(0);
+ if (free_memory == 2)
+ continue;
+ tries = pager_daemon.tries_base >> (free_memory + 2);
+
+ for (count = 0; count < tries; count++)
+ {
+ /*
+ * If we can't free one page we won't be able
+ * to free `tries' pages. -arca
+ */
+ if (!do_try_to_free_page(GFP_KSWAPD))
+ break;
/*
* Syncing large chunks is faster than swapping
* synchronously (less head movement). -- Rik.
@@ -583,7 +604,7 @@
run_task_queue(&tq_disk);
if (free_memory_available() > 1)
break;
- } while (--tries > 0);
+ }
}
/* As if we could ever get here - maybe we want to make this killable */
remove_wait_queue(&kswapd_wait, &wait);
@@ -598,22 +619,22 @@
*
* The "PF_MEMALLOC" flag protects us against recursion:
* if we need more memory as part of a swap-out effort we
- * will just silently return "success" to tell the page
- * allocator to accept the allocation.
+ * will just silently return "fail" to tell the page
+ * allocator that we are OOM.
*/
int try_to_free_pages(unsigned int gfp_mask, int count)
{
- int retval = 1;
+ int retval = 0;

lock_kernel();
if (!(current->flags & PF_MEMALLOC)) {
current->flags |= PF_MEMALLOC;
- do {
+ while (count--)
+ {
retval = do_try_to_free_page(gfp_mask);
if (!retval)
break;
- count--;
- } while (count > 0);
+ }
current->flags &= ~PF_MEMALLOC;
}
unlock_kernel();
@@ -649,11 +670,11 @@
}

if ((long) (now - want) >= 0) {
- if (want_wakeup || (num_physpages * buffer_mem.max_percent) < (buffermem >> PAGE_SHIFT) * 100
- || (num_physpages * page_cache.max_percent < page_cache_size * 100)) {
+ if (want_wakeup || buffer_over_max() || pgcache_over_max())
+ {
/* Set the next wake-up time */
next_swap_jiffies = now + swapout_interval;
- wake_up(&kswapd_wait);
+ kswapd_wakeup();
}
}
timer_active |= (1<<SWAP_TIMER);
Index: linux/include/linux/mm.h
diff -u linux/include/linux/mm.h:1.3 linux/include/linux/mm.h:1.3.2.5
--- linux/include/linux/mm.h:1.3 Sun Oct 4 16:27:49 1998
+++ linux/include/linux/mm.h Thu Oct 8 18:24:40 1998
@@ -317,6 +317,7 @@
#define __GFP_MED 0x04
#define __GFP_HIGH 0x08

+#define __GFP_KSWAPD 0x40
#define __GFP_DMA 0x80

#define GFP_BUFFER (__GFP_LOW | __GFP_WAIT)
@@ -329,6 +330,7 @@
platforms, used as appropriate on others */

#define GFP_DMA __GFP_DMA
+#define GFP_KSWAPD (__GFP_KSWAPD | __GFP_WAIT)

/* vma is the first one with address < vma->vm_end,
* and even address < vma->vm_start. Have to extend vma. */
@@ -379,6 +381,36 @@
vma = NULL;
return vma;
}
+
+extern __inline__ void kswapd_wakeup(void)
+{
+ wake_up(&kswapd_wait);
+}
+
+#define buffer_under_min() ((buffermem >> PAGE_SHIFT) * 100 < \
+ buffer_mem.min_percent * num_physpages)
+#define buffer_under_borrow() ((buffermem >> PAGE_SHIFT) * 100 < \
+ buffer_mem.borrow_percent * num_physpages)
+#define buffer_under_max() ((buffermem >> PAGE_SHIFT) * 100 < \
+ buffer_mem.max_percent * num_physpages)
+#define buffer_over_min() ((buffermem >> PAGE_SHIFT) * 100 > \
+ buffer_mem.min_percent * num_physpages)
+#define buffer_over_borrow() ((buffermem >> PAGE_SHIFT) * 100 > \
+ buffer_mem.borrow_percent * num_physpages)
+#define buffer_over_max() ((buffermem >> PAGE_SHIFT) * 100 > \
+ buffer_mem.max_percent * num_physpages)
+#define pgcache_under_min() (page_cache_size * 100 < \
+ page_cache.min_percent * num_physpages)
+#define pgcache_under_borrow() (page_cache_size * 100 < \
+ page_cache.borrow_percent * num_physpages)
+#define pgcache_under_max() (page_cache_size * 100 < \
+ page_cache.max_percent * num_physpages)
+#define pgcache_over_min() (page_cache_size * 100 > \
+ page_cache.min_percent * num_physpages)
+#define pgcache_over_borrow() (page_cache_size * 100 > \
+ page_cache.borrow_percent * num_physpages)
+#define pgcache_over_max() (page_cache_size * 100 > \
+ page_cache.max_percent * num_physpages)

#endif /* __KERNEL__ */

Index: linux/kernel/fork.c
diff -u linux/kernel/fork.c:1.2 linux/kernel/fork.c:1.2.2.2
--- linux/kernel/fork.c:1.2 Mon Oct 5 01:08:12 1998
+++ linux/kernel/fork.c Thu Oct 8 14:41:11 1998
@@ -296,6 +296,8 @@
exit_mmap(mm);
free_page_tables(mm);
kmem_cache_free(mm_cachep, mm);
+ if (!free_memory_available())
+ kswapd_wakeup();
}
}

Index: linux/ipc/shm.c
diff -u linux/ipc/shm.c:1.1.1.1 linux/ipc/shm.c:1.1.1.1.8.2
--- linux/ipc/shm.c:1.1.1.1 Fri Oct 2 19:23:03 1998
+++ linux/ipc/shm.c Thu Oct 8 17:59:59 1998
@@ -737,6 +737,12 @@
int loop = 0;
int counter;

+ /*
+ * Don't waste time if there's no swap or we can't swapout. -arca
+ */
+ if (!(gfp_mask & __GFP_WAIT) || !nr_swap_pages)
+ return 0;
+
counter = shm_rss >> prio;
if (!counter || !(swap_nr = get_swap_page()))
return 0;

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.rutgers.edu
Please read the FAQ at http://www.tux.org/lkml/