Re: kswap kernel daemon questions/thoughts

Rik van Riel (H.H.vanRiel@fys.ruu.nl)
Wed, 18 Feb 1998 15:24:19 +0100 (MET)


On Wed, 18 Feb 1998, Ka'Plaagh wrote:

> looking at the kswap daemon code I noticed something a little strange.
> Basically, when it decides that it should run, it tries to free either 3 or 6
> pages (depending on which tide mark it just passed, free_pages_high or
> free_pages_low). It tries a number of methods in turn; these are:

No, it only frees one page at a time. The 3 or 6 is the number of
_tries_; once a try succeeds, the function exits (notice the
'return 1's after the swapout calls).
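For reference, the logic looks roughly like this (a simplified sketch
from memory, not the verbatim 2.1.x source; the state machine discussed
further down is left out here):

/*
 * Rough sketch of try_to_free_page() in 2.1.x -- simplified, not the
 * verbatim source.  The counter bounds the number of *attempts*; the
 * first page actually freed ends the call.
 */
int try_to_free_page(int gfp_mask)
{
	/* try harder (6 attempts instead of 3) if the caller may block */
	int tries = (gfp_mask & __GFP_WAIT) ? 6 : 3;

	while (tries--) {
		if (shrink_mmap(tries, gfp_mask))
			return 1;	/* freed a buffer/page cache page */
		if (shm_swap(tries, gfp_mask))
			return 1;	/* swapped out a shared page */
		if (swap_out(tries, gfp_mask))
			return 1;	/* swapped out/discarded a page */
	}
	return 0;	/* nothing could be freed */
}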

> reducing the sizes of the buffer and page caches
> swapping out shared pages
> swapping out or discarding pages
>
> It 'remembers' which of these it was doing the last time that it
> tried to free physical pages in the system and starts from there again this
> time. The kswap daemon runs periodically and checks the state of
> the system's memory; it runs more often if there are fewer than free_pages_low
> free pages.

No, the kswapd daemon doesn't run periodically. The function swap_tick()
gets called on every clock tick and decides whether kswapd needs to
be woken up. If nr_free_pages < free_pages_low, kswapd is woken
immediately, no matter what. If free_pages_low < nr_free_pages <
free_pages_high, then kswapd is only woken if at least swapout_interval
has passed since its previous invocation.
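In code, the decision looks roughly like this (a simplified sketch; the
first lines match the context you can see in the swap_tick() hunk of
the patch below):

/*
 * Sketch of swap_tick(), called from the timer interrupt on every
 * clock tick -- simplified, not the verbatim 2.1.x source.
 */
void swap_tick(void)
{
	int want_wakeup = 0, memory_low = 0;
	int pages = nr_free_pages + atomic_read(&nr_async_pages);

	if (pages < free_pages_low)
		memory_low = want_wakeup = 1;	/* wake kswapd immediately */
	else if (pages < free_pages_high && jiffies >= next_swap_jiffies)
		want_wakeup = 1;	/* rate-limited wakeup */

	if (want_wakeup) {
		if (!kswapd_awake)
			wake_up(&kswapd_wait);
		/* urgent wakeups bypass the swapout_interval rate limit */
		next_swap_jiffies = jiffies +
				(memory_low ? 0 : swapout_interval);
	}
}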

> There's a little state machine that does the remembering. If one
> method fails, it moves on to the next method. Starting from the last place that it
> tried saves it needlessly retrying something that may not work (I guess).
> However, shouldn't it consider resetting the state to the first option once the
> number of free pages in the system rises above free_pages_high?

Why? If it just continues with the method it last tried, it'll
run into a failure sooner or later and switch to the next
method. Eventually it'll return to the first option (notice
the 'state = 0' in the default: case).
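The construct in question looks roughly like this (again a simplified
sketch, not the verbatim source). A failed method falls through to the
next case, and the default: arm wraps state back to 0, so the rotation
reaches the first method again without any explicit reset:

int try_to_free_page(int gfp_mask)
{
	static int state = 0;
	int i = 6;
	int stop = (gfp_mask & __GFP_WAIT) ? 0 : 3;

	switch (state) {
		do {
		case 0:
			if (shrink_mmap(i, gfp_mask))
				return 1;
			state = 1;
		case 1:
			if (shm_swap(i, gfp_mask))
				return 1;
			state = 2;
		default:
			if (swap_out(i, gfp_mask))
				return 1;
			state = 0;	/* back to the first method */
			i--;
		} while (i > stop);
	}
	return 0;
}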

> Also shouldn't
> the number of pages that it frees relate to the positions of the high and low
> free page marks as well as to their distance apart?

It does in my patch: there, the number of tries scales with
(free_pages_high - nr_free_pages). The kswapd behaviour is somewhat
broken as it stands now; I've included the patch at the end of this
message.

Rik.
+-----------------------------+------------------------------+
| For Linux mm-patches, go to | "I'm busy managing memory.." |
| my homepage (via LinuxHQ).  | H.H.vanRiel@fys.ruu.nl       |
| ...submissions welcome...   | http://www.fys.ruu.nl/~riel/ |
+-----------------------------+------------------------------+

---------------> mmap-age-2.1.85.diff <---------------------

--- linux/include/linux/swap.h.orig	Mon Feb  2 16:23:17 1998
+++ linux/include/linux/swap.h	Thu Feb 12 00:53:31 1998
@@ -34,6 +34,7 @@
 
 extern int nr_swap_pages;
 extern int nr_free_pages;
+extern int nr_free_pages_bigorder;
 extern atomic_t nr_async_pages;
 extern int min_free_pages;
 extern int free_pages_low;
--- linux/mm/filemap.c.orig	Mon Feb  2 16:39:37 1998
+++ linux/mm/filemap.c	Thu Feb 12 01:47:21 1998
@@ -22,10 +22,12 @@
 #include <linux/locks.h>
 #include <linux/pagemap.h>
 #include <linux/swap.h>
+#include <linux/swapctl.h>
 #include <linux/smp.h>
 #include <linux/smp_lock.h>
 #include <linux/blkdev.h>
 
+#include <asm/atomic.h>
 #include <asm/system.h>
 #include <asm/pgtable.h>
 #include <asm/uaccess.h>
@@ -37,6 +39,7 @@
  * Shared mappings now work. 15.8.1995 Bruno.
  */
 
+extern int swapout_now_dammit;
 unsigned long page_cache_size = 0;
 struct page * page_hash_table[PAGE_HASH_SIZE];
 
@@ -159,9 +162,18 @@
 	switch (atomic_read(&page->count)) {
 		case 1:
 			/* If it has been referenced recently, don't free it */
-			if (test_and_clear_bit(PG_referenced, &page->flags))
+			if (test_and_clear_bit(PG_referenced, &page->flags)) {
+				touch_page(page);
+				break;
+			}
+			age_page(page);
+			if (swapout_now_dammit && PageDMA(page) &&
+			    clock < (num_physpages >> 4))
+				goto swapout;
+			if (page->age)
 				break;
 
+swapout:
 			/* is it a page cache page? */
 			if (page->inode) {
 				if (page->inode == &swapper_inode)
--- linux/mm/page_alloc.c.orig	Mon Feb  2 16:23:16 1998
+++ linux/mm/page_alloc.c	Thu Feb 12 00:53:30 1998
@@ -12,6 +12,7 @@
 #include <linux/kernel.h>
 #include <linux/kernel_stat.h>
 #include <linux/errno.h>
+#include <linux/sched.h>
 #include <linux/string.h>
 #include <linux/stat.h>
 #include <linux/swap.h>
@@ -30,6 +31,9 @@
 int nr_swap_pages = 0;
 int nr_free_pages = 0;
 
+/* Number of the free pages in chunks of order 2 and bigger */
+int nr_free_pages_bigorder = 0;
+
 /*
  * Free area management
 *
@@ -118,12 +122,17 @@
 		if (!test_and_change_bit(index, area->map))
 			break;
 		remove_mem_queue(list(map_nr ^ -mask));
+		if (order >= 2)
+			nr_free_pages_bigorder -= 1 << order;
 		mask <<= 1;
+		order++;
 		area++;
 		index >>= 1;
 		map_nr &= mask;
 	}
 	add_mem_queue(area, list(map_nr));
+	if (order >= 2)
+		nr_free_pages_bigorder += 1 << order;
 
 #undef list
@@ -180,6 +189,8 @@
 	(prev->next = ret->next)->prev = prev; \
 	MARK_USED(map_nr, new_order, area); \
 	nr_free_pages -= 1 << order; \
+	if (new_order >= 2) \
+		nr_free_pages_bigorder -= 1 << new_order; \
 	EXPAND(ret, map_nr, order, new_order, area); \
 	spin_unlock_irqrestore(&page_alloc_lock, flags); \
 	return ADDRESS(map_nr); \
@@ -197,6 +208,8 @@
 		area--; high--; size >>= 1; \
 		add_mem_queue(area, map); \
 		MARK_USED(index, high, area); \
+		if (high >= 2) \
+			nr_free_pages_bigorder += 1 << high; \
 		index += size; \
 		map += size; \
 	} \
--- linux/mm/slab.c.orig	Mon Feb  2 16:23:16 1998
+++ linux/mm/slab.c	Thu Feb 12 00:53:30 1998
@@ -312,8 +312,11 @@
 
 /* If the num of objs per slab is <= SLAB_MIN_OBJS_PER_SLAB,
 * then the page order must be less than this before trying the next order.
+ * -- Don't waste 4-page areas if we don't have to. As long as memory
+ * fragmentation is an issue (until 2.3?), we only allocate orders 0
+ * and 1 for small objects. -- Rik (originally tried by Zlatko)
 */
-#define SLAB_BREAK_GFP_ORDER 2
+#define SLAB_BREAK_GFP_ORDER 1
 
 /* Macros for storing/retrieving the cachep and or slab from the
 * global 'mem_map'. With off-slab bufctls, these are used to find the
--- linux/mm/vmscan.c.orig	Mon Feb  2 16:32:42 1998
+++ linux/mm/vmscan.c	Thu Feb 12 02:54:31 1998
@@ -37,10 +37,22 @@
 */
 int swapout_interval = HZ / 4;
 
+/*
+ * How many times (free_pages_high - nr_free_pages) are we
+ * allowed to try to swap out a page.
+ */
+#define MAX_SWAP_TRY 3
+
+/*
+ * When we're in trouble, this is set to nonzero and we start
+ * swapping out pages in the lower part of memory.
+ */
+int swapout_now_dammit = 0;
+
 /*
 * The wait queue for waking up the pageout daemon:
 */
-static struct wait_queue * kswapd_wait = NULL;
+struct wait_queue * kswapd_wait = NULL;
 
 /*
 * We avoid doing a reschedule if the pageout daemon is already awake;
@@ -94,8 +106,12 @@
 		return 0;
 	}
 	age_page(page_map);
-	if (page_map->age)
+	if (swapout_now_dammit && (MAP_NR(page) < (num_physpages >> 4))
+	    && PageDMA(page_map))
+		goto swapout;
+	if (page_map->age - (gfp_mask & __GFP_WAIT))
 		return 0;
+swapout:
 	if (pte_dirty(pte)) {
 		if (PageSwapCache(page_map))
 			panic ("Can't still be swap cached!!!");
@@ -434,7 +450,6 @@
 	printk ("Starting kswapd v%.*s\n", i, s);
 }
 
-#define MAX_SWAP_FAIL 3
 /*
 * The background pageout daemon.
 * Started as a kernel thread from the init process.
@@ -462,7 +477,8 @@
 	init_swap_timer();
 
 	while (1) {
-		int fail;
+		int tries;
+		swapout_now_dammit = 0;
 		kswapd_awake = 0;
 		flush_signals(current);
 		run_task_queue(&tq_disk);
@@ -476,20 +492,28 @@
 		 * If we've had too many consecutive failures,
 		 * go back to sleep to let other tasks run.
 		 */
-		for (fail = 0; fail++ < MAX_SWAP_FAIL;) {
+		tries = (free_pages_high - nr_free_pages);
+		if (tries < min_free_pages)
+			tries = min_free_pages;
+		if (nr_free_pages < min_free_pages ||
+		    nr_free_pages_bigorder < min_free_pages / 3) {
+			tries <<= 2;
+			swapout_now_dammit = 1;
+		}
+		while (tries-- > 0 && (nr_free_pages < free_pages_high ||
+		       nr_free_pages_bigorder <= min_free_pages)) {
 			int pages, gfp_mask;
 
 			pages = nr_free_pages;
 			if (nr_free_pages >= min_free_pages)
 				pages += atomic_read(&nr_async_pages);
-			if (pages >= free_pages_high)
-				break;
 			/* Set up the proper flags. */
 			gfp_mask = __GFP_IO;
 			if (pages < free_pages_low)
 				gfp_mask |= __GFP_WAIT;
-			if(try_to_free_page(gfp_mask))
-				fail = 0;
+			try_to_free_page(gfp_mask);
+			if (atomic_read(&nr_async_pages) > min_free_pages)
+				run_task_queue(&tq_disk);
 		}
 		/*
 		 * Report failure if we couldn't reach the minimum goal.
@@ -509,7 +533,7 @@
 	int want_wakeup = 0, memory_low = 0;
 	int pages = nr_free_pages + atomic_read(&nr_async_pages);
 
-	if (pages < free_pages_low)
+	if (pages < free_pages_low || nr_free_pages_bigorder < min_free_pages / 2)
 		memory_low = want_wakeup = 1;
 	else if (pages < free_pages_high && jiffies >= next_swap_jiffies)
 		want_wakeup = 1;
