swap cache and page cache patch - new design

Krzysztof Strasburger (strasbur@chkw386.ch.pwr.wroc.pl)
Thu, 15 Jan 98 13:29


The last time I posted the swap cache patch to this list, somebody
(probably one of the developers) wrote to me that not counting swap cache
pages as free is a design feature, not a bug. It is hard to fix features...
but maybe an enhancement of the feature will be accepted...
Here are my two previous patches, redesigned not as hard fixes or even config
options, but as kernel parameters tunable via the new /proc/sys/vm/spcache
file. They allow changing the policy for treating swap cache pages and
tuning the "aggressiveness" of freeing page cache pages.
For more details, see below.
The patch is against 2.0.33. I suppose the next version will be written
for 2.2.x ;-)

Krzysztof Strasburger

diff -u --recursive linux.orig/Documentation/memory-tuning.txt linux/Documentation/memory-tuning.txt
--- linux.orig/Documentation/memory-tuning.txt Tue Aug 19 03:16:47 1997
+++ linux/Documentation/memory-tuning.txt Thu Jan 15 07:35:26 1998
@@ -46,3 +46,26 @@
The other three files in /proc/sys/vm are undocumented, as yet.

Thomas Koenig, ig25@rz.uni-karlsruhe.de
+
+The file /proc/sys/vm/spcache controls the handling of the swap cache
+(first number) and the policy for freeing page cache pages in the shrink_mmap
+function (second number).
+If both numbers are negative (default -1), the old behavior is active (the
+number of swap cache pages is not added to free pages, page cache pages are
+marked referenced if they are in the page table of any process).
+If the first number is nonnegative, it denotes the number of "reserved"
+swap cache pages. If the number of swap cache pages is greater than this
+number, nr_swap_cache_pages-reserved_swap_cache_pages is added to the
+number of free pages.
+If the second number is nonnegative, page cache pages are not marked
+referenced. Additionally, these pages are aged if they are not referenced
+and the priority (a parameter of the shrink_mmap function, 0-6) is less than
+this number. If the number is greater than 7, page cache pages which are
+not in any page table are freed even if they are marked as referenced.
+My favorite setting is:
+echo 0 8 > /proc/sys/vm/spcache
+which means counting all swap cache pages as free, aging all
+unreferenced page cache pages and freeing all page cache pages not present
+in any page table.
+
+Krzysztof Strasburger, strasbur@chkw386.ch.pwr.wroc.pl
diff -u --recursive linux.orig/include/linux/mm.h linux/include/linux/mm.h
--- linux.orig/include/linux/mm.h Tue Dec 2 22:18:11 1997
+++ linux/include/linux/mm.h Wed Jan 14 10:59:22 1998
@@ -10,6 +10,8 @@
#include <linux/string.h>

extern unsigned long high_memory;
+extern int swap_cache_reserve;
+extern int page_cache_priority;

#include <asm/page.h>
#include <asm/atomic.h>
diff -u --recursive linux.orig/include/linux/swap.h linux/include/linux/swap.h
--- linux.orig/include/linux/swap.h Mon Jun 3 12:38:37 1996
+++ linux/include/linux/swap.h Wed Jan 14 10:59:29 1998
@@ -34,6 +34,7 @@

extern int nr_swap_pages;
extern int nr_free_pages;
+extern int nr_swap_cache_pages;
extern atomic_t nr_async_pages;
extern int min_free_pages;
extern int free_pages_low;
@@ -113,10 +114,12 @@
swap_cache_find_total++;
#endif
entry = xchg(swap_cache + index, 0);
+ if (entry) {
#ifdef SWAP_CACHE_INFO
- if (entry)
swap_cache_find_success++;
#endif
+ nr_swap_cache_pages--;
+ }
return entry;
}

@@ -133,6 +136,7 @@
swap_cache_del_success++;
#endif
swap_free(entry);
+ nr_swap_cache_pages--;
return 1;
}
return 0;
diff -u --recursive linux.orig/include/linux/sysctl.h linux/include/linux/sysctl.h
--- linux.orig/include/linux/sysctl.h Tue Aug 12 21:06:35 1997
+++ linux/include/linux/sysctl.h Tue Jan 13 15:51:18 1998
@@ -67,7 +67,8 @@
#define VM_KSWAPD 2 /* struct: control background pageout */
#define VM_FREEPG 3 /* struct: Set free page thresholds */
#define VM_BDFLUSH 4 /* struct: Control buffer cache flushing */
-#define VM_MAXID 5
+#define VM_SPCACHE 5 /* struct: Control swap and page cache handling */
+#define VM_MAXID 6

/* CTL_NET names: */
#define NET_CORE 1
diff -u --recursive linux.orig/kernel/sysctl.c linux/kernel/sysctl.c
--- linux.orig/kernel/sysctl.c Wed Aug 13 22:02:42 1997
+++ linux/kernel/sysctl.c Tue Jan 13 15:51:18 1998
@@ -161,6 +161,8 @@
{VM_BDFLUSH, "bdflush", &bdf_prm, 9*sizeof(int), 0600, NULL,
&proc_dointvec_minmax, &sysctl_intvec, NULL,
&bdflush_min, &bdflush_max},
+ {VM_SPCACHE, "spcache",
+ &swap_cache_reserve, 2*sizeof(int), 0600, NULL, &proc_dointvec},
{0}
};

diff -u --recursive linux.orig/mm/filemap.c linux/mm/filemap.c
--- linux.orig/mm/filemap.c Thu Dec 11 02:09:44 1997
+++ linux/mm/filemap.c Thu Jan 15 07:19:29 1998
@@ -159,6 +159,8 @@
case 1:
/* If it has been referenced recently, don't free it */
if (clear_bit(PG_referenced, &page->flags)) {
+ if (page->inode && page_cache_priority > 7)
+ goto free_page_cache;
/* age this page potential used */
if (priority < 4)
age_page(page);
@@ -166,21 +168,31 @@
}

/* is it a page cache page? */
- if (page->inode) {
- remove_page_from_hash_queue(page);
- remove_page_from_inode_queue(page);
- __free_page(page);
- return 1;
- }
+ if (!page->inode)
+ goto no_page_cache;
+free_page_cache:
+ remove_page_from_hash_queue(page);
+ remove_page_from_inode_queue(page);
+ __free_page(page);
+ return 1;

+no_page_cache:
/* is it a buffer cache page? */
if (free_buf && bh && try_to_free_buffer(bh, &bh, 6))
return 1;
break;

default:
- /* more than one users: we can't throw it away */
- set_bit(PG_referenced, &page->flags);
+ /* more than one users: we can't throw it away... */
+ if (page_cache_priority >= 0) {
+ /* ...but we can age it, so it will be
+ * easy to remove it from page tables */
+ if (page->inode && (!clear_bit(PG_referenced,&page->flags)) && priority < page_cache_priority)
+ age_page(page);
+ } else {
+ set_bit(PG_referenced, &page->flags);
+ }
+
/* fall through */
case 0:
/* nothing */
diff -u --recursive linux.orig/mm/mmap.c linux/mm/mmap.c
--- linux.orig/mm/mmap.c Fri Nov 7 17:57:31 1997
+++ linux/mm/mmap.c Tue Jan 13 15:51:18 1998
@@ -59,6 +59,8 @@
freepages >>= 1;
freepages += nr_free_pages;
freepages += nr_swap_pages;
+ if (swap_cache_reserve >= 0 && nr_swap_cache_pages > swap_cache_reserve)
+ freepages += nr_swap_cache_pages - swap_cache_reserve;
freepages -= MAP_NR(high_memory) >> 4;
return freepages > pages;
}
diff -u --recursive linux.orig/mm/swap.c linux/mm/swap.c
--- linux.orig/mm/swap.c Mon Jun 3 12:38:37 1996
+++ linux/mm/swap.c Wed Jan 14 11:04:21 1998
@@ -42,6 +42,10 @@
int free_pages_low = 30;
int free_pages_high = 40;

+/* And these two too... */
+int swap_cache_reserve = -1;
+int page_cache_priority = -1;
+
/* We track the number of pages currently being asynchronously swapped
out, so that we don't try to swap TOO many pages out at once */
atomic_t nr_async_pages = 0;
diff -u --recursive linux.orig/mm/swap_state.c linux/mm/swap_state.c
--- linux.orig/mm/swap_state.c Wed Mar 13 13:17:23 1996
+++ linux/mm/swap_state.c Tue Jan 13 15:51:19 1998
@@ -32,6 +32,8 @@
*/
unsigned long *swap_cache;

+int nr_swap_cache_pages = 0;
+
#ifdef SWAP_CACHE_INFO
unsigned long swap_cache_add_total = 0;
unsigned long swap_cache_add_success = 0;
@@ -64,6 +66,7 @@
#ifdef SWAP_CACHE_INFO
swap_cache_add_success++;
#endif
+ nr_swap_cache_pages++;
return 1;
}
return 0;
diff -u --recursive linux.orig/mm/swapfile.c linux/mm/swapfile.c
--- linux.orig/mm/swapfile.c Mon Mar 31 21:22:37 1997
+++ linux/mm/swapfile.c Tue Jan 13 15:51:19 1998
@@ -570,6 +570,8 @@
++val->totalswap;
}
}
+ if (swap_cache_reserve >= 0 && nr_swap_cache_pages > swap_cache_reserve)
+ val->freeswap += nr_swap_cache_pages - swap_cache_reserve;
val->freeswap <<= PAGE_SHIFT;
val->totalswap <<= PAGE_SHIFT;
return;