NUMA: Patch for node based swapping V2

From: Christoph Lameter
Date: Wed Oct 13 2004 - 10:17:53 EST


This was discussed yesterday on linux-mm.

Changelog:
* NUMA: Add the ability to invoke kswapd on a node if local memory falls below
a certain threshold. A node may fill up its memory by simply copying a file,
which will fill up the node's memory with cached pages. The node's memory will
currently only be reclaimed if all nodes in the system fall below a certain
threshold. Until that time the processes on the node will only be allocated
off-node memory. Invoking kswapd on a node fixes this situation until
a better solution can be found.
* The threshold may be set in /proc/sys/vm/node_swap in units of percent * 10
(tenths of a percent). It is set to zero by default, which means that node
swapping is off.

Index: linux-2.6.9-rc4/mm/page_alloc.c
===================================================================
--- linux-2.6.9-rc4.orig/mm/page_alloc.c 2004-10-10 19:57:03.000000000 -0700
+++ linux-2.6.9-rc4/mm/page_alloc.c 2004-10-13 07:58:57.000000000 -0700
@@ -41,6 +41,19 @@
long nr_swap_pages;
int numnodes = 1;
int sysctl_lower_zone_protection = 0;
+#ifdef CONFIG_NUMA
+/*
+ * sysctl_node_swap is a percentage of the pages available
+ * in a zone multiplied by 10 (i.e. tenths of a percent). If the
+ * free pages in a zone drop below this limit then kswapd is woken
+ * for this zone. This results in the reclaiming of local memory.
+ * Local memory may be filled up by simply reading a file. If local
+ * memory is not available then off-node memory will be allocated
+ * to a process, which makes all memory accesses less efficient
+ * than they could be. Zero (the default) turns node swapping off.
+ */
+int sysctl_node_swap = 0;
+#endif

EXPORT_SYMBOL(totalram_pages);
EXPORT_SYMBOL(nr_swap_pages);
@@ -483,6 +496,14 @@
p = &z->pageset[cpu];
if (pg == orig) {
z->pageset[cpu].numa_hit++;
+ /*
+ * If this allocation has left less than roughly
+ * (sysctl_node_swap / 10) % of the zone free, invoke kswapd.
+ * The page limit is (pages * limit) / 1024, i.e. a right shift
+ * by 10, which approximates a division by 1000 cheaply.
+ */
+ if (z->free_pages < ((z->present_pages * sysctl_node_swap) >> 10))
+ wakeup_kswapd(z);
} else {
p->numa_miss++;
zonelist->zones[0]->pageset[cpu].numa_foreign++;
Index: linux-2.6.9-rc4/kernel/sysctl.c
===================================================================
--- linux-2.6.9-rc4.orig/kernel/sysctl.c 2004-10-10 19:57:03.000000000 -0700
+++ linux-2.6.9-rc4/kernel/sysctl.c 2004-10-11 12:54:51.000000000 -0700
@@ -65,6 +65,9 @@
extern int min_free_kbytes;
extern int printk_ratelimit_jiffies;
extern int printk_ratelimit_burst;
+#ifdef CONFIG_NUMA
+extern int sysctl_node_swap;
+#endif

#if defined(CONFIG_X86_LOCAL_APIC) && defined(__i386__)
int unknown_nmi_panic;
@@ -800,7 +803,17 @@
.extra1 = &zero,
},
#endif
- { .ctl_name = 0 }
+#ifdef CONFIG_NUMA
+ {
+ .ctl_name = VM_NODE_SWAP,
+ .procname = "node_swap",
+ .data = &sysctl_node_swap,
+ .maxlen = sizeof(sysctl_node_swap),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec
+ },
+#endif
+ { .ctl_name = 0 }
};

static ctl_table proc_table[] = {
Index: linux-2.6.9-rc4/include/linux/sysctl.h
===================================================================
--- linux-2.6.9-rc4.orig/include/linux/sysctl.h 2004-10-10 19:58:05.000000000 -0700
+++ linux-2.6.9-rc4/include/linux/sysctl.h 2004-10-11 12:54:51.000000000 -0700
@@ -167,6 +167,7 @@
VM_HUGETLB_GROUP=25, /* permitted hugetlb group */
VM_VFS_CACHE_PRESSURE=26, /* dcache/icache reclaim pressure */
VM_LEGACY_VA_LAYOUT=27, /* legacy/compatibility virtual address space layout */
+ VM_NODE_SWAP=28, /* Swap local node memory limit (in % *10) */
};


Index: linux-2.6.9-rc4/mm/vmscan.c
===================================================================
--- linux-2.6.9-rc4.orig/mm/vmscan.c 2004-10-10 19:57:04.000000000 -0700
+++ linux-2.6.9-rc4/mm/vmscan.c 2004-10-11 12:54:51.000000000 -0700
@@ -1168,9 +1168,17 @@
*/
void wakeup_kswapd(struct zone *zone)
{
+ extern int sysctl_node_swap;
+
if (zone->present_pages == 0)
return;
- if (zone->free_pages > zone->pages_low)
+ /*
+ * Skip the wakeup only if free pages are above both the node swap
+ * limit, (present_pages * sysctl_node_swap) / 1024, and pages_low.
+ * A sysctl_node_swap of zero yields a limit of zero.
+ */
+ if (zone->free_pages > ((zone->present_pages * sysctl_node_swap) >> 10) &&
+ zone->free_pages > zone->pages_low)
return;
if (!waitqueue_active(&zone->zone_pgdat->kswapd_wait))
return;
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/