highmem-zone memory hotremoval test patch

From: IWAMOTO Toshihiro
Date: Fri Oct 17 2003 - 03:30:04 EST


Hi all.

I'm working on memory hotplug. The attached patch demonstrates how
highmem-zone memory can be freed.
It does the following:
- splits highmem memory into 1GB chunks (zones)
- allows allocation from each highmem zone to be enabled/disabled
- lets kswapd be activated to free pages in a disabled zone
- frees pages by "remapping" (memcpying a page into a replacement
page allocated from another zone); a condensed sketch of this step
follows this list
- provides a procfs interface to the above functions and to per-zone
memory usage
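
For readers who want the idea of the remapping step before wading
through the patch, here is a condensed, illustrative sketch of what
remap_onepage() (in the mm/vmscan.c part of the patch below) does.
The helper name is made up; error handling, reference counting, LRU
handling, locking details and the anonymous/swap path are all omitted;
KM_REMAP0/KM_REMAP1 are the kmap slots this patch adds:

#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/radix-tree.h>
#include <linux/highmem.h>

/* Condensed sketch only -- see remap_onepage() in the patch for the
 * real thing. */
static int remap_onepage_sketch(struct page *page)
{
	struct address_space *mapping = page->mapping;
	struct page *newpage;
	char *np, *op;

	/* The victim's zone has been disabled for allocation, so this
	 * replacement page necessarily comes from some other zone. */
	newpage = alloc_page(GFP_HIGHUSER);
	if (newpage == NULL)
		return -ENOMEM;
	if (TestSetPageLocked(newpage))
		BUG();	/* a freshly allocated page cannot be locked */

	lock_page(page);

	/* Swap the old page for the new one in the page cache. */
	spin_lock(&mapping->page_lock);
	radix_tree_delete(&mapping->page_tree, page->index);
	radix_tree_insert(&mapping->page_tree, page->index, newpage);
	newpage->mapping = mapping;
	newpage->index = page->index;
	spin_unlock(&mapping->page_lock);

	/* Unmap the old page from all page tables, then (not shown)
	 * wait until we hold the last reference to it. */
	try_to_unmap(page);

	/* Copy the contents into the replacement page. */
	np = kmap_atomic(newpage, KM_REMAP0);
	op = kmap_atomic(page, KM_REMAP1);
	memcpy(np, op, PAGE_SIZE);
	kunmap_atomic(op, KM_REMAP1);
	kunmap_atomic(np, KM_REMAP0);

	SetPageUptodate(newpage);
	unlock_page(newpage);

	/* Drop our reference; the old page can now be freed back to
	 * its disabled zone (full refcounting omitted). */
	put_page(page);
	return 0;
}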

Page remapping works "fairly well". Given that highmem pages can be
successfully freed, I think letting highmem zones have a "hot removable"
attribute is the way to go.

Moreover, page remapping could be used to create a contiguous free area
for hugetlb pages or for NUMA process migration.

Known issues (TODOs):
- You need to issue several "remap" or "purge" commands to free all
memory in a zone.
- It is currently incompatible with CONFIG_HIGHPTE.
- It does not work with hugetlb pages.
- It may not work well with filesystems other than ext2: if the buffers
associated with a page cannot be freed by calling
try_to_release_page(), the page cannot be freed.

Some notes about reading or using this patch:
- shmem_dir and vmalloc were changed not to use highmem.
- The page remapping code is in remapd() and remap_onepage(); the rest
of the patch is fairly trivial.
- To try this patch, you basically need an IA32 machine with >= 2GB of
memory. Enable CONFIG_MEMHOTPLUGTEST in your kernel config.
- /proc/memhotplug interface:
    To display per zone memory usage:
        $ cat /proc/memhotplug
    To disable memory allocation from a zone:
        $ echo 'disable <zone number>' > /proc/memhotplug
    To activate kswapd:
        $ echo 'purge <zone number>' > /proc/memhotplug
    To invoke page remapping:
        $ echo 'remap <zone number>' > /proc/memhotplug
    To (re)enable memory allocation:
        $ echo 'enable <zone number>' > /proc/memhotplug

The purge and remap commands are meant to be used against disabled zones.
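
To make the interface concrete, a session on a hypothetical IA32 box
with 2GB of RAM might look like the following. Zone numbers are
indices into zone_table (empty zones are skipped), counts are in
pages, the line format follows mhtest_read() in the patch, and all
numbers shown are purely illustrative:

$ cat /proc/memhotplug
Zone 0: enabled free 1322, active 605, present 4096
Zone 1: enabled free 84210, active 60110, present 225280
Zone 2: enabled free 9214, active 14890, present 32768
Zone 5: enabled free 118402, active 96521, present 262144
$ echo 'disable 5' > /proc/memhotplug
$ echo 'purge 5' > /proc/memhotplug
$ echo 'remap 5' > /proc/memhotplug
$ cat /proc/memhotplug
[...]
Zone 5: disabled free 262144, active 0, present 262144
$ echo 'enable 5' > /proc/memhotplug

As noted above, purge/remap may have to be repeated until the "free"
count of the disabled zone reaches its "present" count.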

========== cut here ==========
$Id: memoryhotplug.patch,v 1.7 2003/10/17 08:12:27 iwamoto Exp $

diff -dpur linux-2.6.0-test1/arch/i386/Kconfig linux-2.6.0-test1-mh/arch/i386/Kconfig
--- linux-2.6.0-test1/arch/i386/Kconfig Mon Jul 14 12:30:48 2003
+++ linux-2.6.0-test1-mh/arch/i386/Kconfig Wed Jul 30 13:41:23 2003
@@ -704,14 +704,18 @@ comment "NUMA (NUMA-Q) requires SMP, 64G
comment "NUMA (Summit) requires SMP, 64GB highmem support, full ACPI"
depends on X86_SUMMIT && (!HIGHMEM64G || !ACPI || ACPI_HT_ONLY)

+config MEMHOTPLUGTEST
+ bool "Memory hotplug test"
+ default n
+
config DISCONTIGMEM
bool
- depends on NUMA
+ depends on NUMA || MEMHOTPLUGTEST
default y

config HAVE_ARCH_BOOTMEM_NODE
bool
- depends on NUMA
+ depends on NUMA || MEMHOTPLUGTEST
default y

config HIGHPTE
diff -dpur linux-2.6.0-test1/arch/i386/mm/discontig.c linux-2.6.0-test1-mh/arch/i386/mm/discontig.c
--- linux-2.6.0-test1/arch/i386/mm/discontig.c Mon Jul 14 12:34:41 2003
+++ linux-2.6.0-test1-mh/arch/i386/mm/discontig.c Mon Aug 4 11:02:49 2003
@@ -28,6 +28,7 @@
#include <linux/mmzone.h>
#include <linux/highmem.h>
#include <linux/initrd.h>
+#include <linux/proc_fs.h>
#include <asm/e820.h>
#include <asm/setup.h>

@@ -109,6 +110,42 @@ void __init get_memcfg_numa_flat(void)
numnodes = 1;
}

+void __init get_memcfg_numa_blks(void)
+{
+ int i, pfn;
+
+ printk("NUMA - single node, flat memory mode, but broken in several blocks\n");
+
+ /* Run the memory configuration and find the top of memory. */
+ find_max_pfn();
+ for(i = 0; i < MAX_NUMNODES; i++) {
+ pfn = PFN_DOWN(1 << 30) * i;
+ node_start_pfn[i] = pfn;
+ pfn += PFN_DOWN(1 << 30);
+ if (pfn < max_pfn)
+ node_end_pfn[i] = pfn;
+ else {
+ node_end_pfn[i] = max_pfn;
+ i++;
+ printk("total %d blocks, max %d\n", i, max_pfn);
+ break;
+ }
+ }
+
+ /* Fill in the physnode_map with our simplistic memory model,
+ * all memory is in node 0.
+ */
+ for (pfn = node_start_pfn[0]; pfn <= max_pfn;
+ pfn += PAGES_PER_ELEMENT)
+ {
+ physnode_map[pfn / PAGES_PER_ELEMENT] = pfn / PFN_DOWN(1 << 30);
+ }
+
+ /* Indicate there is one node available. */
+ node_set_online(0);
+ numnodes = i;
+}
+
/*
* Find the highest page frame number we have available for the node
*/
@@ -181,6 +218,8 @@ static void __init register_bootmem_low_
}
}

+static struct kcore_list numa_kc;
+
void __init remap_numa_kva(void)
{
void *vaddr;
@@ -194,7 +233,11 @@ void __init remap_numa_kva(void)
node_remap_start_pfn[node] + pfn,
PAGE_KERNEL_LARGE);
}
+ memset(node_remap_start_vaddr[node], 0,
+ node_remap_size[node] * PAGE_SIZE);
}
+ kclist_add(&numa_kc, node_remap_start_vaddr[numnodes - 1],
+ node_remap_offset[numnodes - 1] << PAGE_SHIFT);
}

static unsigned long calculate_numa_remap_pages(void)
diff -dpur linux-2.6.0-test1/drivers/char/mem.c linux-2.6.0-test1-mh/drivers/char/mem.c
--- linux-2.6.0-test1/drivers/char/mem.c Mon Jul 14 12:34:32 2003
+++ linux-2.6.0-test1-mh/drivers/char/mem.c Fri Aug 22 12:03:44 2003
@@ -24,6 +24,9 @@
#include <linux/smp_lock.h>
#include <linux/devfs_fs_kernel.h>
#include <linux/ptrace.h>
+#ifdef CONFIG_HIGHMEM
+#include <linux/highmem.h>
+#endif

#include <asm/uaccess.h>
#include <asm/io.h>
@@ -109,6 +112,37 @@ static ssize_t do_write_mem(struct file
* This funcion reads the *physical* memory. The f_pos points directly to the
* memory location.
*/
+#ifdef CONFIG_HIGHMEM
+static ssize_t read_highmem(struct file * file, char * buf,
+ size_t count, loff_t *ppos)
+{
+ unsigned long p = *ppos;
+ ssize_t read = 0;
+ int off, pfn = p >> PAGE_SHIFT;
+ char *pp;
+ struct page *page;
+
+ if (! pfn_valid(pfn))
+ return 0;
+ page = pfn_to_page(pfn);
+ pp = kmap(page);
+
+ off = p & (PAGE_SIZE - 1);
+ if (count > PAGE_SIZE - off)
+ count = PAGE_SIZE - off;
+
+ if (copy_to_user(buf, pp + off, count)) {
+ kunmap(page);
+ return -EFAULT;
+ }
+ read += count;
+ *ppos += read;
+ kunmap(page);
+ return read;
+}
+
+#endif
+
static ssize_t read_mem(struct file * file, char * buf,
size_t count, loff_t *ppos)
{
@@ -118,7 +152,11 @@ static ssize_t read_mem(struct file * fi

end_mem = __pa(high_memory);
if (p >= end_mem)
+#ifdef CONFIG_HIGHMEM
+ return read_highmem(file, buf, count, ppos);
+#else
return 0;
+#endif
if (count > end_mem - p)
count = end_mem - p;
read = 0;
diff -dpur linux-2.6.0-test1/fs/proc/kcore.c linux-2.6.0-test1-mh/fs/proc/kcore.c
--- linux-2.6.0-test1/fs/proc/kcore.c Mon Jul 14 12:34:39 2003
+++ linux-2.6.0-test1-mh/fs/proc/kcore.c Thu Jul 31 16:01:37 2003
@@ -450,7 +450,7 @@ static ssize_t read_kcore(struct file *f
}
kfree(elf_buf);
} else {
- if (kern_addr_valid(start)) {
+ if (1 /*kern_addr_valid(start)*/) {
unsigned long n;

n = copy_to_user(buffer, (char *)start, tsz);
diff -dpur linux-2.6.0-test1/include/asm-i386/kmap_types.h linux-2.6.0-test1-mh/include/asm-i386/kmap_types.h
--- linux-2.6.0-test1/include/asm-i386/kmap_types.h Mon Jul 14 12:36:38 2003
+++ linux-2.6.0-test1-mh/include/asm-i386/kmap_types.h Wed Oct 8 19:44:46 2003
@@ -24,7 +24,11 @@ D(10) KM_IRQ0,
D(11) KM_IRQ1,
D(12) KM_SOFTIRQ0,
D(13) KM_SOFTIRQ1,
-D(14) KM_TYPE_NR
+D(14) KM_TYPE_NR,
+#ifdef CONFIG_MEMHOTPLUGTEST
+D(15) KM_REMAP0,
+D(16) KM_REMAP1
+#endif
};

#undef D
diff -dpur linux-2.6.0-test1/include/asm-i386/mmzone.h linux-2.6.0-test1-mh/include/asm-i386/mmzone.h
--- linux-2.6.0-test1/include/asm-i386/mmzone.h Mon Jul 14 12:34:31 2003
+++ linux-2.6.0-test1-mh/include/asm-i386/mmzone.h Wed Jul 30 13:41:23 2003
@@ -122,7 +122,11 @@ static inline struct pglist_data *pfn_to
#elif CONFIG_NUMA /* summit or generic arch */
#include <asm/srat.h>
#elif CONFIG_X86_PC
+#ifdef CONFIG_MEMHOTPLUGTEST
+#define get_memcfg_numa get_memcfg_numa_blks
+#else
#define get_memcfg_numa get_memcfg_numa_flat
+#endif
#define get_zholes_size(n) (0)
#else
#define pfn_to_nid(pfn) (0)
diff -dpur linux-2.6.0-test1/include/asm-i386/numnodes.h linux-2.6.0-test1-mh/include/asm-i386/numnodes.h
--- linux-2.6.0-test1/include/asm-i386/numnodes.h Mon Jul 14 12:31:20 2003
+++ linux-2.6.0-test1-mh/include/asm-i386/numnodes.h Wed Jul 30 13:41:23 2003
@@ -8,7 +8,7 @@
#elif CONFIG_NUMA
#include <asm/srat.h>
#else
-#define MAX_NUMNODES 1
+#define MAX_NUMNODES 8
#endif /* CONFIG_X86_NUMAQ */

#endif /* _ASM_MAX_NUMNODES_H */
diff -dpur linux-2.6.0-test1/include/linux/mm.h linux-2.6.0-test1-mh/include/linux/mm.h
--- linux-2.6.0-test1/include/linux/mm.h Mon Jul 14 12:29:29 2003
+++ linux-2.6.0-test1-mh/include/linux/mm.h Wed Jul 30 13:41:23 2003
@@ -218,7 +218,14 @@ struct page {
*/
#define put_page_testzero(p) \
({ \
- BUG_ON(page_count(p) == 0); \
+ if (page_count(p) == 0) { \
+ int i; \
+ printk("Page: %lx ", (long)p); \
+ for(i = 0; i < sizeof(struct page); i++) \
+ printk(" %02x", ((unsigned char *)p)[i]); \
+ printk("\n"); \
+ BUG(); \
+ } \
atomic_dec_and_test(&(p)->count); \
})

@@ -611,6 +618,12 @@ static inline void
kernel_map_pages(struct page *page, int numpages, int enable)
{
}
+#endif
+#ifdef CONFIG_MEMHOTPLUGTEST
+#define page_trace(p) page_trace_func(p, __FUNCTION__, __LINE__)
+extern void page_trace_func(const struct page *, const char *, int);
+#else
+#define page_trace(p) do { } while(0)
#endif

#endif /* __KERNEL__ */
diff -dpur linux-2.6.0-test1/include/linux/mmzone.h linux-2.6.0-test1-mh/include/linux/mmzone.h
--- linux-2.6.0-test1/include/linux/mmzone.h Mon Jul 14 12:34:41 2003
+++ linux-2.6.0-test1-mh/include/linux/mmzone.h Wed Oct 8 12:26:23 2003
@@ -331,6 +331,10 @@ static inline unsigned int num_online_me
return num;
}

+#ifdef CONFIG_MEMHOTPLUGTEST
+int zone_activep(const struct zone *);
+int remapd(void *p);
+#endif
#else /* !CONFIG_DISCONTIGMEM && !CONFIG_NUMA */

#define node_online(node) \
diff -dpur linux-2.6.0-test1/ipc/util.c linux-2.6.0-test1-mh/ipc/util.c
--- linux-2.6.0-test1/ipc/util.c Mon Jul 14 12:33:12 2003
+++ linux-2.6.0-test1-mh/ipc/util.c Mon Sep 1 16:50:13 2003
@@ -323,6 +323,9 @@ void* ipc_rcu_alloc(int size)
if (out) out += sizeof(struct ipc_rcu_kmalloc);
}

+#ifdef CONFIG_MEMHOTPLUGTEST
+ printk("ipc_rcu_alloc: %lx\n", (unsigned long)out);
+#endif
return out;
}

diff -dpur linux-2.6.0-test1/mm/memory.c linux-2.6.0-test1-mh/mm/memory.c
--- linux-2.6.0-test1/mm/memory.c Mon Jul 14 12:33:49 2003
+++ linux-2.6.0-test1-mh/mm/memory.c Wed Jul 30 13:41:23 2003
@@ -413,6 +413,17 @@ zap_pte_range(struct mmu_gather *tlb, pm
mark_page_accessed(page);
tlb->freed++;
page_remove_rmap(page, ptep);
+#if 1 // debug
+ /* Validate page */
+ {
+ struct zone *z = page_zone(page);
+ int idx = page - z->zone_mem_map;
+ if (idx < 0 || idx >= z->spanned_pages) {
+ printk("zap_pte_range: %d %d\n", page->flags >> ZONE_SHIFT, idx);
+ BUG();
+ }
+ }
+#endif
tlb_remove_page(tlb, page);
}
}
diff -dpur linux-2.6.0-test1/mm/page_alloc.c linux-2.6.0-test1-mh/mm/page_alloc.c
--- linux-2.6.0-test1/mm/page_alloc.c Mon Jul 14 12:30:01 2003
+++ linux-2.6.0-test1-mh/mm/page_alloc.c Wed Oct 15 12:56:51 2003
@@ -31,6 +31,7 @@
#include <linux/topology.h>
#include <linux/sysctl.h>
#include <linux/cpu.h>
+#include <linux/proc_fs.h>

#include <asm/tlbflush.h>

@@ -52,6 +53,10 @@ EXPORT_SYMBOL(nr_swap_pages);
*/
struct zone *zone_table[MAX_NR_ZONES*MAX_NR_NODES];
EXPORT_SYMBOL(zone_table);
+#ifdef CONFIG_MEMHOTPLUGTEST
+static char zone_active[MAX_NR_ZONES*MAX_NR_NODES];
+static const struct page *page_trace_list[10];
+#endif

static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
int min_free_kbytes = 1024;
@@ -256,6 +261,7 @@ free_pages_bulk(struct zone *zone, int c
page = list_entry(list->prev, struct page, list);
/* have to delete it as __free_pages_bulk list manipulates */
list_del(&page->list);
+ page_trace(page);
__free_pages_bulk(page, base, zone, area, mask, order);
ret++;
}
@@ -409,7 +415,9 @@ int is_head_of_free_region(struct page *
spin_unlock_irqrestore(&zone->lock, flags);
return 0;
}
+#endif

+#if defined(CONFIG_SOFTWARE_SUSPEND) || defined(CONFIG_MEMHOTPLUGTEST)
/*
* Spill all of this CPU's per-cpu pages back into the buddy allocator.
*/
@@ -510,9 +518,28 @@ static struct page *buffered_rmqueue(str
mod_page_state(pgalloc, 1 << order);
prep_new_page(page, order);
}
+#ifdef CONFIG_MEMHOTPLUGTEST
+ if (! zone_active[page->flags >> ZONE_SHIFT])
+ BUG();
+#endif
return page;
}

+#ifdef CONFIG_MEMHOTPLUGTEST
+int
+zone_activep(const struct zone *z)
+{
+ int i;
+
+ for(i = 0; ; i++) {
+ if (zone_table[i] == z)
+ return zone_active[i];
+ if (zone_table[i] == NULL)
+ BUG();
+ }
+}
+#endif
+
/*
* This is the 'heart' of the zoned buddy allocator.
*
@@ -558,6 +585,10 @@ __alloc_pages(unsigned int gfp_mask, uns
for (i = 0; zones[i] != NULL; i++) {
struct zone *z = zones[i];

+#ifdef CONFIG_MEMHOTPLUGTEST
+ if (! zone_activep(z))
+ continue;
+#endif
min += z->pages_low;
if (z->free_pages >= min ||
(!wait && z->free_pages >= z->pages_high)) {
@@ -578,6 +609,10 @@ __alloc_pages(unsigned int gfp_mask, uns
unsigned long local_min;
struct zone *z = zones[i];

+#ifdef CONFIG_MEMHOTPLUGTEST
+ if (! zone_activep(z))
+ continue;
+#endif
local_min = z->pages_min;
if (gfp_mask & __GFP_HIGH)
local_min >>= 2;
@@ -599,6 +634,10 @@ rebalance:
for (i = 0; zones[i] != NULL; i++) {
struct zone *z = zones[i];

+#ifdef CONFIG_MEMHOTPLUGTEST
+ if (! zone_activep(z))
+ continue;
+#endif
page = buffered_rmqueue(z, order, cold);
if (page)
goto got_pg;
@@ -624,6 +663,10 @@ rebalance:
for (i = 0; zones[i] != NULL; i++) {
struct zone *z = zones[i];

+#ifdef CONFIG_MEMHOTPLUGTEST
+ if (! zone_activep(z))
+ continue;
+#endif
min += z->pages_min;
if (z->free_pages >= min ||
(!wait && z->free_pages >= z->pages_high)) {
@@ -662,6 +705,21 @@ nopage:
return NULL;
got_pg:
kernel_map_pages(page, 1 << order, 1);
+#if 1 // debug
+ /* Validate page */
+ {
+ struct zone *z = page_zone(page);
+ int idx = page - z->zone_mem_map;
+ if (idx < 0 || idx >= z->spanned_pages) {
+ printk("%d %d\n", page->flags >> ZONE_SHIFT, idx);
+ BUG();
+ }
+ }
+#endif
+#ifdef CONFIG_MEMHOTPLUGTEST
+ if (! zone_active[page->flags >> ZONE_SHIFT])
+ BUG();
+#endif
return page;
}

@@ -1049,6 +1107,9 @@ static int __init build_zonelists_node(p
static void __init build_zonelists(pg_data_t *pgdat)
{
int i, j, k, node, local_node;
+#ifdef CONFIG_MEMHOTPLUGTEST
+ struct zone *zone;
+#endif

local_node = pgdat->node_id;
printk("Building zonelist for node : %d\n", local_node);
@@ -1064,7 +1125,7 @@ static void __init build_zonelists(pg_da
k = ZONE_HIGHMEM;
if (i & __GFP_DMA)
k = ZONE_DMA;
-
+#ifndef CONFIG_MEMHOTPLUGTEST
j = build_zonelists_node(pgdat, zonelist, j, k);
/*
* Now we build the zonelist so that it contains the zones
@@ -1080,6 +1141,23 @@ static void __init build_zonelists(pg_da
j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);

zonelist->zones[j++] = NULL;
+#else
+ for(; k >= 0; k--) {
+ zone = pgdat->node_zones + k;
+ if (zone->present_pages)
+ zonelist->zones[j++] = zone;
+ for (node = local_node + 1; node < numnodes; node++) {
+ zone = NODE_DATA(node)->node_zones + k;
+ if (zone->present_pages)
+ zonelist->zones[j++] = zone;
+ }
+ for (node = 0; node < local_node; node++) {
+ zone = NODE_DATA(node)->node_zones + k;
+ if (zone->present_pages)
+ zonelist->zones[j++] = zone;
+ }
+ }
+#endif
}
}

@@ -1225,6 +1303,9 @@ static void __init free_area_init_core(s
unsigned long batch;

zone_table[nid * MAX_NR_ZONES + j] = zone;
+#ifdef CONFIG_MEMHOTPLUGTEST
+ zone_active[nid * MAX_NR_ZONES + j] = 1;
+#endif
realsize = size = zones_size[j];
if (zholes_size)
realsize -= zholes_size[j];
@@ -1268,8 +1349,8 @@ static void __init free_area_init_core(s
pcp->batch = 1 * batch;
INIT_LIST_HEAD(&pcp->list);
}
- printk(" %s zone: %lu pages, LIFO batch:%lu\n",
- zone_names[j], realsize, batch);
+ printk(" %s zone: %lu pages, LIFO batch:%lu start:%lu\n",
+ zone_names[j], realsize, batch, zone_start_pfn);
INIT_LIST_HEAD(&zone->active_list);
INIT_LIST_HEAD(&zone->inactive_list);
atomic_set(&zone->refill_counter, 0);
@@ -1615,3 +1696,187 @@ int min_free_kbytes_sysctl_handler(ctl_t
setup_per_zone_pages_min();
return 0;
}
+
+#ifdef CONFIG_MEMHOTPLUGTEST
+static int mhtest_read(char *page, char **start, off_t off, int count,
+ int *eof, void *data)
+{
+ char *p;
+ int i, len;
+ const struct zone *z;
+
+ p = page;
+ for(i = 0; ; i++) {
+ z = zone_table[i];
+ if (z == NULL)
+ break;
+ if (! z->present_pages)
+ /* skip empty zone */
+ continue;
+ len = sprintf(p, "Zone %d: %sabled free %d, active %d, present %d\n", i,
+ zone_active[i] ? "en" : "dis", z->free_pages, z->nr_active,
+ z->present_pages);
+ p += len;
+ }
+ len = p - page;
+
+ if (len <= off + count)
+ *eof = 1;
+ *start = page + off;
+ len -= off;
+ if (len < 0)
+ len = 0;
+ if (len > count)
+ len = count;
+
+ return len;
+}
+
+static int mhtest_write(struct file *file, const char *buffer,
+ unsigned long count, void *data)
+{
+ unsigned long idx;
+ char buf[64], *p;
+ int i;
+ struct list_head *l;
+
+ if (count > sizeof(buf) - 1)
+ count = sizeof(buf) - 1;
+ if (copy_from_user(buf, buffer, count))
+ return -EFAULT;
+
+ buf[count] = 0;
+
+ p = strchr(buf, ' ');
+ if (p == NULL)
+ goto out;
+
+ *p++ = '\0';
+ idx = simple_strtoul(p, NULL, 0);
+
+ if (strcmp(buf, "trace") == 0) {
+ for(i = 0; i < sizeof(page_trace_list) /
+ sizeof(page_trace_list[0]); i++)
+ if (page_trace_list[i] == NULL) {
+ page_trace_list[i] = (struct page *)idx;
+ printk("add trace %lx\n", (unsigned long)idx);
+ goto out;
+ }
+ printk("page_trace_list is full (not added)\n");
+ goto out;
+ } else if (strcmp(buf, "untrace") == 0) {
+ for(i = 0; i < sizeof(page_trace_list) /
+ sizeof(page_trace_list[0]); i++)
+ if (page_trace_list[i] == (struct page *)idx)
+ break;
+ if (i == sizeof(page_trace_list) / sizeof(page_trace_list[0])) {
+ printk("not registered\n");
+ goto out;
+ }
+ for(; i < sizeof(page_trace_list) /
+ sizeof(page_trace_list[0]) - 1; i++)
+ page_trace_list[i] = page_trace_list[i + 1];
+ page_trace_list[i] = NULL;
+ goto out;
+ }
+ if (idx > MAX_NR_ZONES*MAX_NR_NODES) {
+ printk("Argument out of range\n");
+ goto out;
+ }
+ if (strcmp(buf, "disable") == 0) {
+ printk("disable %d\n", idx);
+ /* XXX */
+ for (i = 0; i < NR_CPUS; i++) {
+ struct per_cpu_pages *pcp;
+
+ pcp = &zone_table[idx]->pageset[i].pcp[0]; /* hot */
+ pcp->low = pcp->high = 0;
+
+ pcp = &zone_table[idx]->pageset[i].pcp[1]; /* cold */
+ pcp->low = pcp->high = 0;
+ }
+ zone_active[idx] = 0;
+ zone_table[idx]->pages_high = zone_table[idx]->present_pages;
+ } else if (strcmp(buf, "purge") == 0) {
+ if (zone_active[idx])
+ printk("Zone %d still active (proceeding anyway)\n",
+ idx);
+ printk("purge %d\n", idx);
+ wake_up_interruptible(&zone_table[idx]->zone_pgdat->kswapd_wait);
+ /* XXX overkill, but who cares? */
+ on_each_cpu(drain_local_pages, NULL, 1, 1);
+ } else if (strcmp(buf, "enable") == 0) {
+ printk("enable %d\n", idx);
+ zone_active[idx] = 1;
+ zone_table[idx]->pages_high =
+ zone_table[idx]->pages_min * 3;
+ /* XXX */
+ for (i = 0; i < NR_CPUS; i++) {
+ struct per_cpu_pages *pcp;
+
+ pcp = &zone_table[idx]->pageset[i].pcp[0]; /* hot */
+ pcp->low = 2 * pcp->batch;
+ pcp->high = 6 * pcp->batch;
+
+ pcp = &zone_table[idx]->pageset[i].pcp[1]; /* cold */
+ pcp->high = 2 * pcp->batch;
+ }
+ } else if (strcmp(buf, "remap") == 0) {
+ on_each_cpu(drain_local_pages, NULL, 1, 1);
+ kernel_thread(remapd, zone_table[idx], CLONE_KERNEL);
+ } else if (strcmp(buf, "active") == 0) {
+ if (zone_table[idx] == NULL)
+ goto out;
+ spin_lock_irq(&zone_table[idx]->lru_lock);
+ i = 0;
+ list_for_each(l, &zone_table[idx]->active_list) {
+ printk(" %lx", (unsigned long)list_entry(l, struct page, lru));
+ i++;
+ if (i == 10)
+ break;
+ }
+ spin_unlock_irq(&zone_table[idx]->lru_lock);
+ printk("\n");
+ } else if (strcmp(buf, "inuse") == 0) {
+ if (zone_table[idx] == NULL)
+ goto out;
+ for(i = 0; i < zone_table[idx]->spanned_pages; i++)
+ if (page_count(&zone_table[idx]->zone_mem_map[i]))
+ printk(" %lx", (unsigned long)&zone_table[idx]->zone_mem_map[i]);
+ printk("\n");
+ }
+out:
+ return count;
+}
+
+static int __init procmhtest_init(void)
+{
+ struct proc_dir_entry *entry;
+
+ entry = create_proc_entry("memhotplug", 0, NULL);
+ if (entry == NULL)
+ return -1;
+
+ entry->read_proc = &mhtest_read;
+ entry->write_proc = &mhtest_write;
+ return 0;
+}
+__initcall(procmhtest_init);
+
+void
+page_trace_func(const struct page *p, const char *func, int line) {
+ int i;
+
+ for(i = 0; i < sizeof(page_trace_list) /
+ sizeof(page_trace_list[0]); i++) {
+ if (page_trace_list[i] == NULL)
+ return;
+ if (page_trace_list[i] == p)
+ break;
+ }
+ if (i == sizeof(page_trace_list) / sizeof(page_trace_list[0]))
+ return;
+
+ printk("Page %lx, %s %d\n", (unsigned long)p, func, line);
+}
+#endif
diff -dpur linux-2.6.0-test1/mm/shmem.c linux-2.6.0-test1-mh/mm/shmem.c
--- linux-2.6.0-test1/mm/shmem.c Mon Jul 14 12:33:41 2003
+++ linux-2.6.0-test1-mh/mm/shmem.c Mon Sep 1 16:31:55 2003
@@ -76,7 +76,15 @@ static inline struct page *shmem_dir_all
* BLOCKS_PER_PAGE on indirect pages, assume PAGE_CACHE_SIZE:
* might be reconsidered if it ever diverges from PAGE_SIZE.
*/
- return alloc_pages(gfp_mask, PAGE_CACHE_SHIFT-PAGE_SHIFT);
+#ifdef CONFIG_MEMHOTPLUGTEST
+ struct page* p = alloc_pages(gfp_mask & ~__GFP_HIGHMEM,
+ PAGE_CACHE_SHIFT-PAGE_SHIFT);
+ printk("shmem_dir_alloc: %lx\n", (unsigned long)p);
+ return p;
+#else
+ return alloc_pages(gfp_mask & ~__GFP_HIGHMEM,
+ PAGE_CACHE_SHIFT-PAGE_SHIFT);
+#endif
}

static inline void shmem_dir_free(struct page *page)
diff -dpur linux-2.6.0-test1/mm/swap.c linux-2.6.0-test1-mh/mm/swap.c
--- linux-2.6.0-test1/mm/swap.c Mon Jul 14 12:33:10 2003
+++ linux-2.6.0-test1-mh/mm/swap.c Fri Sep 26 15:39:53 2003
@@ -77,6 +77,7 @@ void activate_page(struct page *page)
{
struct zone *zone = page_zone(page);

+ page_trace(page);
spin_lock_irq(&zone->lru_lock);
if (PageLRU(page) && !PageActive(page)) {
del_page_from_inactive_list(zone, page);
@@ -185,6 +186,19 @@ void release_pages(struct page **pages,
struct page *page = pages[i];
struct zone *pagezone;

+ if (page_count(page) == 0) {
+ struct zone **z = zone_table;
+ int idx;
+ while (*z) {
+ idx = page - (*z)->zone_mem_map;
+ if (idx >= 0 && idx < (*z)->spanned_pages)
+ break;
+ z++;
+ }
+ if (*z != NULL)
+ printk("Zone: %lx %d, index: %d\n",
+ (unsigned long)*z, z - zone_table, idx);
+ }
if (PageReserved(page) || !put_page_testzero(page))
continue;

@@ -247,6 +261,10 @@ void __pagevec_release_nonlru(struct pag
BUG_ON(PageLRU(page));
if (put_page_testzero(page))
pagevec_add(&pages_to_free, page);
+#ifdef CONFIG_MEMHOTPLUGTEST
+ else
+ printk("Page %lx disappearing\n", page);
+#endif
}
pagevec_free(&pages_to_free);
pagevec_reinit(pvec);
diff -dpur linux-2.6.0-test1/mm/swap_state.c linux-2.6.0-test1-mh/mm/swap_state.c
--- linux-2.6.0-test1/mm/swap_state.c Mon Jul 14 12:33:46 2003
+++ linux-2.6.0-test1-mh/mm/swap_state.c Wed Jul 30 13:41:23 2003
@@ -151,6 +151,7 @@ int add_to_swap(struct page * page)
ClearPageDirty(page);
set_page_dirty(page);
INC_CACHE_INFO(add_total);
+ page_trace(page);
return 1;
case -EEXIST:
/* Raced with "speculative" read_swap_cache_async */
@@ -160,6 +161,7 @@ int add_to_swap(struct page * page)
default:
/* -ENOMEM radix-tree allocation failure */
swap_free(entry);
+ page_trace(page);
return 0;
}
}
diff -dpur linux-2.6.0-test1/mm/vmalloc.c linux-2.6.0-test1-mh/mm/vmalloc.c
--- linux-2.6.0-test1/mm/vmalloc.c Mon Jul 14 12:34:43 2003
+++ linux-2.6.0-test1-mh/mm/vmalloc.c Mon Sep 1 19:21:20 2003
@@ -431,7 +431,11 @@ fail:
*/
void *vmalloc(unsigned long size)
{
+#ifdef CONFIG_MEMHOTPLUGTEST
+ return __vmalloc(size, GFP_KERNEL, PAGE_KERNEL);
+#else
return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL);
+#endif
}

/**
diff -dpur linux-2.6.0-test1/mm/vmscan.c linux-2.6.0-test1-mh/mm/vmscan.c
--- linux-2.6.0-test1/mm/vmscan.c Mon Jul 14 12:30:43 2003
+++ linux-2.6.0-test1-mh/mm/vmscan.c Thu Oct 16 11:48:52 2003
@@ -254,12 +254,16 @@ shrink_list(struct list_head *page_list,
goto keep_locked;

pte_chain_lock(page);
+ if ((! zone_activep(page_zone(page))) && page_mapped(page))
+ page_referenced(page);
if (page_referenced(page) && page_mapping_inuse(page)) {
/* In active use or really unfreeable. Activate it. */
pte_chain_unlock(page);
+ page_trace(page);
goto activate_locked;
}

+ page_trace(page);
mapping = page->mapping;

#ifdef CONFIG_SWAP
@@ -278,6 +282,7 @@ shrink_list(struct list_head *page_list,
}
#endif /* CONFIG_SWAP */

+ page_trace(page);
/*
* The page is mapped into the page tables of one or more
* processes. Try to unmap it here.
@@ -286,9 +291,11 @@ shrink_list(struct list_head *page_list,
switch (try_to_unmap(page)) {
case SWAP_FAIL:
pte_chain_unlock(page);
+ page_trace(page);
goto activate_locked;
case SWAP_AGAIN:
pte_chain_unlock(page);
+ page_trace(page);
goto keep_locked;
case SWAP_SUCCESS:
; /* try to free the page below */
@@ -314,14 +321,17 @@ shrink_list(struct list_head *page_list,
* See swapfile.c:page_queue_congested().
*/
if (PageDirty(page)) {
+ page_trace(page);
if (!is_page_cache_freeable(page))
goto keep_locked;
+ page_trace(page);
if (!mapping)
goto keep_locked;
if (mapping->a_ops->writepage == NULL)
goto activate_locked;
if (!may_enter_fs)
goto keep_locked;
+ page_trace(page);
if (!may_write_to_queue(mapping->backing_dev_info))
goto keep_locked;
spin_lock(&mapping->page_lock);
@@ -333,6 +343,7 @@ shrink_list(struct list_head *page_list,
.nonblocking = 1,
.for_reclaim = 1,
};
+ page_trace(page);

list_move(&page->list, &mapping->locked_pages);
spin_unlock(&mapping->page_lock);
@@ -375,12 +386,14 @@ shrink_list(struct list_head *page_list,
* Otherwise, leave the page on the LRU so it is swappable.
*/
if (PagePrivate(page)) {
+ page_trace(page);
if (!try_to_release_page(page, gfp_mask))
goto activate_locked;
if (!mapping && page_count(page) == 1)
goto free_it;
}

+ page_trace(page);
if (!mapping)
goto keep_locked; /* truncate got there first */

@@ -396,6 +409,7 @@ shrink_list(struct list_head *page_list,
goto keep_locked;
}

+ page_trace(page);
#ifdef CONFIG_SWAP
if (PageSwapCache(page)) {
swp_entry_t swap = { .val = page->index };
@@ -554,7 +568,7 @@ done:
* But we had to alter page->flags anyway.
*/
static void
-refill_inactive_zone(struct zone *zone, const int nr_pages_in,
+refill_inactive_zone(struct zone *zone, int nr_pages_in,
struct page_state *ps, int priority)
{
int pgmoved;
@@ -572,6 +586,12 @@ refill_inactive_zone(struct zone *zone,

lru_add_drain();
pgmoved = 0;
+#ifdef CONFIG_MEMHOTPLUGTEST
+ if (! zone_activep(zone)) {
+ nr_pages = nr_pages_in = zone->present_pages - zone->free_pages;
+ printk("Purging active list of disabled zone\n");
+ }
+#endif
spin_lock_irq(&zone->lru_lock);
while (nr_pages && !list_empty(&zone->active_list)) {
page = list_entry(zone->active_list.prev, struct page, lru);
@@ -579,6 +599,7 @@ refill_inactive_zone(struct zone *zone,
if (!TestClearPageLRU(page))
BUG();
list_del(&page->lru);
+ page_trace(page);
if (page_count(page) == 0) {
/* It is currently in pagevec_release() */
SetPageLRU(page);
@@ -623,20 +644,30 @@ refill_inactive_zone(struct zone *zone,
*/
if (swap_tendency >= 100)
reclaim_mapped = 1;
+#ifdef CONFIG_MEMHOTPLUGTEST
+ if (! zone_activep(zone))
+ reclaim_mapped = 1;
+#endif

while (!list_empty(&l_hold)) {
page = list_entry(l_hold.prev, struct page, lru);
list_del(&page->lru);
if (page_mapped(page)) {
pte_chain_lock(page);
+#ifdef CONFIG_MEMHOTPLUGTEST
+ if (! zone_activep(zone))
+ page_referenced(page); /* XXX */
+#endif
if (page_mapped(page) && page_referenced(page)) {
pte_chain_unlock(page);
+ page_trace(page);
list_add(&page->lru, &l_active);
continue;
}
pte_chain_unlock(page);
if (!reclaim_mapped) {
list_add(&page->lru, &l_active);
+ page_trace(page);
continue;
}
}
@@ -647,9 +678,11 @@ refill_inactive_zone(struct zone *zone,
if (total_swap_pages == 0 && !page->mapping &&
!PagePrivate(page)) {
list_add(&page->lru, &l_active);
+ page_trace(page);
continue;
}
list_add(&page->lru, &l_inactive);
+ page_trace(page);
}

pagevec_init(&pvec, 1);
@@ -732,6 +765,11 @@ shrink_zone(struct zone *zone, int max_s
ratio = (unsigned long)nr_pages * zone->nr_active /
((zone->nr_inactive | 1) * 2);
atomic_add(ratio+1, &zone->refill_counter);
+#ifdef CONFIG_MEMHOTPLUGTEST
+ if (! zone_activep(zone))
+ /* XXX */
+ atomic_add(SWAP_CLUSTER_MAX, &zone->refill_counter);
+#endif
if (atomic_read(&zone->refill_counter) > SWAP_CLUSTER_MAX) {
int count;

@@ -1002,6 +1040,201 @@ void wakeup_kswapd(struct zone *zone)
return;
wake_up_interruptible(&zone->zone_pgdat->kswapd_wait);
}
+
+#ifdef CONFIG_MEMHOTPLUGTEST
+/* try to remap a page. returns non-zero on failure */
+int remap_onepage(struct page *page)
+{
+ struct page *newpage;
+ struct zone *zone;
+ struct address_space *mapping = page->mapping;
+ char *np, *op;
+ int waitcnt = 0, error = -1;
+
+ newpage = alloc_page(GFP_HIGHUSER);
+ if (newpage == NULL)
+ return -ENOMEM;
+ if (TestSetPageLocked(newpage))
+ BUG();
+ lock_page(page);
+
+ if (PagePrivate(page))
+ if (!try_to_release_page(page, GFP_KERNEL))
+ goto radixfail;
+ if (mapping == NULL) {
+ /* The page is an anon page. Allocate swap entry. */
+ /* ...but just bail for now */
+ if (!add_to_swap(page))
+ goto radixfail;
+ mapping = page->mapping;
+ }
+ error = radix_tree_preload(mapping->gfp_mask & ~__GFP_HIGHMEM);
+ if (error)
+ goto radixfail;
+ if (PagePrivate(page)) /* XXX */
+ BUG();
+
+ newpage->flags |= page->flags & ~(1UL << PG_uptodate) & ~(~0UL << ZONE_SHIFT);
+ /* should {__add_to,__remove_from}_page_cache be used instead? */
+ spin_lock(&mapping->page_lock);
+ radix_tree_delete(&mapping->page_tree, page->index);
+ __put_page(page);
+ /* list_del(&page->list); XXX */
+ radix_tree_insert(&mapping->page_tree, page->index, newpage);
+ page_cache_get(newpage);
+ radix_tree_preload_end();
+ newpage->mapping = mapping;
+ newpage->index = page->index;
+ spin_unlock(&mapping->page_lock);
+ if (PageDirty(page))
+ list_add(&newpage->list, &mapping->dirty_pages);
+ else
+ list_add(&newpage->list, &mapping->clean_pages);
+
+ pte_chain_lock(page);
+ if (page_mapped(page)) {
+ while ((error = try_to_unmap(page)) == SWAP_AGAIN)
+ ;
+ if (error == SWAP_FAIL)
+ /* either during mremap or mlocked */
+ goto unmapfail;
+ }
+ list_del(&page->list); /* XXX */
+ page->mapping = NULL;
+ pte_chain_unlock(page);
+ unlock_page(page); /* no lock needed while waiting page count */
+
+ while (page_count(page) != 1) {
+ current->state = TASK_INTERRUPTIBLE;
+ schedule_timeout(1);
+ if (waitcnt == 10000) {
+ printk("remap_onepage: still waiting on %p\n", page);
+ waitcnt++;
+ }
+ if (waitcnt < 10000)
+ waitcnt++;
+ }
+
+ np = kmap_atomic(newpage, KM_REMAP0);
+ op = kmap_atomic(page, KM_REMAP1);
+ memcpy(np, op, PAGE_SIZE);
+ kunmap_atomic(page, KM_REMAP1);
+ kunmap_atomic(newpage, KM_REMAP0);
+ ClearPageActive(page);
+ put_page(page);
+
+ /* We are done. Finish and let the waiters run. */
+ SetPageUptodate(newpage);
+ /* XXX locking order correct? */
+ zone = page_zone(newpage);
+ spin_lock_irq(&zone->lru_lock);
+ if (PageActive(newpage)) {
+ list_add(&newpage->lru, &zone->active_list);
+ zone->nr_active++;
+ } else {
+ list_add(&newpage->lru, &zone->inactive_list);
+ zone->nr_inactive++;
+ }
+ SetPageLRU(newpage);
+ spin_unlock_irq(&zone->lru_lock);
+ unlock_page(newpage);
+ page_cache_release(newpage);
+ return 0;
+
+unmapfail:
+ /* unwind is impossible if some process is waiting on the newpage */
+ printk("You are hosed.\n");
+ BUG();
+
+radixfail:
+ unlock_page(page);
+ unlock_page(newpage);
+ __free_page(newpage);
+ return 1;
+}
+
+int remapd(void *p)
+{
+ struct zone *zone = p;
+ struct page *page;
+ int i, nr_failed = 0;
+ LIST_HEAD(failedp);
+
+ daemonize("remap%d", zone->zone_start_pfn);
+ on_each_cpu(lru_add_drain, NULL, 1, 1);
+ while(nr_failed < 100) {
+ spin_lock_irq(&zone->lru_lock);
+ for(i = 0; ! list_empty(&zone->inactive_list) &&
+ i < 10; i++) {
+ page = list_entry(zone->inactive_list.prev,
+ struct page, lru);
+ if (! TestClearPageLRU(page))
+ BUG();
+ list_del(&page->lru);
+ zone->nr_inactive--;
+ if (page_count(page) == 0) {
+ /* the page is in pagevec_release();
+ shrink_cache says so. */
+ SetPageLRU(page);
+ list_add(&page->lru, &zone->inactive_list);
+ continue;
+ }
+ page_cache_get(page);
+ spin_unlock_irq(&zone->lru_lock);
+ goto got_page;
+ }
+
+ for(i = 0; ! list_empty(&zone->active_list) &&
+ i < 10; i++) {
+ page = list_entry(zone->active_list.prev,
+ struct page, lru);
+ if (! TestClearPageLRU(page))
+ BUG();
+ list_del(&page->lru);
+ zone->nr_active--;
+ if (page_count(page) == 0) {
+ /* the page is in pagevec_release();
+ shrink_cache says so. */
+ SetPageLRU(page);
+ list_add(&page->lru, &zone->active_list);
+ continue;
+ }
+ page_cache_get(page);
+ spin_unlock_irq(&zone->lru_lock);
+ goto got_page;
+ }
+ spin_unlock_irq(&zone->lru_lock);
+ break;
+
+ got_page:
+ if (remap_onepage(page)) {
+ nr_failed++;
+ list_add(&page->lru, &failedp);
+ }
+ }
+ if (list_empty(&failedp))
+ return 0;
+
+ spin_lock_irq(&zone->lru_lock);
+ while (! list_empty(&failedp)) {
+ page = list_entry(failedp.prev, struct page, lru);
+ list_del(&page->lru);
+ if (PageActive(page)) {
+ list_add(&page->lru, &zone->active_list);
+ zone->nr_active++;
+ } else {
+ list_add(&page->lru, &zone->inactive_list);
+ zone->nr_inactive++;
+ }
+ SetPageLRU(page);
+ page_cache_release(page);
+ }
+ spin_unlock_irq(&zone->lru_lock);
+ return 0;
+}
+
+
+#endif

#ifdef CONFIG_SOFTWARE_SUSPEND
/*

--
IWAMOTO Toshihiro @ VA Linux Systems Japan