[RFC PATCH 3/4] tmem: preswap implementation (layered on tmem)

From: Dan Magenheimer
Date: Fri Jun 19 2009 - 21:37:09 EST


--- linux-2.6.30/mm/page_io.c 2009-06-09 21:05:27.000000000 -0600
+++ linux-2.6.30-tmem/mm/page_io.c 2009-06-19 09:33:59.000000000 -0600
@@ -102,6 +102,12 @@
unlock_page(page);
goto out;
}
+ if (preswap_put(page) == 1) {
+ set_page_writeback(page);
+ unlock_page(page);
+ end_page_writeback(page);
+ goto out;
+ }
bio = get_swap_bio(GFP_NOIO, page_private(page), page,
end_swap_bio_write);
if (bio == NULL) {
@@ -134,6 +140,12 @@
ret = -ENOMEM;
goto out;
}
+ if (preswap_get(page) == 1) {
+ SetPageUptodate(page);
+ unlock_page(page);
+ bio_put(bio);
+ goto out;
+ }
count_vm_event(PSWPIN);
submit_bio(READ, bio);
out:
--- linux-2.6.30/mm/swapfile.c 2009-06-09 21:05:27.000000000 -0600
+++ linux-2.6.30-tmem/mm/swapfile.c 2009-06-19 16:20:14.000000000 -0600
@@ -35,7 +35,7 @@
#include <linux/swapops.h>
#include <linux/page_cgroup.h>

-static DEFINE_SPINLOCK(swap_lock);
+DEFINE_SPINLOCK(swap_lock);
static unsigned int nr_swapfiles;
long nr_swap_pages;
long total_swap_pages;
@@ -47,7 +47,7 @@
static const char Bad_offset[] = "Bad swap offset entry ";
static const char Unused_offset[] = "Unused swap offset entry ";

-static struct swap_list_t swap_list = {-1, -1};
+struct swap_list_t swap_list = {-1, -1};

static struct swap_info_struct swap_info[MAX_SWAPFILES];

@@ -488,6 +488,7 @@
swap_list.next = p - swap_info;
nr_swap_pages++;
p->inuse_pages--;
+ preswap_flush(p - swap_info, offset);
mem_cgroup_uncharge_swap(ent);
}
}
@@ -864,7 +865,7 @@
* Recycle to start on reaching the end, returning 0 when empty.
*/
static unsigned int find_next_to_unuse(struct swap_info_struct *si,
- unsigned int prev)
+ unsigned int prev, unsigned int preswap)
{
unsigned int max = si->max;
unsigned int i = prev;
@@ -890,6 +891,12 @@
prev = 0;
i = 1;
}
+ if (preswap) {
+ if (preswap_test(si, i))
+ break;
+ else
+ continue;
+ }
count = si->swap_map[i];
if (count && count != SWAP_MAP_BAD)
break;
@@ -901,8 +908,12 @@
* We completely avoid races by reading each swap page in advance,
* and then search for the process using it. All the necessary
* page table adjustments can then be made atomically.
+ *
+ * If preswap is nonzero, only pages held in preswap are unused, and at
+ * most pages_to_unuse of them; pages_to_unuse == 0 means all such pages.
*/
-static int try_to_unuse(unsigned int type)
+int try_to_unuse(unsigned int type, unsigned int preswap,
+ unsigned long pages_to_unuse)
{
struct swap_info_struct * si = &swap_info[type];
struct mm_struct *start_mm;
@@ -938,7 +949,7 @@
* one pass through swap_map is enough, but not necessarily:
* there are races when an instance of an entry might be missed.
*/
- while ((i = find_next_to_unuse(si, i)) != 0) {
+ while ((i = find_next_to_unuse(si, i, preswap)) != 0) {
if (signal_pending(current)) {
retval = -EINTR;
break;
@@ -1124,6 +1135,8 @@
* interactive performance.
*/
cond_resched();
+ if (preswap && pages_to_unuse && !--pages_to_unuse)
+ break;
}

mmput(start_mm);
@@ -1448,7 +1461,7 @@
spin_unlock(&swap_lock);

current->flags |= PF_SWAPOFF;
- err = try_to_unuse(type);
+ err = try_to_unuse(type, 0, 0);
current->flags &= ~PF_SWAPOFF;

if (err) {
@@ -1497,9 +1510,14 @@
swap_map = p->swap_map;
p->swap_map = NULL;
p->flags = 0;
+ preswap_flush_area(p - swap_info);
spin_unlock(&swap_lock);
mutex_unlock(&swapon_mutex);
vfree(swap_map);
+#ifdef CONFIG_PRESWAP
+ if (p->preswap_map)
+ vfree(p->preswap_map);
+#endif
/* Destroy swap account informatin */
swap_cgroup_swapoff(type);

@@ -1812,6 +1830,11 @@
}

memset(swap_map, 0, maxpages * sizeof(short));
+#ifdef CONFIG_PRESWAP
+ p->preswap_map = vmalloc(maxpages / sizeof(long));
+ if (p->preswap_map)
+ memset(p->preswap_map, 0, maxpages / sizeof(long));
+#endif
for (i = 0; i < swap_header->info.nr_badpages; i++) {
int page_nr = swap_header->info.badpages[i];
if (page_nr <= 0 || page_nr >= swap_header->info.last_page) {
@@ -1886,6 +1909,7 @@
} else {
swap_info[prev].next = p - swap_info;
}
+ preswap_init(p - swap_info);
spin_unlock(&swap_lock);
mutex_unlock(&swapon_mutex);
error = 0;
@@ -2002,6 +2026,8 @@

si = &swap_info[swp_type(entry)];
target = swp_offset(entry);
+ if (preswap_test(si, target))
+ return 0;
base = (target >> our_page_cluster) << our_page_cluster;
end = base + (1 << our_page_cluster);
if (!base) /* first page is swap header */
@@ -2018,6 +2044,9 @@
break;
if (si->swap_map[toff] == SWAP_MAP_BAD)
break;
+ /* Don't read in preswap pages */
+ if (preswap_test(si, toff))
+ break;
}
/* Count contiguous allocated slots below our target */
for (toff = target; --toff >= base; nr_pages++) {
--- linux-2.6.30/include/linux/swap.h 2009-06-09 21:05:27.000000000 -0600
+++ linux-2.6.30-tmem/include/linux/swap.h 2009-06-19 12:51:55.000000000 -0600
@@ -8,6 +8,7 @@
#include <linux/memcontrol.h>
#include <linux/sched.h>
#include <linux/node.h>
+#include <linux/vmalloc.h>

#include <asm/atomic.h>
#include <asm/page.h>
@@ -154,8 +155,62 @@
unsigned int max;
unsigned int inuse_pages;
unsigned int old_block_size;
+#ifdef CONFIG_PRESWAP
+ unsigned long *preswap_map;
+ unsigned int preswap_pages;
+#endif
};

+#ifdef CONFIG_PRESWAP
+
+#include <linux/sysctl.h>
+/* sysctl plumbing for the vm "preswap" entry */
+extern int preswap_sysctl_handler(struct ctl_table *table, int write,
+				  struct file *file, void __user *buffer,
+				  size_t *length, loff_t *ppos);
+extern const unsigned long preswap_zero;
+extern const unsigned long preswap_infinity;
+
+/* preswap hooks implemented in mm/preswap.c */
+extern void preswap_shrink(unsigned long target_pages);
+extern int preswap_test(struct swap_info_struct *sis, unsigned long offset);
+extern void preswap_init(unsigned type);
+extern int preswap_put(struct page *page);
+extern int preswap_get(struct page *page);
+extern void preswap_flush(unsigned type, unsigned long offset);
+extern void preswap_flush_area(unsigned type);
+
+/* in swapfile.c */
+extern int try_to_unuse(unsigned int type, unsigned int preswap,
+			unsigned long pages_to_unuse);
+#else
+/*
+ * CONFIG_PRESWAP is off: every preswap hook is an empty inline.  The
+ * int-returning hooks return 0 and the void hooks do nothing.
+ */
+static inline void preswap_shrink(unsigned long target_pages) { }
+static inline void preswap_init(unsigned type) { }
+static inline void preswap_flush(unsigned type, unsigned long offset) { }
+static inline void preswap_flush_area(unsigned type) { }
+
+static inline int preswap_test(struct swap_info_struct *sis,
+				unsigned long offset)
+{
+	return 0;
+}
+
+static inline int preswap_put(struct page *page)
+{
+	return 0;
+}
+
+static inline int preswap_get(struct page *page)
+{
+	return 0;
+}
+#endif /* CONFIG_PRESWAP */
+
struct swap_list_t {
int head; /* head of priority-ordered swapfile list */
int next; /* swapfile to be used next */
@@ -312,6 +367,8 @@
extern int reuse_swap_page(struct page *);
extern int try_to_free_swap(struct page *);
struct backing_dev_info;
+extern struct swap_list_t swap_list;
+extern spinlock_t swap_lock;

/* linux/mm/thrash.c */
extern struct mm_struct * swap_token_mm;
--- linux-2.6.30/mm/preswap.c 1969-12-31 17:00:00.000000000 -0700
+++ linux-2.6.30-tmem/mm/preswap.c 2009-06-19 14:55:16.000000000 -0600
@@ -0,0 +1,274 @@
+/*
+ * linux/mm/preswap.c
+ *
+ * Implements a fast "preswap" on top of the transcendent memory ("tmem") API.
+ * When a swapdisk is enabled (with swapon), a "private persistent tmem pool"
+ * is created along with a bit-per-page preswap_map. When swapping occurs
+ * and a page is about to be written to disk, a "put" into the pool may first
+ * be attempted by passing the pageframe to be swapped, along with a "handle"
+ * consisting of a pool_id, an object id, and an index. Since the pool is of
+ * indeterminate size, the "put" may be rejected, in which case the page
+ * is swapped to disk as normal. If the "put" is successful, the page is
+ * copied to tmem and the preswap_map records the success. Later, when
+ * the page needs to be swapped in, the preswap_map is checked and, if set,
+ * the page may be obtained with a "get" operation. Note that the swap
+ * subsystem is responsible for: maintaining coherency between the swapcache,
+ * preswap, and the swapdisk; for evicting stale pages from preswap; and for
+ * emptying preswap when swapoff is performed. The "flush page" and "flush
+ * object" actions are provided for this.
+ *
+ * Note that if a "duplicate put" is performed to overwrite a page and
+ * the "put" operation fails, the page (and old data) is flushed and lost.
+ * Also note that multiple accesses to a tmem pool may be concurrent and
+ * any ordering must be guaranteed by the caller.
+ *
+ * Copyright (C) 2008,2009 Dan Magenheimer, Oracle Corp.
+ */
+
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/sysctl.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
+#include <linux/proc_fs.h>
+#include <linux/security.h>
+#include <linux/capability.h>
+#include <linux/uaccess.h>
+#include <linux/tmem.h>
+
+/*
+ * tmem pool id shared by all swap types.  It is tested as an s32: while
+ * negative, preswap will never call tmem.
+ */
+static u32 preswap_poolid = -1;
+
+/* lower and upper bounds for the preswap sysctl */
+const unsigned long preswap_zero = 0;
+const unsigned long preswap_infinity = ~0UL;
+
+/*
+ * Swizzling increases objects per swaptype, increasing tmem concurrency
+ * for heavy swaploads. Later, larger nr_cpus -> larger SWIZ_BITS
+ *
+ * oswiz() builds the tmem object id: the swap type in the upper bits with
+ * the low SWIZ_BITS of the page index below it.  iswiz() yields the tmem
+ * index: the page index with those low bits shifted out.  Every macro
+ * argument is parenthesized so that expression arguments expand with the
+ * intended precedence.
+ */
+#define SWIZ_BITS		4
+#define SWIZ_MASK		((1 << SWIZ_BITS) - 1)
+#define oswiz(_type, _ind)	(((_type) << SWIZ_BITS) | ((_ind) & SWIZ_MASK))
+#define iswiz(_ind)		((_ind) >> SWIZ_BITS)
+
+/*
+ * preswap_map test/set/clear operations (must be atomic)
+ */
+
+/*
+ * Report whether the preswap_map bit for @offset is set in @sis.
+ * Returns 0 when @sis has no preswap_map, otherwise the test_bit() result.
+ */
+int preswap_test(struct swap_info_struct *sis, unsigned long offset)
+{
+	unsigned long *word;
+
+	if (sis->preswap_map == NULL)
+		return 0;
+	word = &sis->preswap_map[offset / BITS_PER_LONG];
+	return test_bit(offset % BITS_PER_LONG, word);
+}
+
+/* Set the preswap_map bit for @offset; a no-op if @sis has no preswap_map. */
+static inline void preswap_set(struct swap_info_struct *sis,
+				unsigned long offset)
+{
+	if (sis->preswap_map != NULL)
+		set_bit(offset % BITS_PER_LONG,
+			&sis->preswap_map[offset / BITS_PER_LONG]);
+}
+
+/* Clear the preswap_map bit for @offset; a no-op if @sis has no preswap_map. */
+static inline void preswap_clear(struct swap_info_struct *sis,
+				unsigned long offset)
+{
+	if (sis->preswap_map != NULL)
+		clear_bit(offset % BITS_PER_LONG,
+			&sis->preswap_map[offset / BITS_PER_LONG]);
+}
+
+/*
+ * preswap tmem operations
+ */
+
+/*
+ * Try to store @page in preswap.
+ *
+ * Returns the result of the tmem put_page operation: 1 if the page was
+ * successfully put into preswap, 0 if the page was declined, and -ERRNO
+ * for a specific error.  Also returns 0, without calling tmem, when no
+ * preswap pool exists or the swap offset does not fit in 32 bits.
+ */
+int preswap_put(struct page *page)
+{
+	swp_entry_t entry = { .val = page_private(page), };
+	unsigned type = swp_type(entry);
+	pgoff_t offset = swp_offset(entry);
+	u32 ind = (u32)offset;
+	unsigned long pfn = page_to_pfn(page);
+	struct swap_info_struct *sis = get_swap_info_struct(type);
+	int was_present, ret;
+
+	if ((s32)preswap_poolid < 0)
+		return 0;
+	if ((u64)offset != ind)	/* offset must survive truncation to u32 */
+		return 0;
+	was_present = preswap_test(sis, offset) ? 1 : 0;
+	mb(); /* ensure page is quiescent; tmem may address it with an alias */
+	ret = (*tmem_ops->put_page)(preswap_poolid, oswiz(type, ind),
+				    iswiz(ind), pfn);
+	if (ret == 1) {
+		preswap_set(sis, offset);
+		/* an overwrite of an existing entry does not add a page */
+		if (!was_present)
+			sis->preswap_pages++;
+		return ret;
+	}
+	if (was_present) {
+		/* failed dup put always results in an automatic flush of
+		 * the (older) page from preswap */
+		preswap_clear(sis, offset);
+		sis->preswap_pages--;
+	}
+	return ret;
+}
+
+/*
+ * Try to fill @page from preswap.
+ *
+ * Returns 0 without calling tmem when no preswap pool exists, when the swap
+ * offset does not fit in 32 bits, or when the preswap_map bit for the page
+ * is clear.  Otherwise returns the result of the tmem get_page operation:
+ * 1 if the page was successfully gotten from preswap, 0 if the page was not
+ * present (should never happen!), and -ERRNO for a specific error.
+ */
+int preswap_get(struct page *page)
+{
+	swp_entry_t entry = { .val = page_private(page), };
+	unsigned type = swp_type(entry);
+	pgoff_t offset = swp_offset(entry);
+	u32 ind = (u32)offset;
+	unsigned long pfn = page_to_pfn(page);
+	struct swap_info_struct *sis = get_swap_info_struct(type);
+
+	if ((s32)preswap_poolid < 0 || (u64)offset != ind)
+		return 0;
+	if (!preswap_test(sis, offset))
+		return 0;
+	return (*tmem_ops->get_page)(preswap_poolid, oswiz(type, ind),
+				     iswiz(ind), pfn);
+}
+
+/*
+ * Flush a single page from preswap.
+ *
+ * Does nothing when no preswap pool exists, when @offset does not fit in
+ * 32 bits, or when the preswap_map bit for @offset in swap type @type is
+ * clear.  Otherwise asks tmem to flush the page, then drops the page count
+ * and clears the bit.
+ */
+void preswap_flush(unsigned type, unsigned long offset)
+{
+	u64 ind64 = (u64)offset;
+	u32 ind = (u32)offset;
+	struct swap_info_struct *sis = get_swap_info_struct(type);
+
+	if ((s32)preswap_poolid < 0)
+		return;
+	if (ind64 != ind)
+		return;
+	if (!preswap_test(sis, offset))
+		return;
+	/* the tmem result is not acted upon, so discard it explicitly */
+	(void)(*tmem_ops->flush_page)(preswap_poolid,
+				      oswiz(type, ind), iswiz(ind));
+	sis->preswap_pages--;
+	preswap_clear(sis, offset);
+}
+
+/*
+ * Flush all pages from the passed swaptype: one tmem flush_object call per
+ * swizzled object id of @type, then reset the type's preswap page count.
+ * Does nothing when no preswap pool exists.
+ */
+void preswap_flush_area(unsigned type)
+{
+	struct swap_info_struct *sis = get_swap_info_struct(type);
+	int ind = SWIZ_MASK;
+
+	if ((s32)preswap_poolid < 0)
+		return;
+	while (ind >= 0) {
+		(void)(*tmem_ops->flush_object)(preswap_poolid,
+						oswiz(type, ind));
+		ind--;
+	}
+	sis->preswap_pages = 0;
+}
+
+/*
+ * Create the persistent tmem pool used by preswap.  Nothing happens if a
+ * pool id has already been obtained or if tmem_ops is NULL.  @type is not
+ * used.
+ */
+void preswap_init(unsigned type)
+{
+	/* only need one tmem pool for all swap types */
+	if ((s32)preswap_poolid < 0 && tmem_ops != NULL)
+		preswap_poolid = (*tmem_ops->new_pool)(0, 0,
+						       TMEM_POOL_PERSIST);
+}
+
+/*
+ * preswap infrastructure functions
+ */
+
+/*
+ * Shrink preswap so that at most @target_pages pages remain in it, summed
+ * over all swap types, by pulling pages back out with try_to_unuse().
+ *
+ * Up to four passes are made.  Each pass recounts the preswap pages under
+ * swap_lock, picks the first swap type that still holds preswap pages and
+ * for which security_vm_enough_memory() returns 0, and unuses pages from
+ * that type only.  Returns early once the total is at or below the target
+ * or when no swap type qualifies.
+ *
+ * Code structure leveraged from sys_swapoff.
+ */
+void preswap_shrink(unsigned long target_pages)
+{
+	struct swap_info_struct *si = NULL;
+	unsigned long total_pages, total_pages_to_unuse;
+	unsigned long pages = 0, unuse_pages = 0;
+	int type;
+	int wrapped = 0;
+
+	do {
+		/*
+		 * we don't want to hold swap_lock while doing a very
+		 * lengthy try_to_unuse, but swap_list may change
+		 * so restart scan from swap_list.head each time
+		 */
+		spin_lock(&swap_lock);
+		total_pages = 0;
+		for (type = swap_list.head; type >= 0; type = si->next) {
+			si = get_swap_info_struct(type);
+			total_pages += si->preswap_pages;
+		}
+		if (total_pages <= target_pages) {
+			spin_unlock(&swap_lock);
+			return;
+		}
+		total_pages_to_unuse = total_pages - target_pages;
+		for (type = swap_list.head; type >= 0; type = si->next) {
+			si = get_swap_info_struct(type);
+			/*
+			 * A type with nothing in preswap cannot help; without
+			 * this check it would be selected on every pass and
+			 * the types after it would never be shrunk.
+			 */
+			if (!si->preswap_pages)
+				continue;
+			if (total_pages_to_unuse < si->preswap_pages)
+				pages = unuse_pages = total_pages_to_unuse;
+			else {
+				pages = si->preswap_pages;
+				unuse_pages = 0; /* unuse all */
+			}
+			if (security_vm_enough_memory(pages))
+				continue;
+			vm_unacct_memory(pages);
+			break;
+		}
+		spin_unlock(&swap_lock);
+		if (type < 0)
+			return;
+		current->flags |= PF_SWAPOFF;
+		(void)try_to_unuse(type, 1, unuse_pages);
+		current->flags &= ~PF_SWAPOFF;
+		wrapped++;
+	} while (wrapped <= 3);
+}
+
+
+#ifdef CONFIG_SYSCTL
+/*
+ * sysctl handler for /proc/sys/vm/preswap.
+ *
+ * cat /proc/sys/vm/preswap provides total number of pages in preswap
+ * across all swaptypes. echo N > /proc/sys/vm/preswap attempts to shrink
+ * preswap page usage to N (usually 0).
+ *
+ * Returns 0 on success, or the nonzero value returned by
+ * proc_doulongvec_minmax(), in which case no shrink is attempted.
+ */
+int preswap_sysctl_handler(ctl_table *table, int write,
+	struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
+{
+	/*
+	 * Default to "no limit" so that a write from which no value gets
+	 * stored shrinks nothing, rather than handing an uninitialized
+	 * target to preswap_shrink().
+	 */
+	unsigned long npages = ~0UL;
+	unsigned long totalpages = 0;
+	struct swap_info_struct *si = NULL;
+	int type;
+	int ret;
+
+	/* modeled after hugetlb_sysctl_handler in mm/hugetlb.c */
+	if (!write) {
+		spin_lock(&swap_lock);
+		for (type = swap_list.head; type >= 0; type = si->next) {
+			si = get_swap_info_struct(type);
+			totalpages += si->preswap_pages;
+		}
+		spin_unlock(&swap_lock);
+		npages = totalpages;
+	}
+	/* point the table at the local so the generic helper reads/writes it */
+	table->data = &npages;
+	table->maxlen = sizeof(unsigned long);
+	ret = proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
+	if (ret)
+		return ret;
+
+	if (write)
+		preswap_shrink(npages);
+
+	return 0;
+}
+#endif
--- linux-2.6.30/include/linux/sysctl.h 2009-06-09 21:05:27.000000000 -0600
+++ linux-2.6.30-tmem/include/linux/sysctl.h 2009-06-19 09:33:59.000000000 -0600
@@ -205,6 +205,7 @@
VM_PANIC_ON_OOM=33, /* panic at out-of-memory */
VM_VDSO_ENABLED=34, /* map VDSO into new processes? */
VM_MIN_SLAB=35, /* Percent pages ignored by zone reclaim */
+ VM_PRESWAP_PAGES=36, /* pages/target_pages in preswap */
};


--- linux-2.6.30/kernel/sysctl.c 2009-06-09 21:05:27.000000000 -0600
+++ linux-2.6.30-tmem/kernel/sysctl.c 2009-06-19 09:33:59.000000000 -0600
@@ -1282,6 +1282,18 @@
.proc_handler = &scan_unevictable_handler,
},
#endif
+#ifdef CONFIG_PRESWAP
+ {
+ .ctl_name = VM_PRESWAP_PAGES,
+ .procname = "preswap",
+ .data = NULL,
+ .maxlen = sizeof(unsigned long),
+ .mode = 0644,
+ .proc_handler = &preswap_sysctl_handler,
+ .extra1 = (void *)&preswap_zero,
+ .extra2 = (void *)&preswap_infinity,
+ },
+#endif
/*
* NOTE: do not add new entries to this table unless you have read
* Documentation/sysctl/ctl_unnumbered.txt
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/