[patch 3/7] mm: speculative get_page

From: Nick Piggin
Date: Thu Aug 11 2005 - 07:23:14 EST


3/7

--
SUSE Labs, Novell Inc.

If we can be sure that elevating the page_count on a pagecache
page will pin it, we can speculatively run this operation, and
subsequently check to see if we hit the right page rather than
relying on holding a lock or otherwise pinning a reference to
the page.

This can be done if get_page/put_page behave consistently throughout
the whole tree (ie. if we "get" a page after it has been reused for
something else, we must still be able to drop that reference again
with a put_page).

There needs to be some careful logic for freed pages so they aren't
freed again, and also some careful logic for pages in the process
of being removed from pagecache.
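
As an illustration of the intended calling convention (purely a sketch,
not part of this patch -- the real pagecache lookup conversions come
later in the series, and the caller below is hypothetical), a lookup
would try the speculative reference first and only fall back to the
tree_lock protected path when it loses a race:

#include <linux/pagemap.h>

/*
 * Hypothetical caller, for illustration only: slot points into a
 * structure that can change under us (eg. a pagecache radix tree
 * slot). page_cache_get_speculative() either returns the page with
 * a reference pinned and the slot re-verified, or NULL if the slot
 * was empty or we raced with a free/removal, in which case we fall
 * back to the ordinary locked lookup.
 */
static struct page *speculative_lookup(struct address_space *mapping,
				unsigned long offset, struct page **slot)
{
	struct page *page;

	page = page_cache_get_speculative(slot);
	if (page)
		return page;

	return find_get_page(mapping, offset);
}

The property the sketch relies on is that a non-NULL return means the
reference was elevated while *slot still pointed at that page, so the
caller can never end up pinning an unrelated page.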

Index: linux-2.6/include/linux/page-flags.h
===================================================================
--- linux-2.6.orig/include/linux/page-flags.h
+++ linux-2.6/include/linux/page-flags.h
@@ -77,6 +77,7 @@
#define PG_uncached 19 /* Page has been mapped as uncached */

#define PG_free 20 /* Page is on the free lists */
+#define PG_freeing 21 /* Pagecache page is about to be freed */

/*
* Global page accounting. One instance per CPU. Only unsigned longs are
@@ -312,6 +313,11 @@ extern void __mod_page_state(unsigned lo
#define __SetPageFree(page) __set_bit(PG_free, &(page)->flags)
#define __ClearPageFree(page) __clear_bit(PG_free, &(page)->flags)

+#define PageFreeing(page) test_bit(PG_freeing, &(page)->flags)
+#define SetPageFreeing(page) set_bit(PG_freeing, &(page)->flags)
+#define ClearPageFreeing(page) clear_bit(PG_freeing, &(page)->flags)
+#define __ClearPageFreeing(page) __clear_bit(PG_freeing, &(page)->flags)
+
struct page; /* forward declaration */

int test_clear_page_dirty(struct page *page);
Index: linux-2.6/include/linux/pagemap.h
===================================================================
--- linux-2.6.orig/include/linux/pagemap.h
+++ linux-2.6/include/linux/pagemap.h
@@ -50,6 +50,64 @@ static inline void mapping_set_gfp_mask(
#define page_cache_release(page) put_page(page)
void release_pages(struct page **pages, int nr, int cold);

+static inline struct page *page_cache_get_speculative(struct page **pagep)
+{
+	unsigned long flags;
+	struct page *page;
+
+	/*
+	 * Disable IRQs (and preempt) so we don't deadlock with the page
+	 * allocator who might be waiting for us to drop the speculative
+	 * reference.
+	 *
+	 * Interrupts could be disabled _after_ loading *pagep, however
+	 * we want to really minimise the window between taking a spec
+	 * ref on the page and retesting the page.
+	 */
+	local_irq_save(flags);
+
+	page = *pagep;
+	if (!page)
+		goto out_failed;
+
+	/* Note that get_page_testone provides a memory barrier */
+	if (unlikely(get_page_testone(page) || PageFree(page))) {
+		/*
+		 * Picked up a freed page. Note order of evaluation of the
+		 * above is important. If we are not the first speculative
+		 * getter on a free page, then we'll fall through to the
+		 * PageFree test, which is stable because the previous getters
+		 * are keeping the page allocator spinning on this page.
+		 */
+		__put_page(page);
+		page = NULL;
+		goto out_failed;
+	}
+
+	/*
+	 * Interrupts and preempt could be enabled here (they only need to be
+	 * disabled because page allocation can spin on the elevated refcount),
+	 * but we don't want to hold a reference on an unrelated page for too
+	 * long, so keep preempt off until we know we have the right page.
+	 */
+
+	if (unlikely(PageFreeing(page) || page != *pagep)) {
+		/*
+		 * Picked up a page being freed, or one that is no longer
+		 * being pointed to by pagep. Note that we do the complete
+		 * put_page, because unlike the above case, this page is
+		 * not free and our reference is pinning it.
+		 */
+		put_page(page);
+		page = NULL;
+		goto out_failed;
+	}
+
+out_failed:
+	local_irq_restore(flags);
+	return page;
+}
+
static inline struct page *page_cache_alloc(struct address_space *x)
{
return alloc_pages(mapping_gfp_mask(x)|__GFP_NORECLAIM, 0);
Index: linux-2.6/mm/page_alloc.c
===================================================================
--- linux-2.6.orig/mm/page_alloc.c
+++ linux-2.6/mm/page_alloc.c
@@ -116,7 +116,6 @@ static void bad_page(const char *functio
1 << PG_writeback |
1 << PG_reserved |
1 << PG_free );
- set_page_count(page, 0);
reset_page_mapcount(page);
page->mapping = NULL;
tainted |= TAINT_BAD_PAGE;
@@ -316,7 +315,6 @@ static inline void free_pages_check(cons
{
if ( page_mapcount(page) ||
page->mapping != NULL ||
- page_count(page) != 0 ||
(page->flags & (
1 << PG_lru |
1 << PG_private |
@@ -424,7 +422,7 @@ expand(struct zone *zone, struct page *p
void set_page_refs(struct page *page, int order)
{
#ifdef CONFIG_MMU
- set_page_count(page, 1);
+ get_page(page);
#else
int i;

@@ -434,7 +432,7 @@ void set_page_refs(struct page *page, in
* - eg: access_process_vm()
*/
for (i = 0; i < (1 << order); i++)
- set_page_count(page + i, 1);
+ get_page(page + i);
#endif /* CONFIG_MMU */
}

@@ -445,7 +443,6 @@ static void prep_new_page(struct page *p
{
if ( page_mapcount(page) ||
page->mapping != NULL ||
- page_count(page) != 0 ||
(page->flags & (
1 << PG_lru |
1 << PG_private |
@@ -464,7 +461,13 @@ static void prep_new_page(struct page *p
1 << PG_referenced | 1 << PG_arch_1 |
1 << PG_checked | 1 << PG_mappedtodisk);
page->private = 0;
+
set_page_refs(page, order);
+ smp_mb();
+ /* Wait for speculative get_page after count has been elevated. */
+ while (unlikely(page_count(page) > 1))
+ cpu_relax();
+
kernel_map_pages(page, 1 << order, 1);
}

Index: linux-2.6/mm/vmscan.c
===================================================================
--- linux-2.6.orig/mm/vmscan.c
+++ linux-2.6/mm/vmscan.c
@@ -504,6 +504,7 @@ static int shrink_list(struct list_head
if (!mapping)
goto keep_locked; /* truncate got there first */

+ SetPageFreeing(page);
write_lock_irq(&mapping->tree_lock);

/*
@@ -513,6 +514,7 @@ static int shrink_list(struct list_head
*/
if (page_count(page) != 2 || PageDirty(page)) {
write_unlock_irq(&mapping->tree_lock);
+ ClearPageFreeing(page);
goto keep_locked;
}

@@ -533,6 +535,7 @@ static int shrink_list(struct list_head

free_it:
unlock_page(page);
+ __ClearPageFreeing(page);
reclaimed++;
if (!pagevec_add(&freed_pvec, page))
__pagevec_release_nonlru(&freed_pvec);
Index: linux-2.6/mm/bootmem.c
===================================================================
--- linux-2.6.orig/mm/bootmem.c
+++ linux-2.6/mm/bootmem.c
@@ -289,19 +289,20 @@ static unsigned long __init free_all_boo
int j, order;

page = pfn_to_page(pfn);
+ prefetchw(page);
+
count += BITS_PER_LONG;
- __ClearPageReserved(page);
order = ffs(BITS_PER_LONG) - 1;
- set_page_refs(page, order);
- for (j = 1; j < BITS_PER_LONG; j++) {
- if (j + 16 < BITS_PER_LONG)
- prefetchw(page + j + 16);
+ for (j = 0; j < BITS_PER_LONG; j++) {
+ if (j + 1 < BITS_PER_LONG)
+ prefetchw(page + j + 1);
__ClearPageReserved(page + j);
set_page_count(page + j, 0);
}
+ set_page_refs(page, order);
__free_pages(page, order);
+
i += BITS_PER_LONG;
- page += BITS_PER_LONG;
} else if (v) {
unsigned long m;

@@ -310,6 +311,7 @@ static unsigned long __init free_all_boo
if (v & m) {
count++;
__ClearPageReserved(page);
+ set_page_count(page, 0);
set_page_refs(page, 0);
__free_page(page);
}
Index: linux-2.6/mm/swapfile.c
===================================================================
--- linux-2.6.orig/mm/swapfile.c
+++ linux-2.6/mm/swapfile.c
@@ -338,6 +338,7 @@ int remove_exclusive_swap_page(struct pa
retval = 0;
if (p->swap_map[swp_offset(entry)] == 1) {
/* Recheck the page count with the swapcache lock held.. */
+ SetPageFreeing(page);
write_lock_irq(&swapper_space.tree_lock);
if ((page_count(page) == 2) && !PageWriteback(page)) {
__delete_from_swap_cache(page);
@@ -345,6 +346,7 @@ int remove_exclusive_swap_page(struct pa
retval = 1;
}
write_unlock_irq(&swapper_space.tree_lock);
+ ClearPageFreeing(page);
}
swap_info_put(p);