Re: [PATCH 0/5] make slab gfp fair

From: Peter Zijlstra
Date: Sun May 20 2007 - 04:40:01 EST


Ok, full reset.

I care about kernel allocations only. In particular about those that
have PF_MEMALLOC semantics.

The thing I need is that any memory allocated below the
ALLOC_WMARK_MIN|ALLOC_HIGH|ALLOC_HARDER
threshold is only ever used by processes that have ALLOC_NO_WATERMARKS
rights, for the duration of the distress.

What this patch does:
- change the page allocator to try ALLOC_WMARK_MIN|ALLOC_HIGH|ALLOC_HARDER
if ALLOC_NO_WATERMARKS is set, before the actual ALLOC_NO_WATERMARKS alloc

- set page->reserve nonzero for each page allocated with
ALLOC_NO_WATERMARKS, which by the previous point implies that all
available zones are below ALLOC_WMARK_MIN|ALLOC_HIGH|ALLOC_HARDER

- when a page->reserve slab is allocated store it in s->reserve_slab
and do not update the ->cpu_slab[] (this forces subsequent allocs to
retry the allocation).

All ALLOC_NO_WATERMARKS enabled slab allocations are served from
->reserve_slab, up until the point where a !page->reserve slab alloc
succeeds, at which point the ->reserve_slab is pushed into the partial
lists and ->reserve_slab set to NULL.

Since only the allocation of a new slab uses the gfp zone flags and
other allocation placement hints, they have to be uniform over all slab
allocs for a given kmem_cache. Thus the s->reserve_slab/page->reserve
status is kmem_cache wide.

Any holes left?

---

Index: linux-2.6-git/mm/internal.h
===================================================================
--- linux-2.6-git.orig/mm/internal.h
+++ linux-2.6-git/mm/internal.h
@@ -12,6 +12,7 @@
#define __MM_INTERNAL_H

#include <linux/mm.h>
+#include <linux/hardirq.h>

static inline void set_page_count(struct page *page, int v)
{
@@ -37,4 +38,50 @@ static inline void __put_page(struct pag
extern void fastcall __init __free_pages_bootmem(struct page *page,
unsigned int order);

+#define ALLOC_HARDER 0x01 /* try to alloc harder */
+#define ALLOC_HIGH 0x02 /* __GFP_HIGH set */
+#define ALLOC_WMARK_MIN 0x04 /* use pages_min watermark */
+#define ALLOC_WMARK_LOW 0x08 /* use pages_low watermark */
+#define ALLOC_WMARK_HIGH 0x10 /* use pages_high watermark */
+#define ALLOC_NO_WATERMARKS 0x20 /* don't check watermarks at all */
+#define ALLOC_CPUSET 0x40 /* check for correct cpuset */
+
+/*
+ * Compute the deepest-reaching ALLOC_* flags gfp_mask allows for current.
+ */
+static inline int gfp_to_alloc_flags(gfp_t gfp_mask)
+{
+	struct task_struct *p = current;
+	int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
+	const gfp_t wait = gfp_mask & __GFP_WAIT;
+
+	/*
+	 * The caller may dip into page reserves a bit more if the caller
+	 * cannot run direct reclaim, or if the caller has realtime scheduling
+	 * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will
+	 * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH).
+	 */
+	if (gfp_mask & __GFP_HIGH)
+		alloc_flags |= ALLOC_HIGH;
+
+	if (!wait) {
+		alloc_flags |= ALLOC_HARDER;
+		/*
+		 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
+		 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
+		 */
+		alloc_flags &= ~ALLOC_CPUSET;
+	} else if (unlikely(rt_task(p)) && !in_interrupt())
+		alloc_flags |= ALLOC_HARDER;
+
+	/* Outside interrupts, PF_MEMALLOC or TIF_MEMDIE lifts all watermarks. */
+	if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) {
+		if (!in_interrupt() && ((p->flags & PF_MEMALLOC) ||
+		    unlikely(test_thread_flag(TIF_MEMDIE))))
+			alloc_flags |= ALLOC_NO_WATERMARKS;
+	}
+
+	return alloc_flags;
+}
+
#endif
Index: linux-2.6-git/mm/page_alloc.c
===================================================================
--- linux-2.6-git.orig/mm/page_alloc.c
+++ linux-2.6-git/mm/page_alloc.c
@@ -1175,14 +1175,6 @@ failed:
return NULL;
}

-#define ALLOC_NO_WATERMARKS 0x01 /* don't check watermarks at all */
-#define ALLOC_WMARK_MIN 0x02 /* use pages_min watermark */
-#define ALLOC_WMARK_LOW 0x04 /* use pages_low watermark */
-#define ALLOC_WMARK_HIGH 0x08 /* use pages_high watermark */
-#define ALLOC_HARDER 0x10 /* try to alloc harder */
-#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */
-#define ALLOC_CPUSET 0x40 /* check for correct cpuset */
-
#ifdef CONFIG_FAIL_PAGE_ALLOC

static struct fail_page_alloc_attr {
@@ -1494,6 +1486,7 @@ zonelist_scan:

page = buffered_rmqueue(zonelist, zone, order, gfp_mask);
if (page)
+ page->reserve = (alloc_flags & ALLOC_NO_WATERMARKS);
break;
this_zone_full:
if (NUMA_BUILD)
@@ -1619,48 +1612,36 @@ restart:
* OK, we're below the kswapd watermark and have kicked background
* reclaim. Now things get more complex, so set up alloc_flags according
* to how we want to proceed.
- *
- * The caller may dip into page reserves a bit more if the caller
- * cannot run direct reclaim, or if the caller has realtime scheduling
- * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will
- * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH).
*/
- alloc_flags = ALLOC_WMARK_MIN;
- if ((unlikely(rt_task(p)) && !in_interrupt()) || !wait)
- alloc_flags |= ALLOC_HARDER;
- if (gfp_mask & __GFP_HIGH)
- alloc_flags |= ALLOC_HIGH;
- if (wait)
- alloc_flags |= ALLOC_CPUSET;
+ alloc_flags = gfp_to_alloc_flags(gfp_mask);

- /*
- * Go through the zonelist again. Let __GFP_HIGH and allocations
- * coming from realtime tasks go deeper into reserves.
- *
- * This is the last chance, in general, before the goto nopage.
- * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
- * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
- */
- page = get_page_from_freelist(gfp_mask, order, zonelist, alloc_flags);
+ /* This is the last chance, in general, before the goto nopage. */
+ page = get_page_from_freelist(gfp_mask, order, zonelist,
+ alloc_flags & ~ALLOC_NO_WATERMARKS);
if (page)
goto got_pg;

/* This allocation should allow future memory freeing. */
-
rebalance:
- if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE)))
- && !in_interrupt()) {
- if (!(gfp_mask & __GFP_NOMEMALLOC)) {
+ if (alloc_flags & ALLOC_NO_WATERMARKS) {
nofail_alloc:
- /* go through the zonelist yet again, ignoring mins */
- page = get_page_from_freelist(gfp_mask, order,
+ /*
+ * Before going bare metal, try to get a page above the
+ * critical threshold - ignoring CPU sets.
+ */
+ page = get_page_from_freelist(gfp_mask, order, zonelist,
+ ALLOC_MIN|ALLOC_HIGH|ALLOC_HARDER);
+ if (page)
+ goto got_pg;
+
+ /* go through the zonelist yet again, ignoring mins */
+ page = get_page_from_freelist(gfp_mask, order,
zonelist, ALLOC_NO_WATERMARKS);
- if (page)
- goto got_pg;
- if (gfp_mask & __GFP_NOFAIL) {
- congestion_wait(WRITE, HZ/50);
- goto nofail_alloc;
- }
+ if (page)
+ goto got_pg;
+ if (wait && (gfp_mask & __GFP_NOFAIL)) {
+ congestion_wait(WRITE, HZ/50);
+ goto nofail_alloc;
}
goto nopage;
}
@@ -1669,6 +1650,10 @@ nofail_alloc:
if (!wait)
goto nopage;

+ /* Avoid recursion of direct reclaim */
+ if (p->flags & PF_MEMALLOC)
+ goto nopage;
+
cond_resched();

/* We now go into synchronous reclaim */
Index: linux-2.6-git/include/linux/mm_types.h
===================================================================
--- linux-2.6-git.orig/include/linux/mm_types.h
+++ linux-2.6-git/include/linux/mm_types.h
@@ -60,6 +60,7 @@ struct page {
union {
pgoff_t index; /* Our offset within mapping. */
void *freelist; /* SLUB: freelist req. slab lock */
+ int reserve; /* page_alloc: page is a reserve page */
};
struct list_head lru; /* Pageout list, eg. active_list
* protected by zone->lru_lock !
Index: linux-2.6-git/include/linux/slub_def.h
===================================================================
--- linux-2.6-git.orig/include/linux/slub_def.h
+++ linux-2.6-git/include/linux/slub_def.h
@@ -46,6 +46,8 @@ struct kmem_cache {
struct list_head list; /* List of slab caches */
struct kobject kobj; /* For sysfs */

+ struct page *reserve_slab;
+
#ifdef CONFIG_NUMA
int defrag_ratio;
struct kmem_cache_node *node[MAX_NUMNODES];
Index: linux-2.6-git/mm/slub.c
===================================================================
--- linux-2.6-git.orig/mm/slub.c
+++ linux-2.6-git/mm/slub.c
@@ -20,11 +20,13 @@
#include <linux/mempolicy.h>
#include <linux/ctype.h>
#include <linux/kallsyms.h>
+#include "internal.h"

/*
* Lock order:
- * 1. slab_lock(page)
- * 2. slab->list_lock
+ * 1. reserve_lock
+ * 2. slab_lock(page)
+ * 3. node->list_lock
*
* The slab_lock protects operations on the object of a particular
* slab and its metadata in the page struct. If the slab lock
@@ -259,6 +261,8 @@ static int sysfs_slab_alias(struct kmem_
static void sysfs_slab_remove(struct kmem_cache *s) {}
#endif

+static DEFINE_SPINLOCK(reserve_lock);
+
/********************************************************************
* Core slab cache functions
*******************************************************************/
@@ -1007,7 +1011,7 @@ static void setup_object(struct kmem_cac
s->ctor(object, s, 0);
}

-static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
+static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node, int *reserve)
{
struct page *page;
struct kmem_cache_node *n;
@@ -1025,6 +1029,7 @@ static struct page *new_slab(struct kmem
if (!page)
goto out;

+ *reserve = page->reserve;
n = get_node(s, page_to_nid(page));
if (n)
atomic_long_inc(&n->nr_slabs);
@@ -1395,6 +1400,7 @@ static void *__slab_alloc(struct kmem_ca
{
void **object;
int cpu = smp_processor_id();
+ int reserve = 0;

if (!page)
goto new_slab;
@@ -1424,10 +1430,25 @@ new_slab:
if (page) {
s->cpu_slab[cpu] = page;
goto load_freelist;
- }
+ } else if (unlikely(gfp_to_alloc_flags(gfpflags) & ALLOC_NO_WATERMARKS))
+ goto try_reserve;

- page = new_slab(s, gfpflags, node);
- if (page) {
+alloc_slab:
+ page = new_slab(s, gfpflags, node, &reserve);
+ if (page && !reserve) {
+ if (unlikely(s->reserve_slab)) {
+ struct page *reserve;
+
+ spin_lock(&reserve_lock);
+ reserve = s->reserve_slab;
+ s->reserve_slab = NULL;
+ spin_unlock(&reserve_lock);
+
+ if (reserve) {
+ slab_lock(reserve);
+ unfreeze_slab(s, reserve);
+ }
+ }
cpu = smp_processor_id();
if (s->cpu_slab[cpu]) {
/*
@@ -1455,6 +1476,18 @@ new_slab:
SetSlabFrozen(page);
s->cpu_slab[cpu] = page;
goto load_freelist;
+ } else if (page) {
+ spin_lock(&reserve_lock);
+ if (s->reserve_slab) {
+ discard_slab(s, page);
+ page = s->reserve_slab;
+ goto got_reserve;
+ }
+ slab_lock(page);
+ SetSlabFrozen(page);
+ s->reserve_slab = page;
+ spin_unlock(&reserve_lock);
+ goto use_reserve;
}
return NULL;
debug:
@@ -1470,6 +1503,31 @@ debug:
page->freelist = object[page->offset];
slab_unlock(page);
return object;
+
+try_reserve:
+ spin_lock(&reserve_lock);
+ page = s->reserve_slab;
+ if (!page) {
+ spin_unlock(&reserve_lock);
+ goto alloc_slab;
+ }
+
+got_reserve:
+ slab_lock(page);
+ if (!page->freelist) {
+ s->reserve_slab = NULL;
+ spin_unlock(&reserve_lock);
+ unfreeze_slab(s, page);
+ goto alloc_slab;
+ }
+ spin_unlock(&reserve_lock);
+
+use_reserve:
+ object = page->freelist;
+ page->inuse++;
+ page->freelist = object[page->offset];
+ slab_unlock(page);
+ return object;
}

/*
@@ -1807,10 +1865,11 @@ static struct kmem_cache_node * __init e
{
struct page *page;
struct kmem_cache_node *n;
+ int reserve;

BUG_ON(kmalloc_caches->size < sizeof(struct kmem_cache_node));

- page = new_slab(kmalloc_caches, gfpflags | GFP_THISNODE, node);
+ page = new_slab(kmalloc_caches, gfpflags | GFP_THISNODE, node, &reserve);
/* new_slab() disables interupts */
local_irq_enable();

@@ -2018,6 +2077,8 @@ static int kmem_cache_open(struct kmem_c
#ifdef CONFIG_NUMA
s->defrag_ratio = 100;
#endif
+ s->reserve_slab = NULL;
+
if (init_kmem_cache_nodes(s, gfpflags & ~SLUB_DMA))
return 1;
error:


-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/