Re: __get_free_pages()

Dr. Werner Fink (werner@suse.de)
Thu, 28 May 1998 12:05:10 +0200


On Wed, May 27, 1998 at 06:11:10PM -0400, James Mastros wrote:
> On Wed, 27 May 1998, Andrej Presern wrote:
> > Can someone please explain what individual GFP_* bitmasks mean (in
> > include/linux/mm.h and as used in __get_free_pages())? Also, what is the
> > 'order' argument that __get_free_pages() takes?
>
> __GFP_WAIT: If memory isn't available, sleep until it is.
> __GFP_IO: We are allowed to do I/O (e.g. swap).
> __GFP_{LOW|MED|HIGH}: Relative necessity.
> __GFP_DMA: Only allocate memory that we can DMA into (i.e. <16 MB on Intel
> boxes).
>
> The GFP_* flags are just combinations of the __GFP_* flags defined in
> include/linux/mm.h. The order in __get_free_pages() is the order of pages
> we want (i.e. 2^order contiguous pages; an order of 0 is equivalent to
> __get_free_page()).
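
Just to make the order argument concrete, here is a purely made-up caller
(the function name is hypothetical and the error path is trimmed; it only
needs <linux/mm.h> and <linux/errno.h>):

	#include <linux/errno.h>
	#include <linux/mm.h>

	static int example_alloc(void)
	{
		unsigned long buf;

		/* ask for 2^2 = 4 physically contiguous pages */
		buf = __get_free_pages(GFP_KERNEL, 2);
		if (!buf)
			return -ENOMEM;

		/* ... use the 4 * PAGE_SIZE byte buffer ... */

		free_pages(buf, 2);	/* order must match the allocation */
		return 0;
	}

So order 0 gives a single page (what __get_free_page() asks for), order 1
gives two contiguous pages, order 2 four, and so on.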

The only remaining problem is that these GFP necessity flags aren't used much
in __get_free_pages(). This is (maybe) an advantage for big systems[1], but on
small ones __get_free_pages() overshoots the freepages.min limit under high
load. One result of this behaviour is that kswapd has to work
disproportionately harder on small systems than on big ones. IMHO, in such a
case __get_free_pages() may fail more often due to really low and/or
fragmented memory ... but no one sees this by pressing ALT-SysRq-M, because
the famous kswapd, triggered by the next swap_tick(), has already `repaired'
the unbalanced memory situation `on the fly'.

[1] In other words: big systems have more hidden reserves to absorb such
situations gracefully, but real servers are always small systems :-)
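
To put rough numbers on that (assuming 4 KB pages and the stock formula in
page_alloc.c, i.e. freepages.min = physical pages / 128, clamped to 48..256):
a 16 MB box ends up with freepages.min = 48 pages -- only 192 KB of slack --
while anything from 128 MB upward sits at the 256-page (1 MB) cap. One burst
of allocations on the small box can blow straight through that limit before
the next swap_tick() comes around.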

Werner

PS: Maybe the following patch will help small systems a bit. With this patch
I try to catch the overshoot behaviour of __get_free_pages() by a `dynamic'
response when we cross the freepages.min limit. Changing the necessity flag
for buffer pages (e.g. for swap I/O) should help to decrease the latency of
swapping. Freshly reallocated pages get a better start, so they are not
swapped out again on the next cycle. And, at last, maybe some dcache overruns
will be caught.
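
Taking freepages.min = 48 as an example, the thresholds work out as follows
(nothing below is measured, it is just the arithmetic from the patch):

	__GFP_HIGH (GFP_ATOMIC):	reserved_pages = 0	(may drain the reserve)
	__GFP_MED (new GFP_BUFFER):	48 - (12 + 48/8) = 30	(may dip 18 pages in)
	__GFP_MED | __GFP_IO:		30 / 8 = 3
	anything else (e.g. GFP_KERNEL):	reserved_pages = 48	(stays above freepages.min)

free_memory_available() starts reporting a shortage at the same 30-page mark,
so kswapd is woken before the reserve is really gone.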
----
diff -urN linux-2.1.103/fs/buffer.c linux/fs/buffer.c
--- linux-2.1.103/fs/buffer.c Mon May 11 23:38:28 1998
+++ linux/fs/buffer.c Thu May 28 11:26:32 1998
@@ -1209,8 +1209,16 @@
/* This is critical. We can't swap out pages to get
* more buffer heads, because the swap-out may need
* more buffer-heads itself. Thus SLAB_ATOMIC.
+ *
+ * Old but true:
+ *
+ * * This is no longer true, it is GFP_BUFFER again, the
+ * * swapping code now knows not to perform I/O when that
+ * * GFP level is specified... -DaveM
+ *
+ * <werner@suse.de>
*/
- if((bh = kmem_cache_alloc(bh_cachep, SLAB_ATOMIC)) != NULL) {
+ if((bh = kmem_cache_alloc(bh_cachep, SLAB_BUFFER)) != NULL) {
memset(bh, 0, sizeof(*bh));
nr_buffer_heads++;
return bh;
diff -urN linux-2.1.103/fs/dcache.c linux/fs/dcache.c
--- linux-2.1.103/fs/dcache.c Sat May 2 23:07:52 1998
+++ linux/fs/dcache.c Thu May 28 11:28:41 1998
@@ -432,7 +432,7 @@
* too much.
*
* Priority:
- * 0 - very urgent: schrink everything
+ * 0 - very urgent: shrink everything
* ...
* 6 - base-level: try to shrink a bit.
*/
diff -urN linux-2.1.103/include/linux/mm.h linux/include/linux/mm.h
--- linux-2.1.103/include/linux/mm.h Thu May 21 01:21:43 1998
+++ linux/include/linux/mm.h Thu May 28 11:26:32 1998
@@ -325,7 +325,7 @@

#define __GFP_DMA 0x80

-#define GFP_BUFFER (__GFP_LOW | __GFP_WAIT)
+#define GFP_BUFFER (__GFP_MED | __GFP_WAIT)
#define GFP_ATOMIC (__GFP_HIGH)
#define GFP_USER (__GFP_LOW | __GFP_WAIT | __GFP_IO)
#define GFP_KERNEL (__GFP_LOW | __GFP_WAIT | __GFP_IO)
diff -urN linux-2.1.103/include/linux/pagemap.h linux/include/linux/pagemap.h
--- linux-2.1.103/include/linux/pagemap.h Thu May 21 01:21:43 1998
+++ linux/include/linux/pagemap.h Thu May 28 11:26:32 1998
@@ -11,6 +11,7 @@

#include <linux/mm.h>
#include <linux/fs.h>
+#include <linux/swapctl.h>

static inline unsigned long page_address(struct page * page)
{
@@ -20,8 +21,6 @@
#define PAGE_HASH_BITS 11
#define PAGE_HASH_SIZE (1 << PAGE_HASH_BITS)

-#define PAGE_AGE_VALUE 16
-
extern unsigned long page_cache_size; /* # of pages currently in the hash table */
extern struct page * page_hash_table[PAGE_HASH_SIZE];

@@ -84,7 +83,7 @@
{
page_cache_size++;
set_bit(PG_referenced, &page->flags);
- page->age = PAGE_AGE_VALUE;
+ touch_page(page);
if((page->next_hash = *p) != NULL)
(*p)->pprev_hash = &page->next_hash;
*p = page;
diff -urN linux-2.1.103/ipc/shm.c linux/ipc/shm.c
--- linux-2.1.103/ipc/shm.c Wed May 6 19:56:06 1998
+++ linux/ipc/shm.c Thu May 28 11:26:32 1998
@@ -14,6 +14,7 @@
#include <linux/stat.h>
#include <linux/malloc.h>
#include <linux/swap.h>
+#include <linux/swapctl.h>
#include <linux/smp.h>
#include <linux/smp_lock.h>
#include <linux/init.h>
@@ -703,6 +704,11 @@
shm_swp--;
}
shm_rss++;
+
+ /* Give the reallocated page a better start */
+ if (shm_rss < (num_physpages>>3))
+ touch_page(&mem_map[MAP_NR(page)]);
+
pte = pte_mkdirty(mk_pte(page, PAGE_SHARED));
shp->shm_pages[idx] = pte_val(pte);
} else
diff -urN linux-2.1.103/mm/page_alloc.c linux/mm/page_alloc.c
--- linux-2.1.103/mm/page_alloc.c Fri May 8 08:44:25 1998
+++ linux/mm/page_alloc.c Thu May 28 11:37:51 1998
@@ -114,8 +114,16 @@
int retval = 0;
unsigned long flags;
struct free_area_struct * list;
+ int reserved_pages = freepages.min;

/*
+ * If we are really down with free memory the kswapd
+ * should run immediately and thoroughly -- <werner@suse.de>
+ */
+ reserved_pages -= (12 + (reserved_pages>>3));
+ if (nr_free_pages <= reserved_pages)
+ return 0;
+ /*
* If we have more than about 3% to 5% of all memory free,
* consider it to be good enough for anything.
* It may not be, due to fragmentation, but we
@@ -253,6 +261,7 @@
unsigned long __get_free_pages(int gfp_mask, unsigned long order)
{
unsigned long flags, maxorder;
+ int reserved_pages;

if (order >= NR_MEM_LISTS)
goto nopage;
@@ -262,8 +271,15 @@
* to empty in order to find a free page..
*/
maxorder = NR_MEM_LISTS-1;
- if (gfp_mask & __GFP_HIGH)
+ reserved_pages = freepages.min;
+ if (gfp_mask & __GFP_HIGH) {
maxorder = NR_MEM_LISTS;
+ reserved_pages = 0;
+ } else if (gfp_mask & __GFP_MED) {
+ reserved_pages -= (12 + (reserved_pages>>3));
+ if (gfp_mask & __GFP_IO)
+ reserved_pages = (reserved_pages>>3);
+ }

if (in_interrupt() && (gfp_mask & __GFP_WAIT)) {
static int count = 0;
@@ -275,15 +291,18 @@
}

for (;;) {
- spin_lock_irqsave(&page_alloc_lock, flags);
- RMQUEUE(order, maxorder, (gfp_mask & GFP_DMA));
- spin_unlock_irqrestore(&page_alloc_lock, flags);
- if (!(gfp_mask & __GFP_WAIT))
- break;
+ if (nr_free_pages > reserved_pages) {
+ spin_lock_irqsave(&page_alloc_lock, flags);
+ RMQUEUE(order, maxorder, (gfp_mask & GFP_DMA));
+ spin_unlock_irqrestore(&page_alloc_lock, flags);
+ if (!(gfp_mask & __GFP_WAIT))
+ break;
+ }
if (!try_to_free_pages(gfp_mask, SWAP_CLUSTER_MAX))
break;
gfp_mask &= ~__GFP_WAIT; /* go through this only once */
maxorder = NR_MEM_LISTS; /* Allow anything this time */
+ reserved_pages = 0;
}
nopage:
return 0;
@@ -338,8 +357,9 @@
* This is totally arbitrary.
*/
i = (end_mem - PAGE_OFFSET) >> (PAGE_SHIFT+7);
- if (i < 48)
- i = 48;
+ if (i < 24)
+ i = 24;
+ i += 24;
if (i > 256)
i = 256;
freepages.min = i;
@@ -402,6 +422,10 @@
vma->vm_mm->rss++;
tsk->min_flt++;
swap_free(entry);
+
+ /* Give the reallocated page a better start */
+ if (vma->vm_mm->rss < (num_physpages>>2))
+ touch_page(page_map);

if (!write_access || is_page_shared(page_map)) {
set_pte(page_table, mk_pte(page, vma->vm_page_prot));
diff -urN linux-2.1.103/mm/vmscan.c linux/mm/vmscan.c
--- linux-2.1.103/mm/vmscan.c Sun May 3 02:44:59 1998
+++ linux/mm/vmscan.c Thu May 28 11:30:34 1998
@@ -28,6 +28,9 @@
#include <asm/bitops.h>
#include <asm/pgtable.h>

+extern int inodes_stat[];
+extern int dentry_stat[];
+
/*
* When are we next due for a page scan?
*/
@@ -451,6 +454,8 @@
if (((buffermem >> PAGE_SHIFT) * 100 > buffer_mem.borrow_percent * num_physpages)
|| (page_cache_size * 100 > page_cache.borrow_percent * num_physpages))
state = 0;
+ else if (dentry_stat[0] > 3*(inodes_stat[0] >> 1))
+ state = 3;

switch (state) {
do {
