[PATCH] Simple Page Coloring [last edition] (2.3.99pre5 diffs)

From: Joseph A. Martin (jmartin@linux08.mro.dec.com)
Date: Tue Apr 18 2000 - 16:42:06 EST


Thanks to those who provided various useful feedback. This last
edition effects a small improvement on the unhappy system build
benchmark, now only 8% worse than no change rather than 13%. Roughly
speaking, the change comes from excluding the file cache from this
scheme.

"make -s -j 2 boot" median elapsed time of 5 runs:

           | stock | simple coloring
-----------+---------------------------------+--------------------------------
2.3.99pre3 | 272.53u 20.22s 2:35.02e 188%CPU | 273.17u 57.86s 2:56.18e 187%CPU
2.3.99pre5 | 273.04u 20.28s 2:35.21e 188%CPU | 272.14u 41.55s 2:47.05e 187%CPU

Regards,
\Joe Joseph A. Martin
 Joseph.Martin@Compaq.com Compaq Computer Corporation
                                        200 Forest Street MRO1-2/K20
 (508) 467-2100 Marlboro, MA 01752 U.S.A.

--- ../linux-2.3.99-pre5/mm/Makefile Mon Dec 6 13:14:13 1999
+++ ../patch/mm/Makefile Tue Apr 4 16:23:58 2000
@@ -16,4 +16,8 @@
 O_OBJS += highmem.o
 endif
 
+ifeq ($(CONFIG_PAGE_COLORING),y)
+O_OBJS += coloring.o
+endif
+
 include $(TOPDIR)/Rules.make
--- ../linux-2.3.99-pre5/mm/coloring.c Wed Apr 5 14:52:39 2000
+++ ../patch/mm/coloring.c Tue Apr 18 15:43:29 2000
@@ -0,0 +1,120 @@
+/*
+ * Written by Joseph A. Martin, Compaq Computer Corp., 14 April 2000.
+ */
+#include <linux/mm.h>
+#include <asm/cache.h>
+
+#define MAX(a, b) (((a) > (b)) ? (a) : (b))
+
+#ifndef LN2_MAX_CACHE
+/* If this isn't defined in include/asm/cache.h, default to a guess for i386. */
+#define LN2_MIN_CACHE 19
+#define LN2_MAX_CACHE 22
+#endif
+
+/*
+ * Page coloring supplies single user pages which correspond to
+ * consecutive locations in a physically indexed secondary cache.
+ * The intent is to spread application memory evenly through the cache
+ * and thereby improve cache efficiency.
+ *
+ * Page coloring stands the buddy system on its head, allocating the
+ * largest available blocks to cover the secondary cache. If there is
+ * no single block as large as the cache, it recursively assembles a
+ * collection of smaller blocks, which are disjoint modulo the biggest
+ * cache size. Therefore, it helps -- but is not required -- for
+ * MAX_ORDER > LN2_MAX_CACHE - PAGE_SHIFT
+ *
+ * The one subtle thing that is going on here is that multi-page
+ * blocks are being allocated, but single pages are being freed. It
+ * works because the buddy system keeps track of the parity of
+ * allocation, not the count (0, 1, or 2) of buddies free or available.
+ */
+
+/* This lock keeps the single_pages queue consistent. */
+static spinlock_t single_page_lock = SPIN_LOCK_UNLOCKED;
+
+/* This lock single-threads page production. */
+static spinlock_t single_page_producer_lock = SPIN_LOCK_UNLOCKED;
+
+static LIST_HEAD(single_pages);
+
+static inline int
+get_named_block(long order, unsigned long name)
+{
+ unsigned long flags;
+ int status = 0;
+ long match_mask = (-1L << (order+PAGE_SHIFT)) & ~(-1L << LN2_MAX_CACHE);
+ struct page * page;
+ unsigned long addr = 0;
+ struct list_head reject;
+
+ INIT_LIST_HEAD(&reject);
+
+ while ((page = alloc_pages(GFP_HIGHUSER, order)) != 0) {
+ addr = page_address(page);
+ if (((addr ^ name) & match_mask) == 0)
+ break;
+ list_add(&page->list, &reject);
+ addr = 0;
+ }
+
+ /* If necessary, recurse to smaller blocks but not too deeply. */
+ status = (addr != 0) ||
+ ((order > MAX(6, LN2_MIN_CACHE - PAGE_SHIFT - 2)) &&
+ (get_named_block(order - 1, name | (1 << (order + PAGE_SHIFT - 1)))
+ | get_named_block(order - 1, name)));
+
+ /*
+ * We hang on to the rejects until now so that the recursion
+ * above won't revisit them by way of splitting larger blocks.
+ */
+ while (!list_empty(&reject)) {
+ struct page *r = (struct page *)reject.next;
+ list_del(&r->list);
+ __free_pages(r, order);
+ }
+
+ if (addr != 0) {
+ int i;
+
+ /* Break up the block into single pages. */
+ spin_lock_irqsave(&single_page_lock, flags);
+ for (i = MAP_NR(addr) + (1<<order) - 1; i>=MAP_NR(addr); --i) {
+ list_add(&mem_map[i].list, &single_pages);
+ set_page_count(&mem_map[i], 1);
+ }
+ spin_unlock_irqrestore(&single_page_lock, flags);
+ }
+
+ return status;
+}
+
+struct page *
+_alloc_color_page(void)
+{
+ unsigned long flags;
+ struct page * page;
+
+ spin_lock_irqsave(&single_page_lock, flags);
+
+ while (list_empty(&single_pages)) {
+ int status = 0;
+
+ spin_unlock_irqrestore(&single_page_lock, flags);
+ if (spin_trylock(&single_page_producer_lock)) {
+ status = get_named_block(LN2_MAX_CACHE - PAGE_SHIFT, 0);
+ spin_unlock(&single_page_producer_lock);
+ }
+ if (status == 0) {
+ /* Failover to alloc_pages(GFP_HIGHUSER, 0) */
+ return (struct page *)0;
+ }
+ spin_lock_irqsave(&single_page_lock, flags);
+ }
+
+ page = (struct page *)single_pages.next;
+ list_del(&page->list);
+ spin_unlock_irqrestore(&single_page_lock, flags);
+ return page;
+}
--- ../linux-2.3.99-pre5/include/linux/mm.h Tue Apr 18 14:50:24 2000
+++ ../patch/include/linux/mm.h Tue Apr 18 15:34:41 2000
@@ -300,6 +300,44 @@
 extern mem_map_t * mem_map;
 
 /*
+ * GFP bitmasks..
+ */
+#define __GFP_WAIT 0x01
+#define __GFP_HIGH 0x02
+#define __GFP_IO 0x04
+#define __GFP_DMA 0x08
+#ifdef CONFIG_HIGHMEM
+#define __GFP_HIGHMEM 0x10
+#else
+#define __GFP_HIGHMEM 0x0 /* noop */
+#endif
+#ifdef CONFIG_PAGE_COLORING
+/* Overload GFP_HIGHUSER with this to exclude file cache from coloring. */
+#define __GFP_COLORABLE 0x20
+#else
+#define __GFP_COLORABLE 0x0 /* noop */
+#endif
+
+
+#define GFP_BUFFER (__GFP_HIGH | __GFP_WAIT)
+#define GFP_ATOMIC (__GFP_HIGH)
+#define GFP_USER (__GFP_WAIT | __GFP_IO)
+#define GFP_HIGHUSER (GFP_USER | __GFP_HIGHMEM | __GFP_COLORABLE)
+#define GFP_KERNEL (__GFP_HIGH | __GFP_WAIT | __GFP_IO)
+#define GFP_NFS (__GFP_HIGH | __GFP_WAIT | __GFP_IO)
+#define GFP_KSWAPD (__GFP_IO)
+
+/* Flag - indicates that the buffer will be suitable for DMA. Ignored on some
+ platforms, used as appropriate on others */
+
+#define GFP_DMA __GFP_DMA
+
+/* Flag - indicates that the buffer can be taken from high memory which is not
+ permanently mapped by the kernel */
+
+#define GFP_HIGHMEM __GFP_HIGHMEM
+
+/*
  * There is only one page-allocator function, and two main namespaces to
  * it. The alloc_page*() variants return 'struct page *' and as such
  * can allocate highmem pages, the *get*page*() variants return
@@ -325,14 +363,23 @@
 extern struct page * alloc_pages(int gfp_mask, unsigned long order);
 #endif /* !CONFIG_DISCONTIGMEM */
 
+#ifdef CONFIG_PAGE_COLORING
+extern struct page * _alloc_color_page(void);
+#define alloc_color_page(_GFP_) \
+ ((((_GFP_) & GFP_HIGHUSER) == GFP_HIGHUSER) ? _alloc_color_page() : 0)
+#else
+#define alloc_color_page(_GFP_) 0
+#endif /* CONFIG_PAGE_COLORING */
+
 #define alloc_page(gfp_mask) \
- alloc_pages(gfp_mask, 0)
+ (alloc_color_page(gfp_mask) ?: alloc_pages(gfp_mask, 0))
 
 extern inline unsigned long __get_free_pages (int gfp_mask, unsigned long order)
 {
         struct page * page;
 
- page = alloc_pages(gfp_mask, order);
+ if (order > 0 || (page = alloc_color_page(gfp_mask)) == 0)
+ page = alloc_pages(gfp_mask, order);
         if (!page)
                 return 0;
         return page_address(page);
@@ -461,38 +508,6 @@
                         size_t size, unsigned int flags);
 extern struct page *filemap_nopage(struct vm_area_struct * area,
                                     unsigned long address, int no_share);
-
-/*
- * GFP bitmasks..
- */
-#define __GFP_WAIT 0x01
-#define __GFP_HIGH 0x02
-#define __GFP_IO 0x04
-#define __GFP_DMA 0x08
-#ifdef CONFIG_HIGHMEM
-#define __GFP_HIGHMEM 0x10
-#else
-#define __GFP_HIGHMEM 0x0 /* noop */
-#endif
-
-
-#define GFP_BUFFER (__GFP_HIGH | __GFP_WAIT)
-#define GFP_ATOMIC (__GFP_HIGH)
-#define GFP_USER (__GFP_WAIT | __GFP_IO)
-#define GFP_HIGHUSER (GFP_USER | __GFP_HIGHMEM)
-#define GFP_KERNEL (__GFP_HIGH | __GFP_WAIT | __GFP_IO)
-#define GFP_NFS (__GFP_HIGH | __GFP_WAIT | __GFP_IO)
-#define GFP_KSWAPD (__GFP_IO)
-
-/* Flag - indicates that the buffer will be suitable for DMA. Ignored on some
- platforms, used as appropriate on others */
-
-#define GFP_DMA __GFP_DMA
-
-/* Flag - indicates that the buffer can be taken from high memory which is not
- permanently mapped by the kernel */
-
-#define GFP_HIGHMEM __GFP_HIGHMEM
 
 /* vma is the first one with address < vma->vm_end,
  * and even address < vma->vm_start. Have to extend vma. */
--- ../linux-2.3.99-pre5/include/asm-alpha/cache.h Mon Dec 6 20:15:53 1999
+++ ../patch/include/asm-alpha/cache.h Tue Apr 4 16:23:58 2000
@@ -10,4 +10,7 @@
 #define L1_CACHE_ALIGN(x) (((x)+(L1_CACHE_BYTES-1))&~(L1_CACHE_BYTES-1))
 #define SMP_CACHE_BYTES L1_CACHE_BYTES
 
+/* Size range for the secondary cache of the "latest and greatest" (EV6) */
+#define LN2_MIN_CACHE 21
+#define LN2_MAX_CACHE 24
 #endif
--- ../linux-2.3.99-pre5/kernel/ksyms.c Tue Apr 18 14:50:26 2000
+++ ../patch/kernel/ksyms.c Tue Apr 18 15:30:01 2000
@@ -97,6 +97,9 @@
 EXPORT_SYMBOL(__alloc_pages);
 EXPORT_SYMBOL(alloc_pages_node);
 EXPORT_SYMBOL(__free_pages_ok);
+#ifdef CONFIG_PAGE_COLORING
+EXPORT_SYMBOL(_alloc_color_page);
+#endif
 #ifndef CONFIG_DISCONTIGMEM
 EXPORT_SYMBOL(contig_page_data);
 #endif
--- ../linux-2.3.99-pre5/Documentation/Configure.help Tue Apr 18 14:50:11 2000
+++ ../patch/Documentation/Configure.help Tue Apr 18 15:21:40 2000
@@ -2050,6 +2050,14 @@
 
   If unsure, say N.
 
+Secondary cache optimization
+CONFIG_PAGE_COLORING
+ If possible, consecutive requests for single user pages are filled
+ with pages physically consecutive modulo the largest secondary cache
+ size. This reduces the amount of secondary cache conflict within an
+ application and is most useful for applications using large amounts
+ of memory for data.
+
 Non-standard serial port support
 CONFIG_SERIAL_NONSTANDARD
   Say Y here if you have any non-standard serial boards -- boards
--- ../linux-2.3.99-pre5/arch/alpha/config.in Tue Apr 18 14:50:11 2000
+++ ../patch/arch/alpha/config.in Tue Apr 18 15:25:04 2000
@@ -184,6 +184,10 @@
         bool 'Symmetric multi-processing support' CONFIG_SMP
 fi
 
+if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then
+ bool 'Secondary cache optimization' CONFIG_PAGE_COLORING
+fi
+
 source drivers/pci/Config.in
 
 bool 'Support for hot-pluggable devices' CONFIG_HOTPLUG

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.rutgers.edu
Please read the FAQ at http://www.tux.org/lkml/



This archive was generated by hypermail 2b29 : Sun Apr 23 2000 - 21:00:14 EST