[patch] Memory Binding API

From: Matthew Dobson (colpatch@us.ibm.com)
Date: Fri Jul 12 2002 - 19:39:46 EST


Here is a Memory Binding API. It allows processes to bind themselves to
particular blocks of memory in a multi-memory-block system. This patch applies
on top of the Simple Topology API I posted a few moments ago. The calls are
exposed through prctl() and take a simple bitmask of memblks plus a behavior
flag (strict or loose); a minimal user-space sketch follows.
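
For the curious, here is a minimal user-space sketch of how a process might use
the new prctl() options. It assumes a kernel with this patch applied; the PR_*
and MPOL_* constants are defined locally (mirroring the values in the patch)
since stock userspace headers don't carry them, and the memblk numbers are just
an example:

#include <stdio.h>
#include <sys/prctl.h>

/* These mirror the values this patch adds; stock headers lack them. */
#ifndef PR_SET_MEMBLK_BINDING
#define PR_SET_MEMBLK_BINDING   11
#define PR_GET_MEMBLK_BINDING   12
#endif
#define MPOL_STRICT             0

int main(void)
{
        /* Strictly bind this process to memblks 0 and 2. */
        unsigned long mask = (1UL << 0) | (1UL << 2);

        if (prctl(PR_SET_MEMBLK_BINDING, mask, MPOL_STRICT, 0, 0) < 0) {
                perror("PR_SET_MEMBLK_BINDING");
                return 1;
        }

        /* The "get" call returns the current bitmask as prctl()'s return value. */
        printf("bound to memblks: 0x%lx\n",
               (unsigned long) prctl(PR_GET_MEMBLK_BINDING, 0, 0, 0, 0));
        return 0;
}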

This is a scaled-down version of a full NUMA API I posted several times in the
past, to a distinct lack of fanfare. I hope this smaller patch is of more
interest to the community...

Enjoy!

-Matt

diff -Nur linux-2.5.25-test/arch/i386/config.in linux-2.5.25-api/arch/i386/config.in
--- linux-2.5.25-test/arch/i386/config.in Fri Jul 5 16:42:20 2002
+++ linux-2.5.25-api/arch/i386/config.in Fri Jul 12 14:11:40 2002
@@ -165,6 +165,10 @@
    fi
 else
    bool 'Multiquad NUMA system' CONFIG_MULTIQUAD
+ if [ "$CONFIG_MULTIQUAD" = y ]; then
+ bool 'Memory Binding API Support' CONFIG_MEMBIND
+ bool 'IBM/Sequent NUMA-Q Hardware Support' CONFIG_IBMNUMAQ
+ fi
 fi
 
 bool 'Machine Check Exception' CONFIG_X86_MCE
diff -Nur linux-2.5.25-test/include/asm-i386/smp.h linux-2.5.25-api/include/asm-i386/smp.h
--- linux-2.5.25-test/include/asm-i386/smp.h Fri Jul 5 16:42:02 2002
+++ linux-2.5.25-api/include/asm-i386/smp.h Fri Jul 12 16:09:16 2002
@@ -55,6 +55,7 @@
 extern void smp_alloc_memory(void);
 extern unsigned long phys_cpu_present_map;
 extern unsigned long cpu_online_map;
+extern unsigned long memblk_online_map;
 extern volatile unsigned long smp_invalidate_needed;
 extern int pic_mode;
 extern int smp_num_siblings;
@@ -99,6 +100,11 @@
         return hweight32(cpu_online_map);
 }
 
+extern inline unsigned int num_online_memblks(void)
+{
+        return hweight32(memblk_online_map);
+}
+
 extern inline int any_online_cpu(unsigned int mask)
 {
         if (mask & cpu_online_map)
diff -Nur linux-2.5.25-test/include/linux/init_task.h linux-2.5.25-api/include/linux/init_task.h
--- linux-2.5.25-test/include/linux/init_task.h Fri Jul 5 16:42:04 2002
+++ linux-2.5.25-api/include/linux/init_task.h Fri Jul 12 16:32:19 2002
@@ -59,6 +59,7 @@
     children: LIST_HEAD_INIT(tsk.children), \
     sibling: LIST_HEAD_INIT(tsk.sibling), \
     thread_group: LIST_HEAD_INIT(tsk.thread_group), \
+    memblk_binding: { MEMBLK_NO_BINDING, MPOL_STRICT, RW_LOCK_UNLOCKED }, \
     wait_chldexit: __WAIT_QUEUE_HEAD_INITIALIZER(tsk.wait_chldexit),\
     real_timer: { \
         function: it_real_fn \
diff -Nur linux-2.5.25-test/include/linux/membind.h linux-2.5.25-api/include/linux/membind.h
--- linux-2.5.25-test/include/linux/membind.h Fri Jul 12 16:53:46 2002
+++ linux-2.5.25-api/include/linux/membind.h Fri Jul 12 16:31:30 2002
@@ -27,6 +27,35 @@
 #ifndef _LINUX_MEMBIND_H_
 #define _LINUX_MEMBIND_H_
 
+#include <linux/types.h>
+#include <linux/spinlock.h>	/* rwlock_t used in memblk_list_t below */
+
+#ifdef CONFIG_MEMBIND
+#define NR_MEMBLKS 32 /* Max number of Memory Blocks */
+#else
+#define NR_MEMBLKS 1
+#endif
+
+typedef unsigned long memblk_bitmask_t;
+#define MEMBLK_NO_BINDING ((memblk_bitmask_t) 0)  /* all bits clear == no binding;
+                                                     a '1' bit selects a memblk to
+                                                     allocate from */
+
+#define MPOL_STRICT 0   /* Memory MUST be allocated according to binding */
+#define MPOL_LOOSE  1   /* Memory will be allocated according to binding,
+                           but can fall back to other memory blocks if necessary. */
+#define MPOL_FIRST  2   /* UNUSED FOR NOW */
+#define MPOL_STRIPE 4   /* UNUSED FOR NOW */
+#define MPOL_RR     8   /* UNUSED FOR NOW */
+
+
+typedef struct memblk_list {
+        memblk_bitmask_t bitmask;
+        int behavior;
+        rwlock_t lock;
+} memblk_list_t;
+
+
+int set_memblk_binding(memblk_bitmask_t, int);
+memblk_bitmask_t get_memblk_binding(void);
 int cpu_to_node(int);
 int memblk_to_node(int);
 int node_to_node(int);
diff -Nur linux-2.5.25-test/include/linux/mmzone.h linux-2.5.25-api/include/linux/mmzone.h
--- linux-2.5.25-test/include/linux/mmzone.h Fri Jul 5 16:42:02 2002
+++ linux-2.5.25-api/include/linux/mmzone.h Thu Jul 11 14:00:12 2002
@@ -136,6 +136,7 @@
         unsigned long node_start_mapnr;
         unsigned long node_size;
         int node_id;
+ int memblk_id; /* A unique ID for each memory block */
         struct pglist_data *node_next;
 } pg_data_t;
 
@@ -169,14 +170,15 @@
 #define NODE_MEM_MAP(nid) mem_map
 #define MAX_NR_NODES 1
 
-#else /* !CONFIG_DISCONTIGMEM */
+#endif /* !CONFIG_DISCONTIGMEM */
 
-#include <asm/mmzone.h>
+#if defined (CONFIG_DISCONTIGMEM) || defined (CONFIG_MEMBIND)
 
+#include <asm/mmzone.h>
 /* page->zone is currently 8 bits ... */
 #define MAX_NR_NODES (255 / MAX_NR_ZONES)
 
-#endif /* !CONFIG_DISCONTIGMEM */
+#endif /* CONFIG_DISCONTIGMEM || CONFIG_MEMBIND */
 
 #define MAP_ALIGN(x) ((((x) % sizeof(struct page)) == 0) ? (x) : ((x) + \
                 sizeof(struct page) - ((x) % sizeof(struct page))))
diff -Nur linux-2.5.25-test/include/linux/prctl.h linux-2.5.25-api/include/linux/prctl.h
--- linux-2.5.25-test/include/linux/prctl.h Fri Jul 12 16:53:46 2002
+++ linux-2.5.25-api/include/linux/prctl.h Wed Jul 10 13:58:17 2002
@@ -26,6 +26,10 @@
 # define PR_FPEMU_NOPRINT 1 /* silently emulate fp operations accesses */
 # define PR_FPEMU_SIGFPE 2 /* don't emulate fp operations, send SIGFPE instead */
 
+/* Get/Set MemBlk Binding */
+#define PR_SET_MEMBLK_BINDING 11
+#define PR_GET_MEMBLK_BINDING 12
+
 /* Get CPU/Node */
 #define PR_GET_CURR_CPU 13
 #define PR_GET_CURR_NODE 14
diff -Nur linux-2.5.25-test/include/linux/sched.h linux-2.5.25-api/include/linux/sched.h
--- linux-2.5.25-test/include/linux/sched.h Fri Jul 5 16:42:04 2002
+++ linux-2.5.25-api/include/linux/sched.h Wed Jul 10 13:43:48 2002
@@ -27,6 +27,7 @@
 #include <linux/securebits.h>
 #include <linux/fs_struct.h>
 #include <linux/compiler.h>
+#include <linux/membind.h>
 
 struct exec_domain;
 
@@ -302,6 +303,9 @@
         struct task_struct *pidhash_next;
         struct task_struct **pidhash_pprev;
 
+        /* additional Memory Binding stuff */
+        memblk_list_t memblk_binding;
+
         wait_queue_head_t wait_chldexit; /* for wait4() */
         struct completion *vfork_done; /* for vfork() */
 
diff -Nur linux-2.5.25-test/include/linux/smp.h linux-2.5.25-api/include/linux/smp.h
--- linux-2.5.25-test/include/linux/smp.h Fri Jul 5 16:42:28 2002
+++ linux-2.5.25-api/include/linux/smp.h Wed Jul 10 13:43:48 2002
@@ -86,6 +86,7 @@
 #define smp_call_function(func,info,retry,wait) ({ 0; })
 static inline void smp_send_reschedule(int cpu) { }
 static inline void smp_send_reschedule_all(void) { }
+#define memblk_online_map 1
 #define cpu_online_map 1
 #define cpu_online(cpu) 1
 #define num_online_cpus() 1
diff -Nur linux-2.5.25-test/kernel/Makefile linux-2.5.25-api/kernel/Makefile
--- linux-2.5.25-test/kernel/Makefile Fri Jul 5 16:42:18 2002
+++ linux-2.5.25-api/kernel/Makefile Wed Jul 10 13:43:48 2002
@@ -15,7 +15,7 @@
 obj-y = sched.o dma.o fork.o exec_domain.o panic.o printk.o \
             module.o exit.o itimer.o time.o softirq.o resource.o \
             sysctl.o capability.o ptrace.o timer.o user.o \
- signal.o sys.o kmod.o context.o futex.o platform.o
+ signal.o sys.o kmod.o context.o futex.o platform.o membind.o
 
 obj-$(CONFIG_UID16) += uid16.o
 obj-$(CONFIG_MODULES) += ksyms.o
diff -Nur linux-2.5.25-test/kernel/membind.c linux-2.5.25-api/kernel/membind.c
--- linux-2.5.25-test/kernel/membind.c Fri Jul 12 16:53:46 2002
+++ linux-2.5.25-api/kernel/membind.c Fri Jul 12 16:13:17 2002
@@ -33,9 +33,70 @@
 #include <linux/errno.h>
 #include <linux/smp.h>
 
+#define is_valid_memblk_behavior(x) (1)         /* placeholder: accept any behavior for now */
+
+#define is_memblk_subset(x, y) (!((x) & ~(y)))  /* test whether x is a subset of y */
+
 extern unsigned long memblk_online_map;
 
 /*
+ * set_memblk_binding(): Sets up a new MemBlk Binding
+ */
+int set_memblk_binding(memblk_bitmask_t memblks, int behavior)
+{
+        int ret;
+        unsigned long flags;
+
+        ret = -ENODEV;
+        /* Make sure that at least one of the memblks in the new binding set is online. */
+        if (!(memblks & memblk_online_map))
+                goto out;
+
+        read_lock_irqsave(&current->memblk_binding.lock, flags);
+
+        ret = -EPERM;
+        /* If the new binding expands upon the old binding, the caller
+           must have CAP_SYS_NICE. */
+        if ((!is_memblk_subset(memblks, current->memblk_binding.bitmask)) &&
+            (!capable(CAP_SYS_NICE)))
+                goto out_unlock;
+        read_unlock_irqrestore(&current->memblk_binding.lock, flags);
+
+        ret = -EINVAL;
+        /* Test to make sure the behavior argument is valid. */
+        if (!is_valid_memblk_behavior(behavior))
+                goto out;
+
+        write_lock_irqsave(&current->memblk_binding.lock, flags);
+        current->memblk_binding.bitmask = memblks;
+        current->memblk_binding.behavior = behavior;
+        write_unlock_irqrestore(&current->memblk_binding.lock, flags);
+
+        ret = 0;
+        goto out;
+
+ out_unlock:
+        read_unlock_irqrestore(&current->memblk_binding.lock, flags);
+ out:
+        return ret;
+}
+
+/*
+ * get_memblk_binding(): Returns the current MemBlk Binding
+ */
+inline memblk_bitmask_t get_memblk_binding(void)
+{
+        unsigned long flags;
+        memblk_bitmask_t memblk_binding;
+
+        read_lock_irqsave(&current->memblk_binding.lock, flags);
+        memblk_binding = current->memblk_binding.bitmask;
+        read_unlock_irqrestore(&current->memblk_binding.lock, flags);
+
+        return memblk_binding;
+}
+
+/*
  * cpu_to_node(cpu): Returns the number of the most specific Node
  * containing CPU 'cpu'.
  */
diff -Nur linux-2.5.25-test/kernel/sys.c linux-2.5.25-api/kernel/sys.c
--- linux-2.5.25-test/kernel/sys.c Fri Jul 12 16:53:46 2002
+++ linux-2.5.25-api/kernel/sys.c Fri Jul 12 16:11:16 2002
@@ -1292,6 +1292,12 @@
                         }
                         current->keep_capabilities = arg2;
                         break;
+                case PR_SET_MEMBLK_BINDING:
+                        error = (long) set_memblk_binding((memblk_bitmask_t)arg2, (int)arg3);
+                        break;
+                case PR_GET_MEMBLK_BINDING:
+                        error = (long) get_memblk_binding();
+                        break;
                 case PR_GET_CURR_CPU:
                         error = (long) get_curr_cpu();
                         break;
diff -Nur linux-2.5.25-test/mm/numa.c linux-2.5.25-api/mm/numa.c
--- linux-2.5.25-test/mm/numa.c Fri Jul 5 16:42:20 2002
+++ linux-2.5.25-api/mm/numa.c Fri Jul 12 16:35:19 2002
@@ -8,8 +8,10 @@
 #include <linux/bootmem.h>
 #include <linux/mmzone.h>
 #include <linux/spinlock.h>
+#include <linux/membind.h>
 
 int numnodes = 1; /* Initialized for UMA platforms */
+unsigned long memblk_online_map = 0UL; /* Similar to cpu_online_map, but for memory blocks */
 
 static bootmem_data_t contig_bootmem_data;
 pg_data_t contig_page_data = { bdata: &contig_bootmem_data };
@@ -27,6 +29,9 @@
 {
         free_area_init_core(0, &contig_page_data, &mem_map, zones_size,
                                 zone_start_paddr, zholes_size, pmap);
+        contig_page_data.node_id = 0;
+        contig_page_data.memblk_id = 0;
+        memblk_online_map = 1UL;
 }
 
 #endif /* !CONFIG_DISCONTIGMEM */
@@ -71,6 +76,11 @@
         free_area_init_core(nid, pgdat, &discard, zones_size, zone_start_paddr,
                                         zholes_size, pmap);
         pgdat->node_id = nid;
+        pgdat->memblk_id = num_online_memblks();
+        if (test_and_set_bit(pgdat->memblk_id, &memblk_online_map)) {
+                printk("memblk already counted?!?!\n");
+                BUG();
+        }
 
         /*
          * Get space for the valid bitmap.
@@ -88,6 +98,8 @@
         return __alloc_pages(gfp_mask, order, pgdat->node_zonelists + (gfp_mask & GFP_ZONEMASK));
 }
 
+#ifdef CONFIG_NUMA
+
 /*
  * This can be refined. Currently, tries to do round robin, instead
  * should do concentratic circle search, starting from current node.
@@ -96,30 +110,66 @@
 {
         struct page *ret = 0;
         pg_data_t *start, *temp;
-#ifndef CONFIG_NUMA
+        int search_twice = 0;
+        memblk_bitmask_t memblk_bitmask;
         unsigned long flags;
- static pg_data_t *next = 0;
-#endif
 
         if (order >= MAX_ORDER)
                 return NULL;
-#ifdef CONFIG_NUMA
+
+        read_lock_irqsave(&current->memblk_binding.lock, flags);
+        memblk_bitmask = current->memblk_binding.bitmask;
+        /* if it is a loose binding, remember to search other memblks */
+        if ((current->memblk_binding.behavior == MPOL_LOOSE) &&
+            (current->memblk_binding.bitmask != MEMBLK_NO_BINDING))
+                search_twice = 1;
+        read_unlock_irqrestore(&current->memblk_binding.lock, flags);
+
+search_through_memblks:
         temp = NODE_DATA(numa_node_id());
-#else
- spin_lock_irqsave(&node_lock, flags);
- if (!next) next = pgdat_list;
- temp = next;
- next = next->node_next;
- spin_unlock_irqrestore(&node_lock, flags);
-#endif
         start = temp;
         while (temp) {
- if ((ret = alloc_pages_pgdat(temp, gfp_mask, order)))
- return(ret);
+                if (memblk_bitmask & (1 << temp->memblk_id))
+                        if ((ret = alloc_pages_pgdat(temp, gfp_mask, order)))
+                                return(ret);
                 temp = temp->node_next;
         }
         temp = pgdat_list;
         while (temp != start) {
+                if (memblk_bitmask & (1 << temp->memblk_id))
+                        if ((ret = alloc_pages_pgdat(temp, gfp_mask, order)))
+                                return(ret);
+                temp = temp->node_next;
+        }
+
+        if (search_twice) {
+                /*
+                 * If we failed to find a "preferred" memblk, try again
+                 * looking for anything we haven't checked yet.
+                 */
+                search_twice = 0;       /* no infinite loops, please */
+                memblk_bitmask = ~memblk_bitmask;
+                goto search_through_memblks;
+        }
+        return(0);
+}
+
+#else /* !CONFIG_NUMA */
+
+struct page * _alloc_pages(unsigned int gfp_mask, unsigned int order)
+{
+        struct page *ret = 0;
+        pg_data_t *temp;
+        unsigned long flags;
+
+        if (order >= MAX_ORDER)
+                return NULL;
+
+        spin_lock_irqsave(&node_lock, flags);
+        temp = pgdat_list;
+        spin_unlock_irqrestore(&node_lock, flags);
+
+        while (temp) {
                 if ((ret = alloc_pages_pgdat(temp, gfp_mask, order)))
                         return(ret);
                 temp = temp->node_next;
@@ -127,4 +179,6 @@
         return(0);
 }
 
+#endif /* CONFIG_NUMA */
+
 #endif /* CONFIG_DISCONTIGMEM */
diff -Nur linux-2.5.25-test/mm/page_alloc.c linux-2.5.25-api/mm/page_alloc.c
--- linux-2.5.25-test/mm/page_alloc.c Fri Jul 5 16:42:03 2002
+++ linux-2.5.25-api/mm/page_alloc.c Fri Jul 12 16:00:40 2002
@@ -41,6 +41,8 @@
 static int zone_balance_min[MAX_NR_ZONES] __initdata = { 20 , 20, 20, };
 static int zone_balance_max[MAX_NR_ZONES] __initdata = { 255 , 255, 255, };
 
+extern unsigned long memblk_online_map;
+
 /*
  * Temporary debugging check for pages not lying within a given zone.
  */
@@ -921,6 +923,9 @@
 void __init free_area_init(unsigned long *zones_size)
 {
         free_area_init_core(0, &contig_page_data, &mem_map, zones_size, 0, 0, 0);
+        contig_page_data.node_id = 0;
+        contig_page_data.memblk_id = 0;
+        memblk_online_map = 1UL;
 }
 
 static int __init setup_mem_frac(char *str)
