diff -urpN -X /home/fletch/.diff.exclude 01-ingo/arch/i386/Kconfig 02-pools/arch/i386/Kconfig
--- 01-ingo/arch/i386/Kconfig	Fri Jan 17 09:18:19 2003
+++ 02-pools/arch/i386/Kconfig	Sat Jan 18 11:59:54 2003
@@ -476,6 +476,11 @@ config NUMA
 	bool "Numa Memory Allocation Support"
 	depends on X86_NUMAQ
 
+config SCHED_NUMA
+	bool "NUMA aware scheduler"
+	depends on NUMA
+	default y
+
 config DISCONTIGMEM
 	bool
 	depends on NUMA
diff -urpN -X /home/fletch/.diff.exclude 01-ingo/arch/ia64/Kconfig 02-pools/arch/ia64/Kconfig
--- 01-ingo/arch/ia64/Kconfig	Thu Jan 9 19:15:56 2003
+++ 02-pools/arch/ia64/Kconfig	Sat Jan 18 12:00:08 2003
@@ -246,6 +246,11 @@ config DISCONTIGMEM
 	  or have huge holes in the physical address space for other reasons.
 	  See <file:Documentation/vm/numa> for more.
 
+config SCHED_NUMA
+	bool "NUMA aware scheduler"
+	depends on NUMA
+	default y
+
 config VIRTUAL_MEM_MAP
 	bool "Enable Virtual Mem Map"
 	depends on !NUMA
diff -urpN -X /home/fletch/.diff.exclude 01-ingo/include/linux/sched.h 02-pools/include/linux/sched.h
--- 01-ingo/include/linux/sched.h	Fri Jan 17 09:18:32 2003
+++ 02-pools/include/linux/sched.h	Sat Jan 18 12:21:09 2003
@@ -447,12 +447,12 @@ extern void set_cpus_allowed(task_t *p,
 # define set_cpus_allowed(p, new_mask) do { } while (0)
 #endif
 
-#ifdef CONFIG_NUMA
+#ifdef CONFIG_SCHED_NUMA
 extern void sched_balance_exec(void);
-extern void node_nr_running_init(void);
+extern void pool_nr_running_init(void);
 #else
 #define sched_balance_exec() {}
-#define node_nr_running_init() {}
+#define pool_nr_running_init() {}
 #endif
 
 extern void set_user_nice(task_t *p, long nice);
diff -urpN -X /home/fletch/.diff.exclude 01-ingo/include/linux/sched_topo_ht.h 02-pools/include/linux/sched_topo_ht.h
--- 01-ingo/include/linux/sched_topo_ht.h	Wed Dec 31 16:00:00 1969
+++ 02-pools/include/linux/sched_topo_ht.h	Sat Jan 18 12:20:00 2003
@@ -0,0 +1,17 @@
+#define CONFIG_SCHED_POOLS 1	/* should be a real config option */
+
+/*
+ * The following is a temporary hack, for which I make no apologies - mbligh
+ * Assumes CPUs are paired together as siblings (0,1) (2,3) (4,5) ... etc.
+ * We should probably do this in an arch topo file and use apicids.
+ */
+
+#define MAX_NUMPOOLS NR_CPUS
+#define numpools (num_online_cpus() / 2)
+
+#define pool_to_cpu_mask(pool)	( (1UL << ((pool)*2)) | (1UL << ((pool)*2+1)) )
+#define cpu_to_pool(cpu)	((cpu) / 2)
+#define cpu_to_pool_mask(cpu)	(pool_to_cpu_mask(cpu_to_pool(cpu)))
+
+#define IDLE_REBALANCE_RATIO 2
+#define BUSY_REBALANCE_RATIO 2
diff -urpN -X /home/fletch/.diff.exclude 01-ingo/include/linux/sched_topo_numa.h 02-pools/include/linux/sched_topo_numa.h
--- 01-ingo/include/linux/sched_topo_numa.h	Wed Dec 31 16:00:00 1969
+++ 02-pools/include/linux/sched_topo_numa.h	Sat Jan 18 12:20:05 2003
@@ -0,0 +1,11 @@
+#define CONFIG_SCHED_POOLS 1	/* should be a real config option */
+
+#define MAX_NUMPOOLS MAX_NUMNODES
+#define numpools numnodes
+
+#define pool_to_cpu_mask __node_to_cpu_mask
+#define cpu_to_pool __cpu_to_node
+#define cpu_to_pool_mask(cpu) (__node_to_cpu_mask(__cpu_to_node(cpu)))
+
+#define IDLE_REBALANCE_RATIO 10
+#define BUSY_REBALANCE_RATIO 5
diff -urpN -X /home/fletch/.diff.exclude 01-ingo/include/linux/sched_topology.h 02-pools/include/linux/sched_topology.h
--- 01-ingo/include/linux/sched_topology.h	Wed Dec 31 16:00:00 1969
+++ 02-pools/include/linux/sched_topology.h	Sat Jan 18 11:59:36 2003
@@ -0,0 +1,14 @@
+#ifndef _LINUX_SCHED_TOPOLOGY_H
+#define _LINUX_SCHED_TOPOLOGY_H
+
+#ifdef CONFIG_SCHED_TOPO_ARCH
+#include
+#elif CONFIG_SCHED_NUMA
+#include <linux/sched_topo_numa.h>
+#elif CONFIG_SCHED_TOPO_HT
+#include <linux/sched_topo_ht.h>
+#else
+#include
+#endif
+
+#endif /* _LINUX_SCHED_TOPOLOGY_H */
diff -urpN -X /home/fletch/.diff.exclude 01-ingo/init/main.c 02-pools/init/main.c
--- 01-ingo/init/main.c	Fri Jan 17 09:18:32 2003
+++ 02-pools/init/main.c	Sat Jan 18 11:48:10 2003
@@ -495,7 +495,7 @@ static void do_pre_smp_initcalls(void)
 	migration_init();
 #endif
 
-	node_nr_running_init();
+	pool_nr_running_init();
 	spawn_ksoftirqd();
 }
diff -urpN -X /home/fletch/.diff.exclude 01-ingo/kernel/sched.c 02-pools/kernel/sched.c
--- 01-ingo/kernel/sched.c	Sat Jan 18 10:58:57 2003
+++ 02-pools/kernel/sched.c	Sat Jan 18 11:49:00 2003
@@ -32,6 +32,7 @@
 #include
 #include
 #include
+#include <linux/sched_topology.h>
 
 /*
  * Convert user-nice values [ -20 ... 0 ... 19 ]
@@ -67,7 +68,7 @@
 #define INTERACTIVE_DELTA	2
 #define MAX_SLEEP_AVG		(2*HZ)
 #define STARVATION_LIMIT	(2*HZ)
-#define NODE_THRESHOLD		125
+#define POOL_THRESHOLD		125
 
 /*
  * If a task is 'interactive' then we reinsert it in the active
@@ -154,9 +155,9 @@ struct runqueue {
 	task_t *curr, *idle;
 	prio_array_t *active, *expired, arrays[2];
 	int prev_cpu_load[NR_CPUS];
-#ifdef CONFIG_NUMA
-	atomic_t *node_nr_running;
-	int prev_node_load[MAX_NUMNODES];
+#ifdef CONFIG_SCHED_POOLS
+	atomic_t *pool_nr_running;
+	int prev_pool_load[MAX_NUMPOOLS];
 #endif
 	task_t *migration_thread;
 	struct list_head migration_queue;
@@ -181,47 +182,47 @@ static struct runqueue runqueues[NR_CPUS
 # define task_running(rq, p)		((rq)->curr == (p))
 #endif
 
-#ifdef CONFIG_NUMA
+#ifdef CONFIG_SCHED_POOLS
 
 /*
  * Keep track of running tasks.
  */
 
-static atomic_t node_nr_running[MAX_NUMNODES] ____cacheline_maxaligned_in_smp =
-	{[0 ...MAX_NUMNODES-1] = ATOMIC_INIT(0)};
+static atomic_t pool_nr_running[MAX_NUMPOOLS] ____cacheline_maxaligned_in_smp =
+	{[0 ...MAX_NUMPOOLS-1] = ATOMIC_INIT(0)};
 
 static inline void nr_running_init(struct runqueue *rq)
 {
-	rq->node_nr_running = &node_nr_running[0];
+	rq->pool_nr_running = &pool_nr_running[0];
 }
 
 static inline void nr_running_inc(runqueue_t *rq)
 {
-	atomic_inc(rq->node_nr_running);
+	atomic_inc(rq->pool_nr_running);
 	rq->nr_running++;
 }
 
 static inline void nr_running_dec(runqueue_t *rq)
 {
-	atomic_dec(rq->node_nr_running);
+	atomic_dec(rq->pool_nr_running);
 	rq->nr_running--;
 }
 
-__init void node_nr_running_init(void)
+__init void pool_nr_running_init(void)
 {
 	int i;
 
 	for (i = 0; i < NR_CPUS; i++)
-		cpu_rq(i)->node_nr_running = &node_nr_running[__cpu_to_node(i)];
+		cpu_rq(i)->pool_nr_running = &pool_nr_running[cpu_to_pool(i)];
 }
 
-#else /* !CONFIG_NUMA */
+#else /* !CONFIG_SCHED_POOLS */
 
 # define nr_running_init(rq)	do { } while (0)
 # define nr_running_inc(rq)	do { (rq)->nr_running++; } while (0)
 # define nr_running_dec(rq)	do { (rq)->nr_running--; } while (0)
 
-#endif /* CONFIG_NUMA */
+#endif /* CONFIG_SCHED_POOLS */
 
 /*
  * task_rq_lock - lock the runqueue a given task resides on and disable
@@ -670,7 +671,7 @@ static inline void double_rq_unlock(runq
 	spin_unlock(&rq2->lock);
 }
 
-#if CONFIG_NUMA
+#if CONFIG_SCHED_POOLS
 /*
  * If dest_cpu is allowed for this process, migrate the task to it.
  * This is accomplished by forcing the cpu_allowed mask to only
@@ -697,7 +698,7 @@ static void sched_migrate_task(task_t *p
  */
 static int sched_best_cpu(struct task_struct *p)
 {
-	int i, minload, load, best_cpu, node = 0;
+	int i, minload, load, best_cpu, pool = 0;
 	unsigned long cpumask;
 
 	best_cpu = task_cpu(p);
@@ -705,16 +706,16 @@ static int sched_best_cpu(struct task_st
 		return best_cpu;
 
 	minload = 10000000;
-	for (i = 0; i < numnodes; i++) {
-		load = atomic_read(&node_nr_running[i]);
+	for (i = 0; i < numpools; i++) {
+		load = atomic_read(&pool_nr_running[i]);
 		if (load < minload) {
 			minload = load;
-			node = i;
+			pool = i;
 		}
 	}
 
 	minload = 10000000;
-	cpumask = __node_to_cpu_mask(node);
+	cpumask = pool_to_cpu_mask(pool);
 	for (i = 0; i < NR_CPUS; ++i) {
 		if (!(cpumask & (1UL << i)))
 			continue;
@@ -730,7 +731,7 @@ void sched_balance_exec(void)
 {
 	int new_cpu;
 
-	if (numnodes > 1) {
+	if (numpools > 1) {
 		new_cpu = sched_best_cpu(current);
 		if (new_cpu != smp_processor_id())
 			sched_migrate_task(current, new_cpu);
@@ -738,33 +739,33 @@ void sched_balance_exec(void)
 }
 
 /*
- * Find the busiest node. All previous node loads contribute with a
+ * Find the busiest pool. All previous pool loads contribute with a
  * geometrically deccaying weight to the load measure:
- *   load_{t} = load_{t-1}/2 + nr_node_running_{t}
+ *   load_{t} = load_{t-1}/2 + nr_pool_running_{t}
  * This way sudden load peaks are flattened out a bit.
  */
-static int find_busiest_node(int this_node)
+static int find_busiest_pool(int this_pool)
 {
-	int i, node = -1, load, this_load, maxload;
+	int i, pool = -1, load, this_load, maxload;
 
-	this_load = maxload = (this_rq()->prev_node_load[this_node] >> 1)
-		+ atomic_read(&node_nr_running[this_node]);
-	this_rq()->prev_node_load[this_node] = this_load;
-	for (i = 0; i < numnodes; i++) {
-		if (i == this_node)
+	this_load = maxload = (this_rq()->prev_pool_load[this_pool] >> 1)
+		+ atomic_read(&pool_nr_running[this_pool]);
+	this_rq()->prev_pool_load[this_pool] = this_load;
+	for (i = 0; i < numpools; i++) {
+		if (i == this_pool)
 			continue;
-		load = (this_rq()->prev_node_load[i] >> 1)
-			+ atomic_read(&node_nr_running[i]);
-		this_rq()->prev_node_load[i] = load;
-		if (load > maxload && (100*load > NODE_THRESHOLD*this_load)) {
+		load = (this_rq()->prev_pool_load[i] >> 1)
+			+ atomic_read(&pool_nr_running[i]);
+		this_rq()->prev_pool_load[i] = load;
+		if (load > maxload && (100*load > POOL_THRESHOLD*this_load)) {
 			maxload = load;
-			node = i;
+			pool = i;
 		}
 	}
-	return node;
+	return pool;
 }
 
-#endif /* CONFIG_NUMA */
+#endif /* CONFIG_SCHED_POOLS */
 
 #if CONFIG_SMP
 
@@ -983,22 +984,20 @@ out:
  *
  * busy-rebalance every 200 msecs. idle-rebalance every 1 msec. (or on
  * systems with HZ=100, every 10 msecs.)
- *
- * On NUMA, do a node-rebalance every 400 msecs.
  */
 #define IDLE_REBALANCE_TICK (HZ/1000 ?: 1)
 #define BUSY_REBALANCE_TICK (HZ/5 ?: 1)
-#define IDLE_NODE_REBALANCE_TICK (IDLE_REBALANCE_TICK * 2)
-#define BUSY_NODE_REBALANCE_TICK (BUSY_REBALANCE_TICK * 2)
+#define IDLE_POOL_REBALANCE_TICK (IDLE_REBALANCE_TICK * IDLE_REBALANCE_RATIO)
+#define BUSY_POOL_REBALANCE_TICK (BUSY_REBALANCE_TICK * BUSY_REBALANCE_RATIO)
 
-#if CONFIG_NUMA
-static void balance_node(runqueue_t *this_rq, int idle, int this_cpu)
+#if CONFIG_SCHED_POOLS
+static void balance_pool(runqueue_t *this_rq, int idle, int this_cpu)
 {
-	int node = find_busiest_node(__cpu_to_node(this_cpu));
+	int pool = find_busiest_pool(cpu_to_pool(this_cpu));
 	unsigned long cpumask, this_cpumask = 1UL << this_cpu;
 
-	if (node >= 0) {
-		cpumask = __node_to_cpu_mask(node) | this_cpumask;
+	if (pool >= 0) {
+		cpumask = pool_to_cpu_mask(pool) | this_cpumask;
 		spin_lock(&this_rq->lock);
 		load_balance(this_rq, idle, cpumask);
 		spin_unlock(&this_rq->lock);
@@ -1008,38 +1007,38 @@ static void balance_node(runqueue_t *thi
 
 static void rebalance_tick(runqueue_t *this_rq, int idle)
 {
-#if CONFIG_NUMA
+#if CONFIG_SCHED_POOLS
 	int this_cpu = smp_processor_id();
 #endif
 	unsigned long j = jiffies;
 
 	/*
-	 * First do inter-node rebalancing, then intra-node rebalancing,
-	 * if both events happen in the same tick. The inter-node
+	 * First do inter-pool rebalancing, then intra-pool rebalancing,
+	 * if both events happen in the same tick. The inter-pool
 	 * rebalancing does not necessarily have to create a perfect
-	 * balance within the node, since we load-balance the most loaded
-	 * node with the current CPU. (ie. other CPUs in the local node
+	 * balance within the pool, since we load-balance the most loaded
+	 * pool with the current CPU. (ie. other CPUs in the local pool
 	 * are not balanced.)
 	 */
 	if (idle) {
-#if CONFIG_NUMA
-		if (!(j % IDLE_NODE_REBALANCE_TICK))
-			balance_node(this_rq, idle, this_cpu);
+#if CONFIG_SCHED_POOLS
+		if (!(j % IDLE_POOL_REBALANCE_TICK))
+			balance_pool(this_rq, idle, this_cpu);
 #endif
 		if (!(j % IDLE_REBALANCE_TICK)) {
 			spin_lock(&this_rq->lock);
-			load_balance(this_rq, 0, __cpu_to_node_mask(this_cpu));
+			load_balance(this_rq, 0, cpu_to_pool_mask(this_cpu));
 			spin_unlock(&this_rq->lock);
 		}
 		return;
 	}
-#if CONFIG_NUMA
-	if (!(j % BUSY_NODE_REBALANCE_TICK))
-		balance_node(this_rq, idle, this_cpu);
+#if CONFIG_SCHED_POOLS
+	if (!(j % BUSY_POOL_REBALANCE_TICK))
+		balance_pool(this_rq, idle, this_cpu);
 #endif
 	if (!(j % BUSY_REBALANCE_TICK)) {
 		spin_lock(&this_rq->lock);
-		load_balance(this_rq, idle, __cpu_to_node_mask(this_cpu));
+		load_balance(this_rq, idle, cpu_to_pool_mask(this_cpu));
 		spin_unlock(&this_rq->lock);
 	}
 }
@@ -1208,7 +1207,7 @@ need_resched:
 pick_next_task:
 	if (unlikely(!rq->nr_running)) {
 #if CONFIG_SMP
-		load_balance(rq, 1, __cpu_to_node_mask(smp_processor_id()));
+		load_balance(rq, 1, cpu_to_pool_mask(smp_processor_id()));
 		if (rq->nr_running)
 			goto pick_next_task;
 #endif
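
A few standalone sketches follow for review purposes; they are not part of the patch. First, the sibling-pair mapping that sched_topo_ht.h above hard-codes: pool 0 covers CPUs 0 and 1, pool 1 covers CPUs 2 and 3, and so on. The little userspace program below mirrors those mapping macros (using the bitwise-OR form of pool_to_cpu_mask) so the mapping can be eyeballed; NR_CPUS and the online-CPU count here are hard-coded stand-ins, not the kernel's values.

/*
 * Standalone illustration (not kernel code) of the sibling-pair mapping
 * assumed by sched_topo_ht.h: CPUs (0,1) form pool 0, (2,3) pool 1, etc.
 * NR_CPUS and num_online_cpus() are hard-coded stand-ins here.
 */
#include <stdio.h>

#define NR_CPUS			8	/* stand-in for the kernel constant */
#define num_online_cpus()	8	/* stand-in: pretend all 8 CPUs are online */

#define MAX_NUMPOOLS	NR_CPUS
#define numpools	(num_online_cpus() / 2)

#define pool_to_cpu_mask(pool)	((1UL << ((pool) * 2)) | (1UL << ((pool) * 2 + 1)))
#define cpu_to_pool(cpu)	((cpu) / 2)
#define cpu_to_pool_mask(cpu)	(pool_to_cpu_mask(cpu_to_pool(cpu)))

int main(void)
{
	int cpu;

	printf("numpools = %d\n", numpools);
	for (cpu = 0; cpu < num_online_cpus(); cpu++)
		printf("cpu %d -> pool %d, pool mask 0x%02lx\n",
		       cpu, cpu_to_pool(cpu), cpu_to_pool_mask(cpu));
	return 0;
}

With the stand-in values this prints pool 0 / mask 0x03 for CPUs 0-1, pool 1 / mask 0x0c for CPUs 2-3, and so on, which is the pairing the header's comment describes.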
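Second, the decaying load measure that find_busiest_pool() keeps per pool, load_t = load_{t-1}/2 + nr_running_t, together with the POOL_THRESHOLD test requiring a remote pool to exceed 125% of the local load before tasks are pulled. The sketch below feeds a made-up spike of runnable tasks through that formula to show the peak flattening out over a few ticks; the sample counts and the simplified comparison against a fixed local load are assumptions for illustration only.

/*
 * Standalone sketch (not kernel code) of the decaying load measure used by
 * find_busiest_pool():  load_t = load_{t-1}/2 + nr_running_t.
 * The nr_running samples below are made up to show a spike being flattened.
 */
#include <stdio.h>

#define POOL_THRESHOLD 125	/* remote pool must exceed 125% of local load */

int main(void)
{
	int nr_running[] = { 0, 8, 0, 0, 0, 0 };	/* hypothetical per-tick samples */
	int this_load = 2;				/* assumed local pool load */
	int prev_load = 0;
	int t;

	for (t = 0; t < 6; t++) {
		/* load_t = load_{t-1}/2 + nr_running_t */
		int load = (prev_load >> 1) + nr_running[t];

		printf("tick %d: remote load = %2d -> pull? %s\n", t, load,
		       (load > this_load &&
			100 * load > POOL_THRESHOLD * this_load) ? "yes" : "no");
		prev_load = load;
	}
	return 0;
}

The spike of 8 runnable tasks keeps the remote pool above the threshold for two ticks and then decays below it, which is the "flattening" the comment refers to.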
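Finally, the rebalance intervals. The patch replaces the hard-coded *2 node-rebalance multiplier with per-topology IDLE/BUSY_REBALANCE_RATIO values (2/2 for HT, 10/5 for NUMA). The arithmetic sketch below simply evaluates the patch's tick definitions at an assumed HZ=1000 to show the resulting intervals (it relies on the same GNU ?: extension the kernel uses).

/*
 * Quick arithmetic sketch (not kernel code) of the rebalance intervals the
 * per-topology ratios produce, assuming HZ=1000 and gcc's a ?: b extension.
 */
#include <stdio.h>

#define HZ 1000
#define IDLE_REBALANCE_TICK (HZ/1000 ?: 1)
#define BUSY_REBALANCE_TICK (HZ/5 ?: 1)

static void show(const char *topo, int idle_ratio, int busy_ratio)
{
	printf("%s: idle pool rebalance every %d ticks, busy every %d ticks\n",
	       topo, IDLE_REBALANCE_TICK * idle_ratio,
	       BUSY_REBALANCE_TICK * busy_ratio);
}

int main(void)
{
	show("HT   (ratios 2/2) ", 2, 2);		/* from sched_topo_ht.h   */
	show("NUMA (ratios 10/5)", 10, 5);	/* from sched_topo_numa.h */
	return 0;
}

At HZ=1000 the HT ratios reproduce the old 400 msec busy-rebalance interval, while NUMA pools are rebalanced less aggressively (every 10 msec when idle, once a second when busy).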