Re: [GIT PULL] scheduler fixes

From: Yinghai Lu
Date: Mon May 25 2009 - 00:46:49 EST


Ingo Molnar wrote:
> * Yinghai Lu <yinghai@xxxxxxxxxx> wrote:
>
>> Pekka J Enberg wrote:
>>> On Mon, 18 May 2009, Linus Torvalds wrote:
>>>>>> I hate that stupid bootmem allocator. I suspect we seriously
>>>>>> over-use it, and that we _should_ be able to do the SL*B init
>>>>>> earlier.
>>>>> Hm, tempting thought - not sure how to pull it off though.
>>>> As far as I can recall, one of the things that historically made us want
>>>> to use the bootmem allocator even relatively late was that the real SLAB
>>>> allocator had to wait until all the node information etc was initialized.
>>>>
>>>> That's pretty damn late. And I wonder if SLUB (and SLOB) might not need a
>>>> lot less initialization, and work much earlier. Something like that might
>>>> be the final nail in the coffin for SLAB, and convince me to just say
>>>> 'we don't support it any more".
>>> Ingo, here's a patch that boots UMA+SMP+SLUB x86-64 kernel on qemu all
>>> the way to userspace. It probably breaks bunch of things for now but
>>> something for you to play with if you want.
>>>
>> updated with tip/master. also add change to cpupri_init
>> otherwise will get
>> [ 0.000000] Memory: 523096612k/537526272k available (10461k kernel code, 656156k absent, 13773504k reserved, 7186k data, 2548k init)
>> [ 0.000000] SLUB: Genslabs=14, HWalign=64, Order=0-3, MinObjects=0, CPUs=32, Nodes=8
>> [ 0.000000] ------------[ cut here ]------------
>> [ 0.000000] WARNING: at kernel/lockdep.c:2282 lockdep_trace_alloc+0xaf/0xee()
>> [ 0.000000] Hardware name: Sun Fire X4600 M2
>> [ 0.000000] Modules linked in:
>> [ 0.000000] Pid: 0, comm: swapper Not tainted 2.6.30-rc6-tip-01778-g0afdd0f-dirty #259
>> [ 0.000000] Call Trace:
>> [ 0.000000] [<ffffffff810a0274>] ? lockdep_trace_alloc+0xaf/0xee
>> [ 0.000000] [<ffffffff81075ab0>] warn_slowpath_common+0x88/0xcb
>> [ 0.000000] [<ffffffff81075b15>] warn_slowpath_null+0x22/0x38
>> [ 0.000000] [<ffffffff810a0274>] lockdep_trace_alloc+0xaf/0xee
>> [ 0.000000] [<ffffffff8110301b>] kmem_cache_alloc_node+0x38/0x14d
>> [ 0.000000] [<ffffffff813ec548>] ? alloc_cpumask_var_node+0x4a/0x10a
>> [ 0.000000] [<ffffffff8109eb61>] ? lockdep_init_map+0xb9/0x564
>> [ 0.000000] [<ffffffff813ec548>] alloc_cpumask_var_node+0x4a/0x10a
>> [ 0.000000] [<ffffffff813ec62c>] alloc_cpumask_var+0x24/0x3a
>> [ 0.000000] [<ffffffff819e6306>] cpupri_init+0x7f/0x112
>> [ 0.000000] [<ffffffff819e5a30>] init_rootdomain+0x72/0xb7
>> [ 0.000000] [<ffffffff821facce>] sched_init+0x109/0x660
>> [ 0.000000] [<ffffffff82203082>] ? kmem_cache_init+0x193/0x1b2
>> [ 0.000000] [<ffffffff821dfd7a>] start_kernel+0x218/0x3f3
>> [ 0.000000] [<ffffffff821df2a9>] x86_64_start_reservations+0xb9/0xd4
>> [ 0.000000] [<ffffffff821df3b2>] x86_64_start_kernel+0xee/0x109
>> [ 0.000000] ---[ end trace a7919e7f17c0a725 ]---
>>
>> works with 8 sockets numa amd64 box.
>>
>> YH
>>
>> ---
>> init/main.c | 28 ++++++++++++++++------------
>> kernel/irq/handle.c | 23 ++++++++---------------
>> kernel/sched.c | 34 +++++++++++++---------------------
>> kernel/sched_cpupri.c | 9 ++++++---
>> mm/slub.c | 17 ++++++++++-------
>> 5 files changed, 53 insertions(+), 58 deletions(-)
>
> Very nice!
>
> Would it be possible to restructure things to move kmalloc init to
> before IRQ init as well? We have a couple of uglinesses there too.
>
> Conceptually, memory should be the first thing set up in general, in
> a kernel. It does not need IRQs, timers, the scheduler or any of the
> IO facilities and abstractions. All of them need memory though - and
> as Linux scales to more and more hardware via the same single image,
> so will we get more and more dynamic concepts like cpumask_var_t and
> sparse-irqs, which want to allocate very early.

Pekka's patch already makes kmalloc available before early_irq_init()/init_IRQ()...

We can clean up alloc_desc_masks(), and
alloc_cpumask_var_node() could be much simplified too.

[PATCH] x86: remove some alloc_bootmem_cpumask_var calls

except those that are called from setup_percpu_area...

Signed-off-by: Yinghai Lu <yinghai@xxxxxxxxxx>

---
arch/x86/kernel/apic/io_apic.c | 4 ++--
include/linux/irq.h | 18 +++++++-----------
kernel/cpuset.c | 2 +-
kernel/profile.c | 6 ------
lib/cpumask.c | 11 ++---------
5 files changed, 12 insertions(+), 29 deletions(-)

Index: linux-2.6/include/linux/irq.h
===================================================================
--- linux-2.6.orig/include/linux/irq.h
+++ linux-2.6/include/linux/irq.h
@@ -430,23 +430,19 @@ extern int set_irq_msi(unsigned int irq,
* Returns true if successful (or not required).
*/
static inline bool alloc_desc_masks(struct irq_desc *desc, int node,
- bool boot)
+ bool boot)
{
-#ifdef CONFIG_CPUMASK_OFFSTACK
- if (boot) {
- alloc_bootmem_cpumask_var(&desc->affinity);
+ gfp_t gfp = GFP_ATOMIC;

-#ifdef CONFIG_GENERIC_PENDING_IRQ
- alloc_bootmem_cpumask_var(&desc->pending_mask);
-#endif
- return true;
- }
+ if (boot)
+ gfp = GFP_NOWAIT;

- if (!alloc_cpumask_var_node(&desc->affinity, GFP_ATOMIC, node))
+#ifdef CONFIG_CPUMASK_OFFSTACK
+ if (!alloc_cpumask_var_node(&desc->affinity, gfp, node))
return false;

#ifdef CONFIG_GENERIC_PENDING_IRQ
- if (!alloc_cpumask_var_node(&desc->pending_mask, GFP_ATOMIC, node)) {
+ if (!alloc_cpumask_var_node(&desc->pending_mask, gfp, node)) {
free_cpumask_var(desc->affinity);
return false;
}
Index: linux-2.6/lib/cpumask.c
===================================================================
--- linux-2.6.orig/lib/cpumask.c
+++ linux-2.6/lib/cpumask.c
@@ -92,15 +92,8 @@ int cpumask_any_but(const struct cpumask
*/
bool alloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node)
{
- if (likely(slab_is_available()))
- *mask = kmalloc_node(cpumask_size(), flags, node);
- else {
-#ifdef CONFIG_DEBUG_PER_CPU_MAPS
- printk(KERN_ERR
- "=> alloc_cpumask_var: kmalloc not available!\n");
-#endif
- *mask = NULL;
- }
+ *mask = kmalloc_node(cpumask_size(), flags, node);
+
#ifdef CONFIG_DEBUG_PER_CPU_MAPS
if (!*mask) {
printk(KERN_ERR "=> alloc_cpumask_var: failed!\n");
Index: linux-2.6/arch/x86/kernel/apic/io_apic.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/apic/io_apic.c
+++ linux-2.6/arch/x86/kernel/apic/io_apic.c
@@ -185,8 +185,8 @@ int __init arch_early_irq_init(void)
for (i = 0; i < count; i++) {
desc = irq_to_desc(i);
desc->chip_data = &cfg[i];
- alloc_bootmem_cpumask_var(&cfg[i].domain);
- alloc_bootmem_cpumask_var(&cfg[i].old_domain);
+ alloc_cpumask_var(&cfg[i].domain, GFP_NOWAIT);
+ alloc_cpumask_var(&cfg[i].old_domain, GFP_NOWAIT);
if (i < NR_IRQS_LEGACY)
cpumask_setall(cfg[i].domain);
}
Index: linux-2.6/kernel/cpuset.c
===================================================================
--- linux-2.6.orig/kernel/cpuset.c
+++ linux-2.6/kernel/cpuset.c
@@ -1857,7 +1857,7 @@ struct cgroup_subsys cpuset_subsys = {

int __init cpuset_init_early(void)
{
- alloc_bootmem_cpumask_var(&top_cpuset.cpus_allowed);
+ alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_NOWAIT);

top_cpuset.mems_generation = cpuset_mems_generation++;
return 0;
Index: linux-2.6/kernel/profile.c
===================================================================
--- linux-2.6.orig/kernel/profile.c
+++ linux-2.6/kernel/profile.c
@@ -111,12 +111,6 @@ int __ref profile_init(void)
/* only text is profiled */
prof_len = (_etext - _stext) >> prof_shift;
buffer_bytes = prof_len*sizeof(atomic_t);
- if (!slab_is_available()) {
- prof_buffer = alloc_bootmem(buffer_bytes);
- alloc_bootmem_cpumask_var(&prof_cpu_mask);
- cpumask_copy(prof_cpu_mask, cpu_possible_mask);
- return 0;
- }

if (!alloc_cpumask_var(&prof_cpu_mask, GFP_KERNEL))
return -ENOMEM;


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/