Re: [PATCH] Make Intel 8-way Xeons boot again

From: Yinghai Lu
Date: Mon Jan 11 2010 - 16:43:24 EST


On Sun, Jan 10, 2010 at 2:26 AM, Ingo Molnar <mingo@xxxxxxx> wrote:
>
> * Yinghai Lu <yinghai@xxxxxxxxxx> wrote:
>
>> On Sat, Jan 9, 2010 at 6:30 PM, Ananth N Mavinakayanahalli
>> <ananth@xxxxxxxxxx> wrote:
>> > On Sat, Jan 09, 2010 at 01:13:39PM -0800, Yinghai Lu wrote:
>> >> On Sat, Jan 9, 2010 at 2:10 AM, Ananth N Mavinakayanahalli
>> >> <ananth@xxxxxxxxxx> wrote:
>> >> > On an 8-way system with Intel Xeon X7350 CPUs, booting 2.6.32 or newer
>> >> > kernels fails at:
>> >> >
>> >> > ...
>> >> > CPU0: Intel(R) Xeon(R) CPU           X7350  @ 2.93GHz stepping 0b
>> >> > Booting Node   0, Processors  #1 #2 #3 #4 #5 #6 #7 Ok.
>> >> > Brought up 8 CPUs
>> >> > Total of 8 processors activated (46906.05 BogoMIPS).
>> >> >
>> >> > Git bisect showed 2fbd07a5f as the offending commit.
>> >> >
>> >> > With the patch below, I am able to boot the latest Linus' git tree on
>> >> > the machine. If this patch is correct, it needs to get into the stable
>> >> > tree too.
>> >> >
>> >> > Signed-off-by: Ananth N Mavinakayanahalli <ananth@xxxxxxxxxx>
>> >> > ---
>> >> > Index: linux-2.6/arch/x86/kernel/apic/probe_64.c
>> >> > ===================================================================
>> >> > --- linux-2.6.orig/arch/x86/kernel/apic/probe_64.c      2010-01-09 14:54:29.000000000 +0530
>> >> > +++ linux-2.6/arch/x86/kernel/apic/probe_64.c   2010-01-09 14:57:53.000000000 +0530
>> >> > @@ -70,7 +70,7 @@
>> >> >        if (apic == &apic_flat) {
>> >> >                switch (boot_cpu_data.x86_vendor) {
>> >> >                case X86_VENDOR_INTEL:
>> >> > -                       if (num_processors > 8)
>> >> > +                       if (num_processors >= 8)
>> >> >                                apic = &apic_physflat;
>> >> >                        break;
>> >> >                case X86_VENDOR_AMD:
>> >>
>> >> can you send out whole bootlog with apic=debug?
>> >
>> > Here it is:
>> > ACPI: LAPIC (acpi_id[0x00] lapic_id[0x0c] enabled)
>> > ACPI: LAPIC (acpi_id[0x01] lapic_id[0x10] enabled)
>> > ACPI: LAPIC (acpi_id[0x02] lapic_id[0x0d] enabled)
>> > ACPI: LAPIC (acpi_id[0x03] lapic_id[0x11] enabled)
>> > ACPI: LAPIC (acpi_id[0x04] lapic_id[0x0e] enabled)
>> > ACPI: LAPIC (acpi_id[0x05] lapic_id[0x12] enabled)
>> > ACPI: LAPIC (acpi_id[0x06] lapic_id[0x0f] enabled)
>> > ACPI: LAPIC (acpi_id[0x07] lapic_id[0x13] enabled)
>> ...
>> > Setting APIC routing to flat
>> > Getting VERSION: 50014
>> > Getting VERSION: 50014
>> > Getting ID: c000000
>> > Getting ID: f3000000
>> > Getting LVT0: 700
>> > Getting LVT1: 400
>> > enabled ExtINT on CPU#0
>> > ESR value before enabling vector: 0x00000040  after: 0x00000000
>> > ENABLING IO-APIC IRQs
>> > ..TIMER: vector=0x30 apic1=0 pin1=2 apic2=-1 pin2=-1
>> > CPU0: Intel(R) Xeon(R) CPU           X7350  @ 2.93GHz stepping 0b
>> ...
>>
>> the BSP's physical apic id is 0x0c instead of 0.
>>
>> not sure Suresh test that or not.
>
> In any case this commit needs to be reverted as the assumption that it's safe
> to do this optimization is evidently not true.
>

I used the attached debug patch on one of my Intel systems, and with
nr_cpus=8 logical flat mode seems to work.
That system's BSP APIC ID is 0x20.

YH
[PATCH] use nr_cpus= to set nr_cpu_ids early

Add nr_cpus= to set nr_cpu_ids, so that we can simulate a system with <= 8 CPUs
on a normal config instead of changing NR_CPUS directly.

Signed-off-by: Yinghai Lu <yinghai@xxxxxxxxxx>

---
arch/ia64/kernel/acpi.c | 4 ++--
arch/x86/kernel/smpboot.c | 7 ++++---
init/main.c | 14 ++++++++++++++
3 files changed, 20 insertions(+), 5 deletions(-)

Index: linux-2.6/init/main.c
===================================================================
--- linux-2.6.orig/init/main.c
+++ linux-2.6/init/main.c
@@ -149,6 +149,20 @@ static int __init nosmp(char *str)

early_param("nosmp", nosmp);

+/* this is hard limit */
+static int __init nrcpus(char *str)
+{
+ int nr_cpus;
+
+ get_option(&str, &nr_cpus);
+ if (nr_cpus > 0 && nr_cpus < nr_cpu_ids)
+ nr_cpu_ids = nr_cpus;
+
+ return 0;
+}
+
+early_param("nr_cpus", nrcpus);
+
static int __init maxcpus(char *str)
{
get_option(&str, &setup_max_cpus);
Index: linux-2.6/arch/ia64/kernel/acpi.c
===================================================================
--- linux-2.6.orig/arch/ia64/kernel/acpi.c
+++ linux-2.6/arch/ia64/kernel/acpi.c
@@ -883,8 +883,8 @@ __init void prefill_possible_map(void)

possible = available_cpus + additional_cpus;

- if (possible > NR_CPUS)
- possible = NR_CPUS;
+ if (possible > nr_cpu_ids)
+ possible = nr_cpu_ids;

printk(KERN_INFO "SMP: Allowing %d CPUs, %d hotplug CPUs\n",
possible, max((possible - available_cpus), 0));
Index: linux-2.6/arch/x86/kernel/smpboot.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/smpboot.c
+++ linux-2.6/arch/x86/kernel/smpboot.c
@@ -1213,11 +1213,12 @@ __init void prefill_possible_map(void)

total_cpus = max_t(int, possible, num_processors + disabled_cpus);

- if (possible > CONFIG_NR_CPUS) {
+ /* nr_cpu_ids could be reduced via nr_cpus= */
+ if (possible > nr_cpu_ids) {
printk(KERN_WARNING
"%d Processors exceeds NR_CPUS limit of %d\n",
- possible, CONFIG_NR_CPUS);
- possible = CONFIG_NR_CPUS;
+ possible, nr_cpu_ids);
+ possible = nr_cpu_ids;
}

printk(KERN_INFO "SMP: Allowing %d CPUs, %d hotplug CPUs\n",