Re: 2.2.0-pre1 worse than 2.1.132

MOLNAR Ingo (mingo@chiara.csoma.elte.hu)
Tue, 29 Dec 1998 19:02:41 +0100 (CET)


> On Tue, 29 Dec 1998, MOLNAR Ingo wrote:
>
> >(it should work with egcs, but i'd suggest you try 2.7.2.3 too. The fact
> >that earlier kernels worked with your compiler means nothing, the SMP and
> >scheduling changes in pre5-2.1.133 (and pre-2.2) introduced some quite
> >optimized piece of APIC code)
> >
> >i'll double check those changes now.

ok, i've found the bug: the problem was that the TLB flush code tried to
talk to CPUs that were not yet fully booted. This bug was introduced
with the removal of smp_message_pass() in pre4-2.1.133. The attached patch
(against pre1-2.2) fixes the problem on my dual-P5 box.

it's not reasonable to require that the TLB flush never be called before
SMP is fully booted, so i've changed the SMP code to deal with this issue
properly: only 'online' CPUs are considered. Additionally, if there are
no online CPUs around yet, we do not send a broadcast IPI at all.
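
the net effect on the flush path is roughly this (a stripped-down sketch
of the smp_flush_tlb() change in the patch below; interrupt disabling and
the 'stuck' timeout are left out):

        if (cpu_online_map) {           /* any CPUs online yet? */
                smp_invalidate_needed = cpu_online_map;
                send_IPI_allbutself(INVALIDATE_TLB_VECTOR);
                while (smp_invalidate_needed)
                        /* take care of our own ("crossing") invalidate */
                        clear_bit(smp_processor_id(), &smp_invalidate_needed);
        }
        local_flush_tlb();              /* the local flush always happens */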

i've made some other cleanups in smp.c too; most notably, the startup IPI
stage is now serialized more strictly with the boot CPU: we are not
supposed to send random IPIs to APs while they are still setting
themselves up. (STARTUP IPIs in particular are known to be buggy on older
boards/CPUs.) i've added a cpu_callout_map to deal with this. (This change
might have the side effect of fixing Matthias Schniedermeyer's ALR board?)
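
the resulting handshake between the boot CPU and an AP looks roughly like
this (a condensed sketch of the ordering, not the literal code; see the
smp.c hunks below):

        /* boot CPU, in do_boot_cpu(), once the STARTUP IPIs went out cleanly: */
        set_bit(i, (unsigned long *)&cpu_callout_map[0]);   /* let AP #i proceed */
        /* ... then spin on cpu_callin_map[0], waiting for the AP to check in ... */

        /* AP, first thing in smp_callin(): */
        while (!test_bit(cpuid, (unsigned long *)&cpu_callout_map[0]))
                /* keep the APIC bus quiet until the boot CPU says go */ ;
        enable_local_APIC();
        /* ... calibrate_delay(), then report in via cpu_callin_map ... */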

the patch also adds the fix for 486-SMP boards, and it doesn't let people
run a PII SMP kernel on an SMP-P5 box. (Configure.help now points this
out as well.)
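
the check itself boils down to this (condensed from the new
smp_bugs_check() in the bugs.h hunk below):

        if (smp_found_config && boot_cpu_data.x86 <= 5) {
        #if CONFIG_M686
                /* PPro/PII kernels leave out the i586 local-APIC workarounds */
                panic("compile kernel as i586 to enable local APIC bug workarounds\n");
        #endif
        #if !defined(CONFIG_M386) && !defined(CONFIG_M486)
                if (boot_cpu_data.x86 <= 4)     /* i486 SMP board, i586+ kernel */
                        panic("cannot continue booting.");
        #endif
        }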

i've booted/tested the patch on a dual-P5, a dual-PII and a quad-Xeon box,
and i think we should now boot on SMP-486 boards too.

-- mingo

--- linux/include/asm-i386/bugs.h.orig Tue Dec 29 16:41:57 1998
+++ linux/include/asm-i386/bugs.h Tue Dec 29 16:50:42 1998
@@ -312,10 +312,34 @@
}
}

+/*
+ * Check whether we are able to run this kernel safely on SMP.
+ */
+__initfunc(static void smp_bugs_check(void))
+{
+#ifdef __SMP__
+ if (smp_found_config && boot_cpu_data.x86 <= 5) {
+#if CONFIG_M686
+ printk("kernel compiled for i686 but running on an <=i586 CPU, APIC operation unsafe!\n");
+ panic("compile kernel as i586 to enable local APIC bug workarounds\n");
+#else
+ printk("i586 local-APIC bug workarounds enabled.\n");
+#endif
+#if !defined(CONFIG_M386) && !defined(CONFIG_M486)
+ if (boot_cpu_data.x86 <= 4) {
+ printk("i%d86 CPU, but kernel not compiled as i386/i486!\n", boot_cpu_data.x86);
+ panic("cannot continue booting.");
+ }
+#endif
+ }
+#endif
+}
+
__initfunc(static void check_bugs(void))
{
check_cyrix_cpu();
identify_cpu(&boot_cpu_data);
+ smp_bugs_check();
#ifndef __SMP__
printk("CPU: ");
print_cpu_info(&boot_cpu_data);
--- linux/include/asm-i386/smp.h.orig Tue Dec 29 17:37:27 1998
+++ linux/include/asm-i386/smp.h Tue Dec 29 17:37:57 1998
@@ -159,6 +159,7 @@
extern unsigned long smp_alloc_memory(unsigned long mem_base);
extern unsigned char boot_cpu_id;
extern unsigned long cpu_present_map;
+extern unsigned long cpu_online_map;
extern volatile int cpu_number_map[NR_CPUS];
extern volatile unsigned long smp_invalidate_needed;
extern void smp_flush_tlb(void);
--- linux/arch/i386/kernel/smp.c.orig Tue Dec 29 16:37:07 1998
+++ linux/arch/i386/kernel/smp.c Tue Dec 29 18:47:35 1998
@@ -119,19 +119,21 @@
* function prototypes:
*/
static void cache_APIC_registers (void);
-
+static void stop_this_cpu (void);

static int smp_b_stepping = 0; /* Set if we find a B stepping CPU */

static int max_cpus = -1; /* Setup configured maximum number of CPUs to activate */
int smp_found_config=0; /* Have we found an SMP box */

-unsigned long cpu_present_map = 0; /* Bitmask of existing CPUs */
+unsigned long cpu_present_map = 0; /* Bitmask of physically existing CPUs */
+unsigned long cpu_online_map = 0; /* Bitmask of currently online CPUs */
int smp_num_cpus = 1; /* Total count of live CPUs */
int smp_threads_ready=0; /* Set when the idlers are all forked */
volatile int cpu_number_map[NR_CPUS]; /* which CPU maps to which logical number */
volatile int __cpu_logical_map[NR_CPUS]; /* which logical number maps to which CPU */
-volatile unsigned long cpu_callin_map[NR_CPUS] = {0,}; /* We always use 0 the rest is ready for parallel delivery */
+static volatile unsigned long cpu_callin_map[NR_CPUS] = {0,}; /* We always use 0 the rest is ready for parallel delivery */
+static volatile unsigned long cpu_callout_map[NR_CPUS] = {0,}; /* We always use 0 the rest is ready for parallel delivery */
volatile unsigned long smp_invalidate_needed; /* Used for the invalidate map that's also checked in the spinlock */
volatile unsigned long kstack_ptr; /* Stack vector for booting CPUs */
struct cpuinfo_x86 cpu_data[NR_CPUS]; /* Per CPU bogomips and other parameters */
@@ -694,16 +696,55 @@
return memory_start;
}

+extern void calibrate_delay(void);
+
void __init smp_callin(void)
{
- extern void calibrate_delay(void);
- int cpuid=GET_APIC_ID(apic_read(APIC_ID));
+ int cpuid;
+ unsigned long timeout;
+
+ /*
+ * (This works even if the APIC is not enabled.)
+ */
+ cpuid = GET_APIC_ID(apic_read(APIC_ID));
+
+ SMP_PRINTK(("CPU#%d waiting for CALLOUT\n", cpuid));
+
+ /*
+ * STARTUP IPIs are fragile beasts as they might sometimes
+ * trigger some glue motherboard logic. Complete APIC bus
+ * silence for 1 second, this overestimates the time the
+ * boot CPU is spending to send the up to 2 STARTUP IPIs
+ * by a factor of two. This should be enough.
+ */
+
+ /*
+ * Waiting 2s total for startup (udelay is not yet working)
+ */
+ timeout = jiffies + 2*HZ;
+ while (time_before(jiffies,timeout))
+ {
+ /*
+ * Has the boot CPU finished its STARTUP sequence?
+ */
+ if (test_bit(cpuid, (unsigned long *)&cpu_callout_map[0]))
+ break;
+ }
+
+ while (!time_before(jiffies,timeout)) {
+ printk("BUG: CPU%d started up but did not get a callout!\n",
+ cpuid);
+ stop_this_cpu();
+ }

/*
- * Activate our APIC
+ * the boot CPU has finished the init stage and is spinning
+ * on callin_map until we finish. We are free to set up this
+ * CPU, first the APIC. (this is probably redundant on most
+ * boards)
*/

- SMP_PRINTK(("CALLIN %d %d\n",hard_smp_processor_id(), smp_processor_id()));
+ SMP_PRINTK(("CALLIN, before enable_local_APIC().\n"));
enable_local_APIC();

/*
@@ -711,7 +752,12 @@
*/
setup_APIC_clock();

- sti();
+ __sti();
+
+#ifdef CONFIG_MTRR
+ /* Must be done before calibration delay is computed */
+ mtrr_init_secondary_cpu ();
+#endif
/*
* Get our bogomips.
*/
@@ -738,10 +784,11 @@
*/
int __init start_secondary(void *unused)
{
-#ifdef CONFIG_MTRR
- /* Must be done before calibration delay is computed */
- mtrr_init_secondary_cpu ();
-#endif
+ /*
+ * Don't put anything before smp_callin(); SMP booting
+ * is so fragile that we want to limit the things done
+ * here to the bare minimum.
+ */
smp_callin();
while (!atomic_read(&smp_commenced))
/* nothing */ ;
@@ -917,12 +964,18 @@
apic_write(APIC_ICR, cfg); /* Kick the second */

SMP_PRINTK(("Startup point 1.\n"));
+
timeout = 0;
+ SMP_PRINTK(("Waiting for send to finish...\n"));
do {
- SMP_PRINTK(("Sleeping.\n")); mdelay(1000);
- udelay(10);
- } while ( (send_status = (apic_read(APIC_ICR) & 0x1000))
- && (timeout++ < 1000));
+ SMP_PRINTK(("+"));
+ udelay(100);
+ send_status = apic_read(APIC_ICR) & 0x1000;
+ } while (send_status && (timeout++ < 1000));
+
+ /*
+ * Give the other CPU some time to accept the IPI.
+ */
udelay(200);
accept_status = (apic_read(APIC_ESR) & 0xEF);
}
@@ -935,6 +988,13 @@

if ( !(send_status || accept_status) )
{
+ /*
+ * allow APs to start initializing.
+ */
+ SMP_PRINTK(("Before Callout %d.\n", i));
+ set_bit(i, (unsigned long *)&cpu_callout_map[0]);
+ SMP_PRINTK(("After Callout %d.\n", i));
+
for(timeout=0;timeout<50000;timeout++)
{
if (cpu_callin_map[0]&(1<<i))
@@ -980,6 +1040,7 @@

static void smp_tune_scheduling (void)
{
+ unsigned long cachesize;
/*
* Rough estimation for SMP scheduling, this is the number of
* cycles it takes for a fully memory-limited process to flush
@@ -990,7 +1051,21 @@
* L1 cache), on PIIs it's around 50-100 usecs, depending on
* the cache size)
*/
- cacheflush_time = cpu_hz/1024*boot_cpu_data.x86_cache_size/5000;
+
+ if (boot_cpu_data.x86 <= 4) {
+ /*
+ * this basically disables processor-affinity
+ * scheduling on <=i486 based SMP boards.
+ */
+ cacheflush_time = 0;
+ } else {
+ cachesize = boot_cpu_data.x86_cache_size;
+ if (cachesize == -1)
+ cachesize = 8; /* Pentiums */
+
+ cacheflush_time = cpu_hz/1024*cachesize/5000;
+ }
+
printk("per-CPU timeslice cutoff: %ld.%ld usecs.\n",
(long)cacheflush_time/(cpu_hz/1000000),
((long)cacheflush_time*100/(cpu_hz/1000000)) % 100);
@@ -1032,7 +1107,13 @@
printk("CPU%d: ", boot_cpu_id);
print_cpu_info(&cpu_data[boot_cpu_id]);

+ /*
+ * not necessary because the MP table should list the boot
+ * CPU too, but we do it for the sake of robustness anyway.
+ * (and for the case when a non-SMP board boots an SMP kernel)
+ */
cpu_present_map |= (1 << hard_smp_processor_id());
+
cpu_number_map[boot_cpu_id] = 0;

/*
@@ -1106,6 +1187,14 @@
* Now scan the CPU present map and fire up the other CPUs.
*/

+ /*
+ * Add all detected CPUs. (later on we can down individual
+ * CPUs which will change cpu_online_map but not necessarily
+ * cpu_present_map. We are pretty much ready for hot-swap CPUs.)
+ */
+ cpu_online_map = cpu_present_map;
+ mb();
+
SMP_PRINTK(("CPU map: %lx\n", cpu_present_map));

for(i=0;i<NR_CPUS;i++)
@@ -1116,7 +1205,7 @@
if (i == boot_cpu_id)
continue;

- if ((cpu_present_map & (1 << i))
+ if ((cpu_online_map & (1 << i))
&& (max_cpus < 0 || max_cpus > cpucount+1))
{
do_boot_cpu(i);
@@ -1126,9 +1215,9 @@
* Make sure we unmap all failed CPUs
*/

- if (cpu_number_map[i] == -1 && (cpu_present_map & (1 << i))) {
- printk("CPU #%d not responding. Removing from cpu_present_map.\n",i);
- cpu_present_map &= ~(1 << i);
+ if (cpu_number_map[i] == -1 && (cpu_online_map & (1 << i))) {
+ printk("CPU #%d not responding. Removing from cpu_online_map.\n",i);
+ cpu_online_map &= ~(1 << i);
}
}

@@ -1168,14 +1257,14 @@
if (cpucount==0)
{
printk(KERN_ERR "Error: only one processor found.\n");
- cpu_present_map=(1<<hard_smp_processor_id());
+ cpu_online_map = (1<<hard_smp_processor_id());
}
else
{
unsigned long bogosum=0;
for(i=0;i<32;i++)
{
- if (cpu_present_map&(1<<i))
+ if (cpu_online_map&(1<<i))
bogosum+=cpu_data[i].loops_per_sec;
}
printk(KERN_INFO "Total of %d processors activated (%lu.%02lu BogoMIPS).\n",
@@ -1386,44 +1475,51 @@
unsigned long flags;

/*
- * The assignment is safe because it's volatile so the
- * compiler cannot reorder it, because the i586 has
- * strict memory ordering and because only the kernel
- * lock holder may issue a tlb flush. If you break any
- * one of those three change this to an atomic bus
- * locked or.
+ * it's important that we do not generate any APIC traffic
+ * until the AP CPUs have booted up!
*/
+ if (cpu_online_map) {
+ /*
+ * The assignment is safe because it's volatile so the
+ * compiler cannot reorder it, because the i586 has
+ * strict memory ordering and because only the kernel
+ * lock holder may issue a tlb flush. If you break any
+ * one of those three change this to an atomic bus
+ * locked or.
+ */

- smp_invalidate_needed = cpu_present_map;
+ smp_invalidate_needed = cpu_online_map;

- /*
- * Processors spinning on some lock with IRQs disabled
- * will see this IRQ late. The smp_invalidate_needed
- * map will ensure they don't do a spurious flush tlb
- * or miss one.
- */
+ /*
+ * Processors spinning on some lock with IRQs disabled
+ * will see this IRQ late. The smp_invalidate_needed
+ * map will ensure they don't do a spurious flush tlb
+ * or miss one.
+ */

- __save_flags(flags);
- __cli();
+ __save_flags(flags);
+ __cli();

- send_IPI_allbutself(INVALIDATE_TLB_VECTOR);
+ send_IPI_allbutself(INVALIDATE_TLB_VECTOR);

- /*
- * Spin waiting for completion
- */
-
- stuck = 50000000;
- while (smp_invalidate_needed) {
/*
- * Take care of "crossing" invalidates
+ * Spin waiting for completion
*/
- if (test_bit(cpu, &smp_invalidate_needed))
+
+ stuck = 50000000;
+ while (smp_invalidate_needed) {
+ /*
+ * Take care of "crossing" invalidates
+ */
+ if (test_bit(cpu, &smp_invalidate_needed))
clear_bit(cpu, &smp_invalidate_needed);
- --stuck;
- if (!stuck) {
- printk("stuck on TLB IPI wait (CPU#%d)\n",cpu);
- break;
+ --stuck;
+ if (!stuck) {
+ printk("stuck on TLB IPI wait (CPU#%d)\n",cpu);
+ break;
+ }
}
+ __restore_flags(flags);
}

/*
@@ -1431,7 +1527,6 @@
*/
local_flush_tlb();

- __restore_flags(flags);
}


@@ -1584,14 +1679,24 @@
ack_APIC_irq();
}

+static void stop_this_cpu (void)
+{
+ /*
+ * Remove this CPU:
+ */
+ clear_bit(smp_processor_id(), &cpu_online_map);
+
+ if (cpu_data[smp_processor_id()].hlt_works_ok)
+ for(;;) __asm__("hlt");
+ for (;;);
+}
+
/*
* CPU halt call-back
*/
asmlinkage void smp_stop_cpu_interrupt(void)
{
- if (cpu_data[smp_processor_id()].hlt_works_ok)
- for(;;) __asm__("hlt");
- for (;;) ;
+ stop_this_cpu();
}

void (*mtrr_hook) (void) = NULL;
--- linux/arch/i386/kernel/setup.c.orig Tue Dec 29 18:26:41 1998
+++ linux/arch/i386/kernel/setup.c Tue Dec 29 18:26:47 1998
@@ -597,7 +597,7 @@

for(n=0; n<NR_CPUS; n++, c++) {
#ifdef __SMP__
- if (!(cpu_present_map & (1<<n)))
+ if (!(cpu_online_map & (1<<n)))
continue;
#endif
p += sprintf(p,"processor\t: %d\n"
--- linux/Documentation/Configure.help.orig Tue Dec 29 16:37:07 1998
+++ linux/Documentation/Configure.help Tue Dec 29 16:43:50 1998
@@ -91,6 +91,9 @@
single-CPU machines. On a single-CPU machine, a non-SMP kernel
will run faster than an SMP kernel.

+ i486-based SMP boards don't boot CONFIG_M586/M686 kernels. CONFIG_M686
+ SMP kernels might not work on all Pentium-based boards.
+
People using multiprocessor machines should also say Y to "Enhanced
Real Time Clock Support", below. The "Advanced Power Management"
code will be disabled in an SMP kernel.
