Re: [patch 35/37] x86/smpboot: Support parallel startup of secondary CPUs

From: Brian Gerst
Date: Sat Apr 15 2023 - 09:22:52 EST


On Fri, Apr 14, 2023 at 7:45 PM Thomas Gleixner <tglx@xxxxxxxxxxxxx> wrote:
>
> From: David Woodhouse <dwmw@xxxxxxxxxxxx>
>
> Rework the real-mode startup code to allow for APs to be brought up in
> parallel. This is in two parts:
>
> 1. Introduce a bit-spinlock to prevent them from all using the real
> mode stack at the same time.
>
> 2. Avoid needing to use the global smpboot_control variable to pass
> each AP its CPU number.
>
> To achieve the latter, export the cpuid_to_apicid[] array so that each
> AP can find its own CPU number by searching therein based on its APIC ID.
>
> Introduce flags in the top bits of smpboot_control which indicate methods
> by which an AP should find its CPU number. For a serialized bringup, the
> CPU number is explicitly passed in the low bits of smpboot_control as
> before. For parallel mode there are flags directing the AP to find its APIC
> ID in CPUID leaf 0x0b or 1x1f (for X2APIC mode) or CPUID leaf 0x01 where 8
> bits are sufficient, then perform the cpuid_to_apicid[] lookup with that.
>
> Aside from the fact that APs will now look up their CPU number via the
> newly-exported cpuid_to_apicid[] table, there is no behavioural change
> intended, since the parallel bootup has not yet been enabled.
>
> [ tglx: Initial proof of concept patch with bitlock and APIC ID lookup ]
> [ dwmw2: Rework and testing, commit message, CPUID 0x1 and CPU0 support ]
> [ seanc: Fix stray override of initial_gs in common_cpu_up() ]
> [ Oleksandr Natalenko: reported suspend/resume issue fixed in
> x86_acpi_suspend_lowlevel ]
>
> Co-developed-by: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
> Signed-off-by: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
> Co-developed-by: Brian Gerst <brgerst@xxxxxxxxx>
> Signed-off-by: Brian Gerst <brgerst@xxxxxxxxx>
> Signed-off-by: David Woodhouse <dwmw@xxxxxxxxxxxx>
> Signed-off-by: Usama Arif <usama.arif@xxxxxxxxxxxxx>
> Signed-off-by: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
> ---
> arch/x86/include/asm/apic.h | 2
> arch/x86/include/asm/realmode.h | 3 +
> arch/x86/include/asm/smp.h | 8 +++
> arch/x86/kernel/acpi/sleep.c | 9 +++
> arch/x86/kernel/apic/apic.c | 2
> arch/x86/kernel/head_64.S | 79 ++++++++++++++++++++++++++++++++++-
> arch/x86/kernel/smpboot.c | 5 --
> arch/x86/realmode/init.c | 3 +
> arch/x86/realmode/rm/trampoline_64.S | 27 +++++++++--
> 9 files changed, 125 insertions(+), 13 deletions(-)
>
> --- a/arch/x86/include/asm/apic.h
> +++ b/arch/x86/include/asm/apic.h
> @@ -55,6 +55,8 @@ extern int local_apic_timer_c2_ok;
> extern int disable_apic;
> extern unsigned int lapic_timer_period;
>
> +extern int cpuid_to_apicid[];
> +
> extern enum apic_intr_mode_id apic_intr_mode;
> enum apic_intr_mode_id {
> APIC_PIC,
> --- a/arch/x86/include/asm/realmode.h
> +++ b/arch/x86/include/asm/realmode.h
> @@ -52,6 +52,7 @@ struct trampoline_header {
> u64 efer;
> u32 cr4;
> u32 flags;
> + u32 lock;
> #endif
> };
>
> @@ -64,6 +65,8 @@ extern unsigned long initial_stack;
> extern unsigned long initial_vc_handler;
> #endif
>
> +extern u32 *trampoline_lock;
> +
> extern unsigned char real_mode_blob[];
> extern unsigned char real_mode_relocs[];
>
> --- a/arch/x86/include/asm/smp.h
> +++ b/arch/x86/include/asm/smp.h
> @@ -198,4 +198,12 @@ extern unsigned int smpboot_control;
>
> #endif /* !__ASSEMBLY__ */
>
> +/* Control bits for startup_64 */
> +#define STARTUP_APICID_CPUID_1F 0x80000000
> +#define STARTUP_APICID_CPUID_0B 0x40000000
> +#define STARTUP_APICID_CPUID_01 0x20000000
> +
> +/* Top 8 bits are reserved for control */
> +#define STARTUP_PARALLEL_MASK 0xFF000000
> +
> #endif /* _ASM_X86_SMP_H */
> --- a/arch/x86/kernel/acpi/sleep.c
> +++ b/arch/x86/kernel/acpi/sleep.c
> @@ -16,6 +16,7 @@
> #include <asm/cacheflush.h>
> #include <asm/realmode.h>
> #include <asm/hypervisor.h>
> +#include <asm/smp.h>
>
> #include <linux/ftrace.h>
> #include "../../realmode/rm/wakeup.h"
> @@ -127,7 +128,13 @@ int x86_acpi_suspend_lowlevel(void)
> * value is in the actual %rsp register.
> */
> current->thread.sp = (unsigned long)temp_stack + sizeof(temp_stack);
> - smpboot_control = smp_processor_id();
> + /*
> + * Ensure the CPU knows which one it is when it comes back, if
> + * it isn't in parallel mode and expected to work that out for
> + * itself.
> + */
> + if (!(smpboot_control & STARTUP_PARALLEL_MASK))
> + smpboot_control = smp_processor_id();
> #endif
> initial_code = (unsigned long)wakeup_long64;
> saved_magic = 0x123456789abcdef0L;
> --- a/arch/x86/kernel/apic/apic.c
> +++ b/arch/x86/kernel/apic/apic.c
> @@ -2377,7 +2377,7 @@ static int nr_logical_cpuids = 1;
> /*
> * Used to store mapping between logical CPU IDs and APIC IDs.
> */
> -static int cpuid_to_apicid[] = {
> +int cpuid_to_apicid[] = {
> [0 ... NR_CPUS - 1] = -1,
> };
>
> --- a/arch/x86/kernel/head_64.S
> +++ b/arch/x86/kernel/head_64.S
> @@ -25,6 +25,7 @@
> #include <asm/export.h>
> #include <asm/nospec-branch.h>
> #include <asm/fixmap.h>
> +#include <asm/smp.h>
>
> /*
> * We are not able to switch in one step to the final KERNEL ADDRESS SPACE
> @@ -234,8 +235,70 @@ SYM_INNER_LABEL(secondary_startup_64_no_
> ANNOTATE_NOENDBR // above
>
> #ifdef CONFIG_SMP
> + /*
> + * For parallel boot, the APIC ID is retrieved from CPUID, and then
> + * used to look up the CPU number. For booting a single CPU, the
> + * CPU number is encoded in smpboot_control.
> + *
> + * Bit 31 STARTUP_APICID_CPUID_1F flag (use CPUID 0x1f)
> + * Bit 30 STARTUP_APICID_CPUID_0B flag (use CPUID 0x0b)
> + * Bit 29 STARTUP_APICID_CPUID_01 flag (use CPUID 0x01)
> + * Bit 0-23 CPU# if STARTUP_APICID_CPUID_xx flags are not set
> + */
> movl smpboot_control(%rip), %ecx
> + testl $STARTUP_APICID_CPUID_1F, %ecx
> + jnz .Luse_cpuid_1f
> + testl $STARTUP_APICID_CPUID_0B, %ecx
> + jnz .Luse_cpuid_0b
> + testl $STARTUP_APICID_CPUID_01, %ecx
> + jnz .Luse_cpuid_01
> + andl $(~STARTUP_PARALLEL_MASK), %ecx
> + jmp .Lsetup_cpu
> +
> +.Luse_cpuid_01:
> + mov $0x01, %eax
> + cpuid
> + mov %ebx, %edx
> + shr $24, %edx
> + jmp .Lsetup_AP
> +
> +.Luse_cpuid_0b:
> + mov $0x0B, %eax
> + xorl %ecx, %ecx
> + cpuid
> + jmp .Lsetup_AP
> +
> +.Luse_cpuid_1f:
> + mov $0x1f, %eax
> + xorl %ecx, %ecx
> + cpuid
>
> +.Lsetup_AP:
> + /* EDX contains the APIC ID of the current CPU */
> + xorq %rcx, %rcx
> + leaq cpuid_to_apicid(%rip), %rbx
> +
> +.Lfind_cpunr:
> + cmpl (%rbx,%rcx,4), %edx
> + jz .Lsetup_cpu
> + inc %ecx
> +#ifdef CONFIG_FORCE_NR_CPUS
> + cmpl $NR_CPUS, %ecx
> +#else
> + cmpl nr_cpu_ids(%rip), %ecx
> +#endif
> + jb .Lfind_cpunr
> +
> + /* APIC ID not found in the table. Drop the trampoline lock and bail. */
> + movq trampoline_lock(%rip), %rax
> + lock
> + btrl $0, (%rax)
> +
> +1: cli
> + hlt
> + jmp 1b
> +
> +.Lsetup_cpu:
> /* Get the per cpu offset for the given CPU# which is in ECX */
> movq __per_cpu_offset(,%rcx,8), %rdx
> #else
> @@ -248,10 +311,20 @@ SYM_INNER_LABEL(secondary_startup_64_no_
> *
> * RDX contains the per-cpu offset
> */
> - movq pcpu_hot + X86_current_task(%rdx), %rax
> - movq TASK_threadsp(%rax), %rsp
> + movq pcpu_hot + X86_top_of_stack(%rdx), %rsp

Switching to using pcpu_hot.top_of_stack is ok, but it's not
completely equivalent. top_of_stack points to the end of the pt_regs
structure, while the kernel stack starts below pt_regs even for kernel
threads. So you need to subtract PTREGS_SIZE from the stack pointer
after this.

This change should also be a separate patch.

--
Brian Gerst