Re: [5/7,v8] NUMA Hotplug Emulator: Support cpu probe/release in x86_64

From: David Rientjes
Date: Wed Dec 08 2010 - 16:36:48 EST


On Tue, 7 Dec 2010, shaohui.zheng@xxxxxxxxx wrote:

> From: Shaohui Zheng <shaohui.zheng@xxxxxxxxx>
>
> CPU physical hot-add/hot-remove is supported on some hardware, and it
> is already supported by the current Linux kernel. The NUMA Hotplug Emulator
> provides a mechanism to emulate the process in software. It can be used for
> testing or debugging purposes.
>
> CPU physical hotplug is different from logical CPU online/offline. Logical
> online/offline is controlled by the interface
> /sys/devices/system/cpu/cpuX/online. The CPU hotplug emulator uses the
> probe/release interface, which makes it possible to do CPU hotplug
> automation and stress testing.
>
> Add the CPU probe/release interface under sysfs for x86_64. Users can use
> this interface to emulate the CPU hot-add and hot-remove process.
>
> Directive:
> *) Reserve CPUs through a grub parameter like:
> maxcpus=4
>
> the rest of the CPUs will not be initialized.
>
> *) Probe CPU
> we can use the probe interface to hot-add new CPUs:
> echo nid > /sys/devices/system/cpu/probe
>
> *) Release a CPU
> echo cpu > /sys/devices/system/cpu/release
>
> A reserved CPU will be hot-added to the specified node.
> 1) nid == 0, the CPU will be added to the real node which the CPU
> should be in
> 2) nid != 0, add the CPU to node nid even though it is a fake node.
>

This patch is undoubtedly going to conflict with Tejun's unification of
the 32 and 64 bit NUMA boot paths, specifically the patch at
http://marc.info/?l=linux-kernel&m=129087151912379.

Tejun, what's the status of that patchset posted on November 27? Any
comments about this change?

> CC: Ingo Molnar <mingo@xxxxxxx>
> CC: Len Brown <len.brown@xxxxxxxxx>
> CC: Yinghai Lu <Yinghai.Lu@xxxxxxx>
> Signed-off-by: Shaohui Zheng <shaohui.zheng@xxxxxxxxx>
> Signed-off-by: Haicheng Li <haicheng.li@xxxxxxxxx>
> ---
> Index: linux-hpe4/arch/x86/kernel/acpi/boot.c
> ===================================================================
> --- linux-hpe4.orig/arch/x86/kernel/acpi/boot.c 2010-11-26 09:24:40.287725018 +0800
> +++ linux-hpe4/arch/x86/kernel/acpi/boot.c 2010-11-26 09:24:53.277724996 +0800
> @@ -647,8 +647,44 @@
> }
> EXPORT_SYMBOL(acpi_map_lsapic);
>
> +#ifdef CONFIG_ARCH_CPU_PROBE_RELEASE
> +static void acpi_map_cpu2node_emu(int cpu, int physid, int nid)
> +{
> +#ifdef CONFIG_ACPI_NUMA
> +#ifdef CONFIG_X86_64
> + apicid_to_node[physid] = nid;
> + numa_set_node(cpu, nid);
> +#else /* CONFIG_X86_32 */
> + apicid_2_node[physid] = nid;
> + cpu_to_node_map[cpu] = nid;
> +#endif
> +#endif
> +}
> +
> +static u16 cpu_to_apicid_saved[CONFIG_NR_CPUS];
> +int __ref acpi_map_lsapic_emu(int pcpu, int nid)
> +{
> + /* backup cpu apicid to array cpu_to_apicid_saved */
> + if (cpu_to_apicid_saved[pcpu] == 0 &&
> + per_cpu(x86_cpu_to_apicid, pcpu) != BAD_APICID)
> + cpu_to_apicid_saved[pcpu] = per_cpu(x86_cpu_to_apicid, pcpu);
> +
> + per_cpu(x86_cpu_to_apicid, pcpu) = cpu_to_apicid_saved[pcpu];
> + acpi_map_cpu2node_emu(pcpu, per_cpu(x86_cpu_to_apicid, pcpu), nid);
> +
> + return pcpu;
> +}
> +EXPORT_SYMBOL(acpi_map_lsapic_emu);
> +#endif
> +
> int acpi_unmap_lsapic(int cpu)
> {
> +#ifdef CONFIG_ARCH_CPU_PROBE_RELEASE
> + /* backup cpu apicid to array cpu_to_apicid_saved */
> + if (cpu_to_apicid_saved[cpu] == 0 &&
> + per_cpu(x86_cpu_to_apicid, cpu) != BAD_APICID)
> + cpu_to_apicid_saved[cpu] = per_cpu(x86_cpu_to_apicid, cpu);
> +#endif
> per_cpu(x86_cpu_to_apicid, cpu) = -1;
> set_cpu_present(cpu, false);
> num_processors--;
> Index: linux-hpe4/arch/x86/kernel/smpboot.c
> ===================================================================
> --- linux-hpe4.orig/arch/x86/kernel/smpboot.c 2010-11-26 09:24:40.297724969 +0800
> +++ linux-hpe4/arch/x86/kernel/smpboot.c 2010-11-26 12:48:58.977725001 +0800
> @@ -107,8 +107,6 @@
> mutex_unlock(&x86_cpu_hotplug_driver_mutex);
> }
>
> -ssize_t arch_cpu_probe(const char *buf, size_t count) { return -1; }
> -ssize_t arch_cpu_release(const char *buf, size_t count) { return -1; }
> #else
> static struct task_struct *idle_thread_array[NR_CPUS] __cpuinitdata ;
> #define get_idle_for_cpu(x) (idle_thread_array[(x)])
> Index: linux-hpe4/arch/x86/kernel/topology.c
> ===================================================================
> --- linux-hpe4.orig/arch/x86/kernel/topology.c 2010-11-26 09:24:52.477725000 +0800
> +++ linux-hpe4/arch/x86/kernel/topology.c 2010-11-26 12:48:58.987725001 +0800
> @@ -30,6 +30,9 @@
> #include <linux/init.h>
> #include <linux/smp.h>
> #include <asm/cpu.h>
> +#include <linux/cpu.h>
> +#include <linux/topology.h>
> +#include <linux/acpi.h>
>
> static DEFINE_PER_CPU(struct x86_cpu, cpu_devices);
>
> @@ -66,6 +69,74 @@
> unregister_cpu(&per_cpu(cpu_devices, num).cpu);
> }
> EXPORT_SYMBOL(arch_unregister_cpu);
> +
> +ssize_t arch_cpu_probe(const char *buf, size_t count)
> +{
> + int nid = 0;
> + int num = 0, selected = 0;
> +
> + /* check parameters */
> + if (!buf || count < 2)
> + return -EPERM;
> +
> + nid = simple_strtoul(buf, NULL, 0);
> + printk(KERN_DEBUG "Add a cpu to node : %d\n", nid);
> +
> + if (nid < 0 || nid > nr_node_ids - 1) {
> + printk(KERN_ERR "Invalid NUMA node id: %d (0 <= nid < %d).\n",
> + nid, nr_node_ids);
> + return -EPERM;
> + }
> +
> + if (!node_online(nid)) {
> + printk(KERN_ERR "NUMA node %d is not online, give up.\n", nid);
> + return -EPERM;
> + }
> +
> + /* find first uninitialized cpu */
> + for_each_present_cpu(num) {
> + if (per_cpu(cpu_sys_devices, num) == NULL) {
> + selected = num;
> + break;
> + }
> + }
> +
> + if (selected >= num_possible_cpus()) {
> + printk(KERN_ERR "No free cpu, give up cpu probing.\n");
> + return -EPERM;
> + }
> +
> + /* register cpu */
> + arch_register_cpu_node(selected, nid);
> + acpi_map_lsapic_emu(selected, nid);
> +
> + return count;
> +}
> +EXPORT_SYMBOL(arch_cpu_probe);
> +
> +ssize_t arch_cpu_release(const char *buf, size_t count)
> +{
> + int cpu = 0;
> +
> + cpu = simple_strtoul(buf, NULL, 0);
> + /* cpu 0 is not hotplugable */
> + if (cpu == 0) {
> + printk(KERN_ERR "can not release cpu 0.\n");
> + return -EPERM;
> + }
> +
> + if (cpu_online(cpu)) {
> + printk(KERN_DEBUG "offline cpu %d.\n", cpu);
> + cpu_down(cpu);
> + }
> +
> + arch_unregister_cpu(cpu);
> + acpi_unmap_lsapic(cpu);
> +
> + return count;
> +}
> +EXPORT_SYMBOL(arch_cpu_release);
> +
> #else /* CONFIG_HOTPLUG_CPU */
>
> static int __init arch_register_cpu(int num)
> @@ -83,8 +154,14 @@
> register_one_node(i);
> #endif
>
> - for_each_present_cpu(i)
> - arch_register_cpu(i);
> + /*
> + * when cpu hotplug emulation enabled, register the online cpu only,
> + * the rests are reserved for cpu probe.
> + */
> + for_each_present_cpu(i) {
> + if ((cpu_hpe_on && cpu_online(i)) || !cpu_hpe_on)
> + arch_register_cpu(i);
> + }
>
> return 0;
> }
> Index: linux-hpe4/arch/x86/mm/numa_64.c
> ===================================================================
> --- linux-hpe4.orig/arch/x86/mm/numa_64.c 2010-11-26 09:24:40.317724965 +0800
> +++ linux-hpe4/arch/x86/mm/numa_64.c 2010-11-26 09:24:53.297725001 +0800
> @@ -12,6 +12,7 @@
> #include <linux/module.h>
> #include <linux/nodemask.h>
> #include <linux/sched.h>
> +#include <linux/cpu.h>
>
> #include <asm/e820.h>
> #include <asm/proto.h>
> @@ -785,6 +786,19 @@
> }
> #endif
>
> +#ifdef CONFIG_ARCH_CPU_PROBE_RELEASE
> +static __init int cpu_hpe_setup(char *opt)
> +{
> + if (!opt)
> + return -EINVAL;
> +
> + if (!strncmp(opt, "on", 2) || !strncmp(opt, "1", 1))
> + cpu_hpe_on = 1;
> +
> + return 0;
> +}
> +early_param("cpu_hpe", cpu_hpe_setup);
> +#endif /* CONFIG_ARCH_CPU_PROBE_RELEASE */
>
> void __cpuinit numa_set_node(int cpu, int node)
> {
> Index: linux-hpe4/drivers/acpi/processor_driver.c
> ===================================================================
> --- linux-hpe4.orig/drivers/acpi/processor_driver.c 2010-11-26 09:24:40.327725004 +0800
> +++ linux-hpe4/drivers/acpi/processor_driver.c 2010-11-26 09:24:53.297725001 +0800
> @@ -530,6 +530,14 @@
> goto err_free_cpumask;
>
> sysdev = get_cpu_sysdev(pr->id);
> + /*
> + * Reserve cpu for hotplug emulation, the reserved cpu can be hot-added
> + * throu the cpu probe interface. Return directly.
> + */
> + if (sysdev == NULL) {
> + goto out;
> + }
> +
> if (sysfs_create_link(&device->dev.kobj, &sysdev->kobj, "sysdev")) {
> result = -EFAULT;
> goto err_remove_fs;
> @@ -570,6 +578,7 @@
> goto err_remove_sysfs;
> }
>
> +out:
> return 0;
>
> err_remove_sysfs:
> Index: linux-hpe4/drivers/base/cpu.c
> ===================================================================
> --- linux-hpe4.orig/drivers/base/cpu.c 2010-11-26 09:24:52.477725000 +0800
> +++ linux-hpe4/drivers/base/cpu.c 2010-11-26 09:24:53.297725001 +0800
> @@ -22,9 +22,15 @@
> };
> EXPORT_SYMBOL(cpu_sysdev_class);
>
> -static DEFINE_PER_CPU(struct sys_device *, cpu_sys_devices);
> +DEFINE_PER_CPU(struct sys_device *, cpu_sys_devices);
>
> #ifdef CONFIG_HOTPLUG_CPU
> +/*
> + * cpu_hpe_on is a switch to enable/disable cpu hotplug emulation. it is
> + * disabled in default, we can enable it throu grub parameter cpu_hpe=on
> + */
> +int cpu_hpe_on;
> +
> static ssize_t show_online(struct sys_device *dev, struct sysdev_attribute *attr,
> char *buf)
> {
> Index: linux-hpe4/include/linux/acpi.h
> ===================================================================
> --- linux-hpe4.orig/include/linux/acpi.h 2010-11-26 09:24:40.347725041 +0800
> +++ linux-hpe4/include/linux/acpi.h 2010-11-26 09:24:53.297725001 +0800
> @@ -102,6 +102,7 @@
> #ifdef CONFIG_ACPI_HOTPLUG_CPU
> /* Arch dependent functions for cpu hotplug support */
> int acpi_map_lsapic(acpi_handle handle, int *pcpu);
> +int acpi_map_lsapic_emu(int pcpu, int nid);
> int acpi_unmap_lsapic(int cpu);
> #endif /* CONFIG_ACPI_HOTPLUG_CPU */
>
> Index: linux-hpe4/include/linux/cpu.h
> ===================================================================
> --- linux-hpe4.orig/include/linux/cpu.h 2010-11-26 09:24:52.477725000 +0800
> +++ linux-hpe4/include/linux/cpu.h 2010-11-26 09:24:53.297725001 +0800
> @@ -30,6 +30,8 @@
> struct sys_device sysdev;
> };
>
> +DECLARE_PER_CPU(struct sys_device *, cpu_sys_devices);
> +
> extern int register_cpu_node(struct cpu *cpu, int num, int nid);
>
> static inline int register_cpu(struct cpu *cpu, int num)
> @@ -149,6 +151,7 @@
> #define register_hotcpu_notifier(nb) register_cpu_notifier(nb)
> #define unregister_hotcpu_notifier(nb) unregister_cpu_notifier(nb)
> int cpu_down(unsigned int cpu);
> +extern int cpu_hpe_on;
>
> #ifdef CONFIG_ARCH_CPU_PROBE_RELEASE
> extern void cpu_hotplug_driver_lock(void);
> @@ -171,6 +174,7 @@
> /* These aren't inline functions due to a GCC bug. */
> #define register_hotcpu_notifier(nb) ({ (void)(nb); 0; })
> #define unregister_hotcpu_notifier(nb) ({ (void)(nb); })
> +static int cpu_hpe_on;
> #endif /* CONFIG_HOTPLUG_CPU */
>
> #ifdef CONFIG_PM_SLEEP_SMP
> Index: linux-hpe4/Documentation/x86/x86_64/boot-options.txt
> ===================================================================
> --- linux-hpe4.orig/Documentation/x86/x86_64/boot-options.txt 2010-11-26 12:49:44.847725099 +0800
> +++ linux-hpe4/Documentation/x86/x86_64/boot-options.txt 2010-11-26 12:55:50.527724999 +0800
> @@ -316,3 +316,9 @@
> Do not use GB pages for kernel direct mappings.
> gbpages
> Use GB pages for kernel direct mappings.
> + cpu_hpe=on/off
> + Enable/disable CPU hotplug emulation with software method. When cpu_hpe=on,
> + sysfs provides probe/release interface to hot add/remove CPUs dynamically.
> + We can use maxcpus=<N> to reserve CPUs.
> + This option is disabled by default.
> +
>
> --
> Thanks & Regards,
> Shaohui
>
>
>
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/