Re: [PATCH] mm, x86: no zeroing of hugetlbfs pages at boot

From: Hillf Danton
Date: Sun Mar 10 2013 - 00:55:23 EST


On Thu, Mar 7, 2013 at 5:50 AM, Cliff Wickman <cpw@xxxxxxx> wrote:
> From: Cliff Wickman <cpw@xxxxxxx>
>
> Allocating a large number of 1GB hugetlbfs pages at boot takes a
> very long time.
>
> Large system sites would at times like to allocate a very large amount of
> memory as 1GB pages. They would put this on the kernel boot line:
> default_hugepagesz=1G hugepagesz=1G hugepages=4096
> [Dynamic allocation of 1G pages is not an option, as zone pages only go
> up to MAX_ORDER, and MAX_ORDER cannot exceed the section size.]
>
> Each page is zeroed as it is allocated, and all allocation is done by
> cpu 0, as this path is early in boot:
> start_kernel
> kernel_init
> do_pre_smp_initcalls
> hugetlb_init
> hugetlb_init_hstates
> hugetlb_hstate_alloc_pages
>
> Zeroing remote (offnode) memory takes ~1GB/sec (and most memory is offnode
> on large numa systems).
> This estimate is approximate (it depends on core frequency & number of hops
> to remote memory) but should be within a factor of 2 on most systems.
> A benchmark attempting to reserve a TB for 1GB pages would thus require
> ~1000 seconds of boot time just for this allocation. 32TB would take 8 hours.
>
> I propose passing a flag to the early allocator to indicate that no zeroing
> of a page should be done. The 'no zeroing' flag would have to be passed
> down this code path:
>

FYI: huge pages are cleared right after they are allocated; see, for
instance, clear_huge_page() in hugetlb_no_page().

Hillf
> hugetlb_hstate_alloc_pages
> alloc_bootmem_huge_page
> __alloc_bootmem_node_nopanic NO_ZERO (nobootmem.c)
> __alloc_memory_core_early NO_ZERO
> if (!(flags & NO_ZERO))
> memset(ptr, 0, size);
>
> Or this path if CONFIG_NO_BOOTMEM is not set:
>
> hugetlb_hstate_alloc_pages
> alloc_bootmem_huge_page
> __alloc_bootmem_node_nopanic NO_ZERO (bootmem.c)
> alloc_bootmem_core NO_ZERO
> if (!(flags & NO_ZERO))
> memset(region, 0, size);
> __alloc_bootmem_nopanic NO_ZERO
> ___alloc_bootmem_nopanic NO_ZERO
> alloc_bootmem_core NO_ZERO
> if (!(flags & NO_ZERO))
> memset(region, 0, size);
>
> Signed-off-by: Cliff Wickman <cpw@xxxxxxx>
>
> ---
> arch/x86/kernel/setup_percpu.c | 4 ++--
> include/linux/bootmem.h | 23 ++++++++++++++++-------
> mm/bootmem.c | 12 +++++++-----
> mm/hugetlb.c | 3 ++-
> mm/nobootmem.c | 41 +++++++++++++++++++++++------------------
> mm/page_cgroup.c | 2 +-
> mm/sparse.c | 2 +-
> 7 files changed, 52 insertions(+), 35 deletions(-)
>
> Index: linux/include/linux/bootmem.h
> ===================================================================
> --- linux.orig/include/linux/bootmem.h
> +++ linux/include/linux/bootmem.h
> @@ -8,6 +8,11 @@
> #include <asm/dma.h>
>
> /*
> + * allocation flags
> + */
> +#define NO_ZERO 0x00000001
> +
> +/*
> * simple boot-time physical memory area allocator.
> */
>
> @@ -79,7 +84,8 @@ extern void *__alloc_bootmem(unsigned lo
> unsigned long goal);
> extern void *__alloc_bootmem_nopanic(unsigned long size,
> unsigned long align,
> - unsigned long goal);
> + unsigned long goal,
> + u32 flags);
> extern void *__alloc_bootmem_node(pg_data_t *pgdat,
> unsigned long size,
> unsigned long align,
> @@ -91,12 +97,14 @@ void *__alloc_bootmem_node_high(pg_data_
> extern void *__alloc_bootmem_node_nopanic(pg_data_t *pgdat,
> unsigned long size,
> unsigned long align,
> - unsigned long goal);
> + unsigned long goal,
> + u32 flags);
> void *___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
> unsigned long size,
> unsigned long align,
> unsigned long goal,
> - unsigned long limit);
> + unsigned long limit,
> + u32 flags);
> extern void *__alloc_bootmem_low(unsigned long size,
> unsigned long align,
> unsigned long goal);
> @@ -120,19 +128,20 @@ extern void *__alloc_bootmem_low_node(pg
> #define alloc_bootmem_align(x, align) \
> __alloc_bootmem(x, align, BOOTMEM_LOW_LIMIT)
> #define alloc_bootmem_nopanic(x) \
> - __alloc_bootmem_nopanic(x, SMP_CACHE_BYTES, BOOTMEM_LOW_LIMIT)
> + __alloc_bootmem_nopanic(x, SMP_CACHE_BYTES, BOOTMEM_LOW_LIMIT, 0)
> #define alloc_bootmem_pages(x) \
> __alloc_bootmem(x, PAGE_SIZE, BOOTMEM_LOW_LIMIT)
> #define alloc_bootmem_pages_nopanic(x) \
> - __alloc_bootmem_nopanic(x, PAGE_SIZE, BOOTMEM_LOW_LIMIT)
> + __alloc_bootmem_nopanic(x, PAGE_SIZE, BOOTMEM_LOW_LIMIT, 0)
> #define alloc_bootmem_node(pgdat, x) \
> __alloc_bootmem_node(pgdat, x, SMP_CACHE_BYTES, BOOTMEM_LOW_LIMIT)
> #define alloc_bootmem_node_nopanic(pgdat, x) \
> - __alloc_bootmem_node_nopanic(pgdat, x, SMP_CACHE_BYTES, BOOTMEM_LOW_LIMIT)
> + __alloc_bootmem_node_nopanic(pgdat, x, SMP_CACHE_BYTES, \
> + BOOTMEM_LOW_LIMIT, 0)
> #define alloc_bootmem_pages_node(pgdat, x) \
> __alloc_bootmem_node(pgdat, x, PAGE_SIZE, BOOTMEM_LOW_LIMIT)
> #define alloc_bootmem_pages_node_nopanic(pgdat, x) \
> - __alloc_bootmem_node_nopanic(pgdat, x, PAGE_SIZE, BOOTMEM_LOW_LIMIT)
> + __alloc_bootmem_node_nopanic(pgdat, x, PAGE_SIZE, BOOTMEM_LOW_LIMIT, 0)
>
> #define alloc_bootmem_low(x) \
> __alloc_bootmem_low(x, SMP_CACHE_BYTES, 0)
> Index: linux/arch/x86/kernel/setup_percpu.c
> ===================================================================
> --- linux.orig/arch/x86/kernel/setup_percpu.c
> +++ linux/arch/x86/kernel/setup_percpu.c
> @@ -104,14 +104,14 @@ static void * __init pcpu_alloc_bootmem(
> void *ptr;
>
> if (!node_online(node) || !NODE_DATA(node)) {
> - ptr = __alloc_bootmem_nopanic(size, align, goal);
> + ptr = __alloc_bootmem_nopanic(size, align, goal, 0);
> pr_info("cpu %d has no node %d or node-local memory\n",
> cpu, node);
> pr_debug("per cpu data for cpu%d %lu bytes at %016lx\n",
> cpu, size, __pa(ptr));
> } else {
> ptr = __alloc_bootmem_node_nopanic(NODE_DATA(node),
> - size, align, goal);
> + size, align, goal, 0);
> pr_debug("per cpu data for cpu%d %lu bytes on node%d at %016lx\n",
> cpu, size, node, __pa(ptr));
> }
> Index: linux/mm/nobootmem.c
> ===================================================================
> --- linux.orig/mm/nobootmem.c
> +++ linux/mm/nobootmem.c
> @@ -33,7 +33,7 @@ unsigned long min_low_pfn;
> unsigned long max_pfn;
>
> static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
> - u64 goal, u64 limit)
> + u64 goal, u64 limit, u32 flags)
> {
> void *ptr;
> u64 addr;
> @@ -46,7 +46,8 @@ static void * __init __alloc_memory_core
> return NULL;
>
> ptr = phys_to_virt(addr);
> - memset(ptr, 0, size);
> + if (!(flags & NO_ZERO))
> + memset(ptr, 0, size);
> memblock_reserve(addr, size);
> /*
> * The min_count is set to 0 so that bootmem allocated blocks
> @@ -208,7 +209,8 @@ void __init free_bootmem(unsigned long a
> static void * __init ___alloc_bootmem_nopanic(unsigned long size,
> unsigned long align,
> unsigned long goal,
> - unsigned long limit)
> + unsigned long limit,
> + u32 flags)
> {
> void *ptr;
>
> @@ -217,7 +219,8 @@ static void * __init ___alloc_bootmem_no
>
> restart:
>
> - ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, goal, limit);
> + ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, goal,
> + limit, 0);
>
> if (ptr)
> return ptr;
> @@ -244,17 +247,17 @@ restart:
> * Returns NULL on failure.
> */
> void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align,
> - unsigned long goal)
> + unsigned long goal, u32 flags)
> {
> unsigned long limit = -1UL;
>
> - return ___alloc_bootmem_nopanic(size, align, goal, limit);
> + return ___alloc_bootmem_nopanic(size, align, goal, limit, flags);
> }
>
> static void * __init ___alloc_bootmem(unsigned long size, unsigned long align,
> - unsigned long goal, unsigned long limit)
> + unsigned long goal, unsigned long limit, u32 flags)
> {
> - void *mem = ___alloc_bootmem_nopanic(size, align, goal, limit);
> + void *mem = ___alloc_bootmem_nopanic(size, align, goal, limit, flags);
>
> if (mem)
> return mem;
> @@ -284,25 +287,26 @@ void * __init __alloc_bootmem(unsigned l
> {
> unsigned long limit = -1UL;
>
> - return ___alloc_bootmem(size, align, goal, limit);
> + return ___alloc_bootmem(size, align, goal, limit, 0);
> }
>
> void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
> unsigned long size,
> unsigned long align,
> unsigned long goal,
> - unsigned long limit)
> + unsigned long limit,
> + u32 flags)
> {
> void *ptr;
>
> again:
> ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
> - goal, limit);
> + goal, limit, flags);
> if (ptr)
> return ptr;
>
> ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align,
> - goal, limit);
> + goal, limit, flags);
> if (ptr)
> return ptr;
>
> @@ -315,12 +319,13 @@ again:
> }
>
> void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
> - unsigned long align, unsigned long goal)
> + unsigned long align, unsigned long goal, u32 flags)
> {
> if (WARN_ON_ONCE(slab_is_available()))
> return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
>
> - return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0);
> + return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal,
> + 0, flags);
> }
>
> void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
> @@ -329,7 +334,7 @@ void * __init ___alloc_bootmem_node(pg_d
> {
> void *ptr;
>
> - ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, limit);
> + ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, limit, 0);
> if (ptr)
> return ptr;
>
> @@ -354,7 +359,7 @@ void * __init ___alloc_bootmem_node(pg_d
> * The function panics if the request can not be satisfied.
> */
> void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
> - unsigned long align, unsigned long goal)
> + unsigned long align, unsigned long goal)
> {
> if (WARN_ON_ONCE(slab_is_available()))
> return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
> @@ -388,7 +393,7 @@ void * __init __alloc_bootmem_node_high(
> void * __init __alloc_bootmem_low(unsigned long size, unsigned long align,
> unsigned long goal)
> {
> - return ___alloc_bootmem(size, align, goal, ARCH_LOW_ADDRESS_LIMIT);
> + return ___alloc_bootmem(size, align, goal, ARCH_LOW_ADDRESS_LIMIT, 0);
> }
>
> void * __init __alloc_bootmem_low_nopanic(unsigned long size,
> @@ -396,7 +401,7 @@ void * __init __alloc_bootmem_low_nopani
> unsigned long goal)
> {
> return ___alloc_bootmem_nopanic(size, align, goal,
> - ARCH_LOW_ADDRESS_LIMIT);
> + ARCH_LOW_ADDRESS_LIMIT, 0);
> }
>
> /**
> Index: linux/mm/sparse.c
> ===================================================================
> --- linux.orig/mm/sparse.c
> +++ linux/mm/sparse.c
> @@ -281,7 +281,7 @@ sparse_early_usemaps_alloc_pgdat_section
> nid = early_pfn_to_nid(goal >> PAGE_SHIFT);
> again:
> p = ___alloc_bootmem_node_nopanic(NODE_DATA(nid), size,
> - SMP_CACHE_BYTES, goal, limit);
> + SMP_CACHE_BYTES, goal, limit, 0);
> if (!p && limit) {
> limit = 0;
> goto again;
> Index: linux/mm/hugetlb.c
> ===================================================================
> --- linux.orig/mm/hugetlb.c
> +++ linux/mm/hugetlb.c
> @@ -1188,7 +1188,8 @@ int __weak alloc_bootmem_huge_page(struc
> addr = __alloc_bootmem_node_nopanic(
> NODE_DATA(hstate_next_node_to_alloc(h,
> &node_states[N_MEMORY])),
> - huge_page_size(h), huge_page_size(h), 0);
> + huge_page_size(h), huge_page_size(h),
> + 0, NO_ZERO);
>
> if (addr) {
> /*
> Index: linux/mm/bootmem.c
> ===================================================================
> --- linux.orig/mm/bootmem.c
> +++ linux/mm/bootmem.c
> @@ -660,7 +660,7 @@ restart:
> * Returns NULL on failure.
> */
> void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align,
> - unsigned long goal)
> + unsigned long goal, u32 flags)
> {
> unsigned long limit = 0;
>
> @@ -705,7 +705,8 @@ void * __init __alloc_bootmem(unsigned l
>
> void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
> unsigned long size, unsigned long align,
> - unsigned long goal, unsigned long limit)
> + unsigned long goal, unsigned long limit,
> + u32 flags)
> {
> void *ptr;
>
> @@ -734,12 +735,13 @@ again:
> }
>
> void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
> - unsigned long align, unsigned long goal)
> + unsigned long align, unsigned long goal, u32 flags)
> {
> if (WARN_ON_ONCE(slab_is_available()))
> return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
>
> - return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0);
> + return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal,
> + 0, flags);
> }
>
> void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
> @@ -748,7 +750,7 @@ void * __init ___alloc_bootmem_node(pg_d
> {
> void *ptr;
>
> - ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0);
> + ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0, 0);
> if (ptr)
> return ptr;
>
> Index: linux/mm/page_cgroup.c
> ===================================================================
> --- linux.orig/mm/page_cgroup.c
> +++ linux/mm/page_cgroup.c
> @@ -55,7 +55,7 @@ static int __init alloc_node_page_cgroup
> table_size = sizeof(struct page_cgroup) * nr_pages;
>
> base = __alloc_bootmem_node_nopanic(NODE_DATA(nid),
> - table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
> + table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS), 0);
> if (!base)
> return -ENOMEM;
> NODE_DATA(nid)->node_page_cgroup = base;
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@xxxxxxxxxxxxxxx
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/
>
>
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/