Re: [PATCH 1/2] Generic Virtual Memmap suport for SPARSEMEM

From: Dave Hansen
Date: Mon Apr 02 2007 - 16:50:37 EST


First of all, nice set of patches.

On Sat, 2007-03-31 at 23:10 -0800, Christoph Lameter wrote:
> --- linux-2.6.21-rc5-mm2.orig/include/asm-generic/memory_model.h 2007-03-31 22:47:14.000000000 -0700
> +++ linux-2.6.21-rc5-mm2/include/asm-generic/memory_model.h 2007-03-31 22:59:35.000000000 -0700
> @@ -47,6 +47,13 @@
> })
>
> #elif defined(CONFIG_SPARSEMEM)
> +#ifdef CONFIG_SPARSE_VIRTUAL
> +/*
> + * We have a virtual memmap that makes lookups very simple
> + */
> +#define __pfn_to_page(pfn) (vmemmap + (pfn))
> +#define __page_to_pfn(page) ((page) - vmemmap)
> +#else
> /*
> * Note: section's mem_map is encorded to reflect its start_pfn.
> * section[i].section_mem_map == mem_map's address - start_pfn;
> @@ -62,6 +69,7 @@
> struct mem_section *__sec = __pfn_to_section(__pfn); \
> __section_mem_map_addr(__sec) + __pfn; \
> })
> +#endif
> #endif /* CONFIG_FLATMEM/DISCONTIGMEM/SPARSEMEM */

Any chance this can be done without embedding this inside another
#ifdef? I really hate untangling the mess when an #endif goes
missing.

Any reason this can't just be another #elif?

> #ifdef CONFIG_OUT_OF_LINE_PFN_TO_PAGE
> Index: linux-2.6.21-rc5-mm2/mm/sparse.c
> ===================================================================
> --- linux-2.6.21-rc5-mm2.orig/mm/sparse.c 2007-03-31 22:47:14.000000000 -0700
> +++ linux-2.6.21-rc5-mm2/mm/sparse.c 2007-03-31 22:59:35.000000000 -0700
> @@ -9,6 +9,7 @@
> #include <linux/spinlock.h>
> #include <linux/vmalloc.h>
> #include <asm/dma.h>
> +#include <asm/pgalloc.h>
>
> /*
> * Permanent SPARSEMEM data:
> @@ -101,7 +102,7 @@ static inline int sparse_index_init(unsi
>
> /*
> * Although written for the SPARSEMEM_EXTREME case, this happens
> - * to also work for the flat array case becase
> + * to also work for the flat array case because
> * NR_SECTION_ROOTS==NR_MEM_SECTIONS.
> */
> int __section_nr(struct mem_section* ms)
> @@ -211,6 +212,90 @@ static int sparse_init_one_section(struc
> return 1;
> }
>
> +#ifdef CONFIG_SPARSE_VIRTUAL
> +
> +void *vmemmap_alloc_block(unsigned long size, int node)
> +{
> + if (slab_is_available()) {
> + struct page *page =
> + alloc_pages_node(node, GFP_KERNEL,
> + get_order(size));
> +
> + BUG_ON(!page);
> + return page_address(page);
> + } else
> + return __alloc_bootmem_node(NODE_DATA(node), size, size,
> + __pa(MAX_DMA_ADDRESS));
> +}

Hmmmmmmm. Can we combine this with sparse_index_alloc()? Also, why not
just use the slab for this?

Let's get rid of the _block() part, too. I'm not sure it does any good.
At least make it _bytes() so that we know what the units are. Also, if
you're just going to round up internally and _not_ use the slab, can you
just make the argument in pages, or even order?

Can you think of any times when we'd want that BUG_ON() to be a
WARN_ON(), instead? I can see preferring having my mem_map[] on the
wrong node than hitting a BUG().

> +#ifndef ARCH_POPULATES_VIRTUAL_MEMMAP
> +/*
> + * Virtual memmap populate functionality for architectures that support
> + * PMDs for huge pages like i386, x86_64 etc.
> + */

How about:

/*
* Virtual memmap support for architectures that use Linux pagetables
* natively in hardware, and support mapping huge pages with PMD
* entries.
*/

It wouldn't make sense to map the vmemmap area with Linux pagetables on
an arch that didn't use them in hardware, right? So, perhaps this
doesn't quite belong in mm/sparse.c. Perhaps we need
arch/x86/sparse.c. ;)

> +static void vmemmap_pop_pmd(pud_t *pud, unsigned long addr,
> + unsigned long end, int node)
> +{
> + pmd_t *pmd;
> +
> + end = pmd_addr_end(addr, end);
> +
> + for (pmd = pmd_offset(pud, addr); addr < end;
> + pmd++, addr += PMD_SIZE) {
> + if (pmd_none(*pmd)) {
> + void *block;
> + pte_t pte;
> +
> + block = vmemmap_alloc_block(PMD_SIZE, node);
> + pte = pfn_pte(__pa(block) >> PAGE_SHIFT,
> + PAGE_KERNEL);
> + pte_mkdirty(pte);
> + pte_mkwrite(pte);
> + pte_mkyoung(pte);
> + mk_pte_huge(pte);
> + set_pmd(pmd, __pmd(pte_val(pte)));
> + }
> + }
> +}

Nitpick: I think this would look quite a bit neater with a little less
indentation.

How about making the loop start with

if (!pmd_none(*pmd))
continue;

It should bring the rest of the code in a bit and make that long line
more readable.

> +static void vmemmap_pop_pud(pgd_t *pgd, unsigned long addr,
> + unsigned long end, int node)
> +{
> + pud_t *pud;
> +
> + end = pud_addr_end(addr, end);
> + for (pud = pud_offset(pgd, addr); addr < end;
> + pud++, addr += PUD_SIZE) {
> +
> + if (pud_none(*pud))
> + pud_populate(&init_mm, pud,
> + vmemmap_alloc_block(PAGE_SIZE, node));
> +
> + vmemmap_pop_pmd(pud, addr, end, node);
> + }
> +}
> +
> +static void vmemmap_populate(struct page *start_page, unsigned long nr,
> + int node)
> +{
> + pgd_t *pgd;
> + unsigned long addr = (unsigned long)(start_page);
> + unsigned long end = pgd_addr_end(addr,
> + (unsigned long)((start_page + nr)));

There appear to be a few extra parentheses on these lines.

> + for (pgd = pgd_offset_k(addr); addr < end;
> + pgd++, addr += PGDIR_SIZE) {
> +
> + if (pgd_none(*pgd))
> + pgd_populate(&init_mm, pgd,
> + vmemmap_alloc_block(PAGE_SIZE, node));
> + vmemmap_pop_pud(pgd, addr, end, node);
> + }
> +}
> +#endif
> +#endif /* CONFIG_SPARSE_VIRTUAL */

We don't really need these #ifdefs embedded inside of each other,
either, right? Kconfig should take care of enforcing the dependency.

> static struct page *sparse_early_mem_map_alloc(unsigned long pnum)
> {
> struct page *map;
> @@ -221,8 +306,13 @@ static struct page *sparse_early_mem_map
> if (map)
> return map;
>
> +#ifdef CONFIG_SPARSE_VIRTUAL
> + map = pfn_to_page(pnum * PAGES_PER_SECTION);
> + vmemmap_populate(map, PAGES_PER_SECTION, nid);
> +#else
> map = alloc_bootmem_node(NODE_DATA(nid),
> sizeof(struct page) * PAGES_PER_SECTION);
> +#endif

We really worked hard to keep #ifdefs out of the code flow in that file
and keep it as clean as possible. Could we hide this behind a helper?

map = alloc_remap(nid, sizeof(struct page) * PAGES_PER_SECTION);
if (map)
return map;

+ map = alloc_vmemmap(map, PAGES_PER_SECTION, nid);
+ if (map)
+ return map;
+
map = alloc_bootmem_node(NODE_DATA(nid),
sizeof(struct page) * PAGES_PER_SECTION);
if (map)
return map;

Then, do whatever magic you want in alloc_vmemmap().

-- Dave

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/