Re: [PATCH 2/6] arm64/mm: Enable memory hot remove

From: Anshuman Khandual
Date: Thu Apr 04 2019 - 01:39:34 EST




On 04/03/2019 06:07 PM, Robin Murphy wrote:
> [ +Steve ]
>
> Hi Anshuman,
>
> On 03/04/2019 05:30, Anshuman Khandual wrote:
>> Memory removal from an arch perspective involves tearing down two different
>> kernel based mappings i.e vmemmap and linear while releasing related page
>> table pages allocated for the physical memory range to be removed.
>>
>> Define a common kernel page table tear down helper remove_pagetable() which
>> can be used to unmap a given kernel virtual address range. In effect it can
>> tear down both vmemmap and kernel linear mappings. This new helper is called
>> from both vmemmap_free() and __remove_pgd_mapping() during memory removal.
>> The argument 'direct' here identifies kernel linear mappings.
>>
>> Vmemmap mappings page table pages are allocated through sparse mem helper
>> functions like vmemmap_alloc_block() which does not cycle the pages through
>> pgtable_page_ctor() constructs. Hence while removing it skips corresponding
>> destructor construct pgtable_page_dtor().
>>
>> While here, update arch_add_memory() to handle __add_pages() failures by
>> just unmapping the recently added kernel linear mapping. Now enable memory hot
>> remove on arm64 platforms by default with ARCH_ENABLE_MEMORY_HOTREMOVE.
>>
>> This implementation is overall inspired from kernel page table tear down
>> procedure on X86 architecture.
>
> A bit of a nit, but since this depends on at least patch #4 to work properly, it would be good to reorder the series appropriately.

Sure, will move the generic changes earlier in the series.

>> Signed-off-by: Anshuman Khandual <anshuman.khandual@xxxxxxx>
>> ---
>>  arch/arm64/Kconfig               |   3 +
>>  arch/arm64/include/asm/pgtable.h |  14 +++
>>  arch/arm64/mm/mmu.c              | 227 ++++++++++++++++++++++++++++++++++++++-
>>  3 files changed, 241 insertions(+), 3 deletions(-)
>>
>> diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
>> index a2418fb..db3e625 100644
>> --- a/arch/arm64/Kconfig
>> +++ b/arch/arm64/Kconfig
>> @@ -266,6 +266,9 @@ config HAVE_GENERIC_GUP
>> Â config ARCH_ENABLE_MEMORY_HOTPLUG
>> ÂÂÂÂÂ def_bool y
>> Â +config ARCH_ENABLE_MEMORY_HOTREMOVE
>> +ÂÂÂ def_bool y
>> +
>> Â config ARCH_MEMORY_PROBE
>> ÂÂÂÂÂ bool "Enable /sys/devices/system/memory/probe interface"
>> ÂÂÂÂÂ depends on MEMORY_HOTPLUG
>> diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
>> index de70c1e..858098e 100644
>> --- a/arch/arm64/include/asm/pgtable.h
>> +++ b/arch/arm64/include/asm/pgtable.h
>> @@ -355,6 +355,18 @@ static inline int pmd_protnone(pmd_t pmd)
>> Â }
>> Â #endif
>> Â +#if (CONFIG_PGTABLE_LEVELS > 2)
>> +#define pmd_large(pmd)ÂÂÂ (pmd_val(pmd) && !(pmd_val(pmd) & PMD_TABLE_BIT))
>> +#else
>> +#define pmd_large(pmd) 0
>> +#endif
>> +
>> +#if (CONFIG_PGTABLE_LEVELS > 3)
>> +#define pud_large(pud)ÂÂÂ (pud_val(pud) && !(pud_val(pud) & PUD_TABLE_BIT))
>> +#else
>> +#define pud_large(pmd) 0
>> +#endif
>
> These seem rather different from the versions that Steve is proposing in the generic pagewalk series - can you reach an agreement on which implementation is preferred?

Sure, will take a look.

>
>> +
>> Â /*
>> ÂÂ * THP definitions.
>> ÂÂ */
>> @@ -555,6 +567,7 @@ static inline phys_addr_t pud_page_paddr(pud_t pud)
>> Â Â #else
>> Â +#define pmd_index(addr) 0
>> Â #define pud_page_paddr(pud)ÂÂÂ ({ BUILD_BUG(); 0; })
>> Â Â /* Match pmd_offset folding in <asm/generic/pgtable-nopmd.h> */
>> @@ -612,6 +625,7 @@ static inline phys_addr_t pgd_page_paddr(pgd_t pgd)
>> Â Â #else
>> Â +#define pud_index(adrr)ÂÂÂ 0
>> Â #define pgd_page_paddr(pgd)ÂÂÂ ({ BUILD_BUG(); 0;})
>> Â Â /* Match pud_offset folding in <asm/generic/pgtable-nopud.h> */
>> diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
>> index e97f018..ae0777b 100644
>> --- a/arch/arm64/mm/mmu.c
>> +++ b/arch/arm64/mm/mmu.c
>> @@ -714,6 +714,198 @@ int kern_addr_valid(unsigned long addr)
>> Â ÂÂÂÂÂ return pfn_valid(pte_pfn(pte));
>> Â }
>> +
>> +#ifdef CONFIG_MEMORY_HOTPLUG
>> +static void __meminit free_pagetable(struct page *page, int order)
>
> Do these need to be __meminit? AFAICS it's effectively redundant with the containing #ifdef, and removal feels like it's inherently a later-than-init thing anyway.

I was a bit confused here, but even x86 does exactly the same: all of these
functions are labeled __meminit and are wrapped under CONFIG_MEMORY_HOTPLUG.
Is there any concern with having __meminit here?

>
>> +{
>> +ÂÂÂ unsigned long magic;
>> +ÂÂÂ unsigned int nr_pages = 1 << order;
>> +
>> +ÂÂÂ if (PageReserved(page)) {
>> +ÂÂÂÂÂÂÂ __ClearPageReserved(page);
>> +
>> +ÂÂÂÂÂÂÂ magic = (unsigned long)page->freelist;
>> +ÂÂÂÂÂÂÂ if (magic == SECTION_INFO || magic == MIX_SECTION_INFO) {
>> +ÂÂÂÂÂÂÂÂÂÂÂ while (nr_pages--)
>> +ÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂ put_page_bootmem(page++);
>> +ÂÂÂÂÂÂÂ } else
>> +ÂÂÂÂÂÂÂÂÂÂÂ while (nr_pages--)
>> +ÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂ free_reserved_page(page++);
>> +ÂÂÂ } else
>> +ÂÂÂÂÂÂÂ free_pages((unsigned long)page_address(page), order);
>> +}
>> +
>> +#if (CONFIG_PGTABLE_LEVELS > 2)
>> +static void __meminit free_pte_table(pte_t *pte_start, pmd_t *pmd, bool direct)
>> +{
>> +ÂÂÂ pte_t *pte;
>> +ÂÂÂ int i;
>> +
>> +ÂÂÂ for (i = 0; i < PTRS_PER_PTE; i++) {
>> +ÂÂÂÂÂÂÂ pte = pte_start + i;
>> +ÂÂÂÂÂÂÂ if (!pte_none(*pte))
>> +ÂÂÂÂÂÂÂÂÂÂÂ return;
>> +ÂÂÂ }
>> +
>> +ÂÂÂ if (direct)
>> +ÂÂÂÂÂÂÂ pgtable_page_dtor(pmd_page(*pmd));
>> +ÂÂÂ free_pagetable(pmd_page(*pmd), 0);
>> +ÂÂÂ spin_lock(&init_mm.page_table_lock);
>> +ÂÂÂ pmd_clear(pmd);
>> +ÂÂÂ spin_unlock(&init_mm.page_table_lock);
>> +}
>> +#else
>> +static void __meminit free_pte_table(pte_t *pte_start, pmd_t *pmd, bool direct)
>> +{
>> +}
>> +#endif
>> +
>> +#if (CONFIG_PGTABLE_LEVELS > 3)
>> +static void __meminit free_pmd_table(pmd_t *pmd_start, pud_t *pud, bool direct)
>> +{
>> +ÂÂÂ pmd_t *pmd;
>> +ÂÂÂ int i;
>> +
>> +ÂÂÂ for (i = 0; i < PTRS_PER_PMD; i++) {
>> +ÂÂÂÂÂÂÂ pmd = pmd_start + i;
>> +ÂÂÂÂÂÂÂ if (!pmd_none(*pmd))
>> +ÂÂÂÂÂÂÂÂÂÂÂ return;
>> +ÂÂÂ }
>> +
>> +ÂÂÂ if (direct)
>> +ÂÂÂÂÂÂÂ pgtable_page_dtor(pud_page(*pud));
>> +ÂÂÂ free_pagetable(pud_page(*pud), 0);
>> +ÂÂÂ spin_lock(&init_mm.page_table_lock);
>> +ÂÂÂ pud_clear(pud);
>> +ÂÂÂ spin_unlock(&init_mm.page_table_lock);
>> +}
>> +
>> +static void __meminit free_pud_table(pud_t *pud_start, pgd_t *pgd, bool direct)
>> +{
>> +ÂÂÂ pud_t *pud;
>> +ÂÂÂ int i;
>> +
>> +ÂÂÂ for (i = 0; i < PTRS_PER_PUD; i++) {
>> +ÂÂÂÂÂÂÂ pud = pud_start + i;
>> +ÂÂÂÂÂÂÂ if (!pud_none(*pud))
>> +ÂÂÂÂÂÂÂÂÂÂÂ return;
>> +ÂÂÂ }
>> +
>> +ÂÂÂ if (direct)
>> +ÂÂÂÂÂÂÂ pgtable_page_dtor(pgd_page(*pgd));
>> +ÂÂÂ free_pagetable(pgd_page(*pgd), 0);
>> +ÂÂÂ spin_lock(&init_mm.page_table_lock);
>> +ÂÂÂ pgd_clear(pgd);
>> +ÂÂÂ spin_unlock(&init_mm.page_table_lock);
>> +}
>> +#else
>> +static void __meminit free_pmd_table(pmd_t *pmd_start, pud_t *pud, bool direct)
>> +{
>> +}
>> +
>> +static void __meminit free_pud_table(pud_t *pud_start, pgd_t *pgd, bool direct)
>> +{
>> +}
>> +#endif
>> +
>> +static void __meminit
>> +remove_pte_table(pte_t *pte_start, unsigned long addr,
>> +ÂÂÂÂÂÂÂÂÂÂÂ unsigned long end, bool direct)
>> +{
>> +ÂÂÂ pte_t *pte;
>> +
>> +ÂÂÂ pte = pte_start + pte_index(addr);
>> +ÂÂÂ for (; addr < end; addr += PAGE_SIZE, pte++) {
>> +ÂÂÂÂÂÂÂ if (!pte_present(*pte))
>> +ÂÂÂÂÂÂÂÂÂÂÂ continue;
>> +
>> +ÂÂÂÂÂÂÂ if (!direct)
>> +ÂÂÂÂÂÂÂÂÂÂÂ free_pagetable(pte_page(*pte), 0);
>> +ÂÂÂÂÂÂÂ spin_lock(&init_mm.page_table_lock);
>> +ÂÂÂÂÂÂÂ pte_clear(&init_mm, addr, pte);
>> +ÂÂÂÂÂÂÂ spin_unlock(&init_mm.page_table_lock);
>> +ÂÂÂ }
>> +}
>> +
>> +static void __meminit
>> +remove_pmd_table(pmd_t *pmd_start, unsigned long addr,
>> +ÂÂÂÂÂÂÂÂÂÂÂ unsigned long end, bool direct)
>> +{
>> +ÂÂÂ unsigned long next;
>> +ÂÂÂ pte_t *pte_base;
>> +ÂÂÂ pmd_t *pmd;
>> +
>> +ÂÂÂ pmd = pmd_start + pmd_index(addr);
>> +ÂÂÂ for (; addr < end; addr = next, pmd++) {
>> +ÂÂÂÂÂÂÂ next = pmd_addr_end(addr, end);
>> +ÂÂÂÂÂÂÂ if (!pmd_present(*pmd))
>> +ÂÂÂÂÂÂÂÂÂÂÂ continue;
>> +
>> +ÂÂÂÂÂÂÂ if (pmd_large(*pmd)) {
>> +ÂÂÂÂÂÂÂÂÂÂÂ if (!direct)
>> +ÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂ free_pagetable(pmd_page(*pmd),
>> +ÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂ get_order(PMD_SIZE));
>> +ÂÂÂÂÂÂÂÂÂÂÂ spin_lock(&init_mm.page_table_lock);
>> +ÂÂÂÂÂÂÂÂÂÂÂ pmd_clear(pmd);
>> +ÂÂÂÂÂÂÂÂÂÂÂ spin_unlock(&init_mm.page_table_lock);
>> +ÂÂÂÂÂÂÂÂÂÂÂ continue;
>> +ÂÂÂÂÂÂÂ }
>> +ÂÂÂÂÂÂÂ pte_base = pte_offset_kernel(pmd, 0UL);
>> +ÂÂÂÂÂÂÂ remove_pte_table(pte_base, addr, next, direct);
>> +ÂÂÂÂÂÂÂ free_pte_table(pte_base, pmd, direct);
>> +ÂÂÂ }
>> +}
>> +
>> +static void __meminit
>> +remove_pud_table(pud_t *pud_start, unsigned long addr,
>> +ÂÂÂÂÂÂÂÂÂÂÂ unsigned long end, bool direct)
>> +{
>> +ÂÂÂ unsigned long next;
>> +ÂÂÂ pmd_t *pmd_base;
>> +ÂÂÂ pud_t *pud;
>> +
>> +ÂÂÂ pud = pud_start + pud_index(addr);
>> +ÂÂÂ for (; addr < end; addr = next, pud++) {
>> +ÂÂÂÂÂÂÂ next = pud_addr_end(addr, end);
>> +ÂÂÂÂÂÂÂ if (!pud_present(*pud))
>> +ÂÂÂÂÂÂÂÂÂÂÂ continue;
>> +
>> +ÂÂÂÂÂÂÂ if (pud_large(*pud)) {
>> +ÂÂÂÂÂÂÂÂÂÂÂ if (!direct)
>> +ÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂ free_pagetable(pud_page(*pud),
>> +ÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂ get_order(PUD_SIZE));
>> +ÂÂÂÂÂÂÂÂÂÂÂ spin_lock(&init_mm.page_table_lock);
>> +ÂÂÂÂÂÂÂÂÂÂÂ pud_clear(pud);
>> +ÂÂÂÂÂÂÂÂÂÂÂ spin_unlock(&init_mm.page_table_lock);
>> +ÂÂÂÂÂÂÂÂÂÂÂ continue;
>> +ÂÂÂÂÂÂÂ }
>> +ÂÂÂÂÂÂÂ pmd_base = pmd_offset(pud, 0UL);
>> +ÂÂÂÂÂÂÂ remove_pmd_table(pmd_base, addr, next, direct);
>> +ÂÂÂÂÂÂÂ free_pmd_table(pmd_base, pud, direct);
>> +ÂÂÂ }
>> +}
>> +
>> +static void __meminit
>> +remove_pagetable(unsigned long start, unsigned long end, bool direct)
>> +{
>> +ÂÂÂ unsigned long addr, next;
>> +ÂÂÂ pud_t *pud_base;
>> +ÂÂÂ pgd_t *pgd;
>> +
>> +ÂÂÂ for (addr = start; addr < end; addr = next) {
>> +ÂÂÂÂÂÂÂ next = pgd_addr_end(addr, end);
>> +ÂÂÂÂÂÂÂ pgd = pgd_offset_k(addr);
>> +ÂÂÂÂÂÂÂ if (!pgd_present(*pgd))
>> +ÂÂÂÂÂÂÂÂÂÂÂ continue;
>> +
>> +ÂÂÂÂÂÂÂ pud_base = pud_offset(pgd, 0UL);
>> +ÂÂÂÂÂÂÂ remove_pud_table(pud_base, addr, next, direct);
>> +ÂÂÂÂÂÂÂ free_pud_table(pud_base, pgd, direct);
>> +ÂÂÂ }
>> +ÂÂÂ flush_tlb_kernel_range(start, end);
>> +}
>> +#endif
>> +
>> Â #ifdef CONFIG_SPARSEMEM_VMEMMAP
>> Â #if !ARM64_SWAPPER_USES_SECTION_MAPS
>> Â int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
>> @@ -758,9 +950,12 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
>> ÂÂÂÂÂ return 0;
>> Â }
>> Â #endifÂÂÂ /* CONFIG_ARM64_64K_PAGES */
>> -void vmemmap_free(unsigned long start, unsigned long end,
>> +void __ref vmemmap_free(unsigned long start, unsigned long end,
>
> Why is the __ref needed? Presumably it's avoidable by addressing the __meminit thing above.

Right.

>
>> ÂÂÂÂÂÂÂÂÂ struct vmem_altmap *altmap)
>> Â {
>> +#ifdef CONFIG_MEMORY_HOTPLUG
>> +ÂÂÂ remove_pagetable(start, end, false);
>> +#endif
>> Â }
>> Â #endifÂÂÂ /* CONFIG_SPARSEMEM_VMEMMAP */
>> Â @@ -1046,10 +1241,16 @@ int p4d_free_pud_page(p4d_t *p4d, unsigned long addr)
>> Â }
>> Â Â #ifdef CONFIG_MEMORY_HOTPLUG
>> +static void __remove_pgd_mapping(pgd_t *pgdir, unsigned long start, u64 size)
>> +{
>> +ÂÂÂ WARN_ON(pgdir != init_mm.pgd);
>> +ÂÂÂ remove_pagetable(start, start + size, true);
>> +}
>> +
>> Â int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap,
>> ÂÂÂÂÂÂÂÂÂÂÂÂÂ bool want_memblock)
>> Â {
>> -ÂÂÂ int flags = 0;
>> +ÂÂÂ int flags = 0, ret = 0;
>
> Initialising ret here is unnecessary.

Sure. Will change.