Re: [PATCH v5 3/6] mm: make alloc_contig_range work at pageblock granularity

From: Zi Yan
Date: Mon Feb 14 2022 - 11:03:28 EST


On 14 Feb 2022, at 2:59, Christophe Leroy wrote:

> Le 11/02/2022 à 17:41, Zi Yan a écrit :
>> From: Zi Yan <ziy@xxxxxxxxxx>
>>
>> alloc_contig_range() worked at MAX_ORDER-1 granularity to avoid merging
>> pageblocks with different migratetypes. It might unnecessarily convert
>> extra pageblocks at the beginning and at the end of the range. Change
>> alloc_contig_range() to work at pageblock granularity.
>>
>> Special handling is needed for free pages and in-use pages across the
>> boundaries of the range specified by alloc_contig_range(), because these
>> partially isolated pages cause free page accounting issues. The free
>> pages will be split and freed into separate migratetype lists; the
>> in-use pages will be migrated then the freed pages will be handled.
>>
>> Signed-off-by: Zi Yan <ziy@xxxxxxxxxx>
>> ---
>> include/linux/page-isolation.h | 2 +-
>> mm/internal.h | 3 +
>> mm/memory_hotplug.c | 3 +-
>> mm/page_alloc.c | 235 +++++++++++++++++++++++++--------
>> mm/page_isolation.c | 33 ++++-
>> 5 files changed, 211 insertions(+), 65 deletions(-)
>>
>> diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h
>> index 4ef7be6def83..78ff940cc169 100644
>> --- a/include/linux/page-isolation.h
>> +++ b/include/linux/page-isolation.h
>> @@ -54,7 +54,7 @@ int move_freepages_block(struct zone *zone, struct page *page,
>> */
>> int
>> start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
>> - unsigned migratetype, int flags);
>> + unsigned migratetype, int flags, gfp_t gfp_flags);
>>
>> /*
>> * Changes MIGRATE_ISOLATE to MIGRATE_MOVABLE.
>> diff --git a/mm/internal.h b/mm/internal.h
>> index 0d240e876831..509cbdc25992 100644
>> --- a/mm/internal.h
>> +++ b/mm/internal.h
>> @@ -319,6 +319,9 @@ isolate_freepages_range(struct compact_control *cc,
>> int
>> isolate_migratepages_range(struct compact_control *cc,
>> unsigned long low_pfn, unsigned long end_pfn);
>> +
>> +int
>> +isolate_single_pageblock(unsigned long boundary_pfn, gfp_t gfp_flags, int isolate_before_boundary);
>> #endif
>> int find_suitable_fallback(struct free_area *area, unsigned int order,
>> int migratetype, bool only_stealable, bool *can_steal);
>> diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
>> index ce68098832aa..82406d2f3e46 100644
>> --- a/mm/memory_hotplug.c
>> +++ b/mm/memory_hotplug.c
>> @@ -1863,7 +1863,8 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages,
>> /* set above range as isolated */
>> ret = start_isolate_page_range(start_pfn, end_pfn,
>> MIGRATE_MOVABLE,
>> - MEMORY_OFFLINE | REPORT_FAILURE);
>> + MEMORY_OFFLINE | REPORT_FAILURE,
>> + GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL);
>> if (ret) {
>> reason = "failure to isolate range";
>> goto failed_removal_pcplists_disabled;
>> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
>> index 62ef78f3d771..7a4fa21aea5c 100644
>> --- a/mm/page_alloc.c
>> +++ b/mm/page_alloc.c
>> @@ -8985,7 +8985,7 @@ static inline void alloc_contig_dump_pages(struct list_head *page_list)
>> #endif
>>
>> /* [start, end) must belong to a single zone. */
>> -static int __alloc_contig_migrate_range(struct compact_control *cc,
>> +int __alloc_contig_migrate_range(struct compact_control *cc,
>> unsigned long start, unsigned long end)
>> {
>> /* This function is based on compact_zone() from compaction.c. */
>> @@ -9043,6 +9043,167 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
>> return 0;
>> }
>>
>> +/**
>> + * split_free_page() -- split a free page at split_pfn_offset
>> + * @free_page: the original free page
>> + * @order: the order of the page
>> + * @split_pfn_offset: split offset within the page
>> + *
>> + * It is used when the free page crosses two pageblocks with different migratetypes
>> + * at split_pfn_offset within the page. The split free page will be put into
>> + * separate migratetype lists afterwards. Otherwise, the function achieves
>> + * nothing.
>> + */
>> +static inline void split_free_page(struct page *free_page,
>> + int order, unsigned long split_pfn_offset)
>> +{
>> + struct zone *zone = page_zone(free_page);
>> + unsigned long free_page_pfn = page_to_pfn(free_page);
>> + unsigned long pfn;
>> + unsigned long flags;
>> + int free_page_order;
>> +
>> + spin_lock_irqsave(&zone->lock, flags);
>> + del_page_from_free_list(free_page, zone, order);
>> + for (pfn = free_page_pfn;
>> + pfn < free_page_pfn + (1UL << order);) {
>> + int mt = get_pfnblock_migratetype(pfn_to_page(pfn), pfn);
>> +
>> + free_page_order = order_base_2(split_pfn_offset);
>> + __free_one_page(pfn_to_page(pfn), pfn, zone, free_page_order,
>> + mt, FPI_NONE);
>> + pfn += 1UL << free_page_order;
>> + split_pfn_offset -= (1UL << free_page_order);
>> + /* we have done the first part, now switch to second part */
>> + if (split_pfn_offset == 0)
>> + split_pfn_offset = (1UL << order) - (pfn - free_page_pfn);
>> + }
>> + spin_unlock_irqrestore(&zone->lock, flags);
>> +}
>> +
>> +/**
>> + * isolate_single_pageblock() -- tries to isolate a pageblock that might be
>> + * within a free or in-use page.
>> + * @boundary_pfn: pageblock-aligned pfn that a page might cross
>> + * @gfp_flags: GFP flags used for migrating pages
>> + * @isolate_before_boundary: isolate the pageblock before (1) or after (0)
>> + * the boundary_pfn
>> + *
>> + * Free and in-use pages can be as big as MAX_ORDER-1 and contain more than one
>> + * pageblock. When not all pageblocks within a page are isolated at the same
>> + * time, free page accounting can go wrong. For example, in the case of
>> + * MAX_ORDER-1 = pageblock_order + 1, a MAX_ORDER-1 page has two pageblocks.
>> + * [ MAX_ORDER-1 ]
>> + * [ pageblock0 | pageblock1 ]
>> + * When either pageblock is isolated, if it is a free page, the page is not
>> + * split into separate migratetype lists, as it is supposed to be; if it is an
>> + * in-use page and freed later, __free_one_page() does not split the free page
>> + * either. The function handles this by splitting the free page or migrating
>> + * the in-use page then splitting the free page.
>> + */
>> +int isolate_single_pageblock(unsigned long boundary_pfn, gfp_t gfp_flags,
>> + int isolate_before_boundary)
>
> Do you need such big param name ?

I am happy to take any suggestion.

>
> See
> https://www.kernel.org/doc/html/latest/process/coding-style.html?highlight=style#naming
>
> isolate_before_boundary could probably be shorter.

isolate_before instead?

>
> And should be a bool by the way.

Sure.
>
>> +{
>> + unsigned char saved_mt;
>> + /*
>> + * scan at max(MAX_ORDER_NR_PAGES, pageblock_nr_pages) aligned range to
>> + * avoid isolating pageblocks belonging to a bigger free or in-use page
>> + */
>> + unsigned long start_pfn = pfn_max_align_down(boundary_pfn);
>> + unsigned long isolated_pageblock_pfn;
>
> Variable name too long.
>
>> + unsigned long pfn;
>> +
>> + VM_BUG_ON(!IS_ALIGNED(boundary_pfn, pageblock_nr_pages));
>> +
>> + if (isolate_before_boundary)
>> + isolated_pageblock_pfn = boundary_pfn - pageblock_nr_pages;
>> + else
>> + isolated_pageblock_pfn = boundary_pfn;
>> +
>> + saved_mt = get_pageblock_migratetype(pfn_to_page(isolated_pageblock_pfn));
>> + set_pageblock_migratetype(pfn_to_page(isolated_pageblock_pfn), MIGRATE_ISOLATE);
>> +
>> + for (pfn = start_pfn; pfn < boundary_pfn;) {
>
> This loop is a bit long and deep. Isn't there a way to put what's in "if
> (PageHuge(page) || PageTransCompound(page))" into a sub-function ?
>

Let me give it a try.

> See
> https://www.kernel.org/doc/html/latest/process/coding-style.html?highlight=style#functions
>

Thanks for the review.


>> + struct page *page = pfn_to_page(pfn);
>> +
>> + /*
>> + * start_pfn is max(MAX_ORDER_NR_PAGES, pageblock_nr_pages)
>> + * aligned, if there is any free pages in [start_pfn, boundary_pfn),
>> + * its head page will always be in the range.
>> + */
>> + if (PageBuddy(page)) {
>> + int order = buddy_order(page);
>> +
>> + if (pfn + (1UL << order) > boundary_pfn)
>> + split_free_page(page, order, boundary_pfn - pfn);
>> + pfn += (1UL << order);
>> + continue;
>> + }
>> + /*
>> + * migrate compound pages then let the free page handling code
>> + * above do the rest
>> + */
>> + if (PageHuge(page) || PageTransCompound(page)) {
>> + unsigned long nr_pages = compound_nr(page);
>> + int order = compound_order(page);
>> + struct page *head = compound_head(page);
>> + unsigned long head_pfn = page_to_pfn(head);
>> +
>> + if (head_pfn + nr_pages >= boundary_pfn) {
>> + int ret;
>> + struct compact_control cc = {
>> + .nr_migratepages = 0,
>> + .order = -1,
>> + .zone = page_zone(pfn_to_page(head_pfn)),
>> + .mode = MIGRATE_SYNC,
>> + .ignore_skip_hint = true,
>> + .no_set_skip_hint = true,
>> + .gfp_mask = current_gfp_context(gfp_flags),
>> + .alloc_contig = true,
>> + };
>> +
>> + INIT_LIST_HEAD(&cc.migratepages);
>> +
>> + ret = __alloc_contig_migrate_range(&cc, head_pfn,
>> + head_pfn + nr_pages);
>> +
>> + if (ret) {
>> + /* restore the original migratetype */
>> + set_pageblock_migratetype(
>> + pfn_to_page(isolated_pageblock_pfn),
>> + saved_mt);
>> + return -EBUSY;
>> + }
>> + /*
>> + * reset pfn, let the free page handling code
>> + * above split the free page to the right
>> + * migratetype list.
>> + *
>> + * head_pfn is not used here as a hugetlb page
>> + * order can be bigger than MAX_ORDER-1, but
>> + * after it is freed, the free page order is not.
>> + * Use pfn within the range to find the head of
>> + * the free page and reset order to 0 if a hugetlb
>> + * page with >MAX_ORDER-1 order is encountered.
>> + */
>> + if (order > MAX_ORDER-1)
>> + order = 0;
>> + while (!PageBuddy(pfn_to_page(pfn))) {
>> + order++;
>> + pfn &= ~0UL << order;
>> + }
>> + continue;
>> + }
>> + pfn += nr_pages;
>> + continue;
>> + }
>> +
>> + pfn++;
>> + }
>> + return 0;
>> +}
>> +
>> +
>> /**
>> * alloc_contig_range() -- tries to allocate given range of pages
>> * @start: start PFN to allocate
>> @@ -9067,8 +9228,9 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
>> int alloc_contig_range(unsigned long start, unsigned long end,
>> unsigned migratetype, gfp_t gfp_mask)
>> {
>> - unsigned long outer_start, outer_end;
>> - unsigned int order;
>> + unsigned long outer_end;
>> + unsigned long alloc_start = ALIGN_DOWN(start, pageblock_nr_pages);
>> + unsigned long alloc_end = ALIGN(end, pageblock_nr_pages);
>> int ret = 0;
>>
>> struct compact_control cc = {
>> @@ -9087,14 +9249,11 @@ int alloc_contig_range(unsigned long start, unsigned long end,
>> * What we do here is we mark all pageblocks in range as
>> * MIGRATE_ISOLATE. Because pageblock and max order pages may
>> * have different sizes, and due to the way page allocator
>> - * work, we align the range to biggest of the two pages so
>> - * that page allocator won't try to merge buddies from
>> - * different pageblocks and change MIGRATE_ISOLATE to some
>> - * other migration type.
>> + * work, start_isolate_page_range() has special handling for this.
>> *
>> * Once the pageblocks are marked as MIGRATE_ISOLATE, we
>> * migrate the pages from an unaligned range (ie. pages that
>> - * we are interested in). This will put all the pages in
>> + * we are interested in). This will put all the pages in
>> * range back to page allocator as MIGRATE_ISOLATE.
>> *
>> * When this is done, we take the pages in range from page
>> @@ -9107,9 +9266,9 @@ int alloc_contig_range(unsigned long start, unsigned long end,
>> * put back to page allocator so that buddy can use them.
>> */
>>
>> - ret = start_isolate_page_range(start, end, migratetype, 0);
>> + ret = start_isolate_page_range(start, end, migratetype, 0, gfp_mask);
>> if (ret)
>> - return ret;
>> + goto done;
>>
>> drain_all_pages(cc.zone);
>>
>> @@ -9128,68 +9287,28 @@ int alloc_contig_range(unsigned long start, unsigned long end,
>> goto done;
>> ret = 0;
>>
>> - /*
>> - * Pages from [start, end) are within a MAX_ORDER_NR_PAGES
>> - * aligned blocks that are marked as MIGRATE_ISOLATE. What's
>> - * more, all pages in [start, end) are free in page allocator.
>> - * What we are going to do is to allocate all pages from
>> - * [start, end) (that is remove them from page allocator).
>> - *
>> - * The only problem is that pages at the beginning and at the
>> - * end of interesting range may be not aligned with pages that
>> - * page allocator holds, ie. they can be part of higher order
>> - * pages. Because of this, we reserve the bigger range and
>> - * once this is done free the pages we are not interested in.
>> - *
>> - * We don't have to hold zone->lock here because the pages are
>> - * isolated thus they won't get removed from buddy.
>> - */
>> -
>> - order = 0;
>> - outer_start = start;
>> - while (!PageBuddy(pfn_to_page(outer_start))) {
>> - if (++order >= MAX_ORDER) {
>> - outer_start = start;
>> - break;
>> - }
>> - outer_start &= ~0UL << order;
>> - }
>> -
>> - if (outer_start != start) {
>> - order = buddy_order(pfn_to_page(outer_start));
>> -
>> - /*
>> - * outer_start page could be small order buddy page and
>> - * it doesn't include start page. Adjust outer_start
>> - * in this case to report failed page properly
>> - * on tracepoint in test_pages_isolated()
>> - */
>> - if (outer_start + (1UL << order) <= start)
>> - outer_start = start;
>> - }
>> -
>> /* Make sure the range is really isolated. */
>> - if (test_pages_isolated(outer_start, end, 0)) {
>> + if (test_pages_isolated(alloc_start, alloc_end, 0)) {
>> ret = -EBUSY;
>> goto done;
>> }
>>
>> /* Grab isolated pages from freelists. */
>> - outer_end = isolate_freepages_range(&cc, outer_start, end);
>> + outer_end = isolate_freepages_range(&cc, alloc_start, alloc_end);
>> if (!outer_end) {
>> ret = -EBUSY;
>> goto done;
>> }
>>
>> /* Free head and tail (if any) */
>> - if (start != outer_start)
>> - free_contig_range(outer_start, start - outer_start);
>> - if (end != outer_end)
>> - free_contig_range(end, outer_end - end);
>> + if (start != alloc_start)
>> + free_contig_range(alloc_start, start - alloc_start);
>> + if (end != alloc_end)
>> + free_contig_range(end, alloc_end - end);
>>
>> done:
>> - undo_isolate_page_range(pfn_max_align_down(start),
>> - pfn_max_align_up(end), migratetype);
>> + undo_isolate_page_range(alloc_start,
>> + alloc_end, migratetype);
>> return ret;
>> }
>> EXPORT_SYMBOL(alloc_contig_range);
>> diff --git a/mm/page_isolation.c b/mm/page_isolation.c
>> index 64d093ab83ec..0256d5e1032c 100644
>> --- a/mm/page_isolation.c
>> +++ b/mm/page_isolation.c
>> @@ -285,6 +285,8 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
>> * and PageOffline() pages.
>> * REPORT_FAILURE - report details about the failure to
>> * isolate the range
>> + * @gfp_flags: GFP flags used for migrating pages that sit across the
>> + * range boundaries.
>> *
>> * Making page-allocation-type to be MIGRATE_ISOLATE means free pages in
>> * the range will never be allocated. Any free pages and pages freed in the
>> @@ -293,6 +295,10 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
>> * pages in the range finally, the caller have to free all pages in the range.
>> * test_page_isolated() can be used for test it.
>> *
>> + * The function first tries to isolate the pageblocks at the beginning and end
>> + * of the range, since there might be pages across the range boundaries.
>> + * Afterwards, it isolates the rest of the range.
>> + *
>> * There is no high level synchronization mechanism that prevents two threads
>> * from trying to isolate overlapping ranges. If this happens, one thread
>> * will notice pageblocks in the overlapping range already set to isolate.
>> @@ -313,21 +319,38 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
>> * Return: 0 on success and -EBUSY if any part of range cannot be isolated.
>> */
>> int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
>> - unsigned migratetype, int flags)
>> + unsigned migratetype, int flags, gfp_t gfp_flags)
>> {
>> unsigned long pfn;
>> struct page *page;
>> + /* isolation is done at page block granularity */
>> + unsigned long isolate_start = ALIGN_DOWN(start_pfn, pageblock_nr_pages);
>> + unsigned long isolate_end = ALIGN(end_pfn, pageblock_nr_pages);
>> + int ret;
>>
>> - unsigned long isolate_start = pfn_max_align_down(start_pfn);
>> - unsigned long isolate_end = pfn_max_align_up(end_pfn);
>> + /* isolate [isolate_start, isolate_start + pageblock_nr_pages) pageblock */
>> + ret = isolate_single_pageblock(isolate_start, gfp_flags, 0);
>> + if (ret)
>> + return ret;
>> +
>> + /* isolate [isolate_end - pageblock_nr_pages, isolate_end) pageblock */
>> + ret = isolate_single_pageblock(isolate_end, gfp_flags, 1);
>> + if (ret) {
>> + unset_migratetype_isolate(pfn_to_page(isolate_start), migratetype);
>> + return ret;
>> + }
>>
>> - for (pfn = isolate_start;
>> - pfn < isolate_end;
>> + /* skip isolated pageblocks at the beginning and end */
>> + for (pfn = isolate_start + pageblock_nr_pages;
>> + pfn < isolate_end - pageblock_nr_pages;
>> pfn += pageblock_nr_pages) {
>> page = __first_valid_page(pfn, pageblock_nr_pages);
>> if (page && set_migratetype_isolate(page, migratetype, flags,
>> start_pfn, end_pfn)) {
>> undo_isolate_page_range(isolate_start, pfn, migratetype);
>> + unset_migratetype_isolate(
>> + pfn_to_page(isolate_end - pageblock_nr_pages),
>> + migratetype);
>> return -EBUSY;
>> }
>> }

--
Best Regards,
Yan, Zi

Attachment: signature.asc
Description: OpenPGP digital signature