[RFC PATCH 05/31] mem_defrag: split a THP if either src or dst is THP only.

From: Zi Yan
Date: Fri Feb 15 2019 - 17:11:43 EST


From: Zi Yan <ziy@xxxxxxxxxx>

During the process of generating physically contiguous memory, it is
possible that we want to move a THP to a location currently occupied by
512 base pages. The exchange pages mechanism does not yet implement
exchanging a THP with 512 base pages in one step. Instead, we can split
the THP and exchange the resulting 512 base pages individually. This
increases the chance of creating a large contiguous region. A split THP
can be promoted back once all 512 subpages have been moved to the
destination, or if none of its subpages was moved. In-place THP
promotion will be introduced later in this patch series.

Signed-off-by: Zi Yan <ziy@xxxxxxxxxx>
---
mm/internal.h | 4 ++
mm/mem_defrag.c | 155 +++++++++++++++++++++++++++++++++++++-----------
mm/page_alloc.c | 45 ++++++++++++++
3 files changed, 168 insertions(+), 36 deletions(-)
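
[Reader's note, not part of the patch: the decision this patch adds to
defrag_address_range() reduces to the standalone sketch below. The enum
and function names are illustrative only; in the real code the splits
are done with split_huge_page() under the page lock, and huge zero
pages are skipped since they cannot be split.]

#include <stdbool.h>
#include <stdio.h>

/* Illustrative model: exchange only works like-for-like, so when a
 * THP sits on exactly one side, that side is split first. */
enum defrag_action {
        EXCHANGE_PAGES,         /* src and dst match: exchange directly */
        SPLIT_SRC_AND_RESTART,  /* split the src THP, rescan the range */
        SPLIT_DST_AND_RETRY,    /* split the dst THP, retry this address */
};

static enum defrag_action choose_action(bool src_thp, bool dst_thp)
{
        if (src_thp == dst_thp)
                return EXCHANGE_PAGES;
        return src_thp ? SPLIT_SRC_AND_RESTART : SPLIT_DST_AND_RETRY;
}

int main(void)
{
        /* THP source vs. base-page destination: split the source. */
        printf("%d\n", choose_action(true, false));
        return 0;
}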

diff --git a/mm/internal.h b/mm/internal.h
index 4fe8d1a4d7bb..70a6ef603e5b 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -574,6 +574,10 @@ void expand(struct zone *zone, struct page *page,
int low, int high, struct free_area *area,
int migratetype);

+int expand_free_page(struct zone *zone, struct page *buddy_head,
+ struct page *page, int buddy_order, int page_order,
+ struct free_area *area, int migratetype);
+
void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
unsigned int alloc_flags);

diff --git a/mm/mem_defrag.c b/mm/mem_defrag.c
index 414909e1c19c..4d458b125c95 100644
--- a/mm/mem_defrag.c
+++ b/mm/mem_defrag.c
@@ -643,6 +643,15 @@ static void exchange_free(struct page *freepage, unsigned long data)
head->num_freepages++;
}

+static bool page_can_migrate(struct page *page)
+{
+        if (PageAnon(page))
+                return true;
+        if (page_mapping(page))
+                return true;
+        return false;
+}
+
int defrag_address_range(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long start_addr, unsigned long end_addr,
struct page *anchor_page, unsigned long page_vaddr,
@@ -655,6 +664,7 @@ int defrag_address_range(struct mm_struct *mm, struct vm_area_struct *vma,
int not_present = 0;
bool src_thp = false;

+restart:
for (scan_address = start_addr; scan_address < end_addr;
scan_address += page_size) {
struct page *scan_page;
@@ -683,6 +693,8 @@ int defrag_address_range(struct mm_struct *mm, struct vm_area_struct *vma,
if ((scan_page == compound_head(scan_page)) &&
PageTransHuge(scan_page) && !PageHuge(scan_page))
src_thp = true;
+ else
+ src_thp = false;

/* Allow THPs */
if (PageCompound(scan_page) && !src_thp) {
@@ -720,13 +732,17 @@ int defrag_address_range(struct mm_struct *mm, struct vm_area_struct *vma,
}

retry_defrag:
- /* migrate */
- if (PageBuddy(dest_page)) {
+ /* free pages */
+ if (page_count(dest_page) == 0 && dest_page->mapping == NULL) {
+ int buddy_page_order = 0;
+ unsigned long pfn = page_to_pfn(dest_page);
+ unsigned long buddy_pfn;
+ struct page *buddy = dest_page;
struct zone *zone = page_zone(dest_page);
spinlock_t *zone_lock = &zone->lock;
unsigned long zone_lock_flags;
unsigned long free_page_order = 0;
- int err = 0;
+ int err = 0, expand_err = 0;
struct exchange_alloc_head exchange_alloc_head = {0};
int migratetype = get_pageblock_migratetype(dest_page);

@@ -734,32 +750,77 @@ int defrag_address_range(struct mm_struct *mm, struct vm_area_struct *vma,
INIT_LIST_HEAD(&exchange_alloc_head.freelist);
INIT_LIST_HEAD(&exchange_alloc_head.migratepage_list);

- count_vm_events(MEM_DEFRAG_DST_FREE_PAGES, 1<<scan_page_order);
+ /* not managed pages */
+ if (!dest_page->flags) {
+ failed += 1;
+ defrag_stats->dst_out_of_bound_failed += 1;

+ defrag_stats->not_defrag_vpn = scan_address + page_size;
+ goto quit_defrag;
+ }
+ /* spill order-0 pages to buddy allocator from pcplist */
+ if (!PageBuddy(dest_page) && !page_drained) {
+ drain_all_pages(zone);
+ page_drained = 1;
+ goto retry_defrag;
+ }
/* lock page_zone(dest_page)->lock */
spin_lock_irqsave(zone_lock, zone_lock_flags);

- if (!PageBuddy(dest_page)) {
+ while (!PageBuddy(buddy) && buddy_page_order < MAX_ORDER) {
+ buddy_pfn = pfn & ~((1<<buddy_page_order) - 1);
+ buddy = dest_page - (pfn - buddy_pfn);
+ buddy_page_order++;
+ }
+ if (!PageBuddy(buddy)) {
err = -EINVAL;
goto freepage_isolate_fail;
}

- free_page_order = page_order(dest_page);
+ count_vm_events(MEM_DEFRAG_DST_FREE_PAGES, 1<<scan_page_order);

- /* fail early if not enough free pages */
- if (free_page_order < scan_page_order) {
+ free_page_order = page_order(buddy);
+
+ /* caught some transient-state page */
+ if (free_page_order < buddy_page_order) {
err = -ENOMEM;
goto freepage_isolate_fail;
}

+ /* fail early if not enough free pages */
+ if (free_page_order < scan_page_order) {
+ int ret;
+
+ spin_unlock_irqrestore(zone_lock, zone_lock_flags);
+
+ if (is_huge_zero_page(scan_page)) {
+ err = -ENOMEM;
+ goto freepage_isolate_fail_unlocked;
+ }
+ get_page(scan_page);
+ lock_page(scan_page);
+ ret = split_huge_page(scan_page);
+ unlock_page(scan_page);
+ put_page(scan_page);
+ if (ret) {
+ err = -ENOMEM;
+ goto freepage_isolate_fail_unlocked;
+ } else {
+ goto restart;
+ }
+ }
+
/* __isolate_free_page() */
- err = isolate_free_page_no_wmark(dest_page, free_page_order);
+ err = isolate_free_page_no_wmark(buddy, free_page_order);
if (!err)
goto freepage_isolate_fail;

- expand(zone, dest_page, scan_page_order, free_page_order,
+ expand_err = expand_free_page(zone, buddy, dest_page,
+ free_page_order, scan_page_order,
&(zone->free_area[free_page_order]),
migratetype);
+ if (expand_err)
+ goto freepage_isolate_fail;

if (!is_migrate_isolate(migratetype))
__mod_zone_freepage_state(zone, -(1UL << scan_page_order),
@@ -778,7 +839,7 @@ int defrag_address_range(struct mm_struct *mm, struct vm_area_struct *vma,

freepage_isolate_fail:
spin_unlock_irqrestore(zone_lock, zone_lock_flags);
-
+freepage_isolate_fail_unlocked:
if (err < 0) {
failed += (page_size/PAGE_SIZE);
defrag_stats->dst_isolate_free_failed += (page_size/PAGE_SIZE);
@@ -844,6 +905,8 @@ int defrag_address_range(struct mm_struct *mm, struct vm_area_struct *vma,
if ((dest_page == compound_head(dest_page)) &&
PageTransHuge(dest_page) && !PageHuge(dest_page))
dst_thp = true;
+ else
+ dst_thp = false;

if (PageCompound(dest_page) && !dst_thp) {
failed += get_contig_page_size(dest_page);
@@ -854,37 +917,56 @@ int defrag_address_range(struct mm_struct *mm, struct vm_area_struct *vma,
}

if (src_thp != dst_thp) {
- failed += get_contig_page_size(scan_page);
- if (src_thp && !dst_thp)
- defrag_stats->src_thp_dst_not_failed +=
- page_size/PAGE_SIZE;
- else /* !src_thp && dst_thp */
- defrag_stats->dst_thp_src_not_failed +=
- page_size/PAGE_SIZE;
+ if (src_thp && !dst_thp) {
+ int ret;
+
+ if (!page_can_migrate(dest_page)) {
+ failed += get_contig_page_size(scan_page);
+ defrag_stats->not_defrag_vpn = scan_address + page_size;
+ goto quit_defrag;
+ }

+ get_page(scan_page);
+ lock_page(scan_page);
+ if (!PageCompound(scan_page) || is_huge_zero_page(scan_page)) {
+ ret = 0;
+ src_thp = false;
+ goto split_src_done;
+ }
+ ret = split_huge_page(scan_page);
+split_src_done:
+ unlock_page(scan_page);
+ put_page(scan_page);
+ if (ret)
+ defrag_stats->src_thp_dst_not_failed += page_size/PAGE_SIZE;
+ else
+ goto restart;
+ } else {/* !src_thp && dst_thp */
+ int ret;
+
+ get_page(dest_page);
+ lock_page(dest_page);
+ if (!PageCompound(dest_page) || is_huge_zero_page(dest_page)) {
+ ret = 0;
+ dst_thp = false;
+ goto split_dst_done;
+ }
+ ret = split_huge_page(dest_page);
+split_dst_done:
+ unlock_page(dest_page);
+ put_page(dest_page);
+ if (ret)
+ defrag_stats->dst_thp_src_not_failed += page_size/PAGE_SIZE;
+ else
+ goto retry_defrag;
+ }
+
+ failed += get_contig_page_size(scan_page);
defrag_stats->not_defrag_vpn = scan_address + page_size;
goto quit_defrag;
/*continue;*/
}

- /* free page on pcplist */
- if (page_count(dest_page) == 0) {
- /* not managed pages */
- if (!dest_page->flags) {
- failed += 1;
- defrag_stats->dst_out_of_bound_failed += 1;
-
- defrag_stats->not_defrag_vpn = scan_address + page_size;
- goto quit_defrag;
- }
- /* spill order-0 pages to buddy allocator from pcplist */
- if (!page_drained) {
- drain_all_pages(NULL);
- page_drained = 1;
- goto retry_defrag;
- }
- }
-
if (PageAnon(dest_page)) {
count_vm_events(MEM_DEFRAG_DST_ANON_PAGES,
1<<scan_page_order);
@@ -895,6 +977,7 @@ int defrag_address_range(struct mm_struct *mm, struct vm_area_struct *vma,
1<<scan_page_order);
failed += 1<<scan_page_order;
defrag_stats->dst_anon_failed += 1<<scan_page_order;
+ /*print_page_stats(dest_page, "anonymous page");*/
}
} else if (page_mapping(dest_page)) {
count_vm_events(MEM_DEFRAG_DST_FILE_PAGES,
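
[Reader's note, not part of the patch: the new expand_free_page() below
differs from expand() in that the sub-block kept for the caller can sit
anywhere inside the buddy block, not just at its head; each halving
step frees whichever half does not contain the target page. A
standalone model of that carving loop, with array indices standing in
for struct page pointers (illustrative only, not the kernel code):]

#include <stdio.h>

/* Carve the order-`page_order` block containing index `page` out of a
 * buddy block of 1 << buddy_order pages starting at `head`. Each
 * iteration halves the block and "frees" the half that does not
 * contain `page`, mirroring the loop in expand_free_page(). */
static void carve(unsigned long head, int buddy_order,
                  unsigned long page, int page_order)
{
        unsigned long size = 1UL << buddy_order;

        while (buddy_order > page_order) {
                buddy_order--;
                size >>= 1;
                if (page < head + size) {
                        /* keep the lower half, free the upper half */
                        printf("free order-%d block at %lu\n",
                               buddy_order, head + size);
                } else {
                        /* keep the upper half, free the lower half */
                        printf("free order-%d block at %lu\n",
                               buddy_order, head);
                        head += size;
                }
        }
        printf("kept order-%d block at %lu (contains %lu)\n",
               page_order, head, page);
}

int main(void)
{
        /* keep the order-0 page at offset 5 out of an order-3 block:
         * frees the order-2 block at 0, order-1 at 6, order-0 at 4 */
        carve(0, 3, 5, 0);
        return 0;
}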
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index a35605e0924a..9ba2cdc320f2 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1855,6 +1855,51 @@ inline void expand(struct zone *zone, struct page *page,
}
}

+inline int expand_free_page(struct zone *zone, struct page *buddy_head,
+        struct page *page, int buddy_order, int page_order, struct free_area *area,
+        int migratetype)
+{
+        unsigned long size = 1 << buddy_order;
+
+        if (!(page >= buddy_head && page < (buddy_head + (1<<buddy_order)))) {
+                int mapcount = PageSlab(buddy_head) ? 0 : page_mapcount(buddy_head);
+
+                mapcount = PageSlab(page) ? 0 : page_mapcount(page);
+                __free_one_page(buddy_head, page_to_pfn(buddy_head), zone, buddy_order,
+                        migratetype);
+                return -EINVAL;
+        }
+
+        while (buddy_order > page_order) {
+                struct page *page_to_free;
+
+                area--;
+                buddy_order--;
+                size >>= 1;
+
+                if (page < (buddy_head + size))
+                        page_to_free = buddy_head + size;
+                else {
+                        page_to_free = buddy_head;
+                        buddy_head = buddy_head + size;
+                }
+
+                /*
+                 * Mark as guard pages (or page), that will allow to
+                 * merge back to allocator when buddy will be freed.
+                 * Corresponding page table entries will not be touched,
+                 * pages will stay not present in virtual address space
+                 */
+                if (set_page_guard(zone, page_to_free, buddy_order, migratetype))
+                        continue;
+
+                list_add(&page_to_free->lru, &area->free_list[migratetype]);
+                area->nr_free++;
+                set_page_order(page_to_free, buddy_order);
+        }
+        return 0;
+}
+
static void check_new_page_bad(struct page *page)
{
const char *bad_reason = NULL;
--
2.20.1