Re: [PATCH v2] mm/migrate: correct nr_failed in migrate_pages_sync()

From: Huang, Ying
Date: Tue Oct 17 2023 - 00:58:43 EST


Zi Yan <zi.yan@xxxxxxxx> writes:

> From: Zi Yan <ziy@xxxxxxxxxx>
>
> nr_failed was missing the large folio splits from migrate_pages_batch()
> and can cause a mismatch between migrate_pages() return value and the
> number of not migrated pages, i.e., when the return value of
> migrate_pages() is 0, there are still pages left in the from page list.
> It will happen when a non-PMD THP large folio fails to migrate due to
> -ENOMEM and is split successfully but not all the split pages are
> migrated, migrate_pages_batch() would return non-zero, but
> astats.nr_thp_split = 0. nr_failed would be 0 and returned to the caller
> of migrate_pages(), but the not migrated pages are left in the from page
> list without being added back to LRU lists.
>
> Fix it by adding a new nr_split counter for large folio splits and adding
> it to nr_failed in migrate_pages_sync() after migrate_pages_batch() is
> done.
>
> Fixes: 2ef7dbb26990 ("migrate_pages: try migrate in batch asynchronously firstly")
> Signed-off-by: Zi Yan <ziy@xxxxxxxxxx>
> ---
> include/trace/events/migrate.h | 24 ++++++++++++++----------
> mm/migrate.c | 15 +++++++++++----
> 2 files changed, 25 insertions(+), 14 deletions(-)
>
> diff --git a/include/trace/events/migrate.h b/include/trace/events/migrate.h
> index 061b5128f335..0190ef725b43 100644
> --- a/include/trace/events/migrate.h
> +++ b/include/trace/events/migrate.h
> @@ -49,10 +49,11 @@ TRACE_EVENT(mm_migrate_pages,
>
> TP_PROTO(unsigned long succeeded, unsigned long failed,
> unsigned long thp_succeeded, unsigned long thp_failed,
> - unsigned long thp_split, enum migrate_mode mode, int reason),
> + unsigned long thp_split, unsigned long large_folio_split,
> + enum migrate_mode mode, int reason),
>
> TP_ARGS(succeeded, failed, thp_succeeded, thp_failed,
> - thp_split, mode, reason),
> + thp_split, large_folio_split, mode, reason),
>
> TP_STRUCT__entry(
> __field( unsigned long, succeeded)
> @@ -60,26 +61,29 @@ TRACE_EVENT(mm_migrate_pages,
> __field( unsigned long, thp_succeeded)
> __field( unsigned long, thp_failed)
> __field( unsigned long, thp_split)
> + __field( unsigned long, large_folio_split)
> __field( enum migrate_mode, mode)
> __field( int, reason)
> ),
>
> TP_fast_assign(
> - __entry->succeeded = succeeded;
> - __entry->failed = failed;
> - __entry->thp_succeeded = thp_succeeded;
> - __entry->thp_failed = thp_failed;
> - __entry->thp_split = thp_split;
> - __entry->mode = mode;
> - __entry->reason = reason;
> + __entry->succeeded = succeeded;
> + __entry->failed = failed;
> + __entry->thp_succeeded = thp_succeeded;
> + __entry->thp_failed = thp_failed;
> + __entry->thp_split = thp_split;
> + __entry->large_folio_split = large_folio_split;
> + __entry->mode = mode;
> + __entry->reason = reason;
> ),
>
> - TP_printk("nr_succeeded=%lu nr_failed=%lu nr_thp_succeeded=%lu nr_thp_failed=%lu nr_thp_split=%lu mode=%s reason=%s",
> + TP_printk("nr_succeeded=%lu nr_failed=%lu nr_thp_succeeded=%lu nr_thp_failed=%lu nr_thp_split=%lu nr_split=%lu mode=%s reason=%s",
> __entry->succeeded,
> __entry->failed,
> __entry->thp_succeeded,
> __entry->thp_failed,
> __entry->thp_split,
> + __entry->large_folio_split,
> __print_symbolic(__entry->mode, MIGRATE_MODE),
> __print_symbolic(__entry->reason, MIGRATE_REASON))
> );

I suggest splitting the trace event change into a separate patch, because
this patch will be backported to previous stable kernels. It's more
convenient for users if we keep the trace event unchanged in stable
kernels. Also, the trace event change isn't part of the fix.

> diff --git a/mm/migrate.c b/mm/migrate.c
> index c602bf6dec97..4caf405b6504 100644
> --- a/mm/migrate.c
> +++ b/mm/migrate.c
> @@ -1495,6 +1495,7 @@ struct migrate_pages_stats {
> int nr_thp_succeeded; /* THP migrated successfully */
> int nr_thp_failed; /* THP failed to be migrated */
> int nr_thp_split; /* THP split before migrating */
> + int nr_split; /* Large folio (include THP) split before migrating */
> };
>
> /*
> @@ -1614,6 +1615,7 @@ static int migrate_pages_batch(struct list_head *from,
> int nr_retry_pages = 0;
> int pass = 0;
> bool is_thp = false;
> + bool is_large = false;
> struct folio *folio, *folio2, *dst = NULL, *dst2;
> int rc, rc_saved = 0, nr_pages;
> LIST_HEAD(unmap_folios);
> @@ -1629,7 +1631,8 @@ static int migrate_pages_batch(struct list_head *from,
> nr_retry_pages = 0;
>
> list_for_each_entry_safe(folio, folio2, from, lru) {
> - is_thp = folio_test_large(folio) && folio_test_pmd_mappable(folio);
> + is_large = folio_test_large(folio);
> + is_thp = is_large && folio_test_pmd_mappable(folio);
> nr_pages = folio_nr_pages(folio);
>
> cond_resched();
> @@ -1649,6 +1652,7 @@ static int migrate_pages_batch(struct list_head *from,
> stats->nr_thp_failed++;
> if (!try_split_folio(folio, split_folios)) {
> stats->nr_thp_split++;
> + stats->nr_split++;
> continue;
> }
> stats->nr_failed_pages += nr_pages;
> @@ -1677,11 +1681,12 @@ static int migrate_pages_batch(struct list_head *from,
> nr_failed++;
> stats->nr_thp_failed += is_thp;
> /* Large folio NUMA faulting doesn't split to retry. */
> - if (folio_test_large(folio) && !nosplit) {
> + if (is_large && !nosplit) {

It appears that there's only one user of "is_large"? If so, it seems
unnecessary to introduce another variable. But this isn't a big issue.

> int ret = try_split_folio(folio, split_folios);
>
> if (!ret) {
> stats->nr_thp_split += is_thp;
> + stats->nr_split++;
> break;
> } else if (reason == MR_LONGTERM_PIN &&
> ret == -EAGAIN) {
> @@ -1827,6 +1832,7 @@ static int migrate_pages_sync(struct list_head *from, new_folio_t get_new_folio,
> stats->nr_succeeded += astats.nr_succeeded;
> stats->nr_thp_succeeded += astats.nr_thp_succeeded;
> stats->nr_thp_split += astats.nr_thp_split;
> + stats->nr_split += astats.nr_split;
> if (rc < 0) {
> stats->nr_failed_pages += astats.nr_failed_pages;
> stats->nr_thp_failed += astats.nr_thp_failed;
> @@ -1834,7 +1840,7 @@ static int migrate_pages_sync(struct list_head *from, new_folio_t get_new_folio,
> return rc;
> }
> stats->nr_thp_failed += astats.nr_thp_split;
> - nr_failed += astats.nr_thp_split;
> + nr_failed += astats.nr_split + astats.nr_thp_split;

THP splitting is counted in both .nr_thp_split and .nr_split, so adding
both here would count each THP split twice in nr_failed. We should add
only .nr_split here.

> /*
> * Fall back to migrate all failed folios one by one synchronously. All
> * failed folios except split THPs will be retried, so their failure
> @@ -1969,7 +1975,8 @@ int migrate_pages(struct list_head *from, new_folio_t get_new_folio,
> count_vm_events(THP_MIGRATION_SPLIT, stats.nr_thp_split);
> trace_mm_migrate_pages(stats.nr_succeeded, stats.nr_failed_pages,
> stats.nr_thp_succeeded, stats.nr_thp_failed,
> - stats.nr_thp_split, mode, reason);
> + stats.nr_thp_split, stats.nr_split, mode,
> + reason);
>
> if (ret_succeeded)
> *ret_succeeded = stats.nr_succeeded;

--
Best Regards,
Huang, Ying