Re: [3.6 regression?] THP + migration/compaction livelock (I think)

From: Marc Duponcheel
Date: Wed Nov 14 2012 - 20:14:50 EST


Hi David

Thanks for the changeset

I will test 3.6.6 both without and with the patch this weekend.

Have a nice day

On 2012 Nov 14, David Rientjes wrote:
> On Wed, 14 Nov 2012, Marc Duponcheel wrote:
>
> > Hi all
> >
> > If someone can provide the patches (or teach me how to get them with
> > git; I apologise for not being git savvy), then this weekend I can apply
> > them to 3.6.6 and compare before/after to check whether they fix #49361.
> >
>
> I've backported all the commits that Mel quoted to 3.6.6 and appended them
> to this email as one big patch. It should apply cleanly to your kernel.
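>
> For reference, applying it should just be a matter of something like the
> following from the top of your 3.6.6 source tree (assuming you saved the
> patch portion of this mail as compaction-backport.patch; the filename is
> only a placeholder):
>
>   patch -p1 --dry-run < compaction-backport.patch
>   patch -p1 < compaction-backport.patch
>
> and then rebuilding and installing the kernel as usual.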
>
> Now we are only missing these commits that weren't quoted:
>
> - 1fb3f8ca0e92 ("mm: compaction: capture a suitable high-order page
> immediately when it is made available"), and
>
> - 83fde0f22872 ("mm: vmscan: scale number of pages reclaimed by
> reclaim/compaction based on failures").
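>
> (For completeness: should we end up needing those two as well, one way to
> pull them in, assuming your 3.6.6 tree is a git checkout, would be roughly
>
>   git remote add linus git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
>   git fetch linus
>   git cherry-pick 1fb3f8ca0e92
>   git cherry-pick 83fde0f22872
>
> though they may not cherry-pick cleanly onto 3.6.6.)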
>
> Since your regression is easily reproducible, would it be possible to try
> to reproduce the issue FIRST with 3.6.6 and, if still present as it was in
> 3.6.2, then try reproducing it with the appended patch?
>
> You earlier reported that khugepaged was taking the second-most CPU time
> when this was happening, which initially pointed you to THP, so presumably
> this isn't a case of kswapd spinning at 100%. If both 3.6.6 kernels fail
> (the one with and without the following patch), would it be possible to
> try Mel's suggestion of patching with
>
> - https://lkml.org/lkml/2012/11/5/308 +
> https://lkml.org/lkml/2012/11/12/113
>
> to see if it helps and, if not, reverting the latter and trying
>
> - https://lkml.org/lkml/2012/11/5/308 +
> https://lkml.org/lkml/2012/11/12/151
>
> as the final test? This will certainly help us to find out what needs to
> be backported to 3.6 stable to prevent this issue for other users.
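>
> Mechanically, that would be roughly: save the three patches from those
> lkml.org messages to files, say mel-base.patch, mel-fix-a.patch and
> mel-fix-b.patch (placeholder names), and then
>
>   patch -p1 < mel-base.patch
>   patch -p1 < mel-fix-a.patch
>   # build and test; if that does not help, revert the second one:
>   patch -R -p1 < mel-fix-a.patch
>   patch -p1 < mel-fix-b.patch
>
> rebuilding and retesting after each combination.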
>
> Thanks!
> ---
> include/linux/compaction.h | 15 ++
> include/linux/mmzone.h | 6 +-
> include/linux/pageblock-flags.h | 19 +-
> mm/compaction.c | 450 +++++++++++++++++++++++++--------------
> mm/internal.h | 16 +-
> mm/page_alloc.c | 42 ++--
> mm/vmscan.c | 8 +
> 7 files changed, 366 insertions(+), 190 deletions(-)
>
> diff --git a/include/linux/compaction.h b/include/linux/compaction.h
> --- a/include/linux/compaction.h
> +++ b/include/linux/compaction.h
> @@ -24,6 +24,7 @@ extern unsigned long try_to_compact_pages(struct zonelist *zonelist,
> int order, gfp_t gfp_mask, nodemask_t *mask,
> bool sync, bool *contended);
> extern int compact_pgdat(pg_data_t *pgdat, int order);
> +extern void reset_isolation_suitable(pg_data_t *pgdat);
> extern unsigned long compaction_suitable(struct zone *zone, int order);
>
> /* Do not skip compaction more than 64 times */
> @@ -61,6 +62,16 @@ static inline bool compaction_deferred(struct zone *zone, int order)
> return zone->compact_considered < defer_limit;
> }
>
> +/* Returns true if restarting compaction after many failures */
> +static inline bool compaction_restarting(struct zone *zone, int order)
> +{
> + if (order < zone->compact_order_failed)
> + return false;
> +
> + return zone->compact_defer_shift == COMPACT_MAX_DEFER_SHIFT &&
> + zone->compact_considered >= 1UL << zone->compact_defer_shift;
> +}
> +
> #else
> static inline unsigned long try_to_compact_pages(struct zonelist *zonelist,
> int order, gfp_t gfp_mask, nodemask_t *nodemask,
> @@ -74,6 +85,10 @@ static inline int compact_pgdat(pg_data_t *pgdat, int order)
> return COMPACT_CONTINUE;
> }
>
> +static inline void reset_isolation_suitable(pg_data_t *pgdat)
> +{
> +}
> +
> static inline unsigned long compaction_suitable(struct zone *zone, int order)
> {
> return COMPACT_SKIPPED;
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -369,8 +369,12 @@ struct zone {
> spinlock_t lock;
> int all_unreclaimable; /* All pages pinned */
> #if defined CONFIG_COMPACTION || defined CONFIG_CMA
> - /* pfn where the last incremental compaction isolated free pages */
> + /* Set to true when the PG_migrate_skip bits should be cleared */
> + bool compact_blockskip_flush;
> +
> + /* pfns where compaction scanners should start */
> unsigned long compact_cached_free_pfn;
> + unsigned long compact_cached_migrate_pfn;
> #endif
> #ifdef CONFIG_MEMORY_HOTPLUG
> /* see spanned/present_pages for more description */
> diff --git a/include/linux/pageblock-flags.h b/include/linux/pageblock-flags.h
> --- a/include/linux/pageblock-flags.h
> +++ b/include/linux/pageblock-flags.h
> @@ -30,6 +30,9 @@ enum pageblock_bits {
> PB_migrate,
> PB_migrate_end = PB_migrate + 3 - 1,
> /* 3 bits required for migrate types */
> +#ifdef CONFIG_COMPACTION
> + PB_migrate_skip,/* If set the block is skipped by compaction */
> +#endif /* CONFIG_COMPACTION */
> NR_PAGEBLOCK_BITS
> };
>
> @@ -65,10 +68,22 @@ unsigned long get_pageblock_flags_group(struct page *page,
> void set_pageblock_flags_group(struct page *page, unsigned long flags,
> int start_bitidx, int end_bitidx);
>
> +#ifdef CONFIG_COMPACTION
> +#define get_pageblock_skip(page) \
> + get_pageblock_flags_group(page, PB_migrate_skip, \
> + PB_migrate_skip + 1)
> +#define clear_pageblock_skip(page) \
> + set_pageblock_flags_group(page, 0, PB_migrate_skip, \
> + PB_migrate_skip + 1)
> +#define set_pageblock_skip(page) \
> + set_pageblock_flags_group(page, 1, PB_migrate_skip, \
> + PB_migrate_skip + 1)
> +#endif /* CONFIG_COMPACTION */
> +
> #define get_pageblock_flags(page) \
> - get_pageblock_flags_group(page, 0, NR_PAGEBLOCK_BITS-1)
> + get_pageblock_flags_group(page, 0, PB_migrate_end)
> #define set_pageblock_flags(page, flags) \
> set_pageblock_flags_group(page, flags, \
> - 0, NR_PAGEBLOCK_BITS-1)
> + 0, PB_migrate_end)
>
> #endif /* PAGEBLOCK_FLAGS_H */
> diff --git a/mm/compaction.c b/mm/compaction.c
> --- a/mm/compaction.c
> +++ b/mm/compaction.c
> @@ -50,6 +50,111 @@ static inline bool migrate_async_suitable(int migratetype)
> return is_migrate_cma(migratetype) || migratetype == MIGRATE_MOVABLE;
> }
>
> +#ifdef CONFIG_COMPACTION
> +/* Returns true if the pageblock should be scanned for pages to isolate. */
> +static inline bool isolation_suitable(struct compact_control *cc,
> + struct page *page)
> +{
> + if (cc->ignore_skip_hint)
> + return true;
> +
> + return !get_pageblock_skip(page);
> +}
> +
> +/*
> + * This function is called to clear all cached information on pageblocks that
> + * should be skipped for page isolation when the migrate and free page scanner
> + * meet.
> + */
> +static void __reset_isolation_suitable(struct zone *zone)
> +{
> + unsigned long start_pfn = zone->zone_start_pfn;
> + unsigned long end_pfn = zone->zone_start_pfn + zone->spanned_pages;
> + unsigned long pfn;
> +
> + zone->compact_cached_migrate_pfn = start_pfn;
> + zone->compact_cached_free_pfn = end_pfn;
> + zone->compact_blockskip_flush = false;
> +
> + /* Walk the zone and mark every pageblock as suitable for isolation */
> + for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
> + struct page *page;
> +
> + cond_resched();
> +
> + if (!pfn_valid(pfn))
> + continue;
> +
> + page = pfn_to_page(pfn);
> + if (zone != page_zone(page))
> + continue;
> +
> + clear_pageblock_skip(page);
> + }
> +}
> +
> +void reset_isolation_suitable(pg_data_t *pgdat)
> +{
> + int zoneid;
> +
> + for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
> + struct zone *zone = &pgdat->node_zones[zoneid];
> + if (!populated_zone(zone))
> + continue;
> +
> + /* Only flush if a full compaction finished recently */
> + if (zone->compact_blockskip_flush)
> + __reset_isolation_suitable(zone);
> + }
> +}
> +
> +/*
> + * If no pages were isolated then mark this pageblock to be skipped in the
> + * future. The information is later cleared by __reset_isolation_suitable().
> + */
> +static void update_pageblock_skip(struct compact_control *cc,
> + struct page *page, unsigned long nr_isolated,
> + bool migrate_scanner)
> +{
> + struct zone *zone = cc->zone;
> + if (!page)
> + return;
> +
> + if (!nr_isolated) {
> + unsigned long pfn = page_to_pfn(page);
> + set_pageblock_skip(page);
> +
> + /* Update where compaction should restart */
> + if (migrate_scanner) {
> + if (!cc->finished_update_migrate &&
> + pfn > zone->compact_cached_migrate_pfn)
> + zone->compact_cached_migrate_pfn = pfn;
> + } else {
> + if (!cc->finished_update_free &&
> + pfn < zone->compact_cached_free_pfn)
> + zone->compact_cached_free_pfn = pfn;
> + }
> + }
> +}
> +#else
> +static inline bool isolation_suitable(struct compact_control *cc,
> + struct page *page)
> +{
> + return true;
> +}
> +
> +static void update_pageblock_skip(struct compact_control *cc,
> + struct page *page, unsigned long nr_isolated,
> + bool migrate_scanner)
> +{
> +}
> +#endif /* CONFIG_COMPACTION */
> +
> +static inline bool should_release_lock(spinlock_t *lock)
> +{
> + return need_resched() || spin_is_contended(lock);
> +}
> +
> /*
> * Compaction requires the taking of some coarse locks that are potentially
> * very heavily contended. Check if the process needs to be scheduled or
> @@ -62,7 +167,7 @@ static inline bool migrate_async_suitable(int migratetype)
> static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags,
> bool locked, struct compact_control *cc)
> {
> - if (need_resched() || spin_is_contended(lock)) {
> + if (should_release_lock(lock)) {
> if (locked) {
> spin_unlock_irqrestore(lock, *flags);
> locked = false;
> @@ -70,14 +175,11 @@ static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags,
>
> /* async aborts if taking too long or contended */
> if (!cc->sync) {
> - if (cc->contended)
> - *cc->contended = true;
> + cc->contended = true;
> return false;
> }
>
> cond_resched();
> - if (fatal_signal_pending(current))
> - return false;
> }
>
> if (!locked)
> @@ -91,44 +193,85 @@ static inline bool compact_trylock_irqsave(spinlock_t *lock,
> return compact_checklock_irqsave(lock, flags, false, cc);
> }
>
> +/* Returns true if the page is within a block suitable for migration to */
> +static bool suitable_migration_target(struct page *page)
> +{
> + int migratetype = get_pageblock_migratetype(page);
> +
> + /* Don't interfere with memory hot-remove or the min_free_kbytes blocks */
> + if (migratetype == MIGRATE_ISOLATE || migratetype == MIGRATE_RESERVE)
> + return false;
> +
> + /* If the page is a large free page, then allow migration */
> + if (PageBuddy(page) && page_order(page) >= pageblock_order)
> + return true;
> +
> + /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */
> + if (migrate_async_suitable(migratetype))
> + return true;
> +
> + /* Otherwise skip the block */
> + return false;
> +}
> +
> /*
> * Isolate free pages onto a private freelist. Caller must hold zone->lock.
> * If @strict is true, will abort returning 0 on any invalid PFNs or non-free
> * pages inside of the pageblock (even though it may still end up isolating
> * some pages).
> */
> -static unsigned long isolate_freepages_block(unsigned long blockpfn,
> +static unsigned long isolate_freepages_block(struct compact_control *cc,
> + unsigned long blockpfn,
> unsigned long end_pfn,
> struct list_head *freelist,
> bool strict)
> {
> int nr_scanned = 0, total_isolated = 0;
> - struct page *cursor;
> + struct page *cursor, *valid_page = NULL;
> + unsigned long nr_strict_required = end_pfn - blockpfn;
> + unsigned long flags;
> + bool locked = false;
>
> cursor = pfn_to_page(blockpfn);
>
> - /* Isolate free pages. This assumes the block is valid */
> + /* Isolate free pages. */
> for (; blockpfn < end_pfn; blockpfn++, cursor++) {
> int isolated, i;
> struct page *page = cursor;
>
> - if (!pfn_valid_within(blockpfn)) {
> - if (strict)
> - return 0;
> - continue;
> - }
> nr_scanned++;
> + if (!pfn_valid_within(blockpfn))
> + continue;
> + if (!valid_page)
> + valid_page = page;
> + if (!PageBuddy(page))
> + continue;
> +
> + /*
> + * The zone lock must be held to isolate freepages.
> + * Unfortunately this is a very coarse lock and can be
> + * heavily contended if there are parallel allocations
> + * or parallel compactions. For async compaction do not
> + * spin on the lock and we acquire the lock as late as
> + * possible.
> + */
> + locked = compact_checklock_irqsave(&cc->zone->lock, &flags,
> + locked, cc);
> + if (!locked)
> + break;
> +
> + /* Recheck this is a suitable migration target under lock */
> + if (!strict && !suitable_migration_target(page))
> + break;
>
> - if (!PageBuddy(page)) {
> - if (strict)
> - return 0;
> + /* Recheck this is a buddy page under lock */
> + if (!PageBuddy(page))
> continue;
> - }
>
> /* Found a free page, break it into order-0 pages */
> isolated = split_free_page(page);
> if (!isolated && strict)
> - return 0;
> + break;
> total_isolated += isolated;
> for (i = 0; i < isolated; i++) {
> list_add(&page->lru, freelist);
> @@ -143,6 +286,22 @@ static unsigned long isolate_freepages_block(unsigned long blockpfn,
> }
>
> trace_mm_compaction_isolate_freepages(nr_scanned, total_isolated);
> +
> + /*
> + * If strict isolation is requested by CMA then check that all the
> + * pages requested were isolated. If there were any failures, 0 is
> + * returned and CMA will fail.
> + */
> + if (strict && nr_strict_required > total_isolated)
> + total_isolated = 0;
> +
> + if (locked)
> + spin_unlock_irqrestore(&cc->zone->lock, flags);
> +
> + /* Update the pageblock-skip if the whole pageblock was scanned */
> + if (blockpfn == end_pfn)
> + update_pageblock_skip(cc, valid_page, total_isolated, false);
> +
> return total_isolated;
> }
>
> @@ -160,17 +319,14 @@ static unsigned long isolate_freepages_block(unsigned long blockpfn,
> * a free page).
> */
> unsigned long
> -isolate_freepages_range(unsigned long start_pfn, unsigned long end_pfn)
> +isolate_freepages_range(struct compact_control *cc,
> + unsigned long start_pfn, unsigned long end_pfn)
> {
> - unsigned long isolated, pfn, block_end_pfn, flags;
> - struct zone *zone = NULL;
> + unsigned long isolated, pfn, block_end_pfn;
> LIST_HEAD(freelist);
>
> - if (pfn_valid(start_pfn))
> - zone = page_zone(pfn_to_page(start_pfn));
> -
> for (pfn = start_pfn; pfn < end_pfn; pfn += isolated) {
> - if (!pfn_valid(pfn) || zone != page_zone(pfn_to_page(pfn)))
> + if (!pfn_valid(pfn) || cc->zone != page_zone(pfn_to_page(pfn)))
> break;
>
> /*
> @@ -180,10 +336,8 @@ isolate_freepages_range(unsigned long start_pfn, unsigned long end_pfn)
> block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
> block_end_pfn = min(block_end_pfn, end_pfn);
>
> - spin_lock_irqsave(&zone->lock, flags);
> - isolated = isolate_freepages_block(pfn, block_end_pfn,
> + isolated = isolate_freepages_block(cc, pfn, block_end_pfn,
> &freelist, true);
> - spin_unlock_irqrestore(&zone->lock, flags);
>
> /*
> * In strict mode, isolate_freepages_block() returns 0 if
> @@ -276,7 +430,8 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
> isolate_mode_t mode = 0;
> struct lruvec *lruvec;
> unsigned long flags;
> - bool locked;
> + bool locked = false;
> + struct page *page = NULL, *valid_page = NULL;
>
> /*
> * Ensure that there are not too many pages isolated from the LRU
> @@ -296,23 +451,15 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
>
> /* Time to isolate some pages for migration */
> cond_resched();
> - spin_lock_irqsave(&zone->lru_lock, flags);
> - locked = true;
> for (; low_pfn < end_pfn; low_pfn++) {
> - struct page *page;
> -
> /* give a chance to irqs before checking need_resched() */
> - if (!((low_pfn+1) % SWAP_CLUSTER_MAX)) {
> - spin_unlock_irqrestore(&zone->lru_lock, flags);
> - locked = false;
> + if (locked && !((low_pfn+1) % SWAP_CLUSTER_MAX)) {
> + if (should_release_lock(&zone->lru_lock)) {
> + spin_unlock_irqrestore(&zone->lru_lock, flags);
> + locked = false;
> + }
> }
>
> - /* Check if it is ok to still hold the lock */
> - locked = compact_checklock_irqsave(&zone->lru_lock, &flags,
> - locked, cc);
> - if (!locked)
> - break;
> -
> /*
> * migrate_pfn does not necessarily start aligned to a
> * pageblock. Ensure that pfn_valid is called when moving
> @@ -340,6 +487,14 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
> if (page_zone(page) != zone)
> continue;
>
> + if (!valid_page)
> + valid_page = page;
> +
> + /* If isolation recently failed, do not retry */
> + pageblock_nr = low_pfn >> pageblock_order;
> + if (!isolation_suitable(cc, page))
> + goto next_pageblock;
> +
> /* Skip if free */
> if (PageBuddy(page))
> continue;
> @@ -349,24 +504,43 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
> * migration is optimistic to see if the minimum amount of work
> * satisfies the allocation
> */
> - pageblock_nr = low_pfn >> pageblock_order;
> if (!cc->sync && last_pageblock_nr != pageblock_nr &&
> !migrate_async_suitable(get_pageblock_migratetype(page))) {
> - low_pfn += pageblock_nr_pages;
> - low_pfn = ALIGN(low_pfn, pageblock_nr_pages) - 1;
> - last_pageblock_nr = pageblock_nr;
> - continue;
> + cc->finished_update_migrate = true;
> + goto next_pageblock;
> }
>
> + /* Check may be lockless but that's ok as we recheck later */
> if (!PageLRU(page))
> continue;
>
> /*
> - * PageLRU is set, and lru_lock excludes isolation,
> - * splitting and collapsing (collapsing has already
> - * happened if PageLRU is set).
> + * PageLRU is set. lru_lock normally excludes isolation
> + * splitting and collapsing (collapsing has already happened
> + * if PageLRU is set) but the lock is not necessarily taken
> + * here and it is wasteful to take it just to check transhuge.
> + * Check TransHuge without lock and skip the whole pageblock if
> + * it's either a transhuge or hugetlbfs page, as calling
> + * compound_order() without preventing THP from splitting the
> + * page underneath us may return surprising results.
> */
> if (PageTransHuge(page)) {
> + if (!locked)
> + goto next_pageblock;
> + low_pfn += (1 << compound_order(page)) - 1;
> + continue;
> + }
> +
> + /* Check if it is ok to still hold the lock */
> + locked = compact_checklock_irqsave(&zone->lru_lock, &flags,
> + locked, cc);
> + if (!locked || fatal_signal_pending(current))
> + break;
> +
> + /* Recheck PageLRU and PageTransHuge under lock */
> + if (!PageLRU(page))
> + continue;
> + if (PageTransHuge(page)) {
> low_pfn += (1 << compound_order(page)) - 1;
> continue;
> }
> @@ -383,6 +557,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
> VM_BUG_ON(PageTransCompound(page));
>
> /* Successfully isolated */
> + cc->finished_update_migrate = true;
> del_page_from_lru_list(page, lruvec, page_lru(page));
> list_add(&page->lru, migratelist);
> cc->nr_migratepages++;
> @@ -393,6 +568,13 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
> ++low_pfn;
> break;
> }
> +
> + continue;
> +
> +next_pageblock:
> + low_pfn += pageblock_nr_pages;
> + low_pfn = ALIGN(low_pfn, pageblock_nr_pages) - 1;
> + last_pageblock_nr = pageblock_nr;
> }
>
> acct_isolated(zone, locked, cc);
> @@ -400,6 +582,10 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
> if (locked)
> spin_unlock_irqrestore(&zone->lru_lock, flags);
>
> + /* Update the pageblock-skip if the whole pageblock was scanned */
> + if (low_pfn == end_pfn)
> + update_pageblock_skip(cc, valid_page, nr_isolated, true);
> +
> trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated);
>
> return low_pfn;
> @@ -407,43 +593,6 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
>
> #endif /* CONFIG_COMPACTION || CONFIG_CMA */
> #ifdef CONFIG_COMPACTION
> -
> -/* Returns true if the page is within a block suitable for migration to */
> -static bool suitable_migration_target(struct page *page)
> -{
> -
> - int migratetype = get_pageblock_migratetype(page);
> -
> - /* Don't interfere with memory hot-remove or the min_free_kbytes blocks */
> - if (migratetype == MIGRATE_ISOLATE || migratetype == MIGRATE_RESERVE)
> - return false;
> -
> - /* If the page is a large free page, then allow migration */
> - if (PageBuddy(page) && page_order(page) >= pageblock_order)
> - return true;
> -
> - /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */
> - if (migrate_async_suitable(migratetype))
> - return true;
> -
> - /* Otherwise skip the block */
> - return false;
> -}
> -
> -/*
> - * Returns the start pfn of the last page block in a zone. This is the starting
> - * point for full compaction of a zone. Compaction searches for free pages from
> - * the end of each zone, while isolate_freepages_block scans forward inside each
> - * page block.
> - */
> -static unsigned long start_free_pfn(struct zone *zone)
> -{
> - unsigned long free_pfn;
> - free_pfn = zone->zone_start_pfn + zone->spanned_pages;
> - free_pfn &= ~(pageblock_nr_pages-1);
> - return free_pfn;
> -}
> -
> /*
> * Based on information in the current compact_control, find blocks
> * suitable for isolating free pages from and then isolate them.
> @@ -453,7 +602,6 @@ static void isolate_freepages(struct zone *zone,
> {
> struct page *page;
> unsigned long high_pfn, low_pfn, pfn, zone_end_pfn, end_pfn;
> - unsigned long flags;
> int nr_freepages = cc->nr_freepages;
> struct list_head *freelist = &cc->freepages;
>
> @@ -501,30 +649,16 @@ static void isolate_freepages(struct zone *zone,
> if (!suitable_migration_target(page))
> continue;
>
> - /*
> - * Found a block suitable for isolating free pages from. Now
> - * we disabled interrupts, double check things are ok and
> - * isolate the pages. This is to minimise the time IRQs
> - * are disabled
> - */
> - isolated = 0;
> + /* If isolation recently failed, do not retry */
> + if (!isolation_suitable(cc, page))
> + continue;
>
> - /*
> - * The zone lock must be held to isolate freepages. This
> - * unfortunately this is a very coarse lock and can be
> - * heavily contended if there are parallel allocations
> - * or parallel compactions. For async compaction do not
> - * spin on the lock
> - */
> - if (!compact_trylock_irqsave(&zone->lock, &flags, cc))
> - break;
> - if (suitable_migration_target(page)) {
> - end_pfn = min(pfn + pageblock_nr_pages, zone_end_pfn);
> - isolated = isolate_freepages_block(pfn, end_pfn,
> - freelist, false);
> - nr_freepages += isolated;
> - }
> - spin_unlock_irqrestore(&zone->lock, flags);
> + /* Found a block suitable for isolating free pages from */
> + isolated = 0;
> + end_pfn = min(pfn + pageblock_nr_pages, zone_end_pfn);
> + isolated = isolate_freepages_block(cc, pfn, end_pfn,
> + freelist, false);
> + nr_freepages += isolated;
>
> /*
> * Record the highest PFN we isolated pages from. When next
> @@ -532,17 +666,8 @@ static void isolate_freepages(struct zone *zone,
> * page migration may have returned some pages to the allocator
> */
> if (isolated) {
> + cc->finished_update_free = true;
> high_pfn = max(high_pfn, pfn);
> -
> - /*
> - * If the free scanner has wrapped, update
> - * compact_cached_free_pfn to point to the highest
> - * pageblock with free pages. This reduces excessive
> - * scanning of full pageblocks near the end of the
> - * zone
> - */
> - if (cc->order > 0 && cc->wrapped)
> - zone->compact_cached_free_pfn = high_pfn;
> }
> }
>
> @@ -551,11 +676,6 @@ static void isolate_freepages(struct zone *zone,
>
> cc->free_pfn = high_pfn;
> cc->nr_freepages = nr_freepages;
> -
> - /* If compact_cached_free_pfn is reset then set it now */
> - if (cc->order > 0 && !cc->wrapped &&
> - zone->compact_cached_free_pfn == start_free_pfn(zone))
> - zone->compact_cached_free_pfn = high_pfn;
> }
>
> /*
> @@ -634,7 +754,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
>
> /* Perform the isolation */
> low_pfn = isolate_migratepages_range(zone, cc, low_pfn, end_pfn);
> - if (!low_pfn)
> + if (!low_pfn || cc->contended)
> return ISOLATE_ABORT;
>
> cc->migrate_pfn = low_pfn;
> @@ -651,27 +771,19 @@ static int compact_finished(struct zone *zone,
> if (fatal_signal_pending(current))
> return COMPACT_PARTIAL;
>
> - /*
> - * A full (order == -1) compaction run starts at the beginning and
> - * end of a zone; it completes when the migrate and free scanner meet.
> - * A partial (order > 0) compaction can start with the free scanner
> - * at a random point in the zone, and may have to restart.
> - */
> + /* Compaction run completes if the migrate and free scanner meet */
> if (cc->free_pfn <= cc->migrate_pfn) {
> - if (cc->order > 0 && !cc->wrapped) {
> - /* We started partway through; restart at the end. */
> - unsigned long free_pfn = start_free_pfn(zone);
> - zone->compact_cached_free_pfn = free_pfn;
> - cc->free_pfn = free_pfn;
> - cc->wrapped = 1;
> - return COMPACT_CONTINUE;
> - }
> - return COMPACT_COMPLETE;
> - }
> + /*
> + * Mark that the PG_migrate_skip information should be cleared
> + * by kswapd when it goes to sleep. kswapd does not set the
> + * flag itself as the decision to be clear should be directly
> + * based on an allocation request.
> + */
> + if (!current_is_kswapd())
> + zone->compact_blockskip_flush = true;
>
> - /* We wrapped around and ended up where we started. */
> - if (cc->wrapped && cc->free_pfn <= cc->start_free_pfn)
> return COMPACT_COMPLETE;
> + }
>
> /*
> * order == -1 is expected when compacting via
> @@ -754,6 +866,8 @@ unsigned long compaction_suitable(struct zone *zone, int order)
> static int compact_zone(struct zone *zone, struct compact_control *cc)
> {
> int ret;
> + unsigned long start_pfn = zone->zone_start_pfn;
> + unsigned long end_pfn = zone->zone_start_pfn + zone->spanned_pages;
>
> ret = compaction_suitable(zone, cc->order);
> switch (ret) {
> @@ -766,17 +880,29 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
> ;
> }
>
> - /* Setup to move all movable pages to the end of the zone */
> - cc->migrate_pfn = zone->zone_start_pfn;
> -
> - if (cc->order > 0) {
> - /* Incremental compaction. Start where the last one stopped. */
> - cc->free_pfn = zone->compact_cached_free_pfn;
> - cc->start_free_pfn = cc->free_pfn;
> - } else {
> - /* Order == -1 starts at the end of the zone. */
> - cc->free_pfn = start_free_pfn(zone);
> + /*
> + * Setup to move all movable pages to the end of the zone. Used cached
> + * information on where the scanners should start but check that it
> + * is initialised by ensuring the values are within zone boundaries.
> + */
> + cc->migrate_pfn = zone->compact_cached_migrate_pfn;
> + cc->free_pfn = zone->compact_cached_free_pfn;
> + if (cc->free_pfn < start_pfn || cc->free_pfn > end_pfn) {
> + cc->free_pfn = end_pfn & ~(pageblock_nr_pages-1);
> + zone->compact_cached_free_pfn = cc->free_pfn;
> }
> + if (cc->migrate_pfn < start_pfn || cc->migrate_pfn > end_pfn) {
> + cc->migrate_pfn = start_pfn;
> + zone->compact_cached_migrate_pfn = cc->migrate_pfn;
> + }
> +
> + /*
> + * Clear pageblock skip if there were failures recently and compaction
> + * is about to be retried after being deferred. kswapd does not do
> + * this reset as it'll reset the cached information when going to sleep.
> + */
> + if (compaction_restarting(zone, cc->order) && !current_is_kswapd())
> + __reset_isolation_suitable(zone);
>
> migrate_prep_local();
>
> @@ -787,6 +913,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
> switch (isolate_migratepages(zone, cc)) {
> case ISOLATE_ABORT:
> ret = COMPACT_PARTIAL;
> + putback_lru_pages(&cc->migratepages);
> + cc->nr_migratepages = 0;
> goto out;
> case ISOLATE_NONE:
> continue;
> @@ -831,6 +959,7 @@ static unsigned long compact_zone_order(struct zone *zone,
> int order, gfp_t gfp_mask,
> bool sync, bool *contended)
> {
> + unsigned long ret;
> struct compact_control cc = {
> .nr_freepages = 0,
> .nr_migratepages = 0,
> @@ -838,12 +967,17 @@ static unsigned long compact_zone_order(struct zone *zone,
> .migratetype = allocflags_to_migratetype(gfp_mask),
> .zone = zone,
> .sync = sync,
> - .contended = contended,
> };
> INIT_LIST_HEAD(&cc.freepages);
> INIT_LIST_HEAD(&cc.migratepages);
>
> - return compact_zone(zone, &cc);
> + ret = compact_zone(zone, &cc);
> +
> + VM_BUG_ON(!list_empty(&cc.freepages));
> + VM_BUG_ON(!list_empty(&cc.migratepages));
> +
> + *contended = cc.contended;
> + return ret;
> }
>
> int sysctl_extfrag_threshold = 500;
> @@ -855,6 +989,8 @@ int sysctl_extfrag_threshold = 500;
> * @gfp_mask: The GFP mask of the current allocation
> * @nodemask: The allowed nodes to allocate from
> * @sync: Whether migration is synchronous or not
> + * @contended: Return value that is true if compaction was aborted due to lock contention
> + * @page: Optionally capture a free page of the requested order during compaction
> *
> * This is the main entry point for direct page compaction.
> */
> diff --git a/mm/internal.h b/mm/internal.h
> --- a/mm/internal.h
> +++ b/mm/internal.h
> @@ -118,23 +118,23 @@ struct compact_control {
> unsigned long nr_freepages; /* Number of isolated free pages */
> unsigned long nr_migratepages; /* Number of pages to migrate */
> unsigned long free_pfn; /* isolate_freepages search base */
> - unsigned long start_free_pfn; /* where we started the search */
> unsigned long migrate_pfn; /* isolate_migratepages search base */
> bool sync; /* Synchronous migration */
> - bool wrapped; /* Order > 0 compactions are
> - incremental, once free_pfn
> - and migrate_pfn meet, we restart
> - from the top of the zone;
> - remember we wrapped around. */
> + bool ignore_skip_hint; /* Scan blocks even if marked skip */
> + bool finished_update_free; /* True when the zone cached pfns are
> + * no longer being updated
> + */
> + bool finished_update_migrate;
>
> int order; /* order a direct compactor needs */
> int migratetype; /* MOVABLE, RECLAIMABLE etc */
> struct zone *zone;
> - bool *contended; /* True if a lock was contended */
> + bool contended; /* True if a lock was contended */
> };
>
> unsigned long
> -isolate_freepages_range(unsigned long start_pfn, unsigned long end_pfn);
> +isolate_freepages_range(struct compact_control *cc,
> + unsigned long start_pfn, unsigned long end_pfn);
> unsigned long
> isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
> unsigned long low_pfn, unsigned long end_pfn);
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -2131,6 +2131,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
> alloc_flags & ~ALLOC_NO_WATERMARKS,
> preferred_zone, migratetype);
> if (page) {
> + preferred_zone->compact_blockskip_flush = false;
> preferred_zone->compact_considered = 0;
> preferred_zone->compact_defer_shift = 0;
> if (order >= preferred_zone->compact_order_failed)
> @@ -4438,11 +4439,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
>
> zone->spanned_pages = size;
> zone->present_pages = realsize;
> -#if defined CONFIG_COMPACTION || defined CONFIG_CMA
> - zone->compact_cached_free_pfn = zone->zone_start_pfn +
> - zone->spanned_pages;
> - zone->compact_cached_free_pfn &= ~(pageblock_nr_pages-1);
> -#endif
> #ifdef CONFIG_NUMA
> zone->node = nid;
> zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio)
> @@ -5632,7 +5628,8 @@ __alloc_contig_migrate_alloc(struct page *page, unsigned long private,
> }
>
> /* [start, end) must belong to a single zone. */
> -static int __alloc_contig_migrate_range(unsigned long start, unsigned long end)
> +static int __alloc_contig_migrate_range(struct compact_control *cc,
> + unsigned long start, unsigned long end)
> {
> /* This function is based on compact_zone() from compaction.c. */
>
> @@ -5640,25 +5637,17 @@ static int __alloc_contig_migrate_range(unsigned long start, unsigned long end)
> unsigned int tries = 0;
> int ret = 0;
>
> - struct compact_control cc = {
> - .nr_migratepages = 0,
> - .order = -1,
> - .zone = page_zone(pfn_to_page(start)),
> - .sync = true,
> - };
> - INIT_LIST_HEAD(&cc.migratepages);
> -
> migrate_prep_local();
>
> - while (pfn < end || !list_empty(&cc.migratepages)) {
> + while (pfn < end || !list_empty(&cc->migratepages)) {
> if (fatal_signal_pending(current)) {
> ret = -EINTR;
> break;
> }
>
> - if (list_empty(&cc.migratepages)) {
> - cc.nr_migratepages = 0;
> - pfn = isolate_migratepages_range(cc.zone, &cc,
> + if (list_empty(&cc->migratepages)) {
> + cc->nr_migratepages = 0;
> + pfn = isolate_migratepages_range(cc->zone, cc,
> pfn, end);
> if (!pfn) {
> ret = -EINTR;
> @@ -5670,12 +5659,12 @@ static int __alloc_contig_migrate_range(unsigned long start, unsigned long end)
> break;
> }
>
> - ret = migrate_pages(&cc.migratepages,
> + ret = migrate_pages(&cc->migratepages,
> __alloc_contig_migrate_alloc,
> 0, false, MIGRATE_SYNC);
> }
>
> - putback_lru_pages(&cc.migratepages);
> + putback_lru_pages(&cc->migratepages);
> return ret > 0 ? 0 : ret;
> }
>
> @@ -5754,6 +5743,15 @@ int alloc_contig_range(unsigned long start, unsigned long end,
> unsigned long outer_start, outer_end;
> int ret = 0, order;
>
> + struct compact_control cc = {
> + .nr_migratepages = 0,
> + .order = -1,
> + .zone = page_zone(pfn_to_page(start)),
> + .sync = true,
> + .ignore_skip_hint = true,
> + };
> + INIT_LIST_HEAD(&cc.migratepages);
> +
> /*
> * What we do here is we mark all pageblocks in range as
> * MIGRATE_ISOLATE. Because pageblock and max order pages may
> @@ -5783,7 +5781,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
> if (ret)
> goto done;
>
> - ret = __alloc_contig_migrate_range(start, end);
> + ret = __alloc_contig_migrate_range(&cc, start, end);
> if (ret)
> goto done;
>
> @@ -5832,7 +5830,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
> __reclaim_pages(zone, GFP_HIGHUSER_MOVABLE, end-start);
>
> /* Grab isolated pages from freelists. */
> - outer_end = isolate_freepages_range(outer_start, end);
> + outer_end = isolate_freepages_range(&cc, outer_start, end);
> if (!outer_end) {
> ret = -EBUSY;
> goto done;
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -2839,6 +2839,14 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
> */
> set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);
>
> + /*
> + * Compaction records what page blocks it recently failed to
> + * isolate pages from and skips them in the future scanning.
> + * When kswapd is going to sleep, it is reasonable to assume
> + * that pages and compaction may succeed so reset the cache.
> + */
> + reset_isolation_suitable(pgdat);
> +
> if (!kthread_should_stop())
> schedule();
>
>

--
Marc Duponcheel
Velodroomstraat 74 - 2600 Berchem - Belgium
+32 (0)478 68.10.91 - marc@xxxxxxxxxx