[patch 18/21] per-zone LRU locking

From: Andrew Morton (akpm@zip.com.au)
Date: Sun Aug 11 2002 - 02:40:08 EST


Now that the LRUs are per-zone, make their lock per-zone as well.

In this patch the per-zone lock shares a cacheline with the zone's
buddy list lock, which is very bad. Some groundwork is needed to fix
this well.
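
To make the cacheline problem concrete: the buddy allocator's zone->lock and
the new zone->lru_lock end up adjacent in `struct zone', so page allocation
and page reclaim on different CPUs bounce the same cacheline. The eventual
fix is not part of this patch; a rough sketch of what it might look like (the
____cacheline_aligned placement and the field grouping here are illustrative
only, not a proposed layout):

        struct zone {
                /* Fields used by the buddy allocator. */
                spinlock_t lock;
                unsigned long free_pages;

                /*
                 * Fields used by page reclaim, pushed onto their own
                 * cacheline (____cacheline_aligned is from <linux/cache.h>).
                 */
                spinlock_t lru_lock ____cacheline_aligned;
                struct list_head active_list;
                struct list_head inactive_list;

                /* ... remaining fields as before ... */
        };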

This change is expected to be a significant win on NUMA, where most
page allocation comes from the local node's zones.

For NUMA, the `struct zone' itself should really be placed in that
node's memory, which is something the platform owners should look at.
However, the internode cache will help here.

Per-node kswapd would make heaps of sense too.
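
All of the mm/swap.c pagevec helpers below follow the same idiom: walk the
vector and only drop/retake the LRU lock when the current page's zone differs
from the previous one, so a run of pages from the same zone is processed under
a single lock acquisition. Roughly (a composite sketch of that pattern, with a
hypothetical helper name, not a verbatim copy of any one function):

        static void pagevec_lru_op(struct pagevec *pvec)   /* hypothetical */
        {
                struct zone *zone = NULL;
                int i;

                for (i = 0; i < pagevec_count(pvec); i++) {
                        struct page *page = pvec->pages[i];
                        struct zone *pagezone = page_zone(page);

                        /* Switch locks only when crossing into another zone */
                        if (pagezone != zone) {
                                if (zone)
                                        spin_unlock_irq(&zone->lru_lock);
                                zone = pagezone;
                                spin_lock_irq(&zone->lru_lock);
                        }
                        /* ... touch the page's LRU state under zone->lru_lock ... */
                }
                if (zone)
                        spin_unlock_irq(&zone->lru_lock);
        }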

 include/linux/mm.h         |    2
 include/linux/mmzone.h     |    1
 include/linux/page-flags.h |    2
 include/linux/swap.h       |    2
 mm/filemap.c               |    3 -
 mm/memory.c                |    2
 mm/page_alloc.c            |    3 -
 mm/rmap.c                  |    6 +-
 mm/swap.c                  |   99 +++++++++++++++++++++++--------------------
 mm/vmscan.c                |   38 ++++++++---------
 10 files changed, 86 insertions(+), 72 deletions(-)

--- 2.5.31/mm/filemap.c~per-zone-lock Sun Aug 11 00:20:35 2002
+++ 2.5.31-akpm/mm/filemap.c Sun Aug 11 00:20:57 2002
@@ -59,7 +59,7 @@
  * ->swap_device_lock (exclusive_swap_page, others)
  * ->rmap_lock (to/from swapcache)
  * ->mapping->page_lock
- * ->pagemap_lru_lock (zap_pte_range)
+ * ->zone->lru_lock (zap_pte_range)
  * ->inode_lock (__mark_inode_dirty)
  * ->sb_lock (fs/fs-writeback.c)
  *
@@ -68,7 +68,6 @@
  * ->mapping->page_lock (try_to_unmap_one)
  *
  */
-spinlock_t _pagemap_lru_lock __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED;
 
 /*
  * Remove a page from the page cache and free it. Caller has to make
--- 2.5.31/mm/memory.c~per-zone-lock Sun Aug 11 00:20:35 2002
+++ 2.5.31-akpm/mm/memory.c Sun Aug 11 00:20:35 2002
@@ -405,7 +405,7 @@ static void zap_pte_range(mmu_gather_t *
                         cached_rmap_lock(page, &rmap_lock, &last_lockno);
                         __page_remove_rmap(page, ptep);
                         /*
-                         * This will take pagemap_lru_lock. Which nests inside
+                         * This will take zone->lru_lock. Which nests inside
                          * rmap_lock
                          */
                         tlb_remove_page(tlb, &pvec, page);
--- 2.5.31/mm/rmap.c~per-zone-lock Sun Aug 11 00:20:35 2002
+++ 2.5.31-akpm/mm/rmap.c Sun Aug 11 00:20:44 2002
@@ -14,7 +14,7 @@
 /*
  * Locking:
  * - the page->pte.chain is protected by the PG_chainlock bit,
- * which nests within the pagemap_lru_lock, then the
+ * which nests within the zone->lru_lock, then the
  * mm->page_table_lock, and then the page lock.
  * - because swapout locking is opposite to the locking order
  * in the page fault path, the swapout path uses trylocks
@@ -317,7 +317,7 @@ void page_remove_rmap(struct page *page,
  * table entry mapping a page. Because locking order here is opposite
  * to the locking order used by the page fault path, we use trylocks.
  * Locking:
- * pagemap_lru_lock           page_launder()
+ * zone->lru_lock             page_launder()
  * page lock                  page_launder(), trylock
  * rmap_lock                  page_launder()
  * mm->page_table_lock        try_to_unmap_one(), trylock
@@ -385,7 +385,7 @@ out_unlock:
  * @page: the page to get unmapped
  *
  * Tries to remove all the page table entries which are mapping this
- * page, used in the pageout path. Caller must hold pagemap_lru_lock
+ * page, used in the pageout path. Caller must hold zone->lru_lock
  * and the page lock. Return values are:
  *
  * SWAP_SUCCESS - we succeeded in removing all mappings
--- 2.5.31/mm/swap.c~per-zone-lock Sun Aug 11 00:20:35 2002
+++ 2.5.31-akpm/mm/swap.c Sun Aug 11 00:20:35 2002
@@ -26,26 +26,20 @@
 int page_cluster;
 
 /*
- * Move an inactive page to the active list.
+ * FIXME: speed this up?
  */
-static inline void activate_page_nolock(struct page * page)
+void activate_page(struct page *page)
 {
+        struct zone *zone = page_zone(page);
+
+        spin_lock_irq(&zone->lru_lock);
         if (PageLRU(page) && !PageActive(page)) {
-                del_page_from_inactive_list(page);
+                del_page_from_inactive_list(zone, page);
                 SetPageActive(page);
-                add_page_to_active_list(page);
+                add_page_to_active_list(zone, page);
                 KERNEL_STAT_INC(pgactivate);
         }
-}
-
-/*
- * FIXME: speed this up?
- */
-void activate_page(struct page * page)
-{
-        spin_lock_irq(&_pagemap_lru_lock);
-        activate_page_nolock(page);
-        spin_unlock_irq(&_pagemap_lru_lock);
+        spin_unlock_irq(&zone->lru_lock);
 }
 
 /**
@@ -80,13 +74,14 @@ void __page_cache_release(struct page *p
 {
         BUG_ON(page_count(page) != 0);
         if (PageLRU(page)) {
+                struct zone *zone = page_zone(page);
                 unsigned long flags;

-                spin_lock_irqsave(&_pagemap_lru_lock, flags);
+                spin_lock_irqsave(&zone->lru_lock, flags);
                 if (!TestClearPageLRU(page))
                         BUG();
-                del_page_from_lru(page);
-                spin_unlock_irqrestore(&_pagemap_lru_lock, flags);
+                del_page_from_lru(zone, page);
+                spin_unlock_irqrestore(&zone->lru_lock, flags);
         }
         __free_page(page);
 }
@@ -96,7 +91,7 @@ void __page_cache_release(struct page *p
  * pagevec's pages. If it fell to zero then remove the page from the LRU and
  * free it.
  *
- * Avoid taking pagemap_lru_lock if possible, but if it is taken, retain it
+ * Avoid taking zone->lru_lock if possible, but if it is taken, retain it
  * for the remainder of the operation.
  *
  * The locking in this function is against shrink_cache(): we recheck the
@@ -108,28 +103,31 @@ void __page_cache_release(struct page *p
 void __pagevec_release(struct pagevec *pvec)
 {
         int i;
-        int lock_held = 0;
         struct pagevec pages_to_free;
+        struct zone *zone = NULL;
 
         pagevec_init(&pages_to_free);
         for (i = 0; i < pagevec_count(pvec); i++) {
                 struct page *page = pvec->pages[i];
+                struct zone *pagezone;
 
                 if (!put_page_testzero(page))
                         continue;
 
-                if (!lock_held) {
-                        spin_lock_irq(&_pagemap_lru_lock);
-                        lock_held = 1;
+                pagezone = page_zone(page);
+                if (pagezone != zone) {
+                        if (zone)
+                                spin_unlock_irq(&zone->lru_lock);
+                        zone = pagezone;
+                        spin_lock_irq(&zone->lru_lock);
                 }
-
                 if (TestClearPageLRU(page))
-                        del_page_from_lru(page);
+                        del_page_from_lru(zone, page);
                 if (page_count(page) == 0)
                         pagevec_add(&pages_to_free, page);
         }
-        if (lock_held)
-                spin_unlock_irq(&_pagemap_lru_lock);
+        if (zone)
+                spin_unlock_irq(&zone->lru_lock);
 
         pagevec_free(&pages_to_free);
         pagevec_init(pvec);
@@ -164,26 +162,27 @@ void __pagevec_release_nonlru(struct pag
 void pagevec_deactivate_inactive(struct pagevec *pvec)
 {
         int i;
-        int lock_held = 0;
+        struct zone *zone = NULL;
 
         if (pagevec_count(pvec) == 0)
                 return;
         for (i = 0; i < pagevec_count(pvec); i++) {
                 struct page *page = pvec->pages[i];
+                struct zone *pagezone = page_zone(page);
 
-                if (!lock_held) {
+                if (pagezone != zone) {
                         if (PageActive(page) || !PageLRU(page))
                                 continue;
-                        spin_lock_irq(&_pagemap_lru_lock);
-                        lock_held = 1;
-                }
-                if (!PageActive(page) && PageLRU(page)) {
-                        struct zone *zone = page_zone(page);
-                        list_move(&page->lru, &zone->inactive_list);
+                        if (zone)
+                                spin_unlock_irq(&zone->lru_lock);
+                        zone = pagezone;
+                        spin_lock_irq(&zone->lru_lock);
                 }
+                if (!PageActive(page) && PageLRU(page))
+                        list_move(&page->lru, &pagezone->inactive_list);
         }
-        if (lock_held)
-                spin_unlock_irq(&_pagemap_lru_lock);
+        if (zone)
+                spin_unlock_irq(&zone->lru_lock);
         __pagevec_release(pvec);
 }
 
@@ -194,16 +193,24 @@ void pagevec_deactivate_inactive(struct
 void __pagevec_lru_add(struct pagevec *pvec)
 {
         int i;
+        struct zone *zone = NULL;

-        spin_lock_irq(&_pagemap_lru_lock);
         for (i = 0; i < pagevec_count(pvec); i++) {
                 struct page *page = pvec->pages[i];
+                struct zone *pagezone = page_zone(page);
 
+                if (pagezone != zone) {
+                        if (zone)
+                                spin_unlock_irq(&zone->lru_lock);
+                        zone = pagezone;
+                        spin_lock_irq(&zone->lru_lock);
+                }
                 if (TestSetPageLRU(page))
                         BUG();
-                add_page_to_inactive_list(page);
+                add_page_to_inactive_list(zone, page);
         }
-        spin_unlock_irq(&_pagemap_lru_lock);
+        if (zone)
+                spin_unlock_irq(&zone->lru_lock);
         pagevec_release(pvec);
 }
 
@@ -214,16 +221,24 @@ void __pagevec_lru_add(struct pagevec *p
 void __pagevec_lru_del(struct pagevec *pvec)
 {
         int i;
+        struct zone *zone = NULL;

-        spin_lock_irq(&_pagemap_lru_lock);
         for (i = 0; i < pagevec_count(pvec); i++) {
                 struct page *page = pvec->pages[i];
+                struct zone *pagezone = page_zone(page);
 
+                if (pagezone != zone) {
+                        if (zone)
+                                spin_unlock_irq(&zone->lru_lock);
+                        zone = pagezone;
+                        spin_lock_irq(&zone->lru_lock);
+                }
                 if (!TestClearPageLRU(page))
                         BUG();
-                del_page_from_lru(page);
+                del_page_from_lru(zone, page);
         }
-        spin_unlock_irq(&_pagemap_lru_lock);
+        if (zone)
+                spin_unlock_irq(&zone->lru_lock);
         pagevec_release(pvec);
 }
 
--- 2.5.31/mm/vmscan.c~per-zone-lock Sun Aug 11 00:20:35 2002
+++ 2.5.31-akpm/mm/vmscan.c Sun Aug 11 00:20:35 2002
@@ -264,7 +264,7 @@ keep:
 }
 
 /*
- * pagemap_lru_lock is heavily contended. We relieve it by quickly privatising
+ * zone->lru_lock is heavily contended. We relieve it by quickly privatising
  * a batch of pages and working on them outside the lock. Any pages which were
  * not freed will be added back to the LRU.
  *
@@ -292,7 +292,7 @@ shrink_cache(int nr_pages, struct zone *
         pagevec_init(&pvec);
 
         lru_add_drain();
-        spin_lock_irq(&_pagemap_lru_lock);
+        spin_lock_irq(&zone->lru_lock);
         while (max_scan > 0 && nr_pages > 0) {
                 struct page *page;
                 int n = 0;
@@ -318,7 +318,7 @@ shrink_cache(int nr_pages, struct zone *
                         n++;
                 }
                 zone->nr_inactive -= n;
-                spin_unlock_irq(&_pagemap_lru_lock);
+                spin_unlock_irq(&zone->lru_lock);
 
                 if (list_empty(&page_list))
                         goto done;
@@ -331,7 +331,7 @@ shrink_cache(int nr_pages, struct zone *
                 if (nr_pages <= 0 && list_empty(&page_list))
                         goto done;
 
-                spin_lock_irq(&_pagemap_lru_lock);
+                spin_lock_irq(&zone->lru_lock);
                 /*
                  * Put back any unfreeable pages.
                  */
@@ -341,17 +341,17 @@ shrink_cache(int nr_pages, struct zone *
                                 BUG();
                         list_del(&page->lru);
                         if (PageActive(page))
-                                add_page_to_active_list(page);
+                                add_page_to_active_list(zone, page);
                         else
-                                add_page_to_inactive_list(page);
+                                add_page_to_inactive_list(zone, page);
                         if (!pagevec_add(&pvec, page)) {
-                                spin_unlock_irq(&_pagemap_lru_lock);
+                                spin_unlock_irq(&zone->lru_lock);
                                 __pagevec_release(&pvec);
-                                spin_lock_irq(&_pagemap_lru_lock);
+                                spin_lock_irq(&zone->lru_lock);
                         }
                 }
           }
-        spin_unlock_irq(&_pagemap_lru_lock);
+        spin_unlock_irq(&zone->lru_lock);
 done:
         pagevec_release(&pvec);
         return nr_pages;
@@ -364,9 +364,9 @@ done:
  * processes, from rmap.
  *
  * If the pages are mostly unmapped, the processing is fast and it is
- * appropriate to hold pagemap_lru_lock across the whole operation. But if
+ * appropriate to hold zone->lru_lock across the whole operation. But if
  * the pages are mapped, the processing is slow (page_referenced()) so we
- * should drop pagemap_lru_lock around each page. It's impossible to balance
+ * should drop zone->lru_lock around each page. It's impossible to balance
  * this, so instead we remove the pages from the LRU while processing them.
  * It is safe to rely on PG_active against the non-LRU pages in here because
  * nobody will play with that bit on a non-LRU page.
@@ -388,7 +388,7 @@ refill_inactive_zone(struct zone *zone,
         unsigned last_lockno = -1;
 
         lru_add_drain();
-        spin_lock_irq(&_pagemap_lru_lock);
+        spin_lock_irq(&zone->lru_lock);
         while (nr_pages && !list_empty(&zone->active_list)) {
                 page = list_entry(zone->active_list.prev, struct page, lru);
                 prefetchw_prev_lru_page(page, &zone->active_list, flags);
@@ -398,7 +398,7 @@ refill_inactive_zone(struct zone *zone,
                 list_move(&page->lru, &l_hold);
                 nr_pages--;
         }
-        spin_unlock_irq(&_pagemap_lru_lock);
+        spin_unlock_irq(&zone->lru_lock);
 
         while (!list_empty(&l_hold)) {
                 page = list_entry(l_hold.prev, struct page, lru);
@@ -416,7 +416,7 @@ refill_inactive_zone(struct zone *zone,
         drop_rmap_lock(&rmap_lock, &last_lockno);
 
         pagevec_init(&pvec);
-        spin_lock_irq(&_pagemap_lru_lock);
+        spin_lock_irq(&zone->lru_lock);
         while (!list_empty(&l_inactive)) {
                 page = list_entry(l_inactive.prev, struct page, lru);
                 prefetchw_prev_lru_page(page, &l_inactive, flags);
@@ -426,9 +426,9 @@ refill_inactive_zone(struct zone *zone,
                         BUG();
                 list_move(&page->lru, &zone->inactive_list);
                 if (!pagevec_add(&pvec, page)) {
-                        spin_unlock_irq(&_pagemap_lru_lock);
+                        spin_unlock_irq(&zone->lru_lock);
                         __pagevec_release(&pvec);
-                        spin_lock_irq(&_pagemap_lru_lock);
+                        spin_lock_irq(&zone->lru_lock);
                 }
         }
         while (!list_empty(&l_active)) {
@@ -439,14 +439,14 @@ refill_inactive_zone(struct zone *zone,
                 BUG_ON(!PageActive(page));
                 list_move(&page->lru, &zone->active_list);
                 if (!pagevec_add(&pvec, page)) {
-                        spin_unlock_irq(&_pagemap_lru_lock);
+                        spin_unlock_irq(&zone->lru_lock);
                         __pagevec_release(&pvec);
-                        spin_lock_irq(&_pagemap_lru_lock);
+                        spin_lock_irq(&zone->lru_lock);
                 }
         }
         zone->nr_active -= pgdeactivate;
         zone->nr_inactive += pgdeactivate;
-        spin_unlock_irq(&_pagemap_lru_lock);
+        spin_unlock_irq(&zone->lru_lock);
         pagevec_release(&pvec);
 
         KERNEL_STAT_ADD(pgscan, nr_pages_in - nr_pages);
--- 2.5.31/include/linux/mm.h~per-zone-lock Sun Aug 11 00:20:35 2002
+++ 2.5.31-akpm/include/linux/mm.h Sun Aug 11 00:20:50 2002
@@ -157,7 +157,7 @@ struct page {
         struct address_space *mapping; /* The inode (or ...) we belong to. */
         unsigned long index; /* Our offset within mapping. */
         struct list_head lru; /* Pageout list, eg. active_list;
-                                 protected by pagemap_lru_lock !! */
+                                 protected by zone->lru_lock !! */
         union {
                 struct pte_chain * chain; /* Reverse pte mapping pointer.
                                          * protected by PG_chainlock */
--- 2.5.31/include/linux/page-flags.h~per-zone-lock Sun Aug 11 00:20:35 2002
+++ 2.5.31-akpm/include/linux/page-flags.h Sun Aug 11 00:20:35 2002
@@ -28,7 +28,7 @@
  *
  * Note that the referenced bit, the page->lru list_head and the active,
  * inactive_dirty and inactive_clean lists are protected by the
- * pagemap_lru_lock, and *NOT* by the usual PG_locked bit!
+ * zone->lru_lock, and *NOT* by the usual PG_locked bit!
  *
  * PG_error is set to indicate that an I/O error occurred on this page.
  *
--- 2.5.31/include/linux/swap.h~per-zone-lock Sun Aug 11 00:20:35 2002
+++ 2.5.31-akpm/include/linux/swap.h Sun Aug 11 00:20:35 2002
@@ -212,8 +212,6 @@ extern struct swap_list_t swap_list;
 asmlinkage long sys_swapoff(const char *);
 asmlinkage long sys_swapon(const char *, int);
 
-extern spinlock_t _pagemap_lru_lock;
-
 extern void FASTCALL(mark_page_accessed(struct page *));
 
 extern spinlock_t swaplock;
--- 2.5.31/mm/page_alloc.c~per-zone-lock Sun Aug 11 00:20:35 2002
+++ 2.5.31-akpm/mm/page_alloc.c Sun Aug 11 00:20:50 2002
@@ -829,7 +829,8 @@ void __init free_area_init_core(int nid,
                 printk("zone(%lu): %lu pages.\n", j, size);
                 zone->size = size;
                 zone->name = zone_names[j];
-                zone->lock = SPIN_LOCK_UNLOCKED;
+                spin_lock_init(&zone->lock);
+                spin_lock_init(&zone->lru_lock);
                 zone->zone_pgdat = pgdat;
                 zone->free_pages = 0;
                 zone->need_balance = 0;
--- 2.5.31/include/linux/mmzone.h~per-zone-lock Sun Aug 11 00:20:35 2002
+++ 2.5.31-akpm/include/linux/mmzone.h Sun Aug 11 00:20:58 2002
@@ -44,6 +44,7 @@ struct zone {
         unsigned long pages_min, pages_low, pages_high;
         int need_balance;
 
+        spinlock_t lru_lock;
         struct list_head active_list;
         struct list_head inactive_list;
         atomic_t refill_counter;

.