[PATCH 7/9] mm: split page table lock

From: Hugh Dickins
Date: Sat Oct 22 2005 - 11:30:35 EST


Christoph Lameter demonstrated very poor scalability on the SGI 512-way,
with a many-threaded application which concurrently initializes different
parts of a large anonymous area.

This patch corrects that by using a separate spinlock per page table
page, to guard the page table entries in that page, instead of using
the mm's single page_table_lock. (But even then, page_table_lock is
still used to guard page table allocation, and anon_vma allocation.)

In this implementation, the spinlock is tucked inside the struct page of
the page table page, with a BUILD_BUG_ON in case it overflows - which it
would on 32-bit PA-RISC with spinlock debugging enabled.
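To make that concrete - a sketch, assuming the current ordering of the
struct page fields (flags, _count, _mapcount, private, mapping, ...),
of what __pte_lockptr() below evaluates to, given the page table's
struct page *page:

	/* The lock starts at page->private; with DEBUG_SPINLOCK it can
	 * grow over the neighbouring page->mapping - which is why
	 * pte_lock_deinit() resets ->mapping before the page is freed
	 * (free_pages_check would otherwise complain), and why the
	 * BUILD_BUG_ON in pte_lock_init() checks sizeof(struct page).
	 */
	spinlock_t *ptl = (spinlock_t *)&page->private;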

Splitting the lock is not quite for free: another cacheline access.
Ideally, I suppose we would use split ptlock only for multi-threaded
processes on multi-cpu machines; but deciding that dynamically would
have its own costs. So for now enable it by config, at some number
of cpus - since the Kconfig language doesn't support inequalities, let
the preprocessor compare that with NR_CPUS. But I don't think it's worth
being user-configurable: for good testing of both split and unsplit
configs, split now at 4 cpus, and perhaps change that to 8 later.
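Condensed from the mm/Kconfig and mm.h hunks below, the arrangement is
simply this (Kconfig supplies the number, the preprocessor compares):

	config SPLIT_PTLOCK_CPUS
		int
		default "4"

and in linux/mm.h:

	#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
	/* split: spinlock per page table page, in its struct page */
	#else
	/* unsplit: mm->page_table_lock guards all page table pages */
	#endif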

There is a benefit even for singly threaded processes: kswapd can be
attacking one part of the mm while another part is busy faulting.

Signed-off-by: Hugh Dickins <hugh@xxxxxxxxxxx>
---

 arch/arm/mm/mm-armv.c     |    1 +
 arch/um/kernel/skas/mmu.c |    1 +
 include/linux/mm.h        |   26 +++++++++++++++++++++++++-
 mm/Kconfig                |   13 +++++++++++++
 mm/memory.c               |   24 ++++++++++++++----------
 mm/mremap.c               |   11 ++++++++++-
 mm/rmap.c                 |    2 +-
 7 files changed, 65 insertions(+), 13 deletions(-)

--- mm6/arch/arm/mm/mm-armv.c 2005-10-17 12:05:08.000000000 +0100
+++ mm7/arch/arm/mm/mm-armv.c 2005-10-22 14:07:25.000000000 +0100
@@ -229,6 +229,7 @@ void free_pgd_slow(pgd_t *pgd)
pte = pmd_page(*pmd);
pmd_clear(pmd);
dec_page_state(nr_page_table_pages);
+ pte_lock_deinit(pte);
pte_free(pte);
pmd_free(pmd);
free:
--- mm6/arch/um/kernel/skas/mmu.c 2005-10-17 12:05:14.000000000 +0100
+++ mm7/arch/um/kernel/skas/mmu.c 2005-10-22 14:07:25.000000000 +0100
@@ -144,6 +144,7 @@ void destroy_context_skas(struct mm_stru

if(!proc_mm || !ptrace_faultinfo){
free_page(mmu->id.stack);
+ pte_lock_deinit(virt_to_page(mmu->last_page_table));
pte_free_kernel((pte_t *) mmu->last_page_table);
dec_page_state(nr_page_table_pages);
#ifdef CONFIG_3_LEVEL_PGTABLES
--- mm6/include/linux/mm.h 2005-10-17 12:05:38.000000000 +0100
+++ mm7/include/linux/mm.h 2005-10-22 14:07:25.000000000 +0100
@@ -778,9 +778,33 @@ static inline pmd_t *pmd_alloc(struct mm
}
#endif /* CONFIG_MMU && !__ARCH_HAS_4LEVEL_HACK */

+#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
+/*
+ * We tuck a spinlock to guard each pagetable page into its struct page,
+ * at page->private, with BUILD_BUG_ON to make sure that this will not
+ * overflow into the next struct page (as it might with DEBUG_SPINLOCK).
+ * When freeing, reset page->mapping so free_pages_check won't complain.
+ */
+#define __pte_lockptr(page) ((spinlock_t *)&((page)->private))
+#define pte_lock_init(_page) do { \
+ BUILD_BUG_ON((size_t)(__pte_lockptr((struct page *)0) + 1) > \
+ sizeof(struct page)); \
+ spin_lock_init(__pte_lockptr(_page)); \
+} while (0)
+#define pte_lock_deinit(page) ((page)->mapping = NULL)
+#define pte_lockptr(mm, pmd) ({(void)(mm); __pte_lockptr(pmd_page(*(pmd)));})
+#else
+/*
+ * We use mm->page_table_lock to guard all pagetable pages of the mm.
+ */
+#define pte_lock_init(page) do {} while (0)
+#define pte_lock_deinit(page) do {} while (0)
+#define pte_lockptr(mm, pmd) ({(void)(pmd); &(mm)->page_table_lock;})
+#endif /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */
+
#define pte_offset_map_lock(mm, pmd, address, ptlp) \
({ \
- spinlock_t *__ptl = &(mm)->page_table_lock; \
+ spinlock_t *__ptl = pte_lockptr(mm, pmd); \
pte_t *__pte = pte_offset_map(pmd, address); \
*(ptlp) = __ptl; \
spin_lock(__ptl); \
--- mm6/mm/Kconfig 2005-10-17 12:05:40.000000000 +0100
+++ mm7/mm/Kconfig 2005-10-22 14:07:25.000000000 +0100
@@ -119,3 +119,16 @@ config MEMORY_HOTPLUG

comment "Memory hotplug is currently incompatible with Software Suspend"
depends on SPARSEMEM && HOTPLUG && SOFTWARE_SUSPEND
+
+# Heavily threaded applications may benefit from splitting the mm-wide
+# page_table_lock, so that faults on different parts of the user address
+# space can be handled with less contention: split it at this NR_CPUS.
+# Default to 4 for wider testing, though 8 might be more appropriate.
+# ARM's adjust_pte (unused if VIPT) depends on mm-wide page_table_lock.
+# PA-RISC's debug spinlock_t is too large for the 32-bit struct page.
+#
+config SPLIT_PTLOCK_CPUS
+ int
+ default "4096" if ARM && !CPU_CACHE_VIPT
+ default "4096" if PARISC && DEBUG_SPINLOCK && !64BIT
+ default "4"
--- mm6/mm/memory.c 2005-10-17 12:05:40.000000000 +0100
+++ mm7/mm/memory.c 2005-10-22 14:07:25.000000000 +0100
@@ -114,6 +114,7 @@ static void free_pte_range(struct mmu_ga
{
struct page *page = pmd_page(*pmd);
pmd_clear(pmd);
+ pte_lock_deinit(page);
pte_free_tlb(tlb, page);
dec_page_state(nr_page_table_pages);
tlb->mm->nr_ptes--;
@@ -294,10 +295,12 @@ int __pte_alloc(struct mm_struct *mm, pm
if (!new)
return -ENOMEM;

+ pte_lock_init(new);
spin_lock(&mm->page_table_lock);
- if (pmd_present(*pmd)) /* Another has populated it */
+ if (pmd_present(*pmd)) { /* Another has populated it */
+ pte_lock_deinit(new);
pte_free(new);
- else {
+ } else {
mm->nr_ptes++;
inc_page_state(nr_page_table_pages);
pmd_populate(mm, pmd, new);
@@ -432,7 +435,7 @@ again:
if (!dst_pte)
return -ENOMEM;
src_pte = pte_offset_map_nested(src_pmd, addr);
- src_ptl = &src_mm->page_table_lock;
+ src_ptl = pte_lockptr(src_mm, src_pmd);
spin_lock(src_ptl);

do {
@@ -1194,15 +1197,16 @@ EXPORT_SYMBOL(remap_pfn_range);
* (but do_wp_page is only called after already making such a check;
* and do_anonymous_page and do_no_page can safely check later on).
*/
-static inline int pte_unmap_same(struct mm_struct *mm,
+static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
pte_t *page_table, pte_t orig_pte)
{
int same = 1;
#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
if (sizeof(pte_t) > sizeof(unsigned long)) {
- spin_lock(&mm->page_table_lock);
+ spinlock_t *ptl = pte_lockptr(mm, pmd);
+ spin_lock(ptl);
same = pte_same(*page_table, orig_pte);
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
}
#endif
pte_unmap(page_table);
@@ -1655,7 +1659,7 @@ static int do_swap_page(struct mm_struct
pte_t pte;
int ret = VM_FAULT_MINOR;

- if (!pte_unmap_same(mm, page_table, orig_pte))
+ if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
goto out;

entry = pte_to_swp_entry(orig_pte);
@@ -1773,7 +1777,7 @@ static int do_anonymous_page(struct mm_s
page_cache_get(page);
entry = mk_pte(page, vma->vm_page_prot);

- ptl = &mm->page_table_lock;
+ ptl = pte_lockptr(mm, pmd);
spin_lock(ptl);
if (!pte_none(*page_table))
goto release;
@@ -1934,7 +1938,7 @@ static int do_file_page(struct mm_struct
pgoff_t pgoff;
int err;

- if (!pte_unmap_same(mm, page_table, orig_pte))
+ if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
return VM_FAULT_MINOR;

if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) {
@@ -1992,7 +1996,7 @@ static inline int handle_pte_fault(struc
pte, pmd, write_access, entry);
}

- ptl = &mm->page_table_lock;
+ ptl = pte_lockptr(mm, pmd);
spin_lock(ptl);
if (unlikely(!pte_same(*pte, entry)))
goto unlock;
--- mm6/mm/mremap.c 2005-10-17 12:05:40.000000000 +0100
+++ mm7/mm/mremap.c 2005-10-22 14:07:25.000000000 +0100
@@ -72,7 +72,7 @@ static void move_ptes(struct vm_area_str
struct address_space *mapping = NULL;
struct mm_struct *mm = vma->vm_mm;
pte_t *old_pte, *new_pte, pte;
- spinlock_t *old_ptl;
+ spinlock_t *old_ptl, *new_ptl;

if (vma->vm_file) {
/*
@@ -88,8 +88,15 @@ static void move_ptes(struct vm_area_str
new_vma->vm_truncate_count = 0;
}

+ /*
+ * We don't have to worry about the ordering of src and dst
+ * pte locks because exclusive mmap_sem prevents deadlock.
+ */
old_pte = pte_offset_map_lock(mm, old_pmd, old_addr, &old_ptl);
new_pte = pte_offset_map_nested(new_pmd, new_addr);
+ new_ptl = pte_lockptr(mm, new_pmd);
+ if (new_ptl != old_ptl)
+ spin_lock(new_ptl);

for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE,
new_pte++, new_addr += PAGE_SIZE) {
@@ -101,6 +108,8 @@ static void move_ptes(struct vm_area_str
set_pte_at(mm, new_addr, new_pte, pte);
}

+ if (new_ptl != old_ptl)
+ spin_unlock(new_ptl);
pte_unmap_nested(new_pte - 1);
pte_unmap_unlock(old_pte - 1, old_ptl);
if (mapping)
--- mm6/mm/rmap.c 2005-10-17 12:05:40.000000000 +0100
+++ mm7/mm/rmap.c 2005-10-22 14:07:25.000000000 +0100
@@ -274,7 +274,7 @@ pte_t *page_check_address(struct page *p
return NULL;
}

- ptl = &mm->page_table_lock;
+ ptl = pte_lockptr(mm, pmd);
spin_lock(ptl);
if (pte_present(*pte) && page_to_pfn(page) == pte_pfn(*pte)) {
*ptlp = ptl;