Re: [PATCH] fix pgd_lock deadlock

From: Andrea Arcangeli
Date: Wed Feb 23 2011 - 23:22:48 EST


On Tue, Feb 22, 2011 at 02:22:53PM +0000, Jan Beulich wrote:
> >>> On 22.02.11 at 14:49, Andrea Arcangeli <aarcange@xxxxxxxxxx> wrote:
> > On Tue, Feb 22, 2011 at 07:48:54AM +0000, Jan Beulich wrote:
> >> A possible alternative would be to acquire the page table lock
> >> in vmalloc_sync_all() only in the Xen case (perhaps by storing
> >> NULL into page->index in pgd_set_mm() when not running on
> >> Xen). This is utilizing the fact that there aren't (supposed to
> >> be - for non-pvops this is definitely the case) any TLB flush IPIs
> >> under Xen, and hence the race you're trying to fix doesn't
> >> exist there (while non-Xen doesn't need the extra locking).
> >
> > That's sure ok with me. Can we use a global runtime to check if the
> > guest is running under Xen paravirt, instead of passing that info
> > through page->something?
>
> If everyone's okay with putting a couple of "if (xen_pv_domain())"
> into mm/fault.c - sure. I would have thought that this wouldn't be
> liked, hence the suggestion to make this depend on seeing the
> backlink be non-NULL.

What about this? The page->private logic gets optimized away at
compile time with XEN=n.

The removal of _irqsave from pgd_lock, I'll delay it as it's no bugfix
anymore.

===
Subject: xen: stop taking the page_table_lock with irq disabled

From: Andrea Arcangeli <aarcange@xxxxxxxxxx>

It's forbidden to take the page_table_lock with the irq disabled or if there's
contention the IPIs (for tlb flushes) sent with the page_table_lock held will
never run leading to a deadlock.

Only Xen needs the page_table_lock and Xen won't need IPI TLB flushes hence the
deadlock doesn't exist for Xen.

Signed-off-by: Andrea Arcangeli <aarcange@xxxxxxxxxx>
---
arch/x86/include/asm/pgtable.h | 5 +++--
arch/x86/mm/fault.c | 10 ++++++----
arch/x86/mm/init_64.c | 10 ++++++----
arch/x86/mm/pgtable.c | 9 +++------
4 files changed, 18 insertions(+), 16 deletions(-)

--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -235,14 +235,16 @@ void vmalloc_sync_all(void)

spin_lock_irqsave(&pgd_lock, flags);
list_for_each_entry(page, &pgd_list, lru) {
- spinlock_t *pgt_lock;
+ struct mm_struct *mm;
pmd_t *ret;

- pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
+ mm = pgd_page_get_mm(page);

- spin_lock(pgt_lock);
+ if (mm)
+ spin_lock(&mm->page_table_lock);
ret = vmalloc_sync_one(page_address(page), address);
- spin_unlock(pgt_lock);
+ if (mm)
+ spin_unlock(&mm->page_table_lock);

if (!ret)
break;
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -114,11 +114,12 @@ void sync_global_pgds(unsigned long star
spin_lock_irqsave(&pgd_lock, flags);
list_for_each_entry(page, &pgd_list, lru) {
pgd_t *pgd;
- spinlock_t *pgt_lock;
+ struct mm_struct *mm;

pgd = (pgd_t *)page_address(page) + pgd_index(address);
- pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
- spin_lock(pgt_lock);
+ mm = pgd_page_get_mm(page);
+ if (mm)
+ spin_lock(&mm->page_table_lock);

if (pgd_none(*pgd))
set_pgd(pgd, *pgd_ref);
@@ -126,7 +127,8 @@ void sync_global_pgds(unsigned long star
BUG_ON(pgd_page_vaddr(*pgd)
!= pgd_page_vaddr(*pgd_ref));

- spin_unlock(pgt_lock);
+ if (mm)
+ spin_unlock(&mm->page_table_lock);
}
spin_unlock_irqrestore(&pgd_lock, flags);
}
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -4,6 +4,7 @@
#include <asm/pgtable.h>
#include <asm/tlb.h>
#include <asm/fixmap.h>
+#include <xen/xen.h>

#define PGALLOC_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO

@@ -91,12 +92,8 @@ static inline void pgd_list_del(pgd_t *p
static void pgd_set_mm(pgd_t *pgd, struct mm_struct *mm)
{
BUILD_BUG_ON(sizeof(virt_to_page(pgd)->index) < sizeof(mm));
- virt_to_page(pgd)->index = (pgoff_t)mm;
-}
-
-struct mm_struct *pgd_page_get_mm(struct page *page)
-{
- return (struct mm_struct *)page->index;
+ if (xen_pv_domain())
+ virt_to_page(pgd)->index = (pgoff_t)mm;
}

static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd)
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -28,8 +28,6 @@ extern unsigned long empty_zero_page[PAG
extern spinlock_t pgd_lock;
extern struct list_head pgd_list;

-extern struct mm_struct *pgd_page_get_mm(struct page *page);
-
#ifdef CONFIG_PARAVIRT
#include <asm/paravirt.h>
#else /* !CONFIG_PARAVIRT */
@@ -83,6 +81,9 @@ extern struct mm_struct *pgd_page_get_mm

#endif /* CONFIG_PARAVIRT */

+#define pgd_page_get_mm(__page) \
+ ((struct mm_struct *)(xen_pv_domain() ? (__page)->index : 0))
+
/*
* The following only work if pte_present() is true.
* Undefined behaviour if not..
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/