Re: [PATCH 30 of 66] transparent hugepage core

From: Andrea Arcangeli
Date: Tue Dec 07 2010 - 16:25:43 EST


On Thu, Nov 18, 2010 at 03:12:21PM +0000, Mel Gorman wrote:
> All that seems fine to me. Nits in part that are simply not worth
> calling out. In principle, I Agree With This :)

I didn't understand what is not worth calling out, but I like the end
of your sentence ;).

> > +#define wait_split_huge_page(__anon_vma, __pmd)		\
> > +	do {							\
> > +		pmd_t *____pmd = (__pmd);			\
> > +		spin_unlock_wait(&(__anon_vma)->root->lock);	\
> > +		/*						\
> > +		 * spin_unlock_wait() is just a loop in C and so the \
> > +		 * CPU can reorder anything around it.		\
> > +		 */						\
> > +		smp_mb();					\
>
> Just a note as I see nothing wrong with this but that's a good spot. The
> unlock isn't a memory barrier. Out of curiosity, does it really need to be
> a full barrier or would a write barrier have been enough?
>
> > +		BUG_ON(pmd_trans_splitting(*____pmd) ||		\
> > +		       pmd_trans_huge(*____pmd));		\

spin_unlock_wait() only reads, and the BUG_ON only reads, so even if we
ignore what happens before and after wait_split_huge_page(), at most a
read memory barrier could be needed there. It can't be a write memory
barrier, as spin_unlock_wait() would be free to pass that too.

I think it had better be a full memory barrier, to be sure the writes
after wait_split_huge_page() returns don't happen before
spin_unlock_wait(). It's hard to see how that could happen in practice,
though.
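
To make the concern concrete, here is a minimal sketch (the caller and
the store are hypothetical, they're not from the patch):

	wait_split_huge_page(vma->anon_vma, pmd);
	/*
	 * Suppose the caller now writes to the page tables that
	 * split_huge_page() has been rewriting. Without the smp_mb()
	 * in the macro, the CPU could reorder this store before the
	 * spin_unlock_wait() read loop, i.e. while the splitter may
	 * still hold the lock and be working on the page tables.
	 */
	set_pte_at(mm, address, pte, entry);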

> > +	} while (0)
> > +#define HPAGE_PMD_ORDER (HPAGE_PMD_SHIFT-PAGE_SHIFT)
> > +#define HPAGE_PMD_NR (1<<HPAGE_PMD_ORDER)
> > +#if HPAGE_PMD_ORDER > MAX_ORDER
> > +#error "hugepages can't be allocated by the buddy allocator"
> > +#endif
> > +
> > +extern unsigned long vma_address(struct page *page, struct vm_area_struct *vma);
> > +static inline int PageTransHuge(struct page *page)
> > +{
> > +	VM_BUG_ON(PageTail(page));
> > +	return PageHead(page);
> > +}
>
> gfp.h seems an odd place for these. Should the flags go in page-flags.h
> and maybe put vma_address() in internal.h?
>
> Not a biggie.

Cleaned up, thanks.

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -97,13 +97,6 @@ extern void __split_huge_page_pmd(struct
#if HPAGE_PMD_ORDER > MAX_ORDER
#error "hugepages can't be allocated by the buddy allocator"
#endif
-
-extern unsigned long vma_address(struct page *page, struct vm_area_struct *vma);
-static inline int PageTransHuge(struct page *page)
-{
-	VM_BUG_ON(PageTail(page));
-	return PageHead(page);
-}
#else /* CONFIG_TRANSPARENT_HUGEPAGE */
#define HPAGE_PMD_SHIFT ({ BUG(); 0; })
#define HPAGE_PMD_MASK ({ BUG(); 0; })
@@ -120,7 +113,6 @@ static inline int split_huge_page(struct
 	do { } while (0)
 #define wait_split_huge_page(__anon_vma, __pmd)	\
 	do { } while (0)
-#define PageTransHuge(page) 0
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

#endif /* _LINUX_HUGE_MM_H */
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -409,6 +409,19 @@ static inline void ClearPageCompound(str

#endif /* !PAGEFLAGS_EXTENDED */

+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static inline int PageTransHuge(struct page *page)
+{
+	VM_BUG_ON(PageTail(page));
+	return PageHead(page);
+}
+#else
+static inline int PageTransHuge(struct page *page)
+{
+	return 0;
+}
+#endif
+
#ifdef CONFIG_MMU
#define __PG_MLOCKED (1 << PG_mlocked)
#else
diff --git a/mm/internal.h b/mm/internal.h
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -134,6 +134,10 @@ static inline void mlock_migrate_page(st
 	}
 }

+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+extern unsigned long vma_address(struct page *page,
+				 struct vm_area_struct *vma);
+#endif
#else /* !CONFIG_MMU */
static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p)
{

> > diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
> > --- a/include/linux/mm_inline.h
> > +++ b/include/linux/mm_inline.h
> > @@ -20,11 +20,18 @@ static inline int page_is_file_cache(str
> > }
> >
> > static inline void
> > +__add_page_to_lru_list(struct zone *zone, struct page *page, enum lru_list l,
> > +			       struct list_head *head)
> > +{
> > +	list_add(&page->lru, head);
> > +	__inc_zone_state(zone, NR_LRU_BASE + l);
> > +	mem_cgroup_add_lru_list(page, l);
> > +}
> > +
> > +static inline void
> > add_page_to_lru_list(struct zone *zone, struct page *page, enum lru_list l)
> > {
> > -	list_add(&page->lru, &zone->lru[l].list);
> > -	__inc_zone_state(zone, NR_LRU_BASE + l);
> > -	mem_cgroup_add_lru_list(page, l);
> > +	__add_page_to_lru_list(zone, page, l, &zone->lru[l].list);
> > }
> >
>
> Do these really need to be in a public header or can they move to
> mm/swap.c?

The above quoted change is a noop as far as the old code is concerned;
moving it to swap.c would alter the old code. list_add() and
__mod_zone_page_state() are pretty small and fast, so it's probably
worth keeping them inline.
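
As a sketch of the kind of caller the __ variant enables (hypothetical,
names made up): when splitting a hugepage, the tail pages can be queued
right after the head page on the LRU instead of at the list head:

	__add_page_to_lru_list(zone, page_tail, lru, &page_head->lru);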

> > +static void prepare_pmd_huge_pte(pgtable_t pgtable,
> > +				 struct mm_struct *mm)
> > +{
> > +	VM_BUG_ON(spin_can_lock(&mm->page_table_lock));
> > +
>
> assert_spin_locked() ?

Changed.
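
Presumably to something along these lines (a sketch, rest of the body
elided):

	static void prepare_pmd_huge_pte(pgtable_t pgtable,
					 struct mm_struct *mm)
	{
		/* the caller must already hold the page_table_lock */
		assert_spin_locked(&mm->page_table_lock);
		...
	}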

> > +int handle_pte_fault(struct mm_struct *mm,
> > +		     struct vm_area_struct *vma, unsigned long address,
> > +		     pte_t *pte, pmd_t *pmd, unsigned int flags)
> >  {
> >  	pte_t entry;
> >  	spinlock_t *ptl;
> > @@ -3222,9 +3257,40 @@ int handle_mm_fault(struct mm_struct *mm
> >  	pmd = pmd_alloc(mm, pud, address);
> >  	if (!pmd)
> >  		return VM_FAULT_OOM;
> > -	pte = pte_alloc_map(mm, vma, pmd, address);
> > -	if (!pte)
> > +	if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) {
> > +		if (!vma->vm_ops)
> > +			return do_huge_pmd_anonymous_page(mm, vma, address,
> > +							  pmd, flags);
> > +	} else {
> > +		pmd_t orig_pmd = *pmd;
> > +		barrier();
>
> What is this barrier for?

This guarantees that gcc doesn't re-read *pmd after the barrier and
instead always reads it from the orig_pmd variable on the local kernel
stack. gcc doesn't know *pmd can still change from under us until we
take some lock, and the code relies on orig_pmd not changing after
barrier().
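
To illustrate the hazard (a hypothetical sketch, not the patch code):
without the barrier(), gcc is free to substitute a fresh load of *pmd
for each use of orig_pmd, so two consecutive checks could observe two
different pmd values while split_huge_page() runs concurrently:

	pmd_t orig_pmd = *pmd;
	/* no barrier(): gcc may turn each use below into a reload */
	if (pmd_trans_huge(orig_pmd) &&		/* may reload *pmd here */
	    !pmd_trans_splitting(orig_pmd))	/* ... and again here */
		/* the two checks may have acted on inconsistent values */
		do_something();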

> Other than a few minor questions, this seems very similar to what you
> had before. There is a lot going on in this patch but I did not find
> anything wrong.
>
> Acked-by: Mel Gorman <mel@xxxxxxxxx>

Great, thanks!
Andrea