Re: [PATCH 4/5] mm: Do early cow for pinned pages during fork() for ptes

From: Oleg Nesterov
Date: Tue Sep 22 2020 - 08:40:34 EST


On 09/22, Oleg Nesterov wrote:
>
> On 09/21, Peter Xu wrote:
> >
> > @@ -859,6 +989,25 @@ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
> > spin_needbreak(src_ptl) || spin_needbreak(dst_ptl))
> > break;
> > }
> > +
> > + if (unlikely(data.cow_new_page)) {
> > + /*
> > + * If cow_new_page set, we must be at the 2nd round of
> > + * a previous COPY_MM_BREAK_COW. Try to arm the new
> > + * page now. Note that in all cases page_break_cow()
> > + * will properly release the objects in copy_mm_data.
> > + */
> > + WARN_ON_ONCE(copy_ret != COPY_MM_BREAK_COW);
> > + if (pte_install_copied_page(dst_mm, new, src_pte,
> > + dst_pte, addr, rss,
> > + &data)) {
> > + /* We installed the pte successfully; move on */
> > + progress++;
> > + continue;
>
> I'm afraid I misread this patch too ;)
>
> But it seems to me in this case the main loop can really "leak"
> COPY_MM_BREAK_COW. Suppose the the next 31 pte's are pte_none() and
> need_resched() is true.
>
> No?

If yes, perhaps we can simplify the copy_ret/cow_new_page logic and make
it more explicit?

Something like below, on top of this patch...

Oleg.


--- x/mm/memory.c
+++ x/mm/memory.c
@@ -704,17 +704,6 @@
};
};

-static inline void page_release_cow(struct copy_mm_data *data)
-{
- /* The old page should only be released in page_duplicate() */
- WARN_ON_ONCE(data->cow_old_page);
-
- if (data->cow_new_page) {
- put_page(data->cow_new_page);
- data->cow_new_page = NULL;
- }
-}
-
/*
* Duplicate the page for this PTE. Returns zero if page copied (so we need to
* retry on the same PTE again to arm the copied page very soon), or negative
@@ -925,7 +914,7 @@

if (!pte_same(*src_pte, data->cow_oldpte)) {
/* PTE has changed under us. Release the page and retry */
- page_release_cow(data);
+ put_page(data->cow_new_page);
return false;
}

@@ -936,12 +925,6 @@
set_pte_at(dst_mm, addr, dst_pte, entry);
rss[mm_counter(new_page)]++;

- /*
- * Manually clear the new page pointer since we've moved ownership to
- * the newly armed PTE.
- */
- data->cow_new_page = NULL;
-
return true;
}

@@ -958,16 +941,12 @@
struct copy_mm_data data;

again:
- /* We don't reset this for COPY_MM_BREAK_COW */
- memset(&data, 0, sizeof(data));
-
-again_break_cow:
init_rss_vec(rss);

dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
if (!dst_pte) {
- /* Guarantee that the new page is released if there is */
- page_release_cow(&data);
+ if (unlikely(copy_ret == COPY_MM_BREAK_COW))
+ put_page(data.cow_new_page);
return -ENOMEM;
}
src_pte = pte_offset_map(src_pmd, addr);
@@ -978,6 +957,22 @@
arch_enter_lazy_mmu_mode();

progress = 0;
+ if (unlikely(copy_ret == COPY_MM_BREAK_COW)) {
+ /*
+ * Note that in all cases pte_install_copied_page()
+ * will properly release the objects in copy_mm_data.
+ */
+ copy_ret = COPY_MM_DONE;
+ if (pte_install_copied_page(dst_mm, new, src_pte,
+ dst_pte, addr, rss,
+ &data)) {
+ /* We installed the pte successfully; move on */
+ progress++;
+ goto next;
+ }
+ /* PTE changed. Retry this pte (falls through) */
+ }
+
do {
/*
* We are holding two locks at this point - either of them
@@ -990,24 +985,6 @@
break;
}

- if (unlikely(data.cow_new_page)) {
- /*
- * If cow_new_page set, we must be at the 2nd round of
- * a previous COPY_MM_BREAK_COW. Try to arm the new
- * page now. Note that in all cases page_break_cow()
- * will properly release the objects in copy_mm_data.
- */
- WARN_ON_ONCE(copy_ret != COPY_MM_BREAK_COW);
- if (pte_install_copied_page(dst_mm, new, src_pte,
- dst_pte, addr, rss,
- &data)) {
- /* We installed the pte successfully; move on */
- progress++;
- continue;
- }
- /* PTE changed. Retry this pte (falls through) */
- }
-
if (pte_none(*src_pte)) {
progress++;
continue;
@@ -1017,6 +994,7 @@
if (copy_ret != COPY_MM_DONE)
break;
progress += 8;
+next:
} while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);

arch_leave_lazy_mmu_mode();
@@ -1030,13 +1008,14 @@
case COPY_MM_SWAP_CONT:
if (add_swap_count_continuation(data.entry, GFP_KERNEL) < 0)
return -ENOMEM;
- break;
+ copy_ret = COPY_MM_DONE;
+ goto again;
case COPY_MM_BREAK_COW:
/* Do accounting onto parent mm directly */
ret = page_duplicate(src_mm, vma, addr, &data);
if (ret)
return ret;
- goto again_break_cow;
+ goto again;
case COPY_MM_DONE:
/* This means we're all good. */
break;