copy_page_range

Jakub Jelinek (jj@sunsite.ms.mff.cuni.cz)
Fri, 9 Jan 1998 09:10:21 +0100 (MET)


Hi Linus, hi linux-kernel!

Yesterday I just got upset again by seeing the assembly output of
copy_page_range. Yes, gcc does a poor job with inlining, but even if it
compiled optimal program, it would have several unnecessary things.
For instance there are several address and end variables, each of them
modified in all nested inline functions, so that obviously cannot be shared
in one common variable ever.
So I sat down and wrote a new copy_page_range without inlining.
I hope it is not less readable than the previous implementation (and anyway, how
often is somebody changing copy_page_range), hope goto purists aren't going
to kill me for two gotos.
It seems to work on my Ultra currently and apparently gives about 10-20usec
speedup for fork+exit (usually 266->250 usec).
It depends on pmd/pte tables being allocated with alignment equal to their
size, but as much as I've seen throughout the code, all architectures do
that, with the exception of sparc64 for pgd/pmd, but in that case it means
{pgd|pmd}_none so nothing breaks.

Looking at produced assembly on sparc64, I see it greatly decreased pseudo
register pressure, so it stopped thrashing with the stack -
sethi x, reg1
stx reg1, [%fp + y]
...
ldx [%fp + y], reg2
add reg2, z, reg2
stx reg2, [%fp + y]
...
etc. really doesn't perform very well. The old code used 11 stack slots
pretty often; the new code uses just two and never thrashes.

I think it could be a good advantage for all RISC platforms and even for
Intel it might win some performance.

Linus, is there any chance this could go into your 2.1.xx tree?
If yes, there are several other such constructs which could benefit from
the same tricks (like clear_page_tables).

Ok, here it is:

--- memory.c.jj2 Wed Dec 3 07:47:06 1997
+++ memory.c Thu Jan 8 11:13:11 1998
@@ -175,100 +175,13 @@
return 0;
}

-static inline void copy_one_pte(pte_t * old_pte, pte_t * new_pte, int cow)
-{
- pte_t pte = *old_pte;
- unsigned long page_nr;
-
- if (pte_none(pte))
- return;
- if (!pte_present(pte)) {
- swap_duplicate(pte_val(pte));
- set_pte(new_pte, pte);
- return;
- }
- page_nr = MAP_NR(pte_page(pte));
- if (page_nr >= max_mapnr || PageReserved(mem_map+page_nr)) {
- set_pte(new_pte, pte);
- return;
- }
- if (cow)
- pte = pte_wrprotect(pte);
- if (delete_from_swap_cache(&mem_map[page_nr]))
- pte = pte_mkdirty(pte);
- set_pte(new_pte, pte_mkold(pte));
- set_pte(old_pte, pte);
- atomic_inc(&mem_map[page_nr].count);
-}
-
-static inline int copy_pte_range(pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long address, unsigned long size, int cow)
-{
- pte_t * src_pte, * dst_pte;
- unsigned long end;
-
- if (pmd_none(*src_pmd))
- return 0;
- if (pmd_bad(*src_pmd)) {
- printk("copy_pte_range: bad pmd (%08lx)\n", pmd_val(*src_pmd));
- pmd_clear(src_pmd);
- return 0;
- }
- src_pte = pte_offset(src_pmd, address);
- if (pmd_none(*dst_pmd)) {
- if (!pte_alloc(dst_pmd, 0))
- return -ENOMEM;
- }
- dst_pte = pte_offset(dst_pmd, address);
- address &= ~PMD_MASK;
- end = address + size;
- if (end >= PMD_SIZE)
- end = PMD_SIZE;
- do {
- /* I would like to switch arguments here, to make it
- * consistent with copy_xxx_range and memcpy syntax.
- */
- copy_one_pte(src_pte++, dst_pte++, cow);
- address += PAGE_SIZE;
- } while (address < end);
- return 0;
-}
-
-static inline int copy_pmd_range(pgd_t *dst_pgd, pgd_t *src_pgd, unsigned long address, unsigned long size, int cow)
-{
- pmd_t * src_pmd, * dst_pmd;
- unsigned long end;
- int error = 0;
-
- if (pgd_none(*src_pgd))
- return 0;
- if (pgd_bad(*src_pgd)) {
- printk("copy_pmd_range: bad pgd (%08lx)\n", pgd_val(*src_pgd));
- pgd_clear(src_pgd);
- return 0;
- }
- src_pmd = pmd_offset(src_pgd, address);
- if (pgd_none(*dst_pgd)) {
- if (!pmd_alloc(dst_pgd, 0))
- return -ENOMEM;
- }
- dst_pmd = pmd_offset(dst_pgd, address);
- address &= ~PGDIR_MASK;
- end = address + size;
- if (end > PGDIR_SIZE)
- end = PGDIR_SIZE;
- do {
- error = copy_pte_range(dst_pmd++, src_pmd++, address, end - address, cow);
- if (error)
- break;
- address = (address + PMD_SIZE) & PMD_MASK;
- } while (address < end);
- return error;
-}
-
/*
* copy one vm_area from one task to the other. Assumes the page tables
* already present in the new task to be cleared in the whole range
* covered by this vma.
+ *
+ * 08Jan98 Merged into one routine from several inline routines to reduce
+ * variable count and make things faster. -jj
*/
int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
struct vm_area_struct *vma)
@@ -276,18 +189,94 @@
pgd_t * src_pgd, * dst_pgd;
unsigned long address = vma->vm_start;
unsigned long end = vma->vm_end;
- int error = 0, cow;
-
- cow = (vma->vm_flags & (VM_SHARED | VM_WRITE)) == VM_WRITE;
+ unsigned long cow = (vma->vm_flags & (VM_SHARED | VM_WRITE)) == VM_WRITE;
+
src_pgd = pgd_offset(src, address);
dst_pgd = pgd_offset(dst, address);
- while (address < end) {
- error = copy_pmd_range(dst_pgd++, src_pgd++, address, end - address, cow);
- if (error)
- break;
- address = (address + PGDIR_SIZE) & PGDIR_MASK;
+
+ if (address >= end) return 0; /* Isn't this superfluous test?? */
+
+ for (;; src_pgd++, dst_pgd++) {
+ pmd_t * src_pmd, * dst_pmd;
+
+ /* copy_pmd_range */
+
+ if (pgd_none(*src_pgd))
+ continue;
+ if (pgd_bad(*src_pgd)) {
+ printk("copy_pmd_range: bad pgd (%08lx)\n",
+ pgd_val(*src_pgd));
+ pgd_clear(src_pgd);
+ continue;
+ }
+ if (pgd_none(*dst_pgd)) {
+ if (!pmd_alloc(dst_pgd, 0))
+ return -ENOMEM;
+ }
+
+ src_pmd = pmd_offset(src_pgd, address);
+ dst_pmd = pmd_offset(dst_pgd, address);
+
+ do {
+ pte_t * src_pte, * dst_pte;
+
+ /* copy_pte_range */
+
+ if (pmd_none(*src_pmd))
+ goto cont_copy_pmd_range;
+ if (pmd_bad(*src_pmd)) {
+ printk("copy_pte_range: bad pmd (%08lx)\n",
+ pmd_val(*src_pmd));
+ pmd_clear(src_pmd);
+ goto cont_copy_pmd_range;
+ }
+ if (pmd_none(*dst_pmd)) {
+ if (!pte_alloc(dst_pmd, 0))
+ return -ENOMEM;
+ }
+
+ src_pte = pte_offset(src_pmd, address);
+ dst_pte = pte_offset(dst_pmd, address);
+
+ do {
+ pte_t pte = *src_pte;
+ unsigned long page_nr;
+
+ /* copy_one_pte */
+
+ if (pte_none(pte))
+ goto cont_copy_pte_range;
+ if (!pte_present(pte)) {
+ swap_duplicate(pte_val(pte));
+ set_pte(dst_pte, pte);
+ goto cont_copy_pte_range;
+ }
+ page_nr = MAP_NR(pte_page(pte));
+ if (page_nr >= max_mapnr ||
+ PageReserved(mem_map+page_nr)) {
+ set_pte(dst_pte, pte);
+ goto cont_copy_pte_range;
+ }
+ if (cow)
+ pte = pte_wrprotect(pte);
+ if (delete_from_swap_cache(&mem_map[page_nr]))
+ pte = pte_mkdirty(pte);
+ set_pte(dst_pte, pte_mkold(pte));
+ set_pte(src_pte, pte);
+ atomic_inc(&mem_map[page_nr].count);
+
+cont_copy_pte_range: address += PAGE_SIZE;
+ if (address >= end)
+ return 0;
+ src_pte++;
+ dst_pte++;
+ } while ((unsigned long)src_pte & (PTE_TABLE_SIZE - sizeof(pte_t)));
+
+cont_copy_pmd_range: src_pmd++;
+ dst_pmd++;
+ } while ((unsigned long)src_pmd & (PMD_TABLE_SIZE - sizeof(pmd_t)));
}
- return error;
+ return 0;
}

/*

Cheers,
Jakub
___________________________________________________________________
Jakub Jelinek | jj@sunsite.mff.cuni.cz | http://sunsite.mff.cuni.cz
Administrator of SunSITE Czech Republic, MFF, Charles University
___________________________________________________________________
Ultralinux - first 64bit OS to take full power of the UltraSparc
Linux version 2.1.77 on a sparc64 machine (331.41 BogoMips).
___________________________________________________________________