I made a patch to support the Alpha's multi-granularity TLB.
With this patch the kernel can map up to 4MB of contiguous space
with a single TLB entry. I benchmarked it on my Alpha box (EB164) with
a large matrix transpose, and got 30 to 40% higher performance
than with the conventional kernel. This is because excessive TLB misses
occurred in that benchmark under the conventional kernel, whereas
with my patch the Alpha can map up to 256MB (4MB x 64 TLB entries)
at once, which results in a drastic reduction of misses. Many
high performance applications suffer from a high rate of TLB misses,
and my patch will resolve that.
So I think it is worth taking.
It also works fine with 2.4.0-test2.
Please CC my email address nshimizu@et.u-tokai.ac.jp because I usually
read the kernel traffic on the web site.
I have attached the patch itself and the benchmark program.
Suggestions and comments are very welcome.
Thank you,
Naohiko Shimizu
diff -urN linux-2.4.0-test1/Documentation/page_gh.txt linux/Documentation/page_gh.txt
--- linux-2.4.0-test1/Documentation/page_gh.txt Thu Jan 1 09:00:00 1970
+++ linux/Documentation/page_gh.txt Sat Jul 1 16:18:19 2000
@@ -0,0 +1,24 @@
+The Alpha architecture defines Granularity Hint (GH) bits in the
+Page Table Entry (PTE). If these bits are set to a non-zero value,
+they supply a hint to translation buffer implementations that
+a block of pages can be treated as a single larger page.
+This means that even without a variable-length page mechanism,
+we have the opportunity to reduce translation misses.
+For HPC applications with large working sets, the performance
+degradation caused by translation misses should be avoided.
+So if we can use this feature, many HPC applications will
+benefit.
+
+In this release there is a configuration option to support this
+feature for the Alpha architecture. You can turn it on by setting
+the CONFIG_PAGE_GH variable. With this release, GH bits are set in
+your page table when you call the brk system call, or the mmap system
+call with the MAP_ANONYMOUS and MAP_PRIVATE flags set. So if you want
+your program to run at full speed, you should allocate memory with
+these calls (or, more generally, with the malloc library call).
+The GH bits are only set at heap allocation; after that, if any process
+changes the table, the bits are dropped for safety. You may want to
+disable swapping for sustained high-performance operation.
+
+
+Naohiko Shimizu<nshimizu@keyaki.cc.u-tokai.ac.jp>
diff -urN linux-2.4.0-test1/arch/alpha/config.in linux/arch/alpha/config.in
--- linux-2.4.0-test1/arch/alpha/config.in Tue Mar 28 07:18:32 2000
+++ linux/arch/alpha/config.in Sat Jul 1 16:18:19 2000
@@ -184,6 +184,10 @@
bool 'Symmetric multi-processing support' CONFIG_SMP
fi
+bool 'PAGE Granularity Hint support' CONFIG_PAGE_GH
+dep_bool 'Set GH for mmap area' CONFIG_PAGE_GH_MMAP $CONFIG_PAGE_GH
+dep_bool 'Set GH for brk area' CONFIG_PAGE_GH_BRK $CONFIG_PAGE_GH
+
source drivers/pci/Config.in
bool 'Support for hot-pluggable devices' CONFIG_HOTPLUG
diff -urN linux-2.4.0-test1/arch/alpha/mm/init.c linux/arch/alpha/mm/init.c
--- linux-2.4.0-test1/arch/alpha/mm/init.c Tue Apr 25 05:39:34 2000
+++ linux/arch/alpha/mm/init.c Sat Jul 1 16:18:19 2000
@@ -5,6 +5,7 @@
*/
/* 2.3.x zone allocator, 1999 Andrea Arcangeli <andrea@suse.de> */
+/* PAGE_GH support, 2000 Naohiko Shimizu <nshimizu@keyaki.cc.u-tokai.ac.jp> */
#include <linux/config.h>
#include <linux/signal.h>
@@ -30,6 +31,11 @@
#include <asm/hwrpb.h>
#include <asm/dma.h>
#include <asm/mmu_context.h>
+
+#ifdef CONFIG_PAGE_GH
+int page_gh_order[] = {0,3,6,9};
+pgprot_t page_gh_prot[] = {0x0000,0x0020,0x0040,0x0060};
+#endif
static unsigned long totalram_pages;
diff -urN linux-2.4.0-test1/include/asm-alpha/pgtable.h linux/include/asm-alpha/pgtable.h
--- linux-2.4.0-test1/include/asm-alpha/pgtable.h Wed Mar 22 03:46:21 2000
+++ linux/include/asm-alpha/pgtable.h Sat Jul 1 16:38:34 2000
@@ -18,7 +18,10 @@
* within a page table are directly modified. Thus, the following
* hook is made available.
*/
-#define set_pte(pteptr, pteval) ((*(pteptr)) = (pteval))
+/*
+ * PAGE_GH support added by Naohiko Shimizu
+ <nshimizu@keyaki.cc.u-tokai.ac.jp>
+ */
/* PMD_SHIFT determines the size of the area a second-level page table can map */
#define PMD_SHIFT (PAGE_SHIFT + (PAGE_SHIFT-3))
@@ -217,9 +220,7 @@
extern inline unsigned long pgd_page(pgd_t pgd)
{ return PAGE_OFFSET + ((pgd_val(pgd) & _PFN_MASK) >> (32-PAGE_SHIFT)); }
-extern inline int pte_none(pte_t pte) { return !pte_val(pte); }
extern inline int pte_present(pte_t pte) { return pte_val(pte) & _PAGE_VALID; }
-extern inline void pte_clear(pte_t *ptep) { pte_val(*ptep) = 0; }
extern inline int pmd_none(pmd_t pmd) { return !pmd_val(pmd); }
extern inline int pmd_bad(pmd_t pmd) { return (pmd_val(pmd) & ~_PFN_MASK) != _PAGE_TABLE; }
@@ -232,6 +233,36 @@
extern inline void pgd_clear(pgd_t * pgdp) { pgd_val(*pgdp) = 0; }
#define page_address(page) ((page)->virtual)
+
+#ifndef CONFIG_PAGE_GH
+#define set_pte(pteptr, pteval) ((*(pteptr)) = (pteval))
+extern inline int pte_none(pte_t pte) { return !(pte_val(pte)); }
+extern inline void pte_clear(pte_t *ptep) { pte_val(*ptep)=0; }
+#else
+#define PAGE_GH_MASK 0x0060
+#define PAGE_GH_MASK_SHIFT 5
+#define PAGE_GH_NR 4
+extern int page_gh_order[];
+extern pgprot_t page_gh_prot[];
+extern inline int pte_none(pte_t pte) { return !(pte_val(pte) & ~PAGE_GH_MASK); }
+#define pte_to_gh_index(x) ((pte_val(x) & PAGE_GH_MASK) >> PAGE_GH_MASK_SHIFT)
+extern inline pte_t mk_pte_gh_clean(pte_t pte) {pte_val(pte) &= ~PAGE_GH_MASK; return pte;}
+extern inline void clear_pte_gh(pte_t *pteptr, int order) {
+ int i;
+ pte_t *addr = (pte_t *)((unsigned long) pteptr & ~((1UL<<(order + SIZEOF_PTR_LOG2)) - 1));
+ for ( i=0; i < 1<<order; i++)
+ pte_val(*(addr+i)) &= ~PAGE_GH_MASK;
+ }
+extern inline void set_pte_raw(pte_t *pteptr, pte_t pteval) {
+ if ( pte_present(*pteptr) && ( pte_val(*pteptr) & PAGE_GH_MASK ))
+ clear_pte_gh(pteptr, page_gh_order[pte_to_gh_index(*pteptr)]);
+ *pteptr = pteval;
+}
+extern inline void set_pte(pte_t *pteptr, pte_t pteval) {
+ set_pte_raw(pteptr, mk_pte_gh_clean(pteval));
+}
+extern inline void pte_clear(pte_t *ptep) { pte_t pte; pte_val(pte)=0; set_pte(ptep, pte); }
+#endif
/*
* The following only work if pte_present() is true.
diff -urN linux-2.4.0-test1/include/linux/mm.h linux/include/linux/mm.h
--- linux-2.4.0-test1/include/linux/mm.h Thu May 25 11:52:42 2000
+++ linux/include/linux/mm.h Sat Jul 1 17:19:52 2000
@@ -312,6 +312,13 @@
extern struct page * FASTCALL(__alloc_pages(zonelist_t *zonelist, unsigned long order));
extern struct page * alloc_pages_node(int nid, int gfp_mask, unsigned long order);
+#ifndef CONFIG_PAGE_GH
+#define PAGE_GH_NR 1
+#define PAGE_GH_MASK (pgprot_t)0
+#define pte_to_gh_index(x) (0)
+#define clear_pte_gh(pte, order)
+#endif
+
#ifndef CONFIG_DISCONTIGMEM
static inline struct page * alloc_pages(int gfp_mask, unsigned long order)
{
@@ -406,6 +413,7 @@
extern void vmtruncate(struct inode * inode, loff_t offset);
extern int handle_mm_fault(struct mm_struct *mm,struct vm_area_struct *vma, unsigned long address, int write_access);
extern int make_pages_present(unsigned long addr, unsigned long end);
+extern int make_ptes_present(unsigned long addr, unsigned long end);
extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write);
extern int ptrace_readdata(struct task_struct *tsk, unsigned long src, char *dst, int len);
extern int ptrace_writedata(struct task_struct *tsk, char * src, unsigned long dst, int len);
@@ -421,6 +429,15 @@
extern void show_mem(void);
extern void si_meminfo(struct sysinfo * val);
extern void swapin_readahead(swp_entry_t);
+
+extern void __break_area (zonelist_t *zonelist, struct page *page, unsigned long order);
+static inline void break_area(int gfp_mask, struct page *page, unsigned long order)
+{
+ /* temporary check. */
+ if (contig_page_data.node_zonelists[gfp_mask].gfp_mask != (gfp_mask))
+ BUG();
+ __break_area(contig_page_data.node_zonelists+(gfp_mask), page, order);
+}
/* mmap.c */
extern void vma_init(void);
diff -urN linux-2.4.0-test1/mm/memory.c linux/mm/memory.c
--- linux-2.4.0-test1/mm/memory.c Tue May 16 04:00:33 2000
+++ linux/mm/memory.c Sat Jul 1 17:04:49 2000
@@ -34,6 +34,8 @@
*
* 16.07.99 - Support of BIGMEM added by Gerhard Wichert, Siemens AG
* (Gerhard.Wichert@pdb.siemens.de)
+ * 13.06.00 - Support of PAGE_GH added by Naohiko Shimizu, Tokai Univ.
+ <nshimizu@keyaki.cc.u-tokai.ac.jp>
*/
#include <linux/mm.h>
@@ -53,6 +55,11 @@
void * high_memory;
struct page *highmem_start_page;
+#ifndef CONFIG_PAGE_GH
+int page_gh_order[] = {0};
+pgprot_t page_gh_prot[] = {0x00};
+#endif
+
/*
* We special-case the C-O-W ZERO_PAGE, because it's such
* a common occurrence (no need to read the page to know
@@ -211,7 +218,7 @@
do {
pte_t pte = *src_pte;
unsigned long page_nr;
-
+
/* copy_one_pte */
if (pte_none(pte))
@@ -1079,20 +1086,41 @@
static int do_anonymous_page(struct mm_struct * mm, struct vm_area_struct * vma, pte_t *page_table, int write_access, unsigned long addr)
{
int high = 0;
+ int i;
+ unsigned long ghaddr;
struct page *page = NULL;
+ pte_t oldpte = *page_table;
+ int order = page_gh_order[pte_to_gh_index(oldpte)];
pte_t entry = pte_wrprotect(mk_pte(ZERO_PAGE(addr), vma->vm_page_prot));
+/*
+ * We allocate a block of pages if the oldpte has the GH bit. This
+ * routine is called with empty oldpte but GH bits.
+ */
if (write_access) {
- page = alloc_page(GFP_HIGHUSER);
- if (!page)
+ pte_t *wktable = (pte_t *)((unsigned long)page_table & ~((1UL << (order + SIZEOF_PTR_LOG2)) -1));
+ page = alloc_pages(GFP_HIGHUSER, order);
+ if (!page) {
+ if (order) {
+ clear_pte_gh(page_table, order);
+ return do_anonymous_page(mm, vma, page_table, write_access, addr);
+ } else
return -1;
+ }
+ if (order) break_area(GFP_HIGHUSER, page, order);
if (PageHighMem(page))
high = 1;
- clear_user_highpage(page, addr);
- entry = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
- mm->rss++;
- flush_page_to_ram(page);
- }
- set_pte(page_table, entry);
+ ghaddr = addr & ~((PAGE_SIZE << order) - 1);
+ for (i=0; i < 1 << order; i++) {
+ entry = pte_mkwrite(pte_mkdirty(mk_pte(page+i, __pgprot(pgprot_val(vma->vm_page_prot) | pgprot_val(page_gh_prot[pte_to_gh_index(oldpte)])))));
+// if(i==1) printk("anonymous page %p %p %p %p %x %xl\n", page_table, wktable+i, (void *)pte_val(*(wktable+i)), (void *)pte_val(entry), order, pgprot_val(vma->vm_page_prot));
+ clear_user_highpage(page+i, ghaddr);
+ mm->rss++;
+ ghaddr += PAGE_SIZE;
+ set_pte_raw(wktable+i, entry);
+ flush_page_to_ram(page+i);
+ }
+ } else
+ set_pte(page_table, entry);
/* No need to invalidate - it was non-present before */
update_mmu_cache(vma, addr, entry);
return 1; /* Minor fault */
@@ -1243,5 +1271,68 @@
return -1;
addr += PAGE_SIZE;
} while (addr < end);
+ return 0;
+}
+
+/*
+ * Allocating PTEs for future fault handling.
+ */
+
+int set_gh_range(unsigned long address, int order, pgprot_t prot)
+{
+ int i;
+ pgd_t * dir;
+ pmd_t *pmd;
+ pte_t * pte;
+
+ dir = pgd_offset(current->mm, address);
+ pmd = pmd_alloc(dir, address);
+ if (!pmd)
+ return -ENOMEM;
+ address &= ~PGDIR_MASK;
+ pte = pte_alloc(pmd, address);
+ if (!pte)
+ return -ENOMEM;
+ for (i = 0; i < 1<<order; i ++) {
+ set_pte_raw(pte, pte_modify(*pte, prot));
+ pte++;
+ }
+ return 0;
+}
+
+/*
+ * Simplistic new page table allocation for sys_brk..
+ * Only GH bit != 0 tables will be allocated.
+ * At this time, we will not allocate real storage, it remains
+ * for the page_fault handler.
+ */
+int make_ptes_present(unsigned long addr, unsigned long end)
+{
+ int i;
+ unsigned long rem;
+ if (addr >= end)
+ BUG();
+/*
+ * The first order(i=0) is the ordinary pte (1page), then we skip to
+ * allocate the pte.
+ */
+ for (i = 0; i < PAGE_GH_NR - 1; i++) {
+ rem = (~addr + 1) & ((PAGE_SIZE << page_gh_order[i+1]) - 1);
+ while (rem &&
+ (addr & ((PAGE_SIZE << page_gh_order[i]) - 1)) == 0UL &&
+ ((end - addr ) >= (PAGE_SIZE << page_gh_order[i]))) {
+ if(i) set_gh_range(addr, page_gh_order[i], page_gh_prot[i]);
+ addr += PAGE_SIZE << page_gh_order[i];
+ rem -= PAGE_SIZE << page_gh_order[i];
+ };
+ }
+ for (i = PAGE_GH_NR - 1; i > 0; i--) {
+ while (
+ (addr & ((PAGE_SIZE << page_gh_order[i]) - 1)) == 0UL &&
+ ((end - addr ) >= (PAGE_SIZE << page_gh_order[i]))) {
+ set_gh_range(addr, page_gh_order[i], page_gh_prot[i]);
+ addr += PAGE_SIZE << page_gh_order[i];
+ };
+ }
return 0;
}
diff -urN linux-2.4.0-test1/mm/mmap.c linux/mm/mmap.c
--- linux-2.4.0-test1/mm/mmap.c Thu Apr 27 01:11:32 2000
+++ linux/mm/mmap.c Sat Jul 1 16:18:19 2000
@@ -338,6 +338,17 @@
mm->locked_vm += len >> PAGE_SHIFT;
make_pages_present(addr, addr + len);
}
+ /*
+ * mmap may provide new pages and we have the chance to set the
+ * PAGE_GH bits in the TLB entries, then we call make_ptes_present.
+ * MAP_ANONYMOUS and MAP_PRIVATE flag gives a chance to tell us
+ * that it is a plain allocation.
+ * <N.Shimizu>
+ */
+#ifdef CONFIG_PAGE_GH_MMAP
+ if (flags & MAP_ANONYMOUS && flags & MAP_PRIVATE)
+ make_ptes_present(addr, addr + len);
+#endif
return addr;
unmap_and_free_vma:
@@ -819,6 +830,14 @@
mm->locked_vm += len >> PAGE_SHIFT;
make_pages_present(addr, addr + len);
}
+ /*
+ * brk provide new pages and we have the chance to set the
+ * PAGE_GH bits for TLB, then we call make_ptes_present.
+ * <N.Shimizu>
+ */
+#ifdef CONFIG_PAGE_GH_BRK
+ make_ptes_present(addr, addr + len);
+#endif
return addr;
}
diff -urN linux-2.4.0-test1/mm/page_alloc.c linux/mm/page_alloc.c
--- linux-2.4.0-test1/mm/page_alloc.c Tue May 16 08:47:34 2000
+++ linux/mm/page_alloc.c Mon Jul 3 16:19:48 2000
@@ -7,6 +7,7 @@
* Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
* Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
* Zone balancing, Kanoj Sarcar, SGI, Jan 2000
+ * Page Granularity Hint support, Naohiko Shimizu, Tokai Univ., Jul 2000
*/
#include <linux/config.h>
@@ -167,6 +168,26 @@
if (BAD_RANGE(zone,page))
BUG();
return page;
+}
+
+void __break_area (zonelist_t *zonelist, struct page *page, unsigned long order) {
+ zone_t **z = zonelist->zones;
+ zone_t *zone = *z;
+ unsigned long size = 1 << order;
+ int i,j;
+ unsigned int index = (page - mem_map) - zone->offset;
+ free_area_t * area = zone->free_area;
+ for ( i = 0; i < size; i++ ) {
+ MARK_USED(index + i, 0, area);
+ set_page_count(page + i, 1);
+ }
+ for ( j = 1; j < order; j++) {
+ area++;
+ for ( i = 0; i < size; i += 1<<j ) {
+ MARK_USED(index + i, j, area);
+ }
+ }
+ return;
}
static FASTCALL(struct page * rmqueue(zone_t *zone, unsigned long order));
/*
This program is for testing large stride data transfer
I put this program under the GPL.
Contact:Naohiko Shimizu,
School of Engineering, Tokai University.
1117 Kitakaname, Kanagawa 259-12 Japan
email:nshimizu@keyaki.cc.u-tokai.ac.jp
TEL: +81-463-58-1211(ext.4084)
FAX: +81-463-58-8320
<URL:http://shimizu-lab.et.u-tokai.ac.jp/>
*/
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <math.h>
#include <string.h>
#include <sys/time.h>
#include <sys/mman.h>
#include <sys/resource.h>
#define DIMENSION 1025
#define MINORLOOP 1
#define ITR 20
/* Resource-usage record refreshed by every call to dtime(). */
struct rusage rusage;

/*
 * Return the user-mode CPU time this process has consumed so far,
 * in seconds: the whole-second field plus the microsecond field
 * scaled by 1e-6.  The return value of getrusage() is not checked.
 */
double dtime()
{
    double whole_seconds;
    double micro_seconds;

    getrusage(RUSAGE_SELF, &rusage);
    whole_seconds = (double)rusage.ru_utime.tv_sec;
    micro_seconds = (double)rusage.ru_utime.tv_usec;
    return whole_seconds + micro_seconds * 1.0e-06;
}
/*
 * qsort() comparison callback ordering doubles ascending.
 *
 * a, b: pointers to the two double elements being compared.
 * Returns 0 when the values compare equal, 1 when *a > *b, and -1
 * otherwise.
 *
 * The parameters are const-qualified so that the function has exactly
 * the int (*)(const void *, const void *) type that qsort() expects;
 * passing a comparator with non-const parameters is an incompatible
 * function pointer conversion.
 */
int cmpdble(const void *a, const void *b) {
    const double lhs = *(const double *)a;
    const double rhs = *(const double *)b;

    if (lhs == rhs) {
        return 0;
    }
    if (lhs > rhs) {
        return 1;
    }
    return -1;
}
/*
 * Transpose benchmark with strided stores.
 *
 * Copies b into a transposed, ITR times.  In the inner loop the store
 * index a[col*dim + row] advances by dim elements per step while the
 * load index b[row*dim + col] advances by one element.
 *
 * a, b: arrays of at least dim*dim doubles each (both are zeroed here
 *       before timing starts).
 * dim:  matrix dimension.
 *
 * Each pass is timed with dtime(); the ITR samples are sorted ascending
 * and the mean of the samples with index in [ITR*0.1, ITR*0.5) is
 * returned, in seconds.
 */
double bench1(double *a, double *b, int dim) {
    double samples[ITR];
    double sum;
    double began, ended;
    int row, col, pass, k, used;

    /* Zero both matrices before any timing is done. */
    for (k = 0; k < dim*dim; k++) {
        b[k] = 0.0;
        a[k] = 0.0;
    }

    for (pass = 0; pass < ITR; pass++) {
        began = dtime();
        for (row = 0; row < dim; row++) {
            for (col = 0; col < dim*MINORLOOP; col++) {
                a[col*dim + row] = b[row*dim + col];
            }
        }
        ended = dtime();
        samples[pass] = ended - began;
    }

    qsort((void *)samples, ITR, sizeof(double), cmpdble);

    /* Average the sorted samples from index ITR*0.1 up to ITR*0.5. */
    sum = 0;
    used = 0;
    k = ITR*0.1;
    while (k < ITR*0.5) {
        sum += samples[k];
        used++;
        k++;
    }
    return sum/(double)(used);
}
/*
 * Transpose benchmark with contiguous stores.
 *
 * Copies b into a transposed, ITR times.  In the inner loop the store
 * index a[row*dim + col] advances by one element per step while the
 * load index b[col*dim + row] advances by dim elements.
 *
 * a, b: arrays of at least dim*dim doubles each (both are zeroed here
 *       before timing starts).
 * dim:  matrix dimension.
 *
 * Each pass is timed with dtime(); the ITR samples are sorted ascending
 * and the mean of the samples with index in [ITR*0.0, ITR*0.5) is
 * returned, in seconds.
 */
double bench2(double *a, double *b, int dim) {
    double samples[ITR];
    double sum;
    double began, ended;
    int row, col, pass, k, used;

    /* Zero both matrices before any timing is done. */
    for (k = 0; k < dim*dim; k++) {
        b[k] = 0.0;
        a[k] = 0.0;
    }

    for (pass = 0; pass < ITR; pass++) {
        began = dtime();
        for (row = 0; row < dim; row++) {
            for (col = 0; col < dim*MINORLOOP; col++) {
                a[row*dim + col] = b[col*dim + row];
            }
        }
        ended = dtime();
        samples[pass] = ended - began;
    }

    qsort((void *)samples, ITR, sizeof(double), cmpdble);

    /* Average the sorted samples from index ITR*0.0 up to ITR*0.5. */
    sum = 0;
    used = 0;
    k = ITR*0.0;
    while (k < ITR*0.5) {
        sum += samples[k];
        used++;
        k++;
    }
    return sum/(double)(used);
}
/*
 * Benchmark driver.
 *
 * Usage: prog [dim]
 *   dim: matrix dimension; defaults to DIMENSION when no argument is
 *        given.  Non-positive values are rejected.
 *
 * Runs bench1() and bench2() twice: first on a buffer obtained from
 * malloc(), then on a buffer obtained from sbrk().  Each buffer holds
 * two dim x dim matrices of doubles back to back.  The throughput of
 * every run is printed in MB/s.
 *
 * Exits with status 1 when the dimension is invalid or an allocation
 * fails, 0 otherwise.
 */
int main(int argc, char **argv)
{
    double *a, *b;
    int dim;
    size_t elems;          /* number of doubles in one matrix */
    double mbytes;         /* megabytes moved by one transpose pass */
    double time1, time2;

    if (argc > 1) dim = atoi(argv[1]);
    else dim = DIMENSION;
    if (dim <= 0) {
        printf("invalid dimension\n");
        exit(1);
    }

    /* Do the size arithmetic in size_t so dim*dim*2 cannot overflow int. */
    elems = (size_t)dim * (size_t)dim;
    mbytes = (double)elems * MINORLOOP * sizeof(double) * 1e-6;

    /*
     * NOTE(review): this buffer comes from malloc() but is reported as
     * "BRK area", while the sbrk() buffer below is reported as
     * "MMAP area".  The labels look swapped -- confirm against the C
     * library's large-allocation behaviour before relying on them.
     */
    a = malloc(sizeof(double) * elems * 2);
    if (a == NULL) {
        printf("not enough memory\n");
        exit(1);
    }
    /* Only derive b once a is known to be valid. */
    b = a + elems;
    time1 = bench1(a, b, dim);
    printf("BRK area(store stride): %5.2fMB/S\n", mbytes/time1);
    time2 = bench2(a, b, dim);
    free(a);
    printf("BRK area(store continuous): %5.2fMB/S\n", mbytes/time2);

    /* Optional: lock memory to keep swapping out of the measurement. */
    // if(mlockall(MCL_FUTURE)<0) {printf("memory can not be locked\n");exit(1);}

    a = (double *)sbrk(sizeof(double) * elems * 2);
    /* sbrk() reports failure as (void *)-1, never as NULL. */
    if (a == (void *)-1) {
        printf("not enough memory\n");
        exit(1);
    }
    b = a + elems;
    time1 = bench1(a, b, dim);
    printf("MMAP area(store stride): %5.2fMB/S\n", mbytes/time1);
    time2 = bench2(a, b, dim);
    printf("MMAP area(store continuous): %5.2fMB/S\n", mbytes/time2);
    munlockall();
    exit(0);
}
What's new:
The documentation file is at the top of the patch. The patch is only about 10 Kbytes, so feel free to download it.
Contact Information
Naohiko Shimizu-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.rutgers.edu
Please read the FAQ at http://www.tux.org/lkml/
This archive was generated by hypermail 2b29 : Fri Jul 07 2000 - 21:00:14 EST