Re: AMD optimizations?

Alan Cox (alan@lxorguk.ukuu.org.uk)
Thu, 26 Aug 1999 12:05:37 +0100 (BST)


> I believe I read in Alan's diary that he's written a 3dnow! version
> of the memory copy routines. As long as it doesn't use Athlon specific
> opcodes then that should speed things along on a K6-2 or K6-3 too.

From benchmarking it on my K6-II, it's the same speed from main memory as
using rep movs Intel style. It may be a little faster on cached memory. On
the K6-III it may be a win, but you would want to change the prefetch factor
a bit.

Start with this and see how well it compares to kernel memcpy performance
(which is rep movs):

#include <stdio.h>

/* Benchmark source buffer: 1 MiB (1 << 20 bytes), zero-filled at start-up. */
char space[1 << 20];
/* Benchmark destination buffer, the same size as the source buffer. */
char target[1 << 20];

/*
 * mmx_copy - copy one 4096-byte block with MMX moves and 3DNow! prefetch.
 *
 * from: start of the source block; 4096 bytes are read from it.
 * to:   start of the destination block; 4096 bytes are written to it.
 *
 * The x87/MMX state is saved with fsave before the MMX registers are used
 * and reloaded with frstor afterwards, so the function leaves the caller's
 * FPU state as it found it.
 *
 * The copy runs in 64-byte steps, and each step issues a prefetch 320 bytes
 * ahead of the current source position. Near the end of the block the
 * prefetch address therefore lies past the 4096 bytes being copied (by up to
 * 320 bytes); only the prefetch instruction touches those addresses, no load
 * does.
 *
 * `prefetch` is the AMD 3DNow! opcode, so the CPU has to accept it.
 */
void mmx_copy(char *from, char *to)
{
	enum {
		BLOCK_BYTES = 4096,	/* bytes copied per call */
		CHUNK_BYTES = 64	/* bytes moved per loop iteration */
	};
	/*
	 * fsave stores a 108-byte state image. Wrapping the image in a struct
	 * lets the asm operands below name the whole object, so the compiler
	 * knows all of it is written by fsave and read by frstor.
	 */
	struct {
		char image[108];
	} fpu_save;
	int i;

	/* The image is an OUTPUT of fsave: the instruction writes to memory. */
	__asm__ __volatile__ ( " fsave %0; fwait\n" : "=m" (fpu_save) );

	/* Prime the prefetcher with the first five 64-byte lines of the source. */
	__asm__ __volatile__ (
		" prefetch (%0)\n"
		" prefetch 64(%0)\n"
		" prefetch 128(%0)\n"
		" prefetch 192(%0)\n"
		" prefetch 256(%0)\n"
		: : "r" (from) );

	for (i = 0; i < BLOCK_BYTES / CHUNK_BYTES; i++) {
		/*
		 * Move 64 bytes as two batches of four 8-byte loads followed by
		 * four 8-byte stores through mm0-mm3. The "memory" clobber tells
		 * the compiler that memory it cannot see named here is modified.
		 */
		__asm__ __volatile__ (
			" prefetch 320(%0)\n"
			" movq (%0), %%mm0\n"
			" movq 8(%0), %%mm1\n"
			" movq 16(%0), %%mm2\n"
			" movq 24(%0), %%mm3\n"
			" movq %%mm0, (%1)\n"
			" movq %%mm1, 8(%1)\n"
			" movq %%mm2, 16(%1)\n"
			" movq %%mm3, 24(%1)\n"
			" movq 32(%0), %%mm0\n"
			" movq 40(%0), %%mm1\n"
			" movq 48(%0), %%mm2\n"
			" movq 56(%0), %%mm3\n"
			" movq %%mm0, 32(%1)\n"
			" movq %%mm1, 40(%1)\n"
			" movq %%mm2, 48(%1)\n"
			" movq %%mm3, 56(%1)\n"
			: : "r" (from), "r" (to) : "memory");
		from += CHUNK_BYTES;
		to += CHUNK_BYTES;
	}

	/* Reload the state captured by fsave at the top of the function. */
	__asm__ __volatile__ ( " frstor %0;\n" : : "m" (fpu_save) );
}

/*
 * Benchmark driver: copies the whole of `space` into `target` with
 * mmx_copy(), one 4096-byte block at a time (256 blocks = 1 MiB per pass),
 * and repeats that for 2048 passes. Nothing is printed or timed here; run
 * the program under an external timer.
 *
 * argc/argv are accepted but not used. Returns 0.
 */
int main(int argc, char *argv[])
{
	int r;
	int i;

	(void)argc;
	(void)argv;

	for (r = 0; r < 2048; r++) {
		/* Each pass starts again from the beginning of both buffers. */
		char *from = space;
		char *to = target;

		for (i = 0; i < 256; i++) {
			mmx_copy(from, to);
			from += 4096;
			to += 4096;
		}
	}

	return 0;
}

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.rutgers.edu
Please read the FAQ at http://www.tux.org/lkml/