[patch] x86, generic speedup, do we need 'cld'?

MOLNAR Ingo (mingo@chiara.csoma.elte.hu)
Fri, 13 Mar 1998 18:18:22 +0100 (CET)


I've noticed that we do 'cld' way too often. Maybe there is some
non-obvious reason to use it, but isn't it the case that all kernel entry
points already do a 'cld'?

(There is a single function in the kernel that does an 'std' -- memmove() --
but the direction flag is IRQ-safe.)

So the attached 2.1.90-2 patch removes all unnecessary 'cld's in kernel
include files. 'cld' itself is quite a costly instruction: 2 cycles, plus it
doesn't pair, thus it has an average execution time of 3 cycles. The patch
has removed ~700 'cld's from my kernel image, from very frequently executed
places. This also reduces kernel footprint.

Q1: is this really safe?

Q2: GCC seems to generate the majority of the remaining 200 'cld's; is
there a way to make it generate _no_ 'cld's?

I'm running this kernel now, and it seems to be fine so far.

-- mingo

--- linux/include/asm-i386/string.h.orig Thu Mar 19 05:45:26 1998
+++ linux/include/asm-i386/string.h Thu Mar 19 05:46:45 1998
@@ -31,7 +31,6 @@
extern inline char * strcpy(char * dest,const char *src)
{
__asm__ __volatile__(
- "cld\n"
"1:\tlodsb\n\t"
"stosb\n\t"
"testb %%al,%%al\n\t"
@@ -45,7 +44,6 @@
extern inline char * strncpy(char * dest,const char *src,size_t count)
{
__asm__ __volatile__(
- "cld\n"
"1:\tdecl %2\n\t"
"js 2f\n\t"
"lodsb\n\t"
@@ -64,7 +62,6 @@
extern inline char * strcat(char * dest,const char * src)
{
__asm__ __volatile__(
- "cld\n\t"
"repne\n\t"
"scasb\n\t"
"decl %1\n"
@@ -81,7 +78,6 @@
extern inline char * strncat(char * dest,const char * src,size_t count)
{
__asm__ __volatile__(
- "cld\n\t"
"repne\n\t"
"scasb\n\t"
"decl %1\n\t"
@@ -105,7 +101,6 @@
{
register int __res;
__asm__ __volatile__(
- "cld\n"
"1:\tlodsb\n\t"
"scasb\n\t"
"jne 2f\n\t"
@@ -125,7 +120,6 @@
{
register int __res;
__asm__ __volatile__(
- "cld\n"
"1:\tdecl %3\n\t"
"js 2f\n\t"
"lodsb\n\t"
@@ -147,7 +141,6 @@
{
register char * __res;
__asm__ __volatile__(
- "cld\n\t"
"movb %%al,%%ah\n"
"1:\tlodsb\n\t"
"cmpb %%ah,%%al\n\t"
@@ -166,7 +159,6 @@
{
register char * __res;
__asm__ __volatile__(
- "cld\n\t"
"movb %%al,%%ah\n"
"1:\tlodsb\n\t"
"cmpb %%ah,%%al\n\t"
@@ -183,7 +175,6 @@
{
register char * __res;
__asm__ __volatile__(
- "cld\n\t"
"movl %4,%%edi\n\t"
"repne\n\t"
"scasb\n\t"
@@ -209,7 +200,6 @@
{
register char * __res;
__asm__ __volatile__(
- "cld\n\t"
"movl %4,%%edi\n\t"
"repne\n\t"
"scasb\n\t"
@@ -235,7 +225,6 @@
{
register char * __res;
__asm__ __volatile__(
- "cld\n\t"
"movl %4,%%edi\n\t"
"repne\n\t"
"scasb\n\t"
@@ -264,7 +253,6 @@
{
register char * __res;
__asm__ __volatile__(
- "cld\n\t" \
"movl %4,%%edi\n\t"
"repne\n\t"
"scasb\n\t"
@@ -293,7 +281,6 @@
{
register int __res;
__asm__ __volatile__(
- "cld\n\t"
"repne\n\t"
"scasb\n\t"
"notl %0\n\t"
@@ -315,7 +302,6 @@
"1:\txorl %0,%0\n\t"
"movl $-1,%%ecx\n\t"
"xorl %%eax,%%eax\n\t"
- "cld\n\t"
"movl %4,%%edi\n\t"
"repne\n\t"
"scasb\n\t"
@@ -366,7 +352,6 @@
extern inline void * __memcpy(void * to, const void * from, size_t n)
{
__asm__ __volatile__(
- "cld\n\t"
"rep ; movsl\n\t"
"testb $2,%b1\n\t"
"je 1f\n\t"
@@ -431,8 +416,7 @@
return to;
}
#define COMMON(x) \
-__asm__("cld\n\t" \
- "rep ; movsl" \
+__asm__("rep ; movsl" \
x \
: /* no outputs */ \
: "c" (n/4),"D" ((long) to),"S" ((long) from) \
@@ -458,7 +442,6 @@
{
if (dest<src)
__asm__ __volatile__(
- "cld\n\t"
"rep\n\t"
"movsb"
: /* no output */
@@ -487,7 +470,6 @@
if (!count)
return NULL;
__asm__ __volatile__(
- "cld\n\t"
"repne\n\t"
"scasb\n\t"
"je 1f\n\t"
@@ -501,7 +483,6 @@
extern inline void * __memset_generic(void * s, char c,size_t count)
{
__asm__ __volatile__(
- "cld\n\t"
"rep\n\t"
"stosb"
: /* no output */
@@ -521,7 +502,6 @@
extern inline void * __constant_c_memset(void * s, unsigned long c, size_t count)
{
__asm__ __volatile__(
- "cld\n\t"
"rep ; stosl\n\t"
"testb $2,%b1\n\t"
"je 1f\n\t"
@@ -582,8 +562,7 @@
return s;
}
#define COMMON(x) \
-__asm__("cld\n\t" \
- "rep ; stosl" \
+__asm__( "rep ; stosl" \
x \
: /* no outputs */ \
: "a" (pattern),"c" (count/4),"D" ((long) s) \
@@ -622,8 +601,7 @@
{
if (!size)
return addr;
- __asm__("cld
- repnz; scasb
+ __asm__("repnz; scasb
jnz 1f
dec %%edi
1: "
--- linux/include/asm-i386/io.h.orig Thu Mar 19 05:48:21 1998
+++ linux/include/asm-i386/io.h Thu Mar 19 05:50:09 1998
@@ -66,12 +66,12 @@

#define __INS(s) \
extern inline void ins##s(unsigned short port, void * addr, unsigned long count) \
-{ __asm__ __volatile__ ("cld ; rep ; ins" #s \
+{ __asm__ __volatile__ ("rep ; ins" #s \
: "=D" (addr), "=c" (count) : "d" (port),"0" (addr),"1" (count)); }

#define __OUTS(s) \
extern inline void outs##s(unsigned short port, const void * addr, unsigned long count) \
-{ __asm__ __volatile__ ("cld ; rep ; outs" #s \
+{ __asm__ __volatile__ ("rep ; outs" #s \
: "=S" (addr), "=c" (count) : "d" (port),"0" (addr),"1" (count)); }

#define RETURN_TYPE unsigned char
--- linux/include/asm-i386/posix_types.h.orig Thu Mar 19 05:47:56 1998
+++ linux/include/asm-i386/posix_types.h Thu Mar 19 05:48:01 1998
@@ -57,7 +57,7 @@

#undef __FD_ZERO
#define __FD_ZERO(fdsetp) \
- __asm__ __volatile__("cld ; rep ; stosl" \
+ __asm__ __volatile__("rep ; stosl" \
:"=m" (*(__kernel_fd_set *) (fdsetp)) \
:"a" (0), "c" (__FDSET_LONGS), \
"D" ((__kernel_fd_set *) (fdsetp)) :"cx","di")
--- linux/include/asm-i386/bitops.h.orig Thu Mar 19 05:48:43 1998
+++ linux/include/asm-i386/bitops.h Thu Mar 19 05:48:47 1998
@@ -131,8 +131,7 @@

if (!size)
return 0;
- __asm__("cld\n\t"
- "movl $-1,%%eax\n\t"
+ __asm__("movl $-1,%%eax\n\t"
"xorl %%edx,%%edx\n\t"
"repe; scasl\n\t"
"je 1f\n\t"

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.rutgers.edu