[PATCH] speed up on find_first_bit for i386 (let compiler do thework)

From: Steven Rostedt
Date: Thu Jul 28 2005 - 06:45:18 EST


In the thread "[RFC][PATCH] Make MAX_RT_PRIO and MAX_USER_RT_PRIO
configurable" I discovered that a C version of find_first_bit is faster
than the asm version now when compiled against gcc 3.3.6 and gcc 4.0.1
(both from versions of Debian unstable). I wrote a benchmark (attached)
that runs the code 1,000,000 times. First I do it with no bits set,
followed by the last bit set, a middle bit set and then the first bit
set. Only when no bits are set is the asm version of find_first_bit
faster and that is only when I ran it with gcc 4.0.1 (is gcc at fault
here?). I haven't spent any time actually looking at what gcc produces.
I only looked at the measurements.

I compiled this with "gcc -O2 -o ffb ffb.c". And here's the output.

On an AMD 2.2GHz SMP machine (SMP shouldn't affect the result here).
This is running gcc 4.0.1

/* comments embedded */

/* ticks match speed */
clock speed = 00000000:7f31d02a 2133970986 ticks per second

/* generic ffb (or just ffb) is the code that is currently in the system
* my ffb (or just my) is my new version that I'm submitting.
* my ffb 2 (or just my2) is my version playing with unlikely around an
* condition.
*/
no bit set
ffb=320 my=320 my2=320
generic ffb: 00000000:02e33f90
time: 0.022702922us
my ffb: 00000000:02ee66e7
time: 0.023045461us
my ffb 2: 00000000:032d63b9
time: 0.024979860us
/*
* The above shows that the original beats my version when no bit is
* set.
*/

last bit set
ffb=319 my=319 my2=319
generic ffb: 00000000:0382a116
time: 0.027597643us
my ffb: 00000000:0204c4a9
time: 0.015870375us
my ffb 2: 00000000:03244a1b
time: 0.024700391us
/*
* Here we see that there's quite an improvement over normal ffb when
* the last bit is set.
*/

middle bit set
ffb=159 my=159 my2=159
generic ffb: 00000000:02ce2b78
time: 0.022055584us
my ffb: 00000000:01241c5b
time: 0.008970962us
my ffb 2: 00000000:016171ff
time: 0.010854596us
/*
* Again, there's quite an improvement when a middle bit is set.
*/

first bit set
ffb=0 my=0 my2=0
generic ffb: 00000000:0232456a
time: 0.017267808us
my ffb: 00000000:003dd354
time: 0.001898712us
my ffb 2: 00000000:009d1f74
time: 0.004825372us
/*
* When the first bit is set, there's ever a greater improvement.
*/


Now for the results on my laptop with a Pentium 4 HT 3.3GZ. Running
gcc 3.3.6

clock speed = 00000000:c5de80ef 3319693551 ticks per second

no bit set
ffb=320 my=320 my2=320
generic ffb: 00000000:0aba64db
time: 0.054218162us
my ffb: 00000000:055f6c73
time: 0.027153036us
my ffb 2: 00000000:052e753e
time: 0.026186379us
/*
* Now we see even when no bits are set, my version beats the asm one.
*/

last bit set
ffb=319 my=319 my2=319
generic ffb: 00000000:0b69c638
time: 0.057680447us
my ffb: 00000000:050a27fb
time: 0.025469722us
my ffb 2: 00000000:04d32d78
time: 0.024384359us

middle bit set
ffb=159 my=159 my2=159
generic ffb: 00000000:0a1bc81f
time: 0.051086903us
my ffb: 00000000:020f3d7d
time: 0.010408554us
my ffb 2: 00000000:0324112a
time: 0.015873555us

first bit set
ffb=0 my=0 my2=0
generic ffb: 00000000:095a794d
time: 0.047270700us
my ffb: 00000000:005af2d0
time: 0.001795467us
my ffb 2: 00000000:005a0537
time: 0.001777144us


With this evidence, I present my patch against the 2.6.12.2 kernel.

Signed-off-by: Steven Rostedt <rostedt@xxxxxxxxxxx>

Index: vanilla_kernel/include/asm-i386/bitops.h
===================================================================
--- vanilla_kernel/include/asm-i386/bitops.h (revision 263)
+++ vanilla_kernel/include/asm-i386/bitops.h (working copy)
@@ -311,6 +311,20 @@
int find_next_zero_bit(const unsigned long *addr, int size, int offset);

/**
+ * __ffs - find first bit in word.
+ * @word: The word to search
+ *
+ * Undefined if no bit exists, so code should check against 0 first.
+ */
+static inline unsigned long __ffs(unsigned long word)
+{
+ __asm__("bsfl %1,%0"
+ :"=r" (word)
+ :"rm" (word));
+ return word;
+}
+
+/**
* find_first_bit - find the first set bit in a memory region
* @addr: The address to start the search at
* @size: The maximum size to search
@@ -320,22 +334,16 @@
*/
static inline int find_first_bit(const unsigned long *addr, unsigned size)
{
- int d0, d1;
- int res;
-
- /* This looks at memory. Mark it volatile to tell gcc not to move it around */
- __asm__ __volatile__(
- "xorl %%eax,%%eax\n\t"
- "repe; scasl\n\t"
- "jz 1f\n\t"
- "leal -4(%%edi),%%edi\n\t"
- "bsfl (%%edi),%%eax\n"
- "1:\tsubl %%ebx,%%edi\n\t"
- "shll $3,%%edi\n\t"
- "addl %%edi,%%eax"
- :"=a" (res), "=&c" (d0), "=&D" (d1)
- :"1" ((size + 31) >> 5), "2" (addr), "b" (addr) : "memory");
- return res;
+ int x = 0;
+ do {
+ if (*addr)
+ return __ffs(*addr) + x;
+ addr++;
+ if (x >= size)
+ break;
+ x += 32;
+ } while (1);
+ return x;
}

/**
@@ -360,20 +368,6 @@
return word;
}

-/**
- * __ffs - find first bit in word.
- * @word: The word to search
- *
- * Undefined if no bit exists, so code should check against 0 first.
- */
-static inline unsigned long __ffs(unsigned long word)
-{
- __asm__("bsfl %1,%0"
- :"=r" (word)
- :"rm" (word));
- return word;
-}
-
/*
* fls: find last bit set.
*/

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#define unlikely(x) __builtin_expect(!!(x), 0)

static inline int find_first_bit(const unsigned long *addr, unsigned size)
{
int d0, d1;
int res;

/* This looks at memory. Mark it volatile to tell gcc not to move it around */
__asm__ __volatile__(
"xorl %%eax,%%eax\n\t"
"repe; scasl\n\t"
"jz 1f\n\t"
"leal -4(%%edi),%%edi\n\t"
"bsfl (%%edi),%%eax\n"
"1:\tsubl %%ebx,%%edi\n\t"
"shll $3,%%edi\n\t"
"addl %%edi,%%eax"
:"=a" (res), "=&c" (d0), "=&D" (d1)
:"1" ((size + 31) >> 5), "2" (addr), "b" (addr) : "memory");
return res;
}

static inline unsigned long __ffs(unsigned long word)
{
__asm__("bsfl %1,%0"
:"=r" (word)
:"rm" (word));
return word;
}

static inline int my_find_first_bit(const unsigned long *b, unsigned size)
{
int x = 0;
do {
if (*b)
return __ffs(*b) + x;
b++;
if (x >= size)
break;
x += 32;
} while (1);
return x;
}

static inline int my_find_first_bit2(const unsigned long *b, unsigned size)
{
int x = 0;
do {
if (unlikely(*b))
return __ffs(*b) + x;
b++;
if (x >= size)
break;
x += 32;
} while (1);
return x;
}
#define rdtscll(val) \
__asm__ __volatile__("rdtsc" : "=A" (val))

#define rdtsc(low,high) \
__asm__ __volatile__("rdtsc" : "=a" (low), "=d" (high))

#define BITSIZE 310
static unsigned long array[((BITSIZE)>>5)+1];

#define ITER 1000000 /* 1,000,000 times */

void testit(unsigned long *array, unsigned long long clock)
{
unsigned long long s;
unsigned long long e;
unsigned long long t;
double f;
int i;
int x;

/*
* Since ITER is 1,000,000 the times will be in us.
*/

/*
* Make sure that the output is correct.
*/
printf("ffb=%d my=%d my2=%d\n",
find_first_bit(array,BITSIZE),
my_find_first_bit(array,BITSIZE),
my_find_first_bit2(array,BITSIZE));

rdtscll(s);
for (i=0; i < ITER; i++)
x = find_first_bit(array,BITSIZE);
rdtscll(e);
t = e - s;
f = (float)t / (float)clock;
printf("generic ffb: %08lx:%08lx\n",
(unsigned long)(t>>32),(unsigned long)t);
printf("time: %.09fus\n",f);

rdtscll(s);
for (i=0; i < ITER; i++)
x = my_find_first_bit(array,BITSIZE);
rdtscll(e);
t = e - s;
f = (float)t / (float)clock;
printf("my ffb: %08lx:%08lx\n",
(unsigned long)(t>>32),(unsigned long)t);
printf("time: %.09fus\n",f);

rdtscll(s);
for (i=0; i < ITER; i++)
x = my_find_first_bit2(array,BITSIZE);
rdtscll(e);
t = e - s;
f = (float)t / (float)clock;
printf("my ffb 2: %08lx:%08lx\n",
(unsigned long)(t>>32),(unsigned long)t);
printf("time: %.09fus\n",f);
}

int main(int argc, char **argv)
{
unsigned long long s;
unsigned long long e;
unsigned long long t;
unsigned long long clock;

/*
* Calculate BS time, just to get an
* idea of the tsc speed.
*/
rdtscll(s);
sleep(8);
rdtscll(e);

t = e - s;
t >>= 3;
printf("clock speed = %08lx:%08lx %llu ticks per second\n",
(unsigned long)(t>>32),(unsigned long)t,
t);
clock = t;

printf("\nno bit set\n");
testit(array,clock);

array[BITSIZE>>5] = 0x80000000;
printf("\nlast bit set\n");
testit(array,clock);

array[4] = 0x80000000;
printf("\nmiddle bit set\n");
testit(array,clock);

array[0] = 0x00000001;
printf("\nfirst bit set\n");
testit(array,clock);

exit(0);
}