Re: [PATCH] lib/int_sqrt.c: Optimize square root function

From: Peter Zijlstra
Date: Thu Jul 20 2017 - 07:25:03 EST


On Mon, Feb 02, 2015 at 11:13:44AM -0800, Linus Torvalds wrote:
> On Mon, Feb 2, 2015 at 11:00 AM, Linus Torvalds
> <torvalds@xxxxxxxxxxxxxxxxxxxx> wrote:
> >
> > (I'm also not entirely sure what uses int_sqrt() that ends up being so
> > performance-critical, so it would be good to document that too, since
> > that probably also matters for the "what's the normal argument range"
> > question..)
>
> ... it's also not entirely clear that we need a whole new loop. We
> might just instead start off with a better guess for 'm' using some
> calculation that might be doable with a single conditional move
> instruction instead of a loop. Because I suspect that the inevitable
> branch misprediction of a new loop is likely as expensive as a few
> iterations through the core one.
>
> IOW, instead of
>
> m = 1UL << (BITS_PER_LONG - 2);
>
> perhaps something like
>
> m = 1UL << (BITS_PER_LONG/2- 2);
> if (m < x)
> m <<= BITS_PER_LONG/2;
>
> (assuming gcc can change that code into a "cmov") might cut down the
> "lots of empty loops" case in half for small values of 'x'.
>
> There's probably some other better cheap initial guess value estimator.

So I had a wee look at int_sqrt(), and yes Davidlohr's patch makes it
worse for my machine (Xeon E3v5 aka. fancy Skylake).

As in _MUCH_ worse..

~/tmp$ gcc -o sqrt sqrt.c -lm -O2 -DLOOPS=10000000 -DNEW=1 ; perf stat --repeat 10 -e cycles:u -e instructions:u ./sqrt

Performance counter stats for './sqrt' (10 runs):

828,233,295 cycles:u ( +- 0.04% )
2,480,095,771 instructions:u # 2.99 insn per cycle ( +- 0.00% )

0.220202758 seconds time elapsed ( +- 0.18% )

~/tmp$ gcc -o sqrt sqrt.c -lm -O2 -DLOOPS=10000000 ; perf stat --repeat 10 -e cycles:u -e instructions:u ./sqrt

Performance counter stats for './sqrt' (10 runs):

603,809,551 cycles:u ( +- 1.30% )
1,677,726,591 instructions:u # 2.78 insn per cycle ( +- 0.00% )

0.160801044 seconds time elapsed

That is, we went from ~60 cycles to ~82 cycles per invocation.


Now, the patches proposed in this thread do indeed improve matters but
seem to not have merged up until this day:


~/tmp$ gcc -o sqrt sqrt.c -lm -O2 -DLOOPS=10000000 -DNEW=1 -DANSHUL=1 ; perf stat --repeat 10 -e cycles:u -e instructions:u ./sqrt

Performance counter stats for './sqrt' (10 runs):

598,827,196 cycles:u ( +- 0.18% )
1,697,726,381 instructions:u # 2.84 insn per cycle ( +- 0.00% )

0.159880738 seconds time elapsed ( +- 0.36% )

~/tmp$ gcc -o sqrt sqrt.c -lm -O2 -DLOOPS=10000000 -DNEW=1 -DLINUS=1 ; perf stat --repeat 10 -e cycles:u -e instructions:u ./sqrt

Performance counter stats for './sqrt' (10 runs):

601,463,184 cycles:u ( +- 0.07% )
1,380,095,976 instructions:u # 2.29 insn per cycle ( +- 0.00% )

0.160788095 seconds time elapsed ( +- 0.55% )

Which basically bring us back to the old performance. So I would
strongly suggest we merge one.


Now, the thing is, Intel CPUs are actually fairly good at fls(), so I
gave Joe's suggestions a spin too..

~/tmp$ gcc -o sqrt sqrt.c -lm -O2 -DLOOPS=10000000 -DNEW=1 -DFLS=1 ; perf stat --repeat 10 -e cycles:u -e instructions:u ./sqrt

Performance counter stats for './sqrt' (10 runs):

390,420,004 cycles:u ( +- 0.07% )
1,141,802,493 instructions:u # 2.92 insn per cycle ( +- 0.00% )

0.105480000 seconds time elapsed ( +- 0.64% )

~/tmp$ gcc -o sqrt sqrt.c -lm -O2 -DLOOPS=10000000 -DNEW=1 -DFLS=1 -DANSHUL=1 ; perf stat --repeat 10 -e cycles:u -e instructions:u ./sqrt

Performance counter stats for './sqrt' (10 runs):

328,415,775 cycles:u ( +- 0.15% )
1,138,579,704 instructions:u # 3.47 insn per cycle ( +- 0.00% )

0.088703205 seconds time elapsed

Which gets us a real improvement...

Now even with the software fls() fallback from
include/asm-generic/bitops/fls.h there is a significant improvement:


~/tmp$ gcc -o sqrt sqrt.c -lm -O2 -DLOOPS=10000000 -DNEW=1 -DFLS=1 -DANSHUL=1 -DSOFTFLS=1 ; perf stat --repeat 10 -e cycles:u -e instructions:u ./sqrt

Performance counter stats for './sqrt' (10 runs):

375,633,931 cycles:u ( +- 0.13% )
1,289,700,013 instructions:u # 3.43 insn per cycle ( +- 0.00% )

0.100596211 seconds time elapsed


Also, I ran with -DVALIDATE=1 and while the comment says its a 'rough'
estimate of the sqrt() I did not in fact find any value for which we
deviate < INT_MAX.




----------------------[ sqrt.c ]---------------------------

#include <math.h>
#include <stdio.h>

#define BITS_PER_LONG (sizeof(long) * 8)

#ifndef LOOPS
#define LOOPS 1000000
#endif

#ifdef SOFTFLS

static __always_inline unsigned long fls(unsigned long x)
{
int r = 32;

if (!x)
return 0;

if (!(x & 0xffff0000u)) {
x <<= 16;
r -= 16;
}
if (!(x & 0xff000000u)) {
x <<= 8;
r -= 8;
}
if (!(x & 0xf0000000u)) {
x <<= 4;
r -= 4;
}
if (!(x & 0xc0000000u)) {
x <<= 2;
r -= 2;
}
if (!(x & 0x80000000u)) {
x <<= 1;
r -= 1;
}
return r;
}

#else

static __always_inline unsigned long fls(unsigned long word)
{
asm("rep; bsr %1,%0"
: "=r" (word)
: "rm" (word));
return BITS_PER_LONG - 1 - word;
}

#endif


#ifdef NEW

unsigned long __attribute__((noinline)) int_sqrt(unsigned long x)
{
unsigned long b, m, y = 0;

if (x <= 1)
return x;

#ifdef LINUS
m = 1UL << (BITS_PER_LONG/2 - 2);
if (m < x)
m <<= BITS_PER_LONG/2;

#else

#ifdef FLS
m = 1UL << ((fls(x) + 1) & ~1UL);
#else
m = 1UL << (BITS_PER_LONG - 2);
#endif

#ifdef ANSHUL
while (m > x)
m >>= 2;
#endif
#endif

while (m != 0) {
b = y + m;
y >>= 1;

if (x >= b) {
x -= b;
y += m;
}
m >>= 2;
}

return y;
}

#else

unsigned long __attribute__((noinline)) int_sqrt(unsigned long x)
{
unsigned long op, res, one;

op = x;
res = 0;

one = 1UL << (BITS_PER_LONG - 2);
while (one > op)
one >>= 2;

while (one != 0) {
if (op >= res + one) {
op = op - (res + one);
res = res + 2 * one;
}
res /= 2;
one /= 4;
}
return res;
}

#endif

void main(void)
{
unsigned long i;

for (i=0; i<LOOPS; i++) {
unsigned long a = int_sqrt(i);
asm volatile("" : : "r" (a));

#ifdef VALIDATE
{
unsigned long b = floor(sqrt(i));
if (a != b)
printf("%ld %ld %ld\n", i, a, b);
}
#endif

}
}

--------------

The only caveat seems to be that our fls() is only defined for 'int' not
long :/


---
lib/int_sqrt.c | 11 +++++++----
1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/lib/int_sqrt.c b/lib/int_sqrt.c
index 1ef4cc344977..87c3aa360441 100644
--- a/lib/int_sqrt.c
+++ b/lib/int_sqrt.c
@@ -7,12 +7,11 @@

#include <linux/kernel.h>
#include <linux/export.h>
+#include <linux/bitops.h>

/**
- * int_sqrt - rough approximation to sqrt
+ * int_sqrt - approximation to sqrt
* @x: integer of which to calculate the sqrt
- *
- * A very rough approximation to the sqrt() function.
*/
unsigned long int_sqrt(unsigned long x)
{
@@ -21,7 +20,11 @@ unsigned long int_sqrt(unsigned long x)
if (x <= 1)
return x;

- m = 1UL << (BITS_PER_LONG - 2);
+ m = 1UL << ((fls(x) + 1) & ~1UL);
+
+ while (m > x)
+ m >>= 2;
+
while (m != 0) {
b = y + m;
y >>= 1;