Re: [patch V4] lib: GCD: Use binary GCD algorithm instead of Euclidean

From: George Spelvin
Date: Sat May 07 2016 - 04:41:52 EST


Nothing critical, but a bit of kibitzing.
(That is slang in the Yiddish language for a person
who offers annoying and unwanted advice.)

> The binary GCD algorithm is based on the following facts:
> 1. If a and b are all evens, then gcd(a,b) = 2 * gcd(a/2, b/2)
> 2. If a is even and b is odd, then gcd(a,b) = gcd(a/2, b)
> 3. If a and b are all odds, then gcd(a,b) = gcd((a-b)/2, b) = gcd((a+b)/2, b)

1) "even" and "odd" are adjectives. In English, adjectives to not have
plural suffixes. Thus, "they are even" or "they are odd".
2) Although "all" is not exactly wrong, it sounds odd. Since there are
exactly two of them it's clearer to say "both".

If I also rephrase the last line to fit into 80 columns, you get:

The binary GCD algorithm is based on the following facts:
- 1. If a and b are all evens, then gcd(a,b) = 2 * gcd(a/2, b/2)
+ 1. If a and b are both even, then gcd(a,b) = 2 * gcd(a/2, b/2)
2. If a is even and b is odd, then gcd(a,b) = gcd(a/2, b)
- 3. If a and b are all odds, then gcd(a,b) = gcd((a-b)/2, b) = gcd((a+b)/2, b)
+ 3. If both are odd, then gcd(a,b) = gcd((a-b)/2, b) = gcd((a+b)/2, b)

3) Negative config options are always confusing.

Would it be better to call it CONFIG_INEFFICIEBNT_FFS, or even simpler
CONFIG_SLOW_FFS?

Also, you're allowed to add "help" to a non-interactive config option,
and some documentation might be useful.
E.g.

+config CPU_SLOW_FFS
+ def_bool n
+ help
+ If n, the CPU supports a fast __ffs (__builtin_ctz) operation,
+ either directly or via a short code sequence using a count
+ leading zeros or population count instruction. If y, the
+ operation is emulated by slower software, such as an unrolled
+ binary search.
+
+ This is purely an optimization option; the kernel
+ will function correctly regardless of how it is set.
+

Your benchmark code doesn't have to have a separate code path if
__x86_64__; rdtsc works on 32-bit code just as well. paths. And the
way you subtract the end and start times is unnecessarily complicated.
The C language guarantees that unsigned arithmetic simply wraps modulo
2^bits as expected.

Here's a simplified version:

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <time.h>
#include <unistd.h>

#define swap(a, b) \
do { \
a ^= b; \
b ^= a; \
a ^= b; \
} while (0)

/* The Euclidean GCD algorithm */
unsigned long gcd0(unsigned long a, unsigned long b)
{
if (a < b)
swap(a, b);

while (b != 0) {
unsigned long r = a % b;
a = b;
b = r;
}
return a;
}

/* The binary GCD algorithm, using __builtin_ctzl */
unsigned long gcd1(unsigned long a, unsigned long b)
{
unsigned long r = a | b;

if (!a || !b)
return r;

b >>= __builtin_ctzl(b);

do {
a >>= __builtin_ctzl(a);

if (a < b)
swap(a, b);
a -= b;
} while (a);
return b << __builtin_ctzl(r);
}

/* Binary GCD algorithm, even/odd variant, without __builtin_ctzl */
unsigned long gcd2(unsigned long a, unsigned long b)
{
unsigned long r = a | b;

if (!a || !b)
return r;

r &= -r;

while (!(b & r))
b >>= 1;

for (;;) {
while (!(a & r))
a >>= 1;
if (a < b)
swap(a, b);
else if (a == b)
return a;
a -= b;
a >>= 1;
if (a & r)
a += b;
a >>= 1;
}
}

/* A variant of gcd1, with early out for gcd = 1 */
unsigned long gcd3(unsigned long a, unsigned long b)
{
unsigned long r = a | b;

if (!a || !b)
return r;

b >>= __builtin_ctzl(b);
if (b == 1)
return r & -r;

for (;;) {
a >>= __builtin_ctzl(a);
if (a == b || a == 1)
return a << __builtin_ctzl(r);

if (a < b)
swap(a, b);
a -= b;
}
}

unsigned long gcd4(unsigned long a, unsigned long b)
{
unsigned long r = a | b;

if (!a || !b)
return r;

r &= -r;

while (!(b & r))
b >>= 1;
if (b == r)
return r;

for (;;) {
while (!(a & r))
a >>= 1;
if (a == b || a == r)
return a;

if (a < b)
swap(a, b);
a -= b;
a >>= 1;
if (a & r)
a += b;
a >>= 1;
}
}

static unsigned long (*gcd_func[])(unsigned long a, unsigned long b) = {
gcd0, gcd1, gcd2, gcd3, gcd4,
};

#define TEST_ENTRIES (int)(sizeof(gcd_func) / sizeof(gcd_func[0]))

#define rdtscll(val) do { \
unsigned __a,__d; \
__asm__ __volatile__("rdtsc" : "=a" (__a), "=d" (__d)); \
(val) = __a | (unsigned long long)__d << 32; \
} while(0)

static unsigned long long
benchmark_gcd_func(unsigned long (*gcd)(unsigned long, unsigned long),
unsigned long a, unsigned long b, unsigned long *res)
{
unsigned long long start, end;
unsigned long gcd_res;

rdtscll(start);
gcd_res = gcd(a, b);
rdtscll(end);

*res = gcd_res;
return end - start;
}

static inline unsigned long get_rand()
{
if (sizeof(long) == 8)
return (unsigned long)rand() << 32 | rand();
else
return rand();
}

int main(int argc, char **argv)
{
unsigned int seed = time(0);
int loops = 100;
int repeats = 1000;
unsigned long (*res)[TEST_ENTRIES];
unsigned long long elapsed[TEST_ENTRIES];
int i, j, k;

for (;;) {
int opt = getopt(argc, argv, "n:r:s:");
/* End condition always first */
if (opt == -1)
break;

switch (opt) {
case 'n':
loops = atoi(optarg);
break;
case 'r':
repeats = atoi(optarg);
break;
case 's':
seed = strtoul(optarg, NULL, 10);
break;
default:
/* You won't actually get here. */
break;
}
}

res = malloc(sizeof(unsigned long) * TEST_ENTRIES * loops);
memset(elapsed, 0, sizeof(elapsed));

srand(seed);
for (j = 0; j < loops; j++) {
unsigned long a = get_rand();
/* Do we have args? */
unsigned long b = argc > optind ? strtoul(argv[optind], NULL, 10) : get_rand();
unsigned long long min_elapsed[TEST_ENTRIES];
for (k = 0; k < repeats; k++) {
for (i = 0; i < TEST_ENTRIES; i++) {
unsigned long long tmp = benchmark_gcd_func(gcd_func[i], a, b, &res[j][i]);
if (k == 0 || min_elapsed[i] > tmp)
min_elapsed[i] = tmp;
}
}
for (i = 0; i < TEST_ENTRIES; i++)
elapsed[i] += min_elapsed[i];
}

for (i = 0; i < TEST_ENTRIES; i++)
printf("gcd%d: elapsed %llu\n", i, elapsed[i]);

k = 0;
srand(seed);
for (j = 0; j < loops; j++) {
unsigned long a = get_rand();
unsigned long b = argc > optind ? strtoul(argv[optind], NULL, 10) : get_rand();
for (i = 1; i < TEST_ENTRIES; i++) {
if (res[j][i] != res[j][0])
break;
}
if (i < TEST_ENTRIES) {
if (k == 0) {
k = 1;
fprintf(stderr, "Error:\n");
}
fprintf(stderr, "gcd(%lu, %lu): ", a, b);
for (i = 0; i < TEST_ENTRIES; i++)
fprintf(stderr, "%ld%s", res[j][i], i < TEST_ENTRIES - 1 ? ", " : "\n");
}
}

if (k == 0)
fprintf(stderr, "PASS\n");

free(res);

return 0;
}

Here are some more timings, with the same flags as your tests:

First, 32 bit code:
gcd0 gcd1 gcd2 gcd3 gcd4
Ivy Bridge 3156 1192 1740 1160 1640 PASS
AMD Phenom 7150 2564 2348 2975 2843 PASS
Core 2 4176 2592 4164 2604 3900 PASS
Pentium 4 11492 4784 7632 4852 6452 PASS

And 64-bit (longer times becuase the inputs are larger):
Ivy Bridge 10636 2496 3500 2432 3360 PASS
AMD Phenom 19482 4058 6030 5001 6845 PASS

Looking at those, I'm not sure how much better the gcd3/4 versions are
than gcd1/2. The difference seems pretty minor and sometimes negative.