Re: [PATCH] lib: find_*_bit reimplementation

From: Yury
Date: Sun Jan 25 2015 - 18:33:28 EST


On 24.01.2015 03:45, Rasmus Villemoes wrote:
> On Mon, Jan 19 2015, Yury Norov <yury.norov@xxxxxxxxx> wrote:
>
>> New implementation takes less space, and, I hope, easier to understand.
>>
>> Signed-off-by: Yury Norov <yury.norov@xxxxxxxxx>
>> ---
>> lib/find_next_bit.c | 265 +++++++++++++++-------------------------------------
>> 1 file changed, 73 insertions(+), 192 deletions(-)
>>
> That diffstat is certainly nice. Do you also have numbers for the
> size of the generated code, and do you know if there is a
> measurable performance difference? Have you tested whether the new and
> old code give the same results, also in corner cases?
Hello, Rasmus. Thank you for your time.

> Do you also have numbers for the size of the generated code

Before text section is 817 bytes, after - 533.
Comparing to this version, I have the patch modified now, and numbers may little differ.

> Have you tested whether the new and old code give the same results, also in corner cases?

I tested new version together with old one, and if new did return different value,
it was printed to system log. I didn't measure performance because I don't expect
significant gain here. But now I think it's good idea to write tests for performance and
corner cases. Maybe someone already did it... Brief googling didn't help. Anyway, with
new version of patch I will show my measures.
>
> Some remarks inline below.
>
>> diff --git a/lib/find_next_bit.c b/lib/find_next_bit.c
>> index 0cbfc0b..a5c915f 100644
>> --- a/lib/find_next_bit.c
>> +++ b/lib/find_next_bit.c
>> @@ -11,10 +11,39 @@
>>
>> #include <linux/bitops.h>
>> #include <linux/export.h>
>> +#include <linux/kernel.h>
>> #include <asm/types.h>
>> #include <asm/byteorder.h>
>>
>> -#define BITOP_WORD(nr) ((nr) / BITS_PER_LONG)
>> +#define HIGH_BITS_MASK(nr) (ULONG_MAX << (nr))
>> +#define MIN(a, b) ((a) < (b) ? (a) : (b))
>> +
> Please don't duplicate min/max macros. kernel.h already provides everything you need.
Ok.
>
>> +#if !defined(find_next_bit) || !defined(find_next_zero_bit)
>> +static unsigned long _find_next_bit(const unsigned long *addr,
>> + unsigned long end, unsigned long start, unsigned long flags)
> Having two parameters called end and start appearing in that
> order is slightly confusing. Why not keep the name 'size' for
> end, or maybe 'nbits' to make the unit clear. Also, I think flags
> should just be a bool and maybe renamed to something more meaningful.
You're right, Something like this:
static unsigned long _find_next_bit(const unsigned long *addr,
unsigned long nbits, unsigned long start_bit, bool set)
looks better.
>
>> +{
>> + unsigned long tmp = flags ? addr[start / BITS_PER_LONG]
>> + : ~addr[start / BITS_PER_LONG];
>> +
>> + /* Handle 1st word. */
>> + if (!IS_ALIGNED(start, BITS_PER_LONG)) {
>> + tmp &= HIGH_BITS_MASK(start % BITS_PER_LONG);
>> + start = round_down(start, BITS_PER_LONG);
>> + }
>> +
>> + do {
>> + if (tmp)
>> + return MIN(start + __ffs(tmp), end);
>> +
>> + start += BITS_PER_LONG;
>> + if (start >= end)
>> + return end;
>> +
>> + tmp = flags ? addr[start / BITS_PER_LONG]
>> + : ~addr[start / BITS_PER_LONG];
>> + } while (1);
>> +}
>> +#endif
>>
>> #ifndef find_next_bit
>> /*
>> @@ -23,86 +52,16 @@
>> unsigned long find_next_bit(const unsigned long *addr, unsigned long size,
>> unsigned long offset)
>> {
>> - const unsigned long *p = addr + BITOP_WORD(offset);
>> - unsigned long result = offset & ~(BITS_PER_LONG-1);
>> - unsigned long tmp;
>> -
>> - if (offset >= size)
>> - return size;
> The previous versions handled this, but your code will always access
> the word at addr[start/BITS_PER_LONG]. Are you sure no caller ever
> passes start >= size?
You're right. Fixed.
>
>> - size -= result;
>> - offset %= BITS_PER_LONG;
>> - if (offset) {
>> - tmp = *(p++);
>> - tmp &= (~0UL << offset);
>> - if (size < BITS_PER_LONG)
>> - goto found_first;
>> - if (tmp)
>> - goto found_middle;
>> - size -= BITS_PER_LONG;
>> - result += BITS_PER_LONG;
>> - }
>> - while (size & ~(BITS_PER_LONG-1)) {
>> - if ((tmp = *(p++)))
>> - goto found_middle;
>> - result += BITS_PER_LONG;
>> - size -= BITS_PER_LONG;
>> - }
>> - if (!size)
>> - return result;
>> - tmp = *p;
>> -
>> -found_first:
>> - tmp &= (~0UL >> (BITS_PER_LONG - size));
>> - if (tmp == 0UL) /* Are any bits set? */
>> - return result + size; /* Nope. */
>> -found_middle:
>> - return result + __ffs(tmp);
>> + return _find_next_bit(addr, size, offset, 1);
>> }
>> EXPORT_SYMBOL(find_next_bit);
>> #endif
>>
>> #ifndef find_next_zero_bit
>> -/*
>> - * This implementation of find_{first,next}_zero_bit was stolen from
>> - * Linus' asm-alpha/bitops.h.
>> - */
>> unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size,
>> unsigned long offset)
>> {
>> - const unsigned long *p = addr + BITOP_WORD(offset);
>> - unsigned long result = offset & ~(BITS_PER_LONG-1);
>> - unsigned long tmp;
>> -
>> - if (offset >= size)
>> - return size;
>> - size -= result;
>> - offset %= BITS_PER_LONG;
>> - if (offset) {
>> - tmp = *(p++);
>> - tmp |= ~0UL >> (BITS_PER_LONG - offset);
>> - if (size < BITS_PER_LONG)
>> - goto found_first;
>> - if (~tmp)
>> - goto found_middle;
>> - size -= BITS_PER_LONG;
>> - result += BITS_PER_LONG;
>> - }
>> - while (size & ~(BITS_PER_LONG-1)) {
>> - if (~(tmp = *(p++)))
>> - goto found_middle;
>> - result += BITS_PER_LONG;
>> - size -= BITS_PER_LONG;
>> - }
>> - if (!size)
>> - return result;
>> - tmp = *p;
>> -
>> -found_first:
>> - tmp |= ~0UL << size;
>> - if (tmp == ~0UL) /* Are any bits zero? */
>> - return result + size; /* Nope. */
>> -found_middle:
>> - return result + ffz(tmp);
>> + return _find_next_bit(addr, size, offset, 0);
>> }
>> EXPORT_SYMBOL(find_next_zero_bit);
>> #endif
>> @@ -113,24 +72,14 @@ EXPORT_SYMBOL(find_next_zero_bit);
>> */
>> unsigned long find_first_bit(const unsigned long *addr, unsigned long size)
>> {
>> - const unsigned long *p = addr;
>> - unsigned long result = 0;
>> - unsigned long tmp;
>> + unsigned int idx;
>>
>> - while (size & ~(BITS_PER_LONG-1)) {
>> - if ((tmp = *(p++)))
>> - goto found;
>> - result += BITS_PER_LONG;
>> - size -= BITS_PER_LONG;
>> + for (idx = 0; idx * BITS_PER_LONG < size; idx++) {
>> + if (addr[idx])
>> + return idx * BITS_PER_LONG + __ffs(addr[idx]);
>> }
> I'm afraid this is wrong. It doesn't take garbage bits in the last
> (partial) word into account. You either need to loop over the full
> words and handle the last separately, masking off the garbage bits, or
> wrap min(, size) around the above expression, just as you've done in
> _find_next_bit. I think I prefer the latter.
Yes. Done.
>
> Does anyone use insane size bitmaps with 2^32 bits or more? If so,
> you'd have to watch out for overflow and use unsigned long for idx.
Yes.
>
>> - if (!size)
>> - return result;
>>
>> - tmp = (*p) & (~0UL >> (BITS_PER_LONG - size));
>> - if (tmp == 0UL) /* Are any bits set? */
>> - return result + size; /* Nope. */
>> -found:
>> - return result + __ffs(tmp);
>> + return size;
>> }
>> EXPORT_SYMBOL(find_first_bit);
>> #endif
>> @@ -141,24 +90,14 @@ EXPORT_SYMBOL(find_first_bit);
>> */
>> unsigned long find_first_zero_bit(const unsigned long *addr, unsigned long size)
>> {
>> - const unsigned long *p = addr;
>> - unsigned long result = 0;
>> - unsigned long tmp;
>> + unsigned int idx;
>>
>> - while (size & ~(BITS_PER_LONG-1)) {
>> - if (~(tmp = *(p++)))
>> - goto found;
>> - result += BITS_PER_LONG;
>> - size -= BITS_PER_LONG;
>> + for (idx = 0; idx * BITS_PER_LONG < size; idx++) {
>> + if (addr[idx] != ULONG_MAX)
>> + return idx * BITS_PER_LONG + ffz(addr[idx]);
>> }
> The same of course applies here.
>
>> - if (!size)
>> - return result;
>>
>> - tmp = (*p) | (~0UL << size);
>> - if (tmp == ~0UL) /* Are any bits zero? */
>> - return result + size; /* Nope. */
>> -found:
>> - return result + ffz(tmp);
>> + return size;
>> }
>> EXPORT_SYMBOL(find_first_zero_bit);
>> #endif
>> @@ -166,18 +105,6 @@ EXPORT_SYMBOL(find_first_zero_bit);
>> #ifdef __BIG_ENDIAN
>>
>> /* include/linux/byteorder does not support "unsigned long" type */
>> -static inline unsigned long ext2_swabp(const unsigned long * x)
>> -{
>> -#if BITS_PER_LONG == 64
>> - return (unsigned long) __swab64p((u64 *) x);
>> -#elif BITS_PER_LONG == 32
>> - return (unsigned long) __swab32p((u32 *) x);
>> -#else
>> -#error BITS_PER_LONG not defined
>> -#endif
>> -}
>> -
>> -/* include/linux/byteorder doesn't support "unsigned long" type */
>> static inline unsigned long ext2_swab(const unsigned long y)
>> {
>> #if BITS_PER_LONG == 64
>> @@ -189,48 +116,40 @@ static inline unsigned long ext2_swab(const unsigned long y)
>> #endif
>> }
>>
>> -#ifndef find_next_zero_bit_le
>> -unsigned long find_next_zero_bit_le(const void *addr, unsigned
>> - long size, unsigned long offset)
>> +#if !defined(find_next_bit_le) || !defined(find_next_zero_bit_le)
>> +static unsigned long _find_next_bit_le(const unsigned long *addr,
>> + unsigned long end, unsigned long start, unsigned long flags)
>> {
>> - const unsigned long *p = addr;
>> - unsigned long result = offset & ~(BITS_PER_LONG - 1);
>> - unsigned long tmp;
>> -
>> - if (offset >= size)
>> - return size;
>> - p += BITOP_WORD(offset);
>> - size -= result;
>> - offset &= (BITS_PER_LONG - 1UL);
>> - if (offset) {
>> - tmp = ext2_swabp(p++);
>> - tmp |= (~0UL >> (BITS_PER_LONG - offset));
>> - if (size < BITS_PER_LONG)
>> - goto found_first;
>> - if (~tmp)
>> - goto found_middle;
>> - size -= BITS_PER_LONG;
>> - result += BITS_PER_LONG;
>> + unsigned long tmp = flags ? addr[start / BITS_PER_LONG]
>> + : ~addr[start / BITS_PER_LONG];
>> +
>> + /* Handle 1st word. */
>> + if (!IS_ALIGNED(start, BITS_PER_LONG)) {
>> + tmp &= ext2_swab(HIGH_BITS_MASK(start
>> + % BITS_PER_LONG));
> Nit: There's no reason to break that line.
>
>> + start = round_down(start, BITS_PER_LONG);
>> }
>>
>> - while (size & ~(BITS_PER_LONG - 1)) {
>> - if (~(tmp = *(p++)))
>> - goto found_middle_swap;
>> - result += BITS_PER_LONG;
>> - size -= BITS_PER_LONG;
>> - }
>> - if (!size)
>> - return result;
>> - tmp = ext2_swabp(p);
>> -found_first:
>> - tmp |= ~0UL << size;
>> - if (tmp == ~0UL) /* Are any bits zero? */
>> - return result + size; /* Nope. Skip ffz */
>> -found_middle:
>> - return result + ffz(tmp);
>> + do {
>> + if (tmp)
>> + return MIN(start +
>> + __ffs(ext2_swab(tmp)), end);
> Again, no reason to break the line.
The reason is limitation of 80 bytes per line, but anyway, I did it bad way
>
>> -found_middle_swap:
>> - return result + ffz(ext2_swab(tmp));
>> + start += BITS_PER_LONG;
>> + if (start >= end)
>> + return end;
>> +
>> + tmp = flags ? addr[start / BITS_PER_LONG]
>> + : ~addr[start / BITS_PER_LONG];
>> + } while (1);
>> +}
>> +#endif
>> +
>> +#ifndef find_next_zero_bit_le
>> +unsigned long find_next_zero_bit_le(const void *addr, unsigned
>> + long size, unsigned long offset)
>> +{
>> + return _find_next_bit_le(addr, size, offset, o);
> I'm assuming you only compile-tested this on little-endian. The o
> wants to be a 0.
Yes, shame on me. I think I'd bring up SPARK or big-endian ARM environment
on my QEMU before uploading new version.
>
> Rasmus
>

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/