[PATCH] lib: hint GCC to inlilne _find_next_bit() helper

From: Yury Norov
Date: Sat Oct 28 2017 - 08:30:42 EST


Hi all,

It seems that inlining the _find_next_bit() helper makes
find_next_bit() and find_next_zero_bit() 2 times faster at
the scenario of finding all set/cleared bits of randomly
initialised bitmap.

For another typical scenario of traversing sparse bitmap
there is also measurable improvement observed, about 15%.

The increasing of text size of find_bit.o module is 40 bytes
for arm64 - from 252 to 292 bytes - is looking acceptable.

This patch also contains test module.

Measured on ThunderX machine. Tests for other architectures are
very appreciated.

Before:
[ 96.856195] Start testing find_bit() with random-filled bitmap
[ 96.868322] find_next_bit: 34529 cycles, 16304 iterations
[ 96.879525] find_next_zero_bit: 35771 cycles, 16465 iterations
[ 96.891409] find_last_bit: 17444 cycles, 16304 iterations
[ 96.914445] find_first_bit: 1219671 cycles, 16305 iterations
[ 96.925802] Start testing find_bit() with sparse bitmap
[ 96.936308] find_next_bit: 301 cycles, 66 iterations
[ 96.946981] find_next_zero_bit: 70897 cycles, 32703 iterations
[ 96.958694] find_last_bit: 286 cycles, 66 iterations
[ 96.968710] find_first_bit: 5260 cycles, 66 iterations

After:
[ 169.464229] Start testing find_bit() with random-filled bitmap
[ 169.476191] find_next_bit: 17520 cycles, 16336 iterations
[ 169.487210] find_next_zero_bit: 17622 cycles, 16433 iterations
[ 169.499111] find_last_bit: 19272 cycles, 16335 iterations
[ 169.519735] find_first_bit: 978657 cycles, 16337 iterations
[ 169.530912] Start testing find_bit() with sparse bitmap
[ 169.541414] find_next_bit: 252 cycles, 66 iterations
[ 169.551726] find_next_zero_bit: 34554 cycles, 32703 iterations
[ 169.563436] find_last_bit: 294 cycles, 66 iterations
[ 169.573439] find_first_bit: 3964 cycles, 66 iterations

CC: Alexey Dobriyan <adobriyan@xxxxxxxxx>
CC: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx>
CC: Clement Courbet <courbet@xxxxxxxxxx>
CC: Matthew Wilcox <mawilcox@xxxxxxxxxxxxx>
CC: Rasmus Villemoes <linux@xxxxxxxxxxxxxxxxxx>
Signed-off-by: Yury Norov <ynorov@xxxxxxxxxxxxxxxxxx>
---
lib/Kconfig.debug | 9 ++++
lib/Makefile | 1 +
lib/find_bit.c | 2 +-
lib/test_find_bit.c | 141 +++++++++++++++++++++++++++++++++++++++++++++++++++
tools/lib/find_bit.c | 2 +-
5 files changed, 153 insertions(+), 2 deletions(-)
create mode 100644 lib/test_find_bit.c

diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index dfdad67d8f6c..138034cc68a3 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1838,6 +1838,15 @@ config TEST_BPF

If unsure, say N.

+config TEST_FIND_BIT
+ tristate "Test find_bit functions"
+ default n
+ help
+ This builds the "test_find_bit" module that measure find_*_bit()
+ functions performance.
+
+ If unsure, say N.
+
config TEST_FIRMWARE
tristate "Test firmware loading via userspace interface"
default n
diff --git a/lib/Makefile b/lib/Makefile
index dafa79613fb4..edb792b42c86 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -45,6 +45,7 @@ obj-y += hexdump.o
obj-$(CONFIG_TEST_HEXDUMP) += test_hexdump.o
obj-y += kstrtox.o
obj-$(CONFIG_TEST_BPF) += test_bpf.o
+obj-$(CONFIG_TEST_FIND_BIT) += test_find_bit.o
obj-$(CONFIG_TEST_FIRMWARE) += test_firmware.o
obj-$(CONFIG_TEST_SYSCTL) += test_sysctl.o
obj-$(CONFIG_TEST_HASH) += test_hash.o test_siphash.o
diff --git a/lib/find_bit.c b/lib/find_bit.c
index 6ed74f78380c..9b0c89f3fd3a 100644
--- a/lib/find_bit.c
+++ b/lib/find_bit.c
@@ -28,7 +28,7 @@
* find_next_zero_bit. The difference is the "invert" argument, which
* is XORed with each fetched word before searching it for one bits.
*/
-static unsigned long _find_next_bit(const unsigned long *addr,
+static inline unsigned long _find_next_bit(const unsigned long *addr,
unsigned long nbits, unsigned long start, unsigned long invert)
{
unsigned long tmp;
diff --git a/lib/test_find_bit.c b/lib/test_find_bit.c
new file mode 100644
index 000000000000..8eaf10cae214
--- /dev/null
+++ b/lib/test_find_bit.c
@@ -0,0 +1,141 @@
+/*
+ * Test for find_*_bit functions.
+ *
+ * Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+/*
+ * find_bit functions are widely used in kernel, so the successful boot
+ * is good enough test for correctness.
+ *
+ * This test is focused on performance of traversing bitmaps. Two typical
+ * scenarios are reproduced:
+ * - randomly filled bitmap with approximately equal number of set and
+ * cleared bits;
+ * - sparse bitmap with few set bits at random positions.
+ */
+
+#include <linux/bitops.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/module.h>
+#include <linux/printk.h>
+#include <linux/random.h>
+
+#define BITMAP_LEN (4096UL * 8)
+#define SPARSE (500)
+
+static DECLARE_BITMAP(bitmap, BITMAP_LEN) __initdata;
+
+/*
+ * This is Schlemiel the Painter's algorithm.
+ * Should be called after all other tests for the same data
+ * because it sets all bits of bitmap to 1.
+ */
+static int __init test_find_first_bit(void *bitmap, unsigned long len)
+{
+ unsigned long i, cnt;
+ cycles_t cycles;
+
+ cycles = get_cycles();
+ for (cnt = i = 0; i < len; cnt++) {
+ i = find_first_bit(bitmap, len);
+ __clear_bit(i, bitmap);
+ }
+ cycles = get_cycles() - cycles;
+ pr_err("find_first_bit: %ld cycles, %ld iterations\n", cycles, cnt);
+
+ return 0;
+}
+
+static int __init test_find_next_bit(const void *bitmap, unsigned long len)
+{
+ unsigned long i, cnt;
+ cycles_t cycles;
+
+ cycles = get_cycles();
+ for (cnt = i = 0; i < BITMAP_LEN; cnt++)
+ i = find_next_bit(bitmap, BITMAP_LEN, i) + 1;
+ cycles = get_cycles() - cycles;
+ pr_err("find_next_bit: %ld cycles, %ld iterations\n", cycles, cnt);
+
+ return 0;
+}
+
+static int __init test_find_next_zero_bit(const void *bitmap, unsigned long len)
+{
+ unsigned long i, cnt;
+ cycles_t cycles;
+
+ cycles = get_cycles();
+ for (cnt = i = 0; i < BITMAP_LEN; cnt++)
+ i = find_next_zero_bit(bitmap, len, i) + 1;
+ cycles = get_cycles() - cycles;
+ pr_err("find_next_zero_bit: %ld cycles, %ld iterations\n", cycles, cnt);
+
+ return 0;
+}
+
+static int __init test_find_last_bit(const void *bitmap, unsigned long len)
+{
+ unsigned long l, cnt = 0;
+ cycles_t cycles;
+
+ cycles = get_cycles();
+ do {
+ cnt++;
+ l = find_last_bit(bitmap, len);
+ if (l >= len)
+ break;
+ len = l;
+ } while (len);
+ cycles = get_cycles() - cycles;
+ pr_err("find_last_bit: %ld cycles, %ld iterations\n", cycles, cnt);
+
+ return 0;
+}
+
+static int __init find_bit_test(void)
+{
+ unsigned long nbits = BITMAP_LEN / SPARSE;
+
+ pr_err("Start testing find_bit() with random-filled bitmap\n");
+
+ get_random_bytes(bitmap, sizeof(bitmap));
+
+ test_find_next_bit(bitmap, BITMAP_LEN);
+ test_find_next_zero_bit(bitmap, BITMAP_LEN);
+ test_find_last_bit(bitmap, BITMAP_LEN);
+ test_find_first_bit(bitmap, BITMAP_LEN);
+
+ pr_err("Start testing find_bit() with sparse bitmap\n");
+
+ bitmap_zero(bitmap, BITMAP_LEN);
+
+ while (nbits--)
+ __set_bit(prandom_u32() % BITMAP_LEN, bitmap);
+
+ test_find_next_bit(bitmap, BITMAP_LEN);
+ test_find_next_zero_bit(bitmap, BITMAP_LEN);
+ test_find_last_bit(bitmap, BITMAP_LEN);
+ test_find_first_bit(bitmap, BITMAP_LEN);
+
+ return 0;
+}
+module_init(find_bit_test);
+
+static void __exit test_find_bit_cleanup(void)
+{
+}
+module_exit(test_find_bit_cleanup);
+
+MODULE_LICENSE("GPL");
diff --git a/tools/lib/find_bit.c b/tools/lib/find_bit.c
index 42c15f906aac..6873b5fa221b 100644
--- a/tools/lib/find_bit.c
+++ b/tools/lib/find_bit.c
@@ -29,7 +29,7 @@
* find_next_zero_bit. The difference is the "invert" argument, which
* is XORed with each fetched word before searching it for one bits.
*/
-static unsigned long _find_next_bit(const unsigned long *addr,
+static inline unsigned long _find_next_bit(const unsigned long *addr,
unsigned long nbits, unsigned long start, unsigned long invert)
{
unsigned long tmp;
--
2.11.0