Re: [PATCH v1 1/1] xarray: fix the data-race in xas_find_chunk() by using READ_ONCE()

From: Mirsad Todorovac
Date: Mon Sep 18 2023 - 12:31:04 EST




On 9/18/23 17:54, Jan Kara wrote:
On Mon 18-09-23 07:59:03, Yury Norov wrote:
On Mon, Sep 18, 2023 at 02:46:02PM +0200, Mirsad Todorovac wrote:
--------------------------------------------------------
lib/find_bit.c | 33 +++++++++++++++++----------------
1 file changed, 17 insertions(+), 16 deletions(-)

diff --git a/lib/find_bit.c b/lib/find_bit.c
index 32f99e9a670e..56244e4f744e 100644
--- a/lib/find_bit.c
+++ b/lib/find_bit.c
@@ -18,6 +18,7 @@
#include <linux/math.h>
#include <linux/minmax.h>
#include <linux/swab.h>
+#include <asm/rwonce.h>
/*
* Common helper for find_bit() function family
@@ -98,7 +99,7 @@ out: \
*/
unsigned long _find_first_bit(const unsigned long *addr, unsigned long size)
{
- return FIND_FIRST_BIT(addr[idx], /* nop */, size);
+ return FIND_FIRST_BIT(READ_ONCE(addr[idx]), /* nop */, size);
}
EXPORT_SYMBOL(_find_first_bit);
#endif

...

That doesn't look correct. READ_ONCE() implies that there's another
thread modifying the bitmap concurrently. This is not true for the
vast majority of bitmap API users, and I expect that forcing
READ_ONCE() would affect performance for them.

Bitmap functions, with a few rare exceptions like set_bit(), are not
thread-safe and require users to perform locking/synchronization where
needed.

Well, for xarray the write side is synchronized with a spinlock but the read
side is not (only RCU protected).

If you really need READ_ONCE, I think it's better to implement a new
flavor of the function(s) separately, like:
find_first_bit_read_once()

So yes, xarray really needs READ_ONCE(). And I don't think READ_ONCE()
imposes any real performance overhead in this particular case because for
any sane compiler the generated assembly with & without READ_ONCE() will be
exactly the same. For example I've checked disassembly of _find_next_bit()
using READ_ONCE(). The main loop is:

0xffffffff815a2b6d <+77>: inc %r8
0xffffffff815a2b70 <+80>: add $0x8,%rdx
0xffffffff815a2b74 <+84>: mov %r8,%rcx
0xffffffff815a2b77 <+87>: shl $0x6,%rcx
0xffffffff815a2b7b <+91>: cmp %rcx,%rax
0xffffffff815a2b7e <+94>: jbe 0xffffffff815a2b9b <_find_next_bit+123>
0xffffffff815a2b80 <+96>: mov (%rdx),%rcx
0xffffffff815a2b83 <+99>: test %rcx,%rcx
0xffffffff815a2b86 <+102>: je 0xffffffff815a2b6d <_find_next_bit+77>
0xffffffff815a2b88 <+104>: shl $0x6,%r8
0xffffffff815a2b8c <+108>: tzcnt %rcx,%rcx

So you can see the value we work with is copied from the address (rdx) into
a register (rcx) and the test and __ffs() happen on a register value and
thus READ_ONCE() has no practical effect. It just prevents the compiler
from doing some stupid de-optimization.

Honza

If I may also add, the centralised READ_ONCE() version fixed a couple of hundred
instances of KCSAN data-race reports in dmesg.

_find_*_bit() functions and/or macros cause quite a number of KCSAN BUG warnings:

95 _find_first_and_bit (lib/find_bit.c:114 (discriminator 10))
31 _find_first_zero_bit (lib/find_bit.c:125 (discriminator 10))
173 _find_next_and_bit (lib/find_bit.c:171 (discriminator 2))
655 _find_next_bit (lib/find_bit.c:133 (discriminator 2))
5 _find_next_zero_bit

Finding each affected find_bit_*() call and replacing it with find_bit_*_read_once()
could be time-consuming and challenging.

However, I will do both versions so you could compare, if you'd like.

Note, in the PoC version I have only implemented find_next_bit_read_once() ATM to see if
this works.

Regards,
Mirsad

diff --git a/include/linux/find.h b/include/linux/find.h
index 5e4f39ef2e72..2b7f9f24cffb 100644
--- a/include/linux/find.h
+++ b/include/linux/find.h
@@ -40,6 +40,38 @@ unsigned long _find_next_bit_le(const unsigned long *addr, unsigned
long size, unsigned long offset);
#endif

+unsigned long _find_next_bit_read_once(const unsigned long *addr1, unsigned long nbits,
+ unsigned long start);
+unsigned long _find_next_and_bit_read_once(const unsigned long *addr1, const unsigned long *addr2,
+ unsigned long nbits, unsigned long start);
+unsigned long _find_next_andnot_bit_read_once(const unsigned long *addr1, const unsigned long *addr2,
+ unsigned long nbits, unsigned long start);
+unsigned long _find_next_or_bit_read_once(const unsigned long *addr1, const unsigned long *addr2,
+ unsigned long nbits, unsigned long start);
+unsigned long _find_next_zero_bit_read_once(const unsigned long *addr, unsigned long nbits,
+ unsigned long start);
+extern unsigned long _find_first_bit_read_once(const unsigned long *addr, unsigned long size);
+unsigned long __find_nth_bit_read_once(const unsigned long *addr, unsigned long size, unsigned long n);
+unsigned long __find_nth_and_bit_read_once(const unsigned long *addr1, const unsigned long *addr2,
+ unsigned long size, unsigned long n);
+unsigned long __find_nth_andnot_bit_read_once(const unsigned long *addr1, const unsigned long *addr2,
+ unsigned long size, unsigned long n);
+unsigned long __find_nth_and_andnot_bit_read_once(const unsigned long *addr1, const unsigned long *addr2,
+ const unsigned long *addr3, unsigned long size,
+ unsigned long n);
+extern unsigned long _find_first_and_bit_read_once(const unsigned long *addr1,
+ const unsigned long *addr2, unsigned long size);
+extern unsigned long _find_first_zero_bit_read_once(const unsigned long *addr, unsigned long size);
+extern unsigned long _find_last_bit_read_once(const unsigned long *addr, unsigned long size);
+
+#ifdef __BIG_ENDIAN
+unsigned long _find_first_zero_bit_le_read_once(const unsigned long *addr, unsigned long size);
+unsigned long _find_next_zero_bit_le_read_once(const unsigned long *addr, unsigned
+ long size, unsigned long offset);
+unsigned long _find_next_bit_le_read_once(const unsigned long *addr, unsigned
+ long size, unsigned long offset);
+#endif
+
#ifndef find_next_bit
/**
* find_next_bit - find the next set bit in a memory region
@@ -68,6 +100,32 @@ unsigned long find_next_bit(const unsigned long *addr, unsigned long size,
}
#endif

+#ifndef find_next_bit_read_once
+/**
+ * find_next_bit_read_once - find the next set bit, loading via READ_ONCE()
+ * @addr: The address to base the search on
+ * @size: The bitmap size in bits
+ * @offset: The bitnumber to start searching at
+ *
+ * For bitmaps that may change during the search (data-race protection).
+ * Return: the bit number of the next set bit, or @size if there is none.
+ */
+static inline
+unsigned long find_next_bit_read_once(const unsigned long *addr, unsigned long size,
+				      unsigned long offset)
+{
+	if (small_const_nbits(size)) {
+		unsigned long val;
+
+		if (offset >= size)	/* keeps GENMASK() low bit <= high bit */
+			return size;
+		val = READ_ONCE(*addr) & GENMASK(size - 1, offset);
+		return val ? __ffs(val) : size;
+	}
+	return _find_next_bit_read_once(addr, size, offset);
+}
+#endif
+
#ifndef find_next_and_bit
/**
* find_next_and_bit - find the next set bit in both memory regions
diff --git a/include/linux/xarray.h b/include/linux/xarray.h
index 1715fd322d62..6c04f2117c06 100644
--- a/include/linux/xarray.h
+++ b/include/linux/xarray.h
@@ -1718,16 +1718,8 @@ static inline unsigned int xas_find_chunk(struct xa_state *xas, bool advance,

if (advance)
offset++;
- if (XA_CHUNK_SIZE == BITS_PER_LONG) {
- if (offset < XA_CHUNK_SIZE) {
- unsigned long data = READ_ONCE(*addr) & (~0UL << offset);
- if (data)
- return __ffs(data);
- }
- return XA_CHUNK_SIZE;
- }

- return find_next_bit(addr, XA_CHUNK_SIZE, offset);
+ return find_next_bit_read_once(addr, XA_CHUNK_SIZE, offset);
}

/**
diff --git a/lib/find_bit.c b/lib/find_bit.c
index 32f99e9a670e..92a8e0016a20 100644
--- a/lib/find_bit.c
+++ b/lib/find_bit.c
@@ -18,6 +18,7 @@
#include <linux/math.h>
#include <linux/minmax.h>
#include <linux/swab.h>
+#include <asm/rwonce.h>

/*
* Common helper for find_bit() function family
@@ -268,3 +269,172 @@ EXPORT_SYMBOL(_find_next_bit_le);
#endif

#endif /* __BIG_ENDIAN */
+
+/*
+ * The read_once flavour of the functions: each bitmap word is loaded with
+ * READ_ONCE() to avoid data-races with concurrent bitmap updates.
+ */
+
+#ifndef find_first_bit_read_once
+/*
+ * Find the first set bit in a memory region.
+ */
+unsigned long _find_first_bit_read_once(const unsigned long *addr, unsigned long size)
+{
+ return FIND_FIRST_BIT(READ_ONCE(addr[idx]), /* nop */, size);
+}
+EXPORT_SYMBOL(_find_first_bit_read_once);
+#endif
+
+#ifndef find_first_and_bit_read_once
+/*
+ * Find the first set bit in two memory regions.
+ */
+unsigned long _find_first_and_bit_read_once(const unsigned long *addr1,
+ const unsigned long *addr2,
+ unsigned long size)
+{
+ return FIND_FIRST_BIT(READ_ONCE(addr1[idx]) & READ_ONCE(addr2[idx]), /* nop */, size);
+}
+EXPORT_SYMBOL(_find_first_and_bit_read_once);
+#endif
+
+#ifndef find_first_zero_bit_read_once
+/*
+ * Find the first cleared bit in a memory region.
+ */
+unsigned long _find_first_zero_bit_read_once(const unsigned long *addr, unsigned long size)
+{
+ return FIND_FIRST_BIT(~READ_ONCE(addr[idx]), /* nop */, size);
+}
+EXPORT_SYMBOL(_find_first_zero_bit_read_once);
+#endif
+
+#ifndef find_next_bit_read_once
+unsigned long _find_next_bit_read_once(const unsigned long *addr, unsigned long nbits, unsigned long start)
+{
+ return FIND_NEXT_BIT(READ_ONCE(addr[idx]), /* nop */, nbits, start);
+}
+EXPORT_SYMBOL(_find_next_bit_read_once);
+#endif
+
+unsigned long __find_nth_bit_read_once(const unsigned long *addr, unsigned long size, unsigned long n)
+{
+ return FIND_NTH_BIT(READ_ONCE(addr[idx]), size, n);
+}
+EXPORT_SYMBOL(__find_nth_bit_read_once);
+
+unsigned long __find_nth_and_bit_read_once(const unsigned long *addr1, const unsigned long *addr2,
+ unsigned long size, unsigned long n)
+{
+ return FIND_NTH_BIT(READ_ONCE(addr1[idx]) & READ_ONCE(addr2[idx]), size, n);
+}
+EXPORT_SYMBOL(__find_nth_and_bit_read_once);
+
+unsigned long __find_nth_andnot_bit_read_once(const unsigned long *addr1, const unsigned long *addr2,
+ unsigned long size, unsigned long n)
+{
+ return FIND_NTH_BIT(READ_ONCE(addr1[idx]) & ~READ_ONCE(addr2[idx]), size, n);
+}
+EXPORT_SYMBOL(__find_nth_andnot_bit_read_once);
+
+unsigned long __find_nth_and_andnot_bit_read_once(const unsigned long *addr1,
+ const unsigned long *addr2,
+ const unsigned long *addr3,
+ unsigned long size, unsigned long n)
+{
+ return FIND_NTH_BIT(READ_ONCE(addr1[idx]) & READ_ONCE(addr2[idx]) & ~READ_ONCE(addr3[idx]), size, n);
+}
+EXPORT_SYMBOL(__find_nth_and_andnot_bit_read_once);
+
+#ifndef find_next_and_bit_read_once
+unsigned long _find_next_and_bit_read_once(const unsigned long *addr1, const unsigned long *addr2,
+ unsigned long nbits, unsigned long start)
+{
+ return FIND_NEXT_BIT(READ_ONCE(addr1[idx]) & READ_ONCE(addr2[idx]), /* nop */, nbits, start);
+}
+EXPORT_SYMBOL(_find_next_and_bit_read_once);
+#endif
+
+#ifndef find_next_andnot_bit_read_once
+unsigned long _find_next_andnot_bit_read_once(const unsigned long *addr1, const unsigned long *addr2,
+ unsigned long nbits, unsigned long start)
+{
+ return FIND_NEXT_BIT(READ_ONCE(addr1[idx]) & ~READ_ONCE(addr2[idx]), /* nop */, nbits, start);
+}
+EXPORT_SYMBOL(_find_next_andnot_bit_read_once);
+#endif
+
+#ifndef find_next_or_bit_read_once
+unsigned long _find_next_or_bit_read_once(const unsigned long *addr1, const unsigned long *addr2,
+ unsigned long nbits, unsigned long start)
+{
+ return FIND_NEXT_BIT(READ_ONCE(addr1[idx]) | READ_ONCE(addr2[idx]), /* nop */, nbits, start);
+}
+EXPORT_SYMBOL(_find_next_or_bit_read_once);
+#endif
+
+#ifndef find_next_zero_bit_read_once
+unsigned long _find_next_zero_bit_read_once(const unsigned long *addr, unsigned long nbits,
+ unsigned long start)
+{
+ return FIND_NEXT_BIT(~READ_ONCE(addr[idx]), /* nop */, nbits, start);
+}
+EXPORT_SYMBOL(_find_next_zero_bit_read_once);
+#endif
+
+#ifndef find_last_bit_read_once
+unsigned long _find_last_bit_read_once(const unsigned long *addr, unsigned long size)
+{
+	unsigned long mask, word, i;
+
+	if (!size)
+		return size;
+
+	/* Highest word is masked by BITMAP_LAST_WORD_MASK(); lower ones are not. */
+	mask = BITMAP_LAST_WORD_MASK(size);
+	for (i = (size - 1) / BITS_PER_LONG + 1; i-- > 0; mask = ~0ul) {
+		word = READ_ONCE(addr[i]) & mask;
+		if (word)
+			return i * BITS_PER_LONG + __fls(word);
+	}
+	return size;
+}
+EXPORT_SYMBOL(_find_last_bit_read_once);
+#endif
+
+#ifdef __BIG_ENDIAN
+
+#ifndef find_first_zero_bit_le_read_once
+/*
+ * Find the first cleared bit in an LE memory region.
+ */
+unsigned long _find_first_zero_bit_le_read_once(const unsigned long *addr,
+ unsigned long size)
+{
+ return FIND_FIRST_BIT(~READ_ONCE(addr[idx]), swab, size);
+}
+EXPORT_SYMBOL(_find_first_zero_bit_le_read_once);
+
+#endif
+
+#ifndef find_next_zero_bit_le_read_once
+unsigned long _find_next_zero_bit_le_read_once(const unsigned long *addr,
+ unsigned long size, unsigned long offset)
+{
+ return FIND_NEXT_BIT(~READ_ONCE(addr[idx]), swab, size, offset);
+}
+EXPORT_SYMBOL(_find_next_zero_bit_le_read_once);
+#endif
+
+#ifndef find_next_bit_le_read_once
+unsigned long _find_next_bit_le_read_once(const unsigned long *addr,
+ unsigned long size, unsigned long offset)
+{
+ return FIND_NEXT_BIT(READ_ONCE(addr[idx]), swab, size, offset);
+}
+EXPORT_SYMBOL(_find_next_bit_le_read_once);
+
+#endif
+
+#endif /* __BIG_ENDIAN */