Re: [PATCH] arm64: enable GENERIC_FIND_FIRST_BIT

From: Alexander Lobakin
Date: Wed Feb 24 2021 - 06:54:41 EST


From: Yury Norov <yury.norov@xxxxxxxxx>
Date: Sat, 5 Dec 2020 08:54:06 -0800

Hi,

> ARM64 doesn't implement find_first_{zero}_bit in arch code and doesn't
> enable it in config. It leads to using find_next_bit() which is less
> efficient:
>
> 0000000000000000 <find_first_bit>:
> 0: aa0003e4 mov x4, x0
> 4: aa0103e0 mov x0, x1
> 8: b4000181 cbz x1, 38 <find_first_bit+0x38>
> c: f9400083 ldr x3, [x4]
> 10: d2800802 mov x2, #0x40 // #64
> 14: 91002084 add x4, x4, #0x8
> 18: b40000c3 cbz x3, 30 <find_first_bit+0x30>
> 1c: 14000008 b 3c <find_first_bit+0x3c>
> 20: f8408483 ldr x3, [x4], #8
> 24: 91010045 add x5, x2, #0x40
> 28: b50000c3 cbnz x3, 40 <find_first_bit+0x40>
> 2c: aa0503e2 mov x2, x5
> 30: eb02001f cmp x0, x2
> 34: 54ffff68 b.hi 20 <find_first_bit+0x20> // b.pmore
> 38: d65f03c0 ret
> 3c: d2800002 mov x2, #0x0 // #0
> 40: dac00063 rbit x3, x3
> 44: dac01063 clz x3, x3
> 48: 8b020062 add x2, x3, x2
> 4c: eb02001f cmp x0, x2
> 50: 9a829000 csel x0, x0, x2, ls // ls = plast
> 54: d65f03c0 ret
>
> ...
>
> 0000000000000118 <_find_next_bit.constprop.1>:
> 118: eb02007f cmp x3, x2
> 11c: 540002e2 b.cs 178 <_find_next_bit.constprop.1+0x60> // b.hs, b.nlast
> 120: d346fc66 lsr x6, x3, #6
> 124: f8667805 ldr x5, [x0, x6, lsl #3]
> 128: b4000061 cbz x1, 134 <_find_next_bit.constprop.1+0x1c>
> 12c: f8667826 ldr x6, [x1, x6, lsl #3]
> 130: 8a0600a5 and x5, x5, x6
> 134: ca0400a6 eor x6, x5, x4
> 138: 92800005 mov x5, #0xffffffffffffffff // #-1
> 13c: 9ac320a5 lsl x5, x5, x3
> 140: 927ae463 and x3, x3, #0xffffffffffffffc0
> 144: ea0600a5 ands x5, x5, x6
> 148: 54000120 b.eq 16c <_find_next_bit.constprop.1+0x54> // b.none
> 14c: 1400000e b 184 <_find_next_bit.constprop.1+0x6c>
> 150: d346fc66 lsr x6, x3, #6
> 154: f8667805 ldr x5, [x0, x6, lsl #3]
> 158: b4000061 cbz x1, 164 <_find_next_bit.constprop.1+0x4c>
> 15c: f8667826 ldr x6, [x1, x6, lsl #3]
> 160: 8a0600a5 and x5, x5, x6
> 164: eb05009f cmp x4, x5
> 168: 540000c1 b.ne 180 <_find_next_bit.constprop.1+0x68> // b.any
> 16c: 91010063 add x3, x3, #0x40
> 170: eb03005f cmp x2, x3
> 174: 54fffee8 b.hi 150 <_find_next_bit.constprop.1+0x38> // b.pmore
> 178: aa0203e0 mov x0, x2
> 17c: d65f03c0 ret
> 180: ca050085 eor x5, x4, x5
> 184: dac000a5 rbit x5, x5
> 188: dac010a5 clz x5, x5
> 18c: 8b0300a3 add x3, x5, x3
> 190: eb03005f cmp x2, x3
> 194: 9a839042 csel x2, x2, x3, ls // ls = plast
> 198: aa0203e0 mov x0, x2
> 19c: d65f03c0 ret
>
> ...
>
> 0000000000000238 <find_next_bit>:
> 238: a9bf7bfd stp x29, x30, [sp, #-16]!
> 23c: aa0203e3 mov x3, x2
> 240: d2800004 mov x4, #0x0 // #0
> 244: aa0103e2 mov x2, x1
> 248: 910003fd mov x29, sp
> 24c: d2800001 mov x1, #0x0 // #0
> 250: 97ffffb2 bl 118 <_find_next_bit.constprop.1>
> 254: a8c17bfd ldp x29, x30, [sp], #16
> 258: d65f03c0 ret
>
> Enabling this functions would also benefit for_each_{set,clear}_bit().
> Would it make sense to enable this config for all such architectures by
> default?

I confirm that GENERIC_FIND_FIRST_BIT also produces more optimized and
fast code on MIPS (32 R2) where there is also no architecture-specific
bitsearching routines.
So, if it's okay for other folks, I'd suggest to go for it and enable
for all similar arches.

(otherwise, I'll publish a separate entry for mips-next after 5.12-rc1
release and mention you in "Suggested-by:")

> Signed-off-by: Yury Norov <yury.norov@xxxxxxxxx>
>
> ---
> arch/arm64/Kconfig | 1 +
> 1 file changed, 1 insertion(+)
>
> diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
> index 1515f6f153a0..2b90ef1f548e 100644
> --- a/arch/arm64/Kconfig
> +++ b/arch/arm64/Kconfig
> @@ -106,6 +106,7 @@ config ARM64
> select GENERIC_CPU_AUTOPROBE
> select GENERIC_CPU_VULNERABILITIES
> select GENERIC_EARLY_IOREMAP
> + select GENERIC_FIND_FIRST_BIT
> select GENERIC_IDLE_POLL_SETUP
> select GENERIC_IRQ_IPI
> select GENERIC_IRQ_MULTI_HANDLER
> --
> 2.25.1

Thanks,
Al