Re: [PATCH 3/5] riscv: Vector checksum header

From: Samuel Holland
Date: Mon Aug 28 2023 - 14:23:36 EST


On 2023-08-26 8:26 PM, Charlie Jenkins wrote:
> This patch is not ready for merge as vector support in the kernel is
> limited. However, the code has been tested in QEMU so the algorithms
> do work. It is written in assembly rather than using the GCC vector
> intrinsics because they did not provide optimal code.
>
> Signed-off-by: Charlie Jenkins <charlie@xxxxxxxxxxxx>
> ---
> arch/riscv/include/asm/checksum.h | 81 +++++++++++++++++++++++++++++++++++++++
> 1 file changed, 81 insertions(+)
>
> diff --git a/arch/riscv/include/asm/checksum.h b/arch/riscv/include/asm/checksum.h
> index af49b3409576..7e31c0ad6346 100644
> --- a/arch/riscv/include/asm/checksum.h
> +++ b/arch/riscv/include/asm/checksum.h
> @@ -10,6 +10,10 @@
> #include <linux/in6.h>
> #include <linux/uaccess.h>
>
> +#ifdef CONFIG_RISCV_ISA_V
> +#include <riscv_vector.h>
> +#endif
> +
> /* Default version is sufficient for 32 bit */
> #ifdef CONFIG_64BIT
> #define _HAVE_ARCH_IPV6_CSUM
> @@ -36,6 +40,46 @@ static inline __sum16 csum_fold(__wsum sum)
> * without the bitmanip extensions zba/zbb.
> */
> #ifdef CONFIG_32BIT
> +#ifdef CONFIG_RISCV_ISA_V
> +static inline __sum16 ip_fast_csum(const void *iph, unsigned int ihl)
> +{
> + vuint64m1_t prev_buffer;
> + vuint32m1_t curr_buffer;
> + unsigned int vl;
> + unsigned int high_result;
> + unsigned int low_result;
> +
> + asm("vsetivli x0, 1, e64, ta, ma \n\t\

The same concerns from patch 1 apply here as well. Vector assembly must be gated
behind an alternative section or a call to has_vector(), so the kernel can fall
back to a non-vector implementation at runtime.

You are also missing calls to kernel_vector_begin()/kernel_vector_end(), as
added by [1], which are required to avoid corrupting the user-mode vector
register context.
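
Roughly the shape I have in mind, taking the has_vector() route rather than an
alternatives section (a sketch only; has_vector() comes from <asm/vector.h>,
kernel_vector_begin()/kernel_vector_end() come from [1], and the
ip_fast_csum_vector()/ip_fast_csum_scalar() helpers are just placeholder names
for the two bodies you already have):

	static inline __sum16 ip_fast_csum(const void *iph, unsigned int ihl)
	{
		if (IS_ENABLED(CONFIG_RISCV_ISA_V) && has_vector()) {
			__sum16 csum;

			/* Save the user-mode V context before touching vector regs, per [1] */
			kernel_vector_begin();
			csum = ip_fast_csum_vector(iph, ihl);	/* placeholder: your vector body */
			kernel_vector_end();
			return csum;
		}

		/* Runtime fallback for CPUs without the V extension */
		return ip_fast_csum_scalar(iph, ihl);	/* placeholder: existing scalar body */
	}

That way the vector code never executes on hardware without V, and the
user-mode vector register context stays intact when it does run.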

Regards,
Samuel

[1]: https://lore.kernel.org/linux-riscv/20230721112855.1006-3-andy.chiu@xxxxxxxxxx/

> + vmv.v.i %[prev_buffer], 0 \n\t\
> + 1: \n\t\
> + vsetvli %[vl], %[ihl], e32, m1, ta, ma \n\t\
> + vle32.v %[curr_buffer], (%[iph]) \n\t\
> + vwredsumu.vs %[prev_buffer], %[curr_buffer], %[prev_buffer] \n\t\
> + sub %[ihl], %[ihl], %[vl] \n\t"
> +#ifdef CONFIG_RISCV_ISA_ZBA
> + "sh2add %[iph], %[vl], %[iph] \n\t"
> +#else
> + "slli %[vl], %[vl], 2 \n\
> + add %[iph], %[vl], %[iph] \n\t"
> +#endif
> + "bnez %[ihl], 1b \n\
> + vsetivli x0, 1, e64, m1, ta, ma \n\
> + vmv.x.s %[low_result], %[prev_buffer] \n\
> + addi %[vl], x0, 32 \n\
> + vsrl.vx %[prev_buffer], %[prev_buffer], %[vl] \n\
> + vmv.x.s %[high_result], %[prev_buffer]"
> + : [vl] "=&r" (vl), [prev_buffer] "=&vd" (prev_buffer),
> + [curr_buffer] "=&vd" (curr_buffer),
> + [high_result] "=&r" (high_result),
> + [low_result] "=&r" (low_result)
> + : [iph] "r" (iph), [ihl] "r" (ihl));
> +
> + high_result += low_result;
> + high_result += high_result < low_result;
> + return csum_fold((__force __wsum)(high_result));
> +}
> +
> +#else
> static inline __sum16 ip_fast_csum(const void *iph, unsigned int ihl)
> {
> __wsum csum = 0;
> @@ -47,8 +91,44 @@ static inline __sum16 ip_fast_csum(const void *iph, unsigned int ihl)
> } while (++pos < ihl);
> return csum_fold(csum);
> }
> +#endif
> +#else
> +
> +#ifdef CONFIG_RISCV_ISA_V
> +static inline __sum16 ip_fast_csum(const void *iph, unsigned int ihl)
> +{
> + vuint64m1_t prev_buffer;
> + vuint32m1_t curr_buffer;
> + unsigned long vl;
> + unsigned long result;
> +
> + asm("vsetivli x0, 1, e64, ta, ma \n\
> + vmv.v.i %[prev_buffer], 0 \n\
> + 1: \n\
> + # Setup 32-bit sum of iph \n\
> + vsetvli %[vl], %[ihl], e32, m1, ta, ma \n\
> + vle32.v %[curr_buffer], (%[iph]) \n\
> + # Sum each 32-bit segment of iph that can fit into a vector reg \n\
> + vwredsumu.vs %[prev_buffer], %[curr_buffer], %[prev_buffer] \n\
> + subw %[ihl], %[ihl], %[vl] \n\t"
> +#ifdef CONFIG_RISCV_ISA_ZBA
> + "sh2add %[iph], %[vl], %[iph] \n\t"
> #else
> + "slli %[vl], %[vl], 2 \n\
> + addw %[iph], %[vl], %[iph] \n\t"
> +#endif
> + "# If not all of iph could fit into vector reg, do another sum \n\
> + bnez %[ihl], 1b \n\
> + vsetvli x0, x0, e64, m1, ta, ma \n\
> + vmv.x.s %[result], %[prev_buffer]"
> + : [vl] "=&r" (vl), [prev_buffer] "=&vd" (prev_buffer),
> + [curr_buffer] "=&vd" (curr_buffer), [result] "=&r" (result)
> + : [iph] "r" (iph), [ihl] "r" (ihl));
>
> + result += (result >> 32) | (result << 32);
> + return csum_fold((__force __wsum)(result >> 32));
> +}
> +#else
> /*
> * Quickly compute an IP checksum with the assumption that IPv4 headers will
> * always be in multiples of 32-bits, and have an ihl of at least 5.
> @@ -74,6 +154,7 @@ static inline __sum16 ip_fast_csum(const void *iph, unsigned int ihl)
> return csum_fold((__force __wsum)(csum >> 32));
> }
> #endif
> +#endif
> #define ip_fast_csum ip_fast_csum
>
> extern unsigned int do_csum(const unsigned char *buff, int len);
>