[RFC PATCH 3/7] crypto: x86/crc - limit FPU preemption

From: Robert Elliott
Date: Thu Oct 06 2022 - 18:37:38 EST


As done by the ECB and CBC helpers in arch/x86/crypto/ecb_cbc_helpers.h,
limit the number of bytes processed between kernel_fpu_begin() and
kernel_fpu_end() calls.

Those functions call preempt_disable() and preempt_enable(), so
the CPU core is unavailable for scheduling while the code between
them is running, leading to RCU stall reports such as:
rcu: INFO: rcu_preempt detected expedited stalls on CPUs/tasks: {12-... } 22 jiffies s: 277 root: 0x1/.

Fixes: 78c37d191dd6 ("crypto: crc32 - add crc32 pclmulqdq implementation and wrappers for table implementation")
Fixes: 6a8ce1ef3940 ("crypto: crc32c - Optimize CRC32C calculation with PCLMULQDQ instruction")
Fixes: 0b95a7f85718 ("crypto: crct10dif - Glue code to cast accelerated CRCT10DIF assembly as a crypto transform")
Suggested-by: Herbert Xu <herbert@xxxxxxxxxxxxxxxxxxx>
Signed-off-by: Robert Elliott <elliott@xxxxxxx>
---
arch/x86/crypto/crc32-pclmul_glue.c | 18 ++++++++++----
arch/x86/crypto/crc32c-intel_glue.c | 32 ++++++++++++++++++++-----
arch/x86/crypto/crct10dif-pclmul_glue.c | 32 ++++++++++++++++++++-----
3 files changed, 66 insertions(+), 16 deletions(-)

diff --git a/arch/x86/crypto/crc32-pclmul_glue.c b/arch/x86/crypto/crc32-pclmul_glue.c
index 288200fe7b4e..7cf65dc726c4 100644
--- a/arch/x86/crypto/crc32-pclmul_glue.c
+++ b/arch/x86/crypto/crc32-pclmul_glue.c
@@ -49,6 +49,8 @@
#define SCALE_F 16L /* size of xmm register */
#define SCALE_F_MASK (SCALE_F - 1)

+#define FPU_BYTES 4096U /* avoid kernel_fpu_begin/end scheduler/rcu stalls */
+
u32 crc32_pclmul_le_16(unsigned char const *buffer, size_t len, u32 crc32);

static u32 __attribute__((pure))
@@ -57,6 +59,7 @@ static u32 __attribute__((pure))
unsigned int iquotient;
unsigned int iremainder;
unsigned int prealign;
+ unsigned int chunk;

if (len < PCLMUL_MIN_LEN + SCALE_F_MASK || !crypto_simd_usable())
return crc32_le(crc, p, len);
@@ -73,12 +76,19 @@ static u32 __attribute__((pure))
iquotient = len & (~SCALE_F_MASK);
iremainder = len & SCALE_F_MASK;

- kernel_fpu_begin();
- crc = crc32_pclmul_le_16(p, iquotient, crc);
- kernel_fpu_end();
+ do {
+ chunk = min(iquotient, FPU_BYTES);
+ iquotient -= chunk;
+
+ kernel_fpu_begin();
+ crc = crc32_pclmul_le_16(p, chunk, crc);
+ kernel_fpu_end();
+
+ p += chunk;
+ } while (iquotient);

if (iremainder)
- crc = crc32_le(crc, p + iquotient, iremainder);
+ crc = crc32_le(crc, p, iremainder);

return crc;
}
diff --git a/arch/x86/crypto/crc32c-intel_glue.c b/arch/x86/crypto/crc32c-intel_glue.c
index c5c965b694c6..b277c215f0fb 100644
--- a/arch/x86/crypto/crc32c-intel_glue.c
+++ b/arch/x86/crypto/crc32c-intel_glue.c
@@ -44,6 +44,8 @@
*/
#define CRC32C_PCL_BREAKEVEN 512

+#define FPU_BYTES 4096U /* avoid kernel_fpu_begin/end scheduler/rcu stalls */
+
asmlinkage unsigned int crc_pcl(const u8 *buffer, int len,
unsigned int crc_init);
#endif /* CONFIG_X86_64 */
@@ -155,15 +157,23 @@ static int crc32c_pcl_intel_update(struct shash_desc *desc, const u8 *data,
unsigned int len)
{
u32 *crcp = shash_desc_ctx(desc);
+ unsigned int chunk;

/*
* use faster PCL version if datasize is large enough to
* overcome kernel fpu state save/restore overhead
*/
if (len >= CRC32C_PCL_BREAKEVEN && crypto_simd_usable()) {
- kernel_fpu_begin();
- *crcp = crc_pcl(data, len, *crcp);
- kernel_fpu_end();
+ do {
+ chunk = min(len, FPU_BYTES);
+ len -= chunk;
+
+ kernel_fpu_begin();
+ *crcp = crc_pcl(data, chunk, *crcp);
+ kernel_fpu_end();
+
+ data += chunk;
+ } while (len);
} else
*crcp = crc32c_intel_le_hw(*crcp, data, len);
return 0;
@@ -172,10 +182,20 @@ static int crc32c_pcl_intel_update(struct shash_desc *desc, const u8 *data,
static int __crc32c_pcl_intel_finup(u32 *crcp, const u8 *data, unsigned int len,
u8 *out)
{
+ unsigned int chunk;
+
if (len >= CRC32C_PCL_BREAKEVEN && crypto_simd_usable()) {
- kernel_fpu_begin();
- *(__le32 *)out = ~cpu_to_le32(crc_pcl(data, len, *crcp));
- kernel_fpu_end();
+ do {
+ chunk = min(len, FPU_BYTES);
+ len -= chunk;
+
+ kernel_fpu_begin();
+ *crcp = crc_pcl(data, chunk, *crcp);
+ kernel_fpu_end();
+
+ data += chunk;
+ } while (len);
+ *(__le32 *)out = ~cpu_to_le32(*crcp);
} else
*(__le32 *)out =
~cpu_to_le32(crc32c_intel_le_hw(*crcp, data, len));
diff --git a/arch/x86/crypto/crct10dif-pclmul_glue.c b/arch/x86/crypto/crct10dif-pclmul_glue.c
index 7c5a32282d51..bcd362df6b62 100644
--- a/arch/x86/crypto/crct10dif-pclmul_glue.c
+++ b/arch/x86/crypto/crct10dif-pclmul_glue.c
@@ -36,6 +36,8 @@
#include <asm/cpu_device_id.h>
#include <asm/simd.h>

+#define FPU_BYTES 4096U /* avoid kernel_fpu_begin/end scheduler/rcu stalls */
+
asmlinkage u16 crc_t10dif_pcl(u16 init_crc, const u8 *buf, size_t len);

struct chksum_desc_ctx {
@@ -55,11 +57,19 @@ static int chksum_update(struct shash_desc *desc, const u8 *data,
unsigned int length)
{
struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
+ unsigned int chunk;

if (length >= 16 && crypto_simd_usable()) {
- kernel_fpu_begin();
- ctx->crc = crc_t10dif_pcl(ctx->crc, data, length);
- kernel_fpu_end();
+ do {
+ chunk = min(length, FPU_BYTES);
+ length -= chunk;
+
+ kernel_fpu_begin();
+ ctx->crc = crc_t10dif_pcl(ctx->crc, data, chunk);
+ kernel_fpu_end();
+
+ data += chunk;
+ } while (length);
} else
ctx->crc = crc_t10dif_generic(ctx->crc, data, length);
return 0;
@@ -75,10 +85,20 @@ static int chksum_final(struct shash_desc *desc, u8 *out)

static int __chksum_finup(__u16 crc, const u8 *data, unsigned int len, u8 *out)
{
+ unsigned int chunk;
+
if (len >= 16 && crypto_simd_usable()) {
- kernel_fpu_begin();
- *(__u16 *)out = crc_t10dif_pcl(crc, data, len);
- kernel_fpu_end();
+ do {
+ chunk = min(len, FPU_BYTES);
+ len -= chunk;
+
+ kernel_fpu_begin();
+ crc = crc_t10dif_pcl(crc, data, chunk);
+ kernel_fpu_end();
+
+ data += chunk;
+ } while (len);
+ *(__u16 *)out = crc;
} else
*(__u16 *)out = crc_t10dif_generic(crc, data, len);
return 0;
--
2.37.3