[PATCH 09/13] crypto: x86/poly - yield FPU context only when needed

From: Robert Elliott
Date: Mon Dec 19 2022 - 17:03:26 EST


The x86 assembly language implementations using SIMD process data
between kernel_fpu_begin() and kernel_fpu_end() calls. Running
between those calls disables scheduler preemption, which prevents
the CPU core from being used by other threads.

The update() and finup() functions might be called to process
large quantities of data, which can result in RCU stalls and
soft lockups.

Rather than break the processing into 4 KiB passes, each of which
unconditionally calls kernel_fpu_begin() and kernel_fpu_end(), hold
the kernel FPU context across the whole request and, after each
4 KiB chunk, check whether the kernel scheduler wants to run
something else on the CPU. If so, yield the kernel FPU context and
let the scheduler intervene.

Suggested-by: Herbert Xu <herbert@xxxxxxxxxxxxxxxxxxx>
Signed-off-by: Robert Elliott <elliott@xxxxxxx>
---
arch/x86/crypto/nhpoly1305-avx2-glue.c | 22 +++++++-----
arch/x86/crypto/nhpoly1305-sse2-glue.c | 22 +++++++-----
arch/x86/crypto/poly1305_glue.c | 47 ++++++++++++--------------
arch/x86/crypto/polyval-clmulni_glue.c | 46 +++++++++++++++----------
4 files changed, 79 insertions(+), 58 deletions(-)

diff --git a/arch/x86/crypto/nhpoly1305-avx2-glue.c b/arch/x86/crypto/nhpoly1305-avx2-glue.c
index 46b036204ed9..4afbfd35afda 100644
--- a/arch/x86/crypto/nhpoly1305-avx2-glue.c
+++ b/arch/x86/crypto/nhpoly1305-avx2-glue.c
@@ -22,15 +22,21 @@ static int nhpoly1305_avx2_update(struct shash_desc *desc,
if (srclen < 64 || !crypto_simd_usable())
return crypto_nhpoly1305_update(desc, src, srclen);

- do {
- unsigned int n = min_t(unsigned int, srclen, SZ_4K);
+ kernel_fpu_begin();
+ for (;;) {
+ const unsigned int chunk = min(srclen, 4096U);
+
+ crypto_nhpoly1305_update_helper(desc, src, chunk, nh_avx2);
+ srclen -= chunk;
+
+ if (!srclen)
+ break;
+
+ src += chunk;
+ kernel_fpu_yield();
+ }
+ kernel_fpu_end();

- kernel_fpu_begin();
- crypto_nhpoly1305_update_helper(desc, src, n, nh_avx2);
- kernel_fpu_end();
- src += n;
- srclen -= n;
- } while (srclen);
return 0;
}

diff --git a/arch/x86/crypto/nhpoly1305-sse2-glue.c b/arch/x86/crypto/nhpoly1305-sse2-glue.c
index 4a4970d75107..f5c757f6f781 100644
--- a/arch/x86/crypto/nhpoly1305-sse2-glue.c
+++ b/arch/x86/crypto/nhpoly1305-sse2-glue.c
@@ -22,15 +22,21 @@ static int nhpoly1305_sse2_update(struct shash_desc *desc,
if (srclen < 64 || !crypto_simd_usable())
return crypto_nhpoly1305_update(desc, src, srclen);

- do {
- unsigned int n = min_t(unsigned int, srclen, SZ_4K);
+ kernel_fpu_begin();
+ for (;;) {
+ const unsigned int chunk = min(srclen, 4096U);
+
+ crypto_nhpoly1305_update_helper(desc, src, chunk, nh_sse2);
+ srclen -= chunk;
+
+ if (!srclen)
+ break;
+
+ src += chunk;
+ kernel_fpu_yield();
+ }
+ kernel_fpu_end();

- kernel_fpu_begin();
- crypto_nhpoly1305_update_helper(desc, src, n, nh_sse2);
- kernel_fpu_end();
- src += n;
- srclen -= n;
- } while (srclen);
return 0;
}

diff --git a/arch/x86/crypto/poly1305_glue.c b/arch/x86/crypto/poly1305_glue.c
index 1dfb8af48a3c..13e2e134b458 100644
--- a/arch/x86/crypto/poly1305_glue.c
+++ b/arch/x86/crypto/poly1305_glue.c
@@ -15,20 +15,13 @@
#include <asm/intel-family.h>
#include <asm/simd.h>

-asmlinkage void poly1305_init_x86_64(void *ctx,
- const u8 key[POLY1305_BLOCK_SIZE]);
-asmlinkage void poly1305_blocks_x86_64(void *ctx, const u8 *inp,
- const size_t len, const u32 padbit);
-asmlinkage void poly1305_emit_x86_64(void *ctx, u8 mac[POLY1305_DIGEST_SIZE],
- const u32 nonce[4]);
-asmlinkage void poly1305_emit_avx(void *ctx, u8 mac[POLY1305_DIGEST_SIZE],
- const u32 nonce[4]);
-asmlinkage void poly1305_blocks_avx(void *ctx, const u8 *inp, const size_t len,
- const u32 padbit);
-asmlinkage void poly1305_blocks_avx2(void *ctx, const u8 *inp, const size_t len,
- const u32 padbit);
-asmlinkage void poly1305_blocks_avx512(void *ctx, const u8 *inp,
- const size_t len, const u32 padbit);
+asmlinkage void poly1305_init_x86_64(void *ctx, const u8 key[POLY1305_BLOCK_SIZE]);
+asmlinkage void poly1305_blocks_x86_64(void *ctx, const u8 *inp, unsigned int len, u32 padbit);
+asmlinkage void poly1305_emit_x86_64(void *ctx, u8 mac[POLY1305_DIGEST_SIZE], const u32 nonce[4]);
+asmlinkage void poly1305_emit_avx(void *ctx, u8 mac[POLY1305_DIGEST_SIZE], const u32 nonce[4]);
+asmlinkage void poly1305_blocks_avx(void *ctx, const u8 *inp, unsigned int len, const u32 padbit);
+asmlinkage void poly1305_blocks_avx2(void *ctx, const u8 *inp, unsigned int len, u32 padbit);
+asmlinkage void poly1305_blocks_avx512(void *ctx, const u8 *inp, unsigned int len, u32 padbit);

static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx);
static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx2);
@@ -86,7 +79,7 @@ static void poly1305_simd_init(void *ctx, const u8 key[POLY1305_BLOCK_SIZE])
poly1305_init_x86_64(ctx, key);
}

-static void poly1305_simd_blocks(void *ctx, const u8 *inp, size_t len,
+static void poly1305_simd_blocks(void *ctx, const u8 *inp, unsigned int len,
const u32 padbit)
{
struct poly1305_arch_internal *state = ctx;
@@ -103,21 +96,25 @@ static void poly1305_simd_blocks(void *ctx, const u8 *inp, size_t len,
return;
}

- do {
- const size_t bytes = min_t(size_t, len, SZ_4K);
+ kernel_fpu_begin();
+ for (;;) {
+ const unsigned int chunk = min(len, 4096U);

- kernel_fpu_begin();
if (IS_ENABLED(CONFIG_AS_AVX512) && static_branch_likely(&poly1305_use_avx512))
- poly1305_blocks_avx512(ctx, inp, bytes, padbit);
+ poly1305_blocks_avx512(ctx, inp, chunk, padbit);
else if (static_branch_likely(&poly1305_use_avx2))
- poly1305_blocks_avx2(ctx, inp, bytes, padbit);
+ poly1305_blocks_avx2(ctx, inp, chunk, padbit);
else
- poly1305_blocks_avx(ctx, inp, bytes, padbit);
- kernel_fpu_end();
+ poly1305_blocks_avx(ctx, inp, chunk, padbit);
+ len -= chunk;

- len -= bytes;
- inp += bytes;
- } while (len);
+ if (!len)
+ break;
+
+ inp += chunk;
+ kernel_fpu_yield();
+ }
+ kernel_fpu_end();
}

static void poly1305_simd_emit(void *ctx, u8 mac[POLY1305_DIGEST_SIZE],
diff --git a/arch/x86/crypto/polyval-clmulni_glue.c b/arch/x86/crypto/polyval-clmulni_glue.c
index 8fa58b0f3cb3..a3d72e87d58d 100644
--- a/arch/x86/crypto/polyval-clmulni_glue.c
+++ b/arch/x86/crypto/polyval-clmulni_glue.c
@@ -45,8 +45,8 @@ struct polyval_desc_ctx {
u32 bytes;
};

-asmlinkage void clmul_polyval_update(const struct polyval_tfm_ctx *keys,
- const u8 *in, size_t nblocks, u8 *accumulator);
+asmlinkage void clmul_polyval_update(const struct polyval_tfm_ctx *keys, const u8 *in,
+ unsigned int nblocks, u8 *accumulator);
asmlinkage void clmul_polyval_mul(u8 *op1, const u8 *op2);

static inline struct polyval_tfm_ctx *polyval_tfm_ctx(struct crypto_shash *tfm)
@@ -55,27 +55,40 @@ static inline struct polyval_tfm_ctx *polyval_tfm_ctx(struct crypto_shash *tfm)
}

static void internal_polyval_update(const struct polyval_tfm_ctx *keys,
- const u8 *in, size_t nblocks, u8 *accumulator)
+ const u8 *in, unsigned int nblocks, u8 *accumulator)
{
- if (likely(crypto_simd_usable())) {
- kernel_fpu_begin();
- clmul_polyval_update(keys, in, nblocks, accumulator);
- kernel_fpu_end();
- } else {
+ if (!crypto_simd_usable()) {
polyval_update_non4k(keys->key_powers[NUM_KEY_POWERS-1], in,
nblocks, accumulator);
+ return;
}
+
+ kernel_fpu_begin();
+ for (;;) {
+ const unsigned int chunks = min(nblocks, 4096U / POLYVAL_BLOCK_SIZE);
+
+ clmul_polyval_update(keys, in, chunks, accumulator);
+ nblocks -= chunks;
+
+ if (!nblocks)
+ break;
+
+ in += chunks * POLYVAL_BLOCK_SIZE;
+ kernel_fpu_yield();
+ }
+ kernel_fpu_end();
}

static void internal_polyval_mul(u8 *op1, const u8 *op2)
{
- if (likely(crypto_simd_usable())) {
- kernel_fpu_begin();
- clmul_polyval_mul(op1, op2);
- kernel_fpu_end();
- } else {
+ if (!crypto_simd_usable()) {
polyval_mul_non4k(op1, op2);
+ return;
}
+
+ kernel_fpu_begin();
+ clmul_polyval_mul(op1, op2);
+ kernel_fpu_end();
}

static int polyval_x86_setkey(struct crypto_shash *tfm,
@@ -113,7 +126,6 @@ static int polyval_x86_update(struct shash_desc *desc,
struct polyval_desc_ctx *dctx = shash_desc_ctx(desc);
const struct polyval_tfm_ctx *tctx = polyval_tfm_ctx(desc->tfm);
u8 *pos;
- unsigned int nblocks;
unsigned int n;

if (dctx->bytes) {
@@ -131,9 +143,9 @@ static int polyval_x86_update(struct shash_desc *desc,
tctx->key_powers[NUM_KEY_POWERS-1]);
}

- while (srclen >= POLYVAL_BLOCK_SIZE) {
- /* Allow rescheduling every 4K bytes. */
- nblocks = min(srclen, 4096U) / POLYVAL_BLOCK_SIZE;
+ if (srclen >= POLYVAL_BLOCK_SIZE) {
+ const unsigned int nblocks = srclen / POLYVAL_BLOCK_SIZE;
+
internal_polyval_update(tctx, src, nblocks, dctx->buffer);
srclen -= nblocks * POLYVAL_BLOCK_SIZE;
src += nblocks * POLYVAL_BLOCK_SIZE;
--
2.38.1