[PATCH 11/13] crypto: x86/blake - yield FPU context only when needed

From: Robert Elliott
Date: Mon Dec 19 2022 - 17:03:52 EST


The x86 assembly language implementations using SIMD process data
between kernel_fpu_begin() and kernel_fpu_end() calls. That
disables scheduler preemption, so prevents the CPU core from being
used by other threads.

The update() and finup() functions might be called to process
large quantities of data, which can result in RCU stalls and
soft lockups.

Rather than break the processing into 4 KiB passes, each of which
unilaterally calls kernel_fpu_begin() and kernel_fpu_end(),
periodically check if the kernel scheduler wants to run something
else on the CPU. If so, yield the kernel FPU context and let the
scheduler intervene.

Adjust the type of the length arguments everywhere to be unsigned
long rather than size_t to avoid typecasts.

Suggested-by: Herbert Xu <herbert@xxxxxxxxxxxxxxxxxxx>
Signed-off-by: Robert Elliott <elliott@xxxxxxx>
---
arch/x86/crypto/blake2s-glue.c | 41 ++++++++++++++++---------------
include/crypto/internal/blake2s.h | 8 +++---
lib/crypto/blake2s-generic.c | 12 ++++-----
3 files changed, 31 insertions(+), 30 deletions(-)

diff --git a/arch/x86/crypto/blake2s-glue.c b/arch/x86/crypto/blake2s-glue.c
index aaba21230528..bbb0a67ebb1c 100644
--- a/arch/x86/crypto/blake2s-glue.c
+++ b/arch/x86/crypto/blake2s-glue.c
@@ -12,46 +12,47 @@
#include <linux/sizes.h>

#include <asm/cpufeature.h>
-#include <asm/fpu/api.h>
#include <asm/processor.h>
#include <asm/simd.h>

-asmlinkage void blake2s_compress_ssse3(struct blake2s_state *state,
- const u8 *block, const size_t nblocks,
- const u32 inc);
-asmlinkage void blake2s_compress_avx512(struct blake2s_state *state,
- const u8 *block, const size_t nblocks,
- const u32 inc);
+asmlinkage void blake2s_compress_ssse3(struct blake2s_state *state, const u8 *data,
+ unsigned int nblocks, u32 inc);
+asmlinkage void blake2s_compress_avx512(struct blake2s_state *state, const u8 *data,
+ unsigned int nblocks, u32 inc);

static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_ssse3);
static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_avx512);

-void blake2s_compress(struct blake2s_state *state, const u8 *block,
- size_t nblocks, const u32 inc)
+void blake2s_compress(struct blake2s_state *state, const u8 *data,
+ unsigned int nblocks, const u32 inc)
{
/* SIMD disables preemption, so relax after processing each page. */
BUILD_BUG_ON(SZ_4K / BLAKE2S_BLOCK_SIZE < 8);

if (!static_branch_likely(&blake2s_use_ssse3) || !may_use_simd()) {
- blake2s_compress_generic(state, block, nblocks, inc);
+ blake2s_compress_generic(state, data, nblocks, inc);
return;
}

- do {
- const size_t blocks = min_t(size_t, nblocks,
- SZ_4K / BLAKE2S_BLOCK_SIZE);
+ kernel_fpu_begin();
+ for (;;) {
+ const unsigned int chunks = min(nblocks, 4096U / BLAKE2S_BLOCK_SIZE);

- kernel_fpu_begin();
if (IS_ENABLED(CONFIG_AS_AVX512) &&
static_branch_likely(&blake2s_use_avx512))
- blake2s_compress_avx512(state, block, blocks, inc);
+ blake2s_compress_avx512(state, data, chunks, inc);
else
- blake2s_compress_ssse3(state, block, blocks, inc);
- kernel_fpu_end();
+ blake2s_compress_ssse3(state, data, chunks, inc);

- nblocks -= blocks;
- block += blocks * BLAKE2S_BLOCK_SIZE;
- } while (nblocks);
+ nblocks -= chunks;
+
+ if (!nblocks)
+ break;
+
+ data += chunks * BLAKE2S_BLOCK_SIZE;
+ kernel_fpu_yield();
+ }
+ kernel_fpu_end();
}
EXPORT_SYMBOL(blake2s_compress);

diff --git a/include/crypto/internal/blake2s.h b/include/crypto/internal/blake2s.h
index 506d56530ca9..d6df791e6148 100644
--- a/include/crypto/internal/blake2s.h
+++ b/include/crypto/internal/blake2s.h
@@ -10,11 +10,11 @@
#include <crypto/blake2s.h>
#include <linux/string.h>

-void blake2s_compress_generic(struct blake2s_state *state, const u8 *block,
- size_t nblocks, const u32 inc);
+void blake2s_compress_generic(struct blake2s_state *state, const u8 *data,
+ unsigned int nblocks, u32 inc);

-void blake2s_compress(struct blake2s_state *state, const u8 *block,
- size_t nblocks, const u32 inc);
+void blake2s_compress(struct blake2s_state *state, const u8 *data,
+ unsigned int nblocks, u32 inc);

bool blake2s_selftest(void);

diff --git a/lib/crypto/blake2s-generic.c b/lib/crypto/blake2s-generic.c
index 75ccb3e633e6..6a1caa702698 100644
--- a/lib/crypto/blake2s-generic.c
+++ b/lib/crypto/blake2s-generic.c
@@ -37,12 +37,12 @@ static inline void blake2s_increment_counter(struct blake2s_state *state,
state->t[1] += (state->t[0] < inc);
}

-void blake2s_compress(struct blake2s_state *state, const u8 *block,
- size_t nblocks, const u32 inc)
+void blake2s_compress(struct blake2s_state *state, const u8 *data,
+ unsigned int nblocks, u32 inc)
__weak __alias(blake2s_compress_generic);

-void blake2s_compress_generic(struct blake2s_state *state, const u8 *block,
- size_t nblocks, const u32 inc)
+void blake2s_compress_generic(struct blake2s_state *state, const u8 *data,
+ unsigned int nblocks, u32 inc)
{
u32 m[16];
u32 v[16];
@@ -53,7 +53,7 @@ void blake2s_compress_generic(struct blake2s_state *state, const u8 *block,

while (nblocks > 0) {
blake2s_increment_counter(state, inc);
- memcpy(m, block, BLAKE2S_BLOCK_SIZE);
+ memcpy(m, data, BLAKE2S_BLOCK_SIZE);
le32_to_cpu_array(m, ARRAY_SIZE(m));
memcpy(v, state->h, 32);
v[ 8] = BLAKE2S_IV0;
@@ -103,7 +103,7 @@ void blake2s_compress_generic(struct blake2s_state *state, const u8 *block,
for (i = 0; i < 8; ++i)
state->h[i] ^= v[i] ^ v[i + 8];

- block += BLAKE2S_BLOCK_SIZE;
+ data += BLAKE2S_BLOCK_SIZE;
--nblocks;
}
}
--
2.38.1